diff --git a/.clang-format b/.clang-format index 566d0efa8..3a0a27166 100644 --- a/.clang-format +++ b/.clang-format @@ -105,9 +105,10 @@ SpaceBeforeAssignmentOperators: true # Assignment = should be seperated by spaces on both sides. SpaceBeforeParens: ControlStatements -# for control statements a space is required before '{' -# Bad: for(){ statement; } -# Good: for() { statement; } +# for control statements a space is required before '(' +# Bad: for() { statement; } +# Good: for () { statement; } +# This setting distinguishes functions() from keywords like 'if' and 'for'. SpaceInEmptyParentheses: false # No spaces required for empty () diff --git a/.upstream_base_commits b/.upstream_base_commits new file mode 100644 index 000000000..d9ee6e9d6 --- /dev/null +++ b/.upstream_base_commits @@ -0,0 +1,3 @@ +#freebsd = https://github.com/freebsd/freebsd.git +bsd/man/man2/access.2 freebsd lib/libc/sys/access.2 5b882020081a138285227631c46a406c08e17bc8 +bsd/man/man7/sticky.7 freebsd share/man/man7/sticky.7 5b882020081a138285227631c46a406c08e17bc8 diff --git a/EXTERNAL_HEADERS/Availability.h b/EXTERNAL_HEADERS/Availability.h index 79b5894b4..4875d6171 100644 --- a/EXTERNAL_HEADERS/Availability.h +++ b/EXTERNAL_HEADERS/Availability.h @@ -130,36 +130,43 @@ #define __MAC_10_10_2 101002 #define __MAC_10_10_3 101003 #define __MAC_10_11 101100 +#define __MAC_10_12 101200 /* __MAC_NA is not defined to a value but is uses as a token by macros to indicate that the API is unavailable */ -#define __IPHONE_2_0 20000 -#define __IPHONE_2_1 20100 -#define __IPHONE_2_2 20200 -#define __IPHONE_3_0 30000 -#define __IPHONE_3_1 30100 -#define __IPHONE_3_2 30200 -#define __IPHONE_4_0 40000 -#define __IPHONE_4_1 40100 -#define __IPHONE_4_2 40200 -#define __IPHONE_4_3 40300 -#define __IPHONE_5_0 50000 -#define __IPHONE_5_1 50100 -#define __IPHONE_6_0 60000 -#define __IPHONE_6_1 60100 -#define __IPHONE_7_0 70000 -#define __IPHONE_7_1 70100 -#define __IPHONE_8_0 80000 -#define __IPHONE_8_1 80100 
-#define __IPHONE_8_2 80200 -#define __IPHONE_8_3 80300 -#define __IPHONE_8_4 80400 -#define __IPHONE_9_0 90000 +#define __IPHONE_2_0 20000 +#define __IPHONE_2_1 20100 +#define __IPHONE_2_2 20200 +#define __IPHONE_3_0 30000 +#define __IPHONE_3_1 30100 +#define __IPHONE_3_2 30200 +#define __IPHONE_4_0 40000 +#define __IPHONE_4_1 40100 +#define __IPHONE_4_2 40200 +#define __IPHONE_4_3 40300 +#define __IPHONE_5_0 50000 +#define __IPHONE_5_1 50100 +#define __IPHONE_6_0 60000 +#define __IPHONE_6_1 60100 +#define __IPHONE_7_0 70000 +#define __IPHONE_7_1 70100 +#define __IPHONE_8_0 80000 +#define __IPHONE_8_1 80100 +#define __IPHONE_8_2 80200 +#define __IPHONE_8_3 80300 +#define __IPHONE_8_4 80400 +#define __IPHONE_9_0 90000 +#define __IPHONE_9_1 90100 +#define __IPHONE_9_2 90200 +#define __IPHONE_10_0 100000 /* __IPHONE_NA is not defined to a value but is uses as a token by macros to indicate that the API is unavailable */ -#define __TVOS_9_0 90000 +#define __TVOS_9_0 90000 +#define __TVOS_9_2 90200 +#define __TVOS_10_0 100000 -#define __WATCHOS_1_0 10000 -#define __WATCHOS_2_0 20000 +#define __WATCHOS_1_0 10000 +#define __WATCHOS_2_0 20000 +#define __WATCHOS_3_0 30000 #include @@ -318,5 +325,8 @@ #define __WATCHOS_DEPRECATED(_start, _dep, _msg) #endif +#if __has_include() + #include +#endif #endif /* __AVAILABILITY__ */ diff --git a/EXTERNAL_HEADERS/AvailabilityInternal.h b/EXTERNAL_HEADERS/AvailabilityInternal.h index 81bbd59b7..7e71e6145 100644 --- a/EXTERNAL_HEADERS/AvailabilityInternal.h +++ b/EXTERNAL_HEADERS/AvailabilityInternal.h @@ -80,7 +80,7 @@ #ifdef __IPHONE_OS_VERSION_MIN_REQUIRED /* make sure a default max version is set */ #ifndef __IPHONE_OS_VERSION_MAX_ALLOWED - #define __IPHONE_OS_VERSION_MAX_ALLOWED __IPHONE_9_0 + #define __IPHONE_OS_VERSION_MAX_ALLOWED __IPHONE_10_0 #endif /* make sure a valid min is set */ #if __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_0 @@ -224,6 +224,24 @@ #else #define 
__AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_0_MSG(_msg) __attribute__((availability(ios,introduced=2.0,deprecated=9.0))) #endif + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_1 __attribute__((availability(ios,introduced=2.0,deprecated=9.1))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_1_MSG(_msg) __attribute__((availability(ios,introduced=2.0,deprecated=9.1,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_1_MSG(_msg) __attribute__((availability(ios,introduced=2.0,deprecated=9.1))) + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_2 __attribute__((availability(ios,introduced=2.0,deprecated=9.2))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_2_MSG(_msg) __attribute__((availability(ios,introduced=2.0,deprecated=9.2,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_2_MSG(_msg) __attribute__((availability(ios,introduced=2.0,deprecated=9.2))) + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_10_0 __attribute__((availability(ios,introduced=2.0,deprecated=10.0))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_10_0_MSG(_msg) __attribute__((availability(ios,introduced=2.0,deprecated=10.0,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_10_0_MSG(_msg) __attribute__((availability(ios,introduced=2.0,deprecated=10.0))) + #endif #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_NA __attribute__((availability(ios,introduced=2.0))) #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_NA_MSG(_msg) __attribute__((availability(ios,introduced=2.0))) #define __AVAILABILITY_INTERNAL__IPHONE_2_1 __attribute__((availability(ios,introduced=2.1))) @@ -353,6 +371,24 @@ #else #define 
__AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_0_MSG(_msg) __attribute__((availability(ios,introduced=2.1,deprecated=9.0))) #endif + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_1 __attribute__((availability(ios,introduced=2.1,deprecated=9.1))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_1_MSG(_msg) __attribute__((availability(ios,introduced=2.1,deprecated=9.1,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_1_MSG(_msg) __attribute__((availability(ios,introduced=2.1,deprecated=9.1))) + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_2 __attribute__((availability(ios,introduced=2.1,deprecated=9.2))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_2_MSG(_msg) __attribute__((availability(ios,introduced=2.1,deprecated=9.2,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_2_MSG(_msg) __attribute__((availability(ios,introduced=2.1,deprecated=9.2))) + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_10_0 __attribute__((availability(ios,introduced=2.1,deprecated=10.0))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_10_0_MSG(_msg) __attribute__((availability(ios,introduced=2.1,deprecated=10.0,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_10_0_MSG(_msg) __attribute__((availability(ios,introduced=2.1,deprecated=10.0))) + #endif #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_NA __attribute__((availability(ios,introduced=2.1))) #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_NA_MSG(_msg) __attribute__((availability(ios,introduced=2.1))) #define __AVAILABILITY_INTERNAL__IPHONE_2_2 __attribute__((availability(ios,introduced=2.2))) @@ -476,6 +512,24 @@ #else #define 
__AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_0_MSG(_msg) __attribute__((availability(ios,introduced=2.2,deprecated=9.0))) #endif + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_1 __attribute__((availability(ios,introduced=2.2,deprecated=9.1))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_1_MSG(_msg) __attribute__((availability(ios,introduced=2.2,deprecated=9.1,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_1_MSG(_msg) __attribute__((availability(ios,introduced=2.2,deprecated=9.1))) + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_2 __attribute__((availability(ios,introduced=2.2,deprecated=9.2))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_2_MSG(_msg) __attribute__((availability(ios,introduced=2.2,deprecated=9.2,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_2_MSG(_msg) __attribute__((availability(ios,introduced=2.2,deprecated=9.2))) + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_10_0 __attribute__((availability(ios,introduced=2.2,deprecated=10.0))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_10_0_MSG(_msg) __attribute__((availability(ios,introduced=2.2,deprecated=10.0,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_10_0_MSG(_msg) __attribute__((availability(ios,introduced=2.2,deprecated=10.0))) + #endif #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_NA __attribute__((availability(ios,introduced=2.2))) #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_NA_MSG(_msg) __attribute__((availability(ios,introduced=2.2))) #define __AVAILABILITY_INTERNAL__IPHONE_3_0 __attribute__((availability(ios,introduced=3.0))) @@ -593,6 +647,24 @@ #else #define 
__AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_0_MSG(_msg) __attribute__((availability(ios,introduced=3.0,deprecated=9.0))) #endif + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_1 __attribute__((availability(ios,introduced=3.0,deprecated=9.1))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_1_MSG(_msg) __attribute__((availability(ios,introduced=3.0,deprecated=9.1,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_1_MSG(_msg) __attribute__((availability(ios,introduced=3.0,deprecated=9.1))) + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_2 __attribute__((availability(ios,introduced=3.0,deprecated=9.2))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_2_MSG(_msg) __attribute__((availability(ios,introduced=3.0,deprecated=9.2,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_2_MSG(_msg) __attribute__((availability(ios,introduced=3.0,deprecated=9.2))) + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_10_0 __attribute__((availability(ios,introduced=3.0,deprecated=10.0))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_10_0_MSG(_msg) __attribute__((availability(ios,introduced=3.0,deprecated=10.0,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_10_0_MSG(_msg) __attribute__((availability(ios,introduced=3.0,deprecated=10.0))) + #endif #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_NA __attribute__((availability(ios,introduced=3.0))) #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_NA_MSG(_msg) __attribute__((availability(ios,introduced=3.0))) #define __AVAILABILITY_INTERNAL__IPHONE_3_1 __attribute__((availability(ios,introduced=3.1))) @@ -704,6 +776,24 @@ #else #define 
__AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_0_MSG(_msg) __attribute__((availability(ios,introduced=3.1,deprecated=9.0))) #endif + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_1 __attribute__((availability(ios,introduced=3.1,deprecated=9.1))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_1_MSG(_msg) __attribute__((availability(ios,introduced=3.1,deprecated=9.1,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_1_MSG(_msg) __attribute__((availability(ios,introduced=3.1,deprecated=9.1))) + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_2 __attribute__((availability(ios,introduced=3.1,deprecated=9.2))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_2_MSG(_msg) __attribute__((availability(ios,introduced=3.1,deprecated=9.2,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_2_MSG(_msg) __attribute__((availability(ios,introduced=3.1,deprecated=9.2))) + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_10_0 __attribute__((availability(ios,introduced=3.1,deprecated=10.0))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_10_0_MSG(_msg) __attribute__((availability(ios,introduced=3.1,deprecated=10.0,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_10_0_MSG(_msg) __attribute__((availability(ios,introduced=3.1,deprecated=10.0))) + #endif #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_NA __attribute__((availability(ios,introduced=3.1))) #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_NA_MSG(_msg) __attribute__((availability(ios,introduced=3.1))) #define __AVAILABILITY_INTERNAL__IPHONE_3_2 __attribute__((availability(ios,introduced=3.2))) @@ -809,6 +899,24 @@ #else #define 
__AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_0_MSG(_msg) __attribute__((availability(ios,introduced=3.2,deprecated=9.0))) #endif + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_1 __attribute__((availability(ios,introduced=3.2,deprecated=9.1))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_1_MSG(_msg) __attribute__((availability(ios,introduced=3.2,deprecated=9.1,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_1_MSG(_msg) __attribute__((availability(ios,introduced=3.2,deprecated=9.1))) + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_2 __attribute__((availability(ios,introduced=3.2,deprecated=9.2))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_2_MSG(_msg) __attribute__((availability(ios,introduced=3.2,deprecated=9.2,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_2_MSG(_msg) __attribute__((availability(ios,introduced=3.2,deprecated=9.2))) + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_10_0 __attribute__((availability(ios,introduced=3.2,deprecated=10.0))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_10_0_MSG(_msg) __attribute__((availability(ios,introduced=3.2,deprecated=10.0,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_10_0_MSG(_msg) __attribute__((availability(ios,introduced=3.2,deprecated=10.0))) + #endif #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_NA __attribute__((availability(ios,introduced=3.2))) #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_NA_MSG(_msg) __attribute__((availability(ios,introduced=3.2))) #define __AVAILABILITY_INTERNAL__IPHONE_4_0 __attribute__((availability(ios,introduced=4.0))) @@ -908,6 +1016,24 @@ #else #define 
__AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_0_MSG(_msg) __attribute__((availability(ios,introduced=4.0,deprecated=9.0))) #endif + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_1 __attribute__((availability(ios,introduced=4.0,deprecated=9.1))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_1_MSG(_msg) __attribute__((availability(ios,introduced=4.0,deprecated=9.1,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_1_MSG(_msg) __attribute__((availability(ios,introduced=4.0,deprecated=9.1))) + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_2 __attribute__((availability(ios,introduced=4.0,deprecated=9.2))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_2_MSG(_msg) __attribute__((availability(ios,introduced=4.0,deprecated=9.2,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_2_MSG(_msg) __attribute__((availability(ios,introduced=4.0,deprecated=9.2))) + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_10_0 __attribute__((availability(ios,introduced=4.0,deprecated=10.0))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_10_0_MSG(_msg) __attribute__((availability(ios,introduced=4.0,deprecated=10.0,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_10_0_MSG(_msg) __attribute__((availability(ios,introduced=4.0,deprecated=10.0))) + #endif #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_NA __attribute__((availability(ios,introduced=4.0))) #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_NA_MSG(_msg) __attribute__((availability(ios,introduced=4.0))) #define __AVAILABILITY_INTERNAL__IPHONE_4_1 __attribute__((availability(ios,introduced=4.1))) @@ -1001,6 +1127,24 @@ #else #define 
__AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_0_MSG(_msg) __attribute__((availability(ios,introduced=4.1,deprecated=9.0))) #endif + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_1 __attribute__((availability(ios,introduced=4.1,deprecated=9.1))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_1_MSG(_msg) __attribute__((availability(ios,introduced=4.1,deprecated=9.1,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_1_MSG(_msg) __attribute__((availability(ios,introduced=4.1,deprecated=9.1))) + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_2 __attribute__((availability(ios,introduced=4.1,deprecated=9.2))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_2_MSG(_msg) __attribute__((availability(ios,introduced=4.1,deprecated=9.2,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_2_MSG(_msg) __attribute__((availability(ios,introduced=4.1,deprecated=9.2))) + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_10_0 __attribute__((availability(ios,introduced=4.1,deprecated=10.0))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_10_0_MSG(_msg) __attribute__((availability(ios,introduced=4.1,deprecated=10.0,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_10_0_MSG(_msg) __attribute__((availability(ios,introduced=4.1,deprecated=10.0))) + #endif #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_NA __attribute__((availability(ios,introduced=4.1))) #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_NA_MSG(_msg) __attribute__((availability(ios,introduced=4.1))) #define __AVAILABILITY_INTERNAL__IPHONE_4_2 __attribute__((availability(ios,introduced=4.2))) @@ -1088,6 +1232,24 @@ #else #define 
__AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_0_MSG(_msg) __attribute__((availability(ios,introduced=4.2,deprecated=9.0))) #endif + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_1 __attribute__((availability(ios,introduced=4.2,deprecated=9.1))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_1_MSG(_msg) __attribute__((availability(ios,introduced=4.2,deprecated=9.1,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_1_MSG(_msg) __attribute__((availability(ios,introduced=4.2,deprecated=9.1))) + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_2 __attribute__((availability(ios,introduced=4.2,deprecated=9.2))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_2_MSG(_msg) __attribute__((availability(ios,introduced=4.2,deprecated=9.2,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_2_MSG(_msg) __attribute__((availability(ios,introduced=4.2,deprecated=9.2))) + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_10_0 __attribute__((availability(ios,introduced=4.2,deprecated=10.0))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_10_0_MSG(_msg) __attribute__((availability(ios,introduced=4.2,deprecated=10.0,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_10_0_MSG(_msg) __attribute__((availability(ios,introduced=4.2,deprecated=10.0))) + #endif #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_NA __attribute__((availability(ios,introduced=4.2))) #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_NA_MSG(_msg) __attribute__((availability(ios,introduced=4.2))) #define __AVAILABILITY_INTERNAL__IPHONE_4_3 __attribute__((availability(ios,introduced=4.3))) @@ -1169,6 +1331,24 @@ #else #define 
__AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_0_MSG(_msg) __attribute__((availability(ios,introduced=4.3,deprecated=9.0))) #endif + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_1 __attribute__((availability(ios,introduced=4.3,deprecated=9.1))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_1_MSG(_msg) __attribute__((availability(ios,introduced=4.3,deprecated=9.1,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_1_MSG(_msg) __attribute__((availability(ios,introduced=4.3,deprecated=9.1))) + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_2 __attribute__((availability(ios,introduced=4.3,deprecated=9.2))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_2_MSG(_msg) __attribute__((availability(ios,introduced=4.3,deprecated=9.2,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_2_MSG(_msg) __attribute__((availability(ios,introduced=4.3,deprecated=9.2))) + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_10_0 __attribute__((availability(ios,introduced=4.3,deprecated=10.0))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_10_0_MSG(_msg) __attribute__((availability(ios,introduced=4.3,deprecated=10.0,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_10_0_MSG(_msg) __attribute__((availability(ios,introduced=4.3,deprecated=10.0))) + #endif #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_NA __attribute__((availability(ios,introduced=4.3))) #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_NA_MSG(_msg) __attribute__((availability(ios,introduced=4.3))) #define __AVAILABILITY_INTERNAL__IPHONE_5_0 __attribute__((availability(ios,introduced=5.0))) @@ -1244,6 +1424,24 @@ #else #define 
__AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_0_MSG(_msg) __attribute__((availability(ios,introduced=5.0,deprecated=9.0))) #endif + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_1 __attribute__((availability(ios,introduced=5.0,deprecated=9.1))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_1_MSG(_msg) __attribute__((availability(ios,introduced=5.0,deprecated=9.1,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_1_MSG(_msg) __attribute__((availability(ios,introduced=5.0,deprecated=9.1))) + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_2 __attribute__((availability(ios,introduced=5.0,deprecated=9.2))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_2_MSG(_msg) __attribute__((availability(ios,introduced=5.0,deprecated=9.2,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_2_MSG(_msg) __attribute__((availability(ios,introduced=5.0,deprecated=9.2))) + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_10_0 __attribute__((availability(ios,introduced=5.0,deprecated=10.0))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_10_0_MSG(_msg) __attribute__((availability(ios,introduced=5.0,deprecated=10.0,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_10_0_MSG(_msg) __attribute__((availability(ios,introduced=5.0,deprecated=10.0))) + #endif #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_NA __attribute__((availability(ios,introduced=5.0))) #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_NA_MSG(_msg) __attribute__((availability(ios,introduced=5.0))) #define __AVAILABILITY_INTERNAL__IPHONE_5_1 __attribute__((availability(ios,introduced=5.1))) @@ -1313,6 +1511,24 @@ #else #define 
__AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_0_MSG(_msg) __attribute__((availability(ios,introduced=5.1,deprecated=9.0))) #endif + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_1 __attribute__((availability(ios,introduced=5.1,deprecated=9.1))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_1_MSG(_msg) __attribute__((availability(ios,introduced=5.1,deprecated=9.1,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_1_MSG(_msg) __attribute__((availability(ios,introduced=5.1,deprecated=9.1))) + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_2 __attribute__((availability(ios,introduced=5.1,deprecated=9.2))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_2_MSG(_msg) __attribute__((availability(ios,introduced=5.1,deprecated=9.2,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_2_MSG(_msg) __attribute__((availability(ios,introduced=5.1,deprecated=9.2))) + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_10_0 __attribute__((availability(ios,introduced=5.1,deprecated=10.0))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_10_0_MSG(_msg) __attribute__((availability(ios,introduced=5.1,deprecated=10.0,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_10_0_MSG(_msg) __attribute__((availability(ios,introduced=5.1,deprecated=10.0))) + #endif #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_NA __attribute__((availability(ios,introduced=5.1))) #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_NA_MSG(_msg) __attribute__((availability(ios,introduced=5.1))) #define __AVAILABILITY_INTERNAL__IPHONE_6_0 __attribute__((availability(ios,introduced=6.0))) @@ -1376,6 +1592,24 @@ #else #define 
__AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_0_MSG(_msg) __attribute__((availability(ios,introduced=6.0,deprecated=9.0))) #endif + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_1 __attribute__((availability(ios,introduced=6.0,deprecated=9.1))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_1_MSG(_msg) __attribute__((availability(ios,introduced=6.0,deprecated=9.1,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_1_MSG(_msg) __attribute__((availability(ios,introduced=6.0,deprecated=9.1))) + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_2 __attribute__((availability(ios,introduced=6.0,deprecated=9.2))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_2_MSG(_msg) __attribute__((availability(ios,introduced=6.0,deprecated=9.2,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_2_MSG(_msg) __attribute__((availability(ios,introduced=6.0,deprecated=9.2))) + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_10_0 __attribute__((availability(ios,introduced=6.0,deprecated=10.0))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_10_0_MSG(_msg) __attribute__((availability(ios,introduced=6.0,deprecated=10.0,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_10_0_MSG(_msg) __attribute__((availability(ios,introduced=6.0,deprecated=10.0))) + #endif #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_NA __attribute__((availability(ios,introduced=6.0))) #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_NA_MSG(_msg) __attribute__((availability(ios,introduced=6.0))) #define __AVAILABILITY_INTERNAL__IPHONE_6_1 __attribute__((availability(ios,introduced=6.1))) @@ -1433,6 +1667,24 @@ #else #define 
__AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_0_MSG(_msg) __attribute__((availability(ios,introduced=6.1,deprecated=9.0))) #endif + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_1 __attribute__((availability(ios,introduced=6.1,deprecated=9.1))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_1_MSG(_msg) __attribute__((availability(ios,introduced=6.1,deprecated=9.1,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_1_MSG(_msg) __attribute__((availability(ios,introduced=6.1,deprecated=9.1))) + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_2 __attribute__((availability(ios,introduced=6.1,deprecated=9.2))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_2_MSG(_msg) __attribute__((availability(ios,introduced=6.1,deprecated=9.2,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_2_MSG(_msg) __attribute__((availability(ios,introduced=6.1,deprecated=9.2))) + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_10_0 __attribute__((availability(ios,introduced=6.1,deprecated=10.0))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_10_0_MSG(_msg) __attribute__((availability(ios,introduced=6.1,deprecated=10.0,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_10_0_MSG(_msg) __attribute__((availability(ios,introduced=6.1,deprecated=10.0))) + #endif #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_NA __attribute__((availability(ios,introduced=6.1))) #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_NA_MSG(_msg) __attribute__((availability(ios,introduced=6.1))) #define __AVAILABILITY_INTERNAL__IPHONE_7_0 __attribute__((availability(ios,introduced=7.0))) @@ -1484,6 +1736,24 @@ #else #define 
__AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_0_MSG(_msg) __attribute__((availability(ios,introduced=7.0,deprecated=9.0))) #endif + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_1 __attribute__((availability(ios,introduced=7.0,deprecated=9.1))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_1_MSG(_msg) __attribute__((availability(ios,introduced=7.0,deprecated=9.1,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_1_MSG(_msg) __attribute__((availability(ios,introduced=7.0,deprecated=9.1))) + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_2 __attribute__((availability(ios,introduced=7.0,deprecated=9.2))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_2_MSG(_msg) __attribute__((availability(ios,introduced=7.0,deprecated=9.2,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_2_MSG(_msg) __attribute__((availability(ios,introduced=7.0,deprecated=9.2))) + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_10_0 __attribute__((availability(ios,introduced=7.0,deprecated=10.0))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_10_0_MSG(_msg) __attribute__((availability(ios,introduced=7.0,deprecated=10.0,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_10_0_MSG(_msg) __attribute__((availability(ios,introduced=7.0,deprecated=10.0))) + #endif #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_NA __attribute__((availability(ios,introduced=7.0))) #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_NA_MSG(_msg) __attribute__((availability(ios,introduced=7.0))) #define __AVAILABILITY_INTERNAL__IPHONE_7_1 __attribute__((availability(ios,introduced=7.1))) @@ -1529,6 +1799,24 @@ #else #define 
__AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_0_MSG(_msg) __attribute__((availability(ios,introduced=7.1,deprecated=9.0))) #endif + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_1 __attribute__((availability(ios,introduced=7.1,deprecated=9.1))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_1_MSG(_msg) __attribute__((availability(ios,introduced=7.1,deprecated=9.1,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_1_MSG(_msg) __attribute__((availability(ios,introduced=7.1,deprecated=9.1))) + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_2 __attribute__((availability(ios,introduced=7.1,deprecated=9.2))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_2_MSG(_msg) __attribute__((availability(ios,introduced=7.1,deprecated=9.2,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_2_MSG(_msg) __attribute__((availability(ios,introduced=7.1,deprecated=9.2))) + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_10_0 __attribute__((availability(ios,introduced=7.1,deprecated=10.0))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_10_0_MSG(_msg) __attribute__((availability(ios,introduced=7.1,deprecated=10.0,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_10_0_MSG(_msg) __attribute__((availability(ios,introduced=7.1,deprecated=10.0))) + #endif #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_NA __attribute__((availability(ios,introduced=7.1))) #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_NA_MSG(_msg) __attribute__((availability(ios,introduced=7.1))) #define __AVAILABILITY_INTERNAL__IPHONE_8_0 __attribute__((availability(ios,introduced=8.0))) @@ -1568,6 +1856,24 @@ #else #define 
__AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_0_MSG(_msg) __attribute__((availability(ios,introduced=8.0,deprecated=9.0))) #endif + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_1 __attribute__((availability(ios,introduced=8.0,deprecated=9.1))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_1_MSG(_msg) __attribute__((availability(ios,introduced=8.0,deprecated=9.1,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_1_MSG(_msg) __attribute__((availability(ios,introduced=8.0,deprecated=9.1))) + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_2 __attribute__((availability(ios,introduced=8.0,deprecated=9.2))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_2_MSG(_msg) __attribute__((availability(ios,introduced=8.0,deprecated=9.2,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_2_MSG(_msg) __attribute__((availability(ios,introduced=8.0,deprecated=9.2))) + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_10_0 __attribute__((availability(ios,introduced=8.0,deprecated=10.0))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_10_0_MSG(_msg) __attribute__((availability(ios,introduced=8.0,deprecated=10.0,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_10_0_MSG(_msg) __attribute__((availability(ios,introduced=8.0,deprecated=10.0))) + #endif #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_NA __attribute__((availability(ios,introduced=8.0))) #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_NA_MSG(_msg) __attribute__((availability(ios,introduced=8.0))) #define __AVAILABILITY_INTERNAL__IPHONE_8_1 __attribute__((availability(ios,introduced=8.1))) @@ -1601,6 +1907,24 @@ #else #define 
__AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_0_MSG(_msg) __attribute__((availability(ios,introduced=8.1,deprecated=9.0))) #endif + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_1 __attribute__((availability(ios,introduced=8.1,deprecated=9.1))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_1_MSG(_msg) __attribute__((availability(ios,introduced=8.1,deprecated=9.1,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_1_MSG(_msg) __attribute__((availability(ios,introduced=8.1,deprecated=9.1))) + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_2 __attribute__((availability(ios,introduced=8.1,deprecated=9.2))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_2_MSG(_msg) __attribute__((availability(ios,introduced=8.1,deprecated=9.2,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_2_MSG(_msg) __attribute__((availability(ios,introduced=8.1,deprecated=9.2))) + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_10_0 __attribute__((availability(ios,introduced=8.1,deprecated=10.0))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_10_0_MSG(_msg) __attribute__((availability(ios,introduced=8.1,deprecated=10.0,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_10_0_MSG(_msg) __attribute__((availability(ios,introduced=8.1,deprecated=10.0))) + #endif #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_NA __attribute__((availability(ios,introduced=8.1))) #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_NA_MSG(_msg) __attribute__((availability(ios,introduced=8.1))) #define __AVAILABILITY_INTERNAL__IPHONE_8_2 __attribute__((availability(ios,introduced=8.2))) @@ -1628,6 +1952,24 @@ #else #define 
__AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_0_MSG(_msg) __attribute__((availability(ios,introduced=8.2,deprecated=9.0))) #endif + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_1 __attribute__((availability(ios,introduced=8.2,deprecated=9.1))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_1_MSG(_msg) __attribute__((availability(ios,introduced=8.2,deprecated=9.1,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_1_MSG(_msg) __attribute__((availability(ios,introduced=8.2,deprecated=9.1))) + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_2 __attribute__((availability(ios,introduced=8.2,deprecated=9.2))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_2_MSG(_msg) __attribute__((availability(ios,introduced=8.2,deprecated=9.2,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_2_MSG(_msg) __attribute__((availability(ios,introduced=8.2,deprecated=9.2))) + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_10_0 __attribute__((availability(ios,introduced=8.2,deprecated=10.0))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_10_0_MSG(_msg) __attribute__((availability(ios,introduced=8.2,deprecated=10.0,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_10_0_MSG(_msg) __attribute__((availability(ios,introduced=8.2,deprecated=10.0))) + #endif #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_NA __attribute__((availability(ios,introduced=8.2))) #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_NA_MSG(_msg) __attribute__((availability(ios,introduced=8.2))) #define __AVAILABILITY_INTERNAL__IPHONE_8_3 __attribute__((availability(ios,introduced=8.3))) @@ -1649,6 +1991,24 @@ #else #define 
__AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_0_MSG(_msg) __attribute__((availability(ios,introduced=8.3,deprecated=9.0))) #endif + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_1 __attribute__((availability(ios,introduced=8.3,deprecated=9.1))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_1_MSG(_msg) __attribute__((availability(ios,introduced=8.3,deprecated=9.1,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_1_MSG(_msg) __attribute__((availability(ios,introduced=8.3,deprecated=9.1))) + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_2 __attribute__((availability(ios,introduced=8.3,deprecated=9.2))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_2_MSG(_msg) __attribute__((availability(ios,introduced=8.3,deprecated=9.2,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_2_MSG(_msg) __attribute__((availability(ios,introduced=8.3,deprecated=9.2))) + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_10_0 __attribute__((availability(ios,introduced=8.3,deprecated=10.0))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_10_0_MSG(_msg) __attribute__((availability(ios,introduced=8.3,deprecated=10.0,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_10_0_MSG(_msg) __attribute__((availability(ios,introduced=8.3,deprecated=10.0))) + #endif #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_NA __attribute__((availability(ios,introduced=8.3))) #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_NA_MSG(_msg) __attribute__((availability(ios,introduced=8.3))) #define __AVAILABILITY_INTERNAL__IPHONE_8_4 __attribute__((availability(ios,introduced=8.4))) @@ -1664,6 +2024,24 @@ #else #define 
__AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_0_MSG(_msg) __attribute__((availability(ios,introduced=8.4,deprecated=9.0))) #endif + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_1 __attribute__((availability(ios,introduced=8.4,deprecated=9.1))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_1_MSG(_msg) __attribute__((availability(ios,introduced=8.4,deprecated=9.1,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_1_MSG(_msg) __attribute__((availability(ios,introduced=8.4,deprecated=9.1))) + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_2 __attribute__((availability(ios,introduced=8.4,deprecated=9.2))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_2_MSG(_msg) __attribute__((availability(ios,introduced=8.4,deprecated=9.2,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_2_MSG(_msg) __attribute__((availability(ios,introduced=8.4,deprecated=9.2))) + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_10_0 __attribute__((availability(ios,introduced=8.4,deprecated=10.0))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_10_0_MSG(_msg) __attribute__((availability(ios,introduced=8.4,deprecated=10.0,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_10_0_MSG(_msg) __attribute__((availability(ios,introduced=8.4,deprecated=10.0))) + #endif #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_NA __attribute__((availability(ios,introduced=8.4))) #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_NA_MSG(_msg) __attribute__((availability(ios,introduced=8.4))) #define __AVAILABILITY_INTERNAL__IPHONE_9_0 __attribute__((availability(ios,introduced=9.0))) @@ -1673,8 +2051,71 @@ #else #define 
__AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_0_MSG(_msg) __attribute__((availability(ios,introduced=9.0,deprecated=9.0))) #endif + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_1 __attribute__((availability(ios,introduced=9.0,deprecated=9.1))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_1_MSG(_msg) __attribute__((availability(ios,introduced=9.0,deprecated=9.1,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_1_MSG(_msg) __attribute__((availability(ios,introduced=9.0,deprecated=9.1))) + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_2 __attribute__((availability(ios,introduced=9.0,deprecated=9.2))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_2_MSG(_msg) __attribute__((availability(ios,introduced=9.0,deprecated=9.2,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_2_MSG(_msg) __attribute__((availability(ios,introduced=9.0,deprecated=9.2))) + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_10_0 __attribute__((availability(ios,introduced=9.0,deprecated=10.0))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_10_0_MSG(_msg) __attribute__((availability(ios,introduced=9.0,deprecated=10.0,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_10_0_MSG(_msg) __attribute__((availability(ios,introduced=9.0,deprecated=10.0))) + #endif #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_NA __attribute__((availability(ios,introduced=9.0))) #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_NA_MSG(_msg) __attribute__((availability(ios,introduced=9.0))) + #define __AVAILABILITY_INTERNAL__IPHONE_9_1 __attribute__((availability(ios,introduced=9.1))) + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_1 
__attribute__((availability(ios,introduced=9.1,deprecated=9.1))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_1_MSG(_msg) __attribute__((availability(ios,introduced=9.1,deprecated=9.1,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_1_MSG(_msg) __attribute__((availability(ios,introduced=9.1,deprecated=9.1))) + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_2 __attribute__((availability(ios,introduced=9.1,deprecated=9.2))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_2_MSG(_msg) __attribute__((availability(ios,introduced=9.1,deprecated=9.2,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_2_MSG(_msg) __attribute__((availability(ios,introduced=9.1,deprecated=9.2))) + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_10_0 __attribute__((availability(ios,introduced=9.1,deprecated=10.0))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_10_0_MSG(_msg) __attribute__((availability(ios,introduced=9.1,deprecated=10.0,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_10_0_MSG(_msg) __attribute__((availability(ios,introduced=9.1,deprecated=10.0))) + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_NA __attribute__((availability(ios,introduced=9.1))) + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_NA_MSG(_msg) __attribute__((availability(ios,introduced=9.1))) + #define __AVAILABILITY_INTERNAL__IPHONE_9_2 __attribute__((availability(ios,introduced=9.2))) + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_9_2 __attribute__((availability(ios,introduced=9.2,deprecated=9.2))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_9_2_MSG(_msg) 
__attribute__((availability(ios,introduced=9.2,deprecated=9.2,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_9_2_MSG(_msg) __attribute__((availability(ios,introduced=9.2,deprecated=9.2))) + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_10_0 __attribute__((availability(ios,introduced=9.2,deprecated=10.0))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_10_0_MSG(_msg) __attribute__((availability(ios,introduced=9.2,deprecated=10.0,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_10_0_MSG(_msg) __attribute__((availability(ios,introduced=9.2,deprecated=10.0))) + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_NA __attribute__((availability(ios,introduced=9.2))) + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_NA_MSG(_msg) __attribute__((availability(ios,introduced=9.2))) + #define __AVAILABILITY_INTERNAL__IPHONE_10_0 __attribute__((availability(ios,introduced=10.0))) + #define __AVAILABILITY_INTERNAL__IPHONE_10_0_DEP__IPHONE_10_0 __attribute__((availability(ios,introduced=10.0,deprecated=10.0))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__IPHONE_10_0_DEP__IPHONE_10_0_MSG(_msg) __attribute__((availability(ios,introduced=10.0,deprecated=10.0,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__IPHONE_10_0_DEP__IPHONE_10_0_MSG(_msg) __attribute__((availability(ios,introduced=10.0,deprecated=10.0))) + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_10_0_DEP__IPHONE_NA __attribute__((availability(ios,introduced=10.0))) + #define __AVAILABILITY_INTERNAL__IPHONE_10_0_DEP__IPHONE_NA_MSG(_msg) __attribute__((availability(ios,introduced=10.0))) #define __AVAILABILITY_INTERNAL__IPHONE_NA __attribute__((availability(ios,unavailable))) #define __AVAILABILITY_INTERNAL__IPHONE_NA_DEP__IPHONE_NA __attribute__((availability(ios,unavailable))) #define 
__AVAILABILITY_INTERNAL__IPHONE_NA_DEP__IPHONE_NA_MSG(_msg) __attribute__((availability(ios,unavailable))) @@ -9765,6 +10206,3571 @@ #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_0 __AVAILABILITY_INTERNAL_DEPRECATED #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_0_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) #endif + /* set up old style internal macros (up to 9.1) */ + #if __IPHONE_OS_VERSION_MAX_ALLOWED < __IPHONE_9_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1 __AVAILABILITY_INTERNAL_UNAVAILABLE + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_9_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1 __AVAILABILITY_INTERNAL_WEAK_IMPORT + #else + #define __AVAILABILITY_INTERNAL__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_NA __AVAILABILITY_INTERNAL__IPHONE_9_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_NA_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_9_1 + #if __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define 
__AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define 
__AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + 
#define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define 
__AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define 
__AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define 
__AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define 
__AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_0 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + 
#define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define 
__AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define 
__AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_1 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define 
__AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define 
__AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_2 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define 
__AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_1 
__AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_0 + 
#elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define 
__AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define 
__AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_1 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define 
__AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_1 
__AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_2 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define 
__AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define 
__AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define 
__AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_3 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define 
__AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_1 
__AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_0 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_1_MSG(_msg) 
__AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_1_MSG(_msg) 
__AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_1 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_6_0 + #define 
__AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define 
__AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_1_MSG(_msg) 
__AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_0 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + 
#define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_1 
__AVAILABILITY_INTERNAL__IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_1 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_7_0 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_1_MSG(_msg) 
__AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_1_MSG(_msg) 
__AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_7_0 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_7_0 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_7_0 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_7_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_7_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_7_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_7_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_7_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_7_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_7_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_7_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_7_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_7_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_7_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_7_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_7_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_7_0 + #define 
__AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_7_0 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_7_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_1 
__AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_7_1 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_7_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_7_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_7_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_7_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_7_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_7_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_7_1 + #define 
__AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_7_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_7_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_7_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_7_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_7_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_7_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_7_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_7_1 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_8_0 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define 
__AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define 
__AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_8_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_8_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_8_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_8_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_8_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_8_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_8_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_0 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_8_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define 
__AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define 
__AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_8_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_8_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_8_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_8_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_8_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_1_MSG(_msg) 
__AVAILABILITY_INTERNAL__IPHONE_8_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_8_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_1 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_8_2 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define 
__AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_8_2 + #define 
__AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_8_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_8_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_8_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_8_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_2 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_8_3 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define 
__AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define 
__AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_8_3 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_3 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_8_3 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_3 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_8_3 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_3 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_8_3 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_3 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_8_4 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define 
__AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define 
__AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_8_4 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_4 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_8_4 + #define 
__AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_4 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_8_4 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_4 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_9_0 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define 
__AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define 
__AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_9_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_9_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_9_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_9_0 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_9_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define 
__AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define 
__AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL__IPHONE_9_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_9_1 + #else + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_1 
__AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_DEPRECATED + #define 
__AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_1_MSG(_msg) 
__AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_1 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_1_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #endif + /* set up old style internal macros (up to 9.2) */ + #if __IPHONE_OS_VERSION_MAX_ALLOWED < __IPHONE_9_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2 __AVAILABILITY_INTERNAL_UNAVAILABLE + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_9_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2 __AVAILABILITY_INTERNAL_WEAK_IMPORT + #else + #define __AVAILABILITY_INTERNAL__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_NA __AVAILABILITY_INTERNAL__IPHONE_9_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_NA_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_9_2 + #if __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define 
__AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define 
__AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define 
__AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define 
__AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define 
__AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define 
__AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define 
__AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define 
__AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_0 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define 
__AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define 
__AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_1 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + 
#define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_2_MSG(_msg) 
__AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define 
__AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_2 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define 
__AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define 
__AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_0 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + 
#define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_2_MSG(_msg) 
__AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define 
__AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_1 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define 
__AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define 
__AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_2 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + 
#define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_2_MSG(_msg) 
__AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define 
__AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_3 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define 
__AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_2_MSG(_msg) 
__AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_0 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_2 
__AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_2 
__AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_1 + 
#define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_1 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define 
__AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_2 
__AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_0 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_2_MSG(_msg) 
__AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_2_MSG(_msg) 
__AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_6_1 + #define 
__AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_1 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_7_0 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define 
__AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_7_0 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_7_0 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_7_0 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_7_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_7_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_7_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_7_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_2_MSG(_msg) 
__AVAILABILITY_INTERNAL__IPHONE_7_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_7_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_7_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_7_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_7_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_7_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_7_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_7_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_7_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_7_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_7_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_7_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_7_0 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_7_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_2 
__AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_2 
__AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_7_1 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_7_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_7_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_7_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_7_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_7_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_7_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_7_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_7_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_7_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_7_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_7_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_7_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_7_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_7_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_7_1 + #define 
__AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_7_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_7_1 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_8_0 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define 
__AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_8_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_8_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_8_0 + #define 
__AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_8_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_8_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_8_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_8_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_8_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_0 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_8_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define 
__AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define 
__AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_8_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_8_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_8_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_8_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_8_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_8_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_8_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_9_2_MSG(_msg) 
__AVAILABILITY_INTERNAL__IPHONE_8_1 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_8_2 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define 
__AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_8_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_8_2 + #define 
__AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_8_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_8_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_8_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_8_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_2 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_8_3 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define 
__AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define 
__AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_8_3 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_3 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_8_3 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_3 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_8_3 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_3 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_8_3 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_3 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_8_3 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_3 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_8_4 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define 
__AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define 
__AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_8_4 + #define 
__AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_4 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_8_4 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_4 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_8_4 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_4 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_8_4 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_4 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_9_0 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define 
__AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define 
__AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_9_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_9_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_9_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_9_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_9_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_9_0 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_9_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define 
__AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define 
__AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define 
__AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_9_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_9_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_9_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_9_1 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_9_2 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define 
__AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define 
__AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL__IPHONE_9_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_9_2 + #else + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_9_2_MSG(_msg) 
__AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define 
__AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_2 
__AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_9_2 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_9_2_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #endif + /* set up old style internal macros (up to 10.0): __AVAILABILITY_INTERNAL__IPHONE_10_0 expands to __AVAILABILITY_INTERNAL_UNAVAILABLE when __IPHONE_OS_VERSION_MAX_ALLOWED is below __IPHONE_10_0, to __AVAILABILITY_INTERNAL_WEAK_IMPORT when only __IPHONE_OS_VERSION_MIN_REQUIRED is below __IPHONE_10_0, and to __AVAILABILITY_INTERNAL_REGULAR otherwise; the two _DEP__IPHONE_NA forms that follow (presumably "never deprecated" - confirm against Availability.h) expand to that same result, and the _MSG variant discards its _msg argument */ + #if __IPHONE_OS_VERSION_MAX_ALLOWED < __IPHONE_10_0 + #define __AVAILABILITY_INTERNAL__IPHONE_10_0 __AVAILABILITY_INTERNAL_UNAVAILABLE + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_10_0 + #define __AVAILABILITY_INTERNAL__IPHONE_10_0 __AVAILABILITY_INTERNAL_WEAK_IMPORT + #else + #define __AVAILABILITY_INTERNAL__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_10_0_DEP__IPHONE_NA __AVAILABILITY_INTERNAL__IPHONE_10_0 + #define __AVAILABILITY_INTERNAL__IPHONE_10_0_DEP__IPHONE_NA_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_10_0 + #if __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_2_1 +
#define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define 
__AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define 
__AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_10_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_10_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_1 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_10_0_MSG(_msg) 
__AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_10_0 
__AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_10_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_10_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_2_2 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define 
__AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define 
__AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define 
__AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_10_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_10_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_0 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + 
#define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define 
__AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define 
__AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_10_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_10_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_1 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define 
__AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define 
__AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_10_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_10_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_3_2 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_10_0 
__AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define 
__AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define 
__AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_10_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_10_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_0 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define 
__AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define 
__AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define 
__AVAILABILITY_INTERNAL__IPHONE_10_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_10_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_1 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define 
__AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define 
__AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_10_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_10_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_2 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR 
+ #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_10_0 
__AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_10_0_MSG(_msg) 
__AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_10_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_10_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_4_3 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_10_0 
__AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define 
__AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_10_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_10_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_0 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_10_0 
__AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_10_0 
__AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_10_0_MSG(_msg) 
__AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_10_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_10_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_5_1 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_10_0 
__AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define 
__AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define 
__AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_10_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_6_0 + #define __AVAILABILITY_INTERNAL__IPHONE_10_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_0 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define 
__AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_10_0_MSG(_msg) 
__AVAILABILITY_INTERNAL__IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_10_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_6_1 + #define __AVAILABILITY_INTERNAL__IPHONE_10_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_6_1 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_7_0 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define 
__AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define 
__AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_7_0 + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_7_0 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_7_0 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_7_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_7_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_7_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_7_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_7_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_7_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_7_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_7_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_7_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_7_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_7_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_7_0 + #define 
__AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_7_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_7_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_7_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_7_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_7_0 + #define __AVAILABILITY_INTERNAL__IPHONE_10_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_7_0 + #define __AVAILABILITY_INTERNAL__IPHONE_10_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_7_0 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_7_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define 
__AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_7_1 + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_7_1 + 
#define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_7_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_7_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_7_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_7_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_7_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_7_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_7_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_7_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_7_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_7_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_7_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_7_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_7_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_7_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_7_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_7_1 + #define __AVAILABILITY_INTERNAL__IPHONE_10_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_7_1 + #define __AVAILABILITY_INTERNAL__IPHONE_10_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_7_1 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_8_0 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_10_0 
__AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_10_0 
__AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_8_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_8_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_8_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_8_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_0 + #define 
__AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_8_0 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_8_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_8_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_8_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_0 + #define __AVAILABILITY_INTERNAL__IPHONE_10_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_8_0 + #define __AVAILABILITY_INTERNAL__IPHONE_10_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_0 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_8_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define 
__AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define 
__AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_8_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_8_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_8_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_8_1 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_8_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_8_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_8_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_1 + #define 
__AVAILABILITY_INTERNAL__IPHONE_10_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_8_1 + #define __AVAILABILITY_INTERNAL__IPHONE_10_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_1 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_8_2 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define 
__AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_8_2 + 
#define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_8_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_8_2 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_8_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_8_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_8_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_2 + #define __AVAILABILITY_INTERNAL__IPHONE_10_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_8_2 + #define __AVAILABILITY_INTERNAL__IPHONE_10_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_2 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_8_3 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_10_0_MSG(_msg) 
__AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_10_0_MSG(_msg) 
__AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_8_3 + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_3 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_8_3 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_3 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_8_3 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_3 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_8_3 + #define 
__AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_3 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_8_3 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_3 + #define __AVAILABILITY_INTERNAL__IPHONE_10_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_8_3 + #define __AVAILABILITY_INTERNAL__IPHONE_10_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_3 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_8_4 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define 
__AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define 
__AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_8_4 + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_4 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_8_4 + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_4 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_8_4 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_4 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_8_4 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_4 + #define __AVAILABILITY_INTERNAL__IPHONE_10_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_8_4 + #define __AVAILABILITY_INTERNAL__IPHONE_10_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_8_4 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_9_0 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define 
__AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define 
__AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_9_0 + 
#define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_9_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_9_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_9_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_9_0 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_9_0 + #define __AVAILABILITY_INTERNAL__IPHONE_10_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_9_0 + #define __AVAILABILITY_INTERNAL__IPHONE_10_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_9_0 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_9_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define 
__AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define 
__AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_9_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_9_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_9_1 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_9_1 + #define __AVAILABILITY_INTERNAL__IPHONE_10_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_9_1 + #define __AVAILABILITY_INTERNAL__IPHONE_10_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_9_1 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_9_2 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define 
__AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define 
__AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define 
__AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_9_2 + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_9_2 + #define __AVAILABILITY_INTERNAL__IPHONE_10_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_9_2 + #define __AVAILABILITY_INTERNAL__IPHONE_10_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL__IPHONE_9_2 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_10_0 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define 
__AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define 
__AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_10_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL__IPHONE_10_0 + #define __AVAILABILITY_INTERNAL__IPHONE_10_0_DEP__IPHONE_10_0_MSG(_msg) 
__AVAILABILITY_INTERNAL__IPHONE_10_0 + #else + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define 
__AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_6_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_6_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_7_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_7_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_8_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_8_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_10_0 
__AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_8_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_8_3_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_8_4_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_9_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_9_1_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_9_2_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__IPHONE_10_0_DEP__IPHONE_10_0 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_10_0_DEP__IPHONE_10_0_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #endif /* set up internal macros (n/a) */ #define __AVAILABILITY_INTERNAL__IPHONE_NA __AVAILABILITY_INTERNAL_UNAVAILABLE #define __AVAILABILITY_INTERNAL__IPHONE_NA_DEP__IPHONE_NA __AVAILABILITY_INTERNAL_UNAVAILABLE @@ -9776,7 +13782,7 @@ #define __MAC_OS_X_VERSION_MIN_REQUIRED __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ /* make sure a default max version is set */ #ifndef __MAC_OS_X_VERSION_MAX_ALLOWED - #define __MAC_OS_X_VERSION_MAX_ALLOWED __MAC_10_11 + #define __MAC_OS_X_VERSION_MAX_ALLOWED __MAC_10_12 #endif #if 
defined(__has_attribute) && defined(__has_feature) @@ -9867,6 +13873,12 @@ #else #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_11_MSG(_msg) __attribute__((availability(macosx,introduced=10.0,deprecated=10.11))) #endif + #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_12 __attribute__((availability(macosx,introduced=10.0,deprecated=10.12))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_12_MSG(_msg) __attribute__((availability(macosx,introduced=10.0,deprecated=10.12,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_12_MSG(_msg) __attribute__((availability(macosx,introduced=10.0,deprecated=10.12))) + #endif #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_NA_MSG(_msg) __attribute__((availability(macosx,introduced=10.0))) #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_NA __attribute__((availability(macosx,introduced=10.0))) #define __AVAILABILITY_INTERNAL__MAC_10_1 __attribute__((availability(macosx,introduced=10.1))) @@ -9948,6 +13960,12 @@ #else #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_11_MSG(_msg) __attribute__((availability(macosx,introduced=10.1,deprecated=10.11))) #endif + #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_12 __attribute__((availability(macosx,introduced=10.1,deprecated=10.12))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_12_MSG(_msg) __attribute__((availability(macosx,introduced=10.1,deprecated=10.12,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_12_MSG(_msg) __attribute__((availability(macosx,introduced=10.1,deprecated=10.12))) + #endif #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_NA_MSG(_msg) __attribute__((availability(macosx,introduced=10.1))) #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_NA __attribute__((availability(macosx,introduced=10.1))) #define __AVAILABILITY_INTERNAL__MAC_10_2 
__attribute__((availability(macosx,introduced=10.2))) @@ -10023,6 +14041,12 @@ #else #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_10_11_MSG(_msg) __attribute__((availability(macosx,introduced=10.2,deprecated=10.11))) #endif + #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_10_12 __attribute__((availability(macosx,introduced=10.2,deprecated=10.12))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_10_12_MSG(_msg) __attribute__((availability(macosx,introduced=10.2,deprecated=10.12,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_10_12_MSG(_msg) __attribute__((availability(macosx,introduced=10.2,deprecated=10.12))) + #endif #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_NA_MSG(_msg) __attribute__((availability(macosx,introduced=10.2))) #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_NA __attribute__((availability(macosx,introduced=10.2))) #define __AVAILABILITY_INTERNAL__MAC_10_3 __attribute__((availability(macosx,introduced=10.3))) @@ -10092,6 +14116,12 @@ #else #define __AVAILABILITY_INTERNAL__MAC_10_3_DEP__MAC_10_11_MSG(_msg) __attribute__((availability(macosx,introduced=10.3,deprecated=10.11))) #endif + #define __AVAILABILITY_INTERNAL__MAC_10_3_DEP__MAC_10_12 __attribute__((availability(macosx,introduced=10.3,deprecated=10.12))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__MAC_10_3_DEP__MAC_10_12_MSG(_msg) __attribute__((availability(macosx,introduced=10.3,deprecated=10.12,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__MAC_10_3_DEP__MAC_10_12_MSG(_msg) __attribute__((availability(macosx,introduced=10.3,deprecated=10.12))) + #endif #define __AVAILABILITY_INTERNAL__MAC_10_3_DEP__MAC_NA_MSG(_msg) __attribute__((availability(macosx,introduced=10.3))) #define __AVAILABILITY_INTERNAL__MAC_10_3_DEP__MAC_NA __attribute__((availability(macosx,introduced=10.3))) #define __AVAILABILITY_INTERNAL__MAC_10_4 
__attribute__((availability(macosx,introduced=10.4))) @@ -10155,6 +14185,12 @@ #else #define __AVAILABILITY_INTERNAL__MAC_10_4_DEP__MAC_10_11_MSG(_msg) __attribute__((availability(macosx,introduced=10.4,deprecated=10.11))) #endif + #define __AVAILABILITY_INTERNAL__MAC_10_4_DEP__MAC_10_12 __attribute__((availability(macosx,introduced=10.4,deprecated=10.12))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__MAC_10_4_DEP__MAC_10_12_MSG(_msg) __attribute__((availability(macosx,introduced=10.4,deprecated=10.12,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__MAC_10_4_DEP__MAC_10_12_MSG(_msg) __attribute__((availability(macosx,introduced=10.4,deprecated=10.12))) + #endif #define __AVAILABILITY_INTERNAL__MAC_10_4_DEP__MAC_NA_MSG(_msg) __attribute__((availability(macosx,introduced=10.4))) #define __AVAILABILITY_INTERNAL__MAC_10_4_DEP__MAC_NA __attribute__((availability(macosx,introduced=10.4))) #define __AVAILABILITY_INTERNAL__MAC_10_5 __attribute__((availability(macosx,introduced=10.5))) @@ -10212,6 +14248,12 @@ #else #define __AVAILABILITY_INTERNAL__MAC_10_5_DEP__MAC_10_11_MSG(_msg) __attribute__((availability(macosx,introduced=10.5,deprecated=10.11))) #endif + #define __AVAILABILITY_INTERNAL__MAC_10_5_DEP__MAC_10_12 __attribute__((availability(macosx,introduced=10.5,deprecated=10.12))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__MAC_10_5_DEP__MAC_10_12_MSG(_msg) __attribute__((availability(macosx,introduced=10.5,deprecated=10.12,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__MAC_10_5_DEP__MAC_10_12_MSG(_msg) __attribute__((availability(macosx,introduced=10.5,deprecated=10.12))) + #endif #define __AVAILABILITY_INTERNAL__MAC_10_5_DEP__MAC_NA_MSG(_msg) __attribute__((availability(macosx,introduced=10.5))) #define __AVAILABILITY_INTERNAL__MAC_10_5_DEP__MAC_NA __attribute__((availability(macosx,introduced=10.5))) #define __AVAILABILITY_INTERNAL__MAC_10_6 
__attribute__((availability(macosx,introduced=10.6))) @@ -10263,6 +14305,12 @@ #else #define __AVAILABILITY_INTERNAL__MAC_10_6_DEP__MAC_10_11_MSG(_msg) __attribute__((availability(macosx,introduced=10.6,deprecated=10.11))) #endif + #define __AVAILABILITY_INTERNAL__MAC_10_6_DEP__MAC_10_12 __attribute__((availability(macosx,introduced=10.6,deprecated=10.12))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__MAC_10_6_DEP__MAC_10_12_MSG(_msg) __attribute__((availability(macosx,introduced=10.6,deprecated=10.12,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__MAC_10_6_DEP__MAC_10_12_MSG(_msg) __attribute__((availability(macosx,introduced=10.6,deprecated=10.12))) + #endif #define __AVAILABILITY_INTERNAL__MAC_10_6_DEP__MAC_NA_MSG(_msg) __attribute__((availability(macosx,introduced=10.6))) #define __AVAILABILITY_INTERNAL__MAC_10_6_DEP__MAC_NA __attribute__((availability(macosx,introduced=10.6))) #define __AVAILABILITY_INTERNAL__MAC_10_7 __attribute__((availability(macosx,introduced=10.7))) @@ -10308,6 +14356,12 @@ #else #define __AVAILABILITY_INTERNAL__MAC_10_7_DEP__MAC_10_11_MSG(_msg) __attribute__((availability(macosx,introduced=10.7,deprecated=10.11))) #endif + #define __AVAILABILITY_INTERNAL__MAC_10_7_DEP__MAC_10_12 __attribute__((availability(macosx,introduced=10.7,deprecated=10.12))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__MAC_10_7_DEP__MAC_10_12_MSG(_msg) __attribute__((availability(macosx,introduced=10.7,deprecated=10.12,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__MAC_10_7_DEP__MAC_10_12_MSG(_msg) __attribute__((availability(macosx,introduced=10.7,deprecated=10.12))) + #endif #define __AVAILABILITY_INTERNAL__MAC_10_7_DEP__MAC_NA_MSG(_msg) __attribute__((availability(macosx,introduced=10.7))) #define __AVAILABILITY_INTERNAL__MAC_10_7_DEP__MAC_NA __attribute__((availability(macosx,introduced=10.7))) #define __AVAILABILITY_INTERNAL__MAC_10_8 
__attribute__((availability(macosx,introduced=10.8))) @@ -10347,6 +14401,12 @@ #else #define __AVAILABILITY_INTERNAL__MAC_10_8_DEP__MAC_10_11_MSG(_msg) __attribute__((availability(macosx,introduced=10.8,deprecated=10.11))) #endif + #define __AVAILABILITY_INTERNAL__MAC_10_8_DEP__MAC_10_12 __attribute__((availability(macosx,introduced=10.8,deprecated=10.12))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__MAC_10_8_DEP__MAC_10_12_MSG(_msg) __attribute__((availability(macosx,introduced=10.8,deprecated=10.12,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__MAC_10_8_DEP__MAC_10_12_MSG(_msg) __attribute__((availability(macosx,introduced=10.8,deprecated=10.12))) + #endif #define __AVAILABILITY_INTERNAL__MAC_10_8_DEP__MAC_NA_MSG(_msg) __attribute__((availability(macosx,introduced=10.8))) #define __AVAILABILITY_INTERNAL__MAC_10_8_DEP__MAC_NA __attribute__((availability(macosx,introduced=10.8))) #define __AVAILABILITY_INTERNAL__MAC_10_9 __attribute__((availability(macosx,introduced=10.9))) @@ -10380,6 +14440,12 @@ #else #define __AVAILABILITY_INTERNAL__MAC_10_9_DEP__MAC_10_11_MSG(_msg) __attribute__((availability(macosx,introduced=10.9,deprecated=10.11))) #endif + #define __AVAILABILITY_INTERNAL__MAC_10_9_DEP__MAC_10_12 __attribute__((availability(macosx,introduced=10.9,deprecated=10.12))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__MAC_10_9_DEP__MAC_10_12_MSG(_msg) __attribute__((availability(macosx,introduced=10.9,deprecated=10.12,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__MAC_10_9_DEP__MAC_10_12_MSG(_msg) __attribute__((availability(macosx,introduced=10.9,deprecated=10.12))) + #endif #define __AVAILABILITY_INTERNAL__MAC_10_9_DEP__MAC_NA_MSG(_msg) __attribute__((availability(macosx,introduced=10.9))) #define __AVAILABILITY_INTERNAL__MAC_10_9_DEP__MAC_NA __attribute__((availability(macosx,introduced=10.9))) #define __AVAILABILITY_INTERNAL__MAC_10_10 
__attribute__((availability(macosx,introduced=10.10))) @@ -10407,6 +14473,12 @@ #else #define __AVAILABILITY_INTERNAL__MAC_10_10_DEP__MAC_10_11_MSG(_msg) __attribute__((availability(macosx,introduced=10.10,deprecated=10.11))) #endif + #define __AVAILABILITY_INTERNAL__MAC_10_10_DEP__MAC_10_12 __attribute__((availability(macosx,introduced=10.10,deprecated=10.12))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__MAC_10_10_DEP__MAC_10_12_MSG(_msg) __attribute__((availability(macosx,introduced=10.10,deprecated=10.12,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__MAC_10_10_DEP__MAC_10_12_MSG(_msg) __attribute__((availability(macosx,introduced=10.10,deprecated=10.12))) + #endif #define __AVAILABILITY_INTERNAL__MAC_10_10_DEP__MAC_NA_MSG(_msg) __attribute__((availability(macosx,introduced=10.10))) #define __AVAILABILITY_INTERNAL__MAC_10_10_DEP__MAC_NA __attribute__((availability(macosx,introduced=10.10))) #define __AVAILABILITY_INTERNAL__MAC_10_10_2 __attribute__((availability(macosx,introduced=10.10.2))) @@ -10428,6 +14500,12 @@ #else #define __AVAILABILITY_INTERNAL__MAC_10_10_2_DEP__MAC_10_11_MSG(_msg) __attribute__((availability(macosx,introduced=10.10.2,deprecated=10.11))) #endif + #define __AVAILABILITY_INTERNAL__MAC_10_10_2_DEP__MAC_10_12 __attribute__((availability(macosx,introduced=10.10.2,deprecated=10.12))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__MAC_10_10_2_DEP__MAC_10_12_MSG(_msg) __attribute__((availability(macosx,introduced=10.10.2,deprecated=10.12,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__MAC_10_10_2_DEP__MAC_10_12_MSG(_msg) __attribute__((availability(macosx,introduced=10.10.2,deprecated=10.12))) + #endif #define __AVAILABILITY_INTERNAL__MAC_10_10_2_DEP__MAC_NA_MSG(_msg) __attribute__((availability(macosx,introduced=10.10.2))) #define __AVAILABILITY_INTERNAL__MAC_10_10_2_DEP__MAC_NA 
__attribute__((availability(macosx,introduced=10.10.2))) #define __AVAILABILITY_INTERNAL__MAC_10_10_3 __attribute__((availability(macosx,introduced=10.10.3))) @@ -10443,6 +14521,12 @@ #else #define __AVAILABILITY_INTERNAL__MAC_10_10_3_DEP__MAC_10_11_MSG(_msg) __attribute__((availability(macosx,introduced=10.10.3,deprecated=10.11))) #endif + #define __AVAILABILITY_INTERNAL__MAC_10_10_3_DEP__MAC_10_12 __attribute__((availability(macosx,introduced=10.10.3,deprecated=10.12))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__MAC_10_10_3_DEP__MAC_10_12_MSG(_msg) __attribute__((availability(macosx,introduced=10.10.3,deprecated=10.12,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__MAC_10_10_3_DEP__MAC_10_12_MSG(_msg) __attribute__((availability(macosx,introduced=10.10.3,deprecated=10.12))) + #endif #define __AVAILABILITY_INTERNAL__MAC_10_10_3_DEP__MAC_NA_MSG(_msg) __attribute__((availability(macosx,introduced=10.10.3))) #define __AVAILABILITY_INTERNAL__MAC_10_10_3_DEP__MAC_NA __attribute__((availability(macosx,introduced=10.10.3))) #define __AVAILABILITY_INTERNAL__MAC_10_11 __attribute__((availability(macosx,introduced=10.11))) @@ -10452,8 +14536,23 @@ #else #define __AVAILABILITY_INTERNAL__MAC_10_11_DEP__MAC_10_11_MSG(_msg) __attribute__((availability(macosx,introduced=10.11,deprecated=10.11))) #endif + #define __AVAILABILITY_INTERNAL__MAC_10_11_DEP__MAC_10_12 __attribute__((availability(macosx,introduced=10.11,deprecated=10.12))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__MAC_10_11_DEP__MAC_10_12_MSG(_msg) __attribute__((availability(macosx,introduced=10.11,deprecated=10.12,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__MAC_10_11_DEP__MAC_10_12_MSG(_msg) __attribute__((availability(macosx,introduced=10.11,deprecated=10.12))) + #endif #define __AVAILABILITY_INTERNAL__MAC_10_11_DEP__MAC_NA_MSG(_msg) __attribute__((availability(macosx,introduced=10.11))) 
#define __AVAILABILITY_INTERNAL__MAC_10_11_DEP__MAC_NA __attribute__((availability(macosx,introduced=10.11))) + #define __AVAILABILITY_INTERNAL__MAC_10_12 __attribute__((availability(macosx,introduced=10.12))) + #define __AVAILABILITY_INTERNAL__MAC_10_12_DEP__MAC_10_12 __attribute__((availability(macosx,introduced=10.12,deprecated=10.12))) + #if __has_feature(attribute_availability_with_message) + #define __AVAILABILITY_INTERNAL__MAC_10_12_DEP__MAC_10_12_MSG(_msg) __attribute__((availability(macosx,introduced=10.12,deprecated=10.12,message=_msg))) + #else + #define __AVAILABILITY_INTERNAL__MAC_10_12_DEP__MAC_10_12_MSG(_msg) __attribute__((availability(macosx,introduced=10.12,deprecated=10.12))) + #endif + #define __AVAILABILITY_INTERNAL__MAC_10_12_DEP__MAC_NA_MSG(_msg) __attribute__((availability(macosx,introduced=10.12))) + #define __AVAILABILITY_INTERNAL__MAC_10_12_DEP__MAC_NA __attribute__((availability(macosx,introduced=10.12))) #define __AVAILABILITY_INTERNAL__MAC_NA __attribute__((availability(macosx,unavailable))) #define __AVAILABILITY_INTERNAL__MAC_NA_DEP__MAC_NA __attribute__((availability(macosx,unavailable))) #define __AVAILABILITY_INTERNAL__MAC_NA_DEP__MAC_NA_MSG(_msg) __attribute__((availability(macosx,unavailable))) @@ -10462,6 +14561,13 @@ #ifndef __AVAILABILITY_INTERNAL__MAC_10_0 /* use old style attributes */ + #if __MAC_OS_X_VERSION_MAX_ALLOWED < __MAC_10_12 + #define __AVAILABILITY_INTERNAL__MAC_10_12 __AVAILABILITY_INTERNAL_UNAVAILABLE + #elif __MAC_OS_X_VERSION_MIN_REQUIRED < __MAC_10_12 + #define __AVAILABILITY_INTERNAL__MAC_10_12 __AVAILABILITY_INTERNAL_WEAK_IMPORT + #else + #define __AVAILABILITY_INTERNAL__MAC_10_12 __AVAILABILITY_INTERNAL_REGULAR + #endif #if __MAC_OS_X_VERSION_MAX_ALLOWED < __MAC_10_11 #define __AVAILABILITY_INTERNAL__MAC_10_11 __AVAILABILITY_INTERNAL_UNAVAILABLE #elif __MAC_OS_X_VERSION_MIN_REQUIRED < __MAC_10_11 @@ -11016,6 +15122,69 @@ #define __AVAILABILITY_INTERNAL__MAC_10_11_DEP__MAC_10_11 
__AVAILABILITY_INTERNAL__MAC_10_11 #define __AVAILABILITY_INTERNAL__MAC_10_11_DEP__MAC_10_11_MSG(_msg) __AVAILABILITY_INTERNAL__MAC_10_11 #endif + #if __MAC_OS_X_VERSION_MIN_REQUIRED >= __MAC_10_12 + #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_12 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_12_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_12 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_12_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_10_12 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_10_12_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__MAC_10_3_DEP__MAC_10_12 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__MAC_10_3_DEP__MAC_10_12_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__MAC_10_4_DEP__MAC_10_12 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__MAC_10_4_DEP__MAC_10_12_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__MAC_10_5_DEP__MAC_10_12 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__MAC_10_5_DEP__MAC_10_12_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__MAC_10_6_DEP__MAC_10_12 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__MAC_10_6_DEP__MAC_10_12_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__MAC_10_7_DEP__MAC_10_12 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__MAC_10_7_DEP__MAC_10_12_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__MAC_10_8_DEP__MAC_10_12 __AVAILABILITY_INTERNAL_DEPRECATED + #define 
__AVAILABILITY_INTERNAL__MAC_10_8_DEP__MAC_10_12_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__MAC_10_9_DEP__MAC_10_12 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__MAC_10_9_DEP__MAC_10_12_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__MAC_10_10_DEP__MAC_10_12 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__MAC_10_10_DEP__MAC_10_12_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__MAC_10_10_2_DEP__MAC_10_12 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__MAC_10_10_2_DEP__MAC_10_12_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__MAC_10_10_3_DEP__MAC_10_12 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__MAC_10_10_3_DEP__MAC_10_12_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__MAC_10_11_DEP__MAC_10_12 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__MAC_10_11_DEP__MAC_10_12_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #define __AVAILABILITY_INTERNAL__MAC_10_12_DEP__MAC_10_12 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__MAC_10_12_DEP__MAC_10_12_MSG(_msg) __AVAILABILITY_INTERNAL_DEPRECATED_MSG(_msg) + #else + #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_12 __AVAILABILITY_INTERNAL__MAC_10_0 + #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_12_MSG(_msg) __AVAILABILITY_INTERNAL__MAC_10_0 + #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_12 __AVAILABILITY_INTERNAL__MAC_10_1 + #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_12_MSG(_msg) __AVAILABILITY_INTERNAL__MAC_10_1 + #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_10_12 __AVAILABILITY_INTERNAL__MAC_10_2 + #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_10_12_MSG(_msg) __AVAILABILITY_INTERNAL__MAC_10_2 + #define 
__AVAILABILITY_INTERNAL__MAC_10_3_DEP__MAC_10_12 __AVAILABILITY_INTERNAL__MAC_10_3 + #define __AVAILABILITY_INTERNAL__MAC_10_3_DEP__MAC_10_12_MSG(_msg) __AVAILABILITY_INTERNAL__MAC_10_3 + #define __AVAILABILITY_INTERNAL__MAC_10_4_DEP__MAC_10_12 __AVAILABILITY_INTERNAL__MAC_10_4 + #define __AVAILABILITY_INTERNAL__MAC_10_4_DEP__MAC_10_12_MSG(_msg) __AVAILABILITY_INTERNAL__MAC_10_4 + #define __AVAILABILITY_INTERNAL__MAC_10_5_DEP__MAC_10_12 __AVAILABILITY_INTERNAL__MAC_10_5 + #define __AVAILABILITY_INTERNAL__MAC_10_5_DEP__MAC_10_12_MSG(_msg) __AVAILABILITY_INTERNAL__MAC_10_5 + #define __AVAILABILITY_INTERNAL__MAC_10_6_DEP__MAC_10_12 __AVAILABILITY_INTERNAL__MAC_10_6 + #define __AVAILABILITY_INTERNAL__MAC_10_6_DEP__MAC_10_12_MSG(_msg) __AVAILABILITY_INTERNAL__MAC_10_6 + #define __AVAILABILITY_INTERNAL__MAC_10_7_DEP__MAC_10_12 __AVAILABILITY_INTERNAL__MAC_10_7 + #define __AVAILABILITY_INTERNAL__MAC_10_7_DEP__MAC_10_12_MSG(_msg) __AVAILABILITY_INTERNAL__MAC_10_7 + #define __AVAILABILITY_INTERNAL__MAC_10_8_DEP__MAC_10_12 __AVAILABILITY_INTERNAL__MAC_10_8 + #define __AVAILABILITY_INTERNAL__MAC_10_8_DEP__MAC_10_12_MSG(_msg) __AVAILABILITY_INTERNAL__MAC_10_8 + #define __AVAILABILITY_INTERNAL__MAC_10_9_DEP__MAC_10_12 __AVAILABILITY_INTERNAL__MAC_10_9 + #define __AVAILABILITY_INTERNAL__MAC_10_9_DEP__MAC_10_12_MSG(_msg) __AVAILABILITY_INTERNAL__MAC_10_9 + #define __AVAILABILITY_INTERNAL__MAC_10_10_DEP__MAC_10_12 __AVAILABILITY_INTERNAL__MAC_10_10 + #define __AVAILABILITY_INTERNAL__MAC_10_10_DEP__MAC_10_12_MSG(_msg) __AVAILABILITY_INTERNAL__MAC_10_10 + #define __AVAILABILITY_INTERNAL__MAC_10_10_2_DEP__MAC_10_12 __AVAILABILITY_INTERNAL__MAC_10_10_2 + #define __AVAILABILITY_INTERNAL__MAC_10_10_2_DEP__MAC_10_12_MSG(_msg) __AVAILABILITY_INTERNAL__MAC_10_10_2 + #define __AVAILABILITY_INTERNAL__MAC_10_10_3_DEP__MAC_10_12 __AVAILABILITY_INTERNAL__MAC_10_10_3 + #define __AVAILABILITY_INTERNAL__MAC_10_10_3_DEP__MAC_10_12_MSG(_msg) __AVAILABILITY_INTERNAL__MAC_10_10_3 + #define 
__AVAILABILITY_INTERNAL__MAC_10_11_DEP__MAC_10_12 __AVAILABILITY_INTERNAL__MAC_10_11 + #define __AVAILABILITY_INTERNAL__MAC_10_11_DEP__MAC_10_12_MSG(_msg) __AVAILABILITY_INTERNAL__MAC_10_11 + #define __AVAILABILITY_INTERNAL__MAC_10_12_DEP__MAC_10_12 __AVAILABILITY_INTERNAL__MAC_10_12 + #define __AVAILABILITY_INTERNAL__MAC_10_12_DEP__MAC_10_12_MSG(_msg) __AVAILABILITY_INTERNAL__MAC_10_12 + #endif #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_NA __AVAILABILITY_INTERNAL__MAC_10_0 #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_NA_MSG(_msg) __AVAILABILITY_INTERNAL__MAC_10_0 #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_NA __AVAILABILITY_INTERNAL__MAC_10_1 @@ -11044,6 +15213,8 @@ #define __AVAILABILITY_INTERNAL__MAC_10_10_3_DEP__MAC_NA_MSG(_msg) __AVAILABILITY_INTERNAL__MAC_10_10_3 #define __AVAILABILITY_INTERNAL__MAC_10_11_DEP__MAC_NA __AVAILABILITY_INTERNAL__MAC_10_11 #define __AVAILABILITY_INTERNAL__MAC_10_11_DEP__MAC_NA_MSG(_msg) __AVAILABILITY_INTERNAL__MAC_10_11 + #define __AVAILABILITY_INTERNAL__MAC_10_12_DEP__MAC_NA __AVAILABILITY_INTERNAL__MAC_10_12 + #define __AVAILABILITY_INTERNAL__MAC_10_12_DEP__MAC_NA_MSG(_msg) __AVAILABILITY_INTERNAL__MAC_10_12 #define __AVAILABILITY_INTERNAL__MAC_NA_DEP__MAC_NA __AVAILABILITY_INTERNAL_UNAVAILABLE #define __AVAILABILITY_INTERNAL__MAC_NA_DEP__MAC_NA_MSG(_msg) __AVAILABILITY_INTERNAL_UNAVAILABLE #endif diff --git a/EXTERNAL_HEADERS/AvailabilityMacros.h b/EXTERNAL_HEADERS/AvailabilityMacros.h index 9ff820a80..f0bbfe240 100644 --- a/EXTERNAL_HEADERS/AvailabilityMacros.h +++ b/EXTERNAL_HEADERS/AvailabilityMacros.h @@ -103,6 +103,7 @@ #define MAC_OS_X_VERSION_10_10_2 101002 #define MAC_OS_X_VERSION_10_10_3 101003 #define MAC_OS_X_VERSION_10_11 101100 +#define MAC_OS_X_VERSION_10_12 101200 /* * If min OS not specified, assume 10.4 for intel @@ -124,13 +125,13 @@ #endif /* - * if max OS not specified, assume larger of (10.11, min) + * if max OS not specified, assume larger of (10.12, min) */ #ifndef 
MAC_OS_X_VERSION_MAX_ALLOWED - #if MAC_OS_X_VERSION_MIN_REQUIRED > MAC_OS_X_VERSION_10_11 + #if MAC_OS_X_VERSION_MIN_REQUIRED > MAC_OS_X_VERSION_10_12 #define MAC_OS_X_VERSION_MAX_ALLOWED MAC_OS_X_VERSION_MIN_REQUIRED #else - #define MAC_OS_X_VERSION_MAX_ALLOWED MAC_OS_X_VERSION_10_11 + #define MAC_OS_X_VERSION_MAX_ALLOWED MAC_OS_X_VERSION_10_12 #endif #endif @@ -229,7 +230,7 @@ * Used on declarations introduced in Mac OS X 10.1 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER __OSX_AVAILABLE_STARTING(__MAC_10_1, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER __OSX_AVAILABLE_STARTING(__MAC_10_1, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MAX_ALLOWED < MAC_OS_X_VERSION_10_1 #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER UNAVAILABLE_ATTRIBUTE #elif MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_10_1 @@ -245,7 +246,7 @@ * and deprecated in Mac OS X 10.1 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_1, __MAC_10_1, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_1, __MAC_10_1, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_1 #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED DEPRECATED_ATTRIBUTE #else @@ -259,7 +260,7 @@ * but later deprecated in Mac OS X 10.1 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_1 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_0, __MAC_10_1, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_1 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_0, __MAC_10_1, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_1 #define 
AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_1 DEPRECATED_ATTRIBUTE #else @@ -272,7 +273,7 @@ * Used on types deprecated in Mac OS X 10.1 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define DEPRECATED_IN_MAC_OS_X_VERSION_10_1_AND_LATER __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_0, __MAC_10_1, __IPHONE_NA, __IPHONE_NA) + #define DEPRECATED_IN_MAC_OS_X_VERSION_10_1_AND_LATER __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_0, __MAC_10_1, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_1 #define DEPRECATED_IN_MAC_OS_X_VERSION_10_1_AND_LATER DEPRECATED_ATTRIBUTE #else @@ -286,7 +287,7 @@ * Used on declarations introduced in Mac OS X 10.2 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER __OSX_AVAILABLE_STARTING(__MAC_10_2, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER __OSX_AVAILABLE_STARTING(__MAC_10_2, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MAX_ALLOWED < MAC_OS_X_VERSION_10_2 #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER UNAVAILABLE_ATTRIBUTE #elif MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_10_2 @@ -302,7 +303,7 @@ * and deprecated in Mac OS X 10.2 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_2, __MAC_10_2, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_2, __MAC_10_2, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_2 #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED DEPRECATED_ATTRIBUTE #else @@ -316,7 +317,7 @@ * but later deprecated in Mac OS X 10.2 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_2 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_0, __MAC_10_2, __IPHONE_NA, __IPHONE_NA) + #define 
AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_2 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_0, __MAC_10_2, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_2 #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_2 DEPRECATED_ATTRIBUTE #else @@ -330,7 +331,7 @@ * but later deprecated in Mac OS X 10.2 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_2 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_1, __MAC_10_2, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_2 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_1, __MAC_10_2, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_2 #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_2 DEPRECATED_ATTRIBUTE #else @@ -343,7 +344,7 @@ * Used on types deprecated in Mac OS X 10.2 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define DEPRECATED_IN_MAC_OS_X_VERSION_10_2_AND_LATER __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_0, __MAC_10_2, __IPHONE_NA, __IPHONE_NA) + #define DEPRECATED_IN_MAC_OS_X_VERSION_10_2_AND_LATER __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_0, __MAC_10_2, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_2 #define DEPRECATED_IN_MAC_OS_X_VERSION_10_2_AND_LATER DEPRECATED_ATTRIBUTE #else @@ -357,7 +358,7 @@ * Used on declarations introduced in Mac OS X 10.3 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER __OSX_AVAILABLE_STARTING(__MAC_10_3, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER __OSX_AVAILABLE_STARTING(__MAC_10_3, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MAX_ALLOWED < MAC_OS_X_VERSION_10_3 #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER UNAVAILABLE_ATTRIBUTE #elif MAC_OS_X_VERSION_MIN_REQUIRED < 
MAC_OS_X_VERSION_10_3 @@ -373,7 +374,7 @@ * and deprecated in Mac OS X 10.3 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_3, __MAC_10_3, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_3, __MAC_10_3, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_3 #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED DEPRECATED_ATTRIBUTE #else @@ -387,7 +388,7 @@ * but later deprecated in Mac OS X 10.3 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_3 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_0, __MAC_10_3, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_3 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_0, __MAC_10_3, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_3 #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_3 DEPRECATED_ATTRIBUTE #else @@ -401,7 +402,7 @@ * but later deprecated in Mac OS X 10.3 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_3 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_1, __MAC_10_3, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_3 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_1, __MAC_10_3, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_3 #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_3 DEPRECATED_ATTRIBUTE #else @@ -415,7 +416,7 @@ * but later deprecated in Mac OS X 10.3 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define 
AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_3 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_2, __MAC_10_3, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_3 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_2, __MAC_10_3, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_3 #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_3 DEPRECATED_ATTRIBUTE #else @@ -428,7 +429,7 @@ * Used on types deprecated in Mac OS X 10.3 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define DEPRECATED_IN_MAC_OS_X_VERSION_10_3_AND_LATER __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_0, __MAC_10_3, __IPHONE_NA, __IPHONE_NA) + #define DEPRECATED_IN_MAC_OS_X_VERSION_10_3_AND_LATER __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_0, __MAC_10_3, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_3 #define DEPRECATED_IN_MAC_OS_X_VERSION_10_3_AND_LATER DEPRECATED_ATTRIBUTE #else @@ -442,7 +443,7 @@ * Used on declarations introduced in Mac OS X 10.4 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER __OSX_AVAILABLE_STARTING(__MAC_10_4, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER __OSX_AVAILABLE_STARTING(__MAC_10_4, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MAX_ALLOWED < MAC_OS_X_VERSION_10_4 #define AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER UNAVAILABLE_ATTRIBUTE #elif MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_10_4 @@ -458,7 +459,7 @@ * and deprecated in Mac OS X 10.4 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_4, __MAC_10_4, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_4, __MAC_10_4, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= 
MAC_OS_X_VERSION_10_4 #define AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED DEPRECATED_ATTRIBUTE #else @@ -472,7 +473,7 @@ * but later deprecated in Mac OS X 10.4 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_4 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_0, __MAC_10_4, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_4 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_0, __MAC_10_4, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_4 #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_4 DEPRECATED_ATTRIBUTE #else @@ -486,7 +487,7 @@ * but later deprecated in Mac OS X 10.4 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_4 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_1, __MAC_10_4, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_4 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_1, __MAC_10_4, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_4 #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_4 DEPRECATED_ATTRIBUTE #else @@ -500,7 +501,7 @@ * but later deprecated in Mac OS X 10.4 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_4 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_2, __MAC_10_4, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_4 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_2, __MAC_10_4, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_4 #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_4 
DEPRECATED_ATTRIBUTE #else @@ -514,7 +515,7 @@ * but later deprecated in Mac OS X 10.4 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_4 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_3, __MAC_10_4, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_4 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_3, __MAC_10_4, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_4 #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_4 DEPRECATED_ATTRIBUTE #else @@ -527,7 +528,7 @@ * Used on types deprecated in Mac OS X 10.4 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define DEPRECATED_IN_MAC_OS_X_VERSION_10_4_AND_LATER __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_0, __MAC_10_4, __IPHONE_NA, __IPHONE_NA) + #define DEPRECATED_IN_MAC_OS_X_VERSION_10_4_AND_LATER __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_0, __MAC_10_4, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_4 #define DEPRECATED_IN_MAC_OS_X_VERSION_10_4_AND_LATER DEPRECATED_ATTRIBUTE #else @@ -541,7 +542,7 @@ * Used on declarations introduced in Mac OS X 10.5 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER __OSX_AVAILABLE_STARTING(__MAC_10_5, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER __OSX_AVAILABLE_STARTING(__MAC_10_5, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MAX_ALLOWED < MAC_OS_X_VERSION_10_5 #define AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER UNAVAILABLE_ATTRIBUTE #elif MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_10_5 @@ -557,7 +558,7 @@ * and deprecated in Mac OS X 10.5 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER_BUT_DEPRECATED __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_5, __MAC_10_5, __IPHONE_NA, __IPHONE_NA) + #define 
AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER_BUT_DEPRECATED __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_5, __MAC_10_5, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_5 #define AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER_BUT_DEPRECATED DEPRECATED_ATTRIBUTE #else @@ -571,7 +572,7 @@ * but later deprecated in Mac OS X 10.5 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_5 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_0, __MAC_10_5, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_5 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_0, __MAC_10_5, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_5 #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_5 DEPRECATED_ATTRIBUTE #else @@ -585,7 +586,7 @@ * but later deprecated in Mac OS X 10.5 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_5 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_1, __MAC_10_5, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_5 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_1, __MAC_10_5, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_5 #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_5 DEPRECATED_ATTRIBUTE #else @@ -599,7 +600,7 @@ * but later deprecated in Mac OS X 10.5 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_5 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_2, __MAC_10_5, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_5 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_2, __MAC_10_5, __IPHONE_4_0, 
__IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_5 #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_5 DEPRECATED_ATTRIBUTE #else @@ -613,7 +614,7 @@ * but later deprecated in Mac OS X 10.5 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_5 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_3, __MAC_10_5, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_5 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_3, __MAC_10_5, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_5 #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_5 DEPRECATED_ATTRIBUTE #else @@ -627,7 +628,7 @@ * but later deprecated in Mac OS X 10.5 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_5 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_4, __MAC_10_5, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_5 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_4, __MAC_10_5, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_5 #define AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_5 DEPRECATED_ATTRIBUTE #else @@ -640,7 +641,7 @@ * Used on types deprecated in Mac OS X 10.5 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define DEPRECATED_IN_MAC_OS_X_VERSION_10_5_AND_LATER __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_0, __MAC_10_5, __IPHONE_NA, __IPHONE_NA) + #define DEPRECATED_IN_MAC_OS_X_VERSION_10_5_AND_LATER __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_0, __MAC_10_5, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_5 #define DEPRECATED_IN_MAC_OS_X_VERSION_10_5_AND_LATER DEPRECATED_ATTRIBUTE #else @@ -654,7 
+655,7 @@ * Used on declarations introduced in Mac OS X 10.6 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER __OSX_AVAILABLE_STARTING(__MAC_10_6, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER __OSX_AVAILABLE_STARTING(__MAC_10_6, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MAX_ALLOWED < MAC_OS_X_VERSION_10_6 #define AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER UNAVAILABLE_ATTRIBUTE #elif MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_10_6 @@ -670,7 +671,7 @@ * and deprecated in Mac OS X 10.6 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_6, __MAC_10_6, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_6, __MAC_10_6, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_6 #define AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED DEPRECATED_ATTRIBUTE #else @@ -684,7 +685,7 @@ * but later deprecated in Mac OS X 10.6 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_6 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_0, __MAC_10_6, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_6 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_0, __MAC_10_6, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_6 #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_6 DEPRECATED_ATTRIBUTE #else @@ -698,7 +699,7 @@ * but later deprecated in Mac OS X 10.6 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_6 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_1, __MAC_10_6, __IPHONE_NA, __IPHONE_NA) + #define 
AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_6 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_1, __MAC_10_6, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_6 #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_6 DEPRECATED_ATTRIBUTE #else @@ -712,7 +713,7 @@ * but later deprecated in Mac OS X 10.6 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_6 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_2, __MAC_10_6, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_6 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_2, __MAC_10_6, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_6 #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_6 DEPRECATED_ATTRIBUTE #else @@ -726,7 +727,7 @@ * but later deprecated in Mac OS X 10.6 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_6 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_3, __MAC_10_6, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_6 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_3, __MAC_10_6, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_6 #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_6 DEPRECATED_ATTRIBUTE #else @@ -740,7 +741,7 @@ * but later deprecated in Mac OS X 10.6 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_6 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_4, __MAC_10_6, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_6 
__OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_4, __MAC_10_6, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_6 #define AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_6 DEPRECATED_ATTRIBUTE #else @@ -754,7 +755,7 @@ * but later deprecated in Mac OS X 10.6 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_6 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_5, __MAC_10_6, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_6 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_5, __MAC_10_6, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_6 #define AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_6 DEPRECATED_ATTRIBUTE #else @@ -767,7 +768,7 @@ * Used on types deprecated in Mac OS X 10.6 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define DEPRECATED_IN_MAC_OS_X_VERSION_10_6_AND_LATER __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_0, __MAC_10_6, __IPHONE_NA, __IPHONE_NA) + #define DEPRECATED_IN_MAC_OS_X_VERSION_10_6_AND_LATER __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_0, __MAC_10_6, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_6 #define DEPRECATED_IN_MAC_OS_X_VERSION_10_6_AND_LATER DEPRECATED_ATTRIBUTE #else @@ -781,7 +782,7 @@ * Used on declarations introduced in Mac OS X 10.7 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER __OSX_AVAILABLE_STARTING(__MAC_10_7, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER __OSX_AVAILABLE_STARTING(__MAC_10_7, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MAX_ALLOWED < MAC_OS_X_VERSION_10_7 #define AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER UNAVAILABLE_ATTRIBUTE #elif MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_10_7 @@ -797,7 +798,7 @@ * and deprecated in Mac OS X 10.7 */ #if 
__AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_7, __MAC_10_7, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_7, __MAC_10_7, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_7 #define AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED DEPRECATED_ATTRIBUTE #else @@ -811,7 +812,7 @@ * but later deprecated in Mac OS X 10.7 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_0, __MAC_10_7, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_0, __MAC_10_7, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_7 #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 DEPRECATED_ATTRIBUTE #else @@ -825,7 +826,7 @@ * but later deprecated in Mac OS X 10.7 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_1, __MAC_10_7, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_1, __MAC_10_7, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_7 #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 DEPRECATED_ATTRIBUTE #else @@ -839,7 +840,7 @@ * but later deprecated in Mac OS X 10.7 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_2, 
__MAC_10_7, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_2, __MAC_10_7, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_7 #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 DEPRECATED_ATTRIBUTE #else @@ -853,7 +854,7 @@ * but later deprecated in Mac OS X 10.7 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_3, __MAC_10_7, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_3, __MAC_10_7, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_7 #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 DEPRECATED_ATTRIBUTE #else @@ -867,7 +868,7 @@ * but later deprecated in Mac OS X 10.7 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_4, __MAC_10_7, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_4, __MAC_10_7, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_7 #define AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 DEPRECATED_ATTRIBUTE #else @@ -881,7 +882,7 @@ * but later deprecated in Mac OS X 10.7 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_5, __MAC_10_7, __IPHONE_NA, __IPHONE_NA) + #define 
AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_5, __MAC_10_7, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_7 #define AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 DEPRECATED_ATTRIBUTE #else @@ -895,7 +896,7 @@ * but later deprecated in Mac OS X 10.7 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_6, __MAC_10_7, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_6, __MAC_10_7, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_7 #define AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 DEPRECATED_ATTRIBUTE #else @@ -908,7 +909,7 @@ * Used on types deprecated in Mac OS X 10.7 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define DEPRECATED_IN_MAC_OS_X_VERSION_10_7_AND_LATER __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_0, __MAC_10_7, __IPHONE_NA, __IPHONE_NA) + #define DEPRECATED_IN_MAC_OS_X_VERSION_10_7_AND_LATER __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_0, __MAC_10_7, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_7 #define DEPRECATED_IN_MAC_OS_X_VERSION_10_7_AND_LATER DEPRECATED_ATTRIBUTE #else @@ -922,7 +923,7 @@ * Used on declarations introduced in Mac OS X 10.8 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER __OSX_AVAILABLE_STARTING(__MAC_10_8, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER __OSX_AVAILABLE_STARTING(__MAC_10_8, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MAX_ALLOWED < MAC_OS_X_VERSION_10_8 #define AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER UNAVAILABLE_ATTRIBUTE #elif MAC_OS_X_VERSION_MIN_REQUIRED < 
MAC_OS_X_VERSION_10_8 @@ -938,7 +939,7 @@ * and deprecated in Mac OS X 10.8 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER_BUT_DEPRECATED __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_8, __MAC_10_8, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER_BUT_DEPRECATED __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_8, __MAC_10_8, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_8 #define AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER_BUT_DEPRECATED DEPRECATED_ATTRIBUTE #else @@ -952,7 +953,7 @@ * but later deprecated in Mac OS X 10.8 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_0, __MAC_10_8, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_0, __MAC_10_8, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_8 #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 DEPRECATED_ATTRIBUTE #else @@ -966,7 +967,7 @@ * but later deprecated in Mac OS X 10.8 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_1, __MAC_10_8, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_1, __MAC_10_8, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_8 #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 DEPRECATED_ATTRIBUTE #else @@ -980,7 +981,7 @@ * but later deprecated in Mac OS X 10.8 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define 
AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_2, __MAC_10_8, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_2, __MAC_10_8, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_8 #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 DEPRECATED_ATTRIBUTE #else @@ -994,7 +995,7 @@ * but later deprecated in Mac OS X 10.8 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_3, __MAC_10_8, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_3, __MAC_10_8, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_8 #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 DEPRECATED_ATTRIBUTE #else @@ -1008,7 +1009,7 @@ * but later deprecated in Mac OS X 10.8 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_4, __MAC_10_8, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_4, __MAC_10_8, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_8 #define AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 DEPRECATED_ATTRIBUTE #else @@ -1022,7 +1023,7 @@ * but later deprecated in Mac OS X 10.8 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 
__OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_5, __MAC_10_8, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_5, __MAC_10_8, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_8 #define AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 DEPRECATED_ATTRIBUTE #else @@ -1036,7 +1037,7 @@ * but later deprecated in Mac OS X 10.8 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_6, __MAC_10_8, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_6, __MAC_10_8, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_8 #define AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 DEPRECATED_ATTRIBUTE #else @@ -1050,7 +1051,7 @@ * but later deprecated in Mac OS X 10.8 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_7, __MAC_10_8, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_7, __MAC_10_8, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_8 #define AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 DEPRECATED_ATTRIBUTE #else @@ -1063,7 +1064,7 @@ * Used on types deprecated in Mac OS X 10.8 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define DEPRECATED_IN_MAC_OS_X_VERSION_10_8_AND_LATER __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_0, __MAC_10_8, __IPHONE_NA, __IPHONE_NA) + #define 
DEPRECATED_IN_MAC_OS_X_VERSION_10_8_AND_LATER __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_0, __MAC_10_8, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_8 #define DEPRECATED_IN_MAC_OS_X_VERSION_10_8_AND_LATER DEPRECATED_ATTRIBUTE #else @@ -1077,7 +1078,7 @@ * Used on declarations introduced in Mac OS X 10.9 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_9_AND_LATER __OSX_AVAILABLE_STARTING(__MAC_10_9, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_9_AND_LATER __OSX_AVAILABLE_STARTING(__MAC_10_9, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MAX_ALLOWED < MAC_OS_X_VERSION_10_9 #define AVAILABLE_MAC_OS_X_VERSION_10_9_AND_LATER UNAVAILABLE_ATTRIBUTE #elif MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_10_9 @@ -1093,7 +1094,7 @@ * and deprecated in Mac OS X 10.9 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_9_AND_LATER_BUT_DEPRECATED __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_9, __MAC_10_9, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_9_AND_LATER_BUT_DEPRECATED __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_9, __MAC_10_9, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_9 #define AVAILABLE_MAC_OS_X_VERSION_10_9_AND_LATER_BUT_DEPRECATED DEPRECATED_ATTRIBUTE #else @@ -1107,7 +1108,7 @@ * but later deprecated in Mac OS X 10.9 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_9 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_0, __MAC_10_9, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_9 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_0, __MAC_10_9, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_9 #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_9 DEPRECATED_ATTRIBUTE #else @@ -1121,7 +1122,7 @@ * 
but later deprecated in Mac OS X 10.9 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_9 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_1, __MAC_10_9, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_9 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_1, __MAC_10_9, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_9 #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_9 DEPRECATED_ATTRIBUTE #else @@ -1135,7 +1136,7 @@ * but later deprecated in Mac OS X 10.9 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_9 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_2, __MAC_10_9, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_9 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_2, __MAC_10_9, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_9 #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_9 DEPRECATED_ATTRIBUTE #else @@ -1149,7 +1150,7 @@ * but later deprecated in Mac OS X 10.9 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_9 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_3, __MAC_10_9, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_9 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_3, __MAC_10_9, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_9 #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_9 DEPRECATED_ATTRIBUTE #else @@ -1163,7 +1164,7 @@ * but later deprecated in Mac OS X 10.9 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define 
AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_9 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_4, __MAC_10_9, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_9 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_4, __MAC_10_9, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_9 #define AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_9 DEPRECATED_ATTRIBUTE #else @@ -1177,7 +1178,7 @@ * but later deprecated in Mac OS X 10.9 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_9 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_5, __MAC_10_9, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_9 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_5, __MAC_10_9, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_9 #define AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_9 DEPRECATED_ATTRIBUTE #else @@ -1191,7 +1192,7 @@ * but later deprecated in Mac OS X 10.9 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_9 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_6, __MAC_10_9, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_9 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_6, __MAC_10_9, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_9 #define AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_9 DEPRECATED_ATTRIBUTE #else @@ -1205,7 +1206,7 @@ * but later deprecated in Mac OS X 10.9 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_9 
__OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_7, __MAC_10_9, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_9 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_7, __MAC_10_9, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_9 #define AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_9 DEPRECATED_ATTRIBUTE #else @@ -1219,7 +1220,7 @@ * but later deprecated in Mac OS X 10.9 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_9 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_8, __MAC_10_9, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_9 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_8, __MAC_10_9, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_9 #define AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_9 DEPRECATED_ATTRIBUTE #else @@ -1232,7 +1233,7 @@ * Used on types deprecated in Mac OS X 10.9 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define DEPRECATED_IN_MAC_OS_X_VERSION_10_9_AND_LATER __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_0, __MAC_10_9, __IPHONE_NA, __IPHONE_NA) + #define DEPRECATED_IN_MAC_OS_X_VERSION_10_9_AND_LATER __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_0, __MAC_10_9, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_9 #define DEPRECATED_IN_MAC_OS_X_VERSION_10_9_AND_LATER DEPRECATED_ATTRIBUTE #else @@ -1246,7 +1247,7 @@ * Used on declarations introduced in Mac OS X 10.10 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_10_AND_LATER __OSX_AVAILABLE_STARTING(__MAC_10_10, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_10_AND_LATER __OSX_AVAILABLE_STARTING(__MAC_10_10, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MAX_ALLOWED < MAC_OS_X_VERSION_10_10 #define 
AVAILABLE_MAC_OS_X_VERSION_10_10_AND_LATER UNAVAILABLE_ATTRIBUTE #elif MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_10_10 @@ -1262,7 +1263,7 @@ * and deprecated in Mac OS X 10.10 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_10_AND_LATER_BUT_DEPRECATED __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_10, __MAC_10_10, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_10_AND_LATER_BUT_DEPRECATED __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_10, __MAC_10_10, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_10 #define AVAILABLE_MAC_OS_X_VERSION_10_10_AND_LATER_BUT_DEPRECATED DEPRECATED_ATTRIBUTE #else @@ -1276,7 +1277,7 @@ * but later deprecated in Mac OS X 10.10 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_0, __MAC_10_10, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_0, __MAC_10_10, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_10 #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10 DEPRECATED_ATTRIBUTE #else @@ -1290,7 +1291,7 @@ * but later deprecated in Mac OS X 10.10 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_1, __MAC_10_10, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_1, __MAC_10_10, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_10 #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10 DEPRECATED_ATTRIBUTE #else @@ -1304,7 +1305,7 @@ * 
but later deprecated in Mac OS X 10.10 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_2, __MAC_10_10, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_2, __MAC_10_10, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_10 #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10 DEPRECATED_ATTRIBUTE #else @@ -1318,7 +1319,7 @@ * but later deprecated in Mac OS X 10.10 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_3, __MAC_10_10, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_3, __MAC_10_10, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_10 #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10 DEPRECATED_ATTRIBUTE #else @@ -1332,7 +1333,7 @@ * but later deprecated in Mac OS X 10.10 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_4, __MAC_10_10, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_4, __MAC_10_10, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_10 #define AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10 DEPRECATED_ATTRIBUTE #else @@ -1346,7 +1347,7 @@ * but later deprecated in Mac OS X 10.10 */ #if 
__AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_5, __MAC_10_10, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_5, __MAC_10_10, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_10 #define AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10 DEPRECATED_ATTRIBUTE #else @@ -1360,7 +1361,7 @@ * but later deprecated in Mac OS X 10.10 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_6, __MAC_10_10, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_6, __MAC_10_10, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_10 #define AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10 DEPRECATED_ATTRIBUTE #else @@ -1374,7 +1375,7 @@ * but later deprecated in Mac OS X 10.10 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_7, __MAC_10_10, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_7, __MAC_10_10, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_10 #define AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10 DEPRECATED_ATTRIBUTE #else @@ -1388,7 +1389,7 @@ * but later deprecated in Mac OS X 10.10 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define 
AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_8, __MAC_10_10, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_8, __MAC_10_10, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_10 #define AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10 DEPRECATED_ATTRIBUTE #else @@ -1402,7 +1403,7 @@ * but later deprecated in Mac OS X 10.10 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_9_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_9, __MAC_10_10, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_9_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_9, __MAC_10_10, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_10 #define AVAILABLE_MAC_OS_X_VERSION_10_9_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10 DEPRECATED_ATTRIBUTE #else @@ -1415,7 +1416,7 @@ * Used on types deprecated in Mac OS X 10.10 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define DEPRECATED_IN_MAC_OS_X_VERSION_10_10_AND_LATER __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_0, __MAC_10_10, __IPHONE_NA, __IPHONE_NA) + #define DEPRECATED_IN_MAC_OS_X_VERSION_10_10_AND_LATER __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_0, __MAC_10_10, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_10 #define DEPRECATED_IN_MAC_OS_X_VERSION_10_10_AND_LATER DEPRECATED_ATTRIBUTE #else @@ -1429,7 +1430,7 @@ * Used on declarations introduced in Mac OS X 10.10.2 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_10_2_AND_LATER __OSX_AVAILABLE_STARTING(__MAC_10_10_2, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_10_2_AND_LATER 
__OSX_AVAILABLE_STARTING(__MAC_10_10_2, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MAX_ALLOWED < MAC_OS_X_VERSION_10_10_2 #define AVAILABLE_MAC_OS_X_VERSION_10_10_2_AND_LATER UNAVAILABLE_ATTRIBUTE #elif MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_10_10_2 @@ -1445,7 +1446,7 @@ * and deprecated in Mac OS X 10.10.2 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_10_2_AND_LATER_BUT_DEPRECATED __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_10_2, __MAC_10_10_2, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_10_2_AND_LATER_BUT_DEPRECATED __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_10_2, __MAC_10_10_2, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_10_2 #define AVAILABLE_MAC_OS_X_VERSION_10_10_2_AND_LATER_BUT_DEPRECATED DEPRECATED_ATTRIBUTE #else @@ -1459,7 +1460,7 @@ * but later deprecated in Mac OS X 10.10.2 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_2 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_0, __MAC_10_10_2, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_2 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_0, __MAC_10_10_2, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_10_2 #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_2 DEPRECATED_ATTRIBUTE #else @@ -1473,7 +1474,7 @@ * but later deprecated in Mac OS X 10.10.2 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_2 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_1, __MAC_10_10_2, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_2 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_1, __MAC_10_10_2, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED 
>= MAC_OS_X_VERSION_10_10_2 #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_2 DEPRECATED_ATTRIBUTE #else @@ -1487,7 +1488,7 @@ * but later deprecated in Mac OS X 10.10.2 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_2 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_2, __MAC_10_10_2, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_2 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_2, __MAC_10_10_2, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_10_2 #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_2 DEPRECATED_ATTRIBUTE #else @@ -1501,7 +1502,7 @@ * but later deprecated in Mac OS X 10.10.2 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_2 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_3, __MAC_10_10_2, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_2 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_3, __MAC_10_10_2, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_10_2 #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_2 DEPRECATED_ATTRIBUTE #else @@ -1515,7 +1516,7 @@ * but later deprecated in Mac OS X 10.10.2 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_2 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_4, __MAC_10_10_2, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_2 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_4, __MAC_10_10_2, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_10_2 
#define AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_2 DEPRECATED_ATTRIBUTE #else @@ -1529,7 +1530,7 @@ * but later deprecated in Mac OS X 10.10.2 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_2 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_5, __MAC_10_10_2, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_2 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_5, __MAC_10_10_2, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_10_2 #define AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_2 DEPRECATED_ATTRIBUTE #else @@ -1543,7 +1544,7 @@ * but later deprecated in Mac OS X 10.10.2 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_2 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_6, __MAC_10_10_2, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_2 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_6, __MAC_10_10_2, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_10_2 #define AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_2 DEPRECATED_ATTRIBUTE #else @@ -1557,7 +1558,7 @@ * but later deprecated in Mac OS X 10.10.2 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_2 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_7, __MAC_10_10_2, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_2 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_7, __MAC_10_10_2, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_10_2 #define 
AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_2 DEPRECATED_ATTRIBUTE #else @@ -1571,7 +1572,7 @@ * but later deprecated in Mac OS X 10.10.2 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_2 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_8, __MAC_10_10_2, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_2 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_8, __MAC_10_10_2, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_10_2 #define AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_2 DEPRECATED_ATTRIBUTE #else @@ -1585,7 +1586,7 @@ * but later deprecated in Mac OS X 10.10.2 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_9_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_2 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_9, __MAC_10_10_2, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_9_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_2 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_9, __MAC_10_10_2, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_10_2 #define AVAILABLE_MAC_OS_X_VERSION_10_9_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_2 DEPRECATED_ATTRIBUTE #else @@ -1599,7 +1600,7 @@ * but later deprecated in Mac OS X 10.10.2 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_10_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_2 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_10, __MAC_10_10_2, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_10_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_2 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_10, __MAC_10_10_2, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_10_2 #define 
AVAILABLE_MAC_OS_X_VERSION_10_10_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_2 DEPRECATED_ATTRIBUTE #else @@ -1612,7 +1613,7 @@ * Used on types deprecated in Mac OS X 10.10.2 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define DEPRECATED_IN_MAC_OS_X_VERSION_10_10_2_AND_LATER __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_0, __MAC_10_10_2, __IPHONE_NA, __IPHONE_NA) + #define DEPRECATED_IN_MAC_OS_X_VERSION_10_10_2_AND_LATER __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_0, __MAC_10_10_2, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_10_2 #define DEPRECATED_IN_MAC_OS_X_VERSION_10_10_2_AND_LATER DEPRECATED_ATTRIBUTE #else @@ -1626,7 +1627,7 @@ * Used on declarations introduced in Mac OS X 10.10.3 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_10_3_AND_LATER __OSX_AVAILABLE_STARTING(__MAC_10_10_3, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_10_3_AND_LATER __OSX_AVAILABLE_STARTING(__MAC_10_10_3, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MAX_ALLOWED < MAC_OS_X_VERSION_10_10_3 #define AVAILABLE_MAC_OS_X_VERSION_10_10_3_AND_LATER UNAVAILABLE_ATTRIBUTE #elif MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_10_10_3 @@ -1642,7 +1643,7 @@ * and deprecated in Mac OS X 10.10.3 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_10_3_AND_LATER_BUT_DEPRECATED __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_10_3, __MAC_10_10_3, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_10_3_AND_LATER_BUT_DEPRECATED __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_10_3, __MAC_10_10_3, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_10_3 #define AVAILABLE_MAC_OS_X_VERSION_10_10_3_AND_LATER_BUT_DEPRECATED DEPRECATED_ATTRIBUTE #else @@ -1656,7 +1657,7 @@ * but later deprecated in Mac OS X 10.10.3 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_3 
__OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_0, __MAC_10_10_3, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_3 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_0, __MAC_10_10_3, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_10_3 #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_3 DEPRECATED_ATTRIBUTE #else @@ -1670,7 +1671,7 @@ * but later deprecated in Mac OS X 10.10.3 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_3 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_1, __MAC_10_10_3, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_3 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_1, __MAC_10_10_3, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_10_3 #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_3 DEPRECATED_ATTRIBUTE #else @@ -1684,7 +1685,7 @@ * but later deprecated in Mac OS X 10.10.3 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_3 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_2, __MAC_10_10_3, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_3 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_2, __MAC_10_10_3, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_10_3 #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_3 DEPRECATED_ATTRIBUTE #else @@ -1698,7 +1699,7 @@ * but later deprecated in Mac OS X 10.10.3 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_3 
__OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_3, __MAC_10_10_3, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_3 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_3, __MAC_10_10_3, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_10_3 #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_3 DEPRECATED_ATTRIBUTE #else @@ -1712,7 +1713,7 @@ * but later deprecated in Mac OS X 10.10.3 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_3 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_4, __MAC_10_10_3, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_3 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_4, __MAC_10_10_3, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_10_3 #define AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_3 DEPRECATED_ATTRIBUTE #else @@ -1726,7 +1727,7 @@ * but later deprecated in Mac OS X 10.10.3 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_3 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_5, __MAC_10_10_3, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_3 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_5, __MAC_10_10_3, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_10_3 #define AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_3 DEPRECATED_ATTRIBUTE #else @@ -1740,7 +1741,7 @@ * but later deprecated in Mac OS X 10.10.3 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_3 
__OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_6, __MAC_10_10_3, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_3 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_6, __MAC_10_10_3, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_10_3 #define AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_3 DEPRECATED_ATTRIBUTE #else @@ -1754,7 +1755,7 @@ * but later deprecated in Mac OS X 10.10.3 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_3 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_7, __MAC_10_10_3, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_3 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_7, __MAC_10_10_3, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_10_3 #define AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_3 DEPRECATED_ATTRIBUTE #else @@ -1768,7 +1769,7 @@ * but later deprecated in Mac OS X 10.10.3 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_3 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_8, __MAC_10_10_3, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_3 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_8, __MAC_10_10_3, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_10_3 #define AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_3 DEPRECATED_ATTRIBUTE #else @@ -1782,7 +1783,7 @@ * but later deprecated in Mac OS X 10.10.3 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_9_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_3 
__OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_9, __MAC_10_10_3, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_9_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_3 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_9, __MAC_10_10_3, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_10_3 #define AVAILABLE_MAC_OS_X_VERSION_10_9_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_3 DEPRECATED_ATTRIBUTE #else @@ -1796,7 +1797,7 @@ * but later deprecated in Mac OS X 10.10.3 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_10_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_3 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_10, __MAC_10_10_3, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_10_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_3 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_10, __MAC_10_10_3, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_10_3 #define AVAILABLE_MAC_OS_X_VERSION_10_10_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_3 DEPRECATED_ATTRIBUTE #else @@ -1810,7 +1811,7 @@ * but later deprecated in Mac OS X 10.10.3 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_3 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_10_2, __MAC_10_10_3, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_3 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_10_2, __MAC_10_10_3, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_10_3 #define AVAILABLE_MAC_OS_X_VERSION_10_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_10_3 DEPRECATED_ATTRIBUTE #else @@ -1823,7 +1824,7 @@ * Used on types deprecated in Mac OS X 10.10.3 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define DEPRECATED_IN_MAC_OS_X_VERSION_10_10_3_AND_LATER __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_0, 
__MAC_10_10_3, __IPHONE_NA, __IPHONE_NA) + #define DEPRECATED_IN_MAC_OS_X_VERSION_10_10_3_AND_LATER __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_0, __MAC_10_10_3, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_10_3 #define DEPRECATED_IN_MAC_OS_X_VERSION_10_10_3_AND_LATER DEPRECATED_ATTRIBUTE #else @@ -1837,7 +1838,7 @@ * Used on declarations introduced in Mac OS X 10.11 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_11_AND_LATER __OSX_AVAILABLE_STARTING(__MAC_10_11, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_11_AND_LATER __OSX_AVAILABLE_STARTING(__MAC_10_11, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MAX_ALLOWED < MAC_OS_X_VERSION_10_11 #define AVAILABLE_MAC_OS_X_VERSION_10_11_AND_LATER UNAVAILABLE_ATTRIBUTE #elif MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_10_11 @@ -1853,7 +1854,7 @@ * and deprecated in Mac OS X 10.11 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_11_AND_LATER_BUT_DEPRECATED __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_11, __MAC_10_11, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_11_AND_LATER_BUT_DEPRECATED __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_11, __MAC_10_11, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_11 #define AVAILABLE_MAC_OS_X_VERSION_10_11_AND_LATER_BUT_DEPRECATED DEPRECATED_ATTRIBUTE #else @@ -1867,7 +1868,7 @@ * but later deprecated in Mac OS X 10.11 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_11 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_0, __MAC_10_11, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_11 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_0, __MAC_10_11, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_11 #define 
AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_11 DEPRECATED_ATTRIBUTE #else @@ -1881,7 +1882,7 @@ * but later deprecated in Mac OS X 10.11 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_11 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_1, __MAC_10_11, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_11 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_1, __MAC_10_11, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_11 #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_11 DEPRECATED_ATTRIBUTE #else @@ -1895,7 +1896,7 @@ * but later deprecated in Mac OS X 10.11 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_11 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_2, __MAC_10_11, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_11 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_2, __MAC_10_11, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_11 #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_11 DEPRECATED_ATTRIBUTE #else @@ -1909,7 +1910,7 @@ * but later deprecated in Mac OS X 10.11 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_11 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_3, __MAC_10_11, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_11 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_3, __MAC_10_11, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_11 #define 
AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_11 DEPRECATED_ATTRIBUTE #else @@ -1923,7 +1924,7 @@ * but later deprecated in Mac OS X 10.11 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_11 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_4, __MAC_10_11, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_11 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_4, __MAC_10_11, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_11 #define AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_11 DEPRECATED_ATTRIBUTE #else @@ -1937,7 +1938,7 @@ * but later deprecated in Mac OS X 10.11 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_11 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_5, __MAC_10_11, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_11 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_5, __MAC_10_11, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_11 #define AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_11 DEPRECATED_ATTRIBUTE #else @@ -1951,7 +1952,7 @@ * but later deprecated in Mac OS X 10.11 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_11 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_6, __MAC_10_11, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_11 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_6, __MAC_10_11, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_11 #define 
AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_11 DEPRECATED_ATTRIBUTE #else @@ -1965,7 +1966,7 @@ * but later deprecated in Mac OS X 10.11 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_11 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_7, __MAC_10_11, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_11 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_7, __MAC_10_11, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_11 #define AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_11 DEPRECATED_ATTRIBUTE #else @@ -1979,7 +1980,7 @@ * but later deprecated in Mac OS X 10.11 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_11 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_8, __MAC_10_11, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_11 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_8, __MAC_10_11, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_11 #define AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_11 DEPRECATED_ATTRIBUTE #else @@ -1993,7 +1994,7 @@ * but later deprecated in Mac OS X 10.11 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_9_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_11 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_9, __MAC_10_11, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_9_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_11 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_9, __MAC_10_11, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_11 #define 
AVAILABLE_MAC_OS_X_VERSION_10_9_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_11 DEPRECATED_ATTRIBUTE #else @@ -2007,7 +2008,7 @@ * but later deprecated in Mac OS X 10.11 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_10_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_11 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_10, __MAC_10_11, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_10_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_11 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_10, __MAC_10_11, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_11 #define AVAILABLE_MAC_OS_X_VERSION_10_10_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_11 DEPRECATED_ATTRIBUTE #else @@ -2021,7 +2022,7 @@ * but later deprecated in Mac OS X 10.11 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_11 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_10_2, __MAC_10_11, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_11 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_10_2, __MAC_10_11, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_11 #define AVAILABLE_MAC_OS_X_VERSION_10_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_11 DEPRECATED_ATTRIBUTE #else @@ -2035,7 +2036,7 @@ * but later deprecated in Mac OS X 10.11 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define AVAILABLE_MAC_OS_X_VERSION_10_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_11 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_10_3, __MAC_10_11, __IPHONE_NA, __IPHONE_NA) + #define AVAILABLE_MAC_OS_X_VERSION_10_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_11 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_10_3, __MAC_10_11, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_11 #define 
AVAILABLE_MAC_OS_X_VERSION_10_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_11 DEPRECATED_ATTRIBUTE #else @@ -2048,7 +2049,7 @@ * Used on types deprecated in Mac OS X 10.11 */ #if __AVAILABILITY_MACROS_USES_AVAILABILITY - #define DEPRECATED_IN_MAC_OS_X_VERSION_10_11_AND_LATER __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_0, __MAC_10_11, __IPHONE_NA, __IPHONE_NA) + #define DEPRECATED_IN_MAC_OS_X_VERSION_10_11_AND_LATER __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_0, __MAC_10_11, __IPHONE_4_0, __IPHONE_4_0) #elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_11 #define DEPRECATED_IN_MAC_OS_X_VERSION_10_11_AND_LATER DEPRECATED_ATTRIBUTE #else @@ -2056,6 +2057,245 @@ #endif +/* + * AVAILABLE_MAC_OS_X_VERSION_10_12_AND_LATER + * + * Used on declarations introduced in Mac OS X 10.12 + */ +#if __AVAILABILITY_MACROS_USES_AVAILABILITY + #define AVAILABLE_MAC_OS_X_VERSION_10_12_AND_LATER __OSX_AVAILABLE_STARTING(__MAC_10_12, __IPHONE_4_0) +#elif MAC_OS_X_VERSION_MAX_ALLOWED < MAC_OS_X_VERSION_10_12 + #define AVAILABLE_MAC_OS_X_VERSION_10_12_AND_LATER UNAVAILABLE_ATTRIBUTE +#elif MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_10_12 + #define AVAILABLE_MAC_OS_X_VERSION_10_12_AND_LATER WEAK_IMPORT_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_12_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_12_AND_LATER_BUT_DEPRECATED + * + * Used on declarations introduced in Mac OS X 10.12, + * and deprecated in Mac OS X 10.12 + */ +#if __AVAILABILITY_MACROS_USES_AVAILABILITY + #define AVAILABLE_MAC_OS_X_VERSION_10_12_AND_LATER_BUT_DEPRECATED __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_12, __MAC_10_12, __IPHONE_4_0, __IPHONE_4_0) +#elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_12 + #define AVAILABLE_MAC_OS_X_VERSION_10_12_AND_LATER_BUT_DEPRECATED DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_12_AND_LATER_BUT_DEPRECATED AVAILABLE_MAC_OS_X_VERSION_10_12_AND_LATER +#endif + +/* + * 
AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_12 + * + * Used on declarations introduced in Mac OS X 10.0, + * but later deprecated in Mac OS X 10.12 + */ +#if __AVAILABILITY_MACROS_USES_AVAILABILITY + #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_12 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_0, __MAC_10_12, __IPHONE_4_0, __IPHONE_4_0) +#elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_12 + #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_12 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_12 AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_12 + * + * Used on declarations introduced in Mac OS X 10.1, + * but later deprecated in Mac OS X 10.12 + */ +#if __AVAILABILITY_MACROS_USES_AVAILABILITY + #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_12 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_1, __MAC_10_12, __IPHONE_4_0, __IPHONE_4_0) +#elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_12 + #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_12 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_12 AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_12 + * + * Used on declarations introduced in Mac OS X 10.2, + * but later deprecated in Mac OS X 10.12 + */ +#if __AVAILABILITY_MACROS_USES_AVAILABILITY + #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_12 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_2, __MAC_10_12, __IPHONE_4_0, __IPHONE_4_0) +#elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_12 + #define 
AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_12 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_12 AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_12 + * + * Used on declarations introduced in Mac OS X 10.3, + * but later deprecated in Mac OS X 10.12 + */ +#if __AVAILABILITY_MACROS_USES_AVAILABILITY + #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_12 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_3, __MAC_10_12, __IPHONE_4_0, __IPHONE_4_0) +#elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_12 + #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_12 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_12 AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_12 + * + * Used on declarations introduced in Mac OS X 10.4, + * but later deprecated in Mac OS X 10.12 + */ +#if __AVAILABILITY_MACROS_USES_AVAILABILITY + #define AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_12 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_4, __MAC_10_12, __IPHONE_4_0, __IPHONE_4_0) +#elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_12 + #define AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_12 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_12 AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_12 + * + * Used on declarations introduced in Mac OS X 10.5, + * but later deprecated in Mac OS X 10.12 + */ +#if __AVAILABILITY_MACROS_USES_AVAILABILITY + 
#define AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_12 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_5, __MAC_10_12, __IPHONE_4_0, __IPHONE_4_0) +#elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_12 + #define AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_12 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_12 AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_12 + * + * Used on declarations introduced in Mac OS X 10.6, + * but later deprecated in Mac OS X 10.12 + */ +#if __AVAILABILITY_MACROS_USES_AVAILABILITY + #define AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_12 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_6, __MAC_10_12, __IPHONE_4_0, __IPHONE_4_0) +#elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_12 + #define AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_12 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_12 AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_12 + * + * Used on declarations introduced in Mac OS X 10.7, + * but later deprecated in Mac OS X 10.12 + */ +#if __AVAILABILITY_MACROS_USES_AVAILABILITY + #define AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_12 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_7, __MAC_10_12, __IPHONE_4_0, __IPHONE_4_0) +#elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_12 + #define AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_12 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_12 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER +#endif + 
+/* + * AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_12 + * + * Used on declarations introduced in Mac OS X 10.8, + * but later deprecated in Mac OS X 10.12 + */ +#if __AVAILABILITY_MACROS_USES_AVAILABILITY + #define AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_12 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_8, __MAC_10_12, __IPHONE_4_0, __IPHONE_4_0) +#elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_12 + #define AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_12 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_12 AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_9_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_12 + * + * Used on declarations introduced in Mac OS X 10.9, + * but later deprecated in Mac OS X 10.12 + */ +#if __AVAILABILITY_MACROS_USES_AVAILABILITY + #define AVAILABLE_MAC_OS_X_VERSION_10_9_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_12 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_9, __MAC_10_12, __IPHONE_4_0, __IPHONE_4_0) +#elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_12 + #define AVAILABLE_MAC_OS_X_VERSION_10_9_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_12 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_9_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_12 AVAILABLE_MAC_OS_X_VERSION_10_9_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_10_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_12 + * + * Used on declarations introduced in Mac OS X 10.10, + * but later deprecated in Mac OS X 10.12 + */ +#if __AVAILABILITY_MACROS_USES_AVAILABILITY + #define AVAILABLE_MAC_OS_X_VERSION_10_10_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_12 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_10, __MAC_10_12, __IPHONE_4_0, __IPHONE_4_0) +#elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_12 + #define 
AVAILABLE_MAC_OS_X_VERSION_10_10_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_12 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_10_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_12 AVAILABLE_MAC_OS_X_VERSION_10_10_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_12 + * + * Used on declarations introduced in Mac OS X 10.10.2, + * but later deprecated in Mac OS X 10.12 + */ +#if __AVAILABILITY_MACROS_USES_AVAILABILITY + #define AVAILABLE_MAC_OS_X_VERSION_10_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_12 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_10_2, __MAC_10_12, __IPHONE_4_0, __IPHONE_4_0) +#elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_12 + #define AVAILABLE_MAC_OS_X_VERSION_10_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_12 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_12 AVAILABLE_MAC_OS_X_VERSION_10_10_2_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_12 + * + * Used on declarations introduced in Mac OS X 10.10.3, + * but later deprecated in Mac OS X 10.12 + */ +#if __AVAILABILITY_MACROS_USES_AVAILABILITY + #define AVAILABLE_MAC_OS_X_VERSION_10_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_12 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_10_3, __MAC_10_12, __IPHONE_4_0, __IPHONE_4_0) +#elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_12 + #define AVAILABLE_MAC_OS_X_VERSION_10_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_12 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_12 AVAILABLE_MAC_OS_X_VERSION_10_10_3_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_11_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_12 + * + * Used on declarations introduced in Mac OS X 10.11, + * but later deprecated in Mac OS X 10.12 + */ 
+#if __AVAILABILITY_MACROS_USES_AVAILABILITY + #define AVAILABLE_MAC_OS_X_VERSION_10_11_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_12 __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_11, __MAC_10_12, __IPHONE_4_0, __IPHONE_4_0) +#elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_12 + #define AVAILABLE_MAC_OS_X_VERSION_10_11_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_12 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_11_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_12 AVAILABLE_MAC_OS_X_VERSION_10_11_AND_LATER +#endif + +/* + * DEPRECATED_IN_MAC_OS_X_VERSION_10_12_AND_LATER + * + * Used on types deprecated in Mac OS X 10.12 + */ +#if __AVAILABILITY_MACROS_USES_AVAILABILITY + #define DEPRECATED_IN_MAC_OS_X_VERSION_10_12_AND_LATER __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_0, __MAC_10_12, __IPHONE_4_0, __IPHONE_4_0) +#elif MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_12 + #define DEPRECATED_IN_MAC_OS_X_VERSION_10_12_AND_LATER DEPRECATED_ATTRIBUTE +#else + #define DEPRECATED_IN_MAC_OS_X_VERSION_10_12_AND_LATER +#endif + + #endif /* __AVAILABILITYMACROS__ */ diff --git a/EXTERNAL_HEADERS/Makefile b/EXTERNAL_HEADERS/Makefile index edf9fe869..988c22cee 100644 --- a/EXTERNAL_HEADERS/Makefile +++ b/EXTERNAL_HEADERS/Makefile @@ -3,7 +3,6 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - include $(MakeInc_cmd) include $(MakeInc_def) @@ -34,7 +33,7 @@ KERNEL_FILES = \ stdbool.h \ stdint.h -INSTALL_MI_LIST = +INSTALL_MI_LIST = INSTALL_MI_DIR = . 
diff --git a/EXTERNAL_HEADERS/architecture/Makefile b/EXTERNAL_HEADERS/architecture/Makefile index ea393a5bf..9e0a1312b 100644 --- a/EXTERNAL_HEADERS/architecture/Makefile +++ b/EXTERNAL_HEADERS/architecture/Makefile @@ -3,7 +3,6 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - include $(MakeInc_cmd) include $(MakeInc_def) @@ -21,9 +20,9 @@ INSTINC_SUBDIRS_ARM = \ INSTINC_SUBDIRS_ARM64 = \ arm -EXPORT_FILES = +EXPORT_FILES = -INSTALL_MI_LIST = +INSTALL_MI_LIST = INSTALL_MI_DIR = architecture @@ -35,4 +34,3 @@ EXPORT_MI_DIR = architecture include $(MakeInc_rule) include $(MakeInc_dir) - diff --git a/EXTERNAL_HEADERS/architecture/i386/Makefile b/EXTERNAL_HEADERS/architecture/i386/Makefile index e4c02e150..50c9a8df9 100644 --- a/EXTERNAL_HEADERS/architecture/i386/Makefile +++ b/EXTERNAL_HEADERS/architecture/i386/Makefile @@ -3,11 +3,10 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - include $(MakeInc_cmd) include $(MakeInc_def) -EXPORT_FILES = \ +EXPORT_FILES = \ asm_help.h \ cpu.h \ io.h \ @@ -18,10 +17,9 @@ EXPORT_FILES = \ reg_help.h \ table.h +INSTALL_MD_LIST = -INSTALL_MD_LIST = - -INSTALL_MD_DIR = +INSTALL_MD_DIR = EXPORT_MD_LIST = ${EXPORT_FILES} @@ -29,5 +27,3 @@ EXPORT_MD_DIR = architecture/i386 include $(MakeInc_rule) include $(MakeInc_dir) - - diff --git a/EXTERNAL_HEADERS/corecrypto/cc.h b/EXTERNAL_HEADERS/corecrypto/cc.h index 6a05f106c..6b01e33c1 100644 --- a/EXTERNAL_HEADERS/corecrypto/cc.h +++ b/EXTERNAL_HEADERS/corecrypto/cc.h @@ -29,9 +29,9 @@ The resulting struct can be used to create arrays that are aligned by a certain amount. 
*/ #define cc_aligned_struct(_alignment_) \ - typedef struct { \ - uint8_t b[_alignment_]; \ - } __attribute__((aligned(_alignment_))) +typedef struct { \ +uint8_t b[_alignment_]; \ +} CC_ALIGNED(_alignment_) /* number of array elements used in a cc_ctx_decl */ #define cc_ctx_n(_type_, _size_) ((_size_ + sizeof(_type_) - 1) / sizeof(_type_)) @@ -39,16 +39,21 @@ /* sizeof of a context declared with cc_ctx_decl */ #define cc_ctx_sizeof(_type_, _size_) sizeof(_type_[cc_ctx_n(_type_, _size_)]) -#define cc_ctx_decl(_type_, _size_, _name_) \ - _type_ _name_[cc_ctx_n(_type_, _size_)] - -#if CC_HAS_BZERO -#define cc_zero(_size_,_data_) bzero((_data_), (_size_)) +//- WARNING: The _MSC_VER version of cc_ctx_decl() is not compatible with the way *_decl macros are used in CommonCrypto, AppleKeyStore and SecurityFrameworks +// to observe the incompatibilities and errors, use below definition. Corecrypto itself, accepts both deinitions +// #define cc_ctx_decl(_type_, _size_, _name_) _type_ _name_ ## _array[cc_ctx_n(_type_, (_size_))]; _type_ *_name_ = _name_ ## _array +//- Never use sizeof() operator for the variables declared with cc_ctx_decl(), because it is not be compatible with the _MSC_VER version of cc_ctx_decl(). +#if defined(_MSC_VER) + #define UNIQUE_ARRAY(data_type, _var_, total_count) data_type* _var_ = (data_type*)_alloca(sizeof(data_type)*(total_count)); + #define cc_ctx_decl(_type_, _size_, _name_) UNIQUE_ARRAY(_type_, _name_,cc_ctx_n(_type_, (_size_))) #else -/* Alternate version if you don't have bzero. */ -#define cc_zero(_size_,_data_) memset((_data_),0 ,(_size_)) + #define cc_ctx_decl(_type_, _size_, _name_) _type_ _name_ [cc_ctx_n(_type_, _size_)] #endif +/* bzero is deprecated. memset is the way to go */ +/* FWIW, L4, HEXAGON and ARMCC even with gnu compatibility mode don't have bzero */ +#define cc_zero(_size_,_data_) memset((_data_),0 ,(_size_)) + /* cc_clear: Set "len" bytes of memory to zero at address "dst". 
cc_clear has been developed so that it won't be optimized out. @@ -69,19 +74,17 @@ void cc_xor(size_t size, void *r, const void *s, const void *t) { } } -/* cc_cmp_safe: - Compare "num" pointed by ptr1 and ptr2, array of identical size. - Functional behavior: Returns 0 if the "num" bytes starting at ptr1 are identical to the "num" - bytes starting at ptr2. - Return !=0 if they are different or if "num" is 0 (empty arrays) - Security: The execution time/cycles is *independent* of the data and therefore guarantees - no leak about the data. - However, the execution time depends on "num". -*/ +/*! + @brief cc_cmp_safe(num, pt1, pt2) compares two array ptr1 and ptr2 of num bytes. + @discussion The execution time/cycles is independent of the data and therefore guarantees no leak about the data. However, the execution time depends on num. + @param num number of bytes in each array + @param ptr1 input array + @param ptr2 input array + @return returns 0 if the num bytes starting at ptr1 are identical to the num bytes starting at ptr2 and 1 if they are different or if num is 0 (empty arrays). + */ CC_NONNULL2 CC_NONNULL3 int cc_cmp_safe (size_t num, const void * ptr1, const void * ptr2); - /* Exchange S and T of any type. NOTE: Both and S and T are evaluated mutliple times and MUST NOT be expressions. 
*/ #define CC_SWAP(S,T) do { \ diff --git a/EXTERNAL_HEADERS/corecrypto/cc_config.h b/EXTERNAL_HEADERS/corecrypto/cc_config.h index 45979d8cf..2f78c45a6 100644 --- a/EXTERNAL_HEADERS/corecrypto/cc_config.h +++ b/EXTERNAL_HEADERS/corecrypto/cc_config.h @@ -45,280 +45,382 @@ */ -#if (defined(DEBUG) && (DEBUG)) +//Do not set these macros to 1, unless you are developing/testing for Windows +#define CORECRYPTO_SIMULATE_WINDOWS_ENVIRONMENT 0 +#define CORECRYPTO_HACK_FOR_WINDOWS_DEVELOPMENT 0 //to be removed after port corecrypto to Windows + +//this macro is used to turn on/off usage of transparent union in corecrypto +//it should be commented out in corecrypto and be used only by the software that use corecrypto +//#define CORECRYPTO_DONOT_USE_TRANSPARENT_UNION +#ifdef CORECRYPTO_DONOT_USE_TRANSPARENT_UNION + #define CORECRYPTO_USE_TRANSPARENT_UNION 0 +#else + #define CORECRYPTO_USE_TRANSPARENT_UNION 1 +#endif + +#if (defined(DEBUG) && (DEBUG)) || defined(_DEBUG) //MSVC defines _DEBUG /* CC_DEBUG is already used in CommonCrypto */ -#define CORECRYPTO_DEBUG 1 + #define CORECRYPTO_DEBUG 1 #else -#define CORECRYPTO_DEBUG 0 + #define CORECRYPTO_DEBUG 0 #endif +// This macro can be used to enable prints when a condition in the macro "cc_require" +// is false. 
This is especially useful to confirm that negative testing fails +// at the intended location +#define CORECRYPTO_DEBUG_ENABLE_CC_REQUIRE_PRINTS 0 + #if defined(KERNEL) && (KERNEL) -#define CC_KERNEL 1 // KEXT, XNU repo or kernel components such as AppleKeyStore + #define CC_KERNEL 1 // KEXT, XNU repo or kernel components such as AppleKeyStore #else -#define CC_KERNEL 0 + #define CC_KERNEL 0 #endif // LINUX_BUILD_TEST is for sanity check of the configuration // > xcodebuild -scheme "corecrypto_test" OTHER_CFLAGS="$(values) -DLINUX_BUILD_TEST" #if defined(__linux__) || defined(LINUX_BUILD_TEST) -#define CC_LINUX 1 + #define CC_LINUX 1 #else -#define CC_LINUX 0 + #define CC_LINUX 0 #endif #if defined(USE_L4) && (USE_L4) -#define CC_USE_L4 1 + #define CC_USE_L4 1 #else -#define CC_USE_L4 0 + #define CC_USE_L4 0 #endif #if defined(USE_SEPROM) && (USE_SEPROM) -#define CC_USE_SEPROM 1 + #define CC_USE_SEPROM 1 #else -#define CC_USE_SEPROM 0 + #define CC_USE_SEPROM 0 #endif #if defined(USE_S3) && (USE_S3) -#define CC_USE_S3 1 + #define CC_USE_S3 1 #else -#define CC_USE_S3 0 + #define CC_USE_S3 0 #endif -#if defined(MAVERICK) && (MAVERICK) -#define CC_MAVERICK 1 +#if (defined(ICE_FEATURES_ENABLED)) || (defined(MAVERICK) && (MAVERICK)) + #define CC_BASEBAND 1 #else -#define CC_MAVERICK 0 + #define CC_BASEBAND 0 +#endif + +#if defined(EFI) && (EFI) + #define CC_EFI 1 +#else + #define CC_EFI 0 #endif #if defined(IBOOT) && (IBOOT) -#define CC_IBOOT 1 + #define CC_IBOOT 1 #else -#define CC_IBOOT 0 + #define CC_IBOOT 0 #endif // BB configuration -#if CC_MAVERICK +#if CC_BASEBAND // -- ENDIANESS -#if defined(ENDIAN_LITTLE) || (defined(__arm__) && !defined(__BIG_ENDIAN)) -#define __LITTLE_ENDIAN__ -#elif !defined(ENDIAN_BIG) && !defined(__BIG_ENDIAN) -#error Baseband endianess not defined. 
-#endif -#define AESOPT_ENDIAN_NO_FILE + #if defined(ENDIAN_LITTLE) || (defined(__arm__) && !defined(__BIG_ENDIAN)) + #define __LITTLE_ENDIAN__ + #elif !defined(ENDIAN_BIG) && !defined(__BIG_ENDIAN) + #error Baseband endianess not defined. + #endif + #define AESOPT_ENDIAN_NO_FILE // -- Architecture -#define CCN_UNIT_SIZE 4 // 32 bits -#define aligned(x) aligned((x)>8?8:(x)) // Alignment on 8 bytes max -#define SAFE_IO // AES support for unaligned Input/Output + #define CCN_UNIT_SIZE 4 // 32 bits + #define SAFE_IO // AES support for unaligned Input/Output // -- External function -#define assert ASSERT // sanity + #define assert ASSERT // sanity // -- Warnings // Ignore irrelevant warnings after verification // #1254-D: arithmetic on pointer to void or function type // #186-D: pointless comparison of unsigned integer with zero // #546-D: transfer of control bypasses initialization of -#if defined(__GNUC__) + #if defined(__GNUC__) // warning: pointer of type 'void *' used in arithmetic -#pragma GCC diagnostic ignored "-Wpointer-arith" -#endif // arm or gnuc + #pragma GCC diagnostic ignored "-Wpointer-arith" + #endif // arm or gnuc -#endif // MAVERICK +#endif // CC_BASEBAND -#if !defined(CCN_UNIT_SIZE) -#if defined(__arm64__) || defined(__x86_64__) -#define CCN_UNIT_SIZE 8 -#elif defined(__arm__) || defined(__i386__) -#define CCN_UNIT_SIZE 4 +//CC_XNU_KERNEL_AVAILABLE indicates the availibity of XNU kernel functions, +//like what we have on OSX, iOS, tvOS, Watch OS +#if defined(__APPLE__) && defined(__MACH__) + #define CC_XNU_KERNEL_AVAILABLE 1 #else -#define CCN_UNIT_SIZE 2 + #define CC_XNU_KERNEL_AVAILABLE 0 #endif + +#if !defined(CCN_UNIT_SIZE) + #if defined(__arm64__) || defined(__x86_64__) || defined(_WIN64) + #define CCN_UNIT_SIZE 8 + #elif defined(__arm__) || defined(__i386__) || defined(_WIN32) + #define CCN_UNIT_SIZE 4 + #else + #error undefined architecture + #endif #endif /* !defined(CCN_UNIT_SIZE) */ + +//this allows corecrypto Windows development using 
xcode +#if defined(CORECRYPTO_SIMULATE_WINDOWS_ENVIRONMENT) + #if CORECRYPTO_SIMULATE_WINDOWS_ENVIRONMENT && CC_XNU_KERNEL_AVAILABLE && CORECRYPTO_DEBUG + #define CC_USE_ASM 0 + #define CC_USE_HEAP_FOR_WORKSPACE 1 + #if (CCN_UNIT_SIZE==8) + #define CCN_UINT128_SUPPORT_FOR_64BIT_ARCH 0 + #else + #define CCN_UINT128_SUPPORT_FOR_64BIT_ARCH 1 + #endif + #endif +#endif + +#if !defined(CCN_UINT128_SUPPORT_FOR_64BIT_ARCH) + #if defined(_WIN64) && defined(_WIN32) && (CCN_UNIT_SIZE==8) + #define CCN_UINT128_SUPPORT_FOR_64BIT_ARCH 0 + #elif defined(_WIN32) + #define CCN_UINT128_SUPPORT_FOR_64BIT_ARCH 1//should not be a problem + #else + #define CCN_UINT128_SUPPORT_FOR_64BIT_ARCH 1 + #endif +#endif + +#if __clang__ || CCN_UNIT_SIZE==8 + #define CC_ALIGNED(x) __attribute__ ((aligned(x))) +#elif _MSC_VER + #define CC_ALIGNED(x) __declspec(align(x)) +#else + #define CC_ALIGNED(x) __attribute__ ((aligned((x)>8?8:(x)))) +#endif + + #if defined(__x86_64__) || defined(__i386__) -#define CCN_IOS 0 -#define CCN_OSX 1 + #define CCN_IOS 0 + #define CCN_OSX 1 #endif #if CC_USE_L4 || CC_USE_S3 /* No dynamic linking allowed in L4, e.g. avoid nonlazy symbols */ /* For corecrypto kext, CC_STATIC should be undefined */ -#define CC_STATIC 1 + #define CC_STATIC 1 #endif -#if CC_USE_L4 || CC_IBOOT -/* For L4, stack is too short, need to use HEAP for some computations */ -/* CC_USE_HEAP_FOR_WORKSPACE not supported for KERNEL! */ -#define CC_USE_HEAP_FOR_WORKSPACE 1 -#else -#define CC_USE_HEAP_FOR_WORKSPACE 0 -#endif - -/* L4 do not have bzero, neither does hexagon of ARMCC even with gnu compatibility mode */ -#if CC_USE_L4 || defined(__CC_ARM) || defined(__hexagon__) -#define CC_HAS_BZERO 0 -#else -#define CC_HAS_BZERO 1 +#if !defined(CC_USE_HEAP_FOR_WORKSPACE) + #if CC_USE_L4 || CC_IBOOT || defined(_MSC_VER) + /* For L4, stack is too short, need to use HEAP for some computations */ + /* CC_USE_HEAP_FOR_WORKSPACE not supported for KERNEL! 
*/ + #define CC_USE_HEAP_FOR_WORKSPACE 1 + #else + #define CC_USE_HEAP_FOR_WORKSPACE 0 + #endif #endif /* memset_s is only available in few target */ -#if CC_USE_L4 || CC_KERNEL || CC_IBOOT || CC_USE_SEPROM || defined(__CC_ARM) || defined(__hexagon__) -#define CC_HAS_MEMSET_S 0 +#if CC_KERNEL || CC_USE_SEPROM || defined(__CC_ARM) \ + || defined(__hexagon__) || CC_EFI + #define CC_HAS_MEMSET_S 0 #else -#define CC_HAS_MEMSET_S 1 + #define CC_HAS_MEMSET_S 1 #endif - -#if defined(__CC_ARM) || defined(__hexagon__) || CC_LINUX || defined(__NO_ASM__) -// ARMASM.exe does not to like the file syntax of the asm implementation -#define CCN_DEDICATED_SQR 1 -#define CCN_MUL_KARATSUBA 1 // 4*n CCN_UNIT extra memory required. -#define CCN_ADD_ASM 0 -#define CCN_SUB_ASM 0 -#define CCN_MUL_ASM 0 -#define CCN_ADDMUL1_ASM 0 -#define CCN_MUL1_ASM 0 -#define CCN_CMP_ASM 0 -#define CCN_ADD1_ASM 0 -#define CCN_SUB1_ASM 0 -#define CCN_N_ASM 0 -#define CCN_SET_ASM 0 -#define CCAES_ARM 0 -#define CCAES_INTEL 0 -#define CCN_USE_BUILTIN_CLZ 0 -#if !defined(__NO_ASM__) -#define CCSHA1_VNG_INTEL 0 -#define CCSHA2_VNG_INTEL 0 -#define CCSHA1_VNG_ARMV7NEON 0 -#define CCSHA2_VNG_ARMV7NEON 0 -#endif -#define CCAES_MUX 0 - -#elif defined(__x86_64__) || defined(__i386__) -#define CCN_DEDICATED_SQR 1 -#define CCN_MUL_KARATSUBA 1 // 4*n CCN_UNIT extra memory required. -/* These assembly routines only work for a single CCN_UNIT_SIZE. */ -#if (defined(__x86_64__) && CCN_UNIT_SIZE == 8) || (defined(__i386__) && CCN_UNIT_SIZE == 4) -#define CCN_ADD_ASM 1 -#define CCN_SUB_ASM 1 -#define CCN_MUL_ASM 0 -#else -#define CCN_ADD_ASM 0 -#define CCN_SUB_ASM 0 -#define CCN_MUL_ASM 0 +// Include target conditionals if available. 
+#if defined(__has_include) /* portability */ +#if __has_include() +#include +#endif /* __has_include() */ +#endif /* defined(__has_include) */ + +//- functions implemented in assembly ------------------------------------------ +//this the list of corecrypto clients that use assembly and the clang compiler +#if !(CC_XNU_KERNEL_AVAILABLE || CC_KERNEL || CC_USE_L4 || CC_IBOOT || CC_USE_SEPROM || CC_USE_S3) && !defined(_WIN32) && CORECRYPTO_DEBUG + #warning "You are using the default corecrypto configuration, assembly optimizations may not be available for your platform" #endif -#if (defined(__x86_64__) && CCN_UNIT_SIZE == 8) -#define CCN_CMP_ASM 1 -#define CCN_N_ASM 1 -#else -#define CCN_CMP_ASM 0 -#define CCN_N_ASM 0 +// use this macro to strictly disable assembly regardless of cpu/os/compiler/etc +#if !defined(CC_USE_ASM) + #if defined(_MSC_VER) || CC_LINUX || CC_EFI || CC_BASEBAND + #define CC_USE_ASM 0 + #else + #define CC_USE_ASM 1 + #endif #endif -#define CCN_ADDMUL1_ASM 0 -#define CCN_MUL1_ASM 0 -#define CCN_ADD1_ASM 0 -#define CCN_SUB1_ASM 0 -#define CCN_SET_ASM 0 -#define CCAES_ARM 0 -#define CCAES_INTEL 1 -#define CCAES_MUX 0 -#define CCN_USE_BUILTIN_CLZ 0 -#define CCSHA1_VNG_INTEL 1 -#define CCSHA2_VNG_INTEL 1 -#define CCSHA1_VNG_ARMV7NEON 0 -#define CCSHA2_VNG_ARMV7NEON 0 - +//-(1) ARM V7 +#if defined(_ARM_ARCH_7) && __clang__ && CC_USE_ASM + #define CCN_DEDICATED_SQR 1 + #define CCN_MUL_KARATSUBA 0 // no performance improvement + #define CCN_ADD_ASM 1 + #define CCN_SUB_ASM 1 + #define CCN_MUL_ASM 0 + #define CCN_ADDMUL1_ASM 1 + #define CCN_MUL1_ASM 1 + #define CCN_CMP_ASM 1 + #define CCN_ADD1_ASM 0 + #define CCN_SUB1_ASM 0 + #define CCN_N_ASM 1 + #define CCN_SET_ASM 1 + #define CCN_SHIFT_RIGHT_ASM 1 + #define CCAES_ARM_ASM 1 + #define CCAES_INTEL_ASM 0 + #if CC_KERNEL || CC_USE_L4 || CC_IBOOT || CC_USE_SEPROM || CC_USE_S3 + #define CCAES_MUX 0 + #else + #define CCAES_MUX 1 + #endif + #define CCN_USE_BUILTIN_CLZ 1 + #define CCSHA1_VNG_INTEL 0 + #define 
CCSHA2_VNG_INTEL 0 + + #if defined(__ARM_NEON__) || CC_KERNEL + #define CCSHA1_VNG_ARMV7NEON 1 + #define CCSHA2_VNG_ARMV7NEON 1 + #else /* !defined(__ARM_NEON__) */ + #define CCSHA1_VNG_ARMV7NEON 0 + #define CCSHA2_VNG_ARMV7NEON 0 + #endif /* !defined(__ARM_NEON__) */ + #define CCSHA256_ARMV6M_ASM 0 + +//-(2) ARM 64 +#elif (defined(__x86_64__) || defined(__i386__)) && __clang__ && CC_USE_ASM + #define CCN_DEDICATED_SQR 1 + #define CCN_MUL_KARATSUBA 1 // 4*n CCN_UNIT extra memory required. + /* These assembly routines only work for a single CCN_UNIT_SIZE. */ + #if (defined(__x86_64__) && CCN_UNIT_SIZE == 8) || (defined(__i386__) && CCN_UNIT_SIZE == 4) + #define CCN_ADD_ASM 1 + #define CCN_SUB_ASM 1 + #define CCN_MUL_ASM 1 + #else + #define CCN_ADD_ASM 0 + #define CCN_SUB_ASM 0 + #define CCN_MUL_ASM 0 + #endif + + #if (defined(__x86_64__) && CCN_UNIT_SIZE == 8) + #define CCN_CMP_ASM 1 + #define CCN_N_ASM 1 + #define CCN_SHIFT_RIGHT_ASM 1 + #else + #define CCN_CMP_ASM 0 + #define CCN_N_ASM 0 + #define CCN_SHIFT_RIGHT_ASM 0 + #endif + + #define CCN_ADDMUL1_ASM 0 + #define CCN_MUL1_ASM 0 + #define CCN_ADD1_ASM 0 + #define CCN_SUB1_ASM 0 + #define CCN_SET_ASM 0 + #define CCAES_ARM_ASM 0 + #define CCAES_INTEL_ASM 1 + #define CCAES_MUX 0 + #define CCN_USE_BUILTIN_CLZ 0 + #define CCSHA1_VNG_INTEL 1 + #define CCSHA2_VNG_INTEL 1 + #define CCSHA1_VNG_ARMV7NEON 0 + #define CCSHA2_VNG_ARMV7NEON 0 + #define CCSHA256_ARMV6M_ASM 0 + +//-(4) disable assembly #else -#define CCN_DEDICATED_SQR 1 -#define CCN_MUL_KARATSUBA 1 // 4*n CCN_UNIT extra memory required. 
-#define CCN_ADD_ASM 0 -#define CCN_SUB_ASM 0 -#define CCN_MUL_ASM 0 -#define CCN_ADDMUL1_ASM 0 -#define CCN_MUL1_ASM 0 -#define CCN_CMP_ASM 0 -#define CCN_ADD1_ASM 0 -#define CCN_SUB1_ASM 0 -#define CCN_N_ASM 0 -#define CCN_SET_ASM 0 -#define CCAES_ARM 0 -#define CCAES_INTEL 0 -#define CCAES_MUX 0 -#define CCN_USE_BUILTIN_CLZ 0 -#define CCSHA1_VNG_INTEL 0 -#define CCSHA2_VNG_INTEL 0 -#define CCSHA1_VNG_ARMV7NEON 0 -#define CCSHA2_VNG_ARMV7NEON 0 - -#endif /* !defined(__i386__) */ + #if CCN_UINT128_SUPPORT_FOR_64BIT_ARCH + #define CCN_DEDICATED_SQR 1 + #else + #define CCN_DEDICATED_SQR 0 //when assembly is off and 128-bit integers are not supported, dedicated square is off. This is the case on Windows + #endif + #define CCN_MUL_KARATSUBA 1 // 4*n CCN_UNIT extra memory required. + #define CCN_ADD_ASM 0 + #define CCN_SUB_ASM 0 + #define CCN_MUL_ASM 0 + #define CCN_ADDMUL1_ASM 0 + #define CCN_MUL1_ASM 0 + #define CCN_CMP_ASM 0 + #define CCN_ADD1_ASM 0 + #define CCN_SUB1_ASM 0 + #define CCN_N_ASM 0 + #define CCN_SET_ASM 0 + #define CCN_SHIFT_RIGHT_ASM 0 + #define CCAES_ARM_ASM 0 + #define CCAES_INTEL_ASM 0 + #define CCAES_MUX 0 + #define CCN_USE_BUILTIN_CLZ 0 + #define CCSHA1_VNG_INTEL 0 + #define CCSHA2_VNG_INTEL 0 + #define CCSHA1_VNG_ARMV7NEON 0 + #define CCSHA2_VNG_ARMV7NEON 0 + #define CCSHA256_ARMV6M_ASM 0 + +#endif #define CC_INLINE static inline -#ifdef __GNUC__ -#define CC_NORETURN __attribute__((__noreturn__)) -#define CC_NOTHROW __attribute__((__nothrow__)) -// Transparent Union -#if defined(__CC_ARM) || defined(__hexagon__) -#define CC_NONNULL_TU(N) +#if CORECRYPTO_USE_TRANSPARENT_UNION +// Non null for transparent unions is ambiguous and cause problems +// for most tools (GCC and others: 23919290). 
+ #define CC_NONNULL_TU(N) #else -#define CC_NONNULL_TU(N) __attribute__((__nonnull__ N)) + #define CC_NONNULL_TU(N) CC_NONNULL(N) #endif -#define CC_NONNULL(N) __attribute__((__nonnull__ N)) -#define CC_NONNULL1 __attribute__((__nonnull__(1))) -#define CC_NONNULL2 __attribute__((__nonnull__(2))) -#define CC_NONNULL3 __attribute__((__nonnull__(3))) -#define CC_NONNULL4 __attribute__((__nonnull__(4))) -#define CC_NONNULL5 __attribute__((__nonnull__(5))) -#define CC_NONNULL6 __attribute__((__nonnull__(6))) -#define CC_NONNULL7 __attribute__((__nonnull__(7))) -#define CC_NONNULL_ALL __attribute__((__nonnull__)) -#define CC_SENTINEL __attribute__((__sentinel__)) -#define CC_CONST __attribute__((__const__)) -#define CC_PURE __attribute__((__pure__)) -#define CC_WARN_RESULT __attribute__((__warn_unused_result__)) -#define CC_MALLOC __attribute__((__malloc__)) -#define CC_UNUSED __attribute__((unused)) + +#ifdef __GNUC__ + #define CC_NORETURN __attribute__((__noreturn__)) + #define CC_NOTHROW __attribute__((__nothrow__)) + #define CC_NONNULL(N) __attribute__((__nonnull__ N)) + #define CC_NONNULL1 __attribute__((__nonnull__(1))) + #define CC_NONNULL2 __attribute__((__nonnull__(2))) + #define CC_NONNULL3 __attribute__((__nonnull__(3))) + #define CC_NONNULL4 __attribute__((__nonnull__(4))) + #define CC_NONNULL5 __attribute__((__nonnull__(5))) + #define CC_NONNULL6 __attribute__((__nonnull__(6))) + #define CC_NONNULL7 __attribute__((__nonnull__(7))) + #define CC_NONNULL_ALL __attribute__((__nonnull__)) + #define CC_SENTINEL __attribute__((__sentinel__)) + #define CC_CONST __attribute__((__const__)) + #define CC_PURE __attribute__((__pure__)) + #define CC_WARN_RESULT __attribute__((__warn_unused_result__)) + #define CC_MALLOC __attribute__((__malloc__)) + #define CC_UNUSED __attribute__((unused)) #else /* !__GNUC__ */ /*! @parseOnly */ -#define CC_UNUSED -/*! @parseOnly */ -#define CC_NONNULL_TU(N) + #define CC_UNUSED /*! 
@parseOnly */ -#define CC_NONNULL(N) + #define CC_NONNULL(N) /*! @parseOnly */ -#define CC_NORETURN + #define CC_NORETURN /*! @parseOnly */ -#define CC_NOTHROW + #define CC_NOTHROW /*! @parseOnly */ -#define CC_NONNULL1 + #define CC_NONNULL1 /*! @parseOnly */ -#define CC_NONNULL2 + #define CC_NONNULL2 /*! @parseOnly */ -#define CC_NONNULL3 + #define CC_NONNULL3 /*! @parseOnly */ -#define CC_NONNULL4 + #define CC_NONNULL4 /*! @parseOnly */ -#define CC_NONNULL5 + #define CC_NONNULL5 /*! @parseOnly */ -#define CC_NONNULL6 + #define CC_NONNULL6 /*! @parseOnly */ -#define CC_NONNULL7 + #define CC_NONNULL7 /*! @parseOnly */ -#define CC_NONNULL_ALL + #define CC_NONNULL_ALL /*! @parseOnly */ -#define CC_SENTINEL + #define CC_SENTINEL /*! @parseOnly */ -#define CC_CONST + #define CC_CONST /*! @parseOnly */ -#define CC_PURE + #define CC_PURE /*! @parseOnly */ -#define CC_WARN_RESULT + #define CC_WARN_RESULT /*! @parseOnly */ -#define CC_MALLOC + #define CC_MALLOC #endif /* !__GNUC__ */ diff --git a/EXTERNAL_HEADERS/corecrypto/cc_debug.h b/EXTERNAL_HEADERS/corecrypto/cc_debug.h index a04402247..5c8ebbdc7 100644 --- a/EXTERNAL_HEADERS/corecrypto/cc_debug.h +++ b/EXTERNAL_HEADERS/corecrypto/cc_debug.h @@ -22,9 +22,10 @@ // ======================== #if CC_KERNEL #include -#define cc_printf(x...) printf(x) +#define cc_printf(x...) kprintf(x) extern int printf(const char *format, ...) __printflike(1,2); #elif CC_USE_S3 +#include #define cc_printf(x...) printf(x) #else #include @@ -60,7 +61,10 @@ extern int printf(const char *format, ...) 
__printflike(1,2); // ======================== // Print utilities for corecrypto // ======================== + +#include + /* Print a byte array of arbitrary size */ -void cc_print(const char *label, unsigned long count, const uint8_t *s); +void cc_print(const char *label, size_t count, const uint8_t *s); #endif /* _CORECRYPTO_CCN_DEBUG_H_ */ diff --git a/EXTERNAL_HEADERS/corecrypto/cc_macros.h b/EXTERNAL_HEADERS/corecrypto/cc_macros.h index 4d0b0be38..f678f944d 100644 --- a/EXTERNAL_HEADERS/corecrypto/cc_macros.h +++ b/EXTERNAL_HEADERS/corecrypto/cc_macros.h @@ -21,24 +21,50 @@ #define __CC_DEBUG_ASSERT_PRODUCTION_CODE !CORECRYPTO_DEBUG #endif -#ifndef __CC_DEBUG_ASSERT_MESSAGE -#define __CC_DEBUG_ASSERT_MESSAGE(name, assertion, label, message, file, line, value) \ -cc_printf( "CCAssertMacros: %s, %s file: %s, line: %d\n", assertion, (message!=0) ? message : "", file, line); +#if CORECRYPTO_DEBUG_ENABLE_CC_REQUIRE_PRINTS + +#if !CC_KERNEL + #include // for strstr +#endif // !CC_KERNEL + +CC_UNUSED static char *cc_strstr(const char *file) { +#if CC_KERNEL + (void) file; +#else + const char cc_char []="corecrypto"; + char *p=strstr(file, cc_char); + if (p) return (p+strlen(cc_char)+1); #endif + return NULL; +} + +#define __CC_DEBUG_REQUIRE_MESSAGE(name, assertion, label, message, file, line, value) \ +{char *___t = cc_strstr(file); cc_printf( "require: %s, %s%s:%d\n", assertion, (message!=0) ? 
message : "", ___t==NULL?file:___t, line);} + +#endif // CORECRYPTO_DEBUG_ENABLE_CC_REQUIRE_PRINTS #ifndef cc_require -#if __CC_DEBUG_ASSERT_PRODUCTION_CODE +#if (__CC_DEBUG_ASSERT_PRODUCTION_CODE) || (!CORECRYPTO_DEBUG_ENABLE_CC_REQUIRE_PRINTS) + #if defined(_WIN32) && defined (__clang__) + #define cc_require(assertion, exceptionLabel) \ + do { \ + if (!(assertion) ) { \ + goto exceptionLabel; \ + } \ + } while ( 0 ) + #else #define cc_require(assertion, exceptionLabel) \ do { \ if ( __builtin_expect(!(assertion), 0) ) { \ goto exceptionLabel; \ } \ } while ( 0 ) + #endif #else #define cc_require(assertion, exceptionLabel) \ do { \ if ( __builtin_expect(!(assertion), 0) ) { \ - __CC_DEBUG_ASSERT_MESSAGE(__CC_DEBUG_ASSERT_COMPONENT_NAME_STRING, \ + __CC_DEBUG_REQUIRE_MESSAGE(__CC_DEBUG_ASSERT_COMPONENT_NAME_STRING, \ #assertion, #exceptionLabel, 0, __FILE__, __LINE__, 0); \ goto exceptionLabel; \ } \ @@ -47,7 +73,20 @@ cc_printf( "CCAssertMacros: %s, %s file: %s, line: %d\n", assertion, (message!=0 #endif #ifndef cc_require_action -#if __CC_DEBUG_ASSERT_PRODUCTION_CODE +#if __CC_DEBUG_ASSERT_PRODUCTION_CODE || (!CORECRYPTO_DEBUG_ENABLE_CC_REQUIRE_PRINTS) + #if defined(_WIN32) && defined(__clang__) + #define cc_require_action(assertion, exceptionLabel, action) \ + do \ + { \ + if (!(assertion)) \ + { \ + { \ + action; \ + } \ + goto exceptionLabel; \ + } \ + } while ( 0 ) + #else #define cc_require_action(assertion, exceptionLabel, action) \ do \ { \ @@ -59,13 +98,14 @@ cc_printf( "CCAssertMacros: %s, %s file: %s, line: %d\n", assertion, (message!=0 goto exceptionLabel; \ } \ } while ( 0 ) + #endif #else #define cc_require_action(assertion, exceptionLabel, action) \ do \ { \ if ( __builtin_expect(!(assertion), 0) ) \ { \ - __CC_DEBUG_ASSERT_MESSAGE( \ + __CC_DEBUG_REQUIRE_MESSAGE( \ __CC_DEBUG_ASSERT_COMPONENT_NAME_STRING, \ #assertion, #exceptionLabel, 0, __FILE__, __LINE__, 0); \ { \ diff --git a/EXTERNAL_HEADERS/corecrypto/cc_priv.h 
b/EXTERNAL_HEADERS/corecrypto/cc_priv.h index 2d0a47a5b..417d45c5c 100644 --- a/EXTERNAL_HEADERS/corecrypto/cc_priv.h +++ b/EXTERNAL_HEADERS/corecrypto/cc_priv.h @@ -19,7 +19,6 @@ CC_MEMCPY : optimized memcpy. CC_MEMMOVE : optimized memmove. CC_MEMSET : optimized memset. - CC_BZERO : optimized bzero, CC_STORE32_BE : store 32 bit value in big endian in unaligned buffer. CC_STORE32_LE : store 32 bit value in little endian in unaligned buffer. @@ -72,8 +71,6 @@ The following are not defined yet... define them if needed. #define CC_MEMCPY(D,S,L) memcpy((D),(S),(L)) #define CC_MEMMOVE(D,S,L) memmove((D),(S),(L)) #define CC_MEMSET(D,V,L) memset((D),(V),(L)) -#define CC_BZERO(D,L) memset((D),0,(L)) // Deprecated, DO NOT USE - // MARK: - Loads and Store @@ -122,7 +119,7 @@ x = (((uint64_t)(((const unsigned char *)(y))[7] & 255))<<56) | \ // MARK: -- 32 bits - big endian // MARK: --- intel version -#if (defined(__i386__) || defined(__x86_64__)) +#if (defined(__i386__) || defined(__x86_64__)) && !defined(_MSC_VER) #define CC_STORE32_BE(x, y) \ __asm__ __volatile__ ( \ @@ -159,7 +156,7 @@ x = ((uint32_t)(((const unsigned char *)(y))[0] & 255)<<24) | \ // MARK: --- intel 64 bits version -#if defined(__x86_64__) +#if defined(__x86_64__) && !defined (_MSC_VER) #define CC_STORE64_BE(x, y) \ __asm__ __volatile__ ( \ @@ -208,7 +205,9 @@ x = (((uint64_t)(((const unsigned char *)(y))[0] & 255))<<56) | \ // MARK: -- MSVC version #include -#pragma intrinsic(_lrotr,_lrotl) +#if !defined(__clang__) + #pragma intrinsic(_lrotr,_lrotl) +#endif #define CC_ROR(x,n) _lrotr(x,n) #define CC_ROL(x,n) _lrotl(x,n) #define CC_RORc(x,n) _lrotr(x,n) @@ -217,7 +216,7 @@ x = (((uint64_t)(((const unsigned char *)(y))[0] & 255))<<56) | \ #elif (defined(__i386__) || defined(__x86_64__)) // MARK: -- intel asm version -static inline uint32_t CC_ROL(uint32_t word, int i) +CC_INLINE uint32_t CC_ROL(uint32_t word, int i) { __asm__ ("roll %%cl,%0" :"=r" (word) @@ -225,7 +224,7 @@ static inline uint32_t 
CC_ROL(uint32_t word, int i) return word; } -static inline uint32_t CC_ROR(uint32_t word, int i) +CC_INLINE uint32_t CC_ROR(uint32_t word, int i) { __asm__ ("rorl %%cl,%0" :"=r" (word) @@ -255,12 +254,12 @@ static inline uint32_t CC_ROR(uint32_t word, int i) // MARK: -- default version -static inline uint32_t CC_ROL(uint32_t word, int i) +CC_INLINE uint32_t CC_ROL(uint32_t word, int i) { return ( (word<<(i&31)) | (word>>(32-(i&31))) ); } -static inline uint32_t CC_ROR(uint32_t word, int i) +CC_INLINE uint32_t CC_ROR(uint32_t word, int i) { return ( (word>>(i&31)) | (word<<(32-(i&31))) ); } @@ -272,10 +271,10 @@ static inline uint32_t CC_ROR(uint32_t word, int i) // MARK: - 64 bits rotates -#if defined(__x86_64__) +#if defined(__x86_64__) && !defined(_MSC_VER) //clang _MSVC doesn't support GNU-style inline assembly // MARK: -- intel 64 asm version -static inline uint64_t CC_ROL64(uint64_t word, int i) +CC_INLINE uint64_t CC_ROL64(uint64_t word, int i) { __asm__("rolq %%cl,%0" :"=r" (word) @@ -283,7 +282,7 @@ static inline uint64_t CC_ROL64(uint64_t word, int i) return word; } -static inline uint64_t CC_ROR64(uint64_t word, int i) +CC_INLINE uint64_t CC_ROR64(uint64_t word, int i) { __asm__("rorq %%cl,%0" :"=r" (word) @@ -315,12 +314,12 @@ static inline uint64_t CC_ROR64(uint64_t word, int i) // MARK: -- default C version -static inline uint64_t CC_ROL64(uint64_t word, int i) +CC_INLINE uint64_t CC_ROL64(uint64_t word, int i) { return ( (word<<(i&63)) | (word>>(64-(i&63))) ); } -static inline uint64_t CC_ROR64(uint64_t word, int i) +CC_INLINE uint64_t CC_ROR64(uint64_t word, int i) { return ( (word>>(i&63)) | (word<<(64-(i&63))) ); } @@ -333,7 +332,7 @@ static inline uint64_t CC_ROR64(uint64_t word, int i) // MARK: - Byte Swaps -static inline uint32_t CC_BSWAP(uint32_t x) +CC_INLINE uint32_t CC_BSWAP(uint32_t x) { return ( ((x>>24)&0x000000FF) | @@ -379,33 +378,30 @@ static inline uint32_t CC_BSWAP(uint32_t x) Run in constant time (log2()) Useful to run constant time 
checks */ -#define HEAVISIDE_STEP_UINT64(x) {uint64_t _t; \ - _t=(((uint64_t)x>>32) | x); \ +#define HEAVISIDE_STEP_UINT64(r,s) {uint64_t _t=s; \ + _t=(((_t)>>32) | (_t)); \ _t=(0xFFFFFFFF + (_t & 0xFFFFFFFF)); \ - x=_t >> 32;} + r=_t >> 32;} -#define HEAVISIDE_STEP_UINT32(x) {uint32_t _t; \ - _t=(((uint32_t)x>>16) | x); \ +#define HEAVISIDE_STEP_UINT32(r,s) {uint32_t _t=s; \ + _t=(((_t)>>16) | (_t)); \ _t=(0xFFFF + (_t & 0xFFFF)); \ - x=_t >> 16;} - -#define HEAVISIDE_STEP_UINT16(x) {uint16_t _t; \ - _t=(((uint16_t)x>>8) | x); \ - _t=(0xFF + (_t & 0xFF)); \ - x=_t >> 8;} - -#define HEAVISIDE_STEP_UINT8(x) {uint8_t _t; \ - _t=(((uint8_t)x>>4) | (uint8_t)x); \ - _t=((_t>>2) | _t); \ - _t=((_t>>1) | _t); \ - x=_t & 0x1;} - -#define CC_HEAVISIDE_STEP(x) { \ - if (sizeof(x) == 1) {HEAVISIDE_STEP_UINT8(x);} \ - else if (sizeof(x) == 2) {HEAVISIDE_STEP_UINT16(x);} \ - else if (sizeof(x) == 4) {HEAVISIDE_STEP_UINT32(x);} \ - else if (sizeof(x) == 8) {HEAVISIDE_STEP_UINT64(x);} \ - else {x=((x==0)?0:1);} \ + r=_t >> 16;} + +#define HEAVISIDE_STEP_UINT16(r,s) {uint32_t _t=s; \ + _t=(0xFFFF + ((_t) & 0xFFFF)); \ + r=_t >> 16;} + +#define HEAVISIDE_STEP_UINT8(r,s) {uint16_t _t=s; \ + _t=(0xFF + ((_t) & 0xFF)); \ + r=_t >> 8;} + +#define CC_HEAVISIDE_STEP(r,s) { \ + if (sizeof(s) == 1) {HEAVISIDE_STEP_UINT8(r,s);} \ + else if (sizeof(s) == 2) {HEAVISIDE_STEP_UINT16(r,s);} \ + else if (sizeof(s) == 4) {HEAVISIDE_STEP_UINT32(r,s);} \ + else if (sizeof(s) == 8) {HEAVISIDE_STEP_UINT64(r,s);} \ + else {r=(((s)==0)?0:1);} \ } /* Return 1 if x mod 4 =1,2,3, 0 otherwise */ @@ -414,8 +410,46 @@ static inline uint32_t CC_BSWAP(uint32_t x) /* Set a variable to the biggest power of 2 which can be represented */ #define MAX_POWER_OF_2(x) ((__typeof__(x))1<<(8*sizeof(x)-1)) - #define cc_ceiling(a,b) (((a)+((b)-1))/(b)) #define CC_BITLEN_TO_BYTELEN(x) cc_ceiling((x), 8) +//cc_abort() is implemented to comply with FIPS 140-2. See radar 19129408 +void cc_abort(const char * msg , ...); + +/*! 
+ @brief cc_muxp(s, a, b) is equivalent to z = s ? a : b, but it executes in constant time + @param a input pointer + @param b input pointer + @param s The selection parameter s must be 0 or 1. if s is integer 1 a is returned. If s is integer 0, b is returned. Otherwise, the output is undefined. + @return Returns a, if s is 1 and b if s is 0 + */ +void *cc_muxp(int s, const void *a, const void *b); + +/*! + @brief cc_mux2p + @param a input pointer + @param b input pointer + @param r_true output pointer: if s is integer 1 r_true=a is returned, otherwise r_true=b + @param r_false output pointer: if s is integer 1 r_false=b is returned, otherwise r_false=a + @param s The selection parameter s must be 0 or 1. + @discussion Executes in constant time + */ +void cc_mux2p(int s, void **r_true, void **r_false, const void *a, const void *b); + +/*! + @brief CC_MUXU(s, a, b) is equivalent to z = s ? a : b, but it executes in constant time + @param a input unsigned type + @param b input unsigned type + @param s The selection parameter s must be 0 or 1. if s is integer 1 a is returned. If s is integer 0, b is returned. Otherwise, the output is undefined. 
+ @param r output + @return r = a, if s is 1 and b if s is 0 + */ +#define CC_MUXU(r, s, a, b) \ +{ \ + __typeof__(r) _cond = ((__typeof__(r))(s)-(__typeof__(r))1); \ + r = (~_cond&(a))|(_cond&(b)); \ +} + +int cc_is_compiled_with_tu(void); + #endif /* _CORECRYPTO_CC_PRIV_H_ */ diff --git a/EXTERNAL_HEADERS/corecrypto/ccaes.h b/EXTERNAL_HEADERS/corecrypto/ccaes.h index 85adca2fe..630cdd282 100644 --- a/EXTERNAL_HEADERS/corecrypto/ccaes.h +++ b/EXTERNAL_HEADERS/corecrypto/ccaes.h @@ -25,7 +25,7 @@ extern const struct ccmode_ecb ccaes_ltc_ecb_encrypt_mode; extern const struct ccmode_cbc ccaes_gladman_cbc_encrypt_mode; extern const struct ccmode_cbc ccaes_gladman_cbc_decrypt_mode; -#if !defined(__NO_ASM__) && CCAES_ARM +#if CCAES_ARM_ASM extern const struct ccmode_ecb ccaes_arm_ecb_encrypt_mode; extern const struct ccmode_ecb ccaes_arm_ecb_decrypt_mode; @@ -50,7 +50,7 @@ extern const struct ccmode_cbc *ccaes_ios_mux_cbc_encrypt_mode(void); extern const struct ccmode_cbc *ccaes_ios_mux_cbc_decrypt_mode(void); #endif -#if !defined(__NO_ASM__) && CCAES_INTEL +#if CCAES_INTEL_ASM //extern const struct ccmode_ecb ccaes_intel_ecb_encrypt_mode; //extern const struct ccmode_ecb ccaes_intel_ecb_decrypt_mode; @@ -100,4 +100,7 @@ const struct ccmode_ccm *ccaes_ccm_decrypt_mode(void); const struct ccmode_ctr *ccaes_ctr_crypt_mode(void); const struct ccmode_ofb *ccaes_ofb_crypt_mode(void); +const struct ccmode_siv *ccaes_siv_encrypt_mode(void); +const struct ccmode_siv *ccaes_siv_decrypt_mode(void); + #endif /* _CORECRYPTO_CCAES_H_ */ diff --git a/EXTERNAL_HEADERS/corecrypto/ccasn1.h b/EXTERNAL_HEADERS/corecrypto/ccasn1.h index 7fe1cc66c..7eb1182e6 100644 --- a/EXTERNAL_HEADERS/corecrypto/ccasn1.h +++ b/EXTERNAL_HEADERS/corecrypto/ccasn1.h @@ -69,9 +69,15 @@ enum { CCASN1_CONSTRUCTED_SEQUENCE = CCASN1_SEQUENCE | CCASN1_CONSTRUCTED, }; +#if CORECRYPTO_USE_TRANSPARENT_UNION typedef union { - const unsigned char *oid; -} ccoid_t __attribute__((transparent_union)); + const unsigned 
char * oid; +} __attribute__((transparent_union)) ccoid_t; +#define CCOID(x) ((x).oid) +#else + typedef const unsigned char * ccoid_t; +#define CCOID(oid) (oid) +#endif /* Returns *der iff *der points to a DER encoded oid that fits within *der_len. */ ccoid_t ccoid_for_der(size_t *der_len, const uint8_t **der); @@ -79,14 +85,13 @@ ccoid_t ccoid_for_der(size_t *der_len, const uint8_t **der); /* Returns the size of an oid including it's tag and length. */ CC_INLINE CC_PURE CC_NONNULL_TU((1)) size_t ccoid_size(ccoid_t oid) { - return 2 + oid.oid[1]; + return 2 + CCOID(oid)[1]; } -CC_INLINE CC_PURE CC_NONNULL_TU((1)) CC_NONNULL_TU((2)) +CC_INLINE CC_PURE CC_NONNULL((1)) CC_NONNULL((2)) bool ccoid_equal(ccoid_t oid1, ccoid_t oid2) { - return(ccoid_size(oid1) == ccoid_size(oid2) && memcmp(oid1.oid, oid2.oid, ccoid_size(oid1))== 0); + return (ccoid_size(oid1) == ccoid_size(oid2) + && memcmp(CCOID(oid1), CCOID(oid2), ccoid_size(oid1))== 0); } -extern const unsigned char *ccsha1_oid; - #endif /* _CORECRYPTO_CCASN1_H_ */ diff --git a/EXTERNAL_HEADERS/corecrypto/cccmac.h b/EXTERNAL_HEADERS/corecrypto/cccmac.h new file mode 100644 index 000000000..f4262d5bb --- /dev/null +++ b/EXTERNAL_HEADERS/corecrypto/cccmac.h @@ -0,0 +1,92 @@ +/* + * cccmac.h + * corecrypto + * + * Created on 11/07/2013 + * + * Copyright (c) 2013,2014,2015 Apple Inc. All rights reserved. 
+ * + */ + +#ifndef _CORECRYPTO_cccmac_H_ +#define _CORECRYPTO_cccmac_H_ + +#include +#include +#include + +#define CMAC_BLOCKSIZE 16 + +#if CORECRYPTO_USE_TRANSPARENT_UNION +struct cccmac_ctx { + uint8_t b[8]; +} CC_ALIGNED(8); + +typedef struct cccmac_ctx_hdr { + uint8_t k1[16]; + uint8_t k2[16]; + uint8_t ctx[8]; +} CC_ALIGNED(8) cccmac_ctx_hdr; + + +typedef union { + struct cccmac_ctx *b; + cccmac_ctx_hdr *hdr; +} cccmac_ctx_t __attribute__((transparent_union)); +#define cccmac_hdr_size sizeof(struct cccmac_ctx_hdr) + +#else + +struct cccmac_ctx { + uint8_t k1[16]; + uint8_t k2[16]; + uint8_t ctx[8]; +} CC_ALIGNED(8);// cccmac_ctx_hdr; + +typedef struct cccmac_ctx* cccmac_ctx_t; + +#define cccmac_hdr_size sizeof(struct cccmac_ctx) + +#endif + + +#define cccmac_iv_size(_mode_) ((_mode_)->block_size) +#define cccmac_cbc_size(_mode_) ((_mode_)->size) + +#define cccmac_ctx_size(_mode_) (cccmac_hdr_size + cccmac_iv_size(_mode_) + cccmac_cbc_size(_mode_)) +#define cccmac_ctx_n(_mode_) ccn_nof_size(cccmac_ctx_size(_mode_)) + +#define cccmac_mode_decl(_mode_, _name_) cc_ctx_decl(struct cccmac_ctx, cccmac_ctx_size(_mode_), _name_) +#define cccmac_mode_clear(_mode_, _name_) cc_clear(cccmac_ctx_size(_mode_), _name_) + +#if CORECRYPTO_USE_TRANSPARENT_UNION +/* Return a cccbc_ctx * which can be accesed with the macros in ccmode.h */ +#define cccmac_mode_ctx_start(_mode_, HC) (((HC).hdr)->ctx) +#define CCCMAC_HDR(HC) (((cccmac_ctx_t)(HC)).hdr) +#else +/* Return a cccbc_ctx * which can be accesed with the macros in ccmode.h */ +#define cccmac_mode_ctx_start(_mode_, HC) (HC->ctx) +#define CCCMAC_HDR(HC) (HC) +#endif + +#define cccmac_mode_sym_ctx(_mode_, HC) (cccbc_ctx *)(cccmac_mode_ctx_start(_mode_, HC)) +#define cccmac_mode_iv(_mode_, HC) (cccbc_iv *)(cccmac_mode_ctx_start(_mode_, HC)+cccmac_cbc_size(_mode_)) +#define cccmac_k1(HC) (CCCMAC_HDR(HC)->k1) +#define cccmac_k2(HC) (CCCMAC_HDR(HC)->k2) + +void cccmac_init(const struct ccmode_cbc *cbc, cccmac_ctx_t ctx, const void 
*key); + + +void cccmac_block_update(const struct ccmode_cbc *cbc, cccmac_ctx_t cmac, + size_t nblocks, const void *data); + + +void cccmac_final(const struct ccmode_cbc *cbc, cccmac_ctx_t ctx, + size_t nbytes, const void *in, void *out); + +void cccmac(const struct ccmode_cbc *cbc, const void *key, + size_t data_len, const void *data, + void *mac); + + +#endif /* _CORECRYPTO_cccmac_H_ */ diff --git a/EXTERNAL_HEADERS/corecrypto/ccder.h b/EXTERNAL_HEADERS/corecrypto/ccder.h index 12e940cc0..f29140edf 100644 --- a/EXTERNAL_HEADERS/corecrypto/ccder.h +++ b/EXTERNAL_HEADERS/corecrypto/ccder.h @@ -17,7 +17,12 @@ #define CCDER_MULTIBYTE_TAGS 1 #ifdef CCDER_MULTIBYTE_TAGS -typedef unsigned long ccder_tag; + #if defined(_MSC_VER) + //TODO related to rdar://problem/24868013 + typedef int ccder_tag; //MSVC forces enums to be ints + #else + typedef unsigned long ccder_tag; + #endif #else typedef uint8_t ccder_tag; #endif @@ -89,70 +94,67 @@ enum { CCDER_CONSTRUCTED_SEQUENCE = CCDER_SEQUENCE | CCDER_CONSTRUCTED, }; - -#define CC_NO_INLINE // MARK: ccder_sizeof_ functions /* Returns the size of an asn1 encoded item of length l in bytes. 
*/ -CC_NO_INLINE CC_CONST +CC_CONST size_t ccder_sizeof(ccder_tag tag, size_t len); -CC_NO_INLINE CC_PURE +CC_PURE size_t ccder_sizeof_implicit_integer(ccder_tag implicit_tag, cc_size n, const cc_unit *s); -CC_NO_INLINE CC_PURE +CC_PURE size_t ccder_sizeof_implicit_octet_string(ccder_tag implicit_tag, cc_size n, const cc_unit *s); -CC_NO_INLINE CC_CONST +CC_CONST size_t ccder_sizeof_implicit_raw_octet_string(ccder_tag implicit_tag, size_t s_size); -CC_NO_INLINE CC_CONST +CC_CONST size_t ccder_sizeof_implicit_uint64(ccder_tag implicit_tag, uint64_t value); -CC_NO_INLINE CC_PURE +CC_PURE size_t ccder_sizeof_integer(cc_size n, const cc_unit *s); -CC_NO_INLINE CC_CONST +CC_CONST size_t ccder_sizeof_len(size_t len); -CC_NO_INLINE CC_PURE +CC_PURE size_t ccder_sizeof_octet_string(cc_size n, const cc_unit *s); -CC_NO_INLINE CC_PURE +CC_PURE size_t ccder_sizeof_oid(ccoid_t oid); -CC_NO_INLINE CC_CONST +CC_CONST size_t ccder_sizeof_raw_octet_string(size_t s_size); -CC_NO_INLINE CC_CONST +CC_CONST size_t ccder_sizeof_tag(ccder_tag tag); -CC_NO_INLINE CC_CONST +CC_CONST size_t ccder_sizeof_uint64(uint64_t value); - // MARK: ccder_encode_ functions. /* Encode a tag backwards, der_end should point to one byte past the end of destination for the tag, returns a pointer to the first byte of the tag. Returns NULL if there is an encoding error. */ -CC_NO_INLINE CC_NONNULL2 +CC_NONNULL2 uint8_t *ccder_encode_tag(ccder_tag tag, const uint8_t *der, uint8_t *der_end); /* Returns a pointer to the start of the len field. returns NULL if there is an encoding error. */ -CC_NO_INLINE CC_NONNULL2 +CC_NONNULL2 uint8_t * ccder_encode_len(size_t len, const uint8_t *der, uint8_t *der_end); /* der_end should point to the first byte of the content of this der item. 
*/ -CC_NO_INLINE CC_NONNULL3 +CC_NONNULL3 uint8_t * ccder_encode_tl(ccder_tag tag, size_t len, const uint8_t *der, uint8_t *der_end); -CC_NO_INLINE CC_PURE CC_NONNULL2 +CC_PURE CC_NONNULL2 uint8_t * ccder_encode_body_nocopy(size_t size, const uint8_t *der, uint8_t *der_end); @@ -160,59 +162,67 @@ ccder_encode_body_nocopy(size_t size, const uint8_t *der, uint8_t *der_end); bound, der_end is one byte paste where we want to write the length and body_end is one byte past the end of the body of the der object we are encoding the tag and length of. */ -CC_NO_INLINE CC_NONNULL((2, 3)) +CC_NONNULL((2, 3)) uint8_t * ccder_encode_constructed_tl(ccder_tag tag, const uint8_t *body_end, const uint8_t *der, uint8_t *der_end); /* Encodes oid into der and returns der + ccder_sizeof_oid(oid). */ -CC_NO_INLINE CC_NONNULL_TU((1)) CC_NONNULL2 +CC_NONNULL_TU((1)) CC_NONNULL2 uint8_t *ccder_encode_oid(ccoid_t oid, const uint8_t *der, uint8_t *der_end); -CC_NO_INLINE CC_NONNULL((3, 4)) +CC_NONNULL((3, 4)) uint8_t *ccder_encode_implicit_integer(ccder_tag implicit_tag, cc_size n, const cc_unit *s, const uint8_t *der, uint8_t *der_end); -CC_NO_INLINE CC_NONNULL((2, 3)) +CC_NONNULL((2, 3)) uint8_t *ccder_encode_integer(cc_size n, const cc_unit *s, const uint8_t *der, uint8_t *der_end); -CC_NO_INLINE CC_NONNULL3 +CC_NONNULL3 uint8_t *ccder_encode_implicit_uint64(ccder_tag implicit_tag, uint64_t value, const uint8_t *der, uint8_t *der_end); -CC_NO_INLINE CC_NONNULL2 +CC_NONNULL2 uint8_t *ccder_encode_uint64(uint64_t value, const uint8_t *der, uint8_t *der_end); -CC_NO_INLINE CC_NONNULL((3, 4)) +CC_NONNULL((3, 4)) uint8_t *ccder_encode_implicit_octet_string(ccder_tag implicit_tag, cc_size n, const cc_unit *s, const uint8_t *der, uint8_t *der_end); -CC_NO_INLINE CC_NONNULL((2, 3)) +CC_NONNULL((2, 3)) uint8_t *ccder_encode_octet_string(cc_size n, const cc_unit *s, const uint8_t *der, uint8_t *der_end); -CC_NO_INLINE CC_NONNULL((3, 4)) +CC_NONNULL((3, 4)) uint8_t 
*ccder_encode_implicit_raw_octet_string(ccder_tag implicit_tag, size_t s_size, const uint8_t *s, const uint8_t *der, uint8_t *der_end); -CC_NO_INLINE CC_NONNULL((2, 3)) +CC_NONNULL((2, 3)) uint8_t *ccder_encode_raw_octet_string(size_t s_size, const uint8_t *s, const uint8_t *der, uint8_t *der_end); +size_t ccder_encode_eckey_size(size_t priv_size, ccoid_t oid, size_t pub_size); + +CC_NONNULL2 CC_NONNULL5 CC_NONNULL6 CC_NONNULL7 +uint8_t *ccder_encode_eckey(size_t priv_size, const uint8_t *priv_key, + ccoid_t oid, + size_t pub_size, const uint8_t *pub_key, + uint8_t *der, uint8_t *der_end); + /* ccder_encode_body COPIES the body into the der. It's inefficient – especially when you already have to convert to get to the form for the body. see encode integer for the right way to unify conversion and insertion */ -CC_NO_INLINE CC_NONNULL3 +CC_NONNULL3 uint8_t * ccder_encode_body(size_t size, const uint8_t* body, const uint8_t *der, uint8_t *der_end); @@ -221,86 +231,93 @@ ccder_encode_body(size_t size, const uint8_t* body, /* Returns a pointer to the start of the length field, and returns the decoded tag in tag. returns NULL if there is a decoding error. */ -CC_NO_INLINE CC_NONNULL((1, 3)) +CC_NONNULL((1, 3)) const uint8_t *ccder_decode_tag(ccder_tag *tagp, const uint8_t *der, const uint8_t *der_end); -CC_NO_INLINE CC_NONNULL((1, 3)) +CC_NONNULL((1, 3)) const uint8_t *ccder_decode_len(size_t *lenp, const uint8_t *der, const uint8_t *der_end); /* Returns a pointer to the start of the der object, and returns the length in len. returns NULL if there is a decoding error. 
*/ -CC_NO_INLINE CC_NONNULL((2, 4)) +CC_NONNULL((2, 4)) const uint8_t *ccder_decode_tl(ccder_tag expected_tag, size_t *lenp, const uint8_t *der, const uint8_t *der_end); -CC_NO_INLINE CC_NONNULL((2, 4)) +CC_NONNULL((2, 4)) const uint8_t * ccder_decode_constructed_tl(ccder_tag expected_tag, const uint8_t **body_end, const uint8_t *der, const uint8_t *der_end); -CC_NO_INLINE CC_NONNULL((1, 3)) +CC_NONNULL((1, 3)) const uint8_t * ccder_decode_sequence_tl(const uint8_t **body_end, const uint8_t *der, const uint8_t *der_end); -CC_NO_INLINE CC_NONNULL((2, 4)) +/*! + @function ccder_decode_uint_n + @abstract length in cc_unit of a der unsigned integer after skipping the leading zeroes + + @param der Beginning of input DER buffer + @param der_end End of input DER buffer + @param n Output the number of cc_unit required to represent the number + + @result First byte after the parsed integer or + NULL if the integer is not valid (negative) or reach der_end when reading the integer + */ + +CC_NONNULL((3)) +const uint8_t *ccder_decode_uint_n(cc_size *n, + const uint8_t *der, const uint8_t *der_end); + +/*! + @function ccder_decode_uint + @abstract Represent in cc_unit a der unsigned integer after skipping the leading zeroes + + @param der Beginning of input DER buffer + @param der_end End of input DER buffer + @param n Number of cc_unit allocated for r + @param r Allocated array of cc_unit to copy the integer into. + + @result First byte after the parsed integer or + NULL if the integer is not valid (negative) + reach der_end when reading the integer + n cc_unit is not enough to represent the integer + */ +CC_NONNULL((4)) const uint8_t *ccder_decode_uint(cc_size n, cc_unit *r, const uint8_t *der, const uint8_t *der_end); -CC_NO_INLINE CC_NONNULL((3)) +CC_NONNULL((3)) const uint8_t *ccder_decode_uint64(uint64_t* r, const uint8_t *der, const uint8_t *der_end); /* Decode SEQUENCE { r, s -- (unsigned)integer } in der into r and s. 
Returns NULL on decode errors, returns pointer just past the end of the sequence of integers otherwise. */ -CC_NO_INLINE CC_NONNULL((2, 3, 5)) +CC_NONNULL((2, 3, 5)) const uint8_t *ccder_decode_seqii(cc_size n, cc_unit *r, cc_unit *s, const uint8_t *der, const uint8_t *der_end); -CC_NO_INLINE CC_NONNULL_TU((1)) CC_NONNULL((3)) +CC_NONNULL_TU((1)) CC_NONNULL((3)) const uint8_t *ccder_decode_oid(ccoid_t *oidp, const uint8_t *der, const uint8_t *der_end); -CC_NO_INLINE CC_NONNULL((1,2,4)) +CC_NONNULL((1,2,4)) const uint8_t *ccder_decode_bitstring(const uint8_t **bit_string, size_t *bit_length, const uint8_t *der, const uint8_t *der_end); -CC_NO_INLINE CC_NONNULL_TU((4)) CC_NONNULL((1,2,3,5,6,8)) +CC_NONNULL_TU((4)) CC_NONNULL((1,2,3,5,6,8)) const uint8_t *ccder_decode_eckey(uint64_t *version, size_t *priv_size, const uint8_t **priv_key, ccoid_t *oid, size_t *pub_size, const uint8_t **pub_key, const uint8_t *der, const uint8_t *der_end); -#ifndef CCDER_MULTIBYTE_TAGS -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#endif +#define CC_EC_OID_SECP192R1 {((unsigned char *)"\x06\x08\x2a\x86\x48\xce\x3d\x03\x01\x01")} +#define CC_EC_OID_SECP256R1 {((unsigned char *)"\x06\x08\x2a\x86\x48\xce\x3d\x03\x01\x07")} +#define CC_EC_OID_SECP224R1 {((unsigned char *)"\x06\x05\x2B\x81\x04\x00\x21")} +#define CC_EC_OID_SECP384R1 {((unsigned char *)"\x06\x05\x2B\x81\x04\x00\x22")} +#define CC_EC_OID_SECP521R1 {((unsigned char *)"\x06\x05\x2B\x81\x04\x00\x23")} + #endif /* _CORECRYPTO_CCDER_H_ */ diff --git a/EXTERNAL_HEADERS/corecrypto/ccdes.h b/EXTERNAL_HEADERS/corecrypto/ccdes.h index 6ca3c2a1e..b4925bd14 100644 --- a/EXTERNAL_HEADERS/corecrypto/ccdes.h +++ b/EXTERNAL_HEADERS/corecrypto/ccdes.h @@ -57,12 +57,12 @@ const struct ccmode_ctr *ccdes3_ctr_crypt_mode(void); 
const struct ccmode_ofb *ccdes3_ofb_crypt_mode(void); -int ccdes_key_is_weak( void *key, unsigned long length); -void ccdes_key_set_odd_parity(void *key, unsigned long length); +int ccdes_key_is_weak( void *key, size_t length); +void ccdes_key_set_odd_parity(void *key, size_t length); uint32_t -ccdes_cbc_cksum(void *in, void *out, unsigned long length, - void *key, unsigned long keylen, void *ivec); +ccdes_cbc_cksum(void *in, void *out, size_t length, + void *key, size_t keylen, void *ivec); #endif /* _CORECRYPTO_CCDES_H_ */ diff --git a/EXTERNAL_HEADERS/corecrypto/ccdigest.h b/EXTERNAL_HEADERS/corecrypto/ccdigest.h index 0857678ff..a1b178a60 100644 --- a/EXTERNAL_HEADERS/corecrypto/ccdigest.h +++ b/EXTERNAL_HEADERS/corecrypto/ccdigest.h @@ -13,12 +13,10 @@ #include #include -#ifdef USE_SUPER_COOL_NEW_CCOID_T -#include -#endif /* USE_SUPER_COOL_NEW_CCOID_T */ - + /* To malloc a digest context for a given di, use malloc(ccdigest_di_size(di)) and assign the result to a pointer to a struct ccdigest_ctx. 
*/ +#if CORECRYPTO_USE_TRANSPARENT_UNION struct ccdigest_ctx { union { uint8_t u8; @@ -26,7 +24,7 @@ struct ccdigest_ctx { uint64_t u64; cc_unit ccn; } state; -} __attribute((aligned(8))); +} CC_ALIGNED(8); typedef union { struct ccdigest_ctx *hdr; @@ -39,26 +37,46 @@ struct ccdigest_state { uint64_t u64; cc_unit ccn; } state; -} __attribute((aligned(8))); +} CC_ALIGNED(8); typedef union { struct ccdigest_state *hdr; struct ccdigest_ctx *_ctx; ccdigest_ctx_t _ctxt; } ccdigest_state_t __attribute__((transparent_union)); +#else //======================================================= +struct ccdigest_ctx { + union { + uint8_t u8; + uint32_t u32; + uint64_t u64; + cc_unit ccn; + } state; +} CC_ALIGNED(8); + +typedef struct ccdigest_ctx *ccdigest_ctx_t ; + +struct ccdigest_state { + union { + uint8_t u8; + uint32_t u32; + uint64_t u64; + cc_unit ccn; + } state; +} CC_ALIGNED(8); + +typedef struct ccdigest_state *ccdigest_state_t; +#endif //======================================================= + struct ccdigest_info { - unsigned long output_size; - unsigned long state_size; - unsigned long block_size; - unsigned long oid_size; -#ifdef USE_SUPER_COOL_NEW_CCOID_T - ccoid_t oid; -#else - unsigned char *oid; -#endif + size_t output_size; + size_t state_size; + size_t block_size; + size_t oid_size; + const unsigned char *oid; const void *initial_state; - void(*compress)(ccdigest_state_t state, unsigned long nblocks, + void(*compress)(ccdigest_state_t state, size_t nblocks, const void *data); void(*final)(const struct ccdigest_info *di, ccdigest_ctx_t ctx, unsigned char *digest); @@ -81,22 +99,40 @@ struct ccdigest_info { #define ccdigest_di_clear(_di_, _name_) cc_clear(ccdigest_di_size(_di_), _name_) /* Digest context field accessors. Consider the implementation private. 
*/ - +#if CORECRYPTO_USE_TRANSPARENT_UNION #define ccdigest_state(_di_, _ctx_) ((struct ccdigest_state *)(&((ccdigest_ctx_t)(_ctx_)).hdr->state.u8 + sizeof(uint64_t))) +#else +#define ccdigest_state(_di_, _ctx_) ((struct ccdigest_state *)(&((ccdigest_ctx_t)(_ctx_))->state.u8 + sizeof(uint64_t))) +#endif + #define ccdigest_state_u8(_di_, _ctx_) ccdigest_u8(ccdigest_state((_di_), (_ctx_))) #define ccdigest_state_u32(_di_, _ctx_) ccdigest_u32(ccdigest_state((_di_), (_ctx_))) #define ccdigest_state_u64(_di_, _ctx_) ccdigest_u64(ccdigest_state((_di_), (_ctx_))) #define ccdigest_state_ccn(_di_, _ctx_) ccdigest_ccn(ccdigest_state((_di_), (_ctx_))) -#define ccdigest_nbits(_di_, _ctx_) (((uint64_t *)(&((ccdigest_ctx_t)(_ctx_)).hdr->state.u8))[0]) +#if CORECRYPTO_USE_TRANSPARENT_UNION +#define ccdigest_nbits(_di_, _ctx_) (((uint64_t *)(&((ccdigest_ctx_t)(_ctx_)).hdr->state.u8))[0]) #define ccdigest_data(_di_, _ctx_) (&((ccdigest_ctx_t)(_ctx_)).hdr->state.u8 + (_di_)->state_size + sizeof(uint64_t)) #define ccdigest_num(_di_, _ctx_) (((unsigned int *)(&((ccdigest_ctx_t)(_ctx_)).hdr->state.u8 + (_di_)->state_size + sizeof(uint64_t) + (_di_)->block_size))[0]) +#else +#define ccdigest_nbits(_di_, _ctx_) (((uint64_t *)(&((ccdigest_ctx_t)(_ctx_))->state.u8))[0]) +#define ccdigest_data(_di_, _ctx_) (&((ccdigest_ctx_t)(_ctx_))->state.u8 + (_di_)->state_size + sizeof(uint64_t)) +#define ccdigest_num(_di_, _ctx_) (((unsigned int *)(&((ccdigest_ctx_t)(_ctx_))->state.u8 + (_di_)->state_size + sizeof(uint64_t) + (_di_)->block_size))[0]) +#endif +#if CORECRYPTO_USE_TRANSPARENT_UNION /* Digest state field accessors. Consider the implementation private. 
*/ #define ccdigest_u8(_state_) (&((ccdigest_state_t)(_state_)).hdr->state.u8) #define ccdigest_u32(_state_) (&((ccdigest_state_t)(_state_)).hdr->state.u32) #define ccdigest_u64(_state_) (&((ccdigest_state_t)(_state_)).hdr->state.u64) #define ccdigest_ccn(_state_) (&((ccdigest_state_t)(_state_)).hdr->state.ccn) +#else +/* Digest state field accessors. Consider the implementation private. */ +#define ccdigest_u8(_state_) (&((ccdigest_state_t)(_state_))->state.u8) +#define ccdigest_u32(_state_) (&((ccdigest_state_t)(_state_))->state.u32) +#define ccdigest_u64(_state_) (&((ccdigest_state_t)(_state_))->state.u64) +#define ccdigest_ccn(_state_) (&((ccdigest_state_t)(_state_))->state.ccn) +#endif /* We could just use memcpy instead of this special macro, but this allows us to use the optimized ccn_set() assembly routine if we have one, which for @@ -109,7 +145,7 @@ struct ccdigest_info { void ccdigest_init(const struct ccdigest_info *di, ccdigest_ctx_t ctx); void ccdigest_update(const struct ccdigest_info *di, ccdigest_ctx_t ctx, - unsigned long len, const void *data); + size_t len, const void *data); CC_INLINE void ccdigest_final(const struct ccdigest_info *di, ccdigest_ctx_t ctx, unsigned char *digest) @@ -117,30 +153,27 @@ void ccdigest_final(const struct ccdigest_info *di, ccdigest_ctx_t ctx, unsigned di->final(di,ctx,digest); } -void ccdigest(const struct ccdigest_info *di, unsigned long len, +void ccdigest(const struct ccdigest_info *di, size_t len, const void *data, void *digest); /* test functions */ -int ccdigest_test(const struct ccdigest_info *di, unsigned long len, +int ccdigest_test(const struct ccdigest_info *di, size_t len, const void *data, const void *digest); -int ccdigest_test_chunk(const struct ccdigest_info *di, unsigned long len, - const void *data, const void *digest, unsigned long chunk); +int ccdigest_test_chunk(const struct ccdigest_info *di, size_t len, + const void *data, const void *digest, size_t chunk); struct ccdigest_vector { - unsigned 
long len; + size_t len; const void *message; const void *digest; }; int ccdigest_test_vector(const struct ccdigest_info *di, const struct ccdigest_vector *v); -int ccdigest_test_chunk_vector(const struct ccdigest_info *di, const struct ccdigest_vector *v, unsigned long chunk); +int ccdigest_test_chunk_vector(const struct ccdigest_info *di, const struct ccdigest_vector *v, size_t chunk); -#ifdef USE_SUPER_COOL_NEW_CCOID_T -#define OID_DEF(_VALUE_) {((const unsigned char *) _VALUE_)} -#else -#define OID_DEF(_VALUE_) _VALUE_ -#endif + +#define OID_DEF(_VALUE_) ((const unsigned char *)_VALUE_) #define CC_DIGEST_OID_MD2 OID_DEF("\x06\x08\x2A\x86\x48\x86\xF7\x0D\x02\x02") #define CC_DIGEST_OID_MD4 OID_DEF("\x06\x08\x2A\x86\x48\x86\xF7\x0D\x02\x04") @@ -155,17 +188,4 @@ int ccdigest_test_chunk_vector(const struct ccdigest_info *di, const struct ccdi #define CC_DIGEST_OID_RMD256 OID_DEF("\x06\x05\x2B\x24\x03\x02\x03") #define CC_DIGEST_OID_RMD320 OID_DEF(NULL) - -#ifdef USE_SUPER_COOL_NEW_CCOID_T -CC_INLINE CC_NONNULL_TU((1)) CC_NONNULL_TU((2)) -bool ccdigest_oid_equal(const struct ccdigest_info *di, ccoid_t oid) { - if(di->oid.oid == NULL && oid.oid == NULL) return true; - return ccoid_equal(di->oid, oid); -} - -typedef const struct ccdigest_info *(ccdigest_lookup)(ccoid_t oid); - -#include -const struct ccdigest_info *ccdigest_oid_lookup(ccoid_t oid, ...); -#endif /* USE_SUPER_COOL_NEW_CCOID_T*/ #endif /* _CORECRYPTO_CCDIGEST_H_ */ diff --git a/EXTERNAL_HEADERS/corecrypto/ccdigest_priv.h b/EXTERNAL_HEADERS/corecrypto/ccdigest_priv.h index fa8d85de6..e888a734d 100644 --- a/EXTERNAL_HEADERS/corecrypto/ccdigest_priv.h +++ b/EXTERNAL_HEADERS/corecrypto/ccdigest_priv.h @@ -12,6 +12,7 @@ #define _CORECRYPTO_CCDIGEST_PRIV_H_ #include +#include void ccdigest_final_common(const struct ccdigest_info *di, ccdigest_ctx_t ctx, void *digest); @@ -20,4 +21,16 @@ void ccdigest_final_64be(const struct ccdigest_info *di, ccdigest_ctx_t, void ccdigest_final_64le(const struct ccdigest_info 
*di, ccdigest_ctx_t, unsigned char *digest); +CC_INLINE CC_NONNULL_TU((1)) +bool ccdigest_oid_equal(const struct ccdigest_info *di, ccoid_t oid) { + if(di->oid == NULL && CCOID(oid) == NULL) return true; + if(di->oid == NULL || CCOID(oid) == NULL) return false; + return ccoid_equal(di->oid, oid); +} + +typedef const struct ccdigest_info *(ccdigest_lookup)(ccoid_t oid); + +#include +const struct ccdigest_info *ccdigest_oid_lookup(ccoid_t oid, ...); + #endif /* _CORECRYPTO_CCDIGEST_PRIV_H_ */ diff --git a/EXTERNAL_HEADERS/corecrypto/ccdrbg.h b/EXTERNAL_HEADERS/corecrypto/ccdrbg.h index fdf450e13..7ab4f491d 100644 --- a/EXTERNAL_HEADERS/corecrypto/ccdrbg.h +++ b/EXTERNAL_HEADERS/corecrypto/ccdrbg.h @@ -26,7 +26,11 @@ #define CCDRBG_STATUS_ERROR (-1) #define CCDRBG_STATUS_NEED_RESEED (-2) #define CCDRBG_STATUS_PARAM_ERROR (-3) - +// If this value is returned, the caller must abort or panic the process for security reasons. +// for example in the case of catastrophic error in +// http://csrc.nist.gov/publications/drafts/800-90/sp800_90a_r1_draft.pdf +// ccdrbg calls abort() or panic(), if they are available in the system. 
+#define CCDRBG_STATUS_ABORT (-4) /* * The maximum length of the entropy_input, additional_input (max_additional_input_length) , personalization string * (max_personalization_string_length) and max_number_of_bits_per_request are implementation dependent @@ -50,9 +54,9 @@ CC_INLINE int ccdrbg_init(const struct ccdrbg_info *info, struct ccdrbg_state *drbg, - unsigned long entropyLength, const void* entropy, - unsigned long nonceLength, const void* nonce, - unsigned long psLength, const void* ps) + size_t entropyLength, const void* entropy, + size_t nonceLength, const void* nonce, + size_t psLength, const void* ps) { return info->init(info, drbg, entropyLength, entropy, nonceLength, nonce, psLength, ps); } @@ -62,8 +66,8 @@ CC_INLINE int ccdrbg_init(const struct ccdrbg_info *info, */ CC_INLINE int ccdrbg_reseed(const struct ccdrbg_info *info, struct ccdrbg_state *drbg, - unsigned long entropyLength, const void *entropy, - unsigned long additionalLength, const void *additional) + size_t entropyLength, const void *entropy, + size_t additionalLength, const void *additional) { return info->reseed(drbg, entropyLength, entropy, additionalLength, additional); } @@ -71,8 +75,8 @@ CC_INLINE int ccdrbg_reseed(const struct ccdrbg_info *info, CC_INLINE int ccdrbg_generate(const struct ccdrbg_info *info, struct ccdrbg_state *drbg, - unsigned long dataOutLength, void *dataOut, - unsigned long additionalLength, const void *additional) + size_t dataOutLength, void *dataOut, + size_t additionalLength, const void *additional) { return info->generate(drbg, dataOutLength, dataOut, additionalLength, additional); } @@ -95,7 +99,7 @@ CC_INLINE size_t ccdrbg_context_size(const struct ccdrbg_info *drbg) */ struct ccdrbg_nistctr_custom { const struct ccmode_ecb *ecb; - unsigned long keylen; + size_t keylen; int strictFIPS; int use_df; }; @@ -104,7 +108,7 @@ void ccdrbg_factory_nistctr(struct ccdrbg_info *info, const struct ccdrbg_nistct /* * NIST SP 800-90 HMAC_DRBG - * the mximum security 
strengh of drbg is half of output size of the input hash function and it internally is limited to 256 bits + * the maximum security strengh of drbg is half of output size of the input hash function and it internally is limited to 256 bits */ extern struct ccdrbg_info ccdrbg_nistdigest_info; diff --git a/EXTERNAL_HEADERS/corecrypto/ccdrbg_impl.h b/EXTERNAL_HEADERS/corecrypto/ccdrbg_impl.h index 129f92e7c..499f58792 100644 --- a/EXTERNAL_HEADERS/corecrypto/ccdrbg_impl.h +++ b/EXTERNAL_HEADERS/corecrypto/ccdrbg_impl.h @@ -18,7 +18,7 @@ struct ccdrbg_info { /*! Size of the DRBG state in bytes **/ size_t size; - /** Instantiate the PRNG + /*! Instantiate the PRNG @param prng The PRNG state @param entropylen Length of entropy @param entropy Entropy bytes @@ -27,9 +27,9 @@ struct ccdrbg_info { @return 0 if successful */ int (*init)(const struct ccdrbg_info *info, struct ccdrbg_state *drbg, - unsigned long entropyLength, const void* entropy, - unsigned long nonceLength, const void* nonce, - unsigned long psLength, const void* ps); + size_t entropyLength, const void* entropy, + size_t nonceLength, const void* nonce, + size_t psLength, const void* ps); /*! Add entropy to the PRNG @param prng The PRNG state @@ -40,8 +40,8 @@ struct ccdrbg_info { @return 0 if successful */ int (*reseed)(struct ccdrbg_state *prng, - unsigned long entropylen, const void *entropy, - unsigned long inlen, const void *in); + size_t entropylen, const void *entropy, + size_t inlen, const void *in); /*! Read from the PRNG in a FIPS Testing compliant manor @param prng The PRNG state to read from @@ -52,8 +52,8 @@ struct ccdrbg_info { @return 0 if successfull */ int (*generate)(struct ccdrbg_state *prng, - unsigned long outlen, void *out, - unsigned long inlen, const void *in); + size_t outlen, void *out, + size_t inlen, const void *in); /*! 
Terminate a PRNG state @param prng The PRNG state to terminate diff --git a/EXTERNAL_HEADERS/corecrypto/cchmac.h b/EXTERNAL_HEADERS/corecrypto/cchmac.h index 6e8d5134c..c3427eaab 100644 --- a/EXTERNAL_HEADERS/corecrypto/cchmac.h +++ b/EXTERNAL_HEADERS/corecrypto/cchmac.h @@ -17,7 +17,7 @@ /* An hmac_ctx_t is normally allocated as an array of these. */ struct cchmac_ctx { uint8_t b[8]; -} __attribute__((aligned(8))); +} CC_ALIGNED(8); typedef union { struct cchmac_ctx *hdr; @@ -55,30 +55,30 @@ typedef union { #define cchmac_nbits(_di_, HC) ccdigest_nbits(_di_, ((cchmac_ctx_t)(HC)).digest) void cchmac_init(const struct ccdigest_info *di, cchmac_ctx_t ctx, - unsigned long key_len, const void *key); + size_t key_len, const void *key); void cchmac_update(const struct ccdigest_info *di, cchmac_ctx_t ctx, - unsigned long data_len, const void *data); + size_t data_len, const void *data); void cchmac_final(const struct ccdigest_info *di, cchmac_ctx_t ctx, unsigned char *mac); -void cchmac(const struct ccdigest_info *di, unsigned long key_len, - const void *key, unsigned long data_len, const void *data, +void cchmac(const struct ccdigest_info *di, size_t key_len, + const void *key, size_t data_len, const void *data, unsigned char *mac); /* Test functions */ struct cchmac_test_input { const struct ccdigest_info *di; - unsigned long key_len; + size_t key_len; const void *key; - unsigned long data_len; + size_t data_len; const void *data; - unsigned long mac_len; + size_t mac_len; const void *expected_mac; }; int cchmac_test(const struct cchmac_test_input *input); -int cchmac_test_chunks(const struct cchmac_test_input *input, unsigned long chunk_size); +int cchmac_test_chunks(const struct cchmac_test_input *input, size_t chunk_size); #endif /* _CORECRYPTO_CCHMAC_H_ */ diff --git a/EXTERNAL_HEADERS/corecrypto/ccmode.h b/EXTERNAL_HEADERS/corecrypto/ccmode.h index 4a8c78958..98057cce4 100644 --- a/EXTERNAL_HEADERS/corecrypto/ccmode.h +++ b/EXTERNAL_HEADERS/corecrypto/ccmode.h @@ 
-13,6 +13,7 @@ #include #include +#include /* ECB mode. */ @@ -26,7 +27,7 @@ CC_INLINE size_t ccecb_context_size(const struct ccmode_ecb *mode) return mode->size; } -CC_INLINE unsigned long ccecb_block_size(const struct ccmode_ecb *mode) +CC_INLINE size_t ccecb_block_size(const struct ccmode_ecb *mode) { return mode->block_size; } @@ -38,14 +39,14 @@ CC_INLINE void ccecb_init(const struct ccmode_ecb *mode, ccecb_ctx *ctx, } CC_INLINE void ccecb_update(const struct ccmode_ecb *mode, const ccecb_ctx *ctx, - unsigned long nblocks, const void *in, void *out) + size_t nblocks, const void *in, void *out) { mode->ecb(ctx, nblocks, in, out); } CC_INLINE void ccecb_one_shot(const struct ccmode_ecb *mode, size_t key_len, const void *key, - unsigned long nblocks, const void *in, void *out) + size_t nblocks, const void *in, void *out) { ccecb_ctx_decl(mode->size, ctx); mode->init(mode, ctx, key_len, key); @@ -84,7 +85,7 @@ CC_INLINE size_t cccbc_context_size(const struct ccmode_cbc *mode) return mode->size; } -CC_INLINE unsigned long cccbc_block_size(const struct ccmode_cbc *mode) +CC_INLINE size_t cccbc_block_size(const struct ccmode_cbc *mode) { return mode->block_size; } @@ -105,15 +106,15 @@ CC_INLINE void cccbc_set_iv(const struct ccmode_cbc *mode, cccbc_iv *iv_ctx, } CC_INLINE void cccbc_update(const struct ccmode_cbc *mode, cccbc_ctx *ctx, - cccbc_iv *iv, unsigned long nblocks, + cccbc_iv *iv, size_t nblocks, const void *in, void *out) { mode->cbc(ctx, iv, nblocks, in, out); } CC_INLINE void cccbc_one_shot(const struct ccmode_cbc *mode, - unsigned long key_len, const void *key, - const void *iv, unsigned long nblocks, + size_t key_len, const void *key, + const void *iv, size_t nblocks, const void *in, void *out) { cccbc_ctx_decl(mode->size, ctx); @@ -139,7 +140,7 @@ CC_INLINE size_t cccfb_context_size(const struct ccmode_cfb *mode) return mode->size; } -CC_INLINE unsigned long cccfb_block_size(const struct ccmode_cfb *mode) +CC_INLINE size_t cccfb_block_size(const 
struct ccmode_cfb *mode) { return mode->block_size; } @@ -179,7 +180,7 @@ CC_INLINE size_t cccfb8_context_size(const struct ccmode_cfb8 *mode) return mode->size; } -CC_INLINE unsigned long cccfb8_block_size(const struct ccmode_cfb8 *mode) +CC_INLINE size_t cccfb8_block_size(const struct ccmode_cfb8 *mode) { return mode->block_size; } @@ -222,7 +223,7 @@ CC_INLINE size_t ccctr_context_size(const struct ccmode_ctr *mode) return mode->size; } -CC_INLINE unsigned long ccctr_block_size(const struct ccmode_ctr *mode) +CC_INLINE size_t ccctr_block_size(const struct ccmode_ctr *mode) { return mode->block_size; } @@ -262,7 +263,7 @@ CC_INLINE size_t ccofb_context_size(const struct ccmode_ofb *mode) return mode->size; } -CC_INLINE unsigned long ccofb_block_size(const struct ccmode_ofb *mode) +CC_INLINE size_t ccofb_block_size(const struct ccmode_ofb *mode) { return mode->block_size; } @@ -321,7 +322,7 @@ CC_INLINE size_t ccxts_context_size(const struct ccmode_xts *mode) return mode->size; } -CC_INLINE unsigned long ccxts_block_size(const struct ccmode_xts *mode) +CC_INLINE size_t ccxts_block_size(const struct ccmode_xts *mode) { return mode->block_size; } @@ -340,7 +341,7 @@ CC_INLINE void ccxts_set_tweak(const struct ccmode_xts *mode, ccxts_ctx *ctx, } CC_INLINE void *ccxts_update(const struct ccmode_xts *mode, ccxts_ctx *ctx, - ccxts_tweak *tweak, unsigned long nblocks, const void *in, void *out) + ccxts_tweak *tweak, size_t nblocks, const void *in, void *out) { return mode->xts(ctx, tweak, nblocks, in, out); } @@ -348,7 +349,7 @@ CC_INLINE void *ccxts_update(const struct ccmode_xts *mode, ccxts_ctx *ctx, CC_INLINE void ccxts_one_shot(const struct ccmode_xts *mode, size_t key_len, const void *key, const void *tweak_key, const void *iv, - unsigned long nblocks, const void *in, void *out) + size_t nblocks, const void *in, void *out) { ccxts_ctx_decl(mode->size, ctx); ccxts_tweak_decl(mode->tweak_size, tweak); @@ -371,62 +372,70 @@ CC_INLINE size_t ccgcm_context_size(const 
struct ccmode_gcm *mode) return mode->size; } -CC_INLINE unsigned long ccgcm_block_size(const struct ccmode_gcm *mode) +CC_INLINE size_t ccgcm_block_size(const struct ccmode_gcm *mode) { return mode->block_size; } -CC_INLINE void ccgcm_init(const struct ccmode_gcm *mode, ccgcm_ctx *ctx, +CC_INLINE int ccgcm_init(const struct ccmode_gcm *mode, ccgcm_ctx *ctx, size_t key_len, const void *key) { - mode->init(mode, ctx, key_len, key); + return mode->init(mode, ctx, key_len, key); } -CC_INLINE void ccgcm_set_iv(const struct ccmode_gcm *mode, ccgcm_ctx *ctx, +CC_INLINE int ccgcm_set_iv(const struct ccmode_gcm *mode, ccgcm_ctx *ctx, size_t iv_size, const void *iv) { - mode->set_iv(ctx, iv_size, iv); + return mode->set_iv(ctx, iv_size, iv); +} + +// add Additional authenticated data (AAD) +CC_INLINE int ccgcm_aad(const struct ccmode_gcm *mode, ccgcm_ctx *ctx, + size_t nbytes, const void *additional_data) +{ + return mode->gmac(ctx, nbytes, additional_data); } -CC_INLINE void ccgcm_gmac(const struct ccmode_gcm *mode, ccgcm_ctx *ctx, +CC_INLINE int ccgcm_gmac(const struct ccmode_gcm *mode, ccgcm_ctx *ctx, size_t nbytes, const void *in) { - mode->gmac(ctx, nbytes, in); + return mode->gmac(ctx, nbytes, in); } -CC_INLINE void ccgcm_update(const struct ccmode_gcm *mode, ccgcm_ctx *ctx, +// encrypt or decrypt +CC_INLINE int ccgcm_update(const struct ccmode_gcm *mode, ccgcm_ctx *ctx, size_t nbytes, const void *in, void *out) { - mode->gcm(ctx, nbytes, in, out); + return mode->gcm(ctx, nbytes, in, out); } -CC_INLINE void ccgcm_finalize(const struct ccmode_gcm *mode, ccgcm_ctx *ctx, +CC_INLINE int ccgcm_finalize(const struct ccmode_gcm *mode, ccgcm_ctx *ctx, size_t tag_size, void *tag) { - mode->finalize(ctx, tag_size, tag); + return mode->finalize(ctx, tag_size, tag); } -CC_INLINE void ccgcm_reset(const struct ccmode_gcm *mode, ccgcm_ctx *ctx) +CC_INLINE int ccgcm_reset(const struct ccmode_gcm *mode, ccgcm_ctx *ctx) { - mode->reset(ctx); + return mode->reset(ctx); } -CC_INLINE void 
ccgcm_one_shot(const struct ccmode_gcm *mode, - size_t key_len, const void *key, - size_t iv_len, const void *iv, - size_t adata_len, const void *adata, - size_t nbytes, const void *in, void *out, - size_t tag_len, void *tag) -{ - ccgcm_ctx_decl(mode->size, ctx); - mode->init(mode, ctx, key_len, key); - mode->set_iv(ctx, iv_len, iv); - mode->gmac(ctx, adata_len, adata); - mode->gcm(ctx, nbytes, in, out); - mode->finalize(ctx, tag_len, tag); - ccgcm_ctx_clear(mode->size, ctx); -} +int ccgcm_one_shot(const struct ccmode_gcm *mode, + size_t key_len, const void *key, + size_t iv_len, const void *iv, + size_t adata_len, const void *adata, + size_t nbytes, const void *in, void *out, + size_t tag_len, void *tag); + +//do not call ccgcm_one_shot_legacy() in any new application +int ccgcm_one_shot_legacy(const struct ccmode_gcm *mode, + size_t key_len, const void *key, + size_t iv_len, const void *iv, + size_t adata_len, const void *adata, + size_t nbytes, const void *in, void *out, + size_t tag_len, void *tag); + /* CCM */ @@ -443,64 +452,67 @@ CC_INLINE size_t ccccm_context_size(const struct ccmode_ccm *mode) return mode->size; } -CC_INLINE unsigned long ccccm_block_size(const struct ccmode_ccm *mode) +CC_INLINE size_t ccccm_block_size(const struct ccmode_ccm *mode) { return mode->block_size; } -CC_INLINE void ccccm_init(const struct ccmode_ccm *mode, ccccm_ctx *ctx, +CC_INLINE int ccccm_init(const struct ccmode_ccm *mode, ccccm_ctx *ctx, size_t key_len, const void *key) { - mode->init(mode, ctx, key_len, key); + return mode->init(mode, ctx, key_len, key); } -CC_INLINE void ccccm_set_iv(const struct ccmode_ccm *mode, ccccm_ctx *ctx, ccccm_nonce *nonce_ctx, +CC_INLINE int ccccm_set_iv(const struct ccmode_ccm *mode, ccccm_ctx *ctx, ccccm_nonce *nonce_ctx, size_t nonce_len, const void *nonce, size_t mac_size, size_t auth_len, size_t data_len) { - mode->set_iv(ctx, nonce_ctx, nonce_len, nonce, mac_size, auth_len, data_len); + return mode->set_iv(ctx, nonce_ctx, nonce_len, 
nonce, mac_size, auth_len, data_len); } -CC_INLINE void ccccm_cbcmac(const struct ccmode_ccm *mode, ccccm_ctx *ctx, ccccm_nonce *nonce_ctx, +CC_INLINE int ccccm_cbcmac(const struct ccmode_ccm *mode, ccccm_ctx *ctx, ccccm_nonce *nonce_ctx, size_t nbytes, const void *in) { - mode->cbcmac(ctx, nonce_ctx, nbytes, in); + return mode->cbcmac(ctx, nonce_ctx, nbytes, in); } -CC_INLINE void ccccm_update(const struct ccmode_ccm *mode, ccccm_ctx *ctx, ccccm_nonce *nonce_ctx, +CC_INLINE int ccccm_update(const struct ccmode_ccm *mode, ccccm_ctx *ctx, ccccm_nonce *nonce_ctx, size_t nbytes, const void *in, void *out) { - mode->ccm(ctx, nonce_ctx, nbytes, in, out); + return mode->ccm(ctx, nonce_ctx, nbytes, in, out); } -CC_INLINE void ccccm_finalize(const struct ccmode_ccm *mode, ccccm_ctx *ctx, ccccm_nonce *nonce_ctx, +CC_INLINE int ccccm_finalize(const struct ccmode_ccm *mode, ccccm_ctx *ctx, ccccm_nonce *nonce_ctx, void *mac) { - mode->finalize(ctx, nonce_ctx, mac); + return mode->finalize(ctx, nonce_ctx, mac); } -CC_INLINE void ccccm_reset(const struct ccmode_ccm *mode, ccccm_ctx *ctx, ccccm_nonce *nonce_ctx) +CC_INLINE int ccccm_reset(const struct ccmode_ccm *mode, ccccm_ctx *ctx, ccccm_nonce *nonce_ctx) { - mode->reset(ctx, nonce_ctx); + return mode->reset(ctx, nonce_ctx); } -CC_INLINE void ccccm_one_shot(const struct ccmode_ccm *mode, - unsigned long key_len, const void *key, +CC_INLINE int ccccm_one_shot(const struct ccmode_ccm *mode, + size_t key_len, const void *key, unsigned nonce_len, const void *nonce, - unsigned long nbytes, const void *in, void *out, + size_t nbytes, const void *in, void *out, unsigned adata_len, const void* adata, unsigned mac_size, void *mac) { + int rc=0; ccccm_ctx_decl(mode->size, ctx); ccccm_nonce_decl(mode->nonce_size, nonce_ctx); - mode->init(mode, ctx, key_len, key); - mode->set_iv(ctx, nonce_ctx, nonce_len, nonce, mac_size, adata_len, nbytes); - mode->cbcmac(ctx, nonce_ctx, adata_len, adata); - mode->ccm(ctx, nonce_ctx, nbytes, in, out); - 
mode->finalize(ctx, nonce_ctx, mac); + rc = mode->init(mode, ctx, key_len, key); + if(rc==0) rc=mode->set_iv(ctx, nonce_ctx, nonce_len, nonce, mac_size, adata_len, nbytes); + if(rc==0) rc=mode->cbcmac(ctx, nonce_ctx, adata_len, adata); + if(rc==0) rc=mode->ccm(ctx, nonce_ctx, nbytes, in, out); + if(rc==0) rc=mode->finalize(ctx, nonce_ctx, mac); ccccm_ctx_clear(mode->size, ctx); ccccm_nonce_clear(mode->size, nonce_ctx); + + return rc; } @@ -517,7 +529,7 @@ CC_INLINE size_t ccomac_context_size(const struct ccmode_omac *mode) return mode->size; } -CC_INLINE unsigned long ccomac_block_size(const struct ccmode_omac *mode) +CC_INLINE size_t ccomac_block_size(const struct ccmode_omac *mode) { return mode->block_size; } @@ -525,18 +537,18 @@ CC_INLINE unsigned long ccomac_block_size(const struct ccmode_omac *mode) CC_INLINE void ccomac_init(const struct ccmode_omac *mode, ccomac_ctx *ctx, size_t tweak_len, size_t key_len, const void *key) { - return mode->init(mode, ctx, tweak_len, key_len, key); + mode->init(mode, ctx, tweak_len, key_len, key); } CC_INLINE int ccomac_update(const struct ccmode_omac *mode, ccomac_ctx *ctx, - unsigned long nblocks, const void *tweak, const void *in, void *out) + size_t nblocks, const void *tweak, const void *in, void *out) { return mode->omac(ctx, nblocks, tweak, in, out); } CC_INLINE int ccomac_one_shot(const struct ccmode_omac *mode, size_t tweak_len, size_t key_len, const void *key, - const void *tweak, unsigned long nblocks, const void *in, void *out) + const void *tweak, size_t nblocks, const void *in, void *out) { ccomac_ctx_decl(mode->size, ctx); mode->init(mode, ctx, tweak_len, key_len, key); diff --git a/EXTERNAL_HEADERS/corecrypto/ccmode_factory.h b/EXTERNAL_HEADERS/corecrypto/ccmode_factory.h index 3a29111ae..482c6ce92 100644 --- a/EXTERNAL_HEADERS/corecrypto/ccmode_factory.h +++ b/EXTERNAL_HEADERS/corecrypto/ccmode_factory.h @@ -14,24 +14,9 @@ #include /* TODO: Remove dependency on this header. 
*/ #include -#if !defined(__NO_ASM__) -#if (defined(__x86_64__) && CCAES_INTEL) || (CCAES_ARM && defined(__ARM_NEON__)) -#define CCMODE_GCM_VNG_SPEEDUP 1 -#define CCMODE_CCM_VNG_SPEEDUP 1 -#else -#define CCMODE_GCM_VNG_SPEEDUP 0 -#define CCMODE_CCM_VNG_SPEEDUP 0 -#endif - -#if ( (defined(__x86_64__) && CCAES_INTEL) \ - || (defined(__arm64__) && CCAES_ARM) \ - || defined(__ARM_NEON__)) // Supported even when not using the ARM AES - -#define CCMODE_CTR_VNG_SPEEDUP 1 -#else -#define CCMODE_CTR_VNG_SPEEDUP 0 -#endif -#endif /* !defined(__NO_ASM__) */ +/* Function and macros defined in this file are only to be used + within corecrypto files. + */ /* For CBC, direction of underlying ecb is the same as the cbc direction */ #define CCMODE_CBC_FACTORY(_cipher_, _dir_) \ @@ -132,9 +117,9 @@ const struct ccmode_cbc *cc3des_cbc_encrypt_mode(void) { void ccmode_cbc_init(const struct ccmode_cbc *cbc, cccbc_ctx *ctx, size_t rawkey_len, const void *rawkey); -void ccmode_cbc_decrypt(const cccbc_ctx *ctx, cccbc_iv *iv, unsigned long nblocks, +void ccmode_cbc_decrypt(const cccbc_ctx *ctx, cccbc_iv *iv, size_t nblocks, const void *in, void *out); -void ccmode_cbc_encrypt(const cccbc_ctx *ctx, cccbc_iv *iv, unsigned long nblocks, +void ccmode_cbc_encrypt(const cccbc_ctx *ctx, cccbc_iv *iv, size_t nblocks, const void *in, void *out); struct _ccmode_cbc_key { @@ -164,23 +149,15 @@ struct _ccmode_cbc_key { example if it's part of a larger structure). Normally you would pass a ecb decrypt mode implementation of some underlying algorithm as the ecb parameter. */ -CC_INLINE void ccmode_factory_cbc_decrypt(struct ccmode_cbc *cbc, - const struct ccmode_ecb *ecb) { - struct ccmode_cbc cbc_decrypt = CCMODE_FACTORY_CBC_DECRYPT(ecb); - *cbc = cbc_decrypt; -} + const struct ccmode_ecb *ecb); /* Use these function to runtime initialize a ccmode_cbc encrypt object (for example if it's part of a larger structure). 
Normally you would pass a ecb encrypt mode implementation of some underlying algorithm as the ecb parameter. */ -CC_INLINE void ccmode_factory_cbc_encrypt(struct ccmode_cbc *cbc, - const struct ccmode_ecb *ecb) { - struct ccmode_cbc cbc_encrypt = CCMODE_FACTORY_CBC_ENCRYPT(ecb); - *cbc = cbc_encrypt; -} + const struct ccmode_ecb *ecb); void ccmode_cfb_init(const struct ccmode_cfb *cfb, cccfb_ctx *ctx, @@ -218,23 +195,15 @@ struct _ccmode_cfb_key { example if it's part of a larger structure). Normally you would pass a ecb encrypt mode implementation of some underlying algorithm as the ecb parameter. */ -CC_INLINE void ccmode_factory_cfb_decrypt(struct ccmode_cfb *cfb, - const struct ccmode_ecb *ecb) { - struct ccmode_cfb cfb_decrypt = CCMODE_FACTORY_CFB_DECRYPT(ecb); - *cfb = cfb_decrypt; -} + const struct ccmode_ecb *ecb); /* Use these function to runtime initialize a ccmode_cfb encrypt object (for example if it's part of a larger structure). Normally you would pass a ecb encrypt mode implementation of some underlying algorithm as the ecb parameter. */ -CC_INLINE void ccmode_factory_cfb_encrypt(struct ccmode_cfb *cfb, - const struct ccmode_ecb *ecb) { - struct ccmode_cfb cfb_encrypt = CCMODE_FACTORY_CFB_ENCRYPT(ecb); - *cfb = cfb_encrypt; -} + const struct ccmode_ecb *ecb); void ccmode_cfb8_init(const struct ccmode_cfb8 *cfb8, cccfb8_ctx *ctx, size_t rawkey_len, const void *rawkey, const void *iv); @@ -270,23 +239,15 @@ struct _ccmode_cfb8_key { example if it's part of a larger structure). Normally you would pass a ecb decrypt mode implementation of some underlying algorithm as the ecb parameter. */ -CC_INLINE void ccmode_factory_cfb8_decrypt(struct ccmode_cfb8 *cfb8, - const struct ccmode_ecb *ecb) { - struct ccmode_cfb8 cfb8_decrypt = CCMODE_FACTORY_CFB8_DECRYPT(ecb); - *cfb8 = cfb8_decrypt; -} + const struct ccmode_ecb *ecb); /* Use these function to runtime initialize a ccmode_cfb8 encrypt object (for example if it's part of a larger structure). 
Normally you would pass a ecb encrypt mode implementation of some underlying algorithm as the ecb parameter. */ -CC_INLINE void ccmode_factory_cfb8_encrypt(struct ccmode_cfb8 *cfb8, - const struct ccmode_ecb *ecb) { - struct ccmode_cfb8 cfb8_encrypt = CCMODE_FACTORY_CFB8_ENCRYPT(ecb); - *cfb8 = cfb8_encrypt; -} + const struct ccmode_ecb *ecb); void ccmode_ctr_init(const struct ccmode_ctr *ctr, ccctr_ctx *ctx, size_t rawkey_len, const void *rawkey, const void *iv); @@ -308,62 +269,42 @@ struct _ccmode_ctr_key { .custom = (ECB_ENCRYPT) \ } -#if !defined(__NO_ASM__) -#if CCMODE_CTR_VNG_SPEEDUP -void ccmode_aes_ctr_crypt_vng(ccctr_ctx *ctx, size_t nbytes, - const void *in, void *out); - -/* Use this to statically initialize a ccmode_ctr object for decryption. */ -#define CCMODE_VNG_AES_CTR_CRYPT(ECB_ENCRYPT) { \ -.size = ccn_sizeof_size(sizeof(struct _ccmode_ctr_key)) + 2 * ccn_sizeof_size((ECB_ENCRYPT)->block_size) + ccn_sizeof_size((ECB_ENCRYPT)->size), \ -.block_size = 1, \ -.init = ccmode_ctr_init, \ -.ctr = ccmode_aes_ctr_crypt_vng, \ -.custom = (ECB_ENCRYPT) \ -} -#endif /* CCMODE_CTR_VNG_SPEEDUP */ -#endif /* defined(__NO_ASM__) */ - /* Use these function to runtime initialize a ccmode_ctr decrypt object (for example if it's part of a larger structure). Normally you would pass a ecb encrypt mode implementation of some underlying algorithm as the ecb parameter. */ -CC_INLINE void ccmode_factory_ctr_crypt(struct ccmode_ctr *ctr, - const struct ccmode_ecb *ecb) { - struct ccmode_ctr ctr_crypt = CCMODE_FACTORY_CTR_CRYPT(ecb); - *ctr = ctr_crypt; -} - -/* GCM FEATURES. */ -//#define CCMODE_GCM_TABLES 1 -#define CCMODE_GCM_FAST 1 - -#ifdef CCMODE_GCM_FAST -#define CCMODE_GCM_FAST_TYPE cc_unit -#endif - -#ifdef CCMODE_GCM_TABLES + const struct ccmode_ecb *ecb); -//#define CCMODE_GCM_TABLES_SSE2 1 - -extern const unsigned char gcm_shift_table[256*2]; -#endif /* Create a gcm key from a gcm mode object. 
key must point to at least sizeof(CCMODE_GCM_KEY(ecb)) bytes of free storage. */ -void ccmode_gcm_init(const struct ccmode_gcm *gcm, ccgcm_ctx *ctx, +int ccmode_gcm_init(const struct ccmode_gcm *gcm, ccgcm_ctx *ctx, size_t rawkey_len, const void *rawkey); -void ccmode_gcm_set_iv(ccgcm_ctx *ctx, size_t iv_size, const void *iv); -void ccmode_gcm_gmac(ccgcm_ctx *ctx, size_t nbytes, const void *in); -void ccmode_gcm_decrypt(ccgcm_ctx *ctx, size_t nbytes, const void *in, +int ccmode_gcm_set_iv(ccgcm_ctx *ctx, size_t iv_size, const void *iv); +int ccmode_gcm_aad(ccgcm_ctx *ctx, size_t nbytes, const void *in); +int ccmode_gcm_decrypt(ccgcm_ctx *ctx, size_t nbytes, const void *in, void *out); -void ccmode_gcm_encrypt(ccgcm_ctx *ctx, size_t nbytes, const void *in, +int ccmode_gcm_encrypt(ccgcm_ctx *ctx, size_t nbytes, const void *in, void *out); -void ccmode_gcm_finalize(ccgcm_ctx *key, size_t tag_size, void *tag); -void ccmode_gcm_reset(ccgcm_ctx *key); +/*! + @function ccmode_gcm_finalize() finalizes AES-GCM call sequence + @param key encryption or decryption key + @param tag_size + @param tag + @result 0=success or non zero= error + @discussion For decryption, the tag parameter must be the expected-tag. A secure compare is performed between the provided expected-tag and the computed-tag. If they are the same, 0 is returned. Otherwise, non zero is returned. For encryption, tag is output and provides the authentication tag. + + */ +int ccmode_gcm_finalize(ccgcm_ctx *key, size_t tag_size, void *tag); +int ccmode_gcm_reset(ccgcm_ctx *key); + + +// Here is what the structure looks like in memory +// [ temp space | length | *ecb | *ecb_key | table | ecb_key ] +// size of table depends on the implementation (VNG vs factory) struct _ccmode_gcm_key { // 5 blocks of temp space. 
unsigned char H[16]; /* multiplier */ @@ -372,110 +313,60 @@ struct _ccmode_gcm_key { unsigned char Y_0[16]; /* initial counter */ unsigned char buf[16]; /* buffer for stuff */ - const struct ccmode_ecb *ecb; + // State and length uint32_t ivmode; /* Which mode is the IV in? */ - uint32_t mode; /* mode the GCM code is in */ + uint32_t state; /* state the GCM code is in */ uint32_t buflen; /* length of data in buf */ uint64_t totlen; /* 64-bit counter used for IV and AAD */ - uint64_t pttotlen; /* 64-bit counter for the PT */ - -#ifdef CCMODE_GCM_TABLES - /* TODO: Make table based gcm a separate mode object. */ - unsigned char PC[16][256][16] /* 16 tables of 8x128 */ -#ifdef CCMODE_GCM_TABLES_SSE2 - __attribute__ ((aligned (16))) -#endif /* CCMODE_GCM_TABLES_SSE2 */ - ; -#endif /* CCMODE_GCM_TABLES */ - -#if !defined(__NO_ASM__) -#if CCMODE_GCM_VNG_SPEEDUP -#if !defined(__arm64__) && defined(__ARM_NEON__) - unsigned char Htable[8*2] __attribute__((aligned(16))); -#else - unsigned char Htable[16*8*2] __attribute__((aligned(16))); -#endif -#endif /* CCMODE_GCM_VNG_SPEEDUP */ -#endif /* !defined(__NO_ASM__) */ - cc_unit u[]; + uint64_t pttotlen; /* 64-bit counter for the plaintext PT */ + + // ECB + const struct ccmode_ecb *ecb; // ecb mode + // Pointer to the ECB key in the buffer + void *ecb_key; // address of the ecb_key in u, set in init function + int encdec; //is it an encrypt or decrypt object + // Buffer with ECB key and H table if applicable + unsigned char u[] __attribute__ ((aligned (16))); // ecb key + tables }; -/* Use this to statically initialize a ccmode_gcm object for decryption. 
*/ -#define CCMODE_FACTORY_GCM_DECRYPT(ECB_ENCRYPT) { \ -.size = ccn_sizeof_size(sizeof(struct _ccmode_gcm_key)) + 5 * ccn_sizeof_size((ECB_ENCRYPT)->block_size) + ccn_sizeof_size((ECB_ENCRYPT)->size), \ -.block_size = 1, \ -.init = ccmode_gcm_init, \ -.set_iv = ccmode_gcm_set_iv, \ -.gmac = ccmode_gcm_gmac, \ -.gcm = ccmode_gcm_decrypt, \ -.finalize = ccmode_gcm_finalize, \ -.reset = ccmode_gcm_reset, \ -.custom = (ECB_ENCRYPT) \ -} - -/* Use this to statically initialize a ccmode_gcm object for encryption. */ -#define CCMODE_FACTORY_GCM_ENCRYPT(ECB_ENCRYPT) { \ -.size = ccn_sizeof_size(sizeof(struct _ccmode_gcm_key)) + 5 * ccn_sizeof_size((ECB_ENCRYPT)->block_size) + ccn_sizeof_size((ECB_ENCRYPT)->size), \ -.block_size = 1, \ -.init = ccmode_gcm_init, \ -.set_iv = ccmode_gcm_set_iv, \ -.gmac = ccmode_gcm_gmac, \ -.gcm = ccmode_gcm_encrypt, \ -.finalize = ccmode_gcm_finalize, \ -.reset = ccmode_gcm_reset, \ -.custom = (ECB_ENCRYPT) \ -} +#define GCM_ECB_KEY_SIZE(ECB_ENCRYPT) \ + ((5 * ccn_sizeof_size((ECB_ENCRYPT)->block_size)) \ + + ccn_sizeof_size((ECB_ENCRYPT)->size)) /* Use these function to runtime initialize a ccmode_gcm decrypt object (for example if it's part of a larger structure). For GCM you always pass a ecb encrypt mode implementation of some underlying algorithm as the ecb parameter. */ -CC_INLINE void ccmode_factory_gcm_decrypt(struct ccmode_gcm *gcm, - const struct ccmode_ecb *ecb_encrypt) { - struct ccmode_gcm gcm_decrypt = CCMODE_FACTORY_GCM_DECRYPT(ecb_encrypt); - *gcm = gcm_decrypt; -} + const struct ccmode_ecb *ecb_encrypt); /* Use these function to runtime initialize a ccmode_gcm encrypt object (for example if it's part of a larger structure). For GCM you always pass a ecb encrypt mode implementation of some underlying algorithm as the ecb parameter. 
*/ -CC_INLINE void ccmode_factory_gcm_encrypt(struct ccmode_gcm *gcm, - const struct ccmode_ecb *ecb_encrypt) { - struct ccmode_gcm gcm_encrypt = CCMODE_FACTORY_GCM_ENCRYPT(ecb_encrypt); - *gcm = gcm_encrypt; -} + const struct ccmode_ecb *ecb_encrypt); /* CCM (only NIST approved with AES) */ -void ccmode_ccm_init(const struct ccmode_ccm *ccm, ccccm_ctx *ctx, +int ccmode_ccm_init(const struct ccmode_ccm *ccm, ccccm_ctx *ctx, size_t rawkey_len, const void *rawkey); -void ccmode_ccm_set_iv(ccccm_ctx *ctx, ccccm_nonce *nonce_ctx, size_t nonce_len, const void *nonce, +int ccmode_ccm_set_iv(ccccm_ctx *ctx, ccccm_nonce *nonce_ctx, size_t nonce_len, const void *nonce, size_t mac_size, size_t auth_len, size_t data_len); /* internal function */ void ccmode_ccm_macdata(ccccm_ctx *key, ccccm_nonce *nonce_ctx, unsigned new_block, size_t nbytes, const void *in); /* api function - disallows only mac'd data after data to encrypt was sent */ -void ccmode_ccm_cbcmac(ccccm_ctx *ctx, ccccm_nonce *nonce_ctx, size_t nbytes, const void *in); +int ccmode_ccm_cbcmac(ccccm_ctx *ctx, ccccm_nonce *nonce_ctx, size_t nbytes, const void *in); /* internal function */ void ccmode_ccm_crypt(ccccm_ctx *key, ccccm_nonce *nonce_ctx, size_t nbytes, const void *in, void *out); -void ccmode_ccm_decrypt(ccccm_ctx *ctx, ccccm_nonce *nonce_ctx, size_t nbytes, const void *in, - void *out); -void ccmode_ccm_encrypt(ccccm_ctx *ctx, ccccm_nonce *nonce_ctx, size_t nbytes, const void *in, +int ccmode_ccm_decrypt(ccccm_ctx *ctx, ccccm_nonce *nonce_ctx, size_t nbytes, const void *in, void *out); -#if !defined(__NO_ASM__) -#if CCMODE_CCM_VNG_SPEEDUP -void ccmode_ccm_decrypt_vector(ccccm_ctx *ctx, ccccm_nonce *nonce_ctx, size_t nbytes, const void *in, +int ccmode_ccm_encrypt(ccccm_ctx *ctx, ccccm_nonce *nonce_ctx, size_t nbytes, const void *in, void *out); -void ccmode_ccm_encrypt_vector(ccccm_ctx *ctx, ccccm_nonce *nonce_ctx, size_t nbytes, const void *in, - void *out); -#endif /* CCMODE_CCM_VNG_SPEEDUP */ -#endif 
/* !defined(__NO_ASM__) */ -void ccmode_ccm_finalize(ccccm_ctx *key, ccccm_nonce *nonce_ctx, void *mac); -void ccmode_ccm_reset(ccccm_ctx *key, ccccm_nonce *nonce_ctx); +int ccmode_ccm_finalize(ccccm_ctx *key, ccccm_nonce *nonce_ctx, void *mac); +int ccmode_ccm_reset(ccccm_ctx *key, ccccm_nonce *nonce_ctx); struct _ccmode_ccm_key { const struct ccmode_ecb *ecb; @@ -524,68 +415,20 @@ struct _ccmode_ccm_nonce { .custom = (ECB_ENCRYPT) \ } -#if !defined(__NO_ASM__) -/* for x86_64/arm64 speedup */ -#if CCMODE_CCM_VNG_SPEEDUP -/* Use this to statically initialize a ccmode_ccm object for decryption. */ -#define CCMODE_VNG_CCM_DECRYPT(ECB_ENCRYPT) { \ -.size = ccn_sizeof_size(sizeof(struct _ccmode_ccm_key)) + ccn_sizeof_size((ECB_ENCRYPT)->block_size) + ccn_sizeof_size((ECB_ENCRYPT)->size), \ -.nonce_size = ccn_sizeof_size(sizeof(struct _ccmode_ccm_nonce)), \ -.block_size = 1, \ -.init = ccmode_ccm_init, \ -.set_iv = ccmode_ccm_set_iv, \ -.cbcmac = ccmode_ccm_cbcmac, \ -.ccm = ccmode_ccm_decrypt_vector, \ -.finalize = ccmode_ccm_finalize, \ -.reset = ccmode_ccm_reset, \ -.custom = (ECB_ENCRYPT) \ -} - -/* Use this to statically initialize a ccmode_ccm object for encryption. */ -#define CCMODE_VNG_CCM_ENCRYPT(ECB_ENCRYPT) { \ -.size = ccn_sizeof_size(sizeof(struct _ccmode_ccm_key)) + ccn_sizeof_size((ECB_ENCRYPT)->block_size) + ccn_sizeof_size((ECB_ENCRYPT)->size), \ -.nonce_size = ccn_sizeof_size(sizeof(struct _ccmode_ccm_nonce)), \ -.block_size = 1, \ -.init = ccmode_ccm_init, \ -.set_iv = ccmode_ccm_set_iv, \ -.cbcmac = ccmode_ccm_cbcmac, \ -.ccm = ccmode_ccm_encrypt_vector, \ -.finalize = ccmode_ccm_finalize, \ -.reset = ccmode_ccm_reset, \ -.custom = (ECB_ENCRYPT) \ -} -#endif /* CCMODE_CCM_VNG_SPEEDUP */ -#endif /* !defined(__NO_ASM__) */ - /* Use these function to runtime initialize a ccmode_ccm decrypt object (for example if it's part of a larger structure). For CCM you always pass a ecb encrypt mode implementation of some underlying algorithm as the ecb parameter. 
*/ -CC_INLINE + void ccmode_factory_ccm_decrypt(struct ccmode_ccm *ccm, - const struct ccmode_ecb *ecb_encrypt) { -#if !defined(__NO_ASM__) && CCMODE_CCM_VNG_SPEEDUP - struct ccmode_ccm ccm_decrypt = CCMODE_VNG_CCM_DECRYPT(ecb_encrypt); -#else - struct ccmode_ccm ccm_decrypt = CCMODE_FACTORY_CCM_DECRYPT(ecb_encrypt); -#endif /* CCMODE_CCM_VNG_SPEEDUP */ - *ccm = ccm_decrypt; -} + const struct ccmode_ecb *ecb_encrypt); /* Use these function to runtime initialize a ccmode_ccm encrypt object (for example if it's part of a larger structure). For CCM you always pass a ecb encrypt mode implementation of some underlying algorithm as the ecb parameter. */ -CC_INLINE void ccmode_factory_ccm_encrypt(struct ccmode_ccm *ccm, - const struct ccmode_ecb *ecb_encrypt) { -#if !defined(__NO_ASM__) && CCMODE_CCM_VNG_SPEEDUP - struct ccmode_ccm ccm_encrypt = CCMODE_VNG_CCM_ENCRYPT(ecb_encrypt); -#else - struct ccmode_ccm ccm_encrypt = CCMODE_FACTORY_CCM_ENCRYPT(ecb_encrypt); -#endif /* CCMODE_CCM_VNG_SPEEDUP */ - *ccm = ccm_encrypt; -} + const struct ccmode_ecb *ecb_encrypt); void ccmode_ofb_init(const struct ccmode_ofb *ofb, ccofb_ctx *ctx, @@ -613,17 +456,12 @@ struct _ccmode_ofb_key { example if it's part of a larger structure). Normally you would pass a ecb encrypt mode implementation of some underlying algorithm as the ecb parameter. */ -CC_INLINE void ccmode_factory_ofb_crypt(struct ccmode_ofb *ofb, - const struct ccmode_ecb *ecb) { - struct ccmode_ofb ofb_crypt = CCMODE_FACTORY_OFB_CRYPT(ecb); - *ofb = ofb_crypt; -} - + const struct ccmode_ecb *ecb); -int ccmode_omac_decrypt(ccomac_ctx *ctx, unsigned long nblocks, +int ccmode_omac_decrypt(ccomac_ctx *ctx, size_t nblocks, const void *tweak, const void *in, void *out); -int ccmode_omac_encrypt(ccomac_ctx *ctx, unsigned long nblocks, +int ccmode_omac_encrypt(ccomac_ctx *ctx, size_t nblocks, const void *tweak, const void *in, void *out); /* Create a omac key from a omac mode object. 
The tweak_len here @@ -632,7 +470,7 @@ int ccmode_omac_encrypt(ccomac_ctx *ctx, unsigned long nblocks, key must point to at least sizeof(CCMODE_OMAC_KEY(ecb)) bytes of free storage. */ void ccmode_omac_init(const struct ccmode_omac *omac, ccomac_ctx *ctx, - cc_size tweak_len, size_t rawkey_len, + size_t tweak_len, size_t rawkey_len, const void *rawkey); struct _ccmode_omac_key { @@ -663,23 +501,15 @@ struct _ccmode_omac_key { example if it's part of a larger structure). Normally you would pass a ecb decrypt mode implementation of some underlying algorithm as the ecb parameter. */ -CC_INLINE void ccmode_factory_omac_decrypt(struct ccmode_omac *omac, - const struct ccmode_ecb *ecb) { - struct ccmode_omac omac_decrypt = CCMODE_FACTORY_OMAC_DECRYPT(ecb); - *omac = omac_decrypt; -} + const struct ccmode_ecb *ecb); /* Use these function to runtime initialize a ccmode_omac encrypt object (for example if it's part of a larger structure). Normally you would pass a ecb encrypt mode implementation of some underlying algorithm as the ecb parameter. */ -CC_INLINE void ccmode_factory_omac_encrypt(struct ccmode_omac *omac, - const struct ccmode_ecb *ecb) { - struct ccmode_omac omac_encrypt = CCMODE_FACTORY_OMAC_ENCRYPT(ecb); - *omac = omac_encrypt; -} + const struct ccmode_ecb *ecb); /* Function prototypes used by the macros below, do not call directly. */ @@ -687,7 +517,7 @@ void ccmode_xts_init(const struct ccmode_xts *xts, ccxts_ctx *ctx, size_t key_len, const void *data_key, const void *tweak_key); void *ccmode_xts_crypt(const ccxts_ctx *ctx, ccxts_tweak *tweak, - unsigned long nblocks, const void *in, void *out); + size_t nblocks, const void *in, void *out); void ccmode_xts_set_tweak(const ccxts_ctx *ctx, ccxts_tweak *tweak, const void *iv); @@ -704,15 +534,15 @@ struct _ccmode_xts_tweak { // the bytes_processed field in the context will accumuate the number of blocks processed and // will fail the encrypt/decrypt if the size is violated. 
This counter will be reset to 0 // when set_tweak is called. - unsigned long blocks_processed; + size_t blocks_processed; cc_unit u[]; }; /* Use this to statically initialize a ccmode_xts object for decryption. */ #define CCMODE_FACTORY_XTS_DECRYPT(ECB, ECB_ENCRYPT) { \ .size = ccn_sizeof_size(sizeof(struct _ccmode_xts_key)) + 2 * ccn_sizeof_size((ECB)->size), \ -.tweak_size = ccn_sizeof_size(sizeof(struct _ccmode_xts_tweak)) + ccn_sizeof_size(16), \ -.block_size = 16, \ +.tweak_size = ccn_sizeof_size(sizeof(struct _ccmode_xts_tweak)) + ccn_sizeof_size(ecb->block_size), \ +.block_size = ecb->block_size, \ .init = ccmode_xts_init, \ .set_tweak = ccmode_xts_set_tweak, \ .xts = ccmode_xts_crypt, \ @@ -723,8 +553,8 @@ struct _ccmode_xts_tweak { /* Use this to statically initialize a ccmode_xts object for encryption. */ #define CCMODE_FACTORY_XTS_ENCRYPT(ECB, ECB_ENCRYPT) { \ .size = ccn_sizeof_size(sizeof(struct _ccmode_xts_key)) + 2 * ccn_sizeof_size((ECB)->size), \ -.tweak_size = ccn_sizeof_size(sizeof(struct _ccmode_xts_tweak)) + ccn_sizeof_size(16), \ -.block_size = 16, \ +.tweak_size = ccn_sizeof_size(sizeof(struct _ccmode_xts_tweak)) + ccn_sizeof_size(ecb->block_size), \ +.block_size = ecb->block_size, \ .init = ccmode_xts_init, \ .set_tweak = ccmode_xts_set_tweak, \ .xts = ccmode_xts_crypt, \ @@ -736,24 +566,16 @@ struct _ccmode_xts_tweak { example if it's part of a larger structure). Normally you would pass a ecb decrypt mode implementation of some underlying algorithm as the ecb parameter. */ -CC_INLINE void ccmode_factory_xts_decrypt(struct ccmode_xts *xts, const struct ccmode_ecb *ecb, - const struct ccmode_ecb *ecb_encrypt) { - struct ccmode_xts xts_decrypt = CCMODE_FACTORY_XTS_DECRYPT(ecb, ecb_encrypt); - *xts = xts_decrypt; -} + const struct ccmode_ecb *ecb_encrypt); /* Use these function to runtime initialize a ccmode_xts encrypt object (for example if it's part of a larger structure). 
Normally you would pass a ecb encrypt mode implementation of some underlying algorithm as the ecb parameter. */ -CC_INLINE void ccmode_factory_xts_encrypt(struct ccmode_xts *xts, const struct ccmode_ecb *ecb, - const struct ccmode_ecb *ecb_encrypt) { - struct ccmode_xts xts_encrypt = CCMODE_FACTORY_XTS_ENCRYPT(ecb, ecb_encrypt); - *xts = xts_encrypt; -} + const struct ccmode_ecb *ecb_encrypt); #endif /* _CORECRYPTO_CCMODE_FACTORY_H_ */ diff --git a/EXTERNAL_HEADERS/corecrypto/ccmode_impl.h b/EXTERNAL_HEADERS/corecrypto/ccmode_impl.h index 94279d7e4..817d45070 100644 --- a/EXTERNAL_HEADERS/corecrypto/ccmode_impl.h +++ b/EXTERNAL_HEADERS/corecrypto/ccmode_impl.h @@ -20,88 +20,114 @@ cc_aligned_struct(16) ccecb_ctx; /* Actual symmetric algorithm implementation should provide you one of these. */ struct ccmode_ecb { size_t size; /* first argument to ccecb_ctx_decl(). */ - unsigned long block_size; + size_t block_size; void (*init)(const struct ccmode_ecb *ecb, ccecb_ctx *ctx, size_t key_len, const void *key); - void (*ecb)(const ccecb_ctx *ctx, unsigned long nblocks, const void *in, + void (*ecb)(const ccecb_ctx *ctx, size_t nblocks, const void *in, void *out); }; -/* CBC mode. */ +/*! 
+ * @brief corecrypto symmetrical encryption and decryption modes + * + * corecrypto supports 6 stateless en(de)cryption modes and 2 stateful authenticated en(de)cryption modes + * stateless modes CBC, CFB, CFB8, CTR, OFB, XTS: They provide 3 interface functions that do not return error codes + * 1- ccmod_xxx_init() + * 2- ccmod_xxx_decrypt() + * 3- ccmod_xxx_encrypt() + * + * stateful modes CCM and GCM: They provide 7 interface functions that return error codes if a function is called out of state + * 1- ccmod_xxx_init() + * 2- ccmod_xxx_setiv() + * 3- ccmod_xxx_aad() + * 4- ccmod_xxx_decrypt() + * 5- ccmod_xxx_encrypt() + * 6- ccmod_xxx_finalize() + * 7- ccmod_xxx_reset() + * + * the correct call sequences are: + * + * calls to 1, 2 and 6 are required + * 2 and 3 can be called as many times as needed + * calls to 3, 4, 5 can be skipped + * + * 1, 2*n, 3*n, 4|5, 6 + * 1, 2*n, , 4|5, 6 + * 1, 2*n, , , 6 + * 1, 2*n, 3*n, , 6 + */ + +// 1- CBC mode, stateless cc_aligned_struct(16) cccbc_ctx; cc_aligned_struct(16) cccbc_iv; struct ccmode_cbc { size_t size; /* first argument to cccbc_ctx_decl(). */ - unsigned long block_size; + size_t block_size; void (*init)(const struct ccmode_cbc *cbc, cccbc_ctx *ctx, size_t key_len, const void *key); /* cbc encrypt or decrypt nblocks from in to out, iv will be used and updated. */ void (*cbc)(const cccbc_ctx *ctx, cccbc_iv *iv, - unsigned long nblocks, const void *in, void *out); + size_t nblocks, const void *in, void *out); const void *custom; }; -/* CFB mode. */ +// 2- CFB mode, stateless cc_aligned_struct(16) cccfb_ctx; struct ccmode_cfb { size_t size; /* first argument to cccfb_ctx_decl(). */ - unsigned long block_size; + size_t block_size; void (*init)(const struct ccmode_cfb *cfb, cccfb_ctx *ctx, size_t key_len, const void *key, const void *iv); void (*cfb)(cccfb_ctx *ctx, size_t nbytes, const void *in, void *out); const void *custom; }; -/* CFB8 mode. 
*/ - +// 3- CFB8 mode, stateless cc_aligned_struct(16) cccfb8_ctx; struct ccmode_cfb8 { size_t size; /* first argument to cccfb8_ctx_decl(). */ - unsigned long block_size; + size_t block_size; void (*init)(const struct ccmode_cfb8 *cfb8, cccfb8_ctx *ctx, size_t key_len, const void *key, const void *iv); void (*cfb8)(cccfb8_ctx *ctx, size_t nbytes, const void *in, void *out); const void *custom; }; -/* CTR mode. */ - +// 4- CTR mode, stateless cc_aligned_struct(16) ccctr_ctx; struct ccmode_ctr { size_t size; /* first argument to ccctr_ctx_decl(). */ - unsigned long block_size; + size_t block_size; void (*init)(const struct ccmode_ctr *ctr, ccctr_ctx *ctx, size_t key_len, const void *key, const void *iv); void (*ctr)(ccctr_ctx *ctx, size_t nbytes, const void *in, void *out); const void *custom; }; -/* OFB mode. */ - +// 5- OFB mode, stateless cc_aligned_struct(16) ccofb_ctx; struct ccmode_ofb { size_t size; /* first argument to ccofb_ctx_decl(). */ - unsigned long block_size; + size_t block_size; void (*init)(const struct ccmode_ofb *ofb, ccofb_ctx *ctx, size_t key_len, const void *key, const void *iv); void (*ofb)(ccofb_ctx *ctx, size_t nbytes, const void *in, void *out); const void *custom; }; -/* XTS mode. */ - +// 6- XTS mode, stateless cc_aligned_struct(16) ccxts_ctx; cc_aligned_struct(16) ccxts_tweak; struct ccmode_xts { size_t size; /* first argument to ccxts_ctx_decl(). */ size_t tweak_size; /* first argument to ccxts_tweak_decl(). */ - unsigned long block_size; + size_t block_size; /* Create a xts key from a xts mode object. The tweak_len here determines how long the tweak is in bytes, for each subsequent call to @@ -117,60 +143,60 @@ struct ccmode_xts { /* Encrypt blocks for a sector, clients must call set_tweak before calling this function. 
Return a pointer to the tweak buffer */ void *(*xts)(const ccxts_ctx *ctx, ccxts_tweak *tweak, - unsigned long nblocks, const void *in, void *out); + size_t nblocks, const void *in, void *out); const void *custom; const void *custom1; }; -/* GCM mode. */ - +//7- GCM mode, stateful cc_aligned_struct(16) ccgcm_ctx; +#define CCMODE_GCM_DECRYPTOR 78647 +#define CCMODE_GCM_ENCRYPTOR 4073947 struct ccmode_gcm { size_t size; /* first argument to ccgcm_ctx_decl(). */ - unsigned long block_size; - void (*init)(const struct ccmode_gcm *gcm, ccgcm_ctx *ctx, + int encdec; //is it encrypt or decrypt object + size_t block_size; + int (*init)(const struct ccmode_gcm *gcm, ccgcm_ctx *ctx, size_t key_len, const void *key); - void (*set_iv)(ccgcm_ctx *ctx, size_t iv_size, const void *iv); - void (*gmac)(ccgcm_ctx *ctx, size_t nbytes, const void *in); // could just be gcm with NULL out - void (*gcm)(ccgcm_ctx *ctx, size_t nbytes, const void *in, void *out); - void (*finalize)(ccgcm_ctx *key, size_t tag_size, void *tag); - void (*reset)(ccgcm_ctx *ctx); + int (*set_iv)(ccgcm_ctx *ctx, size_t iv_size, const void *iv); + int (*gmac)(ccgcm_ctx *ctx, size_t nbytes, const void *in); // could just be gcm with NULL out + int (*gcm)(ccgcm_ctx *ctx, size_t nbytes, const void *in, void *out); + int (*finalize)(ccgcm_ctx *key, size_t tag_size, void *tag); + int (*reset)(ccgcm_ctx *ctx); const void *custom; }; -/* GCM mode. */ - +//8- CCM mode, stateful cc_aligned_struct(16) ccccm_ctx; cc_aligned_struct(16) ccccm_nonce; struct ccmode_ccm { size_t size; /* first argument to ccccm_ctx_decl(). */ size_t nonce_size; /* first argument to ccccm_nonce_decl(). 
*/ - unsigned long block_size; - void (*init)(const struct ccmode_ccm *ccm, ccccm_ctx *ctx, + size_t block_size; + int (*init)(const struct ccmode_ccm *ccm, ccccm_ctx *ctx, size_t key_len, const void *key); - void (*set_iv)(ccccm_ctx *ctx, ccccm_nonce *nonce_ctx, size_t nonce_len, const void *nonce, + int (*set_iv)(ccccm_ctx *ctx, ccccm_nonce *nonce_ctx, size_t nonce_len, const void *nonce, size_t mac_size, size_t auth_len, size_t data_len); - void (*cbcmac)(ccccm_ctx *ctx, ccccm_nonce *nonce_ctx, size_t nbytes, const void *in); // could just be ccm with NULL out - void (*ccm)(ccccm_ctx *ctx, ccccm_nonce *nonce_ctx, size_t nbytes, const void *in, void *out); - void (*finalize)(ccccm_ctx *key, ccccm_nonce *nonce_ctx, void *mac); - void (*reset)(ccccm_ctx *key, ccccm_nonce *nonce_ctx); + int (*cbcmac)(ccccm_ctx *ctx, ccccm_nonce *nonce_ctx, size_t nbytes, const void *in); // could just be ccm with NULL out + int (*ccm)(ccccm_ctx *ctx, ccccm_nonce *nonce_ctx, size_t nbytes, const void *in, void *out); + int (*finalize)(ccccm_ctx *key, ccccm_nonce *nonce_ctx, void *mac); + int (*reset)(ccccm_ctx *key, ccccm_nonce *nonce_ctx); const void *custom; }; /* OMAC mode. */ - cc_aligned_struct(16) ccomac_ctx; struct ccmode_omac { size_t size; /* first argument to ccomac_ctx_decl(). */ - unsigned long block_size; + size_t block_size; void (*init)(const struct ccmode_omac *omac, ccomac_ctx *ctx, size_t tweak_len, size_t key_len, const void *key); - int (*omac)(ccomac_ctx *ctx, unsigned long nblocks, + int (*omac)(ccomac_ctx *ctx, size_t nblocks, const void *tweak, const void *in, void *out); const void *custom; }; diff --git a/EXTERNAL_HEADERS/corecrypto/ccmode_siv.h b/EXTERNAL_HEADERS/corecrypto/ccmode_siv.h new file mode 100644 index 000000000..69069bb3a --- /dev/null +++ b/EXTERNAL_HEADERS/corecrypto/ccmode_siv.h @@ -0,0 +1,138 @@ +/* + * ccmode_siv.h + * corecrypto + * + * Created on 11/13/2015 + * + * Copyright (c) 2015 Apple Inc. All rights reserved. 
+ * + */ + +#ifndef _CORECRYPTO_CCMODE_SIV_H_ +#define _CORECRYPTO_CCMODE_SIV_H_ + +#include +#include +#include + +#include + +/* This provides an implementation of SIV + as specified in https://tools.ietf.org/html/rfc5297 + also in http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/siv/siv.pdf + Counter Mode where IV is based on CMAC + */ + +cc_aligned_struct(16) ccsiv_ctx; + +struct ccmode_siv { + size_t size; /* first argument to ccsiv_ctx_decl(). */ + size_t block_size; + int (*init)(const struct ccmode_siv *siv, ccsiv_ctx *ctx, + size_t key_len, const uint8_t *key); + int (*set_nonce)(ccsiv_ctx *ctx, size_t nbytes, const uint8_t *in); // could just be ccm with NULL out + int (*auth)(ccsiv_ctx *ctx, size_t nbytes, const uint8_t *in); // could just be ccm with NULL out + int (*crypt)(ccsiv_ctx *ctx, size_t nbytes, const uint8_t *in, uint8_t *out); + int (*reset)(ccsiv_ctx *ctx); + const struct ccmode_cbc *cbc; + const struct ccmode_ctr *ctr; +}; + +#define ccsiv_ctx_decl(_size_, _name_) cc_ctx_decl(ccsiv_ctx, _size_, _name_) +#define ccsiv_ctx_clear(_size_, _name_) cc_clear(_size_, _name_) + +// Functions + +CC_INLINE size_t ccsiv_context_size(const struct ccmode_siv *mode) +{ + return mode->size; +} + +CC_INLINE size_t ccsiv_block_size(const struct ccmode_siv *mode) +{ + return mode->block_size; +} + +CC_INLINE size_t ccsiv_ciphertext_size(const struct ccmode_siv *mode, + size_t plaintext_size) +{ + return plaintext_size+mode->cbc->block_size; +} + +CC_INLINE size_t ccsiv_plaintext_size(const struct ccmode_siv *mode, + size_t ciphertext_size) +{ + if (ciphertext_size<mode->cbc->block_size) { + return 0; // error + } + return ciphertext_size-mode->cbc->block_size; +} + +// In theory, supported key sizes are 32, 48, 64 bytes +// In practice, we only support key size 32 bytes due to cmac limitation +CC_INLINE int ccsiv_init(const struct ccmode_siv *mode, ccsiv_ctx *ctx, + size_t key_byte_len, const uint8_t *key) +{ + return mode->init(mode, ctx, 
key_byte_len, key); +} + +// Process nonce. it is actually just an authenticated data +CC_INLINE int ccsiv_set_nonce(const struct ccmode_siv *mode, ccsiv_ctx *ctx, + size_t nbytes, const uint8_t *in) +{ + return mode->set_nonce(ctx, nbytes, in); +} + +// Process authenticated data. Taken into account for authentication but not +// encrypted +CC_INLINE int ccsiv_aad(const struct ccmode_siv *mode, ccsiv_ctx *ctx, + size_t nbytes, const uint8_t *in) +{ + return mode->auth(ctx, nbytes, in); +} + +// Encryption data. Authenticated and encrypted. +// Encrypt/Decrypt can only be called once +CC_INLINE int ccsiv_crypt(const struct ccmode_siv *mode, ccsiv_ctx *ctx, + size_t nbytes, const uint8_t *in, uint8_t *out) +{ + return mode->crypt(ctx, nbytes, in, out); +} + +// Clear all context for reuse. +// Key is clear to avoid leaking it +CC_INLINE int ccsiv_reset(const struct ccmode_siv *mode, ccsiv_ctx *ctx) +{ + return mode->reset(ctx); +} + +// One shot with only one vector of adata +CC_INLINE int ccsiv_one_shot(const struct ccmode_siv *mode, + size_t key_len, const uint8_t *key, + unsigned nonce_nbytes, const uint8_t* nonce, + unsigned adata_nbytes, const uint8_t* adata, + size_t in_nbytes, const uint8_t *in, uint8_t *out) +{ + int rc; + ccsiv_ctx_decl(mode->size, ctx); + ccsiv_init(mode, ctx, key_len, key); + rc=mode->set_nonce(ctx, nonce_nbytes, nonce); + if (rc) {return rc;} + rc=mode->auth(ctx, adata_nbytes, adata); + if (rc) {return rc;} + rc=mode->crypt(ctx, in_nbytes, in, out); + if (rc) {return rc;} + ccsiv_ctx_clear(mode->size, ctx); + return rc; +} + +void ccmode_factory_siv_encrypt(struct ccmode_siv *siv, + const struct ccmode_cbc *cbc, + const struct ccmode_ctr *ctr); + +void ccmode_factory_siv_decrypt(struct ccmode_siv *siv, + const struct ccmode_cbc *cbc, + const struct ccmode_ctr *ctr); + + +#endif /* _CORECRYPTO_CCMODE_H_ */ diff --git a/EXTERNAL_HEADERS/corecrypto/ccn.h b/EXTERNAL_HEADERS/corecrypto/ccn.h index a66d0d618..53c152c88 100644 --- 
a/EXTERNAL_HEADERS/corecrypto/ccn.h +++ b/EXTERNAL_HEADERS/corecrypto/ccn.h @@ -16,28 +16,48 @@ #include typedef uint8_t cc_byte; -typedef size_t cc_size; +typedef size_t cc_size; #if CCN_UNIT_SIZE == 8 typedef uint64_t cc_unit; // 64 bit unit -typedef unsigned cc_dunit __attribute__((mode(TI))); // 128 bit double width unit +typedef int64_t cc_int; #define CCN_LOG2_BITS_PER_UNIT 6 // 2^6 = 64 bits #define CC_UNIT_C(x) UINT64_C(x) + #if CCN_UINT128_SUPPORT_FOR_64BIT_ARCH + typedef unsigned cc_dunit __attribute__((mode(TI))); // 128 bit double width unit + typedef signed cc_dint __attribute__((mode(TI))); + #else + typedef struct cc_dunit { + uint64_t l; //do not change the order of the variables. cc_dunit must be little endian + uint64_t h; + } cc_dunit; + + typedef struct cc_dint { + uint64_t l; + uint64_t h; + } cc_dint; + #endif + #elif CCN_UNIT_SIZE == 4 typedef uint32_t cc_unit; // 32 bit unit typedef uint64_t cc_dunit; // 64 bit double width unit +typedef int64_t cc_dint; +typedef int32_t cc_int; #define CCN_LOG2_BITS_PER_UNIT 5 // 2^5 = 32 bits #define CC_UNIT_C(x) UINT32_C(x) + #elif CCN_UNIT_SIZE == 2 typedef uint16_t cc_unit; // 16 bit unit typedef uint32_t cc_dunit; // 32 bit double width unit #define CCN_LOG2_BITS_PER_UNIT 4 // 2^4 = 16 bits #define CC_UNIT_C(x) UINT16_C(x) + #elif CCN_UNIT_SIZE == 1 typedef uint8_t cc_unit; // 8 bit unit typedef uint16_t cc_dunit; // 16 bit double width unit #define CCN_LOG2_BITS_PER_UNIT 3 // 2^3 = 8 bits #define CC_UNIT_C(x) UINT8_C(x) + #else #error invalid CCN_UNIT_SIZE #endif @@ -66,7 +86,7 @@ typedef struct { #define ccn_sizeof_n(_n_) (sizeof(cc_unit) * (_n_)) /* Returns the count (n) of a ccn vector that can represent _bits_. */ -#define ccn_nof(_bits_) (((_bits_) + CCN_UNIT_BITS - 1) / CCN_UNIT_BITS) +#define ccn_nof(_bits_) (((_bits_) + CCN_UNIT_BITS - 1) >> CCN_LOG2_BITS_PER_UNIT) /* Returns the sizeof a ccn vector that can represent _bits_. 
*/ #define ccn_sizeof(_bits_) (ccn_sizeof_n(ccn_nof(_bits_))) @@ -85,14 +105,14 @@ typedef struct { /* Returns the value of bit _k_ of _ccn_, both are only evaluated once. */ #define ccn_bit(_ccn_, _k_) ({__typeof__ (_k_) __k = (_k_); \ - 1 & ((_ccn_)[__k / CCN_UNIT_BITS] >> (__k & (CCN_UNIT_BITS - 1)));}) + 1 & ((_ccn_)[ __k >> CCN_LOG2_BITS_PER_UNIT] >> (__k & (CCN_UNIT_BITS - 1)));}) /* Set the value of bit _k_ of _ccn_ to the value _v_ */ #define ccn_set_bit(_ccn_, _k_, _v_) ({__typeof__ (_k_) __k = (_k_); \ if (_v_) \ - (_ccn_)[__k/CCN_UNIT_BITS] |= CC_UNIT_C(1) << (__k & (CCN_UNIT_BITS - 1)); \ + (_ccn_)[ __k >> CCN_LOG2_BITS_PER_UNIT] |= CC_UNIT_C(1) << (__k & (CCN_UNIT_BITS - 1)); \ else \ - (_ccn_)[__k/CCN_UNIT_BITS] &= ~(CC_UNIT_C(1) << (__k & (CCN_UNIT_BITS - 1))); \ + (_ccn_)[ __k >> CCN_LOG2_BITS_PER_UNIT] &= ~(CC_UNIT_C(1) << (__k & (CCN_UNIT_BITS - 1))); \ }) /* Macros for making ccn constants. You must use list of CCN64_C() instances @@ -328,6 +348,9 @@ int ccn_cmpn(cc_size ns, const cc_unit *s, CC_NONNULL((2, 3, 4)) cc_unit ccn_sub(cc_size n, cc_unit *r, const cc_unit *s, const cc_unit *t); +/* |s - t| -> r return 1 iff t > s, 0 otherwise */ +cc_unit ccn_abs(cc_size n, cc_unit *r, const cc_unit *s, const cc_unit *t); + /* s - v -> r return 1 iff v > s return 0 otherwise. { N bit, sizeof(cc_unit) * 8 bit -> N bit } N = n * sizeof(cc_unit) * 8 */ CC_NONNULL((2, 3)) @@ -364,9 +387,6 @@ cc_unit ccn_addn(cc_size n, cc_unit *r, const cc_unit *s, return ccn_add1(n - nt, r + nt, s + nt, ccn_add(nt, r, s, t)); } -CC_NONNULL((4, 5)) -void ccn_divmod(cc_size n, cc_unit *q, cc_unit *r, const cc_unit *s, const cc_unit *t); - CC_NONNULL((2, 3, 4)) void ccn_lcm(cc_size n, cc_unit *r2n, const cc_unit *s, const cc_unit *t); @@ -611,4 +631,38 @@ int ccn_random(cc_size n, cc_unit *r, struct ccrng_state *rng) { CC_NONNULL((2, 3)) int ccn_random_bits(cc_size nbits, cc_unit *r, struct ccrng_state *rng); +/*! 
+ @brief ccn_make_recip(cc_size nd, cc_unit *recip, const cc_unit *d) computes the reciprocal of d: recip = 2^2b/d where b=bitlen(d) + + @param nd length of array d + @param recip returned reciprocal of size nd+1 + @param d input number d +*/ +CC_NONNULL((2, 3)) +void ccn_make_recip(cc_size nd, cc_unit *recip, const cc_unit *d); + +CC_NONNULL((6, 8)) +int ccn_div_euclid(cc_size nq, cc_unit *q, cc_size nr, cc_unit *r, cc_size na, const cc_unit *a, cc_size nd, const cc_unit *d); + +#define ccn_div(nq, q, na, a, nd, d) ccn_div_euclid(nq, q, 0, NULL, na, a, nd, d) +#define ccn_mod(nr, r, na, a, nd, d) ccn_div_euclid(0 , NULL, nr, r, na, a, nd, d) + +/*! + @brief ccn_div_use_recip(nq, q, nr, r, na, a, nd, d, recip_d) computes q=a/d and r=a%d + @discussion q and r can be NULL. Reads na from a and nd from d. Writes nq in q and nr in r. nq and nr must be large enough to accommodate results, otherwise error is returned. Execution time depends on the size of a. Computation is performed at fixed size, and the leading zeros of a and q are also used in the computation. + @param nq length of array q that holds the quotient. The maximum length of quotient is the actual length of dividend a + @param q returned quotient. If nq is larger than needed, it is filled with leading zeros. If it is smaller, error is returned. q can be set to NULL, if not needed. + @param nr length of array r that holds the remainder. The maximum length of remainder is the actual length of divisor d + @param r returned remainder. If nr is larger than needed, it is filled with leading zeros. If it is smaller, error is returned. r can be set to NULL if not required. + @param na length of dividend. Dividend may have leading zeros. + @param a input Dividend + @param nd length of input divisor. Divisor may have leading zeros. + @param d input Divisor + @param recip_d The reciprocal of d, of length nd+1. + + @return returns 0 if successful, negative on error. 
+ */ +CC_NONNULL((6, 8, 9)) +int ccn_div_use_recip(cc_size nq, cc_unit *q, cc_size nr, cc_unit *r, cc_size na, const cc_unit *a, cc_size nd, const cc_unit *d, const cc_unit *recip_d); + #endif /* _CORECRYPTO_CCN_H_ */ diff --git a/EXTERNAL_HEADERS/corecrypto/ccpad.h b/EXTERNAL_HEADERS/corecrypto/ccpad.h index 451436615..5f8e3c38c 100644 --- a/EXTERNAL_HEADERS/corecrypto/ccpad.h +++ b/EXTERNAL_HEADERS/corecrypto/ccpad.h @@ -13,34 +13,30 @@ #include -/* Contract is nbytes is at least 1 block + 1 byte. Also in is nbytes long out is nbytes long. */ -void ccpad_cts_decrypt(const struct ccmode_cbc *cbc, cccbc_ctx *ctx, cccbc_iv *iv, - size_t nbytes, const void *in, void *out); - -/* Contract is nbytes is at least 1 block + 1 byte. Also in is nbytes long out is nbytes long. */ -void ccpad_cts_encrypt(const struct ccmode_cbc *cbc, cccbc_ctx *ctx, cccbc_iv *iv, - size_t nbytes, const void *in, void *out); +// CTS1,2,3 are defined in Addendum to 800-38A, +// "Cipher Modes of Operation: Three Variants of Ciphertext Stealing for CBC Mode" +// CTS3 is also known as "CTS" in RFC3962 /* Contract is nbytes is at least 1 block + 1 byte. Also in is nbytes long out is nbytes long. */ -void ccpad_cts1_decrypt(const struct ccmode_cbc *cbc, cccbc_ctx *ctx, cccbc_iv *iv, +size_t ccpad_cts1_decrypt(const struct ccmode_cbc *cbc, cccbc_ctx *ctx, cccbc_iv *iv, size_t nbytes, const void *in, void *out); /* Contract is nbytes is at least 1 block + 1 byte. Also in is nbytes long out is nbytes long. */ -void ccpad_cts1_encrypt(const struct ccmode_cbc *cbc, cccbc_ctx *ctx, cccbc_iv *iv, +size_t ccpad_cts1_encrypt(const struct ccmode_cbc *cbc, cccbc_ctx *ctx, cccbc_iv *iv, size_t nbytes, const void *in, void *out); /* Contract is nbytes is at least 1 block + 1 byte. Also in is nbytes long out is nbytes long. 
*/ -void ccpad_cts2_decrypt(const struct ccmode_cbc *cbc, cccbc_ctx *ctx, cccbc_iv *iv, +size_t ccpad_cts2_decrypt(const struct ccmode_cbc *cbc, cccbc_ctx *ctx, cccbc_iv *iv, size_t nbytes, const void *in, void *out); /* Contract is nbytes is at least 1 block + 1 byte. Also in is nbytes long out is nbytes long. */ -void ccpad_cts2_encrypt(const struct ccmode_cbc *cbc, cccbc_ctx *ctx, cccbc_iv *iv, +size_t ccpad_cts2_encrypt(const struct ccmode_cbc *cbc, cccbc_ctx *ctx, cccbc_iv *iv, size_t nbytes, const void *in, void *out); /* Contract is nbytes is at least 1 block + 1 byte. Also in is nbytes long out is nbytes long. */ -void ccpad_cts3_decrypt(const struct ccmode_cbc *cbc, cccbc_ctx *ctx, cccbc_iv *iv, +size_t ccpad_cts3_decrypt(const struct ccmode_cbc *cbc, cccbc_ctx *ctx, cccbc_iv *iv, size_t nbytes, const void *in, void *out); /* Contract is nbytes is at least 1 block + 1 byte. Also in is nbytes long out is nbytes long. */ -void ccpad_cts3_encrypt(const struct ccmode_cbc *cbc, cccbc_ctx *ctx, cccbc_iv *iv, +size_t ccpad_cts3_encrypt(const struct ccmode_cbc *cbc, cccbc_ctx *ctx, cccbc_iv *iv, size_t nbytes, const void *in, void *out); /* Contract is nbytes is non zero and a multiple of block_size. Furthermore in is nbytes long and out is nbytes long. Returns number of bytes written to out (technically we always write nbytes to out but the returned value is the number of bytes decrypted after removal of padding. @@ -51,7 +47,7 @@ size_t ccpad_pkcs7_decrypt(const struct ccmode_cbc *cbc, cccbc_ctx *ctx, cccbc_i size_t nbytes, const void *in, void *out); /* Contract is in is nbytes long. Writes (nbytes / block_size) + 1 times block_size to out. In other words, out must be nbytes rounded down to the closest multiple of block_size plus block_size bytes. 
*/ -void ccpad_pkcs7_encrypt(const struct ccmode_cbc *cbc, cccbc_ctx *ctx, cccbc_iv *iv, +size_t ccpad_pkcs7_encrypt(const struct ccmode_cbc *cbc, cccbc_ctx *ctx, cccbc_iv *iv, size_t nbytes, const void *in, void *out); /* Contract is 'don't break CommonCrypto functionality that allows PKCS7 padding with ECB mode'. This is basically the same routines above, without an IV, because calling @@ -60,14 +56,14 @@ void ccpad_pkcs7_encrypt(const struct ccmode_cbc *cbc, cccbc_ctx *ctx, cccbc_iv size_t ccpad_pkcs7_ecb_decrypt(const struct ccmode_ecb *ecb, ccecb_ctx *ecb_key, size_t nbytes, const void *in, void *out); -void ccpad_pkcs7_ecb_encrypt(const struct ccmode_ecb *ecb, ccecb_ctx *ctx, +size_t ccpad_pkcs7_ecb_encrypt(const struct ccmode_ecb *ecb, ccecb_ctx *ctx, size_t nbytes, const void *in, void *out); /* Function common to ccpad_pkcs7_ecb_decrypt and ccpad_pkcs7_decrypt */ size_t ccpad_pkcs7_decode(const size_t block_size, const uint8_t* last_block); /* Contract is nbytes is at least 1 block + 1 byte. Also in is nbytes long out is nbytes long. */ -void ccpad_xts_decrypt(const struct ccmode_xts *xts, ccxts_ctx *ctx, ccxts_tweak *tweak, +size_t ccpad_xts_decrypt(const struct ccmode_xts *xts, ccxts_ctx *ctx, ccxts_tweak *tweak, size_t nbytes, const void *in, void *out); /* Contract is nbytes is at least 1 block + 1 byte. Also in is nbytes long out is nbytes long. */ diff --git a/EXTERNAL_HEADERS/corecrypto/ccpbkdf2.h b/EXTERNAL_HEADERS/corecrypto/ccpbkdf2.h index 9e296ff19..bf1cd8f5a 100644 --- a/EXTERNAL_HEADERS/corecrypto/ccpbkdf2.h +++ b/EXTERNAL_HEADERS/corecrypto/ccpbkdf2.h @@ -34,9 +34,9 @@ SHA-1 is a good hash to use for the core of the HMAC PRF. 
*/ int ccpbkdf2_hmac(const struct ccdigest_info *di, - unsigned long passwordLen, const void *password, - unsigned long saltLen, const void *salt, - unsigned long iterations, - unsigned long dkLen, void *dk); + size_t passwordLen, const void *password, + size_t saltLen, const void *salt, + size_t iterations, + size_t dkLen, void *dk); #endif /* _CORECRYPTO_CCPBKDF2_H_ */ diff --git a/EXTERNAL_HEADERS/corecrypto/ccrc4.h b/EXTERNAL_HEADERS/corecrypto/ccrc4.h index 6e1ec736a..3b50710a3 100644 --- a/EXTERNAL_HEADERS/corecrypto/ccrc4.h +++ b/EXTERNAL_HEADERS/corecrypto/ccrc4.h @@ -23,7 +23,7 @@ cc_aligned_struct(16) ccrc4_ctx; struct ccrc4_info { size_t size; /* first argument to ccrc4_ctx_decl(). */ void (*init)(ccrc4_ctx *ctx, size_t key_len, const void *key); - void (*crypt)(ccrc4_ctx *ctx, unsigned long nbytes, const void *in, void *out); + void (*crypt)(ccrc4_ctx *ctx, size_t nbytes, const void *in, void *out); }; @@ -34,7 +34,7 @@ extern const struct ccrc4_info ccrc4_eay; struct ccrc4_vector { size_t keylen; const void *key; - unsigned long datalen; + size_t datalen; const void *pt; const void *ct; }; diff --git a/EXTERNAL_HEADERS/corecrypto/ccrng.h b/EXTERNAL_HEADERS/corecrypto/ccrng.h index a3291c830..f32922276 100644 --- a/EXTERNAL_HEADERS/corecrypto/ccrng.h +++ b/EXTERNAL_HEADERS/corecrypto/ccrng.h @@ -13,17 +13,25 @@ #include -#define CC_ERR_DEVICE -100 -#define CC_ERR_INTERUPTS -101 -#define CC_ERR_CRYPTO_CONFIG -102 -#define CC_ERR_PERMS -103 -#define CC_ERR_PARAMETER -104 -#define CC_ERR_MEMORY -105 -#define CC_ERR_FILEDESC -106 -#define CC_ERR_OUT_OF_ENTROPY -107 +#include + +#define CC_ERR_DEVICE -100 +#define CC_ERR_INTERUPTS -101 +#define CC_ERR_CRYPTO_CONFIG -102 +#define CC_ERR_PERMS -103 +#define CC_ERR_PARAMETER -104 +#define CC_ERR_MEMORY -105 +#define CC_ERR_FILEDESC -106 +#define CC_ERR_OUT_OF_ENTROPY -107 +#define CC_ERR_INTERNAL -108 +#define CC_ERR_ATFORK -109 +#define CC_ERR_OVERFLOW -110 #define CCRNG_STATE_COMMON \ - int (*generate)(struct 
ccrng_state *rng, unsigned long outlen, void *out); + int (*generate)(struct ccrng_state *rng, size_t outlen, void *out); + +/* Get a pointer to a ccrng has never been simpler! Just call this */ +struct ccrng_state *ccrng(int *error); /* default state structure - do not instantiate, instead use the specific one you need */ struct ccrng_state { diff --git a/EXTERNAL_HEADERS/corecrypto/ccrng_system.h b/EXTERNAL_HEADERS/corecrypto/ccrng_system.h index b6c8c06fd..a5aab7ed2 100644 --- a/EXTERNAL_HEADERS/corecrypto/ccrng_system.h +++ b/EXTERNAL_HEADERS/corecrypto/ccrng_system.h @@ -18,7 +18,21 @@ struct ccrng_system_state { int fd; }; -// Setup the system RNG (open descriptor on file /dev/random) +/*! + @function ccrng_system_init - DEPRECATED + @abstract Default ccrng. + Please transition to ccrng() which is easier to use and with provide the fastest, most secure option + + @param rng Structure containing the state of the RNG, must remain allocated as + long as the rng is used. + @result 0 iff successful + + @discussion + This RNG require call to "init" AND "done", otherwise it may leak a file descriptor. 
+ */ + +// Initialize ccrng +// Deprecated, if you need a rng, just call the function ccrng() int ccrng_system_init(struct ccrng_system_state *rng); // Close the system RNG diff --git a/EXTERNAL_HEADERS/corecrypto/ccsha1.h b/EXTERNAL_HEADERS/corecrypto/ccsha1.h index 1990c197e..3372324b9 100644 --- a/EXTERNAL_HEADERS/corecrypto/ccsha1.h +++ b/EXTERNAL_HEADERS/corecrypto/ccsha1.h @@ -32,13 +32,16 @@ void ccsha1_final(const struct ccdigest_info *di, ccdigest_ctx_t, extern const struct ccdigest_info ccsha1_ltc_di; extern const struct ccdigest_info ccsha1_eay_di; -#if !defined(__NO_ASM__) && CCSHA1_VNG_INTEL +#if CCSHA1_VNG_INTEL //extern const struct ccdigest_info ccsha1_vng_intel_di; +#if defined(__x86_64__) +extern const struct ccdigest_info ccsha1_vng_intel_AVX2_di; +extern const struct ccdigest_info ccsha1_vng_intel_AVX1_di; +#endif extern const struct ccdigest_info ccsha1_vng_intel_SupplementalSSE3_di; -extern const struct ccdigest_info ccsha1_vng_intel_NOSupplementalSSE3_di; #endif -#if !defined(__NO_ASM__) && CCSHA1_VNG_ARMV7NEON +#if CCSHA1_VNG_ARMV7NEON extern const struct ccdigest_info ccsha1_vng_armv7neon_di; #endif diff --git a/EXTERNAL_HEADERS/corecrypto/ccsha2.h b/EXTERNAL_HEADERS/corecrypto/ccsha2.h index 2029e327b..1efca569d 100644 --- a/EXTERNAL_HEADERS/corecrypto/ccsha2.h +++ b/EXTERNAL_HEADERS/corecrypto/ccsha2.h @@ -38,27 +38,39 @@ const struct ccdigest_info *ccsha512_di(void); #define CCSHA256_OUTPUT_SIZE 32 #define CCSHA256_STATE_SIZE 32 extern const struct ccdigest_info ccsha256_ltc_di; -#if !defined(__NO_ASM__) && CCSHA2_VNG_INTEL +extern const struct ccdigest_info ccsha256_v6m_di; +#if CCSHA2_VNG_INTEL #if defined __x86_64__ +extern const struct ccdigest_info ccsha224_vng_intel_AVX2_di; +extern const struct ccdigest_info ccsha224_vng_intel_AVX1_di; extern const struct ccdigest_info ccsha256_vng_intel_AVX2_di; extern const struct ccdigest_info ccsha256_vng_intel_AVX1_di; +extern const struct ccdigest_info ccsha384_vng_intel_AVX2_di; +extern 
const struct ccdigest_info ccsha384_vng_intel_AVX1_di; +extern const struct ccdigest_info ccsha384_vng_intel_SupplementalSSE3_di; +extern const struct ccdigest_info ccsha512_vng_intel_AVX2_di; +extern const struct ccdigest_info ccsha512_vng_intel_AVX1_di; +extern const struct ccdigest_info ccsha512_vng_intel_SupplementalSSE3_di; #endif extern const struct ccdigest_info ccsha256_vng_intel_SupplementalSSE3_di; -extern const struct ccdigest_info ccsha256_vng_intel_NOSupplementalSSE3_di; #endif -#if !defined(__NO_ASM__) && CCSHA2_VNG_ARMV7NEON +#if CCSHA2_VNG_ARMV7NEON extern const struct ccdigest_info ccsha256_vng_armv7neon_di; +extern const struct ccdigest_info ccsha384_vng_arm64_di; +extern const struct ccdigest_info ccsha384_vng_armv7neon_di; +extern const struct ccdigest_info ccsha512_vng_arm64_di; +extern const struct ccdigest_info ccsha512_vng_armv7neon_di; #endif extern const uint32_t ccsha256_K[64]; +extern const uint64_t ccsha512_K[80]; /* SHA224 */ #define CCSHA224_OUTPUT_SIZE 28 extern const struct ccdigest_info ccsha224_ltc_di; -#if !defined(__NO_ASM__) && CCSHA2_VNG_INTEL +#if CCSHA2_VNG_INTEL extern const struct ccdigest_info ccsha224_vng_intel_SupplementalSSE3_di; -extern const struct ccdigest_info ccsha224_vng_intel_NOSupplementalSSE3_di; #endif -#if !defined(__NO_ASM__) && CCSHA2_VNG_ARMV7NEON +#if CCSHA2_VNG_ARMV7NEON extern const struct ccdigest_info ccsha224_vng_armv7neon_di; #endif diff --git a/EXTERNAL_HEADERS/mach-o/Makefile b/EXTERNAL_HEADERS/mach-o/Makefile index 55ac56a4b..98d2a59ba 100644 --- a/EXTERNAL_HEADERS/mach-o/Makefile +++ b/EXTERNAL_HEADERS/mach-o/Makefile @@ -3,7 +3,6 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - include $(MakeInc_cmd) include $(MakeInc_def) @@ -13,9 +12,9 @@ EXPORT_FILES = \ nlist.h \ reloc.h -INSTALL_MI_LIST = +INSTALL_MI_LIST = -INSTALL_MI_DIR = +INSTALL_MI_DIR = EXPORT_MI_LIST = ${EXPORT_FILES} @@ 
-23,5 +22,3 @@ EXPORT_MI_DIR = mach-o include $(MakeInc_rule) include $(MakeInc_dir) - - diff --git a/EXTERNAL_HEADERS/mach-o/loader.h b/EXTERNAL_HEADERS/mach-o/loader.h index aba7cb7f6..ffaf873d8 100644 --- a/EXTERNAL_HEADERS/mach-o/loader.h +++ b/EXTERNAL_HEADERS/mach-o/loader.h @@ -300,6 +300,7 @@ struct load_command { #define LC_ENCRYPTION_INFO_64 0x2C /* 64-bit encrypted segment information */ #define LC_LINKER_OPTION 0x2D /* linker options in MH_OBJECT files */ #define LC_LINKER_OPTIMIZATION_HINT 0x2E /* optimization hints in MH_OBJECT files */ +#define LC_VERSION_MIN_TVOS 0x2F /* build for AppleTV min OS version */ #define LC_VERSION_MIN_WATCHOS 0x30 /* build for Watch min OS version */ /* diff --git a/Makefile b/Makefile index 2a4b9ab19..c4c0663ef 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,6 @@ # -# Copyright (C) 1999-2013 Apple Inc. All rights reserved. +# Copyright (C) 1999-2016 Apple Inc. All rights reserved. # - ifndef VERSDIR export VERSDIR := $(shell /bin/pwd) endif @@ -39,12 +38,15 @@ endif default: install +# default to OS X +SDKROOT ?= macosx.internal + installhdrs install: cd libsyscall ; \ xcodebuild $@ $(TARGET) \ "SRCROOT=$(SRCROOT)/libsyscall" \ - "OBJROOT=$(OBJROOT)" \ - "SYMROOT=$(SYMROOT)" \ + "OBJROOT=$(OBJROOT)" \ + "SYMROOT=$(SYMROOT)" \ "DSTROOT=$(DSTROOT)" \ "SDKROOT=$(SDKROOT)" @@ -87,33 +89,28 @@ default: install installhdrs install: cd libkern/kmod ; \ - xcodebuild $@ \ + xcodebuild $@ \ "SRCROOT=$(SRCROOT)/libkern/kmod" \ - "OBJROOT=$(OBJROOT)" \ - "SYMROOT=$(SYMROOT)" \ + "OBJROOT=$(OBJROOT)" \ + "SYMROOT=$(SYMROOT)" \ "DSTROOT=$(DSTROOT)" \ "SDKROOT=$(SDKROOT)" clean: -installsrc: - pax -rw . $(SRCROOT) - -else ifeq ($(RC_ProjectName),xnu_quick_test) -# This rule should be removed once rdar://22820602 is complete. -default: install - -installhdrs: - -install: xnu_tests - -clean: - installsrc: pax -rw . 
$(SRCROOT) else ifeq ($(RC_ProjectName),xnu_tests) +export SYSCTL_HW_PHYSICALCPU := $(shell /usr/sbin/sysctl -n hw.physicalcpu) +export SYSCTL_HW_LOGICALCPU := $(shell /usr/sbin/sysctl -n hw.logicalcpu) +ifeq ($(SYSCTL_HW_PHYSICALCPU),$(SYSCTL_HW_LOGICALCPU)) +MAKEJOBS := --jobs=$(shell expr $(SYSCTL_HW_PHYSICALCPU) + 1) +else +MAKEJOBS := --jobs=$(SYSCTL_HW_LOGICALCPU) +endif + default: install installhdrs: @@ -151,26 +148,26 @@ else MAKEJOBS := --jobs=$(SYSCTL_HW_LOGICALCPU) endif -TOP_TARGETS = \ - clean \ - installsrc \ - exporthdrs \ +TOP_TARGETS = \ + clean \ + installsrc \ + exporthdrs \ all all_desktop all_embedded \ - all_release_embedded all_development_embedded \ - installhdrs installhdrs_desktop installhdrs_embedded \ - installhdrs_release_embedded installhdrs_development_embedded \ - install install_desktop install_embedded \ - install_release_embedded install_development_embedded \ - installopensource \ + all_release_embedded all_development_embedded \ + installhdrs installhdrs_desktop installhdrs_embedded \ + installhdrs_release_embedded installhdrs_development_embedded \ + install install_desktop install_embedded \ + install_release_embedded install_development_embedded \ + installopensource \ cscope tags TAGS reindent \ help DEFAULT_TARGET = all # Targets for internal build system debugging -TOP_TARGETS += \ +TOP_TARGETS += \ print_exports print_exports_first_build_config \ - setup \ + setup \ build \ config \ install_textfiles \ @@ -242,10 +239,10 @@ endif # all other RC_ProjectName installhdrs_libkdd install_libkdd: cd libkdd; \ - xcodebuild $(subst _libkdd,,$@) \ + xcodebuild -target libkdd $(subst _libkdd,,$@) \ "SRCROOT=$(SRCROOT)/libkdd" \ - "OBJROOT=$(OBJROOT)" \ - "SYMROOT=$(SYMROOT)" \ + "OBJROOT=$(OBJROOT)" \ + "SYMROOT=$(SYMROOT)" \ "DSTROOT=$(DSTROOT)" \ "SDKROOT=$(SDKROOT)" @@ -254,14 +251,6 @@ installhdrs_libkdd install_libkdd: # "make xnu_tests" or via buildit/XBS with the RC_ProjectName=xnu_tests. 
# Define the target here in the outermost scope of the initial Makefile -xnu_tests xnu_quick_test: - $(MAKE) -C $(SRCROOT)/tools/tests \ +xnu_tests: + $(MAKE) -C $(SRCROOT)/tools/tests $(if $(filter -j,$(MAKEFLAGS)),,$(MAKEJOBS)) \ SRCROOT=$(SRCROOT)/tools/tests - -# This target is defined to compile and run xnu_quick_test under testbots -testbots: - $(MAKE) -C $(SRCROOT)/tools/tests/xnu_quick_test \ - SRCROOT=$(SRCROOT)/tools/tests/xnu_quick_test \ - MORECFLAGS="-DRUN_UNDER_TESTBOTS=1" \ - MAKE=$(MAKE) \ - testbots diff --git a/README b/README deleted file mode 100644 index 1294b6726..000000000 --- a/README +++ /dev/null @@ -1,255 +0,0 @@ -Table of contents: -A. How to build XNU -B. How to install a new header file from XNU - -============================================= -A. How to build XNU: - -1) Type: "make" - - This builds all the components for kernel, architecture, and machine - configurations defined in TARGET_CONFIGS. Additionally, we also support - architectures defined in ARCH_CONFIGS and kernel configurations defined in - KERNEL_CONFIGS. Note that TARGET_CONFIGS overrides any configurations defined - in ARCH_CONFIGS and KERNEL_CONFIGS. - - By default, architecture defaults to the build machine - architecture, and the kernel configuration is set to build for DEVELOPMENT. - - This will also create a bootable image, mach_kernel, and a kernel binary - with symbols, mach_kernel.sys. - - - /* this is all you need to do to build with RELEASE kernel configuration */ - make TARGET_CONFIGS="release x86_64 default" SDKROOT=/path/to/SDK - - or the following is equivalent (ommitted SDKROOT will use /) - - make ARCH_CONFIGS=X86_64 - -2) Building DEBUG - - Define kernel configuration to DEBUG in your environment or when running a - make command. 
Then, apply procedures 4, 5 - - $ make TARGET_CONFIGS="DEBUG X86_64 DEFAULT" all - - or - - $ make KERNEL_CONFIGS=DEBUG ARCH_CONFIGS=X86_64 all - - or - - $ export TARGET_CONFIGS="DEBUG X86_64 DEFAULT" - $ export SDKROOT=/path/to/SDK - $ make all - - Example: - $(OBJROOT)/DEBUG_X86_64/osfmk/DEBUG/osfmk.filelist: list of objects in osfmk component - $(OBJROOT)/DEBUG_X86_64/mach_kernel: bootable image - -3) Building fat - - Define architectures in your environment or when running a make command. - Apply procedures 3, 4, 5 - - $ make TARGET_CONFIGS="RELEASE I386 DEFAULT RELEASE X86_64 DEFAULT" exporthdrs all - - or - - $ make ARCH_CONFIGS="I386 X86_64" exporthdrs all - - or - - $ export ARCH_CONFIGS="I386 X86_64" - $ make exporthdrs all - -4) Verbose make - To display complete tool invocations rather than an abbreviated version, - $ make VERBOSE=YES - -5) Debug information formats - By default, a DWARF debug information repository is created during the install phase; this is a "bundle" named mach_kernel.dSYM - To select the older STABS debug information format (where debug information is embedded in the mach_kernel.sys image), set the BUILD_STABS environment variable. - $ export BUILD_STABS=1 - $ make - -6) Build check before integration - - From the top directory, run: - - $ ~rc/bin/buildit . -arch i386 -arch x86_64 -arch armv7 -arch ppc -noinstallsrc -nosum - - - xnu supports a number of XBS build aliases, which allow B&I to build - the same source submission multiple times in different ways, to - produce different results. 
Each build alias supports the standard - "clean", "install", "installsrc", "installhdrs" targets, but - conditionalize their behavior on the RC_ProjectName make variable - which is passed as the -buildAlias argument to ~rc/bin/buildit, which - can be one of: - - -buildAlias xnu # the default, builds /mach_kernel, kernel-space - # headers, user-space headers, man pages, - # symbol-set kexts - - -buildAlias xnu_debug # a DEBUG kernel in /AppleInternal with dSYM - - -buildAlias libkxld # user-space version of kernel linker - - -buildAlias libkmod # static library automatically linked into kexts - - -buildAlias Libsyscall # automatically generate BSD syscall stubs - - -buildAlias xnu_quick_test # install xnu unit tests - - - -7) Creating tags and cscope - - Set up your build environment as per instructions in 2a - - From the top directory, run: - - $ make tags # this will build ctags and etags on a case-sensitive - # volume, only ctags on case-insensitive - - $ make TAGS # this will build etags - - $ make cscope # this will build cscope database - -8) Reindenting files - - Source files can be reindented using clang-format setup in .clang-format. XNU follow a variant of WebKit style for source code formatting. Please refer to format styles at http://www.webkit.org/coding/coding-style.html. Further options about style options is available at http://clang.llvm.org/docs/ClangFormatStyleOptions.html - - Note: clang-format binary may not be part of base installation. It can be compiled from llvm clang sources and is reachable in $PATH. - - From the top directory, run: - - $ make reindent # reindent all source files using clang format. - - -9) Other makefile options - - $ make MAKEJOBS=-j8 # this will use 8 processes during the build. The default is 2x the number of active CPUS. - $ make -j8 # the standard command-line option is also accepted - - $ make -w # trace recursive make invocations. 
Useful in combination with VERBOSE=YES - - $ make BUILD_LTO=0 # build without LLVM Link Time Optimization - - $ make REMOTEBUILD=user@remotehost # perform build on remote host - - $ make BUILD_JSON_COMPILATION_DATABASE=1 # Build Clang JSON Compilation Database - -============================================= -B. How to install a new header file from XNU - -[To install IOKit headers, see additional comments in iokit/IOKit/Makefile.] - -1) XNU installs header files at the following locations - - a. $(DSTROOT)/System/Library/Frameworks/Kernel.framework/Headers - b. $(DSTROOT)/System/Library/Frameworks/Kernel.framework/PrivateHeaders - c. $(DSTROOT)/usr/include/ - d. $(DSTROOT)/System/Library/Frameworks/System.framework/PrivateHeaders - - Kernel.framework is used by kernel extensions. System.framework - and /usr/include are used by user level applications. The header - files in framework's "PrivateHeaders" are only available for Apple - Internal development. - -2) The directory containing the header file should have a Makefile that - creates the list of files that should be installed at different locations. - If you are adding first header file in a directory, you will need to - create Makefile similar to xnu/bsd/sys/Makefile. - - Add your header file to the correct file list depending on where you want - to install it. The default locations where the header files are installed - from each file list are - - - a. DATAFILES : To make header file available in user level - - $(DSTROOT)/usr/include - - b. PRIVATE_DATAFILES : To make header file available to Apple internal in - user level - - $(DSTROOT)/System/Library/Frameworks/System.framework/PrivateHeaders - - c. KERNELFILES : To make header file available in kernel level - - $(DSTROOT)/System/Library/Frameworks/Kernel.framework/Headers - $(DSTROOT)/System/Library/Frameworks/Kernel.framework/PrivateHeaders - - d. 
PRIVATE_KERNELFILES : To make header file available to Apple internal - for kernel extensions - - $(DSTROOT)/System/Library/Frameworks/Kernel.framework/PrivateHeaders - -3) The Makefile combines the file lists mentioned above into different - install lists which are used by build system to install the header files. - - If the install list that you are interested does not exist, create it - by adding the appropriate file lists. The default install lists, its - member file lists and their default location are described below - - - a. INSTALL_MI_LIST : Installs header file to a location that is available to - everyone in user level. - Locations - - $(DSTROOT)/usr/include - Definition - - INSTALL_MI_LIST = ${DATAFILES} - - b. INSTALL_MI_LCL_LIST : Installs header file to a location that is available - for Apple internal in user level. - Locations - - $(DSTROOT)/System/Library/Frameworks/System.framework/PrivateHeaders - Definition - - INSTALL_MI_LCL_LIST = ${PRIVATE_DATAFILES} - - c. INSTALL_KF_MI_LIST : Installs header file to location that is available - to everyone for kernel extensions. - Locations - - $(DSTROOT)/System/Library/Frameworks/Kernel.framework/Headers - Definition - - INSTALL_KF_MI_LIST = ${KERNELFILES} - - d. INSTALL_KF_MI_LCL_LIST : Installs header file to location that is - available for Apple internal for kernel extensions. - Locations - - $(DSTROOT)/System/Library/Frameworks/Kernel.framework/PrivateHeaders - Definition - - INSTALL_KF_MI_LCL_LIST = ${KERNELFILES} ${PRIVATE_KERNELFILES} - -4) If you want to install the header file in a sub-directory of the paths - described in (1), specify the directory name using two variable - INSTALL_MI_DIR and EXPORT_MI_DIR as follows - - - INSTALL_MI_DIR = dirname - EXPORT_MI_DIR = dirname - -5) A single header file can exist at different locations using the steps - mentioned above. However it might not be desirable to make all the code - in the header file available at all the locations. 
For example, you - want to export a function only to kernel level but not user level. - - You can use C language's pre-processor directive (#ifdef, #endif, #ifndef) - to control the text generated before a header file is installed. The kernel - only includes the code if the conditional macro is TRUE and strips out - code for FALSE conditions from the header file. - - Some pre-defined macros and their descriptions are - - a. PRIVATE : If true, code is available to all of the xnu kernel and is - not available in kernel extensions and user level header files. The - header files installed in all the paths described above in (1) will not - have code enclosed within this macro. - - b. KERNEL_PRIVATE : Same as PRIVATE - - c. BSD_KERNEL_PRIVATE : If true, code is available to the xnu/bsd part of - the kernel and is not available to rest of the kernel, kernel extensions - and user level header files. The header files installed in all the - paths described above in (1) will not have code enclosed within this - macro. - - d. KERNEL : If true, code is available only in kernel and kernel - extensions and is not available in user level header files. Only the - header files installed in following paths will have the code - - $(DSTROOT)/System/Library/Frameworks/Kernel.framework/Headers - $(DSTROOT)/System/Library/Frameworks/Kernel.framework/PrivateHeaders diff --git a/README.md b/README.md new file mode 100644 index 000000000..5a0601b8e --- /dev/null +++ b/README.md @@ -0,0 +1,357 @@ +What is XNU? +=========== + +XNU kernel is part of the Darwin operating system for use in OS X and iOS operating systems. XNU is an acronym for XNU is Not Unix. +XNU is a hybrid kernel combining the Mach kernel developed at Carnegie Mellon University with components from FreeBSD and C++ API for writing drivers called IOKit. +XNU runs on I386, X86_64 for both single processor and multi-processor configurations. 
+ +XNU Source Tree +=============== + + * `config` - configurations for exported apis for supported architecture and platform + * `SETUP` - Basic set of tools used for configuring the kernel, versioning and kextsymbol management. + * `EXTERNAL_HEADERS` - Headers sourced from other projects to avoid dependency cycles when building. These headers should be regularly synced when source is updated. + * `libkern` - C++ IOKit library code for handling of drivers and kexts. + * `libsa` - kernel bootstrap code for startup + * `libsyscall` - syscall library interface for userspace programs + * `libkdd` - source for user library for parsing kernel data like kernel chunked data. + * `makedefs` - top level rules and defines for kernel build. + * `osfmk` - Mach kernel based subsystems + * `pexpert` - Platform specific code like interrupt handling, atomics etc. + * `security` - Mandatory Access Control policy interfaces and related implementation. + * `bsd` - BSD subsystems code + * `tools` - A set of utilities for testing, debugging and profiling kernel. + +How to build XNU +================ + +Building `DEVELOPMENT` kernel +----------------------------- + +The xnu make system can build kernel based on `KERNEL_CONFIGS` & `ARCH_CONFIGS` variables as arguments. +Here is the syntax: + + make SDKROOT=<sdkroot> ARCH_CONFIGS=<arch> KERNEL_CONFIGS=<variant> + +Where: + + * \<sdkroot>: path to MacOS SDK on disk. (defaults to `/`) + * \<variant>: can be `debug`, `development`, `release`, `profile` and configures compilation flags and asserts throughout kernel code. + * \<arch> : can be valid arch to build for. (E.g. `i386` or `X86_64`) + +To build a kernel for the same architecture as running OS, just type + + $ make + $ make SDKROOT=macosx.internal + +Additionally, there is support for configuring architectures through `ARCH_CONFIGS` and kernel configurations with `KERNEL_CONFIGS`.
+ + $ make SDKROOT=macosx.internal ARCH_CONFIGS=X86_64 KERNEL_CONFIGS=DEVELOPMENT + $ make SDKROOT=macosx.internal ARCH_CONFIGS=X86_64 KERNEL_CONFIGS="RELEASE DEVELOPMENT DEBUG" + + +Note: + * By default, architecture is set to the build machine architecture, and the default kernel + config is set to build for DEVELOPMENT. + + +This will also create a bootable image, kernel.[config], and a kernel binary +with symbols, kernel.[config].unstripped. + + + * To build with RELEASE kernel configuration + + make KERNEL_CONFIGS=RELEASE SDKROOT=/path/to/SDK + + +Building FAT kernel binary +-------------------------- + +Define architectures in your environment or when running a make command. + + $ make ARCH_CONFIGS="I386 X86_64" exporthdrs all + +Other makefile options +---------------------- + + * $ make MAKEJOBS=-j8 # this will use 8 processes during the build. The default is 2x the number of active CPUS. + * $ make -j8 # the standard command-line option is also accepted + * $ make -w # trace recursive make invocations. Useful in combination with VERBOSE=YES + * $ make BUILD_LTO=0 # build without LLVM Link Time Optimization + * $ make REMOTEBUILD=user@remotehost # perform build on remote host + * $ make BUILD_JSON_COMPILATION_DATABASE=1 # Build Clang JSON Compilation Database + + + +Debug information formats +========================= + +By default, a DWARF debug information repository is created during the install phase; this is a "bundle" named kernel.development.\.dSYM +To select the older STABS debug information format (where debug information is embedded in the kernel.development.unstripped image), set the BUILD_STABS environment variable. + + $ export BUILD_STABS=1 + $ make + + +Building KernelCaches +===================== + +To test the xnu kernel, you need to build a kernelcache that links the kexts and +kernel together into a single bootable image. 
+To build a kernelcache you can use the following mechanisms: + + * Using automatic kernelcache generation with `kextd`. + The kextd daemon keeps watching for changes in the `/System/Library/Extensions` directory. + So you can set up a new kernel as + + $ cp BUILD/obj/DEVELOPMENT/X86_64/kernel.development /System/Library/Kernels/ + $ touch /System/Library/Extensions + $ ps -e | grep kextd + + * Manually invoking `kextcache` to build new kernelcache. + + $ kextcache -q -z -a x86_64 -l -n -c /var/tmp/kernelcache.test -K /var/tmp/kernel.test /System/Library/Extensions + + + +Running KernelCache on Target machine +===================================== + +The development kernel and iBoot support configuring boot arguments so that we can safely boot into a test kernel and, if things go wrong, safely fall back to the previously used kernelcache. +Following are the steps to get such a setup: + + 1. Create kernel cache using the kextcache command as `/kernelcache.test` + 2. Copy existing boot configurations to an alternate file + + $ cp /Library/Preferences/SystemConfiguration/com.apple.Boot.plist /next_boot.plist + + 3. Update the kernelcache and boot-args for your setup + + $ plutil -insert "Kernel Cache" -string "kernelcache.test" /next_boot.plist + $ plutil -replace "Kernel Flags" -string "debug=0x144 -v kernelsuffix=test " /next_boot.plist + + 4. Copy the new config to `/Library/Preferences/SystemConfiguration/` + + $ cp /next_boot.plist /Library/Preferences/SystemConfiguration/boot.plist + + 5. Bless the volume with new configs. + + $ sudo -n bless --mount / --setBoot --nextonly --options "config=boot" + + The `--nextonly` flag specifies that the `boot.plist` configs are used for only one boot. + So if the kernel panics you can easily power reboot and recover back to the original kernel.
+ + + + +Creating tags and cscope +======================== + +Set up your build environment and from the top directory, run: + + $ make tags # this will build ctags and etags on a case-sensitive volume, only ctags on case-insensitive + $ make TAGS # this will build etags + $ make cscope # this will build cscope database + + +Coding styles (Reindenting files) +================================= + +Source files can be reindented using clang-format setup in .clang-format. +XNU follows a variant of WebKit style for source code formatting. +Please refer to format styles at [WebKit website](http://www.webkit.org/coding/coding-style.html). +Further information about style options is available at [clang docs](http://clang.llvm.org/docs/ClangFormatStyleOptions.html) + + Note: clang-format binary may not be part of base installation. It can be compiled from llvm clang sources and must be reachable in $PATH. + + From the top directory, run: + + $ make reindent # reindent all source files using clang format. + + + +How to install a new header file from XNU +========================================= + +To install IOKit headers, see additional comments in [iokit/IOKit/Makefile](iokit/IOKit/Makefile). + +XNU installs header files at the following locations - + + a. $(DSTROOT)/System/Library/Frameworks/Kernel.framework/Headers + b. $(DSTROOT)/System/Library/Frameworks/Kernel.framework/PrivateHeaders + c. $(DSTROOT)/usr/include/ + d. $(DSTROOT)/System/Library/Frameworks/System.framework/PrivateHeaders + +`Kernel.framework` is used by kernel extensions.\ +The `System.framework` and `/usr/include` are used by user level applications. \ +The header files in framework's `PrivateHeaders` are only available for **Apple Internal Development**. + +The directory containing the header file should have a Makefile that +creates the list of files that should be installed at different locations. +If you are adding the first header file in a directory, you will need to +create a Makefile similar to xnu/bsd/sys/Makefile.
+ +Add your header file to the correct file list depending on where you want +to install it. The default locations where the header files are installed +from each file list are - + + a. `DATAFILES` : To make header file available in user level - + `$(DSTROOT)/usr/include` + + b. `PRIVATE_DATAFILES` : To make header file available to Apple internal in + user level - + `$(DSTROOT)/System/Library/Frameworks/System.framework/PrivateHeaders` + + c. `KERNELFILES` : To make header file available in kernel level - + `$(DSTROOT)/System/Library/Frameworks/Kernel.framework/Headers` + `$(DSTROOT)/System/Library/Frameworks/Kernel.framework/PrivateHeaders` + + d. `PRIVATE_KERNELFILES` : To make header file available to Apple internal + for kernel extensions - + `$(DSTROOT)/System/Library/Frameworks/Kernel.framework/PrivateHeaders` + +The Makefile combines the file lists mentioned above into different +install lists which are used by build system to install the header files. + +If the install list that you are interested does not exist, create it +by adding the appropriate file lists. The default install lists, its +member file lists and their default location are described below - + + a. `INSTALL_MI_LIST` : Installs header file to a location that is available to everyone in user level. + Locations - + $(DSTROOT)/usr/include + Definition - + INSTALL_MI_LIST = ${DATAFILES} + + b. `INSTALL_MI_LCL_LIST` : Installs header file to a location that is available + for Apple internal in user level. + Locations - + $(DSTROOT)/System/Library/Frameworks/System.framework/PrivateHeaders + Definition - + INSTALL_MI_LCL_LIST = ${PRIVATE_DATAFILES} + + c. `INSTALL_KF_MI_LIST` : Installs header file to location that is available + to everyone for kernel extensions. + Locations - + $(DSTROOT)/System/Library/Frameworks/Kernel.framework/Headers + Definition - + INSTALL_KF_MI_LIST = ${KERNELFILES} + + d. 
`INSTALL_KF_MI_LCL_LIST` : Installs header file to location that is + available for Apple internal for kernel extensions. + Locations - + $(DSTROOT)/System/Library/Frameworks/Kernel.framework/PrivateHeaders + Definition - + INSTALL_KF_MI_LCL_LIST = ${KERNELFILES} ${PRIVATE_KERNELFILES} + +If you want to install the header file in a sub-directory of the paths +described in (1), specify the directory name using two variables +`INSTALL_MI_DIR` and `EXPORT_MI_DIR` as follows - + + INSTALL_MI_DIR = dirname + EXPORT_MI_DIR = dirname + +A single header file can exist at different locations using the steps +mentioned above. However it might not be desirable to make all the code +in the header file available at all the locations. For example, you +want to export a function only to kernel level but not user level. + + You can use C language's pre-processor directive (#ifdef, #endif, #ifndef) + to control the text generated before a header file is installed. The kernel + only includes the code if the conditional macro is TRUE and strips out + code for FALSE conditions from the header file. + + Some pre-defined macros and their descriptions are - + + a. `PRIVATE` : If true, code is available to all of the xnu kernel and is + not available in kernel extensions and user level header files. The + header files installed in all the paths described above in (1) will not + have code enclosed within this macro. + + b. `KERNEL_PRIVATE` : Same as PRIVATE + + c. `BSD_KERNEL_PRIVATE` : If true, code is available to the xnu/bsd part of + the kernel and is not available to rest of the kernel, kernel extensions + and user level header files. The header files installed in all the + paths described above in (1) will not have code enclosed within this macro. + + d. `KERNEL` : If true, code is available only in kernel and kernel + extensions and is not available in user level header files. 
Only the + header files installed in the following paths will have the code - + + $(DSTROOT)/System/Library/Frameworks/Kernel.framework/Headers + $(DSTROOT)/System/Library/Frameworks/Kernel.framework/PrivateHeaders + + you should check [Testing the kernel][] for details. + + +How to add a new syscall +======================== + + + + +Testing the kernel +================== + +XNU kernel has multiple mechanisms for testing. + + * Assertions - The DEVELOPMENT and DEBUG kernel configs are compiled with assertions enabled. This allows developers to easily + test invariants and conditions. + + * XNU Power On Self Tests (`XNUPOST`): The XNUPOST config allows for building the kernel with a basic set of test functions + that are run before the first user space process is launched. Since XNU is a hybrid between MACH and BSD, we have two locations where + tests can be added. + + xnu/osfmk/tests/ # For testing mach based kernel structures and apis. + bsd/tests/ # For testing BSD interfaces. + Please follow the documentation at [osfmk/tests/README.md](osfmk/tests/README.md) + + * User level tests: The `tools/tests/` directory holds all the tests that verify syscalls and other features of the xnu kernel. + The make target `xnu_tests` can be used to build all the tests supported. + + $ make RC_ProjectName=xnu_tests SDKROOT=/path/to/SDK + + These tests are individual programs that can be run from Terminal and report test status by means of std posix exit codes (0 -> success) and/or stdout. + Please read detailed documentation in [tools/tests/unit_tests/README.md](tools/tests/unit_tests/README.md) + + +Kernel data descriptors +======================= + +XNU uses different data formats for passing data in its api. The most standard way is using syscall arguments. But for complex data +it often relies on sending memory saved by C structs. This packaged data transport mechanism is fragile and leads to broken interfaces +between user space programs and kernel apis.
The `libkdd` directory holds a user space library that can parse custom data provided by the +same version of kernel. The kernel chunked data format is described in detail at [libkdd/README.md](libkdd/README.md). + + +Debugging the kernel +==================== + +The xnu kernel supports debugging with a remote kernel debugging protocol (kdp). Please refer to the documentation at [technical note][TN2063]. +By default the kernel is set up to reboot on a panic. To debug a live kernel, the kdp server is set up to listen for UDP connections +over ethernet. For machines without an ethernet port, this behavior can be altered with use of kernel boot-args. Following are some +common options. + + * `debug=0x144` - sets up debug variables to start kdp debugserver on panic + * `-v` - print kernel logs on screen. By default XNU only shows grey screen with boot art. + * `kdp_match_name=en1` - Override default port selection for kdp. Supported for ethernet, thunderbolt and serial debugging. + +To debug a panic'ed kernel, use the llvm debugger (lldb) along with an unstripped, symbol-rich kernel binary. + + sh$ lldb kernel.development.unstripped + +And then you can connect to the panic'ed machine with `kdp_remote [ip addr]` or `gdb_remote [hostip : port]` commands. + +Each kernel is packaged with kernel specific debug scripts as part of the build process. For security reasons these special commands +and scripts do not get loaded automatically when lldb is connected to the machine. Please add the following setting to your `~/.lldbinit` +if you wish to always load these macros. + + settings set target.load-script-from-symbol-file true + +The `tools/lldbmacros` directory contains the source for each of these commands. Please follow the [README.md](tools/lldbmacros/README.md) +for detailed explanation of commands and their usage.
+ +[TN2118]: https://developer.apple.com/library/mac/technotes/tn2004/tn2118.html#//apple_ref/doc/uid/DTS10003352 "Kernel Core Dumps" +[TN2063]: https://developer.apple.com/library/mac/technotes/tn2063/_index.html "Understanding and Debugging Kernel Panics" +[Kernel Programming Guide]: https://developer.apple.com/library/mac/documentation/Darwin/Conceptual/KernelProgramming/build/build.html#//apple_ref/doc/uid/TP30000905-CH221-BABDGEGF diff --git a/SETUP/Makefile b/SETUP/Makefile index 6236960ac..8d8a20837 100644 --- a/SETUP/Makefile +++ b/SETUP/Makefile @@ -6,7 +6,7 @@ export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir include $(MakeInc_cmd) include $(MakeInc_def) -SETUP_SUBDIRS = \ +SETUP_SUBDIRS = \ config \ kextsymboltool \ setsegname \ diff --git a/SETUP/config/Makefile b/SETUP/config/Makefile index eb25f4571..56032b45d 100644 --- a/SETUP/config/Makefile +++ b/SETUP/config/Makefile @@ -3,7 +3,6 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - include $(MakeInc_cmd) include $(MakeInc_def) @@ -18,21 +17,21 @@ WARNFLAGS = -Wall LDFLAGS = -isysroot $(HOST_SDKROOT) -mmacosx-version-min=$(HOST_OS_VERSION) config: $(OBJS) - @echo HOST_LD $@ + @echo "$(ColorH)HOST_LD$(Color0) $(ColorF)$@$(Color0)" $(_v)$(HOST_CC) $(LDFLAGS) -o $@ $^ - @echo HOST_CODESIGN $@ + @echo "$(ColorH)HOST_CODESIGN$(Color0) $(ColorF)$@$(Color0)" $(_v)env CODESIGN_ALLOCATE=$(HOST_CODESIGN_ALLOCATE) $(HOST_CODESIGN) -s - $@ %.o: %.c - @echo HOST_CC $@ + @echo "$(ColorH)HOST_CC$(Color0) $(ColorF)$@$(Color0)" $(_v)$(HOST_CC) $(WARNFLAGS) $(CFLAGS) -c -o $@ $< parser.c: parser.y - @echo HOST_BISON $@ + @echo "$(ColorH)HOST_BISON$(Color0) $(ColorF)$@$(Color0)" $(_v)$(HOST_BISON) -y -d -d -o $@ $< lexer.yy.c: lexer.l - @echo HOST_FLEX $@ + @echo "$(ColorH)HOST_FLEX$(Color0) $(ColorF)$@$(Color0)" $(_v)env M4=$(HOST_GM4) $(HOST_FLEX) --header-file=lexer.yy.h -o $@ $< main.o mkheaders.o 
mkioconf.o mkmakefile.o lexer.yy.c: parser.c diff --git a/SETUP/config/main.c b/SETUP/config/main.c index 5dfcf79d6..f485b4e39 100644 --- a/SETUP/config/main.c +++ b/SETUP/config/main.c @@ -148,8 +148,8 @@ const char * get_word(FILE *fp) { static char line[80]; - register int ch; - register char *cp; + int ch; + char *cp; while ((ch = getc(fp)) != EOF) if (ch != ' ' && ch != '\t') @@ -184,8 +184,8 @@ char * get_rest(FILE *fp) { static char line[80]; - register int ch; - register char *cp; + int ch; + char *cp; cp = line; while ((ch = getc(fp)) != EOF) { @@ -205,7 +205,7 @@ get_rest(FILE *fp) char * path(const char *file) { - register char *cp; + char *cp; cp = malloc((unsigned)(strlen(build_directory)+ strlen(file)+ diff --git a/SETUP/config/mkioconf.c b/SETUP/config/mkioconf.c index 662166da6..9f210daa0 100644 --- a/SETUP/config/mkioconf.c +++ b/SETUP/config/mkioconf.c @@ -78,7 +78,7 @@ mkioconf(void) void pseudo_inits(FILE *fp) { - register struct device *dp; + struct device *dp; int count; fprintf(fp, "\n"); diff --git a/SETUP/config/mkmakefile.c b/SETUP/config/mkmakefile.c index cbb7d2bd8..9a8dc5c79 100644 --- a/SETUP/config/mkmakefile.c +++ b/SETUP/config/mkmakefile.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 1999 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1999-2016 Apple Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ - * + * * "Portions Copyright (c) 1999 Apple Computer, Inc. All Rights * Reserved. This file contains Original Code and/or Modifications of * Original Code as defined in and that are subject to the Apple Public @@ -10,7 +10,7 @@ * except in compliance with the License. Please obtain a copy of the * License at http://www.apple.com/publicsource and read it before using * this file. 
- * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -18,10 +18,10 @@ * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the * License for the specific language governing rights and limitations * under the License." - * + * * @APPLE_LICENSE_HEADER_END@ */ -/* +/* * Mach Operating System * Copyright (c) 1990 Carnegie-Mellon University * Copyright (c) 1989 Carnegie-Mellon University @@ -79,7 +79,7 @@ void put_source_file_name(FILE *fp, struct file_list *tp); #define next_word(fp, wd) \ - { register const char *word = get_word(fp); \ + { const char *word = get_word(fp); \ if (word == (char *)EOF) \ return; \ else \ @@ -96,7 +96,7 @@ char *allCaps(char *str); struct file_list * fl_lookup(char *file) { - register struct file_list *fp; + struct file_list *fp; for (fp = ftab ; fp != 0; fp = fp->f_next) { if (eq(fp->f_fn, file)) @@ -111,7 +111,7 @@ fl_lookup(char *file) struct file_list * fltail_lookup(char *file) { - register struct file_list *fp; + struct file_list *fp; for (fp = ftab ; fp != 0; fp = fp->f_next) { if (eq(tail(fp->f_fn), tail(file))) @@ -126,7 +126,7 @@ fltail_lookup(char *file) struct file_list * new_fent(void) { - register struct file_list *fp; + struct file_list *fp; fp = (struct file_list *) malloc(sizeof *fp); fp->f_needs = 0; @@ -152,7 +152,7 @@ get_VPATH(void) if ((vpath == NULL) && ((vpath = getenv("VPATH")) != NULL) && (*vpath != ':')) { - register char *buf = malloc((unsigned)(strlen(vpath) + 2)); + char *buf = malloc((unsigned)(strlen(vpath) + 2)); vpath = strcat(strcpy(buf, ":"), vpath); } @@ -210,7 +210,7 @@ makefile(void) if (*line == '%') goto percent; if (profiling && strncmp(line, "COPTS=", 6) == 0) { - register char *cp; + char *cp; fprintf(ofp, "GPROF.EX=$(SOURCE_DIR)/machdep/%s/gmon.ex\n", machinename); cp = index(line, '\n'); @@ -268,9 +268,9 @@ void 
read_files(void) { FILE *fp; - register struct file_list *tp, *pf; - register struct device *dp; - register struct opt *op; + struct file_list *tp, *pf; + struct device *dp; + struct opt *op; const char *wd; char *this, *needs; const char *devorprof; @@ -515,8 +515,8 @@ put_source_file_name(FILE *fp, struct file_list *tp) void do_objs(FILE *fp, const char *msg, int ext) { - register struct file_list *tp; - register int lpos, len; + struct file_list *tp; + int lpos, len; char *cp; char och; const char *sp; @@ -561,8 +561,8 @@ do_objs(FILE *fp, const char *msg, int ext) void do_files(FILE *fp, const char *msg, char ext) { - register struct file_list *tp; - register int lpos, len=0; /* dvw: init to 0 */ + struct file_list *tp; + int lpos, len=0; /* dvw: init to 0 */ fprintf(fp, "%s", msg); lpos = 8; @@ -613,7 +613,7 @@ do_machdep(FILE *ofp) const char * tail(const char *fn) { - register const char *cp; + const char *cp; cp = rindex(fn, '/'); if (cp == 0) @@ -634,7 +634,7 @@ do_rules(FILE *f) char *cp; char *np, och; const char *tp; - register struct file_list *ftp; + struct file_list *ftp; const char *extras = ""; /* dvw: init to "" */ char *source_dir; char och_upper; @@ -713,6 +713,12 @@ do_rules(FILE *f) fprintf(f, "\t${%c_RULE_2%s}%s\n", och_upper, extras, nl); fprintf(f, "\t${%c_CTFRULE_2%s}%s\n", och_upper, extras, nl); + fprintf(f, "\t${%c_RULE_3%s}%s\n", och_upper, extras, nl); + fprintf(f, "\t${%c_RULE_4A%s}", och_upper, extras); + if (ftp->f_extra) + fprintf(f, "%s", ftp->f_extra); + fprintf(f, "%s%.*s${%c_RULE_4B%s}%s\n", + source_dir, (int)(tp-np), np, och_upper, extras, nl); break; default: @@ -724,10 +730,9 @@ do_rules(FILE *f) } char * -allCaps(str) - register char *str; +allCaps(char *str) { - register char *cp = str; + char *cp = str; while (*str) { if (islower(*str)) @@ -745,7 +750,7 @@ static char makbuf[LINESIZE]; /* one line buffer for makefile */ void copy_dependencies(FILE *makin, FILE *makout) { - register int oldlen = (sizeof OLDSALUTATION - 
1); + int oldlen = (sizeof OLDSALUTATION - 1); while (fgets(makbuf, LINESIZE, makin) != NULL) { if (! strncmp(makbuf, OLDSALUTATION, oldlen)) diff --git a/SETUP/decomment/Makefile b/SETUP/decomment/Makefile index 5de5e0d57..7018eb19e 100644 --- a/SETUP/decomment/Makefile +++ b/SETUP/decomment/Makefile @@ -3,7 +3,6 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - include $(MakeInc_cmd) include $(MakeInc_def) @@ -16,13 +15,13 @@ WARNFLAGS = -Wall LDFLAGS = -isysroot $(HOST_SDKROOT) -mmacosx-version-min=$(HOST_OS_VERSION) decomment: $(OBJS) - @echo HOST_LD $@ + @echo "$(ColorH)HOST_LD$(Color0) $(ColorF)$@$(Color0)" $(_v)$(HOST_CC) $(LDFLAGS) -o $@ $^ - @echo HOST_CODESIGN $@ + @echo "$(ColorH)HOST_CODESIGN$(Color0) $(ColorF)$@$(Color0)" $(_v)env CODESIGN_ALLOCATE=$(HOST_CODESIGN_ALLOCATE) $(HOST_CODESIGN) -s - $@ %.o: %.c - @echo HOST_CC $@ + @echo "$(ColorH)HOST_CC$(Color0) $(ColorF)$@$(Color0)" $(_v)$(HOST_CC) $(WARNFLAGS) $(CFLAGS) -c -o $@ $< do_build_setup:: decomment diff --git a/SETUP/installfile/Makefile b/SETUP/installfile/Makefile index 060d923fe..eb1f3afbb 100644 --- a/SETUP/installfile/Makefile +++ b/SETUP/installfile/Makefile @@ -15,13 +15,13 @@ WARNFLAGS = -Wall LDFLAGS = -isysroot $(HOST_SDKROOT) -mmacosx-version-min=$(HOST_OS_VERSION) installfile: $(OBJS) - @echo HOST_LD $@ + @echo "$(ColorH)HOST_LD$(Color0) $(ColorF)$@$(Color0)" $(_v)$(HOST_CC) $(LDFLAGS) -o $@ $^ - @echo HOST_CODESIGN $@ + @echo "$(ColorH)HOST_CODESIGN$(Color0) $(ColorF)$@$(Color0)" $(_v)env CODESIGN_ALLOCATE=$(HOST_CODESIGN_ALLOCATE) $(HOST_CODESIGN) -s - $@ %.o: %.c - @echo HOST_CC $@ + @echo "$(ColorH)HOST_CC$(Color0) $(ColorF)$@$(Color0)" $(_v)$(HOST_CC) $(WARNFLAGS) $(CFLAGS) -c -o $@ $< do_build_setup:: installfile diff --git a/SETUP/json_compilation_db/Makefile b/SETUP/json_compilation_db/Makefile index c3634fe9e..518644cb5 100644 --- 
a/SETUP/json_compilation_db/Makefile +++ b/SETUP/json_compilation_db/Makefile @@ -15,13 +15,13 @@ WARNFLAGS = -Wall LDFLAGS = -isysroot $(HOST_SDKROOT) -mmacosx-version-min=$(HOST_OS_VERSION) json_compilation_db: $(OBJS) - @echo HOST_LD $@ + @echo "$(ColorH)HOST_LD$(Color0) $(ColorF)$@$(Color0)" $(_v)$(HOST_CC) $(LDFLAGS) -o $@ $^ - @echo HOST_CODESIGN $@ + @echo "$(ColorH)HOST_CODESIGN$(Color0) $(ColorF)$@$(Color0)" $(_v)env CODESIGN_ALLOCATE=$(HOST_CODESIGN_ALLOCATE) $(HOST_CODESIGN) -s - $@ %.o: %.c - @echo HOST_CC $@ + @echo "$(ColorH)HOST_CC$(Color0) $(ColorF)$@$(Color0)" $(_v)$(HOST_CC) $(WARNFLAGS) $(CFLAGS) -c -o $@ $< do_build_setup:: json_compilation_db diff --git a/SETUP/kextsymboltool/Makefile b/SETUP/kextsymboltool/Makefile index 4c765d828..af6cdcafd 100644 --- a/SETUP/kextsymboltool/Makefile +++ b/SETUP/kextsymboltool/Makefile @@ -3,7 +3,6 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - include $(MakeInc_cmd) include $(MakeInc_def) @@ -16,13 +15,13 @@ WARNFLAGS = -Wall LDFLAGS = -isysroot $(HOST_SDKROOT) -mmacosx-version-min=$(HOST_OS_VERSION) -lstdc++ kextsymboltool: $(OBJS) - @echo HOST_LD $@ + @echo "$(ColorH)HOST_LD$(Color0) $(ColorF)$@$(Color0)" $(_v)$(HOST_CC) $(LDFLAGS) -o $@ $^ - @echo HOST_CODESIGN $@ + @echo "$(ColorH)HOST_CODESIGN$(Color0) $(ColorF)$@$(Color0)" $(_v)env CODESIGN_ALLOCATE=$(HOST_CODESIGN_ALLOCATE) $(HOST_CODESIGN) -s - $@ %.o: %.c - @echo HOST_CC $@ + @echo "$(ColorH)HOST_CC$(Color0) $(ColorF)$@$(Color0)" $(_v)$(HOST_CC) $(WARNFLAGS) $(CFLAGS) -c -o $@ $< do_build_setup:: kextsymboltool diff --git a/SETUP/replacecontents/Makefile b/SETUP/replacecontents/Makefile index aa12e725f..e1e84844e 100644 --- a/SETUP/replacecontents/Makefile +++ b/SETUP/replacecontents/Makefile @@ -15,13 +15,13 @@ WARNFLAGS = -Wall LDFLAGS = -isysroot $(HOST_SDKROOT) -mmacosx-version-min=$(HOST_OS_VERSION) replacecontents: $(OBJS) - @echo 
HOST_LD $@ + @echo "$(ColorH)HOST_LD$(Color0) $(ColorF)$@$(Color0)" $(_v)$(HOST_CC) $(LDFLAGS) -o $@ $^ - @echo HOST_CODESIGN $@ + @echo "$(ColorH)HOST_CODESIGN$(Color0) $(ColorF)$@$(Color0)" $(_v)env CODESIGN_ALLOCATE=$(HOST_CODESIGN_ALLOCATE) $(HOST_CODESIGN) -s - $@ %.o: %.c - @echo HOST_CC $@ + @echo "$(ColorH)HOST_CC$(Color0) $(ColorF)$@$(Color0)" $(_v)$(HOST_CC) $(WARNFLAGS) $(CFLAGS) -c -o $@ $< do_build_setup:: replacecontents diff --git a/SETUP/setsegname/Makefile b/SETUP/setsegname/Makefile index ece876930..7e9224ef0 100644 --- a/SETUP/setsegname/Makefile +++ b/SETUP/setsegname/Makefile @@ -3,7 +3,6 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - include $(MakeInc_cmd) include $(MakeInc_def) @@ -16,13 +15,13 @@ WARNFLAGS = -Wall LDFLAGS = -isysroot $(HOST_SDKROOT) -mmacosx-version-min=$(HOST_OS_VERSION) setsegname: $(OBJS) - @echo HOST_LD $@ + @echo "$(ColorH)HOST_LD$(Color0) $(ColorF)$@$(Color0)" $(_v)$(HOST_CC) $(LDFLAGS) -o $@ $^ - @echo HOST_CODESIGN $@ + @echo "$(ColorH)HOST_CODESIGN$(Color0) $(ColorF)$@$(Color0)" $(_v)env CODESIGN_ALLOCATE=$(HOST_CODESIGN_ALLOCATE) $(HOST_CODESIGN) -s - $@ %.o: %.c - @echo HOST_CC $@ + @echo "$(ColorH)HOST_CC$(Color0) $(ColorF)$@$(Color0)" $(_v)$(HOST_CC) $(WARNFLAGS) $(CFLAGS) -c -o $@ $< do_build_setup:: setsegname diff --git a/bsd/Makefile b/bsd/Makefile index 99cd72176..c0cdd42fd 100644 --- a/bsd/Makefile +++ b/bsd/Makefile @@ -3,7 +3,6 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - include $(MakeInc_cmd) include $(MakeInc_def) @@ -11,7 +10,6 @@ INSTINC_SUBDIRS = \ bsm \ crypto \ dev \ - hfs \ libkern \ machine \ miscfs \ @@ -25,6 +23,7 @@ INSTINC_SUBDIRS = \ uuid \ vfs + INSTINC_SUBDIRS_X86_64 = \ i386 \ crypto @@ -34,7 +33,7 @@ INSTINC_SUBDIRS_X86_64H = \ crypto INSTINC_SUBDIRS_ARM = \ - 
arm + arm INSTINC_SUBDIRS_ARM64 = \ arm @@ -42,7 +41,6 @@ INSTINC_SUBDIRS_ARM64 = \ EXPINC_SUBDIRS = \ bsm \ dev \ - hfs \ libkern \ machine \ miscfs \ @@ -56,25 +54,26 @@ EXPINC_SUBDIRS = \ vfs \ vm + EXPINC_SUBDIRS_X86_64 = \ - i386 + i386 EXPINC_SUBDIRS_X86_64H = \ - i386 + i386 EXPINC_SUBDIRS_ARM = \ - arm + arm EXPINC_SUBDIRS_ARM64 = \ arm -COMP_SUBDIRS = \ +COMP_SUBDIRS = \ conf INSTTEXTFILES_SUBDIRS = \ dev \ - kern \ - man + man \ + sys include $(MakeInc_rule) include $(MakeInc_dir) diff --git a/bsd/bsm/Makefile b/bsd/bsm/Makefile index b2ff3b57d..e4f8475da 100644 --- a/bsd/bsm/Makefile +++ b/bsd/bsm/Makefile @@ -3,7 +3,6 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - include $(MakeInc_cmd) include $(MakeInc_def) @@ -23,5 +22,3 @@ EXPORT_MI_DIR = bsm include $(MakeInc_rule) include $(MakeInc_dir) - - diff --git a/bsd/bsm/audit_kevents.h b/bsd/bsm/audit_kevents.h index eb75536e2..fff152e65 100644 --- a/bsd/bsm/audit_kevents.h +++ b/bsd/bsm/audit_kevents.h @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2005-2010 Apple Inc. + * Copyright (c) 2005-2016 Apple Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -610,6 +610,8 @@ #define AUE_OPENBYID_WT 43207 /* Darwin. */ #define AUE_OPENBYID_RW 43208 /* Darwin. */ #define AUE_OPENBYID_RWT 43209 /* Darwin. */ +#define AUE_CLONEFILEAT 43210 /* Darwin. */ +#define AUE_FCLONEFILEAT 43211 /* Darwin. */ #define AUE_SESSION_START 44901 /* Darwin. */ #define AUE_SESSION_UPDATE 44902 /* Darwin. 
*/ @@ -753,6 +755,8 @@ #define AUE_MODWATCH AUE_NULL #define AUE_MSGCL AUE_NULL #define AUE_MSYNC AUE_NULL +#define AUE_NECP AUE_NULL +#define AUE_NETAGENT AUE_NULL #define AUE_PREADV AUE_NULL #define AUE_PROCINFO AUE_NULL #define AUE_PTHREADCANCELED AUE_NULL @@ -811,5 +815,7 @@ #define AUE_WORKQOPS AUE_NULL #define AUE_PERSONA AUE_NULL #define AUE_USRCTL AUE_NULL +#define AUE_NEXUS AUE_NULL +#define AUE_CHANNEL AUE_NULL #endif /* !_BSM_AUDIT_KEVENTS_H_ */ diff --git a/bsd/conf/Makefile b/bsd/conf/Makefile index 76db9a7d8..7bd79d9ae 100644 --- a/bsd/conf/Makefile +++ b/bsd/conf/Makefile @@ -37,7 +37,7 @@ do_all: $(TARGET)/$(CURRENT_KERNEL_CONFIG)/Makefile OBJPATH=${OBJPATH} \ build_all; -do_build_all:: do_all +do_build_all:: do_all include $(MakeInc_rule) include $(MakeInc_dir) diff --git a/bsd/conf/Makefile.template b/bsd/conf/Makefile.template index 9636b05f7..d6bb61004 100644 --- a/bsd/conf/Makefile.template +++ b/bsd/conf/Makefile.template @@ -1,14 +1,14 @@ # -# Copyright (c) 2000-2011 Apple Inc. All rights reserved. +# Copyright (c) 2000-2016 Apple Inc. All rights reserved. # # @APPLE_LICENSE_HEADER_START@ -# +# # The contents of this file constitute Original Code as defined in and # are subject to the Apple Public Source License Version 1.1 (the # "License"). You may not use this file except in compliance with the # License. Please obtain a copy of the License at # http://www.apple.com/publicsource and read it before using this file. -# +# # This Original Code and all software distributed under the License are # distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER # EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -16,7 +16,7 @@ # FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the # License for the specific language governing rights and limitations # under the License. 
-# +# # @APPLE_LICENSE_HEADER_END@ # @@ -45,7 +45,7 @@ CFLAGS+= -include meta_features.h -DDRIVER_PRIVATE \ # # Directories for mig generated files # -COMP_SUBDIRS = +COMP_SUBDIRS = # # Make sure we don't remove this by accident if interrupted at the wrong @@ -82,6 +82,8 @@ vm_unix.o_CFLAGS_ADD += -Wshorten-64-to-32 pthread_synch.o_CFLAGS_ADD += -Wno-unused-parameter -Wno-missing-prototypes pthread_support.o_CFLAGS_ADD += -Wno-unused-parameter -Wno-missing-prototypes +ip_icmp.o_CFLFAGS_ADD += -O0 + # Objects that don't want -Wsign-compare OBJS_NO_SIGN_COMPARE = \ radix.o \ @@ -150,13 +152,6 @@ $(foreach file,$(OBJS_NO_SIGN_COMPARE),$(eval $(call add_perfile_cflags,$(file), # Objects that don't want -Wcast-align warning (8474835) OBJS_NO_CAST_ALIGN = \ - BTree.o \ - BTreeAllocate.o \ - BTreeMiscOps.o \ - BTreeNodeOps.o \ - BTreeScanner.o \ - BTreeTreeOps.o \ - CatalogUtilities.o \ audit_bsm_token.o \ audit_pipe.o \ audit_session.o \ @@ -167,19 +162,6 @@ OBJS_NO_CAST_ALIGN = \ fasttrap_isa.o \ fbt_arm.o \ fbt_x86.o \ - hfs_attrlist.o \ - hfs_btreeio.o \ - hfs_catalog.o \ - hfs_cnode.o \ - hfs_endian.o \ - hfs_hotfiles.o \ - hfs_link.o \ - hfs_quota.o \ - hfs_readwrite.o \ - hfs_search.o \ - hfs_vfsops.o \ - hfs_vnops.o \ - hfs_xattr.o \ if_bond.o \ ip6_fw.o \ ip_dummynet.o \ @@ -227,7 +209,6 @@ OBJS_NO_CAST_ALIGN = \ uipc_usrreq.o \ vfs_attrlist.o \ vfs_fsevents.o \ - vfs_journal.o \ vfs_lookup.o \ vfs_syscalls.o \ vfs_utfconv.o \ @@ -264,8 +245,8 @@ $(SOBJS): .SFLAGS .SFLAGS: ALWAYS $(_v)$(REPLACECONTENTS) $@ $(S_KCC) $(SFLAGS) $(INCFLAGS) -$(COMPONENT).filelist: $(OBJS) - @echo LDFILELIST $(COMPONENT) +$(COMPONENT).filelist: $(OBJS) + @echo "$(ColorL)LDFILELIST$(Color0) $(ColorLF)$(COMPONENT)$(Color0)" $(_v)for obj in ${OBJS}; do \ echo $(TARGET)/$(CURRENT_KERNEL_CONFIG)/$${obj}; \ done > $(COMPONENT).filelist @@ -273,15 +254,15 @@ $(COMPONENT).filelist: $(OBJS) MAKESYSCALLS = $(SRCROOT)/bsd/kern/makesyscalls.sh init_sysent.c: $(SRCROOT)/bsd/kern/syscalls.master 
$(MAKESYSCALLS) - @echo "Generating $@ from $<"; + @echo "[$(CMD_MC)] $(ColorH)GENERATING$(Color0) $(ColorLF)$@$(Color0) from $(ColorF)$<$(Color0)"; $(_v)$(MAKESYSCALLS) $< table > /dev/null syscalls.c: $(SRCROOT)/bsd/kern/syscalls.master $(MAKESYSCALLS) - @echo "Generating $@ from $<"; + @echo "[$(CMD_MC)] $(ColorH)GENERATING$(Color0) $(ColorLF)$@$(Color0) from $(ColorF)$<$(Color0)"; $(_v)$(MAKESYSCALLS) $< names > /dev/null audit_kevents.c: $(SRCROOT)/bsd/kern/syscalls.master $(MAKESYSCALLS) - @echo "Generating $@ from $<"; + @echo "[$(CMD_MC)] $(ColorH)GENERATING$(Color0) $(ColorLF)$@$(Color0) from $(ColorF)$<$(Color0)"; $(_v)$(MAKESYSCALLS) $< audit > /dev/null do_all: $(COMPONENT).filelist diff --git a/bsd/conf/Makefile.x86_64 b/bsd/conf/Makefile.x86_64 index 30072bf7b..c397a6e9b 100644 --- a/bsd/conf/Makefile.x86_64 +++ b/bsd/conf/Makefile.x86_64 @@ -1,7 +1,7 @@ ###################################################################### #BEGIN Machine dependent Makefile fragment for x86_64 ###################################################################### - + # Files to build with certain warnings turned off dis_tables.o_CFLAGS_ADD += -Wno-cast-qual fbt_x86.o_CFLAGS_ADD += -Wno-cast-qual @@ -14,4 +14,3 @@ dtrace.o_CFLAGS_ADD += $(CFLAGS_NOLTO_FLAG) ###################################################################### #END Machine dependent Makefile fragment for x86_64 ###################################################################### - diff --git a/bsd/conf/files b/bsd/conf/files index 190d2964b..fcf0d4890 100644 --- a/bsd/conf/files +++ b/bsd/conf/files @@ -62,11 +62,7 @@ OPTIONS/bond optional bond OPTIONS/bpfilter optional bpfilter OPTIONS/multipath optional multipath OPTIONS/mptcp optional mptcp -OPTIONS/ipdivert optional ipdivert OPTIONS/dummynet optional dummynet -OPTIONS/ipfw2 optional ipfw2 -OPTIONS/ipfirewall optional ipfirewall -OPTIONS/ipv6firewall optional ipv6firewall OPTIONS/tcpdebug optional tcpdebug OPTIONS/if_bridge optional if_bridge 
OPTIONS/bridgestp optional bridgestp if_bridge @@ -84,17 +80,16 @@ OPTIONS/pktsched_hfsc optional pktsched_hfsc OPTIONS/pktsched_priq optional pktsched_priq OPTIONS/zlib optional zlib + # # Filesystem options # -OPTIONS/hfs optional hfs OPTIONS/fdesc optional fdesc OPTIONS/fifo optional fifo OPTIONS/devfs optional devfs OPTIONS/routefs optional routefs OPTIONS/crypto optional crypto -OPTIONS/journaling optional journaling -OPTIONS/hfs_compression optional hfs_compression +OPTIONS/fs_compression optional fs_compression OPTIONS/config_imageboot optional config_imageboot @@ -105,8 +100,6 @@ bsd/nfs/nfs_bio.c optional nfsclient bsd/nfs/nfs_boot.c optional nfsclient bsd/nfs/nfs_gss.c optional nfsclient bsd/nfs/nfs_gss.c optional nfsserver -bsd/nfs/nfs_gss_crypto.c optional nfsclient -bsd/nfs/nfs_gss_crypto.c optional nfsserver bsd/nfs/nfs_lock.c optional nfsclient bsd/nfs/nfs_node.c optional nfsclient bsd/nfs/nfs_serv.c optional nfsserver @@ -120,7 +113,10 @@ bsd/nfs/nfs_syscalls.c optional nfsserver bsd/nfs/nfs_vfsops.c optional nfsclient bsd/nfs/nfs_vnops.c optional nfsclient bsd/nfs/nfs_upcall.c optional nfsserver - +bsd/nfs/gss/gss_krb5_mech.c optional nfsclient +bsd/nfs/gss/gss_krb5_mech.c optional nfsserver +bsd/nfs/gss/ccrypto.c optional nfsclient +bsd/nfs/gss/ccrypto.c optional nfsserver bsd/kern/netboot.c optional nfsclient bsd/dev/dtrace/dtrace.c optional config_dtrace @@ -147,6 +143,7 @@ bsd/dev/unix_startup.c standard bsd/dev/vn/vn.c optional vndevice bsd/dev/vn/shadow.c optional vndevice +bsd/libkern/crc16.c standard bsd/libkern/crc32.c standard bsd/libkern/random.c standard bsd/libkern/scanc.c standard @@ -172,8 +169,9 @@ bsd/vfs/vfs_vnops.c standard bsd/vfs/vfs_xattr.c standard bsd/vfs/vnode_if.c standard bsd/vfs/kpi_vfs.c standard -bsd/vfs/vfs_journal.c standard bsd/vfs/vfs_fsevents.c standard +bsd/vfs/vfs_cprotect.c standard +bsd/vfs/doc_tombstone.c standard bsd/miscfs/deadfs/dead_vnops.c standard bsd/miscfs/devfs/devfs_fdesc_support.c optional fdesc @@ 
-225,11 +223,9 @@ bsd/net/kpi_protocol.c optional networking bsd/net/kpi_interfacefilter.c optional networking bsd/net/net_str_id.c optional networking bsd/net/if_utun.c optional networking -bsd/net/if_utun_crypto.c optional networking -bsd/net/if_utun_crypto_dtls.c optional networking -bsd/net/if_utun_crypto_ipsec.c optional networking bsd/net/if_ipsec.c optional ipsec bsd/net/necp.c optional necp +bsd/net/necp_client.c optional necp bsd/net/network_agent.c optional networking bsd/net/if_pflog.c optional pflog bsd/net/pf.c optional pf @@ -254,6 +250,7 @@ bsd/net/classq/classq_rio.c optional classq_rio bsd/net/classq/classq_sfb.c optional networking bsd/net/classq/classq_subr.c optional networking bsd/net/classq/classq_util.c optional networking +bsd/net/classq/classq_fq_codel.c optional networking bsd/net/pktsched/pktsched.c optional networking bsd/net/pktsched/pktsched_cbq.c optional pktsched_cbq @@ -263,6 +260,7 @@ bsd/net/pktsched/pktsched_priq.c optional pktsched_priq bsd/net/pktsched/pktsched_qfq.c optional networking bsd/net/pktsched/pktsched_rmclass.c optional pktsched_cbq bsd/net/pktsched/pktsched_tcq.c optional networking +bsd/net/pktsched/pktsched_fq_codel.c optional networking bsd/net/altq/altq_cbq.c optional pktsched_cbq pf_altq bsd/net/altq/altq_fairq.c optional pktsched_fairq pf_altq @@ -281,10 +279,7 @@ bsd/netinet/in_pcblist.c optional inet bsd/netinet/in_proto.c optional inet bsd/netinet/in_rmx.c optional inet bsd/netinet/in_tclass.c optional inet -bsd/netinet/ip_divert.c optional ipdivert bsd/netinet/ip_dummynet.c optional dummynet -bsd/netinet/ip_fw2.c optional ipfw2 -bsd/netinet/ip_fw2_compat.c optional ipfw2 bsd/netinet/ip_icmp.c optional inet bsd/netinet/ip_id.c optional inet bsd/netinet/ip_input.c optional inet @@ -332,7 +327,6 @@ bsd/netinet6/in6.c optional inet6 bsd/netinet6/in6_cga.c optional inet6 ipv6send bsd/netinet6/in6_cksum.c optional inet6 bsd/netinet6/in6_gif.c optional gif inet6 -bsd/netinet6/ip6_fw.c optional inet6 ipfw2 
bsd/netinet6/ip6_forward.c optional inet6 bsd/netinet6/in6_ifattach.c optional inet6 bsd/netinet6/ip6_input.c optional inet6 @@ -376,46 +370,6 @@ bsd/crypto/rc4/rc4.c optional crypto #bsd/netpm/pm_route.c optional pm #bsd/netpm/pm_usrreq.c optional pm -#Some hfs files are standard due to exported KPI if HFS is not enabled -bsd/hfs/hfs_attrlist.c optional hfs -bsd/hfs/hfs_btreeio.c optional hfs -bsd/hfs/hfs_catalog.c optional hfs -bsd/hfs/hfs_chash.c optional hfs -bsd/hfs/hfs_cnode.c optional hfs -bsd/hfs/hfs_encodinghint.c standard -bsd/hfs/hfs_encodings.c standard -bsd/hfs/hfs_endian.c optional hfs -bsd/hfs/hfs_fsinfo.c optional hfs -bsd/hfs/hfs_hotfiles.c optional hfs -bsd/hfs/hfs_link.c optional hfs -bsd/hfs/hfs_lookup.c optional hfs -bsd/hfs/hfs_notification.c optional hfs -bsd/hfs/hfs_quota.c optional quota -bsd/hfs/hfs_readwrite.c optional hfs -bsd/hfs/hfs_resize.c optional hfs -bsd/hfs/hfs_search.c optional hfs -bsd/hfs/hfs_vfsops.c optional hfs -bsd/hfs/hfs_vfsutils.c optional hfs -bsd/hfs/hfs_vnops.c optional hfs -bsd/hfs/hfs_xattr.c optional hfs -bsd/hfs/MacOSStubs.c optional hfs -bsd/hfs/hfs_extents.c optional hfs -bsd/hfs/hfs_cprotect.c standard -bsd/hfs/rangelist.c optional hfs -bsd/hfs/hfscommon/BTree/BTree.c optional hfs -bsd/hfs/hfscommon/BTree/BTreeAllocate.c optional hfs -bsd/hfs/hfscommon/BTree/BTreeMiscOps.c optional hfs -bsd/hfs/hfscommon/BTree/BTreeNodeOps.c optional hfs -bsd/hfs/hfscommon/BTree/BTreeNodeReserve.c optional hfs -bsd/hfs/hfscommon/BTree/BTreeScanner.c optional hfs -bsd/hfs/hfscommon/BTree/BTreeTreeOps.c optional hfs -bsd/hfs/hfscommon/Catalog/CatalogUtilities.c optional hfs -bsd/hfs/hfscommon/Catalog/FileIDsServices.c optional hfs -bsd/hfs/hfscommon/Misc/BTreeWrapper.c optional hfs -bsd/hfs/hfscommon/Misc/FileExtentMapping.c optional hfs -bsd/hfs/hfscommon/Misc/VolumeAllocation.c optional hfs -bsd/hfs/hfscommon/Unicode/UnicodeWrappers.c standard - bsd/security/audit/audit.c optional config_audit bsd/security/audit/audit_arg.c 
optional config_audit bsd/security/audit/audit_bsd.c optional config_audit @@ -438,9 +392,10 @@ bsd/kern/bsd_init.c standard bsd/kern/kdebug.c standard bsd/kern/kern_acct.c standard bsd/kern/kern_aio.c standard -bsd/kern/kern_authorization.c standard +bsd/kern/kern_authorization.c standard +bsd/kern/kern_backtrace.c standard bsd/kern/kern_clock.c standard -bsd/kern/kern_core.c standard +bsd/kern/kern_core.c optional config_coredump bsd/kern/kern_credential.c standard bsd/kern/kern_cs.c standard bsd/kern/kern_csr.c optional config_csr @@ -451,6 +406,7 @@ bsd/kern/kern_event.c standard bsd/kern/kern_control.c optional networking bsd/kern/kern_exec.c standard bsd/kern/kern_exit.c standard +bsd/kern/kern_ktrace.c standard bsd/kern/kern_lockf.c standard bsd/kern/kern_fork.c standard bsd/kern/kern_asl.c standard @@ -477,6 +433,7 @@ bsd/kern/kern_xxx.c standard bsd/kern/mach_process.c standard bsd/kern/mcache.c optional sockets bsd/kern/spl.c standard +bsd/kern/stackshot.c standard bsd/kern/subr_log.c standard bsd/kern/subr_prf.c standard bsd/kern/subr_prof.c standard @@ -488,6 +445,7 @@ bsd/kern/sys_socket.c optional sockets bsd/kern/sys_domain.c optional sockets bsd/kern/sys_coalition.c optional config_coalitions bsd/kern/sys_persona.c optional config_personas +bsd/kern/sys_ulock.c standard bsd/kern/sys_work_interval.c standard ./syscalls.c standard bsd/kern/tty.c standard @@ -523,8 +481,8 @@ bsd/kern/pthread_shims.c standard bsd/kern/proc_info.c standard bsd/kern/process_policy.c standard bsd/kern/kern_overrides.c standard -bsd/kern/vm_pressure.c optional vm_pressure_events bsd/kern/socket_info.c optional sockets +bsd/kern/sys_reason.c standard bsd/vm/vnode_pager.c standard bsd/vm/vm_unix.c standard @@ -538,7 +496,6 @@ bsd/uxkern/ux_exception.c standard bsd/conf/param.c standard ./ioconf.c standard -bsd/dev/chud/chud_bsd_callback.c standard bsd/dev/chud/chud_process.c standard bsd/kern/imageboot.c optional config_imageboot @@ -550,3 +507,8 @@ 
bsd/kern/proc_uuid_policy.c optional config_proc_uuid_policy bsd/pgo/profile_runtime.c standard +bsd/miscfs/nullfs/null_subr.c optional nullfs +bsd/miscfs/nullfs/null_vfsops.c optional nullfs +bsd/miscfs/nullfs/null_vnops.c optional nullfs + + diff --git a/bsd/conf/param.c b/bsd/conf/param.c index f9feaa2c0..00da0c590 100644 --- a/bsd/conf/param.c +++ b/bsd/conf/param.c @@ -85,6 +85,7 @@ struct timezone tz = { 0, 0 }; #define NPROC (20 + 16 * 32) #define NPROC_PER_UID (NPROC/2) +/* NOTE: maxproc and hard_maxproc values are subject to device specific scaling in bsd_scale_setup */ #define HNPROC 2500 /* based on thread_max */ int maxproc = NPROC; int maxprocperuid = NPROC_PER_UID; diff --git a/bsd/crypto/Makefile b/bsd/crypto/Makefile index 8a0a0f3bb..08f2ff7e5 100644 --- a/bsd/crypto/Makefile +++ b/bsd/crypto/Makefile @@ -3,7 +3,6 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - include $(MakeInc_cmd) include $(MakeInc_def) @@ -22,12 +21,10 @@ EXPORT_MI_DIR = ${INSTALL_MI_DIR} INSTALL_KF_MI_LCL_LIST = ${PRIVATE_DATAFILES} -# We use this to install aesxts.h in Kernel.framework/PrivateHeaders +# We use this to install aesxts.h in Kernel.framework/PrivateHeaders # in addition to Kernel.framework/PrivateHeaders/crypto # This should be removed once all clients are switched to include libkern/crypto/aesxts.h INSTALL_KF_MD_LCL_LIST = aesxts.h include $(MakeInc_rule) include $(MakeInc_dir) - - diff --git a/bsd/crypto/rc4/Makefile b/bsd/crypto/rc4/Makefile index dc49732bc..49f4d8129 100644 --- a/bsd/crypto/rc4/Makefile +++ b/bsd/crypto/rc4/Makefile @@ -3,7 +3,6 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - include $(MakeInc_cmd) include $(MakeInc_def) @@ -18,5 +17,3 @@ INSTALL_KF_MI_LCL_LIST = ${PRIVATE_DATAFILES} include $(MakeInc_rule) include 
$(MakeInc_dir) - - diff --git a/bsd/dev/Makefile b/bsd/dev/Makefile index 0dc6f85b8..8f67f466b 100644 --- a/bsd/dev/Makefile +++ b/bsd/dev/Makefile @@ -3,7 +3,6 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - include $(MakeInc_cmd) include $(MakeInc_def) @@ -13,5 +12,3 @@ INSTTEXTFILES_SUBDIRS = dtrace include $(MakeInc_rule) include $(MakeInc_dir) - - diff --git a/bsd/dev/chud/chud_bsd_callback.c b/bsd/dev/chud/chud_bsd_callback.c deleted file mode 100644 index 6519b06a5..000000000 --- a/bsd/dev/chud/chud_bsd_callback.c +++ /dev/null @@ -1,274 +0,0 @@ -/* - * Copyright (c) 2003-2006 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include -#include -#include - -#include -#include /* u_int */ -#include /* proc_t */ -#include /* struct sysent */ -#include -#include /* KDEBUG_ENABLE_CHUD */ -#include /* kauth_cred_get */ -#include -#if CONFIG_MACF -#include /* mac_system_check_chud */ -#endif - -#pragma mark **** kern debug **** -typedef void (*chudxnu_kdebug_callback_func_t)(uint32_t debugid, uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3, uintptr_t arg4); -static void chud_null_kdebug(uint32_t debugid, uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3, uintptr_t arg4); -static chudxnu_kdebug_callback_func_t kdebug_callback_fn = chud_null_kdebug; - -kern_return_t chudxnu_kdebug_callback_enter(chudxnu_kdebug_callback_func_t); -kern_return_t chudxnu_kdebug_callback_cancel(void); - -extern void kdbg_control_chud(int val, void *fn); -extern void kperf_kdebug_callback(uint32_t debugid); - -static void chud_null_kdebug(uint32_t debugid __unused, uintptr_t arg0 __unused, - uintptr_t arg1 __unused, uintptr_t arg2 __unused, uintptr_t arg3 __unused, - uintptr_t arg4 __unused) { - return; -} - -static void -chudxnu_private_kdebug_callback( - uint32_t debugid, - uintptr_t arg0, - uintptr_t arg1, - uintptr_t arg2, - uintptr_t arg3, - uintptr_t arg4) -{ - chudxnu_kdebug_callback_func_t fn = kdebug_callback_fn; - -#if KPERF - /* call out to kperf first */ - kperf_kdebug_callback(debugid); -#endif - - if(fn) { - (fn)(debugid, arg0, arg1, arg2, arg3, arg4); - } -} - -__private_extern__ kern_return_t -chudxnu_kdebug_callback_enter(chudxnu_kdebug_callback_func_t func) -{ - /* Atomically set the callback. 
*/ - if(OSCompareAndSwapPtr(chud_null_kdebug, func, - (void * volatile *)&kdebug_callback_fn)) { - - kdbg_control_chud(TRUE, (void *)chudxnu_private_kdebug_callback); - return KERN_SUCCESS; - } - return KERN_FAILURE; -} - -__private_extern__ kern_return_t -chudxnu_kdebug_callback_cancel(void) -{ - kdbg_control_chud(FALSE, NULL); - - chudxnu_kdebug_callback_func_t old = kdebug_callback_fn; - - while(!OSCompareAndSwapPtr(old, chud_null_kdebug, - (void * volatile *)&kdebug_callback_fn)) { - old = kdebug_callback_fn; - } - - return KERN_SUCCESS; -} - -#pragma mark **** CHUD syscall **** -typedef kern_return_t (*chudxnu_syscall_callback_func_t)(uint64_t code, uint64_t arg0, uint64_t arg1, uint64_t arg2, uint64_t arg3, uint64_t arg4); - -static kern_return_t chud_null_syscall(uint64_t code, uint64_t arg0, uint64_t arg1, uint64_t arg2, uint64_t arg3, uint64_t arg4); -static chudxnu_syscall_callback_func_t syscall_callback_fn = chud_null_syscall; - -kern_return_t chudxnu_syscall_callback_enter(chudxnu_syscall_callback_func_t func); -kern_return_t chudxnu_syscall_callback_cancel(void); - -static kern_return_t chud_null_syscall(uint64_t code __unused, - uint64_t arg0 __unused, uint64_t arg1 __unused, uint64_t arg2 __unused, - uint64_t arg3 __unused, uint64_t arg4 __unused) { - return (kern_return_t)EINVAL; -} - -/* - * chud - * - * Performs performance-related tasks. A private interface registers a handler for this - * system call. The implementation is in the CHUDProf kernel extension. - * - * chud() is a callback style system call used by the CHUD Tools suite of performance tools. If the CHUD - * kexts are not loaded, this system call will always return EINVAL. The CHUD kexts contain the - * implementation of the system call. 
- * - * The current behavior of the chud() system call is as follows: - * - * Parameters: p (ignored) - * uap User argument descriptor (see below) - * retval return value of fn (the function returned by syscall_callback_fn) - * - * Indirect parameters: uap->code Selects the operation to do. This is broken down into a - * 16-bit facility and a 16-bit action. - * - * The rest of the indirect parameters depend on the facility and the action that is selected: - * - * Facility: 1 Amber instruction tracer - * Action: 1 Indicate that a new thread has been created. No arguments are used. - * - * Action: 2 Indicate that a thread is about to exit. No arguments are used. - * - * Facility: 2 Not Supported for this system call - * - * Facility: 3 CHUD Trace facility - * Action: 1 Record a backtrace of the calling process into the CHUD Trace facility sample - * buffer. - * - * uap->arg1 Number of frames to skip - * uap->arg2 Pointer to a uint64_t containing a timestamp for the - * beginning of the sample. NULL uses the current time. - * uap->arg3 Pointer to a uint64_t containing a timestamp for the end - * of the sample. NULL uses the current time. - * uap->arg4 Pointer to auxiliary data to be recorded with the sample - * uap->arg5 Size of the auxiliary data pointed to by arg4. 
- * - * Returns: EINVAL If syscall_callback_fn returns an invalid function - * KERN_SUCCESS Success - * KERN_FAILURE Generic failure - * KERN_NO_SPACE Auxiliary data is too large (only used by Facility: 3) - * - * Implicit returns: retval return value of fn (the function returned by syscall_callback_fn) - */ -int -chud(__unused proc_t p, struct chud_args *uap, int32_t *retval) -{ -#if CONFIG_MACF - int error = mac_system_check_chud(kauth_cred_get()); - if (error) - return error; -#endif - - chudxnu_syscall_callback_func_t fn = syscall_callback_fn; - - if(!fn) { - return EINVAL; - } - - *retval = fn(uap->code, uap->arg1, uap->arg2, uap->arg3, uap->arg4, uap->arg5); - - return 0; -} - -__private_extern__ kern_return_t -chudxnu_syscall_callback_enter(chudxnu_syscall_callback_func_t func) -{ - if(OSCompareAndSwapPtr(chud_null_syscall, func, - (void * volatile *)&syscall_callback_fn)) { - return KERN_SUCCESS; - } - return KERN_FAILURE; -} - -__private_extern__ kern_return_t -chudxnu_syscall_callback_cancel(void) -{ - chudxnu_syscall_callback_func_t old = syscall_callback_fn; - - while(!OSCompareAndSwapPtr(old, chud_null_syscall, - (void * volatile *)&syscall_callback_fn)) { - old = syscall_callback_fn; - } - - return KERN_SUCCESS; -} - -/* DTrace callback */ -typedef kern_return_t (*chudxnu_dtrace_callback_t)(uint64_t selector, - uint64_t *args, uint32_t count); -int chudxnu_dtrace_callback(uint64_t selector, uint64_t *args, uint32_t count); -kern_return_t chudxnu_dtrace_callback_enter(chudxnu_dtrace_callback_t fn); -void chudxnu_dtrace_callback_cancel(void); - -int -chud_null_dtrace(uint64_t selector, uint64_t *args, uint32_t count); - -static chudxnu_dtrace_callback_t - dtrace_callback = (chudxnu_dtrace_callback_t) chud_null_dtrace; - -int -chud_null_dtrace(uint64_t selector __unused, uint64_t *args __unused, - uint32_t count __unused) { - return ENXIO; -} - -int -chudxnu_dtrace_callback(uint64_t selector, uint64_t *args, uint32_t count) -{ - /* If no callback is 
hooked up, let's return ENXIO */ - int ret = ENXIO; - - /* Make a local stack copy of the function ptr */ - chudxnu_dtrace_callback_t fn = dtrace_callback; - - if(fn) { - ret = fn(selector, args, count); - } - - return ret; -} - -__private_extern__ kern_return_t -chudxnu_dtrace_callback_enter(chudxnu_dtrace_callback_t fn) -{ - /* Atomically enter the call back */ - if(!OSCompareAndSwapPtr(chud_null_dtrace, fn, - (void * volatile *) &dtrace_callback)) { - return KERN_FAILURE; - } - - return KERN_SUCCESS; -} - -__private_extern__ void -chudxnu_dtrace_callback_cancel(void) -{ - chudxnu_dtrace_callback_t old_fn = dtrace_callback; - - /* Atomically clear the call back */ - while(!OSCompareAndSwapPtr(old_fn, chud_null_dtrace, - (void * volatile *) &dtrace_callback)) { - old_fn = dtrace_callback; - } -} - diff --git a/bsd/dev/chud/chud_process.c b/bsd/dev/chud/chud_process.c index cc82b9890..f71987299 100644 --- a/bsd/dev/chud/chud_process.c +++ b/bsd/dev/chud/chud_process.c @@ -32,55 +32,5 @@ #include #include -int chudxnu_pid_for_task(task_t task); -task_t chudxnu_task_for_pid(int pid); int chudxnu_current_pid(void); -__private_extern__ int -chudxnu_pid_for_task(task_t task) -{ - proc_t p; - int pid = -1; - - if(task!=TASK_NULL) { - p = (proc_t)(get_bsdtask_info(task)); - if(p) { - return (proc_pid(p)); - } - } - return pid; -} - -__private_extern__ task_t -chudxnu_task_for_pid(int pid) -{ - task_t t = TASK_NULL; - proc_t p = proc_find(pid); - if(p) { - t = p->task; - proc_rele(p); - } - return (t); -} - -__private_extern__ int -chudxnu_current_pid(void) -{ - int pid = -1; - struct uthread *ut = get_bsdthread_info(current_thread()); - task_t t = current_task(); - - if(t != TASK_NULL) { - pid = chudxnu_pid_for_task(t); - } - if(-1 == pid) { - // no task, so try looking in the uthread and/or proc - pid = proc_pid(current_proc()); - - if(-1 == pid && ut && ut->uu_proc) { - pid = proc_pid(ut->uu_proc); - } - } - - return pid; -} diff --git a/bsd/dev/dtrace/Makefile 
b/bsd/dev/dtrace/Makefile index b16c0ed6c..a44fc6864 100644 --- a/bsd/dev/dtrace/Makefile +++ b/bsd/dev/dtrace/Makefile @@ -3,7 +3,6 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - include $(MakeInc_cmd) include $(MakeInc_def) @@ -11,5 +10,3 @@ INSTTEXTFILES_SUBDIRS = scripts include $(MakeInc_rule) include $(MakeInc_dir) - - diff --git a/bsd/dev/dtrace/dtrace.c b/bsd/dev/dtrace/dtrace.c index 4a2e5e23a..c90a465a9 100644 --- a/bsd/dev/dtrace/dtrace.c +++ b/bsd/dev/dtrace/dtrace.c @@ -20,7 +20,7 @@ */ /* - * Portions Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Portions Copyright (c) 2013, 2016, Joyent, Inc. All rights reserved. * Portions Copyright (c) 2013 by Delphix. All rights reserved. */ @@ -61,6 +61,7 @@ * - Enabling functions * - DOF functions * - Anonymous enabling functions + * - Process functions * - Consumer state functions * - Helper functions * - Hook functions @@ -93,8 +94,11 @@ #include #include #include +#include #include #include +#include +#include #include extern uint32_t pmap_find_phys(void *, uint64_t); @@ -112,19 +116,14 @@ extern void dtrace_resume(void); extern void dtrace_init(void); extern void helper_init(void); extern void fasttrap_init(void); -extern void dtrace_lazy_dofs_duplicate(proc_t *, proc_t *); + +static int dtrace_lazy_dofs_duplicate(proc_t *, proc_t *); extern void dtrace_lazy_dofs_destroy(proc_t *); extern void dtrace_postinit(void); -#include "../../../osfmk/chud/chud_dtrace.h" - -extern kern_return_t chudxnu_dtrace_callback - (uint64_t selector, uint64_t *args, uint32_t count); - -/* Import this function to retrieve the physical memory. 
*/ -extern int kernel_sysctlbyname(const char *name, void *oldp, - size_t *oldlenp, void *newp, size_t newlen); - +extern void dtrace_proc_fork(proc_t*, proc_t*, int); +extern void dtrace_proc_exec(proc_t*); +extern void dtrace_proc_exit(proc_t*); /* * DTrace Tunable Variables * @@ -155,6 +154,8 @@ dtrace_optval_t dtrace_helper_actions_max = 32; dtrace_optval_t dtrace_helper_providers_max = 64; dtrace_optval_t dtrace_dstate_defsize = (1 * 1024 * 1024); size_t dtrace_strsize_default = 256; +dtrace_optval_t dtrace_strsize_min = 8; +dtrace_optval_t dtrace_strsize_max = 65536; dtrace_optval_t dtrace_cleanrate_default = 990099000; /* 1.1 hz */ dtrace_optval_t dtrace_cleanrate_min = 20000000; /* 50 hz */ dtrace_optval_t dtrace_cleanrate_max = (uint64_t)60 * NANOSEC; /* 1/minute */ @@ -168,6 +169,9 @@ dtrace_optval_t dtrace_stackframes_default = 20; dtrace_optval_t dtrace_ustackframes_default = 20; dtrace_optval_t dtrace_jstackframes_default = 50; dtrace_optval_t dtrace_jstackstrsize_default = 512; +dtrace_optval_t dtrace_buflimit_default = 75; +dtrace_optval_t dtrace_buflimit_min = 1; +dtrace_optval_t dtrace_buflimit_max = 99; int dtrace_msgdsize_max = 128; hrtime_t dtrace_chill_max = 500 * (NANOSEC / MILLISEC); /* 500 ms */ hrtime_t dtrace_chill_interval = NANOSEC; /* 1000 ms */ @@ -194,7 +198,6 @@ unsigned int dtrace_max_cpus = 0; /* number of enabled cpus */ */ static dev_info_t *dtrace_devi; /* device info */ static vmem_t *dtrace_arena; /* probe ID arena */ -static vmem_t *dtrace_minor; /* minor number arena */ static taskq_t *dtrace_taskq; /* task queue */ static dtrace_probe_t **dtrace_probes; /* array of all probes */ static int dtrace_nprobes; /* number of probes */ @@ -202,7 +205,6 @@ static dtrace_provider_t *dtrace_provider; /* provider list */ static dtrace_meta_t *dtrace_meta_pid; /* user-land meta provider */ static int dtrace_opens; /* number of opens */ static int dtrace_helpers; /* number of helpers */ -static void *dtrace_softstate; /* softstate 
pointer */ static dtrace_hash_t *dtrace_bymod; /* probes hashed by module */ static dtrace_hash_t *dtrace_byfunc; /* probes hashed by function */ static dtrace_hash_t *dtrace_byname; /* probes hashed by name */ @@ -227,6 +229,7 @@ static int dtrace_dof_mode; /* See dtrace_impl.h for a description of Darwin's * fbt_provide and sdt_provide. Its clearly not a dtrace tunable variable either... */ int dtrace_kernel_symbol_mode; /* See dtrace_impl.h for a description of Darwin's kernel symbol modes. */ +static uint32_t dtrace_wake_clients; /* @@ -437,6 +440,14 @@ static lck_mtx_t dtrace_errlock; return (0); \ } +#define DTRACE_RANGE_REMAIN(remp, addr, baseaddr, basesz) \ +do { \ + if ((remp) != NULL) { \ + *(remp) = (uintptr_t)(baseaddr) + (basesz) - (addr); \ + } \ +} while (0) + + /* * Test whether a range of memory starting at testaddr of size testsz falls * within the range of memory described by addr, sz. We take care to avoid @@ -461,7 +472,7 @@ static lck_mtx_t dtrace_errlock; #define RECOVER_LABEL(bits) dtraceLoadRecover##bits: -#if defined (__x86_64__) +#if defined (__x86_64__) || (defined (__arm__) || defined (__arm64__)) #define DTRACE_LOADFUNC(bits) \ /*CSTYLED*/ \ uint##bits##_t dtrace_load##bits(uintptr_t addr); \ @@ -504,6 +515,12 @@ dtrace_load##bits(uintptr_t addr) \ */ \ if (pmap_valid_page(pmap_find_phys(kernel_pmap, addr))) \ rval = *((volatile uint##bits##_t *)addr); \ + else { \ + *flags |= CPU_DTRACE_BADADDR; \ + cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr; \ + return (0); \ + } \ + \ RECOVER_LABEL(bits); \ (void)dtrace_set_thread_recover(current_thread(), recover); \ *flags &= ~CPU_DTRACE_NOFAULT; \ @@ -551,7 +568,8 @@ dtrace_load##bits(uintptr_t addr) \ static size_t dtrace_strlen(const char *, size_t); static dtrace_probe_t *dtrace_probe_lookup_id(dtrace_id_t id); static void dtrace_enabling_provide(dtrace_provider_t *); -static int dtrace_enabling_match(dtrace_enabling_t *, int *); +static int dtrace_enabling_match(dtrace_enabling_t *, int 
*, dtrace_match_cond_t *cond); +static void dtrace_enabling_matchall_with_cond(dtrace_match_cond_t *cond); static void dtrace_enabling_matchall(void); static dtrace_state_t *dtrace_anon_grab(void); static uint64_t dtrace_helper(int, dtrace_mstate_t *, @@ -564,6 +582,10 @@ static int dtrace_state_option(dtrace_state_t *, dtrace_optid_t, dtrace_optval_t); static int dtrace_ecb_create_enable(dtrace_probe_t *, void *); static void dtrace_helper_provider_destroy(dtrace_helper_provider_t *); +static int dtrace_canload_remains(uint64_t, size_t, size_t *, + dtrace_mstate_t *, dtrace_vstate_t *); +static int dtrace_canstore_remains(uint64_t, size_t, size_t *, + dtrace_mstate_t *, dtrace_vstate_t *); /* @@ -880,15 +902,15 @@ dtrace_inscratch(uintptr_t dest, size_t size, dtrace_mstate_t *mstate) } static int -dtrace_canstore_statvar(uint64_t addr, size_t sz, +dtrace_canstore_statvar(uint64_t addr, size_t sz, size_t *remain, dtrace_statvar_t **svars, int nsvars) { int i; size_t maxglobalsize, maxlocalsize; - maxglobalsize = dtrace_statvar_maxsize; - maxlocalsize = (maxglobalsize + sizeof (uint64_t)) * NCPU; + maxglobalsize = dtrace_statvar_maxsize + sizeof (uint64_t); + maxlocalsize = (maxglobalsize) * NCPU; if (nsvars == 0) return (0); @@ -909,11 +931,14 @@ dtrace_canstore_statvar(uint64_t addr, size_t sz, * DTrace to escalate an orthogonal kernel heap corruption bug * into the ability to store to arbitrary locations in memory. 
*/ - VERIFY((scope == DIFV_SCOPE_GLOBAL && size < maxglobalsize) || - (scope == DIFV_SCOPE_LOCAL && size < maxlocalsize)); + VERIFY((scope == DIFV_SCOPE_GLOBAL && size <= maxglobalsize) || + (scope == DIFV_SCOPE_LOCAL && size <= maxlocalsize)); - if (DTRACE_INRANGE(addr, sz, svar->dtsv_data, svar->dtsv_size)) + if (DTRACE_INRANGE(addr, sz, svar->dtsv_data, svar->dtsv_size)) { + DTRACE_RANGE_REMAIN(remain, addr, svar->dtsv_data, + svar->dtsv_size); return (1); + } } return (0); @@ -928,14 +953,26 @@ dtrace_canstore_statvar(uint64_t addr, size_t sz, static int dtrace_canstore(uint64_t addr, size_t sz, dtrace_mstate_t *mstate, dtrace_vstate_t *vstate) +{ + return (dtrace_canstore_remains(addr, sz, NULL, mstate, vstate)); +} +/* + * Implementation of dtrace_canstore which communicates the upper bound of the + * allowed memory region. + */ +static int +dtrace_canstore_remains(uint64_t addr, size_t sz, size_t *remain, + dtrace_mstate_t *mstate, dtrace_vstate_t *vstate) { /* * First, check to see if the address is in scratch space... */ if (DTRACE_INRANGE(addr, sz, mstate->dtms_scratch_base, - mstate->dtms_scratch_size)) + mstate->dtms_scratch_size)) { + DTRACE_RANGE_REMAIN(remain, addr, mstate->dtms_scratch_base, + mstate->dtms_scratch_size); return (1); - + } /* * Now check to see if it's a dynamic variable. 
This check will pick * up both thread-local variables and any global dynamically-allocated @@ -947,6 +984,7 @@ dtrace_canstore(uint64_t addr, size_t sz, dtrace_mstate_t *mstate, uintptr_t base = (uintptr_t)dstate->dtds_base + (dstate->dtds_hashsize * sizeof (dtrace_dynhash_t)); uintptr_t chunkoffs; + dtrace_dynvar_t *dvar; /* * Before we assume that we can store here, we need to make @@ -963,6 +1001,8 @@ dtrace_canstore(uint64_t addr, size_t sz, dtrace_mstate_t *mstate, * * (3) Not span a chunk boundary * + * (4) Not be in the tuple space of a dynamic variable + * */ if (addr < base) return (0); @@ -975,6 +1015,15 @@ dtrace_canstore(uint64_t addr, size_t sz, dtrace_mstate_t *mstate, if (chunkoffs + sz > dstate->dtds_chunksize) return (0); + dvar = (dtrace_dynvar_t *)((uintptr_t)addr - chunkoffs); + + if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE) + return (0); + + if (chunkoffs < sizeof (dtrace_dynvar_t) + + ((dvar->dtdv_tuple.dtt_nkeys - 1) * sizeof (dtrace_key_t))) + return (0); + return (1); } @@ -982,11 +1031,11 @@ dtrace_canstore(uint64_t addr, size_t sz, dtrace_mstate_t *mstate, * Finally, check the static local and global variables. These checks * take the longest, so we perform them last. */ - if (dtrace_canstore_statvar(addr, sz, + if (dtrace_canstore_statvar(addr, sz, remain, vstate->dtvs_locals, vstate->dtvs_nlocals)) return (1); - if (dtrace_canstore_statvar(addr, sz, + if (dtrace_canstore_statvar(addr, sz, remain, vstate->dtvs_globals, vstate->dtvs_nglobals)) return (1); @@ -1006,6 +1055,17 @@ dtrace_canstore(uint64_t addr, size_t sz, dtrace_mstate_t *mstate, static int dtrace_canload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate, dtrace_vstate_t *vstate) +{ + return (dtrace_canload_remains(addr, sz, NULL, mstate, vstate)); +} + +/* + * Implementation of dtrace_canload which communicates the upper bound of the + * allowed memory region. 
+ */ +static int +dtrace_canload_remains(uint64_t addr, size_t sz, size_t *remain, + dtrace_mstate_t *mstate, dtrace_vstate_t *vstate) { volatile uint64_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval; @@ -1013,21 +1073,27 @@ dtrace_canload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate, * If we hold the privilege to read from kernel memory, then * everything is readable. */ - if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) + if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) { + DTRACE_RANGE_REMAIN(remain, addr, addr, sz); return (1); + } /* * You can obviously read that which you can store. */ - if (dtrace_canstore(addr, sz, mstate, vstate)) + if (dtrace_canstore_remains(addr, sz, remain, mstate, vstate)) return (1); /* * We're allowed to read from our own string table. */ if (DTRACE_INRANGE(addr, sz, (uintptr_t)mstate->dtms_difo->dtdo_strtab, - mstate->dtms_difo->dtdo_strlen)) + mstate->dtms_difo->dtdo_strlen)) { + DTRACE_RANGE_REMAIN(remain, addr, + mstate->dtms_difo->dtdo_strtab, + mstate->dtms_difo->dtdo_strlen); return (1); + } DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV); *illval = addr; @@ -1041,21 +1107,41 @@ dtrace_canload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate, * calls in the event that the user has all privileges. */ static int -dtrace_strcanload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate, - dtrace_vstate_t *vstate) +dtrace_strcanload(uint64_t addr, size_t sz, size_t *remain, + dtrace_mstate_t *mstate, dtrace_vstate_t *vstate) { - size_t strsz; + size_t rsize; /* * If we hold the privilege to read from kernel memory, then * everything is readable. 
*/ - if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) + if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) { + DTRACE_RANGE_REMAIN(remain, addr, addr, sz); return (1); + } - strsz = 1 + dtrace_strlen((char *)(uintptr_t)addr, sz); - if (dtrace_canload(addr, strsz, mstate, vstate)) - return (1); + /* + * Even if the caller is uninterested in querying the remaining valid + * range, it is required to ensure that the access is allowed. + */ + if (remain == NULL) { + remain = &rsize; + } + if (dtrace_canload_remains(addr, 0, remain, mstate, vstate)) { + size_t strsz; + /* + * Perform the strlen after determining the length of the + * memory region which is accessible. This prevents timing + * information from being used to find NULs in memory which is + * not accessible to the caller. + */ + strsz = 1 + dtrace_strlen((char *)(uintptr_t)addr, + MIN(sz, *remain)); + if (strsz <= *remain) { + return (1); + } + } return (0); } @@ -1065,26 +1151,49 @@ dtrace_strcanload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate, * region in which a load may be issued given the user's privilege level. */ static int -dtrace_vcanload(void *src, dtrace_diftype_t *type, dtrace_mstate_t *mstate, - dtrace_vstate_t *vstate) +dtrace_vcanload(void *src, dtrace_diftype_t *type, size_t *remain, + dtrace_mstate_t *mstate, dtrace_vstate_t *vstate) { size_t sz; ASSERT(type->dtdt_flags & DIF_TF_BYREF); + /* + * Calculate the max size before performing any checks since even + * DTRACE_ACCESS_KERNEL-credentialed callers expect that this function + * return the max length via 'remain'. + */ + if (type->dtdt_kind == DIF_TYPE_STRING) { + dtrace_state_t *state = vstate->dtvs_state; + + if (state != NULL) { + sz = state->dts_options[DTRACEOPT_STRSIZE]; + } else { + /* + * In helper context, we have a NULL state; fall back + * to using the system-wide default for the string size + * in this case. 
+ */ + sz = dtrace_strsize_default; + } + } else { + sz = type->dtdt_size; + } + /* * If we hold the privilege to read from kernel memory, then * everything is readable. */ - if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) + if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) { + DTRACE_RANGE_REMAIN(remain, (uintptr_t)src, src, sz); return (1); + } - if (type->dtdt_kind == DIF_TYPE_STRING) - sz = dtrace_strlen(src, - vstate->dtvs_state->dts_options[DTRACEOPT_STRSIZE]) + 1; - else - sz = type->dtdt_size; - - return (dtrace_canload((uintptr_t)src, sz, mstate, vstate)); + if (type->dtdt_kind == DIF_TYPE_STRING) { + return (dtrace_strcanload((uintptr_t)src, sz, remain, mstate, + vstate)); + } + return (dtrace_canload_remains((uintptr_t)src, sz, remain, mstate, + vstate)); } /* @@ -1222,15 +1331,15 @@ dtrace_strcpy(const void *src, void *dst, size_t len) * specified type; we assume that we can store to directly. */ static void -dtrace_vcopy(void *src, void *dst, dtrace_diftype_t *type) +dtrace_vcopy(void *src, void *dst, dtrace_diftype_t *type, size_t limit) { ASSERT(type->dtdt_flags & DIF_TF_BYREF); if (type->dtdt_kind == DIF_TYPE_STRING) { - dtrace_strcpy(src, dst, type->dtdt_size); + dtrace_strcpy(src, dst, MIN(type->dtdt_size, limit)); } else { - dtrace_bcopy(src, dst, type->dtdt_size); -} + dtrace_bcopy(src, dst, MIN(type->dtdt_size, limit)); + } } /* @@ -1481,7 +1590,7 @@ dtrace_priv_proc(dtrace_state_t *state) if (ISSET(current_proc()->p_lflag, P_LNOATTACH)) goto bad; - if (dtrace_is_restricted() && !dtrace_is_running_apple_internal() && !dtrace_can_attach_to_proc(current_proc())) + if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed() && !dtrace_can_attach_to_proc(current_proc())) goto bad; if (state->dts_cred.dcr_action & DTRACE_CRA_PROC) @@ -1513,7 +1622,7 @@ dtrace_priv_proc_relaxed(dtrace_state_t *state) static int dtrace_priv_kernel(dtrace_state_t *state) { - if (dtrace_is_restricted() && !dtrace_is_running_apple_internal()) + if 
(dtrace_is_restricted() && !dtrace_are_restrictions_relaxed()) goto bad; if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL) @@ -3593,6 +3702,14 @@ dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs, tupregs[subr == DIF_SUBR_ALLOCA ? 0 : 1].dttk_value; size_t scratch_size = (dest - mstate->dtms_scratch_ptr) + size; + /* + * Check whether the user can access kernel memory + */ + if (dtrace_priv_kernel(state) == 0) { + DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV); + regs[rd] = 0; + break; + } /* * This action doesn't require any credential checks since * probes will not activate in user contexts to which the @@ -3735,30 +3852,30 @@ dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs, uintptr_t kaddr = tupregs[0].dttk_value; user_addr_t uaddr = tupregs[1].dttk_value; uint64_t size = tupregs[2].dttk_value; + size_t lim; if (!dtrace_destructive_disallow && dtrace_priv_proc_control(state) && !dtrace_istoxic(kaddr, size) && - dtrace_strcanload(kaddr, size, mstate, vstate)) { + dtrace_strcanload(kaddr, size, &lim, mstate, vstate)) { DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); - dtrace_copyoutstr(kaddr, uaddr, size, flags); + dtrace_copyoutstr(kaddr, uaddr, lim, flags); DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); } break; } case DIF_SUBR_STRLEN: { - size_t sz; + size_t size = state->dts_options[DTRACEOPT_STRSIZE]; uintptr_t addr = (uintptr_t)tupregs[0].dttk_value; - sz = dtrace_strlen((char *)addr, - state->dts_options[DTRACEOPT_STRSIZE]); + size_t lim; - if (!dtrace_canload(addr, sz + 1, mstate, vstate)) { + if (!dtrace_strcanload(addr, size, &lim, mstate, vstate)) { regs[rd] = 0; break; } - regs[rd] = sz; + regs[rd] = dtrace_strlen((char *)addr, lim); break; } @@ -3772,12 +3889,19 @@ dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs, * is DIF_SUBR_STRRCHR, we will look for the last occurrence * of the specified character instead of the first. 
*/ - uintptr_t saddr = tupregs[0].dttk_value; uintptr_t addr = tupregs[0].dttk_value; - uintptr_t limit = addr + state->dts_options[DTRACEOPT_STRSIZE]; + uintptr_t addr_limit; + uint64_t size = state->dts_options[DTRACEOPT_STRSIZE]; + size_t lim; char c, target = (char)tupregs[1].dttk_value; - for (regs[rd] = 0; addr < limit; addr++) { + if (!dtrace_strcanload(addr, size, &lim, mstate, vstate)) { + regs[rd] = NULL; + break; + } + addr_limit = addr + lim; + + for (regs[rd] = 0; addr < addr_limit; addr++) { if ((c = dtrace_load8(addr)) == target) { regs[rd] = addr; @@ -3789,11 +3913,6 @@ dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs, break; } - if (!dtrace_canload(saddr, addr - saddr, mstate, vstate)) { - regs[rd] = 0; - break; - } - break; } @@ -3951,7 +4070,8 @@ dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs, uintptr_t addr = tupregs[0].dttk_value; uintptr_t tokaddr = tupregs[1].dttk_value; uint64_t size = state->dts_options[DTRACEOPT_STRSIZE]; - uintptr_t limit, toklimit = tokaddr + size; + uintptr_t limit, toklimit; + size_t clim; char *dest = (char *)mstate->dtms_scratch_ptr; uint8_t c='\0', tokmap[32]; /* 256 / 8 */ uint64_t i = 0; @@ -3960,10 +4080,11 @@ dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs, * Check both the token buffer and (later) the input buffer, * since both could be non-scratch addresses. */ - if (!dtrace_strcanload(tokaddr, size, mstate, vstate)) { + if (!dtrace_strcanload(tokaddr, size, &clim, mstate, vstate)) { regs[rd] = 0; break; } + toklimit = tokaddr + clim; if (!DTRACE_INSCRATCH(mstate, size)) { DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH); @@ -3980,6 +4101,7 @@ dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs, * it behaves like an implicit clause-local variable. 
*/ addr = mstate->dtms_strtok; + limit = mstate->dtms_strtok_limit; } else { /* * If the user-specified address is non-NULL we must @@ -3989,10 +4111,12 @@ dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs, * (when we fetch addr from mstate->dtms_strtok) * would fail this access check. */ - if (!dtrace_strcanload(addr, size, mstate, vstate)) { + if (!dtrace_strcanload(addr, size, &clim, mstate, + vstate)) { regs[rd] = 0; break; } + limit = addr + clim; } /* @@ -4011,10 +4135,10 @@ dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs, tokmap[c >> 3] |= (1 << (c & 0x7)); } - for (limit = addr + size; addr < limit; addr++) { + for (; addr < limit; addr++) { /* - * We're looking for a character that is _not_ contained - * in the token string. + * We're looking for a character that is _not_ + * contained in the token string. */ if ((c = dtrace_load8(addr)) == '\0') break; @@ -4032,6 +4156,7 @@ dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs, */ regs[rd] = 0; mstate->dtms_strtok = 0; + mstate->dtms_strtok_limit = NULL; break; } @@ -4054,6 +4179,7 @@ dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs, regs[rd] = (uintptr_t)dest; mstate->dtms_scratch_ptr += size; mstate->dtms_strtok = addr; + mstate->dtms_strtok_limit = limit; break; } @@ -4129,10 +4255,12 @@ dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs, uint64_t size = state->dts_options[DTRACEOPT_STRSIZE]; uintptr_t s1 = tupregs[0].dttk_value; uintptr_t s2 = tupregs[1].dttk_value; - uint64_t i = 0; + uint64_t i = 0, j = 0; + size_t lim1, lim2; + char c; - if (!dtrace_strcanload(s1, size, mstate, vstate) || - !dtrace_strcanload(s2, size, mstate, vstate)) { + if (!dtrace_strcanload(s1, size, &lim1, mstate, vstate) || + !dtrace_strcanload(s2, size, &lim2, mstate, vstate)) { regs[rd] = 0; break; } @@ -4149,8 +4277,8 @@ dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs, regs[rd] = 0; break; } - - if ((d[i++] = dtrace_load8(s1++)) == '\0') { + c = (i >= lim1) ? 
'\0' : dtrace_load8(s1++); + if ((d[i++] = c) == '\0') { i--; break; } @@ -4162,8 +4290,8 @@ dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs, regs[rd] = 0; break; } - - if ((d[i++] = dtrace_load8(s2++)) == '\0') + c = (j++ >= lim2) ? '\0' : dtrace_load8(s2++); + if ((d[i++] = c) == '\0') break; } @@ -4366,9 +4494,10 @@ dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs, char *dest = (char *)mstate->dtms_scratch_ptr, c; uint64_t size = state->dts_options[DTRACEOPT_STRSIZE]; uintptr_t src = tupregs[0].dttk_value; - int i = 0, j = 0; + size_t lim; + size_t i = 0, j = 0; - if (!dtrace_strcanload(src, size, mstate, vstate)) { + if (!dtrace_strcanload(src, size, &lim, mstate, vstate)) { regs[rd] = 0; break; } @@ -4383,7 +4512,7 @@ dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs, * Move forward, loading each character. */ do { - c = dtrace_load8(src + i++); + c = (i >= lim) ? '\0' : dtrace_load8(src + i++); next: if ((uint64_t)(j + 5) >= size) /* 5 = strlen("/..c\0") */ break; @@ -4393,7 +4522,7 @@ dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs, continue; } - c = dtrace_load8(src + i++); + c = (i >= lim) ? '\0' : dtrace_load8(src + i++); if (c == '/') { /* @@ -4414,7 +4543,7 @@ dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs, continue; } - c = dtrace_load8(src + i++); + c = (i >= lim) ? '\0' : dtrace_load8(src + i++); if (c == '/') { /* @@ -4437,7 +4566,7 @@ dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs, continue; } - c = dtrace_load8(src + i++); + c = (i >= lim) ? 
'\0' : dtrace_load8(src + i++); if (c != '/' && c != '\0') { /* @@ -4499,6 +4628,12 @@ dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs, #if !defined(__APPLE__) ip4 = dtrace_load32(tupregs[argi].dttk_value); #else + if (!dtrace_canload(tupregs[argi].dttk_value, sizeof(ip4), + mstate, vstate)) { + regs[rd] = 0; + break; + } + dtrace_bcopy( (void *)(uintptr_t)tupregs[argi].dttk_value, (void *)(uintptr_t)&ip4, sizeof (ip4)); @@ -4559,6 +4694,12 @@ dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs, * just the IPv4 string is returned for inet_ntoa6. */ + if (!dtrace_canload(tupregs[argi].dttk_value, + sizeof(struct in6_addr), mstate, vstate)) { + regs[rd] = 0; + break; + } + /* * Safely load the IPv6 address. */ @@ -4736,6 +4877,7 @@ inetout: regs[rd] = (uintptr_t)end + 1; break; } +#if defined(__APPLE__) case DIF_SUBR_VM_KERNEL_ADDRPERM: { if (!dtrace_priv_kernel(state)) { regs[rd] = 0; @@ -4745,38 +4887,60 @@ inetout: regs[rd] = (uintptr_t)end + 1; break; } -/* - * APPLE NOTE: - * CoreProfile callback ('core_profile (uint64_t, [uint64_t], [uint64_t] ...)') - */ - case DIF_SUBR_COREPROFILE: { - uint64_t selector = tupregs[0].dttk_value; - uint64_t args[DIF_DTR_NREGS-1] = {0ULL}; - uint32_t ii; - uint32_t count = (uint32_t)nargs; - - if (count < 1) { - regs[rd] = KERN_FAILURE; - break; + + case DIF_SUBR_KDEBUG_TRACE: { + uint32_t debugid; + uintptr_t args[4] = {0}; + int i; + + if (nargs < 2 || nargs > 5) { + DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP); + break; } - - if(count > DIF_DTR_NREGS) - count = DIF_DTR_NREGS; - /* copy in any variadic argument list, bounded by DIF_DTR_NREGS */ - for(ii = 0; ii < count-1; ii++) { - args[ii] = tupregs[ii+1].dttk_value; + if (dtrace_destructive_disallow) + return; + + debugid = tupregs[0].dttk_value; + for (i = 0; i < nargs - 1; i++) + args[i] = tupregs[i + 1].dttk_value; + + kernel_debug(debugid, args[0], args[1], args[2], args[3], 0); + + break; + } + + case DIF_SUBR_KDEBUG_TRACE_STRING: { + if (nargs != 3) { + break; } - 
kern_return_t ret = - chudxnu_dtrace_callback(selector, args, count-1); - if(KERN_SUCCESS != ret) { - /* error */ + if (dtrace_destructive_disallow) + return; + + uint64_t size = state->dts_options[DTRACEOPT_STRSIZE]; + uint32_t debugid = tupregs[0].dttk_value; + uint64_t str_id = tupregs[1].dttk_value; + uintptr_t src = tupregs[2].dttk_value; + size_t lim; + char buf[size]; + char* str = NULL; + + if (src != (uintptr_t)0) { + str = buf; + if (!dtrace_strcanload(src, size, &lim, mstate, vstate)) { + break; + } + dtrace_strcpy((void*)src, buf, size); } - regs[rd] = ret; + (void)kernel_debug_string(debugid, &str_id, str); + regs[rd] = str_id; + break; } +#endif + } } @@ -5072,15 +5236,17 @@ dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate, size_t sz = state->dts_options[DTRACEOPT_STRSIZE]; uintptr_t s1 = regs[r1]; uintptr_t s2 = regs[r2]; + size_t lim1 = sz, lim2 = sz; if (s1 != 0 && - !dtrace_strcanload(s1, sz, mstate, vstate)) + !dtrace_strcanload(s1, sz, &lim1, mstate, vstate)) break; if (s2 != 0 && - !dtrace_strcanload(s2, sz, mstate, vstate)) + !dtrace_strcanload(s2, sz, &lim2, mstate, vstate)) break; - cc_r = dtrace_strncmp((char *)s1, (char *)s2, sz); + cc_r = dtrace_strncmp((char *)s1, (char *)s2, + MIN(lim1, lim2)); cc_n = cc_r < 0; cc_z = cc_r == 0; @@ -5132,12 +5298,14 @@ dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate, ASSERT(id >= DIF_VAR_OTHER_UBASE); id -= DIF_VAR_OTHER_UBASE; + VERIFY(id < (uint_t)vstate->dtvs_nglobals); svar = vstate->dtvs_globals[id]; ASSERT(svar != NULL); v = &svar->dtsv_var; if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) { uintptr_t a = (uintptr_t)svar->dtsv_data; + size_t lim; ASSERT(a != 0); ASSERT(svar->dtsv_size != 0); @@ -5151,11 +5319,11 @@ dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate, } if (!dtrace_vcanload( (void *)(uintptr_t)regs[rd], &v->dtdv_type, - mstate, vstate)) + &lim, mstate, vstate)) break; dtrace_vcopy((void *)(uintptr_t)regs[rd], - (void *)a, &v->dtdv_type); + 
(void *)a, &v->dtdv_type, lim); break; } @@ -5222,7 +5390,7 @@ dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate, ASSERT(id >= DIF_VAR_OTHER_UBASE); id -= DIF_VAR_OTHER_UBASE; - ASSERT(id < (uint_t)vstate->dtvs_nlocals); + VERIFY(id < (uint_t)vstate->dtvs_nlocals); ASSERT(vstate->dtvs_locals != NULL); svar = vstate->dtvs_locals[id]; ASSERT(svar != NULL); @@ -5231,6 +5399,7 @@ dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate, if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) { uintptr_t a = (uintptr_t)svar->dtsv_data; size_t sz = v->dtdv_type.dtdt_size; + size_t lim; sz += sizeof (uint64_t); ASSERT(svar->dtsv_size == (int)NCPU * sz); @@ -5246,11 +5415,11 @@ dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate, if (!dtrace_vcanload( (void *)(uintptr_t)regs[rd], &v->dtdv_type, - mstate, vstate)) + &lim, mstate, vstate)) break; dtrace_vcopy((void *)(uintptr_t)regs[rd], - (void *)a, &v->dtdv_type); + (void *)a, &v->dtdv_type, lim); break; } @@ -5299,6 +5468,7 @@ dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate, id = DIF_INSTR_VAR(instr); ASSERT(id >= DIF_VAR_OTHER_UBASE); id -= DIF_VAR_OTHER_UBASE; + VERIFY(id < (uint_t)vstate->dtvs_ntlocals); key = &tupregs[DIF_DTR_NREGS]; key[0].dttk_value = (uint64_t)id; @@ -5323,13 +5493,15 @@ dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate, break; if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) { + size_t lim; + if (!dtrace_vcanload( (void *)(uintptr_t)regs[rd], - &v->dtdv_type, mstate, vstate)) + &v->dtdv_type, &lim, mstate, vstate)) break; dtrace_vcopy((void *)(uintptr_t)regs[rd], - dvar->dtdv_data, &v->dtdv_type); + dvar->dtdv_data, &v->dtdv_type, lim); } else { *((uint64_t *)dvar->dtdv_data) = regs[rd]; } @@ -5411,8 +5583,10 @@ dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate, if (DIF_INSTR_OP(instr) == DIF_OP_LDTAA) { DTRACE_TLS_THRKEY(key[nkeys].dttk_value); key[nkeys++].dttk_size = 0; + VERIFY(id < (uint_t)vstate->dtvs_ntlocals); v = 
&vstate->dtvs_tlocals[id]; } else { + VERIFY(id < (uint_t)vstate->dtvs_nglobals); v = &vstate->dtvs_globals[id]->dtsv_var; } @@ -5451,8 +5625,10 @@ dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate, if (DIF_INSTR_OP(instr) == DIF_OP_STTAA) { DTRACE_TLS_THRKEY(key[nkeys].dttk_value); key[nkeys++].dttk_size = 0; + VERIFY(id < (uint_t)vstate->dtvs_ntlocals); v = &vstate->dtvs_tlocals[id]; } else { + VERIFY(id < (uint_t)vstate->dtvs_nglobals); v = &vstate->dtvs_globals[id]->dtsv_var; } @@ -5466,13 +5642,15 @@ dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate, break; if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) { + size_t lim; + if (!dtrace_vcanload( (void *)(uintptr_t)regs[rd], &v->dtdv_type, - mstate, vstate)) + &lim, mstate, vstate)) break; dtrace_vcopy((void *)(uintptr_t)regs[rd], - dvar->dtdv_data, &v->dtdv_type); + dvar->dtdv_data, &v->dtdv_type, lim); } else { *((uint64_t *)dvar->dtdv_data) = regs[rd]; } @@ -6156,6 +6334,7 @@ __dtrace_probe(dtrace_id_t id, uint64_t arg0, uint64_t arg1, * not the case. */ if ((ecb->dte_cond & DTRACE_COND_USERMODE) && + prov->dtpv_pops.dtps_usermode && prov->dtpv_pops.dtps_usermode(prov->dtpv_arg, probe->dtpr_id, probe->dtpr_arg) == 0) continue; @@ -6439,7 +6618,7 @@ __dtrace_probe(dtrace_id_t id, uint64_t arg0, uint64_t arg1, tomax = buf->dtb_tomax; ASSERT(tomax != NULL); - if (ecb->dte_size != 0) + if (ecb->dte_size == 0) continue; ASSERT(ecb->dte_size >= sizeof(dtrace_rechdr_t)); @@ -6574,7 +6753,7 @@ __dtrace_probe(dtrace_id_t id, uint64_t arg0, uint64_t arg1, if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF && !dtrace_vcanload((void *)(uintptr_t)val, - &dp->dtdo_rtype, &mstate, vstate)) + &dp->dtdo_rtype, NULL, &mstate, vstate)) { continue; } @@ -6999,10 +7178,12 @@ dtrace_cred2priv(cred_t *cr, uint32_t *privp, uid_t *uidp, zoneid_t *zoneidp) uint32_t priv; if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) { - /* - * For DTRACE_PRIV_ALL, the uid and zoneid don't matter. 
- */ - priv = DTRACE_PRIV_ALL; + if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed()) { + priv = DTRACE_PRIV_USER | DTRACE_PRIV_PROC; + } + else { + priv = DTRACE_PRIV_ALL; + } } else { *uidp = crgetuid(cr); *zoneidp = crgetzoneid(cr); @@ -7433,6 +7614,17 @@ dtrace_probekey(const dtrace_probedesc_t *pdp, dtrace_probekey_t *pkp) pkp->dtpk_fmatch = &dtrace_match_nonzero; } +static int +dtrace_cond_provider_match(dtrace_probedesc_t *desc, void *data) +{ + if (desc == NULL) + return 1; + + dtrace_probekey_f *func = dtrace_probekey_func(desc->dtpd_provider); + + return func(desc->dtpd_provider, (char*)data, 0); +} + /* * DTrace Provider-to-Framework API Functions * @@ -7569,13 +7761,16 @@ dtrace_register(const char *name, const dtrace_pattr_t *pap, uint32_t priv, dtrace_enabling_provide(provider); /* - * Now we need to call dtrace_enabling_matchall() -- which - * will acquire cpu_lock and dtrace_lock. We therefore need + * Now we need to call dtrace_enabling_matchall_with_cond() -- + * with a condition matching the provider name we just added, + * which will acquire cpu_lock and dtrace_lock. We therefore need * to drop all of our locks before calling into it... 
*/ lck_mtx_unlock(&dtrace_lock); lck_mtx_unlock(&dtrace_provider_lock); - dtrace_enabling_matchall(); + + dtrace_match_cond_t cond = {dtrace_cond_provider_match, provider->dtpv_name}; + dtrace_enabling_matchall_with_cond(&cond); return (0); } @@ -8235,6 +8430,17 @@ dtrace_helper_provide_one(dof_helper_t *dhp, dof_sec_t *sec, pid_t pid) mops->dtms_create_probe(meta->dtm_arg, parg, &dhpb); } + + /* + * Since we just created probes, we need to match our enablings + * against those, with a precondition knowing that we have only + * added probes from this provider + */ + char *prov_name = mops->dtms_provider_name(parg); + ASSERT(prov_name != NULL); + dtrace_match_cond_t cond = {dtrace_cond_provider_match, (void*)prov_name}; + + dtrace_enabling_matchall_with_cond(&cond); } static void @@ -8255,15 +8461,6 @@ dtrace_helper_provide(dof_helper_t *dhp, pid_t pid) dtrace_helper_provide_one(dhp, sec, pid); } - - /* - * We may have just created probes, so we must now rematch against - * any retained enablings. Note that this call will acquire both - * cpu_lock and dtrace_lock; the fact that we are holding - * dtrace_meta_lock now is what defines the ordering with respect to - * these three locks. - */ - dtrace_enabling_matchall(); } static void @@ -8480,6 +8677,7 @@ dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs, int (*efunc)(uint_t pc, const char *, ...) 
= dtrace_difo_err; int kcheckload; uint_t pc; + int maxglobal = -1, maxlocal = -1, maxtlocal = -1; kcheckload = cr == NULL || (vstate->dtvs_state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) == 0; @@ -8700,7 +8898,8 @@ dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs, err += efunc(pc, "invalid register %u\n", rd); break; case DIF_OP_CALL: - if (subr > DIF_SUBR_MAX) + if (subr > DIF_SUBR_MAX && + !(subr >= DIF_SUBR_APPLE_MIN && subr <= DIF_SUBR_APPLE_MAX)) err += efunc(pc, "invalid subr %u\n", subr); if (rd >= nregs) err += efunc(pc, "invalid register %u\n", rd); @@ -8708,7 +8907,9 @@ dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs, err += efunc(pc, "cannot write to %r0\n"); if (subr == DIF_SUBR_COPYOUT || - subr == DIF_SUBR_COPYOUTSTR) { + subr == DIF_SUBR_COPYOUTSTR || + subr == DIF_SUBR_KDEBUG_TRACE || + subr == DIF_SUBR_KDEBUG_TRACE_STRING) { dp->dtdo_destructive = 1; } break; @@ -8796,6 +8997,9 @@ dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs, switch (v->dtdv_scope) { case DIFV_SCOPE_GLOBAL: + if (maxglobal == -1 || ndx > maxglobal) + maxglobal = ndx; + if (ndx < vstate->dtvs_nglobals) { dtrace_statvar_t *svar; @@ -8806,11 +9010,16 @@ dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs, break; case DIFV_SCOPE_THREAD: + if (maxtlocal == -1 || ndx > maxtlocal) + maxtlocal = ndx; + if (ndx < vstate->dtvs_ntlocals) existing = &vstate->dtvs_tlocals[ndx]; break; case DIFV_SCOPE_LOCAL: + if (maxlocal == -1 || ndx > maxlocal) + maxlocal = ndx; if (ndx < vstate->dtvs_nlocals) { dtrace_statvar_t *svar; @@ -8859,6 +9068,37 @@ dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs, } } + for (pc = 0; pc < dp->dtdo_len && err == 0; pc++) { + dif_instr_t instr = dp->dtdo_buf[pc]; + + uint_t v = DIF_INSTR_VAR(instr); + uint_t op = DIF_INSTR_OP(instr); + + switch (op) { + case DIF_OP_LDGS: + case DIF_OP_LDGAA: + case DIF_OP_STGS: + case 
DIF_OP_STGAA: + if (v > (uint_t)(DIF_VAR_OTHER_UBASE + maxglobal)) + err += efunc(pc, "invalid variable %u\n", v); + break; + case DIF_OP_LDTS: + case DIF_OP_LDTAA: + case DIF_OP_STTS: + case DIF_OP_STTAA: + if (v > (uint_t)(DIF_VAR_OTHER_UBASE + maxtlocal)) + err += efunc(pc, "invalid variable %u\n", v); + break; + case DIF_OP_LDLS: + case DIF_OP_STLS: + if (v > (uint_t)(DIF_VAR_OTHER_UBASE + maxlocal)) + err += efunc(pc, "invalid variable %u\n", v); + break; + default: + break; + } + } + return (err); } @@ -8997,7 +9237,8 @@ dtrace_difo_validate_helper(dtrace_difo_t *dp) subr == DIF_SUBR_STRJOIN || subr == DIF_SUBR_STRRCHR || subr == DIF_SUBR_STRSTR || - subr == DIF_SUBR_COREPROFILE || + subr == DIF_SUBR_KDEBUG_TRACE || + subr == DIF_SUBR_KDEBUG_TRACE_STRING || subr == DIF_SUBR_HTONS || subr == DIF_SUBR_HTONL || subr == DIF_SUBR_HTONLL || @@ -9816,7 +10057,7 @@ dtrace_ecb_enable(dtrace_ecb_t *ecb) } } -static void +static int dtrace_ecb_resize(dtrace_ecb_t *ecb) { dtrace_action_t *act; @@ -9846,9 +10087,10 @@ dtrace_ecb_resize(dtrace_ecb_t *ecb) ASSERT(curneeded != UINT32_MAX); agg->dtag_base = aggbase; - curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment); rec->dtrd_offset = curneeded; + if (curneeded + rec->dtrd_size < curneeded) + return (EINVAL); curneeded += rec->dtrd_size; ecb->dte_needed = MAX(ecb->dte_needed, curneeded); @@ -9875,11 +10117,15 @@ dtrace_ecb_resize(dtrace_ecb_t *ecb) curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment); rec->dtrd_offset = curneeded; curneeded += rec->dtrd_size; + if (curneeded + rec->dtrd_size < curneeded) + return (EINVAL); } else { /* tuples must be followed by an aggregation */ ASSERT(act->dta_prev == NULL || !act->dta_prev->dta_intuple); ecb->dte_size = P2ROUNDUP(ecb->dte_size, rec->dtrd_alignment); rec->dtrd_offset = ecb->dte_size; + if (ecb->dte_size + rec->dtrd_size < ecb->dte_size) + return (EINVAL); ecb->dte_size += rec->dtrd_size; ecb->dte_needed = MAX(ecb->dte_needed, ecb->dte_size); } @@ -9898,6 +10144,7 @@ 
dtrace_ecb_resize(dtrace_ecb_t *ecb) ecb->dte_size = P2ROUNDUP(ecb->dte_size, sizeof (dtrace_epid_t)); ecb->dte_needed = P2ROUNDUP(ecb->dte_needed, (sizeof (dtrace_epid_t))); ecb->dte_state->dts_needed = MAX(ecb->dte_state->dts_needed, ecb->dte_needed); + return (0); } static dtrace_action_t * @@ -10568,7 +10815,10 @@ dtrace_ecb_create(dtrace_state_t *state, dtrace_probe_t *probe, } } - dtrace_ecb_resize(ecb); + if ((enab->dten_error = dtrace_ecb_resize(ecb)) != 0) { + dtrace_ecb_destroy(ecb); + return (NULL); + } return (dtrace_ecb_create_cache = ecb); } @@ -10675,6 +10925,8 @@ dtrace_buffer_switch(dtrace_buffer_t *buf) buf->dtb_flags &= ~(DTRACEBUF_ERROR | DTRACEBUF_DROPPED); buf->dtb_interval = now - buf->dtb_switched; buf->dtb_switched = now; + buf->dtb_cur_limit = buf->dtb_limit; + dtrace_interrupt_enable(cookie); } @@ -10717,7 +10969,7 @@ dtrace_buffer_canalloc(size_t size) } static int -dtrace_buffer_alloc(dtrace_buffer_t *bufs, size_t size, int flags, +dtrace_buffer_alloc(dtrace_buffer_t *bufs, size_t limit, size_t size, int flags, processorid_t cpu) { dtrace_cpu_t *cp; @@ -10751,6 +11003,7 @@ dtrace_buffer_alloc(dtrace_buffer_t *bufs, size_t size, int flags, ASSERT(buf->dtb_xamot == NULL); + /* DTrace, please do not eat all the memory. */ if (dtrace_buffer_canalloc(size) == B_FALSE) goto err; @@ -10758,6 +11011,10 @@ dtrace_buffer_alloc(dtrace_buffer_t *bufs, size_t size, int flags, goto err; dtrace_buffer_memory_inuse += size; + /* Unsure that limit is always lower than size */ + limit = limit == size ? 
limit - 1 : limit; + buf->dtb_cur_limit = limit; + buf->dtb_limit = limit; buf->dtb_size = size; buf->dtb_flags = flags; buf->dtb_offset = 0; @@ -10857,9 +11114,27 @@ dtrace_buffer_reserve(dtrace_buffer_t *buf, size_t needed, size_t align, offs += sizeof (uint32_t); } - if ((uint64_t)(soffs = offs + needed) > buf->dtb_size) { - dtrace_buffer_drop(buf); - return (-1); + if ((uint64_t)(soffs = offs + needed) > buf->dtb_cur_limit) { + if (buf->dtb_cur_limit == buf->dtb_limit) { + buf->dtb_cur_limit = buf->dtb_size; + + atomic_add_32(&state->dts_buf_over_limit, 1); + /** + * Set an AST on the current processor + * so that we can wake up the process + * outside of probe context, when we know + * it is safe to do so + */ + minor_t minor = getminor(state->dts_dev); + ASSERT(minor < 32); + + atomic_or_32(&dtrace_wake_clients, 1 << minor); + ast_dtrace_on(); + } + if ((uint64_t)soffs > buf->dtb_size) { + dtrace_buffer_drop(buf); + return (-1); + } } if (mstate == NULL) @@ -11429,7 +11704,7 @@ dtrace_enabling_retract(dtrace_state_t *state) } static int -dtrace_enabling_match(dtrace_enabling_t *enab, int *nmatched) +dtrace_enabling_match(dtrace_enabling_t *enab, int *nmatched, dtrace_match_cond_t *cond) { int i = 0; int total_matched = 0, matched = 0; @@ -11443,6 +11718,14 @@ dtrace_enabling_match(dtrace_enabling_t *enab, int *nmatched) enab->dten_current = ep; enab->dten_error = 0; + /** + * Before doing a dtrace_probe_enable, which is really + * expensive, check that this enabling matches the matching precondition + * if we have one + */ + if (cond && (cond->dmc_func(&ep->dted_probe, cond->dmc_data) == 0)) { + continue; + } /* * If a provider failed to enable a probe then get out and * let the consumer know we failed. 
@@ -11484,7 +11767,7 @@ dtrace_enabling_match(dtrace_enabling_t *enab, int *nmatched) } static void -dtrace_enabling_matchall(void) +dtrace_enabling_matchall_with_cond(dtrace_match_cond_t *cond) { dtrace_enabling_t *enab; @@ -11507,13 +11790,22 @@ dtrace_enabling_matchall(void) * Behave as if always in "global" zone." */ for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) { - (void) dtrace_enabling_match(enab, NULL); + (void) dtrace_enabling_match(enab, NULL, cond); } lck_mtx_unlock(&dtrace_lock); lck_mtx_unlock(&cpu_lock); + +} + +static void +dtrace_enabling_matchall(void) +{ + dtrace_enabling_matchall_with_cond(NULL); } + + /* * If an enabling is to be enabled without having matched probes (that is, if * dtrace_state_go() is to be called on the underlying dtrace_state_t), the @@ -12768,36 +13060,14 @@ dtrace_state_create(dev_t *devp, cred_t *cr, dtrace_state_t **new_state) /* Cause restart */ *new_state = NULL; - /* - * Darwin's DEVFS layer acquired the minor number for this "device" when it called - * dtrace_devfs_clone_func(). At that time, dtrace_devfs_clone_func() proposed a minor number - * (next unused according to vmem_alloc()) and then immediately put the number back in play - * (by calling vmem_free()). Now that minor number is being used for an open, so committing it - * to use. The following vmem_alloc() must deliver that same minor number. FIXME. - */ + minor = getminor(*devp); - minor = (minor_t)(uintptr_t)vmem_alloc(dtrace_minor, 1, - VM_BESTFIT | VM_SLEEP); - - if (NULL != devp) { - ASSERT(getminor(*devp) == minor); - if (getminor(*devp) != minor) { - printf("dtrace_open: couldn't re-acquire vended minor number %d. Instead got %d\n", - getminor(*devp), minor); - vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1); - return (ERESTART); /* can't reacquire */ - } - } else { - /* NULL==devp iff "Anonymous state" (see dtrace_anon_property), - * so just vend the minor device number here de novo since no "open" has occurred. 
*/ + state = dtrace_state_allocate(minor); + if (NULL == state) { + printf("dtrace_open: couldn't acquire minor number %d. This usually means that too many DTrace clients are in use at the moment", minor); + return (ERESTART); /* can't reacquire */ } - if (ddi_soft_state_zalloc(dtrace_softstate, minor) != DDI_SUCCESS) { - vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1); - return (EAGAIN); /* temporary resource shortage */ - } - - state = ddi_get_soft_state(dtrace_softstate, minor); state->dts_epid = DTRACE_EPIDNONE + 1; (void) snprintf(c, sizeof (c), "dtrace_aggid_%d", minor); @@ -12823,6 +13093,7 @@ dtrace_state_create(dev_t *devp, cred_t *cr, dtrace_state_t **new_state) */ state->dts_buffer = kmem_zalloc(bufsize, KM_SLEEP); state->dts_aggbuffer = kmem_zalloc(bufsize, KM_SLEEP); + state->dts_buf_over_limit = 0; state->dts_cleaner = CYCLIC_NONE; state->dts_deadman = CYCLIC_NONE; state->dts_vstate.dtvs_state = state; @@ -12848,8 +13119,7 @@ dtrace_state_create(dev_t *devp, cred_t *cr, dtrace_state_t **new_state) opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_default; opt[DTRACEOPT_JSTACKFRAMES] = dtrace_jstackframes_default; opt[DTRACEOPT_JSTACKSTRSIZE] = dtrace_jstackstrsize_default; - - state->dts_activity = DTRACE_ACTIVITY_INACTIVE; + opt[DTRACEOPT_BUFLIMIT] = dtrace_buflimit_default; /* * Depending on the user credentials, we set flag bits which alter probe @@ -12857,10 +13127,28 @@ dtrace_state_create(dev_t *devp, cred_t *cr, dtrace_state_t **new_state) * actual anonymous tracing, or the possession of all privileges, all of * the normal checks are bypassed. 
*/ +#if defined(__APPLE__) + if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) { + if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed()) { + /* + * Allow only proc credentials when DTrace is + * restricted by the current security policy + */ + state->dts_cred.dcr_visible = DTRACE_CRV_ALLPROC; + state->dts_cred.dcr_action = DTRACE_CRA_PROC | DTRACE_CRA_PROC_CONTROL | DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER; + } + else { + state->dts_cred.dcr_visible = DTRACE_CRV_ALL; + state->dts_cred.dcr_action = DTRACE_CRA_ALL; + } + } + +#else if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) { state->dts_cred.dcr_visible = DTRACE_CRV_ALL; state->dts_cred.dcr_action = DTRACE_CRA_ALL; - } else { + } + else { /* * Set up the credentials for this instantiation. We take a * hold on the credential to prevent it from disappearing on @@ -12977,6 +13265,7 @@ dtrace_state_create(dev_t *devp, cred_t *cr, dtrace_state_t **new_state) DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE; } } +#endif *new_state = state; return(0); /* Success */ @@ -12987,6 +13276,7 @@ dtrace_state_buffer(dtrace_state_t *state, dtrace_buffer_t *buf, int which) { dtrace_optval_t *opt = state->dts_options, size; processorid_t cpu = 0; + size_t limit = buf->dtb_size; int flags = 0, rval; lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED); @@ -13034,8 +13324,8 @@ dtrace_state_buffer(dtrace_state_t *state, dtrace_buffer_t *buf, int which) */ return (E2BIG); } - - rval = dtrace_buffer_alloc(buf, size, flags, cpu); + limit = opt[DTRACEOPT_BUFLIMIT] * size / 100; + rval = dtrace_buffer_alloc(buf, limit, size, flags, cpu); if (rval != ENOMEM) { opt[which] = size; @@ -13283,6 +13573,18 @@ dtrace_state_go(dtrace_state_t *state, processorid_t *cpu) if (opt[DTRACEOPT_CLEANRATE] > dtrace_cleanrate_max) opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max; + if (opt[DTRACEOPT_STRSIZE] > dtrace_strsize_max) + opt[DTRACEOPT_STRSIZE] = dtrace_strsize_max; + + if (opt[DTRACEOPT_STRSIZE] < dtrace_strsize_min) + 
opt[DTRACEOPT_STRSIZE] = dtrace_strsize_min; + + if (opt[DTRACEOPT_BUFLIMIT] > dtrace_buflimit_max) + opt[DTRACEOPT_BUFLIMIT] = dtrace_buflimit_max; + + if (opt[DTRACEOPT_BUFLIMIT] < dtrace_buflimit_min) + opt[DTRACEOPT_BUFLIMIT] = dtrace_buflimit_min; + hdlr.cyh_func = (cyc_func_t)dtrace_state_clean; hdlr.cyh_arg = state; hdlr.cyh_level = CY_LOW_LEVEL; @@ -13595,8 +13897,7 @@ dtrace_state_destroy(dtrace_state_t *state) dtrace_format_destroy(state); vmem_destroy(state->dts_aggid_arena); - ddi_soft_state_free(dtrace_softstate, minor); - vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1); + dtrace_state_free(minor); } /* @@ -14597,10 +14898,6 @@ dtrace_lazy_dofs_add(proc_t *p, dof_ioctl_data_t* incoming_dofs, int *dofs_claim lck_rw_lock_shared(&dtrace_dof_mode_lock); - /* - * If we have lazy dof, dof mode better be LAZY_ON. - */ - ASSERT(p->p_dtrace_lazy_dofs == NULL || dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON); ASSERT(p->p_dtrace_lazy_dofs == NULL || p->p_dtrace_helpers == NULL); ASSERT(dtrace_dof_mode != DTRACE_DOF_MODE_NEVER); @@ -14694,10 +14991,6 @@ dtrace_lazy_dofs_remove(proc_t *p, int generation) lck_rw_lock_shared(&dtrace_dof_mode_lock); - /* - * If we have lazy dof, dof mode better be LAZY_ON. - */ - ASSERT(p->p_dtrace_lazy_dofs == NULL || dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON); ASSERT(p->p_dtrace_lazy_dofs == NULL || p->p_dtrace_helpers == NULL); ASSERT(dtrace_dof_mode != DTRACE_DOF_MODE_NEVER); @@ -14769,7 +15062,7 @@ dtrace_lazy_dofs_remove(proc_t *p, int generation) } lck_rw_unlock_shared(&dtrace_dof_mode_lock); - + return rval; } @@ -14779,12 +15072,6 @@ dtrace_lazy_dofs_destroy(proc_t *p) lck_rw_lock_shared(&dtrace_dof_mode_lock); lck_mtx_lock(&p->p_dtrace_sprlock); - /* - * If we have lazy dof, dof mode better be LAZY_ON, or we must be exiting. - * We cannot assert against DTRACE_DOF_MODE_NEVER here, because we are called from - * kern_exit.c and kern_exec.c. 
- */ - ASSERT(p->p_dtrace_lazy_dofs == NULL || dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON || p->p_lflag & P_LEXIT); ASSERT(p->p_dtrace_lazy_dofs == NULL || p->p_dtrace_helpers == NULL); dof_ioctl_data_t* lazy_dofs = p->p_dtrace_lazy_dofs; @@ -14798,47 +15085,6 @@ dtrace_lazy_dofs_destroy(proc_t *p) } } -void -dtrace_lazy_dofs_duplicate(proc_t *parent, proc_t *child) -{ - lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_NOTOWNED); - lck_mtx_assert(&parent->p_dtrace_sprlock, LCK_MTX_ASSERT_NOTOWNED); - lck_mtx_assert(&child->p_dtrace_sprlock, LCK_MTX_ASSERT_NOTOWNED); - - lck_rw_lock_shared(&dtrace_dof_mode_lock); - lck_mtx_lock(&parent->p_dtrace_sprlock); - - /* - * If we have lazy dof, dof mode better be LAZY_ON, or we must be exiting. - * We cannot assert against DTRACE_DOF_MODE_NEVER here, because we are called from - * kern_fork.c - */ - ASSERT(parent->p_dtrace_lazy_dofs == NULL || dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON); - ASSERT(parent->p_dtrace_lazy_dofs == NULL || parent->p_dtrace_helpers == NULL); - /* - * In theory we should hold the child sprlock, but this is safe... 
- */ - ASSERT(child->p_dtrace_lazy_dofs == NULL && child->p_dtrace_helpers == NULL); - - dof_ioctl_data_t* parent_dofs = parent->p_dtrace_lazy_dofs; - dof_ioctl_data_t* child_dofs = NULL; - if (parent_dofs) { - size_t parent_dofs_size = DOF_IOCTL_DATA_T_SIZE(parent_dofs->dofiod_count); - child_dofs = kmem_alloc(parent_dofs_size, KM_SLEEP); - bcopy(parent_dofs, child_dofs, parent_dofs_size); - } - - lck_mtx_unlock(&parent->p_dtrace_sprlock); - - if (child_dofs) { - lck_mtx_lock(&child->p_dtrace_sprlock); - child->p_dtrace_lazy_dofs = child_dofs; - lck_mtx_unlock(&child->p_dtrace_sprlock); - } - - lck_rw_unlock_shared(&dtrace_dof_mode_lock); -} - static int dtrace_lazy_dofs_proc_iterate_filter(proc_t *p, void* ignored) { @@ -14849,10 +15095,8 @@ dtrace_lazy_dofs_proc_iterate_filter(proc_t *p, void* ignored) return p->p_dtrace_lazy_dofs != NULL; } -static int -dtrace_lazy_dofs_proc_iterate_doit(proc_t *p, void* ignored) -{ -#pragma unused(ignored) +static void +dtrace_lazy_dofs_process(proc_t *p) { /* * It is possible this process may exit during our attempt to * fault in the dof. 
We could fix this by holding locks longer, @@ -14860,13 +15104,10 @@ dtrace_lazy_dofs_proc_iterate_doit(proc_t *p, void* ignored) */ lck_mtx_lock(&p->p_dtrace_sprlock); - /* - * In this case only, it is okay to have lazy dof when dof mode is DTRACE_DOF_MODE_LAZY_OFF - */ + ASSERT(p->p_dtrace_lazy_dofs == NULL || p->p_dtrace_helpers == NULL); ASSERT(dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_OFF); - dof_ioctl_data_t* lazy_dofs = p->p_dtrace_lazy_dofs; p->p_dtrace_lazy_dofs = NULL; @@ -14894,7 +15135,7 @@ dtrace_lazy_dofs_proc_iterate_doit(proc_t *p, void* ignored) dhp->dofhp_dof = dhp->dofhp_addr; dof_hdr_t *dof = dtrace_dof_copyin_from_proc(p, dhp->dofhp_dof, &rval); - + if (dof != NULL) { dtrace_helpers_t *help; @@ -14929,10 +15170,73 @@ dtrace_lazy_dofs_proc_iterate_doit(proc_t *p, void* ignored) kmem_free(lazy_dofs, DOF_IOCTL_DATA_T_SIZE(lazy_dofs->dofiod_count)); } +} + +static int +dtrace_lazy_dofs_proc_iterate_doit(proc_t *p, void* ignored) +{ +#pragma unused(ignored) + + dtrace_lazy_dofs_process(p); return PROC_RETURNED; } +#define DTRACE_LAZY_DOFS_DUPLICATED 1 + +static int +dtrace_lazy_dofs_duplicate(proc_t *parent, proc_t *child) +{ + lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_NOTOWNED); + lck_mtx_assert(&parent->p_dtrace_sprlock, LCK_MTX_ASSERT_NOTOWNED); + lck_mtx_assert(&child->p_dtrace_sprlock, LCK_MTX_ASSERT_NOTOWNED); + + lck_rw_lock_shared(&dtrace_dof_mode_lock); + lck_mtx_lock(&parent->p_dtrace_sprlock); + + /* + * We need to make sure that the transition to lazy dofs -> helpers + * was atomic for our parent + */ + ASSERT(parent->p_dtrace_lazy_dofs == NULL || parent->p_dtrace_helpers == NULL); + /* + * In theory we should hold the child sprlock, but this is safe... 
+ */ + ASSERT(child->p_dtrace_lazy_dofs == NULL && child->p_dtrace_helpers == NULL); + + dof_ioctl_data_t* parent_dofs = parent->p_dtrace_lazy_dofs; + dof_ioctl_data_t* child_dofs = NULL; + if (parent_dofs) { + size_t parent_dofs_size = DOF_IOCTL_DATA_T_SIZE(parent_dofs->dofiod_count); + child_dofs = kmem_alloc(parent_dofs_size, KM_SLEEP); + bcopy(parent_dofs, child_dofs, parent_dofs_size); + } + + lck_mtx_unlock(&parent->p_dtrace_sprlock); + + if (child_dofs) { + lck_mtx_lock(&child->p_dtrace_sprlock); + child->p_dtrace_lazy_dofs = child_dofs; + lck_mtx_unlock(&child->p_dtrace_sprlock); + /** + * We process the DOF at this point if the mode is set to + * LAZY_OFF. This can happen if DTrace is still processing the + * DOF of other process (which can happen because the + * protected pager can have a huge latency) + * but has not processed our parent yet + */ + if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_OFF) { + dtrace_lazy_dofs_process(child); + } + lck_rw_unlock_shared(&dtrace_dof_mode_lock); + + return DTRACE_LAZY_DOFS_DUPLICATED; + } + lck_rw_unlock_shared(&dtrace_dof_mode_lock); + + return 0; +} + static dtrace_helpers_t * dtrace_helpers_create(proc_t *p) { @@ -15125,6 +15429,148 @@ dtrace_helpers_duplicate(proc_t *from, proc_t *to) dtrace_helper_provider_register(to, newhelp, NULL); } +/** + * DTrace Process functions + */ + +void +dtrace_proc_fork(proc_t *parent_proc, proc_t *child_proc, int spawn) +{ + /* + * This code applies to new processes who are copying the task + * and thread state and address spaces of their parent process. + */ + if (!spawn) { + /* + * APPLE NOTE: Solaris does a sprlock() and drops the + * proc_lock here. We're cheating a bit and only taking + * the p_dtrace_sprlock lock. A full sprlock would + * task_suspend the parent. + */ + lck_mtx_lock(&parent_proc->p_dtrace_sprlock); + + /* + * Remove all DTrace tracepoints from the child process. 
We + * need to do this _before_ duplicating USDT providers since + * any associated probes may be immediately enabled. + */ + if (parent_proc->p_dtrace_count > 0) { + dtrace_fasttrap_fork(parent_proc, child_proc); + } + + lck_mtx_unlock(&parent_proc->p_dtrace_sprlock); + + /* + * Duplicate any lazy dof(s). This must be done while NOT + * holding the parent sprlock! Lock ordering is + * dtrace_dof_mode_lock, then sprlock. It is imperative we + * always call dtrace_lazy_dofs_duplicate, rather than null + * check and call if !NULL. If we NULL test, during lazy dof + * faulting we can race with the faulting code and proceed + * from here to beyond the helpers copy. The lazy dof + * faulting will then fail to copy the helpers to the child + * process. We return if we duplicated lazy dofs as a process + * can only have one at the same time to avoid a race between + * a dtrace client and dtrace_proc_fork where a process would + * end up with both lazy dofs and helpers. + */ + if (dtrace_lazy_dofs_duplicate(parent_proc, child_proc) == DTRACE_LAZY_DOFS_DUPLICATED) { + return; + } + + /* + * Duplicate any helper actions and providers if they haven't + * already. + */ +#if !defined(__APPLE__) + /* + * The SFORKING + * we set above informs the code to enable USDT probes that + * sprlock() may fail because the child is being forked. + */ +#endif + /* + * APPLE NOTE: As best I can tell, Apple's sprlock() equivalent + * never fails to find the child. We do not set SFORKING. + */ + if (parent_proc->p_dtrace_helpers != NULL && dtrace_helpers_fork) { + (*dtrace_helpers_fork)(parent_proc, child_proc); + } + } +} + +void +dtrace_proc_exec(proc_t *p) +{ + /* + * Invalidate any predicate evaluation already cached for this thread by DTrace. + * That's because we've just stored to p_comm and DTrace refers to that when it + * evaluates the "execname" special variable. uid and gid may have changed as well. 
+ */ + dtrace_set_thread_predcache(current_thread(), 0); + + /* + * Free any outstanding lazy dof entries. It is imperative we + * always call dtrace_lazy_dofs_destroy, rather than null check + * and call if !NULL. If we NULL test, during lazy dof faulting + * we can race with the faulting code and proceed from here to + * beyond the helpers cleanup. The lazy dof faulting will then + * install new helpers which no longer belong to this process! + */ + dtrace_lazy_dofs_destroy(p); + + + /* + * Clean up any DTrace helpers for the process. + */ + if (p->p_dtrace_helpers != NULL && dtrace_helpers_cleanup) { + (*dtrace_helpers_cleanup)(p); + } + + /* + * Cleanup the DTrace provider associated with this process. + */ + proc_lock(p); + if (p->p_dtrace_probes && dtrace_fasttrap_exec_ptr) { + (*dtrace_fasttrap_exec_ptr)(p); + } + proc_unlock(p); +} + +void +dtrace_proc_exit(proc_t *p) +{ + /* + * Free any outstanding lazy dof entries. It is imperative we + * always call dtrace_lazy_dofs_destroy, rather than null check + * and call if !NULL. If we NULL test, during lazy dof faulting + * we can race with the faulting code and proceed from here to + * beyond the helpers cleanup. The lazy dof faulting will then + * install new helpers which will never be cleaned up, and leak. + */ + dtrace_lazy_dofs_destroy(p); + + /* + * Clean up any DTrace helper actions or probes for the process. + */ + if (p->p_dtrace_helpers != NULL) { + (*dtrace_helpers_cleanup)(p); + } + + /* + * Clean up any DTrace probes associated with this process. 
+ */ + /* + * APPLE NOTE: We release ptss pages/entries in dtrace_fasttrap_exit_ptr(), + * call this after dtrace_helpers_cleanup() + */ + proc_lock(p); + if (p->p_dtrace_probes && dtrace_fasttrap_exit_ptr) { + (*dtrace_fasttrap_exit_ptr)(p); + } + proc_unlock(p); +} + /* * DTrace Hook Functions */ @@ -15704,15 +16150,6 @@ dtrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) lck_mtx_lock(&dtrace_provider_lock); lck_mtx_lock(&dtrace_lock); - if (ddi_soft_state_init(&dtrace_softstate, - sizeof (dtrace_state_t), 0) != 0) { - cmn_err(CE_NOTE, "/dev/dtrace failed to initialize soft state"); - lck_mtx_unlock(&dtrace_lock); - lck_mtx_unlock(&dtrace_provider_lock); - lck_mtx_unlock(&cpu_lock); - return (DDI_FAILURE); - } - /* Darwin uses BSD cloning device driver to automagically obtain minor device number. */ ddi_report_dev(devi); @@ -15734,9 +16171,6 @@ dtrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) dtrace_arena = vmem_create("dtrace", (void *)1, UINT32_MAX, 1, NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER); - dtrace_minor = vmem_create("dtrace_minor", (void *)DTRACEMNRN_CLONE, - UINT32_MAX - DTRACEMNRN_CLONE, 1, NULL, NULL, NULL, 0, - VM_SLEEP | VMC_IDENTIFIER); dtrace_taskq = taskq_create("dtrace_taskq", 1, maxclsyspri, 1, INT_MAX, 0); @@ -15745,6 +16179,7 @@ dtrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) NULL, NULL, NULL, NULL, NULL, 0); lck_mtx_assert(&cpu_lock, LCK_MTX_ASSERT_OWNED); + dtrace_bymod = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_mod), offsetof(dtrace_probe_t, dtpr_nextmod), offsetof(dtrace_probe_t, dtpr_prevmod)); @@ -15841,7 +16276,7 @@ dtrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) lck_mtx_lock(&dtrace_lock); if ((enab = dtrace_anon.dta_enabling) != NULL) - (void) dtrace_enabling_match(enab, NULL); + (void) dtrace_enabling_match(enab, NULL, NULL); lck_mtx_unlock(&cpu_lock); } @@ -15933,7 +16368,16 @@ dtrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p) */ if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON) { 
dtrace_dof_mode = DTRACE_DOF_MODE_LAZY_OFF; - + /* + * We do not need to hold the exclusive lock while processing + * DOF on processes. We do need to make sure the mode does not get + * changed to DTRACE_DOF_MODE_LAZY_ON during that stage though + * (which should not happen anyway since it only happens in + * dtrace_close). There is no way imcomplete USDT probes can be + * activate by any DTrace clients here since they all have to + * call dtrace_open and be blocked on dtrace_dof_mode_lock + */ + lck_rw_lock_exclusive_to_shared(&dtrace_dof_mode_lock); /* * Iterate all existing processes and load lazy dofs. */ @@ -15942,9 +16386,13 @@ dtrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p) NULL, dtrace_lazy_dofs_proc_iterate_filter, NULL); + + lck_rw_unlock_shared(&dtrace_dof_mode_lock); + } + else { + lck_rw_unlock_exclusive(&dtrace_dof_mode_lock); } - lck_rw_unlock_exclusive(&dtrace_dof_mode_lock); /* * Update kernel symbol state. @@ -15979,8 +16427,7 @@ dtrace_close(dev_t dev, int flag, int otyp, cred_t *cred_p) dtrace_state_t *state; /* APPLE NOTE: Darwin puts Helper on its own major device. */ - - state = ddi_get_soft_state(dtrace_softstate, minor); + state = dtrace_state_get(minor); lck_mtx_lock(&cpu_lock); lck_mtx_lock(&dtrace_lock); @@ -16205,7 +16652,7 @@ dtrace_ioctl(dev_t dev, u_long cmd, user_addr_t arg, int md, cred_t *cr, int *rv /* Darwin puts Helper on its own major device. 
*/ - state = ddi_get_soft_state(dtrace_softstate, minor); + state = dtrace_state_get(minor); if (state->dts_anon) { ASSERT(dtrace_anon.dta_state == NULL); @@ -16465,7 +16912,7 @@ dtrace_ioctl(dev_t dev, u_long cmd, user_addr_t arg, int md, cred_t *cr, int *rv return (rval); } - if ((err = dtrace_enabling_match(enab, rv)) == 0) { + if ((err = dtrace_enabling_match(enab, rv, NULL)) == 0) { err = dtrace_enabling_retain(enab); } else { dtrace_enabling_destroy(enab); @@ -16688,10 +17135,45 @@ dtrace_ioctl(dev_t dev, u_long cmd, user_addr_t arg, int md, cred_t *cr, int *rv return (rval == 0 ? 0 : EFAULT); } + case DTRACEIOC_SLEEP: { + int64_t time; + uint64_t abstime; + uint64_t rvalue = DTRACE_WAKE_TIMEOUT; + + if (copyin(arg, &time, sizeof(time)) != 0) + return (EFAULT); + + nanoseconds_to_absolutetime((uint64_t)time, &abstime); + clock_absolutetime_interval_to_deadline(abstime, &abstime); + + if (assert_wait_deadline(state, THREAD_ABORTSAFE, abstime) == THREAD_WAITING) { + if (state->dts_buf_over_limit > 0) { + clear_wait(current_thread(), THREAD_INTERRUPTED); + rvalue = DTRACE_WAKE_BUF_LIMIT; + } else { + thread_block(THREAD_CONTINUE_NULL); + if (state->dts_buf_over_limit > 0) { + rvalue = DTRACE_WAKE_BUF_LIMIT; + } + } + } + + if (copyout(&rvalue, arg, sizeof(rvalue)) != 0) + return (EFAULT); + + return (0); + } + + case DTRACEIOC_SIGNAL: { + wakeup(state); + return (0); + } + case DTRACEIOC_AGGSNAP: case DTRACEIOC_BUFSNAP: { dtrace_bufdesc_t desc; caddr_t cached; + boolean_t over_limit; dtrace_buffer_t *buf; if (copyin(arg, &desc, sizeof (desc)) != 0) @@ -16773,6 +17255,8 @@ dtrace_ioctl(dev_t dev, u_long cmd, user_addr_t arg, int md, cred_t *cr, int *rv } cached = buf->dtb_tomax; + over_limit = buf->dtb_cur_limit == buf->dtb_size; + ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH)); dtrace_xcall(desc.dtbd_cpu, @@ -16793,6 +17277,22 @@ dtrace_ioctl(dev_t dev, u_long cmd, user_addr_t arg, int md, cred_t *cr, int *rv } ASSERT(cached == buf->dtb_xamot); + /* + * At this 
point we know the buffer have switched, so we + * can decrement the over limit count if the buffer was over + * its limit. The new buffer might already be over its limit + * yet, but we don't care since we're guaranteed not to be + * checking the buffer over limit count at this point. + */ + if (over_limit) { + uint32_t old = atomic_add_32(&state->dts_buf_over_limit, -1); + #pragma unused(old) + + /* + * Verify that we didn't underflow the value + */ + ASSERT(old != 0); + } /* * We have our snapshot; now copy it out. @@ -17159,7 +17659,7 @@ dtrace_ioctl(dev_t dev, u_long cmd, user_addr_t arg, int md, cred_t *cr, int *rv /* NOTE! We can no longer exit this method via return */ if (copyin(arg, module_symbols, module_symbols_size) != 0) { - cmn_err(CE_WARN, "failed copyin of dtrace_module_symbols_t, symbol count %llu", module_symbols->dtmodsyms_count); + cmn_err(CE_WARN, "failed copyin of dtrace_module_symbols_t"); rval = EFAULT; goto module_symbols_cleanup; } @@ -17354,7 +17854,6 @@ dtrace_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) dtrace_byname = NULL; kmem_cache_destroy(dtrace_state_cache); - vmem_destroy(dtrace_minor); vmem_destroy(dtrace_arena); if (dtrace_toxrange != NULL) { @@ -17530,28 +18029,13 @@ helper_init( void ) #undef HELPER_MAJOR -/* - * Called with DEVFS_LOCK held, so vmem_alloc's underlying blist structures are protected. - */ static int dtrace_clone_func(dev_t dev, int action) { #pragma unused(dev) if (action == DEVFS_CLONE_ALLOC) { - if (NULL == dtrace_minor) /* Arena not created yet!?! */ - return 0; - else { - /* - * Propose a minor number, namely the next number that vmem_alloc() will return. - * Immediately put it back in play by calling vmem_free(). FIXME. 
- */ - int ret = (int)(uintptr_t)vmem_alloc(dtrace_minor, 1, VM_BESTFIT | VM_SLEEP); - - vmem_free(dtrace_minor, (void *)(uintptr_t)ret, 1); - - return ret; - } + return dtrace_state_reserve(); } else if (action == DEVFS_CLONE_FREE) { return 0; @@ -17559,6 +18043,34 @@ dtrace_clone_func(dev_t dev, int action) else return -1; } +void dtrace_ast(void); + +void +dtrace_ast(void) +{ + int i; + uint32_t clients = atomic_and_32(&dtrace_wake_clients, 0); + if (clients == 0) + return; + /** + * We disable preemption here to be sure that we won't get + * interrupted by a wakeup to a thread that is higher + * priority than us, so that we do issue all wakeups + */ + disable_preemption(); + for (i = 0; i < DTRACE_NCLIENTS; i++) { + if (clients & (1 << i)) { + dtrace_state_t *state = dtrace_state_get(i); + if (state) { + wakeup(state); + } + + } + } + enable_preemption(); +} + + #define DTRACE_MAJOR -24 /* let the kernel pick the device number */ static struct cdevsw dtrace_cdevsw = @@ -17703,7 +18215,7 @@ dtrace_init( void ) lck_mtx_lock(&cpu_lock); for (i = 0; i < ncpu; ++i) - /* FIXME: track CPU configuration a la CHUD Processor Pref Pane. 
*/ + /* FIXME: track CPU configuration */ dtrace_cpu_setup_initial( (processorid_t)i ); /* In lieu of register_cpu_setup_func() callback */ lck_mtx_unlock(&cpu_lock); diff --git a/bsd/dev/dtrace/dtrace_glue.c b/bsd/dev/dtrace/dtrace_glue.c index 4e7ede1d2..7bd5500b0 100644 --- a/bsd/dev/dtrace/dtrace_glue.c +++ b/bsd/dev/dtrace/dtrace_glue.c @@ -297,6 +297,12 @@ typedef struct wrap_timer_call { #define WAKEUP_REAPER 0x7FFFFFFFFFFFFFFFLL #define NEARLY_FOREVER 0x7FFFFFFFFFFFFFFELL + +typedef struct cyc_list { + cyc_omni_handler_t cyl_omni; + wrap_timer_call_t cyl_wrap_by_cpus[]; +} cyc_list_t; + /* CPU going online/offline notifications */ void (*dtrace_cpu_state_changed_hook)(int, boolean_t) = NULL; void dtrace_cpu_state_changed(int, boolean_t); @@ -386,10 +392,8 @@ timer_call_add_cyclic(wrap_timer_call_t *wrapTC, cyc_handler_t *handler, cyc_tim * Executed on the CPU the timer is running on. */ static void -timer_call_remove_cyclic(cyclic_id_t cyclic) +timer_call_remove_cyclic(wrap_timer_call_t *wrapTC) { - wrap_timer_call_t *wrapTC = (wrap_timer_call_t *)cyclic; - assert(wrapTC); assert(cpu_number() == wrapTC->cpuid); @@ -400,12 +404,10 @@ timer_call_remove_cyclic(cyclic_id_t cyclic) } static void * -timer_call_get_cyclic_arg(cyclic_id_t cyclic) -{ - wrap_timer_call_t *wrapTC = (wrap_timer_call_t *)cyclic; - +timer_call_get_cyclic_arg(wrap_timer_call_t *wrapTC) +{ return (wrapTC ? 
wrapTC->hdlr.cyh_arg : NULL); -} +} cyclic_id_t cyclic_timer_add(cyc_handler_t *handler, cyc_time_t *when) @@ -430,62 +432,48 @@ cyclic_timer_remove(cyclic_id_t cyclic) } static void -_cyclic_add_omni(cyclic_id_list_t cyc_list) +_cyclic_add_omni(cyc_list_t *cyc_list) { cyc_time_t cT; cyc_handler_t cH; - wrap_timer_call_t *wrapTC; - cyc_omni_handler_t *omni = (cyc_omni_handler_t *)cyc_list; - char *t; - - (omni->cyo_online)(omni->cyo_arg, CPU, &cH, &cT); - - t = (char *)cyc_list; - t += sizeof(cyc_omni_handler_t); - cyc_list = (cyclic_id_list_t)(uintptr_t)t; + cyc_omni_handler_t *omni = &cyc_list->cyl_omni; - t += sizeof(cyclic_id_t)*NCPU; - t += (sizeof(wrap_timer_call_t))*cpu_number(); - wrapTC = (wrap_timer_call_t *)(uintptr_t)t; + (omni->cyo_online)(omni->cyo_arg, CPU, &cH, &cT); - cyc_list[cpu_number()] = timer_call_add_cyclic(wrapTC, &cH, &cT); + wrap_timer_call_t *wrapTC = &cyc_list->cyl_wrap_by_cpus[cpu_number()]; + timer_call_add_cyclic(wrapTC, &cH, &cT); } cyclic_id_list_t cyclic_add_omni(cyc_omni_handler_t *omni) { - cyclic_id_list_t cyc_list = - _MALLOC( (sizeof(wrap_timer_call_t))*NCPU + - sizeof(cyclic_id_t)*NCPU + - sizeof(cyc_omni_handler_t), M_TEMP, M_ZERO | M_WAITOK); + cyc_list_t *cyc_list = + _MALLOC(sizeof(cyc_list_t) + NCPU * sizeof(wrap_timer_call_t), M_TEMP, M_ZERO | M_WAITOK); + if (NULL == cyc_list) - return (cyclic_id_list_t)CYCLIC_NONE; + return NULL; + + cyc_list->cyl_omni = *omni; - *(cyc_omni_handler_t *)cyc_list = *omni; dtrace_xcall(DTRACE_CPUALL, (dtrace_xcall_t)_cyclic_add_omni, (void *)cyc_list); - return cyc_list; + return (cyclic_id_list_t)cyc_list; } static void -_cyclic_remove_omni(cyclic_id_list_t cyc_list) +_cyclic_remove_omni(cyc_list_t *cyc_list) { - cyc_omni_handler_t *omni = (cyc_omni_handler_t *)cyc_list; + cyc_omni_handler_t *omni = &cyc_list->cyl_omni; void *oarg; - cyclic_id_t cid; - char *t; - - t = (char *)cyc_list; - t += sizeof(cyc_omni_handler_t); - cyc_list = (cyclic_id_list_t)(uintptr_t)t; + wrap_timer_call_t 
*wrapTC; /* * If the processor was offline when dtrace started, we did not allocate * a cyclic timer for this CPU. */ - if ((cid = cyc_list[cpu_number()]) != CYCLIC_NONE) { - oarg = timer_call_get_cyclic_arg(cid); - timer_call_remove_cyclic(cid); + if ((wrapTC = &cyc_list->cyl_wrap_by_cpus[cpu_number()]) != NULL) { + oarg = timer_call_get_cyclic_arg(wrapTC); + timer_call_remove_cyclic(wrapTC); (omni->cyo_offline)(omni->cyo_arg, CPU, oarg); } } @@ -493,7 +481,7 @@ _cyclic_remove_omni(cyclic_id_list_t cyc_list) void cyclic_remove_omni(cyclic_id_list_t cyc_list) { - ASSERT( cyc_list != (cyclic_id_list_t)CYCLIC_NONE ); + ASSERT(cyc_list != NULL); dtrace_xcall(DTRACE_CPUALL, (dtrace_xcall_t)_cyclic_remove_omni, (void *)cyc_list); _FREE(cyc_list, M_TEMP); @@ -617,54 +605,6 @@ ddi_report_dev(dev_info_t *devi) #pragma unused(devi) } -#define NSOFT_STATES 32 /* XXX No more than 32 clients at a time, please. */ -static void *soft[NSOFT_STATES]; - -int -ddi_soft_state_init(void **state_p, size_t size, size_t n_items) -{ -#pragma unused(n_items) - int i; - - for (i = 0; i < NSOFT_STATES; ++i) soft[i] = _MALLOC(size, M_TEMP, M_ZERO | M_WAITOK); - *(size_t *)state_p = size; - return 0; -} - -int -ddi_soft_state_zalloc(void *state, int item) -{ -#pragma unused(state) - if (item < NSOFT_STATES) - return DDI_SUCCESS; - else - return DDI_FAILURE; -} - -void * -ddi_get_soft_state(void *state, int item) -{ -#pragma unused(state) - ASSERT(item < NSOFT_STATES); - return soft[item]; -} - -int -ddi_soft_state_free(void *state, int item) -{ - ASSERT(item < NSOFT_STATES); - bzero( soft[item], (size_t)state ); - return DDI_SUCCESS; -} - -void -ddi_soft_state_fini(void **state_p) -{ -#pragma unused(state_p) - int i; - - for (i = 0; i < NSOFT_STATES; ++i) _FREE( soft[i], M_TEMP ); -} static unsigned int gRegisteredProps = 0; static struct { diff --git a/bsd/dev/dtrace/dtrace_ptss.c b/bsd/dev/dtrace/dtrace_ptss.c index b43d4b17f..c09b8f32e 100644 --- a/bsd/dev/dtrace/dtrace_ptss.c +++ 
b/bsd/dev/dtrace/dtrace_ptss.c @@ -33,6 +33,7 @@ #include #include +#include #include #include @@ -164,16 +165,15 @@ dtrace_ptss_allocate_page(struct proc* p) if (map == NULL) goto err; + mach_vm_size_t size = PAGE_MAX_SIZE; + mach_vm_offset_t addr = 0; vm_prot_t cur_protection = VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE; vm_prot_t max_protection = VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE; - mach_vm_offset_t addr = 0; - mach_vm_size_t size = PAGE_SIZE; // We need some way to assert that this matches vm_map_round_page() !!! kern_return_t kr = mach_vm_map(map, &addr, size, 0, VM_FLAGS_ANYWHERE, IPC_PORT_NULL, 0, FALSE, cur_protection, max_protection, VM_INHERIT_DEFAULT); if (kr != KERN_SUCCESS) { goto err; } - // Chain the page entries. int i; for (i=0; idthpv_provname, 1); } +static char* +fasttrap_meta_provider_name(void *arg) +{ + fasttrap_provider_t *fprovider = arg; + dtrace_provider_t *provider = (dtrace_provider_t*)(fprovider->ftp_provid); + return provider->dtpv_name; +} + static dtrace_mops_t fasttrap_mops = { fasttrap_meta_create_probe, fasttrap_meta_provide, - fasttrap_meta_remove + fasttrap_meta_remove, + fasttrap_meta_provider_name }; /* diff --git a/bsd/dev/dtrace/fbt.c b/bsd/dev/dtrace/fbt.c index 9ad1613b6..e05d5a922 100644 --- a/bsd/dev/dtrace/fbt.c +++ b/bsd/dev/dtrace/fbt.c @@ -158,7 +158,14 @@ fbt_enable(void *arg, dtrace_id_t id, void *parg) if (fbt->fbtp_currentval != fbt->fbtp_patchval) { (void)ml_nofault_copy( (vm_offset_t)&fbt->fbtp_patchval, (vm_offset_t)fbt->fbtp_patchpoint, sizeof(fbt->fbtp_patchval)); + /* + * Make the patched instruction visible via a data + instruction + * cache flush for the platforms that need it + */ + flush_dcache((vm_offset_t)fbt->fbtp_patchpoint,(vm_size_t)sizeof(fbt->fbtp_patchval), 0); + invalidate_icache((vm_offset_t)fbt->fbtp_patchpoint,(vm_size_t)sizeof(fbt->fbtp_patchval), 0); fbt->fbtp_currentval = fbt->fbtp_patchval; + ctl->mod_nenabled++; } @@ -186,6 +193,13 @@ fbt_disable(void *arg, dtrace_id_t id, 
void *parg) if (fbt->fbtp_currentval != fbt->fbtp_savedval) { (void)ml_nofault_copy( (vm_offset_t)&fbt->fbtp_savedval, (vm_offset_t)fbt->fbtp_patchpoint, sizeof(fbt->fbtp_savedval)); + /* + * Make the patched instruction visible via a data + instruction + * cache flush for the platforms that need it + */ + flush_dcache((vm_offset_t)fbt->fbtp_patchpoint,(vm_size_t)sizeof(fbt->fbtp_patchval), 0); + invalidate_icache((vm_offset_t)fbt->fbtp_patchpoint,(vm_size_t)sizeof(fbt->fbtp_patchval), 0); + fbt->fbtp_currentval = fbt->fbtp_savedval; ASSERT(ctl->mod_nenabled > 0); ctl->mod_nenabled--; @@ -212,8 +226,14 @@ fbt_suspend(void *arg, dtrace_id_t id, void *parg) (void)ml_nofault_copy( (vm_offset_t)&fbt->fbtp_savedval, (vm_offset_t)fbt->fbtp_patchpoint, sizeof(fbt->fbtp_savedval)); + /* + * Make the patched instruction visible via a data + instruction + * cache flush for the platforms that need it + */ + flush_dcache((vm_offset_t)fbt->fbtp_patchpoint,(vm_size_t)sizeof(fbt->fbtp_savedval), 0); + invalidate_icache((vm_offset_t)fbt->fbtp_patchpoint,(vm_size_t)sizeof(fbt->fbtp_savedval), 0); - fbt->fbtp_currentval = fbt->fbtp_savedval; + fbt->fbtp_currentval = fbt->fbtp_savedval; } dtrace_membar_consumer(); diff --git a/bsd/dev/dtrace/scripts/Makefile b/bsd/dev/dtrace/scripts/Makefile index 3e55851a9..a6f2527cd 100644 --- a/bsd/dev/dtrace/scripts/Makefile +++ b/bsd/dev/dtrace/scripts/Makefile @@ -11,17 +11,28 @@ INSTALL_DTRACE_SCRIPTS_LIST = \ errno.d \ io.d \ ip.d \ - regs_x86_64.d \ sched.d \ signal.d \ socket.d \ tcp.d \ unistd.d +INSTALL_DTRACE_LIBEXEC_LIST = \ + log_unnest_badness.d + ifneq ($(filter $(SUPPORTED_EMBEDDED_PLATFORMS),$(PLATFORM)),) INSTALL_DTRACE_SCRIPTS_LIST += mptcp.d endif + +ifeq ($(CURRENT_ARCH_CONFIG),ARM64) +INSTALL_DTRACE_SCRIPTS_LIST += regs_arm64.d +else ifeq ($(CURRENT_ARCH_CONFIG),ARM) +INSTALL_DTRACE_SCRIPTS_LIST += regs_arm.d +else +INSTALL_DTRACE_SCRIPTS_LIST += regs_x86_64.d +endif + INSTALL_DTRACE_SCRIPTS_FILES = \ $(addprefix 
$(DSTROOT)/$(INSTALL_DTRACE_SCRIPTS_DIR)/, $(INSTALL_DTRACE_SCRIPTS_LIST)) @@ -30,9 +41,15 @@ $(INSTALL_DTRACE_SCRIPTS_FILES): $(DSTROOT)/$(INSTALL_DTRACE_SCRIPTS_DIR)/% : % @echo INSTALL $(@F) $(_v)$(INSTALL) $(DATA_INSTALL_FLAGS) $< $@ -do_textfiles_install:: $(INSTALL_DTRACE_SCRIPTS_FILES) +INSTALL_DTRACE_LIBEXEC_FILES = \ + $(addprefix $(DSTROOT)/$(INSTALL_DTRACE_LIBEXEC_DIR)/, $(INSTALL_DTRACE_LIBEXEC_LIST)) -include $(MakeInc_rule) -include $(MakeInc_dir) +$(INSTALL_DTRACE_LIBEXEC_FILES): $(DSTROOT)/$(INSTALL_DTRACE_LIBEXEC_DIR)/% : % + $(_v)$(MKDIR) $(DSTROOT)/$(INSTALL_DTRACE_LIBEXEC_DIR) + @echo INSTALL $(@F) + $(_v)$(INSTALL) $(EXEC_INSTALL_FLAGS) $< $@ +do_textfiles_install:: $(INSTALL_DTRACE_SCRIPTS_FILES) $(INSTALL_DTRACE_LIBEXEC_FILES) +include $(MakeInc_rule) +include $(MakeInc_dir) diff --git a/bsd/dev/dtrace/scripts/log_unnest_badness.d b/bsd/dev/dtrace/scripts/log_unnest_badness.d new file mode 100644 index 000000000..2e8e0d003 --- /dev/null +++ b/bsd/dev/dtrace/scripts/log_unnest_badness.d @@ -0,0 +1,13 @@ +#!/usr/sbin/dtrace -s + +vminfo::log_unnest_badness: +{ + printf("%d[%s]: unexpected unnest(0x%llx, 0x%llx) below 0x%llx", + $pid, + execname, + (uint64_t) arg1, + (uint64_t) arg2, + (uint64_t) arg3); + stack(); + ustack(); +} diff --git a/bsd/dev/dtrace/scripts/regs_arm.d b/bsd/dev/dtrace/scripts/regs_arm.d new file mode 100644 index 000000000..23d3b5387 --- /dev/null +++ b/bsd/dev/dtrace/scripts/regs_arm.d @@ -0,0 +1,53 @@ +/* + * Copyright 2016 Apple, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "@(#)regs.d.in 1.0 04/09/28 SMI" + +inline int R_R0 = 0; +#pragma D binding "1.0" R_R0 +inline int R_R1 = 1; +#pragma D binding "1.0" R_R1 +inline int R_R2 = 2; +#pragma D binding "1.0" R_R2 +inline int R_R3 = 3; +#pragma D binding "1.0" R_R3 +inline int R_R4 = 4; +#pragma D binding "1.0" R_R4 +inline int R_R5 = 5; +#pragma D binding "1.0" R_R5 +inline int R_R6 = 6; +#pragma D binding "1.0" R_R6 +inline int R_R7 = 7; +#pragma D binding "1.0" R_R7 +inline int R_R8 = 8; +#pragma D binding "1.0" R_R8 +inline int R_R9 = 9; +#pragma D binding "1.0" R_R9 +inline int R_R10 = 10; +#pragma D binding "1.0" R_R10 +inline int R_R11 = 11; +#pragma D binding "1.0" R_R11 +inline int R_R12 = 12; +#pragma D binding "1.0" R_R12 +inline int R_R13 = 13; +#pragma D binding "1.0" R_R13 +inline int R_R14 = 14; +#pragma D binding "1.0" R_R14 +inline int R_R15 = 15; +#pragma D binding "1.0" R_R15 + +/* Apple-specific ABI to use R7 as the framepointer */ +inline int R_FP = R_R7; +#pragma D binding "1.0" R_FP + +inline int R_SP = R_R13; +#pragma D binding "1.0" R_SP +inline int R_LR = R_R14; +#pragma D binding "1.0" R_LR +inline int R_PC = R_R15; +#pragma D binding "1.0" R_PC +inline int R_CPSR = 16; +#pragma D binding "1.0" R_CPSR + diff --git a/bsd/dev/dtrace/scripts/regs_arm64.d b/bsd/dev/dtrace/scripts/regs_arm64.d new file mode 100644 index 000000000..8979dea77 --- /dev/null +++ b/bsd/dev/dtrace/scripts/regs_arm64.d @@ -0,0 +1,116 @@ +/* + * Copyright 2016 Apple, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "@(#)regs.d.in 1.0 04/09/28 SMI" + +inline int R_R0 = 0; +#pragma D binding "1.0" R_R0 +inline int R_R1 = 1; +#pragma D binding "1.0" R_R1 +inline int R_R2 = 2; +#pragma D binding "1.0" R_R2 +inline int R_R3 = 3; +#pragma D binding "1.0" R_R3 +inline int R_R4 = 4; +#pragma D binding "1.0" R_R4 +inline int R_R5 = 5; +#pragma D binding "1.0" R_R5 +inline int R_R6 = 6; +#pragma D binding "1.0" R_R6 +inline int R_R7 = 7; +#pragma D binding "1.0" R_R7 +inline int R_R8 = 8; +#pragma D binding "1.0" R_R8 +inline int R_R9 = 9; +#pragma D binding "1.0" R_R9 +inline int R_R10 = 10; +#pragma D binding "1.0" R_R10 +inline int R_R11 = 11; +#pragma D binding "1.0" R_R11 +inline int R_R12 = 12; +#pragma D binding "1.0" R_R12 +inline int R_R13 = 13; +#pragma D binding "1.0" R_R13 +inline int R_R14 = 14; +#pragma D binding "1.0" R_R14 +inline int R_R15 = 15; +#pragma D binding "1.0" R_R15 + +inline int R_X0 = 0; +#pragma D binding "1.0" R_X0 +inline int R_X1 = 1; +#pragma D binding "1.0" R_X1 +inline int R_X2 = 2; +#pragma D binding "1.0" R_X2 +inline int R_X3 = 3; +#pragma D binding "1.0" R_X3 +inline int R_X4 = 4; +#pragma D binding "1.0" R_X4 +inline int R_X5 = 5; +#pragma D binding "1.0" R_X5 +inline int R_X6 = 6; +#pragma D binding "1.0" R_X6 +inline int R_X7 = 7; +#pragma D binding "1.0" R_X7 +inline int R_X8 = 8; +#pragma D binding "1.0" R_X8 +inline int R_X9 = 9; +#pragma D binding "1.0" R_X9 +inline int R_X10 = 10; +#pragma D binding "1.0" R_X10 +inline int R_X11 = 11; +#pragma D binding "1.0" R_X11 +inline int R_X12 = 12; +#pragma D binding "1.0" R_X12 +inline int R_X13 = 13; +#pragma D binding "1.0" R_X13 +inline int R_X14 = 14; +#pragma D binding "1.0" R_X14 +inline int R_X15 = 15; +#pragma D binding "1.0" R_X15 +inline int R_X16 = 16; +#pragma D binding "1.0" R_X16 +inline int R_X17 = 17; +#pragma D binding "1.0" R_X17 +inline int R_X18 = 18; +#pragma D binding "1.0" R_X18 +inline int R_X19 = 19; +#pragma D binding "1.0" R_X19 +inline int R_X20 = 
20; +#pragma D binding "1.0" R_X20 +inline int R_X21 = 21; +#pragma D binding "1.0" R_X21 +inline int R_X22 = 22; +#pragma D binding "1.0" R_X22 +inline int R_X23 = 23; +#pragma D binding "1.0" R_X23 +inline int R_X24 = 24; +#pragma D binding "1.0" R_X24 +inline int R_X25 = 25; +#pragma D binding "1.0" R_X25 +inline int R_X26 = 26; +#pragma D binding "1.0" R_X26 +inline int R_X27 = 27; +#pragma D binding "1.0" R_X27 +inline int R_X28 = 28; +#pragma D binding "1.0" R_X28 +inline int R_X29 = 29; +#pragma D binding "1.0" R_X29 +inline int R_X30 = 30; +#pragma D binding "1.0" R_X30 +inline int R_X31 = 31; +#pragma D binding "1.0" R_X31 + +inline int R_FP = R_X29; +#pragma D binding "1.0" R_FP +inline int R_LR = R_X30; +#pragma D binding "1.0" R_LR +inline int R_SP = R_X31; +#pragma D binding "1.0" R_SP +inline int R_PC = 32; +#pragma D binding "1.0" R_PC +inline int R_CPSR = 33; +#pragma D binding "1.0" R_CPSR + diff --git a/bsd/dev/dtrace/sdt.c b/bsd/dev/dtrace/sdt.c index f31f21be1..a157923ee 100644 --- a/bsd/dev/dtrace/sdt.c +++ b/bsd/dev/dtrace/sdt.c @@ -268,6 +268,14 @@ sdt_enable(void *arg, dtrace_id_t id, void *parg) while (sdp != NULL) { (void)ml_nofault_copy( (vm_offset_t)&sdp->sdp_patchval, (vm_offset_t)sdp->sdp_patchpoint, (vm_size_t)sizeof(sdp->sdp_patchval)); + + /* + * Make the patched instruction visible via a data + instruction + * cache flush on platforms that need it + */ + flush_dcache((vm_offset_t)sdp->sdp_patchpoint,(vm_size_t)sizeof(sdp->sdp_patchval), 0); + invalidate_icache((vm_offset_t)sdp->sdp_patchpoint,(vm_size_t)sizeof(sdp->sdp_patchval), 0); + sdp = sdp->sdp_next; } @@ -291,6 +299,12 @@ sdt_disable(void *arg, dtrace_id_t id, void *parg) while (sdp != NULL) { (void)ml_nofault_copy( (vm_offset_t)&sdp->sdp_savedval, (vm_offset_t)sdp->sdp_patchpoint, (vm_size_t)sizeof(sdp->sdp_savedval)); + /* + * Make the patched instruction visible via a data + instruction + * cache flush on platforms that need it + */ + 
flush_dcache((vm_offset_t)sdp->sdp_patchpoint,(vm_size_t)sizeof(sdp->sdp_savedval), 0); + invalidate_icache((vm_offset_t)sdp->sdp_patchpoint,(vm_size_t)sizeof(sdp->sdp_savedval), 0); sdp = sdp->sdp_next; } @@ -436,7 +450,7 @@ void sdt_init( void ) return; } - if (dtrace_fbt_probes_restricted()) { + if (dtrace_sdt_probes_restricted()) { return; } diff --git a/bsd/dev/dtrace/systrace.c b/bsd/dev/dtrace/systrace.c index c8a6305b1..00ee62d29 100644 --- a/bsd/dev/dtrace/systrace.c +++ b/bsd/dev/dtrace/systrace.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include @@ -139,7 +140,7 @@ dtrace_systrace_syscall(struct proc *pp, void *uap, int *rv) #endif // Bounds "check" the value of code a la unix_syscall - sy = (code >= NUM_SYSENT) ? &systrace_sysent[63] : &systrace_sysent[code]; + sy = (code >= nsysent) ? &systrace_sysent[SYS_invalid] : &systrace_sysent[code]; if ((id = sy->stsy_entry) != DTRACE_IDNONE) { uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread()); @@ -254,7 +255,7 @@ dtrace_systrace_syscall_return(unsigned short code, int rval, int *rv) dtrace_id_t id; // Bounds "check" the value of code a la unix_syscall_return - sy = (code >= NUM_SYSENT) ? &systrace_sysent[63] : &systrace_sysent[code]; + sy = (code >= nsysent) ? 
&systrace_sysent[SYS_invalid] : &systrace_sysent[code]; if ((id = sy->stsy_return) != DTRACE_IDNONE) { uint64_t munged_rv0, munged_rv1; @@ -338,7 +339,7 @@ systrace_init(struct sysent *actual, systrace_sysent_t **interposed) systrace_sysent_t *ssysent = *interposed; /* Avoid sysent shadow warning from bsd/sys/sysent.h */ - int i; + unsigned int i; if (ssysent == NULL) { *interposed = ssysent = kmem_zalloc(sizeof (systrace_sysent_t) * @@ -372,7 +373,7 @@ static void systrace_provide(void *arg, const dtrace_probedesc_t *desc) { #pragma unused(arg) /* __APPLE__ */ - int i; + unsigned int i; if (desc != NULL) return; @@ -943,6 +944,10 @@ void systrace_init( void ); void systrace_init( void ) { if (0 == gSysTraceInited) { + if (dtrace_sdt_probes_restricted()) { + return; + } + int majdevno = cdevsw_add(SYSTRACE_MAJOR, &systrace_cdevsw); if (majdevno < 0) { diff --git a/bsd/dev/i386/conf.c b/bsd/dev/i386/conf.c index 62d62601e..f76413818 100644 --- a/bsd/dev/i386/conf.c +++ b/bsd/dev/i386/conf.c @@ -91,7 +91,7 @@ struct bdevsw bdevsw[] = NO_BDEVICE, /*23*/ }; -int nblkdev = sizeof (bdevsw) / sizeof (bdevsw[0]); +const int nblkdev = sizeof(bdevsw) / sizeof(bdevsw[0]); extern struct tty *km_tty[]; extern d_open_t cnopen; @@ -171,6 +171,17 @@ extern d_write_t fdesc_write; extern d_ioctl_t fdesc_ioctl; extern d_select_t fdesc_select; +extern d_open_t oslog_streamopen; +extern d_close_t oslog_streamclose; +extern d_read_t oslog_streamread; +extern d_ioctl_t oslog_streamioctl; +extern d_select_t oslog_streamselect; + +extern d_open_t oslogopen; +extern d_close_t oslogclose; +extern d_select_t oslogselect; +extern d_ioctl_t oslogioctl; + #define nullopen (d_open_t *)&nulldev #define nullclose (d_close_t *)&nulldev #define nullread (d_read_t *)&nulldev @@ -224,8 +235,16 @@ struct cdevsw cdevsw[] = logioctl, eno_stop, nullreset, 0, logselect, eno_mmap, eno_strat, eno_getc, eno_putc, 0 }, - NO_CDEVICE, /* 7*/ - NO_CDEVICE, /* 8*/ + { + oslogopen, oslogclose, eno_rdwrt, eno_rdwrt, 
/* 7*/ + oslogioctl, eno_stop, nullreset, 0, oslogselect, + eno_mmap, eno_strat, eno_getc, eno_putc, 0 + }, + { + oslog_streamopen, oslog_streamclose, oslog_streamread, eno_rdwrt, /* 8*/ + oslog_streamioctl, eno_stop, nullreset, 0, oslog_streamselect, + eno_mmap, eno_strat, eno_getc, eno_putc, 0 + }, NO_CDEVICE, /* 9*/ NO_CDEVICE, /*10*/ NO_CDEVICE, /*11*/ @@ -285,9 +304,9 @@ struct cdevsw cdevsw[] = eno_mmap, eno_strat, eno_getc, eno_putc, 0 }, }; -int nchrdev = sizeof (cdevsw) / sizeof (cdevsw[0]); +const int nchrdev = sizeof(cdevsw) / sizeof(cdevsw[0]); -uint64_t cdevsw_flags[sizeof (cdevsw) / sizeof (cdevsw[0])]; +uint64_t cdevsw_flags[sizeof(cdevsw) / sizeof(cdevsw[0])]; #include /* for VCHR and VBLK */ /* diff --git a/bsd/dev/i386/dis_tables.c b/bsd/dev/i386/dis_tables.c index b57481d27..03f3c6197 100644 --- a/bsd/dev/i386/dis_tables.c +++ b/bsd/dev/i386/dis_tables.c @@ -2126,7 +2126,7 @@ dtrace_imm_opnd(dis86_t *x, int wbit, int size, int opindex) { int i; int byte; - int valsize; + int valsize = 0; if (x->d86_numopnds < (uint_t)opindex + 1) x->d86_numopnds = (uint_t)opindex + 1; diff --git a/bsd/dev/i386/fbt_x86.c b/bsd/dev/i386/fbt_x86.c index 93c19c561..541cad6e9 100644 --- a/bsd/dev/i386/fbt_x86.c +++ b/bsd/dev/i386/fbt_x86.c @@ -174,8 +174,11 @@ static const char * probe_ctx_closure[] = "Debugger", "IS_64BIT_PROCESS", "OSCompareAndSwap", + "_disable_preemption", + "_enable_preemption", "absolutetime_to_microtime", "act_set_astbsd", + "ast_dtrace_on", "ast_pending", "clock_get_calendar_nanotime_nowait", "copyin", @@ -235,16 +238,16 @@ static int _cmp(const void *a, const void *b) } static const void * bsearch( - register const void *key, + const void *key, const void *base0, size_t nmemb, - register size_t size, - register int (*compar)(const void *, const void *)) { + size_t size, + int (*compar)(const void *, const void *)) { - register const char *base = base0; - register size_t lim; - register int cmp; - register const void *p; + const char *base = 
base0; + size_t lim; + int cmp; + const void *p; for (lim = nmemb; lim != 0; lim >>= 1) { p = base + (lim >> 1) * size; @@ -469,9 +472,6 @@ is_symbol_valid(const char* name) */ if (LIT_STRNSTART(name, "kdp_") || LIT_STRNSTART(name, "kdb_") || - LIT_STRNSTART(name, "kdbg_") || - LIT_STRNSTART(name, "kdebug_") || - LIT_STRNSTART(name, "kernel_debug") || LIT_STRNSTART(name, "debug_") || LIT_STRNEQL(name, "Debugger") || LIT_STRNEQL(name, "Call_DebuggerC") || diff --git a/bsd/dev/i386/sysctl.c b/bsd/dev/i386/sysctl.c index 2e4672f23..f80926b6d 100644 --- a/bsd/dev/i386/sysctl.c +++ b/bsd/dev/i386/sysctl.c @@ -39,6 +39,7 @@ #include #include #include +#include static int @@ -285,6 +286,7 @@ misc_interrupt_latency_max(__unused struct sysctl_oid *oidp, __unused void *arg1 return error; } +#if DEVELOPMENT || DEBUG /* * Triggers a machine-check exception - for a suitably configured kernel only. */ @@ -304,6 +306,30 @@ misc_machine_check_panic(__unused struct sysctl_oid *oidp, __unused void *arg1, return error; } +/* + * Triggers a non-responsive processor timeout panic - for a suitably configured kernel only. 
+ */ +static uint64_t kernel_timeout_spin = 0; +static int +misc_kernel_timeout_spin(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) +{ + uint64_t old_value; + uint64_t new_value; + int changed = 0, error; + char buf[128]; + buf[0] = '\0'; + + absolutetime_to_nanoseconds(kernel_timeout_spin, &old_value); + + error = sysctl_io_number(req, old_value, sizeof(uint64_t), &new_value, &changed); + if (error == 0 && changed) { + nanoseconds_to_absolutetime(((uint64_t)new_value), &kernel_timeout_spin); + kernel_spin(kernel_timeout_spin); + } + return error; +} +#endif /* DEVELOPMENT || DEBUG */ + SYSCTL_NODE(_machdep, OID_AUTO, cpu, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "CPU info"); @@ -796,19 +822,61 @@ SYSCTL_PROC(_machdep_misc, OID_AUTO, interrupt_latency_max, 0, 0, misc_interrupt_latency_max, "A", "Maximum Interrupt latency"); +#if DEVELOPMENT || DEBUG SYSCTL_PROC(_machdep_misc, OID_AUTO, machine_check_panic, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, misc_machine_check_panic, "A", "Machine-check exception test"); -#if DEVELOPMENT || DEBUG +SYSCTL_PROC(_machdep_misc, OID_AUTO, kernel_timeout_spin, + CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, + 0, sizeof(kernel_timeout_spin), + misc_kernel_timeout_spin, "Q", "Kernel timeout panic test"); + SYSCTL_QUAD(_machdep, OID_AUTO, reportphyreadabs, CTLFLAG_KERN | CTLFLAG_RW | CTLFLAG_LOCKED, &reportphyreaddelayabs, ""); SYSCTL_INT(_machdep, OID_AUTO, reportphyreadosbt, CTLFLAG_KERN | CTLFLAG_RW | CTLFLAG_LOCKED, &reportphyreadosbt, 0, ""); -#endif + +extern int pmap_pagezero_mitigation; +extern int pmap_asserts_enabled, pmap_asserts_traced; +/* On DEV/DEBUG kernels, clear this to disable the SMAP emulation + * (address space disconnect) for pagezero-less processes. 
+ */ +SYSCTL_INT(_machdep, OID_AUTO, pmap_pagezero_mitigation, + CTLFLAG_KERN | CTLFLAG_RW | CTLFLAG_LOCKED, + &pmap_pagezero_mitigation, 0, ""); +/* Toggle pmap assertions */ +SYSCTL_INT(_machdep, OID_AUTO, pmap_asserts, + CTLFLAG_KERN | CTLFLAG_RW | CTLFLAG_LOCKED, + &pmap_asserts_enabled, 0, ""); +/* Transform pmap assertions into kernel trace terminations */ +SYSCTL_INT(_machdep, OID_AUTO, pmap_asserts_traced, + CTLFLAG_KERN | CTLFLAG_RW | CTLFLAG_LOCKED, + &pmap_asserts_traced, 0, ""); + +static int +misc_svisor_read(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) +{ + uint64_t new_value = 0, old_value = 0; + int changed = 0, error; + + error = sysctl_io_number(req, old_value, sizeof(uint64_t), &new_value, &changed); + if ((error == 0) && changed) { + volatile uint32_t *raddr = (uint32_t *) new_value; + printf("Supervisor: value at 0x%llx is 0x%x\n", new_value, *raddr); + } + return error; +} + +SYSCTL_PROC(_machdep_misc, OID_AUTO, misc_svisor_read, + CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, + 0, 0, + misc_svisor_read, "I", "supervisor mode read"); + +#endif /* DEVELOPMENT || DEBUG */ extern void timer_queue_trace_cpu(int); static int @@ -885,3 +953,11 @@ extern uint64_t x86_isr_fp_simd_use; SYSCTL_QUAD(_machdep, OID_AUTO, x86_fp_simd_isr_uses, CTLFLAG_KERN | CTLFLAG_RW | CTLFLAG_LOCKED, &x86_isr_fp_simd_use, ""); +#if DEVELOPMENT || DEBUG + +extern int plctrace_enabled; + +SYSCTL_INT(_machdep, OID_AUTO, pltrace, + CTLFLAG_KERN | CTLFLAG_RW | CTLFLAG_LOCKED, + &plctrace_enabled, 0, ""); +#endif /* DEVELOPMENT || DEBUG */ diff --git a/bsd/dev/i386/systemcalls.c b/bsd/dev/i386/systemcalls.c index 2c7e93ea2..e6e995ac6 100644 --- a/bsd/dev/i386/systemcalls.c +++ b/bsd/dev/i386/systemcalls.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -80,6 +80,7 @@ extern const char *syscallnames[]; * * Outputs: none */ +__attribute__((noreturn)) void unix_syscall(x86_saved_state_t *state) { @@ -115,28 +116,19 @@ unix_syscall(x86_saved_state_t *state) else p = (struct proc *)get_bsdtask_info(current_task()); - /* Verify that we are not being called from a task without a proc */ - if (__improbable(p == NULL)) { - regs->eax = EPERM; - regs->efl |= EFL_CF; - task_terminate_internal(current_task()); - thread_exception_return(); - /* NOTREACHED */ - } - code = regs->eax & I386_SYSCALL_NUMBER_MASK; DEBUG_KPRINT_SYSCALL_UNIX("unix_syscall: code=%d(%s) eip=%u\n", - code, syscallnames[code >= NUM_SYSENT ? 63 : code], (uint32_t)regs->eip); + code, syscallnames[code >= nsysent ? SYS_invalid : code], (uint32_t)regs->eip); params = (vm_offset_t) (regs->uesp + sizeof (int)); regs->efl &= ~(EFL_CF); - callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code]; + callp = (code >= nsysent) ? &sysent[SYS_invalid] : &sysent[code]; if (__improbable(callp == sysent)) { code = fuword(params); params += sizeof(int); - callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code]; + callp = (code >= nsysent) ? &sysent[SYS_invalid] : &sysent[code]; } vt = (void *)uthread->uu_arg; @@ -266,7 +258,7 @@ unix_syscall(x86_saved_state_t *state) /* NOTREACHED */ } - +__attribute__((noreturn)) void unix_syscall64(x86_saved_state_t *state) { @@ -312,8 +304,8 @@ unix_syscall64(x86_saved_state_t *state) code = regs->rax & SYSCALL_NUMBER_MASK; DEBUG_KPRINT_SYSCALL_UNIX( "unix_syscall64: code=%d(%s) rip=%llx\n", - code, syscallnames[code >= NUM_SYSENT ? 63 : code], regs->isf.rip); - callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code]; + code, syscallnames[code >= nsysent ? SYS_invalid : code], regs->isf.rip); + callp = (code >= nsysent) ? 
&sysent[SYS_invalid] : &sysent[code]; vt = (void *)uthread->uu_arg; @@ -323,7 +315,7 @@ unix_syscall64(x86_saved_state_t *state) * passed as 'arg0' */ code = regs->rdi; - callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code]; + callp = (code >= nsysent) ? &sysent[SYS_invalid] : &sysent[code]; args_start_at_rdi = FALSE; args_in_regs = 5; } else { @@ -488,7 +480,7 @@ unix_syscall_return(int error) regs = saved_state64(find_user_regs(thread)); code = uthread->syscall_code; - callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code]; + callp = (code >= nsysent) ? &sysent[SYS_invalid] : &sysent[code]; #if CONFIG_DTRACE if (callp->sy_call == dtrace_systrace_syscall) @@ -545,7 +537,7 @@ unix_syscall_return(int error) regs->efl &= ~(EFL_CF); code = uthread->syscall_code; - callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code]; + callp = (code >= nsysent) ? &sysent[SYS_invalid] : &sysent[code]; #if CONFIG_DTRACE if (callp->sy_call == dtrace_systrace_syscall) @@ -591,4 +583,3 @@ unix_syscall_return(int error) thread_exception_return(); /* NOTREACHED */ } - diff --git a/bsd/dev/i386/unix_signal.c b/bsd/dev/i386/unix_signal.c index 4f31f83e5..bb073b026 100644 --- a/bsd/dev/i386/unix_signal.c +++ b/bsd/dev/i386/unix_signal.c @@ -63,7 +63,7 @@ /* Forward: */ extern boolean_t machine_exception(int, mach_exception_code_t, mach_exception_subcode_t, int *, mach_exception_subcode_t *); -extern kern_return_t thread_getstatus(register thread_t act, int flavor, +extern kern_return_t thread_getstatus(thread_t act, int flavor, thread_state_t tstate, mach_msg_type_number_t *count); extern kern_return_t thread_setstatus(thread_t thread, int flavor, thread_state_t tstate, mach_msg_type_number_t count); diff --git a/bsd/dev/mem.c b/bsd/dev/mem.c index bf7538b01..2b133e64e 100644 --- a/bsd/dev/mem.c +++ b/bsd/dev/mem.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2014 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -233,7 +233,6 @@ mmrw(dev_t dev, struct uio *uio, enum uio_rw rw) continue; /* Keep going until UIO is done */ default: return (ENODEV); - break; } if (error) diff --git a/bsd/dev/memdev.c b/bsd/dev/memdev.c index ac6dd485e..434fcbdf7 100644 --- a/bsd/dev/memdev.c +++ b/bsd/dev/memdev.c @@ -110,17 +110,19 @@ static int mdevcioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct p static int mdevrw(dev_t dev, struct uio *uio, int ioflag); #ifdef CONFIG_MEMDEV_INSECURE - static char * nonspace(char *pos, char *end); static char * getspace(char *pos, char *end); static char * cvtnum(char *pos, char *end, uint64_t *num); - #endif /* CONFIG_MEMDEV_INSECURE */ extern void bcopy_phys(addr64_t from, addr64_t to, vm_size_t bytes); extern void mapping_set_mod(ppnum_t pn); extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va); +/* + * Maximal number of memory devices. + */ +#define NB_MAX_MDEVICES (16) /* * cdevsw @@ -164,7 +166,7 @@ struct mdev { int mdCDev; /* Character device number */ void * mdbdevb; void * mdcdevb; -} mdev[16]; +} mdev[NB_MAX_MDEVICES]; /* mdFlags */ #define mdInited 0x01 /* This device defined */ @@ -174,10 +176,11 @@ struct mdev { int mdevBMajor = -1; int mdevCMajor = -1; -static int mdevioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p, int is_char); -dev_t mdevadd(int devid, uint64_t base, unsigned int size, int phys); -dev_t mdevlookup(int devid); -void mdevremoveall(void); +static int mdevioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p, int is_char); +dev_t mdevadd(int devid, uint64_t base, unsigned int size, int phys); +dev_t mdevlookup(int devid); +void mdevremoveall(void); +int mdevgetrange(int devid, uint64_t *base, uint64_t *size); static int mdevclose(__unused dev_t dev, __unused int flags, __unused int devtype, __unused struct proc *p) { @@ -191,7 +194,7 @@ static int mdevopen(dev_t dev, int flags, __unused int devtype, __unused struct devid = 
minor(dev); /* Get minor device number */ - if (devid >= 16) return (ENXIO); /* Not valid */ + if (devid >= NB_MAX_MDEVICES) return (ENXIO); /* Not valid */ if ((flags & FWRITE) && (mdev[devid].mdFlags & mdRO)) return (EACCES); /* Currently mounted RO */ @@ -206,7 +209,7 @@ static int mdevrw(dev_t dev, struct uio *uio, __unused int ioflag) { devid = minor(dev); /* Get minor device number */ - if (devid >= 16) return (ENXIO); /* Not valid */ + if (devid >= NB_MAX_MDEVICES) return (ENXIO); /* Not valid */ if (!(mdev[devid].mdFlags & mdInited)) return (ENXIO); /* Have we actually been defined yet? */ mdata = ((addr64_t)mdev[devid].mdBase << 12) + uio->uio_offset; /* Point to the area in "file" */ @@ -358,7 +361,7 @@ static int mdevioctl(dev_t dev, u_long cmd, caddr_t data, __unused int flag, devid = minor(dev); /* Get minor device number */ - if (devid >= 16) return (ENXIO); /* Not valid */ + if (devid >= NB_MAX_MDEVICES) return (ENXIO); /* Not valid */ error = proc_suser(p); /* Are we superman? */ if (error) return (error); /* Nope... */ @@ -434,7 +437,7 @@ static int mdevsize(dev_t dev) { int devid; devid = minor(dev); /* Get minor device number */ - if (devid >= 16) return (ENXIO); /* Not valid */ + if (devid >= NB_MAX_MDEVICES) return (ENXIO); /* Not valid */ if ((mdev[devid].mdFlags & mdInited) == 0) return(-1); /* Not inited yet */ @@ -562,7 +565,7 @@ dev_t mdevadd(int devid, uint64_t base, unsigned int size, int phys) { if(devid < 0) { devid = -1; - for(i = 0; i < 16; i++) { /* Search all known memory devices */ + for(i = 0; i < NB_MAX_MDEVICES; i++) { /* Search all known memory devices */ if(!(mdev[i].mdFlags & mdInited)) { /* Is this a free one? */ if(devid < 0)devid = i; /* Remember first free one */ continue; /* Skip check */ @@ -572,11 +575,11 @@ dev_t mdevadd(int devid, uint64_t base, unsigned int size, int phys) { } } if(devid < 0) { /* Do we have free slots? 
*/ - panic("mdevadd: attempt to add more than 16 memory devices\n"); + panic("mdevadd: attempt to add more than %d memory devices\n", NB_MAX_MDEVICES); } } else { - if(devid >= 16) { /* Giving us something bogus? */ + if(devid >= NB_MAX_MDEVICES) { /* Giving us something bogus? */ panic("mdevadd: attempt to explicitly add a bogus memory device: %08X\n", devid); } if(mdev[devid].mdFlags & mdInited) { /* Already there? */ @@ -631,7 +634,7 @@ dev_t mdevadd(int devid, uint64_t base, unsigned int size, int phys) { dev_t mdevlookup(int devid) { - if((devid < 0) || (devid > 15)) return -1; /* Filter any bogus requests */ + if((devid < 0) || (devid >= NB_MAX_MDEVICES)) return -1; /* Filter any bogus requests */ if(!(mdev[devid].mdFlags & mdInited)) return -1; /* This one hasn't been defined */ return mdev[devid].mdBDev; /* Return the device number */ } @@ -640,7 +643,7 @@ void mdevremoveall(void) { int i; - for(i = 0; i < 16; i++) { + for(i = 0; i < NB_MAX_MDEVICES; i++) { if(!(mdev[i].mdFlags & mdInited)) continue; /* Ignore unused mdevs */ devfs_remove(mdev[i].mdbdevb); /* Remove the block device */ @@ -656,3 +659,29 @@ void mdevremoveall(void) { mdev[i].mdcdevb = 0; } } + +int +mdevgetrange(int devid, uint64_t *base, uint64_t *size) +{ + assert(base); + assert(size); + + /* filter invalid request */ + if ((devid < 0) || (devid >= NB_MAX_MDEVICES)) { + return -1; + } + + /* filter non-initialized memory devices */ + if ((mdev[devid].mdFlags & mdInited) == 0) { + return -1; + } + + *base = mdev[devid].mdBase << 12; + *size = mdev[devid].mdSize << 12; + + /* make sure (base, size) is a valid range and will not overflow */ + assert(*size < (UINT64_MAX - *base)); + + return 0; +} + diff --git a/bsd/dev/munge.c b/bsd/dev/munge.c index edd1b7273..227720935 100644 --- a/bsd/dev/munge.c +++ b/bsd/dev/munge.c @@ -536,6 +536,19 @@ munge_lwww(void *args) out_args[0] = *(volatile uint64_t*)&in_args[0]; } +void +munge_wwlww(void *args) +{ + volatile uint64_t *out_args = (volatile 
uint64_t*)args; + volatile uint32_t *in_args = (volatile uint32_t*)args; + + out_args[4] = in_args[5]; + out_args[3] = in_args[4]; + out_args[2] = *(volatile uint64_t*)&in_args[2]; + out_args[1] = in_args[1]; + out_args[0] = in_args[0]; +} + void munge_wwlwww(void *args) { @@ -550,6 +563,35 @@ munge_wwlwww(void *args) out_args[0] = in_args[0]; } +void +munge_wlwwwl(void *args) +{ + volatile uint64_t *out_args = (volatile uint64_t*)args; + volatile uint32_t *in_args = (volatile uint32_t*)args; + + out_args[5] = *(volatile uint64_t*)&in_args[6]; + out_args[4] = in_args[5]; + out_args[3] = in_args[4]; + out_args[2] = in_args[3]; + out_args[1] = *(volatile uint64_t*)&in_args[1]; + out_args[0] = in_args[0]; +} + +void +munge_wwlwwwl(void *args) +{ + volatile uint64_t *out_args = (volatile uint64_t*)args; + volatile uint32_t *in_args = (volatile uint32_t*)args; + + out_args[6] = *(volatile uint64_t*)&in_args[7]; + out_args[5] = in_args[6]; + out_args[4] = in_args[5]; + out_args[3] = in_args[4]; + out_args[2] = *(volatile uint64_t*)&in_args[2]; + out_args[1] = in_args[1]; + out_args[0] = in_args[0]; +} + /* * Munge array of 32-bit values into an array of 64-bit values, * without sign extension. Note, src and dest can be the same diff --git a/bsd/dev/random/Makefile b/bsd/dev/random/Makefile index 412ca1e5f..86934de44 100644 --- a/bsd/dev/random/Makefile +++ b/bsd/dev/random/Makefile @@ -17,5 +17,3 @@ EXPORT_MI_DIR = dev/random include $(MakeInc_rule) include $(MakeInc_dir) - - diff --git a/bsd/dev/random/randomdev.c b/bsd/dev/random/randomdev.c index 1b96f774c..6482f6094 100644 --- a/bsd/dev/random/randomdev.c +++ b/bsd/dev/random/randomdev.c @@ -2,7 +2,7 @@ * Copyright (c) 1999-2009 Apple, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). 
You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ @@ -40,6 +40,7 @@ #include #include #include +#include #include #include @@ -96,7 +97,7 @@ random_init(void) UID_ROOT, GID_WHEEL, 0666, "random", 0); /* - * also make urandom + * also make urandom * (which is exactly the same thing in our context) */ devfs_make_node(makedev (ret, URANDOM_MINOR), DEVFS_CHAR, @@ -105,7 +106,7 @@ random_init(void) } int -random_ioctl( __unused dev_t dev, u_long cmd, __unused caddr_t data, +random_ioctl( __unused dev_t dev, u_long cmd, __unused caddr_t data, __unused int flag, __unused struct proc *p ) { switch (cmd) { @@ -123,7 +124,7 @@ random_ioctl( __unused dev_t dev, u_long cmd, __unused caddr_t data, * Open the device. Make sure init happened, and make sure the caller is * authorized. */ - + int random_open(__unused dev_t dev, int flags, __unused int devtype, __unused struct proc *p) { @@ -147,7 +148,7 @@ random_open(__unused dev_t dev, int flags, __unused int devtype, __unused struct /* * close the device. 
*/ - + int random_close(__unused dev_t dev, __unused int flags, __unused int mode, __unused struct proc *p) { @@ -187,7 +188,7 @@ random_write (dev_t dev, struct uio *uio, __unused int ioflag) /* * return data to the caller. Results unpredictable. - */ + */ int random_read(__unused dev_t dev, struct uio *uio, __unused int ioflag) { @@ -199,14 +200,14 @@ random_read(__unused dev_t dev, struct uio *uio, __unused int ioflag) int bytesToRead = MIN(bytes_remaining, (user_ssize_t) sizeof(buffer)); read_random(buffer, bytesToRead); - + retCode = uiomove(buffer, bytesToRead, uio); if (retCode != 0) break; - + bytes_remaining = uio_resid(uio); } - + return retCode; } @@ -221,3 +222,21 @@ RandomULong(void) return (buf); } + +int +getentropy(__unused struct proc * p, struct getentropy_args *gap, __unused int * ret) { + user_addr_t user_addr; + uint32_t user_size; + char buffer[256]; + + user_addr = (vm_map_offset_t)gap->buffer; + user_size = gap->size; + /* Can't request more than 256 random bytes + * at once. Complying with openbsd getentropy() + */ + if (user_size > sizeof(buffer)) { + return EINVAL; + } + read_random(buffer, user_size); + return copyout(buffer, user_addr, user_size); +} diff --git a/bsd/dev/unix_startup.c b/bsd/dev/unix_startup.c index 25c3610d0..98cbcce99 100644 --- a/bsd/dev/unix_startup.c +++ b/bsd/dev/unix_startup.c @@ -350,6 +350,10 @@ bsd_scale_setup(int scale) max_cached_sock_count = 60000 + ((scale-1) * 15000); } } + + if(maxproc > hard_maxproc) { + hard_maxproc = maxproc; + } #endif bsd_exec_setup(scale); } diff --git a/bsd/hfs/MacOSStubs.c b/bsd/hfs/MacOSStubs.c deleted file mode 100644 index 0a1bded31..000000000 --- a/bsd/hfs/MacOSStubs.c +++ /dev/null @@ -1,168 +0,0 @@ -/* - * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -#include -#include -#include -#include -#include - -#include - -#include "hfs.h" -#include "hfs_dbg.h" -#include "hfscommon/headers/FileMgrInternal.h" - -/* - * gTimeZone should only be used for HFS volumes! - * It is initialized when an HFS volume is mounted. - */ -struct timezone gTimeZone = {8*60,1}; - -/* - * GetTimeUTC - get the GMT Mac OS time (in seconds since 1/1/1904) - * - * called by the Catalog Manager when creating/updating HFS Plus records - */ -u_int32_t GetTimeUTC(void) -{ - struct timeval tv; - - microtime(&tv); - - return (tv.tv_sec + MAC_GMT_FACTOR); -} - - -/* - * LocalToUTC - convert from Mac OS local time to Mac OS GMT time. 
- * This should only be called for HFS volumes (not for HFS Plus). - */ -u_int32_t LocalToUTC(u_int32_t localTime) -{ - u_int32_t gtime = localTime; - - if (gtime != 0) { - gtime += (gTimeZone.tz_minuteswest * 60); - /* - * We no longer do DST adjustments here since we don't - * know if time supplied needs adjustment! - * - * if (gTimeZone.tz_dsttime) - * gtime -= 3600; - */ - } - return (gtime); -} - -/* - * UTCToLocal - convert from Mac OS GMT time to Mac OS local time. - * This should only be called for HFS volumes (not for HFS Plus). - */ -u_int32_t UTCToLocal(u_int32_t utcTime) -{ - u_int32_t ltime = utcTime; - - if (ltime != 0) { - ltime -= (gTimeZone.tz_minuteswest * 60); - /* - * We no longer do DST adjustments here since we don't - * know if time supplied needs adjustment! - * - * if (gTimeZone.tz_dsttime) - * ltime += 3600; - */ - } - return (ltime); -} - -/* - * to_bsd_time - convert from Mac OS time (seconds since 1/1/1904) - * to BSD time (seconds since 1/1/1970) - */ -time_t to_bsd_time(u_int32_t hfs_time) -{ - u_int32_t gmt = hfs_time; - - if (gmt > MAC_GMT_FACTOR) - gmt -= MAC_GMT_FACTOR; - else - gmt = 0; /* don't let date go negative! 
*/ - - return (time_t)gmt; -} - -/* - * to_hfs_time - convert from BSD time (seconds since 1/1/1970) - * to Mac OS time (seconds since 1/1/1904) - */ -u_int32_t to_hfs_time(time_t bsd_time) -{ - u_int32_t hfs_time = (u_int32_t)bsd_time; - - /* don't adjust zero - treat as uninitialzed */ - if (hfs_time != 0) - hfs_time += MAC_GMT_FACTOR; - - return (hfs_time); -} - - -Ptr NewPtrSysClear (Size byteCount) -{ - Ptr tmptr; - MALLOC (tmptr, Ptr, byteCount, M_TEMP, M_WAITOK); - if (tmptr) - bzero(tmptr, byteCount); - return tmptr; -} - - - -Ptr NewPtr (Size byteCount) -{ - Ptr tmptr; - MALLOC (tmptr, Ptr, byteCount, M_TEMP, M_WAITOK); - return tmptr; -} - - -void DisposePtr (Ptr p) -{ - FREE (p, M_TEMP); -} - - -void -DebugStr( - const char * debuggerMsg - ) -{ - kprintf ("*** Mac OS Debugging Message: %s\n", debuggerMsg); - DEBUG_BREAK; -} - diff --git a/bsd/hfs/Makefile b/bsd/hfs/Makefile deleted file mode 100644 index ccf82f04f..000000000 --- a/bsd/hfs/Makefile +++ /dev/null @@ -1,32 +0,0 @@ -export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd -export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def -export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule -export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - - -include $(MakeInc_cmd) -include $(MakeInc_def) - -DATAFILES = \ - hfs_encodings.h hfs_format.h hfs_mount.h hfs_unistr.h - -PRIVATE_DATAFILES = \ - hfs.h hfs_attrlist.h hfs_catalog.h hfs_cnode.h hfs_endian.h \ - hfs_fsctl.h hfs_macos_defs.h hfs_quota.h rangelist.h - -KERNELFILES = ${DATAFILES} - -INSTALL_MI_LIST = ${DATAFILES} - -INSTALL_MI_DIR = hfs - -EXPORT_MI_LIST = ${KERNELFILES} - -EXPORT_MI_DIR = hfs - -INSTALL_MI_LCL_LIST = ${PRIVATE_DATAFILES} - -include $(MakeInc_rule) -include $(MakeInc_dir) - - diff --git a/bsd/hfs/hfs.h b/bsd/hfs/hfs.h deleted file mode 100644 index b19e23208..000000000 --- a/bsd/hfs/hfs.h +++ /dev/null @@ -1,1086 +0,0 @@ -/* - * Copyright (c) 2000-2015 Apple Inc. All rights reserved. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#ifndef __HFS__ -#define __HFS__ - -/* If set to 1, enables the code to allocate blocks from the start - * of the disk instead of the nextAllocation for sparse devices like - * sparse disk images or sparsebundle images. The free extent cache - * for such volumes is also maintained based on the start block instead - * of number of contiguous allocation blocks. These devices prefer - * allocation of blocks near the start of the disk to avoid the - * increasing the image size, but it can also result in file fragmentation. 
- */ -#define HFS_SPARSE_DEV 1 - -#if DEBUG -#define HFS_CHECK_LOCK_ORDER 1 -#endif - -#define HFS_TMPDBG 0 - -#include - -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include - -#include -#include -#include -#include -#include -#include -#include - -#if CONFIG_PROTECT -/* Forward declare the cprotect struct */ -struct cprotect; - - -#endif - -/* - * Just reported via MIG interface. - */ -#define VERSION_STRING "hfs-2 (4-12-99)" - -#define HFS_LINK_MAX 32767 - -#define HFS_MAX_DEFERED_ALLOC (1024*1024) - -// 400 megs is a "big" file (i.e. one that when deleted -// would touch enough data that we should break it into -// multiple separate transactions) -#define HFS_BIGFILE_SIZE (400LL * 1024LL * 1024LL) - - -enum { kMDBSize = 512 }; /* Size of I/O transfer to read entire MDB */ - -enum { kMasterDirectoryBlock = 2 }; /* MDB offset on disk in 512-byte blocks */ -enum { kMDBOffset = kMasterDirectoryBlock * 512 }; /* MDB offset on disk in bytes */ - -#define kRootDirID kHFSRootFolderID - - -/* number of locked buffer caches to hold for b-tree meta data */ -#define kMaxLockedMetaBuffers 32 - -extern struct timezone gTimeZone; - - -/* How many free extents to cache per volume */ -#define kMaxFreeExtents 10 - -/* The maximum time hfs locks can be held while performing hfs statistics gathering */ -#define HFS_FSINFO_MAX_LOCKHELD_TIME 20 * 1000000ULL /* at most 20 milliseconds. */ - -/* - * HFS_MINFREE gives the minimum acceptable percentage - * of file system blocks which may be free (but this - * minimum will never exceed HFS_MAXRESERVE bytes). If - * the free block count drops below this level only the - * superuser may continue to allocate blocks. 
- */ -#define HFS_MINFREE 1 -#define HFS_MAXRESERVE ((u_int64_t)(250*1024*1024)) -#define HFS_BT_MAXRESERVE ((u_int64_t)(10*1024*1024)) - -/* - * The system distinguishes between the desirable low-disk - * notifiaction levels for root volumes and non-root volumes. - * The various thresholds are computed as a fraction of the - * volume size, all capped at a certain fixed level - */ - -#define HFS_ROOTVERYLOWDISKTRIGGERFRACTION 5 -#define HFS_ROOTVERYLOWDISKTRIGGERLEVEL ((u_int64_t)(512*1024*1024)) -#define HFS_ROOTLOWDISKTRIGGERFRACTION 10 -#define HFS_ROOTLOWDISKTRIGGERLEVEL ((u_int64_t)(1024*1024*1024)) -#define HFS_ROOTLOWDISKSHUTOFFFRACTION 11 -#define HFS_ROOTLOWDISKSHUTOFFLEVEL ((u_int64_t)(1024*1024*1024 + 250*1024*1024)) - -#define HFS_VERYLOWDISKTRIGGERFRACTION 1 -#define HFS_VERYLOWDISKTRIGGERLEVEL ((u_int64_t)(100*1024*1024)) -#define HFS_LOWDISKTRIGGERFRACTION 2 -#define HFS_LOWDISKTRIGGERLEVEL ((u_int64_t)(150*1024*1024)) -#define HFS_LOWDISKSHUTOFFFRACTION 3 -#define HFS_LOWDISKSHUTOFFLEVEL ((u_int64_t)(200*1024*1024)) - -/* Internal Data structures*/ - -/* This structure describes the HFS specific mount structure data. */ -typedef struct hfsmount { - u_int32_t hfs_flags; /* see below */ - - /* Physical Description */ - u_int32_t hfs_logical_block_size; /* Logical block size of the disk as reported by ioctl(DKIOCGETBLOCKSIZE), always a multiple of 512 */ - daddr64_t hfs_logical_block_count; /* Number of logical blocks on the disk, as reported by ioctl(DKIOCGETBLOCKCOUNT) */ - u_int64_t hfs_logical_bytes; /* Number of bytes on the disk device this HFS is mounted on (blockcount * blocksize) */ - /* - * Regarding the two AVH sector fields below: - * Under normal circumstances, the filesystem's notion of the "right" location for the AVH is such that - * the partition and filesystem's are in sync. However, during a filesystem resize, HFS proactively - * writes a new AVH at the end of the filesystem, assuming that the partition will be resized accordingly. 
- * - * However, it is not technically a corruption if the partition size is never modified. As a result, we need - * to keep two copies of the AVH around "just in case" the partition size is not modified. - */ - daddr64_t hfs_partition_avh_sector; /* location of Alt VH w.r.t partition size */ - daddr64_t hfs_fs_avh_sector; /* location of Alt VH w.r.t filesystem size */ - - u_int32_t hfs_physical_block_size; /* Physical block size of the disk as reported by ioctl(DKIOCGETPHYSICALBLOCKSIZE) */ - u_int32_t hfs_log_per_phys; /* Number of logical blocks per physical block size */ - - /* Access to VFS and devices */ - struct mount *hfs_mp; /* filesystem vfs structure */ - struct vnode *hfs_devvp; /* block device mounted vnode */ - struct vnode * hfs_extents_vp; - struct vnode * hfs_catalog_vp; - struct vnode * hfs_allocation_vp; - struct vnode * hfs_attribute_vp; - struct vnode * hfs_startup_vp; - struct vnode * hfs_attrdata_vp; /* pseudo file */ - struct cnode * hfs_extents_cp; - struct cnode * hfs_catalog_cp; - struct cnode * hfs_allocation_cp; - struct cnode * hfs_attribute_cp; - struct cnode * hfs_startup_cp; - dev_t hfs_raw_dev; /* device mounted */ - u_int32_t hfs_logBlockSize; /* Size of buffer cache buffer for I/O */ - - /* Default values for HFS standard and non-init access */ - uid_t hfs_uid; /* uid to set as owner of the files */ - gid_t hfs_gid; /* gid to set as owner of the files */ - mode_t hfs_dir_mask; /* mask to and with directory protection bits */ - mode_t hfs_file_mask; /* mask to and with file protection bits */ - u_int32_t hfs_encoding; /* Default encoding for non hfs+ volumes */ - - /* Persistent fields (on disk, dynamic) */ - time_t hfs_mtime; /* file system last modification time */ - u_int32_t hfs_filecount; /* number of files in file system */ - u_int32_t hfs_dircount; /* number of directories in file system */ - u_int32_t freeBlocks; /* free allocation blocks */ - u_int32_t reclaimBlocks; /* number of blocks we are reclaiming during resize */ 
- u_int32_t tentativeBlocks; /* tentative allocation blocks -- see note below */ - u_int32_t nextAllocation; /* start of next allocation search */ - u_int32_t sparseAllocation; /* start of allocations for sparse devices */ - u_int32_t vcbNxtCNID; /* next unused catalog node ID - protected by catalog lock */ - u_int32_t vcbWrCnt; /* file system write count */ - u_int64_t encodingsBitmap; /* in-use encodings */ - u_int16_t vcbNmFls; /* HFS Only - root dir file count */ - u_int16_t vcbNmRtDirs; /* HFS Only - root dir directory count */ - - /* Persistent fields (on disk, static) */ - u_int16_t vcbSigWord; - - // Volume will be inconsistent if header is not flushed - bool hfs_header_dirty; - - // Volume header is dirty, but won't be inconsistent if not flushed - bool hfs_header_minor_change; - - u_int32_t vcbAtrb; - u_int32_t vcbJinfoBlock; - u_int32_t localCreateDate;/* volume create time from volume header (For HFS+, value is in local time) */ - time_t hfs_itime; /* file system creation time (creation date of the root folder) */ - time_t hfs_btime; /* file system last backup time */ - u_int32_t blockSize; /* size of allocation blocks */ - u_int32_t totalBlocks; /* total allocation blocks */ - u_int32_t allocLimit; /* Do not allocate this block or beyond */ - /* - * NOTE: When resizing a volume to make it smaller, allocLimit is set to the allocation - * block number which will contain the new alternate volume header. At all other times, - * allocLimit is set to totalBlocks. The allocation code uses allocLimit instead of - * totalBlocks to limit which blocks may be allocated, so that during a resize, we don't - * put new content into the blocks we're trying to truncate away. 
- */ - int32_t vcbClpSiz; - u_int32_t vcbFndrInfo[8]; - int16_t vcbVBMSt; /* HFS only */ - int16_t vcbAlBlSt; /* HFS only */ - - /* vcb stuff */ - u_int8_t vcbVN[256]; /* volume name in UTF-8 */ - u_int32_t volumeNameEncodingHint; - u_int32_t hfsPlusIOPosOffset; /* Disk block where HFS+ starts */ - u_int32_t vcbVBMIOSize; /* volume bitmap I/O size */ - - /* cache of largest known free extents */ - u_int32_t vcbFreeExtCnt; - HFSPlusExtentDescriptor vcbFreeExt[kMaxFreeExtents]; - lck_spin_t vcbFreeExtLock; - - /* Summary Table */ - u_int8_t *hfs_summary_table; /* Each bit is 1 vcbVBMIOSize of bitmap, byte indexed */ - u_int32_t hfs_summary_size; /* number of BITS in summary table defined above (not bytes!) */ - u_int32_t hfs_summary_bytes; /* number of BYTES in summary table */ - - u_int32_t scan_var; /* For initializing the summary table */ - - - u_int32_t reserveBlocks; /* free block reserve */ - u_int32_t loanedBlocks; /* blocks on loan for delayed allocations */ - u_int32_t lockedBlocks; /* blocks reserved and locked */ - - /* - * HFS+ Private system directories (two). Any access - * (besides looking at the cd_cnid) requires holding - * the Catalog File lock. 
- */ - struct cat_desc hfs_private_desc[2]; - struct cat_attr hfs_private_attr[2]; - - u_int32_t hfs_metadata_createdate; - hfs_to_unicode_func_t hfs_get_unicode; - unicode_to_hfs_func_t hfs_get_hfsname; - - /* Quota variables: */ - struct quotafile hfs_qfiles[MAXQUOTAS]; /* quota files */ - - /* Journaling variables: */ - struct journal *jnl; // the journal for this volume (if one exists) - struct vnode *jvp; // device where the journal lives (may be equal to devvp) - u_int32_t jnl_start; // start block of the journal file (so we don't delete it) - u_int32_t jnl_size; - u_int32_t hfs_jnlfileid; - u_int32_t hfs_jnlinfoblkid; - lck_rw_t hfs_global_lock; - thread_t hfs_global_lockowner; - u_int32_t hfs_transaction_nesting; - - /* Notification variables: */ - u_int32_t hfs_notification_conditions; - u_int32_t hfs_freespace_notify_dangerlimit; - u_int32_t hfs_freespace_notify_warninglimit; - u_int32_t hfs_freespace_notify_desiredlevel; - - /* time mounted and last mounted mod time "snapshot" */ - time_t hfs_mount_time; - time_t hfs_last_mounted_mtime; - - /* Metadata allocation zone variables: */ - u_int32_t hfs_metazone_start; - u_int32_t hfs_metazone_end; - u_int32_t hfs_hotfile_start; - u_int32_t hfs_hotfile_end; - u_int32_t hfs_min_alloc_start; - u_int32_t hfs_freed_block_count; - u_int64_t hfs_cs_hotfile_size; // in bytes - int hfs_hotfile_freeblks; - int hfs_hotfile_blk_adjust; - int hfs_hotfile_maxblks; - int hfs_overflow_maxblks; - int hfs_catalog_maxblks; - - /* Hot File Clustering variables: */ - lck_mtx_t hfc_mutex; /* serialize hot file stages */ - enum hfc_stage hfc_stage; /* what are we up to... 
*/ - time_t hfc_timebase; /* recording period start time */ - time_t hfc_timeout; /* recording period stop time */ - void * hfc_recdata; /* recording data (opaque) */ - uint32_t hfc_maxfiles; /* maximum files to track */ - struct vnode * hfc_filevp; - -#if HFS_SPARSE_DEV - /* Sparse device variables: */ - struct vnode * hfs_backingfs_rootvp; - u_int32_t hfs_last_backingstatfs; - u_int32_t hfs_sparsebandblks; - u_int64_t hfs_backingfs_maxblocks; -#endif - size_t hfs_max_inline_attrsize; - - lck_mtx_t hfs_mutex; /* protects access to hfsmount data */ - - uint32_t hfs_syncers; // Count of the number of syncers running - enum { - HFS_THAWED, - HFS_WANT_TO_FREEZE, // This state stops hfs_sync from starting - HFS_FREEZING, // We're in this state whilst we're flushing - HFS_FROZEN // Everything gets blocked in hfs_lock_global - } hfs_freeze_state; - union { - /* - * When we're freezing (HFS_FREEZING) but not yet - * frozen (HFS_FROZEN), we record the freezing thread - * so that we stop other threads from taking locks, - * but allow the freezing thread. - */ - const struct thread *hfs_freezing_thread; - /* - * Once we have frozen (HFS_FROZEN), we record the - * process so that if it dies, we can automatically - * unfreeze. 
- */ - proc_t hfs_freezing_proc; - }; - - thread_t hfs_downgrading_thread; /* thread who's downgrading to rdonly */ - - /* Resize variables: */ - u_int32_t hfs_resize_blocksmoved; - u_int32_t hfs_resize_totalblocks; - u_int32_t hfs_resize_progress; -#if CONFIG_PROTECT - /* Data Protection fields */ - cpx_t hfs_resize_cpx; - u_int16_t hfs_running_cp_major_vers; - uint32_t default_cp_class; /* default effective class value */ - uint64_t cproot_flags; - uint8_t cp_crypto_generation; - uint8_t hfs_cp_lock_state; /* per-mount device lock state info */ -#if HFS_TMPDBG -#if !SECURE_KERNEL - boolean_t hfs_cp_verbose; -#endif -#endif - -#endif - - /* Per mount cnode hash variables: */ - lck_mtx_t hfs_chash_mutex; /* protects access to cnode hash table */ - u_long hfs_cnodehash; /* size of cnode hash table - 1 */ - LIST_HEAD(cnodehashhead, cnode) *hfs_cnodehashtbl; /* base of cnode hash */ - - /* Per mount fileid hash variables (protected by catalog lock!) */ - u_long hfs_idhash; /* size of cnid/fileid hash table -1 */ - LIST_HEAD(idhashhead, cat_preflightid) *hfs_idhashtbl; /* base of ID hash */ - - // Records the oldest outstanding sync request - struct timeval hfs_sync_req_oldest; - - // Records whether a sync has been queued or is in progress - boolean_t hfs_sync_incomplete; - - thread_call_t hfs_syncer; // removeable devices get sync'ed by this guy - - /* Records the syncer thread so that we can avoid the syncer - queing more syncs. 
*/ - thread_t hfs_syncer_thread; - - // Not currently used except for debugging purposes - uint32_t hfs_active_threads; - - enum { - // These are indices into the array below - - // Tentative ranges can be claimed back at any time - HFS_TENTATIVE_BLOCKS = 0, - - // Locked ranges cannot be claimed back, but the allocation - // won't have been written to disk yet - HFS_LOCKED_BLOCKS = 1, - }; - // These lists are not sorted like a range list usually is - struct rl_head hfs_reserved_ranges[2]; -} hfsmount_t; - -/* - * HFS_META_DELAY is a duration (in usecs) used for triggering the - * hfs_syncer() routine. We will back off if writes are in - * progress, but... - * HFS_MAX_META_DELAY is the maximum time we will allow the - * syncer to be delayed. - */ -enum { - HFS_META_DELAY = 100 * 1000, // 0.1 secs - HFS_MAX_META_DELAY = 5000 * 1000 // 5 secs -}; - -typedef hfsmount_t ExtendedVCB; - -/* Aliases for legacy (Mac OS 9) field names */ -#define vcbLsMod hfs_mtime -#define vcbVolBkUp hfs_btime -#define extentsRefNum hfs_extents_vp -#define catalogRefNum hfs_catalog_vp -#define allocationsRefNum hfs_allocation_vp -#define vcbFilCnt hfs_filecount -#define vcbDirCnt hfs_dircount - -static inline void MarkVCBDirty(hfsmount_t *hfsmp) -{ - hfsmp->hfs_header_dirty = true; -} - -static inline void MarkVCBClean(hfsmount_t *hfsmp) -{ - hfsmp->hfs_header_dirty = false; - hfsmp->hfs_header_minor_change = false; -} - -static inline bool IsVCBDirty(ExtendedVCB *vcb) -{ - return vcb->hfs_header_minor_change || vcb->hfs_header_dirty; -} - -// Header is changed but won't be inconsistent if we don't write it -static inline void hfs_note_header_minor_change(hfsmount_t *hfsmp) -{ - hfsmp->hfs_header_minor_change = true; -} - -// Must header be flushed for volume to be consistent? 
-static inline bool hfs_header_needs_flushing(hfsmount_t *hfsmp) -{ - return (hfsmp->hfs_header_dirty - || ISSET(hfsmp->hfs_catalog_cp->c_flag, C_MODIFIED) - || ISSET(hfsmp->hfs_extents_cp->c_flag, C_MODIFIED) - || (hfsmp->hfs_attribute_cp - && ISSET(hfsmp->hfs_attribute_cp->c_flag, C_MODIFIED)) - || (hfsmp->hfs_allocation_cp - && ISSET(hfsmp->hfs_allocation_cp->c_flag, C_MODIFIED)) - || (hfsmp->hfs_startup_cp - && ISSET(hfsmp->hfs_startup_cp->c_flag, C_MODIFIED))); -} - -/* - * There are two private directories in HFS+. - * - * One contains inodes for files that are hardlinked or open/unlinked. - * The other contains inodes for directories that are hardlinked. - */ -enum privdirtype {FILE_HARDLINKS, DIR_HARDLINKS}; - -#define HFS_ALLOCATOR_SCAN_INFLIGHT 0x0001 /* scan started */ -#define HFS_ALLOCATOR_SCAN_COMPLETED 0x0002 /* initial scan was completed */ - -/* HFS mount point flags */ -#define HFS_READ_ONLY 0x00001 -#define HFS_UNKNOWN_PERMS 0x00002 -#define HFS_WRITEABLE_MEDIA 0x00004 -#define HFS_CLEANED_ORPHANS 0x00008 -#define HFS_X 0x00010 -#define HFS_CASE_SENSITIVE 0x00020 -#define HFS_STANDARD 0x00040 -#define HFS_METADATA_ZONE 0x00080 -#define HFS_FRAGMENTED_FREESPACE 0x00100 -#define HFS_NEED_JNL_RESET 0x00200 -#define HFS_HAS_SPARSE_DEVICE 0x00400 -#define HFS_RESIZE_IN_PROGRESS 0x00800 -#define HFS_QUOTAS 0x01000 -#define HFS_CREATING_BTREE 0x02000 -/* When set, do not update nextAllocation in the mount structure */ -#define HFS_SKIP_UPDATE_NEXT_ALLOCATION 0x04000 -/* When set, the file system supports extent-based extended attributes */ -#define HFS_XATTR_EXTENTS 0x08000 -#define HFS_FOLDERCOUNT 0x10000 -/* When set, the file system exists on a virtual device, like disk image */ -#define HFS_VIRTUAL_DEVICE 0x20000 -/* When set, we're in hfs_changefs, so hfs_sync should do nothing. */ -#define HFS_IN_CHANGEFS 0x40000 -/* When set, we are in process of downgrading or have downgraded to read-only, - * so hfs_start_transaction should return EROFS. 
- */ -#define HFS_RDONLY_DOWNGRADE 0x80000 -#define HFS_DID_CONTIG_SCAN 0x100000 -#define HFS_UNMAP 0x200000 -#define HFS_SSD 0x400000 -#define HFS_SUMMARY_TABLE 0x800000 -#define HFS_CS 0x1000000 -#define HFS_CS_METADATA_PIN 0x2000000 -#define HFS_CS_HOTFILE_PIN 0x4000000 /* cooperative fusion (enables a hotfile variant) */ -#define HFS_FEATURE_BARRIER 0x8000000 /* device supports barrier-only flush */ -#define HFS_CS_SWAPFILE_PIN 0x10000000 - -/* Macro to update next allocation block in the HFS mount structure. If - * the HFS_SKIP_UPDATE_NEXT_ALLOCATION is set, do not update - * nextAllocation block. - */ -#define HFS_UPDATE_NEXT_ALLOCATION(hfsmp, new_nextAllocation) \ - { \ - if ((hfsmp->hfs_flags & HFS_SKIP_UPDATE_NEXT_ALLOCATION) == 0)\ - hfsmp->nextAllocation = new_nextAllocation; \ - } \ - -/* Macro for incrementing and decrementing the folder count in a cnode - * attribute only if the HFS_FOLDERCOUNT bit is set in the mount flags - * and kHFSHasFolderCount bit is set in the cnode flags. Currently these - * bits are only set for case sensitive HFS+ volumes. - */ -#define INC_FOLDERCOUNT(hfsmp, cattr) \ - if ((hfsmp->hfs_flags & HFS_FOLDERCOUNT) && \ - (cattr.ca_recflags & kHFSHasFolderCountMask)) { \ - cattr.ca_dircount++; \ - } \ - -#define DEC_FOLDERCOUNT(hfsmp, cattr) \ - if ((hfsmp->hfs_flags & HFS_FOLDERCOUNT) && \ - (cattr.ca_recflags & kHFSHasFolderCountMask) && \ - (cattr.ca_dircount > 0)) { \ - cattr.ca_dircount--; \ - } \ - -typedef struct filefork FCB; - -/* - * Macros for creating item names for our special/private directories. 
- */ -#define MAKE_INODE_NAME(name, size, linkno) \ - (void) snprintf((name), size, "%s%d", HFS_INODE_PREFIX, (linkno)) -#define HFS_INODE_PREFIX_LEN 5 - -#define MAKE_DIRINODE_NAME(name, size, linkno) \ - (void) snprintf((name), size, "%s%d", HFS_DIRINODE_PREFIX, (linkno)) -#define HFS_DIRINODE_PREFIX_LEN 4 - -#define MAKE_DELETED_NAME(NAME, size, FID) \ - (void) snprintf((NAME), size, "%s%d", HFS_DELETE_PREFIX, (FID)) -#define HFS_DELETE_PREFIX_LEN 4 - - -#define HFS_AVERAGE_NAME_SIZE 22 -#define AVERAGE_HFSDIRENTRY_SIZE (8+HFS_AVERAGE_NAME_SIZE+4) - -#define STD_DIRENT_LEN(namlen) \ - ((sizeof(struct dirent) - (NAME_MAX+1)) + (((namlen)+1 + 3) &~ 3)) - -#define EXT_DIRENT_LEN(namlen) \ - ((sizeof(struct direntry) + (namlen) - (MAXPATHLEN-1) + 7) & ~7) - - -enum { kHFSPlusMaxFileNameBytes = kHFSPlusMaxFileNameChars * 3 }; - - -/* macro to determine if hfs or hfsplus */ -#define ISHFSPLUS(VCB) ((VCB)->vcbSigWord == kHFSPlusSigWord) -#define ISHFS(VCB) ((VCB)->vcbSigWord == kHFSSigWord) - - -/* - * Various ways to acquire a VFS mount point pointer: - */ -#define VTOVFS(VP) vnode_mount((VP)) -#define HFSTOVFS(HFSMP) ((HFSMP)->hfs_mp) -#define VCBTOVFS(VCB) HFSTOVFS(VCB) - -/* - * Various ways to acquire an HFS mount point pointer: - */ -#define VTOHFS(VP) ((struct hfsmount *)vfs_fsprivate(vnode_mount((VP)))) -#define VFSTOHFS(MP) ((struct hfsmount *)vfs_fsprivate((MP))) -#define VCBTOHFS(VCB) (VCB) -#define FCBTOHFS(FCB) ((struct hfsmount *)vfs_fsprivate(vnode_mount((FCB)->ff_cp->c_vp))) - -/* - * Various ways to acquire a VCB (legacy) pointer: - */ -#define VTOVCB(VP) VTOHFS(VP) -#define VFSTOVCB(MP) VFSTOHFS(MP) -#define HFSTOVCB(HFSMP) (HFSMP) -#define FCBTOVCB(FCB) FCBTOHFS(FCB) - - -#define E_NONE 0 -#define kHFSBlockSize 512 - -/* - * Macros for getting the MDB/VH sector and offset - */ -#define HFS_PRI_SECTOR(blksize) (1024 / (blksize)) -#define HFS_PRI_OFFSET(blksize) ((blksize) > 1024 ? 
1024 : 0) - -#define HFS_ALT_SECTOR(blksize, blkcnt) (((blkcnt) - 1) - (512 / (blksize))) -#define HFS_ALT_OFFSET(blksize) ((blksize) > 1024 ? (blksize) - 1024 : 0) - -/* Convert the logical sector number to be aligned on physical block size boundary. - * We are assuming the partition is a multiple of physical block size. - */ -#define HFS_PHYSBLK_ROUNDDOWN(sector_num, log_per_phys) ((sector_num / log_per_phys) * log_per_phys) - -/* - * HFS specific fcntl()'s - */ -#define HFS_GET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00004) -#define HFS_SET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00005) -/* See HFSIOC_EXT_BULKACCESS and friends for HFS specific fsctls*/ - - - -/* - * This is the straight GMT conversion constant: - * 00:00:00 January 1, 1970 - 00:00:00 January 1, 1904 - * (3600 * 24 * ((365 * (1970 - 1904)) + (((1970 - 1904) / 4) + 1))) - */ -#define MAC_GMT_FACTOR 2082844800UL - -static inline __attribute__((const)) -off_t hfs_blk_to_bytes(uint32_t blk, uint32_t blk_size) -{ - return (off_t)blk * blk_size; // Avoid the overflow -} - -/* - * For now, we use EIO to indicate consistency issues. It is safe to - * return or assign an error value to HFS_EINCONSISTENT but it is - * *not* safe to compare against it because EIO can be generated for - * other reasons. We take advantage of the fact that == has - * left-to-right associativity and so any uses of: - * - * if (error == HFS_EINCONSISTENT) - * - * will produce a compiler warning: "comparison between pointer and - * integer". - * - * Note that not everwhere is consistent with the use of - * HFS_EINCONSISTENT. Some places return EINVAL, EIO directly or - * other error codes. - */ -#define HFS_EINCONSISTENT (void *)0 == (void *)0 ? 
EIO : EIO - -/***************************************************************************** - FUNCTION PROTOTYPES -******************************************************************************/ - -/***************************************************************************** - hfs_vnop_xxx functions from different files -******************************************************************************/ -int hfs_vnop_readdirattr(struct vnop_readdirattr_args *); /* in hfs_attrlist.c */ -int hfs_vnop_getattrlistbulk(struct vnop_getattrlistbulk_args *); /* in hfs_attrlist.c */ - -int hfs_vnop_inactive(struct vnop_inactive_args *); /* in hfs_cnode.c */ -int hfs_vnop_reclaim(struct vnop_reclaim_args *); /* in hfs_cnode.c */ - -int hfs_set_backingstore (struct vnode *vp, int val); /* in hfs_cnode.c */ -int hfs_is_backingstore (struct vnode *vp, int *val); /* in hfs_cnode.c */ - -int hfs_vnop_link(struct vnop_link_args *); /* in hfs_link.c */ - -int hfs_vnop_lookup(struct vnop_lookup_args *); /* in hfs_lookup.c */ - -int hfs_vnop_search(struct vnop_searchfs_args *); /* in hfs_search.c */ - -int hfs_vnop_read(struct vnop_read_args *); /* in hfs_readwrite.c */ -int hfs_vnop_write(struct vnop_write_args *); /* in hfs_readwrite.c */ -int hfs_vnop_ioctl(struct vnop_ioctl_args *); /* in hfs_readwrite.c */ -int hfs_vnop_select(struct vnop_select_args *); /* in hfs_readwrite.c */ -int hfs_vnop_strategy(struct vnop_strategy_args *); /* in hfs_readwrite.c */ -int hfs_vnop_allocate(struct vnop_allocate_args *); /* in hfs_readwrite.c */ -int hfs_vnop_pagein(struct vnop_pagein_args *); /* in hfs_readwrite.c */ -int hfs_vnop_pageout(struct vnop_pageout_args *); /* in hfs_readwrite.c */ -int hfs_vnop_bwrite(struct vnop_bwrite_args *); /* in hfs_readwrite.c */ -int hfs_vnop_blktooff(struct vnop_blktooff_args *); /* in hfs_readwrite.c */ -int hfs_vnop_offtoblk(struct vnop_offtoblk_args *); /* in hfs_readwrite.c */ -int hfs_vnop_blockmap(struct vnop_blockmap_args *); /* in hfs_readwrite.c */ 
-errno_t hfs_flush_invalid_ranges(vnode_t vp); /* in hfs_readwrite.c */ - -int hfs_vnop_getxattr(struct vnop_getxattr_args *); /* in hfs_xattr.c */ -int hfs_vnop_setxattr(struct vnop_setxattr_args *); /* in hfs_xattr.c */ -int hfs_vnop_removexattr(struct vnop_removexattr_args *); /* in hfs_xattr.c */ -int hfs_vnop_listxattr(struct vnop_listxattr_args *); /* in hfs_xattr.c */ -#if NAMEDSTREAMS -extern int hfs_vnop_getnamedstream(struct vnop_getnamedstream_args*); -extern int hfs_vnop_makenamedstream(struct vnop_makenamedstream_args*); -extern int hfs_vnop_removenamedstream(struct vnop_removenamedstream_args*); -#endif - - -/***************************************************************************** - Functions from MacOSStubs.c -******************************************************************************/ -time_t to_bsd_time(u_int32_t hfs_time); - -u_int32_t to_hfs_time(time_t bsd_time); - - -/***************************************************************************** - Functions from hfs_encodinghint.c -******************************************************************************/ -u_int32_t hfs_pickencoding(const u_int16_t *src, int len); - -u_int32_t hfs_getencodingbias(void); - -void hfs_setencodingbias(u_int32_t bias); - - -/***************************************************************************** - Functions from hfs_encodings.c -******************************************************************************/ -void hfs_converterinit(void); - -int hfs_relconverter (u_int32_t encoding); - -int hfs_getconverter(u_int32_t encoding, hfs_to_unicode_func_t *get_unicode, - unicode_to_hfs_func_t *get_hfsname); - -#if CONFIG_HFS_STD -int hfs_to_utf8(ExtendedVCB *vcb, const Str31 hfs_str, ByteCount maxDstLen, - ByteCount *actualDstLen, unsigned char* dstStr); - -int utf8_to_hfs(ExtendedVCB *vcb, ByteCount srcLen, const unsigned char* srcStr, - Str31 dstStr); - -int mac_roman_to_utf8(const Str31 hfs_str, ByteCount maxDstLen, ByteCount *actualDstLen, - unsigned 
char* dstStr); - -int utf8_to_mac_roman(ByteCount srcLen, const unsigned char* srcStr, Str31 dstStr); - -int mac_roman_to_unicode(const Str31 hfs_str, UniChar *uni_str, u_int32_t maxCharLen, u_int32_t *usedCharLen); - -int unicode_to_hfs(ExtendedVCB *vcb, ByteCount srcLen, u_int16_t* srcStr, Str31 dstStr, int retry); -#endif - -/***************************************************************************** - Functions from hfs_notifications.c -******************************************************************************/ -void hfs_generate_volume_notifications(struct hfsmount *hfsmp); - - -/***************************************************************************** - Functions from hfs_readwrite.c -******************************************************************************/ -extern int hfs_relocate(struct vnode *, u_int32_t, kauth_cred_t, struct proc *); - -/* flags for hfs_pin_block_range() and hfs_pin_vnode() */ -#define HFS_PIN_IT 0x0001 -#define HFS_UNPIN_IT 0x0002 -#define HFS_TEMP_PIN 0x0004 -#define HFS_EVICT_PIN 0x0008 -#define HFS_DATALESS_PIN 0x0010 - -// -// pin/un-pin an explicit range of blocks to the "fast" (usually ssd) device -// -int hfs_pin_block_range(struct hfsmount *hfsmp, int pin_state, uint32_t start_block, uint32_t nblocks, vfs_context_t ctx); - -// -// pin/un-pin all the extents belonging to a vnode. 
-// also, if it is non-null, "num_blocks_pinned" returns the number of blocks pin/unpinned by the function -// -int hfs_pin_vnode(struct hfsmount *hfsmp, struct vnode *vp, int pin_state, uint32_t *num_blocks_pinned, vfs_context_t ctx); - - -int hfs_pin_overflow_extents (struct hfsmount *hfsmp, uint32_t fileid, uint8_t forktype, uint32_t *pinned); - - -/* Flags for HFS truncate */ -#define HFS_TRUNCATE_SKIPTIMES 0x00000002 /* implied by skipupdate; it is a subset */ - - -extern int hfs_truncate(struct vnode *, off_t, int, int, vfs_context_t); - -extern int hfs_release_storage (struct hfsmount *hfsmp, struct filefork *datafork, - struct filefork *rsrcfork, u_int32_t fileid); - -extern int hfs_prepare_release_storage (struct hfsmount *hfsmp, struct vnode *vp); - -extern int hfs_bmap(struct vnode *, daddr_t, struct vnode **, daddr64_t *, unsigned int *); - -extern errno_t hfs_ubc_setsize(vnode_t vp, off_t len, bool have_cnode_lock); - - -/***************************************************************************** - Functions from hfs_resize.c -******************************************************************************/ -int hfs_extendfs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context); -int hfs_truncatefs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context); - - -/***************************************************************************** - Functions from hfs_vfsops.c -******************************************************************************/ -int hfs_mountroot(mount_t mp, vnode_t rvp, vfs_context_t context); - -/* used as a callback by the journaling code */ -extern void hfs_sync_metadata(void *arg); - -extern int hfs_vget(struct hfsmount *, cnid_t, struct vnode **, int, int); - -extern void hfs_setencodingbits(struct hfsmount *hfsmp, u_int32_t encoding); - -enum volop {VOL_UPDATE, VOL_MKDIR, VOL_RMDIR, VOL_MKFILE, VOL_RMFILE}; -extern int hfs_volupdate(struct hfsmount *hfsmp, enum volop op, int inroot); - -enum { - 
HFS_FVH_WAIT = 0x0001, - HFS_FVH_WRITE_ALT = 0x0002, - HFS_FVH_FLUSH_IF_DIRTY = 0x0004, -}; -typedef uint32_t hfs_flush_volume_header_options_t; -int hfs_flushvolumeheader(struct hfsmount *hfsmp, hfs_flush_volume_header_options_t); - -extern int hfs_extendfs(struct hfsmount *, u_int64_t, vfs_context_t); -extern int hfs_truncatefs(struct hfsmount *, u_int64_t, vfs_context_t); -extern int hfs_resize_progress(struct hfsmount *, u_int32_t *); - -/* If a runtime corruption is detected, mark the volume inconsistent - * bit in the volume attributes. - */ - -typedef enum { - HFS_INCONSISTENCY_DETECTED, - - // Used when unable to rollback an operation that failed - HFS_ROLLBACK_FAILED, - - // Used when the latter part of an operation failed, but we chose not to roll back - HFS_OP_INCOMPLETE, - - // Used when someone told us to force an fsck on next mount - HFS_FSCK_FORCED, -} hfs_inconsistency_reason_t; - -void hfs_mark_inconsistent(struct hfsmount *hfsmp, - hfs_inconsistency_reason_t reason); - -void hfs_scan_blocks (struct hfsmount *hfsmp); - -/***************************************************************************** - Functions from hfs_vfsutils.c -******************************************************************************/ -u_int32_t BestBlockSizeFit(u_int32_t allocationBlockSize, - u_int32_t blockSizeLimit, - u_int32_t baseMultiple); - -#if CONFIG_HFS_STD -OSErr hfs_MountHFSVolume(struct hfsmount *hfsmp, HFSMasterDirectoryBlock *mdb, - struct proc *p); -#endif -OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, - off_t embeddedOffset, u_int64_t disksize, struct proc *p, void *args, kauth_cred_t cred); - -OSErr hfs_ValidateHFSPlusVolumeHeader(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp); - -extern int hfsUnmount(struct hfsmount *hfsmp, struct proc *p); - -extern bool overflow_extents(struct filefork *fp); - -extern int hfs_owner_rights(struct hfsmount *hfsmp, uid_t cnode_uid, kauth_cred_t cred, - struct proc *p, int 
invokesuperuserstatus); - -extern int check_for_tracked_file(struct vnode *vp, time_t ctime, uint64_t op_type, void *arg); -extern int check_for_dataless_file(struct vnode *vp, uint64_t op_type); -extern int hfs_generate_document_id(struct hfsmount *hfsmp, uint32_t *docid); -extern void hfs_pin_fs_metadata(struct hfsmount *hfsmp); - -/* Return information about number of metadata blocks for volume */ -extern int hfs_getinfo_metadata_blocks(struct hfsmount *hfsmp, struct hfsinfo_metadata *hinfo); - -/* - * Journal lock function prototypes - */ -int hfs_lock_global (struct hfsmount *hfsmp, enum hfs_locktype locktype); -void hfs_unlock_global (struct hfsmount *hfsmp); - -/* HFS mount lock/unlock prototypes */ -void hfs_lock_mount (struct hfsmount *hfsmp); -void hfs_unlock_mount (struct hfsmount *hfsmp); - - -/* HFS System file locking */ -#define SFL_CATALOG 0x0001 -#define SFL_EXTENTS 0x0002 -#define SFL_BITMAP 0x0004 -#define SFL_ATTRIBUTE 0x0008 -#define SFL_STARTUP 0x0010 -#define SFL_VM_PRIV 0x0020 -#define SFL_VALIDMASK (SFL_CATALOG | SFL_EXTENTS | SFL_BITMAP | SFL_ATTRIBUTE | SFL_STARTUP | SFL_VM_PRIV) - -extern u_int32_t GetFileInfo(ExtendedVCB *vcb, u_int32_t dirid, const char *name, - struct cat_attr *fattr, struct cat_fork *forkinfo); - -extern void hfs_remove_orphans(struct hfsmount *); - -u_int32_t GetLogicalBlockSize(struct vnode *vp); - -extern u_int32_t hfs_freeblks(struct hfsmount * hfsmp, int wantreserve); - -short MacToVFSError(OSErr err); - -void hfs_metadatazone_init(struct hfsmount *hfsmp, int disable); - -/* HFS directory hint functions. 
*/ -extern directoryhint_t * hfs_getdirhint(struct cnode *, int, int); -extern void hfs_reldirhint(struct cnode *, directoryhint_t *); -extern void hfs_reldirhints(struct cnode *, int); -extern void hfs_insertdirhint(struct cnode *, directoryhint_t *); - -extern int hfs_namecmp(const u_int8_t *str1, size_t len1, const u_int8_t *str2, size_t len2); - -extern int hfs_early_journal_init(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, - void *_args, off_t embeddedOffset, daddr64_t mdb_offset, - HFSMasterDirectoryBlock *mdbp, kauth_cred_t cred); - -extern int hfs_virtualmetafile(struct cnode *); - -extern int hfs_start_transaction(struct hfsmount *hfsmp); -extern int hfs_end_transaction(struct hfsmount *hfsmp); -extern void hfs_journal_lock(struct hfsmount *hfsmp); -extern void hfs_journal_unlock(struct hfsmount *hfsmp); -extern void hfs_syncer_lock(struct hfsmount *hfsmp); -extern void hfs_syncer_unlock(struct hfsmount *hfsmp); -extern void hfs_syncer_wait(struct hfsmount *hfsmp); -extern void hfs_syncer_wakeup(struct hfsmount *hfsmp); -extern void hfs_syncer_queue(thread_call_t syncer); -extern void hfs_sync_ejectable(struct hfsmount *hfsmp); - -typedef enum hfs_flush_mode { - HFS_FLUSH_JOURNAL, // Flush journal - HFS_FLUSH_JOURNAL_META, // Flush journal and metadata blocks - HFS_FLUSH_FULL, // Flush journal and does a cache flush - HFS_FLUSH_CACHE, // Flush track cache to media - HFS_FLUSH_BARRIER, // Barrier-only flush to ensure write order - HFS_FLUSH_JOURNAL_BARRIER // Flush journal with barrier -} hfs_flush_mode_t; - -extern errno_t hfs_flush(struct hfsmount *hfsmp, hfs_flush_mode_t mode); - -extern void hfs_trim_callback(void *arg, uint32_t extent_count, const dk_extent_t *extents); - -/* Erase unused Catalog nodes due to . 
*/ -extern int hfs_erase_unused_nodes(struct hfsmount *hfsmp); - -extern uint64_t hfs_usecs_to_deadline(uint64_t usecs); - -extern int hfs_freeze(struct hfsmount *hfsmp); -extern int hfs_thaw(struct hfsmount *hfsmp, const struct proc *process); - - -/***************************************************************************** - Functions from hfs_vnops.c -******************************************************************************/ -int hfs_write_access(struct vnode *vp, kauth_cred_t cred, struct proc *p, Boolean considerFlags); - -int hfs_chmod(struct vnode *vp, int mode, kauth_cred_t cred, struct proc *p); - -int hfs_chown(struct vnode *vp, uid_t uid, gid_t gid, kauth_cred_t cred, struct proc *p); - -#define kMaxSecsForFsync 5 -#define HFS_SYNCTRANS 1 -extern int hfs_btsync(struct vnode *vp, int sync_transaction); - -extern void replace_desc(struct cnode *cp, struct cat_desc *cdp); - -extern int hfs_vgetrsrc(struct hfsmount *hfsmp, struct vnode *vp, - struct vnode **rvpp); - -typedef enum { - // Push all modifications to disk (including minor ones) - HFS_UPDATE_FORCE = 0x01, -} hfs_update_options_t; - -extern int hfs_update(struct vnode *, int options); - -typedef enum hfs_sync_mode { - HFS_FSYNC, - HFS_FSYNC_FULL, - HFS_FSYNC_BARRIER -} hfs_fsync_mode_t; - -extern int hfs_fsync(struct vnode *, int, hfs_fsync_mode_t, struct proc *); - -const struct cat_fork * -hfs_prepare_fork_for_update(filefork_t *ff, - const struct cat_fork *cf, - struct cat_fork *cf_buf, - uint32_t block_size); - -/***************************************************************************** - Functions from hfs_xattr.c -******************************************************************************/ - -/* - * Maximum extended attribute size supported for all extended attributes except - * resource fork and finder info. 
- */ -#define HFS_XATTR_MAXSIZE INT32_MAX - -/* Number of bits used to represent maximum extended attribute size */ -#define HFS_XATTR_SIZE_BITS 31 - -int hfs_attrkeycompare(HFSPlusAttrKey *searchKey, HFSPlusAttrKey *trialKey); -int hfs_buildattrkey(u_int32_t fileID, const char *attrname, HFSPlusAttrKey *key); -void hfs_xattr_init(struct hfsmount * hfsmp); -int file_attribute_exist(struct hfsmount *hfsmp, uint32_t fileID); -int init_attrdata_vnode(struct hfsmount *hfsmp); -int hfs_xattr_read(vnode_t vp, const char *name, void *data, size_t *size); -int hfs_getxattr_internal(cnode_t *, struct vnop_getxattr_args *, - struct hfsmount *, u_int32_t); -int hfs_xattr_write(vnode_t vp, const char *name, const void *data, size_t size); -int hfs_setxattr_internal(struct cnode *, const void *, size_t, - struct vnop_setxattr_args *, struct hfsmount *, u_int32_t); -extern int hfs_removeallattr(struct hfsmount *hfsmp, u_int32_t fileid, - bool *open_transaction); -extern int hfs_set_volxattr(struct hfsmount *hfsmp, unsigned int xattrtype, int state); - - - -/***************************************************************************** - Functions from hfs_link.c -******************************************************************************/ - -extern int hfs_unlink(struct hfsmount *hfsmp, struct vnode *dvp, struct vnode *vp, - struct componentname *cnp, int skip_reserve); -extern int hfs_lookup_siblinglinks(struct hfsmount *hfsmp, cnid_t linkfileid, - cnid_t *prevlinkid, cnid_t *nextlinkid); -extern int hfs_lookup_lastlink(struct hfsmount *hfsmp, cnid_t linkfileid, - cnid_t *nextlinkid, struct cat_desc *cdesc); -extern void hfs_privatedir_init(struct hfsmount *, enum privdirtype); - -extern void hfs_savelinkorigin(cnode_t *cp, cnid_t parentcnid); -extern void hfs_relorigins(struct cnode *cp); -extern void hfs_relorigin(struct cnode *cp, cnid_t parentcnid); -extern int hfs_haslinkorigin(cnode_t *cp); -extern cnid_t hfs_currentparent(cnode_t *cp, bool have_lock); -extern cnid_t 
hfs_currentcnid(cnode_t *cp); -errno_t hfs_first_link(hfsmount_t *hfsmp, cnode_t *cp, cnid_t *link_id); - - -/***************************************************************************** - Functions from VolumeAllocation.c - ******************************************************************************/ -extern int hfs_isallocated(struct hfsmount *hfsmp, u_int32_t startingBlock, u_int32_t numBlocks); - -extern int hfs_count_allocated(struct hfsmount *hfsmp, u_int32_t startBlock, - u_int32_t numBlocks, u_int32_t *alloc_count); - -extern int hfs_isrbtree_active (struct hfsmount *hfsmp); - -/***************************************************************************** - Functions from hfs_fsinfo.c - ******************************************************************************/ -extern errno_t hfs_get_fsinfo(struct hfsmount *hfsmp, void *a_data); -extern void hfs_fsinfo_data_add(struct hfs_fsinfo_data *fsinfo, uint64_t entry); - -#endif /* __APPLE_API_PRIVATE */ -#endif /* KERNEL */ -#endif /* __HFS__ */ diff --git a/bsd/hfs/hfs_attrlist.c b/bsd/hfs/hfs_attrlist.c deleted file mode 100644 index 3ee064859..000000000 --- a/bsd/hfs/hfs_attrlist.c +++ /dev/null @@ -1,1728 +0,0 @@ -/* - * Copyright (c) 2000-2014 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. 
- * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -/* - * hfs_attrlist.c - HFS attribute list processing - * - * Copyright (c) 1998-2002, Apple Computer, Inc. All Rights Reserved. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "hfs.h" -#include "hfs_cnode.h" -#include "hfs_mount.h" -#include "hfs_dbg.h" -#include "hfs_attrlist.h" -#include "hfs_btreeio.h" - -/* Packing routines: */ - -static void packnameattr(struct attrblock *abp, struct vnode *vp, - const u_int8_t *name, int namelen); - -static void packcommonattr(struct attrblock *abp, struct hfsmount *hfsmp, - struct vnode *vp, struct cat_desc * cdp, - struct cat_attr * cap, struct vfs_context *ctx); - -static void packfileattr(struct attrblock *abp, struct hfsmount *hfsmp, - struct cat_attr *cattrp, struct cat_fork *datafork, - struct cat_fork *rsrcfork, struct vnode *vp); - -static void packdirattr(struct attrblock *abp, struct hfsmount *hfsmp, - struct vnode *vp, struct cat_desc * descp, - struct cat_attr * cattrp); - -static u_int32_t hfs_real_user_access(vnode_t vp, vfs_context_t ctx); - -static void get_vattr_data_for_attrs(struct attrlist *, struct vnode_attr *, - struct hfsmount *, struct vnode *, struct cat_desc *, struct cat_attr *, - struct cat_fork *, struct cat_fork *, vfs_context_t); - -static void 
vattr_data_for_common_attrs(struct attrlist *, struct vnode_attr *, - struct hfsmount *, struct vnode *, struct cat_desc *, struct cat_attr *, - vfs_context_t); - -static void vattr_data_for_dir_attrs(struct attrlist *, struct vnode_attr *, - struct hfsmount *, struct vnode *, struct cat_desc *, struct cat_attr *); - -static void vattr_data_for_file_attrs(struct attrlist *, struct vnode_attr *, - struct hfsmount *, struct cat_attr *, struct cat_fork *, struct cat_fork *, - struct vnode *vp); - -static int hfs_readdirattr_internal(struct vnode *, struct attrlist *, - struct vnode_attr *, uio_t, uint64_t, int, uint32_t *, int *, int *, - vfs_context_t); - -/* - * readdirattr operation will return attributes for the items in the - * directory specified. - * - * It does not do . and .. entries. The problem is if you are at the root of the - * hfs directory and go to .. you could be crossing a mountpoint into a - * different (ufs) file system. The attributes that apply for it may not - * apply for the file system you are doing the readdirattr on. To make life - * simpler, this call will only return entries in its directory, hfs like. - */ -int -hfs_vnop_readdirattr(ap) - struct vnop_readdirattr_args /* { - struct vnode *a_vp; - struct attrlist *a_alist; - struct uio *a_uio; - u_long a_maxcount; - u_long a_options; - u_long *a_newstate; - int *a_eofflag; - u_long *a_actualcount; - vfs_context_t a_context; - } */ *ap; -{ - int error; - struct attrlist *alist = ap->a_alist; - - /* Check for invalid options and buffer space. */ - if (((ap->a_options & ~(FSOPT_NOINMEMUPDATE | FSOPT_NOFOLLOW)) != 0) || - (ap->a_maxcount <= 0)) { - return (EINVAL); - } - /* - * Reject requests for unsupported attributes. 
- */ - if ((alist->bitmapcount != ATTR_BIT_MAP_COUNT) || - (alist->commonattr & ~HFS_ATTR_CMN_VALID) || - (alist->volattr != 0) || - (alist->dirattr & ~HFS_ATTR_DIR_VALID) || - (alist->fileattr & ~HFS_ATTR_FILE_VALID) || - (alist->forkattr != 0)) { - return (EINVAL); - } - - error = hfs_readdirattr_internal(ap->a_vp, alist, NULL, ap->a_uio, - (uint64_t)ap->a_options, ap->a_maxcount, ap->a_newstate, - ap->a_eofflag, (int *)ap->a_actualcount, ap->a_context); - - return (error); -} - - -/* - * getattrlistbulk, like readdirattr, will return attributes for the items in - * the directory specified. - * - * It does not do . and .. entries. The problem is if you are at the root of the - * hfs directory and go to .. you could be crossing a mountpoint into a - * different (ufs) file system. The attributes that apply for it may not - * apply for the file system you are doing the readdirattr on. To make life - * simpler, this call will only return entries in its directory, hfs like. - */ -int -hfs_vnop_getattrlistbulk(ap) - struct vnop_getattrlistbulk_args /* { - struct vnodeop_desc *a_desc; - vnode_t a_vp; - struct attrlist *a_alist; - struct vnode_attr *a_vap; - struct uio *a_uio; - void *a_private; - uint64_t a_options; - int32_t *a_eofflag; - int32_t *a_actualcount; - vfs_context_t a_context; - } */ *ap; -{ - int error = 0; - - error = hfs_readdirattr_internal(ap->a_vp, ap->a_alist, ap->a_vap, - ap->a_uio, (uint64_t)ap->a_options, 0, NULL, ap->a_eofflag, - (int *)ap->a_actualcount, ap->a_context); - - return (error); -} - -/* - * Common function for both hfs_vnop_readdirattr and hfs_vnop_getattrlistbulk. - * This either fills in a vnode_attr structure or fills in an attrbute buffer - * Currently the difference in behaviour required for the two vnops is keyed - * on whether the passed in vnode_attr pointer is null or not. If the pointer - * is null we fill in buffer passed and if it is not null we fill in the fields - * of the vnode_attr structure. 
- */ -int -hfs_readdirattr_internal(struct vnode *dvp, struct attrlist *alist, - struct vnode_attr *vap, uio_t uio, uint64_t options, int maxcount, - uint32_t *newstate, int *eofflag, int *actualcount, vfs_context_t ctx) -{ - struct cnode *dcp; - struct hfsmount * hfsmp; - u_int32_t fixedblocksize; - u_int32_t maxattrblocksize; - u_int32_t currattrbufsize; - void *attrbufptr = NULL; - void *attrptr = NULL; - void *varptr = NULL; - caddr_t namebuf = NULL; - struct attrblock attrblk; - int error = 0; - int index = 0; - int i = 0; - struct cat_desc *lastdescp = NULL; - struct cat_entrylist *ce_list = NULL; - directoryhint_t *dirhint = NULL; - unsigned int tag; - int maxentries; - int lockflags; - u_int32_t dirchg = 0; - int reachedeof = 0; - - *(actualcount) = 0; - *(eofflag) = 0; - - if ((uio_resid(uio) <= 0) || (uio_iovcnt(uio) > 1)) - return (EINVAL); - - if (VTOC(dvp)->c_bsdflags & UF_COMPRESSED) { - int compressed = hfs_file_is_compressed(VTOC(dvp), 0); /* 0 == take the cnode lock */ - - if (!compressed) { - error = check_for_dataless_file(dvp, NAMESPACE_HANDLER_READ_OP); - if (error) { - return error; - } - } - } - - /* - * Take an exclusive directory lock since we manipulate the directory hints - */ - if ((error = hfs_lock(VTOC(dvp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) { - return (error); - } - dcp = VTOC(dvp); - hfsmp = VTOHFS(dvp); - - dirchg = dcp->c_dirchangecnt; - - /* Extract directory index and tag (sequence number) from uio_offset */ - index = uio_offset(uio) & HFS_INDEX_MASK; - tag = uio_offset(uio) & ~HFS_INDEX_MASK; - - /* - * We can't just use the valence as an optimization to avoid - * going to the catalog. It might be wrong (== 0), and that would - * cause us to avoid iterating the directory when it might actually have - * contents. Instead, use the catalog to tell us when we've hit EOF - * for this directory - */ - - /* Get a buffer to hold packed attributes. 
*/ - fixedblocksize = (sizeof(u_int32_t) + hfs_attrblksize(alist)); /* 4 bytes for length */ - - if (!vap) { - maxattrblocksize = fixedblocksize; - if (alist->commonattr & ATTR_CMN_NAME) - maxattrblocksize += kHFSPlusMaxFileNameBytes + 1; - - MALLOC(attrbufptr, void *, maxattrblocksize, M_TEMP, M_WAITOK); - if (attrbufptr == NULL) { - error = ENOMEM; - goto exit2; - } - attrptr = attrbufptr; - varptr = (char *)attrbufptr + fixedblocksize; /* Point to variable-length storage */ - } else { - if ((alist->commonattr & ATTR_CMN_NAME) && !vap->va_name) { - MALLOC(namebuf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK); - if (!namebuf) { - error = ENOMEM; - goto exit2; - } - vap->va_name = namebuf; - } - } - /* Get a detached directory hint (cnode must be locked exclusive) */ - dirhint = hfs_getdirhint(dcp, ((index - 1) & HFS_INDEX_MASK) | tag, TRUE); - - /* Hide tag from catalog layer. */ - dirhint->dh_index &= HFS_INDEX_MASK; - if (dirhint->dh_index == HFS_INDEX_MASK) { - dirhint->dh_index = -1; - } - - /* - * Obtain a list of catalog entries and pack their attributes until - * the output buffer is full or maxcount entries have been packed. - */ - - /* - * Constrain our list size. - */ - maxentries = uio_resid(uio) / (fixedblocksize + HFS_AVERAGE_NAME_SIZE); - /* There is maxcount for the bulk vnop */ - if (!vap) - maxentries = min(maxentries, maxcount); - maxentries = min(maxentries, MAXCATENTRIES); - if (maxentries < 1) { - error = EINVAL; - goto exit2; - } - - /* Initialize a catalog entry list. */ - MALLOC(ce_list, struct cat_entrylist *, CE_LIST_SIZE(maxentries), M_TEMP, M_WAITOK); - if (ce_list == NULL) { - error = ENOMEM; - goto exit2; - } - bzero(ce_list, CE_LIST_SIZE(maxentries)); - ce_list->maxentries = maxentries; - - /* - * Populate the ce_list from the catalog file. - */ - lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); - - error = cat_getentriesattr(hfsmp, dirhint, ce_list, &reachedeof); - /* Don't forget to release the descriptors later! 
*/ - - hfs_systemfile_unlock(hfsmp, lockflags); - - if ((error == ENOENT) || (reachedeof != 0)) { - *(eofflag) = TRUE; - error = 0; - } - if (error) { - goto exit1; - } - - /* - * Check for a FS corruption in the valence. We're holding the cnode lock - * exclusive since we need to serialize the directory hints, so if we found - * that the valence reported 0, but we actually found some items here, then - * silently minimally self-heal and bump the valence to 1. - */ - if ((dcp->c_entries == 0) && (ce_list->realentries > 0)) { - dcp->c_entries++; - dcp->c_flag |= C_MODIFIED; - printf("hfs_vnop_readdirattr: repairing valence to non-zero! \n"); - /* force an update on dcp while we're still holding the lock. */ - hfs_update(dvp, 0); - } - - /* - * Drop the directory lock so we don't deadlock when we: - * - acquire a child cnode lock - * - make calls to vnode_authorize() - * - make calls to kauth_cred_ismember_gid() - */ - hfs_unlock(dcp); - dcp = NULL; - - /* Process the catalog entries. */ - for (i = 0; i < (int)ce_list->realentries; ++i) { - struct cnode *cp = NULL; - struct vnode *vp = NULL; - struct cat_desc * cdescp; - struct cat_attr * cattrp; - struct cat_fork c_datafork; - struct cat_fork c_rsrcfork; - - bzero(&c_datafork, sizeof(c_datafork)); - bzero(&c_rsrcfork, sizeof(c_rsrcfork)); - cdescp = &ce_list->entry[i].ce_desc; - cattrp = &ce_list->entry[i].ce_attr; - c_datafork.cf_size = ce_list->entry[i].ce_datasize; - c_datafork.cf_blocks = ce_list->entry[i].ce_datablks; - c_rsrcfork.cf_size = ce_list->entry[i].ce_rsrcsize; - c_rsrcfork.cf_blocks = ce_list->entry[i].ce_rsrcblks; - - if (((alist->commonattr & ATTR_CMN_USERACCESS) && - (cattrp->ca_recflags & kHFSHasSecurityMask)) -#if CONFIG_PROTECT - || - ((alist->commonattr & ATTR_CMN_DATA_PROTECT_FLAGS) && (vap)) -#endif - ) { - /* - * Obtain vnode for our vnode_authorize() calls. 
- */ - if (hfs_vget(hfsmp, cattrp->ca_fileid, &vp, 0, 0) != 0) { - vp = NULL; - } - } else if (vap || !(options & FSOPT_NOINMEMUPDATE)) { - /* Get in-memory cnode data (if any). */ - vp = hfs_chash_getvnode(hfsmp, cattrp->ca_fileid, 0, 0, 0); - } - if (vp != NULL) { - cp = VTOC(vp); - /* Only use cnode's decriptor for non-hardlinks */ - if (!(cp->c_flag & C_HARDLINK)) - cdescp = &cp->c_desc; - cattrp = &cp->c_attr; - if (cp->c_datafork) { - c_datafork.cf_size = cp->c_datafork->ff_size; - c_datafork.cf_blocks = cp->c_datafork->ff_blocks; - } - if (cp->c_rsrcfork) { - c_rsrcfork.cf_size = cp->c_rsrcfork->ff_size; - c_rsrcfork.cf_blocks = cp->c_rsrcfork->ff_blocks; - } - /* All done with cnode. */ - hfs_unlock(cp); - cp = NULL; - } - - if (!vap) { - *((u_int32_t *)attrptr) = 0; - attrptr = ((u_int32_t *)attrptr) + 1; - attrblk.ab_attrlist = alist; - attrblk.ab_attrbufpp = &attrptr; - attrblk.ab_varbufpp = &varptr; - attrblk.ab_flags = 0; - attrblk.ab_blocksize = maxattrblocksize; - attrblk.ab_context = ctx; - - /* Pack catalog entries into attribute buffer. */ - hfs_packattrblk(&attrblk, hfsmp, vp, cdescp, cattrp, &c_datafork, &c_rsrcfork, ctx); - currattrbufsize = ((char *)varptr - (char *)attrbufptr); - - /* All done with vnode. */ - if (vp != NULL) { - vnode_put(vp); - vp = NULL; - } - - /* Make sure there's enough buffer space remaining. */ - // LP64todo - fix this! - if (uio_resid(uio) < 0 || - currattrbufsize > (u_int32_t)uio_resid(uio)) { - break; - } else { - *((u_int32_t *)attrbufptr) = currattrbufsize; - error = uiomove((caddr_t)attrbufptr, currattrbufsize, uio); - if (error != E_NONE) { - break; - } - attrptr = attrbufptr; - /* Point to variable-length storage */ - varptr = (char *)attrbufptr + fixedblocksize; - /* Save the last valid catalog entry */ - lastdescp = &ce_list->entry[i].ce_desc; - index++; - *actualcount += 1; - - /* Termination checks */ - if ((--maxcount <= 0) || - // LP64todo - fix this! 
- uio_resid(uio) < 0 || - ((u_int32_t)uio_resid(uio) < (fixedblocksize + HFS_AVERAGE_NAME_SIZE))){ - break; - } - } - } else { - size_t orig_resid = (size_t)uio_resid(uio); - size_t resid; - - get_vattr_data_for_attrs(alist, vap, hfsmp, vp, cdescp, - cattrp, &c_datafork, &c_rsrcfork, ctx); - -#if CONFIG_PROTECT - if ((alist->commonattr & ATTR_CMN_DATA_PROTECT_FLAGS) && - vp) { - int class; - - if (!cp_vnode_getclass(vp, &class)) { - VATTR_RETURN(vap, va_dataprotect_class, - (uint32_t)class); - } - } -#endif - error = vfs_attr_pack(vp, uio, alist, options, vap, - NULL, ctx); - - /* All done with vnode. */ - if (vp) { - vnode_put(vp); - vp = NULL; - } - - resid = uio_resid(uio); - - /* Was this entry succesful ? */ - if (error || resid == orig_resid) - break; - - /* Save the last valid catalog entry */ - lastdescp = &ce_list->entry[i].ce_desc; - index++; - *actualcount += 1; - - /* Do we have the bare minimum for the next entry ? */ - if (resid < sizeof(uint32_t)) - break; - } - } /* for each catalog entry */ - - /* - * If we couldn't fit all the entries requested in the user's buffer, - * it's not EOF. - */ - if (*eofflag && (*actualcount < (int)ce_list->realentries)) - *eofflag = 0; - - /* If we skipped catalog entries for reserved files that should - * not be listed in namespace, update the index accordingly. - */ - if (ce_list->skipentries) { - index += ce_list->skipentries; - ce_list->skipentries = 0; - } - - /* - * If there are more entries then save the last name. - * Key this behavior based on whether or not we observed EOFFLAG. - * - * Do not use the valence as a way to determine if we hit EOF, since - * it can be wrong. Use the catalog's output only. 
- */ - if ((*(eofflag) == 0) && lastdescp != NULL) { - - /* Remember last entry */ - if ((dirhint->dh_desc.cd_flags & CD_HASBUF) && - (dirhint->dh_desc.cd_nameptr != NULL)) { - dirhint->dh_desc.cd_flags &= ~CD_HASBUF; - vfs_removename((const char *)dirhint->dh_desc.cd_nameptr); - } - dirhint->dh_desc.cd_namelen = lastdescp->cd_namelen; - dirhint->dh_desc.cd_nameptr = (const u_int8_t *) - vfs_addname((const char *)lastdescp->cd_nameptr, lastdescp->cd_namelen, 0, 0); - dirhint->dh_desc.cd_flags |= CD_HASBUF; - dirhint->dh_index = index - 1; - dirhint->dh_desc.cd_cnid = lastdescp->cd_cnid; - dirhint->dh_desc.cd_hint = lastdescp->cd_hint; - dirhint->dh_desc.cd_encoding = lastdescp->cd_encoding; - } - - /* All done with the catalog descriptors. */ - for (i = 0; i < (int)ce_list->realentries; ++i) - cat_releasedesc(&ce_list->entry[i].ce_desc); - ce_list->realentries = 0; - - (void) hfs_lock(VTOC(dvp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); - dcp = VTOC(dvp); - -exit1: - /* Pack directory index and tag into uio_offset. */ - while (tag == 0) tag = (++dcp->c_dirhinttag) << HFS_INDEX_BITS; - uio_setoffset(uio, index | tag); - dirhint->dh_index |= tag; - -exit2: - if (newstate) - *newstate = dirchg; - - /* - * Drop directory hint on error or if there are no more entries, - * only if EOF was seen. - */ - if (dirhint) { - if ((error != 0) || *(eofflag)) - hfs_reldirhint(dcp, dirhint); - else - hfs_insertdirhint(dcp, dirhint); - } - if (namebuf) { - FREE(namebuf, M_TEMP); - vap->va_name = NULL; - } - if (attrbufptr) - FREE(attrbufptr, M_TEMP); - if (ce_list) - FREE(ce_list, M_TEMP); - - if (vap && *actualcount && error) - error = 0; - - hfs_unlock(dcp); - return (error); -} - - -/*==================== Attribute list support routines ====================*/ - -/* - * Pack cnode attributes into an attribute block. 
- */ -__private_extern__ -void -hfs_packattrblk(struct attrblock *abp, - struct hfsmount *hfsmp, - struct vnode *vp, - struct cat_desc *descp, - struct cat_attr *attrp, - struct cat_fork *datafork, - struct cat_fork *rsrcfork, - struct vfs_context *ctx) -{ - struct attrlist *attrlistp = abp->ab_attrlist; - - if (attrlistp->commonattr) - packcommonattr(abp, hfsmp, vp, descp, attrp, ctx); - - if (attrlistp->dirattr && S_ISDIR(attrp->ca_mode)) - packdirattr(abp, hfsmp, vp, descp,attrp); - - if (attrlistp->fileattr && !S_ISDIR(attrp->ca_mode)) - packfileattr(abp, hfsmp, attrp, datafork, rsrcfork, vp); -} - - -static char* -mountpointname(struct mount *mp) -{ - size_t namelength = strlen(mp->mnt_vfsstat.f_mntonname); - int foundchars = 0; - char *c; - - if (namelength == 0) - return (NULL); - - /* - * Look backwards through the name string, looking for - * the first slash encountered (which must precede the - * last part of the pathname). - */ - for (c = mp->mnt_vfsstat.f_mntonname + namelength - 1; - namelength > 0; --c, --namelength) { - if (*c != '/') { - foundchars = 1; - } else if (foundchars) { - return (c + 1); - } - } - - return (mp->mnt_vfsstat.f_mntonname); -} - - -static void -packnameattr( - struct attrblock *abp, - struct vnode *vp, - const u_int8_t *name, - int namelen) -{ - void *varbufptr; - struct attrreference * attr_refptr; - char *mpname; - size_t mpnamelen; - u_int32_t attrlength; - u_int8_t empty = 0; - - /* A cnode's name may be incorrect for the root of a mounted - * filesystem (it can be mounted on a different directory name - * than the name of the volume, such as "blah-1"). 
So for the - root directory, it's best to return the last element of the - location where the volume's mounted: - */ - if ((vp != NULL) && vnode_isvroot(vp) && - (mpname = mountpointname(vnode_mount(vp)))) { - mpnamelen = strlen(mpname); - - /* Trim off any trailing slashes: */ - while ((mpnamelen > 0) && (mpname[mpnamelen-1] == '/')) - --mpnamelen; - - /* If there's anything left, use it instead of the volume's name */ - if (mpnamelen > 0) { - name = (u_int8_t *)mpname; - namelen = mpnamelen; - } - } - if (name == NULL) { - name = &empty; - namelen = 0; - } - - varbufptr = *abp->ab_varbufpp; - attr_refptr = (struct attrreference *)(*abp->ab_attrbufpp); - - attrlength = namelen + 1; - attr_refptr->attr_dataoffset = (char *)varbufptr - (char *)attr_refptr; - attr_refptr->attr_length = attrlength; - (void) strncpy((char *)varbufptr, (const char *) name, attrlength); - /* - * Advance beyond the space just allocated and - * round up to the next 4-byte boundary: - */ - varbufptr = ((char *)varbufptr) + attrlength + ((4 - (attrlength & 3)) & 3); - ++attr_refptr; - - *abp->ab_attrbufpp = attr_refptr; - *abp->ab_varbufpp = varbufptr; -} - -static void -packcommonattr( - struct attrblock *abp, - struct hfsmount *hfsmp, - struct vnode *vp, - struct cat_desc * cdp, - struct cat_attr * cap, - struct vfs_context * ctx) -{ - attrgroup_t attr = abp->ab_attrlist->commonattr; - struct mount *mp = HFSTOVFS(hfsmp); - void *attrbufptr = *abp->ab_attrbufpp; - void *varbufptr = *abp->ab_varbufpp; - boolean_t is_64_bit = proc_is64bit(vfs_context_proc(ctx)); - uid_t cuid = 1; - int isroot = 0; - - if (attr & (ATTR_CMN_OWNERID | ATTR_CMN_GRPID)) { - cuid = kauth_cred_getuid(vfs_context_ucred(ctx)); - isroot = cuid == 0; - } - - if (ATTR_CMN_NAME & attr) { - packnameattr(abp, vp, cdp->cd_nameptr, cdp->cd_namelen); - attrbufptr = *abp->ab_attrbufpp; - varbufptr = *abp->ab_varbufpp; - } - if (ATTR_CMN_DEVID & attr) { - *((dev_t *)attrbufptr) = hfsmp->hfs_raw_dev; - attrbufptr = ((dev_t
*)attrbufptr) + 1; - } - if (ATTR_CMN_FSID & attr) { - fsid_t fsid; - - fsid.val[0] = hfsmp->hfs_raw_dev; - fsid.val[1] = vfs_typenum(mp); - *((fsid_t *)attrbufptr) = fsid; - attrbufptr = ((fsid_t *)attrbufptr) + 1; - } - if (ATTR_CMN_OBJTYPE & attr) { - *((fsobj_type_t *)attrbufptr) = IFTOVT(cap->ca_mode); - attrbufptr = ((fsobj_type_t *)attrbufptr) + 1; - } - if (ATTR_CMN_OBJTAG & attr) { - *((fsobj_tag_t *)attrbufptr) = VT_HFS; - attrbufptr = ((fsobj_tag_t *)attrbufptr) + 1; - } - /* - * Exporting file IDs from HFS Plus: - * - * For "normal" files the c_fileid is the same value as the - * c_cnid. But for hard link files, they are different - the - * c_cnid belongs to the active directory entry (ie the link) - * and the c_fileid is for the actual inode (ie the data file). - * - * The stat call (getattr) will always return the c_fileid - * and Carbon APIs, which are hardlink-ignorant, will always - * receive the c_cnid (from getattrlist). - */ - if (ATTR_CMN_OBJID & attr) { - ((fsobj_id_t *)attrbufptr)->fid_objno = cdp->cd_cnid; - ((fsobj_id_t *)attrbufptr)->fid_generation = 0; - attrbufptr = ((fsobj_id_t *)attrbufptr) + 1; - } - if (ATTR_CMN_OBJPERMANENTID & attr) { - ((fsobj_id_t *)attrbufptr)->fid_objno = cdp->cd_cnid; - ((fsobj_id_t *)attrbufptr)->fid_generation = 0; - attrbufptr = ((fsobj_id_t *)attrbufptr) + 1; - } - if (ATTR_CMN_PAROBJID & attr) { - ((fsobj_id_t *)attrbufptr)->fid_objno = cdp->cd_parentcnid; - ((fsobj_id_t *)attrbufptr)->fid_generation = 0; - attrbufptr = ((fsobj_id_t *)attrbufptr) + 1; - } - if (ATTR_CMN_SCRIPT & attr) { - *((text_encoding_t *)attrbufptr) = cdp->cd_encoding; - attrbufptr = ((text_encoding_t *)attrbufptr) + 1; - } - if (ATTR_CMN_CRTIME & attr) { - if (is_64_bit) { - ((struct user64_timespec *)attrbufptr)->tv_sec = cap->ca_itime; - ((struct user64_timespec *)attrbufptr)->tv_nsec = 0; - attrbufptr = ((struct user64_timespec *)attrbufptr) + 1; - } - else { - ((struct user32_timespec *)attrbufptr)->tv_sec = cap->ca_itime; - 
((struct user32_timespec *)attrbufptr)->tv_nsec = 0; - attrbufptr = ((struct user32_timespec *)attrbufptr) + 1; - } - } - if (ATTR_CMN_MODTIME & attr) { - if (is_64_bit) { - ((struct user64_timespec *)attrbufptr)->tv_sec = cap->ca_mtime; - ((struct user64_timespec *)attrbufptr)->tv_nsec = 0; - attrbufptr = ((struct user64_timespec *)attrbufptr) + 1; - } - else { - ((struct user32_timespec *)attrbufptr)->tv_sec = cap->ca_mtime; - ((struct user32_timespec *)attrbufptr)->tv_nsec = 0; - attrbufptr = ((struct user32_timespec *)attrbufptr) + 1; - } - } - if (ATTR_CMN_CHGTIME & attr) { - if (is_64_bit) { - ((struct user64_timespec *)attrbufptr)->tv_sec = cap->ca_ctime; - ((struct user64_timespec *)attrbufptr)->tv_nsec = 0; - attrbufptr = ((struct user64_timespec *)attrbufptr) + 1; - } - else { - ((struct user32_timespec *)attrbufptr)->tv_sec = cap->ca_ctime; - ((struct user32_timespec *)attrbufptr)->tv_nsec = 0; - attrbufptr = ((struct user32_timespec *)attrbufptr) + 1; - } - } - if (ATTR_CMN_ACCTIME & attr) { - if (is_64_bit) { - ((struct user64_timespec *)attrbufptr)->tv_sec = cap->ca_atime; - ((struct user64_timespec *)attrbufptr)->tv_nsec = 0; - attrbufptr = ((struct user64_timespec *)attrbufptr) + 1; - } - else { - ((struct user32_timespec *)attrbufptr)->tv_sec = cap->ca_atime; - ((struct user32_timespec *)attrbufptr)->tv_nsec = 0; - attrbufptr = ((struct user32_timespec *)attrbufptr) + 1; - } - } - if (ATTR_CMN_BKUPTIME & attr) { - if (is_64_bit) { - ((struct user64_timespec *)attrbufptr)->tv_sec = cap->ca_btime; - ((struct user64_timespec *)attrbufptr)->tv_nsec = 0; - attrbufptr = ((struct user64_timespec *)attrbufptr) + 1; - } - else { - ((struct user32_timespec *)attrbufptr)->tv_sec = cap->ca_btime; - ((struct user32_timespec *)attrbufptr)->tv_nsec = 0; - attrbufptr = ((struct user32_timespec *)attrbufptr) + 1; - } - } - if (ATTR_CMN_FNDRINFO & attr) { - u_int8_t *finfo = NULL; - bcopy(&cap->ca_finderinfo, attrbufptr, sizeof(u_int8_t) * 32); - finfo = 
(u_int8_t*)attrbufptr; - - /* Don't expose a symlink's private type/creator. */ - if (S_ISLNK(cap->ca_mode)) { - struct FndrFileInfo *fip; - - fip = (struct FndrFileInfo *)attrbufptr; - fip->fdType = 0; - fip->fdCreator = 0; - } - - /* advance 16 bytes into the attrbuf */ - finfo = finfo + 16; - - /* also don't expose the date_added or write_gen_counter fields */ - if (S_ISREG(cap->ca_mode) || S_ISLNK(cap->ca_mode)) { - struct FndrExtendedFileInfo *extinfo = (struct FndrExtendedFileInfo *)finfo; - extinfo->document_id = 0; - extinfo->date_added = 0; - extinfo->write_gen_counter = 0; - } - else if (S_ISDIR(cap->ca_mode)) { - struct FndrExtendedDirInfo *extinfo = (struct FndrExtendedDirInfo *)finfo; - extinfo->document_id = 0; - extinfo->date_added = 0; - extinfo->write_gen_counter = 0; - } - - attrbufptr = (char *)attrbufptr + sizeof(u_int8_t) * 32; - } - if (ATTR_CMN_OWNERID & attr) { - uid_t nuid = cap->ca_uid; - - if (!isroot) { - if (((unsigned int)vfs_flags(HFSTOVFS(hfsmp))) & MNT_UNKNOWNPERMISSIONS) - nuid = cuid; - else if (nuid == UNKNOWNUID) - nuid = cuid; - } - - *((uid_t *)attrbufptr) = nuid; - attrbufptr = ((uid_t *)attrbufptr) + 1; - } - if (ATTR_CMN_GRPID & attr) { - gid_t ngid = cap->ca_gid; - - if (!isroot) { - gid_t cgid = kauth_cred_getgid(vfs_context_ucred(ctx)); - if (((unsigned int)vfs_flags(HFSTOVFS(hfsmp))) & MNT_UNKNOWNPERMISSIONS) - ngid = cgid; - else if (ngid == UNKNOWNUID) - ngid = cgid; - } - - *((gid_t *)attrbufptr) = ngid; - attrbufptr = ((gid_t *)attrbufptr) + 1; - } - if (ATTR_CMN_ACCESSMASK & attr) { - /* - * [2856576] Since we are dynamically changing the owner, also - * effectively turn off the set-user-id and set-group-id bits, - * just like chmod(2) would when changing ownership. This prevents - * a security hole where set-user-id programs run as whoever is - * logged on (or root if nobody is logged in yet!) - */ - *((u_int32_t *)attrbufptr) = (cap->ca_uid == UNKNOWNUID) ? 
- cap->ca_mode & ~(S_ISUID | S_ISGID) : cap->ca_mode; - attrbufptr = ((u_int32_t *)attrbufptr) + 1; - } - if (ATTR_CMN_FLAGS & attr) { - *((u_int32_t *)attrbufptr) = cap->ca_flags; - attrbufptr = ((u_int32_t *)attrbufptr) + 1; - } - if (ATTR_CMN_USERACCESS & attr) { - u_int32_t user_access; - - /* Take the long path when we have an ACL */ - if ((vp != NULLVP) && (cap->ca_recflags & kHFSHasSecurityMask)) { - user_access = hfs_real_user_access(vp, abp->ab_context); - } else { - user_access = DerivePermissionSummary(cap->ca_uid, cap->ca_gid, - cap->ca_mode, mp, vfs_context_ucred(ctx), 0); - } - /* Also consider READ-ONLY file system. */ - if (vfs_flags(mp) & MNT_RDONLY) { - user_access &= ~W_OK; - } - /* Locked objects are not writable either */ - if ((cap->ca_flags & UF_IMMUTABLE) && (vfs_context_suser(abp->ab_context) != 0)) - user_access &= ~W_OK; - if ((cap->ca_flags & SF_IMMUTABLE) && (vfs_context_suser(abp->ab_context) == 0)) - user_access &= ~W_OK; - - *((u_int32_t *)attrbufptr) = user_access; - attrbufptr = ((u_int32_t *)attrbufptr) + 1; - } - if (ATTR_CMN_FILEID & attr) { - *((u_int64_t *)attrbufptr) = cap->ca_fileid; - attrbufptr = ((u_int64_t *)attrbufptr) + 1; - } - if (ATTR_CMN_PARENTID & attr) { - *((u_int64_t *)attrbufptr) = cdp->cd_parentcnid; - attrbufptr = ((u_int64_t *)attrbufptr) + 1; - } - - *abp->ab_attrbufpp = attrbufptr; - *abp->ab_varbufpp = varbufptr; -} - -static void -packdirattr( - struct attrblock *abp, - struct hfsmount *hfsmp, - struct vnode *vp, - struct cat_desc * descp, - struct cat_attr * cattrp) -{ - attrgroup_t attr = abp->ab_attrlist->dirattr; - void *attrbufptr = *abp->ab_attrbufpp; - u_int32_t entries; - - /* - * The DIR_LINKCOUNT is the count of real directory hard links. - * (i.e. its not the sum of the implied "." and ".." 
references - * typically used in stat's st_nlink field) - */ - if (ATTR_DIR_LINKCOUNT & attr) { - *((u_int32_t *)attrbufptr) = cattrp->ca_linkcount; - attrbufptr = ((u_int32_t *)attrbufptr) + 1; - } - if (ATTR_DIR_ENTRYCOUNT & attr) { - entries = cattrp->ca_entries; - - if (descp->cd_parentcnid == kHFSRootParentID) { - if (hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid != 0) - --entries; /* hide private dir */ - if (hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid != 0) - --entries; /* hide private dir */ - if (hfsmp->jnl || - ((hfsmp->vcbAtrb & kHFSVolumeJournaledMask) && - (hfsmp->hfs_flags & HFS_READ_ONLY))) - entries -= 2; /* hide the journal files */ - } - - *((u_int32_t *)attrbufptr) = entries; - attrbufptr = ((u_int32_t *)attrbufptr) + 1; - } - if (ATTR_DIR_MOUNTSTATUS & attr) { - if (vp != NULL && vnode_mountedhere(vp) != NULL) - *((u_int32_t *)attrbufptr) = DIR_MNTSTATUS_MNTPOINT; - else - *((u_int32_t *)attrbufptr) = 0; - attrbufptr = ((u_int32_t *)attrbufptr) + 1; - } - *abp->ab_attrbufpp = attrbufptr; -} - -static void -packfileattr( - struct attrblock *abp, - struct hfsmount *hfsmp, - struct cat_attr *cattrp, - struct cat_fork *datafork, - struct cat_fork *rsrcfork, - struct vnode *vp) -{ -#if !HFS_COMPRESSION -#pragma unused(vp) -#endif - attrgroup_t attr = abp->ab_attrlist->fileattr; - void *attrbufptr = *abp->ab_attrbufpp; - void *varbufptr = *abp->ab_varbufpp; - u_int32_t allocblksize; - - allocblksize = HFSTOVCB(hfsmp)->blockSize; - - off_t datasize = datafork->cf_size; - off_t totalsize = datasize + rsrcfork->cf_size; -#if HFS_COMPRESSION - int handle_compressed; - handle_compressed = (cattrp->ca_flags & UF_COMPRESSED);// && hfs_file_is_compressed(VTOC(vp), 1); - - if (handle_compressed) { - if (attr & (ATTR_FILE_DATALENGTH|ATTR_FILE_TOTALSIZE)) { - if ( 0 == hfs_uncompressed_size_of_compressed_file(hfsmp, vp, cattrp->ca_fileid, &datasize, 1) ) { /* 1 == don't take the cnode lock */ - /* total size of a compressed file is just the data size */ - 
totalsize = datasize; - } - } - } -#endif - - if (ATTR_FILE_LINKCOUNT & attr) { - *((u_int32_t *)attrbufptr) = cattrp->ca_linkcount; - attrbufptr = ((u_int32_t *)attrbufptr) + 1; - } - if (ATTR_FILE_TOTALSIZE & attr) { - *((off_t *)attrbufptr) = totalsize; - attrbufptr = ((off_t *)attrbufptr) + 1; - } - if (ATTR_FILE_ALLOCSIZE & attr) { - *((off_t *)attrbufptr) = - (off_t)cattrp->ca_blocks * (off_t)allocblksize; - attrbufptr = ((off_t *)attrbufptr) + 1; - } - if (ATTR_FILE_IOBLOCKSIZE & attr) { - *((u_int32_t *)attrbufptr) = hfsmp->hfs_logBlockSize; - attrbufptr = ((u_int32_t *)attrbufptr) + 1; - } - if (ATTR_FILE_CLUMPSIZE & attr) { - *((u_int32_t *)attrbufptr) = hfsmp->vcbClpSiz; - attrbufptr = ((u_int32_t *)attrbufptr) + 1; - } - if (ATTR_FILE_DEVTYPE & attr) { - if (S_ISBLK(cattrp->ca_mode) || S_ISCHR(cattrp->ca_mode)) - *((u_int32_t *)attrbufptr) = (u_int32_t)cattrp->ca_rdev; - else - *((u_int32_t *)attrbufptr) = 0; - attrbufptr = ((u_int32_t *)attrbufptr) + 1; - } - - if (ATTR_FILE_DATALENGTH & attr) { - *((off_t *)attrbufptr) = datasize; - attrbufptr = ((off_t *)attrbufptr) + 1; - } - -#if HFS_COMPRESSION - /* fake the data fork size on a decmpfs compressed file to reflect the - * uncompressed size. This ensures proper reading and copying of these files. - * NOTE: we may need to get the vnode here because the vnode parameter - * passed by hfs_vnop_readdirattr() may be null. 
- */ - - if ( handle_compressed ) { - if (attr & ATTR_FILE_DATAALLOCSIZE) { - *((off_t *)attrbufptr) = (off_t)rsrcfork->cf_blocks * (off_t)allocblksize; - attrbufptr = ((off_t *)attrbufptr) + 1; - } - if (attr & ATTR_FILE_RSRCLENGTH) { - *((off_t *)attrbufptr) = 0; - attrbufptr = ((off_t *)attrbufptr) + 1; - } - if (attr & ATTR_FILE_RSRCALLOCSIZE) { - *((off_t *)attrbufptr) = 0; - attrbufptr = ((off_t *)attrbufptr) + 1; - } - } - else -#endif - { - if (ATTR_FILE_DATAALLOCSIZE & attr) { - *((off_t *)attrbufptr) = (off_t)datafork->cf_blocks * (off_t)allocblksize; - attrbufptr = ((off_t *)attrbufptr) + 1; - } - if (ATTR_FILE_RSRCLENGTH & attr) { - *((off_t *)attrbufptr) = rsrcfork->cf_size; - attrbufptr = ((off_t *)attrbufptr) + 1; - } - if (ATTR_FILE_RSRCALLOCSIZE & attr) { - *((off_t *)attrbufptr) = (off_t)rsrcfork->cf_blocks * (off_t)allocblksize; - attrbufptr = ((off_t *)attrbufptr) + 1; - } - } - *abp->ab_attrbufpp = attrbufptr; - *abp->ab_varbufpp = varbufptr; -} - -/* - * Calculate the total size of an attribute block. 
- */ -__private_extern__ -int -hfs_attrblksize(struct attrlist *attrlist) -{ - int size; - attrgroup_t a; - int sizeof_timespec; - boolean_t is_64_bit = proc_is64bit(current_proc()); - - if (is_64_bit) - sizeof_timespec = sizeof(struct user64_timespec); - else - sizeof_timespec = sizeof(struct user32_timespec); - - DBG_ASSERT((attrlist->commonattr & ~ATTR_CMN_VALIDMASK) == 0); - - DBG_ASSERT((attrlist->volattr & ~ATTR_VOL_VALIDMASK) == 0); - - DBG_ASSERT((attrlist->dirattr & ~ATTR_DIR_VALIDMASK) == 0); - - DBG_ASSERT((attrlist->fileattr & ~ATTR_FILE_VALIDMASK) == 0); - - DBG_ASSERT((attrlist->forkattr & ~ATTR_FORK_VALIDMASK) == 0); - - size = 0; - - if ((a = attrlist->commonattr) != 0) { - if (a & ATTR_CMN_NAME) size += sizeof(struct attrreference); - if (a & ATTR_CMN_DEVID) size += sizeof(dev_t); - if (a & ATTR_CMN_FSID) size += sizeof(fsid_t); - if (a & ATTR_CMN_OBJTYPE) size += sizeof(fsobj_type_t); - if (a & ATTR_CMN_OBJTAG) size += sizeof(fsobj_tag_t); - if (a & ATTR_CMN_OBJID) size += sizeof(fsobj_id_t); - if (a & ATTR_CMN_OBJPERMANENTID) size += sizeof(fsobj_id_t); - if (a & ATTR_CMN_PAROBJID) size += sizeof(fsobj_id_t); - if (a & ATTR_CMN_SCRIPT) size += sizeof(text_encoding_t); - if (a & ATTR_CMN_CRTIME) size += sizeof_timespec; - if (a & ATTR_CMN_MODTIME) size += sizeof_timespec; - if (a & ATTR_CMN_CHGTIME) size += sizeof_timespec; - if (a & ATTR_CMN_ACCTIME) size += sizeof_timespec; - if (a & ATTR_CMN_BKUPTIME) size += sizeof_timespec; - if (a & ATTR_CMN_FNDRINFO) size += 32 * sizeof(u_int8_t); - if (a & ATTR_CMN_OWNERID) size += sizeof(uid_t); - if (a & ATTR_CMN_GRPID) size += sizeof(gid_t); - if (a & ATTR_CMN_ACCESSMASK) size += sizeof(u_int32_t); - if (a & ATTR_CMN_FLAGS) size += sizeof(u_int32_t); - if (a & ATTR_CMN_USERACCESS) size += sizeof(u_int32_t); - if (a & ATTR_CMN_FILEID) size += sizeof(u_int64_t); - if (a & ATTR_CMN_PARENTID) size += sizeof(u_int64_t); - } - if ((a = attrlist->dirattr) != 0) { - if (a & ATTR_DIR_LINKCOUNT) size += 
sizeof(u_int32_t); - if (a & ATTR_DIR_ENTRYCOUNT) size += sizeof(u_int32_t); - if (a & ATTR_DIR_MOUNTSTATUS) size += sizeof(u_int32_t); - } - if ((a = attrlist->fileattr) != 0) { - if (a & ATTR_FILE_LINKCOUNT) size += sizeof(u_int32_t); - if (a & ATTR_FILE_TOTALSIZE) size += sizeof(off_t); - if (a & ATTR_FILE_ALLOCSIZE) size += sizeof(off_t); - if (a & ATTR_FILE_IOBLOCKSIZE) size += sizeof(u_int32_t); - if (a & ATTR_FILE_CLUMPSIZE) size += sizeof(u_int32_t); - if (a & ATTR_FILE_DEVTYPE) size += sizeof(u_int32_t); - if (a & ATTR_FILE_DATALENGTH) size += sizeof(off_t); - if (a & ATTR_FILE_DATAALLOCSIZE) size += sizeof(off_t); - if (a & ATTR_FILE_RSRCLENGTH) size += sizeof(off_t); - if (a & ATTR_FILE_RSRCALLOCSIZE) size += sizeof(off_t); - } - - return (size); -} - -#define KAUTH_DIR_WRITE_RIGHTS (KAUTH_VNODE_ACCESS | KAUTH_VNODE_ADD_FILE | \ - KAUTH_VNODE_ADD_SUBDIRECTORY | \ - KAUTH_VNODE_DELETE_CHILD) - -#define KAUTH_DIR_READ_RIGHTS (KAUTH_VNODE_ACCESS | KAUTH_VNODE_LIST_DIRECTORY) - -#define KAUTH_DIR_EXECUTE_RIGHTS (KAUTH_VNODE_ACCESS | KAUTH_VNODE_SEARCH) - -#define KAUTH_FILE_WRITE_RIGHTS (KAUTH_VNODE_ACCESS | KAUTH_VNODE_WRITE_DATA) - -#define KAUTH_FILE_READRIGHTS (KAUTH_VNODE_ACCESS | KAUTH_VNODE_READ_DATA) - -#define KAUTH_FILE_EXECUTE_RIGHTS (KAUTH_VNODE_ACCESS | KAUTH_VNODE_EXECUTE) - - -/* - * Compute the same [expensive] user_access value as getattrlist does - */ -static u_int32_t -hfs_real_user_access(vnode_t vp, vfs_context_t ctx) -{ - u_int32_t user_access = 0; - - if (vnode_isdir(vp)) { - if (vnode_authorize(vp, NULLVP, KAUTH_DIR_WRITE_RIGHTS, ctx) == 0) - user_access |= W_OK; - if (vnode_authorize(vp, NULLVP, KAUTH_DIR_READ_RIGHTS, ctx) == 0) - user_access |= R_OK; - if (vnode_authorize(vp, NULLVP, KAUTH_DIR_EXECUTE_RIGHTS, ctx) == 0) - user_access |= X_OK; - } else { - if (vnode_authorize(vp, NULLVP, KAUTH_FILE_WRITE_RIGHTS, ctx) == 0) - user_access |= W_OK; - if (vnode_authorize(vp, NULLVP, KAUTH_FILE_READRIGHTS, ctx) == 0) - user_access |= 
R_OK; - if (vnode_authorize(vp, NULLVP, KAUTH_FILE_EXECUTE_RIGHTS, ctx) == 0) - user_access |= X_OK; - } - return (user_access); -} - - -u_int32_t -DerivePermissionSummary(uid_t obj_uid, gid_t obj_gid, mode_t obj_mode, - struct mount *mp, kauth_cred_t cred, __unused struct proc *p) -{ - u_int32_t permissions; - - if (obj_uid == UNKNOWNUID) - obj_uid = kauth_cred_getuid(cred); - - /* User id 0 (root) always gets access. */ - if (!suser(cred, NULL)) { - permissions = R_OK | W_OK | X_OK; - goto Exit; - }; - - /* Otherwise, check the owner. */ - if (hfs_owner_rights(VFSTOHFS(mp), obj_uid, cred, NULL, false) == 0) { - permissions = ((u_int32_t)obj_mode & S_IRWXU) >> 6; - goto Exit; - } - - /* Otherwise, check the groups. */ - if (! (((unsigned int)vfs_flags(mp)) & MNT_UNKNOWNPERMISSIONS)) { - int is_member; - - if (kauth_cred_ismember_gid(cred, obj_gid, &is_member) == 0 && is_member) { - permissions = ((u_int32_t)obj_mode & S_IRWXG) >> 3; - goto Exit; - } - } - - /* Otherwise, settle for 'others' access. */ - permissions = (u_int32_t)obj_mode & S_IRWXO; - -Exit: - return (permissions); -} - - -/* - * =========================================================================== - * Support functions for filling up a vnode_attr structure based on attributes - * requested. 
- * =========================================================================== - */ -void -get_vattr_data_for_attrs(struct attrlist *alp, struct vnode_attr *vap, - struct hfsmount *hfsmp, struct vnode *vp, struct cat_desc *descp, - struct cat_attr *atrp, struct cat_fork *datafork, struct cat_fork *rsrcfork, - vfs_context_t ctx) -{ - if (alp->commonattr) - vattr_data_for_common_attrs(alp, vap, hfsmp, vp, descp, atrp, - ctx); - - if (alp->dirattr && S_ISDIR(atrp->ca_mode)) - vattr_data_for_dir_attrs(alp, vap, hfsmp, vp, descp, atrp); - - if (alp->fileattr && !S_ISDIR(atrp->ca_mode)) { - vattr_data_for_file_attrs(alp, vap, hfsmp, atrp, datafork, - rsrcfork, vp); - } -} - -static void -copy_name_attr(struct vnode_attr *vap, struct vnode *vp, const u_int8_t *name, - int namelen) -{ - char *mpname; - size_t mpnamelen; - u_int32_t attrlength; - u_int8_t empty = 0; - - /* A cnode's name may be incorrect for the root of a mounted - * filesystem (it can be mounted on a different directory name - * than the name of the volume, such as "blah-1"). So for the - * root directory, it's best to return the last element of the - location where the volume's mounted: - */ - if ((vp != NULL) && vnode_isvroot(vp) && - (mpname = mountpointname(vnode_mount(vp)))) { - mpnamelen = strlen(mpname); - - /* Trim off any trailing slashes: */ - while ((mpnamelen > 0) && (mpname[mpnamelen-1] == '/')) - --mpnamelen; - - /* If there's anything left, use it instead of the volume's name */ - if (mpnamelen > 0) { - name = (u_int8_t *)mpname; - namelen = mpnamelen; - } - } - - if (name == NULL) { - name = &empty; - namelen = 0; - } - - attrlength = namelen + 1; - (void) strncpy((char *)vap->va_name, (const char *) name, attrlength); - /* - * round upto 8 and zero out the rounded up bytes.
- */ - attrlength = min(kHFSPlusMaxFileNameBytes, ((attrlength + 7) & ~0x07)); - bzero(vap->va_name + attrlength, kHFSPlusMaxFileNameBytes - attrlength); -} - -static void -vattr_data_for_common_attrs( struct attrlist *alp, struct vnode_attr *vap, - struct hfsmount *hfsmp, struct vnode *vp, struct cat_desc *cdp, - struct cat_attr *cap, vfs_context_t ctx) -{ - attrgroup_t attr = alp->commonattr; - struct mount *mp = HFSTOVFS(hfsmp); - uid_t cuid = 1; - int isroot = 0; - - if (attr & (ATTR_CMN_OWNERID | ATTR_CMN_GRPID)) { - cuid = kauth_cred_getuid(vfs_context_ucred(ctx)); - isroot = cuid == 0; - } - - if (ATTR_CMN_NAME & attr) { - if (vap->va_name) { - copy_name_attr(vap, vp, cdp->cd_nameptr, - cdp->cd_namelen); - VATTR_SET_SUPPORTED(vap, va_name); - } else { - VATTR_CLEAR_SUPPORTED(vap, va_name); - } - } - - if (ATTR_CMN_DEVID & attr) { - vap->va_devid = hfsmp->hfs_raw_dev; - VATTR_SET_SUPPORTED(vap, va_devid); - } - - if (ATTR_CMN_FSID & attr) { - vap->va_fsid64.val[0] = hfsmp->hfs_raw_dev; - vap->va_fsid64.val[1] = vfs_typenum(mp); - VATTR_SET_SUPPORTED(vap, va_fsid64); - } - /* - * We always provide the objtype even if not asked because VFS helper - * functions depend on knowing the object's type. - */ - vap->va_objtype = IFTOVT(cap->ca_mode); - VATTR_SET_SUPPORTED(vap, va_objtype); - - if (ATTR_CMN_OBJTAG & attr) { - vap->va_objtag = VT_HFS; - VATTR_SET_SUPPORTED(vap, va_objtag); - } - /* - * Exporting file IDs from HFS Plus: - * - * For "normal" files the c_fileid is the same value as the - * c_cnid. But for hard link files, they are different - the - * c_cnid belongs to the active directory entry (ie the link) - * and the c_fileid is for the actual inode (ie the data file). - * - * The stat call (getattr) will always return the c_fileid - * and Carbon APIs, which are hardlink-ignorant, will always - * receive the c_cnid (from getattrlist). 
- */ - if ((ATTR_CMN_OBJID & attr) || - (ATTR_CMN_OBJPERMANENTID & attr)) { - vap->va_linkid = cdp->cd_cnid; - VATTR_SET_SUPPORTED(vap, va_linkid); - } - - if (ATTR_CMN_PAROBJID & attr) { - vap->va_parentid = cdp->cd_parentcnid; - VATTR_SET_SUPPORTED(vap, va_parentid); - } - - if (ATTR_CMN_SCRIPT & attr) { - vap->va_encoding = cdp->cd_encoding; - VATTR_SET_SUPPORTED(vap, va_encoding); - } - - if (ATTR_CMN_CRTIME & attr) { - vap->va_create_time.tv_sec = cap->ca_itime; - vap->va_create_time.tv_nsec = 0; - VATTR_SET_SUPPORTED(vap, va_create_time); - } - - if (ATTR_CMN_MODTIME & attr) { - vap->va_modify_time.tv_sec = cap->ca_mtime; - vap->va_modify_time.tv_nsec = 0; - VATTR_SET_SUPPORTED(vap, va_modify_time); - } - - if (ATTR_CMN_CHGTIME & attr) { - vap->va_change_time.tv_sec = cap->ca_ctime; - vap->va_change_time.tv_nsec = 0; - VATTR_SET_SUPPORTED(vap, va_change_time); - } - - if (ATTR_CMN_ACCTIME & attr) { - vap->va_access_time.tv_sec = cap->ca_atime; - vap->va_access_time.tv_nsec = 0; - VATTR_SET_SUPPORTED(vap, va_access_time); - } - - if (ATTR_CMN_BKUPTIME & attr) { - vap->va_backup_time.tv_sec = cap->ca_btime; - vap->va_backup_time.tv_nsec = 0; - VATTR_SET_SUPPORTED(vap, va_backup_time); - } - - if (ATTR_CMN_FNDRINFO & attr) { - u_int8_t *finfo = NULL; - - bcopy(&cap->ca_finderinfo, &vap->va_finderinfo[0], - sizeof(u_int8_t) * 32); - finfo = (u_int8_t*)(&vap->va_finderinfo[0]); - - /* Don't expose a symlink's private type/creator. 
*/ - if (S_ISLNK(cap->ca_mode)) { - struct FndrFileInfo *fip; - - fip = (struct FndrFileInfo *)finfo; - fip->fdType = 0; - fip->fdCreator = 0; - } - - /* advance 16 bytes into the attrbuf */ - finfo = finfo + 16; - - /* also don't expose the date_added or write_gen_counter fields */ - if (S_ISREG(cap->ca_mode) || S_ISLNK(cap->ca_mode)) { - struct FndrExtendedFileInfo *extinfo = - (struct FndrExtendedFileInfo *)finfo; - extinfo->document_id = 0; - extinfo->date_added = 0; - extinfo->write_gen_counter = 0; - } else if (S_ISDIR(cap->ca_mode)) { - struct FndrExtendedDirInfo *extinfo = - (struct FndrExtendedDirInfo *)finfo; - extinfo->document_id = 0; - extinfo->date_added = 0; - extinfo->write_gen_counter = 0; - } - - VATTR_SET_SUPPORTED(vap, va_finderinfo); - } - - if (ATTR_CMN_OWNERID & attr) { - uid_t nuid = cap->ca_uid; - - if (!isroot) { - if (((unsigned int)vfs_flags(HFSTOVFS(hfsmp))) & MNT_UNKNOWNPERMISSIONS) - nuid = cuid; - else if (nuid == UNKNOWNUID) - nuid = cuid; - } - - vap->va_uid = nuid; - VATTR_SET_SUPPORTED(vap, va_uid); - } - - if (ATTR_CMN_GRPID & attr) { - gid_t ngid = cap->ca_gid; - - if (!isroot) { - gid_t cgid = kauth_cred_getgid(vfs_context_ucred(ctx)); - if (((unsigned int)vfs_flags(HFSTOVFS(hfsmp))) & MNT_UNKNOWNPERMISSIONS) - ngid = cgid; - else if (ngid == UNKNOWNUID) - ngid = cgid; - } - - vap->va_gid = ngid; - VATTR_SET_SUPPORTED(vap, va_gid); - } - - if (ATTR_CMN_ACCESSMASK & attr) { - uint32_t nmode; - /* - * [2856576] Since we are dynamically changing the owner, also - * effectively turn off the set-user-id and set-group-id bits, - * just like chmod(2) would when changing ownership. This prevents - * a security hole where set-user-id programs run as whoever is - * logged on (or root if nobody is logged in yet!) - */ - nmode = (cap->ca_uid == UNKNOWNUID) ? 
- cap->ca_mode & ~(S_ISUID | S_ISGID) : cap->ca_mode; - - vap->va_mode = nmode; - VATTR_SET_SUPPORTED(vap, va_mode); - } - - if (ATTR_CMN_FLAGS & attr) { - vap->va_flags = cap->ca_flags; - VATTR_SET_SUPPORTED(vap, va_flags); - } - - if (ATTR_CMN_GEN_COUNT & attr) { - vap->va_write_gencount = hfs_get_gencount_from_blob( - (const uint8_t *)cap->ca_finderinfo, cap->ca_mode); - VATTR_SET_SUPPORTED(vap, va_write_gencount); - } - - if (ATTR_CMN_DOCUMENT_ID & attr) { - vap->va_document_id = hfs_get_document_id_from_blob( - (const uint8_t *)cap->ca_finderinfo, cap->ca_mode); - VATTR_SET_SUPPORTED(vap, va_document_id); - } - - if (ATTR_CMN_USERACCESS & attr) { - u_int32_t user_access; - - /* Take the long path when we have an ACL */ - if ((vp != NULLVP) && (cap->ca_recflags & kHFSHasSecurityMask)) { - user_access = hfs_real_user_access(vp, ctx); - } else { - user_access = DerivePermissionSummary(cap->ca_uid, cap->ca_gid, - cap->ca_mode, mp, vfs_context_ucred(ctx), 0); - } - /* Also consider READ-ONLY file system. */ - if (vfs_flags(mp) & MNT_RDONLY) { - user_access &= ~W_OK; - } - /* Locked objects are not writable either */ - if ((cap->ca_flags & UF_IMMUTABLE) && (vfs_context_suser(ctx) != 0)) - user_access &= ~W_OK; - if ((cap->ca_flags & SF_IMMUTABLE) && (vfs_context_suser(ctx) == 0)) - user_access &= ~W_OK; - - vap->va_user_access = user_access; - VATTR_SET_SUPPORTED(vap, va_user_access); - } - - /* - * Right now the best we can do is tell if we *don't* have extended - * security (like hfs_vnop_getattr). 
- */ - if (ATTR_CMN_EXTENDED_SECURITY & attr) { - if (!(cap->ca_recflags & kHFSHasSecurityMask)) { - vap->va_acl = (kauth_acl_t) KAUTH_FILESEC_NONE; - VATTR_SET_SUPPORTED(vap, va_acl); - } - } - - if (ATTR_CMN_FILEID & attr) { - vap->va_fileid = cap->ca_fileid; - VATTR_SET_SUPPORTED(vap, va_fileid); - } - - if (ATTR_CMN_PARENTID & attr) { - vap->va_parentid = cdp->cd_parentcnid; - VATTR_SET_SUPPORTED(vap, va_parentid); - } - - if (ATTR_CMN_ADDEDTIME & attr) { - if (cap->ca_recflags & kHFSHasDateAddedMask) { - vap->va_addedtime.tv_sec = hfs_get_dateadded_from_blob( - (const uint8_t *)cap->ca_finderinfo, cap->ca_mode); - vap->va_addedtime.tv_nsec = 0; - VATTR_SET_SUPPORTED(vap, va_addedtime); - } - } -} - -static void -vattr_data_for_dir_attrs(struct attrlist *alp, struct vnode_attr *vap, - struct hfsmount *hfsmp, struct vnode *vp, struct cat_desc * descp, - struct cat_attr * cattrp) -{ - attrgroup_t attr = alp->dirattr; - u_int32_t entries; - - /* - * The DIR_LINKCOUNT is the count of real directory hard links. - * (i.e. its not the sum of the implied "." and ".." references - * typically used in stat's st_nlink field) - */ - if (ATTR_DIR_LINKCOUNT & attr) { - vap->va_dirlinkcount = cattrp->ca_linkcount; - VATTR_SET_SUPPORTED(vap, va_dirlinkcount); - } - if (ATTR_DIR_ENTRYCOUNT & attr) { - entries = cattrp->ca_entries; - - if (descp->cd_parentcnid == kHFSRootParentID) { - if (hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid != 0) - --entries; /* hide private dir */ - if (hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid != 0) - --entries; /* hide private dir */ - if (hfsmp->jnl || - ((hfsmp->vcbAtrb & kHFSVolumeJournaledMask) && - (hfsmp->hfs_flags & HFS_READ_ONLY))) - entries -= 2; /* hide the journal files */ - } - - vap->va_nchildren = entries; - VATTR_SET_SUPPORTED(vap, va_nchildren); - } - - if (ATTR_DIR_MOUNTSTATUS & attr) { - /* - * There is not vnode_attr for mount point status. - * XXX. Should there be ? 
- */ - u_int32_t mstatus = 0; - - if (vp != NULL && vnode_mountedhere(vp) != NULL) - mstatus = DIR_MNTSTATUS_MNTPOINT; - } -} - -static void -vattr_data_for_file_attrs(struct attrlist *alp, struct vnode_attr *vap, - struct hfsmount *hfsmp, struct cat_attr *cattrp, struct cat_fork *datafork, - struct cat_fork *rsrcfork, struct vnode *vp) -{ -#if !HFS_COMPRESSION -#pragma unused(vp) -#endif - attrgroup_t attr = alp->fileattr; - off_t da_size, rsrc_len, rsrc_alloc; - u_int32_t allocblksize; - - allocblksize = HFSTOVCB(hfsmp)->blockSize; - - off_t datasize = datafork->cf_size; - off_t totalsize = datasize + rsrcfork->cf_size; -#if HFS_COMPRESSION - int handle_compressed; - handle_compressed = (cattrp->ca_flags & UF_COMPRESSED);// && hfs_file_is_compressed(VTOC(vp), 1); - - if (handle_compressed) { - if (attr & (ATTR_FILE_DATALENGTH|ATTR_FILE_TOTALSIZE)) { - if ( 0 == hfs_uncompressed_size_of_compressed_file(hfsmp, vp, cattrp->ca_fileid, &datasize, 1) ) { /* 1 == don't take the cnode lock */ - /* total size of a compressed file is just the data size */ - totalsize = datasize; - } - } - } -#endif - - if (ATTR_FILE_LINKCOUNT & attr) { - vap->va_nlink = cattrp->ca_linkcount; - VATTR_SET_SUPPORTED(vap, va_nlink); - } - if (ATTR_FILE_TOTALSIZE & attr) { - VATTR_RETURN(vap, va_total_size, totalsize); - } - if (ATTR_FILE_ALLOCSIZE & attr) { - VATTR_RETURN(vap, va_total_alloc, - (off_t)cattrp->ca_blocks * (off_t)allocblksize ); - } - if (ATTR_FILE_IOBLOCKSIZE & attr) { - VATTR_RETURN(vap, va_iosize, hfsmp->hfs_logBlockSize); - } - - /* ATTR_FILE_CLUMPSIZE is obsolete */ - - if (ATTR_FILE_DEVTYPE & attr) { - dev_t dev = 0; - - if (S_ISBLK(cattrp->ca_mode) || S_ISCHR(cattrp->ca_mode)) - dev = (u_int32_t)cattrp->ca_rdev; - - VATTR_RETURN(vap, va_rdev, dev); - } - - if (ATTR_FILE_DATALENGTH & attr) { - VATTR_RETURN(vap, va_data_size, datasize); - } -#if HFS_COMPRESSION - /* fake the data fork size on a decmpfs compressed file to reflect the - * uncompressed size. 
This ensures proper reading and copying of these - * files. - * NOTE: we may need to get the vnode here because the vnode parameter - * passed by hfs_vnop_readdirattr() may be null. - */ - - if (handle_compressed) { - da_size = (off_t)rsrcfork->cf_blocks * (off_t)allocblksize; - rsrc_len = 0; - rsrc_alloc = 0; - } - else -#endif - { - da_size = (off_t)datafork->cf_blocks * (off_t)allocblksize; - rsrc_len = rsrcfork->cf_size; - rsrc_alloc = (off_t)rsrcfork->cf_blocks * (off_t)allocblksize; - } - - if (ATTR_FILE_DATAALLOCSIZE & attr) { - VATTR_RETURN(vap, va_data_alloc, da_size); - } - - if (ATTR_FILE_RSRCLENGTH & attr) { - VATTR_RETURN(vap, va_rsrc_length, rsrc_len); - } - - if (ATTR_FILE_RSRCALLOCSIZE & attr) { - VATTR_RETURN(vap, va_rsrc_alloc, rsrc_alloc); - } -} diff --git a/bsd/hfs/hfs_attrlist.h b/bsd/hfs/hfs_attrlist.h deleted file mode 100644 index cb72bce1e..000000000 --- a/bsd/hfs/hfs_attrlist.h +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Copyright (c) 2002-2007 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -#ifndef _HFS_ATTRLIST_H_ -#define _HFS_ATTRLIST_H_ - -#include - -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE -#include -#include - -#include -#include - - -struct attrblock { - struct attrlist * ab_attrlist; - void ** ab_attrbufpp; - void ** ab_varbufpp; - int ab_flags; - int ab_blocksize; - vfs_context_t ab_context; -}; - -/* - * The following define the attributes that HFS supports: - */ - -#define HFS_ATTR_CMN_VALID \ - (ATTR_CMN_NAME | ATTR_CMN_DEVID | \ - ATTR_CMN_FSID | ATTR_CMN_OBJTYPE | \ - ATTR_CMN_OBJTAG | ATTR_CMN_OBJID | \ - ATTR_CMN_OBJPERMANENTID | ATTR_CMN_PAROBJID | \ - ATTR_CMN_SCRIPT | ATTR_CMN_CRTIME | \ - ATTR_CMN_MODTIME | ATTR_CMN_CHGTIME | \ - ATTR_CMN_ACCTIME | ATTR_CMN_BKUPTIME | \ - ATTR_CMN_FNDRINFO |ATTR_CMN_OWNERID | \ - ATTR_CMN_GRPID | ATTR_CMN_ACCESSMASK | \ - ATTR_CMN_FLAGS | ATTR_CMN_USERACCESS | \ - ATTR_CMN_FILEID | ATTR_CMN_PARENTID ) - -#define HFS_ATTR_CMN_SEARCH_VALID \ - (ATTR_CMN_NAME | ATTR_CMN_OBJID | \ - ATTR_CMN_PAROBJID | ATTR_CMN_CRTIME | \ - ATTR_CMN_MODTIME | ATTR_CMN_CHGTIME | \ - ATTR_CMN_ACCTIME | ATTR_CMN_BKUPTIME | \ - ATTR_CMN_FNDRINFO | ATTR_CMN_OWNERID | \ - ATTR_CMN_GRPID | ATTR_CMN_ACCESSMASK | \ - ATTR_CMN_FILEID | ATTR_CMN_PARENTID ) - - - -#define HFS_ATTR_DIR_VALID \ - (ATTR_DIR_LINKCOUNT | ATTR_DIR_ENTRYCOUNT | ATTR_DIR_MOUNTSTATUS) - -#define HFS_ATTR_DIR_SEARCH_VALID \ - (ATTR_DIR_ENTRYCOUNT) - -#define HFS_ATTR_FILE_VALID \ - (ATTR_FILE_LINKCOUNT |ATTR_FILE_TOTALSIZE | \ - 
ATTR_FILE_ALLOCSIZE | ATTR_FILE_IOBLOCKSIZE | \ - ATTR_FILE_CLUMPSIZE | ATTR_FILE_DEVTYPE | \ - ATTR_FILE_DATALENGTH | ATTR_FILE_DATAALLOCSIZE | \ - ATTR_FILE_RSRCLENGTH | ATTR_FILE_RSRCALLOCSIZE) - -#define HFS_ATTR_FILE_SEARCH_VALID \ - (ATTR_FILE_DATALENGTH | ATTR_FILE_DATAALLOCSIZE | \ - ATTR_FILE_RSRCLENGTH | ATTR_FILE_RSRCALLOCSIZE ) - -extern int hfs_attrblksize(struct attrlist *attrlist); - -extern u_int32_t DerivePermissionSummary(uid_t obj_uid, gid_t obj_gid, - mode_t obj_mode, struct mount *mp, - kauth_cred_t cred, struct proc *p); - -extern void hfs_packattrblk(struct attrblock *abp, struct hfsmount *hfsmp, - struct vnode *vp, struct cat_desc *descp, struct cat_attr *attrp, - struct cat_fork *datafork, struct cat_fork *rsrcfork, struct vfs_context *ctx); - -#endif /* __APPLE_API_PRIVATE */ -#endif /* KERNEL */ -#endif /* ! _HFS_ATTRLIST_H_ */ diff --git a/bsd/hfs/hfs_btreeio.c b/bsd/hfs/hfs_btreeio.c deleted file mode 100644 index f6084e31f..000000000 --- a/bsd/hfs/hfs_btreeio.c +++ /dev/null @@ -1,932 +0,0 @@ -/* - * Copyright (c) 2000-2011 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include -#include -#include -#include -#include -#include -#include -#include - - -#include "hfs.h" -#include "hfs_cnode.h" -#include "hfs_dbg.h" -#include "hfs_endian.h" -#include "hfs_btreeio.h" - -#include "hfscommon/headers/FileMgrInternal.h" -#include "hfscommon/headers/BTreesPrivate.h" - -#define FORCESYNCBTREEWRITES 0 - -/* From bsd/vfs/vfs_bio.c */ -extern int bdwrite_internal(struct buf *, int); - -static int ClearBTNodes(struct vnode *vp, int blksize, off_t offset, off_t amount); -static int btree_journal_modify_block_end(struct hfsmount *hfsmp, struct buf *bp); - -void btree_swap_node(struct buf *bp, __unused void *arg); - -/* - * Return btree node size for given vnode. - * - * Returns: - * For btree vnode, returns btree node size. - * For non-btree vnodes, returns 0. 
- */ -u_int16_t get_btree_nodesize(struct vnode *vp) -{ - BTreeControlBlockPtr btree; - u_int16_t node_size = 0; - - if (vnode_issystem(vp)) { - btree = (BTreeControlBlockPtr) VTOF(vp)->fcbBTCBPtr; - if (btree) { - node_size = btree->nodeSize; - } - } - - return node_size; -} - -OSStatus SetBTreeBlockSize(FileReference vp, ByteCount blockSize, __unused ItemCount minBlockCount) -{ - BTreeControlBlockPtr bTreePtr; - - DBG_ASSERT(vp != NULL); - DBG_ASSERT(blockSize >= kMinNodeSize); - if (blockSize > MAXBSIZE ) - return (fsBTBadNodeSize); - - bTreePtr = (BTreeControlBlockPtr)VTOF(vp)->fcbBTCBPtr; - bTreePtr->nodeSize = blockSize; - - return (E_NONE); -} - - -OSStatus GetBTreeBlock(FileReference vp, u_int32_t blockNum, GetBlockOptions options, BlockDescriptor *block) -{ - OSStatus retval = E_NONE; - struct buf *bp = NULL; - u_int8_t allow_empty_node; - - /* If the btree block is being read using hint, it is - * fine for the swap code to find zeroed out nodes. - */ - if (options & kGetBlockHint) { - allow_empty_node = true; - } else { - allow_empty_node = false; - } - - if (options & kGetEmptyBlock) { - daddr64_t blkno; - off_t offset; - - offset = (daddr64_t)blockNum * (daddr64_t)block->blockSize; - bp = buf_getblk(vp, (daddr64_t)blockNum, block->blockSize, 0, 0, BLK_META); - if (bp && - VNOP_BLOCKMAP(vp, offset, block->blockSize, &blkno, NULL, NULL, 0, NULL) == 0) { - buf_setblkno(bp, blkno); - } - } else { - retval = buf_meta_bread(vp, (daddr64_t)blockNum, block->blockSize, NOCRED, &bp); - } - if (bp == NULL) - retval = -1; //XXX need better error - - if (retval == E_NONE) { - block->blockHeader = bp; - block->buffer = (char *)buf_dataptr(bp); - block->blockNum = buf_lblkno(bp); - block->blockReadFromDisk = (buf_fromcache(bp) == 0); /* not found in cache ==> came from disk */ - - // XXXdbg - block->isModified = 0; - - /* Check and endian swap B-Tree node (only if it's a valid block) */ - if (!(options & kGetEmptyBlock)) { - - /* This happens when we first open the 
b-tree, we might not have all the node data on hand */ - if ((((BTNodeDescriptor *)block->buffer)->kind == kBTHeaderNode) && - (((BTHeaderRec *)((char *)block->buffer + 14))->nodeSize != buf_count(bp)) && - (SWAP_BE16 (((BTHeaderRec *)((char *)block->buffer + 14))->nodeSize) != buf_count(bp))) { - - /* - * Don't swap the node descriptor, record offsets, or other records. - * This record will be invalidated and re-read with the correct node - * size once the B-tree control block is set up with the node size - * from the header record. - */ - retval = hfs_swap_BTNode (block, vp, kSwapBTNodeHeaderRecordOnly, allow_empty_node); - - } else { - /* - * In this case, we have enough data in-hand to do basic validation - * on the B-Tree node. - */ - if (block->blockReadFromDisk) { - /* - * The node was just read from disk, so always swap/check it. - * This is necessary on big endian since the test below won't trigger. - */ - retval = hfs_swap_BTNode (block, vp, kSwapBTNodeBigToHost, allow_empty_node); - } - else { - /* - * Block wasn't read from disk; it was found in the cache. - */ - if (*((u_int16_t *)((char *)block->buffer + (block->blockSize - sizeof (u_int16_t)))) == 0x0e00) { - /* - * The node was left in the cache in non-native order, so swap it. - * This only happens on little endian, after the node is written - * back to disk. - */ - retval = hfs_swap_BTNode (block, vp, kSwapBTNodeBigToHost, allow_empty_node); - } - else if (*((u_int16_t *)((char *)block->buffer + (block->blockSize - sizeof (u_int16_t)))) == 0x000e) { - /* - * The node was in-cache in native-endianness. We don't need to do - * anything here, because the node is ready to use. Set retval == 0. - */ - retval = 0; - } - /* - * If the node doesn't have hex 14 (0xe) in the last two bytes of the buffer, - * it doesn't necessarily mean that this is a bad node. Zeroed nodes that are - * marked as unused in the b-tree map node would be OK and not have valid content. 
- */ - } - } - - /* - * If we got an error, then the node is only partially swapped. - * We mark the buffer invalid so that the next attempt to get the - * node will read it and attempt to swap again, and will notice - * the error again. If we didn't do this, the next attempt to get - * the node might use the partially swapped node as-is. - */ - if (retval) - buf_markinvalid(bp); - } - } - - if (retval) { - if (bp) - buf_brelse(bp); - block->blockHeader = NULL; - block->buffer = NULL; - } - - return (retval); -} - - -void ModifyBlockStart(FileReference vp, BlockDescPtr blockPtr) -{ - struct hfsmount *hfsmp = VTOHFS(vp); - struct buf *bp = NULL; - - if (hfsmp->jnl == NULL) { - return; - } - - bp = (struct buf *) blockPtr->blockHeader; - if (bp == NULL) { - panic("hfs: ModifyBlockStart: null bp for blockdescptr %p?!?\n", blockPtr); - return; - } - - journal_modify_block_start(hfsmp->jnl, bp); - blockPtr->isModified = 1; -} - -void -btree_swap_node(struct buf *bp, __unused void *arg) -{ - // struct hfsmount *hfsmp = (struct hfsmount *)arg; - int retval; - struct vnode *vp = buf_vnode(bp); - BlockDescriptor block; - - /* Prepare the block pointer */ - block.blockHeader = bp; - block.buffer = (char *)buf_dataptr(bp); - block.blockNum = buf_lblkno(bp); - /* not found in cache ==> came from disk */ - block.blockReadFromDisk = (buf_fromcache(bp) == 0); - block.blockSize = buf_count(bp); - - /* Swap the data now that this node is ready to go to disk. - * We allow swapping of zeroed out nodes here because we might - * be writing node whose last record just got deleted. 
- */ - retval = hfs_swap_BTNode (&block, vp, kSwapBTNodeHostToBig, true); - if (retval) - panic("hfs: btree_swap_node: about to write corrupt node!\n"); -} - - -static int -btree_journal_modify_block_end(struct hfsmount *hfsmp, struct buf *bp) -{ - return journal_modify_block_end(hfsmp->jnl, bp, btree_swap_node, hfsmp); -} - - -OSStatus ReleaseBTreeBlock(FileReference vp, BlockDescPtr blockPtr, ReleaseBlockOptions options) -{ - struct hfsmount *hfsmp = VTOHFS(vp); - OSStatus retval = E_NONE; - struct buf *bp = NULL; - - bp = (struct buf *) blockPtr->blockHeader; - - if (bp == NULL) { - retval = -1; - goto exit; - } - - if (options & kTrashBlock) { - buf_markinvalid(bp); - - if (hfsmp->jnl && (buf_flags(bp) & B_LOCKED)) { - journal_kill_block(hfsmp->jnl, bp); - } else { - buf_brelse(bp); /* note: B-tree code will clear blockPtr->blockHeader and blockPtr->buffer */ - } - - /* Don't let anyone else try to use this bp, it's been consumed */ - blockPtr->blockHeader = NULL; - - } else { - if (options & kForceWriteBlock) { - if (hfsmp->jnl) { - if (blockPtr->isModified == 0) { - panic("hfs: releaseblock: modified is 0 but forcewrite set! bp %p\n", bp); - } - - retval = btree_journal_modify_block_end(hfsmp, bp); - blockPtr->isModified = 0; - } else { - retval = VNOP_BWRITE(bp); - } - - /* Don't let anyone else try to use this bp, it's been consumed */ - blockPtr->blockHeader = NULL; - - } else if (options & kMarkBlockDirty) { - struct timeval tv; - microuptime(&tv); - if ((options & kLockTransaction) && hfsmp->jnl == NULL) { - /* - * - * Set the B_LOCKED flag and unlock the buffer, causing buf_brelse to move - * the buffer onto the LOCKED free list. This is necessary, otherwise - * getnewbuf() would try to reclaim the buffers using buf_bawrite, which - * isn't going to work. - * - */ - /* Don't hog all the buffers... */ - if (count_lock_queue() > kMaxLockedMetaBuffers) { - hfs_btsync(vp, HFS_SYNCTRANS); - /* Rollback sync time to cause a sync on lock release... 
*/ - (void) BTSetLastSync(VTOF(vp), tv.tv_sec - (kMaxSecsForFsync + 1)); - } - buf_setflags(bp, B_LOCKED); - } - - /* - * Delay-write this block. - * If the maximum delayed buffers has been exceeded then - * free up some buffers and fall back to an asynchronous write. - */ - if (hfsmp->jnl) { - if (blockPtr->isModified == 0) { - panic("hfs: releaseblock: modified is 0 but markdirty set! bp %p\n", bp); - } - retval = btree_journal_modify_block_end(hfsmp, bp); - blockPtr->isModified = 0; - } else if (bdwrite_internal(bp, 1) != 0) { - hfs_btsync(vp, 0); - /* Rollback sync time to cause a sync on lock release... */ - (void) BTSetLastSync(VTOF(vp), tv.tv_sec - (kMaxSecsForFsync + 1)); - - buf_clearflags(bp, B_LOCKED); - buf_bawrite(bp); - } - - /* Don't let anyone else try to use this bp, it's been consumed */ - blockPtr->blockHeader = NULL; - - } else { - // check if we had previously called journal_modify_block_start() - // on this block and if so, abort it (which will call buf_brelse()). - if (hfsmp->jnl && blockPtr->isModified) { - // XXXdbg - I don't want to call modify_block_abort() - // because I think it may be screwing up the - // journal and blowing away a block that has - // valid data in it. 
- // - // journal_modify_block_abort(hfsmp->jnl, bp); - //panic("hfs: releaseblock called for 0x%x but mod_block_start previously called.\n", bp); - btree_journal_modify_block_end(hfsmp, bp); - blockPtr->isModified = 0; - } else { - buf_brelse(bp); /* note: B-tree code will clear blockPtr->blockHeader and blockPtr->buffer */ - } - - /* Don't let anyone else try to use this bp, it's been consumed */ - blockPtr->blockHeader = NULL; - } - } - -exit: - return (retval); -} - - -OSStatus ExtendBTreeFile(FileReference vp, FSSize minEOF, FSSize maxEOF) -{ -#pragma unused (maxEOF) - - OSStatus retval = 0, ret = 0; - int64_t actualBytesAdded, origSize; - u_int64_t bytesToAdd; - u_int32_t startAllocation; - u_int32_t fileblocks; - BTreeInfoRec btInfo; - ExtendedVCB *vcb; - FCB *filePtr; - struct proc *p = NULL; - int64_t trim = 0; - int lockflags = 0; - - filePtr = GetFileControlBlock(vp); - - if ( (off_t)minEOF > filePtr->fcbEOF ) - { - bytesToAdd = minEOF - filePtr->fcbEOF; - - if (bytesToAdd < filePtr->ff_clumpsize) - bytesToAdd = filePtr->ff_clumpsize; //XXX why not always be a mutiple of clump size? - } - else - { - return -1; - } - - vcb = VTOVCB(vp); - - /* - * The Extents B-tree can't have overflow extents. ExtendFileC will - * return an error if an attempt is made to extend the Extents B-tree - * when the resident extents are exhausted. - */ - - /* Protect allocation bitmap and extents overflow file. */ - lockflags = SFL_BITMAP; - if (VTOC(vp)->c_fileid != kHFSExtentsFileID) - lockflags |= SFL_EXTENTS; - lockflags = hfs_systemfile_lock(vcb, lockflags, HFS_EXCLUSIVE_LOCK); - - (void) BTGetInformation(filePtr, 0, &btInfo); - -#if 0 // XXXdbg - /* - * The b-tree code expects nodes to be contiguous. So when - * the allocation block size is less than the b-tree node - * size, we need to force disk allocations to be contiguous. 
- */ - if (vcb->blockSize >= btInfo.nodeSize) { - extendFlags = 0; - } else { - /* Ensure that all b-tree nodes are contiguous on disk */ - extendFlags = kEFContigMask; - } -#endif - - origSize = filePtr->fcbEOF; - fileblocks = filePtr->ff_blocks; - startAllocation = vcb->nextAllocation; - - // loop trying to get a contiguous chunk that's an integer multiple - // of the btree node size. if we can't get a contiguous chunk that - // is at least the node size then we break out of the loop and let - // the error propagate back up. - while((off_t)bytesToAdd >= btInfo.nodeSize) { - do { - retval = ExtendFileC(vcb, filePtr, bytesToAdd, 0, - kEFContigMask | kEFMetadataMask | kEFNoClumpMask, - (int64_t *)&actualBytesAdded); - if (retval == dskFulErr && actualBytesAdded == 0) { - bytesToAdd >>= 1; - if (bytesToAdd < btInfo.nodeSize) { - break; - } else if ((bytesToAdd % btInfo.nodeSize) != 0) { - // make sure it's an integer multiple of the nodeSize - bytesToAdd -= (bytesToAdd % btInfo.nodeSize); - } - } - } while (retval == dskFulErr && actualBytesAdded == 0); - - if (retval == dskFulErr && actualBytesAdded == 0 && bytesToAdd <= btInfo.nodeSize) { - break; - } - - filePtr->fcbEOF = (u_int64_t)filePtr->ff_blocks * (u_int64_t)vcb->blockSize; - bytesToAdd = minEOF - filePtr->fcbEOF; - } - - /* - * If a new extent was added then move the roving allocator - * reference forward by the current b-tree file size so - * there's plenty of room to grow. - */ - if ((retval == 0) && - ((VCBTOHFS(vcb)->hfs_flags & HFS_METADATA_ZONE) == 0) && - (vcb->nextAllocation > startAllocation) && - ((vcb->nextAllocation + fileblocks) < vcb->allocLimit)) { - HFS_UPDATE_NEXT_ALLOCATION(vcb, vcb->nextAllocation + fileblocks); - } - - filePtr->fcbEOF = (u_int64_t)filePtr->ff_blocks * (u_int64_t)vcb->blockSize; - - // XXXdbg ExtendFileC() could have returned an error even though - // it grew the file to be big enough for our needs. If this is - // the case, we don't care about retval so we blow it away. 
- // - if (filePtr->fcbEOF >= (off_t)minEOF && retval != 0) { - retval = 0; - } - - // XXXdbg if the file grew but isn't large enough or isn't an - // even multiple of the nodeSize then trim things back. if - // the file isn't large enough we trim back to the original - // size. otherwise we trim back to be an even multiple of the - // btree node size. - // - if ((filePtr->fcbEOF < (off_t)minEOF) || ((filePtr->fcbEOF - origSize) % btInfo.nodeSize) != 0) { - - if (filePtr->fcbEOF < (off_t)minEOF) { - retval = dskFulErr; - - if (filePtr->fcbEOF < origSize) { - panic("hfs: btree file eof %lld less than orig size %lld!\n", - filePtr->fcbEOF, origSize); - } - - trim = filePtr->fcbEOF - origSize; - } else { - trim = ((filePtr->fcbEOF - origSize) % btInfo.nodeSize); - } - - ret = TruncateFileC(vcb, filePtr, filePtr->fcbEOF - trim, 0, 0, FTOC(filePtr)->c_fileid, 0); - filePtr->fcbEOF = (u_int64_t)filePtr->ff_blocks * (u_int64_t)vcb->blockSize; - - // XXXdbg - panic if the file didn't get trimmed back properly - if ((filePtr->fcbEOF % btInfo.nodeSize) != 0) { - panic("hfs: truncate file didn't! fcbEOF %lld nsize %d fcb %p\n", - filePtr->fcbEOF, btInfo.nodeSize, filePtr); - } - - if (ret) { - // XXXdbg - this probably doesn't need to be a panic() - panic("hfs: error truncating btree files (sz 0x%llx, trim %lld, ret %ld)\n", - filePtr->fcbEOF, trim, (long)ret); - goto out; - } - } - - if(VTOC(vp)->c_fileid != kHFSExtentsFileID) { - /* - * Get any extents overflow b-tree changes to disk ASAP! 
- */ - (void) BTFlushPath(VTOF(vcb->extentsRefNum)); - (void) hfs_fsync(vcb->extentsRefNum, MNT_WAIT, 0, p); - } - hfs_systemfile_unlock(vcb, lockflags); - lockflags = 0; - - if ((filePtr->fcbEOF % btInfo.nodeSize) != 0) { - panic("hfs: extendbtree: fcb %p has eof 0x%llx not a multiple of 0x%x (trim %llx)\n", - filePtr, filePtr->fcbEOF, btInfo.nodeSize, trim); - } - - /* - * Update the Alternate MDB or Alternate VolumeHeader - */ - VTOC(vp)->c_flag |= C_MODIFIED; - if ((VTOC(vp)->c_fileid == kHFSExtentsFileID) || - (VTOC(vp)->c_fileid == kHFSCatalogFileID) || - (VTOC(vp)->c_fileid == kHFSAttributesFileID) - ) { - MarkVCBDirty( vcb ); - ret = hfs_flushvolumeheader(VCBTOHFS(vcb), HFS_FVH_WAIT | HFS_FVH_WRITE_ALT); - } else { - VTOC(vp)->c_touch_chgtime = TRUE; - VTOC(vp)->c_touch_modtime = TRUE; - (void) hfs_update(vp, 0); - } - - ret = ClearBTNodes(vp, btInfo.nodeSize, origSize, (filePtr->fcbEOF - origSize)); -out: - if (retval == 0) - retval = ret; - - if (lockflags) - hfs_systemfile_unlock(vcb, lockflags); - - return retval; -} - - -/* - * Clear out (zero) new b-tree nodes on disk. 
- */ -static int -ClearBTNodes(struct vnode *vp, int blksize, off_t offset, off_t amount) -{ - struct hfsmount *hfsmp = VTOHFS(vp); - struct buf *bp = NULL; - daddr64_t blk; - daddr64_t blkcnt; - - blk = offset / blksize; - blkcnt = amount / blksize; - - while (blkcnt > 0) { - bp = buf_getblk(vp, blk, blksize, 0, 0, BLK_META); - if (bp == NULL) - continue; - - // XXXdbg - if (hfsmp->jnl) { - // XXXdbg -- skipping this for now since it makes a transaction - // become *way* too large - //journal_modify_block_start(hfsmp->jnl, bp); - } - bzero((char *)buf_dataptr(bp), blksize); - - buf_markaged(bp); - - // XXXdbg - if (hfsmp->jnl) { - // XXXdbg -- skipping this for now since it makes a transaction - // become *way* too large - //journal_modify_block_end(hfsmp->jnl, bp); - - // XXXdbg - remove this once we decide what to do with the - // writes to the journal - if ((blk % 32) == 0) - VNOP_BWRITE(bp); - else - buf_bawrite(bp); - } else { - /* wait/yield every 32 blocks so we don't hog all the buffers */ - if ((blk % 32) == 0) - VNOP_BWRITE(bp); - else - buf_bawrite(bp); - } - --blkcnt; - ++blk; - } - - return (0); -} - - -extern char hfs_attrname[]; - -/* - * Create an HFS+ Attribute B-tree File. - * - * No global resources should be held. - */ -int -hfs_create_attr_btree(struct hfsmount *hfsmp, u_int32_t nodesize, u_int32_t nodecnt) -{ - struct vnode* vp = NULLVP; - struct cat_desc cndesc; - struct cat_attr cnattr; - struct cat_fork cfork; - BlockDescriptor blkdesc; - BTNodeDescriptor *ndp; - BTHeaderRec *bthp; - BTreeControlBlockPtr btcb = NULL; - struct buf *bp = NULL; - void * buffer; - u_int8_t *bitmap; - u_int16_t *index; - u_int32_t node_num, num_map_nodes; - u_int32_t bytes_per_map_record; - u_int32_t temp; - u_int16_t offset; - int intrans = 0; - int result; - int newvnode_flags = 0; - -again: - /* - * Serialize creation using HFS_CREATING_BTREE flag. 
- */ - hfs_lock_mount (hfsmp); - if (hfsmp->hfs_flags & HFS_CREATING_BTREE) { - /* Someone else beat us, wait for them to finish. */ - (void) msleep(&hfsmp->hfs_attribute_cp, &hfsmp->hfs_mutex, - PDROP | PINOD, "hfs_create_attr_btree", 0); - if (hfsmp->hfs_attribute_vp) { - return (0); - } - goto again; - } - hfsmp->hfs_flags |= HFS_CREATING_BTREE; - hfs_unlock_mount (hfsmp); - - /* Check if were out of usable disk space. */ - if ((hfs_freeblks(hfsmp, 1) == 0)) { - result = ENOSPC; - goto exit; - } - - /* - * Set up Attribute B-tree vnode - * (this must be done before we start a transaction - * or take any system file locks) - */ - bzero(&cndesc, sizeof(cndesc)); - cndesc.cd_parentcnid = kHFSRootParentID; - cndesc.cd_flags |= CD_ISMETA; - cndesc.cd_nameptr = (const u_int8_t *)hfs_attrname; - cndesc.cd_namelen = strlen(hfs_attrname); - cndesc.cd_cnid = kHFSAttributesFileID; - - bzero(&cnattr, sizeof(cnattr)); - cnattr.ca_linkcount = 1; - cnattr.ca_mode = S_IFREG; - cnattr.ca_fileid = cndesc.cd_cnid; - - bzero(&cfork, sizeof(cfork)); - cfork.cf_clump = nodesize * nodecnt; - - result = hfs_getnewvnode(hfsmp, NULL, NULL, &cndesc, 0, &cnattr, - &cfork, &vp, &newvnode_flags); - if (result) { - goto exit; - } - /* - * Set up Attribute B-tree control block - */ - MALLOC(btcb, BTreeControlBlock *, sizeof(BTreeControlBlock), M_TEMP, M_WAITOK); - bzero(btcb, sizeof(BTreeControlBlock)); - - btcb->nodeSize = nodesize; - btcb->maxKeyLength = kHFSPlusAttrKeyMaximumLength; - btcb->btreeType = 0xFF; - btcb->attributes = kBTVariableIndexKeysMask | kBTBigKeysMask; - btcb->version = kBTreeVersion; - btcb->writeCount = 1; - btcb->flags = 0; /* kBTHeaderDirty */ - btcb->fileRefNum = vp; - btcb->getBlockProc = GetBTreeBlock; - btcb->releaseBlockProc = ReleaseBTreeBlock; - btcb->setEndOfForkProc = ExtendBTreeFile; - btcb->keyCompareProc = (KeyCompareProcPtr)hfs_attrkeycompare; - VTOF(vp)->fcbBTCBPtr = btcb; - - /* - * Allocate some space - */ - if (hfs_start_transaction(hfsmp) != 0) { - 
result = EINVAL; - goto exit; - } - intrans = 1; - - /* Note ExtendBTreeFile will acquire the necessary system file locks. */ - result = ExtendBTreeFile(vp, nodesize, cfork.cf_clump); - if (result) - goto exit; - - btcb->totalNodes = VTOF(vp)->ff_size / nodesize; - - /* - * Figure out how many map nodes we'll need. - * - * bytes_per_map_record = the number of bytes in the map record of a - * map node. Since that is the only record in the node, it is the size - * of the node minus the node descriptor at the start, and two record - * offsets at the end of the node. The "- 2" is to round the size down - * to a multiple of 4 bytes (since sizeof(BTNodeDescriptor) is not a - * multiple of 4). - * - * The value "temp" here is the number of *bits* in the map record of - * the header node. - */ - bytes_per_map_record = nodesize - sizeof(BTNodeDescriptor) - 2*sizeof(u_int16_t) - 2; - temp = 8 * (nodesize - sizeof(BTNodeDescriptor) - - sizeof(BTHeaderRec) - - kBTreeHeaderUserBytes - - 4 * sizeof(u_int16_t)); - if (btcb->totalNodes > temp) { - num_map_nodes = howmany(btcb->totalNodes - temp, bytes_per_map_record * 8); - } - else { - num_map_nodes = 0; - } - - btcb->freeNodes = btcb->totalNodes - 1 - num_map_nodes; - - /* - * Initialize the b-tree header on disk - */ - bp = buf_getblk(vp, 0, nodesize, 0, 0, BLK_META); - if (bp == NULL) { - result = EIO; - goto exit; - } - - buffer = (void *)buf_dataptr(bp); - blkdesc.buffer = buffer; - blkdesc.blockHeader = (void *)bp; - blkdesc.blockReadFromDisk = 0; - blkdesc.isModified = 0; - - ModifyBlockStart(vp, &blkdesc); - - if (buf_size(bp) != nodesize) - panic("hfs_create_attr_btree: bad buffer size (%d)\n", buf_size(bp)); - - bzero(buffer, nodesize); - index = (u_int16_t *)buffer; - - /* FILL IN THE NODE DESCRIPTOR: */ - ndp = (BTNodeDescriptor *)buffer; - if (num_map_nodes != 0) - ndp->fLink = 1; - ndp->kind = kBTHeaderNode; - ndp->numRecords = 3; - offset = sizeof(BTNodeDescriptor); - index[(nodesize / 2) - 1] = offset; - - /* FILL 
IN THE HEADER RECORD: */ - bthp = (BTHeaderRec *)((u_int8_t *)buffer + offset); - bthp->nodeSize = nodesize; - bthp->totalNodes = btcb->totalNodes; - bthp->freeNodes = btcb->freeNodes; - bthp->clumpSize = cfork.cf_clump; - bthp->btreeType = 0xFF; - bthp->attributes = kBTVariableIndexKeysMask | kBTBigKeysMask; - bthp->maxKeyLength = kHFSPlusAttrKeyMaximumLength; - bthp->keyCompareType = kHFSBinaryCompare; - offset += sizeof(BTHeaderRec); - index[(nodesize / 2) - 2] = offset; - - /* FILL IN THE USER RECORD: */ - offset += kBTreeHeaderUserBytes; - index[(nodesize / 2) - 3] = offset; - - /* Mark the header node and map nodes in use in the map record. - * - * NOTE: Assumes that the header node's map record has at least - * (num_map_nodes + 1) bits. - */ - bitmap = (u_int8_t *) buffer + offset; - temp = num_map_nodes + 1; /* +1 for the header node */ - while (temp >= 8) { - *(bitmap++) = 0xFF; - temp -= 8; - } - *bitmap = ~(0xFF >> temp); - - offset += nodesize - sizeof(BTNodeDescriptor) - sizeof(BTHeaderRec) - - kBTreeHeaderUserBytes - (4 * sizeof(int16_t)); - index[(nodesize / 2) - 4] = offset; - - if (hfsmp->jnl) { - result = btree_journal_modify_block_end(hfsmp, bp); - } else { - result = VNOP_BWRITE(bp); - } - if (result) - goto exit; - - /* Create the map nodes: node numbers 1 .. 
num_map_nodes */ - for (node_num=1; node_num <= num_map_nodes; ++node_num) { - bp = buf_getblk(vp, node_num, nodesize, 0, 0, BLK_META); - if (bp == NULL) { - result = EIO; - goto exit; - } - buffer = (void *)buf_dataptr(bp); - blkdesc.buffer = buffer; - blkdesc.blockHeader = (void *)bp; - blkdesc.blockReadFromDisk = 0; - blkdesc.isModified = 0; - - ModifyBlockStart(vp, &blkdesc); - - bzero(buffer, nodesize); - index = (u_int16_t *)buffer; - - /* Fill in the node descriptor */ - ndp = (BTNodeDescriptor *)buffer; - if (node_num != num_map_nodes) - ndp->fLink = node_num + 1; - ndp->kind = kBTMapNode; - ndp->numRecords = 1; - offset = sizeof(BTNodeDescriptor); - index[(nodesize / 2) - 1] = offset; - - - /* Fill in the map record's offset */ - /* Note: We assume that the map record is all zeroes */ - offset = sizeof(BTNodeDescriptor) + bytes_per_map_record; - index[(nodesize / 2) - 2] = offset; - - if (hfsmp->jnl) { - result = btree_journal_modify_block_end(hfsmp, bp); - } else { - result = VNOP_BWRITE(bp); - } - if (result) - goto exit; - } - - /* Update vp/cp for attribute btree */ - hfs_lock_mount (hfsmp); - hfsmp->hfs_attribute_cp = VTOC(vp); - hfsmp->hfs_attribute_vp = vp; - hfs_unlock_mount (hfsmp); - - (void) hfs_flushvolumeheader(hfsmp, HFS_FVH_WAIT | HFS_FVH_WRITE_ALT); - - if (intrans) { - hfs_end_transaction(hfsmp); - intrans = 0; - } - - /* Initialize the vnode for virtual attribute data file */ - result = init_attrdata_vnode(hfsmp); - if (result) { - printf("hfs_create_attr_btree: vol=%s init_attrdata_vnode() error=%d\n", hfsmp->vcbVN, result); - } - -exit: - if (vp) { - hfs_unlock(VTOC(vp)); - } - if (result) { - if (btcb) { - FREE (btcb, M_TEMP); - } - if (vp) { - vnode_put(vp); - } - /* XXX need to give back blocks ? */ - } - if (intrans) { - hfs_end_transaction(hfsmp); - } - - /* - * All done, clear HFS_CREATING_BTREE, and wake up any sleepers. 
- */ - hfs_lock_mount (hfsmp); - hfsmp->hfs_flags &= ~HFS_CREATING_BTREE; - wakeup((caddr_t)&hfsmp->hfs_attribute_cp); - hfs_unlock_mount (hfsmp); - - return (result); -} - diff --git a/bsd/hfs/hfs_btreeio.h b/bsd/hfs/hfs_btreeio.h deleted file mode 100644 index ebb81e1af..000000000 --- a/bsd/hfs/hfs_btreeio.h +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright (c) 2005-2011 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -#ifndef _HFS_BTREEIO_H_ -#define _HFS_BTREEIO_H_ - -#include - -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE - -#include "hfs.h" -#include "hfscommon/headers/BTreesInternal.h" - -/* BTree accessor routines */ -extern OSStatus SetBTreeBlockSize(FileReference vp, ByteCount blockSize, - ItemCount minBlockCount); - -extern OSStatus GetBTreeBlock(FileReference vp, u_int32_t blockNum, - GetBlockOptions options, BlockDescriptor *block); - -extern OSStatus ReleaseBTreeBlock(FileReference vp, BlockDescPtr blockPtr, - ReleaseBlockOptions options); - -extern OSStatus ExtendBTreeFile(FileReference vp, FSSize minEOF, FSSize maxEOF); - -extern void ModifyBlockStart(FileReference vp, BlockDescPtr blockPtr); - -int hfs_create_attr_btree(struct hfsmount *hfsmp, u_int32_t nodesize, u_int32_t nodecnt); - -u_int16_t get_btree_nodesize(struct vnode *vp); - -#endif /* __APPLE_API_PRIVATE */ -#endif /* KERNEL */ -#endif /* ! _HFS_BTREEIO_H_ */ diff --git a/bsd/hfs/hfs_catalog.c b/bsd/hfs/hfs_catalog.c deleted file mode 100644 index 0069ac2d4..000000000 --- a/bsd/hfs/hfs_catalog.c +++ /dev/null @@ -1,4838 +0,0 @@ -/* - * Copyright (c) 2000-2014 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "hfs.h" -#include "hfs_catalog.h" -#include "hfs_format.h" -#include "hfs_endian.h" - -#include "hfscommon/headers/BTreesInternal.h" -#include "hfscommon/headers/BTreesPrivate.h" -#include "hfscommon/headers/HFSUnicodeWrappers.h" - - -/* - * Initialization of an FSBufferDescriptor structure. - */ -#define BDINIT(bd, addr) { \ - (bd).bufferAddress = (addr); \ - (bd).itemSize = sizeof(*(addr)); \ - (bd).itemCount = 1; \ -} - - -struct btobj { - BTreeIterator iterator; - HFSPlusCatalogKey key; - CatalogRecord data; -}; - -struct update_state { - struct cat_desc * s_desc; - struct cat_attr * s_attr; - const struct cat_fork * s_datafork; - const struct cat_fork * s_rsrcfork; - struct hfsmount * s_hfsmp; -}; - -struct position_state { - int error; - u_int32_t count; - u_int32_t index; - u_int32_t parentID; - struct hfsmount *hfsmp; -}; - -/* Map file mode type to directory entry types */ -u_char modetodirtype[16] = { - DT_REG, DT_FIFO, DT_CHR, DT_UNKNOWN, - DT_DIR, DT_UNKNOWN, DT_BLK, DT_UNKNOWN, - DT_REG, DT_UNKNOWN, DT_LNK, DT_UNKNOWN, - DT_SOCK, DT_UNKNOWN, DT_WHT, DT_UNKNOWN -}; -#define MODE_TO_DT(mode) (modetodirtype[((mode) & S_IFMT) >> 12]) - - -#define HFS_LOOKUP_SYSFILE 0x1 /* If set, allow lookup of system files */ -#define HFS_LOOKUP_HARDLINK 0x2 /* If set, allow lookup of hard link records and not resolve the hard links */ 
-#define HFS_LOOKUP_CASESENSITIVE 0x4 /* If set, verify results of a file/directory record match input case */ -static int cat_lookupbykey(struct hfsmount *hfsmp, CatalogKey *keyp, int flags, u_int32_t hint, int wantrsrc, - struct cat_desc *descp, struct cat_attr *attrp, struct cat_fork *forkp, cnid_t *desc_cnid); - -int cat_lookupmangled(struct hfsmount *hfsmp, struct cat_desc *descp, int wantrsrc, - struct cat_desc *outdescp, struct cat_attr *attrp, struct cat_fork *forkp); - -/* Internal catalog support routines */ - -static int cat_findposition(const CatalogKey *ckp, const CatalogRecord *crp, - struct position_state *state); - -static int resolvelinkid(struct hfsmount *hfsmp, u_int32_t linkref, ino_t *ino); - -static int getkey(struct hfsmount *hfsmp, cnid_t cnid, CatalogKey * key); - -static int buildkey(struct hfsmount *hfsmp, struct cat_desc *descp, - HFSPlusCatalogKey *key, int retry); - -static void buildthreadkey(HFSCatalogNodeID parentID, int std_hfs, CatalogKey *key); - -static void buildrecord(struct cat_attr *attrp, cnid_t cnid, int std_hfs, u_int32_t encoding, CatalogRecord *crp, u_int32_t *recordSize); - -static int catrec_update(const CatalogKey *ckp, CatalogRecord *crp, struct update_state *state); - -static int builddesc(const HFSPlusCatalogKey *key, cnid_t cnid, u_int32_t hint, u_int32_t encoding, - int isdir, struct cat_desc *descp); - -static void getbsdattr(struct hfsmount *hfsmp, const struct HFSPlusCatalogFile *crp, struct cat_attr * attrp); - -#if CONFIG_HFS_STD -static void promotekey(struct hfsmount *hfsmp, const HFSCatalogKey *hfskey, HFSPlusCatalogKey *keyp, u_int32_t *encoding); -static void promotefork(struct hfsmount *hfsmp, const struct HFSCatalogFile *file, int resource, struct cat_fork * forkp); -static void promoteattr(struct hfsmount *hfsmp, const CatalogRecord *dataPtr, struct HFSPlusCatalogFile *crp); -#endif - -static cnid_t getcnid(const CatalogRecord *crp); -static u_int32_t getencoding(const CatalogRecord *crp); -static 
cnid_t getparentcnid(const CatalogRecord *recp); - -static int isadir(const CatalogRecord *crp); - -static int buildthread(void *keyp, void *recp, int std_hfs, int directory); - -static int cat_makealias(struct hfsmount *hfsmp, u_int32_t inode_num, struct HFSPlusCatalogFile *crp); - -static int cat_update_internal(struct hfsmount *hfsmp, int update_hardlink, struct cat_desc *descp, struct cat_attr *attrp, - const struct cat_fork *dataforkp, const struct cat_fork *rsrcforkp); - - - -/* HFS ID Hashtable Functions */ -#define IDHASH(hfsmp, inum) (&hfsmp->hfs_idhashtbl[(inum) & hfsmp->hfs_idhash]) - -/* Initialize the HFS ID hash table */ -void -hfs_idhash_init (struct hfsmount *hfsmp) { - /* secured by catalog lock so no lock init needed */ - hfsmp->hfs_idhashtbl = hashinit(HFS_IDHASH_DEFAULT, M_HFSMNT, &hfsmp->hfs_idhash); -} - -/* Free the HFS ID hash table */ -void -hfs_idhash_destroy (struct hfsmount *hfsmp) { - /* during failed mounts & unmounts */ - FREE(hfsmp->hfs_idhashtbl, M_HFSMNT); -} - -/* -from hfs_catalog.h: -typedef struct cat_preflightid { - cnid_t fileid; - LIST_ENTRY(cat_preflightid) id_hash; -} cat_preflightid_t; - -from hfs.h: - u_long hfs_idhash; / size of cnid/fileid hash table -1 / - LIST_HEAD(idhashhead, cat_preflightid) *hfs_idhashtbl; / base of ID hash / -*/ - -/* - * Check the run-time ID hashtable. - * - * The catalog lock must be held (like other functions in this file). - * - * Returns: - * 1 if the ID is in the hash table. 
- * 0 if the ID is not in the hash table - */ -int cat_check_idhash (struct hfsmount *hfsmp, cnid_t test_fileid) { - - cat_preflightid_t *preflight; - int found = 0; - - for (preflight = IDHASH(hfsmp, test_fileid)->lh_first; preflight ; preflight = preflight->id_hash.le_next) { - if (preflight->fileid == test_fileid) { - found = 1; - break; - } - } - - return found; -} - -/* Insert the supplied preflight into the ID hash table */ -int cat_insert_idhash (struct hfsmount *hfsmp, cat_preflightid_t *preflight) { - - if (preflight) { - LIST_INSERT_HEAD(IDHASH(hfsmp, (preflight->fileid)), preflight, id_hash); - return 0; - } - return -1; -} - - -/* Remove the data structure with the specified ID from the hashtable */ -int cat_remove_idhash (cat_preflightid_t *preflight) { - - if ((preflight) && ((preflight->id_hash.le_next || preflight->id_hash.le_prev))) { - LIST_REMOVE (preflight, id_hash); - preflight->id_hash.le_next = NULL; - preflight->id_hash.le_prev = NULL; - - return 0; - } - - return -1; -} - -/* - * Acquire a new CNID for use. - * - * This is slightly more complicated than just pulling the value from the - * hfsmount data structure. We need to validate that the ID is not in-use - * even if we've not wrapped around and that there are not any lingering - * or orphaned fileIDs for this ID. - * - * Also validate that there are not any pending insertions into the - * catalog by checking the ID hash table. - */ -int -cat_acquire_cnid (struct hfsmount *hfsmp, cnid_t *new_cnid) -{ - uint32_t nextCNID; - struct BTreeIterator *iterator; - FSBufferDescriptor btdata; - uint16_t datasize; - CatalogRecord *recp; - int result = 0; - int std_hfs; - int wrapped = 0; - - std_hfs = (HFSTOVCB(hfsmp)->vcbSigWord == kHFSSigWord); - /* - * Get the next CNID. We can change it since we hold the catalog lock. 
- */ -nextid: - nextCNID = hfsmp->vcbNxtCNID; - if (nextCNID == 0xFFFFFFFF) { - if (std_hfs) { - return (ENOSPC); - } else { - wrapped++; - if (wrapped > 1) { - /* don't allow more than one wrap-around */ - return ENOSPC; - } - hfs_lock_mount (hfsmp); - hfsmp->vcbNxtCNID = kHFSFirstUserCatalogNodeID; - hfsmp->vcbAtrb |= kHFSCatalogNodeIDsReusedMask; - hfs_unlock_mount (hfsmp); - } - } else { - hfsmp->vcbNxtCNID++; - } - hfs_note_header_minor_change(hfsmp); - - /* First check that there are not any entries pending in the hash table with this ID */ - if (cat_check_idhash (hfsmp, nextCNID)) { - /* Someone wants to insert this into the catalog but hasn't done so yet. Skip it */ - goto nextid; - } - - /* Check to see if a thread record exists for the target ID we just got */ - MALLOC(iterator, BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK); - bzero(iterator, sizeof(*iterator)); - buildthreadkey(nextCNID, std_hfs, (CatalogKey *)&iterator->key); - - MALLOC(recp, CatalogRecord *, sizeof(CatalogRecord), M_TEMP, M_WAITOK); - BDINIT(btdata, recp); - - result = BTSearchRecord(hfsmp->hfs_catalog_cp->c_datafork, iterator, &btdata, &datasize, iterator); - FREE (recp, M_TEMP); - FREE (iterator, M_TEMP); - - if (result == btNotFound) { - /* Good. File ID was not in use. Move on to checking EA B-Tree */ - result = file_attribute_exist (hfsmp, nextCNID); - if (result == EEXIST) { - /* This CNID has orphaned EAs. Skip it and move on to the next one */ - result = 0; - goto nextid; - } - if (result) { - /* For any other error, return the result */ - return result; - } - - /* - * Now validate that there are no lingering cnodes with this ID. If a cnode - * has been removed on-disk (marked C_NOEXISTS), but has not yet been reclaimed, - * then it will still have an entry in the cnode hash table. This means that - * a subsequent lookup will find THAT entry and believe this one has been deleted - * prematurely. If there is a lingering cnode, then just skip this entry and move on. 
- * - * Note that we pass (existence_only == 1) argument to hfs_chash_snoop. - */ - if (!std_hfs && (hfsmp->vcbAtrb & kHFSCatalogNodeIDsReusedMask)) { - if (hfs_chash_snoop (hfsmp, nextCNID, 1, NULL, NULL) == 0) { - goto nextid; - } - } - - /* - * If we get here, then we didn't see any thread records, orphaned EAs, - * or stale cnodes. This ID is safe to vend out. - */ - *new_cnid = nextCNID; - } - else if (result == noErr) { - /* move on to the next ID */ - goto nextid; - } - else { - /* For any other situation, just bail out */ - return EIO; - } - - return 0; - -} - -int -cat_preflight(struct hfsmount *hfsmp, catops_t ops, cat_cookie_t *cookie, __unused proc_t p) -{ - int lockflags = 0; - int result; - - if (hfsmp->hfs_catalog_cp->c_lockowner != current_thread()) - lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK); - - result = BTReserveSpace(hfsmp->hfs_catalog_cp->c_datafork, ops, (void*)cookie); - - if (lockflags) - hfs_systemfile_unlock(hfsmp, lockflags); - - return MacToVFSError(result); -} - -void -cat_postflight(struct hfsmount *hfsmp, cat_cookie_t *cookie, __unused proc_t p) -{ - int lockflags = 0; - - if (hfsmp->hfs_catalog_cp->c_lockowner != current_thread()) - lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK); - - (void) BTReleaseReserve(hfsmp->hfs_catalog_cp->c_datafork, (void*)cookie); - - if (lockflags) - hfs_systemfile_unlock(hfsmp, lockflags); -} - -__private_extern__ -void -cat_convertattr( - struct hfsmount *hfsmp, - CatalogRecord * recp, - struct cat_attr *attrp, - struct cat_fork *datafp, - struct cat_fork *rsrcfp) -{ - int std_hfs = HFSTOVCB(hfsmp)->vcbSigWord == kHFSSigWord; - - if (std_hfs == 0) { - getbsdattr(hfsmp, (struct HFSPlusCatalogFile *)recp, attrp); - } -#if CONFIG_HFS_STD - else { - struct HFSPlusCatalogFile cnoderec; - - promoteattr(hfsmp, recp, &cnoderec); - getbsdattr(hfsmp, &cnoderec, attrp); - } -#endif - - if (isadir(recp)) { - bzero(datafp, sizeof(*datafp)); - } -#if 
CONFIG_HFS_STD - else if (std_hfs) { - promotefork(hfsmp, (HFSCatalogFile *)&recp->hfsFile, 0, datafp); - promotefork(hfsmp, (HFSCatalogFile *)&recp->hfsFile, 1, rsrcfp); - } -#endif - else { - /* Convert the data fork. */ - datafp->cf_size = recp->hfsPlusFile.dataFork.logicalSize; - datafp->cf_new_size = 0; - datafp->cf_blocks = recp->hfsPlusFile.dataFork.totalBlocks; - if ((hfsmp->hfc_stage == HFC_RECORDING) && - (attrp->ca_atime >= hfsmp->hfc_timebase)) { - datafp->cf_bytesread = - recp->hfsPlusFile.dataFork.clumpSize * - HFSTOVCB(hfsmp)->blockSize; - } else { - datafp->cf_bytesread = 0; - } - datafp->cf_vblocks = 0; - bcopy(&recp->hfsPlusFile.dataFork.extents[0], - &datafp->cf_extents[0], sizeof(HFSPlusExtentRecord)); - - /* Convert the resource fork. */ - rsrcfp->cf_size = recp->hfsPlusFile.resourceFork.logicalSize; - rsrcfp->cf_new_size = 0; - rsrcfp->cf_blocks = recp->hfsPlusFile.resourceFork.totalBlocks; - if ((hfsmp->hfc_stage == HFC_RECORDING) && - (attrp->ca_atime >= hfsmp->hfc_timebase)) { - datafp->cf_bytesread = - recp->hfsPlusFile.resourceFork.clumpSize * - HFSTOVCB(hfsmp)->blockSize; - } else { - datafp->cf_bytesread = 0; - } - rsrcfp->cf_vblocks = 0; - bcopy(&recp->hfsPlusFile.resourceFork.extents[0], - &rsrcfp->cf_extents[0], sizeof(HFSPlusExtentRecord)); - } -} - -/* - * Convert a raw catalog key and record into an in-core catalog descriptor. - * - * Note: The caller is responsible for releasing the catalog descriptor. 
- */ -__private_extern__ -int -cat_convertkey( - struct hfsmount *hfsmp, - CatalogKey *key, - CatalogRecord * recp, - struct cat_desc *descp) -{ - int std_hfs = HFSTOVCB(hfsmp)->vcbSigWord == kHFSSigWord; - HFSPlusCatalogKey * pluskey = NULL; - u_int32_t encoding; - cnid_t cnid = 0; - int err = 0; - - if (std_hfs == 0) { - pluskey = (HFSPlusCatalogKey *)key; - encoding = getencoding(recp); - } -#if CONFIG_HFS_STD - else { - MALLOC(pluskey, HFSPlusCatalogKey *, sizeof(HFSPlusCatalogKey), M_TEMP, M_WAITOK); - promotekey(hfsmp, (HFSCatalogKey *)key, pluskey, &encoding); - } -#endif - - /* Get the CNID before calling builddesc. Need to error check it. */ - cnid = getcnid(recp); - if (cnid == 0) { - /* If ths CNID == 0, it's invalid. Mark as corrupt */ - hfs_mark_inconsistent (hfsmp, HFS_INCONSISTENCY_DETECTED); - err = EINVAL; - } - else { - builddesc(pluskey, cnid, 0, encoding, isadir(recp), descp); - } - -#if CONFIG_HFS_STD - if (std_hfs) { - FREE(pluskey, M_TEMP); - } -#endif - - return err; -} - - -/* - * cat_releasedesc - */ -__private_extern__ -void -cat_releasedesc(struct cat_desc *descp) -{ - const u_int8_t * name; - - if (descp == NULL) - return; - - if ((descp->cd_flags & CD_HASBUF) && - (descp->cd_nameptr != NULL)) { - name = descp->cd_nameptr; - descp->cd_nameptr = NULL; - descp->cd_namelen = 0; - vfs_removename((const char *)name); - } - descp->cd_nameptr = NULL; - descp->cd_namelen = 0; - descp->cd_flags &= ~CD_HASBUF; -} - -/* - * These Catalog functions allow access to the HFS Catalog (database). - * The catalog b-tree lock must be acquired before calling any of these routines. - */ - -/* - * cat_lookup - lookup a catalog node using a cnode descriptor - * - * Note: The caller is responsible for releasing the output - * catalog descriptor (when supplied outdescp is non-null). 
- */ -int -cat_lookup(struct hfsmount *hfsmp, struct cat_desc *descp, int wantrsrc, int force_casesensitive_lookup, - struct cat_desc *outdescp, struct cat_attr *attrp, - struct cat_fork *forkp, cnid_t *desc_cnid) -{ - CatalogKey * keyp; - int std_hfs; - int result; - int flags; - - std_hfs = (HFSTOVCB(hfsmp)->vcbSigWord == kHFSSigWord); - flags = force_casesensitive_lookup ? HFS_LOOKUP_CASESENSITIVE : 0; - - MALLOC(keyp, CatalogKey *, sizeof(CatalogKey), M_TEMP, M_WAITOK); - - result = buildkey(hfsmp, descp, (HFSPlusCatalogKey *)keyp, 1); - if (result) - goto exit; - - result = cat_lookupbykey(hfsmp, keyp, flags, descp->cd_hint, wantrsrc, outdescp, attrp, forkp, desc_cnid); - - if (result == ENOENT) { - if (!std_hfs) { - struct cat_desc temp_desc; - if (outdescp == NULL) { - bzero(&temp_desc, sizeof(temp_desc)); - outdescp = &temp_desc; - } - result = cat_lookupmangled(hfsmp, descp, wantrsrc, outdescp, attrp, forkp); - if (desc_cnid) { - *desc_cnid = outdescp->cd_cnid; - } - if (outdescp == &temp_desc) { - /* Release the local copy of desc */ - cat_releasedesc(outdescp); - } - } else if (hfsmp->hfs_encoding != kTextEncodingMacRoman) { - // make MacRoman key from utf-8 - // result = cat_lookupbykey(hfsmp, keyp, descp->cd_hint, attrp, forkp); - // update desc text encoding so that other catalog ops succeed - } - } -exit: - FREE(keyp, M_TEMP); - - return (result); -} - -int -cat_insertfilethread(struct hfsmount *hfsmp, struct cat_desc *descp) -{ - struct BTreeIterator *iterator; - struct FSBufferDescriptor file_data; - struct HFSCatalogFile file_rec; - u_int16_t datasize; - FCB *fcb; - int result; - - if (HFSTOVCB(hfsmp)->vcbSigWord != kHFSSigWord) - return (EINVAL); - - fcb = GetFileControlBlock(HFSTOVCB(hfsmp)->catalogRefNum); - - MALLOC(iterator, BTreeIterator *, 2 * sizeof(*iterator), M_TEMP, M_WAITOK); - bzero(&iterator[0], 2* sizeof(*iterator)); - result = buildkey(hfsmp, descp, (HFSPlusCatalogKey *)&iterator[0].key, 0); - if (result) - goto exit; - - 
BDINIT(file_data, &file_rec); - result = BTSearchRecord(fcb, &iterator[0], &file_data, &datasize, &iterator[0]); - if (result) - goto exit; - - if (file_rec.recordType != kHFSFileRecord) { - result = EISDIR; - goto exit; - } - - if ((file_rec.flags & kHFSThreadExistsMask) == 0) { - struct FSBufferDescriptor thread_data; - struct HFSCatalogThread thread_rec; - - file_rec.flags |= kHFSThreadExistsMask; - BDINIT(thread_data, &thread_rec); - thread_data.itemSize = buildthread(&iterator[0].key, &thread_rec, 1, 0); - buildthreadkey(file_rec.fileID, 1, (CatalogKey *)&iterator[1].key); - - result = BTInsertRecord(fcb, &iterator[1], &thread_data, thread_data.itemSize); - if (result) - goto exit; - - (void) BTReplaceRecord(fcb, &iterator[0], &file_data, datasize); - (void) BTFlushPath(fcb); - } -exit: - (void) BTFlushPath(fcb); - FREE(iterator, M_TEMP); - - return MacToVFSError(result); -} - - -/* - * cat_findname - obtain a descriptor from cnid - * - * Only a thread lookup is performed. - * - * Note: The caller is responsible for releasing the output - * catalog descriptor (when supplied outdescp is non-null). - - */ -int -cat_findname(struct hfsmount *hfsmp, cnid_t cnid, struct cat_desc *outdescp) -{ - struct BTreeIterator * iterator; - FSBufferDescriptor btdata; - CatalogKey * keyp; - CatalogRecord * recp; - int isdir; - int result; - int std_hfs; - - isdir = 0; - std_hfs = (hfsmp->hfs_flags & HFS_STANDARD); - - MALLOC(iterator, BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK); - buildthreadkey(cnid, std_hfs, (CatalogKey *)&iterator->key); - iterator->hint.nodeNum = 0; - - MALLOC(recp, CatalogRecord *, sizeof(CatalogRecord), M_TEMP, M_WAITOK); - BDINIT(btdata, recp); - - result = BTSearchRecord(VTOF(hfsmp->hfs_catalog_vp), iterator, &btdata, NULL, NULL); - if (result) - goto exit; - - /* Turn thread record into a cnode key (in place). 
*/ - switch (recp->recordType) { - -#if CONFIG_HFS_STD - case kHFSFolderThreadRecord: - isdir = 1; - /* fall through */ - case kHFSFileThreadRecord: - keyp = (CatalogKey *)((char *)&recp->hfsThread.reserved + 6); - keyp->hfs.keyLength = kHFSCatalogKeyMinimumLength + keyp->hfs.nodeName[0]; - break; -#endif - - case kHFSPlusFolderThreadRecord: - isdir = 1; - /* fall through */ - case kHFSPlusFileThreadRecord: - keyp = (CatalogKey *)&recp->hfsPlusThread.reserved; - keyp->hfsPlus.keyLength = kHFSPlusCatalogKeyMinimumLength + - (keyp->hfsPlus.nodeName.length * 2); - break; - default: - result = ENOENT; - goto exit; - } - - if (std_hfs == 0) { - builddesc((HFSPlusCatalogKey *)keyp, cnid, 0, 0, isdir, outdescp); - } -#if CONFIG_HFS_STD - else { - HFSPlusCatalogKey * pluskey = NULL; - u_int32_t encoding; - - MALLOC(pluskey, HFSPlusCatalogKey *, sizeof(HFSPlusCatalogKey), M_TEMP, M_WAITOK); - promotekey(hfsmp, &keyp->hfs, pluskey, &encoding); - builddesc(pluskey, cnid, 0, encoding, isdir, outdescp); - FREE(pluskey, M_TEMP); - } -#endif - -exit: - FREE(recp, M_TEMP); - FREE(iterator, M_TEMP); - - return MacToVFSError(result); -} - -/* - * cat_idlookup - lookup a catalog node using a cnode id - * - * Note: The caller is responsible for releasing the output - * catalog descriptor (when supplied outdescp is non-null). 
- */ -int -cat_idlookup(struct hfsmount *hfsmp, cnid_t cnid, int allow_system_files, int wantrsrc, - struct cat_desc *outdescp, struct cat_attr *attrp, struct cat_fork *forkp) -{ - struct BTreeIterator * iterator; - FSBufferDescriptor btdata; - u_int16_t datasize; - CatalogKey * keyp; - CatalogRecord * recp; - int result; - int std_hfs; - - std_hfs = (HFSTOVCB(hfsmp)->vcbSigWord == kHFSSigWord); - - MALLOC(iterator, BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK); - bzero(iterator, sizeof(*iterator)); - buildthreadkey(cnid, std_hfs, (CatalogKey *)&iterator->key); - - MALLOC(recp, CatalogRecord *, sizeof(CatalogRecord), M_TEMP, M_WAITOK); - BDINIT(btdata, recp); - - result = BTSearchRecord(VTOF(HFSTOVCB(hfsmp)->catalogRefNum), iterator, - &btdata, &datasize, iterator); - if (result) - goto exit; - - /* Turn thread record into a cnode key (in place) */ - switch (recp->recordType) { - -#if CONFIG_HFS_STD - case kHFSFileThreadRecord: - case kHFSFolderThreadRecord: - keyp = (CatalogKey *)((char *)&recp->hfsThread.reserved + 6); - - /* check for NULL name */ - if (keyp->hfs.nodeName[0] == 0) { - result = ENOENT; - goto exit; - } - - keyp->hfs.keyLength = kHFSCatalogKeyMinimumLength + keyp->hfs.nodeName[0]; - break; -#endif - - case kHFSPlusFileThreadRecord: - case kHFSPlusFolderThreadRecord: - keyp = (CatalogKey *)&recp->hfsPlusThread.reserved; - - /* check for NULL name */ - if (keyp->hfsPlus.nodeName.length == 0) { - result = ENOENT; - goto exit; - } - - keyp->hfsPlus.keyLength = kHFSPlusCatalogKeyMinimumLength + - (keyp->hfsPlus.nodeName.length * 2); - break; - - default: - result = ENOENT; - goto exit; - } - - result = cat_lookupbykey(hfsmp, keyp, - ((allow_system_files != 0) ? HFS_LOOKUP_SYSFILE : 0), - 0, wantrsrc, outdescp, attrp, forkp, NULL); - /* No corresponding file/folder record found for a thread record, - * mark the volume inconsistent. 
- */ - if (result == 0 && outdescp) { - cnid_t dcnid = outdescp->cd_cnid; - /* - * Just for sanity's case, let's make sure that - * the key in the thread matches the key in the record. - */ - if (cnid != dcnid) { - printf("hfs: cat_idlookup: Requested cnid (%d / %08x) != dcnid (%d / %08x)\n", cnid, cnid, dcnid, dcnid); - result = ENOENT; - } - } -exit: - FREE(recp, M_TEMP); - FREE(iterator, M_TEMP); - - return MacToVFSError(result); -} - - -/* - * cat_lookupmangled - lookup a catalog node using a mangled name - */ -int -cat_lookupmangled(struct hfsmount *hfsmp, struct cat_desc *descp, int wantrsrc, - struct cat_desc *outdescp, struct cat_attr *attrp, struct cat_fork *forkp) -{ - cnid_t fileID; - u_int32_t prefixlen; - int result; - u_int8_t utf8[NAME_MAX + 1]; - u_int32_t utf8len; - u_int16_t unicode[kHFSPlusMaxFileNameChars + 1]; - size_t unicodelen; - - if (wantrsrc) - return (ENOENT); - - fileID = GetEmbeddedFileID(descp->cd_nameptr, descp->cd_namelen, &prefixlen); - if (fileID < (cnid_t)kHFSFirstUserCatalogNodeID) - return (ENOENT); - - if (fileID == hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid || - fileID == hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid || - fileID == hfsmp->hfs_jnlfileid || - fileID == hfsmp->hfs_jnlinfoblkid) { - return (ENOENT); - } - - result = cat_idlookup(hfsmp, fileID, 0, 0, outdescp, attrp, forkp); - if (result) - return (ENOENT); - /* It must be in the correct directory */ - if (descp->cd_parentcnid != outdescp->cd_parentcnid) - goto falsematch; - - /* - * Compare the mangled version of file name looked up from the - * disk with the mangled name provided by the user. Note that - * this comparison is case-sensitive, which should be fine - * since we're trying to prevent user space from constructing - * a mangled name that differs from the one they'd get from the - * file system. 
- */ - result = utf8_decodestr(outdescp->cd_nameptr, outdescp->cd_namelen, - unicode, &unicodelen, sizeof(unicode), ':', 0); - if (result) { - goto falsematch; - } - result = ConvertUnicodeToUTF8Mangled(unicodelen, unicode, - sizeof(utf8), &utf8len, utf8, fileID); - if ((result != 0) || - ((u_int16_t)descp->cd_namelen != utf8len) || - (bcmp(descp->cd_nameptr, utf8, utf8len) != 0)) { - goto falsematch; - } - - return (0); - -falsematch: - cat_releasedesc(outdescp); - return (ENOENT); -} - - -/* - * cat_lookupbykey - lookup a catalog node using a cnode key - */ -static int -cat_lookupbykey(struct hfsmount *hfsmp, CatalogKey *keyp, int flags, u_int32_t hint, int wantrsrc, - struct cat_desc *descp, struct cat_attr *attrp, struct cat_fork *forkp, cnid_t *desc_cnid) -{ - struct BTreeIterator * iterator; - FSBufferDescriptor btdata; - CatalogRecord * recp; - u_int16_t datasize; - int result; - int std_hfs; - u_int32_t ilink = 0; - cnid_t cnid = 0; - u_int32_t encoding = 0; - cnid_t parentid = 0; - - std_hfs = (HFSTOVCB(hfsmp)->vcbSigWord == kHFSSigWord); - - MALLOC(recp, CatalogRecord *, sizeof(CatalogRecord), M_TEMP, M_WAITOK); - BDINIT(btdata, recp); - MALLOC(iterator, BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK); - bzero(iterator, sizeof(*iterator)); - iterator->hint.nodeNum = hint; - bcopy(keyp, &iterator->key, sizeof(CatalogKey)); - - result = BTSearchRecord(VTOF(HFSTOVCB(hfsmp)->catalogRefNum), iterator, - &btdata, &datasize, iterator); - if (result) - goto exit; - - /* Save the cnid, parentid, and encoding now in case there's a hard link or inode */ - cnid = getcnid(recp); - if (cnid == 0) { - /* CNID of 0 is invalid. 
Mark as corrupt */ - hfs_mark_inconsistent (hfsmp, HFS_INCONSISTENCY_DETECTED); - result = EINVAL; - goto exit; - } - - if (std_hfs == 0) { - parentid = keyp->hfsPlus.parentID; - } - - encoding = getencoding(recp); - hint = iterator->hint.nodeNum; - - /* Hide the journal files (if any) */ - if ((hfsmp->jnl || ((HFSTOVCB(hfsmp)->vcbAtrb & kHFSVolumeJournaledMask) && (hfsmp->hfs_flags & HFS_READ_ONLY))) && - ((cnid == hfsmp->hfs_jnlfileid) || (cnid == hfsmp->hfs_jnlinfoblkid)) && - !(flags & HFS_LOOKUP_SYSFILE)) { - result = ERESERVEDNAME; - goto exit; - } - - if (!std_hfs && !(hfsmp->hfs_flags & HFS_CASE_SENSITIVE)) { - /* Make sure the case of the file was correct if requested */ - if (flags & HFS_LOOKUP_CASESENSITIVE) { - if (0 != cat_binarykeycompare(&keyp->hfsPlus, (HFSPlusCatalogKey *)&iterator->key)) { - result = ERESERVEDNAME; - goto exit; - } - } - } - - /* - * When a hardlink link is encountered, auto resolve it. - * - * The catalog record will change, and possibly its type. - */ - if (!std_hfs - && (attrp || forkp) - && (recp->recordType == kHFSPlusFileRecord) - && ((to_bsd_time(recp->hfsPlusFile.createDate) == (time_t)hfsmp->hfs_itime) || - (to_bsd_time(recp->hfsPlusFile.createDate) == (time_t)hfsmp->hfs_metadata_createdate))) { - int isdirlink = 0; - int isfilelink = 0; - - if ((SWAP_BE32(recp->hfsPlusFile.userInfo.fdType) == kHardLinkFileType) && - (SWAP_BE32(recp->hfsPlusFile.userInfo.fdCreator) == kHFSPlusCreator)) { - isfilelink = 1; - } else if ((recp->hfsPlusFile.flags & kHFSHasLinkChainMask) && - (SWAP_BE32(recp->hfsPlusFile.userInfo.fdType) == kHFSAliasType) && - (SWAP_BE32(recp->hfsPlusFile.userInfo.fdCreator) == kHFSAliasCreator)) { - isdirlink = 1; - } - if ((isfilelink || isdirlink) && !(flags & HFS_LOOKUP_HARDLINK)) { - ilink = recp->hfsPlusFile.hl_linkReference; - (void) cat_resolvelink(hfsmp, ilink, isdirlink, (struct HFSPlusCatalogFile *)recp); - } - } - - if (attrp != NULL) { - if (std_hfs == 0) { - getbsdattr(hfsmp, (struct 
HFSPlusCatalogFile *)recp, attrp); - if (ilink) { - /* Update the inode number for this hard link */ - attrp->ca_linkref = ilink; - } - - /* - * Set kHFSHasLinkChainBit for hard links, and reset it for all - * other items. Also set linkCount to 1 for regular files. - * - * Due to some bug (rdar://8505977), some regular files can have - * kHFSHasLinkChainBit set and linkCount more than 1 even if they - * are not really hard links. The runtime code should not consider - * these files has hard links. Therefore we reset the kHFSHasLinkChainBit - * and linkCount for regular file before we vend it out. This might - * also result in repairing the bad files on disk, if the corresponding - * file is modified and updated on disk. - */ - if (ilink) { - /* This is a hard link and the link count bit was not set */ - if (!(attrp->ca_recflags & kHFSHasLinkChainMask)) { - printf ("hfs: set hardlink bit on vol=%s cnid=%u inoid=%u\n", hfsmp->vcbVN, cnid, ilink); - attrp->ca_recflags |= kHFSHasLinkChainMask; - } - } else { - /* Make sure that this non-hard link (regular) record is not - * an inode record that was looked up and we do not end up - * reseting the hard link bit on it. 
- */ - if ((parentid != hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid) && - (parentid != hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid)) { - /* This is not a hard link or inode and the link count bit was set */ - if (attrp->ca_recflags & kHFSHasLinkChainMask) { - printf ("hfs: clear hardlink bit on vol=%s cnid=%u\n", hfsmp->vcbVN, cnid); - attrp->ca_recflags &= ~kHFSHasLinkChainMask; - } - /* This is a regular file and the link count was more than 1 */ - if (S_ISREG(attrp->ca_mode) && (attrp->ca_linkcount > 1)) { - printf ("hfs: set linkcount=1 on vol=%s cnid=%u old=%u\n", hfsmp->vcbVN, cnid, attrp->ca_linkcount); - attrp->ca_linkcount = 1; - } - } - } - } -#if CONFIG_HFS_STD - else { - struct HFSPlusCatalogFile cnoderec; - - promoteattr(hfsmp, recp, &cnoderec); - getbsdattr(hfsmp, &cnoderec, attrp); - } -#endif - } - if (forkp != NULL) { - if (isadir(recp)) { - bzero(forkp, sizeof(*forkp)); - } -#if CONFIG_HFS_STD - else if (std_hfs) { - promotefork(hfsmp, (HFSCatalogFile *)&recp->hfsFile, wantrsrc, forkp); - } -#endif - else if (wantrsrc) { - /* Convert the resource fork. */ - forkp->cf_size = recp->hfsPlusFile.resourceFork.logicalSize; - forkp->cf_new_size = 0; - forkp->cf_blocks = recp->hfsPlusFile.resourceFork.totalBlocks; - if ((hfsmp->hfc_stage == HFC_RECORDING) && - (to_bsd_time(recp->hfsPlusFile.accessDate) >= hfsmp->hfc_timebase)) { - forkp->cf_bytesread = - recp->hfsPlusFile.resourceFork.clumpSize * - HFSTOVCB(hfsmp)->blockSize; - } else { - forkp->cf_bytesread = 0; - } - forkp->cf_vblocks = 0; - bcopy(&recp->hfsPlusFile.resourceFork.extents[0], - &forkp->cf_extents[0], sizeof(HFSPlusExtentRecord)); - } else { - int i; - u_int32_t validblks; - - /* Convert the data fork. 
*/ - forkp->cf_size = recp->hfsPlusFile.dataFork.logicalSize; - forkp->cf_new_size = 0; - forkp->cf_blocks = recp->hfsPlusFile.dataFork.totalBlocks; - if ((hfsmp->hfc_stage == HFC_RECORDING) && - (to_bsd_time(recp->hfsPlusFile.accessDate) >= hfsmp->hfc_timebase)) { - forkp->cf_bytesread = - recp->hfsPlusFile.dataFork.clumpSize * - HFSTOVCB(hfsmp)->blockSize; - } else { - forkp->cf_bytesread = 0; - } - forkp->cf_vblocks = 0; - bcopy(&recp->hfsPlusFile.dataFork.extents[0], - &forkp->cf_extents[0], sizeof(HFSPlusExtentRecord)); - - /* Validate the fork's resident extents. */ - validblks = 0; - for (i = 0; i < kHFSPlusExtentDensity; ++i) { - if (forkp->cf_extents[i].startBlock + forkp->cf_extents[i].blockCount >= hfsmp->totalBlocks) { - /* Suppress any bad extents so a remove can succeed. */ - forkp->cf_extents[i].startBlock = 0; - forkp->cf_extents[i].blockCount = 0; - /* Disable writes */ - if (attrp != NULL) { - attrp->ca_mode &= S_IFMT | S_IRUSR | S_IRGRP | S_IROTH; - } - } else { - validblks += forkp->cf_extents[i].blockCount; - } - } - /* Adjust for any missing blocks. */ - if ((validblks < forkp->cf_blocks) && (forkp->cf_extents[7].blockCount == 0)) { - off_t psize; - - /* - * This is technically a volume corruption. - * If the total number of blocks calculated by iterating + summing - * the extents in the resident extent records, is less than that - * which is reported in the catalog entry, we should force a fsck. - * Only modifying ca_blocks here is not guaranteed to make it out - * to disk; it is a runtime-only field. - * - * Note that we could have gotten into this state if we had invalid ranges - * that existed in borrowed blocks that somehow made it out to disk. - * The cnode's on disk block count should never be greater - * than that which is in its extent records. 
- */ - - (void) hfs_mark_inconsistent (hfsmp, HFS_INCONSISTENCY_DETECTED); - - forkp->cf_blocks = validblks; - if (attrp != NULL) { - attrp->ca_blocks = validblks + recp->hfsPlusFile.resourceFork.totalBlocks; - } - psize = (off_t)validblks * (off_t)hfsmp->blockSize; - if (psize < forkp->cf_size) { - forkp->cf_size = psize; - } - - } - } - } - if (descp != NULL) { - HFSPlusCatalogKey * pluskey = NULL; - - if (std_hfs == 0) { - pluskey = (HFSPlusCatalogKey *)&iterator->key; - } -#if CONFIG_HFS_STD - else { - MALLOC(pluskey, HFSPlusCatalogKey *, sizeof(HFSPlusCatalogKey), M_TEMP, M_WAITOK); - promotekey(hfsmp, (HFSCatalogKey *)&iterator->key, pluskey, &encoding); - - } -#endif - - builddesc(pluskey, cnid, hint, encoding, isadir(recp), descp); - -#if CONFIG_HFS_STD - if (std_hfs) { - FREE(pluskey, M_TEMP); - } -#endif - - } - - if (desc_cnid != NULL) { - *desc_cnid = cnid; - } -exit: - FREE(iterator, M_TEMP); - FREE(recp, M_TEMP); - - return MacToVFSError(result); -} - - -/* - * cat_create - create a node in the catalog - * - * NOTE: both the catalog file and attribute file locks must - * be held before calling this function. - * - * The caller is responsible for releasing the output - * catalog descriptor (when supplied outdescp is non-null). - */ -int -cat_create(struct hfsmount *hfsmp, cnid_t new_fileid, struct cat_desc *descp, struct cat_attr *attrp, - struct cat_desc *out_descp) -{ - FCB * fcb; - struct btobj * bto; - FSBufferDescriptor btdata; - u_int32_t datalen; - int std_hfs; - int result = 0; - u_int32_t encoding = kTextEncodingMacRoman; - int modeformat; - - modeformat = attrp->ca_mode & S_IFMT; - - fcb = hfsmp->hfs_catalog_cp->c_datafork; - std_hfs = (hfsmp->hfs_flags & HFS_STANDARD); - - /* The caller is expected to reserve a CNID before calling this function! 
*/ - - /* Get space for iterator, key and data */ - MALLOC(bto, struct btobj *, sizeof(struct btobj), M_TEMP, M_WAITOK); - bto->iterator.hint.nodeNum = 0; - - result = buildkey(hfsmp, descp, &bto->key, 0); - if (result) - goto exit; - - if (!std_hfs) { - encoding = hfs_pickencoding(bto->key.nodeName.unicode, - bto->key.nodeName.length); - hfs_setencodingbits(hfsmp, encoding); - } - - /* - * Insert the thread record first - */ - if (!std_hfs || (modeformat == S_IFDIR)) { - datalen = buildthread((void*)&bto->key, &bto->data, std_hfs, - S_ISDIR(attrp->ca_mode)); - btdata.bufferAddress = &bto->data; - btdata.itemSize = datalen; - btdata.itemCount = 1; - - /* Caller asserts the following: - * 1) this CNID is not in use by any orphaned EAs - * 2) There are no lingering cnodes (removed on-disk but still in-core) with this CNID - * 3) There are no thread or catalog records for this ID - */ - buildthreadkey(new_fileid, std_hfs, (CatalogKey *) &bto->iterator.key); - result = BTInsertRecord(fcb, &bto->iterator, &btdata, datalen); - if (result) { - goto exit; - } - } - - /* - * Now insert the file/directory record - */ - buildrecord(attrp, new_fileid, std_hfs, encoding, &bto->data, &datalen); - btdata.bufferAddress = &bto->data; - btdata.itemSize = datalen; - btdata.itemCount = 1; - - bcopy(&bto->key, &bto->iterator.key, sizeof(bto->key)); - - result = BTInsertRecord(fcb, &bto->iterator, &btdata, datalen); - if (result) { - if (result == btExists) - result = EEXIST; - - /* Back out the thread record */ - if (!std_hfs || S_ISDIR(attrp->ca_mode)) { - buildthreadkey(new_fileid, std_hfs, (CatalogKey *)&bto->iterator.key); - if (BTDeleteRecord(fcb, &bto->iterator)) { - /* Error on deleting extra thread record, mark - * volume inconsistent - */ - printf ("hfs: cat_create() failed to delete thread record id=%u on vol=%s\n", new_fileid, hfsmp->vcbVN); - hfs_mark_inconsistent(hfsmp, HFS_ROLLBACK_FAILED); - } - } - goto exit; - } - - /* - * Insert was successful, update name, parent and 
volume - */ - if (out_descp != NULL) { - HFSPlusCatalogKey * pluskey = NULL; - - if (std_hfs == 0) { - pluskey = (HFSPlusCatalogKey *)&bto->iterator.key; - } -#if CONFIG_HFS_STD - else { - MALLOC(pluskey, HFSPlusCatalogKey *, sizeof(HFSPlusCatalogKey), M_TEMP, M_WAITOK); - promotekey(hfsmp, (HFSCatalogKey *)&bto->iterator.key, pluskey, &encoding); - } -#endif - - builddesc(pluskey, new_fileid, bto->iterator.hint.nodeNum, - encoding, S_ISDIR(attrp->ca_mode), out_descp); -#if CONFIG_HFS_STD - if (std_hfs) { - FREE(pluskey, M_TEMP); - } -#endif - - } - attrp->ca_fileid = new_fileid; - -exit: - (void) BTFlushPath(fcb); - FREE(bto, M_TEMP); - - return MacToVFSError(result); -} - - -/* - * cnode_rename - rename a catalog node - * - * Assumes that the target's directory exists. - * - * Order of B-tree operations: - * 1. BTSearchRecord(from_cnode, &data); - * 2. BTInsertRecord(to_cnode, &data); - * 3. BTDeleteRecord(from_cnode); - * 4. BTDeleteRecord(from_thread); - * 5. BTInsertRecord(to_thread); - * - * Note: The caller is responsible for releasing the output - * catalog descriptor (when supplied out_cdp is non-null). 
- */ -int -cat_rename ( - struct hfsmount * hfsmp, - struct cat_desc * from_cdp, - struct cat_desc * todir_cdp, - struct cat_desc * to_cdp, - struct cat_desc * out_cdp ) -{ - struct BTreeIterator * to_iterator = NULL; - struct BTreeIterator * from_iterator = NULL; - FSBufferDescriptor btdata; - CatalogRecord * recp = NULL; - HFSPlusCatalogKey * to_key; - ExtendedVCB * vcb; - FCB * fcb; - u_int16_t datasize; - int result = 0; - int sourcegone = 0; - int skipthread = 0; - int directory = from_cdp->cd_flags & CD_ISDIR; - int is_dirlink = 0; - int std_hfs; - u_int32_t encoding = 0; - - vcb = HFSTOVCB(hfsmp); - fcb = GetFileControlBlock(vcb->catalogRefNum); - std_hfs = (vcb->vcbSigWord == kHFSSigWord); - - if (from_cdp->cd_namelen == 0 || to_cdp->cd_namelen == 0) - return (EINVAL); - - MALLOC(from_iterator, BTreeIterator *, sizeof(*from_iterator), M_TEMP, M_WAITOK); - bzero(from_iterator, sizeof(*from_iterator)); - if ((result = buildkey(hfsmp, from_cdp, (HFSPlusCatalogKey *)&from_iterator->key, 0))) - goto exit; - - MALLOC(to_iterator, BTreeIterator *, sizeof(*to_iterator), M_TEMP, M_WAITOK); - bzero(to_iterator, sizeof(*to_iterator)); - if ((result = buildkey(hfsmp, to_cdp, (HFSPlusCatalogKey *)&to_iterator->key, 0))) - goto exit; - - to_key = (HFSPlusCatalogKey *)&to_iterator->key; - MALLOC(recp, CatalogRecord *, sizeof(CatalogRecord), M_TEMP, M_WAITOK); - BDINIT(btdata, recp); - - /* - * When moving a directory, make sure its a valid move. 
- */ - if (directory && (from_cdp->cd_parentcnid != to_cdp->cd_parentcnid)) { - struct BTreeIterator *dir_iterator = NULL; - - cnid_t cnid = from_cdp->cd_cnid; - cnid_t pathcnid = todir_cdp->cd_parentcnid; - - /* First check the obvious ones */ - if (cnid == fsRtDirID || - cnid == to_cdp->cd_parentcnid || - cnid == pathcnid) { - result = EINVAL; - goto exit; - } - /* now allocate the dir_iterator */ - MALLOC (dir_iterator, struct BTreeIterator*, sizeof(struct BTreeIterator), M_TEMP, M_WAITOK); - if (dir_iterator == NULL) { - return ENOMEM; - } - bzero(dir_iterator, sizeof(*dir_iterator)); - - /* - * Traverse destination path all the way back to the root - * making sure that source directory is not encountered. - * - */ - while (pathcnid > fsRtDirID) { - buildthreadkey(pathcnid, std_hfs, (CatalogKey *)&dir_iterator->key); - result = BTSearchRecord(fcb, dir_iterator, &btdata, &datasize, NULL); - if (result) { - FREE(dir_iterator, M_TEMP); - goto exit; - } - pathcnid = getparentcnid(recp); - if (pathcnid == cnid || pathcnid == 0) { - result = EINVAL; - FREE(dir_iterator, M_TEMP); - goto exit; - } - } - FREE(dir_iterator, M_TEMP); - } - - /* - * Step 1: Find cnode data at old location - */ - result = BTSearchRecord(fcb, from_iterator, &btdata, - &datasize, from_iterator); - if (result) { - if (std_hfs || (result != btNotFound)) - goto exit; - - struct cat_desc temp_desc; - - /* Probably the node has mangled name */ - result = cat_lookupmangled(hfsmp, from_cdp, 0, &temp_desc, NULL, NULL); - if (result) - goto exit; - - /* The file has mangled name. 
Search the cnode data using full name */ - bzero(from_iterator, sizeof(*from_iterator)); - result = buildkey(hfsmp, &temp_desc, (HFSPlusCatalogKey *)&from_iterator->key, 0); - if (result) { - cat_releasedesc(&temp_desc); - goto exit; - } - - result = BTSearchRecord(fcb, from_iterator, &btdata, &datasize, from_iterator); - if (result) { - cat_releasedesc(&temp_desc); - goto exit; - } - - cat_releasedesc(&temp_desc); - } - - /* Check if the source is directory hard link. We do not change - * directory flag because it is later used to initialize result descp - */ - if ((!std_hfs) && - (directory) && - (recp->recordType == kHFSPlusFileRecord) && - (recp->hfsPlusFile.flags & kHFSHasLinkChainMask)) { - is_dirlink = 1; - } - - /* - * Update the text encoding (on disk and in descriptor). - * - * Note that hardlink inodes don't require a text encoding hint. - */ - if (!std_hfs && - todir_cdp->cd_parentcnid != hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid && - todir_cdp->cd_parentcnid != hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid) { - encoding = hfs_pickencoding(to_key->nodeName.unicode, to_key->nodeName.length); - hfs_setencodingbits(hfsmp, encoding); - recp->hfsPlusFile.textEncoding = encoding; - if (out_cdp) - out_cdp->cd_encoding = encoding; - } - -#if CONFIG_HFS_STD - if (std_hfs && !directory && - !(recp->hfsFile.flags & kHFSThreadExistsMask)) { - skipthread = 1; - } -#endif - -#if 0 - /* - * If the keys are identical then there's nothing left to do! 
- * - * update the hint and exit - * - */ - if (std_hfs && hfskeycompare(to_key, iter->key) == 0) - goto exit; - if (!std_hfs && hfspluskeycompare(to_key, iter->key) == 0) - goto exit; -#endif - - /* Step 2: Insert cnode at new location */ - result = BTInsertRecord(fcb, to_iterator, &btdata, datasize); - if (result == btExists) { - int fromtype = recp->recordType; - cnid_t cnid = 0; - - if (from_cdp->cd_parentcnid != to_cdp->cd_parentcnid) - goto exit; /* EEXIST */ - - /* Find cnode data at new location */ - result = BTSearchRecord(fcb, to_iterator, &btdata, &datasize, NULL); - if (result) - goto exit; - - /* Get the CNID after calling searchrecord */ - cnid = getcnid (recp); - if (cnid == 0) { - hfs_mark_inconsistent(hfsmp, HFS_INCONSISTENCY_DETECTED); - result = EINVAL; - goto exit; - } - - if ((fromtype != recp->recordType) || - (from_cdp->cd_cnid != cnid)) { - result = EEXIST; - goto exit; /* EEXIST */ - } - /* The old name is a case variant and must be removed */ - result = BTDeleteRecord(fcb, from_iterator); - if (result) - goto exit; - - /* Insert cnode (now that case duplicate is gone) */ - result = BTInsertRecord(fcb, to_iterator, &btdata, datasize); - if (result) { - /* Try and restore original before leaving */ - // XXXdbg - #if 1 - { - int err; - err = BTInsertRecord(fcb, from_iterator, &btdata, datasize); - if (err) { - printf("hfs: cat_create: could not undo (BTInsert = %d)\n", err); - hfs_mark_inconsistent(hfsmp, HFS_ROLLBACK_FAILED); - result = err; - goto exit; - } - } - #else - (void) BTInsertRecord(fcb, from_iterator, &btdata, datasize); - #endif - goto exit; - } - sourcegone = 1; - } - if (result) - goto exit; - - /* Step 3: Remove cnode from old location */ - if (!sourcegone) { - result = BTDeleteRecord(fcb, from_iterator); - if (result) { - /* Try and delete new record before leaving */ - // XXXdbg - #if 1 - { - int err; - err = BTDeleteRecord(fcb, to_iterator); - if (err) { - printf("hfs: cat_create: could not undo (BTDelete = %d)\n", err); - 
hfs_mark_inconsistent(hfsmp, HFS_ROLLBACK_FAILED); - result = err; - goto exit; - } - } - #else - (void) BTDeleteRecord(fcb, to_iterator); - #endif - goto exit; - } - } - - /* #### POINT OF NO RETURN #### */ - - /* - * Step 4: Remove cnode's old thread record - */ - buildthreadkey(from_cdp->cd_cnid, std_hfs, (CatalogKey *)&from_iterator->key); - (void) BTDeleteRecord(fcb, from_iterator); - - /* - * Step 5: Insert cnode's new thread record - * (optional for HFS files) - */ - if (!skipthread) { - /* For directory hard links, always create a file thread - * record. For everything else, use the directory flag. - */ - if (is_dirlink) { - datasize = buildthread(&to_iterator->key, recp, std_hfs, false); - } else { - datasize = buildthread(&to_iterator->key, recp, std_hfs, directory); - } - btdata.itemSize = datasize; - buildthreadkey(from_cdp->cd_cnid, std_hfs, (CatalogKey *)&from_iterator->key); - result = BTInsertRecord(fcb, from_iterator, &btdata, datasize); - } - - if (out_cdp) { - HFSPlusCatalogKey * pluskey = NULL; - - if (std_hfs == 0) { - pluskey = (HFSPlusCatalogKey *)&to_iterator->key; - } -#if CONFIG_HFS_STD - else { - MALLOC(pluskey, HFSPlusCatalogKey *, sizeof(HFSPlusCatalogKey), M_TEMP, M_WAITOK); - promotekey(hfsmp, (HFSCatalogKey *)&to_iterator->key, pluskey, &encoding); - - /* Save the real encoding hint in the Finder Info (field 4). 
*/ - if (directory && from_cdp->cd_cnid == kHFSRootFolderID) { - u_int32_t realhint; - - realhint = hfs_pickencoding(pluskey->nodeName.unicode, pluskey->nodeName.length); - vcb->vcbFndrInfo[4] = SET_HFS_TEXT_ENCODING(realhint); - } - } -#endif - - builddesc(pluskey, from_cdp->cd_cnid, to_iterator->hint.nodeNum, - encoding, directory, out_cdp); -#if CONFIG_HFS_STD - if (std_hfs) { - FREE(pluskey, M_TEMP); - } -#endif - - } -exit: - (void) BTFlushPath(fcb); - if (from_iterator) - FREE(from_iterator, M_TEMP); - if (to_iterator) - FREE(to_iterator, M_TEMP); - if (recp) - FREE(recp, M_TEMP); - return MacToVFSError(result); -} - - -/* - * cat_delete - delete a node from the catalog - * - * Order of B-tree operations: - * 1. BTDeleteRecord(cnode); - * 2. BTDeleteRecord(thread); - * 3. BTUpdateRecord(parent); - */ -int -cat_delete(struct hfsmount *hfsmp, struct cat_desc *descp, struct cat_attr *attrp) -{ - FCB * fcb; - BTreeIterator *iterator; - cnid_t cnid; - int std_hfs; - int result; - - fcb = hfsmp->hfs_catalog_cp->c_datafork; - std_hfs = (hfsmp->hfs_flags & HFS_STANDARD); - - /* Preflight check: - * - * The root directory cannot be deleted - * A directory must be empty - * A file must be zero length (no blocks) - */ - if (descp->cd_cnid < kHFSFirstUserCatalogNodeID || - descp->cd_parentcnid == kHFSRootParentID) - return (EINVAL); - - /* XXX Preflight Missing */ - - /* Borrow the btcb iterator since we have an exclusive catalog lock. */ - iterator = &((BTreeControlBlockPtr)(fcb->ff_sysfileinfo))->iterator; - iterator->hint.nodeNum = 0; - - /* - * Derive a key from either the file ID (for a virtual inode) - * or the descriptor. 
- */ - if (descp->cd_namelen == 0) { - result = getkey(hfsmp, attrp->ca_fileid, (CatalogKey *)&iterator->key); - cnid = attrp->ca_fileid; - } else { - result = buildkey(hfsmp, descp, (HFSPlusCatalogKey *)&iterator->key, 0); - cnid = descp->cd_cnid; - } - if (result) - goto exit; - - /* Delete record */ - result = BTDeleteRecord(fcb, iterator); - if (result) { - if (std_hfs || (result != btNotFound)) - goto exit; - - struct cat_desc temp_desc; - - /* Probably the node has mangled name */ - result = cat_lookupmangled(hfsmp, descp, 0, &temp_desc, attrp, NULL); - if (result) - goto exit; - - /* The file has mangled name. Delete the file using full name */ - bzero(iterator, sizeof(*iterator)); - result = buildkey(hfsmp, &temp_desc, (HFSPlusCatalogKey *)&iterator->key, 0); - cnid = temp_desc.cd_cnid; - if (result) { - cat_releasedesc(&temp_desc); - goto exit; - } - - result = BTDeleteRecord(fcb, iterator); - if (result) { - cat_releasedesc(&temp_desc); - goto exit; - } - - cat_releasedesc(&temp_desc); - } - - /* Delete thread record. On error, mark volume inconsistent */ - buildthreadkey(cnid, std_hfs, (CatalogKey *)&iterator->key); - if (BTDeleteRecord(fcb, iterator)) { - if (!std_hfs) { - printf ("hfs: cat_delete() failed to delete thread record id=%u on vol=%s\n", cnid, hfsmp->vcbVN); - hfs_mark_inconsistent(hfsmp, HFS_OP_INCOMPLETE); - } - } - -exit: - (void) BTFlushPath(fcb); - - return MacToVFSError(result); -} - - -/* - * cat_update_internal - update the catalog node described by descp - * using the data from attrp and forkp. - * If update_hardlink is true, the hard link catalog record is updated - * and not the inode catalog record. 
- */ -static int -cat_update_internal(struct hfsmount *hfsmp, int update_hardlink, struct cat_desc *descp, struct cat_attr *attrp, - const struct cat_fork *dataforkp, const struct cat_fork *rsrcforkp) -{ - FCB * fcb; - BTreeIterator * iterator; - struct update_state state; - int result; - - fcb = hfsmp->hfs_catalog_cp->c_datafork; - - state.s_desc = descp; - state.s_attr = attrp; - state.s_datafork = dataforkp; - state.s_rsrcfork = rsrcforkp; - state.s_hfsmp = hfsmp; - - /* Borrow the btcb iterator since we have an exclusive catalog lock. */ - iterator = &((BTreeControlBlockPtr)(fcb->ff_sysfileinfo))->iterator; - - /* - * For open-deleted files we need to do a lookup by cnid - * (using thread rec). - * - * For hard links and if not requested by caller, the target - * of the update is the inode itself (not the link record) - * so a lookup by fileid (i.e. thread rec) is needed. - */ - if ((update_hardlink == false) && - ((descp->cd_cnid != attrp->ca_fileid) || - (descp->cd_namelen == 0) || - (attrp->ca_recflags & kHFSHasLinkChainMask))) { - result = getkey(hfsmp, attrp->ca_fileid, (CatalogKey *)&iterator->key); - } else { - result = buildkey(hfsmp, descp, (HFSPlusCatalogKey *)&iterator->key, 0); - } - if (result) - goto exit; - - /* Pass a node hint */ - iterator->hint.nodeNum = descp->cd_hint; - - result = BTUpdateRecord(fcb, iterator, - (IterateCallBackProcPtr)catrec_update, &state); - if (result) - goto exit; - - /* Update the node hint. */ - descp->cd_hint = iterator->hint.nodeNum; - -exit: - (void) BTFlushPath(fcb); - - return MacToVFSError(result); -} - -/* - * cat_update - update the catalog node described by descp - * using the data from attrp and forkp. 
- */ -int -cat_update(struct hfsmount *hfsmp, struct cat_desc *descp, struct cat_attr *attrp, - const struct cat_fork *dataforkp, const struct cat_fork *rsrcforkp) -{ - return cat_update_internal(hfsmp, false, descp, attrp, dataforkp, rsrcforkp); -} - -/* - * catrec_update - Update the fields of a catalog record - * This is called from within BTUpdateRecord. - */ -static int -catrec_update(const CatalogKey *ckp, CatalogRecord *crp, struct update_state *state) -{ - struct cat_desc *descp; - struct cat_attr *attrp; - const struct cat_fork *forkp; - struct hfsmount *hfsmp; - long blksize; - - descp = state->s_desc; - attrp = state->s_attr; - hfsmp = state->s_hfsmp; - blksize = HFSTOVCB(hfsmp)->blockSize; - - switch (crp->recordType) { - -#if CONFIG_HFS_STD - case kHFSFolderRecord: { - HFSCatalogFolder *dir; - - dir = (struct HFSCatalogFolder *)crp; - /* Do a quick sanity check */ - if ((ckp->hfs.parentID != descp->cd_parentcnid) || - (dir->folderID != descp->cd_cnid)) - return (btNotFound); - dir->valence = attrp->ca_entries; - dir->createDate = UTCToLocal(to_hfs_time(attrp->ca_itime)); - dir->modifyDate = UTCToLocal(to_hfs_time(attrp->ca_mtime)); - dir->backupDate = UTCToLocal(to_hfs_time(attrp->ca_btime)); - bcopy(&attrp->ca_finderinfo[0], &dir->userInfo, 16); - bcopy(&attrp->ca_finderinfo[16], &dir->finderInfo, 16); - break; - } - case kHFSFileRecord: { - HFSCatalogFile *file; - int i; - - file = (struct HFSCatalogFile *)crp; - /* Do a quick sanity check */ - if ((ckp->hfs.parentID != descp->cd_parentcnid) || - (file->fileID != attrp->ca_fileid)) - return (btNotFound); - file->createDate = UTCToLocal(to_hfs_time(attrp->ca_itime)); - file->modifyDate = UTCToLocal(to_hfs_time(attrp->ca_mtime)); - file->backupDate = UTCToLocal(to_hfs_time(attrp->ca_btime)); - bcopy(&attrp->ca_finderinfo[0], &file->userInfo, 16); - bcopy(&attrp->ca_finderinfo[16], &file->finderInfo, 16); - if (state->s_rsrcfork) { - forkp = state->s_rsrcfork; - file->rsrcLogicalSize = forkp->cf_size; - 
file->rsrcPhysicalSize = forkp->cf_blocks * blksize; - for (i = 0; i < kHFSExtentDensity; ++i) { - file->rsrcExtents[i].startBlock = - (u_int16_t)forkp->cf_extents[i].startBlock; - file->rsrcExtents[i].blockCount = - (u_int16_t)forkp->cf_extents[i].blockCount; - } - } - if (state->s_datafork) { - forkp = state->s_datafork; - file->dataLogicalSize = forkp->cf_size; - file->dataPhysicalSize = forkp->cf_blocks * blksize; - for (i = 0; i < kHFSExtentDensity; ++i) { - file->dataExtents[i].startBlock = - (u_int16_t)forkp->cf_extents[i].startBlock; - file->dataExtents[i].blockCount = - (u_int16_t)forkp->cf_extents[i].blockCount; - } - } - - /* Synchronize the lock state */ - if (attrp->ca_flags & (SF_IMMUTABLE | UF_IMMUTABLE)) - file->flags |= kHFSFileLockedMask; - else - file->flags &= ~kHFSFileLockedMask; - break; - } -#endif - - case kHFSPlusFolderRecord: { - HFSPlusCatalogFolder *dir; - - dir = (struct HFSPlusCatalogFolder *)crp; - /* Do a quick sanity check */ - if (dir->folderID != attrp->ca_fileid) { - printf("hfs: catrec_update: id %d != %d, vol=%s\n", dir->folderID, attrp->ca_fileid, hfsmp->vcbVN); - return (btNotFound); - } - dir->flags = attrp->ca_recflags; - dir->valence = attrp->ca_entries; - dir->createDate = to_hfs_time(attrp->ca_itime); - dir->contentModDate = to_hfs_time(attrp->ca_mtime); - dir->backupDate = to_hfs_time(attrp->ca_btime); - dir->accessDate = to_hfs_time(attrp->ca_atime); - attrp->ca_atimeondisk = attrp->ca_atime; - dir->attributeModDate = to_hfs_time(attrp->ca_ctime); - /* Note: directory hardlink inodes don't require a text encoding hint. */ - if (ckp->hfsPlus.parentID != hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid) { - dir->textEncoding = descp->cd_encoding; - } - dir->folderCount = attrp->ca_dircount; - bcopy(&attrp->ca_finderinfo[0], &dir->userInfo, 32); - /* - * Update the BSD Info if it was already initialized on - * disk or if the runtime values have been modified. 
- * - * If the BSD info was already initialized, but - * MNT_UNKNOWNPERMISSIONS is set, then the runtime IDs are - * probably different than what was on disk. We don't want - * to overwrite the on-disk values (so if we turn off - * MNT_UNKNOWNPERMISSIONS, the old IDs get used again). - * This way, we can still change fields like the mode or - * dates even when MNT_UNKNOWNPERMISSIONS is set. - * - * Note that if MNT_UNKNOWNPERMISSIONS is set, hfs_chown - * won't change the uid or gid from their defaults. So, if - * the BSD info wasn't set, and the runtime values are not - * default, then what changed was the mode or flags. We - * have to set the uid and gid to something, so use the - * supplied values (which will be default), which has the - * same effect as creating a new file while - * MNT_UNKNOWNPERMISSIONS is set. - */ - if ((dir->bsdInfo.fileMode != 0) || - (attrp->ca_flags != 0) || - (attrp->ca_uid != hfsmp->hfs_uid) || - (attrp->ca_gid != hfsmp->hfs_gid) || - ((attrp->ca_mode & ALLPERMS) != - (hfsmp->hfs_dir_mask & ACCESSPERMS))) { - if ((dir->bsdInfo.fileMode == 0) || - (((unsigned int)vfs_flags(HFSTOVFS(hfsmp))) & MNT_UNKNOWNPERMISSIONS) == 0) { - dir->bsdInfo.ownerID = attrp->ca_uid; - dir->bsdInfo.groupID = attrp->ca_gid; - } - dir->bsdInfo.ownerFlags = attrp->ca_flags & 0x000000FF; - dir->bsdInfo.adminFlags = attrp->ca_flags >> 16; - dir->bsdInfo.fileMode = attrp->ca_mode; - /* A directory hardlink has a link count. 
*/ - if (attrp->ca_linkcount > 1 || dir->hl_linkCount > 1) { - dir->hl_linkCount = attrp->ca_linkcount; - } - } - break; - } - case kHFSPlusFileRecord: { - HFSPlusCatalogFile *file; - int is_dirlink; - - file = (struct HFSPlusCatalogFile *)crp; - /* Do a quick sanity check */ - if (file->fileID != attrp->ca_fileid) - return (btNotFound); - file->flags = attrp->ca_recflags; - file->createDate = to_hfs_time(attrp->ca_itime); - file->contentModDate = to_hfs_time(attrp->ca_mtime); - file->backupDate = to_hfs_time(attrp->ca_btime); - file->accessDate = to_hfs_time(attrp->ca_atime); - attrp->ca_atimeondisk = attrp->ca_atime; - file->attributeModDate = to_hfs_time(attrp->ca_ctime); - /* - * Note: file hardlink inodes don't require a text encoding - * hint, but they do have a first link value. - */ - if (ckp->hfsPlus.parentID == hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid) { - file->hl_firstLinkID = attrp->ca_firstlink; - } else { - file->textEncoding = descp->cd_encoding; - } - bcopy(&attrp->ca_finderinfo[0], &file->userInfo, 32); - /* - * Update the BSD Info if it was already initialized on - * disk or if the runtime values have been modified. - * - * If the BSD info was already initialized, but - * MNT_UNKNOWNPERMISSIONS is set, then the runtime IDs are - * probably different than what was on disk. We don't want - * to overwrite the on-disk values (so if we turn off - * MNT_UNKNOWNPERMISSIONS, the old IDs get used again). - * This way, we can still change fields like the mode or - * dates even when MNT_UNKNOWNPERMISSIONS is set. - * - * Note that if MNT_UNKNOWNPERMISSIONS is set, hfs_chown - * won't change the uid or gid from their defaults. So, if - * the BSD info wasn't set, and the runtime values are not - * default, then what changed was the mode or flags. We - * have to set the uid and gid to something, so use the - * supplied values (which will be default), which has the - * same effect as creating a new file while - * MNT_UNKNOWNPERMISSIONS is set. 
- * - * Do not modify bsdInfo for directory hard link records. - * They are set during creation and are not modifiable, so just - * leave them alone. - */ - is_dirlink = (file->flags & kHFSHasLinkChainMask) && - (SWAP_BE32(file->userInfo.fdType) == kHFSAliasType) && - (SWAP_BE32(file->userInfo.fdCreator) == kHFSAliasCreator); - - if (!is_dirlink && - ((file->bsdInfo.fileMode != 0) || - (attrp->ca_flags != 0) || - (attrp->ca_uid != hfsmp->hfs_uid) || - (attrp->ca_gid != hfsmp->hfs_gid) || - ((attrp->ca_mode & ALLPERMS) != - (hfsmp->hfs_file_mask & ACCESSPERMS)))) { - if ((file->bsdInfo.fileMode == 0) || - (((unsigned int)vfs_flags(HFSTOVFS(hfsmp))) & MNT_UNKNOWNPERMISSIONS) == 0) { - file->bsdInfo.ownerID = attrp->ca_uid; - file->bsdInfo.groupID = attrp->ca_gid; - } - file->bsdInfo.ownerFlags = attrp->ca_flags & 0x000000FF; - file->bsdInfo.adminFlags = attrp->ca_flags >> 16; - file->bsdInfo.fileMode = attrp->ca_mode; - } - if (state->s_rsrcfork) { - forkp = state->s_rsrcfork; - file->resourceFork.logicalSize = forkp->cf_size; - file->resourceFork.totalBlocks = forkp->cf_blocks; - bcopy(&forkp->cf_extents[0], &file->resourceFork.extents, - sizeof(HFSPlusExtentRecord)); - /* Push blocks read to disk */ - file->resourceFork.clumpSize = - howmany(forkp->cf_bytesread, blksize); - } - if (state->s_datafork) { - forkp = state->s_datafork; - file->dataFork.logicalSize = forkp->cf_size; - file->dataFork.totalBlocks = forkp->cf_blocks; - bcopy(&forkp->cf_extents[0], &file->dataFork.extents, - sizeof(HFSPlusExtentRecord)); - /* Push blocks read to disk */ - file->dataFork.clumpSize = - howmany(forkp->cf_bytesread, blksize); - } - - if ((file->resourceFork.extents[0].startBlock != 0) && - (file->resourceFork.extents[0].startBlock == - file->dataFork.extents[0].startBlock)) { - panic("hfs: catrec_update: rsrc fork == data fork"); - } - - /* Synchronize the lock state */ - if (attrp->ca_flags & (SF_IMMUTABLE | UF_IMMUTABLE)) - file->flags |= kHFSFileLockedMask; - else - 
file->flags &= ~kHFSFileLockedMask; - - /* Push out special field if necessary */ - if (S_ISBLK(attrp->ca_mode) || S_ISCHR(attrp->ca_mode)) { - file->bsdInfo.special.rawDevice = attrp->ca_rdev; - } - else { - /* - * Protect against the degenerate case where the descriptor contains the - * raw inode ID in its CNID field. If the HFSPlusCatalogFile record indicates - * the linkcount was greater than 1 (the default value), then it must have become - * a hardlink. In this case, update the linkcount from the cat_attr passed in. - */ - if ((descp->cd_cnid != attrp->ca_fileid) || (attrp->ca_linkcount > 1 ) || - (file->hl_linkCount > 1)) { - file->hl_linkCount = attrp->ca_linkcount; - } - } - break; - } - default: - return (btNotFound); - } - return (0); -} - -/* This function sets kHFSHasChildLinkBit in a directory hierarchy in the - * catalog btree of given cnid by walking up the parent chain till it reaches - * either the root folder, or the private metadata directory for storing - * directory hard links. This function updates the corresponding in-core - * cnode, if any, and the directory record in the catalog btree. - * On success, returns zero. On failure, returns non-zero value. - */ -__private_extern__ -int -cat_set_childlinkbit(struct hfsmount *hfsmp, cnid_t cnid) -{ - int retval = 0; - int lockflags = 0; - struct cat_desc desc; - struct cat_attr attr; - - while ((cnid != kHFSRootFolderID) && (cnid != kHFSRootParentID) && - (cnid != hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid)) { - /* Update the bit in corresponding cnode, if any, in the hash. - * If the cnode has the bit already set, stop the traversal. - */ - retval = hfs_chash_set_childlinkbit(hfsmp, cnid); - if (retval == 0) { - break; - } - - /* Update the catalog record on disk if either cnode was not - * found in the hash, or if a cnode was found and the cnode - * did not have the bit set previously. 
- */ - retval = hfs_start_transaction(hfsmp); - if (retval) { - break; - } - lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK); - - /* Look up our catalog folder record */ - retval = cat_idlookup(hfsmp, cnid, 0, 0, &desc, &attr, NULL); - if (retval) { - hfs_systemfile_unlock(hfsmp, lockflags); - hfs_end_transaction(hfsmp); - break; - } - - /* Update the bit in the catalog record */ - attr.ca_recflags |= kHFSHasChildLinkMask; - retval = cat_update(hfsmp, &desc, &attr, NULL, NULL); - if (retval) { - hfs_systemfile_unlock(hfsmp, lockflags); - hfs_end_transaction(hfsmp); - cat_releasedesc(&desc); - break; - } - - hfs_systemfile_unlock(hfsmp, lockflags); - hfs_end_transaction(hfsmp); - - cnid = desc.cd_parentcnid; - cat_releasedesc(&desc); - } - - return retval; -} - -/* This function traverses the parent directory hierarchy from the given - * directory to one level below root directory and checks if any of its - * ancestors is - - * 1. A directory hard link. - * 2. The 'pointed at' directory. - * If any of these conditions fail or an internal error is encountered - * during look up of the catalog record, this function returns non-zero value. 
- */ -__private_extern__ -int -cat_check_link_ancestry(struct hfsmount *hfsmp, cnid_t cnid, cnid_t pointed_at_cnid) -{ - HFSPlusCatalogKey *keyp; - BTreeIterator *ip; - FSBufferDescriptor btdata; - HFSPlusCatalogFolder folder; - FCB *fcb; - int invalid; - int result; - - invalid = 0; - BDINIT(btdata, &folder); - MALLOC(ip, BTreeIterator *, sizeof(*ip), M_TEMP, M_WAITOK); - keyp = (HFSPlusCatalogKey *)&ip->key; - fcb = hfsmp->hfs_catalog_cp->c_datafork; - - while (cnid != kHFSRootParentID) { - /* Check if the 'pointed at' directory is an ancestor */ - if (pointed_at_cnid == cnid) { - invalid = 1; - break; - } - if ((result = getkey(hfsmp, cnid, (CatalogKey *)keyp))) { - printf("hfs: cat_check_link_ancestry: getkey failed id=%u, vol=%s\n", cnid, hfsmp->vcbVN); - invalid = 1; /* On errors, assume an invalid parent */ - break; - } - if ((result = BTSearchRecord(fcb, ip, &btdata, NULL, NULL))) { - printf("hfs: cat_check_link_ancestry: cannot find id=%u, vol=%s\n", cnid, hfsmp->vcbVN); - invalid = 1; /* On errors, assume an invalid parent */ - break; - } - /* Check if this ancestor is a directory hard link */ - if (folder.flags & kHFSHasLinkChainMask) { - invalid = 1; - break; - } - cnid = keyp->parentID; - } - FREE(ip, M_TEMP); - return (invalid); -} - - -/* - * update_siblinglinks_callback - update a link's chain - */ - -struct linkupdate_state { - cnid_t filelinkid; - cnid_t prevlinkid; - cnid_t nextlinkid; -}; - -static int -update_siblinglinks_callback(__unused const CatalogKey *ckp, CatalogRecord *crp, struct linkupdate_state *state) -{ - HFSPlusCatalogFile *file; - - if (crp->recordType != kHFSPlusFileRecord) { - printf("hfs: update_siblinglinks_callback: unexpected rec type %d\n", crp->recordType); - return (btNotFound); - } - - file = (struct HFSPlusCatalogFile *)crp; - if (file->flags & kHFSHasLinkChainMask) { - if (state->prevlinkid != HFS_IGNORABLE_LINK) { - file->hl_prevLinkID = state->prevlinkid; - } - if (state->nextlinkid != HFS_IGNORABLE_LINK) { - 
file->hl_nextLinkID = state->nextlinkid; - } - } else { - printf("hfs: update_siblinglinks_callback: file %d isn't a chain\n", file->fileID); - } - return (0); -} - -/* - * cat_update_siblinglinks - update a link's chain - */ -int -cat_update_siblinglinks(struct hfsmount *hfsmp, cnid_t linkfileid, cnid_t prevlinkid, cnid_t nextlinkid) -{ - FCB * fcb; - BTreeIterator * iterator; - struct linkupdate_state state; - int result; - - fcb = hfsmp->hfs_catalog_cp->c_datafork; - state.filelinkid = linkfileid; - state.prevlinkid = prevlinkid; - state.nextlinkid = nextlinkid; - - /* Create an iterator for use by us temporarily */ - MALLOC(iterator, BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK); - bzero(iterator, sizeof(*iterator)); - - result = getkey(hfsmp, linkfileid, (CatalogKey *)&iterator->key); - if (result == 0) { - result = BTUpdateRecord(fcb, iterator, (IterateCallBackProcPtr)update_siblinglinks_callback, &state); - (void) BTFlushPath(fcb); - } else { - printf("hfs: cat_update_siblinglinks: couldn't resolve cnid=%d, vol=%s\n", linkfileid, hfsmp->vcbVN); - } - - FREE (iterator, M_TEMP); - return MacToVFSError(result); -} - -/* - * cat_lookuplink - lookup a link by it's name - */ -int -cat_lookuplink(struct hfsmount *hfsmp, struct cat_desc *descp, cnid_t *linkfileid, cnid_t *prevlinkid, cnid_t *nextlinkid) -{ - FCB * fcb; - BTreeIterator * iterator; - struct FSBufferDescriptor btdata; - struct HFSPlusCatalogFile file; - int result; - - fcb = hfsmp->hfs_catalog_cp->c_datafork; - - /* Create an iterator for use by us temporarily */ - MALLOC(iterator, BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK); - bzero(iterator, sizeof(*iterator)); - - if ((result = buildkey(hfsmp, descp, (HFSPlusCatalogKey *)&iterator->key, 0))) { - goto exit; - } - BDINIT(btdata, &file); - - if ((result = BTSearchRecord(fcb, iterator, &btdata, NULL, NULL))) { - goto exit; - } - if (file.recordType != kHFSPlusFileRecord) { - result = ENOENT; - goto exit; - } - *linkfileid = 
file.fileID; - - if (file.flags & kHFSHasLinkChainMask) { - *prevlinkid = file.hl_prevLinkID; - *nextlinkid = file.hl_nextLinkID; - } else { - *prevlinkid = 0; - *nextlinkid = 0; - } -exit: - FREE(iterator, M_TEMP); - return MacToVFSError(result); -} - - -/* - * cat_lookup_siblinglinks - lookup previous and next link ID for link using its cnid - */ -int -cat_lookup_siblinglinks(struct hfsmount *hfsmp, cnid_t linkfileid, cnid_t *prevlinkid, cnid_t *nextlinkid) -{ - FCB * fcb; - BTreeIterator * iterator; - struct FSBufferDescriptor btdata; - struct HFSPlusCatalogFile file; - int result; - - fcb = hfsmp->hfs_catalog_cp->c_datafork; - - /* Create an iterator for use by us temporarily */ - MALLOC(iterator, BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK); - bzero(iterator, sizeof(*iterator)); - - if ((result = getkey(hfsmp, linkfileid, (CatalogKey *)&iterator->key))) { - goto exit; - } - BDINIT(btdata, &file); - - if ((result = BTSearchRecord(fcb, iterator, &btdata, NULL, NULL))) { - goto exit; - } - /* The prev/next chain is only valid when kHFSHasLinkChainMask is set. 
*/ - if (file.flags & kHFSHasLinkChainMask) { - cnid_t parent; - - parent = ((HFSPlusCatalogKey *)&iterator->key)->parentID; - - /* directory inodes don't have a chain (its in an EA) */ - if (parent == hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid) { - result = ENOLINK; /* signal to caller to get head of list */ - } else if (parent == hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid) { - *prevlinkid = 0; - *nextlinkid = file.hl_firstLinkID; - } else { - *prevlinkid = file.hl_prevLinkID; - *nextlinkid = file.hl_nextLinkID; - } - } else { - *prevlinkid = 0; - *nextlinkid = 0; - } -exit: - FREE(iterator, M_TEMP); - return MacToVFSError(result); -} - - -/* - * cat_lookup_lastlink - find the last sibling link in the chain (no "next" ptr) - */ -int -cat_lookup_lastlink(struct hfsmount *hfsmp, cnid_t linkfileid, - cnid_t *lastlink, struct cat_desc *cdesc) -{ - FCB * fcb; - BTreeIterator * iterator; - struct FSBufferDescriptor btdata; - struct HFSPlusCatalogFile file; - int result; - int itercount = 0; - int foundlast = 0; - cnid_t currentlink = linkfileid; - - fcb = hfsmp->hfs_catalog_cp->c_datafork; - - /* Create an iterator for use by us temporarily */ - MALLOC(iterator, BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK); - - while ((foundlast == 0) && (itercount < HFS_LINK_MAX )) { - itercount++; - bzero(iterator, sizeof(*iterator)); - - if ((result = getkey(hfsmp, currentlink, (CatalogKey *)&iterator->key))) { - goto exit; - } - BDINIT(btdata, &file); - - if ((result = BTSearchRecord(fcb, iterator, &btdata, NULL, NULL))) { - goto exit; - } - - /* The prev/next chain is only valid when kHFSHasLinkChainMask is set. */ - if (file.flags & kHFSHasLinkChainMask) { - cnid_t parent; - - parent = ((HFSPlusCatalogKey *)&iterator->key)->parentID; - /* - * The raw inode for a directory hardlink doesn't have a chain. - * Its link information lives in an EA. 
- */ - if (parent == hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid) { - /* We don't iterate to find the oldest directory hardlink. */ - result = ENOLINK; - goto exit; - } - else if (parent == hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid) { - /* Raw inode for file hardlink (the base inode) */ - currentlink = file.hl_firstLinkID; - - /* - * One minor special-casing here is necessary. - * If our ID brought us to the raw hardlink inode, and it does - * not have any siblings, then it's an open-unlinked file, and we - * should not proceed any further. - */ - if (currentlink == 0) { - result = ENOLINK; - goto exit; - } - } - else { - /* Otherwise, this item's parent is a legitimate directory in the namespace */ - if (file.hl_nextLinkID == 0) { - /* If nextLinkID is 0, then we found the end; no more hardlinks */ - foundlast = 1; - *lastlink = currentlink; - /* - * Since we had to construct a catalog key to do this lookup - * we still hold it in-hand. We might as well use it to build - * the descriptor that the caller asked for. 
- */ - builddesc ((HFSPlusCatalogKey*)&iterator->key, currentlink, 0, 0, 0, cdesc); - break; - } - - currentlink = file.hl_nextLinkID; - } - } - else { - /* Sorry, can't help you without a link chain */ - result = ENOLINK; - goto exit; - } - } -exit: - /* If we didn't find what we were looking for, zero out the args */ - if (foundlast == 0) { - if (cdesc) { - bzero (cdesc, sizeof(struct cat_desc)); - } - if (lastlink) { - *lastlink = 0; - } - } - - FREE(iterator, M_TEMP); - return MacToVFSError(result); -} - - -/* - * cat_createlink - create a link in the catalog - * - * The following cat_attr fields are expected to be set: - * ca_linkref - * ca_itime - * ca_mode (S_IFREG) - * ca_recflags - * ca_flags - * ca_finderinfo (type and creator) - */ -int -cat_createlink(struct hfsmount *hfsmp, struct cat_desc *descp, struct cat_attr *attrp, - cnid_t nextlinkid, cnid_t *linkfileid) -{ - FCB * fcb; - struct btobj * bto; - FSBufferDescriptor btdata; - HFSPlusForkData *rsrcforkp; - u_int32_t nextCNID; - u_int32_t datalen; - u_int32_t encoding; - int thread_inserted = 0; - int alias_allocated = 0; - int result = 0; - int std_hfs; - - std_hfs = (hfsmp->hfs_flags & HFS_STANDARD); - - fcb = hfsmp->hfs_catalog_cp->c_datafork; - - /* - * Get the next CNID. Note that we are currently holding catalog lock. - */ - result = cat_acquire_cnid(hfsmp, &nextCNID); - if (result) { - return result; - } - - /* Get space for iterator, key and data */ - MALLOC(bto, struct btobj *, sizeof(struct btobj), M_TEMP, M_WAITOK); - bto->iterator.hint.nodeNum = 0; - rsrcforkp = &bto->data.hfsPlusFile.resourceFork; - - result = buildkey(hfsmp, descp, &bto->key, 0); - if (result) { - printf("hfs: cat_createlink: err %d from buildkey\n", result); - goto exit; - } - - /* This is our only chance to set the encoding (other than a rename). */ - encoding = hfs_pickencoding(bto->key.nodeName.unicode, bto->key.nodeName.length); - - /* - * Insert the thread record first. 
- */ - datalen = buildthread((void*)&bto->key, &bto->data, 0, 0); - btdata.bufferAddress = &bto->data; - btdata.itemSize = datalen; - btdata.itemCount = 1; - - buildthreadkey(nextCNID, 0, (CatalogKey *) &bto->iterator.key); - result = BTInsertRecord(fcb, &bto->iterator, &btdata, datalen); - if (result) { - goto exit; - } - thread_inserted = 1; - - /* - * Now insert the link record. - */ - buildrecord(attrp, nextCNID, 0, encoding, &bto->data, &datalen); - - bto->data.hfsPlusFile.hl_prevLinkID = 0; - bto->data.hfsPlusFile.hl_nextLinkID = nextlinkid; - bto->data.hfsPlusFile.hl_linkReference = attrp->ca_linkref; - - /* For directory hard links, create alias in resource fork */ - if (descp->cd_flags & CD_ISDIR) { - if ((result = cat_makealias(hfsmp, attrp->ca_linkref, &bto->data.hfsPlusFile))) { - goto exit; - } - alias_allocated = 1; - } - btdata.bufferAddress = &bto->data; - btdata.itemSize = datalen; - btdata.itemCount = 1; - - bcopy(&bto->key, &bto->iterator.key, sizeof(bto->key)); - - result = BTInsertRecord(fcb, &bto->iterator, &btdata, datalen); - if (result) { - if (result == btExists) - result = EEXIST; - goto exit; - } - if (linkfileid != NULL) { - *linkfileid = nextCNID; - } -exit: - if (result) { - if (thread_inserted) { - printf("hfs: cat_createlink: BTInsertRecord err=%d, vol=%s\n", MacToVFSError(result), hfsmp->vcbVN); - - buildthreadkey(nextCNID, 0, (CatalogKey *)&bto->iterator.key); - if (BTDeleteRecord(fcb, &bto->iterator)) { - printf("hfs: cat_createlink() failed to delete thread record on volume %s\n", hfsmp->vcbVN); - hfs_mark_inconsistent(hfsmp, HFS_ROLLBACK_FAILED); - } - } - if (alias_allocated && rsrcforkp->extents[0].startBlock != 0) { - (void) BlockDeallocate(hfsmp, rsrcforkp->extents[0].startBlock, - rsrcforkp->extents[0].blockCount, 0); - rsrcforkp->extents[0].startBlock = 0; - rsrcforkp->extents[0].blockCount = 0; - } - } - (void) BTFlushPath(fcb); - FREE(bto, M_TEMP); - - return MacToVFSError(result); -} - -/* Directory hard links are 
visible as aliases on pre-Leopard systems and - * as normal directories on Leopard or later. All directory hard link aliases - * have the same resource fork content except for the three uniquely - * identifying values that are updated in the resource fork data when the alias - * is created. The following array is the constant resource fork data used - * only for creating directory hard link aliases. - */ -static const char hfs_dirlink_alias_rsrc[] = { - 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x9e, 0x00, 0x00, 0x00, 0x9e, 0x00, 0x00, 0x00, 0x32, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x9a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9a, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x48, 0x2b, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x01, 0x9e, 0x00, 0x00, 0x00, 0x9e, 0x00, 0x00, 0x00, 0x32, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x32, 0x00, 0x00, 0x61, 0x6c, 0x69, 0x73, - 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 -}; - -/* Constants for directory hard link alias */ -enum { - /* Size of resource fork data array for directory hard link alias */ - kHFSAliasSize = 0x1d0, - - /* Volume type for ejectable devices like disk image */ - kHFSAliasVolTypeEjectable = 0x5, - - /* Offset for volume create date, in Mac OS local time */ - kHFSAliasVolCreateDateOffset = 0x12a, - - /* Offset for the type of volume */ - kHFSAliasVolTypeOffset = 0x130, - - /* Offset for folder ID of the parent directory of the directory inode */ - kHFSAliasParentIDOffset = 0x132, - - /* Offset for folder ID of the directory inode */ - kHFSAliasTargetIDOffset = 0x176, -}; - -/* Create and write an alias that points at the directory 
represented by given - * inode number on the same volume. Directory hard links are visible as - * aliases in pre-Leopard systems and this function creates these aliases. - * - * Note: This code is very specific to creating alias for the purpose - * of directory hard links only, and should not be generalized. - */ -static int -cat_makealias(struct hfsmount *hfsmp, u_int32_t inode_num, struct HFSPlusCatalogFile *crp) -{ - struct buf *bp; - daddr64_t blkno; - u_int32_t blkcount; - int blksize; - int sectorsize; - int result; - HFSPlusForkData *rsrcforkp; - char *alias; - uint32_t *valptr; - - rsrcforkp = &(crp->resourceFork); - - blksize = hfsmp->blockSize; - blkcount = howmany(kHFSAliasSize, blksize); - sectorsize = hfsmp->hfs_logical_block_size; - bzero(rsrcforkp, sizeof(HFSPlusForkData)); - - /* Allocate some disk space for the alias content. */ - result = BlockAllocate(hfsmp, 0, blkcount, blkcount, - HFS_ALLOC_FORCECONTIG | HFS_ALLOC_METAZONE, - &rsrcforkp->extents[0].startBlock, - &rsrcforkp->extents[0].blockCount); - /* Did it fail with an out of space error? If so, re-try and allow journal flushing. */ - if (result == dskFulErr ) { - result = BlockAllocate(hfsmp, 0, blkcount, blkcount, - HFS_ALLOC_FORCECONTIG | HFS_ALLOC_METAZONE | HFS_ALLOC_FLUSHTXN, - &rsrcforkp->extents[0].startBlock, - &rsrcforkp->extents[0].blockCount); - } - if (result) { - rsrcforkp->extents[0].startBlock = 0; - goto exit; - } - - /* Acquire a buffer cache block for our block. 
*/ - blkno = ((u_int64_t)rsrcforkp->extents[0].startBlock * (u_int64_t)blksize) / sectorsize; - blkno += hfsmp->hfsPlusIOPosOffset / sectorsize; - - bp = buf_getblk(hfsmp->hfs_devvp, blkno, roundup(kHFSAliasSize, hfsmp->hfs_logical_block_size), 0, 0, BLK_META); - if (hfsmp->jnl) { - journal_modify_block_start(hfsmp->jnl, bp); - } - - /* Generate alias content */ - alias = (char *)buf_dataptr(bp); - bzero(alias, buf_size(bp)); - bcopy(hfs_dirlink_alias_rsrc, alias, kHFSAliasSize); - - /* Set the volume create date, local time in Mac OS format */ - valptr = (uint32_t *)(alias + kHFSAliasVolCreateDateOffset); - *valptr = OSSwapHostToBigInt32(hfsmp->localCreateDate); - - /* If the file system is on a virtual device like disk image, - * update the volume type to be ejectable device. - */ - if (hfsmp->hfs_flags & HFS_VIRTUAL_DEVICE) { - *(uint16_t *)(alias + kHFSAliasVolTypeOffset) = - OSSwapHostToBigInt16(kHFSAliasVolTypeEjectable); - } - - /* Set id of the parent of the target directory */ - valptr = (uint32_t *)(alias + kHFSAliasParentIDOffset); - *valptr = OSSwapHostToBigInt32(hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid); - - /* Set id of the target directory */ - valptr = (uint32_t *)(alias + kHFSAliasTargetIDOffset); - *valptr = OSSwapHostToBigInt32(inode_num); - - /* Write alias content to disk. */ - if (hfsmp->jnl) { - journal_modify_block_end(hfsmp->jnl, bp, NULL, NULL); - } else if ((result = buf_bwrite(bp))) { - goto exit; - } - - /* Finish initializing the fork data. 
*/ - rsrcforkp->logicalSize = kHFSAliasSize; - rsrcforkp->totalBlocks = rsrcforkp->extents[0].blockCount; - -exit: - if (result && rsrcforkp->extents[0].startBlock != 0) { - (void) BlockDeallocate(hfsmp, rsrcforkp->extents[0].startBlock, rsrcforkp->extents[0].blockCount, 0); - rsrcforkp->extents[0].startBlock = 0; - rsrcforkp->extents[0].blockCount = 0; - rsrcforkp->logicalSize = 0; - rsrcforkp->totalBlocks = 0; - } - return (result); -} - -/* - * cat_deletelink - delete a link from the catalog - */ -int -cat_deletelink(struct hfsmount *hfsmp, struct cat_desc *descp) -{ - struct HFSPlusCatalogFile file; - struct cat_attr cattr; - uint32_t totalBlocks; - int i; - int result; - - bzero(&file, sizeof (file)); - bzero(&cattr, sizeof (cattr)); - cattr.ca_fileid = descp->cd_cnid; - - /* Directory links have alias content to remove. */ - if (descp->cd_flags & CD_ISDIR) { - FCB * fcb; - BTreeIterator * iterator; - struct FSBufferDescriptor btdata; - - fcb = hfsmp->hfs_catalog_cp->c_datafork; - - /* Borrow the btcb iterator since we have an exclusive catalog lock. 
*/ - iterator = &((BTreeControlBlockPtr)(fcb->ff_sysfileinfo))->iterator; - iterator->hint.nodeNum = 0; - - if ((result = buildkey(hfsmp, descp, (HFSPlusCatalogKey *)&iterator->key, 0))) { - goto exit; - } - BDINIT(btdata, &file); - - if ((result = BTSearchRecord(fcb, iterator, &btdata, NULL, NULL))) { - goto exit; - } - } - - result = cat_delete(hfsmp, descp, &cattr); - - if ((result == 0) && - (descp->cd_flags & CD_ISDIR) && - (file.recordType == kHFSPlusFileRecord)) { - - totalBlocks = file.resourceFork.totalBlocks; - - for (i = 0; (i < 8) && (totalBlocks > 0); i++) { - if ((file.resourceFork.extents[i].blockCount == 0) && - (file.resourceFork.extents[i].startBlock == 0)) { - break; - } - - (void) BlockDeallocate(hfsmp, - file.resourceFork.extents[i].startBlock, - file.resourceFork.extents[i].blockCount, 0); - - totalBlocks -= file.resourceFork.extents[i].blockCount; - file.resourceFork.extents[i].startBlock = 0; - file.resourceFork.extents[i].blockCount = 0; - } - } -exit: - return (result); -} - - -/* - * Callback to collect directory entries. - * Called with readattr_state for each item in a directory. - */ -struct readattr_state { - struct hfsmount *hfsmp; - struct cat_entrylist *list; - cnid_t dir_cnid; - int stdhfs; - int error; - int reached_eof; -}; - -static int -getentriesattr_callback(const CatalogKey *key, const CatalogRecord *rec, - struct readattr_state *state) -{ - struct cat_entrylist *list = state->list; - struct hfsmount *hfsmp = state->hfsmp; - struct cat_entry *cep; - cnid_t parentcnid; - - if (list->realentries >= list->maxentries) - return (0); /* stop */ - - parentcnid = state->stdhfs ? 
key->hfs.parentID : key->hfsPlus.parentID; - - switch(rec->recordType) { - case kHFSPlusFolderRecord: - case kHFSPlusFileRecord: -#if CONFIG_HFS_STD - case kHFSFolderRecord: - case kHFSFileRecord: -#endif - if (parentcnid != state->dir_cnid) { - state->error = ENOENT; - state->reached_eof = 1; - return (0); /* stop */ - } - break; - default: - state->error = ENOENT; - return (0); /* stop */ - } - - /* Hide the private system directories and journal files */ - if (parentcnid == kHFSRootFolderID) { - if (rec->recordType == kHFSPlusFolderRecord) { - if (rec->hfsPlusFolder.folderID == hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid || - rec->hfsPlusFolder.folderID == hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid) { - list->skipentries++; - return (1); /* continue */ - } - } - if ((hfsmp->jnl || ((HFSTOVCB(hfsmp)->vcbAtrb & kHFSVolumeJournaledMask) && (hfsmp->hfs_flags & HFS_READ_ONLY))) && - (rec->recordType == kHFSPlusFileRecord) && - ((rec->hfsPlusFile.fileID == hfsmp->hfs_jnlfileid) || - (rec->hfsPlusFile.fileID == hfsmp->hfs_jnlinfoblkid))) { - list->skipentries++; - return (1); /* continue */ - } - } - - cep = &list->entry[list->realentries++]; - - if (state->stdhfs == 0) { - getbsdattr(hfsmp, (const struct HFSPlusCatalogFile *)rec, &cep->ce_attr); - builddesc((const HFSPlusCatalogKey *)key, getcnid(rec), 0, getencoding(rec), - isadir(rec), &cep->ce_desc); - - if (rec->recordType == kHFSPlusFileRecord) { - cep->ce_datasize = rec->hfsPlusFile.dataFork.logicalSize; - cep->ce_datablks = rec->hfsPlusFile.dataFork.totalBlocks; - cep->ce_rsrcsize = rec->hfsPlusFile.resourceFork.logicalSize; - cep->ce_rsrcblks = rec->hfsPlusFile.resourceFork.totalBlocks; - - /* Save link reference for later processing. 
*/ - if ((SWAP_BE32(rec->hfsPlusFile.userInfo.fdType) == kHardLinkFileType) && - (SWAP_BE32(rec->hfsPlusFile.userInfo.fdCreator) == kHFSPlusCreator)) { - cep->ce_attr.ca_linkref = rec->hfsPlusFile.bsdInfo.special.iNodeNum; - } else if ((rec->hfsPlusFile.flags & kHFSHasLinkChainMask) && - (SWAP_BE32(rec->hfsPlusFile.userInfo.fdType) == kHFSAliasType) && - (SWAP_BE32(rec->hfsPlusFile.userInfo.fdCreator) == kHFSAliasCreator)) { - cep->ce_attr.ca_linkref = rec->hfsPlusFile.bsdInfo.special.iNodeNum; - } - } - } -#if CONFIG_HFS_STD - else { - struct HFSPlusCatalogFile cnoderec; - HFSPlusCatalogKey * pluskey; - u_int32_t encoding; - - promoteattr(hfsmp, rec, &cnoderec); - getbsdattr(hfsmp, &cnoderec, &cep->ce_attr); - - MALLOC(pluskey, HFSPlusCatalogKey *, sizeof(HFSPlusCatalogKey), M_TEMP, M_WAITOK); - promotekey(hfsmp, (const HFSCatalogKey *)key, pluskey, &encoding); - builddesc(pluskey, getcnid(rec), 0, encoding, isadir(rec), &cep->ce_desc); - FREE(pluskey, M_TEMP); - - if (rec->recordType == kHFSFileRecord) { - int blksize = HFSTOVCB(hfsmp)->blockSize; - - cep->ce_datasize = rec->hfsFile.dataLogicalSize; - cep->ce_datablks = rec->hfsFile.dataPhysicalSize / blksize; - cep->ce_rsrcsize = rec->hfsFile.rsrcLogicalSize; - cep->ce_rsrcblks = rec->hfsFile.rsrcPhysicalSize / blksize; - } - } -#endif - - return (list->realentries < list->maxentries); -} - -/* - * Pack a cat_entrylist buffer with attributes from the catalog - * - * Note: index is zero relative - */ -int -cat_getentriesattr(struct hfsmount *hfsmp, directoryhint_t *dirhint, struct cat_entrylist *ce_list, int *reachedeof) -{ - FCB* fcb; - CatalogKey * key; - BTreeIterator * iterator; - struct readattr_state state; - cnid_t parentcnid; - int i; - int std_hfs; - int index; - int have_key; - int result = 0; - int reached_eof = 0; - - ce_list->realentries = 0; - - fcb = GetFileControlBlock(HFSTOVCB(hfsmp)->catalogRefNum); - std_hfs = (HFSTOVCB(hfsmp)->vcbSigWord == kHFSSigWord); - parentcnid = 
dirhint->dh_desc.cd_parentcnid; - - bzero (&state, sizeof(struct readattr_state)); - - state.hfsmp = hfsmp; - state.list = ce_list; - state.dir_cnid = parentcnid; - state.stdhfs = std_hfs; - state.error = 0; - - MALLOC(iterator, BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK); - bzero(iterator, sizeof(*iterator)); - key = (CatalogKey *)&iterator->key; - have_key = 0; - iterator->hint.nodeNum = dirhint->dh_desc.cd_hint; - index = dirhint->dh_index + 1; - - /* - * Attempt to build a key from cached filename - */ - if (dirhint->dh_desc.cd_namelen != 0) { - if (buildkey(hfsmp, &dirhint->dh_desc, (HFSPlusCatalogKey *)key, 0) == 0) { - have_key = 1; - } - } - - /* - * If the last entry wasn't cached then position the btree iterator - */ - if ((index == 0) || !have_key) { - /* - * Position the iterator at the directory's thread record. - * (i.e. just before the first entry) - */ - buildthreadkey(dirhint->dh_desc.cd_parentcnid, (hfsmp->hfs_flags & HFS_STANDARD), key); - result = BTSearchRecord(fcb, iterator, NULL, NULL, iterator); - if (result) { - result = MacToVFSError(result); - goto exit; - } - - /* - * Iterate until we reach the entry just - * before the one we want to start with. - */ - if (index > 0) { - struct position_state ps; - - ps.error = 0; - ps.count = 0; - ps.index = index; - ps.parentID = dirhint->dh_desc.cd_parentcnid; - ps.hfsmp = hfsmp; - - result = BTIterateRecords(fcb, kBTreeNextRecord, iterator, - (IterateCallBackProcPtr)cat_findposition, &ps); - if (ps.error) - result = ps.error; - else - result = MacToVFSError(result); - - if (result) { - /* - * Note: the index may now point to EOF if the directory - * was modified in between system calls. We will return - * ENOENT from cat_findposition if this is the case, and - * when we bail out with an error, our caller (hfs_readdirattr_internal) - * will suppress the error and indicate EOF to its caller. 
- */ - result = MacToVFSError(result); - goto exit; - } - } - } - - /* Fill list with entries starting at iterator->key. */ - result = BTIterateRecords(fcb, kBTreeNextRecord, iterator, - (IterateCallBackProcPtr)getentriesattr_callback, &state); - - if (state.error) { - result = state.error; - reached_eof = state.reached_eof; - } - else if (ce_list->realentries == 0) { - result = ENOENT; - reached_eof = 1; - } - else { - result = MacToVFSError(result); - } - - if (std_hfs) - goto exit; - - /* - * Resolve any hard links. - */ - for (i = 0; i < (int)ce_list->realentries; ++i) { - struct FndrFileInfo *fip; - struct cat_entry *cep; - struct HFSPlusCatalogFile filerec; - int isdirlink = 0; - int isfilelink = 0; - - cep = &ce_list->entry[i]; - if (cep->ce_attr.ca_linkref == 0) - continue; - - /* Note: Finder info is still in Big Endian */ - fip = (struct FndrFileInfo *)&cep->ce_attr.ca_finderinfo; - - if (S_ISREG(cep->ce_attr.ca_mode) && - (SWAP_BE32(fip->fdType) == kHardLinkFileType) && - (SWAP_BE32(fip->fdCreator) == kHFSPlusCreator)) { - isfilelink = 1; - } - if (S_ISREG(cep->ce_attr.ca_mode) && - (SWAP_BE32(fip->fdType) == kHFSAliasType) && - (SWAP_BE32(fip->fdCreator) == kHFSAliasCreator) && - (cep->ce_attr.ca_recflags & kHFSHasLinkChainMask)) { - isdirlink = 1; - } - if (isfilelink || isdirlink) { - if (cat_resolvelink(hfsmp, cep->ce_attr.ca_linkref, isdirlink, &filerec) != 0) - continue; - /* Repack entry from inode record. */ - getbsdattr(hfsmp, &filerec, &cep->ce_attr); - cep->ce_datasize = filerec.dataFork.logicalSize; - cep->ce_datablks = filerec.dataFork.totalBlocks; - cep->ce_rsrcsize = filerec.resourceFork.logicalSize; - cep->ce_rsrcblks = filerec.resourceFork.totalBlocks; - } - } - -exit: - FREE(iterator, M_TEMP); - *reachedeof = reached_eof; - return MacToVFSError(result); -} - -#define SMALL_DIRENTRY_SIZE (int)(sizeof(struct dirent) - (MAXNAMLEN + 1) + 8) -#define MAX_LINKINFO_ENTRIES 3000 - -/* - * Callback to pack directory entries. 
- * Called with packdirentry_state for each item in a directory. - */ - -/* Hard link information collected during cat_getdirentries. */ -struct linkinfo { - u_int32_t link_ref; - user_addr_t dirent_addr; -}; -typedef struct linkinfo linkinfo_t; - -/* State information for the getdirentries_callback function. */ -struct packdirentry_state { - int cbs_flags; /* VNODE_READDIR_* flags */ - u_int32_t cbs_parentID; - u_int32_t cbs_index; - uio_t cbs_uio; - ExtendedVCB * cbs_hfsmp; - int cbs_result; - int32_t cbs_nlinks; - int32_t cbs_maxlinks; - linkinfo_t * cbs_linkinfo; - struct cat_desc * cbs_desc; - u_int8_t * cbs_namebuf; - /* - * The following fields are only used for NFS readdir, which - * uses the next file id as the seek offset of each entry. - */ - struct direntry * cbs_direntry; - struct direntry * cbs_prevdirentry; - u_int32_t cbs_previlinkref; - Boolean cbs_hasprevdirentry; - Boolean cbs_eof; -}; - -/* - * getdirentries callback for HFS Plus directories. - */ -static int -getdirentries_callback(const CatalogKey *ckp, const CatalogRecord *crp, - struct packdirentry_state *state) -{ - struct hfsmount *hfsmp; - const CatalogName *cnp; - cnid_t curID; - OSErr result; - struct dirent catent; - struct direntry * entry = NULL; - time_t itime; - u_int32_t ilinkref = 0; - u_int32_t curlinkref = 0; - cnid_t cnid; - int hide = 0; - u_int8_t type = DT_UNKNOWN; - u_int8_t is_mangled = 0; - u_int8_t is_link = 0; - u_int8_t *nameptr; - user_addr_t uiobase = USER_ADDR_NULL; - size_t namelen = 0; - size_t maxnamelen; - size_t uiosize = 0; - caddr_t uioaddr; - Boolean stop_after_pack = false; - - hfsmp = state->cbs_hfsmp; - curID = ckp->hfsPlus.parentID; - - /* We're done when parent directory changes */ - if (state->cbs_parentID != curID) { - /* - * If the parent ID is different from curID this means we've hit - * the EOF for the directory. To help future callers, we mark - * the cbs_eof boolean. 
However, we should only mark the EOF - * boolean if we're about to return from this function. - * - * This is because this callback function does its own uiomove - * to get the data to userspace. If we set the boolean before determining - * whether or not the current entry has enough room to write its - * data to userland, we could fool the callers of this catalog function - * into thinking they've hit EOF earlier than they really would have. - * In that case, we'd know that we have more entries to process and - * send to userland, but we didn't have enough room. - * - * To be safe, we mark cbs_eof here ONLY for the cases where we know we're - * about to return and won't write any new data back - * to userland. In the stop_after_pack case, we'll set this boolean - * regardless, so it's slightly safer to let that logic mark the boolean, - * especially since it's closer to the return of this function. - */ - - if (state->cbs_flags & VNODE_READDIR_EXTENDED) { - /* The last record has not been returned yet, so we - * want to stop after packing the last item - */ - if (state->cbs_hasprevdirentry) { - stop_after_pack = true; - } else { - state->cbs_eof = true; - state->cbs_result = ENOENT; - return (0); /* stop */ - } - } else { - state->cbs_eof = true; - state->cbs_result = ENOENT; - return (0); /* stop */ - } - } - - if (state->cbs_flags & VNODE_READDIR_EXTENDED) { - entry = state->cbs_direntry; - nameptr = (u_int8_t *)&entry->d_name[0]; - if (state->cbs_flags & VNODE_READDIR_NAMEMAX) { - /* - * The NFS server sometimes needs to make filenames fit in - * NAME_MAX bytes (since its client may not be able to - * handle a longer name). In that case, NFS will ask us - * to mangle the name to keep it short enough. 
- */ - maxnamelen = NAME_MAX + 1; - } else { - maxnamelen = sizeof(entry->d_name); - } - } else { - nameptr = (u_int8_t *)&catent.d_name[0]; - maxnamelen = sizeof(catent.d_name); - } - - if ((state->cbs_flags & VNODE_READDIR_EXTENDED) && stop_after_pack) { - /* The last item returns a non-zero invalid cookie */ - cnid = INT_MAX; - } else { - switch(crp->recordType) { - case kHFSPlusFolderRecord: - type = DT_DIR; - cnid = crp->hfsPlusFolder.folderID; - /* Hide our private system directories. */ - if (curID == kHFSRootFolderID) { - if (cnid == hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid || - cnid == hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid) { - hide = 1; - } - } - break; - case kHFSPlusFileRecord: - itime = to_bsd_time(crp->hfsPlusFile.createDate); - type = MODE_TO_DT(crp->hfsPlusFile.bsdInfo.fileMode); - cnid = crp->hfsPlusFile.fileID; - /* - * When a hardlink link is encountered save its link ref. - */ - if ((SWAP_BE32(crp->hfsPlusFile.userInfo.fdType) == kHardLinkFileType) && - (SWAP_BE32(crp->hfsPlusFile.userInfo.fdCreator) == kHFSPlusCreator) && - ((itime == (time_t)hfsmp->hfs_itime) || - (itime == (time_t)hfsmp->hfs_metadata_createdate))) { - /* If link ref is inode's file id then use it directly. */ - if (crp->hfsPlusFile.flags & kHFSHasLinkChainMask) { - cnid = crp->hfsPlusFile.hl_linkReference; - } else { - ilinkref = crp->hfsPlusFile.hl_linkReference; - } - is_link =1; - } else if ((SWAP_BE32(crp->hfsPlusFile.userInfo.fdType) == kHFSAliasType) && - (SWAP_BE32(crp->hfsPlusFile.userInfo.fdCreator) == kHFSAliasCreator) && - (crp->hfsPlusFile.flags & kHFSHasLinkChainMask) && - (crp->hfsPlusFile.hl_linkReference >= kHFSFirstUserCatalogNodeID) && - ((itime == (time_t)hfsmp->hfs_itime) || - (itime == (time_t)hfsmp->hfs_metadata_createdate))) { - /* A directory's link resolves to a directory. */ - type = DT_DIR; - /* A directory's link ref is always inode's file id. 
*/ - cnid = crp->hfsPlusFile.hl_linkReference; - is_link = 1; - } - /* Hide the journal files */ - if ((curID == kHFSRootFolderID) && - ((hfsmp->jnl || ((HFSTOVCB(hfsmp)->vcbAtrb & kHFSVolumeJournaledMask) && (hfsmp->hfs_flags & HFS_READ_ONLY)))) && - ((cnid == hfsmp->hfs_jnlfileid) || - (cnid == hfsmp->hfs_jnlinfoblkid))) { - hide = 1; - } - break; - default: - return (0); /* stop */ - }; - - cnp = (const CatalogName*) &ckp->hfsPlus.nodeName; - - namelen = cnp->ustr.length; - /* - * For MacRoman encoded names, assume that its ascii and - * convert it directly in an attempt to avoid the more - * expensive utf8_encodestr conversion. - */ - if ((namelen < maxnamelen) && (crp->hfsPlusFile.textEncoding == 0)) { - int i; - u_int16_t ch; - const u_int16_t *chp; - - chp = &cnp->ustr.unicode[0]; - for (i = 0; i < (int)namelen; ++i) { - ch = *chp++; - if (ch > 0x007f || ch == 0x0000) { - /* Perform expensive utf8_encodestr conversion */ - goto encodestr; - } - nameptr[i] = (ch == '/') ? ':' : (u_int8_t)ch; - } - nameptr[namelen] = '\0'; - result = 0; - } else { -encodestr: - result = utf8_encodestr(cnp->ustr.unicode, namelen * sizeof(UniChar), - nameptr, &namelen, maxnamelen, ':', 0); - } - - /* Check result returned from encoding the filename to utf8 */ - if (result == ENAMETOOLONG) { - /* - * If we were looking at a catalog record for a hardlink (not the inode), - * then we want to use its link ID as opposed to the inode ID for - * a mangled name. For all other cases, they are the same. Note that - * due to the way directory hardlinks are implemented, the actual link - * is going to be counted as a file record, so we can catch both - * with is_link. 
- */ - cnid_t linkid = cnid; - if (is_link) { - linkid = crp->hfsPlusFile.fileID; - } - - result = ConvertUnicodeToUTF8Mangled(cnp->ustr.length * sizeof(UniChar), - cnp->ustr.unicode, maxnamelen, - (ByteCount*)&namelen, nameptr, linkid); - is_mangled = 1; - } - } - - if (state->cbs_flags & VNODE_READDIR_EXTENDED) { - /* - * The index is 1 relative and includes "." and ".." - * - * Also stuff the cnid in the upper 32 bits of the cookie. - * The cookie is stored to the previous entry, which will - * be packed and copied this time - */ - state->cbs_prevdirentry->d_seekoff = (state->cbs_index + 3) | ((u_int64_t)cnid << 32); - uiosize = state->cbs_prevdirentry->d_reclen; - uioaddr = (caddr_t) state->cbs_prevdirentry; - } else { - catent.d_type = type; - catent.d_namlen = namelen; - catent.d_reclen = uiosize = STD_DIRENT_LEN(namelen); - if (hide) - catent.d_fileno = 0; /* file number = 0 means skip entry */ - else - catent.d_fileno = cnid; - uioaddr = (caddr_t) &catent; - } - - /* Save current base address for post processing of hard-links. 
*/ - if (ilinkref || state->cbs_previlinkref) { - uiobase = uio_curriovbase(state->cbs_uio); - } - /* If this entry won't fit then we're done */ - if ((uiosize > (user_size_t)uio_resid(state->cbs_uio)) || - (ilinkref != 0 && state->cbs_nlinks == state->cbs_maxlinks)) { - return (0); /* stop */ - } - - if (!(state->cbs_flags & VNODE_READDIR_EXTENDED) || state->cbs_hasprevdirentry) { - state->cbs_result = uiomove(uioaddr, uiosize, state->cbs_uio); - if (state->cbs_result == 0) { - ++state->cbs_index; - - /* Remember previous entry */ - state->cbs_desc->cd_cnid = cnid; - if (type == DT_DIR) { - state->cbs_desc->cd_flags |= CD_ISDIR; - } else { - state->cbs_desc->cd_flags &= ~CD_ISDIR; - } - if (state->cbs_desc->cd_nameptr != NULL) { - state->cbs_desc->cd_namelen = 0; - } -#if 0 - state->cbs_desc->cd_encoding = xxxx; -#endif - if (!is_mangled) { - state->cbs_desc->cd_namelen = namelen; - bcopy(nameptr, state->cbs_namebuf, namelen + 1); - } else { - /* Store unmangled name for the directory hint else it will - * restart readdir at the last location again - */ - u_int8_t *new_nameptr; - size_t bufsize; - size_t tmp_namelen = 0; - - cnp = (const CatalogName *)&ckp->hfsPlus.nodeName; - bufsize = 1 + utf8_encodelen(cnp->ustr.unicode, - cnp->ustr.length * sizeof(UniChar), - ':', 0); - MALLOC(new_nameptr, u_int8_t *, bufsize, M_TEMP, M_WAITOK); - result = utf8_encodestr(cnp->ustr.unicode, - cnp->ustr.length * sizeof(UniChar), - new_nameptr, &tmp_namelen, bufsize, ':', 0); - - state->cbs_desc->cd_namelen = tmp_namelen; - bcopy(new_nameptr, state->cbs_namebuf, tmp_namelen + 1); - - FREE(new_nameptr, M_TEMP); - } - } - if (state->cbs_hasprevdirentry) { - curlinkref = ilinkref; /* save current */ - ilinkref = state->cbs_previlinkref; /* use previous */ - } - /* - * Record any hard links for post processing. 
- */ - if ((ilinkref != 0) && - (state->cbs_result == 0) && - (state->cbs_nlinks < state->cbs_maxlinks)) { - state->cbs_linkinfo[state->cbs_nlinks].dirent_addr = uiobase; - state->cbs_linkinfo[state->cbs_nlinks].link_ref = ilinkref; - state->cbs_nlinks++; - } - if (state->cbs_hasprevdirentry) { - ilinkref = curlinkref; /* restore current */ - } - } - - /* Fill the direntry to be used the next time */ - if (state->cbs_flags & VNODE_READDIR_EXTENDED) { - if (stop_after_pack) { - state->cbs_eof = true; - return (0); /* stop */ - } - entry->d_type = type; - entry->d_namlen = namelen; - entry->d_reclen = EXT_DIRENT_LEN(namelen); - if (hide) { - /* File number = 0 means skip entry */ - entry->d_fileno = 0; - } else { - entry->d_fileno = cnid; - } - /* swap the current and previous entry */ - struct direntry * tmp; - tmp = state->cbs_direntry; - state->cbs_direntry = state->cbs_prevdirentry; - state->cbs_prevdirentry = tmp; - state->cbs_hasprevdirentry = true; - state->cbs_previlinkref = ilinkref; - } - - /* Continue iteration if there's room */ - return (state->cbs_result == 0 && - uio_resid(state->cbs_uio) >= SMALL_DIRENTRY_SIZE); -} - -#if CONFIG_HFS_STD -/* - * getdirentries callback for standard HFS (non HFS+) directories. 
- */ -static int -getdirentries_std_callback(const CatalogKey *ckp, const CatalogRecord *crp, - struct packdirentry_state *state) -{ - struct hfsmount *hfsmp; - const CatalogName *cnp; - cnid_t curID; - OSErr result; - struct dirent catent; - cnid_t cnid; - u_int8_t type = DT_UNKNOWN; - u_int8_t *nameptr; - size_t namelen = 0; - size_t maxnamelen; - size_t uiosize = 0; - caddr_t uioaddr; - - hfsmp = state->cbs_hfsmp; - - curID = ckp->hfs.parentID; - - /* We're done when parent directory changes */ - if (state->cbs_parentID != curID) { - state->cbs_result = ENOENT; - return (0); /* stop */ - } - - nameptr = (u_int8_t *)&catent.d_name[0]; - maxnamelen = sizeof(catent.d_name); - - switch(crp->recordType) { - case kHFSFolderRecord: - type = DT_DIR; - cnid = crp->hfsFolder.folderID; - break; - case kHFSFileRecord: - type = DT_REG; - cnid = crp->hfsFile.fileID; - break; - default: - return (0); /* stop */ - }; - - cnp = (const CatalogName*) ckp->hfs.nodeName; - result = hfs_to_utf8(hfsmp, cnp->pstr, maxnamelen, (ByteCount *)&namelen, nameptr); - /* - * When an HFS name cannot be encoded with the current - * volume encoding we use MacRoman as a fallback. 
- */ - if (result) { - result = mac_roman_to_utf8(cnp->pstr, maxnamelen, (ByteCount *)&namelen, nameptr); - } - catent.d_type = type; - catent.d_namlen = namelen; - catent.d_reclen = uiosize = STD_DIRENT_LEN(namelen); - catent.d_fileno = cnid; - uioaddr = (caddr_t) &catent; - - /* If this entry won't fit then we're done */ - if (uiosize > (user_size_t)uio_resid(state->cbs_uio)) { - return (0); /* stop */ - } - - state->cbs_result = uiomove(uioaddr, uiosize, state->cbs_uio); - if (state->cbs_result == 0) { - ++state->cbs_index; - - /* Remember previous entry */ - state->cbs_desc->cd_cnid = cnid; - if (type == DT_DIR) { - state->cbs_desc->cd_flags |= CD_ISDIR; - } else { - state->cbs_desc->cd_flags &= ~CD_ISDIR; - } - if (state->cbs_desc->cd_nameptr != NULL) { - state->cbs_desc->cd_namelen = 0; - } - state->cbs_desc->cd_namelen = namelen; - bcopy(nameptr, state->cbs_namebuf, namelen + 1); - } - - /* Continue iteration if there's room */ - return (state->cbs_result == 0 && uio_resid(state->cbs_uio) >= SMALL_DIRENTRY_SIZE); -} -#endif - -/* - * Pack a uio buffer with directory entries from the catalog - */ -int -cat_getdirentries(struct hfsmount *hfsmp, u_int32_t entrycnt, directoryhint_t *dirhint, - uio_t uio, int flags, int * items, int * eofflag) -{ - FCB* fcb; - BTreeIterator * iterator; - CatalogKey * key; - struct packdirentry_state state; - void * buffer; - int bufsize; - - int maxlinks; - int result; - int index; - int have_key; - int extended; - - extended = flags & VNODE_READDIR_EXTENDED; - - if (extended && (hfsmp->hfs_flags & HFS_STANDARD)) { - return (ENOTSUP); - } - fcb = hfsmp->hfs_catalog_cp->c_datafork; - - /* - * Get a buffer for link info array, btree iterator and a direntry. - * - * We impose an cap of 3000 link entries when trying to compute - * the total number of hardlink entries that we'll allow in the - * linkinfo array. 
- * - * Note that in the case where there are very few hardlinks, - * this does not restrict or prevent us from vending out as many entries - * as we can to the uio_resid, because the getdirentries callback - * uiomoves the directory entries to the uio itself and does not use - * this MALLOC'd array. It also limits itself to maxlinks of hardlinks. - */ - - /* Now compute the maximum link array size */ - maxlinks = MIN (entrycnt, MAX_LINKINFO_ENTRIES); - - bufsize = MAXPATHLEN + (maxlinks * sizeof(linkinfo_t)) + sizeof(*iterator); - if (extended) { - bufsize += 2*sizeof(struct direntry); - } - MALLOC(buffer, void *, bufsize, M_TEMP, M_WAITOK); - bzero(buffer, bufsize); - - state.cbs_flags = flags; - state.cbs_hasprevdirentry = false; - state.cbs_previlinkref = 0; - state.cbs_nlinks = 0; - state.cbs_maxlinks = maxlinks; - state.cbs_linkinfo = (linkinfo_t *)((char *)buffer + MAXPATHLEN); - /* - * We need to set cbs_eof to false regardless of whether or not the - * control flow is actually in the extended case, since we use this - * field to track whether or not we've returned EOF from the iterator function. - */ - state.cbs_eof = false; - - iterator = (BTreeIterator *) ((char *)state.cbs_linkinfo + (maxlinks * sizeof(linkinfo_t))); - key = (CatalogKey *)&iterator->key; - have_key = 0; - index = dirhint->dh_index + 1; - if (extended) { - state.cbs_direntry = (struct direntry *)((char *)iterator + sizeof(BTreeIterator)); - state.cbs_prevdirentry = state.cbs_direntry + 1; - } - /* - * Attempt to build a key from cached filename - */ - if (dirhint->dh_desc.cd_namelen != 0) { - if (buildkey(hfsmp, &dirhint->dh_desc, (HFSPlusCatalogKey *)key, 0) == 0) { - iterator->hint.nodeNum = dirhint->dh_desc.cd_hint; - have_key = 1; - } - } - - if (index == 0 && dirhint->dh_threadhint != 0) { - /* - * Position the iterator at the directory's thread record. - * (i.e. 
just before the first entry) - */ - buildthreadkey(dirhint->dh_desc.cd_parentcnid, (hfsmp->hfs_flags & HFS_STANDARD), key); - iterator->hint.nodeNum = dirhint->dh_threadhint; - iterator->hint.index = 0; - have_key = 1; - } - - /* - * If the last entry wasn't cached then position the btree iterator - */ - if (!have_key) { - /* - * Position the iterator at the directory's thread record. - * (i.e. just before the first entry) - */ - buildthreadkey(dirhint->dh_desc.cd_parentcnid, (hfsmp->hfs_flags & HFS_STANDARD), key); - result = BTSearchRecord(fcb, iterator, NULL, NULL, iterator); - if (result) { - result = MacToVFSError(result); - goto cleanup; - } - if (index == 0) { - dirhint->dh_threadhint = iterator->hint.nodeNum; - } - /* - * Iterate until we reach the entry just - * before the one we want to start with. - */ - if (index > 0) { - struct position_state ps; - - ps.error = 0; - ps.count = 0; - ps.index = index; - ps.parentID = dirhint->dh_desc.cd_parentcnid; - ps.hfsmp = hfsmp; - - result = BTIterateRecords(fcb, kBTreeNextRecord, iterator, - (IterateCallBackProcPtr)cat_findposition, &ps); - if (ps.error) - result = ps.error; - else - result = MacToVFSError(result); - if (result) { - result = MacToVFSError(result); - if (result == ENOENT) { - /* - * ENOENT means we've hit the EOF. - * suppress the error, and set the eof flag. - */ - result = 0; - dirhint->dh_desc.cd_flags |= CD_EOF; - *eofflag = 1; - } - goto cleanup; - } - } - } - - state.cbs_index = index; - state.cbs_hfsmp = hfsmp; - state.cbs_uio = uio; - state.cbs_desc = &dirhint->dh_desc; - state.cbs_namebuf = (u_int8_t *)buffer; - state.cbs_result = 0; - state.cbs_parentID = dirhint->dh_desc.cd_parentcnid; - - /* Use a temporary buffer to hold intermediate descriptor names. 
*/ - if (dirhint->dh_desc.cd_namelen > 0 && dirhint->dh_desc.cd_nameptr != NULL) { - bcopy(dirhint->dh_desc.cd_nameptr, buffer, dirhint->dh_desc.cd_namelen+1); - if (dirhint->dh_desc.cd_flags & CD_HASBUF) { - dirhint->dh_desc.cd_flags &= ~CD_HASBUF; - vfs_removename((const char *)dirhint->dh_desc.cd_nameptr); - } - } - dirhint->dh_desc.cd_nameptr = (u_int8_t *)buffer; - - enum BTreeIterationOperations op; - if (extended && index != 0 && have_key) - op = kBTreeCurrentRecord; - else - op = kBTreeNextRecord; - - /* - * Process as many entries as possible starting at iterator->key. - */ - if ((hfsmp->hfs_flags & HFS_STANDARD) == 0) { - /* HFS+ */ - result = BTIterateRecords(fcb, op, iterator, - (IterateCallBackProcPtr)getdirentries_callback, &state); - - /* For extended calls, every call to getdirentries_callback() - * transfers the previous directory entry found to the user - * buffer. Therefore when BTIterateRecords reaches the end of - * Catalog BTree, call getdirentries_callback() again with - * dummy values to copy the last directory entry stored in - * packdirentry_state - */ - if (extended && (result == fsBTRecordNotFoundErr)) { - CatalogKey ckp; - CatalogRecord crp; - - bzero(&ckp, sizeof(ckp)); - bzero(&crp, sizeof(crp)); - - result = getdirentries_callback(&ckp, &crp, &state); - } - } -#if CONFIG_HFS_STD - else { - /* HFS (standard) */ - result = BTIterateRecords(fcb, op, iterator, - (IterateCallBackProcPtr)getdirentries_std_callback, &state); - } -#endif - - /* Note that state.cbs_index is still valid on errors */ - *items = state.cbs_index - index; - index = state.cbs_index; - - /* - * Also note that cbs_eof is set in all cases if we ever hit EOF - * during the enumeration by the catalog callback. Mark the directory's hint - * descriptor as having hit EOF. - */ - - if (state.cbs_eof) { - dirhint->dh_desc.cd_flags |= CD_EOF; - *eofflag = 1; - } - - /* Finish updating the catalog iterator. 
*/ - dirhint->dh_desc.cd_hint = iterator->hint.nodeNum; - dirhint->dh_desc.cd_flags |= CD_DECOMPOSED; - dirhint->dh_index = index - 1; - - /* Fix up the name. */ - if (dirhint->dh_desc.cd_namelen > 0) { - dirhint->dh_desc.cd_nameptr = (const u_int8_t *)vfs_addname((char *)buffer, dirhint->dh_desc.cd_namelen, 0, 0); - dirhint->dh_desc.cd_flags |= CD_HASBUF; - } else { - dirhint->dh_desc.cd_nameptr = NULL; - dirhint->dh_desc.cd_namelen = 0; - } - - /* - * Post process any hard links to get the real file id. - */ - if (state.cbs_nlinks > 0) { - ino_t fileid = 0; - user_addr_t address; - int i; - - for (i = 0; i < state.cbs_nlinks; ++i) { - if (resolvelinkid(hfsmp, state.cbs_linkinfo[i].link_ref, &fileid) != 0) - continue; - /* This assumes that d_ino is always first field. */ - address = state.cbs_linkinfo[i].dirent_addr; - if (address == (user_addr_t)0) - continue; - if (uio_isuserspace(uio)) { - if (extended) { - ino64_t fileid_64 = (ino64_t)fileid; - (void) copyout(&fileid_64, address, sizeof(fileid_64)); - } else { - (void) copyout(&fileid, address, sizeof(fileid)); - } - } else /* system space */ { - if (extended) { - ino64_t fileid_64 = (ino64_t)fileid; - bcopy(&fileid_64, (void*) CAST_DOWN(caddr_t, address), sizeof(fileid_64)); - } else { - bcopy(&fileid, (void*) CAST_DOWN(caddr_t, address), sizeof(fileid)); - } - } - } - } - - if (state.cbs_result) - result = state.cbs_result; - else - result = MacToVFSError(result); - - if (result == ENOENT) { - result = 0; - } - -cleanup: - FREE(buffer, M_TEMP); - - return (result); -} - - -/* - * Callback to establish directory position. - * Called with position_state for each item in a directory. 
- */ -static int -cat_findposition(const CatalogKey *ckp, const CatalogRecord *crp, - struct position_state *state) -{ - cnid_t curID = 0; - - if ((state->hfsmp->hfs_flags & HFS_STANDARD) == 0) { - curID = ckp->hfsPlus.parentID; - } -#if CONFIG_HFS_STD - else { - curID = ckp->hfs.parentID; - } -#endif - - /* Make sure parent directory didn't change */ - if (state->parentID != curID) { - /* - * The parent ID is different from curID this means we've hit - * the EOF for the directory. - */ - state->error = ENOENT; - return (0); /* stop */ - } - - /* Count this entry */ - switch(crp->recordType) { - case kHFSPlusFolderRecord: - case kHFSPlusFileRecord: -#if CONFIG_HFS_STD - case kHFSFolderRecord: - case kHFSFileRecord: -#endif - ++state->count; - break; - default: - printf("hfs: cat_findposition: invalid record type %d in dir %d\n", - crp->recordType, curID); - state->error = EINVAL; - return (0); /* stop */ - }; - - return (state->count < state->index); -} - - -/* - * cat_binarykeycompare - compare two HFS Plus catalog keys. - - * The name portion of the key is compared using a 16-bit binary comparison. - * This is called from the b-tree code. 
- */ -int -cat_binarykeycompare(HFSPlusCatalogKey *searchKey, HFSPlusCatalogKey *trialKey) -{ - u_int32_t searchParentID, trialParentID; - int result; - - searchParentID = searchKey->parentID; - trialParentID = trialKey->parentID; - result = 0; - - if (searchParentID > trialParentID) { - ++result; - } else if (searchParentID < trialParentID) { - --result; - } else { - u_int16_t * str1 = &searchKey->nodeName.unicode[0]; - u_int16_t * str2 = &trialKey->nodeName.unicode[0]; - int length1 = searchKey->nodeName.length; - int length2 = trialKey->nodeName.length; - - result = UnicodeBinaryCompare (str1, length1, str2, length2); - } - - return result; -} - - -#if CONFIG_HFS_STD -/* - * Compare two standard HFS catalog keys - * - * Result: +n search key > trial key - * 0 search key = trial key - * -n search key < trial key - */ -int -CompareCatalogKeys(HFSCatalogKey *searchKey, HFSCatalogKey *trialKey) -{ - cnid_t searchParentID, trialParentID; - int result; - - searchParentID = searchKey->parentID; - trialParentID = trialKey->parentID; - - if (searchParentID > trialParentID) - result = 1; - else if (searchParentID < trialParentID) - result = -1; - else /* parent dirID's are equal, compare names */ - result = FastRelString(searchKey->nodeName, trialKey->nodeName); - - return result; -} -#endif - - -/* - * Compare two HFS+ catalog keys - * - * Result: +n search key > trial key - * 0 search key = trial key - * -n search key < trial key - */ -int -CompareExtendedCatalogKeys(HFSPlusCatalogKey *searchKey, HFSPlusCatalogKey *trialKey) -{ - cnid_t searchParentID, trialParentID; - int result; - - searchParentID = searchKey->parentID; - trialParentID = trialKey->parentID; - - if (searchParentID > trialParentID) { - result = 1; - } - else if (searchParentID < trialParentID) { - result = -1; - } else { - /* parent node ID's are equal, compare names */ - if ( searchKey->nodeName.length == 0 || trialKey->nodeName.length == 0 ) - result = searchKey->nodeName.length - 
trialKey->nodeName.length; - else - result = FastUnicodeCompare(&searchKey->nodeName.unicode[0], - searchKey->nodeName.length, - &trialKey->nodeName.unicode[0], - trialKey->nodeName.length); - } - - return result; -} - - -/* - * buildkey - build a Catalog b-tree key from a cnode descriptor - */ -static int -buildkey(struct hfsmount *hfsmp, struct cat_desc *descp, - HFSPlusCatalogKey *key, int retry) -{ - int std_hfs = (hfsmp->hfs_flags & HFS_STANDARD); - int utf8_flags = UTF_ESCAPE_ILLEGAL; - int result = 0; - size_t unicodeBytes = 0; - - if (std_hfs == 0) { - retry = 0; - } - - if (descp->cd_namelen == 0 || descp->cd_nameptr[0] == '\0') - return (EINVAL); /* invalid name */ - - key->parentID = descp->cd_parentcnid; - key->nodeName.length = 0; - /* - * Convert filename from UTF-8 into Unicode - */ - - if ((descp->cd_flags & CD_DECOMPOSED) == 0) - utf8_flags |= UTF_DECOMPOSED; - result = utf8_decodestr(descp->cd_nameptr, descp->cd_namelen, - key->nodeName.unicode, &unicodeBytes, - sizeof(key->nodeName.unicode), ':', utf8_flags); - key->nodeName.length = unicodeBytes / sizeof(UniChar); - key->keyLength = kHFSPlusCatalogKeyMinimumLength + unicodeBytes; - if (result) { - if (result != ENAMETOOLONG) - result = EINVAL; /* name has invalid characters */ - return (result); - } - -#if CONFIG_HFS_STD - /* - * For HFS volumes convert to an HFS compatible key - * - * XXX need to save the encoding that succeeded - */ - if (std_hfs) { - HFSCatalogKey hfskey; - - bzero(&hfskey, sizeof(hfskey)); - hfskey.keyLength = kHFSCatalogKeyMinimumLength; - hfskey.parentID = key->parentID; - hfskey.nodeName[0] = 0; - if (key->nodeName.length > 0) { - int res; - if ((res = unicode_to_hfs(HFSTOVCB(hfsmp), - key->nodeName.length * 2, - key->nodeName.unicode, - &hfskey.nodeName[0], retry)) != 0) { - if (res != ENAMETOOLONG) - res = EINVAL; - - return res; - } - hfskey.keyLength += hfskey.nodeName[0]; - } - bcopy(&hfskey, key, sizeof(hfskey)); - } -#endif - - return (0); - } - - -/* - * Resolve 
hard link reference to obtain the inode record. - */ -int -cat_resolvelink(struct hfsmount *hfsmp, u_int32_t linkref, int isdirlink, struct HFSPlusCatalogFile *recp) -{ - FSBufferDescriptor btdata; - struct BTreeIterator *iterator; - struct cat_desc idesc; - char inodename[32]; - cnid_t parentcnid; - int result = 0; - - BDINIT(btdata, recp); - - if (isdirlink) { - MAKE_DIRINODE_NAME(inodename, sizeof(inodename), (unsigned int)linkref); - parentcnid = hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid; - } else { - MAKE_INODE_NAME(inodename, sizeof(inodename), (unsigned int)linkref); - parentcnid = hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid; - } - - /* Get space for iterator */ - MALLOC(iterator, BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK); - bzero(iterator, sizeof(*iterator)); - - /* Build a descriptor for private dir. */ - idesc.cd_parentcnid = parentcnid; - idesc.cd_nameptr = (const u_int8_t *)inodename; - idesc.cd_namelen = strlen(inodename); - idesc.cd_flags = 0; - idesc.cd_hint = 0; - idesc.cd_encoding = 0; - (void) buildkey(hfsmp, &idesc, (HFSPlusCatalogKey *)&iterator->key, 0); - - result = BTSearchRecord(VTOF(HFSTOVCB(hfsmp)->catalogRefNum), iterator, - &btdata, NULL, NULL); - - if (result == 0) { - /* Make sure there's a reference */ - if (recp->hl_linkCount == 0) - recp->hl_linkCount = 2; - } else { - printf("hfs: cat_resolvelink: can't find inode=%s on vol=%s\n", inodename, hfsmp->vcbVN); - } - - FREE(iterator, M_TEMP); - - return (result ? ENOENT : 0); -} - -/* - * Resolve hard link reference to obtain the inode number. - */ -static int -resolvelinkid(struct hfsmount *hfsmp, u_int32_t linkref, ino_t *ino) -{ - struct HFSPlusCatalogFile record; - int error; - - /* - * Since we know resolvelinkid is only called from - * cat_getdirentries, we can assume that only file - * hardlinks need to be resolved (cat_getdirentries - * can resolve directory hardlinks in place). 
- */ - error = cat_resolvelink(hfsmp, linkref, 0, &record); - if (error == 0) { - if (record.fileID == 0) - error = ENOENT; - else - *ino = record.fileID; - } - return (error); -} - -/* - * getkey - get a key from id by doing a thread lookup - */ -static int -getkey(struct hfsmount *hfsmp, cnid_t cnid, CatalogKey * key) -{ - struct BTreeIterator * iterator; - FSBufferDescriptor btdata; - u_int16_t datasize; - CatalogKey * keyp; - CatalogRecord * recp; - int result; - int std_hfs; - - std_hfs = (HFSTOVCB(hfsmp)->vcbSigWord == kHFSSigWord); - - MALLOC(iterator, BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK); - bzero(iterator, sizeof(*iterator)); - buildthreadkey(cnid, std_hfs, (CatalogKey *)&iterator->key); - - MALLOC(recp, CatalogRecord *, sizeof(CatalogRecord), M_TEMP, M_WAITOK); - BDINIT(btdata, recp); - - result = BTSearchRecord(VTOF(HFSTOVCB(hfsmp)->catalogRefNum), iterator, - &btdata, &datasize, iterator); - if (result) - goto exit; - - /* Turn thread record into a cnode key (in place) */ - switch (recp->recordType) { - -#if CONFIG_HFS_STD - case kHFSFileThreadRecord: - case kHFSFolderThreadRecord: - keyp = (CatalogKey *)((char *)&recp->hfsThread.reserved + 6); - keyp->hfs.keyLength = kHFSCatalogKeyMinimumLength + keyp->hfs.nodeName[0]; - bcopy(keyp, key, keyp->hfs.keyLength + 1); - break; -#endif - - case kHFSPlusFileThreadRecord: - case kHFSPlusFolderThreadRecord: - keyp = (CatalogKey *)&recp->hfsPlusThread.reserved; - keyp->hfsPlus.keyLength = kHFSPlusCatalogKeyMinimumLength + - (keyp->hfsPlus.nodeName.length * 2); - bcopy(keyp, key, keyp->hfsPlus.keyLength + 2); - break; - - default: - result = ENOENT; - break; - } - -exit: - FREE(iterator, M_TEMP); - FREE(recp, M_TEMP); - - return MacToVFSError(result); -} - -/* - * getkeyplusattr - From id, fetch the key and the bsd attrs for a file/dir (could pass - * null arguments to cat_idlookup instead, but we save around 10% by not building the - * cat_desc here). 
Both key and attrp must point to real structures. - * - * The key's parent id is the only part of the key expected to be used by the caller. - * The name portion of the key may not always be valid (ie in the case of a hard link). - */ -int -cat_getkeyplusattr(struct hfsmount *hfsmp, cnid_t cnid, CatalogKey * key, struct cat_attr *attrp) -{ - int result; - - result = getkey(hfsmp, cnid, key); - - if (result == 0) { - result = cat_lookupbykey(hfsmp, key, 0, 0, 0, NULL, attrp, NULL, NULL); - } - /* - * Check for a raw file hardlink inode. - * Fix up the parent id in the key if necessary. - * Only hard links created by Mac OS X 10.5 or later can be resolved here. - */ - if ((result == 0) && - (key->hfsPlus.parentID == hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid) && - (attrp->ca_recflags & kHFSHasLinkChainMask)) { - cnid_t nextlinkid = 0; - cnid_t prevlinkid = 0; - struct cat_desc linkdesc; - - /* - * Pick up the first link in the chain and get a descriptor for it. - * This allows blind bulk access checks to work for hardlinks. 
- */ - if ((cat_lookup_siblinglinks(hfsmp, cnid, &prevlinkid, &nextlinkid) == 0) && - (nextlinkid != 0)) { - if (cat_findname(hfsmp, nextlinkid, &linkdesc) == 0) { - key->hfsPlus.parentID = linkdesc.cd_parentcnid; - cat_releasedesc(&linkdesc); - } - } - } - return MacToVFSError(result); -} - - -/* - * buildrecord - build a default catalog directory or file record - */ -static void -buildrecord(struct cat_attr *attrp, cnid_t cnid, int std_hfs, u_int32_t encoding, - CatalogRecord *crp, u_int32_t *recordSize) -{ - int type = attrp->ca_mode & S_IFMT; - u_int32_t createtime = to_hfs_time(attrp->ca_itime); - - if (std_hfs == 0) { - struct HFSPlusBSDInfo * bsdp = NULL; - - if (type == S_IFDIR) { - crp->recordType = kHFSPlusFolderRecord; - crp->hfsPlusFolder.flags = attrp->ca_recflags; - crp->hfsPlusFolder.valence = 0; - crp->hfsPlusFolder.folderID = cnid; - crp->hfsPlusFolder.createDate = createtime; - crp->hfsPlusFolder.contentModDate = createtime; - crp->hfsPlusFolder.attributeModDate = createtime; - crp->hfsPlusFolder.accessDate = createtime; - crp->hfsPlusFolder.backupDate = 0; - crp->hfsPlusFolder.textEncoding = encoding; - crp->hfsPlusFolder.folderCount = 0; - bcopy(attrp->ca_finderinfo, &crp->hfsPlusFolder.userInfo, 32); - bsdp = &crp->hfsPlusFolder.bsdInfo; - bsdp->special.linkCount = 1; - *recordSize = sizeof(HFSPlusCatalogFolder); - } else { - crp->recordType = kHFSPlusFileRecord; - crp->hfsPlusFile.flags = attrp->ca_recflags; - crp->hfsPlusFile.reserved1 = 0; - crp->hfsPlusFile.fileID = cnid; - crp->hfsPlusFile.createDate = createtime; - crp->hfsPlusFile.contentModDate = createtime; - crp->hfsPlusFile.accessDate = createtime; - crp->hfsPlusFile.attributeModDate = createtime; - crp->hfsPlusFile.backupDate = 0; - crp->hfsPlusFile.textEncoding = encoding; - crp->hfsPlusFile.reserved2 = 0; - bcopy(attrp->ca_finderinfo, &crp->hfsPlusFile.userInfo, 32); - bsdp = &crp->hfsPlusFile.bsdInfo; - /* BLK/CHR need to save the device info */ - if (type == S_IFBLK || type == 
S_IFCHR) { - bsdp->special.rawDevice = attrp->ca_rdev; - } else { - bsdp->special.linkCount = 1; - } - bzero(&crp->hfsPlusFile.dataFork, 2*sizeof(HFSPlusForkData)); - *recordSize = sizeof(HFSPlusCatalogFile); - } - bsdp->ownerID = attrp->ca_uid; - bsdp->groupID = attrp->ca_gid; - bsdp->fileMode = attrp->ca_mode; - bsdp->adminFlags = attrp->ca_flags >> 16; - bsdp->ownerFlags = attrp->ca_flags & 0x000000FF; - } -#if CONFIG_HFS_STD - else { - createtime = UTCToLocal(createtime); - if (type == S_IFDIR) { - bzero(crp, sizeof(HFSCatalogFolder)); - crp->recordType = kHFSFolderRecord; - crp->hfsFolder.folderID = cnid; - crp->hfsFolder.createDate = createtime; - crp->hfsFolder.modifyDate = createtime; - bcopy(attrp->ca_finderinfo, &crp->hfsFolder.userInfo, 32); - *recordSize = sizeof(HFSCatalogFolder); - } else { - bzero(crp, sizeof(HFSCatalogFile)); - crp->recordType = kHFSFileRecord; - crp->hfsFile.fileID = cnid; - crp->hfsFile.createDate = createtime; - crp->hfsFile.modifyDate = createtime; - bcopy(attrp->ca_finderinfo, &crp->hfsFile.userInfo, 16); - bcopy(&attrp->ca_finderinfo[16], &crp->hfsFile.finderInfo, 16); - *recordSize = sizeof(HFSCatalogFile); - } - } -#endif - -} - - -/* - * builddesc - build a cnode descriptor from an HFS+ key - */ -static int -builddesc(const HFSPlusCatalogKey *key, cnid_t cnid, u_int32_t hint, u_int32_t encoding, - int isdir, struct cat_desc *descp) -{ - int result = 0; - unsigned char * nameptr; - size_t bufsize; - size_t utf8len; - unsigned char tmpbuff[128]; - - /* guess a size... 
*/ - bufsize = (3 * key->nodeName.length) + 1; - if (bufsize >= sizeof(tmpbuff) - 1) { - MALLOC(nameptr, unsigned char *, bufsize, M_TEMP, M_WAITOK); - } else { - nameptr = &tmpbuff[0]; - } - - result = utf8_encodestr(key->nodeName.unicode, - key->nodeName.length * sizeof(UniChar), - nameptr, (size_t *)&utf8len, - bufsize, ':', 0); - - if (result == ENAMETOOLONG) { - bufsize = 1 + utf8_encodelen(key->nodeName.unicode, - key->nodeName.length * sizeof(UniChar), - ':', 0); - FREE(nameptr, M_TEMP); - MALLOC(nameptr, unsigned char *, bufsize, M_TEMP, M_WAITOK); - - result = utf8_encodestr(key->nodeName.unicode, - key->nodeName.length * sizeof(UniChar), - nameptr, (size_t *)&utf8len, - bufsize, ':', 0); - } - descp->cd_parentcnid = key->parentID; - descp->cd_nameptr = (const u_int8_t *)vfs_addname((char *)nameptr, utf8len, 0, 0); - descp->cd_namelen = utf8len; - descp->cd_cnid = cnid; - descp->cd_hint = hint; - descp->cd_flags = CD_DECOMPOSED | CD_HASBUF; - if (isdir) - descp->cd_flags |= CD_ISDIR; - descp->cd_encoding = encoding; - if (nameptr != &tmpbuff[0]) { - FREE(nameptr, M_TEMP); - } - return result; -} - - -/* - * getbsdattr - get attributes in bsd format - * - */ -static void -getbsdattr(struct hfsmount *hfsmp, const struct HFSPlusCatalogFile *crp, struct cat_attr * attrp) -{ - int isDirectory = (crp->recordType == kHFSPlusFolderRecord); - const struct HFSPlusBSDInfo *bsd = &crp->bsdInfo; - - attrp->ca_recflags = crp->flags; - attrp->ca_atime = to_bsd_time(crp->accessDate); - attrp->ca_atimeondisk = attrp->ca_atime; - attrp->ca_mtime = to_bsd_time(crp->contentModDate); - attrp->ca_ctime = to_bsd_time(crp->attributeModDate); - attrp->ca_itime = to_bsd_time(crp->createDate); - attrp->ca_btime = to_bsd_time(crp->backupDate); - - if ((bsd->fileMode & S_IFMT) == 0) { - attrp->ca_flags = 0; - attrp->ca_uid = hfsmp->hfs_uid; - attrp->ca_gid = hfsmp->hfs_gid; - if (isDirectory) { - attrp->ca_mode = S_IFDIR | (hfsmp->hfs_dir_mask & ACCESSPERMS); - } else { - 
attrp->ca_mode = S_IFREG | (hfsmp->hfs_file_mask & ACCESSPERMS); - } - attrp->ca_linkcount = 1; - attrp->ca_rdev = 0; - } else { - attrp->ca_linkcount = 1; /* may be overridden below */ - attrp->ca_rdev = 0; - attrp->ca_uid = bsd->ownerID; - attrp->ca_gid = bsd->groupID; - attrp->ca_flags = bsd->ownerFlags | (bsd->adminFlags << 16); - attrp->ca_mode = (mode_t)bsd->fileMode; - switch (attrp->ca_mode & S_IFMT) { - case S_IFCHR: /* fall through */ - case S_IFBLK: - attrp->ca_rdev = bsd->special.rawDevice; - break; - case S_IFIFO: - case S_IFSOCK: - case S_IFDIR: - case S_IFREG: - /* Pick up the hard link count */ - if (bsd->special.linkCount > 0) - attrp->ca_linkcount = bsd->special.linkCount; - break; - } - - /* - * Override the permissions as determined by the mount auguments - * in ALMOST the same way unset permissions are treated but keep - * track of whether or not the file or folder is hfs locked - * by leaving the h_pflags field unchanged from what was unpacked - * out of the catalog. - */ - /* - * This code was used to do UID translation with MNT_IGNORE_OWNERS - * (aka MNT_UNKNOWNPERMISSIONS) at the HFS layer. It's largely done - * at the VFS layer, so there is no need to do it here now; this also - * allows VFS to let root see the real UIDs. - * - * if (((unsigned int)vfs_flags(HFSTOVFS(hfsmp))) & MNT_UNKNOWNPERMISSIONS) { - * attrp->ca_uid = hfsmp->hfs_uid; - * attrp->ca_gid = hfsmp->hfs_gid; - * } - */ - } - - if (isDirectory) { - if (!S_ISDIR(attrp->ca_mode)) { - attrp->ca_mode &= ~S_IFMT; - attrp->ca_mode |= S_IFDIR; - } - attrp->ca_entries = ((const HFSPlusCatalogFolder *)crp)->valence; - attrp->ca_dircount = ((hfsmp->hfs_flags & HFS_FOLDERCOUNT) && (attrp->ca_recflags & kHFSHasFolderCountMask)) ? 
- ((const HFSPlusCatalogFolder *)crp)->folderCount : 0; - - /* Keep UF_HIDDEN bit in sync with Finder Info's invisible bit */ - if (((const HFSPlusCatalogFolder *)crp)->userInfo.frFlags & OSSwapHostToBigConstInt16(kFinderInvisibleMask)) - attrp->ca_flags |= UF_HIDDEN; - } else { - /* Keep IMMUTABLE bits in sync with HFS locked flag */ - if (crp->flags & kHFSFileLockedMask) { - /* The file's supposed to be locked: - Make sure at least one of the IMMUTABLE bits is set: */ - if ((attrp->ca_flags & (SF_IMMUTABLE | UF_IMMUTABLE)) == 0) - attrp->ca_flags |= UF_IMMUTABLE; - } else { - /* The file's supposed to be unlocked: */ - attrp->ca_flags &= ~(SF_IMMUTABLE | UF_IMMUTABLE); - } - /* Keep UF_HIDDEN bit in sync with Finder Info's invisible bit */ - if (crp->userInfo.fdFlags & OSSwapHostToBigConstInt16(kFinderInvisibleMask)) - attrp->ca_flags |= UF_HIDDEN; - /* get total blocks (both forks) */ - attrp->ca_blocks = crp->dataFork.totalBlocks + crp->resourceFork.totalBlocks; - - /* On HFS+ the ThreadExists flag must always be set. */ - if ((hfsmp->hfs_flags & HFS_STANDARD) == 0) - attrp->ca_recflags |= kHFSThreadExistsMask; - - /* Pick up the hardlink first link, if any. */ - attrp->ca_firstlink = (attrp->ca_recflags & kHFSHasLinkChainMask) ? crp->hl_firstLinkID : 0; - } - - attrp->ca_fileid = crp->fileID; - - bcopy(&crp->userInfo, attrp->ca_finderinfo, 32); -} - -#if CONFIG_HFS_STD -/* - * promotekey - promote hfs key to hfs plus key - * - */ -static void -promotekey(struct hfsmount *hfsmp, const HFSCatalogKey *hfskey, - HFSPlusCatalogKey *keyp, u_int32_t *encoding) -{ - hfs_to_unicode_func_t hfs_get_unicode = hfsmp->hfs_get_unicode; - u_int32_t uniCount; - int error; - - *encoding = hfsmp->hfs_encoding; - - error = hfs_get_unicode(hfskey->nodeName, keyp->nodeName.unicode, - kHFSPlusMaxFileNameChars, &uniCount); - /* - * When an HFS name cannot be encoded with the current - * encoding use MacRoman as a fallback. 
- */ - if (error && hfsmp->hfs_encoding != kTextEncodingMacRoman) { - *encoding = 0; - (void) mac_roman_to_unicode(hfskey->nodeName, - keyp->nodeName.unicode, - kHFSPlusMaxFileNameChars, - &uniCount); - } - - keyp->nodeName.length = uniCount; - keyp->parentID = hfskey->parentID; -} - -/* - * promotefork - promote hfs fork info to hfs plus - * - */ -static void -promotefork(struct hfsmount *hfsmp, const struct HFSCatalogFile *filep, - int resource, struct cat_fork * forkp) -{ - struct HFSPlusExtentDescriptor *xp; - u_int32_t blocksize = HFSTOVCB(hfsmp)->blockSize; - - bzero(forkp, sizeof(*forkp)); - xp = &forkp->cf_extents[0]; - if (resource) { - forkp->cf_size = filep->rsrcLogicalSize; - forkp->cf_blocks = filep->rsrcPhysicalSize / blocksize; - forkp->cf_bytesread = 0; - forkp->cf_vblocks = 0; - xp[0].startBlock = (u_int32_t)filep->rsrcExtents[0].startBlock; - xp[0].blockCount = (u_int32_t)filep->rsrcExtents[0].blockCount; - xp[1].startBlock = (u_int32_t)filep->rsrcExtents[1].startBlock; - xp[1].blockCount = (u_int32_t)filep->rsrcExtents[1].blockCount; - xp[2].startBlock = (u_int32_t)filep->rsrcExtents[2].startBlock; - xp[2].blockCount = (u_int32_t)filep->rsrcExtents[2].blockCount; - } else { - forkp->cf_size = filep->dataLogicalSize; - forkp->cf_blocks = filep->dataPhysicalSize / blocksize; - forkp->cf_bytesread = 0; - forkp->cf_vblocks = 0; - xp[0].startBlock = (u_int32_t)filep->dataExtents[0].startBlock; - xp[0].blockCount = (u_int32_t)filep->dataExtents[0].blockCount; - xp[1].startBlock = (u_int32_t)filep->dataExtents[1].startBlock; - xp[1].blockCount = (u_int32_t)filep->dataExtents[1].blockCount; - xp[2].startBlock = (u_int32_t)filep->dataExtents[2].startBlock; - xp[2].blockCount = (u_int32_t)filep->dataExtents[2].blockCount; - } -} - -/* - * promoteattr - promote standard hfs catalog attributes to hfs plus - * - */ -static void -promoteattr(struct hfsmount *hfsmp, const CatalogRecord *dataPtr, struct HFSPlusCatalogFile *crp) -{ - u_int32_t blocksize = 
HFSTOVCB(hfsmp)->blockSize; - - if (dataPtr->recordType == kHFSFolderRecord) { - const struct HFSCatalogFolder * folder; - - folder = (const struct HFSCatalogFolder *) dataPtr; - crp->recordType = kHFSPlusFolderRecord; - crp->flags = folder->flags; - crp->fileID = folder->folderID; - crp->createDate = LocalToUTC(folder->createDate); - crp->contentModDate = LocalToUTC(folder->modifyDate); - crp->backupDate = LocalToUTC(folder->backupDate); - crp->reserved1 = folder->valence; - crp->reserved2 = 0; - bcopy(&folder->userInfo, &crp->userInfo, 32); - } else /* file */ { - const struct HFSCatalogFile * file; - - file = (const struct HFSCatalogFile *) dataPtr; - crp->recordType = kHFSPlusFileRecord; - crp->flags = file->flags; - crp->fileID = file->fileID; - crp->createDate = LocalToUTC(file->createDate); - crp->contentModDate = LocalToUTC(file->modifyDate); - crp->backupDate = LocalToUTC(file->backupDate); - crp->reserved1 = 0; - crp->reserved2 = 0; - bcopy(&file->userInfo, &crp->userInfo, 16); - bcopy(&file->finderInfo, &crp->finderInfo, 16); - crp->dataFork.totalBlocks = file->dataPhysicalSize / blocksize; - crp->resourceFork.totalBlocks = file->rsrcPhysicalSize / blocksize; - } - crp->textEncoding = 0; - crp->attributeModDate = crp->contentModDate; - crp->accessDate = crp->contentModDate; - bzero(&crp->bsdInfo, sizeof(HFSPlusBSDInfo)); -} -#endif - -/* - * Build a catalog node thread record from a catalog key - * and return the size of the record. 
- */ -static int -buildthread(void *keyp, void *recp, int std_hfs, int directory) -{ - int size = 0; - - if (std_hfs == 0) { - HFSPlusCatalogKey *key = (HFSPlusCatalogKey *)keyp; - HFSPlusCatalogThread *rec = (HFSPlusCatalogThread *)recp; - - size = sizeof(HFSPlusCatalogThread); - if (directory) - rec->recordType = kHFSPlusFolderThreadRecord; - else - rec->recordType = kHFSPlusFileThreadRecord; - rec->reserved = 0; - rec->parentID = key->parentID; - bcopy(&key->nodeName, &rec->nodeName, - sizeof(UniChar) * (key->nodeName.length + 1)); - - /* HFS Plus has variable sized thread records */ - size -= (sizeof(rec->nodeName.unicode) - - (rec->nodeName.length * sizeof(UniChar))); - - } -#if CONFIG_HFS_STD - else { - HFSCatalogKey *key = (HFSCatalogKey *)keyp; - HFSCatalogThread *rec = (HFSCatalogThread *)recp; - - size = sizeof(HFSCatalogThread); - bzero(rec, size); - if (directory) - rec->recordType = kHFSFolderThreadRecord; - else - rec->recordType = kHFSFileThreadRecord; - rec->parentID = key->parentID; - bcopy(key->nodeName, rec->nodeName, key->nodeName[0]+1); - - } -#endif - - return (size); -} - -/* - * Build a catalog node thread key. - */ -static void -buildthreadkey(HFSCatalogNodeID parentID, int std_hfs, CatalogKey *key) -{ - if (std_hfs == 0) { - key->hfsPlus.keyLength = kHFSPlusCatalogKeyMinimumLength; - key->hfsPlus.parentID = parentID; - key->hfsPlus.nodeName.length = 0; - } -#if CONFIG_HFS_STD - else { - key->hfs.keyLength = kHFSCatalogKeyMinimumLength; - key->hfs.reserved = 0; - key->hfs.parentID = parentID; - key->hfs.nodeName[0] = 0; - } -#endif - -} - -/* - * Extract the text encoding from a catalog node record. 
- */ -static u_int32_t -getencoding(const CatalogRecord *crp) -{ - u_int32_t encoding; - - if (crp->recordType == kHFSPlusFolderRecord) - encoding = crp->hfsPlusFolder.textEncoding; - else if (crp->recordType == kHFSPlusFileRecord) - encoding = crp->hfsPlusFile.textEncoding; - else - encoding = 0; - - return (encoding); -} - -/* - * Extract the CNID from a catalog node record. - */ -static cnid_t -getcnid(const CatalogRecord *crp) -{ - cnid_t cnid = 0; - - switch (crp->recordType) { - -#if CONFIG_HFS_STD - case kHFSFolderRecord: - cnid = crp->hfsFolder.folderID; - break; - case kHFSFileRecord: - cnid = crp->hfsFile.fileID; - break; -#endif - - case kHFSPlusFolderRecord: - cnid = crp->hfsPlusFolder.folderID; - break; - case kHFSPlusFileRecord: - cnid = crp->hfsPlusFile.fileID; - break; - default: - printf("hfs: getcnid: unknown recordType=%d\n", crp->recordType); - break; - } - - return (cnid); -} - -/* - * Extract the parent ID from a catalog node record. - */ -static cnid_t -getparentcnid(const CatalogRecord *recp) -{ - cnid_t cnid = 0; - - switch (recp->recordType) { - -#if CONFIG_HFS_STD - case kHFSFileThreadRecord: - case kHFSFolderThreadRecord: - cnid = recp->hfsThread.parentID; - break; -#endif - - case kHFSPlusFileThreadRecord: - case kHFSPlusFolderThreadRecord: - cnid = recp->hfsPlusThread.parentID; - break; - default: - panic("hfs: getparentcnid: unknown recordType (crp @ %p)\n", recp); - break; - } - - return (cnid); -} - -/* - * Determine if a catalog node record is a directory. - */ -static int -isadir(const CatalogRecord *crp) -{ - if (crp->recordType == kHFSPlusFolderRecord) { - return 1; - } -#if CONFIG_HFS_STD - if (crp->recordType == kHFSFolderRecord) { - return 1; - } -#endif - - return 0; -} - -/* - * cat_lookup_dirlink - lookup a catalog record for directory hard link - * (not inode) using catalog record id. Note that this function does - * NOT resolve directory hard link to its directory inode and return - * the link record. 
- * - * Note: The caller is responsible for releasing the output catalog - * descriptor (when supplied outdescp is non-null). - */ -int -cat_lookup_dirlink(struct hfsmount *hfsmp, cnid_t dirlink_id, - u_int8_t forktype, struct cat_desc *outdescp, - struct cat_attr *attrp, struct cat_fork *forkp) -{ - struct BTreeIterator *iterator = NULL; - FSBufferDescriptor btdata; - u_int16_t datasize; - CatalogKey *keyp; - CatalogRecord *recp = NULL; - int error; - - /* No directory hard links on standard HFS */ - if (hfsmp->vcbSigWord == kHFSSigWord) { - return ENOTSUP; - } - - MALLOC(iterator, BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK); - if (iterator == NULL) { - return ENOMEM; - } - bzero(iterator, sizeof(*iterator)); - buildthreadkey(dirlink_id, 1, (CatalogKey *)&iterator->key); - - MALLOC(recp, CatalogRecord *, sizeof(CatalogRecord), M_TEMP, M_WAITOK); - if (recp == NULL) { - error = ENOMEM; - goto out; - } - BDINIT(btdata, recp); - - error = BTSearchRecord(VTOF(HFSTOVCB(hfsmp)->catalogRefNum), iterator, - &btdata, &datasize, iterator); - if (error) { - goto out; - } - /* Directory hard links are catalog file record */ - if (recp->recordType != kHFSPlusFileThreadRecord) { - error = ENOENT; - goto out; - } - - keyp = (CatalogKey *)&recp->hfsPlusThread.reserved; - keyp->hfsPlus.keyLength = kHFSPlusCatalogKeyMinimumLength + - (keyp->hfsPlus.nodeName.length * 2); - if (forktype == kHFSResourceForkType) { - /* Lookup resource fork for directory hard link */ - error = cat_lookupbykey(hfsmp, keyp, HFS_LOOKUP_HARDLINK, 0, true, outdescp, attrp, forkp, NULL); - } else { - /* Lookup data fork, if any, for directory hard link */ - error = cat_lookupbykey(hfsmp, keyp, HFS_LOOKUP_HARDLINK, 0, false, outdescp, attrp, forkp, NULL); - } - if (error) { - printf ("hfs: cat_lookup_dirlink(): Error looking up file record for id=%u (error=%d)\n", dirlink_id, error); - hfs_mark_inconsistent(hfsmp, HFS_INCONSISTENCY_DETECTED); - goto out; - } - /* Just for sanity, make sure that id 
in catalog record and thread record match */ - if ((outdescp != NULL) && (dirlink_id != outdescp->cd_cnid)) { - printf ("hfs: cat_lookup_dirlink(): Requested cnid=%u != found_cnid=%u\n", dirlink_id, outdescp->cd_cnid); - hfs_mark_inconsistent(hfsmp, HFS_INCONSISTENCY_DETECTED); - error = ENOENT; - } - -out: - if (recp) { - FREE(recp, M_TEMP); - } - FREE(iterator, M_TEMP); - - return MacToVFSError(error); -} - -/* - * cnode_update_dirlink - update the catalog node for directory hard link - * described by descp using the data from attrp and forkp. - */ -int -cat_update_dirlink(struct hfsmount *hfsmp, u_int8_t forktype, - struct cat_desc *descp, struct cat_attr *attrp, struct cat_fork *forkp) -{ - if (forktype == kHFSResourceForkType) { - return cat_update_internal(hfsmp, true, descp, attrp, NULL, forkp); - } else { - return cat_update_internal(hfsmp, true, descp, attrp, forkp, NULL); - } -} - -void hfs_fork_copy(struct cat_fork *dst, const struct cat_fork *src, - HFSPlusExtentDescriptor *extents) -{ - /* Copy everything but the extents into the dest fork */ - memcpy(dst, src, offsetof(struct cat_fork, cf_extents)); - /* Then copy the supplied extents into the fork */ - memcpy(dst->cf_extents, extents, sizeof(HFSPlusExtentRecord)); -} diff --git a/bsd/hfs/hfs_catalog.h b/bsd/hfs/hfs_catalog.h deleted file mode 100644 index a4719ea41..000000000 --- a/bsd/hfs/hfs_catalog.h +++ /dev/null @@ -1,512 +0,0 @@ -/* - * Copyright (c) 2002-2014 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -#ifndef __HFS_CATALOG__ -#define __HFS_CATALOG__ - -#include - -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE -#include - -#include - -/* HFS Catalog */ - - -/* - * Catalog ADTs - * - * The cat_desc, cat_attr, and cat_fork structures are - * use to import/export data to/from the Catalog file. - * The fields in these structures are always in BSD - * runtime format (e.g. dates and names). - */ - -typedef u_int32_t cnid_t; - -/* - * Catalog Node Descriptor (runtime) - */ -struct cat_desc { - u_int8_t cd_flags; /* see below (8 bits) */ - u_int8_t cd_encoding; /* name encoding */ - int16_t cd_namelen; /* length of cnode name */ - cnid_t cd_parentcnid; /* parent directory CNID */ - u_int32_t cd_hint; /* catalog file hint */ - cnid_t cd_cnid; /* cnode id (for getattrlist) */ - const u_int8_t * cd_nameptr; /* pointer to cnode name */ -}; - -/* cd_flags - * - * CD_EOF is used by hfs_vnop_readdir / cat_getdirentries to indicate EOF was - * encountered during a directory enumeration. 
When this flag is observed - * on the next call to hfs_vnop_readdir it tells the caller that there's no - * need to descend into the catalog as EOF was encountered during the last call. - * This flag should only be set on the descriptor embedded in the directoryhint. - */ - -#define CD_HASBUF 0x01 /* allocated filename buffer */ -#define CD_DECOMPOSED 0x02 /* name is fully decomposed */ -#define CD_EOF 0x04 /* see above */ -#define CD_ISMETA 0x40 /* describes a metadata file */ -#define CD_ISDIR 0x80 /* describes a directory */ - -/* - * Catalog Node Attributes (runtime) - */ -struct cat_attr { - cnid_t ca_fileid; /* inode number (for stat) normally == cnid */ - mode_t ca_mode; /* file access mode and type (16 bits) */ - u_int16_t ca_recflags; /* catalog record flags (16 bit integer) */ - u_int32_t ca_linkcount; /* real hard link count */ - uid_t ca_uid; /* file owner */ - gid_t ca_gid; /* file group */ - union { - dev_t cau_rdev; /* special file device (VBLK or VCHAR only) */ - u_int32_t cau_linkref; /* hardlink reference number */ - } ca_union1; - time_t ca_atime; /* last access time */ - time_t ca_atimeondisk; /* access time value on disk */ - time_t ca_mtime; /* last data modification time */ - time_t ca_ctime; /* last file status change */ - time_t ca_itime; /* file initialization time */ - time_t ca_btime; /* last backup time */ - u_int32_t ca_flags; /* status flags (chflags) */ - union { - u_int32_t cau_blocks; /* total file blocks used (rsrc + data) */ - u_int32_t cau_entries; /* total directory entries (valence) */ - } ca_union2; - union { - u_int32_t cau_dircount; /* count of sub dirs (for posix nlink) */ - u_int32_t cau_firstlink; /* first hardlink link (files only) */ - } ca_union3; - union { - u_int8_t ca_finderinfo[32]; /* Opaque Finder information */ - struct { - FndrFileInfo ca_finderfileinfo; - struct FndrExtendedFileInfo ca_finderextendedfileinfo; - }; - struct { - FndrDirInfo ca_finderdirinfo; - struct FndrExtendedDirInfo 
ca_finderextendeddirinfo; - }; - }; -}; - -/* Aliases for common fields */ -#define ca_rdev ca_union1.cau_rdev -#define ca_linkref ca_union1.cau_linkref -#define ca_blocks ca_union2.cau_blocks -#define ca_entries ca_union2.cau_entries -#define ca_dircount ca_union3.cau_dircount -#define ca_firstlink ca_union3.cau_firstlink - -/* - * Catalog Node Fork (runtime) - * - * NOTE: this is not the same as a struct HFSPlusForkData - * - * NOTE: if cf_new_size > cf_size, then a write is in progress and is extending - * the EOF; the new EOF will be cf_new_size. Writes and pageouts may validly - * write up to cf_new_size, but reads should only read up to cf_size. When - * an extending write is not in progress, cf_new_size is zero. - */ -struct cat_fork { - off_t cf_size; /* fork's logical size in bytes */ - off_t cf_new_size; /* fork's logical size after write completes */ - union { - u_int32_t cfu_clump; /* fork's clump size in bytes (sys files only) */ - u_int64_t cfu_bytesread; /* bytes read from this fork */ - } cf_union; - u_int32_t cf_vblocks; /* virtual (unalloated) blocks */ - u_int32_t cf_blocks; /* total blocks used by this fork */ - struct HFSPlusExtentDescriptor cf_extents[8]; /* initial set of extents */ - - /* - * NOTE: If you change this structure, make sure you change you change - * hfs_fork_copy. - */ -}; - -#define cf_clump cf_union.cfu_clump -#define cf_bytesread cf_union.cfu_bytesread - -void hfs_fork_copy(struct cat_fork *dst, const struct cat_fork *src, - HFSPlusExtentDescriptor *extents); - -/* - * Directory Hint - * Used to hold state across directory enumerations. 
- * - */ -struct directoryhint { - TAILQ_ENTRY(directoryhint) dh_link; /* chain */ - int dh_index; /* index into directory (zero relative) */ - u_int32_t dh_threadhint; /* node hint of a directory's thread record */ - u_int32_t dh_time; - struct cat_desc dh_desc; /* entry's descriptor */ -}; -typedef struct directoryhint directoryhint_t; - -/* - * HFS_MAXDIRHINTS cannot be larger than 63 without reducing - * HFS_INDEX_BITS, because given the 6-bit tag, at most 63 different - * tags can exist. When HFS_MAXDIRHINTS is larger than 63, the same - * list may contain dirhints of the same tag, and a staled dirhint may - * be returned. - */ -#define HFS_MAXDIRHINTS 32 -#define HFS_DIRHINT_TTL 45 - -#define HFS_INDEX_MASK 0x03ffffff -#define HFS_INDEX_BITS 26 - - -/* - * Catalog Node Entry - * - * A cat_entry is used for bulk enumerations (hfs_readdirattr). - */ -struct cat_entry { - struct cat_desc ce_desc; - struct cat_attr ce_attr; - off_t ce_datasize; - off_t ce_rsrcsize; - u_int32_t ce_datablks; - u_int32_t ce_rsrcblks; -}; - -/* - * Starting in 10.5, hfs_vnop_readdirattr() only makes one - * call to cat_getentriesattr(). So we increased MAXCATENTRIES - * while keeping the total size of the CE LIST buffer <= 8K - * (which works out to be 60 entries per call). The 8K limit - * keeps the memory coming from a kalloc zone instead of - * valuable/fragment-able kernel map space. - */ -#define MAXCATENTRIES \ - (1 + (8192 - sizeof (struct cat_entrylist)) / sizeof (struct cat_entry)) - -/* - * Catalog Node Entry List - * - * A cat_entrylist is a list of Catalog Node Entries. 
- */ -struct cat_entrylist { - u_int32_t maxentries; /* number of entries requested */ - u_int32_t realentries; /* number of valid entries returned */ - u_int32_t skipentries; /* number of entries skipped (reserved HFS+ files) */ - struct cat_entry entry[1]; /* array of entries */ -}; - -#define CE_LIST_SIZE(entries) \ - sizeof (*ce_list) + (((entries) - 1) * sizeof (struct cat_entry)) - -struct hfsmount; - -/* - * Catalog FileID/CNID Acquisition / Lookup - * - * Some use-cases require that we find a valid CNID - * before we may be ready to enter the item into the namespace. - * In order to resolve this, we support a hashtable attached to - * the mount that is secured by the catalog lock. - * - * Finding the next valid CNID is easy if the wraparound bit is - * not set -- you just pull from the hfsmp next pointer. - * If it is set then you must find a free entry in the catalog - * and also query the hashtable to see if the item is free or not. - * - * If you want to request a CNID before there is a backing item - * in the catalog, you must find one that is valid, then insert - * it into the hash table until such time that the item is - * inserted into the catalog. After successful catalog insertion, - * you must remove the item from the hashtable. 
- */ - -typedef struct cat_preflightid { - cnid_t fileid; - LIST_ENTRY(cat_preflightid) id_hash; -} cat_preflightid_t; - -extern int cat_remove_idhash (cat_preflightid_t *preflight); -extern int cat_insert_idhash (struct hfsmount *hfsmp, cat_preflightid_t *preflight); -extern int cat_check_idhash (struct hfsmount *hfsmp, cnid_t test_fileid); - -/* initialize the id look up hashtable during mount */ -extern void hfs_idhash_init (struct hfsmount *hfsmp); - -/* release the id lookup hashtable during unmount */ -extern void hfs_idhash_destroy (struct hfsmount *hfsmp); - -/* Get a new CNID for use */ -extern int cat_acquire_cnid (struct hfsmount *hfsmp, cnid_t *new_cnid); - - -/* default size of ID hash is 64 entries */ -#define HFS_IDHASH_DEFAULT 64 - - -/* - * Catalog Operations Hint - * - * lower 16 bits: count of B-tree insert operations - * upper 16 bits: count of B-tree delete operations - * - */ -#define CAT_DELETE 0x00010000 -#define CAT_CREATE 0x00000002 -#define CAT_RENAME 0x00010002 -#define CAT_EXCHANGE 0x00010002 - -typedef u_int32_t catops_t; - -/* - * The size of cat_cookie_t much match the size of - * the nreserve struct (in BTreeNodeReserve.c). 
- */ -typedef struct cat_cookie_t { -#if defined(__LP64__) - char opaque[40]; -#else - char opaque[24]; -#endif -} cat_cookie_t; - -/* Universal catalog key */ -union CatalogKey { - HFSCatalogKey hfs; - HFSPlusCatalogKey hfsPlus; -}; -typedef union CatalogKey CatalogKey; - -/* Universal catalog data record */ -union CatalogRecord { - int16_t recordType; - HFSCatalogFolder hfsFolder; - HFSCatalogFile hfsFile; - HFSCatalogThread hfsThread; - HFSPlusCatalogFolder hfsPlusFolder; - HFSPlusCatalogFile hfsPlusFile; - HFSPlusCatalogThread hfsPlusThread; -}; -typedef union CatalogRecord CatalogRecord; - -/* Constants for HFS fork types */ -enum { - kHFSDataForkType = 0x0, /* data fork */ - kHFSResourceForkType = 0xff /* resource fork */ -}; - -/* - * Catalog Interface - * - * These functions perform a catalog transactions. The - * catalog b-tree is abstracted through this interface. - * (please don't go around it) - */ - - -extern void cat_releasedesc(struct cat_desc *descp); - -extern int cat_create ( struct hfsmount *hfsmp, - cnid_t new_fileid, - struct cat_desc *descp, - struct cat_attr *attrp, - struct cat_desc *out_descp); - -extern int cat_delete ( struct hfsmount *hfsmp, - struct cat_desc *descp, - struct cat_attr *attrp); - -extern int cat_lookup ( struct hfsmount *hfsmp, - struct cat_desc *descp, - int wantrsrc, - int force_casesensitive_lookup, - struct cat_desc *outdescp, - struct cat_attr *attrp, - struct cat_fork *forkp, - cnid_t *desc_cnid); - -extern int cat_idlookup (struct hfsmount *hfsmp, - cnid_t cnid, - int allow_system_files, - int wantrsrc, - struct cat_desc *outdescp, - struct cat_attr *attrp, - struct cat_fork *forkp); - -extern int cat_findname (struct hfsmount *hfsmp, - cnid_t cnid, - struct cat_desc *outdescp); - -extern int cat_getentriesattr( - struct hfsmount *hfsmp, - directoryhint_t *dirhint, - struct cat_entrylist *ce_list, - int *reachedeof); - -extern int cat_rename ( struct hfsmount * hfsmp, - struct cat_desc * from_cdp, - struct cat_desc 
* todir_cdp, - struct cat_desc * to_cdp, - struct cat_desc * cdp); - -extern int cat_update ( struct hfsmount *hfsmp, - struct cat_desc *descp, - struct cat_attr *attrp, - const struct cat_fork *dataforkp, - const struct cat_fork *rsrcforkp); - -extern int cat_getdirentries( - struct hfsmount *hfsmp, - u_int32_t entrycnt, - directoryhint_t *dirhint, - uio_t uio, - int extended, - int * items, - int * eofflag); - -extern int cat_insertfilethread ( - struct hfsmount *hfsmp, - struct cat_desc *descp); - -extern int cat_preflight( - struct hfsmount *hfsmp, - catops_t ops, - cat_cookie_t *cookie, - struct proc *p); - -extern void cat_postflight( - struct hfsmount *hfsmp, - cat_cookie_t *cookie, - struct proc *p); - -extern int cat_binarykeycompare( - HFSPlusCatalogKey *searchKey, - HFSPlusCatalogKey *trialKey); - -extern int CompareCatalogKeys( - HFSCatalogKey *searchKey, - HFSCatalogKey *trialKey); - -extern int CompareExtendedCatalogKeys( - HFSPlusCatalogKey *searchKey, - HFSPlusCatalogKey *trialKey); - -extern void cat_convertattr( - struct hfsmount *hfsmp, - CatalogRecord * recp, - struct cat_attr *attrp, - struct cat_fork *datafp, - struct cat_fork *rsrcfp); - -extern int cat_convertkey( - struct hfsmount *hfsmp, - CatalogKey *key, - CatalogRecord * recp, - struct cat_desc *descp); - -extern int cat_getkeyplusattr( - struct hfsmount *hfsmp, - cnid_t cnid, - CatalogKey *key, - struct cat_attr *attrp); - -/* Hard link functions. 
*/ - -extern int cat_check_link_ancestry( - struct hfsmount *hfsmp, - cnid_t parentid, - cnid_t pointed_at_cnid); - -extern int cat_set_childlinkbit( - struct hfsmount *hfsmp, - cnid_t cnid); - -#define HFS_IGNORABLE_LINK 0x00000001 - -extern int cat_resolvelink( struct hfsmount *hfsmp, - u_int32_t linkref, - int isdirlink, - struct HFSPlusCatalogFile *recp); - -extern int cat_createlink( struct hfsmount *hfsmp, - struct cat_desc *descp, - struct cat_attr *attr, - cnid_t nextlinkid, - cnid_t *linkfileid); - -/* Finder Info's file type and creator for directory hard link alias */ -enum { - kHFSAliasType = 0x66647270, /* 'fdrp' */ - kHFSAliasCreator = 0x4D414353 /* 'MACS' */ -}; - -extern int cat_deletelink( struct hfsmount *hfsmp, - struct cat_desc *descp); - -extern int cat_update_siblinglinks( struct hfsmount *hfsmp, - cnid_t linkfileid, - cnid_t prevlinkid, - cnid_t nextlinkid); - -extern int cat_lookuplink( struct hfsmount *hfsmp, - struct cat_desc *descp, - cnid_t *linkfileid, - cnid_t *prevlinkid, - cnid_t *nextlinkid); - -extern int cat_lookup_siblinglinks( struct hfsmount *hfsmp, - cnid_t linkfileid, - cnid_t *prevlinkid, - cnid_t *nextlinkid); - -extern int cat_lookup_lastlink( struct hfsmount *hfsmp, - cnid_t startid, - cnid_t *nextlinkid, - struct cat_desc *cdesc); - -extern int cat_lookup_dirlink(struct hfsmount *hfsmp, - cnid_t dirlink_id, - u_int8_t forktype, - struct cat_desc *outdescp, - struct cat_attr *attrp, - struct cat_fork *forkp); - -extern int cat_update_dirlink(struct hfsmount *hfsmp, - u_int8_t forktype, - struct cat_desc *descp, - struct cat_attr *attrp, - struct cat_fork *rsrcforkp); - -#endif /* __APPLE_API_PRIVATE */ -#endif /* KERNEL */ -#endif /* __HFS_CATALOG__ */ diff --git a/bsd/hfs/hfs_chash.c b/bsd/hfs/hfs_chash.c deleted file mode 100644 index c52dc7521..000000000 --- a/bsd/hfs/hfs_chash.c +++ /dev/null @@ -1,585 +0,0 @@ -/* - * Copyright (c) 2002-2012 Apple Inc. All rights reserved. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -/* - * Copyright (c) 1982, 1986, 1989, 1991, 1993, 1995 - * The Regents of the University of California. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. 
All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)hfs_chash.c - * derived from @(#)ufs_ihash.c 8.7 (Berkeley) 5/17/95 - */ - -#include -#include -#include -#include -#include -#include -#include - - -#include "hfs.h" /* XXX bringup */ -#include "hfs_cnode.h" - -extern lck_attr_t * hfs_lock_attr; -extern lck_grp_t * hfs_mutex_group; -extern lck_grp_t * hfs_rwlock_group; - -lck_grp_t * chash_lck_grp; -lck_grp_attr_t * chash_lck_grp_attr; -lck_attr_t * chash_lck_attr; - - -#define CNODEHASH(hfsmp, inum) (&hfsmp->hfs_cnodehashtbl[(inum) & hfsmp->hfs_cnodehash]) - - -/* - * Initialize cnode hash table. 
- */ -__private_extern__ -void -hfs_chashinit() -{ - chash_lck_grp_attr= lck_grp_attr_alloc_init(); - chash_lck_grp = lck_grp_alloc_init("cnode_hash", chash_lck_grp_attr); - chash_lck_attr = lck_attr_alloc_init(); -} - -static void hfs_chash_lock(struct hfsmount *hfsmp) -{ - lck_mtx_lock(&hfsmp->hfs_chash_mutex); -} - -static void hfs_chash_lock_spin(struct hfsmount *hfsmp) -{ - lck_mtx_lock_spin(&hfsmp->hfs_chash_mutex); -} - -static void hfs_chash_lock_convert (__unused struct hfsmount *hfsmp) -{ - lck_mtx_convert_spin(&hfsmp->hfs_chash_mutex); -} - -static void hfs_chash_unlock(struct hfsmount *hfsmp) -{ - lck_mtx_unlock(&hfsmp->hfs_chash_mutex); -} - -__private_extern__ -void -hfs_chashinit_finish(struct hfsmount *hfsmp) -{ - lck_mtx_init(&hfsmp->hfs_chash_mutex, chash_lck_grp, chash_lck_attr); - - hfsmp->hfs_cnodehashtbl = hashinit(desiredvnodes / 4, M_HFSMNT, &hfsmp->hfs_cnodehash); -} - -__private_extern__ -void -hfs_delete_chash(struct hfsmount *hfsmp) -{ - lck_mtx_destroy(&hfsmp->hfs_chash_mutex, chash_lck_grp); - - FREE(hfsmp->hfs_cnodehashtbl, M_HFSMNT); -} - - -/* - * Use the device, inum pair to find the incore cnode. - * - * If it is in core, but locked, wait for it. - */ -struct vnode * -hfs_chash_getvnode(struct hfsmount *hfsmp, ino_t inum, int wantrsrc, int skiplock, int allow_deleted) -{ - struct cnode *cp; - struct vnode *vp; - int error; - u_int32_t vid; - - /* - * Go through the hash list - * If a cnode is in the process of being cleaned out or being - * allocated, wait for it to be finished and then try again. - */ -loop: - hfs_chash_lock_spin(hfsmp); - - for (cp = CNODEHASH(hfsmp, inum)->lh_first; cp; cp = cp->c_hash.le_next) { - if (cp->c_fileid != inum) - continue; - /* Wait if cnode is being created or reclaimed. 
*/ - if (ISSET(cp->c_hflag, H_ALLOC | H_TRANSIT | H_ATTACH)) { - SET(cp->c_hflag, H_WAITING); - - (void) msleep(cp, &hfsmp->hfs_chash_mutex, PDROP | PINOD, - "hfs_chash_getvnode", 0); - goto loop; - } - /* Obtain the desired vnode. */ - vp = wantrsrc ? cp->c_rsrc_vp : cp->c_vp; - if (vp == NULLVP) - goto exit; - - vid = vnode_vid(vp); - hfs_chash_unlock(hfsmp); - - if ((error = vnode_getwithvid(vp, vid))) { - /* - * If vnode is being reclaimed, or has - * already changed identity, no need to wait - */ - return (NULL); - } - if (!skiplock && hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT) != 0) { - vnode_put(vp); - return (NULL); - } - - /* - * Skip cnodes that are not in the name space anymore - * we need to check with the cnode lock held because - * we may have blocked acquiring the vnode ref or the - * lock on the cnode which would allow the node to be - * unlinked - */ - if (!allow_deleted) { - if (cp->c_flag & (C_NOEXISTS | C_DELETED)) { - if (!skiplock) { - hfs_unlock(cp); - } - vnode_put(vp); - return (NULL); - } - } - return (vp); - } -exit: - hfs_chash_unlock(hfsmp); - return (NULL); -} - - -/* - * Use the device, fileid pair to snoop an incore cnode. - * - * A cnode can exists in chash even after it has been - * deleted from the catalog, so this function returns - * ENOENT if C_NOEXIST is set in the cnode's flag. - * - */ -int -hfs_chash_snoop(struct hfsmount *hfsmp, ino_t inum, int existence_only, - int (*callout)(const cnode_t *cp, void *), void * arg) -{ - struct cnode *cp; - int result = ENOENT; - - /* - * Go through the hash list - * If a cnode is in the process of being cleaned out or being - * allocated, wait for it to be finished and then try again. - */ - hfs_chash_lock(hfsmp); - - for (cp = CNODEHASH(hfsmp, inum)->lh_first; cp; cp = cp->c_hash.le_next) { - if (cp->c_fileid != inum) - continue; - - /* - * Under normal circumstances, we would want to return ENOENT if a cnode is in - * the hash and it is marked C_NOEXISTS or C_DELETED. 
However, if the CNID - * namespace has wrapped around, then we have the possibility of collisions. - * In that case, we may use this function to validate whether or not we - * should trust the nextCNID value in the hfs mount point. - * - * If we didn't do this, then it would be possible for a cnode that is no longer backed - * by anything on-disk (C_NOEXISTS) to still exist in the hash along with its - * vnode. The cat_create routine could then create a new entry in the catalog - * re-using that CNID. Then subsequent hfs_getnewvnode calls will repeatedly fail - * trying to look it up/validate it because it is marked C_NOEXISTS. So we want - * to prevent that from happening as much as possible. - */ - if (existence_only) { - result = 0; - break; - } - - /* Skip cnodes that have been removed from the catalog */ - if (cp->c_flag & (C_NOEXISTS | C_DELETED)) { - result = EACCES; - break; - } - - /* Skip cnodes being created or reclaimed. */ - if (!ISSET(cp->c_hflag, H_ALLOC | H_TRANSIT | H_ATTACH)) { - result = callout(cp, arg); - } - break; - } - hfs_chash_unlock(hfsmp); - - return (result); -} - - -/* - * Use the device, fileid pair to find the incore cnode. - * If no cnode if found one is created - * - * If it is in core, but locked, wait for it. - * - * If the cnode is C_DELETED, then return NULL since that - * inum is no longer valid for lookups (open-unlinked file). - * - * If the cnode is C_DELETED but also marked C_RENAMED, then that means - * the cnode was renamed over and a new entry exists in its place. The caller - * should re-drive the lookup to get the newer entry. In that case, we'll still - * return NULL for the cnode, but also return GNV_CHASH_RENAMED in the output flags - * of this function to indicate the caller that they should re-drive. 
- */ -struct cnode * -hfs_chash_getcnode(struct hfsmount *hfsmp, ino_t inum, struct vnode **vpp, - int wantrsrc, int skiplock, int *out_flags, int *hflags) -{ - struct cnode *cp; - struct cnode *ncp = NULL; - vnode_t vp; - u_int32_t vid; - - /* - * Go through the hash list - * If a cnode is in the process of being cleaned out or being - * allocated, wait for it to be finished and then try again. - */ -loop: - hfs_chash_lock_spin(hfsmp); - -loop_with_lock: - for (cp = CNODEHASH(hfsmp, inum)->lh_first; cp; cp = cp->c_hash.le_next) { - if (cp->c_fileid != inum) - continue; - /* - * Wait if cnode is being created, attached to or reclaimed. - */ - if (ISSET(cp->c_hflag, H_ALLOC | H_ATTACH | H_TRANSIT)) { - SET(cp->c_hflag, H_WAITING); - - (void) msleep(cp, &hfsmp->hfs_chash_mutex, PINOD, - "hfs_chash_getcnode", 0); - goto loop_with_lock; - } - vp = wantrsrc ? cp->c_rsrc_vp : cp->c_vp; - if (vp == NULL) { - /* - * The desired vnode isn't there so tag the cnode. - */ - SET(cp->c_hflag, H_ATTACH); - *hflags |= H_ATTACH; - - hfs_chash_unlock(hfsmp); - } else { - vid = vnode_vid(vp); - - hfs_chash_unlock(hfsmp); - - if (vnode_getwithvid(vp, vid)) - goto loop; - } - if (ncp) { - /* - * someone else won the race to create - * this cnode and add it to the hash - * just dump our allocation - */ - FREE_ZONE(ncp, sizeof(struct cnode), M_HFSNODE); - ncp = NULL; - } - - if (!skiplock) { - hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); - } - - /* - * Skip cnodes that are not in the name space anymore - * we need to check with the cnode lock held because - * we may have blocked acquiring the vnode ref or the - * lock on the cnode which would allow the node to be - * unlinked. - * - * Don't return a cnode in this case since the inum - * is no longer valid for lookups. 
- */ - if ((cp->c_flag & (C_NOEXISTS | C_DELETED)) && !wantrsrc) { - int renamed = 0; - if (cp->c_flag & C_RENAMED) { - renamed = 1; - } - if (!skiplock) - hfs_unlock(cp); - if (vp != NULLVP) { - vnode_put(vp); - } else { - hfs_chash_lock_spin(hfsmp); - CLR(cp->c_hflag, H_ATTACH); - *hflags &= ~H_ATTACH; - if (ISSET(cp->c_hflag, H_WAITING)) { - CLR(cp->c_hflag, H_WAITING); - wakeup((caddr_t)cp); - } - hfs_chash_unlock(hfsmp); - } - vp = NULL; - cp = NULL; - if (renamed) { - *out_flags = GNV_CHASH_RENAMED; - } - } - *vpp = vp; - return (cp); - } - - /* - * Allocate a new cnode - */ - if (skiplock && !wantrsrc) - panic("%s - should never get here when skiplock is set \n", __FUNCTION__); - - if (ncp == NULL) { - hfs_chash_unlock(hfsmp); - - MALLOC_ZONE(ncp, struct cnode *, sizeof(struct cnode), M_HFSNODE, M_WAITOK); - /* - * since we dropped the chash lock, - * we need to go back and re-verify - * that this node hasn't come into - * existence... - */ - goto loop; - } - hfs_chash_lock_convert(hfsmp); - - bzero(ncp, sizeof(struct cnode)); - SET(ncp->c_hflag, H_ALLOC); - *hflags |= H_ALLOC; - ncp->c_fileid = inum; - TAILQ_INIT(&ncp->c_hintlist); /* make the list empty */ - TAILQ_INIT(&ncp->c_originlist); - - lck_rw_init(&ncp->c_rwlock, hfs_rwlock_group, hfs_lock_attr); - if (!skiplock) - (void) hfs_lock(ncp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); - - /* Insert the new cnode with it's H_ALLOC flag set */ - LIST_INSERT_HEAD(CNODEHASH(hfsmp, inum), ncp, c_hash); - hfs_chash_unlock(hfsmp); - - *vpp = NULL; - return (ncp); -} - - -__private_extern__ -void -hfs_chashwakeup(struct hfsmount *hfsmp, struct cnode *cp, int hflags) -{ - hfs_chash_lock_spin(hfsmp); - - CLR(cp->c_hflag, hflags); - - if (ISSET(cp->c_hflag, H_WAITING)) { - CLR(cp->c_hflag, H_WAITING); - wakeup((caddr_t)cp); - } - hfs_chash_unlock(hfsmp); -} - - -/* - * Re-hash two cnodes in the hash table. 
- */ -__private_extern__ -void -hfs_chash_rehash(struct hfsmount *hfsmp, struct cnode *cp1, struct cnode *cp2) -{ - hfs_chash_lock_spin(hfsmp); - - LIST_REMOVE(cp1, c_hash); - LIST_REMOVE(cp2, c_hash); - LIST_INSERT_HEAD(CNODEHASH(hfsmp, cp1->c_fileid), cp1, c_hash); - LIST_INSERT_HEAD(CNODEHASH(hfsmp, cp2->c_fileid), cp2, c_hash); - - hfs_chash_unlock(hfsmp); -} - - -/* - * Remove a cnode from the hash table. - */ -__private_extern__ -int -hfs_chashremove(struct hfsmount *hfsmp, struct cnode *cp) -{ - hfs_chash_lock_spin(hfsmp); - - /* Check if a vnode is getting attached */ - if (ISSET(cp->c_hflag, H_ATTACH)) { - hfs_chash_unlock(hfsmp); - return (EBUSY); - } - if (cp->c_hash.le_next || cp->c_hash.le_prev) { - LIST_REMOVE(cp, c_hash); - cp->c_hash.le_next = NULL; - cp->c_hash.le_prev = NULL; - } - hfs_chash_unlock(hfsmp); - - return (0); -} - -/* - * Remove a cnode from the hash table and wakeup any waiters. - */ -__private_extern__ -void -hfs_chash_abort(struct hfsmount *hfsmp, struct cnode *cp) -{ - hfs_chash_lock_spin(hfsmp); - - LIST_REMOVE(cp, c_hash); - cp->c_hash.le_next = NULL; - cp->c_hash.le_prev = NULL; - - CLR(cp->c_hflag, H_ATTACH | H_ALLOC); - if (ISSET(cp->c_hflag, H_WAITING)) { - CLR(cp->c_hflag, H_WAITING); - wakeup((caddr_t)cp); - } - hfs_chash_unlock(hfsmp); -} - - -/* - * mark a cnode as in transition - */ -__private_extern__ -void -hfs_chash_mark_in_transit(struct hfsmount *hfsmp, struct cnode *cp) -{ - hfs_chash_lock_spin(hfsmp); - - SET(cp->c_hflag, H_TRANSIT); - - hfs_chash_unlock(hfsmp); -} - -/* Search a cnode in the hash. This function does not return cnode which - * are getting created, destroyed or in transition. Note that this function - * does not acquire the cnode hash mutex, and expects the caller to acquire it. - * On success, returns pointer to the cnode found. On failure, returns NULL. 
- */ -static -struct cnode * -hfs_chash_search_cnid(struct hfsmount *hfsmp, cnid_t cnid) -{ - struct cnode *cp; - - for (cp = CNODEHASH(hfsmp, cnid)->lh_first; cp; cp = cp->c_hash.le_next) { - if (cp->c_fileid == cnid) { - break; - } - } - - /* If cnode is being created or reclaimed, return error. */ - if (cp && ISSET(cp->c_hflag, H_ALLOC | H_TRANSIT | H_ATTACH)) { - cp = NULL; - } - - return cp; -} - -/* Search a cnode corresponding to given device and ID in the hash. If the - * found cnode has kHFSHasChildLinkBit cleared, set it. If the cnode is not - * found, no new cnode is created and error is returned. - * - * Return values - - * -1 : The cnode was not found. - * 0 : The cnode was found, and the kHFSHasChildLinkBit was already set. - * 1 : The cnode was found, the kHFSHasChildLinkBit was not set, and the - * function had to set that bit. - */ -__private_extern__ -int -hfs_chash_set_childlinkbit(struct hfsmount *hfsmp, cnid_t cnid) -{ - int retval = -1; - struct cnode *cp; - - hfs_chash_lock_spin(hfsmp); - - cp = hfs_chash_search_cnid(hfsmp, cnid); - if (cp) { - if (cp->c_attr.ca_recflags & kHFSHasChildLinkMask) { - retval = 0; - } else { - cp->c_attr.ca_recflags |= kHFSHasChildLinkMask; - retval = 1; - } - } - hfs_chash_unlock(hfsmp); - - return retval; -} diff --git a/bsd/hfs/hfs_cnode.c b/bsd/hfs/hfs_cnode.c deleted file mode 100644 index 668cc7870..000000000 --- a/bsd/hfs/hfs_cnode.c +++ /dev/null @@ -1,2578 +0,0 @@ -/* - * Copyright (c) 2002-2015 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -extern int prtactive; - -extern lck_attr_t * hfs_lock_attr; -extern lck_grp_t * hfs_mutex_group; -extern lck_grp_t * hfs_rwlock_group; - -static void hfs_reclaim_cnode(hfsmount_t *hfsmp, struct cnode *); -static int hfs_cnode_teardown (struct vnode *vp, vfs_context_t ctx, int reclaim); -static int hfs_isordered(struct cnode *, struct cnode *); - -extern int hfs_removefile_callback(struct buf *bp, void *hfsmp); - - -__inline__ int hfs_checkdeleted (struct cnode *cp) { - return ((cp->c_flag & (C_DELETED | C_NOEXISTS)) ? ENOENT : 0); -} - -/* - * Function used by a special fcntl() that decorates a cnode/vnode that - * indicates it is backing another filesystem, like a disk image. 
- * - * the argument 'val' indicates whether or not to set the bit in the cnode flags - * - * Returns non-zero on failure. 0 on success - */ -int hfs_set_backingstore (struct vnode *vp, int val) { - struct cnode *cp = NULL; - int err = 0; - - cp = VTOC(vp); - if (!vnode_isreg(vp) && !vnode_isdir(vp)) { - return EINVAL; - } - - /* lock the cnode */ - err = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); - if (err) { - return err; - } - - if (val) { - cp->c_flag |= C_BACKINGSTORE; - } - else { - cp->c_flag &= ~C_BACKINGSTORE; - } - - /* unlock everything */ - hfs_unlock (cp); - - return err; -} - -/* - * Function used by a special fcntl() that check to see if a cnode/vnode - * indicates it is backing another filesystem, like a disk image. - * - * the argument 'val' is an output argument for whether or not the bit is set - * - * Returns non-zero on failure. 0 on success - */ - -int hfs_is_backingstore (struct vnode *vp, int *val) { - struct cnode *cp = NULL; - int err = 0; - - if (!vnode_isreg(vp) && !vnode_isdir(vp)) { - *val = 0; - return 0; - } - - cp = VTOC(vp); - - /* lock the cnode */ - err = hfs_lock (cp, HFS_SHARED_LOCK, HFS_LOCK_DEFAULT); - if (err) { - return err; - } - - if (cp->c_flag & C_BACKINGSTORE) { - *val = 1; - } - else { - *val = 0; - } - - /* unlock everything */ - hfs_unlock (cp); - - return err; -} - - -/* - * hfs_cnode_teardown - * - * This is an internal function that is invoked from both hfs_vnop_inactive - * and hfs_vnop_reclaim. As VNOP_INACTIVE is not necessarily called from vnodes - * being recycled and reclaimed, it is important that we do any post-processing - * necessary for the cnode in both places. Important tasks include things such as - * releasing the blocks from an open-unlinked file when all references to it have dropped, - * and handling resource forks separately from data forks. - * - * Note that we take only the vnode as an argument here (rather than the cnode). 
- * Recall that each cnode supports two forks (rsrc/data), and we can always get the right - * cnode from either of the vnodes, but the reverse is not true -- we can't determine which - * vnode we need to reclaim if only the cnode is supplied. - * - * This function is idempotent and safe to call from both hfs_vnop_inactive and hfs_vnop_reclaim - * if both are invoked right after the other. In the second call, most of this function's if() - * conditions will fail, since they apply generally to cnodes still marked with C_DELETED. - * As a quick check to see if this function is necessary, determine if the cnode is already - * marked C_NOEXISTS. If it is, then it is safe to skip this function. The only tasks that - * remain for cnodes marked in such a fashion is to teardown their fork references and - * release all directory hints and hardlink origins. However, both of those are done - * in hfs_vnop_reclaim. hfs_update, by definition, is not necessary if the cnode's catalog - * entry is no longer there. - * - * 'reclaim' argument specifies whether or not we were called from hfs_vnop_reclaim. If we are - * invoked from hfs_vnop_reclaim, we can not call functions that cluster_push since the UBC info - * is totally gone by that point. - * - * Assumes that both truncate and cnode locks for 'cp' are held. - */ -static -int hfs_cnode_teardown (struct vnode *vp, vfs_context_t ctx, int reclaim) -{ - int forkcount = 0; - enum vtype v_type; - struct cnode *cp; - int error = 0; - bool started_tr = false; - struct hfsmount *hfsmp = VTOHFS(vp); - struct proc *p = vfs_context_proc(ctx); - int truncated = 0; - cat_cookie_t cookie; - int cat_reserve = 0; - int lockflags; - int ea_error = 0; - - v_type = vnode_vtype(vp); - cp = VTOC(vp); - - if (cp->c_datafork) { - ++forkcount; - } - if (cp->c_rsrcfork) { - ++forkcount; - } - - /* - * Push file data out for normal files that haven't been evicted from - * the namespace. 
We only do this if this function was not called from reclaim, - * because by that point the UBC information has been totally torn down. - * - * There should also be no way that a normal file that has NOT been deleted from - * the namespace to skip INACTIVE and go straight to RECLAIM. That race only happens - * when the file becomes open-unlinked. - */ - if ((v_type == VREG) && - (!ISSET(cp->c_flag, C_DELETED)) && - (!ISSET(cp->c_flag, C_NOEXISTS)) && - (VTOF(vp)->ff_blocks) && - (reclaim == 0)) { - /* - * If we're called from hfs_vnop_inactive, all this means is at the time - * the logic for deciding to call this function, there were not any lingering - * mmap/fd references for this file. However, there is nothing preventing the system - * from creating a new reference in between the time that logic was checked - * and we entered hfs_vnop_inactive. As a result, the only time we can guarantee - * that there aren't any references is during vnop_reclaim. - */ - hfs_filedone(vp, ctx, 0); - } - - /* - * Remove any directory hints or cached origins - */ - if (v_type == VDIR) { - hfs_reldirhints(cp, 0); - } - if (cp->c_flag & C_HARDLINK) { - hfs_relorigins(cp); - } - - /* - * -- Handle open unlinked files -- - * - * If the vnode is in use, it means a force unmount is in progress - * in which case we defer cleaning up until either we come back - * through here via hfs_vnop_reclaim, at which point the UBC - * information will have been torn down and the vnode might no - * longer be in use, or if it's still in use, it will get cleaned - * up when next remounted. - */ - if (ISSET(cp->c_flag, C_DELETED) && !vnode_isinuse(vp, 0)) { - /* - * This check is slightly complicated. We should only truncate data - * in very specific cases for open-unlinked files. This is because - * we want to ensure that the resource fork continues to be available - * if the caller has the data fork open. 
However, this is not symmetric; - * someone who has the resource fork open need not be able to access the data - * fork once the data fork has gone inactive. - * - * If we're the last fork, then we have cleaning up to do. - * - * A) last fork, and vp == c_vp - * Truncate away own fork data. If rsrc fork is not in core, truncate it too. - * - * B) last fork, and vp == c_rsrc_vp - * Truncate ourselves, assume data fork has been cleaned due to C). - * - * If we're not the last fork, then things are a little different: - * - * C) not the last fork, vp == c_vp - * Truncate ourselves. Once the file has gone out of the namespace, - * it cannot be further opened. Further access to the rsrc fork may - * continue, however. - * - * D) not the last fork, vp == c_rsrc_vp - * Don't enter the block below, just clean up vnode and push it out of core. - */ - - if ((v_type == VREG || v_type == VLNK) && - ((forkcount == 1) || (!VNODE_IS_RSRC(vp)))) { - - /* Truncate away our own fork data. (Case A, B, C above) */ - if (VTOF(vp)->ff_blocks != 0) { - /* - * SYMLINKS only: - * - * Encapsulate the entire change (including truncating the link) in - * nested transactions if we are modifying a symlink, because we know that its - * file length will be at most 4k, and we can fit both the truncation and - * any relevant bitmap changes into a single journal transaction. We also want - * the kill_block code to execute in the same transaction so that any dirty symlink - * blocks will not be written. Otherwise, rely on - * hfs_truncate doing its own transactions to ensure that we don't blow up - * the journal. - */ - if (!started_tr && (v_type == VLNK)) { - if (hfs_start_transaction(hfsmp) != 0) { - error = EINVAL; - goto out; - } - else { - started_tr = true; - } - } - - /* - * At this point, we have decided that this cnode is - * suitable for full removal. We are about to deallocate - * its blocks and remove its entry from the catalog. 
- * If it was a symlink, then it's possible that the operation - * which created it is still in the current transaction group - * due to coalescing. Take action here to kill the data blocks - * of the symlink out of the journal before moving to - * deallocate the blocks. We need to be in the middle of - * a transaction before calling buf_iterate like this. - * - * Note: we have to kill any potential symlink buffers out of - * the journal prior to deallocating their blocks. This is so - * that we don't race with another thread that may be doing an - * an allocation concurrently and pick up these blocks. It could - * generate I/O against them which could go out ahead of our journal - * transaction. - */ - - if (hfsmp->jnl && vnode_islnk(vp)) { - buf_iterate(vp, hfs_removefile_callback, BUF_SKIP_NONLOCKED, (void *)hfsmp); - } - - - /* - * This truncate call (and the one below) is fine from VNOP_RECLAIM's - * context because we're only removing blocks, not zero-filling new - * ones. The C_DELETED check above makes things much simpler. - */ - error = hfs_truncate(vp, (off_t)0, IO_NDELAY, 0, ctx); - if (error) { - goto out; - } - truncated = 1; - - /* (SYMLINKS ONLY): Close/End our transaction after truncating the file record */ - if (started_tr) { - hfs_end_transaction(hfsmp); - started_tr = false; - } - - } - - /* - * Truncate away the resource fork, if we represent the data fork and - * it is the last fork. That means, by definition, the rsrc fork is not in - * core. To avoid bringing a vnode into core for the sole purpose of deleting the - * data in the resource fork, we call cat_lookup directly, then hfs_release_storage - * to get rid of the resource fork's data. Note that because we are holding the - * cnode lock, it is impossible for a competing thread to create the resource fork - * vnode from underneath us while we do this. - * - * This is invoked via case A above only. 
- */ - if ((cp->c_blocks > 0) && (forkcount == 1) && (vp != cp->c_rsrc_vp)) { - struct cat_lookup_buffer *lookup_rsrc = NULL; - struct cat_desc *desc_ptr = NULL; - lockflags = 0; - - MALLOC(lookup_rsrc, struct cat_lookup_buffer*, sizeof (struct cat_lookup_buffer), M_TEMP, M_WAITOK); - if (lookup_rsrc == NULL) { - printf("hfs_cnode_teardown: ENOMEM from MALLOC\n"); - error = ENOMEM; - goto out; - } - else { - bzero (lookup_rsrc, sizeof (struct cat_lookup_buffer)); - } - - if (cp->c_desc.cd_namelen == 0) { - /* Initialize the rsrc descriptor for lookup if necessary*/ - MAKE_DELETED_NAME (lookup_rsrc->lookup_name, HFS_TEMPLOOKUP_NAMELEN, cp->c_fileid); - - lookup_rsrc->lookup_desc.cd_nameptr = (const uint8_t*) lookup_rsrc->lookup_name; - lookup_rsrc->lookup_desc.cd_namelen = strlen (lookup_rsrc->lookup_name); - lookup_rsrc->lookup_desc.cd_parentcnid = hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid; - lookup_rsrc->lookup_desc.cd_cnid = cp->c_cnid; - - desc_ptr = &lookup_rsrc->lookup_desc; - } - else { - desc_ptr = &cp->c_desc; - } - - lockflags = hfs_systemfile_lock (hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); - - error = cat_lookup (hfsmp, desc_ptr, 1, 0, (struct cat_desc *) NULL, - (struct cat_attr*) NULL, &lookup_rsrc->lookup_fork.ff_data, NULL); - - hfs_systemfile_unlock (hfsmp, lockflags); - - if (error) { - FREE (lookup_rsrc, M_TEMP); - goto out; - } - - /* - * Make the filefork in our temporary struct look like a real - * filefork. Fill in the cp, sysfileinfo and rangelist fields.. - */ - rl_init (&lookup_rsrc->lookup_fork.ff_invalidranges); - lookup_rsrc->lookup_fork.ff_cp = cp; - - /* - * If there were no errors, then we have the catalog's fork information - * for the resource fork in question. Go ahead and delete the data in it now. 
- */ - - error = hfs_release_storage (hfsmp, NULL, &lookup_rsrc->lookup_fork, cp->c_fileid); - FREE(lookup_rsrc, M_TEMP); - - if (error) { - goto out; - } - - /* - * This fileid's resource fork extents have now been fully deleted on-disk - * and this CNID is no longer valid. At this point, we should be able to - * zero out cp->c_blocks to indicate there is no data left in this file. - */ - cp->c_blocks = 0; - } - } - - /* - * If we represent the last fork (or none in the case of a dir), - * and the cnode has become open-unlinked... - * - * We check c_blocks here because it is possible in the force - * unmount case for the data fork to be in use but the resource - * fork to not be in use in which case we will truncate the - * resource fork, but not the data fork. It will get cleaned - * up upon next mount. - */ - if (forkcount <= 1 && !cp->c_blocks) { - /* - * If it has EA's, then we need to get rid of them. - * - * Note that this must happen outside of any other transactions - * because it starts/ends its own transactions and grabs its - * own locks. This is to prevent a file with a lot of attributes - * from creating a transaction that is too large (which panics). - */ - if (ISSET(cp->c_attr.ca_recflags, kHFSHasAttributesMask)) - ea_error = hfs_removeallattr(hfsmp, cp->c_fileid, &started_tr); - - /* - * Remove the cnode's catalog entry and release all blocks it - * may have been using. - */ - - /* - * Mark cnode in transit so that no one can get this - * cnode from cnode hash. - */ - // hfs_chash_mark_in_transit(hfsmp, cp); - // XXXdbg - remove the cnode from the hash table since it's deleted - // otherwise someone could go to sleep on the cnode and not - // be woken up until this vnode gets recycled which could be - // a very long time... 
- hfs_chashremove(hfsmp, cp); - - cp->c_flag |= C_NOEXISTS; // XXXdbg - cp->c_rdev = 0; - - if (!started_tr) { - if (hfs_start_transaction(hfsmp) != 0) { - error = EINVAL; - goto out; - } - started_tr = true; - } - - /* - * Reserve some space in the Catalog file. - */ - if ((error = cat_preflight(hfsmp, CAT_DELETE, &cookie, p))) { - goto out; - } - cat_reserve = 1; - - lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG | SFL_ATTRIBUTE, HFS_EXCLUSIVE_LOCK); - - if (cp->c_blocks > 0) { - printf("hfs_inactive: deleting non-empty%sfile %d, " - "blks %d\n", VNODE_IS_RSRC(vp) ? " rsrc " : " ", - (int)cp->c_fileid, (int)cp->c_blocks); - } - - // - // release the name pointer in the descriptor so that - // cat_delete() will use the file-id to do the deletion. - // in the case of hard links this is imperative (in the - // case of regular files the fileid and cnid are the - // same so it doesn't matter). - // - cat_releasedesc(&cp->c_desc); - - /* - * The descriptor name may be zero, - * in which case the fileid is used. - */ - error = cat_delete(hfsmp, &cp->c_desc, &cp->c_attr); - - if (error && truncated && (error != ENXIO)) { - printf("hfs_inactive: couldn't delete a truncated file!"); - } - - /* Update HFS Private Data dir */ - if (error == 0) { - hfsmp->hfs_private_attr[FILE_HARDLINKS].ca_entries--; - if (vnode_isdir(vp)) { - DEC_FOLDERCOUNT(hfsmp, hfsmp->hfs_private_attr[FILE_HARDLINKS]); - } - (void)cat_update(hfsmp, &hfsmp->hfs_private_desc[FILE_HARDLINKS], - &hfsmp->hfs_private_attr[FILE_HARDLINKS], NULL, NULL); - } - - hfs_systemfile_unlock(hfsmp, lockflags); - - if (error) { - goto out; - } - - #if QUOTA - if (hfsmp->hfs_flags & HFS_QUOTAS) - (void)hfs_chkiq(cp, -1, NOCRED, 0); - #endif /* QUOTA */ - - /* Already set C_NOEXISTS at the beginning of this block */ - cp->c_flag &= ~C_DELETED; - cp->c_touch_chgtime = TRUE; - cp->c_touch_modtime = TRUE; - - if (error == 0) - hfs_volupdate(hfsmp, (v_type == VDIR) ? 
VOL_RMDIR : VOL_RMFILE, 0); - } - } // if - - hfs_update(vp, reclaim ? HFS_UPDATE_FORCE : 0); - - /* - * Since we are about to finish what might be an inactive call, propagate - * any remaining modified or touch bits from the cnode to the vnode. This - * serves as a hint to vnode recycling that we shouldn't recycle this vnode - * synchronously. - * - * For now, if the node *only* has a dirty atime, we don't mark - * the vnode as dirty. VFS's asynchronous recycling can actually - * lead to worse performance than having it synchronous. When VFS - * is fixed to be more performant, we can be more honest about - * marking vnodes as dirty when it's only the atime that's dirty. - */ - if (hfs_is_dirty(cp) == HFS_DIRTY || ISSET(cp->c_flag, C_DELETED)) { - vnode_setdirty(vp); - } else { - vnode_cleardirty(vp); - } - -out: - if (cat_reserve) - cat_postflight(hfsmp, &cookie, p); - - if (started_tr) { - hfs_end_transaction(hfsmp); - started_tr = false; - } - - return error; -} - - -/* - * hfs_vnop_inactive - * - * The last usecount on the vnode has gone away, so we need to tear down - * any remaining data still residing in the cnode. If necessary, write out - * remaining blocks or delete the cnode's entry in the catalog. - */ -int -hfs_vnop_inactive(struct vnop_inactive_args *ap) -{ - struct vnode *vp = ap->a_vp; - struct cnode *cp; - struct hfsmount *hfsmp = VTOHFS(vp); - struct proc *p = vfs_context_proc(ap->a_context); - int error = 0; - int took_trunc_lock = 0; - enum vtype v_type; - - v_type = vnode_vtype(vp); - cp = VTOC(vp); - - if ((hfsmp->hfs_flags & HFS_READ_ONLY) || vnode_issystem(vp) || - (hfsmp->hfs_freezing_proc == p)) { - error = 0; - goto inactive_done; - } - - /* - * For safety, do NOT call vnode_recycle from inside this function. 
This can cause - * problems in the following scenario: - * - * vnode_create -> vnode_reclaim_internal -> vclean -> VNOP_INACTIVE - * - * If we're being invoked as a result of a reclaim that was already in-flight, then we - * cannot call vnode_recycle again. Being in reclaim means that there are no usecounts or - * iocounts by definition. As a result, if we were to call vnode_recycle, it would immediately - * try to re-enter reclaim again and panic. - * - * Currently, there are three things that can cause us (VNOP_INACTIVE) to get called. - * 1) last usecount goes away on the vnode (vnode_rele) - * 2) last iocount goes away on a vnode that previously had usecounts but didn't have - * vnode_recycle called (vnode_put) - * 3) vclean by way of reclaim - * - * In this function we would generally want to call vnode_recycle to speed things - * along to ensure that we don't leak blocks due to open-unlinked files. However, by - * virtue of being in this function already, we can call hfs_cnode_teardown, which - * will release blocks held by open-unlinked files, and mark them C_NOEXISTS so that - * there's no entry in the catalog and no backing store anymore. If that's the case, - * then we really don't care all that much when the vnode actually goes through reclaim. - * Further, the HFS VNOPs that manipulated the namespace in order to create the open- - * unlinked file in the first place should have already called vnode_recycle on the vnode - * to guarantee that it would go through reclaim in a speedy way. - */ - - if (cp->c_flag & C_NOEXISTS) { - /* - * If the cnode has already had its cat entry removed, then - * just skip to the end. We don't need to do anything here. 
- */ - error = 0; - goto inactive_done; - } - - if ((v_type == VREG || v_type == VLNK)) { - hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); - took_trunc_lock = 1; - } - - (void) hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); - - /* - * Call cnode_teardown to push out dirty blocks to disk, release open-unlinked - * files' blocks from being in use, and move the cnode from C_DELETED to C_NOEXISTS. - */ - error = hfs_cnode_teardown (vp, ap->a_context, 0); - - /* - * Drop the truncate lock before unlocking the cnode - * (which can potentially perform a vnode_put and - * recycle the vnode which in turn might require the - * truncate lock) - */ - if (took_trunc_lock) { - hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); - } - - hfs_unlock(cp); - -inactive_done: - - return error; -} - - -/* - * File clean-up (zero fill and shrink peof). - */ - -int -hfs_filedone(struct vnode *vp, vfs_context_t context, - hfs_file_done_opts_t opts) -{ - struct cnode *cp; - struct filefork *fp; - struct hfsmount *hfsmp; - off_t leof; - u_int32_t blks, blocksize; - - cp = VTOC(vp); - fp = VTOF(vp); - hfsmp = VTOHFS(vp); - leof = fp->ff_size; - - if ((hfsmp->hfs_flags & HFS_READ_ONLY) || (fp->ff_blocks == 0)) - return (0); - - hfs_flush_invalid_ranges(vp); - - blocksize = VTOVCB(vp)->blockSize; - blks = leof / blocksize; - if (((off_t)blks * (off_t)blocksize) != leof) - blks++; - /* - * Shrink the peof to the smallest size neccessary to contain the leof. 
- */ - if (blks < fp->ff_blocks) { - (void) hfs_truncate(vp, leof, IO_NDELAY, HFS_TRUNCATE_SKIPTIMES, context); - } - - if (!ISSET(opts, HFS_FILE_DONE_NO_SYNC)) { - hfs_unlock(cp); - cluster_push(vp, IO_CLOSE); - hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); - - /* - * If the hfs_truncate didn't happen to flush the vnode's - * information out to disk, force it to be updated now that - * all invalid ranges have been zero-filled and validated: - */ - hfs_update(vp, 0); - } - - return (0); -} - - -/* - * Reclaim a cnode so that it can be used for other purposes. - */ -int -hfs_vnop_reclaim(struct vnop_reclaim_args *ap) -{ - struct vnode *vp = ap->a_vp; - struct cnode *cp; - struct filefork *fp = NULL; - struct filefork *altfp = NULL; - struct hfsmount *hfsmp = VTOHFS(vp); - vfs_context_t ctx = ap->a_context; - int reclaim_cnode = 0; - int err = 0; - enum vtype v_type; - - v_type = vnode_vtype(vp); - cp = VTOC(vp); - - /* - * We don't take the truncate lock since by the time reclaim comes along, - * all dirty pages have been synced and nobody should be competing - * with us for this thread. - */ - (void) hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); - - /* - * Sync to disk any remaining data in the cnode/vnode. This includes - * a call to hfs_update if the cnode has outbound data. - * - * If C_NOEXISTS is set on the cnode, then there's nothing teardown needs to do - * because the catalog entry for this cnode is already gone. - */ - if (!ISSET(cp->c_flag, C_NOEXISTS)) { - err = hfs_cnode_teardown(vp, ctx, 1); - } - - /* - * Keep track of an inactive hot file. 
Don't bother on ssd's since - * the tracking is done differently (it's done at read() time) - */ - if (!vnode_isdir(vp) && - !vnode_issystem(vp) && - !(cp->c_flag & (C_DELETED | C_NOEXISTS)) && - !(hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN)) { - (void) hfs_addhotfile(vp); - } - vnode_removefsref(vp); - - /* - * Find file fork for this vnode (if any) - * Also check if another fork is active - */ - if (cp->c_vp == vp) { - fp = cp->c_datafork; - altfp = cp->c_rsrcfork; - - cp->c_datafork = NULL; - cp->c_vp = NULL; - } else if (cp->c_rsrc_vp == vp) { - fp = cp->c_rsrcfork; - altfp = cp->c_datafork; - - cp->c_rsrcfork = NULL; - cp->c_rsrc_vp = NULL; - } else { - panic("hfs_vnop_reclaim: vp points to wrong cnode (vp=%p cp->c_vp=%p cp->c_rsrc_vp=%p)\n", vp, cp->c_vp, cp->c_rsrc_vp); - } - /* - * On the last fork, remove the cnode from its hash chain. - */ - if (altfp == NULL) { - /* If we can't remove it then the cnode must persist! */ - if (hfs_chashremove(hfsmp, cp) == 0) - reclaim_cnode = 1; - /* - * Remove any directory hints - */ - if (vnode_isdir(vp)) { - hfs_reldirhints(cp, 0); - } - - if(cp->c_flag & C_HARDLINK) { - hfs_relorigins(cp); - } - } - /* Release the file fork and related data */ - if (fp) { - /* Dump cached symlink data */ - if (vnode_islnk(vp) && (fp->ff_symlinkptr != NULL)) { - FREE(fp->ff_symlinkptr, M_TEMP); - } - rl_remove_all(&fp->ff_invalidranges); - FREE_ZONE(fp, sizeof(struct filefork), M_HFSFORK); - } - - /* - * If there was only one active fork then we can release the cnode. - */ - if (reclaim_cnode) { - hfs_chashwakeup(hfsmp, cp, H_ALLOC | H_TRANSIT); - hfs_unlock(cp); - hfs_reclaim_cnode(hfsmp, cp); - } - else { - /* - * cnode in use. If it is a directory, it could have - * no live forks. Just release the lock. 
- */ - hfs_unlock(cp); - } - - vnode_clearfsnode(vp); - return (0); -} - - -extern int (**hfs_vnodeop_p) (void *); -extern int (**hfs_specop_p) (void *); -#if FIFO -extern int (**hfs_fifoop_p) (void *); -#endif - -#if CONFIG_HFS_STD -extern int (**hfs_std_vnodeop_p) (void *); -#endif - -/* - * hfs_getnewvnode - get new default vnode - * - * The vnode is returned with an iocount and the cnode locked. - * The cnode of the parent vnode 'dvp' may or may not be locked, depending on - * the circumstances. The cnode in question (if acquiring the resource fork), - * may also already be locked at the time we enter this function. - * - * Note that there are both input and output flag arguments to this function. - * If one of the input flags (specifically, GNV_USE_VP), is set, then - * hfs_getnewvnode will use the parameter *vpp, which is traditionally only - * an output parameter, as both an input and output parameter. It will use - * the vnode provided in the output, and pass it to vnode_create with the - * proper flavor so that a new vnode is _NOT_ created on our behalf when - * we dispatch to VFS. This may be important in various HFS vnode creation - * routines, such a create or get-resource-fork, because we risk deadlock if - * jetsam is involved. - * - * Deadlock potential exists if jetsam is synchronously invoked while we are waiting - * for a vnode to be recycled in order to give it the identity we want. If jetsam - * happens to target a process for termination that is blocked in-kernel, waiting to - * acquire the cnode lock on our parent 'dvp', while our current thread has it locked, - * neither side will make forward progress and the watchdog timer will eventually fire. - * To prevent this, a caller of hfs_getnewvnode may choose to proactively force - * any necessary vnode reclamation/recycling while it is not holding any locks and - * thus not prone to deadlock. If this is the case, GNV_USE_VP will be set and - * the parameter will be used as described above. 
- * - * !!! !!!! - * In circumstances when GNV_USE_VP is set, this function _MUST_ clean up and either consume - * or dispose of the provided vnode. We funnel all errors to a single return value so that - * if provided_vp is still non-NULL, then we will dispose of the vnode. This will occur in - * all error cases of this function -- anywhere we zero/NULL out the *vpp parameter. It may - * also occur if the current thread raced with another to create the same vnode, and we - * find the entry already present in the cnode hash. - * !!! !!! - */ -int -hfs_getnewvnode( - struct hfsmount *hfsmp, - struct vnode *dvp, - struct componentname *cnp, - struct cat_desc *descp, - int flags, - struct cat_attr *attrp, - struct cat_fork *forkp, - struct vnode **vpp, - int *out_flags) -{ - struct mount *mp = HFSTOVFS(hfsmp); - struct vnode *vp = NULL; - struct vnode **cvpp; - struct vnode *tvp = NULLVP; - struct cnode *cp = NULL; - struct filefork *fp = NULL; - int hfs_standard = 0; - int retval = 0; - int issystemfile; - int wantrsrc; - int hflags = 0; - int need_update_identity = 0; - struct vnode_fsparam vfsp; - enum vtype vtype; - - struct vnode *provided_vp = NULL; - - -#if QUOTA - int i; -#endif /* QUOTA */ - - hfs_standard = (hfsmp->hfs_flags & HFS_STANDARD); - - if (flags & GNV_USE_VP) { - /* Store the provided VP for later use */ - provided_vp = *vpp; - } - - /* Zero out the vpp regardless of provided input */ - *vpp = NULL; - - /* Zero out the out_flags */ - *out_flags = 0; - - if (attrp->ca_fileid == 0) { - retval = ENOENT; - goto gnv_exit; - } - -#if !FIFO - if (IFTOVT(attrp->ca_mode) == VFIFO) { - retval = ENOTSUP; - goto gnv_exit; - } -#endif /* !FIFO */ - vtype = IFTOVT(attrp->ca_mode); - issystemfile = (descp->cd_flags & CD_ISMETA) && (vtype == VREG); - wantrsrc = flags & GNV_WANTRSRC; - - /* Sanity check the vtype and mode */ - if (vtype == VBAD) { - /* Mark the FS as corrupt and bail out */ - hfs_mark_inconsistent(hfsmp, HFS_INCONSISTENCY_DETECTED); - retval = 
EINVAL; - goto gnv_exit; - } - -#ifdef HFS_CHECK_LOCK_ORDER - /* - * The only case where it's permissible to hold the parent cnode - * lock is during a create operation (hfs_makenode) or when - * we don't need the cnode lock (GNV_SKIPLOCK). - */ - if ((dvp != NULL) && - (flags & (GNV_CREATE | GNV_SKIPLOCK)) == 0 && - VTOC(dvp)->c_lockowner == current_thread()) { - panic("hfs_getnewvnode: unexpected hold of parent cnode %p", VTOC(dvp)); - } -#endif /* HFS_CHECK_LOCK_ORDER */ - - /* - * Get a cnode (new or existing) - */ - cp = hfs_chash_getcnode(hfsmp, attrp->ca_fileid, vpp, wantrsrc, - (flags & GNV_SKIPLOCK), out_flags, &hflags); - - /* - * If the id is no longer valid for lookups we'll get back a NULL cp. - */ - if (cp == NULL) { - retval = ENOENT; - goto gnv_exit; - } - /* - * We may have been provided a vnode via - * GNV_USE_VP. In this case, we have raced with - * a 2nd thread to create the target vnode. The provided - * vnode that was passed in will be dealt with at the - * end of the function, as we don't zero out the field - * until we're ready to pass responsibility to VFS. - */ - - - /* - * If we get a cnode/vnode pair out of hfs_chash_getcnode, then update the - * descriptor in the cnode as needed if the cnode represents a hardlink. - * We want the caller to get the most up-to-date copy of the descriptor - * as possible. However, we only do anything here if there was a valid vnode. - * If there isn't a vnode, then the cnode is brand new and needs to be initialized - * as it doesn't have a descriptor or cat_attr yet. - * - * If we are about to replace the descriptor with the user-supplied one, then validate - * that the descriptor correctly acknowledges this item is a hardlink. We could be - * subject to a race where the calling thread invoked cat_lookup, got a valid lookup - * result but the file was not yet a hardlink. 
With sufficient delay between there - * and here, we might accidentally copy in the raw inode ID into the descriptor in the - * call below. If the descriptor's CNID is the same as the fileID then it must - * not yet have been a hardlink when the lookup occurred. - */ - - if (!(hfs_checkdeleted(cp))) { - // - // If the bytes of the filename in the descp do not match the bytes in the - // cnp (and we're not looking up the resource fork), then we want to update - // the vnode identity to contain the bytes that HFS stores so that when an - // fsevent gets generated, it has the correct filename. otherwise daemons - // that match filenames produced by fsevents with filenames they have stored - // elsewhere (e.g. bladerunner, backupd, mds), the filenames will not match. - // See: FSEvents doesn't always decompose diacritical unicode chars in the paths of the changed directories - // for more details. - // -#ifdef CN_WANTSRSRCFORK - if (*vpp && cnp && cnp->cn_nameptr && !(cnp->cn_flags & CN_WANTSRSRCFORK) && descp && descp->cd_nameptr && strncmp((const char *)cnp->cn_nameptr, (const char *)descp->cd_nameptr, descp->cd_namelen) != 0) { -#else - if (*vpp && cnp && cnp->cn_nameptr && descp && descp->cd_nameptr && strncmp((const char *)cnp->cn_nameptr, (const char *)descp->cd_nameptr, descp->cd_namelen) != 0) { -#endif - vnode_update_identity (*vpp, dvp, (const char *)descp->cd_nameptr, descp->cd_namelen, 0, VNODE_UPDATE_NAME); - } - if ((cp->c_flag & C_HARDLINK) && descp->cd_nameptr && descp->cd_namelen > 0) { - /* If cnode is uninitialized, its c_attr will be zeroed out; cnids wont match. */ - if ((descp->cd_cnid == cp->c_attr.ca_fileid) && - (attrp->ca_linkcount != cp->c_attr.ca_linkcount)){ - - if ((flags & GNV_SKIPLOCK) == 0) { - /* - * Then we took the lock. Drop it before calling - * vnode_put, which may invoke hfs_vnop_inactive and need to take - * the cnode lock again. 
- */ - hfs_unlock(cp); - } - - /* - * Emit ERECYCLE and GNV_CAT_ATTRCHANGED to - * force a re-drive in the lookup routine. - * Drop the iocount on the vnode obtained from - * chash_getcnode if needed. - */ - if (*vpp != NULL) { - vnode_put (*vpp); - *vpp = NULL; - } - - /* - * If we raced with VNOP_RECLAIM for this vnode, the hash code could - * have observed it after the c_vp or c_rsrc_vp fields had been torn down; - * the hash code peeks at those fields without holding the cnode lock because - * it needs to be fast. As a result, we may have set H_ATTACH in the chash - * call above. Since we're bailing out, unset whatever flags we just set, and - * wake up all waiters for this cnode. - */ - if (hflags) { - hfs_chashwakeup(hfsmp, cp, hflags); - } - - *out_flags = GNV_CAT_ATTRCHANGED; - retval = ERECYCLE; - goto gnv_exit; - } - else { - /* - * Otherwise, CNID != fileid. Go ahead and copy in the new descriptor. - * - * Replacing the descriptor here is fine because we looked up the item without - * a vnode in hand before. If a vnode existed, its identity must be attached to this - * item. We are not susceptible to the lookup fastpath issue at this point. - */ - replace_desc(cp, descp); - - /* - * This item was a hardlink, and its name needed to be updated. By replacing the - * descriptor above, we've now updated the cnode's internal representation of - * its link ID/CNID, parent ID, and its name. However, VFS must now be alerted - * to the fact that this vnode now has a new parent, since we cannot guarantee - * that the new link lived in the same directory as the alternative name for - * this item. - */ - if ((*vpp != NULL) && (cnp || cp->c_desc.cd_nameptr)) { - /* we could be requesting the rsrc of a hardlink file... */ -#ifdef CN_WANTSRSRCFORK - if (cp->c_desc.cd_nameptr && (cnp == NULL || !(cnp->cn_flags & CN_WANTSRSRCFORK))) { -#else - if (cp->c_desc.cd_nameptr) { -#endif - // - // Update the identity with what we have stored on disk as - // the name of this file. 
This is related to: - // FSEvents doesn't always decompose diacritical unicode chars in the paths of the changed directories - // - vnode_update_identity (*vpp, dvp, (const char *)cp->c_desc.cd_nameptr, cp->c_desc.cd_namelen, 0, - (VNODE_UPDATE_PARENT | VNODE_UPDATE_NAME)); - } else if (cnp) { - vnode_update_identity (*vpp, dvp, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_hash, - (VNODE_UPDATE_PARENT | VNODE_UPDATE_NAME)); - } - } - } - } - } - - /* - * At this point, we have performed hardlink and open-unlinked checks - * above. We have now validated the state of the vnode that was given back - * to us from the cnode hash code and find it safe to return. - */ - if (*vpp != NULL) { - retval = 0; - goto gnv_exit; - } - - /* - * If this is a new cnode then initialize it. - */ - if (ISSET(cp->c_hflag, H_ALLOC)) { - lck_rw_init(&cp->c_truncatelock, hfs_rwlock_group, hfs_lock_attr); -#if HFS_COMPRESSION - cp->c_decmp = NULL; -#endif - - /* Make sure its still valid (ie exists on disk). */ - if (!(flags & GNV_CREATE)) { - int error = 0; - if (!hfs_valid_cnode (hfsmp, dvp, (wantrsrc ? NULL : cnp), cp->c_fileid, attrp, &error)) { - hfs_chash_abort(hfsmp, cp); - if ((flags & GNV_SKIPLOCK) == 0) { - hfs_unlock(cp); - } - hfs_reclaim_cnode(hfsmp, cp); - *vpp = NULL; - /* - * If we hit this case, that means that the entry was there in the catalog when - * we did a cat_lookup earlier. Think hfs_lookup. However, in between the time - * that we checked the catalog and the time we went to get a vnode/cnode for it, - * it had been removed from the namespace and the vnode totally reclaimed. As a result, - * it's not there in the catalog during the check in hfs_valid_cnode and we bubble out - * an ENOENT. To indicate to the caller that they should really double-check the - * entry (it could have been renamed over and gotten a new fileid), we mark a bit - * in the output flags. 
- */ - if (error == ENOENT) { - *out_flags = GNV_CAT_DELETED; - retval = ENOENT; - goto gnv_exit; - } - - /* - * Also, we need to protect the cat_attr acquired during hfs_lookup and passed into - * this function as an argument because the catalog may have changed w.r.t hardlink - * link counts and the firstlink field. If that validation check fails, then let - * lookup re-drive itself to get valid/consistent data with the same failure condition below. - */ - if (error == ERECYCLE) { - *out_flags = GNV_CAT_ATTRCHANGED; - retval = ERECYCLE; - goto gnv_exit; - } - } - } - bcopy(attrp, &cp->c_attr, sizeof(struct cat_attr)); - bcopy(descp, &cp->c_desc, sizeof(struct cat_desc)); - - /* The name was inherited so clear descriptor state... */ - descp->cd_namelen = 0; - descp->cd_nameptr = NULL; - descp->cd_flags &= ~CD_HASBUF; - - /* Tag hardlinks */ - if ((vtype == VREG || vtype == VDIR - || vtype == VSOCK || vtype == VFIFO) - && (descp->cd_cnid != attrp->ca_fileid - || ISSET(attrp->ca_recflags, kHFSHasLinkChainMask))) { - cp->c_flag |= C_HARDLINK; - } - /* - * Fix-up dir link counts. - * - * Earlier versions of Leopard used ca_linkcount for posix - * nlink support (effectively the sub-directory count + 2). - * That is now accomplished using the ca_dircount field with - * the corresponding kHFSHasFolderCountMask flag. - * - * For directories the ca_linkcount is the true link count, - * tracking the number of actual hardlinks to a directory. - * - * We only do this if the mount has HFS_FOLDERCOUNT set; - * at the moment, we only set that for HFSX volumes. 
- */ - if ((hfsmp->hfs_flags & HFS_FOLDERCOUNT) && - (vtype == VDIR) && - !(attrp->ca_recflags & kHFSHasFolderCountMask) && - (cp->c_attr.ca_linkcount > 1)) { - if (cp->c_attr.ca_entries == 0) - cp->c_attr.ca_dircount = 0; - else - cp->c_attr.ca_dircount = cp->c_attr.ca_linkcount - 2; - - cp->c_attr.ca_linkcount = 1; - cp->c_attr.ca_recflags |= kHFSHasFolderCountMask; - if ( !(hfsmp->hfs_flags & HFS_READ_ONLY) ) - cp->c_flag |= C_MODIFIED; - } -#if QUOTA - if (hfsmp->hfs_flags & HFS_QUOTAS) { - for (i = 0; i < MAXQUOTAS; i++) - cp->c_dquot[i] = NODQUOT; - } -#endif /* QUOTA */ - /* Mark the output flag that we're vending a new cnode */ - *out_flags |= GNV_NEW_CNODE; - } - - if (vtype == VDIR) { - if (cp->c_vp != NULL) - panic("hfs_getnewvnode: orphaned vnode (data)"); - cvpp = &cp->c_vp; - } else { - if (forkp && attrp->ca_blocks < forkp->cf_blocks) - panic("hfs_getnewvnode: bad ca_blocks (too small)"); - /* - * Allocate and initialize a file fork... - */ - MALLOC_ZONE(fp, struct filefork *, sizeof(struct filefork), - M_HFSFORK, M_WAITOK); - fp->ff_cp = cp; - if (forkp) - bcopy(forkp, &fp->ff_data, sizeof(struct cat_fork)); - else - bzero(&fp->ff_data, sizeof(struct cat_fork)); - rl_init(&fp->ff_invalidranges); - fp->ff_sysfileinfo = 0; - - if (wantrsrc) { - if (cp->c_rsrcfork != NULL) - panic("hfs_getnewvnode: orphaned rsrc fork"); - if (cp->c_rsrc_vp != NULL) - panic("hfs_getnewvnode: orphaned vnode (rsrc)"); - cp->c_rsrcfork = fp; - cvpp = &cp->c_rsrc_vp; - if ( (tvp = cp->c_vp) != NULLVP ) - cp->c_flag |= C_NEED_DVNODE_PUT; - } else { - if (cp->c_datafork != NULL) - panic("hfs_getnewvnode: orphaned data fork"); - if (cp->c_vp != NULL) - panic("hfs_getnewvnode: orphaned vnode (data)"); - cp->c_datafork = fp; - cvpp = &cp->c_vp; - if ( (tvp = cp->c_rsrc_vp) != NULLVP) - cp->c_flag |= C_NEED_RVNODE_PUT; - } - } - if (tvp != NULLVP) { - /* - * grab an iocount on the vnode we weren't - * interested in (i.e. 
we want the resource fork - * but the cnode already has the data fork) - * to prevent it from being - * recycled by us when we call vnode_create - * which will result in a deadlock when we - * try to take the cnode lock in hfs_vnop_fsync or - * hfs_vnop_reclaim... vnode_get can be called here - * because we already hold the cnode lock which will - * prevent the vnode from changing identity until - * we drop it.. vnode_get will not block waiting for - * a change of state... however, it will return an - * error if the current iocount == 0 and we've already - * started to terminate the vnode... we don't need/want to - * grab an iocount in the case since we can't cause - * the fileystem to be re-entered on this thread for this vp - * - * the matching vnode_put will happen in hfs_unlock - * after we've dropped the cnode lock - */ - if ( vnode_get(tvp) != 0) - cp->c_flag &= ~(C_NEED_RVNODE_PUT | C_NEED_DVNODE_PUT); - } - vfsp.vnfs_mp = mp; - vfsp.vnfs_vtype = vtype; - vfsp.vnfs_str = "hfs"; - if ((cp->c_flag & C_HARDLINK) && (vtype == VDIR)) { - vfsp.vnfs_dvp = NULL; /* no parent for me! */ - vfsp.vnfs_cnp = NULL; /* no name for me! 
*/ - } else { - vfsp.vnfs_dvp = dvp; - vfsp.vnfs_cnp = cnp; - } - - vfsp.vnfs_fsnode = cp; - - /* - * Special Case HFS Standard VNOPs from HFS+, since - * HFS standard is readonly/deprecated as of 10.6 - */ - -#if FIFO - if (vtype == VFIFO ) - vfsp.vnfs_vops = hfs_fifoop_p; - else -#endif - if (vtype == VBLK || vtype == VCHR) - vfsp.vnfs_vops = hfs_specop_p; -#if CONFIG_HFS_STD - else if (hfs_standard) - vfsp.vnfs_vops = hfs_std_vnodeop_p; -#endif - else - vfsp.vnfs_vops = hfs_vnodeop_p; - - if (vtype == VBLK || vtype == VCHR) - vfsp.vnfs_rdev = attrp->ca_rdev; - else - vfsp.vnfs_rdev = 0; - - if (forkp) - vfsp.vnfs_filesize = forkp->cf_size; - else - vfsp.vnfs_filesize = 0; - - vfsp.vnfs_flags = VNFS_ADDFSREF; -#ifdef CN_WANTSRSRCFORK - if (cnp && cnp->cn_nameptr && !(cnp->cn_flags & CN_WANTSRSRCFORK) && cp->c_desc.cd_nameptr && strncmp((const char *)cnp->cn_nameptr, (const char *)cp->c_desc.cd_nameptr, cp->c_desc.cd_namelen) != 0) { -#else - if (cnp && cnp->cn_nameptr && cp->c_desc.cd_nameptr && strncmp((const char *)cnp->cn_nameptr, (const char *)cp->c_desc.cd_nameptr, cp->c_desc.cd_namelen) != 0) { -#endif - // - // We don't want VFS to add an entry for this vnode because the name in the - // cnp does not match the bytes stored on disk for this file. Instead we'll - // update the identity later after the vnode is created and we'll do so with - // the correct bytes for this filename. 
For more details, see: - // FSEvents doesn't always decompose diacritical unicode chars in the paths of the changed directories - // - vfsp.vnfs_flags |= VNFS_NOCACHE; - need_update_identity = 1; - } else if (dvp == NULLVP || cnp == NULL || !(cnp->cn_flags & MAKEENTRY) || (flags & GNV_NOCACHE)) { - vfsp.vnfs_flags |= VNFS_NOCACHE; - } - - /* Tag system files */ - vfsp.vnfs_marksystem = issystemfile; - - /* Tag root directory */ - if (descp->cd_cnid == kHFSRootFolderID) - vfsp.vnfs_markroot = 1; - else - vfsp.vnfs_markroot = 0; - - /* - * If provided_vp was non-NULL, then it is an already-allocated (but not - * initialized) vnode. We simply need to initialize it to this identity. - * If it was NULL, then assume that we need to call vnode_create with the - * normal arguments/types. - */ - if (provided_vp) { - vp = provided_vp; - /* - * After we assign the value of provided_vp into 'vp' (so that it can be - * mutated safely by vnode_initialize), we can NULL it out. At this point, the disposal - * and handling of the provided vnode will be the responsibility of VFS, which will - * clean it up and vnode_put it properly if vnode_initialize fails. - */ - provided_vp = NULL; - - retval = vnode_initialize (VNCREATE_FLAVOR, VCREATESIZE, &vfsp, &vp); - /* See error handling below for resolving provided_vp */ - } - else { - /* Do a standard vnode_create */ - retval = vnode_create (VNCREATE_FLAVOR, VCREATESIZE, &vfsp, &vp); - } - - /* - * We used a local variable to hold the result of vnode_create/vnode_initialize so that - * on error cases in vnode_create we won't accidentally harm the cnode's fields - */ - - if (retval) { - /* Clean up if we encountered an error */ - if (fp) { - if (fp == cp->c_datafork) - cp->c_datafork = NULL; - else - cp->c_rsrcfork = NULL; - - FREE_ZONE(fp, sizeof(struct filefork), M_HFSFORK); - } - /* - * If this is a newly created cnode or a vnode reclaim - * occurred during the attachment, then cleanup the cnode. 
- */ - if ((cp->c_vp == NULL) && (cp->c_rsrc_vp == NULL)) { - hfs_chash_abort(hfsmp, cp); - hfs_reclaim_cnode(hfsmp, cp); - } - else { - hfs_chashwakeup(hfsmp, cp, H_ALLOC | H_ATTACH); - if ((flags & GNV_SKIPLOCK) == 0){ - hfs_unlock(cp); - } - } - *vpp = NULL; - goto gnv_exit; - } - - /* If no error, then assign the value into the cnode's fields */ - *cvpp = vp; - - vnode_settag(vp, VT_HFS); - if (cp->c_flag & C_HARDLINK) { - vnode_setmultipath(vp); - } - - if (cp->c_attr.ca_recflags & kHFSFastDevCandidateMask) { - vnode_setfastdevicecandidate(vp); - } - - if (cp->c_attr.ca_recflags & kHFSAutoCandidateMask) { - vnode_setautocandidate(vp); - } - - - - - if (vp && need_update_identity) { - // - // As above, update the name of the vnode if the bytes stored in hfs do not match - // the bytes in the cnp. See this radar: - // FSEvents doesn't always decompose diacritical unicode chars in the paths of the changed directories - // for more details. - // - vnode_update_identity (vp, dvp, (const char *)cp->c_desc.cd_nameptr, cp->c_desc.cd_namelen, 0, VNODE_UPDATE_NAME); - } - - /* - * Tag resource fork vnodes as needing an VNOP_INACTIVE - * so that any deferred removes (open unlinked files) - * have the chance to process the resource fork. - */ - if (VNODE_IS_RSRC(vp)) { - int err; - - KERNEL_DEBUG_CONSTANT(HFSDBG_GETNEWVNODE, VM_KERNEL_ADDRPERM(cp->c_vp), VM_KERNEL_ADDRPERM(cp->c_rsrc_vp), 0, 0, 0); - - /* Force VL_NEEDINACTIVE on this vnode */ - err = vnode_ref(vp); - if (err == 0) { - vnode_rele(vp); - } - } - hfs_chashwakeup(hfsmp, cp, H_ALLOC | H_ATTACH); - - /* - * Stop tracking an active hot file. - */ - if (!(flags & GNV_CREATE) && (vtype != VDIR) && !issystemfile && !(hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN)) { - (void) hfs_removehotfile(vp); - } - -#if CONFIG_PROTECT - /* Initialize the cp data structures. The key should be in place now. 
*/ - if (!issystemfile && (*out_flags & GNV_NEW_CNODE)) { - cp_entry_init(cp, mp); - } -#endif - - *vpp = vp; - retval = 0; - -gnv_exit: - if (provided_vp) { - /* Release our empty vnode if it was not used */ - vnode_put (provided_vp); - } - return retval; -} - - -static void -hfs_reclaim_cnode(hfsmount_t *hfsmp, struct cnode *cp) -{ -#if QUOTA - int i; - - for (i = 0; i < MAXQUOTAS; i++) { - if (cp->c_dquot[i] != NODQUOT) { - dqreclaim(cp->c_dquot[i]); - cp->c_dquot[i] = NODQUOT; - } - } -#endif /* QUOTA */ - - /* - * If the descriptor has a name then release it - */ - if ((cp->c_desc.cd_flags & CD_HASBUF) && (cp->c_desc.cd_nameptr != 0)) { - const char *nameptr; - - nameptr = (const char *) cp->c_desc.cd_nameptr; - cp->c_desc.cd_nameptr = 0; - cp->c_desc.cd_flags &= ~CD_HASBUF; - cp->c_desc.cd_namelen = 0; - vfs_removename(nameptr); - } - - /* - * We only call this function if we are in hfs_vnop_reclaim and - * attempting to reclaim a cnode with only one live fork. Because the vnode - * went through reclaim, any future attempts to use this item will have to - * go through lookup again, which will need to create a new vnode. Thus, - * destroying the locks below is safe. - */ - - lck_rw_destroy(&cp->c_rwlock, hfs_rwlock_group); - lck_rw_destroy(&cp->c_truncatelock, hfs_rwlock_group); -#if HFS_COMPRESSION - if (cp->c_decmp) { - decmpfs_cnode_destroy(cp->c_decmp); - FREE_ZONE(cp->c_decmp, sizeof(*(cp->c_decmp)), M_DECMPFS_CNODE); - } -#endif -#if CONFIG_PROTECT - cp_entry_destroy(hfsmp, cp->c_cpentry); - cp->c_cpentry = NULL; -#else - (void)hfsmp; // Prevent compiler warning -#endif - - bzero(cp, sizeof(struct cnode)); - FREE_ZONE(cp, sizeof(struct cnode), M_HFSNODE); -} - - -/* - * hfs_valid_cnode - * - * This function is used to validate data that is stored in-core against what is contained - * in the catalog. 
Common uses include validating that the parent-child relationship still exist - * for a specific directory entry (guaranteeing it has not been renamed into a different spot) at - * the point of the check. - */ -int -hfs_valid_cnode(struct hfsmount *hfsmp, struct vnode *dvp, struct componentname *cnp, - cnid_t cnid, struct cat_attr *cattr, int *error) -{ - struct cat_attr attr; - struct cat_desc cndesc; - int stillvalid = 0; - int lockflags; - - /* System files are always valid */ - if (cnid < kHFSFirstUserCatalogNodeID) { - *error = 0; - return (1); - } - - /* XXX optimization: check write count in dvp */ - - lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); - - if (dvp && cnp) { - int lookup = 0; - struct cat_fork fork; - bzero(&cndesc, sizeof(cndesc)); - cndesc.cd_nameptr = (const u_int8_t *)cnp->cn_nameptr; - cndesc.cd_namelen = cnp->cn_namelen; - cndesc.cd_parentcnid = VTOC(dvp)->c_fileid; - cndesc.cd_hint = VTOC(dvp)->c_childhint; - - /* - * We have to be careful when calling cat_lookup. The result argument - * 'attr' may get different results based on whether or not you ask - * for the filefork to be supplied as output. This is because cat_lookupbykey - * will attempt to do basic validation/smoke tests against the resident - * extents if there are no overflow extent records, but it needs someplace - * in memory to store the on-disk fork structures. - * - * Since hfs_lookup calls cat_lookup with a filefork argument, we should - * do the same here, to verify that block count differences are not - * due to calling the function with different styles. cat_lookupbykey - * will request the volume be fsck'd if there is true on-disk corruption - * where the number of blocks does not match the number generated by - * summing the number of blocks in the resident extents. 
- */ - - lookup = cat_lookup (hfsmp, &cndesc, 0, 0, NULL, &attr, &fork, NULL); - - if ((lookup == 0) && (cnid == attr.ca_fileid)) { - stillvalid = 1; - *error = 0; - } - else { - *error = ENOENT; - } - - /* - * In hfs_getnewvnode, we may encounter a time-of-check vs. time-of-vnode creation - * race. Specifically, if there is no vnode/cnode pair for the directory entry - * being looked up, we have to go to the catalog. But since we don't hold any locks (aside - * from the dvp in 'shared' mode) there is nothing to protect us against the catalog record - * changing in between the time we do the cat_lookup there and the time we re-grab the - * catalog lock above to do another cat_lookup. - * - * However, we need to check more than just the CNID and parent-child name relationships above. - * Hardlinks can suffer the same race in the following scenario: Suppose we do a - * cat_lookup, and find a leaf record and a raw inode for a hardlink. Now, we have - * the cat_attr in hand (passed in above). But in between then and now, the vnode was - * created by a competing hfs_getnewvnode call, and is manipulated and reclaimed before we get - * a chance to do anything. This is possible if there are a lot of threads thrashing around - * with the cnode hash. In this case, if we don't check/validate the cat_attr in-hand, we will - * blindly stuff it into the cnode, which will make the in-core data inconsistent with what is - * on disk. So validate the cat_attr below, if required. This race cannot happen if the cnode/vnode - * already exists, as it does in the case of rename and delete. 
- */ - if (stillvalid && cattr != NULL) { - if (cattr->ca_linkcount != attr.ca_linkcount) { - stillvalid = 0; - *error = ERECYCLE; - goto notvalid; - } - - if (cattr->ca_union1.cau_linkref != attr.ca_union1.cau_linkref) { - stillvalid = 0; - *error = ERECYCLE; - goto notvalid; - } - - if (cattr->ca_union3.cau_firstlink != attr.ca_union3.cau_firstlink) { - stillvalid = 0; - *error = ERECYCLE; - goto notvalid; - } - - if (cattr->ca_union2.cau_blocks != attr.ca_union2.cau_blocks) { - stillvalid = 0; - *error = ERECYCLE; - goto notvalid; - } - } - } else { - if (cat_idlookup(hfsmp, cnid, 0, 0, NULL, NULL, NULL) == 0) { - stillvalid = 1; - *error = 0; - } - else { - *error = ENOENT; - } - } -notvalid: - hfs_systemfile_unlock(hfsmp, lockflags); - - return (stillvalid); -} - - -/* - * Per HI and Finder requirements, HFS should add in the - * date/time that a particular directory entry was added - * to the containing directory. - * This is stored in the extended Finder Info for the - * item in question. - * - * Note that this field is also set explicitly in the hfs_vnop_setxattr code. - * We must ignore user attempts to set this part of the finderinfo, and - * so we need to save a local copy of the date added, write in the user - * finderinfo, then stuff the value back in. - */ -void hfs_write_dateadded (struct cat_attr *attrp, u_int32_t dateadded) { - u_int8_t *finfo = NULL; - - /* overlay the FinderInfo to the correct pointer, and advance */ - finfo = (u_int8_t*)attrp->ca_finderinfo; - finfo = finfo + 16; - - /* - * Make sure to write it out as big endian, since that's how - * finder info is defined. - * - * NOTE: This is a Unix-epoch timestamp, not a HFS/Traditional Mac timestamp. 
- */ - if (S_ISREG(attrp->ca_mode)) { - struct FndrExtendedFileInfo *extinfo = (struct FndrExtendedFileInfo *)finfo; - extinfo->date_added = OSSwapHostToBigInt32(dateadded); - attrp->ca_recflags |= kHFSHasDateAddedMask; - } - else if (S_ISDIR(attrp->ca_mode)) { - struct FndrExtendedDirInfo *extinfo = (struct FndrExtendedDirInfo *)finfo; - extinfo->date_added = OSSwapHostToBigInt32(dateadded); - attrp->ca_recflags |= kHFSHasDateAddedMask; - } - /* If it were neither directory/file, then we'd bail out */ - return; -} - -static u_int32_t -hfs_get_dateadded_internal(const uint8_t *finderinfo, mode_t mode) -{ - const uint8_t *finfo = NULL; - u_int32_t dateadded = 0; - - - - /* overlay the FinderInfo to the correct pointer, and advance */ - finfo = finderinfo + 16; - - /* - * FinderInfo is written out in big endian... make sure to convert it to host - * native before we use it. - */ - if (S_ISREG(mode)) { - const struct FndrExtendedFileInfo *extinfo = (const struct FndrExtendedFileInfo *)finfo; - dateadded = OSSwapBigToHostInt32 (extinfo->date_added); - } - else if (S_ISDIR(mode)) { - const struct FndrExtendedDirInfo *extinfo = (const struct FndrExtendedDirInfo *)finfo; - dateadded = OSSwapBigToHostInt32 (extinfo->date_added); - } - - return dateadded; -} - -u_int32_t -hfs_get_dateadded(struct cnode *cp) -{ - if ((cp->c_attr.ca_recflags & kHFSHasDateAddedMask) == 0) { - /* Date added was never set. Return 0. */ - return (0); - } - - return (hfs_get_dateadded_internal((u_int8_t*)cp->c_finderinfo, - cp->c_attr.ca_mode)); -} - -u_int32_t -hfs_get_dateadded_from_blob(const uint8_t *finderinfo, mode_t mode) -{ - return (hfs_get_dateadded_internal(finderinfo, mode)); -} - -/* - * Per HI and Finder requirements, HFS maintains a "write/generation - * count" for each file that is incremented on any write & pageout. - * It should start at 1 to reserve "0" as a special value. If it - * should ever wrap around, it will skip using 0. 
- * - * Note that finderinfo is manipulated in hfs_vnop_setxattr and care - * is and should be taken to ignore user attempts to set the part of - * the finderinfo that records the generation counter. - * - * Any change to the generation counter *must* not be visible before - * the change that caused it (for obvious reasons), and given the - * limitations of our current architecture, the change to the - * generation counter may occur some time afterwards (particularly in - * the case where a file is mapped writable---more on that below). - * - * We make no guarantees about the consistency of a file. In other - * words, a reader that is operating concurrently with a writer might - * see some, but not all of writer's changes, and the generation - * counter will *not* necessarily tell you this has happened. To - * enforce consistency, clients must make their own arrangements - * e.g. use file locking. - * - * We treat files that are mapped writable as a special case: when - * that happens, clients requesting the generation count will be told - * it has a generation count of zero and they use that knowledge as a - * hint that the file is changing and it therefore might be prudent to - * wait until it is no longer mapped writable. Clients should *not* - * rely on this behaviour however; we might decide that it's better - * for us to publish the fact that a file is mapped writable via - * alternate means and return the generation counter when it is mapped - * writable as it still has some, albeit limited, use. We reserve the - * right to make this change. - * - * Lastly, it's important to realise that because data and metadata - * take different paths through the system, it's possible upon crash - * or sudden power loss and after a restart, that a change may be - * visible to the rest of the system without a corresponding change to - * the generation counter. The reverse may also be true, but for all - * practical applications this shouldn't be an issue. 
- */ -void hfs_write_gencount (struct cat_attr *attrp, uint32_t gencount) { - u_int8_t *finfo = NULL; - - /* overlay the FinderInfo to the correct pointer, and advance */ - finfo = (u_int8_t*)attrp->ca_finderinfo; - finfo = finfo + 16; - - /* - * Make sure to write it out as big endian, since that's how - * finder info is defined. - * - * Generation count is only supported for files. - */ - if (S_ISREG(attrp->ca_mode)) { - struct FndrExtendedFileInfo *extinfo = (struct FndrExtendedFileInfo *)finfo; - extinfo->write_gen_counter = OSSwapHostToBigInt32(gencount); - } - - /* If it were neither directory/file, then we'd bail out */ - return; -} - -/* - * Increase the gen count by 1; if it wraps around to 0, increment by - * two. The cnode *must* be locked exclusively by the caller. - * - * You may think holding the lock is unnecessary because we only need - * to change the counter, but consider this sequence of events: thread - * A calls hfs_incr_gencount and the generation counter is 2 upon - * entry. A context switch occurs and thread B increments the counter - * to 3, thread C now gets the generation counter (for whatever - * purpose), and then another thread makes another change and the - * generation counter is incremented again---it's now 4. Now thread A - * continues and it sets the generation counter back to 3. So you can - * see, thread C would miss the change that caused the generation - * counter to increment to 4 and for this reason the cnode *must* - * always be locked exclusively. - */ -uint32_t hfs_incr_gencount (struct cnode *cp) { - u_int8_t *finfo = NULL; - u_int32_t gcount = 0; - - /* overlay the FinderInfo to the correct pointer, and advance */ - finfo = (u_int8_t*)cp->c_finderinfo; - finfo = finfo + 16; - - /* - * FinderInfo is written out in big endian... make sure to convert it to host - * native before we use it. 
- * - * NOTE: the write_gen_counter is stored in the same location in both the - * FndrExtendedFileInfo and FndrExtendedDirInfo structs (it's the - * last 32-bit word) so it is safe to have one code path here. - */ - if (S_ISDIR(cp->c_attr.ca_mode) || S_ISREG(cp->c_attr.ca_mode)) { - struct FndrExtendedFileInfo *extinfo = (struct FndrExtendedFileInfo *)finfo; - gcount = OSSwapBigToHostInt32 (extinfo->write_gen_counter); - - /* Was it zero to begin with (file originated in 10.8 or earlier?) */ - if (gcount == 0) { - gcount++; - } - - /* now bump it */ - gcount++; - - /* Did it wrap around ? */ - if (gcount == 0) { - gcount++; - } - extinfo->write_gen_counter = OSSwapHostToBigInt32 (gcount); - - SET(cp->c_flag, C_MINOR_MOD); - } - else { - gcount = 0; - } - - return gcount; -} - -/* - * There is no need for any locks here (other than an iocount on an - * associated vnode) because reading and writing an aligned 32 bit - * integer should be atomic on all platforms we support. - */ -static u_int32_t -hfs_get_gencount_internal(const uint8_t *finderinfo, mode_t mode) -{ - const uint8_t *finfo = NULL; - u_int32_t gcount = 0; - - /* overlay the FinderInfo to the correct pointer, and advance */ - finfo = finderinfo; - finfo = finfo + 16; - - /* - * FinderInfo is written out in big endian... make sure to convert it to host - * native before we use it. - * - * NOTE: the write_gen_counter is stored in the same location in both the - * FndrExtendedFileInfo and FndrExtendedDirInfo structs (it's the - * last 32-bit word) so it is safe to have one code path here. - */ - if (S_ISDIR(mode) || S_ISREG(mode)) { - const struct FndrExtendedFileInfo *extinfo = (const struct FndrExtendedFileInfo *)finfo; - gcount = OSSwapBigToHostInt32 (extinfo->write_gen_counter); - - /* - * Is it zero? File might originate in 10.8 or earlier. We lie and bump it to 1, - * since the incrementer code is able to handle this case and will double-increment - * for us. 
- */ - if (gcount == 0) { - gcount++; - } - } - - return gcount; -} - -/* Getter for the gen count */ -u_int32_t hfs_get_gencount (struct cnode *cp) { - return hfs_get_gencount_internal(cp->c_finderinfo, cp->c_attr.ca_mode); -} - -/* Getter for the gen count from a buffer (currently pointer to finderinfo)*/ -u_int32_t hfs_get_gencount_from_blob (const uint8_t *finfoblob, mode_t mode) { - return hfs_get_gencount_internal(finfoblob, mode); -} - -void hfs_clear_might_be_dirty_flag(cnode_t *cp) -{ - /* - * If we're about to touch both mtime and ctime, we can clear the - * C_MIGHT_BE_DIRTY_FROM_MAPPING since we can guarantee that - * subsequent page-outs can only be for data made dirty before - * now. - */ - CLR(cp->c_flag, C_MIGHT_BE_DIRTY_FROM_MAPPING); -} - -/* - * Touch cnode times based on c_touch_xxx flags - * - * cnode must be locked exclusive - * - * This will also update the volume modify time - */ -void -hfs_touchtimes(struct hfsmount *hfsmp, struct cnode* cp) -{ - vfs_context_t ctx; - - if (ISSET(hfsmp->hfs_flags, HFS_READ_ONLY) || ISSET(cp->c_flag, C_NOEXISTS)) { - cp->c_touch_acctime = FALSE; - cp->c_touch_chgtime = FALSE; - cp->c_touch_modtime = FALSE; - CLR(cp->c_flag, C_NEEDS_DATEADDED); - return; - } -#if CONFIG_HFS_STD - else if (hfsmp->hfs_flags & HFS_STANDARD) { - /* HFS Standard doesn't support access times */ - cp->c_touch_acctime = FALSE; - } -#endif - - ctx = vfs_context_current(); - /* - * Skip access time updates if: - * . MNT_NOATIME is set - * . a file system freeze is in progress - * . a file system resize is in progress - * . 
the vnode associated with this cnode is marked for rapid aging - */ - if (cp->c_touch_acctime) { - if ((vfs_flags(hfsmp->hfs_mp) & MNT_NOATIME) || - hfsmp->hfs_freeze_state != HFS_THAWED || - (hfsmp->hfs_flags & HFS_RESIZE_IN_PROGRESS) || - (cp->c_vp && ((vnode_israge(cp->c_vp) || (vfs_ctx_skipatime(ctx)))))) { - - cp->c_touch_acctime = FALSE; - } - } - if (cp->c_touch_acctime || cp->c_touch_chgtime || - cp->c_touch_modtime || (cp->c_flag & C_NEEDS_DATEADDED)) { - struct timeval tv; - int touchvol = 0; - - if (cp->c_touch_modtime && cp->c_touch_chgtime) - hfs_clear_might_be_dirty_flag(cp); - - microtime(&tv); - - if (cp->c_touch_acctime) { - /* - * When the access time is the only thing changing, we - * won't necessarily write it to disk immediately. We - * only do the atime update at vnode recycle time, when - * fsync is called or when there's another reason to write - * to the metadata. - */ - cp->c_atime = tv.tv_sec; - cp->c_touch_acctime = FALSE; - } - if (cp->c_touch_modtime) { - cp->c_touch_modtime = FALSE; - time_t new_time = tv.tv_sec; -#if CONFIG_HFS_STD - /* - * HFS dates that WE set must be adjusted for DST - */ - if ((hfsmp->hfs_flags & HFS_STANDARD) && gTimeZone.tz_dsttime) { - new_time += 3600; - } -#endif - if (cp->c_mtime != new_time) { - cp->c_mtime = new_time; - cp->c_flag |= C_MINOR_MOD; - touchvol = 1; - } - } - if (cp->c_touch_chgtime) { - cp->c_touch_chgtime = FALSE; - if (cp->c_ctime != tv.tv_sec) { - cp->c_ctime = tv.tv_sec; - cp->c_flag |= C_MINOR_MOD; - touchvol = 1; - } - } - - if (cp->c_flag & C_NEEDS_DATEADDED) { - hfs_write_dateadded (&(cp->c_attr), tv.tv_sec); - cp->c_flag |= C_MINOR_MOD; - /* untwiddle the bit */ - cp->c_flag &= ~C_NEEDS_DATEADDED; - touchvol = 1; - } - - /* Touch the volume modtime if needed */ - if (touchvol) { - hfs_note_header_minor_change(hfsmp); - HFSTOVCB(hfsmp)->vcbLsMod = tv.tv_sec; - } - } -} - -// Use this if you don't want to check the return code -void hfs_lock_always(cnode_t *cp, enum hfs_locktype 
locktype) -{ - hfs_lock(cp, locktype, HFS_LOCK_ALWAYS); -} - -/* - * Lock a cnode. - * N.B. If you add any failure cases, *make* sure hfs_lock_always works - */ -int -hfs_lock(struct cnode *cp, enum hfs_locktype locktype, enum hfs_lockflags flags) -{ - thread_t thread = current_thread(); - - if (cp->c_lockowner == thread) { - /* - * Only the extents and bitmap files support lock recursion - * here. The other system files support lock recursion in - * hfs_systemfile_lock. Eventually, we should change to - * handle recursion solely in hfs_systemfile_lock. - */ - if ((cp->c_fileid == kHFSExtentsFileID) || - (cp->c_fileid == kHFSAllocationFileID)) { - cp->c_syslockcount++; - } else { - panic("hfs_lock: locking against myself!"); - } - } else if (locktype == HFS_SHARED_LOCK) { - lck_rw_lock_shared(&cp->c_rwlock); - cp->c_lockowner = HFS_SHARED_OWNER; - - } else { /* HFS_EXCLUSIVE_LOCK */ - lck_rw_lock_exclusive(&cp->c_rwlock); - cp->c_lockowner = thread; - - /* Only the extents and bitmap files support lock recursion. */ - if ((cp->c_fileid == kHFSExtentsFileID) || - (cp->c_fileid == kHFSAllocationFileID)) { - cp->c_syslockcount = 1; - } - } - -#ifdef HFS_CHECK_LOCK_ORDER - /* - * Regular cnodes (non-system files) cannot be locked - * while holding the journal lock or a system file lock. - */ - if (!(cp->c_desc.cd_flags & CD_ISMETA) && - ((cp->c_fileid > kHFSFirstUserCatalogNodeID) || (cp->c_fileid == kHFSRootFolderID))) { - vnode_t vp = NULLVP; - - /* Find corresponding vnode. */ - if (cp->c_vp != NULLVP && VTOC(cp->c_vp) == cp) { - vp = cp->c_vp; - } else if (cp->c_rsrc_vp != NULLVP && VTOC(cp->c_rsrc_vp) == cp) { - vp = cp->c_rsrc_vp; - } - if (vp != NULLVP) { - struct hfsmount *hfsmp = VTOHFS(vp); - - if (hfsmp->jnl && (journal_owner(hfsmp->jnl) == thread)) { - /* This will eventually be a panic here. 
*/ - printf("hfs_lock: bad lock order (cnode after journal)\n"); - } - if (hfsmp->hfs_catalog_cp && hfsmp->hfs_catalog_cp->c_lockowner == thread) { - panic("hfs_lock: bad lock order (cnode after catalog)"); - } - if (hfsmp->hfs_attribute_cp && hfsmp->hfs_attribute_cp->c_lockowner == thread) { - panic("hfs_lock: bad lock order (cnode after attribute)"); - } - if (hfsmp->hfs_extents_cp && hfsmp->hfs_extents_cp->c_lockowner == thread) { - panic("hfs_lock: bad lock order (cnode after extents)"); - } - } - } -#endif /* HFS_CHECK_LOCK_ORDER */ - - /* - * Skip cnodes for regular files that no longer exist - * (marked deleted, catalog entry gone). - */ - if (((flags & HFS_LOCK_ALLOW_NOEXISTS) == 0) && - ((cp->c_desc.cd_flags & CD_ISMETA) == 0) && - (cp->c_flag & C_NOEXISTS)) { - hfs_unlock(cp); - return (ENOENT); - } - return (0); -} - -bool hfs_lock_upgrade(cnode_t *cp) -{ - if (lck_rw_lock_shared_to_exclusive(&cp->c_rwlock)) { - cp->c_lockowner = current_thread(); - return true; - } else - return false; -} - -/* - * Lock a pair of cnodes. - */ -int -hfs_lockpair(struct cnode *cp1, struct cnode *cp2, enum hfs_locktype locktype) -{ - struct cnode *first, *last; - int error; - - /* - * If cnodes match then just lock one. - */ - if (cp1 == cp2) { - return hfs_lock(cp1, locktype, HFS_LOCK_DEFAULT); - } - - /* - * Lock in cnode address order. - */ - if (cp1 < cp2) { - first = cp1; - last = cp2; - } else { - first = cp2; - last = cp1; - } - - if ( (error = hfs_lock(first, locktype, HFS_LOCK_DEFAULT))) { - return (error); - } - if ( (error = hfs_lock(last, locktype, HFS_LOCK_DEFAULT))) { - hfs_unlock(first); - return (error); - } - return (0); -} - -/* - * Check ordering of two cnodes. Return true if they are are in-order. 
- */ -static int -hfs_isordered(struct cnode *cp1, struct cnode *cp2) -{ - if (cp1 == cp2) - return (0); - if (cp1 == NULL || cp2 == (struct cnode *)0xffffffff) - return (1); - if (cp2 == NULL || cp1 == (struct cnode *)0xffffffff) - return (0); - /* - * Locking order is cnode address order. - */ - return (cp1 < cp2); -} - -/* - * Acquire 4 cnode locks. - * - locked in cnode address order (lesser address first). - * - all or none of the locks are taken - * - only one lock taken per cnode (dup cnodes are skipped) - * - some of the cnode pointers may be null - */ -int -hfs_lockfour(struct cnode *cp1, struct cnode *cp2, struct cnode *cp3, - struct cnode *cp4, enum hfs_locktype locktype, struct cnode **error_cnode) -{ - struct cnode * a[3]; - struct cnode * b[3]; - struct cnode * list[4]; - struct cnode * tmp; - int i, j, k; - int error; - if (error_cnode) { - *error_cnode = NULL; - } - - if (hfs_isordered(cp1, cp2)) { - a[0] = cp1; a[1] = cp2; - } else { - a[0] = cp2; a[1] = cp1; - } - if (hfs_isordered(cp3, cp4)) { - b[0] = cp3; b[1] = cp4; - } else { - b[0] = cp4; b[1] = cp3; - } - a[2] = (struct cnode *)0xffffffff; /* sentinel value */ - b[2] = (struct cnode *)0xffffffff; /* sentinel value */ - - /* - * Build the lock list, skipping over duplicates - */ - for (i = 0, j = 0, k = 0; (i < 2 || j < 2); ) { - tmp = hfs_isordered(a[i], b[j]) ? a[i++] : b[j++]; - if (k == 0 || tmp != list[k-1]) - list[k++] = tmp; - } - - /* - * Now we can lock using list[0 - k]. - * Skip over NULL entries. - */ - for (i = 0; i < k; ++i) { - if (list[i]) - if ((error = hfs_lock(list[i], locktype, HFS_LOCK_DEFAULT))) { - /* Only stuff error_cnode if requested */ - if (error_cnode) { - *error_cnode = list[i]; - } - /* Drop any locks we acquired. */ - while (--i >= 0) { - if (list[i]) - hfs_unlock(list[i]); - } - return (error); - } - } - return (0); -} - - -/* - * Unlock a cnode. 
- */ -void -hfs_unlock(struct cnode *cp) -{ - vnode_t rvp = NULLVP; - vnode_t vp = NULLVP; - u_int32_t c_flag; - - /* - * Only the extents and bitmap file's support lock recursion. - */ - if ((cp->c_fileid == kHFSExtentsFileID) || - (cp->c_fileid == kHFSAllocationFileID)) { - if (--cp->c_syslockcount > 0) { - return; - } - } - - const thread_t thread = current_thread(); - - if (cp->c_lockowner == thread) { - c_flag = cp->c_flag; - - // If we have the truncate lock, we must defer the puts - if (cp->c_truncatelockowner == thread) { - if (ISSET(c_flag, C_NEED_DVNODE_PUT) - && !cp->c_need_dvnode_put_after_truncate_unlock) { - CLR(c_flag, C_NEED_DVNODE_PUT); - cp->c_need_dvnode_put_after_truncate_unlock = true; - } - if (ISSET(c_flag, C_NEED_RVNODE_PUT) - && !cp->c_need_rvnode_put_after_truncate_unlock) { - CLR(c_flag, C_NEED_RVNODE_PUT); - cp->c_need_rvnode_put_after_truncate_unlock = true; - } - } - - CLR(cp->c_flag, (C_NEED_DATA_SETSIZE | C_NEED_RSRC_SETSIZE - | C_NEED_DVNODE_PUT | C_NEED_RVNODE_PUT)); - - if (c_flag & (C_NEED_DVNODE_PUT | C_NEED_DATA_SETSIZE)) { - vp = cp->c_vp; - } - if (c_flag & (C_NEED_RVNODE_PUT | C_NEED_RSRC_SETSIZE)) { - rvp = cp->c_rsrc_vp; - } - - cp->c_lockowner = NULL; - lck_rw_unlock_exclusive(&cp->c_rwlock); - } else { - lck_rw_unlock_shared(&cp->c_rwlock); - } - - /* Perform any vnode post processing after cnode lock is dropped. */ - if (vp) { - if (c_flag & C_NEED_DATA_SETSIZE) { - ubc_setsize(vp, VTOF(vp)->ff_size); -#if HFS_COMPRESSION - /* - * If this is a compressed file, we need to reset the - * compression state. We will have set the size to zero - * above and it will get fixed up later (in exactly the - * same way that new vnodes are fixed up). Note that we - * should only be able to get here if the truncate lock is - * held exclusively and so we do the reset when that's - * unlocked. 
- */ - decmpfs_cnode *dp = VTOCMP(vp); - if (dp && decmpfs_cnode_get_vnode_state(dp) != FILE_TYPE_UNKNOWN) - cp->c_need_decmpfs_reset = true; -#endif - } - if (c_flag & C_NEED_DVNODE_PUT) - vnode_put(vp); - } - if (rvp) { - if (c_flag & C_NEED_RSRC_SETSIZE) - ubc_setsize(rvp, VTOF(rvp)->ff_size); - if (c_flag & C_NEED_RVNODE_PUT) - vnode_put(rvp); - } -} - -/* - * Unlock a pair of cnodes. - */ -void -hfs_unlockpair(struct cnode *cp1, struct cnode *cp2) -{ - hfs_unlock(cp1); - if (cp2 != cp1) - hfs_unlock(cp2); -} - -/* - * Unlock a group of cnodes. - */ -void -hfs_unlockfour(struct cnode *cp1, struct cnode *cp2, struct cnode *cp3, struct cnode *cp4) -{ - struct cnode * list[4]; - int i, k = 0; - - if (cp1) { - hfs_unlock(cp1); - list[k++] = cp1; - } - if (cp2) { - for (i = 0; i < k; ++i) { - if (list[i] == cp2) - goto skip1; - } - hfs_unlock(cp2); - list[k++] = cp2; - } -skip1: - if (cp3) { - for (i = 0; i < k; ++i) { - if (list[i] == cp3) - goto skip2; - } - hfs_unlock(cp3); - list[k++] = cp3; - } -skip2: - if (cp4) { - for (i = 0; i < k; ++i) { - if (list[i] == cp4) - return; - } - hfs_unlock(cp4); - } -} - - -/* - * Protect a cnode against a truncation. - * - * Used mainly by read/write since they don't hold the - * cnode lock across calls to the cluster layer. - * - * The process doing a truncation must take the lock - * exclusive. The read/write processes can take it - * shared. The locktype argument is the same as supplied to - * hfs_lock. - */ -void -hfs_lock_truncate(struct cnode *cp, enum hfs_locktype locktype, enum hfs_lockflags flags) -{ - thread_t thread = current_thread(); - - if (cp->c_truncatelockowner == thread) { - /* - * Ignore grabbing the lock if it the current thread already - * holds exclusive lock. - * - * This is needed on the hfs_vnop_pagein path where we need to ensure - * the file does not change sizes while we are paging in. However, - * we may already hold the lock exclusive due to another - * VNOP from earlier in the call stack. 
So if we already hold - * the truncate lock exclusive, allow it to proceed, but ONLY if - * it's in the recursive case. - */ - if ((flags & HFS_LOCK_SKIP_IF_EXCLUSIVE) == 0) { - panic("hfs_lock_truncate: cnode %p locked!", cp); - } - } else if (locktype == HFS_SHARED_LOCK) { - lck_rw_lock_shared(&cp->c_truncatelock); - cp->c_truncatelockowner = HFS_SHARED_OWNER; - } else { /* HFS_EXCLUSIVE_LOCK */ - lck_rw_lock_exclusive(&cp->c_truncatelock); - cp->c_truncatelockowner = thread; - } -} - -bool hfs_truncate_lock_upgrade(struct cnode *cp) -{ - assert(cp->c_truncatelockowner == HFS_SHARED_OWNER); - if (!lck_rw_lock_shared_to_exclusive(&cp->c_truncatelock)) - return false; - cp->c_truncatelockowner = current_thread(); - return true; -} - -void hfs_truncate_lock_downgrade(struct cnode *cp) -{ - assert(cp->c_truncatelockowner == current_thread()); - lck_rw_lock_exclusive_to_shared(&cp->c_truncatelock); - cp->c_truncatelockowner = HFS_SHARED_OWNER; -} - -/* - * Attempt to get the truncate lock. If it cannot be acquired, error out. - * This function is needed in the degenerate hfs_vnop_pagein during force unmount - * case. To prevent deadlocks while a VM copy object is moving pages, HFS vnop pagein will - * temporarily need to disable V2 semantics. - */ -int hfs_try_trunclock (struct cnode *cp, enum hfs_locktype locktype, enum hfs_lockflags flags) -{ - thread_t thread = current_thread(); - boolean_t didlock = false; - - if (cp->c_truncatelockowner == thread) { - /* - * Ignore grabbing the lock if the current thread already - * holds exclusive lock. - * - * This is needed on the hfs_vnop_pagein path where we need to ensure - * the file does not change sizes while we are paging in. However, - * we may already hold the lock exclusive due to another - * VNOP from earlier in the call stack. So if we already hold - * the truncate lock exclusive, allow it to proceed, but ONLY if - * it's in the recursive case. 
- */ - if ((flags & HFS_LOCK_SKIP_IF_EXCLUSIVE) == 0) { - panic("hfs_lock_truncate: cnode %p locked!", cp); - } - } else if (locktype == HFS_SHARED_LOCK) { - didlock = lck_rw_try_lock(&cp->c_truncatelock, LCK_RW_TYPE_SHARED); - if (didlock) { - cp->c_truncatelockowner = HFS_SHARED_OWNER; - } - } else { /* HFS_EXCLUSIVE_LOCK */ - didlock = lck_rw_try_lock (&cp->c_truncatelock, LCK_RW_TYPE_EXCLUSIVE); - if (didlock) { - cp->c_truncatelockowner = thread; - } - } - - return didlock; -} - - -/* - * Unlock the truncate lock, which protects against size changes. - * - * If HFS_LOCK_SKIP_IF_EXCLUSIVE flag was set, it means that a previous - * hfs_lock_truncate() might have skipped grabbing a lock because - * the current thread was already holding the lock exclusive and - * we may need to return from this function without actually unlocking - * the truncate lock. - */ -void -hfs_unlock_truncate(struct cnode *cp, enum hfs_lockflags flags) -{ - thread_t thread = current_thread(); - - /* - * If HFS_LOCK_SKIP_IF_EXCLUSIVE is set in the flags AND the current - * lock owner of the truncate lock is our current thread, then - * we must have skipped taking the lock earlier by in - * hfs_lock_truncate() by setting HFS_LOCK_SKIP_IF_EXCLUSIVE in the - * flags (as the current thread was current lock owner). - * - * If HFS_LOCK_SKIP_IF_EXCLUSIVE is not set (most of the time) then - * we check the lockowner field to infer whether the lock was taken - * exclusively or shared in order to know what underlying lock - * routine to call. - */ - if (flags & HFS_LOCK_SKIP_IF_EXCLUSIVE) { - if (cp->c_truncatelockowner == thread) { - return; - } - } - - /* HFS_LOCK_EXCLUSIVE */ - if (thread == cp->c_truncatelockowner) { - vnode_t vp = NULL, rvp = NULL; - - /* - * If there are pending set sizes, the cnode lock should be dropped - * first. 
- */ -#if DEBUG - assert(!(cp->c_lockowner == thread - && ISSET(cp->c_flag, C_NEED_DATA_SETSIZE | C_NEED_RSRC_SETSIZE))); -#elif DEVELOPMENT - if (cp->c_lockowner == thread - && ISSET(cp->c_flag, C_NEED_DATA_SETSIZE | C_NEED_RSRC_SETSIZE)) { - printf("hfs: hfs_unlock_truncate called with C_NEED_DATA/RSRC_SETSIZE set (caller: 0x%llx)\n", - (uint64_t)VM_KERNEL_UNSLIDE(__builtin_return_address(0))); - } -#endif - - if (cp->c_need_dvnode_put_after_truncate_unlock) { - vp = cp->c_vp; - cp->c_need_dvnode_put_after_truncate_unlock = false; - } - if (cp->c_need_rvnode_put_after_truncate_unlock) { - rvp = cp->c_rsrc_vp; - cp->c_need_rvnode_put_after_truncate_unlock = false; - } - -#if HFS_COMPRESSION - bool reset_decmpfs = cp->c_need_decmpfs_reset; - cp->c_need_decmpfs_reset = false; -#endif - - cp->c_truncatelockowner = NULL; - lck_rw_unlock_exclusive(&cp->c_truncatelock); - -#if HFS_COMPRESSION - if (reset_decmpfs) { - decmpfs_cnode *dp = cp->c_decmp; - if (dp && decmpfs_cnode_get_vnode_state(dp) != FILE_TYPE_UNKNOWN) - decmpfs_cnode_set_vnode_state(dp, FILE_TYPE_UNKNOWN, 0); - } -#endif - - // Do the puts now - if (vp) - vnode_put(vp); - if (rvp) - vnode_put(rvp); - } else { /* HFS_LOCK_SHARED */ - lck_rw_unlock_shared(&cp->c_truncatelock); - } -} diff --git a/bsd/hfs/hfs_cnode.h b/bsd/hfs/hfs_cnode.h deleted file mode 100644 index d45b9d236..000000000 --- a/bsd/hfs/hfs_cnode.h +++ /dev/null @@ -1,623 +0,0 @@ -/* - * Copyright (c) 2002-2014 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -#ifndef _HFS_CNODE_H_ -#define _HFS_CNODE_H_ - -#include - -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE -#include -#include -#include -#include -#include -#include - -#include - -#include -#include -#if HFS_COMPRESSION -#include -#endif -#if CONFIG_PROTECT -#include -#endif -#include - -/* - * The filefork is used to represent an HFS file fork (data or resource). - * Reading or writing any of these fields requires holding cnode lock. 
- */ -struct filefork { - struct cnode *ff_cp; /* cnode associated with this fork */ - struct rl_head ff_invalidranges; /* Areas of disk that should read back as zeroes */ - union { - void *ffu_sysfileinfo; /* additional info for system files */ - char *ffu_symlinkptr; /* symbolic link pathname */ - } ff_union; - struct cat_fork ff_data; /* fork data (size, extents) */ -}; -typedef struct filefork filefork_t; - - -#define HFS_TEMPLOOKUP_NAMELEN 32 - -/* - * Catalog Lookup struct (runtime) - * - * This is used so that when we need to malloc a container for a catalog - * lookup operation, we can acquire memory for everything in one fell swoop - * as opposed to putting many of these objects on the stack. The cat_fork - * data structure can take up 100+bytes easily, and that can add to stack - * overhead. - * - * As a result, we use this to easily pass around the memory needed for a - * lookup operation. - */ -struct cat_lookup_buffer { - struct cat_desc lookup_desc; - struct cat_attr lookup_attr; - struct filefork lookup_fork; - struct componentname lookup_cn; - char lookup_name[HFS_TEMPLOOKUP_NAMELEN]; /* for open-unlinked paths only */ -}; - - -/* Aliases for common fields */ -#define ff_size ff_data.cf_size -#define ff_new_size ff_data.cf_new_size -#define ff_clumpsize ff_data.cf_clump -#define ff_bytesread ff_data.cf_bytesread -#define ff_extents ff_data.cf_extents - -/* - * Note that the blocks fields are protected by the cnode lock, *not* - * the truncate lock. - */ -#define ff_blocks ff_data.cf_blocks -#define ff_unallocblocks ff_data.cf_vblocks -static inline uint32_t ff_allocblocks(filefork_t *ff) -{ - assert(ff->ff_blocks >= ff->ff_unallocblocks); - return ff->ff_blocks - ff->ff_unallocblocks; -} - -#define ff_symlinkptr ff_union.ffu_symlinkptr -#define ff_sysfileinfo ff_union.ffu_sysfileinfo - - -/* The btree code still needs these... 
*/ -#define fcbEOF ff_size -#define fcbExtents ff_extents -#define fcbBTCBPtr ff_sysfileinfo - -typedef u_int8_t atomicflag_t; - - -/* - * Hardlink Origin (for hardlinked directories). - */ -struct linkorigin { - TAILQ_ENTRY(linkorigin) lo_link; /* chain */ - void * lo_thread; /* thread that performed the lookup */ - cnid_t lo_cnid; /* hardlink's cnid */ - cnid_t lo_parentcnid; /* hardlink's parent cnid */ -}; -typedef struct linkorigin linkorigin_t; - -#define MAX_CACHED_ORIGINS 10 -#define MAX_CACHED_FILE_ORIGINS 8 - -/* - * The cnode is used to represent each active (or recently active) - * file or directory in the HFS filesystem. - * - * Reading or writing any of these fields requires holding c_lock. - */ -struct cnode { - lck_rw_t c_rwlock; /* cnode's lock */ - thread_t c_lockowner; /* cnode's lock owner (exclusive case only) */ - lck_rw_t c_truncatelock; /* protects file from truncation during read/write */ - thread_t c_truncatelockowner; /* truncate lock owner (exclusive case only) */ - LIST_ENTRY(cnode) c_hash; /* cnode's hash chain */ - u_int32_t c_flag; /* cnode's runtime flags */ - u_int32_t c_hflag; /* cnode's flags for maintaining hash - protected by global hash lock */ - struct vnode *c_vp; /* vnode for data fork or dir */ - struct vnode *c_rsrc_vp; /* vnode for resource fork */ - struct dquot *c_dquot[MAXQUOTAS]; /* cnode's quota info */ - u_int32_t c_childhint; /* catalog hint for children (small dirs only) */ - u_int32_t c_dirthreadhint; /* catalog hint for directory's thread rec */ - struct cat_desc c_desc; /* cnode's descriptor */ - struct cat_attr c_attr; /* cnode's attributes */ - TAILQ_HEAD(hfs_originhead, linkorigin) c_originlist; /* hardlink origin cache */ - TAILQ_HEAD(hfs_hinthead, directoryhint) c_hintlist; /* readdir directory hint list */ - int16_t c_dirhinttag; /* directory hint tag */ - union { - int16_t cu_dirhintcnt; /* directory hint count */ - int16_t cu_syslockcount; /* system file use only */ - } c_union; - u_int32_t 
c_dirchangecnt; /* changes each insert/delete (in-core only) */ - struct filefork *c_datafork; /* cnode's data fork */ - struct filefork *c_rsrcfork; /* cnode's rsrc fork */ - atomicflag_t c_touch_acctime; - atomicflag_t c_touch_chgtime; - atomicflag_t c_touch_modtime; - - // The following flags are protected by the truncate lock - union { - struct { - bool c_need_dvnode_put_after_truncate_unlock : 1; - bool c_need_rvnode_put_after_truncate_unlock : 1; -#if HFS_COMPRESSION - bool c_need_decmpfs_reset : 1; -#endif - }; - uint8_t c_tflags; - }; - - /* - * Where we're using a journal, we keep track of the last - * transaction that we did an update in. If a minor modification - * is made, we'll still push it if we're still on the same - * transaction. - */ - uint32_t c_update_txn; - -#if HFS_COMPRESSION - decmpfs_cnode *c_decmp; -#endif /* HFS_COMPRESSION */ -#if CONFIG_PROTECT - cprotect_t c_cpentry; /* content protection data */ -#endif - -}; -typedef struct cnode cnode_t; - -/* Aliases for common cnode fields */ -#define c_cnid c_desc.cd_cnid -#define c_hint c_desc.cd_hint -#define c_parentcnid c_desc.cd_parentcnid -#define c_encoding c_desc.cd_encoding - -#define c_fileid c_attr.ca_fileid -#define c_mode c_attr.ca_mode -#define c_linkcount c_attr.ca_linkcount -#define c_uid c_attr.ca_uid -#define c_gid c_attr.ca_gid -#define c_rdev c_attr.ca_union1.cau_rdev -#define c_atime c_attr.ca_atime -#define c_mtime c_attr.ca_mtime -#define c_ctime c_attr.ca_ctime -#define c_itime c_attr.ca_itime -#define c_btime c_attr.ca_btime -#define c_bsdflags c_attr.ca_flags -#define c_finderinfo c_attr.ca_finderinfo -#define c_blocks c_attr.ca_union2.cau_blocks -#define c_entries c_attr.ca_union2.cau_entries -#define c_zftimeout c_childhint - -#define c_dirhintcnt c_union.cu_dirhintcnt -#define c_syslockcount c_union.cu_syslockcount - - -/* hash maintenance flags kept in c_hflag and protected by hfs_chash_mutex */ -#define H_ALLOC 0x00001 /* CNode is being allocated */ -#define 
H_ATTACH 0x00002 /* CNode is being attached to by another vnode */ -#define H_TRANSIT 0x00004 /* CNode is getting recycled */ -#define H_WAITING 0x00008 /* CNode is being waited for */ - - -/* - * Runtime cnode flags (kept in c_flag) - */ -#define C_NEED_RVNODE_PUT 0x0000001 /* Need to do a vnode_put on c_rsrc_vp after the unlock */ -#define C_NEED_DVNODE_PUT 0x0000002 /* Need to do a vnode_put on c_vp after the unlock */ -#define C_ZFWANTSYNC 0x0000004 /* fsync requested and file has holes */ -#define C_FROMSYNC 0x0000008 /* fsync was called from sync */ - -#define C_MODIFIED 0x0000010 /* CNode has been modified */ -#define C_NOEXISTS 0x0000020 /* CNode has been deleted, catalog entry is gone */ -#define C_DELETED 0x0000040 /* CNode has been marked to be deleted */ -#define C_HARDLINK 0x0000080 /* CNode is a hard link (file or dir) */ - -/* - * A minor modification is one where the volume would not be inconsistent if - * the change was not pushed to disk. For example, changes to times. - */ -#define C_MINOR_MOD 0x0000100 /* CNode has a minor modification */ - -#define C_HASXATTRS 0x0000200 /* cnode has extended attributes */ -#define C_NEG_ENTRIES 0x0000400 /* directory has negative name entries */ -/* - * For C_SSD_STATIC: SSDs may want to deal with the file payload data in a - * different manner knowing that the content is not likely to be modified. This is - * purely advisory at the HFS level, and is not maintained after the cnode goes out of core. 
- */ -#define C_SSD_STATIC 0x0000800 /* Assume future writes contain static content */ - -#define C_NEED_DATA_SETSIZE 0x0001000 /* Do a ubc_setsize(0) on c_rsrc_vp after the unlock */ -#define C_NEED_RSRC_SETSIZE 0x0002000 /* Do a ubc_setsize(0) on c_vp after the unlock */ -#define C_DIR_MODIFICATION 0x0004000 /* Directory is being modified, wait for lookups */ -#define C_ALWAYS_ZEROFILL 0x0008000 /* Always zero-fill the file on an fsync */ - -#define C_RENAMED 0x0010000 /* cnode was deleted as part of rename; C_DELETED should also be set */ -#define C_NEEDS_DATEADDED 0x0020000 /* cnode needs date-added written to the finderinfo bit */ -#define C_BACKINGSTORE 0x0040000 /* cnode is a backing store for an existing or currently-mounting filesystem */ - -/* - * This flag indicates the cnode might be dirty because it - * was mapped writable so if we get any page-outs, update - * the modification and change times. - */ -#define C_MIGHT_BE_DIRTY_FROM_MAPPING 0x0080000 - -/* - * For C_SSD_GREEDY_MODE: SSDs may want to write the file payload data using the greedy mode knowing - * that the content needs to be written out to the disk quicker than normal at the expense of storage efficiency. - * This is purely advisory at the HFS level, and is not maintained after the cnode goes out of core. - */ -#define C_SSD_GREEDY_MODE 0x0100000 /* Assume future writes are recommended to be written in SLC mode */ - -/* 0x0200000 is currently unused */ - -#define C_IO_ISOCHRONOUS 0x0400000 /* device-specific isochronous throughput I/O */ - -#define ZFTIMELIMIT (5 * 60) - -/* - * The following is the "invisible" bit from the fdFlags field - * in the FndrFileInfo. - */ -enum { kFinderInvisibleMask = 1 << 14 }; - - -/* - * Convert between cnode pointers and vnode pointers - */ -#define VTOC(vp) ((struct cnode *)vnode_fsnode((vp))) - -#define CTOV(cp,rsrc) (((rsrc) && S_ISREG((cp)->c_mode)) ? 
\ - (cp)->c_rsrc_vp : (cp)->c_vp) - -/* - * Convert between vnode pointers and file forks - * - * Note: no CTOF since that is ambiguous - */ - -#define FTOC(fp) ((fp)->ff_cp) - -#define VTOF(vp) ((vp) == VTOC((vp))->c_rsrc_vp ? \ - VTOC((vp))->c_rsrcfork : \ - VTOC((vp))->c_datafork) - -#define VCTOF(vp, cp) ((vp) == (cp)->c_rsrc_vp ? \ - (cp)->c_rsrcfork : \ - (cp)->c_datafork) - -#define FTOV(fp) ((fp) == FTOC(fp)->c_rsrcfork ? \ - FTOC(fp)->c_rsrc_vp : \ - FTOC(fp)->c_vp) - -/* - * This is a helper function used for determining whether or not a cnode has become open - * unlinked in between the time we acquired its vnode and the time we acquire the cnode lock - * to start manipulating it. Due to the SMP nature of VFS, it is probably necessary to - * use this macro every time we acquire a cnode lock, as the content of the Cnode may have - * been modified in betweeen the lookup and a VNOP. Whether or not to call this is dependent - * upon the VNOP in question. Sometimes it is OK to use an open-unlinked file, for example, in, - * reading. But other times, such as on the source of a VNOP_RENAME, it should be disallowed. - */ -int hfs_checkdeleted(struct cnode *cp); - -/* - * Test for a resource fork - */ -#define FORK_IS_RSRC(fp) ((fp) == FTOC(fp)->c_rsrcfork) - -#define VNODE_IS_RSRC(vp) ((vp) == VTOC((vp))->c_rsrc_vp) - -#if HFS_COMPRESSION -/* - * VTOCMP(vp) returns a pointer to vp's decmpfs_cnode; this could be NULL - * if the file is not compressed or if hfs_file_is_compressed() hasn't - * yet been called on this file. 
- */ -#define VTOCMP(vp) (VTOC((vp))->c_decmp) -int hfs_file_is_compressed(struct cnode *cp, int skiplock); -int hfs_uncompressed_size_of_compressed_file(struct hfsmount *hfsmp, struct vnode *vp, cnid_t fid, off_t *size, int skiplock); -int hfs_hides_rsrc(vfs_context_t ctx, struct cnode *cp, int skiplock); -int hfs_hides_xattr(vfs_context_t ctx, struct cnode *cp, const char *name, int skiplock); -#endif - -#define ATIME_ONDISK_ACCURACY 300 - -static inline bool hfs_should_save_atime(cnode_t *cp) -{ - /* - * We only write atime updates to disk if the delta is greater - * than ATIME_ONDISK_ACCURACY. - */ - return (cp->c_atime < cp->c_attr.ca_atimeondisk - || cp->c_atime - cp->c_attr.ca_atimeondisk > ATIME_ONDISK_ACCURACY); -} - -typedef enum { - HFS_NOT_DIRTY = 0, - HFS_DIRTY = 1, - HFS_DIRTY_ATIME = 2 -} hfs_dirty_t; - -static inline hfs_dirty_t hfs_is_dirty(cnode_t *cp) -{ - if (ISSET(cp->c_flag, C_NOEXISTS)) - return HFS_NOT_DIRTY; - - if (ISSET(cp->c_flag, C_MODIFIED | C_MINOR_MOD | C_NEEDS_DATEADDED) - || cp->c_touch_chgtime || cp->c_touch_modtime) { - return HFS_DIRTY; - } - - if (cp->c_touch_acctime || hfs_should_save_atime(cp)) - return HFS_DIRTY_ATIME; - - return HFS_NOT_DIRTY; -} - -/* This overlays the FileID portion of NFS file handles. */ -struct hfsfid { - u_int32_t hfsfid_cnid; /* Catalog node ID. */ - u_int32_t hfsfid_gen; /* Generation number (create date). */ -}; - - -/* Get new default vnode */ -extern int hfs_getnewvnode(struct hfsmount *hfsmp, struct vnode *dvp, struct componentname *cnp, - struct cat_desc *descp, int flags, struct cat_attr *attrp, - struct cat_fork *forkp, struct vnode **vpp, int *out_flags); - -/* Input flags for hfs_getnewvnode */ - -#define GNV_WANTRSRC 0x01 /* Request the resource fork vnode. */ -#define GNV_SKIPLOCK 0x02 /* Skip taking the cnode lock (when getting resource fork). */ -#define GNV_CREATE 0x04 /* The vnode is for a newly created item. 
*/ -#define GNV_NOCACHE 0x08 /* Delay entering this item in the name cache */ -#define GNV_USE_VP 0x10 /* Use the vnode provided in *vpp instead of creating a new one */ - -/* Output flags for hfs_getnewvnode */ -#define GNV_CHASH_RENAMED 0x01 /* The cnode was renamed in-flight */ -#define GNV_CAT_DELETED 0x02 /* The cnode was deleted from the catalog */ -#define GNV_NEW_CNODE 0x04 /* We are vending out a newly initialized cnode */ -#define GNV_CAT_ATTRCHANGED 0x08 /* Something in struct cat_attr changed in between cat_lookups */ - - -/* Touch cnode times based on c_touch_xxx flags */ -extern void hfs_touchtimes(struct hfsmount *, struct cnode *); -extern void hfs_write_dateadded (struct cat_attr *cattrp, u_int32_t dateadded); -extern u_int32_t hfs_get_dateadded (struct cnode *cp); -extern u_int32_t hfs_get_dateadded_from_blob(const uint8_t * /* finderinfo */, mode_t /* mode */); - -/* Gen counter methods */ -extern void hfs_write_gencount(struct cat_attr *cattrp, uint32_t gencount); -extern uint32_t hfs_get_gencount(struct cnode *cp); -extern uint32_t hfs_incr_gencount (struct cnode *cp); -extern uint32_t hfs_get_gencount_from_blob(const uint8_t * /* finderinfo */, mode_t /* mode */); - -/* Document id methods */ -extern uint32_t hfs_get_document_id(struct cnode * /* cp */); -extern uint32_t hfs_get_document_id_from_blob(const uint8_t * /* finderinfo */, mode_t /* mode */); - -/* Zero-fill file and push regions out to disk */ -enum { - // Use this flag if you're going to sync later - HFS_FILE_DONE_NO_SYNC = 1, -}; -typedef uint32_t hfs_file_done_opts_t; -extern int hfs_filedone(struct vnode *vp, vfs_context_t context, - hfs_file_done_opts_t opts); - -/* - * HFS cnode hash functions. 
- */ -extern void hfs_chashinit(void); -extern void hfs_chashinit_finish(struct hfsmount *hfsmp); -extern void hfs_delete_chash(struct hfsmount *hfsmp); -extern int hfs_chashremove(struct hfsmount *hfsmp, struct cnode *cp); -extern void hfs_chash_abort(struct hfsmount *hfsmp, struct cnode *cp); -extern void hfs_chash_rehash(struct hfsmount *hfsmp, struct cnode *cp1, struct cnode *cp2); -extern void hfs_chashwakeup(struct hfsmount *hfsmp, struct cnode *cp, int flags); -extern void hfs_chash_mark_in_transit(struct hfsmount *hfsmp, struct cnode *cp); - -extern struct vnode * hfs_chash_getvnode(struct hfsmount *hfsmp, ino_t inum, int wantrsrc, - int skiplock, int allow_deleted); -extern struct cnode * hfs_chash_getcnode(struct hfsmount *hfsmp, ino_t inum, struct vnode **vpp, - int wantrsrc, int skiplock, int *out_flags, int *hflags); -extern int hfs_chash_snoop(struct hfsmount *, ino_t, int, int (*)(const cnode_t *, void *), void *); -extern int hfs_valid_cnode(struct hfsmount *hfsmp, struct vnode *dvp, struct componentname *cnp, - cnid_t cnid, struct cat_attr *cattr, int *error); - -extern int hfs_chash_set_childlinkbit(struct hfsmount *hfsmp, cnid_t cnid); - -/* - * HFS cnode lock functions. - * - * HFS Locking Order: - * - * 1. cnode truncate lock (if needed) -- see below for more on this - * - * + hfs_vnop_pagein/out handles recursive use of this lock (by - * using flag option HFS_LOCK_SKIP_IF_EXCLUSIVE) although there - * are issues with this (see #16620278). - * - * + If locking multiple cnodes then the truncate lock must be taken on - * all (in address order), before taking the cnode locks. - * - * 2. Hot Files stage mutex (grabbed before manipulating individual vnodes/cnodes) - * - * 3. cnode locks in address order (if needed) - * - * 4. journal (if needed) - * - * 5. Hot Files B-Tree lock (not treated as a system file) - * - * 6. system files (as needed) - * - * A. Catalog B-tree file - * B. Attributes B-tree file - * C. Startup file (if there is one) - * D. 
Allocation Bitmap file (always exclusive, supports recursion) - * E. Overflow Extents B-tree file (always exclusive, supports recursion) - * - * 7. hfs mount point (always last) - * - * - * I. HFS cnode hash lock (must not acquire any new locks while holding this lock, always taken last) - */ - -/* - * -- The Truncate Lock -- - * - * The truncate lock is used for a few purposes (more than its name - * might suggest). The first thing to note is that the cnode lock - * cannot be held whilst issuing any I/O other than metadata changes, - * so the truncate lock, in either shared or exclusive form, must - * usually be held in these cases. This includes calls to ubc_setsize - * where the new size is less than the current size known to the VM - * subsystem (for two reasons: a) because reaping pages can block - * (e.g. on pages that are busy or being cleaned); b) reaping pages - * might require page-in for tasks that have that region mapped - * privately). The same applies to other calls into the VM subsystem. - * - * Here are some (but not necessarily all) cases that the truncate - * lock protects for: - * - * + When reading and writing a file, we hold the truncate lock - * shared to ensure that the underlying blocks cannot be deleted - * and on systems that use content protection, this also ensures - * the keys remain valid (which might be being used by the - * underlying layers). - * - * + We need to protect against the following sequence of events: - * - * A file is initially size X. A thread issues an append to that - * file. Another thread truncates the file and then extends it - * to a a new size Y. Now the append can be applied at offset X - * and then the data is lost when the file is truncated; or it - * could be applied after the truncate, i.e. at offset 0; or it - * can be applied at offset Y. What we *cannot* do is apply the - * append at offset X and for the data to be visible at the end. 
- * (Note that we are free to choose when we apply the append - * operation.) - * - * To solve this, we keep things simple and take the truncate lock - * exclusively in order to sequence the append with other size - * changes. Therefore any size change must take the truncate lock - * exclusively. - * - * (N.B. we could do better and allow readers to run concurrently - * during the append and other size changes.) - * - * So here are the rules: - * - * + If you plan to change ff_size, you must take the truncate lock - * exclusively, *but* be careful what I/O you do whilst you have - * the truncate lock exclusively and try and avoid it if you can: - * if the VM subsystem tries to do something with some pages on a - * different thread and you try and do some I/O with those same - * pages, we will deadlock. (See #16620278.) - * - * + If you do anything that requires blocks to not be deleted or - * encryption keys to remain valid, you must take the truncate lock - * shared. - * - * + And it follows therefore, that if you want to delete blocks or - * delete keys, you must take the truncate lock exclusively. Note - * that for asynchronous writes, the truncate lock will be dropped - * after issuing I/O but before the I/O has completed which means - * that before manipulating keys, you *must* issue - * vnode_wait_for_writes in addition to holding the truncate lock. - * - * N.B. ff_size is actually protected by the cnode lock and so you - * must hold the cnode lock exclusively to change it and shared to - * read it. 
- * - */ - -enum hfs_locktype { - HFS_SHARED_LOCK = 1, - HFS_EXCLUSIVE_LOCK = 2 -}; - -/* Option flags for cnode and truncate lock functions */ -enum hfs_lockflags { - HFS_LOCK_DEFAULT = 0x0, /* Default flag, no options provided */ - HFS_LOCK_ALLOW_NOEXISTS = 0x1, /* Allow locking of all cnodes, including cnode marked deleted with no catalog entry */ - HFS_LOCK_SKIP_IF_EXCLUSIVE = 0x2, /* Skip locking if the current thread already holds the lock exclusive */ - - // Used when you do not want to check return from hfs_lock - HFS_LOCK_ALWAYS = HFS_LOCK_ALLOW_NOEXISTS, -}; -#define HFS_SHARED_OWNER (void *)0xffffffff - -void hfs_lock_always(cnode_t *cnode, enum hfs_locktype); -int hfs_lock(struct cnode *, enum hfs_locktype, enum hfs_lockflags); -bool hfs_lock_upgrade(cnode_t *cp); -int hfs_lockpair(struct cnode *, struct cnode *, enum hfs_locktype); -int hfs_lockfour(struct cnode *, struct cnode *, struct cnode *, struct cnode *, - enum hfs_locktype, struct cnode **); -void hfs_unlock(struct cnode *); -void hfs_unlockpair(struct cnode *, struct cnode *); -void hfs_unlockfour(struct cnode *, struct cnode *, struct cnode *, struct cnode *); - -void hfs_lock_truncate(struct cnode *, enum hfs_locktype, enum hfs_lockflags); -bool hfs_truncate_lock_upgrade(struct cnode *cp); -void hfs_truncate_lock_downgrade(struct cnode *cp); -void hfs_unlock_truncate(struct cnode *, enum hfs_lockflags); -int hfs_try_trunclock(struct cnode *, enum hfs_locktype, enum hfs_lockflags); - -extern int hfs_systemfile_lock(struct hfsmount *, int, enum hfs_locktype); -extern void hfs_systemfile_unlock(struct hfsmount *, int); - -void hfs_clear_might_be_dirty_flag(cnode_t *cp); - -// cnode must be locked -static inline __attribute__((pure)) -bool hfs_has_rsrc(const cnode_t *cp) -{ - if (cp->c_rsrcfork) - return cp->c_rsrcfork->ff_blocks > 0; - else - return cp->c_datafork && cp->c_blocks > cp->c_datafork->ff_blocks; -} - -#endif /* __APPLE_API_PRIVATE */ -#endif /* KERNEL */ - -#endif /* ! 
_HFS_CNODE_H_ */ diff --git a/bsd/hfs/hfs_cprotect.c b/bsd/hfs/hfs_cprotect.c deleted file mode 100644 index 963305e02..000000000 --- a/bsd/hfs/hfs_cprotect.c +++ /dev/null @@ -1,2839 +0,0 @@ -/* - * Copyright (c) 2000-2015 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -#if CONFIG_PROTECT - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "hfs.h" -#include "hfs_cnode.h" -#include "hfs_fsctl.h" -#include "hfs_cprotect.h" - - -#define PTR_ADD(type, base, offset) (type)((uintptr_t)(base) + (offset)) - -/* - * The wrap function pointers and the variable to indicate if they - * are initialized are system-wide, and hence are defined globally. - */ -static struct cp_wrap_func g_cp_wrap_func = {}; -static int are_wraps_initialized = false; - -extern int (**hfs_vnodeop_p) (void *); - -/* - * CP private functions - */ -static int cp_root_major_vers(mount_t mp); -static int cp_getxattr(cnode_t *, struct hfsmount *hfsmp, struct cprotect **); -static void cp_entry_dealloc(hfsmount_t *hfsmp, struct cprotect *entry); -static int cp_restore_keys(struct cprotect *, struct hfsmount *hfsmp, struct cnode *); -static int cp_lock_vfs_callback(mount_t, void *); -static int cp_lock_vnode_callback(vnode_t, void *); -static int cp_vnode_is_eligible (vnode_t); -static int cp_check_access (cnode_t *cp, struct hfsmount *hfsmp, int vnop); -static int cp_unwrap(struct hfsmount *, struct cprotect *, struct cnode *); -static void cp_init_access(cp_cred_t access, struct cnode *cp); - - -#if DEVELOPMENT || DEBUG -#define CP_ASSERT(x) \ - if ((x) == 0) { \ - panic("Content Protection: failed assertion in %s", __FUNCTION__); \ - } -#else -#define CP_ASSERT(x) -#endif - -// -- cpx_t accessors -- - -size_t cpx_size(size_t key_size) -{ - size_t size = sizeof(struct cpx) + key_size; - -#if DEBUG - size += 4; // Extra for magic -#endif - - return size; -} - -static size_t cpx_sizex(const struct cpx *cpx) -{ - return cpx_size(cpx->cpx_max_key_len); -} - -cpx_t cpx_alloc(size_t key_len) -{ - cpx_t cpx; - - MALLOC(cpx, cpx_t, cpx_size(key_len), M_TEMP, M_WAITOK); - - cpx_init(cpx, key_len); - - return cpx; -} - -#if DEBUG -static const 
uint32_t cpx_magic1 = 0x7b787063; // cpx{ -static const uint32_t cpx_magic2 = 0x7870637d; // }cpx -#endif - -void cpx_free(cpx_t cpx) -{ -#if DEBUG - assert(cpx->cpx_magic1 == cpx_magic1); - assert(*PTR_ADD(uint32_t *, cpx, cpx_sizex(cpx) - 4) == cpx_magic2); -#endif - bzero(cpx->cpx_cached_key, cpx->cpx_max_key_len); - FREE(cpx, M_TEMP); -} - -void cpx_init(cpx_t cpx, size_t key_len) -{ -#if DEBUG - cpx->cpx_magic1 = cpx_magic1; - *PTR_ADD(uint32_t *, cpx, cpx_size(key_len) - 4) = cpx_magic2; -#endif - cpx->cpx_flags = 0; - cpx->cpx_key_len = 0; - cpx->cpx_max_key_len = key_len; -} - -bool cpx_is_sep_wrapped_key(const struct cpx *cpx) -{ - return ISSET(cpx->cpx_flags, CPX_SEP_WRAPPEDKEY); -} - -void cpx_set_is_sep_wrapped_key(struct cpx *cpx, bool v) -{ - if (v) - SET(cpx->cpx_flags, CPX_SEP_WRAPPEDKEY); - else - CLR(cpx->cpx_flags, CPX_SEP_WRAPPEDKEY); -} - -bool cpx_use_offset_for_iv(const struct cpx *cpx) -{ - return ISSET(cpx->cpx_flags, CPX_USE_OFFSET_FOR_IV); -} - -void cpx_set_use_offset_for_iv(struct cpx *cpx, bool v) -{ - if (v) - SET(cpx->cpx_flags, CPX_USE_OFFSET_FOR_IV); - else - CLR(cpx->cpx_flags, CPX_USE_OFFSET_FOR_IV); -} - -uint16_t cpx_max_key_len(const struct cpx *cpx) -{ - return cpx->cpx_max_key_len; -} - -uint16_t cpx_key_len(const struct cpx *cpx) -{ - return cpx->cpx_key_len; -} - -void cpx_set_key_len(struct cpx *cpx, uint16_t key_len) -{ - cpx->cpx_key_len = key_len; - - if (ISSET(cpx->cpx_flags, CPX_IV_AES_CTX_HFS)) { - /* - * We assume that if the key length is being modified, the key - * has changed. As a result, un-set any bits related to the - * AES context, if needed. They should be re-generated - * on-demand. 
- */ - CLR(cpx->cpx_flags, CPX_IV_AES_CTX_INITIALIZED | CPX_IV_AES_CTX_HFS); - } -} - -bool cpx_has_key(const struct cpx *cpx) -{ - return cpx->cpx_key_len > 0; -} - -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wcast-qual" -void *cpx_key(const struct cpx *cpx) -{ - return (void *)cpx->cpx_cached_key; -} -#pragma clang diagnostic pop - -static void cpx_set_aes_iv_key(struct cpx *cpx, void *iv_key) -{ - aes_encrypt_key128(iv_key, &cpx->cpx_iv_aes_ctx); - SET(cpx->cpx_flags, CPX_IV_AES_CTX_INITIALIZED | CPX_USE_OFFSET_FOR_IV); - CLR(cpx->cpx_flags, CPX_IV_AES_CTX_HFS); -} - -aes_encrypt_ctx *cpx_iv_aes_ctx(struct cpx *cpx) -{ - if (ISSET(cpx->cpx_flags, CPX_IV_AES_CTX_INITIALIZED)) - return &cpx->cpx_iv_aes_ctx; - - SHA1_CTX sha1ctxt; - uint8_t digest[SHA_DIGEST_LENGTH]; /* Kiv */ - - /* First init the cp_cache_iv_key[] */ - SHA1Init(&sha1ctxt); - - /* - * We can only use this when the keys are generated in the AP; As a result - * we only use the first 32 bytes of key length in the cache key - */ - SHA1Update(&sha1ctxt, cpx->cpx_cached_key, cpx->cpx_key_len); - SHA1Final(digest, &sha1ctxt); - - cpx_set_aes_iv_key(cpx, digest); - SET(cpx->cpx_flags, CPX_IV_AES_CTX_HFS); - - return &cpx->cpx_iv_aes_ctx; -} - -static void cpx_flush(cpx_t cpx) -{ - bzero(cpx->cpx_cached_key, cpx->cpx_max_key_len); - bzero(&cpx->cpx_iv_aes_ctx, sizeof(cpx->cpx_iv_aes_ctx)); - cpx->cpx_flags = 0; - cpx->cpx_key_len = 0; -} - -static bool cpx_can_copy(const struct cpx *src, const struct cpx *dst) -{ - return src->cpx_key_len <= dst->cpx_max_key_len; -} - -void cpx_copy(const struct cpx *src, cpx_t dst) -{ - uint16_t key_len = cpx_key_len(src); - cpx_set_key_len(dst, key_len); - memcpy(cpx_key(dst), cpx_key(src), key_len); - dst->cpx_flags = src->cpx_flags; - if (ISSET(dst->cpx_flags, CPX_IV_AES_CTX_INITIALIZED)) - dst->cpx_iv_aes_ctx = src->cpx_iv_aes_ctx; -} - -// -- cp_key_pair accessors -- - -void cpkp_init(cp_key_pair_t *cpkp, uint16_t max_pers_key_len, - uint16_t 
max_cached_key_len) -{ - cpkp->cpkp_max_pers_key_len = max_pers_key_len; - cpkp->cpkp_pers_key_len = 0; - cpx_init(&cpkp->cpkp_cpx, max_cached_key_len); - - // Default to using offsets - cpx_set_use_offset_for_iv(&cpkp->cpkp_cpx, true); -} - -uint16_t cpkp_max_pers_key_len(const cp_key_pair_t *cpkp) -{ - return cpkp->cpkp_max_pers_key_len; -} - -uint16_t cpkp_pers_key_len(const cp_key_pair_t *cpkp) -{ - return cpkp->cpkp_pers_key_len; -} - -static bool cpkp_has_pers_key(const cp_key_pair_t *cpkp) -{ - return cpkp->cpkp_pers_key_len > 0; -} - -static void *cpkp_pers_key(const cp_key_pair_t *cpkp) -{ - return PTR_ADD(void *, &cpkp->cpkp_cpx, cpx_sizex(&cpkp->cpkp_cpx)); -} - -static void cpkp_set_pers_key_len(cp_key_pair_t *cpkp, uint16_t key_len) -{ - if (key_len > cpkp->cpkp_max_pers_key_len) - panic("hfs_cprotect: key too big!"); - cpkp->cpkp_pers_key_len = key_len; -} - -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wcast-qual" -cpx_t cpkp_cpx(const cp_key_pair_t *cpkp) -{ - // Cast to remove const qualifier - return (cpx_t)&cpkp->cpkp_cpx; -} -#pragma clang diagnostic pop - -size_t cpkp_size(uint16_t pers_key_len, uint16_t cached_key_len) -{ - return (sizeof(cp_key_pair_t) - sizeof(struct cpx) - + pers_key_len + cpx_size(cached_key_len)); -} - -size_t cpkp_sizex(const cp_key_pair_t *cpkp) -{ - return cpkp_size(cpkp->cpkp_max_pers_key_len, cpkp->cpkp_cpx.cpx_max_key_len); -} - -void cpkp_flush(cp_key_pair_t *cpkp) -{ - cpx_flush(&cpkp->cpkp_cpx); - cpkp->cpkp_pers_key_len = 0; - bzero(cpkp_pers_key(cpkp), cpkp->cpkp_max_pers_key_len); -} - -bool cpkp_can_copy(const cp_key_pair_t *src, const cp_key_pair_t *dst) -{ - return (cpkp_pers_key_len(src) <= dst->cpkp_max_pers_key_len - && cpx_can_copy(&src->cpkp_cpx, &dst->cpkp_cpx)); -} - -void cpkp_copy(const cp_key_pair_t *src, cp_key_pair_t *dst) -{ - const uint16_t key_len = cpkp_pers_key_len(src); - cpkp_set_pers_key_len(dst, key_len); - memcpy(cpkp_pers_key(dst), cpkp_pers_key(src), key_len); - 
cpx_copy(&src->cpkp_cpx, &dst->cpkp_cpx); -} - -// -- - -bool cp_is_supported_version(uint16_t vers) -{ - return vers == CP_VERS_4 || vers == CP_VERS_5; -} - -/* - * Return the appropriate key and, if requested, the physical offset and - * maximum length for a particular I/O operation. - */ -void cp_io_params(__unused hfsmount_t *hfsmp, cprotect_t cpr, - __unused off_rsrc_t off_rsrc, - __unused int direction, cp_io_params_t *io_params) -{ - - io_params->max_len = INT64_MAX; - io_params->phys_offset = -1; - io_params->cpx = cpkp_cpx(&cpr->cp_keys); -} - -static void cp_flush_cached_keys(cprotect_t cpr) -{ - cpx_flush(cpkp_cpx(&cpr->cp_keys)); -} - -static bool cp_needs_pers_key(cprotect_t cpr) -{ - if (CP_CLASS(cpr->cp_pclass) == PROTECTION_CLASS_F) - return !cpx_has_key(cpkp_cpx(&cpr->cp_keys)); - else - return !cpkp_has_pers_key(&cpr->cp_keys); -} - -int -cp_key_store_action(int action) -{ - - if (action < 0 || action > CP_MAX_STATE) { - return -1; - } - - /* - * The lock state is kept locally to each data protected filesystem to - * avoid using globals. Pass along the lock request to each filesystem - * we iterate through. - */ - - /* - * Upcast the value in 'action' to be a pointer-width unsigned integer. - * This avoids issues relating to pointer-width. - */ - return vfs_iterate(0, cp_lock_vfs_callback, (void*)(uintptr_t)action); -} - - -int -cp_register_wraps(cp_wrap_func_t key_store_func) -{ - g_cp_wrap_func.new_key = key_store_func->new_key; - g_cp_wrap_func.unwrapper = key_store_func->unwrapper; - g_cp_wrap_func.rewrapper = key_store_func->rewrapper; - /* do not use invalidater until rdar://12170050 goes in ! 
*/ - g_cp_wrap_func.invalidater = key_store_func->invalidater; - g_cp_wrap_func.backup_key = key_store_func->backup_key; - - /* Mark the functions as initialized in the function pointer container */ - are_wraps_initialized = true; - - return 0; -} - -static cp_key_revision_t cp_initial_key_revision(__unused hfsmount_t *hfsmp) -{ - return 1; -} - -cp_key_revision_t cp_next_key_revision(cp_key_revision_t rev) -{ - rev = (rev + 0x0100) ^ (mach_absolute_time() & 0xff); - if (!rev) - rev = 1; - return rev; -} - -/* - * Allocate and initialize a cprotect blob for a new cnode. - * Called from hfs_getnewvnode: cnode is locked exclusive. - * - * Read xattr data off the cnode. Then, if conditions permit, - * unwrap the file key and cache it in the cprotect blob. - */ -int -cp_entry_init(struct cnode *cp, struct mount *mp) -{ - struct cprotect *entry = NULL; - int error = 0; - struct hfsmount *hfsmp = VFSTOHFS(mp); - - /* - * The cnode should be locked at this point, regardless of whether or not - * we are creating a new item in the namespace or vending a vnode on behalf - * of lookup. The only time we tell getnewvnode to skip the lock is when - * constructing a resource fork vnode. But a resource fork vnode must come - * after the regular data fork cnode has already been constructed. - */ - if (!cp_fs_protected (mp)) { - cp->c_cpentry = NULL; - return 0; - } - - if (!S_ISREG(cp->c_mode) && !S_ISDIR(cp->c_mode)) { - cp->c_cpentry = NULL; - return 0; - } - - if (are_wraps_initialized == false) { - printf("hfs: cp_update_entry: wrap functions not yet set\n"); - return ENXIO; - } - - if (hfsmp->hfs_running_cp_major_vers == 0) { - panic ("hfs cp: no running mount point version! "); - } - - CP_ASSERT (cp->c_cpentry == NULL); - - error = cp_getxattr(cp, hfsmp, &entry); - if (error == ENOATTR) { - /* - * Normally, we should always have a CP EA for a file or directory that - * we are initializing here. 
However, there are some extenuating circumstances, - * such as the root directory immediately following a newfs_hfs. - * - * As a result, we leave code here to deal with an ENOATTR which will always - * default to a 'D/NONE' key, though we don't expect to use it much. - */ - cp_key_class_t target_class = PROTECTION_CLASS_D; - - if (S_ISDIR(cp->c_mode)) { - target_class = PROTECTION_CLASS_DIR_NONE; - } - - cp_key_revision_t key_revision = cp_initial_key_revision(hfsmp); - - /* allow keybag to override our class preferences */ - error = cp_new (&target_class, hfsmp, cp, cp->c_mode, CP_KEYWRAP_DIFFCLASS, - key_revision, (cp_new_alloc_fn)cp_entry_alloc, (void **)&entry); - if (error == 0) { - entry->cp_pclass = target_class; - entry->cp_key_os_version = cp_os_version(); - entry->cp_key_revision = key_revision; - error = cp_setxattr (cp, entry, hfsmp, cp->c_fileid, XATTR_CREATE); - } - } - - /* - * Bail out if: - * a) error was not ENOATTR (we got something bad from the getxattr call) - * b) we encountered an error setting the xattr above. - * c) we failed to generate a new cprotect data structure. - */ - if (error) { - goto out; - } - - cp->c_cpentry = entry; - -out: - if (error == 0) { - entry->cp_backing_cnode = cp; - } - else { - if (entry) { - cp_entry_destroy(hfsmp, entry); - } - cp->c_cpentry = NULL; - } - - return error; -} - -/* - * cp_setup_newentry - * - * Generate a keyless cprotect structure for use with the new AppleKeyStore kext. - * Since the kext is now responsible for vending us both wrapped/unwrapped keys - * we need to create a keyless xattr upon file / directory creation. When we have the inode value - * and the file/directory is established, then we can ask it to generate keys. Note that - * this introduces a potential race; If the device is locked and the wrapping - * keys are purged between the time we call this function and the time we ask it to generate - * keys for us, we could have to fail the open(2) call and back out the entry. 
- */ - -int cp_setup_newentry (struct hfsmount *hfsmp, struct cnode *dcp, - cp_key_class_t suppliedclass, mode_t cmode, - struct cprotect **tmpentry) -{ - int isdir = 0; - struct cprotect *entry = NULL; - uint32_t target_class = hfsmp->default_cp_class; - suppliedclass = CP_CLASS(suppliedclass); - - if (hfsmp->hfs_running_cp_major_vers == 0) { - panic ("CP: major vers not set in mount!"); - } - - if (S_ISDIR (cmode)) { - isdir = 1; - } - - /* Decide the target class. Input argument takes priority. */ - if (cp_is_valid_class (isdir, suppliedclass)) { - /* caller supplies -1 if it was not specified so we will default to the mount point value */ - target_class = suppliedclass; - /* - * One exception, F is never valid for a directory - * because its children may inherit and userland will be - * unable to read/write to the files. - */ - if (isdir) { - if (target_class == PROTECTION_CLASS_F) { - *tmpentry = NULL; - return EINVAL; - } - } - } - else { - /* - * If no valid class was supplied, behave differently depending on whether or not - * the item being created is a file or directory. - * - * for FILE: - * If parent directory has a non-zero class, use that. - * If parent directory has a zero class (not set), then attempt to - * apply the mount point default. - * - * for DIRECTORY: - * Directories always inherit from the parent; if the parent - * has a NONE class set, then we can continue to use that. 
- */ - if ((dcp) && (dcp->c_cpentry)) { - uint32_t parentclass = CP_CLASS(dcp->c_cpentry->cp_pclass); - /* If the parent class is not valid, default to the mount point value */ - if (cp_is_valid_class(1, parentclass)) { - if (isdir) { - target_class = parentclass; - } - else if (parentclass != PROTECTION_CLASS_DIR_NONE) { - /* files can inherit so long as it's not NONE */ - target_class = parentclass; - } - } - /* Otherwise, we already defaulted to the mount point's default */ - } - } - - /* Generate the cprotect to vend out */ - entry = cp_entry_alloc(NULL, 0, 0, NULL); - if (entry == NULL) { - *tmpentry = NULL; - return ENOMEM; - } - - /* - * We don't have keys yet, so fill in what we can. At this point - * this blob has no keys and it has no backing xattr. We just know the - * target class. - */ - entry->cp_flags = CP_NO_XATTR; - /* Note this is only the effective class */ - entry->cp_pclass = target_class; - *tmpentry = entry; - - return 0; -} - -/* - * Set up an initial key/class pair for a disassociated cprotect entry. - * This function is used to generate transient keys that will never be - * written to disk. We use class F for this since it provides the exact - * semantics that are needed here. Because we never attach this blob to - * a cnode directly, we take a pointer to the cprotect struct. - * - * This function is primarily used in the HFS FS truncation codepath - * where we may rely on AES symmetry to relocate encrypted data from - * one spot in the disk to another. - */ -int cpx_gentempkeys(cpx_t *pcpx, __unused struct hfsmount *hfsmp) -{ - cpx_t cpx = cpx_alloc(CP_MAX_KEYSIZE); - - cpx_set_key_len(cpx, CP_MAX_KEYSIZE); - read_random(cpx_key(cpx), CP_MAX_KEYSIZE); - cpx_set_use_offset_for_iv(cpx, true); - - *pcpx = cpx; - - return 0; -} - -/* - * Tear down and clear a cprotect blob for a closing file. - * Called at hfs_reclaim_cnode: cnode is locked exclusive. 
- */ -void -cp_entry_destroy(hfsmount_t *hfsmp, struct cprotect *entry_ptr) -{ - if (entry_ptr == NULL) { - /* nothing to clean up */ - return; - } - cp_entry_dealloc(hfsmp, entry_ptr); -} - - -int -cp_fs_protected (mount_t mnt) -{ - return (vfs_flags(mnt) & MNT_CPROTECT); -} - - -/* - * Return a pointer to underlying cnode if there is one for this vnode. - * Done without taking cnode lock, inspecting only vnode state. - */ -struct cnode * -cp_get_protected_cnode(struct vnode *vp) -{ - if (!cp_vnode_is_eligible(vp)) { - return NULL; - } - - if (!cp_fs_protected(VTOVFS(vp))) { - /* mount point doesn't support it */ - return NULL; - } - - return (struct cnode*) vp->v_data; -} - - -/* - * Sets *class to persistent class associated with vnode, - * or returns error. - */ -int -cp_vnode_getclass(struct vnode *vp, int *class) -{ - struct cprotect *entry; - int error = 0; - struct cnode *cp; - int took_truncate_lock = 0; - struct hfsmount *hfsmp = NULL; - - /* Is this an interesting vp? */ - if (!cp_vnode_is_eligible (vp)) { - return EBADF; - } - - /* Is the mount point formatted for content protection? */ - if (!cp_fs_protected(VTOVFS(vp))) { - return ENOTSUP; - } - - cp = VTOC(vp); - hfsmp = VTOHFS(vp); - - /* - * Take the truncate lock up-front in shared mode because we may need - * to manipulate the CP blob. Pend lock events until we're done here. - */ - hfs_lock_truncate (cp, HFS_SHARED_LOCK, HFS_LOCK_DEFAULT); - took_truncate_lock = 1; - - /* - * We take only the shared cnode lock up-front. If it turns out that - * we need to manipulate the CP blob to write a key out, drop the - * shared cnode lock and acquire an exclusive lock. 
- */ - error = hfs_lock(cp, HFS_SHARED_LOCK, HFS_LOCK_DEFAULT); - if (error) { - hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); - return error; - } - - /* pull the class from the live entry */ - entry = cp->c_cpentry; - - if (entry == NULL) { - panic("Content Protection: uninitialized cnode %p", cp); - } - - /* Note that we may not have keys yet, but we know the target class. */ - - if (error == 0) { - *class = CP_CLASS(entry->cp_pclass); - } - - if (took_truncate_lock) { - hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); - } - - hfs_unlock(cp); - return error; -} - -/* - * Sets persistent class for this file or directory. - * If vnode cannot be protected (system file, non-regular file, non-hfs), EBADF. - * If the new class can't be accessed now, EPERM. - * Otherwise, record class and re-wrap key if the mount point is content-protected. - */ -int -cp_vnode_setclass(struct vnode *vp, uint32_t newclass) -{ - struct cnode *cp; - struct cprotect *entry = 0; - int error = 0; - int took_truncate_lock = 0; - struct hfsmount *hfsmp = NULL; - int isdir = 0; - - if (vnode_isdir (vp)) { - isdir = 1; - } - - /* Ensure we only use the effective class here */ - newclass = CP_CLASS(newclass); - - if (!cp_is_valid_class(isdir, newclass)) { - printf("hfs: CP: cp_setclass called with invalid class %d\n", newclass); - return EINVAL; - } - - /* Is this an interesting vp? */ - if (!cp_vnode_is_eligible(vp)) { - return EBADF; - } - - /* Is the mount point formatted for content protection? */ - if (!cp_fs_protected(VTOVFS(vp))) { - return ENOTSUP; - } - - hfsmp = VTOHFS(vp); - if (hfsmp->hfs_flags & HFS_READ_ONLY) { - return EROFS; - } - - /* - * Take the cnode truncate lock exclusive because we want to manipulate the - * CP blob. The lock-event handling code is doing the same. This also forces - * all pending IOs to drain before we can re-write the persistent and cache keys. 
- */ - cp = VTOC(vp); - hfs_lock_truncate (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); - took_truncate_lock = 1; - - /* - * The truncate lock is not sufficient to guarantee the CP blob - * isn't being used. We must wait for existing writes to finish. - */ - vnode_waitforwrites(vp, 0, 0, 0, "cp_vnode_setclass"); - - if (hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT)) { - return EINVAL; - } - - entry = cp->c_cpentry; - if (entry == NULL) { - error = EINVAL; - goto out; - } - - /* - * re-wrap per-file key with new class. - * Generate an entirely new key if switching to F. - */ - if (vnode_isreg(vp)) { - /* - * The vnode is a file. Before proceeding with the re-wrap, we need - * to unwrap the keys before proceeding. This is to ensure that - * the destination class's properties still work appropriately for the - * target class (since B allows I/O but an unwrap prior to the next unlock - * will not be allowed). - */ - if (!cpx_has_key(&entry->cp_keys.cpkp_cpx)) { - error = cp_restore_keys (entry, hfsmp, cp); - if (error) { - goto out; - } - } - - if (newclass == PROTECTION_CLASS_F) { - /* Verify that file is blockless if switching to class F */ - if (cp->c_datafork->ff_size > 0) { - error = EINVAL; - goto out; - } - - cp_key_pair_t *cpkp; - cprotect_t new_entry = cp_entry_alloc(NULL, 0, CP_MAX_KEYSIZE, &cpkp); - - if (!new_entry) { - error = ENOMEM; - goto out; - } - - /* newclass is only the effective class */ - new_entry->cp_pclass = newclass; - new_entry->cp_key_os_version = cp_os_version(); - new_entry->cp_key_revision = cp_next_key_revision(entry->cp_key_revision); - - cpx_t cpx = cpkp_cpx(cpkp); - - /* Class F files are not wrapped, so they continue to use MAX_KEYSIZE */ - cpx_set_key_len(cpx, CP_MAX_KEYSIZE); - read_random (cpx_key(cpx), CP_MAX_KEYSIZE); - - cp_replace_entry(hfsmp, cp, new_entry); - - error = 0; - goto out; - } - - /* Deny the setclass if file is to be moved from F to something else */ - if (entry->cp_pclass == PROTECTION_CLASS_F) { - error = 
EPERM; - goto out; - } - - if (!cpkp_has_pers_key(&entry->cp_keys)) { - struct cprotect *new_entry = NULL; - /* - * We want to fail if we can't wrap to the target class. By not setting - * CP_KEYWRAP_DIFFCLASS, we tell keygeneration that if it can't wrap - * to 'newclass' then error out. - */ - uint32_t flags = 0; - error = cp_generate_keys (hfsmp, cp, newclass, flags, &new_entry); - if (error == 0) { - cp_replace_entry (hfsmp, cp, new_entry); - } - /* Bypass the setxattr code below since generate_keys does it for us */ - goto out; - } - - cprotect_t new_entry; - error = cp_rewrap(cp, hfsmp, &newclass, &entry->cp_keys, entry, - (cp_new_alloc_fn)cp_entry_alloc, (void **)&new_entry); - if (error) { - /* we didn't have perms to set this class. leave file as-is and error out */ - goto out; - } - - - new_entry->cp_pclass = newclass; - - cp_replace_entry(hfsmp, cp, new_entry); - entry = new_entry; - } - else if (vnode_isdir(vp)) { - /* For directories, just update the pclass. newclass is only effective class */ - entry->cp_pclass = newclass; - error = 0; - } - else { - /* anything else, just error out */ - error = EINVAL; - goto out; - } - - /* - * We get here if the new class was F, or if we were re-wrapping a cprotect that already - * existed. If the keys were never generated, then they'll skip the setxattr calls. - */ - - error = cp_setxattr(cp, cp->c_cpentry, VTOHFS(vp), 0, XATTR_REPLACE); - if (error == ENOATTR) { - error = cp_setxattr(cp, cp->c_cpentry, VTOHFS(vp), 0, XATTR_CREATE); - } - -out: - - if (took_truncate_lock) { - hfs_unlock_truncate (cp, HFS_LOCK_DEFAULT); - } - hfs_unlock(cp); - return error; -} - - -int cp_vnode_transcode(vnode_t vp, void *key, unsigned *len) -{ - struct cnode *cp; - struct cprotect *entry = 0; - int error = 0; - int took_truncate_lock = 0; - struct hfsmount *hfsmp = NULL; - - /* Structures passed between HFS and AKS */ - cp_cred_s access_in; - cp_wrapped_key_s wrapped_key_in, wrapped_key_out; - - /* Is this an interesting vp? 
*/ - if (!cp_vnode_is_eligible(vp)) { - return EBADF; - } - - /* Is the mount point formatted for content protection? */ - if (!cp_fs_protected(VTOVFS(vp))) { - return ENOTSUP; - } - - cp = VTOC(vp); - hfsmp = VTOHFS(vp); - - /* - * Take the cnode truncate lock exclusive because we want to manipulate the - * CP blob. The lock-event handling code is doing the same. This also forces - * all pending IOs to drain before we can re-write the persistent and cache keys. - */ - hfs_lock_truncate (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); - took_truncate_lock = 1; - - if (hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT)) { - return EINVAL; - } - - entry = cp->c_cpentry; - if (entry == NULL) { - error = EINVAL; - goto out; - } - - /* Send the per-file key in wrapped form for re-wrap with the current class information - * Send NULLs in the output parameters of the wrapper() and AKS will do the rest. - * Don't need to process any outputs, so just clear the locks and pass along the error. */ - if (vnode_isreg(vp)) { - - /* Picked up the following from cp_wrap(). - * If needed, more comments available there. 
*/ - - if (CP_CLASS(entry->cp_pclass) == PROTECTION_CLASS_F) { - error = EINVAL; - goto out; - } - - cp_init_access(&access_in, cp); - - bzero(&wrapped_key_in, sizeof(wrapped_key_in)); - bzero(&wrapped_key_out, sizeof(wrapped_key_out)); - - cp_key_pair_t *cpkp = &entry->cp_keys; - - - wrapped_key_in.key = cpkp_pers_key(cpkp); - wrapped_key_in.key_len = cpkp_pers_key_len(cpkp); - - if (!wrapped_key_in.key_len) { - error = EINVAL; - goto out; - } - - /* Use the actual persistent class when talking to AKS */ - wrapped_key_in.dp_class = entry->cp_pclass; - wrapped_key_out.key = key; - wrapped_key_out.key_len = *len; - - error = g_cp_wrap_func.backup_key(&access_in, - &wrapped_key_in, - &wrapped_key_out); - - if(error) - error = EPERM; - else - *len = wrapped_key_out.key_len; - } - -out: - if (took_truncate_lock) { - hfs_unlock_truncate (cp, HFS_LOCK_DEFAULT); - } - hfs_unlock(cp); - return error; -} - - -/* - * Check permission for the given operation (read, write) on this node. - * Additionally, if the node needs work, do it: - * - create a new key for the file if one hasn't been set before - * - write out the xattr if it hasn't already been saved - * - unwrap the key if needed - * - * Takes cnode lock, and upgrades to exclusive if modifying cprotect. - * - * Note that this function does *NOT* take the cnode truncate lock. This is because - * the thread calling us may already have the truncate lock. It is not necessary - * because either we successfully finish this function before the keys are tossed - * and the IO will fail, or the keys are tossed and then this function will fail. - * Either way, the cnode lock still ultimately guards the keys. We only rely on the - * truncate lock to protect us against tossing the keys as a cluster call is in-flight. 
- */ -int -cp_handle_vnop(struct vnode *vp, int vnop, int ioflag) -{ - struct cprotect *entry; - int error = 0; - struct hfsmount *hfsmp = NULL; - struct cnode *cp = NULL; - - /* - * First, do validation against the vnode before proceeding any further: - * Is this vnode originating from a valid content-protected filesystem ? - */ - if (cp_vnode_is_eligible(vp) == 0) { - /* - * It is either not HFS or not a file/dir. Just return success. This is a valid - * case if servicing i/o against another filesystem type from VFS - */ - return 0; - } - - if (cp_fs_protected (VTOVFS(vp)) == 0) { - /* - * The underlying filesystem does not support content protection. This is also - * a valid case. Simply return success. - */ - return 0; - } - - /* - * At this point, we know we have a HFS vnode that backs a file or directory on a - * filesystem that supports content protection - */ - cp = VTOC(vp); - - if ((error = hfs_lock(cp, HFS_SHARED_LOCK, HFS_LOCK_DEFAULT))) { - return error; - } - - entry = cp->c_cpentry; - - if (entry == NULL) { - /* - * If this cnode is not content protected, simply return success. - * Note that this function is called by all I/O-based call sites - * when CONFIG_PROTECT is enabled during XNU building. - */ - - /* - * All files should have cprotect structs. 
It's possible to encounter - * a directory from a V2.0 CP system but all files should have protection - * EAs - */ - if (vnode_isreg(vp)) { - error = EPERM; - } - - goto out; - } - - vp = CTOV(cp, 0); - if (vp == NULL) { - /* is it a rsrc */ - vp = CTOV(cp,1); - if (vp == NULL) { - error = EINVAL; - goto out; - } - } - hfsmp = VTOHFS(vp); - - if ((error = cp_check_access(cp, hfsmp, vnop))) { - /* check for raw encrypted access before bailing out */ - if ((ioflag & IO_ENCRYPTED) - && (vnop == CP_READ_ACCESS)) { - /* - * read access only + asking for the raw encrypted bytes - * is legitimate, so reset the error value to 0 - */ - error = 0; - } - else { - goto out; - } - } - - if (!ISSET(entry->cp_flags, CP_NO_XATTR)) { - if (!S_ISREG(cp->c_mode)) - goto out; - - // If we have a persistent key and the cached key, we're done - if (!cp_needs_pers_key(entry) - && cpx_has_key(cpkp_cpx(&entry->cp_keys))) { - goto out; - } - } - - /* upgrade to exclusive lock */ - if (lck_rw_lock_shared_to_exclusive(&cp->c_rwlock) == FALSE) { - if ((error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) { - return error; - } - } else { - cp->c_lockowner = current_thread(); - } - - /* generate new keys if none have ever been saved */ - if (cp_needs_pers_key(entry)) { - struct cprotect *newentry = NULL; - /* - * It's ok if this ends up being wrapped in a different class than 'pclass'. - * class modification is OK here. 
- */ - uint32_t flags = CP_KEYWRAP_DIFFCLASS; - - error = cp_generate_keys (hfsmp, cp, CP_CLASS(cp->c_cpentry->cp_pclass), flags, &newentry); - if (error == 0) { - cp_replace_entry (hfsmp, cp, newentry); - entry = newentry; - } - else { - goto out; - } - } - - /* unwrap keys if needed */ - if (!cpx_has_key(cpkp_cpx(&entry->cp_keys))) { - if ((vnop == CP_READ_ACCESS) && (ioflag & IO_ENCRYPTED)) { - /* no need to try to restore keys; they are not going to be used */ - error = 0; - } - else { - error = cp_restore_keys(entry, hfsmp, cp); - if (error) { - goto out; - } - } - } - - /* write out the xattr if it's new */ - if (entry->cp_flags & CP_NO_XATTR) - error = cp_setxattr(cp, entry, VTOHFS(cp->c_vp), 0, XATTR_CREATE); - -out: - - hfs_unlock(cp); - return error; -} - -#if HFS_TMPDBG -#if !SECURE_KERNEL -static void cp_log_eperm (struct vnode* vp, int pclass, boolean_t create) { - char procname[256] = {}; - const char *fname = "unknown"; - const char *dbgop = "open"; - - int ppid = proc_selfpid(); - /* selfname does a strlcpy so we're OK */ - proc_selfname(procname, sizeof(procname)); - if (vp && vp->v_name) { - /* steal from the namecache */ - fname = vp->v_name; - } - - if (create) { - dbgop = "create"; - } - - printf("proc %s (pid %d) class %d, op: %s failure @ file %s\n", procname, ppid, pclass, dbgop, fname); -} -#endif -#endif - - -int -cp_handle_open(struct vnode *vp, int mode) -{ - struct cnode *cp = NULL ; - struct cprotect *entry = NULL; - struct hfsmount *hfsmp; - int error = 0; - - /* If vnode not eligible, just return success */ - if (!cp_vnode_is_eligible(vp)) { - return 0; - } - - /* If mount point not properly set up, then also return success */ - if (!cp_fs_protected(VTOVFS(vp))) { - return 0; - } - - cp = VTOC(vp); - - // Allow if raw encrypted mode requested - if (ISSET(mode, FENCRYPTED)) { - return 0; - } - if (ISSET(mode, FUNENCRYPTED)) { - return 0; - } - - /* We know the vnode is in a valid state. 
Acquire cnode and validate */ - hfsmp = VTOHFS(vp); - - if ((error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) { - return error; - } - - entry = cp->c_cpentry; - if (entry == NULL) { - /* - * If the mount is protected and we couldn't get a cprotect for this vnode, - * then it's not valid for opening. - */ - if (vnode_isreg(vp)) { - error = EPERM; - } - goto out; - } - - if (!S_ISREG(cp->c_mode)) - goto out; - - /* - * Does the cnode have keys yet? If not, then generate them. - */ - if (cp_needs_pers_key(entry)) { - struct cprotect *newentry = NULL; - /* Allow the keybag to override our class preferences */ - uint32_t flags = CP_KEYWRAP_DIFFCLASS; - error = cp_generate_keys (hfsmp, cp, CP_CLASS(cp->c_cpentry->cp_pclass), flags, &newentry); - if (error == 0) { - cp_replace_entry (hfsmp, cp, newentry); - entry = newentry; - } - else { - goto out; - } - } - - /* - * We want to minimize the number of unwraps that we'll have to do since - * the cost can vary, depending on the platform we're running. - */ - switch (CP_CLASS(entry->cp_pclass)) { - case PROTECTION_CLASS_B: - if (mode & O_CREAT) { - /* - * Class B always allows creation. Since O_CREAT was passed through - * we infer that this was a newly created vnode/cnode. Even though a potential - * race exists when multiple threads attempt to create/open a particular - * file, only one can "win" and actually create it. VFS will unset the - * O_CREAT bit on the loser. - * - * Note that skipping the unwrap check here is not a security issue -- - * we have to unwrap the key permanently upon the first I/O. - */ - break; - } - - if (cpx_has_key(cpkp_cpx(&entry->cp_keys)) && !ISSET(mode, FENCRYPTED)) { - /* - * For a class B file, attempt the unwrap if we have the key in - * core already. - * The device could have just transitioned into the lock state, and - * this vnode may not yet have been purged from the vnode cache (which would - * remove the keys). 
- */ - cp_cred_s access_in; - cp_wrapped_key_s wrapped_key_in; - - cp_init_access(&access_in, cp); - bzero(&wrapped_key_in, sizeof(wrapped_key_in)); - wrapped_key_in.key = cpkp_pers_key(&entry->cp_keys); - wrapped_key_in.key_len = cpkp_pers_key_len(&entry->cp_keys); - /* Use the persistent class when talking to AKS */ - wrapped_key_in.dp_class = entry->cp_pclass; - error = g_cp_wrap_func.unwrapper(&access_in, &wrapped_key_in, NULL); - if (error) { - error = EPERM; - } - break; - } - /* otherwise, fall through to attempt the unwrap/restore */ - case PROTECTION_CLASS_A: - case PROTECTION_CLASS_C: - /* - * At this point, we know that we need to attempt an unwrap if needed; we want - * to makes sure that open(2) fails properly if the device is either just-locked - * or never made it past first unlock. Since the keybag serializes access to the - * unwrapping keys for us and only calls our VFS callback once they've been purged, - * we will get here in two cases: - * - * A) we're in a window before the wrapping keys are purged; this is OK since when they get - * purged, the vnode will get flushed if needed. - * - * B) The keys are already gone. In this case, the restore_keys call below will fail. - * - * Since this function is bypassed entirely if we're opening a raw encrypted file, - * we can always attempt the restore. - */ - if (!cpx_has_key(cpkp_cpx(&entry->cp_keys))) { - error = cp_restore_keys(entry, hfsmp, cp); - } - - if (error) { - error = EPERM; - } - - break; - - case PROTECTION_CLASS_D: - default: - break; - } - -out: - -#if HFS_TMPDBG -#if !SECURE_KERNEL - if ((hfsmp->hfs_cp_verbose) && (error == EPERM)) { - cp_log_eperm (vp, CP_CLASS(entry->cp_pclass), false); - } -#endif -#endif - - hfs_unlock(cp); - return error; -} - - -/* - * cp_getrootxattr: - * Gets the EA we set on the root folder (fileid 1) to get information about the - * version of Content Protection that was used to write to this filesystem. 
- * Note that all multi-byte fields are written to disk little endian so they must be - * converted to native endian-ness as needed. - */ -int -cp_getrootxattr(struct hfsmount* hfsmp, struct cp_root_xattr *outxattr) -{ - uio_t auio; - char uio_buf[UIO_SIZEOF(1)]; - void *buf; - - /* - * We allow for an extra 64 bytes to cater for upgrades. This wouldn't - * be necessary if the xattr routines just returned what we asked for. - */ - size_t attrsize = roundup(sizeof(struct cp_root_xattr) + 64, 64); - - int error = 0; - struct vnop_getxattr_args args; - - if (!outxattr) { - panic("Content Protection: cp_xattr called with xattr == NULL"); - } - - MALLOC(buf, void *, attrsize, M_TEMP, M_WAITOK); - - auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_READ, &uio_buf[0], sizeof(uio_buf)); - uio_addiov(auio, CAST_USER_ADDR_T(buf), attrsize); - - args.a_desc = NULL; // unused - args.a_vp = NULL; //unused since we're writing EA to root folder. - args.a_name = CONTENT_PROTECTION_XATTR_NAME; - args.a_uio = auio; - args.a_size = &attrsize; - args.a_options = XATTR_REPLACE; - args.a_context = NULL; // unused - - error = hfs_getxattr_internal(NULL, &args, hfsmp, 1); - - if (error != 0) { - goto out; - } - - if (attrsize < CP_ROOT_XATTR_MIN_LEN) { - error = HFS_EINCONSISTENT; - goto out; - } - - const struct cp_root_xattr *xattr = buf; - - bzero(outxattr, sizeof(*outxattr)); - - /* Now convert the multi-byte fields to native endianness */ - outxattr->major_version = OSSwapLittleToHostInt16(xattr->major_version); - outxattr->minor_version = OSSwapLittleToHostInt16(xattr->minor_version); - outxattr->flags = OSSwapLittleToHostInt64(xattr->flags); - - if (outxattr->major_version >= CP_VERS_5) { - if (attrsize < sizeof(struct cp_root_xattr)) { - error = HFS_EINCONSISTENT; - goto out; - } - } - -out: - uio_free(auio); - FREE(buf, M_TEMP); - return error; -} - -/* - * cp_setrootxattr: - * Sets the EA we set on the root folder (fileid 1) to get information about the - * version of Content 
Protection that was used to write to this filesystem. - * Note that all multi-byte fields are written to disk little endian so they must be - * converted to little endian as needed. - * - * This will be written to the disk when it detects the EA is not there, or when we need - * to make a modification to the on-disk version that can be done in-place. - */ -int -cp_setrootxattr(struct hfsmount *hfsmp, struct cp_root_xattr *newxattr) -{ - int error = 0; - struct vnop_setxattr_args args; - - args.a_desc = NULL; - args.a_vp = NULL; - args.a_name = CONTENT_PROTECTION_XATTR_NAME; - args.a_uio = NULL; //pass data ptr instead - args.a_options = 0; - args.a_context = NULL; //no context needed, only done from mount. - - const uint32_t flags = newxattr->flags; - - /* Now convert the multi-byte fields to little endian before writing to disk. */ - newxattr->flags = OSSwapHostToLittleInt64(newxattr->flags); - - int xattr_size = sizeof(struct cp_root_xattr); - - - newxattr->major_version = OSSwapHostToLittleInt16(newxattr->major_version); - newxattr->minor_version = OSSwapHostToLittleInt16(newxattr->minor_version); - - error = hfs_setxattr_internal(NULL, (caddr_t)newxattr, - xattr_size, &args, hfsmp, 1); - - if (!error) { - hfsmp->cproot_flags = flags; - } - - return error; -} - - -/* - * Stores new xattr data on the cnode. - * cnode lock held exclusive (if available). - * - * This function is also invoked during file creation. 
- */ -int cp_setxattr(struct cnode *cp, struct cprotect *entry, struct hfsmount *hfsmp, - uint32_t fileid, int options) -{ - int error = 0; - cp_key_pair_t *cpkp = &entry->cp_keys; - - if (hfsmp->hfs_flags & HFS_READ_ONLY) { - return EROFS; - } - - if (hfsmp->hfs_running_cp_major_vers < CP_CURRENT_VERS) { - // Upgrade - printf("hfs: upgrading to cp version %u\n", CP_CURRENT_VERS); - - struct cp_root_xattr root_xattr; - - error = cp_getrootxattr(hfsmp, &root_xattr); - if (error) - return error; - - root_xattr.major_version = CP_CURRENT_VERS; - root_xattr.minor_version = CP_MINOR_VERS; - - error = cp_setrootxattr(hfsmp, &root_xattr); - if (error) - return error; - - hfsmp->hfs_running_cp_major_vers = CP_CURRENT_VERS; - } - - struct cp_xattr_v5 *xattr; - MALLOC(xattr, struct cp_xattr_v5 *, sizeof(*xattr), M_TEMP, M_WAITOK); - - xattr->xattr_major_version = OSSwapHostToLittleConstInt16(CP_VERS_5); - xattr->xattr_minor_version = OSSwapHostToLittleConstInt16(CP_MINOR_VERS); - xattr->flags = 0; - xattr->persistent_class = OSSwapHostToLittleInt32(entry->cp_pclass); - xattr->key_os_version = OSSwapHostToLittleInt32(entry->cp_key_os_version); - xattr->key_revision = OSSwapHostToLittleInt16(entry->cp_key_revision); - - uint16_t key_len = cpkp_pers_key_len(cpkp); - xattr->key_len = OSSwapHostToLittleInt16(key_len); - memcpy(xattr->persistent_key, cpkp_pers_key(cpkp), key_len); - - size_t xattr_len = offsetof(struct cp_xattr_v5, persistent_key) + key_len; - - - struct vnop_setxattr_args args = { - .a_vp = cp ? 
cp->c_vp : NULL, - .a_name = CONTENT_PROTECTION_XATTR_NAME, - .a_options = options, - .a_context = vfs_context_current(), - }; - - error = hfs_setxattr_internal(cp, xattr, xattr_len, &args, hfsmp, fileid); - - FREE(xattr, M_TEMP); - - if (error == 0 ) { - entry->cp_flags &= ~CP_NO_XATTR; - } - - return error; -} - -/* - * Used by an fcntl to query the underlying FS for its content protection version # - */ - -int -cp_get_root_major_vers(vnode_t vp, uint32_t *level) -{ - int err = 0; - struct hfsmount *hfsmp = NULL; - struct mount *mp = NULL; - - mp = VTOVFS(vp); - - /* check if it supports content protection */ - if (cp_fs_protected(mp) == 0) { - return ENOTSUP; - } - - hfsmp = VFSTOHFS(mp); - /* figure out the level */ - - err = cp_root_major_vers(mp); - - if (err == 0) { - *level = hfsmp->hfs_running_cp_major_vers; - } - /* in error case, cp_root_major_vers will just return EINVAL. Use that */ - - return err; -} - -/* Used by fcntl to query default protection level of FS */ -int cp_get_default_level (struct vnode *vp, uint32_t *level) { - int err = 0; - struct hfsmount *hfsmp = NULL; - struct mount *mp = NULL; - - mp = VTOVFS(vp); - - /* check if it supports content protection */ - if (cp_fs_protected(mp) == 0) { - return ENOTSUP; - } - - hfsmp = VFSTOHFS(mp); - /* figure out the default */ - - *level = hfsmp->default_cp_class; - return err; -} - -/******************** - * Private Functions - *******************/ - -static int -cp_root_major_vers(mount_t mp) -{ - int err = 0; - struct cp_root_xattr xattr; - struct hfsmount *hfsmp = NULL; - - hfsmp = vfs_fsprivate(mp); - err = cp_getrootxattr (hfsmp, &xattr); - - if (err == 0) { - hfsmp->hfs_running_cp_major_vers = xattr.major_version; - } - else { - return EINVAL; - } - - return 0; -} - -static int -cp_vnode_is_eligible(struct vnode *vp) -{ - return ((vp->v_op == hfs_vnodeop_p) && - (!vnode_issystem(vp)) && - (vnode_isreg(vp) || vnode_isdir(vp))); -} - - - -int -cp_is_valid_class(int isdir, int32_t 
protectionclass) -{ - /* - * The valid protection classes are from 0 -> N - * We use a signed argument to detect unassigned values from - * directory entry creation time in HFS. - */ - if (isdir) { - /* Directories are not allowed to have F, but they can have "NONE" */ - return ((protectionclass >= PROTECTION_CLASS_DIR_NONE) && - (protectionclass <= PROTECTION_CLASS_D)); - } - else { - return ((protectionclass >= PROTECTION_CLASS_A) && - (protectionclass <= PROTECTION_CLASS_F)); - } -} - -#if DEBUG -static const uint32_t cp_magic1 = 0x7b727063; // cpr{ -static const uint32_t cp_magic2 = 0x7270637d; // }cpr -#endif - -struct cprotect * -cp_entry_alloc(cprotect_t old, uint16_t pers_key_len, - uint16_t cached_key_len, cp_key_pair_t **pcpkp) -{ - struct cprotect *cp_entry; - - if (pers_key_len > CP_MAX_WRAPPEDKEYSIZE) - return (NULL); - - size_t size = (sizeof(struct cprotect) - sizeof(cp_key_pair_t) - + cpkp_size(pers_key_len, cached_key_len)); - -#if DEBUG - size += 4; // Extra for magic2 -#endif - - MALLOC(cp_entry, struct cprotect *, size, M_TEMP, M_WAITOK); - - if (old) { - memcpy(cp_entry, old, offsetof(struct cprotect, cp_keys)); - - } else { - bzero(cp_entry, offsetof(struct cprotect, cp_keys)); - } - -#if DEBUG - cp_entry->cp_magic1 = cp_magic1; - *PTR_ADD(uint32_t *, cp_entry, size - 4) = cp_magic2; -#endif - - cpkp_init(&cp_entry->cp_keys, pers_key_len, cached_key_len); - - /* - * If we've been passed the old entry, then we are in the process of - * rewrapping in which case we need to copy the cached key. This is - * important for class B files when the device is locked because we - * won't be able to unwrap whilst in this state, yet we still need the - * unwrapped key. 
- */ - if (old) - cpx_copy(cpkp_cpx(&old->cp_keys), cpkp_cpx(&cp_entry->cp_keys)); - - if (pcpkp) - *pcpkp = &cp_entry->cp_keys; - - return cp_entry; -} - -static void -cp_entry_dealloc(__unused hfsmount_t *hfsmp, struct cprotect *entry) -{ - - cpkp_flush(&entry->cp_keys); - -#if DEBUG - assert(entry->cp_magic1 == cp_magic1); - assert(*PTR_ADD(uint32_t *, entry, (sizeof(struct cprotect) - sizeof(cp_key_pair_t) - + cpkp_sizex(&entry->cp_keys) == cp_magic2))); -#endif - - FREE(entry, M_TEMP); -} - -static int cp_read_xattr_v4(__unused hfsmount_t *hfsmp, struct cp_xattr_v4 *xattr, - size_t xattr_len, cprotect_t *pcpr, cp_getxattr_options_t options) -{ - /* Endian swap the multi-byte fields into host endianness from L.E. */ - xattr->xattr_major_version = OSSwapLittleToHostInt16(xattr->xattr_major_version); - xattr->xattr_minor_version = OSSwapLittleToHostInt16(xattr->xattr_minor_version); - xattr->key_size = OSSwapLittleToHostInt32(xattr->key_size); - xattr->flags = OSSwapLittleToHostInt32(xattr->flags); - xattr->persistent_class = OSSwapLittleToHostInt32(xattr->persistent_class); - xattr->key_os_version = OSSwapLittleToHostInt32(xattr->key_os_version); - - /* - * Prevent a buffer overflow, and validate the key length obtained from the - * EA. If it's too big, then bail out, because the EA can't be trusted at this - * point. - */ - if (xattr->key_size > CP_MAX_WRAPPEDKEYSIZE) - return HFS_EINCONSISTENT; - - size_t min_len = offsetof(struct cp_xattr_v4, persistent_key) + xattr->key_size; - if (xattr_len < min_len) - return HFS_EINCONSISTENT; - - /* - * Class F files have no backing key; their keylength should be 0, - * though they should have the proper flags set. - * - * A request to instantiate a CP for a class F file should result - * in a bzero'd cp that just says class F, with key_flushed set. 
- */ - if (CP_CLASS(xattr->persistent_class) == PROTECTION_CLASS_F - || ISSET(xattr->flags, CP_XAF_NEEDS_KEYS)) { - xattr->key_size = 0; - } - - /* set up entry with information from xattr */ - cp_key_pair_t *cpkp; - cprotect_t entry; - - if (ISSET(options, CP_GET_XATTR_BASIC_INFO)) { - /* caller passed in a pre-allocated structure to get the basic info */ - entry = *pcpr; - bzero(entry, offsetof(struct cprotect, cp_keys)); - } - else { - entry = cp_entry_alloc(NULL, xattr->key_size, CP_MAX_CACHEBUFLEN, &cpkp); - } - - entry->cp_pclass = xattr->persistent_class; - entry->cp_key_os_version = xattr->key_os_version; - - - if (!ISSET(options, CP_GET_XATTR_BASIC_INFO)) { - if (xattr->key_size) { - cpkp_set_pers_key_len(cpkp, xattr->key_size); - memcpy(cpkp_pers_key(cpkp), xattr->persistent_key, xattr->key_size); - } - - *pcpr = entry; - } - else if (xattr->key_size) { - SET(entry->cp_flags, CP_HAS_A_KEY); - } - - return 0; -} - -int cp_read_xattr_v5(hfsmount_t *hfsmp, struct cp_xattr_v5 *xattr, - size_t xattr_len, cprotect_t *pcpr, cp_getxattr_options_t options) -{ - if (xattr->xattr_major_version == OSSwapHostToLittleConstInt16(CP_VERS_4)) { - return cp_read_xattr_v4(hfsmp, (struct cp_xattr_v4 *)xattr, xattr_len, pcpr, options); - } - - xattr->xattr_major_version = OSSwapLittleToHostInt16(xattr->xattr_major_version); - - if (xattr->xattr_major_version != CP_VERS_5) { - printf("hfs: cp_getxattr: unsupported xattr version %d\n", - xattr->xattr_major_version); - return ENOTSUP; - } - - size_t min_len = offsetof(struct cp_xattr_v5, persistent_key); - - if (xattr_len < min_len) - return HFS_EINCONSISTENT; - - xattr->xattr_minor_version = OSSwapLittleToHostInt16(xattr->xattr_minor_version); - xattr->flags = OSSwapLittleToHostInt32(xattr->flags); - xattr->persistent_class = OSSwapLittleToHostInt32(xattr->persistent_class); - xattr->key_os_version = OSSwapLittleToHostInt32(xattr->key_os_version); - xattr->key_revision = OSSwapLittleToHostInt16(xattr->key_revision); - 
xattr->key_len = OSSwapLittleToHostInt16(xattr->key_len); - - uint16_t pers_key_len = xattr->key_len; - - min_len += pers_key_len; - if (xattr_len < min_len) - return HFS_EINCONSISTENT; - - - cp_key_pair_t *cpkp; - cprotect_t entry; - - /* - * If option CP_GET_XATTR_BASIC_INFO is set, we only return basic - * information about the file's protection (and not the key) and - * we store the result in the structure the caller passed to us. - */ - if (ISSET(options, CP_GET_XATTR_BASIC_INFO)) { - entry = *pcpr; - bzero(entry, offsetof(struct cprotect, cp_keys)); - } else { - entry = cp_entry_alloc(NULL, xattr->key_len, CP_MAX_CACHEBUFLEN, &cpkp); - } - - entry->cp_pclass = xattr->persistent_class; - entry->cp_key_os_version = xattr->key_os_version; - entry->cp_key_revision = xattr->key_revision; - - if (!ISSET(options, CP_GET_XATTR_BASIC_INFO)) { - if (xattr->key_len) { - cpkp_set_pers_key_len(cpkp, xattr->key_len); - memcpy(cpkp_pers_key(cpkp), xattr->persistent_key, xattr->key_len); - } - - - *pcpr = entry; - } - else if (xattr->key_len) { - SET(entry->cp_flags, CP_HAS_A_KEY); - } - - return 0; -} - -/* - * Initializes a new cprotect entry with xattr data from the cnode. 
- * cnode lock held shared - */ -static int -cp_getxattr(struct cnode *cp, struct hfsmount *hfsmp, cprotect_t *outentry) -{ - size_t xattr_len = sizeof(struct cp_xattr_v5); - struct cp_xattr_v5 *xattr; - - MALLOC (xattr, struct cp_xattr_v5 *, xattr_len, - M_TEMP, M_WAITOK); - - int error = hfs_xattr_read(cp->c_vp, CONTENT_PROTECTION_XATTR_NAME, - xattr, &xattr_len); - - if (!error) { - if (xattr_len < CP_XATTR_MIN_LEN) - error = HFS_EINCONSISTENT; - else - error = cp_read_xattr_v5(hfsmp, xattr, xattr_len, outentry, 0); - } - -#if DEBUG - if (error && error != ENOATTR) { - printf("cp_getxattr: bad cp xattr (%d):\n", error); - for (size_t i = 0; i < xattr_len; ++i) - printf("%02x ", ((uint8_t *)xattr)[i]); - printf("\n"); - } -#endif - - FREE(xattr, M_TEMP); - - return error; -} - -/* - * If permitted, restore entry's unwrapped key from the persistent key. - * If not, clear key and set CP_KEY_FLUSHED. - * cnode lock held exclusive - */ -static int -cp_restore_keys(struct cprotect *entry, struct hfsmount *hfsmp, struct cnode *cp) -{ - int error = 0; - - error = cp_unwrap(hfsmp, entry, cp); - if (error) { - cp_flush_cached_keys(entry); - error = EPERM; - } - return error; -} - -static int -cp_lock_vfs_callback(mount_t mp, void *arg) -{ - - /* Use a pointer-width integer field for casting */ - unsigned long new_state; - struct hfsmount *hfsmp; - - /* - * When iterating the various mount points that may - * be present on a content-protected device, we need to skip - * those that do not have it enabled. - */ - if (!cp_fs_protected(mp)) { - return 0; - } - new_state = (unsigned long) arg; - - hfsmp = VFSTOHFS(mp); - - hfs_lock_mount(hfsmp); - /* this loses all of the upper bytes of precision; that's OK */ - hfsmp->hfs_cp_lock_state = (uint8_t) new_state; - hfs_unlock_mount(hfsmp); - - if (new_state == CP_LOCKED_STATE) { - /* - * We respond only to lock events. 
Since cprotect structs - * decrypt/restore keys lazily, the unlock events don't - * actually cause anything to happen. - */ - return vnode_iterate(mp, 0, cp_lock_vnode_callback, arg); - } - /* Otherwise just return 0. */ - return 0; - -} - - -/* - * Deny access to protected files if keys have been locked. - */ -static int -cp_check_access(struct cnode *cp, struct hfsmount *hfsmp, int vnop __unused) -{ - int error = 0; - - /* - * For now it's OK to examine the state variable here without - * holding the HFS lock. This is only a short-circuit; if the state - * transitions (or is in transition) after we examine this field, we'd - * have to handle that anyway. - */ - if (hfsmp->hfs_cp_lock_state == CP_UNLOCKED_STATE) { - return 0; - } - - if (!cp->c_cpentry) { - /* unprotected node */ - return 0; - } - - if (!S_ISREG(cp->c_mode)) { - return 0; - } - - /* Deny all access for class A files */ - switch (CP_CLASS(cp->c_cpentry->cp_pclass)) { - case PROTECTION_CLASS_A: { - error = EPERM; - break; - } - default: - error = 0; - break; - } - - return error; -} - -/* - * Respond to a lock or unlock event. - * On lock: clear out keys from memory, then flush file contents. - * On unlock: nothing (function not called). - */ -static int -cp_lock_vnode_callback(struct vnode *vp, void *arg) -{ - cnode_t *cp = NULL; - struct cprotect *entry = NULL; - int error = 0; - int locked = 1; - unsigned long action = 0; - int took_truncate_lock = 0; - - error = vnode_getwithref (vp); - if (error) { - return error; - } - - cp = VTOC(vp); - - /* - * When cleaning cnodes due to a lock event, we must - * take the truncate lock AND the cnode lock. By taking - * the truncate lock here, we force (nearly) all pending IOs - * to drain before we can acquire the truncate lock. All HFS cluster - * io calls except for swapfile IO need to acquire the truncate lock - * prior to calling into the cluster layer. 
- */ - hfs_lock_truncate (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); - took_truncate_lock = 1; - - hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); - - entry = cp->c_cpentry; - if (!entry) { - /* unprotected vnode: not a regular file */ - goto out; - } - - action = (unsigned long) arg; - switch (action) { - case CP_LOCKED_STATE: { - vfs_context_t ctx; - if (CP_CLASS(entry->cp_pclass) != PROTECTION_CLASS_A || - vnode_isdir(vp)) { - /* - * There is no change at lock for other classes than A. - * B is kept in memory for writing, and class F (for VM) does - * not have a wrapped key, so there is no work needed for - * wrapping/unwrapping. - * - * Note that 'class F' is relevant here because if - * hfs_vnop_strategy does not take the cnode lock - * to protect the cp blob across IO operations, we rely - * implicitly on the truncate lock to be held when doing IO. - * The only case where the truncate lock is not held is during - * swapfile IO because HFS just funnels the VNOP_PAGEOUT - * directly to cluster_pageout. - */ - goto out; - } - - /* Before doing anything else, zero-fill sparse ranges as needed */ - ctx = vfs_context_current(); - (void) hfs_filedone (vp, ctx, 0); - - /* first, sync back dirty pages */ - hfs_unlock (cp); - ubc_msync (vp, 0, ubc_getsize(vp), NULL, UBC_PUSHALL | UBC_INVALIDATE | UBC_SYNC); - hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); - - /* flush keys: - * There was a concern here(9206856) about flushing keys before nand layer is done using them. - * But since we are using ubc_msync with UBC_SYNC, it blocks until all IO is completed. - * Once IOFS caches or is done with these keys, it calls the completion routine in IOSF. - * Which in turn calls buf_biodone() and eventually unblocks ubc_msync() - * Also verified that the cached data in IOFS is overwritten by other data, and there - * is no key leakage in that layer. - */ - - cp_flush_cached_keys(entry); - - /* some write may have arrived in the mean time. 
dump those pages */ - hfs_unlock(cp); - locked = 0; - - ubc_msync (vp, 0, ubc_getsize(vp), NULL, UBC_INVALIDATE | UBC_SYNC); - break; - } - case CP_UNLOCKED_STATE: { - /* no-op */ - break; - } - default: - panic("Content Protection: unknown lock action %lu\n", action); - } - -out: - if (locked) { - hfs_unlock(cp); - } - - if (took_truncate_lock) { - hfs_unlock_truncate (cp, HFS_LOCK_DEFAULT); - } - - vnode_put (vp); - return error; -} - - -/* - * cp_rewrap: - * - * Generate a new wrapped key based on the existing cache key. - */ - -int -cp_rewrap(struct cnode *cp, __unused hfsmount_t *hfsmp, - cp_key_class_t *newclass, cp_key_pair_t *cpkp, const void *old_holder, - cp_new_alloc_fn alloc_fn, void **pholder) -{ - struct cprotect *entry = cp->c_cpentry; - - uint8_t new_persistent_key[CP_MAX_WRAPPEDKEYSIZE]; - size_t keylen = CP_MAX_WRAPPEDKEYSIZE; - int error = 0; - const cp_key_class_t key_class = CP_CLASS(*newclass); - - /* Structures passed between HFS and AKS */ - cp_cred_s access_in; - cp_wrapped_key_s wrapped_key_in; - cp_wrapped_key_s wrapped_key_out; - - /* - * PROTECTION_CLASS_F is in-use by VM swapfile; it represents a transient - * key that is only good as long as the file is open. There is no - * wrapped key, so there isn't anything to wrap. - */ - if (key_class == PROTECTION_CLASS_F) { - return EINVAL; - } - - cp_init_access(&access_in, cp); - - bzero(&wrapped_key_in, sizeof(wrapped_key_in)); - wrapped_key_in.key = cpkp_pers_key(cpkp); - wrapped_key_in.key_len = cpkp_pers_key_len(cpkp); - /* Use the persistent class when talking to AKS */ - wrapped_key_in.dp_class = entry->cp_pclass; - - bzero(&wrapped_key_out, sizeof(wrapped_key_out)); - wrapped_key_out.key = new_persistent_key; - wrapped_key_out.key_len = keylen; - - /* - * inode is passed here to find the backup bag wrapped blob - * from userspace. This lookup will occur shortly after creation - * and only if the file still exists. Beyond this lookup the - * inode is not used. 
Technically there is a race, we practically - * don't lose. - */ - error = g_cp_wrap_func.rewrapper(&access_in, - key_class, /* new class */ - &wrapped_key_in, - &wrapped_key_out); - - keylen = wrapped_key_out.key_len; - - if (error == 0) { - /* - * Verify that AKS returned to us a wrapped key of the - * target class requested. - */ - /* Get the effective class here */ - cp_key_class_t effective = CP_CLASS(wrapped_key_out.dp_class); - if (effective != key_class) { - /* - * Fail the operation if defaults or some other enforcement - * dictated that the class be wrapped differently. - */ - - /* TODO: Invalidate the key when 12170074 unblocked */ - return EPERM; - } - - /* Allocate a new cpentry */ - cp_key_pair_t *new_cpkp; - *pholder = alloc_fn(old_holder, keylen, CP_MAX_CACHEBUFLEN, &new_cpkp); - - /* copy the new key into the entry */ - cpkp_set_pers_key_len(new_cpkp, keylen); - memcpy(cpkp_pers_key(new_cpkp), new_persistent_key, keylen); - - /* Actually record/store what AKS reported back, not the effective class stored in newclass */ - *newclass = wrapped_key_out.dp_class; - } - else { - error = EPERM; - } - - return error; -} - -static int cpkp_unwrap(cnode_t *cp, cp_key_class_t key_class, cp_key_pair_t *cpkp) -{ - int error = 0; - uint8_t iv_key[CP_IV_KEYSIZE]; - cpx_t cpx = cpkp_cpx(cpkp); - - /* Structures passed between HFS and AKS */ - cp_cred_s access_in; - cp_wrapped_key_s wrapped_key_in; - cp_raw_key_s key_out; - - cp_init_access(&access_in, cp); - - bzero(&wrapped_key_in, sizeof(wrapped_key_in)); - wrapped_key_in.key = cpkp_pers_key(cpkp); - wrapped_key_in.key_len = cpkp_max_pers_key_len(cpkp); - /* Use the persistent class when talking to AKS */ - wrapped_key_in.dp_class = key_class; - - bzero(&key_out, sizeof(key_out)); - key_out.iv_key = iv_key; - key_out.key = cpx_key(cpx); - /* - * The unwrapper should validate/set the key length for - * the IV key length and the cache key length, however we need - * to supply the correct buffer length so that AKS 
knows how - * many bytes it has to work with. - */ - key_out.iv_key_len = CP_IV_KEYSIZE; - key_out.key_len = cpx_max_key_len(cpx); - - error = g_cp_wrap_func.unwrapper(&access_in, &wrapped_key_in, &key_out); - if (!error) { - if (key_out.key_len == 0 || key_out.key_len > CP_MAX_CACHEBUFLEN) { - panic ("cp_unwrap: invalid key length! (%ul)\n", key_out.key_len); - } - - if (key_out.iv_key_len != CP_IV_KEYSIZE) - panic ("cp_unwrap: invalid iv key length! (%ul)\n", key_out.iv_key_len); - - cpx_set_key_len(cpx, key_out.key_len); - - cpx_set_aes_iv_key(cpx, iv_key); - cpx_set_is_sep_wrapped_key(cpx, ISSET(key_out.flags, CP_RAW_KEY_WRAPPEDKEY)); - } else { - error = EPERM; - } - - return error; -} - -static int -cp_unwrap(__unused struct hfsmount *hfsmp, struct cprotect *entry, struct cnode *cp) -{ - /* - * PROTECTION_CLASS_F is in-use by VM swapfile; it represents a transient - * key that is only good as long as the file is open. There is no - * wrapped key, so there isn't anything to unwrap. - */ - if (CP_CLASS(entry->cp_pclass) == PROTECTION_CLASS_F) { - return EPERM; - } - - int error = cpkp_unwrap(cp, entry->cp_pclass, &entry->cp_keys); - - - return error; -} - -/* - * cp_generate_keys - * - * Take a cnode that has already been initialized and establish persistent and - * cache keys for it at this time. Note that at the time this is called, the - * directory entry has already been created and we are holding the cnode lock - * on 'cp'. - * - */ -int cp_generate_keys (struct hfsmount *hfsmp, struct cnode *cp, cp_key_class_t targetclass, - uint32_t keyflags, struct cprotect **newentry) -{ - - int error = 0; - struct cprotect *newcp = NULL; - *newentry = NULL; - - /* Target class must be an effective class only */ - targetclass = CP_CLASS(targetclass); - - /* Validate that it has a cprotect already */ - if (cp->c_cpentry == NULL) { - /* We can't do anything if it shouldn't be protected. 
*/ - return 0; - } - - /* Asserts for the underlying cprotect */ - if (cp->c_cpentry->cp_flags & CP_NO_XATTR) { - /* should already have an xattr by this point. */ - error = EINVAL; - goto out; - } - - if (S_ISREG(cp->c_mode)) { - if (!cp_needs_pers_key(cp->c_cpentry)) { - error = EINVAL; - goto out; - } - } - - cp_key_revision_t key_revision = cp_initial_key_revision(hfsmp); - - error = cp_new (&targetclass, hfsmp, cp, cp->c_mode, keyflags, key_revision, - (cp_new_alloc_fn)cp_entry_alloc, (void **)&newcp); - if (error) { - /* - * Key generation failed. This is not necessarily fatal - * since the device could have transitioned into the lock - * state before we called this. - */ - error = EPERM; - goto out; - } - - newcp->cp_pclass = targetclass; - newcp->cp_key_os_version = cp_os_version(); - newcp->cp_key_revision = key_revision; - - /* - * If we got here, then we have a new cprotect. - * Attempt to write the new one out. - */ - error = cp_setxattr (cp, newcp, hfsmp, cp->c_fileid, XATTR_REPLACE); - - if (error) { - /* Tear down the new cprotect; Tell MKB that it's invalid. Bail out */ - /* TODO: rdar://12170074 needs to be fixed before we can tell MKB */ - if (newcp) { - cp_entry_destroy(hfsmp, newcp); - } - goto out; - } - - /* - * If we get here then we can assert that: - * 1) generated wrapped/unwrapped keys. - * 2) wrote the new keys to disk. - * 3) cprotect is ready to go. - */ - - *newentry = newcp; - -out: - return error; - -} - -void cp_replace_entry (hfsmount_t *hfsmp, struct cnode *cp, struct cprotect *newentry) -{ - if (cp->c_cpentry) { - - cp_entry_destroy (hfsmp, cp->c_cpentry); - } - cp->c_cpentry = newentry; - newentry->cp_backing_cnode = cp; - - return; -} - - -/* - * cp_new - * - * Given a double-pointer to a cprotect, generate keys (either in-kernel or from keystore), - * allocate a cprotect, and vend it back to the caller. 
- * - * Additionally, decide if keys are even needed -- directories get cprotect data structures - * but they do not have keys. - * - */ - -int -cp_new(cp_key_class_t *newclass_eff, __unused struct hfsmount *hfsmp, struct cnode *cp, - mode_t cmode, int32_t keyflags, cp_key_revision_t key_revision, - cp_new_alloc_fn alloc_fn, void **pholder) -{ - int error = 0; - uint8_t new_key[CP_MAX_CACHEBUFLEN]; - size_t new_key_len = CP_MAX_CACHEBUFLEN; /* AKS tell us the proper key length, how much of this is used */ - uint8_t new_persistent_key[CP_MAX_WRAPPEDKEYSIZE]; - size_t new_persistent_len = CP_MAX_WRAPPEDKEYSIZE; - uint8_t iv_key[CP_IV_KEYSIZE]; - size_t iv_key_len = CP_IV_KEYSIZE; - int iswrapped = 0; - cp_key_class_t key_class = CP_CLASS(*newclass_eff); - - /* Structures passed between HFS and AKS */ - cp_cred_s access_in; - cp_wrapped_key_s wrapped_key_out; - cp_raw_key_s key_out; - - if (are_wraps_initialized == false) { - printf("hfs: cp_new: wrap/gen functions not yet set\n"); - return ENXIO; - } - - /* Sanity check that it's a file or directory here */ - if (!(S_ISREG(cmode)) && !(S_ISDIR(cmode))) { - return EPERM; - } - - /* - * Step 1: Generate Keys if needed. - * - * For class F files, the kernel provides the key. - * PROTECTION_CLASS_F is in-use by VM swapfile; it represents a transient - * key that is only good as long as the file is open. There is no - * wrapped key, so there isn't anything to wrap. - * - * For class A->D files, the key store provides the key - * - * For Directories, we only give them a class ; no keys. 
- */ - if (S_ISDIR (cmode)) { - /* Directories */ - new_persistent_len = 0; - new_key_len = 0; - - error = 0; - } - else { - /* Must be a file */ - if (key_class == PROTECTION_CLASS_F) { - /* class F files are not wrapped; they can still use the max key size */ - new_key_len = CP_MAX_KEYSIZE; - read_random (&new_key[0], new_key_len); - new_persistent_len = 0; - - error = 0; - } - else { - /* - * The keystore is provided the file ID so that it can associate - * the wrapped backup blob with this key from userspace. This - * lookup occurs after successful file creation. Beyond this, the - * file ID is not used. Note that there is a potential race here if - * the file ID is re-used. - */ - cp_init_access(&access_in, cp); - - bzero(&key_out, sizeof(key_out)); - key_out.key = new_key; - key_out.iv_key = iv_key; - /* - * AKS will override our key length fields, but we need to supply - * the length of the buffer in those length fields so that - * AKS knows hoa many bytes it has to work with. - */ - key_out.key_len = new_key_len; - key_out.iv_key_len = iv_key_len; - - bzero(&wrapped_key_out, sizeof(wrapped_key_out)); - wrapped_key_out.key = new_persistent_key; - wrapped_key_out.key_len = new_persistent_len; - - access_in.key_revision = key_revision; - - error = g_cp_wrap_func.new_key(&access_in, - key_class, - &key_out, - &wrapped_key_out); - - if (error) { - /* keybag returned failure */ - error = EPERM; - goto cpnew_fail; - } - - /* Now sanity-check the output from new_key */ - if (key_out.key_len == 0 || key_out.key_len > CP_MAX_CACHEBUFLEN) { - panic ("cp_new: invalid key length! (%ul) \n", key_out.key_len); - } - - if (key_out.iv_key_len != CP_IV_KEYSIZE) { - panic ("cp_new: invalid iv key length! (%ul) \n", key_out.iv_key_len); - } - - /* - * AKS is allowed to override our preferences and wrap with a - * different class key for policy reasons. If we were told that - * any class other than the one specified is unacceptable then error out - * if that occurred. 
Check that the effective class returned by - * AKS is the same as our effective new class - */ - if (CP_CLASS(wrapped_key_out.dp_class) != key_class) { - if (!ISSET(keyflags, CP_KEYWRAP_DIFFCLASS)) { - error = EPERM; - /* TODO: When 12170074 fixed, release/invalidate the key! */ - goto cpnew_fail; - } - } - - *newclass_eff = wrapped_key_out.dp_class; - new_key_len = key_out.key_len; - iv_key_len = key_out.iv_key_len; - new_persistent_len = wrapped_key_out.key_len; - - /* Is the key a SEP wrapped key? */ - if (key_out.flags & CP_RAW_KEY_WRAPPEDKEY) { - iswrapped = 1; - } - } - } - - /* - * Step 2: allocate cprotect and initialize it. - */ - - cp_key_pair_t *cpkp; - *pholder = alloc_fn(NULL, new_persistent_len, new_key_len, &cpkp); - if (*pholder == NULL) { - return ENOMEM; - } - - /* Copy the cache key & IV keys into place if needed. */ - if (new_key_len > 0) { - cpx_t cpx = cpkp_cpx(cpkp); - - cpx_set_key_len(cpx, new_key_len); - memcpy(cpx_key(cpx), new_key, new_key_len); - - /* Initialize the IV key */ - if (key_class != PROTECTION_CLASS_F) - cpx_set_aes_iv_key(cpx, iv_key); - - cpx_set_is_sep_wrapped_key(cpx, iswrapped); - } - if (new_persistent_len > 0) { - cpkp_set_pers_key_len(cpkp, new_persistent_len); - memcpy(cpkp_pers_key(cpkp), new_persistent_key, new_persistent_len); - } - -cpnew_fail: - -#if HFS_TMPDBG -#if !SECURE_KERNEL - if ((hfsmp->hfs_cp_verbose) && (error == EPERM)) { - /* Only introspect the data fork */ - cp_log_eperm (cp->c_vp, *newclass_eff, true); - } -#endif -#endif - - return error; -} - -/* Initialize the cp_cred_t structure passed to AKS */ -static void cp_init_access(cp_cred_t access, struct cnode *cp) -{ - vfs_context_t context = vfs_context_current(); - kauth_cred_t cred = vfs_context_ucred(context); - proc_t proc = vfs_context_proc(context); - - bzero(access, sizeof(*access)); - - /* Note: HFS uses 32-bit fileID, even though inode is a 64-bit value */ - access->inode = cp->c_fileid; - access->pid = proc_pid(proc); - access->uid = 
kauth_cred_getuid(cred); - - if (cp->c_cpentry) - access->key_revision = cp->c_cpentry->cp_key_revision; - - return; -} - -/* - * Parses versions of the form 12A316, i.e. and - * returns a uint32_t in the form 0xaabbcccc where aa = , - * bb = , cccc = . - */ -static cp_key_os_version_t parse_os_version(void) -{ - const char *p = osversion; - - int a = 0; - while (*p >= '0' && *p <= '9') { - a = a * 10 + *p - '0'; - ++p; - } - - if (!a) - return 0; - - int b = *p++; - if (!b) - return 0; - - int c = 0; - while (*p >= '0' && *p <= '9') { - c = c * 10 + *p - '0'; - ++p; - } - - if (!c) - return 0; - - return (a & 0xff) << 24 | b << 16 | (c & 0xffff); -} - -cp_key_os_version_t cp_os_version(void) -{ - static cp_key_os_version_t cp_os_version; - - if (cp_os_version) - return cp_os_version; - - if (!osversion[0]) - return 0; - - cp_os_version = parse_os_version(); - if (!cp_os_version) { - printf("cp_os_version: unable to parse osversion `%s'\n", osversion); - cp_os_version = 1; - } - - return cp_os_version; -} - - -errno_t cp_handle_strategy(buf_t bp) -{ - vnode_t vp = buf_vnode(bp); - cnode_t *cp = NULL; - - if (bufattr_rawencrypted(buf_attr(bp)) - || !(cp = cp_get_protected_cnode(vp)) - || !cp->c_cpentry) { - // Nothing to do - return 0; - } - - /* - * For filesystem resize, we may not have access to the underlying - * file's cache key for whatever reason (device may be locked). - * However, we do not need it since we are going to use the - * temporary HFS-wide resize key which is generated once we start - * relocating file content. If this file's I/O should be done - * using the resize key, it will have been supplied already, so do - * not attach the file's cp blob to the buffer. 
- */ - if (ISSET(cp->c_cpentry->cp_flags, CP_RELOCATION_INFLIGHT)) - return 0; - - { - // Fast path - cpx_t cpx = cpkp_cpx(&cp->c_cpentry->cp_keys); - - if (cpx_has_key(cpx)) { - bufattr_setcpx(buf_attr(bp), cpx); - return 0; - } - } - - /* - * We rely mostly (see note below) upon the truncate lock to - * protect the CP cache key from getting tossed prior to our IO - * finishing here. Nearly all cluster io calls to manipulate file - * payload from HFS take the truncate lock before calling into the - * cluster layer to ensure the file size does not change, or that - * they have exclusive right to change the EOF of the file. That - * same guarantee protects us here since the code that deals with - * CP lock events must now take the truncate lock before doing - * anything. - * - * If you want to change content protection structures, then the - * truncate lock is not sufficient; you must take the truncate - * lock and then wait for outstanding writes to complete. This is - * necessary because asynchronous I/O only holds the truncate lock - * whilst I/O is being queued. - * - * One exception should be the VM swapfile IO, because HFS will - * funnel the VNOP_PAGEOUT directly into a cluster_pageout call - * for the swapfile code only without holding the truncate lock. - * This is because individual swapfiles are maintained at - * fixed-length sizes by the VM code. In non-swapfile IO we use - * PAGEOUT_V2 semantics which allow us to create our own UPL and - * thus take the truncate lock before calling into the cluster - * layer. In that case, however, we are not concerned with the CP - * blob being wiped out in the middle of the IO because there - * isn't anything to toss; the VM swapfile key stays in-core as - * long as the file is open. 
- */ - - off_rsrc_t off_rsrc = off_rsrc_make(buf_lblkno(bp) * GetLogicalBlockSize(vp), - VNODE_IS_RSRC(vp)); - cp_io_params_t io_params; - - - /* - * We want to take the cnode lock here and because the vnode write - * count is a pseudo-lock, we need to do something to preserve - * lock ordering; the cnode lock comes before the write count. - * Ideally, the write count would be incremented after the - * strategy routine returns, but that becomes complicated if the - * strategy routine where to call buf_iodone before returning. - * For now, we drop the write count here and then pick it up again - * later. - */ - if (!ISSET(buf_flags(bp), B_READ) && !ISSET(buf_flags(bp), B_RAW)) - vnode_writedone(vp); - - hfs_lock_always(cp, HFS_SHARED_LOCK); - cp_io_params(VTOHFS(vp), cp->c_cpentry, off_rsrc, - ISSET(buf_flags(bp), B_READ) ? VNODE_READ : VNODE_WRITE, - &io_params); - hfs_unlock(cp); - - /* - * Last chance: If this data protected I/O does not have unwrapped - * keys present, then try to get them. We already know that it - * should, by this point. - */ - if (!cpx_has_key(io_params.cpx)) { - int io_op = ( (buf_flags(bp) & B_READ) ? CP_READ_ACCESS : CP_WRITE_ACCESS); - errno_t error = cp_handle_vnop(vp, io_op, 0); - if (error) { - /* - * We have to be careful here. By this point in the I/O - * path, VM or the cluster engine has prepared a buf_t - * with the proper file offsets and all the rest, so - * simply erroring out will result in us leaking this - * particular buf_t. We need to properly decorate the - * buf_t just as buf_strategy would so as to make it - * appear that the I/O errored out with the particular - * error code. - */ - if (!ISSET(buf_flags(bp), B_READ) && !ISSET(buf_flags(bp), B_RAW)) - vnode_startwrite(vp); - buf_seterror (bp, error); - buf_biodone(bp); - return error; - } - - hfs_lock_always(cp, HFS_SHARED_LOCK); - cp_io_params(VTOHFS(vp), cp->c_cpentry, off_rsrc, - ISSET(buf_flags(bp), B_READ) ? 
VNODE_READ : VNODE_WRITE, - &io_params); - hfs_unlock(cp); - } - - assert(buf_count(bp) <= io_params.max_len); - bufattr_setcpx(buf_attr(bp), io_params.cpx); - - if (!ISSET(buf_flags(bp), B_READ) && !ISSET(buf_flags(bp), B_RAW)) - vnode_startwrite(vp); - - return 0; -} - -#else // !CONFIG_PROTECT - -#include -#include -#include - -int cp_key_store_action(int action __unused) -{ - return ENOTSUP; -} - -int cp_register_wraps(cp_wrap_func_t key_store_func __unused) -{ - return ENOTSUP; -} - -size_t cpx_size(__unused size_t key_size) -{ - return 0; -} - -cpx_t cpx_alloc(__unused size_t key_size) -{ - return NULL; -} - -void cpx_free(__unused cpx_t cpx) -{ -} - -bool cpx_is_sep_wrapped_key(__unused const struct cpx *cpx) -{ - return false; -} - -void cpx_set_is_sep_wrapped_key(__unused struct cpx *cpx, __unused bool v) -{ -} - -bool cpx_use_offset_for_iv(__unused const struct cpx *cpx) -{ - return false; -} - -void cpx_set_use_offset_for_iv(__unused struct cpx *cpx, __unused bool v) -{ -} - -uint16_t cpx_key_len(__unused const struct cpx *cpx) -{ - return 0; -} - -void cpx_set_key_len(__unused struct cpx *cpx, __unused uint16_t key_len) -{ -} - -void *cpx_key(__unused const struct cpx *cpx) -{ - return NULL; -} - -aes_encrypt_ctx *cpx_iv_aes_ctx(__unused cpx_t cpx) -{ - return NULL; -} - -#endif /* CONFIG_PROTECT */ diff --git a/bsd/hfs/hfs_cprotect.h b/bsd/hfs/hfs_cprotect.h deleted file mode 100644 index b25ecc70c..000000000 --- a/bsd/hfs/hfs_cprotect.h +++ /dev/null @@ -1,423 +0,0 @@ -/* - * Copyright (c) 2009-2015 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#ifndef HFS_CPROTECT_H_ -#define HFS_CPROTECT_H_ - -#if KERNEL_PRIVATE - -#include - -#include -#include -#include -#include -#include - -#include "hfs.h" -#include "hfs_fsctl.h" - -__BEGIN_DECLS - -#define CP_IV_KEYSIZE 16 /* 16x8 = 128 */ -#define CP_MAX_KEYSIZE 32 /* 8x4 = 32, 32x8 = 256 */ -#define CP_MAX_CACHEBUFLEN 64 /* Maximum size of cp cache buffer/array */ - -#define CP_INITIAL_WRAPPEDKEYSIZE 40 -#define CP_V2_WRAPPEDKEYSIZE 40 /* Size of the wrapped key in a v2 EA */ -#define CP_V4_RESERVEDBYTES 16 /* Number of reserved bytes in EA still present */ - -#define CP_LOCKED_KEYCHAIN 0 -#define CP_UNLOCKED_KEYCHAIN 1 - -#define CONTENT_PROTECTION_XATTR_NAME "com.apple.system.cprotect" -#define CONTENT_PROTECTION_XATTR_NAME_CHARS \ - { 'c', 'o', 'm', '.', 'a', 'p', 'p', 'l', 'e', \ - '.', 's', 'y', 's', 't', 'e', 'm', \ - '.', 'c', 'p', 'r', 'o', 't', 'e', 'c', 't' } -#define CP_CURRENT_VERS CP_VERS_5 -#define CP_VERS_5 5 // iOS 8.1 -#define CP_VERS_4 4 // iOS 5 -#define CP_VERS_2 2 // iOS 4 
-#define CP_MINOR_VERS 0 - -/* the class occupies the lowest 5 bits, so there are 32 values (0-31) */ -#define CP_EFFECTIVE_CLASSMASK 0x0000001f - -typedef uint32_t cp_key_class_t; -typedef uint32_t cp_key_os_version_t; - -/* macros for quick access/typing to mask out the classmask */ -#define CP_CLASS(x) ((cp_key_class_t)(CP_EFFECTIVE_CLASSMASK & (x))) - -#define CP_CRYPTO_G1 0x00000020 - -typedef struct cp_xattr *cp_xattr_t; -typedef struct cnode * cnode_ptr_t; -//forward declare the struct. -struct hfsmount; - -/* - * Flags for Key Generation Behavior - * - * These are passed to cp_generate_keys() and cp_new() in the - * flags arguments - */ -#define CP_KEYWRAP_DIFFCLASS 0x00000001 /* wrapping with a different class bag is OK */ - -/* - * off_rsrc_t: this structure represents an offset and whether or not it's - * the resource fork. It's done this way so that we can easily do comparisons - * i.e. - * - * { 0, data-fork } < { 100, rsrc-fork } - */ - -enum { - OFF_RSRC_BIT = 0x4000000000000000, -}; - -typedef int64_t off_rsrc_t; - -static inline bool off_rsrc_is_rsrc(off_rsrc_t off_rsrc) -{ - return off_rsrc & OFF_RSRC_BIT; -} - -static inline off_t off_rsrc_get_off(off_rsrc_t off_rsrc) -{ - return off_rsrc & (OFF_RSRC_BIT - 1); -} - -static inline off_rsrc_t off_rsrc_make(off_t offset, bool is_rsrc) -{ - return offset | (is_rsrc ? OFF_RSRC_BIT : 0); -} - -// -- struct cpx -- - -/* - * This structure contains the unwrapped key and is passed to the lower layers. - * It is private so users must use the accessors declared in sys/cprotect.h - * to read/write it. 
- */ - -// cpx_flags -typedef uint32_t cpx_flags_t; -enum { - CPX_SEP_WRAPPEDKEY = 0x01, - CPX_IV_AES_CTX_INITIALIZED = 0x02, - CPX_USE_OFFSET_FOR_IV = 0x04, - - // Using AES IV context generated from key - CPX_IV_AES_CTX_HFS = 0x08, -}; - -struct cpx { -#if DEBUG - uint32_t cpx_magic1; -#endif - cpx_flags_t cpx_flags; - uint16_t cpx_max_key_len; - uint16_t cpx_key_len; - aes_encrypt_ctx cpx_iv_aes_ctx; // Context used for generating the IV - uint8_t cpx_cached_key[]; -} __attribute__((packed)); - -// -- struct cp_key_pair -- - -/* - * This structure maintains the pair of keys; the persistent, wrapped key that - * is written to disk, and the unwrapped key (cpx_t) that we pass to lower - * layers. - */ - -typedef struct cp_key_pair { - uint16_t cpkp_max_pers_key_len; - uint16_t cpkp_pers_key_len; - struct cpx cpkp_cpx; - - // cpkp_cpx is variable length so the location of the persistent key varies - // uint8_t cpkp_persistent_key[]; -} cp_key_pair_t; - -// -- struct cprotect -- - -/* - * Runtime-only structure containing the content protection status for - * the given file. This is referenced by the cnode. It has the - * variable length key pair at the end. - */ - -typedef uint32_t cp_flags_t; -enum { - CP_NO_XATTR = 0x01, /* Key info has not been saved as EA to the FS */ - CP_RELOCATION_INFLIGHT = 0x02, /* File with offset IVs is in the process of being relocated. */ - - CP_HAS_A_KEY = 0x08, /* File has a non-zero length key */ -}; - -struct cprotect { -#if DEBUG - uint32_t cp_magic1; -#endif - cp_flags_t cp_flags; - cp_key_class_t cp_pclass; /* persistent class stored on-disk */ - void* cp_backing_cnode; - cp_key_os_version_t cp_key_os_version; - cp_key_revision_t cp_key_revision; - uint16_t cp_raw_open_count; - cp_key_pair_t cp_keys; // Variable length -}; - -// -- On-Disk Structures -- - -typedef uint32_t cp_xattr_flags_t; -enum { - /* - * Be careful about using flags 0x02 to 0x20. 
Older code used to write - * flags that were used for in-memory purposes to disk and therefore - * they might be used in V4 structures. Here's what they were: - * - * CP_KEY_FLUSHED 0x02 Should never have made it to disk - * CP_NO_XATTR 0x04 Should never have made it to disk - * CP_OFF_IV_ENABLED 0x08 Probably made it to disk - * CP_RELOCATION_INFLIGHT 0x10 Should never have made it to disk - * CP_SEP_WRAPPEDKEY 0x20 Probably made it to disk - * - */ - - CP_XAF_NEEDS_KEYS = 0x0001, /* V4 only: file needs persistent keys */ - -}; - -/* - * V2 structure written as the per-file EA payload - * All on-disk multi-byte fields for the CP XATTR must be stored - * little-endian on-disk. This means they must be endian swapped to - * L.E on getxattr() and converted to LE on setxattr(). - * - * This structure is a fixed length and is tightly packed. - * 56 bytes total. - */ -struct cp_xattr_v2 { - u_int16_t xattr_major_version; - u_int16_t xattr_minor_version; - cp_xattr_flags_t flags; - u_int32_t persistent_class; - u_int32_t key_size; - uint8_t persistent_key[CP_V2_WRAPPEDKEYSIZE]; -} __attribute__((aligned(2), packed)); - - -/* - * V4 Content Protection EA On-Disk Layout. - * - * This structure must be tightly packed, but the *size can vary* - * depending on the length of the key. At MOST, the key length will be - * CP_MAX_WRAPPEDKEYSIZE, but the length is defined by the key_size field. - * - * Either way, the packing must be applied to ensure that the key data is - * retrievable in the right location relative to the start of the struct. - * - * Fully packed, this structure can range from : - * MIN: 36 bytes (no key -- used with directories) - * MAX: 164 bytes (with 128 byte key) - * - * During runtime we always allocate with the full 128 byte key, but only - * use as much of the key buffer as needed. It must be tightly packed, though. 
- */ - -struct cp_xattr_v4 { - u_int16_t xattr_major_version; - u_int16_t xattr_minor_version; - cp_xattr_flags_t flags; - cp_key_class_t persistent_class; - u_int32_t key_size; - // This field will be zero on older systems - cp_key_os_version_t key_os_version; - /* CP V4 Reserved Bytes == 16 */ - u_int8_t reserved[CP_V4_RESERVEDBYTES]; - /* All above fields are fixed regardless of key length (36 bytes) */ - /* Max Wrapped Size == 128 */ - uint8_t persistent_key[CP_MAX_WRAPPEDKEYSIZE]; -} __attribute__((aligned(2), packed)); - -// -- Version 5 -- - - -struct cp_xattr_v5 { - uint16_t xattr_major_version; - uint16_t xattr_minor_version; - cp_xattr_flags_t flags; - cp_key_class_t persistent_class; - cp_key_os_version_t key_os_version; - cp_key_revision_t key_revision; - uint16_t key_len; - - // 20 bytes to here - - // Variable length from here - uint8_t persistent_key[CP_MAX_WRAPPEDKEYSIZE]; - - - // Wouldn't be necessary if xattr routines returned just what we ask for - uint8_t spare[512]; -} __attribute__((aligned(2), packed)); - -enum { - CP_XATTR_MIN_LEN = 20, // Minimum length for all versions -}; - -/* - * The Root Directory's EA (fileid 1) is special; it defines information about - * what capabilities the filesystem is using. - * - * The data is still stored little endian. 
- */ -struct cp_root_xattr { - u_int16_t major_version; - u_int16_t minor_version; - u_int64_t flags; -} __attribute__((aligned(2), packed)); - -enum { - CP_ROOT_XATTR_MIN_LEN = 12, -}; - - -// -- Function Prototypes -- - -int cp_entry_init(cnode_ptr_t, struct mount *); -int cpx_gentempkeys(cpx_t *pcpx, struct hfsmount *hfsmp); -void cp_entry_destroy(struct hfsmount *hfsmp, struct cprotect *entry_ptr); -void cp_replace_entry (struct hfsmount *hfsmp, struct cnode *cp, struct cprotect *newentry); -cnode_ptr_t cp_get_protected_cnode(vnode_t); -int cp_fs_protected (mount_t); -int cp_getrootxattr (struct hfsmount *hfsmp, struct cp_root_xattr *outxattr); -int cp_setrootxattr (struct hfsmount *hfsmp, struct cp_root_xattr *newxattr); -int cp_generate_keys (struct hfsmount *hfsmp, struct cnode *cp, - cp_key_class_t targetclass, uint32_t flags, - struct cprotect **newentry); -int cp_setup_newentry (struct hfsmount *hfsmp, struct cnode *dcp, - cp_key_class_t suppliedclass, mode_t cmode, - struct cprotect **tmpentry); -int cp_is_valid_class (int isdir, int32_t protectionclass); -int cp_set_trimmed(struct hfsmount*); -int cp_set_rewrapped(struct hfsmount *); -int cp_flop_generation (struct hfsmount*); -bool cp_is_supported_version(uint16_t version); - - -typedef struct cp_io_params { - // The key to use - cpx_t cpx; - - /* - * The physical offset for this I/O or -1 if unknown (i.e. caller must - * do a regular look up). 
- */ - off_t phys_offset; - - // The maximum length allowed for this I/O - off_t max_len; -} cp_io_params_t; - -// Return the I/O parameters for this I/O -void cp_io_params(struct hfsmount *hfsmp, cprotect_t cpr, off_rsrc_t off_rsrc, - int direction, cp_io_params_t *io_params); - -int cp_setxattr(struct cnode *cp, struct cprotect *entry, struct hfsmount *hfsmp, - uint32_t fileid, int xattr_opts); - -typedef void * (* cp_new_alloc_fn)(const void *old, uint16_t pers_key_len, - uint16_t cached_key_len, - cp_key_pair_t **pcpkp); - -int cp_new(cp_key_class_t *newclass_eff, struct hfsmount *hfsmp, - struct cnode *cp, mode_t cmode, int32_t keyflags, - cp_key_revision_t key_revision, - cp_new_alloc_fn alloc_fn, void **pholder); - -int cp_rewrap(struct cnode *cp, __unused struct hfsmount *hfsmp, - cp_key_class_t *newclass, cp_key_pair_t *cpkp, const void *old_holder, - cp_new_alloc_fn alloc_fn, void **pholder); - -cprotect_t cp_entry_alloc(cprotect_t old, uint16_t pers_keylen, - uint16_t cached_key_len, cp_key_pair_t **pcpkp); - -cp_key_os_version_t cp_os_version(void); - -cp_key_revision_t cp_next_key_revision(cp_key_revision_t rev); - -typedef uint32_t cp_getxattr_options_t; -enum { - // Return just basic information (not the key) - CP_GET_XATTR_BASIC_INFO = 1, -}; - -int cp_read_xattr_v5(struct hfsmount *hfsmp, struct cp_xattr_v5 *xattr, - size_t xattr_len, cprotect_t *pcpr, cp_getxattr_options_t options); - - -errno_t cp_handle_strategy(buf_t bp); - -// -- cp_key_pair_t functions -- - -size_t cpkp_size(uint16_t pers_key_len, uint16_t cached_key_len); -size_t cpkp_sizex(const cp_key_pair_t *cpkp); -void cpkp_init(cp_key_pair_t *cpkp, uint16_t max_pers_key_len, - uint16_t max_cached_key_len); -void cpkp_flush(cp_key_pair_t *cpkp); -void cpkp_copy(const cp_key_pair_t *src, cp_key_pair_t *dst); -uint16_t cpkp_max_pers_key_len(const cp_key_pair_t *cpkp); -uint16_t cpkp_pers_key_len(const cp_key_pair_t *cpkp); -bool cpkp_can_copy(const cp_key_pair_t *src, const cp_key_pair_t 
*dst); - -// -- Private cpx functions -- - -void cpx_init(cpx_t, size_t key_len); -bool cpx_has_key(const struct cpx *cpx); -uint16_t cpx_max_key_len(const struct cpx *cpx); -cpx_t cpkp_cpx(const cp_key_pair_t *cpkp); -void cpx_copy(const struct cpx *src, cpx_t dst); - -// -- Helper Functions -- - -static inline int cp_get_crypto_generation (cp_key_class_t protclass) { - if (protclass & CP_CRYPTO_G1) { - return 1; - } - else return 0; -} - -__END_DECLS - -#endif /* KERNEL_PRIVATE */ - -#endif /* !HFS_CPROTECT_H_ */ diff --git a/bsd/hfs/hfs_dbg.h b/bsd/hfs/hfs_dbg.h deleted file mode 100644 index f2c9aea08..000000000 --- a/bsd/hfs/hfs_dbg.h +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright (c) 2000, 2005 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* hfs_dbg.h - * - * (c) 1997 Apple Computer, Inc. All Rights Reserved - * - * hfs_dbg.h -- debugging macros for HFS file system. - * - * HISTORY - * 10-Nov-1998 Pat Dirks Cleaned up definition of DBG_ASSERT to handle embedded '%' correctly. - * 28-Apr-1998 Scott Roberts Reorganized and added HFS_DEBUG_STAGE - * 17-Nov-1997 Pat Dirks Pat Dirks at Apple Computer - * Derived from old hfs version. - */ - -struct componentname; -extern void Debugger(const char *message); - -/* Define the debugging stage... - 4 -> Do all, aggresive, call_kdp - 3 -> debug asserts and debug err, panic instead of call_kdp - 2 -> debug error, no kdb - 1 -> very little, panic only -*/ -#ifndef HFS_DIAGNOSTIC - #define HFS_DIAGNOSTIC 0 -#endif /* HFS_DIAGNOSTIC */ - -#ifndef HFS_DEBUG_STAGE -#if HFS_DIAGNOSTIC - #define HFS_DEBUG_STAGE 4 -#else - #define HFS_DEBUG_STAGE 1 -#endif /* KERNEL */ -#endif /* HFS_DEBUG_STAGE */ - -#ifdef KERNEL - #define PRINTIT kprintf -#else /* KERNEL */ - #define PRINTIT printf -#endif /* KERNEL */ - -#if (HFS_DEBUG_STAGE > 3) -#define DEBUG_BREAK Debugger(""); -#else -#define DEBUG_BREAK -#endif - -#if (HFS_DEBUG_STAGE == 4) - #define DEBUG_BREAK_MSG(PRINTF_ARGS) { PRINTIT PRINTF_ARGS; DEBUG_BREAK }; -#elif (HFS_DEBUG_STAGE == 3) - #define DEBUG_BREAK_MSG(PRINTF_ARGS) { panic PRINTF_ARGS;}; -#else - #define DEBUG_BREAK_MSG(PRINTF_ARGS) { PRINTIT PRINTF_ARGS; }; -#endif - - -#define PRINT_DELAY - -/* - * Debugging macros. 
- */ -#if HFS_DIAGNOSTIC -extern int hfs_dbg_all; -extern int hfs_dbg_err; - -#ifdef KERNEL - #if (HFS_DEBUG_STAGE == 4) - char gDebugAssertStr[255]; - #define DBG_ASSERT(a) { if (!(a)) { \ - snprintf(gDebugAssertStr, sizeof (gDebugAssertStr), "Oops - File "__FILE__", line %d: assertion '%s' failed.\n", __LINE__, #a); \ - Debugger(gDebugAssertStr); } } - #else -#define DBG_ASSERT(a) { if (!(a)) { panic("File "__FILE__", line %d: assertion '%s' failed.\n", __LINE__, #a); } } - #endif /* HFS_DEBUG_STAGE */ -#else - #define DBG_ASSERT(a) assert(a) -#endif /* KERNEL */ - -#define DBG_ERR(x) { \ - if(hfs_dbg_all || hfs_dbg_err) { \ - PRINTIT("%X: ", proc_selfpid()); \ - PRINTIT("HFS ERROR: "); \ - PRINTIT x; \ - PRINT_DELAY; \ - }; \ -} - -#else /* HFS_DIAGNOSTIC */ - -#define DBG_ASSERT(a) -#define DBG_ERR(x) - -#endif /* HFS_DIAGNOSTIC */ - diff --git a/bsd/hfs/hfs_encodinghint.c b/bsd/hfs/hfs_encodinghint.c deleted file mode 100644 index 232944bd9..000000000 --- a/bsd/hfs/hfs_encodinghint.c +++ /dev/null @@ -1,961 +0,0 @@ -/* - * Copyright (c) 2001-2013 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -#if HFS - -#include -#include -#include - - -/* CJK Mac Encoding Bits */ -#define CJK_JAPAN 0x1 -#define CJK_KOREAN 0x2 -#define CJK_CHINESE_TRAD 0x4 -#define CJK_CHINESE_SIMP 0x8 -#define CJK_ALL 0xF - -#define CJK_CHINESE (CJK_CHINESE_TRAD | CJK_CHINESE_SIMP) -#define CJK_KATAKANA (CJK_JAPAN) - - -/* Remember the last unique CJK bit */ -u_int8_t cjk_lastunique = 0; - -/* Encoding bias */ -u_int32_t hfs_encodingbias = 0; -int hfs_islatinbias = 0; - -extern lck_mtx_t encodinglst_mutex; - - -/* Map CJK bits to Mac encoding */ -u_int8_t cjk_encoding[] = { - /* 0000 */ kTextEncodingMacUnicode, - /* 0001 */ kTextEncodingMacJapanese, - /* 0010 */ kTextEncodingMacKorean, - /* 0011 */ kTextEncodingMacJapanese, - /* 0100 */ kTextEncodingMacChineseTrad, - /* 0101 */ kTextEncodingMacJapanese, - /* 0110 */ kTextEncodingMacKorean, - /* 0111 */ kTextEncodingMacJapanese, - /* 1000 */ kTextEncodingMacChineseSimp, - /* 1001 */ kTextEncodingMacJapanese, - /* 1010 */ kTextEncodingMacKorean, - /* 1011 */ kTextEncodingMacJapanese, - /* 1100 */ kTextEncodingMacChineseTrad, - /* 1101 */ kTextEncodingMacJapanese, - /* 1110 */ kTextEncodingMacKorean, - /* 1111 */ kTextEncodingMacJapanese -}; - - -#if CONFIG_HFS_STD - -/* - * CJK Mac Encoding Bitmap - * - * Each entry in the table is 4-bits wide and represents - * every CJK codepoint (0x4E00 to 0x9FAF). 
- */ -static u_int8_t cjk_bitmap[] = { - 0xFF,0xF0,0x00,0xF0,0xFF,0xFF,0xFC,0x4D,0xFD,0x80,0xFF,0x1F,0xFF,0x88,0x88,0x4F, - 0x10,0x08,0x88,0x85,0x08,0xC9,0xF8,0x04,0x5C,0x4F,0x08,0x09,0xFF,0xF8,0x85,0x98, - 0x00,0xF7,0xF0,0xC0,0x8C,0xF0,0xF8,0xFF,0x08,0xCC,0x18,0x1F,0xFF,0x00,0xFC,0xFF, - 0x88,0x01,0x00,0x08,0xC0,0x20,0x20,0x00,0x98,0xF0,0x00,0x22,0x00,0x00,0x00,0x4F, - 0x01,0x47,0x14,0x0F,0x9F,0xF1,0xCF,0x8F,0xF2,0xCF,0xFF,0x00,0x5B,0xF8,0x01,0xD7, - 0xF9,0x0F,0xFF,0x8F,0x8F,0xF0,0xFF,0x0F,0x01,0xD8,0x80,0x07,0x40,0x8F,0x00,0x80, - 0xFF,0xCD,0x8F,0xFD,0xC4,0xFF,0xF0,0x19,0x80,0x80,0xFF,0xFF,0xFF,0x04,0xF4,0xFD, - 0xC0,0xF0,0xFF,0x00,0x4C,0x88,0x18,0x01,0x4F,0xCF,0xC4,0xFF,0x00,0xF0,0xC0,0xC0, - 0xF4,0x04,0x44,0x00,0xF4,0x6F,0xF0,0xFF,0xFF,0x44,0x44,0x80,0xCC,0x89,0x51,0x88, - 0x08,0x0C,0x88,0x88,0x00,0x88,0x44,0xF0,0x0D,0x48,0x0F,0x0F,0x0F,0x4F,0xFF,0x44, - 0x40,0xF0,0x00,0x7F,0x46,0x00,0xF4,0xFF,0xFF,0xD4,0xF4,0xD4,0xFC,0xFF,0xDF,0xCD, - 0x4C,0xC4,0x8C,0xC0,0xF0,0x44,0x0C,0xF4,0x0F,0xF0,0x5C,0x4F,0x44,0xD6,0x4D,0xFE, - 0x64,0xF4,0x4E,0x47,0xCF,0xF2,0xF0,0xF0,0xF4,0x04,0x4C,0xC7,0x07,0xF4,0xF4,0x04, - 0x19,0x80,0x80,0x88,0x88,0x18,0x18,0xFF,0x00,0x44,0xF0,0x47,0x40,0x44,0x00,0xF0, - 0x44,0xF7,0xCF,0x40,0x60,0x4F,0x40,0xCF,0xFD,0x60,0x07,0xF4,0x4D,0x5F,0xFC,0xFC, - 0xF6,0x90,0x11,0x08,0x88,0x08,0x84,0xFF,0xE0,0xF0,0x74,0x45,0x0F,0x0F,0x00,0x0D, - 0x04,0x02,0x50,0x47,0x70,0x70,0xFC,0xD4,0x70,0x4F,0x4D,0x47,0xFC,0x4F,0x0E,0xF6, - 0xF4,0x74,0xD3,0x6F,0xDF,0x7F,0xFF,0x48,0x44,0x40,0x40,0x41,0x10,0x28,0x0C,0x08, - 0x44,0xF0,0x40,0xF4,0x7F,0x44,0x4C,0xFC,0x41,0x40,0xF0,0x41,0x00,0x4F,0x4F,0x44, - 0x44,0x44,0xF4,0x00,0x44,0x44,0x49,0x44,0x04,0x45,0x77,0xCF,0x03,0x84,0x50,0x88, - 0x0F,0x44,0xF0,0x40,0x08,0x40,0xF4,0x04,0x70,0x04,0x44,0x04,0x77,0x45,0x44,0x04, - 0x00,0xC4,0x80,0x80,0x88,0x00,0x7F,0x44,0x44,0x7F,0x75,0x74,0x04,0xCC,0x40,0x47, - 0x40,0x05,0x74,0x44,0x74,0x47,0x10,0xF4,0x70,0x40,0x74,0x4F,0x00,0x4F,0x40,0x03, - 
0x04,0x50,0x74,0xFC,0x44,0x04,0xFC,0x4D,0x44,0xC0,0xD0,0x04,0x74,0xF0,0x40,0x64, - 0x37,0x45,0x47,0xCE,0x74,0xC4,0x04,0x00,0x44,0x4F,0x55,0x41,0x05,0x05,0x04,0x70, - 0xF4,0x04,0x44,0x04,0x40,0x07,0x40,0x04,0x44,0x47,0x44,0x70,0x44,0x57,0x47,0xD0, - 0xFF,0xF0,0xFF,0x7F,0xFF,0xF0,0xF7,0x03,0x81,0x07,0xCD,0x48,0x40,0x4D,0x4F,0x44, - 0x40,0x4F,0xF0,0x60,0x7F,0xF3,0xFF,0x0F,0xF8,0x80,0xF8,0xFF,0x8F,0x80,0x8F,0x00, - 0x8F,0x09,0x90,0x43,0xD8,0x07,0xF9,0x50,0x51,0x5F,0xF4,0xF9,0x94,0x80,0x08,0x04, - 0x0F,0x0D,0xFD,0x01,0x11,0x97,0x0F,0x80,0xDD,0x99,0x91,0xFF,0x40,0x80,0xFC,0x04, - 0x08,0x00,0x5F,0x8F,0xB4,0xF4,0x7F,0x84,0x84,0x00,0x04,0x41,0x04,0x90,0xF7,0x06, - 0xFD,0x00,0x08,0x11,0x10,0x81,0x90,0x80,0x7F,0xC0,0xD0,0x0F,0xFF,0x8F,0xF8,0x81, - 0xCF,0xF8,0x01,0xFF,0x4F,0x1F,0x84,0x0F,0xF4,0x4C,0x01,0xFC,0x88,0x88,0xF4,0x04, - 0x48,0x00,0x7F,0x10,0xFC,0x87,0x80,0x0F,0x4F,0xD4,0x40,0xFF,0xBF,0xFF,0x80,0x80, - 0xC8,0xF8,0x05,0x74,0x40,0x7F,0xFF,0x14,0x88,0x04,0x4F,0x0F,0x00,0x74,0x6C,0x4D, - 0xE0,0x14,0x91,0x80,0xF0,0x4F,0x44,0xF4,0x11,0x1F,0x75,0x40,0x04,0x44,0xF4,0xF0, - 0xC4,0x7C,0x04,0x70,0x7F,0x44,0x74,0x00,0x78,0xC3,0x01,0x44,0x44,0xF0,0x80,0xF8, - 0x8F,0xF8,0x02,0x04,0xF8,0xFF,0xDD,0x04,0x90,0x88,0x11,0x00,0x10,0x40,0x05,0x8F, - 0x74,0xF4,0x30,0xF0,0xF0,0x80,0x70,0x00,0x08,0x4F,0x70,0x5E,0x7F,0x60,0x70,0xF7, - 0x01,0x77,0x0F,0x15,0x40,0x40,0x00,0x40,0x4C,0x71,0x74,0x40,0x97,0x6F,0x04,0xFF, - 0x18,0x01,0xF0,0x0D,0x4F,0x44,0xF0,0xF4,0x4F,0x04,0xF0,0xFF,0xF0,0x0D,0xD4,0x40, - 0xFF,0xF4,0x00,0x08,0x00,0x0F,0x40,0x78,0x54,0x10,0x04,0x40,0xF1,0x99,0x44,0xFC, - 0xF7,0xF0,0xD6,0xF1,0xFF,0x0F,0x74,0x88,0xF0,0xFF,0x87,0xF8,0x01,0x0F,0x0F,0x8F, - 0xCF,0xC8,0x08,0x8F,0x92,0x80,0x04,0xFD,0xFF,0xD4,0xFB,0xF0,0x4D,0x58,0x24,0xF0, - 0x00,0x0D,0x8F,0x08,0x80,0x84,0x88,0x44,0x00,0x24,0x84,0x45,0x4F,0x0F,0xC4,0xF4, - 0x03,0x88,0xF4,0x4B,0x99,0x00,0x74,0x09,0x01,0x14,0x04,0x09,0x40,0xF0,0x00,0x80, - 0x80,0x79,0x00,0x00,0xFF,0xFF,0xF9,0x01,0x80,0x00,0x0F,0xFF,0x98,0xF0,0x00,0xD0, - 
0x78,0xF7,0xFF,0x0C,0xFD,0xFF,0xFF,0xFD,0xFF,0xFF,0xC0,0x99,0x8F,0xC1,0x8C,0x00, - 0xD0,0xF0,0x0F,0x4C,0xFF,0x5F,0xFF,0xFF,0xFF,0x8C,0x80,0x88,0x44,0xF0,0xF4,0xFC, - 0x8F,0x80,0x44,0xCF,0xFC,0xF4,0xDD,0x8D,0xC4,0x68,0xC8,0x45,0xFF,0xF0,0x5F,0x0F, - 0x4D,0xC7,0x40,0x4D,0x1F,0x8F,0x00,0x45,0x38,0x88,0x8C,0x88,0x88,0x80,0x08,0x10, - 0x44,0x4C,0x4C,0x4C,0x0F,0x43,0x04,0x40,0xF5,0xFC,0xF4,0xDD,0x0C,0xF4,0xFF,0x44, - 0x4F,0x0C,0x0D,0x4F,0x04,0xD0,0x4F,0x9F,0x4F,0x0D,0xC8,0x0C,0x84,0x8C,0x80,0x00, - 0x44,0x85,0x5B,0xCC,0xCF,0xFC,0xCF,0xD4,0xC4,0xF1,0x08,0x44,0x0F,0xC4,0xF5,0xC5, - 0xFF,0x4D,0x4F,0xDC,0xFF,0x00,0x88,0xCC,0x88,0x88,0x88,0x84,0x81,0x08,0x88,0x8C, - 0x74,0x05,0xF4,0xCD,0xDF,0x4C,0xF0,0x0F,0x40,0xCF,0x00,0x40,0x04,0x4F,0xDC,0xC0, - 0xC0,0x40,0x47,0xFF,0xC4,0x44,0x04,0xD6,0xCF,0x04,0x0D,0x41,0x00,0x84,0x02,0x00, - 0x08,0x88,0x08,0xC4,0x00,0x0C,0x4C,0xFD,0xFC,0xD4,0x64,0xC4,0x55,0x40,0x0C,0x8F, - 0xC5,0xC0,0x5F,0x0F,0x84,0x4C,0x41,0x74,0x4C,0x34,0xC0,0x5F,0x00,0x00,0x1D,0x46, - 0xC0,0x74,0xCC,0x8C,0x00,0x0C,0x88,0x08,0x00,0x00,0xC0,0xCC,0x08,0xD0,0x4F,0x4D, - 0xCF,0xDC,0x0F,0xF2,0xFC,0xDF,0x44,0xC4,0xC0,0x44,0x4C,0x00,0xFF,0x07,0xFF,0xD1, - 0x40,0x44,0x44,0xF4,0x35,0x77,0x47,0x07,0xC1,0xC4,0xC0,0x81,0x80,0xC0,0x80,0x48, - 0x04,0x44,0xFD,0x74,0xC4,0x44,0xCC,0x44,0xC4,0xCC,0x4F,0x0C,0x40,0x47,0xCF,0xF0, - 0xC0,0xF4,0xCF,0x4C,0x4C,0x88,0x08,0xC0,0x00,0x8C,0x80,0xD4,0x50,0x04,0xF4,0x4F, - 0xCC,0x04,0x04,0x07,0xFC,0x00,0x4C,0xCC,0x04,0x44,0x47,0x75,0x09,0xD0,0x44,0xC8, - 0x00,0x80,0x08,0xC0,0x50,0x04,0x8C,0x74,0x94,0x4F,0x4F,0x0F,0xC5,0xC4,0x40,0xC4, - 0x44,0x05,0x40,0x04,0x44,0x04,0x8D,0x0D,0x01,0x60,0x08,0xC0,0xCC,0x14,0x08,0x04, - 0x04,0x4C,0x4D,0x04,0xCF,0xFD,0x4D,0x44,0xC4,0x44,0x07,0x4C,0x45,0x81,0x08,0x44, - 0x45,0x44,0xC4,0x5F,0x00,0x01,0x44,0xDC,0x04,0xC0,0x41,0x40,0x44,0x04,0x44,0x00, - 0x01,0x81,0x70,0x44,0x04,0x44,0x46,0x87,0x00,0x44,0x47,0xC5,0x00,0x00,0x4F,0x04, - 0x55,0x55,0x40,0x04,0x45,0x4E,0x64,0x01,0x70,0x50,0x0C,0xD0,0x01,0xFF,0xC0,0xCF, - 
0xCF,0x18,0x4C,0x40,0x00,0xC4,0x80,0x05,0x8F,0x11,0x88,0x40,0xF0,0x0F,0x90,0xD8, - 0x41,0xF4,0x0F,0x48,0xDF,0x7C,0x74,0x10,0x00,0x77,0x04,0x07,0x07,0x44,0x0D,0xF4, - 0x04,0xC4,0x00,0x11,0xCF,0x0C,0xFC,0xCC,0x0F,0xC0,0x04,0x10,0x81,0xF8,0x00,0x0C, - 0x4F,0x0B,0x40,0xF0,0x40,0x4F,0xEC,0xDF,0xFF,0x04,0x00,0x80,0x00,0x88,0x88,0x88, - 0xF8,0x04,0x0F,0x0F,0xDC,0xCF,0xC0,0xC2,0x46,0xC4,0x64,0xCC,0x00,0xC0,0x4E,0x10, - 0x04,0xCF,0x88,0x08,0x13,0xF0,0x0C,0x40,0x00,0xF8,0x44,0x40,0x44,0xC4,0x40,0x44, - 0x8F,0xFF,0x4D,0x88,0x80,0x81,0x80,0x0C,0x01,0x18,0x48,0x04,0x4C,0x04,0x44,0x40, - 0x41,0xFC,0x00,0x65,0x02,0xF0,0x04,0xCF,0x04,0x1D,0xCD,0x01,0x88,0x08,0x85,0xF0, - 0x4F,0x54,0x4C,0x40,0x40,0x00,0xC4,0x84,0x46,0x44,0x4F,0x74,0xFC,0x4F,0xC7,0x00, - 0x4F,0x0F,0x74,0xCF,0x66,0xD7,0xC4,0x04,0x84,0x00,0x94,0x00,0xD0,0x40,0x54,0x0E, - 0xFC,0x40,0x4F,0x60,0x44,0x0F,0x44,0x74,0x7F,0x44,0xF7,0x44,0x44,0x43,0x50,0x40, - 0x11,0x00,0x08,0x00,0x44,0x77,0xCC,0x64,0xF0,0x45,0x4F,0x70,0x5F,0x47,0x40,0x0F, - 0x20,0x47,0xC6,0x00,0x14,0xD0,0x48,0x40,0x41,0x01,0x74,0x04,0x70,0x44,0x46,0x4F, - 0xCC,0xF4,0xF0,0x44,0xE4,0x44,0x00,0x44,0x44,0xF8,0x04,0x10,0x84,0x08,0x47,0xFE, - 0x44,0x40,0x40,0x04,0xEF,0x50,0x04,0x47,0x40,0x70,0x00,0x00,0x11,0x37,0x4C,0x47, - 0xF0,0x04,0xF0,0x74,0x44,0x01,0x01,0x46,0xF0,0x74,0xF4,0x14,0x77,0x44,0x41,0x77, - 0x00,0x44,0x1F,0x40,0x44,0xF0,0x0F,0x79,0x19,0x81,0x04,0x18,0xF0,0x37,0x75,0x44, - 0x00,0x49,0x08,0x84,0x10,0x01,0xC4,0xF4,0x01,0x04,0xFE,0x4F,0xF1,0x1F,0x0F,0x80, - 0x04,0x07,0xDC,0xF0,0xF0,0xFF,0xF5,0xCF,0xF0,0x01,0x08,0xF0,0x8D,0x08,0x0C,0x07, - 0x84,0x08,0x4F,0xF0,0xFF,0x84,0x00,0xFF,0xF7,0x40,0xFF,0xF8,0x0D,0x0F,0x04,0x00, - 0x4F,0x0F,0x90,0x70,0x51,0x67,0x63,0x07,0x44,0xF4,0x0F,0x4C,0xCF,0x40,0xF4,0x44, - 0xD4,0xFF,0x4F,0x88,0x08,0x0F,0xD0,0x44,0x04,0xFC,0x00,0xEF,0xF4,0x10,0x50,0x0C, - 0x44,0xD4,0xFC,0x44,0x8F,0x88,0x03,0xCC,0x40,0x4D,0x44,0x04,0xF0,0xF4,0x44,0x0F, - 0x44,0x60,0x40,0x4F,0x34,0xFC,0x44,0x44,0xFF,0xFC,0x0F,0x84,0x3C,0x4F,0xEF,0x04, - 
0x44,0xC0,0xD4,0x07,0x4F,0x17,0x4E,0x06,0x40,0x44,0x44,0x45,0x82,0xF4,0x44,0xF4, - 0xF4,0xF0,0x88,0x88,0xD8,0x04,0x0C,0x40,0xF0,0xC0,0x40,0x44,0x4F,0x61,0x0F,0xF4, - 0x0F,0xC0,0xF0,0x00,0xF0,0x00,0x40,0x14,0x80,0x48,0x58,0x4F,0x44,0x00,0x0F,0x04, - 0x7D,0x44,0x04,0x4F,0xF4,0x0C,0x44,0x00,0x44,0x44,0xC0,0x44,0x04,0x4F,0x44,0x44, - 0x04,0x0F,0x44,0xC7,0x40,0x0D,0x45,0x00,0x04,0x00,0x88,0xC8,0x04,0x4C,0x44,0xD0, - 0x00,0x40,0x04,0x00,0x00,0x44,0x04,0x44,0x04,0x4F,0x44,0x40,0x00,0xFF,0x44,0x44, - 0x00,0x04,0x42,0x44,0x40,0x08,0x04,0x44,0x44,0xCC,0xC4,0x44,0x4C,0x44,0x55,0x4D, - 0xF4,0x0F,0x06,0x44,0xF4,0x54,0x4F,0x00,0x01,0x08,0x48,0x5D,0x4C,0x44,0x4C,0x44, - 0xFC,0xD4,0x40,0x0F,0xF4,0xC4,0x44,0x04,0x80,0x40,0x44,0x44,0x44,0x51,0x40,0x40, - 0x40,0x44,0x60,0x40,0xF4,0x70,0x07,0x40,0x04,0x40,0x04,0xCF,0x40,0x44,0x40,0x04, - 0x44,0x41,0x44,0x44,0x04,0x07,0x45,0x44,0x05,0x0D,0x0C,0x81,0x04,0x00,0x44,0x45, - 0x0F,0x70,0x50,0x40,0x04,0x40,0x44,0x04,0xFF,0xC0,0xFF,0xF4,0x8F,0xDF,0xFF,0xF0, - 0x00,0xFC,0xDF,0x09,0xF0,0x78,0x0C,0x04,0xFF,0xD4,0xF0,0x40,0x07,0x4F,0xC2,0x40, - 0xC9,0xD0,0xFC,0xF0,0xFF,0xF0,0x1F,0xF4,0x00,0xE4,0xF0,0xF2,0xFF,0xFF,0x9F,0x18, - 0x88,0xFF,0xFF,0x4F,0x04,0x88,0x46,0x07,0x0F,0xF0,0xFF,0x0F,0xFF,0x00,0x80,0xF8, - 0x46,0x3F,0xFF,0xFF,0x10,0x44,0x4F,0x04,0x4F,0xFF,0x07,0x24,0x44,0x10,0x90,0xFF, - 0xF4,0x47,0xFF,0x77,0x7F,0x74,0x06,0x6F,0x4D,0x14,0x70,0x07,0x8F,0x8F,0x08,0x91, - 0xF0,0x41,0x1F,0x79,0xF7,0x7F,0x74,0xF7,0xF4,0x14,0xC8,0x0F,0x28,0x0D,0x88,0x40, - 0x01,0x0D,0xCF,0x80,0x07,0x04,0x1C,0x00,0xF4,0x40,0x08,0x40,0xFF,0xFF,0x9F,0xFF, - 0xDF,0x08,0xF4,0x47,0x8F,0xF9,0x74,0xDF,0xFD,0x10,0xF4,0x04,0xC4,0x20,0x44,0x09, - 0x9F,0xC6,0xF7,0x48,0x44,0x04,0x47,0xFD,0xF0,0x40,0x04,0x01,0xF0,0x4C,0x04,0x84, - 0x80,0x08,0x00,0x04,0x4C,0x44,0xCD,0x40,0xFF,0x44,0x4D,0x88,0x88,0x88,0x48,0x40, - 0x74,0xCC,0x44,0x44,0xF5,0xF4,0xCF,0x44,0xF4,0xF0,0xE0,0xF5,0x0F,0x12,0x81,0x83, - 0x82,0x00,0x18,0x54,0x44,0xC4,0x04,0x44,0x04,0x4C,0x04,0x44,0xF4,0x44,0x00,0x44, - 
0x93,0x00,0x88,0x08,0x1F,0x0D,0xD4,0x34,0x4F,0x00,0x06,0x47,0x44,0xF1,0x70,0x40, - 0x44,0x88,0x00,0xFC,0x00,0x50,0x64,0x4F,0x70,0x04,0x1F,0x7F,0x71,0xD5,0x40,0x5C, - 0x04,0x45,0x4C,0xEC,0xF4,0x00,0x80,0x0C,0x44,0x44,0x4C,0x44,0x44,0x04,0xC4,0x48, - 0x44,0x44,0x40,0xC0,0x40,0xFC,0x0F,0x05,0x47,0x05,0x40,0x00,0x48,0x80,0x81,0x04, - 0x00,0x44,0x40,0x40,0xF4,0xC0,0x0F,0xF0,0x40,0x14,0x08,0x41,0x40,0x04,0x44,0x44, - 0x44,0x0D,0x05,0x74,0x44,0x34,0x41,0x00,0x01,0x44,0x44,0x40,0xC0,0x04,0xF4,0x44, - 0x46,0x07,0x00,0x40,0x44,0x06,0x45,0x45,0x44,0x04,0x44,0xD0,0x06,0x07,0x77,0x00, - 0x04,0x44,0x80,0x44,0x50,0x40,0xF1,0x40,0x40,0x17,0x44,0x07,0x04,0x90,0xF0,0x4F, - 0xF4,0x1E,0xF0,0xFF,0x8F,0xF0,0x00,0x8F,0xF4,0xFF,0x1F,0xF0,0x40,0x10,0xF0,0x4F, - 0x80,0xFF,0x84,0x0F,0x08,0x14,0x0F,0x84,0xF8,0x00,0xCC,0x4F,0xFC,0xFD,0xF8,0x40, - 0x44,0x44,0x74,0x88,0x44,0x70,0xF0,0x18,0x81,0x70,0x04,0xD7,0x0F,0x80,0xF8,0x24, - 0x47,0x58,0xFF,0x30,0x00,0x44,0x4F,0x45,0x00,0x40,0xFD,0x50,0x44,0xC0,0x44,0x78, - 0xF4,0x7F,0x01,0x44,0x44,0x44,0x44,0x44,0x04,0xFF,0x5F,0x2D,0x7F,0xF9,0xFF,0x97, - 0x1C,0x14,0x0F,0xF8,0x44,0xCF,0x44,0xF0,0x88,0x80,0xF8,0xFD,0x80,0x4F,0x0F,0x88, - 0x0F,0x44,0xC4,0xFF,0x04,0x74,0xF4,0x04,0x44,0xC4,0xF4,0xFF,0xCF,0x00,0x00,0x0E, - 0x50,0x17,0x44,0x44,0xF7,0x4F,0x04,0x14,0xC3,0xFC,0x44,0x4F,0x44,0xF7,0x54,0x74, - 0x57,0x77,0x40,0x44,0x5D,0x08,0x07,0x44,0x51,0x74,0x09,0xFF,0x01,0x3F,0x01,0xDD, - 0xF8,0x9C,0x4F,0x40,0x1C,0xDF,0x11,0xF0,0xB1,0xF4,0xF7,0xF1,0x0F,0xF4,0x40,0xF0, - 0x08,0x44,0x94,0xFF,0xF4,0x08,0xD0,0x94,0xF4,0x40,0x70,0x54,0x85,0x0A,0x0F,0x01, - 0x1C,0x40,0x04,0x04,0x47,0x47,0x07,0x47,0x18,0x98,0x84,0xFF,0x7C,0x20,0xD1,0x00, - 0x90,0x0F,0x4C,0x6B,0xF0,0x7F,0xFF,0x40,0xFF,0xD0,0x04,0xF4,0xF4,0x80,0x4F,0x74, - 0xFF,0x1D,0xF9,0xF4,0xCD,0xFF,0x0F,0x00,0x7F,0x1F,0x80,0xF4,0xFF,0x40,0x0C,0x47, - 0xF7,0x00,0x40,0x04,0x7F,0x4F,0xD4,0x4F,0x00,0x14,0xE1,0xE4,0x70,0x40,0xFD,0x44, - 0x44,0xF0,0xF8,0x08,0xC0,0x00,0xFF,0xC0,0xCC,0x0C,0x44,0xFF,0xFF,0x00,0xD1,0x04, - 
0xCF,0x40,0x4D,0x80,0x04,0xFC,0xC0,0x4C,0xD1,0x40,0xF4,0x40,0x0D,0xD4,0xF0,0xF8, - 0x8C,0x88,0x88,0x08,0x40,0x4C,0xC4,0xFD,0x45,0x4F,0xDC,0x4F,0xD0,0xD4,0xFF,0x04, - 0xFF,0x04,0xF4,0xFD,0xDF,0xDF,0x44,0xF4,0x10,0x44,0xC4,0x40,0x40,0x81,0x08,0x80, - 0xF4,0xFF,0x44,0x45,0x40,0x91,0xF4,0x00,0x0F,0x4B,0xF4,0x00,0xF4,0x4D,0xE0,0x54, - 0x01,0xFF,0x7F,0xC4,0xFF,0xDF,0xFF,0xF4,0x0F,0x84,0x10,0x18,0x88,0x88,0x88,0xC0, - 0x54,0xD0,0x6D,0x44,0xF4,0x14,0xFF,0x04,0x04,0x0D,0x4F,0x5F,0x00,0xFF,0xC4,0xF0, - 0x0F,0xF4,0x02,0x39,0x1C,0x81,0x88,0x80,0xC6,0x3F,0x5F,0x47,0x4F,0xC4,0x7F,0x44, - 0x44,0x40,0xF4,0x7D,0x44,0xCC,0x44,0x44,0xF0,0x50,0xC4,0x00,0x4D,0x4F,0xCF,0xF0, - 0x7F,0x14,0x04,0x9C,0x89,0x80,0x88,0x88,0x7F,0xF4,0x4D,0x5F,0xF4,0x7F,0x04,0x44, - 0xFD,0x50,0x44,0x0F,0xFF,0x44,0xF0,0xFF,0x04,0x44,0xF4,0x04,0x04,0x7F,0x44,0xF0, - 0x18,0xC0,0x08,0xF8,0x45,0xC0,0x05,0x44,0x02,0x04,0x07,0x64,0x00,0x40,0x13,0xF7, - 0x44,0x03,0x47,0x74,0x4F,0x7F,0x5F,0x4D,0x80,0x74,0xF4,0x04,0x37,0x45,0xF0,0x74, - 0x40,0x7F,0x52,0xF4,0x0F,0x74,0x04,0x17,0x5F,0x54,0xD5,0xF7,0x40,0x04,0x24,0x07, - 0x24,0x47,0x00,0x10,0x40,0xC7,0x44,0x0F,0x77,0x44,0x0F,0x05,0x32,0x47,0xC0,0x40, - 0x40,0x04,0x07,0xF0,0xFC,0x74,0x4F,0x47,0x44,0x47,0x44,0x87,0x04,0x07,0x04,0x4F, - 0x40,0x7C,0x40,0x75,0x7F,0xFF,0x55,0x00,0x81,0x08,0x08,0x04,0x04,0x00,0x00,0x44, - 0x04,0x50,0x44,0x4F,0x44,0x44,0x40,0x04,0x44,0x07,0xC1,0x77,0x47,0x47,0x57,0xF5, - 0x47,0x40,0x04,0x68,0x5F,0x8F,0xFF,0x8F,0xFF,0x0F,0xC5,0x8F,0x48,0xDF,0x10,0xF1, - 0xF4,0x4C,0xCC,0x41,0x40,0x4F,0x48,0x1F,0x27,0xD5,0x0F,0x86,0x01,0x14,0xC0,0xFE, - 0xFF,0xC4,0x00,0xF4,0xFF,0xF4,0xF8,0x0D,0xC4,0xFC,0x1C,0x00,0x4F,0xD4,0x04,0x05, - 0x45,0xD4,0x44,0x8C,0x81,0x88,0xC8,0xCF,0x7C,0xC4,0x04,0x4F,0xF0,0x44,0x4F,0xFD, - 0x4F,0x51,0x0F,0x44,0xF4,0x0F,0x04,0x44,0xF0,0xDF,0xF5,0xFD,0x0F,0xB8,0x01,0x81, - 0x88,0x08,0x88,0x00,0x4C,0xF4,0x4D,0x44,0xF4,0x40,0xF4,0x04,0xF4,0xD0,0xFF,0xC4, - 0x00,0x0F,0x9C,0xFD,0xFF,0x4D,0xFF,0x7C,0x5F,0xFF,0x0F,0xFE,0xFF,0xFC,0x1F,0x80, - 
0x11,0x88,0x80,0x88,0x88,0x40,0xFF,0xFF,0xF4,0xF4,0x5C,0xFD,0x44,0x44,0xCC,0xFF, - 0xF4,0x4D,0x00,0xF0,0xFD,0x40,0x45,0x4C,0xF4,0x40,0x44,0x0C,0x10,0x88,0x80,0x98, - 0x88,0x88,0x88,0x10,0x4D,0xFC,0x44,0xF0,0x00,0x4C,0x04,0x04,0xC4,0x6F,0xF4,0x17, - 0x44,0xCC,0xC4,0x4C,0xF4,0xC4,0xDF,0xFC,0x4F,0x00,0xF4,0x54,0x44,0x04,0x01,0x88, - 0x80,0x88,0x40,0xF0,0xD7,0x50,0xC0,0x4F,0xC4,0x07,0x40,0xFD,0x04,0xFF,0x44,0x20, - 0x4D,0x7C,0x44,0xC0,0xFF,0x0C,0x4F,0xDD,0x4C,0x0F,0x04,0x4F,0x4F,0x70,0x44,0x54, - 0x7F,0xDF,0xF4,0xF0,0xFF,0x5F,0xCD,0x4C,0x4C,0x81,0x19,0x80,0x08,0x18,0x48,0x0D, - 0x07,0x54,0x4F,0x4F,0xD4,0x44,0xC4,0xFC,0x0F,0x4D,0x40,0x4F,0x44,0x77,0x44,0x48, - 0xFC,0xD0,0x44,0x40,0xD4,0x4C,0xE0,0x47,0x44,0x4C,0x4F,0x2F,0x48,0x01,0x80,0x80, - 0x88,0x08,0x80,0x05,0x40,0xC4,0x7C,0xF4,0x0C,0xD4,0x4E,0x77,0x04,0xC4,0x0F,0x4C, - 0xCC,0x46,0x44,0x4D,0x05,0x4C,0xFF,0x44,0x04,0x40,0x4D,0x45,0x40,0x0B,0xC0,0xC7, - 0xC4,0x41,0x88,0x88,0x08,0x48,0x40,0x05,0x44,0x4C,0x0C,0x00,0x4F,0x40,0x44,0x4C, - 0x46,0x00,0x40,0xD4,0xF4,0x40,0xC4,0x74,0x04,0x44,0x44,0x45,0xEF,0x4F,0x40,0x40, - 0x00,0x1C,0xC8,0xC0,0x47,0x44,0x04,0x40,0xA4,0x7F,0xD0,0x4C,0xC4,0x07,0x44,0x4F, - 0x04,0x44,0x7F,0x04,0xD0,0x70,0xFC,0x0F,0x4F,0x47,0x80,0x80,0x18,0x78,0x4D,0x44, - 0x78,0x4D,0xF6,0x70,0x40,0x46,0xF0,0x4E,0x0C,0x0F,0x07,0xC4,0x4F,0x47,0x00,0x08, - 0x35,0x5F,0x2C,0x3F,0x44,0x40,0x47,0x50,0x54,0x47,0x07,0x41,0x04,0x47,0x54,0x47, - 0x4F,0x40,0x16,0x44,0xC0,0x00,0x44,0x44,0x04,0x48,0x44,0x44,0x4F,0x40,0x75,0x00, - 0x44,0x74,0xC5,0x04,0x40,0xF7,0x47,0xFC,0x00,0x44,0x99,0x7F,0xFF,0xF0,0x40,0xFF, - 0x40,0x40,0xF0,0x04,0xCF,0x04,0x38,0xF2,0xF0,0x40,0xD4,0x7F,0xD5,0x80,0xD4,0x0F, - 0x00,0xFF,0x44,0x4F,0x04,0x84,0x2F,0x40,0x09,0x4F,0x7F,0xF4,0x47,0x44,0x04,0x42, - 0x44,0x77,0x24,0xF0,0x11,0x80,0x0F,0x01,0xFF,0x84,0x44,0xF4,0xF0,0xF0,0x4F,0xF4, - 0xF4,0x04,0xFF,0xF0,0x84,0xF4,0x97,0xF4,0x0F,0x44,0x00,0x74,0xF4,0x40,0xFF,0x40, - 0xF0,0xD4,0xFD,0x0D,0x00,0xF0,0x4F,0xFC,0x04,0x4F,0x00,0xFC,0x10,0x54,0x40,0x44, - 
0x5B,0x2D,0xF0,0x9F,0xFF,0x00,0xFF,0xCC,0xFC,0x44,0x46,0x88,0x00,0x7F,0x66,0x20, - 0x0C,0xDD,0x44,0x7F,0x64,0x4F,0x4F,0xFF,0x66,0xF4,0xEF,0x00,0x80,0x00,0xC5,0xF2, - 0x4F,0x04,0xF4,0xF4,0x0F,0x40,0xF0,0xF4,0xE2,0x44,0xDF,0x0F,0x40,0x26,0x09,0x18, - 0xF0,0xF7,0x43,0x40,0x70,0xB4,0x0C,0xF0,0x40,0x8D,0x88,0x88,0x60,0x6C,0x74,0xF7, - 0xC0,0x05,0x6F,0x3F,0x3F,0x24,0x04,0xFF,0x4D,0x24,0x0F,0xEF,0x46,0x4F,0x04,0x0C, - 0x14,0x18,0x0F,0xF4,0x77,0x44,0x4C,0x03,0xF6,0x00,0x44,0xFF,0x47,0x00,0xF0,0x40, - 0x46,0x07,0x00,0x81,0x4C,0x74,0x00,0x4F,0x44,0x26,0x4F,0x40,0xF5,0x64,0x41,0x0F, - 0x14,0x00,0x07,0x76,0x74,0x44,0x04,0x40,0x00,0x04,0x00,0x07,0xF4,0xC5,0xFF,0x04, - 0x07,0x40,0x04,0x0F,0xD4,0x40,0x44,0x04,0x0F,0xFF,0x1F,0xF4,0xF7,0x02,0x1F,0xFF, - 0x4F,0x70,0x44,0x00,0xFF,0xFC,0xF0,0x50,0x0C,0x44,0xFF,0xF1,0x04,0xF0,0xF0,0xF3, - 0x44,0x44,0x00,0x7F,0x0F,0xFF,0xFF,0x87,0xF0,0x40,0xCF,0x13,0x45,0x4F,0xF4,0x54, - 0x18,0x88,0x40,0x4F,0xFC,0x40,0x4C,0xFF,0xFF,0xF0,0x40,0x4F,0x50,0x04,0x4F,0xFF, - 0x9D,0x11,0x91,0x00,0x88,0x0D,0xF4,0xF0,0x7F,0xFD,0xF4,0xF4,0x44,0x64,0x4D,0xFF, - 0xC0,0x40,0x5C,0xF4,0xF0,0xF0,0x45,0x24,0x4F,0x64,0xF4,0xF0,0x4C,0x0F,0xFF,0x48, - 0x11,0x89,0x80,0x81,0x18,0x88,0x80,0xF4,0x0E,0xF4,0xC5,0xFF,0x5F,0x44,0x00,0x02, - 0xD4,0xC4,0x4F,0x05,0x44,0x45,0x44,0xF5,0xFF,0xFE,0x0F,0x00,0xCD,0x4F,0xFC,0x4D, - 0x08,0x4D,0xC5,0x54,0xF0,0x44,0x4F,0xF5,0xFE,0xF4,0x7F,0x46,0x04,0x34,0x84,0xD3, - 0x08,0x11,0x81,0x80,0x88,0x88,0x08,0x88,0x80,0xF6,0x04,0xFF,0x04,0x04,0xC4,0x41, - 0xF4,0x03,0x40,0x04,0xD4,0x5F,0x40,0x64,0x40,0xCD,0x45,0x80,0xFF,0x44,0xFF,0x08, - 0xFF,0xFF,0xCC,0x0D,0xCF,0x48,0x5C,0x4F,0xFF,0xF0,0x8F,0x00,0x10,0x00,0x11,0x10, - 0x88,0x98,0x88,0x98,0x88,0xC0,0x60,0x44,0x40,0x04,0x4D,0xDF,0x44,0x40,0x44,0x75, - 0xF4,0xD4,0xF0,0x4C,0x40,0x44,0x14,0xF0,0x44,0xF4,0x07,0xF4,0x00,0x50,0x74,0x70, - 0x65,0x4F,0x04,0xF9,0x4F,0x44,0xF4,0xF4,0x7F,0xF4,0xF4,0x03,0x10,0x01,0x01,0x00, - 0x08,0x08,0x27,0x45,0xF4,0xF1,0xF4,0x04,0x44,0x4F,0xF5,0x74,0x0F,0x0F,0x44,0x74, - 
0x5F,0xD0,0x04,0x74,0x46,0x44,0x04,0x5F,0xCC,0x47,0xC4,0x44,0xF4,0x4F,0x4C,0x00, - 0x93,0x00,0xF5,0x44,0x05,0xD4,0xF5,0x5F,0x4C,0x4F,0x44,0x40,0x10,0x11,0x01,0x80, - 0x18,0x11,0x58,0x21,0x01,0x05,0x80,0x40,0x0F,0x44,0x4D,0x41,0xD4,0x40,0xF4,0xF0, - 0x44,0x0C,0x44,0x00,0x44,0x47,0x04,0x44,0x40,0x74,0x3F,0xE4,0x74,0x4F,0xD5,0x4F, - 0x3F,0xE7,0x40,0x0C,0x46,0xF5,0x74,0x7F,0xC4,0x10,0x75,0xF4,0xFF,0x44,0x19,0x01, - 0x18,0x4D,0x08,0x8C,0x88,0x01,0xC0,0x05,0x54,0x40,0xFD,0x44,0x48,0xF4,0x0F,0x04, - 0x45,0x40,0x64,0xF4,0x4C,0xC4,0xC4,0x47,0xD4,0x41,0x4F,0xC4,0x40,0xF0,0x44,0x55, - 0xF0,0x74,0x04,0x14,0x40,0x7D,0x7F,0x4D,0x0F,0x50,0x0C,0x00,0x51,0x80,0x10,0x81, - 0x08,0x04,0x44,0x54,0x07,0x52,0xD4,0x04,0x40,0x4D,0x04,0x44,0x14,0x70,0xF4,0xF4, - 0x44,0x07,0x54,0x44,0x44,0x1F,0x41,0x40,0x60,0x71,0x45,0xF4,0x7C,0x50,0x40,0xF7, - 0xF4,0x71,0x40,0x44,0x1C,0x19,0x00,0x81,0x80,0x04,0xF4,0x01,0x77,0x47,0xF4,0x4C, - 0x44,0x00,0x0F,0xD4,0x47,0x70,0x40,0x44,0x4C,0x22,0x44,0x04,0xFF,0xC4,0x40,0x74, - 0xF4,0x05,0x84,0x45,0x44,0x64,0x40,0x40,0x80,0x01,0x00,0x04,0x81,0x00,0x08,0x54, - 0x4F,0x40,0x4F,0x40,0x60,0x00,0x74,0x0F,0xCD,0x44,0x44,0xF4,0x00,0x44,0x07,0x44, - 0x4D,0x77,0x44,0x04,0x84,0x81,0x4D,0x45,0x00,0x70,0x04,0x44,0x45,0x74,0x40,0x00, - 0x10,0x77,0x40,0x44,0x00,0x40,0x44,0x40,0x54,0x70,0x00,0x00,0x40,0x77,0x44,0x55, - 0x44,0x00,0x40,0x40,0x05,0x45,0x04,0x44,0x44,0x40,0x00,0x02,0x44,0x75,0x04,0x00, - 0x04,0x44,0x17,0x00,0x44,0x47,0x02,0x40,0x44,0x47,0x00,0x47,0x44,0x04,0x10,0x14, - 0xFF,0xF8,0x48,0x90,0x04,0x00,0x44,0x40,0x40,0x4F,0x04,0xD4,0xD5,0x4F,0x74,0x4F, - 0x40,0xD4,0x40,0xFE,0xD4,0x44,0x4F,0x07,0x47,0x10,0x45,0x04,0xD0,0x50,0x04,0x74, - 0x74,0xFF,0xEF,0xCF,0x10,0x0F,0x40,0x10,0x00,0x16,0x01,0x64,0xD7,0xF0,0x08,0x74, - 0x85,0xFC,0x0F,0x8F,0xF4,0x9F,0xD4,0x44,0x40,0x88,0x54,0x4F,0x47,0xC8,0x00,0x47, - 0x84,0x44,0x45,0x44,0x00,0x5D,0x40,0x76,0x14,0xD7,0xF9,0xF4,0x00,0x17,0x47,0xF0, - 0xA5,0x48,0xF4,0x47,0x44,0xF4,0xF4,0xE1,0x04,0xFF,0x8F,0xEE,0x87,0xF4,0x00,0x14, - 
0x84,0x44,0x04,0x44,0x04,0xF8,0x07,0xD0,0x04,0xD4,0x80,0x40,0xC4,0x40,0xC4,0x44, - 0x44,0x44,0xC4,0x8C,0x47,0x40,0xC4,0xF0,0xFC,0xF0,0xCD,0x1C,0xCC,0xDC,0x40,0xC0, - 0xC4,0x78,0x4D,0xCC,0x8C,0x4C,0x04,0xCC,0x0C,0x48,0x8F,0x34,0x0F,0x40,0x80,0x47, - 0xFF,0x4F,0x00,0x8C,0x80,0x4C,0x44,0x47,0x0F,0x04,0xFC,0xF0,0x40,0xC3,0xFC,0xFF, - 0xCF,0x01,0x48,0x44,0xCF,0x4F,0x60,0x40,0x4F,0x5F,0x0C,0x0E,0x84,0x47,0xF0,0x0F, - 0xF4,0xFF,0xE4,0x6E,0xCF,0x44,0x5F,0xC4,0x0F,0xF7,0x6E,0x07,0xF4,0xF7,0x44,0x80, - 0x90,0x81,0x88,0x88,0x80,0xF8,0xC4,0x0F,0xD4,0xF8,0x04,0x44,0xFF,0xF5,0xFF,0xFF, - 0x74,0x46,0xDF,0x44,0xF0,0x0F,0x4F,0x40,0x4C,0xF4,0xFC,0xFC,0x50,0xF4,0x54,0x0C, - 0xFC,0xFF,0xF0,0x40,0x4F,0xE9,0x44,0xFE,0xDF,0xF4,0xC0,0x88,0x48,0x88,0x88,0x08, - 0xC4,0x40,0x0C,0xC0,0x44,0xF4,0x4F,0x4C,0x64,0x0D,0x00,0xF4,0xF4,0xFC,0x40,0x5F, - 0x04,0x04,0xF0,0xE0,0x74,0xDF,0x44,0x4C,0xC4,0xDF,0xF4,0x47,0xE7,0xF4,0xFC,0x4F, - 0xF4,0x84,0x91,0x88,0x08,0x88,0x80,0x88,0x80,0x88,0x08,0x00,0xF4,0x0F,0x0B,0x4C, - 0x4C,0xF4,0x45,0x4F,0xF4,0x0F,0x47,0xCF,0x04,0x00,0x4F,0xF4,0x7F,0x44,0x4C,0x64, - 0x04,0x4C,0xF4,0x64,0xEF,0x44,0x6F,0x0F,0xE4,0xF4,0xFC,0x40,0x14,0x90,0x81,0x88, - 0x88,0x80,0x08,0x88,0x88,0xCC,0x04,0xFC,0x00,0x4F,0xF4,0x40,0x0D,0x44,0x45,0xC4, - 0x0F,0x24,0xFE,0xFF,0x44,0xF4,0x4D,0x60,0xF4,0x45,0x54,0x4C,0xFF,0x46,0xC4,0x4F, - 0xFC,0x44,0x4D,0x0D,0x47,0xF7,0x4D,0x4F,0xF4,0xF4,0x74,0xF4,0xF2,0xF7,0x4C,0x00, - 0x04,0x40,0xD0,0x10,0x11,0x19,0x88,0x08,0x88,0x10,0x18,0xA8,0x70,0x7F,0xD4,0x70, - 0xFF,0xF4,0xFF,0x47,0x94,0xF0,0xF7,0xF5,0x44,0x4E,0x4E,0x06,0x4F,0x4F,0x66,0x47, - 0x44,0xF0,0x4E,0x44,0x40,0x47,0xF0,0x0D,0x40,0xC4,0x4C,0x0F,0x0F,0xF4,0x46,0xF6, - 0x44,0x44,0x44,0x74,0x44,0xF0,0x00,0x7F,0x40,0x67,0x04,0x41,0x44,0x02,0x00,0x99, - 0x01,0x81,0x80,0x08,0xC4,0x00,0x41,0xD4,0x0F,0x44,0x04,0x47,0x4D,0x40,0x7F,0xF4, - 0x44,0x4F,0xF4,0xC4,0x00,0x6F,0x00,0xF4,0xE4,0x4D,0x0C,0xDF,0x40,0x8F,0xD4,0x44, - 0xC4,0x4F,0x77,0xC4,0x74,0xF4,0x47,0xC6,0xF4,0xF4,0xDF,0x84,0x04,0x08,0x14,0x89, - 
0x88,0x08,0x88,0x08,0x88,0x40,0x45,0x74,0x40,0x05,0x4F,0x54,0xC7,0x40,0x44,0x77, - 0x74,0x4F,0x40,0x4F,0xD4,0x04,0x40,0xF4,0x30,0xD4,0xFF,0x00,0x44,0x04,0x04,0x44, - 0x4F,0x77,0x48,0x44,0xC0,0xFC,0xC7,0xC4,0xF4,0xC7,0x40,0x4C,0x47,0x44,0x04,0x7D, - 0x74,0x40,0x10,0x88,0x00,0x80,0x80,0x44,0x64,0x44,0x47,0x60,0x0F,0x76,0x49,0x7E, - 0x44,0x4E,0x07,0x4D,0x40,0x44,0xF4,0x5F,0x07,0x4C,0x09,0x44,0x0D,0x4F,0x6F,0x46, - 0x35,0x05,0x4F,0x45,0xCE,0x40,0xEC,0x0F,0x16,0x44,0x46,0x74,0x00,0x00,0x08,0x04, - 0xD4,0x74,0x47,0xE4,0x44,0x45,0x44,0x66,0x74,0xF4,0x04,0x0C,0xF4,0x04,0x44,0x42, - 0x7F,0x7F,0x04,0x45,0xC4,0x46,0x44,0x44,0x80,0x08,0x75,0x00,0x05,0x52,0x44,0x7C, - 0xFF,0x44,0x07,0x44,0x64,0x70,0x47,0xFD,0x74,0x14,0x04,0x41,0x00,0x45,0x04,0x47, - 0x74,0x00,0x64,0x46,0x70,0x74,0x44,0x74,0xF0,0x00,0x74,0x44,0x47,0xFF,0x76,0x73, - 0x44,0xE4,0x04,0x75,0x46,0x44,0x01,0x60,0x45,0x45,0xC4,0x60,0xC4,0x44,0x04,0x47, - 0x44,0x44,0x44,0x04,0x44,0x04,0x0F,0x80,0x72,0x04,0x40,0x44,0x07,0x44,0x60,0x4C, - 0x44,0x74,0x44,0x04,0x44,0xF4,0x88,0x90,0x4F,0x00,0x84,0x0C,0x0F,0x04,0x7F,0x88, - 0x08,0x44,0xE4,0x04,0x90,0x0F,0x00,0x0F,0x40,0x4F,0xCC,0x0C,0xF6,0x06,0x88,0x40, - 0x60,0x00,0x06,0x00,0x40,0xE0,0xFF,0xFD,0xC4,0xF0,0x44,0xE0,0xBF,0x85,0x88,0x04, - 0x88,0x88,0x40,0x44,0x0F,0x7C,0x40,0x74,0x40,0x44,0x04,0x40,0xFE,0x84,0x74,0xB4, - 0x44,0x04,0x0C,0x88,0x88,0x80,0x88,0xC0,0x34,0x40,0x04,0xC4,0xF4,0x44,0xF4,0x40, - 0x04,0x04,0x06,0x04,0xF0,0x4C,0x46,0x04,0x0C,0xC0,0x81,0x48,0xF8,0x4F,0x05,0x46, - 0x74,0x44,0x40,0x0F,0x04,0x00,0x00,0xC4,0xCE,0x04,0x00,0x0F,0x00,0x04,0x01,0x00, - 0x40,0x44,0x84,0x64,0x70,0x4A,0x4F,0x0F,0x06,0x44,0x14,0x07,0x74,0x04,0x4E,0x4E, - 0x44,0x45,0x7F,0xFF,0x7C,0x04,0x07,0x0F,0x04,0x8C,0x00,0x00,0x0C,0x48,0xF0,0x00, - 0x44,0x04,0x4F,0x44,0x61,0x0F,0x00,0xF0,0x04,0x06,0x1F,0x40,0xF8,0x44,0x44,0xF4, - 0x4C,0x02,0x44,0x40,0x4D,0x04,0x0F,0x40,0x74,0xC4,0xC0,0x00,0xF4,0x02,0x44,0x47, - 0x64,0xF4,0x44,0x44,0x77,0x44,0x00,0x4F,0x07,0x07,0xFF,0x14,0x44,0x44,0x04,0x70, - 
0x4D,0x04,0xF4,0xF7,0x04,0x00,0x75,0x0F,0x44,0x04,0x14,0x00,0xD4,0x70,0x47,0x76, - 0x42,0x44,0x40,0x4F,0x00,0x04,0x54,0x00,0x07,0x40,0x04,0x00,0x40,0x74,0xC0,0x44, - 0x00,0x44,0x00,0x44,0x4D,0x0F,0x7F,0x00,0x8F,0x03,0xF0,0x8F,0xCC,0xF7,0xF1,0xD7, - 0x43,0x04,0x04,0xF7,0x4F,0x54,0x8F,0x40,0x00,0x4F,0x00,0x0C,0xF7,0xF4,0xF0,0xF4, - 0xF4,0x4F,0x00,0xF8,0xF0,0x04,0x04,0xCC,0x04,0x4F,0x85,0x44,0xF4,0x48,0x74,0xCD, - 0xFF,0x01,0x4C,0x14,0x44,0xC8,0xC4,0x84,0x44,0x4D,0x40,0x45,0x04,0x44,0x40,0x84, - 0x41,0x47,0x44,0x74,0x44,0x04,0x8F,0xF4,0x0C,0x01,0x4C,0x89,0x98,0x04,0x40,0x40, - 0xC6,0xDF,0x4F,0x05,0x48,0x44,0x84,0x0F,0x4F,0x0D,0x04,0xF4,0xF4,0x14,0x04,0x48, - 0xFD,0x41,0x04,0x04,0xFC,0x44,0x99,0x88,0x88,0xC8,0x0C,0xD4,0x7F,0xCC,0x7F,0x44, - 0xC4,0x80,0x00,0x40,0x04,0x4F,0x04,0x48,0x40,0xC4,0x80,0xDF,0x44,0xF0,0xFF,0x1C, - 0x80,0x4C,0xF0,0x44,0xF0,0xBB,0x48,0x19,0xC4,0x44,0x4D,0xF7,0x88,0x44,0x04,0xFF, - 0x04,0x44,0x77,0x00,0x40,0x04,0xC4,0x15,0x4E,0x0E,0x00,0xF0,0x44,0x44,0x40,0x44, - 0x4C,0x14,0x40,0x44,0x47,0x47,0x8C,0xC4,0x07,0x47,0x70,0x44,0x07,0x77,0x44,0x4C, - 0x44,0x44,0x4F,0xF6,0xF4,0xF4,0x00,0x0E,0x80,0x44,0x46,0x6F,0x02,0x80,0x40,0xE0, - 0x44,0x0C,0x44,0x24,0xF4,0xC0,0x64,0x88,0x08,0xFF,0x44,0xC4,0x64,0xD8,0x04,0x44, - 0x0F,0x4F,0x40,0x44,0xAD,0x4F,0xF6,0xA1,0x88,0x44,0x04,0x46,0xE4,0x44,0x64,0x0F, - 0x0F,0x60,0xF2,0xC2,0xE4,0x47,0xF0,0x05,0x10,0x08,0x44,0x24,0x25,0x04,0x64,0x47, - 0x64,0xF0,0xF4,0x6F,0xF4,0x4C,0x44,0x80,0x08,0x20,0x00,0x04,0x00,0xEC,0x40,0x00, - 0x66,0x4F,0xF4,0x0E,0x4E,0x6E,0x4E,0x6E,0x4E,0xF5,0xFF,0x0F,0x02,0x07,0x08,0x70, - 0xF6,0x24,0x06,0x04,0x00,0x64,0x40,0x44,0x44,0x04,0xF4,0xE0,0xF0,0xFF,0x0F,0xFF, - 0x03,0x56,0x27,0x40,0x72,0x07,0xC0,0x74,0x4D,0x54,0x40,0x89,0x40,0x00,0x44,0x4F, - 0xCC,0xF2,0x40,0xE4,0x64,0xF4,0x00,0x08,0x0C,0x04,0x44,0x40,0x06,0x04,0x0E,0x6F, - 0x64,0x61,0x60,0xF4,0xCE,0x46,0x40,0x40,0x47,0x06,0x60,0x44,0x24,0x4C,0x70,0x64, - 0x44,0x44,0x40,0x02,0x00,0x47,0x00,0x70,0x00,0x08,0x47,0x44,0x46,0x46,0x4F,0x4C, - 
0x4F,0xFF,0x4C,0x1F,0x54,0x00,0x04,0x8F,0x11,0x01,0x4C,0xFD,0x01,0x40,0x40,0xC4, - 0x24,0x54,0x1F,0x40,0x04,0x40,0xD5,0xC1,0xF4,0xD4,0x74,0x44,0x8F,0x2F,0x4D,0xF1, - 0x40,0x34,0xF0,0x07,0xCF,0xF4,0xCF,0xC4,0xFF,0xFF,0x80,0xF0,0x4F,0x9F,0x41,0x4C, - 0x0C,0x00,0x81,0x61,0x1C,0xD1,0x1F,0xFC,0x30,0x20,0x0F,0x00,0xF0,0xFD,0x7F,0x40, - 0x01,0x47,0xF5,0x1F,0x10,0x5F,0x14,0x60,0x07,0x18,0x29,0x57,0xCF,0x02,0x40,0xF4, - 0x04,0x81,0x04,0x7F,0x10,0xF7,0x04,0xF3,0xF4,0x08,0x4D,0x88,0xC4,0x0D,0xF0,0x80, - 0x88,0xD4,0xFC,0x40,0x00,0xF4,0x08,0x88,0xBC,0xFF,0xF8,0x04,0xFF,0x44,0xFF,0x4F, - 0x44,0x9F,0xFC,0xF0,0x88,0x4D,0xF4,0x44,0x44,0x0F,0xFF,0x48,0x7F,0xF4,0x40,0x4D, - 0x44,0xDF,0x0C,0xCC,0x18,0x88,0x40,0x40,0xCF,0x57,0x4F,0x44,0xC4,0x47,0x4F,0xD5, - 0xDE,0xC0,0x80,0x00,0x54,0x5C,0x7C,0x40,0x4C,0x40,0xC4,0x84,0xE8,0xC4,0x44,0xF4, - 0x7F,0x4F,0xCF,0x5E,0xC4,0x88,0xC0,0x40,0x4D,0xC4,0xCD,0x00,0x0C,0x34,0x4C,0x88, - 0x08,0xC7,0x00,0x55,0x45,0x00,0x8F,0x02,0x04,0x47,0x08,0x4F,0x45,0x04,0x0D,0x48, - 0x74,0x85,0x46,0x10,0x71,0x85,0x47,0x84,0x47,0x07,0x40,0x01,0x4F,0xF1,0xF7,0x4F, - 0x41,0x1C,0x0F,0xFF,0x4D,0xD4,0x00,0x4F,0x83,0xF4,0x40,0x0D,0xD0,0x45,0x44,0x04, - 0x00,0x00,0x0C,0x04,0x00,0x44,0x40,0x4F,0x85,0x08,0x4D,0x10,0x15,0x47,0x40,0xF4, - 0x00,0x7F,0xC4,0x0F,0x4F,0x0F,0xD0,0x8C,0x88,0x4F,0x0C,0x9B,0x08,0xF4,0x47,0xF7, - 0x70,0x70,0xD7,0x74,0x40,0x05,0x44,0xCF,0xC4,0x4F,0x4F,0x40,0xCF,0x54,0x0E,0x0F, - 0xF0,0x40,0x4F,0xD0,0xFF,0xF4,0x81,0x00,0x44,0x44,0x40,0x00,0xC0,0x5C,0x40,0xD3, - 0x0F,0x4C,0x51,0x49,0xFC,0x00,0xC0,0xC0,0x40,0x44,0xC4,0xFC,0x4F,0x4F,0x4F,0x04, - 0x8B,0x80,0x44,0xD4,0x00,0x44,0x64,0x44,0x88,0x04,0x44,0x04,0x40,0xFD,0x04,0x44, - 0xF4,0xFC,0xD0,0x4F,0x4D,0xF4,0x4C,0x44,0x00,0x00,0x00,0x00,0xF0,0x00,0xC4,0xFF, - 0x4C,0x44,0xCC,0x40,0x44,0x70,0xCC,0x4D,0xF0,0x08,0x40,0x40,0x40,0x44,0x44,0xC7, - 0x4D,0x4C,0xF0,0xC0,0x84,0x4C,0x7F,0x00,0x4F,0xF4,0xC4,0x45,0x10,0xF4,0xD7,0xF0, - 0x00,0x04,0x04,0x50,0x40,0x04,0xD4,0x44,0x04,0x40,0x44,0xF0,0x44,0xF5,0x0F,0x04, - 
0x04,0xFF,0xF0,0xD0,0xF4,0x80,0xFC,0x7F,0x44,0xF4,0x00,0x48,0x4C,0x44,0xC5,0x88, - 0x88,0x4F,0x40,0x04,0xC0,0x00,0xCD,0x44,0xC4,0x4F,0x1D,0x88,0x08,0x08,0xC8,0xC0, - 0x45,0xC4,0xF0,0xFF,0xC4,0x44,0xCE,0x44,0x4C,0x47,0x4F,0xC0,0x8C,0x89,0x08,0x18, - 0x08,0x40,0xF0,0x80,0x44,0x04,0x4C,0x2C,0x0C,0x0C,0x80,0x88,0x00,0x00,0xF4,0x04, - 0x04,0x04,0x44,0x00,0x44,0xFC,0xCF,0x7C,0x44,0x01,0x01,0x80,0x40,0x40,0x0F,0x44, - 0x34,0x40,0x44,0xF5,0xC0,0x00,0xBF,0x4F,0xF0,0xD0,0x54,0xF4,0x4C,0x8D,0x08,0xC4, - 0xC4,0xF4,0xC4,0xF0,0x74,0x45,0x44,0x10,0x8C,0xCC,0x1C,0x00,0x80,0x67,0x07,0x0D, - 0xF0,0x40,0xF4,0x01,0xC4,0xFF,0x44,0x46,0x5F,0x00,0xCD,0x00,0x80,0x45,0x40,0x44, - 0x44,0x44,0x40,0x50,0x4F,0x04,0x4F,0x70,0x00,0x4C,0x2D,0xC0,0x40,0x6C,0x54,0x44, - 0xF0,0x04,0xC4,0x10,0x40,0x00,0x04,0x07,0x54,0xC5,0x04,0x42,0x50,0x40,0x44,0x0C, - 0x40,0x40,0x4C,0x45,0x44,0x77,0x47,0x40,0x40,0x00,0x48,0x00,0x44,0x8F,0x49,0x4F, - 0xFF,0x04,0x44,0x7C,0xFF,0x44,0x04,0x40,0x47,0xD4,0x54,0xFF,0x00,0xEF,0xFE,0xDF, - 0x4F,0x49,0xF4,0xC0,0x4F,0x44,0xF0,0x80,0x04,0x44,0x04,0x90,0x48,0x0F,0x44,0x70, - 0xF9,0x04,0x99,0x00,0x04,0x4D,0x70,0xF7,0x24,0x44,0x44,0x44,0x04,0x4C,0x14,0x00, - 0x44,0x04,0x04,0xF7,0x04,0x47,0x44,0x07,0x65,0xF4,0x04,0x44,0xF4,0xCD,0xF0,0x7F, - 0xFF,0x80,0x40,0x08,0xF4,0xF2,0xC0,0x44,0xF0,0x0F,0xF0,0x04,0x0F,0x00,0x44,0xF4, - 0x14,0xD0,0x0F,0xFF,0xF0,0xC4,0xC5,0x84,0x09,0x00,0x00,0x44,0x0C,0xF4,0x80,0x00, - 0x0F,0xCC,0x64,0x08,0x07,0xF4,0xF4,0x09,0x44,0x44,0x4F,0xF0,0x75,0x4F,0x07,0x7C, - 0x0F,0x84,0x00,0x04,0x04,0x40,0x00,0x47,0x74,0x81,0x00,0xF2,0xC0,0xF0,0xFF,0xF1, - 0x07,0x11,0x04,0x4F,0x34,0x44,0x74,0x17,0x81,0x00,0x04,0xF4,0x04,0x44,0x04,0x40, - 0x74,0x17,0x00,0x40,0x74,0x70,0x44,0x04,0x4F,0x00,0x4F,0x8F,0xFC,0x4F,0x30,0xF4, - 0xFC,0x90,0x4F,0x4C,0x4F,0x44,0x84,0x40,0x84,0x3F,0xF4,0xDD,0x4F,0x00,0x88,0xF4, - 0x0C,0x44,0x80,0x08,0x7C,0x47,0x8C,0x77,0x41,0xC4,0x44,0x45,0x04,0x07,0x00,0xD4, - 0x44,0x10,0x77,0x70,0x01,0xF7,0x10,0x10,0x40,0x11,0x10,0x28,0xF4,0x01,0x31,0xF8, - 
0x1F,0xF1,0xF4,0x0D,0x00,0x43,0xF0,0xF4,0x01,0x00,0x00,0x47,0xF1,0x4F,0xC0,0xF0, - 0x44,0x81,0x4D,0x0D,0x4D,0x9D,0x00,0xF4,0xF4,0x44,0x88,0x00,0xF5,0xF4,0x40,0x0F, - 0x0F,0x44,0xDC,0x0F,0x0D,0xCC,0x6F,0x4C,0xC4,0xD4,0x54,0x01,0x3C,0x48,0x08,0x08, - 0x04,0x00,0xD4,0xC7,0xF5,0xF4,0x7F,0xF4,0xFF,0x0F,0x0F,0x0F,0x0C,0x88,0x90,0x00, - 0x4E,0x48,0x54,0x54,0x40,0x00,0x43,0x0F,0xD5,0x4C,0xF5,0xC4,0x84,0x81,0x20,0x08, - 0x08,0x04,0xC4,0x71,0x04,0x74,0xD4,0x74,0x4C,0x01,0xFF,0xF4,0x55,0x43,0xFD,0x10, - 0xF4,0x08,0x04,0x88,0x88,0x89,0xFC,0x40,0xF0,0x00,0x4F,0x40,0x4F,0x00,0x00,0x04, - 0xF7,0x00,0x07,0xFF,0x70,0x54,0x0D,0x14,0x80,0x82,0x44,0x00,0xC4,0x4C,0xD4,0x44, - 0xC7,0x44,0xD7,0x4D,0x74,0x4C,0x10,0x08,0xC4,0x54,0x04,0xD1,0x44,0x40,0x48,0x4C, - 0x45,0x44,0x40,0xF4,0x40,0xC4,0x5C,0xC4,0x54,0x13,0x01,0x18,0x40,0x00,0x44,0xD6, - 0x74,0x54,0x40,0xFC,0x44,0x7F,0x44,0x00,0x04,0x40,0x00,0x50,0x4D,0x40,0x74,0xF7, - 0x8D,0x70,0x40,0x40,0x44,0x04,0xF7,0x10,0x05,0x40,0x05,0x41,0x41,0x44,0x04,0x50, - 0x07,0x40,0x57,0x44,0x40,0x44,0x07,0x44,0x00,0xF4,0x58,0x00,0x44,0x84,0xC8,0x41, - 0x54,0x31,0x44,0x00,0xF4,0x14,0x54,0x00,0xC1,0x0F,0xF4,0xF4,0x0F,0x10,0x88,0xFC, - 0x50,0x4D,0xF9,0x30,0x05,0x18,0x10,0x4B,0xF0,0xFF,0x44,0x00,0xF0,0x44,0xDC,0x4F, - 0x81,0x01,0xD0,0x80,0x0C,0x4F,0x8C,0x05,0x44,0x05,0xC4,0xCF,0xC1,0x00,0x4D,0xF7, - 0x0F,0x05,0x00,0x70,0x0C,0x04,0x04,0xD4,0x44,0x05,0x05,0x45,0x0D,0xF1,0x40,0x07, - 0x47,0x47,0x77,0x67,0x46,0x7F,0x74,0x44,0x47,0x44,0x57,0x74,0x77,0x77,0x45,0x44, - 0x7F,0x0F,0x00,0x80,0x44,0xF0,0x07,0xF7,0x47,0x75,0x70,0x04,0x74,0x47,0x44,0x54, - 0x44,0x77,0x77,0x47,0x00,0x14,0x01,0x55,0x47,0x44,0x44,0x05,0x04,0x50,0x04,0x47, - 0x70,0x57,0x00,0x47,0x47,0x46,0x40,0x4F,0x74,0x77,0x10,0x83,0x70,0x44,0x54,0x60, - 0x44,0x40,0x44,0x04,0x14,0x00,0x44,0x76,0x40,0x74,0x04,0x04,0x10,0x11,0x47,0x54, - 0x46,0x57,0x00,0x4C,0x40,0x14,0x77,0x5D,0x75,0x07,0x77,0x40,0x47,0x77,0x74,0x77, - 0x44,0x04,0x44,0x74,0x40,0x77,0x04,0x14,0x10,0x05,0x10,0x42,0x47,0x47,0x71,0x47, - 
0x77,0x60,0x01,0x44,0x77,0x04,0x07,0x74,0x44,0x45,0x07,0x44,0x40,0x74,0x00,0x00, - 0x10,0x40,0x11,0x00,0x74,0x57,0x44,0x44,0x44,0x45,0x64,0x44,0x00,0x74,0x64,0x77, - 0x74,0x75,0x44,0x01,0x40,0x74,0x40,0x67,0x74,0x45,0x54,0x74,0x54,0xD5,0x74,0x47, - 0xF4,0x74,0x44,0xE7,0x44,0x11,0x14,0x00,0x44,0x46,0x77,0x45,0x54,0x05,0x14,0x47, - 0x64,0x44,0x00,0x11,0x74,0x67,0x70,0x40,0x07,0x44,0x44,0x44,0x74,0x54,0x57,0x14, - 0x44,0x1F,0x00,0x44,0x15,0x44,0x47,0x71,0x41,0x71,0x45,0x47,0x46,0xF4,0x07,0x80, - 0x88,0x88,0x88,0x88,0x88,0x88,0x88,0x80,0x88,0x88,0x80,0x88,0x88,0x08,0x80,0x88, - 0x88,0x88,0x88,0x88,0x88,0x88,0x88,0x88,0x88,0x88,0x88,0x80,0x88,0x88,0x88,0x88, - 0x88,0x88,0x80,0x88,0x88,0x88,0x80,0x88,0x88,0x88,0x88,0x88,0x08,0x88,0x88,0x88, - 0x88,0x88,0x88,0x88,0x88,0x80,0x88,0x88,0x80,0x88,0x88,0x88,0x88,0x88,0x88,0x80, - 0x88,0x88,0x88,0x88,0x88,0x88,0x88,0x88,0x88,0x88,0x88,0x0F,0x4F,0x0F,0x40,0x44, - 0x00,0x48,0xDC,0x00,0x04,0x44,0x57,0x41,0xDF,0x00,0xFF,0x80,0x0D,0x48,0x44,0xD4, - 0xC5,0x48,0x40,0x54,0xDD,0x7F,0x44,0x0F,0x87,0x4F,0x78,0x74,0xF1,0x44,0x44,0x4C, - 0x80,0x51,0x70,0x55,0x47,0x4F,0x4F,0x0F,0x40,0x04,0x4F,0x04,0x00,0x4F,0xD4,0x8F, - 0xC4,0x14,0x4F,0xC4,0x77,0x00,0x44,0xD1,0x4C,0x4F,0x40,0x45,0xFF,0x44,0xFC,0xC4, - 0xF4,0x44,0xF0,0x41,0x40,0x4F,0x4F,0x4E,0x44,0x07,0xFF,0x40,0x08,0x40,0x04,0xE4, - 0xFF,0x44,0xC0,0x0D,0xF4,0x74,0x44,0x4C,0xCF,0xD4,0x44,0x40,0x74,0xD0,0x4F,0x44, - 0xFF,0xF0,0xFD,0x4F,0x20,0xD0,0xCF,0x44,0x4F,0x0D,0xFC,0xFC,0xDF,0x40,0x0D,0x44, - 0x58,0x08,0x84,0x8C,0xCD,0x0C,0x24,0x00,0x84,0xF0,0xC4,0xCF,0x48,0x90,0xF0,0xF4, - 0x00,0xE8,0x00,0x4F,0x04,0x8F,0x88,0x40,0x44,0x0D,0x08,0x07,0x0F,0x0F,0x44,0x17, - 0x10,0x01,0x04,0x40,0x81,0x08,0x04,0x70,0xC7,0x77,0x41,0x75,0x54,0x00,0x70,0xF7, - 0x08,0x84,0x7F,0xFF,0xF0,0xF4,0x0F,0x40,0x04,0xD4,0x40,0x0F,0x4D,0xFD,0xFC,0x80, - 0xF8,0x4F,0xF8,0x00,0xF0,0xCF,0xD1,0xFC,0xF0,0x0F,0x4F,0x80,0x04,0x0F,0x88,0x88, - 0x88,0xFC,0x4F,0x49,0x44,0x04,0xCF,0x4F,0x44,0x00,0x44,0xCD,0xD4,0xFF,0xDC,0x0F, - 
0xF4,0x40,0xFF,0x84,0x88,0x88,0xC8,0xD0,0xFC,0xCC,0x4F,0x08,0x4F,0x4C,0xFD,0x04, - 0x44,0x0F,0x70,0x3F,0x97,0x0F,0x80,0x88,0x88,0x88,0x48,0x0C,0x4E,0x7B,0x40,0x4C, - 0x40,0x74,0x44,0x40,0x70,0x60,0x4C,0xF0,0x94,0x18,0x00,0x08,0x78,0x04,0x40,0x0F, - 0x00,0x40,0x04,0x4D,0x08,0xFC,0x4C,0x47,0xFF,0xD4,0xFF,0x00,0x80,0x48,0x04,0x14, - 0x4C,0x04,0xF4,0xC7,0xC0,0x70,0x80,0x4D,0xFF,0x44,0x0D,0x44,0xF7,0x8F,0x88,0xF8, - 0x0F,0x5D,0x00,0x44,0x4F,0x4F,0x44,0xF0,0x80,0x10,0x50,0x40,0x4C,0xC7,0xFF,0x44, - 0x07,0xF4,0x01,0x4C,0x5F,0x48,0x04,0x04,0x45,0xF4,0x74,0x40,0x41,0xCF,0x74,0x77, - 0x8F,0xCF,0x44,0x4F,0x51,0x0C,0x7C,0x40,0x54,0x14,0x40,0x40,0x57,0x45,0x40,0x74, - 0x45,0xF4,0x70,0xF4,0x47,0x0F,0xFC,0x04,0x00,0xF4,0x0F,0x40,0x44,0xF7,0x0F,0x4F, - 0xDC,0x0D,0xFC,0x78,0x57,0x47,0xFF,0x01,0x4D,0x0F,0x4C,0x15,0x01,0xF0,0x4F,0xFF, - 0xE4,0x8C,0x40,0x00,0x1C,0xFF,0x8F,0xC1,0x88,0xD4,0xFC,0xFF,0xFD,0x84,0x44,0x48, - 0x07,0x04,0x6C,0xF0,0xC0,0xC0,0x00,0xC4,0x44,0x44,0x40,0x44,0x7D,0x4D,0x14,0xD4, - 0x40,0x41,0x07,0x07,0x4D,0x51,0x44,0xFF,0x78,0x8F,0x4C,0x53,0x87,0x08,0xC4,0xCF, - 0x04,0x48,0x4C,0x00,0x08,0xFC,0xF0,0xCE,0xC4,0x4F,0x04,0x80,0xFC,0x46,0xF8,0xF4, - 0xC4,0x44,0xFC,0x4B,0xEC,0xD8,0xFF,0xFE,0xFC,0xF0,0x4C,0xE4,0xFF,0x74,0xF6,0x2C, - 0x84,0x44,0x18,0x80,0x08,0x88,0x88,0x88,0xF0,0xDF,0xEF,0xF4,0x58,0xF0,0x0D,0xFF, - 0xFC,0xD0,0xFC,0x7F,0x04,0xD4,0x44,0xC0,0xF4,0x54,0x4D,0x84,0xD0,0xD5,0x20,0x06, - 0xE4,0xFF,0xFF,0xCD,0xFC,0x00,0x4C,0x89,0x80,0x00,0x88,0xF5,0x41,0xC8,0x0D,0x04, - 0x04,0x14,0x44,0xC4,0x4F,0xF4,0xCC,0xF0,0xF0,0xC5,0xFF,0x4F,0xFF,0x4C,0x0C,0x40, - 0x4F,0xC4,0x54,0xE8,0xF4,0x47,0x44,0xF4,0xED,0x4F,0x0C,0x04,0x01,0x88,0x08,0x88, - 0x88,0x80,0x88,0x88,0x88,0x88,0x88,0x88,0x00,0x70,0x54,0xF4,0x0C,0xF4,0xCD,0x44, - 0x40,0x40,0x90,0x5C,0xF4,0x47,0x44,0x4F,0x04,0xDC,0x44,0x47,0x4E,0xC5,0x48,0x1F, - 0x0D,0x47,0x44,0x44,0xED,0xFF,0x00,0x44,0x9C,0x88,0x18,0x88,0x88,0x08,0xF8,0x40, - 0xFC,0x44,0xF4,0xC4,0x64,0x4F,0x0F,0xC5,0x40,0x30,0x4C,0x4F,0x4D,0x40,0xCF,0xD4, - 
0xCD,0x44,0xC4,0x40,0xF4,0x7C,0x04,0x74,0xFF,0x4F,0x47,0x14,0xEC,0x54,0xF4,0x44, - 0xC0,0xF0,0x0F,0x5C,0x40,0xD6,0xFF,0xCF,0xC4,0x54,0x00,0x00,0x08,0x40,0x88,0x00, - 0x01,0x41,0x88,0x88,0x78,0x41,0x47,0x40,0xF4,0x44,0x54,0x44,0x4F,0x44,0xFD,0x40, - 0x04,0x44,0x44,0x4D,0x70,0x40,0x44,0x05,0xC4,0x04,0x04,0xF4,0xC0,0xFC,0x48,0x44, - 0xF4,0xF1,0x40,0x47,0xD4,0xF0,0xDF,0x75,0x94,0xC0,0xF4,0x5C,0x5C,0x0F,0x40,0x04, - 0x00,0x0D,0x01,0x84,0x88,0x90,0x48,0x4C,0x47,0x00,0x07,0xC0,0xF4,0x44,0x4F,0x54, - 0xF4,0x04,0x04,0x40,0x44,0x44,0x14,0xC4,0x44,0x0F,0x0C,0x04,0xDF,0x7C,0x87,0xF0, - 0xD6,0x06,0x4F,0x47,0xF0,0x7D,0xF4,0x44,0xFD,0xC4,0x04,0x4D,0x10,0x43,0x80,0x80, - 0x08,0x80,0x80,0x48,0x44,0x44,0x0F,0x47,0x4C,0x44,0x07,0x44,0x40,0x44,0x4F,0xD4, - 0x05,0x04,0x00,0x45,0x44,0x44,0x4C,0x44,0xF0,0xF4,0x57,0xF4,0x47,0x0F,0x44,0xD6, - 0xF4,0x60,0x64,0x45,0x44,0xC4,0x7F,0x64,0x44,0x00,0x10,0x80,0x88,0xC8,0xF8,0x24, - 0x75,0xF0,0x44,0x44,0xFD,0x1D,0x40,0x07,0x40,0x60,0x54,0x5C,0xE5,0x41,0x40,0x0C, - 0x44,0x74,0x4C,0x44,0x7F,0x47,0x74,0x04,0x40,0x08,0x48,0x54,0xC4,0xCC,0x04,0x0D, - 0x45,0x44,0xCF,0xF4,0x45,0x45,0x44,0xE4,0x75,0x00,0x47,0x14,0x54,0xF4,0x4D,0x44, - 0x44,0x44,0x0D,0x47,0x7F,0x1F,0x01,0xF9,0x4E,0x40,0x44,0xC4,0xD4,0x07,0x40,0x44, - 0xB4,0x44,0x44,0x44,0xF4,0x40,0x70,0xF6,0x4D,0x84,0xF0,0x40,0x44,0x04,0x7F,0x44, - 0x40,0x44,0x7F,0x04,0xF4,0x47,0x44,0x40,0x44,0x04,0x00,0x74,0x54,0xF7,0x40,0xE5, - 0x44,0x03,0xC4,0x77,0x40,0x57,0x04,0x00,0xC0,0x10,0x00,0x7B,0x44,0x47,0x04,0x44, - 0x44,0x45,0x44,0xC4,0xC0,0x04,0x74,0x14,0x41,0x44,0x44,0x04,0x4C,0x44,0x0C,0x74, - 0x04,0x40,0x00,0x44,0x04,0x40,0xD4,0x8F,0x8F,0x44,0x7F,0x04,0x40,0x69,0x07,0x7F, - 0x40,0x4C,0x44,0x70,0x44,0xD4,0x48,0x4C,0xD4,0x40,0x04,0x40,0xF0,0xDC,0x8C,0x88, - 0x88,0x08,0x40,0x44,0x00,0xDF,0xCF,0x04,0x44,0xF0,0xD4,0x44,0x44,0x04,0xCC,0x04, - 0x40,0xF4,0x4F,0xC0,0xFC,0x1D,0x08,0xD0,0xCD,0x40,0xCC,0x4D,0x44,0x4C,0x44,0x44, - 0x4C,0x44,0x4D,0xFD,0xD4,0xF8,0x14,0x89,0xCC,0x40,0x0F,0x44,0xFC,0xF4,0x44,0xFD, - 
0x00,0x44,0x0F,0x04,0xD4,0x44,0xF5,0x19,0x88,0x88,0x48,0x44,0xDD,0x54,0x00,0x0F, - 0x4F,0xFF,0x44,0xC5,0xDF,0x4D,0xD4,0x04,0x50,0xCD,0x80,0x80,0x4F,0x4F,0x0F,0x0C, - 0xC4,0xCC,0xD4,0x44,0xD4,0x04,0x44,0x0C,0xC4,0x44,0x4D,0xD0,0x04,0xD4,0x04,0xDC, - 0x44,0x44,0x00,0x84,0x98,0x10,0x4D,0x4F,0x44,0xD4,0x74,0xF4,0xD4,0x44,0x44,0x74, - 0x4D,0xD4,0xCC,0x47,0x47,0x45,0x44,0x4D,0x08,0x40,0x4F,0x4F,0x47,0xC4,0x88,0x18, - 0x40,0xCF,0xC4,0x40,0x4C,0x80,0xF0,0x40,0x44,0xC4,0x04,0xC4,0x04,0x44,0x44,0xF4, - 0x00,0x47,0x04,0x00,0x08,0xD4,0xCC,0xD4,0x04,0xF4,0xC4,0x44,0x44,0x5F,0xD4,0x44, - 0x0D,0x44,0x47,0x1D,0x44,0xDC,0x04,0x00,0xC1,0xCD,0x04,0x40,0x44,0xC0,0x44,0x44, - 0x4F,0x44,0xC4,0x44,0x04,0x4C,0x46,0x78,0x00,0x47,0x04,0x55,0xF0,0x74,0x04,0x4F, - 0x44,0xC4,0x70,0x04,0x44,0x4C,0x54,0x11,0x54,0xC0,0xD4,0x4D,0x40,0xC0,0x44,0x60, - 0xD4,0x7F,0x44,0x14,0x44,0x44,0x04,0x44,0x74,0x4C,0x40,0x47,0xD4,0x70,0x4C,0x44, - 0x4F,0x41,0x8D,0x03,0x04,0x44,0xFF,0x04,0x00,0x77,0x48,0xF4,0xF0,0x74,0x70,0x01, - 0xF0,0xFF,0x88,0x40,0xCF,0xF4,0x48,0x48,0x4F,0x0F,0x54,0xF4,0x40,0x00,0xD4,0xFF, - 0xF4,0x4F,0x88,0x00,0x4F,0xF0,0xF4,0x04,0x40,0x4F,0x40,0x7F,0x54,0x44,0x08,0x47, - 0x40,0x0D,0x0D,0x40,0x04,0xF4,0x84,0x01,0xD1,0x04,0x13,0xD4,0x44,0x04,0x4C,0x14, - 0xF4,0x1F,0x91,0x08,0xC0,0x44,0x44,0x3C,0x04,0x0C,0xFF,0x44,0xFD,0x44,0x77,0xF4, - 0x70,0x88,0x88,0x40,0x0F,0x40,0x04,0x44,0xCC,0xF5,0x2F,0x44,0xDF,0x44,0x7D,0x0D, - 0x40,0x0D,0x41,0x74,0x40,0x4D,0x05,0x04,0x4F,0xFF,0x44,0x44,0xE6,0x8C,0x10,0x45, - 0xC0,0x44,0xF0,0x44,0x40,0xDF,0x44,0x44,0x4C,0x44,0x48,0x4F,0x07,0x70,0x44,0x04, - 0xF0,0x14,0x0F,0x04,0x40,0x40,0x15,0x40,0x44,0x44,0x00,0x45,0x44,0x44,0x44,0xFD, - 0x45,0x44,0x07,0x0D,0x40,0x47,0x54,0x54,0x40,0x47,0x05,0x14,0x40,0xC4,0x04,0xF5, - 0xF0,0xF4,0x40,0x3F,0x05,0x71,0x00,0x70,0x00,0x70,0x40,0x57,0x05,0x41,0x44,0x44, - 0x70,0x44,0x04,0x15,0x30,0x07,0x45,0x54,0x00,0x17,0x00,0x44,0x40,0x07,0x70,0x54, - 0x87,0x08,0x88,0x88,0x88,0x88,0x08,0x88,0x88,0x4F,0x44,0x0C,0x40,0x4D,0x5D,0x48, - 
0x44,0xF4,0xC4,0x1D,0x44,0xC0,0x44,0x80,0x44,0xC4,0x07,0x44,0x07,0x44,0x04,0x44, - 0x0F,0x77,0x04,0xC0,0x07,0x07,0x07,0x47,0x45,0x74,0x40,0x77,0x07,0x70,0x70,0x74, - 0x00,0x74,0x70,0x40,0x00,0x07,0x74,0x00,0x74,0x10,0x07,0x07,0x40,0x77,0x05,0x4C, - 0x54,0x00,0x44,0x05,0x0D,0x04,0x44,0x44,0x57,0x05,0x77,0x44,0x44,0x70,0x00,0x07, - 0x47,0x77,0x00,0x07,0x74,0x50,0x75,0x07,0x77,0x77,0x64,0x44,0xE0,0x44,0x05,0x40, - 0x40,0x45,0x75,0x74,0x90,0x48,0x77,0x40,0x50,0xF4,0x70,0x04,0x47,0x05,0x00,0x07, - 0x77,0x70,0x77,0x47,0x07,0x46,0x11,0x00,0x07,0x07,0x00,0x04,0x74,0x44,0x47,0x76, - 0x04,0x47,0x47,0x74,0x44,0x70,0x71,0x70,0x40,0x47,0x44,0x47,0x44,0x71,0x47,0x45, - 0x75,0x05,0x05,0x77,0x04,0x72,0x70,0x47,0x74,0x54,0x44,0x76,0x07,0x47,0x04,0x47, - 0x77,0x07,0x47,0xD4,0x04,0x44,0x41,0x47,0x47,0x44,0x47,0x77,0x74,0x77,0x74,0x04, - 0x17,0x44,0x44,0x4D,0x07,0x74,0x07,0x44,0x44,0x70,0x40,0x44,0x70,0x44,0x44,0x05, - 0x74,0x04,0x40,0x44,0x74,0x44,0x01,0x77,0x44,0x44,0x44,0x45,0x47,0x17,0x47,0x50, - 0x04,0x40,0x40,0x4F,0x04,0x54,0x4F,0x70,0x17,0x01,0x07,0x70,0x44,0x44,0x70,0x44, - 0x07,0x14,0x44,0x04,0x04,0x47,0x05,0x05,0x03,0x77,0x44,0x07,0x54,0x07,0x04,0x44, - 0x88,0x88,0x88,0x88,0x88,0x88,0x80,0x88,0x08,0x88,0x88,0x88,0x88,0x08,0x88,0x88, - 0x88,0x88,0x88,0x08,0x88,0x88,0x88,0x88,0x80,0x88,0x88,0x88,0x88,0x88,0x88,0x88, - 0x88,0x88,0x88,0x88,0x88,0x80,0x88,0x88,0x88,0x88,0x88,0x88,0x88,0x88,0x88,0x88, - 0x88,0x88,0x88,0x88,0x08,0x88,0x88,0x88,0x88,0x88,0x88,0x88,0x88,0x88,0x88,0x80, - 0x88,0x88,0x88,0x88,0x88,0x88,0x88,0x88,0x88,0x88,0x88,0xF8,0x40,0x41,0x44,0x74, - 0xF0,0x44,0x40,0xCF,0xC7,0x47,0x4F,0x45,0x05,0x00,0xF4,0x44,0x00,0x0F,0x44,0x40, - 0xF0,0x0D,0x44,0x04,0x44,0xFF,0x45,0x40,0x44,0xC4,0x40,0x44,0xFD,0x4F,0x41,0x00, - 0x44,0x0F,0xD4,0x04,0xD0,0x0F,0x5F,0x41,0x44,0x44,0x4D,0x40,0x4D,0x04,0x74,0x07, - 0x77,0x47,0x44,0x70,0x77,0x77,0x17,0x71,0x07,0x75,0x47,0x77,0x47,0x74,0x77,0x74, - 0x77,0x77,0x47,0x70,0x07,0x07,0x14,0x41,0x70,0x74,0x40,0x40,0x40,0x17,0x47,0x47, - 
0x47,0x77,0x47,0x47,0x04,0x07,0x74,0x04,0x44,0x40,0x46,0x00,0x44,0x75,0x77,0x04, - 0x04,0x04,0x77,0x74,0x47,0x17,0x70,0x50,0x05,0x70,0x45,0x47,0x40,0x40,0x80,0x88, - 0x80,0x88,0x88,0x88,0x88,0x88,0x88,0x88,0x88,0x88,0x88,0x88,0x88,0x88,0x88,0x88, - 0x88,0x88,0x88,0x88,0x88,0x88,0x88,0x88,0x08,0x80,0x88,0x08,0x88,0x88,0x88,0x08, - 0x88,0x88,0x0F,0xDF,0x44,0xF0,0xD4,0x44,0x1F,0xF4,0x8F,0xFC,0x44,0x40,0x40,0x00, - 0xD4,0x00,0xFC,0x00,0x40,0x8F,0x44,0x44,0xC4,0x44,0x4C,0x04,0x70,0x40,0x04,0xC0, - 0x44,0xF0,0x40,0x40,0x07,0x44,0x44,0x44,0x80,0xF4,0xCC,0x44,0x48,0x0F,0x0C,0xCF, - 0x40,0x85,0x48,0x4E,0x04,0xF0,0x4F,0xFC,0xC4,0x40,0x40,0xCD,0x44,0xFD,0xF4,0xD8, - 0x74,0xF4,0x0C,0x44,0x4F,0xDD,0x0C,0xF4,0x44,0xF4,0x94,0x80,0x88,0x8C,0xC5,0x54, - 0x04,0x44,0xC4,0x44,0xD1,0x0F,0x48,0xF0,0x47,0x44,0x4C,0x44,0x44,0x44,0xD4,0xDF, - 0x44,0xCC,0x44,0x44,0xC0,0x4B,0x08,0x8C,0xC7,0x40,0xF5,0x04,0xC4,0x08,0xC4,0x44, - 0xCC,0x0F,0x4F,0xF0,0xFF,0xCF,0x45,0x04,0x85,0x48,0x54,0x44,0xF0,0x44,0x44,0x74, - 0x47,0x54,0x45,0x4C,0xC0,0x04,0xCC,0xC0,0x08,0x4D,0x0F,0x0F,0x04,0x44,0x0D,0x80, - 0xF0,0x04,0xD5,0xF4,0x44,0x57,0x74,0x84,0x5C,0x54,0x4D,0x44,0x14,0x04,0x48,0x4C, - 0x54,0x40,0x44,0x04,0x44,0xF7,0x0F,0x90,0x11,0x0C,0x00,0x00,0x00,0x0C,0x40,0x01, - 0x06,0x04,0x10,0x01,0x41,0x77,0x77,0x48,0x40,0x47,0x04,0x40,0x04,0x50,0x44,0x74, - 0x44,0x11,0x40,0x44,0x44,0x70,0x04,0x44,0x40,0x00,0x44,0x44,0x47,0x74,0x15,0x47, - 0x44,0x74,0x70,0x44,0x74,0x45,0x41,0x04,0x44,0x77,0x77,0x44,0x14,0x74,0x77,0x76, - 0x04,0x40,0x44,0x27,0x70,0x07,0x04,0x74,0x00,0x74,0x44,0x44,0x77,0x70,0x00,0x77, - 0x04,0x45,0x77,0x45,0x74,0x40,0x71,0x47,0x44,0x44,0x44,0x54,0x44,0x44,0x61,0x74, - 0x54,0x57,0x05,0x88,0x88,0x80,0x88,0x88,0x88,0x88,0x88,0x88,0x88,0x88,0x88,0x88, - 0x80,0x88,0x88,0x88,0x88,0x88,0x80,0x88,0x88,0x80,0x88,0x88,0x88,0xF8,0x0F,0xD9, - 0x00,0xF0,0x00,0x16,0x8F,0x80,0x70,0x75,0xFF,0x07,0x04,0x18,0x80,0x11,0x81,0x58, - 0x80,0x0F,0xFD,0x84,0x48,0x40,0x40,0x0F,0xF8,0xC4,0xCF,0x44,0x88,0x81,0x88,0x88, - 
0x44,0x4D,0xDC,0x0F,0x9C,0xFF,0xF0,0x1C,0x0F,0x82,0x45,0xF4,0xBD,0x45,0xF4,0x44, - 0xFF,0xFE,0xFC,0x0F,0x82,0xF8,0xF4,0xF1,0xFF,0x18,0x7F,0xFD,0x70,0xCF,0xF4,0xFF, - 0xFF,0x7F,0x04,0x18,0x00,0x00,0xC0,0xCF,0x70,0x07,0xF4,0x0D,0x1F,0x80,0x4F,0x4F, - 0xF0,0x0F,0x1C,0xF0,0x50,0x77,0xF0,0xD7,0xFF,0xFD,0x77,0x81,0x7D,0xC0,0x67,0x07, - 0x37,0xFC,0x90,0x40,0x7D,0x40,0xF0,0x7F,0x04,0x47,0xFC,0x75,0x47,0x47,0xF7,0xF4, - 0x7F,0xFF,0x47,0x74,0x1E,0xC7,0x40,0x70,0xF4,0x80,0xE4,0xC0,0xE4,0xC0,0x80,0x44, - 0xC4,0xFE,0x40,0x4F,0x03,0x0F,0x08,0xF8,0xFE,0xC4,0xFC,0x0C,0x8F,0x88,0x40,0x4C, - 0xF0,0x40,0xC8,0xC0,0x04,0x4F,0x00,0x8D,0x88,0x80,0x44,0xC4,0x44,0xD4,0xCC,0x42, - 0xF4,0x4D,0x45,0x88,0x4F,0xC4,0xF4,0xC0,0x44,0x44,0x7C,0x10,0x48,0x44,0xF4,0x4C, - 0x44,0x4F,0x4C,0x44,0x44,0x40,0x40,0x44,0x44,0x07,0x24,0x44,0xF4,0x44,0x44,0x4C, - 0x44,0xCC,0x04,0x64,0x44,0x44,0x74,0xC4,0xC5,0x47,0x44,0x04,0xC4,0x44,0x00,0x44, - 0x44,0xC0,0x44,0x4C,0xF4,0xFF,0xFF,0xCF,0x0C,0x4F,0x41,0xC5,0x05,0x0C,0x80,0x48, - 0xC4,0xDF,0xDC,0x00,0xF4,0x0F,0x0F,0xCC,0x88,0x5D,0xFC,0xF8,0xCF,0x04,0x80,0x88, - 0x44,0x45,0xC4,0xF4,0xF0,0xF4,0xF8,0x40,0xCD,0x4F,0x00,0x10,0x40,0xCC,0x47,0x44, - 0x44,0xCD,0x41,0x40,0x04,0x7D,0xC6,0xFE,0x64,0x44,0xFF,0x40,0x41,0x0D,0x44,0x04, - 0x57,0x44,0x40,0xF5,0xF1,0x78,0xFF,0xFF,0xF7,0x40,0x44,0x61,0x47,0x14,0x7F,0x10, - 0x10,0x74,0x04,0x75,0x44,0x46,0x44,0x04,0x40,0x40,0x74,0x41,0x44,0x00,0x41,0x50, - 0x44,0x44,0x44,0x44,0x40,0x04,0x74,0x41,0x76,0x06,0x55,0x64,0x40,0x04,0x04,0x07, - 0x00,0x40,0x44,0x44,0x10,0x00,0x41,0x04,0x44,0x44,0x07,0x54,0x44,0x06,0x40,0x74, - 0x06,0x00,0x71,0x04,0x75,0x54,0x44,0x44,0x45,0x44,0x04,0x74,0x00,0x75,0x00,0x07, - 0x44,0x43,0x67,0x47,0x00,0x00,0x44,0x40,0x14,0x04,0x08,0x04,0x44,0x44,0x44,0x45, - 0x07,0x74,0x70,0x44,0x04,0x44,0x44,0x0C,0x70,0x70,0x54,0x47,0x47,0x55,0x47,0x00, - 0x44,0x44,0x44,0x44,0x44,0x44,0x14,0x08,0x00,0x64,0x44,0x76,0x30,0x40,0x04,0x00, - 0x44,0x44,0x44,0x44,0x4C,0x44,0x46,0x74,0x44,0x47,0x40,0x40,0x44,0x00,0x40,0x44, - 
0x44,0x00,0x07,0x44,0x54,0x07,0x10,0x44,0x44,0x11,0x00,0x40,0x47,0x45,0x07,0x44, - 0x44,0x04,0x06,0x05,0x44,0x40,0x44,0x54,0x07,0x44,0x44,0x04,0x57,0x47,0x40,0x66, - 0x67,0x57,0x46,0x47,0x47,0x74,0x41,0x77,0x00,0x41,0x54,0x04,0x44,0x11,0x04,0x08, - 0x00,0x00,0x01,0x44,0x42,0x76,0x74,0x44,0x44,0x04,0x47,0x45,0x44,0x74,0x05,0x04, - 0x45,0x40,0x44,0x40,0x00,0x0C,0x45,0x03,0x44,0x00,0x70,0x44,0x40,0x44,0x07,0x07, - 0x04,0x44,0x00,0x00,0x44,0x04,0x47,0xC4,0x40,0x04,0x47,0x57,0x44,0x47,0x40,0x44, - 0x40,0x64,0x44,0x74,0x44,0x04,0x37,0x45,0x47,0x44,0x44,0x40,0x10,0x00,0x00,0x00, - 0x04,0x74,0x04,0x40,0x05,0x0C,0x44,0x44,0x75,0x04,0x44,0x5D,0x45,0x04,0x54,0x46, - 0x70,0x44,0x17,0x44,0x05,0x00,0x04,0x04,0x00,0x00,0x40,0x46,0x44,0x44,0x44,0x44, - 0x04,0x50,0x02,0x54,0x40,0x44,0x44,0x44,0x45,0x54,0x45,0x04,0x57,0x01,0x00,0x00, - 0x14,0x00,0x20,0x00,0x44,0x74,0x04,0x04,0x44,0x44,0x70,0x45,0x47,0x45,0x44,0x48, - 0x14,0x00,0x47,0x44,0x44,0x44,0x04,0x40,0x74,0x17,0x40,0x40,0x00,0x31,0x40,0x05, - 0x05,0x45,0x04,0x00,0x44,0xC5,0x40,0x44,0x45,0x44,0x54,0x54,0x04,0x00,0x75,0x75, - 0x54,0x44,0x80,0x88,0x88,0x88,0x88,0x88,0x08,0x88,0x88,0x80,0x80,0x88,0x88,0x88, - 0x88,0x88,0x88,0x88,0x88,0x88,0x88,0x88,0x88,0x88,0x88,0x88,0x88,0x88,0x88,0x88, - 0x88,0x88,0x88,0x08,0x88,0x88,0x88,0x08,0x88,0x08,0x80,0x88,0x88,0x80,0x88,0x88, - 0x88,0x88,0x88,0x80,0x88,0x88,0x88,0x88,0x88,0x88,0x88,0x88,0x88,0x08,0x88,0x88, - 0x88,0x88,0x88,0x88,0x88,0x88,0x88,0x88,0x88,0x88,0x88,0x88,0x88,0x88,0x80,0x88, - 0x80,0x88,0x88,0x08,0x88,0x88,0x88,0x88,0x88,0x08,0x88,0x88,0x88,0x88,0x08,0x88, - 0x88,0x08,0x88,0x88,0x80,0x08,0x88,0x88,0x88,0x88,0x08,0x88,0x88,0x80,0x88,0x88, - 0x80,0x88,0x88,0x88,0x88,0x88,0x88,0x80,0x88,0x88,0x00,0x78,0x00,0x44,0x44,0x80, - 0x07,0x75,0x00,0x14,0x74,0x71,0x44,0x74,0x74,0x76,0x07,0x01,0x17,0x40,0x04,0x44, - 0x41,0x71,0x77,0x10,0x47,0x40,0x74,0x04,0x64,0x01,0x40,0x44,0x50,0x74,0x47,0x45, - 0x04,0x50,0x40,0x74,0x44,0x47,0x55,0x00,0x44,0x44,0x77,0x07,0x01,0x44,0x07,0x44, - 
0x74,0x47,0x54,0x00,0x88,0x88,0x80,0x88,0x88,0x88,0x88,0x88,0x88,0x88,0x88,0x08, - 0x88,0x88,0x88,0x08,0x88,0x88,0x88,0x88,0x88,0x08,0x88,0x88,0x80,0x08,0x8F,0x84, - 0xF4,0x4C,0x04,0x00,0x05,0x0F,0x44,0x5F,0xC4,0x8F,0x88,0x08,0x40,0xF4,0xCC,0xF0, - 0x0F,0x4F,0x8F,0x88,0x88,0xF4,0xFF,0x54,0x4F,0x40,0x8C,0x00,0x04,0xF0,0x77,0xF7, - 0xC0,0x7F,0x1F,0x81,0x88,0x4F,0x4D,0x40,0x47,0x7D,0xFC,0xFF,0x07,0x01,0x74,0x04, - 0x04,0x40,0xF4,0x4F,0x4D,0xF7,0xF0,0x97,0x48,0x44,0x7F,0xD0,0xFF,0x70,0x0F,0x04, - 0x41,0x34,0x04,0xF0,0x47,0x07,0x04,0x04,0x7D,0xC1,0x05,0x39,0xD5,0x70,0x8D,0x48, - 0xFF,0x44,0xFF,0xFF,0xF4,0x74,0xFF,0x8F,0x10,0x4C,0xF4,0x47,0x74,0x74,0x47,0x44, - 0x48,0x77,0x40,0x00,0xEF,0x1F,0x00,0xE0,0x47,0x87,0x40,0xFF,0xF4,0x74,0x40,0x48, - 0x8F,0x04,0x4D,0xFF,0xCD,0x41,0xD0,0xDD,0x74,0xF4,0x00,0x0F,0x74,0x00,0x4F,0x4F, - 0x04,0x44,0x45,0x74,0x44,0x4D,0x84,0x44,0x0F,0x0F,0x40,0x00,0xFD,0x04,0x70,0x4D, - 0x00,0x47,0x07,0x45,0x57,0x40,0x00,0x00,0x20,0x8D,0x00,0x0F,0x94,0xC4,0x07,0x0F, - 0xFD,0x0F,0x81,0x05,0xF5,0x14,0x34,0x04,0x14,0xC4,0x0F,0x6C,0x14,0x47,0x4D,0x44, - 0x54,0x44,0xD5,0x01,0x04,0xF4,0xF0,0x74,0x81,0x08,0x0C,0x40,0x4D,0x04,0x45,0x04, - 0x4F,0xD4,0x44,0x05,0x07,0xF4,0xF4,0x84,0x00,0x58,0x09,0x44,0x40,0x40,0x00,0x40, - 0x40,0x50,0x44,0x47,0x41,0x70,0x44,0x44,0x04,0x70,0x44,0x44,0x44,0x00,0x47,0x44, - 0x40,0x40,0x40,0x88,0x80,0x88,0xD8,0x01,0x44,0xF1,0x90,0x0F,0x44,0x74,0x40,0x74, - 0x76,0x77,0x74,0x47,0x07,0x06,0x47,0x54,0x77,0x77,0x00,0x74,0x07,0x41,0x44,0x04, - 0x54,0x00,0x05,0x44,0x44,0x40,0x71,0x40,0x06,0x04,0x41,0x50,0x67,0x70,0x11,0x00, - 0x40,0x40,0x44,0x07,0x44,0x14,0x77,0x57,0x44,0x44,0x13,0x40,0x47,0x52,0x44,0x47, - 0x04,0x44,0x64,0x70,0x40,0x74,0x00,0x70,0x55,0x54,0x85,0x88,0x88,0x88,0x88,0x88, - 0x88,0x88,0x88,0x88,0x88,0x08,0x88,0x80,0x88,0x80,0x08,0x88,0x08,0x88,0x88,0x88, - 0x88,0x08,0x88,0x88,0x47,0x01,0x44,0x74,0x70,0x44,0x00,0x05,0x04,0x44,0x44,0x44, - 0x44,0x14,0x07,0x25,0x40,0x40,0x04,0x08,0x80,0x88,0x80,0x00,0x88,0x78,0x03,0xF8, - 
0x20,0x47,0x40,0xC0,0x58,0x54,0x70,0x73,0x00,0x05,0x07,0x04,0x40,0x04,0x77,0x07, - 0x04,0x74,0x50,0x40,0x74,0x07,0x87,0x00,0x4F,0x75,0x45,0x44,0x07,0x44,0x10,0x47, - 0x53,0x00,0x45,0x40,0x47,0x44,0x45,0x4D,0x44,0x44,0x40,0x00,0x00,0x04,0x54,0x45, - 0x40,0x41,0x70,0x40,0x74,0x70,0x27,0x04,0x75,0x47,0xDE,0x74,0x44,0x40,0x04,0x44, - 0x40,0x80,0x80,0x80,0x88,0x88,0x88,0x88,0x88,0x08,0x88,0x88,0x00,0x08,0x88,0x80, - 0x88,0x00,0x88,0x88,0x08,0x88,0x80,0x80,0x88,0x88,0x88,0xDF,0xFD,0x00,0x64,0x04, - 0x40,0x40,0xF0,0x44,0x0F,0x40,0x77,0x47,0x64,0x74,0x47,0x00,0x60,0x44,0x41,0x00, - 0x70,0x44,0x11,0x41,0x41,0x40,0x44,0x44,0x77,0x47,0x74,0x44,0x65,0x50,0x74,0x70, - 0x00,0x45,0x44,0x40,0x40,0x04,0x74,0x05,0x74,0x01,0x04,0x44,0x45,0x54,0x44,0x74, - 0x70,0x44,0x54,0x44,0x42,0x44,0x40,0x77,0x40,0x11,0x44,0x04,0x70,0x44,0x44,0x04, - 0x04,0x44,0x44,0x40,0x41,0x74,0x44,0x04,0x47,0x04,0x44,0x76,0x44,0x04,0x40,0x45, - 0x47,0x75,0x74,0x04,0x44,0x04,0x74,0x44,0x04,0x44,0x74,0x74,0x40,0x77,0x00,0x74, - 0x04,0x07,0x75,0x44,0x74,0x57,0x88,0x88,0x88,0x80,0x88,0x88,0x88,0x88,0x88,0x88, - 0x88,0x08,0x88,0x88,0x08,0x88,0x08,0x80,0x88,0x88,0x00,0x88,0x08,0x88,0x88,0x88, - 0x88,0x88,0x88,0x80,0x0F,0x40,0x50,0x40,0xCD,0x40,0x04,0xC8,0x4F,0x48,0x0D,0x44, - 0xCD,0x0C,0x81,0x44,0x00,0x84,0x48,0x50,0xD4,0xF4,0x47,0x04,0x0F,0x00,0x04,0xD1, - 0xC0,0x51,0x20,0x4D,0x00,0xD1,0xD4,0xD7,0x50,0x44,0x01,0x54,0xC0,0xD4,0x44,0x04, - 0x40,0xC0,0x44,0x05,0x0C,0x44,0x44,0x84,0x44,0x84,0x40,0x44,0x45,0x07,0x00,0xD4, - 0x04,0xD5,0x54,0x70,0x55,0x43,0x00,0xD5,0x70,0x4D,0x40,0x40,0x00,0xD4,0x0F,0x44, - 0xF0,0xFF,0xFF,0x84,0x8C,0x44,0xD4,0xF5,0xD0,0x04,0x4F,0x04,0x45,0x47,0x00,0x40, - 0x44,0x00,0x04,0x44,0x04,0x00,0x04,0x70,0x44,0x00,0x45,0x44,0x00,0x44,0x44,0x04, - 0x04,0x14,0x40,0x44,0x04,0x00,0x00,0x03,0x74,0x55,0x40,0x11,0x00,0x44,0x00,0x34, - 0x45,0x04,0x44,0x04,0x05,0x75,0x50,0x47,0x00,0x00,0x41,0x04,0x54,0x00,0x40,0x40, - 0x45,0x40,0x04,0x45,0x74,0x05,0x00,0x10,0x10,0x41,0x45,0x47,0x40,0x54,0x04,0x04, - 
0x54,0x15,0x47,0x44,0x07,0x44,0x04,0x00,0x15,0x01,0x10,0x40,0x04,0x00,0x40,0x00, - 0x00,0x00,0x41,0x45,0x55,0x41,0x71,0x04,0x03,0x57,0x35,0x40,0x00,0x10,0x44,0x00, - 0x50,0x40,0x75,0x00,0x44,0x40,0x54,0x11,0x41,0x47,0x04,0x44,0x50,0x71,0x44,0x45, - 0x44,0x00,0x04,0x35,0x65,0x44,0x44,0x04,0x04,0x05,0x44,0x74,0x44,0x01,0x00,0x44, - 0x05,0x44,0x00,0x54,0x04,0x00,0x40,0x04,0x40,0x40,0x44,0x41,0x45,0x04,0x08,0x80, - 0x80,0x08,0x80,0x88,0x08,0x80,0x80,0x08,0x88,0x08,0x88,0x00,0x00,0x88,0x08,0x88, - 0x88,0x88,0x88,0x88,0x88,0x80,0x80,0x08,0x88,0x88,0x88,0x88,0x08,0x88,0x88,0x00, - 0x00,0x80,0x88,0x88,0x00,0x88,0x88,0x88,0x08,0x80,0x88,0x88,0x88,0x00,0x88,0x88, - 0x00,0x08,0x70,0x74,0x70,0x14,0x41,0x00,0x41,0x74,0x47,0x47,0x40,0x44,0x44,0x40, - 0x04,0x50,0x44,0x55,0x75,0x00,0x00,0x01,0x04,0x05,0x54,0x40,0x44,0x70,0x40,0x54, - 0x04,0x54,0x40,0x07,0x47,0x11,0x41,0x44,0x44,0x40,0x00,0x44,0x04,0x70,0x40,0x55, - 0x54,0x44,0x41,0x01,0x01,0x44,0x04,0x40,0x71,0x44,0x04,0x44,0x54,0x44,0x75,0x41, - 0x77,0x00,0x01,0x40,0x44,0x44,0x07,0x50,0x44,0x47,0x44,0x40,0x44,0x41,0x40,0x40, - 0x44,0x04,0x44,0x54,0x54,0x44,0x04,0x10,0x04,0x04,0x04,0x44,0x44,0x45,0x44,0x44, - 0x44,0x44,0x05,0x44,0x74,0x14,0x44,0x70,0x40,0x45,0x47,0x44,0x45,0x55,0x04,0x44, - 0x50,0x45,0x43,0x41,0x04,0x44,0x44,0x54,0x44,0x54,0x40,0x64,0x54,0x44,0x44,0x44, - 0x40,0x44,0x44,0x05,0x44,0x40,0x54,0x54,0x04,0x47,0x44,0x44,0x77,0x47,0x50,0x44, - 0x44,0x44,0x44,0x44,0x40,0x40,0x40,0x40,0x44,0x44,0x44,0x40,0x40,0x57,0x40,0x87, - 0x88,0x88,0x80,0x08,0x88,0x88,0x88,0x80,0x80,0x88,0x80,0x88,0x88,0x08,0x80,0x88, - 0x80,0x88,0x88,0x88,0x88,0x88,0x08,0x88,0x80,0x00,0x80,0x80,0x08,0x88,0x08,0x08, - 0x00,0x80,0x08,0x88,0x88,0x88,0x88,0x00,0x88,0x80,0x70,0x00,0x71,0x04,0x74,0xF8, - 0x14,0x4C,0x00,0xC4,0x4D,0xD4,0x45,0x04,0x50,0xFF,0x14,0x70,0x40,0x44,0xF4,0xF0, - 0x44,0x00,0x74,0x49,0x50,0x01,0x50,0x04,0x04,0x00,0x6E,0x44,0x19,0xF1,0x85,0x1F, - 0x04,0x64,0x09,0x00,0x84,0x00,0xF5,0xDF,0xE5,0x41,0x4F,0x04,0x1F,0xF4,0xDF,0xC7, - 
0x0D,0x08,0xD4,0xC4,0x87,0x48,0x40,0xD4,0x04,0x44,0x47,0x55,0xD0,0xD4,0x5D,0x48, - 0x44,0x00,0x00,0x74,0x43,0x84,0x80,0x4F,0x0C,0xF4,0x50,0x84,0xC4,0x44,0x04,0x04, - 0x1F,0x4C,0x44,0x00,0x44,0x44,0x4D,0xC4,0x44,0x44,0x44,0xC4,0x84,0xF0,0xC0,0x0D, - 0x44,0x44,0x08,0x44,0x44,0x77,0x44,0x57,0x88,0x07,0x45,0x44,0x44,0x40,0x44,0x74, - 0x75,0x51,0x44,0x75,0x00,0x47,0x07,0x44,0x44,0x05,0x44,0x75,0x44,0x44,0x00,0x84, - 0x08,0x80,0x88,0x88,0x88,0x88,0x78,0x00,0x46,0x04,0x74,0x00,0x84,0x88,0x17,0x80, - 0x0D,0x04,0x04,0x00,0x00,0x00,0x00,0x00, -}; - - -/* - * Pick a suitable Mac encoding value for a Unicode string. - * - * This routine is only used during file creation and renaming. - */ -u_int32_t -hfs_pickencoding(const u_int16_t *src, int len) -{ - u_int32_t guess; - u_int16_t ch; - u_int8_t bits; - u_int8_t cjkstate; - - cjkstate = 0; - guess = kTextEncodingMacRoman; - - while (len--) { - ch = *src++; - if (ch < 0x0080) /* ASCII */ - continue; - - if (ch >= 0x4E00 && ch <= 0x9FAF) { /* CJK */ - bits = cjk_bitmap[(ch - 0x4E00) >> 1]; - if (ch & 1) - bits = bits >> 4; - bits &= 0x0F; - if (bits) { - if (cjkstate) { - bits &= cjkstate; - if (bits) - cjkstate = bits; - } else - cjkstate = bits; - } - continue; - } - if (ch >= 0x3041 && ch <= 0x30FE) { /* Hiragana & Katakana */ - if (cjkstate) { - bits = cjkstate & CJK_KATAKANA; - if (bits) - cjkstate = bits; - } else - cjkstate = CJK_KATAKANA; - continue; - } - if ((ch >= 0x1100 && ch <= 0x11F9) || /* Hangul Jamo */ - (ch >= 0x3131 && ch <= 0x318E) || /* Hangul Compatibility Jamo */ - (ch >= 0xF900 && ch <= 0xFA0B)) { /* CJK Compatibility Ideographs */ - cjk_lastunique = CJK_KOREAN; - return kTextEncodingMacKorean; - } - if (ch >= 0x3105 && ch <= 0x3129) { /* Bopomofo */ - if (cjkstate) { - bits = cjkstate & CJK_CHINESE; - if (bits) - cjkstate = bits; - } else - cjkstate = CJK_CHINESE; - continue; - } - if (ch >= 0xFF01 && ch <= 0xFFE6) { /* Halfwidth and Fullwidth */ - if (cjkstate == 0) - cjkstate = CJK_ALL; - continue; - } 
- if (hfs_islatinbias && ch >= 0x0300 && ch <= 0x0329) { - guess = hfs_encodingbias; - continue; - } - if (ch <= 0x03CE && ch >= 0x0384) { - guess = kTextEncodingMacGreek; - continue; - } - if (ch <= 0x0491 && ch >= 0x0401) { - guess = kTextEncodingMacCyrillic; - continue; - } - if (ch >= 0x05B0 && ch <= 0x05EA) { - return kTextEncodingMacHebrew; - } - if (ch >= 0x060C && ch <= 0x06d5) { - return kTextEncodingMacArabic; - } - if (ch >= 0x0E00 && ch <= 0x0E5B) { - return kTextEncodingMacThai; - } - /* Catch a few Shift-JIS strays */ - if (guess == 0 || guess == kTextEncodingMacUnicode) { - if (ch == 0x2010 || ch == 0x2014 || ch == 0x2015 || ch == 0x2016) { - guess = kTextEncodingMacJapanese; - if ((cjkstate == 0) || (cjkstate & CJK_JAPAN)) - cjkstate = CJK_JAPAN; - else - cjkstate |= CJK_JAPAN; - continue; - } - if ((hfs_encodingbias == kTextEncodingMacJapanese) && - (ch == 0x00A2 || ch == 0x00A3 || ch == 0x00AC)) { - guess = kTextEncodingMacJapanese; - continue; - } - /* TM char depends on the Mac encoding used. 
*/ - if (ch == 0x2122) { - switch(hfs_encodingbias) { - case kTextEncodingMacJapanese: - case kTextEncodingMacChineseTrad: - case kTextEncodingMacKorean: - case kTextEncodingMacGreek: - case kTextEncodingMacThai: - case kTextEncodingMacChineseSimp: - guess = hfs_encodingbias; - break; - } - } - } - if (guess == 0 && ch > 0x2122) { - guess = kTextEncodingMacUnicode; - } - } /* end while */ - - if (cjkstate) { - if (powerof2(cjkstate)) { - cjk_lastunique = cjkstate; - return ((u_int32_t)cjk_encoding[cjkstate]); - } - if (hfs_encodingbias != 0) { - switch(hfs_encodingbias) { - case kTextEncodingMacJapanese: - if (cjkstate & CJK_JAPAN) - return (kTextEncodingMacJapanese); - break; - case kTextEncodingMacKorean: - if (cjkstate & CJK_KOREAN) - return (kTextEncodingMacKorean); - break; - case kTextEncodingMacChineseTrad: - if (cjkstate & CJK_CHINESE_TRAD) - return (kTextEncodingMacChineseTrad); - if (cjkstate & CJK_CHINESE_SIMP) - return (kTextEncodingMacChineseSimp); - break; - case kTextEncodingMacChineseSimp: - if (cjkstate & CJK_CHINESE_SIMP) - return (kTextEncodingMacChineseSimp); - if (cjkstate & CJK_CHINESE_TRAD) - return (kTextEncodingMacChineseTrad); - break; - } - } - if (cjk_lastunique) { - if (cjkstate & cjk_lastunique) - cjkstate = cjk_lastunique; - else - cjk_lastunique = 0; - } - return ((u_int32_t)cjk_encoding[cjkstate]); - } - - return guess; -} - -#else /* HFS standard *NOT* supported */ - -u_int32_t -hfs_pickencoding(__unused const u_int16_t *src, __unused int len) { - /* Just return kTextEncodingMacRoman if HFS standard is not supported. 
*/ - return kTextEncodingMacRoman; -} - -#endif /* CONFIG_HFS_STD */ - - -__private_extern__ -u_int32_t -hfs_getencodingbias(void) -{ - return (hfs_encodingbias); -} - - -__private_extern__ -void -hfs_setencodingbias(u_int32_t bias) -{ - lck_mtx_lock(&encodinglst_mutex); - - hfs_encodingbias = bias; - - switch (bias) { - case kTextEncodingMacRoman: - case kTextEncodingMacCentralEurRoman: - case kTextEncodingMacTurkish: - case kTextEncodingMacCroatian: - case kTextEncodingMacIcelandic: - case kTextEncodingMacRomanian: - hfs_islatinbias = 1; - break; - default: - hfs_islatinbias = 0; - break; - } - - lck_mtx_unlock(&encodinglst_mutex); -} - -#else /* not HFS - temp workaround until 4277828 is fixed */ -/* stubs for exported routines that aren't present when we build kernel without HFS */ - -#include - -u_int32_t hfs_pickencoding(u_int16_t *src, int len); - -u_int32_t hfs_pickencoding(__unused u_int16_t *src, __unused int len) -{ - return(0); -} - -#endif /* HFS */ - diff --git a/bsd/hfs/hfs_encodings.c b/bsd/hfs/hfs_encodings.c deleted file mode 100644 index d4fc65fc6..000000000 --- a/bsd/hfs/hfs_encodings.c +++ /dev/null @@ -1,781 +0,0 @@ -/* - * Copyright (c) 2000-2013 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include -#include - -#if HFS - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "hfs.h" - - -lck_grp_t * encodinglst_lck_grp; -lck_grp_attr_t * encodinglst_lck_grp_attr; -lck_attr_t * encodinglst_lck_attr; - - -/* hfs encoding converter list */ -SLIST_HEAD(encodinglst, hfs_encoding) hfs_encoding_list = {0}; - -lck_mtx_t encodinglst_mutex; - -/* hfs encoding converter entry */ -struct hfs_encoding { - SLIST_ENTRY(hfs_encoding) link; - int refcount; - int kmod_id; - u_int32_t encoding; - hfs_to_unicode_func_t get_unicode_func; - unicode_to_hfs_func_t get_hfsname_func; -}; - -#define MAX_HFS_UNICODE_CHARS (15*5) - -#if CONFIG_HFS_STD -static int unicode_to_mac_roman(UniChar *uni_str, u_int32_t unicodeChars, Str31 hfs_str); -#endif - -void -hfs_converterinit(void) -{ - SLIST_INIT(&hfs_encoding_list); - - encodinglst_lck_grp_attr= lck_grp_attr_alloc_init(); - encodinglst_lck_grp = lck_grp_alloc_init("cnode_hash", encodinglst_lck_grp_attr); - encodinglst_lck_attr = lck_attr_alloc_init(); - - lck_mtx_init(&encodinglst_mutex, encodinglst_lck_grp, encodinglst_lck_attr); - -#if CONFIG_HFS_STD - /* - * add resident MacRoman converter and take a reference - * since its always "loaded". MacRoman is the default converter - * for HFS standard volumes. - * - * Only do this if we are actually supporting HFS standard - * volumes. 
The converter is not used on configurations - * that do not support HFS standard. - */ - hfs_addconverter(0, kTextEncodingMacRoman, mac_roman_to_unicode, unicode_to_mac_roman); - SLIST_FIRST(&hfs_encoding_list)->refcount++; -#endif - -} - -#if !CONFIG_HFS_STD - -/* - * Function stubs are needed for KPI export. - * It is a little swizzly to have two separate copies of the stub functions in this file - * but the prototypes of these functions are different if we're using the real headers - * vs. the dummy prototypes at the end of the file. (hfs_to_unicode_func_t vs. void*) - * - * As a result, we need our own copies in the no-HFS-Standard configuration - */ -int hfs_addconverter( __unused int id, - __unused u_int32_t encoding, - __unused hfs_to_unicode_func_t get_unicode, - __unused unicode_to_hfs_func_t get_hfsname ) -{ - return(0); -} - -int hfs_getconverter( __unused u_int32_t encoding, - __unused hfs_to_unicode_func_t *get_unicode, - __unused unicode_to_hfs_func_t *get_hfsname) -{ - return(EINVAL); -} - -int hfs_relconverter(__unused u_int32_t encoding) -{ - return(EINVAL); -} - -int hfs_remconverter(__unused int id, __unused u_int32_t encoding) -{ - return(0); -} - -#else - -/* - * For configurations that do support HFS standard, we need all of these.. - */ - -/* - * hfs_addconverter - add an HFS encoding converter - * - * This is called exclusivly by kernel loadable modules - * (like HFS_Japanese.kmod) to register hfs encoding - * conversion routines. 
- * - */ -int -hfs_addconverter(int id, u_int32_t encoding, hfs_to_unicode_func_t get_unicode, unicode_to_hfs_func_t get_hfsname) -{ - struct hfs_encoding *encp; - - MALLOC(encp, struct hfs_encoding *, sizeof(struct hfs_encoding), M_TEMP, M_WAITOK); - - lck_mtx_lock(&encodinglst_mutex); - - encp->link.sle_next = NULL; - encp->refcount = 0; - encp->encoding = encoding; - encp->get_unicode_func = get_unicode; - encp->get_hfsname_func = get_hfsname; - encp->kmod_id = id; - SLIST_INSERT_HEAD(&hfs_encoding_list, encp, link); - - lck_mtx_unlock(&encodinglst_mutex); - return (0); -} - - -/* - * hfs_remconverter - remove an HFS encoding converter - * - * Can be called by a kernel loadable module's finalize - * routine to remove an encoding converter so that the - * module (i.e. the code) can be unloaded. - * - * However, in the normal case, the removing and unloading - * of these converters is done in hfs_relconverter. - * The call is initiated from within the kernel during the unmounting of an hfs voulume. - */ -int -hfs_remconverter(int id, u_int32_t encoding) -{ - struct hfs_encoding *encp; - - lck_mtx_lock(&encodinglst_mutex); - SLIST_FOREACH(encp, &hfs_encoding_list, link) { - if (encp->encoding == encoding && encp->kmod_id == id) { - encp->refcount--; - - /* if converter is no longer in use, release it */ - if (encp->refcount <= 0 && encp->kmod_id != 0) { - SLIST_REMOVE(&hfs_encoding_list, encp, hfs_encoding, link); - lck_mtx_unlock(&encodinglst_mutex); - FREE(encp, M_TEMP); - return (0); - } else { - lck_mtx_unlock(&encodinglst_mutex); - return (1); /* busy */ - } - break; - } - } - lck_mtx_unlock(&encodinglst_mutex); - - return (0); -} - - -/* - * hfs_getconverter - get HFS encoding converters - * - * Normally called during the mounting of an hfs voulume. 
- */ -int -hfs_getconverter(u_int32_t encoding, hfs_to_unicode_func_t *get_unicode, unicode_to_hfs_func_t *get_hfsname) -{ - struct hfs_encoding *encp; - int found = 0; - - lck_mtx_lock(&encodinglst_mutex); - SLIST_FOREACH(encp, &hfs_encoding_list, link) { - if (encp->encoding == encoding) { - found = 1; - *get_unicode = encp->get_unicode_func; - *get_hfsname = encp->get_hfsname_func; - ++encp->refcount; - break; - } - } - lck_mtx_unlock(&encodinglst_mutex); - - if (!found) { - *get_unicode = NULL; - *get_hfsname = NULL; - return (EINVAL); - } - - return (0); -} - - -/* - * hfs_relconverter - release interest in an HFS encoding converter - * - * Normally called during the unmounting of an hfs voulume. - */ -int -hfs_relconverter(u_int32_t encoding) -{ - struct hfs_encoding *encp; - - lck_mtx_lock(&encodinglst_mutex); - SLIST_FOREACH(encp, &hfs_encoding_list, link) { - if (encp->encoding == encoding) { - encp->refcount--; - - /* if converter is no longer in use, release it */ - if (encp->refcount <= 0 && encp->kmod_id != 0) { - uint32_t loadTag = (uint32_t)encp->kmod_id; - - SLIST_REMOVE(&hfs_encoding_list, encp, hfs_encoding, link); - lck_mtx_unlock(&encodinglst_mutex); - - FREE(encp, M_TEMP); - (void)OSKextUnloadKextWithLoadTag(loadTag); - return (0); - } - lck_mtx_unlock(&encodinglst_mutex); - return (0); - } - } - lck_mtx_unlock(&encodinglst_mutex); - - return (EINVAL); -} - -/* - * Convert HFS encoded string into UTF-8 - * - * Unicode output is fully decomposed - * '/' chars are converted to ':' - */ -int -hfs_to_utf8(ExtendedVCB *vcb, const Str31 hfs_str, ByteCount maxDstLen, ByteCount *actualDstLen, unsigned char* dstStr) -{ - int error; - UniChar uniStr[MAX_HFS_UNICODE_CHARS]; - ItemCount uniCount; - size_t utf8len; - hfs_to_unicode_func_t hfs_get_unicode = VCBTOHFS(vcb)->hfs_get_unicode; - u_int8_t pascal_length = 0; - - /* - * Validate the length of the Pascal-style string before passing it - * down to the decoding engine. 
- */ - pascal_length = *((const u_int8_t*)(hfs_str)); - if (pascal_length > 31) { - /* invalid string; longer than 31 bytes */ - error = EINVAL; - return error; - } - - error = hfs_get_unicode(hfs_str, uniStr, MAX_HFS_UNICODE_CHARS, &uniCount); - - if (uniCount == 0) - error = EINVAL; - - if (error == 0) { - error = utf8_encodestr(uniStr, uniCount * sizeof(UniChar), dstStr, &utf8len, maxDstLen , ':', 0); - if (error == ENAMETOOLONG) - *actualDstLen = utf8_encodelen(uniStr, uniCount * sizeof(UniChar), ':', 0); - else - *actualDstLen = utf8len; - } - - return error; -} - -/* - * When an HFS name cannot be encoded with the current - * volume encoding then MacRoman is used as a fallback. - */ -int -mac_roman_to_utf8(const Str31 hfs_str, ByteCount maxDstLen, ByteCount *actualDstLen, unsigned char* dstStr) -{ - int error; - UniChar uniStr[MAX_HFS_UNICODE_CHARS]; - ItemCount uniCount; - size_t utf8len; - u_int8_t pascal_length = 0; - - /* - * Validate the length of the Pascal-style string before passing it - * down to the decoding engine. 
- */ - pascal_length = *((const u_int8_t*)(hfs_str)); - if (pascal_length > 31) { - /* invalid string; longer than 31 bytes */ - error = EINVAL; - return error; - } - - error = mac_roman_to_unicode(hfs_str, uniStr, MAX_HFS_UNICODE_CHARS, &uniCount); - - if (uniCount == 0) - error = EINVAL; - - if (error == 0) { - error = utf8_encodestr(uniStr, uniCount * sizeof(UniChar), dstStr, &utf8len, maxDstLen , ':', 0); - if (error == ENAMETOOLONG) - *actualDstLen = utf8_encodelen(uniStr, uniCount * sizeof(UniChar), ':', 0); - else - *actualDstLen = utf8len; - } - - return error; -} - -/* - * Convert Unicode string into HFS encoding - * - * ':' chars are converted to '/' - * Assumes input represents fully decomposed Unicode - */ -int -unicode_to_hfs(ExtendedVCB *vcb, ByteCount srcLen, u_int16_t* srcStr, Str31 dstStr, int retry) -{ - int error; - unicode_to_hfs_func_t hfs_get_hfsname = VCBTOHFS(vcb)->hfs_get_hfsname; - - error = hfs_get_hfsname(srcStr, srcLen/sizeof(UniChar), dstStr); - if (error && retry) { - error = unicode_to_mac_roman(srcStr, srcLen/sizeof(UniChar), dstStr); - } - return error; -} - -/* - * Convert UTF-8 string into HFS encoding - * - * ':' chars are converted to '/' - * Assumes input represents fully decomposed Unicode - */ -int -utf8_to_hfs(ExtendedVCB *vcb, ByteCount srcLen, const unsigned char* srcStr, Str31 dstStr/*, int retry*/) -{ - int error; - UniChar uniStr[MAX_HFS_UNICODE_CHARS]; - size_t ucslen; - - error = utf8_decodestr(srcStr, srcLen, uniStr, &ucslen, sizeof(uniStr), ':', 0); - if (error == 0) - error = unicode_to_hfs(vcb, ucslen, uniStr, dstStr, 1); - - return error; -} - - -int -utf8_to_mac_roman(ByteCount srcLen, const unsigned char* srcStr, Str31 dstStr) -{ - int error; - UniChar uniStr[MAX_HFS_UNICODE_CHARS]; - size_t ucslen; - - error = utf8_decodestr(srcStr, srcLen, uniStr, &ucslen, sizeof(uniStr), ':', 0); - if (error == 0) - error = unicode_to_mac_roman(uniStr, ucslen/sizeof(UniChar), dstStr); - - return error; -} - -/* - * HFS 
MacRoman to/from Unicode conversions are built into the kernel - * All others hfs encodings are loadable. - */ - -/* 0x00A0 - 0x00FF = Latin 1 Supplement (30 total) */ -static u_int8_t gLatin1Table[] = { - /* 0 1 2 3 4 5 6 7 8 9 A B C D E F */ - /* 0x00A0 */ 0xCA, 0xC1, 0xA2, 0xA3, 0xDB, 0xB4, '?', 0xA4, 0xAC, 0xA9, 0xBB, 0xC7, 0xC2, '?', 0xA8, 0xF8, - /* 0x00B0 */ 0xA1, 0XB1, '?', '?', 0xAB, 0xB5, 0xA6, 0xe1, 0xFC, '?', 0xBC, 0xC8, '?', '?', '?', 0xC0, - /* 0x00C0 */ '?', '?', '?', '?', '?', '?', 0xAE, '?', '?', '?', '?', '?', '?', '?', '?', '?', - /* 0x00D0 */ '?', '?', '?', '?', '?', '?', '?', '?', 0xAF, '?', '?', '?', '?', '?', '?', 0xA7, - /* 0x00E0 */ '?', '?', '?', '?', '?', '?', 0xBE, '?', '?', '?', '?', '?', '?', '?', '?', '?', - /* 0x00F0 */ '?', '?', '?', '?', '?', '?', '?', 0xD6, 0xBF, '?', '?', '?', '?', '?', '?', '?' -}; - -/* 0x02C0 - 0x02DF = Spacing Modifiers (8 total) */ -static u_int8_t gSpaceModsTable[] = { - /* 0 1 2 3 4 5 6 7 8 9 A B C D E F */ - /* 0x02C0 */ '?', '?', '?', '?', '?', '?', 0xF6, 0xFF, '?', '?', '?', '?', '?', '?', '?', '?', - /* 0x02D0 */ '?', '?', '?', '?', '?', '?', '?', '?', 0xF9, 0xFA, 0xFB, 0xFE, 0xF7, 0xFD, '?', '?' 
-}; - -/* 0x2010 - 0x20AF = General Punctuation (17 total) */ -static u_int8_t gPunctTable[] = { - /* 0 1 2 3 4 5 6 7 8 9 A B C D E F */ - /* 0x2010 */ '?', '?', '?', 0xd0, 0xd1, '?', '?', '?', 0xd4, 0xd5, 0xe2, '?', 0xd2, 0xd3, 0xe3, '?', - /* 0x2020 */ 0xa0, 0xe0, 0xa5, '?', '?', '?', 0xc9, '?', '?', '?', '?', '?', '?', '?', '?', '?', - /* 0x2030 */ 0xe4, '?', '?', '?', '?', '?', '?', '?', '?', 0xdc, 0xdd, '?', '?', '?', '?', '?', - /* 0x2040 */ '?', '?', '?', '?', 0xda, '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', - /* 0x2050 */ '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', - /* 0x2060 */ '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', - /* 0x2070 */ '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', - /* 0x2080 */ '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', - /* 0x2090 */ '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', - /* 0x20A0 */ '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', 0xdb, '?', '?', '?' -}; - -/* 0x22xx = Mathematical Operators (11 total) */ -static u_int8_t gMathTable[] = { - /* 0 1 2 3 4 5 6 7 8 9 A B C D E F */ - /* 0x2200 */ '?', '?', 0xb6, '?', '?', '?', 0xc6, '?', '?', '?', '?', '?', '?', '?', '?', 0xb8, - /* 0x2210 */ '?', 0xb7, '?', '?', '?', '?', '?', '?', '?', '?', 0xc3, '?', '?', '?', 0xb0, '?', - /* 0x2220 */ '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', 0xba, '?', '?', '?', '?', - /* 0x2230 */ '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', - /* 0x2240 */ '?', '?', '?', '?', '?', '?', '?', '?', 0xc5, '?', '?', '?', '?', '?', '?', '?', - /* 0x2250 */ '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', - /* 0x2260 */ 0xad, '?', '?', '?', 0xb2, 0xb3, '?', '?' 
-}; - -/* */ -static u_int8_t gReverseCombTable[] = { - /* 0 1 2 3 4 5 6 7 8 9 A B C D E F */ - /* 0x40 */ 0xDA, 0x40, 0xDA, 0xDA, 0xDA, 0x56, 0xDA, 0xDA, 0xDA, 0x6C, 0xDA, 0xDA, 0xDA, 0xDA, 0x82, 0x98, - /* 0x50 */ 0xDA, 0xDA, 0xDA, 0xDA, 0xDA, 0xAE, 0xDA, 0xDA, 0xDA, 0xC4, 0xDA, 0xDA, 0xDA, 0xDA, 0xDA, 0xDA, - /* 0x60 */ 0xDA, 0x4B, 0xDA, 0xDA, 0xDA, 0x61, 0xDA, 0xDA, 0xDA, 0x77, 0xDA, 0xDA, 0xDA, 0xDA, 0x8D, 0xA3, - /* 0x70 */ 0xDA, 0xDA, 0xDA, 0xDA, 0xDA, 0xB9, 0xDA, 0xDA, 0xDA, 0xCF, 0xDA, 0xDA, 0xDA, 0xDA, 0xDA, 0xDA, - - /* Combining Diacritical Marks (0x0300 - 0x030A) */ - /* 0 1 2 3 4 5 6 7 8 9 A */ - /* 'A' */ - /* 0x0300 */ 0xCB, 0xE7, 0xE5, 0xCC, '?', '?', '?', '?', 0x80, '?', 0x81, - - /* 'a' */ - /* 0x0300 */ 0x88, 0x87, 0x89, 0x8B, '?', '?', '?', '?', 0x8A, '?', 0x8C, - - /* 'E' */ - /* 0x0300 */ 0xE9, 0x83, 0xE6, '?', '?', '?', '?', '?', 0xE8, '?', '?', - - /* 'e' */ - /* 0x0300 */ 0x8F, 0x8E, 0x90, '?', '?', '?', '?', '?', 0x91, '?', '?', - - /* 'I' */ - /* 0x0300 */ 0xED, 0xEA, 0xEB, '?', '?', '?', '?', '?', 0xEC, '?', '?', - - /* 'i' */ - /* 0x0300 */ 0x93, 0x92, 0x94, '?', '?', '?', '?', '?', 0x95, '?', '?', - - /* 'N' */ - /* 0x0300 */ '?', '?', '?', 0x84, '?', '?', '?', '?', '?', '?', '?', - - /* 'n' */ - /* 0x0300 */ '?', '?', '?', 0x96, '?', '?', '?', '?', '?', '?', '?', - - /* 'O' */ - /* 0x0300 */ 0xF1, 0xEE, 0xEF, 0xCD, '?', '?', '?', '?', 0x85, '?', '?', - - /* 'o' */ - /* 0x0300 */ 0x98, 0x97, 0x99, 0x9B, '?', '?', '?', '?', 0x9A, '?', '?', - - /* 'U' */ - /* 0x0300 */ 0xF4, 0xF2, 0xF3, '?', '?', '?', '?', '?', 0x86, '?', '?', - - /* 'u' */ - /* 0x0300 */ 0x9D, 0x9C, 0x9E, '?', '?', '?', '?', '?', 0x9F, '?', '?', - - /* 'Y' */ - /* 0x0300 */ '?', '?', '?', '?', '?', '?', '?', '?', 0xD9, '?', '?', - - /* 'y' */ - /* 0x0300 */ '?', '?', '?', '?', '?', '?', '?', '?', 0xD8, '?', '?', - - /* else */ - /* 0x0300 */ '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?' 
-}; - - -/* - * Convert Unicode string into HFS MacRoman encoding - * - * Assumes Unicode input is fully decomposed - */ -static int unicode_to_mac_roman(UniChar *uni_str, u_int32_t unicodeChars, Str31 hfs_str) -{ - u_int8_t *p; - const UniChar *u; - UniChar c; - UniChar mask; - u_int16_t inputChars; - u_int16_t pascalChars; - OSErr result = noErr; - u_int8_t lsb; - u_int8_t prevChar; - u_int8_t mc; - - mask = (UniChar) 0xFF80; - p = &hfs_str[1]; - u = uni_str; - inputChars = unicodeChars; - pascalChars = prevChar = 0; - - while (inputChars) { - c = *(u++); - lsb = (u_int8_t) c; - - /* - * If its not 7-bit ascii, then we need to map it - */ - if ( c & mask ) { - mc = '?'; - switch (c & 0xFF00) { - case 0x0000: - if (lsb >= 0xA0) - mc = gLatin1Table[lsb - 0xA0]; - break; - - case 0x0200: - if (lsb >= 0xC0 && lsb <= 0xDF) - mc = gSpaceModsTable[lsb - 0xC0]; - break; - - case 0x2000: - if (lsb >= 0x10 && lsb <= 0xAF) - mc = gPunctTable[lsb- 0x10]; - break; - - case 0x2200: - if (lsb < 0x68) - mc = gMathTable[lsb]; - break; - - case 0x0300: - if (c <= 0x030A) { - if (prevChar >= 'A' && prevChar < 'z') { - mc = gReverseCombTable[gReverseCombTable[prevChar - 0x40] + lsb]; - --p; /* backup over base char */ - --pascalChars; - } - } else { - switch (c) { - case 0x0327: /* combining cedilla */ - if (prevChar == 'C') - mc = 0x82; - else if (prevChar == 'c') - mc = 0x8D; - else - break; - --p; /* backup over base char */ - --pascalChars; - break; - - case 0x03A9: mc = 0xBD; break; /* omega */ - - case 0x03C0: mc = 0xB9; break; /* pi */ - } - } - break; - - default: - switch (c) { - case 0x0131: mc = 0xf5; break; /* dotless i */ - - case 0x0152: mc = 0xce; break; /* OE */ - - case 0x0153: mc = 0xcf; break; /* oe */ - - case 0x0192: mc = 0xc4; break; /* � */ - - case 0x2122: mc = 0xaa; break; /* TM */ - - case 0x25ca: mc = 0xd7; break; /* diamond */ - - case 0xf8ff: mc = 0xf0; break; /* apple logo */ - - case 0xfb01: mc = 0xde; break; /* fi */ - - case 0xfb02: mc = 0xdf; break; 
/* fl */ - } - } /* end switch (c & 0xFF00) */ - - /* - * If we have an unmapped character then we need to mangle the name... - */ - if (mc == '?') - result = kTECUsedFallbacksStatus; - - prevChar = 0; - lsb = mc; - - } else { - prevChar = lsb; - } - - if (pascalChars >= 31) - break; - - *(p++) = lsb; - ++pascalChars; - --inputChars; - - } /* end while */ - - hfs_str[0] = pascalChars; - - if (inputChars > 0) - result = ENAMETOOLONG; /* ran out of room! */ - - return result; -} - - -static UniChar gHiBitBaseUnicode[128] = { - /* 0x80 */ 0x0041, 0x0041, 0x0043, 0x0045, 0x004e, 0x004f, 0x0055, 0x0061, - /* 0x88 */ 0x0061, 0x0061, 0x0061, 0x0061, 0x0061, 0x0063, 0x0065, 0x0065, - /* 0x90 */ 0x0065, 0x0065, 0x0069, 0x0069, 0x0069, 0x0069, 0x006e, 0x006f, - /* 0x98 */ 0x006f, 0x006f, 0x006f, 0x006f, 0x0075, 0x0075, 0x0075, 0x0075, - /* 0xa0 */ 0x2020, 0x00b0, 0x00a2, 0x00a3, 0x00a7, 0x2022, 0x00b6, 0x00df, - /* 0xa8 */ 0x00ae, 0x00a9, 0x2122, 0x00b4, 0x00a8, 0x2260, 0x00c6, 0x00d8, - /* 0xb0 */ 0x221e, 0x00b1, 0x2264, 0x2265, 0x00a5, 0x00b5, 0x2202, 0x2211, - /* 0xb8 */ 0x220f, 0x03c0, 0x222b, 0x00aa, 0x00ba, 0x03a9, 0x00e6, 0x00f8, - /* 0xc0 */ 0x00bf, 0x00a1, 0x00ac, 0x221a, 0x0192, 0x2248, 0x2206, 0x00ab, - /* 0xc8 */ 0x00bb, 0x2026, 0x00a0, 0x0041, 0x0041, 0x004f, 0x0152, 0x0153, - /* 0xd0 */ 0x2013, 0x2014, 0x201c, 0x201d, 0x2018, 0x2019, 0x00f7, 0x25ca, - /* 0xd8 */ 0x0079, 0x0059, 0x2044, 0x20ac, 0x2039, 0x203a, 0xfb01, 0xfb02, - /* 0xe0 */ 0x2021, 0x00b7, 0x201a, 0x201e, 0x2030, 0x0041, 0x0045, 0x0041, - /* 0xe8 */ 0x0045, 0x0045, 0x0049, 0x0049, 0x0049, 0x0049, 0x004f, 0x004f, - /* 0xf0 */ 0xf8ff, 0x004f, 0x0055, 0x0055, 0x0055, 0x0131, 0x02c6, 0x02dc, - /* 0xf8 */ 0x00af, 0x02d8, 0x02d9, 0x02da, 0x00b8, 0x02dd, 0x02db, 0x02c7 -}; - -static UniChar gHiBitCombUnicode[128] = { - /* 0x80 */ 0x0308, 0x030a, 0x0327, 0x0301, 0x0303, 0x0308, 0x0308, 0x0301, - /* 0x88 */ 0x0300, 0x0302, 0x0308, 0x0303, 0x030a, 0x0327, 0x0301, 0x0300, - /* 0x90 */ 0x0302, 0x0308, 0x0301, 
0x0300, 0x0302, 0x0308, 0x0303, 0x0301, - /* 0x98 */ 0x0300, 0x0302, 0x0308, 0x0303, 0x0301, 0x0300, 0x0302, 0x0308, - /* 0xa0 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - /* 0xa8 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - /* 0xb0 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - /* 0xb8 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - /* 0xc0 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - /* 0xc8 */ 0x0000, 0x0000, 0x0000, 0x0300, 0x0303, 0x0303, 0x0000, 0x0000, - /* 0xd0 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - /* 0xd8 */ 0x0308, 0x0308, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - /* 0xe0 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0302, 0x0302, 0x0301, - /* 0xe8 */ 0x0308, 0x0300, 0x0301, 0x0302, 0x0308, 0x0300, 0x0301, 0x0302, - /* 0xf0 */ 0x0000, 0x0300, 0x0301, 0x0302, 0x0300, 0x0000, 0x0000, 0x0000, - /* 0xf8 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 -}; - - -/* - * Convert HFS MacRoman encoded string into Unicode - * - * Unicode output is fully decomposed - */ -int -mac_roman_to_unicode(const Str31 hfs_str, UniChar *uni_str, - __unused u_int32_t maxCharLen, u_int32_t *unicodeChars) -{ - const u_int8_t *p; - UniChar *u; - u_int16_t pascalChars; - u_int8_t c; - - p = hfs_str; - u = uni_str; - - *unicodeChars = pascalChars = *(p++); /* pick up length byte */ - - while (pascalChars--) { - c = *(p++); - - if ( (int8_t) c >= 0 ) { /* check if seven bit ascii */ - *(u++) = (UniChar) c; /* just pad high byte with zero */ - } else { /* its a hi bit character */ - UniChar uc; - - c &= 0x7F; - *(u++) = uc = gHiBitBaseUnicode[c]; - - /* - * if the unicode character we get back is an alpha char - * then we must have an additional combining character - */ - if ((uc <= (UniChar) 'z') && (uc >= (UniChar) 'A')) { - *(u++) = gHiBitCombUnicode[c]; - ++(*unicodeChars); - } - } - } - - return noErr; -} - 
-#endif /* CONFIG_STD_HFS */ - -#else /* not HFS */ - -/* - * These function prototypes are here because hfs.h is not #included - * so its prototypes are not provided. These are needed because they are exported - * as KPI for the conversion subroutines during mounting of HFS standard. - */ -int hfs_addconverter(int id, u_int32_t encoding, void * get_unicode, void * get_hfsname); -int hfs_getconverter(u_int32_t encoding, void *get_unicode, void *get_hfsname); -int hfs_relconverter(u_int32_t encoding); -int hfs_remconverter(int id, u_int32_t encoding); - -/* Function stubs are needed for KPI export */ - -int hfs_addconverter( __unused int id, - __unused u_int32_t encoding, - __unused void * get_unicode, - __unused void * get_hfsname ) -{ - return(0); -} - -int hfs_getconverter(__unused u_int32_t encoding, __unused void *get_unicode, __unused void *get_hfsname) -{ - return(EINVAL); -} - -int hfs_relconverter(__unused u_int32_t encoding) -{ - return(EINVAL); -} - -int hfs_remconverter(__unused int id, __unused u_int32_t encoding) -{ - return(0); -} -#endif - - diff --git a/bsd/hfs/hfs_endian.c b/bsd/hfs/hfs_endian.c deleted file mode 100644 index eb242b37f..000000000 --- a/bsd/hfs/hfs_endian.c +++ /dev/null @@ -1,1245 +0,0 @@ -/* - * Copyright (c) 2000-2014 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. 
- * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -/* - * hfs_endian.c - * - * This file implements endian swapping routines for the HFS/HFS Plus - * volume format. - */ - -#include "hfs_endian.h" -#include "hfs_dbg.h" -#include "hfscommon/headers/BTreesPrivate.h" - -#undef ENDIAN_DEBUG - -/* - * Internal swapping routines - * - * These routines handle swapping the records of leaf and index nodes. The - * layout of the keys and records varies depending on the kind of B-tree - * (determined by fileID). - * - * The direction parameter must be kSwapBTNodeBigToHost or kSwapBTNodeHostToBig. - * The kSwapBTNodeHeaderRecordOnly "direction" is not valid for these routines. 
- */ -int hfs_swap_HFSPlusBTInternalNode (BlockDescriptor *src, HFSCatalogNodeID fileID, enum HFSBTSwapDirection direction); -void hfs_swap_HFSPlusForkData (HFSPlusForkData *src); - -#if CONFIG_HFS_STD -int hfs_swap_HFSBTInternalNode (BlockDescriptor *src, HFSCatalogNodeID fileID, enum HFSBTSwapDirection direction); -#endif - -/* - * hfs_swap_HFSPlusForkData - */ -void -hfs_swap_HFSPlusForkData ( - HFSPlusForkData *src -) -{ - int i; - - src->logicalSize = SWAP_BE64 (src->logicalSize); - - src->clumpSize = SWAP_BE32 (src->clumpSize); - src->totalBlocks = SWAP_BE32 (src->totalBlocks); - - for (i = 0; i < kHFSPlusExtentDensity; i++) { - src->extents[i].startBlock = SWAP_BE32 (src->extents[i].startBlock); - src->extents[i].blockCount = SWAP_BE32 (src->extents[i].blockCount); - } -} - -/* - * hfs_swap_BTNode - * - * NOTE: This operation is not naturally symmetric. - * We have to determine which way we're swapping things. - */ -int -hfs_swap_BTNode ( - BlockDescriptor *src, - vnode_t vp, - enum HFSBTSwapDirection direction, - u_int8_t allow_empty_node -) -{ - BTNodeDescriptor *srcDesc = src->buffer; - u_int16_t *srcOffs = NULL; - BTreeControlBlockPtr btcb = (BTreeControlBlockPtr)VTOF(vp)->fcbBTCBPtr; - u_int16_t i; /* index to match srcDesc->numRecords */ - int error = 0; - -#ifdef ENDIAN_DEBUG - if (direction == kSwapBTNodeBigToHost) { - printf ("hfs: BE -> Native Swap\n"); - } else if (direction == kSwapBTNodeHostToBig) { - printf ("hfs: Native -> BE Swap\n"); - } else if (direction == kSwapBTNodeHeaderRecordOnly) { - printf ("hfs: Not swapping descriptors\n"); - } else { - panic ("hfs_swap_BTNode: This is impossible"); - } -#endif - - /* - * If we are doing a swap from on-disk to in-memory, then swap the node - * descriptor and record offsets before we need to use them. 
- */ - if (direction == kSwapBTNodeBigToHost) { - srcDesc->fLink = SWAP_BE32 (srcDesc->fLink); - srcDesc->bLink = SWAP_BE32 (srcDesc->bLink); - - /* - * When first opening a BTree, we have to read the header node before the - * control block is initialized. In this case, totalNodes will be zero, - * so skip the bounds checking. Also, we should ignore the header node when - * checking for invalid forwards and backwards links, since the header node's - * links can point back to itself legitimately. - */ - if (btcb->totalNodes != 0) { - if (srcDesc->fLink >= btcb->totalNodes) { -#if DEVELOPMENT || DEBUG - panic("hfs_swap_BTNode: invalid forward link (0x%08x >= 0x%08x)\n", srcDesc->fLink, btcb->totalNodes); -#else - printf("hfs_swap_BTNode: invalid forward link (0x%08x >= 0x%08x)\n", srcDesc->fLink, btcb->totalNodes); -#endif - error = fsBTInvalidHeaderErr; - goto fail; - } - if (srcDesc->bLink >= btcb->totalNodes) { -#if DEVELOPMENT || DEBUG - panic("hfs_swap_BTNode: invalid backward link (0x%08x >= 0x%08x)\n", srcDesc->bLink, btcb->totalNodes); -#else - printf("hfs_swap_BTNode: invalid backward link (0x%08x >= 0x%08x)\n", srcDesc->bLink, btcb->totalNodes); -#endif - error = fsBTInvalidHeaderErr; - goto fail; - } - - if ((src->blockNum != 0) && (srcDesc->fLink == (u_int32_t) src->blockNum)) { -#if DEVELOPMENT || DEBUG - panic("hfs_swap_BTNode: invalid forward link (0x%08x == 0x%08x)\n", - srcDesc->fLink, (u_int32_t) src->blockNum); -#else - printf("hfs_swap_BTNode: invalid forward link (0x%08x == 0x%08x)\n", - srcDesc->fLink, (u_int32_t) src->blockNum); -#endif - error = fsBTInvalidHeaderErr; - goto fail; - } - if ((src->blockNum != 0) && (srcDesc->bLink == (u_int32_t) src->blockNum)) { -#if DEVELOPMENT || DEBUG - panic("hfs_swap_BTNode: invalid backward link (0x%08x == 0x%08x)\n", - srcDesc->bLink, (u_int32_t) src->blockNum); -#else - printf("hfs_swap_BTNode: invalid backward link (0x%08x == 0x%08x)\n", - srcDesc->bLink, (u_int32_t) src->blockNum); -#endif - error = 
fsBTInvalidHeaderErr; - goto fail; - } - - - } - - /* - * Check srcDesc->kind. Don't swap it because it's only one byte. - */ - if (srcDesc->kind < kBTLeafNode || srcDesc->kind > kBTMapNode) { - printf("hfs_swap_BTNode: invalid node kind (%d)\n", srcDesc->kind); - error = fsBTInvalidHeaderErr; - goto fail; - } - - /* - * Check srcDesc->height. Don't swap it because it's only one byte. - */ - if (srcDesc->height > kMaxTreeDepth) { - printf("hfs_swap_BTNode: invalid node height (%d)\n", srcDesc->height); - error = fsBTInvalidHeaderErr; - goto fail; - } - - /* Don't swap srcDesc->reserved */ - - srcDesc->numRecords = SWAP_BE16 (srcDesc->numRecords); - - /* - * Swap the node offsets (including the free space one!). - */ - srcOffs = (u_int16_t *)((char *)src->buffer + (src->blockSize - ((srcDesc->numRecords + 1) * sizeof (u_int16_t)))); - - /* - * Sanity check that the record offsets are within the node itself. - */ - if ((char *)srcOffs > ((char *)src->buffer + src->blockSize) || - (char *)srcOffs < ((char *)src->buffer + sizeof(BTNodeDescriptor))) { - printf("hfs_swap_BTNode: invalid record count (0x%04X)\n", srcDesc->numRecords); - error = fsBTInvalidHeaderErr; - goto fail; - } - - /* - * Swap and sanity check each of the record offsets. - */ - for (i = 0; i <= srcDesc->numRecords; i++) { - srcOffs[i] = SWAP_BE16 (srcOffs[i]); - - /* - * Sanity check: must be even, and within the node itself. - * - * We may be called to swap an unused node, which contains all zeroes. - * Unused nodes are expected only when allow_empty_node is true. - * If it is false and record offset is zero, return error. 
- */ - if ((srcOffs[i] & 1) || ( - (allow_empty_node == false) && (srcOffs[i] == 0)) || - (srcOffs[i] < sizeof(BTNodeDescriptor) && srcOffs[i] != 0) || - (srcOffs[i] >= src->blockSize)) { - printf("hfs_swap_BTNode: record #%d invalid offset (0x%04X)\n", srcDesc->numRecords-i-1, srcOffs[i]); - error = fsBTInvalidHeaderErr; - goto fail; - } - - /* - * Make sure the offsets are strictly increasing. Note that we're looping over - * them backwards, hence the order in the comparison. - */ - if ((i != 0) && (srcOffs[i] >= srcOffs[i-1])) { - printf("hfs_swap_BTNode: offsets %d and %d out of order (0x%04X, 0x%04X)\n", - srcDesc->numRecords-i-1, srcDesc->numRecords-i, srcOffs[i], srcOffs[i-1]); - error = fsBTInvalidHeaderErr; - goto fail; - } - } - } - - /* - * Swap the records (ordered by frequency of access) - */ - if ((srcDesc->kind == kBTIndexNode) || - (srcDesc-> kind == kBTLeafNode)) { - - if (VTOVCB(vp)->vcbSigWord == kHFSPlusSigWord) { - error = hfs_swap_HFSPlusBTInternalNode (src, VTOC(vp)->c_fileid, direction); - } -#if CONFIG_HFS_STD - else { - error = hfs_swap_HFSBTInternalNode (src, VTOC(vp)->c_fileid, direction); - } -#endif - - if (error) goto fail; - - } else if (srcDesc-> kind == kBTMapNode) { - /* Don't swap the bitmaps, they'll be done in the bitmap routines */ - - } else if (srcDesc-> kind == kBTHeaderNode) { - /* The header's offset is hard-wired because we cannot trust the offset pointers. 
*/ - BTHeaderRec *srcHead = (BTHeaderRec *)((char *)src->buffer + sizeof(BTNodeDescriptor)); - - srcHead->treeDepth = SWAP_BE16 (srcHead->treeDepth); - - srcHead->rootNode = SWAP_BE32 (srcHead->rootNode); - srcHead->leafRecords = SWAP_BE32 (srcHead->leafRecords); - srcHead->firstLeafNode = SWAP_BE32 (srcHead->firstLeafNode); - srcHead->lastLeafNode = SWAP_BE32 (srcHead->lastLeafNode); - - srcHead->nodeSize = SWAP_BE16 (srcHead->nodeSize); - srcHead->maxKeyLength = SWAP_BE16 (srcHead->maxKeyLength); - - srcHead->totalNodes = SWAP_BE32 (srcHead->totalNodes); - srcHead->freeNodes = SWAP_BE32 (srcHead->freeNodes); - - srcHead->clumpSize = SWAP_BE32 (srcHead->clumpSize); - srcHead->attributes = SWAP_BE32 (srcHead->attributes); - - /* Don't swap srcHead->reserved1 */ - /* Don't swap srcHead->btreeType; it's only one byte */ - /* Don't swap srcHead->reserved2 */ - /* Don't swap srcHead->reserved3 */ - /* Don't swap bitmap */ - } - - /* - * If we are doing a swap from in-memory to on-disk, then swap the node - * descriptor and record offsets after we're done using them. - */ - if (direction == kSwapBTNodeHostToBig) { - /* - * Sanity check and swap the forward and backward links. - * Ignore the header node since its forward and backwards links can legitimately - * point to itself. 
- */ - if (srcDesc->fLink >= btcb->totalNodes) { - panic("hfs_UNswap_BTNode: invalid forward link (0x%08X)\n", srcDesc->fLink); - error = fsBTInvalidHeaderErr; - goto fail; - } - if ((src->blockNum != 0) && (srcDesc->fLink == (u_int32_t) src->blockNum)) { - panic ("hfs_UNswap_BTNode: invalid forward link (0x%08x == 0x%08x)\n", - srcDesc->fLink, (u_int32_t) src->blockNum); - error = fsBTInvalidHeaderErr; - goto fail; - } - - if (srcDesc->bLink >= btcb->totalNodes) { - panic("hfs_UNswap_BTNode: invalid backward link (0x%08X)\n", srcDesc->bLink); - error = fsBTInvalidHeaderErr; - goto fail; - } - if ((src->blockNum != 0) && (srcDesc->bLink == (u_int32_t) src->blockNum)) { - panic ("hfs_UNswap_BTNode: invalid backward link (0x%08x == 0x%08x)\n", - srcDesc->bLink, (u_int32_t) src->blockNum); - error = fsBTInvalidHeaderErr; - goto fail; - } - - - srcDesc->fLink = SWAP_BE32 (srcDesc->fLink); - srcDesc->bLink = SWAP_BE32 (srcDesc->bLink); - - /* - * Check srcDesc->kind. Don't swap it because it's only one byte. - */ - if (srcDesc->kind < kBTLeafNode || srcDesc->kind > kBTMapNode) { - panic("hfs_UNswap_BTNode: invalid node kind (%d)\n", srcDesc->kind); - error = fsBTInvalidHeaderErr; - goto fail; - } - - /* - * Check srcDesc->height. Don't swap it because it's only one byte. - */ - if (srcDesc->height > kMaxTreeDepth) { - panic("hfs_UNswap_BTNode: invalid node height (%d)\n", srcDesc->height); - error = fsBTInvalidHeaderErr; - goto fail; - } - - /* Don't swap srcDesc->reserved */ - - /* - * Swap the node offsets (including the free space one!). - */ - srcOffs = (u_int16_t *)((char *)src->buffer + (src->blockSize - ((srcDesc->numRecords + 1) * sizeof (u_int16_t)))); - - /* - * Sanity check that the record offsets are within the node itself. 
- */ - if ((char *)srcOffs > ((char *)src->buffer + src->blockSize) || - (char *)srcOffs < ((char *)src->buffer + sizeof(BTNodeDescriptor))) { - panic("hfs_UNswap_BTNode: invalid record count (0x%04X)\n", srcDesc->numRecords); - error = fsBTInvalidHeaderErr; - goto fail; - } - - /* - * Swap and sanity check each of the record offsets. - */ - for (i = 0; i <= srcDesc->numRecords; i++) { - /* - * Sanity check: must be even, and within the node itself. - * - * We may be called to swap an unused node, which contains all zeroes. - * This can happen when the last record from a node gets deleted. - * This is why we allow the record offset to be zero. - * Unused nodes are expected only when allow_empty_node is true - * (the caller should set it to true for kSwapBTNodeBigToHost). - */ - if ((srcOffs[i] & 1) || - ((allow_empty_node == false) && (srcOffs[i] == 0)) || - (srcOffs[i] < sizeof(BTNodeDescriptor) && srcOffs[i] != 0) || - (srcOffs[i] >= src->blockSize)) { - panic("hfs_UNswap_BTNode: record #%d invalid offset (0x%04X)\n", srcDesc->numRecords-i-1, srcOffs[i]); - error = fsBTInvalidHeaderErr; - goto fail; - } - - /* - * Make sure the offsets are strictly increasing. Note that we're looping over - * them backwards, hence the order in the comparison. - */ - if ((i < srcDesc->numRecords) && (srcOffs[i+1] >= srcOffs[i])) { - panic("hfs_UNswap_BTNode: offsets %d and %d out of order (0x%04X, 0x%04X)\n", - srcDesc->numRecords-i-2, srcDesc->numRecords-i-1, srcOffs[i+1], srcOffs[i]); - error = fsBTInvalidHeaderErr; - goto fail; - } - - srcOffs[i] = SWAP_BE16 (srcOffs[i]); - } - - srcDesc->numRecords = SWAP_BE16 (srcDesc->numRecords); - } - -fail: - if (error) { - /* - * Log some useful information about where the corrupt node is. 
- */ - printf("hfs: node=%lld fileID=%u volume=%s device=%s\n", src->blockNum, VTOC(vp)->c_fileid, - VTOVCB(vp)->vcbVN, vfs_statfs(vnode_mount(vp))->f_mntfromname); - hfs_mark_inconsistent(VTOVCB(vp), HFS_INCONSISTENCY_DETECTED); - } - - return (error); -} - -int -hfs_swap_HFSPlusBTInternalNode ( - BlockDescriptor *src, - HFSCatalogNodeID fileID, - enum HFSBTSwapDirection direction -) -{ - BTNodeDescriptor *srcDesc = src->buffer; - u_int16_t *srcOffs = (u_int16_t *)((char *)src->buffer + (src->blockSize - (srcDesc->numRecords * sizeof (u_int16_t)))); - char *nextRecord; /* Points to start of record following current one */ - - /* - * i is an int32 because it needs to be negative to index the offset to free space. - * srcDesc->numRecords is a u_int16_t and is unlikely to become 32-bit so this should be ok. - */ - - int32_t i; - u_int32_t j; - - if (fileID == kHFSExtentsFileID) { - HFSPlusExtentKey *srcKey; - HFSPlusExtentDescriptor *srcRec; - size_t recordSize; /* Size of the data part of the record, or node number for index nodes */ - - if (srcDesc->kind == kBTIndexNode) - recordSize = sizeof(u_int32_t); - else - recordSize = sizeof(HFSPlusExtentDescriptor); - - for (i = 0; i < srcDesc->numRecords; i++) { - /* Point to the start of the record we're currently checking. */ - srcKey = (HFSPlusExtentKey *)((char *)src->buffer + srcOffs[i]); - - /* - * Point to start of next (larger offset) record. We'll use this - * to be sure the current record doesn't overflow into the next - * record. - */ - nextRecord = (char *)src->buffer + srcOffs[i-1]; - - /* - * Make sure the key and data are within the buffer. Since both key - * and data are fixed size, this is relatively easy. Note that this - * relies on the keyLength being a constant; we verify the keyLength - * below. 
- */ - if ((char *)srcKey + sizeof(HFSPlusExtentKey) + recordSize > nextRecord) { - if (direction == kSwapBTNodeHostToBig) { - panic("hfs_swap_HFSPlusBTInternalNode: extents key #%d offset too big (0x%04X)\n", srcDesc->numRecords-i-1, srcOffs[i]); - } else { - printf("hfs_swap_HFSPlusBTInternalNode: extents key #%d offset too big (0x%04X)\n", srcDesc->numRecords-i-1, srcOffs[i]); - } - return fsBTInvalidNodeErr; - } - - if (direction == kSwapBTNodeBigToHost) - srcKey->keyLength = SWAP_BE16 (srcKey->keyLength); - if (srcKey->keyLength != sizeof(*srcKey) - sizeof(srcKey->keyLength)) { - if (direction == kSwapBTNodeHostToBig) { - panic("hfs_swap_HFSPlusBTInternalNode: extents key #%d invalid length (%d)\n", srcDesc->numRecords-i-1, srcKey->keyLength); - } else { - printf("hfs_swap_HFSPlusBTInternalNode: extents key #%d invalid length (%d)\n", srcDesc->numRecords-i-1, srcKey->keyLength); - } - return fsBTInvalidNodeErr; - } - srcRec = (HFSPlusExtentDescriptor *)((char *)srcKey + srcKey->keyLength + sizeof(srcKey->keyLength)); - if (direction == kSwapBTNodeHostToBig) - srcKey->keyLength = SWAP_BE16 (srcKey->keyLength); - - /* Don't swap srcKey->forkType; it's only one byte */ - /* Don't swap srcKey->pad */ - - srcKey->fileID = SWAP_BE32 (srcKey->fileID); - srcKey->startBlock = SWAP_BE32 (srcKey->startBlock); - - if (srcDesc->kind == kBTIndexNode) { - /* For index nodes, the record data is just a child node number. */ - *((u_int32_t *)srcRec) = SWAP_BE32 (*((u_int32_t *)srcRec)); - } else { - /* Swap the extent data */ - for (j = 0; j < kHFSPlusExtentDensity; j++) { - srcRec[j].startBlock = SWAP_BE32 (srcRec[j].startBlock); - srcRec[j].blockCount = SWAP_BE32 (srcRec[j].blockCount); - } - } - } - - } else if (fileID == kHFSCatalogFileID) { - HFSPlusCatalogKey *srcKey; - int16_t *srcPtr; - u_int16_t keyLength; - - for (i = 0; i < srcDesc->numRecords; i++) { - /* Point to the start of the record we're currently checking. 
*/ - srcKey = (HFSPlusCatalogKey *)((char *)src->buffer + srcOffs[i]); - - /* - * Point to start of next (larger offset) record. We'll use this - * to be sure the current record doesn't overflow into the next - * record. - */ - nextRecord = (char *)src->buffer + (uintptr_t)(srcOffs[i-1]); - - /* - * Make sure we can safely dereference the keyLength and parentID fields. - */ - if ((char *)srcKey + offsetof(HFSPlusCatalogKey, nodeName.unicode[0]) > nextRecord) { - if (direction == kSwapBTNodeHostToBig) { - panic("hfs_swap_HFSPlusBTInternalNode: catalog key #%d offset too big (0x%04X)\n", srcDesc->numRecords-i-1, srcOffs[i]); - } else { - printf("hfs_swap_HFSPlusBTInternalNode: catalog key #%d offset too big (0x%04X)\n", srcDesc->numRecords-i-1, srcOffs[i]); - } - return fsBTInvalidNodeErr; - } - - /* - * Swap and sanity check the key length - */ - if (direction == kSwapBTNodeBigToHost) - srcKey->keyLength = SWAP_BE16 (srcKey->keyLength); - keyLength = srcKey->keyLength; /* Put it in a local (native order) because we use it several times */ - if (direction == kSwapBTNodeHostToBig) - srcKey->keyLength = SWAP_BE16 (keyLength); - - /* Sanity check the key length */ - if (keyLength < kHFSPlusCatalogKeyMinimumLength || keyLength > kHFSPlusCatalogKeyMaximumLength) { - if (direction == kSwapBTNodeHostToBig) { - panic("hfs_swap_HFSPlusBTInternalNode: catalog key #%d invalid length (%d)\n", srcDesc->numRecords-i-1, keyLength); - } else { - printf("hfs_swap_HFSPlusBTInternalNode: catalog key #%d invalid length (%d)\n", srcDesc->numRecords-i-1, keyLength); - } - return fsBTInvalidNodeErr; - } - - /* - * Make sure that we can safely dereference the record's type field or - * an index node's child node number. 
- */ - srcPtr = (int16_t *)((char *)srcKey + keyLength + sizeof(srcKey->keyLength)); - if ((char *)srcPtr + sizeof(u_int32_t) > nextRecord) { - if (direction == kSwapBTNodeHostToBig) { - panic("hfs_swap_HFSPlusBTInternalNode: catalog key #%d too big\n", srcDesc->numRecords-i-1); - } else { - printf("hfs_swap_HFSPlusBTInternalNode: catalog key #%d too big\n", srcDesc->numRecords-i-1); - } - return fsBTInvalidNodeErr; - } - - srcKey->parentID = SWAP_BE32 (srcKey->parentID); - - /* - * Swap and sanity check the key's node name - */ - if (direction == kSwapBTNodeBigToHost) - srcKey->nodeName.length = SWAP_BE16 (srcKey->nodeName.length); - /* Make sure name length is consistent with key length */ - if (keyLength < sizeof(srcKey->parentID) + sizeof(srcKey->nodeName.length) + - srcKey->nodeName.length*sizeof(srcKey->nodeName.unicode[0])) { - if (direction == kSwapBTNodeHostToBig) { - panic("hfs_swap_HFSPlusBTInternalNode: catalog record #%d keyLength=%d expected=%lu\n", - srcDesc->numRecords-i, keyLength, sizeof(srcKey->parentID) + sizeof(srcKey->nodeName.length) + - srcKey->nodeName.length*sizeof(srcKey->nodeName.unicode[0])); - } else { - printf("hfs_swap_HFSPlusBTInternalNode: catalog record #%d keyLength=%d expected=%lu\n", - srcDesc->numRecords-i, keyLength, sizeof(srcKey->parentID) + sizeof(srcKey->nodeName.length) + - srcKey->nodeName.length*sizeof(srcKey->nodeName.unicode[0])); - } - return fsBTInvalidNodeErr; - } - for (j = 0; j < srcKey->nodeName.length; j++) { - srcKey->nodeName.unicode[j] = SWAP_BE16 (srcKey->nodeName.unicode[j]); - } - if (direction == kSwapBTNodeHostToBig) - srcKey->nodeName.length = SWAP_BE16 (srcKey->nodeName.length); - - /* - * For index nodes, the record data is just the child's node number. - * Skip over swapping the various types of catalog record. - */ - if (srcDesc->kind == kBTIndexNode) { - *((u_int32_t *)srcPtr) = SWAP_BE32 (*((u_int32_t *)srcPtr)); - continue; - } - - /* Make sure the recordType is in native order before using it. 
*/ - if (direction == kSwapBTNodeBigToHost) - srcPtr[0] = SWAP_BE16 (srcPtr[0]); - - if (srcPtr[0] == kHFSPlusFolderRecord) { - HFSPlusCatalogFolder *srcRec = (HFSPlusCatalogFolder *)srcPtr; - if ((char *)srcRec + sizeof(*srcRec) > nextRecord) { - if (direction == kSwapBTNodeHostToBig) { - panic("hfs_swap_HFSPlusBTInternalNode: catalog folder record #%d too big\n", srcDesc->numRecords-i-1); - } else { - printf("hfs_swap_HFSPlusBTInternalNode: catalog folder record #%d too big\n", srcDesc->numRecords-i-1); - } - return fsBTInvalidNodeErr; - } - - srcRec->flags = SWAP_BE16 (srcRec->flags); - srcRec->valence = SWAP_BE32 (srcRec->valence); - srcRec->folderID = SWAP_BE32 (srcRec->folderID); - srcRec->createDate = SWAP_BE32 (srcRec->createDate); - srcRec->contentModDate = SWAP_BE32 (srcRec->contentModDate); - srcRec->attributeModDate = SWAP_BE32 (srcRec->attributeModDate); - srcRec->accessDate = SWAP_BE32 (srcRec->accessDate); - srcRec->backupDate = SWAP_BE32 (srcRec->backupDate); - - srcRec->bsdInfo.ownerID = SWAP_BE32 (srcRec->bsdInfo.ownerID); - srcRec->bsdInfo.groupID = SWAP_BE32 (srcRec->bsdInfo.groupID); - - /* Don't swap srcRec->bsdInfo.adminFlags; it's only one byte */ - /* Don't swap srcRec->bsdInfo.ownerFlags; it's only one byte */ - - srcRec->bsdInfo.fileMode = SWAP_BE16 (srcRec->bsdInfo.fileMode); - srcRec->bsdInfo.special.iNodeNum = SWAP_BE32 (srcRec->bsdInfo.special.iNodeNum); - - srcRec->textEncoding = SWAP_BE32 (srcRec->textEncoding); - - /* Don't swap srcRec->userInfo */ - /* Don't swap srcRec->finderInfo */ - srcRec->folderCount = SWAP_BE32 (srcRec->folderCount); - - } else if (srcPtr[0] == kHFSPlusFileRecord) { - HFSPlusCatalogFile *srcRec = (HFSPlusCatalogFile *)srcPtr; - if ((char *)srcRec + sizeof(*srcRec) > nextRecord) { - if (direction == kSwapBTNodeHostToBig) { - panic("hfs_swap_HFSPlusBTInternalNode: catalog file record #%d too big\n", srcDesc->numRecords-i-1); - } else { - printf("hfs_swap_HFSPlusBTInternalNode: catalog file record #%d too 
big\n", srcDesc->numRecords-i-1); - } - return fsBTInvalidNodeErr; - } - - srcRec->flags = SWAP_BE16 (srcRec->flags); - - srcRec->fileID = SWAP_BE32 (srcRec->fileID); - - srcRec->createDate = SWAP_BE32 (srcRec->createDate); - srcRec->contentModDate = SWAP_BE32 (srcRec->contentModDate); - srcRec->attributeModDate = SWAP_BE32 (srcRec->attributeModDate); - srcRec->accessDate = SWAP_BE32 (srcRec->accessDate); - srcRec->backupDate = SWAP_BE32 (srcRec->backupDate); - - srcRec->bsdInfo.ownerID = SWAP_BE32 (srcRec->bsdInfo.ownerID); - srcRec->bsdInfo.groupID = SWAP_BE32 (srcRec->bsdInfo.groupID); - - /* Don't swap srcRec->bsdInfo.adminFlags; it's only one byte */ - /* Don't swap srcRec->bsdInfo.ownerFlags; it's only one byte */ - - srcRec->bsdInfo.fileMode = SWAP_BE16 (srcRec->bsdInfo.fileMode); - srcRec->bsdInfo.special.iNodeNum = SWAP_BE32 (srcRec->bsdInfo.special.iNodeNum); - - srcRec->textEncoding = SWAP_BE32 (srcRec->textEncoding); - - /* If kHFSHasLinkChainBit is set, reserved1 is hl_FirstLinkID. - * In all other context, it is expected to be zero. - */ - srcRec->reserved1 = SWAP_BE32 (srcRec->reserved1); - - /* Don't swap srcRec->userInfo */ - /* Don't swap srcRec->finderInfo */ - /* Don't swap srcRec->reserved2 */ - - hfs_swap_HFSPlusForkData (&srcRec->dataFork); - hfs_swap_HFSPlusForkData (&srcRec->resourceFork); - - } else if ((srcPtr[0] == kHFSPlusFolderThreadRecord) || - (srcPtr[0] == kHFSPlusFileThreadRecord)) { - - /* - * Make sure there is room for parentID and name length. 
- */ - HFSPlusCatalogThread *srcRec = (HFSPlusCatalogThread *)srcPtr; - if ((char *) &srcRec->nodeName.unicode[0] > nextRecord) { - if (direction == kSwapBTNodeHostToBig) { - panic("hfs_swap_HFSPlusBTInternalNode: catalog thread record #%d too big\n", srcDesc->numRecords-i-1); - } else { - printf("hfs_swap_HFSPlusBTInternalNode: catalog thread record #%d too big\n", srcDesc->numRecords-i-1); - } - return fsBTInvalidNodeErr; - } - - /* Don't swap srcRec->reserved */ - - srcRec->parentID = SWAP_BE32 (srcRec->parentID); - - if (direction == kSwapBTNodeBigToHost) - srcRec->nodeName.length = SWAP_BE16 (srcRec->nodeName.length); - - /* - * Make sure there is room for the name in the buffer. - * Then swap the characters of the name itself. - */ - if ((char *) &srcRec->nodeName.unicode[srcRec->nodeName.length] > nextRecord) { - if (direction == kSwapBTNodeHostToBig) { - panic("hfs_swap_HFSPlusBTInternalNode: catalog thread record #%d name too big\n", srcDesc->numRecords-i-1); - } else { - printf("hfs_swap_HFSPlusBTInternalNode: catalog thread record #%d name too big\n", srcDesc->numRecords-i-1); - } - return fsBTInvalidNodeErr; - } - for (j = 0; j < srcRec->nodeName.length; j++) { - srcRec->nodeName.unicode[j] = SWAP_BE16 (srcRec->nodeName.unicode[j]); - } - - if (direction == kSwapBTNodeHostToBig) - srcRec->nodeName.length = SWAP_BE16 (srcRec->nodeName.length); - - } else { - if (direction == kSwapBTNodeHostToBig) { - panic("hfs_swap_HFSPlusBTInternalNode: unrecognized catalog record type (0x%04X; record #%d)\n", srcPtr[0], srcDesc->numRecords-i-1); - } else { - printf("hfs_swap_HFSPlusBTInternalNode: unrecognized catalog record type (0x%04X; record #%d)\n", srcPtr[0], srcDesc->numRecords-i-1); - } - return fsBTInvalidNodeErr; - } - - /* We can swap the record type now that we're done using it. 
*/ - if (direction == kSwapBTNodeHostToBig) - srcPtr[0] = SWAP_BE16 (srcPtr[0]); - } - - } else if (fileID == kHFSAttributesFileID) { - HFSPlusAttrKey *srcKey; - HFSPlusAttrRecord *srcRec; - u_int16_t keyLength; - u_int32_t attrSize = 0; - - for (i = 0; i < srcDesc->numRecords; i++) { - /* Point to the start of the record we're currently checking. */ - srcKey = (HFSPlusAttrKey *)((char *)src->buffer + srcOffs[i]); - - /* - * Point to start of next (larger offset) record. We'll use this - * to be sure the current record doesn't overflow into the next - * record. - */ - nextRecord = (char *)src->buffer + srcOffs[i-1]; - - /* Make sure there is room in the buffer for a minimal key */ - if ((char *) &srcKey->attrName[1] > nextRecord) { - if (direction == kSwapBTNodeHostToBig) { - panic("hfs_swap_HFSPlusBTInternalNode: attr key #%d offset too big (0x%04X)\n", srcDesc->numRecords-i-1, srcOffs[i]); - } else { - printf("hfs_swap_HFSPlusBTInternalNode: attr key #%d offset too big (0x%04X)\n", srcDesc->numRecords-i-1, srcOffs[i]); - } - return fsBTInvalidNodeErr; - } - - /* Swap the key length field */ - if (direction == kSwapBTNodeBigToHost) - srcKey->keyLength = SWAP_BE16(srcKey->keyLength); - keyLength = srcKey->keyLength; /* Keep a copy in native order */ - if (direction == kSwapBTNodeHostToBig) - srcKey->keyLength = SWAP_BE16(srcKey->keyLength); - - /* - * Make sure that we can safely dereference the record's type field or - * an index node's child node number. 
- */ - srcRec = (HFSPlusAttrRecord *)((char *)srcKey + keyLength + sizeof(srcKey->keyLength)); - if ((char *)srcRec + sizeof(u_int32_t) > nextRecord) { - if (direction == kSwapBTNodeHostToBig) { - panic("hfs_swap_HFSPlusBTInternalNode: attr key #%d too big (%d)\n", srcDesc->numRecords-i-1, keyLength); - } else { - printf("hfs_swap_HFSPlusBTInternalNode: attr key #%d too big (%d)\n", srcDesc->numRecords-i-1, keyLength); - } - return fsBTInvalidNodeErr; - } - - srcKey->fileID = SWAP_BE32(srcKey->fileID); - srcKey->startBlock = SWAP_BE32(srcKey->startBlock); - - /* - * Swap and check the attribute name - */ - if (direction == kSwapBTNodeBigToHost) - srcKey->attrNameLen = SWAP_BE16(srcKey->attrNameLen); - /* Sanity check the attribute name length */ - if (srcKey->attrNameLen > kHFSMaxAttrNameLen || keyLength < (kHFSPlusAttrKeyMinimumLength + sizeof(u_int16_t)*srcKey->attrNameLen)) { - if (direction == kSwapBTNodeHostToBig) { - panic("hfs_swap_HFSPlusBTInternalNode: attr key #%d keyLength=%d attrNameLen=%d\n", srcDesc->numRecords-i-1, keyLength, srcKey->attrNameLen); - } else { - printf("hfs_swap_HFSPlusBTInternalNode: attr key #%d keyLength=%d attrNameLen=%d\n", srcDesc->numRecords-i-1, keyLength, srcKey->attrNameLen); - } - return fsBTInvalidNodeErr; - } - for (j = 0; j < srcKey->attrNameLen; j++) - srcKey->attrName[j] = SWAP_BE16(srcKey->attrName[j]); - if (direction == kSwapBTNodeHostToBig) - srcKey->attrNameLen = SWAP_BE16(srcKey->attrNameLen); - - /* - * For index nodes, the record data is just the child's node number. - * Skip over swapping the various types of attribute record. - */ - if (srcDesc->kind == kBTIndexNode) { - *((u_int32_t *)srcRec) = SWAP_BE32 (*((u_int32_t *)srcRec)); - continue; - } - - /* Swap the record data */ - if (direction == kSwapBTNodeBigToHost) - srcRec->recordType = SWAP_BE32(srcRec->recordType); - switch (srcRec->recordType) { - case kHFSPlusAttrInlineData: - /* Is there room for the inline data header? 
*/ - if ((char *) &srcRec->attrData.attrData[0] > nextRecord) { - if (direction == kSwapBTNodeHostToBig) { - panic("hfs_swap_HFSPlusBTInternalNode: attr inline #%d too big\n", srcDesc->numRecords-i-1); - } else { - printf("hfs_swap_HFSPlusBTInternalNode: attr inline #%d too big\n", srcDesc->numRecords-i-1); - } - return fsBTInvalidNodeErr; - } - - /* We're not swapping the reserved fields */ - - /* Swap the attribute size */ - if (direction == kSwapBTNodeHostToBig) - attrSize = srcRec->attrData.attrSize; - srcRec->attrData.attrSize = SWAP_BE32(srcRec->attrData.attrSize); - if (direction == kSwapBTNodeBigToHost) - attrSize = srcRec->attrData.attrSize; - - /* Is there room for the inline attribute data? */ - if ((char *) &srcRec->attrData.attrData[attrSize] > nextRecord) { - if (direction == kSwapBTNodeHostToBig) { - panic("hfs_swap_HFSPlusBTInternalNode: attr inline #%d too big (attrSize=%u)\n", srcDesc->numRecords-i-1, attrSize); - } else { - printf("hfs_swap_HFSPlusBTInternalNode: attr inline #%d too big (attrSize=%u)\n", srcDesc->numRecords-i-1, attrSize); - } - return fsBTInvalidNodeErr; - } - - /* Not swapping the attribute data itself */ - break; - - case kHFSPlusAttrForkData: - /* Is there room for the fork data record? */ - if ((char *)srcRec + sizeof(HFSPlusAttrForkData) > nextRecord) { - if (direction == kSwapBTNodeHostToBig) { - panic("hfs_swap_HFSPlusBTInternalNode: attr fork data #%d too big\n", srcDesc->numRecords-i-1); - } else { - printf("hfs_swap_HFSPlusBTInternalNode: attr fork data #%d too big\n", srcDesc->numRecords-i-1); - } - return fsBTInvalidNodeErr; - } - - /* We're not swapping the reserved field */ - - hfs_swap_HFSPlusForkData(&srcRec->forkData.theFork); - break; - - case kHFSPlusAttrExtents: - /* Is there room for an extent record? 
*/ - if ((char *)srcRec + sizeof(HFSPlusAttrExtents) > nextRecord) { - if (direction == kSwapBTNodeHostToBig) { - panic("hfs_swap_HFSPlusBTInternalNode: attr extents #%d too big\n", srcDesc->numRecords-i-1); - } else { - printf("hfs_swap_HFSPlusBTInternalNode: attr extents #%d too big\n", srcDesc->numRecords-i-1); - } - return fsBTInvalidNodeErr; - } - - /* We're not swapping the reserved field */ - - for (j = 0; j < kHFSPlusExtentDensity; j++) { - srcRec->overflowExtents.extents[j].startBlock = - SWAP_BE32(srcRec->overflowExtents.extents[j].startBlock); - srcRec->overflowExtents.extents[j].blockCount = - SWAP_BE32(srcRec->overflowExtents.extents[j].blockCount); - } - break; - } - if (direction == kSwapBTNodeHostToBig) - srcRec->recordType = SWAP_BE32(srcRec->recordType); - } - } else if (fileID > kHFSFirstUserCatalogNodeID) { - /* The only B-tree with a non-system CNID that we use is the hotfile B-tree */ - HotFileKey *srcKey; - u_int32_t *srcRec; - - for (i = 0; i < srcDesc->numRecords; i++) { - /* Point to the start of the record we're currently checking. */ - srcKey = (HotFileKey *)((char *)src->buffer + srcOffs[i]); - - /* - * Point to start of next (larger offset) record. We'll use this - * to be sure the current record doesn't overflow into the next - * record. 
- */ - nextRecord = (char *)src->buffer + srcOffs[i-1]; - - /* Make sure there is room for the key (HotFileKey) and data (u_int32_t) */ - if ((char *)srcKey + sizeof(HotFileKey) + sizeof(u_int32_t) > nextRecord) { - if (direction == kSwapBTNodeHostToBig) { - panic("hfs_swap_HFSPlusBTInternalNode: hotfile #%d offset too big (0x%04X)\n", srcDesc->numRecords-i-1, srcOffs[i]); - } else { - printf("hfs_swap_HFSPlusBTInternalNode: hotfile #%d offset too big (0x%04X)\n", srcDesc->numRecords-i-1, srcOffs[i]); - } - return fsBTInvalidNodeErr; - } - - /* Swap and sanity check the key length field */ - if (direction == kSwapBTNodeBigToHost) - srcKey->keyLength = SWAP_BE16 (srcKey->keyLength); - if (srcKey->keyLength != sizeof(*srcKey) - sizeof(srcKey->keyLength)) { - if (direction == kSwapBTNodeHostToBig) { - panic("hfs_swap_HFSPlusBTInternalNode: hotfile #%d incorrect keyLength %d\n", srcDesc->numRecords-i-1, srcKey->keyLength); - } else { - printf("hfs_swap_HFSPlusBTInternalNode: hotfile #%d incorrect keyLength %d\n", srcDesc->numRecords-i-1, srcKey->keyLength); - } - return fsBTInvalidNodeErr; - } - srcRec = (u_int32_t *)((char *)srcKey + srcKey->keyLength + sizeof(srcKey->keyLength)); - if (direction == kSwapBTNodeHostToBig) - srcKey->keyLength = SWAP_BE16 (srcKey->keyLength); - - /* Don't swap srcKey->forkType */ - /* Don't swap srcKey->pad */ - - srcKey->temperature = SWAP_BE32 (srcKey->temperature); - srcKey->fileID = SWAP_BE32 (srcKey->fileID); - - *((u_int32_t *)srcRec) = SWAP_BE32 (*((u_int32_t *)srcRec)); - } - } else { - panic ("hfs_swap_HFSPlusBTInternalNode: fileID %u is not a system B-tree\n", fileID); - } - - - return (0); -} - -#if CONFIG_HFS_STD -int -hfs_swap_HFSBTInternalNode ( - BlockDescriptor *src, - HFSCatalogNodeID fileID, - enum HFSBTSwapDirection direction -) -{ - BTNodeDescriptor *srcDesc = src->buffer; - u_int16_t *srcOffs = (u_int16_t *)((char *)src->buffer + (src->blockSize - (srcDesc->numRecords * sizeof (u_int16_t)))); - char *nextRecord; /* 
Points to start of record following current one */ - - /* - * i is an int32 because it needs to be negative to index the offset to free space. - * srcDesc->numRecords is a u_int16_t and is unlikely to become 32-bit so this should be ok. - */ - int32_t i; - u_int32_t j; - - if (fileID == kHFSExtentsFileID) { - HFSExtentKey *srcKey; - HFSExtentDescriptor *srcRec; - size_t recordSize; /* Size of the data part of the record, or node number for index nodes */ - - if (srcDesc->kind == kBTIndexNode) - recordSize = sizeof(u_int32_t); - else - recordSize = sizeof(HFSExtentDescriptor); - - for (i = 0; i < srcDesc->numRecords; i++) { - /* Point to the start of the record we're currently checking. */ - srcKey = (HFSExtentKey *)((char *)src->buffer + srcOffs[i]); - - /* - * Point to start of next (larger offset) record. We'll use this - * to be sure the current record doesn't overflow into the next - * record. - */ - nextRecord = (char *)src->buffer + srcOffs[i-1]; - - /* - * Make sure the key and data are within the buffer. Since both key - * and data are fixed size, this is relatively easy. Note that this - * relies on the keyLength being a constant; we verify the keyLength - * below. 
- */ - if ((char *)srcKey + sizeof(HFSExtentKey) + recordSize > nextRecord) { - if (direction == kSwapBTNodeHostToBig) { - panic("hfs_swap_HFSBTInternalNode: extents key #%d offset too big (0x%04X)\n", srcDesc->numRecords-i-1, srcOffs[i]); - } else { - printf("hfs_swap_HFSBTInternalNode: extents key #%d offset too big (0x%04X)\n", srcDesc->numRecords-i-1, srcOffs[i]); - } - return fsBTInvalidNodeErr; - } - - /* Don't swap srcKey->keyLength (it's only one byte), but do sanity check it */ - if (srcKey->keyLength != sizeof(*srcKey) - sizeof(srcKey->keyLength)) { - if (direction == kSwapBTNodeHostToBig) { - panic("hfs_swap_HFSBTInternalNode: extents key #%d invalid length (%d)\n", srcDesc->numRecords-i-1, srcKey->keyLength); - } else { - printf("hfs_swap_HFSBTInternalNode: extents key #%d invalid length (%d)\n", srcDesc->numRecords-i-1, srcKey->keyLength); - } - return fsBTInvalidNodeErr; - } - - /* Don't swap srcKey->forkType; it's only one byte */ - - srcKey->fileID = SWAP_BE32 (srcKey->fileID); - srcKey->startBlock = SWAP_BE16 (srcKey->startBlock); - - /* Point to record data (round up to even byte boundary) */ - srcRec = (HFSExtentDescriptor *)((char *)srcKey + ((srcKey->keyLength + 2) & ~1)); - - if (srcDesc->kind == kBTIndexNode) { - /* For index nodes, the record data is just a child node number. */ - *((u_int32_t *)srcRec) = SWAP_BE32 (*((u_int32_t *)srcRec)); - } else { - /* Swap the extent data */ - for (j = 0; j < kHFSExtentDensity; j++) { - srcRec[j].startBlock = SWAP_BE16 (srcRec[j].startBlock); - srcRec[j].blockCount = SWAP_BE16 (srcRec[j].blockCount); - } - } - } - - } else if (fileID == kHFSCatalogFileID) { - HFSCatalogKey *srcKey; - int16_t *srcPtr; - unsigned expectedKeyLength; - - for (i = 0; i < srcDesc->numRecords; i++) { - /* Point to the start of the record we're currently checking. */ - srcKey = (HFSCatalogKey *)((char *)src->buffer + srcOffs[i]); - - /* - * Point to start of next (larger offset) record. 
We'll use this - * to be sure the current record doesn't overflow into the next - * record. - */ - nextRecord = (char *)src->buffer + srcOffs[i-1]; - - /* - * Make sure we can safely dereference the keyLength and parentID fields. - * The value 8 below is 1 bytes for keyLength + 1 byte reserved + 4 bytes - * for parentID + 1 byte for nodeName's length + 1 byte to round up the - * record start to an even offset, which forms a minimal key. - */ - if ((char *)srcKey + 8 > nextRecord) { - if (direction == kSwapBTNodeHostToBig) { - panic("hfs_swap_HFSBTInternalNode: catalog key #%d offset too big (0x%04X)\n", srcDesc->numRecords-i-1, srcOffs[i]); - } else { - printf("hfs_swap_HFSBTInternalNode: catalog key #%d offset too big (0x%04X)\n", srcDesc->numRecords-i-1, srcOffs[i]); - } - return fsBTInvalidNodeErr; - } - - /* Don't swap srcKey->keyLength (it's only one byte), but do sanity check it */ - if (srcKey->keyLength < kHFSCatalogKeyMinimumLength || srcKey->keyLength > kHFSCatalogKeyMaximumLength) { - if (direction == kSwapBTNodeHostToBig) { - panic("hfs_swap_HFSBTInternalNode: catalog key #%d invalid length (%d)\n", srcDesc->numRecords-i-1, srcKey->keyLength); - } else { - printf("hfs_swap_HFSBTInternalNode: catalog key #%d invalid length (%d)\n", srcDesc->numRecords-i-1, srcKey->keyLength); - } - return fsBTInvalidNodeErr; - } - - /* Don't swap srcKey->reserved */ - - srcKey->parentID = SWAP_BE32 (srcKey->parentID); - - /* Don't swap srcKey->nodeName */ - - /* Make sure the keyLength is big enough for the key's content */ - if (srcDesc->kind == kBTIndexNode) - expectedKeyLength = sizeof(*srcKey) - sizeof(srcKey->keyLength); - else - expectedKeyLength = srcKey->nodeName[0] + kHFSCatalogKeyMinimumLength; - if (srcKey->keyLength < expectedKeyLength) { - if (direction == kSwapBTNodeHostToBig) { - panic("hfs_swap_HFSBTInternalNode: catalog record #%d keyLength=%u expected=%u\n", - srcDesc->numRecords-i, srcKey->keyLength, expectedKeyLength); - } else { - 
printf("hfs_swap_HFSBTInternalNode: catalog record #%d keyLength=%u expected=%u\n", - srcDesc->numRecords-i, srcKey->keyLength, expectedKeyLength); - } - return fsBTInvalidNodeErr; - } - - /* Point to record data (round up to even byte boundary) */ - srcPtr = (int16_t *)((char *)srcKey + ((srcKey->keyLength + 2) & ~1)); - - /* - * Make sure that we can safely dereference the record's type field or - * and index node's child node number. - */ - if ((char *)srcPtr + sizeof(u_int32_t) > nextRecord) { - if (direction == kSwapBTNodeHostToBig) { - panic("hfs_swap_HFSBTInternalNode: catalog key #%d too big\n", srcDesc->numRecords-i-1); - } else { - printf("hfs_swap_HFSBTInternalNode: catalog key #%d too big\n", srcDesc->numRecords-i-1); - } - return fsBTInvalidNodeErr; - } - - /* - * For index nodes, the record data is just the child's node number. - * Skip over swapping the various types of catalog record. - */ - if (srcDesc->kind == kBTIndexNode) { - *((u_int32_t *)srcPtr) = SWAP_BE32 (*((u_int32_t *)srcPtr)); - continue; - } - - /* Make sure the recordType is in native order before using it. 
*/ - if (direction == kSwapBTNodeBigToHost) - srcPtr[0] = SWAP_BE16 (srcPtr[0]); - - if (srcPtr[0] == kHFSFolderRecord) { - HFSCatalogFolder *srcRec = (HFSCatalogFolder *)srcPtr; - if ((char *)srcRec + sizeof(*srcRec) > nextRecord) { - if (direction == kSwapBTNodeHostToBig) { - panic("hfs_swap_HFSBTInternalNode: catalog folder record #%d too big\n", srcDesc->numRecords-i-1); - } else { - printf("hfs_swap_HFSBTInternalNode: catalog folder record #%d too big\n", srcDesc->numRecords-i-1); - } - return fsBTInvalidNodeErr; - } - - srcRec->flags = SWAP_BE16 (srcRec->flags); - srcRec->valence = SWAP_BE16 (srcRec->valence); - - srcRec->folderID = SWAP_BE32 (srcRec->folderID); - srcRec->createDate = SWAP_BE32 (srcRec->createDate); - srcRec->modifyDate = SWAP_BE32 (srcRec->modifyDate); - srcRec->backupDate = SWAP_BE32 (srcRec->backupDate); - - /* Don't swap srcRec->userInfo */ - /* Don't swap srcRec->finderInfo */ - /* Don't swap resserved array */ - - } else if (srcPtr[0] == kHFSFileRecord) { - HFSCatalogFile *srcRec = (HFSCatalogFile *)srcPtr; - if ((char *)srcRec + sizeof(*srcRec) > nextRecord) { - if (direction == kSwapBTNodeHostToBig) { - panic("hfs_swap_HFSBTInternalNode: catalog file record #%d too big\n", srcDesc->numRecords-i-1); - } else { - printf("hfs_swap_HFSBTInternalNode: catalog file record #%d too big\n", srcDesc->numRecords-i-1); - } - return fsBTInvalidNodeErr; - } - - srcRec->flags = srcRec->flags; - srcRec->fileType = srcRec->fileType; - - /* Don't swap srcRec->userInfo */ - - srcRec->fileID = SWAP_BE32 (srcRec->fileID); - - srcRec->dataStartBlock = SWAP_BE16 (srcRec->dataStartBlock); - srcRec->dataLogicalSize = SWAP_BE32 (srcRec->dataLogicalSize); - srcRec->dataPhysicalSize = SWAP_BE32 (srcRec->dataPhysicalSize); - - srcRec->rsrcStartBlock = SWAP_BE16 (srcRec->rsrcStartBlock); - srcRec->rsrcLogicalSize = SWAP_BE32 (srcRec->rsrcLogicalSize); - srcRec->rsrcPhysicalSize = SWAP_BE32 (srcRec->rsrcPhysicalSize); - - srcRec->createDate = SWAP_BE32 
(srcRec->createDate); - srcRec->modifyDate = SWAP_BE32 (srcRec->modifyDate); - srcRec->backupDate = SWAP_BE32 (srcRec->backupDate); - - /* Don't swap srcRec->finderInfo */ - - srcRec->clumpSize = SWAP_BE16 (srcRec->clumpSize); - - /* Swap the two sets of extents as an array of six (three each) u_int16_t */ - for (j = 0; j < kHFSExtentDensity * 2; j++) { - srcRec->dataExtents[j].startBlock = SWAP_BE16 (srcRec->dataExtents[j].startBlock); - srcRec->dataExtents[j].blockCount = SWAP_BE16 (srcRec->dataExtents[j].blockCount); - } - - /* Don't swap srcRec->reserved */ - - } else if ((srcPtr[0] == kHFSFolderThreadRecord) || - (srcPtr[0] == kHFSFileThreadRecord)) { - HFSCatalogThread *srcRec = (HFSCatalogThread *)srcPtr; - - /* Make sure there is room for parentID and name length */ - if ((char *) &srcRec->nodeName[1] > nextRecord) { - if (direction == kSwapBTNodeHostToBig) { - panic("hfs_swap_HFSBTInternalNode: catalog thread record #%d too big\n", srcDesc->numRecords-i-1); - } else { - printf("hfs_swap_HFSBTInternalNode: catalog thread record #%d too big\n", srcDesc->numRecords-i-1); - } - return fsBTInvalidNodeErr; - } - - /* Don't swap srcRec->reserved array */ - - srcRec->parentID = SWAP_BE32 (srcRec->parentID); - - /* Don't swap srcRec->nodeName */ - - /* Make sure there is room for the name in the buffer */ - if ((char *) &srcRec->nodeName[srcRec->nodeName[0]] > nextRecord) { - if (direction == kSwapBTNodeHostToBig) { - panic("hfs_swap_HFSBTInternalNode: catalog thread record #%d name too big\n", srcDesc->numRecords-i-1); - } else { - printf("hfs_swap_HFSBTInternalNode: catalog thread record #%d name too big\n", srcDesc->numRecords-i-1); - } - return fsBTInvalidNodeErr; - } - } else { - if (direction == kSwapBTNodeHostToBig) { - panic("hfs_swap_HFSBTInternalNode: unrecognized catalog record type (0x%04X; record #%d)\n", srcPtr[0], srcDesc->numRecords-i-1); - } else { - printf("hfs_swap_HFSBTInternalNode: unrecognized catalog record type (0x%04X; record #%d)\n", 
srcPtr[0], srcDesc->numRecords-i-1); - } - return fsBTInvalidNodeErr; - } - - /* We can swap the record type now that we're done using it */ - if (direction == kSwapBTNodeHostToBig) - srcPtr[0] = SWAP_BE16 (srcPtr[0]); - } - - } else { - panic ("hfs_swap_HFSBTInternalNode: fileID %u is not a system B-tree\n", fileID); - } - - return (0); -} -#endif - diff --git a/bsd/hfs/hfs_endian.h b/bsd/hfs/hfs_endian.h deleted file mode 100644 index c1c46f7aa..000000000 --- a/bsd/hfs/hfs_endian.h +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Copyright (c) 2000, 2002-2003, 2005-2008 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -#ifndef __HFS_ENDIAN_H__ -#define __HFS_ENDIAN_H__ - -#include - -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE -/* - * hfs_endian.h - * - * This file prototypes endian swapping routines for the HFS/HFS Plus - * volume format. - */ -#include "hfs.h" -#include "hfscommon/headers/BTreesInternal.h" -#include - -/*********************/ -/* BIG ENDIAN Macros */ -/*********************/ -#define SWAP_BE16(__a) OSSwapBigToHostInt16 (__a) -#define SWAP_BE32(__a) OSSwapBigToHostInt32 (__a) -#define SWAP_BE64(__a) OSSwapBigToHostInt64 (__a) - -#if BYTE_ORDER == BIG_ENDIAN - - /* HFS is always big endian, no swapping needed */ - #define SWAP_HFS_PLUS_FORK_DATA(__a) - -/************************/ -/* LITTLE ENDIAN Macros */ -/************************/ -#elif BYTE_ORDER == LITTLE_ENDIAN - - #define SWAP_HFS_PLUS_FORK_DATA(__a) hfs_swap_HFSPlusForkData ((__a)) - -#else -#warning Unknown byte order -#error -#endif - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * Constants for the "unswap" argument to hfs_swap_BTNode: - */ -enum HFSBTSwapDirection { - kSwapBTNodeBigToHost = 0, - kSwapBTNodeHostToBig = 1, - - /* - * kSwapBTNodeHeaderRecordOnly is used to swap just the header record - * of a header node from big endian (on disk) to host endian (in memory). - * It does not swap the node descriptor (forward/backward links, record - * count, etc.). It assumes the header record is at offset 0x000E. - * - * Since HFS Plus doesn't have fixed B-tree node sizes, we have to read - * the header record to determine the actual node size for that tree - * before we can set up the B-tree control block. We read it initially - * as 512 bytes, then re-read it once we know the correct node size. Since - * we may not have read the entire header node the first time, we can't - * swap the record offsets, other records, or do most sanity checks. 
- */ - kSwapBTNodeHeaderRecordOnly = 3 -}; - -int hfs_swap_BTNode (BlockDescriptor *src, vnode_t vp, enum HFSBTSwapDirection direction, - u_int8_t allow_empty_node); - -#ifdef __cplusplus -} -#endif - -#endif /* __APPLE_API_PRIVATE */ -#endif /* KERNEL */ -#endif /* __HFS_FORMAT__ */ diff --git a/bsd/hfs/hfs_extents.c b/bsd/hfs/hfs_extents.c deleted file mode 100644 index 509de326d..000000000 --- a/bsd/hfs/hfs_extents.c +++ /dev/null @@ -1,770 +0,0 @@ -/* - * Copyright (c) 2014 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#if HFS_EXTENTS_TEST - -#include "hfs_extents_test.h" -#include "hfs_extents.h" - -#else - -#include "hfs_extents.h" - -// In this file, group refers to a set of 8 extents - -static uint32_t hfs_total_blocks(const HFSPlusExtentDescriptor *ext, int count); -static errno_t hfs_ext_iter_next_group(struct hfs_ext_iter *iter); -static errno_t hfs_ext_iter_update(struct hfs_ext_iter *iter, - HFSPlusExtentDescriptor *extents, - int count, - HFSPlusExtentRecord cat_extents); -static errno_t hfs_ext_iter_check_group(hfs_ext_iter_t *iter); - -#endif - -#define CHECK(x, var, goto_label) \ - do { \ - var = (x); \ - if (var) { \ - printf("%s:%u error: %d\n", __func__, __LINE__, var); \ - goto goto_label; \ - } \ - } while (0) - -#define min(a,b) \ - ({ typeof (a) _a = (a); typeof (b) _b = (b); _a < _b ? _a : _b; }) - -static __attribute__((pure)) -const HFSPlusExtentKey *hfs_ext_iter_key(const hfs_ext_iter_t *iter) -{ - return (const HFSPlusExtentKey *)&iter->bt_iter.key; -} - -static __attribute__((pure)) -HFSPlusExtentKey *hfs_ext_iter_key_mut(hfs_ext_iter_t *iter) -{ - return (HFSPlusExtentKey *)&iter->bt_iter.key; -} - -// Returns the total number of blocks for the @count extents provided -uint32_t hfs_total_blocks(const HFSPlusExtentDescriptor *extents, int count) -{ - uint32_t block_count = 0; - for (int i = 0; i < count; ++i) - block_count += extents[i].blockCount; - return block_count; -} - -/* - * Checks a group of extents: makes sure that if it's the last group - * for a fork, that all the remaining extents are properly zeroed and - * if it's not then checks that all extents are set. This also sets - * @group_block_count and @last_in_fork. Returns ESTALE if - * inconsistent. 
- */ -errno_t hfs_ext_iter_check_group(hfs_ext_iter_t *iter) -{ - filefork_t *ff = VTOF(iter->vp); - const HFSPlusExtentKey *key = hfs_ext_iter_key(iter); - uint32_t count = 0; - int i; - - for (i = 0; i < kHFSPlusExtentDensity; ++i) { - if (!iter->group[i].blockCount) - break; - count += iter->group[i].blockCount; - } - - if (i < kHFSPlusExtentDensity) { - iter->last_in_fork = true; - if (key->startBlock + count != ff_allocblocks(ff)) - goto bad; - - // Check remainder of extents - for (++i; i < kHFSPlusExtentDensity; ++i) { - if (iter->group[i].blockCount) - goto bad; - } - } else { - if (key->startBlock + count > ff_allocblocks(ff)) - goto bad; - - iter->last_in_fork = (key->startBlock + count == ff_allocblocks(ff)); - } - - iter->group_block_count = count; - - return 0; - -bad: - -#if DEBUG - printf("hfs_ext_iter_check_group: bad group; start: %u, total blocks: %u\n", - key->startBlock, ff_allocblocks(ff)); - - for (int j = 0; j < kHFSPlusExtentDensity; ++j) { - printf("%s<%u, %u>", j ? ", " : "", - iter->group[j].startBlock, iter->group[j].blockCount); - } - - printf("\n"); -#endif - - return ESTALE; -} - -// NOTE: doesn't copy group data -static void hfs_ext_iter_copy(const hfs_ext_iter_t *src, hfs_ext_iter_t *dst) -{ - dst->vp = src->vp; - memcpy(&dst->bt_iter.key, &src->bt_iter.key, sizeof(HFSPlusExtentKey)); - - dst->file_block = src->file_block; - dst->ndx = src->ndx; - - dst->bt_iter.hint = src->bt_iter.hint; - dst->bt_iter.version = 0; - dst->bt_iter.reserved = 0; - dst->bt_iter.hitCount = 0; - dst->bt_iter.maxLeafRecs = 0; -} - -bool hfs_ext_iter_is_catalog_extents(hfs_ext_iter_t *iter) -{ - return hfs_ext_iter_key(iter)->startBlock == 0; -} - -#if !HFS_EXTENTS_TEST - -/* - * Finds the extent for offset. It might be in the catalog or the extents - * file. 
- */ -errno_t hfs_ext_find(vnode_t vp, off_t offset, hfs_ext_iter_t *iter) -{ - errno_t ret; - hfsmount_t *hfsmp = VTOHFS(vp); - - iter->vp = vp; - - uint32_t end_block, index; - HFSPlusExtentKey *key = hfs_ext_iter_key_mut(iter); - - filefork_t *ff = VTOF(vp); - - CHECK(SearchExtentFile(hfsmp, ff, offset, - key, iter->group, &index, - &iter->bt_iter.hint.nodeNum, &end_block), ret, exit); - - iter->ndx = index; - iter->file_block = end_block - iter->group[index].blockCount; - - if (!key->keyLength) { - // We're pointing at the catalog record extents so fix up the key - key->keyLength = kHFSPlusExtentKeyMaximumLength; - key->forkType = (VNODE_IS_RSRC(iter->vp) - ? kHFSResourceForkType : kHFSDataForkType); - key->pad = 0; - key->fileID = VTOC(iter->vp)->c_fileid; - key->startBlock = 0; - } - - CHECK(hfs_ext_iter_check_group(iter), ret, exit); - - ret = 0; - -exit: - - return MacToVFSError(ret); -} - -static uint32_t hfs_ext_iter_next_group_block(const hfs_ext_iter_t *iter) -{ - const HFSPlusExtentKey *key = hfs_ext_iter_key(iter); - - return key->startBlock + iter->group_block_count; -} - -/* - * Move the iterator to the next group. Don't call if there's a chance - * there is no entry; the caller should check last_in_fork instead. 
- */ -static errno_t hfs_ext_iter_next_group(hfs_ext_iter_t *iter) -{ - errno_t ret; - hfsmount_t *hfsmp = VTOHFS(iter->vp); - filefork_t * const tree = hfsmp->hfs_extents_cp->c_datafork; - HFSPlusExtentKey *key = hfs_ext_iter_key_mut(iter); - const bool catalog_extents = hfs_ext_iter_is_catalog_extents(iter); - const uint32_t next_block = hfs_ext_iter_next_group_block(iter); - - FSBufferDescriptor fbd = { - .bufferAddress = &iter->group, - .itemCount = 1, - .itemSize = sizeof(iter->group) - }; - - if (catalog_extents) { - key->startBlock = next_block; - - CHECK(BTSearchRecord(tree, &iter->bt_iter, &fbd, NULL, - &iter->bt_iter), ret, exit); - } else { - const uint32_t file_id = key->fileID; - const uint8_t fork_type = key->forkType; - - CHECK(BTIterateRecord(tree, kBTreeNextRecord, &iter->bt_iter, - &fbd, NULL), ret, exit); - - if (key->fileID != file_id - || key->forkType != fork_type - || key->startBlock != next_block) { - // This indicates an inconsistency - ret = ESTALE; - goto exit; - } - } - - iter->file_block = key->startBlock; - iter->ndx = 0; - - CHECK(hfs_ext_iter_check_group(iter), ret, exit); - - ret = 0; - -exit: - - return MacToVFSError(ret); -} - -/* - * Updates with the extents provided and sets the key up for the next group. - * It is assumed that any previous record that might collide has been deleted. - * NOTE: @extents must point to a buffer that can be zero padded to multiple - * of 8 extents. 
- */ -errno_t hfs_ext_iter_update(hfs_ext_iter_t *iter, - HFSPlusExtentDescriptor *extents, - int count, - HFSPlusExtentRecord cat_extents) -{ - errno_t ret; - hfsmount_t *hfsmp = VTOHFS(iter->vp); - cnode_t *cp = VTOC(iter->vp); - HFSPlusExtentKey *key = hfs_ext_iter_key_mut(iter); - int ndx = 0; - - if (!extents) - extents = iter->group; - - if (count % kHFSPlusExtentDensity) { - // Zero out last group - bzero(&extents[count], (kHFSPlusExtentDensity - - (count % 8)) * sizeof(*extents)); - } - - if (hfs_ext_iter_is_catalog_extents(iter)) { - // Caller is responsible for in-memory updates - - if (cat_extents) - hfs_ext_copy_rec(extents, cat_extents); - - struct cat_fork fork; - - hfs_fork_copy(&fork, &VTOF(iter->vp)->ff_data, extents); - hfs_prepare_fork_for_update(VTOF(iter->vp), &fork, &fork, hfsmp->blockSize); - - bool is_rsrc = VNODE_IS_RSRC(iter->vp); - CHECK(cat_update(hfsmp, &cp->c_desc, &cp->c_attr, - is_rsrc ? NULL : &fork, - is_rsrc ? &fork : NULL), ret, exit); - - // Set the key to the next group - key->startBlock = hfs_total_blocks(extents, kHFSPlusExtentDensity); - - ndx += 8; - } - - // Deal with the remainder which must be overflow extents - for (; ndx < count; ndx += 8) { - filefork_t * const tree = hfsmp->hfs_extents_cp->c_datafork; - - FSBufferDescriptor fbd = { - .bufferAddress = &extents[ndx], - .itemCount = 1, - .itemSize = sizeof(HFSPlusExtentRecord) - }; - - CHECK(BTInsertRecord(tree, &iter->bt_iter, &fbd, - sizeof(HFSPlusExtentRecord)), ret, exit); - - // Set the key to the next group - key->startBlock += hfs_total_blocks(&extents[ndx], kHFSPlusExtentDensity); - } - - ret = 0; - -exit: - - return ret; -} - -#endif // !HFS_EXTENTS_TEST - -static void push_ext(HFSPlusExtentDescriptor *extents, int *count, - const HFSPlusExtentDescriptor *ext) -{ - if (!ext->blockCount) - return; - - if (*count && hfs_ext_end(&extents[*count - 1]) == ext->startBlock) - extents[*count - 1].blockCount += ext->blockCount; - else - extents[(*count)++] = *ext; -} - 
-/* - * NOTE: Here we rely on the replacement extents not being too big as - * otherwise the number of BTree records that we have to delete could be - * too large. - */ -errno_t hfs_ext_replace(hfsmount_t *hfsmp, vnode_t vp, - uint32_t file_block, - const HFSPlusExtentDescriptor *repl, - int repl_count, - HFSPlusExtentRecord catalog_extents) -{ - errno_t ret; - filefork_t * const tree = hfsmp->hfs_extents_cp->c_datafork; - hfs_ext_iter_t *iter_in = NULL, *iter_out; - HFSPlusExtentDescriptor *extents = NULL; - HFSPlusExtentDescriptor *roll_back_extents = NULL; - int roll_back_count = 0; - const uint32_t end_file_block = file_block + hfs_total_blocks(repl, repl_count); - filefork_t *ff = VTOF(vp); - - // Indicate we haven't touched catalog extents - catalog_extents[0].blockCount = 0; - - if (end_file_block > ff_allocblocks(ff)) { - ret = EINVAL; - goto exit; - } - - MALLOC(iter_in, hfs_ext_iter_t *, sizeof(*iter_in) * 2, M_TEMP, M_WAITOK); - iter_out = iter_in + 1; - HFSPlusExtentKey *key_in = hfs_ext_iter_key_mut(iter_in); - - // Get to where we want to start - off_t offset = hfs_blk_to_bytes(file_block, hfsmp->blockSize); - - /* - * If the replacement is at the start of a group, we want to pull in the - * group before so that we tidy up any padding that we might have done - * in a prior hfs_ext_replace call. 
- */ - if (offset > 0) - --offset; - - CHECK(hfs_ext_find(vp, offset, iter_in), ret, exit); - - const uint32_t start_group_block = key_in->startBlock; - - const int max_roll_back_extents = 128 * 1024 / sizeof(HFSPlusExtentDescriptor); - MALLOC(roll_back_extents, HFSPlusExtentDescriptor *, 128 * 1024, M_TEMP, M_WAITOK); - - // Move to the first extent in this group - iter_in->ndx = 0; - - hfs_ext_iter_copy(iter_in, iter_out); - - // Create a buffer for our extents - const int buffered_extents = roundup(3 * kHFSPlusExtentDensity + repl_count, - kHFSPlusExtentDensity); - MALLOC(extents, HFSPlusExtentDescriptor *, - sizeof(*extents) * buffered_extents, M_TEMP, M_WAITOK); - int count = 0; - - /* - * Iterate through the extents that are affected by this replace operation. - * We cannot push more than 16 + repl_count extents here; 8 for the group - * containing the replacement start, repl_count for the replacements and 8 - * for the group containing the end. If we went back a group due to - * decrementing the offset above, it's still the same because we know in - * that case the replacement starts at the beginning of the next group. - */ - uint32_t block = start_group_block; - for (;;) { - if (!iter_in->ndx) { - hfs_ext_copy_rec(iter_in->group, &roll_back_extents[roll_back_count]); - roll_back_count += kHFSPlusExtentDensity; - - if (!hfs_ext_iter_is_catalog_extents(iter_in)) { - // Delete this extent group; we're going to replace it - CHECK(BTDeleteRecord(tree, &iter_in->bt_iter), ret, exit); - } - } - - HFSPlusExtentDescriptor *ext = &iter_in->group[iter_in->ndx]; - if (!ext->blockCount) { - /* - * We ran out of existing extents so we just write the - * extents and we're done. - */ - goto finish; - } - - // If the current extent does not overlap replacement... 
- if (block + ext->blockCount <= file_block || block >= end_file_block) { - // Keep the current extent exactly as it is - push_ext(extents, &count, ext); - } else { - HFSPlusExtentDescriptor dealloc_ext = *ext; - - if (block <= file_block) { - /* - * The middle or tail of the current extent overlaps - * the replacement extents. Keep the non-overlapping - * head of the current extent. - */ - uint32_t trimmed_len = file_block - block; - - if (trimmed_len) { - // Push (keep) non-overlapping head of current extent - push_ext(extents, &count, - &(HFSPlusExtentDescriptor){ ext->startBlock, - trimmed_len }); - - /* - * Deallocate the part of the current extent that - * overlaps the replacement extents. That starts - * at @file_block. For now, assume it goes - * through the end of the current extent. (If the - * current extent extends beyond the end of the - * replacement extents, we'll update the - * blockCount below.) - */ - dealloc_ext.startBlock += trimmed_len; - dealloc_ext.blockCount -= trimmed_len; - } - - // Insert the replacements - for (int i = 0; i < repl_count; ++i) - push_ext(extents, &count, &repl[i]); - } - - if (block + ext->blockCount > end_file_block) { - /* - * The head or middle of the current extent overlaps - * the replacement extents. Keep the non-overlapping - * tail of the current extent. - */ - uint32_t overlap = end_file_block - block; - - // Push (keep) non-overlapping tail of current extent - push_ext(extents, &count, - &(HFSPlusExtentDescriptor){ ext->startBlock + overlap, - ext->blockCount - overlap }); - - /* - * Deallocate the part of current extent that overlaps - * the replacements. 
- */ - dealloc_ext.blockCount = (ext->startBlock + overlap - - dealloc_ext.startBlock); - } - - CHECK(BlockDeallocate(hfsmp, dealloc_ext.startBlock, - dealloc_ext.blockCount, 0), ret, exit); - } - - // Move to next (existing) extent from iterator - block += ext->blockCount; - - if (++iter_in->ndx >= kHFSPlusExtentDensity) { - if (block >= end_file_block) { - if (iter_in->last_in_fork || !(count % kHFSPlusExtentDensity)) { - /* - * This is the easy case. We've hit the end or we have a - * multiple of 8, so we can just write out the extents we - * have and it should all fit within a transaction. - */ - - goto finish; - } - - if (count + kHFSPlusExtentDensity > buffered_extents - || (roll_back_count - + kHFSPlusExtentDensity > max_roll_back_extents)) { - /* - * We've run out of room for the next group, so drop out - * and take a different strategy. - */ - break; - } - } - - CHECK(hfs_ext_iter_next_group(iter_in), ret, exit); - } - } // for (;;) - - /* - * We're not at the end so we need to try and pad to a multiple of 8 - * so that we don't have to touch all the subsequent records. We pad - * by stealing single blocks. - */ - - int stop_at = 0; - - for (;;) { - // @in points to the record we're stealing from - int in = count - 1; - - count = roundup(count, kHFSPlusExtentDensity); - - // @out is where we put the stolen single blocks - int out = count - 1; - - do { - if (out <= in) { - // We suceeded in padding; we're done - goto finish; - } - - /* - * "Steal" a block, or move a one-block extent within the - * @extents array. - * - * If the extent we're "stealing" from (@in) is only one - * block long, we'll end up copying it to @out, setting - * @in's blockCount to zero, and decrementing @in. So, we - * either split a multi-block extent; or move it within - * the @extents array. 
- */ - extents[out].blockCount = 1; - extents[out].startBlock = (extents[in].startBlock - + extents[in].blockCount - 1); - --out; - } while (--extents[in].blockCount || --in >= stop_at); - - // We ran out of extents - if (roll_back_count + kHFSPlusExtentDensity > max_roll_back_extents) { - ret = ENOSPC; - goto exit; - } - - // Need to shift extents starting at out + 1 - ++out; - memmove(&extents[stop_at], &extents[out], - (count - out) * sizeof(*extents)); - count -= out - stop_at; - - // Pull in the next group - CHECK(hfs_ext_iter_next_group(iter_in), ret, exit); - - // Take a copy of these extents for roll back purposes - hfs_ext_copy_rec(iter_in->group, &roll_back_extents[roll_back_count]); - roll_back_count += kHFSPlusExtentDensity; - - // Delete this group; we're going to replace it - CHECK(BTDeleteRecord(tree, &iter_in->bt_iter), ret, exit); - - if (iter_in->last_in_fork) { - // Great! We've hit the end. Coalesce and write out. - int old_count = count; - count = 0; - - /* - * First coalesce the extents we already have. Takes - * advantage of push_ext coalescing the input extent with - * the last extent in @extents. If the extents are not - * contiguous, then this just copies the extents over - * themselves and sets @count back to @old_count. 
- */ - for (int i = 0; i < old_count; ++i) - push_ext(extents, &count, &extents[i]); - - // Make room if necessary - const int flush_count = buffered_extents - kHFSPlusExtentDensity; - if (count > flush_count) { - CHECK(hfs_ext_iter_update(iter_out, extents, - flush_count, catalog_extents), ret, exit); - - memmove(&extents[0], &extents[flush_count], - (count - flush_count) * sizeof(*extents)); - - count -= flush_count; - } - - // Add in the extents we just read in - for (int i = 0; i < kHFSPlusExtentDensity; ++i) { - HFSPlusExtentDescriptor *ext = &iter_in->group[i]; - if (!ext->blockCount) - break; - push_ext(extents, &count, ext); - } - - goto finish; - } // if (iter_in->last_in_fork) - - /* - * Otherwise, we're not at the end, so we add these extents and then - * try and pad out again to a multiple of 8. We start by making room. - */ - if (count > buffered_extents - kHFSPlusExtentDensity) { - // Only write out one group here - CHECK(hfs_ext_iter_update(iter_out, extents, - kHFSPlusExtentDensity, - catalog_extents), ret, exit); - - memmove(&extents[0], &extents[kHFSPlusExtentDensity], - (count - kHFSPlusExtentDensity) * sizeof(*extents)); - - count -= kHFSPlusExtentDensity; - } - - // Record where to stop when padding above - stop_at = count; - - // Copy in the new extents - hfs_ext_copy_rec(iter_in->group, &extents[count]); - count += kHFSPlusExtentDensity; - } // for (;;) - -finish: - - // Write the remaining extents - CHECK(hfs_ext_iter_update(iter_out, extents, count, - catalog_extents), ret, exit); - - CHECK(BTFlushPath(hfsmp->hfs_catalog_cp->c_datafork), ret, exit); - CHECK(BTFlushPath(hfsmp->hfs_extents_cp->c_datafork), ret, exit); - -exit: - - if (ret && roll_back_count) { - -#define RB_FAILED \ - do { \ - printf("hfs_ext_replace:%u: roll back failed\n", __LINE__); \ - hfs_mark_inconsistent(hfsmp, HFS_ROLLBACK_FAILED); \ - goto roll_back_failed; \ - } while (0) - - // First delete any groups we inserted - HFSPlusExtentKey *key_out = 
hfs_ext_iter_key_mut(iter_out); - - key_in->startBlock = start_group_block; - if (!key_in->startBlock && key_out->startBlock > key_in->startBlock) { - key_in->startBlock += hfs_total_blocks(catalog_extents, - kHFSPlusExtentDensity); - } - - if (key_out->startBlock > key_in->startBlock) { - FSBufferDescriptor fbd = { - .bufferAddress = &iter_in->group, - .itemCount = 1, - .itemSize = sizeof(iter_in->group) - }; - - if (BTSearchRecord(tree, &iter_in->bt_iter, &fbd, NULL, - &iter_in->bt_iter)) { - RB_FAILED; - } - - for (;;) { - if (BTDeleteRecord(tree, &iter_in->bt_iter)) - RB_FAILED; - - key_in->startBlock += hfs_total_blocks(iter_in->group, - kHFSPlusExtentDensity); - - if (key_in->startBlock >= key_out->startBlock) - break; - - if (BTSearchRecord(tree, &iter_in->bt_iter, &fbd, NULL, - &iter_in->bt_iter)) { - RB_FAILED; - } - } - } - - // Position iter_out - key_out->startBlock = start_group_block; - - // Roll back all the extents - if (hfs_ext_iter_update(iter_out, roll_back_extents, roll_back_count, - catalog_extents)) { - RB_FAILED; - } - - // And we need to reallocate the blocks we deallocated - const uint32_t end_block = min(block, end_file_block); - block = start_group_block; - for (int i = 0; i < roll_back_count && block < end_block; ++i) { - HFSPlusExtentDescriptor *ext = &roll_back_extents[i]; - - if (block + ext->blockCount <= file_block) - continue; - - HFSPlusExtentDescriptor alloc_ext = *ext; - - if (block <= file_block) { - uint32_t trimmed_len = file_block - block; - - alloc_ext.startBlock += trimmed_len; - alloc_ext.blockCount -= trimmed_len; - } - - if (block + ext->blockCount > end_file_block) { - uint32_t overlap = end_file_block - block; - - alloc_ext.blockCount = (ext->startBlock + overlap - - alloc_ext.startBlock); - } - - if (hfs_block_alloc(hfsmp, &alloc_ext, HFS_ALLOC_ROLL_BACK, NULL)) - RB_FAILED; - - block += ext->blockCount; - } - - if (BTFlushPath(hfsmp->hfs_catalog_cp->c_datafork) - || BTFlushPath(hfsmp->hfs_extents_cp->c_datafork)) { 
- RB_FAILED; - } - } // if (ret && roll_back_count) - -roll_back_failed: - - FREE(iter_in, M_TEMP); - FREE(extents, M_TEMP); - FREE(roll_back_extents, M_TEMP); - - return MacToVFSError(ret); -} diff --git a/bsd/hfs/hfs_extents.h b/bsd/hfs/hfs_extents.h deleted file mode 100644 index 9dd6073dd..000000000 --- a/bsd/hfs/hfs_extents.h +++ /dev/null @@ -1,54 +0,0 @@ -// -// hfs_extents.h -// hfs -// -// Created by csuter on 7/11/14. -// Copyright (c) 2014 Apple. All rights reserved. -// - -#ifndef HFS_EXTENTS_H_ -#define HFS_EXTENTS_H_ - -#include -#include - -#include "hfs_format.h" - -#if !HFS_EXTENTS_TEST && !HFS_ALLOC_TEST -#include "hfs_cnode.h" -#include "hfs.h" -#include "hfscommon/headers/BTreesInternal.h" -#endif - -typedef struct hfs_ext_iter { - struct vnode *vp; // If NULL, this is an xattr extent - BTreeIterator bt_iter; - uint8_t ndx; // Index in group - bool last_in_fork; - uint32_t file_block; - uint32_t group_block_count; - HFSPlusExtentRecord group; -} hfs_ext_iter_t; - -errno_t hfs_ext_find(vnode_t vp, off_t offset, hfs_ext_iter_t *iter); - -errno_t hfs_ext_replace(hfsmount_t *hfsmp, vnode_t vp, - uint32_t file_block, - const HFSPlusExtentDescriptor *repl, - int count, - HFSPlusExtentRecord catalog_extents); - -bool hfs_ext_iter_is_catalog_extents(hfs_ext_iter_t *iter); - -static inline void hfs_ext_copy_rec(const HFSPlusExtentRecord src, - HFSPlusExtentRecord dst) -{ - memcpy(dst, src, sizeof(HFSPlusExtentRecord)); -} - -static inline uint32_t hfs_ext_end(const HFSPlusExtentDescriptor *ext) -{ - return ext->startBlock + ext->blockCount; -} - -#endif // HFS_EXTENTS_H_ diff --git a/bsd/hfs/hfs_format.h b/bsd/hfs/hfs_format.h deleted file mode 100644 index dcc180724..000000000 --- a/bsd/hfs/hfs_format.h +++ /dev/null @@ -1,821 +0,0 @@ -/* - * Copyright (c) 2000-2013 Apple Inc. All rights reserved. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -#ifndef __HFS_FORMAT__ -#define __HFS_FORMAT__ - -#include -#include -#include - -/* - * hfs_format.h - * - * This file describes the on-disk format for HFS and HFS Plus volumes. - * The HFS Plus volume format is desciibed in detail in Apple Technote 1150. 
- * - * http://developer.apple.com/technotes/tn/tn1150.html - * - * Note: Starting 10.9, definition of struct HFSUniStr255 exists in hfs_unitstr.h - * - */ - -#ifdef __cplusplus -extern "C" { -#endif - -/* some on-disk hfs structures have 68K alignment (misaligned) */ - -/* Signatures used to differentiate between HFS and HFS Plus volumes */ -enum { - kHFSSigWord = 0x4244, /* 'BD' in ASCII */ - kHFSPlusSigWord = 0x482B, /* 'H+' in ASCII */ - kHFSXSigWord = 0x4858, /* 'HX' in ASCII */ - - kHFSPlusVersion = 0x0004, /* 'H+' volumes are version 4 only */ - kHFSXVersion = 0x0005, /* 'HX' volumes start with version 5 */ - - kHFSPlusMountVersion = 0x31302E30, /* '10.0' for Mac OS X */ - kHFSJMountVersion = 0x4846534a, /* 'HFSJ' for journaled HFS+ on OS X */ - kFSKMountVersion = 0x46534b21 /* 'FSK!' for failed journal replay */ -}; - - -#ifdef __APPLE_API_PRIVATE -/* - * Mac OS X has two special directories on HFS+ volumes for hardlinked files - * and hardlinked directories as well as for open-unlinked files. - * - * These directories and their contents are not exported from the filesystem - * under Mac OS X. - */ -#define HFSPLUSMETADATAFOLDER "\xE2\x90\x80\xE2\x90\x80\xE2\x90\x80\xE2\x90\x80HFS+ Private Data" -#define HFSPLUS_DIR_METADATA_FOLDER ".HFS+ Private Directory Data\xd" - -/* - * Files in the "HFS+ Private Data" folder have one of the following prefixes - * followed by a decimal number (no leading zeros) for the file ID. - * - * Note: Earlier version of Mac OS X used a 32 bit random number for the link - * ref number instead of the file id. - * - * e.g. iNode7182000 and temp3296 - */ -#define HFS_INODE_PREFIX "iNode" -#define HFS_DELETE_PREFIX "temp" - -/* - * Files in the ".HFS+ Private Directory Data" folder have the following - * prefix followed by a decimal number (no leading zeros) for the file ID. - * - * e.g. 
dir_555 - */ -#define HFS_DIRINODE_PREFIX "dir_" - -/* - * Hardlink inodes save the head of the link chain in - * an extended attribute named FIRST_LINK_XATTR_NAME. - * The attribute data is the decimal value in ASCII - * of the cnid for the first link in the chain. - * - * This extended attribute is private (i.e. its not - * exported in the getxattr/listxattr POSIX APIs). - */ -#define FIRST_LINK_XATTR_NAME "com.apple.system.hfs.firstlink" -#define FIRST_LINK_XATTR_REC_SIZE (sizeof(HFSPlusAttrData) - 2 + 12) - -/* - * The name space ID for generating an HFS volume UUID - * - * B3E20F39-F292-11D6-97A4-00306543ECAC - */ -#define HFS_UUID_NAMESPACE_ID "\xB3\xE2\x0F\x39\xF2\x92\x11\xD6\x97\xA4\x00\x30\x65\x43\xEC\xAC" - -#endif /* __APPLE_API_PRIVATE */ - -/* - * Indirect link files (hard links) have the following type/creator. - */ -enum { - kHardLinkFileType = 0x686C6E6B, /* 'hlnk' */ - kHFSPlusCreator = 0x6866732B /* 'hfs+' */ -}; - - -/* - * File type and creator for symbolic links - */ -enum { - kSymLinkFileType = 0x736C6E6B, /* 'slnk' */ - kSymLinkCreator = 0x72686170 /* 'rhap' */ -}; - - -enum { - kHFSMaxVolumeNameChars = 27, - kHFSMaxFileNameChars = 31, - kHFSPlusMaxFileNameChars = 255 -}; - - -/* Extent overflow file data structures */ - -/* HFS Extent key */ -struct HFSExtentKey { - u_int8_t keyLength; /* length of key, excluding this field */ - u_int8_t forkType; /* 0 = data fork, FF = resource fork */ - u_int32_t fileID; /* file ID */ - u_int16_t startBlock; /* first file allocation block number in this extent */ -} __attribute__((aligned(2), packed)); -typedef struct HFSExtentKey HFSExtentKey; - -/* HFS Plus Extent key */ -struct HFSPlusExtentKey { - u_int16_t keyLength; /* length of key, excluding this field */ - u_int8_t forkType; /* 0 = data fork, FF = resource fork */ - u_int8_t pad; /* make the other fields align on 32-bit boundary */ - u_int32_t fileID; /* file ID */ - u_int32_t startBlock; /* first file allocation block number in this extent */ -} 
__attribute__((aligned(2), packed)); -typedef struct HFSPlusExtentKey HFSPlusExtentKey; - -/* Number of extent descriptors per extent record */ -enum { - kHFSExtentDensity = 3, - kHFSPlusExtentDensity = 8 -}; - -/* HFS extent descriptor */ -struct HFSExtentDescriptor { - u_int16_t startBlock; /* first allocation block */ - u_int16_t blockCount; /* number of allocation blocks */ -} __attribute__((aligned(2), packed)); -typedef struct HFSExtentDescriptor HFSExtentDescriptor; - -/* HFS Plus extent descriptor */ -struct HFSPlusExtentDescriptor { - u_int32_t startBlock; /* first allocation block */ - u_int32_t blockCount; /* number of allocation blocks */ -} __attribute__((aligned(2), packed)); -typedef struct HFSPlusExtentDescriptor HFSPlusExtentDescriptor; - -/* HFS extent record */ -typedef HFSExtentDescriptor HFSExtentRecord[3]; - -/* HFS Plus extent record */ -typedef HFSPlusExtentDescriptor HFSPlusExtentRecord[8]; - - -/* Finder information */ -struct FndrFileInfo { - u_int32_t fdType; /* file type */ - u_int32_t fdCreator; /* file creator */ - u_int16_t fdFlags; /* Finder flags */ - struct { - int16_t v; /* file's location */ - int16_t h; - } fdLocation; - int16_t opaque; -} __attribute__((aligned(2), packed)); -typedef struct FndrFileInfo FndrFileInfo; - -struct FndrDirInfo { - struct { /* folder's window rectangle */ - int16_t top; - int16_t left; - int16_t bottom; - int16_t right; - } frRect; - unsigned short frFlags; /* Finder flags */ - struct { - u_int16_t v; /* folder's location */ - u_int16_t h; - } frLocation; - int16_t opaque; -} __attribute__((aligned(2), packed)); -typedef struct FndrDirInfo FndrDirInfo; - -struct FndrOpaqueInfo { - int8_t opaque[16]; -} __attribute__((aligned(2), packed)); -typedef struct FndrOpaqueInfo FndrOpaqueInfo; - -struct FndrExtendedDirInfo { - u_int32_t document_id; - u_int32_t date_added; - u_int16_t extended_flags; - u_int16_t reserved3; - u_int32_t write_gen_counter; -} __attribute__((aligned(2), packed)); - -struct 
FndrExtendedFileInfo { - u_int32_t document_id; - u_int32_t date_added; - u_int16_t extended_flags; - u_int16_t reserved2; - u_int32_t write_gen_counter; -} __attribute__((aligned(2), packed)); - -/* HFS Plus Fork data info - 80 bytes */ -struct HFSPlusForkData { - u_int64_t logicalSize; /* fork's logical size in bytes */ - u_int32_t clumpSize; /* fork's clump size in bytes */ - u_int32_t totalBlocks; /* total blocks used by this fork */ - HFSPlusExtentRecord extents; /* initial set of extents */ -} __attribute__((aligned(2), packed)); -typedef struct HFSPlusForkData HFSPlusForkData; - - -/* Mac OS X has 16 bytes worth of "BSD" info. - * - * Note: Mac OS 9 implementations and applications - * should preserve, but not change, this information. - */ -struct HFSPlusBSDInfo { - u_int32_t ownerID; /* user-id of owner or hard link chain previous link */ - u_int32_t groupID; /* group-id of owner or hard link chain next link */ - u_int8_t adminFlags; /* super-user changeable flags */ - u_int8_t ownerFlags; /* owner changeable flags */ - u_int16_t fileMode; /* file type and permission bits */ - union { - u_int32_t iNodeNum; /* indirect node number (hard links only) */ - u_int32_t linkCount; /* links that refer to this indirect node */ - u_int32_t rawDevice; /* special file device (FBLK and FCHR only) */ - } special; -} __attribute__((aligned(2), packed)); -typedef struct HFSPlusBSDInfo HFSPlusBSDInfo; - -/* - * Hardlink "links" resolve to an inode - * and the actual uid/gid comes from that - * inode. - * - * We repurpose the links's uid/gid fields - * for the hardlink link chain. The chain - * consists of a doubly linked list of file - * ids. 
- */ - -#define hl_firstLinkID reserved1 /* Valid only if HasLinkChain flag is set (indirect nodes only) */ - -#define hl_prevLinkID bsdInfo.ownerID /* Valid only if HasLinkChain flag is set */ -#define hl_nextLinkID bsdInfo.groupID /* Valid only if HasLinkChain flag is set */ - -#define hl_linkReference bsdInfo.special.iNodeNum -#define hl_linkCount bsdInfo.special.linkCount - - -/* Catalog file data structures */ - -enum { - kHFSRootParentID = 1, /* Parent ID of the root folder */ - kHFSRootFolderID = 2, /* Folder ID of the root folder */ - kHFSExtentsFileID = 3, /* File ID of the extents file */ - kHFSCatalogFileID = 4, /* File ID of the catalog file */ - kHFSBadBlockFileID = 5, /* File ID of the bad allocation block file */ - kHFSAllocationFileID = 6, /* File ID of the allocation file (HFS Plus only) */ - kHFSStartupFileID = 7, /* File ID of the startup file (HFS Plus only) */ - kHFSAttributesFileID = 8, /* File ID of the attribute file (HFS Plus only) */ - kHFSAttributeDataFileID = 13, /* Used in Mac OS X runtime for extent based attributes */ - /* kHFSAttributeDataFileID is never stored on disk. 
*/ - kHFSRepairCatalogFileID = 14, /* Used when rebuilding Catalog B-tree */ - kHFSBogusExtentFileID = 15, /* Used for exchanging extents in extents file */ - kHFSFirstUserCatalogNodeID = 16 -}; - -/* HFS catalog key */ -struct HFSCatalogKey { - u_int8_t keyLength; /* key length (in bytes) */ - u_int8_t reserved; /* reserved (set to zero) */ - u_int32_t parentID; /* parent folder ID */ - u_int8_t nodeName[kHFSMaxFileNameChars + 1]; /* catalog node name */ -} __attribute__((aligned(2), packed)); -typedef struct HFSCatalogKey HFSCatalogKey; - -/* HFS Plus catalog key */ -struct HFSPlusCatalogKey { - u_int16_t keyLength; /* key length (in bytes) */ - u_int32_t parentID; /* parent folder ID */ - HFSUniStr255 nodeName; /* catalog node name */ -} __attribute__((aligned(2), packed)); -typedef struct HFSPlusCatalogKey HFSPlusCatalogKey; - -/* Catalog record types */ -enum { - /* HFS Catalog Records */ - kHFSFolderRecord = 0x0100, /* Folder record */ - kHFSFileRecord = 0x0200, /* File record */ - kHFSFolderThreadRecord = 0x0300, /* Folder thread record */ - kHFSFileThreadRecord = 0x0400, /* File thread record */ - - /* HFS Plus Catalog Records */ - kHFSPlusFolderRecord = 1, /* Folder record */ - kHFSPlusFileRecord = 2, /* File record */ - kHFSPlusFolderThreadRecord = 3, /* Folder thread record */ - kHFSPlusFileThreadRecord = 4 /* File thread record */ -}; - - -/* Catalog file record flags */ -enum { - kHFSFileLockedBit = 0x0000, /* file is locked and cannot be written to */ - kHFSFileLockedMask = 0x0001, - - kHFSThreadExistsBit = 0x0001, /* a file thread record exists for this file */ - kHFSThreadExistsMask = 0x0002, - - kHFSHasAttributesBit = 0x0002, /* object has extended attributes */ - kHFSHasAttributesMask = 0x0004, - - kHFSHasSecurityBit = 0x0003, /* object has security data (ACLs) */ - kHFSHasSecurityMask = 0x0008, - - kHFSHasFolderCountBit = 0x0004, /* only for HFSX, folder maintains a separate sub-folder count */ - kHFSHasFolderCountMask = 0x0010, /* (sum of folder 
records and directory hard links) */ - - kHFSHasLinkChainBit = 0x0005, /* has hardlink chain (inode or link) */ - kHFSHasLinkChainMask = 0x0020, - - kHFSHasChildLinkBit = 0x0006, /* folder has a child that's a dir link */ - kHFSHasChildLinkMask = 0x0040, - - kHFSHasDateAddedBit = 0x0007, /* File/Folder has the date-added stored in the finder info. */ - kHFSHasDateAddedMask = 0x0080, - - kHFSFastDevPinnedBit = 0x0008, /* this file has been pinned to the fast-device by the hot-file code on cooperative fusion */ - kHFSFastDevPinnedMask = 0x0100, - - kHFSDoNotFastDevPinBit = 0x0009, /* this file can not be pinned to the fast-device */ - kHFSDoNotFastDevPinMask = 0x0200, - - kHFSFastDevCandidateBit = 0x000a, /* this item is a potential candidate for fast-dev pinning (as are any of its descendents */ - kHFSFastDevCandidateMask = 0x0400, - - kHFSAutoCandidateBit = 0x000b, /* this item was automatically marked as a fast-dev candidate by the kernel */ - kHFSAutoCandidateMask = 0x0800 - - // There are only 4 flag bits remaining: 0x1000, 0x2000, 0x4000, 0x8000 - -}; - - -/* HFS catalog folder record - 70 bytes */ -struct HFSCatalogFolder { - int16_t recordType; /* == kHFSFolderRecord */ - u_int16_t flags; /* folder flags */ - u_int16_t valence; /* folder valence */ - u_int32_t folderID; /* folder ID */ - u_int32_t createDate; /* date and time of creation */ - u_int32_t modifyDate; /* date and time of last modification */ - u_int32_t backupDate; /* date and time of last backup */ - FndrDirInfo userInfo; /* Finder information */ - FndrOpaqueInfo finderInfo; /* additional Finder information */ - u_int32_t reserved[4]; /* reserved - initialized as zero */ -} __attribute__((aligned(2), packed)); -typedef struct HFSCatalogFolder HFSCatalogFolder; - -/* HFS Plus catalog folder record - 88 bytes */ -struct HFSPlusCatalogFolder { - int16_t recordType; /* == kHFSPlusFolderRecord */ - u_int16_t flags; /* file flags */ - u_int32_t valence; /* folder's item count */ - u_int32_t folderID; 
/* folder ID */ - u_int32_t createDate; /* date and time of creation */ - u_int32_t contentModDate; /* date and time of last content modification */ - u_int32_t attributeModDate; /* date and time of last attribute modification */ - u_int32_t accessDate; /* date and time of last access (MacOS X only) */ - u_int32_t backupDate; /* date and time of last backup */ - HFSPlusBSDInfo bsdInfo; /* permissions (for MacOS X) */ - FndrDirInfo userInfo; /* Finder information */ - FndrOpaqueInfo finderInfo; /* additional Finder information */ - u_int32_t textEncoding; /* hint for name conversions */ - u_int32_t folderCount; /* number of enclosed folders, active when HasFolderCount is set */ -} __attribute__((aligned(2), packed)); -typedef struct HFSPlusCatalogFolder HFSPlusCatalogFolder; - -/* HFS catalog file record - 102 bytes */ -struct HFSCatalogFile { - int16_t recordType; /* == kHFSFileRecord */ - u_int8_t flags; /* file flags */ - int8_t fileType; /* file type (unused ?) */ - FndrFileInfo userInfo; /* Finder information */ - u_int32_t fileID; /* file ID */ - u_int16_t dataStartBlock; /* not used - set to zero */ - int32_t dataLogicalSize; /* logical EOF of data fork */ - int32_t dataPhysicalSize; /* physical EOF of data fork */ - u_int16_t rsrcStartBlock; /* not used - set to zero */ - int32_t rsrcLogicalSize; /* logical EOF of resource fork */ - int32_t rsrcPhysicalSize; /* physical EOF of resource fork */ - u_int32_t createDate; /* date and time of creation */ - u_int32_t modifyDate; /* date and time of last modification */ - u_int32_t backupDate; /* date and time of last backup */ - FndrOpaqueInfo finderInfo; /* additional Finder information */ - u_int16_t clumpSize; /* file clump size (not used) */ - HFSExtentRecord dataExtents; /* first data fork extent record */ - HFSExtentRecord rsrcExtents; /* first resource fork extent record */ - u_int32_t reserved; /* reserved - initialized as zero */ -} __attribute__((aligned(2), packed)); -typedef struct HFSCatalogFile 
HFSCatalogFile; - -/* HFS Plus catalog file record - 248 bytes */ -struct HFSPlusCatalogFile { - int16_t recordType; /* == kHFSPlusFileRecord */ - u_int16_t flags; /* file flags */ - u_int32_t reserved1; /* reserved - initialized as zero */ - u_int32_t fileID; /* file ID */ - u_int32_t createDate; /* date and time of creation */ - u_int32_t contentModDate; /* date and time of last content modification */ - u_int32_t attributeModDate; /* date and time of last attribute modification */ - u_int32_t accessDate; /* date and time of last access (MacOS X only) */ - u_int32_t backupDate; /* date and time of last backup */ - HFSPlusBSDInfo bsdInfo; /* permissions (for MacOS X) */ - FndrFileInfo userInfo; /* Finder information */ - FndrOpaqueInfo finderInfo; /* additional Finder information */ - u_int32_t textEncoding; /* hint for name conversions */ - u_int32_t reserved2; /* reserved - initialized as zero */ - - /* Note: these start on double long (64 bit) boundary */ - HFSPlusForkData dataFork; /* size and block data for data fork */ - HFSPlusForkData resourceFork; /* size and block data for resource fork */ -} __attribute__((aligned(2), packed)); -typedef struct HFSPlusCatalogFile HFSPlusCatalogFile; - -/* HFS catalog thread record - 46 bytes */ -struct HFSCatalogThread { - int16_t recordType; /* == kHFSFolderThreadRecord or kHFSFileThreadRecord */ - int32_t reserved[2]; /* reserved - initialized as zero */ - u_int32_t parentID; /* parent ID for this catalog node */ - u_int8_t nodeName[kHFSMaxFileNameChars + 1]; /* name of this catalog node */ -} __attribute__((aligned(2), packed)); -typedef struct HFSCatalogThread HFSCatalogThread; - -/* HFS Plus catalog thread record -- 264 bytes */ -struct HFSPlusCatalogThread { - int16_t recordType; /* == kHFSPlusFolderThreadRecord or kHFSPlusFileThreadRecord */ - int16_t reserved; /* reserved - initialized as zero */ - u_int32_t parentID; /* parent ID for this catalog node */ - HFSUniStr255 nodeName; /* name of this catalog node 
(variable length) */ -} __attribute__((aligned(2), packed)); -typedef struct HFSPlusCatalogThread HFSPlusCatalogThread; - -#ifdef __APPLE_API_UNSTABLE -/* - * These are the types of records in the attribute B-tree. The values were - * chosen so that they wouldn't conflict with the catalog record types. - */ -enum { - kHFSPlusAttrInlineData = 0x10, /* attributes whose data fits in a b-tree node */ - kHFSPlusAttrForkData = 0x20, /* extent based attributes (data lives in extents) */ - kHFSPlusAttrExtents = 0x30 /* overflow extents for large attributes */ -}; - - -/* - * HFSPlusAttrForkData - * For larger attributes, whose value is stored in allocation blocks. - * If the attribute has more than 8 extents, there will be additional - * records (of type HFSPlusAttrExtents) for this attribute. - */ -struct HFSPlusAttrForkData { - u_int32_t recordType; /* == kHFSPlusAttrForkData*/ - u_int32_t reserved; - HFSPlusForkData theFork; /* size and first extents of value*/ -} __attribute__((aligned(2), packed)); -typedef struct HFSPlusAttrForkData HFSPlusAttrForkData; - -/* - * HFSPlusAttrExtents - * This record contains information about overflow extents for large, - * fragmented attributes. - */ -struct HFSPlusAttrExtents { - u_int32_t recordType; /* == kHFSPlusAttrExtents*/ - u_int32_t reserved; - HFSPlusExtentRecord extents; /* additional extents*/ -} __attribute__((aligned(2), packed)); -typedef struct HFSPlusAttrExtents HFSPlusAttrExtents; - -/* - * Atrributes B-tree Data Record - * - * For small attributes, whose entire value is stored - * within a single B-tree record. 
- */ -struct HFSPlusAttrData { - u_int32_t recordType; /* == kHFSPlusAttrInlineData */ - u_int32_t reserved[2]; - u_int32_t attrSize; /* size of attribute data in bytes */ - u_int8_t attrData[2]; /* variable length */ -} __attribute__((aligned(2), packed)); -typedef struct HFSPlusAttrData HFSPlusAttrData; - - -/* HFSPlusAttrInlineData is obsolete use HFSPlusAttrData instead */ -struct HFSPlusAttrInlineData { - u_int32_t recordType; - u_int32_t reserved; - u_int32_t logicalSize; - u_int8_t userData[2]; -} __attribute__((aligned(2), packed)); -typedef struct HFSPlusAttrInlineData HFSPlusAttrInlineData; - - -/* A generic Attribute Record */ -union HFSPlusAttrRecord { - u_int32_t recordType; - HFSPlusAttrInlineData inlineData; /* NOT USED */ - HFSPlusAttrData attrData; - HFSPlusAttrForkData forkData; - HFSPlusAttrExtents overflowExtents; -}; -typedef union HFSPlusAttrRecord HFSPlusAttrRecord; - -/* Attribute key */ -enum { kHFSMaxAttrNameLen = 127 }; -struct HFSPlusAttrKey { - u_int16_t keyLength; /* key length (in bytes) */ - u_int16_t pad; /* set to zero */ - u_int32_t fileID; /* file associated with attribute */ - u_int32_t startBlock; /* first allocation block number for extents */ - u_int16_t attrNameLen; /* number of unicode characters */ - u_int16_t attrName[kHFSMaxAttrNameLen]; /* attribute name (Unicode) */ -} __attribute__((aligned(2), packed)); -typedef struct HFSPlusAttrKey HFSPlusAttrKey; - -#define kHFSPlusAttrKeyMaximumLength (sizeof(HFSPlusAttrKey) - sizeof(u_int16_t)) -#define kHFSPlusAttrKeyMinimumLength (kHFSPlusAttrKeyMaximumLength - kHFSMaxAttrNameLen*sizeof(u_int16_t)) - -#endif /* __APPLE_API_UNSTABLE */ - - -/* Key and node lengths */ -enum { - kHFSPlusExtentKeyMaximumLength = sizeof(HFSPlusExtentKey) - sizeof(u_int16_t), - kHFSExtentKeyMaximumLength = sizeof(HFSExtentKey) - sizeof(u_int8_t), - kHFSPlusCatalogKeyMaximumLength = sizeof(HFSPlusCatalogKey) - sizeof(u_int16_t), - kHFSPlusCatalogKeyMinimumLength = kHFSPlusCatalogKeyMaximumLength - 
sizeof(HFSUniStr255) + sizeof(u_int16_t), - kHFSCatalogKeyMaximumLength = sizeof(HFSCatalogKey) - sizeof(u_int8_t), - kHFSCatalogKeyMinimumLength = kHFSCatalogKeyMaximumLength - (kHFSMaxFileNameChars + 1) + sizeof(u_int8_t), - kHFSPlusCatalogMinNodeSize = 4096, - kHFSPlusExtentMinNodeSize = 512, - kHFSPlusAttrMinNodeSize = 4096 -}; - -/* HFS and HFS Plus volume attribute bits */ -enum { - /* Bits 0-6 are reserved (always cleared by MountVol call) */ - kHFSVolumeHardwareLockBit = 7, /* volume is locked by hardware */ - kHFSVolumeUnmountedBit = 8, /* volume was successfully unmounted */ - kHFSVolumeSparedBlocksBit = 9, /* volume has bad blocks spared */ - kHFSVolumeNoCacheRequiredBit = 10, /* don't cache volume blocks (i.e. RAM or ROM disk) */ - kHFSBootVolumeInconsistentBit = 11, /* boot volume is inconsistent (System 7.6 and later) */ - kHFSCatalogNodeIDsReusedBit = 12, - kHFSVolumeJournaledBit = 13, /* this volume has a journal on it */ - kHFSVolumeInconsistentBit = 14, /* serious inconsistencies detected at runtime */ - kHFSVolumeSoftwareLockBit = 15, /* volume is locked by software */ - /* - * HFS only has 16 bits of attributes in the MDB, but HFS Plus has 32 bits. - * Therefore, bits 16-31 can only be used on HFS Plus. - */ - kHFSUnusedNodeFixBit = 31, /* Unused nodes in the Catalog B-tree have been zero-filled. See Radar #6947811. */ - kHFSContentProtectionBit = 30, /* Volume has per-file content protection */ - - /*** Keep these in sync with the bits above ! 
****/ - kHFSVolumeHardwareLockMask = 0x00000080, - kHFSVolumeUnmountedMask = 0x00000100, - kHFSVolumeSparedBlocksMask = 0x00000200, - kHFSVolumeNoCacheRequiredMask = 0x00000400, - kHFSBootVolumeInconsistentMask = 0x00000800, - kHFSCatalogNodeIDsReusedMask = 0x00001000, - kHFSVolumeJournaledMask = 0x00002000, - kHFSVolumeInconsistentMask = 0x00004000, - kHFSVolumeSoftwareLockMask = 0x00008000, - - /* Bits 16-31 are allocated from high to low */ - - kHFSContentProtectionMask = 0x40000000, - kHFSUnusedNodeFixMask = 0x80000000, - - kHFSMDBAttributesMask = 0x8380 -}; - -enum { - kHFSUnusedNodesFixDate = 0xc5ef2480 /* March 25, 2009 */ -}; - -/* HFS Master Directory Block - 162 bytes */ -/* Stored at sector #2 (3rd sector) and second-to-last sector. */ -struct HFSMasterDirectoryBlock { - u_int16_t drSigWord; /* == kHFSSigWord */ - u_int32_t drCrDate; /* date and time of volume creation */ - u_int32_t drLsMod; /* date and time of last modification */ - u_int16_t drAtrb; /* volume attributes */ - u_int16_t drNmFls; /* number of files in root folder */ - u_int16_t drVBMSt; /* first block of volume bitmap */ - u_int16_t drAllocPtr; /* start of next allocation search */ - u_int16_t drNmAlBlks; /* number of allocation blocks in volume */ - u_int32_t drAlBlkSiz; /* size (in bytes) of allocation blocks */ - u_int32_t drClpSiz; /* default clump size */ - u_int16_t drAlBlSt; /* first allocation block in volume */ - u_int32_t drNxtCNID; /* next unused catalog node ID */ - u_int16_t drFreeBks; /* number of unused allocation blocks */ - u_int8_t drVN[kHFSMaxVolumeNameChars + 1]; /* volume name */ - u_int32_t drVolBkUp; /* date and time of last backup */ - u_int16_t drVSeqNum; /* volume backup sequence number */ - u_int32_t drWrCnt; /* volume write count */ - u_int32_t drXTClpSiz; /* clump size for extents overflow file */ - u_int32_t drCTClpSiz; /* clump size for catalog file */ - u_int16_t drNmRtDirs; /* number of directories in root folder */ - u_int32_t drFilCnt; /* number of 
files in volume */ - u_int32_t drDirCnt; /* number of directories in volume */ - u_int32_t drFndrInfo[8]; /* information used by the Finder */ - u_int16_t drEmbedSigWord; /* embedded volume signature (formerly drVCSize) */ - HFSExtentDescriptor drEmbedExtent; /* embedded volume location and size (formerly drVBMCSize and drCtlCSize) */ - u_int32_t drXTFlSize; /* size of extents overflow file */ - HFSExtentRecord drXTExtRec; /* extent record for extents overflow file */ - u_int32_t drCTFlSize; /* size of catalog file */ - HFSExtentRecord drCTExtRec; /* extent record for catalog file */ -} __attribute__((aligned(2), packed)); -typedef struct HFSMasterDirectoryBlock HFSMasterDirectoryBlock; - - -#ifdef __APPLE_API_UNSTABLE -#define SET_HFS_TEXT_ENCODING(hint) \ - (0x656e6300 | ((hint) & 0xff)) -#define GET_HFS_TEXT_ENCODING(hint) \ - (((hint) & 0xffffff00) == 0x656e6300 ? (hint) & 0x000000ff : 0xffffffffU) -#endif /* __APPLE_API_UNSTABLE */ - - -/* HFS Plus Volume Header - 512 bytes */ -/* Stored at sector #2 (3rd sector) and second-to-last sector. 
*/ -struct HFSPlusVolumeHeader { - u_int16_t signature; /* == kHFSPlusSigWord */ - u_int16_t version; /* == kHFSPlusVersion */ - u_int32_t attributes; /* volume attributes */ - u_int32_t lastMountedVersion; /* implementation version which last mounted volume */ - u_int32_t journalInfoBlock; /* block addr of journal info (if volume is journaled, zero otherwise) */ - - u_int32_t createDate; /* date and time of volume creation */ - u_int32_t modifyDate; /* date and time of last modification */ - u_int32_t backupDate; /* date and time of last backup */ - u_int32_t checkedDate; /* date and time of last disk check */ - - u_int32_t fileCount; /* number of files in volume */ - u_int32_t folderCount; /* number of directories in volume */ - - u_int32_t blockSize; /* size (in bytes) of allocation blocks */ - u_int32_t totalBlocks; /* number of allocation blocks in volume (includes this header and VBM*/ - u_int32_t freeBlocks; /* number of unused allocation blocks */ - - u_int32_t nextAllocation; /* start of next allocation search */ - u_int32_t rsrcClumpSize; /* default resource fork clump size */ - u_int32_t dataClumpSize; /* default data fork clump size */ - u_int32_t nextCatalogID; /* next unused catalog node ID */ - - u_int32_t writeCount; /* volume write count */ - u_int64_t encodingsBitmap; /* which encodings have been use on this volume */ - - u_int8_t finderInfo[32]; /* information used by the Finder */ - - HFSPlusForkData allocationFile; /* allocation bitmap file */ - HFSPlusForkData extentsFile; /* extents B-tree file */ - HFSPlusForkData catalogFile; /* catalog B-tree file */ - HFSPlusForkData attributesFile; /* extended attributes B-tree file */ - HFSPlusForkData startupFile; /* boot file (secondary loader) */ -} __attribute__((aligned(2), packed)); -typedef struct HFSPlusVolumeHeader HFSPlusVolumeHeader; - - -/* B-tree structures */ - -enum BTreeKeyLimits{ - kMaxKeyLength = 520 -}; - -union BTreeKey{ - u_int8_t length8; - u_int16_t length16; - u_int8_t rawData 
[kMaxKeyLength+2]; -}; -typedef union BTreeKey BTreeKey; - -/* BTNodeDescriptor -- Every B-tree node starts with these fields. */ -struct BTNodeDescriptor { - u_int32_t fLink; /* next node at this level*/ - u_int32_t bLink; /* previous node at this level*/ - int8_t kind; /* kind of node (leaf, index, header, map)*/ - u_int8_t height; /* zero for header, map; child is one more than parent*/ - u_int16_t numRecords; /* number of records in this node*/ - u_int16_t reserved; /* reserved - initialized as zero */ -} __attribute__((aligned(2), packed)); -typedef struct BTNodeDescriptor BTNodeDescriptor; - -/* Constants for BTNodeDescriptor kind */ -enum { - kBTLeafNode = -1, - kBTIndexNode = 0, - kBTHeaderNode = 1, - kBTMapNode = 2 -}; - -/* BTHeaderRec -- The first record of a B-tree header node */ -struct BTHeaderRec { - u_int16_t treeDepth; /* maximum height (usually leaf nodes) */ - u_int32_t rootNode; /* node number of root node */ - u_int32_t leafRecords; /* number of leaf records in all leaf nodes */ - u_int32_t firstLeafNode; /* node number of first leaf node */ - u_int32_t lastLeafNode; /* node number of last leaf node */ - u_int16_t nodeSize; /* size of a node, in bytes */ - u_int16_t maxKeyLength; /* reserved */ - u_int32_t totalNodes; /* total number of nodes in tree */ - u_int32_t freeNodes; /* number of unused (free) nodes in tree */ - u_int16_t reserved1; /* unused */ - u_int32_t clumpSize; /* reserved */ - u_int8_t btreeType; /* reserved */ - u_int8_t keyCompareType; /* Key string Comparison Type */ - u_int32_t attributes; /* persistent attributes about the tree */ - u_int32_t reserved3[16]; /* reserved */ -} __attribute__((aligned(2), packed)); -typedef struct BTHeaderRec BTHeaderRec; - -/* Constants for BTHeaderRec attributes */ -enum { - kBTBadCloseMask = 0x00000001, /* reserved */ - kBTBigKeysMask = 0x00000002, /* key length field is 16 bits */ - kBTVariableIndexKeysMask = 0x00000004 /* keys in index nodes are variable length */ -}; - - -/* Catalog Key 
Name Comparison Type */ -enum { - kHFSCaseFolding = 0xCF, /* case folding (case-insensitive) */ - kHFSBinaryCompare = 0xBC /* binary compare (case-sensitive) */ -}; - -#include - -/* JournalInfoBlock - Structure that describes where our journal lives */ - -// the original size of the reserved field in the JournalInfoBlock was -// 32*sizeof(u_int32_t). To keep the total size of the structure the -// same we subtract the size of new fields (currently: ext_jnl_uuid and -// machine_uuid). If you add additional fields, place them before the -// reserved field and subtract their size in this macro. -// -#define JIB_RESERVED_SIZE ((32*sizeof(u_int32_t)) - sizeof(uuid_string_t) - 48) - -struct JournalInfoBlock { - u_int32_t flags; - u_int32_t device_signature[8]; // signature used to locate our device. - u_int64_t offset; // byte offset to the journal on the device - u_int64_t size; // size in bytes of the journal - uuid_string_t ext_jnl_uuid; - char machine_serial_num[48]; - char reserved[JIB_RESERVED_SIZE]; -} __attribute__((aligned(2), packed)); -typedef struct JournalInfoBlock JournalInfoBlock; - -enum { - kJIJournalInFSMask = 0x00000001, - kJIJournalOnOtherDeviceMask = 0x00000002, - kJIJournalNeedInitMask = 0x00000004 -}; - -// -// This the content type uuid for "external journal" GPT -// partitions. Each instance of a partition also has a -// uuid that uniquely identifies that instance. -// -#define EXTJNL_CONTENT_TYPE_UUID "4A6F7572-6E61-11AA-AA11-00306543ECAC" - - -#ifdef __cplusplus -} -#endif - -#endif /* __HFS_FORMAT__ */ diff --git a/bsd/hfs/hfs_fsctl.h b/bsd/hfs/hfs_fsctl.h deleted file mode 100644 index 0958179ea..000000000 --- a/bsd/hfs/hfs_fsctl.h +++ /dev/null @@ -1,390 +0,0 @@ -/* - * Copyright (c) 2004-2014 Apple Computer, Inc. All rights reserved. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#ifndef _HFS_FSCTL_H_ -#define _HFS_FSCTL_H_ - -#include - -#include -#include -#include -#include - -#ifdef __APPLE_API_UNSTABLE - -struct hfs_backingstoreinfo { - int signature; /* == 3419115 */ - int version; /* version of this struct (1) */ - int backingfd; /* disk image file (on backing fs) */ - int bandsize; /* sparse disk image band size */ -}; - - -typedef char pathname_t[MAXPATHLEN]; - -struct hfs_journal_info { - off_t jstart; - off_t jsize; -}; - - -// Will be deprecated and replaced by hfs_fsinfo -struct hfsinfo_metadata { - uint32_t total; - uint32_t extents; - uint32_t catalog; - uint32_t allocation; - uint32_t attribute; - uint32_t journal; - uint32_t reserved[4]; -}; - -/* - * Flags for hfs_fsinfo_data structure - */ -#define HFS_FSINFO_CLASS_A 0x0001 /* Information for class A files requested */ -#define HFS_FSINFO_CLASS_B 0x0002 /* Information for class B files requested */ -#define HFS_FSINFO_CLASS_C 0x0004 /* Information for class C files requested */ -#define HFS_FSINFO_CLASS_D 0x0008 /* Information for class D files requested */ - -/* - * Maximum number of buckets to represent range from 0 to 1TB (2^40) in - * increments of power of 2, and one catch-all bucket for anything that - * is greater than 1TB - */ -#define HFS_FSINFO_DATA_MAX_BUCKETS 42 - -/* - * Maximum number of buckets to represents percentage range from 0 to 100 - * in increments of 10. - */ -#define HFS_FSINFO_PERCENT_MAX_BUCKETS 10 - -/* - * Maximum number of buckets to represent number of file/directory name characters - * (range 1 to 255) in increments of 5. - */ -#define HFS_FSINFO_NAME_MAX_BUCKETS 51 - -/* - * Version number to ensure that the caller and the kernel have same understanding - * of the hfs_fsinfo_data structure. This version needs to be bumped whenever the - * number of buckets is changed. 
- */ -#define HFS_FSINFO_VERSION 1 - -/* - * hfs_fsinfo_data is generic data structure to aggregate information like sizes - * or counts in buckets of power of 2. Each bucket represents a range of values - * that is determined based on its index in the array. Specifically, buckets[i] - * represents values that are greater than or equal to 2^(i-1) and less than 2^i, - * except the last bucket which represents range greater than or equal to 2^(i-1) - * - * The current maximum number of buckets is 41, so we can represent range from - * 0 up to 1TB in increments of power of 2, and then a catch-all bucket of - * anything that is greater than or equal to 1TB. - * - * For example, - * bucket[0] -> greater than or equal to 0 and less than 1 - * bucket[1] -> greater than or equal to 1 and less than 2 - * bucket[10] -> greater than or equal to 2^(10-1) = 512 and less than 2^10 = 1024 - * bucket[20] -> greater than or equal to 2^(20-1) = 512KB and less than 2^20 = 1MB - * bucket[41] -> greater than or equal to 2^(41-1) = 1TB - * - * Note that fsctls that populate this data structure can take long time to - * execute as this operation can be I/O intensive (traversing btrees) and compute - * intensive. - * - * WARNING: Any changes to this structure should also update version number to - * ensure that the clients and kernel are reading/writing correctly. - */ - -/* - * The header includes the user input fields. - */ -typedef struct hfs_fsinfo_header { - uint32_t request_type; - uint16_t version; - uint16_t flags; -} hfs_fsinfo_header_t; - -struct hfs_fsinfo_data { - hfs_fsinfo_header_t header; - uint32_t bucket[HFS_FSINFO_DATA_MAX_BUCKETS]; -}; - -/* - * Structure to represent information about metadata files - * - * WARNING: Any changes to this structure should also update version number to - * ensure that the clients and kernel are reading/writing correctly. 
- */ -struct hfs_fsinfo_metadata { - hfs_fsinfo_header_t header; - uint32_t extents; - uint32_t catalog; - uint32_t allocation; - uint32_t attribute; - uint32_t journal; -}; - -/* - * Structure to represent distribution of number of file name characters - * in increments of 5s. Each bucket represents a range of values that is - * determined based on its index in the array. So bucket[i] represents values - * that are greater than or equal to (i*5) and less than ((i+1)*10). - * - * Since this structure represents range of file name characters and the - * maximum number of unicode characters in HFS+ is 255, the maximum number - * of buckets will be 52 [0..51]. - * - * For example, - * bucket[4] -> greater than or equal to 20 and less than 25 characters - * bucket[51] -> equal to 255 characters - * - * WARNING: Any changes to this structure should also update version number to - * ensure that the clients and kernel are reading/writing correctly. - */ -struct hfs_fsinfo_name { - hfs_fsinfo_header_t header; - uint32_t bucket[HFS_FSINFO_NAME_MAX_BUCKETS]; -}; - -/* - * Structure to represent information about content protection classes - * - * WARNING: Any changes to this structure should also update version number to - * ensure that the clients and kernel are reading/writing correctly. 
- */ -struct hfs_fsinfo_cprotect { - hfs_fsinfo_header_t header; - uint32_t class_A; - uint32_t class_B; - uint32_t class_C; - uint32_t class_D; - uint32_t class_E; - uint32_t class_F; -}; - -/* - * Union of all the different values returned by HFSIOC_FSINFO fsctl - */ -union hfs_fsinfo { - hfs_fsinfo_header_t header; - struct hfs_fsinfo_data data; - struct hfs_fsinfo_metadata metadata; - struct hfs_fsinfo_name name; - struct hfs_fsinfo_cprotect cprotect; -}; -typedef union hfs_fsinfo hfs_fsinfo; - -/* - * Type of FSINFO requested, specified by the caller in request_type field - */ -enum { - /* Information about number of allocation blocks for each metadata file, returns struct hfs_fsinfo_metadata */ - HFS_FSINFO_METADATA_BLOCKS_INFO = 1, - - /* Information about number of extents for each metadata file, returns struct hfs_fsinfo_metadata */ - HFS_FSINFO_METADATA_EXTENTS = 2, - - /* Information about percentage of free nodes vs used nodes in metadata btrees, returns struct hfs_fsinfo_metadata */ - HFS_FSINFO_METADATA_PERCENTFREE = 3, - - /* Distribution of number of extents for data files (data fork, no rsrc fork, no xattr), returns struct hfs_fsinfo_data */ - HFS_FSINFO_FILE_EXTENT_COUNT = 4, - - /* Distribution of extent sizes for data files (data fork, no rsrc fork, no xattr), returns struct hfs_fsinfo_data */ - HFS_FSINFO_FILE_EXTENT_SIZE = 5, - - /* Distribution of file sizes for data files (data fork, no rsrc fork, no xattr), returns struct hfs_fsinfo_data */ - HFS_FSINFO_FILE_SIZE = 6, - - /* Distribution of valence for all directories, returns struct hfs_fsinfo_data */ - HFS_FSINFO_DIR_VALENCE = 7, - - /* Distribution of file/directory name size in unicode characters, returns struct hfs_fsinfo_name */ - HFS_FSINFO_NAME_SIZE = 8, - - /* Distribution of extended attribute sizes, returns hfs_fsinfo_data */ - HFS_FSINFO_XATTR_SIZE = 9, - - /* Distribution of free space for the entire file system, returns struct hfs_fsinfo_data */ - HFS_FSINFO_FREE_EXTENTS = 10, 
- - /* Information about number of files belonging to each class, returns hfs_fsinfo_cprotect */ - HFS_FSINFO_FILE_CPROTECT_COUNT = 11, - - /* - * Distribution of symbolic link sizes for data files (data fork, no rsrc fork, no xattr), - * returns struct hfs_fsinfo_data - */ - HFS_FSINFO_SYMLINK_SIZE = 12, -}; - - -/* HFS FS CONTROL COMMANDS */ - -#define HFSIOC_RESIZE_PROGRESS _IOR('h', 1, u_int32_t) -#define HFS_RESIZE_PROGRESS IOCBASECMD(HFSIOC_RESIZE_PROGRESS) - -#define HFSIOC_RESIZE_VOLUME _IOW('h', 2, u_int64_t) -#define HFS_RESIZE_VOLUME IOCBASECMD(HFSIOC_RESIZE_VOLUME) - -#define HFSIOC_CHANGE_NEXT_ALLOCATION _IOWR('h', 3, u_int32_t) -#define HFS_CHANGE_NEXT_ALLOCATION IOCBASECMD(HFSIOC_CHANGE_NEXT_ALLOCATION) -/* Magic value for next allocation to use with fcntl to set next allocation - * to zero and never update it again on new block allocation. - */ -#define HFS_NO_UPDATE_NEXT_ALLOCATION 0xffffFFFF - -#define HFSIOC_GETCREATETIME _IOR('h', 4, time_t) -#define HFS_GETCREATETIME IOCBASECMD(HFSIOC_GETCREATETIME) - -#define HFSIOC_SETBACKINGSTOREINFO _IOW('h', 7, struct hfs_backingstoreinfo) -#define HFS_SETBACKINGSTOREINFO IOCBASECMD(HFSIOC_SETBACKINGSTOREINFO) - -#define HFSIOC_CLRBACKINGSTOREINFO _IO('h', 8) -#define HFS_CLRBACKINGSTOREINFO IOCBASECMD(HFSIOC_CLRBACKINGSTOREINFO) - -// 'h', 9 used to be HFSIOC_BULKACCESS which is now deprecated - -/* Unsupported - Previously used to enable/disable ACLs */ -#define HFSIOC_UNSUPPORTED _IOW('h', 10, int32_t) - -#define HFSIOC_PREV_LINK _IOWR('h', 11, u_int32_t) -#define HFS_PREV_LINK IOCBASECMD(HFSIOC_PREV_LINK) - -#define HFSIOC_NEXT_LINK _IOWR('h', 12, u_int32_t) -#define HFS_NEXT_LINK IOCBASECMD(HFSIOC_NEXT_LINK) - -#define HFSIOC_GETPATH _IOWR('h', 13, pathname_t) -#define HFS_GETPATH IOCBASECMD(HFSIOC_GETPATH) -/* By default, the path returned by HFS_GETPATH is an absolute path, - * i.e. it also contains the mount point of the volume on which the - * fileID exists. 
If the following bit is set, the path returned is - * relative to the root of the volume. - */ -#define HFS_GETPATH_VOLUME_RELATIVE 0x1 - -/* Enable/disable extent-based extended attributes */ -#define HFSIOC_SET_XATTREXTENTS_STATE _IOW('h', 14, u_int32_t) -#define HFS_SET_XATTREXTENTS_STATE IOCBASECMD(HFSIOC_SET_XATTREXTENTS_STATE) - -#define HFSIOC_EXT_BULKACCESS _IOW('h', 15, struct user32_ext_access_t) -#define HFS_EXT_BULKACCESS_FSCTL IOCBASECMD(HFSIOC_EXT_BULKACCESS) - -#define HFSIOC_MARK_BOOT_CORRUPT _IO('h', 16) -#define HFS_MARK_BOOT_CORRUPT IOCBASECMD(HFSIOC_MARK_BOOT_CORRUPT) - -#define HFSIOC_GET_JOURNAL_INFO _IOR('h', 17, struct hfs_journal_info) -#define HFS_FSCTL_GET_JOURNAL_INFO IOCBASECMD(HFSIOC_GET_JOURNAL_INFO) - -#define HFSIOC_SET_VERY_LOW_DISK _IOW('h', 20, u_int32_t) -#define HFS_FSCTL_SET_VERY_LOW_DISK IOCBASECMD(HFSIOC_SET_VERY_LOW_DISK) - -#define HFSIOC_SET_LOW_DISK _IOW('h', 21, u_int32_t) -#define HFS_FSCTL_SET_LOW_DISK IOCBASECMD(HFSIOC_SET_LOW_DISK) - -#define HFSIOC_SET_DESIRED_DISK _IOW('h', 22, u_int32_t) -#define HFS_FSCTL_SET_DESIRED_DISK IOCBASECMD(HFSIOC_SET_DESIRED_DISK) - -#define HFSIOC_SET_ALWAYS_ZEROFILL _IOW('h', 23, int32_t) -#define HFS_SET_ALWAYS_ZEROFILL IOCBASECMD(HFSIOC_SET_ALWAYS_ZEROFILL) - -#define HFSIOC_VOLUME_STATUS _IOR('h', 24, u_int32_t) -#define HFS_VOLUME_STATUS IOCBASECMD(HFSIOC_VOLUME_STATUS) - -/* Disable metadata zone for given volume */ -#define HFSIOC_DISABLE_METAZONE _IO('h', 25) -#define HFS_DISABLE_METAZONE IOCBASECMD(HFSIOC_DISABLE_METAZONE) - -/* Change the next CNID value */ -#define HFSIOC_CHANGE_NEXTCNID _IOWR('h', 26, u_int32_t) -#define HFS_CHANGE_NEXTCNID IOCBASECMD(HFSIOC_CHANGE_NEXTCNID) - -/* Get the low disk space values */ -#define HFSIOC_GET_VERY_LOW_DISK _IOR('h', 27, u_int32_t) -#define HFS_FSCTL_GET_VERY_LOW_DISK IOCBASECMD(HFSIOC_GET_VERY_LOW_DISK) - -#define HFSIOC_GET_LOW_DISK _IOR('h', 28, u_int32_t) -#define HFS_FSCTL_GET_LOW_DISK IOCBASECMD(HFSIOC_GET_LOW_DISK) - -#define 
HFSIOC_GET_DESIRED_DISK _IOR('h', 29, u_int32_t) -#define HFS_FSCTL_GET_DESIRED_DISK IOCBASECMD(HFSIOC_GET_DESIRED_DISK) - -/* 30 was HFSIOC_GET_WRITE_GEN_COUNTER and is now deprecated */ - -/* 31 was HFSIOC_GET_DOCUMENT_ID and is now deprecated */ - -/* revisiond only uses this when something transforms in a way the kernel can't track such as "foo.rtf" -> "foo.rtfd" */ -#define HFSIOC_TRANSFER_DOCUMENT_ID _IOW('h', 32, u_int32_t) -#define HFS_TRANSFER_DOCUMENT_ID IOCBASECMD(HFSIOC_TRANSFER_DOCUMENT_ID) - - -/* - * XXX: Will be deprecated and replaced by HFSIOC_GET_FSINFO - * - * Get information about number of file system allocation blocks used by metadata - * files on the volume, including individual btrees and journal file. The caller - * can determine the size of file system allocation block using value returned as - * f_bsize by statfs(2). - */ -#define HFSIOC_FSINFO_METADATA_BLOCKS _IOWR('h', 38, struct hfsinfo_metadata) -#define HFS_FSINFO_METADATA_BLOCKS IOCBASECMD(HFSIOC_FSINFO_METADATA_BLOCKS) - -/* Send TRIMs for all free blocks to the underlying device */ -#define HFSIOC_CS_FREESPACE_TRIM _IOWR('h', 39, u_int32_t) -#define HFS_CS_FREESPACE_TRIM IOCBASECMD(HFSIOC_CS_FREESPACE_TRIM) - - -/* Get file system information for the given volume */ -#define HFSIOC_GET_FSINFO _IOWR('h', 45, hfs_fsinfo) -#define HFS_GET_FSINFO IOCBASECMD(HFSIOC_GET_FSINFO) - -/* Re-pin hotfile data; argument controls what state gets repinned */ -#define HFSIOC_REPIN_HOTFILE_STATE _IOWR('h', 46, u_int32_t) -#define HFS_REPIN_HOTFILE_STATE IOCBASECMD(HFSIOC_REPIN_HOTFILE_STATE) - -#define HFS_REPIN_METADATA 0x0001 -#define HFS_REPIN_USERDATA 0x0002 - -/* Mark a directory or file as worth caching on any underlying "fast" device */ -#define HFSIOC_SET_HOTFILE_STATE _IOWR('h', 47, u_int32_t) -#define HFS_SET_HOTFILE_STATE IOCBASECMD(HFSIOC_SET_HOTFILE_STATE) - -/* flags to pass to SET_HOTFILE_STATE */ -#define HFS_MARK_FASTDEVCANDIDATE 0x0001 -#define HFS_UNMARK_FASTDEVCANDIDATE 0x0002 
-#define HFS_NEVER_FASTDEVCANDIDATE 0x0004 - - -#endif /* __APPLE_API_UNSTABLE */ - -#endif /* ! _HFS_FSCTL_H_ */ diff --git a/bsd/hfs/hfs_fsinfo.c b/bsd/hfs/hfs_fsinfo.c deleted file mode 100644 index ffb31575b..000000000 --- a/bsd/hfs/hfs_fsinfo.c +++ /dev/null @@ -1,893 +0,0 @@ -/* - * Copyright (c) 2014 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include -#include -#include -#include -#include -#include - -#include "hfs.h" -#include "hfs_fsctl.h" -#include "hfs_endian.h" -#include "hfscommon/headers/BTreesInternal.h" -#include "hfscommon/headers/BTreesPrivate.h" -#include "hfscommon/headers/FileMgrInternal.h" - -#include - - -union HFSPlusRecord { - HFSPlusCatalogFolder folder_record; - HFSPlusCatalogFile file_record; - HFSPlusCatalogThread thread_record; - HFSPlusExtentRecord extent_record; - HFSPlusAttrRecord attr_record; -}; -typedef union HFSPlusRecord HFSPlusRecord; - -union HFSPlusKey { - HFSPlusExtentKey extent_key; - HFSPlusAttrKey attr_key; -}; -typedef union HFSPlusKey HFSPlusKey; - -typedef enum traverse_btree_flag { - - //If set, extents btree will also be traversed along with catalog btree, so grab correct locks upfront - TRAVERSE_BTREE_EXTENTS = 1, - - // Getting content-protection attributes, allocate enough space to accomodate the records. - TRAVERSE_BTREE_XATTR_CPROTECT = 2, - -} traverse_btree_flag_t; - - - -static errno_t hfs_fsinfo_metadata_blocks(struct hfsmount *hfsmp, struct hfs_fsinfo_metadata *fsinfo); -static errno_t hfs_fsinfo_metadata_extents(struct hfsmount *hfsmp, struct hfs_fsinfo_metadata *fsinfo); -static errno_t hfs_fsinfo_metadata_percentfree(struct hfsmount *hfsmp, struct hfs_fsinfo_metadata *fsinfo); -static errno_t fsinfo_file_extent_count_callback(struct hfsmount *hfsmp, HFSPlusKey *key, HFSPlusRecord *record, void *data); -static errno_t fsinfo_file_extent_size_catalog_callback(struct hfsmount *hfsmp, HFSPlusKey *key, HFSPlusRecord *record, void *data); -static errno_t fsinfo_file_extent_size_overflow_callback(struct hfsmount *hfsmp, HFSPlusKey *key, HFSPlusRecord *record, void *data); -static errno_t fsinfo_file_size_callback(struct hfsmount *hfsmp, HFSPlusKey *key, HFSPlusRecord *record, void *data); -static errno_t fsinfo_dir_valence_callback(struct hfsmount *hfsmp, HFSPlusKey *key, HFSPlusRecord *record, void 
*data); -static errno_t fsinfo_name_size_callback(struct hfsmount *hfsmp, HFSPlusKey *key, HFSPlusRecord *record, void *data); -static errno_t fsinfo_xattr_size_callback(struct hfsmount *hfsmp, HFSPlusKey *key, HFSPlusRecord *record, void *data); -static errno_t traverse_btree(struct hfsmount *hfsmp, uint32_t btree_fileID, traverse_btree_flag_t flags, void *fsinfo, - int (*callback)(struct hfsmount *, HFSPlusKey *, HFSPlusRecord *, void *)); -static errno_t hfs_fsinfo_free_extents(struct hfsmount *hfsmp, struct hfs_fsinfo_data *fsinfo); -static void fsinfo_free_extents_callback(void *data, off_t free_extent_size); -#if CONFIG_PROTECT -static errno_t fsinfo_cprotect_count_callback(struct hfsmount *hfsmp, HFSPlusKey *key, HFSPlusRecord *record, void *data); -#endif -static errno_t fsinfo_symlink_size_callback(struct hfsmount *hfsmp, HFSPlusKey *key, HFSPlusRecord *record, void *data); - -/* - * Entry function for all the fsinfo requests from hfs_vnop_ioctl() - * Depending on the type of request, this function will call the - * appropriate sub-function and return success or failure back to - * the caller. - */ -__private_extern__ -errno_t hfs_get_fsinfo(struct hfsmount *hfsmp, void *a_data) -{ - int error = 0; - hfs_fsinfo *fsinfo_union; - uint32_t request_type; - uint32_t header_len = sizeof(hfs_fsinfo_header_t); - - fsinfo_union = (hfs_fsinfo *)a_data; - request_type = fsinfo_union->header.request_type; - - // Zero out output fields to fsinfo_union, keep the user input fields intact. 
- bzero((char *)fsinfo_union + header_len, sizeof(hfs_fsinfo) - header_len); - - switch (request_type) { - case HFS_FSINFO_METADATA_BLOCKS_INFO: - error = hfs_fsinfo_metadata_blocks(hfsmp, &(fsinfo_union->metadata)); - break; - - case HFS_FSINFO_METADATA_EXTENTS: - error = hfs_fsinfo_metadata_extents(hfsmp, &(fsinfo_union->metadata)); - break; - - case HFS_FSINFO_METADATA_PERCENTFREE: - error = hfs_fsinfo_metadata_percentfree(hfsmp, &(fsinfo_union->metadata)); - break; - - case HFS_FSINFO_FILE_EXTENT_COUNT: - /* Traverse catalog btree and invoke callback for all records */ - error = traverse_btree(hfsmp, kHFSCatalogFileID, TRAVERSE_BTREE_EXTENTS, &(fsinfo_union->data), fsinfo_file_extent_count_callback); - break; - - case HFS_FSINFO_FILE_EXTENT_SIZE: - /* Traverse the catalog btree first */ - error = traverse_btree(hfsmp, kHFSCatalogFileID, 0, &(fsinfo_union->data), &fsinfo_file_extent_size_catalog_callback); - if (error) { - break; - } - /* Traverse the overflow extents btree now */ - error = traverse_btree(hfsmp, kHFSExtentsFileID, 0, &(fsinfo_union->data), &fsinfo_file_extent_size_overflow_callback); - break; - - case HFS_FSINFO_FILE_SIZE: - /* Traverse catalog btree and invoke callback for all records */ - error = traverse_btree(hfsmp, kHFSCatalogFileID, 0, &(fsinfo_union->data), &fsinfo_file_size_callback); - break; - - case HFS_FSINFO_DIR_VALENCE: - /* Traverse catalog btree and invoke callback for all records */ - error = traverse_btree(hfsmp, kHFSCatalogFileID, 0, &(fsinfo_union->data), &fsinfo_dir_valence_callback); - break; - - case HFS_FSINFO_NAME_SIZE: - /* Traverse catalog btree and invoke callback for all records */ - error = traverse_btree(hfsmp, kHFSCatalogFileID, 0, &(fsinfo_union->name), &fsinfo_name_size_callback); - break; - - case HFS_FSINFO_XATTR_SIZE: - /* Traverse attribute btree and invoke callback for all records */ - error = traverse_btree(hfsmp, kHFSAttributesFileID, 0, &(fsinfo_union->data), &fsinfo_xattr_size_callback); - break; - - 
case HFS_FSINFO_FREE_EXTENTS: - error = hfs_fsinfo_free_extents(hfsmp, &(fsinfo_union->data)); - break; - - case HFS_FSINFO_SYMLINK_SIZE: - /* Traverse catalog btree and invoke callback for all records */ - error = traverse_btree(hfsmp, kHFSCatalogFileID, 0, &(fsinfo_union->data), &fsinfo_symlink_size_callback); - break; - -#if CONFIG_PROTECT - case HFS_FSINFO_FILE_CPROTECT_COUNT: - /* Traverse attribute btree and invoke callback for all records */ - error = traverse_btree(hfsmp, kHFSAttributesFileID, TRAVERSE_BTREE_XATTR_CPROTECT, &(fsinfo_union->cprotect), &fsinfo_cprotect_count_callback); - break; -#endif - - default: - return ENOTSUP; - }; - - return error; -} - -/* - * This function provides information about total number of allocation blocks - * for each individual metadata file. - */ -static errno_t -hfs_fsinfo_metadata_blocks(struct hfsmount *hfsmp, struct hfs_fsinfo_metadata *fsinfo) -{ - int lockflags = 0; - int ret_lockflags = 0; - - /* - * Getting number of allocation blocks for all metadata files - * should be a relatively quick operation, so we grab locks for all - * the btrees at the same time - */ - lockflags = SFL_CATALOG | SFL_EXTENTS | SFL_BITMAP | SFL_ATTRIBUTE; - ret_lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_SHARED_LOCK); - - /* Get information about all the btrees */ - fsinfo->extents = hfsmp->hfs_extents_cp->c_datafork->ff_blocks; - fsinfo->catalog = hfsmp->hfs_catalog_cp->c_datafork->ff_blocks; - fsinfo->allocation = hfsmp->hfs_allocation_cp->c_datafork->ff_blocks; - if (hfsmp->hfs_attribute_cp) - fsinfo->attribute = hfsmp->hfs_attribute_cp->c_datafork->ff_blocks; - else - fsinfo->attribute = 0; - - /* Done with btrees, give up the locks */ - hfs_systemfile_unlock(hfsmp, ret_lockflags); - - /* Get information about journal file */ - fsinfo->journal = howmany(hfsmp->jnl_size, hfsmp->blockSize); - - return 0; -} - -/* - * Helper function to count the number of valid extents in a file fork structure - */ -static uint32_t 
-hfs_count_extents_fp(struct filefork *ff) -{ - int i; - uint32_t count = 0; - for (i = 0; i < kHFSPlusExtentDensity; i++) { - if (ff->ff_data.cf_extents[i].blockCount == 0) { - break; - } - count++; - } - return count; -} - - -/* - * This is a helper function that counts the total number of valid - * extents in all the overflow extent records for given fileID - * in overflow extents btree - */ -static errno_t -hfs_count_overflow_extents(struct hfsmount *hfsmp, uint32_t fileID, uint32_t *num_extents) -{ - int error; - FCB *fcb; - struct BTreeIterator *iterator = NULL; - FSBufferDescriptor btdata; - HFSPlusExtentKey *extentKey; - HFSPlusExtentRecord extentData; - uint32_t extent_count = 0; - int i; - - fcb = VTOF(hfsmp->hfs_extents_vp); - MALLOC(iterator, struct BTreeIterator *, sizeof(struct BTreeIterator), M_TEMP, M_WAITOK | M_ZERO); - - extentKey = (HFSPlusExtentKey *) &iterator->key; - extentKey->keyLength = kHFSPlusExtentKeyMaximumLength; - extentKey->forkType = kHFSDataForkType; - extentKey->fileID = fileID; - extentKey->startBlock = 0; - - btdata.bufferAddress = &extentData; - btdata.itemSize = sizeof(HFSPlusExtentRecord); - btdata.itemCount = 1; - - /* Search for overflow extent record */ - error = BTSearchRecord(fcb, iterator, &btdata, NULL, iterator); - - /* - * We used startBlock of zero, so we will not find any records and errors - * are expected. It will also position the iterator just before the first - * overflow extent record for given fileID (if any). 
- */ - if (error && error != fsBTRecordNotFoundErr && error != fsBTEndOfIterationErr) - goto out; - error = 0; - - for (;;) { - - if (msleep(NULL, NULL, PINOD | PCATCH, - "hfs_fsinfo", NULL) == EINTR) { - error = EINTR; - break; - } - - error = BTIterateRecord(fcb, kBTreeNextRecord, iterator, &btdata, NULL); - if (error != 0) { - /* These are expected errors, so mask them */ - if (error == fsBTRecordNotFoundErr || error == fsBTEndOfIterationErr) { - error = 0; - } - break; - } - - /* If we encounter different fileID, stop the iteration */ - if (extentKey->fileID != fileID) { - break; - } - - if (extentKey->forkType != kHFSDataForkType) - break; - - /* This is our record of interest; only count the datafork extents. */ - for (i = 0; i < kHFSPlusExtentDensity; i++) { - if (extentData[i].blockCount == 0) { - break; - } - extent_count++; - } - } - -out: - FREE(iterator, M_TEMP); - - if (error == 0) { - *num_extents = extent_count; - } - return MacToVFSError(error); -} - -/* - * This function provides information about total number of extents (including - * extents from overflow extents btree, if any) for each individual metadata - * file. 
- */ -static errno_t -hfs_fsinfo_metadata_extents(struct hfsmount *hfsmp, struct hfs_fsinfo_metadata *fsinfo) -{ - int error = 0; - int lockflags = 0; - int ret_lockflags = 0; - uint32_t overflow_count; - - /* - * Counting the number of extents for all metadata files should - * be a relatively quick operation, so we grab locks for all the - * btrees at the same time - */ - lockflags = SFL_CATALOG | SFL_EXTENTS | SFL_BITMAP | SFL_ATTRIBUTE; - ret_lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_SHARED_LOCK); - - /* Get number of extents for extents overflow btree */ - fsinfo->extents = hfs_count_extents_fp(hfsmp->hfs_extents_cp->c_datafork); - - /* Get number of extents for catalog btree */ - fsinfo->catalog = hfs_count_extents_fp(hfsmp->hfs_catalog_cp->c_datafork); - if (fsinfo->catalog >= kHFSPlusExtentDensity) { - error = hfs_count_overflow_extents(hfsmp, kHFSCatalogFileID, &overflow_count); - if (error) { - goto out; - } - fsinfo->catalog += overflow_count; - } - - /* Get number of extents for allocation file */ - fsinfo->allocation = hfs_count_extents_fp(hfsmp->hfs_allocation_cp->c_datafork); - if (fsinfo->allocation >= kHFSPlusExtentDensity) { - error = hfs_count_overflow_extents(hfsmp, kHFSAllocationFileID, &overflow_count); - if (error) { - goto out; - } - fsinfo->allocation += overflow_count; - } - - /* - * Get number of extents for attribute btree. - * hfs_attribute_cp might be NULL. - */ - if (hfsmp->hfs_attribute_cp) { - fsinfo->attribute = hfs_count_extents_fp(hfsmp->hfs_attribute_cp->c_datafork); - if (fsinfo->attribute >= kHFSPlusExtentDensity) { - error = hfs_count_overflow_extents(hfsmp, kHFSAttributesFileID, &overflow_count); - if (error) { - goto out; - } - fsinfo->attribute += overflow_count; - } - } - /* Journal always has one extent */ - fsinfo->journal = 1; -out: - hfs_systemfile_unlock(hfsmp, ret_lockflags); - return error; -} - -/* - * Helper function to calculate percentage i.e. X is what percent of Y? 
- */ -static inline uint32_t -hfs_percent(uint32_t X, uint32_t Y) -{ - return (X * 100ll) / Y; -} - -/* - * This function provides percentage of free nodes vs total nodes for each - * individual metadata btrees, i.e. for catalog, overflow extents and - * attributes btree. This information is not applicable for allocation - * file and journal file. - */ -static errno_t -hfs_fsinfo_metadata_percentfree(struct hfsmount *hfsmp, struct hfs_fsinfo_metadata *fsinfo) -{ - int lockflags = 0; - int ret_lockflags = 0; - BTreeControlBlockPtr btreePtr; - uint32_t free_nodes, total_nodes; - - /* - * Getting total and used nodes for all metadata btrees should - * be a relatively quick operation, so we grab locks for all the - * btrees at the same time - */ - lockflags = SFL_CATALOG | SFL_EXTENTS | SFL_BITMAP | SFL_ATTRIBUTE; - ret_lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_SHARED_LOCK); - - /* Overflow extents btree */ - btreePtr = VTOF(hfsmp->hfs_extents_vp)->fcbBTCBPtr; - total_nodes = btreePtr->totalNodes; - free_nodes = btreePtr->freeNodes; - fsinfo->extents = hfs_percent(free_nodes, total_nodes); - - /* Catalog btree */ - btreePtr = VTOF(hfsmp->hfs_catalog_vp)->fcbBTCBPtr; - total_nodes = btreePtr->totalNodes; - free_nodes = btreePtr->freeNodes; - fsinfo->catalog = hfs_percent(free_nodes, total_nodes); - - /* Attributes btree */ - if (hfsmp->hfs_attribute_vp) { - btreePtr = VTOF(hfsmp->hfs_attribute_vp)->fcbBTCBPtr; - total_nodes = btreePtr->totalNodes; - free_nodes = btreePtr->freeNodes; - fsinfo->attribute = hfs_percent(free_nodes, total_nodes); - } - - hfs_systemfile_unlock(hfsmp, ret_lockflags); - return 0; -} - -/* - * Helper function to calculate log base 2 for given number - */ -static inline int -hfs_log2(uint64_t entry) -{ - return (63 - __builtin_clzll(entry|1)); -} - -/* - * Helper function to account for input entry into the data - * array based on its log base 2 value - */ -__private_extern__ -void hfs_fsinfo_data_add(struct hfs_fsinfo_data *fsinfo, 
uint64_t entry) -{ - /* - * From hfs_fsctl.h - - * - * hfs_fsinfo_data is generic data structure to aggregate information like sizes - * or counts in buckets of power of 2. Each bucket represents a range of values - * that is determined based on its index in the array. Specifically, buckets[i] - * represents values that are greater than or equal to 2^(i-1) and less than 2^i, - * except the last bucket which represents range greater than or equal to 2^(i-1) - * - * The current maximum number of buckets is 41, so we can represent range from - * 0 up to 1TB in increments of power of 2, and then a catch-all bucket of - * anything that is greater than or equal to 1TB. - * - * For example, - * bucket[0] -> greater than or equal to 0 and less than 1 - * bucket[1] -> greater than or equal to 1 and less than 2 - * bucket[10] -> greater than or equal to 2^(10-1) = 512 and less than 2^10 = 1024 - * bucket[20] -> greater than or equal to 2^(20-1) = 512KB and less than 2^20 = 1MB - * bucket[41] -> greater than or equal to 2^(41-1) = 1TB - */ - uint32_t bucket; - - if (entry) { - /* - * Calculate log base 2 value for the entry. - * Account for this value in the appropriate bucket. - * The last bucket is a catch-all bucket of - * anything that is greater than or equal to 1TB - */ - bucket = MIN(hfs_log2(entry) + 1, HFS_FSINFO_DATA_MAX_BUCKETS-1); - ++fsinfo->bucket[bucket]; - } else { - /* Entry is zero, so account it in 0th offset */ - fsinfo->bucket[0]++; - } -} - -/* - * Function to traverse all the records of a btree and then call caller-provided - * callback function for every record found. The type of btree is chosen based - * on the fileID provided by the caller. This fuction grabs the correct locks - * depending on the type of btree it will be traversing and flags provided - * by the caller. - * - * Note: It might drop and reacquire the locks during execution. 
- */ -static errno_t -traverse_btree(struct hfsmount *hfsmp, uint32_t btree_fileID, traverse_btree_flag_t flags, - void *fsinfo, int (*callback)(struct hfsmount *, HFSPlusKey *, HFSPlusRecord *, void *)) -{ - int error = 0; - int lockflags = 0; - int ret_lockflags = 0; - FCB *fcb; - struct BTreeIterator *iterator = NULL; - struct FSBufferDescriptor btdata; - int btree_operation; - HFSPlusRecord record; - HFSPlusKey *key; - uint64_t start, timeout_abs; - - switch(btree_fileID) { - case kHFSExtentsFileID: - fcb = VTOF(hfsmp->hfs_extents_vp); - lockflags = SFL_EXTENTS; - break; - case kHFSCatalogFileID: - fcb = VTOF(hfsmp->hfs_catalog_vp); - lockflags = SFL_CATALOG; - break; - case kHFSAttributesFileID: - // Attributes file doesn’t exist, There are no records to iterate. - if (hfsmp->hfs_attribute_vp == NULL) - return error; - fcb = VTOF(hfsmp->hfs_attribute_vp); - lockflags = SFL_ATTRIBUTE; - break; - - default: - return EINVAL; - } - - MALLOC(iterator, struct BTreeIterator *, sizeof(struct BTreeIterator), M_TEMP, M_WAITOK | M_ZERO); - - /* The key is initialized to zero because we are traversing entire btree */ - key = (HFSPlusKey *)&iterator->key; - - if (flags & TRAVERSE_BTREE_EXTENTS) { - lockflags |= SFL_EXTENTS; - } - - btdata.bufferAddress = &record; - btdata.itemSize = sizeof(HFSPlusRecord); - btdata.itemCount = 1; - - /* Lock btree for duration of traversal */ - ret_lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_SHARED_LOCK); - btree_operation = kBTreeFirstRecord; - - nanoseconds_to_absolutetime(HFS_FSINFO_MAX_LOCKHELD_TIME, &timeout_abs); - start = mach_absolute_time(); - - while (1) { - - if (msleep(NULL, NULL, PINOD | PCATCH, - "hfs_fsinfo", NULL) == EINTR) { - error = EINTR; - break; - } - - error = BTIterateRecord(fcb, btree_operation, iterator, &btdata, NULL); - if (error != 0) { - if (error == fsBTRecordNotFoundErr || error == fsBTEndOfIterationErr) { - error = 0; - } - break; - } - /* Lookup next btree record on next call to BTIterateRecord() 
*/ - btree_operation = kBTreeNextRecord; - - /* Call our callback function and stop iteration if there are any errors */ - error = callback(hfsmp, key, &record, fsinfo); - if (error) { - break; - } - - /* let someone else use the tree after we've processed over HFS_FSINFO_MAX_LOCKHELD_TIME */ - if ((mach_absolute_time() - start) >= timeout_abs) { - - /* release b-tree locks and let someone else get the lock */ - hfs_systemfile_unlock (hfsmp, ret_lockflags); - - /* add tsleep here to force context switch and fairness */ - tsleep((caddr_t)hfsmp, PRIBIO, "hfs_fsinfo", 1); - - /* - * re-acquire the locks in the same way that we wanted them originally. - * note: it is subtle but worth pointing out that in between the time that we - * released and now want to re-acquire these locks that the b-trees may have shifted - * slightly but significantly. For example, the catalog or other b-tree could have grown - * past 8 extents and now requires the extents lock to be held in order to be safely - * manipulated. We can't be sure of the state of the b-tree from where we last left off. - */ - - ret_lockflags = hfs_systemfile_lock (hfsmp, lockflags, HFS_SHARED_LOCK); - - /* - * It's highly likely that the search key we stashed away before dropping lock - * no longer points to an existing item. Iterator's IterateRecord is able to - * re-position itself and process the next record correctly. With lock dropped, - * there might be records missed for statistic gathering, which is ok. The - * point is to get aggregate values. - */ - - start = mach_absolute_time(); - - /* loop back around and get another record */ - } - } - - hfs_systemfile_unlock(hfsmp, ret_lockflags); - FREE (iterator, M_TEMP); - return MacToVFSError(error); -} - -/* - * Callback function to get distribution of number of extents - * for all user files in given file system. Note that this only - * accounts for data fork, no resource fork. 
- */ -static errno_t -fsinfo_file_extent_count_callback(struct hfsmount *hfsmp, - __unused HFSPlusKey *key, HFSPlusRecord *record, void *data) -{ - int i; - int error = 0; - uint32_t num_extents = 0; - uint32_t num_overflow = 0; - uint32_t blockCount; - - if (record->file_record.recordType == kHFSPlusFileRecord) { - /* Count total number of extents for this file */ - for (i = 0; i < kHFSPlusExtentDensity; i++) { - blockCount = record->file_record.dataFork.extents[i].blockCount; - if (blockCount == 0) { - break; - } - num_extents++; - } - /* This file has overflow extent records, so search overflow btree */ - if (num_extents >= kHFSPlusExtentDensity) { - /* The caller also hold extents overflow btree lock */ - error = hfs_count_overflow_extents(hfsmp, record->file_record.fileID, &num_overflow); - if (error) { - goto out; - } - num_extents += num_overflow; - } - hfs_fsinfo_data_add(data, num_extents); - } -out: - return error; -} - -/* - * Callback function to get distribution of individual extent sizes - * (in bytes) for all user files in given file system from catalog - * btree only. Note that this only accounts for data fork, no resource - * fork. - */ -static errno_t fsinfo_file_extent_size_catalog_callback(__unused struct hfsmount *hfsmp, - __unused HFSPlusKey *key, HFSPlusRecord *record, void *data) -{ - int i; - uint32_t blockCount; - uint64_t extent_size; - - if (record->file_record.recordType == kHFSPlusFileRecord) { - /* Traverse through all valid extents */ - for (i = 0; i < kHFSPlusExtentDensity; i++) { - blockCount = record->file_record.dataFork.extents[i].blockCount; - if (blockCount == 0) { - break; - } - extent_size = hfs_blk_to_bytes(blockCount, hfsmp->blockSize); - hfs_fsinfo_data_add(data, extent_size); - } - } - return 0; -} - -/* - * Callback function to get distribution of individual extent sizes - * (in bytes) for all user files in given file system from overflow - * extents btree only. 
Note that this only accounts for data fork, - * no resource fork. - */ -static errno_t fsinfo_file_extent_size_overflow_callback(__unused struct hfsmount *hfsmp, - HFSPlusKey *key, HFSPlusRecord *record, void *data) -{ - int i; - uint32_t blockCount; - uint64_t extent_size; - - if (key->extent_key.fileID >= kHFSFirstUserCatalogNodeID) { - // Only count the data fork extents. - if (key->extent_key.forkType == kHFSDataForkType) { - for (i = 0; i < kHFSPlusExtentDensity; i++) { - blockCount = record->extent_record[i].blockCount; - if (blockCount == 0) { - break; - } - extent_size = hfs_blk_to_bytes(blockCount, hfsmp->blockSize); - hfs_fsinfo_data_add(data, extent_size); - } - } - } - return 0; -} - -/* - * Callback function to get distribution of file sizes (in bytes) - * for all user files in given file system. Note that this only - * accounts for data fork, no resource fork. - */ -static errno_t fsinfo_file_size_callback(__unused struct hfsmount *hfsmp, - __unused HFSPlusKey *key, HFSPlusRecord *record, void *data) -{ - if (record->file_record.recordType == kHFSPlusFileRecord) { - /* Record of interest, account for the size in the bucket */ - hfs_fsinfo_data_add(data, record->file_record.dataFork.logicalSize); - } - return 0; -} - -/* - * Callback function to get distribution of directory valence - * for all directories in the given file system. - */ -static errno_t fsinfo_dir_valence_callback(__unused struct hfsmount *hfsmp, - __unused HFSPlusKey *key, HFSPlusRecord *record, void *data) -{ - if (record->folder_record.recordType == kHFSPlusFolderRecord) { - hfs_fsinfo_data_add(data, record->folder_record.valence); - } - return 0; -} - -/* - * Callback function to get distribution of number of unicode - * characters in name for all files and directories for a given - * file system. 
- */ -static errno_t fsinfo_name_size_callback(__unused struct hfsmount *hfsmp, - __unused HFSPlusKey *key, HFSPlusRecord *record, void *data) -{ - struct hfs_fsinfo_name *fsinfo = (struct hfs_fsinfo_name *)data; - uint32_t length; - - if ((record->folder_record.recordType == kHFSPlusFolderThreadRecord) || - (record->folder_record.recordType == kHFSPlusFileThreadRecord)) { - length = record->thread_record.nodeName.length; - /* Make sure that the nodeName is bounded, otherwise return error */ - if (length > kHFSPlusMaxFileNameChars) { - return EIO; - } - - // sanity check for a name length of zero, which isn't valid on disk. - if (length == 0) - return EIO; - - /* Round it down to nearest multiple of 5 to match our buckets granularity */ - length = (length - 1)/ 5; - /* Account this value into our bucket */ - fsinfo->bucket[length]++; - } - return 0; -} - -/* - * Callback function to get distribution of size of all extended - * attributes for a given file system. - */ -static errno_t fsinfo_xattr_size_callback(__unused struct hfsmount *hfsmp, - __unused HFSPlusKey *key, HFSPlusRecord *record, void *data) -{ - if (record->attr_record.recordType == kHFSPlusAttrInlineData) { - /* Inline attribute */ - hfs_fsinfo_data_add(data, record->attr_record.attrData.attrSize); - } else if (record->attr_record.recordType == kHFSPlusAttrForkData) { - /* Larger attributes with extents information */ - hfs_fsinfo_data_add(data, record->attr_record.forkData.theFork.logicalSize); - } - return 0; -} - - -/* - * Callback function to get distribution of free space extents for a given file system. - */ -static void fsinfo_free_extents_callback(void *data, off_t free_extent_size) -{ - // Assume a minimum of 4 KB block size - hfs_fsinfo_data_add(data, free_extent_size / 4096); -} - -/* - * Function to get distribution of free space extents for a given file system. 
- */ -static errno_t hfs_fsinfo_free_extents(struct hfsmount *hfsmp, struct hfs_fsinfo_data *fsinfo) -{ - return hfs_find_free_extents(hfsmp, &fsinfo_free_extents_callback, fsinfo); -} - -/* - * Callback function to get distribution of symblock link sizes (in bytes) - * for all user files in given file system. Note that this only - * accounts for data fork, no resource fork. - */ -static errno_t fsinfo_symlink_size_callback(__unused struct hfsmount *hfsmp, - __unused HFSPlusKey *key, HFSPlusRecord *record, void *data) -{ - if (record->file_record.recordType == kHFSPlusFileRecord) { - /* Record of interest, account for the size in the bucket */ - if (S_ISLNK(record->file_record.bsdInfo.fileMode)) - hfs_fsinfo_data_add((struct hfs_fsinfo_data *)data, record->file_record.dataFork.logicalSize); - } - return 0; -} - -#if CONFIG_PROTECT -/* - * Callback function to get total number of files/directories - * for each content protection class - */ -static int fsinfo_cprotect_count_callback(struct hfsmount *hfsmp, HFSPlusKey *key, - HFSPlusRecord *record, void *data) -{ - struct hfs_fsinfo_cprotect *fsinfo = (struct hfs_fsinfo_cprotect *)data; - static const uint16_t cp_xattrname_utf16[] = CONTENT_PROTECTION_XATTR_NAME_CHARS; - /* - * NOTE: cp_xattrname_utf16_len is the number of UTF-16 code units in - * the EA name string. 
- */ - static const size_t cp_xattrname_utf16_len = sizeof(cp_xattrname_utf16)/2; - struct cp_xattr_v5 *xattr; - size_t xattr_len = sizeof(struct cp_xattr_v5); - struct cprotect cp_entry; - struct cprotect *cp_entryp = &cp_entry; - int error = 0; - - /* Content protect xattrs are inline attributes only, so skip all others */ - if (record->attr_record.recordType != kHFSPlusAttrInlineData) - return 0; - - /* We only look at content protection xattrs */ - if ((key->attr_key.attrNameLen != cp_xattrname_utf16_len) || - (bcmp(key->attr_key.attrName, cp_xattrname_utf16, 2 * cp_xattrname_utf16_len))) { - return 0; - } - - xattr = (struct cp_xattr_v5 *)((void *)(record->attr_record.attrData.attrData)); - error = cp_read_xattr_v5(hfsmp, xattr, xattr_len, (cprotect_t *)&cp_entryp, - CP_GET_XATTR_BASIC_INFO); - if (error) - return 0; - - /* No key present, skip this record */ - if (!ISSET(cp_entry.cp_flags, CP_HAS_A_KEY)) - return 0; - - /* Now account for the persistent class */ - switch (CP_CLASS(cp_entry.cp_pclass)) { - case PROTECTION_CLASS_A: - fsinfo->class_A++; - break; - case PROTECTION_CLASS_B: - fsinfo->class_B++; - break; - case PROTECTION_CLASS_C: - fsinfo->class_C++; - break; - case PROTECTION_CLASS_D: - fsinfo->class_D++; - break; - case PROTECTION_CLASS_E: - fsinfo->class_E++; - break; - case PROTECTION_CLASS_F: - fsinfo->class_F++; - break; - }; - - return 0; -} -#endif diff --git a/bsd/hfs/hfs_hotfiles.c b/bsd/hfs/hfs_hotfiles.c deleted file mode 100644 index b6fa4a276..000000000 --- a/bsd/hfs/hfs_hotfiles.c +++ /dev/null @@ -1,3968 +0,0 @@ -/* - * Copyright (c) 2003-2013 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include "hfscommon/headers/BTreeScanner.h" - - -#define HFC_DEBUG 0 -#define HFC_VERBOSE 0 - - -/* - * Minimum post Tiger base time. - * Thu Mar 31 17:00:00 2005 - */ -#define HFC_MIN_BASE_TIME 0x424c8f00L - -/* - * Hot File List (runtime). - */ -typedef struct hotfileinfo { - u_int32_t hf_fileid; - u_int32_t hf_temperature; - u_int32_t hf_blocks; -} hotfileinfo_t; - -typedef struct hotfilelist { - u_int32_t hfl_magic; - u_int32_t hfl_version; - time_t hfl_duration; /* duration of sample period */ - int hfl_count; /* count of hot files recorded */ - int hfl_next; /* next file to move */ - int hfl_totalblocks; /* total hot file blocks */ - int hfl_reclaimblks; /* blocks to reclaim in HFV */ - u_int32_t hfl_spare[2]; - hotfileinfo_t hfl_hotfile[1]; /* array of hot files */ -} hotfilelist_t; - - -/* - * Hot File Entry (runtime). 
- */ -typedef struct hotfile_entry { - struct hotfile_entry *left; - struct hotfile_entry *right; - u_int32_t fileid; - u_int32_t temperature; - u_int32_t blocks; -} hotfile_entry_t; - - -// -// We cap the max temperature for non-system files to "MAX_NORMAL_TEMP" -// so that they will always have a lower temperature than system (aka -// "auto-cached") files. System files have MAX_NORMAL_TEMP added to -// their temperature which produces two bands of files (all non-system -// files will have a temp less than MAX_NORMAL_TEMP and all system -// files will have a temp greatern than MAX_NORMAL_TEMP). -// -// This puts non-system files on the left side of the hotfile btree -// (and we start evicting from the left-side of the tree). The idea is -// that we will evict non-system files more aggressively since their -// working set changes much more dynamically than system files (which -// are for the most part, static). -// -// NOTE: these values have to fit into a 32-bit int. We use a -// value of 1-billion which gives a pretty broad range -// and yet should not run afoul of any sign issues. -// -#define MAX_NORMAL_TEMP 1000000000 -#define HF_TEMP_RANGE MAX_NORMAL_TEMP - - -// -// These used to be defines of the hard coded values. But if -// we're on an cooperative fusion (CF) system we need to change -// the values (which happens in hfs_recording_init() -// -uint32_t hfc_default_file_count = 1000; -uint32_t hfc_default_duration = (3600 * 60); -uint32_t hfc_max_file_count = 5000; -uint64_t hfc_max_file_size = (10 * 1024 * 1024); - - -/* - * Hot File Recording Data (runtime). 
- */ -typedef struct hotfile_data { - struct hfsmount *hfsmp; - long refcount; - u_int32_t activefiles; /* active number of hot files */ - u_int32_t threshold; - u_int32_t maxblocks; - hotfile_entry_t *rootentry; - hotfile_entry_t *freelist; - hotfile_entry_t *coldest; - hotfile_entry_t entries[1]; -} hotfile_data_t; - -static int hfs_recording_start (struct hfsmount *); -static int hfs_recording_stop (struct hfsmount *); - -/* Hotfiles pinning routines */ -static int hfs_getvnode_and_pin (struct hfsmount *hfsmp, uint32_t fileid, uint32_t *pinned); -static int hfs_pin_extent_record (struct hfsmount *hfsmp, HFSPlusExtentRecord extents, uint32_t *pinned); -static int hfs_pin_catalog_rec (struct hfsmount *hfsmp, HFSPlusCatalogFile *cfp, int rsrc); - -/* - * Hot File Data recording functions (in-memory binary tree). - */ -static int hf_insert (hotfile_data_t *, hotfile_entry_t *); -static void hf_delete (hotfile_data_t *, u_int32_t, u_int32_t); -static hotfile_entry_t * hf_coldest (hotfile_data_t *); -static hotfile_entry_t * hf_getnewentry (hotfile_data_t *); -static void hf_getsortedlist (hotfile_data_t *, hotfilelist_t *); - -#if HFC_DEBUG -static hotfile_entry_t * hf_lookup (hotfile_data_t *, u_int32_t, u_int32_t); -static void hf_maxdepth(hotfile_entry_t *, int, int *); -static void hf_printtree (hotfile_entry_t *); -#endif - -/* - * Hot File misc support functions. - */ -static int hotfiles_collect (struct hfsmount *); -static int hotfiles_age (struct hfsmount *); -static int hotfiles_adopt (struct hfsmount *, vfs_context_t); -static int hotfiles_evict (struct hfsmount *, vfs_context_t); -static int hotfiles_refine (struct hfsmount *); -static int hotextents(struct hfsmount *, HFSPlusExtentDescriptor *); -static int hfs_addhotfile_internal(struct vnode *); -static int hfs_hotfile_cur_freeblks(hfsmount_t *hfsmp); - - -/* - * Hot File Cluster B-tree (on disk) functions. 
- */ -static int hfc_btree_create (struct hfsmount *, unsigned int, unsigned int); -static int hfc_btree_open (struct hfsmount *, struct vnode **); -static int hfc_btree_open_ext(struct hfsmount *hfsmp, struct vnode **vpp, int ignore_btree_errs); -static int hfc_btree_close (struct hfsmount *, struct vnode *); -static int hfc_btree_delete_record(struct hfsmount *hfsmp, BTreeIterator *iterator, HotFileKey *key); -static int hfc_btree_delete(struct hfsmount *hfsmp); -static int hfc_comparekeys (HotFileKey *, HotFileKey *); - - -char hfc_tag[] = "CLUSTERED HOT FILES B-TREE "; - - -/* - *======================================================================== - * HOT FILE INTERFACE ROUTINES - *======================================================================== - */ - -/* - * Start recording the hottest files on a file system. - * - * Requires that the hfc_mutex be held. - */ -static int -hfs_recording_start(struct hfsmount *hfsmp) -{ - hotfile_data_t *hotdata; - struct timeval tv; - int maxentries; - size_t size; - int i; - int error; - - if ((hfsmp->hfs_flags & HFS_READ_ONLY) || - (hfsmp->jnl == NULL) || - (hfsmp->hfs_flags & HFS_METADATA_ZONE) == 0) { - return (EPERM); - } - if (HFSTOVCB(hfsmp)->freeBlocks < (2 * (u_int32_t)hfsmp->hfs_hotfile_maxblks)) { - return (ENOSPC); - } - if (hfsmp->hfc_stage != HFC_IDLE) { - return (EBUSY); - } - hfsmp->hfc_stage = HFC_BUSY; - - /* - * Dump previous recording data. - */ - if (hfsmp->hfc_recdata) { - void * tmp; - - tmp = hfsmp->hfc_recdata; - hfsmp->hfc_recdata = NULL; - FREE(tmp, M_TEMP); - } - - microtime(&tv); /* Times are base on GMT time. */ - - /* - * On first startup check for suspended recording. 
- */ - if (hfsmp->hfc_timebase == 0 && - hfc_btree_open(hfsmp, &hfsmp->hfc_filevp) == 0) { - HotFilesInfo hotfileinfo; - - if ((BTGetUserData(VTOF(hfsmp->hfc_filevp), &hotfileinfo, - sizeof(hotfileinfo)) == 0) && - (SWAP_BE32 (hotfileinfo.magic) == HFC_MAGIC) && - (SWAP_BE32 (hotfileinfo.timeleft) > 0) && - (SWAP_BE32 (hotfileinfo.timebase) > 0)) { - if (hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) { - if (hfsmp->hfs_hotfile_freeblks == 0) { - hfsmp->hfs_hotfile_freeblks = hfsmp->hfs_hotfile_maxblks - SWAP_BE32 (hotfileinfo.usedblocks); - } - hfsmp->hfc_maxfiles = 0x7fffffff; - printf("hfs: %s: %s: hotfile freeblocks: %d, max: %d\n", hfsmp->vcbVN, __FUNCTION__, - hfsmp->hfs_hotfile_freeblks, hfsmp->hfs_hotfile_maxblks); - } else { - hfsmp->hfc_maxfiles = SWAP_BE32 (hotfileinfo.maxfilecnt); - } - hfsmp->hfc_timebase = SWAP_BE32 (hotfileinfo.timebase); - int timeleft = (int)SWAP_BE32(hotfileinfo.timeleft); - if (timeleft < 0 || timeleft > (int)(HFC_DEFAULT_DURATION*2)) { - // in case this field got botched, don't let it screw things up - // printf("hfs: hotfiles: bogus looking timeleft: %d\n", timeleft); - timeleft = HFC_DEFAULT_DURATION; - } - hfsmp->hfc_timeout = timeleft + tv.tv_sec ; - /* Fix up any bogus timebase values. */ - if (hfsmp->hfc_timebase < HFC_MIN_BASE_TIME) { - hfsmp->hfc_timebase = hfsmp->hfc_timeout - HFC_DEFAULT_DURATION; - } -#if HFC_VERBOSE - printf("hfs: Resume recording hot files on %s (%d secs left (%d); timeout %ld)\n", - hfsmp->vcbVN, SWAP_BE32 (hotfileinfo.timeleft), timeleft, hfsmp->hfc_timeout - tv.tv_sec); -#endif - } else { - hfsmp->hfc_maxfiles = HFC_DEFAULT_FILE_COUNT; - hfsmp->hfc_timebase = tv.tv_sec + 1; - hfsmp->hfc_timeout = hfsmp->hfc_timebase + HFC_DEFAULT_DURATION; - } - (void) hfc_btree_close(hfsmp, hfsmp->hfc_filevp); - hfsmp->hfc_filevp = NULL; - } else { - struct cat_attr cattr; - u_int32_t cnid; - - /* - * Make sure a btree file exists. 
- */ - cnid = GetFileInfo(HFSTOVCB(hfsmp), kRootDirID, HFC_FILENAME, &cattr, NULL); - if ((cnid == 0) && - !S_ISREG(cattr.ca_mode) && - (error = hfc_btree_create(hfsmp, HFSTOVCB(hfsmp)->blockSize, HFC_DEFAULT_FILE_COUNT))) { - hfsmp->hfc_stage = HFC_IDLE; - wakeup((caddr_t)&hfsmp->hfc_stage); - return (error); - } -#if HFC_VERBOSE - printf("hfs: begin recording hot files on %s (hotfile start/end block: %d - %d; max/free: %d/%d; maxfiles: %d)\n", - hfsmp->vcbVN, - hfsmp->hfs_hotfile_start, hfsmp->hfs_hotfile_end, - hfsmp->hfs_hotfile_maxblks, hfsmp->hfs_hotfile_freeblks, hfsmp->hfc_maxfiles); -#endif - hfsmp->hfc_maxfiles = HFC_DEFAULT_FILE_COUNT; - hfsmp->hfc_timeout = tv.tv_sec + HFC_DEFAULT_DURATION; - - /* Reset time base. */ - if (hfsmp->hfc_timebase == 0) { - hfsmp->hfc_timebase = tv.tv_sec + 1; - } else { - time_t cumulativebase; - - cumulativebase = hfsmp->hfc_timeout - (HFC_CUMULATIVE_CYCLES * HFC_DEFAULT_DURATION); - hfsmp->hfc_timebase = MAX(hfsmp->hfc_timebase, cumulativebase); - } - } - - if ((hfsmp->hfc_maxfiles == 0) || - (hfsmp->hfc_maxfiles > HFC_MAXIMUM_FILE_COUNT)) { - hfsmp->hfc_maxfiles = HFC_DEFAULT_FILE_COUNT; - } - maxentries = hfsmp->hfc_maxfiles; - - size = sizeof(hotfile_data_t) + (maxentries * sizeof(hotfile_entry_t)); - MALLOC(hotdata, hotfile_data_t *, size, M_TEMP, M_WAITOK); - if (hotdata == NULL) { - hfsmp->hfc_recdata = NULL; - hfsmp->hfc_stage = HFC_IDLE; - wakeup((caddr_t)&hfsmp->hfc_stage); - return(ENOMEM); - } - - bzero(hotdata, size); - - for (i = 1; i < maxentries ; i++) - hotdata->entries[i-1].right = &hotdata->entries[i]; - - hotdata->freelist = &hotdata->entries[0]; - /* - * Establish minimum temperature and maximum file size. 
- */ - hotdata->threshold = HFC_MINIMUM_TEMPERATURE; - hotdata->maxblocks = HFC_MAXIMUM_FILESIZE / HFSTOVCB(hfsmp)->blockSize; - hotdata->hfsmp = hfsmp; - - hfsmp->hfc_recdata = hotdata; - hfsmp->hfc_stage = HFC_RECORDING; - wakeup((caddr_t)&hfsmp->hfc_stage); - return (0); -} - -/* - * Stop recording the hotest files on a file system. - * - * Requires that the hfc_mutex be held. - */ -static int -hfs_recording_stop(struct hfsmount *hfsmp) -{ - hotfile_data_t *hotdata; - hotfilelist_t *listp; - struct timeval tv; - size_t size; - enum hfc_stage newstage = HFC_IDLE; - int error; - - if (hfsmp->hfc_stage != HFC_RECORDING) - return (EPERM); - - hfsmp->hfc_stage = HFC_BUSY; - - hotfiles_collect(hfsmp); - - - /* - * Convert hot file data into a simple file id list.... - * - * then dump the sample data - */ -#if HFC_VERBOSE - printf("hfs: end of hot file recording on %s\n", hfsmp->vcbVN); -#endif - hotdata = (hotfile_data_t *)hfsmp->hfc_recdata; - if (hotdata == NULL) - return (0); - hfsmp->hfc_recdata = NULL; - hfsmp->hfc_stage = HFC_EVALUATION; - wakeup((caddr_t)&hfsmp->hfc_stage); - -#if HFC_VERBOSE - printf("hfs: curentries: %d\n", hotdata->activefiles); -#endif - /* - * If no hot files recorded then we're done. - */ - if (hotdata->rootentry == NULL) { - error = 0; - goto out; - } - - /* Open the B-tree file for writing... */ - if (hfsmp->hfc_filevp) - panic("hfs_recording_stop: hfc_filevp exists (vp = %p)", hfsmp->hfc_filevp); - - error = hfc_btree_open(hfsmp, &hfsmp->hfc_filevp); - if (error) { - goto out; - } - - /* - * Age the previous set of clustered hot files. - */ - error = hotfiles_age(hfsmp); - if (error) { - (void) hfc_btree_close(hfsmp, hfsmp->hfc_filevp); - hfsmp->hfc_filevp = NULL; - goto out; - } - - /* - * Create a sorted list of hotest files. 
- */ - size = sizeof(hotfilelist_t); - size += sizeof(hotfileinfo_t) * (hotdata->activefiles - 1); - MALLOC(listp, hotfilelist_t *, size, M_TEMP, M_WAITOK); - if (listp == NULL) { - error = ENOMEM; - (void) hfc_btree_close(hfsmp, hfsmp->hfc_filevp); - hfsmp->hfc_filevp = NULL; - goto out; - } - - bzero(listp, size); - - hf_getsortedlist(hotdata, listp); /* NOTE: destroys hot file tree! */ - microtime(&tv); - listp->hfl_duration = tv.tv_sec - hfsmp->hfc_timebase; - hfsmp->hfc_recdata = listp; - - /* - * Account for duplicates. - */ - error = hotfiles_refine(hfsmp); - if (error) { - (void) hfc_btree_close(hfsmp, hfsmp->hfc_filevp); - hfsmp->hfc_filevp = NULL; - goto out; - } - - /* - * Compute the amount of space to reclaim... - */ - if (listp->hfl_totalblocks > hfs_hotfile_cur_freeblks(hfsmp)) { - listp->hfl_reclaimblks = - MIN(listp->hfl_totalblocks, hfsmp->hfs_hotfile_maxblks) - - hfsmp->hfs_hotfile_freeblks; -#if HFC_VERBOSE - printf("hfs_recording_stop: need to reclaim %d blocks\n", listp->hfl_reclaimblks); -#endif - if (listp->hfl_reclaimblks) - newstage = HFC_EVICTION; - else - newstage = HFC_ADOPTION; - } else { - newstage = HFC_ADOPTION; - } - - if (newstage == HFC_ADOPTION && listp->hfl_totalblocks == 0) { - (void) hfc_btree_close(hfsmp, hfsmp->hfc_filevp); - hfsmp->hfc_filevp = NULL; - newstage = HFC_IDLE; - } -out: -#if HFC_VERBOSE - if (newstage == HFC_EVICTION) - printf("hfs: evicting coldest files\n"); - else if (newstage == HFC_ADOPTION) - printf("hfs: adopting hotest files\n"); -#endif - FREE(hotdata, M_TEMP); - - hfsmp->hfc_stage = newstage; - wakeup((caddr_t)&hfsmp->hfc_stage); - return (error); -} - -static void -save_btree_user_info(struct hfsmount *hfsmp) -{ - HotFilesInfo hotfileinfo; - struct timeval tv; - - microtime(&tv); - hotfileinfo.magic = SWAP_BE32 (HFC_MAGIC); - hotfileinfo.version = SWAP_BE32 (HFC_VERSION); - hotfileinfo.duration = SWAP_BE32 (HFC_DEFAULT_DURATION); - hotfileinfo.timebase = SWAP_BE32 (hfsmp->hfc_timebase); - 
hotfileinfo.timeleft = SWAP_BE32 (hfsmp->hfc_timeout - tv.tv_sec); - hotfileinfo.threshold = SWAP_BE32 (HFC_MINIMUM_TEMPERATURE); - hotfileinfo.maxfileblks = SWAP_BE32 (HFC_MAXIMUM_FILESIZE / HFSTOVCB(hfsmp)->blockSize); - if (hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) { - hotfileinfo.usedblocks = SWAP_BE32 (hfsmp->hfs_hotfile_maxblks - hfs_hotfile_cur_freeblks(hfsmp)); -#if HFC_VERBOSE - printf("hfs: %s: saving usedblocks = %d (timeleft: %d; timeout %ld)\n", hfsmp->vcbVN, (hfsmp->hfs_hotfile_maxblks - hfsmp->hfs_hotfile_freeblks), - SWAP_BE32(hotfileinfo.timeleft), hfsmp->hfc_timeout); -#endif - } else { - hotfileinfo.maxfilecnt = SWAP_BE32 (HFC_DEFAULT_FILE_COUNT); - } - strlcpy((char *)hotfileinfo.tag, hfc_tag, sizeof hotfileinfo.tag); - (void) BTSetUserData(VTOF(hfsmp->hfc_filevp), &hotfileinfo, sizeof(hotfileinfo)); -} - -/* - * Suspend recording the hotest files on a file system. - */ -int -hfs_recording_suspend(struct hfsmount *hfsmp) -{ - hotfile_data_t *hotdata = NULL; - int error; - - if (hfsmp->hfc_stage == HFC_DISABLED) - return (0); - - lck_mtx_lock(&hfsmp->hfc_mutex); - - /* - * XXX NOTE - * A suspend can occur during eval/evict/adopt stage. - * In that case we would need to write out info and - * flush our HFBT vnode. Currently we just bail. 
- */ - - hotdata = (hotfile_data_t *)hfsmp->hfc_recdata; - if (hotdata == NULL || hfsmp->hfc_stage != HFC_RECORDING) { - error = 0; - goto out; - } - hfsmp->hfc_stage = HFC_BUSY; - -#if HFC_VERBOSE - printf("hfs: suspend hot file recording on %s\n", hfsmp->vcbVN); -#endif - error = hfc_btree_open(hfsmp, &hfsmp->hfc_filevp); - if (error) { - printf("hfs_recording_suspend: err %d opening btree\n", error); - goto out; - } - - if (hfs_start_transaction(hfsmp) != 0) { - goto out; - } - if (hfs_lock(VTOC(hfsmp->hfc_filevp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT) != 0) { - goto end_transaction; - } - - save_btree_user_info(hfsmp); - - hfs_unlock(VTOC(hfsmp->hfc_filevp)); - -end_transaction: - hfs_end_transaction(hfsmp); - -out: - if (hfsmp->hfc_filevp) { - (void) hfc_btree_close(hfsmp, hfsmp->hfc_filevp); - hfsmp->hfc_filevp = NULL; - } - if (hotdata) { - FREE(hotdata, M_TEMP); - hfsmp->hfc_recdata = NULL; - } - hfsmp->hfc_stage = HFC_DISABLED; - wakeup((caddr_t)&hfsmp->hfc_stage); - - lck_mtx_unlock(&hfsmp->hfc_mutex); - return (error); -} - - -static void -reset_file_ids(struct hfsmount *hfsmp, uint32_t *fileid_table, int num_ids) -{ - int i, error; - - for(i=0; i < num_ids; i++) { - struct vnode *vp; - - error = hfs_vget(hfsmp, fileid_table[i], &vp, 0, 0); - if (error) { - if (error == ENOENT) { - error = 0; - continue; /* stale entry, go to next */ - } - continue; - } - - // hfs_vget returns a locked cnode so no need to lock here - - if ((hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) && (VTOC(vp)->c_attr.ca_recflags & kHFSFastDevPinnedMask)) { - error = hfs_pin_vnode(hfsmp, vp, HFS_UNPIN_IT, NULL, vfs_context_kernel()); - } - - /* - * The updates to the catalog must be journaled - */ - hfs_start_transaction(hfsmp); - - // - // turn off _all_ the hotfile related bits since we're resetting state - // - if (VTOC(vp)->c_attr.ca_recflags & kHFSFastDevCandidateMask) { - vnode_clearfastdevicecandidate(vp); - } - - VTOC(vp)->c_attr.ca_recflags &= 
~(kHFSFastDevPinnedMask|kHFSDoNotFastDevPinMask|kHFSFastDevCandidateMask|kHFSAutoCandidateMask); - VTOC(vp)->c_flag |= C_MODIFIED; - - hfs_update(vp, 0); - - hfs_end_transaction(hfsmp); - - hfs_unlock(VTOC(vp)); - vnode_put(vp); - } -} - -static int -flag_hotfile(struct hfsmount *hfsmp, const char *filename) -{ - struct vnode *dvp = NULL, *fvp = NULL; - vfs_context_t ctx = vfs_context_kernel(); - struct componentname cname; - int error=0; - size_t fname_len; - const char *orig_fname = filename; - - if (filename == NULL) { - return EINVAL; - } - - fname_len = strlen(filename); // do NOT include the trailing '\0' so that we break out of the loop below - - error = VFS_ROOT(HFSTOVFS(hfsmp), &dvp, ctx); - if (error) { - return (error); - } - - /* At this point, 'dvp' must be considered iocounted */ - const char *ptr; - ptr = filename; - - while (ptr < (orig_fname + fname_len - 1)) { - for(; ptr < (orig_fname + fname_len) && *ptr && *ptr != '/'; ptr++) { - /* just keep advancing till we reach the end of the string or a slash */ - } - - cname.cn_nameiop = LOOKUP; - cname.cn_flags = ISLASTCN; - cname.cn_context = ctx; - cname.cn_ndp = NULL; - cname.cn_pnbuf = __DECONST(char *, orig_fname); - cname.cn_nameptr = __DECONST(char *, filename); - cname.cn_pnlen = fname_len; - cname.cn_namelen = ptr - filename; - cname.cn_hash = 0; - cname.cn_consume = 0; - - error = VNOP_LOOKUP(dvp, &fvp, &cname, ctx); - if (error) { - /* - * If 'dvp' is non-NULL, then it has an iocount. Make sure to release it - * before bailing out. VNOP_LOOKUP could legitimately return ENOENT - * if the item didn't exist or if we raced with a delete. 
- */ - if (dvp) { - vnode_put(dvp); - dvp = NULL; - } - return error; - } - - if (ptr < orig_fname + fname_len - 1) { - // - // we've got a multi-part pathname so drop the ref on the dir, - // make dvp become what we just looked up, and advance over - // the slash character in the pathname to get to the next part - // of the component - // - vnode_put(dvp); - dvp = fvp; - fvp = NULL; - - filename = ++ptr; // skip the slash character - } - } - - if (fvp == NULL) { - error = ENOENT; - goto out; - } - - struct cnode *cp = VTOC(fvp); - if ((error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT)) != 0) { - goto out; - } - - hfs_start_transaction(hfsmp); - - cp->c_attr.ca_recflags |= (kHFSFastDevCandidateMask|kHFSAutoCandidateMask); - cp->c_flag |= C_MODIFIED; - - hfs_update(fvp, 0); - - hfs_end_transaction(hfsmp); - - hfs_unlock(cp); - //printf("hfs: flagged /%s with the fast-dev-candidate|auto-candidate flags\n", filename); - - -out: - if (fvp) { - vnode_put(fvp); - fvp = NULL; - } - - if (dvp) { - vnode_put(dvp); - dvp = NULL; - } - - return error; -} - - -static void -hfs_setup_default_cf_hotfiles(struct hfsmount *hfsmp) -{ - const char *system_default_hotfiles[] = { - "usr", - "System", - "Applications", - "private/var/db/dyld" - }; - int i; - - for(i=0; i < (int)(sizeof(system_default_hotfiles)/sizeof(char *)); i++) { - flag_hotfile(hfsmp, system_default_hotfiles[i]); - } -} - - -#define NUM_FILE_RESET_IDS 4096 // so we allocate 16k to hold file-ids - -static void -hfs_hotfile_reset(struct hfsmount *hfsmp) -{ - CatalogKey * keyp; - CatalogRecord * datap; - u_int32_t dataSize; - BTScanState scanstate; - BTreeIterator * iterator = NULL; - FSBufferDescriptor record; - u_int32_t data; - u_int32_t cnid; - int error = 0; - uint32_t *fileids=NULL; - int cur_id_index = 0; - - int cleared = 0; /* debug variables */ - int filecount = 0; - int dircount = 0; - -#if HFC_VERBOSE - printf("hfs: %s: %s\n", hfsmp->vcbVN, __FUNCTION__); -#endif - - MALLOC(iterator, BTreeIterator 
*, sizeof(*iterator), M_TEMP, M_WAITOK); - if (iterator == NULL) { - error = ENOMEM; - goto out; - } - bzero(iterator, sizeof(*iterator)); - - MALLOC(fileids, uint32_t *, NUM_FILE_RESET_IDS * sizeof(uint32_t), M_TEMP, M_WAITOK); - if (fileids == NULL) { - error = ENOMEM; - goto out; - } - - record.bufferAddress = &data; - record.itemSize = sizeof(u_int32_t); - record.itemCount = 1; - - /* - * Get ready to scan the Catalog file. - */ - error = BTScanInitialize(VTOF(HFSTOVCB(hfsmp)->catalogRefNum), 0, 0, 0, - kCatSearchBufferSize, &scanstate); - if (error) { - printf("hfs_hotfile_reset: err %d BTScanInit\n", error); - goto out; - } - - /* - * Visit all the catalog btree leaf records, clearing any that have the - * HotFileCached bit set. - */ - for (;;) { - error = BTScanNextRecord(&scanstate, 0, (void **)&keyp, (void **)&datap, &dataSize); - if (error) { - if (error == btNotFound) - error = 0; - else - printf("hfs_hotfile_reset: err %d BTScanNext\n", error); - break; - } - - if (datap->recordType == kHFSPlusFolderRecord && (dataSize == sizeof(HFSPlusCatalogFolder))) { - HFSPlusCatalogFolder *dirp = (HFSPlusCatalogFolder *)datap; - - dircount++; - - if ((dirp->flags & (kHFSFastDevPinnedMask|kHFSDoNotFastDevPinMask|kHFSFastDevCandidateMask|kHFSAutoCandidateMask)) == 0) { - continue; - } - - cnid = dirp->folderID; - } else if ((datap->recordType == kHFSPlusFileRecord) && (dataSize == sizeof(HFSPlusCatalogFile))) { - HFSPlusCatalogFile *filep = (HFSPlusCatalogFile *)datap; - - filecount++; - - /* - * If the file doesn't have any of the HotFileCached bits set, ignore it. - */ - if ((filep->flags & (kHFSFastDevPinnedMask|kHFSDoNotFastDevPinMask|kHFSFastDevCandidateMask|kHFSAutoCandidateMask)) == 0) { - continue; - } - - cnid = filep->fileID; - } else { - continue; - } - - /* Skip over journal files. */ - if (cnid == hfsmp->hfs_jnlfileid || cnid == hfsmp->hfs_jnlinfoblkid) { - continue; - } - - // - // Just record the cnid of the file for now. 
We will modify it separately - // because we can't modify the catalog while we're scanning it. - // - fileids[cur_id_index++] = cnid; - if (cur_id_index >= NUM_FILE_RESET_IDS) { - // - // We're over the limit of file-ids so we have to terminate this - // scan, go modify all the catalog records, then restart the scan. - // This is required because it's not permissible to modify the - // catalog while scanning it. - // - (void) BTScanTerminate(&scanstate, &data, &data, &data); - - reset_file_ids(hfsmp, fileids, cur_id_index); - cleared += cur_id_index; - cur_id_index = 0; - - // restart the scan - error = BTScanInitialize(VTOF(HFSTOVCB(hfsmp)->catalogRefNum), 0, 0, 0, - kCatSearchBufferSize, &scanstate); - if (error) { - printf("hfs_hotfile_reset: err %d BTScanInit\n", error); - goto out; - } - continue; - } - } - - if (cur_id_index) { - reset_file_ids(hfsmp, fileids, cur_id_index); - cleared += cur_id_index; - cur_id_index = 0; - } - - printf("hfs: cleared HotFileCache related bits on %d files out of %d (dircount %d)\n", cleared, filecount, dircount); - - (void) BTScanTerminate(&scanstate, &data, &data, &data); - -out: - if (fileids) - FREE(fileids, M_TEMP); - - if (iterator) - FREE(iterator, M_TEMP); - - // - // If the hotfile btree exists, delete it. We need to open - // it to be able to delete it because we need the hfc_filevp - // for deletion. - // - error = hfc_btree_open_ext(hfsmp, &hfsmp->hfc_filevp, 1); - if (!error) { - printf("hfs: hotfile_reset: deleting existing hotfile btree\n"); - hfc_btree_delete(hfsmp); - } - - if (hfsmp->hfc_filevp) { - (void) hfc_btree_close(hfsmp, hfsmp->hfc_filevp); - hfsmp->hfc_filevp = NULL; - } - - hfsmp->hfs_hotfile_blk_adjust = 0; - hfsmp->hfs_hotfile_freeblks = hfsmp->hfs_hotfile_maxblks; -} - - -// -// This should ONLY be called by hfs_recording_init() and the special fsctl. -// -// We assume that the hotfile btree is already opened. 
-// -static int -hfs_hotfile_repin_files(struct hfsmount *hfsmp) -{ - BTreeIterator * iterator = NULL; - HotFileKey * key; - filefork_t * filefork; - int error = 0; - int bt_op; - enum hfc_stage stage; - uint32_t pinned_blocks; - uint32_t num_files=0, nrsrc=0; - uint32_t total_pinned=0; - - if (!(hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) || !hfsmp->hfc_filevp) { - // - // this is only meaningful if we're pinning hotfiles - // (as opposed to the regular form of hotfiles that - // get relocated to the hotfile zone) - // - return 0; - } - -#if HFC_VERBOSE - printf("hfs: %s: %s\n", hfsmp->vcbVN, __FUNCTION__); -#endif - - if (hfs_lock(VTOC(hfsmp->hfc_filevp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT) != 0) { - return (EPERM); - } - - - MALLOC(iterator, BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK); - if (iterator == NULL) { - hfs_unlock(VTOC(hfsmp->hfc_filevp)); - return (ENOMEM); - } - - stage = hfsmp->hfc_stage; - hfsmp->hfc_stage = HFC_BUSY; - - bt_op = kBTreeFirstRecord; - - bzero(iterator, sizeof(*iterator)); - key = (HotFileKey*) &iterator->key; - - filefork = VTOF(hfsmp->hfc_filevp); - int lockflags; - - while (1) { - - lockflags = 0; - /* - * Obtain the first record (ie the coldest one). - */ - if (BTIterateRecord(filefork, bt_op, iterator, NULL, NULL) != 0) { - // no more records - error = 0; - break; - } - if (key->keyLength != HFC_KEYLENGTH) { - // printf("hfs: hotfiles_repin_files: invalid key length %d\n", key->keyLength); - error = EFTYPE; - break; - } - if (key->temperature == HFC_LOOKUPTAG) { - // ran into thread records in the hotfile btree - error = 0; - break; - } - - // - // Just lookup the records in the catalog and pin the direct - // mapped extents. Faster than instantiating full vnodes - // (and thereby thrashing the system vnode cache). 
- // - struct cat_desc fdesc; - struct cat_attr attr; - struct cat_fork fork; - uint8_t forktype = 0; - - lockflags = hfs_systemfile_lock(hfsmp, (SFL_CATALOG | SFL_EXTENTS), HFS_SHARED_LOCK); - /* - * Snoop the cnode hash to find out if the item we want is in-core already. - * - * We largely expect this function to fail (the items we want are probably not in the hash). - * we use the special variant which bails out as soon as it finds a vnode (even if it is - * marked as open-unlinked or actually removed on-disk. If we find a vnode, then we - * release the systemfile locks and go through the pin-vnode path instead. - */ - if (hfs_chash_snoop (hfsmp, key->fileID, 1, NULL, NULL) == 0) { - pinned_blocks = 0; - - /* unlock immediately and go through the in-core path */ - hfs_systemfile_unlock(hfsmp, lockflags); - lockflags = 0; - - error = hfs_getvnode_and_pin (hfsmp, key->fileID, &pinned_blocks); - if (error) { - /* if ENOENT, then it was deleted in the catalog. Remove from our hotfiles tracking */ - if (error == ENOENT) { - hfc_btree_delete_record(hfsmp, iterator, key); - } - /* other errors, just ignore and move on with life */ - } - else { //!error - total_pinned += pinned_blocks; - num_files++; - } - - goto next; - } - - /* If we get here, we're still holding the systemfile locks */ - error = cat_idlookup(hfsmp, key->fileID, 1, 0, &fdesc, &attr, &fork); - if (error) { - // - // this file system could have been mounted while booted from a - // different partition and thus the hotfile btree would not have - // been maintained. thus a file that was hotfile cached could - // have been deleted while booted from a different partition which - // means we need to delete it from the hotfile btree. - // - // block accounting is taken care of at the end: we re-assign - // hfsmp->hfs_hotfile_freeblks based on how many blocks we actually - // pinned. 
- // - hfc_btree_delete_record(hfsmp, iterator, key); - - goto next; - } - - if (fork.cf_size == 0) { - // hmmm, the data is probably in the resource fork (aka a compressed file) - error = cat_idlookup(hfsmp, key->fileID, 1, 1, &fdesc, &attr, &fork); - if (error) { - hfc_btree_delete_record(hfsmp, iterator, key); - goto next; - } - forktype = 0xff; - nrsrc++; - } - - pinned_blocks = 0; - - /* Can't release the catalog /extents lock yet, we may need to go find the overflow blocks */ - error = hfs_pin_extent_record (hfsmp, fork.cf_extents, &pinned_blocks); - if (error) { - goto next; //skip to next - } - /* add in the blocks from the inline 8 */ - total_pinned += pinned_blocks; - pinned_blocks = 0; - - /* Could this file have overflow extents? */ - if (fork.cf_extents[kHFSPlusExtentDensity-1].startBlock) { - /* better pin them, too */ - error = hfs_pin_overflow_extents (hfsmp, key->fileID, forktype, &pinned_blocks); - if (error) { - /* If we fail to pin all of the overflow extents, then just skip to the next file */ - goto next; - } - } - - num_files++; - if (pinned_blocks) { - /* now add in any overflow also */ - total_pinned += pinned_blocks; - } - - next: - if (lockflags) { - hfs_systemfile_unlock(hfsmp, lockflags); - lockflags = 0; - } - bt_op = kBTreeNextRecord; - - } /* end while */ - -#if HFC_VERBOSE - printf("hfs: hotfiles_repin_files: re-pinned %d files (nrsrc %d, total pinned %d blks; freeblock %d, maxblocks %d, calculated free: %d)\n", - num_files, nrsrc, total_pinned, hfsmp->hfs_hotfile_freeblks, hfsmp->hfs_hotfile_maxblks, - hfsmp->hfs_hotfile_maxblks - total_pinned); -#endif - // - // make sure this is accurate based on how many blocks we actually pinned - // - hfsmp->hfs_hotfile_freeblks = hfsmp->hfs_hotfile_maxblks - total_pinned; - - hfs_unlock(VTOC(hfsmp->hfc_filevp)); - - FREE(iterator, M_TEMP); - hfsmp->hfc_stage = stage; - wakeup((caddr_t)&hfsmp->hfc_stage); - return (error); -} - -void -hfs_repin_hotfiles(struct hfsmount *hfsmp) -{ - int error, 
need_close; - - lck_mtx_lock(&hfsmp->hfc_mutex); - - if (hfsmp->hfc_filevp == NULL) { - error = hfc_btree_open(hfsmp, &hfsmp->hfc_filevp); - if (!error) { - need_close = 1; - } else { - printf("hfs: failed to open the btree err=%d. Unable to re-pin hotfiles.\n", error); - lck_mtx_unlock(&hfsmp->hfc_mutex); - return; - } - } else { - need_close = 0; - } - - hfs_pin_vnode(hfsmp, hfsmp->hfc_filevp, HFS_PIN_IT, NULL, vfs_context_kernel()); - - hfs_hotfile_repin_files(hfsmp); - - if (need_close) { - (void) hfc_btree_close(hfsmp, hfsmp->hfc_filevp); - hfsmp->hfc_filevp = NULL; - } - - lck_mtx_unlock(&hfsmp->hfc_mutex); -} - -/* - * For a given file ID, find and pin all of its overflow extents to the underlying CS - * device. Assumes that the extents overflow b-tree is locked for the duration of this call. - * - * Emit the number of blocks pinned in output argument 'pinned' - * - * Return success or failure (errno) in return value. - * - */ -int hfs_pin_overflow_extents (struct hfsmount *hfsmp, uint32_t fileid, - uint8_t forktype, uint32_t *pinned) { - - struct BTreeIterator *ext_iter = NULL; - ExtentKey *ext_key_ptr = NULL; - ExtentRecord ext_data; - FSBufferDescriptor btRecord; - uint16_t btRecordSize; - int error = 0; - - uint32_t pinned_blocks = 0; - - - MALLOC (ext_iter, struct BTreeIterator*, sizeof (struct BTreeIterator), M_TEMP, M_WAITOK); - if (ext_iter == NULL) { - return ENOMEM; - } - bzero (ext_iter, sizeof(*ext_iter)); - - BTInvalidateHint (ext_iter); - ext_key_ptr = (ExtentKey*)&ext_iter->key; - btRecord.bufferAddress = &ext_data; - btRecord.itemCount = 1; - - /* - * This is like when you delete a file; we don't actually need most of the search machinery because - * we are going to need all of the extent records that belong to this file (for a given fork type), - * so we might as well use a straight-up iterator. 
- * - * Position the B-Tree iterator at the first record with this file ID - */ - btRecord.itemSize = sizeof (HFSPlusExtentRecord); - ext_key_ptr->hfsPlus.keyLength = kHFSPlusExtentKeyMaximumLength; - ext_key_ptr->hfsPlus.forkType = forktype; - ext_key_ptr->hfsPlus.pad = 0; - ext_key_ptr->hfsPlus.fileID = fileid; - ext_key_ptr->hfsPlus.startBlock = 0; - - error = BTSearchRecord (VTOF(hfsmp->hfs_extents_vp), ext_iter, &btRecord, &btRecordSize, ext_iter); - if (error == btNotFound) { - /* empty b-tree, so that's ok. we'll fall out during error check below. */ - error = 0; - } - - while (1) { - uint32_t found_fileid; - uint32_t pblocks; - - error = BTIterateRecord (VTOF(hfsmp->hfs_extents_vp), kBTreeNextRecord, ext_iter, &btRecord, &btRecordSize); - if (error) { - /* swallow it if it's btNotFound, otherwise just bail out */ - if (error == btNotFound) - error = 0; - break; - } - - found_fileid = ext_key_ptr->hfsPlus.fileID; - /* - * We only do one fork type at a time. So if either the fork-type doesn't - * match what we are looking for (resource or data), OR the file id doesn't match - * which indicates that there's nothing more with this file ID as the key, then bail out - */ - if ((found_fileid != fileid) || (ext_key_ptr->hfsPlus.forkType != forktype)) { - error = 0; - break; - } - - /* Otherwise, we now have an extent record. Process and pin all of the file extents. 
*/ - pblocks = 0; - error = hfs_pin_extent_record (hfsmp, ext_data.hfsPlus, &pblocks); - - if (error) { - break; - } - pinned_blocks += pblocks; - - /* if 8th extent is empty, then bail out */ - if (ext_data.hfsPlus[kHFSPlusExtentDensity-1].startBlock == 0) { - error = 0; - break; - } - - } // end extent-getting loop - - /* dump the iterator */ - FREE (ext_iter, M_TEMP); - - if (error == 0) { - /* - * In the event that the file has no overflow extents, pinned_blocks - * will never be updated, so we'll properly export 0 pinned blocks to caller - */ - *pinned = pinned_blocks; - } - - return error; - -} - - -static int -hfs_getvnode_and_pin (struct hfsmount *hfsmp, uint32_t fileid, uint32_t *pinned) { - struct vnode *vp; - int error = 0; - *pinned = 0; - uint32_t pblocks; - - /* - * Acquire the vnode for this file. This returns a locked cnode on success - */ - error = hfs_vget(hfsmp, fileid, &vp, 0, 0); - if (error) { - /* It's possible the file was open-unlinked. In this case, we'll get ENOENT back. */ - return error; - } - - /* - * Symlinks that may have been inserted into the hotfile zone during a previous OS are now stuck - * here. We do not want to move them. - */ - if (!vnode_isreg(vp)) { - hfs_unlock(VTOC(vp)); - vnode_put(vp); - return EPERM; - } - - if (!(VTOC(vp)->c_attr.ca_recflags & kHFSFastDevPinnedMask)) { - hfs_unlock(VTOC(vp)); - vnode_put(vp); - return EINVAL; - } - - error = hfs_pin_vnode(hfsmp, vp, HFS_PIN_IT, &pblocks, vfs_context_kernel()); - if (error == 0) { - *pinned = pblocks; - } - - hfs_unlock(VTOC(vp)); - vnode_put(vp); - - return error; - -} - -/* - * Pins an HFS Extent record to the underlying CoreStorage. Assumes that Catalog & Extents overflow - * B-trees are held locked, as needed. - * - * Returns the number of blocks pinned in the output argument 'pinned' - * - * Returns error status (0 || errno) in return value. 
- */ -static int hfs_pin_extent_record (struct hfsmount *hfsmp, HFSPlusExtentRecord extents, uint32_t *pinned) { - uint32_t pb = 0; - int i; - int error; - - if (pinned == NULL) { - return EINVAL; - } - *pinned = 0; - - - - /* iterate through the extents */ - for ( i = 0; i < kHFSPlusExtentDensity; i++) { - if (extents[i].startBlock == 0) { - break; - } - - error = hfs_pin_block_range (hfsmp, HFS_PIN_IT, extents[i].startBlock, - extents[i].blockCount, vfs_context_kernel()); - - if (error) { - break; - } - pb += extents[i].blockCount; - } - - *pinned = pb; - - return error; -} - -/* - * Consume an HFS Plus on-disk catalog record and pin its blocks - * to the underlying CS devnode. - * - * NOTE: This is an important distinction! - * This function takes in an HFSPlusCatalogFile* which is the actual - * 200-some-odd-byte on-disk representation in the Catalog B-Tree (not - * one of the run-time structs that we normally use. - * - * This assumes that the catalog and extents-overflow btrees - * are locked, at least in shared mode - */ -static int hfs_pin_catalog_rec (struct hfsmount *hfsmp, HFSPlusCatalogFile *cfp, int rsrc) { - uint32_t pinned_blocks = 0; - HFSPlusForkData *forkdata; - int error = 0; - uint8_t forktype = 0; - - if (rsrc) { - forkdata = &cfp->resourceFork; - forktype = 0xff; - } - else { - forkdata = &cfp->dataFork; - } - - uint32_t pblocks = 0; - - /* iterate through the inline extents */ - error = hfs_pin_extent_record (hfsmp, forkdata->extents, &pblocks); - if (error) { - return error; - } - - pinned_blocks += pblocks; - pblocks = 0; - - /* it may have overflow extents */ - if (forkdata->extents[kHFSPlusExtentDensity-1].startBlock != 0) { - error = hfs_pin_overflow_extents (hfsmp, cfp->fileID, forktype, &pblocks); - } - pinned_blocks += pblocks; - - hfsmp->hfs_hotfile_freeblks -= pinned_blocks; - - return error; -} - - -/* - * - */ -int -hfs_recording_init(struct hfsmount *hfsmp) -{ - CatalogKey * keyp; - CatalogRecord * datap; - u_int32_t dataSize; - 
HFSPlusCatalogFile *filep; - BTScanState scanstate; - BTreeIterator * iterator = NULL; - FSBufferDescriptor record; - HotFileKey * key; - filefork_t * filefork; - u_int32_t data; - struct cat_attr cattr; - u_int32_t cnid; - int error = 0; - long starting_temp; - - int started_tr = 0; - int started_scan = 0; - - int inserted = 0; /* debug variables */ - int filecount = 0; - int uncacheable = 0; - - /* - * For now, only the boot volume is supported. - */ - if ((vfs_flags(HFSTOVFS(hfsmp)) & MNT_ROOTFS) == 0) { - hfsmp->hfc_stage = HFC_DISABLED; - return (EPERM); - } - - /* We grab the HFC mutex even though we're not fully mounted yet, just for orderliness */ - lck_mtx_lock (&hfsmp->hfc_mutex); - - /* - * Tracking of hot files requires up-to-date access times. - * So if access time updates are disabled, then we disable - * hot files, too. - */ - if (vfs_flags(HFSTOVFS(hfsmp)) & MNT_NOATIME) { - hfsmp->hfc_stage = HFC_DISABLED; - lck_mtx_unlock (&hfsmp->hfc_mutex); - return EPERM; - } - - // - // Check if we've been asked to suspend operation - // - cnid = GetFileInfo(HFSTOVCB(hfsmp), kRootDirID, ".hotfile-suspend", &cattr, NULL); - if (cnid != 0) { - printf("hfs: %s: %s: hotfiles explicitly disabled! remove /.hotfiles-suspend to re-enable\n", hfsmp->vcbVN, __FUNCTION__); - hfsmp->hfc_stage = HFC_DISABLED; - lck_mtx_unlock (&hfsmp->hfc_mutex); - return EPERM; - } - - // - // Check if we've been asked to reset our state. - // - cnid = GetFileInfo(HFSTOVCB(hfsmp), kRootDirID, ".hotfile-reset", &cattr, NULL); - if (cnid != 0) { - hfs_hotfile_reset(hfsmp); - } - - if (hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) { - // - // Cooperative Fusion (CF) systems use different constants - // than traditional hotfile systems. 
These were picked after a bit of - // experimentation - we can cache many more files on the - // ssd in an CF system and we can do so more rapidly - // so bump the limits considerably (and turn down the - // duration so that it doesn't take weeks to adopt all - // the files). - // - hfc_default_file_count = 20000; - hfc_default_duration = 300; // 5min - hfc_max_file_count = 50000; - hfc_max_file_size = (512ULL * 1024ULL * 1024ULL); - } - - /* - * If the Hot File btree exists then metadata zone is ready. - */ - cnid = GetFileInfo(HFSTOVCB(hfsmp), kRootDirID, HFC_FILENAME, &cattr, NULL); - if (cnid != 0 && S_ISREG(cattr.ca_mode)) { - int recreate = 0; - - if (hfsmp->hfc_stage == HFC_DISABLED) - hfsmp->hfc_stage = HFC_IDLE; - hfsmp->hfs_hotfile_freeblks = 0; - - if ((hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) && cattr.ca_blocks > 0) { - // - // make sure the hotfile btree is pinned - // - error = hfc_btree_open(hfsmp, &hfsmp->hfc_filevp); - if (!error) { - /* XXX: must fix hfs_pin_vnode too */ - hfs_pin_vnode(hfsmp, hfsmp->hfc_filevp, HFS_PIN_IT, NULL, vfs_context_kernel()); - - } else { - printf("hfs: failed to open the btree err=%d. Recreating hotfile btree.\n", error); - recreate = 1; - } - - hfs_hotfile_repin_files(hfsmp); - - if (hfsmp->hfc_filevp) { - (void) hfc_btree_close(hfsmp, hfsmp->hfc_filevp); - hfsmp->hfc_filevp = NULL; - } - - } else if (hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) { - // hmmm, the hotfile btree is zero bytes long? how odd. let's recreate it. - printf("hfs: hotfile btree is zero bytes long?! recreating it.\n"); - recreate = 1; - } - - if (!recreate) { - /* don't forget to unlock the mutex */ - lck_mtx_unlock (&hfsmp->hfc_mutex); - return (0); - } else { - // - // open the hotfile btree file ignoring errors because - // we need the vnode pointer for hfc_btree_delete() to - // be able to do its work - // - error = hfc_btree_open_ext(hfsmp, &hfsmp->hfc_filevp, 1); - if (!error) { - // and delete it! 
- error = hfc_btree_delete(hfsmp); - (void) hfc_btree_close(hfsmp, hfsmp->hfc_filevp); - hfsmp->hfc_filevp = NULL; - } - } - } - - printf("hfs: %s: %s: creating the hotfile btree\n", hfsmp->vcbVN, __FUNCTION__); - if (hfs_start_transaction(hfsmp) != 0) { - lck_mtx_unlock (&hfsmp->hfc_mutex); - return EINVAL; - } - - /* B-tree creation must be journaled */ - started_tr = 1; - - error = hfc_btree_create(hfsmp, HFSTOVCB(hfsmp)->blockSize, HFC_DEFAULT_FILE_COUNT); - if (error) { -#if HFC_VERBOSE - printf("hfs: Error %d creating hot file b-tree on %s \n", error, hfsmp->vcbVN); -#endif - goto recording_init_out; - } - - hfs_end_transaction (hfsmp); - started_tr = 0; - /* - * Do a journal flush + flush track cache. We have to ensure that the async I/Os have been issued to the media - * before proceeding. - */ - hfs_flush (hfsmp, HFS_FLUSH_FULL); - - /* now re-start a new transaction */ - if (hfs_start_transaction (hfsmp) != 0) { - lck_mtx_unlock (&hfsmp->hfc_mutex); - return EINVAL; - } - started_tr = 1; - - /* - * Open the Hot File B-tree file for writing. - */ - if (hfsmp->hfc_filevp) - panic("hfs_recording_init: hfc_filevp exists (vp = %p)", hfsmp->hfc_filevp); - - error = hfc_btree_open(hfsmp, &hfsmp->hfc_filevp); - if (error) { -#if HFC_VERBOSE - printf("hfs: Error %d opening hot file b-tree on %s \n", error, hfsmp->vcbVN); -#endif - goto recording_init_out; - } - - /* - * This function performs work similar to namei; we must NOT hold the catalog lock while - * calling it. This will decorate catalog records as being pinning candidates. (no hotfiles work) - */ - hfs_setup_default_cf_hotfiles(hfsmp); - - /* - * now grab the hotfiles b-tree vnode/cnode lock first, as it is not classified as a systemfile. 
- */ - if (hfs_lock(VTOC(hfsmp->hfc_filevp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT) != 0) { - error = EPERM; - (void) hfc_btree_close(hfsmp, hfsmp->hfc_filevp); - /* zero it out to avoid pinning later on */ - hfsmp->hfc_filevp = NULL; - goto recording_init_out; - } - - MALLOC(iterator, BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK); - if (iterator == NULL) { - error = ENOMEM; - hfs_unlock (VTOC(hfsmp->hfc_filevp)); - (void) hfc_btree_close(hfsmp, hfsmp->hfc_filevp); - /* zero it out to avoid pinning */ - hfsmp->hfc_filevp = NULL; - goto recording_init_out; - } - - bzero(iterator, sizeof(*iterator)); - key = (HotFileKey*) &iterator->key; - key->keyLength = HFC_KEYLENGTH; - - record.bufferAddress = &data; - record.itemSize = sizeof(u_int32_t); - record.itemCount = 1; - -#if HFC_VERBOSE - printf("hfs: Evaluating space for \"%s\" metadata zone... (freeblks %d)\n", HFSTOVCB(hfsmp)->vcbVN, - hfsmp->hfs_hotfile_freeblks); -#endif - - /* - * Get ready to scan the Catalog file. We explicitly do NOT grab the catalog lock because - * we're fully single-threaded at the moment (by virtue of being called during mount()), - * and if we have to grow the hotfile btree, then we would need to grab the catalog lock - * and if we take a shared lock here, it would deadlock (see ) - * - * We already started a transaction so we should already be holding the journal lock at this point. - * Note that we have to hold the journal lock / start a txn BEFORE the systemfile locks. 
- */ - - error = BTScanInitialize(VTOF(HFSTOVCB(hfsmp)->catalogRefNum), 0, 0, 0, - kCatSearchBufferSize, &scanstate); - if (error) { - printf("hfs_recording_init: err %d BTScanInit\n", error); - - /* drop the systemfile locks */ - hfs_unlock(VTOC(hfsmp->hfc_filevp)); - - (void) hfc_btree_close(hfsmp, hfsmp->hfc_filevp); - - /* zero it out to avoid pinning */ - hfsmp->hfc_filevp = NULL; - goto recording_init_out; - } - - started_scan = 1; - - filefork = VTOF(hfsmp->hfc_filevp); - - starting_temp = random() % HF_TEMP_RANGE; - - /* - * Visit all the catalog btree leaf records. We have to hold the catalog lock to do this. - * - * NOTE: The B-Tree scanner reads from the media itself. Under normal circumstances it would be - * fine to simply use b-tree routines to read blocks that correspond to b-tree nodes, because the - * block cache is going to ensure you always get the cached copy of a block (even if a journal - * txn has modified one of those blocks). That is NOT true when - * using the scanner. In particular, it will always read whatever is on-disk. So we have to ensure - * that the journal has flushed and that the async I/Os to the metadata files have been issued. - */ - for (;;) { - error = BTScanNextRecord(&scanstate, 0, (void **)&keyp, (void **)&datap, &dataSize); - if (error) { - if (error == btNotFound) - error = 0; - else - printf("hfs_recording_init: err %d BTScanNext\n", error); - break; - } - if ((datap->recordType != kHFSPlusFileRecord) || - (dataSize != sizeof(HFSPlusCatalogFile))) { - continue; - } - filep = (HFSPlusCatalogFile *)datap; - filecount++; - - if (hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) { - if (filep->flags & kHFSDoNotFastDevPinMask) { - uncacheable++; - } - - // - // If the file does not have the FastDevPinnedMask set, we - // can ignore it and just go to the next record. 
- // - if ((filep->flags & kHFSFastDevPinnedMask) == 0) { - continue; - } - } else if (filep->dataFork.totalBlocks == 0) { - continue; - } - - /* - * On a regular hdd, any file that has blocks inside - * the hot file space is recorded for later eviction. - * - * For now, resource forks are ignored. - * - * We don't do this on CF systems as there is no real - * hotfile area - we just pin/unpin blocks belonging to - * interesting files. - */ - if (!(hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) && !hotextents(hfsmp, &filep->dataFork.extents[0])) { - continue; - } - cnid = filep->fileID; - - /* Skip over journal files and the hotfiles B-Tree file. */ - if (cnid == hfsmp->hfs_jnlfileid - || cnid == hfsmp->hfs_jnlinfoblkid - || cnid == VTOC(hfsmp->hfc_filevp)->c_fileid) { - continue; - } - /* - * XXX - need to skip quota files as well. - */ - - uint32_t temp; - - if (hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) { - int rsrc = 0; - - temp = (uint32_t)starting_temp++; - if (filep->flags & kHFSAutoCandidateMask) { - temp += MAX_NORMAL_TEMP; - } - - /* use the data fork by default */ - if (filep->dataFork.totalBlocks == 0) { - /* - * but if empty, switch to rsrc as its likely - * a compressed file - */ - rsrc = 1; - } - - error = hfs_pin_catalog_rec (hfsmp, filep, rsrc); - if (error) - break; - - } else { - temp = HFC_MINIMUM_TEMPERATURE; - } - - /* Insert a hot file entry. */ - key->keyLength = HFC_KEYLENGTH; - key->temperature = temp; - key->fileID = cnid; - key->forkType = 0; - data = 0x3f3f3f3f; - error = BTInsertRecord(filefork, iterator, &record, record.itemSize); - if (error) { - printf("hfs_recording_init: BTInsertRecord failed %d (fileid %d)\n", error, key->fileID); - error = MacToVFSError(error); - break; - } - - /* Insert the corresponding thread record. 
*/ - key->keyLength = HFC_KEYLENGTH; - key->temperature = HFC_LOOKUPTAG; - key->fileID = cnid; - key->forkType = 0; - data = temp; - error = BTInsertRecord(filefork, iterator, &record, record.itemSize); - if (error) { - printf("hfs_recording_init: BTInsertRecord failed %d (fileid %d)\n", error, key->fileID); - error = MacToVFSError(error); - break; - } - inserted++; - } // end catalog iteration loop - - save_btree_user_info(hfsmp); - (void) BTFlushPath(filefork); - -recording_init_out: - - /* Unlock first, then pin after releasing everything else */ - if (hfsmp->hfc_filevp) { - hfs_unlock (VTOC(hfsmp->hfc_filevp)); - } - - if (started_scan) { - (void) BTScanTerminate (&scanstate, &data, &data, &data); - } - - if (started_tr) { - hfs_end_transaction(hfsmp); - } - -#if HFC_VERBOSE - printf("hfs: %d files identified out of %d (freeblocks is now: %d)\n", inserted, filecount, hfsmp->hfs_hotfile_freeblks); - if (uncacheable) { - printf("hfs: %d files were marked as uncacheable\n", uncacheable); - } -#endif - - if (iterator) - FREE(iterator, M_TEMP); - - if (hfsmp->hfc_filevp) { - if (hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) { - hfs_pin_vnode(hfsmp, hfsmp->hfc_filevp, HFS_PIN_IT, NULL, vfs_context_kernel()); - } - (void) hfc_btree_close(hfsmp, hfsmp->hfc_filevp); - hfsmp->hfc_filevp = NULL; - } - - if (error == 0) - hfsmp->hfc_stage = HFC_IDLE; - - /* Finally, unlock the HFC mutex */ - lck_mtx_unlock (&hfsmp->hfc_mutex); - - return (error); -} - -/* - * Use sync to perform ocassional background work. 
- */ -int -hfs_hotfilesync(struct hfsmount *hfsmp, vfs_context_t ctx) -{ - if (hfsmp->hfc_stage) { - struct timeval tv; - - lck_mtx_lock(&hfsmp->hfc_mutex); - - switch (hfsmp->hfc_stage) { - case HFC_IDLE: - (void) hfs_recording_start(hfsmp); - break; - - case HFC_RECORDING: - microtime(&tv); - if (tv.tv_sec > hfsmp->hfc_timeout) - (void) hfs_recording_stop(hfsmp); - break; - - case HFC_EVICTION: - (void) hotfiles_evict(hfsmp, ctx); - break; - - case HFC_ADOPTION: - (void) hotfiles_adopt(hfsmp, ctx); - break; - default: - break; - } - - lck_mtx_unlock(&hfsmp->hfc_mutex); - } - return (0); -} - -/* - * Add a hot file to the recording list. - * - * This can happen when a hot file gets reclaimed or at the - * end of the recording period for any active hot file. - * - * NOTE: Since both the data and resource fork can be hot, - * there can be two entries for the same file id. - * - * Note: the cnode is locked on entry. - */ -int -hfs_addhotfile(struct vnode *vp) -{ - hfsmount_t *hfsmp; - int error; - - hfsmp = VTOHFS(vp); - if (hfsmp->hfc_stage != HFC_RECORDING) - return (0); - - lck_mtx_lock(&hfsmp->hfc_mutex); - error = hfs_addhotfile_internal(vp); - lck_mtx_unlock(&hfsmp->hfc_mutex); - return (error); -} - -static int -hf_ignore_process(const char *pname, size_t maxlen) -{ - if ( strncmp(pname, "mds", maxlen) == 0 - || strncmp(pname, "mdworker", maxlen) == 0 - || strncmp(pname, "mds_stores", maxlen) == 0 - || strncmp(pname, "makewhatis", maxlen) == 0) { - return 1; - } - - return 0; - -} - -static int -hfs_addhotfile_internal(struct vnode *vp) -{ - hotfile_data_t *hotdata; - hotfile_entry_t *entry; - hfsmount_t *hfsmp; - cnode_t *cp; - filefork_t *ffp; - u_int32_t temperature; - - hfsmp = VTOHFS(vp); - if (hfsmp->hfc_stage != HFC_RECORDING) - return (0); - - /* - * Only regular files are eligible for hotfiles addition. 
- * - * Symlinks were previously added to the list and may exist in - * extant hotfiles regions, but no new ones will be added, and no - * symlinks will now be relocated/evicted from the hotfiles region. - */ - if (!vnode_isreg(vp) || vnode_issystem(vp)) { - return (0); - } - - /* Skip resource forks for now. */ - if (VNODE_IS_RSRC(vp)) { - return (0); - } - if ((hotdata = (hotfile_data_t *)hfsmp->hfc_recdata) == NULL) { - return (0); - } - ffp = VTOF(vp); - cp = VTOC(vp); - - if (cp->c_attr.ca_recflags & (kHFSFastDevPinnedMask|kHFSDoNotFastDevPinMask)) { - // it's already a hotfile or can't be a hotfile... - return 0; - } - - if (vnode_isdir(vp) || vnode_issystem(vp) || (cp->c_flag & (C_DELETED | C_NOEXISTS))) { - return 0; - } - - if ((hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) && vnode_isfastdevicecandidate(vp)) { - // - // On cooperative fusion (CF) systems we have different criteria for whether something - // can be pinned to the ssd. - // - if (cp->c_flag & (C_DELETED|C_NOEXISTS)) { - // - // dead files are definitely not worth caching - // - return 0; - } else if (ffp->ff_blocks == 0 && !(cp->c_bsdflags & UF_COMPRESSED) && !(cp->c_attr.ca_recflags & kHFSFastDevCandidateMask)) { - // - // empty files aren't worth caching but compressed ones might be, as are - // newly created files that live in WorthCaching directories... 
- // - return 0; - } - - char pname[256]; - pname[0] = '\0'; - proc_selfname(pname, sizeof(pname)); - if (hf_ignore_process(pname, sizeof(pname))) { - // ignore i/o's from certain system daemons - return 0; - } - - temperature = cp->c_fileid; // in memory we just keep it sorted by file-id - } else { - // the normal hard drive based hotfile checks - if ((ffp->ff_bytesread == 0) || - (ffp->ff_blocks == 0) || - (ffp->ff_size == 0) || - (ffp->ff_blocks > hotdata->maxblocks) || - (cp->c_bsdflags & (UF_NODUMP | UF_COMPRESSED)) || - (cp->c_atime < hfsmp->hfc_timebase)) { - return (0); - } - - temperature = ffp->ff_bytesread / ffp->ff_size; - if (temperature < hotdata->threshold) { - return (0); - } - } - - /* - * If there is room or this file is hotter than - * the coldest one then add it to the list. - * - */ - if ((hotdata->activefiles < hfsmp->hfc_maxfiles) || - (hotdata->coldest == NULL) || - (temperature >= hotdata->coldest->temperature)) { - ++hotdata->refcount; - entry = hf_getnewentry(hotdata); - entry->temperature = temperature; - entry->fileid = cp->c_fileid; - // - // if ffp->ff_blocks is zero, it might be compressed so make sure we record - // that there's at least one block. - // - entry->blocks = ffp->ff_blocks ? ffp->ff_blocks : 1; - if (hf_insert(hotdata, entry) == EEXIST) { - // entry is already present, don't need to add it again - entry->right = hotdata->freelist; - hotdata->freelist = entry; - } - --hotdata->refcount; - } - - return (0); -} - -/* - * Remove a hot file from the recording list. - * - * This can happen when a hot file becomes - * an active vnode (active hot files are - * not kept in the recording list until the - * end of the recording period). - * - * Note: the cnode is locked on entry. 
- */ -int -hfs_removehotfile(struct vnode *vp) -{ - hotfile_data_t *hotdata; - hfsmount_t *hfsmp; - cnode_t *cp; - filefork_t *ffp; - u_int32_t temperature; - - hfsmp = VTOHFS(vp); - if (hfsmp->hfc_stage != HFC_RECORDING) - return (0); - - if ((!vnode_isreg(vp)) || vnode_issystem(vp)) { - return (0); - } - - ffp = VTOF(vp); - cp = VTOC(vp); - - if ((ffp->ff_bytesread == 0) || (ffp->ff_blocks == 0) || - (ffp->ff_size == 0) || (cp->c_atime < hfsmp->hfc_timebase)) { - return (0); - } - - lck_mtx_lock(&hfsmp->hfc_mutex); - if (hfsmp->hfc_stage != HFC_RECORDING) - goto out; - if ((hotdata = (hotfile_data_t *)hfsmp->hfc_recdata) == NULL) - goto out; - - temperature = ffp->ff_bytesread / ffp->ff_size; - if (temperature < hotdata->threshold) - goto out; - - if (hotdata->coldest && (temperature >= hotdata->coldest->temperature)) { - ++hotdata->refcount; - hf_delete(hotdata, VTOC(vp)->c_fileid, temperature); - --hotdata->refcount; - } -out: - lck_mtx_unlock(&hfsmp->hfc_mutex); - return (0); -} - -int -hfs_hotfile_deleted(__unused struct vnode *vp) -{ -#if 1 - return 0; -#else - // - // XXXdbg - this code, while it would work, would introduce a huge inefficiency - // to deleting files as the way it's written would require us to open - // the hotfile btree on every open, delete two records in it and then - // close the hotfile btree (which involves more writes). - // - // We actually can be lazy about deleting hotfile records for files - // that get deleted. When it's time to evict things, if we encounter - // a record that references a dead file (i.e. a fileid which no - // longer exists), the eviction code will remove the records. Likewise - // the code that scans the HotFile B-Tree at boot time to re-pin files - // will remove dead records. 
- // - - hotfile_data_t *hotdata; - hfsmount_t *hfsmp; - cnode_t *cp; - filefork_t *filefork; - u_int32_t temperature; - BTreeIterator * iterator = NULL; - FSBufferDescriptor record; - HotFileKey *key; - u_int32_t data; - int error=0; - - cp = VTOC(vp); - if (cp == NULL || !(cp->c_attr.ca_recflags & kHFSFastDevPinnedMask)) { - return 0; - } - - hfsmp = VTOHFS(vp); - if (!(hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN)) { - return 0; - } - - if (hfc_btree_open(hfsmp, &hfsmp->hfc_filevp) != 0 || hfsmp->hfc_filevp == NULL) { - // either there is no hotfile info or it's damaged - return EINVAL; - } - - filefork = VTOF(hfsmp->hfc_filevp); - if (filefork == NULL) { - return 0; - } - - MALLOC(iterator, BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK); - if (iterator == NULL) { - return ENOMEM; - } - bzero(iterator, sizeof(*iterator)); - key = (HotFileKey*) &iterator->key; - - record.bufferAddress = &data; - record.itemSize = sizeof(u_int32_t); - record.itemCount = 1; - - key->keyLength = HFC_KEYLENGTH; - key->temperature = HFC_LOOKUPTAG; - key->fileID = cp->c_fileid; - key->forkType = 0; - - lck_mtx_lock(&hfsmp->hfc_mutex); - (void) BTInvalidateHint(iterator); - if (BTSearchRecord(filefork, iterator, &record, NULL, iterator) == 0) { - temperature = key->temperature; - hfc_btree_delete_record(hfsmp, iterator, key); - } else { - //printf("hfs: hotfile_deleted: did not find fileid %d\n", cp->c_fileid); - error = ENOENT; - } - - if ((hotdata = (hotfile_data_t *)hfsmp->hfc_recdata) != NULL) { - // just in case, also make sure it's removed from the in-memory list as well - ++hotdata->refcount; - hf_delete(hotdata, cp->c_fileid, cp->c_fileid); - --hotdata->refcount; - } - - lck_mtx_unlock(&hfsmp->hfc_mutex); - FREE(iterator, M_TEMP); - - hfc_btree_close(hfsmp, hfsmp->hfc_filevp); - - return error; -#endif -} - -int -hfs_hotfile_adjust_blocks(struct vnode *vp, int64_t num_blocks) -{ - hfsmount_t *hfsmp; - - if (vp == NULL) { - return 0; - } - - hfsmp = VTOHFS(vp); - - if 
(!(hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) || num_blocks == 0 || vp == NULL) { - return 0; - } - - // - // if file is not HotFileCached or it has the CanNotHotFile cache - // bit set then there is nothing to do - // - if (!(VTOC(vp)->c_attr.ca_recflags & kHFSFastDevPinnedMask) || (VTOC(vp)->c_attr.ca_recflags & kHFSDoNotFastDevPinMask)) { - // it's not a hot file or can't be one so don't bother tracking - return 0; - } - - OSAddAtomic(num_blocks, &hfsmp->hfs_hotfile_blk_adjust); - - return (0); -} - -// -// Assumes hfsmp->hfc_mutex is LOCKED -// -static int -hfs_hotfile_cur_freeblks(hfsmount_t *hfsmp) -{ - if (hfsmp->hfc_stage < HFC_IDLE) { - return 0; - } - - int cur_blk_adjust = hfsmp->hfs_hotfile_blk_adjust; // snap a copy of this value - - if (cur_blk_adjust) { - OSAddAtomic(-cur_blk_adjust, &hfsmp->hfs_hotfile_blk_adjust); - hfsmp->hfs_hotfile_freeblks += cur_blk_adjust; - } - - return hfsmp->hfs_hotfile_freeblks; -} - - -/* - *======================================================================== - * HOT FILE MAINTENANCE ROUTINES - *======================================================================== - */ - -static int -hotfiles_collect_callback(struct vnode *vp, __unused void *cargs) -{ - if ((vnode_isreg(vp)) && !vnode_issystem(vp)) - (void) hfs_addhotfile_internal(vp); - - return (VNODE_RETURNED); -} - -/* - * Add all active hot files to the recording list. - */ -static int -hotfiles_collect(struct hfsmount *hfsmp) -{ - struct mount *mp = HFSTOVFS(hfsmp); - - if (vfs_busy(mp, LK_NOWAIT)) - return (0); - - /* - * hotfiles_collect_callback will be called for each vnode - * hung off of this mount point - * the vnode will be - * properly referenced and unreferenced around the callback - */ - vnode_iterate(mp, 0, hotfiles_collect_callback, (void *)NULL); - - vfs_unbusy(mp); - - return (0); -} - - -/* - * Update the data of a btree record - * This is called from within BTUpdateRecord. 
- */ -static int -update_callback(const HotFileKey *key, u_int32_t *data, u_int32_t *state) -{ - if (key->temperature == HFC_LOOKUPTAG) - *data = *state; - return (0); -} - -/* - * Identify files already in hot area. - */ -static int -hotfiles_refine(struct hfsmount *hfsmp) -{ - BTreeIterator * iterator = NULL; - struct mount *mp; - filefork_t * filefork; - hotfilelist_t *listp; - FSBufferDescriptor record; - HotFileKey * key; - u_int32_t data; - int i; - int error = 0; - - if ((listp = (hotfilelist_t *)hfsmp->hfc_recdata) == NULL) - return (0); - - if (hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) { - // on ssd's we don't refine the temperature since the - // replacement algorithm is simply random - return 0; - } - - mp = HFSTOVFS(hfsmp); - - MALLOC(iterator, BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK); - if (iterator == NULL) { - error = ENOMEM; - goto out; - } - bzero(iterator, sizeof(*iterator)); - key = (HotFileKey*) &iterator->key; - - record.bufferAddress = &data; - record.itemSize = sizeof(u_int32_t); - record.itemCount = 1; - - if (hfs_start_transaction(hfsmp) != 0) { - error = EINVAL; - goto out; - } - if (hfs_lock(VTOC(hfsmp->hfc_filevp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT) != 0) { - error = EPERM; - goto out1; - } - filefork = VTOF(hfsmp->hfc_filevp); - - for (i = 0; i < listp->hfl_count; ++i) { - /* - * Check if entry (thread) is already in hot area. - */ - key->keyLength = HFC_KEYLENGTH; - key->temperature = HFC_LOOKUPTAG; - key->fileID = listp->hfl_hotfile[i].hf_fileid; - key->forkType = 0; - (void) BTInvalidateHint(iterator); - if (BTSearchRecord(filefork, iterator, &record, NULL, iterator) != 0) { - continue; /* not in hot area, so skip */ - } - - /* - * Update thread entry with latest temperature. 
- */ - error = BTUpdateRecord(filefork, iterator, - (IterateCallBackProcPtr)update_callback, - &listp->hfl_hotfile[i].hf_temperature); - if (error) { - printf("hfs: hotfiles_refine: BTUpdateRecord failed %d (file %d)\n", error, key->fileID); - error = MacToVFSError(error); - // break; - } - /* - * Re-key entry with latest temperature. - */ - key->keyLength = HFC_KEYLENGTH; - key->temperature = data; - key->fileID = listp->hfl_hotfile[i].hf_fileid; - key->forkType = 0; - /* Pick up record data. */ - (void) BTInvalidateHint(iterator); - (void) BTSearchRecord(filefork, iterator, &record, NULL, iterator); - error = BTDeleteRecord(filefork, iterator); - if (error) { - printf("hfs: hotfiles_refine: BTDeleteRecord failed %d (file %d)\n", error, key->fileID); - error = MacToVFSError(error); - break; - } - key->keyLength = HFC_KEYLENGTH; - key->temperature = listp->hfl_hotfile[i].hf_temperature; - key->fileID = listp->hfl_hotfile[i].hf_fileid; - key->forkType = 0; - error = BTInsertRecord(filefork, iterator, &record, record.itemSize); - if (error) { - printf("hfs: hotfiles_refine: BTInsertRecord failed %d (file %d)\n", error, key->fileID); - error = MacToVFSError(error); - break; - } - /* - * Invalidate this entry in the list. - */ - listp->hfl_hotfile[i].hf_temperature = 0; - listp->hfl_totalblocks -= listp->hfl_hotfile[i].hf_blocks; - - } /* end for */ - - (void) BTFlushPath(filefork); - hfs_unlock(VTOC(hfsmp->hfc_filevp)); - -out1: - hfs_end_transaction(hfsmp); -out: - if (iterator) - FREE(iterator, M_TEMP); - return (error); -} - -/* - * Move new hot files into hot area. - * - * Requires that the hfc_mutex be held. 
- */ -static int -hotfiles_adopt(struct hfsmount *hfsmp, vfs_context_t ctx) -{ - BTreeIterator * iterator = NULL; - struct vnode *vp; - filefork_t * filefork; - hotfilelist_t *listp; - FSBufferDescriptor record; - HotFileKey * key; - u_int32_t data; - enum hfc_stage stage; - int fileblocks; - int blksmoved; - int i; - int last; - int error = 0; - int startedtrans = 0; - // - // all files in a given adoption phase have a temperature - // that starts at a random value and then increases linearly. - // the idea is that during eviction, files that were adopted - // together will be evicted together - // - long starting_temp = random() % HF_TEMP_RANGE; - long temp_adjust = 0; - - if ((listp = (hotfilelist_t *)hfsmp->hfc_recdata) == NULL) - return (0); - - if (hfsmp->hfc_stage != HFC_ADOPTION) { - return (EBUSY); - } - if (hfs_lock(VTOC(hfsmp->hfc_filevp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT) != 0) { - return (EPERM); - } - - MALLOC(iterator, BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK); - if (iterator == NULL) { - hfs_unlock(VTOC(hfsmp->hfc_filevp)); - return (ENOMEM); - } - -#if HFC_VERBOSE - printf("hfs:%s: hotfiles_adopt: (hfl_next: %d, hotfile start/end block: %d - %d; max/free: %d/%d; maxfiles: %d)\n", - hfsmp->vcbVN, - listp->hfl_next, - hfsmp->hfs_hotfile_start, hfsmp->hfs_hotfile_end, - hfsmp->hfs_hotfile_maxblks, hfsmp->hfs_hotfile_freeblks, hfsmp->hfc_maxfiles); -#endif - - stage = hfsmp->hfc_stage; - hfsmp->hfc_stage = HFC_BUSY; - - blksmoved = 0; - last = listp->hfl_next + HFC_FILESPERSYNC; - if (last > listp->hfl_count) - last = listp->hfl_count; - - bzero(iterator, sizeof(*iterator)); - key = (HotFileKey*) &iterator->key; - key->keyLength = HFC_KEYLENGTH; - - record.bufferAddress = &data; - record.itemSize = sizeof(u_int32_t); - record.itemCount = 1; - - filefork = VTOF(hfsmp->hfc_filevp); - - for (i = listp->hfl_next; (i < last) && (blksmoved < HFC_BLKSPERSYNC); ++i) { - /* - * Skip entries that aren't going to work. 
- */ - if (listp->hfl_hotfile[i].hf_temperature == 0) { - //printf("hfs: zero temp on file-id %d\n", listp->hfl_hotfile[i].hf_fileid); - listp->hfl_next++; - continue; - } - if (listp->hfl_hotfile[i].hf_fileid == VTOC(hfsmp->hfc_filevp)->c_fileid) { - //printf("hfs: cannot adopt the hotfile b-tree itself! (file-id %d)\n", listp->hfl_hotfile[i].hf_fileid); - listp->hfl_next++; - continue; - } - if (listp->hfl_hotfile[i].hf_fileid < kHFSFirstUserCatalogNodeID) { - //printf("hfs: cannot adopt system files (file-id %d)\n", listp->hfl_hotfile[i].hf_fileid); - listp->hfl_next++; - continue; - } - - /* - * Acquire a vnode for this file. - */ - error = hfs_vget(hfsmp, listp->hfl_hotfile[i].hf_fileid, &vp, 0, 0); - if (error) { - //printf("failed to get fileid %d (err %d)\n", listp->hfl_hotfile[i].hf_fileid, error); - if (error == ENOENT) { - error = 0; - listp->hfl_next++; - continue; /* stale entry, go to next */ - } - break; - } - - //printf("hfs: examining hotfile entry w/fileid %d, temp %d, blocks %d (HotFileCached: %s)\n", - // listp->hfl_hotfile[i].hf_fileid, listp->hfl_hotfile[i].hf_temperature, - // listp->hfl_hotfile[i].hf_blocks, - // (VTOC(vp)->c_attr.ca_recflags & kHFSFastDevPinnedMask) ? "YES" : "NO"); - - if (!vnode_isreg(vp)) { - /* Symlinks are ineligible for adoption into the hotfile zone. 
*/ - //printf("hfs: hotfiles_adopt: huh, not a file %d (%d)\n", listp->hfl_hotfile[i].hf_fileid, VTOC(vp)->c_cnid); - hfs_unlock(VTOC(vp)); - vnode_put(vp); - listp->hfl_hotfile[i].hf_temperature = 0; - listp->hfl_next++; - continue; /* stale entry, go to next */ - } - if ( (VTOC(vp)->c_flag & (C_DELETED | C_NOEXISTS)) - || (!(hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) && hotextents(hfsmp, &VTOF(vp)->ff_extents[0])) - || (VTOC(vp)->c_attr.ca_recflags & (kHFSFastDevPinnedMask|kHFSDoNotFastDevPinMask))) { - hfs_unlock(VTOC(vp)); - vnode_put(vp); - listp->hfl_hotfile[i].hf_temperature = 0; - listp->hfl_next++; - listp->hfl_totalblocks -= listp->hfl_hotfile[i].hf_blocks; - continue; /* stale entry, go to next */ - } - - fileblocks = VTOF(vp)->ff_blocks; - - // - // for CF, if the file is empty (and not compressed) or it is too large, - // do not try to pin it. (note: if fileblocks == 0 but the file is marked - // as compressed, we may still be able to cache it). - // - if ((hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) && - ((fileblocks == 0 && !(VTOC(vp)->c_bsdflags & UF_COMPRESSED)) || - (unsigned int)fileblocks > (HFC_MAXIMUM_FILESIZE / (uint64_t)HFSTOVCB(hfsmp)->blockSize))) { - // don't try to cache something too large or that's zero-bytes - - vnode_clearfastdevicecandidate(vp); // turn off the fast-dev-candidate flag so we don't keep trying to cache it. - - hfs_unlock(VTOC(vp)); - vnode_put(vp); - listp->hfl_hotfile[i].hf_temperature = 0; - listp->hfl_next++; - listp->hfl_totalblocks -= listp->hfl_hotfile[i].hf_blocks; - continue; /* entry is too big, just carry on with the next guy */ - } - - // - // If a file is not an autocandidate (i.e. it's a user-tagged file desirous of - // being hotfile cached) but it is already bigger than 4 megs, don't bother - // hotfile caching it. 
Note that if a user tagged file starts small, gets - // adopted and then grows over time we will allow it to grow bigger than 4 megs - // which is intentional for things like the Mail or Photos database files which - // grow slowly over time and benefit from being on the FastDevice. - // - if ((hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) && - !(VTOC(vp)->c_attr.ca_recflags & kHFSAutoCandidateMask) && - (VTOC(vp)->c_attr.ca_recflags & kHFSFastDevCandidateMask) && - (unsigned int)fileblocks > ((4*1024*1024) / (uint64_t)HFSTOVCB(hfsmp)->blockSize)) { - - vnode_clearfastdevicecandidate(vp); // turn off the fast-dev-candidate flag so we don't keep trying to cache it. - - hfs_unlock(VTOC(vp)); - vnode_put(vp); - listp->hfl_hotfile[i].hf_temperature = 0; - listp->hfl_next++; - listp->hfl_totalblocks -= listp->hfl_hotfile[i].hf_blocks; - continue; /* entry is too big, just carry on with the next guy */ - } - - if (fileblocks > hfs_hotfile_cur_freeblks(hfsmp)) { - // - // No room for this file. Although eviction should have made space - // it's best that we check here as well since writes to existing - // hotfiles may have eaten up space since we performed eviction - // - hfs_unlock(VTOC(vp)); - vnode_put(vp); - listp->hfl_next++; - listp->hfl_totalblocks -= fileblocks; - continue; /* entry too big, go to next */ - } - - if ((blksmoved > 0) && - (blksmoved + fileblocks) > HFC_BLKSPERSYNC) { - // - // we've done enough work, let's be nice to the system and - // stop until the next iteration - // - hfs_unlock(VTOC(vp)); - vnode_put(vp); - break; /* adopt this entry the next time around */ - } - if (VTOC(vp)->c_desc.cd_nameptr) - data = *(const u_int32_t *)(VTOC(vp)->c_desc.cd_nameptr); - else - data = 0x3f3f3f3f; - - - if (hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) { - // - // For CF we pin the blocks belonging to the file - // to the "fast" (aka ssd) media - // - uint32_t pinned_blocks; - - if (vnode_isautocandidate(vp)) { - VTOC(vp)->c_attr.ca_recflags |= kHFSAutoCandidateMask; - } 
- if (VTOC(vp)->c_attr.ca_recflags & kHFSAutoCandidateMask) { - // - // this moves auto-cached files to the higher tier - // of "temperatures" which means they are less likely - // to get evicted (user selected hotfiles will get - // evicted first in the theory that they change more - // frequently compared to system files) - // - temp_adjust = MAX_NORMAL_TEMP; - } else { - temp_adjust = 0; - } - - hfs_unlock(VTOC(vp)); // don't need an exclusive lock for this - hfs_lock(VTOC(vp), HFS_SHARED_LOCK, HFS_LOCK_ALLOW_NOEXISTS); - - error = hfs_pin_vnode(hfsmp, vp, HFS_PIN_IT, &pinned_blocks, ctx); - - fileblocks = pinned_blocks; - - // go back to an exclusive lock since we're going to modify the cnode again - hfs_unlock(VTOC(vp)); - hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); - } else { - // - // Old style hotfiles moves the data to the center (aka "hot") - // region of the disk - // - error = hfs_relocate(vp, hfsmp->hfs_hotfile_start, kauth_cred_get(), current_proc()); - } - - if (!error) { - VTOC(vp)->c_attr.ca_recflags |= kHFSFastDevPinnedMask; - VTOC(vp)->c_flag |= C_MODIFIED; - } else if ((hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) && error == EALREADY) { - // - // If hfs_pin_vnode() returned EALREADY then this file is not - // ever able to be hotfile cached the normal way. This can - // happen with compressed files which have their data stored - // in an extended attribute. We flag them so that we won't - // bother to try and hotfile cache them again the next time - // they're read. - // - VTOC(vp)->c_attr.ca_recflags |= kHFSDoNotFastDevPinMask; - VTOC(vp)->c_flag |= C_MODIFIED; - } - - hfs_unlock(VTOC(vp)); - vnode_put(vp); - if (error) { -#if HFC_VERBOSE - if (error != EALREADY) { - printf("hfs: hotfiles_adopt: could not relocate file %d (err %d)\n", listp->hfl_hotfile[i].hf_fileid, error); - } -#endif - - if (last < listp->hfl_count) { - last++; - } - /* Move on to next item. 
*/ - listp->hfl_next++; - continue; - } - /* Keep hot file free space current. */ - hfsmp->hfs_hotfile_freeblks -= fileblocks; - listp->hfl_totalblocks -= fileblocks; - - /* Insert hot file entry */ - key->keyLength = HFC_KEYLENGTH; - - if (hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) { - // - // The "temperature" for a CF hotfile is simply a random - // number that we sequentially increment for each file in - // the set of files we're currently adopting. This has the - // nice property that all of the files we pin to the ssd - // in the current phase will sort together in the hotfile - // btree. When eviction time comes we will evict them - // together as well. This gives the eviction phase temporal - // locality - things written together get evicted together - // which is what ssd's like. - // - listp->hfl_hotfile[i].hf_temperature = (uint32_t)temp_adjust + starting_temp++; - } - - key->temperature = listp->hfl_hotfile[i].hf_temperature; - key->fileID = listp->hfl_hotfile[i].hf_fileid; - key->forkType = 0; - - /* Start a new transaction before calling BTree code. 
*/ - if (hfs_start_transaction(hfsmp) != 0) { - error = EINVAL; - break; - } - startedtrans = 1; - - error = BTInsertRecord(filefork, iterator, &record, record.itemSize); - if (error) { - int orig_error = error; - error = MacToVFSError(error); - printf("hfs: hotfiles_adopt:1: BTInsertRecord failed %d/%d (fileid %d)\n", error, orig_error, key->fileID); - stage = HFC_IDLE; - break; - } - - /* Insert thread record */ - key->keyLength = HFC_KEYLENGTH; - key->temperature = HFC_LOOKUPTAG; - key->fileID = listp->hfl_hotfile[i].hf_fileid; - key->forkType = 0; - data = listp->hfl_hotfile[i].hf_temperature; - error = BTInsertRecord(filefork, iterator, &record, record.itemSize); - if (error) { - int orig_error = error; - error = MacToVFSError(error); - printf("hfs: hotfiles_adopt:2: BTInsertRecord failed %d/%d (fileid %d)\n", error, orig_error, key->fileID); - stage = HFC_IDLE; - break; - } else { - (void) BTFlushPath(filefork); - blksmoved += fileblocks; - } - - listp->hfl_next++; - if (listp->hfl_next >= listp->hfl_count) { - break; - } - - /* Transaction complete. */ - if (startedtrans) { - hfs_end_transaction(hfsmp); - startedtrans = 0; - } - - if (hfs_hotfile_cur_freeblks(hfsmp) <= 0) { -#if HFC_VERBOSE - printf("hfs: hotfiles_adopt: free space exhausted (%d)\n", hfsmp->hfs_hotfile_freeblks); -#endif - break; - } - } /* end for */ - -#if HFC_VERBOSE - printf("hfs: hotfiles_adopt: [%d] adopted %d blocks (%d files left)\n", listp->hfl_next, blksmoved, listp->hfl_count - i); -#endif - if (!startedtrans) { - // start a txn so we'll save the btree summary info - if (hfs_start_transaction(hfsmp) == 0) { - startedtrans = 1; - } - } - - /* Finish any outstanding transactions. 
*/ - if (startedtrans) { - save_btree_user_info(hfsmp); - - (void) BTFlushPath(filefork); - hfs_end_transaction(hfsmp); - startedtrans = 0; - } - hfs_unlock(VTOC(hfsmp->hfc_filevp)); - - if ((listp->hfl_next >= listp->hfl_count) || (hfsmp->hfs_hotfile_freeblks <= 0)) { -#if HFC_VERBOSE - printf("hfs: hotfiles_adopt: all done relocating %d files\n", listp->hfl_count); - printf("hfs: hotfiles_adopt: %d blocks free in hot file band\n", hfsmp->hfs_hotfile_freeblks); -#endif - stage = HFC_IDLE; - } - FREE(iterator, M_TEMP); - - if (stage != HFC_ADOPTION && hfsmp->hfc_filevp) { - (void) hfc_btree_close(hfsmp, hfsmp->hfc_filevp); - hfsmp->hfc_filevp = NULL; - } - hfsmp->hfc_stage = stage; - wakeup((caddr_t)&hfsmp->hfc_stage); - return (error); -} - -/* - * Reclaim space by evicting the coldest files. - * - * Requires that the hfc_mutex be held. - */ -static int -hotfiles_evict(struct hfsmount *hfsmp, vfs_context_t ctx) -{ - BTreeIterator * iterator = NULL; - struct vnode *vp; - HotFileKey * key; - filefork_t * filefork; - hotfilelist_t *listp; - enum hfc_stage stage; - u_int32_t savedtemp; - int blksmoved; - int filesmoved; - int fileblocks; - int error = 0; - int startedtrans = 0; - int bt_op; - - if (hfsmp->hfc_stage != HFC_EVICTION) { - return (EBUSY); - } - - if ((listp = (hotfilelist_t *)hfsmp->hfc_recdata) == NULL) - return (0); - - if (hfs_lock(VTOC(hfsmp->hfc_filevp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT) != 0) { - return (EPERM); - } - -#if HFC_VERBOSE - printf("hfs:%s: hotfiles_evict (hotfile start/end block: %d - %d; max/free: %d/%d; maxfiles: %d)\n", - hfsmp->vcbVN, - hfsmp->hfs_hotfile_start, hfsmp->hfs_hotfile_end, - hfsmp->hfs_hotfile_maxblks, hfsmp->hfs_hotfile_freeblks, hfsmp->hfc_maxfiles); -#endif - - MALLOC(iterator, BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK); - if (iterator == NULL) { - hfs_unlock(VTOC(hfsmp->hfc_filevp)); - return (ENOMEM); - } - - stage = hfsmp->hfc_stage; - hfsmp->hfc_stage = HFC_BUSY; - - filesmoved = blksmoved = 0; - 
bt_op = kBTreeFirstRecord; - - bzero(iterator, sizeof(*iterator)); - key = (HotFileKey*) &iterator->key; - - filefork = VTOF(hfsmp->hfc_filevp); - -#if HFC_VERBOSE - printf("hfs: hotfiles_evict: reclaim blks %d\n", listp->hfl_reclaimblks); -#endif - - while (listp->hfl_reclaimblks > 0 && - blksmoved < HFC_BLKSPERSYNC && - filesmoved < HFC_FILESPERSYNC) { - - /* - * Obtain the first record (ie the coldest one). - */ - if (BTIterateRecord(filefork, bt_op, iterator, NULL, NULL) != 0) { -#if HFC_VERBOSE - printf("hfs: hotfiles_evict: no more records\n"); -#endif - error = 0; - stage = HFC_ADOPTION; - break; - } - if (key->keyLength != HFC_KEYLENGTH) { - printf("hfs: hotfiles_evict: invalid key length %d\n", key->keyLength); - error = EFTYPE; - break; - } - if (key->temperature == HFC_LOOKUPTAG) { -#if HFC_VERBOSE - printf("hfs: hotfiles_evict: ran into thread records\n"); -#endif - error = 0; - stage = HFC_ADOPTION; - break; - } - - // Jump straight to delete for some files... - if (key->fileID == VTOC(hfsmp->hfc_filevp)->c_fileid - || key->fileID == hfsmp->hfs_jnlfileid - || key->fileID == hfsmp->hfs_jnlinfoblkid - || key->fileID < kHFSFirstUserCatalogNodeID) { - goto delete; - } - - /* - * Aquire the vnode for this file. - */ - error = hfs_vget(hfsmp, key->fileID, &vp, 0, 0); - if (error) { - if (error == ENOENT) { - goto delete; /* stale entry, go to next */ - } else { - printf("hfs: hotfiles_evict: err %d getting file %d\n", - error, key->fileID); - } - break; - } - - /* - * Symlinks that may have been inserted into the hotfile zone during a previous OS are now stuck - * here. We do not want to move them. 
- */ - if (!vnode_isreg(vp)) { - //printf("hfs: hotfiles_evict: huh, not a file %d\n", key->fileID); - hfs_unlock(VTOC(vp)); - vnode_put(vp); - goto delete; /* invalid entry, go to next */ - } - - fileblocks = VTOF(vp)->ff_blocks; - if ((blksmoved > 0) && - (blksmoved + fileblocks) > HFC_BLKSPERSYNC) { - hfs_unlock(VTOC(vp)); - vnode_put(vp); - break; - } - /* - * Make sure file is in the hot area. - */ - if (!hotextents(hfsmp, &VTOF(vp)->ff_extents[0]) && !(VTOC(vp)->c_attr.ca_recflags & kHFSFastDevPinnedMask)) { -#if HFC_VERBOSE - printf("hfs: hotfiles_evict: file %d isn't hot!\n", key->fileID); -#endif - hfs_unlock(VTOC(vp)); - vnode_put(vp); - goto delete; /* stale entry, go to next */ - } - - /* - * Relocate file out of hot area. On cooperative fusion (CF) that just - * means un-pinning the data from the ssd. For traditional hotfiles that means moving - * the file data out of the hot region of the disk. - */ - if (hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) { - uint32_t pinned_blocks; - - hfs_unlock(VTOC(vp)); // don't need an exclusive lock for this - hfs_lock(VTOC(vp), HFS_SHARED_LOCK, HFS_LOCK_ALLOW_NOEXISTS); - - error = hfs_pin_vnode(hfsmp, vp, HFS_UNPIN_IT, &pinned_blocks, ctx); - fileblocks = pinned_blocks; - - if (!error) { - // go back to an exclusive lock since we're going to modify the cnode again - hfs_unlock(VTOC(vp)); - hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); - } - } else { - error = hfs_relocate(vp, HFSTOVCB(hfsmp)->nextAllocation, vfs_context_ucred(ctx), vfs_context_proc(ctx)); - } - if (error) { -#if HFC_VERBOSE - printf("hfs: hotfiles_evict: err %d relocating file %d\n", error, key->fileID); -#endif - hfs_unlock(VTOC(vp)); - vnode_put(vp); - bt_op = kBTreeNextRecord; - goto next; /* go to next */ - } else { - VTOC(vp)->c_attr.ca_recflags &= ~kHFSFastDevPinnedMask; - VTOC(vp)->c_flag |= C_MODIFIED; - } - - // - // We do not believe that this call to hfs_fsync() is - // necessary and it causes a journal transaction - // 
deadlock so we are removing it. - // - // (void) hfs_fsync(vp, MNT_WAIT, 0, p); - - hfs_unlock(VTOC(vp)); - vnode_put(vp); - - hfsmp->hfs_hotfile_freeblks += fileblocks; - listp->hfl_reclaimblks -= fileblocks; - if (listp->hfl_reclaimblks < 0) - listp->hfl_reclaimblks = 0; - blksmoved += fileblocks; - filesmoved++; -delete: - /* Start a new transaction before calling BTree code. */ - if (hfs_start_transaction(hfsmp) != 0) { - error = EINVAL; - break; - } - startedtrans = 1; - - error = BTDeleteRecord(filefork, iterator); - if (error) { - error = MacToVFSError(error); - break; - } - savedtemp = key->temperature; - key->temperature = HFC_LOOKUPTAG; - error = BTDeleteRecord(filefork, iterator); - if (error) { - error = MacToVFSError(error); - break; - } - key->temperature = savedtemp; -next: - (void) BTFlushPath(filefork); - - /* Transaction complete. */ - if (startedtrans) { - hfs_end_transaction(hfsmp); - startedtrans = 0; - } - - } /* end while */ - -#if HFC_VERBOSE - printf("hfs: hotfiles_evict: moved %d files (%d blks, %d to go)\n", filesmoved, blksmoved, listp->hfl_reclaimblks); -#endif - /* Finish any outstanding transactions. */ - if (startedtrans) { - save_btree_user_info(hfsmp); - - (void) BTFlushPath(filefork); - hfs_end_transaction(hfsmp); - startedtrans = 0; - } - hfs_unlock(VTOC(hfsmp->hfc_filevp)); - - /* - * Move to next stage when finished. - */ - if (listp->hfl_reclaimblks <= 0) { - stage = HFC_ADOPTION; -#if HFC_VERBOSE - printf("hfs: hotfiles_evict: %d blocks free in hot file band\n", hfsmp->hfs_hotfile_freeblks); -#endif - } - FREE(iterator, M_TEMP); - hfsmp->hfc_stage = stage; - wakeup((caddr_t)&hfsmp->hfc_stage); - return (error); -} - -/* - * Age the existing records in the hot files b-tree. 
- */ -static int -hotfiles_age(struct hfsmount *hfsmp) -{ - BTreeInfoRec btinfo; - BTreeIterator * iterator = NULL; - BTreeIterator * prev_iterator; - FSBufferDescriptor record; - FSBufferDescriptor prev_record; - HotFileKey * key; - HotFileKey * prev_key; - filefork_t * filefork; - u_int32_t data; - u_int32_t prev_data; - u_int32_t newtemp; - int error; - int i; - int numrecs; - int aged = 0; - u_int16_t reclen; - - - if (hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) { - // - // hotfiles don't age on CF - // - return 0; - } - - MALLOC(iterator, BTreeIterator *, 2 * sizeof(*iterator), M_TEMP, M_WAITOK); - if (iterator == NULL) { - error = ENOMEM; - goto out2; - } - bzero(iterator, 2 * sizeof(*iterator)); - key = (HotFileKey*) &iterator->key; - - prev_iterator = &iterator[1]; - prev_key = (HotFileKey*) &prev_iterator->key; - - record.bufferAddress = &data; - record.itemSize = sizeof(data); - record.itemCount = 1; - prev_record.bufferAddress = &prev_data; - prev_record.itemSize = sizeof(prev_data); - prev_record.itemCount = 1; - - /* - * Capture b-tree changes inside a transaction - */ - if (hfs_start_transaction(hfsmp) != 0) { - error = EINVAL; - goto out2; - } - if (hfs_lock(VTOC(hfsmp->hfc_filevp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT) != 0) { - error = EPERM; - goto out1; - } - filefork = VTOF(hfsmp->hfc_filevp); - - error = BTGetInformation(filefork, 0, &btinfo); - if (error) { - error = MacToVFSError(error); - goto out; - } - if (btinfo.numRecords < 2) { - error = 0; - goto out; - } - - /* Only want 1st half of leaf records */ - numrecs = (btinfo.numRecords /= 2) - 1; - - error = BTIterateRecord(filefork, kBTreeFirstRecord, iterator, &record, &reclen); - if (error) { - printf("hfs_agehotfiles: BTIterateRecord: %d\n", error); - error = MacToVFSError(error); - goto out; - } - bcopy(iterator, prev_iterator, sizeof(BTreeIterator)); - prev_data = data; - - for (i = 0; i < numrecs; ++i) { - error = BTIterateRecord(filefork, kBTreeNextRecord, iterator, &record, &reclen); - 
if (error == 0) { - if (key->temperature < prev_key->temperature) { - printf("hfs_agehotfiles: out of order keys!\n"); - error = EFTYPE; - break; - } - if (reclen != sizeof(data)) { - printf("hfs_agehotfiles: invalid record length %d\n", reclen); - error = EFTYPE; - break; - } - if (key->keyLength != HFC_KEYLENGTH) { - printf("hfs_agehotfiles: invalid key length %d\n", key->keyLength); - error = EFTYPE; - break; - } - } else if ((error == fsBTEndOfIterationErr || error == fsBTRecordNotFoundErr) && - (i == (numrecs - 1))) { - error = 0; - } else if (error) { - printf("hfs_agehotfiles: %d of %d BTIterateRecord: %d\n", i, numrecs, error); - error = MacToVFSError(error); - break; - } - if (prev_key->temperature == HFC_LOOKUPTAG) { -#if HFC_VERBOSE - printf("hfs_agehotfiles: ran into thread record\n"); -#endif - error = 0; - break; - } - error = BTDeleteRecord(filefork, prev_iterator); - if (error) { - printf("hfs_agehotfiles: BTDeleteRecord failed %d (file %d)\n", error, prev_key->fileID); - error = MacToVFSError(error); - break; - } - - /* Age by halving the temperature (floor = 4) */ - newtemp = MAX(prev_key->temperature >> 1, 4); - prev_key->temperature = newtemp; - - error = BTInsertRecord(filefork, prev_iterator, &prev_record, prev_record.itemSize); - if (error) { - printf("hfs_agehotfiles: BTInsertRecord failed %d (file %d)\n", error, prev_key->fileID); - error = MacToVFSError(error); - break; - } - ++aged; - /* - * Update thread entry with latest temperature. 
- */ - prev_key->temperature = HFC_LOOKUPTAG; - error = BTUpdateRecord(filefork, prev_iterator, - (IterateCallBackProcPtr)update_callback, - &newtemp); - if (error) { - printf("hfs_agehotfiles: %d of %d BTUpdateRecord failed %d (file %d, %d)\n", - i, numrecs, error, prev_key->fileID, newtemp); - error = MacToVFSError(error); - // break; - } - - bcopy(iterator, prev_iterator, sizeof(BTreeIterator)); - prev_data = data; - - } /* end for */ - -#if HFC_VERBOSE - if (error == 0) - printf("hfs_agehotfiles: aged %d records out of %d\n", aged, btinfo.numRecords); -#endif - (void) BTFlushPath(filefork); -out: - hfs_unlock(VTOC(hfsmp->hfc_filevp)); -out1: - hfs_end_transaction(hfsmp); -out2: - if (iterator) - FREE(iterator, M_TEMP); - return (error); -} - -/* - * Return true if any blocks (or all blocks if all is true) - * are contained in the hot file region. - */ -static int -hotextents(struct hfsmount *hfsmp, HFSPlusExtentDescriptor * extents) -{ - u_int32_t b1, b2; - int i; - int inside = 0; - - for (i = 0; i < kHFSPlusExtentDensity; ++i) { - b1 = extents[i].startBlock; - if (b1 == 0) - break; - b2 = b1 + extents[i].blockCount - 1; - if ((b1 >= hfsmp->hfs_hotfile_start && - b2 <= hfsmp->hfs_hotfile_end) || - (b1 < hfsmp->hfs_hotfile_end && - b2 > hfsmp->hfs_hotfile_end)) { - inside = 1; - break; - } - } - return (inside); -} - - -/* - *======================================================================== - * HOT FILE B-TREE ROUTINES - *======================================================================== - */ - -/* - * Open the hot files b-tree for writing. - * - * On successful exit the vnode has a reference but not an iocount. 
- */ -static int -hfc_btree_open(struct hfsmount *hfsmp, struct vnode **vpp) -{ - return hfc_btree_open_ext(hfsmp, vpp, 0); -} - -static int -hfc_btree_open_ext(struct hfsmount *hfsmp, struct vnode **vpp, int ignore_btree_errs) -{ - proc_t p; - struct vnode *vp; - struct cat_desc cdesc; - struct cat_attr cattr; - struct cat_fork cfork; - static char filename[] = HFC_FILENAME; - int error; - int retry = 0; - int lockflags; - int newvnode_flags = 0; - - *vpp = NULL; - p = current_proc(); - - bzero(&cdesc, sizeof(cdesc)); - cdesc.cd_parentcnid = kRootDirID; - cdesc.cd_nameptr = (const u_int8_t *)filename; - cdesc.cd_namelen = strlen(filename); - - lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); - - error = cat_lookup(hfsmp, &cdesc, 0, 0, &cdesc, &cattr, &cfork, NULL); - - hfs_systemfile_unlock(hfsmp, lockflags); - - if (error) { - printf("hfs: hfc_btree_open: cat_lookup error %d\n", error); - return (error); - } -again: - cdesc.cd_flags |= CD_ISMETA; - error = hfs_getnewvnode(hfsmp, NULL, NULL, &cdesc, 0, &cattr, - &cfork, &vp, &newvnode_flags); - if (error) { - printf("hfs: hfc_btree_open: hfs_getnewvnode error %d\n", error); - cat_releasedesc(&cdesc); - return (error); - } - if (!vnode_issystem(vp)) { -#if HFC_VERBOSE - printf("hfs: hfc_btree_open: file has UBC, try again\n"); -#endif - hfs_unlock(VTOC(vp)); - vnode_recycle(vp); - vnode_put(vp); - if (retry++ == 0) - goto again; - else - return (EBUSY); - } - - /* Open the B-tree file for writing... 
*/ - error = BTOpenPath(VTOF(vp), (KeyCompareProcPtr) hfc_comparekeys); - if (error) { - if (!ignore_btree_errs) { - printf("hfs: hfc_btree_open: BTOpenPath error %d; filesize %lld\n", error, VTOF(vp)->ff_size); - error = MacToVFSError(error); - } else { - error = 0; - } - } - - hfs_unlock(VTOC(vp)); - if (error == 0) { - *vpp = vp; - vnode_ref(vp); /* keep a reference while its open */ - } - vnode_put(vp); - - if (!vnode_issystem(vp)) - panic("hfs: hfc_btree_open: not a system file (vp = %p)", vp); - - HotFilesInfo hotfileinfo; - - if (error == 0 && (hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN)) { - if ((BTGetUserData(VTOF(vp), &hotfileinfo, sizeof(hotfileinfo)) == 0) && (SWAP_BE32 (hotfileinfo.magic) == HFC_MAGIC)) { - if (hfsmp->hfs_hotfile_freeblks == 0) { - hfsmp->hfs_hotfile_freeblks = hfsmp->hfs_hotfile_maxblks - SWAP_BE32 (hotfileinfo.usedblocks); - } - - hfs_hotfile_cur_freeblks(hfsmp); // factors in any adjustments that happened at run-time - } - } - - return (error); -} - -/* - * Close the hot files b-tree. - * - * On entry the vnode has a reference. - */ -static int -hfc_btree_close(struct hfsmount *hfsmp, struct vnode *vp) -{ - proc_t p = current_proc(); - int error = 0; - - - if (hfsmp->jnl) { - hfs_flush(hfsmp, HFS_FLUSH_JOURNAL); - } - - if (vnode_get(vp) == 0) { - error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); - if (error == 0) { - (void) hfs_fsync(vp, MNT_WAIT, 0, p); - error = BTClosePath(VTOF(vp)); - hfs_unlock(VTOC(vp)); - } - vnode_rele(vp); - vnode_recycle(vp); - vnode_put(vp); - } - - return (error); -} - -// -// Assumes that hfsmp->hfc_filevp points to the hotfile btree vnode -// (i.e. you called hfc_btree_open() ahead of time) -// -static int -hfc_btree_delete_record(struct hfsmount *hfsmp, BTreeIterator *iterator, HotFileKey *key) -{ - int error; - filefork_t *filefork=VTOF(hfsmp->hfc_filevp); - - /* Start a new transaction before calling BTree code. 
*/ - if (hfs_start_transaction(hfsmp) != 0) { - return EINVAL; - } - - error = BTDeleteRecord(filefork, iterator); - if (error) { - error = MacToVFSError(error); - printf("hfs: failed to delete record for file-id %d : err %d\n", key->fileID, error); - goto out; - } - - int savedtemp; - savedtemp = key->temperature; - key->temperature = HFC_LOOKUPTAG; - error = BTDeleteRecord(filefork, iterator); - if (error) { - error = MacToVFSError(error); - printf("hfs:2: failed to delete record for file-id %d : err %d\n", key->fileID, error); - } - key->temperature = savedtemp; - - (void) BTFlushPath(filefork); - -out: - /* Transaction complete. */ - hfs_end_transaction(hfsmp); - - return error; -} - -// -// You have to have already opened the hotfile btree so -// that hfsmp->hfc_filevp is filled in. -// -static int -hfc_btree_delete(struct hfsmount *hfsmp) -{ - struct vnode *dvp = NULL; - vfs_context_t ctx = vfs_context_current(); - struct vnode_attr va; - struct componentname cname; - static char filename[] = HFC_FILENAME; - int error; - - error = VFS_ROOT(HFSTOVFS(hfsmp), &dvp, ctx); - if (error) { - return (error); - } - cname.cn_nameiop = DELETE; - cname.cn_flags = ISLASTCN; - cname.cn_context = ctx; - cname.cn_pnbuf = filename; - cname.cn_pnlen = sizeof(filename); - cname.cn_nameptr = filename; - cname.cn_namelen = strlen(filename); - cname.cn_hash = 0; - cname.cn_consume = 0; - - VATTR_INIT(&va); - VATTR_SET(&va, va_type, VREG); - VATTR_SET(&va, va_mode, S_IFREG | S_IRUSR | S_IWUSR); - VATTR_SET(&va, va_uid, 0); - VATTR_SET(&va, va_gid, 0); - - if (hfs_start_transaction(hfsmp) != 0) { - error = EINVAL; - goto out; - } - - /* call ourselves directly, ignore the higher-level VFS file creation code */ - error = VNOP_REMOVE(dvp, hfsmp->hfc_filevp, &cname, 0, ctx); - if (error) { - printf("hfs: error %d removing HFBT on %s\n", error, HFSTOVCB(hfsmp)->vcbVN); - } - - hfs_end_transaction(hfsmp); - -out: - if (dvp) { - vnode_put(dvp); - dvp = NULL; - } - - return 0; -} - - - - 
-/* - * Create a hot files btree file. - * - */ -static int -hfc_btree_create(struct hfsmount *hfsmp, unsigned int nodesize, unsigned int entries) -{ - struct vnode *dvp = NULL; - struct vnode *vp = NULL; - struct cnode *cp = NULL; - vfs_context_t ctx = vfs_context_current(); - struct vnode_attr va; - struct componentname cname; - static char filename[] = HFC_FILENAME; - int error; - - if (hfsmp->hfc_filevp) - panic("hfs: hfc_btree_create: hfc_filevp exists (vp = %p)", hfsmp->hfc_filevp); - - error = VFS_ROOT(HFSTOVFS(hfsmp), &dvp, ctx); - if (error) { - return (error); - } - cname.cn_nameiop = CREATE; - cname.cn_flags = ISLASTCN; - cname.cn_context = ctx; - cname.cn_pnbuf = filename; - cname.cn_pnlen = sizeof(filename); - cname.cn_nameptr = filename; - cname.cn_namelen = strlen(filename); - cname.cn_hash = 0; - cname.cn_consume = 0; - - VATTR_INIT(&va); - VATTR_SET(&va, va_type, VREG); - VATTR_SET(&va, va_mode, S_IFREG | S_IRUSR | S_IWUSR); - VATTR_SET(&va, va_uid, 0); - VATTR_SET(&va, va_gid, 0); - - if (hfs_start_transaction(hfsmp) != 0) { - error = EINVAL; - goto out; - } - - /* call ourselves directly, ignore the higher-level VFS file creation code */ - error = VNOP_CREATE(dvp, &vp, &cname, &va, ctx); - if (error) { - printf("hfs: error %d creating HFBT on %s\n", error, HFSTOVCB(hfsmp)->vcbVN); - goto out; - } - if (dvp) { - vnode_put(dvp); - dvp = NULL; - } - if ((error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) { - goto out; - } - cp = VTOC(vp); - - /* Don't use non-regular files or files with links. */ - if (!vnode_isreg(vp) || cp->c_linkcount != 1) { - error = EFTYPE; - goto out; - } - - printf("hfs: created HFBT on %s\n", HFSTOVCB(hfsmp)->vcbVN); - - if (VTOF(vp)->ff_size < nodesize) { - caddr_t buffer; - u_int16_t *index; - u_int16_t offset; - BTNodeDescriptor *ndp; - BTHeaderRec *bthp; - HotFilesInfo *hotfileinfo; - int nodecnt; - int filesize; - int entirespernode; - - /* - * Mark it invisible (truncate will pull these changes). 
- */ - ((FndrFileInfo *)&cp->c_finderinfo[0])->fdFlags |= - SWAP_BE16 (kIsInvisible + kNameLocked); - - if (kmem_alloc(kernel_map, (vm_offset_t *)&buffer, nodesize, VM_KERN_MEMORY_FILE)) { - error = ENOMEM; - goto out; - } - bzero(buffer, nodesize); - index = (u_int16_t *)buffer; - - entirespernode = (nodesize - sizeof(BTNodeDescriptor) - 2) / - (sizeof(HotFileKey) + 6); - nodecnt = 2 + howmany(entries * 2, entirespernode); - nodecnt = roundup(nodecnt, 8); - filesize = nodecnt * nodesize; - - /* FILL IN THE NODE DESCRIPTOR: */ - ndp = (BTNodeDescriptor *)buffer; - ndp->kind = kBTHeaderNode; - ndp->numRecords = SWAP_BE16 (3); - offset = sizeof(BTNodeDescriptor); - index[(nodesize / 2) - 1] = SWAP_BE16 (offset); - - /* FILL IN THE HEADER RECORD: */ - bthp = (BTHeaderRec *)((u_int8_t *)buffer + offset); - bthp->nodeSize = SWAP_BE16 (nodesize); - bthp->totalNodes = SWAP_BE32 (filesize / nodesize); - bthp->freeNodes = SWAP_BE32 (nodecnt - 1); - bthp->clumpSize = SWAP_BE32 (filesize); - bthp->btreeType = kUserBTreeType; /* non-metadata */ - bthp->attributes |= SWAP_BE32 (kBTBigKeysMask); - bthp->maxKeyLength = SWAP_BE16 (HFC_KEYLENGTH); - offset += sizeof(BTHeaderRec); - index[(nodesize / 2) - 2] = SWAP_BE16 (offset); - - /* FILL IN THE USER RECORD: */ - hotfileinfo = (HotFilesInfo *)((u_int8_t *)buffer + offset); - hotfileinfo->magic = SWAP_BE32 (HFC_MAGIC); - hotfileinfo->version = SWAP_BE32 (HFC_VERSION); - hotfileinfo->duration = SWAP_BE32 (HFC_DEFAULT_DURATION); - hotfileinfo->timebase = 0; - hotfileinfo->timeleft = 0; - hotfileinfo->threshold = SWAP_BE32 (HFC_MINIMUM_TEMPERATURE); - hotfileinfo->maxfileblks = SWAP_BE32 (HFC_MAXIMUM_FILESIZE / HFSTOVCB(hfsmp)->blockSize); - if (hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) { - if (hfsmp->hfs_hotfile_freeblks == 0) { - hfsmp->hfs_hotfile_freeblks = hfsmp->hfs_hotfile_maxblks; - } - hotfileinfo->usedblocks = SWAP_BE32 (hfsmp->hfs_hotfile_maxblks - hfsmp->hfs_hotfile_freeblks); - } else { - hotfileinfo->maxfilecnt = SWAP_BE32 
(HFC_DEFAULT_FILE_COUNT); - } - strlcpy((char *)hotfileinfo->tag, hfc_tag, - sizeof hotfileinfo->tag); - offset += kBTreeHeaderUserBytes; - index[(nodesize / 2) - 3] = SWAP_BE16 (offset); - - /* FILL IN THE MAP RECORD (only one node in use). */ - *((u_int8_t *)buffer + offset) = 0x80; - offset += nodesize - sizeof(BTNodeDescriptor) - sizeof(BTHeaderRec) - - kBTreeHeaderUserBytes - (4 * sizeof(int16_t)); - index[(nodesize / 2) - 4] = SWAP_BE16 (offset); - - vnode_setnoflush(vp); - error = hfs_truncate(vp, (off_t)filesize, IO_NDELAY, 0, ctx); - if (error) { - printf("hfs: error %d growing HFBT on %s\n", error, HFSTOVCB(hfsmp)->vcbVN); - goto out; - } - cp->c_flag |= C_ZFWANTSYNC; - cp->c_zftimeout = 1; - - if (error == 0) { - struct vnop_write_args args; - uio_t auio; - - auio = uio_create(1, 0, UIO_SYSSPACE, UIO_WRITE); - uio_addiov(auio, (uintptr_t)buffer, nodesize); - - args.a_desc = &vnop_write_desc; - args.a_vp = vp; - args.a_uio = auio; - args.a_ioflag = 0; - args.a_context = ctx; - - hfs_unlock(cp); - cp = NULL; - - error = hfs_vnop_write(&args); - if (error) - printf("hfs: error %d writing HFBT on %s\n", error, HFSTOVCB(hfsmp)->vcbVN); - - uio_free(auio); - } - kmem_free(kernel_map, (vm_offset_t)buffer, nodesize); - } -out: - hfs_end_transaction(hfsmp); - if (dvp) { - vnode_put(dvp); - } - if (vp) { - if (cp) - hfs_unlock(cp); - vnode_recycle(vp); - vnode_put(vp); - } - return (error); -} - -/* - * Compare two hot file b-tree keys. - * - * Result: +n search key > trial key - * 0 search key = trial key - * -n search key < trial key - */ -static int -hfc_comparekeys(HotFileKey *searchKey, HotFileKey *trialKey) -{ - /* - * Compared temperatures first. - */ - if (searchKey->temperature == trialKey->temperature) { - /* - * Temperatures are equal so compare file ids. - */ - if (searchKey->fileID == trialKey->fileID) { - /* - * File ids are equal so compare fork types. 
- */ - if (searchKey->forkType == trialKey->forkType) { - return (0); - } else if (searchKey->forkType > trialKey->forkType) { - return (1); - } - } else if (searchKey->fileID > trialKey->fileID) { - return (1); - } - } else if (searchKey->temperature > trialKey->temperature) { - return (1); - } - - return (-1); -} - - -/* - *======================================================================== - * HOT FILE DATA COLLECTING ROUTINES - *======================================================================== - */ - -/* - * Lookup a hot file entry in the tree. - */ -#if HFC_DEBUG -static hotfile_entry_t * -hf_lookup(hotfile_data_t *hotdata, u_int32_t fileid, u_int32_t temperature) -{ - hotfile_entry_t *entry = hotdata->rootentry; - - while (entry && - entry->temperature != temperature && - entry->fileid != fileid) { - - if (temperature > entry->temperature) - entry = entry->right; - else if (temperature < entry->temperature) - entry = entry->left; - else if (fileid > entry->fileid) - entry = entry->right; - else - entry = entry->left; - } - return (entry); -} -#endif - -/* - * Insert a hot file entry into the tree. 
- */ -static int -hf_insert(hotfile_data_t *hotdata, hotfile_entry_t *newentry) -{ - hotfile_entry_t *entry = hotdata->rootentry; - u_int32_t fileid = newentry->fileid; - u_int32_t temperature = newentry->temperature; - - if (entry == NULL) { - hotdata->rootentry = newentry; - hotdata->coldest = newentry; - hotdata->activefiles++; - return 0; - } - - while (entry) { - if (temperature > entry->temperature) { - if (entry->right) { - entry = entry->right; - } else { - entry->right = newentry; - break; - } - } else if (temperature < entry->temperature) { - if (entry->left) { - entry = entry->left; - } else { - entry->left = newentry; - break; - } - } else if (fileid > entry->fileid) { - if (entry->right) { - entry = entry->right; - } else { - if (entry->fileid != fileid) - entry->right = newentry; - break; - } - } else { - if (entry->left) { - entry = entry->left; - } else { - if (entry->fileid != fileid) { - entry->left = newentry; - } else { - return EEXIST; - } - break; - } - } - } - - hotdata->activefiles++; - return 0; -} - -/* - * Find the coldest entry in the tree. - */ -static hotfile_entry_t * -hf_coldest(hotfile_data_t *hotdata) -{ - hotfile_entry_t *entry = hotdata->rootentry; - - if (entry) { - while (entry->left) - entry = entry->left; - } - return (entry); -} - -/* - * Find the hottest entry in the tree. - */ -static hotfile_entry_t * -hf_hottest(hotfile_data_t *hotdata) -{ - hotfile_entry_t *entry = hotdata->rootentry; - - if (entry) { - while (entry->right) - entry = entry->right; - } - return (entry); -} - -/* - * Delete a hot file entry from the tree. 
- */ -static void -hf_delete(hotfile_data_t *hotdata, u_int32_t fileid, u_int32_t temperature) -{ - hotfile_entry_t *entry, *parent, *next; - - parent = NULL; - entry = hotdata->rootentry; - - while (entry && - entry->temperature != temperature && - entry->fileid != fileid) { - - parent = entry; - if (temperature > entry->temperature) - entry = entry->right; - else if (temperature < entry->temperature) - entry = entry->left; - else if (fileid > entry->fileid) - entry = entry->right; - else - entry = entry->left; - } - - if (entry) { - /* - * Reorganize the sub-trees spanning from our entry. - */ - if ((next = entry->right)) { - hotfile_entry_t *pnextl, *psub; - /* - * Tree pruning: take the left branch of the - * current entry and place it at the lowest - * left branch of the current right branch - */ - psub = next; - - /* Walk the Right/Left sub tree from current entry */ - while ((pnextl = psub->left)) - psub = pnextl; - - /* Plug the old left tree to the new ->Right leftmost entry */ - psub->left = entry->left; - - } else /* only left sub-tree, simple case */ { - next = entry->left; - } - /* - * Now, plug the current entry sub tree to - * the good pointer of our parent entry. - */ - if (parent == NULL) - hotdata->rootentry = next; - else if (parent->left == entry) - parent->left = next; - else - parent->right = next; - - /* Place entry back on the free-list */ - entry->left = 0; - entry->fileid = 0; - entry->temperature = 0; - - entry->right = hotdata->freelist; - hotdata->freelist = entry; - hotdata->activefiles--; - - if (hotdata->coldest == entry || hotdata->coldest == NULL) { - hotdata->coldest = hf_coldest(hotdata); - } - - } -} - -/* - * Get a free hot file entry. 
- */ -static hotfile_entry_t * -hf_getnewentry(hotfile_data_t *hotdata) -{ - hotfile_entry_t * entry; - - /* - * When the free list is empty then steal the coldest one - */ - if (hotdata->freelist == NULL) { - entry = hf_coldest(hotdata); - hf_delete(hotdata, entry->fileid, entry->temperature); - } - entry = hotdata->freelist; - hotdata->freelist = entry->right; - entry->right = 0; - - return (entry); -} - - -/* - * Generate a sorted list of hot files (hottest to coldest). - * - * As a side effect, every node in the hot file tree will be - * deleted (moved to the free list). - */ -static void -hf_getsortedlist(hotfile_data_t * hotdata, hotfilelist_t *sortedlist) -{ - int i = 0; - hotfile_entry_t *entry; - - while ((entry = hf_hottest(hotdata)) != NULL) { - sortedlist->hfl_hotfile[i].hf_fileid = entry->fileid; - sortedlist->hfl_hotfile[i].hf_temperature = entry->temperature; - sortedlist->hfl_hotfile[i].hf_blocks = entry->blocks; - sortedlist->hfl_totalblocks += entry->blocks; - ++i; - - hf_delete(hotdata, entry->fileid, entry->temperature); - } - - sortedlist->hfl_count = i; - -#if HFC_VERBOSE - printf("hfs: hf_getsortedlist returning %d entries w/%d total blocks\n", i, sortedlist->hfl_totalblocks); -#endif -} - - -#if HFC_DEBUG -static void -hf_maxdepth(hotfile_entry_t * root, int depth, int *maxdepth) -{ - if (root) { - depth++; - if (depth > *maxdepth) - *maxdepth = depth; - hf_maxdepth(root->left, depth, maxdepth); - hf_maxdepth(root->right, depth, maxdepth); - } -} - -static void -hf_printtree(hotfile_entry_t * root) -{ - if (root) { - hf_printtree(root->left); - printf("hfs: temperature: % 8d, fileid %d\n", root->temperature, root->fileid); - hf_printtree(root->right); - } -} -#endif diff --git a/bsd/hfs/hfs_hotfiles.h b/bsd/hfs/hfs_hotfiles.h deleted file mode 100644 index 7d8681954..000000000 --- a/bsd/hfs/hfs_hotfiles.h +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright (c) 2003, 2005 Apple Computer, Inc. All rights reserved. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -#ifndef __HFS_HOTFILES__ -#define __HFS_HOTFILES__ - -#include - -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE - - -#define HFC_FILENAME ".hotfiles.btree" - - -/* - * Temperature measurement constraints. - */ -#define HFC_DEFAULT_FILE_COUNT hfc_default_file_count -#define HFC_DEFAULT_DURATION hfc_default_duration -#define HFC_CUMULATIVE_CYCLES 3 -#define HFC_MAXIMUM_FILE_COUNT hfc_max_file_count -#define HFC_MAXIMUM_FILESIZE hfc_max_file_size -#define HFC_MINIMUM_TEMPERATURE 24 - - -/* - * Sync constraints. - */ -#define HFC_BLKSPERSYNC 300 -#define HFC_FILESPERSYNC 50 - - -/* - * Hot file clustering stages. 
- */ -enum hfc_stage { - HFC_DISABLED, - HFC_IDLE, - HFC_BUSY, - HFC_RECORDING, - HFC_EVALUATION, - HFC_EVICTION, - HFC_ADOPTION, -}; - - -/* - * B-tree file key format (on-disk). - */ -struct HotFileKey { - u_int16_t keyLength; /* length of key, excluding this field */ - u_int8_t forkType; /* 0 = data fork, FF = resource fork */ - u_int8_t pad; /* make the other fields align on 32-bit boundary */ - u_int32_t temperature; /* temperature recorded */ - u_int32_t fileID; /* file ID */ -}; -typedef struct HotFileKey HotFileKey; - -#define HFC_LOOKUPTAG 0xFFFFFFFF -#define HFC_KEYLENGTH (sizeof(HotFileKey) - sizeof(u_int16_t)) - -/* - * B-tree header node user info (on-disk). - */ -struct HotFilesInfo { - u_int32_t magic; - u_int32_t version; - u_int32_t duration; /* duration of sample period (secs) */ - u_int32_t timebase; /* start of recording period (GMT time in secs) */ - u_int32_t timeleft; /* time remaining in recording period (secs) */ - u_int32_t threshold; - u_int32_t maxfileblks; - union { - u_int32_t _maxfilecnt; // on hdd's we track the max # of files - u_int32_t _usedblocks; // on ssd's we track how many blocks are used - } _u; - u_int8_t tag[32]; -}; - -#define usedblocks _u._usedblocks -#define maxfilecnt _u._maxfilecnt - -typedef struct HotFilesInfo HotFilesInfo; - -#define HFC_MAGIC 0xFF28FF26 -#define HFC_VERSION 1 - - -struct hfsmount; -struct proc; -struct vnode; - -/* - * Hot File interface functions. 
- */ -int hfs_hotfilesync (struct hfsmount *, vfs_context_t ctx); - -int hfs_recording_init(struct hfsmount *); -int hfs_recording_suspend (struct hfsmount *); - -int hfs_addhotfile (struct vnode *); -int hfs_removehotfile (struct vnode *); -int hfs_hotfile_deleted(struct vnode *vp); // called when a file is deleted -void hfs_repin_hotfiles(struct hfsmount *); - -// call this to adjust the number of used hotfile blocks either up/down -int hfs_hotfile_adjust_blocks(struct vnode *vp, int64_t num_blocks); - -#endif /* __APPLE_API_PRIVATE */ -#endif /* KERNEL */ -#endif /* __HFS_HOTFILES__ */ diff --git a/bsd/hfs/hfs_kdebug.h b/bsd/hfs/hfs_kdebug.h deleted file mode 100644 index 827fc4f29..000000000 --- a/bsd/hfs/hfs_kdebug.h +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright (c) 2014 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
- * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -#ifndef HFS_KDEBUG_H_ -#define HFS_KDEBUG_H_ - -#include - -/* - * KERNEL_DEBUG related definitions for HFS. - * - * NOTE: The Class DBG_FSYSTEM = 3, and Subclass DBG_HFS = 8, so these - * debug codes are of the form 0x0308nnnn. - */ -#define HFSDBG_CODE(code) FSDBG_CODE(DBG_HFS, code) - -enum { - HFSDBG_WRITE = FSDBG_CODE(DBG_FSRW, 0), /* 0x3010000 */ - HFSDBG_TRUNCATE = FSDBG_CODE(DBG_FSRW, 7), /* 0x301001C */ - HFSDBG_READ = FSDBG_CODE(DBG_FSRW, 12), /* 0x3010030 */ - HFSDBG_GETNEWVNODE = FSDBG_CODE(DBG_FSRW, 37), /* 0x3010094 */ - HFSDBG_UPDATE = FSDBG_CODE(DBG_FSRW, 8192), /* 0x3018000 */ - HFSDBG_UNMAP_FREE = HFSDBG_CODE(0), /* 0x03080000 */ - HFSDBG_UNMAP_ALLOC = HFSDBG_CODE(1), /* 0x03080004 */ - HFSDBG_UNMAP_CALLBACK = HFSDBG_CODE(2), /* 0x03080008 */ - /* 0x0308000C is unused */ - HFSDBG_BLOCK_ALLOCATE = HFSDBG_CODE(4), /* 0x03080010 */ - HFSDBG_BLOCK_DEALLOCATE = HFSDBG_CODE(5), /* 0x03080014 */ - HFSDBG_READ_BITMAP_BLOCK = HFSDBG_CODE(6), /* 0x03080018 */ - HFSDBG_RELEASE_BITMAP_BLOCK = HFSDBG_CODE(7), /* 0x0308001C */ - HFSDBG_FIND_CONTIG_BITMAP = HFSDBG_CODE(8), /* 0x03080020 */ - HFSDBG_ALLOC_ANY_BITMAP = HFSDBG_CODE(9), /* 0x03080024 */ - HFSDBG_ALLOC_FIND_KNOWN = HFSDBG_CODE(10), /* 0x03080028 */ - HFSDBG_MARK_ALLOC_BITMAP = HFSDBG_CODE(11), /* 0x0308002C */ - HFSDBG_MARK_FREE_BITMAP = HFSDBG_CODE(12), /* 0x03080030 */ - HFSDBG_BLOCK_FIND_CONTIG = HFSDBG_CODE(13), /* 0x03080034 */ - HFSDBG_IS_ALLOCATED = HFSDBG_CODE(14), /* 0x03080038 */ - /* 0x0308003C is unused */ - HFSDBG_RESET_EXTENT_CACHE = HFSDBG_CODE(16), /* 0x03080040 */ - HFSDBG_REMOVE_EXTENT_CACHE = HFSDBG_CODE(17), /* 0x03080044 */ - HFSDBG_ADD_EXTENT_CACHE = HFSDBG_CODE(18), /* 0x03080048 */ - HFSDBG_READ_BITMAP_RANGE = HFSDBG_CODE(19), /* 0x0308004C */ - HFSDBG_RELEASE_SCAN_BITMAP = HFSDBG_CODE(20), /* 0x03080050 */ - 
HFSDBG_SYNCER = HFSDBG_CODE(21), /* 0x03080054 */ - HFSDBG_SYNCER_TIMED = HFSDBG_CODE(22), /* 0x03080058 */ - HFSDBG_UNMAP_SCAN = HFSDBG_CODE(23), /* 0x0308005C */ - HFSDBG_UNMAP_SCAN_TRIM = HFSDBG_CODE(24), /* 0x03080060 */ -}; - -/* - Parameters logged by the above tracepoints: ---------------------------------------------------------------------------------------------------------------------------------- - CODE EVENT NAME DBG_FUNC_START arg1, arg2, arg3, arg4, arg5 ... DBG_FUNC_END arg1, arg2, arg3, arg4, arg5 - DBG_FUNC_NONE arg1, arg2, arg3, arg4, arg5 ---------------------------------------------------------------------------------------------------------------------------------- -0x3010000 HFSDBG_WRITE offset, uio_resid, ff_size, filebytes, 0 ... uio_offset, uio_resid, ff_size, filebytes, 0 - offset, uio_resid, ff_size, filebytes, 0 -0x301001C HFSDBG_TRUNCATE length, ff_size, filebytes, 0, 0 ... length, ff_size, filebytes, retval, 0 - length, ff_size, filebytes, 0, 0 -0x3010030 HFSDBG_READ uio_offset, uio_resid, filesize, filebytes, 0 ... uio_offset, uio_resid, filesize, filebytes, 0 -0x3010094 HFSDBG_GETNEWVNODE c_vp, c_rsrc_vp, 0, 0, 0 -0x3018000 HFSDBG_UPDATE vp, tstate, 0, 0, 0 ... vp, tstate, error, 0/-1, 0 - 0 HFSDBG_UNMAP_FREE startBlock, blockCount, 0, 0, 0 ... err, 0, 0, 0, 0 - 1 HFSDBG_UNMAP_ALLOC startBlock, blockCount, 0, 0, 0 ... err, 0, 0, 0, 0 - 2 HFSDBG_UNMAP_CALLBACK 0, extentCount, 0, 0, 0 ... 0, 0, 0, 0, 0 - 3 unused - 4 HFSDBG_BLOCK_ALLOCATE startBlock, minBlocks, maxBlocks, flags, 0 ... err, actualStartBlock, actualBlockCount, 0, 0 - 5 HFSDBG_BLOCK_DEALLOCATE startBlock, blockCount, flags, 0, 0 ... err, 0, 0, 0, 0 - 6 HFSDBG_READ_BITMAP_BLOCK startBlock, 0, 0, 0, 0 ... err, 0, 0, 0, 0 - 7 HFSDBG_RELEASE_BITMAP_BLOCK dirty, 0, 0, 0, 0 ... 0, 0, 0, 0, 0 - 8 HFSDBG_FIND_CONTIG_BITMAP startBlock, minBlocks, maxBlocks, useMeta, 0 ... 
err, actualStartBlock, actualBlockCount, 0, 0 - 9 HFSDBG_ALLOC_ANY_BITMAP startBlock, endBlock, maxBlocks, useMeta, 0 ... err, actualStartBlock, actualBlockCount, 0, 0 - 10 HFSDBG_ALLOC_FIND_KNOWN 0, 0, maxBlocks, 0, 0 ... err, actualStartBlock, actualBlockCount, 0, 0 - 11 HFSDBG_MARK_ALLOC_BITMAP startBlock, blockCount, flags, 0, 0 ... err, 0, 0, 0, 0 - 12 HFSDBG_MARK_FREE_BITMAP startBlock, blockCount, valid, 0, 0 ... err, 0, 0, 0, 0 - 13 HFSDBG_BLOCK_FIND_CONTIG startBlock, endBlock, minBlocks, maxBlocks, 0 ... err, actualStartBlock, actualBlockCount, 0, 0 - 14 HFSDBG_IS_ALLOCATED startBlock, blockCount, stop, 0, 0 ... err, 0, actualBlockCount, 0, 0 - 15 unused - 16 HFSDBG_RESET_EXTENT_CACHE 0, 0, 0, 0, 0 ... 0, 0, 0, 0, 0 - 17 HFSDBG_REMOVE_EXTENT_CACHE startBlock, blockCount, vcbFreeExtCnt, 0, 0 ... 0, 0, vcbFreeExtCnt, extentsRemoved, 0 - 18 HFSDBG_ADD_EXTENT_CACHE startBlock, blockCount, vcbFreeExtCnt, 0, 0 ... 0, 0, vcbFreeExtCnt, retval, 0 - 19 HFSDBG_READ_BITMAP_RANGE startBlock, iosize, 0, 0, 0 ... err, 0, 0, 0, 0 - 20 HFSDBG_RELEASE_SCAN_BITMAP 0, 0, 0, 0, 0 ... 0, 0, 0, 0, 0 - 21 HFSDBG_SYNCER hfsmp, now, mnt_last_write_completed_timestamp, mnt_pending_write_size, 0 ... err, deadline, 0, 0, 0 - 22 HFSDBG_SYNCER_TIMED now, last_write_completed, hfs_mp->mnt_last_write_issued_timestamp, mnt_pending_write_size, 0 ... now, mnt_last_write_completed_timestamp, mnt_last_write_issued_timestamp, hfs_mp->mnt_pending_write_size, 0 - 23 HFSDBG_UNMAP_SCAN hfs_raw_dev, 0, 0, 0, 0 ... hfs_raw_dev, error, 0, 0, 0 - 24 HFSDBG_UNMAP_TRIM hfs_raw_dev, 0, 0, 0, 0 ... hfs_raw_dev, error, 0, 0, 0 -*/ - -#endif // HFS_KDEBUG_H_ diff --git a/bsd/hfs/hfs_link.c b/bsd/hfs/hfs_link.c deleted file mode 100644 index 2dd7fda4b..000000000 --- a/bsd/hfs/hfs_link.c +++ /dev/null @@ -1,1431 +0,0 @@ -/* - * Copyright (c) 1999-2015 Apple Inc. All rights reserved. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "hfs.h" -#include "hfs_catalog.h" -#include "hfs_format.h" -#include "hfs_endian.h" - - -static int cur_link_id = 0; - -/* - * Private directories where hardlink inodes reside. - */ -const char *hfs_private_names[] = { - HFSPLUSMETADATAFOLDER, /* FILE HARDLINKS */ - HFSPLUS_DIR_METADATA_FOLDER /* DIRECTORY HARDLINKS */ -}; - - -/* - * Hardlink inodes save the head of their link chain in a - * private extended attribute. The following calls are - * used to access this attribute. 
- */ -static int setfirstlink(struct hfsmount * hfsmp, cnid_t fileid, cnid_t firstlink); -static int getfirstlink(struct hfsmount * hfsmp, cnid_t fileid, cnid_t *firstlink); - -int hfs_makelink(struct hfsmount *hfsmp, struct vnode *src_vp, struct cnode *cp, - struct cnode *dcp, struct componentname *cnp); -/* - * Create a new catalog link record - * - * An indirect link is a reference to an inode (the real - * file or directory record). - * - * All the indirect links for a given inode are chained - * together in a doubly linked list. - * - * Pre-Leopard file hard links do not have kHFSHasLinkChainBit - * set and do not have first/prev/next link IDs i.e. the values - * are zero. If a new link is being added to an existing - * pre-Leopard file hard link chain, do not set kHFSHasLinkChainBit. - */ -static int -createindirectlink(struct hfsmount *hfsmp, u_int32_t linknum, struct cat_desc *descp, - cnid_t nextcnid, cnid_t *linkcnid, int is_inode_linkchain_set) -{ - struct FndrFileInfo *fip; - struct cat_attr attr; - - if (linknum == 0) { - printf("hfs: createindirectlink: linknum is zero!\n"); - return (EINVAL); - } - - /* Setup the default attributes */ - bzero(&attr, sizeof(attr)); - - /* Links are matched to inodes by link ID and to volumes by create date */ - attr.ca_linkref = linknum; - attr.ca_itime = hfsmp->hfs_metadata_createdate; - attr.ca_mode = S_IFREG | S_IRUSR | S_IRGRP | S_IROTH; - attr.ca_recflags = kHFSHasLinkChainMask | kHFSThreadExistsMask; - attr.ca_flags = UF_IMMUTABLE; - fip = (struct FndrFileInfo *)&attr.ca_finderinfo; - - if (descp->cd_flags & CD_ISDIR) { - fip->fdType = SWAP_BE32 (kHFSAliasType); - fip->fdCreator = SWAP_BE32 (kHFSAliasCreator); - fip->fdFlags = SWAP_BE16 (kIsAlias); - } else /* file */ { - fip->fdType = SWAP_BE32 (kHardLinkFileType); - fip->fdCreator = SWAP_BE32 (kHFSPlusCreator); - fip->fdFlags = SWAP_BE16 (kHasBeenInited); - /* If the file inode does not have kHFSHasLinkChainBit set - * and the next link chain ID is zero, 
assume that this - * is pre-Leopard file inode. Therefore clear the bit. - */ - if ((is_inode_linkchain_set == 0) && (nextcnid == 0)) { - attr.ca_recflags &= ~kHFSHasLinkChainMask; - } - } - /* Create the indirect link directly in the catalog */ - return cat_createlink(hfsmp, descp, &attr, nextcnid, linkcnid); -} - - -/* - * Make a link to the cnode cp in the directory dp - * using the name in cnp. src_vp is the vnode that - * corresponds to 'cp' which was part of the arguments to - * hfs_vnop_link. - * - * The cnodes cp and dcp must be locked. - */ -int -hfs_makelink(struct hfsmount *hfsmp, struct vnode *src_vp, struct cnode *cp, - struct cnode *dcp, struct componentname *cnp) -{ - vfs_context_t ctx = cnp->cn_context; - struct proc *p = vfs_context_proc(ctx); - u_int32_t indnodeno = 0; - char inodename[32]; - struct cat_desc to_desc; - struct cat_desc link_desc; - int newlink = 0; - int lockflags; - int retval = 0; - cat_cookie_t cookie; - cnid_t orig_cnid; - cnid_t linkcnid; - cnid_t orig_firstlink; - enum privdirtype type; - - type = S_ISDIR(cp->c_mode) ? DIR_HARDLINKS : FILE_HARDLINKS; - - if (cur_link_id == 0) { - cur_link_id = ((random() & 0x3fffffff) + 100); - } - - /* We don't allow link nodes in our private system directories. */ - if (dcp->c_fileid == hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid || - dcp->c_fileid == hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid) { - return (EPERM); - } - - bzero(&cookie, sizeof(cat_cookie_t)); - /* Reserve some space in the Catalog file. */ - if ((retval = cat_preflight(hfsmp, (2 * CAT_CREATE)+ CAT_RENAME, &cookie, p))) { - return (retval); - } - - lockflags = SFL_CATALOG | SFL_ATTRIBUTE; - /* Directory hard links allocate space for a symlink. */ - if (type == DIR_HARDLINKS) { - lockflags |= SFL_BITMAP; - } - lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); - - /* Save the current cnid value so we restore it if an error occurs. 
*/ - orig_cnid = cp->c_desc.cd_cnid; - - /* - * If this is a new hardlink then we need to create the inode - * and replace the original file/dir object with a link node. - */ - if ((cp->c_linkcount == 2) && !(cp->c_flag & C_HARDLINK)) { - newlink = 1; - bzero(&to_desc, sizeof(to_desc)); - to_desc.cd_parentcnid = hfsmp->hfs_private_desc[type].cd_cnid; - to_desc.cd_cnid = cp->c_fileid; - to_desc.cd_flags = (type == DIR_HARDLINKS) ? CD_ISDIR : 0; - - do { - if (type == DIR_HARDLINKS) { - /* Directory hardlinks always use the cnid. */ - indnodeno = cp->c_fileid; - MAKE_DIRINODE_NAME(inodename, sizeof(inodename), - indnodeno); - } else { - /* Get a unique indirect node number */ - if (retval == 0) { - indnodeno = cp->c_fileid; - } else { - indnodeno = cur_link_id++; - } - MAKE_INODE_NAME(inodename, sizeof(inodename), - indnodeno); - } - /* Move original file/dir to data node directory */ - to_desc.cd_nameptr = (const u_int8_t *)inodename; - to_desc.cd_namelen = strlen(inodename); - - retval = cat_rename(hfsmp, &cp->c_desc, &hfsmp->hfs_private_desc[type], - &to_desc, NULL); - - if (retval != 0 && retval != EEXIST) { - printf("hfs_makelink: cat_rename to %s failed (%d) fileid=%d, vol=%s\n", - inodename, retval, cp->c_fileid, hfsmp->vcbVN); - } - } while ((retval == EEXIST) && (type == FILE_HARDLINKS)); - if (retval) - goto out; - - /* - * Replace original file/dir with a link record. - */ - - bzero(&link_desc, sizeof(link_desc)); - link_desc.cd_nameptr = cp->c_desc.cd_nameptr; - link_desc.cd_namelen = cp->c_desc.cd_namelen; - link_desc.cd_parentcnid = cp->c_parentcnid; - link_desc.cd_flags = S_ISDIR(cp->c_mode) ? CD_ISDIR : 0; - - retval = createindirectlink(hfsmp, indnodeno, &link_desc, 0, &linkcnid, true); - if (retval) { - int err; - - /* Restore the cnode's cnid. */ - cp->c_desc.cd_cnid = orig_cnid; - - /* Put the original file back. 
*/ - err = cat_rename(hfsmp, &to_desc, &dcp->c_desc, &cp->c_desc, NULL); - if (err) { - if (err != EIO && err != ENXIO) - printf("hfs_makelink: error %d from cat_rename backout 1", err); - hfs_mark_inconsistent(hfsmp, HFS_ROLLBACK_FAILED); - } - if (retval != EIO && retval != ENXIO) { - printf("hfs_makelink: createindirectlink (1) failed: %d\n", retval); - retval = EIO; - } - goto out; - } - cp->c_attr.ca_linkref = indnodeno; - cp->c_desc.cd_cnid = linkcnid; - /* Directory hard links store the first link in an attribute. */ - if (type == DIR_HARDLINKS) { - if (setfirstlink(hfsmp, cp->c_fileid, linkcnid) == 0) - cp->c_attr.ca_recflags |= kHFSHasAttributesMask; - } else /* FILE_HARDLINKS */ { - cp->c_attr.ca_firstlink = linkcnid; - } - cp->c_attr.ca_recflags |= kHFSHasLinkChainMask; - } else { - indnodeno = cp->c_attr.ca_linkref; - } - - /* - * Create a catalog entry for the new link (parentID + name). - */ - - bzero(&link_desc, sizeof(link_desc)); - link_desc.cd_nameptr = (const u_int8_t *)cnp->cn_nameptr; - link_desc.cd_namelen = strlen(cnp->cn_nameptr); - link_desc.cd_parentcnid = dcp->c_fileid; - link_desc.cd_flags = S_ISDIR(cp->c_mode) ? CD_ISDIR : 0; - - /* Directory hard links store the first link in an attribute. */ - if (type == DIR_HARDLINKS) { - retval = getfirstlink(hfsmp, cp->c_fileid, &orig_firstlink); - } else /* FILE_HARDLINKS */ { - orig_firstlink = cp->c_attr.ca_firstlink; - } - if (retval == 0) - retval = createindirectlink(hfsmp, indnodeno, &link_desc, - orig_firstlink, &linkcnid, - (cp->c_attr.ca_recflags & kHFSHasLinkChainMask)); - if (retval && newlink) { - int err; - - /* Get rid of new link */ - (void) cat_delete(hfsmp, &cp->c_desc, &cp->c_attr); - - /* Restore the cnode's cnid. */ - cp->c_desc.cd_cnid = orig_cnid; - - /* Put the original file back. 
*/ - err = cat_rename(hfsmp, &to_desc, &dcp->c_desc, &cp->c_desc, NULL); - if (err) { - if (err != EIO && err != ENXIO) - printf("hfs_makelink: error %d from cat_rename backout 2", err); - hfs_mark_inconsistent(hfsmp, HFS_ROLLBACK_FAILED); - } - - cp->c_attr.ca_linkref = 0; - - if (retval != EIO && retval != ENXIO) { - printf("hfs_makelink: createindirectlink (2) failed: %d\n", retval); - retval = EIO; - } - goto out; - } else if (retval == 0) { - - /* Update the original first link to point back to the new first link. */ - if (cp->c_attr.ca_recflags & kHFSHasLinkChainMask) { - (void) cat_update_siblinglinks(hfsmp, orig_firstlink, linkcnid, HFS_IGNORABLE_LINK); - - /* Update the inode's first link value. */ - if (type == DIR_HARDLINKS) { - if (setfirstlink(hfsmp, cp->c_fileid, linkcnid) == 0) - cp->c_attr.ca_recflags |= kHFSHasAttributesMask; - } else { - cp->c_attr.ca_firstlink = linkcnid; - } - } - /* - * Finally, if this is a new hardlink then: - * - update the private system directory - * - mark the cnode as a hard link - */ - if (newlink) { - vnode_t vp; - - hfsmp->hfs_private_attr[type].ca_entries++; - /* From application perspective, directory hard link is a - * normal directory. Therefore count the new directory - * hard link for folder count calculation. - */ - if (type == DIR_HARDLINKS) { - INC_FOLDERCOUNT(hfsmp, hfsmp->hfs_private_attr[type]); - } - retval = cat_update(hfsmp, &hfsmp->hfs_private_desc[type], - &hfsmp->hfs_private_attr[type], NULL, NULL); - if (retval) { - if (retval != EIO && retval != ENXIO) { - printf("hfs_makelink: cat_update of privdir failed! (%d)\n", retval); - retval = EIO; - } - hfs_mark_inconsistent(hfsmp, HFS_OP_INCOMPLETE); - } - cp->c_flag |= C_HARDLINK; - - /* - * Now we need to mark the vnodes as being hardlinks via the vnode_setmultipath call. - * Note that we're calling vnode_get here, which should simply add an iocount if possible, without - * doing much checking. 
It's safe to call this because we are protected by the cnode lock, which - * ensures that anyone trying to reclaim it will block until we release it. vnode_get will usually - * give us an extra iocount, unless the vnode is about to be reclaimed (and has no iocounts). - * In that case, we'd error out, but we'd also not care if we added the VISHARDLINK bit to the vnode. - * - * As for the iocount we're about to add, we can't necessarily always call vnode_put here. - * If the one we add is the only iocount on the vnode, and there was - * sufficient vnode pressure, it could go through VNOP_INACTIVE immediately, which would - * require the cnode lock and cause us to double-lock panic. We can only call vnode_put if we know - * that the vnode we're operating on is the one with which we came into hfs_vnop_link, because - * that means VFS took an iocount on it for us. If it's *not* the one that we came into the call - * with, then mark it as NEED_VNODE_PUT to have hfs_unlock drop it for us. hfs_vnop_link will - * unlock the cnode when it is finished. - */ - if ((vp = cp->c_vp) != NULLVP) { - if (vnode_get(vp) == 0) { - vnode_setmultipath(vp); - if (vp == src_vp) { - /* we have an iocount on data fork vnode already. 
*/ - vnode_put(vp); - } - else { - cp->c_flag |= C_NEED_DVNODE_PUT; - } - } - } - if ((vp = cp->c_rsrc_vp) != NULLVP) { - if (vnode_get(vp) == 0) { - vnode_setmultipath(vp); - if (vp == src_vp) { - vnode_put(vp); - } - else { - cp->c_flag |= C_NEED_RVNODE_PUT; - } - } - } - cp->c_flag |= C_MODIFIED; - cp->c_touch_chgtime = TRUE; - } - } -out: - hfs_systemfile_unlock(hfsmp, lockflags); - - cat_postflight(hfsmp, &cookie, p); - - if (retval == 0 && newlink) { - hfs_volupdate(hfsmp, VOL_MKFILE, 0); - } - return (retval); -} - - -/* - * link vnode operation - * - * IN vnode_t a_vp; - * IN vnode_t a_tdvp; - * IN struct componentname *a_cnp; - * IN vfs_context_t a_context; - */ -int -hfs_vnop_link(struct vnop_link_args *ap) -{ - struct hfsmount *hfsmp; - struct vnode *vp = ap->a_vp; - struct vnode *tdvp = ap->a_tdvp; - struct vnode *fdvp = NULLVP; - struct componentname *cnp = ap->a_cnp; - struct cnode *cp; - struct cnode *tdcp; - struct cnode *fdcp = NULL; - struct cat_desc todesc; - cnid_t parentcnid; - int lockflags = 0; - int intrans = 0; - enum vtype v_type; - int error, ret; - - hfsmp = VTOHFS(vp); - v_type = vnode_vtype(vp); - - /* No hard links in HFS standard file systems. */ - if (hfsmp->hfs_flags & HFS_STANDARD) { - return (ENOTSUP); - } - /* Linking to a special file is not permitted. */ - if (v_type == VBLK || v_type == VCHR) { - return (EPERM); - } - - /* - * For now, return ENOTSUP for a symlink target. This can happen - * for linkat(2) when called without AT_SYMLINK_FOLLOW. - */ - if (v_type == VLNK) - return (ENOTSUP); - - cp = VTOC(vp); - - if (v_type == VDIR) { -#if CONFIG_HFS_DIRLINK - /* Make sure our private directory exists. */ - if (hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid == 0) { - return (EPERM); - } - /* - * Directory hardlinks (ADLs) have only been qualified on - * journaled HFS+. If/when they are tested on non-journaled - * file systems then this test can be removed. 
- */ - if (hfsmp->jnl == NULL) { - return (EPERM); - } - - /* Directory hardlinks also need the parent of the original directory. */ - if ((error = hfs_vget(hfsmp, hfs_currentparent(cp, /* have_lock: */ false), - &fdvp, 1, 0))) { - return (error); - } -#else - /* some platforms don't support directory hardlinks. */ - return EPERM; -#endif - } else { - /* Make sure our private directory exists. */ - if (hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid == 0) { - return (ENOTSUP); - } - } - if (hfs_freeblks(hfsmp, 0) == 0) { - if (fdvp) { - vnode_put(fdvp); - } - return (ENOSPC); - } - - check_for_tracked_file(vp, VTOC(vp)->c_ctime, NAMESPACE_HANDLER_LINK_CREATE, NULL); - - - /* Lock the cnodes. */ - if (fdvp) { - if ((error = hfs_lockfour(VTOC(tdvp), VTOC(vp), VTOC(fdvp), NULL, HFS_EXCLUSIVE_LOCK, NULL))) { - if (fdvp) { - vnode_put(fdvp); - } - return (error); - } - fdcp = VTOC(fdvp); - } else { - if ((error = hfs_lockpair(VTOC(tdvp), VTOC(vp), HFS_EXCLUSIVE_LOCK))) { - return (error); - } - } - tdcp = VTOC(tdvp); - /* grab the parent CNID from originlist after grabbing cnode locks */ - parentcnid = hfs_currentparent(cp, /* have_lock: */ true); - - /* - * Make sure we didn't race the src or dst parent directories with rmdir. - * Note that we should only have a src parent directory cnode lock - * if we're dealing with a directory hardlink here. 
- */ - if (fdcp) { - if (fdcp->c_flag & (C_NOEXISTS | C_DELETED)) { - error = ENOENT; - goto out; - } - } - - if (tdcp->c_flag & (C_NOEXISTS | C_DELETED)) { - error = ENOENT; - goto out; - } - - /* Check the source for errors: - * too many links, immutable, race with unlink - */ - if (cp->c_linkcount >= HFS_LINK_MAX) { - error = EMLINK; - goto out; - } - if (cp->c_bsdflags & (IMMUTABLE | APPEND)) { - error = EPERM; - goto out; - } - if (cp->c_flag & (C_NOEXISTS | C_DELETED)) { - error = ENOENT; - goto out; - } - - tdcp->c_flag |= C_DIR_MODIFICATION; - - if (hfs_start_transaction(hfsmp) != 0) { - error = EINVAL; - goto out; - } - intrans = 1; - - todesc.cd_flags = (v_type == VDIR) ? CD_ISDIR : 0; - todesc.cd_encoding = 0; - todesc.cd_nameptr = (const u_int8_t *)cnp->cn_nameptr; - todesc.cd_namelen = cnp->cn_namelen; - todesc.cd_parentcnid = tdcp->c_fileid; - todesc.cd_hint = 0; - todesc.cd_cnid = 0; - - lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); - - /* If destination exists then we lost a race with create. */ - if (cat_lookup(hfsmp, &todesc, 0, 0, NULL, NULL, NULL, NULL) == 0) { - error = EEXIST; - goto out; - } - if (cp->c_flag & C_HARDLINK) { - struct cat_attr cattr; - - /* If inode is missing then we lost a race with unlink. */ - if ((cat_idlookup(hfsmp, cp->c_fileid, 0, 0, NULL, &cattr, NULL) != 0) || - (cattr.ca_fileid != cp->c_fileid)) { - error = ENOENT; - goto out; - } - } else { - cnid_t fileid; - - /* If source is missing then we lost a race with unlink. */ - if ((cat_lookup(hfsmp, &cp->c_desc, 0, 0, NULL, NULL, NULL, &fileid) != 0) || - (fileid != cp->c_fileid)) { - error = ENOENT; - goto out; - } - } - /* - * All directory links must reside in an non-ARCHIVED hierarchy. 
- */ - if (v_type == VDIR) { - /* - * - Source parent and destination parent cannot match - * - A link is not permitted in the root directory - * - Parent of 'pointed at' directory is not the root directory - * - The 'pointed at' directory (source) is not an ancestor - * of the new directory hard link (destination). - * - No ancestor of the new directory hard link (destination) - * is a directory hard link. - */ - if ((parentcnid == tdcp->c_fileid) || - (tdcp->c_fileid == kHFSRootFolderID) || - (parentcnid == kHFSRootFolderID) || - cat_check_link_ancestry(hfsmp, tdcp->c_fileid, cp->c_fileid)) { - error = EPERM; /* abide by the rules, you did not */ - goto out; - } - } - hfs_systemfile_unlock(hfsmp, lockflags); - lockflags = 0; - - cp->c_linkcount++; - cp->c_flag |= C_MODIFIED; - cp->c_touch_chgtime = TRUE; - error = hfs_makelink(hfsmp, vp, cp, tdcp, cnp); - if (error) { - cp->c_linkcount--; - hfs_volupdate(hfsmp, VOL_UPDATE, 0); - } else { - /* Invalidate negative cache entries in the destination directory */ - if (tdcp->c_flag & C_NEG_ENTRIES) { - cache_purge_negatives(tdvp); - tdcp->c_flag &= ~C_NEG_ENTRIES; - } - - /* Update the target directory and volume stats */ - tdcp->c_entries++; - if (v_type == VDIR) { - INC_FOLDERCOUNT(hfsmp, tdcp->c_attr); - tdcp->c_attr.ca_recflags |= kHFSHasChildLinkMask; - - /* Set kHFSHasChildLinkBit in the destination hierarchy */ - error = cat_set_childlinkbit(hfsmp, tdcp->c_parentcnid); - if (error) { - printf ("hfs_vnop_link: error updating destination parent chain for id=%u, vol=%s\n", tdcp->c_cnid, hfsmp->vcbVN); - error = 0; - } - } - tdcp->c_dirchangecnt++; - tdcp->c_flag |= C_MODIFIED; - hfs_incr_gencount(tdcp); - tdcp->c_touch_chgtime = TRUE; - tdcp->c_touch_modtime = TRUE; - - error = hfs_update(tdvp, 0); - if (error) { - if (error != EIO && error != ENXIO) { - printf("hfs_vnop_link: error %d updating tdvp %p\n", error, tdvp); - error = EIO; - } - hfs_mark_inconsistent(hfsmp, HFS_OP_INCOMPLETE); - } - - if ((v_type == 
VDIR) && - (fdcp != NULL) && - ((fdcp->c_attr.ca_recflags & kHFSHasChildLinkMask) == 0)) { - - fdcp->c_attr.ca_recflags |= kHFSHasChildLinkMask; - fdcp->c_flag |= C_MODIFIED; - fdcp->c_touch_chgtime = TRUE; - error = hfs_update(fdvp, 0); - if (error) { - if (error != EIO && error != ENXIO) { - printf("hfs_vnop_link: error %d updating fdvp %p\n", error, fdvp); - // No point changing error as it's set immediate below - } - hfs_mark_inconsistent(hfsmp, HFS_OP_INCOMPLETE); - } - - /* Set kHFSHasChildLinkBit in the source hierarchy */ - error = cat_set_childlinkbit(hfsmp, fdcp->c_parentcnid); - if (error) { - printf ("hfs_vnop_link: error updating source parent chain for id=%u, vol=%s\n", fdcp->c_cnid, hfsmp->vcbVN); - error = 0; - } - } - hfs_volupdate(hfsmp, VOL_MKFILE, - (tdcp->c_cnid == kHFSRootFolderID)); - } - - if (error == 0 && (ret = hfs_update(vp, 0)) != 0) { - if (ret != EIO && ret != ENXIO) - printf("hfs_vnop_link: error %d updating vp @ %p\n", ret, vp); - hfs_mark_inconsistent(hfsmp, HFS_OP_INCOMPLETE); - } - -out: - if (lockflags) { - hfs_systemfile_unlock(hfsmp, lockflags); - } - if (intrans) { - hfs_end_transaction(hfsmp); - } - - tdcp->c_flag &= ~C_DIR_MODIFICATION; - wakeup((caddr_t)&tdcp->c_flag); - - if (fdcp) { - hfs_unlockfour(tdcp, cp, fdcp, NULL); - } else { - hfs_unlockpair(tdcp, cp); - } - if (fdvp) { - vnode_put(fdvp); - } - return (error); -} - - -/* - * Remove a link to a hardlink file/dir. - * - * Note: dvp and vp cnodes are already locked. 
- */ -int -hfs_unlink(struct hfsmount *hfsmp, struct vnode *dvp, struct vnode *vp, struct componentname *cnp, int skip_reserve) -{ - struct cnode *cp; - struct cnode *dcp; - struct cat_desc cndesc; - struct timeval tv; - char inodename[32]; - cnid_t prevlinkid; - cnid_t nextlinkid; - int lockflags = 0; - int started_tr; - int error; - - if (hfsmp->hfs_flags & HFS_STANDARD) { - return (EPERM); - } - cp = VTOC(vp); - dcp = VTOC(dvp); - - dcp->c_flag |= C_DIR_MODIFICATION; - - /* Remove the entry from the namei cache: */ - cache_purge(vp); - - if ((error = hfs_start_transaction(hfsmp)) != 0) { - started_tr = 0; - goto out; - } - started_tr = 1; - - /* - * Protect against a race with rename by using the component - * name passed in and parent id from dvp (instead of using - * the cp->c_desc which may have changed). - * - * Re-lookup the component name so we get the correct cnid - * for the name (as opposed to the c_cnid in the cnode which - * could have changed before the cnode was locked). - */ - cndesc.cd_flags = vnode_isdir(vp) ? CD_ISDIR : 0; - cndesc.cd_encoding = cp->c_desc.cd_encoding; - cndesc.cd_nameptr = (const u_int8_t *)cnp->cn_nameptr; - cndesc.cd_namelen = cnp->cn_namelen; - cndesc.cd_parentcnid = dcp->c_fileid; - cndesc.cd_hint = dcp->c_childhint; - - lockflags = SFL_CATALOG | SFL_ATTRIBUTE; - if (cndesc.cd_flags & CD_ISDIR) { - /* We'll be removing the alias resource allocation blocks. */ - lockflags |= SFL_BITMAP; - } - lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); - - if ((error = cat_lookuplink(hfsmp, &cndesc, &cndesc.cd_cnid, &prevlinkid, &nextlinkid))) { - goto out; - } - - /* Reserve some space in the catalog file. */ - if (!skip_reserve && (error = cat_preflight(hfsmp, 2 * CAT_DELETE, NULL, 0))) { - goto out; - } - - /* Purge any cached origin entries for a directory or file hard link. 
*/ - hfs_relorigin(cp, dcp->c_fileid); - if (dcp->c_fileid != dcp->c_cnid) { - hfs_relorigin(cp, dcp->c_cnid); - } - - /* Delete the link record. */ - if ((error = cat_deletelink(hfsmp, &cndesc))) { - goto out; - } - - /* Update the parent directory. */ - if (dcp->c_entries > 0) { - dcp->c_entries--; - } - if (cndesc.cd_flags & CD_ISDIR) { - DEC_FOLDERCOUNT(hfsmp, dcp->c_attr); - } - dcp->c_dirchangecnt++; - hfs_incr_gencount(dcp); - microtime(&tv); - dcp->c_touch_chgtime = dcp->c_touch_modtime = true; - dcp->c_flag |= C_MODIFIED; - hfs_update(dcp->c_vp, 0); - - /* - * If this is the last link then we need to process the inode. - * Otherwise we need to fix up the link chain. - */ - --cp->c_linkcount; - if (cp->c_linkcount < 1) { - char delname[32]; - struct cat_desc to_desc; - struct cat_desc from_desc; - - /* - * If a file inode or directory inode is being deleted, rename - * it to an open deleted file. This ensures that deletion - * of inode and its corresponding extended attributes does - * not overflow the journal. This inode will be deleted - * either in hfs_vnop_inactive() or in hfs_remove_orphans(). - * Note: a rename failure here is not fatal. 
- */ - bzero(&from_desc, sizeof(from_desc)); - bzero(&to_desc, sizeof(to_desc)); - if (vnode_isdir(vp)) { - if (cp->c_entries != 0) { - panic("hfs_unlink: dir not empty (id %d, %d entries)", cp->c_fileid, cp->c_entries); - } - MAKE_DIRINODE_NAME(inodename, sizeof(inodename), - cp->c_attr.ca_linkref); - from_desc.cd_parentcnid = hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid; - from_desc.cd_flags = CD_ISDIR; - to_desc.cd_flags = CD_ISDIR; - } else { - MAKE_INODE_NAME(inodename, sizeof(inodename), - cp->c_attr.ca_linkref); - from_desc.cd_parentcnid = hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid; - from_desc.cd_flags = 0; - to_desc.cd_flags = 0; - } - from_desc.cd_nameptr = (const u_int8_t *)inodename; - from_desc.cd_namelen = strlen(inodename); - from_desc.cd_cnid = cp->c_fileid; - - MAKE_DELETED_NAME(delname, sizeof(delname), cp->c_fileid); - to_desc.cd_nameptr = (const u_int8_t *)delname; - to_desc.cd_namelen = strlen(delname); - to_desc.cd_parentcnid = hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid; - to_desc.cd_cnid = cp->c_fileid; - - error = cat_rename(hfsmp, &from_desc, &hfsmp->hfs_private_desc[FILE_HARDLINKS], - &to_desc, (struct cat_desc *)NULL); - if (error == 0) { - cp->c_flag |= C_DELETED; - cp->c_attr.ca_recflags &= ~kHFSHasLinkChainMask; - cp->c_attr.ca_firstlink = 0; - if (vnode_isdir(vp)) { - hfsmp->hfs_private_attr[DIR_HARDLINKS].ca_entries--; - DEC_FOLDERCOUNT(hfsmp, hfsmp->hfs_private_attr[DIR_HARDLINKS]); - - hfsmp->hfs_private_attr[FILE_HARDLINKS].ca_entries++; - INC_FOLDERCOUNT(hfsmp, hfsmp->hfs_private_attr[FILE_HARDLINKS]); - - (void)cat_update(hfsmp, &hfsmp->hfs_private_desc[DIR_HARDLINKS], - &hfsmp->hfs_private_attr[DIR_HARDLINKS], NULL, NULL); - (void)cat_update(hfsmp, &hfsmp->hfs_private_desc[FILE_HARDLINKS], - &hfsmp->hfs_private_attr[FILE_HARDLINKS], NULL, NULL); - } - } else { - error = 0; /* rename failure here is not fatal */ - } - } else /* Still some links left */ { - cnid_t firstlink; - - /* - * Update the start of the link 
chain. - * Note: Directory hard links store the first link in an attribute. - */ - if (vnode_isdir(vp) && - getfirstlink(hfsmp, cp->c_fileid, &firstlink) == 0 && - firstlink == cndesc.cd_cnid) { - if (setfirstlink(hfsmp, cp->c_fileid, nextlinkid) == 0) - cp->c_attr.ca_recflags |= kHFSHasAttributesMask; - } else if (cp->c_attr.ca_firstlink == cndesc.cd_cnid) { - cp->c_attr.ca_firstlink = nextlinkid; - } - /* Update previous link. */ - if (prevlinkid) { - (void) cat_update_siblinglinks(hfsmp, prevlinkid, HFS_IGNORABLE_LINK, nextlinkid); - } - /* Update next link. */ - if (nextlinkid) { - (void) cat_update_siblinglinks(hfsmp, nextlinkid, prevlinkid, HFS_IGNORABLE_LINK); - } - } - - /* - * The call to cat_releasedesc below will only release the name - * buffer; it does not zero out the rest of the fields in the - * 'cat_desc' data structure. - * - * As a result, since there are still other links at this point, - * we need to make the current cnode descriptor point to the raw - * inode. If a path-based system call comes along first, it will - * replace the descriptor with a valid link ID. If a userland - * process already has a file descriptor open, then they will - * bypass that lookup, though. Replacing the descriptor CNID with - * the raw inode will force it to generate a new full path. - */ - cp->c_cnid = cp->c_fileid; - - /* Push new link count to disk. */ - cp->c_ctime = tv.tv_sec; - (void) cat_update(hfsmp, &cp->c_desc, &cp->c_attr, NULL, NULL); - - /* All done with the system files. */ - hfs_systemfile_unlock(hfsmp, lockflags); - lockflags = 0; - - /* Update file system stats. */ - hfs_volupdate(hfsmp, VOL_RMFILE, (dcp->c_cnid == kHFSRootFolderID)); - - /* - * All done with this cnode's descriptor... - * - * Note: all future catalog calls for this cnode may be - * by fileid only. This is OK for HFS (which doesn't have - * file thread records) since HFS doesn't support hard links. 
- */ - cat_releasedesc(&cp->c_desc); - -out: - if (lockflags) { - hfs_systemfile_unlock(hfsmp, lockflags); - } - if (started_tr) { - hfs_end_transaction(hfsmp); - } - - dcp->c_flag &= ~C_DIR_MODIFICATION; - wakeup((caddr_t)&dcp->c_flag); - - return (error); -} - - -/* - * Initialize the HFS+ private system directories. - * - * These directories are used to hold the inodes - * for file and directory hardlinks as well as - * open-unlinked files. - * - * If they don't yet exist they will get created. - * - * This call is assumed to be made during mount. - */ -void -hfs_privatedir_init(struct hfsmount * hfsmp, enum privdirtype type) -{ - struct vnode * dvp = NULLVP; - struct cnode * dcp = NULL; - struct cat_desc *priv_descp; - struct cat_attr *priv_attrp; - struct FndrDirInfo * fndrinfo; - struct timeval tv; - int lockflags; - int trans = 0; - int error; - - if (hfsmp->hfs_flags & HFS_STANDARD) { - return; - } - - priv_descp = &hfsmp->hfs_private_desc[type]; - priv_attrp = &hfsmp->hfs_private_attr[type]; - - /* Check if directory already exists. */ - if (priv_descp->cd_cnid != 0) { - return; - } - - priv_descp->cd_parentcnid = kRootDirID; - priv_descp->cd_nameptr = (const u_int8_t *)hfs_private_names[type]; - priv_descp->cd_namelen = strlen((const char *)priv_descp->cd_nameptr); - priv_descp->cd_flags = CD_ISDIR | CD_DECOMPOSED; - - lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); - error = cat_lookup(hfsmp, priv_descp, 0, 0, NULL, priv_attrp, NULL, NULL); - hfs_systemfile_unlock(hfsmp, lockflags); - - if (error == 0) { - if (type == FILE_HARDLINKS) { - hfsmp->hfs_metadata_createdate = priv_attrp->ca_itime; - } - priv_descp->cd_cnid = priv_attrp->ca_fileid; - goto exit; - } - - /* Directory is missing, if this is read-only then we're done. */ - if (hfsmp->hfs_flags & HFS_READ_ONLY) { - goto exit; - } - - /* Grab the root directory so we can update it later. 
*/ - if (hfs_vget(hfsmp, kRootDirID, &dvp, 0, 0) != 0) { - goto exit; - } - dcp = VTOC(dvp); - - /* Setup the default attributes */ - bzero(priv_attrp, sizeof(struct cat_attr)); - priv_attrp->ca_flags = UF_IMMUTABLE | UF_HIDDEN; - priv_attrp->ca_mode = S_IFDIR; - if (type == DIR_HARDLINKS) { - priv_attrp->ca_mode |= S_ISVTX | S_IRUSR | S_IXUSR | S_IRGRP | - S_IXGRP | S_IROTH | S_IXOTH; - } - priv_attrp->ca_linkcount = 1; - priv_attrp->ca_itime = hfsmp->hfs_itime; - priv_attrp->ca_recflags = kHFSHasFolderCountMask; - - fndrinfo = (struct FndrDirInfo *)&priv_attrp->ca_finderinfo; - fndrinfo->frLocation.v = SWAP_BE16(16384); - fndrinfo->frLocation.h = SWAP_BE16(16384); - fndrinfo->frFlags = SWAP_BE16(kIsInvisible + kNameLocked); - - if (hfs_start_transaction(hfsmp) != 0) { - goto exit; - } - trans = 1; - - /* Need the catalog and EA b-trees for CNID acquisition */ - lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG | SFL_ATTRIBUTE, HFS_EXCLUSIVE_LOCK); - - /* Make sure there's space in the Catalog file. */ - if (cat_preflight(hfsmp, CAT_CREATE, NULL, 0) != 0) { - hfs_systemfile_unlock(hfsmp, lockflags); - goto exit; - } - - /* Get the CNID for use */ - cnid_t new_id; - if ((error = cat_acquire_cnid(hfsmp, &new_id))) { - hfs_systemfile_unlock (hfsmp, lockflags); - goto exit; - } - - /* Create the private directory on disk. 
*/ - error = cat_create(hfsmp, new_id, priv_descp, priv_attrp, NULL); - if (error == 0) { - priv_descp->cd_cnid = priv_attrp->ca_fileid; - - /* Update the parent directory */ - dcp->c_entries++; - INC_FOLDERCOUNT(hfsmp, dcp->c_attr); - dcp->c_dirchangecnt++; - hfs_incr_gencount(dcp); - microtime(&tv); - dcp->c_ctime = tv.tv_sec; - dcp->c_mtime = tv.tv_sec; - (void) cat_update(hfsmp, &dcp->c_desc, &dcp->c_attr, NULL, NULL); - } - - hfs_systemfile_unlock(hfsmp, lockflags); - - if (error) { - goto exit; - } - if (type == FILE_HARDLINKS) { - hfsmp->hfs_metadata_createdate = priv_attrp->ca_itime; - } - hfs_volupdate(hfsmp, VOL_MKDIR, 1); -exit: - if (trans) { - hfs_end_transaction(hfsmp); - } - if (dvp) { - hfs_unlock(dcp); - vnode_put(dvp); - } - if ((error == 0) && (type == DIR_HARDLINKS)) { - hfs_xattr_init(hfsmp); - } -} - - -/* - * Lookup a hardlink link (from chain) - */ -int -hfs_lookup_siblinglinks(struct hfsmount *hfsmp, cnid_t linkfileid, cnid_t *prevlinkid, cnid_t *nextlinkid) -{ - int lockflags; - int error; - - *prevlinkid = 0; - *nextlinkid = 0; - - lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); - - error = cat_lookup_siblinglinks(hfsmp, linkfileid, prevlinkid, nextlinkid); - if (error == ENOLINK) { - hfs_systemfile_unlock(hfsmp, lockflags); - lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE, HFS_SHARED_LOCK); - - error = getfirstlink(hfsmp, linkfileid, nextlinkid); - } - hfs_systemfile_unlock(hfsmp, lockflags); - - return (error); -} - - -/* Find the oldest / last hardlink in the link chain */ -int -hfs_lookup_lastlink (struct hfsmount *hfsmp, cnid_t linkfileid, - cnid_t *lastid, struct cat_desc *cdesc) { - int lockflags; - int error; - - *lastid = 0; - - lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); - - error = cat_lookup_lastlink(hfsmp, linkfileid, lastid, cdesc); - - hfs_systemfile_unlock(hfsmp, lockflags); - - /* - * cat_lookup_lastlink will zero out the lastid/cdesc arguments as needed - * upon error 
cases. - */ - return error; -} - - -/* - * Cache the origin of a directory or file hard link - * - * cnode must be lock on entry - */ -__private_extern__ -void -hfs_savelinkorigin(cnode_t *cp, cnid_t parentcnid) -{ - linkorigin_t *origin = NULL; - thread_t thread = current_thread(); - int count = 0; - int maxorigins = (S_ISDIR(cp->c_mode)) ? MAX_CACHED_ORIGINS : MAX_CACHED_FILE_ORIGINS; - /* - * Look for an existing origin first. If not found, create/steal one. - */ - TAILQ_FOREACH(origin, &cp->c_originlist, lo_link) { - ++count; - if (origin->lo_thread == thread) { - TAILQ_REMOVE(&cp->c_originlist, origin, lo_link); - break; - } - } - if (origin == NULL) { - /* Recycle the last (i.e., the oldest) if we have too many. */ - if (count > maxorigins) { - origin = TAILQ_LAST(&cp->c_originlist, hfs_originhead); - TAILQ_REMOVE(&cp->c_originlist, origin, lo_link); - } else { - MALLOC(origin, linkorigin_t *, sizeof(linkorigin_t), M_TEMP, M_WAITOK); - } - origin->lo_thread = thread; - } - origin->lo_cnid = cp->c_cnid; - origin->lo_parentcnid = parentcnid; - TAILQ_INSERT_HEAD(&cp->c_originlist, origin, lo_link); -} - -/* - * Release any cached origins for a directory or file hard link - * - * cnode must be lock on entry - */ -__private_extern__ -void -hfs_relorigins(struct cnode *cp) -{ - linkorigin_t *origin, *prev; - - TAILQ_FOREACH_SAFE(origin, &cp->c_originlist, lo_link, prev) { - FREE(origin, M_TEMP); - } - TAILQ_INIT(&cp->c_originlist); -} - -/* - * Release a specific origin for a directory or file hard link - * - * cnode must be lock on entry - */ -__private_extern__ -void -hfs_relorigin(struct cnode *cp, cnid_t parentcnid) -{ - linkorigin_t *origin, *prev; - thread_t thread = current_thread(); - - TAILQ_FOREACH_SAFE(origin, &cp->c_originlist, lo_link, prev) { - if (origin->lo_thread == thread) { - TAILQ_REMOVE(&cp->c_originlist, origin, lo_link); - FREE(origin, M_TEMP); - break; - } else if (origin->lo_parentcnid == parentcnid) { - /* - * If the threads don't match, 
then we don't want to - * delete the entry because that might cause other threads - * to fall back and use whatever happens to be in - * c_parentcnid or the wrong link ID. By setting the - * values to zero here, it should serve as an indication - * that the path is no longer valid and that's better than - * using a random parent ID or link ID. - */ - origin->lo_parentcnid = 0; - origin->lo_cnid = 0; - } - } -} - -/* - * Test if a directory or file hard link has a cached origin - * - * cnode must be lock on entry - */ -__private_extern__ -int -hfs_haslinkorigin(cnode_t *cp) -{ - if (cp->c_flag & C_HARDLINK) { - linkorigin_t *origin; - thread_t thread = current_thread(); - - TAILQ_FOREACH(origin, &cp->c_originlist, lo_link) { - if (origin->lo_thread == thread) { - return origin->lo_cnid != 0; - } - } - } - return (0); -} - -/* - * Obtain the current parent cnid of a directory or file hard link - * - * cnode must be lock on entry - */ -__private_extern__ -cnid_t -hfs_currentparent(cnode_t *cp, bool have_lock) -{ - if (cp->c_flag & C_HARDLINK) { - if (!have_lock) - hfs_lock_always(cp, HFS_SHARED_LOCK); - - linkorigin_t *origin; - thread_t thread = current_thread(); - - TAILQ_FOREACH(origin, &cp->c_originlist, lo_link) { - if (origin->lo_thread == thread) { - if (!have_lock) - hfs_unlock(cp); - return (origin->lo_parentcnid); - } - } - - if (!have_lock) - hfs_unlock(cp); - } - return (cp->c_parentcnid); -} - -/* - * Obtain the current cnid of a directory or file hard link - * - * cnode must be lock on entry - */ -__private_extern__ -cnid_t -hfs_currentcnid(cnode_t *cp) -{ - if (cp->c_flag & C_HARDLINK) { - linkorigin_t *origin; - thread_t thread = current_thread(); - - TAILQ_FOREACH(origin, &cp->c_originlist, lo_link) { - if (origin->lo_thread == thread) { - return (origin->lo_cnid); - } - } - } - return (cp->c_cnid); -} - - -/* - * Set the first link attribute for a given file id. - * - * The attributes b-tree must already be locked. 
- * If journaling is enabled, a transaction must already be started. - */ -static int -setfirstlink(struct hfsmount * hfsmp, cnid_t fileid, cnid_t firstlink) -{ - FCB * btfile; - BTreeIterator * iterator; - FSBufferDescriptor btdata; - u_int8_t attrdata[FIRST_LINK_XATTR_REC_SIZE]; - HFSPlusAttrData *dataptr; - int result; - u_int16_t datasize; - - if (hfsmp->hfs_attribute_cp == NULL) { - return (EPERM); - } - MALLOC(iterator, BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK); - bzero(iterator, sizeof(*iterator)); - - result = hfs_buildattrkey(fileid, FIRST_LINK_XATTR_NAME, (HFSPlusAttrKey *)&iterator->key); - if (result) { - goto out; - } - dataptr = (HFSPlusAttrData *)&attrdata[0]; - dataptr->recordType = kHFSPlusAttrInlineData; - dataptr->reserved[0] = 0; - dataptr->reserved[1] = 0; - - /* - * Since attrData is variable length, we calculate the size of - * attrData by subtracting the size of all other members of - * structure HFSPlusAttData from the size of attrdata. - */ - (void)snprintf((char *)&dataptr->attrData[0], - sizeof(dataptr) - (4 * sizeof(uint32_t)), - "%lu", (unsigned long)firstlink); - dataptr->attrSize = 1 + strlen((char *)&dataptr->attrData[0]); - - /* Calculate size of record rounded up to multiple of 2 bytes. */ - datasize = sizeof(HFSPlusAttrData) - 2 + dataptr->attrSize + ((dataptr->attrSize & 1) ? 1 : 0); - - btdata.bufferAddress = dataptr; - btdata.itemSize = datasize; - btdata.itemCount = 1; - - btfile = hfsmp->hfs_attribute_cp->c_datafork; - - /* Insert the attribute. */ - result = BTInsertRecord(btfile, iterator, &btdata, datasize); - if (result == btExists) { - result = BTReplaceRecord(btfile, iterator, &btdata, datasize); - } - (void) BTFlushPath(btfile); -out: - FREE(iterator, M_TEMP); - - return MacToVFSError(result); -} - -/* - * Get the first link attribute for a given file id. - * - * The attributes b-tree must already be locked. 
- */ -static int -getfirstlink(struct hfsmount * hfsmp, cnid_t fileid, cnid_t *firstlink) -{ - FCB * btfile; - BTreeIterator * iterator; - FSBufferDescriptor btdata; - u_int8_t attrdata[FIRST_LINK_XATTR_REC_SIZE]; - HFSPlusAttrData *dataptr; - int result; - u_int16_t datasize; - - if (hfsmp->hfs_attribute_cp == NULL) { - return (EPERM); - } - MALLOC(iterator, BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK); - bzero(iterator, sizeof(*iterator)); - - result = hfs_buildattrkey(fileid, FIRST_LINK_XATTR_NAME, (HFSPlusAttrKey *)&iterator->key); - if (result) - goto out; - - dataptr = (HFSPlusAttrData *)&attrdata[0]; - datasize = sizeof(attrdata); - - btdata.bufferAddress = dataptr; - btdata.itemSize = sizeof(attrdata); - btdata.itemCount = 1; - - btfile = hfsmp->hfs_attribute_cp->c_datafork; - - result = BTSearchRecord(btfile, iterator, &btdata, NULL, NULL); - if (result) - goto out; - - if (dataptr->attrSize < 3) { - result = ENOENT; - goto out; - } - *firstlink = strtoul((char*)&dataptr->attrData[0], NULL, 10); -out: - FREE(iterator, M_TEMP); - - return MacToVFSError(result); -} - -errno_t hfs_first_link(hfsmount_t *hfsmp, cnode_t *cp, cnid_t *link_id) -{ - errno_t error = 0; - - if (S_ISDIR(cp->c_mode)) { - int lockf = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE, HFS_SHARED_LOCK); - - error = getfirstlink(hfsmp, cp->c_fileid, link_id); - - hfs_systemfile_unlock(hfsmp, lockf); - } else { - if (cp->c_attr.ca_firstlink) - *link_id = cp->c_attr.ca_firstlink; - else { - // This can happen if the cnode has been deleted - error = ENOENT; - } - } - - return error; -} diff --git a/bsd/hfs/hfs_lookup.c b/bsd/hfs/hfs_lookup.c deleted file mode 100644 index c46bce7c7..000000000 --- a/bsd/hfs/hfs_lookup.c +++ /dev/null @@ -1,698 +0,0 @@ -/* - * Copyright (c) 1999-2015 Apple Inc. All rights reserved. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * Copyright (c) 1989, 1993 - * The Regents of the University of California. All rights reserved. - * (c) UNIX System Laboratories, Inc. - * All or some portions of this file are derived from material licensed - * to the University of California by American Telephone and Telegraph - * Co. or Unix System Laboratories, Inc. and are reproduced herein with - * the permission of UNIX System Laboratories, Inc. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. 
- * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)hfs_lookup.c 1.0 - * derived from @(#)ufs_lookup.c 8.15 (Berkeley) 6/16/95 - * - * (c) 1998-1999 Apple Computer, Inc. All Rights Reserved - * (c) 1990, 1992 NeXT Computer, Inc. 
All Rights Reserved - * - * - * hfs_lookup.c -- code to handle directory traversal on HFS/HFS+ volume - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "hfs.h" -#include "hfs_catalog.h" -#include "hfs_cnode.h" - - -/* - * FROM FREEBSD 3.1 - * Convert a component of a pathname into a pointer to a locked cnode. - * This is a very central and rather complicated routine. - * If the file system is not maintained in a strict tree hierarchy, - * this can result in a deadlock situation (see comments in code below). - * - * The cnp->cn_nameiop argument is LOOKUP, CREATE, RENAME, or DELETE depending - * on whether the name is to be looked up, created, renamed, or deleted. - * When CREATE, RENAME, or DELETE is specified, information usable in - * creating, renaming, or deleting a directory entry may be calculated. - * Notice that these are the only operations that can affect the directory of the target. - * - * LOCKPARENT and WANTPARENT actually refer to the parent of the last item, - * so if ISLASTCN is not set, they should be ignored. Also they are mutually exclusive, or - * WANTPARENT really implies DONTLOCKPARENT. Either of them set means that the calling - * routine wants to access the parent of the target, locked or unlocked. - * - * Keeping the parent locked as long as possible protects from other processes - * looking up the same item, so it has to be locked until the cnode is totally finished - * - * hfs_cache_lookup() performs the following for us: - * check that it is a directory - * check accessibility of directory - * check for modification attempts on read-only mounts - * if name found in cache - * if at end of path and deleting or creating - * drop it - * else - * return name. - * return hfs_lookup() - * - * Overall outline of hfs_lookup: - * - * handle simple cases of . and .. 
- * search for name in directory, to found or notfound - * notfound: - * if creating, return locked directory, leaving info on available slots - * else return error - * found: - * if at end of path and deleting, return information to allow delete - * if at end of path and rewriting (RENAME and LOCKPARENT), lock target - * cnode and return info to allow rewrite - * if not at end, add name to cache; if at end and neither creating - * nor deleting, add name to cache - */ - - -/* - * Lookup *cnp in directory *dvp, return it in *vpp. - * **vpp is held on exit. - * We create a cnode for the file, but we do NOT open the file here. - -#% lookup dvp L ? ? -#% lookup vpp - L - - - IN struct vnode *dvp - Parent node of file; - INOUT struct vnode **vpp - node of target file, its a new node if - the target vnode did not exist; - IN struct componentname *cnp - Name of file; - - * When should we lock parent_hp in here ?? - */ -static int -hfs_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, int *cnode_locked, int force_casesensitive_lookup) -{ - struct cnode *dcp; /* cnode for directory being searched */ - struct vnode *tvp; /* target vnode */ - struct hfsmount *hfsmp; - int flags; - int nameiop; - int retval = 0; - int isDot; - struct cat_desc desc; - struct cat_desc cndesc; - struct cat_attr attr; - struct cat_fork fork; - int lockflags; - int newvnode_flags; - - retry: - newvnode_flags = 0; - dcp = NULL; - hfsmp = VTOHFS(dvp); - *vpp = NULL; - *cnode_locked = 0; - isDot = FALSE; - tvp = NULL; - nameiop = cnp->cn_nameiop; - flags = cnp->cn_flags; - bzero(&desc, sizeof(desc)); - - /* - * First check to see if it is a . or .., else look it up. - */ - if (flags & ISDOTDOT) { /* Wanting the parent */ - cnp->cn_flags &= ~MAKEENTRY; - goto found; /* .. 
is always defined */ - } else if ((cnp->cn_nameptr[0] == '.') && (cnp->cn_namelen == 1)) { - isDot = TRUE; - cnp->cn_flags &= ~MAKEENTRY; - goto found; /* We always know who we are */ - } else { - if (hfs_lock(VTOC(dvp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT) != 0) { - retval = ENOENT; /* The parent no longer exists ? */ - goto exit; - } - dcp = VTOC(dvp); - - if (dcp->c_flag & C_DIR_MODIFICATION) { - // XXXdbg - if we could msleep on a lck_rw_t then we would do that - // but since we can't we have to unlock, delay for a bit - // and then retry... - // msleep((caddr_t)&dcp->c_flag, &dcp->c_rwlock, PINOD, "hfs_vnop_lookup", 0); - hfs_unlock(dcp); - tsleep((caddr_t)dvp, PRIBIO, "hfs_lookup", 1); - - goto retry; - } - - - /* - * We shouldn't need to go to the catalog if there are no children. - * However, in the face of a minor disk corruption where the valence of - * the directory is off, we could infinite loop here if we return ENOENT - * even though there are actually items in the directory. (create will - * see the ENOENT, try to create something, which will return with - * EEXIST over and over again). As a result, always check the catalog. - */ - - bzero(&cndesc, sizeof(cndesc)); - cndesc.cd_nameptr = (const u_int8_t *)cnp->cn_nameptr; - cndesc.cd_namelen = cnp->cn_namelen; - cndesc.cd_parentcnid = dcp->c_fileid; - cndesc.cd_hint = dcp->c_childhint; - - lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); - - retval = cat_lookup(hfsmp, &cndesc, 0, force_casesensitive_lookup, &desc, &attr, &fork, NULL); - - hfs_systemfile_unlock(hfsmp, lockflags); - - if (retval == 0) { - dcp->c_childhint = desc.cd_hint; - /* - * Note: We must drop the parent lock here before calling - * hfs_getnewvnode (which takes the child lock). - */ - hfs_unlock(dcp); - dcp = NULL; - - /* Verify that the item just looked up isn't one of the hidden directories. 
*/ - if (desc.cd_cnid == hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid || - desc.cd_cnid == hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid) { - retval = ENOENT; - goto exit; - } - - goto found; - } - - /* - * ENAMETOOLONG supersedes other errors - * - * For a CREATE or RENAME operation on the last component - * the ENAMETOOLONG will be handled in the next VNOP. - */ - if ((retval != ENAMETOOLONG) && - (cnp->cn_namelen > kHFSPlusMaxFileNameChars) && - (((flags & ISLASTCN) == 0) || ((nameiop != CREATE) && (nameiop != RENAME)))) { - retval = ENAMETOOLONG; - } else if (retval == 0) { - retval = ENOENT; - } else if (retval == ERESERVEDNAME) { - /* - * We found the name in the catalog, but it is unavailable - * to us. The exact error to return to our caller depends - * on the operation, and whether we've already reached the - * last path component. In all cases, avoid a negative - * cache entry, since someone else may be able to access - * the name if their lookup is configured differently. - */ - - cnp->cn_flags &= ~MAKEENTRY; - - if (((flags & ISLASTCN) == 0) || ((nameiop == LOOKUP) || (nameiop == DELETE))) { - /* A reserved name for a pure lookup is the same as the path not being present */ - retval = ENOENT; - } else { - /* A reserved name with intent to create must be rejected as impossible */ - retval = EEXIST; - } - } - if (retval != ENOENT) - goto exit; - /* - * This is a non-existing entry - * - * If creating, and at end of pathname and current - * directory has not been removed, then can consider - * allowing file to be created. - */ - if ((nameiop == CREATE || nameiop == RENAME) && - (flags & ISLASTCN) && - !(ISSET(dcp->c_flag, C_DELETED | C_NOEXISTS))) { - retval = EJUSTRETURN; - goto exit; - } - /* - * Insert name into the name cache (as non-existent). 
- */ - if ((hfsmp->hfs_flags & HFS_STANDARD) == 0 && - (cnp->cn_flags & MAKEENTRY) && - (nameiop != CREATE)) { - cache_enter(dvp, NULL, cnp); - dcp->c_flag |= C_NEG_ENTRIES; - } - goto exit; - } - -found: - if (flags & ISLASTCN) { - switch(nameiop) { - case DELETE: - cnp->cn_flags &= ~MAKEENTRY; - break; - - case RENAME: - cnp->cn_flags &= ~MAKEENTRY; - if (isDot) { - retval = EISDIR; - goto exit; - } - break; - } - } - - if (isDot) { - if ((retval = vnode_get(dvp))) - goto exit; - *vpp = dvp; - } else if (flags & ISDOTDOT) { - /* - * Directory hard links can have multiple parents so - * find the appropriate parent for the current thread. - */ - if ((retval = hfs_vget(hfsmp, hfs_currentparent(VTOC(dvp), - /* have_lock: */ false), &tvp, 0, 0))) { - goto exit; - } - *cnode_locked = 1; - *vpp = tvp; - } else { - int type = (attr.ca_mode & S_IFMT); - - if (!(flags & ISLASTCN) && (type != S_IFDIR) && (type != S_IFLNK)) { - retval = ENOTDIR; - goto exit; - } - /* Don't cache directory hardlink names. */ - if (attr.ca_recflags & kHFSHasLinkChainMask) { - cnp->cn_flags &= ~MAKEENTRY; - } - /* Names with composed chars are not cached. */ - if (cnp->cn_namelen != desc.cd_namelen) - cnp->cn_flags &= ~MAKEENTRY; - - retval = hfs_getnewvnode(hfsmp, dvp, cnp, &desc, 0, &attr, &fork, &tvp, &newvnode_flags); - - if (retval) { - /* - * If this was a create/rename operation lookup, then by this point - * we expected to see the item returned from hfs_getnewvnode above. - * In the create case, it would probably eventually bubble out an EEXIST - * because the item existed when we were trying to create it. In the - * rename case, it would let us know that we need to go ahead and - * delete it as part of the rename. However, if we hit the condition below - * then it means that we found the element during cat_lookup above, but - * it is now no longer there. We simply behave as though we never found - * the element at all and return EJUSTRETURN. 
- */ - if ((retval == ENOENT) && - ((cnp->cn_nameiop == CREATE) || (cnp->cn_nameiop == RENAME)) && - (flags & ISLASTCN)) { - retval = EJUSTRETURN; - } - - /* - * If this was a straight lookup operation, we may need to redrive the entire - * lookup starting from cat_lookup if the element was deleted as the result of - * a rename operation. Since rename is supposed to guarantee atomicity, then - * lookups cannot fail because the underlying element is deleted as a result of - * the rename call -- either they returned the looked up element prior to rename - * or return the newer element. If we are in this region, then all we can do is add - * workarounds to guarantee the latter case. The element has already been deleted, so - * we just re-try the lookup to ensure the caller gets the most recent element. - */ - if ((retval == ENOENT) && (cnp->cn_nameiop == LOOKUP) && - (newvnode_flags & (GNV_CHASH_RENAMED | GNV_CAT_DELETED))) { - if (dcp) { - hfs_unlock (dcp); - } - /* get rid of any name buffers that may have lingered from the cat_lookup call */ - cat_releasedesc (&desc); - goto retry; - } - - /* Also, re-drive the lookup if the item we looked up was a hardlink, and the number - * or name of hardlinks has changed in the interim between the cat_lookup above, and - * our call to hfs_getnewvnode. hfs_getnewvnode will validate the cattr we passed it - * against what is actually in the catalog after the cnode is created. If there were - * any issues, it will bubble out ERECYCLE, which we need to swallow and use as the - * key to redrive as well. We need to special case this below because in this case, - * it needs to occur regardless of the type of lookup we're doing here. 
- */ - if ((retval == ERECYCLE) && (newvnode_flags & GNV_CAT_ATTRCHANGED)) { - if (dcp) { - hfs_unlock (dcp); - } - /* get rid of any name buffers that may have lingered from the cat_lookup call */ - cat_releasedesc (&desc); - retval = 0; - goto retry; - } - - /* skip to the error-handling code if we can't retry */ - goto exit; - } - - /* - * Save the origin info for file and directory hardlinks. Directory hardlinks - * need the origin for '..' lookups, and file hardlinks need it to ensure that - * competing lookups do not cause us to vend different hardlinks than the ones requested. - */ - if (ISSET(VTOC(tvp)->c_flag, C_HARDLINK)) - hfs_savelinkorigin(VTOC(tvp), VTOC(dvp)->c_fileid); - *cnode_locked = 1; - *vpp = tvp; - } -exit: - if (dcp) { - hfs_unlock(dcp); - } - cat_releasedesc(&desc); - return (retval); -} - - - -/* - * Name caching works as follows: - * - * Names found by directory scans are retained in a cache - * for future reference. It is managed LRU, so frequently - * used names will hang around. Cache is indexed by hash value - * obtained from (vp, name) where vp refers to the directory - * containing name. - * - * If it is a "negative" entry, (i.e. for a name that is known NOT to - * exist) the vnode pointer will be NULL. - * - * Upon reaching the last segment of a path, if the reference - * is for DELETE, or NOCACHE is set (rewrite), and the - * name is located in the cache, it will be dropped. 
- * - */ - -#define S_IXALL 0000111 - -int -hfs_vnop_lookup(struct vnop_lookup_args *ap) -{ - struct vnode *dvp = ap->a_dvp; - struct vnode *vp; - struct cnode *cp; - struct cnode *dcp; - struct hfsmount *hfsmp; - int error; - struct vnode **vpp = ap->a_vpp; - struct componentname *cnp = ap->a_cnp; - struct proc *p = vfs_context_proc(ap->a_context); - int flags = cnp->cn_flags; - int force_casesensitive_lookup = proc_is_forcing_hfs_case_sensitivity(p); - int cnode_locked; - int fastdev_candidate = 0; - int auto_candidate = 0; - - *vpp = NULL; - dcp = VTOC(dvp); - hfsmp = VTOHFS(dvp); - - if ((hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) && (vnode_isfastdevicecandidate(dvp) || (dcp->c_attr.ca_recflags & kHFSFastDevCandidateMask)) ){ - fastdev_candidate = 1; - auto_candidate = (vnode_isautocandidate(dvp) || (dcp->c_attr.ca_recflags & kHFSAutoCandidateMask)); - } - - - /* - * Lookup an entry in the cache - * - * If the lookup succeeds, the vnode is returned in *vpp, - * and a status of -1 is returned. - * - * If the lookup determines that the name does not exist - * (negative cacheing), a status of ENOENT is returned. - * - * If the lookup fails, a status of zero is returned. 
- */ - error = cache_lookup(dvp, vpp, cnp); - if (error != -1) { - if ((error == ENOENT) && (cnp->cn_nameiop != CREATE)) - goto exit; /* found a negative cache entry */ - goto lookup; /* did not find it in the cache */ - } - /* - * We have a name that matched - * cache_lookup returns the vp with an iocount reference already taken - */ - error = 0; - vp = *vpp; - cp = VTOC(vp); - - /* We aren't allowed to vend out vp's via lookup to the hidden directory */ - if (cp->c_cnid == hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid || - cp->c_cnid == hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid) { - /* Drop the iocount from cache_lookup */ - vnode_put (vp); - error = ENOENT; - goto exit; - } - - if (cp->c_attr.ca_recflags & kHFSDoNotFastDevPinMask) { - fastdev_candidate = 0; - } - - /* - * If this is a hard-link vnode then we need to update - * the name (of the link), the parent ID, the cnid, the - * text encoding and the catalog hint. This enables - * getattrlist calls to return the correct link info. - */ - - /* - * Alternatively, if we are forcing a case-sensitive lookup - * on a case-insensitive volume, the namecache entry - * may have been for an incorrect case. Since we cannot - * determine case vs. normalization, redrive the catalog - * lookup based on any byte mismatch. - */ - if (((flags & ISLASTCN) && (cp->c_flag & C_HARDLINK)) - || (force_casesensitive_lookup && !(hfsmp->hfs_flags & HFS_CASE_SENSITIVE))) { - int stale_link = 0; - - hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); - if ((cp->c_parentcnid != dcp->c_cnid) || - (cnp->cn_namelen != cp->c_desc.cd_namelen) || - (bcmp(cnp->cn_nameptr, cp->c_desc.cd_nameptr, cp->c_desc.cd_namelen) != 0)) { - struct cat_desc desc; - struct cat_attr lookup_attr; - int lockflags; - - if (force_casesensitive_lookup && !(hfsmp->hfs_flags & HFS_CASE_SENSITIVE)) { - /* - * Since the name in the cnode doesn't match our lookup - * string exactly, do a full lookup. 
- */ - hfs_unlock (cp); - - vnode_put(vp); - goto lookup; - } - - /* - * Get an updated descriptor - */ - desc.cd_nameptr = (const u_int8_t *)cnp->cn_nameptr; - desc.cd_namelen = cnp->cn_namelen; - desc.cd_parentcnid = dcp->c_fileid; - desc.cd_hint = dcp->c_childhint; - desc.cd_encoding = 0; - desc.cd_cnid = 0; - desc.cd_flags = S_ISDIR(cp->c_mode) ? CD_ISDIR : 0; - - /* - * Because lookups call replace_desc to put a new descriptor in - * the cnode we are modifying it is possible that this cnode's - * descriptor is out of date for the parent ID / name that - * we are trying to look up. (It may point to a different hardlink). - * - * We need to be cautious that when re-supplying the - * descriptor below that the results of the catalog lookup - * still point to the same raw inode for the hardlink. This would - * not be the case if we found something in the cache above but - * the vnode it returned no longer has a valid hardlink for the - * parent ID/filename combo we are requesting. (This is because - * hfs_unlink does not directly trigger namecache removal). - * - * As a result, before vending out the vnode (and replacing - * its descriptor) verify that the fileID is the same by comparing - * the in-cnode attributes vs. the one returned from the lookup call - * below. If they do not match, treat this lookup as if we never hit - * in the cache at all. - */ - - lockflags = hfs_systemfile_lock(VTOHFS(dvp), SFL_CATALOG, HFS_SHARED_LOCK); - - error = cat_lookup(VTOHFS(vp), &desc, 0, 0, &desc, &lookup_attr, NULL, NULL); - - hfs_systemfile_unlock(VTOHFS(dvp), lockflags); - - /* - * Note that cat_lookup may fail to find something with the name provided in the - * stack-based descriptor above. In that case, an ENOENT is a legitimate errno - * to be placed in error, which will get returned in the fastpath below. - */ - if (error == 0) { - if (lookup_attr.ca_fileid == cp->c_attr.ca_fileid) { - /* It still points to the right raw inode. 
Replacing the descriptor is fine */ - replace_desc (cp, &desc); - - /* - * Save the origin info for file and directory hardlinks. Directory hardlinks - * need the origin for '..' lookups, and file hardlinks need it to ensure that - * competing lookups do not cause us to vend different hardlinks than the ones requested. - */ - hfs_savelinkorigin(cp, dcp->c_fileid); - } - else { - /* If the fileID does not match then do NOT replace the descriptor! */ - stale_link = 1; - } - } - } - hfs_unlock (cp); - - if (stale_link) { - /* - * If we had a stale_link, then we need to pretend as though - * we never found this vnode and force a lookup through the - * traditional path. Drop the iocount acquired through - * cache_lookup above and force a cat lookup / getnewvnode - */ - vnode_put(vp); - goto lookup; - } - - if (error) { - /* - * If the cat_lookup failed then the caller will not expect - * a vnode with an iocount on it. - */ - vnode_put(vp); - } - - } - goto exit; - -lookup: - /* - * The vnode was not in the name cache or it was stale. - * - * So we need to do a real lookup. - */ - cnode_locked = 0; - - error = hfs_lookup(dvp, vpp, cnp, &cnode_locked, force_casesensitive_lookup); - - if (*vpp && (VTOC(*vpp)->c_attr.ca_recflags & kHFSDoNotFastDevPinMask)) { - fastdev_candidate = 0; - } - - if (*vpp && (VTOC(*vpp)->c_attr.ca_recflags & kHFSAutoCandidateMask)) { - //printf("vp %s / %d is an auto-candidate\n", (*vpp)->v_name ? (*vpp)->v_name : "no-name", VTOC(*vpp)->c_fileid); - auto_candidate = 1; - } - - if (cnode_locked) - hfs_unlock(VTOC(*vpp)); -exit: - if (*vpp && fastdev_candidate && (*vpp)->v_parent == dvp && !(vnode_isfastdevicecandidate(*vpp))) { - vnode_setfastdevicecandidate(*vpp); - if (auto_candidate) { - vnode_setautocandidate(*vpp); - } - } - - { - uthread_t ut = (struct uthread *)get_bsdthread_info(current_thread()); - - /* - * check to see if we issued any I/O while completing this lookup and - * this thread/task is throttleable... 
if so, throttle now - * - * this allows us to throttle in between multiple meta data reads that - * might result due to looking up a long pathname (since we'll have to - * re-enter hfs_vnop_lookup for each component of the pathnam not in - * the VFS cache), instead of waiting until the entire path lookup has - * completed and throttling at the systemcall return - */ - if (__improbable(ut->uu_lowpri_window)) { - throttle_lowpri_io(1); - } - } - - return (error); -} - - diff --git a/bsd/hfs/hfs_macos_defs.h b/bsd/hfs/hfs_macos_defs.h deleted file mode 100644 index b4a303b88..000000000 --- a/bsd/hfs/hfs_macos_defs.h +++ /dev/null @@ -1,303 +0,0 @@ -/* - * Copyright (c) 2000-2014 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#ifndef __HFS_MACOS_TYPES__ -#define __HFS_MACOS_TYPES__ - -#include - -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE - -#include - -#include -#include -#include -#include -#include -#include - - -#define TARGET_OS_MAC 0 -#define TARGET_OS_WIN32 0 -#define TARGET_OS_UNIX 0 - -#define PRAGMA_IMPORT 0 -#define PRAGMA_STRUCT_ALIGN 1 -#define PRAGMA_ONCE 0 -#define PRAGMA_STRUCT_PACK 0 -#define PRAGMA_STRUCT_PACKPUSH 0 - -#if __GNUC__ >= 2 - #define TYPE_LONGLONG 1 -#else - #define TYPE_LONGLONG 0 -#endif -#ifdef __cplusplus - #define TYPE_BOOL 1 -#else - #define TYPE_BOOL 0 -#endif - -#define EXTERN_API(_type) extern _type -#define EXTERN_API_C(_type) extern _type - -#define CALLBACK_API_C(_type, _name) _type ( * _name) - -#define TARGET_API_MACOS_X 1 -#define TARGET_API_MAC_OS8 0 -#define TARGET_API_MAC_CARBON 0 - - - -/****** START OF MACOSTYPES *********/ - - -/* - 4.4BSD's sys/types.h defines size_t without defining __size_t__: - Things are a lot clearer from here on if we define __size_t__ now. 
- */ -#define __size_t__ - -/* - Convert kernel's diagnostic flag to MacOS's -*/ -#if HFS_DIAGNOSTIC - #define DEBUG_BUILD 1 -#else - #define DEBUG_BUILD 0 -#endif /* DIAGNOSTIC */ - -/******************************************************************************** - - Special values in C - - NULL The C standard for an impossible pointer value - nil A carry over from pascal, NULL is prefered for C - -*********************************************************************************/ -#ifndef NULL - #define NULL 0 -#endif - -#ifndef nil - #define nil NULL -#endif - -typedef char * Ptr; -typedef long Size; - -typedef int16_t OSErr; -typedef u_int32_t ItemCount; -typedef u_int32_t ByteCount; -typedef u_int8_t * BytePtr; -typedef u_int32_t ByteOffset; - -typedef u_int16_t UniChar; -typedef unsigned char Str255[256]; -typedef unsigned char Str31[32]; -typedef unsigned char * StringPtr; -typedef const unsigned char * ConstStr255Param; -typedef const unsigned char * ConstStr31Param; -typedef const unsigned char * ConstUTF8Param; - -typedef u_int8_t Byte; - -typedef u_int32_t TextEncoding; -typedef UniChar * UniCharArrayPtr; -typedef const UniChar * ConstUniCharArrayPtr; - - -/******************************************************************************** - - Boolean types and values - - Boolean A one byte value, holds "false" (0) or "true" (1) - false The Boolean value of zero (0) - true The Boolean value of one (1) - -*********************************************************************************/ -/* - The identifiers "true" and "false" are becoming keywords in C++ - and work with the new built-in type "bool" - "Boolean" will remain an unsigned char for compatibility with source - code written before "bool" existed. 
-*/ -#if !TYPE_BOOL && !__bool_true_false_are_defined - -enum { - false = 0, - true = 1 -}; - -#endif /* !TYPE_BOOL */ - - -EXTERN_API( void ) DebugStr(const char * debuggerMsg); - -/********************************************************************************* - - Added types for HFSPlus MacOS X functionality. Needs to be incorporated to - other places - -*********************************************************************************/ - -typedef struct vnode* FileReference; - - -/***** START OF MACOSSTUBS ********/ - - -/* - SizeTDef.h -- Common definitions - - size_t - this type is defined by several ANSI headers. -*/ -#if ! defined (__size_t__) - #define __size_t__ - #if defined (__xlc) || defined (__xlC) || defined (__xlC__) || defined (__MWERKS__) - typedef unsigned long size_t; - #else /* __xlC */ - typedef unsigned int size_t; - #endif /* __xlC */ -#endif /* __size_t__ */ - - -/* - File: Errors.h - -*/ -enum { - noErr = 0, - dskFulErr = -34, /*disk full*/ - bdNamErr = -37, /*there may be no bad names in the final system!*/ - paramErr = -50, /*error in user parameter list*/ - memFullErr = -108, /*Not enough room in heap zone*/ - fileBoundsErr = -1309, /*file's EOF, offset, mark or size is too big*/ - kTECUsedFallbacksStatus = -8783, - -}; - - -enum { - /* Finder Flags */ - kHasBeenInited = 0x0100, - kHasCustomIcon = 0x0400, - kIsStationery = 0x0800, - kNameLocked = 0x1000, - kHasBundle = 0x2000, - kIsInvisible = 0x4000, - kIsAlias = 0x8000 -}; - -enum { - fsRtParID = 1, - fsRtDirID = 2 -}; - - -enum { - /* Mac OS encodings*/ - kTextEncodingMacRoman = 0L, - kTextEncodingMacJapanese = 1, - kTextEncodingMacChineseTrad = 2, - kTextEncodingMacKorean = 3, - kTextEncodingMacArabic = 4, - kTextEncodingMacHebrew = 5, - kTextEncodingMacGreek = 6, - kTextEncodingMacCyrillic = 7, - kTextEncodingMacDevanagari = 9, - kTextEncodingMacGurmukhi = 10, - kTextEncodingMacGujarati = 11, - kTextEncodingMacOriya = 12, - kTextEncodingMacBengali = 13, - kTextEncodingMacTamil = 
14, - kTextEncodingMacTelugu = 15, - kTextEncodingMacKannada = 16, - kTextEncodingMacMalayalam = 17, - kTextEncodingMacSinhalese = 18, - kTextEncodingMacBurmese = 19, - kTextEncodingMacKhmer = 20, - kTextEncodingMacThai = 21, - kTextEncodingMacLaotian = 22, - kTextEncodingMacGeorgian = 23, - kTextEncodingMacArmenian = 24, - kTextEncodingMacChineseSimp = 25, - kTextEncodingMacTibetan = 26, - kTextEncodingMacMongolian = 27, - kTextEncodingMacEthiopic = 28, - kTextEncodingMacCentralEurRoman = 29, - kTextEncodingMacVietnamese = 30, - kTextEncodingMacExtArabic = 31, /* The following use script code 0, smRoman*/ - kTextEncodingMacSymbol = 33, - kTextEncodingMacDingbats = 34, - kTextEncodingMacTurkish = 35, - kTextEncodingMacCroatian = 36, - kTextEncodingMacIcelandic = 37, - kTextEncodingMacRomanian = 38, - kTextEncodingMacUnicode = 0x7E, - - kTextEncodingMacFarsi = 0x8C, /* Like MacArabic but uses Farsi digits */ /* The following use script code 7, smCyrillic */ - kTextEncodingMacUkrainian = 0x98, /* The following use script code 32, smUnimplemented */ -}; - - -/* PROTOTYPES */ - -#if HFS_DIAGNOSTIC - extern void RequireFileLock(FileReference vp, int shareable); - #define REQUIRE_FILE_LOCK(vp,s) RequireFileLock((vp),(s)) -#else - #define REQUIRE_FILE_LOCK(vp,s) -#endif - - -EXTERN_API( void ) -BlockMoveData(const void * srcPtr, void * destPtr, Size byteCount); - -#define BlockMoveData(src, dest, len) bcopy((src), (dest), (len)) - -EXTERN_API_C( void ) -ClearMemory(void * start, u_int32_t length); - -#define ClearMemory(start, length) bzero((start), (size_t)(length)); - - -EXTERN_API( Ptr ) -NewPtr(Size byteCount); - -EXTERN_API( Ptr ) -NewPtrSysClear(Size byteCount); - -EXTERN_API( void ) -DisposePtr(Ptr p); - -#endif /* __APPLE_API_PRIVATE */ -#endif /* KERNEL */ -#endif /* __HFS_MACOS_TYPES__ */ diff --git a/bsd/hfs/hfs_mount.h b/bsd/hfs/hfs_mount.h deleted file mode 100644 index ca4f8703f..000000000 --- a/bsd/hfs/hfs_mount.h +++ /dev/null @@ -1,86 +0,0 @@ -/* - * 
Copyright (c) 2000-2007 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * Copyright (c) 1997-2002 Apple Computer, Inc. 
All Rights Reserved - * - */ - -#ifndef _HFS_MOUNT_H_ -#define _HFS_MOUNT_H_ - -#include - -#include -#include - -/* - * Arguments to mount HFS-based filesystems - */ - -#define OVERRIDE_UNKNOWN_PERMISSIONS 0 - -#define UNKNOWNUID ((uid_t)99) -#define UNKNOWNGID ((gid_t)99) -#define UNKNOWNPERMISSIONS (S_IRWXU | S_IROTH | S_IXOTH) /* 705 */ - -#ifdef __APPLE_API_UNSTABLE -struct hfs_mount_args { -#ifndef KERNEL - char *fspec; /* block special device to mount */ -#endif - uid_t hfs_uid; /* uid that owns hfs files (standard HFS only) */ - gid_t hfs_gid; /* gid that owns hfs files (standard HFS only) */ - mode_t hfs_mask; /* mask to be applied for hfs perms (standard HFS only) */ - u_int32_t hfs_encoding; /* encoding for this volume (standard HFS only) */ - struct timezone hfs_timezone; /* user time zone info (standard HFS only) */ - int flags; /* mounting flags, see below */ - int journal_tbuffer_size; /* size in bytes of the journal transaction buffer */ - int journal_flags; /* flags to pass to journal_open/create */ - int journal_disable; /* don't use journaling (potentially dangerous) */ -}; - -#define HFSFSMNT_NOXONFILES 0x1 /* disable execute permissions for files */ -#define HFSFSMNT_WRAPPER 0x2 /* mount HFS wrapper (if it exists) */ -#define HFSFSMNT_EXTENDED_ARGS 0x4 /* indicates new fields after "flags" are valid */ - -/* - * Sysctl values for HFS - */ -#define HFS_ENCODINGBIAS 1 /* encoding matching CJK bias */ -#define HFS_EXTEND_FS 2 -#define HFS_ENCODINGHINT 3 /* guess encoding for string */ -#define HFS_ENABLE_JOURNALING 0x082969 -#define HFS_DISABLE_JOURNALING 0x031272 -#define HFS_GET_JOURNAL_INFO 0x6a6e6c69 -#define HFS_SET_PKG_EXTENSIONS 0x121031 -#define HFS_REPLAY_JOURNAL 0x6a6e6c72 -#define HFS_ENABLE_RESIZE_DEBUG 4 /* enable debug code for volume resizing */ - -#endif /* __APPLE_API_UNSTABLE */ - -#endif /* ! 
_HFS_MOUNT_H_ */ diff --git a/bsd/hfs/hfs_notification.c b/bsd/hfs/hfs_notification.c deleted file mode 100644 index 621c58de5..000000000 --- a/bsd/hfs/hfs_notification.c +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Copyright (C) 2003, 2005 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "hfs.h" -#include "hfs_catalog.h" -#include "hfs_cnode.h" -#include "hfs_dbg.h" -#include "hfs_mount.h" -#include "hfs_quota.h" -#include "hfs_endian.h" - -#include "hfscommon/headers/BTreesInternal.h" -#include "hfscommon/headers/FileMgrInternal.h" - - - -void hfs_generate_volume_notifications(struct hfsmount *hfsmp) -{ - fsid_t fsid; - u_int32_t freeblks, state=999; - - /* Do not generate low disk notifications for read-only volumes */ - if (hfsmp->hfs_flags & HFS_READ_ONLY) { - return; - } - - fsid.val[0] = hfsmp->hfs_raw_dev; - fsid.val[1] = vfs_typenum(HFSTOVFS(hfsmp)); - - freeblks = hfs_freeblks(hfsmp, 1); - - if (freeblks < hfsmp->hfs_freespace_notify_dangerlimit) { - state = 2; - } else if (freeblks < hfsmp->hfs_freespace_notify_warninglimit) { - state = 1; - } else if (freeblks >= hfsmp->hfs_freespace_notify_desiredlevel) { - state = 0; - } - - /* Free blocks are less than dangerlimit for the first time */ - if (state == 2 && !(hfsmp->hfs_notification_conditions & VQ_VERYLOWDISK)) { - /* Dump some logging to track down intermittent issues */ - printf("hfs: set VeryLowDisk: vol:%s, freeblks:%d, dangerlimit:%d\n", hfsmp->vcbVN, freeblks, hfsmp->hfs_freespace_notify_dangerlimit); - -#if HFS_SPARSE_DEV - // If we're a sparse device, dump some info about the backing store.. 
- hfs_lock_mount(hfsmp); - vnode_t backing_vp = hfsmp->hfs_backingfs_rootvp; - if (backing_vp && vnode_get(backing_vp) != 0) - backing_vp = NULL; - hfs_unlock_mount(hfsmp); - - if (backing_vp) { - struct mount *mp = vnode_mount(backing_vp); - printf("hfs: set VeryLowDisk: vol:%s, backingstore b_avail:%lld, tag:%d\n", - hfsmp->vcbVN, mp->mnt_vfsstat.f_bavail, backing_vp->v_tag); - vnode_put(backing_vp); - } -#endif - - hfsmp->hfs_notification_conditions |= (VQ_VERYLOWDISK|VQ_LOWDISK); - vfs_event_signal(&fsid, hfsmp->hfs_notification_conditions, (intptr_t)NULL); - } else if (state == 1) { - /* Free blocks are less than warning limit for the first time */ - if (!(hfsmp->hfs_notification_conditions & VQ_LOWDISK)) { - printf("hfs: set LowDisk: vol:%s, freeblks:%d, warninglimit:%d\n", hfsmp->vcbVN, freeblks, hfsmp->hfs_freespace_notify_warninglimit); - hfsmp->hfs_notification_conditions |= VQ_LOWDISK; - vfs_event_signal(&fsid, hfsmp->hfs_notification_conditions, (intptr_t)NULL); - } else if (hfsmp->hfs_notification_conditions & VQ_VERYLOWDISK) { - /* Free blocks count has increased from danger limit to warning limit, so just clear VERYLOWDISK warning */ - printf("hfs: clear VeryLowDisk: vol:%s, freeblks:%d, dangerlimit:%d\n", hfsmp->vcbVN, freeblks, hfsmp->hfs_freespace_notify_dangerlimit); - hfsmp->hfs_notification_conditions &= ~VQ_VERYLOWDISK; - vfs_event_signal(&fsid, hfsmp->hfs_notification_conditions, (intptr_t)NULL); - } - } else if (state == 0) { - /* Free blocks count has increased to desirable level, so clear all conditions */ - if (hfsmp->hfs_notification_conditions & (VQ_LOWDISK|VQ_VERYLOWDISK)) { - if (hfsmp->hfs_notification_conditions & VQ_LOWDISK) { - printf("hfs: clear LowDisk: vol:%s, freeblks:%d, warninglimit:%d, desiredlevel:%d\n", hfsmp->vcbVN, freeblks, hfsmp->hfs_freespace_notify_warninglimit, hfsmp->hfs_freespace_notify_desiredlevel); - } - if (hfsmp->hfs_notification_conditions & VQ_VERYLOWDISK) { - printf("hfs: clear VeryLowDisk: vol:%s, 
freeblks:%d, dangerlimit:%d\n", hfsmp->vcbVN, freeblks, hfsmp->hfs_freespace_notify_warninglimit); - } - hfsmp->hfs_notification_conditions &= ~(VQ_VERYLOWDISK|VQ_LOWDISK); - if (hfsmp->hfs_notification_conditions == 0) { - vfs_event_signal(&fsid, VQ_UPDATE, (intptr_t)NULL); - } else { - vfs_event_signal(&fsid, hfsmp->hfs_notification_conditions, (intptr_t)NULL); - } - } - } -} diff --git a/bsd/hfs/hfs_quota.c b/bsd/hfs/hfs_quota.c deleted file mode 100644 index 10a9e4e8b..000000000 --- a/bsd/hfs/hfs_quota.c +++ /dev/null @@ -1,1013 +0,0 @@ -/* - * Copyright (c) 2002-2008 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * Copyright (c) 1982, 1986, 1990, 1993, 1995 - * The Regents of the University of California. 
All rights reserved. - * - * This code is derived from software contributed to Berkeley by - * Robert Elz at The University of Melbourne. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- * - * @(#)hfs_quota.c - * derived from @(#)ufs_quota.c 8.5 (Berkeley) 5/20/95 - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - - -/* - * Quota name to error message mapping. - */ -#if 0 -static char *quotatypes[] = INITQFNAMES; -#endif - -/* - * Set up the quotas for a cnode. - * - * This routine completely defines the semantics of quotas. - * If other criterion want to be used to establish quotas, the - * MAXQUOTAS value in quotas.h should be increased, and the - * additional dquots set up here. - */ -int -hfs_getinoquota(cp) - register struct cnode *cp; -{ - struct hfsmount *hfsmp; - struct vnode *vp; - int error; - int drop_usrquota = false; - - vp = cp->c_vp ? cp->c_vp : cp->c_rsrc_vp; - hfsmp = VTOHFS(vp); - /* - * Set up the user quota based on file uid. - * EINVAL means that quotas are not enabled. - */ - if (cp->c_dquot[USRQUOTA] == NODQUOT) { - error = dqget(cp->c_uid, &hfsmp->hfs_qfiles[USRQUOTA], USRQUOTA, &cp->c_dquot[USRQUOTA]); - if ((error != 0) && (error != EINVAL)) { - return error; - } else if (error == 0) { - drop_usrquota = true; - } - } - - /* - * Set up the group quota based on file gid. - * EINVAL means that quotas are not enabled. - */ - if (cp->c_dquot[GRPQUOTA] == NODQUOT) { - error = dqget(cp->c_gid, &hfsmp->hfs_qfiles[GRPQUOTA], GRPQUOTA, &cp->c_dquot[GRPQUOTA]); - if ((error != 0) && (error != EINVAL)) { - if (drop_usrquota == true) { - dqrele(cp->c_dquot[USRQUOTA]); - cp->c_dquot[USRQUOTA] = NODQUOT; - } - return error; - } - } - - return (0); -} - -/* - * Update disk usage, and take corrective action. 
- */ -int -hfs_chkdq(cp, change, cred, flags) - register struct cnode *cp; - int64_t change; - kauth_cred_t cred; - int flags; -{ - register struct dquot *dq; - register int i; - int64_t ncurbytes; - int error=0; - struct proc *p; - -#if DIAGNOSTIC - if ((flags & CHOWN) == 0) - hfs_chkdquot(cp); -#endif - if (change == 0) - return (0); - if (change < 0) { - for (i = 0; i < MAXQUOTAS; i++) { - if ((dq = cp->c_dquot[i]) == NODQUOT) - continue; - dqlock(dq); - - ncurbytes = dq->dq_curbytes + change; - if (ncurbytes >= 0) - dq->dq_curbytes = ncurbytes; - else - dq->dq_curbytes = 0; - dq->dq_flags &= ~DQ_BLKS; - dq->dq_flags |= DQ_MOD; - - dqunlock(dq); - } - return (0); - } - p = current_proc(); - /* - * This use of proc_ucred() is safe because kernproc credential never - * changes. - */ - if (!IS_VALID_CRED(cred)) - cred = proc_ucred(kernproc); - if (suser(cred, NULL) || proc_forcequota(p)) { - for (i = 0; i < MAXQUOTAS; i++) { - if ((dq = cp->c_dquot[i]) == NODQUOT) - continue; - error = hfs_chkdqchg(cp, change, cred, i); - if (error) { - break; - } - } - } - if ((flags & FORCE) || error == 0) { - for (i = 0; i < MAXQUOTAS; i++) { - if ((dq = cp->c_dquot[i]) == NODQUOT) - continue; - dqlock(dq); - - dq->dq_curbytes += change; - dq->dq_flags |= DQ_MOD; - - dqunlock(dq); - } - } - return (error); -} - -/* - * Check for a valid change to a users allocation. - * Issue an error message and vfs event if appropriate. - */ -int -hfs_chkdqchg(cp, change, cred, type) - struct cnode *cp; - int64_t change; - kauth_cred_t cred; - int type; -{ - register struct dquot *dq = cp->c_dquot[type]; - u_int64_t ncurbytes; - struct vnode *vp = cp->c_vp ? cp->c_vp : cp->c_rsrc_vp; - - fsid_t fsid; - fsid.val[0] = VTOHFS(vp)->hfs_raw_dev; - fsid.val[1] = vfs_typenum(VTOVFS(vp)); - - dqlock(dq); - - ncurbytes = dq->dq_curbytes + change; - /* - * If user would exceed their hard limit, disallow space allocation. 
- */ - if (ncurbytes >= dq->dq_bhardlimit && dq->dq_bhardlimit) { - if ((dq->dq_flags & DQ_BLKS) == 0 && - cp->c_uid == kauth_cred_getuid(cred)) { -#if 0 - printf("\nhfs: write failed, %s disk limit reached\n", - quotatypes[type]); -#endif - dq->dq_flags |= DQ_BLKS; - vfs_event_signal(&fsid, VQ_QUOTA, (intptr_t)NULL); - } - dqunlock(dq); - - return (EDQUOT); - } - /* - * If user is over their soft limit for too long, disallow space - * allocation. Reset time limit as they cross their soft limit. - */ - if (ncurbytes >= dq->dq_bsoftlimit && dq->dq_bsoftlimit) { - struct timeval tv; - - microuptime(&tv); - if (dq->dq_curbytes < dq->dq_bsoftlimit) { - dq->dq_btime = tv.tv_sec + - VTOHFS(vp)->hfs_qfiles[type].qf_btime; -#if 0 - if (cp->c_uid == kauth_cred_getuid(cred)) - printf("\nhfs: warning, %s %s\n", - quotatypes[type], "disk quota exceeded"); -#endif - vfs_event_signal(&fsid, VQ_QUOTA, (intptr_t)NULL); - dqunlock(dq); - - return (0); - } - if (tv.tv_sec > (time_t)dq->dq_btime) { - if ((dq->dq_flags & DQ_BLKS) == 0 && - cp->c_uid == kauth_cred_getuid(cred)) { -#if 0 - printf("\nhfs: write failed, %s %s\n", - quotatypes[type], - "disk quota exceeded for too long"); -#endif - dq->dq_flags |= DQ_BLKS; - vfs_event_signal(&fsid, VQ_QUOTA, (intptr_t)NULL); - } - dqunlock(dq); - - return (EDQUOT); - } - } - dqunlock(dq); - - return (0); -} - -/* - * Check the inode limit, applying corrective action. 
- */ -int -hfs_chkiq(cp, change, cred, flags) - register struct cnode *cp; - int32_t change; - kauth_cred_t cred; - int flags; -{ - register struct dquot *dq; - register int i; - int ncurinodes, error=0; - struct proc *p; - -#if DIAGNOSTIC - if ((flags & CHOWN) == 0) - hfs_chkdquot(cp); -#endif - if (change == 0) - return (0); - if (change < 0) { - for (i = 0; i < MAXQUOTAS; i++) { - if ((dq = cp->c_dquot[i]) == NODQUOT) - continue; - dqlock(dq); - - ncurinodes = dq->dq_curinodes + change; - if (ncurinodes >= 0) - dq->dq_curinodes = ncurinodes; - else - dq->dq_curinodes = 0; - dq->dq_flags &= ~DQ_INODS; - dq->dq_flags |= DQ_MOD; - - dqunlock(dq); - } - return (0); - } - p = current_proc(); - /* - * This use of proc_ucred() is safe because kernproc credential never - * changes. - */ - if (!IS_VALID_CRED(cred)) - cred = proc_ucred(kernproc); - if (suser(cred, NULL) || proc_forcequota(p)) { - for (i = 0; i < MAXQUOTAS; i++) { - if ((dq = cp->c_dquot[i]) == NODQUOT) - continue; - error = hfs_chkiqchg(cp, change, cred, i); - if (error) { - break; - } - } - } - if ((flags & FORCE) || error == 0) { - for (i = 0; i < MAXQUOTAS; i++) { - if ((dq = cp->c_dquot[i]) == NODQUOT) - continue; - dqlock(dq); - - dq->dq_curinodes += change; - dq->dq_flags |= DQ_MOD; - - dqunlock(dq); - } - } - return (error); -} - - -/* - * Check to see if a change to a user's allocation should be permitted or not. - * Issue an error message if it should not be permitted. Return 0 if - * it should be allowed. - */ -int hfs_isiqchg_allowed(dq, hfsmp, change, cred, type, uid) - struct dquot* dq; - struct hfsmount* hfsmp; - int32_t change; - kauth_cred_t cred; - int type; - uid_t uid; -{ - u_int32_t ncurinodes; - - fsid_t fsid; - fsid.val[0] = hfsmp->hfs_raw_dev; - fsid.val[1] = vfs_typenum(HFSTOVFS(hfsmp)); - - dqlock(dq); - - ncurinodes = dq->dq_curinodes + change; - /* - * If user would exceed their hard limit, disallow cnode allocation. 
- */ - if (ncurinodes >= dq->dq_ihardlimit && dq->dq_ihardlimit) { - if ((dq->dq_flags & DQ_INODS) == 0 && - uid == kauth_cred_getuid(cred)) { - dq->dq_flags |= DQ_INODS; - vfs_event_signal(&fsid, VQ_QUOTA, (intptr_t)NULL); - } - dqunlock(dq); - - return (EDQUOT); - } - /* - * If user is over their soft limit for too long, disallow cnode - * allocation. Reset time limit as they cross their soft limit. - */ - if (ncurinodes >= dq->dq_isoftlimit && dq->dq_isoftlimit) { - struct timeval tv; - - microuptime(&tv); - if (dq->dq_curinodes < dq->dq_isoftlimit) { - dq->dq_itime = tv.tv_sec + hfsmp->hfs_qfiles[type].qf_itime; - vfs_event_signal(&fsid, VQ_QUOTA, (intptr_t)NULL); - dqunlock(dq); - return (0); - } - if (tv.tv_sec > (time_t)dq->dq_itime) { - if (((dq->dq_flags & DQ_INODS) == 0) && - (uid == kauth_cred_getuid(cred))) { - dq->dq_flags |= DQ_INODS; - vfs_event_signal(&fsid, VQ_QUOTA, (intptr_t)NULL); - } - dqunlock(dq); - - return (EDQUOT); - } - } - dqunlock(dq); - - return (0); -} - - -/* - * Check for a valid change to a users allocation. - * Issue an error message if appropriate. - */ -int -hfs_chkiqchg(cp, change, cred, type) - struct cnode *cp; - int32_t change; - kauth_cred_t cred; - int type; -{ - register struct dquot *dq = cp->c_dquot[type]; - u_int32_t ncurinodes; - struct vnode *vp = cp->c_vp ? cp->c_vp : cp->c_rsrc_vp; - - dqlock(dq); - - ncurinodes = dq->dq_curinodes + change; - /* - * If user would exceed their hard limit, disallow cnode allocation. - */ - if (ncurinodes >= dq->dq_ihardlimit && dq->dq_ihardlimit) { - if ((dq->dq_flags & DQ_INODS) == 0 && - cp->c_uid == kauth_cred_getuid(cred)) { -#if 0 - printf("\nhfs: write failed, %s cnode limit reached\n", - quotatypes[type]); -#endif - dq->dq_flags |= DQ_INODS; - } - dqunlock(dq); - - return (EDQUOT); - } - /* - * If user is over their soft limit for too long, disallow cnode - * allocation. Reset time limit as they cross their soft limit. 
- */ - if (ncurinodes >= dq->dq_isoftlimit && dq->dq_isoftlimit) { - struct timeval tv; - - microuptime(&tv); - if (dq->dq_curinodes < dq->dq_isoftlimit) { - dq->dq_itime = tv.tv_sec + - VTOHFS(vp)->hfs_qfiles[type].qf_itime; -#if 0 - if (cp->c_uid == kauth_cred_getuid(cred)) - printf("\nhfs: warning, %s %s\n", - quotatypes[type], "cnode quota exceeded"); -#endif - dqunlock(dq); - - return (0); - } - if (tv.tv_sec > (time_t)dq->dq_itime) { - if ((dq->dq_flags & DQ_INODS) == 0 && - cp->c_uid == kauth_cred_getuid(cred)) { -#if 0 - printf("\nhfs: write failed, %s %s\n", - quotatypes[type], - "cnode quota exceeded for too long"); -#endif - dq->dq_flags |= DQ_INODS; - } - dqunlock(dq); - - return (EDQUOT); - } - } - dqunlock(dq); - - return (0); -} - -#if DIAGNOSTIC -/* - * On filesystems with quotas enabled, it is an error for a file to change - * size and not to have a dquot structure associated with it. - */ -void -hfs_chkdquot(cp) - register struct cnode *cp; -{ - struct vnode *vp = cp->c_vp ? cp->c_vp : cp->c_rsrc_vp; - struct hfsmount *hfsmp = VTOHFS(vp); - register int i; - - for (i = 0; i < MAXQUOTAS; i++) { - if (hfsmp->hfs_qfiles[i].qf_vp == NULLVP) - continue; - if (cp->c_dquot[i] == NODQUOT) { - vprint("chkdquot: missing dquot", vp); - panic("missing dquot"); - } - } -} -#endif - -/* - * Code to process quotactl commands. - */ - -/* - * Q_QUOTAON - set up a quota file for a particular file system. 
- */ -struct hfs_quotaon_cargs { - int error; -}; - -static int -hfs_quotaon_callback(struct vnode *vp, void *cargs) -{ - struct hfs_quotaon_cargs *args; - - args = (struct hfs_quotaon_cargs *)cargs; - - args->error = hfs_getinoquota(VTOC(vp)); - if (args->error) - return (VNODE_RETURNED_DONE); - - return (VNODE_RETURNED); -} - -int -hfs_quotaon(p, mp, type, fnamep) - struct proc *p; - struct mount *mp; - register int type; - caddr_t fnamep; -{ - struct hfsmount *hfsmp = VFSTOHFS(mp); - struct quotafile *qfp; - struct vnode *vp; - int error = 0; - struct hfs_quotaon_cargs args; - - /* Finish setting up quota structures. */ - dqhashinit(); - - qfp = &hfsmp->hfs_qfiles[type]; - - if ( (qf_get(qfp, QTF_OPENING)) ) - return (0); - - error = vnode_open(fnamep, FREAD|FWRITE, 0, 0, &vp, NULL); - if (error) { - goto out; - } - if (!vnode_isreg(vp)) { - (void) vnode_close(vp, FREAD|FWRITE, NULL); - error = EACCES; - goto out; - } - vfs_setflags(mp, (u_int64_t)((unsigned int)MNT_QUOTA)); - hfs_lock_mount (hfsmp); - hfsmp->hfs_flags |= HFS_QUOTAS; - hfs_unlock_mount (hfsmp); - vnode_setnoflush(vp); - /* - * Save the credential of the process that turned on quotas. - */ - qfp->qf_cred = kauth_cred_proc_ref(p); - qfp->qf_vp = vp; - /* - * Finish initializing the quota file - */ - error = dqfileopen(qfp, type); - if (error) { - (void) vnode_close(vp, FREAD|FWRITE, NULL); - - if (IS_VALID_CRED(qfp->qf_cred)) - kauth_cred_unref(&qfp->qf_cred); - qfp->qf_vp = NULLVP; - goto out; - } - qf_put(qfp, QTF_OPENING); - - /* - * Search vnodes associated with this mount point, - * adding references to quota file being opened. - * NB: only need to add dquot's for cnodes being modified. 
- * - * hfs_quota_callback will be called for each vnode open for - * 'write' (VNODE_WRITEABLE) hung off of this mount point - * the vnode will be in an 'unbusy' state (VNODE_WAIT) and - * properly referenced and unreferenced around the callback - */ - args.error = 0; - - vnode_iterate(mp, VNODE_WRITEABLE | VNODE_WAIT, hfs_quotaon_callback, (void *)&args); - - error = args.error; - - if (error) { - hfs_quotaoff(p, mp, type); - } - return (error); - -out: - qf_put(qfp, QTF_OPENING); - - return (error); -} - - -/* - * Q_QUOTAOFF - turn off disk quotas for a filesystem. - */ -struct hfs_quotaoff_cargs { - int type; -}; - -static int -hfs_quotaoff_callback(struct vnode *vp, void *cargs) -{ - struct hfs_quotaoff_cargs *args; - struct cnode *cp; - struct dquot *dq; - - args = (struct hfs_quotaoff_cargs *)cargs; - - cp = VTOC(vp); - - dq = cp->c_dquot[args->type]; - cp->c_dquot[args->type] = NODQUOT; - - dqrele(dq); - - return (VNODE_RETURNED); -} - -int -hfs_quotaoff(__unused struct proc *p, struct mount *mp, register int type) -{ - struct vnode *qvp; - struct hfsmount *hfsmp = VFSTOHFS(mp); - struct quotafile *qfp; - int error; - struct hfs_quotaoff_cargs args; - - /* - * If quotas haven't been initialized, there's no work to be done. - */ - if (!dqisinitialized()) - return (0); - - qfp = &hfsmp->hfs_qfiles[type]; - - if ( (qf_get(qfp, QTF_CLOSING)) ) - return (0); - qvp = qfp->qf_vp; - - /* - * Sync out any orpaned dirty dquot entries. - */ - dqsync_orphans(qfp); - - /* - * Search vnodes associated with this mount point, - * deleting any references to quota file being closed. 
- * - * hfs_quotaoff_callback will be called for each vnode - * hung off of this mount point - * the vnode will be in an 'unbusy' state (VNODE_WAIT) and - * properly referenced and unreferenced around the callback - */ - args.type = type; - - vnode_iterate(mp, VNODE_WAIT, hfs_quotaoff_callback, (void *)&args); - - dqflush(qvp); - /* Finish tearing down the quota file */ - dqfileclose(qfp, type); - - vnode_clearnoflush(qvp); - error = vnode_close(qvp, FREAD|FWRITE, NULL); - - qfp->qf_vp = NULLVP; - - if (IS_VALID_CRED(qfp->qf_cred)) - kauth_cred_unref(&qfp->qf_cred); - for (type = 0; type < MAXQUOTAS; type++) - if (hfsmp->hfs_qfiles[type].qf_vp != NULLVP) - break; - if (type == MAXQUOTAS) { - vfs_clearflags(mp, (u_int64_t)((unsigned int)MNT_QUOTA)); - hfs_lock_mount (hfsmp); - hfsmp->hfs_flags &= ~HFS_QUOTAS; - hfs_unlock_mount (hfsmp); - } - - qf_put(qfp, QTF_CLOSING); - - return (error); -} - -/* - * hfs_quotacheck - checks quotas mountwide for a hypothetical situation. It probes - * the quota data structures to see if adding an inode would be allowed or not. If it - * will be allowed, the change is made. Otherwise, it reports an error back out so the - * caller will know not to proceed with inode allocation in the HFS Catalog. - * - * Note that this function ONLY tests for addition of inodes, not subtraction. 
- */ -int hfs_quotacheck(hfsmp, change, uid, gid, cred) - struct hfsmount *hfsmp; - int change; - uid_t uid; - gid_t gid; - kauth_cred_t cred; -{ - struct dquot *dq = NULL; - struct proc *p; - int error = 0; - int i; - id_t id = uid; - - p = current_proc(); - if (!IS_VALID_CRED(cred)) { - /* This use of proc_ucred() is safe because kernproc credential never changes */ - cred = proc_ucred(kernproc); - } - - if (suser(cred, NULL) || proc_forcequota(p)) { - for (i = 0; i < MAXQUOTAS; i++) { - /* Select if user or group id should be used */ - if (i == USRQUOTA) - id = uid; - else if (i == GRPQUOTA) - id = gid; - - error = dqget(id, &hfsmp->hfs_qfiles[i], i, &dq); - if (error && (error != EINVAL)) - break; - - error = 0; - if (dq == NODQUOT) - continue; - - /* Check quota information */ - error = hfs_isiqchg_allowed(dq, hfsmp, change, cred, i, id); - if (error) { - dqrele(dq); - break; - } - - dqlock(dq); - /* Update quota information */ - dq->dq_curinodes += change; - dqunlock(dq); - dqrele(dq); - } - } - - return error; -} - - -/* - * Q_GETQUOTA - return current values in a dqblk structure. - */ -int -hfs_getquota(mp, id, type, datap) - struct mount *mp; - u_int32_t id; - int type; - caddr_t datap; -{ - struct dquot *dq; - int error; - - error = dqget(id, &VFSTOHFS(mp)->hfs_qfiles[type], type, &dq); - if (error) - return (error); - dqlock(dq); - - bcopy(&dq->dq_dqb, datap, sizeof(dq->dq_dqb)); - - dqunlock(dq); - dqrele(dq); - - return (error); -} - -/* - * Q_SETQUOTA - assign an entire dqblk structure. - */ -int -hfs_setquota(mp, id, type, datap) - struct mount *mp; - u_int32_t id; - int type; - caddr_t datap; -{ - struct dquot *dq; - struct hfsmount *hfsmp = VFSTOHFS(mp); - struct dqblk * newlimp = (struct dqblk *) datap; - struct timeval tv; - int error; - - error = dqget(id, &hfsmp->hfs_qfiles[type], type, &dq); - if (error) - return (error); - dqlock(dq); - - /* - * Copy all but the current values. 
- * Reset time limit if previously had no soft limit or were - * under it, but now have a soft limit and are over it. - */ - newlimp->dqb_curbytes = dq->dq_curbytes; - newlimp->dqb_curinodes = dq->dq_curinodes; - if (dq->dq_id != 0) { - newlimp->dqb_btime = dq->dq_btime; - newlimp->dqb_itime = dq->dq_itime; - } - if (newlimp->dqb_bsoftlimit && - dq->dq_curbytes >= newlimp->dqb_bsoftlimit && - (dq->dq_bsoftlimit == 0 || dq->dq_curbytes < dq->dq_bsoftlimit)) { - microuptime(&tv); - newlimp->dqb_btime = tv.tv_sec + hfsmp->hfs_qfiles[type].qf_btime; - } - if (newlimp->dqb_isoftlimit && - dq->dq_curinodes >= newlimp->dqb_isoftlimit && - (dq->dq_isoftlimit == 0 || dq->dq_curinodes < dq->dq_isoftlimit)) { - microuptime(&tv); - newlimp->dqb_itime = tv.tv_sec + hfsmp->hfs_qfiles[type].qf_itime; - } - bcopy(newlimp, &dq->dq_dqb, sizeof(dq->dq_dqb)); - if (dq->dq_curbytes < dq->dq_bsoftlimit) - dq->dq_flags &= ~DQ_BLKS; - if (dq->dq_curinodes < dq->dq_isoftlimit) - dq->dq_flags &= ~DQ_INODS; - if (dq->dq_isoftlimit == 0 && dq->dq_bsoftlimit == 0 && - dq->dq_ihardlimit == 0 && dq->dq_bhardlimit == 0) - dq->dq_flags |= DQ_FAKE; - else - dq->dq_flags &= ~DQ_FAKE; - dq->dq_flags |= DQ_MOD; - - dqunlock(dq); - dqrele(dq); - - return (0); -} - -/* - * Q_SETUSE - set current cnode and byte usage. - */ -int -hfs_setuse(mp, id, type, datap) - struct mount *mp; - u_int32_t id; - int type; - caddr_t datap; -{ - struct hfsmount *hfsmp = VFSTOHFS(mp); - struct dquot *dq; - struct timeval tv; - int error; - struct dqblk *quotablkp = (struct dqblk *) datap; - - error = dqget(id, &hfsmp->hfs_qfiles[type], type, &dq); - if (error) - return (error); - dqlock(dq); - - /* - * Reset time limit if have a soft limit and were - * previously under it, but are now over it. 
- */ - if (dq->dq_bsoftlimit && dq->dq_curbytes < dq->dq_bsoftlimit && - quotablkp->dqb_curbytes >= dq->dq_bsoftlimit) { - microuptime(&tv); - dq->dq_btime = tv.tv_sec + hfsmp->hfs_qfiles[type].qf_btime; - } - if (dq->dq_isoftlimit && dq->dq_curinodes < dq->dq_isoftlimit && - quotablkp->dqb_curinodes >= dq->dq_isoftlimit) { - microuptime(&tv); - dq->dq_itime = tv.tv_sec + hfsmp->hfs_qfiles[type].qf_itime; - } - dq->dq_curbytes = quotablkp->dqb_curbytes; - dq->dq_curinodes = quotablkp->dqb_curinodes; - if (dq->dq_curbytes < dq->dq_bsoftlimit) - dq->dq_flags &= ~DQ_BLKS; - if (dq->dq_curinodes < dq->dq_isoftlimit) - dq->dq_flags &= ~DQ_INODS; - dq->dq_flags |= DQ_MOD; - - dqunlock(dq); - dqrele(dq); - - return (0); -} - - -/* - * Q_SYNC - sync quota files to disk. - */ -static int -hfs_qsync_callback(struct vnode *vp, __unused void *cargs) -{ - struct cnode *cp; - struct dquot *dq; - int i; - - cp = VTOC(vp); - - for (i = 0; i < MAXQUOTAS; i++) { - dq = cp->c_dquot[i]; - if (dq != NODQUOT && (dq->dq_flags & DQ_MOD)) - dqsync(dq); - } - return (VNODE_RETURNED); -} - -int -hfs_qsync(mp) - struct mount *mp; -{ - struct hfsmount *hfsmp = VFSTOHFS(mp); - int i; - - if (!dqisinitialized()) - return (0); - - /* - * Check if the mount point has any quotas. - * If not, simply return. - */ - for (i = 0; i < MAXQUOTAS; i++) - if (hfsmp->hfs_qfiles[i].qf_vp != NULLVP) - break; - if (i == MAXQUOTAS) - return (0); - - /* - * Sync out any orpaned dirty dquot entries. - */ - for (i = 0; i < MAXQUOTAS; i++) - if (hfsmp->hfs_qfiles[i].qf_vp != NULLVP) - dqsync_orphans(&hfsmp->hfs_qfiles[i]); - - /* - * Search vnodes associated with this mount point, - * synchronizing any modified dquot structures. 
- * - * hfs_qsync_callback will be called for each vnode - * hung off of this mount point - * the vnode will be - * properly referenced and unreferenced around the callback - */ - vnode_iterate(mp, 0, hfs_qsync_callback, (void *)NULL); - - return (0); -} - -/* - * Q_QUOTASTAT - get quota on/off status - */ -int -hfs_quotastat(mp, type, datap) - struct mount *mp; - register int type; - caddr_t datap; -{ - struct hfsmount *hfsmp = VFSTOHFS(mp); - int error = 0; - int qstat; - - if ((((unsigned int)vfs_flags(mp)) & MNT_QUOTA) && (hfsmp->hfs_qfiles[type].qf_vp != NULLVP)) - qstat = 1; /* quotas are on for this type */ - else - qstat = 0; /* quotas are off for this type */ - - *((int *)datap) = qstat; - return (error); -} - diff --git a/bsd/hfs/hfs_quota.h b/bsd/hfs/hfs_quota.h deleted file mode 100644 index a57dbdff3..000000000 --- a/bsd/hfs/hfs_quota.h +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Copyright (c) 2002 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * Copyright (c) 1982, 1986, 1993 - * The Regents of the University of California. All rights reserved. - * - * This code is derived from software contributed to Berkeley by - * Robert Elz at The University of Melbourne. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)hfs_quota.h - * derived from @(#)quota.h 8.3 (Berkeley) 8/19/94 - */ - -#ifndef _HFS_QUOTA_H_ -#define _HFS_QUOTA_H_ - -#include - -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE -#include - -#include - -struct cnode; -struct mount; -struct proc; -#ifndef _KAUTH_CRED_T -#define _KAUTH_CRED_T -struct ucred; -typedef struct ucred *kauth_cred_t; -#endif /* !_KAUTH_CRED_T */ -__BEGIN_DECLS -int hfs_chkdq(struct cnode *, int64_t, kauth_cred_t, int); -int hfs_chkdqchg(struct cnode *, int64_t, kauth_cred_t, int); -int hfs_chkiq(struct cnode *, int32_t, kauth_cred_t, int); -int hfs_chkiqchg(struct cnode *, int32_t, kauth_cred_t, int); -int hfs_getinoquota(struct cnode *); -int hfs_getquota(struct mount *, u_int32_t, int, caddr_t); -int hfs_qsync(struct mount *mp); -int hfs_quotaoff(struct proc *, struct mount *, int); -int hfs_quotaon(struct proc *, struct mount *, int, caddr_t); -int hfs_quotastat(struct mount *, int, caddr_t); -int hfs_setquota(struct mount *, u_int32_t, int, caddr_t); -int hfs_setuse(struct mount *, u_int32_t, int, caddr_t); -int hfs_isiqchg_allowed(struct dquot *, struct hfsmount *, int32_t, kauth_cred_t, int, uid_t); -int hfs_quotacheck (struct hfsmount *, int , uid_t, gid_t, kauth_cred_t); -__END_DECLS - -#if DIAGNOSTIC -__BEGIN_DECLS -void hfs_chkdquot(struct cnode *); -__END_DECLS -#endif -#endif /* __APPLE_API_PRIVATE */ -#endif /* KERNEL */ - -#endif /* ! 
_HFS_QUOTA_H_ */ diff --git a/bsd/hfs/hfs_readwrite.c b/bsd/hfs/hfs_readwrite.c deleted file mode 100644 index 78719c069..000000000 --- a/bsd/hfs/hfs_readwrite.c +++ /dev/null @@ -1,5682 +0,0 @@ -/* - * Copyright (c) 2000-2015 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* @(#)hfs_readwrite.c 1.0 - * - * (c) 1998-2001 Apple Computer, Inc. All Rights Reserved - * - * hfs_readwrite.c -- vnode operations to deal with reading and writing files. 
- * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include - -#include -#include - -#include -#include - -#include - -#include - -#include "hfs.h" -#include "hfs_attrlist.h" -#include "hfs_endian.h" -#include "hfs_fsctl.h" -#include "hfs_quota.h" -#include "hfscommon/headers/FileMgrInternal.h" -#include "hfscommon/headers/BTreesInternal.h" -#include "hfs_cnode.h" -#include "hfs_dbg.h" - - -#define can_cluster(size) ((((size & (4096-1))) == 0) && (size <= (MAXPHYSIO/2))) - -enum { - MAXHFSFILESIZE = 0x7FFFFFFF /* this needs to go in the mount structure */ -}; - -/* from bsd/hfs/hfs_vfsops.c */ -extern int hfs_vfs_vget (struct mount *mp, ino64_t ino, struct vnode **vpp, vfs_context_t context); - -/* from hfs_hotfiles.c */ -extern int hfs_pin_overflow_extents (struct hfsmount *hfsmp, uint32_t fileid, - uint8_t forktype, uint32_t *pinned); - -static int hfs_clonefile(struct vnode *, int, int, int); -static int hfs_clonesysfile(struct vnode *, int, int, int, kauth_cred_t, struct proc *); -static int do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skip, vfs_context_t context); - -/* from bsd/hfs/hfs_vnops.c */ -extern decmpfs_cnode* hfs_lazy_init_decmpfs_cnode (struct cnode *cp); - - - -int flush_cache_on_write = 0; -SYSCTL_INT (_kern, OID_AUTO, flush_cache_on_write, CTLFLAG_RW | CTLFLAG_LOCKED, &flush_cache_on_write, 0, "always flush the drive cache on writes to uncached files"); - -/* - * Read data from a file. 
- */ -int -hfs_vnop_read(struct vnop_read_args *ap) -{ - /* - struct vnop_read_args { - struct vnodeop_desc *a_desc; - vnode_t a_vp; - struct uio *a_uio; - int a_ioflag; - vfs_context_t a_context; - }; - */ - - uio_t uio = ap->a_uio; - struct vnode *vp = ap->a_vp; - struct cnode *cp; - struct filefork *fp; - struct hfsmount *hfsmp; - off_t filesize; - off_t filebytes; - off_t start_resid = uio_resid(uio); - off_t offset = uio_offset(uio); - int retval = 0; - int took_truncate_lock = 0; - int io_throttle = 0; - int throttled_count = 0; - - /* Preflight checks */ - if (!vnode_isreg(vp)) { - /* can only read regular files */ - if (vnode_isdir(vp)) - return (EISDIR); - else - return (EPERM); - } - if (start_resid == 0) - return (0); /* Nothing left to do */ - if (offset < 0) - return (EINVAL); /* cant read from a negative offset */ - -#if SECURE_KERNEL - if ((ap->a_ioflag & (IO_SKIP_ENCRYPTION|IO_SYSCALL_DISPATCH)) == - (IO_SKIP_ENCRYPTION|IO_SYSCALL_DISPATCH)) { - /* Don't allow unencrypted io request from user space */ - return EPERM; - } -#endif - -#if HFS_COMPRESSION - if (VNODE_IS_RSRC(vp)) { - if (hfs_hides_rsrc(ap->a_context, VTOC(vp), 1)) { /* 1 == don't take the cnode lock */ - return 0; - } - /* otherwise read the resource fork normally */ - } else { - int compressed = hfs_file_is_compressed(VTOC(vp), 1); /* 1 == don't take the cnode lock */ - if (compressed) { - retval = decmpfs_read_compressed(ap, &compressed, VTOCMP(vp)); - if (retval == 0 && !(ap->a_ioflag & IO_EVTONLY) && vnode_isfastdevicecandidate(vp)) { - (void) hfs_addhotfile(vp); - } - if (compressed) { - if (retval == 0) { - /* successful read, update the access time */ - VTOC(vp)->c_touch_acctime = TRUE; - - // - // compressed files are not traditional hot file candidates - // but they may be for CF (which ignores the ff_bytesread - // field) - // - if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) { - VTOF(vp)->ff_bytesread = 0; - } - } - return retval; - } - /* otherwise the file was converted back to 
a regular file while we were reading it */ - retval = 0; - } else if ((VTOC(vp)->c_bsdflags & UF_COMPRESSED)) { - int error; - - error = check_for_dataless_file(vp, NAMESPACE_HANDLER_READ_OP); - if (error) { - return error; - } - - } - } -#endif /* HFS_COMPRESSION */ - - cp = VTOC(vp); - fp = VTOF(vp); - hfsmp = VTOHFS(vp); - -#if CONFIG_PROTECT - if ((retval = cp_handle_vnop (vp, CP_READ_ACCESS, ap->a_ioflag)) != 0) { - goto exit; - } - -#endif // CONFIG_PROTECT - - /* - * If this read request originated from a syscall (as opposed to - * an in-kernel page fault or something), then set it up for - * throttle checks - */ - if (ap->a_ioflag & IO_SYSCALL_DISPATCH) { - io_throttle = IO_RETURN_ON_THROTTLE; - } - -read_again: - - /* Protect against a size change. */ - hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_DEFAULT); - took_truncate_lock = 1; - - filesize = fp->ff_size; - filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize; - - /* - * Check the file size. Note that per POSIX spec, we return 0 at - * file EOF, so attempting a read at an offset that is too big - * should just return 0 on HFS+. Since the return value was initialized - * to 0 above, we just jump to exit. HFS Standard has its own behavior. - */ - if (offset > filesize) { - if ((hfsmp->hfs_flags & HFS_STANDARD) && - (offset > (off_t)MAXHFSFILESIZE)) { - retval = EFBIG; - } - goto exit; - } - - KERNEL_DEBUG(HFSDBG_READ | DBG_FUNC_START, - (int)uio_offset(uio), uio_resid(uio), (int)filesize, (int)filebytes, 0); - - retval = cluster_read(vp, uio, filesize, ap->a_ioflag |io_throttle); - - cp->c_touch_acctime = TRUE; - - KERNEL_DEBUG(HFSDBG_READ | DBG_FUNC_END, - (int)uio_offset(uio), uio_resid(uio), (int)filesize, (int)filebytes, 0); - - /* - * Keep track blocks read - */ - if (hfsmp->hfc_stage == HFC_RECORDING && retval == 0) { - int took_cnode_lock = 0; - off_t bytesread; - - bytesread = start_resid - uio_resid(uio); - - /* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. 
*/ - if ((fp->ff_bytesread + bytesread) > 0x00000000ffffffff) { - hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); - took_cnode_lock = 1; - } - /* - * If this file hasn't been seen since the start of - * the current sampling period then start over. - */ - if (cp->c_atime < hfsmp->hfc_timebase) { - struct timeval tv; - - fp->ff_bytesread = bytesread; - microtime(&tv); - cp->c_atime = tv.tv_sec; - } else { - fp->ff_bytesread += bytesread; - } - - if (!(ap->a_ioflag & IO_EVTONLY) && vnode_isfastdevicecandidate(vp)) { - // - // We don't add hotfiles for processes doing IO_EVTONLY I/O - // on the assumption that they're system processes such as - // mdworker which scan everything in the system (and thus - // do not represent user-initiated access to files) - // - (void) hfs_addhotfile(vp); - } - if (took_cnode_lock) - hfs_unlock(cp); - } -exit: - if (took_truncate_lock) { - hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); - } - if (retval == EAGAIN) { - throttle_lowpri_io(1); - throttled_count++; - - retval = 0; - goto read_again; - } - if (throttled_count) { - throttle_info_reset_window((uthread_t)get_bsdthread_info(current_thread())); - } - return (retval); -} - -/* - * Ideally, this wouldn't be necessary; the cluster code should be - * able to handle this on the read-side. See . - */ -static errno_t hfs_zero_eof_page(vnode_t vp, off_t zero_up_to) -{ - assert(VTOC(vp)->c_lockowner != current_thread()); - assert(VTOC(vp)->c_truncatelockowner == current_thread()); - - struct filefork *fp = VTOF(vp); - - if (!(fp->ff_size & PAGE_MASK_64) || zero_up_to <= fp->ff_size) { - // Nothing to do - return 0; - } - - zero_up_to = MIN(zero_up_to, (off_t)round_page_64(fp->ff_size)); - - /* N.B. At present, @zero_up_to is not important because the cluster - code will always zero up to the end of the page anyway. */ - return cluster_write(vp, NULL, fp->ff_size, zero_up_to, - fp->ff_size, 0, IO_HEADZEROFILL); -} - -/* - * Write data to a file. 
- */ -int -hfs_vnop_write(struct vnop_write_args *ap) -{ - uio_t uio = ap->a_uio; - struct vnode *vp = ap->a_vp; - struct cnode *cp; - struct filefork *fp; - struct hfsmount *hfsmp; - kauth_cred_t cred = NULL; - off_t origFileSize; - off_t writelimit; - off_t bytesToAdd = 0; - off_t actualBytesAdded; - off_t filebytes; - off_t offset; - ssize_t resid; - int eflags; - int ioflag = ap->a_ioflag; - int retval = 0; - int lockflags; - int cnode_locked = 0; - int partialwrite = 0; - int do_snapshot = 1; - time_t orig_ctime=VTOC(vp)->c_ctime; - int took_truncate_lock = 0; - int io_return_on_throttle = 0; - int throttled_count = 0; - -#if HFS_COMPRESSION - if ( hfs_file_is_compressed(VTOC(vp), 1) ) { /* 1 == don't take the cnode lock */ - int state = decmpfs_cnode_get_vnode_state(VTOCMP(vp)); - switch(state) { - case FILE_IS_COMPRESSED: - return EACCES; - case FILE_IS_CONVERTING: - /* if FILE_IS_CONVERTING, we allow writes but do not - bother with snapshots or else we will deadlock. - */ - do_snapshot = 0; - break; - default: - printf("invalid state %d for compressed file\n", state); - /* fall through */ - } - } else if ((VTOC(vp)->c_bsdflags & UF_COMPRESSED)) { - int error; - - error = check_for_dataless_file(vp, NAMESPACE_HANDLER_WRITE_OP); - if (error != 0) { - return error; - } - } - - if (do_snapshot) { - check_for_tracked_file(vp, orig_ctime, NAMESPACE_HANDLER_WRITE_OP, uio); - } - -#endif - -#if SECURE_KERNEL - if ((ioflag & (IO_SKIP_ENCRYPTION|IO_SYSCALL_DISPATCH)) == - (IO_SKIP_ENCRYPTION|IO_SYSCALL_DISPATCH)) { - /* Don't allow unencrypted io request from user space */ - return EPERM; - } -#endif - - resid = uio_resid(uio); - offset = uio_offset(uio); - - if (offset < 0) - return (EINVAL); - if (resid == 0) - return (E_NONE); - if (!vnode_isreg(vp)) - return (EPERM); /* Can only write regular files */ - - cp = VTOC(vp); - fp = VTOF(vp); - hfsmp = VTOHFS(vp); - -#if CONFIG_PROTECT - if ((retval = cp_handle_vnop (vp, CP_WRITE_ACCESS, 0)) != 0) { - goto exit; - } 
-#endif - - eflags = kEFDeferMask; /* defer file block allocations */ -#if HFS_SPARSE_DEV - /* - * When the underlying device is sparse and space - * is low (< 8MB), stop doing delayed allocations - * and begin doing synchronous I/O. - */ - if ((hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) && - (hfs_freeblks(hfsmp, 0) < 2048)) { - eflags &= ~kEFDeferMask; - ioflag |= IO_SYNC; - } -#endif /* HFS_SPARSE_DEV */ - - if ((ioflag & (IO_SINGLE_WRITER | IO_SYSCALL_DISPATCH)) == - (IO_SINGLE_WRITER | IO_SYSCALL_DISPATCH)) { - io_return_on_throttle = IO_RETURN_ON_THROTTLE; - } - -again: - /* - * Protect against a size change. - * - * Note: If took_truncate_lock is true, then we previously got the lock shared - * but needed to upgrade to exclusive. So try getting it exclusive from the - * start. - */ - if (ioflag & IO_APPEND || took_truncate_lock) { - hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); - } - else { - hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_DEFAULT); - } - took_truncate_lock = 1; - - /* Update UIO */ - if (ioflag & IO_APPEND) { - uio_setoffset(uio, fp->ff_size); - offset = fp->ff_size; - } - if ((cp->c_bsdflags & APPEND) && offset != fp->ff_size) { - retval = EPERM; - goto exit; - } - - cred = vfs_context_ucred(ap->a_context); - if (cred && suser(cred, NULL) != 0) - eflags |= kEFReserveMask; - - origFileSize = fp->ff_size; - writelimit = offset + resid; - - /* - * We may need an exclusive truncate lock for several reasons, all - * of which are because we may be writing to a (portion of a) block - * for the first time, and we need to make sure no readers see the - * prior, uninitialized contents of the block. The cases are: - * - * 1. We have unallocated (delayed allocation) blocks. We may be - * allocating new blocks to the file and writing to them. - * (A more precise check would be whether the range we're writing - * to contains delayed allocation blocks.) - * 2. We need to extend the file. 
The bytes between the old EOF - * and the new EOF are not yet initialized. This is important - * even if we're not allocating new blocks to the file. If the - * old EOF and new EOF are in the same block, we still need to - * protect that range of bytes until they are written for the - * first time. - * - * If we had a shared lock with the above cases, we need to try to upgrade - * to an exclusive lock. If the upgrade fails, we will lose the shared - * lock, and will need to take the truncate lock again; the took_truncate_lock - * flag will still be set, causing us to try for an exclusive lock next time. - */ - if ((cp->c_truncatelockowner == HFS_SHARED_OWNER) && - ((fp->ff_unallocblocks != 0) || - (writelimit > origFileSize))) { - if (lck_rw_lock_shared_to_exclusive(&cp->c_truncatelock) == FALSE) { - /* - * Lock upgrade failed and we lost our shared lock, try again. - * Note: we do not set took_truncate_lock=0 here. Leaving it - * set to 1 will cause us to try to get the lock exclusive. 
- */ - goto again; - } - else { - /* Store the owner in the c_truncatelockowner field if we successfully upgrade */ - cp->c_truncatelockowner = current_thread(); - } - } - - if ( (retval = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) { - goto exit; - } - cnode_locked = 1; - - filebytes = hfs_blk_to_bytes(fp->ff_blocks, hfsmp->blockSize); - - if (offset > filebytes - && (hfs_blk_to_bytes(hfs_freeblks(hfsmp, ISSET(eflags, kEFReserveMask)), - hfsmp->blockSize) < offset - filebytes)) { - retval = ENOSPC; - goto exit; - } - - KERNEL_DEBUG(HFSDBG_WRITE | DBG_FUNC_START, - (int)offset, uio_resid(uio), (int)fp->ff_size, - (int)filebytes, 0); - - /* Check if we do not need to extend the file */ - if (writelimit <= filebytes) { - goto sizeok; - } - - bytesToAdd = writelimit - filebytes; - -#if QUOTA - retval = hfs_chkdq(cp, (int64_t)(roundup(bytesToAdd, hfsmp->blockSize)), - cred, 0); - if (retval) - goto exit; -#endif /* QUOTA */ - - if (hfs_start_transaction(hfsmp) != 0) { - retval = EINVAL; - goto exit; - } - - while (writelimit > filebytes) { - bytesToAdd = writelimit - filebytes; - - /* Protect extents b-tree and allocation bitmap */ - lockflags = SFL_BITMAP; - if (overflow_extents(fp)) - lockflags |= SFL_EXTENTS; - lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); - - /* Files that are changing size are not hot file candidates. 
*/ - if (hfsmp->hfc_stage == HFC_RECORDING) { - fp->ff_bytesread = 0; - } - retval = MacToVFSError(ExtendFileC (hfsmp, (FCB*)fp, bytesToAdd, - 0, eflags, &actualBytesAdded)); - - hfs_systemfile_unlock(hfsmp, lockflags); - - if ((actualBytesAdded == 0) && (retval == E_NONE)) - retval = ENOSPC; - if (retval != E_NONE) - break; - filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize; - KERNEL_DEBUG(HFSDBG_WRITE | DBG_FUNC_NONE, - (int)offset, uio_resid(uio), (int)fp->ff_size, (int)filebytes, 0); - } - (void) hfs_update(vp, 0); - (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0); - (void) hfs_end_transaction(hfsmp); - - /* - * If we didn't grow the file enough try a partial write. - * POSIX expects this behavior. - */ - if ((retval == ENOSPC) && (filebytes > offset)) { - retval = 0; - partialwrite = 1; - uio_setresid(uio, (uio_resid(uio) - bytesToAdd)); - resid -= bytesToAdd; - writelimit = filebytes; - } -sizeok: - if (retval == E_NONE) { - off_t filesize; - off_t head_off; - int lflag; - - if (writelimit > fp->ff_size) { - filesize = writelimit; - struct timeval tv; - rl_add(fp->ff_size, writelimit - 1 , &fp->ff_invalidranges); - microuptime(&tv); - cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT; - } else - filesize = fp->ff_size; - - lflag = ioflag & ~(IO_TAILZEROFILL | IO_HEADZEROFILL | IO_NOZEROVALID | IO_NOZERODIRTY); - - /* - * We no longer use IO_HEADZEROFILL or IO_TAILZEROFILL (except - * for one case below). For the regions that lie before the - * beginning and after the end of this write that are in the - * same page, we let the cluster code handle zeroing that out - * if necessary. If those areas are not cached, the cluster - * code will try and read those areas in, and in the case - * where those regions have never been written to, - * hfs_vnop_blockmap will consult the invalid ranges and then - * indicate that. The cluster code will zero out those areas. 
- */ - - head_off = trunc_page_64(offset); - - if (head_off < offset && head_off >= fp->ff_size) { - /* - * The first page is beyond current EOF, so as an - * optimisation, we can pass IO_HEADZEROFILL. - */ - lflag |= IO_HEADZEROFILL; - } - - hfs_unlock(cp); - cnode_locked = 0; - - /* - * We need to tell UBC the fork's new size BEFORE calling - * cluster_write, in case any of the new pages need to be - * paged out before cluster_write completes (which does happen - * in embedded systems due to extreme memory pressure). - * Similarly, we need to tell hfs_vnop_pageout what the new EOF - * will be, so that it can pass that on to cluster_pageout, and - * allow those pageouts. - * - * We don't update ff_size yet since we don't want pageins to - * be able to see uninitialized data between the old and new - * EOF, until cluster_write has completed and initialized that - * part of the file. - * - * The vnode pager relies on the file size last given to UBC via - * ubc_setsize. hfs_vnop_pageout relies on fp->ff_new_size or - * ff_size (whichever is larger). NOTE: ff_new_size is always - * zero, unless we are extending the file via write. - */ - if (filesize > fp->ff_size) { - retval = hfs_zero_eof_page(vp, offset); - if (retval) - goto exit; - fp->ff_new_size = filesize; - ubc_setsize(vp, filesize); - } - retval = cluster_write(vp, uio, fp->ff_size, filesize, head_off, - 0, lflag | IO_NOZERODIRTY | io_return_on_throttle); - if (retval) { - fp->ff_new_size = 0; /* no longer extending; use ff_size */ - - if (retval == EAGAIN) { - /* - * EAGAIN indicates that we still have I/O to do, but - * that we now need to be throttled - */ - if (resid != uio_resid(uio)) { - /* - * did manage to do some I/O before returning EAGAIN - */ - resid = uio_resid(uio); - offset = uio_offset(uio); - - cp->c_touch_chgtime = TRUE; - cp->c_touch_modtime = TRUE; - hfs_incr_gencount(cp); - } - if (filesize > fp->ff_size) { - /* - * we called ubc_setsize before the call to - * cluster_write... 
since we only partially - * completed the I/O, we need to - * re-adjust our idea of the filesize based - * on our interim EOF - */ - ubc_setsize(vp, offset); - - fp->ff_size = offset; - } - goto exit; - } - if (filesize > origFileSize) { - ubc_setsize(vp, origFileSize); - } - goto ioerr_exit; - } - - if (filesize > origFileSize) { - fp->ff_size = filesize; - - /* Files that are changing size are not hot file candidates. */ - if (hfsmp->hfc_stage == HFC_RECORDING) { - fp->ff_bytesread = 0; - } - } - fp->ff_new_size = 0; /* ff_size now has the correct size */ - } - if (partialwrite) { - uio_setresid(uio, (uio_resid(uio) + bytesToAdd)); - resid += bytesToAdd; - } - - // XXXdbg - see radar 4871353 for more info - { - if (flush_cache_on_write && ((ioflag & IO_NOCACHE) || vnode_isnocache(vp))) { - hfs_flush(hfsmp, HFS_FLUSH_CACHE); - } - } - -ioerr_exit: - if (!cnode_locked) { - hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); - cnode_locked = 1; - } - - if (resid > uio_resid(uio)) { - cp->c_touch_chgtime = TRUE; - cp->c_touch_modtime = TRUE; - hfs_incr_gencount(cp); - - /* - * If we successfully wrote any data, and we are not the superuser - * we clear the setuid and setgid bits as a precaution against - * tampering. - */ - if (cp->c_mode & (S_ISUID | S_ISGID)) { - cred = vfs_context_ucred(ap->a_context); - if (cred && suser(cred, NULL)) { - cp->c_mode &= ~(S_ISUID | S_ISGID); - } - } - } - if (retval) { - if (ioflag & IO_UNIT) { - (void)hfs_truncate(vp, origFileSize, ioflag & IO_SYNC, - 0, ap->a_context); - uio_setoffset(uio, (uio_offset(uio) - (resid - uio_resid(uio)))); - uio_setresid(uio, resid); - filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize; - } - } else if ((ioflag & IO_SYNC) && (resid > uio_resid(uio))) - retval = hfs_update(vp, 0); - - /* Updating vcbWrCnt doesn't need to be atomic. 
*/ - hfsmp->vcbWrCnt++; - - KERNEL_DEBUG(HFSDBG_WRITE | DBG_FUNC_END, - (int)uio_offset(uio), uio_resid(uio), (int)fp->ff_size, (int)filebytes, 0); -exit: - if (retval && took_truncate_lock - && cp->c_truncatelockowner == current_thread()) { - fp->ff_new_size = 0; - rl_remove(fp->ff_size, RL_INFINITY, &fp->ff_invalidranges); - } - - if (cnode_locked) - hfs_unlock(cp); - - if (took_truncate_lock) { - hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); - } - if (retval == EAGAIN) { - throttle_lowpri_io(1); - throttled_count++; - - retval = 0; - goto again; - } - if (throttled_count) { - throttle_info_reset_window((uthread_t)get_bsdthread_info(current_thread())); - } - return (retval); -} - -/* support for the "bulk-access" fcntl */ - -#define CACHE_LEVELS 16 -#define NUM_CACHE_ENTRIES (64*16) -#define PARENT_IDS_FLAG 0x100 - -struct access_cache { - int numcached; - int cachehits; /* these two for statistics gathering */ - int lookups; - unsigned int *acache; - unsigned char *haveaccess; -}; - -struct access_t { - uid_t uid; /* IN: effective user id */ - short flags; /* IN: access requested (i.e. R_OK) */ - short num_groups; /* IN: number of groups user belongs to */ - int num_files; /* IN: number of files to process */ - int *file_ids; /* IN: array of file ids */ - gid_t *groups; /* IN: array of groups */ - short *access; /* OUT: access info for each file (0 for 'has access') */ -} __attribute__((unavailable)); // this structure is for reference purposes only - -struct user32_access_t { - uid_t uid; /* IN: effective user id */ - short flags; /* IN: access requested (i.e. 
R_OK) */ - short num_groups; /* IN: number of groups user belongs to */ - int num_files; /* IN: number of files to process */ - user32_addr_t file_ids; /* IN: array of file ids */ - user32_addr_t groups; /* IN: array of groups */ - user32_addr_t access; /* OUT: access info for each file (0 for 'has access') */ -}; - -struct user64_access_t { - uid_t uid; /* IN: effective user id */ - short flags; /* IN: access requested (i.e. R_OK) */ - short num_groups; /* IN: number of groups user belongs to */ - int num_files; /* IN: number of files to process */ - user64_addr_t file_ids; /* IN: array of file ids */ - user64_addr_t groups; /* IN: array of groups */ - user64_addr_t access; /* OUT: access info for each file (0 for 'has access') */ -}; - - -// these are the "extended" versions of the above structures -// note that it is crucial that they be different sized than -// the regular version -struct ext_access_t { - uint32_t flags; /* IN: access requested (i.e. R_OK) */ - uint32_t num_files; /* IN: number of files to process */ - uint32_t map_size; /* IN: size of the bit map */ - uint32_t *file_ids; /* IN: Array of file ids */ - char *bitmap; /* OUT: hash-bitmap of interesting directory ids */ - short *access; /* OUT: access info for each file (0 for 'has access') */ - uint32_t num_parents; /* future use */ - cnid_t *parents; /* future use */ -} __attribute__((unavailable)); // this structure is for reference purposes only - -struct user32_ext_access_t { - uint32_t flags; /* IN: access requested (i.e. 
R_OK) */ - uint32_t num_files; /* IN: number of files to process */ - uint32_t map_size; /* IN: size of the bit map */ - user32_addr_t file_ids; /* IN: Array of file ids */ - user32_addr_t bitmap; /* OUT: hash-bitmap of interesting directory ids */ - user32_addr_t access; /* OUT: access info for each file (0 for 'has access') */ - uint32_t num_parents; /* future use */ - user32_addr_t parents; /* future use */ -}; - -struct user64_ext_access_t { - uint32_t flags; /* IN: access requested (i.e. R_OK) */ - uint32_t num_files; /* IN: number of files to process */ - uint32_t map_size; /* IN: size of the bit map */ - user64_addr_t file_ids; /* IN: array of file ids */ - user64_addr_t bitmap; /* IN: array of groups */ - user64_addr_t access; /* OUT: access info for each file (0 for 'has access') */ - uint32_t num_parents;/* future use */ - user64_addr_t parents;/* future use */ -}; - - -/* - * Perform a binary search for the given parent_id. Return value is - * the index if there is a match. If no_match_indexp is non-NULL it - * will be assigned with the index to insert the item (even if it was - * not found). 
- */ -static int cache_binSearch(cnid_t *array, unsigned int hi, cnid_t parent_id, int *no_match_indexp) -{ - int index=-1; - unsigned int lo=0; - - do { - unsigned int mid = ((hi - lo)/2) + lo; - unsigned int this_id = array[mid]; - - if (parent_id == this_id) { - hi = mid; - break; - } - - if (parent_id < this_id) { - hi = mid; - continue; - } - - if (parent_id > this_id) { - lo = mid + 1; - continue; - } - } while(lo < hi); - - /* check if lo and hi converged on the match */ - if (parent_id == array[hi]) { - index = hi; - } - - if (no_match_indexp) { - *no_match_indexp = hi; - } - - return index; -} - - -static int -lookup_bucket(struct access_cache *cache, int *indexp, cnid_t parent_id) -{ - unsigned int hi; - int matches = 0; - int index, no_match_index; - - if (cache->numcached == 0) { - *indexp = 0; - return 0; // table is empty, so insert at index=0 and report no match - } - - if (cache->numcached > NUM_CACHE_ENTRIES) { - cache->numcached = NUM_CACHE_ENTRIES; - } - - hi = cache->numcached - 1; - - index = cache_binSearch(cache->acache, hi, parent_id, &no_match_index); - - /* if no existing entry found, find index for new one */ - if (index == -1) { - index = no_match_index; - matches = 0; - } else { - matches = 1; - } - - *indexp = index; - return matches; -} - -/* - * Add a node to the access_cache at the given index (or do a lookup first - * to find the index if -1 is passed in). We currently do a replace rather - * than an insert if the cache is full. - */ -static void -add_node(struct access_cache *cache, int index, cnid_t nodeID, int access) -{ - int lookup_index = -1; - - /* need to do a lookup first if -1 passed for index */ - if (index == -1) { - if (lookup_bucket(cache, &lookup_index, nodeID)) { - if (cache->haveaccess[lookup_index] != access && cache->haveaccess[lookup_index] == ESRCH) { - // only update an entry if the previous access was ESRCH (i.e. 
a scope checking error) - cache->haveaccess[lookup_index] = access; - } - - /* mission accomplished */ - return; - } else { - index = lookup_index; - } - - } - - /* if the cache is full, do a replace rather than an insert */ - if (cache->numcached >= NUM_CACHE_ENTRIES) { - cache->numcached = NUM_CACHE_ENTRIES-1; - - if (index > cache->numcached) { - index = cache->numcached; - } - } - - if (index < cache->numcached && index < NUM_CACHE_ENTRIES && nodeID > cache->acache[index]) { - index++; - } - - if (index >= 0 && index < cache->numcached) { - /* only do bcopy if we're inserting */ - bcopy( cache->acache+index, cache->acache+(index+1), (cache->numcached - index)*sizeof(int) ); - bcopy( cache->haveaccess+index, cache->haveaccess+(index+1), (cache->numcached - index)*sizeof(unsigned char) ); - } - - cache->acache[index] = nodeID; - cache->haveaccess[index] = access; - cache->numcached++; -} - - -struct cinfo { - uid_t uid; - gid_t gid; - mode_t mode; - cnid_t parentcnid; - u_int16_t recflags; -}; - -static int -snoop_callback(const cnode_t *cp, void *arg) -{ - struct cinfo *cip = arg; - - cip->uid = cp->c_uid; - cip->gid = cp->c_gid; - cip->mode = cp->c_mode; - cip->parentcnid = cp->c_parentcnid; - cip->recflags = cp->c_attr.ca_recflags; - - return (0); -} - -/* - * Lookup the cnid's attr info (uid, gid, and mode) as well as its parent id. If the item - * isn't incore, then go to the catalog. 
- */ -static int -do_attr_lookup(struct hfsmount *hfsmp, struct access_cache *cache, cnid_t cnid, - struct cnode *skip_cp, CatalogKey *keyp, struct cat_attr *cnattrp) -{ - int error = 0; - - /* if this id matches the one the fsctl was called with, skip the lookup */ - if (cnid == skip_cp->c_cnid) { - cnattrp->ca_uid = skip_cp->c_uid; - cnattrp->ca_gid = skip_cp->c_gid; - cnattrp->ca_mode = skip_cp->c_mode; - cnattrp->ca_recflags = skip_cp->c_attr.ca_recflags; - keyp->hfsPlus.parentID = skip_cp->c_parentcnid; - } else { - struct cinfo c_info; - - /* otherwise, check the cnode hash incase the file/dir is incore */ - error = hfs_chash_snoop(hfsmp, cnid, 0, snoop_callback, &c_info); - - if (error == EACCES) { - // File is deleted - return ENOENT; - } else if (!error) { - cnattrp->ca_uid = c_info.uid; - cnattrp->ca_gid = c_info.gid; - cnattrp->ca_mode = c_info.mode; - cnattrp->ca_recflags = c_info.recflags; - keyp->hfsPlus.parentID = c_info.parentcnid; - } else { - int lockflags; - - if (throttle_io_will_be_throttled(-1, HFSTOVFS(hfsmp))) - throttle_lowpri_io(1); - - lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); - - /* lookup this cnid in the catalog */ - error = cat_getkeyplusattr(hfsmp, cnid, keyp, cnattrp); - - hfs_systemfile_unlock(hfsmp, lockflags); - - cache->lookups++; - } - } - - return (error); -} - - -/* - * Compute whether we have access to the given directory (nodeID) and all its parents. Cache - * up to CACHE_LEVELS as we progress towards the root. 
- */ -static int -do_access_check(struct hfsmount *hfsmp, int *err, struct access_cache *cache, HFSCatalogNodeID nodeID, - struct cnode *skip_cp, struct proc *theProcPtr, kauth_cred_t myp_ucred, - struct vfs_context *my_context, - char *bitmap, - uint32_t map_size, - cnid_t* parents, - uint32_t num_parents) -{ - int myErr = 0; - int myResult; - HFSCatalogNodeID thisNodeID; - unsigned int myPerms; - struct cat_attr cnattr; - int cache_index = -1, scope_index = -1, scope_idx_start = -1; - CatalogKey catkey; - - int i = 0, ids_to_cache = 0; - int parent_ids[CACHE_LEVELS]; - - thisNodeID = nodeID; - while (thisNodeID >= kRootDirID) { - myResult = 0; /* default to "no access" */ - - /* check the cache before resorting to hitting the catalog */ - - /* ASSUMPTION: access info of cached entries is "final"... i.e. no need - * to look any further after hitting cached dir */ - - if (lookup_bucket(cache, &cache_index, thisNodeID)) { - cache->cachehits++; - myErr = cache->haveaccess[cache_index]; - if (scope_index != -1) { - if (myErr == ESRCH) { - myErr = 0; - } - } else { - scope_index = 0; // so we'll just use the cache result - scope_idx_start = ids_to_cache; - } - myResult = (myErr == 0) ? 1 : 0; - goto ExitThisRoutine; - } - - - if (parents) { - int tmp; - tmp = cache_binSearch(parents, num_parents-1, thisNodeID, NULL); - if (scope_index == -1) - scope_index = tmp; - if (tmp != -1 && scope_idx_start == -1 && ids_to_cache < CACHE_LEVELS) { - scope_idx_start = ids_to_cache; - } - } - - /* remember which parents we want to cache */ - if (ids_to_cache < CACHE_LEVELS) { - parent_ids[ids_to_cache] = thisNodeID; - ids_to_cache++; - } - // Inefficient (using modulo) and we might want to use a hash function, not rely on the node id to be "nice"... 
- if (bitmap && map_size) { - bitmap[(thisNodeID/8)%(map_size)]|=(1<<(thisNodeID&7)); - } - - - /* do the lookup (checks the cnode hash, then the catalog) */ - myErr = do_attr_lookup(hfsmp, cache, thisNodeID, skip_cp, &catkey, &cnattr); - if (myErr) { - goto ExitThisRoutine; /* no access */ - } - - /* Root always gets access. */ - if (suser(myp_ucred, NULL) == 0) { - thisNodeID = catkey.hfsPlus.parentID; - myResult = 1; - continue; - } - - // if the thing has acl's, do the full permission check - if ((cnattr.ca_recflags & kHFSHasSecurityMask) != 0) { - struct vnode *vp; - - /* get the vnode for this cnid */ - myErr = hfs_vget(hfsmp, thisNodeID, &vp, 0, 0); - if ( myErr ) { - myResult = 0; - goto ExitThisRoutine; - } - - thisNodeID = VTOC(vp)->c_parentcnid; - - hfs_unlock(VTOC(vp)); - - if (vnode_vtype(vp) == VDIR) { - myErr = vnode_authorize(vp, NULL, (KAUTH_VNODE_SEARCH | KAUTH_VNODE_LIST_DIRECTORY), my_context); - } else { - myErr = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA, my_context); - } - - vnode_put(vp); - if (myErr) { - myResult = 0; - goto ExitThisRoutine; - } - } else { - unsigned int flags; - int mode = cnattr.ca_mode & S_IFMT; - myPerms = DerivePermissionSummary(cnattr.ca_uid, cnattr.ca_gid, cnattr.ca_mode, hfsmp->hfs_mp,myp_ucred, theProcPtr); - - if (mode == S_IFDIR) { - flags = R_OK | X_OK; - } else { - flags = R_OK; - } - if ( (myPerms & flags) != flags) { - myResult = 0; - myErr = EACCES; - goto ExitThisRoutine; /* no access */ - } - - /* up the hierarchy we go */ - thisNodeID = catkey.hfsPlus.parentID; - } - } - - /* if here, we have access to this node */ - myResult = 1; - - ExitThisRoutine: - if (parents && myErr == 0 && scope_index == -1) { - myErr = ESRCH; - } - - if (myErr) { - myResult = 0; - } - *err = myErr; - - /* cache the parent directory(ies) */ - for (i = 0; i < ids_to_cache; i++) { - if (myErr == 0 && parents && (scope_idx_start == -1 || i > scope_idx_start)) { - add_node(cache, -1, parent_ids[i], ESRCH); - } else { - 
add_node(cache, -1, parent_ids[i], myErr); - } - } - - return (myResult); -} - -static int -do_bulk_access_check(struct hfsmount *hfsmp, struct vnode *vp, - struct vnop_ioctl_args *ap, int arg_size, vfs_context_t context) -{ - boolean_t is64bit; - - /* - * NOTE: on entry, the vnode has an io_ref. In case this vnode - * happens to be in our list of file_ids, we'll note it - * avoid calling hfs_chashget_nowait() on that id as that - * will cause a "locking against myself" panic. - */ - Boolean check_leaf = true; - - struct user64_ext_access_t *user_access_structp; - struct user64_ext_access_t tmp_user_access; - struct access_cache cache; - - int error = 0, prev_parent_check_ok=1; - unsigned int i; - - short flags; - unsigned int num_files = 0; - int map_size = 0; - int num_parents = 0; - int *file_ids=NULL; - short *access=NULL; - char *bitmap=NULL; - cnid_t *parents=NULL; - int leaf_index; - - cnid_t cnid; - cnid_t prevParent_cnid = 0; - unsigned int myPerms; - short myaccess = 0; - struct cat_attr cnattr; - CatalogKey catkey; - struct cnode *skip_cp = VTOC(vp); - kauth_cred_t cred = vfs_context_ucred(context); - proc_t p = vfs_context_proc(context); - - is64bit = proc_is64bit(p); - - /* initialize the local cache and buffers */ - cache.numcached = 0; - cache.cachehits = 0; - cache.lookups = 0; - cache.acache = NULL; - cache.haveaccess = NULL; - - /* struct copyin done during dispatch... 
need to copy file_id array separately */ - if (ap->a_data == NULL) { - error = EINVAL; - goto err_exit_bulk_access; - } - - if (is64bit) { - if (arg_size != sizeof(struct user64_ext_access_t)) { - error = EINVAL; - goto err_exit_bulk_access; - } - - user_access_structp = (struct user64_ext_access_t *)ap->a_data; - - } else if (arg_size == sizeof(struct user32_access_t)) { - struct user32_access_t *accessp = (struct user32_access_t *)ap->a_data; - - // convert an old style bulk-access struct to the new style - tmp_user_access.flags = accessp->flags; - tmp_user_access.num_files = accessp->num_files; - tmp_user_access.map_size = 0; - tmp_user_access.file_ids = CAST_USER_ADDR_T(accessp->file_ids); - tmp_user_access.bitmap = USER_ADDR_NULL; - tmp_user_access.access = CAST_USER_ADDR_T(accessp->access); - tmp_user_access.num_parents = 0; - user_access_structp = &tmp_user_access; - - } else if (arg_size == sizeof(struct user32_ext_access_t)) { - struct user32_ext_access_t *accessp = (struct user32_ext_access_t *)ap->a_data; - - // up-cast from a 32-bit version of the struct - tmp_user_access.flags = accessp->flags; - tmp_user_access.num_files = accessp->num_files; - tmp_user_access.map_size = accessp->map_size; - tmp_user_access.num_parents = accessp->num_parents; - - tmp_user_access.file_ids = CAST_USER_ADDR_T(accessp->file_ids); - tmp_user_access.bitmap = CAST_USER_ADDR_T(accessp->bitmap); - tmp_user_access.access = CAST_USER_ADDR_T(accessp->access); - tmp_user_access.parents = CAST_USER_ADDR_T(accessp->parents); - - user_access_structp = &tmp_user_access; - } else { - error = EINVAL; - goto err_exit_bulk_access; - } - - map_size = user_access_structp->map_size; - - num_files = user_access_structp->num_files; - - num_parents= user_access_structp->num_parents; - - if (num_files < 1) { - goto err_exit_bulk_access; - } - if (num_files > 1024) { - error = EINVAL; - goto err_exit_bulk_access; - } - - if (num_parents > 1024) { - error = EINVAL; - goto err_exit_bulk_access; - } 
- - file_ids = (int *) kalloc(sizeof(int) * num_files); - access = (short *) kalloc(sizeof(short) * num_files); - if (map_size) { - bitmap = (char *) kalloc(sizeof(char) * map_size); - } - - if (num_parents) { - parents = (cnid_t *) kalloc(sizeof(cnid_t) * num_parents); - } - - cache.acache = (unsigned int *) kalloc(sizeof(int) * NUM_CACHE_ENTRIES); - cache.haveaccess = (unsigned char *) kalloc(sizeof(unsigned char) * NUM_CACHE_ENTRIES); - - if (file_ids == NULL || access == NULL || (map_size != 0 && bitmap == NULL) || cache.acache == NULL || cache.haveaccess == NULL) { - if (file_ids) { - kfree(file_ids, sizeof(int) * num_files); - } - if (bitmap) { - kfree(bitmap, sizeof(char) * map_size); - } - if (access) { - kfree(access, sizeof(short) * num_files); - } - if (cache.acache) { - kfree(cache.acache, sizeof(int) * NUM_CACHE_ENTRIES); - } - if (cache.haveaccess) { - kfree(cache.haveaccess, sizeof(unsigned char) * NUM_CACHE_ENTRIES); - } - if (parents) { - kfree(parents, sizeof(cnid_t) * num_parents); - } - return ENOMEM; - } - - // make sure the bitmap is zero'ed out... 
- if (bitmap) { - bzero(bitmap, (sizeof(char) * map_size)); - } - - if ((error = copyin(user_access_structp->file_ids, (caddr_t)file_ids, - num_files * sizeof(int)))) { - goto err_exit_bulk_access; - } - - if (num_parents) { - if ((error = copyin(user_access_structp->parents, (caddr_t)parents, - num_parents * sizeof(cnid_t)))) { - goto err_exit_bulk_access; - } - } - - flags = user_access_structp->flags; - if ((flags & (F_OK | R_OK | W_OK | X_OK)) == 0) { - flags = R_OK; - } - - /* check if we've been passed leaf node ids or parent ids */ - if (flags & PARENT_IDS_FLAG) { - check_leaf = false; - } - - /* Check access to each file_id passed in */ - for (i = 0; i < num_files; i++) { - leaf_index=-1; - cnid = (cnid_t) file_ids[i]; - - /* root always has access */ - if ((!parents) && (!suser(cred, NULL))) { - access[i] = 0; - continue; - } - - if (check_leaf) { - /* do the lookup (checks the cnode hash, then the catalog) */ - error = do_attr_lookup(hfsmp, &cache, cnid, skip_cp, &catkey, &cnattr); - if (error) { - access[i] = (short) error; - continue; - } - - if (parents) { - // Check if the leaf matches one of the parent scopes - leaf_index = cache_binSearch(parents, num_parents-1, cnid, NULL); - if (leaf_index >= 0 && parents[leaf_index] == cnid) - prev_parent_check_ok = 0; - else if (leaf_index >= 0) - prev_parent_check_ok = 1; - } - - // if the thing has acl's, do the full permission check - if ((cnattr.ca_recflags & kHFSHasSecurityMask) != 0) { - struct vnode *cvp; - int myErr = 0; - /* get the vnode for this cnid */ - myErr = hfs_vget(hfsmp, cnid, &cvp, 0, 0); - if ( myErr ) { - access[i] = myErr; - continue; - } - - hfs_unlock(VTOC(cvp)); - - if (vnode_vtype(cvp) == VDIR) { - myErr = vnode_authorize(cvp, NULL, (KAUTH_VNODE_SEARCH | KAUTH_VNODE_LIST_DIRECTORY), context); - } else { - myErr = vnode_authorize(cvp, NULL, KAUTH_VNODE_READ_DATA, context); - } - - vnode_put(cvp); - if (myErr) { - access[i] = myErr; - continue; - } - } else { - /* before calling 
CheckAccess(), check the target file for read access */ - myPerms = DerivePermissionSummary(cnattr.ca_uid, cnattr.ca_gid, - cnattr.ca_mode, hfsmp->hfs_mp, cred, p); - - /* fail fast if no access */ - if ((myPerms & flags) == 0) { - access[i] = EACCES; - continue; - } - } - } else { - /* we were passed an array of parent ids */ - catkey.hfsPlus.parentID = cnid; - } - - /* if the last guy had the same parent and had access, we're done */ - if (i > 0 && catkey.hfsPlus.parentID == prevParent_cnid && access[i-1] == 0 && prev_parent_check_ok) { - cache.cachehits++; - access[i] = 0; - continue; - } - - myaccess = do_access_check(hfsmp, &error, &cache, catkey.hfsPlus.parentID, - skip_cp, p, cred, context,bitmap, map_size, parents, num_parents); - - if (myaccess || (error == ESRCH && leaf_index != -1)) { - access[i] = 0; // have access.. no errors to report - } else { - access[i] = (error != 0 ? (short) error : EACCES); - } - - prevParent_cnid = catkey.hfsPlus.parentID; - } - - /* copyout the access array */ - if ((error = copyout((caddr_t)access, user_access_structp->access, - num_files * sizeof (short)))) { - goto err_exit_bulk_access; - } - if (map_size && bitmap) { - if ((error = copyout((caddr_t)bitmap, user_access_structp->bitmap, - map_size * sizeof (char)))) { - goto err_exit_bulk_access; - } - } - - - err_exit_bulk_access: - - if (file_ids) - kfree(file_ids, sizeof(int) * num_files); - if (parents) - kfree(parents, sizeof(cnid_t) * num_parents); - if (bitmap) - kfree(bitmap, sizeof(char) * map_size); - if (access) - kfree(access, sizeof(short) * num_files); - if (cache.acache) - kfree(cache.acache, sizeof(int) * NUM_CACHE_ENTRIES); - if (cache.haveaccess) - kfree(cache.haveaccess, sizeof(unsigned char) * NUM_CACHE_ENTRIES); - - return (error); -} - - -/* end "bulk-access" support */ - - -/* - * Control filesystem operating characteristics. 
- */ -int -hfs_vnop_ioctl( struct vnop_ioctl_args /* { - vnode_t a_vp; - long a_command; - caddr_t a_data; - int a_fflag; - vfs_context_t a_context; - } */ *ap) -{ - struct vnode * vp = ap->a_vp; - struct hfsmount *hfsmp = VTOHFS(vp); - vfs_context_t context = ap->a_context; - kauth_cred_t cred = vfs_context_ucred(context); - proc_t p = vfs_context_proc(context); - struct vfsstatfs *vfsp; - boolean_t is64bit; - off_t jnl_start, jnl_size; - struct hfs_journal_info *jip; -#if HFS_COMPRESSION - int compressed = 0; - off_t uncompressed_size = -1; - int decmpfs_error = 0; - - if (ap->a_command == F_RDADVISE) { - /* we need to inspect the decmpfs state of the file as early as possible */ - compressed = hfs_file_is_compressed(VTOC(vp), 0); - if (compressed) { - if (VNODE_IS_RSRC(vp)) { - /* if this is the resource fork, treat it as if it were empty */ - uncompressed_size = 0; - } else { - decmpfs_error = hfs_uncompressed_size_of_compressed_file(NULL, vp, 0, &uncompressed_size, 0); - if (decmpfs_error != 0) { - /* failed to get the uncompressed size, we'll check for this later */ - uncompressed_size = -1; - } - } - } - } -#endif /* HFS_COMPRESSION */ - - is64bit = proc_is64bit(p); - -#if CONFIG_PROTECT - { - int error = 0; - if ((error = cp_handle_vnop(vp, CP_WRITE_ACCESS, 0)) != 0) { - return error; - } - } -#endif /* CONFIG_PROTECT */ - - switch (ap->a_command) { - - case HFS_GETPATH: - { - struct vnode *file_vp; - cnid_t cnid; - int outlen; - char *bufptr; - int error; - int flags = 0; - - /* Caller must be owner of file system. */ - vfsp = vfs_statfs(HFSTOVFS(hfsmp)); - if (suser(cred, NULL) && - kauth_cred_getuid(cred) != vfsp->f_owner) { - return (EACCES); - } - /* Target vnode must be file system's root. 
*/ - if (!vnode_isvroot(vp)) { - return (EINVAL); - } - bufptr = (char *)ap->a_data; - cnid = strtoul(bufptr, NULL, 10); - if (ap->a_fflag & HFS_GETPATH_VOLUME_RELATIVE) { - flags |= BUILDPATH_VOLUME_RELATIVE; - } - - /* We need to call hfs_vfs_vget to leverage the code that will - * fix the origin list for us if needed, as opposed to calling - * hfs_vget, since we will need the parent for build_path call. - */ - - if ((error = hfs_vfs_vget(HFSTOVFS(hfsmp), cnid, &file_vp, context))) { - return (error); - } - error = build_path(file_vp, bufptr, sizeof(pathname_t), &outlen, flags, context); - vnode_put(file_vp); - - return (error); - } - - case HFS_TRANSFER_DOCUMENT_ID: - { - struct cnode *cp = NULL; - int error; - u_int32_t to_fd = *(u_int32_t *)ap->a_data; - struct fileproc *to_fp; - struct vnode *to_vp; - struct cnode *to_cp; - - cp = VTOC(vp); - - if ((error = fp_getfvp(p, to_fd, &to_fp, &to_vp)) != 0) { - //printf("could not get the vnode for fd %d (err %d)\n", to_fd, error); - return error; - } - if ( (error = vnode_getwithref(to_vp)) ) { - file_drop(to_fd); - return error; - } - - if (VTOHFS(to_vp) != hfsmp) { - error = EXDEV; - goto transfer_cleanup; - } - - int need_unlock = 1; - to_cp = VTOC(to_vp); - error = hfs_lockpair(cp, to_cp, HFS_EXCLUSIVE_LOCK); - if (error != 0) { - //printf("could not lock the pair of cnodes (error %d)\n", error); - goto transfer_cleanup; - } - - if (!(cp->c_bsdflags & UF_TRACKED)) { - error = EINVAL; - } else if (to_cp->c_bsdflags & UF_TRACKED) { - // - // if the destination is already tracked, return an error - // as otherwise it's a silent deletion of the target's - // document-id - // - error = EEXIST; - } else if (S_ISDIR(cp->c_attr.ca_mode) || S_ISREG(cp->c_attr.ca_mode) || S_ISLNK(cp->c_attr.ca_mode)) { - // - // we can use the FndrExtendedFileInfo because the doc-id is the first - // thing in both it and the ExtendedDirInfo struct which is fixed in - // format and can not change layout - // - struct FndrExtendedFileInfo 
*f_extinfo = (struct FndrExtendedFileInfo *)((u_int8_t*)cp->c_finderinfo + 16); - struct FndrExtendedFileInfo *to_extinfo = (struct FndrExtendedFileInfo *)((u_int8_t*)to_cp->c_finderinfo + 16); - - if (f_extinfo->document_id == 0) { - uint32_t new_id; - - hfs_unlockpair(cp, to_cp); // have to unlock to be able to get a new-id - - if ((error = hfs_generate_document_id(hfsmp, &new_id)) == 0) { - // - // re-lock the pair now that we have the document-id - // - hfs_lockpair(cp, to_cp, HFS_EXCLUSIVE_LOCK); - f_extinfo->document_id = new_id; - } else { - goto transfer_cleanup; - } - } - - to_extinfo->document_id = f_extinfo->document_id; - f_extinfo->document_id = 0; - //printf("TRANSFERRING: doc-id %d from ino %d to ino %d\n", to_extinfo->document_id, cp->c_fileid, to_cp->c_fileid); - - // make sure the destination is also UF_TRACKED - to_cp->c_bsdflags |= UF_TRACKED; - cp->c_bsdflags &= ~UF_TRACKED; - - // mark the cnodes dirty - cp->c_flag |= C_MODIFIED; - to_cp->c_flag |= C_MODIFIED; - - int lockflags; - if ((error = hfs_start_transaction(hfsmp)) == 0) { - - lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK); - - (void) cat_update(hfsmp, &cp->c_desc, &cp->c_attr, NULL, NULL); - (void) cat_update(hfsmp, &to_cp->c_desc, &to_cp->c_attr, NULL, NULL); - - hfs_systemfile_unlock (hfsmp, lockflags); - (void) hfs_end_transaction(hfsmp); - } - -#if CONFIG_FSE - add_fsevent(FSE_DOCID_CHANGED, context, - FSE_ARG_DEV, hfsmp->hfs_raw_dev, - FSE_ARG_INO, (ino64_t)cp->c_fileid, // src inode # - FSE_ARG_INO, (ino64_t)to_cp->c_fileid, // dst inode # - FSE_ARG_INT32, to_extinfo->document_id, - FSE_ARG_DONE); - - hfs_unlockpair(cp, to_cp); // unlock this so we can send the fsevents - need_unlock = 0; - - if (need_fsevent(FSE_STAT_CHANGED, vp)) { - add_fsevent(FSE_STAT_CHANGED, context, FSE_ARG_VNODE, vp, FSE_ARG_DONE); - } - if (need_fsevent(FSE_STAT_CHANGED, to_vp)) { - add_fsevent(FSE_STAT_CHANGED, context, FSE_ARG_VNODE, to_vp, FSE_ARG_DONE); - } -#else - 
hfs_unlockpair(cp, to_cp); // unlock this so we can send the fsevents - need_unlock = 0; -#endif - } - - if (need_unlock) { - hfs_unlockpair(cp, to_cp); - } - - transfer_cleanup: - vnode_put(to_vp); - file_drop(to_fd); - - return error; - } - - - - case HFS_PREV_LINK: - case HFS_NEXT_LINK: - { - cnid_t linkfileid; - cnid_t nextlinkid; - cnid_t prevlinkid; - int error; - - /* Caller must be owner of file system. */ - vfsp = vfs_statfs(HFSTOVFS(hfsmp)); - if (suser(cred, NULL) && - kauth_cred_getuid(cred) != vfsp->f_owner) { - return (EACCES); - } - /* Target vnode must be file system's root. */ - if (!vnode_isvroot(vp)) { - return (EINVAL); - } - linkfileid = *(cnid_t *)ap->a_data; - if (linkfileid < kHFSFirstUserCatalogNodeID) { - return (EINVAL); - } - if ((error = hfs_lookup_siblinglinks(hfsmp, linkfileid, &prevlinkid, &nextlinkid))) { - return (error); - } - if (ap->a_command == HFS_NEXT_LINK) { - *(cnid_t *)ap->a_data = nextlinkid; - } else { - *(cnid_t *)ap->a_data = prevlinkid; - } - return (0); - } - - case HFS_RESIZE_PROGRESS: { - - vfsp = vfs_statfs(HFSTOVFS(hfsmp)); - if (suser(cred, NULL) && - kauth_cred_getuid(cred) != vfsp->f_owner) { - return (EACCES); /* must be owner of file system */ - } - if (!vnode_isvroot(vp)) { - return (EINVAL); - } - /* file system must not be mounted read-only */ - if (hfsmp->hfs_flags & HFS_READ_ONLY) { - return (EROFS); - } - - return hfs_resize_progress(hfsmp, (u_int32_t *)ap->a_data); - } - - case HFS_RESIZE_VOLUME: { - u_int64_t newsize; - u_int64_t cursize; - int ret; - - vfsp = vfs_statfs(HFSTOVFS(hfsmp)); - if (suser(cred, NULL) && - kauth_cred_getuid(cred) != vfsp->f_owner) { - return (EACCES); /* must be owner of file system */ - } - if (!vnode_isvroot(vp)) { - return (EINVAL); - } - - /* filesystem must not be mounted read only */ - if (hfsmp->hfs_flags & HFS_READ_ONLY) { - return (EROFS); - } - newsize = *(u_int64_t *)ap->a_data; - cursize = (u_int64_t)hfsmp->totalBlocks * (u_int64_t)hfsmp->blockSize; - - if 
(newsize == cursize) { - return (0); - } - IOBSDMountChange(hfsmp->hfs_mp, kIOMountChangeWillResize); - if (newsize > cursize) { - ret = hfs_extendfs(hfsmp, *(u_int64_t *)ap->a_data, context); - } else { - ret = hfs_truncatefs(hfsmp, *(u_int64_t *)ap->a_data, context); - } - IOBSDMountChange(hfsmp->hfs_mp, kIOMountChangeDidResize); - return (ret); - } - case HFS_CHANGE_NEXT_ALLOCATION: { - int error = 0; /* Assume success */ - u_int32_t location; - - if (vnode_vfsisrdonly(vp)) { - return (EROFS); - } - vfsp = vfs_statfs(HFSTOVFS(hfsmp)); - if (suser(cred, NULL) && - kauth_cred_getuid(cred) != vfsp->f_owner) { - return (EACCES); /* must be owner of file system */ - } - if (!vnode_isvroot(vp)) { - return (EINVAL); - } - hfs_lock_mount(hfsmp); - location = *(u_int32_t *)ap->a_data; - if ((location >= hfsmp->allocLimit) && - (location != HFS_NO_UPDATE_NEXT_ALLOCATION)) { - error = EINVAL; - goto fail_change_next_allocation; - } - /* Return previous value. */ - *(u_int32_t *)ap->a_data = hfsmp->nextAllocation; - if (location == HFS_NO_UPDATE_NEXT_ALLOCATION) { - /* On magic value for location, set nextAllocation to next block - * after metadata zone and set flag in mount structure to indicate - * that nextAllocation should not be updated again. 
- */ - if (hfsmp->hfs_metazone_end != 0) { - HFS_UPDATE_NEXT_ALLOCATION(hfsmp, hfsmp->hfs_metazone_end + 1); - } - hfsmp->hfs_flags |= HFS_SKIP_UPDATE_NEXT_ALLOCATION; - } else { - hfsmp->hfs_flags &= ~HFS_SKIP_UPDATE_NEXT_ALLOCATION; - HFS_UPDATE_NEXT_ALLOCATION(hfsmp, location); - } - MarkVCBDirty(hfsmp); -fail_change_next_allocation: - hfs_unlock_mount(hfsmp); - return (error); - } - -#if HFS_SPARSE_DEV - case HFS_SETBACKINGSTOREINFO: { - struct vnode * bsfs_rootvp; - struct vnode * di_vp; - struct hfs_backingstoreinfo *bsdata; - int error = 0; - - if (hfsmp->hfs_flags & HFS_READ_ONLY) { - return (EROFS); - } - if (hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) { - return (EALREADY); - } - vfsp = vfs_statfs(HFSTOVFS(hfsmp)); - if (suser(cred, NULL) && - kauth_cred_getuid(cred) != vfsp->f_owner) { - return (EACCES); /* must be owner of file system */ - } - bsdata = (struct hfs_backingstoreinfo *)ap->a_data; - if (bsdata == NULL) { - return (EINVAL); - } - if ((error = file_vnode(bsdata->backingfd, &di_vp))) { - return (error); - } - if ((error = vnode_getwithref(di_vp))) { - file_drop(bsdata->backingfd); - return(error); - } - - if (vnode_mount(vp) == vnode_mount(di_vp)) { - (void)vnode_put(di_vp); - file_drop(bsdata->backingfd); - return (EINVAL); - } - - /* - * Obtain the backing fs root vnode and keep a reference - * on it. This reference will be dropped in hfs_unmount. - */ - error = VFS_ROOT(vnode_mount(di_vp), &bsfs_rootvp, NULL); /* XXX use context! 
*/ - if (error) { - (void)vnode_put(di_vp); - file_drop(bsdata->backingfd); - return (error); - } - vnode_ref(bsfs_rootvp); - vnode_put(bsfs_rootvp); - - hfs_lock_mount(hfsmp); - hfsmp->hfs_backingfs_rootvp = bsfs_rootvp; - hfsmp->hfs_flags |= HFS_HAS_SPARSE_DEVICE; - hfsmp->hfs_sparsebandblks = bsdata->bandsize / hfsmp->blockSize * 4; - hfs_unlock_mount(hfsmp); - - /* We check the MNTK_VIRTUALDEV bit instead of marking the dependent process */ - - /* - * If the sparse image is on a sparse image file (as opposed to a sparse - * bundle), then we may need to limit the free space to the maximum size - * of a file on that volume. So we query (using pathconf), and if we get - * a meaningful result, we cache the number of blocks for later use in - * hfs_freeblks(). - */ - hfsmp->hfs_backingfs_maxblocks = 0; - if (vnode_vtype(di_vp) == VREG) { - int terr; - int hostbits; - terr = vn_pathconf(di_vp, _PC_FILESIZEBITS, &hostbits, context); - if (terr == 0 && hostbits != 0 && hostbits < 64) { - u_int64_t hostfilesizemax = ((u_int64_t)1) << hostbits; - - hfsmp->hfs_backingfs_maxblocks = hostfilesizemax / hfsmp->blockSize; - } - } - - /* The free extent cache is managed differently for sparse devices. - * There is a window between which the volume is mounted and the - * device is marked as sparse, so the free extent cache for this - * volume is currently initialized as normal volume (sorted by block - * count). Reset the cache so that it will be rebuilt again - * for sparse device (sorted by start block). 
- */ - ResetVCBFreeExtCache(hfsmp); - - (void)vnode_put(di_vp); - file_drop(bsdata->backingfd); - return (0); - } - case HFS_CLRBACKINGSTOREINFO: { - struct vnode * tmpvp; - - vfsp = vfs_statfs(HFSTOVFS(hfsmp)); - if (suser(cred, NULL) && - kauth_cred_getuid(cred) != vfsp->f_owner) { - return (EACCES); /* must be owner of file system */ - } - if (hfsmp->hfs_flags & HFS_READ_ONLY) { - return (EROFS); - } - - if ((hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) && - hfsmp->hfs_backingfs_rootvp) { - - hfs_lock_mount(hfsmp); - hfsmp->hfs_flags &= ~HFS_HAS_SPARSE_DEVICE; - tmpvp = hfsmp->hfs_backingfs_rootvp; - hfsmp->hfs_backingfs_rootvp = NULLVP; - hfsmp->hfs_sparsebandblks = 0; - hfs_unlock_mount(hfsmp); - - vnode_rele(tmpvp); - } - return (0); - } -#endif /* HFS_SPARSE_DEV */ - - /* Change the next CNID stored in the VH */ - case HFS_CHANGE_NEXTCNID: { - int error = 0; /* Assume success */ - u_int32_t fileid; - int wraparound = 0; - int lockflags = 0; - - if (vnode_vfsisrdonly(vp)) { - return (EROFS); - } - vfsp = vfs_statfs(HFSTOVFS(hfsmp)); - if (suser(cred, NULL) && - kauth_cred_getuid(cred) != vfsp->f_owner) { - return (EACCES); /* must be owner of file system */ - } - - fileid = *(u_int32_t *)ap->a_data; - - /* Must have catalog lock excl. to advance the CNID pointer */ - lockflags = hfs_systemfile_lock (hfsmp, SFL_CATALOG , HFS_EXCLUSIVE_LOCK); - - hfs_lock_mount(hfsmp); - - /* If it is less than the current next CNID, force the wraparound bit to be set */ - if (fileid < hfsmp->vcbNxtCNID) { - wraparound=1; - } - - /* Return previous value. 
*/ - *(u_int32_t *)ap->a_data = hfsmp->vcbNxtCNID; - - hfsmp->vcbNxtCNID = fileid; - - if (wraparound) { - hfsmp->vcbAtrb |= kHFSCatalogNodeIDsReusedMask; - } - - MarkVCBDirty(hfsmp); - hfs_unlock_mount(hfsmp); - hfs_systemfile_unlock (hfsmp, lockflags); - - return (error); - } - - case F_FREEZE_FS: { - struct mount *mp; - - mp = vnode_mount(vp); - hfsmp = VFSTOHFS(mp); - - if (!(hfsmp->jnl)) - return (ENOTSUP); - - vfsp = vfs_statfs(mp); - - if (kauth_cred_getuid(cred) != vfsp->f_owner && - !kauth_cred_issuser(cred)) - return (EACCES); - - return hfs_freeze(hfsmp); - } - - case F_THAW_FS: { - vfsp = vfs_statfs(vnode_mount(vp)); - if (kauth_cred_getuid(cred) != vfsp->f_owner && - !kauth_cred_issuser(cred)) - return (EACCES); - - return hfs_thaw(hfsmp, current_proc()); - } - - case HFS_EXT_BULKACCESS_FSCTL: { - int size; - - if (hfsmp->hfs_flags & HFS_STANDARD) { - return EINVAL; - } - - if (is64bit) { - size = sizeof(struct user64_ext_access_t); - } else { - size = sizeof(struct user32_ext_access_t); - } - - return do_bulk_access_check(hfsmp, vp, ap, size, context); - } - - case HFS_SET_XATTREXTENTS_STATE: { - int state; - - if (ap->a_data == NULL) { - return (EINVAL); - } - - state = *(int *)ap->a_data; - - if (hfsmp->hfs_flags & HFS_READ_ONLY) { - return (EROFS); - } - - /* Super-user can enable or disable extent-based extended - * attribute support on a volume - * Note: Starting Mac OS X 10.7, extent-based extended attributes - * are enabled by default, so any change will be transient only - * till the volume is remounted. - */ - if (!kauth_cred_issuser(kauth_cred_get())) { - return (EPERM); - } - if (state == 0 || state == 1) - return hfs_set_volxattr(hfsmp, HFS_SET_XATTREXTENTS_STATE, state); - else - return (EINVAL); - } - - case F_SETSTATICCONTENT: { - int error; - int enable_static = 0; - struct cnode *cp = NULL; - /* - * lock the cnode, decorate the cnode flag, and bail out. - * VFS should have already authenticated the caller for us. 
- */ - - if (ap->a_data) { - /* - * Note that even though ap->a_data is of type caddr_t, - * the fcntl layer at the syscall handler will pass in NULL - * or 1 depending on what the argument supplied to the fcntl - * was. So it is in fact correct to check the ap->a_data - * argument for zero or non-zero value when deciding whether or not - * to enable the static bit in the cnode. - */ - enable_static = 1; - } - if (hfsmp->hfs_flags & HFS_READ_ONLY) { - return EROFS; - } - cp = VTOC(vp); - - error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); - if (error == 0) { - if (enable_static) { - cp->c_flag |= C_SSD_STATIC; - } - else { - cp->c_flag &= ~C_SSD_STATIC; - } - hfs_unlock (cp); - } - return error; - } - - case F_SET_GREEDY_MODE: { - int error; - int enable_greedy_mode = 0; - struct cnode *cp = NULL; - /* - * lock the cnode, decorate the cnode flag, and bail out. - * VFS should have already authenticated the caller for us. - */ - - if (ap->a_data) { - /* - * Note that even though ap->a_data is of type caddr_t, - * the fcntl layer at the syscall handler will pass in NULL - * or 1 depending on what the argument supplied to the fcntl - * was. So it is in fact correct to check the ap->a_data - * argument for zero or non-zero value when deciding whether or not - * to enable the greedy mode bit in the cnode. - */ - enable_greedy_mode = 1; - } - if (hfsmp->hfs_flags & HFS_READ_ONLY) { - return EROFS; - } - cp = VTOC(vp); - - error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); - if (error == 0) { - if (enable_greedy_mode) { - cp->c_flag |= C_SSD_GREEDY_MODE; - } - else { - cp->c_flag &= ~C_SSD_GREEDY_MODE; - } - hfs_unlock (cp); - } - return error; - } - - case F_SETIOTYPE: { - int error; - uint32_t iotypeflag = 0; - - struct cnode *cp = NULL; - /* - * lock the cnode, decorate the cnode flag, and bail out. - * VFS should have already authenticated the caller for us. 
- */ - - if (ap->a_data == NULL) { - return EINVAL; - } - - /* - * Note that even though ap->a_data is of type caddr_t, we - * can only use 32 bits of flag values. - */ - iotypeflag = (uint32_t) ap->a_data; - switch (iotypeflag) { - case F_IOTYPE_ISOCHRONOUS: - break; - default: - return EINVAL; - } - - - if (hfsmp->hfs_flags & HFS_READ_ONLY) { - return EROFS; - } - cp = VTOC(vp); - - error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); - if (error == 0) { - switch (iotypeflag) { - case F_IOTYPE_ISOCHRONOUS: - cp->c_flag |= C_IO_ISOCHRONOUS; - break; - default: - break; - } - hfs_unlock (cp); - } - return error; - } - - case F_MAKECOMPRESSED: { - int error = 0; - uint32_t gen_counter; - struct cnode *cp = NULL; - int reset_decmp = 0; - - if (hfsmp->hfs_flags & HFS_READ_ONLY) { - return EROFS; - } - - /* - * acquire & lock the cnode. - * VFS should have already authenticated the caller for us. - */ - - if (ap->a_data) { - /* - * Cast the pointer into a uint32_t so we can extract the - * supplied generation counter. - */ - gen_counter = *((uint32_t*)ap->a_data); - } - else { - return EINVAL; - } - -#if HFS_COMPRESSION - cp = VTOC(vp); - /* Grab truncate lock first; we may truncate the file */ - hfs_lock_truncate (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); - - error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); - if (error) { - hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); - return error; - } - - /* Are there any other usecounts/FDs? */ - if (vnode_isinuse(vp, 1)) { - hfs_unlock(cp); - hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); - return EBUSY; - } - - /* now we have the cnode locked down; Validate arguments */ - if (cp->c_attr.ca_flags & (UF_IMMUTABLE | UF_COMPRESSED)) { - /* EINVAL if you are trying to manipulate an IMMUTABLE file */ - hfs_unlock(cp); - hfs_unlock_truncate (cp, HFS_LOCK_DEFAULT); - return EINVAL; - } - - if ((hfs_get_gencount (cp)) == gen_counter) { - /* - * OK, the gen_counter matched. 
Go for it: - * Toggle state bits, truncate file, and suppress mtime update - */ - reset_decmp = 1; - cp->c_bsdflags |= UF_COMPRESSED; - - error = hfs_truncate(vp, 0, IO_NDELAY, HFS_TRUNCATE_SKIPTIMES, - ap->a_context); - } - else { - error = ESTALE; - } - - /* Unlock cnode before executing decmpfs ; they may need to get an EA */ - hfs_unlock(cp); - - /* - * Reset the decmp state while still holding the truncate lock. We need to - * serialize here against a listxattr on this node which may occur at any - * time. - * - * Even if '0/skiplock' is passed in 2nd argument to hfs_file_is_compressed, - * that will still potentially require getting the com.apple.decmpfs EA. If the - * EA is required, then we can't hold the cnode lock, because the getxattr call is - * generic(through VFS), and can't pass along any info telling it that we're already - * holding it (the lock). If we don't serialize, then we risk listxattr stopping - * and trying to fill in the hfs_file_is_compressed info during the callback - * operation, which will result in deadlock against the b-tree node. - * - * So, to serialize against listxattr (which will grab buf_t meta references on - * the b-tree blocks), we hold the truncate lock as we're manipulating the - * decmpfs payload. 
- */ - if ((reset_decmp) && (error == 0)) { - decmpfs_cnode *dp = VTOCMP (vp); - if (dp != NULL) { - decmpfs_cnode_set_vnode_state(dp, FILE_TYPE_UNKNOWN, 0); - } - - /* Initialize the decmpfs node as needed */ - (void) hfs_file_is_compressed (cp, 0); /* ok to take lock */ - } - - hfs_unlock_truncate (cp, HFS_LOCK_DEFAULT); - -#endif - return error; - } - - case F_SETBACKINGSTORE: { - - int error = 0; - - /* - * See comment in F_SETSTATICCONTENT re: using - * a null check for a_data - */ - if (ap->a_data) { - error = hfs_set_backingstore (vp, 1); - } - else { - error = hfs_set_backingstore (vp, 0); - } - - return error; - } - - case F_GETPATH_MTMINFO: { - int error = 0; - - int *data = (int*) ap->a_data; - - /* Ask if this is a backingstore vnode */ - error = hfs_is_backingstore (vp, data); - - return error; - } - - case F_FULLFSYNC: { - int error; - - if (hfsmp->hfs_flags & HFS_READ_ONLY) { - return (EROFS); - } - error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); - if (error == 0) { - error = hfs_fsync(vp, MNT_WAIT, HFS_FSYNC_FULL, p); - hfs_unlock(VTOC(vp)); - } - - return error; - } - - case F_BARRIERFSYNC: { - int error; - - if (hfsmp->hfs_flags & HFS_READ_ONLY) { - return (EROFS); - } - error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); - if (error == 0) { - error = hfs_fsync(vp, MNT_WAIT, HFS_FSYNC_BARRIER, p); - hfs_unlock(VTOC(vp)); - } - - return error; - } - - case F_CHKCLEAN: { - register struct cnode *cp; - int error; - - if (!vnode_isreg(vp)) - return EINVAL; - - error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); - if (error == 0) { - cp = VTOC(vp); - /* - * used by regression test to determine if - * all the dirty pages (via write) have been cleaned - * after a call to 'fsysnc'. 
- */ - error = is_file_clean(vp, VTOF(vp)->ff_size); - hfs_unlock(cp); - } - return (error); - } - - case F_RDADVISE: { - register struct radvisory *ra; - struct filefork *fp; - int error; - - if (!vnode_isreg(vp)) - return EINVAL; - - ra = (struct radvisory *)(ap->a_data); - fp = VTOF(vp); - - /* Protect against a size change. */ - hfs_lock_truncate(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); - -#if HFS_COMPRESSION - if (compressed && (uncompressed_size == -1)) { - /* fetching the uncompressed size failed above, so return the error */ - error = decmpfs_error; - } else if ((compressed && (ra->ra_offset >= uncompressed_size)) || - (!compressed && (ra->ra_offset >= fp->ff_size))) { - error = EFBIG; - } -#else /* HFS_COMPRESSION */ - if (ra->ra_offset >= fp->ff_size) { - error = EFBIG; - } -#endif /* HFS_COMPRESSION */ - else { - error = advisory_read(vp, fp->ff_size, ra->ra_offset, ra->ra_count); - } - - hfs_unlock_truncate(VTOC(vp), HFS_LOCK_DEFAULT); - return (error); - } - - case _IOC(IOC_OUT,'h', 4, 0): /* Create date in local time */ - { - if (is64bit) { - *(user_time_t *)(ap->a_data) = (user_time_t) (to_bsd_time(VTOVCB(vp)->localCreateDate)); - } - else { - *(user32_time_t *)(ap->a_data) = (user32_time_t) (to_bsd_time(VTOVCB(vp)->localCreateDate)); - } - return 0; - } - - case SPOTLIGHT_FSCTL_GET_MOUNT_TIME: - *(uint32_t *)ap->a_data = hfsmp->hfs_mount_time; - break; - - case SPOTLIGHT_FSCTL_GET_LAST_MTIME: - *(uint32_t *)ap->a_data = hfsmp->hfs_last_mounted_mtime; - break; - - case HFS_FSCTL_GET_VERY_LOW_DISK: - *(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_dangerlimit; - break; - - case HFS_FSCTL_SET_VERY_LOW_DISK: - if (*(uint32_t *)ap->a_data >= hfsmp->hfs_freespace_notify_warninglimit) { - return EINVAL; - } - - hfsmp->hfs_freespace_notify_dangerlimit = *(uint32_t *)ap->a_data; - break; - - case HFS_FSCTL_GET_LOW_DISK: - *(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_warninglimit; - break; - - case HFS_FSCTL_SET_LOW_DISK: - if ( 
*(uint32_t *)ap->a_data >= hfsmp->hfs_freespace_notify_desiredlevel - || *(uint32_t *)ap->a_data <= hfsmp->hfs_freespace_notify_dangerlimit) { - - return EINVAL; - } - - hfsmp->hfs_freespace_notify_warninglimit = *(uint32_t *)ap->a_data; - break; - - case HFS_FSCTL_GET_DESIRED_DISK: - *(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_desiredlevel; - break; - - case HFS_FSCTL_SET_DESIRED_DISK: - if (*(uint32_t *)ap->a_data <= hfsmp->hfs_freespace_notify_warninglimit) { - return EINVAL; - } - - hfsmp->hfs_freespace_notify_desiredlevel = *(uint32_t *)ap->a_data; - break; - - case HFS_VOLUME_STATUS: - *(uint32_t *)ap->a_data = hfsmp->hfs_notification_conditions; - break; - - case HFS_SET_BOOT_INFO: - if (!vnode_isvroot(vp)) - return(EINVAL); - if (!kauth_cred_issuser(cred) && (kauth_cred_getuid(cred) != vfs_statfs(HFSTOVFS(hfsmp))->f_owner)) - return(EACCES); /* must be superuser or owner of filesystem */ - if (hfsmp->hfs_flags & HFS_READ_ONLY) { - return (EROFS); - } - hfs_lock_mount (hfsmp); - bcopy(ap->a_data, &hfsmp->vcbFndrInfo, sizeof(hfsmp->vcbFndrInfo)); - hfs_unlock_mount (hfsmp); - (void) hfs_flushvolumeheader(hfsmp, HFS_FVH_WAIT); - break; - - case HFS_GET_BOOT_INFO: - if (!vnode_isvroot(vp)) - return(EINVAL); - hfs_lock_mount (hfsmp); - bcopy(&hfsmp->vcbFndrInfo, ap->a_data, sizeof(hfsmp->vcbFndrInfo)); - hfs_unlock_mount(hfsmp); - break; - - case HFS_MARK_BOOT_CORRUPT: - /* Mark the boot volume corrupt by setting - * kHFSVolumeInconsistentBit in the volume header. This will - * force fsck_hfs on next mount. 
- */ - if (!kauth_cred_issuser(kauth_cred_get())) { - return EACCES; - } - - /* Allowed only on the root vnode of the boot volume */ - if (!(vfs_flags(HFSTOVFS(hfsmp)) & MNT_ROOTFS) || - !vnode_isvroot(vp)) { - return EINVAL; - } - if (hfsmp->hfs_flags & HFS_READ_ONLY) { - return (EROFS); - } - printf ("hfs_vnop_ioctl: Marking the boot volume corrupt.\n"); - hfs_mark_inconsistent(hfsmp, HFS_FSCK_FORCED); - break; - - case HFS_FSCTL_GET_JOURNAL_INFO: - jip = (struct hfs_journal_info*)ap->a_data; - - if (vp == NULLVP) - return EINVAL; - - if (hfsmp->jnl == NULL) { - jnl_start = 0; - jnl_size = 0; - } else { - jnl_start = hfs_blk_to_bytes(hfsmp->jnl_start, hfsmp->blockSize) + hfsmp->hfsPlusIOPosOffset; - jnl_size = hfsmp->jnl_size; - } - - jip->jstart = jnl_start; - jip->jsize = jnl_size; - break; - - case HFS_SET_ALWAYS_ZEROFILL: { - struct cnode *cp = VTOC(vp); - - if (*(int *)ap->a_data) { - cp->c_flag |= C_ALWAYS_ZEROFILL; - } else { - cp->c_flag &= ~C_ALWAYS_ZEROFILL; - } - break; - } - - case HFS_DISABLE_METAZONE: { - /* Only root can disable metadata zone */ - if (!kauth_cred_issuser(kauth_cred_get())) { - return EACCES; - } - if (hfsmp->hfs_flags & HFS_READ_ONLY) { - return (EROFS); - } - - /* Disable metadata zone now */ - (void) hfs_metadatazone_init(hfsmp, true); - printf ("hfs: Disabling metadata zone on %s\n", hfsmp->vcbVN); - break; - } - - - case HFS_FSINFO_METADATA_BLOCKS: { - int error; - struct hfsinfo_metadata *hinfo; - - hinfo = (struct hfsinfo_metadata *)ap->a_data; - - /* Get information about number of metadata blocks */ - error = hfs_getinfo_metadata_blocks(hfsmp, hinfo); - if (error) { - return error; - } - - break; - } - - case HFS_GET_FSINFO: { - hfs_fsinfo *fsinfo = (hfs_fsinfo *)ap->a_data; - - /* Only root is allowed to get fsinfo */ - if (!kauth_cred_issuser(kauth_cred_get())) { - return EACCES; - } - - /* - * Make sure that the caller's version number matches with - * the kernel's version number. 
This will make sure that - * if the structures being read/written into are changed - * by the kernel, the caller will not read incorrect data. - * - * The first three fields --- request_type, version and - * flags are same for all the hfs_fsinfo structures, so - * we can access the version number by assuming any - * structure for now. - */ - if (fsinfo->header.version != HFS_FSINFO_VERSION) { - return ENOTSUP; - } - - /* Make sure that the current file system is not marked inconsistent */ - if (hfsmp->vcbAtrb & kHFSVolumeInconsistentMask) { - return EIO; - } - - return hfs_get_fsinfo(hfsmp, ap->a_data); - } - - case HFS_CS_FREESPACE_TRIM: { - int error = 0; - int lockflags = 0; - - /* Only root allowed */ - if (!kauth_cred_issuser(kauth_cred_get())) { - return EACCES; - } - - /* - * This core functionality is similar to hfs_scan_blocks(). - * The main difference is that hfs_scan_blocks() is called - * as part of mount where we are assured that the journal is - * empty to start with. This fcntl() can be called on a - * mounted volume, therefore it has to flush the content of - * the journal as well as ensure the state of summary table. - * - * This fcntl scans over the entire allocation bitmap, - * creates list of all the free blocks, and issues TRIM - * down to the underlying device. This can take long time - * as it can generate up to 512MB of read I/O. - */ - - if ((hfsmp->hfs_flags & HFS_SUMMARY_TABLE) == 0) { - error = hfs_init_summary(hfsmp); - if (error) { - printf("hfs: fsctl() could not initialize summary table for %s\n", hfsmp->vcbVN); - return error; - } - } - - /* - * The journal maintains list of recently deallocated blocks to - * issue DKIOCUNMAPs when the corresponding journal transaction is - * flushed to the disk. To avoid any race conditions, we only - * want one active trim list and only one thread issuing DKIOCUNMAPs. - * Therefore we make sure that the journal trim list is sync'ed, - * empty, and not modifiable for the duration of our scan. 
- * - * Take the journal lock before flushing the journal to the disk. - * We will keep on holding the journal lock till we don't get the - * bitmap lock to make sure that no new journal transactions can - * start. This will make sure that the journal trim list is not - * modified after the journal flush and before getting bitmap lock. - * We can release the journal lock after we acquire the bitmap - * lock as it will prevent any further block deallocations. - */ - hfs_journal_lock(hfsmp); - - /* Flush the journal and wait for all I/Os to finish up */ - error = hfs_flush(hfsmp, HFS_FLUSH_JOURNAL_META); - if (error) { - hfs_journal_unlock(hfsmp); - return error; - } - - /* Take bitmap lock to ensure it is not being modified */ - lockflags = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_EXCLUSIVE_LOCK); - - /* Release the journal lock */ - hfs_journal_unlock(hfsmp); - - /* - * ScanUnmapBlocks reads the bitmap in large block size - * (up to 1MB) unlike the runtime which reads the bitmap - * in the 4K block size. This can cause buf_t collisions - * and potential data corruption. To avoid this, we - * invalidate all the existing buffers associated with - * the bitmap vnode before scanning it. - * - * Note: ScanUnmapBlock() cleans up all the buffers - * after itself, so there won't be any large buffers left - * for us to clean up after it returns. 
- */ - error = buf_invalidateblks(hfsmp->hfs_allocation_vp, 0, 0, 0); - if (error) { - hfs_systemfile_unlock(hfsmp, lockflags); - return error; - } - - /* Traverse bitmap and issue DKIOCUNMAPs */ - error = ScanUnmapBlocks(hfsmp); - hfs_systemfile_unlock(hfsmp, lockflags); - if (error) { - return error; - } - - break; - } - - case HFS_SET_HOTFILE_STATE: { - int error; - struct cnode *cp = VTOC(vp); - uint32_t hf_state = *((uint32_t*)ap->a_data); - uint32_t num_unpinned = 0; - - error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); - if (error) { - return error; - } - - // printf("hfs: setting hotfile state %d on %s\n", hf_state, vp->v_name); - if (hf_state == HFS_MARK_FASTDEVCANDIDATE) { - vnode_setfastdevicecandidate(vp); - - cp->c_attr.ca_recflags |= kHFSFastDevCandidateMask; - cp->c_attr.ca_recflags &= ~kHFSDoNotFastDevPinMask; - cp->c_flag |= C_MODIFIED; - } else if (hf_state == HFS_UNMARK_FASTDEVCANDIDATE || hf_state == HFS_NEVER_FASTDEVCANDIDATE) { - vnode_clearfastdevicecandidate(vp); - hfs_removehotfile(vp); - - if (cp->c_attr.ca_recflags & kHFSFastDevPinnedMask) { - hfs_pin_vnode(hfsmp, vp, HFS_UNPIN_IT, &num_unpinned, ap->a_context); - } - - if (hf_state == HFS_NEVER_FASTDEVCANDIDATE) { - cp->c_attr.ca_recflags |= kHFSDoNotFastDevPinMask; - } - cp->c_attr.ca_recflags &= ~(kHFSFastDevCandidateMask|kHFSFastDevPinnedMask); - cp->c_flag |= C_MODIFIED; - - } else { - error = EINVAL; - } - - if (num_unpinned != 0) { - lck_mtx_lock(&hfsmp->hfc_mutex); - hfsmp->hfs_hotfile_freeblks += num_unpinned; - lck_mtx_unlock(&hfsmp->hfc_mutex); - } - - hfs_unlock(cp); - return error; - break; - } - - case HFS_REPIN_HOTFILE_STATE: { - int error=0; - uint32_t repin_what = *((uint32_t*)ap->a_data); - - /* Only root allowed */ - if (!kauth_cred_issuser(kauth_cred_get())) { - return EACCES; - } - - if (!(hfsmp->hfs_flags & (HFS_CS_METADATA_PIN | HFS_CS_HOTFILE_PIN))) { - // this system is neither regular Fusion or Cooperative Fusion - // so this fsctl makes no sense. 
- return EINVAL; - } - - // - // After a converting a CoreStorage volume to be encrypted, the - // extents could have moved around underneath us. This call - // allows corestoraged to re-pin everything that should be - // pinned (it would happen on the next reboot too but that could - // be a long time away). - // - if ((repin_what & HFS_REPIN_METADATA) && (hfsmp->hfs_flags & HFS_CS_METADATA_PIN)) { - hfs_pin_fs_metadata(hfsmp); - } - if ((repin_what & HFS_REPIN_USERDATA) && (hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN)) { - hfs_repin_hotfiles(hfsmp); - } - if ((repin_what & HFS_REPIN_USERDATA) && (hfsmp->hfs_flags & HFS_CS_SWAPFILE_PIN)) { - //XXX Swapfiles (marked SWAP_PINNED) may have moved too. - //XXX Do we care? They have a more transient/dynamic nature/lifetime. - } - - return error; - break; - } - - - default: - return (ENOTTY); - } - - return 0; -} - -/* - * select - */ -int -hfs_vnop_select(__unused struct vnop_select_args *ap) -/* - struct vnop_select_args { - vnode_t a_vp; - int a_which; - int a_fflags; - void *a_wql; - vfs_context_t a_context; - }; -*/ -{ - /* - * We should really check to see if I/O is possible. - */ - return (1); -} - -/* - * Converts a logical block number to a physical block, and optionally returns - * the amount of remaining blocks in a run. The logical block is based on hfsNode.logBlockSize. - * The physical block number is based on the device block size, currently its 512. - * The block run is returned in logical blocks, and is the REMAINING amount of blocks - */ -int -hfs_bmap(struct vnode *vp, daddr_t bn, struct vnode **vpp, daddr64_t *bnp, unsigned int *runp) -{ - struct filefork *fp = VTOF(vp); - struct hfsmount *hfsmp = VTOHFS(vp); - int retval = E_NONE; - u_int32_t logBlockSize; - size_t bytesContAvail = 0; - off_t blockposition; - int lockExtBtree; - int lockflags = 0; - - /* - * Check for underlying vnode requests and ensure that logical - * to physical mapping is requested. 
- */ - if (vpp != NULL) - *vpp = hfsmp->hfs_devvp; - if (bnp == NULL) - return (0); - - logBlockSize = GetLogicalBlockSize(vp); - blockposition = (off_t)bn * logBlockSize; - - lockExtBtree = overflow_extents(fp); - - if (lockExtBtree) - lockflags = hfs_systemfile_lock(hfsmp, SFL_EXTENTS, HFS_EXCLUSIVE_LOCK); - - retval = MacToVFSError( - MapFileBlockC (HFSTOVCB(hfsmp), - (FCB*)fp, - MAXPHYSIO, - blockposition, - bnp, - &bytesContAvail)); - - if (lockExtBtree) - hfs_systemfile_unlock(hfsmp, lockflags); - - if (retval == E_NONE) { - /* Figure out how many read ahead blocks there are */ - if (runp != NULL) { - if (can_cluster(logBlockSize)) { - /* Make sure this result never goes negative: */ - *runp = (bytesContAvail < logBlockSize) ? 0 : (bytesContAvail / logBlockSize) - 1; - } else { - *runp = 0; - } - } - } - return (retval); -} - -/* - * Convert logical block number to file offset. - */ -int -hfs_vnop_blktooff(struct vnop_blktooff_args *ap) -/* - struct vnop_blktooff_args { - vnode_t a_vp; - daddr64_t a_lblkno; - off_t *a_offset; - }; -*/ -{ - if (ap->a_vp == NULL) - return (EINVAL); - *ap->a_offset = (off_t)ap->a_lblkno * (off_t)GetLogicalBlockSize(ap->a_vp); - - return(0); -} - -/* - * Convert file offset to logical block number. - */ -int -hfs_vnop_offtoblk(struct vnop_offtoblk_args *ap) -/* - struct vnop_offtoblk_args { - vnode_t a_vp; - off_t a_offset; - daddr64_t *a_lblkno; - }; -*/ -{ - if (ap->a_vp == NULL) - return (EINVAL); - *ap->a_lblkno = (daddr64_t)(ap->a_offset / (off_t)GetLogicalBlockSize(ap->a_vp)); - - return(0); -} - -/* - * Map file offset to physical block number. - * - * If this function is called for write operation, and if the file - * had virtual blocks allocated (delayed allocation), real blocks - * are allocated by calling ExtendFileC(). 
- * - * If this function is called for read operation, and if the file - * had virtual blocks allocated (delayed allocation), no change - * to the size of file is done, and if required, rangelist is - * searched for mapping. - * - * System file cnodes are expected to be locked (shared or exclusive). - * - * -- INVALID RANGES -- - * - * Invalid ranges are used to keep track of where we have extended a - * file, but have not yet written that data to disk. In the past we - * would clear up the invalid ranges as we wrote to those areas, but - * before data was actually flushed to disk. The problem with that - * approach is that the data can be left in the cache and is therefore - * still not valid on disk. So now we clear up the ranges here, when - * the flags field has VNODE_WRITE set, indicating a write is about to - * occur. This isn't ideal (ideally we want to clear them up when - * know the data has been successfully written), but it's the best we - * can do. - * - * For reads, we use the invalid ranges here in block map to indicate - * to the caller that the data should be zeroed (a_bpn == -1). We - * have to be careful about what ranges we return to the cluster code. - * Currently the cluster code can only handle non-rounded values for - * the EOF; it cannot handle funny sized ranges in the middle of the - * file (the main problem is that it sends down odd sized I/Os to the - * disk). Our code currently works because whilst the very first - * offset and the last offset in the invalid ranges are not aligned, - * gaps in the invalid ranges between the first and last, have to be - * aligned (because we always write page sized blocks). For example, - * consider this arrangement: - * - * +-------------+-----+-------+------+ - * | |XXXXX| |XXXXXX| - * +-------------+-----+-------+------+ - * a b c d - * - * This shows two invalid ranges and . Whilst a and d - * are not necessarily aligned, b and c *must* be. - * - * Zero-filling occurs in a number of ways: - * - * 1. 
When a read occurs and we return with a_bpn == -1. - * - * 2. When hfs_fsync or hfs_filedone calls hfs_flush_invalid_ranges - * which will cause us to iterate over the ranges bringing in - * pages that are not present in the cache and zeroing them. Any - * pages that are already in the cache are left untouched. Note - * that hfs_fsync does not always flush invalid ranges. - * - * 3. When we extend a file we zero out from the old EOF to the end - * of the page. It would be nice if we didn't have to do this if - * the page wasn't present (and could defer it), but because of - * the problem described above, we have to. - * - * The invalid ranges are also used to restrict the size that we write - * out on disk: see hfs_prepare_fork_for_update. - * - * Note that invalid ranges are ignored when neither the VNODE_READ or - * the VNODE_WRITE flag is specified. This is useful for the - * F_LOG2PHYS* fcntls which are not interested in invalid ranges: they - * just want to know whether blocks are physically allocated or not. 
- */ -int -hfs_vnop_blockmap(struct vnop_blockmap_args *ap) -/* - struct vnop_blockmap_args { - vnode_t a_vp; - off_t a_foffset; - size_t a_size; - daddr64_t *a_bpn; - size_t *a_run; - void *a_poff; - int a_flags; - vfs_context_t a_context; - }; -*/ -{ - struct vnode *vp = ap->a_vp; - struct cnode *cp; - struct filefork *fp; - struct hfsmount *hfsmp; - size_t bytesContAvail = ap->a_size; - int retval = E_NONE; - int syslocks = 0; - int lockflags = 0; - struct rl_entry *invalid_range; - enum rl_overlaptype overlaptype; - int started_tr = 0; - int tooklock = 0; - -#if HFS_COMPRESSION - if (VNODE_IS_RSRC(vp)) { - /* allow blockmaps to the resource fork */ - } else { - if ( hfs_file_is_compressed(VTOC(vp), 1) ) { /* 1 == don't take the cnode lock */ - int state = decmpfs_cnode_get_vnode_state(VTOCMP(vp)); - switch(state) { - case FILE_IS_COMPRESSED: - return ENOTSUP; - case FILE_IS_CONVERTING: - /* if FILE_IS_CONVERTING, we allow blockmap */ - break; - default: - printf("invalid state %d for compressed file\n", state); - /* fall through */ - } - } - } -#endif /* HFS_COMPRESSION */ - - /* Do not allow blockmap operation on a directory */ - if (vnode_isdir(vp)) { - return (ENOTSUP); - } - - /* - * Check for underlying vnode requests and ensure that logical - * to physical mapping is requested. 
- */ - if (ap->a_bpn == NULL) - return (0); - - hfsmp = VTOHFS(vp); - cp = VTOC(vp); - fp = VTOF(vp); - - if ( !vnode_issystem(vp) && !vnode_islnk(vp) && !vnode_isswap(vp)) { - if (cp->c_lockowner != current_thread()) { - hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); - tooklock = 1; - } - - // For reads, check the invalid ranges - if (ISSET(ap->a_flags, VNODE_READ)) { - if (ap->a_foffset >= fp->ff_size) { - retval = ERANGE; - goto exit; - } - - overlaptype = rl_scan(&fp->ff_invalidranges, ap->a_foffset, - ap->a_foffset + (off_t)bytesContAvail - 1, - &invalid_range); - switch(overlaptype) { - case RL_MATCHINGOVERLAP: - case RL_OVERLAPCONTAINSRANGE: - case RL_OVERLAPSTARTSBEFORE: - /* There's no valid block for this byte offset */ - *ap->a_bpn = (daddr64_t)-1; - /* There's no point limiting the amount to be returned - * if the invalid range that was hit extends all the way - * to the EOF (i.e. there's no valid bytes between the - * end of this range and the file's EOF): - */ - if (((off_t)fp->ff_size > (invalid_range->rl_end + 1)) && - ((size_t)(invalid_range->rl_end + 1 - ap->a_foffset) < bytesContAvail)) { - bytesContAvail = invalid_range->rl_end + 1 - ap->a_foffset; - } - - retval = 0; - goto exit; - - case RL_OVERLAPISCONTAINED: - case RL_OVERLAPENDSAFTER: - /* The range of interest hits an invalid block before the end: */ - if (invalid_range->rl_start == ap->a_foffset) { - /* There's actually no valid information to be had starting here: */ - *ap->a_bpn = (daddr64_t)-1; - if (((off_t)fp->ff_size > (invalid_range->rl_end + 1)) && - ((size_t)(invalid_range->rl_end + 1 - ap->a_foffset) < bytesContAvail)) { - bytesContAvail = invalid_range->rl_end + 1 - ap->a_foffset; - } - - retval = 0; - goto exit; - } else { - /* - * Sadly, the lower layers don't like us to - * return unaligned ranges, so we skip over - * any invalid ranges here that are less than - * a page: zeroing of those bits is not our - * responsibility (it's dealt with elsewhere). 
- */ - do { - off_t rounded_start = round_page_64(invalid_range->rl_start); - if ((off_t)bytesContAvail < rounded_start - ap->a_foffset) - break; - if (rounded_start < invalid_range->rl_end + 1) { - bytesContAvail = rounded_start - ap->a_foffset; - break; - } - } while ((invalid_range = TAILQ_NEXT(invalid_range, - rl_link))); - } - break; - - case RL_NOOVERLAP: - break; - } // switch - } - } - -#if CONFIG_PROTECT - if (cp->c_cpentry) { - const int direction = (ISSET(ap->a_flags, VNODE_WRITE) - ? VNODE_WRITE : VNODE_READ); - - cp_io_params_t io_params; - cp_io_params(hfsmp, cp->c_cpentry, - off_rsrc_make(ap->a_foffset, VNODE_IS_RSRC(vp)), - direction, &io_params); - - if (io_params.max_len < (off_t)bytesContAvail) - bytesContAvail = io_params.max_len; - - if (io_params.phys_offset != -1) { - *ap->a_bpn = ((io_params.phys_offset + hfsmp->hfsPlusIOPosOffset) - / hfsmp->hfs_logical_block_size); - - retval = 0; - goto exit; - } - } -#endif - -retry: - - /* Check virtual blocks only when performing write operation */ - if ((ap->a_flags & VNODE_WRITE) && (fp->ff_unallocblocks != 0)) { - if (hfs_start_transaction(hfsmp) != 0) { - retval = EINVAL; - goto exit; - } else { - started_tr = 1; - } - syslocks = SFL_EXTENTS | SFL_BITMAP; - - } else if (overflow_extents(fp)) { - syslocks = SFL_EXTENTS; - } - - if (syslocks) - lockflags = hfs_systemfile_lock(hfsmp, syslocks, HFS_EXCLUSIVE_LOCK); - - /* - * Check for any delayed allocations. - */ - if ((ap->a_flags & VNODE_WRITE) && (fp->ff_unallocblocks != 0)) { - int64_t actbytes; - u_int32_t loanedBlocks; - - // - // Make sure we have a transaction. It's possible - // that we came in and fp->ff_unallocblocks was zero - // but during the time we blocked acquiring the extents - // btree, ff_unallocblocks became non-zero and so we - // will need to start a transaction. 
- // - if (started_tr == 0) { - if (syslocks) { - hfs_systemfile_unlock(hfsmp, lockflags); - syslocks = 0; - } - goto retry; - } - - /* - * Note: ExtendFileC will Release any blocks on loan and - * aquire real blocks. So we ask to extend by zero bytes - * since ExtendFileC will account for the virtual blocks. - */ - - loanedBlocks = fp->ff_unallocblocks; - retval = ExtendFileC(hfsmp, (FCB*)fp, 0, 0, - kEFAllMask | kEFNoClumpMask, &actbytes); - - if (retval) { - fp->ff_unallocblocks = loanedBlocks; - cp->c_blocks += loanedBlocks; - fp->ff_blocks += loanedBlocks; - - hfs_lock_mount (hfsmp); - hfsmp->loanedBlocks += loanedBlocks; - hfs_unlock_mount (hfsmp); - - hfs_systemfile_unlock(hfsmp, lockflags); - cp->c_flag |= C_MODIFIED; - if (started_tr) { - (void) hfs_update(vp, 0); - (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0); - - hfs_end_transaction(hfsmp); - started_tr = 0; - } - goto exit; - } - } - - retval = MapFileBlockC(hfsmp, (FCB *)fp, bytesContAvail, ap->a_foffset, - ap->a_bpn, &bytesContAvail); - if (syslocks) { - hfs_systemfile_unlock(hfsmp, lockflags); - syslocks = 0; - } - - if (retval) { - /* On write, always return error because virtual blocks, if any, - * should have been allocated in ExtendFileC(). We do not - * allocate virtual blocks on read, therefore return error - * only if no virtual blocks are allocated. Otherwise we search - * rangelist for zero-fills - */ - if ((MacToVFSError(retval) != ERANGE) || - (ap->a_flags & VNODE_WRITE) || - ((ap->a_flags & VNODE_READ) && (fp->ff_unallocblocks == 0))) { - goto exit; - } - - /* Validate if the start offset is within logical file size */ - if (ap->a_foffset >= fp->ff_size) { - goto exit; - } - - /* - * At this point, we have encountered a failure during - * MapFileBlockC that resulted in ERANGE, and we are not - * servicing a write, and there are borrowed blocks. - * - * However, the cluster layer will not call blockmap for - * blocks that are borrowed and in-cache. 
We have to assume - * that because we observed ERANGE being emitted from - * MapFileBlockC, this extent range is not valid on-disk. So - * we treat this as a mapping that needs to be zero-filled - * prior to reading. - */ - - if (fp->ff_size - ap->a_foffset < (off_t)bytesContAvail) - bytesContAvail = fp->ff_size - ap->a_foffset; - - *ap->a_bpn = (daddr64_t) -1; - retval = 0; - - goto exit; - } - -exit: - if (retval == 0) { - if (ISSET(ap->a_flags, VNODE_WRITE)) { - struct rl_entry *r = TAILQ_FIRST(&fp->ff_invalidranges); - - // See if we might be overlapping invalid ranges... - if (r && (ap->a_foffset + (off_t)bytesContAvail) > r->rl_start) { - /* - * Mark the file as needing an update if we think the - * on-disk EOF has changed. - */ - if (ap->a_foffset <= r->rl_start) - SET(cp->c_flag, C_MODIFIED); - - /* - * This isn't the ideal place to put this. Ideally, we - * should do something *after* we have successfully - * written to the range, but that's difficult to do - * because we cannot take locks in the callback. At - * present, the cluster code will call us with VNODE_WRITE - * set just before it's about to write the data so we know - * that data is about to be written. If we get an I/O - * error at this point then chances are the metadata - * update to follow will also have an I/O error so the - * risk here is small. 
- */ - rl_remove(ap->a_foffset, ap->a_foffset + bytesContAvail - 1, - &fp->ff_invalidranges); - - if (!TAILQ_FIRST(&fp->ff_invalidranges)) { - cp->c_flag &= ~C_ZFWANTSYNC; - cp->c_zftimeout = 0; - } - } - } - - if (ap->a_run) - *ap->a_run = bytesContAvail; - - if (ap->a_poff) - *(int *)ap->a_poff = 0; - } - - if (started_tr) { - hfs_update(vp, TRUE); - hfs_volupdate(hfsmp, VOL_UPDATE, 0); - hfs_end_transaction(hfsmp); - started_tr = 0; - } - - if (tooklock) - hfs_unlock(cp); - - return (MacToVFSError(retval)); -} - -/* - * prepare and issue the I/O - * buf_strategy knows how to deal - * with requests that require - * fragmented I/Os - */ -int -hfs_vnop_strategy(struct vnop_strategy_args *ap) -{ - buf_t bp = ap->a_bp; - vnode_t vp = buf_vnode(bp); - int error = 0; - - /* Mark buffer as containing static data if cnode flag set */ - if (VTOC(vp)->c_flag & C_SSD_STATIC) { - buf_markstatic(bp); - } - - /* Mark buffer as containing static data if cnode flag set */ - if (VTOC(vp)->c_flag & C_SSD_GREEDY_MODE) { - bufattr_markgreedymode(&bp->b_attr); - } - - /* mark buffer as containing burst mode data if cnode flag set */ - if (VTOC(vp)->c_flag & C_IO_ISOCHRONOUS) { - bufattr_markisochronous(&bp->b_attr); - } - -#if CONFIG_PROTECT - error = cp_handle_strategy(bp); - - if (error) - return error; -#endif - - error = buf_strategy(VTOHFS(vp)->hfs_devvp, ap); - - return error; -} - -int -do_hfs_truncate(struct vnode *vp, off_t length, int flags, int truncateflags, vfs_context_t context) -{ - register struct cnode *cp = VTOC(vp); - struct filefork *fp = VTOF(vp); - kauth_cred_t cred = vfs_context_ucred(context); - int retval; - off_t bytesToAdd; - off_t actualBytesAdded; - off_t filebytes; - u_int32_t fileblocks; - int blksize; - struct hfsmount *hfsmp; - int lockflags; - int suppress_times = (truncateflags & HFS_TRUNCATE_SKIPTIMES); - - blksize = VTOVCB(vp)->blockSize; - fileblocks = fp->ff_blocks; - filebytes = (off_t)fileblocks * (off_t)blksize; - - 
KERNEL_DEBUG(HFSDBG_TRUNCATE | DBG_FUNC_START, - (int)length, (int)fp->ff_size, (int)filebytes, 0, 0); - - if (length < 0) - return (EINVAL); - - /* This should only happen with a corrupt filesystem */ - if ((off_t)fp->ff_size < 0) - return (EINVAL); - - if ((!ISHFSPLUS(VTOVCB(vp))) && (length > (off_t)MAXHFSFILESIZE)) - return (EFBIG); - - hfsmp = VTOHFS(vp); - - retval = E_NONE; - - /* Files that are changing size are not hot file candidates. */ - if (hfsmp->hfc_stage == HFC_RECORDING) { - fp->ff_bytesread = 0; - } - - /* - * We cannot just check if fp->ff_size == length (as an optimization) - * since there may be extra physical blocks that also need truncation. - */ -#if QUOTA - if ((retval = hfs_getinoquota(cp))) - return(retval); -#endif /* QUOTA */ - - /* - * Lengthen the size of the file. We must ensure that the - * last byte of the file is allocated. Since the smallest - * value of ff_size is 0, length will be at least 1. - */ - if (length > (off_t)fp->ff_size) { -#if QUOTA - retval = hfs_chkdq(cp, (int64_t)(roundup(length - filebytes, blksize)), - cred, 0); - if (retval) - goto Err_Exit; -#endif /* QUOTA */ - /* - * If we don't have enough physical space then - * we need to extend the physical size. - */ - if (length > filebytes) { - int eflags; - u_int32_t blockHint = 0; - - /* All or nothing and don't round up to clumpsize. */ - eflags = kEFAllMask | kEFNoClumpMask; - - if (cred && (suser(cred, NULL) != 0)) { - eflags |= kEFReserveMask; /* keep a reserve */ - } - - /* - * Allocate Journal and Quota files in metadata zone. 
- */ - if (filebytes == 0 && - hfsmp->hfs_flags & HFS_METADATA_ZONE && - hfs_virtualmetafile(cp)) { - eflags |= kEFMetadataMask; - blockHint = hfsmp->hfs_metazone_start; - } - if (hfs_start_transaction(hfsmp) != 0) { - retval = EINVAL; - goto Err_Exit; - } - - /* Protect extents b-tree and allocation bitmap */ - lockflags = SFL_BITMAP; - if (overflow_extents(fp)) - lockflags |= SFL_EXTENTS; - lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); - - /* - * Keep growing the file as long as the current EOF is - * less than the desired value. - */ - while ((length > filebytes) && (retval == E_NONE)) { - bytesToAdd = length - filebytes; - retval = MacToVFSError(ExtendFileC(VTOVCB(vp), - (FCB*)fp, - bytesToAdd, - blockHint, - eflags, - &actualBytesAdded)); - - filebytes = (off_t)fp->ff_blocks * (off_t)blksize; - if (actualBytesAdded == 0 && retval == E_NONE) { - if (length > filebytes) - length = filebytes; - break; - } - } /* endwhile */ - - hfs_systemfile_unlock(hfsmp, lockflags); - - if (hfsmp->jnl) { - hfs_update(vp, 0); - hfs_volupdate(hfsmp, VOL_UPDATE, 0); - } - - hfs_end_transaction(hfsmp); - - if (retval) - goto Err_Exit; - - KERNEL_DEBUG(HFSDBG_TRUNCATE | DBG_FUNC_NONE, - (int)length, (int)fp->ff_size, (int)filebytes, 0, 0); - } - - if (ISSET(flags, IO_NOZEROFILL)) { - // An optimisation for the hibernation file - if (vnode_isswap(vp)) - rl_remove_all(&fp->ff_invalidranges); - } else { - if (UBCINFOEXISTS(vp) && (vnode_issystem(vp) == 0) && retval == E_NONE) { - if (length > (off_t)fp->ff_size) { - struct timeval tv; - - /* Extending the file: time to fill out the current last page w. zeroes? */ - if (fp->ff_size & PAGE_MASK_64) { - /* There might be some valid data at the start of the (current) last page - of the file, so zero out the remainder of that page to ensure the - entire page contains valid data. 
*/ - hfs_unlock(cp); - retval = hfs_zero_eof_page(vp, length); - hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); - if (retval) goto Err_Exit; - } - microuptime(&tv); - rl_add(fp->ff_size, length - 1, &fp->ff_invalidranges); - cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT; - } - } else { - panic("hfs_truncate: invoked on non-UBC object?!"); - }; - } - if (suppress_times == 0) { - cp->c_touch_modtime = TRUE; - } - fp->ff_size = length; - - } else { /* Shorten the size of the file */ - - // An optimisation for the hibernation file - if (ISSET(flags, IO_NOZEROFILL) && vnode_isswap(vp)) { - rl_remove_all(&fp->ff_invalidranges); - } else if ((off_t)fp->ff_size > length) { - /* Any space previously marked as invalid is now irrelevant: */ - rl_remove(length, fp->ff_size - 1, &fp->ff_invalidranges); - } - - /* - * Account for any unmapped blocks. Note that the new - * file length can still end up with unmapped blocks. - */ - if (fp->ff_unallocblocks > 0) { - u_int32_t finalblks; - u_int32_t loanedBlocks; - - hfs_lock_mount(hfsmp); - loanedBlocks = fp->ff_unallocblocks; - cp->c_blocks -= loanedBlocks; - fp->ff_blocks -= loanedBlocks; - fp->ff_unallocblocks = 0; - - hfsmp->loanedBlocks -= loanedBlocks; - - finalblks = (length + blksize - 1) / blksize; - if (finalblks > fp->ff_blocks) { - /* calculate required unmapped blocks */ - loanedBlocks = finalblks - fp->ff_blocks; - hfsmp->loanedBlocks += loanedBlocks; - - fp->ff_unallocblocks = loanedBlocks; - cp->c_blocks += loanedBlocks; - fp->ff_blocks += loanedBlocks; - } - hfs_unlock_mount (hfsmp); - } - - off_t savedbytes = ((off_t)fp->ff_blocks * (off_t)blksize); - if (hfs_start_transaction(hfsmp) != 0) { - retval = EINVAL; - goto Err_Exit; - } - - if (fp->ff_unallocblocks == 0) { - /* Protect extents b-tree and allocation bitmap */ - lockflags = SFL_BITMAP; - if (overflow_extents(fp)) - lockflags |= SFL_EXTENTS; - lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); - - retval = 
MacToVFSError(TruncateFileC(VTOVCB(vp), (FCB*)fp, length, 0, - FORK_IS_RSRC (fp), FTOC(fp)->c_fileid, false)); - - hfs_systemfile_unlock(hfsmp, lockflags); - } - if (hfsmp->jnl) { - if (retval == 0) { - fp->ff_size = length; - } - hfs_update(vp, 0); - hfs_volupdate(hfsmp, VOL_UPDATE, 0); - } - hfs_end_transaction(hfsmp); - - filebytes = (off_t)fp->ff_blocks * (off_t)blksize; - if (retval) - goto Err_Exit; -#if QUOTA - /* These are bytesreleased */ - (void) hfs_chkdq(cp, (int64_t)-(savedbytes - filebytes), NOCRED, 0); -#endif /* QUOTA */ - - // - // Unlike when growing a file, we adjust the hotfile block count here - // instead of deeper down in the block allocation code because we do - // not necessarily have a vnode or "fcb" at the time we're deleting - // the file and so we wouldn't know if it was hotfile cached or not - // - hfs_hotfile_adjust_blocks(vp, (int64_t)((savedbytes - filebytes) / blksize)); - - - /* - * Only set update flag if the logical length changes & we aren't - * suppressing modtime updates. - */ - if (((off_t)fp->ff_size != length) && (suppress_times == 0)) { - cp->c_touch_modtime = TRUE; - } - fp->ff_size = length; - } - if (cp->c_mode & (S_ISUID | S_ISGID)) { - if (!vfs_context_issuser(context)) - cp->c_mode &= ~(S_ISUID | S_ISGID); - } - cp->c_flag |= C_MODIFIED; - cp->c_touch_chgtime = TRUE; /* status changed */ - if (suppress_times == 0) { - cp->c_touch_modtime = TRUE; /* file data was modified */ - - /* - * If we are not suppressing the modtime update, then - * update the gen count as well. 
- */ - if (S_ISREG(cp->c_attr.ca_mode) || S_ISLNK (cp->c_attr.ca_mode)) { - hfs_incr_gencount(cp); - } - } - - retval = hfs_update(vp, 0); - if (retval) { - KERNEL_DEBUG(HFSDBG_TRUNCATE | DBG_FUNC_NONE, - -1, -1, -1, retval, 0); - } - -Err_Exit: - - KERNEL_DEBUG(HFSDBG_TRUNCATE | DBG_FUNC_END, - (int)length, (int)fp->ff_size, (int)filebytes, retval, 0); - - return (retval); -} - -/* - * Preparation which must be done prior to deleting the catalog record - * of a file or directory. In order to make the on-disk as safe as possible, - * we remove the catalog entry before releasing the bitmap blocks and the - * overflow extent records. However, some work must be done prior to deleting - * the catalog record. - * - * When calling this function, the cnode must exist both in memory and on-disk. - * If there are both resource fork and data fork vnodes, this function should - * be called on both. - */ - -int -hfs_prepare_release_storage (struct hfsmount *hfsmp, struct vnode *vp) { - - struct filefork *fp = VTOF(vp); - struct cnode *cp = VTOC(vp); -#if QUOTA - int retval = 0; -#endif /* QUOTA */ - - /* Cannot truncate an HFS directory! */ - if (vnode_isdir(vp)) { - return (EISDIR); - } - - /* - * See the comment below in hfs_truncate for why we need to call - * setsize here. Essentially we want to avoid pending IO if we - * already know that the blocks are going to be released here. - * This function is only called when totally removing all storage for a file, so - * we can take a shortcut and immediately setsize (0); - */ - ubc_setsize(vp, 0); - - /* This should only happen with a corrupt filesystem */ - if ((off_t)fp->ff_size < 0) - return (EINVAL); - - /* - * We cannot just check if fp->ff_size == length (as an optimization) - * since there may be extra physical blocks that also need truncation. 
- */ -#if QUOTA - if ((retval = hfs_getinoquota(cp))) { - return(retval); - } -#endif /* QUOTA */ - - /* Wipe out any invalid ranges which have yet to be backed by disk */ - rl_remove(0, fp->ff_size - 1, &fp->ff_invalidranges); - - /* - * Account for any unmapped blocks. Since we're deleting the - * entire file, we don't have to worry about just shrinking - * to a smaller number of borrowed blocks. - */ - if (fp->ff_unallocblocks > 0) { - u_int32_t loanedBlocks; - - hfs_lock_mount (hfsmp); - loanedBlocks = fp->ff_unallocblocks; - cp->c_blocks -= loanedBlocks; - fp->ff_blocks -= loanedBlocks; - fp->ff_unallocblocks = 0; - - hfsmp->loanedBlocks -= loanedBlocks; - - hfs_unlock_mount (hfsmp); - } - - return 0; -} - - -/* - * Special wrapper around calling TruncateFileC. This function is useable - * even when the catalog record does not exist any longer, making it ideal - * for use when deleting a file. The simplification here is that we know - * that we are releasing all blocks. - * - * Note that this function may be called when there is no vnode backing - * the file fork in question. We may call this from hfs_vnop_inactive - * to clear out resource fork data (and may not want to clear out the data - * fork yet). As a result, we pointer-check both sets of inputs before - * doing anything with them. - * - * The caller is responsible for saving off a copy of the filefork(s) - * embedded within the cnode prior to calling this function. The pointers - * supplied as arguments must be valid even if the cnode is no longer valid. 
- */ - -int -hfs_release_storage (struct hfsmount *hfsmp, struct filefork *datafork, - struct filefork *rsrcfork, u_int32_t fileid) { - - off_t filebytes; - u_int32_t fileblocks; - int blksize = 0; - int error = 0; - int lockflags; - - blksize = hfsmp->blockSize; - - /* Data Fork */ - if (datafork) { - off_t prev_filebytes; - datafork->ff_size = 0; - - fileblocks = datafork->ff_blocks; - filebytes = (off_t)fileblocks * (off_t)blksize; - prev_filebytes = filebytes; - - /* We killed invalid ranges and loaned blocks before we removed the catalog entry */ - - while (filebytes > 0) { - if (filebytes > HFS_BIGFILE_SIZE) { - filebytes -= HFS_BIGFILE_SIZE; - } else { - filebytes = 0; - } - - /* Start a transaction, and wipe out as many blocks as we can in this iteration */ - if (hfs_start_transaction(hfsmp) != 0) { - error = EINVAL; - break; - } - - if (datafork->ff_unallocblocks == 0) { - /* Protect extents b-tree and allocation bitmap */ - lockflags = SFL_BITMAP; - if (overflow_extents(datafork)) - lockflags |= SFL_EXTENTS; - lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); - - error = MacToVFSError(TruncateFileC(HFSTOVCB(hfsmp), datafork, filebytes, 1, 0, fileid, false)); - - hfs_systemfile_unlock(hfsmp, lockflags); - } - (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0); - - struct cnode *cp = datafork ? FTOC(datafork) : NULL; - struct vnode *vp; - vp = cp ? 
CTOV(cp, 0) : NULL; - hfs_hotfile_adjust_blocks(vp, (int64_t)((prev_filebytes - filebytes) / blksize)); - prev_filebytes = filebytes; - - /* Finish the transaction and start over if necessary */ - hfs_end_transaction(hfsmp); - - if (error) { - break; - } - } - } - - /* Resource fork */ - if (error == 0 && rsrcfork) { - rsrcfork->ff_size = 0; - - fileblocks = rsrcfork->ff_blocks; - filebytes = (off_t)fileblocks * (off_t)blksize; - - /* We killed invalid ranges and loaned blocks before we removed the catalog entry */ - - while (filebytes > 0) { - if (filebytes > HFS_BIGFILE_SIZE) { - filebytes -= HFS_BIGFILE_SIZE; - } else { - filebytes = 0; - } - - /* Start a transaction, and wipe out as many blocks as we can in this iteration */ - if (hfs_start_transaction(hfsmp) != 0) { - error = EINVAL; - break; - } - - if (rsrcfork->ff_unallocblocks == 0) { - /* Protect extents b-tree and allocation bitmap */ - lockflags = SFL_BITMAP; - if (overflow_extents(rsrcfork)) - lockflags |= SFL_EXTENTS; - lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); - - error = MacToVFSError(TruncateFileC(HFSTOVCB(hfsmp), rsrcfork, filebytes, 1, 1, fileid, false)); - - hfs_systemfile_unlock(hfsmp, lockflags); - } - (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0); - - /* Finish the transaction and start over if necessary */ - hfs_end_transaction(hfsmp); - - if (error) { - break; - } - } - } - - return error; -} - -errno_t hfs_ubc_setsize(vnode_t vp, off_t len, bool have_cnode_lock) -{ - errno_t error; - - /* - * Call ubc_setsize to give the VM subsystem a chance to do - * whatever it needs to with existing pages before we delete - * blocks. Note that symlinks don't use the UBC so we'll - * get back ENOENT in that case. 
- */ - if (have_cnode_lock) { - error = ubc_setsize_ex(vp, len, UBC_SETSIZE_NO_FS_REENTRY); - if (error == EAGAIN) { - cnode_t *cp = VTOC(vp); - - if (cp->c_truncatelockowner != current_thread()) { -#if DEVELOPMENT || DEBUG - panic("hfs: hfs_ubc_setsize called without exclusive truncate lock!"); -#else - printf("hfs: hfs_ubc_setsize called without exclusive truncate lock!\n"); -#endif - } - - hfs_unlock(cp); - error = ubc_setsize_ex(vp, len, 0); - hfs_lock_always(cp, HFS_EXCLUSIVE_LOCK); - } - } else - error = ubc_setsize_ex(vp, len, 0); - - return error == ENOENT ? 0 : error; -} - -/* - * Truncate a cnode to at most length size, freeing (or adding) the - * disk blocks. - */ -int -hfs_truncate(struct vnode *vp, off_t length, int flags, - int truncateflags, vfs_context_t context) -{ - struct filefork *fp = VTOF(vp); - off_t filebytes; - u_int32_t fileblocks; - int blksize; - errno_t error = 0; - struct cnode *cp = VTOC(vp); - hfsmount_t *hfsmp = VTOHFS(vp); - - /* Cannot truncate an HFS directory! */ - if (vnode_isdir(vp)) { - return (EISDIR); - } - /* A swap file cannot change size. */ - if (vnode_isswap(vp) && length && !ISSET(flags, IO_NOAUTH)) { - return (EPERM); - } - - blksize = hfsmp->blockSize; - fileblocks = fp->ff_blocks; - filebytes = (off_t)fileblocks * (off_t)blksize; - - bool caller_has_cnode_lock = (cp->c_lockowner == current_thread()); - - error = hfs_ubc_setsize(vp, length, caller_has_cnode_lock); - if (error) - return error; - - if (!caller_has_cnode_lock) { - error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); - if (error) - return error; - } - - // have to loop truncating or growing files that are - // really big because otherwise transactions can get - // enormous and consume too many kernel resources. 
- - if (length < filebytes) { - while (filebytes > length) { - if ((filebytes - length) > HFS_BIGFILE_SIZE) { - filebytes -= HFS_BIGFILE_SIZE; - } else { - filebytes = length; - } - error = do_hfs_truncate(vp, filebytes, flags, truncateflags, context); - if (error) - break; - } - } else if (length > filebytes) { - kauth_cred_t cred = vfs_context_ucred(context); - const bool keep_reserve = cred && suser(cred, NULL) != 0; - - if (hfs_freeblks(hfsmp, keep_reserve) - < howmany(length - filebytes, blksize)) { - error = ENOSPC; - } else { - while (filebytes < length) { - if ((length - filebytes) > HFS_BIGFILE_SIZE) { - filebytes += HFS_BIGFILE_SIZE; - } else { - filebytes = length; - } - error = do_hfs_truncate(vp, filebytes, flags, truncateflags, context); - if (error) - break; - } - } - } else /* Same logical size */ { - - error = do_hfs_truncate(vp, length, flags, truncateflags, context); - } - /* Files that are changing size are not hot file candidates. */ - if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) { - fp->ff_bytesread = 0; - } - - - if (!caller_has_cnode_lock) - hfs_unlock(cp); - - // Make sure UBC's size matches up (in case we didn't completely succeed) - errno_t err2 = hfs_ubc_setsize(vp, fp->ff_size, caller_has_cnode_lock); - if (!error) - error = err2; - - return error; -} - - -/* - * Preallocate file storage space. 
- */ -int -hfs_vnop_allocate(struct vnop_allocate_args /* { - vnode_t a_vp; - off_t a_length; - u_int32_t a_flags; - off_t *a_bytesallocated; - off_t a_offset; - vfs_context_t a_context; - } */ *ap) -{ - struct vnode *vp = ap->a_vp; - struct cnode *cp; - struct filefork *fp; - ExtendedVCB *vcb; - off_t length = ap->a_length; - off_t startingPEOF; - off_t moreBytesRequested; - off_t actualBytesAdded; - off_t filebytes; - u_int32_t fileblocks; - int retval, retval2; - u_int32_t blockHint; - u_int32_t extendFlags; /* For call to ExtendFileC */ - struct hfsmount *hfsmp; - kauth_cred_t cred = vfs_context_ucred(ap->a_context); - int lockflags; - time_t orig_ctime; - - *(ap->a_bytesallocated) = 0; - - if (!vnode_isreg(vp)) - return (EISDIR); - if (length < (off_t)0) - return (EINVAL); - - cp = VTOC(vp); - - orig_ctime = VTOC(vp)->c_ctime; - - check_for_tracked_file(vp, orig_ctime, ap->a_length == 0 ? NAMESPACE_HANDLER_TRUNCATE_OP|NAMESPACE_HANDLER_DELETE_OP : NAMESPACE_HANDLER_TRUNCATE_OP, NULL); - - hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); - - if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) { - goto Err_Exit; - } - - fp = VTOF(vp); - hfsmp = VTOHFS(vp); - vcb = VTOVCB(vp); - - fileblocks = fp->ff_blocks; - filebytes = (off_t)fileblocks * (off_t)vcb->blockSize; - - if ((ap->a_flags & ALLOCATEFROMVOL) && (length < filebytes)) { - retval = EINVAL; - goto Err_Exit; - } - - /* Fill in the flags word for the call to Extend the file */ - - extendFlags = kEFNoClumpMask; - if (ap->a_flags & ALLOCATECONTIG) - extendFlags |= kEFContigMask; - if (ap->a_flags & ALLOCATEALL) - extendFlags |= kEFAllMask; - if (cred && suser(cred, NULL) != 0) - extendFlags |= kEFReserveMask; - if (hfs_virtualmetafile(cp)) - extendFlags |= kEFMetadataMask; - - retval = E_NONE; - blockHint = 0; - startingPEOF = filebytes; - - if (ap->a_flags & ALLOCATEFROMPEOF) - length += filebytes; - else if (ap->a_flags & ALLOCATEFROMVOL) - blockHint = ap->a_offset / 
VTOVCB(vp)->blockSize; - - /* If no changes are necesary, then we're done */ - if (filebytes == length) - goto Std_Exit; - - /* - * Lengthen the size of the file. We must ensure that the - * last byte of the file is allocated. Since the smallest - * value of filebytes is 0, length will be at least 1. - */ - if (length > filebytes) { - if (ISSET(extendFlags, kEFAllMask) - && (hfs_freeblks(hfsmp, ISSET(extendFlags, kEFReserveMask)) - < howmany(length - filebytes, hfsmp->blockSize))) { - retval = ENOSPC; - goto Err_Exit; - } - - off_t total_bytes_added = 0, orig_request_size; - - orig_request_size = moreBytesRequested = length - filebytes; - -#if QUOTA - retval = hfs_chkdq(cp, - (int64_t)(roundup(moreBytesRequested, vcb->blockSize)), - cred, 0); - if (retval) - goto Err_Exit; - -#endif /* QUOTA */ - /* - * Metadata zone checks. - */ - if (hfsmp->hfs_flags & HFS_METADATA_ZONE) { - /* - * Allocate Journal and Quota files in metadata zone. - */ - if (hfs_virtualmetafile(cp)) { - blockHint = hfsmp->hfs_metazone_start; - } else if ((blockHint >= hfsmp->hfs_metazone_start) && - (blockHint <= hfsmp->hfs_metazone_end)) { - /* - * Move blockHint outside metadata zone. - */ - blockHint = hfsmp->hfs_metazone_end + 1; - } - } - - - while ((length > filebytes) && (retval == E_NONE)) { - off_t bytesRequested; - - if (hfs_start_transaction(hfsmp) != 0) { - retval = EINVAL; - goto Err_Exit; - } - - /* Protect extents b-tree and allocation bitmap */ - lockflags = SFL_BITMAP; - if (overflow_extents(fp)) - lockflags |= SFL_EXTENTS; - lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); - - if (moreBytesRequested >= HFS_BIGFILE_SIZE) { - bytesRequested = HFS_BIGFILE_SIZE; - } else { - bytesRequested = moreBytesRequested; - } - - if (extendFlags & kEFContigMask) { - // if we're on a sparse device, this will force it to do a - // full scan to find the space needed. 
- hfsmp->hfs_flags &= ~HFS_DID_CONTIG_SCAN; - } - - retval = MacToVFSError(ExtendFileC(vcb, - (FCB*)fp, - bytesRequested, - blockHint, - extendFlags, - &actualBytesAdded)); - - if (retval == E_NONE) { - *(ap->a_bytesallocated) += actualBytesAdded; - total_bytes_added += actualBytesAdded; - moreBytesRequested -= actualBytesAdded; - if (blockHint != 0) { - blockHint += actualBytesAdded / vcb->blockSize; - } - } - filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize; - - hfs_systemfile_unlock(hfsmp, lockflags); - - if (hfsmp->jnl) { - (void) hfs_update(vp, 0); - (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0); - } - - hfs_end_transaction(hfsmp); - } - - - /* - * if we get an error and no changes were made then exit - * otherwise we must do the hfs_update to reflect the changes - */ - if (retval && (startingPEOF == filebytes)) - goto Err_Exit; - - /* - * Adjust actualBytesAdded to be allocation block aligned, not - * clump size aligned. - * NOTE: So what we are reporting does not affect reality - * until the file is closed, when we truncate the file to allocation - * block size. - */ - if (total_bytes_added != 0 && orig_request_size < total_bytes_added) - *(ap->a_bytesallocated) = - roundup(orig_request_size, (off_t)vcb->blockSize); - - } else { /* Shorten the size of the file */ - - /* - * N.B. At present, this code is never called. If and when we - * do start using it, it looks like there might be slightly - * strange semantics with the file size: it's possible for the - * file size to *increase* e.g. if current file size is 5, - * length is 1024 and filebytes is 4096, the file size will - * end up being 1024 bytes. This isn't necessarily a problem - * but it's not consistent with the code above which doesn't - * change the file size. 
- */ - - retval = hfs_truncate(vp, length, 0, 0, ap->a_context); - filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize; - - /* - * if we get an error and no changes were made then exit - * otherwise we must do the hfs_update to reflect the changes - */ - if (retval && (startingPEOF == filebytes)) goto Err_Exit; -#if QUOTA - /* These are bytesreleased */ - (void) hfs_chkdq(cp, (int64_t)-((startingPEOF - filebytes)), NOCRED,0); -#endif /* QUOTA */ - - if (fp->ff_size > filebytes) { - fp->ff_size = filebytes; - - hfs_ubc_setsize(vp, fp->ff_size, true); - } - } - -Std_Exit: - cp->c_flag |= C_MODIFIED; - cp->c_touch_chgtime = TRUE; - cp->c_touch_modtime = TRUE; - retval2 = hfs_update(vp, 0); - - if (retval == 0) - retval = retval2; -Err_Exit: - hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); - hfs_unlock(cp); - return (retval); -} - - -/* - * Pagein for HFS filesystem - */ -int -hfs_vnop_pagein(struct vnop_pagein_args *ap) -/* - struct vnop_pagein_args { - vnode_t a_vp, - upl_t a_pl, - vm_offset_t a_pl_offset, - off_t a_f_offset, - size_t a_size, - int a_flags - vfs_context_t a_context; - }; -*/ -{ - vnode_t vp; - struct cnode *cp; - struct filefork *fp; - int error = 0; - upl_t upl; - upl_page_info_t *pl; - off_t f_offset; - off_t page_needed_f_offset; - int offset; - int isize; - int upl_size; - int pg_index; - boolean_t truncate_lock_held = FALSE; - boolean_t file_converted = FALSE; - kern_return_t kret; - - vp = ap->a_vp; - cp = VTOC(vp); - fp = VTOF(vp); - -#if CONFIG_PROTECT - if ((error = cp_handle_vnop(vp, CP_READ_ACCESS | CP_WRITE_ACCESS, 0)) != 0) { - /* - * If we errored here, then this means that one of two things occurred: - * 1. there was a problem with the decryption of the key. - * 2. the device is locked and we are not allowed to access this particular file. - * - * Either way, this means that we need to shut down this upl now. 
As long as - * the pl pointer is NULL (meaning that we're supposed to create the UPL ourselves) - * then we create a upl and immediately abort it. - */ - if (ap->a_pl == NULL) { - /* create the upl */ - ubc_create_upl (vp, ap->a_f_offset, ap->a_size, &upl, &pl, - UPL_UBC_PAGEIN | UPL_RET_ONLY_ABSENT); - /* mark the range as needed so it doesn't immediately get discarded upon abort */ - ubc_upl_range_needed (upl, ap->a_pl_offset / PAGE_SIZE, 1); - - /* Abort the range */ - ubc_upl_abort_range (upl, 0, ap->a_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR); - } - - - return error; - } -#endif /* CONFIG_PROTECT */ - - if (ap->a_pl != NULL) { - /* - * this can only happen for swap files now that - * we're asking for V2 paging behavior... - * so don't need to worry about decompression, or - * keeping track of blocks read or taking the truncate lock - */ - error = cluster_pagein(vp, ap->a_pl, ap->a_pl_offset, ap->a_f_offset, - ap->a_size, (off_t)fp->ff_size, ap->a_flags); - goto pagein_done; - } - - page_needed_f_offset = ap->a_f_offset + ap->a_pl_offset; - -retry_pagein: - /* - * take truncate lock (shared/recursive) to guard against - * zero-fill thru fsync interfering, but only for v2 - * - * the HFS_RECURSE_TRUNCLOCK arg indicates that we want the - * lock shared and we are allowed to recurse 1 level if this thread already - * owns the lock exclusively... this can legally occur - * if we are doing a shrinking ftruncate against a file - * that is mapped private, and the pages being truncated - * do not currently exist in the cache... in that case - * we will have to page-in the missing pages in order - * to provide them to the private mapping... we must - * also call hfs_unlock_truncate with a postive been_recursed - * arg to indicate that if we have recursed, there is no need to drop - * the lock. Allowing this simple recursion is necessary - * in order to avoid a certain deadlock... 
since the ftruncate - * already holds the truncate lock exclusively, if we try - * to acquire it shared to protect the pagein path, we will - * hang this thread - * - * NOTE: The if () block below is a workaround in order to prevent a - * VM deadlock. See rdar://7853471. - * - * If we are in a forced unmount, then launchd will still have the - * dyld_shared_cache file mapped as it is trying to reboot. If we - * take the truncate lock here to service a page fault, then our - * thread could deadlock with the forced-unmount. The forced unmount - * thread will try to reclaim the dyld_shared_cache vnode, but since it's - * marked C_DELETED, it will call ubc_setsize(0). As a result, the unmount - * thread will think it needs to copy all of the data out of the file - * and into a VM copy object. If we hold the cnode lock here, then that - * VM operation will not be able to proceed, because we'll set a busy page - * before attempting to grab the lock. Note that this isn't as simple as "don't - * call ubc_setsize" because doing that would just shift the problem to the - * ubc_msync done before the vnode is reclaimed. - * - * So, if a forced unmount on this volume is in flight AND the cnode is - * marked C_DELETED, then just go ahead and do the page in without taking - * the lock (thus suspending pagein_v2 semantics temporarily). Since it's on a file - * that is not going to be available on the next mount, this seems like a - * OK solution from a correctness point of view, even though it is hacky. 
- */ - if (vfs_isforce(vp->v_mount)) { - if (cp->c_flag & C_DELETED) { - /* If we don't get it, then just go ahead and operate without the lock */ - truncate_lock_held = hfs_try_trunclock(cp, HFS_SHARED_LOCK, HFS_LOCK_SKIP_IF_EXCLUSIVE); - } - } - else { - hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_SKIP_IF_EXCLUSIVE); - truncate_lock_held = TRUE; - } - - kret = ubc_create_upl(vp, ap->a_f_offset, ap->a_size, &upl, &pl, UPL_UBC_PAGEIN | UPL_RET_ONLY_ABSENT); - - if ((kret != KERN_SUCCESS) || (upl == (upl_t) NULL)) { - error = EINVAL; - goto pagein_done; - } - ubc_upl_range_needed(upl, ap->a_pl_offset / PAGE_SIZE, 1); - - upl_size = isize = ap->a_size; - - /* - * Scan from the back to find the last page in the UPL, so that we - * aren't looking at a UPL that may have already been freed by the - * preceding aborts/completions. - */ - for (pg_index = ((isize) / PAGE_SIZE); pg_index > 0;) { - if (upl_page_present(pl, --pg_index)) - break; - if (pg_index == 0) { - /* - * no absent pages were found in the range specified - * just abort the UPL to get rid of it and then we're done - */ - ubc_upl_abort_range(upl, 0, isize, UPL_ABORT_FREE_ON_EMPTY); - goto pagein_done; - } - } - /* - * initialize the offset variables before we touch the UPL. - * f_offset is the position into the file, in bytes - * offset is the position into the UPL, in bytes - * pg_index is the pg# of the UPL we're operating on - * isize is the offset into the UPL of the last page that is present. - */ - isize = ((pg_index + 1) * PAGE_SIZE); - pg_index = 0; - offset = 0; - f_offset = ap->a_f_offset; - - while (isize) { - int xsize; - int num_of_pages; - - if ( !upl_page_present(pl, pg_index)) { - /* - * we asked for RET_ONLY_ABSENT, so it's possible - * to get back empty slots in the UPL. - * just skip over them - */ - f_offset += PAGE_SIZE; - offset += PAGE_SIZE; - isize -= PAGE_SIZE; - pg_index++; - - continue; - } - /* - * We know that we have at least one absent page. 
- * Now checking to see how many in a row we have - */ - num_of_pages = 1; - xsize = isize - PAGE_SIZE; - - while (xsize) { - if ( !upl_page_present(pl, pg_index + num_of_pages)) - break; - num_of_pages++; - xsize -= PAGE_SIZE; - } - xsize = num_of_pages * PAGE_SIZE; - -#if HFS_COMPRESSION - if (VNODE_IS_RSRC(vp)) { - /* allow pageins of the resource fork */ - } else { - int compressed = hfs_file_is_compressed(VTOC(vp), 1); /* 1 == don't take the cnode lock */ - - if (compressed) { - - if (truncate_lock_held) { - /* - * can't hold the truncate lock when calling into the decmpfs layer - * since it calls back into this layer... even though we're only - * holding the lock in shared mode, and the re-entrant path only - * takes the lock shared, we can deadlock if some other thread - * tries to grab the lock exclusively in between. - */ - hfs_unlock_truncate(cp, HFS_LOCK_SKIP_IF_EXCLUSIVE); - truncate_lock_held = FALSE; - } - ap->a_pl = upl; - ap->a_pl_offset = offset; - ap->a_f_offset = f_offset; - ap->a_size = xsize; - - error = decmpfs_pagein_compressed(ap, &compressed, VTOCMP(vp)); - /* - * note that decpfs_pagein_compressed can change the state of - * 'compressed'... it will set it to 0 if the file is no longer - * compressed once the compression lock is successfully taken - * i.e. we would block on that lock while the file is being inflated - */ - if (error == 0 && vnode_isfastdevicecandidate(vp)) { - (void) hfs_addhotfile(vp); - } - if (compressed) { - if (error == 0) { - /* successful page-in, update the access time */ - VTOC(vp)->c_touch_acctime = TRUE; - - // - // compressed files are not traditional hot file candidates - // but they may be for CF (which ignores the ff_bytesread - // field) - // - if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) { - fp->ff_bytesread = 0; - } - } else if (error == EAGAIN) { - /* - * EAGAIN indicates someone else already holds the compression lock... 
- * to avoid deadlocking, we'll abort this range of pages with an - * indication that the pagein needs to be redriven - */ - ubc_upl_abort_range(upl, (upl_offset_t) offset, xsize, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_RESTART); - } else if (error == ENOSPC) { - - if (upl_size == PAGE_SIZE) - panic("decmpfs_pagein_compressed: couldn't ubc_upl_map a single page\n"); - - ubc_upl_abort_range(upl, (upl_offset_t) offset, isize, UPL_ABORT_FREE_ON_EMPTY); - - ap->a_size = PAGE_SIZE; - ap->a_pl = NULL; - ap->a_pl_offset = 0; - ap->a_f_offset = page_needed_f_offset; - - goto retry_pagein; - } - goto pagein_next_range; - } - else { - /* - * Set file_converted only if the file became decompressed while we were - * paging in. If it were still compressed, we would re-start the loop using the goto - * in the above block. This avoid us overloading truncate_lock_held as our retry_pagein - * condition below, since we could have avoided taking the truncate lock to prevent - * a deadlock in the force unmount case. - */ - file_converted = TRUE; - } - } - if (file_converted == TRUE) { - /* - * the file was converted back to a regular file after we first saw it as compressed - * we need to abort the upl, retake the truncate lock, recreate the UPL and start over - * reset a_size so that we consider what remains of the original request - * and null out a_upl and a_pl_offset. - * - * We should only be able to get into this block if the decmpfs_pagein_compressed - * successfully decompressed the range in question for this file. - */ - ubc_upl_abort_range(upl, (upl_offset_t) offset, isize, UPL_ABORT_FREE_ON_EMPTY); - - ap->a_size = isize; - ap->a_pl = NULL; - ap->a_pl_offset = 0; - - /* Reset file_converted back to false so that we don't infinite-loop. */ - file_converted = FALSE; - goto retry_pagein; - } - } -#endif - error = cluster_pagein(vp, upl, offset, f_offset, xsize, (off_t)fp->ff_size, ap->a_flags); - - /* - * Keep track of blocks read. 
- */ - if ( !vnode_isswap(vp) && VTOHFS(vp)->hfc_stage == HFC_RECORDING && error == 0) { - int bytesread; - int took_cnode_lock = 0; - - if (ap->a_f_offset == 0 && fp->ff_size < PAGE_SIZE) - bytesread = fp->ff_size; - else - bytesread = xsize; - - /* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. */ - if ((fp->ff_bytesread + bytesread) > 0x00000000ffffffff && cp->c_lockowner != current_thread()) { - hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); - took_cnode_lock = 1; - } - /* - * If this file hasn't been seen since the start of - * the current sampling period then start over. - */ - if (cp->c_atime < VTOHFS(vp)->hfc_timebase) { - struct timeval tv; - - fp->ff_bytesread = bytesread; - microtime(&tv); - cp->c_atime = tv.tv_sec; - } else { - fp->ff_bytesread += bytesread; - } - cp->c_touch_acctime = TRUE; - - if (vnode_isfastdevicecandidate(vp)) { - (void) hfs_addhotfile(vp); - } - if (took_cnode_lock) - hfs_unlock(cp); - } -pagein_next_range: - f_offset += xsize; - offset += xsize; - isize -= xsize; - pg_index += num_of_pages; - - error = 0; - } - -pagein_done: - if (truncate_lock_held == TRUE) { - /* Note 1 is passed to hfs_unlock_truncate in been_recursed argument */ - hfs_unlock_truncate(cp, HFS_LOCK_SKIP_IF_EXCLUSIVE); - } - - return (error); -} - -/* - * Pageout for HFS filesystem. 
- */ -int -hfs_vnop_pageout(struct vnop_pageout_args *ap) -/* - struct vnop_pageout_args { - vnode_t a_vp, - upl_t a_pl, - vm_offset_t a_pl_offset, - off_t a_f_offset, - size_t a_size, - int a_flags - vfs_context_t a_context; - }; -*/ -{ - vnode_t vp = ap->a_vp; - struct cnode *cp; - struct filefork *fp; - int retval = 0; - off_t filesize; - upl_t upl; - upl_page_info_t* pl; - vm_offset_t a_pl_offset; - int a_flags; - int is_pageoutv2 = 0; - kern_return_t kret; - - cp = VTOC(vp); - fp = VTOF(vp); - - a_flags = ap->a_flags; - a_pl_offset = ap->a_pl_offset; - - /* - * we can tell if we're getting the new or old behavior from the UPL - */ - if ((upl = ap->a_pl) == NULL) { - int request_flags; - - is_pageoutv2 = 1; - /* - * we're in control of any UPL we commit - * make sure someone hasn't accidentally passed in UPL_NOCOMMIT - */ - a_flags &= ~UPL_NOCOMMIT; - a_pl_offset = 0; - - /* - * For V2 semantics, we want to take the cnode truncate lock - * shared to guard against the file size changing via zero-filling. - * - * However, we have to be careful because we may be invoked - * via the ubc_msync path to write out dirty mmap'd pages - * in response to a lock event on a content-protected - * filesystem (e.g. to write out class A files). - * As a result, we want to take the truncate lock 'SHARED' with - * the mini-recursion locktype so that we don't deadlock/panic - * because we may be already holding the truncate lock exclusive to force any other - * IOs to have blocked behind us. 
- */ - hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_SKIP_IF_EXCLUSIVE); - - if (a_flags & UPL_MSYNC) { - request_flags = UPL_UBC_MSYNC | UPL_RET_ONLY_DIRTY; - } - else { - request_flags = UPL_UBC_PAGEOUT | UPL_RET_ONLY_DIRTY; - } - - kret = ubc_create_upl(vp, ap->a_f_offset, ap->a_size, &upl, &pl, request_flags); - - if ((kret != KERN_SUCCESS) || (upl == (upl_t) NULL)) { - retval = EINVAL; - goto pageout_done; - } - } - /* - * from this point forward upl points at the UPL we're working with - * it was either passed in or we succesfully created it - */ - - /* - * Figure out where the file ends, for pageout purposes. If - * ff_new_size > ff_size, then we're in the middle of extending the - * file via a write, so it is safe (and necessary) that we be able - * to pageout up to that point. - */ - filesize = fp->ff_size; - if (fp->ff_new_size > filesize) - filesize = fp->ff_new_size; - - /* - * Now that HFS is opting into VFC_VFSVNOP_PAGEOUTV2, we may need to operate on our own - * UPL instead of relying on the UPL passed into us. We go ahead and do that here, - * scanning for dirty ranges. We'll issue our own N cluster_pageout calls, for - * N dirty ranges in the UPL. Note that this is almost a direct copy of the - * logic in vnode_pageout except that we need to do it after grabbing the truncate - * lock in HFS so that we don't lock invert ourselves. - * - * Note that we can still get into this function on behalf of the default pager with - * non-V2 behavior (swapfiles). However in that case, we did not grab locks above - * since fsync and other writing threads will grab the locks, then mark the - * relevant pages as busy. But the pageout codepath marks the pages as busy, - * and THEN would attempt to grab the truncate lock, which would result in deadlock. So - * we do not try to grab anything for the pre-V2 case, which should only be accessed - * by the paging/VM system. 
- */ - - if (is_pageoutv2) { - off_t f_offset; - int offset; - int isize; - int pg_index; - int error; - int error_ret = 0; - - isize = ap->a_size; - f_offset = ap->a_f_offset; - - /* - * Scan from the back to find the last page in the UPL, so that we - * aren't looking at a UPL that may have already been freed by the - * preceding aborts/completions. - */ - for (pg_index = ((isize) / PAGE_SIZE); pg_index > 0;) { - if (upl_page_present(pl, --pg_index)) - break; - if (pg_index == 0) { - ubc_upl_abort_range(upl, 0, isize, UPL_ABORT_FREE_ON_EMPTY); - goto pageout_done; - } - } - - /* - * initialize the offset variables before we touch the UPL. - * a_f_offset is the position into the file, in bytes - * offset is the position into the UPL, in bytes - * pg_index is the pg# of the UPL we're operating on. - * isize is the offset into the UPL of the last non-clean page. - */ - isize = ((pg_index + 1) * PAGE_SIZE); - - offset = 0; - pg_index = 0; - - while (isize) { - int xsize; - int num_of_pages; - - if ( !upl_page_present(pl, pg_index)) { - /* - * we asked for RET_ONLY_DIRTY, so it's possible - * to get back empty slots in the UPL. - * just skip over them - */ - f_offset += PAGE_SIZE; - offset += PAGE_SIZE; - isize -= PAGE_SIZE; - pg_index++; - - continue; - } - if ( !upl_dirty_page(pl, pg_index)) { - panic ("hfs_vnop_pageout: unforeseen clean page @ index %d for UPL %p\n", pg_index, upl); - } - - /* - * We know that we have at least one dirty page. 
- * Now checking to see how many in a row we have - */ - num_of_pages = 1; - xsize = isize - PAGE_SIZE; - - while (xsize) { - if ( !upl_dirty_page(pl, pg_index + num_of_pages)) - break; - num_of_pages++; - xsize -= PAGE_SIZE; - } - xsize = num_of_pages * PAGE_SIZE; - - if ((error = cluster_pageout(vp, upl, offset, f_offset, - xsize, filesize, a_flags))) { - if (error_ret == 0) - error_ret = error; - } - f_offset += xsize; - offset += xsize; - isize -= xsize; - pg_index += num_of_pages; - } - /* capture errnos bubbled out of cluster_pageout if they occurred */ - if (error_ret != 0) { - retval = error_ret; - } - } /* end block for v2 pageout behavior */ - else { - /* - * just call cluster_pageout for old pre-v2 behavior - */ - retval = cluster_pageout(vp, upl, a_pl_offset, ap->a_f_offset, - ap->a_size, filesize, a_flags); - } - - /* - * If data was written, update the modification time of the file - * but only if it's mapped writable; we will have touched the - * modifcation time for direct writes. - */ - if (retval == 0 && (ubc_is_mapped_writable(vp) - || ISSET(cp->c_flag, C_MIGHT_BE_DIRTY_FROM_MAPPING))) { - hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); - - // Check again with lock - bool mapped_writable = ubc_is_mapped_writable(vp); - if (mapped_writable - || ISSET(cp->c_flag, C_MIGHT_BE_DIRTY_FROM_MAPPING)) { - cp->c_touch_modtime = TRUE; - cp->c_touch_chgtime = TRUE; - - /* - * We only need to increment the generation counter if - * it's currently mapped writable because we incremented - * the counter in hfs_vnop_mnomap. - */ - if (mapped_writable) - hfs_incr_gencount(VTOC(vp)); - - /* - * If setuid or setgid bits are set and this process is - * not the superuser then clear the setuid and setgid bits - * as a precaution against tampering. 
- */ - if ((cp->c_mode & (S_ISUID | S_ISGID)) && - (vfs_context_suser(ap->a_context) != 0)) { - cp->c_mode &= ~(S_ISUID | S_ISGID); - } - } - - hfs_unlock(cp); - } - -pageout_done: - if (is_pageoutv2) { - /* - * Release the truncate lock. Note that because - * we may have taken the lock recursively by - * being invoked via ubc_msync due to lockdown, - * we should release it recursively, too. - */ - hfs_unlock_truncate(cp, HFS_LOCK_SKIP_IF_EXCLUSIVE); - } - return (retval); -} - -/* - * Intercept B-Tree node writes to unswap them if necessary. - */ -int -hfs_vnop_bwrite(struct vnop_bwrite_args *ap) -{ - int retval = 0; - register struct buf *bp = ap->a_bp; - register struct vnode *vp = buf_vnode(bp); - BlockDescriptor block; - - /* Trap B-Tree writes */ - if ((VTOC(vp)->c_fileid == kHFSExtentsFileID) || - (VTOC(vp)->c_fileid == kHFSCatalogFileID) || - (VTOC(vp)->c_fileid == kHFSAttributesFileID) || - (vp == VTOHFS(vp)->hfc_filevp)) { - - /* - * Swap and validate the node if it is in native byte order. - * This is always be true on big endian, so we always validate - * before writing here. On little endian, the node typically has - * been swapped and validated when it was written to the journal, - * so we won't do anything here. 
- */ - if (((u_int16_t *)((char *)buf_dataptr(bp) + buf_count(bp) - 2))[0] == 0x000e) { - /* Prepare the block pointer */ - block.blockHeader = bp; - block.buffer = (char *)buf_dataptr(bp); - block.blockNum = buf_lblkno(bp); - /* not found in cache ==> came from disk */ - block.blockReadFromDisk = (buf_fromcache(bp) == 0); - block.blockSize = buf_count(bp); - - /* Endian un-swap B-Tree node */ - retval = hfs_swap_BTNode (&block, vp, kSwapBTNodeHostToBig, false); - if (retval) - panic("hfs_vnop_bwrite: about to write corrupt node!\n"); - } - } - - /* This buffer shouldn't be locked anymore but if it is clear it */ - if ((buf_flags(bp) & B_LOCKED)) { - // XXXdbg - if (VTOHFS(vp)->jnl) { - panic("hfs: CLEARING the lock bit on bp %p\n", bp); - } - buf_clearflags(bp, B_LOCKED); - } - retval = vn_bwrite (ap); - - return (retval); -} - - -int -hfs_pin_block_range(struct hfsmount *hfsmp, int pin_state, uint32_t start_block, uint32_t nblocks, vfs_context_t ctx) -{ - _dk_cs_pin_t pin; - unsigned ioc; - int err; - - memset(&pin, 0, sizeof(pin)); - pin.cp_extent.offset = ((uint64_t)start_block) * HFSTOVCB(hfsmp)->blockSize; - pin.cp_extent.length = ((uint64_t)nblocks) * HFSTOVCB(hfsmp)->blockSize; - switch (pin_state) { - case HFS_PIN_IT: - ioc = _DKIOCCSPINEXTENT; - pin.cp_flags = _DKIOCCSPINTOFASTMEDIA; - break; - case HFS_PIN_IT | HFS_TEMP_PIN: - ioc = _DKIOCCSPINEXTENT; - pin.cp_flags = _DKIOCCSPINTOFASTMEDIA | _DKIOCCSTEMPORARYPIN; - break; - case HFS_PIN_IT | HFS_DATALESS_PIN: - ioc = _DKIOCCSPINEXTENT; - pin.cp_flags = _DKIOCCSPINTOFASTMEDIA | _DKIOCCSPINFORSWAPFILE; - break; - case HFS_UNPIN_IT: - ioc = _DKIOCCSUNPINEXTENT; - pin.cp_flags = 0; - break; - case HFS_UNPIN_IT | HFS_EVICT_PIN: - ioc = _DKIOCCSPINEXTENT; - pin.cp_flags = _DKIOCCSPINTOSLOWMEDIA; - break; - default: - return EINVAL; - } - err = VNOP_IOCTL(hfsmp->hfs_devvp, ioc, (caddr_t)&pin, 0, ctx); - return err; -} - -// -// The cnode lock should already be held on entry to this function -// -int 
-hfs_pin_vnode(struct hfsmount *hfsmp, struct vnode *vp, int pin_state, uint32_t *num_blocks_pinned, vfs_context_t ctx) -{ - struct filefork *fp = VTOF(vp); - int i, err=0, need_put=0; - struct vnode *rsrc_vp=NULL; - uint32_t npinned = 0; - off_t offset; - - if (num_blocks_pinned) { - *num_blocks_pinned = 0; - } - - if (vnode_vtype(vp) != VREG) { - /* Not allowed to pin directories or symlinks */ - printf("hfs: can't pin vnode of type %d\n", vnode_vtype(vp)); - return (EPERM); - } - - if (fp->ff_unallocblocks) { - printf("hfs: can't pin a vnode w/unalloced blocks (%d)\n", fp->ff_unallocblocks); - return (EINVAL); - } - - /* - * It is possible that if the caller unlocked/re-locked the cnode after checking - * for C_NOEXISTS|C_DELETED that the file could have been deleted while the - * cnode was unlocked. So check the condition again and return ENOENT so that - * the caller knows why we failed to pin the vnode. - */ - if (VTOC(vp)->c_flag & (C_NOEXISTS|C_DELETED)) { - // makes no sense to pin something that's pending deletion - return ENOENT; - } - - if (fp->ff_blocks == 0 && (VTOC(vp)->c_bsdflags & UF_COMPRESSED)) { - if (!VNODE_IS_RSRC(vp) && hfs_vgetrsrc(hfsmp, vp, &rsrc_vp) == 0) { - //printf("hfs: fileid %d resource fork nblocks: %d / size: %lld\n", VTOC(vp)->c_fileid, - // VTOC(rsrc_vp)->c_rsrcfork->ff_blocks,VTOC(rsrc_vp)->c_rsrcfork->ff_size); - - fp = VTOC(rsrc_vp)->c_rsrcfork; - need_put = 1; - } - } - if (fp->ff_blocks == 0) { - if (need_put) { - // - // use a distinct error code for a compressed file that has no resource fork; - // we return EALREADY to indicate that the data is already probably hot file - // cached because it's in an EA and the attributes btree is on the ssd - // - err = EALREADY; - } else { - err = EINVAL; - } - goto out; - } - - offset = 0; - for (i = 0; i < kHFSPlusExtentDensity; i++) { - if (fp->ff_extents[i].startBlock == 0) { - break; - } - - err = hfs_pin_block_range(hfsmp, pin_state, fp->ff_extents[i].startBlock, 
fp->ff_extents[i].blockCount, ctx); - if (err) { - break; - } else { - npinned += fp->ff_extents[i].blockCount; - } - } - - if (err || npinned == 0) { - goto out; - } - - if (fp->ff_extents[kHFSPlusExtentDensity-1].startBlock) { - uint32_t pblocks; - uint8_t forktype = 0; - - if (fp == VTOC(vp)->c_rsrcfork) { - forktype = 0xff; - } - /* - * The file could have overflow extents, better pin them. - * - * We assume that since we are holding the cnode lock for this cnode, - * the files extents cannot be manipulated, but the tree could, so we - * need to ensure that it doesn't change behind our back as we iterate it. - */ - int lockflags = hfs_systemfile_lock (hfsmp, SFL_EXTENTS, HFS_SHARED_LOCK); - err = hfs_pin_overflow_extents(hfsmp, VTOC(vp)->c_fileid, forktype, &pblocks); - hfs_systemfile_unlock (hfsmp, lockflags); - - if (err) { - goto out; - } - npinned += pblocks; - } - -out: - if (num_blocks_pinned) { - *num_blocks_pinned = npinned; - } - - if (need_put && rsrc_vp) { - // - // have to unlock the cnode since it's shared between the - // resource fork vnode and the data fork vnode (and the - // vnode_put() may need to re-acquire the cnode lock to - // reclaim the resource fork vnode) - // - hfs_unlock(VTOC(vp)); - vnode_put(rsrc_vp); - hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); - } - return err; -} - - -/* - * Relocate a file to a new location on disk - * cnode must be locked on entry - * - * Relocation occurs by cloning the file's data from its - * current set of blocks to a new set of blocks. During - * the relocation all of the blocks (old and new) are - * owned by the file. 
- * - * ----------------- - * |///////////////| - * ----------------- - * 0 N (file offset) - * - * ----------------- ----------------- - * |///////////////| | | STEP 1 (acquire new blocks) - * ----------------- ----------------- - * 0 N N+1 2N - * - * ----------------- ----------------- - * |///////////////| |///////////////| STEP 2 (clone data) - * ----------------- ----------------- - * 0 N N+1 2N - * - * ----------------- - * |///////////////| STEP 3 (head truncate blocks) - * ----------------- - * 0 N - * - * During steps 2 and 3 page-outs to file offsets less - * than or equal to N are suspended. - * - * During step 3 page-ins to the file get suspended. - */ -int -hfs_relocate(struct vnode *vp, u_int32_t blockHint, kauth_cred_t cred, - struct proc *p) -{ - struct cnode *cp; - struct filefork *fp; - struct hfsmount *hfsmp; - u_int32_t headblks; - u_int32_t datablks; - u_int32_t blksize; - u_int32_t growsize; - u_int32_t nextallocsave; - daddr64_t sector_a, sector_b; - int eflags; - off_t newbytes; - int retval; - int lockflags = 0; - int took_trunc_lock = 0; - int started_tr = 0; - enum vtype vnodetype; - - vnodetype = vnode_vtype(vp); - if (vnodetype != VREG) { - /* Not allowed to move symlinks. 
*/ - return (EPERM); - } - - hfsmp = VTOHFS(vp); - if (hfsmp->hfs_flags & HFS_FRAGMENTED_FREESPACE) { - return (ENOSPC); - } - - cp = VTOC(vp); - fp = VTOF(vp); - if (fp->ff_unallocblocks) - return (EINVAL); - -#if CONFIG_PROTECT - /* - * - * Disable HFS file relocation on content-protected filesystems - */ - if (cp_fs_protected (hfsmp->hfs_mp)) { - return EINVAL; - } -#endif - /* If it's an SSD, also disable HFS relocation */ - if (hfsmp->hfs_flags & HFS_SSD) { - return EINVAL; - } - - - blksize = hfsmp->blockSize; - if (blockHint == 0) - blockHint = hfsmp->nextAllocation; - - if (fp->ff_size > 0x7fffffff) { - return (EFBIG); - } - - // - // We do not believe that this call to hfs_fsync() is - // necessary and it causes a journal transaction - // deadlock so we are removing it. - // - //if (vnodetype == VREG && !vnode_issystem(vp)) { - // retval = hfs_fsync(vp, MNT_WAIT, 0, p); - // if (retval) - // return (retval); - //} - - if (!vnode_issystem(vp) && (vnodetype != VLNK)) { - hfs_unlock(cp); - hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); - /* Force lock since callers expects lock to be held. */ - if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS))) { - hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); - return (retval); - } - /* No need to continue if file was removed. */ - if (cp->c_flag & C_NOEXISTS) { - hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); - return (ENOENT); - } - took_trunc_lock = 1; - } - headblks = fp->ff_blocks; - datablks = howmany(fp->ff_size, blksize); - growsize = datablks * blksize; - eflags = kEFContigMask | kEFAllMask | kEFNoClumpMask; - if (blockHint >= hfsmp->hfs_metazone_start && - blockHint <= hfsmp->hfs_metazone_end) - eflags |= kEFMetadataMask; - - if (hfs_start_transaction(hfsmp) != 0) { - if (took_trunc_lock) - hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); - return (EINVAL); - } - started_tr = 1; - /* - * Protect the extents b-tree and the allocation bitmap - * during MapFileBlockC and ExtendFileC operations. 
- */ - lockflags = SFL_BITMAP; - if (overflow_extents(fp)) - lockflags |= SFL_EXTENTS; - lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); - - retval = MapFileBlockC(hfsmp, (FCB *)fp, 1, growsize - 1, §or_a, NULL); - if (retval) { - retval = MacToVFSError(retval); - goto out; - } - - /* - * STEP 1 - acquire new allocation blocks. - */ - nextallocsave = hfsmp->nextAllocation; - retval = ExtendFileC(hfsmp, (FCB*)fp, growsize, blockHint, eflags, &newbytes); - if (eflags & kEFMetadataMask) { - hfs_lock_mount(hfsmp); - HFS_UPDATE_NEXT_ALLOCATION(hfsmp, nextallocsave); - MarkVCBDirty(hfsmp); - hfs_unlock_mount(hfsmp); - } - - retval = MacToVFSError(retval); - if (retval == 0) { - cp->c_flag |= C_MODIFIED; - if (newbytes < growsize) { - retval = ENOSPC; - goto restore; - } else if (fp->ff_blocks < (headblks + datablks)) { - printf("hfs_relocate: allocation failed id=%u, vol=%s\n", cp->c_cnid, hfsmp->vcbVN); - retval = ENOSPC; - goto restore; - } - - retval = MapFileBlockC(hfsmp, (FCB *)fp, 1, growsize, §or_b, NULL); - if (retval) { - retval = MacToVFSError(retval); - } else if ((sector_a + 1) == sector_b) { - retval = ENOSPC; - goto restore; - } else if ((eflags & kEFMetadataMask) && - ((((u_int64_t)sector_b * hfsmp->hfs_logical_block_size) / blksize) > - hfsmp->hfs_metazone_end)) { -#if 0 - const char * filestr; - char emptystr = '\0'; - - if (cp->c_desc.cd_nameptr != NULL) { - filestr = (const char *)&cp->c_desc.cd_nameptr[0]; - } else if (vnode_name(vp) != NULL) { - filestr = vnode_name(vp); - } else { - filestr = &emptystr; - } -#endif - retval = ENOSPC; - goto restore; - } - } - /* Done with system locks and journal for now. */ - hfs_systemfile_unlock(hfsmp, lockflags); - lockflags = 0; - hfs_end_transaction(hfsmp); - started_tr = 0; - - if (retval) { - /* - * Check to see if failure is due to excessive fragmentation. 
- */ - if ((retval == ENOSPC) && - (hfs_freeblks(hfsmp, 0) > (datablks * 2))) { - hfsmp->hfs_flags |= HFS_FRAGMENTED_FREESPACE; - } - goto out; - } - /* - * STEP 2 - clone file data into the new allocation blocks. - */ - - if (vnodetype == VLNK) - retval = EPERM; - else if (vnode_issystem(vp)) - retval = hfs_clonesysfile(vp, headblks, datablks, blksize, cred, p); - else - retval = hfs_clonefile(vp, headblks, datablks, blksize); - - /* Start transaction for step 3 or for a restore. */ - if (hfs_start_transaction(hfsmp) != 0) { - retval = EINVAL; - goto out; - } - started_tr = 1; - if (retval) - goto restore; - - /* - * STEP 3 - switch to cloned data and remove old blocks. - */ - lockflags = SFL_BITMAP; - if (overflow_extents(fp)) - lockflags |= SFL_EXTENTS; - lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); - - retval = HeadTruncateFile(hfsmp, (FCB*)fp, headblks); - - hfs_systemfile_unlock(hfsmp, lockflags); - lockflags = 0; - if (retval) - goto restore; -out: - if (took_trunc_lock) - hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); - - if (lockflags) { - hfs_systemfile_unlock(hfsmp, lockflags); - lockflags = 0; - } - - /* Push cnode's new extent data to disk. */ - if (retval == 0) { - hfs_update(vp, 0); - } - if (hfsmp->jnl) { - if (cp->c_cnid < kHFSFirstUserCatalogNodeID) - (void) hfs_flushvolumeheader(hfsmp, HFS_FVH_WAIT | HFS_FVH_WRITE_ALT); - else - (void) hfs_flushvolumeheader(hfsmp, 0); - } -exit: - if (started_tr) - hfs_end_transaction(hfsmp); - - return (retval); - -restore: - if (fp->ff_blocks == headblks) { - if (took_trunc_lock) - hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); - goto exit; - } - /* - * Give back any newly allocated space. 
- */ - if (lockflags == 0) { - lockflags = SFL_BITMAP; - if (overflow_extents(fp)) - lockflags |= SFL_EXTENTS; - lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); - } - - (void) TruncateFileC(hfsmp, (FCB*)fp, fp->ff_size, 0, FORK_IS_RSRC(fp), - FTOC(fp)->c_fileid, false); - - hfs_systemfile_unlock(hfsmp, lockflags); - lockflags = 0; - - if (took_trunc_lock) - hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); - goto exit; -} - - -/* - * Clone a file's data within the file. - * - */ -static int -hfs_clonefile(struct vnode *vp, int blkstart, int blkcnt, int blksize) -{ - caddr_t bufp; - size_t bufsize; - size_t copysize; - size_t iosize; - size_t offset; - off_t writebase; - uio_t auio; - int error = 0; - - writebase = blkstart * blksize; - copysize = blkcnt * blksize; - iosize = bufsize = MIN(copysize, 128 * 1024); - offset = 0; - - hfs_unlock(VTOC(vp)); - -#if CONFIG_PROTECT - if ((error = cp_handle_vnop(vp, CP_WRITE_ACCESS, 0)) != 0) { - hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); - return (error); - } -#endif /* CONFIG_PROTECT */ - - if (kmem_alloc(kernel_map, (vm_offset_t *)&bufp, bufsize, VM_KERN_MEMORY_FILE)) { - hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); - return (ENOMEM); - } - - auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ); - - while (offset < copysize) { - iosize = MIN(copysize - offset, iosize); - - uio_reset(auio, offset, UIO_SYSSPACE, UIO_READ); - uio_addiov(auio, (uintptr_t)bufp, iosize); - - error = cluster_read(vp, auio, copysize, IO_NOCACHE); - if (error) { - printf("hfs_clonefile: cluster_read failed - %d\n", error); - break; - } - if (uio_resid(auio) != 0) { - printf("hfs_clonefile: cluster_read: uio_resid = %lld\n", (int64_t)uio_resid(auio)); - error = EIO; - break; - } - - uio_reset(auio, writebase + offset, UIO_SYSSPACE, UIO_WRITE); - uio_addiov(auio, (uintptr_t)bufp, iosize); - - error = cluster_write(vp, auio, writebase + offset, - writebase + offset + iosize, - uio_offset(auio), 0, 
IO_NOCACHE | IO_SYNC); - if (error) { - printf("hfs_clonefile: cluster_write failed - %d\n", error); - break; - } - if (uio_resid(auio) != 0) { - printf("hfs_clonefile: cluster_write failed - uio_resid not zero\n"); - error = EIO; - break; - } - offset += iosize; - } - uio_free(auio); - - if ((blksize & PAGE_MASK)) { - /* - * since the copy may not have started on a PAGE - * boundary (or may not have ended on one), we - * may have pages left in the cache since NOCACHE - * will let partially written pages linger... - * lets just flush the entire range to make sure - * we don't have any pages left that are beyond - * (or intersect) the real LEOF of this file - */ - ubc_msync(vp, writebase, writebase + offset, NULL, UBC_INVALIDATE | UBC_PUSHDIRTY); - } else { - /* - * No need to call ubc_msync or hfs_invalbuf - * since the file was copied using IO_NOCACHE and - * the copy was done starting and ending on a page - * boundary in the file. - */ - } - kmem_free(kernel_map, (vm_offset_t)bufp, bufsize); - - hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); - return (error); -} - -/* - * Clone a system (metadata) file. 
- * - */ -static int -hfs_clonesysfile(struct vnode *vp, int blkstart, int blkcnt, int blksize, - kauth_cred_t cred, struct proc *p) -{ - caddr_t bufp; - char * offset; - size_t bufsize; - size_t iosize; - struct buf *bp = NULL; - daddr64_t blkno; - daddr64_t blk; - daddr64_t start_blk; - daddr64_t last_blk; - int breadcnt; - int i; - int error = 0; - - - iosize = GetLogicalBlockSize(vp); - bufsize = MIN(blkcnt * blksize, 1024 * 1024) & ~(iosize - 1); - breadcnt = bufsize / iosize; - - if (kmem_alloc(kernel_map, (vm_offset_t *)&bufp, bufsize, VM_KERN_MEMORY_FILE)) { - return (ENOMEM); - } - start_blk = ((daddr64_t)blkstart * blksize) / iosize; - last_blk = ((daddr64_t)blkcnt * blksize) / iosize; - blkno = 0; - - while (blkno < last_blk) { - /* - * Read up to a megabyte - */ - offset = bufp; - for (i = 0, blk = blkno; (i < breadcnt) && (blk < last_blk); ++i, ++blk) { - error = (int)buf_meta_bread(vp, blk, iosize, cred, &bp); - if (error) { - printf("hfs_clonesysfile: meta_bread error %d\n", error); - goto out; - } - if (buf_count(bp) != iosize) { - printf("hfs_clonesysfile: b_bcount is only %d\n", buf_count(bp)); - goto out; - } - bcopy((char *)buf_dataptr(bp), offset, iosize); - - buf_markinvalid(bp); - buf_brelse(bp); - bp = NULL; - - offset += iosize; - } - - /* - * Write up to a megabyte - */ - offset = bufp; - for (i = 0; (i < breadcnt) && (blkno < last_blk); ++i, ++blkno) { - bp = buf_getblk(vp, start_blk + blkno, iosize, 0, 0, BLK_META); - if (bp == NULL) { - printf("hfs_clonesysfile: getblk failed on blk %qd\n", start_blk + blkno); - error = EIO; - goto out; - } - bcopy(offset, (char *)buf_dataptr(bp), iosize); - error = (int)buf_bwrite(bp); - bp = NULL; - if (error) - goto out; - offset += iosize; - } - } -out: - if (bp) { - buf_brelse(bp); - } - - kmem_free(kernel_map, (vm_offset_t)bufp, bufsize); - - error = hfs_fsync(vp, MNT_WAIT, 0, p); - - return (error); -} - -errno_t hfs_flush_invalid_ranges(vnode_t vp) -{ - cnode_t *cp = VTOC(vp); - - 
assert(cp->c_lockowner == current_thread()); - assert(cp->c_truncatelockowner == current_thread()); - - if (!ISSET(cp->c_flag, C_ZFWANTSYNC) && !cp->c_zftimeout) - return 0; - - filefork_t *fp = VTOF(vp); - - /* - * We can't hold the cnode lock whilst we call cluster_write so we - * need to copy the extents into a local buffer. - */ - int max_exts = 16; - struct ext { - off_t start, end; - } exts_buf[max_exts]; // 256 bytes - struct ext *exts = exts_buf; - int ext_count = 0; - errno_t ret; - - struct rl_entry *r = TAILQ_FIRST(&fp->ff_invalidranges); - - while (r) { - /* If we have more than can fit in our stack buffer, switch - to a heap buffer. */ - if (exts == exts_buf && ext_count == max_exts) { - max_exts = 256; - MALLOC(exts, struct ext *, sizeof(struct ext) * max_exts, - M_TEMP, M_WAITOK); - memcpy(exts, exts_buf, ext_count * sizeof(struct ext)); - } - - struct rl_entry *next = TAILQ_NEXT(r, rl_link); - - exts[ext_count++] = (struct ext){ r->rl_start, r->rl_end }; - - if (!next || (ext_count == max_exts && exts != exts_buf)) { - hfs_unlock(cp); - for (int i = 0; i < ext_count; ++i) { - ret = cluster_write(vp, NULL, fp->ff_size, exts[i].end + 1, - exts[i].start, 0, - IO_HEADZEROFILL | IO_NOZERODIRTY | IO_NOCACHE); - if (ret) { - hfs_lock_always(cp, HFS_EXCLUSIVE_LOCK); - goto exit; - } - } - - if (!next) { - hfs_lock_always(cp, HFS_EXCLUSIVE_LOCK); - break; - } - - /* Push any existing clusters which should clean up our invalid - ranges as they go through hfs_vnop_blockmap. */ - cluster_push(vp, 0); - - hfs_lock_always(cp, HFS_EXCLUSIVE_LOCK); - - /* - * Get back to where we were (given we dropped the lock). - * This shouldn't be many because we pushed above. 
- */ - TAILQ_FOREACH(r, &fp->ff_invalidranges, rl_link) { - if (r->rl_end > exts[ext_count - 1].end) - break; - } - - ext_count = 0; - } else - r = next; - } - - ret = 0; - -exit: - - if (exts != exts_buf) - FREE(exts, M_TEMP); - - return ret; -} diff --git a/bsd/hfs/hfs_resize.c b/bsd/hfs/hfs_resize.c deleted file mode 100644 index f5dc27ad5..000000000 --- a/bsd/hfs/hfs_resize.c +++ /dev/null @@ -1,3486 +0,0 @@ -/* - * Copyright (c) 2013-2015 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "hfs.h" -#include "hfs_catalog.h" -#include "hfs_cnode.h" -#include "hfs_endian.h" -#include "hfs_btreeio.h" -#include "hfs_cprotect.h" - -/* Enable/disable debugging code for live volume resizing */ -int hfs_resize_debug = 0; - -static errno_t hfs_file_extent_overlaps(struct hfsmount *hfsmp, u_int32_t allocLimit, - struct HFSPlusCatalogFile *filerec, bool *overlaps); -static int hfs_reclaimspace(struct hfsmount *hfsmp, u_int32_t allocLimit, u_int32_t reclaimblks, vfs_context_t context); -static int hfs_extend_journal(struct hfsmount *hfsmp, u_int32_t sector_size, u_int64_t sector_count, vfs_context_t context); - -/* - * Extend a file system. - */ -int -hfs_extendfs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) -{ - struct proc *p = vfs_context_proc(context); - kauth_cred_t cred = vfs_context_ucred(context); - struct vnode *vp; - struct vnode *devvp; - struct buf *bp; - struct filefork *fp = NULL; - ExtendedVCB *vcb; - struct cat_fork forkdata; - u_int64_t oldsize; - u_int64_t newblkcnt; - u_int64_t prev_phys_block_count; - u_int32_t addblks; - u_int64_t sector_count; - u_int32_t sector_size; - u_int32_t phys_sector_size; - u_int32_t overage_blocks; - daddr64_t prev_fs_alt_sector; - daddr_t bitmapblks; - int lockflags = 0; - int error; - int64_t oldBitmapSize; - - Boolean usedExtendFileC = false; - int transaction_begun = 0; - - devvp = hfsmp->hfs_devvp; - vcb = HFSTOVCB(hfsmp); - - /* - * - HFS Plus file systems only. - * - Journaling must be enabled. - * - No embedded volumes. - */ - if ((vcb->vcbSigWord == kHFSSigWord) || - (hfsmp->jnl == NULL) || - (vcb->hfsPlusIOPosOffset != 0)) { - return (EPERM); - } - /* - * If extending file system by non-root, then verify - * ownership and check permissions. 
- */ - if (suser(cred, NULL)) { - error = hfs_vget(hfsmp, kHFSRootFolderID, &vp, 0, 0); - - if (error) - return (error); - error = hfs_owner_rights(hfsmp, VTOC(vp)->c_uid, cred, p, 0); - if (error == 0) { - error = hfs_write_access(vp, cred, p, false); - } - hfs_unlock(VTOC(vp)); - vnode_put(vp); - if (error) - return (error); - - error = vnode_authorize(devvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, context); - if (error) - return (error); - } - if (VNOP_IOCTL(devvp, DKIOCGETBLOCKSIZE, (caddr_t)§or_size, 0, context)) { - return (ENXIO); - } - if (sector_size != hfsmp->hfs_logical_block_size) { - return (ENXIO); - } - if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)§or_count, 0, context)) { - return (ENXIO); - } - /* Check if partition size is correct for new file system size */ - if ((sector_size * sector_count) < newsize) { - printf("hfs_extendfs: not enough space on device (vol=%s)\n", hfsmp->vcbVN); - return (ENOSPC); - } - error = VNOP_IOCTL(devvp, DKIOCGETPHYSICALBLOCKSIZE, (caddr_t)&phys_sector_size, 0, context); - if (error) { - if ((error != ENOTSUP) && (error != ENOTTY)) { - return (ENXIO); - } - /* If ioctl is not supported, force physical and logical sector size to be same */ - phys_sector_size = sector_size; - } - oldsize = (u_int64_t)hfsmp->totalBlocks * (u_int64_t)hfsmp->blockSize; - - /* - * Validate new size. 
- */ - if ((newsize <= oldsize) || (newsize % sector_size) || (newsize % phys_sector_size)) { - printf("hfs_extendfs: invalid size (newsize=%qu, oldsize=%qu)\n", newsize, oldsize); - return (EINVAL); - } - newblkcnt = newsize / vcb->blockSize; - if (newblkcnt > (u_int64_t)0xFFFFFFFF) { - printf ("hfs_extendfs: current blockSize=%u too small for newsize=%qu\n", hfsmp->blockSize, newsize); - return (EOVERFLOW); - } - - addblks = newblkcnt - vcb->totalBlocks; - - if (hfs_resize_debug) { - printf ("hfs_extendfs: old: size=%qu, blkcnt=%u\n", oldsize, hfsmp->totalBlocks); - printf ("hfs_extendfs: new: size=%qu, blkcnt=%u, addblks=%u\n", newsize, (u_int32_t)newblkcnt, addblks); - } - printf("hfs_extendfs: will extend \"%s\" by %d blocks\n", vcb->vcbVN, addblks); - - hfs_lock_mount (hfsmp); - if (hfsmp->hfs_flags & HFS_RESIZE_IN_PROGRESS) { - hfs_unlock_mount(hfsmp); - error = EALREADY; - goto out; - } - hfsmp->hfs_flags |= HFS_RESIZE_IN_PROGRESS; - hfs_unlock_mount (hfsmp); - - /* Start with a clean journal. */ - hfs_flush(hfsmp, HFS_FLUSH_JOURNAL_META); - - /* - * Enclose changes inside a transaction. - */ - if (hfs_start_transaction(hfsmp) != 0) { - error = EINVAL; - goto out; - } - transaction_begun = 1; - - - /* Update the hfsmp fields for the physical information about the device */ - prev_phys_block_count = hfsmp->hfs_logical_block_count; - prev_fs_alt_sector = hfsmp->hfs_fs_avh_sector; - - hfsmp->hfs_logical_block_count = sector_count; - hfsmp->hfs_logical_bytes = (uint64_t) sector_count * (uint64_t) sector_size; - - /* - * It is possible that the new file system is smaller than the partition size. - * Therefore, update offsets for AVH accordingly. 
- */ - if (hfs_resize_debug) { - printf ("hfs_extendfs: old: partition_avh_sector=%qu, fs_avh_sector=%qu\n", - hfsmp->hfs_partition_avh_sector, hfsmp->hfs_fs_avh_sector); - } - hfsmp->hfs_partition_avh_sector = (hfsmp->hfsPlusIOPosOffset / sector_size) + - HFS_ALT_SECTOR(sector_size, hfsmp->hfs_logical_block_count); - - hfsmp->hfs_fs_avh_sector = (hfsmp->hfsPlusIOPosOffset / sector_size) + - HFS_ALT_SECTOR(sector_size, (newsize/hfsmp->hfs_logical_block_size)); - if (hfs_resize_debug) { - printf ("hfs_extendfs: new: partition_avh_sector=%qu, fs_avh_sector=%qu\n", - hfsmp->hfs_partition_avh_sector, hfsmp->hfs_fs_avh_sector); - } - - /* - * Note: we take the attributes lock in case we have an attribute data vnode - * which needs to change size. - */ - lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE | SFL_EXTENTS | SFL_BITMAP, HFS_EXCLUSIVE_LOCK); - vp = vcb->allocationsRefNum; - fp = VTOF(vp); - bcopy(&fp->ff_data, &forkdata, sizeof(forkdata)); - - /* - * Calculate additional space required (if any) by allocation bitmap. - */ - oldBitmapSize = fp->ff_size; - bitmapblks = roundup((newblkcnt+7) / 8, vcb->vcbVBMIOSize) / vcb->blockSize; - if (bitmapblks > (daddr_t)fp->ff_blocks) - bitmapblks -= fp->ff_blocks; - else - bitmapblks = 0; - - /* - * The allocation bitmap can contain unused bits that are beyond end of - * current volume's allocation blocks. Usually they are supposed to be - * zero'ed out but there can be cases where they might be marked as used. - * After extending the file system, those bits can represent valid - * allocation blocks, so we mark all the bits from the end of current - * volume to end of allocation bitmap as "free". - * - * Figure out the number of overage blocks before proceeding though, - * so we don't add more bytes to our I/O than necessary. - * First figure out the total number of blocks representable by the - * end of the bitmap file vs. the total number of blocks in the new FS. 
- * Then subtract away the number of blocks in the current FS. This is how much - * we can mark as free right now without having to grow the bitmap file. - */ - overage_blocks = fp->ff_blocks * vcb->blockSize * 8; - overage_blocks = MIN (overage_blocks, newblkcnt); - overage_blocks -= vcb->totalBlocks; - - BlockMarkFreeUnused(vcb, vcb->totalBlocks, overage_blocks); - - if (bitmapblks > 0) { - daddr64_t blkno; - daddr_t blkcnt; - off_t bytesAdded; - - /* - * Get the bitmap's current size (in allocation blocks) so we know - * where to start zero filling once the new space is added. We've - * got to do this before the bitmap is grown. - */ - blkno = (daddr64_t)fp->ff_blocks; - - /* - * Try to grow the allocation file in the normal way, using allocation - * blocks already existing in the file system. This way, we might be - * able to grow the bitmap contiguously, or at least in the metadata - * zone. - */ - error = ExtendFileC(vcb, fp, bitmapblks * vcb->blockSize, 0, - kEFAllMask | kEFNoClumpMask | kEFReserveMask - | kEFMetadataMask | kEFContigMask, &bytesAdded); - - if (error == 0) { - usedExtendFileC = true; - } else { - /* - * If the above allocation failed, fall back to allocating the new - * extent of the bitmap from the space we're going to add. Since those - * blocks don't yet belong to the file system, we have to update the - * extent list directly, and manually adjust the file size. - */ - bytesAdded = 0; - error = AddFileExtent(vcb, fp, vcb->totalBlocks, bitmapblks); - if (error) { - printf("hfs_extendfs: error %d adding extents\n", error); - goto out; - } - fp->ff_blocks += bitmapblks; - VTOC(vp)->c_blocks = fp->ff_blocks; - VTOC(vp)->c_flag |= C_MODIFIED; - } - - /* - * Update the allocation file's size to include the newly allocated - * blocks. Note that ExtendFileC doesn't do this, which is why this - * statement is outside the above "if" statement. 
- */ - fp->ff_size += (u_int64_t)bitmapblks * (u_int64_t)vcb->blockSize; - - /* - * Zero out the new bitmap blocks. - */ - { - - bp = NULL; - blkcnt = bitmapblks; - while (blkcnt > 0) { - error = (int)buf_meta_bread(vp, blkno, vcb->blockSize, NOCRED, &bp); - if (error) { - if (bp) { - buf_brelse(bp); - } - break; - } - bzero((char *)buf_dataptr(bp), vcb->blockSize); - buf_markaged(bp); - error = (int)buf_bwrite(bp); - if (error) - break; - --blkcnt; - ++blkno; - } - } - if (error) { - printf("hfs_extendfs: error %d clearing blocks\n", error); - goto out; - } - /* - * Mark the new bitmap space as allocated. - * - * Note that ExtendFileC will have marked any blocks it allocated, so - * this is only needed if we used AddFileExtent. Also note that this - * has to come *after* the zero filling of new blocks in the case where - * we used AddFileExtent (since the part of the bitmap we're touching - * is in those newly allocated blocks). - */ - if (!usedExtendFileC) { - error = BlockMarkAllocated(vcb, vcb->totalBlocks, bitmapblks); - if (error) { - printf("hfs_extendfs: error %d setting bitmap\n", error); - goto out; - } - vcb->freeBlocks -= bitmapblks; - } - } - - /* - * Mark the new alternate VH as allocated. - */ - if (vcb->blockSize == 512) - error = BlockMarkAllocated(vcb, vcb->totalBlocks + addblks - 2, 2); - else - error = BlockMarkAllocated(vcb, vcb->totalBlocks + addblks - 1, 1); - if (error) { - printf("hfs_extendfs: error %d setting bitmap (VH)\n", error); - goto out; - } - - /* - * Mark the old alternate VH as free. - */ - if (vcb->blockSize == 512) - (void) BlockMarkFree(vcb, vcb->totalBlocks - 2, 2); - else - (void) BlockMarkFree(vcb, vcb->totalBlocks - 1, 1); - - /* - * Adjust file system variables for new space. 
- */ - vcb->totalBlocks += addblks; - vcb->freeBlocks += addblks; - MarkVCBDirty(vcb); - error = hfs_flushvolumeheader(hfsmp, HFS_FVH_WAIT | HFS_FVH_WRITE_ALT); - if (error) { - printf("hfs_extendfs: couldn't flush volume headers (%d)", error); - /* - * Restore to old state. - */ - if (usedExtendFileC) { - (void) TruncateFileC(vcb, fp, oldBitmapSize, 0, FORK_IS_RSRC(fp), - FTOC(fp)->c_fileid, false); - } else { - fp->ff_blocks -= bitmapblks; - fp->ff_size -= (u_int64_t)bitmapblks * (u_int64_t)vcb->blockSize; - /* - * No need to mark the excess blocks free since those bitmap blocks - * are no longer part of the bitmap. But we do need to undo the - * effect of the "vcb->freeBlocks -= bitmapblks" above. - */ - vcb->freeBlocks += bitmapblks; - } - vcb->totalBlocks -= addblks; - vcb->freeBlocks -= addblks; - hfsmp->hfs_logical_block_count = prev_phys_block_count; - hfsmp->hfs_fs_avh_sector = prev_fs_alt_sector; - /* Do not revert hfs_partition_avh_sector because the - * partition size is larger than file system size - */ - MarkVCBDirty(vcb); - if (vcb->blockSize == 512) { - if (BlockMarkAllocated(vcb, vcb->totalBlocks - 2, 2)) { - hfs_mark_inconsistent(hfsmp, HFS_ROLLBACK_FAILED); - } - } else { - if (BlockMarkAllocated(vcb, vcb->totalBlocks - 1, 1)) { - hfs_mark_inconsistent(hfsmp, HFS_ROLLBACK_FAILED); - } - } - goto out; - } - /* - * Invalidate the old alternate volume header. We are growing the filesystem so - * this sector must be returned to the FS as free space. 
- */ - bp = NULL; - if (prev_fs_alt_sector) { - if (buf_meta_bread(hfsmp->hfs_devvp, - HFS_PHYSBLK_ROUNDDOWN(prev_fs_alt_sector, hfsmp->hfs_log_per_phys), - hfsmp->hfs_physical_block_size, NOCRED, &bp) == 0) { - journal_modify_block_start(hfsmp->jnl, bp); - - bzero((char *)buf_dataptr(bp) + HFS_ALT_OFFSET(hfsmp->hfs_physical_block_size), kMDBSize); - - journal_modify_block_end(hfsmp->jnl, bp, NULL, NULL); - } else if (bp) { - buf_brelse(bp); - } - } - - /* - * Update the metadata zone size based on current volume size - */ - hfs_metadatazone_init(hfsmp, false); - - /* - * Adjust the size of hfsmp->hfs_attrdata_vp - */ - if (hfsmp->hfs_attrdata_vp) { - struct cnode *attr_cp; - struct filefork *attr_fp; - - if (vnode_get(hfsmp->hfs_attrdata_vp) == 0) { - attr_cp = VTOC(hfsmp->hfs_attrdata_vp); - attr_fp = VTOF(hfsmp->hfs_attrdata_vp); - - attr_cp->c_blocks = newblkcnt; - attr_fp->ff_blocks = newblkcnt; - attr_fp->ff_extents[0].blockCount = newblkcnt; - attr_fp->ff_size = (off_t) newblkcnt * hfsmp->blockSize; - ubc_setsize(hfsmp->hfs_attrdata_vp, attr_fp->ff_size); - vnode_put(hfsmp->hfs_attrdata_vp); - } - } - - /* - * We only update hfsmp->allocLimit if totalBlocks actually increased. - */ - if (error == 0) { - UpdateAllocLimit(hfsmp, hfsmp->totalBlocks); - } - - /* Release all locks and sync up journal content before - * checking and extending, if required, the journal - */ - if (lockflags) { - hfs_systemfile_unlock(hfsmp, lockflags); - lockflags = 0; - } - if (transaction_begun) { - hfs_end_transaction(hfsmp); - hfs_flush(hfsmp, HFS_FLUSH_JOURNAL_META); - transaction_begun = 0; - } - - /* Increase the journal size, if required. 
*/ - error = hfs_extend_journal(hfsmp, sector_size, sector_count, context); - if (error) { - printf ("hfs_extendfs: Could not extend journal size\n"); - goto out_noalloc; - } - - /* Log successful extending */ - printf("hfs_extendfs: extended \"%s\" to %d blocks (was %d blocks)\n", - hfsmp->vcbVN, hfsmp->totalBlocks, (u_int32_t)(oldsize/hfsmp->blockSize)); - -out: - if (error && fp) { - /* Restore allocation fork. */ - bcopy(&forkdata, &fp->ff_data, sizeof(forkdata)); - VTOC(vp)->c_blocks = fp->ff_blocks; - - } - -out_noalloc: - hfs_lock_mount (hfsmp); - hfsmp->hfs_flags &= ~HFS_RESIZE_IN_PROGRESS; - hfs_unlock_mount (hfsmp); - if (lockflags) { - hfs_systemfile_unlock(hfsmp, lockflags); - } - if (transaction_begun) { - hfs_end_transaction(hfsmp); - /* Just to be sure, sync all data to the disk */ - int flush_error = hfs_flush(hfsmp, HFS_FLUSH_FULL); - if (flush_error && !error) - error = flush_error; - } - if (error) { - printf ("hfs_extentfs: failed error=%d on vol=%s\n", MacToVFSError(error), hfsmp->vcbVN); - } - - return MacToVFSError(error); -} - -#define HFS_MIN_SIZE (32LL * 1024LL * 1024LL) - -/* - * Truncate a file system (while still mounted). - */ -int -hfs_truncatefs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) -{ - u_int64_t oldsize; - u_int32_t newblkcnt; - u_int32_t reclaimblks = 0; - int lockflags = 0; - int transaction_begun = 0; - Boolean updateFreeBlocks = false; - Boolean disable_sparse = false; - int error = 0; - - hfs_lock_mount (hfsmp); - if (hfsmp->hfs_flags & HFS_RESIZE_IN_PROGRESS) { - hfs_unlock_mount (hfsmp); - return (EALREADY); - } - hfsmp->hfs_flags |= HFS_RESIZE_IN_PROGRESS; - hfsmp->hfs_resize_blocksmoved = 0; - hfsmp->hfs_resize_totalblocks = 0; - hfsmp->hfs_resize_progress = 0; - hfs_unlock_mount (hfsmp); - - /* - * - Journaled HFS Plus volumes only. - * - No embedded volumes. 
- */ - if ((hfsmp->jnl == NULL) || - (hfsmp->hfsPlusIOPosOffset != 0)) { - error = EPERM; - goto out; - } - oldsize = (u_int64_t)hfsmp->totalBlocks * (u_int64_t)hfsmp->blockSize; - newblkcnt = newsize / hfsmp->blockSize; - reclaimblks = hfsmp->totalBlocks - newblkcnt; - - if (hfs_resize_debug) { - printf ("hfs_truncatefs: old: size=%qu, blkcnt=%u, freeblks=%u\n", oldsize, hfsmp->totalBlocks, hfs_freeblks(hfsmp, 1)); - printf ("hfs_truncatefs: new: size=%qu, blkcnt=%u, reclaimblks=%u\n", newsize, newblkcnt, reclaimblks); - } - - /* Make sure new size is valid. */ - if ((newsize < HFS_MIN_SIZE) || - (newsize >= oldsize) || - (newsize % hfsmp->hfs_logical_block_size) || - (newsize % hfsmp->hfs_physical_block_size)) { - printf ("hfs_truncatefs: invalid size (newsize=%qu, oldsize=%qu)\n", newsize, oldsize); - error = EINVAL; - goto out; - } - - /* - * Make sure that the file system has enough free blocks reclaim. - * - * Before resize, the disk is divided into four zones - - * A. Allocated_Stationary - These are allocated blocks that exist - * before the new end of disk. These blocks will not be - * relocated or modified during resize. - * B. Free_Stationary - These are free blocks that exist before the - * new end of disk. These blocks can be used for any new - * allocations during resize, including allocation for relocating - * data from the area of disk being reclaimed. - * C. Allocated_To-Reclaim - These are allocated blocks that exist - * beyond the new end of disk. These blocks need to be reclaimed - * during resize by allocating equal number of blocks in Free - * Stationary zone and copying the data. - * D. Free_To-Reclaim - These are free blocks that exist beyond the - * new end of disk. Nothing special needs to be done to reclaim - * them. 
- * - * Total number of blocks on the disk before resize: - * ------------------------------------------------ - * Total Blocks = Allocated_Stationary + Free_Stationary + - * Allocated_To-Reclaim + Free_To-Reclaim - * - * Total number of blocks that need to be reclaimed: - * ------------------------------------------------ - * Blocks to Reclaim = Allocated_To-Reclaim + Free_To-Reclaim - * - * Note that the check below also makes sure that we have enough space - * to relocate data from Allocated_To-Reclaim to Free_Stationary. - * Therefore we do not need to check total number of blocks to relocate - * later in the code. - * - * The condition below gets converted to: - * - * Allocated To-Reclaim + Free To-Reclaim >= Free Stationary + Free To-Reclaim - * - * which is equivalent to: - * - * Allocated To-Reclaim >= Free Stationary - */ - if (reclaimblks >= hfs_freeblks(hfsmp, 1)) { - printf("hfs_truncatefs: insufficient space (need %u blocks; have %u free blocks)\n", reclaimblks, hfs_freeblks(hfsmp, 1)); - error = ENOSPC; - goto out; - } - - /* Start with a clean journal. */ - hfs_flush(hfsmp, HFS_FLUSH_JOURNAL_META); - - if (hfs_start_transaction(hfsmp) != 0) { - error = EINVAL; - goto out; - } - transaction_begun = 1; - - /* Take the bitmap lock to update the alloc limit field */ - lockflags = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_EXCLUSIVE_LOCK); - - /* - * Prevent new allocations from using the part we're trying to truncate. - * - * NOTE: allocLimit is set to the allocation block number where the new - * alternate volume header will be. That way there will be no files to - * interfere with allocating the new alternate volume header, and no files - * in the allocation blocks beyond (i.e. the blocks we're trying to - * truncate away. 
- */ - if (hfsmp->blockSize == 512) { - error = UpdateAllocLimit (hfsmp, newblkcnt - 2); - } - else { - error = UpdateAllocLimit (hfsmp, newblkcnt - 1); - } - - /* Sparse devices use first fit allocation which is not ideal - * for volume resize which requires best fit allocation. If a - * sparse device is being truncated, disable the sparse device - * property temporarily for the duration of resize. Also reset - * the free extent cache so that it is rebuilt as sorted by - * totalBlocks instead of startBlock. - * - * Note that this will affect all allocations on the volume and - * ideal fix would be just to modify resize-related allocations, - * but it will result in complexity like handling of two free - * extent caches sorted differently, etc. So we stick to this - * solution for now. - */ - hfs_lock_mount (hfsmp); - if (hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) { - hfsmp->hfs_flags &= ~HFS_HAS_SPARSE_DEVICE; - ResetVCBFreeExtCache(hfsmp); - disable_sparse = true; - } - - /* - * Update the volume free block count to reflect the total number - * of free blocks that will exist after a successful resize. - * Relocation of extents will result in no net change in the total - * free space on the disk. Therefore the code that allocates - * space for new extent and deallocates the old extent explicitly - * prevents updating the volume free block count. It will also - * prevent false disk full error when the number of blocks in - * an extent being relocated is more than the free blocks that - * will exist after the volume is resized. - */ - hfsmp->reclaimBlocks = reclaimblks; - hfsmp->freeBlocks -= reclaimblks; - updateFreeBlocks = true; - hfs_unlock_mount(hfsmp); - - if (lockflags) { - hfs_systemfile_unlock(hfsmp, lockflags); - lockflags = 0; - } - - /* - * Update the metadata zone size to match the new volume size, - * and if it too less, metadata zone might be disabled. 
- */ - hfs_metadatazone_init(hfsmp, false); - - /* - * If some files have blocks at or beyond the location of the - * new alternate volume header, recalculate free blocks and - * reclaim blocks. Otherwise just update free blocks count. - * - * The current allocLimit is set to the location of new alternate - * volume header, and reclaimblks are the total number of blocks - * that need to be reclaimed. So the check below is really - * ignoring the blocks allocated for old alternate volume header. - */ - if (hfs_isallocated(hfsmp, hfsmp->allocLimit, reclaimblks)) { - /* - * hfs_reclaimspace will use separate transactions when - * relocating files (so we don't overwhelm the journal). - */ - hfs_end_transaction(hfsmp); - transaction_begun = 0; - - /* Attempt to reclaim some space. */ - error = hfs_reclaimspace(hfsmp, hfsmp->allocLimit, reclaimblks, context); - if (error != 0) { - printf("hfs_truncatefs: couldn't reclaim space on %s (error=%d)\n", hfsmp->vcbVN, error); - error = ENOSPC; - goto out; - } - - if (hfs_start_transaction(hfsmp) != 0) { - error = EINVAL; - goto out; - } - transaction_begun = 1; - - /* Check if we're clear now. */ - error = hfs_isallocated(hfsmp, hfsmp->allocLimit, reclaimblks); - if (error != 0) { - printf("hfs_truncatefs: didn't reclaim enough space on %s (error=%d)\n", hfsmp->vcbVN, error); - error = EAGAIN; /* tell client to try again */ - goto out; - } - } - - /* - * Note: we take the attributes lock in case we have an attribute data vnode - * which needs to change size. - */ - lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE | SFL_EXTENTS | SFL_BITMAP, HFS_EXCLUSIVE_LOCK); - - /* - * Allocate last 1KB for alternate volume header. - */ - error = BlockMarkAllocated(hfsmp, hfsmp->allocLimit, (hfsmp->blockSize == 512) ? 2 : 1); - if (error) { - printf("hfs_truncatefs: Error %d allocating new alternate volume header\n", error); - goto out; - } - - /* - * Mark the old alternate volume header as free. 
- * We don't bother shrinking allocation bitmap file. - */ - if (hfsmp->blockSize == 512) - (void) BlockMarkFree(hfsmp, hfsmp->totalBlocks - 2, 2); - else - (void) BlockMarkFree(hfsmp, hfsmp->totalBlocks - 1, 1); - - /* Don't invalidate the old AltVH yet. It is still valid until the partition size is updated ! */ - - /* Log successful shrinking. */ - printf("hfs_truncatefs: shrank \"%s\" to %d blocks (was %d blocks)\n", - hfsmp->vcbVN, newblkcnt, hfsmp->totalBlocks); - - /* - * Adjust file system variables and flush them to disk. - * - * Note that although the logical block size is updated here, it is only - * done for the benefit/convenience of the partition management software. The - * logical block count change has not yet actually been propagated to - * the disk device yet (and we won't get any notification when it does). - */ - hfsmp->totalBlocks = newblkcnt; - hfsmp->hfs_logical_block_count = newsize / hfsmp->hfs_logical_block_size; - hfsmp->hfs_logical_bytes = (uint64_t) hfsmp->hfs_logical_block_count * (uint64_t) hfsmp->hfs_logical_block_size; - hfsmp->reclaimBlocks = 0; - - /* - * At this point, a smaller HFS file system exists in a larger volume. - * As per volume format, the alternate volume header is located 1024 bytes - * before end of the partition. So, until the partition is also resized, - * a valid alternate volume header will need to be updated at 1024 bytes - * before end of the volume. Under normal circumstances, a file system - * resize is always followed by a volume resize, so we also need to - * write a copy of the new alternate volume header at 1024 bytes before - * end of the new file system. - */ - if (hfs_resize_debug) { - printf ("hfs_truncatefs: old: partition_avh_sector=%qu, fs_avh_sector=%qu\n", - hfsmp->hfs_partition_avh_sector, hfsmp->hfs_fs_avh_sector); - } - hfsmp->hfs_fs_avh_sector = HFS_ALT_SECTOR(hfsmp->hfs_logical_block_size, hfsmp->hfs_logical_block_count); - /* Note hfs_partition_avh_sector stays unchanged! 
partition size has not yet been modified */ - if (hfs_resize_debug) { - printf ("hfs_truncatefs: new: partition_avh_sector=%qu, fs_avh_sector=%qu\n", - hfsmp->hfs_partition_avh_sector, hfsmp->hfs_fs_avh_sector); - } - - MarkVCBDirty(hfsmp); - error = hfs_flushvolumeheader(hfsmp, HFS_FVH_WAIT | HFS_FVH_WRITE_ALT); - if (error) { - panic("hfs_truncatefs: unexpected error flushing volume header (%d)\n", error); - } - - /* - * Adjust the size of hfsmp->hfs_attrdata_vp - */ - if (hfsmp->hfs_attrdata_vp) { - struct cnode *cp; - struct filefork *fp; - - if (vnode_get(hfsmp->hfs_attrdata_vp) == 0) { - cp = VTOC(hfsmp->hfs_attrdata_vp); - fp = VTOF(hfsmp->hfs_attrdata_vp); - - cp->c_blocks = newblkcnt; - fp->ff_blocks = newblkcnt; - fp->ff_extents[0].blockCount = newblkcnt; - fp->ff_size = (off_t) newblkcnt * hfsmp->blockSize; - ubc_setsize(hfsmp->hfs_attrdata_vp, fp->ff_size); - vnode_put(hfsmp->hfs_attrdata_vp); - } - } - -out: - /* - * Update the allocLimit to acknowledge the last one or two blocks now. - * Add it to the tree as well if necessary. - */ - UpdateAllocLimit (hfsmp, hfsmp->totalBlocks); - - hfs_lock_mount (hfsmp); - if (disable_sparse == true) { - /* Now that resize is completed, set the volume to be sparse - * device again so that all further allocations will be first - * fit instead of best fit. Reset free extent cache so that - * it is rebuilt. 
- */ - hfsmp->hfs_flags |= HFS_HAS_SPARSE_DEVICE; - ResetVCBFreeExtCache(hfsmp); - } - - if (error && (updateFreeBlocks == true)) { - hfsmp->freeBlocks += reclaimblks; - } - hfsmp->reclaimBlocks = 0; - - if (hfsmp->nextAllocation >= hfsmp->allocLimit) { - hfsmp->nextAllocation = hfsmp->hfs_metazone_end + 1; - } - hfsmp->hfs_flags &= ~HFS_RESIZE_IN_PROGRESS; - hfs_unlock_mount (hfsmp); - - /* On error, reset the metadata zone for original volume size */ - if (error && (updateFreeBlocks == true)) { - hfs_metadatazone_init(hfsmp, false); - } - - if (lockflags) { - hfs_systemfile_unlock(hfsmp, lockflags); - } - if (transaction_begun) { - hfs_end_transaction(hfsmp); - /* Just to be sure, sync all data to the disk */ - int flush_error = hfs_flush(hfsmp, HFS_FLUSH_FULL); - if (flush_error && !error) - error = flush_error; - } - - if (error) { - printf ("hfs_truncatefs: failed error=%d on vol=%s\n", MacToVFSError(error), hfsmp->vcbVN); - } - - return MacToVFSError(error); -} - - -/* - * Invalidate the physical block numbers associated with buffer cache blocks - * in the given extent of the given vnode. - */ -struct hfs_inval_blk_no { - daddr64_t sectorStart; - daddr64_t sectorCount; -}; -static int -hfs_invalidate_block_numbers_callback(buf_t bp, void *args_in) -{ - daddr64_t blkno; - struct hfs_inval_blk_no *args; - - blkno = buf_blkno(bp); - args = args_in; - - if (blkno >= args->sectorStart && blkno < args->sectorStart+args->sectorCount) - buf_setblkno(bp, buf_lblkno(bp)); - - return BUF_RETURNED; -} -static void -hfs_invalidate_sectors(struct vnode *vp, daddr64_t sectorStart, daddr64_t sectorCount) -{ - struct hfs_inval_blk_no args; - args.sectorStart = sectorStart; - args.sectorCount = sectorCount; - - buf_iterate(vp, hfs_invalidate_block_numbers_callback, BUF_SCAN_DIRTY|BUF_SCAN_CLEAN, &args); -} - - -/* - * Copy the contents of an extent to a new location. 
Also invalidates the - * physical block number of any buffer cache block in the copied extent - * (so that if the block is written, it will go through VNOP_BLOCKMAP to - * determine the new physical block number). - * - * At this point, for regular files, we hold the truncate lock exclusive - * and the cnode lock exclusive. - */ -static int -hfs_copy_extent( - struct hfsmount *hfsmp, - struct vnode *vp, /* The file whose extent is being copied. */ - u_int32_t oldStart, /* The start of the source extent. */ - u_int32_t newStart, /* The start of the destination extent. */ - u_int32_t blockCount, /* The number of allocation blocks to copy. */ - __unused vfs_context_t context) -{ - int err = 0; - size_t bufferSize; - void *buffer = NULL; - struct vfsioattr ioattr; - buf_t bp = NULL; - off_t resid; - size_t ioSize; - u_int32_t ioSizeSectors; /* Device sectors in this I/O */ - daddr64_t srcSector, destSector; - u_int32_t sectorsPerBlock = hfsmp->blockSize / hfsmp->hfs_logical_block_size; -#if CONFIG_PROTECT - int cpenabled = 0; -#endif - - /* - * Sanity check that we have locked the vnode of the file we're copying. - * - * But since hfs_systemfile_lock() doesn't actually take the lock on - * the allocation file if a journal is active, ignore the check if the - * file being copied is the allocation file. - */ - struct cnode *cp = VTOC(vp); - if (cp != hfsmp->hfs_allocation_cp && cp->c_lockowner != current_thread()) - panic("hfs_copy_extent: vp=%p (cp=%p) not owned?\n", vp, cp); - -#if CONFIG_PROTECT - /* - * Prepare the CP blob and get it ready for use, if necessary. - * - * Note that we specifically *exclude* system vnodes (catalog, bitmap, extents, EAs), - * because they are implicitly protected via the media key on iOS. As such, they - * must not be relocated except with the media key. So it is OK to not pass down - * a special cpentry to the IOMedia/LwVM code for handling. 
- */ - if (!vnode_issystem (vp) && vnode_isreg(vp) && cp_fs_protected (hfsmp->hfs_mp)) { - cpenabled = 1; - } -#endif - - /* - * Determine the I/O size to use - * - * NOTE: Many external drives will result in an ioSize of 128KB. - * TODO: Should we use a larger buffer, doing several consecutive - * reads, then several consecutive writes? - */ - vfs_ioattr(hfsmp->hfs_mp, &ioattr); - bufferSize = MIN(ioattr.io_maxreadcnt, ioattr.io_maxwritecnt); - if (kmem_alloc(kernel_map, (vm_offset_t*) &buffer, bufferSize, VM_KERN_MEMORY_FILE)) - return ENOMEM; - - /* Get a buffer for doing the I/O */ - bp = buf_alloc(hfsmp->hfs_devvp); - buf_setdataptr(bp, (uintptr_t)buffer); - - resid = (off_t) blockCount * (off_t) hfsmp->blockSize; - srcSector = (daddr64_t) oldStart * hfsmp->blockSize / hfsmp->hfs_logical_block_size; - destSector = (daddr64_t) newStart * hfsmp->blockSize / hfsmp->hfs_logical_block_size; - while (resid > 0) { - ioSize = MIN(bufferSize, (size_t) resid); - ioSizeSectors = ioSize / hfsmp->hfs_logical_block_size; - - /* Prepare the buffer for reading */ - buf_reset(bp, B_READ); - buf_setsize(bp, ioSize); - buf_setcount(bp, ioSize); - buf_setblkno(bp, srcSector); - buf_setlblkno(bp, srcSector); - - /* - * Note that because this is an I/O to the device vp - * it is correct to have lblkno and blkno both point to the - * start sector being read from. If it were being issued against the - * underlying file then that would be different. 
- */ - - /* Attach the new CP blob to the buffer if needed */ -#if CONFIG_PROTECT - if (cpenabled) { - /* attach the RELOCATION_INFLIGHT flag for the underlying call to VNOP_STRATEGY */ - cp->c_cpentry->cp_flags |= CP_RELOCATION_INFLIGHT; - bufattr_setcpx(buf_attr(bp), hfsmp->hfs_resize_cpx); - - /* Initialize the content protection file offset to start at 0 */ - buf_setcpoff (bp, 0); - } -#endif - - /* Do the read */ - err = VNOP_STRATEGY(bp); - if (!err) - err = buf_biowait(bp); - if (err) { -#if CONFIG_PROTECT - /* Turn the flag off in error cases. */ - if (cpenabled) { - cp->c_cpentry->cp_flags &= ~CP_RELOCATION_INFLIGHT; - } -#endif - printf("hfs_copy_extent: Error %d from VNOP_STRATEGY (read)\n", err); - break; - } - - /* Prepare the buffer for writing */ - buf_reset(bp, B_WRITE); - buf_setsize(bp, ioSize); - buf_setcount(bp, ioSize); - buf_setblkno(bp, destSector); - buf_setlblkno(bp, destSector); - if (vnode_issystem(vp) && journal_uses_fua(hfsmp->jnl)) - buf_markfua(bp); - -#if CONFIG_PROTECT - /* Attach the CP to the buffer if needed */ - if (cpenabled) { - bufattr_setcpx(buf_attr(bp), hfsmp->hfs_resize_cpx); - /* - * The last STRATEGY call may have updated the cp file offset behind our - * back, so we cannot trust it. Re-initialize the content protection - * file offset back to 0 before initiating the write portion of this I/O. - */ - buf_setcpoff (bp, 0); - } -#endif - - /* Do the write */ - vnode_startwrite(hfsmp->hfs_devvp); - err = VNOP_STRATEGY(bp); - if (!err) { - err = buf_biowait(bp); - } -#if CONFIG_PROTECT - /* Turn the flag off regardless once the strategy call finishes. 
*/ - if (cpenabled) { - cp->c_cpentry->cp_flags &= ~CP_RELOCATION_INFLIGHT; - } -#endif - if (err) { - printf("hfs_copy_extent: Error %d from VNOP_STRATEGY (write)\n", err); - break; - } - - resid -= ioSize; - srcSector += ioSizeSectors; - destSector += ioSizeSectors; - } - if (bp) - buf_free(bp); - if (buffer) - kmem_free(kernel_map, (vm_offset_t)buffer, bufferSize); - - /* Make sure all writes have been flushed to disk. */ - if (vnode_issystem(vp) && !journal_uses_fua(hfsmp->jnl)) { - - err = hfs_flush(hfsmp, HFS_FLUSH_CACHE); - if (err) { - printf("hfs_copy_extent: hfs_flush failed (%d)\n", err); - err = 0; /* Don't fail the copy. */ - } - } - - if (!err) - hfs_invalidate_sectors(vp, (daddr64_t)oldStart*sectorsPerBlock, (daddr64_t)blockCount*sectorsPerBlock); - - return err; -} - - -/* Structure to store state of reclaiming extents from a - * given file. hfs_reclaim_file()/hfs_reclaim_xattr() - * initializes the values in this structure which are then - * used by code that reclaims and splits the extents. 
- */ -struct hfs_reclaim_extent_info { - struct vnode *vp; - u_int32_t fileID; - u_int8_t forkType; - u_int8_t is_dirlink; /* Extent belongs to directory hard link */ - u_int8_t is_sysfile; /* Extent belongs to system file */ - u_int8_t is_xattr; /* Extent belongs to extent-based xattr */ - u_int8_t extent_index; - int lockflags; /* Locks that reclaim and split code should grab before modifying the extent record */ - u_int32_t blocks_relocated; /* Total blocks relocated for this file till now */ - u_int32_t recStartBlock; /* File allocation block number (FABN) for current extent record */ - u_int32_t cur_blockCount; /* Number of allocation blocks that have been checked for reclaim */ - struct filefork *catalog_fp; /* If non-NULL, extent is from catalog record */ - union record { - HFSPlusExtentRecord overflow;/* Extent record from overflow extents btree */ - HFSPlusAttrRecord xattr; /* Attribute record for large EAs */ - } record; - HFSPlusExtentDescriptor *extents; /* Pointer to current extent record being processed. - * For catalog extent record, points to the correct - * extent information in filefork. For overflow extent - * record, or xattr record, points to extent record - * in the structure above - */ - struct cat_desc *dirlink_desc; - struct cat_attr *dirlink_attr; - struct filefork *dirlink_fork; /* For directory hard links, fp points actually to this */ - struct BTreeIterator *iterator; /* Shared read/write iterator, hfs_reclaim_file/xattr() - * use it for reading and hfs_reclaim_extent()/hfs_split_extent() - * use it for writing updated extent record - */ - struct FSBufferDescriptor btdata; /* Shared btdata for reading/writing extent record, same as iterator above */ - u_int16_t recordlen; - int overflow_count; /* For debugging, counter for overflow extent record */ - FCB *fcb; /* Pointer to the current btree being traversed */ -}; - -/* - * Split the current extent into two extents, with first extent - * to contain given number of allocation blocks. 
Splitting of - * extent creates one new extent entry which can result in - * shifting of many entries through all the extent records of a - * file, and/or creating a new extent record in the overflow - * extent btree. - * - * Example: - * The diagram below represents two consecutive extent records, - * for simplicity, lets call them record X and X+1 respectively. - * Interesting extent entries have been denoted by letters. - * If the letter is unchanged before and after split, it means - * that the extent entry was not modified during the split. - * A '.' means that the entry remains unchanged after the split - * and is not relevant for our example. A '0' means that the - * extent entry is empty. - * - * If there isn't sufficient contiguous free space to relocate - * an extent (extent "C" below), we will have to break the one - * extent into multiple smaller extents, and relocate each of - * the smaller extents individually. The way we do this is by - * finding the largest contiguous free space that is currently - * available (N allocation blocks), and then convert extent "C" - * into two extents, C1 and C2, that occupy exactly the same - * allocation blocks as extent C. Extent C1 is the first - * N allocation blocks of extent C, and extent C2 is the remainder - * of extent C. Then we can relocate extent C1 since we know - * we have enough contiguous free space to relocate it in its - * entirety. We then repeat the process starting with extent C2. - * - * In record X, only the entries following entry C are shifted, and - * the original entry C is replaced with two entries C1 and C2 which - * are actually two extent entries for contiguous allocation blocks. - * - * Note that the entry E from record X is shifted into record X+1 as - * the new first entry. Since the first entry of record X+1 is updated, - * the FABN will also get updated with the blockCount of entry E. - * This also results in shifting of all extent entries in record X+1. 
- * Note that the number of empty entries after the split has been - * changed from 3 to 2. - * - * Before: - * record X record X+1 - * ---------------------===--------- --------------------------------- - * | A | . | . | . | B | C | D | E | | F | . | . | . | G | 0 | 0 | 0 | - * ---------------------===--------- --------------------------------- - * - * After: - * ---------------------=======----- --------------------------------- - * | A | . | . | . | B | C1| C2| D | | E | F | . | . | . | G | 0 | 0 | - * ---------------------=======----- --------------------------------- - * - * C1.startBlock = C.startBlock - * C1.blockCount = N - * - * C2.startBlock = C.startBlock + N - * C2.blockCount = C.blockCount - N - * - * FABN = old FABN - E.blockCount - * - * Inputs: - * extent_info - This is the structure that contains state about - * the current file, extent, and extent record that - * is being relocated. This structure is shared - * among code that traverses through all the extents - * of the file, code that relocates extents, and - * code that splits the extent. - * newBlockCount - The blockCount of the extent to be split after - * successfully split operation. - * Output: - * Zero on success, non-zero on failure. 
- */ -static int -hfs_split_extent(struct hfs_reclaim_extent_info *extent_info, uint32_t newBlockCount) -{ - int error = 0; - int index = extent_info->extent_index; - int i; - HFSPlusExtentDescriptor shift_extent; /* Extent entry that should be shifted into next extent record */ - HFSPlusExtentDescriptor last_extent; - HFSPlusExtentDescriptor *extents; /* Pointer to current extent record being manipulated */ - HFSPlusExtentRecord *extents_rec = NULL; - HFSPlusExtentKey *extents_key = NULL; - HFSPlusAttrRecord *xattr_rec = NULL; - HFSPlusAttrKey *xattr_key = NULL; - struct BTreeIterator iterator; - struct FSBufferDescriptor btdata; - uint16_t reclen; - uint32_t read_recStartBlock; /* Starting allocation block number to read old extent record */ - uint32_t write_recStartBlock; /* Starting allocation block number to insert newly updated extent record */ - Boolean create_record = false; - Boolean is_xattr; - struct cnode *cp; - - is_xattr = extent_info->is_xattr; - extents = extent_info->extents; - cp = VTOC(extent_info->vp); - - if (newBlockCount == 0) { - if (hfs_resize_debug) { - printf ("hfs_split_extent: No splitting required for newBlockCount=0\n"); - } - return error; - } - - if (hfs_resize_debug) { - printf ("hfs_split_extent: Split record:%u recStartBlock=%u %u:(%u,%u) for %u blocks\n", extent_info->overflow_count, extent_info->recStartBlock, index, extents[index].startBlock, extents[index].blockCount, newBlockCount); - } - - /* Extents overflow btree can not have more than 8 extents. - * No split allowed if the 8th extent is already used. - */ - if ((extent_info->fileID == kHFSExtentsFileID) && (extents[kHFSPlusExtentDensity - 1].blockCount != 0)) { - printf ("hfs_split_extent: Maximum 8 extents allowed for extents overflow btree, cannot split further.\n"); - error = ENOSPC; - goto out; - } - - /* Determine the starting allocation block number for the following - * overflow extent record, if any, before the current record - * gets modified. 
- */ - read_recStartBlock = extent_info->recStartBlock; - for (i = 0; i < kHFSPlusExtentDensity; i++) { - if (extents[i].blockCount == 0) { - break; - } - read_recStartBlock += extents[i].blockCount; - } - - /* Shift and split */ - if (index == kHFSPlusExtentDensity-1) { - /* The new extent created after split will go into following overflow extent record */ - shift_extent.startBlock = extents[index].startBlock + newBlockCount; - shift_extent.blockCount = extents[index].blockCount - newBlockCount; - - /* Last extent in the record will be split, so nothing to shift */ - } else { - /* Splitting of extents can result in at most of one - * extent entry to be shifted into following overflow extent - * record. So, store the last extent entry for later. - */ - shift_extent = extents[kHFSPlusExtentDensity-1]; - if ((hfs_resize_debug) && (shift_extent.blockCount != 0)) { - printf ("hfs_split_extent: Save 7:(%u,%u) to shift into overflow record\n", shift_extent.startBlock, shift_extent.blockCount); - } - - /* Start shifting extent information from the end of the extent - * record to the index where we want to insert the new extent. - * Note that kHFSPlusExtentDensity-1 is already saved above, and - * does not need to be shifted. The extent entry that is being - * split does not get shifted. - */ - for (i = kHFSPlusExtentDensity-2; i > index; i--) { - if (hfs_resize_debug) { - if (extents[i].blockCount) { - printf ("hfs_split_extent: Shift %u:(%u,%u) to %u:(%u,%u)\n", i, extents[i].startBlock, extents[i].blockCount, i+1, extents[i].startBlock, extents[i].blockCount); - } - } - extents[i+1] = extents[i]; - } - } - - if (index == kHFSPlusExtentDensity-1) { - /* The second half of the extent being split will be the overflow - * entry that will go into following overflow extent record. The - * value has been stored in 'shift_extent' above, so there is - * nothing to be done here. 
- */ - } else { - /* Update the values in the second half of the extent being split - * before updating the first half of the split. Note that the - * extent to split or first half of the split is at index 'index' - * and a new extent or second half of the split will be inserted at - * 'index+1' or into following overflow extent record. - */ - extents[index+1].startBlock = extents[index].startBlock + newBlockCount; - extents[index+1].blockCount = extents[index].blockCount - newBlockCount; - } - /* Update the extent being split, only the block count will change */ - extents[index].blockCount = newBlockCount; - - if (hfs_resize_debug) { - printf ("hfs_split_extent: Split %u:(%u,%u) and ", index, extents[index].startBlock, extents[index].blockCount); - if (index != kHFSPlusExtentDensity-1) { - printf ("%u:(%u,%u)\n", index+1, extents[index+1].startBlock, extents[index+1].blockCount); - } else { - printf ("overflow:(%u,%u)\n", shift_extent.startBlock, shift_extent.blockCount); - } - } - - /* Write out information about the newly split extent to the disk */ - if (extent_info->catalog_fp) { - /* (extent_info->catalog_fp != NULL) means the newly split - * extent exists in the catalog record. This means that - * the cnode was updated. Therefore, to write out the changes, - * mark the cnode as modified. We cannot call hfs_update() - * in this function because the caller hfs_reclaim_extent() - * is holding the catalog lock currently. 
- */ - cp->c_flag |= C_MODIFIED; - } else { - /* The newly split extent is for large EAs or is in overflow - * extent record, so update it directly in the btree using the - * iterator information from the shared extent_info structure - */ - error = BTReplaceRecord(extent_info->fcb, extent_info->iterator, - &(extent_info->btdata), extent_info->recordlen); - if (error) { - printf ("hfs_split_extent: fileID=%u BTReplaceRecord returned error=%d\n", extent_info->fileID, error); - goto out; - } - } - - /* No extent entry to be shifted into another extent overflow record */ - if (shift_extent.blockCount == 0) { - if (hfs_resize_debug) { - printf ("hfs_split_extent: No extent entry to be shifted into overflow records\n"); - } - error = 0; - goto out; - } - - /* The overflow extent entry has to be shifted into an extent - * overflow record. This means that we might have to shift - * extent entries from all subsequent overflow records by one. - * We start iteration from the first record to the last record, - * and shift the extent entry from one record to another. - * We might have to create a new extent record for the last - * extent entry for the file. - */ - - /* Initialize iterator to search the next record */ - bzero(&iterator, sizeof(iterator)); - if (is_xattr) { - /* Copy the key from the iterator that was used to update the modified attribute record. 
*/ - xattr_key = (HFSPlusAttrKey *)&(iterator.key); - bcopy((HFSPlusAttrKey *)&(extent_info->iterator->key), xattr_key, sizeof(HFSPlusAttrKey)); - /* Note: xattr_key->startBlock will be initialized later in the iteration loop */ - - MALLOC(xattr_rec, HFSPlusAttrRecord *, - sizeof(HFSPlusAttrRecord), M_TEMP, M_WAITOK); - if (xattr_rec == NULL) { - error = ENOMEM; - goto out; - } - btdata.bufferAddress = xattr_rec; - btdata.itemSize = sizeof(HFSPlusAttrRecord); - btdata.itemCount = 1; - extents = xattr_rec->overflowExtents.extents; - } else { - /* Initialize the extent key for the current file */ - extents_key = (HFSPlusExtentKey *) &(iterator.key); - extents_key->keyLength = kHFSPlusExtentKeyMaximumLength; - extents_key->forkType = extent_info->forkType; - extents_key->fileID = extent_info->fileID; - /* Note: extents_key->startBlock will be initialized later in the iteration loop */ - - MALLOC(extents_rec, HFSPlusExtentRecord *, - sizeof(HFSPlusExtentRecord), M_TEMP, M_WAITOK); - if (extents_rec == NULL) { - error = ENOMEM; - goto out; - } - btdata.bufferAddress = extents_rec; - btdata.itemSize = sizeof(HFSPlusExtentRecord); - btdata.itemCount = 1; - extents = extents_rec[0]; - } - - /* The overflow extent entry has to be shifted into an extent - * overflow record. This means that we might have to shift - * extent entries from all subsequent overflow records by one. - * We start iteration from the first record to the last record, - * examine one extent record in each iteration and shift one - * extent entry from one record to another. We might have to - * create a new extent record for the last extent entry for the - * file. - * - * If shift_extent.blockCount is non-zero, it means that there is - * an extent entry that needs to be shifted into the next - * overflow extent record. We keep on going till there are no such - * entries left to be shifted. 
This will also change the starting - * allocation block number of the extent record which is part of - * the key for the extent record in each iteration. Note that - * because the extent record key is changing while we are searching, - * the record can not be updated directly, instead it has to be - * deleted and inserted again. - */ - while (shift_extent.blockCount) { - if (hfs_resize_debug) { - printf ("hfs_split_extent: Will shift (%u,%u) into overflow record with startBlock=%u\n", shift_extent.startBlock, shift_extent.blockCount, read_recStartBlock); - } - - /* Search if there is any existing overflow extent record - * that matches the current file and the logical start block - * number. - * - * For this, the logical start block number in the key is - * the value calculated based on the logical start block - * number of the current extent record and the total number - * of blocks existing in the current extent record. - */ - if (is_xattr) { - xattr_key->startBlock = read_recStartBlock; - } else { - extents_key->startBlock = read_recStartBlock; - } - error = BTSearchRecord(extent_info->fcb, &iterator, &btdata, &reclen, &iterator); - if (error) { - if (error != btNotFound) { - printf ("hfs_split_extent: fileID=%u startBlock=%u BTSearchRecord error=%d\n", extent_info->fileID, read_recStartBlock, error); - goto out; - } - /* No matching record was found, so create a new extent record. - * Note: Since no record was found, we can't rely on the - * btree key in the iterator any longer. This will be initialized - * later before we insert the record. - */ - create_record = true; - } - - /* The extra extent entry from the previous record is being inserted - * as the first entry in the current extent record. This will change - * the file allocation block number (FABN) of the current extent - * record, which is the startBlock value from the extent record key. 
- * Since one extra entry is being inserted in the record, the new - * FABN for the record will less than old FABN by the number of blocks - * in the new extent entry being inserted at the start. We have to - * do this before we update read_recStartBlock to point at the - * startBlock of the following record. - */ - write_recStartBlock = read_recStartBlock - shift_extent.blockCount; - if (hfs_resize_debug) { - if (create_record) { - printf ("hfs_split_extent: No records found for startBlock=%u, will create new with startBlock=%u\n", read_recStartBlock, write_recStartBlock); - } - } - - /* Now update the read_recStartBlock to account for total number - * of blocks in this extent record. It will now point to the - * starting allocation block number for the next extent record. - */ - for (i = 0; i < kHFSPlusExtentDensity; i++) { - if (extents[i].blockCount == 0) { - break; - } - read_recStartBlock += extents[i].blockCount; - } - - if (create_record == true) { - /* Initialize new record content with only one extent entry */ - bzero(extents, sizeof(HFSPlusExtentRecord)); - /* The new record will contain only one extent entry */ - extents[0] = shift_extent; - /* There are no more overflow extents to be shifted */ - shift_extent.startBlock = shift_extent.blockCount = 0; - - if (is_xattr) { - /* BTSearchRecord above returned btNotFound, - * but since the attribute btree is never empty - * if we are trying to insert new overflow - * record for the xattrs, the extents_key will - * contain correct data. So we don't need to - * re-initialize it again like below. - */ - - /* Initialize the new xattr record */ - xattr_rec->recordType = kHFSPlusAttrExtents; - xattr_rec->overflowExtents.reserved = 0; - reclen = sizeof(HFSPlusAttrExtents); - } else { - /* BTSearchRecord above returned btNotFound, - * which means that extents_key content might - * not correspond to the record that we are - * trying to create, especially when the extents - * overflow btree is empty. 
So we reinitialize - * the extents_key again always. - */ - extents_key->keyLength = kHFSPlusExtentKeyMaximumLength; - extents_key->forkType = extent_info->forkType; - extents_key->fileID = extent_info->fileID; - - /* Initialize the new extent record */ - reclen = sizeof(HFSPlusExtentRecord); - } - } else { - /* The overflow extent entry from previous record will be - * the first entry in this extent record. If the last - * extent entry in this record is valid, it will be shifted - * into the following extent record as its first entry. So - * save the last entry before shifting entries in current - * record. - */ - last_extent = extents[kHFSPlusExtentDensity-1]; - - /* Shift all entries by one index towards the end */ - for (i = kHFSPlusExtentDensity-2; i >= 0; i--) { - extents[i+1] = extents[i]; - } - - /* Overflow extent entry saved from previous record - * is now the first entry in the current record. - */ - extents[0] = shift_extent; - - if (hfs_resize_debug) { - printf ("hfs_split_extent: Shift overflow=(%u,%u) to record with updated startBlock=%u\n", shift_extent.startBlock, shift_extent.blockCount, write_recStartBlock); - } - - /* The last entry from current record will be the - * overflow entry which will be the first entry for - * the following extent record. - */ - shift_extent = last_extent; - - /* Since the key->startBlock is being changed for this record, - * it should be deleted and inserted with the new key. - */ - error = BTDeleteRecord(extent_info->fcb, &iterator); - if (error) { - printf ("hfs_split_extent: fileID=%u startBlock=%u BTDeleteRecord error=%d\n", extent_info->fileID, read_recStartBlock, error); - goto out; - } - if (hfs_resize_debug) { - printf ("hfs_split_extent: Deleted extent record with startBlock=%u\n", (is_xattr ? 
xattr_key->startBlock : extents_key->startBlock)); - } - } - - /* Insert the newly created or modified extent record */ - bzero(&iterator.hint, sizeof(iterator.hint)); - if (is_xattr) { - xattr_key->startBlock = write_recStartBlock; - } else { - extents_key->startBlock = write_recStartBlock; - } - error = BTInsertRecord(extent_info->fcb, &iterator, &btdata, reclen); - if (error) { - printf ("hfs_split_extent: fileID=%u, startBlock=%u BTInsertRecord error=%d\n", extent_info->fileID, write_recStartBlock, error); - goto out; - } - if (hfs_resize_debug) { - printf ("hfs_split_extent: Inserted extent record with startBlock=%u\n", write_recStartBlock); - } - } - -out: - /* - * Extents overflow btree or attributes btree headers might have - * been modified during the split/shift operation, so flush the - * changes to the disk while we are inside journal transaction. - * We should only be able to generate I/O that modifies the B-Tree - * header nodes while we're in the middle of a journal transaction. - * Otherwise it might result in panic during unmount. - */ - BTFlushPath(extent_info->fcb); - - if (extents_rec) { - FREE (extents_rec, M_TEMP); - } - if (xattr_rec) { - FREE (xattr_rec, M_TEMP); - } - return error; -} - - -/* - * Relocate an extent if it lies beyond the expected end of volume. - * - * This function is called for every extent of the file being relocated. - * It allocates space for relocation, copies the data, deallocates - * the old extent, and update corresponding on-disk extent. If the function - * does not find contiguous space to relocate an extent, it splits the - * extent in smaller size to be able to relocate it out of the area of - * disk being reclaimed. As an optimization, if an extent lies partially - * in the area of the disk being reclaimed, it is split so that we only - * have to relocate the area that was overlapping with the area of disk - * being reclaimed. 
- * - * Note that every extent is relocated in its own transaction so that - * they do not overwhelm the journal. This function handles the extent - * record that exists in the catalog record, extent record from overflow - * extents btree, and extents for large EAs. - * - * Inputs: - * extent_info - This is the structure that contains state about - * the current file, extent, and extent record that - * is being relocated. This structure is shared - * among code that traverses through all the extents - * of the file, code that relocates extents, and - * code that splits the extent. - */ -static int -hfs_reclaim_extent(struct hfsmount *hfsmp, const u_long allocLimit, struct hfs_reclaim_extent_info *extent_info, vfs_context_t context) -{ - int error = 0; - int index; - struct cnode *cp; - u_int32_t oldStartBlock; - u_int32_t oldBlockCount; - u_int32_t newStartBlock; - u_int32_t newBlockCount; - u_int32_t roundedBlockCount; - uint16_t node_size; - uint32_t remainder_blocks; - u_int32_t alloc_flags; - int blocks_allocated = false; - - index = extent_info->extent_index; - cp = VTOC(extent_info->vp); - - oldStartBlock = extent_info->extents[index].startBlock; - oldBlockCount = extent_info->extents[index].blockCount; - - if (0 && hfs_resize_debug) { - printf ("hfs_reclaim_extent: Examine record:%u recStartBlock=%u, %u:(%u,%u)\n", extent_info->overflow_count, extent_info->recStartBlock, index, oldStartBlock, oldBlockCount); - } - - /* If the current extent lies completely within allocLimit, - * it does not require any relocation. - */ - if ((oldStartBlock + oldBlockCount) <= allocLimit) { - extent_info->cur_blockCount += oldBlockCount; - return error; - } - - /* Every extent should be relocated in its own transaction - * to make sure that we don't overflow the journal buffer. 
- */ - error = hfs_start_transaction(hfsmp); - if (error) { - return error; - } - extent_info->lockflags = hfs_systemfile_lock(hfsmp, extent_info->lockflags, HFS_EXCLUSIVE_LOCK); - - /* Check if the extent lies partially in the area to reclaim, - * i.e. it starts before allocLimit and ends beyond allocLimit. - * We have already skipped extents that lie completely within - * allocLimit in the check above, so we only check for the - * startBlock. If it lies partially, split it so that we - * only relocate part of the extent. - */ - if (oldStartBlock < allocLimit) { - newBlockCount = allocLimit - oldStartBlock; - - if (hfs_resize_debug) { - int idx = extent_info->extent_index; - printf ("hfs_reclaim_extent: Split straddling extent %u:(%u,%u) for %u blocks\n", idx, extent_info->extents[idx].startBlock, extent_info->extents[idx].blockCount, newBlockCount); - } - - /* If the extent belongs to a btree, check and trim - * it to be multiple of the node size. - */ - if (extent_info->is_sysfile) { - node_size = get_btree_nodesize(extent_info->vp); - /* If the btree node size is less than the block size, - * splitting this extent will not split a node across - * different extents. So we only check and trim if - * node size is more than the allocation block size. - */ - if (node_size > hfsmp->blockSize) { - remainder_blocks = newBlockCount % (node_size / hfsmp->blockSize); - if (remainder_blocks) { - newBlockCount -= remainder_blocks; - if (hfs_resize_debug) { - printf ("hfs_reclaim_extent: Round-down newBlockCount to be multiple of nodeSize, node_allocblks=%u, old=%u, new=%u\n", node_size/hfsmp->blockSize, newBlockCount + remainder_blocks, newBlockCount); - } - } - } - /* The newBlockCount is zero because of rounding-down so that - * btree nodes are not split across extents. Therefore this - * straddling extent across resize-boundary does not require - * splitting. Skip over to relocating of complete extent. 
- */ - if (newBlockCount == 0) { - if (hfs_resize_debug) { - printf ("hfs_reclaim_extent: After round-down newBlockCount=0, skip split, relocate full extent\n"); - } - goto relocate_full_extent; - } - } - - /* Split the extents into two parts --- the first extent lies - * completely within allocLimit and therefore does not require - * relocation. The second extent will require relocation which - * will be handled when the caller calls this function again - * for the next extent. - */ - error = hfs_split_extent(extent_info, newBlockCount); - if (error == 0) { - /* Split success, no relocation required */ - goto out; - } - /* Split failed, so try to relocate entire extent */ - if (hfs_resize_debug) { - int idx = extent_info->extent_index; - printf ("hfs_reclaim_extent: Split straddling extent %u:(%u,%u) for %u blocks failed, relocate full extent\n", idx, extent_info->extents[idx].startBlock, extent_info->extents[idx].blockCount, newBlockCount); - } - } - -relocate_full_extent: - /* At this point, the current extent requires relocation. - * We will try to allocate space equal to the size of the extent - * being relocated first to try to relocate it without splitting. - * If the allocation fails, we will try to allocate contiguous - * blocks out of metadata zone. If that allocation also fails, - * then we will take a whatever contiguous block run is returned - * by the allocation, split the extent into two parts, and then - * relocate the first splitted extent. 
- */ - alloc_flags = HFS_ALLOC_FORCECONTIG | HFS_ALLOC_SKIPFREEBLKS; - if (extent_info->is_sysfile) { - alloc_flags |= HFS_ALLOC_METAZONE; - } - - error = BlockAllocate(hfsmp, 1, oldBlockCount, oldBlockCount, alloc_flags, - &newStartBlock, &newBlockCount); - if ((extent_info->is_sysfile == false) && - ((error == dskFulErr) || (error == ENOSPC))) { - /* For non-system files, try reallocating space in metadata zone */ - alloc_flags |= HFS_ALLOC_METAZONE; - error = BlockAllocate(hfsmp, 1, oldBlockCount, oldBlockCount, - alloc_flags, &newStartBlock, &newBlockCount); - } - if ((error == dskFulErr) || (error == ENOSPC)) { - /* - * We did not find desired contiguous space for this - * extent, when we asked for it, including the metazone allocations. - * At this point we are not worrying about getting contiguity anymore. - * - * HOWEVER, if we now allow blocks to be used which were recently - * de-allocated, we may find a contiguous range (though this seems - * unlikely). As a result, assume that we will have to split the - * current extent into two pieces, but if we are able to satisfy - * the request with a single extent, detect that as well. - */ - alloc_flags &= ~HFS_ALLOC_FORCECONTIG; - alloc_flags |= HFS_ALLOC_FLUSHTXN; - - error = BlockAllocate(hfsmp, 1, oldBlockCount, oldBlockCount, - alloc_flags, &newStartBlock, &newBlockCount); - if (error) { - printf ("hfs_reclaim_extent: fileID=%u start=%u, %u:(%u,%u) BlockAllocate error=%d\n", extent_info->fileID, extent_info->recStartBlock, index, oldStartBlock, oldBlockCount, error); - goto out; - } - - /* - * Allowing recently deleted extents may now allow us to find - * a single contiguous extent in the amount & size desired. If so, - * do NOT split this extent into two pieces. This is technically a - * check for "< oldBlockCount", but we use != to highlight the point - * that the special case is when they're equal. The allocator should - * never vend back more blocks than were requested. 
- */ - if (newBlockCount != oldBlockCount) { - blocks_allocated = true; - - /* The number of blocks allocated is less than the requested - * number of blocks. For btree extents, check and trim the - * extent to be multiple of the node size. - */ - if (extent_info->is_sysfile) { - node_size = get_btree_nodesize(extent_info->vp); - if (node_size > hfsmp->blockSize) { - remainder_blocks = newBlockCount % (node_size / hfsmp->blockSize); - if (remainder_blocks) { - roundedBlockCount = newBlockCount - remainder_blocks; - /* Free tail-end blocks of the newly allocated extent */ - BlockDeallocate(hfsmp, newStartBlock + roundedBlockCount, - newBlockCount - roundedBlockCount, - HFS_ALLOC_SKIPFREEBLKS); - newBlockCount = roundedBlockCount; - if (hfs_resize_debug) { - printf ("hfs_reclaim_extent: Fixing extent block count, node_blks=%u, old=%u, new=%u\n", node_size/hfsmp->blockSize, newBlockCount + remainder_blocks, newBlockCount); - } - if (newBlockCount == 0) { - printf ("hfs_reclaim_extent: Not enough contiguous blocks available to relocate fileID=%d\n", extent_info->fileID); - error = ENOSPC; - goto out; - } - } - } - } - - /* The number of blocks allocated is less than the number of - * blocks requested, so split this extent --- the first extent - * will be relocated as part of this function call and the caller - * will handle relocating the second extent by calling this - * function again for the second extent. 
- */ - error = hfs_split_extent(extent_info, newBlockCount); - if (error) { - printf ("hfs_reclaim_extent: fileID=%u start=%u, %u:(%u,%u) split error=%d\n", extent_info->fileID, extent_info->recStartBlock, index, oldStartBlock, oldBlockCount, error); - goto out; - } - oldBlockCount = newBlockCount; - } /* end oldBlockCount != newBlockCount */ - } /* end allocation request for any available free space */ - - if (error) { - printf ("hfs_reclaim_extent: fileID=%u start=%u, %u:(%u,%u) contig BlockAllocate error=%d\n", extent_info->fileID, extent_info->recStartBlock, index, oldStartBlock, oldBlockCount, error); - goto out; - } - blocks_allocated = true; - - /* Copy data from old location to new location */ - error = hfs_copy_extent(hfsmp, extent_info->vp, oldStartBlock, - newStartBlock, newBlockCount, context); - if (error) { - printf ("hfs_reclaim_extent: fileID=%u start=%u, %u:(%u,%u)=>(%u,%u) hfs_copy_extent error=%d\n", extent_info->fileID, extent_info->recStartBlock, index, oldStartBlock, oldBlockCount, newStartBlock, newBlockCount, error); - goto out; - } - - /* Update the extent record with the new start block information */ - extent_info->extents[index].startBlock = newStartBlock; - - /* Sync the content back to the disk */ - if (extent_info->catalog_fp) { - /* Update the extents in catalog record */ - if (extent_info->is_dirlink) { - error = cat_update_dirlink(hfsmp, extent_info->forkType, - extent_info->dirlink_desc, extent_info->dirlink_attr, - &(extent_info->dirlink_fork->ff_data)); - } else { - cp->c_flag |= C_MODIFIED; - /* If this is a system file, sync volume headers on disk */ - if (extent_info->is_sysfile) { - error = hfs_flushvolumeheader(hfsmp, HFS_FVH_WAIT | HFS_FVH_WRITE_ALT); - } - } - } else { - /* Replace record for extents overflow or extents-based xattrs */ - error = BTReplaceRecord(extent_info->fcb, extent_info->iterator, - &(extent_info->btdata), extent_info->recordlen); - } - if (error) { - printf ("hfs_reclaim_extent: fileID=%u, update 
record error=%u\n", extent_info->fileID, error); - goto out; - } - - /* Deallocate the old extent */ - error = BlockDeallocate(hfsmp, oldStartBlock, oldBlockCount, HFS_ALLOC_SKIPFREEBLKS); - if (error) { - printf ("hfs_reclaim_extent: fileID=%u start=%u, %u:(%u,%u) BlockDeallocate error=%d\n", extent_info->fileID, extent_info->recStartBlock, index, oldStartBlock, oldBlockCount, error); - goto out; - } - extent_info->blocks_relocated += newBlockCount; - - if (hfs_resize_debug) { - printf ("hfs_reclaim_extent: Relocated record:%u %u:(%u,%u) to (%u,%u)\n", extent_info->overflow_count, index, oldStartBlock, oldBlockCount, newStartBlock, newBlockCount); - } - -out: - if (error != 0) { - if (blocks_allocated == true) { - BlockDeallocate(hfsmp, newStartBlock, newBlockCount, HFS_ALLOC_SKIPFREEBLKS); - } - } else { - /* On success, increment the total allocation blocks processed */ - extent_info->cur_blockCount += newBlockCount; - } - - hfs_systemfile_unlock(hfsmp, extent_info->lockflags); - - /* For a non-system file, if an extent entry from catalog record - * was modified, sync the in-memory changes to the catalog record - * on disk before ending the transaction. - */ - if ((extent_info->catalog_fp) && - (extent_info->is_sysfile == false)) { - hfs_update(extent_info->vp, 0); - } - - hfs_end_transaction(hfsmp); - - return error; -} - -/* Report intermediate progress during volume resize */ -static void -hfs_truncatefs_progress(struct hfsmount *hfsmp) -{ - u_int32_t cur_progress = 0; - - hfs_resize_progress(hfsmp, &cur_progress); - if (cur_progress > (hfsmp->hfs_resize_progress + 9)) { - printf("hfs_truncatefs: %d%% done...\n", cur_progress); - hfsmp->hfs_resize_progress = cur_progress; - } - return; -} - -/* - * Reclaim space at the end of a volume for given file and forktype. - * - * This routine attempts to move any extent which contains allocation blocks - * at or after "allocLimit." A separate transaction is used for every extent - * that needs to be moved. 
If there is not contiguous space available for - * moving an extent, it can be split into smaller extents. The contents of - * any moved extents are read and written via the volume's device vnode -- - * NOT via "vp." During the move, moved blocks which are part of a transaction - * have their physical block numbers invalidated so they will eventually be - * written to their new locations. - * - * This function is also called for directory hard links. Directory hard links - * are regular files with no data fork and resource fork that contains alias - * information for backward compatibility with pre-Leopard systems. However - * non-Mac OS X implementation can add/modify data fork or resource fork - * information to directory hard links, so we check, and if required, relocate - * both data fork and resource fork. - * - * Inputs: - * hfsmp The volume being resized. - * vp The vnode for the system file. - * fileID ID of the catalog record that needs to be relocated - * forktype The type of fork that needs relocated, - * kHFSResourceForkType for resource fork, - * kHFSDataForkType for data fork - * allocLimit Allocation limit for the new volume size, - * do not use this block or beyond. All extents - * that use this block or any blocks beyond this limit - * will be relocated. - * - * Side Effects: - * hfsmp->hfs_resize_blocksmoved is incremented by the number of allocation - * blocks that were relocated. - */ -static int -hfs_reclaim_file(struct hfsmount *hfsmp, struct vnode *vp, u_int32_t fileID, - u_int8_t forktype, u_long allocLimit, vfs_context_t context) -{ - int error = 0; - struct hfs_reclaim_extent_info *extent_info; - int i; - int lockflags = 0; - struct cnode *cp; - struct filefork *fp; - int took_truncate_lock = false; - int release_desc = false; - HFSPlusExtentKey *key; - - /* If there is no vnode for this file, then there's nothing to do. 
*/ - if (vp == NULL) { - return 0; - } - - cp = VTOC(vp); - - if (hfs_resize_debug) { - const char *filename = (const char *) cp->c_desc.cd_nameptr; - int namelen = cp->c_desc.cd_namelen; - - if (filename == NULL) { - filename = ""; - namelen = 0; - } - printf("hfs_reclaim_file: reclaiming '%.*s'\n", namelen, filename); - } - - MALLOC(extent_info, struct hfs_reclaim_extent_info *, - sizeof(struct hfs_reclaim_extent_info), M_TEMP, M_WAITOK); - if (extent_info == NULL) { - return ENOMEM; - } - bzero(extent_info, sizeof(struct hfs_reclaim_extent_info)); - extent_info->vp = vp; - extent_info->fileID = fileID; - extent_info->forkType = forktype; - extent_info->is_sysfile = vnode_issystem(vp); - if (vnode_isdir(vp) && (cp->c_flag & C_HARDLINK)) { - extent_info->is_dirlink = true; - } - /* We always need allocation bitmap and extent btree lock */ - lockflags = SFL_BITMAP | SFL_EXTENTS; - if ((fileID == kHFSCatalogFileID) || (extent_info->is_dirlink == true)) { - lockflags |= SFL_CATALOG; - } else if (fileID == kHFSAttributesFileID) { - lockflags |= SFL_ATTRIBUTE; - } else if (fileID == kHFSStartupFileID) { - lockflags |= SFL_STARTUP; - } - extent_info->lockflags = lockflags; - extent_info->fcb = VTOF(hfsmp->hfs_extents_vp); - - /* Flush data associated with current file on disk. - * - * If the current vnode is directory hard link, no flushing of - * journal or vnode is required. The current kernel does not - * modify data/resource fork of directory hard links, so nothing - * will be in the cache. If a directory hard link is newly created, - * the resource fork data is written directly using devvp and - * the code that actually relocates data (hfs_copy_extent()) also - * uses devvp for its I/O --- so they will see a consistent copy. - */ - if (extent_info->is_sysfile) { - /* If the current vnode is system vnode, flush journal - * to make sure that all data is written to the disk. 
- */ - error = hfs_flush(hfsmp, HFS_FLUSH_JOURNAL_META); - if (error) { - printf ("hfs_reclaim_file: journal_flush returned %d\n", error); - goto out; - } - } else if (extent_info->is_dirlink == false) { - /* Flush all blocks associated with this regular file vnode. - * Normally there should not be buffer cache blocks for regular - * files, but for objects like symlinks, we can have buffer cache - * blocks associated with the vnode. Therefore we call - * buf_flushdirtyblks() also. - */ - buf_flushdirtyblks(vp, 0, BUF_SKIP_LOCKED, "hfs_reclaim_file"); - - hfs_unlock(cp); - hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); - took_truncate_lock = true; - (void) cluster_push(vp, 0); - error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); - if (error) { - goto out; - } - - /* If the file no longer exists, nothing left to do */ - if (cp->c_flag & C_NOEXISTS) { - error = 0; - goto out; - } - - /* Wait for any in-progress writes to this vnode to complete, so that we'll - * be copying consistent bits. (Otherwise, it's possible that an async - * write will complete to the old extent after we read from it. That - * could lead to corruption.) - */ - error = vnode_waitforwrites(vp, 0, 0, 0, "hfs_reclaim_file"); - if (error) { - goto out; - } - } - - if (hfs_resize_debug) { - printf("hfs_reclaim_file: === Start reclaiming %sfork for %sid=%u ===\n", (forktype ? "rsrc" : "data"), (extent_info->is_dirlink ? 
"dirlink" : "file"), fileID); - } - - if (extent_info->is_dirlink) { - MALLOC(extent_info->dirlink_desc, struct cat_desc *, - sizeof(struct cat_desc), M_TEMP, M_WAITOK); - MALLOC(extent_info->dirlink_attr, struct cat_attr *, - sizeof(struct cat_attr), M_TEMP, M_WAITOK); - MALLOC(extent_info->dirlink_fork, struct filefork *, - sizeof(struct filefork), M_TEMP, M_WAITOK); - if ((extent_info->dirlink_desc == NULL) || - (extent_info->dirlink_attr == NULL) || - (extent_info->dirlink_fork == NULL)) { - error = ENOMEM; - goto out; - } - - /* Lookup catalog record for directory hard link and - * create a fake filefork for the value looked up from - * the disk. - */ - fp = extent_info->dirlink_fork; - bzero(extent_info->dirlink_fork, sizeof(struct filefork)); - extent_info->dirlink_fork->ff_cp = cp; - lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); - error = cat_lookup_dirlink(hfsmp, fileID, forktype, - extent_info->dirlink_desc, extent_info->dirlink_attr, - &(extent_info->dirlink_fork->ff_data)); - hfs_systemfile_unlock(hfsmp, lockflags); - if (error) { - printf ("hfs_reclaim_file: cat_lookup_dirlink for fileID=%u returned error=%u\n", fileID, error); - goto out; - } - release_desc = true; - } else { - fp = VTOF(vp); - } - - extent_info->catalog_fp = fp; - extent_info->recStartBlock = 0; - extent_info->extents = extent_info->catalog_fp->ff_extents; - /* Relocate extents from the catalog record */ - for (i = 0; i < kHFSPlusExtentDensity; ++i) { - if (fp->ff_extents[i].blockCount == 0) { - break; - } - extent_info->extent_index = i; - error = hfs_reclaim_extent(hfsmp, allocLimit, extent_info, context); - if (error) { - printf ("hfs_reclaim_file: fileID=%u #%d %u:(%u,%u) hfs_reclaim_extent error=%d\n", fileID, extent_info->overflow_count, i, fp->ff_extents[i].startBlock, fp->ff_extents[i].blockCount, error); - goto out; - } - } - - /* If the number of allocation blocks processed for reclaiming - * are less than total number of blocks for the file, 
continuing - * working on overflow extents record. - */ - if (fp->ff_blocks <= extent_info->cur_blockCount) { - if (0 && hfs_resize_debug) { - printf ("hfs_reclaim_file: Nothing more to relocate, offset=%d, ff_blocks=%u, cur_blockCount=%u\n", i, fp->ff_blocks, extent_info->cur_blockCount); - } - goto out; - } - - if (hfs_resize_debug) { - printf ("hfs_reclaim_file: Will check overflow records, offset=%d, ff_blocks=%u, cur_blockCount=%u\n", i, fp->ff_blocks, extent_info->cur_blockCount); - } - - MALLOC(extent_info->iterator, struct BTreeIterator *, sizeof(struct BTreeIterator), M_TEMP, M_WAITOK); - if (extent_info->iterator == NULL) { - error = ENOMEM; - goto out; - } - bzero(extent_info->iterator, sizeof(struct BTreeIterator)); - key = (HFSPlusExtentKey *) &(extent_info->iterator->key); - key->keyLength = kHFSPlusExtentKeyMaximumLength; - key->forkType = forktype; - key->fileID = fileID; - key->startBlock = extent_info->cur_blockCount; - - extent_info->btdata.bufferAddress = extent_info->record.overflow; - extent_info->btdata.itemSize = sizeof(HFSPlusExtentRecord); - extent_info->btdata.itemCount = 1; - - extent_info->catalog_fp = NULL; - - /* Search the first overflow extent with expected startBlock as 'cur_blockCount' */ - lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); - error = BTSearchRecord(extent_info->fcb, extent_info->iterator, - &(extent_info->btdata), &(extent_info->recordlen), - extent_info->iterator); - hfs_systemfile_unlock(hfsmp, lockflags); - while (error == 0) { - extent_info->overflow_count++; - extent_info->recStartBlock = key->startBlock; - extent_info->extents = extent_info->record.overflow; - for (i = 0; i < kHFSPlusExtentDensity; i++) { - if (extent_info->record.overflow[i].blockCount == 0) { - goto out; - } - extent_info->extent_index = i; - error = hfs_reclaim_extent(hfsmp, allocLimit, extent_info, context); - if (error) { - printf ("hfs_reclaim_file: fileID=%u #%d %u:(%u,%u) hfs_reclaim_extent error=%d\n", fileID, 
extent_info->overflow_count, i, extent_info->record.overflow[i].startBlock, extent_info->record.overflow[i].blockCount, error); - goto out; - } - } - - /* Look for more overflow records */ - lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); - error = BTIterateRecord(extent_info->fcb, kBTreeNextRecord, - extent_info->iterator, &(extent_info->btdata), - &(extent_info->recordlen)); - hfs_systemfile_unlock(hfsmp, lockflags); - if (error) { - break; - } - /* Stop when we encounter a different file or fork. */ - if ((key->fileID != fileID) || (key->forkType != forktype)) { - break; - } - } - if (error == fsBTRecordNotFoundErr || error == fsBTEndOfIterationErr) { - error = 0; - } - -out: - /* If any blocks were relocated, account them and report progress */ - if (extent_info->blocks_relocated) { - hfsmp->hfs_resize_blocksmoved += extent_info->blocks_relocated; - hfs_truncatefs_progress(hfsmp); - if (fileID < kHFSFirstUserCatalogNodeID) { - printf ("hfs_reclaim_file: Relocated %u blocks from fileID=%u on \"%s\"\n", - extent_info->blocks_relocated, fileID, hfsmp->vcbVN); - } - } - if (extent_info->iterator) { - FREE(extent_info->iterator, M_TEMP); - } - if (release_desc == true) { - cat_releasedesc(extent_info->dirlink_desc); - } - if (extent_info->dirlink_desc) { - FREE(extent_info->dirlink_desc, M_TEMP); - } - if (extent_info->dirlink_attr) { - FREE(extent_info->dirlink_attr, M_TEMP); - } - if (extent_info->dirlink_fork) { - FREE(extent_info->dirlink_fork, M_TEMP); - } - if ((extent_info->blocks_relocated != 0) && (extent_info->is_sysfile == false)) { - hfs_update(vp, 0); - } - if (took_truncate_lock) { - hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); - } - if (extent_info) { - FREE(extent_info, M_TEMP); - } - if (hfs_resize_debug) { - printf("hfs_reclaim_file: === Finished relocating %sfork for fileid=%u (error=%d) ===\n", (forktype ? 
"rsrc" : "data"), fileID, error); - } - - return error; -} - - -/* - * This journal_relocate callback updates the journal info block to point - * at the new journal location. This write must NOT be done using the - * transaction. We must write the block immediately. We must also force - * it to get to the media so that the new journal location will be seen by - * the replay code before we can safely let journaled blocks be written - * to their normal locations. - * - * The tests for journal_uses_fua below are mildly hacky. Since the journal - * and the file system are both on the same device, I'm leveraging what - * the journal has decided about FUA. - */ -struct hfs_journal_relocate_args { - struct hfsmount *hfsmp; - vfs_context_t context; - u_int32_t newStartBlock; - u_int32_t newBlockCount; -}; - -static errno_t -hfs_journal_relocate_callback(void *_args) -{ - int error; - struct hfs_journal_relocate_args *args = _args; - struct hfsmount *hfsmp = args->hfsmp; - buf_t bp; - JournalInfoBlock *jibp; - - error = buf_meta_bread(hfsmp->hfs_devvp, - (uint64_t)hfsmp->vcbJinfoBlock * (hfsmp->blockSize/hfsmp->hfs_logical_block_size), - hfsmp->blockSize, vfs_context_ucred(args->context), &bp); - if (error) { - printf("hfs_journal_relocate_callback: failed to read JIB (%d)\n", error); - if (bp) { - buf_brelse(bp); - } - return error; - } - jibp = (JournalInfoBlock*) buf_dataptr(bp); - jibp->offset = SWAP_BE64((u_int64_t)args->newStartBlock * hfsmp->blockSize); - jibp->size = SWAP_BE64((u_int64_t)args->newBlockCount * hfsmp->blockSize); - if (journal_uses_fua(hfsmp->jnl)) - buf_markfua(bp); - error = buf_bwrite(bp); - if (error) { - printf("hfs_journal_relocate_callback: failed to write JIB (%d)\n", error); - return error; - } - if (!journal_uses_fua(hfsmp->jnl)) { - error = hfs_flush(hfsmp, HFS_FLUSH_CACHE); - if (error) { - printf("hfs_journal_relocate_callback: hfs_flush failed (%d)\n", error); - error = 0; /* Don't fail the operation. 
*/ - } - } - - return error; -} - - -/* Type of resize operation in progress */ -#define HFS_RESIZE_TRUNCATE 1 -#define HFS_RESIZE_EXTEND 2 - -/* - * Core function to relocate the journal file. This function takes the - * journal size of the newly relocated journal --- the caller can - * provide a new journal size if they want to change the size of - * the journal. The function takes care of updating the journal info - * block and all other data structures correctly. - * - * Note: This function starts a transaction and grabs the btree locks. - */ -static int -hfs_relocate_journal_file(struct hfsmount *hfsmp, u_int32_t jnl_size, int resize_type, vfs_context_t context) -{ - int error; - int journal_err; - int lockflags; - u_int32_t oldStartBlock; - u_int32_t newStartBlock; - u_int32_t oldBlockCount; - u_int32_t newBlockCount; - u_int32_t jnlBlockCount; - u_int32_t alloc_skipfreeblks; - struct cat_desc journal_desc; - struct cat_attr journal_attr; - struct cat_fork journal_fork; - struct hfs_journal_relocate_args callback_args; - - /* Calculate the number of allocation blocks required for the journal */ - jnlBlockCount = howmany(jnl_size, hfsmp->blockSize); - - /* - * During truncatefs(), the volume free block count is updated - * before relocating data and reflects the total number of free - * blocks that will exist on volume after the resize is successful. - * This means that the allocation blocks required for relocation - * have already been reserved and accounted for in the free block - * count. Therefore, block allocation and deallocation routines - * can skip the free block check by passing HFS_ALLOC_SKIPFREEBLKS - * flag. - * - * This special handling is not required when the file system - * is being extended as we want all the allocated and deallocated - * blocks to be accounted for correctly. 
- */ - if (resize_type == HFS_RESIZE_TRUNCATE) { - alloc_skipfreeblks = HFS_ALLOC_SKIPFREEBLKS; - } else { - alloc_skipfreeblks = 0; - } - - error = hfs_start_transaction(hfsmp); - if (error) { - printf("hfs_relocate_journal_file: hfs_start_transaction returned %d\n", error); - return error; - } - lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG | SFL_BITMAP, HFS_EXCLUSIVE_LOCK); - - error = BlockAllocate(hfsmp, 1, jnlBlockCount, jnlBlockCount, - HFS_ALLOC_METAZONE | HFS_ALLOC_FORCECONTIG | HFS_ALLOC_FLUSHTXN | alloc_skipfreeblks, - &newStartBlock, &newBlockCount); - if (error) { - printf("hfs_relocate_journal_file: BlockAllocate returned %d\n", error); - goto fail; - } - if (newBlockCount != jnlBlockCount) { - printf("hfs_relocate_journal_file: newBlockCount != jnlBlockCount (%u, %u)\n", newBlockCount, jnlBlockCount); - goto free_fail; - } - - error = cat_idlookup(hfsmp, hfsmp->hfs_jnlfileid, 1, 0, &journal_desc, &journal_attr, &journal_fork); - if (error) { - printf("hfs_relocate_journal_file: cat_idlookup returned %d\n", error); - goto free_fail; - } - - oldStartBlock = journal_fork.cf_extents[0].startBlock; - oldBlockCount = journal_fork.cf_extents[0].blockCount; - error = BlockDeallocate(hfsmp, oldStartBlock, oldBlockCount, alloc_skipfreeblks); - if (error) { - printf("hfs_relocate_journal_file: BlockDeallocate returned %d\n", error); - goto free_fail; - } - - /* Update the catalog record for .journal */ - journal_fork.cf_size = hfs_blk_to_bytes(newBlockCount, hfsmp->blockSize); - journal_fork.cf_extents[0].startBlock = newStartBlock; - journal_fork.cf_extents[0].blockCount = newBlockCount; - journal_fork.cf_blocks = newBlockCount; - error = cat_update(hfsmp, &journal_desc, &journal_attr, &journal_fork, NULL); - cat_releasedesc(&journal_desc); /* all done with cat descriptor */ - if (error) { - printf("hfs_relocate_journal_file: cat_update returned %d\n", error); - goto free_fail; - } - - /* - * If the journal is part of the file system, then tell the 
journal - * code about the new location. If the journal is on an external - * device, then just keep using it as-is. - */ - if (hfsmp->jvp == hfsmp->hfs_devvp) { - callback_args.hfsmp = hfsmp; - callback_args.context = context; - callback_args.newStartBlock = newStartBlock; - callback_args.newBlockCount = newBlockCount; - - error = journal_relocate(hfsmp->jnl, (off_t)newStartBlock*hfsmp->blockSize, - (off_t)newBlockCount*hfsmp->blockSize, 0, - hfs_journal_relocate_callback, &callback_args); - if (error) { - /* NOTE: journal_relocate will mark the journal invalid. */ - printf("hfs_relocate_journal_file: journal_relocate returned %d\n", error); - goto fail; - } - if (hfs_resize_debug) { - printf ("hfs_relocate_journal_file: Successfully relocated journal from (%u,%u) to (%u,%u)\n", oldStartBlock, oldBlockCount, newStartBlock, newBlockCount); - } - hfsmp->jnl_start = newStartBlock; - hfsmp->jnl_size = (off_t)newBlockCount * hfsmp->blockSize; - } - - hfs_systemfile_unlock(hfsmp, lockflags); - error = hfs_end_transaction(hfsmp); - if (error) { - printf("hfs_relocate_journal_file: hfs_end_transaction returned %d\n", error); - } - - return error; - -free_fail: - journal_err = BlockDeallocate(hfsmp, newStartBlock, newBlockCount, HFS_ALLOC_SKIPFREEBLKS); - if (journal_err) { - printf("hfs_relocate_journal_file: BlockDeallocate returned %d\n", error); - hfs_mark_inconsistent(hfsmp, HFS_ROLLBACK_FAILED); - } -fail: - hfs_systemfile_unlock(hfsmp, lockflags); - (void) hfs_end_transaction(hfsmp); - if (hfs_resize_debug) { - printf ("hfs_relocate_journal_file: Error relocating journal file (error=%d)\n", error); - } - return error; -} - - -/* - * Relocate the journal file when the file system is being truncated. - * We do not down-size the journal when the file system size is - * reduced, so we always provide the current journal size to the - * relocate code. 
- */ -static int -hfs_reclaim_journal_file(struct hfsmount *hfsmp, u_int32_t allocLimit, vfs_context_t context) -{ - int error = 0; - u_int32_t startBlock; - u_int32_t blockCount = hfsmp->jnl_size / hfsmp->blockSize; - - /* - * Figure out the location of the .journal file. When the journal - * is on an external device, we need to look up the .journal file. - */ - if (hfsmp->jvp == hfsmp->hfs_devvp) { - startBlock = hfsmp->jnl_start; - blockCount = hfsmp->jnl_size / hfsmp->blockSize; - } else { - u_int32_t fileid; - u_int32_t old_jnlfileid; - struct cat_attr attr; - struct cat_fork fork; - - /* - * The cat_lookup inside GetFileInfo will fail because hfs_jnlfileid - * is set, and it is trying to hide the .journal file. So temporarily - * unset the field while calling GetFileInfo. - */ - old_jnlfileid = hfsmp->hfs_jnlfileid; - hfsmp->hfs_jnlfileid = 0; - fileid = GetFileInfo(hfsmp, kHFSRootFolderID, ".journal", &attr, &fork); - hfsmp->hfs_jnlfileid = old_jnlfileid; - if (fileid != old_jnlfileid) { - printf("hfs_reclaim_journal_file: cannot find .journal file!\n"); - return EIO; - } - - startBlock = fork.cf_extents[0].startBlock; - blockCount = fork.cf_extents[0].blockCount; - } - - if (startBlock + blockCount <= allocLimit) { - /* The journal file does not require relocation */ - return 0; - } - - error = hfs_relocate_journal_file(hfsmp, hfs_blk_to_bytes(blockCount, hfsmp->blockSize), - HFS_RESIZE_TRUNCATE, context); - if (error == 0) { - hfsmp->hfs_resize_blocksmoved += blockCount; - hfs_truncatefs_progress(hfsmp); - printf ("hfs_reclaim_journal_file: Relocated %u blocks from journal on \"%s\"\n", - blockCount, hfsmp->vcbVN); - } - - return error; -} - - -/* - * Move the journal info block to a new location. We have to make sure the - * new copy of the journal info block gets to the media first, then change - * the field in the volume header and the catalog record. 
- */ -static int -hfs_reclaim_journal_info_block(struct hfsmount *hfsmp, u_int32_t allocLimit, vfs_context_t context) -{ - int error; - int journal_err; - int lockflags; - u_int32_t oldBlock; - u_int32_t newBlock; - u_int32_t blockCount; - struct cat_desc jib_desc; - struct cat_attr jib_attr; - struct cat_fork jib_fork; - buf_t old_bp, new_bp; - - if (hfsmp->vcbJinfoBlock <= allocLimit) { - /* The journal info block does not require relocation */ - return 0; - } - - error = hfs_start_transaction(hfsmp); - if (error) { - printf("hfs_reclaim_journal_info_block: hfs_start_transaction returned %d\n", error); - return error; - } - lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG | SFL_BITMAP, HFS_EXCLUSIVE_LOCK); - - error = BlockAllocate(hfsmp, 1, 1, 1, - HFS_ALLOC_METAZONE | HFS_ALLOC_FORCECONTIG | HFS_ALLOC_SKIPFREEBLKS | HFS_ALLOC_FLUSHTXN, - &newBlock, &blockCount); - if (error) { - printf("hfs_reclaim_journal_info_block: BlockAllocate returned %d\n", error); - goto fail; - } - if (blockCount != 1) { - printf("hfs_reclaim_journal_info_block: blockCount != 1 (%u)\n", blockCount); - goto free_fail; - } - - /* Copy the old journal info block content to the new location */ - error = buf_meta_bread(hfsmp->hfs_devvp, - (uint64_t)hfsmp->vcbJinfoBlock * (hfsmp->blockSize/hfsmp->hfs_logical_block_size), - hfsmp->blockSize, vfs_context_ucred(context), &old_bp); - if (error) { - printf("hfs_reclaim_journal_info_block: failed to read JIB (%d)\n", error); - if (old_bp) { - buf_brelse(old_bp); - } - goto free_fail; - } - new_bp = buf_getblk(hfsmp->hfs_devvp, - (uint64_t)newBlock * (hfsmp->blockSize/hfsmp->hfs_logical_block_size), - hfsmp->blockSize, 0, 0, BLK_META); - bcopy((char*)buf_dataptr(old_bp), (char*)buf_dataptr(new_bp), hfsmp->blockSize); - buf_brelse(old_bp); - if (journal_uses_fua(hfsmp->jnl)) - buf_markfua(new_bp); - error = buf_bwrite(new_bp); - if (error) { - printf("hfs_reclaim_journal_info_block: failed to write new JIB (%d)\n", error); - goto free_fail; - } - 
if (!journal_uses_fua(hfsmp->jnl)) { - error = hfs_flush(hfsmp, HFS_FLUSH_CACHE); - if (error) { - printf("hfs_reclaim_journal_info_block: hfs_flush failed (%d)\n", error); - /* Don't fail the operation. */ - } - } - - /* Deallocate the old block once the new one has the new valid content */ - error = BlockDeallocate(hfsmp, hfsmp->vcbJinfoBlock, 1, HFS_ALLOC_SKIPFREEBLKS); - if (error) { - printf("hfs_reclaim_journal_info_block: BlockDeallocate returned %d\n", error); - goto free_fail; - } - - - /* Update the catalog record for .journal_info_block */ - error = cat_idlookup(hfsmp, hfsmp->hfs_jnlinfoblkid, 1, 0, &jib_desc, &jib_attr, &jib_fork); - if (error) { - printf("hfs_reclaim_journal_info_block: cat_idlookup returned %d\n", error); - goto fail; - } - oldBlock = jib_fork.cf_extents[0].startBlock; - jib_fork.cf_size = hfsmp->blockSize; - jib_fork.cf_extents[0].startBlock = newBlock; - jib_fork.cf_extents[0].blockCount = 1; - jib_fork.cf_blocks = 1; - error = cat_update(hfsmp, &jib_desc, &jib_attr, &jib_fork, NULL); - cat_releasedesc(&jib_desc); /* all done with cat descriptor */ - if (error) { - printf("hfs_reclaim_journal_info_block: cat_update returned %d\n", error); - goto fail; - } - - /* Update the pointer to the journal info block in the volume header. 
*/ - hfsmp->vcbJinfoBlock = newBlock; - error = hfs_flushvolumeheader(hfsmp, HFS_FVH_WAIT | HFS_FVH_WRITE_ALT); - if (error) { - printf("hfs_reclaim_journal_info_block: hfs_flushvolumeheader returned %d\n", error); - goto fail; - } - hfs_systemfile_unlock(hfsmp, lockflags); - error = hfs_end_transaction(hfsmp); - if (error) { - printf("hfs_reclaim_journal_info_block: hfs_end_transaction returned %d\n", error); - } - error = hfs_flush(hfsmp, HFS_FLUSH_JOURNAL); - if (error) { - printf("hfs_reclaim_journal_info_block: journal_flush returned %d\n", error); - } - - /* Account for the block relocated and print progress */ - hfsmp->hfs_resize_blocksmoved += 1; - hfs_truncatefs_progress(hfsmp); - if (!error) { - printf ("hfs_reclaim_journal_info: Relocated 1 block from journal info on \"%s\"\n", - hfsmp->vcbVN); - if (hfs_resize_debug) { - printf ("hfs_reclaim_journal_info_block: Successfully relocated journal info block from (%u,%u) to (%u,%u)\n", oldBlock, blockCount, newBlock, blockCount); - } - } - return error; - -free_fail: - journal_err = BlockDeallocate(hfsmp, newBlock, blockCount, HFS_ALLOC_SKIPFREEBLKS); - if (journal_err) { - printf("hfs_reclaim_journal_info_block: BlockDeallocate returned %d\n", error); - hfs_mark_inconsistent(hfsmp, HFS_ROLLBACK_FAILED); - } - -fail: - hfs_systemfile_unlock(hfsmp, lockflags); - (void) hfs_end_transaction(hfsmp); - if (hfs_resize_debug) { - printf ("hfs_reclaim_journal_info_block: Error relocating journal info block (error=%d)\n", error); - } - return error; -} - - -static u_int64_t -calculate_journal_size(struct hfsmount *hfsmp, u_int32_t sector_size, u_int64_t sector_count) -{ - u_int64_t journal_size; - u_int32_t journal_scale; - -#define DEFAULT_JOURNAL_SIZE (8*1024*1024) -#define MAX_JOURNAL_SIZE (512*1024*1024) - - /* Calculate the journal size for this volume. We want - * at least 8 MB of journal for each 100 GB of disk space. 
- * We cap the size at 512 MB, unless the allocation block - * size is larger, in which case, we use one allocation - * block. - */ - journal_scale = (sector_size * sector_count) / ((u_int64_t)100 * 1024 * 1024 * 1024); - journal_size = DEFAULT_JOURNAL_SIZE * (journal_scale + 1); - if (journal_size > MAX_JOURNAL_SIZE) { - journal_size = MAX_JOURNAL_SIZE; - } - if (journal_size < hfsmp->blockSize) { - journal_size = hfsmp->blockSize; - } - return journal_size; -} - - -/* - * Calculate the expected journal size based on current partition size. - * If the size of the current journal is less than the calculated size, - * force journal relocation with the new journal size. - */ -static int -hfs_extend_journal(struct hfsmount *hfsmp, u_int32_t sector_size, u_int64_t sector_count, vfs_context_t context) -{ - int error = 0; - u_int64_t calc_journal_size; - - if (hfsmp->jvp != hfsmp->hfs_devvp) { - if (hfs_resize_debug) { - printf("hfs_extend_journal: not resizing the journal because it is on an external device.\n"); - } - return 0; - } - - calc_journal_size = calculate_journal_size(hfsmp, sector_size, sector_count); - if (calc_journal_size <= hfsmp->jnl_size) { - /* The journal size requires no modification */ - goto out; - } - - if (hfs_resize_debug) { - printf ("hfs_extend_journal: journal old=%u, new=%qd\n", hfsmp->jnl_size, calc_journal_size); - } - - /* Extend the journal to the new calculated size */ - error = hfs_relocate_journal_file(hfsmp, calc_journal_size, HFS_RESIZE_EXTEND, context); - if (error == 0) { - printf ("hfs_extend_journal: Extended journal size to %u bytes on \"%s\"\n", - hfsmp->jnl_size, hfsmp->vcbVN); - } -out: - return error; -} - - -/* - * This function traverses through all extended attribute records for a given - * fileID, and calls function that reclaims data blocks that exist in the - * area of the disk being reclaimed which in turn is responsible for allocating - * new space, copying extent data, deallocating new space, and if required, - * 
splitting the extent. - * - * Note: The caller has already acquired the cnode lock on the file. Therefore - * we are assured that no other thread would be creating/deleting/modifying - * extended attributes for this file. - * - * Side Effects: - * hfsmp->hfs_resize_blocksmoved is incremented by the number of allocation - * blocks that were relocated. - * - * Returns: - * 0 on success, non-zero on failure. - */ -static int -hfs_reclaim_xattr(struct hfsmount *hfsmp, struct vnode *vp, u_int32_t fileID, u_int32_t allocLimit, vfs_context_t context) -{ - int error = 0; - struct hfs_reclaim_extent_info *extent_info; - int i; - HFSPlusAttrKey *key; - int *lockflags; - - if (hfs_resize_debug) { - printf("hfs_reclaim_xattr: === Start reclaiming xattr for id=%u ===\n", fileID); - } - - MALLOC(extent_info, struct hfs_reclaim_extent_info *, - sizeof(struct hfs_reclaim_extent_info), M_TEMP, M_WAITOK); - if (extent_info == NULL) { - return ENOMEM; - } - bzero(extent_info, sizeof(struct hfs_reclaim_extent_info)); - extent_info->vp = vp; - extent_info->fileID = fileID; - extent_info->is_xattr = true; - extent_info->is_sysfile = vnode_issystem(vp); - extent_info->fcb = VTOF(hfsmp->hfs_attribute_vp); - lockflags = &(extent_info->lockflags); - *lockflags = SFL_ATTRIBUTE | SFL_BITMAP; - - /* Initialize iterator from the extent_info structure */ - MALLOC(extent_info->iterator, struct BTreeIterator *, - sizeof(struct BTreeIterator), M_TEMP, M_WAITOK); - if (extent_info->iterator == NULL) { - error = ENOMEM; - goto out; - } - bzero(extent_info->iterator, sizeof(struct BTreeIterator)); - - /* Build attribute key */ - key = (HFSPlusAttrKey *)&(extent_info->iterator->key); - error = hfs_buildattrkey(fileID, NULL, key); - if (error) { - goto out; - } - - /* Initialize btdata from extent_info structure. Note that the - * buffer pointer actually points to the xattr record from the - * extent_info structure itself. 
- */ - extent_info->btdata.bufferAddress = &(extent_info->record.xattr); - extent_info->btdata.itemSize = sizeof(HFSPlusAttrRecord); - extent_info->btdata.itemCount = 1; - - /* - * Sync all extent-based attribute data to the disk. - * - * All extent-based attribute data I/O is performed via cluster - * I/O using a virtual file that spans across entire file system - * space. - */ - hfs_lock_truncate(VTOC(hfsmp->hfs_attrdata_vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); - (void)cluster_push(hfsmp->hfs_attrdata_vp, 0); - error = vnode_waitforwrites(hfsmp->hfs_attrdata_vp, 0, 0, 0, "hfs_reclaim_xattr"); - hfs_unlock_truncate(VTOC(hfsmp->hfs_attrdata_vp), HFS_LOCK_DEFAULT); - if (error) { - goto out; - } - - /* Search for extended attribute for current file. This - * will place the iterator before the first matching record. - */ - *lockflags = hfs_systemfile_lock(hfsmp, *lockflags, HFS_EXCLUSIVE_LOCK); - error = BTSearchRecord(extent_info->fcb, extent_info->iterator, - &(extent_info->btdata), &(extent_info->recordlen), - extent_info->iterator); - hfs_systemfile_unlock(hfsmp, *lockflags); - if (error) { - if (error != btNotFound) { - goto out; - } - /* btNotFound is expected here, so just mask it */ - error = 0; - } - - while (1) { - /* Iterate to the next record */ - *lockflags = hfs_systemfile_lock(hfsmp, *lockflags, HFS_EXCLUSIVE_LOCK); - error = BTIterateRecord(extent_info->fcb, kBTreeNextRecord, - extent_info->iterator, &(extent_info->btdata), - &(extent_info->recordlen)); - hfs_systemfile_unlock(hfsmp, *lockflags); - - /* Stop the iteration if we encounter end of btree or xattr with different fileID */ - if (error || key->fileID != fileID) { - if (error == fsBTRecordNotFoundErr || error == fsBTEndOfIterationErr) { - error = 0; - } - break; - } - - /* We only care about extent-based EAs */ - if ((extent_info->record.xattr.recordType != kHFSPlusAttrForkData) && - (extent_info->record.xattr.recordType != kHFSPlusAttrExtents)) { - continue; - } - - if 
(extent_info->record.xattr.recordType == kHFSPlusAttrForkData) { - extent_info->overflow_count = 0; - extent_info->extents = extent_info->record.xattr.forkData.theFork.extents; - } else if (extent_info->record.xattr.recordType == kHFSPlusAttrExtents) { - extent_info->overflow_count++; - extent_info->extents = extent_info->record.xattr.overflowExtents.extents; - } - - extent_info->recStartBlock = key->startBlock; - for (i = 0; i < kHFSPlusExtentDensity; i++) { - if (extent_info->extents[i].blockCount == 0) { - break; - } - extent_info->extent_index = i; - error = hfs_reclaim_extent(hfsmp, allocLimit, extent_info, context); - if (error) { - printf ("hfs_reclaim_xattr: fileID=%u hfs_reclaim_extent error=%d\n", fileID, error); - goto out; - } - } - } - -out: - /* If any blocks were relocated, account them and report progress */ - if (extent_info->blocks_relocated) { - hfsmp->hfs_resize_blocksmoved += extent_info->blocks_relocated; - hfs_truncatefs_progress(hfsmp); - } - if (extent_info->iterator) { - FREE(extent_info->iterator, M_TEMP); - } - if (extent_info) { - FREE(extent_info, M_TEMP); - } - if (hfs_resize_debug) { - printf("hfs_reclaim_xattr: === Finished relocating xattr for fileid=%u (error=%d) ===\n", fileID, error); - } - return error; -} - -/* - * Reclaim any extent-based extended attributes allocation blocks from - * the area of the disk that is being truncated. - * - * The function traverses the attribute btree to find out the fileIDs - * of the extended attributes that need to be relocated. For every - * file whose large EA requires relocation, it looks up the cnode and - * calls hfs_reclaim_xattr() to do all the work for allocating - * new space, copying data, deallocating old space, and if required, - * splitting the extents. - * - * Inputs: - * allocLimit - starting block of the area being reclaimed - * - * Returns: - * returns 0 on success, non-zero on failure. 
- */ -static int -hfs_reclaim_xattrspace(struct hfsmount *hfsmp, u_int32_t allocLimit, vfs_context_t context) -{ - int error = 0; - FCB *fcb; - struct BTreeIterator *iterator = NULL; - struct FSBufferDescriptor btdata; - HFSPlusAttrKey *key; - HFSPlusAttrRecord rec; - int lockflags = 0; - cnid_t prev_fileid = 0; - struct vnode *vp; - int need_relocate; - int btree_operation; - u_int32_t files_moved = 0; - u_int32_t prev_blocksmoved; - int i; - - fcb = VTOF(hfsmp->hfs_attribute_vp); - /* Store the value to print total blocks moved by this function in end */ - prev_blocksmoved = hfsmp->hfs_resize_blocksmoved; - - if (kmem_alloc(kernel_map, (vm_offset_t *)&iterator, sizeof(*iterator), VM_KERN_MEMORY_FILE)) { - return ENOMEM; - } - bzero(iterator, sizeof(*iterator)); - key = (HFSPlusAttrKey *)&iterator->key; - btdata.bufferAddress = &rec; - btdata.itemSize = sizeof(rec); - btdata.itemCount = 1; - - need_relocate = false; - btree_operation = kBTreeFirstRecord; - /* Traverse the attribute btree to find extent-based EAs to reclaim */ - while (1) { - lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE, HFS_SHARED_LOCK); - error = BTIterateRecord(fcb, btree_operation, iterator, &btdata, NULL); - hfs_systemfile_unlock(hfsmp, lockflags); - if (error) { - if (error == fsBTRecordNotFoundErr || error == fsBTEndOfIterationErr) { - error = 0; - } - break; - } - btree_operation = kBTreeNextRecord; - - /* If the extents of current fileID were already relocated, skip it */ - if (prev_fileid == key->fileID) { - continue; - } - - /* Check if any of the extents in the current record need to be relocated */ - need_relocate = false; - switch(rec.recordType) { - case kHFSPlusAttrForkData: - for (i = 0; i < kHFSPlusExtentDensity; i++) { - if (rec.forkData.theFork.extents[i].blockCount == 0) { - break; - } - if ((rec.forkData.theFork.extents[i].startBlock + - rec.forkData.theFork.extents[i].blockCount) > allocLimit) { - need_relocate = true; - break; - } - } - break; - - case 
kHFSPlusAttrExtents: - for (i = 0; i < kHFSPlusExtentDensity; i++) { - if (rec.overflowExtents.extents[i].blockCount == 0) { - break; - } - if ((rec.overflowExtents.extents[i].startBlock + - rec.overflowExtents.extents[i].blockCount) > allocLimit) { - need_relocate = true; - break; - } - } - break; - }; - - /* Continue iterating to next attribute record */ - if (need_relocate == false) { - continue; - } - - /* Look up the vnode for corresponding file. The cnode - * will be locked which will ensure that no one modifies - * the xattrs when we are relocating them. - * - * We want to allow open-unlinked files to be moved, - * so provide allow_deleted == 1 for hfs_vget(). - */ - if (hfs_vget(hfsmp, key->fileID, &vp, 0, 1) != 0) { - continue; - } - - error = hfs_reclaim_xattr(hfsmp, vp, key->fileID, allocLimit, context); - hfs_unlock(VTOC(vp)); - vnode_put(vp); - if (error) { - printf ("hfs_reclaim_xattrspace: Error relocating xattrs for fileid=%u (error=%d)\n", key->fileID, error); - break; - } - prev_fileid = key->fileID; - files_moved++; - } - - if (files_moved) { - printf("hfs_reclaim_xattrspace: Relocated %u xattr blocks from %u files on \"%s\"\n", - (hfsmp->hfs_resize_blocksmoved - prev_blocksmoved), - files_moved, hfsmp->vcbVN); - } - - kmem_free(kernel_map, (vm_offset_t)iterator, sizeof(*iterator)); - return error; -} - -/* - * Reclaim blocks from regular files. - * - * This function iterates over all the record in catalog btree looking - * for files with extents that overlap into the space we're trying to - * free up. If a file extent requires relocation, it looks up the vnode - * and calls function to relocate the data. - * - * Returns: - * Zero on success, non-zero on failure. 
- */ -static int -hfs_reclaim_filespace(struct hfsmount *hfsmp, u_int32_t allocLimit, vfs_context_t context) -{ - int error; - FCB *fcb; - struct BTreeIterator *iterator = NULL; - struct FSBufferDescriptor btdata; - int btree_operation; - int lockflags; - struct HFSPlusCatalogFile filerec; - struct vnode *vp; - struct vnode *rvp; - struct filefork *datafork; - u_int32_t files_moved = 0; - u_int32_t prev_blocksmoved; - -#if CONFIG_PROTECT - int keys_generated = 0; -#endif - - fcb = VTOF(hfsmp->hfs_catalog_vp); - /* Store the value to print total blocks moved by this function at the end */ - prev_blocksmoved = hfsmp->hfs_resize_blocksmoved; - - if (kmem_alloc(kernel_map, (vm_offset_t *)&iterator, sizeof(*iterator), VM_KERN_MEMORY_FILE)) { - error = ENOMEM; - goto reclaim_filespace_done; - } - -#if CONFIG_PROTECT - /* - * For content-protected filesystems, we may need to relocate files that - * are encrypted. If they use the new-style offset-based IVs, then - * we can move them regardless of the lock state. We create a temporary - * key here that we use to read/write the data, then we discard it at the - * end of the function. 
- */ - if (cp_fs_protected (hfsmp->hfs_mp)) { - error = cpx_gentempkeys(&hfsmp->hfs_resize_cpx, hfsmp); - if (error == 0) { - keys_generated = 1; - } - - if (error) { - printf("hfs_reclaimspace: Error generating temporary keys for resize (%d)\n", error); - goto reclaim_filespace_done; - } - } - -#endif - - bzero(iterator, sizeof(*iterator)); - - btdata.bufferAddress = &filerec; - btdata.itemSize = sizeof(filerec); - btdata.itemCount = 1; - - btree_operation = kBTreeFirstRecord; - while (1) { - lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); - error = BTIterateRecord(fcb, btree_operation, iterator, &btdata, NULL); - hfs_systemfile_unlock(hfsmp, lockflags); - if (error) { - if (error == fsBTRecordNotFoundErr || error == fsBTEndOfIterationErr) { - error = 0; - } - break; - } - btree_operation = kBTreeNextRecord; - - if (filerec.recordType != kHFSPlusFileRecord) { - continue; - } - - /* Check if any of the extents require relocation */ - bool overlaps; - error = hfs_file_extent_overlaps(hfsmp, allocLimit, &filerec, &overlaps); - if (error) - break; - - if (!overlaps) - continue; - - /* We want to allow open-unlinked files to be moved, so allow_deleted == 1 */ - if (hfs_vget(hfsmp, filerec.fileID, &vp, 0, 1) != 0) { - if (hfs_resize_debug) { - printf("hfs_reclaim_filespace: hfs_vget(%u) failed.\n", filerec.fileID); - } - continue; - } - - /* If data fork exists or item is a directory hard link, relocate blocks */ - datafork = VTOF(vp); - if ((datafork && datafork->ff_blocks > 0) || vnode_isdir(vp)) { - error = hfs_reclaim_file(hfsmp, vp, filerec.fileID, - kHFSDataForkType, allocLimit, context); - if (error) { - printf ("hfs_reclaimspace: Error reclaiming datafork blocks of fileid=%u (error=%d)\n", filerec.fileID, error); - hfs_unlock(VTOC(vp)); - vnode_put(vp); - break; - } - } - - /* If resource fork exists or item is a directory hard link, relocate blocks */ - if (((VTOC(vp)->c_blocks - (datafork ? 
datafork->ff_blocks : 0)) > 0) || vnode_isdir(vp)) { - if (vnode_isdir(vp)) { - /* Resource fork vnode lookup is invalid for directory hard link. - * So we fake data fork vnode as resource fork vnode. - */ - rvp = vp; - } else { - error = hfs_vgetrsrc(hfsmp, vp, &rvp); - if (error) { - printf ("hfs_reclaimspace: Error looking up rvp for fileid=%u (error=%d)\n", filerec.fileID, error); - hfs_unlock(VTOC(vp)); - vnode_put(vp); - break; - } - VTOC(rvp)->c_flag |= C_NEED_RVNODE_PUT; - } - - error = hfs_reclaim_file(hfsmp, rvp, filerec.fileID, - kHFSResourceForkType, allocLimit, context); - if (error) { - printf ("hfs_reclaimspace: Error reclaiming rsrcfork blocks of fileid=%u (error=%d)\n", filerec.fileID, error); - hfs_unlock(VTOC(vp)); - vnode_put(vp); - break; - } - } - - /* The file forks were relocated successfully, now drop the - * cnode lock and vnode reference, and continue iterating to - * next catalog record. - */ - hfs_unlock(VTOC(vp)); - vnode_put(vp); - files_moved++; - } - - if (files_moved) { - printf("hfs_reclaim_filespace: Relocated %u blocks from %u files on \"%s\"\n", - (hfsmp->hfs_resize_blocksmoved - prev_blocksmoved), - files_moved, hfsmp->vcbVN); - } - -reclaim_filespace_done: - if (iterator) { - kmem_free(kernel_map, (vm_offset_t)iterator, sizeof(*iterator)); - } - -#if CONFIG_PROTECT - if (keys_generated) { - cpx_free(hfsmp->hfs_resize_cpx); - hfsmp->hfs_resize_cpx = NULL; - } -#endif - return error; -} - -/* - * Reclaim space at the end of a file system. - * - * Inputs - - * allocLimit - start block of the space being reclaimed - * reclaimblks - number of allocation blocks to reclaim - */ -static int -hfs_reclaimspace(struct hfsmount *hfsmp, u_int32_t allocLimit, u_int32_t reclaimblks, vfs_context_t context) -{ - int error = 0; - - /* - * Preflight the bitmap to find out total number of blocks that need - * relocation. 
- * - * Note: Since allocLimit is set to the location of new alternate volume - * header, the check below does not account for blocks allocated for old - * alternate volume header. - */ - error = hfs_count_allocated(hfsmp, allocLimit, reclaimblks, &(hfsmp->hfs_resize_totalblocks)); - if (error) { - printf ("hfs_reclaimspace: Unable to determine total blocks to reclaim error=%d\n", error); - return error; - } - if (hfs_resize_debug) { - printf ("hfs_reclaimspace: Total number of blocks to reclaim = %u\n", hfsmp->hfs_resize_totalblocks); - } - - /* Just to be safe, sync the content of the journal to the disk before we proceed */ - hfs_flush(hfsmp, HFS_FLUSH_JOURNAL_META); - - /* First, relocate journal file blocks if they're in the way. - * Doing this first will make sure that journal relocate code - * gets access to contiguous blocks on disk first. The journal - * file has to be contiguous on the disk, otherwise resize will - * fail. - */ - error = hfs_reclaim_journal_file(hfsmp, allocLimit, context); - if (error) { - printf("hfs_reclaimspace: hfs_reclaim_journal_file failed (%d)\n", error); - return error; - } - - /* Relocate journal info block blocks if they're in the way. */ - error = hfs_reclaim_journal_info_block(hfsmp, allocLimit, context); - if (error) { - printf("hfs_reclaimspace: hfs_reclaim_journal_info_block failed (%d)\n", error); - return error; - } - - /* Relocate extents of the Extents B-tree if they're in the way. - * Relocating extents btree before other btrees is important as - * this will provide access to largest contiguous block range on - * the disk for relocating extents btree. Note that extents btree - * can only have maximum of 8 extents. - */ - error = hfs_reclaim_file(hfsmp, hfsmp->hfs_extents_vp, kHFSExtentsFileID, - kHFSDataForkType, allocLimit, context); - if (error) { - printf("hfs_reclaimspace: reclaim extents b-tree returned %d\n", error); - return error; - } - - /* Relocate extents of the Allocation file if they're in the way. 
*/ - error = hfs_reclaim_file(hfsmp, hfsmp->hfs_allocation_vp, kHFSAllocationFileID, - kHFSDataForkType, allocLimit, context); - if (error) { - printf("hfs_reclaimspace: reclaim allocation file returned %d\n", error); - return error; - } - - /* Relocate extents of the Catalog B-tree if they're in the way. */ - error = hfs_reclaim_file(hfsmp, hfsmp->hfs_catalog_vp, kHFSCatalogFileID, - kHFSDataForkType, allocLimit, context); - if (error) { - printf("hfs_reclaimspace: reclaim catalog b-tree returned %d\n", error); - return error; - } - - /* Relocate extents of the Attributes B-tree if they're in the way. */ - error = hfs_reclaim_file(hfsmp, hfsmp->hfs_attribute_vp, kHFSAttributesFileID, - kHFSDataForkType, allocLimit, context); - if (error) { - printf("hfs_reclaimspace: reclaim attribute b-tree returned %d\n", error); - return error; - } - - /* Relocate extents of the Startup File if there is one and they're in the way. */ - error = hfs_reclaim_file(hfsmp, hfsmp->hfs_startup_vp, kHFSStartupFileID, - kHFSDataForkType, allocLimit, context); - if (error) { - printf("hfs_reclaimspace: reclaim startup file returned %d\n", error); - return error; - } - - /* - * We need to make sure the alternate volume header gets flushed if we moved - * any extents in the volume header. But we need to do that before - * shrinking the size of the volume, or else the journal code will panic - * with an invalid (too large) block number. - * - * Note that blks_moved will be set if ANY extent was moved, even - * if it was just an overflow extent. In this case, the journal_flush isn't - * strictly required, but shouldn't hurt. 
- */ - if (hfsmp->hfs_resize_blocksmoved) { - hfs_flush(hfsmp, HFS_FLUSH_JOURNAL_META); - } - - /* Reclaim extents from catalog file records */ - error = hfs_reclaim_filespace(hfsmp, allocLimit, context); - if (error) { - printf ("hfs_reclaimspace: hfs_reclaim_filespace returned error=%d\n", error); - return error; - } - - /* Reclaim extents from extent-based extended attributes, if any */ - error = hfs_reclaim_xattrspace(hfsmp, allocLimit, context); - if (error) { - printf ("hfs_reclaimspace: hfs_reclaim_xattrspace returned error=%d\n", error); - return error; - } - - /* - * Make sure reserved ranges in the region we're to allocate don't - * overlap. - */ - struct rl_entry *range; -again:; - int lockf = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_SHARED_LOCK); - TAILQ_FOREACH(range, &hfsmp->hfs_reserved_ranges[HFS_LOCKED_BLOCKS], rl_link) { - if (rl_overlap(range, hfsmp->allocLimit, RL_INFINITY) != RL_NOOVERLAP) { - // Wait 100ms - hfs_systemfile_unlock(hfsmp, lockf); - msleep(hfs_reclaimspace, NULL, PINOD, "waiting on reserved blocks", - &(struct timespec){ 0, 100 * 1000000 }); - goto again; - } - } - hfs_systemfile_unlock(hfsmp, lockf); - - return error; -} - - -/* - * Check if there are any extents (including overflow extents) that overlap - * into the disk space that is being reclaimed. 
- * - * Output - - * true - One of the extents need to be relocated - * false - No overflow extents need to be relocated, or there was an error - */ -static errno_t -hfs_file_extent_overlaps(struct hfsmount *hfsmp, u_int32_t allocLimit, - struct HFSPlusCatalogFile *filerec, bool *overlaps) -{ - struct BTreeIterator * iterator = NULL; - struct FSBufferDescriptor btdata; - HFSPlusExtentRecord extrec; - HFSPlusExtentKey *extkeyptr; - FCB *fcb; - int i, j; - int error; - int lockflags = 0; - u_int32_t endblock; - errno_t ret = 0; - - /* Check if data fork overlaps the target space */ - for (i = 0; i < kHFSPlusExtentDensity; ++i) { - if (filerec->dataFork.extents[i].blockCount == 0) { - break; - } - endblock = filerec->dataFork.extents[i].startBlock + - filerec->dataFork.extents[i].blockCount; - if (endblock > allocLimit) { - *overlaps = true; - goto out; - } - } - - /* Check if resource fork overlaps the target space */ - for (j = 0; j < kHFSPlusExtentDensity; ++j) { - if (filerec->resourceFork.extents[j].blockCount == 0) { - break; - } - endblock = filerec->resourceFork.extents[j].startBlock + - filerec->resourceFork.extents[j].blockCount; - if (endblock > allocLimit) { - *overlaps = true; - goto out; - } - } - - /* Return back if there are no overflow extents for this file */ - if ((i < kHFSPlusExtentDensity) && (j < kHFSPlusExtentDensity)) { - *overlaps = false; - goto out; - } - - MALLOC(iterator, BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK); - - bzero(iterator, sizeof(*iterator)); - extkeyptr = (HFSPlusExtentKey *)&iterator->key; - extkeyptr->keyLength = kHFSPlusExtentKeyMaximumLength; - extkeyptr->forkType = 0; - extkeyptr->fileID = filerec->fileID; - extkeyptr->startBlock = 0; - - btdata.bufferAddress = &extrec; - btdata.itemSize = sizeof(extrec); - btdata.itemCount = 1; - - fcb = VTOF(hfsmp->hfs_extents_vp); - - lockflags = hfs_systemfile_lock(hfsmp, SFL_EXTENTS, HFS_SHARED_LOCK); - - /* This will position the iterator just before the first overflow - 
* extent record for given fileID. It will always return btNotFound, - * so we special case the error code. - */ - error = BTSearchRecord(fcb, iterator, &btdata, NULL, iterator); - if (error && (error != btNotFound)) { - ret = MacToVFSError(error); - goto out; - } - - /* BTIterateRecord() might return error if the btree is empty, and - * therefore we return that the extent does not overflow to the caller - */ - error = BTIterateRecord(fcb, kBTreeNextRecord, iterator, &btdata, NULL); - while (error == 0) { - /* Stop when we encounter a different file. */ - if (extkeyptr->fileID != filerec->fileID) { - break; - } - /* Check if any of the forks exist in the target space. */ - for (i = 0; i < kHFSPlusExtentDensity; ++i) { - if (extrec[i].blockCount == 0) { - break; - } - endblock = extrec[i].startBlock + extrec[i].blockCount; - if (endblock > allocLimit) { - *overlaps = true; - goto out; - } - } - /* Look for more records. */ - error = BTIterateRecord(fcb, kBTreeNextRecord, iterator, &btdata, NULL); - } - - if (error && error != btNotFound) { - ret = MacToVFSError(error); - goto out; - } - - *overlaps = false; - -out: - if (lockflags) { - hfs_systemfile_unlock(hfsmp, lockflags); - } - - FREE(iterator, M_TEMP); - - return ret; -} - - -/* - * Calculate the progress of a file system resize operation. - */ -__private_extern__ -int -hfs_resize_progress(struct hfsmount *hfsmp, u_int32_t *progress) -{ - if ((hfsmp->hfs_flags & HFS_RESIZE_IN_PROGRESS) == 0) { - return (ENXIO); - } - - if (hfsmp->hfs_resize_totalblocks > 0) { - *progress = (u_int32_t)((hfsmp->hfs_resize_blocksmoved * 100ULL) / hfsmp->hfs_resize_totalblocks); - } else { - *progress = 0; - } - - return (0); -} diff --git a/bsd/hfs/hfs_search.c b/bsd/hfs/hfs_search.c deleted file mode 100644 index 45cd1a22d..000000000 --- a/bsd/hfs/hfs_search.c +++ /dev/null @@ -1,1407 +0,0 @@ -/* - * Copyright (c) 1997-2013 Apple Inc. All rights reserved. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - * - * @(#)hfs_search.c - */ -/* - * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce - * support for mandatory and extensible security protections. This notice - * is included in support of clause 2.2 (b) of the Apple Public License, - * Version 2.0. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if CONFIG_MACF -#include -#endif - -#include "hfs.h" -#include "hfs_dbg.h" -#include "hfs_catalog.h" -#include "hfs_attrlist.h" -#include "hfs_endian.h" - -#include "hfscommon/headers/FileMgrInternal.h" -#include "hfscommon/headers/HFSUnicodeWrappers.h" -#include "hfscommon/headers/BTreesPrivate.h" -#include "hfscommon/headers/BTreeScanner.h" -#include "hfscommon/headers/CatalogPrivate.h" - -#if CONFIG_SEARCHFS - -/* Search criterea. */ -struct directoryInfoSpec -{ - u_int32_t numFiles; -}; - -struct fileInfoSpec -{ - off_t dataLogicalLength; - off_t dataPhysicalLength; - off_t resourceLogicalLength; - off_t resourcePhysicalLength; -}; - -struct searchinfospec -{ - u_char name[kHFSPlusMaxFileNameBytes]; - u_int32_t nameLength; - char attributes; // see IM:Files 2-100 - u_int32_t nodeID; - u_int32_t parentDirID; - struct timespec creationDate; - struct timespec modificationDate; - struct timespec changeDate; - struct timespec accessDate; - struct timespec lastBackupDate; - u_int8_t finderInfo[32]; - uid_t uid; - gid_t gid; - mode_t mask; - struct fileInfoSpec f; - struct directoryInfoSpec d; -}; -typedef struct searchinfospec searchinfospec_t; - -static void ResolveHardlink(struct hfsmount *hfsmp, HFSPlusCatalogFile *recp); - - -static int UnpackSearchAttributeBlock(struct hfsmount *hfsmp, struct attrlist *alist, - searchinfospec_t *searchInfo, void *attributeBuffer, int firstblock); - -static int CheckCriteria( ExtendedVCB *vcb, - u_long searchBits, - struct attrlist *attrList, - CatalogRecord *rec, - CatalogKey *key, - searchinfospec_t *searchInfo1, - searchinfospec_t *searchInfo2, - struct vfs_context *ctx); - -static int CheckAccess(ExtendedVCB *vcb, u_long searchBits, CatalogKey *key, struct vfs_context *ctx); - -static int InsertMatch(struct hfsmount *hfsmp, uio_t a_uio, CatalogRecord *rec, - 
CatalogKey *key, struct attrlist *returnAttrList, - void *attributesBuffer, void *variableBuffer, - uint32_t * nummatches ); - -static Boolean CompareRange(u_long val, u_long low, u_long high); -static Boolean CompareWideRange(u_int64_t val, u_int64_t low, u_int64_t high); - -static Boolean CompareRange( u_long val, u_long low, u_long high ) -{ - return( (val >= low) && (val <= high) ); -} - -static Boolean CompareWideRange( u_int64_t val, u_int64_t low, u_int64_t high ) -{ - return( (val >= low) && (val <= high) ); -} -//#define CompareRange(val, low, high) ((val >= low) && (val <= high)) - - -/************************************************************************/ -/* Entry for searchfs() */ -/************************************************************************/ - -#define errSearchBufferFull 101 /* Internal search errors */ -/* -# -#% searchfs vp L L L -# -vnop_searchfs { - IN struct vnode *vp; - IN off_t length; - IN int flags; - IN kauth_cred_t cred; - IN struct proc *p; -}; -*/ - -int -hfs_vnop_search(ap) - struct vnop_searchfs_args *ap; /* - struct vnodeop_desc *a_desc; - struct vnode *a_vp; - void *a_searchparams1; - void *a_searchparams2; - struct attrlist *a_searchattrs; - u_long a_maxmatches; - struct timeval *a_timelimit; - struct attrlist *a_returnattrs; - u_long *a_nummatches; - u_long a_scriptcode; - u_long a_options; - struct uio *a_uio; - struct searchstate *a_searchstate; - vfs_context_t a_context; - */ -{ - ExtendedVCB *vcb = VTOVCB(ap->a_vp); - struct hfsmount *hfsmp; - FCB * catalogFCB; - searchinfospec_t searchInfo1; - searchinfospec_t searchInfo2; - void *attributesBuffer = NULL; - void *variableBuffer; - u_int32_t fixedBlockSize; - u_int32_t eachReturnBufferSize; - struct proc *p = current_proc(); - int err = E_NONE; - int isHFSPlus; - CatalogKey * myCurrentKeyPtr; - CatalogRecord * myCurrentDataPtr; - CatPosition * myCatPositionPtr; - BTScanState myBTScanState; - user_addr_t user_start = 0; - user_size_t user_len = 0; - int32_t 
searchTime; - int lockflags; - struct uthread *ut; - boolean_t timerExpired = FALSE; - boolean_t needThrottle = FALSE; - - /* XXX Parameter check a_searchattrs? */ - - *(ap->a_nummatches) = 0; - - if (ap->a_options & ~SRCHFS_VALIDOPTIONSMASK) { - return (EINVAL); - } - - /* - * Fail requests for attributes that HFS does not support for the - * items that match the search criteria. Note that these checks - * are for the OUTBOUND attributes to be returned (not search criteria). - */ - if ((ap->a_returnattrs->commonattr & ~HFS_ATTR_CMN_VALID) || - (ap->a_returnattrs->volattr != 0) || - (ap->a_returnattrs->dirattr & ~HFS_ATTR_DIR_VALID) || - (ap->a_returnattrs->fileattr & ~HFS_ATTR_FILE_VALID) || - (ap->a_returnattrs->forkattr != 0)) { - - return (EINVAL); - } - - /* SRCHFS_SKIPLINKS requires root access. - * This option cannot be used with either - * the ATTR_CMN_NAME or ATTR_CMN_PAROBJID - * attributes. - */ - if (ap->a_options & SRCHFS_SKIPLINKS) { - attrgroup_t attrs; - - attrs = ap->a_searchattrs->commonattr | ap->a_returnattrs->commonattr; - if (attrs & (ATTR_CMN_NAME | ATTR_CMN_PAROBJID)) { - return (EINVAL); - } - - if ((err = vfs_context_suser(ap->a_context))) { - return (err); - } - } - - // If both 32-bit and 64-bit parent ids or file ids are given - // then return an error. 
- - attrgroup_t test_attrs=ap->a_searchattrs->commonattr; - - if (((test_attrs & ATTR_CMN_OBJID) && (test_attrs & ATTR_CMN_FILEID)) || - ((test_attrs & ATTR_CMN_PARENTID) && (test_attrs & ATTR_CMN_PAROBJID))) { - return (EINVAL); - } - - if (uio_resid(ap->a_uio) <= 0) { - return (EINVAL); - } - - isHFSPlus = (vcb->vcbSigWord == kHFSPlusSigWord); - hfsmp = VTOHFS(ap->a_vp); - - searchTime = kMaxMicroSecsInKernel; - if (ap->a_timelimit->tv_sec == 0 && - ap->a_timelimit->tv_usec > 0 && - ap->a_timelimit->tv_usec < kMaxMicroSecsInKernel) { - searchTime = ap->a_timelimit->tv_usec; - } - - /* UnPack the search boundries, searchInfo1, searchInfo2 */ - err = UnpackSearchAttributeBlock(hfsmp, ap->a_searchattrs, - &searchInfo1, ap->a_searchparams1, 1); - if (err) { - return err; - } - err = UnpackSearchAttributeBlock(hfsmp, ap->a_searchattrs, - &searchInfo2, ap->a_searchparams2, 0); - if (err) { - return err; - } - //shadow search bits if 64-bit file/parent ids are used - if (ap->a_searchattrs->commonattr & ATTR_CMN_FILEID) - ap->a_searchattrs->commonattr |= ATTR_CMN_OBJID; - if (ap->a_searchattrs->commonattr & ATTR_CMN_PARENTID) - ap->a_searchattrs->commonattr |= ATTR_CMN_PAROBJID; - - fixedBlockSize = sizeof(u_int32_t) + hfs_attrblksize(ap->a_returnattrs); /* u_int32_t for length word */ - - eachReturnBufferSize = fixedBlockSize; - - if ( ap->a_returnattrs->commonattr & ATTR_CMN_NAME ) /* XXX should be more robust! */ - eachReturnBufferSize += kHFSPlusMaxFileNameBytes + 1; - - MALLOC( attributesBuffer, void *, eachReturnBufferSize, M_TEMP, M_WAITOK ); - if (attributesBuffer == NULL) { - err = ENOMEM; - goto ExitThisRoutine; - } - bzero(attributesBuffer, eachReturnBufferSize); - variableBuffer = (void*)((char*) attributesBuffer + fixedBlockSize); - - // XXXdbg - have to lock the user's buffer so we don't fault - // while holding the shared catalog file lock. see the comment - // in hfs_readdir() for more details. 
- // - if (hfsmp->jnl && uio_isuserspace(ap->a_uio)) { - user_start = uio_curriovbase(ap->a_uio); - user_len = uio_curriovlen(ap->a_uio); - - if ((err = vslock(user_start, user_len)) != 0) { - user_start = 0; - goto ExitThisRoutine; - } - } - - lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); - - catalogFCB = GetFileControlBlock(vcb->catalogRefNum); - myCurrentKeyPtr = NULL; - myCurrentDataPtr = NULL; - myCatPositionPtr = (CatPosition *)ap->a_searchstate; - - if (ap->a_options & SRCHFS_START) { - /* Starting a new search. */ - /* Make sure the on-disk Catalog file is current */ - (void) hfs_fsync(vcb->catalogRefNum, MNT_WAIT, 0, p); - if (hfsmp->jnl) { - hfs_systemfile_unlock(hfsmp, lockflags); - hfs_flush(hfsmp, HFS_FLUSH_JOURNAL); - lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); - } - - ap->a_options &= ~SRCHFS_START; - bzero((caddr_t)myCatPositionPtr, sizeof(*myCatPositionPtr)); - err = BTScanInitialize(catalogFCB, 0, 0, 0, kCatSearchBufferSize, &myBTScanState); - if (err) { - hfs_systemfile_unlock(hfsmp, lockflags); - goto ExitThisRoutine; - } - } else { - /* Resuming a search. */ - err = BTScanInitialize(catalogFCB, myCatPositionPtr->nextNode, - myCatPositionPtr->nextRecord, - myCatPositionPtr->recordsFound, - kCatSearchBufferSize, - &myBTScanState); - /* Make sure Catalog hasn't changed. */ - if (err == 0 - && myCatPositionPtr->writeCount != myBTScanState.btcb->writeCount) { - myCatPositionPtr->writeCount = myBTScanState.btcb->writeCount; - err = EBUSY; /* catChangedErr */ - } - } - hfs_systemfile_unlock(hfsmp, lockflags); - - if (err) - goto ExitThisRoutine; - - if (throttle_get_io_policy(&ut) == IOPOL_THROTTLE) - needThrottle = TRUE; - /* - * Check all the catalog btree records... 
- * return the attributes for matching items - */ - for (;;) { - struct timeval myCurrentTime; - struct timeval myElapsedTime; - - err = BTScanNextRecord(&myBTScanState, timerExpired, - (void **)&myCurrentKeyPtr, (void **)&myCurrentDataPtr, - NULL); - if (err) - break; - - /* Resolve any hardlinks */ - if (isHFSPlus && (ap->a_options & SRCHFS_SKIPLINKS) == 0) { - ResolveHardlink(vcb, (HFSPlusCatalogFile *)myCurrentDataPtr); - } - if (CheckCriteria( vcb, ap->a_options, ap->a_searchattrs, myCurrentDataPtr, - myCurrentKeyPtr, &searchInfo1, &searchInfo2, ap->a_context ) - && CheckAccess(vcb, ap->a_options, myCurrentKeyPtr, ap->a_context)) { - err = InsertMatch(hfsmp, ap->a_uio, myCurrentDataPtr, - myCurrentKeyPtr, ap->a_returnattrs, - attributesBuffer, variableBuffer, ap->a_nummatches); - if (err) { - /* - * The last match didn't fit so come back - * to this record on the next trip. - */ - --myBTScanState.recordsFound; - --myBTScanState.recordNum; - break; - } - - if (*(ap->a_nummatches) >= ap->a_maxmatches) - break; - } - if (timerExpired == FALSE) { - /* - * Check our elapsed time and bail if we've hit the max. - * The idea here is to throttle the amount of time we - * spend in the kernel. 
- */ - microuptime(&myCurrentTime); - timersub(&myCurrentTime, &myBTScanState.startTime, &myElapsedTime); - /* - * Note: assumes kMaxMicroSecsInKernel is less than 1,000,000 - */ - if (myElapsedTime.tv_sec > 0 - || myElapsedTime.tv_usec >= searchTime) { - timerExpired = TRUE; - } else if (needThrottle == TRUE) { - if (throttle_io_will_be_throttled(ut->uu_lowpri_window, HFSTOVFS(hfsmp))) - timerExpired = TRUE; - } - } - } - - /* Update catalog position */ - myCatPositionPtr->writeCount = myBTScanState.btcb->writeCount; - - BTScanTerminate(&myBTScanState, &myCatPositionPtr->nextNode, - &myCatPositionPtr->nextRecord, - &myCatPositionPtr->recordsFound); - - if ( err == E_NONE ) { - err = EAGAIN; /* signal to the user to call searchfs again */ - } else if ( err == errSearchBufferFull ) { - if ( *(ap->a_nummatches) > 0 ) - err = EAGAIN; - else - err = ENOBUFS; - } else if ( err == btNotFound ) { - err = E_NONE; /* the entire disk has been searched */ - } else if ( err == fsBTTimeOutErr ) { - err = EAGAIN; - } - -ExitThisRoutine: - if (attributesBuffer) - FREE(attributesBuffer, M_TEMP); - - if (user_start) { - vsunlock(user_start, user_len, TRUE); - } - - return (MacToVFSError(err)); -} - - -static void -ResolveHardlink(struct hfsmount *hfsmp, HFSPlusCatalogFile *recp) -{ - u_int32_t type, creator; - int isdirlink = 0; - int isfilelink = 0; - time_t filecreatedate; - - if (recp->recordType != kHFSPlusFileRecord) { - return; - } - type = SWAP_BE32(recp->userInfo.fdType); - creator = SWAP_BE32(recp->userInfo.fdCreator); - filecreatedate = to_bsd_time(recp->createDate); - - if ((type == kHardLinkFileType && creator == kHFSPlusCreator) && - (filecreatedate == (time_t)hfsmp->hfs_itime || - filecreatedate == (time_t)hfsmp->hfs_metadata_createdate)) { - isfilelink = 1; - } else if ((type == kHFSAliasType && creator == kHFSAliasCreator) && - (recp->flags & kHFSHasLinkChainMask) && - (filecreatedate == (time_t)hfsmp->hfs_itime || - filecreatedate == 
(time_t)hfsmp->hfs_metadata_createdate)) { - isdirlink = 1; - } - - if (isfilelink || isdirlink) { - cnid_t saved_cnid; - int lockflags; - - /* Export link's cnid (a unique value) instead of inode's cnid */ - saved_cnid = recp->fileID; - lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); - - (void) cat_resolvelink(hfsmp, recp->hl_linkReference, isdirlink, recp); - - recp->fileID = saved_cnid; - hfs_systemfile_unlock(hfsmp, lockflags); - } -} - - -static Boolean -CompareMasked(const u_int32_t *thisValue, const u_int32_t *compareData, - const u_int32_t *compareMask, u_int32_t count) -{ - Boolean matched; - u_int32_t i; - - matched = true; /* Assume it will all match */ - - for (i=0; i= f_len) { - *tsp = f_len; - - if (FastRelString(tsp++, find) == 0) - return TRUE; - } - - return FALSE; -} -#endif - - -/* - * Check to see if caller has access rights to this item - */ - -static int -CheckAccess(ExtendedVCB *theVCBPtr, u_long searchBits, CatalogKey *theKeyPtr, struct vfs_context *ctx) -{ - Boolean isHFSPlus; - int myErr; - int myResult; - HFSCatalogNodeID myNodeID; - hfsmount_t * hfsmp; - struct FndrDirInfo *finfop; - struct vnode * vp = NULL; - - myResult = 0; /* default to "no access" */ - - if (!vfs_context_suser(ctx)) { - myResult = 1; /* allow access */ - goto ExitThisRoutine; /* root always has access */ - } - - hfsmp = VCBTOHFS( theVCBPtr ); - isHFSPlus = ( theVCBPtr->vcbSigWord == kHFSPlusSigWord ); - if ( isHFSPlus ) - myNodeID = theKeyPtr->hfsPlus.parentID; -#if CONFIG_HFS_STD - else - myNodeID = theKeyPtr->hfs.parentID; -#endif - - while ( myNodeID >= kRootDirID ) { - cnode_t * cp; - - /* now go get catalog data for this directory */ - myErr = hfs_vget(hfsmp, myNodeID, &vp, 0, 0); - if ( myErr ) { - goto ExitThisRoutine; /* no access */ - } - - cp = VTOC(vp); - finfop = (struct FndrDirInfo *)&cp->c_attr.ca_finderinfo[0]; - - if ( searchBits & SRCHFS_SKIPPACKAGES ) { - if ( (SWAP_BE16(finfop->frFlags) & kHasBundle) - || 
(cp->c_desc.cd_nameptr != NULL - && is_package_name((const char *)cp->c_desc.cd_nameptr, cp->c_desc.cd_namelen)) ) { - myResult = 0; - goto ExitThisRoutine; - } - } - - if ( searchBits & SRCHFS_SKIPINAPPROPRIATE ) { - if ( cp->c_parentcnid == kRootDirID && cp->c_desc.cd_nameptr != NULL && - vn_searchfs_inappropriate_name((const char *)cp->c_desc.cd_nameptr, cp->c_desc.cd_namelen) ) { - myResult = 0; - goto ExitThisRoutine; - } - } - - if ( (searchBits & SRCHFS_SKIPINVISIBLE) && - (SWAP_BE16(finfop->frFlags) & kIsInvisible) ) { - myResult = 0; - goto ExitThisRoutine; - } - - myNodeID = cp->c_parentcnid; /* move up the hierarchy */ - hfs_unlock(VTOC(vp)); - -#if CONFIG_MACF - if (vp->v_type == VDIR) { - myErr = mac_vnode_check_readdir(ctx, vp); - } else { - myErr = mac_vnode_check_stat(ctx, NOCRED, vp); - } - if (myErr) { - vnode_put(vp); - vp = NULL; - goto ExitThisRoutine; - } -#endif /* MAC */ - - if (vp->v_type == VDIR) { - myErr = vnode_authorize(vp, NULL, (KAUTH_VNODE_SEARCH | KAUTH_VNODE_LIST_DIRECTORY), ctx); - } else { - myErr = vnode_authorize(vp, NULL, (KAUTH_VNODE_SEARCH), ctx); - } - vnode_put(vp); - vp = NULL; - if ( myErr ) { - goto ExitThisRoutine; /* no access */ - } - } - myResult = 1; /* allow access */ - -ExitThisRoutine: - if ( vp != NULL ) { - hfs_unlock(VTOC(vp)); - vnode_put(vp); - } - return ( myResult ); - -} - -static int -CheckCriteria( ExtendedVCB *vcb, - u_long searchBits, - struct attrlist *attrList, - CatalogRecord *rec, - CatalogKey *key, - searchinfospec_t *searchInfo1, - searchinfospec_t *searchInfo2, - struct vfs_context *ctx) -{ - Boolean matched, atleastone; - Boolean isHFSPlus; - attrgroup_t searchAttributes; - struct cat_attr c_attr; - struct cat_fork datafork; - struct cat_fork rsrcfork; - int force_case_sensitivity = proc_is_forcing_hfs_case_sensitivity(vfs_context_proc(ctx)); - - bzero(&c_attr, sizeof(c_attr)); - isHFSPlus = (vcb->vcbSigWord == kHFSPlusSigWord); - - switch (rec->recordType) { - -#if CONFIG_HFS_STD - case 
kHFSFolderRecord: - if ( (searchBits & SRCHFS_MATCHDIRS) == 0 ) { /* If we are NOT searching folders */ - matched = false; - goto TestDone; - } - break; - - case kHFSFileRecord: - if ( (searchBits & SRCHFS_MATCHFILES) == 0 ) { /* If we are NOT searching files */ - matched = false; - goto TestDone; - } - break; -#endif - - case kHFSPlusFolderRecord: - if ( (searchBits & SRCHFS_MATCHDIRS) == 0 ) { /* If we are NOT searching folders */ - matched = false; - goto TestDone; - } - break; - - case kHFSPlusFileRecord: - /* Check if hardlink links should be skipped. */ - if (searchBits & SRCHFS_SKIPLINKS) { - cnid_t parid = key->hfsPlus.parentID; - HFSPlusCatalogFile *filep = (HFSPlusCatalogFile *)rec; - - if ((SWAP_BE32(filep->userInfo.fdType) == kHardLinkFileType) && - (SWAP_BE32(filep->userInfo.fdCreator) == kHFSPlusCreator)) { - return (false); /* skip over file link records */ - } else if ((parid == vcb->hfs_private_desc[FILE_HARDLINKS].cd_cnid) && - (filep->bsdInfo.special.linkCount == 0)) { - return (false); /* skip over unlinked files */ - } else if ((SWAP_BE32(filep->userInfo.fdType) == kHFSAliasType) && - (SWAP_BE32(filep->userInfo.fdCreator) == kHFSAliasCreator) && - (filep->flags & kHFSHasLinkChainMask)) { - return (false); /* skip over dir link records */ - } - } else if (key->hfsPlus.parentID == vcb->hfs_private_desc[FILE_HARDLINKS].cd_cnid) { - return (false); /* skip over private files */ - } else if (key->hfsPlus.parentID == vcb->hfs_private_desc[DIR_HARDLINKS].cd_cnid) { - return (false); /* skip over private files */ - } - - if ( (searchBits & SRCHFS_MATCHFILES) == 0 ) { /* If we are NOT searching files */ - matched = false; - goto TestDone; - } - break; - - default: /* Never match a thread record or any other type. 
*/ - return( false ); /* Not a file or folder record, so can't search it */ - } - - matched = true; /* Assume we got a match */ - atleastone = false; /* Dont insert unless we match at least one criteria */ - - /* First, attempt to match the name -- either partial or complete */ - if ( attrList->commonattr & ATTR_CMN_NAME ) { - if (isHFSPlus) { - int case_sensitive = 0; - - /* - * Longstanding default behavior here is to use a non-case-sensitive - * search, even on case-sensitive filesystems. - * - * We only force case sensitivity if the controlling process has explicitly - * asked for it in the proc flags, and only if they are not doing - * a partial name match. Consider that if you are doing a partial - * name match ("all files that begin with 'image'"), the likelihood is - * high that you would want to see all matches, even those that do not - * explicitly match the case. - */ - if (force_case_sensitivity) { - case_sensitive = 1; - } - - /* Check for partial/full HFS Plus name match */ - - if ( searchBits & SRCHFS_MATCHPARTIALNAMES ) { - /* always use a case-INSENSITIVE search here */ - matched = ComparePartialUnicodeName(key->hfsPlus.nodeName.unicode, - key->hfsPlus.nodeName.length, - (UniChar*)searchInfo1->name, - searchInfo1->nameLength, 0); - } - else { - /* Full name match. Are we HFSX (case sensitive) or HFS+ ? 
*/ - if (case_sensitive) { - matched = (UnicodeBinaryCompare(key->hfsPlus.nodeName.unicode, - key->hfsPlus.nodeName.length, - (UniChar*)searchInfo1->name, - searchInfo1->nameLength ) == 0); - } - else { - matched = (FastUnicodeCompare(key->hfsPlus.nodeName.unicode, - key->hfsPlus.nodeName.length, - (UniChar*)searchInfo1->name, - searchInfo1->nameLength ) == 0); - } - } - } -#if CONFIG_HFS_STD - else { - /* Check for partial/full HFS name match */ - - if ( searchBits & SRCHFS_MATCHPARTIALNAMES ) - matched = ComparePartialPascalName(key->hfs.nodeName, (u_char*)searchInfo1->name); - else /* full HFS name match */ - matched = (FastRelString(key->hfs.nodeName, (u_char*)searchInfo1->name) == 0); - } -#endif - - if ( matched == false || (searchBits & ~SRCHFS_MATCHPARTIALNAMES) == 0 ) - goto TestDone; /* no match, or nothing more to compare */ - - atleastone = true; - } - - /* Convert catalog record into cat_attr format. */ - cat_convertattr(VCBTOHFS(vcb), rec, &c_attr, &datafork, &rsrcfork); - - if (searchBits & SRCHFS_SKIPINVISIBLE) { - int flags; - - switch (rec->recordType) { -#if CONFIG_HFS_STD - case kHFSFolderRecord: - { - struct FndrDirInfo *finder_info; - - finder_info = (struct FndrDirInfo *)&c_attr.ca_finderinfo[0]; - flags = SWAP_BE16(finder_info->frFlags); - break; - } - - case kHFSFileRecord: - { - struct FndrFileInfo *finder_info; - - finder_info = (struct FndrFileInfo *)&c_attr.ca_finderinfo[0]; - flags = SWAP_BE16(finder_info->fdFlags); - break; - } -#endif - - case kHFSPlusFolderRecord: - { - struct FndrDirInfo *finder_info; - - finder_info = (struct FndrDirInfo *)&c_attr.ca_finderinfo[0]; - flags = SWAP_BE16(finder_info->frFlags); - break; - } - - case kHFSPlusFileRecord: - { - struct FndrFileInfo *finder_info; - - finder_info = (struct FndrFileInfo *)&c_attr.ca_finderinfo[0]; - flags = SWAP_BE16(finder_info->fdFlags); - break; - } - - default: - { - flags = kIsInvisible; - break; - } - } - - if (flags & kIsInvisible) { - matched = false; - goto 
TestDone; - } - } - - - - /* Now that we have a record worth searching, see if it matches the search attributes */ -#if CONFIG_HFS_STD - if (rec->recordType == kHFSFileRecord || - rec->recordType == kHFSPlusFileRecord) { -#else - if (rec->recordType == kHFSPlusFileRecord) { -#endif - - if ((attrList->fileattr & ~ATTR_FILE_VALIDMASK) != 0) { /* attr we do know about */ - matched = false; - goto TestDone; - } - else if ((attrList->fileattr & ATTR_FILE_VALIDMASK) != 0) { - searchAttributes = attrList->fileattr; - -#if HFS_COMPRESSION - if ( c_attr.ca_flags & UF_COMPRESSED ) { - /* for compressed files, set the data length to the uncompressed data size */ - if (( searchAttributes & ATTR_FILE_DATALENGTH ) || - ( searchAttributes & ATTR_FILE_DATAALLOCSIZE ) ) { - if ( 0 == hfs_uncompressed_size_of_compressed_file(vcb, NULL, c_attr.ca_fileid, &datafork.cf_size, 1) ) { /* 1 == don't take the cnode lock */ - datafork.cf_blocks = rsrcfork.cf_blocks; - } - } - /* treat compressed files as if their resource fork is empty */ - if (( searchAttributes & ATTR_FILE_RSRCLENGTH ) || - ( searchAttributes & ATTR_FILE_RSRCALLOCSIZE ) ) { - rsrcfork.cf_size = 0; - rsrcfork.cf_blocks = 0; - } - } -#endif /* HFS_COMPRESSION */ - - /* File logical length (data fork) */ - if ( searchAttributes & ATTR_FILE_DATALENGTH ) { - matched = CompareWideRange( - datafork.cf_size, - searchInfo1->f.dataLogicalLength, - searchInfo2->f.dataLogicalLength); - if (matched == false) goto TestDone; - atleastone = true; - } - - /* File physical length (data fork) */ - if ( searchAttributes & ATTR_FILE_DATAALLOCSIZE ) { - matched = CompareWideRange( - (u_int64_t)datafork.cf_blocks * (u_int64_t)vcb->blockSize, - searchInfo1->f.dataPhysicalLength, - searchInfo2->f.dataPhysicalLength); - if (matched == false) goto TestDone; - atleastone = true; - } - - /* File logical length (resource fork) */ - if ( searchAttributes & ATTR_FILE_RSRCLENGTH ) { - matched = CompareWideRange( - rsrcfork.cf_size, - 
searchInfo1->f.resourceLogicalLength, - searchInfo2->f.resourceLogicalLength); - if (matched == false) goto TestDone; - atleastone = true; - } - - /* File physical length (resource fork) */ - if ( searchAttributes & ATTR_FILE_RSRCALLOCSIZE ) { - matched = CompareWideRange( - (u_int64_t)rsrcfork.cf_blocks * (u_int64_t)vcb->blockSize, - searchInfo1->f.resourcePhysicalLength, - searchInfo2->f.resourcePhysicalLength); - if (matched == false) goto TestDone; - atleastone = true; - } - } - else { - atleastone = true; /* to match SRCHFS_MATCHFILES */ - } - } - /* - * Check the directory attributes - */ -#if CONFIG_HFS_STD - else if (rec->recordType == kHFSFolderRecord || - rec->recordType == kHFSPlusFolderRecord) { -#else - else if (rec->recordType == kHFSPlusFolderRecord) { -#endif - if ((attrList->dirattr & ~ATTR_DIR_VALIDMASK) != 0) { /* attr we do know about */ - matched = false; - goto TestDone; - } - else if ((attrList->dirattr & ATTR_DIR_VALIDMASK) != 0) { - searchAttributes = attrList->dirattr; - - /* Directory valence */ - if ( searchAttributes & ATTR_DIR_ENTRYCOUNT ) { - matched = CompareRange(c_attr.ca_entries, - searchInfo1->d.numFiles, - searchInfo2->d.numFiles ); - if (matched == false) goto TestDone; - atleastone = true; - } - } - else { - atleastone = true; /* to match SRCHFS_MATCHDIRS */ - } - } - - /* - * Check the common attributes - */ - searchAttributes = attrList->commonattr; - if ( (searchAttributes & ATTR_CMN_VALIDMASK) != 0 ) { - /* node ID */ - if ( searchAttributes & ATTR_CMN_OBJID ) { - matched = CompareRange(c_attr.ca_fileid, - searchInfo1->nodeID, - searchInfo2->nodeID ); - if (matched == false) goto TestDone; - atleastone = true; - } - - /* Parent ID */ - if ( searchAttributes & ATTR_CMN_PAROBJID ) { - HFSCatalogNodeID parentID; - - if (isHFSPlus) - parentID = key->hfsPlus.parentID; -#if CONFIG_HFS_STD - else - parentID = key->hfs.parentID; -#endif - - matched = CompareRange(parentID, searchInfo1->parentDirID, - searchInfo2->parentDirID ); - 
if (matched == false) goto TestDone; - atleastone = true; - } - - /* Finder Info & Extended Finder Info where extFinderInfo is last 32 bytes */ - if ( searchAttributes & ATTR_CMN_FNDRINFO ) { - u_int32_t *thisValue; - thisValue = (u_int32_t *) &c_attr.ca_finderinfo; - - /* - * Note: ioFlFndrInfo and ioDrUsrWds have the same offset in search info, so - * no need to test the object type here. - */ - matched = CompareMasked(thisValue, - (u_int32_t *)&searchInfo1->finderInfo, - (u_int32_t *) &searchInfo2->finderInfo, 8); - if (matched == false) goto TestDone; - atleastone = true; - } - - /* Create date */ - if ( searchAttributes & ATTR_CMN_CRTIME ) { - matched = CompareRange(c_attr.ca_itime, - searchInfo1->creationDate.tv_sec, - searchInfo2->creationDate.tv_sec); - if (matched == false) goto TestDone; - atleastone = true; - } - - /* Mod date */ - if ( searchAttributes & ATTR_CMN_MODTIME ) { - matched = CompareRange(c_attr.ca_mtime, - searchInfo1->modificationDate.tv_sec, - searchInfo2->modificationDate.tv_sec); - if (matched == false) goto TestDone; - atleastone = true; - } - - /* Change Time */ - if ( searchAttributes & ATTR_CMN_CHGTIME ) { - matched = CompareRange(c_attr.ca_ctime, - searchInfo1->changeDate.tv_sec, - searchInfo2->changeDate.tv_sec); - if (matched == false) goto TestDone; - atleastone = true; - } - - /* Access date */ - if ( searchAttributes & ATTR_CMN_ACCTIME ) { - matched = CompareRange(c_attr.ca_atime, - searchInfo1->accessDate.tv_sec, - searchInfo2->accessDate.tv_sec); - if (matched == false) goto TestDone; - atleastone = true; - } - - /* Backup date */ - if ( searchAttributes & ATTR_CMN_BKUPTIME ) { - matched = CompareRange(c_attr.ca_btime, - searchInfo1->lastBackupDate.tv_sec, - searchInfo2->lastBackupDate.tv_sec); - if (matched == false) goto TestDone; - atleastone = true; - } - - /* User ID */ - if ( searchAttributes & ATTR_CMN_OWNERID ) { - matched = CompareRange(c_attr.ca_uid, - searchInfo1->uid, searchInfo2->uid); - if (matched == false) 
goto TestDone; - atleastone = true; - } - - /* Group ID */ - if ( searchAttributes & ATTR_CMN_GRPID ) { - matched = CompareRange(c_attr.ca_gid, - searchInfo1->gid, searchInfo2->gid); - if (matched == false) goto TestDone; - atleastone = true; - } - - /* mode */ - if ( searchAttributes & ATTR_CMN_ACCESSMASK ) { - matched = CompareRange((u_int32_t)c_attr.ca_mode, - (u_int32_t)searchInfo1->mask, - (u_int32_t)searchInfo2->mask); - if (matched == false) goto TestDone; - atleastone = true; - } - } - - /* If we got here w/o matching any, then set to false */ - if (! atleastone) - matched = false; - -TestDone: - /* - * Finally, determine whether we need to negate the sense of the match - * (i.e. find all objects that DON'T match). - */ - if ( searchBits & SRCHFS_NEGATEPARAMS ) - matched = !matched; - - return( matched ); -} - - -/* - * Adds another record to the packed array for output - */ -static int -InsertMatch(struct hfsmount *hfsmp, uio_t a_uio, CatalogRecord *rec, - CatalogKey *key, struct attrlist *returnAttrList, - void *attributesBuffer, void *variableBuffer, uint32_t * nummatches) -{ - int err; - void *rovingAttributesBuffer; - void *rovingVariableBuffer; - long packedBufferSize; - struct attrblock attrblk; - struct cat_desc c_desc; - struct cat_attr c_attr; - struct cat_fork datafork; - struct cat_fork rsrcfork; - - bzero(&c_desc, sizeof(c_desc)); - bzero(&c_attr, sizeof(c_attr)); - rovingAttributesBuffer = (char*)attributesBuffer + sizeof(u_int32_t); /* Reserve space for length field */ - rovingVariableBuffer = variableBuffer; - - /* Convert catalog record into cat_attr format. 
*/ - cat_convertattr(hfsmp, rec, &c_attr, &datafork, &rsrcfork); - - /* Hide our private meta data directories */ - if (c_attr.ca_fileid == hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid || - c_attr.ca_fileid == hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid) { - err = 0; - goto exit; - } - - /* Hide the private journal files */ - if (hfsmp->jnl && - ((c_attr.ca_fileid == hfsmp->hfs_jnlfileid) || - (c_attr.ca_fileid == hfsmp->hfs_jnlinfoblkid))) { - err = 0; - goto exit; - } - - if (returnAttrList->commonattr & ATTR_CMN_NAME) { - err = cat_convertkey(hfsmp, key, rec, &c_desc); - if (err) { - /* This means that we probably had a CNID error */ - goto exit; - } - } else { - c_desc.cd_cnid = c_attr.ca_fileid; - if ((hfsmp->hfs_flags & HFS_STANDARD) == 0) - c_desc.cd_parentcnid = key->hfsPlus.parentID; -#if CONFIG_HFS_STD - else - c_desc.cd_parentcnid = key->hfs.parentID; -#endif - - } - - attrblk.ab_attrlist = returnAttrList; - attrblk.ab_attrbufpp = &rovingAttributesBuffer; - attrblk.ab_varbufpp = &rovingVariableBuffer; - attrblk.ab_flags = 0; - attrblk.ab_blocksize = 0; - attrblk.ab_context = vfs_context_current(); - - hfs_packattrblk(&attrblk, hfsmp, NULL, &c_desc, &c_attr, &datafork, &rsrcfork, vfs_context_current()); - - packedBufferSize = (char*)rovingVariableBuffer - (char*)attributesBuffer; - - if ( packedBufferSize > uio_resid(a_uio) ) - return( errSearchBufferFull ); - - (* nummatches)++; - - *((u_int32_t *)attributesBuffer) = packedBufferSize; /* Store length of fixed + var block */ - - err = uiomove( (caddr_t)attributesBuffer, packedBufferSize, a_uio ); -exit: - cat_releasedesc(&c_desc); - - return( err ); -} - - -static int -UnpackSearchAttributeBlock( struct hfsmount *hfsmp, struct attrlist *alist, - searchinfospec_t *searchInfo, void *attributeBuffer, int firstblock) -{ - attrgroup_t a; - u_int32_t bufferSize; - boolean_t is_64_bit; - - DBG_ASSERT(searchInfo != NULL); - - is_64_bit = proc_is64bit(current_proc()); - - bufferSize = *((u_int32_t 
*)attributeBuffer); - if (bufferSize == 0) - return (EINVAL); /* XXX -DJB is a buffer size of zero ever valid for searchfs? */ - - attributeBuffer = (u_int32_t *)attributeBuffer + 1; /* advance past the size */ - - /* - * UnPack common attributes - */ - a = alist->commonattr; - if ( a != 0 ) { - if ( a & ATTR_CMN_NAME ) { - if (firstblock) { - /* Only use the attrreference_t for the first searchparams */ - char *s; - u_int32_t len; - - s = (char*) attributeBuffer + ((attrreference_t *) attributeBuffer)->attr_dataoffset; - len = ((attrreference_t *) attributeBuffer)->attr_length; - - if (len > sizeof(searchInfo->name)) - return (EINVAL); - - - if ((hfsmp->hfs_flags & HFS_STANDARD) == 0) { - size_t ucslen; - /* Convert name to Unicode to match HFS Plus B-Tree names */ - - if (len > 0) { - if (utf8_decodestr((u_int8_t *)s, len-1, (UniChar*)searchInfo->name, &ucslen, - sizeof(searchInfo->name), ':', UTF_DECOMPOSED | UTF_ESCAPE_ILLEGAL)) - return (EINVAL); - - searchInfo->nameLength = ucslen / sizeof(UniChar); - } else { - searchInfo->nameLength = 0; - } - } -#if CONFIG_HFS_STD - else { - /* Convert name to pascal string to match HFS (Standard) B-Tree names */ - - if (len > 0) { - if (utf8_to_hfs(HFSTOVCB(hfsmp), len-1, (u_char *)s, (u_char*)searchInfo->name) != 0) - return (EINVAL); - - searchInfo->nameLength = searchInfo->name[0]; - } else { - searchInfo->name[0] = searchInfo->nameLength = 0; - } - } -#endif - } - attributeBuffer = (attrreference_t*) attributeBuffer +1; - } - if ( a & ATTR_CMN_OBJID ) { - searchInfo->nodeID = ((fsobj_id_t *) attributeBuffer)->fid_objno; /* ignore fid_generation */ - attributeBuffer = (fsobj_id_t *)attributeBuffer + 1; - } - if ( a & ATTR_CMN_PAROBJID ) { - searchInfo->parentDirID = ((fsobj_id_t *) attributeBuffer)->fid_objno; /* ignore fid_generation */ - attributeBuffer = (fsobj_id_t *)attributeBuffer + 1; - } - - if ( a & ATTR_CMN_CRTIME ) { - if (is_64_bit) { - struct user64_timespec tmp; - tmp = *((struct user64_timespec 
*)attributeBuffer); - searchInfo->creationDate.tv_sec = (time_t)tmp.tv_sec; - searchInfo->creationDate.tv_nsec = tmp.tv_nsec; - attributeBuffer = (struct user64_timespec *)attributeBuffer + 1; - } - else { - struct user32_timespec tmp; - tmp = *((struct user32_timespec *)attributeBuffer); - searchInfo->creationDate.tv_sec = (time_t)tmp.tv_sec; - searchInfo->creationDate.tv_nsec = tmp.tv_nsec; - attributeBuffer = (struct user32_timespec *)attributeBuffer + 1; - } - } - if ( a & ATTR_CMN_MODTIME ) { - if (is_64_bit) { - struct user64_timespec tmp; - tmp = *((struct user64_timespec *)attributeBuffer); - searchInfo->modificationDate.tv_sec = (time_t)tmp.tv_sec; - searchInfo->modificationDate.tv_nsec = tmp.tv_nsec; - attributeBuffer = (struct user64_timespec *)attributeBuffer + 1; - } - else { - struct user32_timespec tmp; - tmp = *((struct user32_timespec *)attributeBuffer); - searchInfo->modificationDate.tv_sec = (time_t)tmp.tv_sec; - searchInfo->modificationDate.tv_nsec = tmp.tv_nsec; - attributeBuffer = (struct user32_timespec *)attributeBuffer + 1; - } - } - if ( a & ATTR_CMN_CHGTIME ) { - if (is_64_bit) { - struct user64_timespec tmp; - tmp = *((struct user64_timespec *)attributeBuffer); - searchInfo->changeDate.tv_sec = (time_t)tmp.tv_sec; - searchInfo->changeDate.tv_nsec = tmp.tv_nsec; - attributeBuffer = (struct user64_timespec *)attributeBuffer + 1; - } - else { - struct user32_timespec tmp; - tmp = *((struct user32_timespec *)attributeBuffer); - searchInfo->changeDate.tv_sec = (time_t)tmp.tv_sec; - searchInfo->changeDate.tv_nsec = tmp.tv_nsec; - attributeBuffer = (struct user32_timespec *)attributeBuffer + 1; - } - } - if ( a & ATTR_CMN_ACCTIME ) { - if (is_64_bit) { - struct user64_timespec tmp; - tmp = *((struct user64_timespec *)attributeBuffer); - searchInfo->accessDate.tv_sec = (time_t)tmp.tv_sec; - searchInfo->accessDate.tv_nsec = tmp.tv_nsec; - attributeBuffer = (struct user64_timespec *)attributeBuffer + 1; - } - else { - struct user32_timespec tmp; - 
tmp = *((struct user32_timespec *)attributeBuffer); - searchInfo->accessDate.tv_sec = (time_t)tmp.tv_sec; - searchInfo->accessDate.tv_nsec = tmp.tv_nsec; - attributeBuffer = (struct user32_timespec *)attributeBuffer + 1; - } - } - if ( a & ATTR_CMN_BKUPTIME ) { - if (is_64_bit) { - struct user64_timespec tmp; - tmp = *((struct user64_timespec *)attributeBuffer); - searchInfo->lastBackupDate.tv_sec = (time_t)tmp.tv_sec; - searchInfo->lastBackupDate.tv_nsec = tmp.tv_nsec; - attributeBuffer = (struct user64_timespec *)attributeBuffer + 1; - } - else { - struct user32_timespec tmp; - tmp = *((struct user32_timespec *)attributeBuffer); - searchInfo->lastBackupDate.tv_sec = (time_t)tmp.tv_sec; - searchInfo->lastBackupDate.tv_nsec = tmp.tv_nsec; - attributeBuffer = (struct user32_timespec *)attributeBuffer + 1; - } - } - if ( a & ATTR_CMN_FNDRINFO ) { - bcopy( attributeBuffer, searchInfo->finderInfo, sizeof(searchInfo->finderInfo) ); - attributeBuffer = (u_int8_t *)attributeBuffer + 32; - } - if ( a & ATTR_CMN_OWNERID ) { - searchInfo->uid = *((uid_t *)attributeBuffer); - attributeBuffer = (uid_t *)attributeBuffer + 1; - } - if ( a & ATTR_CMN_GRPID ) { - searchInfo->gid = *((gid_t *)attributeBuffer); - attributeBuffer = (gid_t *)attributeBuffer + 1; - } - if ( a & ATTR_CMN_ACCESSMASK ) { - searchInfo->mask = *((mode_t *)attributeBuffer); - attributeBuffer = (mode_t *)attributeBuffer + 1; - } - if ( a & ATTR_CMN_FILEID ) { - searchInfo->nodeID = (u_int32_t)*((u_int64_t *) attributeBuffer); - attributeBuffer = (u_int64_t *)attributeBuffer + 1; - } - if ( a & ATTR_CMN_PARENTID ) { - searchInfo->parentDirID = (u_int32_t)*((u_int64_t *) attributeBuffer); - attributeBuffer = (u_int64_t *)attributeBuffer + 1; - } - } - - a = alist->dirattr; - if ( a != 0 ) { - if ( a & ATTR_DIR_ENTRYCOUNT ) { - searchInfo->d.numFiles = *((u_int32_t *)attributeBuffer); - attributeBuffer = (u_int32_t *)attributeBuffer + 1; - } - } - - a = alist->fileattr; - if ( a != 0 ) { - if ( a & 
ATTR_FILE_DATALENGTH ) { - searchInfo->f.dataLogicalLength = *((off_t *)attributeBuffer); - attributeBuffer = (off_t *)attributeBuffer + 1; - } - if ( a & ATTR_FILE_DATAALLOCSIZE ) { - searchInfo->f.dataPhysicalLength = *((off_t *)attributeBuffer); - attributeBuffer = (off_t *)attributeBuffer + 1; - } - if ( a & ATTR_FILE_RSRCLENGTH ) { - searchInfo->f.resourceLogicalLength = *((off_t *)attributeBuffer); - attributeBuffer = (off_t *)attributeBuffer + 1; - } - if ( a & ATTR_FILE_RSRCALLOCSIZE ) { - searchInfo->f.resourcePhysicalLength = *((off_t *)attributeBuffer); - attributeBuffer = (off_t *)attributeBuffer + 1; - } - } - - return (0); -} -#endif /* CONFIG_SEARCHFS */ diff --git a/bsd/hfs/hfs_vfsops.c b/bsd/hfs/hfs_vfsops.c deleted file mode 100644 index 71380f628..000000000 --- a/bsd/hfs/hfs_vfsops.c +++ /dev/null @@ -1,4792 +0,0 @@ -/* - * Copyright (c) 1999-2015 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * Copyright (c) 1991, 1993, 1994 - * The Regents of the University of California. All rights reserved. - * (c) UNIX System Laboratories, Inc. - * All or some portions of this file are derived from material licensed - * to the University of California by American Telephone and Telegraph - * Co. or Unix System Laboratories, Inc. and are reproduced herein with - * the permission of UNIX System Laboratories, Inc. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * hfs_vfsops.c - * derived from @(#)ufs_vfsops.c 8.8 (Berkeley) 5/20/95 - * - * (c) Copyright 1997-2002 Apple Computer, Inc. All rights reserved. - * - * hfs_vfsops.c -- VFS layer for loadable HFS file system. 
- * - */ -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* for parsing boot-args */ -#include - - -#include - -#include - -#include -#include - -#include -#include - -#include "hfs.h" -#include "hfs_catalog.h" -#include "hfs_cnode.h" -#include "hfs_dbg.h" -#include "hfs_endian.h" -#include "hfs_hotfiles.h" -#include "hfs_quota.h" -#include "hfs_btreeio.h" -#include "hfs_kdebug.h" -#include "hfs_cprotect.h" - -#include "hfscommon/headers/FileMgrInternal.h" -#include "hfscommon/headers/BTreesInternal.h" - -#define HFS_MOUNT_DEBUG 1 - -#if HFS_DIAGNOSTIC -int hfs_dbg_all = 0; -int hfs_dbg_err = 0; -#endif - -/* Enable/disable debugging code for live volume resizing, defined in hfs_resize.c */ -extern int hfs_resize_debug; - -lck_grp_attr_t * hfs_group_attr; -lck_attr_t * hfs_lock_attr; -lck_grp_t * hfs_mutex_group; -lck_grp_t * hfs_rwlock_group; -lck_grp_t * hfs_spinlock_group; - -extern struct vnodeopv_desc hfs_vnodeop_opv_desc; - -#if CONFIG_HFS_STD -extern struct vnodeopv_desc hfs_std_vnodeop_opv_desc; -static int hfs_flushMDB(struct hfsmount *hfsmp, int waitfor, int altflush); -#endif - -/* not static so we can re-use in hfs_readwrite.c for build_path calls */ -int hfs_vfs_vget(struct mount *mp, ino64_t ino, struct vnode **vpp, vfs_context_t context); - -static int hfs_changefs(struct mount *mp, struct hfs_mount_args *args); -static int hfs_fhtovp(struct mount *mp, int fhlen, unsigned char *fhp, struct vnode **vpp, vfs_context_t context); -static int hfs_flushfiles(struct mount *, int, struct proc *); -static int hfs_getmountpoint(struct vnode *vp, struct hfsmount **hfsmpp); -static int hfs_init(struct vfsconf *vfsp); -static void hfs_locks_destroy(struct hfsmount *hfsmp); -static int hfs_vfs_root(struct mount *mp, struct vnode **vpp, vfs_context_t context); -static int hfs_quotactl(struct mount *, int, uid_t, caddr_t, 
vfs_context_t context); -static int hfs_start(struct mount *mp, int flags, vfs_context_t context); -static int hfs_vptofh(struct vnode *vp, int *fhlenp, unsigned char *fhp, vfs_context_t context); -static int hfs_journal_replay(vnode_t devvp, vfs_context_t context); -static void hfs_syncer_free(struct hfsmount *hfsmp); - -void hfs_initialize_allocator (struct hfsmount *hfsmp); -int hfs_teardown_allocator (struct hfsmount *hfsmp); - -int hfs_mount(struct mount *mp, vnode_t devvp, user_addr_t data, vfs_context_t context); -int hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, int journal_replay_only, vfs_context_t context); -int hfs_reload(struct mount *mp); -int hfs_statfs(struct mount *mp, register struct vfsstatfs *sbp, vfs_context_t context); -int hfs_sync(struct mount *mp, int waitfor, vfs_context_t context); -int hfs_sysctl(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp, - user_addr_t newp, size_t newlen, vfs_context_t context); -int hfs_unmount(struct mount *mp, int mntflags, vfs_context_t context); - -/* - * Called by vfs_mountroot when mounting HFS Plus as root. - */ - -int -hfs_mountroot(mount_t mp, vnode_t rvp, vfs_context_t context) -{ - struct hfsmount *hfsmp; - ExtendedVCB *vcb; - struct vfsstatfs *vfsp; - int error; - - if ((error = hfs_mountfs(rvp, mp, NULL, 0, context))) { - if (HFS_MOUNT_DEBUG) { - printf("hfs_mountroot: hfs_mountfs returned %d, rvp (%p) name (%s) \n", - error, rvp, (rvp->v_name ? rvp->v_name : "unknown device")); - } - return (error); - } - - /* Init hfsmp */ - hfsmp = VFSTOHFS(mp); - - hfsmp->hfs_uid = UNKNOWNUID; - hfsmp->hfs_gid = UNKNOWNGID; - hfsmp->hfs_dir_mask = (S_IRWXU | S_IRGRP|S_IXGRP | S_IROTH|S_IXOTH); /* 0755 */ - hfsmp->hfs_file_mask = (S_IRWXU | S_IRGRP|S_IXGRP | S_IROTH|S_IXOTH); /* 0755 */ - - /* Establish the free block reserve. 
*/ - vcb = HFSTOVCB(hfsmp); - vcb->reserveBlocks = ((u_int64_t)vcb->totalBlocks * HFS_MINFREE) / 100; - vcb->reserveBlocks = MIN(vcb->reserveBlocks, HFS_MAXRESERVE / vcb->blockSize); - - vfsp = vfs_statfs(mp); - (void)hfs_statfs(mp, vfsp, NULL); - - return (0); -} - - -/* - * VFS Operations. - * - * mount system call - */ - -int -hfs_mount(struct mount *mp, vnode_t devvp, user_addr_t data, vfs_context_t context) -{ - struct proc *p = vfs_context_proc(context); - struct hfsmount *hfsmp = NULL; - struct hfs_mount_args args; - int retval = E_NONE; - u_int32_t cmdflags; - - if ((retval = copyin(data, (caddr_t)&args, sizeof(args)))) { - if (HFS_MOUNT_DEBUG) { - printf("hfs_mount: copyin returned %d for fs\n", retval); - } - return (retval); - } - cmdflags = (u_int32_t)vfs_flags(mp) & MNT_CMDFLAGS; - if (cmdflags & MNT_UPDATE) { - hfsmp = VFSTOHFS(mp); - - /* Reload incore data after an fsck. */ - if (cmdflags & MNT_RELOAD) { - if (vfs_isrdonly(mp)) { - int error = hfs_reload(mp); - if (error && HFS_MOUNT_DEBUG) { - printf("hfs_mount: hfs_reload returned %d on %s \n", error, hfsmp->vcbVN); - } - return error; - } - else { - if (HFS_MOUNT_DEBUG) { - printf("hfs_mount: MNT_RELOAD not supported on rdwr filesystem %s\n", hfsmp->vcbVN); - } - return (EINVAL); - } - } - - /* Change to a read-only file system. */ - if (((hfsmp->hfs_flags & HFS_READ_ONLY) == 0) && - vfs_isrdonly(mp)) { - int flags; - - /* Set flag to indicate that a downgrade to read-only - * is in progress and therefore block any further - * modifications to the file system. 
- */ - hfs_lock_global (hfsmp, HFS_EXCLUSIVE_LOCK); - hfsmp->hfs_flags |= HFS_RDONLY_DOWNGRADE; - hfsmp->hfs_downgrading_thread = current_thread(); - hfs_unlock_global (hfsmp); - hfs_syncer_free(hfsmp); - - /* use VFS_SYNC to push out System (btree) files */ - retval = VFS_SYNC(mp, MNT_WAIT, context); - if (retval && ((cmdflags & MNT_FORCE) == 0)) { - hfsmp->hfs_flags &= ~HFS_RDONLY_DOWNGRADE; - hfsmp->hfs_downgrading_thread = NULL; - if (HFS_MOUNT_DEBUG) { - printf("hfs_mount: VFS_SYNC returned %d during b-tree sync of %s \n", retval, hfsmp->vcbVN); - } - goto out; - } - - flags = WRITECLOSE; - if (cmdflags & MNT_FORCE) - flags |= FORCECLOSE; - - if ((retval = hfs_flushfiles(mp, flags, p))) { - hfsmp->hfs_flags &= ~HFS_RDONLY_DOWNGRADE; - hfsmp->hfs_downgrading_thread = NULL; - if (HFS_MOUNT_DEBUG) { - printf("hfs_mount: hfs_flushfiles returned %d on %s \n", retval, hfsmp->vcbVN); - } - goto out; - } - - /* mark the volume cleanly unmounted */ - hfsmp->vcbAtrb |= kHFSVolumeUnmountedMask; - retval = hfs_flushvolumeheader(hfsmp, HFS_FVH_WAIT); - hfsmp->hfs_flags |= HFS_READ_ONLY; - - /* - * Close down the journal. - * - * NOTE: It is critically important to close down the journal - * and have it issue all pending I/O prior to calling VNOP_FSYNC below. - * In a journaled environment it is expected that the journal be - * the only actor permitted to issue I/O for metadata blocks in HFS. - * If we were to call VNOP_FSYNC prior to closing down the journal, - * we would inadvertantly issue (and wait for) the I/O we just - * initiated above as part of the flushvolumeheader call. - * - * To avoid this, we follow the same order of operations as in - * unmount and issue the journal_close prior to calling VNOP_FSYNC. 
- */ - - if (hfsmp->jnl) { - hfs_lock_global (hfsmp, HFS_EXCLUSIVE_LOCK); - - journal_close(hfsmp->jnl); - hfsmp->jnl = NULL; - - // Note: we explicitly don't want to shutdown - // access to the jvp because we may need - // it later if we go back to being read-write. - - hfs_unlock_global (hfsmp); - - vfs_clearflags(hfsmp->hfs_mp, MNT_JOURNALED); - } - - /* - * Write out any pending I/O still outstanding against the device node - * now that the journal has been closed. - */ - if (retval == 0) { - vnode_get(hfsmp->hfs_devvp); - retval = VNOP_FSYNC(hfsmp->hfs_devvp, MNT_WAIT, context); - vnode_put(hfsmp->hfs_devvp); - } - - if (retval) { - if (HFS_MOUNT_DEBUG) { - printf("hfs_mount: FSYNC on devvp returned %d for fs %s\n", retval, hfsmp->vcbVN); - } - hfsmp->hfs_flags &= ~HFS_RDONLY_DOWNGRADE; - hfsmp->hfs_downgrading_thread = NULL; - hfsmp->hfs_flags &= ~HFS_READ_ONLY; - goto out; - } - - if (hfsmp->hfs_flags & HFS_SUMMARY_TABLE) { - if (hfsmp->hfs_summary_table) { - int err = 0; - /* - * Take the bitmap lock to serialize against a concurrent bitmap scan still in progress - */ - if (hfsmp->hfs_allocation_vp) { - err = hfs_lock (VTOC(hfsmp->hfs_allocation_vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); - } - FREE (hfsmp->hfs_summary_table, M_TEMP); - hfsmp->hfs_summary_table = NULL; - hfsmp->hfs_flags &= ~HFS_SUMMARY_TABLE; - if (err == 0 && hfsmp->hfs_allocation_vp){ - hfs_unlock (VTOC(hfsmp->hfs_allocation_vp)); - } - } - } - - hfsmp->hfs_downgrading_thread = NULL; - } - - /* Change to a writable file system. */ - if (vfs_iswriteupgrade(mp)) { - /* - * On inconsistent disks, do not allow read-write mount - * unless it is the boot volume being mounted. 
- */ - if (!(vfs_flags(mp) & MNT_ROOTFS) && - (hfsmp->vcbAtrb & kHFSVolumeInconsistentMask)) { - if (HFS_MOUNT_DEBUG) { - printf("hfs_mount: attempting to mount inconsistent non-root volume %s\n", (hfsmp->vcbVN)); - } - retval = EINVAL; - goto out; - } - - // If the journal was shut-down previously because we were - // asked to be read-only, let's start it back up again now - - if ( (HFSTOVCB(hfsmp)->vcbAtrb & kHFSVolumeJournaledMask) - && hfsmp->jnl == NULL - && hfsmp->jvp != NULL) { - int jflags; - - if (hfsmp->hfs_flags & HFS_NEED_JNL_RESET) { - jflags = JOURNAL_RESET; - } else { - jflags = 0; - } - - hfs_lock_global (hfsmp, HFS_EXCLUSIVE_LOCK); - - /* We provide the mount point twice here: The first is used as - * an opaque argument to be passed back when hfs_sync_metadata - * is called. The second is provided to the throttling code to - * indicate which mount's device should be used when accounting - * for metadata writes. - */ - hfsmp->jnl = journal_open(hfsmp->jvp, - hfs_blk_to_bytes(hfsmp->jnl_start, HFSTOVCB(hfsmp)->blockSize) + (off_t)HFSTOVCB(hfsmp)->hfsPlusIOPosOffset, - hfsmp->jnl_size, - hfsmp->hfs_devvp, - hfsmp->hfs_logical_block_size, - jflags, - 0, - hfs_sync_metadata, hfsmp->hfs_mp, - hfsmp->hfs_mp); - - /* - * Set up the trim callback function so that we can add - * recently freed extents to the free extent cache once - * the transaction that freed them is written to the - * journal on disk. - */ - if (hfsmp->jnl) - journal_trim_set_callback(hfsmp->jnl, hfs_trim_callback, hfsmp); - - hfs_unlock_global (hfsmp); - - if (hfsmp->jnl == NULL) { - if (HFS_MOUNT_DEBUG) { - printf("hfs_mount: journal_open == NULL; couldn't be opened on %s \n", (hfsmp->vcbVN)); - } - retval = EINVAL; - goto out; - } else { - hfsmp->hfs_flags &= ~HFS_NEED_JNL_RESET; - vfs_setflags(hfsmp->hfs_mp, MNT_JOURNALED); - } - } - - /* See if we need to erase unused Catalog nodes due to . 
*/ - retval = hfs_erase_unused_nodes(hfsmp); - if (retval != E_NONE) { - if (HFS_MOUNT_DEBUG) { - printf("hfs_mount: hfs_erase_unused_nodes returned %d for fs %s\n", retval, hfsmp->vcbVN); - } - goto out; - } - - /* If this mount point was downgraded from read-write - * to read-only, clear that information as we are now - * moving back to read-write. - */ - hfsmp->hfs_flags &= ~HFS_RDONLY_DOWNGRADE; - hfsmp->hfs_downgrading_thread = NULL; - - /* mark the volume dirty (clear clean unmount bit) */ - hfsmp->vcbAtrb &= ~kHFSVolumeUnmountedMask; - - retval = hfs_flushvolumeheader(hfsmp, HFS_FVH_WAIT); - if (retval != E_NONE) { - if (HFS_MOUNT_DEBUG) { - printf("hfs_mount: hfs_flushvolumeheader returned %d for fs %s\n", retval, hfsmp->vcbVN); - } - goto out; - } - - /* Only clear HFS_READ_ONLY after a successful write */ - hfsmp->hfs_flags &= ~HFS_READ_ONLY; - - - if (!(hfsmp->hfs_flags & (HFS_READ_ONLY | HFS_STANDARD))) { - /* Setup private/hidden directories for hardlinks. */ - hfs_privatedir_init(hfsmp, FILE_HARDLINKS); - hfs_privatedir_init(hfsmp, DIR_HARDLINKS); - - hfs_remove_orphans(hfsmp); - - /* - * Since we're upgrading to a read-write mount, allow - * hot file clustering if conditions allow. - * - * Note: this normally only would happen if you booted - * single-user and upgraded the mount to read-write - * - * Note: at this point we are not allowed to fail the - * mount operation because the HotFile init code - * in hfs_recording_init() will lookup vnodes with - * VNOP_LOOKUP() which hangs vnodes off the mount - * (and if we were to fail, VFS is not prepared to - * clean that up at this point. Since HotFiles are - * optional, this is not a big deal. - */ - if ((hfsmp->hfs_flags & HFS_METADATA_ZONE) && - (((hfsmp->hfs_mp->mnt_kern_flag & MNTK_SSD) == 0) || (hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN)) ) { - (void) hfs_recording_init(hfsmp); - } - /* Force ACLs on HFS+ file systems. 
*/ - if (vfs_extendedsecurity(HFSTOVFS(hfsmp)) == 0) { - vfs_setextendedsecurity(HFSTOVFS(hfsmp)); - } - } - } - - /* Update file system parameters. */ - retval = hfs_changefs(mp, &args); - if (retval && HFS_MOUNT_DEBUG) { - printf("hfs_mount: hfs_changefs returned %d for %s\n", retval, hfsmp->vcbVN); - } - - } else /* not an update request */ { - - /* Set the mount flag to indicate that we support volfs */ - vfs_setflags(mp, (u_int64_t)((unsigned int)MNT_DOVOLFS)); - - retval = hfs_mountfs(devvp, mp, &args, 0, context); - if (retval) { - const char *name = vnode_getname(devvp); - printf("hfs_mount: hfs_mountfs returned error=%d for device %s\n", retval, (name ? name : "unknown-dev")); - if (name) { - vnode_putname(name); - } - goto out; - } - - /* After hfs_mountfs succeeds, we should have valid hfsmp */ - hfsmp = VFSTOHFS(mp); - - } - -out: - if (retval == 0) { - (void)hfs_statfs(mp, vfs_statfs(mp), context); - } - return (retval); -} - - -struct hfs_changefs_cargs { - struct hfsmount *hfsmp; - int namefix; - int permfix; - int permswitch; -}; - -static int -hfs_changefs_callback(struct vnode *vp, void *cargs) -{ - ExtendedVCB *vcb; - struct cnode *cp; - struct cat_desc cndesc; - struct cat_attr cnattr; - struct hfs_changefs_cargs *args; - int lockflags; - int error; - - args = (struct hfs_changefs_cargs *)cargs; - - cp = VTOC(vp); - vcb = HFSTOVCB(args->hfsmp); - - lockflags = hfs_systemfile_lock(args->hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); - error = cat_lookup(args->hfsmp, &cp->c_desc, 0, 0, &cndesc, &cnattr, NULL, NULL); - hfs_systemfile_unlock(args->hfsmp, lockflags); - if (error) { - /* - * If we couldn't find this guy skip to the next one - */ - if (args->namefix) - cache_purge(vp); - - return (VNODE_RETURNED); - } - /* - * Get the real uid/gid and perm mask from disk. - */ - if (args->permswitch || args->permfix) { - cp->c_uid = cnattr.ca_uid; - cp->c_gid = cnattr.ca_gid; - cp->c_mode = cnattr.ca_mode; - } - /* - * If we're switching name converters then... 
- * Remove the existing entry from the namei cache. - * Update name to one based on new encoder. - */ - if (args->namefix) { - cache_purge(vp); - replace_desc(cp, &cndesc); - - if (cndesc.cd_cnid == kHFSRootFolderID) { - strlcpy((char *)vcb->vcbVN, (const char *)cp->c_desc.cd_nameptr, NAME_MAX+1); - cp->c_desc.cd_encoding = args->hfsmp->hfs_encoding; - } - } else { - cat_releasedesc(&cndesc); - } - return (VNODE_RETURNED); -} - -/* Change fs mount parameters */ -static int -hfs_changefs(struct mount *mp, struct hfs_mount_args *args) -{ - int retval = 0; - int namefix, permfix, permswitch; - struct hfsmount *hfsmp; - ExtendedVCB *vcb; - struct hfs_changefs_cargs cargs; - u_int32_t mount_flags; - -#if CONFIG_HFS_STD - u_int32_t old_encoding = 0; - hfs_to_unicode_func_t get_unicode_func; - unicode_to_hfs_func_t get_hfsname_func; -#endif - - hfsmp = VFSTOHFS(mp); - vcb = HFSTOVCB(hfsmp); - mount_flags = (unsigned int)vfs_flags(mp); - - hfsmp->hfs_flags |= HFS_IN_CHANGEFS; - - permswitch = (((hfsmp->hfs_flags & HFS_UNKNOWN_PERMS) && - ((mount_flags & MNT_UNKNOWNPERMISSIONS) == 0)) || - (((hfsmp->hfs_flags & HFS_UNKNOWN_PERMS) == 0) && - (mount_flags & MNT_UNKNOWNPERMISSIONS))); - - /* The root filesystem must operate with actual permissions: */ - if (permswitch && (mount_flags & MNT_ROOTFS) && (mount_flags & MNT_UNKNOWNPERMISSIONS)) { - vfs_clearflags(mp, (u_int64_t)((unsigned int)MNT_UNKNOWNPERMISSIONS)); /* Just say "No". */ - retval = EINVAL; - goto exit; - } - if (mount_flags & MNT_UNKNOWNPERMISSIONS) - hfsmp->hfs_flags |= HFS_UNKNOWN_PERMS; - else - hfsmp->hfs_flags &= ~HFS_UNKNOWN_PERMS; - - namefix = permfix = 0; - - /* - * Tracking of hot files requires up-to-date access times. So if - * access time updates are disabled, we must also disable hot files. 
- */ - if (mount_flags & MNT_NOATIME) { - (void) hfs_recording_suspend(hfsmp); - } - - /* Change the timezone (Note: this affects all hfs volumes and hfs+ volume create dates) */ - if (args->hfs_timezone.tz_minuteswest != VNOVAL) { - gTimeZone = args->hfs_timezone; - } - - /* Change the default uid, gid and/or mask */ - if ((args->hfs_uid != (uid_t)VNOVAL) && (hfsmp->hfs_uid != args->hfs_uid)) { - hfsmp->hfs_uid = args->hfs_uid; - if (vcb->vcbSigWord == kHFSPlusSigWord) - ++permfix; - } - if ((args->hfs_gid != (gid_t)VNOVAL) && (hfsmp->hfs_gid != args->hfs_gid)) { - hfsmp->hfs_gid = args->hfs_gid; - if (vcb->vcbSigWord == kHFSPlusSigWord) - ++permfix; - } - if (args->hfs_mask != (mode_t)VNOVAL) { - if (hfsmp->hfs_dir_mask != (args->hfs_mask & ALLPERMS)) { - hfsmp->hfs_dir_mask = args->hfs_mask & ALLPERMS; - hfsmp->hfs_file_mask = args->hfs_mask & ALLPERMS; - if ((args->flags != VNOVAL) && (args->flags & HFSFSMNT_NOXONFILES)) - hfsmp->hfs_file_mask = (args->hfs_mask & DEFFILEMODE); - if (vcb->vcbSigWord == kHFSPlusSigWord) - ++permfix; - } - } - -#if CONFIG_HFS_STD - /* Change the hfs encoding value (hfs only) */ - if ((vcb->vcbSigWord == kHFSSigWord) && - (args->hfs_encoding != (u_int32_t)VNOVAL) && - (hfsmp->hfs_encoding != args->hfs_encoding)) { - - retval = hfs_getconverter(args->hfs_encoding, &get_unicode_func, &get_hfsname_func); - if (retval) - goto exit; - - /* - * Connect the new hfs_get_unicode converter but leave - * the old hfs_get_hfsname converter in place so that - * we can lookup existing vnodes to get their correctly - * encoded names. - * - * When we're all finished, we can then connect the new - * hfs_get_hfsname converter and release our interest - * in the old converters. 
- */ - hfsmp->hfs_get_unicode = get_unicode_func; - old_encoding = hfsmp->hfs_encoding; - hfsmp->hfs_encoding = args->hfs_encoding; - ++namefix; - } -#endif - - if (!(namefix || permfix || permswitch)) - goto exit; - - /* XXX 3762912 hack to support HFS filesystem 'owner' */ - if (permfix) - vfs_setowner(mp, - hfsmp->hfs_uid == UNKNOWNUID ? KAUTH_UID_NONE : hfsmp->hfs_uid, - hfsmp->hfs_gid == UNKNOWNGID ? KAUTH_GID_NONE : hfsmp->hfs_gid); - - /* - * For each active vnode fix things that changed - * - * Note that we can visit a vnode more than once - * and we can race with fsync. - * - * hfs_changefs_callback will be called for each vnode - * hung off of this mount point - * - * The vnode will be properly referenced and unreferenced - * around the callback - */ - cargs.hfsmp = hfsmp; - cargs.namefix = namefix; - cargs.permfix = permfix; - cargs.permswitch = permswitch; - - vnode_iterate(mp, 0, hfs_changefs_callback, (void *)&cargs); - -#if CONFIG_HFS_STD - /* - * If we're switching name converters we can now - * connect the new hfs_get_hfsname converter and - * release our interest in the old converters. - */ - if (namefix) { - /* HFS standard only */ - hfsmp->hfs_get_hfsname = get_hfsname_func; - vcb->volumeNameEncodingHint = args->hfs_encoding; - (void) hfs_relconverter(old_encoding); - } -#endif - -exit: - hfsmp->hfs_flags &= ~HFS_IN_CHANGEFS; - return (retval); -} - - -struct hfs_reload_cargs { - struct hfsmount *hfsmp; - int error; -}; - -static int -hfs_reload_callback(struct vnode *vp, void *cargs) -{ - struct cnode *cp; - struct hfs_reload_cargs *args; - int lockflags; - - args = (struct hfs_reload_cargs *)cargs; - /* - * flush all the buffers associated with this node - */ - (void) buf_invalidateblks(vp, 0, 0, 0); - - cp = VTOC(vp); - /* - * Remove any directory hints - */ - if (vnode_isdir(vp)) - hfs_reldirhints(cp, 0); - - /* - * Re-read cnode data for all active vnodes (non-metadata files). 
- */ - if (!vnode_issystem(vp) && !VNODE_IS_RSRC(vp) && (cp->c_fileid >= kHFSFirstUserCatalogNodeID)) { - struct cat_fork *datafork; - struct cat_desc desc; - - datafork = cp->c_datafork ? &cp->c_datafork->ff_data : NULL; - - /* lookup by fileID since name could have changed */ - lockflags = hfs_systemfile_lock(args->hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); - args->error = cat_idlookup(args->hfsmp, cp->c_fileid, 0, 0, &desc, &cp->c_attr, datafork); - hfs_systemfile_unlock(args->hfsmp, lockflags); - if (args->error) { - return (VNODE_RETURNED_DONE); - } - - /* update cnode's catalog descriptor */ - (void) replace_desc(cp, &desc); - } - return (VNODE_RETURNED); -} - -/* - * Reload all incore data for a filesystem (used after running fsck on - * the root filesystem and finding things to fix). The filesystem must - * be mounted read-only. - * - * Things to do to update the mount: - * invalidate all cached meta-data. - * invalidate all inactive vnodes. - * invalidate all cached file data. - * re-read volume header from disk. - * re-load meta-file info (extents, file size). - * re-load B-tree header data. - * re-read cnode data for all active vnodes. - */ -int -hfs_reload(struct mount *mountp) -{ - register struct vnode *devvp; - struct buf *bp; - int error, i; - struct hfsmount *hfsmp; - struct HFSPlusVolumeHeader *vhp; - ExtendedVCB *vcb; - struct filefork *forkp; - struct cat_desc cndesc; - struct hfs_reload_cargs args; - daddr64_t priIDSector; - - hfsmp = VFSTOHFS(mountp); - vcb = HFSTOVCB(hfsmp); - - if (vcb->vcbSigWord == kHFSSigWord) - return (EINVAL); /* rooting from HFS is not supported! */ - - /* - * Invalidate all cached meta-data. - */ - devvp = hfsmp->hfs_devvp; - if (buf_invalidateblks(devvp, 0, 0, 0)) - panic("hfs_reload: dirty1"); - - args.hfsmp = hfsmp; - args.error = 0; - /* - * hfs_reload_callback will be called for each vnode - * hung off of this mount point that can't be recycled... 
- * vnode_iterate will recycle those that it can (the VNODE_RELOAD option) - * the vnode will be in an 'unbusy' state (VNODE_WAIT) and - * properly referenced and unreferenced around the callback - */ - vnode_iterate(mountp, VNODE_RELOAD | VNODE_WAIT, hfs_reload_callback, (void *)&args); - - if (args.error) - return (args.error); - - /* - * Re-read VolumeHeader from disk. - */ - priIDSector = (daddr64_t)((vcb->hfsPlusIOPosOffset / hfsmp->hfs_logical_block_size) + - HFS_PRI_SECTOR(hfsmp->hfs_logical_block_size)); - - error = (int)buf_meta_bread(hfsmp->hfs_devvp, - HFS_PHYSBLK_ROUNDDOWN(priIDSector, hfsmp->hfs_log_per_phys), - hfsmp->hfs_physical_block_size, NOCRED, &bp); - if (error) { - if (bp != NULL) - buf_brelse(bp); - return (error); - } - - vhp = (HFSPlusVolumeHeader *) (buf_dataptr(bp) + HFS_PRI_OFFSET(hfsmp->hfs_physical_block_size)); - - /* Do a quick sanity check */ - if ((SWAP_BE16(vhp->signature) != kHFSPlusSigWord && - SWAP_BE16(vhp->signature) != kHFSXSigWord) || - (SWAP_BE16(vhp->version) != kHFSPlusVersion && - SWAP_BE16(vhp->version) != kHFSXVersion) || - SWAP_BE32(vhp->blockSize) != vcb->blockSize) { - buf_brelse(bp); - return (EIO); - } - - vcb->vcbLsMod = to_bsd_time(SWAP_BE32(vhp->modifyDate)); - vcb->vcbAtrb = SWAP_BE32 (vhp->attributes); - vcb->vcbJinfoBlock = SWAP_BE32(vhp->journalInfoBlock); - vcb->vcbClpSiz = SWAP_BE32 (vhp->rsrcClumpSize); - vcb->vcbNxtCNID = SWAP_BE32 (vhp->nextCatalogID); - vcb->vcbVolBkUp = to_bsd_time(SWAP_BE32(vhp->backupDate)); - vcb->vcbWrCnt = SWAP_BE32 (vhp->writeCount); - vcb->vcbFilCnt = SWAP_BE32 (vhp->fileCount); - vcb->vcbDirCnt = SWAP_BE32 (vhp->folderCount); - HFS_UPDATE_NEXT_ALLOCATION(vcb, SWAP_BE32 (vhp->nextAllocation)); - vcb->totalBlocks = SWAP_BE32 (vhp->totalBlocks); - vcb->freeBlocks = SWAP_BE32 (vhp->freeBlocks); - vcb->encodingsBitmap = SWAP_BE64 (vhp->encodingsBitmap); - bcopy(vhp->finderInfo, vcb->vcbFndrInfo, sizeof(vhp->finderInfo)); - vcb->localCreateDate = SWAP_BE32 (vhp->createDate); /* 
hfs+ create date is in local time */ - - /* - * Re-load meta-file vnode data (extent info, file size, etc). - */ - forkp = VTOF((struct vnode *)vcb->extentsRefNum); - for (i = 0; i < kHFSPlusExtentDensity; i++) { - forkp->ff_extents[i].startBlock = - SWAP_BE32 (vhp->extentsFile.extents[i].startBlock); - forkp->ff_extents[i].blockCount = - SWAP_BE32 (vhp->extentsFile.extents[i].blockCount); - } - forkp->ff_size = SWAP_BE64 (vhp->extentsFile.logicalSize); - forkp->ff_blocks = SWAP_BE32 (vhp->extentsFile.totalBlocks); - forkp->ff_clumpsize = SWAP_BE32 (vhp->extentsFile.clumpSize); - - - forkp = VTOF((struct vnode *)vcb->catalogRefNum); - for (i = 0; i < kHFSPlusExtentDensity; i++) { - forkp->ff_extents[i].startBlock = - SWAP_BE32 (vhp->catalogFile.extents[i].startBlock); - forkp->ff_extents[i].blockCount = - SWAP_BE32 (vhp->catalogFile.extents[i].blockCount); - } - forkp->ff_size = SWAP_BE64 (vhp->catalogFile.logicalSize); - forkp->ff_blocks = SWAP_BE32 (vhp->catalogFile.totalBlocks); - forkp->ff_clumpsize = SWAP_BE32 (vhp->catalogFile.clumpSize); - - if (hfsmp->hfs_attribute_vp) { - forkp = VTOF(hfsmp->hfs_attribute_vp); - for (i = 0; i < kHFSPlusExtentDensity; i++) { - forkp->ff_extents[i].startBlock = - SWAP_BE32 (vhp->attributesFile.extents[i].startBlock); - forkp->ff_extents[i].blockCount = - SWAP_BE32 (vhp->attributesFile.extents[i].blockCount); - } - forkp->ff_size = SWAP_BE64 (vhp->attributesFile.logicalSize); - forkp->ff_blocks = SWAP_BE32 (vhp->attributesFile.totalBlocks); - forkp->ff_clumpsize = SWAP_BE32 (vhp->attributesFile.clumpSize); - } - - forkp = VTOF((struct vnode *)vcb->allocationsRefNum); - for (i = 0; i < kHFSPlusExtentDensity; i++) { - forkp->ff_extents[i].startBlock = - SWAP_BE32 (vhp->allocationFile.extents[i].startBlock); - forkp->ff_extents[i].blockCount = - SWAP_BE32 (vhp->allocationFile.extents[i].blockCount); - } - forkp->ff_size = SWAP_BE64 (vhp->allocationFile.logicalSize); - forkp->ff_blocks = SWAP_BE32 
(vhp->allocationFile.totalBlocks); - forkp->ff_clumpsize = SWAP_BE32 (vhp->allocationFile.clumpSize); - - buf_brelse(bp); - vhp = NULL; - - /* - * Re-load B-tree header data - */ - forkp = VTOF((struct vnode *)vcb->extentsRefNum); - if ( (error = MacToVFSError( BTReloadData((FCB*)forkp) )) ) - return (error); - - forkp = VTOF((struct vnode *)vcb->catalogRefNum); - if ( (error = MacToVFSError( BTReloadData((FCB*)forkp) )) ) - return (error); - - if (hfsmp->hfs_attribute_vp) { - forkp = VTOF(hfsmp->hfs_attribute_vp); - if ( (error = MacToVFSError( BTReloadData((FCB*)forkp) )) ) - return (error); - } - - /* Reload the volume name */ - if ((error = cat_idlookup(hfsmp, kHFSRootFolderID, 0, 0, &cndesc, NULL, NULL))) - return (error); - vcb->volumeNameEncodingHint = cndesc.cd_encoding; - bcopy(cndesc.cd_nameptr, vcb->vcbVN, min(255, cndesc.cd_namelen)); - cat_releasedesc(&cndesc); - - /* Re-establish private/hidden directories. */ - hfs_privatedir_init(hfsmp, FILE_HARDLINKS); - hfs_privatedir_init(hfsmp, DIR_HARDLINKS); - - /* In case any volume information changed to trigger a notification */ - hfs_generate_volume_notifications(hfsmp); - - return (0); -} - -__unused -static uint64_t tv_to_usecs(struct timeval *tv) -{ - return tv->tv_sec * 1000000ULL + tv->tv_usec; -} - -// Returns TRUE if b - a >= usecs -static boolean_t hfs_has_elapsed (const struct timeval *a, - const struct timeval *b, - uint64_t usecs) -{ - struct timeval diff; - timersub(b, a, &diff); - return diff.tv_sec * 1000000ULL + diff.tv_usec >= usecs; -} - -static void -hfs_syncer(void *arg0, __unused void *unused) -{ - struct hfsmount *hfsmp = arg0; - struct timeval now; - - microuptime(&now); - - KERNEL_DEBUG_CONSTANT(HFSDBG_SYNCER | DBG_FUNC_START, hfsmp, - tv_to_usecs(&now), - tv_to_usecs(&hfsmp->hfs_mp->mnt_last_write_completed_timestamp), - hfsmp->hfs_mp->mnt_pending_write_size, 0); - - hfs_syncer_lock(hfsmp); - - if (!hfsmp->hfs_syncer) { - // hfs_unmount is waiting for us leave now and let it do the 
sync - hfsmp->hfs_sync_incomplete = FALSE; - hfs_syncer_unlock(hfsmp); - hfs_syncer_wakeup(hfsmp); - return; - } - - /* Check to see whether we should flush now: either the oldest is - > HFS_MAX_META_DELAY or HFS_META_DELAY has elapsed since the - request and there are no pending writes. */ - - boolean_t flush_now = FALSE; - - if (hfs_has_elapsed(&hfsmp->hfs_sync_req_oldest, &now, HFS_MAX_META_DELAY)) - flush_now = TRUE; - else if (!hfsmp->hfs_mp->mnt_pending_write_size) { - /* N.B. accessing mnt_last_write_completed_timestamp is not thread safe, but - it won't matter for what we're using it for. */ - if (hfs_has_elapsed(&hfsmp->hfs_mp->mnt_last_write_completed_timestamp, - &now, - HFS_META_DELAY)) { - flush_now = TRUE; - } - } - - if (!flush_now) { - thread_call_t syncer = hfsmp->hfs_syncer; - - hfs_syncer_unlock(hfsmp); - - hfs_syncer_queue(syncer); - - return; - } - - timerclear(&hfsmp->hfs_sync_req_oldest); - - hfs_syncer_unlock(hfsmp); - - KERNEL_DEBUG_CONSTANT(HFSDBG_SYNCER_TIMED | DBG_FUNC_START, - tv_to_usecs(&now), - tv_to_usecs(&hfsmp->hfs_mp->mnt_last_write_completed_timestamp), - tv_to_usecs(&hfsmp->hfs_mp->mnt_last_write_issued_timestamp), - hfsmp->hfs_mp->mnt_pending_write_size, 0); - - if (hfsmp->hfs_syncer_thread) { - printf("hfs: syncer already running!\n"); - return; - } - - hfsmp->hfs_syncer_thread = current_thread(); - - /* - * We intentionally do a synchronous flush (of the journal or entire volume) here. - * For journaled volumes, this means we wait until the metadata blocks are written - * to both the journal and their final locations (in the B-trees, etc.). - * - * This tends to avoid interleaving the metadata writes with other writes (for - * example, user data, or to the journal when a later transaction notices that - * an earlier transaction has finished its async writes, and then updates the - * journal start in the journal header). 
Avoiding interleaving of writes is - * very good for performance on simple flash devices like SD cards, thumb drives; - * and on devices like floppies. Since removable devices tend to be this kind of - * simple device, doing a synchronous flush actually improves performance in - * practice. - * - * NOTE: For non-journaled volumes, the call to hfs_sync will also cause dirty - * user data to be written. - */ - if (hfsmp->jnl) { - hfs_flush(hfsmp, HFS_FLUSH_JOURNAL_META); - } else { - hfs_sync(hfsmp->hfs_mp, MNT_WAIT, vfs_context_kernel()); - } - - KERNEL_DEBUG_CONSTANT(HFSDBG_SYNCER_TIMED | DBG_FUNC_END, - (microuptime(&now), tv_to_usecs(&now)), - tv_to_usecs(&hfsmp->hfs_mp->mnt_last_write_completed_timestamp), - tv_to_usecs(&hfsmp->hfs_mp->mnt_last_write_issued_timestamp), - hfsmp->hfs_mp->mnt_pending_write_size, 0); - - hfsmp->hfs_syncer_thread = NULL; - - hfs_syncer_lock(hfsmp); - - // If hfs_unmount lets us and we missed a sync, schedule again - if (hfsmp->hfs_syncer && timerisset(&hfsmp->hfs_sync_req_oldest)) { - thread_call_t syncer = hfsmp->hfs_syncer; - - hfs_syncer_unlock(hfsmp); - - hfs_syncer_queue(syncer); - } else { - hfsmp->hfs_sync_incomplete = FALSE; - hfs_syncer_unlock(hfsmp); - hfs_syncer_wakeup(hfsmp); - } - - /* BE CAREFUL WHAT YOU ADD HERE: at this point hfs_unmount is free - to continue and therefore hfsmp might be invalid. */ - - KERNEL_DEBUG_CONSTANT(HFSDBG_SYNCER | DBG_FUNC_END, 0, 0, 0, 0, 0); -} - - -extern int IOBSDIsMediaEjectable( const char *cdev_name ); - -/* - * Call into the allocator code and perform a full scan of the bitmap file. - * - * This allows us to TRIM unallocated ranges if needed, and also to build up - * an in-memory summary table of the state of the allocated blocks. - */ -void hfs_scan_blocks (struct hfsmount *hfsmp) { - /* - * Take the allocation file lock. Journal transactions will block until - * we're done here. 
- */ - - int flags = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_EXCLUSIVE_LOCK); - - /* - * We serialize here with the HFS mount lock as we're mounting. - * - * The mount can only proceed once this thread has acquired the bitmap - * lock, since we absolutely do not want someone else racing in and - * getting the bitmap lock, doing a read/write of the bitmap file, - * then us getting the bitmap lock. - * - * To prevent this, the mount thread takes the HFS mount mutex, starts us - * up, then immediately msleeps on the scan_var variable in the mount - * point as a condition variable. This serialization is safe since - * if we race in and try to proceed while they're still holding the lock, - * we'll block trying to acquire the global lock. Since the mount thread - * acquires the HFS mutex before starting this function in a new thread, - * any lock acquisition on our part must be linearizably AFTER the mount thread's. - * - * Note that the HFS mount mutex is always taken last, and always for only - * a short time. In this case, we just take it long enough to mark the - * scan-in-flight bit. - */ - (void) hfs_lock_mount (hfsmp); - hfsmp->scan_var |= HFS_ALLOCATOR_SCAN_INFLIGHT; - wakeup((caddr_t) &hfsmp->scan_var); - hfs_unlock_mount (hfsmp); - - /* Initialize the summary table */ - if (hfs_init_summary (hfsmp)) { - printf("hfs: could not initialize summary table for %s\n", hfsmp->vcbVN); - } - - /* - * ScanUnmapBlocks assumes that the bitmap lock is held when you - * call the function. We don't care if there were any errors issuing unmaps. - * - * It will also attempt to build up the summary table for subsequent - * allocator use, as configured. 
- */ - (void) ScanUnmapBlocks(hfsmp); - - (void) hfs_lock_mount (hfsmp); - hfsmp->scan_var &= ~HFS_ALLOCATOR_SCAN_INFLIGHT; - hfsmp->scan_var |= HFS_ALLOCATOR_SCAN_COMPLETED; - wakeup((caddr_t) &hfsmp->scan_var); - hfs_unlock_mount (hfsmp); - - buf_invalidateblks(hfsmp->hfs_allocation_vp, 0, 0, 0); - - hfs_systemfile_unlock(hfsmp, flags); - -} - -static int hfs_root_unmounted_cleanly = 0; - -SYSCTL_DECL(_vfs_generic); -SYSCTL_INT(_vfs_generic, OID_AUTO, root_unmounted_cleanly, CTLFLAG_RD, &hfs_root_unmounted_cleanly, 0, "Root filesystem was unmounted cleanly"); - -/* - * Common code for mount and mountroot - */ -int -hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, - int journal_replay_only, vfs_context_t context) -{ - struct proc *p = vfs_context_proc(context); - int retval = E_NONE; - struct hfsmount *hfsmp = NULL; - struct buf *bp; - dev_t dev; - HFSMasterDirectoryBlock *mdbp = NULL; - int ronly; -#if QUOTA - int i; -#endif - int mntwrapper; - kauth_cred_t cred; - u_int64_t disksize; - daddr64_t log_blkcnt; - u_int32_t log_blksize; - u_int32_t phys_blksize; - u_int32_t minblksize; - u_int32_t iswritable; - daddr64_t mdb_offset; - int isvirtual = 0; - int isroot = 0; - u_int32_t device_features = 0; - int isssd; - - if (args == NULL) { - /* only hfs_mountroot passes us NULL as the 'args' argument */ - isroot = 1; - } - - ronly = vfs_isrdonly(mp); - dev = vnode_specrdev(devvp); - cred = p ? 
vfs_context_ucred(context) : NOCRED; - mntwrapper = 0; - - bp = NULL; - hfsmp = NULL; - mdbp = NULL; - minblksize = kHFSBlockSize; - - /* Advisory locking should be handled at the VFS layer */ - vfs_setlocklocal(mp); - - /* Get the logical block size (treated as physical block size everywhere) */ - if (VNOP_IOCTL(devvp, DKIOCGETBLOCKSIZE, (caddr_t)&log_blksize, 0, context)) { - if (HFS_MOUNT_DEBUG) { - printf("hfs_mountfs: DKIOCGETBLOCKSIZE failed\n"); - } - retval = ENXIO; - goto error_exit; - } - if (log_blksize == 0 || log_blksize > 1024*1024*1024) { - printf("hfs: logical block size 0x%x looks bad. Not mounting.\n", log_blksize); - retval = ENXIO; - goto error_exit; - } - - /* Get the physical block size. */ - retval = VNOP_IOCTL(devvp, DKIOCGETPHYSICALBLOCKSIZE, (caddr_t)&phys_blksize, 0, context); - if (retval) { - if ((retval != ENOTSUP) && (retval != ENOTTY)) { - if (HFS_MOUNT_DEBUG) { - printf("hfs_mountfs: DKIOCGETPHYSICALBLOCKSIZE failed\n"); - } - retval = ENXIO; - goto error_exit; - } - /* If device does not support this ioctl, assume that physical - * block size is same as logical block size - */ - phys_blksize = log_blksize; - } - if (phys_blksize == 0 || phys_blksize > MAXBSIZE) { - printf("hfs: physical block size 0x%x looks bad. Not mounting.\n", phys_blksize); - retval = ENXIO; - goto error_exit; - } - - /* Switch to 512 byte sectors (temporarily) */ - if (log_blksize > 512) { - u_int32_t size512 = 512; - - if (VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&size512, FWRITE, context)) { - if (HFS_MOUNT_DEBUG) { - printf("hfs_mountfs: DKIOCSETBLOCKSIZE failed \n"); - } - retval = ENXIO; - goto error_exit; - } - } - /* Get the number of 512 byte physical blocks. 
*/ - if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&log_blkcnt, 0, context)) { - /* resetting block size may fail if getting block count did */ - (void)VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&log_blksize, FWRITE, context); - if (HFS_MOUNT_DEBUG) { - printf("hfs_mountfs: DKIOCGETBLOCKCOUNT failed\n"); - } - retval = ENXIO; - goto error_exit; - } - /* Compute an accurate disk size (i.e. within 512 bytes) */ - disksize = (u_int64_t)log_blkcnt * (u_int64_t)512; - - /* - * On Tiger it is not necessary to switch the device - * block size to be 4k if there are more than 31-bits - * worth of blocks but to insure compatibility with - * pre-Tiger systems we have to do it. - * - * If the device size is not a multiple of 4K (8 * 512), then - * switching the logical block size isn't going to help because - * we will be unable to write the alternate volume header. - * In this case, just leave the logical block size unchanged. - */ - if (log_blkcnt > 0x000000007fffffff && (log_blkcnt & 7) == 0) { - minblksize = log_blksize = 4096; - if (phys_blksize < log_blksize) - phys_blksize = log_blksize; - } - - /* - * The cluster layer is not currently prepared to deal with a logical - * block size larger than the system's page size. (It can handle - * blocks per page, but not multiple pages per block.) So limit the - * logical block size to the page size. - */ - if (log_blksize > PAGE_SIZE) { - log_blksize = PAGE_SIZE; - } - - /* Now switch to our preferred physical block size. */ - if (log_blksize > 512) { - if (VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&log_blksize, FWRITE, context)) { - if (HFS_MOUNT_DEBUG) { - printf("hfs_mountfs: DKIOCSETBLOCKSIZE (2) failed\n"); - } - retval = ENXIO; - goto error_exit; - } - /* Get the count of physical blocks. 
*/ - if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&log_blkcnt, 0, context)) { - if (HFS_MOUNT_DEBUG) { - printf("hfs_mountfs: DKIOCGETBLOCKCOUNT (2) failed\n"); - } - retval = ENXIO; - goto error_exit; - } - } - /* - * At this point: - * minblksize is the minimum physical block size - * log_blksize has our preferred physical block size - * log_blkcnt has the total number of physical blocks - */ - - mdb_offset = (daddr64_t)HFS_PRI_SECTOR(log_blksize); - if ((retval = (int)buf_meta_bread(devvp, - HFS_PHYSBLK_ROUNDDOWN(mdb_offset, (phys_blksize/log_blksize)), - phys_blksize, cred, &bp))) { - if (HFS_MOUNT_DEBUG) { - printf("hfs_mountfs: buf_meta_bread failed with %d\n", retval); - } - goto error_exit; - } - MALLOC(mdbp, HFSMasterDirectoryBlock *, kMDBSize, M_TEMP, M_WAITOK); - if (mdbp == NULL) { - retval = ENOMEM; - if (HFS_MOUNT_DEBUG) { - printf("hfs_mountfs: MALLOC failed\n"); - } - goto error_exit; - } - bcopy((char *)buf_dataptr(bp) + HFS_PRI_OFFSET(phys_blksize), mdbp, kMDBSize); - buf_brelse(bp); - bp = NULL; - - MALLOC(hfsmp, struct hfsmount *, sizeof(struct hfsmount), M_HFSMNT, M_WAITOK); - if (hfsmp == NULL) { - if (HFS_MOUNT_DEBUG) { - printf("hfs_mountfs: MALLOC (2) failed\n"); - } - retval = ENOMEM; - goto error_exit; - } - bzero(hfsmp, sizeof(struct hfsmount)); - - hfs_chashinit_finish(hfsmp); - - /* Init the ID lookup hashtable */ - hfs_idhash_init (hfsmp); - - /* - * See if the disk supports unmap (trim). - * - * NOTE: vfs_init_io_attributes has not been called yet, so we can't use the io_flags field - * returned by vfs_ioattr. We need to call VNOP_IOCTL ourselves. - */ - if (VNOP_IOCTL(devvp, DKIOCGETFEATURES, (caddr_t)&device_features, 0, context) == 0) { - if (device_features & DK_FEATURE_UNMAP) { - hfsmp->hfs_flags |= HFS_UNMAP; - } - - if(device_features & DK_FEATURE_BARRIER) - hfsmp->hfs_flags |= HFS_FEATURE_BARRIER; - } - - /* - * See if the disk is a solid state device, too. We need this to decide what to do about - * hotfiles. 
- */ - if (VNOP_IOCTL(devvp, DKIOCISSOLIDSTATE, (caddr_t)&isssd, 0, context) == 0) { - if (isssd) { - hfsmp->hfs_flags |= HFS_SSD; - } - } - - /* See if the underlying device is Core Storage or not */ - dk_corestorage_info_t cs_info; - memset(&cs_info, 0, sizeof(dk_corestorage_info_t)); - if (VNOP_IOCTL(devvp, DKIOCCORESTORAGE, (caddr_t)&cs_info, 0, context) == 0) { - hfsmp->hfs_flags |= HFS_CS; - if (isroot && (cs_info.flags & DK_CORESTORAGE_PIN_YOUR_METADATA)) { - hfsmp->hfs_flags |= HFS_CS_METADATA_PIN; - } - if (isroot && (cs_info.flags & DK_CORESTORAGE_ENABLE_HOTFILES)) { - hfsmp->hfs_flags |= HFS_CS_HOTFILE_PIN; - hfsmp->hfs_cs_hotfile_size = cs_info.hotfile_size; - } - if ((cs_info.flags & DK_CORESTORAGE_PIN_YOUR_SWAPFILE)) { - hfsmp->hfs_flags |= HFS_CS_SWAPFILE_PIN; - - mp->mnt_ioflags |= MNT_IOFLAGS_SWAPPIN_SUPPORTED; - mp->mnt_max_swappin_available = cs_info.swapfile_pinning; - } - } - - /* - * Init the volume information structure - */ - - lck_mtx_init(&hfsmp->hfs_mutex, hfs_mutex_group, hfs_lock_attr); - lck_mtx_init(&hfsmp->hfc_mutex, hfs_mutex_group, hfs_lock_attr); - lck_rw_init(&hfsmp->hfs_global_lock, hfs_rwlock_group, hfs_lock_attr); - lck_spin_init(&hfsmp->vcbFreeExtLock, hfs_spinlock_group, hfs_lock_attr); - - vfs_setfsprivate(mp, hfsmp); - hfsmp->hfs_mp = mp; /* Make VFSTOHFS work */ - hfsmp->hfs_raw_dev = vnode_specrdev(devvp); - hfsmp->hfs_devvp = devvp; - vnode_ref(devvp); /* Hold a ref on the device, dropped when hfsmp is freed. 
*/ - hfsmp->hfs_logical_block_size = log_blksize; - hfsmp->hfs_logical_block_count = log_blkcnt; - hfsmp->hfs_logical_bytes = (uint64_t) log_blksize * (uint64_t) log_blkcnt; - hfsmp->hfs_physical_block_size = phys_blksize; - hfsmp->hfs_log_per_phys = (phys_blksize / log_blksize); - hfsmp->hfs_flags |= HFS_WRITEABLE_MEDIA; - if (ronly) - hfsmp->hfs_flags |= HFS_READ_ONLY; - if (((unsigned int)vfs_flags(mp)) & MNT_UNKNOWNPERMISSIONS) - hfsmp->hfs_flags |= HFS_UNKNOWN_PERMS; - -#if QUOTA - for (i = 0; i < MAXQUOTAS; i++) - dqfileinit(&hfsmp->hfs_qfiles[i]); -#endif - - if (args) { - hfsmp->hfs_uid = (args->hfs_uid == (uid_t)VNOVAL) ? UNKNOWNUID : args->hfs_uid; - if (hfsmp->hfs_uid == 0xfffffffd) hfsmp->hfs_uid = UNKNOWNUID; - hfsmp->hfs_gid = (args->hfs_gid == (gid_t)VNOVAL) ? UNKNOWNGID : args->hfs_gid; - if (hfsmp->hfs_gid == 0xfffffffd) hfsmp->hfs_gid = UNKNOWNGID; - vfs_setowner(mp, hfsmp->hfs_uid, hfsmp->hfs_gid); /* tell the VFS */ - if (args->hfs_mask != (mode_t)VNOVAL) { - hfsmp->hfs_dir_mask = args->hfs_mask & ALLPERMS; - if (args->flags & HFSFSMNT_NOXONFILES) { - hfsmp->hfs_file_mask = (args->hfs_mask & DEFFILEMODE); - } else { - hfsmp->hfs_file_mask = args->hfs_mask & ALLPERMS; - } - } else { - hfsmp->hfs_dir_mask = UNKNOWNPERMISSIONS & ALLPERMS; /* 0777: rwx---rwx */ - hfsmp->hfs_file_mask = UNKNOWNPERMISSIONS & DEFFILEMODE; /* 0666: no --x by default? */ - } - if ((args->flags != (int)VNOVAL) && (args->flags & HFSFSMNT_WRAPPER)) - mntwrapper = 1; - } else { - /* Even w/o explicit mount arguments, MNT_UNKNOWNPERMISSIONS requires setting up uid, gid, and mask: */ - if (((unsigned int)vfs_flags(mp)) & MNT_UNKNOWNPERMISSIONS) { - hfsmp->hfs_uid = UNKNOWNUID; - hfsmp->hfs_gid = UNKNOWNGID; - vfs_setowner(mp, hfsmp->hfs_uid, hfsmp->hfs_gid); /* tell the VFS */ - hfsmp->hfs_dir_mask = UNKNOWNPERMISSIONS & ALLPERMS; /* 0777: rwx---rwx */ - hfsmp->hfs_file_mask = UNKNOWNPERMISSIONS & DEFFILEMODE; /* 0666: no --x by default? 
*/ - } - } - - /* Find out if disk media is writable. */ - if (VNOP_IOCTL(devvp, DKIOCISWRITABLE, (caddr_t)&iswritable, 0, context) == 0) { - if (iswritable) - hfsmp->hfs_flags |= HFS_WRITEABLE_MEDIA; - else - hfsmp->hfs_flags &= ~HFS_WRITEABLE_MEDIA; - } - - // Reservations - rl_init(&hfsmp->hfs_reserved_ranges[0]); - rl_init(&hfsmp->hfs_reserved_ranges[1]); - - // record the current time at which we're mounting this volume - struct timeval tv; - microtime(&tv); - hfsmp->hfs_mount_time = tv.tv_sec; - - /* Mount a standard HFS disk */ - if ((SWAP_BE16(mdbp->drSigWord) == kHFSSigWord) && - (mntwrapper || (SWAP_BE16(mdbp->drEmbedSigWord) != kHFSPlusSigWord))) { -#if CONFIG_HFS_STD - /* On 10.6 and beyond, non read-only mounts for HFS standard vols get rejected */ - if (vfs_isrdwr(mp)) { - retval = EROFS; - goto error_exit; - } - - printf("hfs_mountfs: Mounting HFS Standard volumes was deprecated in Mac OS 10.7 \n"); - - /* Treat it as if it's read-only and not writeable */ - hfsmp->hfs_flags |= HFS_READ_ONLY; - hfsmp->hfs_flags &= ~HFS_WRITEABLE_MEDIA; - - /* If only journal replay is requested, exit immediately */ - if (journal_replay_only) { - retval = 0; - goto error_exit; - } - - if ((vfs_flags(mp) & MNT_ROOTFS)) { - retval = EINVAL; /* Cannot root from HFS standard disks */ - goto error_exit; - } - /* HFS disks can only use 512 byte physical blocks */ - if (log_blksize > kHFSBlockSize) { - log_blksize = kHFSBlockSize; - if (VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&log_blksize, FWRITE, context)) { - retval = ENXIO; - goto error_exit; - } - if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&log_blkcnt, 0, context)) { - retval = ENXIO; - goto error_exit; - } - hfsmp->hfs_logical_block_size = log_blksize; - hfsmp->hfs_logical_block_count = log_blkcnt; - hfsmp->hfs_logical_bytes = (uint64_t) log_blksize * (uint64_t) log_blkcnt; - hfsmp->hfs_physical_block_size = log_blksize; - hfsmp->hfs_log_per_phys = 1; - } - if (args) { - hfsmp->hfs_encoding = 
args->hfs_encoding; - HFSTOVCB(hfsmp)->volumeNameEncodingHint = args->hfs_encoding; - - /* establish the timezone */ - gTimeZone = args->hfs_timezone; - } - - retval = hfs_getconverter(hfsmp->hfs_encoding, &hfsmp->hfs_get_unicode, - &hfsmp->hfs_get_hfsname); - if (retval) - goto error_exit; - - retval = hfs_MountHFSVolume(hfsmp, mdbp, p); - if (retval) - (void) hfs_relconverter(hfsmp->hfs_encoding); -#else - /* On platforms where HFS Standard is not supported, deny the mount altogether */ - retval = EINVAL; - goto error_exit; -#endif - - } - else { /* Mount an HFS Plus disk */ - HFSPlusVolumeHeader *vhp; - off_t embeddedOffset; - int jnl_disable = 0; - - /* Get the embedded Volume Header */ - if (SWAP_BE16(mdbp->drEmbedSigWord) == kHFSPlusSigWord) { - embeddedOffset = SWAP_BE16(mdbp->drAlBlSt) * kHFSBlockSize; - embeddedOffset += (u_int64_t)SWAP_BE16(mdbp->drEmbedExtent.startBlock) * - (u_int64_t)SWAP_BE32(mdbp->drAlBlkSiz); - - /* - * Cooperative Fusion is not allowed on embedded HFS+ - * filesystems (HFS+ inside HFS standard wrapper) - */ - hfsmp->hfs_flags &= ~HFS_CS_METADATA_PIN; - - /* - * If the embedded volume doesn't start on a block - * boundary, then switch the device to a 512-byte - * block size so everything will line up on a block - * boundary. 
- */ - if ((embeddedOffset % log_blksize) != 0) { - printf("hfs_mountfs: embedded volume offset not" - " a multiple of physical block size (%d);" - " switching to 512\n", log_blksize); - log_blksize = 512; - if (VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, - (caddr_t)&log_blksize, FWRITE, context)) { - - if (HFS_MOUNT_DEBUG) { - printf("hfs_mountfs: DKIOCSETBLOCKSIZE (3) failed\n"); - } - retval = ENXIO; - goto error_exit; - } - if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, - (caddr_t)&log_blkcnt, 0, context)) { - if (HFS_MOUNT_DEBUG) { - printf("hfs_mountfs: DKIOCGETBLOCKCOUNT (3) failed\n"); - } - retval = ENXIO; - goto error_exit; - } - /* Note: relative block count adjustment */ - hfsmp->hfs_logical_block_count *= - hfsmp->hfs_logical_block_size / log_blksize; - - /* Update logical /physical block size */ - hfsmp->hfs_logical_block_size = log_blksize; - hfsmp->hfs_physical_block_size = log_blksize; - - phys_blksize = log_blksize; - hfsmp->hfs_log_per_phys = 1; - } - - disksize = (u_int64_t)SWAP_BE16(mdbp->drEmbedExtent.blockCount) * - (u_int64_t)SWAP_BE32(mdbp->drAlBlkSiz); - - hfsmp->hfs_logical_block_count = disksize / log_blksize; - - hfsmp->hfs_logical_bytes = (uint64_t) hfsmp->hfs_logical_block_count * (uint64_t) hfsmp->hfs_logical_block_size; - - mdb_offset = (daddr64_t)((embeddedOffset / log_blksize) + HFS_PRI_SECTOR(log_blksize)); - - if (bp) { - buf_markinvalid(bp); - buf_brelse(bp); - bp = NULL; - } - retval = (int)buf_meta_bread(devvp, HFS_PHYSBLK_ROUNDDOWN(mdb_offset, hfsmp->hfs_log_per_phys), - phys_blksize, cred, &bp); - if (retval) { - if (HFS_MOUNT_DEBUG) { - printf("hfs_mountfs: buf_meta_bread (2) failed with %d\n", retval); - } - goto error_exit; - } - bcopy((char *)buf_dataptr(bp) + HFS_PRI_OFFSET(phys_blksize), mdbp, 512); - buf_brelse(bp); - bp = NULL; - vhp = (HFSPlusVolumeHeader*) mdbp; - - } - else { /* pure HFS+ */ - embeddedOffset = 0; - vhp = (HFSPlusVolumeHeader*) mdbp; - } - - retval = hfs_ValidateHFSPlusVolumeHeader(hfsmp, vhp); - if (retval) 
- goto error_exit; - - /* - * If allocation block size is less than the physical block size, - * invalidate the buffer read in using native physical block size - * to ensure data consistency. - * - * HFS Plus reserves one allocation block for the Volume Header. - * If the physical size is larger, then when we read the volume header, - * we will also end up reading in the next allocation block(s). - * If those other allocation block(s) is/are modified, and then the volume - * header is modified, the write of the volume header's buffer will write - * out the old contents of the other allocation blocks. - * - * We assume that the physical block size is same as logical block size. - * The physical block size value is used to round down the offsets for - * reading and writing the primary and alternate volume headers. - * - * The same logic is also in hfs_MountHFSPlusVolume to ensure that - * hfs_mountfs, hfs_MountHFSPlusVolume and later are doing the I/Os - * using same block size. - */ - if (SWAP_BE32(vhp->blockSize) < hfsmp->hfs_physical_block_size) { - phys_blksize = hfsmp->hfs_logical_block_size; - hfsmp->hfs_physical_block_size = hfsmp->hfs_logical_block_size; - hfsmp->hfs_log_per_phys = 1; - // There should be one bp associated with devvp in buffer cache. - retval = buf_invalidateblks(devvp, 0, 0, 0); - if (retval) - goto error_exit; - } - - if (isroot) { - hfs_root_unmounted_cleanly = ((SWAP_BE32(vhp->attributes) & kHFSVolumeUnmountedMask) != 0); - } - - /* - * On inconsistent disks, do not allow read-write mount - * unless it is the boot volume being mounted. We also - * always want to replay the journal if the journal_replay_only - * flag is set because that will (most likely) get the - * disk into a consistent state before fsck_hfs starts - * looking at it. 
- */ - if ( !(vfs_flags(mp) & MNT_ROOTFS) - && (SWAP_BE32(vhp->attributes) & kHFSVolumeInconsistentMask) - && !journal_replay_only - && !(hfsmp->hfs_flags & HFS_READ_ONLY)) { - - if (HFS_MOUNT_DEBUG) { - printf("hfs_mountfs: failed to mount non-root inconsistent disk\n"); - } - retval = EINVAL; - goto error_exit; - } - - - // XXXdbg - // - hfsmp->jnl = NULL; - hfsmp->jvp = NULL; - if (args != NULL && (args->flags & HFSFSMNT_EXTENDED_ARGS) && - args->journal_disable) { - jnl_disable = 1; - } - - // - // We only initialize the journal here if the last person - // to mount this volume was journaling aware. Otherwise - // we delay journal initialization until later at the end - // of hfs_MountHFSPlusVolume() because the last person who - // mounted it could have messed things up behind our back - // (so we need to go find the .journal file, make sure it's - // the right size, re-sync up if it was moved, etc). - // - if ( (SWAP_BE32(vhp->lastMountedVersion) == kHFSJMountVersion) - && (SWAP_BE32(vhp->attributes) & kHFSVolumeJournaledMask) - && !jnl_disable) { - - // if we're able to init the journal, mark the mount - // point as journaled. - // - if ((retval = hfs_early_journal_init(hfsmp, vhp, args, embeddedOffset, mdb_offset, mdbp, cred)) == 0) { - vfs_setflags(mp, (u_int64_t)((unsigned int)MNT_JOURNALED)); - } else { - if (retval == EROFS) { - // EROFS is a special error code that means the volume has an external - // journal which we couldn't find. in that case we do not want to - // rewrite the volume header - we'll just refuse to mount the volume. - if (HFS_MOUNT_DEBUG) { - printf("hfs_mountfs: hfs_early_journal_init indicated external jnl \n"); - } - retval = EINVAL; - goto error_exit; - } - - // if the journal failed to open, then set the lastMountedVersion - // to be "FSK!" which fsck_hfs will see and force the fsck instead - // of just bailing out because the volume is journaled. 
- if (!ronly) { - if (HFS_MOUNT_DEBUG) { - printf("hfs_mountfs: hfs_early_journal_init failed, setting to FSK \n"); - } - - HFSPlusVolumeHeader *jvhp; - - hfsmp->hfs_flags |= HFS_NEED_JNL_RESET; - - if (mdb_offset == 0) { - mdb_offset = (daddr64_t)((embeddedOffset / log_blksize) + HFS_PRI_SECTOR(log_blksize)); - } - - bp = NULL; - retval = (int)buf_meta_bread(devvp, - HFS_PHYSBLK_ROUNDDOWN(mdb_offset, hfsmp->hfs_log_per_phys), - phys_blksize, cred, &bp); - if (retval == 0) { - jvhp = (HFSPlusVolumeHeader *)(buf_dataptr(bp) + HFS_PRI_OFFSET(phys_blksize)); - - if (SWAP_BE16(jvhp->signature) == kHFSPlusSigWord || SWAP_BE16(jvhp->signature) == kHFSXSigWord) { - printf ("hfs(1): Journal replay fail. Writing lastMountVersion as FSK!\n"); - jvhp->lastMountedVersion = SWAP_BE32(kFSKMountVersion); - buf_bwrite(bp); - } else { - buf_brelse(bp); - } - bp = NULL; - } else if (bp) { - buf_brelse(bp); - // clear this so the error exit path won't try to use it - bp = NULL; - } - } - - // if this isn't the root device just bail out. - // If it is the root device we just continue on - // in the hopes that fsck_hfs will be able to - // fix any damage that exists on the volume. - if ( !(vfs_flags(mp) & MNT_ROOTFS)) { - if (HFS_MOUNT_DEBUG) { - printf("hfs_mountfs: hfs_early_journal_init failed, erroring out \n"); - } - retval = EINVAL; - goto error_exit; - } - } - } - // XXXdbg - - /* Either the journal is replayed successfully, or there - * was nothing to replay, or no journal exists. In any case, - * return success. - */ - if (journal_replay_only) { - retval = 0; - goto error_exit; - } - - (void) hfs_getconverter(0, &hfsmp->hfs_get_unicode, &hfsmp->hfs_get_hfsname); - - retval = hfs_MountHFSPlusVolume(hfsmp, vhp, embeddedOffset, disksize, p, args, cred); - /* - * If the backend didn't like our physical blocksize - * then retry with physical blocksize of 512. 
- */ - if ((retval == ENXIO) && (log_blksize > 512) && (log_blksize != minblksize)) { - printf("hfs_mountfs: could not use physical block size " - "(%d) switching to 512\n", log_blksize); - log_blksize = 512; - if (VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&log_blksize, FWRITE, context)) { - if (HFS_MOUNT_DEBUG) { - printf("hfs_mountfs: DKIOCSETBLOCKSIZE (4) failed \n"); - } - retval = ENXIO; - goto error_exit; - } - if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&log_blkcnt, 0, context)) { - if (HFS_MOUNT_DEBUG) { - printf("hfs_mountfs: DKIOCGETBLOCKCOUNT (4) failed \n"); - } - retval = ENXIO; - goto error_exit; - } - devvp->v_specsize = log_blksize; - /* Note: relative block count adjustment (in case this is an embedded volume). */ - hfsmp->hfs_logical_block_count *= hfsmp->hfs_logical_block_size / log_blksize; - hfsmp->hfs_logical_block_size = log_blksize; - hfsmp->hfs_log_per_phys = hfsmp->hfs_physical_block_size / log_blksize; - - hfsmp->hfs_logical_bytes = (uint64_t) hfsmp->hfs_logical_block_count * (uint64_t) hfsmp->hfs_logical_block_size; - - if (hfsmp->jnl && hfsmp->jvp == devvp) { - // close and re-open this with the new block size - journal_close(hfsmp->jnl); - hfsmp->jnl = NULL; - if (hfs_early_journal_init(hfsmp, vhp, args, embeddedOffset, mdb_offset, mdbp, cred) == 0) { - vfs_setflags(mp, (u_int64_t)((unsigned int)MNT_JOURNALED)); - } else { - // if the journal failed to open, then set the lastMountedVersion - // to be "FSK!" which fsck_hfs will see and force the fsck instead - // of just bailing out because the volume is journaled. - if (!ronly) { - if (HFS_MOUNT_DEBUG) { - printf("hfs_mountfs: hfs_early_journal_init (2) resetting.. 
\n"); - } - HFSPlusVolumeHeader *jvhp; - - hfsmp->hfs_flags |= HFS_NEED_JNL_RESET; - - if (mdb_offset == 0) { - mdb_offset = (daddr64_t)((embeddedOffset / log_blksize) + HFS_PRI_SECTOR(log_blksize)); - } - - bp = NULL; - retval = (int)buf_meta_bread(devvp, HFS_PHYSBLK_ROUNDDOWN(mdb_offset, hfsmp->hfs_log_per_phys), - phys_blksize, cred, &bp); - if (retval == 0) { - jvhp = (HFSPlusVolumeHeader *)(buf_dataptr(bp) + HFS_PRI_OFFSET(phys_blksize)); - - if (SWAP_BE16(jvhp->signature) == kHFSPlusSigWord || SWAP_BE16(jvhp->signature) == kHFSXSigWord) { - printf ("hfs(2): Journal replay fail. Writing lastMountVersion as FSK!\n"); - jvhp->lastMountedVersion = SWAP_BE32(kFSKMountVersion); - buf_bwrite(bp); - } else { - buf_brelse(bp); - } - bp = NULL; - } else if (bp) { - buf_brelse(bp); - // clear this so the error exit path won't try to use it - bp = NULL; - } - } - - // if this isn't the root device just bail out. - // If it is the root device we just continue on - // in the hopes that fsck_hfs will be able to - // fix any damage that exists on the volume. - if ( !(vfs_flags(mp) & MNT_ROOTFS)) { - if (HFS_MOUNT_DEBUG) { - printf("hfs_mountfs: hfs_early_journal_init (2) failed \n"); - } - retval = EINVAL; - goto error_exit; - } - } - } - - /* Try again with a smaller block size... */ - retval = hfs_MountHFSPlusVolume(hfsmp, vhp, embeddedOffset, disksize, p, args, cred); - if (retval && HFS_MOUNT_DEBUG) { - printf("hfs_MountHFSPlusVolume (late) returned %d\n",retval); - } - } - if (retval) - (void) hfs_relconverter(0); - } - - // save off a snapshot of the mtime from the previous mount - // (for matador). 
- hfsmp->hfs_last_mounted_mtime = hfsmp->hfs_mtime; - - if ( retval ) { - if (HFS_MOUNT_DEBUG) { - printf("hfs_mountfs: encountered failure %d \n", retval); - } - goto error_exit; - } - - mp->mnt_vfsstat.f_fsid.val[0] = dev; - mp->mnt_vfsstat.f_fsid.val[1] = vfs_typenum(mp); - vfs_setmaxsymlen(mp, 0); - - mp->mnt_vtable->vfc_vfsflags |= VFC_VFSNATIVEXATTR; -#if NAMEDSTREAMS - mp->mnt_kern_flag |= MNTK_NAMED_STREAMS; -#endif - if ((hfsmp->hfs_flags & HFS_STANDARD) == 0 ) { - /* Tell VFS that we support directory hard links. */ - mp->mnt_vtable->vfc_vfsflags |= VFC_VFSDIRLINKS; - } -#if CONFIG_HFS_STD - else { - /* HFS standard doesn't support extended readdir! */ - mount_set_noreaddirext (mp); - } -#endif - - if (args) { - /* - * Set the free space warning levels for a non-root volume: - * - * Set the "danger" limit to 1% of the volume size or 100MB, whichever - * is less. Set the "warning" limit to 2% of the volume size or 150MB, - * whichever is less. And last, set the "desired" freespace level to - * to 3% of the volume size or 200MB, whichever is less. - */ - hfsmp->hfs_freespace_notify_dangerlimit = - MIN(HFS_VERYLOWDISKTRIGGERLEVEL / HFSTOVCB(hfsmp)->blockSize, - (HFSTOVCB(hfsmp)->totalBlocks / 100) * HFS_VERYLOWDISKTRIGGERFRACTION); - hfsmp->hfs_freespace_notify_warninglimit = - MIN(HFS_LOWDISKTRIGGERLEVEL / HFSTOVCB(hfsmp)->blockSize, - (HFSTOVCB(hfsmp)->totalBlocks / 100) * HFS_LOWDISKTRIGGERFRACTION); - hfsmp->hfs_freespace_notify_desiredlevel = - MIN(HFS_LOWDISKSHUTOFFLEVEL / HFSTOVCB(hfsmp)->blockSize, - (HFSTOVCB(hfsmp)->totalBlocks / 100) * HFS_LOWDISKSHUTOFFFRACTION); - } else { - /* - * Set the free space warning levels for the root volume: - * - * Set the "danger" limit to 5% of the volume size or 512MB, whichever - * is less. Set the "warning" limit to 10% of the volume size or 1GB, - * whichever is less. And last, set the "desired" freespace level to - * to 11% of the volume size or 1.25GB, whichever is less. 
- */ - hfsmp->hfs_freespace_notify_dangerlimit = - MIN(HFS_ROOTVERYLOWDISKTRIGGERLEVEL / HFSTOVCB(hfsmp)->blockSize, - (HFSTOVCB(hfsmp)->totalBlocks / 100) * HFS_ROOTVERYLOWDISKTRIGGERFRACTION); - hfsmp->hfs_freespace_notify_warninglimit = - MIN(HFS_ROOTLOWDISKTRIGGERLEVEL / HFSTOVCB(hfsmp)->blockSize, - (HFSTOVCB(hfsmp)->totalBlocks / 100) * HFS_ROOTLOWDISKTRIGGERFRACTION); - hfsmp->hfs_freespace_notify_desiredlevel = - MIN(HFS_ROOTLOWDISKSHUTOFFLEVEL / HFSTOVCB(hfsmp)->blockSize, - (HFSTOVCB(hfsmp)->totalBlocks / 100) * HFS_ROOTLOWDISKSHUTOFFFRACTION); - }; - - /* Check if the file system exists on virtual device, like disk image */ - if (VNOP_IOCTL(devvp, DKIOCISVIRTUAL, (caddr_t)&isvirtual, 0, context) == 0) { - if (isvirtual) { - hfsmp->hfs_flags |= HFS_VIRTUAL_DEVICE; - } - } - - /* do not allow ejectability checks on the root device */ - if (isroot == 0) { - if ((hfsmp->hfs_flags & HFS_VIRTUAL_DEVICE) == 0 && - IOBSDIsMediaEjectable(mp->mnt_vfsstat.f_mntfromname)) { - hfsmp->hfs_syncer = thread_call_allocate(hfs_syncer, hfsmp); - if (hfsmp->hfs_syncer == NULL) { - printf("hfs: failed to allocate syncer thread callback for %s (%s)\n", - mp->mnt_vfsstat.f_mntfromname, mp->mnt_vfsstat.f_mntonname); - } - } - } - - printf("hfs: mounted %s on device %s\n", (hfsmp->vcbVN ? (const char*) hfsmp->vcbVN : "unknown"), - (devvp->v_name ? devvp->v_name : (isroot ? "root_device": "unknown device"))); - - /* - * Start looking for free space to drop below this level and generate a - * warning immediately if needed: - */ - hfsmp->hfs_notification_conditions = 0; - hfs_generate_volume_notifications(hfsmp); - - if (ronly == 0) { - (void) hfs_flushvolumeheader(hfsmp, HFS_FVH_WAIT); - } - FREE(mdbp, M_TEMP); - return (0); - -error_exit: - if (bp) - buf_brelse(bp); - if (mdbp) - FREE(mdbp, M_TEMP); - - if (hfsmp && hfsmp->jvp && hfsmp->jvp != hfsmp->hfs_devvp) { - vnode_clearmountedon(hfsmp->jvp); - (void)VNOP_CLOSE(hfsmp->jvp, ronly ? 
FREAD : FREAD|FWRITE, vfs_context_kernel()); - hfsmp->jvp = NULL; - } - if (hfsmp) { - if (hfsmp->hfs_devvp) { - vnode_rele(hfsmp->hfs_devvp); - } - hfs_locks_destroy(hfsmp); - hfs_delete_chash(hfsmp); - hfs_idhash_destroy (hfsmp); - - FREE(hfsmp, M_HFSMNT); - vfs_setfsprivate(mp, NULL); - } - return (retval); -} - - -/* - * Make a filesystem operational. - * Nothing to do at the moment. - */ -/* ARGSUSED */ -static int -hfs_start(__unused struct mount *mp, __unused int flags, __unused vfs_context_t context) -{ - return (0); -} - - -/* - * unmount system call - */ -int -hfs_unmount(struct mount *mp, int mntflags, vfs_context_t context) -{ - struct proc *p = vfs_context_proc(context); - struct hfsmount *hfsmp = VFSTOHFS(mp); - int retval = E_NONE; - int flags; - int force; - int started_tr = 0; - - flags = 0; - force = 0; - if (mntflags & MNT_FORCE) { - flags |= FORCECLOSE; - force = 1; - } - - printf("hfs: unmount initiated on %s on device %s\n", - (hfsmp->vcbVN ? (const char*) hfsmp->vcbVN : "unknown"), - (hfsmp->hfs_devvp ? ((hfsmp->hfs_devvp->v_name ? 
hfsmp->hfs_devvp->v_name : "unknown device")) : "unknown device")); - - if ((retval = hfs_flushfiles(mp, flags, p)) && !force) - return (retval); - - if (hfsmp->hfs_flags & HFS_METADATA_ZONE) - (void) hfs_recording_suspend(hfsmp); - - hfs_syncer_free(hfsmp); - - if (hfsmp->hfs_flags & HFS_SUMMARY_TABLE) { - if (hfsmp->hfs_summary_table) { - int err = 0; - /* - * Take the bitmap lock to serialize against a concurrent bitmap scan still in progress - */ - if (hfsmp->hfs_allocation_vp) { - err = hfs_lock (VTOC(hfsmp->hfs_allocation_vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); - } - FREE (hfsmp->hfs_summary_table, M_TEMP); - hfsmp->hfs_summary_table = NULL; - hfsmp->hfs_flags &= ~HFS_SUMMARY_TABLE; - - if (err == 0 && hfsmp->hfs_allocation_vp){ - hfs_unlock (VTOC(hfsmp->hfs_allocation_vp)); - } - - } - } - - /* - * Flush out the b-trees, volume bitmap and Volume Header - */ - if ((hfsmp->hfs_flags & HFS_READ_ONLY) == 0) { - retval = hfs_start_transaction(hfsmp); - if (retval == 0) { - started_tr = 1; - } else if (!force) { - goto err_exit; - } - - if (hfsmp->hfs_startup_vp) { - (void) hfs_lock(VTOC(hfsmp->hfs_startup_vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); - retval = hfs_fsync(hfsmp->hfs_startup_vp, MNT_WAIT, 0, p); - hfs_unlock(VTOC(hfsmp->hfs_startup_vp)); - if (retval && !force) - goto err_exit; - } - - if (hfsmp->hfs_attribute_vp) { - (void) hfs_lock(VTOC(hfsmp->hfs_attribute_vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); - retval = hfs_fsync(hfsmp->hfs_attribute_vp, MNT_WAIT, 0, p); - hfs_unlock(VTOC(hfsmp->hfs_attribute_vp)); - if (retval && !force) - goto err_exit; - } - - (void) hfs_lock(VTOC(hfsmp->hfs_catalog_vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); - retval = hfs_fsync(hfsmp->hfs_catalog_vp, MNT_WAIT, 0, p); - hfs_unlock(VTOC(hfsmp->hfs_catalog_vp)); - if (retval && !force) - goto err_exit; - - (void) hfs_lock(VTOC(hfsmp->hfs_extents_vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); - retval = hfs_fsync(hfsmp->hfs_extents_vp, MNT_WAIT, 0, p); - 
hfs_unlock(VTOC(hfsmp->hfs_extents_vp)); - if (retval && !force) - goto err_exit; - - if (hfsmp->hfs_allocation_vp) { - (void) hfs_lock(VTOC(hfsmp->hfs_allocation_vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); - retval = hfs_fsync(hfsmp->hfs_allocation_vp, MNT_WAIT, 0, p); - hfs_unlock(VTOC(hfsmp->hfs_allocation_vp)); - if (retval && !force) - goto err_exit; - } - - if (hfsmp->hfc_filevp && vnode_issystem(hfsmp->hfc_filevp)) { - retval = hfs_fsync(hfsmp->hfc_filevp, MNT_WAIT, 0, p); - if (retval && !force) - goto err_exit; - } - - /* If runtime corruption was detected, indicate that the volume - * was not unmounted cleanly. - */ - if (hfsmp->vcbAtrb & kHFSVolumeInconsistentMask) { - HFSTOVCB(hfsmp)->vcbAtrb &= ~kHFSVolumeUnmountedMask; - } else { - HFSTOVCB(hfsmp)->vcbAtrb |= kHFSVolumeUnmountedMask; - } - - if (hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) { - int i; - u_int32_t min_start = hfsmp->totalBlocks; - - // set the nextAllocation pointer to the smallest free block number - // we've seen so on the next mount we won't rescan unnecessarily - lck_spin_lock(&hfsmp->vcbFreeExtLock); - for(i=0; i < (int)hfsmp->vcbFreeExtCnt; i++) { - if (hfsmp->vcbFreeExt[i].startBlock < min_start) { - min_start = hfsmp->vcbFreeExt[i].startBlock; - } - } - lck_spin_unlock(&hfsmp->vcbFreeExtLock); - if (min_start < hfsmp->nextAllocation) { - hfsmp->nextAllocation = min_start; - } - } - - retval = hfs_flushvolumeheader(hfsmp, HFS_FVH_WAIT); - if (retval) { - HFSTOVCB(hfsmp)->vcbAtrb &= ~kHFSVolumeUnmountedMask; - if (!force) - goto err_exit; /* could not flush everything */ - } - - if (started_tr) { - hfs_end_transaction(hfsmp); - started_tr = 0; - } - } - - if (hfsmp->jnl) { - hfs_flush(hfsmp, HFS_FLUSH_FULL); - } - - /* - * Invalidate our caches and release metadata vnodes - */ - (void) hfsUnmount(hfsmp, p); - -#if CONFIG_HFS_STD - if (HFSTOVCB(hfsmp)->vcbSigWord == kHFSSigWord) { - (void) hfs_relconverter(hfsmp->hfs_encoding); - } -#endif - - // XXXdbg - if (hfsmp->jnl) { - 
journal_close(hfsmp->jnl); - hfsmp->jnl = NULL; - } - - VNOP_FSYNC(hfsmp->hfs_devvp, MNT_WAIT, context); - - if (hfsmp->jvp && hfsmp->jvp != hfsmp->hfs_devvp) { - vnode_clearmountedon(hfsmp->jvp); - retval = VNOP_CLOSE(hfsmp->jvp, - hfsmp->hfs_flags & HFS_READ_ONLY ? FREAD : FREAD|FWRITE, - vfs_context_kernel()); - vnode_put(hfsmp->jvp); - hfsmp->jvp = NULL; - } - // XXXdbg - - /* - * Last chance to dump unreferenced system files. - */ - (void) vflush(mp, NULLVP, FORCECLOSE); - -#if HFS_SPARSE_DEV - /* Drop our reference on the backing fs (if any). */ - if ((hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) && hfsmp->hfs_backingfs_rootvp) { - struct vnode * tmpvp; - - hfsmp->hfs_flags &= ~HFS_HAS_SPARSE_DEVICE; - tmpvp = hfsmp->hfs_backingfs_rootvp; - hfsmp->hfs_backingfs_rootvp = NULLVP; - vnode_rele(tmpvp); - } -#endif /* HFS_SPARSE_DEV */ - - vnode_rele(hfsmp->hfs_devvp); - - hfs_locks_destroy(hfsmp); - hfs_delete_chash(hfsmp); - hfs_idhash_destroy(hfsmp); - - assert(TAILQ_EMPTY(&hfsmp->hfs_reserved_ranges[HFS_TENTATIVE_BLOCKS]) - && TAILQ_EMPTY(&hfsmp->hfs_reserved_ranges[HFS_LOCKED_BLOCKS])); - assert(!hfsmp->lockedBlocks); - - FREE(hfsmp, M_HFSMNT); - - return (0); - - err_exit: - if (started_tr) { - hfs_end_transaction(hfsmp); - } - return retval; -} - - -/* - * Return the root of a filesystem. 
- */ -static int -hfs_vfs_root(struct mount *mp, struct vnode **vpp, __unused vfs_context_t context) -{ - return hfs_vget(VFSTOHFS(mp), (cnid_t)kHFSRootFolderID, vpp, 1, 0); -} - - -/* - * Do operations associated with quotas - */ -#if !QUOTA -static int -hfs_quotactl(__unused struct mount *mp, __unused int cmds, __unused uid_t uid, __unused caddr_t datap, __unused vfs_context_t context) -{ - return (ENOTSUP); -} -#else -static int -hfs_quotactl(struct mount *mp, int cmds, uid_t uid, caddr_t datap, vfs_context_t context) -{ - struct proc *p = vfs_context_proc(context); - int cmd, type, error; - - if (uid == ~0U) - uid = kauth_cred_getuid(vfs_context_ucred(context)); - cmd = cmds >> SUBCMDSHIFT; - - switch (cmd) { - case Q_SYNC: - case Q_QUOTASTAT: - break; - case Q_GETQUOTA: - if (uid == kauth_cred_getuid(vfs_context_ucred(context))) - break; - /* fall through */ - default: - if ( (error = vfs_context_suser(context)) ) - return (error); - } - - type = cmds & SUBCMDMASK; - if ((u_int)type >= MAXQUOTAS) - return (EINVAL); - if (vfs_busy(mp, LK_NOWAIT)) - return (0); - - switch (cmd) { - - case Q_QUOTAON: - error = hfs_quotaon(p, mp, type, datap); - break; - - case Q_QUOTAOFF: - error = hfs_quotaoff(p, mp, type); - break; - - case Q_SETQUOTA: - error = hfs_setquota(mp, uid, type, datap); - break; - - case Q_SETUSE: - error = hfs_setuse(mp, uid, type, datap); - break; - - case Q_GETQUOTA: - error = hfs_getquota(mp, uid, type, datap); - break; - - case Q_SYNC: - error = hfs_qsync(mp); - break; - - case Q_QUOTASTAT: - error = hfs_quotastat(mp, type, datap); - break; - - default: - error = EINVAL; - break; - } - vfs_unbusy(mp); - - return (error); -} -#endif /* QUOTA */ - -/* Subtype is composite of bits */ -#define HFS_SUBTYPE_JOURNALED 0x01 -#define HFS_SUBTYPE_CASESENSITIVE 0x02 -/* bits 2 - 6 reserved */ -#define HFS_SUBTYPE_STANDARDHFS 0x80 - -/* - * Get file system statistics. 
- */ -int -hfs_statfs(struct mount *mp, register struct vfsstatfs *sbp, __unused vfs_context_t context) -{ - ExtendedVCB *vcb = VFSTOVCB(mp); - struct hfsmount *hfsmp = VFSTOHFS(mp); - u_int32_t freeCNIDs; - u_int16_t subtype = 0; - - freeCNIDs = (u_int32_t)0xFFFFFFFF - (u_int32_t)vcb->vcbNxtCNID; - - sbp->f_bsize = (u_int32_t)vcb->blockSize; - sbp->f_iosize = (size_t)cluster_max_io_size(mp, 0); - sbp->f_blocks = (u_int64_t)((u_int32_t)vcb->totalBlocks); - sbp->f_bfree = (u_int64_t)((u_int32_t )hfs_freeblks(hfsmp, 0)); - sbp->f_bavail = (u_int64_t)((u_int32_t )hfs_freeblks(hfsmp, 1)); - sbp->f_files = (u_int64_t)((u_int32_t )(vcb->totalBlocks - 2)); /* max files is constrained by total blocks */ - sbp->f_ffree = (u_int64_t)((u_int32_t )(MIN(freeCNIDs, sbp->f_bavail))); - - /* - * Subtypes (flavors) for HFS - * 0: Mac OS Extended - * 1: Mac OS Extended (Journaled) - * 2: Mac OS Extended (Case Sensitive) - * 3: Mac OS Extended (Case Sensitive, Journaled) - * 4 - 127: Reserved - * 128: Mac OS Standard - * - */ - if ((hfsmp->hfs_flags & HFS_STANDARD) == 0) { - /* HFS+ & variants */ - if (hfsmp->jnl) { - subtype |= HFS_SUBTYPE_JOURNALED; - } - if (hfsmp->hfs_flags & HFS_CASE_SENSITIVE) { - subtype |= HFS_SUBTYPE_CASESENSITIVE; - } - } -#if CONFIG_HFS_STD - else { - /* HFS standard */ - subtype = HFS_SUBTYPE_STANDARDHFS; - } -#endif - sbp->f_fssubtype = subtype; - - return (0); -} - - -// -// XXXdbg -- this is a callback to be used by the journal to -// get meta data blocks flushed out to disk. -// -// XXXdbg -- be smarter and don't flush *every* block on each -// call. try to only flush some so we don't wind up -// being too synchronous. 
-// -__private_extern__ -void -hfs_sync_metadata(void *arg) -{ - struct mount *mp = (struct mount *)arg; - struct hfsmount *hfsmp; - ExtendedVCB *vcb; - buf_t bp; - int retval; - daddr64_t priIDSector; - hfsmp = VFSTOHFS(mp); - vcb = HFSTOVCB(hfsmp); - - // now make sure the super block is flushed - priIDSector = (daddr64_t)((vcb->hfsPlusIOPosOffset / hfsmp->hfs_logical_block_size) + - HFS_PRI_SECTOR(hfsmp->hfs_logical_block_size)); - - retval = (int)buf_meta_bread(hfsmp->hfs_devvp, - HFS_PHYSBLK_ROUNDDOWN(priIDSector, hfsmp->hfs_log_per_phys), - hfsmp->hfs_physical_block_size, NOCRED, &bp); - if ((retval != 0 ) && (retval != ENXIO)) { - printf("hfs_sync_metadata: can't read volume header at %d! (retval 0x%x)\n", - (int)priIDSector, retval); - } - - if (retval == 0 && ((buf_flags(bp) & (B_DELWRI | B_LOCKED)) == B_DELWRI)) { - buf_bwrite(bp); - } else if (bp) { - buf_brelse(bp); - } - - /* Note that these I/Os bypass the journal (no calls to journal_start_modify_block) */ - - // the alternate super block... - // XXXdbg - we probably don't need to do this each and every time. - // hfs_btreeio.c:FlushAlternate() should flag when it was - // written... - if (hfsmp->hfs_partition_avh_sector) { - retval = (int)buf_meta_bread(hfsmp->hfs_devvp, - HFS_PHYSBLK_ROUNDDOWN(hfsmp->hfs_partition_avh_sector, hfsmp->hfs_log_per_phys), - hfsmp->hfs_physical_block_size, NOCRED, &bp); - if (retval == 0 && ((buf_flags(bp) & (B_DELWRI | B_LOCKED)) == B_DELWRI)) { - /* - * note this I/O can fail if the partition shrank behind our backs! - * So failure should be OK here. - */ - buf_bwrite(bp); - } else if (bp) { - buf_brelse(bp); - } - } - - /* Is the FS's idea of the AVH different than the partition ? 
*/ - if ((hfsmp->hfs_fs_avh_sector) && (hfsmp->hfs_partition_avh_sector != hfsmp->hfs_fs_avh_sector)) { - retval = (int)buf_meta_bread(hfsmp->hfs_devvp, - HFS_PHYSBLK_ROUNDDOWN(hfsmp->hfs_fs_avh_sector, hfsmp->hfs_log_per_phys), - hfsmp->hfs_physical_block_size, NOCRED, &bp); - if (retval == 0 && ((buf_flags(bp) & (B_DELWRI | B_LOCKED)) == B_DELWRI)) { - buf_bwrite(bp); - } else if (bp) { - buf_brelse(bp); - } - } - -} - - -struct hfs_sync_cargs { - kauth_cred_t cred; - struct proc *p; - int waitfor; - int error; - int atime_only_syncs; - time_t sync_start_time; -}; - - -static int -hfs_sync_callback(struct vnode *vp, void *cargs) -{ - struct cnode *cp = VTOC(vp); - struct hfs_sync_cargs *args; - int error; - - args = (struct hfs_sync_cargs *)cargs; - - if (hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT) != 0) { - return (VNODE_RETURNED); - } - - hfs_dirty_t dirty_state = hfs_is_dirty(cp); - - bool sync = dirty_state == HFS_DIRTY || vnode_hasdirtyblks(vp); - - if (!sync && dirty_state == HFS_DIRTY_ATIME - && args->atime_only_syncs < 256) { - // We only update if the atime changed more than 60s ago - if (args->sync_start_time - cp->c_attr.ca_atime > 60) { - sync = true; - ++args->atime_only_syncs; - } - } - - if (sync) { - error = hfs_fsync(vp, args->waitfor, 0, args->p); - - if (error) - args->error = error; - } else if (cp->c_touch_acctime) - hfs_touchtimes(VTOHFS(vp), cp); - - hfs_unlock(cp); - return (VNODE_RETURNED); -} - - - -/* - * Go through the disk queues to initiate sandbagged IO; - * go through the inodes to write those that have been modified; - * initiate the writing of the super block if it has been modified. - * - * Note: we are always called with the filesystem marked `MPBUSY'. 
- */ -int -hfs_sync(struct mount *mp, int waitfor, vfs_context_t context) -{ - struct proc *p = vfs_context_proc(context); - struct cnode *cp; - struct hfsmount *hfsmp; - ExtendedVCB *vcb; - struct vnode *meta_vp[4]; - int i; - int error, allerror = 0; - struct hfs_sync_cargs args; - - hfsmp = VFSTOHFS(mp); - - // Back off if hfs_changefs or a freeze is underway - hfs_lock_mount(hfsmp); - if ((hfsmp->hfs_flags & HFS_IN_CHANGEFS) - || hfsmp->hfs_freeze_state != HFS_THAWED) { - hfs_unlock_mount(hfsmp); - return 0; - } - - if (hfsmp->hfs_flags & HFS_READ_ONLY) { - hfs_unlock_mount(hfsmp); - return (EROFS); - } - - ++hfsmp->hfs_syncers; - hfs_unlock_mount(hfsmp); - - args.cred = kauth_cred_get(); - args.waitfor = waitfor; - args.p = p; - args.error = 0; - args.atime_only_syncs = 0; - - struct timeval tv; - microtime(&tv); - - args.sync_start_time = tv.tv_sec; - - /* - * hfs_sync_callback will be called for each vnode - * hung off of this mount point... the vnode will be - * properly referenced and unreferenced around the callback - */ - vnode_iterate(mp, 0, hfs_sync_callback, (void *)&args); - - if (args.error) - allerror = args.error; - - vcb = HFSTOVCB(hfsmp); - - meta_vp[0] = vcb->extentsRefNum; - meta_vp[1] = vcb->catalogRefNum; - meta_vp[2] = vcb->allocationsRefNum; /* This is NULL for standard HFS */ - meta_vp[3] = hfsmp->hfs_attribute_vp; /* Optional file */ - - /* Now sync our three metadata files */ - for (i = 0; i < 4; ++i) { - struct vnode *btvp; - - btvp = meta_vp[i];; - if ((btvp==0) || (vnode_mount(btvp) != mp)) - continue; - - /* XXX use hfs_systemfile_lock instead ? 
*/ - (void) hfs_lock(VTOC(btvp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); - cp = VTOC(btvp); - - if (!hfs_is_dirty(cp) && !vnode_hasdirtyblks(btvp)) { - hfs_unlock(VTOC(btvp)); - continue; - } - error = vnode_get(btvp); - if (error) { - hfs_unlock(VTOC(btvp)); - continue; - } - if ((error = hfs_fsync(btvp, waitfor, 0, p))) - allerror = error; - - hfs_unlock(cp); - vnode_put(btvp); - }; - - -#if CONFIG_HFS_STD - /* - * Force stale file system control information to be flushed. - */ - if (vcb->vcbSigWord == kHFSSigWord) { - if ((error = VNOP_FSYNC(hfsmp->hfs_devvp, waitfor, context))) { - allerror = error; - } - } -#endif - -#if QUOTA - hfs_qsync(mp); -#endif /* QUOTA */ - - hfs_hotfilesync(hfsmp, vfs_context_kernel()); - - /* - * Write back modified superblock. - */ - if (IsVCBDirty(vcb)) { - error = hfs_flushvolumeheader(hfsmp, waitfor == MNT_WAIT ? HFS_FVH_WAIT : 0); - if (error) - allerror = error; - } - - if (hfsmp->jnl) { - hfs_flush(hfsmp, HFS_FLUSH_JOURNAL); - } - - hfs_lock_mount(hfsmp); - boolean_t wake = (!--hfsmp->hfs_syncers - && hfsmp->hfs_freeze_state == HFS_WANT_TO_FREEZE); - hfs_unlock_mount(hfsmp); - if (wake) - wakeup(&hfsmp->hfs_freeze_state); - - return (allerror); -} - - -/* - * File handle to vnode - * - * Have to be really careful about stale file handles: - * - check that the cnode id is valid - * - call hfs_vget() to get the locked cnode - * - check for an unallocated cnode (i_mode == 0) - * - check that the given client host has export rights and return - * those rights via. 
exflagsp and credanonp - */ -static int -hfs_fhtovp(struct mount *mp, int fhlen, unsigned char *fhp, struct vnode **vpp, __unused vfs_context_t context) -{ - struct hfsfid *hfsfhp; - struct vnode *nvp; - int result; - - *vpp = NULL; - hfsfhp = (struct hfsfid *)fhp; - - if (fhlen < (int)sizeof(struct hfsfid)) - return (EINVAL); - - result = hfs_vget(VFSTOHFS(mp), ntohl(hfsfhp->hfsfid_cnid), &nvp, 0, 0); - if (result) { - if (result == ENOENT) - result = ESTALE; - return result; - } - - /* - * We used to use the create time as the gen id of the file handle, - * but it is not static enough because it can change at any point - * via system calls. We still don't have another volume ID or other - * unique identifier to use for a generation ID across reboots that - * persists until the file is removed. Using only the CNID exposes - * us to the potential wrap-around case, but as of 2/2008, it would take - * over 2 months to wrap around if the machine did nothing but allocate - * CNIDs. Using some kind of wrap counter would only be effective if - * each file had the wrap counter associated with it. For now, - * we use only the CNID to identify the file as it's good enough. - */ - - *vpp = nvp; - - hfs_unlock(VTOC(nvp)); - return (0); -} - - -/* - * Vnode pointer to File handle - */ -/* ARGSUSED */ -static int -hfs_vptofh(struct vnode *vp, int *fhlenp, unsigned char *fhp, __unused vfs_context_t context) -{ - struct cnode *cp; - struct hfsfid *hfsfhp; - - if (ISHFS(VTOVCB(vp))) - return (ENOTSUP); /* hfs standard is not exportable */ - - if (*fhlenp < (int)sizeof(struct hfsfid)) - return (EOVERFLOW); - - cp = VTOC(vp); - hfsfhp = (struct hfsfid *)fhp; - /* only the CNID is used to identify the file now */ - hfsfhp->hfsfid_cnid = htonl(cp->c_fileid); - hfsfhp->hfsfid_gen = htonl(cp->c_fileid); - *fhlenp = sizeof(struct hfsfid); - - return (0); -} - - -/* - * Initialize HFS filesystems, done only once per boot. - * - * HFS is not a kext-based file system. 
This makes it difficult to find - * out when the last HFS file system was unmounted and call hfs_uninit() - * to deallocate data structures allocated in hfs_init(). Therefore we - * never deallocate memory allocated by lock attribute and group initializations - * in this function. - */ -static int -hfs_init(__unused struct vfsconf *vfsp) -{ - static int done = 0; - - if (done) - return (0); - done = 1; - hfs_chashinit(); - hfs_converterinit(); - - BTReserveSetup(); - - hfs_lock_attr = lck_attr_alloc_init(); - hfs_group_attr = lck_grp_attr_alloc_init(); - hfs_mutex_group = lck_grp_alloc_init("hfs-mutex", hfs_group_attr); - hfs_rwlock_group = lck_grp_alloc_init("hfs-rwlock", hfs_group_attr); - hfs_spinlock_group = lck_grp_alloc_init("hfs-spinlock", hfs_group_attr); - -#if HFS_COMPRESSION - decmpfs_init(); -#endif - - return (0); -} - - -/* - * Destroy all locks, mutexes and spinlocks in hfsmp on unmount or failed mount - */ -static void -hfs_locks_destroy(struct hfsmount *hfsmp) -{ - - lck_mtx_destroy(&hfsmp->hfs_mutex, hfs_mutex_group); - lck_mtx_destroy(&hfsmp->hfc_mutex, hfs_mutex_group); - lck_rw_destroy(&hfsmp->hfs_global_lock, hfs_rwlock_group); - lck_spin_destroy(&hfsmp->vcbFreeExtLock, hfs_spinlock_group); - - return; -} - - -static int -hfs_getmountpoint(struct vnode *vp, struct hfsmount **hfsmpp) -{ - struct hfsmount * hfsmp; - char fstypename[MFSNAMELEN]; - - if (vp == NULL) - return (EINVAL); - - if (!vnode_isvroot(vp)) - return (EINVAL); - - vnode_vfsname(vp, fstypename); - if (strncmp(fstypename, "hfs", sizeof(fstypename)) != 0) - return (EINVAL); - - hfsmp = VTOHFS(vp); - - if (HFSTOVCB(hfsmp)->vcbSigWord == kHFSSigWord) - return (EINVAL); - - *hfsmpp = hfsmp; - - return (0); -} - -// XXXdbg -#include - -static hfsmount_t *hfs_mount_from_cwd(vfs_context_t ctx) -{ - vnode_t vp = vfs_context_cwd(ctx); - - if (!vp) - return NULL; - - /* - * We could use vnode_tag, but it is probably more future proof to - * compare fstypename. 
- */ - char fstypename[MFSNAMELEN]; - vnode_vfsname(vp, fstypename); - - if (strcmp(fstypename, "hfs")) - return NULL; - - return VTOHFS(vp); -} - -/* - * HFS filesystem related variables. - */ -int -hfs_sysctl(int *name, __unused u_int namelen, user_addr_t oldp, size_t *oldlenp, - user_addr_t newp, size_t newlen, vfs_context_t context) -{ - struct proc *p = vfs_context_proc(context); - int error; - struct hfsmount *hfsmp; - - /* all sysctl names at this level are terminal */ - - if (name[0] == HFS_ENCODINGBIAS) { - int bias; - - bias = hfs_getencodingbias(); - error = sysctl_int(oldp, oldlenp, newp, newlen, &bias); - if (error == 0 && newp) - hfs_setencodingbias(bias); - return (error); - - } else if (name[0] == HFS_EXTEND_FS) { - u_int64_t newsize = 0; - vnode_t vp = vfs_context_cwd(context); - - if (newp == USER_ADDR_NULL || vp == NULLVP) - return (EINVAL); - if ((error = hfs_getmountpoint(vp, &hfsmp))) - return (error); - - /* Start with the 'size' set to the current number of bytes in the filesystem */ - newsize = ((uint64_t)hfsmp->totalBlocks) * ((uint64_t)hfsmp->blockSize); - - /* now get the new size from userland and over-write our stored value */ - error = sysctl_quad(oldp, oldlenp, newp, newlen, (quad_t *)&newsize); - if (error) - return (error); - - error = hfs_extendfs(hfsmp, newsize, context); - return (error); - - } else if (name[0] == HFS_ENCODINGHINT) { - size_t bufsize; - size_t bytes; - u_int32_t hint; - u_int16_t *unicode_name = NULL; - char *filename = NULL; - - if ((newlen <= 0) || (newlen > MAXPATHLEN)) - return (EINVAL); - - bufsize = MAX(newlen * 3, MAXPATHLEN); - MALLOC(filename, char *, newlen, M_TEMP, M_WAITOK); - if (filename == NULL) { - error = ENOMEM; - goto encodinghint_exit; - } - MALLOC(unicode_name, u_int16_t *, bufsize, M_TEMP, M_WAITOK); - if (unicode_name == NULL) { - error = ENOMEM; - goto encodinghint_exit; - } - - error = copyin(newp, (caddr_t)filename, newlen); - if (error == 0) { - error = utf8_decodestr((u_int8_t 
*)filename, newlen - 1, unicode_name, - &bytes, bufsize, 0, UTF_DECOMPOSED); - if (error == 0) { - hint = hfs_pickencoding(unicode_name, bytes / 2); - error = sysctl_int(oldp, oldlenp, USER_ADDR_NULL, 0, (int32_t *)&hint); - } - } - -encodinghint_exit: - if (unicode_name) - FREE(unicode_name, M_TEMP); - if (filename) - FREE(filename, M_TEMP); - return (error); - - } else if (name[0] == HFS_ENABLE_JOURNALING) { - // make the file system journaled... - vnode_t jvp; - ExtendedVCB *vcb; - struct cat_attr jnl_attr; - struct cat_attr jinfo_attr; - struct cat_fork jnl_fork; - struct cat_fork jinfo_fork; - buf_t jib_buf; - uint64_t jib_blkno; - uint32_t tmpblkno; - uint64_t journal_byte_offset; - uint64_t journal_size; - vnode_t jib_vp = NULLVP; - struct JournalInfoBlock local_jib; - int err = 0; - void *jnl = NULL; - int lockflags; - - /* Only root can enable journaling */ - if (!kauth_cred_issuser(kauth_cred_get())) { - return (EPERM); - } - - hfsmp = hfs_mount_from_cwd(context); - if (!hfsmp) - return EINVAL; - - if (hfsmp->hfs_flags & HFS_READ_ONLY) { - return EROFS; - } - if (HFSTOVCB(hfsmp)->vcbSigWord == kHFSSigWord) { - printf("hfs: can't make a plain hfs volume journaled.\n"); - return EINVAL; - } - - if (hfsmp->jnl) { - printf("hfs: volume %s is already journaled!\n", hfsmp->vcbVN); - return EAGAIN; - } - vcb = HFSTOVCB(hfsmp); - - /* Set up local copies of the initialization info */ - tmpblkno = (uint32_t) name[1]; - jib_blkno = (uint64_t) tmpblkno; - journal_byte_offset = (uint64_t) name[2]; - journal_byte_offset *= hfsmp->blockSize; - journal_byte_offset += hfsmp->hfsPlusIOPosOffset; - journal_size = (uint64_t)((unsigned)name[3]); - - lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG | SFL_EXTENTS, HFS_EXCLUSIVE_LOCK); - if (BTHasContiguousNodes(VTOF(vcb->catalogRefNum)) == 0 || - BTHasContiguousNodes(VTOF(vcb->extentsRefNum)) == 0) { - - printf("hfs: volume has a btree w/non-contiguous nodes. 
can not enable journaling.\n"); - hfs_systemfile_unlock(hfsmp, lockflags); - return EINVAL; - } - hfs_systemfile_unlock(hfsmp, lockflags); - - // make sure these both exist! - if ( GetFileInfo(vcb, kHFSRootFolderID, ".journal_info_block", &jinfo_attr, &jinfo_fork) == 0 - || GetFileInfo(vcb, kHFSRootFolderID, ".journal", &jnl_attr, &jnl_fork) == 0) { - - return EINVAL; - } - - /* - * At this point, we have a copy of the metadata that lives in the catalog for the - * journal info block. Compare that the journal info block's single extent matches - * that which was passed into this sysctl. - * - * If it is different, deny the journal enable call. - */ - if (jinfo_fork.cf_blocks > 1) { - /* too many blocks */ - return EINVAL; - } - - if (jinfo_fork.cf_extents[0].startBlock != jib_blkno) { - /* Wrong block */ - return EINVAL; - } - - /* - * We want to immediately purge the vnode for the JIB. - * - * Because it was written to from userland, there's probably - * a vnode somewhere in the vnode cache (possibly with UBC backed blocks). - * So we bring the vnode into core, then immediately do whatever - * we can to flush/vclean it out. This is because those blocks will be - * interpreted as user data, which may be treated separately on some platforms - * than metadata. If the vnode is gone, then there cannot be backing blocks - * in the UBC. - */ - if (hfs_vget (hfsmp, jinfo_attr.ca_fileid, &jib_vp, 1, 0)) { - return EINVAL; - } - /* - * Now we have a vnode for the JIB. recycle it. Because we hold an iocount - * on the vnode, we'll just mark it for termination when the last iocount - * (hopefully ours), is dropped. 
- */ - vnode_recycle (jib_vp); - err = vnode_put (jib_vp); - if (err) { - return EINVAL; - } - - /* Initialize the local copy of the JIB (just like hfs.util) */ - memset (&local_jib, 'Z', sizeof(struct JournalInfoBlock)); - local_jib.flags = SWAP_BE32(kJIJournalInFSMask); - /* Note that the JIB's offset is in bytes */ - local_jib.offset = SWAP_BE64(journal_byte_offset); - local_jib.size = SWAP_BE64(journal_size); - - /* - * Now write out the local JIB. This essentially overwrites the userland - * copy of the JIB. Read it as BLK_META to treat it as a metadata read/write. - */ - jib_buf = buf_getblk (hfsmp->hfs_devvp, - jib_blkno * (hfsmp->blockSize / hfsmp->hfs_logical_block_size), - hfsmp->blockSize, 0, 0, BLK_META); - char* buf_ptr = (char*) buf_dataptr (jib_buf); - - /* Zero out the portion of the block that won't contain JIB data */ - memset (buf_ptr, 0, hfsmp->blockSize); - - bcopy(&local_jib, buf_ptr, sizeof(local_jib)); - if (buf_bwrite (jib_buf)) { - return EIO; - } - - /* Force a flush track cache */ - hfs_flush(hfsmp, HFS_FLUSH_CACHE); - - /* Now proceed with full volume sync */ - hfs_sync(hfsmp->hfs_mp, MNT_WAIT, context); - - printf("hfs: Initializing the journal (joffset 0x%llx sz 0x%llx)...\n", - (off_t)name[2], (off_t)name[3]); - - // - // XXXdbg - note that currently (Sept, 08) hfs_util does not support - // enabling the journal on a separate device so it is safe - // to just copy hfs_devvp here. If hfs_util gets the ability - // to dynamically enable the journal on a separate device then - // we will have to do the same thing as hfs_early_journal_init() - // to locate and open the journal device. 
- // - jvp = hfsmp->hfs_devvp; - jnl = journal_create(jvp, journal_byte_offset, journal_size, - hfsmp->hfs_devvp, - hfsmp->hfs_logical_block_size, - 0, - 0, - hfs_sync_metadata, hfsmp->hfs_mp, - hfsmp->hfs_mp); - - /* - * Set up the trim callback function so that we can add - * recently freed extents to the free extent cache once - * the transaction that freed them is written to the - * journal on disk. - */ - if (jnl) - journal_trim_set_callback(jnl, hfs_trim_callback, hfsmp); - - if (jnl == NULL) { - printf("hfs: FAILED to create the journal!\n"); - if (jvp && jvp != hfsmp->hfs_devvp) { - vnode_clearmountedon(jvp); - VNOP_CLOSE(jvp, hfsmp->hfs_flags & HFS_READ_ONLY ? FREAD : FREAD|FWRITE, vfs_context_kernel()); - } - jvp = NULL; - - return EINVAL; - } - - hfs_lock_global (hfsmp, HFS_EXCLUSIVE_LOCK); - - /* - * Flush all dirty metadata buffers. - */ - buf_flushdirtyblks(hfsmp->hfs_devvp, TRUE, 0, "hfs_sysctl"); - buf_flushdirtyblks(hfsmp->hfs_extents_vp, TRUE, 0, "hfs_sysctl"); - buf_flushdirtyblks(hfsmp->hfs_catalog_vp, TRUE, 0, "hfs_sysctl"); - buf_flushdirtyblks(hfsmp->hfs_allocation_vp, TRUE, 0, "hfs_sysctl"); - if (hfsmp->hfs_attribute_vp) - buf_flushdirtyblks(hfsmp->hfs_attribute_vp, TRUE, 0, "hfs_sysctl"); - - HFSTOVCB(hfsmp)->vcbJinfoBlock = name[1]; - HFSTOVCB(hfsmp)->vcbAtrb |= kHFSVolumeJournaledMask; - hfsmp->jvp = jvp; - hfsmp->jnl = jnl; - - // save this off for the hack-y check in hfs_remove() - hfsmp->jnl_start = (u_int32_t)name[2]; - hfsmp->jnl_size = (off_t)((unsigned)name[3]); - hfsmp->hfs_jnlinfoblkid = jinfo_attr.ca_fileid; - hfsmp->hfs_jnlfileid = jnl_attr.ca_fileid; - - vfs_setflags(hfsmp->hfs_mp, (u_int64_t)((unsigned int)MNT_JOURNALED)); - - hfs_unlock_global (hfsmp); - hfs_flushvolumeheader(hfsmp, HFS_FVH_WAIT | HFS_FVH_WRITE_ALT); - - { - fsid_t fsid; - - fsid.val[0] = (int32_t)hfsmp->hfs_raw_dev; - fsid.val[1] = (int32_t)vfs_typenum(HFSTOVFS(hfsmp)); - vfs_event_signal(&fsid, VQ_UPDATE, (intptr_t)NULL); - } - return 0; - } else if 
(name[0] == HFS_DISABLE_JOURNALING) { - // clear the journaling bit - - /* Only root can disable journaling */ - if (!kauth_cred_issuser(kauth_cred_get())) { - return (EPERM); - } - - hfsmp = hfs_mount_from_cwd(context); - if (!hfsmp) - return EINVAL; - - /* - * Disabling journaling is disallowed on volumes with directory hard links - * because we have not tested the relevant code path. - */ - if (hfsmp->hfs_private_attr[DIR_HARDLINKS].ca_entries != 0){ - printf("hfs: cannot disable journaling on volumes with directory hardlinks\n"); - return EPERM; - } - - printf("hfs: disabling journaling for %s\n", hfsmp->vcbVN); - - hfs_lock_global (hfsmp, HFS_EXCLUSIVE_LOCK); - - // Lights out for you buddy! - journal_close(hfsmp->jnl); - hfsmp->jnl = NULL; - - if (hfsmp->jvp && hfsmp->jvp != hfsmp->hfs_devvp) { - vnode_clearmountedon(hfsmp->jvp); - VNOP_CLOSE(hfsmp->jvp, hfsmp->hfs_flags & HFS_READ_ONLY ? FREAD : FREAD|FWRITE, vfs_context_kernel()); - vnode_put(hfsmp->jvp); - } - hfsmp->jvp = NULL; - vfs_clearflags(hfsmp->hfs_mp, (u_int64_t)((unsigned int)MNT_JOURNALED)); - hfsmp->jnl_start = 0; - hfsmp->hfs_jnlinfoblkid = 0; - hfsmp->hfs_jnlfileid = 0; - - HFSTOVCB(hfsmp)->vcbAtrb &= ~kHFSVolumeJournaledMask; - - hfs_unlock_global (hfsmp); - - hfs_flushvolumeheader(hfsmp, HFS_FVH_WAIT | HFS_FVH_WRITE_ALT); - - { - fsid_t fsid; - - fsid.val[0] = (int32_t)hfsmp->hfs_raw_dev; - fsid.val[1] = (int32_t)vfs_typenum(HFSTOVFS(hfsmp)); - vfs_event_signal(&fsid, VQ_UPDATE, (intptr_t)NULL); - } - return 0; - } else if (name[0] == HFS_SET_PKG_EXTENSIONS) { - - return set_package_extensions_table((user_addr_t)((unsigned)name[1]), name[2], name[3]); - - } else if (name[0] == VFS_CTL_QUERY) { - struct sysctl_req *req; - union union_vfsidctl vc; - struct mount *mp; - struct vfsquery vq; - - req = CAST_DOWN(struct sysctl_req *, oldp); /* we're new style vfs sysctl. */ - if (req == NULL) { - return EFAULT; - } - - error = SYSCTL_IN(req, &vc, proc_is64bit(p)? 
sizeof(vc.vc64):sizeof(vc.vc32)); - if (error) return (error); - - mp = vfs_getvfs(&vc.vc32.vc_fsid); /* works for 32 and 64 */ - if (mp == NULL) return (ENOENT); - - hfsmp = VFSTOHFS(mp); - bzero(&vq, sizeof(vq)); - vq.vq_flags = hfsmp->hfs_notification_conditions; - return SYSCTL_OUT(req, &vq, sizeof(vq));; - } else if (name[0] == HFS_REPLAY_JOURNAL) { - vnode_t devvp = NULL; - int device_fd; - if (namelen != 2) { - return (EINVAL); - } - device_fd = name[1]; - error = file_vnode(device_fd, &devvp); - if (error) { - return error; - } - error = vnode_getwithref(devvp); - if (error) { - file_drop(device_fd); - return error; - } - error = hfs_journal_replay(devvp, context); - file_drop(device_fd); - vnode_put(devvp); - return error; - } else if (name[0] == HFS_ENABLE_RESIZE_DEBUG) { - hfs_resize_debug = 1; - printf ("hfs_sysctl: Enabled volume resize debugging.\n"); - return 0; - } - - return (ENOTSUP); -} - -/* - * hfs_vfs_vget is not static since it is used in hfs_readwrite.c to support - * the build_path ioctl. We use it to leverage the code below that updates - * the origin list cache if necessary - */ - -int -hfs_vfs_vget(struct mount *mp, ino64_t ino, struct vnode **vpp, __unused vfs_context_t context) -{ - int error; - int lockflags; - struct hfsmount *hfsmp; - - hfsmp = VFSTOHFS(mp); - - error = hfs_vget(hfsmp, (cnid_t)ino, vpp, 1, 0); - if (error) - return error; - - /* - * If the look-up was via the object ID (rather than the link ID), - * then we make sure there's a parent here. We can't leave this - * until hfs_vnop_getattr because if there's a problem getting the - * parent at that point, all the caller will do is call - * hfs_vfs_vget again and we'll end up in an infinite loop. 
- */ - - cnode_t *cp = VTOC(*vpp); - - if (ISSET(cp->c_flag, C_HARDLINK) && ino == cp->c_fileid) { - hfs_lock_always(cp, HFS_SHARED_LOCK); - - if (!hfs_haslinkorigin(cp)) { - if (!hfs_lock_upgrade(cp)) - hfs_lock_always(cp, HFS_EXCLUSIVE_LOCK); - - if (cp->c_cnid == cp->c_fileid) { - /* - * Descriptor is stale, so we need to refresh it. We - * pick the first link. - */ - cnid_t link_id; - - error = hfs_first_link(hfsmp, cp, &link_id); - - if (!error) { - lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); - error = cat_findname(hfsmp, link_id, &cp->c_desc); - hfs_systemfile_unlock(hfsmp, lockflags); - } - } else { - // We'll use whatever link the descriptor happens to have - error = 0; - } - if (!error) - hfs_savelinkorigin(cp, cp->c_parentcnid); - } - - hfs_unlock(cp); - - if (error) { - vnode_put(*vpp); - *vpp = NULL; - } - } - - return error; -} - - -/* - * Look up an HFS object by ID. - * - * The object is returned with an iocount reference and the cnode locked. - * - * If the object is a file then it will represent the data fork. - */ -int -hfs_vget(struct hfsmount *hfsmp, cnid_t cnid, struct vnode **vpp, int skiplock, int allow_deleted) -{ - struct vnode *vp = NULLVP; - struct cat_desc cndesc; - struct cat_attr cnattr; - struct cat_fork cnfork; - u_int32_t linkref = 0; - int error; - - /* Check for cnids that should't be exported. */ - if ((cnid < kHFSFirstUserCatalogNodeID) && - (cnid != kHFSRootFolderID && cnid != kHFSRootParentID)) { - return (ENOENT); - } - /* Don't export our private directories. 
*/ - if (cnid == hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid || - cnid == hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid) { - return (ENOENT); - } - /* - * Check the hash first - */ - vp = hfs_chash_getvnode(hfsmp, cnid, 0, skiplock, allow_deleted); - if (vp) { - *vpp = vp; - return(0); - } - - bzero(&cndesc, sizeof(cndesc)); - bzero(&cnattr, sizeof(cnattr)); - bzero(&cnfork, sizeof(cnfork)); - - /* - * Not in hash, lookup in catalog - */ - if (cnid == kHFSRootParentID) { - static char hfs_rootname[] = "/"; - - cndesc.cd_nameptr = (const u_int8_t *)&hfs_rootname[0]; - cndesc.cd_namelen = 1; - cndesc.cd_parentcnid = kHFSRootParentID; - cndesc.cd_cnid = kHFSRootFolderID; - cndesc.cd_flags = CD_ISDIR; - - cnattr.ca_fileid = kHFSRootFolderID; - cnattr.ca_linkcount = 1; - cnattr.ca_entries = 1; - cnattr.ca_dircount = 1; - cnattr.ca_mode = (S_IFDIR | S_IRWXU | S_IRWXG | S_IRWXO); - } else { - int lockflags; - cnid_t pid; - const char *nameptr; - - lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); - error = cat_idlookup(hfsmp, cnid, 0, 0, &cndesc, &cnattr, &cnfork); - hfs_systemfile_unlock(hfsmp, lockflags); - - if (error) { - *vpp = NULL; - return (error); - } - - /* - * Check for a raw hardlink inode and save its linkref. 
- */ - pid = cndesc.cd_parentcnid; - nameptr = (const char *)cndesc.cd_nameptr; - - if ((pid == hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid) && - (bcmp(nameptr, HFS_INODE_PREFIX, HFS_INODE_PREFIX_LEN) == 0)) { - linkref = strtoul(&nameptr[HFS_INODE_PREFIX_LEN], NULL, 10); - - } else if ((pid == hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid) && - (bcmp(nameptr, HFS_DIRINODE_PREFIX, HFS_DIRINODE_PREFIX_LEN) == 0)) { - linkref = strtoul(&nameptr[HFS_DIRINODE_PREFIX_LEN], NULL, 10); - - } else if ((pid == hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid) && - (bcmp(nameptr, HFS_DELETE_PREFIX, HFS_DELETE_PREFIX_LEN) == 0)) { - *vpp = NULL; - cat_releasedesc(&cndesc); - return (ENOENT); /* open unlinked file */ - } - } - - /* - * Finish initializing cnode descriptor for hardlinks. - * - * We need a valid name and parent for reverse lookups. - */ - if (linkref) { - cnid_t lastid; - struct cat_desc linkdesc; - int linkerr = 0; - - cnattr.ca_linkref = linkref; - bzero (&linkdesc, sizeof (linkdesc)); - - /* - * If the caller supplied the raw inode value, then we don't know exactly - * which hardlink they wanted. It's likely that they acquired the raw inode - * value BEFORE the item became a hardlink, in which case, they probably - * want the oldest link. So request the oldest link from the catalog. - * - * Unfortunately, this requires that we iterate through all N hardlinks. On the plus - * side, since we know that we want the last linkID, we can also have this one - * call give us back the name of the last ID, since it's going to have it in-hand... - */ - linkerr = hfs_lookup_lastlink (hfsmp, linkref, &lastid, &linkdesc); - if ((linkerr == 0) && (lastid != 0)) { - /* - * Release any lingering buffers attached to our local descriptor. - * Then copy the name and other business into the cndesc - */ - cat_releasedesc (&cndesc); - bcopy (&linkdesc, &cndesc, sizeof(linkdesc)); - } - /* If it failed, the linkref code will just use whatever it had in-hand below. 
*/ - } - - if (linkref) { - int newvnode_flags = 0; - - error = hfs_getnewvnode(hfsmp, NULL, NULL, &cndesc, 0, &cnattr, - &cnfork, &vp, &newvnode_flags); - if (error == 0) { - VTOC(vp)->c_flag |= C_HARDLINK; - vnode_setmultipath(vp); - } - } else { - struct componentname cn; - int newvnode_flags = 0; - - /* Supply hfs_getnewvnode with a component name. */ - MALLOC_ZONE(cn.cn_pnbuf, caddr_t, MAXPATHLEN, M_NAMEI, M_WAITOK); - cn.cn_nameiop = LOOKUP; - cn.cn_flags = ISLASTCN | HASBUF; - cn.cn_context = NULL; - cn.cn_pnlen = MAXPATHLEN; - cn.cn_nameptr = cn.cn_pnbuf; - cn.cn_namelen = cndesc.cd_namelen; - cn.cn_hash = 0; - cn.cn_consume = 0; - bcopy(cndesc.cd_nameptr, cn.cn_nameptr, cndesc.cd_namelen + 1); - - error = hfs_getnewvnode(hfsmp, NULLVP, &cn, &cndesc, 0, &cnattr, - &cnfork, &vp, &newvnode_flags); - - if (error == 0 && (VTOC(vp)->c_flag & C_HARDLINK)) { - hfs_savelinkorigin(VTOC(vp), cndesc.cd_parentcnid); - } - FREE_ZONE(cn.cn_pnbuf, cn.cn_pnlen, M_NAMEI); - } - cat_releasedesc(&cndesc); - - *vpp = vp; - if (vp && skiplock) { - hfs_unlock(VTOC(vp)); - } - return (error); -} - - -/* - * Flush out all the files in a filesystem. - */ -static int -#if QUOTA -hfs_flushfiles(struct mount *mp, int flags, struct proc *p) -#else -hfs_flushfiles(struct mount *mp, int flags, __unused struct proc *p) -#endif /* QUOTA */ -{ - struct hfsmount *hfsmp; - struct vnode *skipvp = NULLVP; - int error; - int accounted_root_usecounts; -#if QUOTA - int i; -#endif - - hfsmp = VFSTOHFS(mp); - - accounted_root_usecounts = 0; -#if QUOTA - /* - * The open quota files have an indirect reference on - * the root directory vnode. We must account for this - * extra reference when doing the intial vflush. - */ - if (((unsigned int)vfs_flags(mp)) & MNT_QUOTA) { - /* Find out how many quota files we have open. 
*/ - for (i = 0; i < MAXQUOTAS; i++) { - if (hfsmp->hfs_qfiles[i].qf_vp != NULLVP) - ++accounted_root_usecounts; - } - } -#endif /* QUOTA */ - - if (accounted_root_usecounts > 0) { - /* Obtain the root vnode so we can skip over it. */ - skipvp = hfs_chash_getvnode(hfsmp, kHFSRootFolderID, 0, 0, 0); - } - - error = vflush(mp, skipvp, SKIPSYSTEM | SKIPSWAP | flags); - if (error != 0) - return(error); - - error = vflush(mp, skipvp, SKIPSYSTEM | flags); - - if (skipvp) { - /* - * See if there are additional references on the - * root vp besides the ones obtained from the open - * quota files and CoreStorage. - */ - if ((error == 0) && - (vnode_isinuse(skipvp, accounted_root_usecounts))) { - error = EBUSY; /* root directory is still open */ - } - hfs_unlock(VTOC(skipvp)); - /* release the iocount from the hfs_chash_getvnode call above. */ - vnode_put(skipvp); - } - if (error && (flags & FORCECLOSE) == 0) - return (error); - -#if QUOTA - if (((unsigned int)vfs_flags(mp)) & MNT_QUOTA) { - for (i = 0; i < MAXQUOTAS; i++) { - if (hfsmp->hfs_qfiles[i].qf_vp == NULLVP) - continue; - hfs_quotaoff(p, mp, i); - } - } -#endif /* QUOTA */ - - if (skipvp) { - error = vflush(mp, NULLVP, SKIPSYSTEM | flags); - } - - return (error); -} - -/* - * Update volume encoding bitmap (HFS Plus only) - * - * Mark a legacy text encoding as in-use (as needed) - * in the volume header of this HFS+ filesystem. 
- */ -__private_extern__ -void -hfs_setencodingbits(struct hfsmount *hfsmp, u_int32_t encoding) -{ -#define kIndexMacUkrainian 48 /* MacUkrainian encoding is 152 */ -#define kIndexMacFarsi 49 /* MacFarsi encoding is 140 */ - - u_int32_t index; - - switch (encoding) { - case kTextEncodingMacUkrainian: - index = kIndexMacUkrainian; - break; - case kTextEncodingMacFarsi: - index = kIndexMacFarsi; - break; - default: - index = encoding; - break; - } - - /* Only mark the encoding as in-use if it wasn't already set */ - if (index < 64 && (hfsmp->encodingsBitmap & (u_int64_t)(1ULL << index)) == 0) { - hfs_lock_mount (hfsmp); - hfsmp->encodingsBitmap |= (u_int64_t)(1ULL << index); - MarkVCBDirty(hfsmp); - hfs_unlock_mount(hfsmp); - } -} - -/* - * Update volume stats - * - * On journal volumes this will cause a volume header flush - */ -int -hfs_volupdate(struct hfsmount *hfsmp, enum volop op, int inroot) -{ - struct timeval tv; - - microtime(&tv); - - hfs_lock_mount (hfsmp); - - MarkVCBDirty(hfsmp); - hfsmp->hfs_mtime = tv.tv_sec; - - switch (op) { - case VOL_UPDATE: - break; - case VOL_MKDIR: - if (hfsmp->hfs_dircount != 0xFFFFFFFF) - ++hfsmp->hfs_dircount; - if (inroot && hfsmp->vcbNmRtDirs != 0xFFFF) - ++hfsmp->vcbNmRtDirs; - break; - case VOL_RMDIR: - if (hfsmp->hfs_dircount != 0) - --hfsmp->hfs_dircount; - if (inroot && hfsmp->vcbNmRtDirs != 0xFFFF) - --hfsmp->vcbNmRtDirs; - break; - case VOL_MKFILE: - if (hfsmp->hfs_filecount != 0xFFFFFFFF) - ++hfsmp->hfs_filecount; - if (inroot && hfsmp->vcbNmFls != 0xFFFF) - ++hfsmp->vcbNmFls; - break; - case VOL_RMFILE: - if (hfsmp->hfs_filecount != 0) - --hfsmp->hfs_filecount; - if (inroot && hfsmp->vcbNmFls != 0xFFFF) - --hfsmp->vcbNmFls; - break; - } - - hfs_unlock_mount (hfsmp); - - if (hfsmp->jnl) { - hfs_flushvolumeheader(hfsmp, 0); - } - - return (0); -} - - -#if CONFIG_HFS_STD -/* HFS Standard MDB flush */ -static int -hfs_flushMDB(struct hfsmount *hfsmp, int waitfor, int altflush) -{ - ExtendedVCB *vcb = HFSTOVCB(hfsmp); 
- struct filefork *fp; - HFSMasterDirectoryBlock *mdb; - struct buf *bp = NULL; - int retval; - int sector_size; - ByteCount namelen; - - sector_size = hfsmp->hfs_logical_block_size; - retval = (int)buf_bread(hfsmp->hfs_devvp, (daddr64_t)HFS_PRI_SECTOR(sector_size), sector_size, NOCRED, &bp); - if (retval) { - if (bp) - buf_brelse(bp); - return retval; - } - - hfs_lock_mount (hfsmp); - - mdb = (HFSMasterDirectoryBlock *)(buf_dataptr(bp) + HFS_PRI_OFFSET(sector_size)); - - mdb->drCrDate = SWAP_BE32 (UTCToLocal(to_hfs_time(vcb->hfs_itime))); - mdb->drLsMod = SWAP_BE32 (UTCToLocal(to_hfs_time(vcb->vcbLsMod))); - mdb->drAtrb = SWAP_BE16 (vcb->vcbAtrb); - mdb->drNmFls = SWAP_BE16 (vcb->vcbNmFls); - mdb->drAllocPtr = SWAP_BE16 (vcb->nextAllocation); - mdb->drClpSiz = SWAP_BE32 (vcb->vcbClpSiz); - mdb->drNxtCNID = SWAP_BE32 (vcb->vcbNxtCNID); - mdb->drFreeBks = SWAP_BE16 (vcb->freeBlocks); - - namelen = strlen((char *)vcb->vcbVN); - retval = utf8_to_hfs(vcb, namelen, vcb->vcbVN, mdb->drVN); - /* Retry with MacRoman in case that's how it was exported. 
*/ - if (retval) - retval = utf8_to_mac_roman(namelen, vcb->vcbVN, mdb->drVN); - - mdb->drVolBkUp = SWAP_BE32 (UTCToLocal(to_hfs_time(vcb->vcbVolBkUp))); - mdb->drWrCnt = SWAP_BE32 (vcb->vcbWrCnt); - mdb->drNmRtDirs = SWAP_BE16 (vcb->vcbNmRtDirs); - mdb->drFilCnt = SWAP_BE32 (vcb->vcbFilCnt); - mdb->drDirCnt = SWAP_BE32 (vcb->vcbDirCnt); - - bcopy(vcb->vcbFndrInfo, mdb->drFndrInfo, sizeof(mdb->drFndrInfo)); - - fp = VTOF(vcb->extentsRefNum); - mdb->drXTExtRec[0].startBlock = SWAP_BE16 (fp->ff_extents[0].startBlock); - mdb->drXTExtRec[0].blockCount = SWAP_BE16 (fp->ff_extents[0].blockCount); - mdb->drXTExtRec[1].startBlock = SWAP_BE16 (fp->ff_extents[1].startBlock); - mdb->drXTExtRec[1].blockCount = SWAP_BE16 (fp->ff_extents[1].blockCount); - mdb->drXTExtRec[2].startBlock = SWAP_BE16 (fp->ff_extents[2].startBlock); - mdb->drXTExtRec[2].blockCount = SWAP_BE16 (fp->ff_extents[2].blockCount); - mdb->drXTFlSize = SWAP_BE32 (fp->ff_blocks * vcb->blockSize); - mdb->drXTClpSiz = SWAP_BE32 (fp->ff_clumpsize); - FTOC(fp)->c_flag &= ~C_MODIFIED; - - fp = VTOF(vcb->catalogRefNum); - mdb->drCTExtRec[0].startBlock = SWAP_BE16 (fp->ff_extents[0].startBlock); - mdb->drCTExtRec[0].blockCount = SWAP_BE16 (fp->ff_extents[0].blockCount); - mdb->drCTExtRec[1].startBlock = SWAP_BE16 (fp->ff_extents[1].startBlock); - mdb->drCTExtRec[1].blockCount = SWAP_BE16 (fp->ff_extents[1].blockCount); - mdb->drCTExtRec[2].startBlock = SWAP_BE16 (fp->ff_extents[2].startBlock); - mdb->drCTExtRec[2].blockCount = SWAP_BE16 (fp->ff_extents[2].blockCount); - mdb->drCTFlSize = SWAP_BE32 (fp->ff_blocks * vcb->blockSize); - mdb->drCTClpSiz = SWAP_BE32 (fp->ff_clumpsize); - FTOC(fp)->c_flag &= ~C_MODIFIED; - - MarkVCBClean( vcb ); - - hfs_unlock_mount (hfsmp); - - /* If requested, flush out the alternate MDB */ - if (altflush) { - struct buf *alt_bp = NULL; - - if (buf_meta_bread(hfsmp->hfs_devvp, hfsmp->hfs_partition_avh_sector, sector_size, NOCRED, &alt_bp) == 0) { - bcopy(mdb, (char *)buf_dataptr(alt_bp) + 
HFS_ALT_OFFSET(sector_size), kMDBSize); - - (void) VNOP_BWRITE(alt_bp); - } else if (alt_bp) - buf_brelse(alt_bp); - } - - if (waitfor != MNT_WAIT) - buf_bawrite(bp); - else - retval = VNOP_BWRITE(bp); - - return (retval); -} -#endif - -/* - * Flush any dirty in-memory mount data to the on-disk - * volume header. - * - * Note: the on-disk volume signature is intentionally - * not flushed since the on-disk "H+" and "HX" signatures - * are always stored in-memory as "H+". - */ -int -hfs_flushvolumeheader(struct hfsmount *hfsmp, - hfs_flush_volume_header_options_t options) -{ - ExtendedVCB *vcb = HFSTOVCB(hfsmp); - struct filefork *fp; - HFSPlusVolumeHeader *volumeHeader, *altVH; - int retval; - struct buf *bp, *alt_bp; - int i; - daddr64_t priIDSector; - bool critical = false; - u_int16_t signature; - u_int16_t hfsversion; - daddr64_t avh_sector; - bool altflush = ISSET(options, HFS_FVH_WRITE_ALT); - - if (ISSET(options, HFS_FVH_FLUSH_IF_DIRTY) - && !hfs_header_needs_flushing(hfsmp)) { - return 0; - } - - if (hfsmp->hfs_flags & HFS_READ_ONLY) { - return(0); - } -#if CONFIG_HFS_STD - if (hfsmp->hfs_flags & HFS_STANDARD) { - return hfs_flushMDB(hfsmp, ISSET(options, HFS_FVH_WAIT) ? MNT_WAIT : 0, altflush); - } -#endif - priIDSector = (daddr64_t)((vcb->hfsPlusIOPosOffset / hfsmp->hfs_logical_block_size) + - HFS_PRI_SECTOR(hfsmp->hfs_logical_block_size)); - - if (hfs_start_transaction(hfsmp) != 0) { - return EINVAL; - } - - bp = NULL; - alt_bp = NULL; - - retval = (int)buf_meta_bread(hfsmp->hfs_devvp, - HFS_PHYSBLK_ROUNDDOWN(priIDSector, hfsmp->hfs_log_per_phys), - hfsmp->hfs_physical_block_size, NOCRED, &bp); - if (retval) { - printf("hfs: err %d reading VH blk (vol=%s)\n", retval, vcb->vcbVN); - goto err_exit; - } - - volumeHeader = (HFSPlusVolumeHeader *)((char *)buf_dataptr(bp) + - HFS_PRI_OFFSET(hfsmp->hfs_physical_block_size)); - - /* - * Sanity check what we just read. If it's bad, try the alternate - * instead. 
- */ - signature = SWAP_BE16 (volumeHeader->signature); - hfsversion = SWAP_BE16 (volumeHeader->version); - if ((signature != kHFSPlusSigWord && signature != kHFSXSigWord) || - (hfsversion < kHFSPlusVersion) || (hfsversion > 100) || - (SWAP_BE32 (volumeHeader->blockSize) != vcb->blockSize)) { - printf("hfs: corrupt VH on %s, sig 0x%04x, ver %d, blksize %d\n", - vcb->vcbVN, signature, hfsversion, - SWAP_BE32 (volumeHeader->blockSize)); - hfs_mark_inconsistent(hfsmp, HFS_INCONSISTENCY_DETECTED); - - /* Almost always we read AVH relative to the partition size */ - avh_sector = hfsmp->hfs_partition_avh_sector; - - if (hfsmp->hfs_partition_avh_sector != hfsmp->hfs_fs_avh_sector) { - /* - * The two altVH offsets do not match --- which means that a smaller file - * system exists in a larger partition. Verify that we have the correct - * alternate volume header sector as per the current parititon size. - * The GPT device that we are mounted on top could have changed sizes - * without us knowing. - * - * We're in a transaction, so it's safe to modify the partition_avh_sector - * field if necessary. 
- */ - - uint64_t sector_count; - - /* Get underlying device block count */ - if ((retval = VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCGETBLOCKCOUNT, - (caddr_t)§or_count, 0, vfs_context_current()))) { - printf("hfs_flushVH: err %d getting block count (%s) \n", retval, vcb->vcbVN); - retval = ENXIO; - goto err_exit; - } - - /* Partition size was changed without our knowledge */ - if (sector_count != (uint64_t)hfsmp->hfs_logical_block_count) { - hfsmp->hfs_partition_avh_sector = (hfsmp->hfsPlusIOPosOffset / hfsmp->hfs_logical_block_size) + - HFS_ALT_SECTOR(hfsmp->hfs_logical_block_size, sector_count); - /* Note: hfs_fs_avh_sector will remain unchanged */ - printf ("hfs_flushVH: partition size changed, partition_avh_sector=%qu, fs_avh_sector=%qu\n", - hfsmp->hfs_partition_avh_sector, hfsmp->hfs_fs_avh_sector); - - /* - * We just updated the offset for AVH relative to - * the partition size, so the content of that AVH - * will be invalid. But since we are also maintaining - * a valid AVH relative to the file system size, we - * can read it since primary VH and partition AVH - * are not valid. - */ - avh_sector = hfsmp->hfs_fs_avh_sector; - } - } - - printf ("hfs: trying alternate (for %s) avh_sector=%qu\n", - (avh_sector == hfsmp->hfs_fs_avh_sector) ? 
"file system" : "partition", avh_sector); - - if (avh_sector) { - retval = buf_meta_bread(hfsmp->hfs_devvp, - HFS_PHYSBLK_ROUNDDOWN(avh_sector, hfsmp->hfs_log_per_phys), - hfsmp->hfs_physical_block_size, NOCRED, &alt_bp); - if (retval) { - printf("hfs: err %d reading alternate VH (%s)\n", retval, vcb->vcbVN); - goto err_exit; - } - - altVH = (HFSPlusVolumeHeader *)((char *)buf_dataptr(alt_bp) + - HFS_ALT_OFFSET(hfsmp->hfs_physical_block_size)); - signature = SWAP_BE16(altVH->signature); - hfsversion = SWAP_BE16(altVH->version); - - if ((signature != kHFSPlusSigWord && signature != kHFSXSigWord) || - (hfsversion < kHFSPlusVersion) || (kHFSPlusVersion > 100) || - (SWAP_BE32(altVH->blockSize) != vcb->blockSize)) { - printf("hfs: corrupt alternate VH on %s, sig 0x%04x, ver %d, blksize %d\n", - vcb->vcbVN, signature, hfsversion, - SWAP_BE32(altVH->blockSize)); - retval = EIO; - goto err_exit; - } - - /* The alternate is plausible, so use it. */ - bcopy(altVH, volumeHeader, kMDBSize); - buf_brelse(alt_bp); - alt_bp = NULL; - } else { - /* No alternate VH, nothing more we can do. 
*/ - retval = EIO; - goto err_exit; - } - } - - if (hfsmp->jnl) { - journal_modify_block_start(hfsmp->jnl, bp); - } - - /* - * For embedded HFS+ volumes, update create date if it changed - * (ie from a setattrlist call) - */ - if ((vcb->hfsPlusIOPosOffset != 0) && - (SWAP_BE32 (volumeHeader->createDate) != vcb->localCreateDate)) { - struct buf *bp2; - HFSMasterDirectoryBlock *mdb; - - retval = (int)buf_meta_bread(hfsmp->hfs_devvp, - HFS_PHYSBLK_ROUNDDOWN(HFS_PRI_SECTOR(hfsmp->hfs_logical_block_size), hfsmp->hfs_log_per_phys), - hfsmp->hfs_physical_block_size, NOCRED, &bp2); - if (retval) { - if (bp2) - buf_brelse(bp2); - retval = 0; - } else { - mdb = (HFSMasterDirectoryBlock *)(buf_dataptr(bp2) + - HFS_PRI_OFFSET(hfsmp->hfs_physical_block_size)); - - if ( SWAP_BE32 (mdb->drCrDate) != vcb->localCreateDate ) - { - if (hfsmp->jnl) { - journal_modify_block_start(hfsmp->jnl, bp2); - } - - mdb->drCrDate = SWAP_BE32 (vcb->localCreateDate); /* pick up the new create date */ - - if (hfsmp->jnl) { - journal_modify_block_end(hfsmp->jnl, bp2, NULL, NULL); - } else { - (void) VNOP_BWRITE(bp2); /* write out the changes */ - } - } - else - { - buf_brelse(bp2); /* just release it */ - } - } - } - - hfs_lock_mount (hfsmp); - - /* Note: only update the lower 16 bits worth of attributes */ - volumeHeader->attributes = SWAP_BE32 (vcb->vcbAtrb); - volumeHeader->journalInfoBlock = SWAP_BE32 (vcb->vcbJinfoBlock); - if (hfsmp->jnl) { - volumeHeader->lastMountedVersion = SWAP_BE32 (kHFSJMountVersion); - } else { - volumeHeader->lastMountedVersion = SWAP_BE32 (kHFSPlusMountVersion); - } - volumeHeader->createDate = SWAP_BE32 (vcb->localCreateDate); /* volume create date is in local time */ - volumeHeader->modifyDate = SWAP_BE32 (to_hfs_time(vcb->vcbLsMod)); - volumeHeader->backupDate = SWAP_BE32 (to_hfs_time(vcb->vcbVolBkUp)); - volumeHeader->fileCount = SWAP_BE32 (vcb->vcbFilCnt); - volumeHeader->folderCount = SWAP_BE32 (vcb->vcbDirCnt); - volumeHeader->totalBlocks = SWAP_BE32 
(vcb->totalBlocks); - volumeHeader->freeBlocks = SWAP_BE32 (vcb->freeBlocks + vcb->reclaimBlocks); - volumeHeader->nextAllocation = SWAP_BE32 (vcb->nextAllocation); - volumeHeader->rsrcClumpSize = SWAP_BE32 (vcb->vcbClpSiz); - volumeHeader->dataClumpSize = SWAP_BE32 (vcb->vcbClpSiz); - volumeHeader->nextCatalogID = SWAP_BE32 (vcb->vcbNxtCNID); - volumeHeader->writeCount = SWAP_BE32 (vcb->vcbWrCnt); - volumeHeader->encodingsBitmap = SWAP_BE64 (vcb->encodingsBitmap); - - if (bcmp(vcb->vcbFndrInfo, volumeHeader->finderInfo, sizeof(volumeHeader->finderInfo)) != 0) { - bcopy(vcb->vcbFndrInfo, volumeHeader->finderInfo, sizeof(volumeHeader->finderInfo)); - critical = true; - } - - if (!altflush && !ISSET(options, HFS_FVH_FLUSH_IF_DIRTY)) { - goto done; - } - - /* Sync Extents over-flow file meta data */ - fp = VTOF(vcb->extentsRefNum); - if (FTOC(fp)->c_flag & C_MODIFIED) { - for (i = 0; i < kHFSPlusExtentDensity; i++) { - volumeHeader->extentsFile.extents[i].startBlock = - SWAP_BE32 (fp->ff_extents[i].startBlock); - volumeHeader->extentsFile.extents[i].blockCount = - SWAP_BE32 (fp->ff_extents[i].blockCount); - } - volumeHeader->extentsFile.logicalSize = SWAP_BE64 (fp->ff_size); - volumeHeader->extentsFile.totalBlocks = SWAP_BE32 (fp->ff_blocks); - volumeHeader->extentsFile.clumpSize = SWAP_BE32 (fp->ff_clumpsize); - FTOC(fp)->c_flag &= ~C_MODIFIED; - altflush = true; - } - - /* Sync Catalog file meta data */ - fp = VTOF(vcb->catalogRefNum); - if (FTOC(fp)->c_flag & C_MODIFIED) { - for (i = 0; i < kHFSPlusExtentDensity; i++) { - volumeHeader->catalogFile.extents[i].startBlock = - SWAP_BE32 (fp->ff_extents[i].startBlock); - volumeHeader->catalogFile.extents[i].blockCount = - SWAP_BE32 (fp->ff_extents[i].blockCount); - } - volumeHeader->catalogFile.logicalSize = SWAP_BE64 (fp->ff_size); - volumeHeader->catalogFile.totalBlocks = SWAP_BE32 (fp->ff_blocks); - volumeHeader->catalogFile.clumpSize = SWAP_BE32 (fp->ff_clumpsize); - FTOC(fp)->c_flag &= ~C_MODIFIED; - altflush = 
true; - } - - /* Sync Allocation file meta data */ - fp = VTOF(vcb->allocationsRefNum); - if (FTOC(fp)->c_flag & C_MODIFIED) { - for (i = 0; i < kHFSPlusExtentDensity; i++) { - volumeHeader->allocationFile.extents[i].startBlock = - SWAP_BE32 (fp->ff_extents[i].startBlock); - volumeHeader->allocationFile.extents[i].blockCount = - SWAP_BE32 (fp->ff_extents[i].blockCount); - } - volumeHeader->allocationFile.logicalSize = SWAP_BE64 (fp->ff_size); - volumeHeader->allocationFile.totalBlocks = SWAP_BE32 (fp->ff_blocks); - volumeHeader->allocationFile.clumpSize = SWAP_BE32 (fp->ff_clumpsize); - FTOC(fp)->c_flag &= ~C_MODIFIED; - altflush = true; - } - - /* Sync Attribute file meta data */ - if (hfsmp->hfs_attribute_vp) { - fp = VTOF(hfsmp->hfs_attribute_vp); - for (i = 0; i < kHFSPlusExtentDensity; i++) { - volumeHeader->attributesFile.extents[i].startBlock = - SWAP_BE32 (fp->ff_extents[i].startBlock); - volumeHeader->attributesFile.extents[i].blockCount = - SWAP_BE32 (fp->ff_extents[i].blockCount); - } - if (ISSET(FTOC(fp)->c_flag, C_MODIFIED)) { - FTOC(fp)->c_flag &= ~C_MODIFIED; - altflush = true; - } - volumeHeader->attributesFile.logicalSize = SWAP_BE64 (fp->ff_size); - volumeHeader->attributesFile.totalBlocks = SWAP_BE32 (fp->ff_blocks); - volumeHeader->attributesFile.clumpSize = SWAP_BE32 (fp->ff_clumpsize); - } - - /* Sync Startup file meta data */ - if (hfsmp->hfs_startup_vp) { - fp = VTOF(hfsmp->hfs_startup_vp); - if (FTOC(fp)->c_flag & C_MODIFIED) { - for (i = 0; i < kHFSPlusExtentDensity; i++) { - volumeHeader->startupFile.extents[i].startBlock = - SWAP_BE32 (fp->ff_extents[i].startBlock); - volumeHeader->startupFile.extents[i].blockCount = - SWAP_BE32 (fp->ff_extents[i].blockCount); - } - volumeHeader->startupFile.logicalSize = SWAP_BE64 (fp->ff_size); - volumeHeader->startupFile.totalBlocks = SWAP_BE32 (fp->ff_blocks); - volumeHeader->startupFile.clumpSize = SWAP_BE32 (fp->ff_clumpsize); - FTOC(fp)->c_flag &= ~C_MODIFIED; - altflush = true; - } - } - - if 
(altflush) - critical = true; - -done: - MarkVCBClean(hfsmp); - hfs_unlock_mount (hfsmp); - - /* If requested, flush out the alternate volume header */ - if (altflush) { - /* - * The two altVH offsets do not match --- which means that a smaller file - * system exists in a larger partition. Verify that we have the correct - * alternate volume header sector as per the current parititon size. - * The GPT device that we are mounted on top could have changed sizes - * without us knowning. - * - * We're in a transaction, so it's safe to modify the partition_avh_sector - * field if necessary. - */ - if (hfsmp->hfs_partition_avh_sector != hfsmp->hfs_fs_avh_sector) { - uint64_t sector_count; - - /* Get underlying device block count */ - if ((retval = VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCGETBLOCKCOUNT, - (caddr_t)§or_count, 0, vfs_context_current()))) { - printf("hfs_flushVH: err %d getting block count (%s) \n", retval, vcb->vcbVN); - retval = ENXIO; - goto err_exit; - } - - /* Partition size was changed without our knowledge */ - if (sector_count != (uint64_t)hfsmp->hfs_logical_block_count) { - hfsmp->hfs_partition_avh_sector = (hfsmp->hfsPlusIOPosOffset / hfsmp->hfs_logical_block_size) + - HFS_ALT_SECTOR(hfsmp->hfs_logical_block_size, sector_count); - /* Note: hfs_fs_avh_sector will remain unchanged */ - printf ("hfs_flushVH: altflush: partition size changed, partition_avh_sector=%qu, fs_avh_sector=%qu\n", - hfsmp->hfs_partition_avh_sector, hfsmp->hfs_fs_avh_sector); - } - } - - /* - * First see if we need to write I/O to the "secondary" AVH - * located at FS Size - 1024 bytes, because this one will - * always go into the journal. We put this AVH into the journal - * because even if the filesystem size has shrunk, this LBA should be - * reachable after the partition-size modification has occurred. - * The one where we need to be careful is partitionsize-1024, since the - * partition size should hopefully shrink. - * - * Most of the time this block will not execute. 
- */ - if ((hfsmp->hfs_fs_avh_sector) && - (hfsmp->hfs_partition_avh_sector != hfsmp->hfs_fs_avh_sector)) { - if (buf_meta_bread(hfsmp->hfs_devvp, - HFS_PHYSBLK_ROUNDDOWN(hfsmp->hfs_fs_avh_sector, hfsmp->hfs_log_per_phys), - hfsmp->hfs_physical_block_size, NOCRED, &alt_bp) == 0) { - if (hfsmp->jnl) { - journal_modify_block_start(hfsmp->jnl, alt_bp); - } - - bcopy(volumeHeader, (char *)buf_dataptr(alt_bp) + - HFS_ALT_OFFSET(hfsmp->hfs_physical_block_size), - kMDBSize); - - if (hfsmp->jnl) { - journal_modify_block_end(hfsmp->jnl, alt_bp, NULL, NULL); - } else { - (void) VNOP_BWRITE(alt_bp); - } - } else if (alt_bp) { - buf_brelse(alt_bp); - } - } - - /* - * Flush out alternate volume header located at 1024 bytes before - * end of the partition as part of journal transaction. In - * most cases, this will be the only alternate volume header - * that we need to worry about because the file system size is - * same as the partition size, therefore hfs_fs_avh_sector is - * same as hfs_partition_avh_sector. This is the "priority" AVH. - * - * However, do not always put this I/O into the journal. If we skipped the - * FS-Size AVH write above, then we will put this I/O into the journal as - * that indicates the two were in sync. However, if the FS size is - * not the same as the partition size, we are tracking two. We don't - * put it in the journal in that case, since if the partition - * size changes between uptimes, and we need to replay the journal, - * this I/O could generate an EIO if during replay it is now trying - * to access blocks beyond the device EOF. - */ - if (hfsmp->hfs_partition_avh_sector) { - if (buf_meta_bread(hfsmp->hfs_devvp, - HFS_PHYSBLK_ROUNDDOWN(hfsmp->hfs_partition_avh_sector, hfsmp->hfs_log_per_phys), - hfsmp->hfs_physical_block_size, NOCRED, &alt_bp) == 0) { - - /* only one AVH, put this I/O in the journal. 
*/ - if ((hfsmp->jnl) && (hfsmp->hfs_partition_avh_sector == hfsmp->hfs_fs_avh_sector)) { - journal_modify_block_start(hfsmp->jnl, alt_bp); - } - - bcopy(volumeHeader, (char *)buf_dataptr(alt_bp) + - HFS_ALT_OFFSET(hfsmp->hfs_physical_block_size), - kMDBSize); - - /* If journaled and we only have one AVH to track */ - if ((hfsmp->jnl) && (hfsmp->hfs_partition_avh_sector == hfsmp->hfs_fs_avh_sector)) { - journal_modify_block_end (hfsmp->jnl, alt_bp, NULL, NULL); - } else { - /* - * If we don't have a journal or there are two AVH's at the - * moment, then this one doesn't go in the journal. Note that - * this one may generate I/O errors, since the partition - * can be resized behind our backs at any moment and this I/O - * may now appear to be beyond the device EOF. - */ - (void) VNOP_BWRITE(alt_bp); - hfs_flush(hfsmp, HFS_FLUSH_CACHE); - } - } else if (alt_bp) { - buf_brelse(alt_bp); - } - } - } - - /* Finish modifying the block for the primary VH */ - if (hfsmp->jnl) { - journal_modify_block_end(hfsmp->jnl, bp, NULL, NULL); - } else { - if (!ISSET(options, HFS_FVH_WAIT)) { - buf_bawrite(bp); - } else { - retval = VNOP_BWRITE(bp); - /* When critical data changes, flush the device cache */ - if (critical && (retval == 0)) { - hfs_flush(hfsmp, HFS_FLUSH_CACHE); - } - } - } - hfs_end_transaction(hfsmp); - - return (retval); - -err_exit: - if (alt_bp) - buf_brelse(alt_bp); - if (bp) - buf_brelse(bp); - hfs_end_transaction(hfsmp); - return retval; -} - - -/* - * Creates a UUID from a unique "name" in the HFS UUID Name space. - * See version 3 UUID. 
- */ -static void -hfs_getvoluuid(struct hfsmount *hfsmp, uuid_t result) -{ - MD5_CTX md5c; - uint8_t rawUUID[8]; - - ((uint32_t *)rawUUID)[0] = hfsmp->vcbFndrInfo[6]; - ((uint32_t *)rawUUID)[1] = hfsmp->vcbFndrInfo[7]; - - MD5Init( &md5c ); - MD5Update( &md5c, HFS_UUID_NAMESPACE_ID, sizeof( uuid_t ) ); - MD5Update( &md5c, rawUUID, sizeof (rawUUID) ); - MD5Final( result, &md5c ); - - result[6] = 0x30 | ( result[6] & 0x0F ); - result[8] = 0x80 | ( result[8] & 0x3F ); -} - -/* - * Get file system attributes. - */ -static int -hfs_vfs_getattr(struct mount *mp, struct vfs_attr *fsap, __unused vfs_context_t context) -{ -#define HFS_ATTR_CMN_VALIDMASK ATTR_CMN_VALIDMASK -#define HFS_ATTR_FILE_VALIDMASK (ATTR_FILE_VALIDMASK & ~(ATTR_FILE_FILETYPE | ATTR_FILE_FORKCOUNT | ATTR_FILE_FORKLIST)) -#define HFS_ATTR_CMN_VOL_VALIDMASK (ATTR_CMN_VALIDMASK & ~(ATTR_CMN_ACCTIME)) - - ExtendedVCB *vcb = VFSTOVCB(mp); - struct hfsmount *hfsmp = VFSTOHFS(mp); - u_int32_t freeCNIDs; - - int searchfs_on = 0; - int exchangedata_on = 1; - -#if CONFIG_SEARCHFS - searchfs_on = 1; -#endif - -#if CONFIG_PROTECT - if (cp_fs_protected(mp)) { - exchangedata_on = 0; - } -#endif - - freeCNIDs = (u_int32_t)0xFFFFFFFF - (u_int32_t)hfsmp->vcbNxtCNID; - - VFSATTR_RETURN(fsap, f_objcount, (u_int64_t)hfsmp->vcbFilCnt + (u_int64_t)hfsmp->vcbDirCnt); - VFSATTR_RETURN(fsap, f_filecount, (u_int64_t)hfsmp->vcbFilCnt); - VFSATTR_RETURN(fsap, f_dircount, (u_int64_t)hfsmp->vcbDirCnt); - VFSATTR_RETURN(fsap, f_maxobjcount, (u_int64_t)0xFFFFFFFF); - VFSATTR_RETURN(fsap, f_iosize, (size_t)cluster_max_io_size(mp, 0)); - VFSATTR_RETURN(fsap, f_blocks, (u_int64_t)hfsmp->totalBlocks); - VFSATTR_RETURN(fsap, f_bfree, (u_int64_t)hfs_freeblks(hfsmp, 0)); - VFSATTR_RETURN(fsap, f_bavail, (u_int64_t)hfs_freeblks(hfsmp, 1)); - VFSATTR_RETURN(fsap, f_bsize, (u_int32_t)vcb->blockSize); - /* XXX needs clarification */ - VFSATTR_RETURN(fsap, f_bused, hfsmp->totalBlocks - hfs_freeblks(hfsmp, 1)); - /* Maximum files is constrained 
by total blocks. */ - VFSATTR_RETURN(fsap, f_files, (u_int64_t)(hfsmp->totalBlocks - 2)); - VFSATTR_RETURN(fsap, f_ffree, MIN((u_int64_t)freeCNIDs, (u_int64_t)hfs_freeblks(hfsmp, 1))); - - fsap->f_fsid.val[0] = hfsmp->hfs_raw_dev; - fsap->f_fsid.val[1] = vfs_typenum(mp); - VFSATTR_SET_SUPPORTED(fsap, f_fsid); - - VFSATTR_RETURN(fsap, f_signature, vcb->vcbSigWord); - VFSATTR_RETURN(fsap, f_carbon_fsid, 0); - - if (VFSATTR_IS_ACTIVE(fsap, f_capabilities)) { - vol_capabilities_attr_t *cap; - - cap = &fsap->f_capabilities; - - if ((hfsmp->hfs_flags & HFS_STANDARD) == 0) { - /* HFS+ & variants */ - cap->capabilities[VOL_CAPABILITIES_FORMAT] = - VOL_CAP_FMT_PERSISTENTOBJECTIDS | - VOL_CAP_FMT_SYMBOLICLINKS | - VOL_CAP_FMT_HARDLINKS | - VOL_CAP_FMT_JOURNAL | - VOL_CAP_FMT_ZERO_RUNS | - (hfsmp->jnl ? VOL_CAP_FMT_JOURNAL_ACTIVE : 0) | - (hfsmp->hfs_flags & HFS_CASE_SENSITIVE ? VOL_CAP_FMT_CASE_SENSITIVE : 0) | - VOL_CAP_FMT_CASE_PRESERVING | - VOL_CAP_FMT_FAST_STATFS | - VOL_CAP_FMT_2TB_FILESIZE | - VOL_CAP_FMT_HIDDEN_FILES | -#if HFS_COMPRESSION - VOL_CAP_FMT_PATH_FROM_ID | - VOL_CAP_FMT_DECMPFS_COMPRESSION; -#else - VOL_CAP_FMT_PATH_FROM_ID; -#endif - } -#if CONFIG_HFS_STD - else { - /* HFS standard */ - cap->capabilities[VOL_CAPABILITIES_FORMAT] = - VOL_CAP_FMT_PERSISTENTOBJECTIDS | - VOL_CAP_FMT_CASE_PRESERVING | - VOL_CAP_FMT_FAST_STATFS | - VOL_CAP_FMT_HIDDEN_FILES | - VOL_CAP_FMT_PATH_FROM_ID; - } -#endif - - /* - * The capabilities word in 'cap' tell you whether or not - * this particular filesystem instance has feature X enabled. 
- */ - - cap->capabilities[VOL_CAPABILITIES_INTERFACES] = - VOL_CAP_INT_ATTRLIST | - VOL_CAP_INT_NFSEXPORT | - VOL_CAP_INT_READDIRATTR | - VOL_CAP_INT_ALLOCATE | - VOL_CAP_INT_VOL_RENAME | - VOL_CAP_INT_ADVLOCK | - VOL_CAP_INT_FLOCK | -#if NAMEDSTREAMS - VOL_CAP_INT_EXTENDED_ATTR | - VOL_CAP_INT_NAMEDSTREAMS; -#else - VOL_CAP_INT_EXTENDED_ATTR; -#endif - - /* HFS may conditionally support searchfs and exchangedata depending on the runtime */ - - if (searchfs_on) { - cap->capabilities[VOL_CAPABILITIES_INTERFACES] |= VOL_CAP_INT_SEARCHFS; - } - if (exchangedata_on) { - cap->capabilities[VOL_CAPABILITIES_INTERFACES] |= VOL_CAP_INT_EXCHANGEDATA; - } - - cap->capabilities[VOL_CAPABILITIES_RESERVED1] = 0; - cap->capabilities[VOL_CAPABILITIES_RESERVED2] = 0; - - cap->valid[VOL_CAPABILITIES_FORMAT] = - VOL_CAP_FMT_PERSISTENTOBJECTIDS | - VOL_CAP_FMT_SYMBOLICLINKS | - VOL_CAP_FMT_HARDLINKS | - VOL_CAP_FMT_JOURNAL | - VOL_CAP_FMT_JOURNAL_ACTIVE | - VOL_CAP_FMT_NO_ROOT_TIMES | - VOL_CAP_FMT_SPARSE_FILES | - VOL_CAP_FMT_ZERO_RUNS | - VOL_CAP_FMT_CASE_SENSITIVE | - VOL_CAP_FMT_CASE_PRESERVING | - VOL_CAP_FMT_FAST_STATFS | - VOL_CAP_FMT_2TB_FILESIZE | - VOL_CAP_FMT_OPENDENYMODES | - VOL_CAP_FMT_HIDDEN_FILES | -#if HFS_COMPRESSION - VOL_CAP_FMT_PATH_FROM_ID | - VOL_CAP_FMT_DECMPFS_COMPRESSION; -#else - VOL_CAP_FMT_PATH_FROM_ID; -#endif - - /* - * Bits in the "valid" field tell you whether or not the on-disk - * format supports feature X. 
- */ - - cap->valid[VOL_CAPABILITIES_INTERFACES] = - VOL_CAP_INT_ATTRLIST | - VOL_CAP_INT_NFSEXPORT | - VOL_CAP_INT_READDIRATTR | - VOL_CAP_INT_COPYFILE | - VOL_CAP_INT_ALLOCATE | - VOL_CAP_INT_VOL_RENAME | - VOL_CAP_INT_ADVLOCK | - VOL_CAP_INT_FLOCK | - VOL_CAP_INT_MANLOCK | -#if NAMEDSTREAMS - VOL_CAP_INT_EXTENDED_ATTR | - VOL_CAP_INT_NAMEDSTREAMS; -#else - VOL_CAP_INT_EXTENDED_ATTR; -#endif - - /* HFS always supports exchangedata and searchfs in the on-disk format natively */ - cap->valid[VOL_CAPABILITIES_INTERFACES] |= (VOL_CAP_INT_SEARCHFS | VOL_CAP_INT_EXCHANGEDATA); - - - cap->valid[VOL_CAPABILITIES_RESERVED1] = 0; - cap->valid[VOL_CAPABILITIES_RESERVED2] = 0; - VFSATTR_SET_SUPPORTED(fsap, f_capabilities); - } - if (VFSATTR_IS_ACTIVE(fsap, f_attributes)) { - vol_attributes_attr_t *attrp = &fsap->f_attributes; - - attrp->validattr.commonattr = HFS_ATTR_CMN_VOL_VALIDMASK; - attrp->validattr.volattr = ATTR_VOL_VALIDMASK & ~ATTR_VOL_INFO; - attrp->validattr.dirattr = ATTR_DIR_VALIDMASK; - attrp->validattr.fileattr = HFS_ATTR_FILE_VALIDMASK; - attrp->validattr.forkattr = 0; - - attrp->nativeattr.commonattr = HFS_ATTR_CMN_VOL_VALIDMASK; - attrp->nativeattr.volattr = ATTR_VOL_VALIDMASK & ~ATTR_VOL_INFO; - attrp->nativeattr.dirattr = ATTR_DIR_VALIDMASK; - attrp->nativeattr.fileattr = HFS_ATTR_FILE_VALIDMASK; - attrp->nativeattr.forkattr = 0; - VFSATTR_SET_SUPPORTED(fsap, f_attributes); - } - fsap->f_create_time.tv_sec = hfsmp->hfs_itime; - fsap->f_create_time.tv_nsec = 0; - VFSATTR_SET_SUPPORTED(fsap, f_create_time); - fsap->f_modify_time.tv_sec = hfsmp->vcbLsMod; - fsap->f_modify_time.tv_nsec = 0; - VFSATTR_SET_SUPPORTED(fsap, f_modify_time); - - fsap->f_backup_time.tv_sec = hfsmp->vcbVolBkUp; - fsap->f_backup_time.tv_nsec = 0; - VFSATTR_SET_SUPPORTED(fsap, f_backup_time); - if (VFSATTR_IS_ACTIVE(fsap, f_fssubtype)) { - u_int16_t subtype = 0; - - /* - * Subtypes (flavors) for HFS - * 0: Mac OS Extended - * 1: Mac OS Extended (Journaled) - * 2: Mac OS Extended (Case 
Sensitive) - * 3: Mac OS Extended (Case Sensitive, Journaled) - * 4 - 127: Reserved - * 128: Mac OS Standard - * - */ - if ((hfsmp->hfs_flags & HFS_STANDARD) == 0) { - if (hfsmp->jnl) { - subtype |= HFS_SUBTYPE_JOURNALED; - } - if (hfsmp->hfs_flags & HFS_CASE_SENSITIVE) { - subtype |= HFS_SUBTYPE_CASESENSITIVE; - } - } -#if CONFIG_HFS_STD - else { - subtype = HFS_SUBTYPE_STANDARDHFS; - } -#endif - fsap->f_fssubtype = subtype; - VFSATTR_SET_SUPPORTED(fsap, f_fssubtype); - } - - if (VFSATTR_IS_ACTIVE(fsap, f_vol_name)) { - strlcpy(fsap->f_vol_name, (char *) hfsmp->vcbVN, MAXPATHLEN); - VFSATTR_SET_SUPPORTED(fsap, f_vol_name); - } - if (VFSATTR_IS_ACTIVE(fsap, f_uuid)) { - hfs_getvoluuid(hfsmp, fsap->f_uuid); - VFSATTR_SET_SUPPORTED(fsap, f_uuid); - } - return (0); -} - -/* - * Perform a volume rename. Requires the FS' root vp. - */ -static int -hfs_rename_volume(struct vnode *vp, const char *name, proc_t p) -{ - ExtendedVCB *vcb = VTOVCB(vp); - struct cnode *cp = VTOC(vp); - struct hfsmount *hfsmp = VTOHFS(vp); - struct cat_desc to_desc; - struct cat_desc todir_desc; - struct cat_desc new_desc; - cat_cookie_t cookie; - int lockflags; - int error = 0; - char converted_volname[256]; - size_t volname_length = 0; - size_t conv_volname_length = 0; - - - /* - * Ignore attempts to rename a volume to a zero-length name. 
- */ - if (name[0] == 0) - return(0); - - bzero(&to_desc, sizeof(to_desc)); - bzero(&todir_desc, sizeof(todir_desc)); - bzero(&new_desc, sizeof(new_desc)); - bzero(&cookie, sizeof(cookie)); - - todir_desc.cd_parentcnid = kHFSRootParentID; - todir_desc.cd_cnid = kHFSRootFolderID; - todir_desc.cd_flags = CD_ISDIR; - - to_desc.cd_nameptr = (const u_int8_t *)name; - to_desc.cd_namelen = strlen(name); - to_desc.cd_parentcnid = kHFSRootParentID; - to_desc.cd_cnid = cp->c_cnid; - to_desc.cd_flags = CD_ISDIR; - - if ((error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT)) == 0) { - if ((error = hfs_start_transaction(hfsmp)) == 0) { - if ((error = cat_preflight(hfsmp, CAT_RENAME, &cookie, p)) == 0) { - lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK); - - error = cat_rename(hfsmp, &cp->c_desc, &todir_desc, &to_desc, &new_desc); - - /* - * If successful, update the name in the VCB, ensure it's terminated. - */ - if (error == 0) { - strlcpy((char *)vcb->vcbVN, name, sizeof(vcb->vcbVN)); - - volname_length = strlen ((const char*)vcb->vcbVN); - /* Send the volume name down to CoreStorage if necessary */ - error = utf8_normalizestr(vcb->vcbVN, volname_length, (u_int8_t*)converted_volname, &conv_volname_length, 256, UTF_PRECOMPOSED); - if (error == 0) { - (void) VNOP_IOCTL (hfsmp->hfs_devvp, _DKIOCCSSETLVNAME, converted_volname, 0, vfs_context_current()); - } - error = 0; - } - - hfs_systemfile_unlock(hfsmp, lockflags); - cat_postflight(hfsmp, &cookie, p); - - if (error) - MarkVCBDirty(vcb); - (void) hfs_flushvolumeheader(hfsmp, HFS_FVH_WAIT); - } - hfs_end_transaction(hfsmp); - } - if (!error) { - /* Release old allocated name buffer */ - if (cp->c_desc.cd_flags & CD_HASBUF) { - const char *tmp_name = (const char *)cp->c_desc.cd_nameptr; - - cp->c_desc.cd_nameptr = 0; - cp->c_desc.cd_namelen = 0; - cp->c_desc.cd_flags &= ~CD_HASBUF; - vfs_removename(tmp_name); - } - /* Update cnode's catalog descriptor */ - replace_desc(cp, &new_desc); - 
vcb->volumeNameEncodingHint = new_desc.cd_encoding; - cp->c_touch_chgtime = TRUE; - } - - hfs_unlock(cp); - } - - return(error); -} - -/* - * Get file system attributes. - */ -static int -hfs_vfs_setattr(struct mount *mp, struct vfs_attr *fsap, __unused vfs_context_t context) -{ - kauth_cred_t cred = vfs_context_ucred(context); - int error = 0; - - /* - * Must be superuser or owner of filesystem to change volume attributes - */ - if (!kauth_cred_issuser(cred) && (kauth_cred_getuid(cred) != vfs_statfs(mp)->f_owner)) - return(EACCES); - - if (VFSATTR_IS_ACTIVE(fsap, f_vol_name)) { - vnode_t root_vp; - - error = hfs_vfs_root(mp, &root_vp, context); - if (error) - goto out; - - error = hfs_rename_volume(root_vp, fsap->f_vol_name, vfs_context_proc(context)); - (void) vnode_put(root_vp); - if (error) - goto out; - - VFSATTR_SET_SUPPORTED(fsap, f_vol_name); - } - -out: - return error; -} - -/* If a runtime corruption is detected, set the volume inconsistent - * bit in the volume attributes. The volume inconsistent bit is a persistent - * bit which represents that the volume is corrupt and needs repair. - * The volume inconsistent bit can be set from the kernel when it detects - * runtime corruption or from file system repair utilities like fsck_hfs when - * a repair operation fails. The bit should be cleared only from file system - * verify/repair utility like fsck_hfs when a verify/repair succeeds. 
- */ -__private_extern__ -void hfs_mark_inconsistent(struct hfsmount *hfsmp, - hfs_inconsistency_reason_t reason) -{ - hfs_lock_mount (hfsmp); - if ((hfsmp->vcbAtrb & kHFSVolumeInconsistentMask) == 0) { - hfsmp->vcbAtrb |= kHFSVolumeInconsistentMask; - MarkVCBDirty(hfsmp); - } - if ((hfsmp->hfs_flags & HFS_READ_ONLY)==0) { - switch (reason) { - case HFS_INCONSISTENCY_DETECTED: - printf("hfs_mark_inconsistent: Runtime corruption detected on %s, fsck will be forced on next mount.\n", - hfsmp->vcbVN); - break; - case HFS_ROLLBACK_FAILED: - printf("hfs_mark_inconsistent: Failed to roll back; volume `%s' might be inconsistent; fsck will be forced on next mount.\n", - hfsmp->vcbVN); - break; - case HFS_OP_INCOMPLETE: - printf("hfs_mark_inconsistent: Failed to complete operation; volume `%s' might be inconsistent; fsck will be forced on next mount.\n", - hfsmp->vcbVN); - break; - case HFS_FSCK_FORCED: - printf("hfs_mark_inconsistent: fsck requested for `%s'; fsck will be forced on next mount.\n", - hfsmp->vcbVN); - break; - } - } - hfs_unlock_mount (hfsmp); -} - -/* Replay the journal on the device node provided. Returns zero if - * journal replay succeeded or no journal was supposed to be replayed. 
- */ -static int hfs_journal_replay(vnode_t devvp, vfs_context_t context) -{ - int retval = 0; - int error = 0; - struct mount *mp = NULL; - struct hfs_mount_args *args = NULL; - - /* Replay allowed only on raw devices */ - if (!vnode_ischr(devvp) && !vnode_isblk(devvp)) { - retval = EINVAL; - goto out; - } - - /* Create dummy mount structures */ - MALLOC(mp, struct mount *, sizeof(struct mount), M_TEMP, M_WAITOK); - if (mp == NULL) { - retval = ENOMEM; - goto out; - } - bzero(mp, sizeof(struct mount)); - mount_lock_init(mp); - - MALLOC(args, struct hfs_mount_args *, sizeof(struct hfs_mount_args), M_TEMP, M_WAITOK); - if (args == NULL) { - retval = ENOMEM; - goto out; - } - bzero(args, sizeof(struct hfs_mount_args)); - - retval = hfs_mountfs(devvp, mp, args, 1, context); - buf_flushdirtyblks(devvp, TRUE, 0, "hfs_journal_replay"); - - /* FSYNC the devnode to be sure all data has been flushed */ - error = VNOP_FSYNC(devvp, MNT_WAIT, context); - if (error) { - retval = error; - } - -out: - if (mp) { - mount_lock_destroy(mp); - FREE(mp, M_TEMP); - } - if (args) { - FREE(args, M_TEMP); - } - return retval; -} - - -/* - * Cancel the syncer - */ -static void -hfs_syncer_free(struct hfsmount *hfsmp) -{ - if (hfsmp && hfsmp->hfs_syncer) { - hfs_syncer_lock(hfsmp); - - /* - * First, make sure everything else knows we don't want any more - * requests queued. - */ - thread_call_t syncer = hfsmp->hfs_syncer; - hfsmp->hfs_syncer = NULL; - - hfs_syncer_unlock(hfsmp); - - // Now deal with requests that are outstanding - if (hfsmp->hfs_sync_incomplete) { - if (thread_call_cancel(syncer)) { - // We managed to cancel the timer so we're done - hfsmp->hfs_sync_incomplete = FALSE; - } else { - // Syncer must be running right now so we have to wait - hfs_syncer_lock(hfsmp); - while (hfsmp->hfs_sync_incomplete) - hfs_syncer_wait(hfsmp); - hfs_syncer_unlock(hfsmp); - } - } - - // Now we're safe to free the syncer - thread_call_free(syncer); - } -} - -/* - * hfs vfs operations. 
- */ -struct vfsops hfs_vfsops = { - hfs_mount, - hfs_start, - hfs_unmount, - hfs_vfs_root, - hfs_quotactl, - hfs_vfs_getattr, /* was hfs_statfs */ - hfs_sync, - hfs_vfs_vget, - hfs_fhtovp, - hfs_vptofh, - hfs_init, - hfs_sysctl, - hfs_vfs_setattr, - {NULL} -}; diff --git a/bsd/hfs/hfs_vfsutils.c b/bsd/hfs/hfs_vfsutils.c deleted file mode 100644 index ade6d0ca0..000000000 --- a/bsd/hfs/hfs_vfsutils.c +++ /dev/null @@ -1,4035 +0,0 @@ -/* - * Copyright (c) 2000-2015 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* @(#)hfs_vfsutils.c 4.0 -* -* (c) 1997-2002 Apple Computer, Inc. All Rights Reserved -* -* hfs_vfsutils.c -- Routines that go between the HFS layer and the VFS. 
-* -*/ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -/* for parsing boot-args */ -#include - -#include "hfs.h" -#include "hfs_catalog.h" -#include "hfs_dbg.h" -#include "hfs_mount.h" -#include "hfs_endian.h" -#include "hfs_cnode.h" -#include "hfs_fsctl.h" -#include "hfs_cprotect.h" - -#include "hfscommon/headers/FileMgrInternal.h" -#include "hfscommon/headers/BTreesInternal.h" -#include "hfscommon/headers/HFSUnicodeWrappers.h" - -/* Enable/disable debugging code for live volume resizing, defined in hfs_resize.c */ -extern int hfs_resize_debug; - -static void ReleaseMetaFileVNode(struct vnode *vp); -static int hfs_late_journal_init(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, void *_args); - -static u_int32_t hfs_hotfile_freeblocks(struct hfsmount *); -static void hfs_thaw_locked(struct hfsmount *hfsmp); - -#define HFS_MOUNT_DEBUG 1 - - -//******************************************************************************* -// Note: Finder information in the HFS/HFS+ metadata are considered opaque and -// hence are not in the right byte order on little endian machines. It is -// the responsibility of the finder and other clients to swap the data. 
-//******************************************************************************* - -//******************************************************************************* -// Routine: hfs_MountHFSVolume -// -// -//******************************************************************************* -unsigned char hfs_catname[] = "Catalog B-tree"; -unsigned char hfs_extname[] = "Extents B-tree"; -unsigned char hfs_vbmname[] = "Volume Bitmap"; -unsigned char hfs_attrname[] = "Attribute B-tree"; -unsigned char hfs_startupname[] = "Startup File"; - -#if CONFIG_HFS_STD -OSErr hfs_MountHFSVolume(struct hfsmount *hfsmp, HFSMasterDirectoryBlock *mdb, - __unused struct proc *p) -{ - ExtendedVCB *vcb = HFSTOVCB(hfsmp); - int error; - ByteCount utf8chars; - struct cat_desc cndesc; - struct cat_attr cnattr; - struct cat_fork fork; - int newvnode_flags = 0; - - /* Block size must be a multiple of 512 */ - if (SWAP_BE32(mdb->drAlBlkSiz) == 0 || - (SWAP_BE32(mdb->drAlBlkSiz) & 0x01FF) != 0) - return (EINVAL); - - /* don't mount a writeable volume if its dirty, it must be cleaned by fsck_hfs */ - if (((hfsmp->hfs_flags & HFS_READ_ONLY) == 0) && - ((SWAP_BE16(mdb->drAtrb) & kHFSVolumeUnmountedMask) == 0)) { - return (EINVAL); - } - hfsmp->hfs_flags |= HFS_STANDARD; - /* - * The MDB seems OK: transfer info from it into VCB - * Note - the VCB starts out clear (all zeros) - * - */ - vcb->vcbSigWord = SWAP_BE16 (mdb->drSigWord); - vcb->hfs_itime = to_bsd_time(LocalToUTC(SWAP_BE32(mdb->drCrDate))); - vcb->localCreateDate = SWAP_BE32 (mdb->drCrDate); - vcb->vcbLsMod = to_bsd_time(LocalToUTC(SWAP_BE32(mdb->drLsMod))); - vcb->vcbAtrb = SWAP_BE16 (mdb->drAtrb); - vcb->vcbNmFls = SWAP_BE16 (mdb->drNmFls); - vcb->vcbVBMSt = SWAP_BE16 (mdb->drVBMSt); - vcb->nextAllocation = SWAP_BE16 (mdb->drAllocPtr); - vcb->totalBlocks = SWAP_BE16 (mdb->drNmAlBlks); - vcb->allocLimit = vcb->totalBlocks; - vcb->blockSize = SWAP_BE32 (mdb->drAlBlkSiz); - vcb->vcbClpSiz = SWAP_BE32 (mdb->drClpSiz); - vcb->vcbAlBlSt = 
SWAP_BE16 (mdb->drAlBlSt); - vcb->vcbNxtCNID = SWAP_BE32 (mdb->drNxtCNID); - vcb->freeBlocks = SWAP_BE16 (mdb->drFreeBks); - vcb->vcbVolBkUp = to_bsd_time(LocalToUTC(SWAP_BE32(mdb->drVolBkUp))); - vcb->vcbWrCnt = SWAP_BE32 (mdb->drWrCnt); - vcb->vcbNmRtDirs = SWAP_BE16 (mdb->drNmRtDirs); - vcb->vcbFilCnt = SWAP_BE32 (mdb->drFilCnt); - vcb->vcbDirCnt = SWAP_BE32 (mdb->drDirCnt); - bcopy(mdb->drFndrInfo, vcb->vcbFndrInfo, sizeof(vcb->vcbFndrInfo)); - if ((hfsmp->hfs_flags & HFS_READ_ONLY) == 0) - vcb->vcbWrCnt++; /* Compensate for write of MDB on last flush */ - - /* convert hfs encoded name into UTF-8 string */ - error = hfs_to_utf8(vcb, mdb->drVN, NAME_MAX, &utf8chars, vcb->vcbVN); - /* - * When an HFS name cannot be encoded with the current - * volume encoding we use MacRoman as a fallback. - */ - if (error || (utf8chars == 0)) { - error = mac_roman_to_utf8(mdb->drVN, NAME_MAX, &utf8chars, vcb->vcbVN); - /* If we fail to encode to UTF8 from Mac Roman, the name is bad. Deny the mount */ - if (error) { - goto MtVolErr; - } - } - - hfsmp->hfs_logBlockSize = BestBlockSizeFit(vcb->blockSize, MAXBSIZE, hfsmp->hfs_logical_block_size); - vcb->vcbVBMIOSize = kHFSBlockSize; - - /* Generate the partition-based AVH location */ - hfsmp->hfs_partition_avh_sector = HFS_ALT_SECTOR(hfsmp->hfs_logical_block_size, - hfsmp->hfs_logical_block_count); - - /* HFS standard is read-only, so just stuff the FS location in here, too */ - hfsmp->hfs_fs_avh_sector = hfsmp->hfs_partition_avh_sector; - - bzero(&cndesc, sizeof(cndesc)); - cndesc.cd_parentcnid = kHFSRootParentID; - cndesc.cd_flags |= CD_ISMETA; - bzero(&cnattr, sizeof(cnattr)); - cnattr.ca_linkcount = 1; - cnattr.ca_mode = S_IFREG; - bzero(&fork, sizeof(fork)); - - /* - * Set up Extents B-tree vnode - */ - cndesc.cd_nameptr = hfs_extname; - cndesc.cd_namelen = strlen((char *)hfs_extname); - cndesc.cd_cnid = cnattr.ca_fileid = kHFSExtentsFileID; - fork.cf_size = SWAP_BE32(mdb->drXTFlSize); - fork.cf_blocks = fork.cf_size / 
vcb->blockSize; - fork.cf_clump = SWAP_BE32(mdb->drXTClpSiz); - fork.cf_vblocks = 0; - fork.cf_extents[0].startBlock = SWAP_BE16(mdb->drXTExtRec[0].startBlock); - fork.cf_extents[0].blockCount = SWAP_BE16(mdb->drXTExtRec[0].blockCount); - fork.cf_extents[1].startBlock = SWAP_BE16(mdb->drXTExtRec[1].startBlock); - fork.cf_extents[1].blockCount = SWAP_BE16(mdb->drXTExtRec[1].blockCount); - fork.cf_extents[2].startBlock = SWAP_BE16(mdb->drXTExtRec[2].startBlock); - fork.cf_extents[2].blockCount = SWAP_BE16(mdb->drXTExtRec[2].blockCount); - cnattr.ca_blocks = fork.cf_blocks; - - error = hfs_getnewvnode(hfsmp, NULL, NULL, &cndesc, 0, &cnattr, &fork, - &hfsmp->hfs_extents_vp, &newvnode_flags); - if (error) { - if (HFS_MOUNT_DEBUG) { - printf("hfs_mounthfs (std): error creating Ext Vnode (%d) \n", error); - } - goto MtVolErr; - } - error = MacToVFSError(BTOpenPath(VTOF(hfsmp->hfs_extents_vp), - (KeyCompareProcPtr)CompareExtentKeys)); - if (error) { - if (HFS_MOUNT_DEBUG) { - printf("hfs_mounthfs (std): error opening Ext Vnode (%d) \n", error); - } - hfs_unlock(VTOC(hfsmp->hfs_extents_vp)); - goto MtVolErr; - } - hfsmp->hfs_extents_cp = VTOC(hfsmp->hfs_extents_vp); - - /* - * Set up Catalog B-tree vnode... 
- */ - cndesc.cd_nameptr = hfs_catname; - cndesc.cd_namelen = strlen((char *)hfs_catname); - cndesc.cd_cnid = cnattr.ca_fileid = kHFSCatalogFileID; - fork.cf_size = SWAP_BE32(mdb->drCTFlSize); - fork.cf_blocks = fork.cf_size / vcb->blockSize; - fork.cf_clump = SWAP_BE32(mdb->drCTClpSiz); - fork.cf_vblocks = 0; - fork.cf_extents[0].startBlock = SWAP_BE16(mdb->drCTExtRec[0].startBlock); - fork.cf_extents[0].blockCount = SWAP_BE16(mdb->drCTExtRec[0].blockCount); - fork.cf_extents[1].startBlock = SWAP_BE16(mdb->drCTExtRec[1].startBlock); - fork.cf_extents[1].blockCount = SWAP_BE16(mdb->drCTExtRec[1].blockCount); - fork.cf_extents[2].startBlock = SWAP_BE16(mdb->drCTExtRec[2].startBlock); - fork.cf_extents[2].blockCount = SWAP_BE16(mdb->drCTExtRec[2].blockCount); - cnattr.ca_blocks = fork.cf_blocks; - - error = hfs_getnewvnode(hfsmp, NULL, NULL, &cndesc, 0, &cnattr, &fork, - &hfsmp->hfs_catalog_vp, &newvnode_flags); - if (error) { - if (HFS_MOUNT_DEBUG) { - printf("hfs_mounthfs (std): error creating catalog Vnode (%d) \n", error); - } - hfs_unlock(VTOC(hfsmp->hfs_extents_vp)); - goto MtVolErr; - } - error = MacToVFSError(BTOpenPath(VTOF(hfsmp->hfs_catalog_vp), - (KeyCompareProcPtr)CompareCatalogKeys)); - if (error) { - if (HFS_MOUNT_DEBUG) { - printf("hfs_mounthfs (std): error opening catalog Vnode (%d) \n", error); - } - hfs_unlock(VTOC(hfsmp->hfs_catalog_vp)); - hfs_unlock(VTOC(hfsmp->hfs_extents_vp)); - goto MtVolErr; - } - hfsmp->hfs_catalog_cp = VTOC(hfsmp->hfs_catalog_vp); - - /* - * Set up dummy Allocation file vnode (used only for locking bitmap) - */ - cndesc.cd_nameptr = hfs_vbmname; - cndesc.cd_namelen = strlen((char *)hfs_vbmname); - cndesc.cd_cnid = cnattr.ca_fileid = kHFSAllocationFileID; - bzero(&fork, sizeof(fork)); - cnattr.ca_blocks = 0; - - error = hfs_getnewvnode(hfsmp, NULL, NULL, &cndesc, 0, &cnattr, &fork, - &hfsmp->hfs_allocation_vp, &newvnode_flags); - if (error) { - if (HFS_MOUNT_DEBUG) { - printf("hfs_mounthfs (std): error creating bitmap Vnode 
(%d) \n", error); - } - hfs_unlock(VTOC(hfsmp->hfs_catalog_vp)); - hfs_unlock(VTOC(hfsmp->hfs_extents_vp)); - goto MtVolErr; - } - hfsmp->hfs_allocation_cp = VTOC(hfsmp->hfs_allocation_vp); - - /* mark the volume dirty (clear clean unmount bit) */ - vcb->vcbAtrb &= ~kHFSVolumeUnmountedMask; - - if (error == noErr) { - error = cat_idlookup(hfsmp, kHFSRootFolderID, 0, 0, NULL, NULL, NULL); - if (HFS_MOUNT_DEBUG) { - printf("hfs_mounthfs (std): error looking up root folder (%d) \n", error); - } - } - - if (error == noErr) { - /* If the disk isn't write protected.. */ - if ( !(vcb->vcbAtrb & kHFSVolumeHardwareLockMask)) { - MarkVCBDirty (vcb); // mark VCB dirty so it will be written - } - } - - /* - * all done with system files so we can unlock now... - */ - hfs_unlock(VTOC(hfsmp->hfs_allocation_vp)); - hfs_unlock(VTOC(hfsmp->hfs_catalog_vp)); - hfs_unlock(VTOC(hfsmp->hfs_extents_vp)); - - if (error == noErr) { - /* If successful, then we can just return once we've unlocked the cnodes */ - return error; - } - - //-- Release any resources allocated so far before exiting with an error: -MtVolErr: - hfsUnmount(hfsmp, NULL); - - return (error); -} - -#endif - -//******************************************************************************* -// -// Sanity check Volume Header Block: -// Input argument *vhp is a pointer to a HFSPlusVolumeHeader block that has -// not been endian-swapped and represents the on-disk contents of this sector. -// This routine will not change the endianness of vhp block. 
-// -//******************************************************************************* -OSErr hfs_ValidateHFSPlusVolumeHeader(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp) -{ - u_int16_t signature; - u_int16_t hfs_version; - u_int32_t blockSize; - - signature = SWAP_BE16(vhp->signature); - hfs_version = SWAP_BE16(vhp->version); - - if (signature == kHFSPlusSigWord) { - if (hfs_version != kHFSPlusVersion) { - printf("hfs_ValidateHFSPlusVolumeHeader: invalid HFS+ version: %x\n", hfs_version); - return (EINVAL); - } - } else if (signature == kHFSXSigWord) { - if (hfs_version != kHFSXVersion) { - printf("hfs_ValidateHFSPlusVolumeHeader: invalid HFSX version: %x\n", hfs_version); - return (EINVAL); - } - } else { - /* Removed printf for invalid HFS+ signature because it gives - * false error for UFS root volume - */ - if (HFS_MOUNT_DEBUG) { - printf("hfs_ValidateHFSPlusVolumeHeader: unknown Volume Signature : %x\n", signature); - } - return (EINVAL); - } - - /* Block size must be at least 512 and a power of 2 */ - blockSize = SWAP_BE32(vhp->blockSize); - if (blockSize < 512 || !powerof2(blockSize)) { - if (HFS_MOUNT_DEBUG) { - printf("hfs_ValidateHFSPlusVolumeHeader: invalid blocksize (%d) \n", blockSize); - } - return (EINVAL); - } - - if (blockSize < hfsmp->hfs_logical_block_size) { - if (HFS_MOUNT_DEBUG) { - printf("hfs_ValidateHFSPlusVolumeHeader: invalid physical blocksize (%d), hfs_logical_blocksize (%d) \n", - blockSize, hfsmp->hfs_logical_block_size); - } - return (EINVAL); - } - return 0; -} - -//******************************************************************************* -// Routine: hfs_MountHFSPlusVolume -// -// -//******************************************************************************* - -OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, - off_t embeddedOffset, u_int64_t disksize, __unused struct proc *p, void *args, kauth_cred_t cred) -{ - register ExtendedVCB *vcb; - struct cat_desc cndesc; - struct cat_attr 
cnattr; - struct cat_fork cfork; - u_int32_t blockSize; - daddr64_t spare_sectors; - struct BTreeInfoRec btinfo; - u_int16_t signature; - u_int16_t hfs_version; - int newvnode_flags = 0; - int i; - OSErr retval; - char converted_volname[256]; - size_t volname_length = 0; - size_t conv_volname_length = 0; - - signature = SWAP_BE16(vhp->signature); - hfs_version = SWAP_BE16(vhp->version); - - retval = hfs_ValidateHFSPlusVolumeHeader(hfsmp, vhp); - if (retval) - return retval; - - if (signature == kHFSXSigWord) { - /* The in-memory signature is always 'H+'. */ - signature = kHFSPlusSigWord; - hfsmp->hfs_flags |= HFS_X; - } - - blockSize = SWAP_BE32(vhp->blockSize); - /* don't mount a writable volume if its dirty, it must be cleaned by fsck_hfs */ - if ((hfsmp->hfs_flags & HFS_READ_ONLY) == 0 && hfsmp->jnl == NULL && - (SWAP_BE32(vhp->attributes) & kHFSVolumeUnmountedMask) == 0) { - if (HFS_MOUNT_DEBUG) { - printf("hfs_mounthfsplus: cannot mount dirty non-journaled volumes\n"); - } - return (EINVAL); - } - - /* Make sure we can live with the physical block size. */ - if ((disksize & (hfsmp->hfs_logical_block_size - 1)) || - (embeddedOffset & (hfsmp->hfs_logical_block_size - 1))) { - if (HFS_MOUNT_DEBUG) { - printf("hfs_mounthfsplus: hfs_logical_blocksize (%d) \n", - hfsmp->hfs_logical_block_size); - } - return (ENXIO); - } - - /* - * If allocation block size is less than the physical block size, - * same data could be cached in two places and leads to corruption. - * - * HFS Plus reserves one allocation block for the Volume Header. - * If the physical size is larger, then when we read the volume header, - * we will also end up reading in the next allocation block(s). - * If those other allocation block(s) is/are modified, and then the volume - * header is modified, the write of the volume header's buffer will write - * out the old contents of the other allocation blocks. - * - * We assume that the physical block size is same as logical block size. 
- * The physical block size value is used to round down the offsets for - * reading and writing the primary and alternate volume headers. - * - * The same logic to ensure good hfs_physical_block_size is also in - * hfs_mountfs so that hfs_mountfs, hfs_MountHFSPlusVolume and - * later are doing the I/Os using same block size. - */ - if (blockSize < hfsmp->hfs_physical_block_size) { - hfsmp->hfs_physical_block_size = hfsmp->hfs_logical_block_size; - hfsmp->hfs_log_per_phys = 1; - } - - /* - * The VolumeHeader seems OK: transfer info from it into VCB - * Note - the VCB starts out clear (all zeros) - */ - vcb = HFSTOVCB(hfsmp); - - vcb->vcbSigWord = signature; - vcb->vcbJinfoBlock = SWAP_BE32(vhp->journalInfoBlock); - vcb->vcbLsMod = to_bsd_time(SWAP_BE32(vhp->modifyDate)); - vcb->vcbAtrb = SWAP_BE32(vhp->attributes); - vcb->vcbClpSiz = SWAP_BE32(vhp->rsrcClumpSize); - vcb->vcbNxtCNID = SWAP_BE32(vhp->nextCatalogID); - vcb->vcbVolBkUp = to_bsd_time(SWAP_BE32(vhp->backupDate)); - vcb->vcbWrCnt = SWAP_BE32(vhp->writeCount); - vcb->vcbFilCnt = SWAP_BE32(vhp->fileCount); - vcb->vcbDirCnt = SWAP_BE32(vhp->folderCount); - - /* copy 32 bytes of Finder info */ - bcopy(vhp->finderInfo, vcb->vcbFndrInfo, sizeof(vhp->finderInfo)); - - vcb->vcbAlBlSt = 0; /* hfs+ allocation blocks start at first block of volume */ - if ((hfsmp->hfs_flags & HFS_READ_ONLY) == 0) - vcb->vcbWrCnt++; /* compensate for write of Volume Header on last flush */ - - /* Now fill in the Extended VCB info */ - vcb->nextAllocation = SWAP_BE32(vhp->nextAllocation); - vcb->totalBlocks = SWAP_BE32(vhp->totalBlocks); - vcb->allocLimit = vcb->totalBlocks; - vcb->freeBlocks = SWAP_BE32(vhp->freeBlocks); - vcb->blockSize = blockSize; - vcb->encodingsBitmap = SWAP_BE64(vhp->encodingsBitmap); - vcb->localCreateDate = SWAP_BE32(vhp->createDate); - - vcb->hfsPlusIOPosOffset = embeddedOffset; - - /* Default to no free block reserve */ - vcb->reserveBlocks = 0; - - /* - * Update the logical block size in the mount struct - 
* (currently set up from the wrapper MDB) using the - * new blocksize value: - */ - hfsmp->hfs_logBlockSize = BestBlockSizeFit(vcb->blockSize, MAXBSIZE, hfsmp->hfs_logical_block_size); - vcb->vcbVBMIOSize = min(vcb->blockSize, MAXPHYSIO); - - /* - * Validate and initialize the location of the alternate volume header. - * - * Note that there may be spare sectors beyond the end of the filesystem that still - * belong to our partition. - */ - - spare_sectors = hfsmp->hfs_logical_block_count - - (((daddr64_t)vcb->totalBlocks * blockSize) / - hfsmp->hfs_logical_block_size); - - /* - * Differentiate between "innocuous" spare sectors and the more unusual - * degenerate case: - * - * *** Innocuous spare sectors exist if: - * - * A) the number of bytes assigned to the partition (by multiplying logical - * block size * logical block count) is greater than the filesystem size - * (by multiplying allocation block count and allocation block size) - * - * and - * - * B) the remainder is less than the size of a full allocation block's worth of bytes. - * - * This handles the normal case where there may be a few extra sectors, but the two - * are fundamentally in sync. - * - * *** Degenerate spare sectors exist if: - * A) The number of bytes assigned to the partition (by multiplying logical - * block size * logical block count) is greater than the filesystem size - * (by multiplying allocation block count and block size). - * - * and - * - * B) the remainder is greater than a full allocation's block worth of bytes. - * In this case, a smaller file system exists in a larger partition. - * This can happen in various ways, including when volume is resized but the - * partition is yet to be resized. Under this condition, we have to assume that - * a partition management software may resize the partition to match - * the file system size in the future. Therefore we should update - * alternate volume header at two locations on the disk, - * a. 
1024 bytes before end of the partition - * b. 1024 bytes before end of the file system - */ - - if (spare_sectors > (daddr64_t)(blockSize / hfsmp->hfs_logical_block_size)) { - /* - * Handle the degenerate case above. FS < partition size. - * AVH located at 1024 bytes from the end of the partition - */ - hfsmp->hfs_partition_avh_sector = (hfsmp->hfsPlusIOPosOffset / hfsmp->hfs_logical_block_size) + - HFS_ALT_SECTOR(hfsmp->hfs_logical_block_size, hfsmp->hfs_logical_block_count); - - /* AVH located at 1024 bytes from the end of the filesystem */ - hfsmp->hfs_fs_avh_sector = (hfsmp->hfsPlusIOPosOffset / hfsmp->hfs_logical_block_size) + - HFS_ALT_SECTOR(hfsmp->hfs_logical_block_size, - (((daddr64_t)vcb->totalBlocks * blockSize) / hfsmp->hfs_logical_block_size)); - } - else { - /* Innocuous spare sectors; Partition & FS notion are in sync */ - hfsmp->hfs_partition_avh_sector = (hfsmp->hfsPlusIOPosOffset / hfsmp->hfs_logical_block_size) + - HFS_ALT_SECTOR(hfsmp->hfs_logical_block_size, hfsmp->hfs_logical_block_count); - - hfsmp->hfs_fs_avh_sector = hfsmp->hfs_partition_avh_sector; - } - if (hfs_resize_debug) { - printf ("hfs_MountHFSPlusVolume: partition_avh_sector=%qu, fs_avh_sector=%qu\n", - hfsmp->hfs_partition_avh_sector, hfsmp->hfs_fs_avh_sector); - } - - bzero(&cndesc, sizeof(cndesc)); - cndesc.cd_parentcnid = kHFSRootParentID; - cndesc.cd_flags |= CD_ISMETA; - bzero(&cnattr, sizeof(cnattr)); - cnattr.ca_linkcount = 1; - cnattr.ca_mode = S_IFREG; - - /* - * Set up Extents B-tree vnode - */ - cndesc.cd_nameptr = hfs_extname; - cndesc.cd_namelen = strlen((char *)hfs_extname); - cndesc.cd_cnid = cnattr.ca_fileid = kHFSExtentsFileID; - - cfork.cf_size = SWAP_BE64 (vhp->extentsFile.logicalSize); - cfork.cf_new_size= 0; - cfork.cf_clump = SWAP_BE32 (vhp->extentsFile.clumpSize); - cfork.cf_blocks = SWAP_BE32 (vhp->extentsFile.totalBlocks); - cfork.cf_vblocks = 0; - cnattr.ca_blocks = cfork.cf_blocks; - for (i = 0; i < kHFSPlusExtentDensity; i++) { - 
cfork.cf_extents[i].startBlock = - SWAP_BE32 (vhp->extentsFile.extents[i].startBlock); - cfork.cf_extents[i].blockCount = - SWAP_BE32 (vhp->extentsFile.extents[i].blockCount); - } - retval = hfs_getnewvnode(hfsmp, NULL, NULL, &cndesc, 0, &cnattr, &cfork, - &hfsmp->hfs_extents_vp, &newvnode_flags); - if (retval) - { - if (HFS_MOUNT_DEBUG) { - printf("hfs_mounthfsplus: hfs_getnewvnode returned (%d) getting extentoverflow BT\n", retval); - } - goto ErrorExit; - } - - hfsmp->hfs_extents_cp = VTOC(hfsmp->hfs_extents_vp); - hfs_unlock(hfsmp->hfs_extents_cp); - - retval = MacToVFSError(BTOpenPath(VTOF(hfsmp->hfs_extents_vp), - (KeyCompareProcPtr) CompareExtentKeysPlus)); - if (retval) - { - if (HFS_MOUNT_DEBUG) { - printf("hfs_mounthfsplus: BTOpenPath returned (%d) getting extentoverflow BT\n", retval); - } - goto ErrorExit; - } - /* - * Set up Catalog B-tree vnode - */ - cndesc.cd_nameptr = hfs_catname; - cndesc.cd_namelen = strlen((char *)hfs_catname); - cndesc.cd_cnid = cnattr.ca_fileid = kHFSCatalogFileID; - - cfork.cf_size = SWAP_BE64 (vhp->catalogFile.logicalSize); - cfork.cf_clump = SWAP_BE32 (vhp->catalogFile.clumpSize); - cfork.cf_blocks = SWAP_BE32 (vhp->catalogFile.totalBlocks); - cfork.cf_vblocks = 0; - cnattr.ca_blocks = cfork.cf_blocks; - for (i = 0; i < kHFSPlusExtentDensity; i++) { - cfork.cf_extents[i].startBlock = - SWAP_BE32 (vhp->catalogFile.extents[i].startBlock); - cfork.cf_extents[i].blockCount = - SWAP_BE32 (vhp->catalogFile.extents[i].blockCount); - } - retval = hfs_getnewvnode(hfsmp, NULL, NULL, &cndesc, 0, &cnattr, &cfork, - &hfsmp->hfs_catalog_vp, &newvnode_flags); - if (retval) { - if (HFS_MOUNT_DEBUG) { - printf("hfs_mounthfsplus: hfs_getnewvnode returned (%d) getting catalog BT\n", retval); - } - goto ErrorExit; - } - hfsmp->hfs_catalog_cp = VTOC(hfsmp->hfs_catalog_vp); - hfs_unlock(hfsmp->hfs_catalog_cp); - - retval = MacToVFSError(BTOpenPath(VTOF(hfsmp->hfs_catalog_vp), - (KeyCompareProcPtr) CompareExtendedCatalogKeys)); - if (retval) { - 
if (HFS_MOUNT_DEBUG) { - printf("hfs_mounthfsplus: BTOpenPath returned (%d) getting catalog BT\n", retval); - } - goto ErrorExit; - } - if ((hfsmp->hfs_flags & HFS_X) && - BTGetInformation(VTOF(hfsmp->hfs_catalog_vp), 0, &btinfo) == 0) { - if (btinfo.keyCompareType == kHFSBinaryCompare) { - hfsmp->hfs_flags |= HFS_CASE_SENSITIVE; - /* Install a case-sensitive key compare */ - (void) BTOpenPath(VTOF(hfsmp->hfs_catalog_vp), - (KeyCompareProcPtr)cat_binarykeycompare); - } - } - - /* - * Set up Allocation file vnode - */ - cndesc.cd_nameptr = hfs_vbmname; - cndesc.cd_namelen = strlen((char *)hfs_vbmname); - cndesc.cd_cnid = cnattr.ca_fileid = kHFSAllocationFileID; - - cfork.cf_size = SWAP_BE64 (vhp->allocationFile.logicalSize); - cfork.cf_clump = SWAP_BE32 (vhp->allocationFile.clumpSize); - cfork.cf_blocks = SWAP_BE32 (vhp->allocationFile.totalBlocks); - cfork.cf_vblocks = 0; - cnattr.ca_blocks = cfork.cf_blocks; - for (i = 0; i < kHFSPlusExtentDensity; i++) { - cfork.cf_extents[i].startBlock = - SWAP_BE32 (vhp->allocationFile.extents[i].startBlock); - cfork.cf_extents[i].blockCount = - SWAP_BE32 (vhp->allocationFile.extents[i].blockCount); - } - retval = hfs_getnewvnode(hfsmp, NULL, NULL, &cndesc, 0, &cnattr, &cfork, - &hfsmp->hfs_allocation_vp, &newvnode_flags); - if (retval) { - if (HFS_MOUNT_DEBUG) { - printf("hfs_mounthfsplus: hfs_getnewvnode returned (%d) getting bitmap\n", retval); - } - goto ErrorExit; - } - hfsmp->hfs_allocation_cp = VTOC(hfsmp->hfs_allocation_vp); - hfs_unlock(hfsmp->hfs_allocation_cp); - - /* - * Set up Attribute B-tree vnode - */ - if (vhp->attributesFile.totalBlocks != 0) { - cndesc.cd_nameptr = hfs_attrname; - cndesc.cd_namelen = strlen((char *)hfs_attrname); - cndesc.cd_cnid = cnattr.ca_fileid = kHFSAttributesFileID; - - cfork.cf_size = SWAP_BE64 (vhp->attributesFile.logicalSize); - cfork.cf_clump = SWAP_BE32 (vhp->attributesFile.clumpSize); - cfork.cf_blocks = SWAP_BE32 (vhp->attributesFile.totalBlocks); - cfork.cf_vblocks = 0; - 
cnattr.ca_blocks = cfork.cf_blocks; - for (i = 0; i < kHFSPlusExtentDensity; i++) { - cfork.cf_extents[i].startBlock = - SWAP_BE32 (vhp->attributesFile.extents[i].startBlock); - cfork.cf_extents[i].blockCount = - SWAP_BE32 (vhp->attributesFile.extents[i].blockCount); - } - retval = hfs_getnewvnode(hfsmp, NULL, NULL, &cndesc, 0, &cnattr, &cfork, - &hfsmp->hfs_attribute_vp, &newvnode_flags); - if (retval) { - if (HFS_MOUNT_DEBUG) { - printf("hfs_mounthfsplus: hfs_getnewvnode returned (%d) getting EA BT\n", retval); - } - goto ErrorExit; - } - hfsmp->hfs_attribute_cp = VTOC(hfsmp->hfs_attribute_vp); - hfs_unlock(hfsmp->hfs_attribute_cp); - retval = MacToVFSError(BTOpenPath(VTOF(hfsmp->hfs_attribute_vp), - (KeyCompareProcPtr) hfs_attrkeycompare)); - if (retval) { - if (HFS_MOUNT_DEBUG) { - printf("hfs_mounthfsplus: BTOpenPath returned (%d) getting EA BT\n", retval); - } - goto ErrorExit; - } - - /* Initialize vnode for virtual attribute data file that spans the - * entire file system space for performing I/O to attribute btree - * We hold iocount on the attrdata vnode for the entire duration - * of mount (similar to btree vnodes) - */ - retval = init_attrdata_vnode(hfsmp); - if (retval) { - if (HFS_MOUNT_DEBUG) { - printf("hfs_mounthfsplus: init_attrdata_vnode returned (%d) for virtual EA file\n", retval); - } - goto ErrorExit; - } - } - - /* - * Set up Startup file vnode - */ - if (vhp->startupFile.totalBlocks != 0) { - cndesc.cd_nameptr = hfs_startupname; - cndesc.cd_namelen = strlen((char *)hfs_startupname); - cndesc.cd_cnid = cnattr.ca_fileid = kHFSStartupFileID; - - cfork.cf_size = SWAP_BE64 (vhp->startupFile.logicalSize); - cfork.cf_clump = SWAP_BE32 (vhp->startupFile.clumpSize); - cfork.cf_blocks = SWAP_BE32 (vhp->startupFile.totalBlocks); - cfork.cf_vblocks = 0; - cnattr.ca_blocks = cfork.cf_blocks; - for (i = 0; i < kHFSPlusExtentDensity; i++) { - cfork.cf_extents[i].startBlock = - SWAP_BE32 (vhp->startupFile.extents[i].startBlock); - 
cfork.cf_extents[i].blockCount = - SWAP_BE32 (vhp->startupFile.extents[i].blockCount); - } - retval = hfs_getnewvnode(hfsmp, NULL, NULL, &cndesc, 0, &cnattr, &cfork, - &hfsmp->hfs_startup_vp, &newvnode_flags); - if (retval) { - if (HFS_MOUNT_DEBUG) { - printf("hfs_mounthfsplus: hfs_getnewvnode returned (%d) getting startup file\n", retval); - } - goto ErrorExit; - } - hfsmp->hfs_startup_cp = VTOC(hfsmp->hfs_startup_vp); - hfs_unlock(hfsmp->hfs_startup_cp); - } - - /* - * Pick up volume name and create date - * - * Acquiring the volume name should not manipulate the bitmap, only the catalog - * btree and possibly the extents overflow b-tree. - */ - retval = cat_idlookup(hfsmp, kHFSRootFolderID, 0, 0, &cndesc, &cnattr, NULL); - if (retval) { - if (HFS_MOUNT_DEBUG) { - printf("hfs_mounthfsplus: cat_idlookup returned (%d) getting rootfolder \n", retval); - } - goto ErrorExit; - } - vcb->hfs_itime = cnattr.ca_itime; - vcb->volumeNameEncodingHint = cndesc.cd_encoding; - bcopy(cndesc.cd_nameptr, vcb->vcbVN, min(255, cndesc.cd_namelen)); - volname_length = strlen ((const char*)vcb->vcbVN); - cat_releasedesc(&cndesc); - - /* Send the volume name down to CoreStorage if necessary */ - retval = utf8_normalizestr(vcb->vcbVN, volname_length, (u_int8_t*)converted_volname, &conv_volname_length, 256, UTF_PRECOMPOSED); - if (retval == 0) { - (void) VNOP_IOCTL (hfsmp->hfs_devvp, _DKIOCCSSETLVNAME, converted_volname, 0, vfs_context_current()); - } - - /* reset retval == 0. we don't care about errors in volname conversion */ - retval = 0; - - - /* - * We now always initiate a full bitmap scan even if the volume is read-only because this is - * our only shot to do I/Os of dramaticallly different sizes than what the buffer cache ordinarily - * expects. TRIMs will not be delivered to the underlying media if the volume is not - * read-write though. 
- */ - thread_t allocator_scanner; - hfsmp->scan_var = 0; - - /* Take the HFS mount mutex and wait on scan_var */ - hfs_lock_mount (hfsmp); - - kernel_thread_start ((thread_continue_t) hfs_scan_blocks, hfsmp, &allocator_scanner); - /* Wait until it registers that it's got the appropriate locks (or that it is finished) */ - while ((hfsmp->scan_var & (HFS_ALLOCATOR_SCAN_INFLIGHT|HFS_ALLOCATOR_SCAN_COMPLETED)) == 0) { - msleep (&hfsmp->scan_var, &hfsmp->hfs_mutex, PINOD, "hfs_scan_blocks", 0); - } - - hfs_unlock_mount(hfsmp); - - thread_deallocate (allocator_scanner); - - /* mark the volume dirty (clear clean unmount bit) */ - vcb->vcbAtrb &= ~kHFSVolumeUnmountedMask; - if (hfsmp->jnl && (hfsmp->hfs_flags & HFS_READ_ONLY) == 0) { - hfs_flushvolumeheader(hfsmp, HFS_FVH_WAIT); - } - - /* kHFSHasFolderCount is only supported/updated on HFSX volumes */ - if ((hfsmp->hfs_flags & HFS_X) != 0) { - hfsmp->hfs_flags |= HFS_FOLDERCOUNT; - } - - // - // Check if we need to do late journal initialization. This only - // happens if a previous version of MacOS X (or 9) touched the disk. - // In that case hfs_late_journal_init() will go re-locate the journal - // and journal_info_block files and validate that they're still kosher. - // - if ( (vcb->vcbAtrb & kHFSVolumeJournaledMask) - && (SWAP_BE32(vhp->lastMountedVersion) != kHFSJMountVersion) - && (hfsmp->jnl == NULL)) { - - retval = hfs_late_journal_init(hfsmp, vhp, args); - if (retval != 0) { - if (retval == EROFS) { - // EROFS is a special error code that means the volume has an external - // journal which we couldn't find. in that case we do not want to - // rewrite the volume header - we'll just refuse to mount the volume. - if (HFS_MOUNT_DEBUG) { - printf("hfs_mounthfsplus: hfs_late_journal_init returned (%d), maybe an external jnl?\n", retval); - } - retval = EINVAL; - goto ErrorExit; - } - - hfsmp->jnl = NULL; - - // if the journal failed to open, then set the lastMountedVersion - // to be "FSK!" 
which fsck_hfs will see and force the fsck instead - // of just bailing out because the volume is journaled. - if (!(hfsmp->hfs_flags & HFS_READ_ONLY)) { - HFSPlusVolumeHeader *jvhp; - daddr64_t mdb_offset; - struct buf *bp = NULL; - - hfsmp->hfs_flags |= HFS_NEED_JNL_RESET; - - mdb_offset = (daddr64_t)((embeddedOffset / blockSize) + HFS_PRI_SECTOR(blockSize)); - - bp = NULL; - retval = (int)buf_meta_bread(hfsmp->hfs_devvp, - HFS_PHYSBLK_ROUNDDOWN(mdb_offset, hfsmp->hfs_log_per_phys), - hfsmp->hfs_physical_block_size, cred, &bp); - if (retval == 0) { - jvhp = (HFSPlusVolumeHeader *)(buf_dataptr(bp) + HFS_PRI_OFFSET(hfsmp->hfs_physical_block_size)); - - if (SWAP_BE16(jvhp->signature) == kHFSPlusSigWord || SWAP_BE16(jvhp->signature) == kHFSXSigWord) { - printf ("hfs(3): Journal replay fail. Writing lastMountVersion as FSK!\n"); - jvhp->lastMountedVersion = SWAP_BE32(kFSKMountVersion); - buf_bwrite(bp); - } else { - buf_brelse(bp); - } - bp = NULL; - } else if (bp) { - buf_brelse(bp); - // clear this so the error exit path won't try to use it - bp = NULL; - } - } - - if (HFS_MOUNT_DEBUG) { - printf("hfs_mounthfsplus: hfs_late_journal_init returned (%d)\n", retval); - } - retval = EINVAL; - goto ErrorExit; - } else if (hfsmp->jnl) { - vfs_setflags(hfsmp->hfs_mp, (u_int64_t)((unsigned int)MNT_JOURNALED)); - } - } else if (hfsmp->jnl || ((vcb->vcbAtrb & kHFSVolumeJournaledMask) && (hfsmp->hfs_flags & HFS_READ_ONLY))) { - struct cat_attr jinfo_attr, jnl_attr; - - if (hfsmp->hfs_flags & HFS_READ_ONLY) { - vcb->vcbAtrb &= ~kHFSVolumeJournaledMask; - } - - // if we're here we need to fill in the fileid's for the - // journal and journal_info_block. - hfsmp->hfs_jnlinfoblkid = GetFileInfo(vcb, kRootDirID, ".journal_info_block", &jinfo_attr, NULL); - hfsmp->hfs_jnlfileid = GetFileInfo(vcb, kRootDirID, ".journal", &jnl_attr, NULL); - if (hfsmp->hfs_jnlinfoblkid == 0 || hfsmp->hfs_jnlfileid == 0) { - printf("hfs: danger! 
couldn't find the file-id's for the journal or journal_info_block\n"); - printf("hfs: jnlfileid %d, jnlinfoblkid %d\n", hfsmp->hfs_jnlfileid, hfsmp->hfs_jnlinfoblkid); - } - - if (hfsmp->hfs_flags & HFS_READ_ONLY) { - vcb->vcbAtrb |= kHFSVolumeJournaledMask; - } - - if (hfsmp->jnl == NULL) { - vfs_clearflags(hfsmp->hfs_mp, (u_int64_t)((unsigned int)MNT_JOURNALED)); - } - } - - if ( !(vcb->vcbAtrb & kHFSVolumeHardwareLockMask) ) // if the disk is not write protected - { - MarkVCBDirty( vcb ); // mark VCB dirty so it will be written - } - - if (hfsmp->hfs_flags & HFS_CS_METADATA_PIN) { - hfs_pin_fs_metadata(hfsmp); - } - /* - * Distinguish 3 potential cases involving content protection: - * 1. mount point bit set; vcbAtrb does not support it. Fail. - * 2. mount point bit set; vcbattrb supports it. we're good. - * 3. mount point bit not set; vcbatrb supports it, turn bit on, then good. - */ - if (vfs_flags(hfsmp->hfs_mp) & MNT_CPROTECT) { - /* Does the mount point support it ? */ - if ((vcb->vcbAtrb & kHFSContentProtectionMask) == 0) { - /* Case 1 above */ - retval = EINVAL; - goto ErrorExit; - } - } - else { - /* not requested in the mount point. Is it in FS? */ - if (vcb->vcbAtrb & kHFSContentProtectionMask) { - /* Case 3 above */ - vfs_setflags (hfsmp->hfs_mp, MNT_CPROTECT); - } - } - - /* At this point, if the mount point flag is set, we can enable it. */ - if (vfs_flags(hfsmp->hfs_mp) & MNT_CPROTECT) { - /* Cases 2+3 above */ -#if CONFIG_PROTECT - /* Get the EAs as needed. */ - int cperr = 0; - struct cp_root_xattr *xattr = NULL; - MALLOC (xattr, struct cp_root_xattr*, sizeof(struct cp_root_xattr), M_TEMP, M_WAITOK); - - /* go get the EA to get the version information */ - cperr = cp_getrootxattr (hfsmp, xattr); - /* - * If there was no EA there, then write one out. - * Assuming EA is not present on the root means - * this is an erase install or a very old FS - */ - - if (cperr == 0) { - /* Have to run a valid CP version. 
*/ - if (!cp_is_supported_version(xattr->major_version)) { - cperr = EINVAL; - } - } - else if (cperr == ENOATTR) { - printf("No root EA set, creating new EA with new version: %d\n", CP_CURRENT_VERS); - bzero(xattr, sizeof(struct cp_root_xattr)); - xattr->major_version = CP_CURRENT_VERS; - xattr->minor_version = CP_MINOR_VERS; - cperr = cp_setrootxattr (hfsmp, xattr); - } - - if (cperr) { - FREE(xattr, M_TEMP); - retval = EPERM; - goto ErrorExit; - } - - /* If we got here, then the CP version is valid. Set it in the mount point */ - hfsmp->hfs_running_cp_major_vers = xattr->major_version; - printf("Running with CP root xattr: %d.%d\n", xattr->major_version, xattr->minor_version); - hfsmp->cproot_flags = xattr->flags; - hfsmp->cp_crypto_generation = ISSET(xattr->flags, CP_ROOT_CRYPTOG1) ? 1 : 0; - - FREE(xattr, M_TEMP); - - /* - * Acquire the boot-arg for the AKS default key; if invalid, obtain from the device tree. - * Ensure that the boot-arg's value is valid for FILES (not directories), - * since only files are actually protected for now. - */ - - PE_parse_boot_argn("aks_default_class", &hfsmp->default_cp_class, sizeof(hfsmp->default_cp_class)); - - if (cp_is_valid_class(0, hfsmp->default_cp_class) == 0) { - PE_get_default("kern.default_cp_class", &hfsmp->default_cp_class, sizeof(hfsmp->default_cp_class)); - } - -#if HFS_TMPDBG -#if !SECURE_KERNEL - PE_parse_boot_argn("aks_verbose", &hfsmp->hfs_cp_verbose, sizeof(hfsmp->hfs_cp_verbose)); -#endif -#endif - - if (cp_is_valid_class(0, hfsmp->default_cp_class) == 0) { - hfsmp->default_cp_class = PROTECTION_CLASS_C; - } - -#else - /* If CONFIG_PROTECT not built, ignore CP */ - vfs_clearflags(hfsmp->hfs_mp, MNT_CPROTECT); -#endif - } - - /* - * Establish a metadata allocation zone. - */ - hfs_metadatazone_init(hfsmp, false); - - /* - * Make any metadata zone adjustments. - */ - if (hfsmp->hfs_flags & HFS_METADATA_ZONE) { - /* Keep the roving allocator out of the metadata zone. 
*/ - if (vcb->nextAllocation >= hfsmp->hfs_metazone_start && - vcb->nextAllocation <= hfsmp->hfs_metazone_end) { - HFS_UPDATE_NEXT_ALLOCATION(hfsmp, hfsmp->hfs_metazone_end + 1); - } - } else { - if (vcb->nextAllocation <= 1) { - vcb->nextAllocation = hfsmp->hfs_min_alloc_start; - } - } - vcb->sparseAllocation = hfsmp->hfs_min_alloc_start; - - /* Setup private/hidden directories for hardlinks. */ - hfs_privatedir_init(hfsmp, FILE_HARDLINKS); - hfs_privatedir_init(hfsmp, DIR_HARDLINKS); - - if ((hfsmp->hfs_flags & HFS_READ_ONLY) == 0) - hfs_remove_orphans(hfsmp); - - /* See if we need to erase unused Catalog nodes due to . */ - if ((hfsmp->hfs_flags & HFS_READ_ONLY) == 0) - { - retval = hfs_erase_unused_nodes(hfsmp); - if (retval) { - if (HFS_MOUNT_DEBUG) { - printf("hfs_mounthfsplus: hfs_erase_unused_nodes returned (%d) for %s \n", retval, hfsmp->vcbVN); - } - - goto ErrorExit; - } - } - - /* - * Allow hot file clustering if conditions allow. - */ - if ((hfsmp->hfs_flags & HFS_METADATA_ZONE) && !(hfsmp->hfs_flags & HFS_READ_ONLY) && - ((hfsmp->hfs_flags & HFS_SSD) == 0 || (hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN))) { - // - // Wait until the bitmap scan completes before we initializes the - // hotfile area so that we do not run into any issues with the - // bitmap being read while hotfiles is initializing itself. On - // some older/slower machines, without this interlock, the bitmap - // would sometimes get corrupted at boot time. - // - hfs_lock_mount(hfsmp); - while(!(hfsmp->scan_var & HFS_ALLOCATOR_SCAN_COMPLETED)) { - (void) msleep (&hfsmp->scan_var, &hfsmp->hfs_mutex, PINOD, "hfs_hotfile_bitmap_interlock", 0); - } - hfs_unlock_mount(hfsmp); - - /* - * Note: at this point we are not allowed to fail the - * mount operation because the HotFile init code - * in hfs_recording_init() will lookup vnodes with - * VNOP_LOOKUP() which hangs vnodes off the mount - * (and if we were to fail, VFS is not prepared to - * clean that up at this point. 
Since HotFiles are - * optional, this is not a big deal. - */ - (void) hfs_recording_init(hfsmp); - } - - /* Force ACLs on HFS+ file systems. */ - vfs_setextendedsecurity(HFSTOVFS(hfsmp)); - - /* Enable extent-based extended attributes by default */ - hfsmp->hfs_flags |= HFS_XATTR_EXTENTS; - - return (0); - -ErrorExit: - /* - * A fatal error occurred and the volume cannot be mounted, so - * release any resources that we acquired... - */ - hfsUnmount(hfsmp, NULL); - - if (HFS_MOUNT_DEBUG) { - printf("hfs_mounthfsplus: encountered error (%d)\n", retval); - } - return (retval); -} - -static int -_pin_metafile(struct hfsmount *hfsmp, vnode_t vp) -{ - int err; - - err = hfs_lock(VTOC(vp), HFS_SHARED_LOCK, HFS_LOCK_DEFAULT); - if (err == 0) { - err = hfs_pin_vnode(hfsmp, vp, HFS_PIN_IT, NULL, vfs_context_kernel()); - hfs_unlock(VTOC(vp)); - } - - return err; -} - -void -hfs_pin_fs_metadata(struct hfsmount *hfsmp) -{ - ExtendedVCB *vcb; - int err; - - vcb = HFSTOVCB(hfsmp); - - err = _pin_metafile(hfsmp, hfsmp->hfs_extents_vp); - if (err != 0) { - printf("hfs: failed to pin extents overflow file %d\n", err); - } - err = _pin_metafile(hfsmp, hfsmp->hfs_catalog_vp); - if (err != 0) { - printf("hfs: failed to pin catalog file %d\n", err); - } - err = _pin_metafile(hfsmp, hfsmp->hfs_allocation_vp); - if (err != 0) { - printf("hfs: failed to pin bitmap file %d\n", err); - } - err = _pin_metafile(hfsmp, hfsmp->hfs_attribute_vp); - if (err != 0) { - printf("hfs: failed to pin extended attr file %d\n", err); - } - - hfs_pin_block_range(hfsmp, HFS_PIN_IT, 0, 1, vfs_context_kernel()); - hfs_pin_block_range(hfsmp, HFS_PIN_IT, vcb->totalBlocks-1, 1, vfs_context_kernel()); - - if (vfs_flags(hfsmp->hfs_mp) & MNT_JOURNALED) { - // and hey, if we've got a journal, let's pin that too! 
- hfs_pin_block_range(hfsmp, HFS_PIN_IT, hfsmp->jnl_start, howmany(hfsmp->jnl_size, vcb->blockSize), vfs_context_kernel()); - } -} - -/* - * ReleaseMetaFileVNode - * - * vp L - - - */ -static void ReleaseMetaFileVNode(struct vnode *vp) -{ - struct filefork *fp; - - if (vp && (fp = VTOF(vp))) { - if (fp->fcbBTCBPtr != NULL) { - (void)hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); - (void) BTClosePath(fp); - hfs_unlock(VTOC(vp)); - } - - /* release the node even if BTClosePath fails */ - vnode_recycle(vp); - vnode_put(vp); - } -} - - -/************************************************************* -* -* Unmounts a hfs volume. -* At this point vflush() has been called (to dump all non-metadata files) -* -*************************************************************/ - -int -hfsUnmount( register struct hfsmount *hfsmp, __unused struct proc *p) -{ - /* Get rid of our attribute data vnode (if any). This is done - * after the vflush() during mount, so we don't need to worry - * about any locks. - */ - if (hfsmp->hfs_attrdata_vp) { - ReleaseMetaFileVNode(hfsmp->hfs_attrdata_vp); - hfsmp->hfs_attrdata_vp = NULLVP; - } - - if (hfsmp->hfs_startup_vp) { - ReleaseMetaFileVNode(hfsmp->hfs_startup_vp); - hfsmp->hfs_startup_cp = NULL; - hfsmp->hfs_startup_vp = NULL; - } - - if (hfsmp->hfs_attribute_vp) { - ReleaseMetaFileVNode(hfsmp->hfs_attribute_vp); - hfsmp->hfs_attribute_cp = NULL; - hfsmp->hfs_attribute_vp = NULL; - } - - if (hfsmp->hfs_catalog_vp) { - ReleaseMetaFileVNode(hfsmp->hfs_catalog_vp); - hfsmp->hfs_catalog_cp = NULL; - hfsmp->hfs_catalog_vp = NULL; - } - - if (hfsmp->hfs_extents_vp) { - ReleaseMetaFileVNode(hfsmp->hfs_extents_vp); - hfsmp->hfs_extents_cp = NULL; - hfsmp->hfs_extents_vp = NULL; - } - - if (hfsmp->hfs_allocation_vp) { - ReleaseMetaFileVNode(hfsmp->hfs_allocation_vp); - hfsmp->hfs_allocation_cp = NULL; - hfsmp->hfs_allocation_vp = NULL; - } - - return (0); -} - - -/* - * Test if fork has overflow extents. 
- * - * Returns: - * non-zero - overflow extents exist - * zero - overflow extents do not exist - */ -__private_extern__ -bool overflow_extents(struct filefork *fp) -{ - u_int32_t blocks; - - // - // If the vnode pointer is NULL then we're being called - // from hfs_remove_orphans() with a faked-up filefork - // and therefore it has to be an HFS+ volume. Otherwise - // we check through the volume header to see what type - // of volume we're on. - // - -#if CONFIG_HFS_STD - if (FTOV(fp) && VTOVCB(FTOV(fp))->vcbSigWord == kHFSSigWord) { - if (fp->ff_extents[2].blockCount == 0) - return false; - - blocks = fp->ff_extents[0].blockCount + - fp->ff_extents[1].blockCount + - fp->ff_extents[2].blockCount; - - return fp->ff_blocks > blocks; - } -#endif - - if (fp->ff_extents[7].blockCount == 0) - return false; - - blocks = fp->ff_extents[0].blockCount + - fp->ff_extents[1].blockCount + - fp->ff_extents[2].blockCount + - fp->ff_extents[3].blockCount + - fp->ff_extents[4].blockCount + - fp->ff_extents[5].blockCount + - fp->ff_extents[6].blockCount + - fp->ff_extents[7].blockCount; - - return fp->ff_blocks > blocks; -} - -static __attribute__((pure)) -boolean_t hfs_is_frozen(struct hfsmount *hfsmp) -{ - return (hfsmp->hfs_freeze_state == HFS_FROZEN - || (hfsmp->hfs_freeze_state == HFS_FREEZING - && current_thread() != hfsmp->hfs_freezing_thread)); -} - -/* - * Lock the HFS global journal lock - */ -int -hfs_lock_global (struct hfsmount *hfsmp, enum hfs_locktype locktype) -{ - thread_t thread = current_thread(); - - if (hfsmp->hfs_global_lockowner == thread) { - panic ("hfs_lock_global: locking against myself!"); - } - - /* - * This check isn't really necessary but this stops us taking - * the mount lock in most cases. The essential check is below. - */ - if (hfs_is_frozen(hfsmp)) { - /* - * Unfortunately, there is no easy way of getting a notification - * for when a process is exiting and it's possible for the exiting - * process to get blocked somewhere else. 
To catch this, we - * periodically monitor the frozen process here and thaw if - * we spot that it's exiting. - */ -frozen: - hfs_lock_mount(hfsmp); - - struct timespec ts = { 0, 500 * NSEC_PER_MSEC }; - - while (hfs_is_frozen(hfsmp)) { - if (hfsmp->hfs_freeze_state == HFS_FROZEN - && proc_exiting(hfsmp->hfs_freezing_proc)) { - hfs_thaw_locked(hfsmp); - break; - } - - msleep(&hfsmp->hfs_freeze_state, &hfsmp->hfs_mutex, - PWAIT, "hfs_lock_global (frozen)", &ts); - } - hfs_unlock_mount(hfsmp); - } - - /* HFS_SHARED_LOCK */ - if (locktype == HFS_SHARED_LOCK) { - lck_rw_lock_shared (&hfsmp->hfs_global_lock); - hfsmp->hfs_global_lockowner = HFS_SHARED_OWNER; - } - /* HFS_EXCLUSIVE_LOCK */ - else { - lck_rw_lock_exclusive (&hfsmp->hfs_global_lock); - hfsmp->hfs_global_lockowner = thread; - } - - /* - * We have to check if we're frozen again because of the time - * between when we checked and when we took the global lock. - */ - if (hfs_is_frozen(hfsmp)) { - hfs_unlock_global(hfsmp); - goto frozen; - } - - return 0; -} - - -/* - * Unlock the HFS global journal lock - */ -void -hfs_unlock_global (struct hfsmount *hfsmp) -{ - thread_t thread = current_thread(); - - /* HFS_LOCK_EXCLUSIVE */ - if (hfsmp->hfs_global_lockowner == thread) { - hfsmp->hfs_global_lockowner = NULL; - lck_rw_unlock_exclusive (&hfsmp->hfs_global_lock); - } - /* HFS_LOCK_SHARED */ - else { - lck_rw_unlock_shared (&hfsmp->hfs_global_lock); - } -} - -/* - * Lock the HFS mount lock - * - * Note: this is a mutex, not a rw lock! - */ -inline -void hfs_lock_mount (struct hfsmount *hfsmp) { - lck_mtx_lock (&(hfsmp->hfs_mutex)); -} - -/* - * Unlock the HFS mount lock - * - * Note: this is a mutex, not a rw lock! - */ -inline -void hfs_unlock_mount (struct hfsmount *hfsmp) { - lck_mtx_unlock (&(hfsmp->hfs_mutex)); -} - -/* - * Lock HFS system file(s). - * - * This function accepts a @flags parameter which indicates which - * system file locks are required. 
The value it returns should be - * used in a subsequent call to hfs_systemfile_unlock. The caller - * should treat this value as opaque; it may or may not have a - * relation to the @flags field that is passed in. The *only* - * guarantee that we make is that a value of zero means that no locks - * were taken and that there is no need to call hfs_systemfile_unlock - * (although it is harmless to do so). Recursion is supported but - * care must still be taken to ensure correct lock ordering. Note - * that requests for certain locks may cause other locks to also be - * taken, including locks that are not possible to ask for via the - * @flags parameter. - */ -int -hfs_systemfile_lock(struct hfsmount *hfsmp, int flags, enum hfs_locktype locktype) -{ - /* - * Locking order is Catalog file, Attributes file, Startup file, Bitmap file, Extents file - */ - if (flags & SFL_CATALOG) { - if (hfsmp->hfs_catalog_cp - && hfsmp->hfs_catalog_cp->c_lockowner != current_thread()) { -#ifdef HFS_CHECK_LOCK_ORDER - if (hfsmp->hfs_attribute_cp && hfsmp->hfs_attribute_cp->c_lockowner == current_thread()) { - panic("hfs_systemfile_lock: bad lock order (Attributes before Catalog)"); - } - if (hfsmp->hfs_startup_cp && hfsmp->hfs_startup_cp->c_lockowner == current_thread()) { - panic("hfs_systemfile_lock: bad lock order (Startup before Catalog)"); - } - if (hfsmp-> hfs_extents_cp && hfsmp->hfs_extents_cp->c_lockowner == current_thread()) { - panic("hfs_systemfile_lock: bad lock order (Extents before Catalog)"); - } -#endif /* HFS_CHECK_LOCK_ORDER */ - - (void) hfs_lock(hfsmp->hfs_catalog_cp, locktype, HFS_LOCK_DEFAULT); - /* - * When the catalog file has overflow extents then - * also acquire the extents b-tree lock if its not - * already requested. 
- */ - if (((flags & SFL_EXTENTS) == 0) && - (hfsmp->hfs_catalog_vp != NULL) && - (overflow_extents(VTOF(hfsmp->hfs_catalog_vp)))) { - flags |= SFL_EXTENTS; - } - } else { - flags &= ~SFL_CATALOG; - } - } - - if (flags & SFL_ATTRIBUTE) { - if (hfsmp->hfs_attribute_cp - && hfsmp->hfs_attribute_cp->c_lockowner != current_thread()) { -#ifdef HFS_CHECK_LOCK_ORDER - if (hfsmp->hfs_startup_cp && hfsmp->hfs_startup_cp->c_lockowner == current_thread()) { - panic("hfs_systemfile_lock: bad lock order (Startup before Attributes)"); - } - if (hfsmp->hfs_extents_cp && hfsmp->hfs_extents_cp->c_lockowner == current_thread()) { - panic("hfs_systemfile_lock: bad lock order (Extents before Attributes)"); - } -#endif /* HFS_CHECK_LOCK_ORDER */ - - (void) hfs_lock(hfsmp->hfs_attribute_cp, locktype, HFS_LOCK_DEFAULT); - /* - * When the attribute file has overflow extents then - * also acquire the extents b-tree lock if its not - * already requested. - */ - if (((flags & SFL_EXTENTS) == 0) && - (hfsmp->hfs_attribute_vp != NULL) && - (overflow_extents(VTOF(hfsmp->hfs_attribute_vp)))) { - flags |= SFL_EXTENTS; - } - } else { - flags &= ~SFL_ATTRIBUTE; - } - } - - if (flags & SFL_STARTUP) { - if (hfsmp->hfs_startup_cp - && hfsmp->hfs_startup_cp->c_lockowner != current_thread()) { -#ifdef HFS_CHECK_LOCK_ORDER - if (hfsmp-> hfs_extents_cp && hfsmp->hfs_extents_cp->c_lockowner == current_thread()) { - panic("hfs_systemfile_lock: bad lock order (Extents before Startup)"); - } -#endif /* HFS_CHECK_LOCK_ORDER */ - - (void) hfs_lock(hfsmp->hfs_startup_cp, locktype, HFS_LOCK_DEFAULT); - /* - * When the startup file has overflow extents then - * also acquire the extents b-tree lock if its not - * already requested. 
- */ - if (((flags & SFL_EXTENTS) == 0) && - (hfsmp->hfs_startup_vp != NULL) && - (overflow_extents(VTOF(hfsmp->hfs_startup_vp)))) { - flags |= SFL_EXTENTS; - } - } else { - flags &= ~SFL_STARTUP; - } - } - - /* - * To prevent locks being taken in the wrong order, the extent lock - * gets a bitmap lock as well. - */ - if (flags & (SFL_BITMAP | SFL_EXTENTS)) { - if (hfsmp->hfs_allocation_cp) { - (void) hfs_lock(hfsmp->hfs_allocation_cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); - /* - * The bitmap lock is also grabbed when only extent lock - * was requested. Set the bitmap lock bit in the lock - * flags which callers will use during unlock. - */ - flags |= SFL_BITMAP; - } else { - flags &= ~SFL_BITMAP; - } - } - - if (flags & SFL_EXTENTS) { - /* - * Since the extents btree lock is recursive we always - * need exclusive access. - */ - if (hfsmp->hfs_extents_cp) { - (void) hfs_lock(hfsmp->hfs_extents_cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); - - if (hfsmp->hfs_mp->mnt_kern_flag & MNTK_SWAP_MOUNT) { - /* - * because we may need this lock on the pageout path (if a swapfile allocation - * spills into the extents overflow tree), we will grant the holder of this - * lock the privilege of dipping into the reserve free pool in order to prevent - * a deadlock from occurring if we need those pageouts to complete before we - * will make any new pages available on the free list... the deadlock can occur - * if this thread needs to allocate memory while this lock is held - */ - if (set_vm_privilege(TRUE) == FALSE) { - /* - * indicate that we need to drop vm_privilege - * when we unlock - */ - flags |= SFL_VM_PRIV; - } - } - } else { - flags &= ~SFL_EXTENTS; - } - } - - return (flags); -} - -/* - * unlock HFS system file(s). 
- */ -void -hfs_systemfile_unlock(struct hfsmount *hfsmp, int flags) -{ - if (!flags) - return; - - struct timeval tv; - u_int32_t lastfsync; - int numOfLockedBuffs; - - if (hfsmp->jnl == NULL) { - microuptime(&tv); - lastfsync = tv.tv_sec; - } - if (flags & SFL_STARTUP && hfsmp->hfs_startup_cp) { - hfs_unlock(hfsmp->hfs_startup_cp); - } - if (flags & SFL_ATTRIBUTE && hfsmp->hfs_attribute_cp) { - if (hfsmp->jnl == NULL) { - BTGetLastSync((FCB*)VTOF(hfsmp->hfs_attribute_vp), &lastfsync); - numOfLockedBuffs = count_lock_queue(); - if ((numOfLockedBuffs > kMaxLockedMetaBuffers) || - ((numOfLockedBuffs > 1) && ((tv.tv_sec - lastfsync) > - kMaxSecsForFsync))) { - hfs_btsync(hfsmp->hfs_attribute_vp, HFS_SYNCTRANS); - } - } - hfs_unlock(hfsmp->hfs_attribute_cp); - } - if (flags & SFL_CATALOG && hfsmp->hfs_catalog_cp) { - if (hfsmp->jnl == NULL) { - BTGetLastSync((FCB*)VTOF(hfsmp->hfs_catalog_vp), &lastfsync); - numOfLockedBuffs = count_lock_queue(); - if ((numOfLockedBuffs > kMaxLockedMetaBuffers) || - ((numOfLockedBuffs > 1) && ((tv.tv_sec - lastfsync) > - kMaxSecsForFsync))) { - hfs_btsync(hfsmp->hfs_catalog_vp, HFS_SYNCTRANS); - } - } - hfs_unlock(hfsmp->hfs_catalog_cp); - } - if (flags & SFL_BITMAP && hfsmp->hfs_allocation_cp) { - hfs_unlock(hfsmp->hfs_allocation_cp); - } - if (flags & SFL_EXTENTS && hfsmp->hfs_extents_cp) { - if (hfsmp->jnl == NULL) { - BTGetLastSync((FCB*)VTOF(hfsmp->hfs_extents_vp), &lastfsync); - numOfLockedBuffs = count_lock_queue(); - if ((numOfLockedBuffs > kMaxLockedMetaBuffers) || - ((numOfLockedBuffs > 1) && ((tv.tv_sec - lastfsync) > - kMaxSecsForFsync))) { - hfs_btsync(hfsmp->hfs_extents_vp, HFS_SYNCTRANS); - } - } - hfs_unlock(hfsmp->hfs_extents_cp); - - if (flags & SFL_VM_PRIV) { - /* - * revoke the vm_privilege we granted this thread - * now that we have unlocked the overflow extents - */ - set_vm_privilege(FALSE); - } - } -} - - -/* - * RequireFileLock - * - * Check to see if a vnode is locked in the current context - * This is to be 
used for debugging purposes only!! - */ -#if HFS_DIAGNOSTIC -void RequireFileLock(FileReference vp, int shareable) -{ - int locked; - - /* The extents btree and allocation bitmap are always exclusive. */ - if (VTOC(vp)->c_fileid == kHFSExtentsFileID || - VTOC(vp)->c_fileid == kHFSAllocationFileID) { - shareable = 0; - } - - locked = VTOC(vp)->c_lockowner == current_thread(); - - if (!locked && !shareable) { - switch (VTOC(vp)->c_fileid) { - case kHFSExtentsFileID: - panic("hfs: extents btree not locked! v: 0x%08X\n #\n", (u_int)vp); - break; - case kHFSCatalogFileID: - panic("hfs: catalog btree not locked! v: 0x%08X\n #\n", (u_int)vp); - break; - case kHFSAllocationFileID: - /* The allocation file can hide behind the jornal lock. */ - if (VTOHFS(vp)->jnl == NULL) - panic("hfs: allocation file not locked! v: 0x%08X\n #\n", (u_int)vp); - break; - case kHFSStartupFileID: - panic("hfs: startup file not locked! v: 0x%08X\n #\n", (u_int)vp); - case kHFSAttributesFileID: - panic("hfs: attributes btree not locked! v: 0x%08X\n #\n", (u_int)vp); - break; - } - } -} -#endif - - -/* - * There are three ways to qualify for ownership rights on an object: - * - * 1. (a) Your UID matches the cnode's UID. - * (b) The object in question is owned by "unknown" - * 2. (a) Permissions on the filesystem are being ignored and - * your UID matches the replacement UID. - * (b) Permissions on the filesystem are being ignored and - * the replacement UID is "unknown". - * 3. You are root. 
- * - */ -int -hfs_owner_rights(struct hfsmount *hfsmp, uid_t cnode_uid, kauth_cred_t cred, - __unused struct proc *p, int invokesuperuserstatus) -{ - if ((kauth_cred_getuid(cred) == cnode_uid) || /* [1a] */ - (cnode_uid == UNKNOWNUID) || /* [1b] */ - ((((unsigned int)vfs_flags(HFSTOVFS(hfsmp))) & MNT_UNKNOWNPERMISSIONS) && /* [2] */ - ((kauth_cred_getuid(cred) == hfsmp->hfs_uid) || /* [2a] */ - (hfsmp->hfs_uid == UNKNOWNUID))) || /* [2b] */ - (invokesuperuserstatus && (suser(cred, 0) == 0))) { /* [3] */ - return (0); - } else { - return (EPERM); - } -} - - -u_int32_t BestBlockSizeFit(u_int32_t allocationBlockSize, - u_int32_t blockSizeLimit, - u_int32_t baseMultiple) { - /* - Compute the optimal (largest) block size (no larger than allocationBlockSize) that is less than the - specified limit but still an even multiple of the baseMultiple. - */ - int baseBlockCount, blockCount; - u_int32_t trialBlockSize; - - if (allocationBlockSize % baseMultiple != 0) { - /* - Whoops: the allocation blocks aren't even multiples of the specified base: - no amount of dividing them into even parts will be a multiple, either then! - */ - return 512; /* Hope for the best */ - }; - - /* Try the obvious winner first, to prevent 12K allocation blocks, for instance, - from being handled as two 6K logical blocks instead of 3 4K logical blocks. - Even though the former (the result of the loop below) is the larger allocation - block size, the latter is more efficient: */ - if (allocationBlockSize % PAGE_SIZE == 0) return PAGE_SIZE; - - /* No clear winner exists: pick the largest even fraction <= MAXBSIZE: */ - baseBlockCount = allocationBlockSize / baseMultiple; /* Now guaranteed to be an even multiple */ - - for (blockCount = baseBlockCount; blockCount > 0; --blockCount) { - trialBlockSize = blockCount * baseMultiple; - if (allocationBlockSize % trialBlockSize == 0) { /* An even multiple? 
*/ - if ((trialBlockSize <= blockSizeLimit) && - (trialBlockSize % baseMultiple == 0)) { - return trialBlockSize; - }; - }; - }; - - /* Note: we should never get here, since blockCount = 1 should always work, - but this is nice and safe and makes the compiler happy, too ... */ - return 512; -} - - -u_int32_t -GetFileInfo(ExtendedVCB *vcb, __unused u_int32_t dirid, const char *name, - struct cat_attr *fattr, struct cat_fork *forkinfo) -{ - struct hfsmount * hfsmp; - struct cat_desc jdesc; - int lockflags; - int error; - - if (vcb->vcbSigWord != kHFSPlusSigWord) - return (0); - - hfsmp = VCBTOHFS(vcb); - - memset(&jdesc, 0, sizeof(struct cat_desc)); - jdesc.cd_parentcnid = kRootDirID; - jdesc.cd_nameptr = (const u_int8_t *)name; - jdesc.cd_namelen = strlen(name); - - lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); - error = cat_lookup(hfsmp, &jdesc, 0, 0, NULL, fattr, forkinfo, NULL); - hfs_systemfile_unlock(hfsmp, lockflags); - - if (error == 0) { - return (fattr->ca_fileid); - } else if (hfsmp->hfs_flags & HFS_READ_ONLY) { - return (0); - } - - return (0); /* XXX what callers expect on an error */ -} - - -/* - * On HFS Plus Volumes, there can be orphaned files or directories - * These are files or directories that were unlinked while busy. - * If the volume was not cleanly unmounted then some of these may - * have persisted and need to be removed. 
- */ -void -hfs_remove_orphans(struct hfsmount * hfsmp) -{ - struct BTreeIterator * iterator = NULL; - struct FSBufferDescriptor btdata; - struct HFSPlusCatalogFile filerec; - struct HFSPlusCatalogKey * keyp; - struct proc *p = current_proc(); - FCB *fcb; - ExtendedVCB *vcb; - char filename[32]; - char tempname[32]; - size_t namelen; - cat_cookie_t cookie; - int catlock = 0; - int catreserve = 0; - bool started_tr = false; - int lockflags; - int result; - int orphaned_files = 0; - int orphaned_dirs = 0; - - bzero(&cookie, sizeof(cookie)); - - if (hfsmp->hfs_flags & HFS_CLEANED_ORPHANS) - return; - - vcb = HFSTOVCB(hfsmp); - fcb = VTOF(hfsmp->hfs_catalog_vp); - - btdata.bufferAddress = &filerec; - btdata.itemSize = sizeof(filerec); - btdata.itemCount = 1; - - MALLOC(iterator, struct BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK); - bzero(iterator, sizeof(*iterator)); - - /* Build a key to "temp" */ - keyp = (HFSPlusCatalogKey*)&iterator->key; - keyp->parentID = hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid; - keyp->nodeName.length = 4; /* "temp" */ - keyp->keyLength = kHFSPlusCatalogKeyMinimumLength + keyp->nodeName.length * 2; - keyp->nodeName.unicode[0] = 't'; - keyp->nodeName.unicode[1] = 'e'; - keyp->nodeName.unicode[2] = 'm'; - keyp->nodeName.unicode[3] = 'p'; - - /* - * Position the iterator just before the first real temp file/dir. - */ - lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK); - (void) BTSearchRecord(fcb, iterator, NULL, NULL, iterator); - hfs_systemfile_unlock(hfsmp, lockflags); - - /* Visit all the temp files/dirs in the HFS+ private directory. 
*/ - for (;;) { - lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK); - result = BTIterateRecord(fcb, kBTreeNextRecord, iterator, &btdata, NULL); - hfs_systemfile_unlock(hfsmp, lockflags); - if (result) - break; - if (keyp->parentID != hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid) - break; - - (void) utf8_encodestr(keyp->nodeName.unicode, keyp->nodeName.length * 2, - (u_int8_t *)filename, &namelen, sizeof(filename), 0, 0); - - (void) snprintf(tempname, sizeof(tempname), "%s%d", - HFS_DELETE_PREFIX, filerec.fileID); - - /* - * Delete all files (and directories) named "tempxxx", - * where xxx is the file's cnid in decimal. - * - */ - if (bcmp(tempname, filename, namelen) != 0) - continue; - - struct filefork dfork; - struct filefork rfork; - struct cnode cnode; - int mode = 0; - - bzero(&dfork, sizeof(dfork)); - bzero(&rfork, sizeof(rfork)); - bzero(&cnode, sizeof(cnode)); - - if (hfs_start_transaction(hfsmp) != 0) { - printf("hfs_remove_orphans: failed to start transaction\n"); - goto exit; - } - started_tr = true; - - /* - * Reserve some space in the Catalog file. 
- */ - if (cat_preflight(hfsmp, CAT_DELETE, &cookie, p) != 0) { - printf("hfs_remove_orphans: cat_preflight failed\n"); - goto exit; - } - catreserve = 1; - - lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG | SFL_ATTRIBUTE | SFL_EXTENTS | SFL_BITMAP, HFS_EXCLUSIVE_LOCK); - catlock = 1; - - /* Build a fake cnode */ - cat_convertattr(hfsmp, (CatalogRecord *)&filerec, &cnode.c_attr, - &dfork.ff_data, &rfork.ff_data); - cnode.c_desc.cd_parentcnid = hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid; - cnode.c_desc.cd_nameptr = (const u_int8_t *)filename; - cnode.c_desc.cd_namelen = namelen; - cnode.c_desc.cd_cnid = cnode.c_attr.ca_fileid; - cnode.c_blocks = dfork.ff_blocks + rfork.ff_blocks; - - /* Position iterator at previous entry */ - if (BTIterateRecord(fcb, kBTreePrevRecord, iterator, - NULL, NULL) != 0) { - break; - } - - /* Truncate the file to zero (both forks) */ - if (dfork.ff_blocks > 0) { - u_int64_t fsize; - - dfork.ff_cp = &cnode; - cnode.c_datafork = &dfork; - cnode.c_rsrcfork = NULL; - fsize = (u_int64_t)dfork.ff_blocks * (u_int64_t)HFSTOVCB(hfsmp)->blockSize; - while (fsize > 0) { - if (fsize > HFS_BIGFILE_SIZE) { - fsize -= HFS_BIGFILE_SIZE; - } else { - fsize = 0; - } - - if (TruncateFileC(vcb, (FCB*)&dfork, fsize, 1, 0, - cnode.c_attr.ca_fileid, false) != 0) { - printf("hfs: error truncating data fork!\n"); - break; - } - - // - // if we're iteratively truncating this file down, - // then end the transaction and start a new one so - // that no one transaction gets too big. - // - if (fsize > 0) { - /* Drop system file locks before starting - * another transaction to preserve lock order. 
- */ - hfs_systemfile_unlock(hfsmp, lockflags); - catlock = 0; - hfs_end_transaction(hfsmp); - - if (hfs_start_transaction(hfsmp) != 0) { - started_tr = false; - goto exit; - } - lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG | SFL_ATTRIBUTE | SFL_EXTENTS | SFL_BITMAP, HFS_EXCLUSIVE_LOCK); - catlock = 1; - } - } - } - - if (rfork.ff_blocks > 0) { - rfork.ff_cp = &cnode; - cnode.c_datafork = NULL; - cnode.c_rsrcfork = &rfork; - if (TruncateFileC(vcb, (FCB*)&rfork, 0, 1, 1, cnode.c_attr.ca_fileid, false) != 0) { - printf("hfs: error truncating rsrc fork!\n"); - break; - } - } - - // Deal with extended attributes - if (ISSET(cnode.c_attr.ca_recflags, kHFSHasAttributesMask)) { - // hfs_removeallattr uses its own transactions - hfs_systemfile_unlock(hfsmp, lockflags); - catlock = false; - hfs_end_transaction(hfsmp); - - hfs_removeallattr(hfsmp, cnode.c_attr.ca_fileid, &started_tr); - - if (!started_tr) { - if (hfs_start_transaction(hfsmp) != 0) { - printf("hfs_remove_orphans: failed to start transaction\n"); - goto exit; - } - started_tr = true; - } - - lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG | SFL_ATTRIBUTE | SFL_EXTENTS | SFL_BITMAP, HFS_EXCLUSIVE_LOCK); - catlock = 1; - } - - /* Remove the file or folder record from the Catalog */ - if (cat_delete(hfsmp, &cnode.c_desc, &cnode.c_attr) != 0) { - printf("hfs_remove_orphans: error deleting cat rec for id %d!\n", cnode.c_desc.cd_cnid); - hfs_systemfile_unlock(hfsmp, lockflags); - catlock = 0; - hfs_volupdate(hfsmp, VOL_UPDATE, 0); - break; - } - - mode = cnode.c_attr.ca_mode & S_IFMT; - - if (mode == S_IFDIR) { - orphaned_dirs++; - } - else { - orphaned_files++; - } - - /* Update parent and volume counts */ - hfsmp->hfs_private_attr[FILE_HARDLINKS].ca_entries--; - if (mode == S_IFDIR) { - DEC_FOLDERCOUNT(hfsmp, hfsmp->hfs_private_attr[FILE_HARDLINKS]); - } - - (void)cat_update(hfsmp, &hfsmp->hfs_private_desc[FILE_HARDLINKS], - &hfsmp->hfs_private_attr[FILE_HARDLINKS], NULL, NULL); - - /* Drop locks and end 
the transaction */ - hfs_systemfile_unlock(hfsmp, lockflags); - cat_postflight(hfsmp, &cookie, p); - catlock = catreserve = 0; - - /* - Now that Catalog is unlocked, update the volume info, making - sure to differentiate between files and directories - */ - if (mode == S_IFDIR) { - hfs_volupdate(hfsmp, VOL_RMDIR, 0); - } - else{ - hfs_volupdate(hfsmp, VOL_RMFILE, 0); - } - - hfs_end_transaction(hfsmp); - started_tr = false; - } /* end for */ - -exit: - - if (orphaned_files > 0 || orphaned_dirs > 0) - printf("hfs: Removed %d orphaned / unlinked files and %d directories \n", orphaned_files, orphaned_dirs); - if (catlock) { - hfs_systemfile_unlock(hfsmp, lockflags); - } - if (catreserve) { - cat_postflight(hfsmp, &cookie, p); - } - if (started_tr) { - hfs_end_transaction(hfsmp); - } - - FREE(iterator, M_TEMP); - hfsmp->hfs_flags |= HFS_CLEANED_ORPHANS; -} - - -/* - * This will return the correct logical block size for a given vnode. - * For most files, it is the allocation block size, for meta data like - * BTrees, this is kept as part of the BTree private nodeSize - */ -u_int32_t -GetLogicalBlockSize(struct vnode *vp) -{ -u_int32_t logBlockSize; - - DBG_ASSERT(vp != NULL); - - /* start with default */ - logBlockSize = VTOHFS(vp)->hfs_logBlockSize; - - if (vnode_issystem(vp)) { - if (VTOF(vp)->fcbBTCBPtr != NULL) { - BTreeInfoRec bTreeInfo; - - /* - * We do not lock the BTrees, because if we are getting block..then the tree - * should be locked in the first place. - * We just want the nodeSize wich will NEVER change..so even if the world - * is changing..the nodeSize should remain the same. Which argues why lock - * it in the first place?? 
- */ - - (void) BTGetInformation (VTOF(vp), kBTreeInfoVersion, &bTreeInfo); - - logBlockSize = bTreeInfo.nodeSize; - - } else if (VTOC(vp)->c_fileid == kHFSAllocationFileID) { - logBlockSize = VTOVCB(vp)->vcbVBMIOSize; - } - } - - DBG_ASSERT(logBlockSize > 0); - - return logBlockSize; -} - -#if HFS_SPARSE_DEV -static bool hfs_get_backing_free_blks(hfsmount_t *hfsmp, uint64_t *pfree_blks) -{ - struct vfsstatfs *vfsp; /* 272 bytes */ - uint64_t vfreeblks; - struct timeval now; - - hfs_lock_mount(hfsmp); - - vnode_t backing_vp = hfsmp->hfs_backingfs_rootvp; - if (!backing_vp) { - hfs_unlock_mount(hfsmp); - return false; - } - - // usecount is not enough; we need iocount - if (vnode_get(backing_vp)) { - hfs_unlock_mount(hfsmp); - *pfree_blks = 0; - return true; - } - - uint32_t loanedblks = hfsmp->loanedBlocks + hfsmp->lockedBlocks; - uint32_t bandblks = hfsmp->hfs_sparsebandblks; - uint64_t maxblks = hfsmp->hfs_backingfs_maxblocks; - - hfs_unlock_mount(hfsmp); - - mount_t backingfs_mp = vnode_mount(backing_vp); - - microtime(&now); - if ((now.tv_sec - hfsmp->hfs_last_backingstatfs) >= 1) { - vfs_update_vfsstat(backingfs_mp, vfs_context_kernel(), VFS_KERNEL_EVENT); - hfsmp->hfs_last_backingstatfs = now.tv_sec; - } - - if (!(vfsp = vfs_statfs(backingfs_mp))) { - vnode_put(backing_vp); - return false; - } - - vfreeblks = vfsp->f_bavail; - /* Normalize block count if needed. */ - if (vfsp->f_bsize != hfsmp->blockSize) - vfreeblks = vfreeblks * vfsp->f_bsize / hfsmp->blockSize; - if (vfreeblks > bandblks) - vfreeblks -= bandblks; - else - vfreeblks = 0; - - /* - * Take into account any delayed allocations. It is not - * certain what the original reason for the "2 *" is. Most - * likely it is to allow for additional requirements in the - * host file system and metadata required by disk images. The - * number of loaned blocks is likely to be small and we will - * stop using them as we get close to the limit. 
- */ - loanedblks = 2 * loanedblks; - if (vfreeblks > loanedblks) - vfreeblks -= loanedblks; - else - vfreeblks = 0; - - if (maxblks) - vfreeblks = MIN(vfreeblks, maxblks); - - vnode_put(backing_vp); - - *pfree_blks = vfreeblks; - - return true; -} -#endif - -u_int32_t -hfs_freeblks(struct hfsmount * hfsmp, int wantreserve) -{ - u_int32_t freeblks; - u_int32_t rsrvblks; - u_int32_t loanblks; - - /* - * We don't bother taking the mount lock - * to look at these values since the values - * themselves are each updated atomically - * on aligned addresses. - */ - freeblks = hfsmp->freeBlocks; - rsrvblks = hfsmp->reserveBlocks; - loanblks = hfsmp->loanedBlocks + hfsmp->lockedBlocks; - if (wantreserve) { - if (freeblks > rsrvblks) - freeblks -= rsrvblks; - else - freeblks = 0; - } - if (freeblks > loanblks) - freeblks -= loanblks; - else - freeblks = 0; - -#if HFS_SPARSE_DEV - /* - * When the underlying device is sparse, check the - * available space on the backing store volume. - */ - uint64_t vfreeblks; - if (hfs_get_backing_free_blks(hfsmp, &vfreeblks)) - freeblks = MIN(freeblks, vfreeblks); -#endif /* HFS_SPARSE_DEV */ - - return (freeblks); -} - -/* - * Map HFS Common errors (negative) to BSD error codes (positive). - * Positive errors (ie BSD errors) are passed through unchanged. 
- */ -short MacToVFSError(OSErr err) -{ - if (err >= 0) - return err; - - /* BSD/VFS internal errnos */ - switch (err) { - case ERESERVEDNAME: /* -8 */ - return err; - } - - switch (err) { - case dskFulErr: /* -34 */ - case btNoSpaceAvail: /* -32733 */ - return ENOSPC; - case fxOvFlErr: /* -32750 */ - return EOVERFLOW; - - case btBadNode: /* -32731 */ - return EIO; - - case memFullErr: /* -108 */ - return ENOMEM; /* +12 */ - - case cmExists: /* -32718 */ - case btExists: /* -32734 */ - return EEXIST; /* +17 */ - - case cmNotFound: /* -32719 */ - case btNotFound: /* -32735 */ - return ENOENT; /* 28 */ - - case cmNotEmpty: /* -32717 */ - return ENOTEMPTY; /* 66 */ - - case cmFThdDirErr: /* -32714 */ - return EISDIR; /* 21 */ - - case fxRangeErr: /* -32751 */ - return ERANGE; - - case bdNamErr: /* -37 */ - return ENAMETOOLONG; /* 63 */ - - case paramErr: /* -50 */ - case fileBoundsErr: /* -1309 */ - return EINVAL; /* +22 */ - - case fsBTBadNodeSize: - return ENXIO; - - default: - return EIO; /* +5 */ - } -} - - -/* - * Find the current thread's directory hint for a given index. - * - * Requires an exclusive lock on directory cnode. - * - * Use detach if the cnode lock must be dropped while the hint is still active. - */ -__private_extern__ -directoryhint_t * -hfs_getdirhint(struct cnode *dcp, int index, int detach) -{ - struct timeval tv; - directoryhint_t *hint; - boolean_t need_remove, need_init; - const u_int8_t * name; - - microuptime(&tv); - - /* - * Look for an existing hint first. If not found, create a new one (when - * the list is not full) or recycle the oldest hint. Since new hints are - * always added to the head of the list, the last hint is always the - * oldest. 
- */ - TAILQ_FOREACH(hint, &dcp->c_hintlist, dh_link) { - if (hint->dh_index == index) - break; - } - if (hint != NULL) { /* found an existing hint */ - need_init = false; - need_remove = true; - } else { /* cannot find an existing hint */ - need_init = true; - if (dcp->c_dirhintcnt < HFS_MAXDIRHINTS) { /* we don't need recycling */ - /* Create a default directory hint */ - MALLOC_ZONE(hint, directoryhint_t *, sizeof(directoryhint_t), M_HFSDIRHINT, M_WAITOK); - ++dcp->c_dirhintcnt; - need_remove = false; - } else { /* recycle the last (i.e., the oldest) hint */ - hint = TAILQ_LAST(&dcp->c_hintlist, hfs_hinthead); - if ((hint->dh_desc.cd_flags & CD_HASBUF) && - (name = hint->dh_desc.cd_nameptr)) { - hint->dh_desc.cd_nameptr = NULL; - hint->dh_desc.cd_namelen = 0; - hint->dh_desc.cd_flags &= ~CD_HASBUF; - vfs_removename((const char *)name); - } - need_remove = true; - } - } - - if (need_remove) - TAILQ_REMOVE(&dcp->c_hintlist, hint, dh_link); - - if (detach) - --dcp->c_dirhintcnt; - else - TAILQ_INSERT_HEAD(&dcp->c_hintlist, hint, dh_link); - - if (need_init) { - hint->dh_index = index; - hint->dh_desc.cd_flags = 0; - hint->dh_desc.cd_encoding = 0; - hint->dh_desc.cd_namelen = 0; - hint->dh_desc.cd_nameptr = NULL; - hint->dh_desc.cd_parentcnid = dcp->c_fileid; - hint->dh_desc.cd_hint = dcp->c_childhint; - hint->dh_desc.cd_cnid = 0; - } - hint->dh_time = tv.tv_sec; - return (hint); -} - -/* - * Release a single directory hint. - * - * Requires an exclusive lock on directory cnode. 
- */ -__private_extern__ -void -hfs_reldirhint(struct cnode *dcp, directoryhint_t * relhint) -{ - const u_int8_t * name; - directoryhint_t *hint; - - /* Check if item is on list (could be detached) */ - TAILQ_FOREACH(hint, &dcp->c_hintlist, dh_link) { - if (hint == relhint) { - TAILQ_REMOVE(&dcp->c_hintlist, relhint, dh_link); - --dcp->c_dirhintcnt; - break; - } - } - name = relhint->dh_desc.cd_nameptr; - if ((relhint->dh_desc.cd_flags & CD_HASBUF) && (name != NULL)) { - relhint->dh_desc.cd_nameptr = NULL; - relhint->dh_desc.cd_namelen = 0; - relhint->dh_desc.cd_flags &= ~CD_HASBUF; - vfs_removename((const char *)name); - } - FREE_ZONE(relhint, sizeof(directoryhint_t), M_HFSDIRHINT); -} - -/* - * Release directory hints for given directory - * - * Requires an exclusive lock on directory cnode. - */ -__private_extern__ -void -hfs_reldirhints(struct cnode *dcp, int stale_hints_only) -{ - struct timeval tv; - directoryhint_t *hint, *prev; - const u_int8_t * name; - - if (stale_hints_only) - microuptime(&tv); - - /* searching from the oldest to the newest, so we can stop early when releasing stale hints only */ - for (hint = TAILQ_LAST(&dcp->c_hintlist, hfs_hinthead); hint != NULL; hint = prev) { - if (stale_hints_only && (tv.tv_sec - hint->dh_time) < HFS_DIRHINT_TTL) - break; /* stop here if this entry is too new */ - name = hint->dh_desc.cd_nameptr; - if ((hint->dh_desc.cd_flags & CD_HASBUF) && (name != NULL)) { - hint->dh_desc.cd_nameptr = NULL; - hint->dh_desc.cd_namelen = 0; - hint->dh_desc.cd_flags &= ~CD_HASBUF; - vfs_removename((const char *)name); - } - prev = TAILQ_PREV(hint, hfs_hinthead, dh_link); /* must save this pointer before calling FREE_ZONE on this node */ - TAILQ_REMOVE(&dcp->c_hintlist, hint, dh_link); - FREE_ZONE(hint, sizeof(directoryhint_t), M_HFSDIRHINT); - --dcp->c_dirhintcnt; - } -} - -/* - * Insert a detached directory hint back into the list of dirhints. - * - * Requires an exclusive lock on directory cnode. 
- */ -__private_extern__ -void -hfs_insertdirhint(struct cnode *dcp, directoryhint_t * hint) -{ - directoryhint_t *test; - - TAILQ_FOREACH(test, &dcp->c_hintlist, dh_link) { - if (test == hint) - panic("hfs_insertdirhint: hint %p already on list!", hint); - } - - TAILQ_INSERT_HEAD(&dcp->c_hintlist, hint, dh_link); - ++dcp->c_dirhintcnt; -} - -/* - * Perform a case-insensitive compare of two UTF-8 filenames. - * - * Returns 0 if the strings match. - */ -__private_extern__ -int -hfs_namecmp(const u_int8_t *str1, size_t len1, const u_int8_t *str2, size_t len2) -{ - u_int16_t *ustr1, *ustr2; - size_t ulen1, ulen2; - size_t maxbytes; - int cmp = -1; - - if (len1 != len2) - return (cmp); - - maxbytes = kHFSPlusMaxFileNameChars << 1; - MALLOC(ustr1, u_int16_t *, maxbytes << 1, M_TEMP, M_WAITOK); - ustr2 = ustr1 + (maxbytes >> 1); - - if (utf8_decodestr(str1, len1, ustr1, &ulen1, maxbytes, ':', 0) != 0) - goto out; - if (utf8_decodestr(str2, len2, ustr2, &ulen2, maxbytes, ':', 0) != 0) - goto out; - - cmp = FastUnicodeCompare(ustr1, ulen1>>1, ustr2, ulen2>>1); -out: - FREE(ustr1, M_TEMP); - return (cmp); -} - - -typedef struct jopen_cb_info { - off_t jsize; - char *desired_uuid; - struct vnode *jvp; - size_t blksize; - int need_clean; - int need_init; -} jopen_cb_info; - -static int -journal_open_cb(const char *bsd_dev_name, const char *uuid_str, void *arg) -{ - struct nameidata nd; - jopen_cb_info *ji = (jopen_cb_info *)arg; - char bsd_name[256]; - int error; - - strlcpy(&bsd_name[0], "/dev/", sizeof(bsd_name)); - strlcpy(&bsd_name[5], bsd_dev_name, sizeof(bsd_name)-5); - - if (ji->desired_uuid && ji->desired_uuid[0] && strcmp(uuid_str, ji->desired_uuid) != 0) { - return 1; // keep iterating - } - - // if we're here, either the desired uuid matched or there was no - // desired uuid so let's try to open the device for writing and - // see if it works. if it does, we'll use it. 
- - NDINIT(&nd, LOOKUP, OP_LOOKUP, LOCKLEAF, UIO_SYSSPACE32, CAST_USER_ADDR_T(bsd_name), vfs_context_kernel()); - if ((error = namei(&nd))) { - printf("hfs: journal open cb: error %d looking up device %s (dev uuid %s)\n", error, bsd_name, uuid_str); - return 1; // keep iterating - } - - ji->jvp = nd.ni_vp; - nameidone(&nd); - - if (ji->jvp == NULL) { - printf("hfs: journal open cb: did not find %s (error %d)\n", bsd_name, error); - } else { - error = VNOP_OPEN(ji->jvp, FREAD|FWRITE, vfs_context_kernel()); - if (error == 0) { - // if the journal is dirty and we didn't specify a desired - // journal device uuid, then do not use the journal. but - // if the journal is just invalid (e.g. it hasn't been - // initialized) then just set the need_init flag. - if (ji->need_clean && ji->desired_uuid && ji->desired_uuid[0] == '\0') { - error = journal_is_clean(ji->jvp, 0, ji->jsize, (void *)1, ji->blksize); - if (error == EBUSY) { - VNOP_CLOSE(ji->jvp, FREAD|FWRITE, vfs_context_kernel()); - vnode_put(ji->jvp); - ji->jvp = NULL; - return 1; // keep iterating - } else if (error == EINVAL) { - ji->need_init = 1; - } - } - - if (ji->desired_uuid && ji->desired_uuid[0] == '\0') { - strlcpy(ji->desired_uuid, uuid_str, 128); - } - vnode_setmountedon(ji->jvp); - return 0; // stop iterating - } else { - vnode_put(ji->jvp); - ji->jvp = NULL; - } - } - - return 1; // keep iterating -} - -extern void IOBSDIterateMediaWithContent(const char *uuid_cstring, int (*func)(const char *bsd_dev_name, const char *uuid_str, void *arg), void *arg); -kern_return_t IOBSDGetPlatformSerialNumber(char *serial_number_str, u_int32_t len); - - -static vnode_t -open_journal_dev(const char *vol_device, - int need_clean, - char *uuid_str, - char *machine_serial_num, - off_t jsize, - size_t blksize, - int *need_init) -{ - int retry_counter=0; - jopen_cb_info ji; - - ji.jsize = jsize; - ji.desired_uuid = uuid_str; - ji.jvp = NULL; - ji.blksize = blksize; - ji.need_clean = need_clean; - ji.need_init = 0; - -// if 
(uuid_str[0] == '\0') { -// printf("hfs: open journal dev: %s: locating any available non-dirty external journal partition\n", vol_device); -// } else { -// printf("hfs: open journal dev: %s: trying to find the external journal partition w/uuid %s\n", vol_device, uuid_str); -// } - while (ji.jvp == NULL && retry_counter++ < 4) { - if (retry_counter > 1) { - if (uuid_str[0]) { - printf("hfs: open_journal_dev: uuid %s not found. waiting 10sec.\n", uuid_str); - } else { - printf("hfs: open_journal_dev: no available external journal partition found. waiting 10sec.\n"); - } - delay_for_interval(10* 1000000, NSEC_PER_USEC); // wait for ten seconds and then try again - } - - IOBSDIterateMediaWithContent(EXTJNL_CONTENT_TYPE_UUID, journal_open_cb, &ji); - } - - if (ji.jvp == NULL) { - printf("hfs: volume: %s: did not find jnl device uuid: %s from machine serial number: %s\n", - vol_device, uuid_str, machine_serial_num); - } - - *need_init = ji.need_init; - - return ji.jvp; -} - - -int -hfs_early_journal_init(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, - void *_args, off_t embeddedOffset, daddr64_t mdb_offset, - HFSMasterDirectoryBlock *mdbp, kauth_cred_t cred) -{ - JournalInfoBlock *jibp; - struct buf *jinfo_bp, *bp; - int sectors_per_fsblock, arg_flags=0, arg_tbufsz=0; - int retval, write_jibp = 0; - uint32_t blksize = hfsmp->hfs_logical_block_size; - struct vnode *devvp; - struct hfs_mount_args *args = _args; - u_int32_t jib_flags; - u_int64_t jib_offset; - u_int64_t jib_size; - const char *dev_name; - - devvp = hfsmp->hfs_devvp; - dev_name = vnode_getname_printable(devvp); - - if (args != NULL && (args->flags & HFSFSMNT_EXTENDED_ARGS)) { - arg_flags = args->journal_flags; - arg_tbufsz = args->journal_tbuffer_size; - } - - sectors_per_fsblock = SWAP_BE32(vhp->blockSize) / blksize; - - jinfo_bp = NULL; - retval = (int)buf_meta_bread(devvp, - (daddr64_t)((embeddedOffset/blksize) + - ((u_int64_t)SWAP_BE32(vhp->journalInfoBlock)*sectors_per_fsblock)), - 
hfsmp->hfs_physical_block_size, cred, &jinfo_bp); - if (retval) { - if (jinfo_bp) { - buf_brelse(jinfo_bp); - } - goto cleanup_dev_name; - } - - jibp = (JournalInfoBlock *)buf_dataptr(jinfo_bp); - jib_flags = SWAP_BE32(jibp->flags); - jib_size = SWAP_BE64(jibp->size); - - if (jib_flags & kJIJournalInFSMask) { - hfsmp->jvp = hfsmp->hfs_devvp; - jib_offset = SWAP_BE64(jibp->offset); - } else { - int need_init=0; - - // if the volume was unmounted cleanly then we'll pick any - // available external journal partition - // - if (SWAP_BE32(vhp->attributes) & kHFSVolumeUnmountedMask) { - *((char *)&jibp->ext_jnl_uuid[0]) = '\0'; - } - - hfsmp->jvp = open_journal_dev(dev_name, - !(jib_flags & kJIJournalNeedInitMask), - (char *)&jibp->ext_jnl_uuid[0], - (char *)&jibp->machine_serial_num[0], - jib_size, - hfsmp->hfs_logical_block_size, - &need_init); - if (hfsmp->jvp == NULL) { - buf_brelse(jinfo_bp); - retval = EROFS; - goto cleanup_dev_name; - } else { - if (IOBSDGetPlatformSerialNumber(&jibp->machine_serial_num[0], sizeof(jibp->machine_serial_num)) != KERN_SUCCESS) { - strlcpy(&jibp->machine_serial_num[0], "unknown-machine-uuid", sizeof(jibp->machine_serial_num)); - } - } - - jib_offset = 0; - write_jibp = 1; - if (need_init) { - jib_flags |= kJIJournalNeedInitMask; - } - } - - // save this off for the hack-y check in hfs_remove() - hfsmp->jnl_start = jib_offset / SWAP_BE32(vhp->blockSize); - hfsmp->jnl_size = jib_size; - - if ((hfsmp->hfs_flags & HFS_READ_ONLY) && (vfs_flags(hfsmp->hfs_mp) & MNT_ROOTFS) == 0) { - // if the file system is read-only, check if the journal is empty. - // if it is, then we can allow the mount. otherwise we have to - // return failure. 
- retval = journal_is_clean(hfsmp->jvp, - jib_offset + embeddedOffset, - jib_size, - devvp, - hfsmp->hfs_logical_block_size); - - hfsmp->jnl = NULL; - - buf_brelse(jinfo_bp); - - if (retval) { - const char *name = vnode_getname_printable(devvp); - printf("hfs: early journal init: volume on %s is read-only and journal is dirty. Can not mount volume.\n", - name); - vnode_putname_printable(name); - } - - goto cleanup_dev_name; - } - - if (jib_flags & kJIJournalNeedInitMask) { - printf("hfs: Initializing the journal (joffset 0x%llx sz 0x%llx)...\n", - jib_offset + embeddedOffset, jib_size); - hfsmp->jnl = journal_create(hfsmp->jvp, - jib_offset + embeddedOffset, - jib_size, - devvp, - blksize, - arg_flags, - arg_tbufsz, - hfs_sync_metadata, hfsmp->hfs_mp, - hfsmp->hfs_mp); - if (hfsmp->jnl) - journal_trim_set_callback(hfsmp->jnl, hfs_trim_callback, hfsmp); - - // no need to start a transaction here... if this were to fail - // we'd just re-init it on the next mount. - jib_flags &= ~kJIJournalNeedInitMask; - jibp->flags = SWAP_BE32(jib_flags); - buf_bwrite(jinfo_bp); - jinfo_bp = NULL; - jibp = NULL; - } else { - //printf("hfs: Opening the journal (joffset 0x%llx sz 0x%llx vhp_blksize %d)...\n", - // jib_offset + embeddedOffset, - // jib_size, SWAP_BE32(vhp->blockSize)); - - hfsmp->jnl = journal_open(hfsmp->jvp, - jib_offset + embeddedOffset, - jib_size, - devvp, - blksize, - arg_flags, - arg_tbufsz, - hfs_sync_metadata, hfsmp->hfs_mp, - hfsmp->hfs_mp); - if (hfsmp->jnl) - journal_trim_set_callback(hfsmp->jnl, hfs_trim_callback, hfsmp); - - if (write_jibp) { - buf_bwrite(jinfo_bp); - } else { - buf_brelse(jinfo_bp); - } - jinfo_bp = NULL; - jibp = NULL; - - if (hfsmp->jnl && mdbp) { - // reload the mdb because it could have changed - // if the journal had to be replayed. 
- if (mdb_offset == 0) { - mdb_offset = (daddr64_t)((embeddedOffset / blksize) + HFS_PRI_SECTOR(blksize)); - } - bp = NULL; - retval = (int)buf_meta_bread(devvp, - HFS_PHYSBLK_ROUNDDOWN(mdb_offset, hfsmp->hfs_log_per_phys), - hfsmp->hfs_physical_block_size, cred, &bp); - if (retval) { - if (bp) { - buf_brelse(bp); - } - printf("hfs: failed to reload the mdb after opening the journal (retval %d)!\n", - retval); - goto cleanup_dev_name; - } - bcopy((char *)buf_dataptr(bp) + HFS_PRI_OFFSET(hfsmp->hfs_physical_block_size), mdbp, 512); - buf_brelse(bp); - bp = NULL; - } - } - - // if we expected the journal to be there and we couldn't - // create it or open it then we have to bail out. - if (hfsmp->jnl == NULL) { - printf("hfs: early jnl init: failed to open/create the journal (retval %d).\n", retval); - retval = EINVAL; - goto cleanup_dev_name; - } - - retval = 0; - -cleanup_dev_name: - vnode_putname_printable(dev_name); - return retval; -} - - -// -// This function will go and re-locate the .journal_info_block and -// the .journal files in case they moved (which can happen if you -// run Norton SpeedDisk). If we fail to find either file we just -// disable journaling for this volume and return. We turn off the -// journaling bit in the vcb and assume it will get written to disk -// later (if it doesn't on the next mount we'd do the same thing -// again which is harmless). If we disable journaling we don't -// return an error so that the volume is still mountable. -// -// If the info we find for the .journal_info_block and .journal files -// isn't what we had stored, we re-set our cached info and proceed -// with opening the journal normally. 
-// -static int -hfs_late_journal_init(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, void *_args) -{ - JournalInfoBlock *jibp; - struct buf *jinfo_bp; - int sectors_per_fsblock, arg_flags=0, arg_tbufsz=0; - int retval, write_jibp = 0, recreate_journal = 0; - struct vnode *devvp; - struct cat_attr jib_attr, jattr; - struct cat_fork jib_fork, jfork; - ExtendedVCB *vcb; - u_int32_t fid; - struct hfs_mount_args *args = _args; - u_int32_t jib_flags; - u_int64_t jib_offset; - u_int64_t jib_size; - - devvp = hfsmp->hfs_devvp; - vcb = HFSTOVCB(hfsmp); - - if (args != NULL && (args->flags & HFSFSMNT_EXTENDED_ARGS)) { - if (args->journal_disable) { - return 0; - } - - arg_flags = args->journal_flags; - arg_tbufsz = args->journal_tbuffer_size; - } - - fid = GetFileInfo(vcb, kRootDirID, ".journal_info_block", &jib_attr, &jib_fork); - if (fid == 0 || jib_fork.cf_extents[0].startBlock == 0 || jib_fork.cf_size == 0) { - printf("hfs: can't find the .journal_info_block! disabling journaling (start: %d).\n", - jib_fork.cf_extents[0].startBlock); - vcb->vcbAtrb &= ~kHFSVolumeJournaledMask; - return 0; - } - hfsmp->hfs_jnlinfoblkid = fid; - - // make sure the journal_info_block begins where we think it should. - if (SWAP_BE32(vhp->journalInfoBlock) != jib_fork.cf_extents[0].startBlock) { - printf("hfs: The journal_info_block moved (was: %d; is: %d). 
Fixing up\n", - SWAP_BE32(vhp->journalInfoBlock), jib_fork.cf_extents[0].startBlock); - - vcb->vcbJinfoBlock = jib_fork.cf_extents[0].startBlock; - vhp->journalInfoBlock = SWAP_BE32(jib_fork.cf_extents[0].startBlock); - recreate_journal = 1; - } - - - sectors_per_fsblock = SWAP_BE32(vhp->blockSize) / hfsmp->hfs_logical_block_size; - jinfo_bp = NULL; - retval = (int)buf_meta_bread(devvp, - (vcb->hfsPlusIOPosOffset / hfsmp->hfs_logical_block_size + - ((u_int64_t)SWAP_BE32(vhp->journalInfoBlock)*sectors_per_fsblock)), - hfsmp->hfs_physical_block_size, NOCRED, &jinfo_bp); - if (retval) { - if (jinfo_bp) { - buf_brelse(jinfo_bp); - } - printf("hfs: can't read journal info block. disabling journaling.\n"); - vcb->vcbAtrb &= ~kHFSVolumeJournaledMask; - return 0; - } - - jibp = (JournalInfoBlock *)buf_dataptr(jinfo_bp); - jib_flags = SWAP_BE32(jibp->flags); - jib_offset = SWAP_BE64(jibp->offset); - jib_size = SWAP_BE64(jibp->size); - - fid = GetFileInfo(vcb, kRootDirID, ".journal", &jattr, &jfork); - if (fid == 0 || jfork.cf_extents[0].startBlock == 0 || jfork.cf_size == 0) { - printf("hfs: can't find the journal file! disabling journaling (start: %d)\n", - jfork.cf_extents[0].startBlock); - buf_brelse(jinfo_bp); - vcb->vcbAtrb &= ~kHFSVolumeJournaledMask; - return 0; - } - hfsmp->hfs_jnlfileid = fid; - - // make sure the journal file begins where we think it should. - if ((jib_flags & kJIJournalInFSMask) && (jib_offset / (u_int64_t)vcb->blockSize) != jfork.cf_extents[0].startBlock) { - printf("hfs: The journal file moved (was: %lld; is: %d). Fixing up\n", - (jib_offset / (u_int64_t)vcb->blockSize), jfork.cf_extents[0].startBlock); - - jib_offset = (u_int64_t)jfork.cf_extents[0].startBlock * (u_int64_t)vcb->blockSize; - write_jibp = 1; - recreate_journal = 1; - } - - // check the size of the journal file. - if (jib_size != (u_int64_t)jfork.cf_extents[0].blockCount*vcb->blockSize) { - printf("hfs: The journal file changed size! (was %lld; is %lld). 
Fixing up.\n", - jib_size, (u_int64_t)jfork.cf_extents[0].blockCount*vcb->blockSize); - - jib_size = (u_int64_t)jfork.cf_extents[0].blockCount * vcb->blockSize; - write_jibp = 1; - recreate_journal = 1; - } - - if (jib_flags & kJIJournalInFSMask) { - hfsmp->jvp = hfsmp->hfs_devvp; - jib_offset += (off_t)vcb->hfsPlusIOPosOffset; - } else { - const char *dev_name; - int need_init = 0; - - dev_name = vnode_getname_printable(devvp); - - // since the journal is empty, just use any available external journal - *((char *)&jibp->ext_jnl_uuid[0]) = '\0'; - - // this fills in the uuid of the device we actually get - hfsmp->jvp = open_journal_dev(dev_name, - !(jib_flags & kJIJournalNeedInitMask), - (char *)&jibp->ext_jnl_uuid[0], - (char *)&jibp->machine_serial_num[0], - jib_size, - hfsmp->hfs_logical_block_size, - &need_init); - if (hfsmp->jvp == NULL) { - buf_brelse(jinfo_bp); - vnode_putname_printable(dev_name); - return EROFS; - } else { - if (IOBSDGetPlatformSerialNumber(&jibp->machine_serial_num[0], sizeof(jibp->machine_serial_num)) != KERN_SUCCESS) { - strlcpy(&jibp->machine_serial_num[0], "unknown-machine-serial-num", sizeof(jibp->machine_serial_num)); - } - } - jib_offset = 0; - recreate_journal = 1; - write_jibp = 1; - if (need_init) { - jib_flags |= kJIJournalNeedInitMask; - } - vnode_putname_printable(dev_name); - } - - // save this off for the hack-y check in hfs_remove() - hfsmp->jnl_start = jib_offset / SWAP_BE32(vhp->blockSize); - hfsmp->jnl_size = jib_size; - - if ((hfsmp->hfs_flags & HFS_READ_ONLY) && (vfs_flags(hfsmp->hfs_mp) & MNT_ROOTFS) == 0) { - // if the file system is read-only, check if the journal is empty. - // if it is, then we can allow the mount. otherwise we have to - // return failure. 
- retval = journal_is_clean(hfsmp->jvp, - jib_offset, - jib_size, - devvp, - hfsmp->hfs_logical_block_size); - - hfsmp->jnl = NULL; - - buf_brelse(jinfo_bp); - - if (retval) { - const char *name = vnode_getname_printable(devvp); - printf("hfs: late journal init: volume on %s is read-only and journal is dirty. Can not mount volume.\n", - name); - vnode_putname_printable(name); - } - - return retval; - } - - if ((jib_flags & kJIJournalNeedInitMask) || recreate_journal) { - printf("hfs: Initializing the journal (joffset 0x%llx sz 0x%llx)...\n", - jib_offset, jib_size); - hfsmp->jnl = journal_create(hfsmp->jvp, - jib_offset, - jib_size, - devvp, - hfsmp->hfs_logical_block_size, - arg_flags, - arg_tbufsz, - hfs_sync_metadata, hfsmp->hfs_mp, - hfsmp->hfs_mp); - if (hfsmp->jnl) - journal_trim_set_callback(hfsmp->jnl, hfs_trim_callback, hfsmp); - - // no need to start a transaction here... if this were to fail - // we'd just re-init it on the next mount. - jib_flags &= ~kJIJournalNeedInitMask; - write_jibp = 1; - - } else { - // - // if we weren't the last person to mount this volume - // then we need to throw away the journal because it - // is likely that someone else mucked with the disk. - // if the journal is empty this is no big deal. if the - // disk is dirty this prevents us from replaying the - // journal over top of changes that someone else made. 
- // - arg_flags |= JOURNAL_RESET; - - //printf("hfs: Opening the journal (joffset 0x%llx sz 0x%llx vhp_blksize %d)...\n", - // jib_offset, - // jib_size, SWAP_BE32(vhp->blockSize)); - - hfsmp->jnl = journal_open(hfsmp->jvp, - jib_offset, - jib_size, - devvp, - hfsmp->hfs_logical_block_size, - arg_flags, - arg_tbufsz, - hfs_sync_metadata, hfsmp->hfs_mp, - hfsmp->hfs_mp); - if (hfsmp->jnl) - journal_trim_set_callback(hfsmp->jnl, hfs_trim_callback, hfsmp); - } - - - if (write_jibp) { - jibp->flags = SWAP_BE32(jib_flags); - jibp->offset = SWAP_BE64(jib_offset); - jibp->size = SWAP_BE64(jib_size); - - buf_bwrite(jinfo_bp); - } else { - buf_brelse(jinfo_bp); - } - jinfo_bp = NULL; - jibp = NULL; - - // if we expected the journal to be there and we couldn't - // create it or open it then we have to bail out. - if (hfsmp->jnl == NULL) { - printf("hfs: late jnl init: failed to open/create the journal (retval %d).\n", retval); - return EINVAL; - } - - return 0; -} - -/* - * Calculate the allocation zone for metadata. - * - * This zone includes the following: - * Allocation Bitmap file - * Overflow Extents file - * Journal file - * Quota files - * Clustered Hot files - * Catalog file - * - * METADATA ALLOCATION ZONE - * ____________________________________________________________________________ - * | | | | | | | - * | BM | JF | OEF | CATALOG |---> | HOT FILES | - * |____|____|_____|_______________|______________________________|___________| - * - * <------------------------------- N * 128 MB -------------------------------> - * - */ -#define GIGABYTE (u_int64_t)(1024*1024*1024) - -#define OVERFLOW_DEFAULT_SIZE (4*1024*1024) -#define OVERFLOW_MAXIMUM_SIZE (128*1024*1024) -#define JOURNAL_DEFAULT_SIZE (8*1024*1024) -#define JOURNAL_MAXIMUM_SIZE (512*1024*1024) -#define HOTBAND_MINIMUM_SIZE (10*1024*1024) -#define HOTBAND_MAXIMUM_SIZE (512*1024*1024) - -/* Initialize the metadata zone. 
- * - * If the size of the volume is less than the minimum size for - * metadata zone, metadata zone is disabled. - * - * If disable is true, disable metadata zone unconditionally. - */ -void -hfs_metadatazone_init(struct hfsmount *hfsmp, int disable) -{ - ExtendedVCB *vcb; - u_int64_t fs_size; - u_int64_t zonesize; - u_int64_t temp; - u_int64_t filesize; - u_int32_t blk; - int items, really_do_it=1; - - vcb = HFSTOVCB(hfsmp); - fs_size = (u_int64_t)vcb->blockSize * (u_int64_t)vcb->allocLimit; - - /* - * For volumes less than 10 GB, don't bother. - */ - if (fs_size < ((u_int64_t)10 * GIGABYTE)) { - really_do_it = 0; - } - - /* - * Skip non-journaled volumes as well. - */ - if (hfsmp->jnl == NULL) { - really_do_it = 0; - } - - /* If caller wants to disable metadata zone, do it */ - if (disable == true) { - really_do_it = 0; - } - - /* - * Start with space for the boot blocks and Volume Header. - * 1536 = byte offset from start of volume to end of volume header: - * 1024 bytes is the offset from the start of the volume to the - * start of the volume header (defined by the volume format) - * + 512 bytes (the size of the volume header). - */ - zonesize = roundup(1536, hfsmp->blockSize); - - /* - * Add the on-disk size of allocation bitmap. - */ - zonesize += hfsmp->hfs_allocation_cp->c_datafork->ff_blocks * hfsmp->blockSize; - - /* - * Add space for the Journal Info Block and Journal (if they're in - * this file system). - */ - if (hfsmp->jnl && hfsmp->jvp == hfsmp->hfs_devvp) { - zonesize += hfsmp->blockSize + hfsmp->jnl_size; - } - - /* - * Add the existing size of the Extents Overflow B-tree. - * (It rarely grows, so don't bother reserving additional room for it.) - */ - zonesize += hfs_blk_to_bytes(hfsmp->hfs_extents_cp->c_datafork->ff_blocks, hfsmp->blockSize); - - /* - * If there is an Attributes B-tree, leave room for 11 clumps worth. - * newfs_hfs allocates one clump, and leaves a gap of 10 clumps. 
- * When installing a full OS install onto a 20GB volume, we use - * 7 to 8 clumps worth of space (depending on packages), so that leaves - * us with another 3 or 4 clumps worth before we need another extent. - */ - if (hfsmp->hfs_attribute_cp) { - zonesize += 11 * hfsmp->hfs_attribute_cp->c_datafork->ff_clumpsize; - } - - /* - * Leave room for 11 clumps of the Catalog B-tree. - * Again, newfs_hfs allocates one clump plus a gap of 10 clumps. - * When installing a full OS install onto a 20GB volume, we use - * 7 to 8 clumps worth of space (depending on packages), so that leaves - * us with another 3 or 4 clumps worth before we need another extent. - */ - zonesize += 11 * hfsmp->hfs_catalog_cp->c_datafork->ff_clumpsize; - - /* - * Add space for hot file region. - * - * ...for now, use 5 MB per 1 GB (0.5 %) - */ - filesize = (fs_size / 1024) * 5; - if (filesize > HOTBAND_MAXIMUM_SIZE) - filesize = HOTBAND_MAXIMUM_SIZE; - else if (filesize < HOTBAND_MINIMUM_SIZE) - filesize = HOTBAND_MINIMUM_SIZE; - /* - * Calculate user quota file requirements. - */ - if (hfsmp->hfs_flags & HFS_QUOTAS) { - items = QF_USERS_PER_GB * (fs_size / GIGABYTE); - if (items < QF_MIN_USERS) - items = QF_MIN_USERS; - else if (items > QF_MAX_USERS) - items = QF_MAX_USERS; - if (!powerof2(items)) { - int x = items; - items = 4; - while (x>>1 != 1) { - x = x >> 1; - items = items << 1; - } - } - filesize += (items + 1) * sizeof(struct dqblk); - /* - * Calculate group quota file requirements. - * - */ - items = QF_GROUPS_PER_GB * (fs_size / GIGABYTE); - if (items < QF_MIN_GROUPS) - items = QF_MIN_GROUPS; - else if (items > QF_MAX_GROUPS) - items = QF_MAX_GROUPS; - if (!powerof2(items)) { - int x = items; - items = 4; - while (x>>1 != 1) { - x = x >> 1; - items = items << 1; - } - } - filesize += (items + 1) * sizeof(struct dqblk); - } - zonesize += filesize; - - /* - * Round up entire zone to a bitmap block's worth. - * The extra space goes to the catalog file and hot file area. 
- */ - temp = zonesize; - zonesize = roundup(zonesize, (u_int64_t)vcb->vcbVBMIOSize * 8 * vcb->blockSize); - hfsmp->hfs_min_alloc_start = zonesize / vcb->blockSize; - /* - * If doing the round up for hfs_min_alloc_start would push us past - * allocLimit, then just reset it back to 0. Though using a value - * bigger than allocLimit would not cause damage in the block allocator - * code, this value could get stored in the volume header and make it out - * to disk, making the volume header technically corrupt. - */ - if (hfsmp->hfs_min_alloc_start >= hfsmp->allocLimit) { - hfsmp->hfs_min_alloc_start = 0; - } - - if (really_do_it == 0) { - /* If metadata zone needs to be disabled because the - * volume was truncated, clear the bit and zero out - * the values that are no longer needed. - */ - if (hfsmp->hfs_flags & HFS_METADATA_ZONE) { - /* Disable metadata zone */ - hfsmp->hfs_flags &= ~HFS_METADATA_ZONE; - - /* Zero out mount point values that are not required */ - hfsmp->hfs_catalog_maxblks = 0; - hfsmp->hfs_hotfile_maxblks = 0; - hfsmp->hfs_hotfile_start = 0; - hfsmp->hfs_hotfile_end = 0; - hfsmp->hfs_hotfile_freeblks = 0; - hfsmp->hfs_metazone_start = 0; - hfsmp->hfs_metazone_end = 0; - } - - return; - } - - temp = zonesize - temp; /* temp has extra space */ - filesize += temp / 3; - hfsmp->hfs_catalog_maxblks += (temp - (temp / 3)) / vcb->blockSize; - - if (hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) { - hfsmp->hfs_hotfile_maxblks = (uint32_t) (hfsmp->hfs_cs_hotfile_size / HFSTOVCB(hfsmp)->blockSize); - } else { - hfsmp->hfs_hotfile_maxblks = filesize / vcb->blockSize; - } - - /* Convert to allocation blocks. */ - blk = zonesize / vcb->blockSize; - - /* The default metadata zone location is at the start of volume. */ - hfsmp->hfs_metazone_start = 1; - hfsmp->hfs_metazone_end = blk - 1; - - /* The default hotfile area is at the end of the zone. 
*/ - if (vfs_flags(HFSTOVFS(hfsmp)) & MNT_ROOTFS) { - hfsmp->hfs_hotfile_start = blk - (filesize / vcb->blockSize); - hfsmp->hfs_hotfile_end = hfsmp->hfs_metazone_end; - hfsmp->hfs_hotfile_freeblks = hfs_hotfile_freeblocks(hfsmp); - } - else { - hfsmp->hfs_hotfile_start = 0; - hfsmp->hfs_hotfile_end = 0; - hfsmp->hfs_hotfile_freeblks = 0; - } -#if DEBUG - printf("hfs:%s: metadata zone is %d to %d\n", hfsmp->vcbVN, hfsmp->hfs_metazone_start, hfsmp->hfs_metazone_end); - printf("hfs:%s: hot file band is %d to %d\n", hfsmp->vcbVN, hfsmp->hfs_hotfile_start, hfsmp->hfs_hotfile_end); - printf("hfs:%s: hot file band free blocks = %d\n", hfsmp->vcbVN, hfsmp->hfs_hotfile_freeblks); -#endif - - hfsmp->hfs_flags |= HFS_METADATA_ZONE; -} - - -static u_int32_t -hfs_hotfile_freeblocks(struct hfsmount *hfsmp) -{ - ExtendedVCB *vcb = HFSTOVCB(hfsmp); - int lockflags; - int freeblocks; - - if (hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) { - // - // This is only used at initialization time and on an ssd - // we'll get the real info from the hotfile btree user - // info - // - return 0; - } - - lockflags = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_EXCLUSIVE_LOCK); - freeblocks = MetaZoneFreeBlocks(vcb); - hfs_systemfile_unlock(hfsmp, lockflags); - - /* Minus Extents overflow file reserve. */ - if ((uint32_t)hfsmp->hfs_overflow_maxblks >= VTOF(hfsmp->hfs_extents_vp)->ff_blocks) { - freeblocks -= hfsmp->hfs_overflow_maxblks - VTOF(hfsmp->hfs_extents_vp)->ff_blocks; - } - - /* Minus catalog file reserve. */ - if ((uint32_t)hfsmp->hfs_catalog_maxblks >= VTOF(hfsmp->hfs_catalog_vp)->ff_blocks) { - freeblocks -= hfsmp->hfs_catalog_maxblks - VTOF(hfsmp->hfs_catalog_vp)->ff_blocks; - } - - if (freeblocks < 0) - freeblocks = 0; - - // printf("hfs: hotfile_freeblocks: MIN(%d, %d) = %d\n", freeblocks, hfsmp->hfs_hotfile_maxblks, MIN(freeblocks, hfsmp->hfs_hotfile_maxblks)); - return MIN(freeblocks, hfsmp->hfs_hotfile_maxblks); -} - -/* - * Determine if a file is a "virtual" metadata file. 
- * This includes journal and quota files. - */ -int -hfs_virtualmetafile(struct cnode *cp) -{ - const char * filename; - - - if (cp->c_parentcnid != kHFSRootFolderID) - return (0); - - filename = (const char *)cp->c_desc.cd_nameptr; - if (filename == NULL) - return (0); - - if ((strncmp(filename, ".journal", sizeof(".journal")) == 0) || - (strncmp(filename, ".journal_info_block", sizeof(".journal_info_block")) == 0) || - (strncmp(filename, ".quota.user", sizeof(".quota.user")) == 0) || - (strncmp(filename, ".quota.group", sizeof(".quota.group")) == 0) || - (strncmp(filename, ".hotfiles.btree", sizeof(".hotfiles.btree")) == 0)) - return (1); - - return (0); -} - -__private_extern__ -void hfs_syncer_lock(struct hfsmount *hfsmp) -{ - hfs_lock_mount(hfsmp); -} - -__private_extern__ -void hfs_syncer_unlock(struct hfsmount *hfsmp) -{ - hfs_unlock_mount(hfsmp); -} - -__private_extern__ -void hfs_syncer_wait(struct hfsmount *hfsmp) -{ - msleep(&hfsmp->hfs_sync_incomplete, &hfsmp->hfs_mutex, PWAIT, - "hfs_syncer_wait", NULL); -} - -__private_extern__ -void hfs_syncer_wakeup(struct hfsmount *hfsmp) -{ - wakeup(&hfsmp->hfs_sync_incomplete); -} - -__private_extern__ -uint64_t hfs_usecs_to_deadline(uint64_t usecs) -{ - uint64_t deadline; - clock_interval_to_deadline(usecs, NSEC_PER_USEC, &deadline); - return deadline; -} - -__private_extern__ -void hfs_syncer_queue(thread_call_t syncer) -{ - if (thread_call_enter_delayed_with_leeway(syncer, - NULL, - hfs_usecs_to_deadline(HFS_META_DELAY), - 0, - THREAD_CALL_DELAY_SYS_BACKGROUND)) { - printf("hfs: syncer already scheduled!\n"); - } -} - -// -// Fire off a timed callback to sync the disk if the -// volume is on ejectable media. 
-// - __private_extern__ -void -hfs_sync_ejectable(struct hfsmount *hfsmp) -{ - // If we don't have a syncer or we get called by the syncer, just return - if (!hfsmp->hfs_syncer || current_thread() == hfsmp->hfs_syncer_thread) - return; - - hfs_syncer_lock(hfsmp); - - if (!timerisset(&hfsmp->hfs_sync_req_oldest)) - microuptime(&hfsmp->hfs_sync_req_oldest); - - /* If hfs_unmount is running, it will set hfs_syncer to NULL. Also we - don't want to queue again if there is a sync outstanding. */ - if (!hfsmp->hfs_syncer || hfsmp->hfs_sync_incomplete) { - hfs_syncer_unlock(hfsmp); - return; - } - - hfsmp->hfs_sync_incomplete = TRUE; - - thread_call_t syncer = hfsmp->hfs_syncer; - - hfs_syncer_unlock(hfsmp); - - hfs_syncer_queue(syncer); -} - -int -hfs_start_transaction(struct hfsmount *hfsmp) -{ - int ret = 0, unlock_on_err = 0; - thread_t thread = current_thread(); - -#ifdef HFS_CHECK_LOCK_ORDER - /* - * You cannot start a transaction while holding a system - * file lock. (unless the transaction is nested.) - */ - if (hfsmp->jnl && journal_owner(hfsmp->jnl) != thread) { - if (hfsmp->hfs_catalog_cp && hfsmp->hfs_catalog_cp->c_lockowner == thread) { - panic("hfs_start_transaction: bad lock order (cat before jnl)\n"); - } - if (hfsmp->hfs_attribute_cp && hfsmp->hfs_attribute_cp->c_lockowner == thread) { - panic("hfs_start_transaction: bad lock order (attr before jnl)\n"); - } - if (hfsmp->hfs_extents_cp && hfsmp->hfs_extents_cp->c_lockowner == thread) { - panic("hfs_start_transaction: bad lock order (ext before jnl)\n"); - } - } -#endif /* HFS_CHECK_LOCK_ORDER */ - -again: - - if (hfsmp->jnl) { - if (journal_owner(hfsmp->jnl) != thread) { - /* - * The global lock should be held shared if journal is - * active to prevent disabling. If we're not the owner - * of the journal lock, verify that we're not already - * holding the global lock exclusive before moving on. 
- */ - if (hfsmp->hfs_global_lockowner == thread) { - ret = EBUSY; - goto out; - } - - hfs_lock_global (hfsmp, HFS_SHARED_LOCK); - - // Things could have changed - if (!hfsmp->jnl) { - hfs_unlock_global(hfsmp); - goto again; - } - - OSAddAtomic(1, (SInt32 *)&hfsmp->hfs_active_threads); - unlock_on_err = 1; - } - } else { - // No journal - if (hfsmp->hfs_global_lockowner != thread) { - hfs_lock_global(hfsmp, HFS_EXCLUSIVE_LOCK); - - // Things could have changed - if (hfsmp->jnl) { - hfs_unlock_global(hfsmp); - goto again; - } - - OSAddAtomic(1, (SInt32 *)&hfsmp->hfs_active_threads); - unlock_on_err = 1; - } - } - - /* If a downgrade to read-only mount is in progress, no other - * thread than the downgrade thread is allowed to modify - * the file system. - */ - if ((hfsmp->hfs_flags & HFS_RDONLY_DOWNGRADE) && - hfsmp->hfs_downgrading_thread != thread) { - ret = EROFS; - goto out; - } - - if (hfsmp->jnl) { - ret = journal_start_transaction(hfsmp->jnl); - } else { - ret = 0; - } - - if (ret == 0) - ++hfsmp->hfs_transaction_nesting; - -out: - if (ret != 0 && unlock_on_err) { - hfs_unlock_global (hfsmp); - OSAddAtomic(-1, (SInt32 *)&hfsmp->hfs_active_threads); - } - - return ret; -} - -int -hfs_end_transaction(struct hfsmount *hfsmp) -{ - int ret; - - assert(!hfsmp->jnl || journal_owner(hfsmp->jnl) == current_thread()); - assert(hfsmp->hfs_transaction_nesting > 0); - - if (hfsmp->jnl && hfsmp->hfs_transaction_nesting == 1) - hfs_flushvolumeheader(hfsmp, HFS_FVH_FLUSH_IF_DIRTY); - - bool need_unlock = !--hfsmp->hfs_transaction_nesting; - - if (hfsmp->jnl) { - ret = journal_end_transaction(hfsmp->jnl); - } else { - ret = 0; - } - - if (need_unlock) { - OSAddAtomic(-1, (SInt32 *)&hfsmp->hfs_active_threads); - hfs_unlock_global (hfsmp); - hfs_sync_ejectable(hfsmp); - } - - return ret; -} - - -void -hfs_journal_lock(struct hfsmount *hfsmp) -{ - /* Only peek at hfsmp->jnl while holding the global lock */ - hfs_lock_global (hfsmp, HFS_SHARED_LOCK); - if (hfsmp->jnl) { - 
journal_lock(hfsmp->jnl); - } - hfs_unlock_global (hfsmp); -} - -void -hfs_journal_unlock(struct hfsmount *hfsmp) -{ - /* Only peek at hfsmp->jnl while holding the global lock */ - hfs_lock_global (hfsmp, HFS_SHARED_LOCK); - if (hfsmp->jnl) { - journal_unlock(hfsmp->jnl); - } - hfs_unlock_global (hfsmp); -} - -/* - * Flush the contents of the journal to the disk. - * - * - HFS_FLUSH_JOURNAL - * Wait to write in-memory journal to the disk consistently. - * This means that the journal still contains uncommitted - * transactions and the file system metadata blocks in - * the journal transactions might be written asynchronously - * to the disk. But there is no guarantee that they are - * written to the disk before returning to the caller. - * Note that this option is sufficient for file system - * data integrity as it guarantees consistent journal - * content on the disk. - * - * - HFS_FLUSH_JOURNAL_META - * Wait to write in-memory journal to the disk - * consistently, and also wait to write all asynchronous - * metadata blocks to its corresponding locations - * consistently on the disk. This is overkill in normal - * scenarios but is useful whenever the metadata blocks - * are required to be consistent on-disk instead of - * just the journalbeing consistent; like before live - * verification and live volume resizing. The update of the - * metadata doesn't include a barrier of track cache flush. - * - * - HFS_FLUSH_FULL - * HFS_FLUSH_JOURNAL + force a track cache flush to media - * - * - HFS_FLUSH_CACHE - * Force a track cache flush to media. 
- * - * - HFS_FLUSH_BARRIER - * Barrier-only flush to ensure write order - * - */ -errno_t hfs_flush(struct hfsmount *hfsmp, hfs_flush_mode_t mode) -{ - errno_t error = 0; - journal_flush_options_t options = 0; - dk_synchronize_t sync_req = { .options = DK_SYNCHRONIZE_OPTION_BARRIER }; - - switch (mode) { - case HFS_FLUSH_JOURNAL_META: - // wait for journal, metadata blocks and previous async flush to finish - SET(options, JOURNAL_WAIT_FOR_IO); - - // no break - - case HFS_FLUSH_JOURNAL: - case HFS_FLUSH_JOURNAL_BARRIER: - case HFS_FLUSH_FULL: - - if (mode == HFS_FLUSH_JOURNAL_BARRIER && - !(hfsmp->hfs_flags & HFS_FEATURE_BARRIER)) - mode = HFS_FLUSH_FULL; - - if (mode == HFS_FLUSH_FULL) - SET(options, JOURNAL_FLUSH_FULL); - - /* Only peek at hfsmp->jnl while holding the global lock */ - hfs_lock_global (hfsmp, HFS_SHARED_LOCK); - - if (hfsmp->jnl) - error = journal_flush(hfsmp->jnl, options); - - hfs_unlock_global (hfsmp); - - /* - * This may result in a double barrier as - * journal_flush may have issued a barrier itself - */ - if (mode == HFS_FLUSH_JOURNAL_BARRIER) - error = VNOP_IOCTL(hfsmp->hfs_devvp, - DKIOCSYNCHRONIZE, (caddr_t)&sync_req, - FWRITE, vfs_context_kernel()); - - break; - - case HFS_FLUSH_CACHE: - // Do a full sync - sync_req.options = 0; - - // no break - - case HFS_FLUSH_BARRIER: - // If barrier only flush doesn't support, fall back to use full flush. - if (!(hfsmp->hfs_flags & HFS_FEATURE_BARRIER)) - sync_req.options = 0; - - error = VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZE, (caddr_t)&sync_req, - FWRITE, vfs_context_kernel()); - break; - - default: - error = EINVAL; - } - - return error; -} - -/* - * hfs_erase_unused_nodes - * - * Check wheter a volume may suffer from unused Catalog B-tree nodes that - * are not zeroed (due to ). If so, just write - * zeroes to the unused nodes. - * - * How do we detect when a volume needs this repair? We can't always be - * certain. 
If a volume was created after a certain date, then it may have - * been created with the faulty newfs_hfs. Since newfs_hfs only created one - * clump, we can assume that if a Catalog B-tree is larger than its clump size, - * that means that the entire first clump must have been written to, which means - * there shouldn't be unused and unwritten nodes in that first clump, and this - * repair is not needed. - * - * We have defined a bit in the Volume Header's attributes to indicate when the - * unused nodes have been repaired. A newer newfs_hfs will set this bit. - * As will fsck_hfs when it repairs the unused nodes. - */ -int hfs_erase_unused_nodes(struct hfsmount *hfsmp) -{ - int result; - struct filefork *catalog; - int lockflags; - - if (hfsmp->vcbAtrb & kHFSUnusedNodeFixMask) - { - /* This volume has already been checked and repaired. */ - return 0; - } - - if ((hfsmp->localCreateDate < kHFSUnusedNodesFixDate)) - { - /* This volume is too old to have had the problem. */ - hfsmp->vcbAtrb |= kHFSUnusedNodeFixMask; - return 0; - } - - catalog = hfsmp->hfs_catalog_cp->c_datafork; - if (catalog->ff_size > catalog->ff_clumpsize) - { - /* The entire first clump must have been in use at some point. */ - hfsmp->vcbAtrb |= kHFSUnusedNodeFixMask; - return 0; - } - - /* - * If we get here, we need to zero out those unused nodes. - * - * We start a transaction and lock the catalog since we're going to be - * making on-disk changes. But note that BTZeroUnusedNodes doens't actually - * do its writing via the journal, because that would be too much I/O - * to fit in a transaction, and it's a pain to break it up into multiple - * transactions. (It behaves more like growing a B-tree would.) 
- */ - printf("hfs_erase_unused_nodes: updating volume %s.\n", hfsmp->vcbVN); - result = hfs_start_transaction(hfsmp); - if (result) - goto done; - lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK); - result = BTZeroUnusedNodes(catalog); - vnode_waitforwrites(hfsmp->hfs_catalog_vp, 0, 0, 0, "hfs_erase_unused_nodes"); - hfs_systemfile_unlock(hfsmp, lockflags); - hfs_end_transaction(hfsmp); - if (result == 0) - hfsmp->vcbAtrb |= kHFSUnusedNodeFixMask; - printf("hfs_erase_unused_nodes: done updating volume %s.\n", hfsmp->vcbVN); - -done: - return result; -} - - -extern time_t snapshot_timestamp; - -int -check_for_tracked_file(struct vnode *vp, time_t ctime, uint64_t op_type, void *arg) -{ - int snapshot_error = 0; - - if (vp == NULL) { - return 0; - } - - /* Swap files are special; skip them */ - if (vnode_isswap(vp)) { - return 0; - } - - if (ctime != 0 && snapshot_timestamp != 0 && (ctime <= snapshot_timestamp || vnode_needssnapshots(vp))) { - // the change time is within this epoch - int error; - - error = resolve_nspace_item_ext(vp, op_type | NAMESPACE_HANDLER_SNAPSHOT_EVENT, arg); - if (error == EDEADLK) { - snapshot_error = 0; - } else if (error) { - if (error == EAGAIN) { - printf("hfs: cow-snapshot: timed out waiting for namespace handler...\n"); - } else if (error == EINTR) { - // printf("hfs: cow-snapshot: got a signal while waiting for namespace handler...\n"); - snapshot_error = EINTR; - } - } - } - - if (snapshot_error) return snapshot_error; - - return 0; -} - -int -check_for_dataless_file(struct vnode *vp, uint64_t op_type) -{ - int error; - - if (vp == NULL || (VTOC(vp)->c_bsdflags & UF_COMPRESSED) == 0 || VTOCMP(vp) == NULL || VTOCMP(vp)->cmp_type != DATALESS_CMPFS_TYPE) { - // there's nothing to do, it's not dataless - return 0; - } - - /* Swap files are special; ignore them */ - if (vnode_isswap(vp)) { - return 0; - } - - // printf("hfs: dataless: encountered a file with the dataless bit set! 
(vp %p)\n", vp); - error = resolve_nspace_item(vp, op_type | NAMESPACE_HANDLER_NSPACE_EVENT); - if (error == EDEADLK && op_type == NAMESPACE_HANDLER_WRITE_OP) { - error = 0; - } else if (error) { - if (error == EAGAIN) { - printf("hfs: dataless: timed out waiting for namespace handler...\n"); - // XXXdbg - return the fabled ENOTPRESENT (i.e. EJUKEBOX)? - return 0; - } else if (error == EINTR) { - // printf("hfs: dataless: got a signal while waiting for namespace handler...\n"); - return EINTR; - } - } else if (VTOC(vp)->c_bsdflags & UF_COMPRESSED) { - // - // if we're here, the dataless bit is still set on the file - // which means it didn't get handled. we return an error - // but it's presently ignored by all callers of this function. - // - // XXXdbg - EDATANOTPRESENT is what we really need... - // - return EBADF; - } - - return error; -} - - -// -// NOTE: this function takes care of starting a transaction and -// acquiring the systemfile lock so that it can call -// cat_update(). -// -// NOTE: do NOT hold and cnode locks while calling this function -// to avoid deadlocks (because we take a lock on the root -// cnode) -// -int -hfs_generate_document_id(struct hfsmount *hfsmp, uint32_t *docid) -{ - struct vnode *rvp; - struct cnode *cp; - int error; - - error = VFS_ROOT(HFSTOVFS(hfsmp), &rvp, vfs_context_kernel()); - if (error) { - return error; - } - - cp = VTOC(rvp); - if ((error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT)) != 0) { - return error; - } - struct FndrExtendedDirInfo *extinfo = (struct FndrExtendedDirInfo *)((void *)((char *)&cp->c_attr.ca_finderinfo + 16)); - - int lockflags; - if ((error = hfs_start_transaction(hfsmp)) != 0) { - return error; - } - lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK); - - if (extinfo->document_id == 0) { - // initialize this to start at 3 (one greater than the root-dir id) - extinfo->document_id = 3; - } - - *docid = extinfo->document_id++; - - // mark the root cnode dirty - cp->c_flag 
|= C_MODIFIED; - hfs_update(cp->c_vp, 0); - - hfs_systemfile_unlock (hfsmp, lockflags); - (void) hfs_end_transaction(hfsmp); - - (void) hfs_unlock(cp); - - vnode_put(rvp); - rvp = NULL; - - return 0; -} - - -/* - * Return information about number of file system allocation blocks - * taken by metadata on a volume. - * - * This function populates struct hfsinfo_metadata with allocation blocks - * used by extents overflow btree, catalog btree, bitmap, attribute btree, - * journal file, and sum of all of the above. - */ -int -hfs_getinfo_metadata_blocks(struct hfsmount *hfsmp, struct hfsinfo_metadata *hinfo) -{ - int lockflags = 0; - int ret_lockflags = 0; - - /* Zero out the output buffer */ - bzero(hinfo, sizeof(struct hfsinfo_metadata)); - - /* - * Getting number of allocation blocks for all btrees - * should be a quick operation, so we grab locks for - * all of them at the same time - */ - lockflags = SFL_CATALOG | SFL_EXTENTS | SFL_BITMAP | SFL_ATTRIBUTE; - ret_lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); - /* - * Make sure that we were able to acquire all locks requested - * to protect us against conditions like unmount in progress. 
- */ - if ((lockflags & ret_lockflags) != lockflags) { - /* Release any locks that were acquired */ - hfs_systemfile_unlock(hfsmp, ret_lockflags); - return EPERM; - } - - /* Get information about all the btrees */ - hinfo->extents = hfsmp->hfs_extents_cp->c_datafork->ff_blocks; - hinfo->catalog = hfsmp->hfs_catalog_cp->c_datafork->ff_blocks; - hinfo->allocation = hfsmp->hfs_allocation_cp->c_datafork->ff_blocks; - hinfo->attribute = hfsmp->hfs_attribute_cp->c_datafork->ff_blocks; - - /* Done with btrees, give up the locks */ - hfs_systemfile_unlock(hfsmp, ret_lockflags); - - /* Get information about journal file */ - hinfo->journal = howmany(hfsmp->jnl_size, hfsmp->blockSize); - - /* Calculate total number of metadata blocks */ - hinfo->total = hinfo->extents + hinfo->catalog + - hinfo->allocation + hinfo->attribute + - hinfo->journal; - - return 0; -} - -static int -hfs_freezewrite_callback(struct vnode *vp, __unused void *cargs) -{ - vnode_waitforwrites(vp, 0, 0, 0, "hfs freeze 8"); - - return 0; -} - -__private_extern__ -int hfs_freeze(struct hfsmount *hfsmp) -{ - // First make sure some other process isn't freezing - hfs_lock_mount(hfsmp); - while (hfsmp->hfs_freeze_state != HFS_THAWED) { - if (msleep(&hfsmp->hfs_freeze_state, &hfsmp->hfs_mutex, - PWAIT | PCATCH, "hfs freeze 1", NULL) == EINTR) { - hfs_unlock_mount(hfsmp); - return EINTR; - } - } - - // Stop new syncers from starting - hfsmp->hfs_freeze_state = HFS_WANT_TO_FREEZE; - - // Now wait for all syncers to finish - while (hfsmp->hfs_syncers) { - if (msleep(&hfsmp->hfs_freeze_state, &hfsmp->hfs_mutex, - PWAIT | PCATCH, "hfs freeze 2", NULL) == EINTR) { - hfs_thaw_locked(hfsmp); - hfs_unlock_mount(hfsmp); - return EINTR; - } - } - hfs_unlock_mount(hfsmp); - - // flush things before we get started to try and prevent - // dirty data from being paged out while we're frozen. 
- // note: we can't do this once we're in the freezing state because - // other threads will need to take the global lock - vnode_iterate(hfsmp->hfs_mp, 0, hfs_freezewrite_callback, NULL); - - // Block everything in hfs_lock_global now - hfs_lock_mount(hfsmp); - hfsmp->hfs_freeze_state = HFS_FREEZING; - hfsmp->hfs_freezing_thread = current_thread(); - hfs_unlock_mount(hfsmp); - - /* Take the exclusive lock to flush out anything else that - might have the global lock at the moment and also so we - can flush the journal. */ - hfs_lock_global(hfsmp, HFS_EXCLUSIVE_LOCK); - journal_flush(hfsmp->jnl, JOURNAL_WAIT_FOR_IO); - hfs_unlock_global(hfsmp); - - // don't need to iterate on all vnodes, we just need to - // wait for writes to the system files and the device vnode - // - // Now that journal flush waits for all metadata blocks to - // be written out, waiting for btree writes is probably no - // longer required. - if (HFSTOVCB(hfsmp)->extentsRefNum) - vnode_waitforwrites(HFSTOVCB(hfsmp)->extentsRefNum, 0, 0, 0, "hfs freeze 3"); - if (HFSTOVCB(hfsmp)->catalogRefNum) - vnode_waitforwrites(HFSTOVCB(hfsmp)->catalogRefNum, 0, 0, 0, "hfs freeze 4"); - if (HFSTOVCB(hfsmp)->allocationsRefNum) - vnode_waitforwrites(HFSTOVCB(hfsmp)->allocationsRefNum, 0, 0, 0, "hfs freeze 5"); - if (hfsmp->hfs_attribute_vp) - vnode_waitforwrites(hfsmp->hfs_attribute_vp, 0, 0, 0, "hfs freeze 6"); - vnode_waitforwrites(hfsmp->hfs_devvp, 0, 0, 0, "hfs freeze 7"); - - // We're done, mark frozen - hfs_lock_mount(hfsmp); - hfsmp->hfs_freeze_state = HFS_FROZEN; - hfsmp->hfs_freezing_proc = current_proc(); - hfs_unlock_mount(hfsmp); - - return 0; -} - -__private_extern__ -int hfs_thaw(struct hfsmount *hfsmp, const struct proc *process) -{ - hfs_lock_mount(hfsmp); - - if (hfsmp->hfs_freeze_state != HFS_FROZEN) { - hfs_unlock_mount(hfsmp); - return EINVAL; - } - if (process && hfsmp->hfs_freezing_proc != process) { - hfs_unlock_mount(hfsmp); - return EPERM; - } - - hfs_thaw_locked(hfsmp); - - 
hfs_unlock_mount(hfsmp); - - return 0; -} - -static void hfs_thaw_locked(struct hfsmount *hfsmp) -{ - hfsmp->hfs_freezing_proc = NULL; - hfsmp->hfs_freeze_state = HFS_THAWED; - - wakeup(&hfsmp->hfs_freeze_state); -} diff --git a/bsd/hfs/hfs_vnops.c b/bsd/hfs/hfs_vnops.c deleted file mode 100644 index a198b651e..000000000 --- a/bsd/hfs/hfs_vnops.c +++ /dev/null @@ -1,7383 +0,0 @@ -/* - * Copyright (c) 2000-2015 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include -#include -#include - -#include "hfs.h" -#include "hfs_catalog.h" -#include "hfs_cnode.h" -#include "hfs_dbg.h" -#include "hfs_mount.h" -#include "hfs_quota.h" -#include "hfs_endian.h" -#include "hfs_kdebug.h" -#include "hfs_cprotect.h" - - -#include "hfscommon/headers/BTreesInternal.h" -#include "hfscommon/headers/FileMgrInternal.h" - -#define KNDETACH_VNLOCKED 0x00000001 - -/* Global vfs data structures for hfs */ - -/* Always F_FULLFSYNC? 1=yes,0=no (default due to "various" reasons is 'no') */ -int always_do_fullfsync = 0; -SYSCTL_DECL(_vfs_generic); -SYSCTL_INT (_vfs_generic, OID_AUTO, always_do_fullfsync, CTLFLAG_RW | CTLFLAG_LOCKED, &always_do_fullfsync, 0, "always F_FULLFSYNC when fsync is called"); - -int hfs_makenode(struct vnode *dvp, struct vnode **vpp, - struct componentname *cnp, struct vnode_attr *vap, - vfs_context_t ctx); -int hfs_metasync(struct hfsmount *hfsmp, daddr64_t node, __unused struct proc *p); -int hfs_metasync_all(struct hfsmount *hfsmp); - -int hfs_removedir(struct vnode *, struct vnode *, struct componentname *, - int, int); -int hfs_removefile(struct vnode *, struct vnode *, struct componentname *, - int, int, int, struct vnode *, int); - -/* Used here and in cnode teardown -- for symlinks */ -int hfs_removefile_callback(struct buf *bp, void *hfsmp); - -enum { - HFS_MOVE_DATA_INCLUDE_RSRC = 1, -}; -typedef uint32_t hfs_move_data_options_t; - -static int hfs_move_data(cnode_t *from_cp, cnode_t *to_cp, - hfs_move_data_options_t options); -static int hfs_move_fork(filefork_t *srcfork, cnode_t *src, - filefork_t *dstfork, cnode_t *dst); - -#if HFS_COMPRESSION -static int 
hfs_move_compressed(cnode_t *from_vp, cnode_t *to_vp); -#endif - -decmpfs_cnode* hfs_lazy_init_decmpfs_cnode (struct cnode *cp); - -#if FIFO -static int hfsfifo_read(struct vnop_read_args *); -static int hfsfifo_write(struct vnop_write_args *); -static int hfsfifo_close(struct vnop_close_args *); - -extern int (**fifo_vnodeop_p)(void *); -#endif /* FIFO */ - -int hfs_vnop_close(struct vnop_close_args*); -int hfs_vnop_create(struct vnop_create_args*); -int hfs_vnop_exchange(struct vnop_exchange_args*); -int hfs_vnop_fsync(struct vnop_fsync_args*); -int hfs_vnop_mkdir(struct vnop_mkdir_args*); -int hfs_vnop_mknod(struct vnop_mknod_args*); -int hfs_vnop_getattr(struct vnop_getattr_args*); -int hfs_vnop_open(struct vnop_open_args*); -int hfs_vnop_readdir(struct vnop_readdir_args*); -int hfs_vnop_remove(struct vnop_remove_args*); -int hfs_vnop_rename(struct vnop_rename_args*); -int hfs_vnop_rmdir(struct vnop_rmdir_args*); -int hfs_vnop_symlink(struct vnop_symlink_args*); -int hfs_vnop_setattr(struct vnop_setattr_args*); -int hfs_vnop_readlink(struct vnop_readlink_args *); -int hfs_vnop_pathconf(struct vnop_pathconf_args *); -int hfs_vnop_mmap(struct vnop_mmap_args *ap); -int hfsspec_read(struct vnop_read_args *); -int hfsspec_write(struct vnop_write_args *); -int hfsspec_close(struct vnop_close_args *); - -/* Options for hfs_removedir and hfs_removefile */ -#define HFSRM_SKIP_RESERVE 0x01 - - - -/***************************************************************************** -* -* Common Operations on vnodes -* -*****************************************************************************/ - -/* - * Is the given cnode either the .journal or .journal_info_block file on - * a volume with an active journal? Many VNOPs use this to deny access - * to those files. - * - * Note: the .journal file on a volume with an external journal still - * returns true here, even though it does not actually hold the contents - * of the volume's journal. 
- */ -static _Bool -hfs_is_journal_file(struct hfsmount *hfsmp, struct cnode *cp) -{ - if (hfsmp->jnl != NULL && - (cp->c_fileid == hfsmp->hfs_jnlinfoblkid || - cp->c_fileid == hfsmp->hfs_jnlfileid)) { - return true; - } else { - return false; - } -} - -/* - * Create a regular file. - */ -int -hfs_vnop_create(struct vnop_create_args *ap) -{ - /* - * We leave handling of certain race conditions here to the caller - * which will have a better understanding of the semantics it - * requires. For example, if it turns out that the file exists, - * it would be wrong of us to return a reference to the existing - * file because the caller might not want that and it would be - * misleading to suggest the file had been created when it hadn't - * been. Note that our NFS server code does not set the - * VA_EXCLUSIVE flag so you cannot assume that callers don't want - * EEXIST errors if it's not set. The common case, where users - * are calling open with the O_CREAT mode, is handled in VFS; when - * we return EEXIST, it will loop and do the look-up again. - */ - return hfs_makenode(ap->a_dvp, ap->a_vpp, ap->a_cnp, ap->a_vap, ap->a_context); -} - -/* - * Make device special file. - */ -int -hfs_vnop_mknod(struct vnop_mknod_args *ap) -{ - struct vnode_attr *vap = ap->a_vap; - struct vnode *dvp = ap->a_dvp; - struct vnode **vpp = ap->a_vpp; - struct cnode *cp; - int error; - - if (VTOVCB(dvp)->vcbSigWord != kHFSPlusSigWord) { - return (ENOTSUP); - } - - /* Create the vnode */ - error = hfs_makenode(dvp, vpp, ap->a_cnp, vap, ap->a_context); - if (error) - return (error); - - cp = VTOC(*vpp); - cp->c_touch_acctime = TRUE; - cp->c_touch_chgtime = TRUE; - cp->c_touch_modtime = TRUE; - - if ((vap->va_rdev != VNOVAL) && - (vap->va_type == VBLK || vap->va_type == VCHR)) - cp->c_rdev = vap->va_rdev; - - return (0); -} - -#if HFS_COMPRESSION -/* - * hfs_ref_data_vp(): returns the data fork vnode for a given cnode. 
- * In the (hopefully rare) case where the data fork vnode is not - * present, it will use hfs_vget() to create a new vnode for the - * data fork. - * - * NOTE: If successful and a vnode is returned, the caller is responsible - * for releasing the returned vnode with vnode_rele(). - */ -static int -hfs_ref_data_vp(struct cnode *cp, struct vnode **data_vp, int skiplock) -{ - int vref = 0; - - if (!data_vp || !cp) /* sanity check incoming parameters */ - return EINVAL; - - /* maybe we should take the hfs cnode lock here, and if so, use the skiplock parameter to tell us not to */ - - if (!skiplock) hfs_lock(cp, HFS_SHARED_LOCK, HFS_LOCK_DEFAULT); - struct vnode *c_vp = cp->c_vp; - if (c_vp) { - /* we already have a data vnode */ - *data_vp = c_vp; - vref = vnode_ref(*data_vp); - if (!skiplock) hfs_unlock(cp); - if (vref == 0) { - return 0; - } - return EINVAL; - } - /* no data fork vnode in the cnode, so ask hfs for one. */ - - if (!cp->c_rsrc_vp) { - /* if we don't have either a c_vp or c_rsrc_vp, we can't really do anything useful */ - *data_vp = NULL; - if (!skiplock) hfs_unlock(cp); - return EINVAL; - } - - if (0 == hfs_vget(VTOHFS(cp->c_rsrc_vp), cp->c_cnid, data_vp, 1, 0) && - 0 != data_vp) { - vref = vnode_ref(*data_vp); - vnode_put(*data_vp); - if (!skiplock) hfs_unlock(cp); - if (vref == 0) { - return 0; - } - return EINVAL; - } - /* there was an error getting the vnode */ - *data_vp = NULL; - if (!skiplock) hfs_unlock(cp); - return EINVAL; -} - -/* - * hfs_lazy_init_decmpfs_cnode(): returns the decmpfs_cnode for a cnode, - * allocating it if necessary; returns NULL if there was an allocation error. - * function is non-static so that it can be used from the FCNTL handler. 
- */ -decmpfs_cnode * -hfs_lazy_init_decmpfs_cnode(struct cnode *cp) -{ - if (!cp->c_decmp) { - decmpfs_cnode *dp = NULL; - MALLOC_ZONE(dp, decmpfs_cnode *, sizeof(decmpfs_cnode), M_DECMPFS_CNODE, M_WAITOK); - if (!dp) { - /* error allocating a decmpfs cnode */ - return NULL; - } - decmpfs_cnode_init(dp); - if (!OSCompareAndSwapPtr(NULL, dp, (void * volatile *)&cp->c_decmp)) { - /* another thread got here first, so free the decmpfs_cnode we allocated */ - decmpfs_cnode_destroy(dp); - FREE_ZONE(dp, sizeof(*dp), M_DECMPFS_CNODE); - } - } - - return cp->c_decmp; -} - -/* - * hfs_file_is_compressed(): returns 1 if the file is compressed, and 0 (zero) if not. - * if the file's compressed flag is set, makes sure that the decmpfs_cnode field - * is allocated by calling hfs_lazy_init_decmpfs_cnode(), then makes sure it is populated, - * or else fills it in via the decmpfs_file_is_compressed() function. - */ -int -hfs_file_is_compressed(struct cnode *cp, int skiplock) -{ - int ret = 0; - - /* fast check to see if file is compressed. 
If flag is clear, just answer no */ - if (!(cp->c_bsdflags & UF_COMPRESSED)) { - return 0; - } - - decmpfs_cnode *dp = hfs_lazy_init_decmpfs_cnode(cp); - if (!dp) { - /* error allocating a decmpfs cnode, treat the file as uncompressed */ - return 0; - } - - /* flag was set, see if the decmpfs_cnode state is valid (zero == invalid) */ - uint32_t decmpfs_state = decmpfs_cnode_get_vnode_state(dp); - switch(decmpfs_state) { - case FILE_IS_COMPRESSED: - case FILE_IS_CONVERTING: /* treat decompressing files as if they are compressed */ - return 1; - case FILE_IS_NOT_COMPRESSED: - return 0; - /* otherwise the state is not cached yet */ - } - - /* decmpfs hasn't seen this file yet, so call decmpfs_file_is_compressed() to init the decmpfs_cnode struct */ - struct vnode *data_vp = NULL; - if (0 == hfs_ref_data_vp(cp, &data_vp, skiplock)) { - if (data_vp) { - ret = decmpfs_file_is_compressed(data_vp, VTOCMP(data_vp)); // fill in decmpfs_cnode - vnode_rele(data_vp); - } - } - return ret; -} - -/* hfs_uncompressed_size_of_compressed_file() - get the uncompressed size of the file. - * if the caller has passed a valid vnode (has a ref count > 0), then hfsmp and fid are not required. - * if the caller doesn't have a vnode, pass NULL in vp, and pass valid hfsmp and fid. 
- * files size is returned in size (required) - * if the indicated file is a directory (or something that doesn't have a data fork), then this call - * will return an error and the caller should fall back to treating the item as an uncompressed file - */ -int -hfs_uncompressed_size_of_compressed_file(struct hfsmount *hfsmp, struct vnode *vp, cnid_t fid, off_t *size, int skiplock) -{ - int ret = 0; - int putaway = 0; /* flag to remember if we used hfs_vget() */ - - if (!size) { - return EINVAL; /* no place to put the file size */ - } - - if (NULL == vp) { - if (!hfsmp || !fid) { /* make sure we have the required parameters */ - return EINVAL; - } - if (0 != hfs_vget(hfsmp, fid, &vp, skiplock, 0)) { /* vnode is null, use hfs_vget() to get it */ - vp = NULL; - } else { - putaway = 1; /* note that hfs_vget() was used to aquire the vnode */ - } - } - /* this double check for compression (hfs_file_is_compressed) - * ensures the cached size is present in case decmpfs hasn't - * encountered this node yet. - */ - if (vp) { - if (hfs_file_is_compressed(VTOC(vp), skiplock) ) { - *size = decmpfs_cnode_get_vnode_cached_size(VTOCMP(vp)); /* file info will be cached now, so get size */ - } else { - if (VTOCMP(vp) && VTOCMP(vp)->cmp_type >= CMP_MAX) { - if (VTOCMP(vp)->cmp_type != DATALESS_CMPFS_TYPE) { - // if we don't recognize this type, just use the real data fork size - if (VTOC(vp)->c_datafork) { - *size = VTOC(vp)->c_datafork->ff_size; - ret = 0; - } else { - ret = EINVAL; - } - } else { - *size = decmpfs_cnode_get_vnode_cached_size(VTOCMP(vp)); /* file info will be cached now, so get size */ - ret = 0; - } - } else { - ret = EINVAL; - } - } - } - - if (putaway) { /* did we use hfs_vget() to get this vnode? 
*/ - vnode_put(vp); /* if so, release it and set it to null */ - vp = NULL; - } - return ret; -} - -int -hfs_hides_rsrc(vfs_context_t ctx, struct cnode *cp, int skiplock) -{ - if (ctx == decmpfs_ctx) - return 0; - if (!hfs_file_is_compressed(cp, skiplock)) - return 0; - return decmpfs_hides_rsrc(ctx, cp->c_decmp); -} - -int -hfs_hides_xattr(vfs_context_t ctx, struct cnode *cp, const char *name, int skiplock) -{ - if (ctx == decmpfs_ctx) - return 0; - if (!hfs_file_is_compressed(cp, skiplock)) - return 0; - return decmpfs_hides_xattr(ctx, cp->c_decmp, name); -} -#endif /* HFS_COMPRESSION */ - - -// -// This function gets the doc_tombstone structure for the -// current thread. If the thread doesn't have one, the -// structure is allocated. -// -static struct doc_tombstone * -get_uthread_doc_tombstone(void) -{ - struct uthread *ut; - ut = get_bsdthread_info(current_thread()); - - if (ut->t_tombstone == NULL) { - ut->t_tombstone = kalloc(sizeof(struct doc_tombstone)); - if (ut->t_tombstone) { - memset(ut->t_tombstone, 0, sizeof(struct doc_tombstone)); - } - } - - return ut->t_tombstone; -} - -// -// This routine clears out the current tombstone for the -// current thread and if necessary passes the doc-id of -// the tombstone on to the dst_cnode. -// -// If the doc-id transfers to dst_cnode, we also generate -// a doc-id changed fsevent. Unlike all the other fsevents, -// doc-id changed events can only be generated here in HFS -// where we have the necessary info. -// -static void -clear_tombstone_docid(struct doc_tombstone *ut, __unused struct hfsmount *hfsmp, struct cnode *dst_cnode) -{ - uint32_t old_id = ut->t_lastop_document_id; - - ut->t_lastop_document_id = 0; - ut->t_lastop_parent = NULL; - ut->t_lastop_parent_vid = 0; - ut->t_lastop_filename[0] = '\0'; - - // - // If the lastop item is still the same and needs to be cleared, - // clear it. 
- // - if (dst_cnode && old_id && ut->t_lastop_item && vnode_vid(ut->t_lastop_item) == ut->t_lastop_item_vid) { - // - // clear the document_id from the file that used to have it. - // XXXdbg - we need to lock the other vnode and make sure to - // update it on disk. - // - struct cnode *ocp = VTOC(ut->t_lastop_item); - struct FndrExtendedFileInfo *ofip = (struct FndrExtendedFileInfo *)((char *)&ocp->c_attr.ca_finderinfo + 16); - - // printf("clearing doc-id from ino %d\n", ocp->c_desc.cd_cnid); - ofip->document_id = 0; - ocp->c_bsdflags &= ~UF_TRACKED; - ocp->c_flag |= C_MODIFIED; - /* cat_update(hfsmp, &ocp->c_desc, &ocp->c_attr, NULL, NULL); */ - - } - -#if CONFIG_FSE - if (dst_cnode && old_id) { - struct FndrExtendedFileInfo *fip = (struct FndrExtendedFileInfo *)((char *)&dst_cnode->c_attr.ca_finderinfo + 16); - - add_fsevent(FSE_DOCID_CHANGED, vfs_context_current(), - FSE_ARG_DEV, hfsmp->hfs_raw_dev, - FSE_ARG_INO, (ino64_t)ut->t_lastop_fileid, // src inode # - FSE_ARG_INO, (ino64_t)dst_cnode->c_fileid, // dst inode # - FSE_ARG_INT32, (uint32_t)fip->document_id, - FSE_ARG_DONE); - } -#endif - // last, clear these now that we're all done - ut->t_lastop_item = NULL; - ut->t_lastop_fileid = 0; - ut->t_lastop_item_vid = 0; -} - - -// -// This function is used to filter out operations on temp -// filenames. We have to filter out operations on certain -// temp filenames to work-around questionable application -// behavior from apps like Autocad that perform unusual -// sequences of file system operations for a "safe save". -static int -is_ignorable_temp_name(const char *nameptr, int len) -{ - if (len == 0) { - len = strlen(nameptr); - } - - if ( strncmp(nameptr, "atmp", 4) == 0 - || (len > 4 && strncmp(nameptr+len-4, ".bak", 4) == 0) - || (len > 4 && strncmp(nameptr+len-4, ".tmp", 4) == 0)) { - return 1; - } - - return 0; -} - -// -// Decide if we need to save a tombstone or not. 
Normally we always -// save a tombstone - but if there already is one and the name we're -// given is an ignorable name, then we will not save a tombstone. -// -static int -should_save_docid_tombstone(struct doc_tombstone *ut, struct vnode *vp, struct componentname *cnp) -{ - if (cnp->cn_nameptr == NULL) { - return 0; - } - - if (ut->t_lastop_document_id && ut->t_lastop_item == vp && is_ignorable_temp_name(cnp->cn_nameptr, cnp->cn_namelen)) { - return 0; - } - - return 1; -} - - -// -// This function saves a tombstone for the given vnode and name. The -// tombstone represents the parent directory and name where the document -// used to live and the document-id of that file. This info is recorded -// in the doc_tombstone structure hanging off the uthread (which assumes -// that all safe-save operations happen on the same thread). -// -// If later on the same parent/name combo comes back into existence then -// we'll preserve the doc-id from this vnode onto the new vnode. -// -static void -save_tombstone(struct hfsmount *hfsmp, struct vnode *dvp, struct vnode *vp, struct componentname *cnp, int for_unlink) -{ - struct cnode *cp = VTOC(vp); - struct doc_tombstone *ut; - ut = get_uthread_doc_tombstone(); - - if (for_unlink && vp->v_type == VREG && cp->c_linkcount > 1) { - // - // a regular file that is being unlinked and that is also - // hardlinked should not clear the UF_TRACKED state or - // mess with the tombstone because somewhere else in the - // file system the file is still alive. 
- // - return; - } - - ut->t_lastop_parent = dvp; - ut->t_lastop_parent_vid = vnode_vid(dvp); - ut->t_lastop_fileid = cp->c_fileid; - if (for_unlink) { - ut->t_lastop_item = NULL; - ut->t_lastop_item_vid = 0; - } else { - ut->t_lastop_item = vp; - ut->t_lastop_item_vid = vnode_vid(vp); - } - - strlcpy((char *)&ut->t_lastop_filename[0], cnp->cn_nameptr, sizeof(ut->t_lastop_filename)); - - struct FndrExtendedFileInfo *fip = (struct FndrExtendedFileInfo *)((char *)&cp->c_attr.ca_finderinfo + 16); - ut->t_lastop_document_id = fip->document_id; - - if (for_unlink) { - // clear this so it's never returned again - fip->document_id = 0; - cp->c_bsdflags &= ~UF_TRACKED; - - if (ut->t_lastop_document_id) { - (void) cat_update(hfsmp, &cp->c_desc, &cp->c_attr, NULL, NULL); - -#if CONFIG_FSE - // this event is more of a "pending-delete" - add_fsevent(FSE_DOCID_CHANGED, vfs_context_current(), - FSE_ARG_DEV, hfsmp->hfs_raw_dev, - FSE_ARG_INO, (ino64_t)cp->c_fileid, // src inode # - FSE_ARG_INO, (ino64_t)0, // dst inode # - FSE_ARG_INT32, ut->t_lastop_document_id, // document id - FSE_ARG_DONE); -#endif - } - } -} - - -/* - * Open a file/directory. 
- */ -int -hfs_vnop_open(struct vnop_open_args *ap) -{ - struct vnode *vp = ap->a_vp; - struct filefork *fp; - struct timeval tv; - int error; - static int past_bootup = 0; - struct cnode *cp = VTOC(vp); - struct hfsmount *hfsmp = VTOHFS(vp); - -#if HFS_COMPRESSION - if (ap->a_mode & FWRITE) { - /* open for write */ - if ( hfs_file_is_compressed(cp, 1) ) { /* 1 == don't take the cnode lock */ - /* opening a compressed file for write, so convert it to decompressed */ - struct vnode *data_vp = NULL; - error = hfs_ref_data_vp(cp, &data_vp, 1); /* 1 == don't take the cnode lock */ - if (0 == error) { - if (data_vp) { - error = decmpfs_decompress_file(data_vp, VTOCMP(data_vp), -1, 1, 0); - vnode_rele(data_vp); - } else { - error = EINVAL; - } - } - if (error != 0) - return error; - } - } else { - /* open for read */ - if (hfs_file_is_compressed(cp, 1) ) { /* 1 == don't take the cnode lock */ - if (VNODE_IS_RSRC(vp)) { - /* opening the resource fork of a compressed file, so nothing to do */ - } else { - /* opening a compressed file for read, make sure it validates */ - error = decmpfs_validate_compressed_file(vp, VTOCMP(vp)); - if (error != 0) - return error; - } - } - } -#endif - - /* - * Files marked append-only must be opened for appending. - */ - if ((cp->c_bsdflags & APPEND) && !vnode_isdir(vp) && - (ap->a_mode & (FWRITE | O_APPEND)) == FWRITE) - return (EPERM); - - if (vnode_isreg(vp) && !UBCINFOEXISTS(vp)) - return (EBUSY); /* file is in use by the kernel */ - - /* Don't allow journal to be opened externally. 
*/ - if (hfs_is_journal_file(hfsmp, cp)) - return (EPERM); - - bool have_lock = false; - -#if CONFIG_PROTECT - if (ISSET(ap->a_mode, FENCRYPTED) && cp->c_cpentry && vnode_isreg(vp)) { - bool have_trunc_lock = false; - - - if ((error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) { - if (have_trunc_lock) - hfs_unlock_truncate(cp, 0); - return error; - } - - have_lock = true; - - if (cp->c_cpentry->cp_raw_open_count + 1 - < cp->c_cpentry->cp_raw_open_count) { - // Overflow; too many raw opens on this file - hfs_unlock(cp); - if (have_trunc_lock) - hfs_unlock_truncate(cp, 0); - return ENFILE; - } - - - if (have_trunc_lock) - hfs_unlock_truncate(cp, 0); - - ++cp->c_cpentry->cp_raw_open_count; - } -#endif - - if ((hfsmp->hfs_flags & HFS_READ_ONLY) || - (hfsmp->jnl == NULL) || -#if NAMEDSTREAMS - !vnode_isreg(vp) || vnode_isinuse(vp, 0) || vnode_isnamedstream(vp)) { -#else - !vnode_isreg(vp) || vnode_isinuse(vp, 0)) { -#endif - -#if CONFIG_PROTECT - if (have_lock) - hfs_unlock(cp); -#endif - - return (0); - } - - if (!have_lock && (error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) - return (error); - -#if QUOTA - /* If we're going to write to the file, initialize quotas. */ - if ((ap->a_mode & FWRITE) && (hfsmp->hfs_flags & HFS_QUOTAS)) - (void)hfs_getinoquota(cp); -#endif /* QUOTA */ - - /* - * On the first (non-busy) open of a fragmented - * file attempt to de-frag it (if its less than 20MB). - */ - fp = VTOF(vp); - if (fp->ff_blocks && - fp->ff_extents[7].blockCount != 0 && - fp->ff_size <= (20 * 1024 * 1024)) { - int no_mods = 0; - struct timeval now; - /* - * Wait until system bootup is done (3 min). - * And don't relocate a file that's been modified - * within the past minute -- this can lead to - * system thrashing. 
- */ - - if (!past_bootup) { - microuptime(&tv); - if (tv.tv_sec > (60*3)) { - past_bootup = 1; - } - } - - microtime(&now); - if ((now.tv_sec - cp->c_mtime) > 60) { - no_mods = 1; - } - - if (past_bootup && no_mods) { - (void) hfs_relocate(vp, hfsmp->nextAllocation + 4096, - vfs_context_ucred(ap->a_context), - vfs_context_proc(ap->a_context)); - } - } - - hfs_unlock(cp); - - return (0); -} - - -/* - * Close a file/directory. - */ -int -hfs_vnop_close(ap) - struct vnop_close_args /* { - struct vnode *a_vp; - int a_fflag; - vfs_context_t a_context; - } */ *ap; -{ - register struct vnode *vp = ap->a_vp; - register struct cnode *cp; - struct proc *p = vfs_context_proc(ap->a_context); - struct hfsmount *hfsmp; - int busy; - int tooktrunclock = 0; - int knownrefs = 0; - - if ( hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT) != 0) - return (0); - cp = VTOC(vp); - hfsmp = VTOHFS(vp); - -#if CONFIG_PROTECT - if (cp->c_cpentry && ISSET(ap->a_fflag, FENCRYPTED) && vnode_isreg(vp)) { - assert(cp->c_cpentry->cp_raw_open_count > 0); - --cp->c_cpentry->cp_raw_open_count; - } -#endif - - /* - * If the rsrc fork is a named stream, it can cause the data fork to - * stay around, preventing de-allocation of these blocks. - * Do checks for truncation on close. Purge extra extents if they exist. - * Make sure the vp is not a directory, and that it has a resource fork, - * and that resource fork is also a named stream. - */ - - if ((vp->v_type == VREG) && (cp->c_rsrc_vp) - && (vnode_isnamedstream(cp->c_rsrc_vp))) { - uint32_t blks; - - blks = howmany(VTOF(vp)->ff_size, VTOVCB(vp)->blockSize); - /* - * If there are extra blocks and there are only 2 refs on - * this vp (ourselves + rsrc fork holding ref on us), go ahead - * and try to truncate. 
- */ - if ((blks < VTOF(vp)->ff_blocks) && (!vnode_isinuse(vp, 2))) { - // release cnode lock; must acquire truncate lock BEFORE cnode lock - hfs_unlock(cp); - - hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); - tooktrunclock = 1; - - if (hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT) != 0) { - hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); - // bail out if we can't re-acquire cnode lock - return 0; - } - // now re-test to make sure it's still valid - if (cp->c_rsrc_vp) { - knownrefs = 1 + vnode_isnamedstream(cp->c_rsrc_vp); - if (!vnode_isinuse(vp, knownrefs)){ - // now we can truncate the file, if necessary - blks = howmany(VTOF(vp)->ff_size, VTOVCB(vp)->blockSize); - if (blks < VTOF(vp)->ff_blocks){ - (void) hfs_truncate(vp, VTOF(vp)->ff_size, IO_NDELAY, - 0, ap->a_context); - } - } - } - } - } - - - // if we froze the fs and we're exiting, then "thaw" the fs - if (hfsmp->hfs_freeze_state == HFS_FROZEN - && hfsmp->hfs_freezing_proc == p && proc_exiting(p)) { - hfs_thaw(hfsmp, p); - } - - busy = vnode_isinuse(vp, 1); - - if (busy) { - hfs_touchtimes(VTOHFS(vp), cp); - } - if (vnode_isdir(vp)) { - hfs_reldirhints(cp, busy); - } else if (vnode_issystem(vp) && !busy) { - vnode_recycle(vp); - } - - if (tooktrunclock){ - hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); - } - hfs_unlock(cp); - - if (ap->a_fflag & FWASWRITTEN) { - hfs_sync_ejectable(hfsmp); - } - - return (0); -} - -static bool hfs_should_generate_document_id(hfsmount_t *hfsmp, cnode_t *cp) -{ - return (!ISSET(hfsmp->hfs_flags, HFS_READ_ONLY) - && ISSET(cp->c_bsdflags, UF_TRACKED) - && cp->c_desc.cd_cnid != kHFSRootFolderID - && (S_ISDIR(cp->c_mode) || S_ISREG(cp->c_mode) || S_ISLNK(cp->c_mode))); -} - -/* - * Get basic attributes. 
- */ -int -hfs_vnop_getattr(struct vnop_getattr_args *ap) -{ -#define VNODE_ATTR_TIMES \ - (VNODE_ATTR_va_access_time|VNODE_ATTR_va_change_time|VNODE_ATTR_va_modify_time) -#define VNODE_ATTR_AUTH \ - (VNODE_ATTR_va_mode | VNODE_ATTR_va_uid | VNODE_ATTR_va_gid | \ - VNODE_ATTR_va_flags | VNODE_ATTR_va_acl) - - struct vnode *vp = ap->a_vp; - struct vnode_attr *vap = ap->a_vap; - struct vnode *rvp = NULLVP; - struct hfsmount *hfsmp; - struct cnode *cp; - uint64_t data_size; - enum vtype v_type; - int error = 0; - cp = VTOC(vp); - -#if HFS_COMPRESSION - /* we need to inspect the decmpfs state of the file before we take the hfs cnode lock */ - int compressed = 0; - int hide_size = 0; - off_t uncompressed_size = -1; - if (VATTR_IS_ACTIVE(vap, va_data_size) || VATTR_IS_ACTIVE(vap, va_total_alloc) || VATTR_IS_ACTIVE(vap, va_data_alloc) || VATTR_IS_ACTIVE(vap, va_total_size)) { - /* we only care about whether the file is compressed if asked for the uncompressed size */ - if (VNODE_IS_RSRC(vp)) { - /* if it's a resource fork, decmpfs may want us to hide the size */ - hide_size = hfs_hides_rsrc(ap->a_context, cp, 0); - } else { - /* if it's a data fork, we need to know if it was compressed so we can report the uncompressed size */ - compressed = hfs_file_is_compressed(cp, 0); - } - if ((VATTR_IS_ACTIVE(vap, va_data_size) || VATTR_IS_ACTIVE(vap, va_total_size))) { - // if it's compressed - if (compressed || (!VNODE_IS_RSRC(vp) && cp->c_decmp && cp->c_decmp->cmp_type >= CMP_MAX)) { - if (0 != hfs_uncompressed_size_of_compressed_file(NULL, vp, 0, &uncompressed_size, 0)) { - /* failed to get the uncompressed size, we'll check for this later */ - uncompressed_size = -1; - } else { - // fake that it's compressed - compressed = 1; - } - } - } - } -#endif - - /* - * Shortcut for vnode_authorize path. Each of the attributes - * in this set is updated atomically so we don't need to take - * the cnode lock to access them. 
- */ - if ((vap->va_active & ~VNODE_ATTR_AUTH) == 0) { - /* Make sure file still exists. */ - if (cp->c_flag & C_NOEXISTS) - return (ENOENT); - - vap->va_uid = cp->c_uid; - vap->va_gid = cp->c_gid; - vap->va_mode = cp->c_mode; - vap->va_flags = cp->c_bsdflags; - vap->va_supported |= VNODE_ATTR_AUTH & ~VNODE_ATTR_va_acl; - - if ((cp->c_attr.ca_recflags & kHFSHasSecurityMask) == 0) { - vap->va_acl = (kauth_acl_t) KAUTH_FILESEC_NONE; - VATTR_SET_SUPPORTED(vap, va_acl); - } - - return (0); - } - - hfsmp = VTOHFS(vp); - v_type = vnode_vtype(vp); - - if (VATTR_IS_ACTIVE(vap, va_document_id)) { - uint32_t document_id; - - if (cp->c_desc.cd_cnid == kHFSRootFolderID) - document_id = kHFSRootFolderID; - else { - /* - * This is safe without a lock because we're just reading - * a 32 bit aligned integer which should be atomic on all - * platforms we support. - */ - document_id = hfs_get_document_id(cp); - - if (!document_id && hfs_should_generate_document_id(hfsmp, cp)) { - uint32_t new_document_id; - - error = hfs_generate_document_id(hfsmp, &new_document_id); - if (error) - return error; - - error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); - if (error) - return error; - - bool want_docid_fsevent = false; - - // Need to check again now that we have the lock - document_id = hfs_get_document_id(cp); - if (!document_id && hfs_should_generate_document_id(hfsmp, cp)) { - cp->c_attr.ca_finderextendeddirinfo.document_id = document_id = new_document_id; - want_docid_fsevent = true; - SET(cp->c_flag, C_MODIFIED); - } - - hfs_unlock(cp); - - if (want_docid_fsevent) { -#if CONFIG_FSE - add_fsevent(FSE_DOCID_CHANGED, ap->a_context, - FSE_ARG_DEV, hfsmp->hfs_raw_dev, - FSE_ARG_INO, (ino64_t)0, // src inode # - FSE_ARG_INO, (ino64_t)cp->c_fileid, // dst inode # - FSE_ARG_INT32, document_id, - FSE_ARG_DONE); - - if (need_fsevent(FSE_STAT_CHANGED, vp)) { - add_fsevent(FSE_STAT_CHANGED, ap->a_context, - FSE_ARG_VNODE, vp, FSE_ARG_DONE); - } -#endif - } - } - } - - 
vap->va_document_id = document_id; - VATTR_SET_SUPPORTED(vap, va_document_id); - } - - /* - * If time attributes are requested and we have cnode times - * that require updating, then acquire an exclusive lock on - * the cnode before updating the times. Otherwise we can - * just acquire a shared lock. - */ - if ((vap->va_active & VNODE_ATTR_TIMES) && - (cp->c_touch_acctime || cp->c_touch_chgtime || cp->c_touch_modtime)) { - if ((error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) - return (error); - hfs_touchtimes(hfsmp, cp); - - // downgrade to a shared lock since that's all we need from here on out - cp->c_lockowner = HFS_SHARED_OWNER; - lck_rw_lock_exclusive_to_shared(&cp->c_rwlock); - - } else if ((error = hfs_lock(cp, HFS_SHARED_LOCK, HFS_LOCK_DEFAULT))) { - return (error); - } - - if (v_type == VDIR) { - data_size = (cp->c_entries + 2) * AVERAGE_HFSDIRENTRY_SIZE; - - if (VATTR_IS_ACTIVE(vap, va_nlink)) { - int nlink; - - /* - * For directories, the va_nlink is esentially a count - * of the ".." references to a directory plus the "." - * reference and the directory itself. So for HFS+ this - * becomes the sub-directory count plus two. - * - * In the absence of a sub-directory count we use the - * directory's item count. This will be too high in - * most cases since it also includes files. - */ - if ((hfsmp->hfs_flags & HFS_FOLDERCOUNT) && - (cp->c_attr.ca_recflags & kHFSHasFolderCountMask)) - nlink = cp->c_attr.ca_dircount; /* implied ".." entries */ - else - nlink = cp->c_entries; - - /* Account for ourself and our "." entry */ - nlink += 2; - /* Hide our private directories. 
*/ - if (cp->c_cnid == kHFSRootFolderID) { - if (hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid != 0) { - --nlink; - } - if (hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid != 0) { - --nlink; - } - } - VATTR_RETURN(vap, va_nlink, (u_int64_t)nlink); - } - if (VATTR_IS_ACTIVE(vap, va_nchildren)) { - int entries; - - entries = cp->c_entries; - /* Hide our private files and directories. */ - if (cp->c_cnid == kHFSRootFolderID) { - if (hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid != 0) - --entries; - if (hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid != 0) - --entries; - if (hfsmp->jnl || ((hfsmp->vcbAtrb & kHFSVolumeJournaledMask) && (hfsmp->hfs_flags & HFS_READ_ONLY))) - entries -= 2; /* hide the journal files */ - } - VATTR_RETURN(vap, va_nchildren, entries); - } - /* - * The va_dirlinkcount is the count of real directory hard links. - * (i.e. its not the sum of the implied "." and ".." references) - */ - if (VATTR_IS_ACTIVE(vap, va_dirlinkcount)) { - VATTR_RETURN(vap, va_dirlinkcount, (uint32_t)cp->c_linkcount); - } - } else /* !VDIR */ { - data_size = VCTOF(vp, cp)->ff_size; - - VATTR_RETURN(vap, va_nlink, (u_int64_t)cp->c_linkcount); - if (VATTR_IS_ACTIVE(vap, va_data_alloc)) { - u_int64_t blocks; - -#if HFS_COMPRESSION - if (hide_size) { - VATTR_RETURN(vap, va_data_alloc, 0); - } else if (compressed) { - /* for compressed files, we report all allocated blocks as belonging to the data fork */ - blocks = cp->c_blocks; - VATTR_RETURN(vap, va_data_alloc, blocks * (u_int64_t)hfsmp->blockSize); - } - else -#endif - { - blocks = VCTOF(vp, cp)->ff_blocks; - VATTR_RETURN(vap, va_data_alloc, blocks * (u_int64_t)hfsmp->blockSize); - } - } - } - - /* conditional because 64-bit arithmetic can be expensive */ - if (VATTR_IS_ACTIVE(vap, va_total_size)) { - if (v_type == VDIR) { - VATTR_RETURN(vap, va_total_size, (cp->c_entries + 2) * AVERAGE_HFSDIRENTRY_SIZE); - } else { - u_int64_t total_size = ~0ULL; - struct cnode *rcp; -#if HFS_COMPRESSION - if (hide_size) { - /* we're 
hiding the size of this file, so just return 0 */ - total_size = 0; - } else if (compressed) { - if (uncompressed_size == -1) { - /* - * We failed to get the uncompressed size above, - * so we'll fall back to the standard path below - * since total_size is still -1 - */ - } else { - /* use the uncompressed size we fetched above */ - total_size = uncompressed_size; - } - } -#endif - if (total_size == ~0ULL) { - if (cp->c_datafork) { - total_size = cp->c_datafork->ff_size; - } - - if (cp->c_blocks - VTOF(vp)->ff_blocks) { - /* We deal with rsrc fork vnode iocount at the end of the function */ - error = hfs_vgetrsrc(hfsmp, vp, &rvp); - if (error) { - /* - * Note that we call hfs_vgetrsrc with error_on_unlinked - * set to FALSE. This is because we may be invoked via - * fstat() on an open-unlinked file descriptor and we must - * continue to support access to the rsrc fork until it disappears. - * The code at the end of this function will be - * responsible for releasing the iocount generated by - * hfs_vgetrsrc. This is because we can't drop the iocount - * without unlocking the cnode first. - */ - goto out; - } - - rcp = VTOC(rvp); - if (rcp && rcp->c_rsrcfork) { - total_size += rcp->c_rsrcfork->ff_size; - } - } - } - - VATTR_RETURN(vap, va_total_size, total_size); - } - } - if (VATTR_IS_ACTIVE(vap, va_total_alloc)) { - if (v_type == VDIR) { - VATTR_RETURN(vap, va_total_alloc, 0); - } else { - VATTR_RETURN(vap, va_total_alloc, (u_int64_t)cp->c_blocks * (u_int64_t)hfsmp->blockSize); - } - } - - /* - * If the VFS wants extended security data, and we know that we - * don't have any (because it never told us it was setting any) - * then we can return the supported bit and no data. If we do - * have extended security, we can just leave the bit alone and - * the VFS will use the fallback path to fetch it. 
- */ - if (VATTR_IS_ACTIVE(vap, va_acl)) { - if ((cp->c_attr.ca_recflags & kHFSHasSecurityMask) == 0) { - vap->va_acl = (kauth_acl_t) KAUTH_FILESEC_NONE; - VATTR_SET_SUPPORTED(vap, va_acl); - } - } - - vap->va_access_time.tv_sec = cp->c_atime; - vap->va_access_time.tv_nsec = 0; - vap->va_create_time.tv_sec = cp->c_itime; - vap->va_create_time.tv_nsec = 0; - vap->va_modify_time.tv_sec = cp->c_mtime; - vap->va_modify_time.tv_nsec = 0; - vap->va_change_time.tv_sec = cp->c_ctime; - vap->va_change_time.tv_nsec = 0; - vap->va_backup_time.tv_sec = cp->c_btime; - vap->va_backup_time.tv_nsec = 0; - - /* See if we need to emit the date added field to the user */ - if (VATTR_IS_ACTIVE(vap, va_addedtime)) { - u_int32_t dateadded = hfs_get_dateadded (cp); - if (dateadded) { - vap->va_addedtime.tv_sec = dateadded; - vap->va_addedtime.tv_nsec = 0; - VATTR_SET_SUPPORTED (vap, va_addedtime); - } - } - - /* XXX is this really a good 'optimal I/O size'? */ - vap->va_iosize = hfsmp->hfs_logBlockSize; - vap->va_uid = cp->c_uid; - vap->va_gid = cp->c_gid; - vap->va_mode = cp->c_mode; - vap->va_flags = cp->c_bsdflags; - - /* - * Exporting file IDs from HFS Plus: - * - * For "normal" files the c_fileid is the same value as the - * c_cnid. But for hard link files, they are different - the - * c_cnid belongs to the active directory entry (ie the link) - * and the c_fileid is for the actual inode (ie the data file). - * - * The stat call (getattr) uses va_fileid and the Carbon APIs, - * which are hardlink-ignorant, will ask for va_linkid. - */ - vap->va_fileid = (u_int64_t)cp->c_fileid; - /* - * We need to use the origin cache for both hardlinked files - * and directories. Hardlinked directories have multiple cnids - * and parents (one per link). Hardlinked files also have their - * own parents and link IDs separate from the indirect inode number. 
- * If we don't use the cache, we could end up vending the wrong ID - * because the cnode will only reflect the link that was looked up most recently. - */ - if (cp->c_flag & C_HARDLINK) { - vap->va_linkid = (u_int64_t)hfs_currentcnid(cp); - vap->va_parentid = (u_int64_t)hfs_currentparent(cp, /* have_lock: */ true); - } else { - vap->va_linkid = (u_int64_t)cp->c_cnid; - vap->va_parentid = (u_int64_t)cp->c_parentcnid; - } - vap->va_fsid = hfsmp->hfs_raw_dev; - vap->va_filerev = 0; - vap->va_encoding = cp->c_encoding; - vap->va_rdev = (v_type == VBLK || v_type == VCHR) ? cp->c_rdev : 0; -#if HFS_COMPRESSION - if (VATTR_IS_ACTIVE(vap, va_data_size)) { - if (hide_size) - vap->va_data_size = 0; - else if (compressed) { - if (uncompressed_size == -1) { - /* failed to get the uncompressed size above, so just return data_size */ - vap->va_data_size = data_size; - } else { - /* use the uncompressed size we fetched above */ - vap->va_data_size = uncompressed_size; - } - } else - vap->va_data_size = data_size; - VATTR_SET_SUPPORTED(vap, va_data_size); - } -#else - vap->va_data_size = data_size; - vap->va_supported |= VNODE_ATTR_va_data_size; -#endif - -#if CONFIG_PROTECT - if (VATTR_IS_ACTIVE(vap, va_dataprotect_class)) { - vap->va_dataprotect_class = cp->c_cpentry ? CP_CLASS(cp->c_cpentry->cp_pclass) : 0; - VATTR_SET_SUPPORTED(vap, va_dataprotect_class); - } -#endif - if (VATTR_IS_ACTIVE(vap, va_write_gencount)) { - if (ubc_is_mapped_writable(vp)) { - /* - * Return 0 to the caller to indicate the file may be - * changing. There is no need for us to increment the - * generation counter here because it gets done as part of - * page-out and also when the file is unmapped (to account - * for changes we might not have seen). - */ - vap->va_write_gencount = 0; - } else { - vap->va_write_gencount = hfs_get_gencount(cp); - } - - VATTR_SET_SUPPORTED(vap, va_write_gencount); - } - - /* Mark them all at once instead of individual VATTR_SET_SUPPORTED calls. 
*/ - vap->va_supported |= VNODE_ATTR_va_access_time | - VNODE_ATTR_va_create_time | VNODE_ATTR_va_modify_time | - VNODE_ATTR_va_change_time| VNODE_ATTR_va_backup_time | - VNODE_ATTR_va_iosize | VNODE_ATTR_va_uid | - VNODE_ATTR_va_gid | VNODE_ATTR_va_mode | - VNODE_ATTR_va_flags |VNODE_ATTR_va_fileid | - VNODE_ATTR_va_linkid | VNODE_ATTR_va_parentid | - VNODE_ATTR_va_fsid | VNODE_ATTR_va_filerev | - VNODE_ATTR_va_encoding | VNODE_ATTR_va_rdev; - - /* If this is the root, let VFS to find out the mount name, which - * may be different from the real name. Otherwise, we need to take care - * for hardlinked files, which need to be looked up, if necessary - */ - if (VATTR_IS_ACTIVE(vap, va_name) && (cp->c_cnid != kHFSRootFolderID)) { - struct cat_desc linkdesc; - int lockflags; - int uselinkdesc = 0; - cnid_t nextlinkid = 0; - cnid_t prevlinkid = 0; - - /* Get the name for ATTR_CMN_NAME. We need to take special care for hardlinks - * here because the info. for the link ID requested by getattrlist may be - * different than what's currently in the cnode. This is because the cnode - * will be filled in with the information for the most recent link ID that went - * through namei/lookup(). If there are competing lookups for hardlinks that point - * to the same inode, one (or more) getattrlists could be vended incorrect name information. - * Also, we need to beware of open-unlinked files which could have a namelen of 0. - */ - - if ((cp->c_flag & C_HARDLINK) && - ((cp->c_desc.cd_namelen == 0) || (vap->va_linkid != cp->c_cnid))) { - /* - * If we have no name and our link ID is the raw inode number, then we may - * have an open-unlinked file. Go to the next link in this case. 
- */ - if ((cp->c_desc.cd_namelen == 0) && (vap->va_linkid == cp->c_fileid)) { - if ((error = hfs_lookup_siblinglinks(hfsmp, vap->va_linkid, &prevlinkid, &nextlinkid))){ - goto out; - } - } - else { - /* just use link obtained from vap above */ - nextlinkid = vap->va_linkid; - } - - /* We need to probe the catalog for the descriptor corresponding to the link ID - * stored in nextlinkid. Note that we don't know if we have the exclusive lock - * for the cnode here, so we can't just update the descriptor. Instead, - * we should just store the descriptor's value locally and then use it to pass - * out the name value as needed below. - */ - if (nextlinkid){ - lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); - error = cat_findname(hfsmp, nextlinkid, &linkdesc); - hfs_systemfile_unlock(hfsmp, lockflags); - if (error == 0) { - uselinkdesc = 1; - } - } - } - - /* By this point, we've either patched up the name above and the c_desc - * points to the correct data, or it already did, in which case we just proceed - * by copying the name into the vap. Note that we will never set va_name to - * supported if nextlinkid is never initialized. This could happen in the degenerate - * case above involving the raw inode number, where it has no nextlinkid. In this case - * we will simply not mark the name bit as supported. - */ - if (uselinkdesc) { - strlcpy(vap->va_name, (const char*) linkdesc.cd_nameptr, MAXPATHLEN); - VATTR_SET_SUPPORTED(vap, va_name); - cat_releasedesc(&linkdesc); - } - else if (cp->c_desc.cd_namelen) { - strlcpy(vap->va_name, (const char*) cp->c_desc.cd_nameptr, MAXPATHLEN); - VATTR_SET_SUPPORTED(vap, va_name); - } - } - -out: - hfs_unlock(cp); - /* - * We need to vnode_put the rsrc fork vnode only *after* we've released - * the cnode lock, since vnode_put can trigger an inactive call, which - * will go back into HFS and try to acquire a cnode lock. 
- */ - if (rvp) { - vnode_put (rvp); - } - - return (error); -} - -int -hfs_vnop_setattr(ap) - struct vnop_setattr_args /* { - struct vnode *a_vp; - struct vnode_attr *a_vap; - vfs_context_t a_context; - } */ *ap; -{ - struct vnode_attr *vap = ap->a_vap; - struct vnode *vp = ap->a_vp; - struct cnode *cp = NULL; - struct hfsmount *hfsmp; - kauth_cred_t cred = vfs_context_ucred(ap->a_context); - struct proc *p = vfs_context_proc(ap->a_context); - int error = 0; - uid_t nuid; - gid_t ngid; - time_t orig_ctime; - - orig_ctime = VTOC(vp)->c_ctime; - -#if HFS_COMPRESSION - int decmpfs_reset_state = 0; - /* - we call decmpfs_update_attributes even if the file is not compressed - because we want to update the incoming flags if the xattrs are invalid - */ - error = decmpfs_update_attributes(vp, vap); - if (error) - return error; -#endif - // - // if this is not a size-changing setattr and it is not just - // an atime update, then check for a snapshot. - // - if (!VATTR_IS_ACTIVE(vap, va_data_size) && !(vap->va_active == VNODE_ATTR_va_access_time)) { - check_for_tracked_file(vp, orig_ctime, NAMESPACE_HANDLER_METADATA_MOD, NSPACE_REARM_NO_ARG); - } - -#if CONFIG_PROTECT - /* - * All metadata changes should be allowed except a size-changing setattr, which - * has effects on file content and requires calling into cp_handle_vnop - * to have content protection check. - */ - if (VATTR_IS_ACTIVE(vap, va_data_size)) { - if ((error = cp_handle_vnop(vp, CP_WRITE_ACCESS, 0)) != 0) { - return (error); - } - } -#endif /* CONFIG_PROTECT */ - - hfsmp = VTOHFS(vp); - - /* Don't allow modification of the journal. 
*/ - if (hfs_is_journal_file(hfsmp, VTOC(vp))) { - return (EPERM); - } - - // - // Check if we'll need a document_id and if so, get it before we lock the - // the cnode to avoid any possible deadlock with the root vnode which has - // to get locked to get the document id - // - u_int32_t document_id=0; - if (VATTR_IS_ACTIVE(vap, va_flags) && (vap->va_flags & UF_TRACKED) && !(VTOC(vp)->c_bsdflags & UF_TRACKED)) { - struct FndrExtendedDirInfo *fip = (struct FndrExtendedDirInfo *)((char *)&(VTOC(vp)->c_attr.ca_finderinfo) + 16); - // - // If the document_id is not set, get a new one. It will be set - // on the file down below once we hold the cnode lock. - // - if (fip->document_id == 0) { - if (hfs_generate_document_id(hfsmp, &document_id) != 0) { - document_id = 0; - } - } - } - - - /* - * File size change request. - * We are guaranteed that this is not a directory, and that - * the filesystem object is writeable. - * - * NOTE: HFS COMPRESSION depends on the data_size being set *before* the bsd flags are updated - */ - VATTR_SET_SUPPORTED(vap, va_data_size); - if (VATTR_IS_ACTIVE(vap, va_data_size) && !vnode_islnk(vp)) { -#if HFS_COMPRESSION - /* keep the compressed state locked until we're done truncating the file */ - decmpfs_cnode *dp = VTOCMP(vp); - if (!dp) { - /* - * call hfs_lazy_init_decmpfs_cnode() to make sure that the decmpfs_cnode - * is filled in; we need a decmpfs_cnode to lock out decmpfs state changes - * on this file while it's truncating - */ - dp = hfs_lazy_init_decmpfs_cnode(VTOC(vp)); - if (!dp) { - /* failed to allocate a decmpfs_cnode */ - return ENOMEM; /* what should this be? */ - } - } - - check_for_tracked_file(vp, orig_ctime, vap->va_data_size == 0 ? 
NAMESPACE_HANDLER_TRUNCATE_OP|NAMESPACE_HANDLER_DELETE_OP : NAMESPACE_HANDLER_TRUNCATE_OP, NULL); - - decmpfs_lock_compressed_data(dp, 1); - if (hfs_file_is_compressed(VTOC(vp), 1)) { - error = decmpfs_decompress_file(vp, dp, -1/*vap->va_data_size*/, 0, 1); - if (error != 0) { - decmpfs_unlock_compressed_data(dp, 1); - return error; - } - } -#endif - - // Take truncate lock - hfs_lock_truncate(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); - - // hfs_truncate will deal with the cnode lock - error = hfs_truncate(vp, vap->va_data_size, vap->va_vaflags & 0xffff, - 0, ap->a_context); - - hfs_unlock_truncate(VTOC(vp), HFS_LOCK_DEFAULT); -#if HFS_COMPRESSION - decmpfs_unlock_compressed_data(dp, 1); -#endif - if (error) - return error; - } - if (cp == NULL) { - if ((error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) - return (error); - cp = VTOC(vp); - } - - /* - * If it is just an access time update request by itself - * we know the request is from kernel level code, and we - * can delay it without being as worried about consistency. - * This change speeds up mmaps, in the rare case that they - * get caught behind a sync. - */ - - if (vap->va_active == VNODE_ATTR_va_access_time) { - cp->c_touch_acctime=TRUE; - goto out; - } - - - - /* - * Owner/group change request. - * We are guaranteed that the new owner/group is valid and legal. - */ - VATTR_SET_SUPPORTED(vap, va_uid); - VATTR_SET_SUPPORTED(vap, va_gid); - nuid = VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : (uid_t)VNOVAL; - ngid = VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : (gid_t)VNOVAL; - if (((nuid != (uid_t)VNOVAL) || (ngid != (gid_t)VNOVAL)) && - ((error = hfs_chown(vp, nuid, ngid, cred, p)) != 0)) - goto out; - - /* - * Mode change request. - * We are guaranteed that the mode value is valid and that in - * conjunction with the owner and group, this change is legal. 
- */ - VATTR_SET_SUPPORTED(vap, va_mode); - if (VATTR_IS_ACTIVE(vap, va_mode) && - ((error = hfs_chmod(vp, (int)vap->va_mode, cred, p)) != 0)) - goto out; - - /* - * File flags change. - * We are guaranteed that only flags allowed to change given the - * current securelevel are being changed. - */ - VATTR_SET_SUPPORTED(vap, va_flags); - if (VATTR_IS_ACTIVE(vap, va_flags)) { - u_int16_t *fdFlags; - -#if HFS_COMPRESSION - if ((cp->c_bsdflags ^ vap->va_flags) & UF_COMPRESSED) { - /* - * the UF_COMPRESSED was toggled, so reset our cached compressed state - * but we don't want to actually do the update until we've released the cnode lock down below - * NOTE: turning the flag off doesn't actually decompress the file, so that we can - * turn off the flag and look at the "raw" file for debugging purposes - */ - decmpfs_reset_state = 1; - } -#endif - if ((vap->va_flags & UF_TRACKED) && !(cp->c_bsdflags & UF_TRACKED)) { - struct FndrExtendedDirInfo *fip = (struct FndrExtendedDirInfo *)((char *)&cp->c_attr.ca_finderinfo + 16); - - // - // we're marking this item UF_TRACKED. if the document_id is - // not set, get a new one and put it on the file. 
- // - if (fip->document_id == 0) { - if (document_id != 0) { - // printf("SETATTR: assigning doc-id %d to %s (ino %d)\n", document_id, vp->v_name, cp->c_desc.cd_cnid); - fip->document_id = (uint32_t)document_id; -#if CONFIG_FSE - add_fsevent(FSE_DOCID_CHANGED, ap->a_context, - FSE_ARG_DEV, hfsmp->hfs_raw_dev, - FSE_ARG_INO, (ino64_t)0, // src inode # - FSE_ARG_INO, (ino64_t)cp->c_fileid, // dst inode # - FSE_ARG_INT32, document_id, - FSE_ARG_DONE); -#endif - } else { - // printf("hfs: could not acquire a new document_id for %s (ino %d)\n", vp->v_name, cp->c_desc.cd_cnid); - } - } - - } else if (!(vap->va_flags & UF_TRACKED) && (cp->c_bsdflags & UF_TRACKED)) { - // - // UF_TRACKED is being cleared so clear the document_id - // - struct FndrExtendedDirInfo *fip = (struct FndrExtendedDirInfo *)((char *)&cp->c_attr.ca_finderinfo + 16); - if (fip->document_id) { - // printf("SETATTR: clearing doc-id %d from %s (ino %d)\n", fip->document_id, vp->v_name, cp->c_desc.cd_cnid); -#if CONFIG_FSE - add_fsevent(FSE_DOCID_CHANGED, ap->a_context, - FSE_ARG_DEV, hfsmp->hfs_raw_dev, - FSE_ARG_INO, (ino64_t)cp->c_fileid, // src inode # - FSE_ARG_INO, (ino64_t)0, // dst inode # - FSE_ARG_INT32, fip->document_id, // document id - FSE_ARG_DONE); -#endif - fip->document_id = 0; - cp->c_bsdflags &= ~UF_TRACKED; - } - } - - cp->c_bsdflags = vap->va_flags; - cp->c_flag |= C_MODIFIED; - cp->c_touch_chgtime = TRUE; - - - /* - * Mirror the UF_HIDDEN flag to the invisible bit of the Finder Info. - * - * The fdFlags for files and frFlags for folders are both 8 bytes - * into the userInfo (the first 16 bytes of the Finder Info). They - * are both 16-bit fields. - */ - fdFlags = (u_int16_t *) &cp->c_finderinfo[8]; - if (vap->va_flags & UF_HIDDEN) - *fdFlags |= OSSwapHostToBigConstInt16(kFinderInvisibleMask); - else - *fdFlags &= ~OSSwapHostToBigConstInt16(kFinderInvisibleMask); - } - - /* - * Timestamp updates. 
- */ - VATTR_SET_SUPPORTED(vap, va_create_time); - VATTR_SET_SUPPORTED(vap, va_access_time); - VATTR_SET_SUPPORTED(vap, va_modify_time); - VATTR_SET_SUPPORTED(vap, va_backup_time); - VATTR_SET_SUPPORTED(vap, va_change_time); - if (VATTR_IS_ACTIVE(vap, va_create_time) || - VATTR_IS_ACTIVE(vap, va_access_time) || - VATTR_IS_ACTIVE(vap, va_modify_time) || - VATTR_IS_ACTIVE(vap, va_backup_time)) { - if (VATTR_IS_ACTIVE(vap, va_create_time)) - cp->c_itime = vap->va_create_time.tv_sec; - if (VATTR_IS_ACTIVE(vap, va_access_time)) { - cp->c_atime = vap->va_access_time.tv_sec; - cp->c_touch_acctime = FALSE; - } - if (VATTR_IS_ACTIVE(vap, va_modify_time)) { - cp->c_mtime = vap->va_modify_time.tv_sec; - cp->c_touch_modtime = FALSE; - cp->c_touch_chgtime = TRUE; - - hfs_clear_might_be_dirty_flag(cp); - - /* - * The utimes system call can reset the modification - * time but it doesn't know about HFS create times. - * So we need to ensure that the creation time is - * always at least as old as the modification time. - */ - if ((VTOVCB(vp)->vcbSigWord == kHFSPlusSigWord) && - (cp->c_cnid != kHFSRootFolderID) && - !VATTR_IS_ACTIVE(vap, va_create_time) && - (cp->c_mtime < cp->c_itime)) { - cp->c_itime = cp->c_mtime; - } - } - if (VATTR_IS_ACTIVE(vap, va_backup_time)) - cp->c_btime = vap->va_backup_time.tv_sec; - cp->c_flag |= C_MINOR_MOD; - } - - /* - * Set name encoding. - */ - VATTR_SET_SUPPORTED(vap, va_encoding); - if (VATTR_IS_ACTIVE(vap, va_encoding)) { - cp->c_encoding = vap->va_encoding; - cp->c_flag |= C_MODIFIED; - hfs_setencodingbits(hfsmp, cp->c_encoding); - } - - if ((error = hfs_update(vp, 0)) != 0) - goto out; -out: - if (cp) { - /* Purge origin cache for cnode, since caller now has correct link ID for it - * We purge it here since it was acquired for us during lookup, and we no longer need it. 
- */ - if ((cp->c_flag & C_HARDLINK) && (vp->v_type != VDIR)){ - hfs_relorigin(cp, 0); - } - - hfs_unlock(cp); -#if HFS_COMPRESSION - if (decmpfs_reset_state) { - /* - * we've changed the UF_COMPRESSED flag, so reset the decmpfs state for this cnode - * but don't do it while holding the hfs cnode lock - */ - decmpfs_cnode *dp = VTOCMP(vp); - if (!dp) { - /* - * call hfs_lazy_init_decmpfs_cnode() to make sure that the decmpfs_cnode - * is filled in; we need a decmpfs_cnode to prevent decmpfs state changes - * on this file if it's locked - */ - dp = hfs_lazy_init_decmpfs_cnode(VTOC(vp)); - if (!dp) { - /* failed to allocate a decmpfs_cnode */ - return ENOMEM; /* what should this be? */ - } - } - decmpfs_cnode_set_vnode_state(dp, FILE_TYPE_UNKNOWN, 0); - } -#endif - } - return (error); -} - - -/* - * Change the mode on a file. - * cnode must be locked before calling. - */ -int -hfs_chmod(struct vnode *vp, int mode, __unused kauth_cred_t cred, __unused struct proc *p) -{ - register struct cnode *cp = VTOC(vp); - - if (VTOVCB(vp)->vcbSigWord != kHFSPlusSigWord) - return (0); - - // Don't allow modification of the journal or journal_info_block - if (hfs_is_journal_file(VTOHFS(vp), cp)) { - return EPERM; - } - -#if OVERRIDE_UNKNOWN_PERMISSIONS - if (((unsigned int)vfs_flags(VTOVFS(vp))) & MNT_UNKNOWNPERMISSIONS) { - return (0); - }; -#endif - - mode_t new_mode = (cp->c_mode & ~ALLPERMS) | (mode & ALLPERMS); - if (new_mode != cp->c_mode) { - cp->c_mode = new_mode; - cp->c_flag |= C_MINOR_MOD; - } - cp->c_touch_chgtime = TRUE; - return (0); -} - - -int -hfs_write_access(struct vnode *vp, kauth_cred_t cred, struct proc *p, Boolean considerFlags) -{ - struct cnode *cp = VTOC(vp); - int retval = 0; - int is_member; - - /* - * Disallow write attempts on read-only file systems; - * unless the file is a socket, fifo, or a block or - * character device resident on the file system. 
- */ - switch (vnode_vtype(vp)) { - case VDIR: - case VLNK: - case VREG: - if (VTOHFS(vp)->hfs_flags & HFS_READ_ONLY) - return (EROFS); - break; - default: - break; - } - - /* If immutable bit set, nobody gets to write it. */ - if (considerFlags && (cp->c_bsdflags & IMMUTABLE)) - return (EPERM); - - /* Otherwise, user id 0 always gets access. */ - if (!suser(cred, NULL)) - return (0); - - /* Otherwise, check the owner. */ - if ((retval = hfs_owner_rights(VTOHFS(vp), cp->c_uid, cred, p, false)) == 0) - return ((cp->c_mode & S_IWUSR) == S_IWUSR ? 0 : EACCES); - - /* Otherwise, check the groups. */ - if (kauth_cred_ismember_gid(cred, cp->c_gid, &is_member) == 0 && is_member) { - return ((cp->c_mode & S_IWGRP) == S_IWGRP ? 0 : EACCES); - } - - /* Otherwise, check everyone else. */ - return ((cp->c_mode & S_IWOTH) == S_IWOTH ? 0 : EACCES); -} - - -/* - * Perform chown operation on cnode cp; - * code must be locked prior to call. - */ -int -#if !QUOTA -hfs_chown(struct vnode *vp, uid_t uid, gid_t gid, __unused kauth_cred_t cred, - __unused struct proc *p) -#else -hfs_chown(struct vnode *vp, uid_t uid, gid_t gid, kauth_cred_t cred, - __unused struct proc *p) -#endif -{ - register struct cnode *cp = VTOC(vp); - uid_t ouid; - gid_t ogid; -#if QUOTA - int error = 0; - register int i; - int64_t change; -#endif /* QUOTA */ - - if (VTOVCB(vp)->vcbSigWord != kHFSPlusSigWord) - return (ENOTSUP); - - if (((unsigned int)vfs_flags(VTOVFS(vp))) & MNT_UNKNOWNPERMISSIONS) - return (0); - - if (uid == (uid_t)VNOVAL) - uid = cp->c_uid; - if (gid == (gid_t)VNOVAL) - gid = cp->c_gid; - -#if 0 /* we are guaranteed that this is already the case */ - /* - * If we don't own the file, are trying to change the owner - * of the file, or are not a member of the target group, - * the caller must be superuser or the call fails. 
- */ - if ((kauth_cred_getuid(cred) != cp->c_uid || uid != cp->c_uid || - (gid != cp->c_gid && - (kauth_cred_ismember_gid(cred, gid, &is_member) || !is_member))) && - (error = suser(cred, 0))) - return (error); -#endif - - ogid = cp->c_gid; - ouid = cp->c_uid; - - if (ouid == uid && ogid == gid) { - // No change, just set change time - cp->c_touch_chgtime = TRUE; - return 0; - } - -#if QUOTA - if ((error = hfs_getinoquota(cp))) - return (error); - if (ouid == uid) { - dqrele(cp->c_dquot[USRQUOTA]); - cp->c_dquot[USRQUOTA] = NODQUOT; - } - if (ogid == gid) { - dqrele(cp->c_dquot[GRPQUOTA]); - cp->c_dquot[GRPQUOTA] = NODQUOT; - } - - /* - * Eventually need to account for (fake) a block per directory - * if (vnode_isdir(vp)) - * change = VTOHFS(vp)->blockSize; - * else - */ - - change = (int64_t)(cp->c_blocks) * (int64_t)VTOVCB(vp)->blockSize; - (void) hfs_chkdq(cp, -change, cred, CHOWN); - (void) hfs_chkiq(cp, -1, cred, CHOWN); - for (i = 0; i < MAXQUOTAS; i++) { - dqrele(cp->c_dquot[i]); - cp->c_dquot[i] = NODQUOT; - } -#endif /* QUOTA */ - cp->c_gid = gid; - cp->c_uid = uid; -#if QUOTA - if ((error = hfs_getinoquota(cp)) == 0) { - if (ouid == uid) { - dqrele(cp->c_dquot[USRQUOTA]); - cp->c_dquot[USRQUOTA] = NODQUOT; - } - if (ogid == gid) { - dqrele(cp->c_dquot[GRPQUOTA]); - cp->c_dquot[GRPQUOTA] = NODQUOT; - } - if ((error = hfs_chkdq(cp, change, cred, CHOWN)) == 0) { - if ((error = hfs_chkiq(cp, 1, cred, CHOWN)) == 0) - goto good; - else - (void) hfs_chkdq(cp, -change, cred, CHOWN|FORCE); - } - for (i = 0; i < MAXQUOTAS; i++) { - dqrele(cp->c_dquot[i]); - cp->c_dquot[i] = NODQUOT; - } - } - cp->c_gid = ogid; - cp->c_uid = ouid; - if (hfs_getinoquota(cp) == 0) { - if (ouid == uid) { - dqrele(cp->c_dquot[USRQUOTA]); - cp->c_dquot[USRQUOTA] = NODQUOT; - } - if (ogid == gid) { - dqrele(cp->c_dquot[GRPQUOTA]); - cp->c_dquot[GRPQUOTA] = NODQUOT; - } - (void) hfs_chkdq(cp, change, cred, FORCE|CHOWN); - (void) hfs_chkiq(cp, 1, cred, FORCE|CHOWN); - (void) 
hfs_getinoquota(cp); - } - return (error); -good: - if (hfs_getinoquota(cp)) - panic("hfs_chown: lost quota"); -#endif /* QUOTA */ - - /* - * Without quotas, we could probably make this a minor - * modification. - */ - cp->c_flag |= C_MODIFIED; - - /* - According to the SUSv3 Standard, chown() shall mark - for update the st_ctime field of the file. - (No exceptions mentioned) - */ - cp->c_touch_chgtime = TRUE; - return (0); -} - -#if HFS_COMPRESSION -/* - * Flush the resource fork if it exists. vp is the data fork and has - * an iocount. - */ -static int hfs_flush_rsrc(vnode_t vp, vfs_context_t ctx) -{ - cnode_t *cp = VTOC(vp); - - hfs_lock(cp, HFS_SHARED_LOCK, 0); - - vnode_t rvp = cp->c_rsrc_vp; - - if (!rvp) { - hfs_unlock(cp); - return 0; - } - - int vid = vnode_vid(rvp); - - hfs_unlock(cp); - - int error = vnode_getwithvid(rvp, vid); - - if (error) - return error == ENOENT ? 0 : error; - - hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, 0); - hfs_lock_always(cp, HFS_EXCLUSIVE_LOCK); - hfs_filedone(rvp, ctx, HFS_FILE_DONE_NO_SYNC); - hfs_unlock(cp); - hfs_unlock_truncate(cp, 0); - - error = ubc_msync(rvp, 0, ubc_getsize(rvp), NULL, - UBC_PUSHALL | UBC_SYNC); - - vnode_put(rvp); - - return error; -} -#endif // HFS_COMPRESSION - -/* - * hfs_vnop_exchange: - * - * Inputs: - * 'from' vnode/cnode - * 'to' vnode/cnode - * options flag bits - * vfs_context - * - * Discussion: - * hfs_vnop_exchange is used to service the exchangedata(2) system call. - * Per the requirements of that system call, this function "swaps" some - * of the information that lives in one catalog record for some that - * lives in another. Note that not everything is swapped; in particular, - * the extent information stored in each cnode is kept local to that - * cnode. This allows existing file descriptor references to continue - * to operate on the same content, regardless of the location in the - * namespace that the file may have moved to. 
See inline comments - * in the function for more information. - */ -int -hfs_vnop_exchange(ap) - struct vnop_exchange_args /* { - struct vnode *a_fvp; - struct vnode *a_tvp; - int a_options; - vfs_context_t a_context; - } */ *ap; -{ - struct vnode *from_vp = ap->a_fvp; - struct vnode *to_vp = ap->a_tvp; - struct cnode *from_cp; - struct cnode *to_cp; - struct hfsmount *hfsmp; - struct cat_desc tempdesc; - struct cat_attr tempattr; - const unsigned char *from_nameptr; - const unsigned char *to_nameptr; - char from_iname[32]; - char to_iname[32]; - uint32_t to_flag_special; - uint32_t from_flag_special; - cnid_t from_parid; - cnid_t to_parid; - int lockflags; - int error = 0, started_tr = 0, got_cookie = 0; - cat_cookie_t cookie; - time_t orig_from_ctime, orig_to_ctime; - bool have_cnode_locks = false, have_from_trunc_lock = false, have_to_trunc_lock = false; - - /* - * VFS does the following checks: - * 1. Validate that both are files. - * 2. Validate that both are on the same mount. - * 3. Validate that they're not the same vnode. - */ - - from_cp = VTOC(from_vp); - to_cp = VTOC(to_vp); - hfsmp = VTOHFS(from_vp); - - orig_from_ctime = from_cp->c_ctime; - orig_to_ctime = to_cp->c_ctime; - -#if CONFIG_PROTECT - /* - * Do not allow exchangedata/F_MOVEDATAEXTENTS on data-protected filesystems - * because the EAs will not be swapped. As a result, the persistent keys would not - * match and the files will be garbage. - */ - if (cp_fs_protected (vnode_mount(from_vp))) { - return EINVAL; - } -#endif - -#if HFS_COMPRESSION - if (!ISSET(ap->a_options, FSOPT_EXCHANGE_DATA_ONLY)) { - if ( hfs_file_is_compressed(from_cp, 0) ) { - if ( 0 != ( error = decmpfs_decompress_file(from_vp, VTOCMP(from_vp), -1, 0, 1) ) ) { - return error; - } - } - - if ( hfs_file_is_compressed(to_cp, 0) ) { - if ( 0 != ( error = decmpfs_decompress_file(to_vp, VTOCMP(to_vp), -1, 0, 1) ) ) { - return error; - } - } - } -#endif // HFS_COMPRESSION - - // Resource forks cannot be exchanged. 
- if (VNODE_IS_RSRC(from_vp) || VNODE_IS_RSRC(to_vp)) - return EINVAL; - - /* - * Normally, we want to notify the user handlers about the event, - * except if it's a handler driving the event. - */ - if ((ap->a_options & FSOPT_EXCHANGE_DATA_ONLY) == 0) { - check_for_tracked_file(from_vp, orig_from_ctime, NAMESPACE_HANDLER_WRITE_OP, NULL); - check_for_tracked_file(to_vp, orig_to_ctime, NAMESPACE_HANDLER_WRITE_OP, NULL); - } else { - /* - * This is currently used by mtmd so we should tidy up the - * file now because the data won't be used again in the - * destination file. - */ - hfs_lock_truncate(from_cp, HFS_EXCLUSIVE_LOCK, 0); - hfs_lock_always(from_cp, HFS_EXCLUSIVE_LOCK); - hfs_filedone(from_vp, ap->a_context, HFS_FILE_DONE_NO_SYNC); - hfs_unlock(from_cp); - hfs_unlock_truncate(from_cp, 0); - - // Flush all the data from the source file - error = ubc_msync(from_vp, 0, ubc_getsize(from_vp), NULL, - UBC_PUSHALL | UBC_SYNC); - if (error) - goto exit; - -#if HFS_COMPRESSION - /* - * If this is a compressed file, we need to do the same for - * the resource fork. - */ - if (ISSET(from_cp->c_bsdflags, UF_COMPRESSED)) { - error = hfs_flush_rsrc(from_vp, ap->a_context); - if (error) - goto exit; - } -#endif - - /* - * We're doing a data-swap so we need to take the truncate - * lock exclusively. We need an exclusive lock because we - * will be completely truncating the source file and we must - * make sure nobody else sneaks in and trys to issue I/O - * whilst we don't have the cnode lock. - * - * After taking the truncate lock we do a quick check to - * verify there are no other references (including mmap - * references), but we must remember that this does not stop - * anybody coming in later and taking a reference. We will - * have the truncate lock exclusively so that will prevent - * them from issuing any I/O. 
- */ - - if (to_cp < from_cp) { - hfs_lock_truncate(to_cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); - have_to_trunc_lock = true; - } - - hfs_lock_truncate(from_cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); - have_from_trunc_lock = true; - - /* - * Do an early check to verify the source is not in use by - * anyone. We should be called from an FD opened as F_EVTONLY - * so that doesn't count as a reference. - */ - if (vnode_isinuse(from_vp, 0)) { - error = EBUSY; - goto exit; - } - - if (to_cp >= from_cp) { - hfs_lock_truncate(to_cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); - have_to_trunc_lock = true; - } - } - - if ((error = hfs_lockpair(from_cp, to_cp, HFS_EXCLUSIVE_LOCK))) - goto exit; - have_cnode_locks = true; - - // Don't allow modification of the journal or journal_info_block - if (hfs_is_journal_file(hfsmp, from_cp) || - hfs_is_journal_file(hfsmp, to_cp)) { - error = EPERM; - goto exit; - } - - /* - * Ok, now that all of the pre-flighting is done, call the underlying - * function if needed. - */ - if (ISSET(ap->a_options, FSOPT_EXCHANGE_DATA_ONLY)) { -#if HFS_COMPRESSION - if (ISSET(from_cp->c_bsdflags, UF_COMPRESSED)) { - error = hfs_move_compressed(from_cp, to_cp); - goto exit; - } -#endif - - error = hfs_move_data(from_cp, to_cp, 0); - goto exit; - } - - if ((error = hfs_start_transaction(hfsmp)) != 0) { - goto exit; - } - started_tr = 1; - - /* - * Reserve some space in the Catalog file. - */ - if ((error = cat_preflight(hfsmp, CAT_EXCHANGE, &cookie, vfs_context_proc(ap->a_context)))) { - goto exit; - } - got_cookie = 1; - - /* The backend code always tries to delete the virtual - * extent id for exchanging files so we need to lock - * the extents b-tree. - */ - lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG | SFL_EXTENTS | SFL_ATTRIBUTE, HFS_EXCLUSIVE_LOCK); - - /* Account for the location of the catalog objects. 
*/ - if (from_cp->c_flag & C_HARDLINK) { - MAKE_INODE_NAME(from_iname, sizeof(from_iname), - from_cp->c_attr.ca_linkref); - from_nameptr = (unsigned char *)from_iname; - from_parid = hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid; - from_cp->c_hint = 0; - } else { - from_nameptr = from_cp->c_desc.cd_nameptr; - from_parid = from_cp->c_parentcnid; - } - if (to_cp->c_flag & C_HARDLINK) { - MAKE_INODE_NAME(to_iname, sizeof(to_iname), - to_cp->c_attr.ca_linkref); - to_nameptr = (unsigned char *)to_iname; - to_parid = hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid; - to_cp->c_hint = 0; - } else { - to_nameptr = to_cp->c_desc.cd_nameptr; - to_parid = to_cp->c_parentcnid; - } - - /* - * ExchangeFileIDs swaps the on-disk, or in-BTree extent information - * attached to two different file IDs. It also swaps the extent - * information that may live in the extents-overflow B-Tree. - * - * We do this in a transaction as this may require a lot of B-Tree nodes - * to do completely, particularly if one of the files in question - * has a lot of extents. - * - * For example, assume "file1" has fileID 50, and "file2" has fileID 52. - * For the on-disk records, which are assumed to be synced, we will - * first swap the resident inline-8 extents as part of the catalog records. - * Then we will swap any extents overflow records for each file. - * - * When ExchangeFileIDs returns successfully, "file1" will have fileID 52, - * and "file2" will have fileID 50. However, note that this is only - * approximately half of the work that exchangedata(2) will need to - * accomplish. In other words, we swap "too much" of the information - * because if we only called ExchangeFileIDs, both the fileID and extent - * information would be the invariants of this operation. We don't - * actually want that; we want to conclude with "file1" having - * file ID 50, and "file2" having fileID 52. 
- * - * The remainder of hfs_vnop_exchange will swap the file ID and other cnode - * data back to the proper ownership, while still allowing the cnode to remain - * pointing at the same set of extents that it did originally. - */ - error = ExchangeFileIDs(hfsmp, from_nameptr, to_nameptr, from_parid, - to_parid, from_cp->c_hint, to_cp->c_hint); - hfs_systemfile_unlock(hfsmp, lockflags); - - /* - * Note that we don't need to exchange any extended attributes - * since the attributes are keyed by file ID. - */ - - if (error != E_NONE) { - error = MacToVFSError(error); - goto exit; - } - - /* Purge the vnodes from the name cache */ - if (from_vp) - cache_purge(from_vp); - if (to_vp) - cache_purge(to_vp); - - /* Bump both source and destination write counts before any swaps. */ - { - hfs_incr_gencount (from_cp); - hfs_incr_gencount (to_cp); - } - - /* Save a copy of "from" attributes before swapping. */ - bcopy(&from_cp->c_desc, &tempdesc, sizeof(struct cat_desc)); - bcopy(&from_cp->c_attr, &tempattr, sizeof(struct cat_attr)); - - /* Save whether or not each cnode is a hardlink or has EAs */ - from_flag_special = from_cp->c_flag & (C_HARDLINK | C_HASXATTRS); - to_flag_special = to_cp->c_flag & (C_HARDLINK | C_HASXATTRS); - - /* Drop the special bits from each cnode */ - from_cp->c_flag &= ~(C_HARDLINK | C_HASXATTRS); - to_cp->c_flag &= ~(C_HARDLINK | C_HASXATTRS); - - /* - * Now complete the in-memory portion of the copy. - * - * ExchangeFileIDs swaps the on-disk records involved. We complete the - * operation by swapping the in-memory contents of the two files here. - * We swap the cnode descriptors, which contain name, BSD attributes, - * timestamps, etc, about the file. - * - * NOTE: We do *NOT* swap the fileforks of the two cnodes. We have - * already swapped the on-disk extent information. 
As long as we swap the - * IDs, the in-line resident 8 extents that live in the filefork data - * structure will point to the right data for the new file ID if we leave - * them alone. - * - * As a result, any file descriptor that points to a particular - * vnode (even though it should change names), will continue - * to point to the same content. - */ - - /* Copy the "to" -> "from" cnode */ - bcopy(&to_cp->c_desc, &from_cp->c_desc, sizeof(struct cat_desc)); - - from_cp->c_hint = 0; - /* - * If 'to' was a hardlink, then we copied over its link ID/CNID/(namespace ID) - * when we bcopy'd the descriptor above. However, the cnode attributes - * are not bcopied. As a result, make sure to swap the file IDs of each item. - * - * Further, other hardlink attributes must be moved along in this swap: - * the linkcount, the linkref, and the firstlink all need to move - * along with the file IDs. See note below regarding the flags and - * what moves vs. what does not. - * - * For Reference: - * linkcount == total # of hardlinks. - * linkref == the indirect inode pointer. - * firstlink == the first hardlink in the chain (written to the raw inode). - * These three are tied to the fileID and must move along with the rest of the data. - */ - from_cp->c_fileid = to_cp->c_attr.ca_fileid; - - from_cp->c_itime = to_cp->c_itime; - from_cp->c_btime = to_cp->c_btime; - from_cp->c_atime = to_cp->c_atime; - from_cp->c_ctime = to_cp->c_ctime; - from_cp->c_gid = to_cp->c_gid; - from_cp->c_uid = to_cp->c_uid; - from_cp->c_bsdflags = to_cp->c_bsdflags; - from_cp->c_mode = to_cp->c_mode; - from_cp->c_linkcount = to_cp->c_linkcount; - from_cp->c_attr.ca_linkref = to_cp->c_attr.ca_linkref; - from_cp->c_attr.ca_firstlink = to_cp->c_attr.ca_firstlink; - - /* - * The cnode flags need to stay with the cnode and not get transferred - * over along with everything else because they describe the content; they are - * not attributes that reflect changes specific to the file ID. 
In general, - * fields that are tied to the file ID are the ones that will move. - * - * This reflects the fact that the file may have borrowed blocks, dirty metadata, - * or other extents, which may not yet have been written to the catalog. If - * they were, they would have been transferred above in the ExchangeFileIDs call above... - * - * The flags that are special are: - * C_HARDLINK, C_HASXATTRS - * - * These flags move with the item and file ID in the namespace since their - * state is tied to that of the file ID. - * - * So to transfer the flags, we have to take the following steps - * 1) Store in a localvar whether or not the special bits are set. - * 2) Drop the special bits from the current flags - * 3) swap the special flag bits to their destination - */ - from_cp->c_flag |= to_flag_special | C_MODIFIED; - from_cp->c_attr.ca_recflags = to_cp->c_attr.ca_recflags; - bcopy(to_cp->c_finderinfo, from_cp->c_finderinfo, 32); - - - /* Copy the "from" -> "to" cnode */ - bcopy(&tempdesc, &to_cp->c_desc, sizeof(struct cat_desc)); - to_cp->c_hint = 0; - /* - * Pull the file ID from the tempattr we copied above. We can't assume - * it is the same as the CNID. - */ - to_cp->c_fileid = tempattr.ca_fileid; - to_cp->c_itime = tempattr.ca_itime; - to_cp->c_btime = tempattr.ca_btime; - to_cp->c_atime = tempattr.ca_atime; - to_cp->c_ctime = tempattr.ca_ctime; - to_cp->c_gid = tempattr.ca_gid; - to_cp->c_uid = tempattr.ca_uid; - to_cp->c_bsdflags = tempattr.ca_flags; - to_cp->c_mode = tempattr.ca_mode; - to_cp->c_linkcount = tempattr.ca_linkcount; - to_cp->c_attr.ca_linkref = tempattr.ca_linkref; - to_cp->c_attr.ca_firstlink = tempattr.ca_firstlink; - - /* - * Only OR in the "from" flags into our cnode flags below. - * Leave the rest of the flags alone. 
- */ - to_cp->c_flag |= from_flag_special | C_MODIFIED; - - to_cp->c_attr.ca_recflags = tempattr.ca_recflags; - bcopy(tempattr.ca_finderinfo, to_cp->c_finderinfo, 32); - - - /* Rehash the cnodes using their new file IDs */ - hfs_chash_rehash(hfsmp, from_cp, to_cp); - - /* - * When a file moves out of "Cleanup At Startup" - * we can drop its NODUMP status. - */ - if ((from_cp->c_bsdflags & UF_NODUMP) && - (from_cp->c_parentcnid != to_cp->c_parentcnid)) { - from_cp->c_bsdflags &= ~UF_NODUMP; - from_cp->c_touch_chgtime = TRUE; - } - if ((to_cp->c_bsdflags & UF_NODUMP) && - (to_cp->c_parentcnid != from_cp->c_parentcnid)) { - to_cp->c_bsdflags &= ~UF_NODUMP; - to_cp->c_touch_chgtime = TRUE; - } - -exit: - if (got_cookie) { - cat_postflight(hfsmp, &cookie, vfs_context_proc(ap->a_context)); - } - if (started_tr) { - hfs_end_transaction(hfsmp); - } - - if (have_cnode_locks) - hfs_unlockpair(from_cp, to_cp); - - if (have_from_trunc_lock) - hfs_unlock_truncate(from_cp, 0); - - if (have_to_trunc_lock) - hfs_unlock_truncate(to_cp, 0); - - return (error); -} - -#if HFS_COMPRESSION -/* - * This function is used specifically for the case when a namespace - * handler is trying to steal data before it's deleted. Note that we - * don't bother deleting the xattr from the source because it will get - * deleted a short time later anyway. - * - * cnodes must be locked - */ -static int hfs_move_compressed(cnode_t *from_cp, cnode_t *to_cp) -{ - int ret; - void *data = NULL; - - CLR(from_cp->c_bsdflags, UF_COMPRESSED); - SET(from_cp->c_flag, C_MODIFIED); - - ret = hfs_move_data(from_cp, to_cp, HFS_MOVE_DATA_INCLUDE_RSRC); - if (ret) - goto exit; - - /* - * Transfer the xattr that decmpfs uses. Ideally, this code - * should be with the other decmpfs code but it's file system - * agnostic and this path is currently, and likely to remain, HFS+ - * specific. It's easier and more performant if we implement it - * here. 
- */ - - size_t size = MAX_DECMPFS_XATTR_SIZE; - MALLOC(data, void *, size, M_TEMP, M_WAITOK); - - ret = hfs_xattr_read(from_cp->c_vp, DECMPFS_XATTR_NAME, data, &size); - if (ret) - goto exit; - - ret = hfs_xattr_write(to_cp->c_vp, DECMPFS_XATTR_NAME, data, size); - if (ret) - goto exit; - - SET(to_cp->c_bsdflags, UF_COMPRESSED); - SET(to_cp->c_flag, C_MODIFIED); - -exit: - if (data) - FREE(data, M_TEMP); - - return ret; -} -#endif // HFS_COMPRESSION - -int -hfs_vnop_mmap(struct vnop_mmap_args *ap) -{ - struct vnode *vp = ap->a_vp; - cnode_t *cp = VTOC(vp); - int error; - - if (VNODE_IS_RSRC(vp)) { - /* allow pageins of the resource fork */ - } else { - int compressed = hfs_file_is_compressed(cp, 1); /* 1 == don't take the cnode lock */ - time_t orig_ctime = cp->c_ctime; - - if (!compressed && (cp->c_bsdflags & UF_COMPRESSED)) { - error = check_for_dataless_file(vp, NAMESPACE_HANDLER_READ_OP); - if (error != 0) { - return error; - } - } - - if (ap->a_fflags & PROT_WRITE) { - check_for_tracked_file(vp, orig_ctime, NAMESPACE_HANDLER_WRITE_OP, NULL); - } - } - - // - // NOTE: we return ENOTSUP because we want the cluster layer - // to actually do all the real work. - // - return (ENOTSUP); -} - -static errno_t hfs_vnop_mnomap(struct vnop_mnomap_args *ap) -{ - vnode_t vp = ap->a_vp; - - /* - * Whilst the file was mapped, there may not have been any - * page-outs so we need to increment the generation counter now. - * Unfortunately this may lead to a change in the generation - * counter when no actual change has been made, but there is - * little we can do about that with our current architecture. - */ - if (ubc_is_mapped_writable(vp)) { - cnode_t *cp = VTOC(vp); - hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); - hfs_incr_gencount(cp); - - /* - * We don't want to set the modification time here since a - * change to that is not acceptable if no changes were made. 
- * Instead we set a flag so that if we get any page-outs we - * know to update the modification time. It's possible that - * they weren't actually because of changes made whilst the - * file was mapped but that's not easy to fix now. - */ - SET(cp->c_flag, C_MIGHT_BE_DIRTY_FROM_MAPPING); - - hfs_unlock(cp); - } - - return 0; -} - -/* - * Mark the resource fork as needing a ubc_setsize when we drop the - * cnode lock later. - */ -static void hfs_rsrc_setsize(cnode_t *cp) -{ - /* - * We need to take an iocount if we don't have one. vnode_get - * will return ENOENT if the vnode is terminating which is what we - * want as it's not safe to call ubc_setsize in that case. - */ - if (cp->c_rsrc_vp && !vnode_get(cp->c_rsrc_vp)) { - // Shouldn't happen, but better safe... - if (ISSET(cp->c_flag, C_NEED_RVNODE_PUT)) - vnode_put(cp->c_rsrc_vp); - SET(cp->c_flag, C_NEED_RVNODE_PUT | C_NEED_RSRC_SETSIZE); - } -} - -/* - * hfs_move_data - * - * This is a non-symmetric variant of exchangedata. In this function, - * the contents of the data fork (and optionally the resource fork) - * are moved from from_cp to to_cp. - * - * The cnodes must be locked. - * - * The cnode pointed to by 'to_cp' *must* be empty prior to invoking - * this function. We impose this restriction because we may not be - * able to fully delete the entire file's contents in a single - * transaction, particularly if it has a lot of extents. In the - * normal file deletion codepath, the file is screened for two - * conditions: 1) bigger than 400MB, and 2) more than 8 extents. If - * so, the file is relocated to the hidden directory and the deletion - * is broken up into multiple truncates. We can't do that here - * because both files need to exist in the namespace. The main reason - * this is imposed is that we may have to touch a whole lot of bitmap - * blocks if there are many extents. - * - * Any data written to 'from_cp' after this call completes is not - * guaranteed to be moved. 
- * - * Arguments: - * cnode_t *from_cp : source file - * cnode_t *to_cp : destination file; must be empty - * - * Returns: - * - * EBUSY - File has been deleted or is in use - * EFBIG - Destination file was not empty - * EIO - An I/O error - * 0 - success - * other - Other errors that can be returned from called functions - */ -int hfs_move_data(cnode_t *from_cp, cnode_t *to_cp, - hfs_move_data_options_t options) -{ - hfsmount_t *hfsmp = VTOHFS(from_cp->c_vp); - int error = 0; - int lockflags = 0; - bool return_EIO_on_error = false; - const bool include_rsrc = ISSET(options, HFS_MOVE_DATA_INCLUDE_RSRC); - - /* Verify that neither source/dest file is open-unlinked */ - if (ISSET(from_cp->c_flag, C_DELETED | C_NOEXISTS) - || ISSET(to_cp->c_flag, C_DELETED | C_NOEXISTS)) { - return EBUSY; - } - - /* - * Verify the source file is not in use by anyone besides us. - * - * This function is typically invoked by a namespace handler - * process responding to a temporarily stalled system call. - * The FD that it is working off of is opened O_EVTONLY, so - * it really has no active usecounts (the kusecount from O_EVTONLY - * is subtracted from the total usecounts). - * - * As a result, we shouldn't have any active usecounts against - * this vnode when we go to check it below. - */ - if (vnode_isinuse(from_cp->c_vp, 0)) - return EBUSY; - - if (include_rsrc && from_cp->c_rsrc_vp) { - if (vnode_isinuse(from_cp->c_rsrc_vp, 0)) - return EBUSY; - - /* - * In the code below, if the destination file doesn't have a - * c_rsrcfork then we don't create it which means we we cannot - * transfer the ff_invalidranges and cf_vblocks fields. These - * shouldn't be set because we flush the resource fork before - * calling this function but there is a tiny window when we - * did not have any locks... 
- */ - if (!to_cp->c_rsrcfork - && (!TAILQ_EMPTY(&from_cp->c_rsrcfork->ff_invalidranges) - || from_cp->c_rsrcfork->ff_unallocblocks)) { - /* - * The file isn't really busy now but something did slip - * in and tinker with the file while we didn't have any - * locks, so this is the most meaningful return code for - * the caller. - */ - return EBUSY; - } - } - - // Check the destination file is empty - if (to_cp->c_datafork->ff_blocks - || to_cp->c_datafork->ff_size - || (include_rsrc - && (to_cp->c_blocks - || (to_cp->c_rsrcfork && to_cp->c_rsrcfork->ff_size)))) { - return EFBIG; - } - - if ((error = hfs_start_transaction (hfsmp))) - return error; - - lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG | SFL_EXTENTS | SFL_ATTRIBUTE, - HFS_EXCLUSIVE_LOCK); - - // filefork_t is 128 bytes which should be OK - filefork_t rfork_buf, *from_rfork = NULL; - - if (include_rsrc) { - from_rfork = from_cp->c_rsrcfork; - - /* - * Creating resource fork vnodes is expensive, so just get get - * the fork data if we need it. - */ - if (!from_rfork && hfs_has_rsrc(from_cp)) { - from_rfork = &rfork_buf; - - from_rfork->ff_cp = from_cp; - TAILQ_INIT(&from_rfork->ff_invalidranges); - - error = cat_idlookup(hfsmp, from_cp->c_fileid, 0, 1, NULL, NULL, - &from_rfork->ff_data); - - if (error) - goto exit; - } - } - - /* - * From here on, any failures mean that we might be leaving things - * in a weird or inconsistent state. Ideally, we should back out - * all the changes, but to do that properly we need to fix - * MoveData. We'll save fixing that for another time. For now, - * just return EIO in all cases to the caller so that they know. 
- */ - return_EIO_on_error = true; - - bool data_overflow_extents = overflow_extents(from_cp->c_datafork); - - // Move the data fork - if ((error = hfs_move_fork (from_cp->c_datafork, from_cp, - to_cp->c_datafork, to_cp))) { - goto exit; - } - - SET(from_cp->c_flag, C_NEED_DATA_SETSIZE); - SET(to_cp->c_flag, C_NEED_DATA_SETSIZE); - - // We move the resource fork later - - /* - * Note that because all we're doing is moving the extents around, - * we can probably do this in a single transaction: Each extent - * record (group of 8) is 64 bytes. A extent overflow B-Tree node - * is typically 4k. This means each node can hold roughly ~60 - * extent records == (480 extents). - * - * If a file was massively fragmented and had 20k extents, this - * means we'd roughly touch 20k/480 == 41 to 42 nodes, plus the - * index nodes, for half of the operation. (inserting or - * deleting). So if we're manipulating 80-100 nodes, this is - * basically 320k of data to write to the journal in a bad case. - */ - if (data_overflow_extents) { - if ((error = MoveData(hfsmp, from_cp->c_cnid, to_cp->c_cnid, 0))) - goto exit; - } - - if (from_rfork && overflow_extents(from_rfork)) { - if ((error = MoveData(hfsmp, from_cp->c_cnid, to_cp->c_cnid, 1))) - goto exit; - } - - // Touch times - from_cp->c_touch_acctime = TRUE; - from_cp->c_touch_chgtime = TRUE; - from_cp->c_touch_modtime = TRUE; - hfs_touchtimes(hfsmp, from_cp); - - to_cp->c_touch_acctime = TRUE; - to_cp->c_touch_chgtime = TRUE; - to_cp->c_touch_modtime = TRUE; - hfs_touchtimes(hfsmp, to_cp); - - struct cat_fork dfork_buf; - const struct cat_fork *dfork, *rfork; - - dfork = hfs_prepare_fork_for_update(to_cp->c_datafork, NULL, - &dfork_buf, hfsmp->blockSize); - rfork = hfs_prepare_fork_for_update(from_rfork, NULL, - &rfork_buf.ff_data, hfsmp->blockSize); - - // Update the catalog nodes, to_cp first - if ((error = cat_update(hfsmp, &to_cp->c_desc, &to_cp->c_attr, - dfork, rfork))) { - goto exit; - } - - CLR(to_cp->c_flag, C_MODIFIED | 
C_MINOR_MOD); - - // Update in-memory resource fork data here - if (from_rfork) { - // Update c_blocks - uint32_t moving = from_rfork->ff_blocks + from_rfork->ff_unallocblocks; - - from_cp->c_blocks -= moving; - to_cp->c_blocks += moving; - - // Update to_cp's resource data if it has it - filefork_t *to_rfork = to_cp->c_rsrcfork; - if (to_rfork) { - TAILQ_SWAP(&to_rfork->ff_invalidranges, - &from_rfork->ff_invalidranges, rl_entry, rl_link); - to_rfork->ff_data = from_rfork->ff_data; - - // Deal with ubc_setsize - hfs_rsrc_setsize(to_cp); - } - - // Wipe out the resource fork in from_cp - rl_init(&from_rfork->ff_invalidranges); - bzero(&from_rfork->ff_data, sizeof(from_rfork->ff_data)); - - // Deal with ubc_setsize - hfs_rsrc_setsize(from_cp); - } - - // Currently unnecessary, but might be useful in future... - dfork = hfs_prepare_fork_for_update(from_cp->c_datafork, NULL, &dfork_buf, - hfsmp->blockSize); - rfork = hfs_prepare_fork_for_update(from_rfork, NULL, &rfork_buf.ff_data, - hfsmp->blockSize); - - // Update from_cp - if ((error = cat_update(hfsmp, &from_cp->c_desc, &from_cp->c_attr, - dfork, rfork))) { - goto exit; - } - - CLR(from_cp->c_flag, C_MODIFIED | C_MINOR_MOD); - -exit: - if (lockflags) { - hfs_systemfile_unlock(hfsmp, lockflags); - hfs_end_transaction(hfsmp); - } - - if (error && error != EIO && return_EIO_on_error) { - printf("hfs_move_data: encountered error %d\n", error); - error = EIO; - } - - return error; -} - -/* - * Move all of the catalog and runtime data in srcfork to dstfork. - * - * This allows us to maintain the invalid ranges across the move data - * operation so we don't need to force all of the pending IO right - * now. In addition, we move all non overflow-extent extents into the - * destination here. - * - * The destination fork must be empty and should have been checked - * prior to calling this. 
- */ -static int hfs_move_fork(filefork_t *srcfork, cnode_t *src_cp, - filefork_t *dstfork, cnode_t *dst_cp) -{ - // Move the invalid ranges - TAILQ_SWAP(&dstfork->ff_invalidranges, &srcfork->ff_invalidranges, - rl_entry, rl_link); - rl_remove_all(&srcfork->ff_invalidranges); - - // Move the fork data (copy whole structure) - dstfork->ff_data = srcfork->ff_data; - bzero(&srcfork->ff_data, sizeof(srcfork->ff_data)); - - // Update c_blocks - src_cp->c_blocks -= dstfork->ff_blocks + dstfork->ff_unallocblocks; - dst_cp->c_blocks += dstfork->ff_blocks + dstfork->ff_unallocblocks; - - return 0; -} - - -#include - -struct hfs_fsync_panic_hook { - panic_hook_t hook; - struct cnode *cp; -}; - -static void hfs_fsync_panic_hook(panic_hook_t *hook_) -{ - struct hfs_fsync_panic_hook *hook = (struct hfs_fsync_panic_hook *)hook_; - extern int kdb_log(const char *fmt, ...); - - // Get the physical region just before cp - panic_phys_range_t range; - uint64_t phys; - - if (panic_phys_range_before(hook->cp, &phys, &range)) { - kdb_log("cp = %p, phys = %p, prev (%p: %p-%p)\n", - hook->cp, phys, range.type, range.phys_start, - range.phys_start + range.len); - } else - kdb_log("cp = %p, phys = %p, prev (!)\n", hook->cp, phys); - - panic_dump_mem((void *)(((vm_offset_t)hook->cp - 4096) & ~4095), 12288); - - kdb_log("\n"); -} - - -/* - * cnode must be locked - */ -int -hfs_fsync(struct vnode *vp, int waitfor, hfs_fsync_mode_t fsyncmode, struct proc *p) -{ - struct cnode *cp = VTOC(vp); - struct filefork *fp = NULL; - int retval = 0; - struct hfsmount *hfsmp = VTOHFS(vp); - struct timeval tv; - int waitdata; /* attributes necessary for data retrieval */ - int wait; /* all other attributes (e.g. atime, etc.) 
*/ - int lockflag; - int took_trunc_lock = 0; - int locked_buffers = 0; - int fsync_default = 1; - - /* - * Applications which only care about data integrity rather than full - * file integrity may opt out of (delay) expensive metadata update - * operations as a performance optimization. - */ - wait = (waitfor == MNT_WAIT); - waitdata = (waitfor == MNT_DWAIT) | wait; - - if (always_do_fullfsync) - fsyncmode = HFS_FSYNC_FULL; - if (fsyncmode != HFS_FSYNC) - fsync_default = 0; - - /* HFS directories don't have any data blocks. */ - if (vnode_isdir(vp)) - goto metasync; - fp = VTOF(vp); - - /* - * For system files flush the B-tree header and - * for regular files write out any clusters - */ - if (vnode_issystem(vp)) { - if (VTOF(vp)->fcbBTCBPtr != NULL) { - // XXXdbg - if (hfsmp->jnl == NULL) { - BTFlushPath(VTOF(vp)); - } - } - } else if (UBCINFOEXISTS(vp)) { - hfs_unlock(cp); - hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_DEFAULT); - took_trunc_lock = 1; - - struct hfs_fsync_panic_hook hook; - hook.cp = cp; - panic_hook(&hook.hook, hfs_fsync_panic_hook); - - if (fp->ff_unallocblocks != 0) { - hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); - - hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); - } - - panic_unhook(&hook.hook); - - /* Don't hold cnode lock when calling into cluster layer. */ - (void) cluster_push(vp, waitdata ? IO_SYNC : 0); - - hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); - } - /* - * When MNT_WAIT is requested and the zero fill timeout - * has expired then we must explicitly zero out any areas - * that are currently marked invalid (holes). - * - * Files with NODUMP can bypass zero filling here. 
- */ - if (fp && (((cp->c_flag & C_ALWAYS_ZEROFILL) && !TAILQ_EMPTY(&fp->ff_invalidranges)) || - ((wait || (cp->c_flag & C_ZFWANTSYNC)) && - ((cp->c_bsdflags & UF_NODUMP) == 0) && - UBCINFOEXISTS(vp) && (vnode_issystem(vp) ==0) && - cp->c_zftimeout != 0))) { - - microuptime(&tv); - if ((cp->c_flag & C_ALWAYS_ZEROFILL) == 0 && fsync_default && tv.tv_sec < (long)cp->c_zftimeout) { - /* Remember that a force sync was requested. */ - cp->c_flag |= C_ZFWANTSYNC; - goto datasync; - } - if (!TAILQ_EMPTY(&fp->ff_invalidranges)) { - if (!took_trunc_lock || (cp->c_truncatelockowner == HFS_SHARED_OWNER)) { - hfs_unlock(cp); - if (took_trunc_lock) { - hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); - } - hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); - hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); - took_trunc_lock = 1; - } - hfs_flush_invalid_ranges(vp); - hfs_unlock(cp); - (void) cluster_push(vp, waitdata ? IO_SYNC : 0); - hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); - } - } -datasync: - if (took_trunc_lock) { - hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); - took_trunc_lock = 0; - } - /* - * if we have a journal and if journal_active() returns != 0 then the - * we shouldn't do anything to a locked block (because it is part - * of a transaction). otherwise we'll just go through the normal - * code path and flush the buffer. note journal_active() can return - * -1 if the journal is invalid -- however we still need to skip any - * locked blocks as they get cleaned up when we finish the transaction - * or close the journal. - */ - // if (hfsmp->jnl && journal_active(hfsmp->jnl) >= 0) - if (hfsmp->jnl) - lockflag = BUF_SKIP_LOCKED; - else - lockflag = 0; - - /* - * Flush all dirty buffers associated with a vnode. - * Record how many of them were dirty AND locked (if necessary). 
- */ - locked_buffers = buf_flushdirtyblks_skipinfo(vp, waitdata, lockflag, "hfs_fsync"); - if ((lockflag & BUF_SKIP_LOCKED) && (locked_buffers) && (vnode_vtype(vp) == VLNK)) { - /* - * If there are dirty symlink buffers, then we may need to take action - * to prevent issues later on if we are journaled. If we're fsyncing a - * symlink vnode then we are in one of three cases: - * - * 1) automatic sync has fired. In this case, we don't want the behavior to change. - * - * 2) Someone has opened the FD for the symlink (not what it points to) - * and has issued an fsync against it. This should be rare, and we don't - * want the behavior to change. - * - * 3) We are being called by a vclean which is trying to reclaim this - * symlink vnode. If this is the case, then allowing this fsync to - * proceed WITHOUT flushing the journal could result in the vclean - * invalidating the buffer's blocks before the journal transaction is - * written to disk. To prevent this, we force a journal flush - * if the vnode is in the middle of a recycle (VL_TERMINATE or VL_DEAD is set). - */ - if (vnode_isrecycled(vp)) { - fsync_default = 0; - } - } - -metasync: - if (vnode_isreg(vp) && vnode_issystem(vp)) { - if (VTOF(vp)->fcbBTCBPtr != NULL) { - microuptime(&tv); - BTSetLastSync(VTOF(vp), tv.tv_sec); - } - cp->c_touch_acctime = FALSE; - cp->c_touch_chgtime = FALSE; - cp->c_touch_modtime = FALSE; - } else if ( !(vp->v_flag & VSWAP) ) /* User file */ { - retval = hfs_update(vp, HFS_UPDATE_FORCE); - - /* - * When MNT_WAIT is requested push out the catalog record for - * this file. If they asked for a full fsync, we can skip this - * because the journal_flush or hfs_metasync_all will push out - * all of the metadata changes. - */ - if ((retval == 0) && wait && fsync_default && cp->c_hint && - !ISSET(cp->c_flag, C_DELETED | C_NOEXISTS)) { - hfs_metasync(VTOHFS(vp), (daddr64_t)cp->c_hint, p); - } - - /* - * If this was a full fsync, make sure all metadata - * changes get to stable storage. 
- */ - if (!fsync_default) { - if (hfsmp->jnl) { - if (fsyncmode == HFS_FSYNC_FULL) - hfs_flush(hfsmp, HFS_FLUSH_FULL); - else - hfs_flush(hfsmp, - HFS_FLUSH_JOURNAL_BARRIER); - } else { - retval = hfs_metasync_all(hfsmp); - /* XXX need to pass context! */ - hfs_flush(hfsmp, HFS_FLUSH_CACHE); - } - } - } - - if (!hfs_is_dirty(cp) && !ISSET(cp->c_flag, C_DELETED)) - vnode_cleardirty(vp); - - return (retval); -} - - -/* Sync an hfs catalog b-tree node */ -int -hfs_metasync(struct hfsmount *hfsmp, daddr64_t node, __unused struct proc *p) -{ - vnode_t vp; - buf_t bp; - int lockflags; - - vp = HFSTOVCB(hfsmp)->catalogRefNum; - - // XXXdbg - don't need to do this on a journaled volume - if (hfsmp->jnl) { - return 0; - } - - lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK); - /* - * Look for a matching node that has been delayed - * but is not part of a set (B_LOCKED). - * - * BLK_ONLYVALID causes buf_getblk to return a - * buf_t for the daddr64_t specified only if it's - * currently resident in the cache... the size - * parameter to buf_getblk is ignored when this flag - * is set - */ - bp = buf_getblk(vp, node, 0, 0, 0, BLK_META | BLK_ONLYVALID); - - if (bp) { - if ((buf_flags(bp) & (B_LOCKED | B_DELWRI)) == B_DELWRI) - (void) VNOP_BWRITE(bp); - else - buf_brelse(bp); - } - - hfs_systemfile_unlock(hfsmp, lockflags); - - return (0); -} - - -/* - * Sync all hfs B-trees. Use this instead of journal_flush for a volume - * without a journal. Note that the volume bitmap does not get written; - * we rely on fsck_hfs to fix that up (which it can do without any loss - * of data). 
- */ -int -hfs_metasync_all(struct hfsmount *hfsmp) -{ - int lockflags; - - /* Lock all of the B-trees so we get a mutually consistent state */ - lockflags = hfs_systemfile_lock(hfsmp, - SFL_CATALOG|SFL_EXTENTS|SFL_ATTRIBUTE, HFS_EXCLUSIVE_LOCK); - - /* Sync each of the B-trees */ - if (hfsmp->hfs_catalog_vp) - hfs_btsync(hfsmp->hfs_catalog_vp, 0); - if (hfsmp->hfs_extents_vp) - hfs_btsync(hfsmp->hfs_extents_vp, 0); - if (hfsmp->hfs_attribute_vp) - hfs_btsync(hfsmp->hfs_attribute_vp, 0); - - /* Wait for all of the writes to complete */ - if (hfsmp->hfs_catalog_vp) - vnode_waitforwrites(hfsmp->hfs_catalog_vp, 0, 0, 0, "hfs_metasync_all"); - if (hfsmp->hfs_extents_vp) - vnode_waitforwrites(hfsmp->hfs_extents_vp, 0, 0, 0, "hfs_metasync_all"); - if (hfsmp->hfs_attribute_vp) - vnode_waitforwrites(hfsmp->hfs_attribute_vp, 0, 0, 0, "hfs_metasync_all"); - - hfs_systemfile_unlock(hfsmp, lockflags); - - return 0; -} - - -/*ARGSUSED 1*/ -static int -hfs_btsync_callback(struct buf *bp, __unused void *dummy) -{ - buf_clearflags(bp, B_LOCKED); - (void) buf_bawrite(bp); - - return(BUF_CLAIMED); -} - - -int -hfs_btsync(struct vnode *vp, int sync_transaction) -{ - struct cnode *cp = VTOC(vp); - struct timeval tv; - int flags = 0; - - if (sync_transaction) - flags |= BUF_SKIP_NONLOCKED; - /* - * Flush all dirty buffers associated with b-tree. - */ - buf_iterate(vp, hfs_btsync_callback, flags, 0); - - microuptime(&tv); - if (vnode_issystem(vp) && (VTOF(vp)->fcbBTCBPtr != NULL)) - (void) BTSetLastSync(VTOF(vp), tv.tv_sec); - cp->c_touch_acctime = FALSE; - cp->c_touch_chgtime = FALSE; - cp->c_touch_modtime = FALSE; - - return 0; -} - -/* - * Remove a directory. 
- */ -int -hfs_vnop_rmdir(ap) - struct vnop_rmdir_args /* { - struct vnode *a_dvp; - struct vnode *a_vp; - struct componentname *a_cnp; - vfs_context_t a_context; - } */ *ap; -{ - struct vnode *dvp = ap->a_dvp; - struct vnode *vp = ap->a_vp; - struct cnode *dcp = VTOC(dvp); - struct cnode *cp = VTOC(vp); - int error; - time_t orig_ctime; - - orig_ctime = VTOC(vp)->c_ctime; - - if (!S_ISDIR(cp->c_mode)) { - return (ENOTDIR); - } - if (dvp == vp) { - return (EINVAL); - } - - check_for_tracked_file(vp, orig_ctime, NAMESPACE_HANDLER_DELETE_OP, NULL); - cp = VTOC(vp); - - if ((error = hfs_lockpair(dcp, cp, HFS_EXCLUSIVE_LOCK))) { - return (error); - } - - /* Check for a race with rmdir on the parent directory */ - if (dcp->c_flag & (C_DELETED | C_NOEXISTS)) { - hfs_unlockpair (dcp, cp); - return ENOENT; - } - - // - // if the item is tracked but doesn't have a document_id, assign one and generate an fsevent for it - // - if ((cp->c_bsdflags & UF_TRACKED) && ((struct FndrExtendedDirInfo *)((char *)&cp->c_attr.ca_finderinfo + 16))->document_id == 0) { - uint32_t newid; - - hfs_unlockpair(dcp, cp); - - if (hfs_generate_document_id(VTOHFS(vp), &newid) == 0) { - hfs_lockpair(dcp, cp, HFS_EXCLUSIVE_LOCK); - ((struct FndrExtendedDirInfo *)((char *)&cp->c_attr.ca_finderinfo + 16))->document_id = newid; -#if CONFIG_FSE - add_fsevent(FSE_DOCID_CHANGED, vfs_context_current(), - FSE_ARG_DEV, VTOHFS(vp)->hfs_raw_dev, - FSE_ARG_INO, (ino64_t)0, // src inode # - FSE_ARG_INO, (ino64_t)cp->c_fileid, // dst inode # - FSE_ARG_INT32, newid, - FSE_ARG_DONE); -#endif - } else { - // XXXdbg - couldn't get a new docid... what to do? can't really fail the rm... 
- hfs_lockpair(dcp, cp, HFS_EXCLUSIVE_LOCK); - } - } - - error = hfs_removedir(dvp, vp, ap->a_cnp, 0, 0); - - hfs_unlockpair(dcp, cp); - - return (error); -} - -/* - * Remove a directory - * - * Both dvp and vp cnodes are locked - */ -int -hfs_removedir(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, - int skip_reserve, int only_unlink) -{ - struct cnode *cp; - struct cnode *dcp; - struct hfsmount * hfsmp; - struct cat_desc desc; - int lockflags; - int error = 0, started_tr = 0; - - cp = VTOC(vp); - dcp = VTOC(dvp); - hfsmp = VTOHFS(vp); - - if (dcp == cp) { - return (EINVAL); /* cannot remove "." */ - } - if (cp->c_flag & (C_NOEXISTS | C_DELETED)) { - return (0); - } - if (cp->c_entries != 0) { - return (ENOTEMPTY); - } - - /* - * If the directory is open or in use (e.g. opendir() or current working - * directory for some process); wait for inactive/reclaim to actually - * remove cnode from the catalog. Both inactive and reclaim codepaths are capable - * of removing open-unlinked directories from the catalog, as well as getting rid - * of EAs still on the element. So change only_unlink to true, so that it will get - * cleaned up below. - * - * Otherwise, we can get into a weird old mess where the directory has C_DELETED, - * but it really means C_NOEXISTS because the item was actually removed from the - * catalog. Then when we try to remove the entry from the catalog later on, it won't - * really be there anymore. - */ - if (vnode_isinuse(vp, 0)) { - only_unlink = 1; - } - - /* Deal with directory hardlinks */ - if (cp->c_flag & C_HARDLINK) { - /* - * Note that if we have a directory which was a hardlink at any point, - * its actual directory data is stored in the directory inode in the hidden - * directory rather than the leaf element(s) present in the namespace. - * - * If there are still other hardlinks to this directory, - * then we'll just eliminate this particular link and the vnode will still exist. 
- * If this is the last link to an empty directory, then we'll open-unlink the - * directory and it will be only tagged with C_DELETED (as opposed to C_NOEXISTS). - * - * We could also return EBUSY here. - */ - - return hfs_unlink(hfsmp, dvp, vp, cnp, skip_reserve); - } - - /* - * In a few cases, we may want to allow the directory to persist in an - * open-unlinked state. If the directory is being open-unlinked (still has usecount - * references), or if it has EAs, or if it was being deleted as part of a rename, - * then we go ahead and move it to the hidden directory. - * - * If the directory is being open-unlinked, then we want to keep the catalog entry - * alive so that future EA calls and fchmod/fstat etc. do not cause issues later. - * - * If the directory had EAs, then we want to use the open-unlink trick so that the - * EA removal is not done in one giant transaction. Otherwise, it could cause a panic - * due to overflowing the journal. - * - * Finally, if it was deleted as part of a rename, we move it to the hidden directory - * in order to maintain rename atomicity. - * - * Note that the allow_dirs argument to hfs_removefile specifies that it is - * supposed to handle directories for this case. - */ - - if (((hfsmp->hfs_attribute_vp != NULL) && - ((cp->c_attr.ca_recflags & kHFSHasAttributesMask) != 0)) || - (only_unlink != 0)) { - - int ret = hfs_removefile(dvp, vp, cnp, 0, 0, 1, NULL, only_unlink); - /* - * Even though hfs_vnop_rename calls vnode_recycle for us on tvp we call - * it here just in case we were invoked by rmdir() on a directory that had - * EAs. To ensure that we start reclaiming the space as soon as possible, - * we call vnode_recycle on the directory. 
- */ - vnode_recycle(vp); - - return ret; - - } - - dcp->c_flag |= C_DIR_MODIFICATION; - -#if QUOTA - if (hfsmp->hfs_flags & HFS_QUOTAS) - (void)hfs_getinoquota(cp); -#endif - if ((error = hfs_start_transaction(hfsmp)) != 0) { - goto out; - } - started_tr = 1; - - /* - * Verify the directory is empty (and valid). - * (Rmdir ".." won't be valid since - * ".." will contain a reference to - * the current directory and thus be - * non-empty.) - */ - if ((dcp->c_bsdflags & APPEND) || (cp->c_bsdflags & (IMMUTABLE | APPEND))) { - error = EPERM; - goto out; - } - - /* Remove the entry from the namei cache: */ - cache_purge(vp); - - /* - * Protect against a race with rename by using the component - * name passed in and parent id from dvp (instead of using - * the cp->c_desc which may have changed). - */ - desc.cd_nameptr = (const u_int8_t *)cnp->cn_nameptr; - desc.cd_namelen = cnp->cn_namelen; - desc.cd_parentcnid = dcp->c_fileid; - desc.cd_cnid = cp->c_cnid; - desc.cd_flags = CD_ISDIR; - desc.cd_encoding = cp->c_encoding; - desc.cd_hint = 0; - - if (!hfs_valid_cnode(hfsmp, dvp, cnp, cp->c_fileid, NULL, &error)) { - error = 0; - goto out; - } - - /* Remove entry from catalog */ - lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG | SFL_ATTRIBUTE | SFL_BITMAP, HFS_EXCLUSIVE_LOCK); - - if (!skip_reserve) { - /* - * Reserve some space in the Catalog file. - */ - if ((error = cat_preflight(hfsmp, CAT_DELETE, NULL, 0))) { - hfs_systemfile_unlock(hfsmp, lockflags); - goto out; - } - } - - error = cat_delete(hfsmp, &desc, &cp->c_attr); - - if (!error) { - // - // if skip_reserve == 1 then we're being called from hfs_vnop_rename() and thus - // we don't need to touch the document_id as it's handled by the rename code. - // otherwise it's a normal remove and we need to save the document id in the - // per thread struct and clear it from the cnode. 
- // - struct doc_tombstone *ut; - ut = get_uthread_doc_tombstone(); - if (!skip_reserve && (cp->c_bsdflags & UF_TRACKED) && should_save_docid_tombstone(ut, vp, cnp)) { - - if (ut->t_lastop_document_id) { - clear_tombstone_docid(ut, hfsmp, NULL); - } - save_tombstone(hfsmp, dvp, vp, cnp, 1); - - } - - /* The parent lost a child */ - if (dcp->c_entries > 0) - dcp->c_entries--; - DEC_FOLDERCOUNT(hfsmp, dcp->c_attr); - dcp->c_dirchangecnt++; - hfs_incr_gencount(dcp); - - dcp->c_touch_chgtime = TRUE; - dcp->c_touch_modtime = TRUE; - dcp->c_flag |= C_MODIFIED; - - hfs_update(dcp->c_vp, 0); - } - - hfs_systemfile_unlock(hfsmp, lockflags); - - if (error) - goto out; - -#if QUOTA - if (hfsmp->hfs_flags & HFS_QUOTAS) - (void)hfs_chkiq(cp, -1, NOCRED, 0); -#endif /* QUOTA */ - - hfs_volupdate(hfsmp, VOL_RMDIR, (dcp->c_cnid == kHFSRootFolderID)); - - /* Mark C_NOEXISTS since the catalog entry is now gone */ - cp->c_flag |= C_NOEXISTS; - -out: - dcp->c_flag &= ~C_DIR_MODIFICATION; - wakeup((caddr_t)&dcp->c_flag); - - if (started_tr) { - hfs_end_transaction(hfsmp); - } - - return (error); -} - - -/* - * Remove a file or link. - */ -int -hfs_vnop_remove(ap) - struct vnop_remove_args /* { - struct vnode *a_dvp; - struct vnode *a_vp; - struct componentname *a_cnp; - int a_flags; - vfs_context_t a_context; - } */ *ap; -{ - struct vnode *dvp = ap->a_dvp; - struct vnode *vp = ap->a_vp; - struct cnode *dcp = VTOC(dvp); - struct cnode *cp; - struct vnode *rvp = NULL; - int error=0, recycle_rsrc=0; - int recycle_vnode = 0; - uint32_t rsrc_vid = 0; - time_t orig_ctime; - - if (dvp == vp) { - return (EINVAL); - } - - orig_ctime = VTOC(vp)->c_ctime; - if (!vnode_isnamedstream(vp) && ((ap->a_flags & VNODE_REMOVE_SKIP_NAMESPACE_EVENT) == 0)) { - error = check_for_tracked_file(vp, orig_ctime, NAMESPACE_HANDLER_DELETE_OP, NULL); - if (error) { - // XXXdbg - decide on a policy for handling namespace handler failures! - // for now we just let them proceed. 
- } - } - error = 0; - - cp = VTOC(vp); - -relock: - - hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); - - if ((error = hfs_lockpair(dcp, cp, HFS_EXCLUSIVE_LOCK))) { - hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); - if (rvp) { - vnode_put (rvp); - } - return (error); - } - // - // if the item is tracked but doesn't have a document_id, assign one and generate an fsevent for it - // - if ((cp->c_bsdflags & UF_TRACKED) && ((struct FndrExtendedDirInfo *)((char *)&cp->c_attr.ca_finderinfo + 16))->document_id == 0) { - uint32_t newid; - - hfs_unlockpair(dcp, cp); - - if (hfs_generate_document_id(VTOHFS(vp), &newid) == 0) { - hfs_lockpair(dcp, cp, HFS_EXCLUSIVE_LOCK); - ((struct FndrExtendedDirInfo *)((char *)&cp->c_attr.ca_finderinfo + 16))->document_id = newid; -#if CONFIG_FSE - add_fsevent(FSE_DOCID_CHANGED, vfs_context_current(), - FSE_ARG_DEV, VTOHFS(vp)->hfs_raw_dev, - FSE_ARG_INO, (ino64_t)0, // src inode # - FSE_ARG_INO, (ino64_t)cp->c_fileid, // dst inode # - FSE_ARG_INT32, newid, - FSE_ARG_DONE); -#endif - } else { - // XXXdbg - couldn't get a new docid... what to do? can't really fail the rm... - hfs_lockpair(dcp, cp, HFS_EXCLUSIVE_LOCK); - } - } - - /* - * Lazily respond to determining if there is a valid resource fork - * vnode attached to 'cp' if it is a regular file or symlink. - * If the vnode does not exist, then we may proceed without having to - * create it. - * - * If, however, it does exist, then we need to acquire an iocount on the - * vnode after acquiring its vid. This ensures that if we have to do I/O - * against it, it can't get recycled from underneath us in the middle - * of this call. - * - * Note: this function may be invoked for directory hardlinks, so just skip these - * steps if 'vp' is a directory. 
- */ - - if ((vp->v_type == VLNK) || (vp->v_type == VREG)) { - if ((cp->c_rsrc_vp) && (rvp == NULL)) { - /* We need to acquire the rsrc vnode */ - rvp = cp->c_rsrc_vp; - rsrc_vid = vnode_vid (rvp); - - /* Unlock everything to acquire iocount on the rsrc vnode */ - hfs_unlock_truncate (cp, HFS_LOCK_DEFAULT); - hfs_unlockpair (dcp, cp); - /* Use the vid to maintain identity on rvp */ - if (vnode_getwithvid(rvp, rsrc_vid)) { - /* - * If this fails, then it was recycled or - * reclaimed in the interim. Reset fields and - * start over. - */ - rvp = NULL; - rsrc_vid = 0; - } - goto relock; - } - } - - /* - * Check to see if we raced rmdir for the parent directory - * hfs_removefile already checks for a race on vp/cp - */ - if (dcp->c_flag & (C_DELETED | C_NOEXISTS)) { - error = ENOENT; - goto rm_done; - } - - error = hfs_removefile(dvp, vp, ap->a_cnp, ap->a_flags, 0, 0, NULL, 0); - - /* - * If the remove succeeded in deleting the file, then we may need to mark - * the resource fork for recycle so that it is reclaimed as quickly - * as possible. If it were not recycled quickly, then this resource fork - * vnode could keep a v_parent reference on the data fork, which prevents it - * from going through reclaim (by giving it extra usecounts), except in the force- - * unmount case. - * - * However, a caveat: we need to continue to supply resource fork - * access to open-unlinked files even if the resource fork is not open. This is - * a requirement for the compressed files work. Luckily, hfs_vgetrsrc will handle - * this already if the data fork has been re-parented to the hidden directory. - * - * As a result, all we really need to do here is mark the resource fork vnode - * for recycle. If it goes out of core, it can be brought in again if needed. - * If the cnode was instead marked C_NOEXISTS, then there wouldn't be any - * more work. 
- */ - if (error == 0) { - hfs_hotfile_deleted(vp); - - if (rvp) { - recycle_rsrc = 1; - } - /* - * If the target was actually removed from the catalog schedule it for - * full reclamation/inactivation. We hold an iocount on it so it should just - * get marked with MARKTERM - */ - if (cp->c_flag & C_NOEXISTS) { - recycle_vnode = 1; - } - } - - - /* - * Drop the truncate lock before unlocking the cnode - * (which can potentially perform a vnode_put and - * recycle the vnode which in turn might require the - * truncate lock) - */ -rm_done: - hfs_unlockpair(dcp, cp); - hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); - - if (recycle_rsrc) { - /* inactive or reclaim on rvp will clean up the blocks from the rsrc fork */ - vnode_recycle(rvp); - } - if (recycle_vnode) { - vnode_recycle (vp); - } - - if (rvp) { - /* drop iocount on rsrc fork, was obtained at beginning of fxn */ - vnode_put(rvp); - } - - return (error); -} - - -int -hfs_removefile_callback(struct buf *bp, void *hfsmp) { - - if ( !(buf_flags(bp) & B_META)) - panic("hfs: symlink bp @ %p is not marked meta-data!\n", bp); - /* - * it's part of the current transaction, kill it. - */ - journal_kill_block(((struct hfsmount *)hfsmp)->jnl, bp); - - return (BUF_CLAIMED); -} - -/* - * hfs_removefile - * - * Similar to hfs_vnop_remove except there are additional options. - * This function may be used to remove directories if they have - * lots of EA's -- note the 'allow_dirs' argument. - * - * This function is able to delete blocks & fork data for the resource - * fork even if it does not exist in core (and have a backing vnode). - * It should infer the correct behavior based on the number of blocks - * in the cnode and whether or not the resource fork pointer exists or - * not. As a result, one only need pass in the 'vp' corresponding to the - * data fork of this file (or main vnode in the case of a directory). - * Passing in a resource fork will result in an error. 
- * - * Because we do not create any vnodes in this function, we are not at - * risk of deadlocking against ourselves by double-locking. - * - * Requires cnode and truncate locks to be held. - */ -int -hfs_removefile(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, - int flags, int skip_reserve, int allow_dirs, - __unused struct vnode *rvp, int only_unlink) -{ - struct cnode *cp; - struct cnode *dcp; - struct vnode *rsrc_vp = NULL; - struct hfsmount *hfsmp; - struct cat_desc desc; - struct timeval tv; - int dataforkbusy = 0; - int rsrcforkbusy = 0; - int lockflags; - int error = 0; - int started_tr = 0; - int isbigfile = 0, defer_remove=0, isdir=0; - int update_vh = 0; - - cp = VTOC(vp); - dcp = VTOC(dvp); - hfsmp = VTOHFS(vp); - - /* Check if we lost a race post lookup. */ - if (cp->c_flag & (C_NOEXISTS | C_DELETED)) { - return (0); - } - - if (!hfs_valid_cnode(hfsmp, dvp, cnp, cp->c_fileid, NULL, &error)) { - return 0; - } - - /* Make sure a remove is permitted */ - if (VNODE_IS_RSRC(vp)) { - return (EPERM); - } - else { - /* - * We know it's a data fork. - * Probe the cnode to see if we have a valid resource fork - * in hand or not. - */ - rsrc_vp = cp->c_rsrc_vp; - } - - /* Don't allow deleting the journal or journal_info_block. */ - if (hfs_is_journal_file(hfsmp, cp)) { - return (EPERM); - } - - /* - * Hard links require special handling. - */ - if (cp->c_flag & C_HARDLINK) { - if ((flags & VNODE_REMOVE_NODELETEBUSY) && vnode_isinuse(vp, 0)) { - return (EBUSY); - } else { - /* A directory hard link with a link count of one is - * treated as a regular directory. Therefore it should - * only be removed using rmdir(). - */ - if ((vnode_isdir(vp) == 1) && (cp->c_linkcount == 1) && - (allow_dirs == 0)) { - return (EPERM); - } - return hfs_unlink(hfsmp, dvp, vp, cnp, skip_reserve); - } - } - - /* Directories should call hfs_rmdir! 
(unless they have a lot of attributes) */ - if (vnode_isdir(vp)) { - if (allow_dirs == 0) - return (EPERM); /* POSIX */ - isdir = 1; - } - /* Sanity check the parent ids. */ - if ((cp->c_parentcnid != hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid) && - (cp->c_parentcnid != dcp->c_fileid)) { - return (EINVAL); - } - - dcp->c_flag |= C_DIR_MODIFICATION; - - // this guy is going away so mark him as such - cp->c_flag |= C_DELETED; - - - /* Remove our entry from the namei cache. */ - cache_purge(vp); - - /* - * If the caller was operating on a file (as opposed to a - * directory with EAs), then we need to figure out - * whether or not it has a valid resource fork vnode. - * - * If there was a valid resource fork vnode, then we need - * to use hfs_truncate to eliminate its data. If there is - * no vnode, then we hold the cnode lock which would - * prevent it from being created. As a result, - * we can use the data deletion functions which do not - * require that a cnode/vnode pair exist. - */ - - /* Check if this file is being used. */ - if (isdir == 0) { - dataforkbusy = vnode_isinuse(vp, 0); - /* - * At this point, we know that 'vp' points to the - * a data fork because we checked it up front. And if - * there is no rsrc fork, rsrc_vp will be NULL. - */ - if (rsrc_vp && (cp->c_blocks - VTOF(vp)->ff_blocks)) { - rsrcforkbusy = vnode_isinuse(rsrc_vp, 0); - } - } - - /* Check if we have to break the deletion into multiple pieces. */ - if (isdir == 0) - isbigfile = cp->c_datafork->ff_size >= HFS_BIGFILE_SIZE; - - /* Check if the file has xattrs. If it does we'll have to delete them in - individual transactions in case there are too many */ - if ((hfsmp->hfs_attribute_vp != NULL) && - (cp->c_attr.ca_recflags & kHFSHasAttributesMask) != 0) { - defer_remove = 1; - } - - /* If we are explicitly told to only unlink item and move to hidden dir, then do it */ - if (only_unlink) { - defer_remove = 1; - } - - /* - * Carbon semantics prohibit deleting busy files. 
- * (enforced when VNODE_REMOVE_NODELETEBUSY is requested) - */ - if (dataforkbusy || rsrcforkbusy) { - if ((flags & VNODE_REMOVE_NODELETEBUSY) || - (hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid == 0)) { - error = EBUSY; - goto out; - } - } - -#if QUOTA - if (hfsmp->hfs_flags & HFS_QUOTAS) - (void)hfs_getinoquota(cp); -#endif /* QUOTA */ - - /* - * Do a ubc_setsize to indicate we need to wipe contents if: - * 1) item is a regular file. - * 2) Neither fork is busy AND we are not told to unlink this. - * - * We need to check for the defer_remove since it can be set without - * having a busy data or rsrc fork - */ - if (isdir == 0 && (!dataforkbusy || !rsrcforkbusy) && (defer_remove == 0)) { - /* - * A ubc_setsize can cause a pagein so defer it - * until after the cnode lock is dropped. The - * cnode lock cannot be dropped/reacquired here - * since we might already hold the journal lock. - */ - if (!dataforkbusy && cp->c_datafork->ff_blocks && !isbigfile) { - cp->c_flag |= C_NEED_DATA_SETSIZE; - } - if (!rsrcforkbusy && rsrc_vp) { - cp->c_flag |= C_NEED_RSRC_SETSIZE; - } - } - - if ((error = hfs_start_transaction(hfsmp)) != 0) { - goto out; - } - started_tr = 1; - - // XXXdbg - if we're journaled, kill any dirty symlink buffers - if (hfsmp->jnl && vnode_islnk(vp) && (defer_remove == 0)) { - buf_iterate(vp, hfs_removefile_callback, BUF_SKIP_NONLOCKED, (void *)hfsmp); - } - - /* - * Prepare to truncate any non-busy forks. Busy forks will - * get truncated when their vnode goes inactive. - * Note that we will only enter this region if we - * can avoid creating an open-unlinked file. If - * either region is busy, we will have to create an open - * unlinked file. - * - * Since we are deleting the file, we need to stagger the runtime - * modifications to do things in such a way that a crash won't - * result in us getting overlapped extents or any other - * bad inconsistencies. 
As such, we call prepare_release_storage - * which updates the UBC, updates quota information, and releases - * any loaned blocks that belong to this file. No actual - * truncation or bitmap manipulation is done until *AFTER* - * the catalog record is removed. - */ - if (isdir == 0 && (!dataforkbusy && !rsrcforkbusy) && (only_unlink == 0)) { - - if (!dataforkbusy && !isbigfile && cp->c_datafork->ff_blocks != 0) { - - error = hfs_prepare_release_storage (hfsmp, vp); - if (error) { - goto out; - } - update_vh = 1; - } - - /* - * If the resource fork vnode does not exist, we can skip this step. - */ - if (!rsrcforkbusy && rsrc_vp) { - error = hfs_prepare_release_storage (hfsmp, rsrc_vp); - if (error) { - goto out; - } - update_vh = 1; - } - } - - /* - * Protect against a race with rename by using the component - * name passed in and parent id from dvp (instead of using - * the cp->c_desc which may have changed). Also, be aware that - * because we allow directories to be passed in, we need to special case - * this temporary descriptor in case we were handed a directory. - */ - if (isdir) { - desc.cd_flags = CD_ISDIR; - } - else { - desc.cd_flags = 0; - } - desc.cd_encoding = cp->c_desc.cd_encoding; - desc.cd_nameptr = (const u_int8_t *)cnp->cn_nameptr; - desc.cd_namelen = cnp->cn_namelen; - desc.cd_parentcnid = dcp->c_fileid; - desc.cd_hint = cp->c_desc.cd_hint; - desc.cd_cnid = cp->c_cnid; - microtime(&tv); - - /* - * There are two cases to consider: - * 1. File/Dir is busy/big/defer_remove ==> move/rename the file/dir - * 2. File is not in use ==> remove the file - * - * We can get a directory in case 1 because it may have had lots of attributes, - * which need to get removed here. - */ - if (dataforkbusy || rsrcforkbusy || isbigfile || defer_remove) { - char delname[32]; - struct cat_desc to_desc; - struct cat_desc todir_desc; - - /* - * Orphan this file or directory (move to hidden directory). 
- * Again, we need to take care that we treat directories as directories, - * and files as files. Because directories with attributes can be passed in - * check to make sure that we have a directory or a file before filling in the - * temporary descriptor's flags. We keep orphaned directories AND files in - * the FILE_HARDLINKS private directory since we're generalizing over all - * orphaned filesystem objects. - */ - bzero(&todir_desc, sizeof(todir_desc)); - todir_desc.cd_parentcnid = 2; - - MAKE_DELETED_NAME(delname, sizeof(delname), cp->c_fileid); - bzero(&to_desc, sizeof(to_desc)); - to_desc.cd_nameptr = (const u_int8_t *)delname; - to_desc.cd_namelen = strlen(delname); - to_desc.cd_parentcnid = hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid; - if (isdir) { - to_desc.cd_flags = CD_ISDIR; - } - else { - to_desc.cd_flags = 0; - } - to_desc.cd_cnid = cp->c_cnid; - - lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK); - if (!skip_reserve) { - if ((error = cat_preflight(hfsmp, CAT_RENAME, NULL, 0))) { - hfs_systemfile_unlock(hfsmp, lockflags); - goto out; - } - } - - error = cat_rename(hfsmp, &desc, &todir_desc, - &to_desc, (struct cat_desc *)NULL); - - if (error == 0) { - hfsmp->hfs_private_attr[FILE_HARDLINKS].ca_entries++; - if (isdir == 1) { - INC_FOLDERCOUNT(hfsmp, hfsmp->hfs_private_attr[FILE_HARDLINKS]); - } - (void) cat_update(hfsmp, &hfsmp->hfs_private_desc[FILE_HARDLINKS], - &hfsmp->hfs_private_attr[FILE_HARDLINKS], NULL, NULL); - - /* Update the parent directory */ - if (dcp->c_entries > 0) - dcp->c_entries--; - if (isdir == 1) { - DEC_FOLDERCOUNT(hfsmp, dcp->c_attr); - } - dcp->c_dirchangecnt++; - hfs_incr_gencount(dcp); - - dcp->c_ctime = tv.tv_sec; - dcp->c_mtime = tv.tv_sec; - (void) cat_update(hfsmp, &dcp->c_desc, &dcp->c_attr, NULL, NULL); - - /* Update the file or directory's state */ - cp->c_flag |= C_DELETED; - cp->c_ctime = tv.tv_sec; - --cp->c_linkcount; - (void) cat_update(hfsmp, &to_desc, &cp->c_attr, NULL, NULL); - } - 
hfs_systemfile_unlock(hfsmp, lockflags); - if (error) - goto out; - - } - else { - /* - * Nobody is using this item; we can safely remove everything. - */ - struct filefork *temp_rsrc_fork = NULL; -#if QUOTA - off_t savedbytes; - int blksize = hfsmp->blockSize; -#endif - u_int32_t fileid = cp->c_fileid; - - /* - * Figure out if we need to read the resource fork data into - * core before wiping out the catalog record. - * - * 1) Must not be a directory - * 2) cnode's c_rsrcfork ptr must be NULL. - * 3) rsrc fork must have actual blocks - */ - if ((isdir == 0) && (cp->c_rsrcfork == NULL) && - (cp->c_blocks - VTOF(vp)->ff_blocks)) { - /* - * The resource fork vnode & filefork did not exist. - * Create a temporary one for use in this function only. - */ - MALLOC_ZONE (temp_rsrc_fork, struct filefork *, sizeof (struct filefork), M_HFSFORK, M_WAITOK); - bzero(temp_rsrc_fork, sizeof(struct filefork)); - temp_rsrc_fork->ff_cp = cp; - rl_init(&temp_rsrc_fork->ff_invalidranges); - } - - lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG | SFL_ATTRIBUTE | SFL_BITMAP, HFS_EXCLUSIVE_LOCK); - - /* Look up the resource fork first, if necessary */ - if (temp_rsrc_fork) { - error = cat_lookup (hfsmp, &desc, 1, 0, (struct cat_desc*) NULL, - (struct cat_attr*) NULL, &temp_rsrc_fork->ff_data, NULL); - if (error) { - FREE_ZONE (temp_rsrc_fork, sizeof(struct filefork), M_HFSFORK); - hfs_systemfile_unlock (hfsmp, lockflags); - goto out; - } - } - - if (!skip_reserve) { - if ((error = cat_preflight(hfsmp, CAT_DELETE, NULL, 0))) { - if (temp_rsrc_fork) { - FREE_ZONE (temp_rsrc_fork, sizeof(struct filefork), M_HFSFORK); - } - hfs_systemfile_unlock(hfsmp, lockflags); - goto out; - } - } - - error = cat_delete(hfsmp, &desc, &cp->c_attr); - - if (error && error != ENXIO && error != ENOENT) { - printf("hfs_removefile: deleting file %s (id=%d) vol=%s err=%d\n", - cp->c_desc.cd_nameptr, cp->c_attr.ca_fileid, hfsmp->vcbVN, error); - } - - if (error == 0) { - /* Update the parent directory */ - if 
(dcp->c_entries > 0) - dcp->c_entries--; - dcp->c_dirchangecnt++; - hfs_incr_gencount(dcp); - - dcp->c_ctime = tv.tv_sec; - dcp->c_mtime = tv.tv_sec; - (void) cat_update(hfsmp, &dcp->c_desc, &dcp->c_attr, NULL, NULL); - } - hfs_systemfile_unlock(hfsmp, lockflags); - - if (error) { - if (temp_rsrc_fork) { - FREE_ZONE (temp_rsrc_fork, sizeof(struct filefork), M_HFSFORK); - } - goto out; - } - - /* - * Now that we've wiped out the catalog record, the file effectively doesn't - * exist anymore. So update the quota records to reflect the loss of the - * data fork and the resource fork. - */ -#if QUOTA - if (cp->c_datafork->ff_blocks > 0) { - savedbytes = ((off_t)cp->c_datafork->ff_blocks * (off_t)blksize); - (void) hfs_chkdq(cp, (int64_t)-(savedbytes), NOCRED, 0); - } - - /* - * We may have just deleted the catalog record for a resource fork even - * though it did not exist in core as a vnode. However, just because there - * was a resource fork pointer in the cnode does not mean that it had any blocks. - */ - if (temp_rsrc_fork || cp->c_rsrcfork) { - if (cp->c_rsrcfork) { - if (cp->c_rsrcfork->ff_blocks > 0) { - savedbytes = ((off_t)cp->c_rsrcfork->ff_blocks * (off_t)blksize); - (void) hfs_chkdq(cp, (int64_t)-(savedbytes), NOCRED, 0); - } - } - else { - /* we must have used a temporary fork */ - savedbytes = ((off_t)temp_rsrc_fork->ff_blocks * (off_t)blksize); - (void) hfs_chkdq(cp, (int64_t)-(savedbytes), NOCRED, 0); - } - } - - if (hfsmp->hfs_flags & HFS_QUOTAS) { - (void)hfs_chkiq(cp, -1, NOCRED, 0); - } -#endif - - /* - * If we didn't get any errors deleting the catalog entry, then go ahead - * and release the backing store now. The filefork pointers are still valid. - */ - if (temp_rsrc_fork) { - error = hfs_release_storage (hfsmp, cp->c_datafork, temp_rsrc_fork, fileid); - } - else { - /* if cp->c_rsrcfork == NULL, hfs_release_storage will skip over it. 
*/ - error = hfs_release_storage (hfsmp, cp->c_datafork, cp->c_rsrcfork, fileid); - } - if (error) { - /* - * If we encountered an error updating the extents and bitmap, - * mark the volume inconsistent. At this point, the catalog record has - * already been deleted, so we can't recover it at this point. We need - * to proceed and update the volume header and mark the cnode C_NOEXISTS. - * The subsequent fsck should be able to recover the free space for us. - */ - hfs_mark_inconsistent(hfsmp, HFS_OP_INCOMPLETE); - } - else { - /* reset update_vh to 0, since hfs_release_storage should have done it for us */ - update_vh = 0; - } - - /* Get rid of the temporary rsrc fork */ - if (temp_rsrc_fork) { - FREE_ZONE (temp_rsrc_fork, sizeof(struct filefork), M_HFSFORK); - } - - cp->c_flag |= C_NOEXISTS; - cp->c_flag &= ~C_DELETED; - - cp->c_touch_chgtime = TRUE; - --cp->c_linkcount; - - /* - * We must never get a directory if we're in this else block. We could - * accidentally drop the number of files in the volume header if we did. - */ - hfs_volupdate(hfsmp, VOL_RMFILE, (dcp->c_cnid == kHFSRootFolderID)); - - } - - // - // if skip_reserve == 1 then we're being called from hfs_vnop_rename() and thus - // we don't need to touch the document_id as it's handled by the rename code. - // otherwise it's a normal remove and we need to save the document id in the - // per thread struct and clear it from the cnode. - // - struct doc_tombstone *ut; - ut = get_uthread_doc_tombstone(); - if (!error && !skip_reserve && (cp->c_bsdflags & UF_TRACKED) && should_save_docid_tombstone(ut, vp, cnp)) { - - if (ut->t_lastop_document_id) { - clear_tombstone_docid(ut, hfsmp, NULL); - } - save_tombstone(hfsmp, dvp, vp, cnp, 1); - - } - - - /* - * All done with this cnode's descriptor... - * - * Note: all future catalog calls for this cnode must be by - * fileid only. This is OK for HFS (which doesn't have file - * thread records) since HFS doesn't support the removal of - * busy files. 
- */ - cat_releasedesc(&cp->c_desc); - -out: - if (error) { - cp->c_flag &= ~C_DELETED; - } - - if (update_vh) { - /* - * If we bailed out earlier, we may need to update the volume header - * to deal with the borrowed blocks accounting. - */ - hfs_volupdate (hfsmp, VOL_UPDATE, 0); - } - - if (started_tr) { - hfs_end_transaction(hfsmp); - } - - dcp->c_flag &= ~C_DIR_MODIFICATION; - wakeup((caddr_t)&dcp->c_flag); - - return (error); -} - - -__private_extern__ void -replace_desc(struct cnode *cp, struct cat_desc *cdp) -{ - // fixes 4348457 and 4463138 - if (&cp->c_desc == cdp) { - return; - } - - /* First release allocated name buffer */ - if (cp->c_desc.cd_flags & CD_HASBUF && cp->c_desc.cd_nameptr != 0) { - const u_int8_t *name = cp->c_desc.cd_nameptr; - - cp->c_desc.cd_nameptr = 0; - cp->c_desc.cd_namelen = 0; - cp->c_desc.cd_flags &= ~CD_HASBUF; - vfs_removename((const char *)name); - } - bcopy(cdp, &cp->c_desc, sizeof(cp->c_desc)); - - /* Cnode now owns the name buffer */ - cdp->cd_nameptr = 0; - cdp->cd_namelen = 0; - cdp->cd_flags &= ~CD_HASBUF; -} - - -/* - * Rename a cnode. - * - * The VFS layer guarantees that: - * - source and destination will either both be directories, or - * both not be directories. - * - all the vnodes are from the same file system - * - * When the target is a directory, HFS must ensure that its empty. - * - * Note that this function requires up to 6 vnodes in order to work properly - * if it is operating on files (and not on directories). This is because only - * files can have resource forks, and we now require iocounts to be held on the - * vnodes corresponding to the resource forks (if applicable) as well as - * the files or directories undergoing rename. The problem with not holding - * iocounts on the resource fork vnodes is that it can lead to a deadlock - * situation: The rsrc fork of the source file may be recycled and reclaimed - * in order to provide a vnode for the destination file's rsrc fork. 
Since - * data and rsrc forks share the same cnode, we'd eventually try to lock the - * source file's cnode in order to sync its rsrc fork to disk, but it's already - * been locked. By taking the rsrc fork vnodes up front we ensure that they - * cannot be recycled, and that the situation mentioned above cannot happen. - */ -int -hfs_vnop_rename(ap) - struct vnop_rename_args /* { - struct vnode *a_fdvp; - struct vnode *a_fvp; - struct componentname *a_fcnp; - struct vnode *a_tdvp; - struct vnode *a_tvp; - struct componentname *a_tcnp; - vfs_context_t a_context; - } */ *ap; -{ - struct vnode *tvp = ap->a_tvp; - struct vnode *tdvp = ap->a_tdvp; - struct vnode *fvp = ap->a_fvp; - struct vnode *fdvp = ap->a_fdvp; - /* - * Note that we only need locals for the target/destination's - * resource fork vnode (and only if necessary). We don't care if the - * source has a resource fork vnode or not. - */ - struct vnode *tvp_rsrc = NULLVP; - uint32_t tvp_rsrc_vid = 0; - struct componentname *tcnp = ap->a_tcnp; - struct componentname *fcnp = ap->a_fcnp; - struct proc *p = vfs_context_proc(ap->a_context); - struct cnode *fcp; - struct cnode *fdcp; - struct cnode *tdcp; - struct cnode *tcp; - struct cnode *error_cnode; - struct cat_desc from_desc; - struct cat_desc to_desc; - struct cat_desc out_desc; - struct hfsmount *hfsmp; - cat_cookie_t cookie; - int tvp_deleted = 0; - int started_tr = 0, got_cookie = 0; - int took_trunc_lock = 0; - int lockflags; - int error; - time_t orig_from_ctime, orig_to_ctime; - int emit_rename = 1; - int emit_delete = 1; - int is_tracked = 0; - int unlocked; - - orig_from_ctime = VTOC(fvp)->c_ctime; - if (tvp && VTOC(tvp)) { - orig_to_ctime = VTOC(tvp)->c_ctime; - } else { - orig_to_ctime = ~0; - } - - hfsmp = VTOHFS(tdvp); - /* - * Do special case checks here. If fvp == tvp then we need to check the - * cnode with locks held. - */ - if (fvp == tvp) { - int is_hardlink = 0; - /* - * In this case, we do *NOT* ever emit a DELETE event. 
- * We may not necessarily emit a RENAME event - */ - emit_delete = 0; - if ((error = hfs_lock(VTOC(fvp), HFS_SHARED_LOCK, HFS_LOCK_DEFAULT))) { - return error; - } - /* Check to see if the item is a hardlink or not */ - is_hardlink = (VTOC(fvp)->c_flag & C_HARDLINK); - hfs_unlock (VTOC(fvp)); - - /* - * If the item is not a hardlink, then case sensitivity must be off, otherwise - * two names should not resolve to the same cnode unless they were case variants. - */ - if (is_hardlink) { - emit_rename = 0; - /* - * Hardlinks are a little trickier. We only want to emit a rename event - * if the item is a hardlink, the parent directories are the same, case sensitivity - * is off, and the case folded names are the same. See the fvp == tvp case below for more - * info. - */ - - if ((fdvp == tdvp) && ((hfsmp->hfs_flags & HFS_CASE_SENSITIVE) == 0)) { - if (hfs_namecmp((const u_int8_t *)fcnp->cn_nameptr, fcnp->cn_namelen, - (const u_int8_t *)tcnp->cn_nameptr, tcnp->cn_namelen) == 0) { - /* Then in this case only it is ok to emit a rename */ - emit_rename = 1; - } - } - } - } - if (emit_rename) { - /* c_bsdflags should only be assessed while holding the cnode lock. - * This is not done consistently throughout the code and can result - * in race. This will be fixed via rdar://12181064 - */ - if (VTOC(fvp)->c_bsdflags & UF_TRACKED) { - is_tracked = 1; - } - check_for_tracked_file(fvp, orig_from_ctime, NAMESPACE_HANDLER_RENAME_OP, NULL); - } - - if (tvp && VTOC(tvp)) { - if (emit_delete) { - check_for_tracked_file(tvp, orig_to_ctime, NAMESPACE_HANDLER_DELETE_OP, NULL); - } - } - -retry: - /* When tvp exists, take the truncate lock for hfs_removefile(). */ - if (tvp && (vnode_isreg(tvp) || vnode_islnk(tvp))) { - hfs_lock_truncate(VTOC(tvp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); - took_trunc_lock = 1; - } - -relock: - error = hfs_lockfour(VTOC(fdvp), VTOC(fvp), VTOC(tdvp), tvp ? 
VTOC(tvp) : NULL, - HFS_EXCLUSIVE_LOCK, &error_cnode); - if (error) { - if (took_trunc_lock) { - hfs_unlock_truncate(VTOC(tvp), HFS_LOCK_DEFAULT); - took_trunc_lock = 0; - } - - /* - * We hit an error path. If we were trying to re-acquire the locks - * after coming through here once, we might have already obtained - * an iocount on tvp's resource fork vnode. Drop that before dealing - * with the failure. Note this is safe -- since we are in an - * error handling path, we can't be holding the cnode locks. - */ - if (tvp_rsrc) { - vnode_put (tvp_rsrc); - tvp_rsrc_vid = 0; - tvp_rsrc = NULL; - } - - /* - * tvp might no longer exist. If the cause of the lock failure - * was tvp, then we can try again with tvp/tcp set to NULL. - * This is ok because the vfs syscall will vnode_put the vnodes - * after we return from hfs_vnop_rename. - */ - if ((error == ENOENT) && (tvp != NULL) && (error_cnode == VTOC(tvp))) { - tcp = NULL; - tvp = NULL; - goto retry; - } - - /* If we want to reintroduce notifications for failed renames, this - is the place to do it. */ - - return (error); - } - - fdcp = VTOC(fdvp); - fcp = VTOC(fvp); - tdcp = VTOC(tdvp); - tcp = tvp ? VTOC(tvp) : NULL; - - // - // if the item is tracked but doesn't have a document_id, assign one and generate an fsevent for it - // - unlocked = 0; - if ((fcp->c_bsdflags & UF_TRACKED) && ((struct FndrExtendedDirInfo *)((char *)&fcp->c_attr.ca_finderinfo + 16))->document_id == 0) { - uint32_t newid; - - hfs_unlockfour(VTOC(fdvp), VTOC(fvp), VTOC(tdvp), tvp ? 
VTOC(tvp) : NULL); - unlocked = 1; - - if (hfs_generate_document_id(hfsmp, &newid) == 0) { - hfs_lock(fcp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); - ((struct FndrExtendedDirInfo *)((char *)&fcp->c_attr.ca_finderinfo + 16))->document_id = newid; -#if CONFIG_FSE - add_fsevent(FSE_DOCID_CHANGED, vfs_context_current(), - FSE_ARG_DEV, hfsmp->hfs_raw_dev, - FSE_ARG_INO, (ino64_t)0, // src inode # - FSE_ARG_INO, (ino64_t)fcp->c_fileid, // dst inode # - FSE_ARG_INT32, newid, - FSE_ARG_DONE); -#endif - hfs_unlock(fcp); - } else { - // XXXdbg - couldn't get a new docid... what to do? can't really fail the rename... - } - - // - // check if we're going to need to fix tcp as well. if we aren't, go back relock - // everything. otherwise continue on and fix up tcp as well before relocking. - // - if (tcp == NULL || !(tcp->c_bsdflags & UF_TRACKED) || ((struct FndrExtendedDirInfo *)((char *)&tcp->c_attr.ca_finderinfo + 16))->document_id != 0) { - goto relock; - } - } - - // - // same thing for tcp if it's set - // - if (tcp && (tcp->c_bsdflags & UF_TRACKED) && ((struct FndrExtendedDirInfo *)((char *)&tcp->c_attr.ca_finderinfo + 16))->document_id == 0) { - uint32_t newid; - - if (!unlocked) { - hfs_unlockfour(VTOC(fdvp), VTOC(fvp), VTOC(tdvp), tvp ? VTOC(tvp) : NULL); - unlocked = 1; - } - - if (hfs_generate_document_id(hfsmp, &newid) == 0) { - hfs_lock(tcp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); - ((struct FndrExtendedDirInfo *)((char *)&tcp->c_attr.ca_finderinfo + 16))->document_id = newid; -#if CONFIG_FSE - add_fsevent(FSE_DOCID_CHANGED, vfs_context_current(), - FSE_ARG_DEV, hfsmp->hfs_raw_dev, - FSE_ARG_INO, (ino64_t)0, // src inode # - FSE_ARG_INO, (ino64_t)tcp->c_fileid, // dst inode # - FSE_ARG_INT32, newid, - FSE_ARG_DONE); -#endif - hfs_unlock(tcp); - } else { - // XXXdbg - couldn't get a new docid... what to do? can't really fail the rename... - } - - // go back up and relock everything. 
next time through the if statement won't be true - // and we'll skip over this block of code. - goto relock; - } - - - - /* - * Acquire iocounts on the destination's resource fork vnode - * if necessary. If dst/src are files and the dst has a resource - * fork vnode, then we need to try and acquire an iocount on the rsrc vnode. - * If it does not exist, then we don't care and can skip it. - */ - if ((vnode_isreg(fvp)) || (vnode_islnk(fvp))) { - if ((tvp) && (tcp->c_rsrc_vp) && (tvp_rsrc == NULL)) { - tvp_rsrc = tcp->c_rsrc_vp; - /* - * We can look at the vid here because we're holding the - * cnode lock on the underlying cnode for this rsrc vnode. - */ - tvp_rsrc_vid = vnode_vid (tvp_rsrc); - - /* Unlock everything to acquire iocount on this rsrc vnode */ - if (took_trunc_lock) { - hfs_unlock_truncate (VTOC(tvp), HFS_LOCK_DEFAULT); - took_trunc_lock = 0; - } - hfs_unlockfour(fdcp, fcp, tdcp, tcp); - - if (vnode_getwithvid (tvp_rsrc, tvp_rsrc_vid)) { - /* iocount acquisition failed. Reset fields and start over.. */ - tvp_rsrc_vid = 0; - tvp_rsrc = NULL; - } - goto retry; - } - } - - - - /* Ensure we didn't race src or dst parent directories with rmdir. */ - if (fdcp->c_flag & (C_NOEXISTS | C_DELETED)) { - error = ENOENT; - goto out; - } - - if (tdcp->c_flag & (C_NOEXISTS | C_DELETED)) { - error = ENOENT; - goto out; - } - - - /* Check for a race against unlink. The hfs_valid_cnode checks validate - * the parent/child relationship with fdcp and tdcp, as well as the - * component name of the target cnodes. - */ - if ((fcp->c_flag & (C_NOEXISTS | C_DELETED)) || !hfs_valid_cnode(hfsmp, fdvp, fcnp, fcp->c_fileid, NULL, &error)) { - error = ENOENT; - goto out; - } - - if (tcp && ((tcp->c_flag & (C_NOEXISTS | C_DELETED)) || !hfs_valid_cnode(hfsmp, tdvp, tcnp, tcp->c_fileid, NULL, &error))) { - // - // hmm, the destination vnode isn't valid any more. - // in this case we can just drop him and pretend he - // never existed in the first place. 
- // - if (took_trunc_lock) { - hfs_unlock_truncate(VTOC(tvp), HFS_LOCK_DEFAULT); - took_trunc_lock = 0; - } - error = 0; - - hfs_unlockfour(fdcp, fcp, tdcp, tcp); - - tcp = NULL; - tvp = NULL; - - // retry the locking with tvp null'ed out - goto retry; - } - - fdcp->c_flag |= C_DIR_MODIFICATION; - if (fdvp != tdvp) { - tdcp->c_flag |= C_DIR_MODIFICATION; - } - - /* - * Disallow renaming of a directory hard link if the source and - * destination parent directories are different, or a directory whose - * descendant is a directory hard link and the one of the ancestors - * of the destination directory is a directory hard link. - */ - if (vnode_isdir(fvp) && (fdvp != tdvp)) { - if (fcp->c_flag & C_HARDLINK) { - error = EPERM; - goto out; - } - if (fcp->c_attr.ca_recflags & kHFSHasChildLinkMask) { - lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); - if (cat_check_link_ancestry(hfsmp, tdcp->c_fileid, 0)) { - error = EPERM; - hfs_systemfile_unlock(hfsmp, lockflags); - goto out; - } - hfs_systemfile_unlock(hfsmp, lockflags); - } - } - - /* - * The following edge case is caught here: - * (to cannot be a descendent of from) - * - * o fdvp - * / - * / - * o fvp - * \ - * \ - * o tdvp - * / - * / - * o tvp - */ - if (tdcp->c_parentcnid == fcp->c_fileid) { - error = EINVAL; - goto out; - } - - /* - * The following two edge cases are caught here: - * (note tvp is not empty) - * - * o tdvp o tdvp - * / / - * / / - * o tvp tvp o fdvp - * \ \ - * \ \ - * o fdvp o fvp - * / - * / - * o fvp - */ - if (tvp && vnode_isdir(tvp) && (tcp->c_entries != 0) && fvp != tvp) { - error = ENOTEMPTY; - goto out; - } - - /* - * The following edge case is caught here: - * (the from child and parent are the same) - * - * o tdvp - * / - * / - * fdvp o fvp - */ - if (fdvp == fvp) { - error = EINVAL; - goto out; - } - - /* - * Make sure "from" vnode and its parent are changeable. 
- */ - if ((fcp->c_bsdflags & (IMMUTABLE | APPEND)) || (fdcp->c_bsdflags & APPEND)) { - error = EPERM; - goto out; - } - - /* - * If the destination parent directory is "sticky", then the - * user must own the parent directory, or the destination of - * the rename, otherwise the destination may not be changed - * (except by root). This implements append-only directories. - * - * Note that checks for immutable and write access are done - * by the call to hfs_removefile. - */ - if (tvp && (tdcp->c_mode & S_ISTXT) && - (suser(vfs_context_ucred(tcnp->cn_context), NULL)) && - (kauth_cred_getuid(vfs_context_ucred(tcnp->cn_context)) != tdcp->c_uid) && - (hfs_owner_rights(hfsmp, tcp->c_uid, vfs_context_ucred(tcnp->cn_context), p, false)) ) { - error = EPERM; - goto out; - } - - /* Don't allow modification of the journal or journal_info_block */ - if (hfs_is_journal_file(hfsmp, fcp) || - (tcp && hfs_is_journal_file(hfsmp, tcp))) { - error = EPERM; - goto out; - } - -#if QUOTA - if (tvp) - (void)hfs_getinoquota(tcp); -#endif - /* Preflighting done, take fvp out of the name space. */ - cache_purge(fvp); - -#if CONFIG_SECLUDED_RENAME - /* - * Check for "secure" rename that imposes additional restrictions on the - * source vnode. We wait until here to check in order to prevent a race - * with other threads that manage to look up fvp, but their open or link - * is blocked by our locks. At this point, with fvp out of the name cache, - * and holding the lock on fdvp, no other thread can find fvp. - * - * TODO: Do we need to limit these checks to regular files only? - */ - if (fcnp->cn_flags & CN_SECLUDE_RENAME) { - if (vnode_isdir(fvp)) { - error = EISDIR; - goto out; - } - - /* - * Neither fork of source may be open or memory mapped. - * We also don't want it in use by any other system call. - * The file must not have hard links. - * - * We can't simply use vnode_isinuse() because that does not - * count opens with O_EVTONLY. 
We don't want a malicious - * process using O_EVTONLY to subvert a secluded rename. - */ - if (fcp->c_linkcount != 1) { - error = EMLINK; - goto out; - } - - if (fcp->c_rsrc_vp && (fcp->c_rsrc_vp->v_usecount > 0 || - fcp->c_rsrc_vp->v_iocount > 0)) { - /* Resource fork is in use (including O_EVTONLY) */ - error = EBUSY; - goto out; - } - if (fcp->c_vp && (fcp->c_vp->v_usecount > (fcp->c_rsrc_vp ? 1 : 0) || - fcp->c_vp->v_iocount > 1)) { - /* - * Data fork is in use, including O_EVTONLY, but not - * including a reference from the resource fork. - */ - error = EBUSY; - goto out; - } - } -#endif - - bzero(&from_desc, sizeof(from_desc)); - from_desc.cd_nameptr = (const u_int8_t *)fcnp->cn_nameptr; - from_desc.cd_namelen = fcnp->cn_namelen; - from_desc.cd_parentcnid = fdcp->c_fileid; - from_desc.cd_flags = fcp->c_desc.cd_flags & ~(CD_HASBUF | CD_DECOMPOSED); - from_desc.cd_cnid = fcp->c_cnid; - - bzero(&to_desc, sizeof(to_desc)); - to_desc.cd_nameptr = (const u_int8_t *)tcnp->cn_nameptr; - to_desc.cd_namelen = tcnp->cn_namelen; - to_desc.cd_parentcnid = tdcp->c_fileid; - to_desc.cd_flags = fcp->c_desc.cd_flags & ~(CD_HASBUF | CD_DECOMPOSED); - to_desc.cd_cnid = fcp->c_cnid; - - if ((error = hfs_start_transaction(hfsmp)) != 0) { - goto out; - } - started_tr = 1; - - /* hfs_vnop_link() and hfs_vnop_rename() set kHFSHasChildLinkMask - * inside a journal transaction and without holding a cnode lock. - * As setting of this bit depends on being in journal transaction for - * concurrency, check this bit again after we start journal transaction for rename - * to ensure that this directory does not have any descendant that - * is a directory hard link. 
- */ - if (vnode_isdir(fvp) && (fdvp != tdvp)) { - if (fcp->c_attr.ca_recflags & kHFSHasChildLinkMask) { - lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); - if (cat_check_link_ancestry(hfsmp, tdcp->c_fileid, 0)) { - error = EPERM; - hfs_systemfile_unlock(hfsmp, lockflags); - goto out; - } - hfs_systemfile_unlock(hfsmp, lockflags); - } - } - - // if it's a hardlink then re-lookup the name so - // that we get the correct cnid in from_desc (see - // the comment in hfs_removefile for more details) - // - if (fcp->c_flag & C_HARDLINK) { - struct cat_desc tmpdesc; - cnid_t real_cnid; - - tmpdesc.cd_nameptr = (const u_int8_t *)fcnp->cn_nameptr; - tmpdesc.cd_namelen = fcnp->cn_namelen; - tmpdesc.cd_parentcnid = fdcp->c_fileid; - tmpdesc.cd_hint = fdcp->c_childhint; - tmpdesc.cd_flags = fcp->c_desc.cd_flags & CD_ISDIR; - tmpdesc.cd_encoding = 0; - - lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); - - if (cat_lookup(hfsmp, &tmpdesc, 0, 0, NULL, NULL, NULL, &real_cnid) != 0) { - hfs_systemfile_unlock(hfsmp, lockflags); - goto out; - } - - // use the real cnid instead of whatever happened to be there - from_desc.cd_cnid = real_cnid; - hfs_systemfile_unlock(hfsmp, lockflags); - } - - /* - * Reserve some space in the Catalog file. - */ - if ((error = cat_preflight(hfsmp, CAT_RENAME + CAT_DELETE, &cookie, p))) { - goto out; - } - got_cookie = 1; - - /* - * If the destination exists then it may need to be removed. - * - * Due to HFS's locking system, we should always move the - * existing 'tvp' element to the hidden directory in hfs_vnop_rename. - * Because the VNOP_LOOKUP call enters and exits the filesystem independently - * of the actual vnop that it was trying to do (stat, link, readlink), - * we must release the cnode lock of that element during the interim to - * do MAC checking, vnode authorization, and other calls. In that time, - * the item can be deleted (or renamed over). 
However, only in the rename - * case is it inappropriate to return ENOENT from any of those calls. Either - * the call should return information about the old element (stale), or get - * information about the newer element that we are about to write in its place. - * - * HFS lookup has been modified to detect a rename and re-drive its - * lookup internally. For other calls that have already succeeded in - * their lookup call and are waiting to acquire the cnode lock in order - * to proceed, that cnode lock will not fail due to the cnode being marked - * C_NOEXISTS, because it won't have been marked as such. It will only - * have C_DELETED. Thus, they will simply act on the stale open-unlinked - * element. All future callers will get the new element. - * - * To implement this behavior, we pass the "only_unlink" argument to - * hfs_removefile and hfs_removedir. This will result in the vnode acting - * as though it is open-unlinked. Additionally, when we are done moving the - * element to the hidden directory, we vnode_recycle the target so that it is - * reclaimed as soon as possible. Reclaim and inactive are both - * capable of clearing out unused blocks for an open-unlinked file or dir. - */ - if (tvp) { - // - // if the destination has a document id, we need to preserve it - // - if (fvp != tvp) { - uint32_t document_id; - struct FndrExtendedDirInfo *ffip = (struct FndrExtendedDirInfo *)((char *)&fcp->c_attr.ca_finderinfo + 16); - struct FndrExtendedDirInfo *tfip = (struct FndrExtendedDirInfo *)((char *)&tcp->c_attr.ca_finderinfo + 16); - - if (ffip->document_id && tfip->document_id) { - // both documents are tracked. only save a tombstone from tcp and do nothing else. 
- save_tombstone(hfsmp, tdvp, tvp, tcnp, 0); - } else { - struct doc_tombstone *ut; - ut = get_uthread_doc_tombstone(); - - document_id = tfip->document_id; - tfip->document_id = 0; - - if (document_id != 0) { - // clear UF_TRACKED as well since tcp is now no longer tracked - tcp->c_bsdflags &= ~UF_TRACKED; - (void) cat_update(hfsmp, &tcp->c_desc, &tcp->c_attr, NULL, NULL); - } - - if (ffip->document_id == 0 && document_id != 0) { - // printf("RENAME: preserving doc-id %d onto %s (from ino %d, to ino %d)\n", document_id, tcp->c_desc.cd_nameptr, tcp->c_desc.cd_cnid, fcp->c_desc.cd_cnid); - fcp->c_bsdflags |= UF_TRACKED; - ffip->document_id = document_id; - - (void) cat_update(hfsmp, &fcp->c_desc, &fcp->c_attr, NULL, NULL); -#if CONFIG_FSE - add_fsevent(FSE_DOCID_CHANGED, vfs_context_current(), - FSE_ARG_DEV, hfsmp->hfs_raw_dev, - FSE_ARG_INO, (ino64_t)tcp->c_fileid, // src inode # - FSE_ARG_INO, (ino64_t)fcp->c_fileid, // dst inode # - FSE_ARG_INT32, (uint32_t)ffip->document_id, - FSE_ARG_DONE); -#endif - } else if ((fcp->c_bsdflags & UF_TRACKED) && should_save_docid_tombstone(ut, fvp, fcnp)) { - - if (ut->t_lastop_document_id) { - clear_tombstone_docid(ut, hfsmp, NULL); - } - save_tombstone(hfsmp, fdvp, fvp, fcnp, 0); - - //printf("RENAME: (dest-exists): saving tombstone doc-id %lld @ %s (ino %d)\n", - // ut->t_lastop_document_id, ut->t_lastop_filename, fcp->c_desc.cd_cnid); - } - } - } - - /* - * When fvp matches tvp they could be case variants - * or matching hard links. - */ - if (fvp == tvp) { - if (!(fcp->c_flag & C_HARDLINK)) { - /* - * If they're not hardlinks, then fvp == tvp must mean we - * are using case-insensitive HFS because case-sensitive would - * not use the same vnode for both. 
In this case we just update - * the catalog for: a -> A - */ - goto skip_rm; /* simple case variant */ - - } - /* For all cases below, we must be using hardlinks */ - else if ((fdvp != tdvp) || - (hfsmp->hfs_flags & HFS_CASE_SENSITIVE)) { - /* - * If the parent directories are not the same, AND the two items - * are hardlinks, posix says to do nothing: - * dir1/fred <-> dir2/bob and the op was mv dir1/fred -> dir2/bob - * We just return 0 in this case. - * - * If case sensitivity is on, and we are using hardlinks - * then renaming is supposed to do nothing. - * dir1/fred <-> dir2/FRED, and op == mv dir1/fred -> dir2/FRED - */ - goto out; /* matching hardlinks, nothing to do */ - - } else if (hfs_namecmp((const u_int8_t *)fcnp->cn_nameptr, fcnp->cn_namelen, - (const u_int8_t *)tcnp->cn_nameptr, tcnp->cn_namelen) == 0) { - /* - * If we get here, then the following must be true: - * a) We are running case-insensitive HFS+. - * b) Both paths 'fvp' and 'tvp' are in the same parent directory. - * c) the two names are case-variants of each other. - * - * In this case, we are really only dealing with a single catalog record - * whose name is being updated. - * - * op is dir1/fred -> dir1/FRED - * - * We need to special case the name matching, because if - * dir1/fred <-> dir1/bob were the two links, and the - * op was dir1/fred -> dir1/bob - * That would fail/do nothing. - */ - goto skip_rm; /* case-variant hardlink in the same dir */ - } else { - goto out; /* matching hardlink, nothing to do */ - } - } - - - if (vnode_isdir(tvp)) { - /* - * hfs_removedir will eventually call hfs_removefile on the directory - * we're working on, because only hfs_removefile does the renaming of the - * item to the hidden directory. The directory will stay around in the - * hidden directory with C_DELETED until it gets an inactive or a reclaim. - * That way, we can destroy all of the EAs as needed and allow new ones to be - * written. 
- */ - error = hfs_removedir(tdvp, tvp, tcnp, HFSRM_SKIP_RESERVE, 1); - } - else { - error = hfs_removefile(tdvp, tvp, tcnp, 0, HFSRM_SKIP_RESERVE, 0, NULL, 1); - - /* - * If the destination file had a resource fork vnode, then we need to get rid of - * its blocks when there are no more references to it. Because the call to - * hfs_removefile above always open-unlinks things, we need to force an inactive/reclaim - * on the resource fork vnode, in order to prevent block leaks. Otherwise, - * the resource fork vnode could prevent the data fork vnode from going out of scope - * because it holds a v_parent reference on it. So we mark it for termination - * with a call to vnode_recycle. hfs_vnop_reclaim has been modified so that it - * can clean up the blocks of open-unlinked files and resource forks. - * - * We can safely call vnode_recycle on the resource fork because we took an iocount - * reference on it at the beginning of the function. - */ - - if ((error == 0) && (tcp->c_flag & C_DELETED) && (tvp_rsrc)) { - vnode_recycle(tvp_rsrc); - } - } - - if (error) { - goto out; - } - - tvp_deleted = 1; - - /* Mark 'tcp' as being deleted due to a rename */ - tcp->c_flag |= C_RENAMED; - - /* - * Aggressively mark tvp/tcp for termination to ensure that we recover all blocks - * as quickly as possible. - */ - vnode_recycle(tvp); - } else { - struct doc_tombstone *ut; - ut = get_uthread_doc_tombstone(); - - // - // There is nothing at the destination. If the file being renamed is - // tracked, save a "tombstone" of the document_id. If the file is - // not a tracked file, then see if it needs to inherit a tombstone. 
- // - // NOTE: we do not save a tombstone if the file being renamed begins - // with "atmp" which is done to work-around AutoCad's bizarre - // 5-step un-safe save behavior - // - if (fcp->c_bsdflags & UF_TRACKED) { - if (should_save_docid_tombstone(ut, fvp, fcnp)) { - save_tombstone(hfsmp, fdvp, fvp, fcnp, 0); - - //printf("RENAME: (no dest): saving tombstone doc-id %lld @ %s (ino %d)\n", - // ut->t_lastop_document_id, ut->t_lastop_filename, fcp->c_desc.cd_cnid); - } else { - // intentionally do nothing - } - } else if ( ut->t_lastop_document_id != 0 - && tdvp == ut->t_lastop_parent - && vnode_vid(tdvp) == ut->t_lastop_parent_vid - && strcmp((char *)ut->t_lastop_filename, (char *)tcnp->cn_nameptr) == 0) { - - //printf("RENAME: %s (ino %d) inheriting doc-id %lld\n", tcnp->cn_nameptr, fcp->c_desc.cd_cnid, ut->t_lastop_document_id); - struct FndrExtendedFileInfo *fip = (struct FndrExtendedFileInfo *)((char *)&fcp->c_attr.ca_finderinfo + 16); - fcp->c_bsdflags |= UF_TRACKED; - fip->document_id = ut->t_lastop_document_id; - cat_update(hfsmp, &fcp->c_desc, &fcp->c_attr, NULL, NULL); - - clear_tombstone_docid(ut, hfsmp, fcp); // will send the docid-changed fsevent - - } else if (ut->t_lastop_document_id && should_save_docid_tombstone(ut, fvp, fcnp) && should_save_docid_tombstone(ut, tvp, tcnp)) { - // no match, clear the tombstone - //printf("RENAME: clearing the tombstone %lld @ %s\n", ut->t_lastop_document_id, ut->t_lastop_filename); - clear_tombstone_docid(ut, hfsmp, NULL); - } - - } -skip_rm: - /* - * All done with tvp and fvp. - * - * We also jump to this point if there was no destination observed during lookup and namei. - * However, because only iocounts are held at the VFS layer, there is nothing preventing a - * competing thread from racing us and creating a file or dir at the destination of this rename - * operation. If this occurs, it may cause us to get a spurious EEXIST out of the cat_rename - * call below. 
To preserve rename's atomicity, we need to signal VFS to re-drive the - * namei/lookup and restart the rename operation. EEXIST is an allowable errno to be bubbled - * out of the rename syscall, but not for this reason, since it is a synonym errno for ENOTEMPTY. - * To signal VFS, we return ERECYCLE (which is also used for lookup restarts). This errno - * will be swallowed and it will restart the operation. - */ - - lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK); - error = cat_rename(hfsmp, &from_desc, &tdcp->c_desc, &to_desc, &out_desc); - hfs_systemfile_unlock(hfsmp, lockflags); - - if (error) { - if (error == EEXIST) { - error = ERECYCLE; - } - goto out; - } - - /* Invalidate negative cache entries in the destination directory */ - if (tdcp->c_flag & C_NEG_ENTRIES) { - cache_purge_negatives(tdvp); - tdcp->c_flag &= ~C_NEG_ENTRIES; - } - - /* Update cnode's catalog descriptor */ - replace_desc(fcp, &out_desc); - fcp->c_parentcnid = tdcp->c_fileid; - fcp->c_hint = 0; - - /* Now indicate this cnode needs to have date-added written to the finderinfo */ - fcp->c_flag |= C_NEEDS_DATEADDED; - (void) hfs_update (fvp, 0); - - - hfs_volupdate(hfsmp, vnode_isdir(fvp) ? VOL_RMDIR : VOL_RMFILE, - (fdcp->c_cnid == kHFSRootFolderID)); - hfs_volupdate(hfsmp, vnode_isdir(fvp) ? VOL_MKDIR : VOL_MKFILE, - (tdcp->c_cnid == kHFSRootFolderID)); - - /* Update both parent directories. 
*/ - if (fdvp != tdvp) { - if (vnode_isdir(fvp)) { - /* If the source directory has directory hard link - * descendants, set the kHFSHasChildLinkBit in the - * destination parent hierarchy - */ - if ((fcp->c_attr.ca_recflags & kHFSHasChildLinkMask) && - !(tdcp->c_attr.ca_recflags & kHFSHasChildLinkMask)) { - - tdcp->c_attr.ca_recflags |= kHFSHasChildLinkMask; - - error = cat_set_childlinkbit(hfsmp, tdcp->c_parentcnid); - if (error) { - printf ("hfs_vnop_rename: error updating parent chain for %u\n", tdcp->c_cnid); - error = 0; - } - } - INC_FOLDERCOUNT(hfsmp, tdcp->c_attr); - DEC_FOLDERCOUNT(hfsmp, fdcp->c_attr); - } - tdcp->c_entries++; - tdcp->c_dirchangecnt++; - tdcp->c_flag |= C_MODIFIED; - hfs_incr_gencount(tdcp); - - if (fdcp->c_entries > 0) - fdcp->c_entries--; - fdcp->c_dirchangecnt++; - fdcp->c_flag |= C_MODIFIED; - fdcp->c_touch_chgtime = TRUE; - fdcp->c_touch_modtime = TRUE; - - if (ISSET(fcp->c_flag, C_HARDLINK)) { - hfs_relorigin(fcp, fdcp->c_fileid); - if (fdcp->c_fileid != fdcp->c_cnid) - hfs_relorigin(fcp, fdcp->c_cnid); - } - - (void) hfs_update(fdvp, 0); - } - hfs_incr_gencount(fdcp); - - tdcp->c_childhint = out_desc.cd_hint; /* Cache directory's location */ - tdcp->c_touch_chgtime = TRUE; - tdcp->c_touch_modtime = TRUE; - - (void) hfs_update(tdvp, 0); - - /* Update the vnode's name now that the rename has completed. */ - vnode_update_identity(fvp, tdvp, tcnp->cn_nameptr, tcnp->cn_namelen, - tcnp->cn_hash, (VNODE_UPDATE_PARENT | VNODE_UPDATE_NAME)); - - /* - * At this point, we may have a resource fork vnode attached to the - * 'from' vnode. If it exists, we will want to update its name, because - * it contains the old name + _PATH_RSRCFORKSPEC. ("/..namedfork/rsrc"). - * - * Note that the only thing we need to update here is the name attached to - * the vnode, since a resource fork vnode does not have a separate resource - * cnode -- it's still 'fcp'. 
- */ - if (fcp->c_rsrc_vp) { - char* rsrc_path = NULL; - int len; - - /* Create a new temporary buffer that's going to hold the new name */ - MALLOC_ZONE (rsrc_path, caddr_t, MAXPATHLEN, M_NAMEI, M_WAITOK); - len = snprintf (rsrc_path, MAXPATHLEN, "%s%s", tcnp->cn_nameptr, _PATH_RSRCFORKSPEC); - len = MIN(len, MAXPATHLEN); - - /* - * vnode_update_identity will do the following for us: - * 1) release reference on the existing rsrc vnode's name. - * 2) copy/insert new name into the name cache - * 3) attach the new name to the resource vnode - * 4) update the vnode's vid - */ - vnode_update_identity (fcp->c_rsrc_vp, fvp, rsrc_path, len, 0, (VNODE_UPDATE_NAME | VNODE_UPDATE_CACHE)); - - /* Free the memory associated with the resource fork's name */ - FREE_ZONE (rsrc_path, MAXPATHLEN, M_NAMEI); - } -out: - if (got_cookie) { - cat_postflight(hfsmp, &cookie, p); - } - if (started_tr) { - hfs_end_transaction(hfsmp); - } - - fdcp->c_flag &= ~C_DIR_MODIFICATION; - wakeup((caddr_t)&fdcp->c_flag); - if (fdvp != tdvp) { - tdcp->c_flag &= ~C_DIR_MODIFICATION; - wakeup((caddr_t)&tdcp->c_flag); - } - - hfs_unlockfour(fdcp, fcp, tdcp, tcp); - - if (took_trunc_lock) { - hfs_unlock_truncate(VTOC(tvp), HFS_LOCK_DEFAULT); - } - - /* Now vnode_put the resource forks vnodes if necessary */ - if (tvp_rsrc) { - vnode_put(tvp_rsrc); - tvp_rsrc = NULL; - } - - /* After tvp is removed the only acceptable error is EIO */ - if (error && tvp_deleted) - error = EIO; - - /* If we want to reintroduce notifications for renames, this is the - place to do it. */ - - return (error); -} - - -/* - * Make a directory. - */ -int -hfs_vnop_mkdir(struct vnop_mkdir_args *ap) -{ - /***** HACK ALERT ********/ - ap->a_cnp->cn_flags |= MAKEENTRY; - return hfs_makenode(ap->a_dvp, ap->a_vpp, ap->a_cnp, ap->a_vap, ap->a_context); -} - - -/* - * Create a symbolic link. 
- */ -int -hfs_vnop_symlink(struct vnop_symlink_args *ap) -{ - struct vnode **vpp = ap->a_vpp; - struct vnode *dvp = ap->a_dvp; - struct vnode *vp = NULL; - struct cnode *cp = NULL; - struct hfsmount *hfsmp; - struct filefork *fp; - struct buf *bp = NULL; - char *datap; - int started_tr = 0; - u_int32_t len; - int error; - - /* HFS standard disks don't support symbolic links */ - if (VTOVCB(dvp)->vcbSigWord != kHFSPlusSigWord) - return (ENOTSUP); - - /* Check for empty target name */ - if (ap->a_target[0] == 0) - return (EINVAL); - - hfsmp = VTOHFS(dvp); - len = strlen(ap->a_target); - - /* Check for free space */ - if (((u_int64_t)hfs_freeblks(hfsmp, 0) * (u_int64_t)hfsmp->blockSize) < len) { - return (ENOSPC); - } - - /* Create the vnode */ - ap->a_vap->va_mode |= S_IFLNK; - if ((error = hfs_makenode(dvp, vpp, ap->a_cnp, ap->a_vap, ap->a_context))) { - goto out; - } - vp = *vpp; - if ((error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) { - goto out; - } - cp = VTOC(vp); - fp = VTOF(vp); - - if (cp->c_flag & (C_NOEXISTS | C_DELETED)) { - goto out; - } - -#if QUOTA - (void)hfs_getinoquota(cp); -#endif /* QUOTA */ - - if ((error = hfs_start_transaction(hfsmp)) != 0) { - goto out; - } - started_tr = 1; - - /* - * Allocate space for the link. - * - * Since we're already inside a transaction, - * - * Don't need truncate lock since a symlink is treated as a system file. - */ - error = hfs_truncate(vp, len, IO_NOZEROFILL, 0, ap->a_context); - - /* On errors, remove the symlink file */ - if (error) { - /* - * End the transaction so we don't re-take the cnode lock - * below while inside a transaction (lock order violation). 
- */ - hfs_end_transaction(hfsmp); - - /* hfs_removefile() requires holding the truncate lock */ - hfs_unlock(cp); - hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); - hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); - - if (hfs_start_transaction(hfsmp) != 0) { - started_tr = 0; - hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); - goto out; - } - - (void) hfs_removefile(dvp, vp, ap->a_cnp, 0, 0, 0, NULL, 0); - hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); - goto out; - } - - /* Write the link to disk */ - bp = buf_getblk(vp, (daddr64_t)0, roundup((int)fp->ff_size, hfsmp->hfs_physical_block_size), - 0, 0, BLK_META); - if (hfsmp->jnl) { - journal_modify_block_start(hfsmp->jnl, bp); - } - datap = (char *)buf_dataptr(bp); - bzero(datap, buf_size(bp)); - bcopy(ap->a_target, datap, len); - - if (hfsmp->jnl) { - journal_modify_block_end(hfsmp->jnl, bp, NULL, NULL); - } else { - buf_bawrite(bp); - } -out: - if (started_tr) - hfs_end_transaction(hfsmp); - if ((cp != NULL) && (vp != NULL)) { - hfs_unlock(cp); - } - if (error) { - if (vp) { - vnode_put(vp); - } - *vpp = NULL; - } - return (error); -} - - -/* structures to hold a "." or ".." directory entry */ -struct hfs_stddotentry { - u_int32_t d_fileno; /* unique file number */ - u_int16_t d_reclen; /* length of this structure */ - u_int8_t d_type; /* dirent file type */ - u_int8_t d_namlen; /* len of filename */ - char d_name[4]; /* "." or ".." */ -}; - -struct hfs_extdotentry { - u_int64_t d_fileno; /* unique file number */ - u_int64_t d_seekoff; /* seek offset (optional, used by servers) */ - u_int16_t d_reclen; /* length of this structure */ - u_int16_t d_namlen; /* len of filename */ - u_int8_t d_type; /* dirent file type */ - u_char d_name[3]; /* "." or ".." */ -}; - -typedef union { - struct hfs_stddotentry std; - struct hfs_extdotentry ext; -} hfs_dotentry_t; - -/* - * hfs_vnop_readdir reads directory entries into the buffer pointed - * to by uio, in a filesystem independent format. 
Up to uio_resid - * bytes of data can be transferred. The data in the buffer is a - * series of packed dirent structures where each one contains the - * following entries: - * - * u_int32_t d_fileno; // file number of entry - * u_int16_t d_reclen; // length of this record - * u_int8_t d_type; // file type - * u_int8_t d_namlen; // length of string in d_name - * char d_name[MAXNAMELEN+1]; // null terminated file name - * - * The current position (uio_offset) refers to the next block of - * entries. The offset can only be set to a value previously - * returned by hfs_vnop_readdir or zero. This offset does not have - * to match the number of bytes returned (in uio_resid). - * - * In fact, the offset used by HFS is essentially an index (26 bits) - * with a tag (6 bits). The tag is for associating the next request - * with the current request. This enables us to have multiple threads - * reading the directory while the directory is also being modified. - * - * Each tag/index pair is tied to a unique directory hint. The hint - * contains information (filename) needed to build the catalog b-tree - * key for finding the next set of entries. - * - * If the directory is marked as deleted-but-in-use (cp->c_flag & C_DELETED), - * do NOT synthesize entries for "." and "..". 
- */ -int -hfs_vnop_readdir(ap) - struct vnop_readdir_args /* { - vnode_t a_vp; - uio_t a_uio; - int a_flags; - int *a_eofflag; - int *a_numdirent; - vfs_context_t a_context; - } */ *ap; -{ - struct vnode *vp = ap->a_vp; - uio_t uio = ap->a_uio; - struct cnode *cp; - struct hfsmount *hfsmp; - directoryhint_t *dirhint = NULL; - directoryhint_t localhint; - off_t offset; - off_t startoffset; - int error = 0; - int eofflag = 0; - user_addr_t user_start = 0; - user_size_t user_len = 0; - int index; - unsigned int tag; - int items; - int lockflags; - int extended; - int nfs_cookies; - cnid_t cnid_hint = 0; - int bump_valence = 0; - - items = 0; - startoffset = offset = uio_offset(uio); - extended = (ap->a_flags & VNODE_READDIR_EXTENDED); - nfs_cookies = extended && (ap->a_flags & VNODE_READDIR_REQSEEKOFF); - - /* Sanity check the uio data. */ - if (uio_iovcnt(uio) > 1) - return (EINVAL); - - if (VTOC(vp)->c_bsdflags & UF_COMPRESSED) { - int compressed = hfs_file_is_compressed(VTOC(vp), 0); /* 0 == take the cnode lock */ - if (VTOCMP(vp) != NULL && !compressed) { - error = check_for_dataless_file(vp, NAMESPACE_HANDLER_READ_OP); - if (error) { - return error; - } - } - } - - cp = VTOC(vp); - hfsmp = VTOHFS(vp); - - /* Note that the dirhint calls require an exclusive lock. */ - if ((error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) - return (error); - - /* Pick up cnid hint (if any). */ - if (nfs_cookies) { - cnid_hint = (cnid_t)(uio_offset(uio) >> 32); - uio_setoffset(uio, uio_offset(uio) & 0x00000000ffffffffLL); - if (cnid_hint == INT_MAX) { /* searching pass the last item */ - eofflag = 1; - goto out; - } - } - /* - * Synthesize entries for "." and "..", unless the directory has - * been deleted, but not closed yet (lazy delete in progress). 
- */ - if (offset == 0 && !(cp->c_flag & C_DELETED)) { - hfs_dotentry_t dotentry[2]; - size_t uiosize; - - if (extended) { - struct hfs_extdotentry *entry = &dotentry[0].ext; - - entry->d_fileno = cp->c_cnid; - entry->d_reclen = sizeof(struct hfs_extdotentry); - entry->d_type = DT_DIR; - entry->d_namlen = 1; - entry->d_name[0] = '.'; - entry->d_name[1] = '\0'; - entry->d_name[2] = '\0'; - entry->d_seekoff = 1; - - ++entry; - entry->d_fileno = cp->c_parentcnid; - entry->d_reclen = sizeof(struct hfs_extdotentry); - entry->d_type = DT_DIR; - entry->d_namlen = 2; - entry->d_name[0] = '.'; - entry->d_name[1] = '.'; - entry->d_name[2] = '\0'; - entry->d_seekoff = 2; - uiosize = 2 * sizeof(struct hfs_extdotentry); - } else { - struct hfs_stddotentry *entry = &dotentry[0].std; - - entry->d_fileno = cp->c_cnid; - entry->d_reclen = sizeof(struct hfs_stddotentry); - entry->d_type = DT_DIR; - entry->d_namlen = 1; - *(int *)&entry->d_name[0] = 0; - entry->d_name[0] = '.'; - - ++entry; - entry->d_fileno = cp->c_parentcnid; - entry->d_reclen = sizeof(struct hfs_stddotentry); - entry->d_type = DT_DIR; - entry->d_namlen = 2; - *(int *)&entry->d_name[0] = 0; - entry->d_name[0] = '.'; - entry->d_name[1] = '.'; - uiosize = 2 * sizeof(struct hfs_stddotentry); - } - if ((error = uiomove((caddr_t)&dotentry, uiosize, uio))) { - goto out; - } - offset += 2; - } - - /* - * Intentionally avoid checking the valence here. If we - * have FS corruption that reports the valence is 0, even though it - * has contents, we might artificially skip over iterating - * this directory. - */ - - // - // We have to lock the user's buffer here so that we won't - // fault on it after we've acquired a shared lock on the - // catalog file. 
The issue is that you can get a 3-way - // deadlock if someone else starts a transaction and then - // tries to lock the catalog file but can't because we're - // here and we can't service our page fault because VM is - // blocked trying to start a transaction as a result of - // trying to free up pages for our page fault. It's messy - // but it does happen on dual-processors that are paging - // heavily (see radar 3082639 for more info). By locking - // the buffer up-front we prevent ourselves from faulting - // while holding the shared catalog file lock. - // - // Fortunately this and hfs_search() are the only two places - // currently (10/30/02) that can fault on user data with a - // shared lock on the catalog file. - // - if (hfsmp->jnl && uio_isuserspace(uio)) { - user_start = uio_curriovbase(uio); - user_len = uio_curriovlen(uio); - - if ((error = vslock(user_start, user_len)) != 0) { - user_start = 0; - goto out; - } - } - /* Convert offset into a catalog directory index. */ - index = (offset & HFS_INDEX_MASK) - 2; - tag = offset & ~HFS_INDEX_MASK; - - /* Lock catalog during cat_findname and cat_getdirentries. */ - lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); - - /* When called from NFS, try and resolve a cnid hint. */ - if (nfs_cookies && cnid_hint != 0) { - if (cat_findname(hfsmp, cnid_hint, &localhint.dh_desc) == 0) { - if ( localhint.dh_desc.cd_parentcnid == cp->c_fileid) { - localhint.dh_index = index - 1; - localhint.dh_time = 0; - bzero(&localhint.dh_link, sizeof(localhint.dh_link)); - dirhint = &localhint; /* don't forget to release the descriptor */ - } else { - cat_releasedesc(&localhint.dh_desc); - } - } - } - - /* Get a directory hint (cnode must be locked exclusive) */ - if (dirhint == NULL) { - dirhint = hfs_getdirhint(cp, ((index - 1) & HFS_INDEX_MASK) | tag, 0); - - /* Hide tag from catalog layer. 
*/ - dirhint->dh_index &= HFS_INDEX_MASK; - if (dirhint->dh_index == HFS_INDEX_MASK) { - dirhint->dh_index = -1; - } - } - - if (index == 0) { - dirhint->dh_threadhint = cp->c_dirthreadhint; - } - else { - /* - * If we have a non-zero index, there is a possibility that during the last - * call to hfs_vnop_readdir we hit EOF for this directory. If that is the case - * then we don't want to return any new entries for the caller. Just return 0 - * items, mark the eofflag, and bail out. Because we won't have done any work, the - * code at the end of the function will release the dirhint for us. - * - * Don't forget to unlock the catalog lock on the way out, too. - */ - if (dirhint->dh_desc.cd_flags & CD_EOF) { - error = 0; - eofflag = 1; - uio_setoffset(uio, startoffset); - hfs_systemfile_unlock (hfsmp, lockflags); - - goto seekoffcalc; - } - } - - /* Pack the buffer with dirent entries. */ - error = cat_getdirentries(hfsmp, cp->c_entries, dirhint, uio, ap->a_flags, &items, &eofflag); - - if (index == 0 && error == 0) { - cp->c_dirthreadhint = dirhint->dh_threadhint; - } - - hfs_systemfile_unlock(hfsmp, lockflags); - - if (error != 0) { - goto out; - } - - /* Get index to the next item */ - index += items; - - if (items >= (int)cp->c_entries) { - eofflag = 1; - } - - /* - * Detect valence FS corruption. - * - * We are holding the cnode lock exclusive, so there should not be - * anybody modifying the valence field of this cnode. If we enter - * this block, that means we observed filesystem corruption, because - * this directory reported a valence of 0, yet we found at least one - * item. In this case, we need to minimally self-heal this - * directory to prevent userland from tripping over a directory - * that appears empty (getattr of valence reports 0), but actually - * has contents. - * - * We'll force the cnode update at the end of the function after - * completing all of the normal getdirentries steps. 
- */ - if ((cp->c_entries == 0) && (items > 0)) { - /* disk corruption */ - cp->c_entries++; - /* Mark the cnode as dirty. */ - cp->c_flag |= C_MODIFIED; - printf("hfs_vnop_readdir: repairing valence to non-zero! \n"); - bump_valence++; - } - - - /* Convert catalog directory index back into an offset. */ - while (tag == 0) - tag = (++cp->c_dirhinttag) << HFS_INDEX_BITS; - uio_setoffset(uio, (index + 2) | tag); - dirhint->dh_index |= tag; - -seekoffcalc: - cp->c_touch_acctime = TRUE; - - if (ap->a_numdirent) { - if (startoffset == 0) - items += 2; - *ap->a_numdirent = items; - } - -out: - if (user_start) { - vsunlock(user_start, user_len, TRUE); - } - /* If we didn't do anything then go ahead and dump the hint. */ - if ((dirhint != NULL) && - (dirhint != &localhint) && - (uio_offset(uio) == startoffset)) { - hfs_reldirhint(cp, dirhint); - eofflag = 1; - } - if (ap->a_eofflag) { - *ap->a_eofflag = eofflag; - } - if (dirhint == &localhint) { - cat_releasedesc(&localhint.dh_desc); - } - - if (bump_valence) { - /* force the update before dropping the cnode lock*/ - hfs_update(vp, 0); - } - - hfs_unlock(cp); - - return (error); -} - - -/* - * Read contents of a symbolic link. 
- */ -int -hfs_vnop_readlink(ap) - struct vnop_readlink_args /* { - struct vnode *a_vp; - struct uio *a_uio; - vfs_context_t a_context; - } */ *ap; -{ - struct vnode *vp = ap->a_vp; - struct cnode *cp; - struct filefork *fp; - int error; - - if (!vnode_islnk(vp)) - return (EINVAL); - - if ((error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) - return (error); - cp = VTOC(vp); - fp = VTOF(vp); - - /* Zero length sym links are not allowed */ - if (fp->ff_size == 0 || fp->ff_size > MAXPATHLEN) { - error = EINVAL; - goto exit; - } - - /* Cache the path so we don't waste buffer cache resources */ - if (fp->ff_symlinkptr == NULL) { - struct buf *bp = NULL; - - MALLOC(fp->ff_symlinkptr, char *, fp->ff_size, M_TEMP, M_WAITOK); - if (fp->ff_symlinkptr == NULL) { - error = ENOMEM; - goto exit; - } - error = (int)buf_meta_bread(vp, (daddr64_t)0, - roundup((int)fp->ff_size, VTOHFS(vp)->hfs_physical_block_size), - vfs_context_ucred(ap->a_context), &bp); - if (error) { - if (bp) - buf_brelse(bp); - if (fp->ff_symlinkptr) { - FREE(fp->ff_symlinkptr, M_TEMP); - fp->ff_symlinkptr = NULL; - } - goto exit; - } - bcopy((char *)buf_dataptr(bp), fp->ff_symlinkptr, (size_t)fp->ff_size); - - if (VTOHFS(vp)->jnl && (buf_flags(bp) & B_LOCKED) == 0) { - buf_markinvalid(bp); /* data no longer needed */ - } - buf_brelse(bp); - } - error = uiomove((caddr_t)fp->ff_symlinkptr, (int)fp->ff_size, ap->a_uio); - - /* - * Keep track blocks read - */ - if ((VTOHFS(vp)->hfc_stage == HFC_RECORDING) && (error == 0)) { - - /* - * If this file hasn't been seen since the start of - * the current sampling period then start over. - */ - if (cp->c_atime < VTOHFS(vp)->hfc_timebase) - VTOF(vp)->ff_bytesread = fp->ff_size; - else - VTOF(vp)->ff_bytesread += fp->ff_size; - - // if (VTOF(vp)->ff_bytesread > fp->ff_size) - // cp->c_touch_acctime = TRUE; - } - -exit: - hfs_unlock(cp); - return (error); -} - - -/* - * Get configurable pathname variables. 
- */ -int -hfs_vnop_pathconf(ap) - struct vnop_pathconf_args /* { - struct vnode *a_vp; - int a_name; - int *a_retval; - vfs_context_t a_context; - } */ *ap; -{ - - int std_hfs = (VTOHFS(ap->a_vp)->hfs_flags & HFS_STANDARD); - switch (ap->a_name) { - case _PC_LINK_MAX: - if (std_hfs == 0){ - *ap->a_retval = HFS_LINK_MAX; - } -#if CONFIG_HFS_STD - else { - *ap->a_retval = 1; - } -#endif - break; - case _PC_NAME_MAX: - if (std_hfs == 0) { - *ap->a_retval = kHFSPlusMaxFileNameChars; /* 255 */ - } -#if CONFIG_HFS_STD - else { - *ap->a_retval = kHFSMaxFileNameChars; /* 31 */ - } -#endif - break; - case _PC_PATH_MAX: - *ap->a_retval = PATH_MAX; /* 1024 */ - break; - case _PC_PIPE_BUF: - *ap->a_retval = PIPE_BUF; - break; - case _PC_CHOWN_RESTRICTED: - *ap->a_retval = 200112; /* _POSIX_CHOWN_RESTRICTED */ - break; - case _PC_NO_TRUNC: - *ap->a_retval = 200112; /* _POSIX_NO_TRUNC */ - break; - case _PC_NAME_CHARS_MAX: - if (std_hfs == 0) { - *ap->a_retval = kHFSPlusMaxFileNameChars; /* 255 */ - } -#if CONFIG_HFS_STD - else { - *ap->a_retval = kHFSMaxFileNameChars; /* 31 */ - } -#endif - break; - case _PC_CASE_SENSITIVE: - if (VTOHFS(ap->a_vp)->hfs_flags & HFS_CASE_SENSITIVE) - *ap->a_retval = 1; - else - *ap->a_retval = 0; - break; - case _PC_CASE_PRESERVING: - *ap->a_retval = 1; - break; - case _PC_FILESIZEBITS: - /* number of bits to store max file size */ - if (std_hfs == 0) { - *ap->a_retval = 64; - } -#if CONFIG_HFS_STD - else { - *ap->a_retval = 32; - } -#endif - break; - case _PC_XATTR_SIZE_BITS: - /* Number of bits to store maximum extended attribute size */ - *ap->a_retval = HFS_XATTR_SIZE_BITS; - break; - default: - return (EINVAL); - } - - return (0); -} - -/* - * Prepares a fork for cat_update by making sure ff_size and ff_blocks - * are no bigger than the valid data on disk thus reducing the chance - * of exposing uninitialised data in the event of a non clean unmount. - * fork_buf is where to put the temporary copy if required. (It can - * be inside pfork.) 
- */ -const struct cat_fork * -hfs_prepare_fork_for_update(filefork_t *ff, - const struct cat_fork *cf, - struct cat_fork *cf_buf, - uint32_t block_size) -{ - if (!ff) - return NULL; - - if (!cf) - cf = &ff->ff_data; - if (!cf_buf) - cf_buf = &ff->ff_data; - - off_t max_size = ff->ff_size; - - // Check first invalid range - if (!TAILQ_EMPTY(&ff->ff_invalidranges)) - max_size = TAILQ_FIRST(&ff->ff_invalidranges)->rl_start; - - if (!ff->ff_unallocblocks && ff->ff_size <= max_size) - return cf; // Nothing to do - - if (ff->ff_blocks < ff->ff_unallocblocks) { - panic("hfs: ff_blocks %d is less than unalloc blocks %d\n", - ff->ff_blocks, ff->ff_unallocblocks); - } - - struct cat_fork *out = cf_buf; - - if (out != cf) - bcopy(cf, out, sizeof(*cf)); - - // Adjust cf_blocks for cf_vblocks - out->cf_blocks -= out->cf_vblocks; - - /* - * Here we trim the size with the updated cf_blocks. This is - * probably unnecessary now because the invalid ranges should - * catch this (but that wasn't always the case). - */ - off_t alloc_bytes = hfs_blk_to_bytes(out->cf_blocks, block_size); - if (out->cf_size > alloc_bytes) - out->cf_size = alloc_bytes; - - // Trim cf_size to first invalid range - if (out->cf_size > max_size) - out->cf_size = max_size; - - return out; -} - -/* - * Update a cnode's on-disk metadata. - * - * The cnode must be locked exclusive. See declaration for possible - * options. 
- */ -int -hfs_update(struct vnode *vp, int options) -{ - struct cnode *cp = VTOC(vp); - struct proc *p; - const struct cat_fork *dataforkp = NULL; - const struct cat_fork *rsrcforkp = NULL; - struct cat_fork datafork; - struct cat_fork rsrcfork; - struct hfsmount *hfsmp; - int lockflags; - int error; - uint32_t tstate = 0; - - if (ISSET(cp->c_flag, C_NOEXISTS)) - return 0; - - p = current_proc(); - hfsmp = VTOHFS(vp); - - if (((vnode_issystem(vp) && (cp->c_cnid < kHFSFirstUserCatalogNodeID))) || - hfsmp->hfs_catalog_vp == NULL){ - return (0); - } - if ((hfsmp->hfs_flags & HFS_READ_ONLY) || (cp->c_mode == 0)) { - CLR(cp->c_flag, C_MODIFIED | C_MINOR_MOD | C_NEEDS_DATEADDED); - cp->c_touch_acctime = 0; - cp->c_touch_chgtime = 0; - cp->c_touch_modtime = 0; - return (0); - } - if (kdebug_enable) { - if (cp->c_touch_acctime || cp->c_atime != cp->c_attr.ca_atimeondisk) - tstate |= DBG_HFS_UPDATE_ACCTIME; - if (cp->c_touch_modtime) - tstate |= DBG_HFS_UPDATE_MODTIME; - if (cp->c_touch_chgtime) - tstate |= DBG_HFS_UPDATE_CHGTIME; - - if (cp->c_flag & C_MODIFIED) - tstate |= DBG_HFS_UPDATE_MODIFIED; - if (ISSET(options, HFS_UPDATE_FORCE)) - tstate |= DBG_HFS_UPDATE_FORCE; - if (cp->c_flag & C_NEEDS_DATEADDED) - tstate |= DBG_HFS_UPDATE_DATEADDED; - if (cp->c_flag & C_MINOR_MOD) - tstate |= DBG_HFS_UPDATE_MINOR; - } - hfs_touchtimes(hfsmp, cp); - - if (!ISSET(cp->c_flag, C_MODIFIED | C_MINOR_MOD) - && !hfs_should_save_atime(cp)) { - // Nothing to update - return 0; - } - - KDBG(HFSDBG_UPDATE | DBG_FUNC_START, VM_KERNEL_ADDRPERM(vp), tstate); - - bool check_txn = false; - - if (!ISSET(options, HFS_UPDATE_FORCE) && !ISSET(cp->c_flag, C_MODIFIED)) { - /* - * This must be a minor modification. If the current - * transaction already has an update for this node, then we - * bundle in the modification. 
- */ - if (hfsmp->jnl - && journal_current_txn(hfsmp->jnl) == cp->c_update_txn) { - check_txn = true; - } else { - tstate |= DBG_HFS_UPDATE_SKIPPED; - error = 0; - goto exit; - } - } - - if ((error = hfs_start_transaction(hfsmp)) != 0) - goto exit; - - if (check_txn - && journal_current_txn(hfsmp->jnl) != cp->c_update_txn) { - hfs_end_transaction(hfsmp); - tstate |= DBG_HFS_UPDATE_SKIPPED; - error = 0; - goto exit; - } - - if (cp->c_datafork) - dataforkp = &cp->c_datafork->ff_data; - if (cp->c_rsrcfork) - rsrcforkp = &cp->c_rsrcfork->ff_data; - - /* - * Modify the values passed to cat_update based on whether or not - * the file has invalid ranges or borrowed blocks. - */ - dataforkp = hfs_prepare_fork_for_update(cp->c_datafork, NULL, &datafork, hfsmp->blockSize); - rsrcforkp = hfs_prepare_fork_for_update(cp->c_rsrcfork, NULL, &rsrcfork, hfsmp->blockSize); - - if (__improbable(kdebug_enable & KDEBUG_TRACE)) { - long dbg_parms[NUMPARMS]; - int dbg_namelen; - - dbg_namelen = NUMPARMS * sizeof(long); - vn_getpath(vp, (char *)dbg_parms, &dbg_namelen); - - if (dbg_namelen < (int)sizeof(dbg_parms)) - memset((char *)dbg_parms + dbg_namelen, 0, sizeof(dbg_parms) - dbg_namelen); - - kdebug_lookup_gen_events(dbg_parms, dbg_namelen, (void *)vp, TRUE); - } - - /* - * Lock the Catalog b-tree file. 
- */ - lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK); - - error = cat_update(hfsmp, &cp->c_desc, &cp->c_attr, dataforkp, rsrcforkp); - - if (hfsmp->jnl) - cp->c_update_txn = journal_current_txn(hfsmp->jnl); - - hfs_systemfile_unlock(hfsmp, lockflags); - - CLR(cp->c_flag, C_MODIFIED | C_MINOR_MOD); - - hfs_end_transaction(hfsmp); - -exit: - - KDBG(HFSDBG_UPDATE | DBG_FUNC_END, VM_KERNEL_ADDRPERM(vp), tstate, error); - - return error; -} - -/* - * Allocate a new node - */ -int -hfs_makenode(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, - struct vnode_attr *vap, vfs_context_t ctx) -{ - struct cnode *cp = NULL; - struct cnode *dcp = NULL; - struct vnode *tvp; - struct hfsmount *hfsmp; - struct cat_desc in_desc, out_desc; - struct cat_attr attr; - struct timeval tv; - int lockflags; - int error, started_tr = 0; - enum vtype vnodetype; - int mode; - int newvnode_flags = 0; - u_int32_t gnv_flags = 0; - int protectable_target = 0; - int nocache = 0; - -#if CONFIG_PROTECT - struct cprotect *entry = NULL; - int32_t cp_class = -1; - - /* - * By default, it's OK for AKS to overrride our target class preferences. - */ - uint32_t keywrap_flags = CP_KEYWRAP_DIFFCLASS; - - if (VATTR_IS_ACTIVE(vap, va_dataprotect_class)) { - cp_class = (int32_t)vap->va_dataprotect_class; - /* - * Since the user specifically requested this target class be used, - * we want to fail this creation operation if we cannot wrap to their - * target class. The CP_KEYWRAP_DIFFCLASS bit says that it is OK to - * use a different class than the one specified, so we turn that off - * now. 
- */ - keywrap_flags &= ~CP_KEYWRAP_DIFFCLASS; - } - int protected_mount = 0; -#endif - - - if ((error = hfs_lock(VTOC(dvp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) - return (error); - - /* set the cnode pointer only after successfully acquiring lock */ - dcp = VTOC(dvp); - - /* Don't allow creation of new entries in open-unlinked directories */ - if ((error = hfs_checkdeleted(dcp))) { - hfs_unlock(dcp); - return error; - } - - dcp->c_flag |= C_DIR_MODIFICATION; - - hfsmp = VTOHFS(dvp); - - *vpp = NULL; - tvp = NULL; - out_desc.cd_flags = 0; - out_desc.cd_nameptr = NULL; - - vnodetype = vap->va_type; - if (vnodetype == VNON) - vnodetype = VREG; - mode = MAKEIMODE(vnodetype, vap->va_mode); - - if (S_ISDIR (mode) || S_ISREG (mode)) { - protectable_target = 1; - } - - - /* Check if were out of usable disk space. */ - if ((hfs_freeblks(hfsmp, 1) == 0) && (vfs_context_suser(ctx) != 0)) { - error = ENOSPC; - goto exit; - } - - microtime(&tv); - - /* Setup the default attributes */ - bzero(&attr, sizeof(attr)); - attr.ca_mode = mode; - attr.ca_linkcount = 1; - if (VATTR_IS_ACTIVE(vap, va_rdev)) { - attr.ca_rdev = vap->va_rdev; - } - if (VATTR_IS_ACTIVE(vap, va_create_time)) { - VATTR_SET_SUPPORTED(vap, va_create_time); - attr.ca_itime = vap->va_create_time.tv_sec; - } else { - attr.ca_itime = tv.tv_sec; - } -#if CONFIG_HFS_STD - if ((hfsmp->hfs_flags & HFS_STANDARD) && gTimeZone.tz_dsttime) { - attr.ca_itime += 3600; /* Same as what hfs_update does */ - } -#endif - attr.ca_atime = attr.ca_ctime = attr.ca_mtime = attr.ca_itime; - attr.ca_atimeondisk = attr.ca_atime; - if (VATTR_IS_ACTIVE(vap, va_flags)) { - VATTR_SET_SUPPORTED(vap, va_flags); - attr.ca_flags = vap->va_flags; - } - - /* - * HFS+ only: all files get ThreadExists - * HFSX only: dirs get HasFolderCount - */ - if (!(hfsmp->hfs_flags & HFS_STANDARD)) { - if (vnodetype == VDIR) { - if (hfsmp->hfs_flags & HFS_FOLDERCOUNT) - attr.ca_recflags = kHFSHasFolderCountMask; - } else { - attr.ca_recflags = 
kHFSThreadExistsMask; - } - } - -#if CONFIG_PROTECT - if (cp_fs_protected(hfsmp->hfs_mp)) { - protected_mount = 1; - } - /* - * On a content-protected HFS+/HFSX filesystem, files and directories - * cannot be created without atomically setting/creating the EA that - * contains the protection class metadata and keys at the same time, in - * the same transaction. As a result, pre-set the "EAs exist" flag - * on the cat_attr for protectable catalog record creations. This will - * cause the cnode creation routine in hfs_getnewvnode to mark the cnode - * as having EAs. - */ - if ((protected_mount) && (protectable_target)) { - attr.ca_recflags |= kHFSHasAttributesMask; - /* delay entering in the namecache */ - nocache = 1; - } -#endif - - - /* - * Add the date added to the item. See above, as - * all of the dates are set to the itime. - */ - hfs_write_dateadded (&attr, attr.ca_atime); - - /* Initialize the gen counter to 1 */ - hfs_write_gencount(&attr, (uint32_t)1); - - attr.ca_uid = vap->va_uid; - attr.ca_gid = vap->va_gid; - VATTR_SET_SUPPORTED(vap, va_mode); - VATTR_SET_SUPPORTED(vap, va_uid); - VATTR_SET_SUPPORTED(vap, va_gid); - -#if QUOTA - /* check to see if this node's creation would cause us to go over - * quota. If so, abort this operation. - */ - if (hfsmp->hfs_flags & HFS_QUOTAS) { - if ((error = hfs_quotacheck(hfsmp, 1, attr.ca_uid, attr.ca_gid, - vfs_context_ucred(ctx)))) { - goto exit; - } - } -#endif - - - /* Tag symlinks with a type and creator. */ - if (vnodetype == VLNK) { - struct FndrFileInfo *fip; - - fip = (struct FndrFileInfo *)&attr.ca_finderinfo; - fip->fdType = SWAP_BE32(kSymLinkFileType); - fip->fdCreator = SWAP_BE32(kSymLinkCreator); - } - - /* Setup the descriptor */ - in_desc.cd_nameptr = (const u_int8_t *)cnp->cn_nameptr; - in_desc.cd_namelen = cnp->cn_namelen; - in_desc.cd_parentcnid = dcp->c_fileid; - in_desc.cd_flags = S_ISDIR(mode) ? 
CD_ISDIR : 0; - in_desc.cd_hint = dcp->c_childhint; - in_desc.cd_encoding = 0; - -#if CONFIG_PROTECT - /* - * To preserve file creation atomicity with regards to the content protection EA, - * we must create the file in the catalog and then write out its EA in the same - * transaction. - * - * We only denote the target class in this EA; key generation is not completed - * until the file has been inserted into the catalog and will be done - * in a separate transaction. - */ - if ((protected_mount) && (protectable_target)) { - error = cp_setup_newentry(hfsmp, dcp, cp_class, attr.ca_mode, &entry); - if (error) { - goto exit; - } - } -#endif - - if ((error = hfs_start_transaction(hfsmp)) != 0) { - goto exit; - } - started_tr = 1; - - // have to also lock the attribute file because cat_create() needs - // to check that any fileID it wants to use does not have orphaned - // attributes in it. - lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG | SFL_ATTRIBUTE, HFS_EXCLUSIVE_LOCK); - cnid_t new_id; - - /* Reserve some space in the Catalog file. */ - if ((error = cat_preflight(hfsmp, CAT_CREATE, NULL, 0))) { - hfs_systemfile_unlock(hfsmp, lockflags); - goto exit; - } - - if ((error = cat_acquire_cnid(hfsmp, &new_id))) { - hfs_systemfile_unlock (hfsmp, lockflags); - goto exit; - } - - error = cat_create(hfsmp, new_id, &in_desc, &attr, &out_desc); - if (error == 0) { - /* Update the parent directory */ - dcp->c_childhint = out_desc.cd_hint; /* Cache directory's location */ - dcp->c_entries++; - - if (vnodetype == VDIR) { - INC_FOLDERCOUNT(hfsmp, dcp->c_attr); - } - dcp->c_dirchangecnt++; - hfs_incr_gencount(dcp); - - dcp->c_touch_chgtime = dcp->c_touch_modtime = true; - dcp->c_flag |= C_MODIFIED; - - hfs_update(dcp->c_vp, 0); - -#if CONFIG_PROTECT - /* - * If we are creating a content protected file, now is when - * we create the EA. We must create it in the same transaction - * that creates the file. 
We can also guarantee that the file - * MUST exist because we are still holding the catalog lock - * at this point. - */ - if ((attr.ca_fileid != 0) && (protected_mount) && (protectable_target)) { - error = cp_setxattr (NULL, entry, hfsmp, attr.ca_fileid, XATTR_CREATE); - - if (error) { - int delete_err; - /* - * If we fail the EA creation, then we need to delete the file. - * Luckily, we are still holding all of the right locks. - */ - delete_err = cat_delete (hfsmp, &out_desc, &attr); - if (delete_err == 0) { - /* Update the parent directory */ - if (dcp->c_entries > 0) - dcp->c_entries--; - dcp->c_dirchangecnt++; - dcp->c_ctime = tv.tv_sec; - dcp->c_mtime = tv.tv_sec; - (void) cat_update(hfsmp, &dcp->c_desc, &dcp->c_attr, NULL, NULL); - } - - /* Emit EINVAL if we fail to create EA*/ - error = EINVAL; - } - } -#endif - } - hfs_systemfile_unlock(hfsmp, lockflags); - if (error) - goto exit; - - uint32_t txn = hfsmp->jnl ? journal_current_txn(hfsmp->jnl) : 0; - - /* Invalidate negative cache entries in the directory */ - if (dcp->c_flag & C_NEG_ENTRIES) { - cache_purge_negatives(dvp); - dcp->c_flag &= ~C_NEG_ENTRIES; - } - - hfs_volupdate(hfsmp, vnodetype == VDIR ? VOL_MKDIR : VOL_MKFILE, - (dcp->c_cnid == kHFSRootFolderID)); - - // XXXdbg - // have to end the transaction here before we call hfs_getnewvnode() - // because that can cause us to try and reclaim a vnode on a different - // file system which could cause us to start a transaction which can - // deadlock with someone on that other file system (since we could be - // holding two transaction locks as well as various vnodes and we did - // not obtain the locks on them in the proper order). - // - // NOTE: this means that if the quota check fails or we have to update - // the change time on a block-special device that those changes - // will happen as part of independent transactions. 
- // - if (started_tr) { - hfs_end_transaction(hfsmp); - started_tr = 0; - } - -#if CONFIG_PROTECT - /* - * At this point, we must have encountered success with writing the EA. - * Destroy our temporary cprotect (which had no keys). - */ - - if ((attr.ca_fileid != 0) && (protected_mount) && (protectable_target)) { - cp_entry_destroy (hfsmp, entry); - entry = NULL; - } -#endif - gnv_flags |= GNV_CREATE; - if (nocache) { - gnv_flags |= GNV_NOCACHE; - } - - /* - * Create a vnode for the object just created. - * - * NOTE: Maintaining the cnode lock on the parent directory is important, - * as it prevents race conditions where other threads want to look up entries - * in the directory and/or add things as we are in the process of creating - * the vnode below. However, this has the potential for causing a - * double lock panic when dealing with shadow files on a HFS boot partition. - * The panic could occur if we are not cleaning up after ourselves properly - * when done with a shadow file or in the error cases. The error would occur if we - * try to create a new vnode, and then end up reclaiming another shadow vnode to - * create the new one. However, if everything is working properly, this should - * be a non-issue as we would never enter that reclaim codepath. - * - * The cnode is locked on successful return. 
- */ - error = hfs_getnewvnode(hfsmp, dvp, cnp, &out_desc, gnv_flags, &attr, - NULL, &tvp, &newvnode_flags); - if (error) - goto exit; - - cp = VTOC(tvp); - - cp->c_update_txn = txn; - - struct doc_tombstone *ut; - ut = get_uthread_doc_tombstone(); - if ( ut->t_lastop_document_id != 0 - && ut->t_lastop_parent == dvp - && ut->t_lastop_parent_vid == vnode_vid(dvp) - && strcmp((char *)ut->t_lastop_filename, (const char *)cp->c_desc.cd_nameptr) == 0) { - struct FndrExtendedDirInfo *fip = (struct FndrExtendedDirInfo *)((char *)&cp->c_attr.ca_finderinfo + 16); - - //printf("CREATE: preserving doc-id %lld on %s\n", ut->t_lastop_document_id, ut->t_lastop_filename); - fip->document_id = (uint32_t)(ut->t_lastop_document_id & 0xffffffff); - - cp->c_bsdflags |= UF_TRACKED; - cp->c_flag |= C_MODIFIED; - - if ((error = hfs_start_transaction(hfsmp)) == 0) { - lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK); - - (void) cat_update(hfsmp, &cp->c_desc, &cp->c_attr, NULL, NULL); - - hfs_systemfile_unlock (hfsmp, lockflags); - (void) hfs_end_transaction(hfsmp); - } - - clear_tombstone_docid(ut, hfsmp, cp); // will send the docid-changed fsevent - } else if (ut->t_lastop_document_id != 0) { - int len = cnp->cn_namelen; - if (len == 0) { - len = strlen(cnp->cn_nameptr); - } - - if (is_ignorable_temp_name(cnp->cn_nameptr, cnp->cn_namelen)) { - // printf("CREATE: not clearing tombstone because %s is a temp name.\n", cnp->cn_nameptr); - } else { - // Clear the tombstone because the thread is not recreating the same path - // printf("CREATE: clearing tombstone because %s is NOT a temp name.\n", cnp->cn_nameptr); - clear_tombstone_docid(ut, hfsmp, NULL); - } - } - - if ((hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) && (vnode_isfastdevicecandidate(dvp) && !vnode_isautocandidate(dvp))) { - - //printf("hfs: flagging %s (fileid: %d) as VFASTDEVCANDIDATE (dvp name: %s)\n", - // cnp->cn_nameptr ? cnp->cn_nameptr : "", - // cp->c_fileid, - // dvp->v_name ? 
dvp->v_name : "no-dir-name"); - - // - // On new files we set the FastDevCandidate flag so that - // any new blocks allocated to it will be pinned. - // - cp->c_attr.ca_recflags |= kHFSFastDevCandidateMask; - vnode_setfastdevicecandidate(tvp); - - // - // properly inherit auto-cached flags - // - if (vnode_isautocandidate(dvp)) { - cp->c_attr.ca_recflags |= kHFSAutoCandidateMask; - vnode_setautocandidate(tvp); - } - - - // - // We also want to add it to the hotfile adoption list so - // that it will eventually land in the hotfile btree - // - (void) hfs_addhotfile(tvp); - } - - *vpp = tvp; - -#if CONFIG_PROTECT - /* - * Now that we have a vnode-in-hand, generate keys for this namespace item. - * If we fail to create the keys, then attempt to delete the item from the - * namespace. If we can't delete the item, that's not desirable but also not fatal.. - * All of the places which deal with restoring/unwrapping keys must also be - * prepared to encounter an entry that does not have keys. - */ - if ((protectable_target) && (protected_mount)) { - struct cprotect *keyed_entry = NULL; - - if (cp->c_cpentry == NULL) { - panic ("hfs_makenode: no cpentry for cnode (%p)", cp); - } - - error = cp_generate_keys (hfsmp, cp, CP_CLASS(cp->c_cpentry->cp_pclass), keywrap_flags, &keyed_entry); - if (error == 0) { - /* - * Upon success, the keys were generated and written out. - * Update the cp pointer in the cnode. - */ - cp_replace_entry (hfsmp, cp, keyed_entry); - if (nocache) { - cache_enter (dvp, tvp, cnp); - } - } - else { - /* If key creation OR the setxattr failed, emit EPERM to userland */ - error = EPERM; - - /* - * Beware! This slightly violates the lock ordering for the - * cnode/vnode 'tvp'. Ordinarily, you must acquire the truncate lock - * which guards file size changes before acquiring the normal cnode lock - * and calling hfs_removefile on an item. 
- * - * However, in this case, we are still holding the directory lock so - * 'tvp' is not lookup-able and it was a newly created vnode so it - * cannot have any content yet. The only reason we are initiating - * the removefile is because we could not generate content protection keys - * for this namespace item. Note also that we pass a '1' in the allow_dirs - * argument for hfs_removefile because we may be creating a directory here. - * - * All this to say that while it is technically a violation it is - * impossible to race with another thread for this cnode so it is safe. - */ - int err = hfs_removefile (dvp, tvp, cnp, 0, 0, 1, NULL, 0); - if (err) { - printf("hfs_makenode: removefile failed (%d) for CP entry %p\n", err, tvp); - } - - /* Release the cnode lock and mark the vnode for termination */ - hfs_unlock (cp); - err = vnode_recycle (tvp); - if (err) { - printf("hfs_makenode: vnode_recycle failed (%d) for CP entry %p\n", err, tvp); - } - - /* Drop the iocount on the new vnode to force reclamation/recycling */ - vnode_put (tvp); - cp = NULL; - *vpp = NULL; - } - } -#endif - -#if QUOTA - /* - * Once we create this vnode, we need to initialize its quota data - * structures, if necessary. We know that it is OK to just go ahead and - * initialize because we've already validated earlier (through the hfs_quotacheck - * function) to see if creating this cnode/vnode would cause us to go over quota. - */ - if (hfsmp->hfs_flags & HFS_QUOTAS) { - if (cp) { - /* cp could have been zeroed earlier */ - (void) hfs_getinoquota(cp); - } - } -#endif - -exit: - cat_releasedesc(&out_desc); - -#if CONFIG_PROTECT - /* - * We may have jumped here in error-handling various situations above. - * If we haven't already dumped the temporary CP used to initialize - * the file atomically, then free it now. cp_entry_destroy should null - * out the pointer if it was called already. 
- */ - if (entry) { - cp_entry_destroy (hfsmp, entry); - entry = NULL; - } -#endif - - /* - * Make sure we release cnode lock on dcp. - */ - if (dcp) { - dcp->c_flag &= ~C_DIR_MODIFICATION; - wakeup((caddr_t)&dcp->c_flag); - - hfs_unlock(dcp); - } - if (error == 0 && cp != NULL) { - hfs_unlock(cp); - } - if (started_tr) { - hfs_end_transaction(hfsmp); - started_tr = 0; - } - - return (error); -} - - -/* - * hfs_vgetrsrc acquires a resource fork vnode corresponding to the - * cnode that is found in 'vp'. The cnode should be locked upon entry - * and will be returned locked, but it may be dropped temporarily. - * - * If the resource fork vnode does not exist, HFS will attempt to acquire an - * empty (uninitialized) vnode from VFS so as to avoid deadlocks with - * jetsam. If we let the normal getnewvnode code produce the vnode for us - * we would be doing so while holding the cnode lock of our cnode. - * - * On success, *rvpp wlll hold the resource fork vnode with an - * iocount. *Don't* forget the vnode_put. - */ -int -hfs_vgetrsrc(struct hfsmount *hfsmp, struct vnode *vp, struct vnode **rvpp) -{ - struct vnode *rvp = NULLVP; - struct vnode *empty_rvp = NULLVP; - struct vnode *dvp = NULLVP; - struct cnode *cp = VTOC(vp); - int error; - int vid; - - if (vnode_vtype(vp) == VDIR) { - return EINVAL; - } - -restart: - /* Attempt to use existing vnode */ - if ((rvp = cp->c_rsrc_vp)) { - vid = vnode_vid(rvp); - - // vnode_getwithvid can block so we need to drop the cnode lock - hfs_unlock(cp); - - error = vnode_getwithvid(rvp, vid); - - hfs_lock_always(cp, HFS_EXCLUSIVE_LOCK); - - /* - * When our lock was relinquished, the resource fork - * could have been recycled. Check for this and try - * again. 
- */ - if (error == ENOENT) - goto restart; - - if (error) { - const char * name = (const char *)VTOC(vp)->c_desc.cd_nameptr; - - if (name) - printf("hfs_vgetrsrc: couldn't get resource" - " fork for %s, vol=%s, err=%d\n", name, hfsmp->vcbVN, error); - return (error); - } - } else { - struct cat_fork rsrcfork; - struct componentname cn; - struct cat_desc *descptr = NULL; - struct cat_desc to_desc; - char delname[32]; - int lockflags; - int newvnode_flags = 0; - - /* - * In this case, we don't currently see a resource fork vnode attached - * to this cnode. In most cases, we were called from a read-only VNOP - * like getattr, so it should be safe to drop the cnode lock and then - * re-acquire it. - * - * Here, we drop the lock so that we can acquire an empty/husk - * vnode so that we don't deadlock against jetsam. - * - * It does not currently appear possible to hold the truncate lock via - * FS re-entrancy when we get to this point. (8/2014) - */ - hfs_unlock (cp); - - error = vnode_create_empty (&empty_rvp); - - hfs_lock_always (cp, HFS_EXCLUSIVE_LOCK); - - if (error) { - /* If acquiring the 'empty' vnode failed, then nothing to clean up */ - return error; - } - - /* - * We could have raced with another thread here while we dropped our cnode - * lock. See if the cnode now has a resource fork vnode and restart if appropriate. - * - * Note: We just released the cnode lock, so there is a possibility that the - * cnode that we just acquired has been deleted or even removed from disk - * completely, though this is unlikely. If the file is open-unlinked, the - * check below will resolve it for us. If it has been completely - * removed (even from the catalog!), then when we examine the catalog - * directly, below, while holding the catalog lock, we will not find the - * item and we can fail out properly. 
- */ - if (cp->c_rsrc_vp) { - /* Drop the empty vnode before restarting */ - vnode_put (empty_rvp); - empty_rvp = NULL; - rvp = NULL; - goto restart; - } - - /* - * hfs_vgetsrc may be invoked for a cnode that has already been marked - * C_DELETED. This is because we need to continue to provide rsrc - * fork access to open-unlinked files. In this case, build a fake descriptor - * like in hfs_removefile. If we don't do this, buildkey will fail in - * cat_lookup because this cnode has no name in its descriptor. - */ - if ((cp->c_flag & C_DELETED ) && (cp->c_desc.cd_namelen == 0)) { - bzero (&to_desc, sizeof(to_desc)); - bzero (delname, 32); - MAKE_DELETED_NAME(delname, sizeof(delname), cp->c_fileid); - to_desc.cd_nameptr = (const u_int8_t*) delname; - to_desc.cd_namelen = strlen(delname); - to_desc.cd_parentcnid = hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid; - to_desc.cd_flags = 0; - to_desc.cd_cnid = cp->c_cnid; - - descptr = &to_desc; - } - else { - descptr = &cp->c_desc; - } - - - lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); - - /* - * We call cat_idlookup (instead of cat_lookup) below because we can't - * trust the descriptor in the provided cnode for lookups at this point. - * Between the time of the original lookup of this vnode and now, the - * descriptor could have gotten swapped or replaced. If this occurred, - * the parent/name combo originally desired may not necessarily be provided - * if we use the descriptor. Even worse, if the vnode represents - * a hardlink, we could have removed one of the links from the namespace - * but left the descriptor alone, since hfs_unlink does not invalidate - * the descriptor in the cnode if other links still point to the inode. - * - * Consider the following (slightly contrived) scenario: - * /tmp/a <--> /tmp/b (hardlinks). - * 1. Thread A: open rsrc fork on /tmp/b. - * 1a. Thread A: does lookup, goes out to lunch right before calling getnamedstream. - * 2. 
Thread B does 'mv /foo/b /tmp/b' - * 2. Thread B succeeds. - * 3. Thread A comes back and wants rsrc fork info for /tmp/b. - * - * Even though the hardlink backing /tmp/b is now eliminated, the descriptor - * is not removed/updated during the unlink process. So, if you were to - * do a lookup on /tmp/b, you'd acquire an entirely different record's resource - * fork. - * - * As a result, we use the fileid, which should be invariant for the lifetime - * of the cnode (possibly barring calls to exchangedata). - * - * Addendum: We can't do the above for HFS standard since we aren't guaranteed to - * have thread records for files. They were only required for directories. So - * we need to do the lookup with the catalog name. This is OK since hardlinks were - * never allowed on HFS standard. - */ - - /* Get resource fork data */ - if ((hfsmp->hfs_flags & HFS_STANDARD) == 0) { - error = cat_idlookup (hfsmp, cp->c_fileid, 0, 1, NULL, NULL, &rsrcfork); - } -#if CONFIG_HFS_STD - else { - /* - * HFS standard only: - * - * Get the resource fork for this item with a cat_lookup call, but do not - * force a case lookup since HFS standard is case-insensitive only. We - * don't want the descriptor; just the fork data here. If we tried to - * do a ID lookup (via thread record -> catalog record), then we might fail - * prematurely since, as noted above, thread records were not strictly required - * on files in HFS. - */ - error = cat_lookup (hfsmp, descptr, 1, 0, (struct cat_desc*)NULL, - (struct cat_attr*)NULL, &rsrcfork, NULL); - } -#endif - - hfs_systemfile_unlock(hfsmp, lockflags); - if (error) { - /* Drop our 'empty' vnode ! */ - vnode_put (empty_rvp); - return (error); - } - /* - * Supply hfs_getnewvnode with a component name. 
- */ - cn.cn_pnbuf = NULL; - if (descptr->cd_nameptr) { - MALLOC_ZONE(cn.cn_pnbuf, caddr_t, MAXPATHLEN, M_NAMEI, M_WAITOK); - cn.cn_nameiop = LOOKUP; - cn.cn_flags = ISLASTCN | HASBUF; - cn.cn_context = NULL; - cn.cn_pnlen = MAXPATHLEN; - cn.cn_nameptr = cn.cn_pnbuf; - cn.cn_hash = 0; - cn.cn_consume = 0; - cn.cn_namelen = snprintf(cn.cn_nameptr, MAXPATHLEN, - "%s%s", descptr->cd_nameptr, - _PATH_RSRCFORKSPEC); - // Should never happen because cn.cn_nameptr won't ever be long... - if (cn.cn_namelen >= MAXPATHLEN) { - FREE_ZONE(cn.cn_pnbuf, cn.cn_pnlen, M_NAMEI); - /* Drop our 'empty' vnode ! */ - vnode_put (empty_rvp); - return ENAMETOOLONG; - - } - } - dvp = vnode_getparent(vp); - - /* - * We are about to call hfs_getnewvnode and pass in the vnode that we acquired - * earlier when we were not holding any locks. The semantics of GNV_USE_VP require that - * either hfs_getnewvnode consume the vnode and vend it back to us, properly initialized, - * or it will consume/dispose of it properly if it errors out. - */ - rvp = empty_rvp; - - error = hfs_getnewvnode(hfsmp, dvp, cn.cn_pnbuf ? &cn : NULL, - descptr, (GNV_WANTRSRC | GNV_SKIPLOCK | GNV_USE_VP), - &cp->c_attr, &rsrcfork, &rvp, &newvnode_flags); - - if (dvp) - vnode_put(dvp); - if (cn.cn_pnbuf) - FREE_ZONE(cn.cn_pnbuf, cn.cn_pnlen, M_NAMEI); - if (error) - return (error); - } /* End 'else' for rsrc fork not existing */ - - *rvpp = rvp; - return (0); -} - -/* - * Wrapper for special device reads - */ -int -hfsspec_read(ap) - struct vnop_read_args /* { - struct vnode *a_vp; - struct uio *a_uio; - int a_ioflag; - vfs_context_t a_context; - } */ *ap; -{ - /* - * Set access flag. 
- */ - VTOC(ap->a_vp)->c_touch_acctime = TRUE; - return (VOCALL (spec_vnodeop_p, VOFFSET(vnop_read), ap)); -} - -/* - * Wrapper for special device writes - */ -int -hfsspec_write(ap) - struct vnop_write_args /* { - struct vnode *a_vp; - struct uio *a_uio; - int a_ioflag; - vfs_context_t a_context; - } */ *ap; -{ - /* - * Set update and change flags. - */ - VTOC(ap->a_vp)->c_touch_chgtime = TRUE; - VTOC(ap->a_vp)->c_touch_modtime = TRUE; - return (VOCALL (spec_vnodeop_p, VOFFSET(vnop_write), ap)); -} - -/* - * Wrapper for special device close - * - * Update the times on the cnode then do device close. - */ -int -hfsspec_close(ap) - struct vnop_close_args /* { - struct vnode *a_vp; - int a_fflag; - vfs_context_t a_context; - } */ *ap; -{ - struct vnode *vp = ap->a_vp; - struct cnode *cp; - - if (vnode_isinuse(ap->a_vp, 0)) { - if (hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT) == 0) { - cp = VTOC(vp); - hfs_touchtimes(VTOHFS(vp), cp); - hfs_unlock(cp); - } - } - return (VOCALL (spec_vnodeop_p, VOFFSET(vnop_close), ap)); -} - -#if FIFO -/* - * Wrapper for fifo reads - */ -static int -hfsfifo_read(ap) - struct vnop_read_args /* { - struct vnode *a_vp; - struct uio *a_uio; - int a_ioflag; - vfs_context_t a_context; - } */ *ap; -{ - /* - * Set access flag. - */ - VTOC(ap->a_vp)->c_touch_acctime = TRUE; - return (VOCALL (fifo_vnodeop_p, VOFFSET(vnop_read), ap)); -} - -/* - * Wrapper for fifo writes - */ -static int -hfsfifo_write(ap) - struct vnop_write_args /* { - struct vnode *a_vp; - struct uio *a_uio; - int a_ioflag; - vfs_context_t a_context; - } */ *ap; -{ - /* - * Set update and change flags. - */ - VTOC(ap->a_vp)->c_touch_chgtime = TRUE; - VTOC(ap->a_vp)->c_touch_modtime = TRUE; - return (VOCALL (fifo_vnodeop_p, VOFFSET(vnop_write), ap)); -} - -/* - * Wrapper for fifo close - * - * Update the times on the cnode then do device close. 
- */ -static int -hfsfifo_close(ap) - struct vnop_close_args /* { - struct vnode *a_vp; - int a_fflag; - vfs_context_t a_context; - } */ *ap; -{ - struct vnode *vp = ap->a_vp; - struct cnode *cp; - - if (vnode_isinuse(ap->a_vp, 1)) { - if (hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT) == 0) { - cp = VTOC(vp); - hfs_touchtimes(VTOHFS(vp), cp); - hfs_unlock(cp); - } - } - return (VOCALL (fifo_vnodeop_p, VOFFSET(vnop_close), ap)); -} - - -#endif /* FIFO */ - -/* - * Getter for the document_id - * the document_id is stored in FndrExtendedFileInfo/FndrExtendedDirInfo - */ -static u_int32_t -hfs_get_document_id_internal(const uint8_t *finderinfo, mode_t mode) -{ - const uint8_t *finfo = NULL; - u_int32_t doc_id = 0; - - /* overlay the FinderInfo to the correct pointer, and advance */ - finfo = finderinfo + 16; - - if (S_ISDIR(mode) || S_ISREG(mode)) { - const struct FndrExtendedFileInfo *extinfo = (const struct FndrExtendedFileInfo *)finfo; - doc_id = extinfo->document_id; - } else if (S_ISDIR(mode)) { - const struct FndrExtendedDirInfo *extinfo = (const struct FndrExtendedDirInfo *)finfo; - doc_id = extinfo->document_id; - } - - return doc_id; -} - - -/* getter(s) for document id */ -u_int32_t -hfs_get_document_id(struct cnode *cp) -{ - return (hfs_get_document_id_internal((u_int8_t*)cp->c_finderinfo, - cp->c_attr.ca_mode)); -} - -/* If you have finderinfo and mode, you can use this */ -u_int32_t -hfs_get_document_id_from_blob(const uint8_t *finderinfo, mode_t mode) -{ - return (hfs_get_document_id_internal(finderinfo, mode)); -} - -/* - * Synchronize a file's in-core state with that on disk. 
- */ -int -hfs_vnop_fsync(ap) - struct vnop_fsync_args /* { - struct vnode *a_vp; - int a_waitfor; - vfs_context_t a_context; - } */ *ap; -{ - struct vnode* vp = ap->a_vp; - int error; - - /* Note: We check hfs flags instead of vfs mount flag because during - * read-write update, hfs marks itself read-write much earlier than - * the vfs, and hence won't result in skipping of certain writes like - * zero'ing out of unused nodes, creation of hotfiles btree, etc. - */ - if (VTOHFS(vp)->hfs_flags & HFS_READ_ONLY) { - return 0; - } - - /* - * No need to call cp_handle_vnop to resolve fsync(). Any dirty data - * should have caused the keys to be unwrapped at the time the data was - * put into the UBC, either at mmap/pagein/read-write. If we did manage - * to let this by, then strategy will auto-resolve for us. - * - * We also need to allow ENOENT lock errors since unlink - * system call can call VNOP_FSYNC during vclean. - */ - error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); - if (error) - return (0); - - error = hfs_fsync(vp, ap->a_waitfor, 0, vfs_context_proc(ap->a_context)); - - hfs_unlock(VTOC(vp)); - return (error); -} - -int (**hfs_vnodeop_p)(void *); - -#define VOPFUNC int (*)(void *) - - -#if CONFIG_HFS_STD -int (**hfs_std_vnodeop_p) (void *); -static int hfs_readonly_op (__unused void* ap) { return (EROFS); } - -/* - * In 10.6 and forward, HFS Standard is read-only and deprecated. 
The vnop table below - * is for use with HFS standard to block out operations that would modify the file system - */ - -struct vnodeopv_entry_desc hfs_standard_vnodeop_entries[] = { - { &vnop_default_desc, (VOPFUNC)vn_default_error }, - { &vnop_lookup_desc, (VOPFUNC)hfs_vnop_lookup }, /* lookup */ - { &vnop_create_desc, (VOPFUNC)hfs_readonly_op }, /* create (READONLY) */ - { &vnop_mknod_desc, (VOPFUNC)hfs_readonly_op }, /* mknod (READONLY) */ - { &vnop_open_desc, (VOPFUNC)hfs_vnop_open }, /* open */ - { &vnop_close_desc, (VOPFUNC)hfs_vnop_close }, /* close */ - { &vnop_getattr_desc, (VOPFUNC)hfs_vnop_getattr }, /* getattr */ - { &vnop_setattr_desc, (VOPFUNC)hfs_readonly_op }, /* setattr */ - { &vnop_read_desc, (VOPFUNC)hfs_vnop_read }, /* read */ - { &vnop_write_desc, (VOPFUNC)hfs_readonly_op }, /* write (READONLY) */ - { &vnop_ioctl_desc, (VOPFUNC)hfs_vnop_ioctl }, /* ioctl */ - { &vnop_select_desc, (VOPFUNC)hfs_vnop_select }, /* select */ - { &vnop_revoke_desc, (VOPFUNC)nop_revoke }, /* revoke */ - { &vnop_exchange_desc, (VOPFUNC)hfs_readonly_op }, /* exchange (READONLY)*/ - { &vnop_mmap_desc, (VOPFUNC)err_mmap }, /* mmap */ - { &vnop_fsync_desc, (VOPFUNC)hfs_readonly_op}, /* fsync (READONLY) */ - { &vnop_remove_desc, (VOPFUNC)hfs_readonly_op }, /* remove (READONLY) */ - { &vnop_link_desc, (VOPFUNC)hfs_readonly_op }, /* link ( READONLLY) */ - { &vnop_rename_desc, (VOPFUNC)hfs_readonly_op }, /* rename (READONLY)*/ - { &vnop_mkdir_desc, (VOPFUNC)hfs_readonly_op }, /* mkdir (READONLY) */ - { &vnop_rmdir_desc, (VOPFUNC)hfs_readonly_op }, /* rmdir (READONLY) */ - { &vnop_symlink_desc, (VOPFUNC)hfs_readonly_op }, /* symlink (READONLY) */ - { &vnop_readdir_desc, (VOPFUNC)hfs_vnop_readdir }, /* readdir */ - { &vnop_readdirattr_desc, (VOPFUNC)hfs_vnop_readdirattr }, /* readdirattr */ - { &vnop_readlink_desc, (VOPFUNC)hfs_vnop_readlink }, /* readlink */ - { &vnop_inactive_desc, (VOPFUNC)hfs_vnop_inactive }, /* inactive */ - { &vnop_reclaim_desc, (VOPFUNC)hfs_vnop_reclaim 
}, /* reclaim */ - { &vnop_strategy_desc, (VOPFUNC)hfs_vnop_strategy }, /* strategy */ - { &vnop_pathconf_desc, (VOPFUNC)hfs_vnop_pathconf }, /* pathconf */ - { &vnop_advlock_desc, (VOPFUNC)err_advlock }, /* advlock */ - { &vnop_allocate_desc, (VOPFUNC)hfs_readonly_op }, /* allocate (READONLY) */ -#if CONFIG_SEARCHFS - { &vnop_searchfs_desc, (VOPFUNC)hfs_vnop_search }, /* search fs */ -#else - { &vnop_searchfs_desc, (VOPFUNC)err_searchfs }, /* search fs */ -#endif - { &vnop_bwrite_desc, (VOPFUNC)hfs_readonly_op }, /* bwrite (READONLY) */ - { &vnop_pagein_desc, (VOPFUNC)hfs_vnop_pagein }, /* pagein */ - { &vnop_pageout_desc,(VOPFUNC) hfs_readonly_op }, /* pageout (READONLY) */ - { &vnop_copyfile_desc, (VOPFUNC)hfs_readonly_op }, /* copyfile (READONLY)*/ - { &vnop_blktooff_desc, (VOPFUNC)hfs_vnop_blktooff }, /* blktooff */ - { &vnop_offtoblk_desc, (VOPFUNC)hfs_vnop_offtoblk }, /* offtoblk */ - { &vnop_blockmap_desc, (VOPFUNC)hfs_vnop_blockmap }, /* blockmap */ - { &vnop_getxattr_desc, (VOPFUNC)hfs_vnop_getxattr}, - { &vnop_setxattr_desc, (VOPFUNC)hfs_readonly_op}, /* set xattr (READONLY) */ - { &vnop_removexattr_desc, (VOPFUNC)hfs_readonly_op}, /* remove xattr (READONLY) */ - { &vnop_listxattr_desc, (VOPFUNC)hfs_vnop_listxattr}, -#if NAMEDSTREAMS - { &vnop_getnamedstream_desc, (VOPFUNC)hfs_vnop_getnamedstream }, - { &vnop_makenamedstream_desc, (VOPFUNC)hfs_readonly_op }, - { &vnop_removenamedstream_desc, (VOPFUNC)hfs_readonly_op }, -#endif - { &vnop_getattrlistbulk_desc, (VOPFUNC)hfs_vnop_getattrlistbulk }, /* getattrlistbulk */ - { NULL, (VOPFUNC)NULL } -}; - -struct vnodeopv_desc hfs_std_vnodeop_opv_desc = -{ &hfs_std_vnodeop_p, hfs_standard_vnodeop_entries }; -#endif - -/* VNOP table for HFS+ */ -struct vnodeopv_entry_desc hfs_vnodeop_entries[] = { - { &vnop_default_desc, (VOPFUNC)vn_default_error }, - { &vnop_lookup_desc, (VOPFUNC)hfs_vnop_lookup }, /* lookup */ - { &vnop_create_desc, (VOPFUNC)hfs_vnop_create }, /* create */ - { &vnop_mknod_desc, 
(VOPFUNC)hfs_vnop_mknod }, /* mknod */ - { &vnop_open_desc, (VOPFUNC)hfs_vnop_open }, /* open */ - { &vnop_close_desc, (VOPFUNC)hfs_vnop_close }, /* close */ - { &vnop_getattr_desc, (VOPFUNC)hfs_vnop_getattr }, /* getattr */ - { &vnop_setattr_desc, (VOPFUNC)hfs_vnop_setattr }, /* setattr */ - { &vnop_read_desc, (VOPFUNC)hfs_vnop_read }, /* read */ - { &vnop_write_desc, (VOPFUNC)hfs_vnop_write }, /* write */ - { &vnop_ioctl_desc, (VOPFUNC)hfs_vnop_ioctl }, /* ioctl */ - { &vnop_select_desc, (VOPFUNC)hfs_vnop_select }, /* select */ - { &vnop_revoke_desc, (VOPFUNC)nop_revoke }, /* revoke */ - { &vnop_exchange_desc, (VOPFUNC)hfs_vnop_exchange }, /* exchange */ - { &vnop_mmap_desc, (VOPFUNC)hfs_vnop_mmap }, /* mmap */ - { &vnop_fsync_desc, (VOPFUNC)hfs_vnop_fsync }, /* fsync */ - { &vnop_remove_desc, (VOPFUNC)hfs_vnop_remove }, /* remove */ - { &vnop_link_desc, (VOPFUNC)hfs_vnop_link }, /* link */ - { &vnop_rename_desc, (VOPFUNC)hfs_vnop_rename }, /* rename */ - { &vnop_mkdir_desc, (VOPFUNC)hfs_vnop_mkdir }, /* mkdir */ - { &vnop_rmdir_desc, (VOPFUNC)hfs_vnop_rmdir }, /* rmdir */ - { &vnop_symlink_desc, (VOPFUNC)hfs_vnop_symlink }, /* symlink */ - { &vnop_readdir_desc, (VOPFUNC)hfs_vnop_readdir }, /* readdir */ - { &vnop_readdirattr_desc, (VOPFUNC)hfs_vnop_readdirattr }, /* readdirattr */ - { &vnop_readlink_desc, (VOPFUNC)hfs_vnop_readlink }, /* readlink */ - { &vnop_inactive_desc, (VOPFUNC)hfs_vnop_inactive }, /* inactive */ - { &vnop_reclaim_desc, (VOPFUNC)hfs_vnop_reclaim }, /* reclaim */ - { &vnop_strategy_desc, (VOPFUNC)hfs_vnop_strategy }, /* strategy */ - { &vnop_pathconf_desc, (VOPFUNC)hfs_vnop_pathconf }, /* pathconf */ - { &vnop_advlock_desc, (VOPFUNC)err_advlock }, /* advlock */ - { &vnop_allocate_desc, (VOPFUNC)hfs_vnop_allocate }, /* allocate */ -#if CONFIG_SEARCHFS - { &vnop_searchfs_desc, (VOPFUNC)hfs_vnop_search }, /* search fs */ -#else - { &vnop_searchfs_desc, (VOPFUNC)err_searchfs }, /* search fs */ -#endif - { &vnop_bwrite_desc, 
(VOPFUNC)hfs_vnop_bwrite }, /* bwrite */ - { &vnop_pagein_desc, (VOPFUNC)hfs_vnop_pagein }, /* pagein */ - { &vnop_pageout_desc,(VOPFUNC) hfs_vnop_pageout }, /* pageout */ - { &vnop_copyfile_desc, (VOPFUNC)err_copyfile }, /* copyfile */ - { &vnop_blktooff_desc, (VOPFUNC)hfs_vnop_blktooff }, /* blktooff */ - { &vnop_offtoblk_desc, (VOPFUNC)hfs_vnop_offtoblk }, /* offtoblk */ - { &vnop_blockmap_desc, (VOPFUNC)hfs_vnop_blockmap }, /* blockmap */ - { &vnop_getxattr_desc, (VOPFUNC)hfs_vnop_getxattr}, - { &vnop_setxattr_desc, (VOPFUNC)hfs_vnop_setxattr}, - { &vnop_removexattr_desc, (VOPFUNC)hfs_vnop_removexattr}, - { &vnop_listxattr_desc, (VOPFUNC)hfs_vnop_listxattr}, -#if NAMEDSTREAMS - { &vnop_getnamedstream_desc, (VOPFUNC)hfs_vnop_getnamedstream }, - { &vnop_makenamedstream_desc, (VOPFUNC)hfs_vnop_makenamedstream }, - { &vnop_removenamedstream_desc, (VOPFUNC)hfs_vnop_removenamedstream }, -#endif - { &vnop_getattrlistbulk_desc, (VOPFUNC)hfs_vnop_getattrlistbulk }, /* getattrlistbulk */ - { &vnop_mnomap_desc, (VOPFUNC)hfs_vnop_mnomap }, - { NULL, (VOPFUNC)NULL } -}; - -struct vnodeopv_desc hfs_vnodeop_opv_desc = -{ &hfs_vnodeop_p, hfs_vnodeop_entries }; - - -/* Spec Op vnop table for HFS+ */ -int (**hfs_specop_p)(void *); -struct vnodeopv_entry_desc hfs_specop_entries[] = { - { &vnop_default_desc, (VOPFUNC)vn_default_error }, - { &vnop_lookup_desc, (VOPFUNC)spec_lookup }, /* lookup */ - { &vnop_create_desc, (VOPFUNC)spec_create }, /* create */ - { &vnop_mknod_desc, (VOPFUNC)spec_mknod }, /* mknod */ - { &vnop_open_desc, (VOPFUNC)spec_open }, /* open */ - { &vnop_close_desc, (VOPFUNC)hfsspec_close }, /* close */ - { &vnop_getattr_desc, (VOPFUNC)hfs_vnop_getattr }, /* getattr */ - { &vnop_setattr_desc, (VOPFUNC)hfs_vnop_setattr }, /* setattr */ - { &vnop_read_desc, (VOPFUNC)hfsspec_read }, /* read */ - { &vnop_write_desc, (VOPFUNC)hfsspec_write }, /* write */ - { &vnop_ioctl_desc, (VOPFUNC)spec_ioctl }, /* ioctl */ - { &vnop_select_desc, (VOPFUNC)spec_select }, /* select 
*/ - { &vnop_revoke_desc, (VOPFUNC)spec_revoke }, /* revoke */ - { &vnop_mmap_desc, (VOPFUNC)spec_mmap }, /* mmap */ - { &vnop_fsync_desc, (VOPFUNC)hfs_vnop_fsync }, /* fsync */ - { &vnop_remove_desc, (VOPFUNC)spec_remove }, /* remove */ - { &vnop_link_desc, (VOPFUNC)spec_link }, /* link */ - { &vnop_rename_desc, (VOPFUNC)spec_rename }, /* rename */ - { &vnop_mkdir_desc, (VOPFUNC)spec_mkdir }, /* mkdir */ - { &vnop_rmdir_desc, (VOPFUNC)spec_rmdir }, /* rmdir */ - { &vnop_symlink_desc, (VOPFUNC)spec_symlink }, /* symlink */ - { &vnop_readdir_desc, (VOPFUNC)spec_readdir }, /* readdir */ - { &vnop_readlink_desc, (VOPFUNC)spec_readlink }, /* readlink */ - { &vnop_inactive_desc, (VOPFUNC)hfs_vnop_inactive }, /* inactive */ - { &vnop_reclaim_desc, (VOPFUNC)hfs_vnop_reclaim }, /* reclaim */ - { &vnop_strategy_desc, (VOPFUNC)spec_strategy }, /* strategy */ - { &vnop_pathconf_desc, (VOPFUNC)spec_pathconf }, /* pathconf */ - { &vnop_advlock_desc, (VOPFUNC)err_advlock }, /* advlock */ - { &vnop_bwrite_desc, (VOPFUNC)hfs_vnop_bwrite }, - { &vnop_pagein_desc, (VOPFUNC)hfs_vnop_pagein }, /* Pagein */ - { &vnop_pageout_desc, (VOPFUNC)hfs_vnop_pageout }, /* Pageout */ - { &vnop_copyfile_desc, (VOPFUNC)err_copyfile }, /* copyfile */ - { &vnop_blktooff_desc, (VOPFUNC)hfs_vnop_blktooff }, /* blktooff */ - { &vnop_offtoblk_desc, (VOPFUNC)hfs_vnop_offtoblk }, /* offtoblk */ - { &vnop_getxattr_desc, (VOPFUNC)hfs_vnop_getxattr}, - { &vnop_setxattr_desc, (VOPFUNC)hfs_vnop_setxattr}, - { &vnop_removexattr_desc, (VOPFUNC)hfs_vnop_removexattr}, - { &vnop_listxattr_desc, (VOPFUNC)hfs_vnop_listxattr}, - { (struct vnodeop_desc*)NULL, (VOPFUNC)NULL } -}; -struct vnodeopv_desc hfs_specop_opv_desc = - { &hfs_specop_p, hfs_specop_entries }; - -#if FIFO -/* HFS+ FIFO VNOP table */ -int (**hfs_fifoop_p)(void *); -struct vnodeopv_entry_desc hfs_fifoop_entries[] = { - { &vnop_default_desc, (VOPFUNC)vn_default_error }, - { &vnop_lookup_desc, (VOPFUNC)fifo_lookup }, /* lookup */ - { &vnop_create_desc, 
(VOPFUNC)fifo_create }, /* create */ - { &vnop_mknod_desc, (VOPFUNC)fifo_mknod }, /* mknod */ - { &vnop_open_desc, (VOPFUNC)fifo_open }, /* open */ - { &vnop_close_desc, (VOPFUNC)hfsfifo_close }, /* close */ - { &vnop_getattr_desc, (VOPFUNC)hfs_vnop_getattr }, /* getattr */ - { &vnop_setattr_desc, (VOPFUNC)hfs_vnop_setattr }, /* setattr */ - { &vnop_read_desc, (VOPFUNC)hfsfifo_read }, /* read */ - { &vnop_write_desc, (VOPFUNC)hfsfifo_write }, /* write */ - { &vnop_ioctl_desc, (VOPFUNC)fifo_ioctl }, /* ioctl */ - { &vnop_select_desc, (VOPFUNC)fifo_select }, /* select */ - { &vnop_revoke_desc, (VOPFUNC)fifo_revoke }, /* revoke */ - { &vnop_mmap_desc, (VOPFUNC)fifo_mmap }, /* mmap */ - { &vnop_fsync_desc, (VOPFUNC)hfs_vnop_fsync }, /* fsync */ - { &vnop_remove_desc, (VOPFUNC)fifo_remove }, /* remove */ - { &vnop_link_desc, (VOPFUNC)fifo_link }, /* link */ - { &vnop_rename_desc, (VOPFUNC)fifo_rename }, /* rename */ - { &vnop_mkdir_desc, (VOPFUNC)fifo_mkdir }, /* mkdir */ - { &vnop_rmdir_desc, (VOPFUNC)fifo_rmdir }, /* rmdir */ - { &vnop_symlink_desc, (VOPFUNC)fifo_symlink }, /* symlink */ - { &vnop_readdir_desc, (VOPFUNC)fifo_readdir }, /* readdir */ - { &vnop_readlink_desc, (VOPFUNC)fifo_readlink }, /* readlink */ - { &vnop_inactive_desc, (VOPFUNC)hfs_vnop_inactive }, /* inactive */ - { &vnop_reclaim_desc, (VOPFUNC)hfs_vnop_reclaim }, /* reclaim */ - { &vnop_strategy_desc, (VOPFUNC)fifo_strategy }, /* strategy */ - { &vnop_pathconf_desc, (VOPFUNC)fifo_pathconf }, /* pathconf */ - { &vnop_advlock_desc, (VOPFUNC)err_advlock }, /* advlock */ - { &vnop_bwrite_desc, (VOPFUNC)hfs_vnop_bwrite }, - { &vnop_pagein_desc, (VOPFUNC)hfs_vnop_pagein }, /* Pagein */ - { &vnop_pageout_desc, (VOPFUNC)hfs_vnop_pageout }, /* Pageout */ - { &vnop_copyfile_desc, (VOPFUNC)err_copyfile }, /* copyfile */ - { &vnop_blktooff_desc, (VOPFUNC)hfs_vnop_blktooff }, /* blktooff */ - { &vnop_offtoblk_desc, (VOPFUNC)hfs_vnop_offtoblk }, /* offtoblk */ - { &vnop_blockmap_desc, 
(VOPFUNC)hfs_vnop_blockmap }, /* blockmap */ - { &vnop_getxattr_desc, (VOPFUNC)hfs_vnop_getxattr}, - { &vnop_setxattr_desc, (VOPFUNC)hfs_vnop_setxattr}, - { &vnop_removexattr_desc, (VOPFUNC)hfs_vnop_removexattr}, - { &vnop_listxattr_desc, (VOPFUNC)hfs_vnop_listxattr}, - { (struct vnodeop_desc*)NULL, (VOPFUNC)NULL } -}; -struct vnodeopv_desc hfs_fifoop_opv_desc = - { &hfs_fifoop_p, hfs_fifoop_entries }; -#endif /* FIFO */ - - - diff --git a/bsd/hfs/hfs_xattr.c b/bsd/hfs/hfs_xattr.c deleted file mode 100644 index c63dce8ea..000000000 --- a/bsd/hfs/hfs_xattr.c +++ /dev/null @@ -1,2636 +0,0 @@ -/* - * Copyright (c) 2004-2014 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "hfs.h" -#include "hfs_cnode.h" -#include "hfs_mount.h" -#include "hfs_format.h" -#include "hfs_endian.h" -#include "hfs_btreeio.h" -#include "hfs_fsctl.h" -#include "hfs_cprotect.h" - -#include "hfscommon/headers/BTreesInternal.h" - -#define HFS_XATTR_VERBOSE 0 - -#define ATTRIBUTE_FILE_NODE_SIZE 8192 - - -/* State information for the listattr_callback callback function. */ -struct listattr_callback_state { - u_int32_t fileID; - int result; - uio_t uio; - size_t size; -#if HFS_COMPRESSION - int showcompressed; - vfs_context_t ctx; - vnode_t vp; -#endif /* HFS_COMPRESSION */ -}; - - -/* HFS Internal Names */ -#define XATTR_EXTENDEDSECURITY_NAME "system.extendedsecurity" -#define XATTR_XATTREXTENTS_NAME "system.xattrextents" - -static u_int32_t emptyfinfo[8] = {0}; - -static int hfs_zero_hidden_fields (struct cnode *cp, u_int8_t *finderinfo); - -const char hfs_attrdatafilename[] = "Attribute Data"; - -static int listattr_callback(const HFSPlusAttrKey *key, const HFSPlusAttrData *data, - struct listattr_callback_state *state); - -static int remove_attribute_records(struct hfsmount *hfsmp, BTreeIterator * iterator); - -static int getnodecount(struct hfsmount *hfsmp, size_t nodesize); - -static size_t getmaxinlineattrsize(struct vnode * attrvp); - -static int read_attr_data(struct hfsmount *hfsmp, uio_t uio, size_t datasize, HFSPlusExtentDescriptor *extents); - -static int write_attr_data(struct hfsmount *hfsmp, uio_t uio, size_t datasize, HFSPlusExtentDescriptor *extents); - -static int alloc_attr_blks(struct hfsmount *hfsmp, size_t attrsize, size_t extentbufsize, HFSPlusExtentDescriptor *extents, int *blocks); - -static void free_attr_blks(struct hfsmount *hfsmp, int blkcnt, HFSPlusExtentDescriptor *extents); - -static int has_overflow_extents(HFSPlusForkData *forkdata); - 
-static int count_extent_blocks(int maxblks, HFSPlusExtentRecord extents); - -#if NAMEDSTREAMS -/* - * Obtain the vnode for a stream. - */ -int -hfs_vnop_getnamedstream(struct vnop_getnamedstream_args* ap) -{ - vnode_t vp = ap->a_vp; - vnode_t *svpp = ap->a_svpp; - struct cnode *cp; - int error = 0; - - *svpp = NULL; - - /* - * We only support the "com.apple.ResourceFork" stream. - */ - if (bcmp(ap->a_name, XATTR_RESOURCEFORK_NAME, sizeof(XATTR_RESOURCEFORK_NAME)) != 0) { - return (ENOATTR); - } - cp = VTOC(vp); - if ( !S_ISREG(cp->c_mode) ) { - return (EPERM); - } -#if HFS_COMPRESSION - int hide_rsrc = hfs_hides_rsrc(ap->a_context, VTOC(vp), 1); -#endif /* HFS_COMPRESSION */ - if ((error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) { - return (error); - } - if ((!hfs_has_rsrc(cp) -#if HFS_COMPRESSION - || hide_rsrc -#endif /* HFS_COMPRESSION */ - ) && (ap->a_operation != NS_OPEN)) { - hfs_unlock(cp); - return (ENOATTR); - } - error = hfs_vgetrsrc(VTOHFS(vp), vp, svpp); - hfs_unlock(cp); - - return (error); -} - -/* - * Create a stream. - */ -int -hfs_vnop_makenamedstream(struct vnop_makenamedstream_args* ap) -{ - vnode_t vp = ap->a_vp; - vnode_t *svpp = ap->a_svpp; - struct cnode *cp; - int error = 0; - - *svpp = NULL; - - /* - * We only support the "com.apple.ResourceFork" stream. - */ - if (bcmp(ap->a_name, XATTR_RESOURCEFORK_NAME, sizeof(XATTR_RESOURCEFORK_NAME)) != 0) { - return (ENOATTR); - } - cp = VTOC(vp); - if ( !S_ISREG(cp->c_mode) ) { - return (EPERM); - } -#if HFS_COMPRESSION - if (hfs_hides_rsrc(ap->a_context, VTOC(vp), 1)) { - if (VNODE_IS_RSRC(vp)) { - return EINVAL; - } else { - error = decmpfs_decompress_file(vp, VTOCMP(vp), -1, 1, 0); - if (error != 0) - return error; - } - } -#endif /* HFS_COMPRESSION */ - if ((error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) { - return (error); - } - error = hfs_vgetrsrc(VTOHFS(vp), vp, svpp); - hfs_unlock(cp); - - return (error); -} - -/* - * Remove a stream. 
- */ -int -hfs_vnop_removenamedstream(struct vnop_removenamedstream_args* ap) -{ - vnode_t svp = ap->a_svp; - cnode_t *scp = VTOC(svp); - int error = 0; - - /* - * We only support the "com.apple.ResourceFork" stream. - */ - if (bcmp(ap->a_name, XATTR_RESOURCEFORK_NAME, sizeof(XATTR_RESOURCEFORK_NAME)) != 0) { - return (ENOATTR); - } -#if HFS_COMPRESSION - if (hfs_hides_rsrc(ap->a_context, scp, 1)) { - /* do nothing */ - return 0; - } -#endif /* HFS_COMPRESSION */ - - hfs_lock_truncate(scp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); - if (VTOF(svp)->ff_size) { - // hfs_truncate will deal with the cnode lock - error = hfs_truncate(svp, 0, IO_NDELAY, 0, ap->a_context); - } - hfs_unlock_truncate(scp, HFS_LOCK_DEFAULT); - - return error; -} -#endif - - -/* Zero out the date added field for the specified cnode */ -static int hfs_zero_hidden_fields (struct cnode *cp, u_int8_t *finderinfo) -{ - u_int8_t *finfo = finderinfo; - - /* Advance finfo by 16 bytes to the 2nd half of the finderinfo */ - finfo = finfo + 16; - - if (S_ISREG(cp->c_attr.ca_mode) || S_ISLNK(cp->c_attr.ca_mode)) { - struct FndrExtendedFileInfo *extinfo = (struct FndrExtendedFileInfo *)finfo; - extinfo->document_id = 0; - extinfo->date_added = 0; - extinfo->write_gen_counter = 0; - } else if (S_ISDIR(cp->c_attr.ca_mode)) { - struct FndrExtendedDirInfo *extinfo = (struct FndrExtendedDirInfo *)finfo; - extinfo->document_id = 0; - extinfo->date_added = 0; - extinfo->write_gen_counter = 0; - } else { - /* Return an error */ - return -1; - } - return 0; - -} - -/* - * Retrieve the data of an extended attribute. 
- */ -int -hfs_vnop_getxattr(struct vnop_getxattr_args *ap) -/* - struct vnop_getxattr_args { - struct vnodeop_desc *a_desc; - vnode_t a_vp; - char * a_name; - uio_t a_uio; - size_t *a_size; - int a_options; - vfs_context_t a_context; - }; -*/ -{ - struct vnode *vp = ap->a_vp; - struct cnode *cp; - struct hfsmount *hfsmp; - uio_t uio = ap->a_uio; - size_t bufsize; - int result; - - cp = VTOC(vp); - if (vp == cp->c_vp) { -#if HFS_COMPRESSION - int decmpfs_hide = hfs_hides_xattr(ap->a_context, VTOC(vp), ap->a_name, 1); /* 1 == don't take the cnode lock */ - if (decmpfs_hide && !(ap->a_options & XATTR_SHOWCOMPRESSION)) - return ENOATTR; -#endif /* HFS_COMPRESSION */ - - /* Get the Finder Info. */ - if (bcmp(ap->a_name, XATTR_FINDERINFO_NAME, sizeof(XATTR_FINDERINFO_NAME)) == 0) { - u_int8_t finderinfo[32]; - bufsize = 32; - - if ((result = hfs_lock(cp, HFS_SHARED_LOCK, HFS_LOCK_DEFAULT))) { - return (result); - } - /* Make a copy since we may not export all of it. */ - bcopy(cp->c_finderinfo, finderinfo, sizeof(finderinfo)); - hfs_unlock(cp); - - /* Zero out the date added field in the local copy */ - hfs_zero_hidden_fields (cp, finderinfo); - - /* Don't expose a symlink's private type/creator. */ - if (vnode_islnk(vp)) { - struct FndrFileInfo *fip; - - fip = (struct FndrFileInfo *)&finderinfo; - fip->fdType = 0; - fip->fdCreator = 0; - } - /* If Finder Info is empty then it doesn't exist. */ - if (bcmp(finderinfo, emptyfinfo, sizeof(emptyfinfo)) == 0) { - return (ENOATTR); - } - if (uio == NULL) { - *ap->a_size = bufsize; - return (0); - } - if ((user_size_t)uio_resid(uio) < bufsize) - return (ERANGE); - - result = uiomove((caddr_t)&finderinfo , bufsize, uio); - - return (result); - } - /* Read the Resource Fork. 
*/ - if (bcmp(ap->a_name, XATTR_RESOURCEFORK_NAME, sizeof(XATTR_RESOURCEFORK_NAME)) == 0) { - struct vnode *rvp = NULL; - int openunlinked = 0; - int namelen = 0; - - if ( !S_ISREG(cp->c_mode) ) { - return (EPERM); - } - if ((result = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) { - return (result); - } - namelen = cp->c_desc.cd_namelen; - - if (!hfs_has_rsrc(cp)) { - hfs_unlock(cp); - return (ENOATTR); - } - hfsmp = VTOHFS(vp); - if ((cp->c_flag & C_DELETED) && (namelen == 0)) { - openunlinked = 1; - } - - result = hfs_vgetrsrc(hfsmp, vp, &rvp); - hfs_unlock(cp); - if (result) { - return (result); - } - if (uio == NULL) { - *ap->a_size = (size_t)VTOF(rvp)->ff_size; - } else { -#if HFS_COMPRESSION - user_ssize_t uio_size = 0; - if (decmpfs_hide) - uio_size = uio_resid(uio); -#endif /* HFS_COMPRESSION */ - result = VNOP_READ(rvp, uio, 0, ap->a_context); -#if HFS_COMPRESSION - if (decmpfs_hide && - (result == 0) && - (uio_resid(uio) == uio_size)) { - /* - * We intentionally make the above call to VNOP_READ so that - * it can return an authorization/permission/etc. Error - * based on ap->a_context and thus deny this operation; - * in that case, result != 0 and we won't proceed. - * - * However, if result == 0, it will have returned no data - * because hfs_vnop_read hid the resource fork - * (hence uio_resid(uio) == uio_size, i.e. the uio is untouched) - * - * In that case, we try again with the decmpfs_ctx context - * to get the actual data - */ - result = VNOP_READ(rvp, uio, 0, decmpfs_ctx); - } -#endif /* HFS_COMPRESSION */ - } - /* force the rsrc fork vnode to recycle right away */ - if (openunlinked) { - int vref; - vref = vnode_ref (rvp); - if (vref == 0) { - vnode_rele (rvp); - } - vnode_recycle(rvp); - } - vnode_put(rvp); - return (result); - } - } - hfsmp = VTOHFS(vp); - /* - * Standard HFS only supports native FinderInfo and Resource Forks. 
- */ - if (hfsmp->hfs_flags & HFS_STANDARD) { - return (EPERM); - } - - if ((result = hfs_lock(cp, HFS_SHARED_LOCK, HFS_LOCK_DEFAULT))) { - return (result); - } - - /* Check for non-rsrc, non-finderinfo EAs */ - result = hfs_getxattr_internal (cp, ap, VTOHFS(cp->c_vp), 0); - - hfs_unlock(cp); - - return MacToVFSError(result); -} - -// Has same limitations as hfs_getxattr_internal below -int hfs_xattr_read(vnode_t vp, const char *name, void *data, size_t *size) -{ - char uio_buf[UIO_SIZEOF(1)]; - uio_t uio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_READ, uio_buf, - sizeof(uio_buf)); - - uio_addiov(uio, CAST_USER_ADDR_T(data), *size); - - struct vnop_getxattr_args args = { - .a_uio = uio, - .a_name = name, - .a_size = size - }; - - return hfs_getxattr_internal(VTOC(vp), &args, VTOHFS(vp), 0); -} - -/* - * getxattr_internal - * - * We break out this internal function which searches the attributes B-Tree and the - * overflow extents file to find non-resource, non-finderinfo EAs. There may be cases - * where we need to get EAs in contexts where we are already holding the cnode lock, - * and to re-enter hfs_vnop_getxattr would cause us to double-lock the cnode. Instead, - * we can just directly call this function. - * - * We pass the hfsmp argument directly here because we may not necessarily have a cnode to - * operate on. Under normal conditions, we have a file or directory to query, but if we - * are operating on the root directory (id 1), then we may not have a cnode. In this case, if hte - * 'cp' argument is NULL, then we need to use the 'fileid' argument as the entry to manipulate - * - * NOTE: This function assumes the cnode lock for 'cp' is held exclusive or shared. 
- */ -int hfs_getxattr_internal (struct cnode *cp, struct vnop_getxattr_args *ap, - struct hfsmount *hfsmp, u_int32_t fileid) -{ - - struct filefork *btfile; - struct BTreeIterator * iterator = NULL; - size_t bufsize = 0; - HFSPlusAttrRecord *recp = NULL; - FSBufferDescriptor btdata; - int lockflags = 0; - int result = 0; - u_int16_t datasize = 0; - uio_t uio = ap->a_uio; - u_int32_t target_id = 0; - - if (cp) { - target_id = cp->c_fileid; - } else { - target_id = fileid; - } - - - /* Bail if we don't have an EA B-Tree. */ - if ((hfsmp->hfs_attribute_vp == NULL) || - ((cp) && (cp->c_attr.ca_recflags & kHFSHasAttributesMask) == 0)) { - result = ENOATTR; - goto exit; - } - - /* Initialize the B-Tree iterator for searching for the proper EA */ - btfile = VTOF(hfsmp->hfs_attribute_vp); - - MALLOC(iterator, BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK); - if (iterator == NULL) { - result = ENOMEM; - goto exit; - } - bzero(iterator, sizeof(*iterator)); - - /* Allocate memory for reading in the attribute record. This buffer is - * big enough to read in all types of attribute records. It is not big - * enough to read inline attribute data which is read in later. 
- */ - MALLOC(recp, HFSPlusAttrRecord *, sizeof(HFSPlusAttrRecord), M_TEMP, M_WAITOK); - if (recp == NULL) { - result = ENOMEM; - goto exit; - } - btdata.bufferAddress = recp; - btdata.itemSize = sizeof(HFSPlusAttrRecord); - btdata.itemCount = 1; - - result = hfs_buildattrkey(target_id, ap->a_name, (HFSPlusAttrKey *)&iterator->key); - if (result) { - goto exit; - } - - /* Lookup the attribute in the Attribute B-Tree */ - lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE, HFS_SHARED_LOCK); - result = BTSearchRecord(btfile, iterator, &btdata, &datasize, NULL); - hfs_systemfile_unlock(hfsmp, lockflags); - - if (result) { - if (result == btNotFound) { - result = ENOATTR; - } - goto exit; - } - - /* - * Operate differently if we have inline EAs that can fit in the attribute B-Tree or if - * we have extent based EAs. - */ - switch (recp->recordType) { - - /* Attribute fits in the Attribute B-Tree */ - case kHFSPlusAttrInlineData: { - /* - * Sanity check record size. It's not required to have any - * user data, so the minimum size is 2 bytes less that the - * size of HFSPlusAttrData (since HFSPlusAttrData struct - * has 2 bytes set aside for attribute data). - */ - if (datasize < (sizeof(HFSPlusAttrData) - 2)) { - printf("hfs_getxattr: vol=%s %d,%s invalid record size %d (expecting %lu)\n", - hfsmp->vcbVN, target_id, ap->a_name, datasize, sizeof(HFSPlusAttrData)); - result = ENOATTR; - break; - } - *ap->a_size = recp->attrData.attrSize; - if (uio && recp->attrData.attrSize != 0) { - if (*ap->a_size > (user_size_t)uio_resid(uio)) { - /* User provided buffer is not large enough for the xattr data */ - result = ERANGE; - } else { - /* Previous BTreeSearchRecord() read in only the attribute record, - * and not the attribute data. Now allocate enough memory for - * both attribute record and data, and read the attribute record again. 
- */ - bufsize = sizeof(HFSPlusAttrData) - 2 + recp->attrData.attrSize; - FREE(recp, M_TEMP); - MALLOC(recp, HFSPlusAttrRecord *, bufsize, M_TEMP, M_WAITOK); - if (recp == NULL) { - result = ENOMEM; - goto exit; - } - - btdata.bufferAddress = recp; - btdata.itemSize = bufsize; - btdata.itemCount = 1; - - bzero(iterator, sizeof(*iterator)); - result = hfs_buildattrkey(target_id, ap->a_name, (HFSPlusAttrKey *)&iterator->key); - if (result) { - goto exit; - } - - /* Lookup the attribute record and inline data */ - lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE, HFS_SHARED_LOCK); - result = BTSearchRecord(btfile, iterator, &btdata, &datasize, NULL); - hfs_systemfile_unlock(hfsmp, lockflags); - if (result) { - if (result == btNotFound) { - result = ENOATTR; - } - goto exit; - } - - /* Copy-out the attribute data to the user buffer */ - *ap->a_size = recp->attrData.attrSize; - result = uiomove((caddr_t) &recp->attrData.attrData , recp->attrData.attrSize, uio); - } - } - break; - } - - /* Extent-Based EAs */ - case kHFSPlusAttrForkData: { - if (datasize < sizeof(HFSPlusAttrForkData)) { - printf("hfs_getxattr: vol=%s %d,%s invalid record size %d (expecting %lu)\n", - hfsmp->vcbVN, target_id, ap->a_name, datasize, sizeof(HFSPlusAttrForkData)); - result = ENOATTR; - break; - } - *ap->a_size = recp->forkData.theFork.logicalSize; - if (uio == NULL) { - break; - } - if (*ap->a_size > (user_size_t)uio_resid(uio)) { - result = ERANGE; - break; - } - /* Process overflow extents if necessary. */ - if (has_overflow_extents(&recp->forkData.theFork)) { - HFSPlusExtentDescriptor *extentbuf; - HFSPlusExtentDescriptor *extentptr; - size_t extentbufsize; - u_int32_t totalblocks; - u_int32_t blkcnt; - u_int32_t attrlen; - - totalblocks = recp->forkData.theFork.totalBlocks; - /* Ignore bogus block counts. 
*/ - if (totalblocks > howmany(HFS_XATTR_MAXSIZE, hfsmp->blockSize)) { - result = ERANGE; - break; - } - attrlen = recp->forkData.theFork.logicalSize; - - /* Get a buffer to hold the worst case amount of extents. */ - extentbufsize = totalblocks * sizeof(HFSPlusExtentDescriptor); - extentbufsize = roundup(extentbufsize, sizeof(HFSPlusExtentRecord)); - MALLOC(extentbuf, HFSPlusExtentDescriptor *, extentbufsize, M_TEMP, M_WAITOK); - if (extentbuf == NULL) { - result = ENOMEM; - break; - } - bzero(extentbuf, extentbufsize); - extentptr = extentbuf; - - /* Grab the first 8 extents. */ - bcopy(&recp->forkData.theFork.extents[0], extentptr, sizeof(HFSPlusExtentRecord)); - extentptr += kHFSPlusExtentDensity; - blkcnt = count_extent_blocks(totalblocks, recp->forkData.theFork.extents); - - /* Now lookup the overflow extents. */ - lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE, HFS_SHARED_LOCK); - while (blkcnt < totalblocks) { - ((HFSPlusAttrKey *)&iterator->key)->startBlock = blkcnt; - result = BTSearchRecord(btfile, iterator, &btdata, &datasize, NULL); - if (result || - (recp->recordType != kHFSPlusAttrExtents) || - (datasize < sizeof(HFSPlusAttrExtents))) { - printf("hfs_getxattr: %s missing extents, only %d blks of %d found\n", - ap->a_name, blkcnt, totalblocks); - result = ENOATTR; - break; /* break from while */ - } - /* Grab the next 8 extents. */ - bcopy(&recp->overflowExtents.extents[0], extentptr, sizeof(HFSPlusExtentRecord)); - extentptr += kHFSPlusExtentDensity; - blkcnt += count_extent_blocks(totalblocks, recp->overflowExtents.extents); - } - - /* Release Attr B-Tree lock */ - hfs_systemfile_unlock(hfsmp, lockflags); - - if (blkcnt < totalblocks) { - result = ENOATTR; - } else { - result = read_attr_data(hfsmp, uio, attrlen, extentbuf); - } - FREE(extentbuf, M_TEMP); - - } else { /* No overflow extents. 
*/ - result = read_attr_data(hfsmp, uio, recp->forkData.theFork.logicalSize, recp->forkData.theFork.extents); - } - break; - } - - default: - /* We only support Extent or inline EAs. Default to ENOATTR for anything else */ - result = ENOATTR; - break; - } - -exit: - if (iterator) { - FREE(iterator, M_TEMP); - } - if (recp) { - FREE(recp, M_TEMP); - } - - return result; - -} - - -/* - * Set the data of an extended attribute. - */ -int -hfs_vnop_setxattr(struct vnop_setxattr_args *ap) -/* - struct vnop_setxattr_args { - struct vnodeop_desc *a_desc; - vnode_t a_vp; - char * a_name; - uio_t a_uio; - int a_options; - vfs_context_t a_context; - }; -*/ -{ - struct vnode *vp = ap->a_vp; - struct cnode *cp = NULL; - struct hfsmount *hfsmp; - uio_t uio = ap->a_uio; - size_t attrsize; - void * user_data_ptr = NULL; - int result; - time_t orig_ctime=VTOC(vp)->c_ctime; - - if (ap->a_name == NULL || ap->a_name[0] == '\0') { - return (EINVAL); /* invalid name */ - } - hfsmp = VTOHFS(vp); - if (VNODE_IS_RSRC(vp)) { - return (EPERM); - } - -#if HFS_COMPRESSION - if (hfs_hides_xattr(ap->a_context, VTOC(vp), ap->a_name, 1) ) { /* 1 == don't take the cnode lock */ - result = decmpfs_decompress_file(vp, VTOCMP(vp), -1, 1, 0); - if (result != 0) - return result; - } -#endif /* HFS_COMPRESSION */ - - check_for_tracked_file(vp, orig_ctime, NAMESPACE_HANDLER_METADATA_WRITE_OP, NSPACE_REARM_NO_ARG); - - /* Set the Finder Info. */ - if (bcmp(ap->a_name, XATTR_FINDERINFO_NAME, sizeof(XATTR_FINDERINFO_NAME)) == 0) { - u_int8_t finderinfo[32]; - struct FndrFileInfo *fip; - void * finderinfo_start; - u_int8_t *finfo = NULL; - u_int16_t fdFlags; - u_int32_t dateadded = 0; - u_int32_t write_gen_counter = 0; - u_int32_t document_id = 0; - - attrsize = sizeof(VTOC(vp)->c_finderinfo); - - if ((user_size_t)uio_resid(uio) != attrsize) { - return (ERANGE); - } - /* Grab the new Finder Info data. 
*/ - if ((result = uiomove((caddr_t)&finderinfo , attrsize, uio))) { - return (result); - } - fip = (struct FndrFileInfo *)&finderinfo; - - if ((result = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) { - return (result); - } - cp = VTOC(vp); - - /* Symlink's don't have an external type/creator. */ - if (vnode_islnk(vp)) { - /* Skip over type/creator fields. */ - finderinfo_start = &cp->c_finderinfo[8]; - attrsize -= 8; - } else { - finderinfo_start = &cp->c_finderinfo[0]; - /* - * Don't allow the external setting of - * file type to kHardLinkFileType. - */ - if (fip->fdType == SWAP_BE32(kHardLinkFileType)) { - hfs_unlock(cp); - return (EPERM); - } - } - - /* Grab the current date added from the cnode */ - dateadded = hfs_get_dateadded (cp); - if (S_ISREG(cp->c_attr.ca_mode) || S_ISLNK(cp->c_attr.ca_mode)) { - struct FndrExtendedFileInfo *extinfo = (struct FndrExtendedFileInfo *)((u_int8_t*)cp->c_finderinfo + 16); - /* - * Grab generation counter directly from the cnode - * instead of calling hfs_get_gencount(), because - * for zero generation count values hfs_get_gencount() - * lies and bumps it up to one. - */ - write_gen_counter = extinfo->write_gen_counter; - document_id = extinfo->document_id; - } else if (S_ISDIR(cp->c_attr.ca_mode)) { - struct FndrExtendedDirInfo *extinfo = (struct FndrExtendedDirInfo *)((u_int8_t*)cp->c_finderinfo + 16); - write_gen_counter = extinfo->write_gen_counter; - document_id = extinfo->document_id; - } - - /* - * Zero out the finder info's reserved fields like date added, - * generation counter, and document id to ignore user's attempts - * to set it - */ - hfs_zero_hidden_fields(cp, finderinfo); - - if (bcmp(finderinfo_start, emptyfinfo, attrsize)) { - /* attr exists and "create" was specified. */ - if (ap->a_options & XATTR_CREATE) { - hfs_unlock(cp); - return (EEXIST); - } - } else { /* empty */ - /* attr doesn't exists and "replace" was specified. 
*/ - if (ap->a_options & XATTR_REPLACE) { - hfs_unlock(cp); - return (ENOATTR); - } - } - - /* - * Now restore the date added and other reserved fields to the finderinfo to - * be written out. Advance to the 2nd half of the finderinfo to write them - * out into the buffer. - * - * Make sure to endian swap the date added back into big endian. When we used - * hfs_get_dateadded above to retrieve it, it swapped into local endianness - * for us. But now that we're writing it out, put it back into big endian. - */ - finfo = &finderinfo[16]; - if (S_ISREG(cp->c_attr.ca_mode) || S_ISLNK(cp->c_attr.ca_mode)) { - struct FndrExtendedFileInfo *extinfo = (struct FndrExtendedFileInfo *)finfo; - extinfo->date_added = OSSwapHostToBigInt32(dateadded); - extinfo->write_gen_counter = write_gen_counter; - extinfo->document_id = document_id; - } else if (S_ISDIR(cp->c_attr.ca_mode)) { - struct FndrExtendedDirInfo *extinfo = (struct FndrExtendedDirInfo *)finfo; - extinfo->date_added = OSSwapHostToBigInt32(dateadded); - extinfo->write_gen_counter = write_gen_counter; - extinfo->document_id = document_id; - } - - /* Set the cnode's Finder Info. */ - if (attrsize == sizeof(cp->c_finderinfo)) { - bcopy(&finderinfo[0], finderinfo_start, attrsize); - } else { - bcopy(&finderinfo[8], finderinfo_start, attrsize); - } - - /* Updating finderInfo updates change time and modified time */ - cp->c_touch_chgtime = TRUE; - cp->c_flag |= C_MODIFIED; - - /* - * Mirror the invisible bit to the UF_HIDDEN flag. - * - * The fdFlags for files and frFlags for folders are both 8 bytes - * into the userInfo (the first 16 bytes of the Finder Info). They - * are both 16-bit fields. - */ - fdFlags = *((u_int16_t *) &cp->c_finderinfo[8]); - if (fdFlags & OSSwapHostToBigConstInt16(kFinderInvisibleMask)) { - cp->c_bsdflags |= UF_HIDDEN; - } else { - cp->c_bsdflags &= ~UF_HIDDEN; - } - - result = hfs_update(vp, 0); - - hfs_unlock(cp); - return (result); - } - /* Write the Resource Fork. 
*/ - if (bcmp(ap->a_name, XATTR_RESOURCEFORK_NAME, sizeof(XATTR_RESOURCEFORK_NAME)) == 0) { - struct vnode *rvp = NULL; - int namelen = 0; - int openunlinked = 0; - - if (!vnode_isreg(vp)) { - return (EPERM); - } - if ((result = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) { - return (result); - } - cp = VTOC(vp); - namelen = cp->c_desc.cd_namelen; - - if (hfs_has_rsrc(cp)) { - /* attr exists and "create" was specified. */ - if (ap->a_options & XATTR_CREATE) { - hfs_unlock(cp); - return (EEXIST); - } - } else { - /* attr doesn't exists and "replace" was specified. */ - if (ap->a_options & XATTR_REPLACE) { - hfs_unlock(cp); - return (ENOATTR); - } - } - - /* - * Note that we could be called on to grab the rsrc fork vnode - * for a file that has become open-unlinked. - */ - if ((cp->c_flag & C_DELETED) && (namelen == 0)) { - openunlinked = 1; - } - - result = hfs_vgetrsrc(hfsmp, vp, &rvp); - hfs_unlock(cp); - if (result) { - return (result); - } - /* VNOP_WRITE marks cnode as needing a modtime update */ - result = VNOP_WRITE(rvp, uio, 0, ap->a_context); - - /* if open unlinked, force it inactive */ - if (openunlinked) { - int vref; - vref = vnode_ref (rvp); - if (vref == 0) { - vnode_rele(rvp); - } - vnode_recycle (rvp); - } else { - /* cnode is not open-unlinked, so re-lock cnode to sync */ - if ((result = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) { - vnode_recycle (rvp); - vnode_put(rvp); - return result; - } - - /* hfs fsync rsrc fork to force to disk and update modtime */ - result = hfs_fsync (rvp, MNT_NOWAIT, 0, vfs_context_proc (ap->a_context)); - hfs_unlock (cp); - } - - vnode_put(rvp); - return (result); - } - /* - * Standard HFS only supports native FinderInfo and Resource Forks. - */ - if (hfsmp->hfs_flags & HFS_STANDARD) { - return (EPERM); - } - attrsize = uio_resid(uio); - - /* Enforce an upper limit. 
*/ - if (attrsize > HFS_XATTR_MAXSIZE) { - result = E2BIG; - goto exit; - } - - /* - * Attempt to copy the users attr data before taking any locks, - * only if it will be an inline attribute. For larger attributes, - * the data will be directly read from the uio. - */ - if (attrsize > 0 && - hfsmp->hfs_max_inline_attrsize != 0 && - attrsize < hfsmp->hfs_max_inline_attrsize) { - MALLOC(user_data_ptr, void *, attrsize, M_TEMP, M_WAITOK); - if (user_data_ptr == NULL) { - result = ENOMEM; - goto exit; - } - - result = uiomove((caddr_t)user_data_ptr, attrsize, uio); - if (result) { - goto exit; - } - } - - result = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); - if (result) { - goto exit; - } - cp = VTOC(vp); - - /* - * If we're trying to set a non-finderinfo, non-resourcefork EA, then - * call the breakout function. - */ - result = hfs_setxattr_internal (cp, user_data_ptr, attrsize, ap, VTOHFS(vp), 0); - - exit: - if (cp) { - hfs_unlock(cp); - } - if (user_data_ptr) { - FREE(user_data_ptr, M_TEMP); - } - - return (result == btNotFound ? ENOATTR : MacToVFSError(result)); -} - -// Has same limitations as hfs_setxattr_internal below -int hfs_xattr_write(vnode_t vp, const char *name, const void *data, size_t size) -{ - struct vnop_setxattr_args args = { - .a_vp = vp, - .a_name = name, - }; - - return hfs_setxattr_internal(VTOC(vp), data, size, &args, VTOHFS(vp), 0); -} - -/* - * hfs_setxattr_internal - * - * Internal function to set non-rsrc, non-finderinfo EAs to either the attribute B-Tree or - * extent-based EAs. - * - * See comments from hfs_getxattr_internal on why we need to pass 'hfsmp' and fileid here. - * The gist is that we could end up writing to the root folder which may not have a cnode. - * - * Assumptions: - * 1. cnode 'cp' is locked EXCLUSIVE before calling this function. - * 2. data_ptr contains data to be written. If gathering data from userland, this must be - * done before calling this function. - * 3. 
If data originates entirely in-kernel, use a null UIO, and ensure the size is less than - * hfsmp->hfs_max_inline_attrsize bytes long. - */ -int hfs_setxattr_internal (struct cnode *cp, const void *data_ptr, size_t attrsize, - struct vnop_setxattr_args *ap, struct hfsmount *hfsmp, - u_int32_t fileid) -{ - uio_t uio = ap->a_uio; - struct vnode *vp = ap->a_vp; - int started_transaction = 0; - struct BTreeIterator * iterator = NULL; - struct filefork *btfile = NULL; - FSBufferDescriptor btdata; - HFSPlusAttrRecord attrdata; /* 90 bytes */ - HFSPlusAttrRecord *recp = NULL; - HFSPlusExtentDescriptor *extentptr = NULL; - int result = 0; - int lockflags = 0; - int exists = 0; - int allocatedblks = 0; - u_int32_t target_id; - - if (cp) { - target_id = cp->c_fileid; - } else { - target_id = fileid; - } - - /* Start a transaction for our changes. */ - if (hfs_start_transaction(hfsmp) != 0) { - result = EINVAL; - goto exit; - } - started_transaction = 1; - - /* - * Once we started the transaction, nobody can compete - * with us, so make sure this file is still there. - */ - if ((cp) && (cp->c_flag & C_NOEXISTS)) { - result = ENOENT; - goto exit; - } - - /* - * If there isn't an attributes b-tree then create one. - */ - if (hfsmp->hfs_attribute_vp == NULL) { - result = hfs_create_attr_btree(hfsmp, ATTRIBUTE_FILE_NODE_SIZE, - getnodecount(hfsmp, ATTRIBUTE_FILE_NODE_SIZE)); - if (result) { - goto exit; - } - } - if (hfsmp->hfs_max_inline_attrsize == 0) { - hfsmp->hfs_max_inline_attrsize = getmaxinlineattrsize(hfsmp->hfs_attribute_vp); - } - - lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE, HFS_EXCLUSIVE_LOCK); - - /* Build the b-tree key. 
*/ - MALLOC(iterator, BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK); - if (iterator == NULL) { - result = ENOMEM; - goto exit; - } - bzero(iterator, sizeof(*iterator)); - result = hfs_buildattrkey(target_id, ap->a_name, (HFSPlusAttrKey *)&iterator->key); - if (result) { - goto exit; - } - - /* Preflight for replace/create semantics. */ - btfile = VTOF(hfsmp->hfs_attribute_vp); - btdata.bufferAddress = &attrdata; - btdata.itemSize = sizeof(attrdata); - btdata.itemCount = 1; - exists = BTSearchRecord(btfile, iterator, &btdata, NULL, NULL) == 0; - - /* Replace requires that the attribute already exists. */ - if ((ap->a_options & XATTR_REPLACE) && !exists) { - result = ENOATTR; - goto exit; - } - /* Create requires that the attribute doesn't exist. */ - if ((ap->a_options & XATTR_CREATE) && exists) { - result = EEXIST; - goto exit; - } - - /* If it won't fit inline then use extent-based attributes. */ - if (attrsize > hfsmp->hfs_max_inline_attrsize) { - size_t extentbufsize; - int blkcnt; - int extentblks; - u_int32_t *keystartblk; - int i; - - if (uio == NULL) { - /* - * setxattrs originating from in-kernel are not supported if they are bigger - * than the inline max size. Just return ENOATTR and force them to do it with a - * smaller EA. - */ - result = EPERM; - goto exit; - } - - /* Get some blocks. */ - blkcnt = howmany(attrsize, hfsmp->blockSize); - extentbufsize = blkcnt * sizeof(HFSPlusExtentDescriptor); - extentbufsize = roundup(extentbufsize, sizeof(HFSPlusExtentRecord)); - MALLOC(extentptr, HFSPlusExtentDescriptor *, extentbufsize, M_TEMP, M_WAITOK); - if (extentptr == NULL) { - result = ENOMEM; - goto exit; - } - bzero(extentptr, extentbufsize); - result = alloc_attr_blks(hfsmp, attrsize, extentbufsize, extentptr, &allocatedblks); - if (result) { - allocatedblks = 0; - goto exit; /* no more space */ - } - /* Copy data into the blocks. 
*/ - result = write_attr_data(hfsmp, uio, attrsize, extentptr); - if (result) { - if (vp) { - const char *name = vnode_getname(vp); - printf("hfs_setxattr: write_attr_data vol=%s err (%d) %s:%s\n", - hfsmp->vcbVN, result, name ? name : "", ap->a_name); - if (name) - vnode_putname(name); - } - goto exit; - } - - /* Now remove any previous attribute. */ - if (exists) { - result = remove_attribute_records(hfsmp, iterator); - if (result) { - if (vp) { - const char *name = vnode_getname(vp); - printf("hfs_setxattr: remove_attribute_records vol=%s err (%d) %s:%s\n", - hfsmp->vcbVN, result, name ? name : "", ap->a_name); - if (name) - vnode_putname(name); - } - goto exit; - } - } - /* Create attribute fork data record. */ - MALLOC(recp, HFSPlusAttrRecord *, sizeof(HFSPlusAttrRecord), M_TEMP, M_WAITOK); - if (recp == NULL) { - result = ENOMEM; - goto exit; - } - btdata.bufferAddress = recp; - btdata.itemCount = 1; - btdata.itemSize = sizeof(HFSPlusAttrForkData); - - recp->recordType = kHFSPlusAttrForkData; - recp->forkData.reserved = 0; - recp->forkData.theFork.logicalSize = attrsize; - recp->forkData.theFork.clumpSize = 0; - recp->forkData.theFork.totalBlocks = blkcnt; - bcopy(extentptr, recp->forkData.theFork.extents, sizeof(HFSPlusExtentRecord)); - - (void) hfs_buildattrkey(target_id, ap->a_name, (HFSPlusAttrKey *)&iterator->key); - - result = BTInsertRecord(btfile, iterator, &btdata, btdata.itemSize); - if (result) { - printf ("hfs_setxattr: BTInsertRecord(): vol=%s %d,%s err=%d\n", - hfsmp->vcbVN, target_id, ap->a_name, result); - goto exit; - } - extentblks = count_extent_blocks(blkcnt, recp->forkData.theFork.extents); - blkcnt -= extentblks; - keystartblk = &((HFSPlusAttrKey *)&iterator->key)->startBlock; - i = 0; - - /* Create overflow extents as needed. */ - while (blkcnt > 0) { - /* Initialize the key and record. 
*/ - *keystartblk += (u_int32_t)extentblks; - btdata.itemSize = sizeof(HFSPlusAttrExtents); - recp->recordType = kHFSPlusAttrExtents; - recp->overflowExtents.reserved = 0; - - /* Copy the next set of extents. */ - i += kHFSPlusExtentDensity; - bcopy(&extentptr[i], recp->overflowExtents.extents, sizeof(HFSPlusExtentRecord)); - - result = BTInsertRecord(btfile, iterator, &btdata, btdata.itemSize); - if (result) { - printf ("hfs_setxattr: BTInsertRecord() overflow: vol=%s %d,%s err=%d\n", - hfsmp->vcbVN, target_id, ap->a_name, result); - goto exit; - } - extentblks = count_extent_blocks(blkcnt, recp->overflowExtents.extents); - blkcnt -= extentblks; - } - } else { /* Inline data */ - if (exists) { - result = remove_attribute_records(hfsmp, iterator); - if (result) { - goto exit; - } - } - - /* Calculate size of record rounded up to multiple of 2 bytes. */ - btdata.itemSize = sizeof(HFSPlusAttrData) - 2 + attrsize + ((attrsize & 1) ? 1 : 0); - MALLOC(recp, HFSPlusAttrRecord *, btdata.itemSize, M_TEMP, M_WAITOK); - if (recp == NULL) { - result = ENOMEM; - goto exit; - } - recp->recordType = kHFSPlusAttrInlineData; - recp->attrData.reserved[0] = 0; - recp->attrData.reserved[1] = 0; - recp->attrData.attrSize = attrsize; - - /* Copy in the attribute data (if any). */ - if (attrsize > 0) { - if (data_ptr) { - bcopy(data_ptr, &recp->attrData.attrData, attrsize); - } else { - /* - * A null UIO meant it originated in-kernel. If they didn't supply data_ptr - * then deny the copy operation. 
- */ - if (uio == NULL) { - result = EPERM; - goto exit; - } - result = uiomove((caddr_t)&recp->attrData.attrData, attrsize, uio); - } - - if (result) { - goto exit; - } - } - - (void) hfs_buildattrkey(target_id, ap->a_name, (HFSPlusAttrKey *)&iterator->key); - - btdata.bufferAddress = recp; - btdata.itemCount = 1; - result = BTInsertRecord(btfile, iterator, &btdata, btdata.itemSize); - } - -exit: - if (btfile && started_transaction) { - (void) BTFlushPath(btfile); - } - hfs_systemfile_unlock(hfsmp, lockflags); - if (result == 0) { - if (vp) { - cp = VTOC(vp); - /* Setting an attribute only updates change time and not - * modified time of the file. - */ - cp->c_touch_chgtime = TRUE; - cp->c_flag |= C_MODIFIED; - cp->c_attr.ca_recflags |= kHFSHasAttributesMask; - if ((bcmp(ap->a_name, KAUTH_FILESEC_XATTR, sizeof(KAUTH_FILESEC_XATTR)) == 0)) { - cp->c_attr.ca_recflags |= kHFSHasSecurityMask; - } - (void) hfs_update(vp, 0); - } - } - if (started_transaction) { - if (result && allocatedblks) { - free_attr_blks(hfsmp, allocatedblks, extentptr); - } - hfs_end_transaction(hfsmp); - } - - if (recp) { - FREE(recp, M_TEMP); - } - if (extentptr) { - FREE(extentptr, M_TEMP); - } - if (iterator) { - FREE(iterator, M_TEMP); - } - - return result; -} - - - - -/* - * Remove an extended attribute. 
- */ -int -hfs_vnop_removexattr(struct vnop_removexattr_args *ap) -/* - struct vnop_removexattr_args { - struct vnodeop_desc *a_desc; - vnode_t a_vp; - char * a_name; - int a_options; - vfs_context_t a_context; - }; -*/ -{ - struct vnode *vp = ap->a_vp; - struct cnode *cp = VTOC(vp); - struct hfsmount *hfsmp; - struct BTreeIterator * iterator = NULL; - int lockflags; - int result; - time_t orig_ctime=VTOC(vp)->c_ctime; - - if (ap->a_name == NULL || ap->a_name[0] == '\0') { - return (EINVAL); /* invalid name */ - } - hfsmp = VTOHFS(vp); - if (VNODE_IS_RSRC(vp)) { - return (EPERM); - } - -#if HFS_COMPRESSION - if (hfs_hides_xattr(ap->a_context, VTOC(vp), ap->a_name, 1) && !(ap->a_options & XATTR_SHOWCOMPRESSION)) { - return ENOATTR; - } -#endif /* HFS_COMPRESSION */ - - check_for_tracked_file(vp, orig_ctime, NAMESPACE_HANDLER_METADATA_DELETE_OP, NSPACE_REARM_NO_ARG); - - /* If Resource Fork is non-empty then truncate it. */ - if (bcmp(ap->a_name, XATTR_RESOURCEFORK_NAME, sizeof(XATTR_RESOURCEFORK_NAME)) == 0) { - struct vnode *rvp = NULL; - - if ( !vnode_isreg(vp) ) { - return (EPERM); - } - if ((result = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) { - return (result); - } - if (!hfs_has_rsrc(cp)) { - hfs_unlock(cp); - return (ENOATTR); - } - result = hfs_vgetrsrc(hfsmp, vp, &rvp); - hfs_unlock(cp); - if (result) { - return (result); - } - - hfs_lock_truncate(VTOC(rvp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); - - // Tell UBC now before we take the cnode lock and start the transaction - hfs_ubc_setsize(rvp, 0, false); - - if ((result = hfs_lock(VTOC(rvp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) { - hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); - vnode_put(rvp); - return (result); - } - - /* Start a transaction for encapsulating changes in - * hfs_truncate() and hfs_update() - */ - if ((result = hfs_start_transaction(hfsmp))) { - hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); - hfs_unlock(cp); - vnode_put(rvp); - return (result); - } - - result = hfs_truncate(rvp, 
(off_t)0, IO_NDELAY, 0, ap->a_context); - if (result == 0) { - cp->c_touch_chgtime = TRUE; - cp->c_flag |= C_MODIFIED; - result = hfs_update(vp, 0); - } - - hfs_end_transaction(hfsmp); - hfs_unlock_truncate(VTOC(rvp), HFS_LOCK_DEFAULT); - hfs_unlock(VTOC(rvp)); - - vnode_put(rvp); - return (result); - } - /* Clear out the Finder Info. */ - if (bcmp(ap->a_name, XATTR_FINDERINFO_NAME, sizeof(XATTR_FINDERINFO_NAME)) == 0) { - void * finderinfo_start; - int finderinfo_size; - u_int8_t finderinfo[32]; - u_int32_t date_added, write_gen_counter, document_id; - u_int8_t *finfo = NULL; - - if ((result = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) { - return (result); - } - - /* Use the local copy to store our temporary changes. */ - bcopy(cp->c_finderinfo, finderinfo, sizeof(finderinfo)); - - - /* Zero out the date added field in the local copy */ - hfs_zero_hidden_fields (cp, finderinfo); - - /* Don't expose a symlink's private type/creator. */ - if (vnode_islnk(vp)) { - struct FndrFileInfo *fip; - - fip = (struct FndrFileInfo *)&finderinfo; - fip->fdType = 0; - fip->fdCreator = 0; - } - - /* Do the byte compare against the local copy */ - if (bcmp(finderinfo, emptyfinfo, sizeof(emptyfinfo)) == 0) { - hfs_unlock(cp); - return (ENOATTR); - } - - /* - * If there was other content, zero out everything except - * type/creator and date added. First, save the date added. 
- */ - finfo = cp->c_finderinfo; - finfo = finfo + 16; - if (S_ISREG(cp->c_attr.ca_mode) || S_ISLNK(cp->c_attr.ca_mode)) { - struct FndrExtendedFileInfo *extinfo = (struct FndrExtendedFileInfo *)finfo; - date_added = extinfo->date_added; - write_gen_counter = extinfo->write_gen_counter; - document_id = extinfo->document_id; - } else if (S_ISDIR(cp->c_attr.ca_mode)) { - struct FndrExtendedDirInfo *extinfo = (struct FndrExtendedDirInfo *)finfo; - date_added = extinfo->date_added; - write_gen_counter = extinfo->write_gen_counter; - document_id = extinfo->document_id; - } - - if (vnode_islnk(vp)) { - /* Ignore type/creator */ - finderinfo_start = &cp->c_finderinfo[8]; - finderinfo_size = sizeof(cp->c_finderinfo) - 8; - } else { - finderinfo_start = &cp->c_finderinfo[0]; - finderinfo_size = sizeof(cp->c_finderinfo); - } - bzero(finderinfo_start, finderinfo_size); - - - /* Now restore the date added */ - if (S_ISREG(cp->c_attr.ca_mode) || S_ISLNK(cp->c_attr.ca_mode)) { - struct FndrExtendedFileInfo *extinfo = (struct FndrExtendedFileInfo *)finfo; - extinfo->date_added = date_added; - extinfo->write_gen_counter = write_gen_counter; - extinfo->document_id = document_id; - } else if (S_ISDIR(cp->c_attr.ca_mode)) { - struct FndrExtendedDirInfo *extinfo = (struct FndrExtendedDirInfo *)finfo; - extinfo->date_added = date_added; - extinfo->write_gen_counter = write_gen_counter; - extinfo->document_id = document_id; - } - - /* Updating finderInfo updates change time and modified time */ - cp->c_touch_chgtime = TRUE; - cp->c_flag |= C_MODIFIED; - hfs_update(vp, 0); - - hfs_unlock(cp); - - return (0); - } - /* - * Standard HFS only supports native FinderInfo and Resource Forks. 
- */ - if (hfsmp->hfs_flags & HFS_STANDARD) { - return (EPERM); - } - if (hfsmp->hfs_attribute_vp == NULL) { - return (ENOATTR); - } - - MALLOC(iterator, BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK); - if (iterator == NULL) { - return (ENOMEM); - } - bzero(iterator, sizeof(*iterator)); - - if ((result = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) { - goto exit_nolock; - } - - result = hfs_buildattrkey(cp->c_fileid, ap->a_name, (HFSPlusAttrKey *)&iterator->key); - if (result) { - goto exit; - } - - if (hfs_start_transaction(hfsmp) != 0) { - result = EINVAL; - goto exit; - } - lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE | SFL_BITMAP, HFS_EXCLUSIVE_LOCK); - - result = remove_attribute_records(hfsmp, iterator); - - hfs_systemfile_unlock(hfsmp, lockflags); - - if (result == 0) { - cp->c_touch_chgtime = TRUE; - - lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE, HFS_SHARED_LOCK); - - /* If no more attributes exist, clear attribute bit */ - result = file_attribute_exist(hfsmp, cp->c_fileid); - if (result == 0) { - cp->c_attr.ca_recflags &= ~kHFSHasAttributesMask; - cp->c_flag |= C_MODIFIED; - } - if (result == EEXIST) { - result = 0; - } - - hfs_systemfile_unlock(hfsmp, lockflags); - - /* If ACL was removed, clear security bit */ - if ((bcmp(ap->a_name, KAUTH_FILESEC_XATTR, sizeof(KAUTH_FILESEC_XATTR)) == 0)) { - cp->c_attr.ca_recflags &= ~kHFSHasSecurityMask; - cp->c_flag |= C_MODIFIED; - } - (void) hfs_update(vp, 0); - } - - hfs_end_transaction(hfsmp); -exit: - hfs_unlock(cp); -exit_nolock: - FREE(iterator, M_TEMP); - return MacToVFSError(result); -} - -/* Check if any attribute record exist for given fileID. This function - * is called by hfs_vnop_removexattr to determine if it should clear the - * attribute bit in the catalog record or not. - * - * Note - you must acquire a shared lock on the attribute btree before - * calling this function. 
- * - * Output: - * EEXIST - If attribute record was found - * 0 - Attribute was not found - * (other) - Other error (such as EIO) - */ -int -file_attribute_exist(struct hfsmount *hfsmp, uint32_t fileID) -{ - HFSPlusAttrKey *key; - struct BTreeIterator * iterator = NULL; - struct filefork *btfile; - int result = 0; - - // if there's no attribute b-tree we sure as heck - // can't have any attributes! - if (hfsmp->hfs_attribute_vp == NULL) { - return false; - } - - MALLOC(iterator, BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK); - if (iterator == NULL) { - result = ENOMEM; - goto out; - } - bzero(iterator, sizeof(*iterator)); - key = (HFSPlusAttrKey *)&iterator->key; - - result = hfs_buildattrkey(fileID, NULL, key); - if (result) { - goto out; - } - - btfile = VTOF(hfsmp->hfs_attribute_vp); - result = BTSearchRecord(btfile, iterator, NULL, NULL, NULL); - if (result && (result != btNotFound)) { - goto out; - } - - result = BTIterateRecord(btfile, kBTreeNextRecord, iterator, NULL, NULL); - /* If no next record was found or fileID for next record did not match, - * no more attributes exist for this fileID - */ - if ((result && (result == btNotFound)) || (key->fileID != fileID)) { - result = 0; - } else { - result = EEXIST; - } - -out: - if (iterator) { - FREE(iterator, M_TEMP); - } - return result; -} - - -/* - * Remove all the records for a given attribute. - * - * - Used by hfs_vnop_removexattr, hfs_vnop_setxattr and hfs_removeallattr. - * - A transaction must have been started. - * - The Attribute b-tree file must be locked exclusive. - * - The Allocation Bitmap file must be locked exclusive. - * - The iterator key must be initialized. 
- */ -int -remove_attribute_records(struct hfsmount *hfsmp, BTreeIterator * iterator) -{ - struct filefork *btfile; - FSBufferDescriptor btdata; - HFSPlusAttrRecord attrdata; /* 90 bytes */ - u_int16_t datasize; - int result; - - btfile = VTOF(hfsmp->hfs_attribute_vp); - - btdata.bufferAddress = &attrdata; - btdata.itemSize = sizeof(attrdata); - btdata.itemCount = 1; - result = BTSearchRecord(btfile, iterator, &btdata, &datasize, NULL); - if (result) { - goto exit; /* no records. */ - } - /* - * Free the blocks from extent based attributes. - * - * Note that the block references (btree records) are removed - * before releasing the blocks in the allocation bitmap. - */ - if (attrdata.recordType == kHFSPlusAttrForkData) { - int totalblks; - int extentblks; - u_int32_t *keystartblk; - - if (datasize < sizeof(HFSPlusAttrForkData)) { - printf("hfs: remove_attribute_records: bad record size %d (expecting %lu)\n", datasize, sizeof(HFSPlusAttrForkData)); - } - totalblks = attrdata.forkData.theFork.totalBlocks; - - /* Process the first 8 extents. */ - extentblks = count_extent_blocks(totalblks, attrdata.forkData.theFork.extents); - if (extentblks > totalblks) - panic("hfs: remove_attribute_records: corruption..."); - if (BTDeleteRecord(btfile, iterator) == 0) { - free_attr_blks(hfsmp, extentblks, attrdata.forkData.theFork.extents); - } - totalblks -= extentblks; - keystartblk = &((HFSPlusAttrKey *)&iterator->key)->startBlock; - - /* Process any overflow extents. 
*/ - while (totalblks) { - *keystartblk += (u_int32_t)extentblks; - - result = BTSearchRecord(btfile, iterator, &btdata, &datasize, NULL); - if (result || - (attrdata.recordType != kHFSPlusAttrExtents) || - (datasize < sizeof(HFSPlusAttrExtents))) { - printf("hfs: remove_attribute_records: BTSearchRecord: vol=%s, err=%d (%d), totalblks %d\n", - hfsmp->vcbVN, MacToVFSError(result), attrdata.recordType != kHFSPlusAttrExtents, totalblks); - result = ENOATTR; - break; /* break from while */ - } - /* Process the next 8 extents. */ - extentblks = count_extent_blocks(totalblks, attrdata.overflowExtents.extents); - if (extentblks > totalblks) - panic("hfs: remove_attribute_records: corruption..."); - if (BTDeleteRecord(btfile, iterator) == 0) { - free_attr_blks(hfsmp, extentblks, attrdata.overflowExtents.extents); - } - totalblks -= extentblks; - } - } else { - result = BTDeleteRecord(btfile, iterator); - } - (void) BTFlushPath(btfile); -exit: - return (result == btNotFound ? ENOATTR : MacToVFSError(result)); -} - - -/* - * Retrieve the list of extended attribute names. 
- */ -int -hfs_vnop_listxattr(struct vnop_listxattr_args *ap) -/* - struct vnop_listxattr_args { - struct vnodeop_desc *a_desc; - vnode_t a_vp; - uio_t a_uio; - size_t *a_size; - int a_options; - vfs_context_t a_context; -*/ -{ - struct vnode *vp = ap->a_vp; - struct cnode *cp = VTOC(vp); - struct hfsmount *hfsmp; - uio_t uio = ap->a_uio; - struct BTreeIterator * iterator = NULL; - struct filefork *btfile; - struct listattr_callback_state state; - user_addr_t user_start = 0; - user_size_t user_len = 0; - int lockflags; - int result; - u_int8_t finderinfo[32]; - - - if (VNODE_IS_RSRC(vp)) { - return (EPERM); - } - -#if HFS_COMPRESSION - int compressed = hfs_file_is_compressed(cp, 1); /* 1 == don't take the cnode lock */ -#endif /* HFS_COMPRESSION */ - - hfsmp = VTOHFS(vp); - *ap->a_size = 0; - - /* - * Take the truncate lock; this serializes us against the ioctl - * to truncate data & reset the decmpfs state - * in the compressed file handler. - */ - hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_DEFAULT); - - /* Now the regular cnode lock (shared) */ - if ((result = hfs_lock(cp, HFS_SHARED_LOCK, HFS_LOCK_DEFAULT))) { - hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); - return (result); - } - - /* - * Make a copy of the cnode's finderinfo to a local so we can - * zero out the date added field. Also zero out the private type/creator - * for symlinks. - */ - bcopy(cp->c_finderinfo, finderinfo, sizeof(finderinfo)); - hfs_zero_hidden_fields (cp, finderinfo); - - /* Don't expose a symlink's private type/creator. */ - if (vnode_islnk(vp)) { - struct FndrFileInfo *fip; - - fip = (struct FndrFileInfo *)&finderinfo; - fip->fdType = 0; - fip->fdCreator = 0; - } - - - /* If Finder Info is non-empty then export it's name. 
*/ - if (bcmp(finderinfo, emptyfinfo, sizeof(emptyfinfo)) != 0) { - if (uio == NULL) { - *ap->a_size += sizeof(XATTR_FINDERINFO_NAME); - } else if ((user_size_t)uio_resid(uio) < sizeof(XATTR_FINDERINFO_NAME)) { - result = ERANGE; - goto exit; - } else { - result = uiomove(XATTR_FINDERINFO_NAME, - sizeof(XATTR_FINDERINFO_NAME), uio); - if (result) - goto exit; - } - } - /* If Resource Fork is non-empty then export it's name. */ - if (S_ISREG(cp->c_mode) && hfs_has_rsrc(cp)) { -#if HFS_COMPRESSION - if ((ap->a_options & XATTR_SHOWCOMPRESSION) || - !compressed || - !decmpfs_hides_rsrc(ap->a_context, VTOCMP(vp)) - ) -#endif /* HFS_COMPRESSION */ - { - if (uio == NULL) { - *ap->a_size += sizeof(XATTR_RESOURCEFORK_NAME); - } else if ((user_size_t)uio_resid(uio) < sizeof(XATTR_RESOURCEFORK_NAME)) { - result = ERANGE; - goto exit; - } else { - result = uiomove(XATTR_RESOURCEFORK_NAME, - sizeof(XATTR_RESOURCEFORK_NAME), uio); - if (result) - goto exit; - } - } - } - /* - * Standard HFS only supports native FinderInfo and Resource Forks. - * Return at this point. - */ - if (hfsmp->hfs_flags & HFS_STANDARD) { - result = 0; - goto exit; - } - /* Bail if we don't have any extended attributes. */ - if ((hfsmp->hfs_attribute_vp == NULL) || - (cp->c_attr.ca_recflags & kHFSHasAttributesMask) == 0) { - result = 0; - goto exit; - } - btfile = VTOF(hfsmp->hfs_attribute_vp); - - MALLOC(iterator, BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK); - if (iterator == NULL) { - result = ENOMEM; - goto exit; - } - bzero(iterator, sizeof(*iterator)); - result = hfs_buildattrkey(cp->c_fileid, NULL, (HFSPlusAttrKey *)&iterator->key); - if (result) - goto exit; - - /* - * Lock the user's buffer here so that we won't fault on - * it in uiomove while holding the attributes b-tree lock. 
- */ - if (uio && uio_isuserspace(uio)) { - user_start = uio_curriovbase(uio); - user_len = uio_curriovlen(uio); - - if ((result = vslock(user_start, user_len)) != 0) { - user_start = 0; - goto exit; - } - } - lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE, HFS_SHARED_LOCK); - - result = BTSearchRecord(btfile, iterator, NULL, NULL, NULL); - if (result && result != btNotFound) { - hfs_systemfile_unlock(hfsmp, lockflags); - goto exit; - } - - state.fileID = cp->c_fileid; - state.result = 0; - state.uio = uio; - state.size = 0; -#if HFS_COMPRESSION - state.showcompressed = !compressed || ap->a_options & XATTR_SHOWCOMPRESSION; - state.ctx = ap->a_context; - state.vp = vp; -#endif /* HFS_COMPRESSION */ - - /* - * Process entries starting just after iterator->key. - */ - result = BTIterateRecords(btfile, kBTreeNextRecord, iterator, - (IterateCallBackProcPtr)listattr_callback, &state); - hfs_systemfile_unlock(hfsmp, lockflags); - if (uio == NULL) { - *ap->a_size += state.size; - } - - if (state.result || result == btNotFound) - result = state.result; - -exit: - if (user_start) { - vsunlock(user_start, user_len, TRUE); - } - if (iterator) { - FREE(iterator, M_TEMP); - } - hfs_unlock(cp); - hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); - - return MacToVFSError(result); -} - - -/* - * Callback - called for each attribute record - */ -static int -listattr_callback(const HFSPlusAttrKey *key, __unused const HFSPlusAttrData *data, struct listattr_callback_state *state) -{ - char attrname[XATTR_MAXNAMELEN + 1]; - ssize_t bytecount; - int result; - - if (state->fileID != key->fileID) { - state->result = 0; - return (0); /* stop */ - } - /* - * Skip over non-primary keys - */ - if (key->startBlock != 0) { - return (1); /* continue */ - } - - /* Convert the attribute name into UTF-8. 
*/ - result = utf8_encodestr(key->attrName, key->attrNameLen * sizeof(UniChar), - (u_int8_t *)attrname, (size_t *)&bytecount, sizeof(attrname), '/', 0); - if (result) { - state->result = result; - return (0); /* stop */ - } - bytecount++; /* account for null termination char */ - - if (xattr_protected(attrname)) - return (1); /* continue */ - -#if HFS_COMPRESSION - if (!state->showcompressed && decmpfs_hides_xattr(state->ctx, VTOCMP(state->vp), attrname) ) - return 1; /* continue */ -#endif /* HFS_COMPRESSION */ - - if (state->uio == NULL) { - state->size += bytecount; - } else { - if (bytecount > uio_resid(state->uio)) { - state->result = ERANGE; - return (0); /* stop */ - } - result = uiomove((caddr_t) attrname, bytecount, state->uio); - if (result) { - state->result = result; - return (0); /* stop */ - } - } - return (1); /* continue */ -} - -/* - * Remove all the attributes from a cnode. - * - * This function creates/ends its own transaction so that each - * attribute is deleted in its own transaction (to avoid having - * a transaction grow too large). - * - * This function takes the necessary locks on the attribute - * b-tree file and the allocation (bitmap) file. - * - * NOTE: Upon sucecss, this function will return with an open - * transaction. The reason we do it this way is because when we - * delete the last attribute, we must make sure the flag in the - * catalog record that indicates there are no more records is cleared. - * The caller is responsible for doing this and *must* do it before - * ending the transaction. 
- */ -int -hfs_removeallattr(struct hfsmount *hfsmp, u_int32_t fileid, - bool *open_transaction) -{ - BTreeIterator *iterator = NULL; - HFSPlusAttrKey *key; - struct filefork *btfile; - int result, lockflags = 0; - - *open_transaction = false; - - if (hfsmp->hfs_attribute_vp == NULL) - return 0; - - btfile = VTOF(hfsmp->hfs_attribute_vp); - - MALLOC(iterator, BTreeIterator *, sizeof(BTreeIterator), M_TEMP, M_WAITOK); - if (iterator == NULL) { - return (ENOMEM); - } - bzero(iterator, sizeof(BTreeIterator)); - key = (HFSPlusAttrKey *)&iterator->key; - - /* Loop until there are no more attributes for this file id */ - do { - if (!*open_transaction) - lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE, HFS_SHARED_LOCK); - - (void) hfs_buildattrkey(fileid, NULL, key); - result = BTIterateRecord(btfile, kBTreeNextRecord, iterator, NULL, NULL); - if (result || key->fileID != fileid) - goto exit; - - hfs_systemfile_unlock(hfsmp, lockflags); - lockflags = 0; - - if (*open_transaction) { - hfs_end_transaction(hfsmp); - *open_transaction = false; - } - - if (hfs_start_transaction(hfsmp) != 0) { - result = EINVAL; - goto exit; - } - - *open_transaction = true; - - lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE | SFL_BITMAP, HFS_EXCLUSIVE_LOCK); - - result = remove_attribute_records(hfsmp, iterator); - -#if HFS_XATTR_VERBOSE - if (result) { - printf("hfs_removeallattr: unexpected err %d\n", result); - } -#endif - } while (!result); - -exit: - FREE(iterator, M_TEMP); - - if (lockflags) - hfs_systemfile_unlock(hfsmp, lockflags); - - result = result == btNotFound ? 0 : MacToVFSError(result); - - if (result && *open_transaction) { - hfs_end_transaction(hfsmp); - *open_transaction = false; - } - - return result; -} - -__private_extern__ -void -hfs_xattr_init(struct hfsmount * hfsmp) -{ - /* - * If there isn't an attributes b-tree then create one. 
- */ - if (!(hfsmp->hfs_flags & HFS_STANDARD) && - (hfsmp->hfs_attribute_vp == NULL) && - !(hfsmp->hfs_flags & HFS_READ_ONLY)) { - (void) hfs_create_attr_btree(hfsmp, ATTRIBUTE_FILE_NODE_SIZE, - getnodecount(hfsmp, ATTRIBUTE_FILE_NODE_SIZE)); - } - if (hfsmp->hfs_attribute_vp) - hfsmp->hfs_max_inline_attrsize = getmaxinlineattrsize(hfsmp->hfs_attribute_vp); -} - -/* - * Enable/Disable volume attributes stored as EA for root file system. - * Supported attributes are - - * 1. Extent-based Extended Attributes - */ -int -hfs_set_volxattr(struct hfsmount *hfsmp, unsigned int xattrtype, int state) -{ - struct BTreeIterator * iterator = NULL; - struct filefork *btfile; - int lockflags; - int result; - - if (hfsmp->hfs_flags & HFS_STANDARD) { - return (ENOTSUP); - } - if (xattrtype != HFS_SET_XATTREXTENTS_STATE) { - return EINVAL; - } - - /* - * If there isn't an attributes b-tree then create one. - */ - if (hfsmp->hfs_attribute_vp == NULL) { - result = hfs_create_attr_btree(hfsmp, ATTRIBUTE_FILE_NODE_SIZE, - getnodecount(hfsmp, ATTRIBUTE_FILE_NODE_SIZE)); - if (result) { - return (result); - } - } - - MALLOC(iterator, BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK); - if (iterator == NULL) { - return (ENOMEM); - } - bzero(iterator, sizeof(*iterator)); - - /* - * Build a b-tree key. - * We use the root's parent id (1) to hold this volume attribute. - */ - (void) hfs_buildattrkey(kHFSRootParentID, XATTR_XATTREXTENTS_NAME, - (HFSPlusAttrKey *)&iterator->key); - - /* Start a transaction for our changes. */ - if (hfs_start_transaction(hfsmp) != 0) { - result = EINVAL; - goto exit; - } - btfile = VTOF(hfsmp->hfs_attribute_vp); - - lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE, HFS_EXCLUSIVE_LOCK); - - if (state == 0) { - /* Remove the attribute. 
*/ - result = BTDeleteRecord(btfile, iterator); - if (result == btNotFound) - result = 0; - } else { - FSBufferDescriptor btdata; - HFSPlusAttrData attrdata; - u_int16_t datasize; - - datasize = sizeof(attrdata); - btdata.bufferAddress = &attrdata; - btdata.itemSize = datasize; - btdata.itemCount = 1; - attrdata.recordType = kHFSPlusAttrInlineData; - attrdata.reserved[0] = 0; - attrdata.reserved[1] = 0; - attrdata.attrSize = 2; - attrdata.attrData[0] = 0; - attrdata.attrData[1] = 0; - - /* Insert the attribute. */ - result = BTInsertRecord(btfile, iterator, &btdata, datasize); - if (result == btExists) - result = 0; - } - (void) BTFlushPath(btfile); - - hfs_systemfile_unlock(hfsmp, lockflags); - - /* Finish the transaction of our changes. */ - hfs_end_transaction(hfsmp); - - /* Update the state in the mount point */ - hfs_lock_mount (hfsmp); - if (state == 0) { - hfsmp->hfs_flags &= ~HFS_XATTR_EXTENTS; - } else { - hfsmp->hfs_flags |= HFS_XATTR_EXTENTS; - } - hfs_unlock_mount (hfsmp); - -exit: - if (iterator) { - FREE(iterator, M_TEMP); - } - return MacToVFSError(result); -} - - -/* - * hfs_attrkeycompare - compare two attribute b-tree keys. - * - * The name portion of the key is compared using a 16-bit binary comparison. - * This is called from the b-tree code. 
- */ -__private_extern__ -int -hfs_attrkeycompare(HFSPlusAttrKey *searchKey, HFSPlusAttrKey *trialKey) -{ - u_int32_t searchFileID, trialFileID; - int result; - - searchFileID = searchKey->fileID; - trialFileID = trialKey->fileID; - result = 0; - - if (searchFileID > trialFileID) { - ++result; - } else if (searchFileID < trialFileID) { - --result; - } else { - u_int16_t * str1 = &searchKey->attrName[0]; - u_int16_t * str2 = &trialKey->attrName[0]; - int length1 = searchKey->attrNameLen; - int length2 = trialKey->attrNameLen; - u_int16_t c1, c2; - int length; - - if (length1 < length2) { - length = length1; - --result; - } else if (length1 > length2) { - length = length2; - ++result; - } else { - length = length1; - } - - while (length--) { - c1 = *(str1++); - c2 = *(str2++); - - if (c1 > c2) { - result = 1; - break; - } - if (c1 < c2) { - result = -1; - break; - } - } - if (result) - return (result); - /* - * Names are equal; compare startBlock - */ - if (searchKey->startBlock == trialKey->startBlock) { - return (0); - } else { - return (searchKey->startBlock < trialKey->startBlock ? 
-1 : 1); - } - } - - return result; -} - - -/* - * hfs_buildattrkey - build an Attribute b-tree key - */ -__private_extern__ -int -hfs_buildattrkey(u_int32_t fileID, const char *attrname, HFSPlusAttrKey *key) -{ - int result = 0; - size_t unicodeBytes = 0; - - if (attrname != NULL) { - /* - * Convert filename from UTF-8 into Unicode - */ - result = utf8_decodestr((const u_int8_t *)attrname, strlen(attrname), key->attrName, - &unicodeBytes, sizeof(key->attrName), 0, 0); - if (result) { - if (result != ENAMETOOLONG) - result = EINVAL; /* name has invalid characters */ - return (result); - } - key->attrNameLen = unicodeBytes / sizeof(UniChar); - key->keyLength = kHFSPlusAttrKeyMinimumLength + unicodeBytes; - } else { - key->attrNameLen = 0; - key->keyLength = kHFSPlusAttrKeyMinimumLength; - } - key->pad = 0; - key->fileID = fileID; - key->startBlock = 0; - - return (0); - } - -/* - * getnodecount - calculate starting node count for attributes b-tree. - */ -static int -getnodecount(struct hfsmount *hfsmp, size_t nodesize) -{ - u_int64_t freebytes; - u_int64_t calcbytes; - - /* - * 10.4: Scale base on current catalog file size (20 %) up to 20 MB. - * 10.5: Attempt to be as big as the catalog clump size. - * - * Use no more than 10 % of the remaining free space. - */ - freebytes = (u_int64_t)hfs_freeblks(hfsmp, 0) * (u_int64_t)hfsmp->blockSize; - - calcbytes = MIN(hfsmp->hfs_catalog_cp->c_datafork->ff_size / 5, 20 * 1024 * 1024); - - calcbytes = MAX(calcbytes, hfsmp->hfs_catalog_cp->c_datafork->ff_clumpsize); - - calcbytes = MIN(calcbytes, freebytes / 10); - - return (MAX(2, (int)(calcbytes / nodesize))); -} - - -/* - * getmaxinlineattrsize - calculate maximum inline attribute size. - * - * This yields 3,802 bytes for an 8K node size. 
- */ -static size_t -getmaxinlineattrsize(struct vnode * attrvp) -{ - struct BTreeInfoRec btinfo; - size_t nodesize = ATTRIBUTE_FILE_NODE_SIZE; - size_t maxsize; - - if (attrvp != NULL) { - (void) hfs_lock(VTOC(attrvp), HFS_SHARED_LOCK, HFS_LOCK_DEFAULT); - if (BTGetInformation(VTOF(attrvp), 0, &btinfo) == 0) - nodesize = btinfo.nodeSize; - hfs_unlock(VTOC(attrvp)); - } - maxsize = nodesize; - maxsize -= sizeof(BTNodeDescriptor); /* minus node descriptor */ - maxsize -= 3 * sizeof(u_int16_t); /* minus 3 index slots */ - maxsize /= 2; /* 2 key/rec pairs minumum */ - maxsize -= sizeof(HFSPlusAttrKey); /* minus maximum key size */ - maxsize -= sizeof(HFSPlusAttrData) - 2; /* minus data header */ - maxsize &= 0xFFFFFFFE; /* multiple of 2 bytes */ - - return (maxsize); -} - -/* - * Initialize vnode for attribute data I/O. - * - * On success, - * - returns zero - * - the attrdata vnode is initialized as hfsmp->hfs_attrdata_vp - * - an iocount is taken on the attrdata vnode which exists - * for the entire duration of the mount. 
It is only dropped - * during unmount - * - the attrdata cnode is not locked - * - * On failure, - * - returns non-zero value - * - the caller does not have to worry about any locks or references - */ -int init_attrdata_vnode(struct hfsmount *hfsmp) -{ - vnode_t vp; - int result = 0; - struct cat_desc cat_desc; - struct cat_attr cat_attr; - struct cat_fork cat_fork; - int newvnode_flags = 0; - - bzero(&cat_desc, sizeof(cat_desc)); - cat_desc.cd_parentcnid = kHFSRootParentID; - cat_desc.cd_nameptr = (const u_int8_t *)hfs_attrdatafilename; - cat_desc.cd_namelen = strlen(hfs_attrdatafilename); - cat_desc.cd_cnid = kHFSAttributeDataFileID; - /* Tag vnode as system file, note that we can still use cluster I/O */ - cat_desc.cd_flags |= CD_ISMETA; - - bzero(&cat_attr, sizeof(cat_attr)); - cat_attr.ca_linkcount = 1; - cat_attr.ca_mode = S_IFREG; - cat_attr.ca_fileid = cat_desc.cd_cnid; - cat_attr.ca_blocks = hfsmp->totalBlocks; - - /* - * The attribute data file is a virtual file that spans the - * entire file system space. - * - * Each extent-based attribute occupies a unique portion of - * in this virtual file. The cluster I/O is done using actual - * allocation block offsets so no additional mapping is needed - * for the VNOP_BLOCKMAP call. - * - * This approach allows the attribute data to be cached without - * incurring the high cost of using a separate vnode per attribute. - * - * Since we need to acquire the attribute b-tree file lock anyways, - * the virtual file doesn't introduce any additional serialization. 
- */ - bzero(&cat_fork, sizeof(cat_fork)); - cat_fork.cf_size = (u_int64_t)hfsmp->totalBlocks * (u_int64_t)hfsmp->blockSize; - cat_fork.cf_blocks = hfsmp->totalBlocks; - cat_fork.cf_extents[0].startBlock = 0; - cat_fork.cf_extents[0].blockCount = cat_fork.cf_blocks; - - result = hfs_getnewvnode(hfsmp, NULL, NULL, &cat_desc, 0, &cat_attr, - &cat_fork, &vp, &newvnode_flags); - if (result == 0) { - hfsmp->hfs_attrdata_vp = vp; - hfs_unlock(VTOC(vp)); - } - return (result); -} - -/* - * Read an extent based attribute. - */ -static int -read_attr_data(struct hfsmount *hfsmp, uio_t uio, size_t datasize, HFSPlusExtentDescriptor *extents) -{ - vnode_t evp = hfsmp->hfs_attrdata_vp; - int bufsize; - int64_t iosize; - int attrsize; - int blksize; - int i; - int result = 0; - - hfs_lock_truncate(VTOC(evp), HFS_SHARED_LOCK, HFS_LOCK_DEFAULT); - - bufsize = (int)uio_resid(uio); - attrsize = (int)datasize; - blksize = (int)hfsmp->blockSize; - - /* - * Read the attribute data one extent at a time. - * For the typical case there is only one extent. - */ - for (i = 0; (attrsize > 0) && (bufsize > 0) && (extents[i].startBlock != 0); ++i) { - iosize = extents[i].blockCount * blksize; - iosize = MIN(iosize, attrsize); - iosize = MIN(iosize, bufsize); - uio_setresid(uio, iosize); - uio_setoffset(uio, (u_int64_t)extents[i].startBlock * (u_int64_t)blksize); - - result = cluster_read(evp, uio, VTOF(evp)->ff_size, IO_SYNC | IO_UNIT); - -#if HFS_XATTR_VERBOSE - printf("hfs: read_attr_data: cr iosize %lld [%d, %d] (%d)\n", - iosize, extents[i].startBlock, extents[i].blockCount, result); -#endif - if (result) - break; - attrsize -= iosize; - bufsize -= iosize; - } - uio_setresid(uio, bufsize); - uio_setoffset(uio, datasize); - - hfs_unlock_truncate(VTOC(evp), HFS_LOCK_DEFAULT); - return (result); -} - -/* - * Write an extent based attribute. 
- */ -static int -write_attr_data(struct hfsmount *hfsmp, uio_t uio, size_t datasize, HFSPlusExtentDescriptor *extents) -{ - vnode_t evp = hfsmp->hfs_attrdata_vp; - off_t filesize; - int bufsize; - int attrsize; - int64_t iosize; - int blksize; - int i; - int result = 0; - - hfs_lock_truncate(VTOC(evp), HFS_SHARED_LOCK, HFS_LOCK_DEFAULT); - - bufsize = uio_resid(uio); - attrsize = (int) datasize; - blksize = (int) hfsmp->blockSize; - filesize = VTOF(evp)->ff_size; - - /* - * Write the attribute data one extent at a time. - */ - for (i = 0; (attrsize > 0) && (bufsize > 0) && (extents[i].startBlock != 0); ++i) { - iosize = extents[i].blockCount * blksize; - iosize = MIN(iosize, attrsize); - iosize = MIN(iosize, bufsize); - uio_setresid(uio, iosize); - uio_setoffset(uio, (u_int64_t)extents[i].startBlock * (u_int64_t)blksize); - - result = cluster_write(evp, uio, filesize, filesize, filesize, - (off_t) 0, IO_SYNC | IO_UNIT); -#if HFS_XATTR_VERBOSE - printf("hfs: write_attr_data: cw iosize %lld [%d, %d] (%d)\n", - iosize, extents[i].startBlock, extents[i].blockCount, result); -#endif - if (result) - break; - attrsize -= iosize; - bufsize -= iosize; - } - uio_setresid(uio, bufsize); - uio_setoffset(uio, datasize); - - hfs_unlock_truncate(VTOC(evp), HFS_LOCK_DEFAULT); - return (result); -} - -/* - * Allocate blocks for an extent based attribute. 
- */ -static int -alloc_attr_blks(struct hfsmount *hfsmp, size_t attrsize, size_t extentbufsize, HFSPlusExtentDescriptor *extents, int *blocks) -{ - int blkcnt; - int startblk; - int lockflags; - int i; - int maxextents; - int result = 0; - - startblk = hfsmp->hfs_metazone_end; - blkcnt = howmany(attrsize, hfsmp->blockSize); - if (blkcnt > (int)hfs_freeblks(hfsmp, 0)) { - return (ENOSPC); - } - *blocks = blkcnt; - maxextents = extentbufsize / sizeof(HFSPlusExtentDescriptor); - - lockflags = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_EXCLUSIVE_LOCK); - - for (i = 0; (blkcnt > 0) && (i < maxextents); i++) { - /* Try allocating and see if we find something decent */ - result = BlockAllocate(hfsmp, startblk, blkcnt, blkcnt, 0, - &extents[i].startBlock, &extents[i].blockCount); - /* - * If we couldn't find anything, then re-try the allocation but allow - * journal flushes. - */ - if (result == dskFulErr) { - result = BlockAllocate(hfsmp, startblk, blkcnt, blkcnt, HFS_ALLOC_FLUSHTXN, - &extents[i].startBlock, &extents[i].blockCount); - } - - -#if HFS_XATTR_VERBOSE - printf("hfs: alloc_attr_blks: BA blkcnt %d [%d, %d] (%d)\n", - blkcnt, extents[i].startBlock, extents[i].blockCount, result); -#endif - if (result) { - extents[i].startBlock = 0; - extents[i].blockCount = 0; - break; - } - blkcnt -= extents[i].blockCount; - startblk = extents[i].startBlock + extents[i].blockCount; - } - /* - * If it didn't fit in the extents buffer then bail. - */ - if (blkcnt) { - result = ENOSPC; - -#if HFS_XATTR_VERBOSE - printf("hfs: alloc_attr_blks: unexpected failure, %d blocks unallocated\n", blkcnt); -#endif - for (; i >= 0; i--) { - if ((blkcnt = extents[i].blockCount) != 0) { - (void) BlockDeallocate(hfsmp, extents[i].startBlock, blkcnt, 0); - extents[i].startBlock = 0; - extents[i].blockCount = 0; - } - } - } - - hfs_systemfile_unlock(hfsmp, lockflags); - return MacToVFSError(result); -} - -/* - * Release blocks from an extent based attribute. 
- */ -static void -free_attr_blks(struct hfsmount *hfsmp, int blkcnt, HFSPlusExtentDescriptor *extents) -{ - vnode_t evp = hfsmp->hfs_attrdata_vp; - int remblks = blkcnt; - int lockflags; - int i; - - lockflags = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_EXCLUSIVE_LOCK); - - for (i = 0; (remblks > 0) && (extents[i].blockCount != 0); i++) { - if (extents[i].blockCount > (u_int32_t)blkcnt) { -#if HFS_XATTR_VERBOSE - printf("hfs: free_attr_blks: skipping bad extent [%d, %d]\n", - extents[i].startBlock, extents[i].blockCount); -#endif - extents[i].blockCount = 0; - continue; - } - if (extents[i].startBlock == 0) { - break; - } - (void)BlockDeallocate(hfsmp, extents[i].startBlock, extents[i].blockCount, 0); - remblks -= extents[i].blockCount; - extents[i].startBlock = 0; - extents[i].blockCount = 0; - -#if HFS_XATTR_VERBOSE - printf("hfs: free_attr_blks: BlockDeallocate [%d, %d]\n", - extents[i].startBlock, extents[i].blockCount); -#endif - /* Discard any resident pages for this block range. 
*/ - if (evp) { - off_t start, end; - - start = (u_int64_t)extents[i].startBlock * (u_int64_t)hfsmp->blockSize; - end = start + (u_int64_t)extents[i].blockCount * (u_int64_t)hfsmp->blockSize; - (void) ubc_msync(hfsmp->hfs_attrdata_vp, start, end, &start, UBC_INVALIDATE); - } - } - - hfs_systemfile_unlock(hfsmp, lockflags); -} - -static int -has_overflow_extents(HFSPlusForkData *forkdata) -{ - u_int32_t blocks; - - if (forkdata->extents[7].blockCount == 0) - return (0); - - blocks = forkdata->extents[0].blockCount + - forkdata->extents[1].blockCount + - forkdata->extents[2].blockCount + - forkdata->extents[3].blockCount + - forkdata->extents[4].blockCount + - forkdata->extents[5].blockCount + - forkdata->extents[6].blockCount + - forkdata->extents[7].blockCount; - - return (forkdata->totalBlocks > blocks); -} - -static int -count_extent_blocks(int maxblks, HFSPlusExtentRecord extents) -{ - int blocks; - int i; - - for (i = 0, blocks = 0; i < kHFSPlusExtentDensity; ++i) { - /* Ignore obvious bogus extents. */ - if (extents[i].blockCount > (u_int32_t)maxblks) - continue; - if (extents[i].startBlock == 0 || extents[i].blockCount == 0) - break; - blocks += extents[i].blockCount; - } - return (blocks); -} - diff --git a/bsd/hfs/hfscommon/BTree/BTree.c b/bsd/hfs/hfscommon/BTree/BTree.c deleted file mode 100644 index a8a057e64..000000000 --- a/bsd/hfs/hfscommon/BTree/BTree.c +++ /dev/null @@ -1,2101 +0,0 @@ -/* - * Copyright (c) 2000-2008, 2014 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - File: BTree.c - - Contains: Implementation of public interface routines for B-tree manager. - - Version: HFS Plus 1.0 - - Written by: Gordon Sheridan and Bill Bruffey - - Copyright: � 1992-1999 by Apple Computer, Inc., all rights reserved. - - File Ownership: - - DRI: Don Brady - - Other Contact: Mark Day - - Technology: File Systems - - Writers: - - (msd) Mark Day - (DSH) Deric Horn - (djb) Don Brady - - Change History (most recent first): - 9/22/99 ser Added routines BTGetLastSync and BTSetLastSync - 6/1/99 djb Sync up with Mac OS 8.6. - 6/30/98 djb In BTOpenPath make sure nodes are contiguous on disk (radar #2249539). - 4/15/98 djb In BTOpenPath need to clear nodeRec.buffer if GetBlockProc fails. - 4/11/98 djb Add RequireFileLock checking to all external entry points. - - 03/23/98 djb In BTOpenPath use kTrashBlock option when releasing the header so - that we get a full node when we call GetNode. 
- - 12/12/97 djb Radar #2202682, BTIterateRecord with kBTreeCurrentRecord was not - checking if we had a record and could call BlockMove with an - uninitialize source pointer (causing a bus error). - 10/24/97 msd In BTIterateRecord, when moving to the previous or next record - and we have to move to another node, see if we need to release - the node about to be "shifted out" (opposite sibling of the - direction we need to move). - 7/25/97 DSH BTSearchRecord now takes a heuristicHint, nodeNum, and tries it - before calling SearchBTree - 7/24/97 djb GetBlockProc now take a file refnum instead of an FCB ptr. - 7/22/97 djb Move trace points from BTreeWrapper.c to here. - 7/21/97 djb LogEndTime now takes an error code. - 7/16/97 DSH FilesInternal.i renamed FileMgrInternal.i to avoid name - collision - 5/19/97 djb Add summary traces to BTIterateRecord. - 4/23/97 djb first checked in - - 2/19/97 djb Enable variable sized index keys for HFS+ volumes. Added node - cache to support nodes larger than 512 bytes. - 1/27/97 djb Calls to InsertTree and DeleteTree are now recursive (to support - variable sized index keys). - 1/13/97 djb Added support for getting current record to BTIterateRecord. - 1/6/97 djb Initialize "BigKeys" attribute in BTOpen. - 1/3/97 djb Added support for large keys. - 12/23/96 djb On exit map fsBTEmptyErr and fsBTEndOfIterationErr to - fsBTRecordNotFoundErr. - 12/19/96 djb first checked in - - History applicable to original Scarecrow Design: - - <13> 10/25/96 ser Changing for new VFPI - <12> 10/18/96 ser Converting over VFPI changes - <11> 9/17/96 dkh More BTree statistics. Modified hint checks to not bail out when - an error is returned from GetNode. - <10> 9/16/96 dkh Revised BTree statistics. - <9> 8/23/96 dkh Remove checks for multiple paths to BTree file. Need to add - equivalent mechanism later. - <8> 6/20/96 dkh Radar #1358740. Switch from using Pools to debug MemAllocators. 
- <7> 3/14/96 jev Fix BTreeSetRecord, recordFound was not set for the case of a - simple replace causing the leafRecords count to get bumped even - though we didn't have to add a record. - <6> 3/1/96 prp Fix lint problems. Bug in BTSetRecord that does not initialize - recordFound. - <5> 1/22/96 dkh Add #include Memory.h - <4> 1/10/96 msd Use the real function names from Math64.i. - <3> 1/4/96 jev Fix BTItererateRecord for the condition when the iterator - position routine does not find the record and we are looking for - the next record. In such a case, if the node's forrward link is - non-zero, we have to keep iterating next and not return - fsBTEndOfIterationErr error. - <2> 12/7/95 dkh D10E2 build. Changed usage of Ref data type to LogicalAddress. - <1> 10/18/95 rst Moved from Scarecrow project. - - <24> 7/18/95 mbb Change MoveData & ClearBytes to BlockMoveData & BlockZero. - <23> 1/31/95 prp GetBlockProc interface uses a 64 bit node number. - <22> 1/12/95 wjk Adopt Model FileSystem changes in D5. - <21> 11/16/94 prp Add IsItAHint routine and use it whenever hint's node number was - used for testing. - <20> 11/10/94 prp BTGetInfo name collides with the same name in FileManagerPriv.i. - Change it to BTGetInformation. - <19> 9/30/94 prp Get in sync with D2 interface changes. - <18> 7/22/94 wjk Convert to the new set of header files. - <17> 12/9/93 wjk Cleanup usage of char, Byte, int8, UInt8, etc. - <16> 12/2/93 wjk Move from Makefiles to BuildFiles. Fit into the ModernOS and - NRCmds environments. - <15> 11/30/93 wjk Move from Makefiles to BuildFiles. Fit into the ModernOS and - NRCmds environments. - <14> 9/30/93 gs Rename E_NoGetNodeProc and E_NoReleaseNodeProc to - E_NoXxxxBlockProc. - <13> 8/31/93 prp Use Set64U instead of Set64. - <12> 8/16/93 prp In BTSearchRecord, if the input hint found the node and record, - set the local nodeNum variable correctly so that the resultant - iterator gets set correctly. 
- <11> 7/1/93 gs Fix bug in BTIterateRecord related to kBTreePrevRecord - operation. - <10> 6/2/93 gs Update for changes to FSErrors.h and add some comments. - <9> 5/24/93 gs Fix bug in BTInsert/Set/ReplaceRecord which didn't set node hint - properly in some cases. - <8> 5/24/93 gs Do NOT map fsBTEmptyErr to fsBTRecordNotFoundErr in BTSearchRecord. - <7> 5/24/93 gs Rename BTFlush to BTFlushPath. - <6> 5/21/93 gs Add hint optimization to Set/Replace routines. - <5> 5/10/93 gs Remove Panic from BTInitialize for small logicalEOF. Implement - Insert, Set, Replace, and Delete. - <4> 3/23/93 gs Finish BTInitialize. - <3> 2/8/93 gs Implement BTSearchRecord and BTIterateRecord. - <2> 12/8/92 gs Implement Open and Close routines. - <1> 11/15/92 gs first checked in - -*/ - -#include "../headers/BTreesPrivate.h" -#include "../../hfs_btreeio.h" - -/* - * The amount that the BTree header leaf count can be wrong before we assume - * it is in an infinite loop. - */ -#define kNumLeafRecSlack 10 - -//////////////////////////////////// Globals //////////////////////////////////// - - -/////////////////////////// BTree Module Entry Points /////////////////////////// - - - -/*------------------------------------------------------------------------------- -Routine: BTOpenPath - Open a file for access as a B*Tree. - -Function: Create BTree control block for a file, if necessary. Validates the - file to be sure it looks like a BTree file. 
- - -Input: filePtr - pointer to file to open as a B-tree - keyCompareProc - pointer to client's KeyCompare function - -Result: noErr - success - paramErr - required ptr was nil - fsBTInvalidFileErr - - memFullErr - - != noErr - failure --------------------------------------------------------------------------------*/ - -OSStatus BTOpenPath(FCB *filePtr, KeyCompareProcPtr keyCompareProc) -{ - OSStatus err; - BTreeControlBlockPtr btreePtr; - BTHeaderRec *header; - NodeRec nodeRec; - - ////////////////////// Preliminary Error Checking /////////////////////////// - - if ( filePtr == nil ) - { - return paramErr; - } - - /* - * Subsequent opens allow key compare proc to be changed. - */ - if ( filePtr->fcbBTCBPtr != nil && keyCompareProc != nil) { - btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr; - btreePtr->keyCompareProc = keyCompareProc; - return noErr; - } - - if ( filePtr->fcbEOF < kMinNodeSize ) - return fsBTInvalidFileErr; - - - //////////////////////// Allocate Control Block ///////////////////////////// - - btreePtr = (BTreeControlBlock*) NewPtrSysClear( sizeof( BTreeControlBlock ) ); - if (btreePtr == nil) - { - Panic ("BTOpen: no memory for btreePtr."); - return memFullErr; - } - - btreePtr->getBlockProc = GetBTreeBlock; - btreePtr->releaseBlockProc = ReleaseBTreeBlock; - btreePtr->setEndOfForkProc = ExtendBTreeFile; - btreePtr->keyCompareProc = keyCompareProc; - - /////////////////////////// Read Header Node //////////////////////////////// - - nodeRec.buffer = nil; // so we can call ReleaseNode - btreePtr->fileRefNum = GetFileRefNumFromFCB(filePtr); - filePtr->fcbBTCBPtr = (Ptr) btreePtr; // attach btree cb to file - - /* Prefer doing I/O a physical block at a time */ - nodeRec.blockSize = VTOHFS(btreePtr->fileRefNum)->hfs_physical_block_size; - - /* Start with the allocation block size for regular files. 
*/ - if (FTOC(filePtr)->c_fileid >= kHFSFirstUserCatalogNodeID) - { - nodeRec.blockSize = FCBTOVCB(filePtr)->blockSize; - } - REQUIRE_FILE_LOCK(btreePtr->fileRefNum, false); - - // it is now safe to call M_ExitOnError (err) - - err = SetBTreeBlockSize (btreePtr->fileRefNum, nodeRec.blockSize, 1); - M_ExitOnError (err); - - - err = GetBTreeBlock(btreePtr->fileRefNum, - kHeaderNodeNum, - kGetBlock, - &nodeRec ); - if (err != noErr) - { - nodeRec.buffer = nil; - nodeRec.blockHeader = nil; - Panic("BTOpen: getNodeProc returned error getting header node."); - goto ErrorExit; - } - ++btreePtr->numGetNodes; - header = (BTHeaderRec*) ((uintptr_t)nodeRec.buffer + sizeof(BTNodeDescriptor)); - - - ///////////////////////////// verify header ///////////////////////////////// - - err = VerifyHeader (filePtr, header); - M_ExitOnError (err); - - - ///////////////////// Initalize fields from header ////////////////////////// - - PanicIf ( (FCBTOVCB(filePtr)->vcbSigWord != 0x4244) && (header->nodeSize == 512), " BTOpenPath: wrong node size for HFS+ volume!"); // 0x4244 = 'BD' - - btreePtr->treeDepth = header->treeDepth; - btreePtr->rootNode = header->rootNode; - btreePtr->leafRecords = header->leafRecords; - btreePtr->firstLeafNode = header->firstLeafNode; - btreePtr->lastLeafNode = header->lastLeafNode; - btreePtr->nodeSize = header->nodeSize; - btreePtr->maxKeyLength = header->maxKeyLength; - btreePtr->totalNodes = header->totalNodes; - btreePtr->freeNodes = header->freeNodes; - if (FTOC(filePtr)->c_fileid >= kHFSFirstUserCatalogNodeID) - filePtr->ff_clumpsize = header->clumpSize; - btreePtr->btreeType = header->btreeType; - - btreePtr->keyCompareType = header->keyCompareType; - - btreePtr->attributes = header->attributes; - - if ( btreePtr->maxKeyLength > 40 ) - btreePtr->attributes |= (kBTBigKeysMask + kBTVariableIndexKeysMask); //�� we need a way to save these attributes - - /////////////////////// Initialize dynamic fields /////////////////////////// - - btreePtr->version = 
kBTreeVersion; - btreePtr->flags = 0; - btreePtr->writeCount = 1; - - /////////////////////////// Check Header Node /////////////////////////////// - - // set kBadClose attribute bit, and UpdateNode - - /* b-tree node size must be at least as big as the logical block size */ - if (btreePtr->nodeSize < VTOHFS(btreePtr->fileRefNum)->hfs_logical_block_size) - { - /* - * If this tree has any records or the media is writeable then - * we cannot mount using the current physical block size. - */ - if (btreePtr->leafRecords > 0 || - VTOHFS(btreePtr->fileRefNum)->hfs_flags & HFS_WRITEABLE_MEDIA) - { - err = fsBTBadNodeSize; - goto ErrorExit; - } - } - - /* - * If the actual node size is different than the amount we read, - * then release and trash this block, and re-read with the correct - * node size. - */ - if ( btreePtr->nodeSize != nodeRec.blockSize ) - { - err = SetBTreeBlockSize (btreePtr->fileRefNum, btreePtr->nodeSize, 32); - M_ExitOnError (err); - - /* - * Need to use kTrashBlock option to force the - * buffer cache to read the entire node - */ - err = ReleaseBTreeBlock(btreePtr->fileRefNum, &nodeRec, kTrashBlock); - ++btreePtr->numReleaseNodes; - M_ExitOnError (err); - - err = GetNode (btreePtr, kHeaderNodeNum, 0, &nodeRec ); - M_ExitOnError (err); - } - - //�� total nodes * node size <= LEOF? - - - err = ReleaseNode (btreePtr, &nodeRec); - M_ExitOnError (err); - - /* - * Under Mac OS, b-tree nodes can be non-contiguous on disk when the - * allocation block size is smaller than the b-tree node size. - * - * If journaling is turned on for this volume we can't deal with this - * situation and so we bail out. If journaling isn't on it's ok as - * hfs_strategy_fragmented() deals with it. Journaling can't support - * this because it assumes that if you give it a block that it's - * contiguous on disk. 
- */ - if ( FCBTOHFS(filePtr)->jnl && !NodesAreContiguous(FCBTOVCB(filePtr), filePtr, btreePtr->nodeSize) ) { - return fsBTInvalidNodeErr; - } - - //////////////////////////////// Success //////////////////////////////////// - - //�� align LEOF to multiple of node size? - just on close - - return noErr; - - - /////////////////////// Error - Clean up and Exit /////////////////////////// - -ErrorExit: - - filePtr->fcbBTCBPtr = nil; - (void) ReleaseNode (btreePtr, &nodeRec); - DisposePtr( (Ptr) btreePtr ); - - return err; -} - - - -/*------------------------------------------------------------------------------- -Routine: BTClosePath - Flush BTree Header and Deallocate Memory for BTree. - -Function: Flush the BTreeControlBlock fields to header node, and delete BTree control - block and key descriptor associated with the file if filePtr is last - path of type kBTreeType ('btre'). - - -Input: filePtr - pointer to file to delete BTree control block for. - -Result: noErr - success - fsBTInvalidFileErr - - != noErr - failure --------------------------------------------------------------------------------*/ - -OSStatus BTClosePath (FCB *filePtr) -{ - OSStatus err; - BTreeControlBlockPtr btreePtr; - - btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr; - - if (btreePtr == nil) - return fsBTInvalidFileErr; - - REQUIRE_FILE_LOCK(btreePtr->fileRefNum, false); - - ////////////////////// Check for other BTree Paths ////////////////////////// - - btreePtr->attributes &= ~kBTBadCloseMask; // clear "bad close" attribute bit - err = UpdateHeader (btreePtr, true); - M_ExitOnError (err); - - DisposePtr( (Ptr) btreePtr ); - filePtr->fcbBTCBPtr = nil; - - return noErr; - - /////////////////////// Error - Clean Up and Exit /////////////////////////// - -ErrorExit: - - return err; -} - - - -/*------------------------------------------------------------------------------- -Routine: BTSearchRecord - Search BTree for a record with a matching key. 
- -Function: Search for position in B*Tree indicated by searchKey. If a valid node hint - is provided, it will be searched first, then SearchTree will be called. - If a BTreeIterator is provided, it will be set to the position found as - a result of the search. If a record exists at that position, and a BufferDescriptor - is supplied, the record will be copied to the buffer (as much as will fit), - and recordLen will be set to the length of the record. - - If an error other than fsBTRecordNotFoundErr occurs, the BTreeIterator, if any, - is invalidated, and recordLen is set to 0. - - -Input: pathPtr - pointer to path for BTree file. - searchKey - pointer to search key to match. - hintPtr - pointer to hint (may be nil) - -Output: record - pointer to BufferDescriptor containing record - recordLen - length of data at recordPtr - iterator - pointer to BTreeIterator indicating position result of search - -Result: noErr - success, record contains copy of record found - fsBTRecordNotFoundErr - record was not found, no data copied - fsBTInvalidFileErr - no BTreeControlBlock is allocated for the fork - fsBTInvalidKeyLengthErr - - != noErr - failure --------------------------------------------------------------------------------*/ - -OSStatus BTSearchRecord (FCB *filePtr, - BTreeIterator *searchIterator, - FSBufferDescriptor *record, - u_int16_t *recordLen, - BTreeIterator *resultIterator ) -{ - OSStatus err; - BTreeControlBlockPtr btreePtr; - TreePathTable treePathTable; - u_int32_t nodeNum; - BlockDescriptor node; - u_int16_t index; - BTreeKeyPtr keyPtr; - RecordPtr recordPtr; - u_int16_t len; - Boolean foundRecord; - Boolean validHint; - - if (filePtr == nil) - { - return paramErr; - } - - if (searchIterator == nil) - { - return paramErr; - } - - node.buffer = nil; - node.blockHeader = nil; - - btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr; - if (btreePtr == nil) - { - return fsBTInvalidFileErr; - } - - REQUIRE_FILE_LOCK(btreePtr->fileRefNum, true); - - foundRecord 
= false; - - ////////////////////////////// Take A Hint ////////////////////////////////// - - err = IsItAHint (btreePtr, searchIterator, &validHint); - M_ExitOnError (err); - - if (validHint) - { - nodeNum = searchIterator->hint.nodeNum; - - err = GetNode (btreePtr, nodeNum, kGetNodeHint, &node); - if( err == noErr ) - { - if ( ((BTNodeDescriptor*) node.buffer)->kind == kBTLeafNode && - ((BTNodeDescriptor*) node.buffer)->numRecords > 0 ) - { - foundRecord = SearchNode (btreePtr, node.buffer, &searchIterator->key, &index); - - //�� if !foundRecord, we could still skip tree search if ( 0 < index < numRecords ) - } - - if (foundRecord == false) - { - err = ReleaseNode (btreePtr, &node); - M_ExitOnError (err); - } - else - { - ++btreePtr->numValidHints; - } - } - - if( foundRecord == false ) - (void) BTInvalidateHint( searchIterator ); - } - - - //////////////////////////// Search The Tree //////////////////////////////// - - if (foundRecord == false) - { - err = SearchTree ( btreePtr, &searchIterator->key, treePathTable, &nodeNum, &node, &index); - switch (err) - { - case noErr: - foundRecord = true; - break; - case fsBTRecordNotFoundErr: - break; - default: - goto ErrorExit; - } - } - - - //////////////////////////// Get the Record ///////////////////////////////// - - if (foundRecord == true) - { - //XXX Should check for errors! Or BlockMove could choke on recordPtr!!! 
- GetRecordByIndex (btreePtr, node.buffer, index, &keyPtr, &recordPtr, &len); - - if (recordLen != nil) *recordLen = len; - - if (record != nil) - { - ByteCount recordSize; - - recordSize = record->itemCount * record->itemSize; - - if (len > recordSize) len = recordSize; - - BlockMoveData (recordPtr, record->bufferAddress, len); - } - } - - - /////////////////////// Success - Update Iterator /////////////////////////// - - if (resultIterator != nil) - { - if (foundRecord) { - resultIterator->hint.writeCount = btreePtr->writeCount; - resultIterator->hint.nodeNum = nodeNum; - resultIterator->hint.index = index; - } -#if DEBUG_BUILD - resultIterator->hint.reserved1 = 0; - resultIterator->hint.reserved2 = 0; - resultIterator->version = 0; - resultIterator->reserved = 0; -#endif - // copy the key in the BTree when found rather than searchIterator->key to get proper case/diacriticals - if (foundRecord == true) - BlockMoveData ((Ptr)keyPtr, (Ptr)&resultIterator->key, CalcKeySize(btreePtr, keyPtr)); - else - BlockMoveData ((Ptr)&searchIterator->key, (Ptr)&resultIterator->key, CalcKeySize(btreePtr, &searchIterator->key)); - } - - err = ReleaseNode (btreePtr, &node); - M_ExitOnError (err); - - if (foundRecord == false) return fsBTRecordNotFoundErr; - else return noErr; - - - /////////////////////// Error - Clean Up and Exit /////////////////////////// - -ErrorExit: - - if (recordLen != nil) - *recordLen = 0; - - if (resultIterator != nil) - { - resultIterator->hint.writeCount = 0; - resultIterator->hint.nodeNum = 0; - resultIterator->hint.index = 0; - resultIterator->hint.reserved1 = 0; - resultIterator->hint.reserved2 = 0; - - resultIterator->version = 0; - resultIterator->reserved = 0; - resultIterator->key.length16 = 0; // zero out two bytes to cover both types of keys - } - - if ( err == fsBTEmptyErr ) - err = fsBTRecordNotFoundErr; - - return err; -} - - - -/*------------------------------------------------------------------------------- -Routine: BTIterateRecord - Find 
the first, next, previous, or last record. - -Function: Find the first, next, previous, or last record in the BTree - -Input: pathPtr - pointer to path iterate records for. - operation - iteration operation (first,next,prev,last) - iterator - pointer to iterator indicating start position - -Output: iterator - iterator is updated to indicate new position - newKeyPtr - pointer to buffer to copy key found by iteration - record - pointer to buffer to copy record found by iteration - recordLen - length of record - -Result: noErr - success - != noErr - failure --------------------------------------------------------------------------------*/ - -OSStatus BTIterateRecord (FCB *filePtr, - BTreeIterationOperation operation, - BTreeIterator *iterator, - FSBufferDescriptor *record, - u_int16_t *recordLen ) -{ - OSStatus err; - BTreeControlBlockPtr btreePtr; - BTreeKeyPtr keyPtr; - RecordPtr recordPtr; - u_int16_t len; - - Boolean foundRecord; - u_int32_t nodeNum; - - BlockDescriptor left, node, right; - u_int16_t index; - - - ////////////////////////// Priliminary Checks /////////////////////////////// - - left.buffer = nil; - left.blockHeader = nil; - right.buffer = nil; - right.blockHeader = nil; - node.buffer = nil; - node.blockHeader = nil; - - - if (filePtr == nil) - { - return paramErr; - } - - btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr; - if (btreePtr == nil) - { - return fsBTInvalidFileErr; //�� handle properly - } - - REQUIRE_FILE_LOCK(btreePtr->fileRefNum, true); - - if ((operation != kBTreeFirstRecord) && - (operation != kBTreeNextRecord) && - (operation != kBTreeCurrentRecord) && - (operation != kBTreePrevRecord) && - (operation != kBTreeLastRecord)) - { - err = fsInvalidIterationMovmentErr; - goto ErrorExit; - } - - /////////////////////// Find First or Last Record /////////////////////////// - - if ((operation == kBTreeFirstRecord) || (operation == kBTreeLastRecord)) - { - if (operation == kBTreeFirstRecord) nodeNum = btreePtr->firstLeafNode; - else 
nodeNum = btreePtr->lastLeafNode; - - if (nodeNum == 0) - { - err = fsBTEmptyErr; - goto ErrorExit; - } - - err = GetNode (btreePtr, nodeNum, 0, &node); - M_ExitOnError (err); - - if ( ((NodeDescPtr) node.buffer)->kind != kBTLeafNode || - ((NodeDescPtr) node.buffer)->numRecords <= 0 ) - { - err = ReleaseNode (btreePtr, &node); - M_ExitOnError (err); - - err = fsBTInvalidNodeErr; - printf ("hfs: BTIterateRecord() found invalid btree node on volume %s\n", FCBTOVCB(filePtr)->vcbVN); - hfs_mark_inconsistent(FCBTOVCB(filePtr), HFS_INCONSISTENCY_DETECTED); - goto ErrorExit; - } - - if (operation == kBTreeFirstRecord) index = 0; - else index = ((BTNodeDescriptor*) node.buffer)->numRecords - 1; - - goto CopyData; //�� is there a cleaner way? - } - - - //////////////////////// Find Iterator Position ///////////////////////////// - - // Not called for (operation == kBTreeFirstRecord || operation == kBTreeLastRecord) - err = FindIteratorPosition (btreePtr, iterator, - &left, &node, &right, &nodeNum, &index, &foundRecord); - M_ExitOnError (err); - - - ///////////////////// Find Next Or Previous Record ////////////////////////// - - if (operation == kBTreePrevRecord) - { - if (index > 0) - { - --index; - } - else - { - if (left.buffer == nil) - { - nodeNum = ((NodeDescPtr) node.buffer)->bLink; - if ( nodeNum > 0) - { - // BTree nodes are always grabbed in left to right order. - // Therefore release the current node before looking up the - // left node. 
- err = ReleaseNode(btreePtr, &node); - M_ExitOnError(err); - - // Look up the left node - err = GetNode (btreePtr, nodeNum, 0, &left); - M_ExitOnError (err); - - // Look up the current node again - err = GetRightSiblingNode (btreePtr, left.buffer, &node); - M_ExitOnError (err); - } else { - err = fsBTStartOfIterationErr; - goto ErrorExit; - } - } - // Before we stomp on "right", we'd better release it if needed - if (right.buffer != nil) { - err = ReleaseNode(btreePtr, &right); - M_ExitOnError(err); - } - right = node; - node = left; - left.buffer = nil; - index = ((NodeDescPtr) node.buffer)->numRecords -1; - } - } - else if (operation == kBTreeNextRecord) - { - if ((foundRecord != true) && - (((NodeDescPtr) node.buffer)->fLink == 0) && - (index == ((NodeDescPtr) node.buffer)->numRecords)) - { - err = fsBTEndOfIterationErr; - goto ErrorExit; - } - - // we did not find the record but the index is already positioned correctly - if ((foundRecord == false) && (index != ((NodeDescPtr) node.buffer)->numRecords)) - goto CopyData; - - // we found the record OR we have to look in the next node - if (index < ((NodeDescPtr) node.buffer)->numRecords -1) - { - ++index; - } - else - { - if (right.buffer == nil) - { - nodeNum = ((NodeDescPtr) node.buffer)->fLink; - if ( nodeNum > 0) - { - err = GetNode (btreePtr, nodeNum, 0, &right); - M_ExitOnError (err); - } else { - err = fsBTEndOfIterationErr; - goto ErrorExit; - } - } - // Before we stomp on "left", we'd better release it if needed - if (left.buffer != nil) { - err = ReleaseNode(btreePtr, &left); - M_ExitOnError(err); - } - left = node; - node = right; - right.buffer = nil; - index = 0; - } - } - else // operation == kBTreeCurrentRecord - { - // make sure we have something... 
- if ((foundRecord != true) && - (index >= ((NodeDescPtr) node.buffer)->numRecords)) - { - err = fsBTEndOfIterationErr; - goto ErrorExit; - } - } - - //////////////////// Copy Record And Update Iterator //////////////////////// - -CopyData: - - // added check for errors - err = GetRecordByIndex (btreePtr, node.buffer, index, &keyPtr, &recordPtr, &len); - M_ExitOnError (err); - - if (recordLen != nil) - *recordLen = len; - - if (record != nil) - { - ByteCount recordSize; - - recordSize = record->itemCount * record->itemSize; - - if (len > recordSize) len = recordSize; - - BlockMoveData (recordPtr, record->bufferAddress, len); - } - - if (iterator != nil) // first & last do not require iterator - { - iterator->hint.writeCount = btreePtr->writeCount; - iterator->hint.nodeNum = nodeNum; - iterator->hint.index = index; - iterator->hint.reserved1 = 0; - iterator->hint.reserved2 = 0; - - iterator->version = 0; - iterator->reserved = 0; - - /* SER - * Check for infinite loops by making sure we do not - * process more leaf records, than can possibly be (or the BTree header - * is seriously damaged)....a brute force method. 
- */ - if ((operation == kBTreeFirstRecord) || (operation == kBTreeLastRecord)) - iterator->hitCount = 1; - else if (operation != kBTreeCurrentRecord) - iterator->hitCount += 1; - /* Always use the highest max, in case the grows while iterating */ - iterator->maxLeafRecs = max(btreePtr->leafRecords, iterator->maxLeafRecs); - -#if 0 - if (iterator->hitCount > iterator->maxLeafRecs + kNumLeafRecSlack) - { - err = fsBTInvalidNodeErr; - printf ("hfs: BTIterateRecord() found invalid btree node on volume %s\n", FCBTOVCB(filePtr)->vcbVN); - hfs_mark_inconsistent(FCBTOVCB(filePtr), HFS_INCONSISTENCY_DETECTED); - goto ErrorExit; - } -#endif - - BlockMoveData ((Ptr)keyPtr, (Ptr)&iterator->key, CalcKeySize(btreePtr, keyPtr)); - } - - - ///////////////////////////// Release Nodes ///////////////////////////////// - - err = ReleaseNode (btreePtr, &node); - M_ExitOnError (err); - - if (left.buffer != nil) - { - err = ReleaseNode (btreePtr, &left); - M_ExitOnError (err); - } - - if (right.buffer != nil) - { - err = ReleaseNode (btreePtr, &right); - M_ExitOnError (err); - } - - return noErr; - - /////////////////////// Error - Clean Up and Exit /////////////////////////// - -ErrorExit: - - (void) ReleaseNode (btreePtr, &left); - (void) ReleaseNode (btreePtr, &node); - (void) ReleaseNode (btreePtr, &right); - - if (recordLen != nil) - *recordLen = 0; - - if (iterator != nil) - { - iterator->hint.writeCount = 0; - iterator->hint.nodeNum = 0; - iterator->hint.index = 0; - iterator->hint.reserved1 = 0; - iterator->hint.reserved2 = 0; - - iterator->version = 0; - iterator->reserved = 0; - iterator->key.length16 = 0; - } - - if ( err == fsBTEmptyErr || err == fsBTEndOfIterationErr ) - err = fsBTRecordNotFoundErr; - - return err; -} - - -/*------------------------------------------------------------------------------- -Routine: BTIterateRecords - -Function: Find a series of records - -Input: filePtr - b-tree file - operation - iteration operation (first,next,prev,last) - iterator - 
pointer to iterator indicating start position - callBackProc - pointer to routince to process a record - callBackState - pointer to state data (used by callBackProc) - -Output: iterator - iterator is updated to indicate new position - -Result: noErr - success - != noErr - failure --------------------------------------------------------------------------------*/ - -OSStatus -BTIterateRecords(FCB *filePtr, BTreeIterationOperation operation, BTreeIterator *iterator, - IterateCallBackProcPtr callBackProc, void * callBackState) -{ - OSStatus err; - BTreeControlBlockPtr btreePtr; - BTreeKeyPtr keyPtr; - RecordPtr recordPtr; - u_int16_t len; - Boolean foundRecord; - u_int32_t nodeNum; - BlockDescriptor left, node, right; - u_int16_t index; - - - ////////////////////////// Priliminary Checks /////////////////////////////// - - left.buffer = nil; - left.blockHeader = nil; - right.buffer = nil; - right.blockHeader = nil; - node.buffer = nil; - node.blockHeader = nil; - - btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr; - - REQUIRE_FILE_LOCK(btreePtr->fileRefNum, true); - - if ((operation != kBTreeFirstRecord) && - (operation != kBTreeNextRecord) && - (operation != kBTreeCurrentRecord) && - (operation != kBTreePrevRecord) && - (operation != kBTreeLastRecord)) - { - err = fsInvalidIterationMovmentErr; - goto ErrorExit; - } - - /////////////////////// Find First or Last Record /////////////////////////// - - if ((operation == kBTreeFirstRecord) || (operation == kBTreeLastRecord)) - { - if (operation == kBTreeFirstRecord) - nodeNum = btreePtr->firstLeafNode; - else - nodeNum = btreePtr->lastLeafNode; - - if (nodeNum == 0) - { - err = fsBTEmptyErr; - goto ErrorExit; - } - - err = GetNode(btreePtr, nodeNum, 0, &node); - M_ExitOnError(err); - - if ( ((NodeDescPtr)node.buffer)->kind != kBTLeafNode || - ((NodeDescPtr)node.buffer)->numRecords <= 0 ) - { - err = ReleaseNode(btreePtr, &node); - M_ExitOnError(err); - - err = fsBTInvalidNodeErr; - printf ("hfs: BTIterateRecords() 
found invalid btree node on volume %s\n", FCBTOVCB(filePtr)->vcbVN); - hfs_mark_inconsistent(FCBTOVCB(filePtr), HFS_INCONSISTENCY_DETECTED); - goto ErrorExit; - } - - if (operation == kBTreeFirstRecord) - index = 0; - else - index = ((BTNodeDescriptor*) node.buffer)->numRecords - 1; - - goto ProcessData; - } - - //////////////////////// Find Iterator Position ///////////////////////////// - - // Not called for (operation == kBTreeFirstRecord || operation == kBTreeLastRecord) - err = FindIteratorPosition(btreePtr, iterator, &left, &node, &right, - &nodeNum, &index, &foundRecord); - if (err == fsBTRecordNotFoundErr) - err = 0; - M_ExitOnError(err); - - - ///////////////////// Find Next Or Previous Record ////////////////////////// - - if (operation == kBTreePrevRecord) - { - if (index > 0) - { - --index; - } - else - { - if (left.buffer == nil) - { - nodeNum = ((NodeDescPtr) node.buffer)->bLink; - if ( nodeNum > 0) - { - // BTree nodes are always grabbed in left to right order. - // Therefore release the current node before looking up the - // left node. 
- err = ReleaseNode(btreePtr, &node); - M_ExitOnError(err); - - // Look up the left node - err = GetNode (btreePtr, nodeNum, 0, &left); - M_ExitOnError (err); - - // Look up the current node again - err = GetRightSiblingNode (btreePtr, left.buffer, &node); - M_ExitOnError (err); - } else { - err = fsBTStartOfIterationErr; - goto ErrorExit; - } - } - // Before we stomp on "right", we'd better release it if needed - if (right.buffer != nil) { - err = ReleaseNode(btreePtr, &right); - M_ExitOnError(err); - } - right = node; - node = left; - left.buffer = nil; - index = ((NodeDescPtr) node.buffer)->numRecords -1; - } - } - else if (operation == kBTreeNextRecord) - { - if ((foundRecord != true) && - (((NodeDescPtr)node.buffer)->fLink == 0) && - (index == ((NodeDescPtr)node.buffer)->numRecords)) - { - err = fsBTEndOfIterationErr; - goto ErrorExit; - } - - // we did not find the record but the index is already positioned correctly - if ((foundRecord == false) && (index != ((NodeDescPtr)node.buffer)->numRecords)) - goto ProcessData; - - // we found the record OR we have to look in the next node - if (index < ((NodeDescPtr)node.buffer)->numRecords -1) - { - ++index; - } - else - { - if (right.buffer == nil) - { - nodeNum = ((NodeDescPtr)node.buffer)->fLink; - if ( nodeNum > 0) - { - err = GetNode(btreePtr, nodeNum, 0, &right); - M_ExitOnError(err); - } else { - err = fsBTEndOfIterationErr; - goto ErrorExit; - } - } - // Before we stomp on "left", we'd better release it if needed - if (left.buffer != nil) { - err = ReleaseNode(btreePtr, &left); - M_ExitOnError(err); - } - left = node; - node = right; - right.buffer = nil; - index = 0; - } - } - else // operation == kBTreeCurrentRecord - { - // make sure we have something... 
- if ((foundRecord != true) && - (index >= ((NodeDescPtr)node.buffer)->numRecords)) - { - err = fsBTEndOfIterationErr; - goto ErrorExit; - } - } - - //////////////////// Process Records Using Callback //////////////////////// - -ProcessData: - err = GetRecordByIndex(btreePtr, node.buffer, index, &keyPtr, &recordPtr, &len); - if (err) { - err = btBadNode; - goto ErrorExit; - } - - while (err == 0) { - if (callBackProc(keyPtr, recordPtr, callBackState) == 0) - break; - - if ((index+1) < ((NodeDescPtr)node.buffer)->numRecords) { - ++index; - } else { - if (right.buffer == nil) - { - nodeNum = ((NodeDescPtr)node.buffer)->fLink; - if ( nodeNum > 0) - { - err = GetNode(btreePtr, nodeNum, 0, &right); - M_ExitOnError(err); - } else { - err = fsBTEndOfIterationErr; - break; - } - } - // Before we stomp on "left", we'd better release it if needed - if (left.buffer != nil) { - err = ReleaseNode(btreePtr, &left); - M_ExitOnError(err); - } - left = node; - node = right; - right.buffer = nil; - index = 0; - } - err = GetRecordByIndex(btreePtr, node.buffer, index, - &keyPtr, &recordPtr, &len); - if (err) { - err = btBadNode; - goto ErrorExit; - } - } - - - ///////////////// Update Iterator to Last Item Processed ///////////////////// - - - if (iterator != nil) // first & last have optional iterator - { - iterator->hint.writeCount = btreePtr->writeCount; - iterator->hint.nodeNum = nodeNum; - iterator->hint.index = index; - iterator->version = 0; - - BlockMoveData((Ptr)keyPtr, (Ptr)&iterator->key, CalcKeySize(btreePtr, keyPtr)); - } - M_ExitOnError(err); - - - ///////////////////////////// Release Nodes ///////////////////////////////// - - err = ReleaseNode(btreePtr, &node); - M_ExitOnError(err); - - if (left.buffer != nil) - { - err = ReleaseNode(btreePtr, &left); - M_ExitOnError(err); - } - - if (right.buffer != nil) - { - err = ReleaseNode(btreePtr, &right); - M_ExitOnError(err); - } - - return noErr; - - /////////////////////// Error - Clean Up and Exit 
/////////////////////////// - -ErrorExit: - - (void) ReleaseNode(btreePtr, &left); - (void) ReleaseNode(btreePtr, &node); - (void) ReleaseNode(btreePtr, &right); - - if (iterator != nil) - { - iterator->hint.writeCount = 0; - iterator->hint.nodeNum = 0; - iterator->hint.index = 0; - iterator->version = 0; - iterator->key.length16 = 0; - } - - if ( err == fsBTEmptyErr || err == fsBTEndOfIterationErr ) - err = fsBTRecordNotFoundErr; - - return err; -} - - -//////////////////////////////// BTInsertRecord ///////////////////////////////// - -OSStatus BTInsertRecord (FCB *filePtr, - BTreeIterator *iterator, - FSBufferDescriptor *record, - u_int16_t recordLen ) -{ - OSStatus err; - BTreeControlBlockPtr btreePtr; - TreePathTable treePathTable; - u_int32_t nodesNeeded; - BlockDescriptor nodeRec; - u_int32_t insertNodeNum; - u_int16_t index; - Boolean recordFit; - - ////////////////////////// Priliminary Checks /////////////////////////////// - - nodeRec.buffer = nil; // so we can call ReleaseNode - nodeRec.blockHeader = nil; - - err = CheckInsertParams (filePtr, iterator, record, recordLen); - if (err != noErr) - return err; - - btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr; - - REQUIRE_FILE_LOCK(btreePtr->fileRefNum, false); - - - ///////////////////////// Find Insert Position ////////////////////////////// - - // always call SearchTree for Insert - err = SearchTree (btreePtr, &iterator->key, treePathTable, &insertNodeNum, &nodeRec, &index); - - switch (err) // set/replace/insert decision point - { - case noErr: err = fsBTDuplicateRecordErr; - goto ErrorExit; - - case fsBTRecordNotFoundErr: break; - - case fsBTEmptyErr: // if tree empty add 1st leaf node - - if (btreePtr->freeNodes == 0) - { - err = ExtendBTree (btreePtr, btreePtr->totalNodes + 1); - M_ExitOnError (err); - } - - err = AllocateNode (btreePtr, &insertNodeNum); - M_ExitOnError (err); - - err = GetNewNode (btreePtr, insertNodeNum, &nodeRec); - M_ExitOnError (err); - - // XXXdbg - 
ModifyBlockStart(btreePtr->fileRefNum, &nodeRec); - - ((NodeDescPtr)nodeRec.buffer)->kind = kBTLeafNode; - ((NodeDescPtr)nodeRec.buffer)->height = 1; - - recordFit = InsertKeyRecord (btreePtr, nodeRec.buffer, 0, - &iterator->key, KeyLength(btreePtr, &iterator->key), - record->bufferAddress, recordLen ); - if (recordFit != true) - { - err = fsBTRecordTooLargeErr; - goto ErrorExit; - } - - /* - * Update the B-tree control block. Do this before - * calling UpdateNode since it will compare the node's - * height with treeDepth. - */ - btreePtr->treeDepth = 1; - btreePtr->rootNode = insertNodeNum; - btreePtr->firstLeafNode = insertNodeNum; - btreePtr->lastLeafNode = insertNodeNum; - - err = UpdateNode (btreePtr, &nodeRec, 0, kLockTransaction); - M_ExitOnError (err); - - M_BTreeHeaderDirty (btreePtr); - - goto Success; - - default: goto ErrorExit; - } - - if (index > 0) - { - // XXXdbg - ModifyBlockStart(btreePtr->fileRefNum, &nodeRec); - - recordFit = InsertKeyRecord (btreePtr, nodeRec.buffer, index, - &iterator->key, KeyLength(btreePtr, &iterator->key), - record->bufferAddress, recordLen); - if (recordFit == true) - { - err = UpdateNode (btreePtr, &nodeRec, 0, kLockTransaction); - M_ExitOnError (err); - - goto Success; - } - } - - /////////////////////// Extend File If Necessary //////////////////////////// - - if ((btreePtr->treeDepth + 1UL) > btreePtr->freeNodes) - { - nodesNeeded = btreePtr->treeDepth + 1 + btreePtr->totalNodes - btreePtr->freeNodes; - if (nodesNeeded > CalcMapBits (btreePtr)) // we'll need to add a map node too! 
- ++nodesNeeded; - - err = ExtendBTree (btreePtr, nodesNeeded); - M_ExitOnError (err); - } - - // no need to delete existing record - - err = InsertTree (btreePtr, treePathTable, &iterator->key, record->bufferAddress, - recordLen, &nodeRec, index, 1, kInsertRecord, &insertNodeNum); - M_ExitOnError (err); - - - //////////////////////////////// Success //////////////////////////////////// - -Success: - ++btreePtr->writeCount; - ++btreePtr->leafRecords; - M_BTreeHeaderDirty (btreePtr); - - // create hint - iterator->hint.writeCount = btreePtr->writeCount; - iterator->hint.nodeNum = insertNodeNum; - iterator->hint.index = 0; // unused - iterator->hint.reserved1 = 0; - iterator->hint.reserved2 = 0; - - return noErr; - - - ////////////////////////////// Error Exit /////////////////////////////////// - -ErrorExit: - - (void) ReleaseNode (btreePtr, &nodeRec); - - iterator->hint.writeCount = 0; - iterator->hint.nodeNum = 0; - iterator->hint.index = 0; - iterator->hint.reserved1 = 0; - iterator->hint.reserved2 = 0; - - if (err == fsBTEmptyErr) - err = fsBTRecordNotFoundErr; - - return err; -} - - -//////////////////////////////// BTReplaceRecord //////////////////////////////// - -OSStatus BTReplaceRecord (FCB *filePtr, - BTreeIterator *iterator, - FSBufferDescriptor *record, - u_int16_t recordLen ) -{ - OSStatus err; - BTreeControlBlockPtr btreePtr; - TreePathTable treePathTable; - u_int32_t nodesNeeded; - BlockDescriptor nodeRec; - u_int32_t insertNodeNum; - u_int16_t index; - Boolean recordFit; - Boolean validHint; - - - ////////////////////////// Priliminary Checks /////////////////////////////// - - nodeRec.buffer = nil; // so we can call ReleaseNode - nodeRec.blockHeader = nil; - - err = CheckInsertParams (filePtr, iterator, record, recordLen); - if (err != noErr) - return err; - - btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr; - - REQUIRE_FILE_LOCK(btreePtr->fileRefNum, false); - - ////////////////////////////// Take A Hint ////////////////////////////////// - 
- err = IsItAHint (btreePtr, iterator, &validHint); - M_ExitOnError (err); - - if (validHint) - { - insertNodeNum = iterator->hint.nodeNum; - - err = GetNode (btreePtr, insertNodeNum, kGetNodeHint, &nodeRec); - if( err == noErr ) - { - // XXXdbg - ModifyBlockStart(btreePtr->fileRefNum, &nodeRec); - - err = TrySimpleReplace (btreePtr, nodeRec.buffer, iterator, record, recordLen, &recordFit); - M_ExitOnError (err); - - if (recordFit) - { - err = UpdateNode (btreePtr, &nodeRec, 0, 0); - M_ExitOnError (err); - - ++btreePtr->numValidHints; - - goto Success; - } - else - { - (void) BTInvalidateHint( iterator ); - } - - err = ReleaseNode (btreePtr, &nodeRec); - M_ExitOnError (err); - } - else - { - (void) BTInvalidateHint( iterator ); - } - } - - - ////////////////////////////// Get A Clue /////////////////////////////////// - - err = SearchTree (btreePtr, &iterator->key, treePathTable, &insertNodeNum, &nodeRec, &index); - M_ExitOnError (err); // record must exit for Replace - - // optimization - if simple replace will work then don't extend btree - // �� if we tried this before, and failed because it wouldn't fit then we shouldn't try this again... - - // XXXdbg - ModifyBlockStart(btreePtr->fileRefNum, &nodeRec); - - err = TrySimpleReplace (btreePtr, nodeRec.buffer, iterator, record, recordLen, &recordFit); - M_ExitOnError (err); - - if (recordFit) - { - err = UpdateNode (btreePtr, &nodeRec, 0, 0); - M_ExitOnError (err); - - goto Success; - } - - - //////////////////////////// Make Some Room ///////////////////////////////// - - if ((btreePtr->treeDepth + 1UL) > btreePtr->freeNodes) - { - nodesNeeded = btreePtr->treeDepth + 1 + btreePtr->totalNodes - btreePtr->freeNodes; - if (nodesNeeded > CalcMapBits (btreePtr)) // we'll need to add a map node too! 
- ++nodesNeeded; - - err = ExtendBTree (btreePtr, nodesNeeded); - M_ExitOnError (err); - } - - // XXXdbg - ModifyBlockStart(btreePtr->fileRefNum, &nodeRec); - - DeleteRecord (btreePtr, nodeRec.buffer, index); // delete existing key/record - - err = InsertTree (btreePtr, treePathTable, &iterator->key, record->bufferAddress, - recordLen, &nodeRec, index, 1, kReplaceRecord, &insertNodeNum); - M_ExitOnError (err); - - ++btreePtr->writeCount; /* writeCount changes only if the tree structure changed */ - -Success: - // create hint - iterator->hint.writeCount = btreePtr->writeCount; - iterator->hint.nodeNum = insertNodeNum; - iterator->hint.index = 0; // unused - iterator->hint.reserved1 = 0; - iterator->hint.reserved2 = 0; - - return noErr; - - - ////////////////////////////// Error Exit /////////////////////////////////// - -ErrorExit: - - (void) ReleaseNode (btreePtr, &nodeRec); - - iterator->hint.writeCount = 0; - iterator->hint.nodeNum = 0; - iterator->hint.index = 0; - iterator->hint.reserved1 = 0; - iterator->hint.reserved2 = 0; - - return err; -} - - - -//////////////////////////////// BTUpdateRecord //////////////////////////////// - -OSStatus -BTUpdateRecord(FCB *filePtr, BTreeIterator *iterator, - IterateCallBackProcPtr callBackProc, void * callBackState) -{ - OSStatus err; - BTreeControlBlockPtr btreePtr; - TreePathTable treePathTable; - BlockDescriptor nodeRec; - RecordPtr recordPtr; - BTreeKeyPtr keyPtr; - u_int32_t insertNodeNum; - u_int16_t recordLen; - u_int16_t index; - Boolean validHint; - - - ////////////////////////// Priliminary Checks /////////////////////////////// - - nodeRec.buffer = nil; // so we can call ReleaseNode - nodeRec.blockHeader = nil; - - btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr; - - REQUIRE_FILE_LOCK(btreePtr->fileRefNum, true); - - ////////////////////////////// Take A Hint ////////////////////////////////// - - err = IsItAHint (btreePtr, iterator, &validHint); - M_ExitOnError (err); - - if (validHint) - { - 
insertNodeNum = iterator->hint.nodeNum; - - err = GetNode (btreePtr, insertNodeNum, kGetNodeHint, &nodeRec); - if (err == noErr) - { - if (((NodeDescPtr)nodeRec.buffer)->kind == kBTLeafNode && - SearchNode (btreePtr, nodeRec.buffer, &iterator->key, &index)) - { - err = GetRecordByIndex(btreePtr, nodeRec.buffer, index, &keyPtr, &recordPtr, &recordLen); - M_ExitOnError (err); - - // XXXdbg - ModifyBlockStart(btreePtr->fileRefNum, &nodeRec); - - err = callBackProc(keyPtr, recordPtr, callBackState); - M_ExitOnError (err); - - err = UpdateNode (btreePtr, &nodeRec, 0, 0); - M_ExitOnError (err); - - ++btreePtr->numValidHints; - - goto Success; - } - else - { - (void) BTInvalidateHint( iterator ); - } - - err = ReleaseNode (btreePtr, &nodeRec); - M_ExitOnError (err); - } - else - { - (void) BTInvalidateHint( iterator ); - } - } - - ////////////////////////////// Get A Clue /////////////////////////////////// - - err = SearchTree (btreePtr, &iterator->key, treePathTable, &insertNodeNum, &nodeRec, &index); - M_ExitOnError (err); - - err = GetRecordByIndex(btreePtr, nodeRec.buffer, index, &keyPtr, &recordPtr, &recordLen); - M_ExitOnError (err); - - // XXXdbg - ModifyBlockStart(btreePtr->fileRefNum, &nodeRec); - - err = callBackProc(keyPtr, recordPtr, callBackState); - M_ExitOnError (err); - - err = UpdateNode (btreePtr, &nodeRec, 0, 0); - M_ExitOnError (err); - -Success: - // create hint - iterator->hint.writeCount = btreePtr->writeCount; - iterator->hint.nodeNum = insertNodeNum; - iterator->hint.index = 0; - iterator->hint.reserved1 = 0; - iterator->hint.reserved2 = 0; - return noErr; - - ////////////////////////////// Error Exit /////////////////////////////////// - -ErrorExit: - - (void) ReleaseNode (btreePtr, &nodeRec); - - iterator->hint.writeCount = 0; - iterator->hint.nodeNum = 0; - iterator->hint.index = 0; - iterator->hint.reserved1 = 0; - iterator->hint.reserved2 = 0; - return err; -} - - - -//////////////////////////////// BTDeleteRecord 
///////////////////////////////// - -OSStatus BTDeleteRecord (FCB *filePtr, - BTreeIterator *iterator ) -{ - OSStatus err; - BTreeControlBlockPtr btreePtr; - TreePathTable treePathTable; - BlockDescriptor nodeRec; - u_int32_t nodesNeeded; - u_int32_t nodeNum; - u_int16_t index; - - - ////////////////////////// Priliminary Checks /////////////////////////////// - - nodeRec.buffer = nil; // so we can call ReleaseNode - nodeRec.blockHeader = nil; - - M_ReturnErrorIf (filePtr == nil, paramErr); - M_ReturnErrorIf (iterator == nil, paramErr); - - btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr; - if (btreePtr == nil) - { - err = fsBTInvalidFileErr; - goto ErrorExit; - } - - REQUIRE_FILE_LOCK(btreePtr->fileRefNum, false); - - - /////////////////////////////// Find Key //////////////////////////////////// - - // check hint for simple delete case (index > 0, numRecords > 2) - - err = SearchTree (btreePtr, &iterator->key, treePathTable, &nodeNum, &nodeRec, &index); - M_ExitOnError (err); // record must exit for Delete - - - /////////////////////// Extend File If Necessary //////////////////////////// - - /* - * Worst case: we delete the first record in the tree and - * following key is sufficiently larger to cause all parents to - * require splitting and we need a new root node and a new map - * node. 
- */ - if (index == 0 && btreePtr->treeDepth + 1 > btreePtr->freeNodes) - { - nodesNeeded = btreePtr->treeDepth + btreePtr->totalNodes; - if (nodesNeeded > CalcMapBits (btreePtr)) - ++nodesNeeded; - - if (nodesNeeded - btreePtr->totalNodes > btreePtr->freeNodes) { - err = ExtendBTree (btreePtr, nodesNeeded); - M_ExitOnError (err); - } - } - - ///////////////////////////// Delete Record ///////////////////////////////// - - err = DeleteTree (btreePtr, treePathTable, &nodeRec, index, 1); - M_ExitOnError (err); - - ++btreePtr->writeCount; - --btreePtr->leafRecords; - M_BTreeHeaderDirty (btreePtr); - - iterator->hint.nodeNum = 0; - - return noErr; - - ////////////////////////////// Error Exit /////////////////////////////////// - -ErrorExit: - (void) ReleaseNode (btreePtr, &nodeRec); - - return err; -} - - - -OSStatus BTGetInformation (FCB *filePtr, - u_int16_t file_version, - BTreeInfoRec *info ) -{ -#pragma unused (file_version) - - BTreeControlBlockPtr btreePtr; - - - M_ReturnErrorIf (filePtr == nil, paramErr); - - btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr; - - /* - * XXX SER - * This should not require the whole tree to be locked, just maybe the BTreeControlBlockPtr - * - * REQUIRE_FILE_LOCK(btreePtr->fileRefNum, true); - */ - - M_ReturnErrorIf (btreePtr == nil, fsBTInvalidFileErr); - M_ReturnErrorIf (info == nil, paramErr); - - //�� check version? 
- - info->nodeSize = btreePtr->nodeSize; - info->maxKeyLength = btreePtr->maxKeyLength; - info->treeDepth = btreePtr->treeDepth; - info->numRecords = btreePtr->leafRecords; - info->numNodes = btreePtr->totalNodes; - info->numFreeNodes = btreePtr->freeNodes; - info->lastfsync = btreePtr->lastfsync; - info->keyCompareType = btreePtr->keyCompareType; - return noErr; -} - -// XXXdbg -__private_extern__ -OSStatus -BTIsDirty(FCB *filePtr) -{ - BTreeControlBlockPtr btreePtr; - - btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr; - return TreeIsDirty(btreePtr); -} - -/*------------------------------------------------------------------------------- -Routine: BTFlushPath - Flush BTreeControlBlock to Header Node. - -Function: Brief_description_of_the_function_and_any_side_effects - - -Input: pathPtr - pointer to path control block for B*Tree file to flush - -Output: none - -Result: noErr - success - != noErr - failure --------------------------------------------------------------------------------*/ - -OSStatus BTFlushPath (FCB *filePtr) -{ - OSStatus err; - BTreeControlBlockPtr btreePtr; - - - M_ReturnErrorIf (filePtr == nil, paramErr); - - btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr; - - M_ReturnErrorIf (btreePtr == nil, fsBTInvalidFileErr); - - REQUIRE_FILE_LOCK(btreePtr->fileRefNum, true); - - err = UpdateHeader (btreePtr, false); - - return err; -} - - -/*------------------------------------------------------------------------------- -Routine: BTReload - Reload B-tree Header Data. - -Function: Reload B-tree header data from disk. This is called after fsck - has made repairs to the root filesystem. The filesystem is - mounted read-only when BTReload is caled. 
- - -Input: filePtr - the B*Tree file that needs its header updated - -Output: none - -Result: noErr - success - != noErr - failure --------------------------------------------------------------------------------*/ - -OSStatus -BTReloadData(FCB *filePtr) -{ - OSStatus err; - BTreeControlBlockPtr btreePtr; - BlockDescriptor node; - BTHeaderRec *header; - - - node.buffer = nil; - node.blockHeader = nil; - - btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr; - if (btreePtr == nil) - return (fsBTInvalidFileErr); - - REQUIRE_FILE_LOCK(btreePtr->fileRefNum, false); - - err = GetNode(btreePtr, kHeaderNodeNum, 0, &node); - if (err != noErr) - return (err); - - header = (BTHeaderRec*)((char *)node.buffer + sizeof(BTNodeDescriptor)); - if ((err = VerifyHeader (filePtr, header)) == 0) { - btreePtr->treeDepth = header->treeDepth; - btreePtr->rootNode = header->rootNode; - btreePtr->leafRecords = header->leafRecords; - btreePtr->firstLeafNode = header->firstLeafNode; - btreePtr->lastLeafNode = header->lastLeafNode; - btreePtr->maxKeyLength = header->maxKeyLength; - btreePtr->totalNodes = header->totalNodes; - btreePtr->freeNodes = header->freeNodes; - btreePtr->btreeType = header->btreeType; - - btreePtr->flags &= (~kBTHeaderDirty); - } - - (void) ReleaseNode(btreePtr, &node); - - return err; -} - - -/*------------------------------------------------------------------------------- -Routine: BTInvalidateHint - Invalidates the hint within a BTreeInterator. - -Function: Invalidates the hint within a BTreeInterator. 
- - -Input: iterator - pointer to BTreeIterator - -Output: iterator - iterator with the hint.nodeNum cleared - -Result: noErr - success - paramErr - iterator == nil --------------------------------------------------------------------------------*/ - - -OSStatus BTInvalidateHint (BTreeIterator *iterator ) -{ - if (iterator == nil) - return paramErr; - - iterator->hint.nodeNum = 0; - - return noErr; -} - - - - -/*------------------------------------------------------------------------------- -Routine: BTGetLastSync - -Function: Returns the last time that this btree was flushed, does not include header. - -Input: filePtr - pointer file control block - -Output: lastfsync - time in seconds of last update - -Result: noErr - success - paramErr - iterator == nil --------------------------------------------------------------------------------*/ - - -OSStatus BTGetLastSync (FCB *filePtr, - u_int32_t *lastsync) -{ - BTreeControlBlockPtr btreePtr; - - - M_ReturnErrorIf (filePtr == nil, paramErr); - - btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr; - - /* Maybe instead of requiring a lock..an atomic set might be more appropriate */ - REQUIRE_FILE_LOCK(btreePtr->fileRefNum, true); - - M_ReturnErrorIf (btreePtr == nil, fsBTInvalidFileErr); - M_ReturnErrorIf (lastsync == nil, paramErr); - - *lastsync = btreePtr->lastfsync; - - return noErr; -} - - - - -/*------------------------------------------------------------------------------- -Routine: BTSetLastSync - -Function: Sets the last time that this btree was flushed, does not include header. 
- - -Input: fcb - pointer file control block - -Output: lastfsync - time in seconds of last update - -Result: noErr - success - paramErr - iterator == nil --------------------------------------------------------------------------------*/ - - -OSStatus BTSetLastSync (FCB *filePtr, - u_int32_t lastsync) -{ - BTreeControlBlockPtr btreePtr; - - - M_ReturnErrorIf (filePtr == nil, paramErr); - - btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr; - - /* Maybe instead of requiring a lock..an atomic set might be more appropriate */ - REQUIRE_FILE_LOCK(btreePtr->fileRefNum, true); - - M_ReturnErrorIf (btreePtr == nil, fsBTInvalidFileErr); - M_ReturnErrorIf (lastsync == 0, paramErr); - - btreePtr->lastfsync = lastsync; - - return noErr; -} - -__private_extern__ -OSStatus BTHasContiguousNodes (FCB *filePtr) -{ - BTreeControlBlockPtr btreePtr; - - - M_ReturnErrorIf (filePtr == nil, paramErr); - - btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr; - - REQUIRE_FILE_LOCK(btreePtr->fileRefNum, true); - - M_ReturnErrorIf (btreePtr == nil, fsBTInvalidFileErr); - - return NodesAreContiguous(FCBTOVCB(filePtr), filePtr, btreePtr->nodeSize); -} - - -/*------------------------------------------------------------------------------- -Routine: BTGetUserData - -Function: Read the user data area of the b-tree header node. 
- --------------------------------------------------------------------------------*/ -OSStatus -BTGetUserData(FCB *filePtr, void * dataPtr, int dataSize) -{ - BTreeControlBlockPtr btreePtr; - BlockDescriptor node; - char * offset; - OSStatus err; - - if (dataSize > kBTreeHeaderUserBytes) - return (EINVAL); - node.buffer = nil; - node.blockHeader = nil; - - btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr; - if (btreePtr == nil) - return (fsBTInvalidFileErr); - - REQUIRE_FILE_LOCK(btreePtr->fileRefNum, false); - - err = GetNode(btreePtr, kHeaderNodeNum, 0, &node); - if (err) - return (err); - - offset = (char *)node.buffer + sizeof(BTNodeDescriptor) + sizeof(BTHeaderRec); - bcopy(offset, dataPtr, dataSize); - - (void) ReleaseNode(btreePtr, &node); - - return (0); -} - - -/*------------------------------------------------------------------------------- -Routine: BTSetUserData - -Function: Write the user data area of the b-tree header node. --------------------------------------------------------------------------------*/ -OSStatus -BTSetUserData(FCB *filePtr, void * dataPtr, int dataSize) -{ - BTreeControlBlockPtr btreePtr; - BlockDescriptor node; - char * offset; - OSStatus err; - - if (dataSize > kBTreeHeaderUserBytes) - return (EINVAL); - node.buffer = nil; - node.blockHeader = nil; - - btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr; - if (btreePtr == nil) - return (fsBTInvalidFileErr); - - REQUIRE_FILE_LOCK(btreePtr->fileRefNum, false); - - err = GetNode(btreePtr, kHeaderNodeNum, 0, &node); - if (err) - return (err); - - ModifyBlockStart(btreePtr->fileRefNum, &node); - - offset = (char *)node.buffer + sizeof(BTNodeDescriptor) + sizeof(BTHeaderRec); - bcopy(dataPtr, offset, dataSize); - - err = UpdateNode (btreePtr, &node, 0, 0); - - return (err); -} - diff --git a/bsd/hfs/hfscommon/BTree/BTreeAllocate.c b/bsd/hfs/hfscommon/BTree/BTreeAllocate.c deleted file mode 100644 index dbd0a8a54..000000000 --- a/bsd/hfs/hfscommon/BTree/BTreeAllocate.c +++ 
/dev/null @@ -1,744 +0,0 @@ -/* - * Copyright (c) 2000-2003, 2005-2014 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - File: BTreeAllocate.c - - Contains: BTree Node Allocation routines for the BTree Module. - - Version: xxx put the technology version here xxx - - Written by: Gordon Sheridan and Bill Bruffey - - Copyright: � 1992-1999 by Apple Computer, Inc., all rights reserved. - - File Ownership: - - DRI: Don Brady - - Other Contact: Mark Day - - Technology: File Systems - - Writers: - - (djb) Don Brady - (ser) Scott Roberts - (msd) Mark Day - - Change History (most recent first): - - 6/1/99 djb Sync up with Mac OS 8.6. - 11/24/97 djb Remove some debug code (Panic calls). 
- 7/24/97 djb CallbackProcs now take refnum instead of an FCB. - 4/23/97 djb first checked in - - 2/19/97 djb Change E_BadNodeType to fsBTBadNodeType. - 12/19/96 djb first checked in - - History applicable to original Scarecrow Design: - - <4> 10/25/96 ser Changing for new VFPI - <3> 10/18/96 ser Converting over VFPI changes - <2> 1/10/96 msd Change 64-bit math to use real function names from Math64.i. - <1> 10/18/95 rst Moved from Scarecrow project. - - <8> 1/12/95 wjk Adopt Model FileSystem changes in D5. - <7> 9/30/94 prp Get in sync with D2 interface changes. - <6> 7/22/94 wjk Convert to the new set of header files. - <5> 8/31/93 prp Use U64SetU instead of S64Set. - <4> 5/21/93 gs Fix ExtendBTree bug. - <3> 5/10/93 gs Fix pointer arithmetic bug in AllocateNode. - <2> 3/23/93 gs finish ExtendBTree routine. - <1> 2/8/93 gs first checked in - <0> 1/1/93 gs begin AllocateNode and FreeNode - -*/ - -#include "../../hfs_btreeio.h" -#include "../../hfs_endian.h" -#include "../headers/BTreesPrivate.h" - -///////////////////// Routines Internal To BTreeAllocate.c ////////////////////// - -static OSStatus GetMapNode (BTreeControlBlockPtr btreePtr, - BlockDescriptor *nodePtr, - u_int16_t **mapPtr, - u_int16_t *mapSize ); - -///////////////////////////////////////////////////////////////////////////////// - -/*------------------------------------------------------------------------------- - -Routine: AllocateNode - Find Free Node, Mark It Used, and Return Node Number. - -Function: Searches the map records for the first free node, marks it "in use" and - returns the node number found. This routine should really only be called - when we know there are free blocks, otherwise it's just a waste of time. - -Note: We have to examine map nodes a word at a time rather than a long word - because the External BTree Mgr used map records that were not an integral - number of long words. Too bad. 
In our spare time could develop a more - sophisticated algorithm that read map records by long words (and long - word aligned) and handled the spare bytes at the beginning and end - appropriately. - -Input: btreePtr - pointer to control block for BTree file - -Output: nodeNum - number of node allocated - - -Result: noErr - success - fsBTNoMoreMapNodesErr - no free blocks were found - != noErr - failure --------------------------------------------------------------------------------*/ - -OSStatus AllocateNode (BTreeControlBlockPtr btreePtr, u_int32_t *nodeNum) -{ - OSStatus err; - BlockDescriptor node; - u_int16_t *mapPtr, *pos; - u_int16_t mapSize, size; - u_int16_t freeWord; - u_int16_t mask; - u_int16_t bitOffset; - u_int32_t nodeNumber; - - - nodeNumber = 0; // first node number of header map record - node.buffer = nil; // clear node.buffer to get header node - // - and for ErrorExit - node.blockHeader = nil; - - while (true) - { - err = GetMapNode (btreePtr, &node, &mapPtr, &mapSize); - M_ExitOnError (err); - - // XXXdbg - ModifyBlockStart(btreePtr->fileRefNum, &node); - - //////////////////////// Find Word with Free Bit //////////////////////////// - - pos = mapPtr; - size = mapSize; - size >>= 1; // convert to number of words - //�� assumes mapRecords contain an integral number of words - - while ( size-- ) - { - if ( *pos++ != 0xFFFF ) // assume test fails, and increment pos - break; - } - - --pos; // whoa! backup - - if (*pos != 0xFFFF) // hey, we got one! - break; - - nodeNumber += mapSize << 3; // covert to number of bits (nodes) - } - - ///////////////////////// Find Free Bit in Word ///////////////////////////// - - freeWord = SWAP_BE16 (*pos); - bitOffset = 15; - mask = 0x8000; - - do { - if ( (freeWord & mask) == 0) - break; - mask >>= 1; - } while (--bitOffset); - - ////////////////////// Calculate Free Node Number /////////////////////////// - - nodeNumber += ((pos - mapPtr) << 4) + (15 - bitOffset); // (pos-mapPtr) = # of words! 
- - - ///////////////////////// Check for End of Map ////////////////////////////// - - if (nodeNumber >= btreePtr->totalNodes) - { - err = fsBTFullErr; - goto ErrorExit; - } - - /////////////////////////// Allocate the Node /////////////////////////////// - - *pos |= SWAP_BE16 (mask); // set the map bit for the node - - err = UpdateNode (btreePtr, &node, 0, kLockTransaction); - M_ExitOnError (err); - - --btreePtr->freeNodes; - M_BTreeHeaderDirty(btreePtr); - - /* Account for allocations from node reserve */ - BTUpdateReserve(btreePtr, 1); - - *nodeNum = nodeNumber; - - return noErr; - -////////////////////////////////// Error Exit /////////////////////////////////// - -ErrorExit: - - (void) ReleaseNode (btreePtr, &node); - *nodeNum = 0; - - return err; -} - - - -/*------------------------------------------------------------------------------- - -Routine: FreeNode - Clear allocation bit for node. - -Function: Finds the bit representing the node specified by nodeNum in the node - map and clears the bit. 
- - -Input: btreePtr - pointer to control block for BTree file - nodeNum - number of node to mark free - -Output: none - -Result: noErr - success - fsBTNoMoreMapNodesErr - node number is beyond end of node map - != noErr - GetNode or ReleaseNode encountered some difficulty --------------------------------------------------------------------------------*/ - -OSStatus FreeNode (BTreeControlBlockPtr btreePtr, u_int32_t nodeNum) -{ - OSStatus err; - BlockDescriptor node; - u_int32_t nodeIndex; - u_int16_t mapSize; - u_int16_t *mapPos; - u_int16_t bitOffset; - - - //////////////////////////// Find Map Record //////////////////////////////// - nodeIndex = 0; // first node number of header map record - node.buffer = nil; // invalidate node.buffer to get header node - node.blockHeader = nil; - - while (nodeNum >= nodeIndex) - { - err = GetMapNode (btreePtr, &node, &mapPos, &mapSize); - M_ExitOnError (err); - - nodeIndex += mapSize << 3; // covert to number of bits (nodes) - } - - //////////////////////////// Mark Node Free ///////////////////////////////// - - // XXXdbg - ModifyBlockStart(btreePtr->fileRefNum, &node); - - nodeNum -= (nodeIndex - (mapSize << 3)); // relative to this map record - bitOffset = 15 - (nodeNum & 0x0000000F); // last 4 bits are bit offset - mapPos += nodeNum >> 4; // point to word containing map bit - - M_SWAP_BE16_ClearBitNum (*mapPos, bitOffset); // clear it - - err = UpdateNode (btreePtr, &node, 0, kLockTransaction); - M_ExitOnError (err); - - ++btreePtr->freeNodes; - M_BTreeHeaderDirty(btreePtr); - - return noErr; - -ErrorExit: - - (void) ReleaseNode (btreePtr, &node); - - return err; -} - - - -/*------------------------------------------------------------------------------- - -Routine: ExtendBTree - Call FSAgent to extend file, and allocate necessary map nodes. - -Function: This routine calls the the FSAgent to extend the end of fork, if necessary, - to accomodate the number of nodes requested. 
It then allocates as many - map nodes as are necessary to account for all the nodes in the B*Tree. - If newTotalNodes is less than the current number of nodes, no action is - taken. - -Note: Internal HFS File Manager BTree Module counts on an integral number of - long words in map records, although they are not long word aligned. - -Input: btreePtr - pointer to control block for BTree file - newTotalNodes - total number of nodes the B*Tree is to extended to - -Output: none - -Result: noErr - success - != noErr - failure --------------------------------------------------------------------------------*/ - -OSStatus ExtendBTree (BTreeControlBlockPtr btreePtr, - u_int32_t newTotalNodes ) -{ - OSStatus err; - FCB *filePtr; - FSSize minEOF, maxEOF; - u_int16_t nodeSize; - u_int32_t oldTotalNodes; - u_int32_t newMapNodes; - u_int32_t mapBits, totalMapBits; - u_int32_t recStartBit; - u_int32_t nodeNum, nextNodeNum; - u_int32_t firstNewMapNodeNum, lastNewMapNodeNum; - BlockDescriptor mapNode, newNode; - u_int16_t *mapPos; - u_int16_t *mapStart; - u_int16_t mapSize; - u_int16_t mapNodeRecSize; - u_int32_t bitInWord, bitInRecord; - u_int16_t mapIndex; - - - oldTotalNodes = btreePtr->totalNodes; - if (newTotalNodes <= oldTotalNodes) // we're done! 
- return noErr; - - nodeSize = btreePtr->nodeSize; - filePtr = GetFileControlBlock(btreePtr->fileRefNum); - - mapNode.buffer = nil; - mapNode.blockHeader = nil; - newNode.buffer = nil; - newNode.blockHeader = nil; - - mapNodeRecSize = nodeSize - sizeof(BTNodeDescriptor) - 6; // 2 bytes of free space (see note) - - - //////////////////////// Count Bits In Node Map ///////////////////////////// - - totalMapBits = 0; - do { - err = GetMapNode (btreePtr, &mapNode, &mapStart, &mapSize); - M_ExitOnError (err); - - mapBits = mapSize << 3; // mapSize (in bytes) * 8 - recStartBit = totalMapBits; // bit number of first bit in map record - totalMapBits += mapBits; - - } while ( ((BTNodeDescriptor*)mapNode.buffer)->fLink != 0 ); - - if (DEBUG_BUILD && totalMapBits != CalcMapBits (btreePtr)) - Panic ("ExtendBTree: totalMapBits != CalcMapBits"); - - /////////////////////// Extend LEOF If Necessary //////////////////////////// - - minEOF = (u_int64_t)newTotalNodes * (u_int64_t)nodeSize; - if ( (u_int64_t)filePtr->fcbEOF < minEOF ) - { - maxEOF = (u_int64_t)0x7fffffffLL * (u_int64_t)nodeSize; - - err = btreePtr->setEndOfForkProc (btreePtr->fileRefNum, minEOF, maxEOF); - M_ExitOnError (err); - } - - - //////////////////// Calc New Total Number Of Nodes ///////////////////////// - - newTotalNodes = filePtr->fcbEOF / nodeSize; // hack! - // do we wish to perform any verification of newTotalNodes at this point? - - btreePtr->totalNodes = newTotalNodes; // do we need to update freeNodes here too? 
- - - ////////////// Calculate Number Of New Map Nodes Required /////////////////// - - newMapNodes = 0; - if (newTotalNodes > totalMapBits) - { - newMapNodes = (((newTotalNodes - totalMapBits) >> 3) / mapNodeRecSize) + 1; - firstNewMapNodeNum = oldTotalNodes; - lastNewMapNodeNum = firstNewMapNodeNum + newMapNodes - 1; - } - else - { - err = ReleaseNode (btreePtr, &mapNode); - M_ExitOnError (err); - - goto Success; - } - - - /////////////////////// Initialize New Map Nodes //////////////////////////// - // XXXdbg - this is the correct place for this: - ModifyBlockStart(btreePtr->fileRefNum, &mapNode); - - ((BTNodeDescriptor*)mapNode.buffer)->fLink = firstNewMapNodeNum; - - nodeNum = firstNewMapNodeNum; - while (true) - { - err = GetNewNode (btreePtr, nodeNum, &newNode); - M_ExitOnError (err); - - // XXXdbg - ModifyBlockStart(btreePtr->fileRefNum, &newNode); - - ((NodeDescPtr)newNode.buffer)->numRecords = 1; - ((NodeDescPtr)newNode.buffer)->kind = kBTMapNode; - - // set free space offset - *(u_int16_t *)((Ptr)newNode.buffer + nodeSize - 4) = nodeSize - 6; - - if (nodeNum++ == lastNewMapNodeNum) - break; - - ((BTNodeDescriptor*)newNode.buffer)->fLink = nodeNum; // point to next map node - - err = UpdateNode (btreePtr, &newNode, 0, kLockTransaction); - M_ExitOnError (err); - } - - err = UpdateNode (btreePtr, &newNode, 0, kLockTransaction); - M_ExitOnError (err); - - - ///////////////////// Mark New Map Nodes Allocated ////////////////////////// - - nodeNum = firstNewMapNodeNum; - do { - bitInRecord = nodeNum - recStartBit; - - while (bitInRecord >= mapBits) - { - nextNodeNum = ((NodeDescPtr)mapNode.buffer)->fLink; - if ( nextNodeNum == 0) - { - err = fsBTNoMoreMapNodesErr; - goto ErrorExit; - } - - err = UpdateNode (btreePtr, &mapNode, 0, kLockTransaction); - M_ExitOnError (err); - - err = GetNode (btreePtr, nextNodeNum, 0, &mapNode); - M_ExitOnError (err); - - // XXXdbg - ModifyBlockStart(btreePtr->fileRefNum, &mapNode); - - mapIndex = 0; - - mapStart = (u_int16_t *) 
GetRecordAddress (btreePtr, mapNode.buffer, mapIndex); - mapSize = GetRecordSize (btreePtr, mapNode.buffer, mapIndex); - - if (DEBUG_BUILD && mapSize != M_MapRecordSize (btreePtr->nodeSize) ) - { - Panic ("ExtendBTree: mapSize != M_MapRecordSize"); - } - - mapBits = mapSize << 3; // mapSize (in bytes) * 8 - recStartBit = totalMapBits; // bit number of first bit in map record - totalMapBits += mapBits; - - bitInRecord = nodeNum - recStartBit; - } - - mapPos = mapStart + ((nodeNum - recStartBit) >> 4); - bitInWord = 15 - ((nodeNum - recStartBit) & 0x0000000F); - - M_SWAP_BE16_SetBitNum (*mapPos, bitInWord); - - ++nodeNum; - - } while (nodeNum <= lastNewMapNodeNum); - - err = UpdateNode (btreePtr, &mapNode, 0, kLockTransaction); - M_ExitOnError (err); - - - //////////////////////////////// Success //////////////////////////////////// - -Success: - - btreePtr->totalNodes = newTotalNodes; - btreePtr->freeNodes += (newTotalNodes - oldTotalNodes) - newMapNodes; - - M_BTreeHeaderDirty(btreePtr); - - /* Force the b-tree header changes to disk */ - (void) UpdateHeader (btreePtr, true); - - return noErr; - - - ////////////////////////////// Error Exit /////////////////////////////////// - -ErrorExit: - - (void) ReleaseNode (btreePtr, &mapNode); - (void) ReleaseNode (btreePtr, &newNode); - - return err; -} - - - -/*------------------------------------------------------------------------------- - -Routine: GetMapNode - Get the next map node and pointer to the map record. - -Function: Given a BlockDescriptor to a map node in nodePtr, GetMapNode releases - it and gets the next node. If nodePtr->buffer is nil, then the header - node is retrieved. 
- - -Input: btreePtr - pointer to control block for BTree file - nodePtr - pointer to a BlockDescriptor of a map node - -Output: nodePtr - pointer to the BlockDescriptor for the next map node - mapPtr - pointer to the map record within the map node - mapSize - number of bytes in the map record - -Result: noErr - success - fsBTNoMoreMapNodesErr - we've run out of map nodes - fsBTInvalidNodeErr - bad node, or not node type kMapNode - != noErr - failure --------------------------------------------------------------------------------*/ - -static -OSStatus GetMapNode (BTreeControlBlockPtr btreePtr, - BlockDescriptor *nodePtr, - u_int16_t **mapPtr, - u_int16_t *mapSize ) -{ - OSStatus err; - u_int16_t mapIndex; - u_int32_t nextNodeNum; - - if (nodePtr->buffer != nil) // if iterator is valid... - { - nextNodeNum = ((NodeDescPtr)nodePtr->buffer)->fLink; - if (nextNodeNum == 0) - { - err = fsBTNoMoreMapNodesErr; - goto ErrorExit; - } - - err = ReleaseNode (btreePtr, nodePtr); - M_ExitOnError (err); - - err = GetNode (btreePtr, nextNodeNum, 0, nodePtr); - M_ExitOnError (err); - - if ( ((NodeDescPtr)nodePtr->buffer)->kind != kBTMapNode) - { - err = fsBTBadNodeType; - goto ErrorExit; - } - - ++btreePtr->numMapNodesRead; - mapIndex = 0; - } else { - err = GetNode (btreePtr, kHeaderNodeNum, 0, nodePtr); - M_ExitOnError (err); - - if ( ((NodeDescPtr)nodePtr->buffer)->kind != kBTHeaderNode) - { - err = fsBTInvalidHeaderErr; //�� or fsBTBadNodeType - goto ErrorExit; - } - - mapIndex = 2; - } - - - *mapPtr = (u_int16_t *) GetRecordAddress (btreePtr, nodePtr->buffer, mapIndex); - *mapSize = GetRecordSize (btreePtr, nodePtr->buffer, mapIndex); - - return noErr; - - -ErrorExit: - - (void) ReleaseNode (btreePtr, nodePtr); - - *mapPtr = nil; - *mapSize = 0; - - return err; -} - - - -////////////////////////////////// CalcMapBits ////////////////////////////////// - -u_int32_t CalcMapBits (BTreeControlBlockPtr btreePtr) -{ - u_int32_t mapBits; - - mapBits = M_HeaderMapRecordSize 
(btreePtr->nodeSize) << 3; - - while (mapBits < btreePtr->totalNodes) - mapBits += M_MapRecordSize (btreePtr->nodeSize) << 3; - - return mapBits; -} - - -/*------------------------------------------------------------------------------- -Routine: BTZeroUnusedNodes - -Function: Write zeros to all nodes in the B-tree that are not currently in use. --------------------------------------------------------------------------------*/ -int -BTZeroUnusedNodes(FCB *filePtr) -{ - int err; - vnode_t vp; - BTreeControlBlockPtr btreePtr; - BlockDescriptor mapNode; - buf_t bp; - u_int32_t nodeNumber; - u_int16_t *mapPtr, *pos; - u_int16_t mapSize, size; - u_int16_t mask; - u_int16_t bitNumber; - u_int16_t word; - int numWritten; - - vp = FTOV(filePtr); - btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr; - bp = NULL; - nodeNumber = 0; - mapNode.buffer = nil; - mapNode.blockHeader = nil; - numWritten = 0; - - /* Iterate over map nodes. */ - while (true) - { - err = GetMapNode (btreePtr, &mapNode, &mapPtr, &mapSize); - if (err) - { - err = MacToVFSError(err); - goto ErrorExit; - } - - pos = mapPtr; - size = mapSize; - size >>= 1; /* convert to number of 16-bit words */ - - /* Iterate over 16-bit words in the map record. */ - while (size--) - { - if (*pos != 0xFFFF) /* Anything free in this word? */ - { - word = SWAP_BE16(*pos); - - /* Iterate over bits in the word. */ - for (bitNumber = 0, mask = 0x8000; - bitNumber < 16; - ++bitNumber, mask >>= 1) - { - if (word & mask) - continue; /* This node is in use. */ - - if (nodeNumber + bitNumber >= btreePtr->totalNodes) - { - /* We've processed all of the nodes. */ - goto done; - } - - /* - * Get a buffer full of zeros and write it to the unused - * node. Since we'll probably be writing a lot of nodes, - * bypass the journal (to avoid a transaction that's too - * big). Instead, this behaves more like clearing out - * nodes when extending a B-tree (eg., ClearBTNodes). 
- */ - bp = buf_getblk(vp, nodeNumber + bitNumber, btreePtr->nodeSize, 0, 0, BLK_META); - if (bp == NULL) - { - printf("hfs: BTZeroUnusedNodes: unable to read node %u\n", nodeNumber + bitNumber); - err = EIO; - goto ErrorExit; - } - - if (buf_flags(bp) & B_LOCKED) { - /* - * This node is already part of a transaction and will be written when - * the transaction is committed, so don't write it here. If we did, then - * we'd hit a panic in hfs_vnop_bwrite because the B_LOCKED bit is still set. - */ - buf_brelse(bp); - continue; - } - - buf_clear(bp); - buf_markaged(bp); - - /* - * Try not to hog the buffer cache. Wait for the write - * every 32 nodes. If VNOP_BWRITE reports an error, bail out and bubble - * it up to the function calling us. If we tried to update a read-only - * mount on read-only media, for example, catching the error will let - * us alert the callers of this function that they should maintain - * the mount in read-only mode. - - */ - ++numWritten; - if (numWritten % 32 == 0) { - err = VNOP_BWRITE(bp); - if (err) { - goto ErrorExit; - } - } - else { - buf_bawrite(bp); - } - } - } - - /* Go to the next word in the bitmap */ - ++pos; - nodeNumber += 16; - } - } - -ErrorExit: -done: - (void) ReleaseNode(btreePtr, &mapNode); - - return err; -} diff --git a/bsd/hfs/hfscommon/BTree/BTreeMiscOps.c b/bsd/hfs/hfscommon/BTree/BTreeMiscOps.c deleted file mode 100644 index 2574b8a84..000000000 --- a/bsd/hfs/hfscommon/BTree/BTreeMiscOps.c +++ /dev/null @@ -1,678 +0,0 @@ -/* - * Copyright (c) 2000-2003, 2005-2008 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - File: BTreeMiscOps.c - - Contains: Miscellaneous operations for the BTree Module. - - Version: xxx put the technology version here xxx - - Written by: Gordon Sheridan and Bill Bruffey - - Copyright: � 1992-1999 by Apple Computer, Inc., all rights reserved. - - File Ownership: - - DRI: Don Brady - - Other Contact: Mark Day - - Technology: File Systems - - Writers: - - (DSH) Deric Horn - (msd) Mark Day - (djb) Don Brady - - Change History (most recent first): - - 6/1/99 djb Sync up with Mac OS 8.6. - 9/4/97 djb Optimize TrySimpleReplace for the case where record size is not - changing. - 4/23/97 djb first checked in - - 3/31/97 djb Move ClearMemory to Utilities.c. - 3/17/97 DSH Casting for DFA - 2/27/97 msd Remove temporary fix from last revision. BTree EOF's should be - correct now, so check for strict equality. - 2/26/97 msd Fix a casting problem in ClearMemory. 
TEMPORARY FIX: Made - VerifyHeader more lenient, allowing the EOF to be greater than - the amount actually used by nodes; this should really be fixed - in the formatting code (which needs to compute the real BTree - sizes before writing the volume header). - 2/19/97 djb Added ClearMemory. Changed CalcKeyLength to KeyLength. - 1/3/97 djb Added support for large keys. - 12/19/96 djb first checked in - - History applicable to original Scarecrow Design: - - <9> 10/25/96 ser Changing for new VFPI - <8> 10/18/96 ser Converting over VFPI changes - <7> 9/17/96 dkh More BTree statistics. Change IsItAHint to not always check to - see if the hint node is allocated. - <6> 9/16/96 dkh Revised BTree statistics. - <5> 6/20/96 dkh Radar #1358740. Change from using Pools to debug MemAllocators. - <4> 1/22/96 dkh Change Pools.i inclusion to PoolsPriv.i - <3> 1/10/96 msd Change 64-bit math to use real function names from Math64.i. - <2> 12/7/95 dkh D10E2 build. Changed usage of Ref data type to LogicalAddress. - <1> 10/18/95 rst Moved from Scarecrow project. - - <19> 4/26/95 prp In UpdateHeader, clear the dirty flag after the BTree is updated. - <18> 1/12/95 wjk Adopt Model FileSystem changes in D5. - <17> 11/16/94 prp Add IsItAHint routine and use it whenever hint's node number was - used for testing. - <16> 10/5/94 bk add pools.h include file - <15> 9/30/94 prp Get in sync with D2 interface changes. - <14> 7/22/94 wjk Convert to the new set of header files. - <13> 12/2/93 wjk Move from Makefiles to BuildFiles. Fit into the ModernOS and - NRCmds environments. - <12> 11/30/93 wjk Move from Makefiles to BuildFiles. Fit into the ModernOS and - NRCmds environments. - <11> 11/23/93 wjk Changes required to compile on the RS6000. - <10> 8/31/93 prp Use U64SetU instead of S64Set. - <9> 6/2/93 gs Update for changes to FSErrors.h and add some comments. - <8> 5/21/93 gs Modify UpdateHeader to write out attributes. Remove - Get/UpdateNode from TrySimpleReplace. 
- <7> 5/10/93 gs Add TrySimpleReplace routine. - <6> 3/23/93 gs Change MoveData to take void * instead of Ptr. Add UpdateHeader - and ClearBytes routines. - <5> 2/8/93 gs Add FindIteratorPosition. - <4> 12/10/92 gs Implement CheckKeyDescriptor and the KeyDescriptor interpreter. - <3> 12/8/92 gs Add GetKeyDescriptor, VerifyHeader, and Alloc/Dealloc memory - routines. - <2> 12/2/92 gs Add CompareKeys routine. - <1> 11/15/92 gs first checked in - -*/ - -#include "../headers/BTreesPrivate.h" -#include "../../hfs_btreeio.h" - - -////////////////////////////// Routine Definitions ////////////////////////////// - -/*------------------------------------------------------------------------------- -Routine: CalcKeyRecordSize - Return size of combined key/record structure. - -Function: Rounds keySize and recSize so they will end on word boundaries. - Does NOT add size of offset. - -Input: keySize - length of key (including length field) - recSize - length of record data - -Output: none - -Result: u_int16_t - size of combined key/record that will be inserted in btree --------------------------------------------------------------------------------*/ - -u_int16_t CalcKeyRecordSize (u_int16_t keySize, - u_int16_t recSize ) -{ - if ( M_IsOdd (keySize) ) keySize += 1; // pad byte - - if (M_IsOdd (recSize) ) recSize += 1; // pad byte - - return (keySize + recSize); -} - - - -/*------------------------------------------------------------------------------- -Routine: VerifyHeader - Validate fields of the BTree header record. - -Function: Examines the fields of the BTree header record to determine if the - fork appears to contain a valid BTree. 
- -Input: forkPtr - pointer to fork control block - header - pointer to BTree header - - -Result: noErr - success - != noErr - failure --------------------------------------------------------------------------------*/ - -OSStatus VerifyHeader (FCB *filePtr, - BTHeaderRec *header ) -{ - u_int64_t forkSize; - u_int32_t totalNodes; - - - switch (header->nodeSize) // node size == 512*2^n - { - case 512: - case 1024: - case 2048: - case 4096: - case 8192: - case 16384: - case 32768: break; - default: return fsBTInvalidHeaderErr; //�� E_BadNodeType - } - - totalNodes = header->totalNodes; - - forkSize = (u_int64_t)totalNodes * (u_int64_t)header->nodeSize; - - if ( forkSize > (u_int64_t)filePtr->fcbEOF ) - return fsBTInvalidHeaderErr; - - if ( header->freeNodes >= totalNodes ) - return fsBTInvalidHeaderErr; - - if ( header->rootNode >= totalNodes ) - return fsBTInvalidHeaderErr; - - if ( header->firstLeafNode >= totalNodes ) - return fsBTInvalidHeaderErr; - - if ( header->lastLeafNode >= totalNodes ) - return fsBTInvalidHeaderErr; - - if ( header->treeDepth > kMaxTreeDepth ) - return fsBTInvalidHeaderErr; - - - /////////////////////////// Check BTree Type //////////////////////////////// - - switch (header->btreeType) - { - case 0: // HFS Type - no Key Descriptor - case kUserBTreeType: // with Key Descriptors etc. - case kReservedBTreeType: // Desktop Mgr BTree ? - break; - - default: return fsBTUnknownVersionErr; - } - - return noErr; -} - - - -__private_extern__ -OSStatus TreeIsDirty(BTreeControlBlockPtr btreePtr) -{ - return (btreePtr->flags & kBTHeaderDirty); -} - - - -/*------------------------------------------------------------------------------- -Routine: UpdateHeader - Write BTreeInfoRec fields to Header node. - -Function: Checks the kBTHeaderDirty flag in the BTreeInfoRec and updates the - header node if necessary. 
- -Input: btreePtr - pointer to BTreeInfoRec - - -Result: noErr - success - != noErr - failure --------------------------------------------------------------------------------*/ - -OSStatus UpdateHeader(BTreeControlBlockPtr btreePtr, Boolean forceWrite) -{ - OSStatus err; - BlockDescriptor node; - BTHeaderRec *header; - u_int32_t options; - - if ((btreePtr->flags & kBTHeaderDirty) == 0) // btree info already flushed - return noErr; - - - err = GetNode (btreePtr, kHeaderNodeNum, 0, &node ); - if (err != noErr) { - return err; - } - - // XXXdbg - ModifyBlockStart(btreePtr->fileRefNum, &node); - - header = (BTHeaderRec*) ((char *)node.buffer + sizeof(BTNodeDescriptor)); - - header->treeDepth = btreePtr->treeDepth; - header->rootNode = btreePtr->rootNode; - header->leafRecords = btreePtr->leafRecords; - header->firstLeafNode = btreePtr->firstLeafNode; - header->lastLeafNode = btreePtr->lastLeafNode; - header->nodeSize = btreePtr->nodeSize; //�� this shouldn't change - header->maxKeyLength = btreePtr->maxKeyLength; //�� neither should this - header->totalNodes = btreePtr->totalNodes; - header->freeNodes = btreePtr->freeNodes; - header->btreeType = btreePtr->btreeType; - - // ignore header->clumpSize; //�� rename this field? - - if (forceWrite) - options = kForceWriteBlock; - else - options = kLockTransaction; - - err = UpdateNode (btreePtr, &node, 0, options); - - btreePtr->flags &= (~kBTHeaderDirty); - - return err; -} - - - -/*------------------------------------------------------------------------------- -Routine: FindIteratorPosition - One_line_description. 
- -Function: Brief_description_of_the_function_and_any_side_effects - -Algorithm: see FSC.BT.BTIterateRecord.PICT - -Note: //�� document side-effects of bad node hints - -Input: btreePtr - description - iterator - description - - -Output: iterator - description - left - description - middle - description - right - description - nodeNum - description - returnIndex - description - foundRecord - description - - -Result: noErr - success - != noErr - failure --------------------------------------------------------------------------------*/ - -OSStatus FindIteratorPosition (BTreeControlBlockPtr btreePtr, - BTreeIteratorPtr iterator, - BlockDescriptor *left, - BlockDescriptor *middle, - BlockDescriptor *right, - u_int32_t *returnNodeNum, - u_int16_t *returnIndex, - Boolean *foundRecord ) -{ - OSStatus err; - Boolean foundIt; - u_int32_t nodeNum; - u_int16_t leftIndex, index, rightIndex; - Boolean validHint; - - // assume btreePtr valid - // assume left, middle, right point to BlockDescriptors - // assume nodeNum points to u_int32_t - // assume index points to u_int16_t - // assume foundRecord points to Boolean - - left->buffer = nil; - left->blockHeader = nil; - middle->buffer = nil; - middle->blockHeader = nil; - right->buffer = nil; - right->blockHeader = nil; - - foundIt = false; - - if (iterator == nil) // do we have an iterator? - { - err = fsBTInvalidIteratorErr; - goto ErrorExit; - } - - err = IsItAHint (btreePtr, iterator, &validHint); - M_ExitOnError (err); - - nodeNum = iterator->hint.nodeNum; - if (! validHint) // does the hint appear to be valid? 
- { - goto SearchTheTree; - } - - err = GetNode (btreePtr, nodeNum, kGetNodeHint, middle); - if( err == fsBTInvalidNodeErr ) // returned if nodeNum is out of range - goto SearchTheTree; - - M_ExitOnError (err); - - if ( ((NodeDescPtr) middle->buffer)->kind != kBTLeafNode || - ((NodeDescPtr) middle->buffer)->numRecords <= 0 ) - { - goto SearchTheTree; - } - - foundIt = SearchNode (btreePtr, middle->buffer, &iterator->key, &index); - if (foundIt == true) - { - ++btreePtr->numValidHints; - goto SuccessfulExit; - } - iterator->hint.nodeNum = 0; - - if (index == 0) - { - if (((NodeDescPtr) middle->buffer)->bLink == 0) // before 1st btree record - { - goto SuccessfulExit; - } - - nodeNum = ((NodeDescPtr) middle->buffer)->bLink; - - // BTree nodes are always grabbed in left to right order. - // Therefore release the current node before looking up the - // left node. - err = ReleaseNode(btreePtr, middle); - M_ExitOnError(err); - - // Look up the left node - err = GetNode (btreePtr, nodeNum, 0, left); - M_ExitOnError (err); - - // Look up the current node again - err = GetRightSiblingNode (btreePtr, left->buffer, middle); - M_ExitOnError (err); - - if ( ((NodeDescPtr) left->buffer)->kind != kBTLeafNode || - ((NodeDescPtr) left->buffer)->numRecords <= 0 ) - { - goto SearchTheTree; - } - - foundIt = SearchNode (btreePtr, left->buffer, &iterator->key, &leftIndex); - if (foundIt == true) - { - *right = *middle; - *middle = *left; - left->buffer = nil; - index = leftIndex; - - goto SuccessfulExit; - } - - if (leftIndex == 0) // we're lost! - { - goto SearchTheTree; - } - else if (leftIndex >= ((NodeDescPtr) left->buffer)->numRecords) - { - nodeNum = ((NodeDescPtr) left->buffer)->fLink; - - PanicIf (index != 0, "FindIteratorPosition: index != 0"); //�� just checking... 
- goto SuccessfulExit; - } - else - { - *right = *middle; - *middle = *left; - left->buffer = nil; - index = leftIndex; - - goto SuccessfulExit; - } - } - else if (index >= ((NodeDescPtr) middle->buffer)->numRecords) - { - if (((NodeDescPtr) middle->buffer)->fLink == 0) // beyond last record - { - goto SuccessfulExit; - } - - nodeNum = ((NodeDescPtr) middle->buffer)->fLink; - - err = GetRightSiblingNode (btreePtr, middle->buffer, right); - M_ExitOnError (err); - - if ( ((NodeDescPtr) right->buffer)->kind != kBTLeafNode || - ((NodeDescPtr) right->buffer)->numRecords <= 0 ) - { - goto SearchTheTree; - } - - foundIt = SearchNode (btreePtr, right->buffer, &iterator->key, &rightIndex); - if (rightIndex >= ((NodeDescPtr) right->buffer)->numRecords) // we're lost - { - goto SearchTheTree; - } - else // we found it, or rightIndex==0, or rightIndexbuffer = nil; - index = rightIndex; - - goto SuccessfulExit; - } - } - - - //////////////////////////// Search The Tree //////////////////////////////// - -SearchTheTree: - { - TreePathTable treePathTable; // so we only use stack space if we need to - - err = ReleaseNode (btreePtr, left); M_ExitOnError (err); - err = ReleaseNode (btreePtr, middle); M_ExitOnError (err); - err = ReleaseNode (btreePtr, right); M_ExitOnError (err); - - err = SearchTree ( btreePtr, &iterator->key, treePathTable, &nodeNum, middle, &index); - switch (err) //�� separate find condition from exceptions - { - case noErr: foundIt = true; break; - case fsBTRecordNotFoundErr: break; - default: goto ErrorExit; - } - } - - /////////////////////////////// Success! 
//////////////////////////////////// - -SuccessfulExit: - - *returnNodeNum = nodeNum; - *returnIndex = index; - *foundRecord = foundIt; - - return noErr; - - - ////////////////////////////// Error Exit /////////////////////////////////// - -ErrorExit: - - (void) ReleaseNode (btreePtr, left); - (void) ReleaseNode (btreePtr, middle); - (void) ReleaseNode (btreePtr, right); - - *returnNodeNum = 0; - *returnIndex = 0; - *foundRecord = false; - - return err; -} - - - -/////////////////////////////// CheckInsertParams /////////////////////////////// - -OSStatus CheckInsertParams (FCB *filePtr, - BTreeIterator *iterator, - FSBufferDescriptor *record, - u_int16_t recordLen ) -{ - BTreeControlBlockPtr btreePtr; - - if (filePtr == nil) return paramErr; - - btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr; - if (btreePtr == nil) return fsBTInvalidFileErr; - if (iterator == nil) return paramErr; - if (record == nil) return paramErr; - - // check total key/record size limit - if ( CalcKeyRecordSize (CalcKeySize(btreePtr, &iterator->key), recordLen) > (btreePtr->nodeSize >> 1)) - return fsBTRecordTooLargeErr; - - return noErr; -} - - - -/*------------------------------------------------------------------------------- -Routine: TrySimpleReplace - Attempts a simple insert, set, or replace. - -Function: If a hint exitst for the iterator, attempt to find the key in the hint - node. If the key is found, an insert operation fails. If the is not - found, a replace operation fails. If the key was not found, and the - insert position is greater than 0 and less than numRecords, the record - is inserted, provided there is enough freeSpace. If the key was found, - and there is more freeSpace than the difference between the new record - and the old record, the old record is deleted and the new record is - inserted. 
- -Assumptions: iterator key has already been checked by CheckKey - - -Input: btreePtr - description - iterator - description - record - description - recordLen - description - operation - description - - -Output: recordInserted - description - - -Result: noErr - success - E_RecordExits - insert operation failure - != noErr - GetNode, ReleaseNode, UpdateNode returned an error --------------------------------------------------------------------------------*/ - -OSStatus TrySimpleReplace (BTreeControlBlockPtr btreePtr, - NodeDescPtr nodePtr, - BTreeIterator *iterator, - FSBufferDescriptor *record, - u_int16_t recordLen, - Boolean *recordInserted ) -{ - u_int32_t oldSpace; - u_int32_t spaceNeeded; - u_int16_t index; - u_int16_t keySize; - Boolean foundIt; - Boolean didItFit; - - - *recordInserted = false; // we'll assume this won't work... - - if ( nodePtr->kind != kBTLeafNode ) - return noErr; // we're in the weeds! - - foundIt = SearchNode (btreePtr, nodePtr, &iterator->key, &index); - - if ( foundIt == false ) - return noErr; // we might be lost... - - keySize = CalcKeySize(btreePtr, &iterator->key); // includes length field - - spaceNeeded = CalcKeyRecordSize (keySize, recordLen); - - oldSpace = GetRecordSize (btreePtr, nodePtr, index); - - if ( spaceNeeded == oldSpace ) - { - u_int8_t * dst; - - dst = GetRecordAddress (btreePtr, nodePtr, index); - - if ( M_IsOdd (keySize) ) - ++keySize; // add pad byte - - dst += keySize; // skip over key to point at record - - BlockMoveData(record->bufferAddress, dst, recordLen); // blast away... 
- - *recordInserted = true; - } - else if ( (GetNodeFreeSize(btreePtr, nodePtr) + oldSpace) >= spaceNeeded) - { - DeleteRecord (btreePtr, nodePtr, index); - - didItFit = InsertKeyRecord (btreePtr, nodePtr, index, - &iterator->key, KeyLength(btreePtr, &iterator->key), - record->bufferAddress, recordLen); - PanicIf (didItFit == false, "TrySimpleInsert: InsertKeyRecord returned false!"); - - *recordInserted = true; - } - // else not enough space... - - return noErr; -} - - -/*------------------------------------------------------------------------------- -Routine: IsItAHint - checks the hint within a BTreeInterator. - -Function: checks the hint within a BTreeInterator. If it is non-zero, it may - possibly be valid. - -Input: btreePtr - pointer to control block for BTree file - iterator - pointer to BTreeIterator - -Output: answer - true if the hint looks reasonable - - false if the hint is 0 - -Result: noErr - success --------------------------------------------------------------------------------*/ - - -OSStatus IsItAHint (BTreeControlBlockPtr btreePtr, BTreeIterator *iterator, Boolean *answer) -{ - ++btreePtr->numHintChecks; - -#if DEBUG_BUILD - if (iterator->hint.nodeNum >= btreePtr->totalNodes) - { - *answer = false; - } else - -#endif - if (iterator->hint.nodeNum == 0) - { - *answer = false; - } - else - { - *answer = true; - ++btreePtr->numPossibleHints; - } - - return noErr; -} diff --git a/bsd/hfs/hfscommon/BTree/BTreeNodeOps.c b/bsd/hfs/hfscommon/BTree/BTreeNodeOps.c deleted file mode 100644 index 89f4eaf13..000000000 --- a/bsd/hfs/hfscommon/BTree/BTreeNodeOps.c +++ /dev/null @@ -1,1068 +0,0 @@ -/* - * Copyright (c) 2000, 2002, 2005-2008 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). 
You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - File: BTreeNodeOps.c - - Contains: Single-node operations for the BTree Module. - - Version: xxx put the technology version here xxx - - Written by: Gordon Sheridan and Bill Bruffey - - Copyright: � 1992-1999 by Apple Computer, Inc., all rights reserved. - - File Ownership: - - DRI: Don Brady - - Other Contact: Mark Day - - Technology: File Systems - - Writers: - - (msd) Mark Day - (djb) Don Brady - - Change History (most recent first): - - 6/1/99 djb Sync up with Mac OS 8.6. - 4/113/99 djb Fix key size checking bug in CheckNode. - 3/19/99 djb Added key size checking to CheckNode. - 3/26/98 djb Added PrintNode for debugging. - 9/4/97 djb Removed GetRightSiblingNode and GetLeftSiblingNode - they are - now macros. SearchNode is now in BTreeSearchNode.a. - 8/22/97 djb Turn off debugging code in CheckKey. - 7/24/97 djb Add summary traces for Get/Rel Node. Made GetRecordOffset into a - macro. 
Only call CheckNode if the node came from disk. - 7/21/97 msd Make GetRecordByIndex check its record index input; it now - returns an OSStatus. - 4/23/97 djb first checked in - - 2/19/97 djb Changes to support big node cache. - 1/3/97 djb Added support for large keys. - 12/19/96 djb first checked in - - - History applicable to original Scarecrow Design: - - <6> 10/25/96 ser Changing for new VFPI - <5> 9/17/96 dkh Add bounds checking to GetNode. Update GetNode to not assert - that CheckNode failed if the node is all zeroes. This can happen - if the hint case if the fetched node has been deallocated - <4> 3/7/96 dkh Change GetNewNode() to not use kGetEmptyBlock. Instead use - kGetBlock to fetch a block from the disk itself. ��� Why? - <3> 1/22/96 dkh Add #include Memory.h - <2> 1/10/96 msd Change 64-bit math to use real function names from Math64.i. - <1> 10/18/95 rst Moved from Scarecrow project. - - <17> 7/18/95 mbb Change MoveData & ClearBytes to BlockMoveData & BlockZero. - <16> 1/31/95 prp GetBlockProc interface uses a 64 bit node number. - <15> 1/12/95 wjk Adopt Model FileSystem changes in D5. - <14> 9/30/94 prp Get in sync with D2 interface changes. - <13> 7/25/94 wjk Eliminate usage of BytePtr in favor of UInt8 *. - <12> 7/22/94 wjk Convert to the new set of header files. - <11> 12/2/93 wjk Move from Makefiles to BuildFiles. Fit into the ModernOS and - NRCmds environments. - <10> 11/30/93 wjk Change some Ptr's to BytePtr's in function definitions so they - agree with their prototypes. - <9> 8/31/93 prp Use U64SetU instead of S64Set. - <8> 5/21/93 gs Maintain statistical counters on Get/Release node routines. - <7> 5/10/93 gs Change keySize parameter to keyLength for InsertKeyRecord - routine. Calculate number of bytes in key from keyLength to - account for length and pad bytes. Add GetChildNodeNum routine. - <6> 3/23/93 gs Add InsertKeyRecord routine. 
- <5> 2/8/93 gs Fix bug in SearchNode that caused "off by 1" error when final - compare was searchKey > trialKey. Add UpdateNode. - <4> 12/10/92 gs Change keyLength field of key to 'length'. - <3> 12/8/92 gs Incorporate suggestions from preliminary code review. - <2> 12/2/92 gs Implement routines. - <1> 11/15/92 gs Define routine interfaces. - -*/ - -#include "../headers/BTreesPrivate.h" - - - -///////////////////////// BTree Module Node Operations ////////////////////////// -// -// GetNode - Call FS Agent to get node -// GetNewNode - Call FS Agent to get a new node -// ReleaseNode - Call FS Agent to release node obtained by GetNode. -// UpdateNode - Mark a node as dirty and call FS Agent to release it. -// -// ClearNode - Clear a node to all zeroes. -// -// InsertRecord - Inserts a record into a BTree node. -// InsertKeyRecord - Inserts a key and record pair into a BTree node. -// DeleteRecord - Deletes a record from a BTree node. -// -// SearchNode - Return index for record that matches key. -// LocateRecord - Return pointer to key and data, and size of data. -// -// GetNodeDataSize - Return the amount of space used for data in the node. -// GetNodeFreeSize - Return the amount of free space in the node. -// -// GetRecordOffset - Return the offset for record "index". -// GetRecordAddress - Return address of record "index". -// GetOffsetAddress - Return address of offset for record "index". -// -// InsertOffset - Inserts a new offset into a node. -// DeleteOffset - Deletes an offset from a node. 
-// -///////////////////////////////////////////////////////////////////////////////// - - - -////////////////////// Routines Internal To BTreeNodeOps.c ////////////////////// - -u_int16_t GetRecordOffset (BTreeControlBlockPtr btree, - NodeDescPtr node, - u_int16_t index ); - -u_int16_t *GetOffsetAddress (BTreeControlBlockPtr btreePtr, - NodeDescPtr node, - u_int16_t index ); - -void InsertOffset (BTreeControlBlockPtr btreePtr, - NodeDescPtr node, - u_int16_t index, - u_int16_t delta ); - -void DeleteOffset (BTreeControlBlockPtr btreePtr, - NodeDescPtr node, - u_int16_t index ); - - -///////////////////////////////////////////////////////////////////////////////// - -#define GetRecordOffset(btreePtr,node,index) (*(short *) ((u_int8_t *)(node) + (btreePtr)->nodeSize - ((index) << 1) - kOffsetSize)) - -#if HFS_DIAGNOSTIC - #include - #define PRINTIT kprintf -static void PrintNode(const NodeDescPtr node, u_int16_t nodeSize, u_int32_t nodeNumber); -#endif /* HFS_DIAGNOSTIC */ - - - - -/*------------------------------------------------------------------------------- - -Routine: GetNode - Call FS Agent to get node - -Function: Gets an existing BTree node from FS Agent and verifies it. - -Input: btreePtr - pointer to BTree control block - nodeNum - number of node to request - -Output: nodePtr - pointer to beginning of node (nil if error) - -Result: - noErr - success - != noErr - failure --------------------------------------------------------------------------------*/ - -OSStatus GetNode (BTreeControlBlockPtr btreePtr, - u_int32_t nodeNum, - u_int32_t flags, - NodeRec *nodePtr ) -{ - OSStatus err; - GetBlockProcPtr getNodeProc; - u_int32_t options; - - - // is nodeNum within proper range? 
- if( nodeNum >= btreePtr->totalNodes ) - { - Panic("GetNode:nodeNum >= totalNodes"); - err = fsBTInvalidNodeErr; - goto ErrorExit; - } - - nodePtr->blockSize = btreePtr->nodeSize; // indicate the size of a node - - options = kGetBlock; - if ( flags & kGetNodeHint ) - { - options |= kGetBlockHint; - } - - getNodeProc = btreePtr->getBlockProc; - err = getNodeProc (btreePtr->fileRefNum, - nodeNum, - options, - nodePtr ); - - if (err != noErr) - { - Panic ("GetNode: getNodeProc returned error."); - goto ErrorExit; - } - ++btreePtr->numGetNodes; - - return noErr; - -ErrorExit: - nodePtr->buffer = nil; - nodePtr->blockHeader = nil; - - return err; -} - - - -/*------------------------------------------------------------------------------- - -Routine: GetNewNode - Call FS Agent to get a new node - -Function: Gets a new BTree node from FS Agent and initializes it to an empty - state. - -Input: btreePtr - pointer to BTree control block - nodeNum - number of node to request - -Output: returnNodePtr - pointer to beginning of node (nil if error) - -Result: noErr - success - != noErr - failure --------------------------------------------------------------------------------*/ - -OSStatus GetNewNode (BTreeControlBlockPtr btreePtr, - u_int32_t nodeNum, - NodeRec *returnNodePtr ) -{ - OSStatus err; - NodeDescPtr node; - void *pos; - GetBlockProcPtr getNodeProc; - - - //////////////////////// get buffer for new node //////////////////////////// - - returnNodePtr->blockSize = btreePtr->nodeSize; // indicate the size of a node - - getNodeProc = btreePtr->getBlockProc; - err = getNodeProc (btreePtr->fileRefNum, - nodeNum, - kGetBlock+kGetEmptyBlock, - returnNodePtr ); - - if (err != noErr) - { - Panic ("GetNewNode: getNodeProc returned error."); - // returnNodePtr->buffer = nil; - return err; - } - ++btreePtr->numGetNewNodes; - - - ////////////////////////// initialize the node ////////////////////////////// - - node = returnNodePtr->buffer; - - ClearNode (btreePtr, node); // clear the 
node - - pos = (char *)node + btreePtr->nodeSize - 2; // find address of last offset - *(u_int16_t *)pos = sizeof (BTNodeDescriptor); // set offset to beginning of free space - - - return noErr; -} - - - -/*------------------------------------------------------------------------------- - -Routine: ReleaseNode - Call FS Agent to release node obtained by GetNode. - -Function: Informs the FS Agent that a BTree node may be released. - -Input: btreePtr - pointer to BTree control block - nodeNum - number of node to release - -Result: noErr - success - != noErr - failure --------------------------------------------------------------------------------*/ - -OSStatus ReleaseNode (BTreeControlBlockPtr btreePtr, - NodePtr nodePtr ) -{ - OSStatus err; - ReleaseBlockProcPtr releaseNodeProc; - - - err = noErr; - - if (nodePtr->buffer != nil) - { - releaseNodeProc = btreePtr->releaseBlockProc; - err = releaseNodeProc (btreePtr->fileRefNum, - nodePtr, - kReleaseBlock ); - PanicIf (err, "ReleaseNode: releaseNodeProc returned error."); - ++btreePtr->numReleaseNodes; - } - - nodePtr->buffer = nil; - nodePtr->blockHeader = nil; - - return err; -} - - - - -/*------------------------------------------------------------------------------- - -Routine: TrashNode - Call FS Agent to release node obtained by GetNode, and - not store it...mark it as bad. - -Function: Informs the FS Agent that a BTree node may be released and thrown away. 
- -Input: btreePtr - pointer to BTree control block - nodeNum - number of node to release - -Result: noErr - success - != noErr - failure --------------------------------------------------------------------------------*/ - -OSStatus TrashNode (BTreeControlBlockPtr btreePtr, - NodePtr nodePtr ) -{ - OSStatus err; - ReleaseBlockProcPtr releaseNodeProc; - - - err = noErr; - - if (nodePtr->buffer != nil) - { - releaseNodeProc = btreePtr->releaseBlockProc; - err = releaseNodeProc (btreePtr->fileRefNum, - nodePtr, - kReleaseBlock | kTrashBlock ); - PanicIf (err, "TrashNode: releaseNodeProc returned error."); - ++btreePtr->numReleaseNodes; - } - - nodePtr->buffer = nil; - nodePtr->blockHeader = nil; - - return err; -} - - - -/*------------------------------------------------------------------------------- - -Routine: UpdateNode - Mark a node as dirty and call FS Agent to release it. - -Function: Marks a BTree node dirty and informs the FS Agent that it may be released. - -Input: btreePtr - pointer to BTree control block - nodeNum - number of node to release - transactionID - ID of transaction this node update is a part of - flags - special flags to pass to ReleaseNodeProc - -Result: noErr - success - != noErr - failure --------------------------------------------------------------------------------*/ - -OSStatus UpdateNode (BTreeControlBlockPtr btreePtr, - NodePtr nodePtr, - u_int32_t transactionID, - u_int32_t flags ) -{ -#pragma unused(transactionID) - - OSStatus err; - ReleaseBlockProcPtr releaseNodeProc; - - - err = noErr; - - if (nodePtr->buffer != nil) // Why call UpdateNode if nil ?!? 
- { - releaseNodeProc = btreePtr->releaseBlockProc; - err = releaseNodeProc (btreePtr->fileRefNum, - nodePtr, - flags | kMarkBlockDirty ); - ++btreePtr->numUpdateNodes; - M_ExitOnError (err); - } - - nodePtr->buffer = nil; - nodePtr->blockHeader = nil; - - return noErr; - -ErrorExit: - - return err; -} - - - -#if HFS_DIAGNOSTIC -static void PrintNode(const NodeDescPtr node, u_int16_t nodeSize, u_int32_t nodeNumber) -{ - struct row { - u_int16_t word[8]; - }; - struct row *offset; - u_int16_t rows; - u_int32_t *lp; - - PRINTIT("Dump of B-tree node #%ld ($%08lX)\n", nodeNumber, nodeNumber); - - rows = nodeSize/16; - lp = (u_int32_t*) node; - offset = 0; - - while (rows-- > 0) - PRINTIT("%04X: %08lX %08lX %08lX %08lX\n", (u_int)offset++, *lp++, *lp++, *lp++, *lp++); -} -#endif - - -/*------------------------------------------------------------------------------- - -Routine: ClearNode - Clear a node to all zeroes. - -Function: Writes zeroes from beginning of node for nodeSize bytes. - -Input: btreePtr - pointer to BTree control block - node - pointer to node to clear - -Result: none --------------------------------------------------------------------------------*/ - -void ClearNode (BTreeControlBlockPtr btreePtr, NodeDescPtr node ) -{ - ClearMemory( node, btreePtr->nodeSize ); -} - -/*------------------------------------------------------------------------------- - -Routine: InsertRecord - Inserts a record into a BTree node. - -Function: - -Note: Record size must be even! - -Input: btreePtr - pointer to BTree control block - node - pointer to node to insert the record - index - position record is to be inserted - recPtr - pointer to record to insert - -Result: noErr - success - fsBTFullErr - record larger than remaining free space. 
--------------------------------------------------------------------------------*/ - -Boolean InsertRecord (BTreeControlBlockPtr btreePtr, - NodeDescPtr node, - u_int16_t index, - RecordPtr recPtr, - u_int16_t recSize ) -{ - u_int16_t freeSpace; - u_int16_t indexOffset; - u_int16_t freeOffset; - u_int16_t bytesToMove; - void *src; - void *dst; - - //// will new record fit in node? - - freeSpace = GetNodeFreeSize (btreePtr, node); - //�� we could get freeOffset & calc freeSpace - if ( freeSpace < recSize + 2) - { - return false; - } - - - //// make hole for new record - - indexOffset = GetRecordOffset (btreePtr, node, index); - freeOffset = GetRecordOffset (btreePtr, node, node->numRecords); - - src = ((Ptr) node) + indexOffset; - dst = ((Ptr) src) + recSize; - bytesToMove = freeOffset - indexOffset; - if (bytesToMove) - MoveRecordsRight (src, dst, bytesToMove); - - - //// adjust offsets for moved records - - InsertOffset (btreePtr, node, index, recSize); - - - //// move in the new record - - dst = ((Ptr) node) + indexOffset; - MoveRecordsLeft (recPtr, dst, recSize); - - return true; -} - - - -/*------------------------------------------------------------------------------- - -Routine: InsertKeyRecord - Inserts a record into a BTree node. - -Function: - -Note: Record size must be even! - -Input: btreePtr - pointer to BTree control block - node - pointer to node to insert the record - index - position record is to be inserted - keyPtr - pointer to key for record to insert - keyLength - length of key (or maxKeyLength) - recPtr - pointer to record to insert - recSize - number of bytes to copy for record - -Result: noErr - success - fsBTFullErr - record larger than remaining free space. 
--------------------------------------------------------------------------------*/ - -Boolean InsertKeyRecord (BTreeControlBlockPtr btreePtr, - NodeDescPtr node, - u_int16_t index, - KeyPtr keyPtr, - u_int16_t keyLength, - RecordPtr recPtr, - u_int16_t recSize ) -{ - u_int16_t freeSpace; - u_int16_t indexOffset; - u_int16_t freeOffset; - u_int16_t bytesToMove; - u_int8_t * src; - u_int8_t * dst; - u_int16_t keySize; - u_int16_t rawKeyLength; - u_int16_t sizeOfLength; - - //// calculate actual key size - - if ( btreePtr->attributes & kBTBigKeysMask ) - keySize = keyLength + sizeof(u_int16_t); - else - keySize = keyLength + sizeof(u_int8_t); - - if ( M_IsOdd (keySize) ) - ++keySize; // add pad byte - - - //// will new record fit in node? - - freeSpace = GetNodeFreeSize (btreePtr, node); - //�� we could get freeOffset & calc freeSpace - if ( freeSpace < keySize + recSize + 2) - { - return false; - } - - - //// make hole for new record - - indexOffset = GetRecordOffset (btreePtr, node, index); - freeOffset = GetRecordOffset (btreePtr, node, node->numRecords); - - src = ((u_int8_t *) node) + indexOffset; - dst = ((u_int8_t *) src) + keySize + recSize; - bytesToMove = freeOffset - indexOffset; - if (bytesToMove) - MoveRecordsRight (src, dst, bytesToMove); - - - //// adjust offsets for moved records - - InsertOffset (btreePtr, node, index, keySize + recSize); - - - //// copy record key - - dst = ((u_int8_t *) node) + indexOffset; - - if ( btreePtr->attributes & kBTBigKeysMask ) - { - *((u_int16_t *)dst) = keyLength; // use keyLength rather than key.length - dst = (u_int8_t *) (((u_int16_t *)dst) + 1); - rawKeyLength = keyPtr->length16; - sizeOfLength = 2; - } - else - { - *dst++ = keyLength; // use keyLength rather than key.length - rawKeyLength = keyPtr->length8; - sizeOfLength = 1; - } - - MoveRecordsLeft ( ((u_int8_t *) keyPtr) + sizeOfLength, dst, rawKeyLength); // copy key - - // any pad bytes? 
- bytesToMove = keySize - rawKeyLength; - if (bytesToMove) - ClearMemory (dst + rawKeyLength, bytesToMove); // clear pad bytes in index key - - - //// copy record data - - dst = ((u_int8_t *) node) + indexOffset + keySize; - MoveRecordsLeft (recPtr, dst, recSize); - - return true; -} - - - -/*------------------------------------------------------------------------------- - -Routine: DeleteRecord - Deletes a record from a BTree node. - -Function: - -Input: btreePtr - pointer to BTree control block - node - pointer to node to insert the record - index - position record is to be inserted - -Result: none --------------------------------------------------------------------------------*/ - -void DeleteRecord (BTreeControlBlockPtr btreePtr, - NodeDescPtr node, - u_int16_t index ) -{ - int16_t indexOffset; - int16_t nextOffset; - int16_t freeOffset; - int16_t bytesToMove; - void *src; - void *dst; - - //// compress records - indexOffset = GetRecordOffset (btreePtr, node, index); - nextOffset = GetRecordOffset (btreePtr, node, index + 1); - freeOffset = GetRecordOffset (btreePtr, node, node->numRecords); - - src = ((Ptr) node) + nextOffset; - dst = ((Ptr) node) + indexOffset; - bytesToMove = freeOffset - nextOffset; - if (bytesToMove) - MoveRecordsLeft (src, dst, bytesToMove); - - //// Adjust the offsets - DeleteOffset (btreePtr, node, index); - - /* clear out new free space */ - bytesToMove = nextOffset - indexOffset; - ClearMemory(GetRecordAddress(btreePtr, node, node->numRecords), bytesToMove); - -} - - - -/*------------------------------------------------------------------------------- - -Routine: SearchNode - Return index for record that matches key. - -Function: Returns the record index for the record that matches the search key. - If no record was found that matches the search key, the "insert index" - of where the record should go is returned instead. - -Algorithm: A binary search algorithm is used to find the specified key. 
- -Input: btreePtr - pointer to BTree control block - node - pointer to node that contains the record - searchKey - pointer to the key to match - -Output: index - pointer to beginning of key for record - -Result: true - success (index = record index) - false - key did not match anything in node (index = insert index) --------------------------------------------------------------------------------*/ -Boolean -SearchNode( BTreeControlBlockPtr btreePtr, - NodeDescPtr node, - KeyPtr searchKey, - u_int16_t *returnIndex ) -{ - int32_t lowerBound; - int32_t upperBound; - int32_t index; - int32_t result; - KeyPtr trialKey; - u_int16_t *offset; - KeyCompareProcPtr compareProc = btreePtr->keyCompareProc; - - lowerBound = 0; - upperBound = node->numRecords - 1; - offset = (u_int16_t *) ((u_int8_t *)(node) + (btreePtr)->nodeSize - kOffsetSize); - - while (lowerBound <= upperBound) { - index = (lowerBound + upperBound) >> 1; - - trialKey = (KeyPtr) ((u_int8_t *)node + *(offset - index)); - - result = compareProc(searchKey, trialKey); - - if (result < 0) { - upperBound = index - 1; /* search < trial */ - } else if (result > 0) { - lowerBound = index + 1; /* search > trial */ - } else { - *returnIndex = index; /* search == trial */ - return true; - } - } - - *returnIndex = lowerBound; /* lowerBound is insert index */ - return false; -} - - -/*------------------------------------------------------------------------------- - -Routine: GetRecordByIndex - Return pointer to key and data, and size of data. - -Function: Returns a pointer to beginning of key for record, a pointer to the - beginning of the data for the record, and the size of the record data - (does not include the size of the key). 
- -Input: btreePtr - pointer to BTree control block - node - pointer to node that contains the record - index - index of record to get - -Output: keyPtr - pointer to beginning of key for record - dataPtr - pointer to beginning of data for record - dataSize - size of the data portion of the record - -Result: none --------------------------------------------------------------------------------*/ - -OSStatus GetRecordByIndex (BTreeControlBlockPtr btreePtr, - NodeDescPtr node, - u_int16_t index, - KeyPtr *keyPtr, - u_int8_t * *dataPtr, - u_int16_t *dataSize ) -{ - u_int16_t offset; - u_int16_t nextOffset; - u_int16_t keySize; - - // - // Make sure index is valid (in range 0..numRecords-1) - // - if (index >= node->numRecords) - return fsBTRecordNotFoundErr; - - //// find keyPtr - offset = GetRecordOffset (btreePtr, node, index); - *keyPtr = (KeyPtr) ((Ptr)node + offset); - - //// find dataPtr - keySize = CalcKeySize(btreePtr, *keyPtr); - if ( M_IsOdd (keySize) ) - ++keySize; // add pad byte - - offset += keySize; // add the key length to find data offset - *dataPtr = (u_int8_t *) node + offset; - - //// find dataSize - nextOffset = GetRecordOffset (btreePtr, node, index + 1); - *dataSize = nextOffset - offset; - - return noErr; -} - - - -/*------------------------------------------------------------------------------- - -Routine: GetNodeDataSize - Return the amount of space used for data in the node. - -Function: Gets the size of the data currently contained in a node, excluding - the node header. (record data + offset overhead) - -Input: btreePtr - pointer to BTree control block - node - pointer to node that contains the record - -Result: - number of bytes used for data and offsets in the node. 
--------------------------------------------------------------------------------*/ - -u_int16_t GetNodeDataSize (BTreeControlBlockPtr btreePtr, NodeDescPtr node ) -{ - u_int16_t freeOffset; - - freeOffset = GetRecordOffset (btreePtr, node, node->numRecords); - - return freeOffset + (node->numRecords << 1) - sizeof (BTNodeDescriptor); -} - - - -/*------------------------------------------------------------------------------- - -Routine: GetNodeFreeSize - Return the amount of free space in the node. - -Function: - -Input: btreePtr - pointer to BTree control block - node - pointer to node that contains the record - -Result: - number of bytes of free space in the node. --------------------------------------------------------------------------------*/ - -u_int16_t GetNodeFreeSize (BTreeControlBlockPtr btreePtr, NodeDescPtr node ) -{ - u_int16_t freeOffset; - - freeOffset = GetRecordOffset (btreePtr, node, node->numRecords); //�� inline? - - return btreePtr->nodeSize - freeOffset - (node->numRecords << 1) - kOffsetSize; -} - - - -/*------------------------------------------------------------------------------- - -Routine: GetRecordOffset - Return the offset for record "index". - -Function: - -Input: btreePtr - pointer to BTree control block - node - pointer to node that contains the record - index - record to obtain offset for - -Result: - offset (in bytes) from beginning of node of record specified by index --------------------------------------------------------------------------------*/ -// make this a macro (for inlining) -#if 0 -u_int16_t GetRecordOffset (BTreeControlBlockPtr btreePtr, - NodeDescPtr node, - u_int16_t index ) -{ - void *pos; - - - pos = (u_int8_t *)node + btreePtr->nodeSize - (index << 1) - kOffsetSize; - - return *(short *)pos; -} -#endif - - - -/*------------------------------------------------------------------------------- - -Routine: GetRecordAddress - Return address of record "index". 
- -Function: - -Input: btreePtr - pointer to BTree control block - node - pointer to node that contains the record - index - record to obtain offset address for - -Result: - pointer to record "index". --------------------------------------------------------------------------------*/ -// make this a macro (for inlining) -#if 0 -u_int8_t * GetRecordAddress (BTreeControlBlockPtr btreePtr, - NodeDescPtr node, - u_int16_t index ) -{ - u_int8_t * pos; - - pos = (u_int8_t *)node + GetRecordOffset (btreePtr, node, index); - - return pos; -} -#endif - - - -/*------------------------------------------------------------------------------- - -Routine: GetRecordSize - Return size of record "index". - -Function: - -Note: This does not work on the FreeSpace index! - -Input: btreePtr - pointer to BTree control block - node - pointer to node that contains the record - index - record to obtain record size for - -Result: - size of record "index". --------------------------------------------------------------------------------*/ - -u_int16_t GetRecordSize (BTreeControlBlockPtr btreePtr, - NodeDescPtr node, - u_int16_t index ) -{ - u_int16_t *pos; - - pos = (u_int16_t *) ((Ptr)node + btreePtr->nodeSize - (index << 1) - kOffsetSize); - - return *(pos-1) - *pos; -} - - - -/*------------------------------------------------------------------------------- -Routine: GetOffsetAddress - Return address of offset for record "index". - -Function: - -Input: btreePtr - pointer to BTree control block - node - pointer to node that contains the record - index - record to obtain offset address for - -Result: - pointer to offset for record "index". 
--------------------------------------------------------------------------------*/ - -u_int16_t *GetOffsetAddress (BTreeControlBlockPtr btreePtr, - NodeDescPtr node, - u_int16_t index ) -{ - void *pos; - - pos = (Ptr)node + btreePtr->nodeSize - (index << 1) -2; - - return (u_int16_t *)pos; -} - - - -/*------------------------------------------------------------------------------- -Routine: GetChildNodeNum - Return child node number from index record "index". - -Function: Returns the first u_int32_t stored after the key for record "index". - -Assumes: The node is an Index Node. - The key.length stored at record "index" is ODD. //�� change for variable length index keys - -Input: btreePtr - pointer to BTree control block - node - pointer to node that contains the record - index - record to obtain child node number from - -Result: - child node number from record "index". --------------------------------------------------------------------------------*/ - -u_int32_t GetChildNodeNum (BTreeControlBlockPtr btreePtr, - NodeDescPtr nodePtr, - u_int16_t index ) -{ - u_int8_t * pos; - - pos = GetRecordAddress (btreePtr, nodePtr, index); - pos += CalcKeySize(btreePtr, (BTreeKey *) pos); // key.length + size of length field - - return *(u_int32_t *)pos; -} - - - -/*------------------------------------------------------------------------------- -Routine: InsertOffset - Add an offset and adjust existing offsets by delta. - -Function: Add an offset at 'index' by shifting 'index+1' through the last offset - and adjusting them by 'delta', the size of the record to be inserted. - The number of records contained in the node is also incremented. 
- -Input: btreePtr - pointer to BTree control block - node - pointer to node - index - index at which to insert record - delta - size of record to be inserted - -Result: none --------------------------------------------------------------------------------*/ - -void InsertOffset (BTreeControlBlockPtr btreePtr, - NodeDescPtr node, - u_int16_t index, - u_int16_t delta ) -{ - u_int16_t *src, *dst; - u_int16_t numOffsets; - - src = GetOffsetAddress (btreePtr, node, node->numRecords); // point to free offset - dst = src - 1; // point to new offset - numOffsets = node->numRecords++ - index; // subtract index & postincrement - - do { - *dst++ = *src++ + delta; // to tricky? - } while (numOffsets--); -} - - - -/*------------------------------------------------------------------------------- - -Routine: DeleteOffset - Delete an offset. - -Function: Delete the offset at 'index' by shifting 'index+1' through the last offset - and adjusting them by the size of the record 'index'. - The number of records contained in the node is also decremented. - -Input: btreePtr - pointer to BTree control block - node - pointer to node - index - index at which to delete record - -Result: none --------------------------------------------------------------------------------*/ - -void DeleteOffset (BTreeControlBlockPtr btreePtr, - NodeDescPtr node, - u_int16_t index ) -{ - u_int16_t *src, *dst; - u_int16_t numOffsets; - u_int16_t delta; - - dst = GetOffsetAddress (btreePtr, node, index); - src = dst - 1; - delta = *src - *dst; - numOffsets = --node->numRecords - index; // predecrement numRecords & subtract index - - while (numOffsets--) - { - *--dst = *--src - delta; // work our way left - } -} - - diff --git a/bsd/hfs/hfscommon/BTree/BTreeNodeReserve.c b/bsd/hfs/hfscommon/BTree/BTreeNodeReserve.c deleted file mode 100644 index 94577758a..000000000 --- a/bsd/hfs/hfscommon/BTree/BTreeNodeReserve.c +++ /dev/null @@ -1,341 +0,0 @@ -/* - * Copyright (c) 2004-2008 Apple Inc. All rights reserved. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -#include "../headers/BTreesPrivate.h" -#include "sys/malloc.h" -#include - - -/* - * B-tree Node Reserve - * - * BTReserveSpace - * BTReleaseReserve - * BTUpdateReserve - * - * Each kernel thread can have it's own reserve of b-tree - * nodes. This reserve info is kept in a hash table. - * - * Don't forget to call BTReleaseReserve when you're finished - * or you will leave stale node reserves in the hash. - */ - - -/* - * BE CAREFUL WHEN INCREASING THE SIZE OF THIS STRUCT! - * - * It must remain equal in size to the opaque cat_cookie_t - * struct (in hfs_catalog.h). 
- */ -struct nreserve { - LIST_ENTRY(nreserve) nr_hash; /* hash chain */ - int nr_nodecnt; /* count of nodes held in reserve */ - int nr_newnodes; /* nodes that were allocated */ - struct vnode *nr_btvp; /* b-tree file vnode */ - void *nr_tag; /* unique tag (per thread) */ -}; - -#define NR_GET_TAG() (current_thread()) - -#define NR_CACHE 17 - -#define NR_HASH(btvp, tag) \ - (&nr_hashtbl[((((intptr_t)(btvp)) >> 8) ^ ((intptr_t)(tag) >> 4)) & nr_hashmask]) - -LIST_HEAD(nodereserve, nreserve) *nr_hashtbl; - -u_long nr_hashmask; - -lck_grp_t * nr_lck_grp; -lck_grp_attr_t * nr_lck_grp_attr; -lck_attr_t * nr_lck_attr; - -lck_mtx_t nr_mutex; - -/* Internal Node Reserve Hash Routines (private) */ -static void nr_insert (struct vnode *, struct nreserve *nrp, int); -static void nr_delete (struct vnode *, struct nreserve *nrp, int *); -static void nr_update (struct vnode *, int); - - -/* - * BTReserveSetup - initialize the node reserve hash table - */ -__private_extern__ -void -BTReserveSetup() -{ - if (sizeof(struct nreserve) != sizeof(cat_cookie_t)) - panic("hfs: BTReserveSetup: nreserve size != opaque struct size"); - - nr_hashtbl = hashinit(NR_CACHE, M_HFSMNT, &nr_hashmask); - - nr_lck_grp_attr= lck_grp_attr_alloc_init(); - nr_lck_grp = lck_grp_alloc_init("btree_node_reserve", nr_lck_grp_attr); - - nr_lck_attr = lck_attr_alloc_init(); - - lck_mtx_init(&nr_mutex, nr_lck_grp, nr_lck_attr); -} - - -/* - * BTReserveSpace - obtain a node reserve (for current thread) - * - * Used by the Catalog Layer (hfs_catalog.c) to reserve space. - * - * When data is NULL, we only insure that there's enough space - * but it is not reserved (assumes you keep the b-tree lock). 
- */ -__private_extern__ -int -BTReserveSpace(FCB *file, int operations, void* data) -{ - BTreeControlBlock *btree; - int rsrvNodes, availNodes, totalNodes; - int height; - int inserts, deletes; - u_int32_t clumpsize; - int err = 0; - - btree = (BTreeControlBlockPtr)file->fcbBTCBPtr; - clumpsize = file->ff_clumpsize; - - REQUIRE_FILE_LOCK(btree->fileRefNum, true); - - /* - * The node reserve is based on the number of b-tree - * operations (insert/deletes) and the height of the - * tree. - */ - height = btree->treeDepth; - if (height < 2) - height = 2; /* prevent underflow in rsrvNodes calculation */ - inserts = operations & 0xffff; - deletes = operations >> 16; - - /* - * Allow for at least one root split. - * - * Each delete operation can propogate a big key up the - * index. This can cause a split at each level up. - * - * Each insert operation can cause a local split and a - * split at each level up. - */ - rsrvNodes = 1 + (deletes * (height - 2)) + (inserts * (height - 1)); - - availNodes = btree->freeNodes - btree->reservedNodes; - - if (rsrvNodes > availNodes) { - u_int32_t reqblks, freeblks, rsrvblks; - uint32_t bt_rsrv; - struct hfsmount *hfsmp; - - /* - * For UNIX conformance, we try and reserve the MIN of either 5% of - * total file blocks or 10MB worth of blocks, for growing existing - * files. On non-HFS filesystems, creating a new directory entry may - * not cause additional disk space to be allocated, but on HFS, creating - * a new entry could cause the b-tree to grow. As a result, we take - * some precautions here to prevent that on configurations that try to - * satisfy conformance. 
- */ - hfsmp = VTOVCB(btree->fileRefNum); - rsrvblks = ((u_int64_t)hfsmp->allocLimit * 5) / 100; - if (hfsmp->blockSize > HFS_BT_MAXRESERVE) { - bt_rsrv = 1; - } - else { - bt_rsrv = (HFS_BT_MAXRESERVE / hfsmp->blockSize); - } - rsrvblks = MIN(rsrvblks, bt_rsrv); - - freeblks = hfs_freeblks(hfsmp, 0); - if (freeblks <= rsrvblks) { - /* When running low, disallow adding new items. */ - if ((inserts > 0) && (deletes == 0)) { - return (ENOSPC); - } - freeblks = 0; - } else { - freeblks -= rsrvblks; - } - reqblks = clumpsize / hfsmp->blockSize; - - if (reqblks > freeblks) { - reqblks = ((rsrvNodes - availNodes) * btree->nodeSize) / hfsmp->blockSize; - /* When running low, disallow adding new items. */ - if ((reqblks > freeblks) && (inserts > 0) && (deletes == 0)) { - return (ENOSPC); - } - file->ff_clumpsize = freeblks * hfsmp->blockSize; - } - totalNodes = rsrvNodes + btree->totalNodes - availNodes; - - /* See if we also need a map node */ - if (totalNodes > (int)CalcMapBits(btree)) { - ++totalNodes; - } - if ((err = ExtendBTree(btree, totalNodes))) { - goto out; - } - } - /* Save this reserve if this is a persistent request. */ - if (data) { - btree->reservedNodes += rsrvNodes; - nr_insert(btree->fileRefNum, (struct nreserve *)data, rsrvNodes); - } -out: - /* Put clump size back if it was changed. */ - if (file->ff_clumpsize != clumpsize) - file->ff_clumpsize = clumpsize; - - return (err); -} - - -/* - * BTReleaseReserve - release the node reserve held by current thread - * - * Used by the Catalog Layer (hfs_catalog.c) to relinquish reserved space. 
- */ -__private_extern__ -int -BTReleaseReserve(FCB *file, void* data) -{ - BTreeControlBlock *btree; - int nodecnt; - - btree = (BTreeControlBlockPtr)file->fcbBTCBPtr; - - REQUIRE_FILE_LOCK(btree->fileRefNum, true); - - nr_delete(btree->fileRefNum, (struct nreserve *)data, &nodecnt); - - if (nodecnt) - btree->reservedNodes -= nodecnt; - - return (0); -} - -/* - * BTUpdateReserve - update a node reserve for allocations that occurred. - */ -__private_extern__ -void -BTUpdateReserve(BTreeControlBlockPtr btreePtr, int nodes) -{ - nr_update(btreePtr->fileRefNum, nodes); -} - - -/*----------------------------------------------------------------------------*/ -/* Node Reserve Hash Functions (private) */ - - -int nrinserts = 0; -int nrdeletes = 0; - -/* - * Insert a new node reserve. - */ -static void -nr_insert(struct vnode * btvp, struct nreserve *nrp, int nodecnt) -{ - struct nodereserve *nrhead; - struct nreserve *tmp_nrp; - void * tag = NR_GET_TAG(); - - /* - * Check the cache - there may already be a reserve - */ - lck_mtx_lock(&nr_mutex); - nrhead = NR_HASH(btvp, tag); - for (tmp_nrp = nrhead->lh_first; tmp_nrp; - tmp_nrp = tmp_nrp->nr_hash.le_next) { - if ((tmp_nrp->nr_tag == tag) && (tmp_nrp->nr_btvp == btvp)) { - nrp->nr_tag = 0; - tmp_nrp->nr_nodecnt += nodecnt; - lck_mtx_unlock(&nr_mutex); - return; - } - } - - nrp->nr_nodecnt = nodecnt; - nrp->nr_newnodes = 0; - nrp->nr_btvp = btvp; - nrp->nr_tag = tag; - LIST_INSERT_HEAD(nrhead, nrp, nr_hash); - ++nrinserts; - lck_mtx_unlock(&nr_mutex); -} - -/* - * Delete a node reserve. 
- */ -static void -nr_delete(struct vnode * btvp, struct nreserve *nrp, int *nodecnt) -{ - void * tag = NR_GET_TAG(); - - lck_mtx_lock(&nr_mutex); - if (nrp->nr_tag) { - if ((nrp->nr_tag != tag) || (nrp->nr_btvp != btvp)) - panic("hfs: nr_delete: invalid NR (%p)", nrp); - LIST_REMOVE(nrp, nr_hash); - *nodecnt = nrp->nr_nodecnt; - bzero(nrp, sizeof(struct nreserve)); - ++nrdeletes; - } else { - *nodecnt = 0; - } - lck_mtx_unlock(&nr_mutex); -} - - -/* - * Update a node reserve for any allocations that occurred. - */ -static void -nr_update(struct vnode * btvp, int nodecnt) -{ - struct nodereserve *nrhead; - struct nreserve *nrp; - void* tag = NR_GET_TAG(); - - lck_mtx_lock(&nr_mutex); - - nrhead = NR_HASH(btvp, tag); - for (nrp = nrhead->lh_first; nrp; nrp = nrp->nr_hash.le_next) { - if ((nrp->nr_tag == tag) && (nrp->nr_btvp == btvp)) { - nrp->nr_newnodes += nodecnt; - break; - } - } - lck_mtx_unlock(&nr_mutex); -} diff --git a/bsd/hfs/hfscommon/BTree/BTreeScanner.c b/bsd/hfs/hfscommon/BTree/BTreeScanner.c deleted file mode 100644 index ea549278d..000000000 --- a/bsd/hfs/hfscommon/BTree/BTreeScanner.c +++ /dev/null @@ -1,410 +0,0 @@ -/* - * Copyright (c) 1996-2008 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - * - * @(#)BTreeScanner.c - */ -#include -#include "../../hfs_endian.h" - -#include "../headers/BTreeScanner.h" - -static int FindNextLeafNode( BTScanState *scanState, Boolean avoidIO ); -static int ReadMultipleNodes( BTScanState *scanState ); - - -//_________________________________________________________________________________ -// -// Routine: BTScanNextRecord -// -// Purpose: Return the next leaf record in a scan. -// -// Inputs: -// scanState Scanner's current state -// avoidIO If true, don't do any I/O to refill the buffer -// -// Outputs: -// key Key of found record (points into buffer) -// data Data of found record (points into buffer) -// dataSize Size of data in found record -// -// Result: -// noErr Found a valid record -// btNotFound No more records -// ??? Needed to do I/O to get next node, but avoidIO set -// -// Notes: -// This routine returns pointers to the found record's key and data. It -// does not copy the key or data to a caller-supplied buffer (like -// GetBTreeRecord would). The caller must not modify the key or data. -//_________________________________________________________________________________ - -int BTScanNextRecord( BTScanState * scanState, - Boolean avoidIO, - void * * key, - void * * data, - u_int32_t * dataSize ) -{ - int err; - u_int16_t dataSizeShort; - - err = noErr; - - // - // If this is the first call, there won't be any nodes in the buffer, so go - // find the first first leaf node (if any). 
- // - if ( scanState->nodesLeftInBuffer == 0 ) - { - err = FindNextLeafNode( scanState, avoidIO ); - } - - while ( err == noErr ) - { - // See if we have a record in the current node - err = GetRecordByIndex( scanState->btcb, scanState->currentNodePtr, - scanState->recordNum, (KeyPtr *) key, - (u_int8_t **) data, &dataSizeShort ); - - if ( err == noErr ) - { - ++scanState->recordsFound; - ++scanState->recordNum; - if (dataSize != NULL) - *dataSize = dataSizeShort; - return noErr; - } - else if (err > 0) - { - // We didn't get the node through the cache, so we can't invalidate it. - //XXX Should we do something else to avoid seeing the same record again? - return err; - } - - // We're done with the current node. See if we've returned all the records - if ( scanState->recordsFound >= scanState->btcb->leafRecords ) - { - return btNotFound; - } - - // Move to the first record of the next leaf node - scanState->recordNum = 0; - err = FindNextLeafNode( scanState, avoidIO ); - } - - // - // If we got an EOF error from FindNextLeafNode, then there are no more leaf - // records to be found. - // - if ( err == fsEndOfIterationErr ) - err = btNotFound; - - return err; - -} /* BTScanNextRecord */ - - -//_________________________________________________________________________________ -// -// Routine: FindNextLeafNode -// -// Purpose: Point to the next leaf node in the buffer. Read more nodes -// into the buffer if needed (and allowed). -// -// Inputs: -// scanState Scanner's current state -// avoidIO If true, don't do any I/O to refill the buffer -// -// Result: -// noErr Found a valid record -// fsEndOfIterationErr No more nodes in file -// ??? 
Needed to do I/O to get next node, but avoidIO set -//_________________________________________________________________________________ - -static int FindNextLeafNode( BTScanState *scanState, Boolean avoidIO ) -{ - int err; - BlockDescriptor block; - FileReference fref; - - err = noErr; // Assume everything will be OK - - while ( 1 ) - { - if ( scanState->nodesLeftInBuffer == 0 ) - { - // Time to read some more nodes into the buffer - if ( avoidIO ) - { - return fsBTTimeOutErr; - } - else - { - // read some more nodes into buffer - err = ReadMultipleNodes( scanState ); - if ( err != noErr ) - break; - } - } - else - { - // Adjust the node counters and point to the next node in the buffer - ++scanState->nodeNum; - --scanState->nodesLeftInBuffer; - - // If we've looked at all nodes in the tree, then we're done - if ( scanState->nodeNum >= scanState->btcb->totalNodes ) - return fsEndOfIterationErr; - - if ( scanState->nodesLeftInBuffer == 0 ) - { - scanState->recordNum = 0; - continue; - } - - scanState->currentNodePtr = (BTNodeDescriptor *)(((u_int8_t *)scanState->currentNodePtr) - + scanState->btcb->nodeSize); - } - - /* Fake a BlockDescriptor */ - block.blockHeader = NULL; /* No buffer cache buffer */ - block.buffer = scanState->currentNodePtr; - block.blockNum = scanState->nodeNum; - block.blockSize = scanState->btcb->nodeSize; - block.blockReadFromDisk = 1; - block.isModified = 0; - - fref = scanState->btcb->fileRefNum; - - /* This node was read from disk, so it must be swapped/checked. - * Since we are reading multiple nodes, we might have read an - * unused node. Therefore we allow swapping of unused nodes. 
- */ - err = hfs_swap_BTNode(&block, fref, kSwapBTNodeBigToHost, true); - if ( err != noErr ) { - printf("hfs: FindNextLeafNode: Error from hfs_swap_BTNode (node %u)\n", scanState->nodeNum); - continue; - } - - if ( scanState->currentNodePtr->kind == kBTLeafNode ) - break; - } - - return err; - -} /* FindNextLeafNode */ - - -//_________________________________________________________________________________ -// -// Routine: ReadMultipleNodes -// -// Purpose: Read one or more nodes into the buffer. -// -// Inputs: -// theScanStatePtr Scanner's current state -// -// Result: -// noErr One or nodes were read -// fsEndOfIterationErr No nodes left in file, none in buffer -//_________________________________________________________________________________ - -static int ReadMultipleNodes( BTScanState *theScanStatePtr ) -{ - int myErr = E_NONE; - BTreeControlBlockPtr myBTreeCBPtr; - daddr64_t myPhyBlockNum; - u_int32_t myBufferSize; - struct vnode * myDevPtr; - unsigned int myBlockRun; - u_int32_t myBlocksInBufferCount; - - // release old buffer if we have one - if ( theScanStatePtr->bufferPtr != NULL ) - { - buf_markinvalid(theScanStatePtr->bufferPtr); - buf_brelse( theScanStatePtr->bufferPtr ); - theScanStatePtr->bufferPtr = NULL; - theScanStatePtr->currentNodePtr = NULL; - } - - myBTreeCBPtr = theScanStatePtr->btcb; - - // map logical block in catalog btree file to physical block on volume - myErr = hfs_bmap(myBTreeCBPtr->fileRefNum, theScanStatePtr->nodeNum, - &myDevPtr, &myPhyBlockNum, &myBlockRun); - if ( myErr != E_NONE ) - { - goto ExitThisRoutine; - } - - // bmap block run gives us the remaining number of valid blocks (number of blocks - // minus the first). so if there are 10 valid blocks our run number will be 9. 
- // blocks, in our case is the same as nodes (both are 4K) - myBlocksInBufferCount = (theScanStatePtr->bufferSize / myBTreeCBPtr->nodeSize ); - myBufferSize = theScanStatePtr->bufferSize; - if ( (myBlockRun + 1) < myBlocksInBufferCount ) - { - myBufferSize = (myBlockRun + 1) * myBTreeCBPtr->nodeSize; - } - - // now read blocks from the device - myErr = (int)buf_meta_bread(myDevPtr, - myPhyBlockNum, - myBufferSize, - NOCRED, - &theScanStatePtr->bufferPtr ); - if ( myErr != E_NONE ) - { - goto ExitThisRoutine; - } - - theScanStatePtr->nodesLeftInBuffer = buf_count(theScanStatePtr->bufferPtr) / theScanStatePtr->btcb->nodeSize; - theScanStatePtr->currentNodePtr = (BTNodeDescriptor *) buf_dataptr(theScanStatePtr->bufferPtr); - -ExitThisRoutine: - return myErr; - -} /* ReadMultipleNodes */ - - - -//_________________________________________________________________________________ -// -// Routine: BTScanInitialize -// -// Purpose: Prepare to start a new BTree scan, or resume a previous one. -// -// Inputs: -// btreeFile The B-Tree's file control block -// startingNode Initial node number -// startingRecord Initial record number within node -// recordsFound Number of valid records found so far -// bufferSize Size (in bytes) of buffer -// -// Outputs: -// scanState Scanner's current state; pass to other scanner calls -// -// Notes: -// To begin a new scan and see all records in the B-Tree, pass zeroes for -// startingNode, startingRecord, and recordsFound. -// -// To resume a scan from the point of a previous BTScanTerminate, use the -// values returned by BTScanTerminate as input for startingNode, startingRecord, -// and recordsFound. -// -// When resuming a scan, the caller should check the B-tree's write count. If -// it is different from the write count when the scan was terminated, then the -// tree may have changed and the current state may be incorrect. In particular, -// you may see some records more than once, or never see some records. 
Also, -// the scanner may not be able to detect when all leaf records have been seen, -// and will have to scan through many empty nodes. -// -// XXX�Perhaps the write count should be managed by BTScanInitialize and -// XXX BTScanTerminate? This would avoid the caller having to peek at -// XXX internal B-Tree structures. -//_________________________________________________________________________________ - -int BTScanInitialize( const FCB * btreeFile, - u_int32_t startingNode, - u_int32_t startingRecord, - u_int32_t recordsFound, - u_int32_t bufferSize, - BTScanState * scanState ) -{ - BTreeControlBlock *btcb; - - // - // Make sure this is a valid B-Tree file - // - btcb = (BTreeControlBlock *) btreeFile->fcbBTCBPtr; - if (btcb == NULL) - return fsBTInvalidFileErr; - - // - // Make sure buffer size is big enough, and a multiple of the - // B-Tree node size - // - if ( bufferSize < btcb->nodeSize ) - return paramErr; - bufferSize = (bufferSize / btcb->nodeSize) * btcb->nodeSize; - - // - // Set up the scanner's state - // - scanState->bufferSize = bufferSize; - scanState->bufferPtr = NULL; - scanState->btcb = btcb; - scanState->nodeNum = startingNode; - scanState->recordNum = startingRecord; - scanState->currentNodePtr = NULL; - scanState->nodesLeftInBuffer = 0; // no nodes currently in buffer - scanState->recordsFound = recordsFound; - microuptime(&scanState->startTime); // initialize our throttle - - return noErr; - -} /* BTScanInitialize */ - - -//_________________________________________________________________________________ -// -// Routine: BTScanTerminate -// -// Purpose: Return state information about a scan so that it can be resumed -// later via BTScanInitialize. 
-// -// Inputs: -// scanState Scanner's current state -// -// Outputs: -// nextNode Node number to resume a scan (pass to BTScanInitialize) -// nextRecord Record number to resume a scan (pass to BTScanInitialize) -// recordsFound Valid records seen so far (pass to BTScanInitialize) -//_________________________________________________________________________________ - -int BTScanTerminate( BTScanState * scanState, - u_int32_t * startingNode, - u_int32_t * startingRecord, - u_int32_t * recordsFound ) -{ - *startingNode = scanState->nodeNum; - *startingRecord = scanState->recordNum; - *recordsFound = scanState->recordsFound; - - if ( scanState->bufferPtr != NULL ) - { - buf_markinvalid(scanState->bufferPtr); - buf_brelse( scanState->bufferPtr ); - scanState->bufferPtr = NULL; - scanState->currentNodePtr = NULL; - } - - return noErr; - -} /* BTScanTerminate */ - - diff --git a/bsd/hfs/hfscommon/BTree/BTreeTreeOps.c b/bsd/hfs/hfscommon/BTree/BTreeTreeOps.c deleted file mode 100644 index 34bd8e41b..000000000 --- a/bsd/hfs/hfscommon/BTree/BTreeTreeOps.c +++ /dev/null @@ -1,1345 +0,0 @@ -/* - * Copyright (c) 2000-2014 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - File: BTreeTreeOps.c - - Contains: Multi-node tree operations for the BTree Module. - - Version: xxx put the technology version here xxx - - Written by: Gordon Sheridan and Bill Bruffey - - Copyright: � 1992-1999 by Apple Computer, Inc., all rights reserved. - - File Ownership: - - DRI: Don Brady - - Other Contact: Mark Day - - Technology: File Systems - - Writers: - - (msd) Mark Day - (DSH) Deric Horn - (djb) Don Brady - - Change History (most recent first): - - 6/1/99 djb Sync up with Mac OS 8.6. - 12/8/97 djb Radar #2200632, CollapseTree wasn't marking root node dirty. - 11/24/97 djb Radar #2005325, InsertLevel incorrectly handled root splits! - 10/17/97 msd Conditionalize DebugStrs. - 5/16/97 msd InsertNode() needs a return statement in ErrorExit. - 4/23/97 djb first checked in - - 3/17/97 DSH Conditionalize out Panic assertion for SC. - 3/3/97 djb Removed DebugStr in InsertLevel. - 2/19/97 djb Major re-write of insert code; added InsertLevel and InsertNode. - 1/27/97 djb InsertTree and DeleteTree are now recursive and support variable - sized index keys. - 1/16/97 djb Removed DebugStr in SearchTree. Added initial support for - variable sized index keys. - 1/3/97 djb Changed len8 to length8. - 1/3/97 djb Added support for large keys. 
- 12/19/96 djb first checked in - - History applicable to original Scarecrow Design: - - <3> 10/25/96 ser Changing for new VFPI - <2> 1/22/96 dkh Add #include Memory.h - <1> 10/18/95 rst Moved from Scarecrow project. - - <12> 7/18/95 mbb Change MoveData & ClearBytes to BlockMoveData & BlockZero. - <11> 9/30/94 prp Get in sync with D2 interface changes. - <10> 7/25/94 wjk Eliminate usage of BytePtr in favor of UInt8 *. - <9> 7/22/94 wjk Convert to the new set of header files. - <8> 12/2/93 wjk Move from Makefiles to BuildFiles. Fit into the ModernOS and - NRCmds environments. - <7> 11/30/93 wjk Change some Ptr's to BytePtr's in function definitions so they - agree with their prototypes. - <6> 5/21/93 gs Debug DeleteTree. Modify InsertTree for BTReplaceRecord. - <5> 5/10/93 gs Modify RotateLeft, and add DeleteTree, CollapseTree routines. - <4> 3/23/93 gs revise RotateLeft to use InsertKeyRecord instead of - InsertRecord. - <3> 3/23/93 gs Implement SplitLeft, InsertTree routine. - <2> 2/8/93 gs Implement SearchTree, and RotateLeft. 
- <1> 11/15/92 gs first checked in - -*/ - -#include "../headers/BTreesPrivate.h" -#include "../../hfs_btreeio.h" - -// -/////////////////////// Routines Internal To BTree Module /////////////////////// -// -// SearchTree -// InsertTree -// -////////////////////// Routines Internal To BTreeTreeOps.c ////////////////////// - -static OSStatus AddNewRootNode (BTreeControlBlockPtr btreePtr, - NodeDescPtr leftNode, - NodeDescPtr rightNode ); - -static OSStatus CollapseTree (BTreeControlBlockPtr btreePtr, - BlockDescriptor *blockPtr ); - -static OSStatus RotateLeft (BTreeControlBlockPtr btreePtr, - NodeDescPtr leftNode, - NodeDescPtr rightNode, - u_int16_t rightInsertIndex, - KeyPtr keyPtr, - u_int8_t * recPtr, - u_int16_t recSize, - u_int16_t *insertIndex, - u_int32_t *insertNodeNum, - Boolean *recordFit, - u_int16_t *recsRotated ); - -static Boolean RotateRecordLeft (BTreeControlBlockPtr btreePtr, - NodeDescPtr leftNode, - NodeDescPtr rightNode ); - -static OSStatus SplitLeft (BTreeControlBlockPtr btreePtr, - BlockDescriptor *leftNode, - BlockDescriptor *rightNode, - u_int32_t rightNodeNum, - u_int16_t index, - KeyPtr keyPtr, - u_int8_t * recPtr, - u_int16_t recSize, - u_int16_t *insertIndex, - u_int32_t *insertNodeNum, - u_int16_t *recsRotated ); - - - -static OSStatus InsertLevel (BTreeControlBlockPtr btreePtr, - TreePathTable treePathTable, - InsertKey *primaryKey, - InsertKey *secondaryKey, - BlockDescriptor *targetNode, - u_int16_t index, - u_int16_t level, - u_int32_t *insertNode ); - -static OSErr InsertNode (BTreeControlBlockPtr btreePtr, - InsertKey *key, - BlockDescriptor *rightNode, - u_int32_t node, - u_int16_t index, - u_int32_t *newNode, - u_int16_t *newIndex, - BlockDescriptor *leftNode, - Boolean *updateParent, - Boolean *insertParent, - Boolean *rootSplit ); - -static u_int16_t GetKeyLength (const BTreeControlBlock *btreePtr, - const BTreeKey *key, - Boolean forLeafNode ); - - - -//////////////////////// BTree Multi-node Tree Operations 
/////////////////////// - - -/*------------------------------------------------------------------------------- - -Routine: SearchTree - Search BTree for key and set up Tree Path Table. - -Function: Searches BTree for specified key, setting up the Tree Path Table to - reflect the search path. - - -Input: btreePtr - pointer to control block of BTree to search - keyPtr - pointer to the key to search for - treePathTable - pointer to the tree path table to construct - -Output: nodeNum - number of the node containing the key position - iterator - BTreeIterator specifying record or insert position - -Result: noErr - key found, index is record index - fsBTRecordNotFoundErr - key not found, index is insert index - fsBTEmptyErr - key not found, return params are nil - otherwise - catastrophic failure (GetNode/ReleaseNode failed) --------------------------------------------------------------------------------*/ - -OSStatus SearchTree (BTreeControlBlockPtr btreePtr, - BTreeKeyPtr searchKey, - TreePathTable treePathTable, - u_int32_t *nodeNum, - BlockDescriptor *nodePtr, - u_int16_t *returnIndex ) -{ - OSStatus err; - int16_t level; // Expected depth of current node - u_int32_t curNodeNum; // Current node we're searching - NodeRec nodeRec; - u_int16_t index; - Boolean keyFound; - int8_t nodeKind; // Kind of current node (index/leaf) - KeyPtr keyPtr; - u_int8_t * dataPtr; - u_int16_t dataSize; - - - curNodeNum = btreePtr->rootNode; - level = btreePtr->treeDepth; - - if (level == 0) // is the tree empty? - { - err = fsBTEmptyErr; - goto ErrorExit; - } - - //�� for debugging... - treePathTable [0].node = 0; - treePathTable [0].index = 0; - - while (true) - { - // - // [2550929] Node number 0 is the header node. It is never a valid - // index or leaf node. If we're ever asked to search through node 0, - // something has gone wrong (typically a bad child node number, or - // we found a node full of zeroes that we thought was an index node). 
- // - if (curNodeNum == 0) - { -// Panic("SearchTree: curNodeNum is zero!"); - err = btBadNode; - goto ErrorExit; - } - - err = GetNode (btreePtr, curNodeNum, 0, &nodeRec); - if (err != noErr) - { - goto ErrorExit; - } - - // - // [2550929] Sanity check the node height and node type. We expect - // particular values at each iteration in the search. This checking - // quickly finds bad pointers, loops, and other damage to the - // hierarchy of the B-tree. - // - if (((BTNodeDescriptor*)nodeRec.buffer)->height != level) - { -// Panic("Incorrect node height"); - err = btBadNode; - goto ReleaseAndExit; - } - nodeKind = ((BTNodeDescriptor*)nodeRec.buffer)->kind; - if (level == 1) - { - // Nodes at level 1 must be leaves, by definition - if (nodeKind != kBTLeafNode) - { - // Panic("Incorrect node type: expected leaf"); - err = btBadNode; - goto ReleaseAndExit; - } - } - else - { - // A node at any other depth must be an index node - if (nodeKind != kBTIndexNode) - { -// Panic("Incorrect node type: expected index"); - err = btBadNode; - goto ReleaseAndExit; - } - } - - keyFound = SearchNode (btreePtr, nodeRec.buffer, searchKey, &index); - - treePathTable [level].node = curNodeNum; - - if (nodeKind == kBTLeafNode) - { - treePathTable [level].index = index; - break; // were done... - } - - if ( (keyFound != true) && (index != 0)) - --index; - - treePathTable [level].index = index; - - err = GetRecordByIndex (btreePtr, nodeRec.buffer, index, &keyPtr, &dataPtr, &dataSize); - if (err != noErr) - { - // [2550929] If we got an error, it is probably because the index was bad - // (typically a corrupt node that confused SearchNode). Invalidate the node - // so we won't accidentally use the corrupted contents. NOTE: the Mac OS 9 - // sources call this InvalidateNode. - - (void) TrashNode(btreePtr, &nodeRec); - goto ErrorExit; - } - - // Get the child pointer out of this index node. We're now done with the current - // node and can continue the search with the child node. 
- curNodeNum = *(u_int32_t *)dataPtr; - err = ReleaseNode (btreePtr, &nodeRec); - if (err != noErr) - { - goto ErrorExit; - } - - // The child node should be at a level one less than the parent. - --level; - } - - *nodeNum = curNodeNum; - *nodePtr = nodeRec; - *returnIndex = index; - - if (keyFound) - return noErr; // searchKey found, index identifies record in node - else - return fsBTRecordNotFoundErr; // searchKey not found, index identifies insert point - -ReleaseAndExit: - (void) ReleaseNode(btreePtr, &nodeRec); - // fall into ErrorExit - -ErrorExit: - - *nodeNum = 0; - nodePtr->buffer = nil; - nodePtr->blockHeader = nil; - *returnIndex = 0; - - return err; -} - - - - -////////////////////////////////// InsertTree /////////////////////////////////// - -OSStatus InsertTree ( BTreeControlBlockPtr btreePtr, - TreePathTable treePathTable, - KeyPtr keyPtr, - u_int8_t * recPtr, - u_int16_t recSize, - BlockDescriptor *targetNode, - u_int16_t index, - u_int16_t level, - Boolean replacingKey, - u_int32_t *insertNode ) -{ - InsertKey primaryKey; - OSStatus err; - - primaryKey.keyPtr = keyPtr; - primaryKey.keyLength = GetKeyLength(btreePtr, primaryKey.keyPtr, (level == 1)); - primaryKey.recPtr = recPtr; - primaryKey.recSize = recSize; - primaryKey.replacingKey = replacingKey; - primaryKey.skipRotate = false; - - err = InsertLevel (btreePtr, treePathTable, &primaryKey, nil, - targetNode, index, level, insertNode ); - - return err; - -} // End of InsertTree - - -////////////////////////////////// InsertLevel ////////////////////////////////// - -OSStatus InsertLevel (BTreeControlBlockPtr btreePtr, - TreePathTable treePathTable, - InsertKey *primaryKey, - InsertKey *secondaryKey, - BlockDescriptor *targetNode, - u_int16_t index, - u_int16_t level, - u_int32_t *insertNode ) -{ - OSStatus err; - BlockDescriptor leftNode; - u_int32_t targetNodeNum; - u_int32_t newNodeNum; - u_int16_t newIndex; - Boolean insertParent; - Boolean updateParent; - Boolean newRoot; - InsertKey 
insertKey; - -#if defined(applec) && !defined(__SC__) - PanicIf ((level == 1) && (((NodeDescPtr)targetNode->buffer)->kind != kBTLeafNode), " InsertLevel: non-leaf at level 1! "); -#endif - leftNode.buffer = nil; - leftNode.blockHeader = nil; - targetNodeNum = treePathTable [level].node; - - insertParent = false; - updateParent = false; - - // XXXdbg - ModifyBlockStart(btreePtr->fileRefNum, targetNode); - - ////// process first insert ////// - - err = InsertNode (btreePtr, primaryKey, targetNode, targetNodeNum, index, - &newNodeNum, &newIndex, &leftNode, &updateParent, &insertParent, &newRoot ); - M_ExitOnError (err); - - if ( newRoot ) - { - // Extend the treePathTable by adding an entry for the new - // root node that references the current targetNode. - // - // If inserting the secondaryKey changes the first key of - // the target node, then we'll have to update the second - // key in the new root node. - - treePathTable [level + 1].node = btreePtr->rootNode; - treePathTable [level + 1].index = 1; // 1 since we always split/rotate left - } - - if ( level == 1 ) - *insertNode = newNodeNum; - - ////// process second insert (if any) ////// - - if ( secondaryKey != nil ) - { - Boolean temp; - - err = InsertNode (btreePtr, secondaryKey, targetNode, newNodeNum, newIndex, - &newNodeNum, &newIndex, &leftNode, &updateParent, &insertParent, &temp); - M_ExitOnError (err); - - if ( DEBUG_BUILD && updateParent && newRoot ) - DebugStr(" InsertLevel: New root from primary key, update from secondary key..."); - } - - //////////////////////// Update Parent(s) /////////////////////////////// - - if ( insertParent || updateParent ) - { - BlockDescriptor parentNode; - u_int32_t parentNodeNum; - KeyPtr keyPtr; - u_int8_t * recPtr; - u_int16_t recSize; - - parentNode.buffer = nil; - parentNode.blockHeader = nil; - - secondaryKey = nil; - - PanicIf ( (level == btreePtr->treeDepth), " InsertLevel: unfinished insert!?"); - - ++level; - - // Get Parent Node data... 
- index = treePathTable [level].index; - parentNodeNum = treePathTable [level].node; - - PanicIf ( parentNodeNum == 0, " InsertLevel: parent node is zero!?"); - - err = GetNode (btreePtr, parentNodeNum, 0, &parentNode); // released as target node in next level up - M_ExitOnError (err); -#if defined(applec) && !defined(__SC__) - if (DEBUG_BUILD && level > 1) - PanicIf ( ((NodeDescPtr)parentNode.buffer)->kind != kBTIndexNode, " InsertLevel: parent node not an index node! "); -#endif - ////////////////////////// Update Parent Index ////////////////////////////// - - if ( updateParent ) - { - // XXXdbg - ModifyBlockStart(btreePtr->fileRefNum, &parentNode); - - //���debug: check if ptr == targetNodeNum - GetRecordByIndex (btreePtr, parentNode.buffer, index, &keyPtr, &recPtr, &recSize); - PanicIf( (*(u_int32_t *) recPtr) != targetNodeNum, " InsertLevel: parent ptr doesn't match target node!"); - - // need to delete and re-insert this parent key/ptr - // we delete it here and it gets re-inserted in the - // InsertLevel call below. 
- DeleteRecord (btreePtr, parentNode.buffer, index); - - primaryKey->keyPtr = (KeyPtr) GetRecordAddress( btreePtr, targetNode->buffer, 0 ); - primaryKey->keyLength = GetKeyLength(btreePtr, primaryKey->keyPtr, false); - primaryKey->recPtr = (u_int8_t *) &targetNodeNum; - primaryKey->recSize = sizeof(targetNodeNum); - primaryKey->replacingKey = kReplaceRecord; - primaryKey->skipRotate = insertParent; // don't rotate left if we have two inserts occuring - } - - ////////////////////////// Add New Parent Index ///////////////////////////// - - if ( insertParent ) - { - InsertKey *insertKeyPtr; - - if ( updateParent ) - { - insertKeyPtr = &insertKey; - secondaryKey = &insertKey; - } - else - { - insertKeyPtr = primaryKey; - } - - insertKeyPtr->keyPtr = (KeyPtr) GetRecordAddress (btreePtr, leftNode.buffer, 0); - insertKeyPtr->keyLength = GetKeyLength(btreePtr, insertKeyPtr->keyPtr, false); - insertKeyPtr->recPtr = (u_int8_t *) &((NodeDescPtr)targetNode->buffer)->bLink; - insertKeyPtr->recSize = sizeof(u_int32_t); - insertKeyPtr->replacingKey = kInsertRecord; - insertKeyPtr->skipRotate = false; // a rotate is OK during second insert - } - - err = InsertLevel (btreePtr, treePathTable, primaryKey, secondaryKey, - &parentNode, index, level, insertNode ); - M_ExitOnError (err); - } - - err = UpdateNode (btreePtr, targetNode, 0, kLockTransaction); // all done with target - M_ExitOnError (err); - - err = UpdateNode (btreePtr, &leftNode, 0, kLockTransaction); // all done with left sibling - M_ExitOnError (err); - - return noErr; - -ErrorExit: - - (void) ReleaseNode (btreePtr, targetNode); - (void) ReleaseNode (btreePtr, &leftNode); - - Panic (" InsertLevel: an error occurred!"); - - return err; - -} // End of InsertLevel - - - -////////////////////////////////// InsertNode /////////////////////////////////// - -static OSErr InsertNode (BTreeControlBlockPtr btreePtr, - InsertKey *key, - - BlockDescriptor *rightNode, - u_int32_t node, - u_int16_t index, - - u_int32_t *newNode, - 
u_int16_t *newIndex, - - BlockDescriptor *leftNode, - Boolean *updateParent, - Boolean *insertParent, - Boolean *rootSplit ) -{ - BlockDescriptor *targetNode = NULL; - u_int32_t leftNodeNum; - u_int16_t recsRotated; - OSErr err; - Boolean recordFit; - - *rootSplit = false; - - PanicIf ( rightNode->buffer == leftNode->buffer, " InsertNode: rightNode == leftNode, huh?"); - - leftNodeNum = ((NodeDescPtr) rightNode->buffer)->bLink; - - - /////////////////////// Try Simple Insert /////////////////////////////// - - /* sanity check our left and right nodes here. */ - if (node == leftNodeNum) { - if (leftNode->buffer == NULL) { - err = fsBTInvalidNodeErr; - M_ExitOnError(err); - } - else{ - targetNode = leftNode; - } - } - else { - // we can assume right node is initialized. - targetNode = rightNode; - } - - - recordFit = InsertKeyRecord (btreePtr, targetNode->buffer, index, key->keyPtr, key->keyLength, key->recPtr, key->recSize); - - if ( recordFit ) - { - *newNode = node; - *newIndex = index; - - if ( (index == 0) && (((NodeDescPtr) targetNode->buffer)->height != btreePtr->treeDepth) ) - *updateParent = true; // the first record changed so we need to update the parent - } - - - //////////////////////// Try Rotate Left //////////////////////////////// - - if ( !recordFit && leftNodeNum > 0 ) - { - PanicIf ( leftNode->buffer != nil, " InsertNode: leftNode already acquired!"); - - if ( leftNode->buffer == nil ) - { - err = GetNode (btreePtr, leftNodeNum, 0, leftNode); // will be released by caller or a split below - M_ExitOnError (err); - // XXXdbg - ModifyBlockStart(btreePtr->fileRefNum, leftNode); - } - - PanicIf ( ((NodeDescPtr) leftNode->buffer)->fLink != node, " InsertNode, RotateLeft: invalid sibling link!" ); - - if ( !key->skipRotate ) // are rotates allowed? 
- { - err = RotateLeft (btreePtr, leftNode->buffer, rightNode->buffer, index, key->keyPtr, key->recPtr, - key->recSize, newIndex, newNode, &recordFit, &recsRotated ); - M_ExitOnError (err); - - if ( recordFit ) - { - if ( key->replacingKey || (recsRotated > 1) || (index > 0) ) - *updateParent = true; - } - } - } - - - //////////////////////// Try Split Left ///////////////////////////////// - - if ( !recordFit ) - { - // might not have left node... - err = SplitLeft (btreePtr, leftNode, rightNode, node, index, key->keyPtr, - key->recPtr, key->recSize, newIndex, newNode, &recsRotated); - M_ExitOnError (err); - - // if we split root node - add new root - - if ( ((NodeDescPtr) rightNode->buffer)->height == btreePtr->treeDepth ) - { - err = AddNewRootNode (btreePtr, leftNode->buffer, rightNode->buffer); // Note: does not update TPT - M_ExitOnError (err); - *rootSplit = true; - } - else - { - *insertParent = true; - - if ( key->replacingKey || (recsRotated > 1) || (index > 0) ) - *updateParent = true; - } - } - - return noErr; - -ErrorExit: - (void) ReleaseNode (btreePtr, leftNode); - return err; - -} // End of InsertNode - - -/*------------------------------------------------------------------------------- -Routine: DeleteTree - One_line_description. 
- -Function: Brief_description_of_the_function_and_any_side_effects - -ToDo: - -Input: btreePtr - description - treePathTable - description - targetNode - description - index - description - -Result: noErr - success - != noErr - failure --------------------------------------------------------------------------------*/ - -OSStatus DeleteTree (BTreeControlBlockPtr btreePtr, - TreePathTable treePathTable, - BlockDescriptor *targetNode, - u_int16_t index, - u_int16_t level ) -{ - OSStatus err; - BlockDescriptor parentNode; - BTNodeDescriptor *targetNodePtr; - u_int32_t targetNodeNum; - Boolean deleteRequired; - Boolean updateRequired; - - // XXXdbg - initialize these to null in case we get an - // error and try to exit before it's initialized - parentNode.buffer = nil; - parentNode.blockHeader = nil; - - deleteRequired = false; - updateRequired = false; - - targetNodeNum = treePathTable[level].node; - targetNodePtr = targetNode->buffer; - PanicIf (targetNodePtr == nil, "DeleteTree: targetNode has nil buffer!"); - - // XXXdbg - ModifyBlockStart(btreePtr->fileRefNum, targetNode); - - DeleteRecord (btreePtr, targetNodePtr, index); - - //�� coalesce remaining records? - - if ( targetNodePtr->numRecords == 0 ) // did we delete the last record? 
- { - BlockDescriptor siblingNode; - u_int32_t siblingNodeNum; - - deleteRequired = true; - - siblingNode.buffer = nil; - siblingNode.blockHeader = nil; - - ////////////////// Get Siblings & Update Links ////////////////////////// - - siblingNodeNum = targetNodePtr->bLink; // Left Sibling Node - if ( siblingNodeNum != 0 ) - { - err = GetNode (btreePtr, siblingNodeNum, 0, &siblingNode); - M_ExitOnError (err); - - // XXXdbg - ModifyBlockStart(btreePtr->fileRefNum, &siblingNode); - - ((NodeDescPtr)siblingNode.buffer)->fLink = targetNodePtr->fLink; - err = UpdateNode (btreePtr, &siblingNode, 0, kLockTransaction); - M_ExitOnError (err); - } - else if ( targetNodePtr->kind == kBTLeafNode ) // update firstLeafNode - { - btreePtr->firstLeafNode = targetNodePtr->fLink; - } - - siblingNodeNum = targetNodePtr->fLink; // Right Sibling Node - if ( siblingNodeNum != 0 ) - { - err = GetNode (btreePtr, siblingNodeNum, 0, &siblingNode); - M_ExitOnError (err); - - // XXXdbg - ModifyBlockStart(btreePtr->fileRefNum, &siblingNode); - - ((NodeDescPtr)siblingNode.buffer)->bLink = targetNodePtr->bLink; - err = UpdateNode (btreePtr, &siblingNode, 0, kLockTransaction); - M_ExitOnError (err); - } - else if ( targetNodePtr->kind == kBTLeafNode ) // update lastLeafNode - { - btreePtr->lastLeafNode = targetNodePtr->bLink; - } - - //////////////////////// Free Empty Node //////////////////////////////// - - ClearNode (btreePtr, targetNodePtr); - - err = UpdateNode (btreePtr, targetNode, 0, kLockTransaction); - M_ExitOnError (err); - - err = FreeNode (btreePtr, targetNodeNum); - M_ExitOnError (err); - } - else if ( index == 0 ) // did we delete the first record? 
- { - updateRequired = true; // yes, so we need to update parent - } - - - if ( level == btreePtr->treeDepth ) // then targetNode->buffer is the root node - { - deleteRequired = false; - updateRequired = false; - - if ( targetNode->buffer == nil ) // then root was freed and the btree is empty - { - btreePtr->rootNode = 0; - btreePtr->treeDepth = 0; - } - else if ( ((NodeDescPtr)targetNode->buffer)->numRecords == 1 ) - { - err = CollapseTree (btreePtr, targetNode); - M_ExitOnError (err); - } - } - - - if ( updateRequired || deleteRequired ) - { - ++level; // next level - - //// Get Parent Node and index - index = treePathTable [level].index; - err = GetNode (btreePtr, treePathTable[level].node, 0, &parentNode); - M_ExitOnError (err); - - if ( updateRequired ) - { - KeyPtr keyPtr; - u_int8_t * recPtr; - u_int16_t recSize; - u_int32_t insertNode; - - // XXXdbg - ModifyBlockStart(btreePtr->fileRefNum, &parentNode); - - //���debug: check if ptr == targetNodeNum - GetRecordByIndex (btreePtr, parentNode.buffer, index, &keyPtr, &recPtr, &recSize); - PanicIf( (*(u_int32_t *) recPtr) != targetNodeNum, " DeleteTree: parent ptr doesn't match targetNodeNum!!"); - - // need to delete and re-insert this parent key/ptr - DeleteRecord (btreePtr, parentNode.buffer, index); - - keyPtr = (KeyPtr) GetRecordAddress( btreePtr, targetNode->buffer, 0 ); - recPtr = (u_int8_t *) &targetNodeNum; - recSize = sizeof(targetNodeNum); - - err = InsertTree (btreePtr, treePathTable, keyPtr, recPtr, recSize, - &parentNode, index, level, kReplaceRecord, &insertNode); - M_ExitOnError (err); - } - else // deleteRequired - { - err = DeleteTree (btreePtr, treePathTable, &parentNode, index, level); - M_ExitOnError (err); - } - } - - - err = UpdateNode (btreePtr, targetNode, 0, kLockTransaction); - M_ExitOnError (err); - - return noErr; - -ErrorExit: - - (void) ReleaseNode (btreePtr, targetNode); - (void) ReleaseNode (btreePtr, &parentNode); - - return err; - -} // end DeleteTree - - - 
-///////////////////////////////// CollapseTree ////////////////////////////////// - -static OSStatus CollapseTree (BTreeControlBlockPtr btreePtr, - BlockDescriptor *blockPtr ) -{ - OSStatus err; - u_int32_t originalRoot; - u_int32_t nodeNum; - - originalRoot = btreePtr->rootNode; - - // XXXdbg - ModifyBlockStart(btreePtr->fileRefNum, blockPtr); - - while (true) - { - if ( ((NodeDescPtr)blockPtr->buffer)->numRecords > 1) - break; // this will make a fine root node - - if ( ((NodeDescPtr)blockPtr->buffer)->kind == kBTLeafNode) - break; // we've hit bottom - - nodeNum = btreePtr->rootNode; - btreePtr->rootNode = GetChildNodeNum (btreePtr, blockPtr->buffer, 0); - --btreePtr->treeDepth; - - //// Clear and Free Current Old Root Node //// - ClearNode (btreePtr, blockPtr->buffer); - err = UpdateNode (btreePtr, blockPtr, 0, kLockTransaction); - M_ExitOnError (err); - err = FreeNode (btreePtr, nodeNum); - M_ExitOnError (err); - - //// Get New Root Node - err = GetNode (btreePtr, btreePtr->rootNode, 0, blockPtr); - M_ExitOnError (err); - - // XXXdbg - ModifyBlockStart(btreePtr->fileRefNum, blockPtr); - } - - if (btreePtr->rootNode != originalRoot) - M_BTreeHeaderDirty (btreePtr); - - err = UpdateNode (btreePtr, blockPtr, 0, kLockTransaction); // always update! - M_ExitOnError (err); - - return noErr; - - -/////////////////////////////////// ErrorExit /////////////////////////////////// - -ErrorExit: - (void) ReleaseNode (btreePtr, blockPtr); - return err; -} - - - -////////////////////////////////// RotateLeft /////////////////////////////////// - -/*------------------------------------------------------------------------------- - -Routine: RotateLeft - One_line_description. 
- -Function: Brief_description_of_the_function_and_any_side_effects - -Algorithm: if rightIndex > insertIndex, subtract 1 for actual rightIndex - -Input: btreePtr - description - leftNode - description - rightNode - description - rightInsertIndex - description - keyPtr - description - recPtr - description - recSize - description - -Output: insertIndex - insertNodeNum - description - recordFit - description - recsRotated - -Result: noErr - success - != noErr - failure --------------------------------------------------------------------------------*/ - -static OSStatus RotateLeft (BTreeControlBlockPtr btreePtr, - NodeDescPtr leftNode, - NodeDescPtr rightNode, - u_int16_t rightInsertIndex, - KeyPtr keyPtr, - u_int8_t * recPtr, - u_int16_t recSize, - u_int16_t *insertIndex, - u_int32_t *insertNodeNum, - Boolean *recordFit, - u_int16_t *recsRotated ) -{ - OSStatus err; - int32_t insertSize; - int32_t nodeSize; - int32_t leftSize, rightSize; - int32_t moveSize = 0; - u_int16_t keyLength; - u_int16_t lengthFieldSize; - u_int16_t index, moveIndex; - Boolean didItFit; - - ///////////////////// Determine If Record Will Fit ////////////////////////// - - keyLength = GetKeyLength(btreePtr, keyPtr, (rightNode->kind == kBTLeafNode)); - - // the key's length field is 8-bits in HFS and 16-bits in HFS+ - if ( btreePtr->attributes & kBTBigKeysMask ) - lengthFieldSize = sizeof(u_int16_t); - else - lengthFieldSize = sizeof(u_int8_t); - - insertSize = keyLength + lengthFieldSize + recSize + sizeof(u_int16_t); - - if ( M_IsOdd (insertSize) ) - ++insertSize; // add pad byte; - - nodeSize = btreePtr->nodeSize; - - // add size of insert record to right node - rightSize = nodeSize - GetNodeFreeSize (btreePtr, rightNode) + insertSize; - leftSize = nodeSize - GetNodeFreeSize (btreePtr, leftNode); - - moveIndex = 0; - - while ( leftSize < rightSize ) - { - if ( moveIndex < rightInsertIndex ) - { - moveSize = GetRecordSize (btreePtr, rightNode, moveIndex) + 2; - } - else if ( moveIndex == 
rightInsertIndex ) - { - moveSize = insertSize; - } - else // ( moveIndex > rightInsertIndex ) - { - moveSize = GetRecordSize (btreePtr, rightNode, moveIndex - 1) + 2; - } - - leftSize += moveSize; - rightSize -= moveSize; - ++moveIndex; - } - - if ( leftSize > nodeSize ) // undo last move - { - leftSize -= moveSize; - rightSize += moveSize; - --moveIndex; - } - - if ( rightSize > nodeSize ) // record won't fit - failure, but not error - { - *insertIndex = 0; - *insertNodeNum = 0; - *recordFit = false; - *recsRotated = 0; - - return noErr; - } - - // we've found balance point, moveIndex == number of records moved into leftNode - - - //////////////////////////// Rotate Records ///////////////////////////////// - - *recsRotated = moveIndex; - *recordFit = true; - index = 0; - - while ( index < moveIndex ) - { - if ( index == rightInsertIndex ) // insert new record in left node - { - u_int16_t leftInsertIndex; - - leftInsertIndex = leftNode->numRecords; - - didItFit = InsertKeyRecord (btreePtr, leftNode, leftInsertIndex, - keyPtr, keyLength, recPtr, recSize); - if ( !didItFit ) - { - Panic ("RotateLeft: InsertKeyRecord (left) returned false!"); - err = fsBTBadRotateErr; - goto ErrorExit; - } - - *insertIndex = leftInsertIndex; - *insertNodeNum = rightNode->bLink; - } - else - { - didItFit = RotateRecordLeft (btreePtr, leftNode, rightNode); - if ( !didItFit ) - { - Panic ("RotateLeft: RotateRecordLeft returned false!"); - err = fsBTBadRotateErr; - goto ErrorExit; - } - } - - ++index; - } - - if ( moveIndex <= rightInsertIndex ) // then insert new record in right node - { - rightInsertIndex -= index; // adjust for records already rotated - - didItFit = InsertKeyRecord (btreePtr, rightNode, rightInsertIndex, - keyPtr, keyLength, recPtr, recSize); - if ( !didItFit ) - { - Panic ("RotateLeft: InsertKeyRecord (right) returned false!"); - err = fsBTBadRotateErr; - goto ErrorExit; - } - - *insertIndex = rightInsertIndex; - *insertNodeNum = leftNode->fLink; - } - - - return 
noErr; - - - ////////////////////////////// Error Exit /////////////////////////////////// - -ErrorExit: - - *insertIndex = 0; - *insertNodeNum = 0; - *recordFit = false; - *recsRotated = 0; - - return err; -} - - - -/////////////////////////////////// SplitLeft /////////////////////////////////// - -static OSStatus SplitLeft (BTreeControlBlockPtr btreePtr, - BlockDescriptor *leftNode, - BlockDescriptor *rightNode, - u_int32_t rightNodeNum, - u_int16_t index, - KeyPtr keyPtr, - u_int8_t * recPtr, - u_int16_t recSize, - u_int16_t *insertIndex, - u_int32_t *insertNodeNum, - u_int16_t *recsRotated ) -{ - OSStatus err; - NodeDescPtr left, right; - u_int32_t newNodeNum; - Boolean recordFit; - - - ///////////////////////////// Compare Nodes ///////////////////////////////// - - right = rightNode->buffer; - left = leftNode->buffer; - - PanicIf ( right->bLink != 0 && left == 0, " SplitLeft: left sibling missing!?" ); - - /* type should be kBTLeafNode or kBTIndexNode */ - - if ( (right->height == 1) && (right->kind != kBTLeafNode) ) - return fsBTInvalidNodeErr; - - if ( left != nil ) - { - if ( left->fLink != rightNodeNum ) - return fsBTInvalidNodeErr; //�� E_BadSibling ? - - if ( left->height != right->height ) - return fsBTInvalidNodeErr; //�� E_BadNodeHeight ? - - if ( left->kind != right->kind ) - return fsBTInvalidNodeErr; //�� E_BadNodeType ? 
- } - - - ///////////////////////////// Allocate Node ///////////////////////////////// - - err = AllocateNode (btreePtr, &newNodeNum); - M_ExitOnError (err); - - - /////////////// Update Forward Link In Original Left Node /////////////////// - - if ( left != nil ) - { - // XXXdbg - ModifyBlockStart(btreePtr->fileRefNum, leftNode); - - left->fLink = newNodeNum; - err = UpdateNode (btreePtr, leftNode, 0, kLockTransaction); - M_ExitOnError (err); - } - - - /////////////////////// Initialize New Left Node //////////////////////////// - - err = GetNewNode (btreePtr, newNodeNum, leftNode); - M_ExitOnError (err); - - // XXXdbg - ModifyBlockStart(btreePtr->fileRefNum, leftNode); - - left = leftNode->buffer; - left->fLink = rightNodeNum; - - - // Steal Info From Right Node - - left->bLink = right->bLink; - left->kind = right->kind; - left->height = right->height; - - right->bLink = newNodeNum; // update Right bLink - - if ( (left->kind == kBTLeafNode) && (left->bLink == 0) ) - { - // if we're adding a new first leaf node - update BTreeInfoRec - - btreePtr->firstLeafNode = newNodeNum; - M_BTreeHeaderDirty (btreePtr); //�� AllocateNode should have set the bit already... - } - - ////////////////////////////// Rotate Left ////////////////////////////////// - - err = RotateLeft (btreePtr, left, right, index, keyPtr, recPtr, recSize, - insertIndex, insertNodeNum, &recordFit, recsRotated); - - M_ExitOnError (err); - - return noErr; - -ErrorExit: - - (void) ReleaseNode (btreePtr, leftNode); - (void) ReleaseNode (btreePtr, rightNode); - - //�� Free new node if allocated? 
- - *insertIndex = 0; - *insertNodeNum = 0; - *recsRotated = 0; - - return err; -} - - - -/////////////////////////////// RotateRecordLeft //////////////////////////////// - -static Boolean RotateRecordLeft (BTreeControlBlockPtr btreePtr, - NodeDescPtr leftNode, - NodeDescPtr rightNode ) -{ - u_int16_t size; - u_int8_t * recPtr; - Boolean recordFit; - - size = GetRecordSize (btreePtr, rightNode, 0); - recPtr = GetRecordAddress (btreePtr, rightNode, 0); - - recordFit = InsertRecord (btreePtr, leftNode, leftNode->numRecords, recPtr, size); - - if ( !recordFit ) - return false; - - DeleteRecord (btreePtr, rightNode, 0); - - return true; -} - - -//////////////////////////////// AddNewRootNode ///////////////////////////////// - -static OSStatus AddNewRootNode (BTreeControlBlockPtr btreePtr, - NodeDescPtr leftNode, - NodeDescPtr rightNode ) -{ - OSStatus err; - BlockDescriptor rootNode; - u_int32_t rootNum; - KeyPtr keyPtr; - Boolean didItFit; - u_int16_t keyLength; - - rootNode.buffer = nil; - rootNode.blockHeader = nil; - - PanicIf (leftNode == nil, "AddNewRootNode: leftNode == nil"); - PanicIf (rightNode == nil, "AddNewRootNode: rightNode == nil"); - - - /////////////////////// Initialize New Root Node //////////////////////////// - - err = AllocateNode (btreePtr, &rootNum); - M_ExitOnError (err); - - err = GetNewNode (btreePtr, rootNum, &rootNode); - M_ExitOnError (err); - - // XXXdbg - ModifyBlockStart(btreePtr->fileRefNum, &rootNode); - - ((NodeDescPtr)rootNode.buffer)->kind = kBTIndexNode; - ((NodeDescPtr)rootNode.buffer)->height = ++btreePtr->treeDepth; - - - ///////////////////// Insert Left Node Index Record ///////////////////////// - - keyPtr = (KeyPtr) GetRecordAddress (btreePtr, leftNode, 0); - keyLength = GetKeyLength(btreePtr, keyPtr, false); - - didItFit = InsertKeyRecord ( btreePtr, rootNode.buffer, 0, keyPtr, keyLength, - (u_int8_t *) &rightNode->bLink, 4 ); - - PanicIf ( !didItFit, "AddNewRootNode:InsertKeyRecord failed for left index record"); - - - 
//////////////////// Insert Right Node Index Record ///////////////////////// - - keyPtr = (KeyPtr) GetRecordAddress (btreePtr, rightNode, 0); - keyLength = GetKeyLength(btreePtr, keyPtr, false); - - didItFit = InsertKeyRecord ( btreePtr, rootNode.buffer, 1, keyPtr, keyLength, - (u_int8_t *) &leftNode->fLink, 4 ); - - PanicIf ( !didItFit, "AddNewRootNode:InsertKeyRecord failed for right index record"); - - - /////////////////////////// Release Root Node /////////////////////////////// - - err = UpdateNode (btreePtr, &rootNode, 0, kLockTransaction); - M_ExitOnError (err); - - // update BTreeInfoRec - - btreePtr->rootNode = rootNum; - M_BTreeHeaderDirty(btreePtr); - - return noErr; - - - ////////////////////////////// Error Exit /////////////////////////////////// - -ErrorExit: - - return err; -} - - -static u_int16_t GetKeyLength ( const BTreeControlBlock *btreePtr, const BTreeKey *key, Boolean forLeafNode ) -{ - u_int16_t length; - - if ( forLeafNode || btreePtr->attributes & kBTVariableIndexKeysMask ) - length = KeyLength (btreePtr, key); // just use actual key length - else - length = btreePtr->maxKeyLength; // fixed sized index key (i.e. HFS) //�� shouldn't we clear the pad bytes? - - return length; -} - diff --git a/bsd/hfs/hfscommon/Catalog/CatalogUtilities.c b/bsd/hfs/hfscommon/Catalog/CatalogUtilities.c deleted file mode 100644 index 6a76e1df7..000000000 --- a/bsd/hfs/hfscommon/Catalog/CatalogUtilities.c +++ /dev/null @@ -1,360 +0,0 @@ -/* - * Copyright (c) 2000-2002, 2004-2005 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -#include -#include -#include -#include -#include -#include - -#include "../headers/FileMgrInternal.h" -#include "../headers/BTreesInternal.h" -#include "../headers/CatalogPrivate.h" -#include "../headers/HFSUnicodeWrappers.h" -#include "../headers/BTreesPrivate.h" -#include - -// -// Routine: LocateCatalogNodeByKey -// -// Function: Locates the catalog record for an existing folder or file -// CNode and returns the key and data records. 
-// - -OSErr -LocateCatalogNodeByKey(const ExtendedVCB *volume, u_int32_t hint, CatalogKey *keyPtr, - CatalogRecord *dataPtr, u_int32_t *newHint) -{ - OSErr result; - CatalogName *nodeName = NULL; - HFSCatalogNodeID threadParentID; - u_int16_t tempSize; - FSBufferDescriptor btRecord; - struct BTreeIterator *searchIterator; - FCB *fcb; - - MALLOC (searchIterator, struct BTreeIterator*, sizeof(struct BTreeIterator), M_TEMP, M_WAITOK); - if (searchIterator == NULL) { - return memFullErr; // translates to ENOMEM - } - - bzero(searchIterator, sizeof(*searchIterator)); - - fcb = GetFileControlBlock(volume->catalogRefNum); - - btRecord.bufferAddress = dataPtr; - btRecord.itemCount = 1; - btRecord.itemSize = sizeof(CatalogRecord); - - searchIterator->hint.nodeNum = hint; - - bcopy(keyPtr, &searchIterator->key, sizeof(CatalogKey)); - - result = BTSearchRecord( fcb, searchIterator, &btRecord, &tempSize, searchIterator ); - - if (result == noErr) - { - *newHint = searchIterator->hint.nodeNum; - - BlockMoveData(&searchIterator->key, keyPtr, sizeof(CatalogKey)); - } - - if (result == btNotFound) { - result = cmNotFound; - } - - if (result) { - FREE(searchIterator, M_TEMP); - return result; - } - - // if we got a thread record, then go look up real record - switch ( dataPtr->recordType ) - { - -#if CONFIG_HFS_STD - case kHFSFileThreadRecord: - case kHFSFolderThreadRecord: - threadParentID = dataPtr->hfsThread.parentID; - nodeName = (CatalogName *) &dataPtr->hfsThread.nodeName; - break; -#endif - - case kHFSPlusFileThreadRecord: - case kHFSPlusFolderThreadRecord: - threadParentID = dataPtr->hfsPlusThread.parentID; - nodeName = (CatalogName *) &dataPtr->hfsPlusThread.nodeName; - break; - - default: - threadParentID = 0; - break; - } - - if ( threadParentID ) // found a thread - result = LocateCatalogRecord(volume, threadParentID, nodeName, kNoHint, keyPtr, dataPtr, newHint); - - FREE (searchIterator, M_TEMP); - return result; -} - - - 
-//******************************************************************************* -// Routine: LocateCatalogRecord -// -// Function: Locates the catalog record associated with folderID and name -// -//******************************************************************************* - -OSErr -LocateCatalogRecord(const ExtendedVCB *volume, HFSCatalogNodeID folderID, const CatalogName *name, - __unused u_int32_t hint, CatalogKey *keyPtr, CatalogRecord *dataPtr, u_int32_t *newHint) -{ - OSErr result; - uint16_t tempSize; - FSBufferDescriptor btRecord; - struct BTreeIterator *searchIterator = NULL; - FCB *fcb; - BTreeControlBlock *btcb; - - MALLOC (searchIterator, struct BTreeIterator*, sizeof(struct BTreeIterator), M_TEMP, M_WAITOK); - if (searchIterator == NULL) { - return memFullErr; // translates to ENOMEM - } - - bzero(searchIterator, sizeof(*searchIterator)); - - - fcb = GetFileControlBlock(volume->catalogRefNum); - btcb = (BTreeControlBlock *)fcb->fcbBTCBPtr; - - btRecord.bufferAddress = dataPtr; - btRecord.itemCount = 1; - btRecord.itemSize = sizeof(CatalogRecord); - - BuildCatalogKey(folderID, name, (volume->vcbSigWord == kHFSPlusSigWord), (CatalogKey *)&searchIterator->key); - - result = BTSearchRecord(fcb, searchIterator, &btRecord, &tempSize, searchIterator); - if (result == noErr) { - *newHint = searchIterator->hint.nodeNum; - BlockMoveData(&searchIterator->key, keyPtr, CalcKeySize(btcb, &searchIterator->key)); - } - - FREE (searchIterator, M_TEMP); - return (result == btNotFound ? cmNotFound : result); -} - - - -/* - * Routine: BuildCatalogKey - * - * Function: Constructs a catalog key record (ckr) given the parent - * folder ID and CName. Works for both classic and extended - * HFS volumes. 
- * - */ - -void -BuildCatalogKey(HFSCatalogNodeID parentID, const CatalogName *cName, Boolean isHFSPlus, CatalogKey *key) -{ - if ( isHFSPlus ) - { - key->hfsPlus.keyLength = kHFSPlusCatalogKeyMinimumLength; // initial key length (4 + 2) - key->hfsPlus.parentID = parentID; // set parent ID - key->hfsPlus.nodeName.length = 0; // null CName length - if ( cName != NULL ) - { - CopyCatalogName(cName, (CatalogName *) &key->hfsPlus.nodeName, isHFSPlus); - key->hfsPlus.keyLength += sizeof(UniChar) * cName->ustr.length; // add CName size to key length - } - } -#if CONFIG_HFS_STD - else - { - key->hfs.keyLength = kHFSCatalogKeyMinimumLength; // initial key length (1 + 4 + 1) - key->hfs.reserved = 0; // clear unused byte - key->hfs.parentID = parentID; // set parent ID - key->hfs.nodeName[0] = 0; // null CName length - if ( cName != NULL ) - { - UpdateCatalogName(cName->pstr, key->hfs.nodeName); - key->hfs.keyLength += key->hfs.nodeName[0]; // add CName size to key length - } - } -#endif - -} - -OSErr -BuildCatalogKeyUTF8(ExtendedVCB *volume, HFSCatalogNodeID parentID, const unsigned char *name, u_int32_t nameLength, - CatalogKey *key, u_int32_t *textEncoding) -{ - OSErr err = 0; - - if ( name == NULL) - nameLength = 0; - else if (nameLength == kUndefinedStrLen) - nameLength = strlen((const char *)name); - - if ( volume->vcbSigWord == kHFSPlusSigWord ) { - size_t unicodeBytes = 0; - - key->hfsPlus.keyLength = kHFSPlusCatalogKeyMinimumLength; // initial key length (4 + 2) - key->hfsPlus.parentID = parentID; // set parent ID - key->hfsPlus.nodeName.length = 0; // null CName length - if ( nameLength > 0 ) { - err = utf8_decodestr(name, nameLength, key->hfsPlus.nodeName.unicode, - &unicodeBytes, sizeof(key->hfsPlus.nodeName.unicode), ':', UTF_DECOMPOSED); - key->hfsPlus.nodeName.length = unicodeBytes / sizeof(UniChar); - key->hfsPlus.keyLength += unicodeBytes; - } - - if (textEncoding && (*textEncoding != kTextEncodingMacUnicode)) - *textEncoding = 
hfs_pickencoding(key->hfsPlus.nodeName.unicode, - key->hfsPlus.nodeName.length); - } -#if CONFIG_HFS_STD - else { - key->hfs.keyLength = kHFSCatalogKeyMinimumLength; // initial key length (1 + 4 + 1) - key->hfs.reserved = 0; // clear unused byte - key->hfs.parentID = parentID; // set parent ID - key->hfs.nodeName[0] = 0; // null CName length - if ( nameLength > 0 ) { - err = utf8_to_hfs(volume, nameLength, name, &key->hfs.nodeName[0]); - /* - * Retry with MacRoman in case that's how it was exported. - * When textEncoding != NULL we know that this is a create - * or rename call and can skip the retry (ugly but it works). - */ - if (err && (textEncoding == NULL)) - err = utf8_to_mac_roman(nameLength, name, &key->hfs.nodeName[0]); - key->hfs.keyLength += key->hfs.nodeName[0]; // add CName size to key length - } - if (textEncoding) - *textEncoding = 0; - } -#endif - - if (err) { - if (err == ENAMETOOLONG) - err = bdNamErr; /* name is too long */ - else - err = paramErr; /* name has invalid characters */ - } - - return err; -} - - -//******************************************************************************* -// Routine: FlushCatalog -// -// Function: Flushes the catalog for a specified volume. -// -//******************************************************************************* - -OSErr -FlushCatalog(ExtendedVCB *volume) -{ - FCB * fcb; - OSErr result; - struct hfsmount *hfsmp = VCBTOHFS (volume); - - fcb = GetFileControlBlock(volume->catalogRefNum); - result = BTFlushPath(fcb); - - if (result == noErr) - { - //--- check if catalog's fcb is dirty... 
- - if ( (0) /*fcb->fcbFlags & fcbModifiedMask*/ ) - { - hfs_lock_mount (hfsmp); - MarkVCBDirty(volume); // Mark the VCB dirty - volume->vcbLsMod = GetTimeUTC(); // update last modified date - hfs_unlock_mount (hfsmp); - - // result = FlushVolumeControlBlock(volume); - } - } - - return result; -} - - -//������������������������������������������������������������������������������� -// Routine: UpdateCatalogName -// -// Function: Updates a CName. -// -//������������������������������������������������������������������������������� - -void -UpdateCatalogName(ConstStr31Param srcName, Str31 destName) -{ - Size length = srcName[0]; - - if (length > CMMaxCName) - length = CMMaxCName; // truncate to max - - destName[0] = length; // set length byte - - BlockMoveData(&srcName[1], &destName[1], length); -} - -//_______________________________________________________________________ - -void -CopyCatalogName(const CatalogName *srcName, CatalogName *dstName, Boolean isHFSPlus) -{ - u_int32_t length = 0; - - if ( srcName == NULL ) - { - if ( dstName != NULL ) - dstName->ustr.length = 0; // set length byte to zero (works for both unicode and pascal) - return; - } - - if (isHFSPlus) { - length = sizeof(UniChar) * (srcName->ustr.length + 1); - } -#if CONFIG_HFS_STD - else { - length = sizeof(u_int8_t) + srcName->pstr[0]; - } -#endif - - if ( length > 1 ) - BlockMoveData(srcName, dstName, length); - else - dstName->ustr.length = 0; // set length byte to zero (works for both unicode and pascal) -} - diff --git a/bsd/hfs/hfscommon/Catalog/FileIDsServices.c b/bsd/hfs/hfscommon/Catalog/FileIDsServices.c deleted file mode 100644 index fa7e210d0..000000000 --- a/bsd/hfs/hfscommon/Catalog/FileIDsServices.c +++ /dev/null @@ -1,810 +0,0 @@ -/* - * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include "../../hfs_macos_defs.h" -#include "../../hfs_format.h" - -#include "../headers/FileMgrInternal.h" -#include "../headers/HFSUnicodeWrappers.h" -#include "../headers/CatalogPrivate.h" -#include -#include -#include - - -struct ExtentsRecBuffer { - ExtentKey extentKey; - ExtentRecord extentData; -}; -typedef struct ExtentsRecBuffer ExtentsRecBuffer; - - -static u_int32_t CheckExtents( void *extents, u_int32_t blocks, Boolean isHFSPlus ); -static OSErr DeleteExtents( ExtendedVCB *vcb, u_int32_t fileNumber, int quitEarly, u_int8_t forkType, Boolean isHFSPlus ); -static OSErr MoveExtents( ExtendedVCB *vcb, u_int32_t srcFileID, u_int32_t destFileID, int quitEarly, u_int8_t forkType, Boolean isHFSPlus ); - -#if CONFIG_HFS_STD -static void CopyCatalogNodeInfo( CatalogRecord *src, CatalogRecord *dest ); -#endif - -static void CopyBigCatalogNodeInfo( CatalogRecord *src, CatalogRecord *dest ); -static void CopyExtentInfo( ExtentKey *key, ExtentRecord *data, ExtentsRecBuffer *buffer, u_int16_t bufferCount ); - -/* - * This function moves the overflow extents associated with srcID into the file associated with dstID. - * We should have already verified that 'srcID' has overflow extents. So now we move all of the overflow - * extent records. - */ -OSErr MoveData( ExtendedVCB *vcb, HFSCatalogNodeID srcID, HFSCatalogNodeID destID, int rsrc) { - - OSErr err; - - /* - * Only the source file should have extents, so we just track those. - * We operate on the fork represented by the open FD that was used to call into this - * function - */ - if (rsrc) { - /* Copy the extent overflow blocks. */ - err = MoveExtents( vcb, srcID, destID, 1, (u_int8_t)0xff, 1); - if ( err != noErr ) { - if ( err != dskFulErr ) { - return( err ); - } - /* - * In case of error, we would have probably run into problems - * growing the extents b-tree. Since the move is actually a copy + delete - * just delete the new entries. Same for below. 
- */ - err = DeleteExtents( vcb, destID, 1, (u_int8_t)0xff, 1); - ReturnIfError( err ); // we are doomed. Just QUIT! - goto FlushAndReturn; - } - } - else { - /* Copy the extent overflow blocks. */ - err = MoveExtents( vcb, srcID, destID, 1, 0, 1); - if ( err != noErr ) { - if ( err != dskFulErr ) { - return( err ); - } - err = DeleteExtents( vcb, destID, 1, 0, 1); - ReturnIfError( err ); // we are doomed. Just QUIT! - goto FlushAndReturn; - } - } - -FlushAndReturn: - /* Write out the catalog and extent overflow B-Tree changes */ - err = FlushCatalog( vcb ); - err = FlushExtentFile( vcb ); - - return( err ); -} - - -OSErr ExchangeFileIDs( ExtendedVCB *vcb, ConstUTF8Param srcName, ConstUTF8Param destName, HFSCatalogNodeID srcID, HFSCatalogNodeID destID, u_int32_t srcHint, u_int32_t destHint ) -{ - CatalogKey srcKey; // 518 bytes - CatalogKey destKey; // 518 bytes - CatalogRecord srcData; // 520 bytes - CatalogRecord destData; // 520 bytes - CatalogRecord swapData; // 520 bytes - int16_t numSrcExtentBlocks; - int16_t numDestExtentBlocks; - OSErr err; - Boolean isHFSPlus = ( vcb->vcbSigWord == kHFSPlusSigWord ); - - err = BuildCatalogKeyUTF8(vcb, srcID, srcName, kUndefinedStrLen, &srcKey, NULL); - ReturnIfError(err); - - err = BuildCatalogKeyUTF8(vcb, destID, destName, kUndefinedStrLen, &destKey, NULL); - ReturnIfError(err); - - if ( isHFSPlus ) - { - //-- Step 1: Check the catalog nodes for extents - - //-- locate the source file, test for extents in extent file, and copy the cat record for later - err = LocateCatalogNodeByKey( vcb, srcHint, &srcKey, &srcData, &srcHint ); - ReturnIfError( err ); - - if ( srcData.recordType != kHFSPlusFileRecord ) - return( cmFThdDirErr ); // Error "cmFThdDirErr = it is a directory" - - //-- Check if there are any extents in the source file - //�� I am only checling the extents in the low 32 bits, routine will fail if files extents after 2 gig are in overflow - numSrcExtentBlocks = CheckExtents( srcData.hfsPlusFile.dataFork.extents, 
srcData.hfsPlusFile.dataFork.totalBlocks, isHFSPlus ); - if ( numSrcExtentBlocks == 0 ) // then check the resource fork extents - numSrcExtentBlocks = CheckExtents( srcData.hfsPlusFile.resourceFork.extents, srcData.hfsPlusFile.resourceFork.totalBlocks, isHFSPlus ); - - //-- Check if there are any extents in the destination file - err = LocateCatalogNodeByKey( vcb, destHint, &destKey, &destData, &destHint ); - ReturnIfError( err ); - - if ( destData.recordType != kHFSPlusFileRecord ) - return( cmFThdDirErr ); // Error "cmFThdDirErr = it is a directory" - - numDestExtentBlocks = CheckExtents( destData.hfsPlusFile.dataFork.extents, destData.hfsPlusFile.dataFork.totalBlocks, isHFSPlus ); - if ( numDestExtentBlocks == 0 ) // then check the resource fork extents - numDestExtentBlocks = CheckExtents( destData.hfsPlusFile.resourceFork.extents, destData.hfsPlusFile.resourceFork.totalBlocks, isHFSPlus ); - - //-- Step 2: Exchange the Extent key in the extent file - - //-- Exchange the extents key in the extent file - err = DeleteExtents( vcb, kHFSBogusExtentFileID, 0, 0, isHFSPlus ); - ReturnIfError( err ); - - if ( numSrcExtentBlocks && numDestExtentBlocks ) // if both files have extents - { - //-- Change the source extents file ids to our known bogus value - err = MoveExtents( vcb, srcData.hfsPlusFile.fileID, kHFSBogusExtentFileID, 0,0, isHFSPlus ); - if ( err != noErr ) - { - if ( err != dskFulErr ) { - return( err ); - } - else { - err = DeleteExtents( vcb, kHFSBogusExtentFileID, 0, 0, isHFSPlus ); - ReturnIfError( err ); // we are doomed. Just QUIT! 
- - err = FlushCatalog( vcb ); // flush the catalog - err = FlushExtentFile( vcb ); // flush the extent file (unneeded for common case, but it's cheap) - return( dskFulErr ); - } - } - - //-- Change the destination extents file id's to the source id's - err = MoveExtents( vcb, destData.hfsPlusFile.fileID, srcData.hfsPlusFile.fileID, 0, 0, isHFSPlus ); - if ( err != noErr ) - { - if ( err != dskFulErr ) - return( err ); - -ExUndo2aPlus: err = DeleteExtents( vcb, srcData.hfsPlusFile.fileID, 0, 0, isHFSPlus ); - ReturnIfError( err ); // we are doomed. Just QUIT! - - err = MoveExtents( vcb, kHFSBogusExtentFileID, srcData.hfsPlusFile.fileID, 0, 0, isHFSPlus ); // Move the extents back - ReturnIfError( err ); // we are doomed. Just QUIT! - - err = DeleteExtents( vcb, kHFSBogusExtentFileID, 0, 0, isHFSPlus ); - ReturnIfError( err ); // we are doomed. Just QUIT! - - err = FlushCatalog( vcb ); // flush the catalog - err = FlushExtentFile( vcb ); // flush the extent file (unneeded for common case, but it's cheap) - return( dskFulErr ); - - } - - //-- Change the bogus extents file id's to the dest id's - err = MoveExtents( vcb, kHFSBogusExtentFileID, destData.hfsPlusFile.fileID, 0, 0, isHFSPlus ); - if ( err != noErr ) - { - if ( err != dskFulErr ) - return( err ); - - err = DeleteExtents( vcb, destData.hfsPlusFile.fileID, 0, 0, isHFSPlus ); - ReturnIfError( err ); // we are doomed. Just QUIT! - - err = MoveExtents( vcb, srcData.hfsPlusFile.fileID, destData.hfsPlusFile.fileID, 0, 0, isHFSPlus ); // Move the extents back - ReturnIfError( err ); // we are doomed. Just QUIT! - - goto ExUndo2aPlus; - } - - } - else if ( numSrcExtentBlocks ) // just the source file has extents - { - err = MoveExtents( vcb, srcData.hfsPlusFile.fileID, destData.hfsPlusFile.fileID, 0, 0, isHFSPlus ); - if ( err != noErr ) - { - if ( err != dskFulErr ) - return( err ); - - err = DeleteExtents( vcb, srcData.hfsPlusFile.fileID, 0, 0, isHFSPlus ); - ReturnIfError( err ); // we are doomed. Just QUIT! 
- - goto FlushAndReturn; - } - } - else if ( numDestExtentBlocks ) // just the destination file has extents - { - err = MoveExtents( vcb, destData.hfsPlusFile.fileID, srcData.hfsPlusFile.fileID, 0, 0, isHFSPlus ); - if ( err != noErr ) - { - if ( err != dskFulErr ) - return( err ); - - err = DeleteExtents( vcb, destData.hfsPlusFile.fileID, 0, 0, isHFSPlus ); - ReturnIfError( err ); // we are doomed. Just QUIT! - - goto FlushAndReturn; - } - } - - //-- Step 3: Change the data in the catalog nodes - - //-- find the source cnode and put dest info in it - err = LocateCatalogNodeByKey( vcb, srcHint, &srcKey, &srcData, &srcHint ); - if ( err != noErr ) - return( cmBadNews ); - - BlockMoveData( &srcData, &swapData, sizeof(CatalogRecord) ); - CopyBigCatalogNodeInfo( &destData, &srcData ); - - err = ReplaceBTreeRecord( vcb->catalogRefNum, &srcKey, srcHint, &srcData, sizeof(HFSPlusCatalogFile), &srcHint ); - ReturnIfError( err ); - - // find the destination cnode and put source info in it - err = LocateCatalogNodeByKey( vcb, destHint, &destKey, &destData, &destHint ); - if ( err != noErr ) - return( cmBadNews ); - - CopyBigCatalogNodeInfo( &swapData, &destData ); - err = ReplaceBTreeRecord( vcb->catalogRefNum, &destKey, destHint, &destData, sizeof(HFSPlusCatalogFile), &destHint ); - ReturnIfError( err ); - } -#if CONFIG_HFS_STD - else // HFS // - { - //-- Step 1: Check the catalog nodes for extents - - //-- locate the source file, test for extents in extent file, and copy the cat record for later - err = LocateCatalogNodeByKey( vcb, srcHint, &srcKey, &srcData, &srcHint ); - ReturnIfError( err ); - - if ( srcData.recordType != kHFSFileRecord ) - return( cmFThdDirErr ); // Error "cmFThdDirErr = it is a directory" - - //-- Check if there are any extents in the source file - numSrcExtentBlocks = CheckExtents( srcData.hfsFile.dataExtents, srcData.hfsFile.dataPhysicalSize / vcb->blockSize, isHFSPlus ); - if ( numSrcExtentBlocks == 0 ) // then check the resource fork extents - 
numSrcExtentBlocks = CheckExtents( srcData.hfsFile.rsrcExtents, srcData.hfsFile.rsrcPhysicalSize / vcb->blockSize, isHFSPlus ); - - - //�� Do we save the found source node for later use? - - - //-- Check if there are any extents in the destination file - err = LocateCatalogNodeByKey( vcb, destHint, &destKey, &destData, &destHint ); - ReturnIfError( err ); - - if ( destData.recordType != kHFSFileRecord ) - return( cmFThdDirErr ); // Error "cmFThdDirErr = it is a directory" - - numDestExtentBlocks = CheckExtents( destData.hfsFile.dataExtents, destData.hfsFile.dataPhysicalSize / vcb->blockSize, isHFSPlus ); - if ( numDestExtentBlocks == 0 ) // then check the resource fork extents - numDestExtentBlocks = CheckExtents( destData.hfsFile.rsrcExtents, destData.hfsFile.rsrcPhysicalSize / vcb->blockSize, isHFSPlus ); - - //�� Do we save the found destination node for later use? - - - //-- Step 2: Exchange the Extent key in the extent file - - //-- Exchange the extents key in the extent file - err = DeleteExtents( vcb, kHFSBogusExtentFileID, 0, 0, isHFSPlus ); - ReturnIfError( err ); - - if ( numSrcExtentBlocks && numDestExtentBlocks ) // if both files have extents - { - //-- Change the source extents file ids to our known bogus value - err = MoveExtents( vcb, srcData.hfsFile.fileID, kHFSBogusExtentFileID, 0, 0, isHFSPlus ); - if ( err != noErr ) - { - if ( err != dskFulErr ) - return( err ); - -ExUndo1a: err = DeleteExtents( vcb, kHFSBogusExtentFileID, 0, 0, isHFSPlus ); - ReturnIfError( err ); // we are doomed. Just QUIT! 
- - err = FlushCatalog( vcb ); // flush the catalog - err = FlushExtentFile( vcb ); // flush the extent file (unneeded for common case, but it's cheap) - return( dskFulErr ); - } - - //-- Change the destination extents file id's to the source id's - err = MoveExtents( vcb, destData.hfsFile.fileID, srcData.hfsFile.fileID, 0, 0, isHFSPlus ); - if ( err != noErr ) - { - if ( err != dskFulErr ) - return( err ); - -ExUndo2a: err = DeleteExtents( vcb, srcData.hfsFile.fileID, 0, 0, isHFSPlus ); - ReturnIfError( err ); // we are doomed. Just QUIT! - - err = MoveExtents( vcb, kHFSBogusExtentFileID, srcData.hfsFile.fileID, 0, 0, isHFSPlus ); // Move the extents back - ReturnIfError( err ); // we are doomed. Just QUIT! - - goto ExUndo1a; - } - - //-- Change the bogus extents file id's to the dest id's - err = MoveExtents( vcb, kHFSBogusExtentFileID, destData.hfsFile.fileID, 0, 0, isHFSPlus ); - if ( err != noErr ) - { - if ( err != dskFulErr ) - return( err ); - - err = DeleteExtents( vcb, destData.hfsFile.fileID, 0, 0, isHFSPlus ); - ReturnIfError( err ); // we are doomed. Just QUIT! - - err = MoveExtents( vcb, srcData.hfsFile.fileID, destData.hfsFile.fileID, 0, 0, isHFSPlus ); // Move the extents back - ReturnIfError( err ); // we are doomed. Just QUIT! - - goto ExUndo2a; - } - - } - else if ( numSrcExtentBlocks ) // just the source file has extents - { - err = MoveExtents( vcb, srcData.hfsFile.fileID, destData.hfsFile.fileID, 0, 0, isHFSPlus ); - if ( err != noErr ) - { - if ( err != dskFulErr ) - return( err ); - - err = DeleteExtents( vcb, srcData.hfsFile.fileID, 0, 0, isHFSPlus ); - ReturnIfError( err ); // we are doomed. Just QUIT! 
- - goto FlushAndReturn; - } - } - else if ( numDestExtentBlocks ) // just the destination file has extents - { - err = MoveExtents( vcb, destData.hfsFile.fileID, srcData.hfsFile.fileID, 0, 0, isHFSPlus ); - if ( err != noErr ) - { - if ( err != dskFulErr ) - return( err ); - - err = DeleteExtents( vcb, destData.hfsFile.fileID, 0, 0, isHFSPlus ); - ReturnIfError( err ); // we are doomed. Just QUIT! - - goto FlushAndReturn; - } - } - - //-- Step 3: Change the data in the catalog nodes - - //-- find the source cnode and put dest info in it - err = LocateCatalogNodeByKey( vcb, srcHint, &srcKey, &srcData, &srcHint ); - if ( err != noErr ) - return( cmBadNews ); - - BlockMoveData( &srcData, &swapData, sizeof(CatalogRecord) ); - //�� Asm source copies from the saved dest catalog node - CopyCatalogNodeInfo( &destData, &srcData ); - - err = ReplaceBTreeRecord( vcb->catalogRefNum, &srcKey, srcHint, &srcData, sizeof(HFSCatalogFile), &srcHint ); - ReturnIfError( err ); - - - // find the destination cnode and put source info in it - err = LocateCatalogNodeByKey( vcb, destHint, &destKey, &destData, &destHint ); - if ( err != noErr ) - return( cmBadNews ); - - CopyCatalogNodeInfo( &swapData, &destData ); - err = ReplaceBTreeRecord( vcb->catalogRefNum, &destKey, destHint, &destData, sizeof(HFSCatalogFile), &destHint ); - ReturnIfError( err ); - } -#endif - - err = noErr; - - //-- Step 4: Error Handling section - - -FlushAndReturn: - err = FlushCatalog( vcb ); // flush the catalog - err = FlushExtentFile( vcb ); // flush the extent file (unneeded for common case, but it's cheap) - return( err ); -} - - -#if CONFIG_HFS_STD -static void CopyCatalogNodeInfo( CatalogRecord *src, CatalogRecord *dest ) -{ - dest->hfsFile.dataLogicalSize = src->hfsFile.dataLogicalSize; - dest->hfsFile.dataPhysicalSize = src->hfsFile.dataPhysicalSize; - dest->hfsFile.rsrcLogicalSize = src->hfsFile.rsrcLogicalSize; - dest->hfsFile.rsrcPhysicalSize = src->hfsFile.rsrcPhysicalSize; - dest->hfsFile.modifyDate 
= src->hfsFile.modifyDate; - BlockMoveData( src->hfsFile.dataExtents, dest->hfsFile.dataExtents, sizeof(HFSExtentRecord) ); - BlockMoveData( src->hfsFile.rsrcExtents, dest->hfsFile.rsrcExtents, sizeof(HFSExtentRecord) ); -} -#endif - -static void CopyBigCatalogNodeInfo( CatalogRecord *src, CatalogRecord *dest ) -{ - BlockMoveData( &src->hfsPlusFile.dataFork, &dest->hfsPlusFile.dataFork, sizeof(HFSPlusForkData) ); - BlockMoveData( &src->hfsPlusFile.resourceFork, &dest->hfsPlusFile.resourceFork, sizeof(HFSPlusForkData) ); - dest->hfsPlusFile.contentModDate = src->hfsPlusFile.contentModDate; -} - - -static OSErr MoveExtents( ExtendedVCB *vcb, u_int32_t srcFileID, u_int32_t destFileID, int quitEarly, u_int8_t forkType, Boolean isHFSPlus ) -{ - FCB * fcb; - ExtentsRecBuffer extentsBuffer[kNumExtentsToCache]; - ExtentKey * extentKeyPtr; - ExtentRecord extentData; - struct BTreeIterator *btIterator = NULL; - struct BTreeIterator *tmpIterator = NULL; - FSBufferDescriptor btRecord; - u_int16_t btKeySize; - u_int16_t btRecordSize; - int16_t i, j; - OSErr err; - - MALLOC (btIterator, struct BTreeIterator*, sizeof(struct BTreeIterator), M_TEMP, M_WAITOK); - if (btIterator == NULL) { - return memFullErr; // translates to ENOMEM - } - - - MALLOC (tmpIterator, struct BTreeIterator*, sizeof(struct BTreeIterator), M_TEMP, M_WAITOK); - if (tmpIterator == NULL) { - FREE (btIterator, M_TEMP); - return memFullErr; // translates to ENOMEM - } - - bzero(btIterator, sizeof(*btIterator)); - bzero (tmpIterator, sizeof(*tmpIterator)); - - - fcb = GetFileControlBlock(vcb->extentsRefNum); - - (void) BTInvalidateHint(btIterator); - extentKeyPtr = (ExtentKey*) &btIterator->key; - btRecord.bufferAddress = &extentData; - btRecord.itemCount = 1; - - //-- Collect the extent records - - // - // A search on the following key will cause the BTree to be positioned immediately - // before the first extent record for file #srcFileID, but not actually positioned - // on any record. 
This is because there cannot be an extent record with FABN = 0 - // (the first extent of the fork, which would be in the catalog entry, not an extent - // record). - // - // Using BTIterateRecord with kBTreeNextRecord will then get that first extent record. - // - if (isHFSPlus) { - btRecord.itemSize = sizeof(HFSPlusExtentRecord); - btKeySize = sizeof(HFSPlusExtentKey); - - extentKeyPtr->hfsPlus.keyLength = kHFSPlusExtentKeyMaximumLength; - extentKeyPtr->hfsPlus.forkType = forkType; - extentKeyPtr->hfsPlus.pad = 0; - extentKeyPtr->hfsPlus.fileID = srcFileID; - extentKeyPtr->hfsPlus.startBlock = 0; - } -#if CONFIG_HFS_STD - else { - btRecord.itemSize = sizeof(HFSExtentRecord); - btKeySize = sizeof(HFSExtentKey); - - extentKeyPtr->hfs.keyLength = kHFSExtentKeyMaximumLength; - extentKeyPtr->hfs.forkType = 0; - extentKeyPtr->hfs.fileID = srcFileID; - extentKeyPtr->hfs.startBlock = 0; - } -#else - else { - return cmBadNews; - } -#endif - - // - // We do an initial BTSearchRecord to position the BTree's iterator just before any extent - // records for srcFileID. We then do a few BTIterateRecord and BTInsertRecord of those found - // records, but with destFileID as the file number in the key. Keep doing this sequence of - // BTIterateRecord and BTInsertRecord until we find an extent for another file, or there are - // no more extent records in the tree. - // - // Basically, we're copying records kNumExtentsToCache at a time. The copies have their file ID - // set to destFileID. - // - // This depends on BTInsertRecord not effecting the iterator used by BTIterateRecord. If it - // _did_ effect the iterator, then we would need to do a BTSearchRecord before each series - // of BTIterateRecord. We'd need to set up the key for BTSearchRecord to find the last record - // we found, so that BTIterateRecord would get the next one (the first we haven't processed). 
- // - - err = BTSearchRecord(fcb, btIterator, &btRecord, &btRecordSize, btIterator); - - // We expect a btNotFound here, since there shouldn't be an extent record with FABN = 0. - if (err != btNotFound) - { - if ( DEBUG_BUILD ) - DebugStr("Unexpected error from SearchBTreeRecord"); - - if (err == noErr) // If we found such a bogus extent record, then the tree is really messed up - err = cmBadNews; // so return an error that conveys the disk is hosed. - - FREE (tmpIterator, M_TEMP); - FREE (btIterator, M_TEMP); - return err; - } - - do - { - btRecord.bufferAddress = &extentData; - btRecord.itemCount = 1; - - for ( i=0 ; ihfsPlus.fileID; - } -#if CONFIG_HFS_STD - else { - foundFileID = extentKeyPtr->hfs.fileID; - } -#endif - if ( foundFileID == srcFileID ) { - /* Check if we need to quit early. */ - if (quitEarly && isHFSPlus) { - if (extentKeyPtr->hfsPlus.forkType != forkType) { - break; - } - } - CopyExtentInfo(extentKeyPtr, &extentData, extentsBuffer, i); - } - else{ - /* The fileID's are of a different file. We're done here. 
*/ - break; - } - } - - - - //-- edit each extent key, and reinsert each extent record in the extent file - if (isHFSPlus) - btRecordSize = sizeof(HFSPlusExtentRecord); -#if CONFIG_HFS_STD - else - btRecordSize = sizeof(HFSExtentRecord); -#endif - - for ( j=0 ; jkey, btKeySize); - btRecord.bufferAddress = &(extentsBuffer[j].extentData); - - err = BTInsertRecord(fcb, tmpIterator, &btRecord, btRecordSize); - if ( err != noErr ) { - /* Parse the error and free iterators */ - FREE (btIterator, M_TEMP); - FREE (tmpIterator, M_TEMP); - if ( err == btExists ) - { - if ( DEBUG_BUILD ) { - DebugStr("Can't insert record -- already exists"); - } - return( cmBadNews ); - } - else { - return( err ); - } - } - } - - //-- okay, done with this buffered batch, go get the next set of extent records - // If our buffer is not full, we must be done, or recieved an error - - if ( i != kNumExtentsToCache ) // if the buffer is not full, we must be done - { - err = DeleteExtents( vcb, srcFileID, quitEarly, forkType, isHFSPlus ); // Now delete all the extent entries with the sourceID - if ( DEBUG_BUILD && err != noErr ) - DebugStr("Error from DeleteExtents"); - break; // we're done! - } - } while ( true ); - - FREE (tmpIterator, M_TEMP); - FREE (btIterator, M_TEMP); - - return( err ); -} - - -static void CopyExtentInfo( ExtentKey *key, ExtentRecord *data, ExtentsRecBuffer *buffer, u_int16_t bufferCount ) -{ - BlockMoveData( key, &(buffer[bufferCount].extentKey), sizeof( ExtentKey ) ); - BlockMoveData( data, &(buffer[bufferCount].extentData), sizeof( ExtentRecord ) ); -} - - -//-- Delete all extents in extent file that have the ID given. 
-static OSErr DeleteExtents( ExtendedVCB *vcb, u_int32_t fileID, int quitEarly, u_int8_t forkType, Boolean isHFSPlus ) -{ - FCB * fcb; - ExtentKey * extentKeyPtr; - ExtentRecord extentData; - struct BTreeIterator *btIterator = NULL; - struct BTreeIterator *tmpIterator = NULL; - FSBufferDescriptor btRecord; - u_int16_t btRecordSize; - OSErr err; - - MALLOC (btIterator, struct BTreeIterator*, sizeof(struct BTreeIterator), - M_TEMP, M_WAITOK | M_ZERO); - - MALLOC (tmpIterator, struct BTreeIterator*, sizeof(struct BTreeIterator), - M_TEMP, M_WAITOK | M_ZERO); - - fcb = GetFileControlBlock(vcb->extentsRefNum); - - (void) BTInvalidateHint(btIterator); - extentKeyPtr = (ExtentKey*) &btIterator->key; - btRecord.bufferAddress = &extentData; - btRecord.itemCount = 1; - - // The algorithm is to position the BTree just before any extent records for fileID. - // Then just keep getting successive records. If the record is still for fileID, - // then delete it. - - if (isHFSPlus) { - btRecord.itemSize = sizeof(HFSPlusExtentRecord); - - extentKeyPtr->hfsPlus.keyLength = kHFSPlusExtentKeyMaximumLength; - extentKeyPtr->hfsPlus.forkType = forkType; - extentKeyPtr->hfsPlus.pad = 0; - extentKeyPtr->hfsPlus.fileID = fileID; - extentKeyPtr->hfsPlus.startBlock = 0; - } -#if CONFIG_HFS_STD - else { - btRecord.itemSize = sizeof(HFSExtentRecord); - - extentKeyPtr->hfs.keyLength = kHFSExtentKeyMaximumLength; - extentKeyPtr->hfs.forkType = forkType; - extentKeyPtr->hfs.fileID = fileID; - extentKeyPtr->hfs.startBlock = 0; - } -#else - else { - err = cmBadNews; - goto exit; - } -#endif - - err = BTSearchRecord(fcb, btIterator, &btRecord, &btRecordSize, btIterator); - if ( err != btNotFound ) - { - if (err == noErr) { // Did we find a bogus extent record? - err = cmBadNews; // Yes, so indicate things are messed up. 
- } - - goto exit; - } - - do - { - HFSCatalogNodeID foundFileID = 0; - - err = BTIterateRecord(fcb, kBTreeNextRecord, btIterator, &btRecord, &btRecordSize); - if ( err != noErr ) - { - if (err == btNotFound) // If we hit the end of the BTree - err = noErr; // then it's OK - - break; // We're done now. - } - if (isHFSPlus) { - foundFileID = extentKeyPtr->hfsPlus.fileID; - } -#if CONFIG_HFS_STD - else { - foundFileID = extentKeyPtr->hfs.fileID; - } -#endif - - if ( foundFileID != fileID ) { - break; // numbers don't match, we must be done - } - if (quitEarly && isHFSPlus) { - /* If we're only deleting one type of fork, then quit early if it doesn't match */ - if (extentKeyPtr->hfsPlus.forkType != forkType) { - break; - } - } - - *tmpIterator = *btIterator; - err = BTDeleteRecord( fcb, tmpIterator ); - if (err != noErr) - break; - } while ( true ); - -exit: - - FREE (tmpIterator, M_TEMP); - FREE (btIterator, M_TEMP); - - return( err ); -} - - -// Check if there are extents represented in the extents overflow file. 
-static u_int32_t CheckExtents( void *extents, u_int32_t totalBlocks, Boolean isHFSPlus ) -{ - u_int32_t extentAllocationBlocks; - u_int16_t i; - - - if ( totalBlocks == 0 ) - return( 0 ); - - extentAllocationBlocks = 0; - - if ( isHFSPlus ) - { - for ( i = 0 ; i < kHFSPlusExtentDensity ; i++ ) - { - extentAllocationBlocks += ((HFSPlusExtentDescriptor *)extents)[i].blockCount; - if ( extentAllocationBlocks >= totalBlocks ) // greater than or equal (extents can add past eof if 'Close" crashes w/o truncating new clump) - return( 0 ); - } - } -#if CONFIG_HFS_STD - else - { - for ( i = 0 ; i < kHFSExtentDensity ; i++ ) - { - extentAllocationBlocks += ((HFSExtentDescriptor *)extents)[i].blockCount; - if ( extentAllocationBlocks >= totalBlocks ) // greater than or equal (extents can add past eof if 'Close" crashes w/o truncating new clump) - return( 0 ); - } - } -#endif - - return( extentAllocationBlocks ); -} diff --git a/bsd/hfs/hfscommon/Misc/BTreeWrapper.c b/bsd/hfs/hfscommon/Misc/BTreeWrapper.c deleted file mode 100644 index bd4b905ad..000000000 --- a/bsd/hfs/hfscommon/Misc/BTreeWrapper.c +++ /dev/null @@ -1,277 +0,0 @@ -/* - * Copyright (c) 2000, 2002, 2005-2013 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include "../headers/BTreesPrivate.h" -#include -#include -#include - - -// local routines -static OSErr CheckBTreeKey(const BTreeKey *key, const BTreeControlBlock *btcb); -static Boolean ValidHFSRecord(const void *record, const BTreeControlBlock *btcb, u_int16_t recordSize); - - -OSErr ReplaceBTreeRecord(FileReference refNum, const void* key, u_int32_t hint, void *newData, u_int16_t dataSize, u_int32_t *newHint) -{ - FSBufferDescriptor btRecord; - struct BTreeIterator *iterator = NULL; - FCB *fcb; - BTreeControlBlock *btcb; - OSStatus result; - - MALLOC (iterator, struct BTreeIterator *, sizeof (struct BTreeIterator), M_TEMP, M_WAITOK); - if (iterator == NULL) { - return memFullErr; //translates to ENOMEM - } - bzero (iterator, sizeof (*iterator)); - - fcb = GetFileControlBlock(refNum); - btcb = (BTreeControlBlock*) fcb->fcbBTCBPtr; - - btRecord.bufferAddress = newData; - btRecord.itemSize = dataSize; - btRecord.itemCount = 1; - - iterator->hint.nodeNum = hint; - - result = CheckBTreeKey((const BTreeKey *) key, btcb); - if (result) { - goto ErrorExit; - } - - BlockMoveData(key, &iterator->key, CalcKeySize(btcb, (const BTreeKey *) key)); //�� should we range check against maxkeylen? 
- - if ( DEBUG_BUILD && !ValidHFSRecord(newData, btcb, dataSize) ) - DebugStr("ReplaceBTreeRecord: bad record?"); - - result = BTReplaceRecord( fcb, iterator, &btRecord, dataSize ); - - *newHint = iterator->hint.nodeNum; - -ErrorExit: - - FREE (iterator, M_TEMP); - return result; -} - - - -static OSErr CheckBTreeKey(const BTreeKey *key, const BTreeControlBlock *btcb) -{ - u_int16_t keyLen; - - if ( btcb->attributes & kBTBigKeysMask ) - keyLen = key->length16; - else - keyLen = key->length8; - - if ( (keyLen < 6) || (keyLen > btcb->maxKeyLength) ) - { - if ( DEBUG_BUILD ) - DebugStr("CheckBTreeKey: bad key length!"); - return fsBTInvalidKeyLengthErr; - } - - return noErr; -} - - -static Boolean ValidHFSRecord(const void *record, const BTreeControlBlock *btcb, u_int16_t recordSize) -{ - u_int32_t cNodeID; - - if (btcb->maxKeyLength == kHFSPlusExtentKeyMaximumLength ) - { - return ( recordSize == sizeof(HFSPlusExtentRecord) ); - } -#if CONFIG_HFS_STD - else if ( btcb->maxKeyLength == kHFSExtentKeyMaximumLength ) - { - return ( recordSize == sizeof(HFSExtentRecord) ); - } -#endif - - else // Catalog record - { - const CatalogRecord *catalogRecord = (const CatalogRecord*) record; - - switch(catalogRecord->recordType) - { - -#if CONFIG_HFS_STD - /* - * HFS standard File/folder records and File/Folder Thread records - * are only valid on configs that support HFS standard. 
- */ - case kHFSFolderRecord: - { - if ( recordSize != sizeof(HFSCatalogFolder) ) - return false; - if ( catalogRecord->hfsFolder.flags != 0 ) - return false; - if ( catalogRecord->hfsFolder.valence > 0x7FFF ) - return false; - - cNodeID = catalogRecord->hfsFolder.folderID; - - if ( (cNodeID == 0) || (cNodeID < 16 && cNodeID > 2) ) - return false; - } - break; - - case kHFSFileRecord: - { - const HFSExtentDescriptor *dataExtent; - const HFSExtentDescriptor *rsrcExtent; - - if ( recordSize != sizeof(HFSCatalogFile) ) - return false; - if ( (catalogRecord->hfsFile.flags & ~(0x83)) != 0 ) - return false; - - cNodeID = catalogRecord->hfsFile.fileID; - - if ( cNodeID < 16 ) - return false; - - // make sure 0 � LEOF � PEOF for both forks - - if ( catalogRecord->hfsFile.dataLogicalSize < 0 ) - return false; - if ( catalogRecord->hfsFile.dataPhysicalSize < catalogRecord->hfsFile.dataLogicalSize ) - return false; - if ( catalogRecord->hfsFile.rsrcLogicalSize < 0 ) - return false; - if ( catalogRecord->hfsFile.rsrcPhysicalSize < catalogRecord->hfsFile.rsrcLogicalSize ) - return false; - - dataExtent = (const HFSExtentDescriptor*) &catalogRecord->hfsFile.dataExtents; - rsrcExtent = (const HFSExtentDescriptor*) &catalogRecord->hfsFile.rsrcExtents; - -#if 0 - for (i = 0; i < kHFSExtentDensity; ++i) - { - if ( (dataExtent[i].blockCount > 0) && (dataExtent[i].startBlock == 0) ) - return false; - if ( (rsrcExtent[i].blockCount > 0) && (rsrcExtent[i].startBlock == 0) ) - return false; - } -#endif - } - break; - - case kHFSFileThreadRecord: - case kHFSFolderThreadRecord: - { - if ( recordSize != sizeof(HFSCatalogThread) ) - return false; - - cNodeID = catalogRecord->hfsThread.parentID; - if ( (cNodeID == 0) || (cNodeID < 16 && cNodeID > 2) ) - return false; - - if ( (catalogRecord->hfsThread.nodeName[0] == 0) || - (catalogRecord->hfsThread.nodeName[0] > 31) ) - return false; - } - break; -#endif - - case kHFSPlusFolderRecord: - { - if ( recordSize != sizeof(HFSPlusCatalogFolder) ) - 
return false; - if ( catalogRecord->hfsPlusFolder.flags != 0 ) - return false; - if ( catalogRecord->hfsPlusFolder.valence > 0x7FFF ) - return false; - - cNodeID = catalogRecord->hfsPlusFolder.folderID; - - if ( (cNodeID == 0) || (cNodeID < 16 && cNodeID > 2) ) - return false; - } - break; - - case kHFSPlusFileRecord: - { -// u_int16_t i; - const HFSPlusExtentDescriptor *dataExtent; - const HFSPlusExtentDescriptor *rsrcExtent; - - if ( recordSize != sizeof(HFSPlusCatalogFile) ) - return false; - if ( (catalogRecord->hfsPlusFile.flags & ~(0x83)) != 0 ) - return false; - - cNodeID = catalogRecord->hfsPlusFile.fileID; - - if ( cNodeID < 16 ) - return false; - - // make sure 0 � LEOF � PEOF for both forks - - dataExtent = (const HFSPlusExtentDescriptor*) &catalogRecord->hfsPlusFile.dataFork.extents; - rsrcExtent = (const HFSPlusExtentDescriptor*) &catalogRecord->hfsPlusFile.resourceFork.extents; - -#if 0 - for (i = 0; i < kHFSPlusExtentDensity; ++i) - { - if ( (dataExtent[i].blockCount > 0) && (dataExtent[i].startBlock == 0) ) - return false; - if ( (rsrcExtent[i].blockCount > 0) && (rsrcExtent[i].startBlock == 0) ) - return false; - } -#endif - } - break; - - case kHFSPlusFileThreadRecord: - case kHFSPlusFolderThreadRecord: - { - if ( recordSize > sizeof(HFSPlusCatalogThread) || recordSize < (sizeof(HFSPlusCatalogThread) - sizeof(HFSUniStr255))) - return false; - - cNodeID = catalogRecord->hfsPlusThread.parentID; - if ( (cNodeID == 0) || (cNodeID < 16 && cNodeID > 2) ) - return false; - - if ( (catalogRecord->hfsPlusThread.nodeName.length == 0) || - (catalogRecord->hfsPlusThread.nodeName.length > 255) ) - return false; - } - break; - - default: - return false; - } - } - - return true; // record appears to be OK -} diff --git a/bsd/hfs/hfscommon/Misc/FileExtentMapping.c b/bsd/hfs/hfscommon/Misc/FileExtentMapping.c deleted file mode 100644 index 31249e05b..000000000 --- a/bsd/hfs/hfscommon/Misc/FileExtentMapping.c +++ /dev/null @@ -1,2266 +0,0 @@ -/* - * Copyright (c) 
2000-2014 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - - -#include "../../hfs.h" -#include "../../hfs_format.h" -#include "../../hfs_endian.h" - -#include "../headers/FileMgrInternal.h" -#include "../headers/BTreesInternal.h" - -#include -#include - -/* -============================================================ -Public (Exported) Routines: -============================================================ - - ExtendFileC Allocate more space to a given file. - - CompareExtentKeys - Compare two extents file keys (a search key and a trial - key). Used by the BTree manager when searching for, - adding, or deleting keys in the extents file of an HFS - volume. 
- - CompareExtentKeysPlus - Compare two extents file keys (a search key and a trial - key). Used by the BTree manager when searching for, - adding, or deleting keys in the extents file of an HFS+ - volume. - - MapFileBlockC Convert (map) an offset within a given file into a - physical disk address. - - TruncateFileC Truncates the disk space allocated to a file. The file - space is truncated to a specified new physical EOF, rounded - up to the next allocation block boundry. There is an option - to truncate to the end of the extent containing the new EOF. - - FlushExtentFile - Flush the extents file for a given volume. - - SearchExtentFile - Search the FCB and extents file for an extent record that - contains a given file position (in bytes). - - -============================================================ -Internal Routines: -============================================================ - FindExtentRecord - Search the extents BTree for a particular extent record. - SearchExtentRecord - Search a given extent record to see if it contains a given - file position (in bytes). Used by SearchExtentFile. - ReleaseExtents - Deallocate all allocation blocks in all extents of an extent - data record. - TruncateExtents - Deallocate blocks and delete extent records for all allocation - blocks beyond a certain point in a file. The starting point - must be the first file allocation block for some extent record - for the file. - DeallocateFork - Deallocate all allocation blocks belonging to a given fork. - UpdateExtentRecord - If the extent record came from the extents file, write out - the updated record; otherwise, copy the updated record into - the FCB resident extent record. If the record has no extents, - and was in the extents file, then delete the record instead. 
-*/ - -#if CONFIG_HFS_STD -static const int64_t kTwoGigabytes = 0x80000000LL; -#endif - -enum -{ - kDataForkType = 0, - kResourceForkType = 0xFF, - - kPreviousRecord = -1 -}; - - -#if CONFIG_HFS_STD -static OSErr HFSPlusToHFSExtents( - const HFSPlusExtentRecord oldExtents, - HFSExtentRecord newExtents); -#endif - -static OSErr FindExtentRecord( - const ExtendedVCB *vcb, - u_int8_t forkType, - u_int32_t fileID, - u_int32_t startBlock, - Boolean allowPrevious, - HFSPlusExtentKey *foundKey, - HFSPlusExtentRecord foundData, - u_int32_t *foundHint); - -static OSErr DeleteExtentRecord( - const ExtendedVCB *vcb, - u_int8_t forkType, - u_int32_t fileID, - u_int32_t startBlock); - -static OSErr CreateExtentRecord( - ExtendedVCB *vcb, - HFSPlusExtentKey *key, - HFSPlusExtentRecord extents, - u_int32_t *hint); - - -static OSErr GetFCBExtentRecord( - const FCB *fcb, - HFSPlusExtentRecord extents); - -static OSErr SearchExtentRecord( - ExtendedVCB *vcb, - u_int32_t searchFABN, - const HFSPlusExtentRecord extentData, - u_int32_t extentDataStartFABN, - u_int32_t *foundExtentDataOffset, - u_int32_t *endingFABNPlusOne, - Boolean *noMoreExtents); - -static OSErr ReleaseExtents( - ExtendedVCB *vcb, - const HFSPlusExtentRecord extentRecord, - u_int32_t *numReleasedAllocationBlocks, - Boolean *releasedLastExtent); - -static OSErr DeallocateFork( - ExtendedVCB *vcb, - HFSCatalogNodeID fileID, - u_int8_t forkType, - HFSPlusExtentRecord catalogExtents, - Boolean * recordDeleted); - -static OSErr TruncateExtents( - ExtendedVCB *vcb, - u_int8_t forkType, - u_int32_t fileID, - u_int32_t startBlock, - Boolean * recordDeleted); - -static OSErr UpdateExtentRecord ( - ExtendedVCB *vcb, - FCB *fcb, - int deleted, - const HFSPlusExtentKey *extentFileKey, - const HFSPlusExtentRecord extentData, - u_int32_t extentBTreeHint); - -static Boolean ExtentsAreIntegral( - const HFSPlusExtentRecord extentRecord, - u_int32_t mask, - u_int32_t *blocksChecked, - Boolean *checkedLastExtent); - 
-//_________________________________________________________________________________ -// -// Routine: FindExtentRecord -// -// Purpose: Search the extents BTree for an extent record matching the given -// FileID, fork, and starting file allocation block number. -// -// Inputs: -// vcb Volume to search -// forkType 0 = data fork, -1 = resource fork -// fileID File's FileID (CatalogNodeID) -// startBlock Starting file allocation block number -// allowPrevious If the desired record isn't found and this flag is set, -// then see if the previous record belongs to the same fork. -// If so, then return it. -// -// Outputs: -// foundKey The key data for the record actually found -// foundData The extent record actually found (NOTE: on an HFS volume, the -// fourth entry will be zeroes. -// foundHint The BTree hint to find the node again -//_________________________________________________________________________________ -static OSErr FindExtentRecord( - const ExtendedVCB *vcb, - u_int8_t forkType, - u_int32_t fileID, - u_int32_t startBlock, - Boolean allowPrevious, - HFSPlusExtentKey *foundKey, - HFSPlusExtentRecord foundData, - u_int32_t *foundHint) -{ - FCB * fcb; - struct BTreeIterator *btIterator = NULL; - FSBufferDescriptor btRecord; - OSErr err; - u_int16_t btRecordSize; - - err = noErr; - if (foundHint) - *foundHint = 0; - fcb = GetFileControlBlock(vcb->extentsRefNum); - - MALLOC (btIterator, struct BTreeIterator*, sizeof(struct BTreeIterator), M_TEMP, M_WAITOK); - if (btIterator == NULL) { - return memFullErr; // translates to ENOMEM - } - bzero(btIterator, sizeof(*btIterator)); - - /* HFS Plus / HFSX */ - if (vcb->vcbSigWord != kHFSSigWord) { - HFSPlusExtentKey * extentKeyPtr; - HFSPlusExtentRecord extentData; - - extentKeyPtr = (HFSPlusExtentKey*) &btIterator->key; - extentKeyPtr->keyLength = kHFSPlusExtentKeyMaximumLength; - extentKeyPtr->forkType = forkType; - extentKeyPtr->pad = 0; - extentKeyPtr->fileID = fileID; - extentKeyPtr->startBlock = startBlock; - - 
btRecord.bufferAddress = &extentData; - btRecord.itemSize = sizeof(HFSPlusExtentRecord); - btRecord.itemCount = 1; - - err = BTSearchRecord(fcb, btIterator, &btRecord, &btRecordSize, btIterator); - - if (err == btNotFound && allowPrevious) { - err = BTIterateRecord(fcb, kBTreePrevRecord, btIterator, &btRecord, &btRecordSize); - - // A previous record may not exist, so just return btNotFound (like we would if - // it was for the wrong file/fork). - if (err == (OSErr) fsBTStartOfIterationErr) //�� fsBTStartOfIterationErr is type unsigned long - err = btNotFound; - - if (err == noErr) { - // Found a previous record. Does it belong to the same fork of the same file? - if (extentKeyPtr->fileID != fileID || extentKeyPtr->forkType != forkType) - err = btNotFound; - } - } - - if (err == noErr) { - // Copy the found key back for the caller - if (foundKey) - BlockMoveData(extentKeyPtr, foundKey, sizeof(HFSPlusExtentKey)); - // Copy the found data back for the caller - BlockMoveData(&extentData, foundData, sizeof(HFSPlusExtentRecord)); - } - } -#if CONFIG_HFS_STD - else { - HFSExtentKey * extentKeyPtr; - HFSExtentRecord extentData; - - extentKeyPtr = (HFSExtentKey*) &btIterator->key; - extentKeyPtr->keyLength = kHFSExtentKeyMaximumLength; - extentKeyPtr->forkType = forkType; - extentKeyPtr->fileID = fileID; - extentKeyPtr->startBlock = startBlock; - - btRecord.bufferAddress = &extentData; - btRecord.itemSize = sizeof(HFSExtentRecord); - btRecord.itemCount = 1; - - err = BTSearchRecord(fcb, btIterator, &btRecord, &btRecordSize, btIterator); - - if (err == btNotFound && allowPrevious) { - err = BTIterateRecord(fcb, kBTreePrevRecord, btIterator, &btRecord, &btRecordSize); - - // A previous record may not exist, so just return btNotFound (like we would if - // it was for the wrong file/fork). - if (err == (OSErr) fsBTStartOfIterationErr) //�� fsBTStartOfIterationErr is type unsigned long - err = btNotFound; - - if (err == noErr) { - // Found a previous record. 
Does it belong to the same fork of the same file? - if (extentKeyPtr->fileID != fileID || extentKeyPtr->forkType != forkType) - err = btNotFound; - } - } - - if (err == noErr) { - u_int16_t i; - - // Copy the found key back for the caller - if (foundKey) { - foundKey->keyLength = kHFSPlusExtentKeyMaximumLength; - foundKey->forkType = extentKeyPtr->forkType; - foundKey->pad = 0; - foundKey->fileID = extentKeyPtr->fileID; - foundKey->startBlock = extentKeyPtr->startBlock; - } - // Copy the found data back for the caller - foundData[0].startBlock = extentData[0].startBlock; - foundData[0].blockCount = extentData[0].blockCount; - foundData[1].startBlock = extentData[1].startBlock; - foundData[1].blockCount = extentData[1].blockCount; - foundData[2].startBlock = extentData[2].startBlock; - foundData[2].blockCount = extentData[2].blockCount; - - for (i = 3; i < kHFSPlusExtentDensity; ++i) - { - foundData[i].startBlock = 0; - foundData[i].blockCount = 0; - } - } - } -#endif - - if (foundHint) - *foundHint = btIterator->hint.nodeNum; - - FREE(btIterator, M_TEMP); - return err; -} - - - -static OSErr CreateExtentRecord( - ExtendedVCB *vcb, - HFSPlusExtentKey *key, - HFSPlusExtentRecord extents, - u_int32_t *hint) -{ - struct BTreeIterator *btIterator = NULL; - FSBufferDescriptor btRecord; - u_int16_t btRecordSize; - int lockflags; - OSErr err; - - err = noErr; - *hint = 0; - - MALLOC (btIterator, struct BTreeIterator*, sizeof(struct BTreeIterator), M_TEMP, M_WAITOK); - if (btIterator == NULL) { - return memFullErr; // translates to ENOMEM - } - bzero(btIterator, sizeof(*btIterator)); - - /* - * The lock taken by callers of ExtendFileC is speculative and - * only occurs when the file already has overflow extents. So - * We need to make sure we have the lock here. The extents - * btree lock can be nested (its recursive) so we always take - * it here. 
- */ - lockflags = hfs_systemfile_lock(vcb, SFL_EXTENTS, HFS_EXCLUSIVE_LOCK); - - /* HFS+/HFSX */ - if (vcb->vcbSigWord != kHFSSigWord) { - btRecordSize = sizeof(HFSPlusExtentRecord); - btRecord.bufferAddress = extents; - btRecord.itemSize = btRecordSize; - btRecord.itemCount = 1; - - BlockMoveData(key, &btIterator->key, sizeof(HFSPlusExtentKey)); - } -#if CONFIG_HFS_STD - else { - /* HFS Standard */ - HFSExtentKey * keyPtr; - HFSExtentRecord data; - - btRecordSize = sizeof(HFSExtentRecord); - btRecord.bufferAddress = &data; - btRecord.itemSize = btRecordSize; - btRecord.itemCount = 1; - - keyPtr = (HFSExtentKey*) &btIterator->key; - keyPtr->keyLength = kHFSExtentKeyMaximumLength; - keyPtr->forkType = key->forkType; - keyPtr->fileID = key->fileID; - keyPtr->startBlock = key->startBlock; - - err = HFSPlusToHFSExtents(extents, data); - } -#endif - - if (err == noErr) - err = BTInsertRecord(GetFileControlBlock(vcb->extentsRefNum), btIterator, &btRecord, btRecordSize); - - if (err == noErr) - *hint = btIterator->hint.nodeNum; - - (void) BTFlushPath(GetFileControlBlock(vcb->extentsRefNum)); - - hfs_systemfile_unlock(vcb, lockflags); - - FREE (btIterator, M_TEMP); - return err; -} - - -static OSErr DeleteExtentRecord( - const ExtendedVCB *vcb, - u_int8_t forkType, - u_int32_t fileID, - u_int32_t startBlock) -{ - struct BTreeIterator *btIterator = NULL; - OSErr err; - - err = noErr; - - MALLOC (btIterator, struct BTreeIterator*, sizeof(struct BTreeIterator), M_TEMP, M_WAITOK); - if (btIterator == NULL) { - return memFullErr; // translates to ENOMEM - } - bzero(btIterator, sizeof(*btIterator)); - - /* HFS+ / HFSX */ - if (vcb->vcbSigWord != kHFSSigWord) { // HFS Plus volume - HFSPlusExtentKey * keyPtr; - - keyPtr = (HFSPlusExtentKey*) &btIterator->key; - keyPtr->keyLength = kHFSPlusExtentKeyMaximumLength; - keyPtr->forkType = forkType; - keyPtr->pad = 0; - keyPtr->fileID = fileID; - keyPtr->startBlock = startBlock; - } -#if CONFIG_HFS_STD - else { - /* HFS standard */ - 
HFSExtentKey * keyPtr; - - keyPtr = (HFSExtentKey*) &btIterator->key; - keyPtr->keyLength = kHFSExtentKeyMaximumLength; - keyPtr->forkType = forkType; - keyPtr->fileID = fileID; - keyPtr->startBlock = startBlock; - } -#endif - - err = BTDeleteRecord(GetFileControlBlock(vcb->extentsRefNum), btIterator); - (void) BTFlushPath(GetFileControlBlock(vcb->extentsRefNum)); - - - FREE(btIterator, M_TEMP); - return err; -} - - - -//_________________________________________________________________________________ -// -// Routine: MapFileBlock -// -// Function: Maps a file position into a physical disk address. -// -//_________________________________________________________________________________ - -OSErr MapFileBlockC ( - ExtendedVCB *vcb, // volume that file resides on - FCB *fcb, // FCB of file - size_t numberOfBytes, // number of contiguous bytes desired - off_t offset, // starting offset within file (in bytes) - daddr64_t *startSector, // first sector (NOT an allocation block) - size_t *availableBytes) // number of contiguous bytes (up to numberOfBytes) -{ - OSErr err; - u_int32_t allocBlockSize; // Size of the volume's allocation block - u_int32_t sectorSize; - HFSPlusExtentKey foundKey; - HFSPlusExtentRecord foundData; - u_int32_t foundIndex; - u_int32_t hint; - u_int32_t firstFABN; // file allocation block of first block in found extent - u_int32_t nextFABN; // file allocation block of block after end of found extent - off_t dataEnd; // (offset) end of range that is contiguous - u_int32_t sectorsPerBlock; // Number of sectors per allocation block - u_int32_t startBlock; // volume allocation block corresponding to firstFABN - daddr64_t temp; - off_t tmpOff; - - allocBlockSize = vcb->blockSize; - sectorSize = VCBTOHFS(vcb)->hfs_logical_block_size; - - err = SearchExtentFile(vcb, fcb, offset, &foundKey, foundData, &foundIndex, &hint, &nextFABN); - if (err == noErr) { - startBlock = foundData[foundIndex].startBlock; - firstFABN = nextFABN - 
foundData[foundIndex].blockCount; - } - - if (err != noErr) - { - return err; - } - - // - // Determine the end of the available space. It will either be the end of the extent, - // or the file's PEOF, whichever is smaller. - // - dataEnd = (off_t)((off_t)(nextFABN) * (off_t)(allocBlockSize)); // Assume valid data through end of this extent - if (((off_t)fcb->ff_blocks * (off_t)allocBlockSize) < dataEnd) // Is PEOF shorter? - dataEnd = (off_t)fcb->ff_blocks * (off_t)allocBlockSize; // Yes, so only map up to PEOF - - // Compute the number of sectors in an allocation block - sectorsPerBlock = allocBlockSize / sectorSize; // sectors per allocation block - - // - // Compute the absolute sector number that contains the offset of the given file - // offset in sectors from start of the extent + - // offset in sectors from start of allocation block space - // - temp = (daddr64_t)((offset - (off_t)((off_t)(firstFABN) * (off_t)(allocBlockSize)))/sectorSize); - temp += (daddr64_t)startBlock * (daddr64_t)sectorsPerBlock; - - /* Add in any volume offsets */ - if (vcb->vcbSigWord == kHFSPlusSigWord) - temp += vcb->hfsPlusIOPosOffset / sectorSize; - else - temp += vcb->vcbAlBlSt; - - // Return the desired sector for file position "offset" - *startSector = temp; - - // - // Determine the number of contiguous bytes until the end of the extent - // (or the amount they asked for, whichever comes first). - // - if (availableBytes) - { - tmpOff = dataEnd - offset; - /* - * Disallow negative runs. - */ - if (tmpOff <= 0) { - return EINVAL; - } - - if (tmpOff > (off_t)(numberOfBytes)) { - *availableBytes = numberOfBytes; // more there than they asked for, so pin the output - } - else { - *availableBytes = tmpOff; - } - } - - return noErr; -} - - -//������������������������������������������������������������������������������� -// Routine: ReleaseExtents -// -// Function: Release the extents of a single extent data record. 
-//������������������������������������������������������������������������������� - -static OSErr ReleaseExtents( - ExtendedVCB *vcb, - const HFSPlusExtentRecord extentRecord, - u_int32_t *numReleasedAllocationBlocks, - Boolean *releasedLastExtent) -{ - u_int32_t extentIndex; - u_int32_t numberOfExtents; - OSErr err = noErr; - - *numReleasedAllocationBlocks = 0; - *releasedLastExtent = false; - - if (vcb->vcbSigWord == kHFSPlusSigWord) - numberOfExtents = kHFSPlusExtentDensity; - else - numberOfExtents = kHFSExtentDensity; - - for( extentIndex = 0; extentIndex < numberOfExtents; extentIndex++) - { - u_int32_t numAllocationBlocks; - - // Loop over the extent record and release the blocks associated with each extent. - - numAllocationBlocks = extentRecord[extentIndex].blockCount; - if ( numAllocationBlocks == 0 ) - { - *releasedLastExtent = true; - break; - } - - err = BlockDeallocate( vcb, extentRecord[extentIndex].startBlock, numAllocationBlocks , 0); - if ( err != noErr ) - break; - - *numReleasedAllocationBlocks += numAllocationBlocks; // bump FABN to beg of next extent - } - - return( err ); -} - - - -//������������������������������������������������������������������������������� -// Routine: TruncateExtents -// -// Purpose: Delete extent records whose starting file allocation block number -// is greater than or equal to a given starting block number. The -// allocation blocks represented by the extents are deallocated. -// -// Inputs: -// vcb Volume to operate on -// fileID Which file to operate on -// startBlock Starting file allocation block number for first extent -// record to delete. 
-//������������������������������������������������������������������������������� - -static OSErr TruncateExtents( - ExtendedVCB *vcb, - u_int8_t forkType, - u_int32_t fileID, - u_int32_t startBlock, - Boolean * recordDeleted) -{ - OSErr err; - u_int32_t numberExtentsReleased; - Boolean releasedLastExtent; - u_int32_t hint; - HFSPlusExtentKey key; - HFSPlusExtentRecord extents; - int lockflags; - - /* - * The lock taken by callers of TruncateFileC is speculative and - * only occurs when the file already has overflow extents. So - * We need to make sure we have the lock here. The extents - * btree lock can be nested (its recursive) so we always take - * it here. - */ - lockflags = hfs_systemfile_lock(vcb, SFL_EXTENTS, HFS_EXCLUSIVE_LOCK); - - while (true) { - err = FindExtentRecord(vcb, forkType, fileID, startBlock, false, &key, extents, &hint); - if (err != noErr) { - if (err == btNotFound) - err = noErr; - break; - } - - err = ReleaseExtents( vcb, extents, &numberExtentsReleased, &releasedLastExtent ); - if (err != noErr) break; - - err = DeleteExtentRecord(vcb, forkType, fileID, startBlock); - if (err != noErr) break; - - *recordDeleted = true; - startBlock += numberExtentsReleased; - } - hfs_systemfile_unlock(vcb, lockflags); - - return err; -} - - - -//������������������������������������������������������������������������������� -// Routine: DeallocateFork -// -// Function: De-allocates all disk space allocated to a specified fork. 
-//������������������������������������������������������������������������������� - -static OSErr DeallocateFork( - ExtendedVCB *vcb, - HFSCatalogNodeID fileID, - u_int8_t forkType, - HFSPlusExtentRecord catalogExtents, - Boolean * recordDeleted) /* true if a record was deleted */ -{ - OSErr err; - u_int32_t numReleasedAllocationBlocks; - Boolean releasedLastExtent; - - // Release the catalog extents - err = ReleaseExtents( vcb, catalogExtents, &numReleasedAllocationBlocks, &releasedLastExtent ); - // Release the extra extents, if present - if (err == noErr && !releasedLastExtent) - err = TruncateExtents(vcb, forkType, fileID, numReleasedAllocationBlocks, recordDeleted); - - return( err ); -} - -//������������������������������������������������������������������������������� -// Routine: FlushExtentFile -// -// Function: Flushes the extent file for a specified volume -//������������������������������������������������������������������������������� - -OSErr FlushExtentFile( ExtendedVCB *vcb ) -{ - FCB * fcb; - OSErr err; - int lockflags; - - fcb = GetFileControlBlock(vcb->extentsRefNum); - - lockflags = hfs_systemfile_lock(vcb, SFL_EXTENTS, HFS_EXCLUSIVE_LOCK); - err = BTFlushPath(fcb); - hfs_systemfile_unlock(vcb, lockflags); - - if ( err == noErr ) - { - // If the FCB for the extent "file" is dirty, mark the VCB as dirty. - - if (FTOC(fcb)->c_flag & C_MODIFIED) - { - MarkVCBDirty( vcb ); - // err = FlushVolumeControlBlock( vcb ); - } - } - - return( err ); -} - - -#if CONFIG_HFS_STD -//������������������������������������������������������������������������������� -// Routine: CompareExtentKeys -// -// Function: Compares two extent file keys (a search key and a trial key) for -// an HFS volume. 
-//������������������������������������������������������������������������������� - -__private_extern__ -int32_t CompareExtentKeys( const HFSExtentKey *searchKey, const HFSExtentKey *trialKey ) -{ - int32_t result; // � 1 - - #if DEBUG_BUILD - if (searchKey->keyLength != kHFSExtentKeyMaximumLength) - DebugStr("HFS: search Key is wrong length"); - if (trialKey->keyLength != kHFSExtentKeyMaximumLength) - DebugStr("HFS: trial Key is wrong length"); - #endif - - result = -1; // assume searchKey < trialKey - - if (searchKey->fileID == trialKey->fileID) { - // - // FileNum's are equal; compare fork types - // - if (searchKey->forkType == trialKey->forkType) { - // - // Fork types are equal; compare allocation block number - // - if (searchKey->startBlock == trialKey->startBlock) { - // - // Everything is equal - // - result = 0; - } - else { - // - // Allocation block numbers differ; determine sign - // - if (searchKey->startBlock > trialKey->startBlock) - result = 1; - } - } - else { - // - // Fork types differ; determine sign - // - if (searchKey->forkType > trialKey->forkType) - result = 1; - } - } - else { - // - // FileNums differ; determine sign - // - if (searchKey->fileID > trialKey->fileID) - result = 1; - } - - return( result ); -} -#endif - - -//������������������������������������������������������������������������������� -// Routine: CompareExtentKeysPlus -// -// Function: Compares two extent file keys (a search key and a trial key) for -// an HFS volume. 
-//������������������������������������������������������������������������������� - -__private_extern__ -int32_t CompareExtentKeysPlus( const HFSPlusExtentKey *searchKey, const HFSPlusExtentKey *trialKey ) -{ - int32_t result; // � 1 - - #if DEBUG_BUILD - if (searchKey->keyLength != kHFSPlusExtentKeyMaximumLength) - DebugStr("HFS: search Key is wrong length"); - if (trialKey->keyLength != kHFSPlusExtentKeyMaximumLength) - DebugStr("HFS: trial Key is wrong length"); - #endif - - result = -1; // assume searchKey < trialKey - - if (searchKey->fileID == trialKey->fileID) { - // - // FileNum's are equal; compare fork types - // - if (searchKey->forkType == trialKey->forkType) { - // - // Fork types are equal; compare allocation block number - // - if (searchKey->startBlock == trialKey->startBlock) { - // - // Everything is equal - // - result = 0; - } - else { - // - // Allocation block numbers differ; determine sign - // - if (searchKey->startBlock > trialKey->startBlock) - result = 1; - } - } - else { - // - // Fork types differ; determine sign - // - if (searchKey->forkType > trialKey->forkType) - result = 1; - } - } - else { - // - // FileNums differ; determine sign - // - if (searchKey->fileID > trialKey->fileID) - result = 1; - } - - return( result ); -} - -static int -should_pin_blocks(hfsmount_t *hfsmp, FCB *fcb) -{ - if (!ISSET(hfsmp->hfs_flags, HFS_CS_HOTFILE_PIN) - || fcb->ff_cp == NULL || fcb->ff_cp->c_vp == NULL) { - return 0; - } - - int pin_blocks; - - // - // File system metadata should get pinned - // - if (vnode_issystem(fcb->ff_cp->c_vp)) { - return 1; - } - - // - // If a file is AutoCandidate, we should not pin its blocks because - // it was an automatically added file and this function is intended - // to pin new blocks being added to user-generated content. 
- // - if (fcb->ff_cp->c_attr.ca_recflags & kHFSAutoCandidateMask) { - return 0; - } - - // - // If a file is marked FastDevPinned it is an existing pinned file - // or a new file that should be pinned. - // - // If a file is marked FastDevCandidate it is a new file that is - // being written to for the first time so we don't want to pin it - // just yet as it may not meet the criteria (i.e. too large). - // - if ((fcb->ff_cp->c_attr.ca_recflags & (kHFSFastDevPinnedMask)) != 0) { - pin_blocks = 1; - } else { - pin_blocks = 0; - } - - return pin_blocks; -} - - - -static void -pin_blocks_if_needed(ExtendedVCB *vcb, FCB *fcb, u_int32_t startBlock, u_int32_t blockCount) -{ - if (!should_pin_blocks(vcb, fcb)) { - return; - } - - // ask CoreStorage to pin the new blocks being added to this file - if (hfs_pin_block_range((struct hfsmount *)vcb, HFS_PIN_IT, startBlock, blockCount, vfs_context_kernel()) == 0) { - struct vnode *vp = fcb->ff_cp->c_vp; - - // and make sure to keep our accounting in order - hfs_hotfile_adjust_blocks(vp, -blockCount); - } -} - - - -/* - * Add a file extent to a file. - * - * Used by hfs_extendfs to extend the volume allocation bitmap file. - * - */ -int -AddFileExtent(ExtendedVCB *vcb, FCB *fcb, u_int32_t startBlock, u_int32_t blockCount) -{ - HFSPlusExtentKey foundKey; - HFSPlusExtentRecord foundData; - u_int32_t foundIndex; - u_int32_t hint; - u_int32_t nextBlock; - int64_t peof; - int i; - int error; - - peof = (int64_t)(fcb->ff_blocks + blockCount) * (int64_t)vcb->blockSize; - - error = SearchExtentFile(vcb, fcb, peof-1, &foundKey, foundData, &foundIndex, &hint, &nextBlock); - if (error != fxRangeErr) - return (EBUSY); - - /* - * Add new extent. See if there is room in the current record. - */ - if (foundData[foundIndex].blockCount != 0) - ++foundIndex; - if (foundIndex == kHFSPlusExtentDensity) { - /* - * Existing record is full so create a new one. 
- */ - foundKey.keyLength = kHFSPlusExtentKeyMaximumLength; - foundKey.forkType = kDataForkType; - foundKey.pad = 0; - foundKey.fileID = FTOC(fcb)->c_fileid; - foundKey.startBlock = nextBlock; - - foundData[0].startBlock = startBlock; - foundData[0].blockCount = blockCount; - - /* zero out remaining extents. */ - for (i = 1; i < kHFSPlusExtentDensity; ++i) { - foundData[i].startBlock = 0; - foundData[i].blockCount = 0; - } - - foundIndex = 0; - - error = CreateExtentRecord(vcb, &foundKey, foundData, &hint); - if (error == fxOvFlErr) { - error = dskFulErr; - } else if (error == 0) { - pin_blocks_if_needed(vcb, fcb, startBlock, blockCount); - } - - } else { - /* - * Add a new extent into existing record. - */ - foundData[foundIndex].startBlock = startBlock; - foundData[foundIndex].blockCount = blockCount; - error = UpdateExtentRecord(vcb, fcb, 0, &foundKey, foundData, hint); - if (error == 0) { - pin_blocks_if_needed(vcb, fcb, startBlock, blockCount); - } - } - (void) FlushExtentFile(vcb); - - return (error); -} - - -//_________________________________________________________________________________ -// -// Routine: Extendfile -// -// Function: Extends the disk space allocated to a file. 
-// -//_________________________________________________________________________________ - -OSErr ExtendFileC ( - ExtendedVCB *vcb, // volume that file resides on - FCB *fcb, // FCB of file to truncate - int64_t bytesToAdd, // number of bytes to allocate - u_int32_t blockHint, // desired starting allocation block - u_int32_t flags, // EFContig and/or EFAll - int64_t *actualBytesAdded) // number of bytes actually allocated -{ - OSErr err; - u_int32_t volumeBlockSize; - int64_t blocksToAdd; - int64_t bytesThisExtent; - HFSPlusExtentKey foundKey; - HFSPlusExtentRecord foundData; - u_int32_t foundIndex; - u_int32_t hint; - u_int32_t nextBlock; - u_int32_t startBlock; - Boolean allOrNothing; - Boolean forceContig; - Boolean wantContig; - Boolean useMetaZone; - Boolean needsFlush; - int allowFlushTxns; - u_int32_t actualStartBlock; - u_int32_t actualNumBlocks; - u_int32_t numExtentsPerRecord; - int64_t maximumBytes; - int64_t availbytes; - int64_t peof; - u_int32_t prevblocks; - uint32_t fastdev = 0; - - struct hfsmount *hfsmp = (struct hfsmount*)vcb; - allowFlushTxns = 0; - needsFlush = false; - *actualBytesAdded = 0; - volumeBlockSize = vcb->blockSize; - allOrNothing = ((flags & kEFAllMask) != 0); - forceContig = ((flags & kEFContigMask) != 0); - prevblocks = fcb->ff_blocks; - - if (vcb->vcbSigWord != kHFSSigWord) { - numExtentsPerRecord = kHFSPlusExtentDensity; - } -#if CONFIG_HFS_STD - else { - /* HFS Standard */ - numExtentsPerRecord = kHFSExtentDensity; - - /* Make sure the request and new PEOF are less than 2GB if HFS std*/ - if (bytesToAdd >= kTwoGigabytes) - goto HFS_Std_Overflow; - if ((((int64_t)fcb->ff_blocks * (int64_t)volumeBlockSize) + bytesToAdd) >= kTwoGigabytes) - goto HFS_Std_Overflow; - } -#endif - - // - // Determine how many blocks need to be allocated. - // Round up the number of desired bytes to add. 
- // - blocksToAdd = howmany(bytesToAdd, volumeBlockSize); - bytesToAdd = (int64_t)((int64_t)blocksToAdd * (int64_t)volumeBlockSize); - - /* - * For deferred allocations just reserve the blocks. - */ - if ((flags & kEFDeferMask) - && (vcb->vcbSigWord == kHFSPlusSigWord) - && (bytesToAdd < (int64_t)HFS_MAX_DEFERED_ALLOC) - && (blocksToAdd < hfs_freeblks(VCBTOHFS(vcb), 1))) { - hfs_lock_mount (hfsmp); - vcb->loanedBlocks += blocksToAdd; - hfs_unlock_mount(hfsmp); - - fcb->ff_unallocblocks += blocksToAdd; - FTOC(fcb)->c_blocks += blocksToAdd; - fcb->ff_blocks += blocksToAdd; - - /* - * We haven't touched the disk here; no blocks have been - * allocated and the volume will not be inconsistent if we - * don't update the catalog record immediately. - */ - FTOC(fcb)->c_flag |= C_MINOR_MOD; - *actualBytesAdded = bytesToAdd; - return (0); - } - /* - * Give back any unallocated blocks before doing real allocations. - */ - if (fcb->ff_unallocblocks > 0) { - u_int32_t loanedBlocks; - - loanedBlocks = fcb->ff_unallocblocks; - blocksToAdd += loanedBlocks; - bytesToAdd = (int64_t)blocksToAdd * (int64_t)volumeBlockSize; - FTOC(fcb)->c_blocks -= loanedBlocks; - fcb->ff_blocks -= loanedBlocks; - fcb->ff_unallocblocks = 0; - - hfs_lock_mount(hfsmp); - vcb->loanedBlocks -= loanedBlocks; - hfs_unlock_mount(hfsmp); - } - - // - // If the file's clump size is larger than the allocation block size, - // then set the maximum number of bytes to the requested number of bytes - // rounded up to a multiple of the clump size. - // - if ((vcb->vcbClpSiz > (int32_t)volumeBlockSize) - && (bytesToAdd < (int64_t)HFS_MAX_DEFERED_ALLOC) - && (flags & kEFNoClumpMask) == 0) { - maximumBytes = (int64_t)howmany(bytesToAdd, vcb->vcbClpSiz); - maximumBytes *= vcb->vcbClpSiz; - } else { - maximumBytes = bytesToAdd; - } - -#if CONFIG_HFS_STD - // - // Compute new physical EOF, rounded up to a multiple of a block. - // - if ( (vcb->vcbSigWord == kHFSSigWord) && // Too big? 
- ((((int64_t)fcb->ff_blocks * (int64_t)volumeBlockSize) + bytesToAdd) >= kTwoGigabytes) ) { - if (allOrNothing) // Yes, must they have it all? - goto HFS_Std_Overflow; // Yes, can't have it - else { - --blocksToAdd; // No, give give 'em one block less - bytesToAdd -= volumeBlockSize; - } - } -#endif - - // - // If allocation is all-or-nothing, make sure there are - // enough free blocks on the volume (quick test). - // - if (allOrNothing && - (blocksToAdd > hfs_freeblks(VCBTOHFS(vcb), flags & kEFReserveMask))) { - err = dskFulErr; - goto ErrorExit; - } - - // - // See if there are already enough blocks allocated to the file. - // - peof = ((int64_t)fcb->ff_blocks * (int64_t)volumeBlockSize) + bytesToAdd; // potential new PEOF - err = SearchExtentFile(vcb, fcb, peof-1, &foundKey, foundData, &foundIndex, &hint, &nextBlock); - if (err == noErr) { - // Enough blocks are already allocated. Just update the FCB to reflect the new length. - fcb->ff_blocks = peof / volumeBlockSize; - FTOC(fcb)->c_blocks += (bytesToAdd / volumeBlockSize); - FTOC(fcb)->c_flag |= C_MODIFIED; - goto Exit; - } - if (err != fxRangeErr) // Any real error? - goto ErrorExit; // Yes, so exit immediately - - // - // Adjust the PEOF to the end of the last extent. - // - peof = (int64_t)((int64_t)nextBlock * (int64_t)volumeBlockSize); // currently allocated PEOF - bytesThisExtent = (int64_t)(nextBlock - fcb->ff_blocks) * (int64_t)volumeBlockSize; - if (bytesThisExtent != 0) { - fcb->ff_blocks = nextBlock; - FTOC(fcb)->c_blocks += (bytesThisExtent / volumeBlockSize); - FTOC(fcb)->c_flag |= C_MODIFIED; - bytesToAdd -= bytesThisExtent; - } - - // - // Allocate some more space. - // - // First try a contiguous allocation (of the whole amount). - // If that fails, get whatever we can. 
- // If forceContig, then take whatever we got - // else, keep getting bits and pieces (non-contig) - - /* - * Note that for sparse devices (like sparse bundle dmgs), we - * should only be aggressive with re-using once-allocated pieces - * if we're not dealing with system files. If we're trying to operate - * on behalf of a system file, we need the maximum contiguous amount - * possible. For non-system files we favor locality and fragmentation over - * contiguity as it can result in fewer blocks being needed from the underlying - * filesystem that the sparse image resides upon. - */ - err = noErr; - if ( (vcb->hfs_flags & HFS_HAS_SPARSE_DEVICE) - && (fcb->ff_cp->c_fileid >= kHFSFirstUserCatalogNodeID) - && (flags & kEFMetadataMask) == 0) { - /* - * We want locality over contiguity so by default we set wantContig to - * false unless we hit one of the circumstances below. - */ - wantContig = false; - if (hfs_isrbtree_active(VCBTOHFS(vcb))) { - /* - * If the red-black tree is acive, we can always find a suitable contiguous - * chunk. So if the user specifically requests contiguous files, we should - * honor that no matter what kind of device it is. - */ - if (forceContig) { - wantContig = true; - } - } - else { - /* - * If the red-black tree is not active, then only set wantContig to true - * if we have never done a contig scan on the device, which would populate - * the free extent cache. Note that the caller may explicitly unset the - * DID_CONTIG_SCAN bit in order to force us to vend a contiguous extent here - * if the caller wants to get a contiguous chunk. 
- */ - if ((vcb->hfs_flags & HFS_DID_CONTIG_SCAN) == 0) { - vcb->hfs_flags |= HFS_DID_CONTIG_SCAN; - wantContig = true; - } - } - } - else { - wantContig = true; - } - - if (should_pin_blocks(hfsmp, fcb)) - fastdev = HFS_ALLOC_FAST_DEV; - - useMetaZone = flags & kEFMetadataMask; - do { - if (blockHint != 0) - startBlock = blockHint; - else - startBlock = foundData[foundIndex].startBlock + foundData[foundIndex].blockCount; - - actualNumBlocks = 0; - actualStartBlock = 0; - - /* Find number of free blocks based on reserved block flag option */ - availbytes = (int64_t)hfs_freeblks(VCBTOHFS(vcb), flags & kEFReserveMask) * - (int64_t)volumeBlockSize; - if (availbytes <= 0) { - err = dskFulErr; - } else { - if (wantContig && (availbytes < bytesToAdd)) { - err = dskFulErr; - } - else { - uint32_t ba_flags = fastdev; - - if (wantContig) { - ba_flags |= HFS_ALLOC_FORCECONTIG; - } - if (useMetaZone) { - ba_flags |= HFS_ALLOC_METAZONE; - } - if (allowFlushTxns) { - ba_flags |= HFS_ALLOC_FLUSHTXN; - } - - err = BlockAllocate( - vcb, - startBlock, - howmany(MIN(bytesToAdd, availbytes), volumeBlockSize), - howmany(MIN(maximumBytes, availbytes), volumeBlockSize), - ba_flags, - &actualStartBlock, - &actualNumBlocks); - } - } - if (err == dskFulErr) { - if (forceContig) { - if (allowFlushTxns == 0) { - /* If we're forcing contiguity, re-try but allow plucking from recently freed regions */ - allowFlushTxns = 1; - wantContig = 1; - err = noErr; - continue; - } - else { - break; // AllocContig failed because not enough contiguous space - } - } - if (wantContig) { - // Couldn't get one big chunk, so get whatever we can. 
- err = noErr; - wantContig = false; - continue; - } - if (actualNumBlocks != 0) - err = noErr; - - if (useMetaZone == 0) { - /* Couldn't get anything so dip into metadat zone */ - err = noErr; - useMetaZone = 1; - continue; - } - - /* If we couldn't find what we needed without flushing the journal, then go ahead and do it now */ - if (allowFlushTxns == 0) { - allowFlushTxns = 1; - err = noErr; - continue; - } - - } - if (err == noErr) { - // Add the new extent to the existing extent record, or create a new one. - if ((actualStartBlock == startBlock) && (blockHint == 0)) { - // We grew the file's last extent, so just adjust the number of blocks. - foundData[foundIndex].blockCount += actualNumBlocks; - err = UpdateExtentRecord(vcb, fcb, 0, &foundKey, foundData, hint); - if (err != noErr) break; - } - else { - u_int16_t i; - - // Need to add a new extent. See if there is room in the current record. - if (foundData[foundIndex].blockCount != 0) // Is current extent free to use? - ++foundIndex; // No, so use the next one. - if (foundIndex == numExtentsPerRecord) { - // This record is full. Need to create a new one. - if (FTOC(fcb)->c_fileid == kHFSExtentsFileID) { - (void) BlockDeallocate(vcb, actualStartBlock, actualNumBlocks, 0); - err = dskFulErr; // Oops. Can't extend extents file past first record. - break; - } - - foundKey.keyLength = kHFSPlusExtentKeyMaximumLength; - if (FORK_IS_RSRC(fcb)) - foundKey.forkType = kResourceForkType; - else - foundKey.forkType = kDataForkType; - foundKey.pad = 0; - foundKey.fileID = FTOC(fcb)->c_fileid; - foundKey.startBlock = nextBlock; - - foundData[0].startBlock = actualStartBlock; - foundData[0].blockCount = actualNumBlocks; - - // zero out remaining extents... 
- for (i = 1; i < kHFSPlusExtentDensity; ++i) - { - foundData[i].startBlock = 0; - foundData[i].blockCount = 0; - } - - foundIndex = 0; - - err = CreateExtentRecord(vcb, &foundKey, foundData, &hint); - if (err == fxOvFlErr) { - // We couldn't create an extent record because extents B-tree - // couldn't grow. Dellocate the extent just allocated and - // return a disk full error. - (void) BlockDeallocate(vcb, actualStartBlock, actualNumBlocks, 0); - err = dskFulErr; - } - if (err != noErr) break; - - needsFlush = true; // We need to update the B-tree header - } - else { - // Add a new extent into this record and update. - foundData[foundIndex].startBlock = actualStartBlock; - foundData[foundIndex].blockCount = actualNumBlocks; - err = UpdateExtentRecord(vcb, fcb, 0, &foundKey, foundData, hint); - if (err != noErr) break; - } - } - - // Figure out how many bytes were actually allocated. - // NOTE: BlockAllocate could have allocated more than we asked for. - // Don't set the PEOF beyond what our client asked for. - nextBlock += actualNumBlocks; - bytesThisExtent = (int64_t)((int64_t)actualNumBlocks * (int64_t)volumeBlockSize); - if (bytesThisExtent > bytesToAdd) { - bytesToAdd = 0; - } - else { - bytesToAdd -= bytesThisExtent; - maximumBytes -= bytesThisExtent; - } - fcb->ff_blocks += (bytesThisExtent / volumeBlockSize); - FTOC(fcb)->c_blocks += (bytesThisExtent / volumeBlockSize); - FTOC(fcb)->c_flag |= C_MODIFIED; - - // If contiguous allocation was requested, then we've already got one contiguous - // chunk. If we didn't get all we wanted, then adjust the error to disk full. - if (forceContig) { - if (bytesToAdd != 0) - err = dskFulErr; - break; // We've already got everything that's contiguous - } - } - } while (err == noErr && bytesToAdd); - -ErrorExit: -Exit: - if (VCBTOHFS(vcb)->hfs_flags & HFS_METADATA_ZONE) { - /* Keep the roving allocator out of the metadata zone. 
*/ - if (vcb->nextAllocation >= VCBTOHFS(vcb)->hfs_metazone_start && - vcb->nextAllocation <= VCBTOHFS(vcb)->hfs_metazone_end) { - hfs_lock_mount (hfsmp); - HFS_UPDATE_NEXT_ALLOCATION(vcb, VCBTOHFS(vcb)->hfs_metazone_end + 1); - MarkVCBDirty(vcb); - hfs_unlock_mount(hfsmp); - } - } - if (prevblocks < fcb->ff_blocks) { - *actualBytesAdded = (int64_t)(fcb->ff_blocks - prevblocks) * (int64_t)volumeBlockSize; - } else { - *actualBytesAdded = 0; - } - - if (fastdev) { - hfs_hotfile_adjust_blocks(fcb->ff_cp->c_vp, - (int64_t)prevblocks - fcb->ff_blocks); - } - - if (needsFlush) - (void) FlushExtentFile(vcb); - - return err; - -#if CONFIG_HFS_STD -HFS_Std_Overflow: - err = fileBoundsErr; - goto ErrorExit; -#endif -} - - - -//_________________________________________________________________________________ -// -// Routine: TruncateFileC -// -// Function: Truncates the disk space allocated to a file. The file space is -// truncated to a specified new PEOF rounded up to the next allocation -// block boundry. If the 'TFTrunExt' option is specified, the file is -// truncated to the end of the extent containing the new PEOF. -// -//_________________________________________________________________________________ - -OSErr TruncateFileC ( - ExtendedVCB *vcb, // volume that file resides on - FCB *fcb, // FCB of file to truncate - int64_t peof, // new physical size for file - int deleted, // if nonzero, the file's catalog record has already been deleted. - int rsrc, // does this represent a resource fork or not? - uint32_t fileid, // the fileid of the file we're manipulating. 
- Boolean truncateToExtent) // if true, truncate to end of extent containing newPEOF - -{ - OSErr err; - u_int32_t nextBlock; // next file allocation block to consider - u_int32_t startBlock; // Physical (volume) allocation block number of start of a range - u_int32_t physNumBlocks; // Number of allocation blocks in file (according to PEOF) - u_int32_t numBlocks; - HFSPlusExtentKey key; // key for current extent record; key->keyLength == 0 if FCB's extent record - u_int32_t hint; // BTree hint corresponding to key - HFSPlusExtentRecord extentRecord; - u_int32_t extentIndex; - u_int32_t extentNextBlock; - u_int32_t numExtentsPerRecord; - int64_t temp64; - u_int8_t forkType; - Boolean extentChanged; // true if we actually changed an extent - Boolean recordDeleted; // true if an extent record got deleted - - recordDeleted = false; - - if (vcb->vcbSigWord == kHFSPlusSigWord) { - numExtentsPerRecord = kHFSPlusExtentDensity; - } - else { - numExtentsPerRecord = kHFSExtentDensity; - } - - if (rsrc) { - forkType = kResourceForkType; - } - else { - forkType = kDataForkType; - } - - temp64 = fcb->ff_blocks; - physNumBlocks = (u_int32_t)temp64; - - // - // Round newPEOF up to a multiple of the allocation block size. If new size is - // two gigabytes or more, then round down by one allocation block (??? really? - // shouldn't that be an error?). - // - nextBlock = howmany(peof, vcb->blockSize); // number of allocation blocks to remain in file - peof = (int64_t)((int64_t)nextBlock * (int64_t)vcb->blockSize); // number of bytes in those blocks - -#if CONFIG_HFS_STD - if ((vcb->vcbSigWord == kHFSSigWord) && (peof >= kTwoGigabytes)) { - #if DEBUG_BUILD - DebugStr("HFS: Trying to truncate a file to 2GB or more"); - #endif - err = fileBoundsErr; - goto ErrorExit; - } -#endif - - // - // Update FCB's length - // - /* - * XXX Any errors could cause ff_blocks and c_blocks to get out of sync... 
- */ - numBlocks = peof / vcb->blockSize; - if (!deleted) { - FTOC(fcb)->c_blocks -= (fcb->ff_blocks - numBlocks); - } - fcb->ff_blocks = numBlocks; - - // this catalog entry is modified and *must* get forced - // to disk when hfs_update() is called - if (!deleted) { - /* - * If the file is already C_NOEXISTS, then the catalog record - * has been removed from disk already. We wouldn't need to force - * another update - */ - FTOC(fcb)->c_flag |= C_MODIFIED; - } - // - // If the new PEOF is 0, then truncateToExtent has no meaning (we should always deallocate - // all storage). - // - if (peof == 0) { - int i; - - // Deallocate all the extents for this fork - err = DeallocateFork(vcb, fileid, forkType, fcb->fcbExtents, &recordDeleted); - if (err != noErr) goto ErrorExit; // got some error, so return it - - // Update the catalog extent record (making sure it's zeroed out) - if (err == noErr) { - for (i=0; i < kHFSPlusExtentDensity; i++) { - fcb->fcbExtents[i].startBlock = 0; - fcb->fcbExtents[i].blockCount = 0; - } - } - goto Done; - } - - // - // Find the extent containing byte (peof-1). This is the last extent we'll keep. - // (If truncateToExtent is true, we'll keep the whole extent; otherwise, we'll only - // keep up through peof). The search will tell us how many allocation blocks exist - // in the found extent plus all previous extents. - // - err = SearchExtentFile(vcb, fcb, peof-1, &key, extentRecord, &extentIndex, &hint, &extentNextBlock); - if (err != noErr) goto ErrorExit; - - extentChanged = false; // haven't changed the extent yet - - if (!truncateToExtent) { - // - // Shorten this extent. It may be the case that the entire extent gets - // freed here. 
- // - numBlocks = extentNextBlock - nextBlock; // How many blocks in this extent to free up - if (numBlocks != 0) { - // Compute first volume allocation block to free - startBlock = extentRecord[extentIndex].startBlock + extentRecord[extentIndex].blockCount - numBlocks; - // Free the blocks in bitmap - err = BlockDeallocate(vcb, startBlock, numBlocks, 0); - if (err != noErr) goto ErrorExit; - // Adjust length of this extent - extentRecord[extentIndex].blockCount -= numBlocks; - // If extent is empty, set start block to 0 - if (extentRecord[extentIndex].blockCount == 0) - extentRecord[extentIndex].startBlock = 0; - // Remember that we changed the extent record - extentChanged = true; - } - } - - // - // Now move to the next extent in the record, and set up the file allocation block number - // - nextBlock = extentNextBlock; // Next file allocation block to free - ++extentIndex; // Its index within the extent record - - // - // Release all following extents in this extent record. Update the record. - // - while (extentIndex < numExtentsPerRecord && extentRecord[extentIndex].blockCount != 0) { - numBlocks = extentRecord[extentIndex].blockCount; - // Deallocate this extent - err = BlockDeallocate(vcb, extentRecord[extentIndex].startBlock, numBlocks, 0); - if (err != noErr) goto ErrorExit; - // Update next file allocation block number - nextBlock += numBlocks; - // Zero out start and length of this extent to delete it from record - extentRecord[extentIndex].startBlock = 0; - extentRecord[extentIndex].blockCount = 0; - // Remember that we changed an extent - extentChanged = true; - // Move to next extent in record - ++extentIndex; - } - - // - // If any of the extents in the current record were changed, then update that - // record (in the FCB, or extents file). 
- // - if (extentChanged) { - err = UpdateExtentRecord(vcb, fcb, deleted, &key, extentRecord, hint); - if (err != noErr) goto ErrorExit; - } - - // - // If there are any following allocation blocks, then we need - // to seach for their extent records and delete those allocation - // blocks. - // - if (nextBlock < physNumBlocks) - err = TruncateExtents(vcb, forkType, fileid, nextBlock, &recordDeleted); - -Done: -ErrorExit: - if (recordDeleted) - (void) FlushExtentFile(vcb); - - return err; -} - - -/* - * HFS Plus only - * - */ -OSErr HeadTruncateFile ( - ExtendedVCB *vcb, - FCB *fcb, - u_int32_t headblks) -{ - HFSPlusExtentRecord extents; - HFSPlusExtentRecord tailExtents; - HFSCatalogNodeID fileID; - u_int8_t forkType; - u_int32_t blkcnt; - u_int32_t startblk; - u_int32_t blksfreed; - int i, j; - int error = 0; - int lockflags; - - - if (vcb->vcbSigWord != kHFSPlusSigWord) - return (-1); - - forkType = FORK_IS_RSRC(fcb) ? kResourceForkType : kDataForkType; - fileID = FTOC(fcb)->c_fileid; - bzero(tailExtents, sizeof(tailExtents)); - - blksfreed = 0; - startblk = 0; - - /* - * Process catalog resident extents - */ - for (i = 0, j = 0; i < kHFSPlusExtentDensity; ++i) { - blkcnt = fcb->fcbExtents[i].blockCount; - if (blkcnt == 0) - break; /* end of extents */ - - if (blksfreed < headblks) { - error = BlockDeallocate(vcb, fcb->fcbExtents[i].startBlock, blkcnt, 0); - /* - * Any errors after the first BlockDeallocate - * must be ignored so we can put the file in - * a known state. - */ - if (error ) { - if (i == 0) - goto ErrorExit; /* uh oh */ - else { - error = 0; - printf("hfs: HeadTruncateFile: problems deallocating %s (%d)\n", - FTOC(fcb)->c_desc.cd_nameptr ? 
(const char *)FTOC(fcb)->c_desc.cd_nameptr : "", error); - } - } - - blksfreed += blkcnt; - fcb->fcbExtents[i].startBlock = 0; - fcb->fcbExtents[i].blockCount = 0; - } else { - tailExtents[j].startBlock = fcb->fcbExtents[i].startBlock; - tailExtents[j].blockCount = blkcnt; - ++j; - } - startblk += blkcnt; - } - - if (blkcnt == 0) - goto CopyExtents; - - lockflags = hfs_systemfile_lock(vcb, SFL_EXTENTS, HFS_EXCLUSIVE_LOCK); - - /* - * Process overflow extents - */ - for (;;) { - u_int32_t extblks; - - error = FindExtentRecord(vcb, forkType, fileID, startblk, false, NULL, extents, NULL); - if (error) { - /* - * Any errors after the first BlockDeallocate - * must be ignored so we can put the file in - * a known state. - */ - if (error != btNotFound) - printf("hfs: HeadTruncateFile: problems finding extents %s (%d)\n", - FTOC(fcb)->c_desc.cd_nameptr ? (const char *)FTOC(fcb)->c_desc.cd_nameptr : "", error); - error = 0; - break; - } - - for(i = 0, extblks = 0; i < kHFSPlusExtentDensity; ++i) { - blkcnt = extents[i].blockCount; - if (blkcnt == 0) - break; /* end of extents */ - - if (blksfreed < headblks) { - error = BlockDeallocate(vcb, extents[i].startBlock, blkcnt, 0); - if (error) { - printf("hfs: HeadTruncateFile: problems deallocating %s (%d)\n", - FTOC(fcb)->c_desc.cd_nameptr ? (const char *)FTOC(fcb)->c_desc.cd_nameptr : "", error); - error = 0; - } - blksfreed += blkcnt; - } else { - tailExtents[j].startBlock = extents[i].startBlock; - tailExtents[j].blockCount = blkcnt; - ++j; - } - extblks += blkcnt; - } - - error = DeleteExtentRecord(vcb, forkType, fileID, startblk); - if (error) { - printf("hfs: HeadTruncateFile: problems deallocating %s (%d)\n", - FTOC(fcb)->c_desc.cd_nameptr ? 
(const char *)FTOC(fcb)->c_desc.cd_nameptr : "", error); - error = 0; - } - - if (blkcnt == 0) - break; /* all done */ - - startblk += extblks; - } - hfs_systemfile_unlock(vcb, lockflags); - -CopyExtents: - if (blksfreed) { - bcopy(tailExtents, fcb->fcbExtents, sizeof(tailExtents)); - blkcnt = fcb->ff_blocks - headblks; - FTOC(fcb)->c_blocks -= headblks; - fcb->ff_blocks = blkcnt; - - FTOC(fcb)->c_flag |= C_MODIFIED; - FTOC(fcb)->c_touch_chgtime = TRUE; - - (void) FlushExtentFile(vcb); - } - -ErrorExit: - return MacToVFSError(error); -} - - - -//������������������������������������������������������������������������������� -// Routine: SearchExtentRecord (was XRSearch) -// -// Function: Searches extent record for the extent mapping a given file -// allocation block number (FABN). -// -// Input: searchFABN - desired FABN -// extentData - pointer to extent data record (xdr) -// extentDataStartFABN - beginning FABN for extent record -// -// Output: foundExtentDataOffset - offset to extent entry within xdr -// result = noErr, offset to extent mapping desired FABN -// result = FXRangeErr, offset to last extent in record -// endingFABNPlusOne - ending FABN +1 -// noMoreExtents - True if the extent was not found, and the -// extent record was not full (so don't bother -// looking in subsequent records); false otherwise. 
-// -// Result: noErr = ok -// FXRangeErr = desired FABN > last mapped FABN in record -//������������������������������������������������������������������������������� - -static OSErr SearchExtentRecord( - ExtendedVCB *vcb, - u_int32_t searchFABN, - const HFSPlusExtentRecord extentData, - u_int32_t extentDataStartFABN, - u_int32_t *foundExtentIndex, - u_int32_t *endingFABNPlusOne, - Boolean *noMoreExtents) -{ - OSErr err = noErr; - u_int32_t extentIndex; - /* Set it to the HFS std value */ - u_int32_t numberOfExtents = kHFSExtentDensity; - u_int32_t numAllocationBlocks; - Boolean foundExtent; - - *endingFABNPlusOne = extentDataStartFABN; - *noMoreExtents = false; - foundExtent = false; - - /* Override numberOfExtents for HFS+/HFSX */ - if (vcb->vcbSigWord != kHFSSigWord) { - numberOfExtents = kHFSPlusExtentDensity; - } - - for( extentIndex = 0; extentIndex < numberOfExtents; ++extentIndex ) - { - - // Loop over the extent record and find the search FABN. - - numAllocationBlocks = extentData[extentIndex].blockCount; - if ( numAllocationBlocks == 0 ) - { - break; - } - - *endingFABNPlusOne += numAllocationBlocks; - - if( searchFABN < *endingFABNPlusOne ) - { - // Found the extent. - foundExtent = true; - break; - } - } - - if( foundExtent ) - { - // Found the extent. Note the extent offset - *foundExtentIndex = extentIndex; - } - else - { - // Did not find the extent. Set foundExtentDataOffset accordingly - if( extentIndex > 0 ) - { - *foundExtentIndex = extentIndex - 1; - } - else - { - *foundExtentIndex = 0; - } - - // If we found an empty extent, then set noMoreExtents. 
- if (extentIndex < numberOfExtents) - *noMoreExtents = true; - - // Finally, return an error to the caller - err = fxRangeErr; - } - - return( err ); -} - -//������������������������������������������������������������������������������� -// Routine: SearchExtentFile (was XFSearch) -// -// Function: Searches extent file (including the FCB resident extent record) -// for the extent mapping a given file position. -// -// Input: vcb - VCB pointer -// fcb - FCB pointer -// filePosition - file position (byte address) -// -// Output: foundExtentKey - extent key record (xkr) -// If extent was found in the FCB's resident extent record, -// then foundExtentKey->keyLength will be set to 0. -// foundExtentData - extent data record(xdr) -// foundExtentIndex - index to extent entry in xdr -// result = 0, offset to extent mapping desired FABN -// result = FXRangeErr, offset to last extent in record -// (i.e., kNumExtentsPerRecord-1) -// extentBTreeHint - BTree hint for extent record -// kNoHint = Resident extent record -// endingFABNPlusOne - ending FABN +1 -// -// Result: -// noErr Found an extent that contains the given file position -// FXRangeErr Given position is beyond the last allocated extent -// (other) (some other internal I/O error) -//������������������������������������������������������������������������������� - -OSErr SearchExtentFile( - ExtendedVCB *vcb, - const FCB *fcb, - int64_t filePosition, - HFSPlusExtentKey *foundExtentKey, - HFSPlusExtentRecord foundExtentData, - u_int32_t *foundExtentIndex, - u_int32_t *extentBTreeHint, - u_int32_t *endingFABNPlusOne ) -{ - OSErr err; - u_int32_t filePositionBlock; - int64_t temp64; - Boolean noMoreExtents; - int lockflags; - - temp64 = filePosition / (int64_t)vcb->blockSize; - filePositionBlock = (u_int32_t)temp64; - - bcopy ( fcb->fcbExtents, foundExtentData, sizeof(HFSPlusExtentRecord)); - - // Search the resident FCB first. 
- err = SearchExtentRecord( vcb, filePositionBlock, foundExtentData, 0, - foundExtentIndex, endingFABNPlusOne, &noMoreExtents ); - - if( err == noErr ) { - // Found the extent. Set results accordingly - *extentBTreeHint = kNoHint; // no hint, because not in the BTree - foundExtentKey->keyLength = 0; // 0 = the FCB itself - - goto Exit; - } - - // Didn't find extent in FCB. If FCB's extent record wasn't full, there's no point - // in searching the extents file. Note that SearchExtentRecord left us pointing at - // the last valid extent (or the first one, if none were valid). This means we need - // to fill in the hint and key outputs, just like the "if" statement above. - if ( noMoreExtents ) { - *extentBTreeHint = kNoHint; // no hint, because not in the BTree - foundExtentKey->keyLength = 0; // 0 = the FCB itself - err = fxRangeErr; // There are no more extents, so must be beyond PEOF - goto Exit; - } - - // - // Find the desired record, or the previous record if it is the same fork - // - lockflags = hfs_systemfile_lock(vcb, SFL_EXTENTS, HFS_EXCLUSIVE_LOCK); - - err = FindExtentRecord(vcb, FORK_IS_RSRC(fcb) ? kResourceForkType : kDataForkType, - FTOC(fcb)->c_fileid, filePositionBlock, true, foundExtentKey, foundExtentData, extentBTreeHint); - hfs_systemfile_unlock(vcb, lockflags); - - if (err == btNotFound) { - // - // If we get here, the desired position is beyond the extents in the FCB, and there are no extents - // in the extents file. Return the FCB's extents and a range error. - // - *extentBTreeHint = kNoHint; - foundExtentKey->keyLength = 0; - err = GetFCBExtentRecord(fcb, foundExtentData); - // Note: foundExtentIndex and endingFABNPlusOne have already been set as a result of the very - // first SearchExtentRecord call in this function (when searching in the FCB's extents, and - // we got a range error). - - return fxRangeErr; - } - - // - // If we get here, there was either a BTree error, or we found an appropriate record. 
- // If we found a record, then search it for the correct index into the extents. - // - if (err == noErr) { - // Find appropriate index into extent record - err = SearchExtentRecord(vcb, filePositionBlock, foundExtentData, foundExtentKey->startBlock, - foundExtentIndex, endingFABNPlusOne, &noMoreExtents); - } - -Exit: - return err; -} - - - -//============================================================================ -// Routine: UpdateExtentRecord -// -// Function: Write new extent data to an existing extent record with a given key. -// If all of the extents are empty, and the extent record is in the -// extents file, then the record is deleted. -// -// Input: vcb - the volume containing the extents -// fcb - the file that owns the extents -// deleted - whether or not the file is already deleted -// extentFileKey - pointer to extent key record (xkr) -// If the key length is 0, then the extents are actually part -// of the catalog record, stored in the FCB. -// extentData - pointer to extent data record (xdr) -// extentBTreeHint - hint for given key, or kNoHint -// -// Result: noErr = ok -// (other) = error from BTree -//============================================================================ - -static OSErr UpdateExtentRecord (ExtendedVCB *vcb, FCB *fcb, int deleted, - const HFSPlusExtentKey *extentFileKey, - const HFSPlusExtentRecord extentData, - u_int32_t extentBTreeHint) -{ - OSErr err = noErr; - - if (extentFileKey->keyLength == 0) { // keyLength == 0 means the FCB's extent record - BlockMoveData(extentData, fcb->fcbExtents, sizeof(HFSPlusExtentRecord)); - if (!deleted) { - FTOC(fcb)->c_flag |= C_MODIFIED; - } - } - else { - struct BTreeIterator *btIterator = NULL; - FSBufferDescriptor btRecord; - u_int16_t btRecordSize; - FCB * btFCB; - int lockflags; - - // - // Need to find and change a record in Extents BTree - // - btFCB = GetFileControlBlock(vcb->extentsRefNum); - - MALLOC (btIterator, struct BTreeIterator*, sizeof(struct BTreeIterator), M_TEMP, 
M_WAITOK); - if (btIterator == NULL) { - return memFullErr; // translates to ENOMEM - } - bzero(btIterator, sizeof(*btIterator)); - - /* - * The lock taken by callers of ExtendFileC/TruncateFileC is - * speculative and only occurs when the file already has - * overflow extents. So we need to make sure we have the lock - * here. The extents btree lock can be nested (its recursive) - * so we always take it here. - */ - lockflags = hfs_systemfile_lock(vcb, SFL_EXTENTS, HFS_EXCLUSIVE_LOCK); - - /* HFS+/HFSX */ - if (vcb->vcbSigWord != kHFSSigWord) { // HFS Plus volume - HFSPlusExtentRecord foundData; // The extent data actually found - - BlockMoveData(extentFileKey, &btIterator->key, sizeof(HFSPlusExtentKey)); - - btIterator->hint.index = 0; - btIterator->hint.nodeNum = extentBTreeHint; - - btRecord.bufferAddress = &foundData; - btRecord.itemSize = sizeof(HFSPlusExtentRecord); - btRecord.itemCount = 1; - - err = BTSearchRecord(btFCB, btIterator, &btRecord, &btRecordSize, btIterator); - - if (err == noErr) { - BlockMoveData(extentData, &foundData, sizeof(HFSPlusExtentRecord)); - err = BTReplaceRecord(btFCB, btIterator, &btRecord, btRecordSize); - } - (void) BTFlushPath(btFCB); - } -#if CONFIG_HFS_STD - else { - /* HFS Standard */ - HFSExtentKey * key; // Actual extent key used on disk in HFS - HFSExtentRecord foundData; // The extent data actually found - - key = (HFSExtentKey*) &btIterator->key; - key->keyLength = kHFSExtentKeyMaximumLength; - key->forkType = extentFileKey->forkType; - key->fileID = extentFileKey->fileID; - key->startBlock = extentFileKey->startBlock; - - btIterator->hint.index = 0; - btIterator->hint.nodeNum = extentBTreeHint; - - btRecord.bufferAddress = &foundData; - btRecord.itemSize = sizeof(HFSExtentRecord); - btRecord.itemCount = 1; - - err = BTSearchRecord(btFCB, btIterator, &btRecord, &btRecordSize, btIterator); - - if (err == noErr) - err = HFSPlusToHFSExtents(extentData, (HFSExtentDescriptor *)&foundData); - - if (err == noErr) - err = 
BTReplaceRecord(btFCB, btIterator, &btRecord, btRecordSize); - (void) BTFlushPath(btFCB); - - } -#endif - - hfs_systemfile_unlock(vcb, lockflags); - - FREE(btIterator, M_TEMP); - } - - return err; -} - - - -#if CONFIG_HFS_STD -static OSErr HFSPlusToHFSExtents( - const HFSPlusExtentRecord oldExtents, - HFSExtentRecord newExtents) -{ - OSErr err; - - err = noErr; - - // copy the first 3 extents - newExtents[0].startBlock = oldExtents[0].startBlock; - newExtents[0].blockCount = oldExtents[0].blockCount; - newExtents[1].startBlock = oldExtents[1].startBlock; - newExtents[1].blockCount = oldExtents[1].blockCount; - newExtents[2].startBlock = oldExtents[2].startBlock; - newExtents[2].blockCount = oldExtents[2].blockCount; - - #if DEBUG_BUILD - if (oldExtents[3].startBlock || oldExtents[3].blockCount) { - DebugStr("ExtentRecord with > 3 extents is invalid for HFS"); - err = fsDSIntErr; - } - #endif - - return err; -} -#endif - - - -static OSErr GetFCBExtentRecord( - const FCB *fcb, - HFSPlusExtentRecord extents) -{ - - BlockMoveData(fcb->fcbExtents, extents, sizeof(HFSPlusExtentRecord)); - - return noErr; -} - - -//_________________________________________________________________________________ -// -// Routine: ExtentsAreIntegral -// -// Purpose: Ensure that each extent can hold an integral number of nodes -// Called by the NodesAreContiguous function -//_________________________________________________________________________________ - -static Boolean ExtentsAreIntegral( - const HFSPlusExtentRecord extentRecord, - u_int32_t mask, - u_int32_t *blocksChecked, - Boolean *checkedLastExtent) -{ - u_int32_t blocks; - u_int32_t extentIndex; - - *blocksChecked = 0; - *checkedLastExtent = false; - - for(extentIndex = 0; extentIndex < kHFSPlusExtentDensity; extentIndex++) - { - blocks = extentRecord[extentIndex].blockCount; - - if ( blocks == 0 ) - { - *checkedLastExtent = true; - break; - } - - *blocksChecked += blocks; - - if (blocks & mask) - return false; - } - - return true; 
-} - - -//_________________________________________________________________________________ -// -// Routine: NodesAreContiguous -// -// Purpose: Ensure that all b-tree nodes are contiguous on disk -// Called by BTOpenPath during volume mount -//_________________________________________________________________________________ - -Boolean NodesAreContiguous( - ExtendedVCB *vcb, - FCB *fcb, - u_int32_t nodeSize) -{ - u_int32_t mask; - u_int32_t startBlock; - u_int32_t blocksChecked; - u_int32_t hint; - HFSPlusExtentKey key; - HFSPlusExtentRecord extents; - OSErr result; - Boolean lastExtentReached; - int lockflags; - - - if (vcb->blockSize >= nodeSize) - return TRUE; - - mask = (nodeSize / vcb->blockSize) - 1; - - // check the local extents - (void) GetFCBExtentRecord(fcb, extents); - if ( !ExtentsAreIntegral(extents, mask, &blocksChecked, &lastExtentReached) ) - return FALSE; - - if ( lastExtentReached || - (int64_t)((int64_t)blocksChecked * (int64_t)vcb->blockSize) >= (int64_t)fcb->ff_size) - return TRUE; - - startBlock = blocksChecked; - - lockflags = hfs_systemfile_lock(vcb, SFL_EXTENTS, HFS_EXCLUSIVE_LOCK); - - // check the overflow extents (if any) - while ( !lastExtentReached ) - { - result = FindExtentRecord(vcb, kDataForkType, fcb->ff_cp->c_fileid, startBlock, FALSE, &key, extents, &hint); - if (result) break; - - if ( !ExtentsAreIntegral(extents, mask, &blocksChecked, &lastExtentReached) ) { - hfs_systemfile_unlock(vcb, lockflags); - return FALSE; - } - startBlock += blocksChecked; - } - hfs_systemfile_unlock(vcb, lockflags); - return TRUE; -} - diff --git a/bsd/hfs/hfscommon/Misc/VolumeAllocation.c b/bsd/hfs/hfscommon/Misc/VolumeAllocation.c deleted file mode 100644 index 612171809..000000000 --- a/bsd/hfs/hfscommon/Misc/VolumeAllocation.c +++ /dev/null @@ -1,6250 +0,0 @@ -/* - * Copyright (c) 2000-2015 Apple Inc. All rights reserved. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - File: VolumeAllocation.c - - Contains: Routines for accessing and modifying the volume bitmap. - - Version: HFS Plus 1.0 - - Copyright: � 1996-2009 by Apple Computer, Inc., all rights reserved. - -*/ - -/* -Public routines: - BlockAllocate / hfs_block_alloc - Allocate space on a volume. Can allocate space contiguously. - If not contiguous, then allocation may be less than what was - asked for. Returns the starting block number, and number of - blocks. It will only return a single extent. - - BlockDeallocate - Deallocate a contiguous run of allocation blocks. - - BlockMarkAllocated - Exported wrapper to mark blocks as in-use. 
This will correctly determine - whether or not the red-black tree is enabled and call the appropriate function - if applicable. - BlockMarkFree - Exported wrapper to mark blocks as freed. This will correctly determine whether or - not the red-black tree is enabled and call the appropriate function if applicable. - - - ResetVCBFreeExtCache - Since the red-black tree obviates the need to maintain the free extent cache, we do - not update it if the tree is also live. As a result, if we ever need to destroy the trees - we should reset the free extent cache so it doesn't confuse us when we need to fall back to the - bitmap scanning allocator. - We also reset and disable the free extent cache when volume resizing is - in flight. - - UpdateAllocLimit - Adjusts the AllocLimit field in the hfs mount point. This is used when we need to prevent - allocations from occupying space in the region we are modifying during a filesystem resize. - At other times, it should be consistent with the total number of allocation blocks in the - filesystem. It is also used to shrink or grow the number of blocks that the red-black tree should - know about. If growing, scan the new range of bitmap, and if shrinking, reduce the - number of items in the tree that we can allocate from. - - ScanUnmapBlocks - Traverse the entire allocation bitmap. Potentially issue DKIOCUNMAPs to the device as it - tracks unallocated ranges when iterating the volume bitmap. Additionally, build up the in-core - summary table of the allocation bitmap. - -Internal routines: - BlockMarkFreeInternal - Mark a contiguous range of blocks as free. The corresponding - bits in the volume bitmap will be cleared. This will actually do the work - of modifying the bitmap for us. - - BlockMarkAllocatedInternal - Mark a contiguous range of blocks as allocated. The cor- - responding bits in the volume bitmap are set. Also tests to see - if any of the blocks were previously unallocated. 
- BlockFindContiguous - Find a contiguous range of blocks of a given size. The caller - specifies where to begin the search (by block number). The - block number of the first block in the range is returned. This is only - called by the bitmap scanning logic as the red-black tree should be able - to do this internally by searching its tree. - BlockFindAny - Find and allocate a contiguous range of blocks up to a given size. The - first range of contiguous free blocks found are allocated, even if there - are fewer blocks than requested (and even if a contiguous range of blocks - of the given size exists elsewhere). - BlockFindAnyBitmap - Finds a range of blocks per the above requirements without using the - Allocation RB Tree. This relies on the bitmap-scanning logic in order to find - any valid range of free space needed. - BlockFindContig - Find a contiguous range of blocks of a given size. - If the minimum cannot be satisfied, nothing is - returned. - BlockFindKnown - Try to allocate space from known free space in the volume's - free extent cache. - ReadBitmapBlock - Given an allocation block number, read the bitmap block that - contains that allocation block into a caller-supplied buffer. - - ReleaseBitmapBlock - Release a bitmap block back into the buffer cache. - - ReadBitmapRange - Given an allocation block number, read a range of bitmap that - must begin at that allocation block into a caller supplied buffer. - - ReleaseBitmapRange - Release and invalidate a buf_t corresponding to the bitmap - back into the UBC in order to prevent coherency issues. - - remove_free_extent_cache - Remove an extent from the free extent cache. Handles overlaps - with multiple extents in the cache, and handles splitting an - extent in the cache if the extent to be removed is in the middle - of a cached extent. - - add_free_extent_cache - Add an extent to the free extent cache. It will merge the - input extent with extents already in the cache. 
- CheckUnmappedBytes - Check whether or not the current transaction - has allocated blocks that were recently freed. This may have data safety implications. - - - -Debug/Test Routines - hfs_isallocated - Test to see if any blocks in a range are allocated. Journal or - allocation file lock must be held. - - hfs_isallocated_scan - Test to see if any blocks in a range are allocated. Releases and - invalidates the block used when finished. - -Optimization Routines - hfs_alloc_scan_block - Given a starting allocation block number, figures out which physical block contains that - allocation block's bit, and scans it from the starting bit until either the ending bit or - the end of the block. Free space extents are inserted into the appropriate red-black tree. - -*/ - - -#include -#include - -#if !HFS_ALLOC_TEST - -#include "../../hfs_macos_defs.h" -#include -#include -#include -/* For VM Page size */ -#include -#include -#include "../../hfs.h" -#include "../../hfs_endian.h" -#include "../headers/FileMgrInternal.h" - -#endif // !HFS_ALLOC_TEST - -#include -#include -#include -#include - -#include "../../hfs_dbg.h" -#include "../../hfs_format.h" -#include "../../hfs_kdebug.h" -#include "../../rangelist.h" -#include "../../hfs_extents.h" - -/* Headers for unmap-on-mount support */ -#include - -#ifndef CONFIG_HFS_TRIM -#define CONFIG_HFS_TRIM 0 -#endif - -/* - * Use sysctl vfs.generic.hfs.kdebug.allocation to control which - * KERNEL_DEBUG_CONSTANT events are enabled at runtime. (They're - * disabled by default because there can be a lot of these events, - * and we don't want to overwhelm the kernel debug buffer. If you - * want to watch these events in particular, just set the sysctl.) 
- */ -static int hfs_kdebug_allocation = 0; -SYSCTL_DECL(_vfs_generic); -SYSCTL_NODE(_vfs_generic, OID_AUTO, hfs, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "HFS file system"); -SYSCTL_NODE(_vfs_generic_hfs, OID_AUTO, kdebug, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "HFS kdebug"); -SYSCTL_INT(_vfs_generic_hfs_kdebug, OID_AUTO, allocation, CTLFLAG_RW|CTLFLAG_LOCKED, &hfs_kdebug_allocation, 0, "Enable kdebug logging for HFS allocations"); -enum { - /* - * HFSDBG_ALLOC_ENABLED: Log calls to BlockAllocate and - * BlockDeallocate, including the internal BlockAllocateXxx - * routines so we can see how an allocation was satisfied. - * - * HFSDBG_EXT_CACHE_ENABLED: Log routines that read or write the - * free extent cache. - * - * HFSDBG_UNMAP_ENABLED: Log events involving the trim list. - * - * HFSDBG_BITMAP_ENABLED: Log accesses to the volume bitmap (setting - * or clearing bits, scanning the bitmap). - */ - HFSDBG_ALLOC_ENABLED = 1, - HFSDBG_EXT_CACHE_ENABLED = 2, - HFSDBG_UNMAP_ENABLED = 4, - HFSDBG_BITMAP_ENABLED = 8 -}; - -enum { - kBytesPerWord = 4, - kBitsPerByte = 8, - kBitsPerWord = 32, - - kBitsWithinWordMask = kBitsPerWord-1 -}; - -#define kLowBitInWordMask 0x00000001ul -#define kHighBitInWordMask 0x80000000ul -#define kAllBitsSetInWord 0xFFFFFFFFul - -#define HFS_MIN_SUMMARY_BLOCKSIZE 4096 - -#define ALLOC_DEBUG 0 - -static OSErr ReadBitmapBlock( - ExtendedVCB *vcb, - u_int32_t bit, - u_int32_t **buffer, - uintptr_t *blockRef, - hfs_block_alloc_flags_t flags); - -static OSErr ReleaseBitmapBlock( - ExtendedVCB *vcb, - uintptr_t blockRef, - Boolean dirty); - -static OSErr hfs_block_alloc_int(hfsmount_t *hfsmp, - HFSPlusExtentDescriptor *extent, - hfs_block_alloc_flags_t flags, - hfs_alloc_extra_args_t *ap); - -static OSErr BlockFindAny( - ExtendedVCB *vcb, - u_int32_t startingBlock, - u_int32_t endingBlock, - u_int32_t maxBlocks, - hfs_block_alloc_flags_t flags, - Boolean trustSummary, - u_int32_t *actualStartBlock, - u_int32_t *actualNumBlocks); - -static OSErr BlockFindAnyBitmap( - 
ExtendedVCB *vcb, - u_int32_t startingBlock, - u_int32_t endingBlock, - u_int32_t maxBlocks, - hfs_block_alloc_flags_t flags, - u_int32_t *actualStartBlock, - u_int32_t *actualNumBlocks); - -static OSErr BlockFindContig( - ExtendedVCB *vcb, - u_int32_t startingBlock, - u_int32_t minBlocks, - u_int32_t maxBlocks, - hfs_block_alloc_flags_t flags, - u_int32_t *actualStartBlock, - u_int32_t *actualNumBlocks); - -static OSErr BlockFindContiguous( - ExtendedVCB *vcb, - u_int32_t startingBlock, - u_int32_t endingBlock, - u_int32_t minBlocks, - u_int32_t maxBlocks, - Boolean useMetaZone, - Boolean trustSummary, - u_int32_t *actualStartBlock, - u_int32_t *actualNumBlocks, - hfs_block_alloc_flags_t flags); - -static OSErr BlockFindKnown( - ExtendedVCB *vcb, - u_int32_t maxBlocks, - u_int32_t *actualStartBlock, - u_int32_t *actualNumBlocks); - -static OSErr hfs_alloc_try_hard(hfsmount_t *hfsmp, - HFSPlusExtentDescriptor *extent, - uint32_t max_blocks, - hfs_block_alloc_flags_t flags); - -static OSErr BlockMarkAllocatedInternal ( - ExtendedVCB *vcb, - u_int32_t startingBlock, - u_int32_t numBlocks, - hfs_block_alloc_flags_t flags); - -static OSErr BlockMarkFreeInternal( - ExtendedVCB *vcb, - u_int32_t startingBlock, - u_int32_t numBlocks, - Boolean do_validate); - - -static OSErr ReadBitmapRange (struct hfsmount *hfsmp, uint32_t offset, uint32_t iosize, - uint32_t **buffer, struct buf **blockRef); - -static OSErr ReleaseScanBitmapRange( struct buf *bp ); - -static int hfs_track_unmap_blocks (struct hfsmount *hfsmp, u_int32_t offset, - u_int32_t numBlocks, struct jnl_trim_list *list); - -static int hfs_issue_unmap (struct hfsmount *hfsmp, struct jnl_trim_list *list); - -static int hfs_alloc_scan_range(struct hfsmount *hfsmp, - u_int32_t startbit, - u_int32_t *bitToScan, - struct jnl_trim_list *list); - -static int hfs_scan_range_size (struct hfsmount* hfsmp, uint32_t start, uint32_t *iosize); -static uint32_t CheckUnmappedBytes (struct hfsmount *hfsmp, uint64_t blockno, 
uint64_t numblocks, int *recent, uint32_t *next); - -/* Bitmap Re-use Detection */ -static inline int extents_overlap (uint32_t start1, uint32_t len1, - uint32_t start2, uint32_t len2) { - return !( ((start1 + len1) <= start2) || ((start2 + len2) <= start1) ); -} - - -int hfs_isallocated_scan (struct hfsmount *hfsmp, - u_int32_t startingBlock, - u_int32_t *bp_buf); - -/* Summary Table Functions */ -static int hfs_set_summary (struct hfsmount *hfsmp, uint32_t summarybit, uint32_t inuse); -static int hfs_get_summary_index (struct hfsmount *hfsmp, uint32_t block, uint32_t *index); -static int hfs_find_summary_free (struct hfsmount *hfsmp, uint32_t block, uint32_t *newblock); -static int hfs_get_summary_allocblock (struct hfsmount *hfsmp, uint32_t summarybit, uint32_t *alloc); -static int hfs_release_summary (struct hfsmount *hfsmp, uint32_t start, uint32_t length); -static int hfs_check_summary (struct hfsmount *hfsmp, uint32_t start, uint32_t *freeblocks); -static int hfs_rebuild_summary (struct hfsmount *hfsmp); - -#if 0 -static int hfs_get_next_summary (struct hfsmount *hfsmp, uint32_t block, uint32_t *newblock); -#endif - -/* Used in external mount code to initialize the summary table */ -int hfs_init_summary (struct hfsmount *hfsmp); - -#if ALLOC_DEBUG -void hfs_validate_summary (struct hfsmount *hfsmp); -#endif - - -/* Functions for manipulating free extent cache */ -static void remove_free_extent_cache(struct hfsmount *hfsmp, u_int32_t startBlock, u_int32_t blockCount); -static Boolean add_free_extent_cache(struct hfsmount *hfsmp, u_int32_t startBlock, u_int32_t blockCount); -static void sanity_check_free_ext(struct hfsmount *hfsmp, int check_allocated); - -static void hfs_release_reserved(hfsmount_t *hfsmp, struct rl_entry *range, int list); - -/* Functions for getting free exents */ - -typedef struct bitmap_context { - void *bitmap; // current bitmap chunk - uint32_t run_offset; // offset (in bits) from start of bitmap to start of current run - uint32_t 
chunk_current; // next bit to scan in the chunk - uint32_t chunk_end; // number of valid bits in this chunk - struct hfsmount *hfsmp; - struct buf *bp; - uint32_t last_free_summary_bit; // last marked free summary bit - int lockflags; - uint64_t lock_start; -} bitmap_context_t; - - -static errno_t get_more_bits(bitmap_context_t *bitmap_ctx); -static int bit_count_set(void *bitmap, int start, int end); -static int bit_count_clr(void *bitmap, int start, int end); -static errno_t hfs_bit_count(bitmap_context_t *bitmap_ctx, int (*fn)(void *, int ,int), uint32_t *bit_count); -static errno_t hfs_bit_count_set(bitmap_context_t *bitmap_ctx, uint32_t *count); -static errno_t hfs_bit_count_clr(bitmap_context_t *bitmap_ctx, uint32_t *count); -static errno_t update_summary_table(bitmap_context_t *bitmap_ctx, uint32_t start, uint32_t count, bool set); -static int clzll(uint64_t x); - -#if ALLOC_DEBUG -/* - * Validation Routine to verify that the TRIM list maintained by the journal - * is in good shape relative to what we think the bitmap should have. We should - * never encounter allocated blocks in the TRIM list, so if we ever encounter them, - * we panic. 
- */ -int trim_validate_bitmap (struct hfsmount *hfsmp); -int trim_validate_bitmap (struct hfsmount *hfsmp) { - u_int64_t blockno_offset; - u_int64_t numblocks; - int i; - int count; - u_int32_t startblk; - u_int32_t blks; - int err = 0; - uint32_t alloccount = 0; - - if (hfsmp->jnl) { - struct journal *jnl = (struct journal*)hfsmp->jnl; - if (jnl->active_tr) { - struct jnl_trim_list *trim = &(jnl->active_tr->trim); - count = trim->extent_count; - for (i = 0; i < count; i++) { - blockno_offset = trim->extents[i].offset; - blockno_offset = blockno_offset - (uint64_t)hfsmp->hfsPlusIOPosOffset; - blockno_offset = blockno_offset / hfsmp->blockSize; - numblocks = trim->extents[i].length / hfsmp->blockSize; - - startblk = (u_int32_t)blockno_offset; - blks = (u_int32_t) numblocks; - err = hfs_count_allocated (hfsmp, startblk, blks, &alloccount); - - if (err == 0 && alloccount != 0) { - panic ("trim_validate_bitmap: %d blocks @ ABN %d are allocated!", alloccount, startblk); - } - } - } - } - return 0; -} - -#endif - - -/* - ;________________________________________________________________________________ - ; - ; Routine: hfs_unmap_free_extent - ; - ; Function: Make note of a range of allocation blocks that should be - ; unmapped (trimmed). That is, the given range of blocks no - ; longer have useful content, and the device can unmap the - ; previous contents. For example, a solid state disk may reuse - ; the underlying storage for other blocks. - ; - ; This routine is only supported for journaled volumes. The extent - ; being freed is passed to the journal code, and the extent will - ; be unmapped after the current transaction is written to disk. - ; - ; Input Arguments: - ; hfsmp - The volume containing the allocation blocks. - ; startingBlock - The first allocation block of the extent being freed. - ; numBlocks - The number of allocation blocks of the extent being freed. 
- ;________________________________________________________________________________ - */ -static void hfs_unmap_free_extent(struct hfsmount *hfsmp, u_int32_t startingBlock, u_int32_t numBlocks) -{ - u_int64_t offset; - u_int64_t length; - u_int64_t device_sz; - int err = 0; - - if (hfs_kdebug_allocation & HFSDBG_UNMAP_ENABLED) - KERNEL_DEBUG_CONSTANT(HFSDBG_UNMAP_FREE | DBG_FUNC_START, startingBlock, numBlocks, 0, 0, 0); - - if (ALLOC_DEBUG) { - if (hfs_isallocated(hfsmp, startingBlock, numBlocks)) { - panic("hfs: %p: (%u,%u) unmapping allocated blocks", hfsmp, startingBlock, numBlocks); - } - } - - if (hfsmp->jnl != NULL) { - device_sz = hfsmp->hfs_logical_bytes; - offset = (u_int64_t) startingBlock * hfsmp->blockSize + (u_int64_t) hfsmp->hfsPlusIOPosOffset; - length = (u_int64_t) numBlocks * hfsmp->blockSize; - - /* Validate that the trim is in a valid range of bytes */ - if ((offset >= device_sz) || ((offset + length) > device_sz)) { - printf("hfs_unmap_free_ext: ignoring trim vol=%s @ off %lld len %lld \n", hfsmp->vcbVN, offset, length); - err = EINVAL; - } - - if (err == 0) { - err = journal_trim_add_extent(hfsmp->jnl, offset, length); - if (err) { - printf("hfs_unmap_free_extent: error %d from journal_trim_add_extent for vol=%s", err, hfsmp->vcbVN); - } - } - } - - if (hfs_kdebug_allocation & HFSDBG_UNMAP_ENABLED) - KERNEL_DEBUG_CONSTANT(HFSDBG_UNMAP_FREE | DBG_FUNC_END, err, 0, 0, 0, 0); -} - -/* - ;________________________________________________________________________________ - ; - ; Routine: hfs_track_unmap_blocks - ; - ; Function: Make note of a range of allocation blocks that should be - ; unmapped (trimmed). That is, the given range of blocks no - ; longer have useful content, and the device can unmap the - ; previous contents. For example, a solid state disk may reuse - ; the underlying storage for other blocks. - ; - ; This routine is only supported for journaled volumes. 
- ; - ; *****NOTE*****: - ; This function should *NOT* be used when the volume is fully - ; mounted. This function is intended to support a bitmap iteration - ; at mount time to fully inform the SSD driver of the state of all blocks - ; at mount time, and assumes that there is no allocation/deallocation - ; interference during its iteration., - ; - ; Input Arguments: - ; hfsmp - The volume containing the allocation blocks. - ; offset - The first allocation block of the extent being freed. - ; numBlocks - The number of allocation blocks of the extent being freed. - ; list - The list of currently tracked trim ranges. - ;________________________________________________________________________________ - */ -static int hfs_track_unmap_blocks (struct hfsmount *hfsmp, u_int32_t start, - u_int32_t numBlocks, struct jnl_trim_list *list) { - - u_int64_t offset; - u_int64_t length; - int error = 0; - - if ((hfsmp->hfs_flags & HFS_UNMAP) && (hfsmp->jnl != NULL) && list->allocated_count && list->extents != NULL) { - int extent_no = list->extent_count; - offset = (u_int64_t) start * hfsmp->blockSize + (u_int64_t) hfsmp->hfsPlusIOPosOffset; - length = (u_int64_t) numBlocks * hfsmp->blockSize; - - - list->extents[extent_no].offset = offset; - list->extents[extent_no].length = length; - list->extent_count++; - if (list->extent_count == list->allocated_count) { - error = hfs_issue_unmap (hfsmp, list); - } - } - - return error; -} - -/* - ;________________________________________________________________________________ - ; - ; Routine: hfs_issue_unmap - ; - ; Function: Issue a DKIOCUNMAP for all blocks currently tracked by the jnl_trim_list - ; - ; Input Arguments: - ; hfsmp - The volume containing the allocation blocks. - ; list - The list of currently tracked trim ranges. 
- ;________________________________________________________________________________ - */ - -static int hfs_issue_unmap (struct hfsmount *hfsmp, struct jnl_trim_list *list) -{ - dk_unmap_t unmap; - int error = 0; - - if (hfs_kdebug_allocation & HFSDBG_UNMAP_ENABLED) { - KERNEL_DEBUG_CONSTANT(HFSDBG_UNMAP_SCAN_TRIM | DBG_FUNC_START, hfsmp->hfs_raw_dev, 0, 0, 0, 0); - } - - if (list->extent_count > 0 && list->extents != NULL) { - bzero(&unmap, sizeof(unmap)); - unmap.extents = list->extents; - unmap.extentsCount = list->extent_count; - - if (hfs_kdebug_allocation & HFSDBG_UNMAP_ENABLED) { - KERNEL_DEBUG_CONSTANT(HFSDBG_UNMAP_SCAN_TRIM | DBG_FUNC_NONE, hfsmp->hfs_raw_dev, unmap.extentsCount, 0, 0, 0); - } - -#if CONFIG_PROTECT - /* - * If we have not yet completed the first scan through the bitmap, then - * optionally inform the block driver below us that this is an initialization - * TRIM scan, if it can deal with this information. - */ - if ((hfsmp->scan_var & HFS_ALLOCATOR_SCAN_COMPLETED) == 0) { - unmap.options |= _DK_UNMAP_INITIALIZE; - } -#endif - /* Issue a TRIM and flush them out */ - error = VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCUNMAP, (caddr_t)&unmap, 0, vfs_context_kernel()); - - bzero (list->extents, (list->allocated_count * sizeof(dk_extent_t))); - bzero (&unmap, sizeof(unmap)); - list->extent_count = 0; - } - - if (hfs_kdebug_allocation & HFSDBG_UNMAP_ENABLED) { - KERNEL_DEBUG_CONSTANT(HFSDBG_UNMAP_SCAN_TRIM | DBG_FUNC_END, error, hfsmp->hfs_raw_dev, 0, 0, 0); - } - - return error; -} - -/* - ;________________________________________________________________________________ - ; - ; Routine: hfs_unmap_alloc_extent - ; - ; Function: Make note of a range of allocation blocks, some of - ; which may have previously been passed to hfs_unmap_free_extent, - ; is now in use on the volume. The given blocks will be removed - ; from any pending DKIOCUNMAP. - ; - ; Input Arguments: - ; hfsmp - The volume containing the allocation blocks. 
- ; startingBlock - The first allocation block of the extent being allocated. - ; numBlocks - The number of allocation blocks being allocated. - ;________________________________________________________________________________ - */ - -static void hfs_unmap_alloc_extent(struct hfsmount *hfsmp, u_int32_t startingBlock, u_int32_t numBlocks) -{ - u_int64_t offset; - u_int64_t length; - int err = 0; - - if (hfs_kdebug_allocation & HFSDBG_UNMAP_ENABLED) - KERNEL_DEBUG_CONSTANT(HFSDBG_UNMAP_ALLOC | DBG_FUNC_START, startingBlock, numBlocks, 0, 0, 0); - - if (hfsmp->jnl != NULL) { - offset = (u_int64_t) startingBlock * hfsmp->blockSize + (u_int64_t) hfsmp->hfsPlusIOPosOffset; - length = (u_int64_t) numBlocks * hfsmp->blockSize; - - err = journal_trim_remove_extent(hfsmp->jnl, offset, length); - if (err) { - printf("hfs_unmap_alloc_extent: error %d from journal_trim_remove_extent for vol=%s", err, hfsmp->vcbVN); - } - } - - if (hfs_kdebug_allocation & HFSDBG_UNMAP_ENABLED) - KERNEL_DEBUG_CONSTANT(HFSDBG_UNMAP_ALLOC | DBG_FUNC_END, err, 0, 0, 0, 0); -} - - -/* -;________________________________________________________________________________ -; -; Routine: hfs_trim_callback -; -; Function: This function is called when a transaction that freed extents -; (via hfs_unmap_free_extent/journal_trim_add_extent) has been -; written to the on-disk journal. This routine will add those -; extents to the free extent cache so that they can be reused. -; -; CAUTION: This routine is called while the journal's trim lock -; is held shared, so that no other thread can reuse any portion -; of those extents. We must be very careful about which locks -; we take from within this callback, to avoid deadlock. The -; call to add_free_extent_cache will end up taking the cache's -; lock (just long enough to add these extents to the cache). 
-; -; CAUTION: If the journal becomes invalid (eg., due to an I/O -; error when trying to write to the journal), this callback -; will stop getting called, even if extents got freed before -; the journal became invalid! -; -; Input Arguments: -; arg - The hfsmount of the volume containing the extents. -; extent_count - The number of extents freed in the transaction. -; extents - An array of extents (byte ranges) that were freed. -;________________________________________________________________________________ -*/ - -__private_extern__ void -hfs_trim_callback(void *arg, uint32_t extent_count, const dk_extent_t *extents) -{ - uint32_t i; - uint32_t startBlock, numBlocks; - struct hfsmount *hfsmp = arg; - - if (hfs_kdebug_allocation & HFSDBG_UNMAP_ENABLED) - KERNEL_DEBUG_CONSTANT(HFSDBG_UNMAP_CALLBACK | DBG_FUNC_START, 0, extent_count, 0, 0, 0); - - for (i=0; ihfsPlusIOPosOffset) / hfsmp->blockSize; - numBlocks = extents[i].length / hfsmp->blockSize; - (void) add_free_extent_cache(hfsmp, startBlock, numBlocks); - } - - if (hfs_kdebug_allocation & HFSDBG_UNMAP_ENABLED) - KERNEL_DEBUG_CONSTANT(HFSDBG_UNMAP_CALLBACK | DBG_FUNC_END, 0, 0, 0, 0, 0); -} - - -/* - ;________________________________________________________________________________ - ; - ; Routine: CheckUnmappedBytes - ; - ; Function: From the specified inputs, determine if the extent in question overlaps - ; space that was recently freed, where the recently freed space may still be - ; lingering in an uncommitted journal transaction. This may have data safety - ; implications. The intended use is to decide whether or not to force a journal flush - ; before allowing file data I/O to be issued. If we did not do this - ; then it would be possible to issue the file I/O ahead of the - ; journal, resulting in data being overwritten if the transaction either - ; is not committed or cannot be replayed. - ; - ; NOTE: This function assumes that the journal and catalog/extent locks are held. 
- ; - ; Input Arguments: - ; hfsmp - The volume containing the allocation blocks. - ; foffset - start of the extent in question (in allocation blocks) - ; numbytes - number of blocks in the extent. - ; recently_freed: - output pointer containing whether or not the blocks were freed recently - ; overlap_end - end of the overlap between the argument extent and the trim list (in allocation blocks) - ; - ; Output: - ; - ; Returns 0 if we could determine extent validity for this (or a previous transaction) - ; Returns errno if there was an error - ; - ; If returned 0, then recently freed will contain a boolean that indicates - ; that it was recently freed. - ;________________________________________________________________________________ - */ - -u_int32_t -CheckUnmappedBytes (struct hfsmount *hfsmp, uint64_t blockno, uint64_t numblocks, int *recently_freed, uint32_t *overlap_end) { - uint64_t device_offset; - uint64_t numbytes; - uint32_t err = 0; - uint64_t lba_overlap_end; - - if (hfsmp->jnl != NULL) { - /* - * Convert the allocation block # and the number of blocks into device-relative - * offsets so that they can be compared using the TRIM list. - */ - uint64_t device_sz = hfsmp->hfs_logical_bytes; - device_offset = blockno * ((uint64_t)hfsmp->blockSize); - device_offset += hfsmp->hfsPlusIOPosOffset; - numbytes = (((uint64_t)hfsmp->blockSize) * numblocks); - - /* - * Since we check that the device_offset isn't too large, it's safe to subtract it - * from the size in the second check. 
- */ - if ((device_offset >= device_sz) || (numbytes > (device_sz - device_offset))) { - return EINVAL; - } - - /* Ask the journal if this extent overlaps with any pending TRIMs */ - if (journal_trim_extent_overlap (hfsmp->jnl, device_offset, numbytes, &lba_overlap_end)) { - *recently_freed = 1; - - /* Convert lba_overlap_end back into allocation blocks */ - uint64_t end_offset = lba_overlap_end - hfsmp->hfsPlusIOPosOffset; - end_offset = end_offset / ((uint64_t) hfsmp->blockSize); - *overlap_end = (uint32_t) end_offset; - } - else { - *recently_freed = 0; - } - err = 0; - } - else { - /* There may not be a journal. In that case, always return success. */ - *recently_freed = 0; - } - return err; - -} - - -/* - ;________________________________________________________________________________ - ; - ; Routine: ScanUnmapBlocks - ; - ; Function: Traverse the bitmap, and potentially issue DKIOCUNMAPs to the underlying - ; device as needed so that the underlying disk device is as - ; up-to-date as possible with which blocks are unmapped. - ; Additionally build up the summary table as needed. - ; - ; This function reads the bitmap in large block size - ; (up to 1MB) unlink the runtime which reads the bitmap - ; in 4K block size. So if this function is being called - ; after the volume is mounted and actively modified, the - ; caller needs to invalidate all of the existing buffers - ; associated with the bitmap vnode before calling this - ; function. If the buffers are not invalidated, it can - ; cause but_t collision and potential data corruption. - ; - ; Input Arguments: - ; hfsmp - The volume containing the allocation blocks. 
- ;________________________________________________________________________________ - */ - -__private_extern__ -u_int32_t ScanUnmapBlocks (struct hfsmount *hfsmp) -{ - u_int32_t blocks_scanned = 0; - int error = 0; - struct jnl_trim_list trimlist; - - if (hfs_kdebug_allocation & HFSDBG_UNMAP_ENABLED) { - KERNEL_DEBUG_CONSTANT(HFSDBG_UNMAP_SCAN | DBG_FUNC_START, hfsmp->hfs_raw_dev, 0, 0, 0, 0); - } - - /* - *struct jnl_trim_list { - uint32_t allocated_count; - uint32_t extent_count; - dk_extent_t *extents; - }; - */ - bzero (&trimlist, sizeof(trimlist)); - - /* - * The scanning itself here is not tied to the presence of CONFIG_HFS_TRIM - * which is now enabled for most architectures. Instead, any trim related - * work should be tied to whether the underlying storage media supports - * UNMAP, as any solid state device would on desktop or embedded. - * - * We do this because we may want to scan the full bitmap on desktop - * for spinning media for the purposes of building up the - * summary table. - * - * We also avoid sending TRIMs down to the underlying media if the mount is read-only. 
- */ - - if ((hfsmp->hfs_flags & HFS_UNMAP) && - ((hfsmp->hfs_flags & HFS_READ_ONLY) == 0)) { - /* If the underlying device supports unmap and the mount is read-write, initialize */ - int alloc_count = PAGE_SIZE / sizeof(dk_extent_t); - void *extents = kalloc (alloc_count * sizeof(dk_extent_t)); - if (extents == NULL) { - return ENOMEM; - } - trimlist.extents = (dk_extent_t*)extents; - trimlist.allocated_count = alloc_count; - trimlist.extent_count = 0; - } - - while ((blocks_scanned < hfsmp->totalBlocks) && (error == 0)){ - - error = hfs_alloc_scan_range (hfsmp, blocks_scanned, &blocks_scanned, &trimlist); - - if (error) { - printf("HFS: bitmap scan range error: %d on vol=%s\n", error, hfsmp->vcbVN); - break; - } - } - - if ((hfsmp->hfs_flags & HFS_UNMAP) && - ((hfsmp->hfs_flags & HFS_READ_ONLY) == 0)) { - if (error == 0) { - hfs_issue_unmap(hfsmp, &trimlist); - } - if (trimlist.extents) { - kfree (trimlist.extents, (trimlist.allocated_count * sizeof(dk_extent_t))); - } - } - - /* - * This is in an #if block because hfs_validate_summary prototype and function body - * will only show up if ALLOC_DEBUG is on, to save wired memory ever so slightly. - */ -#if ALLOC_DEBUG - sanity_check_free_ext(hfsmp, 1); - if (hfsmp->hfs_flags & HFS_SUMMARY_TABLE) { - /* Validate the summary table too! 
*/ - hfs_validate_summary(hfsmp); - printf("HFS: Summary validation complete on %s\n", hfsmp->vcbVN); - } -#endif - - if (hfs_kdebug_allocation & HFSDBG_UNMAP_ENABLED) { - KERNEL_DEBUG_CONSTANT(HFSDBG_UNMAP_SCAN | DBG_FUNC_END, error, hfsmp->hfs_raw_dev, 0, 0, 0); - } - - return error; -} - -static void add_to_reserved_list(hfsmount_t *hfsmp, uint32_t start, - uint32_t count, int list, - struct rl_entry **reservation) -{ - struct rl_entry *range, *next_range; - - if (list == HFS_TENTATIVE_BLOCKS) { - int nranges = 0; - // Don't allow more than 4 tentative reservations - TAILQ_FOREACH_SAFE(range, &hfsmp->hfs_reserved_ranges[HFS_TENTATIVE_BLOCKS], - rl_link, next_range) { - if (++nranges > 3) - hfs_release_reserved(hfsmp, range, HFS_TENTATIVE_BLOCKS); - } - } - - MALLOC(range, struct rl_entry *, sizeof(*range), M_TEMP, M_WAITOK); - range->rl_start = start; - range->rl_end = start + count - 1; - TAILQ_INSERT_HEAD(&hfsmp->hfs_reserved_ranges[list], range, rl_link); - *reservation = range; -} - -static void hfs_release_reserved(hfsmount_t *hfsmp, - struct rl_entry *range, - int list) -{ - if (range->rl_start == -1) - return; - - TAILQ_REMOVE(&hfsmp->hfs_reserved_ranges[list], range, rl_link); - - if (rl_len(range) > 0) { - if (list == HFS_TENTATIVE_BLOCKS) - hfsmp->tentativeBlocks -= rl_len(range); - else { - /* - * We don't need to unmap tentative blocks because we won't have - * written to them, but we might have written to reserved blocks. - * Nothing can refer to those blocks so this doesn't have to be - * via the journal. If this proves to be too expensive, we could - * consider not sending down the unmap or we could require this - * to always be called within a transaction and then we can use - * the journal. 
- */ - dk_extent_t extent = { - .offset = (hfs_blk_to_bytes(range->rl_start, hfsmp->blockSize) - + hfsmp->hfsPlusIOPosOffset), - .length = hfs_blk_to_bytes(rl_len(range), hfsmp->blockSize) - }; - dk_unmap_t unmap = { - .extents = &extent, - .extentsCount = 1, - }; - VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCUNMAP, (caddr_t)&unmap, - 0, vfs_context_kernel()); - assert(hfsmp->lockedBlocks >= rl_len(range)); - hfsmp->lockedBlocks -= rl_len(range); - } - hfs_release_summary(hfsmp, range->rl_start, rl_len(range)); - add_free_extent_cache(hfsmp, range->rl_start, rl_len(range)); - } - - range->rl_start = -1; - range->rl_end = -2; -} - -static void hfs_free_locked_internal(hfsmount_t *hfsmp, - struct rl_entry **reservation, - int list) -{ - if (*reservation) { - hfs_release_reserved(hfsmp, *reservation, list); - FREE(*reservation, M_TEMP); - *reservation = NULL; - } -} - -void hfs_free_tentative(hfsmount_t *hfsmp, struct rl_entry **reservation) -{ - hfs_free_locked_internal(hfsmp, reservation, HFS_TENTATIVE_BLOCKS); -} - -void hfs_free_locked(hfsmount_t *hfsmp, struct rl_entry **reservation) -{ - hfs_free_locked_internal(hfsmp, reservation, HFS_LOCKED_BLOCKS); -} - -OSErr BlockAllocate ( - hfsmount_t *hfsmp, /* which volume to allocate space on */ - u_int32_t startingBlock, /* preferred starting block, or 0 for no preference */ - u_int32_t minBlocks, /* desired number of blocks to allocate */ - u_int32_t maxBlocks, /* maximum number of blocks to allocate */ - hfs_block_alloc_flags_t flags, /* option flags */ - u_int32_t *actualStartBlock, /* actual first block of allocation */ - u_int32_t *actualNumBlocks) -{ - hfs_alloc_extra_args_t extra_args = { - .max_blocks = maxBlocks - }; - - HFSPlusExtentDescriptor extent = { startingBlock, minBlocks }; - - OSErr err = hfs_block_alloc_int(hfsmp, &extent, flags, &extra_args); - - *actualStartBlock = extent.startBlock; - *actualNumBlocks = extent.blockCount; - - return err; -} - -errno_t hfs_block_alloc(hfsmount_t *hfsmp, - 
HFSPlusExtentDescriptor *extent, - hfs_block_alloc_flags_t flags, - hfs_alloc_extra_args_t *ap) -{ - return MacToVFSError(hfs_block_alloc_int(hfsmp, extent, flags, ap)); -} - -/* - ;________________________________________________________________________________ - ; - ; Routine: hfs_block_alloc_int - ; - ; Function: Allocate space on a volume. If contiguous allocation is requested, - ; at least the requested number of bytes will be allocated or an - ; error will be returned. If contiguous allocation is not forced, - ; the space will be allocated with the first largest extent available - ; at the requested starting allocation block. If there is not enough - ; room there, a block allocation of less than the requested size will be - ; allocated. - ; - ; If the requested starting block is 0 (for new file allocations), - ; the volume's allocation block pointer will be used as a starting - ; point. - ; - ; Input Arguments: - ; hfsmp - Pointer to the HFS mount structure. - ; extent - startBlock indicates the block to start - ; searching from and blockCount is the number of - ; blocks required. Depending on the flags used, - ; more or less blocks may be returned. The - ; allocated extent is returned via this - ; parameter. - ; flags - Flags to specify options like contiguous, use - ; metadata zone, skip free block check, etc. - ; ap - Additional arguments used depending on flags. - ; See hfs_alloc_extra_args_t and below. - ; - ; Output: - ; (result) - Error code, zero for successful allocation - ; extent - If successful, the allocated extent. - ; - ; Side effects: - ; The volume bitmap is read and updated; the volume bitmap cache may be changed. - ; - ; HFS_ALLOC_TENTATIVE - ; Blocks will be reserved but not marked allocated. They can be - ; stolen if free space is limited. Tentative blocks can be used by - ; passing HFS_ALLOC_USE_TENTATIVE and passing in the resevation. - ; @ap->reservation_out is used to store the reservation. 
- ; - ; HFS_ALLOC_USE_TENTATIVE - ; Use blocks previously returned with HFS_ALLOC_TENTATIVE. - ; @ap->reservation_in should be set to whatever @ap->reservation_out - ; was set to when HFS_ALLOC_TENTATIVE was used. If the tentative - ; reservation was stolen, a normal allocation will take place. - ; - ; HFS_ALLOC_LOCKED - ; Blocks will be reserved but not marked allocated. Unlike tentative - ; reservations they cannot be stolen. It is safe to write to these - ; blocks. @ap->reservation_out is used to store the reservation. - ; - ; HFS_ALLOC_COMMIT - ; This will take blocks previously returned with HFS_ALLOC_LOCKED and - ; mark them allocated on disk. @ap->reservation_in is used. - ; - ; HFS_ALLOC_ROLL_BACK - ; Take blocks that were just recently deallocated and mark them - ; allocated. This is for roll back situations. Blocks got - ; deallocated and then something went wrong and we need to roll back - ; by marking the blocks allocated. - ; - ; HFS_ALLOC_FORCECONTIG - ; It will not return fewer than @min_blocks. - ; - ; HFS_ALLOC_TRY_HARD - ; We will perform an exhaustive search to try and find @max_blocks. - ; It will not return fewer than @min_blocks. - ; - ;________________________________________________________________________________ - */ -OSErr hfs_block_alloc_int(hfsmount_t *hfsmp, - HFSPlusExtentDescriptor *extent, - hfs_block_alloc_flags_t flags, - hfs_alloc_extra_args_t *ap) -{ - u_int32_t freeBlocks; - OSErr err = 0; - Boolean updateAllocPtr = false; // true if nextAllocation needs to be updated - Boolean useMetaZone; - Boolean forceContiguous = false; - Boolean forceFlush; - - uint32_t startingBlock = extent->startBlock; - uint32_t minBlocks = extent->blockCount; - uint32_t maxBlocks = (ap && ap->max_blocks) ? 
ap->max_blocks : minBlocks; - - if (hfs_kdebug_allocation & HFSDBG_ALLOC_ENABLED) - KERNEL_DEBUG_CONSTANT(HFSDBG_BLOCK_ALLOCATE | DBG_FUNC_START, startingBlock, minBlocks, maxBlocks, flags, 0); - - if (ISSET(flags, HFS_ALLOC_COMMIT)) { - extent->startBlock = (*ap->reservation_in)->rl_start; - extent->blockCount = rl_len(*ap->reservation_in); - goto mark_allocated; - } - - if (ISSET(flags, HFS_ALLOC_ROLL_BACK)) - goto mark_allocated; - - freeBlocks = hfs_freeblks(hfsmp, 0); - - if (ISSET(flags, HFS_ALLOC_USE_TENTATIVE)) { - struct rl_entry *range = *ap->reservation_in; - - if (range && range->rl_start != -1) { - /* - * It's possible that we have a tentative reservation - * but there aren't enough free blocks due to loaned blocks - * or insufficient space in the backing store. - */ - uint32_t count = min(min(maxBlocks, rl_len(range)), freeBlocks); - - if (count >= minBlocks) { - extent->startBlock = range->rl_start; - extent->blockCount = count; - - // Should we go straight to commit? - if (!ISSET(flags, HFS_ALLOC_LOCKED)) - SET(flags, HFS_ALLOC_COMMIT); - - goto mark_allocated; - } - } - - /* - * We can't use the tentative reservation so free it and allocate - * normally. - */ - hfs_free_tentative(hfsmp, ap->reservation_in); - CLR(flags, HFS_ALLOC_USE_TENTATIVE); - } - - if (ISSET(flags, HFS_ALLOC_FORCECONTIG | HFS_ALLOC_TRY_HARD)) - forceContiguous = true; - - if (flags & HFS_ALLOC_METAZONE) { - useMetaZone = true; - } else { - useMetaZone = false; - } - - if (flags & HFS_ALLOC_FLUSHTXN) { - forceFlush = true; - } - else { - forceFlush = false; - } - - assert(hfsmp->freeBlocks >= hfsmp->tentativeBlocks); - - // See if we have to steal tentative blocks - if (freeBlocks < hfsmp->tentativeBlocks + minBlocks) - SET(flags, HFS_ALLOC_IGNORE_TENTATIVE); - - /* Skip free block check if blocks are being allocated for relocating - * data during truncating a volume. 
- * - * During hfs_truncatefs(), the volume free block count is updated - * before relocating data to reflect the total number of free blocks - * that will exist on the volume after resize is successful. This - * means that we have reserved allocation blocks required for relocating - * the data and hence there is no need to check the free blocks. - * It will also prevent resize failure when the number of blocks in - * an extent being relocated is more than the free blocks that will - * exist after the volume is resized. - */ - if ((flags & HFS_ALLOC_SKIPFREEBLKS) == 0) { - // If the disk is already full, don't bother. - if (freeBlocks == 0) { - err = dskFulErr; - goto exit; - } - if (forceContiguous && freeBlocks < minBlocks) { - err = dskFulErr; - goto exit; - } - - /* - * Clip if necessary so we don't over-subscribe the free blocks. - */ - if (minBlocks > freeBlocks) { - minBlocks = freeBlocks; - } - if (maxBlocks > freeBlocks) { - maxBlocks = freeBlocks; - } - } - - if (ISSET(flags, HFS_ALLOC_TRY_HARD)) { - err = hfs_alloc_try_hard(hfsmp, extent, maxBlocks, flags); - if (err) - goto exit; - - goto mark_allocated; - } - - // - // If caller didn't specify a starting block number, then use the volume's - // next block to allocate from. - // - if (startingBlock == 0) { - hfs_lock_mount (hfsmp); - - /* Sparse Allocation and nextAllocation are both used even if the R/B Tree is on */ - if (hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) { - startingBlock = hfsmp->sparseAllocation; - } - else { - startingBlock = hfsmp->nextAllocation; - } - hfs_unlock_mount(hfsmp); - updateAllocPtr = true; - } - - - if (startingBlock >= hfsmp->allocLimit) { - startingBlock = 0; /* overflow so start at beginning */ - } - - // - // If the request must be contiguous, then find a sequence of free blocks - // that is long enough. Otherwise, find the first free block. 
- // - if (forceContiguous) { - err = BlockFindContig(hfsmp, startingBlock, minBlocks, maxBlocks, - flags, &extent->startBlock, &extent->blockCount); - /* - * If we allocated from a new position then also update the roving allocator. - * This will keep the roving allocation pointer up-to-date even - * if we are using the new R/B tree allocator, since - * it doesn't matter to us here, how the underlying allocator found - * the block to vend out. - */ - if ((err == noErr) && - (extent->startBlock > startingBlock) && - ((extent->startBlock < hfsmp->hfs_metazone_start) || - (extent->startBlock > hfsmp->hfs_metazone_end))) { - updateAllocPtr = true; - } - } else { - /* - * Scan the bitmap once, gather the N largest free extents, then - * allocate from these largest extents. Repeat as needed until - * we get all the space we needed. We could probably build up - * that list when the higher level caller tried (and failed) a - * contiguous allocation first. - * - * Note that the free-extent cache will be cease to be updated if - * we are using the red-black tree for allocations. If we jettison - * the tree, then we will reset the free-extent cache and start over. - */ - - /* Disable HFS_ALLOC_FLUSHTXN if needed */ - if (forceFlush) { - flags &= ~HFS_ALLOC_FLUSHTXN; - } - - /* - * BlockFindKnown only examines the free extent cache; anything in there will - * have been committed to stable storage already. - */ - err = BlockFindKnown(hfsmp, maxBlocks, &extent->startBlock, - &extent->blockCount); - - /* dskFulErr out of BlockFindKnown indicates an empty Free Extent Cache */ - - if (err == dskFulErr) { - /* - * Now we have to do a bigger scan. Start at startingBlock and go up until the - * allocation limit. We 'trust' the summary bitmap in this call, if it tells us - * that it could not find any free space. 
- */ - err = BlockFindAny(hfsmp, startingBlock, hfsmp->allocLimit, - maxBlocks, flags, true, - &extent->startBlock, &extent->blockCount); - } - if (err == dskFulErr) { - /* - * Vary the behavior here if the summary table is on or off. - * If it is on, then we don't trust it it if we get into this case and - * basically do a full scan for maximum coverage. - * If it is off, then we trust the above and go up until the startingBlock. - */ - if (hfsmp->hfs_flags & HFS_SUMMARY_TABLE) { - err = BlockFindAny(hfsmp, 1, hfsmp->allocLimit, maxBlocks, - flags, false, - &extent->startBlock, &extent->blockCount); - } - else { - err = BlockFindAny(hfsmp, 1, startingBlock, maxBlocks, - flags, false, - &extent->startBlock, &extent->blockCount); - } - - /* - * Last Resort: Find/use blocks that may require a journal flush. - */ - if (err == dskFulErr && forceFlush) { - flags |= HFS_ALLOC_FLUSHTXN; - err = BlockFindAny(hfsmp, 1, hfsmp->allocLimit, maxBlocks, - flags, false, - &extent->startBlock, &extent->blockCount); - } - } - } - - if (err) - goto exit; - -mark_allocated: - - // Handle alignment - if (ap && ap->alignment && extent->blockCount < ap->max_blocks) { - /* - * See the comment in FileMgrInternal.h for alignment - * semantics. 
- */ - uint32_t rounding = ((extent->blockCount + ap->alignment_offset) - % ap->alignment); - - // @minBlocks is still the minimum - if (extent->blockCount >= minBlocks + rounding) - extent->blockCount -= rounding; - } - - err = BlockMarkAllocatedInternal(hfsmp, extent->startBlock, - extent->blockCount, flags); - - if (err) - goto exit; - - if (ISSET(hfsmp->hfs_flags, HFS_CS) && extent->blockCount != 0 - && !ISSET(flags, HFS_ALLOC_TENTATIVE)) { - if (ISSET(flags, HFS_ALLOC_FAST_DEV)) { -#if !HFS_ALLOC_TEST /* need this guard because this file is compiled outside of the kernel */ - hfs_pin_block_range(hfsmp, HFS_PIN_IT, - extent->startBlock, extent->blockCount, - vfs_context_kernel()); -#endif - } else { - _dk_cs_map_t cm = { - .cm_extent = { - (hfs_blk_to_bytes(extent->startBlock, hfsmp->blockSize) - + hfsmp->hfsPlusIOPosOffset), - hfs_blk_to_bytes(extent->blockCount, hfsmp->blockSize) - } - }; - - errno_t err2 = VNOP_IOCTL(hfsmp->hfs_devvp, _DKIOCCSMAP, - (caddr_t)&cm, 0, vfs_context_current()); - - /* - * Ignore errors for now; we are fully provisioned so in - * theory CoreStorage should be able to handle this - * allocation. Should we want to change this in future, then - * we should think carefully how we handle errors. Allowing - * CoreStorage to truncate our allocation is problematic - * because we might have minimum and alignment requirements - * and backing out changes we have already made is - * non-trivial. - */ - - if (err2 || cm.cm_bytes_mapped < cm.cm_extent.length) { - printf("hfs: _DKIOCCSMAP error: %d, bytes_mapped: %llu\n", - err2, cm.cm_bytes_mapped); - } - } - } - - // if we actually allocated something then go update the - // various bits of state that we maintain regardless of - // whether there was an error (i.e. partial allocations - // still need to update things like the free block count). - // - if (extent->blockCount != 0) { - // - // If we used the volume's roving allocation pointer, then we need to update it. 
- // Adding in the length of the current allocation might reduce the next allocate - // call by avoiding a re-scan of the already allocated space. However, the clump - // just allocated can quite conceivably end up being truncated or released when - // the file is closed or its EOF changed. Leaving the allocation pointer at the - // start of the last allocation will avoid unnecessary fragmentation in this case. - // - hfs_lock_mount (hfsmp); - - if (!ISSET(flags, HFS_ALLOC_USE_TENTATIVE | HFS_ALLOC_COMMIT)) { - lck_spin_lock(&hfsmp->vcbFreeExtLock); - if (hfsmp->vcbFreeExtCnt == 0 && hfsmp->hfs_freed_block_count == 0) { - hfsmp->sparseAllocation = extent->startBlock; - } - lck_spin_unlock(&hfsmp->vcbFreeExtLock); - if (extent->blockCount < hfsmp->hfs_freed_block_count) { - hfsmp->hfs_freed_block_count -= extent->blockCount; - } else { - hfsmp->hfs_freed_block_count = 0; - } - - if (updateAllocPtr && - ((extent->startBlock < hfsmp->hfs_metazone_start) || - (extent->startBlock > hfsmp->hfs_metazone_end))) { - HFS_UPDATE_NEXT_ALLOCATION(hfsmp, extent->startBlock); - } - - (void) remove_free_extent_cache(hfsmp, extent->startBlock, extent->blockCount); - } - - if (ISSET(flags, HFS_ALLOC_USE_TENTATIVE)) { - (*ap->reservation_in)->rl_start += extent->blockCount; - hfsmp->tentativeBlocks -= extent->blockCount; - if (rl_len(*ap->reservation_in) <= 0) - hfs_free_tentative(hfsmp, ap->reservation_in); - } else if (ISSET(flags, HFS_ALLOC_COMMIT)) { - // Handle committing locked extents - assert(hfsmp->lockedBlocks >= extent->blockCount); - (*ap->reservation_in)->rl_start += extent->blockCount; - hfsmp->lockedBlocks -= extent->blockCount; - hfs_free_locked(hfsmp, ap->reservation_in); - } - - /* - * Update the number of free blocks on the volume - * - * Skip updating the free blocks count if the block are - * being allocated to relocate data as part of hfs_truncatefs() - */ - - if (ISSET(flags, HFS_ALLOC_TENTATIVE)) { - hfsmp->tentativeBlocks += extent->blockCount; - } else if 
(ISSET(flags, HFS_ALLOC_LOCKED)) { - hfsmp->lockedBlocks += extent->blockCount; - } else if ((flags & HFS_ALLOC_SKIPFREEBLKS) == 0) { - hfsmp->freeBlocks -= extent->blockCount; - } - MarkVCBDirty(hfsmp); - hfs_unlock_mount(hfsmp); - - hfs_generate_volume_notifications(hfsmp); - - if (ISSET(flags, HFS_ALLOC_TENTATIVE)) { - add_to_reserved_list(hfsmp, extent->startBlock, extent->blockCount, - 0, ap->reservation_out); - } else if (ISSET(flags, HFS_ALLOC_LOCKED)) { - add_to_reserved_list(hfsmp, extent->startBlock, extent->blockCount, - 1, ap->reservation_out); - } - - if (ISSET(flags, HFS_ALLOC_IGNORE_TENTATIVE)) { - /* - * See if we used tentative blocks. Note that we cannot - * free the reservations here because we don't have access - * to the external pointers. All we can do is update the - * reservations and they'll be cleaned up when whatever is - * holding the pointers calls us back. - * - * We use the rangelist code to detect overlaps and - * constrain the tentative block allocation. Note that - * @end is inclusive so that our rangelist code will - * resolve the various cases for us. 
As a result, we need - * to ensure that we account for it properly when removing - * the blocks from the tentative count in the mount point - * and re-inserting the remainder (either head or tail) - */ - struct rl_entry *range, *next_range; - struct rl_head *ranges = &hfsmp->hfs_reserved_ranges[HFS_TENTATIVE_BLOCKS]; - const uint32_t start = extent->startBlock; - const uint32_t end = start + extent->blockCount - 1; - TAILQ_FOREACH_SAFE(range, ranges, rl_link, next_range) { - switch (rl_overlap(range, start, end)) { - case RL_OVERLAPCONTAINSRANGE: - // Keep the bigger part - if (start - range->rl_start > range->rl_end - end) { - // Discard the tail - hfsmp->tentativeBlocks -= range->rl_end + 1 - start; - hfs_release_summary(hfsmp, end + 1, range->rl_end - end); - const uint32_t old_end = range->rl_end; - range->rl_end = start - 1; - add_free_extent_cache(hfsmp, end + 1, old_end - end); - } else { - // Discard the head - hfsmp->tentativeBlocks -= end + 1 - range->rl_start; - hfs_release_summary(hfsmp, range->rl_start, - start - range->rl_start); - const uint32_t old_start = range->rl_start; - range->rl_start = end + 1; - add_free_extent_cache(hfsmp, old_start, - start - old_start); - } - assert(range->rl_end >= range->rl_start); - break; - case RL_MATCHINGOVERLAP: - case RL_OVERLAPISCONTAINED: - hfsmp->tentativeBlocks -= rl_len(range); - range->rl_end = range->rl_start - 1; - hfs_release_reserved(hfsmp, range, HFS_TENTATIVE_BLOCKS); - break; - case RL_OVERLAPSTARTSBEFORE: - hfsmp->tentativeBlocks -= range->rl_end + 1 - start; - range->rl_end = start - 1; - assert(range->rl_end >= range->rl_start); - break; - case RL_OVERLAPENDSAFTER: - hfsmp->tentativeBlocks -= end + 1 - range->rl_start; - range->rl_start = end + 1; - assert(range->rl_end >= range->rl_start); - break; - case RL_NOOVERLAP: - break; - } - } - } - } - -exit: - - if (ALLOC_DEBUG) { - if (err == noErr) { - if (extent->startBlock >= hfsmp->totalBlocks) { - panic ("BlockAllocate: vending invalid blocks!"); 
- } - if (extent->startBlock >= hfsmp->allocLimit) { - panic ("BlockAllocate: vending block past allocLimit!"); - } - - if ((extent->startBlock + extent->blockCount) >= hfsmp->totalBlocks) { - panic ("BlockAllocate: vending too many invalid blocks!"); - } - - if ((extent->startBlock + extent->blockCount) >= hfsmp->allocLimit) { - panic ("BlockAllocate: vending too many invalid blocks past allocLimit!"); - } - } - } - - if (err) { - // Just to be safe... - extent->startBlock = 0; - extent->blockCount = 0; - } - - if (hfs_kdebug_allocation & HFSDBG_ALLOC_ENABLED) - KERNEL_DEBUG_CONSTANT(HFSDBG_BLOCK_ALLOCATE | DBG_FUNC_END, err, extent->startBlock, extent->blockCount, 0, 0); - - return err; -} - - -/* -;________________________________________________________________________________ -; -; Routine: BlockDeallocate -; -; Function: Update the bitmap to deallocate a run of disk allocation blocks -; -; Input Arguments: -; vcb - Pointer to ExtendedVCB for the volume to free space on -; firstBlock - First allocation block to be freed -; numBlocks - Number of allocation blocks to free up (must be > 0!) -; -; Output: -; (result) - Result code -; -; Side effects: -; The volume bitmap is read and updated; the volume bitmap cache may be changed. -; The Allocator's red-black trees may also be modified as a result. 
-; -;________________________________________________________________________________ -*/ - -OSErr BlockDeallocate ( - ExtendedVCB *vcb, // Which volume to deallocate space on - u_int32_t firstBlock, // First block in range to deallocate - u_int32_t numBlocks, // Number of contiguous blocks to deallocate - hfs_block_alloc_flags_t flags) -{ - if (ISSET(flags, HFS_ALLOC_TENTATIVE | HFS_ALLOC_LOCKED)) - return 0; - - OSErr err; - struct hfsmount *hfsmp; - hfsmp = VCBTOHFS(vcb); - - if (hfs_kdebug_allocation & HFSDBG_ALLOC_ENABLED) - KERNEL_DEBUG_CONSTANT(HFSDBG_BLOCK_DEALLOCATE | DBG_FUNC_START, firstBlock, numBlocks, flags, 0, 0); - - // - // If no blocks to deallocate, then exit early - // - if (numBlocks == 0) { - err = noErr; - goto Exit; - } - - - if (ALLOC_DEBUG) { - if (firstBlock >= hfsmp->totalBlocks) { - panic ("BlockDeallocate: freeing invalid blocks!"); - } - - if ((firstBlock + numBlocks) >= hfsmp->totalBlocks) { - panic ("BlockDeallocate: freeing too many invalid blocks!"); - } - } - - /* - * If we're using the summary bitmap, then try to mark the bits - * as potentially usable/free before actually deallocating them. - * It is better to be slightly speculative here for correctness. - */ - - (void) hfs_release_summary (hfsmp, firstBlock, numBlocks); - - err = BlockMarkFreeInternal(vcb, firstBlock, numBlocks, true); - - if (err) { - goto Exit; - } - - // - // Update the volume's free block count, and mark the VCB as dirty. - // - hfs_lock_mount(hfsmp); - /* - * Do not update the free block count. This flags is specified - * when a volume is being truncated. 
- */ - if ((flags & HFS_ALLOC_SKIPFREEBLKS) == 0) { - vcb->freeBlocks += numBlocks; - } - - vcb->hfs_freed_block_count += numBlocks; - - if (vcb->nextAllocation == (firstBlock + numBlocks)) { - HFS_UPDATE_NEXT_ALLOCATION(vcb, (vcb->nextAllocation - numBlocks)); - } - - if (hfsmp->jnl == NULL) { - /* - * In the journal case, we'll add the free extent once the journal - * calls us back to tell us it wrote the transaction to disk. - */ - (void) add_free_extent_cache(vcb, firstBlock, numBlocks); - - /* - * If the journal case, we'll only update sparseAllocation once the - * free extent cache becomes empty (when we remove the last entry - * from the cache). Skipping it here means we're less likely to - * find a recently freed extent via the bitmap before it gets added - * to the free extent cache. - */ - if (firstBlock < vcb->sparseAllocation) { - vcb->sparseAllocation = firstBlock; - } - } - - MarkVCBDirty(vcb); - hfs_unlock_mount(hfsmp); - - hfs_generate_volume_notifications(VCBTOHFS(vcb)); -Exit: - - if (hfs_kdebug_allocation & HFSDBG_ALLOC_ENABLED) - KERNEL_DEBUG_CONSTANT(HFSDBG_BLOCK_DEALLOCATE | DBG_FUNC_END, err, 0, 0, 0, 0); - - return err; -} - - -u_int8_t freebitcount[16] = { - 4, 3, 3, 2, 3, 2, 2, 1, /* 0 1 2 3 4 5 6 7 */ - 3, 2, 2, 1, 2, 1, 1, 0, /* 8 9 A B C D E F */ -}; - -u_int32_t -MetaZoneFreeBlocks(ExtendedVCB *vcb) -{ - u_int32_t freeblocks; - u_int32_t *currCache; - uintptr_t blockRef; - u_int32_t bit; - u_int32_t lastbit; - int bytesleft; - int bytesperblock; - u_int8_t byte; - u_int8_t *buffer; - - blockRef = 0; - bytesleft = freeblocks = 0; - buffer = NULL; - bit = VCBTOHFS(vcb)->hfs_metazone_start; - if (bit == 1) - bit = 0; - - lastbit = VCBTOHFS(vcb)->hfs_metazone_end; - bytesperblock = vcb->vcbVBMIOSize; - - /* - * Count all the bits from bit to lastbit. - */ - while (bit < lastbit) { - /* - * Get next bitmap block. 
- */ - if (bytesleft == 0) { - if (blockRef) { - (void) ReleaseBitmapBlock(vcb, blockRef, false); - blockRef = 0; - } - if (ReadBitmapBlock(vcb, bit, &currCache, &blockRef, - HFS_ALLOC_IGNORE_TENTATIVE) != 0) { - return (0); - } - buffer = (u_int8_t *)currCache; - bytesleft = bytesperblock; - } - byte = *buffer++; - freeblocks += freebitcount[byte & 0x0F]; - freeblocks += freebitcount[(byte >> 4) & 0x0F]; - bit += kBitsPerByte; - --bytesleft; - } - if (blockRef) - (void) ReleaseBitmapBlock(vcb, blockRef, false); - - return (freeblocks); -} - - -/* - * Obtain the next allocation block (bit) that's - * outside the metadata allocation zone. - */ -static u_int32_t NextBitmapBlock( - ExtendedVCB *vcb, - u_int32_t bit) -{ - struct hfsmount *hfsmp = VCBTOHFS(vcb); - - if ((hfsmp->hfs_flags & HFS_METADATA_ZONE) == 0) - return (bit); - /* - * Skip over metadata allocation zone. - */ - if ((bit >= hfsmp->hfs_metazone_start) && - (bit <= hfsmp->hfs_metazone_end)) { - bit = hfsmp->hfs_metazone_end + 1; - } - return (bit); -} - - -// Assumes @bitmap is aligned to 8 bytes and multiple of 8 bytes. 
-static void bits_set(void *bitmap, int start, int end) -{ - const int start_bit = start & 63; - const int end_bit = end & 63; - -#define LEFT_MASK(bit) OSSwapHostToBigInt64(0xffffffffffffffffull << (64 - bit)) -#define RIGHT_MASK(bit) OSSwapHostToBigInt64(0xffffffffffffffffull >> bit) - - uint64_t *p = (uint64_t *)bitmap + start / 64; - - if ((start & ~63) == (end & ~63)) { - // Start and end in same 64 bits - *p |= RIGHT_MASK(start_bit) & LEFT_MASK(end_bit); - } else { - *p++ |= RIGHT_MASK(start_bit); - - int nquads = (end - end_bit - start - 1) / 64; - - while (nquads--) - *p++ = 0xffffffffffffffffull; - - if (end_bit) - *p |= LEFT_MASK(end_bit); - } -} - -// Modifies the buffer and applies any reservations that we might have -static buf_t process_reservations(hfsmount_t *hfsmp, buf_t bp, off_t offset, - hfs_block_alloc_flags_t flags, - bool always_copy) -{ - bool taken_copy = false; - void *buffer = (void *)buf_dataptr(bp); - const uint32_t nbytes = buf_count(bp); - const off_t end = offset + nbytes * 8 - 1; - - for (int i = (ISSET(flags, HFS_ALLOC_IGNORE_TENTATIVE) - ? HFS_LOCKED_BLOCKS : HFS_TENTATIVE_BLOCKS); i < 2; ++i) { - struct rl_entry *entry; - TAILQ_FOREACH(entry, &hfsmp->hfs_reserved_ranges[i], rl_link) { - uint32_t a, b; - - enum rl_overlaptype overlap_type = rl_overlap(entry, offset, end); - - if (overlap_type == RL_NOOVERLAP) - continue; - - /* - * If always_copy is false, we only take a copy if B_LOCKED is - * set because ReleaseScanBitmapRange doesn't invalidate the - * buffer in that case. 
- */ - if (!taken_copy && (always_copy || ISSET(buf_flags(bp), B_LOCKED))) { - buf_t new_bp = buf_create_shadow(bp, true, 0, NULL, NULL); - buf_brelse(bp); - bp = new_bp; - buf_setflags(bp, B_NOCACHE); - buffer = (void *)buf_dataptr(bp); - taken_copy = true; - } - - switch (overlap_type) { - case RL_OVERLAPCONTAINSRANGE: - case RL_MATCHINGOVERLAP: - memset(buffer, 0xff, nbytes); - return bp; - case RL_OVERLAPISCONTAINED: - a = entry->rl_start; - b = entry->rl_end; - break; - case RL_OVERLAPSTARTSBEFORE: - a = offset; - b = entry->rl_end; - break; - case RL_OVERLAPENDSAFTER: - a = entry->rl_start; - b = end; - break; - case RL_NOOVERLAP: - __builtin_unreachable(); - } - - a -= offset; - b -= offset; - - assert(a < buf_count(bp) * 8); - assert(b < buf_count(bp) * 8); - assert(b >= a); - - // b is inclusive - bits_set(buffer, a, b + 1); - } - } // for (;;) - - return bp; -} - -/* -;_______________________________________________________________________ -; -; Routine: ReadBitmapBlock -; -; Function: Read in a bitmap block corresponding to a given allocation -; block (bit). Return a pointer to the bitmap block. 
-; -; Inputs: -; vcb -- Pointer to ExtendedVCB -; bit -- Allocation block whose bitmap block is desired -; -; Outputs: -; buffer -- Pointer to bitmap block corresonding to "block" -; blockRef -;_______________________________________________________________________ -*/ -static OSErr ReadBitmapBlock(ExtendedVCB *vcb, - u_int32_t bit, - u_int32_t **buffer, - uintptr_t *blockRef, - hfs_block_alloc_flags_t flags) -{ - OSErr err; - struct buf *bp = NULL; - struct vnode *vp = NULL; - daddr64_t block; - u_int32_t blockSize; - - if (hfs_kdebug_allocation & HFSDBG_BITMAP_ENABLED) - KERNEL_DEBUG_CONSTANT(HFSDBG_READ_BITMAP_BLOCK | DBG_FUNC_START, bit, 0, 0, 0, 0); - - /* - * volume bitmap blocks are protected by the allocation file lock - */ - REQUIRE_FILE_LOCK(vcb->hfs_allocation_vp, false); - - blockSize = (u_int32_t)vcb->vcbVBMIOSize; - block = (daddr64_t)(bit / (blockSize * kBitsPerByte)); - - /* HFS+ / HFSX */ - if (vcb->vcbSigWord != kHFSSigWord) { - vp = vcb->hfs_allocation_vp; /* use allocation file vnode */ - } -#if CONFIG_HFS_STD - else { - /* HFS Standard */ - vp = VCBTOHFS(vcb)->hfs_devvp; /* use device I/O vnode */ - block += vcb->vcbVBMSt; /* map to physical block */ - } -#endif - - err = (int)buf_meta_bread(vp, block, blockSize, NOCRED, &bp); - - if (bp) { - if (err) { - buf_brelse(bp); - *blockRef = 0; - *buffer = NULL; - } else { - if (!ISSET(flags, HFS_ALLOC_IGNORE_RESERVED)) { - bp = process_reservations(vcb, bp, block * blockSize * 8, - flags, /* always_copy: */ true); - } - - buf_setfsprivate(bp, (void *)(uintptr_t)flags); - - *blockRef = (uintptr_t)bp; - *buffer = (u_int32_t *)buf_dataptr(bp); - } - } - - if (hfs_kdebug_allocation & HFSDBG_BITMAP_ENABLED) - KERNEL_DEBUG_CONSTANT(HFSDBG_READ_BITMAP_BLOCK | DBG_FUNC_END, err, 0, 0, 0, 0); - - return err; -} - - -/* -;_______________________________________________________________________ -; -; Routine: ReadBitmapRange -; -; Function: Read in a range of the bitmap starting at the given offset. 
-; Use the supplied size to determine the amount of I/O to generate -; against the bitmap file. Return a pointer to the bitmap block. -; -; Inputs: -; hfsmp -- Pointer to hfs mount -; offset -- byte offset into the bitmap file -; size -- How much I/O to generate against the bitmap file. -; -; Outputs: -; buffer -- Pointer to bitmap block data corresonding to "block" -; blockRef -- struct 'buf' pointer which MUST be released in a subsequent call. -;_______________________________________________________________________ -*/ -static OSErr ReadBitmapRange(struct hfsmount *hfsmp, uint32_t offset, - uint32_t iosize, uint32_t **buffer, struct buf **blockRef) -{ - - OSErr err; - struct buf *bp = NULL; - struct vnode *vp = NULL; - daddr64_t block; - - /* This function isn't supported for HFS standard */ - if (hfsmp->vcbSigWord != kHFSPlusSigWord) { - return EINVAL; - } - - if (hfs_kdebug_allocation & HFSDBG_BITMAP_ENABLED) { - KERNEL_DEBUG_CONSTANT(HFSDBG_READ_BITMAP_RANGE | DBG_FUNC_START, offset, iosize, 0, 0, 0); - } - - /* - * volume bitmap blocks are protected by the allocation file lock - */ - REQUIRE_FILE_LOCK(vcb->hfs_allocation_vp, false); - - vp = hfsmp->hfs_allocation_vp; /* use allocation file vnode */ - - /* - * The byte offset argument must be converted into bitmap-relative logical - * block numbers before using it in buf_meta_bread. - * - * buf_meta_bread (and the things it calls) will eventually try to - * reconstruct the byte offset into the file by multiplying the logical - * block number passed in below by the vcbVBMIOSize field in the mount - * point. So we prepare for that by converting the byte offset back into - * logical blocks in terms of VBMIOSize units. - * - * The amount of I/O requested and the byte offset should be computed - * based on the helper function in the frame that called us, so we can - * get away with just doing a simple divide here. 
- */ - block = (daddr64_t)(offset / hfsmp->vcbVBMIOSize); - - err = (int) buf_meta_bread(vp, block, iosize, NOCRED, &bp); - - if (bp) { - if (err) { - buf_brelse(bp); - *blockRef = 0; - *buffer = NULL; - } else { - bp = process_reservations(hfsmp, bp, (offset * 8), 0, - /* always_copy: */ false); - - *blockRef = bp; - *buffer = (u_int32_t *)buf_dataptr(bp); - } - } - - if (hfs_kdebug_allocation & HFSDBG_BITMAP_ENABLED) { - KERNEL_DEBUG_CONSTANT(HFSDBG_READ_BITMAP_RANGE | DBG_FUNC_END, err, 0, 0, 0, 0); - } - - return err; -} - - -/* -;_______________________________________________________________________ -; -; Routine: ReleaseBitmapBlock -; -; Function: Relase a bitmap block. -; -; Inputs: -; vcb -; blockRef -; dirty -;_______________________________________________________________________ -*/ -static OSErr ReleaseBitmapBlock( - ExtendedVCB *vcb, - uintptr_t blockRef, - Boolean dirty) -{ - struct buf *bp = (struct buf *)blockRef; - - if (hfs_kdebug_allocation & HFSDBG_BITMAP_ENABLED) - KERNEL_DEBUG_CONSTANT(HFSDBG_RELEASE_BITMAP_BLOCK | DBG_FUNC_START, dirty, 0, 0, 0, 0); - - if (blockRef == 0) { - if (dirty) - panic("hfs: ReleaseBitmapBlock: missing bp"); - return (0); - } - - if (bp) { - if (dirty) { - hfs_block_alloc_flags_t flags = (uintptr_t)buf_fsprivate(bp); - - if (!ISSET(flags, HFS_ALLOC_IGNORE_RESERVED)) - panic("Modified read-only bitmap buffer!"); - - struct hfsmount *hfsmp = VCBTOHFS(vcb); - - if (hfsmp->jnl) { - journal_modify_block_end(hfsmp->jnl, bp, NULL, NULL); - } else { - buf_bdwrite(bp); - } - } else { - buf_brelse(bp); - } - } - - if (hfs_kdebug_allocation & HFSDBG_BITMAP_ENABLED) - KERNEL_DEBUG_CONSTANT(HFSDBG_RELEASE_BITMAP_BLOCK | DBG_FUNC_END, 0, 0, 0, 0, 0); - - return (0); -} - -/* - * ReleaseScanBitmapRange - * - * This is used to release struct bufs that were created for use by - * bitmap scanning code. 
Because they may be of sizes different than the - * typical runtime manipulation code, we want to force them to be purged out - * of the buffer cache ASAP, so we'll release them differently than in the - * ReleaseBitmapBlock case. - * - * Additionally, because we know that we're only reading the blocks and that they - * should have been clean prior to reading them, we will never - * issue a write to them (thus dirtying them). - */ - -static OSErr ReleaseScanBitmapRange(struct buf *bp ) { - - if (hfs_kdebug_allocation & HFSDBG_BITMAP_ENABLED) { - KERNEL_DEBUG_CONSTANT(HFSDBG_RELEASE_BITMAP_BLOCK | DBG_FUNC_START, 0, 0, 0, 0, 0); - } - - if (bp) { - /* Mark the buffer invalid if it isn't locked, then release it */ - if ((buf_flags(bp) & B_LOCKED) == 0) { - buf_markinvalid(bp); - } - buf_brelse(bp); - } - - if (hfs_kdebug_allocation & HFSDBG_BITMAP_ENABLED) { - KERNEL_DEBUG_CONSTANT(HFSDBG_RELEASE_SCAN_BITMAP | DBG_FUNC_END, 0, 0, 0, 0, 0); - } - - return (0); -} - -/* - * @extent.startBlock, on input, contains a preferred block for the - * allocation. @extent.blockCount, on input, contains the minimum - * number of blocks acceptable. Upon success, the result is conveyed - * in @extent. - */ -static OSErr hfs_alloc_try_hard(hfsmount_t *hfsmp, - HFSPlusExtentDescriptor *extent, - uint32_t max_blocks, - hfs_block_alloc_flags_t flags) -{ - OSErr err = dskFulErr; - - const uint32_t min_blocks = extent->blockCount; - - // It's > rather than >= because the last block is always reserved - if (extent->startBlock > 0 && extent->startBlock < hfsmp->allocLimit - && hfsmp->allocLimit - extent->startBlock > max_blocks) { - /* - * This is just checking to see if there's an extent starting - * at extent->startBlock that will suit. We only check for - * @max_blocks here; @min_blocks is ignored. 
- */ - - err = BlockFindContiguous(hfsmp, extent->startBlock, extent->startBlock + max_blocks, - max_blocks, max_blocks, true, true, - &extent->startBlock, &extent->blockCount, flags); - - if (err != dskFulErr) - return err; - } - - err = BlockFindKnown(hfsmp, max_blocks, &extent->startBlock, - &extent->blockCount); - - if (!err) { - if (extent->blockCount >= max_blocks) - return 0; - } else if (err != dskFulErr) - return err; - - // Try a more exhaustive search - return BlockFindContiguous(hfsmp, 1, hfsmp->allocLimit, - min_blocks, max_blocks, - /* useMetaZone: */ true, - /* trustSummary: */ true, - &extent->startBlock, &extent->blockCount, flags); -} - -/* -_______________________________________________________________________ - -Routine: BlockFindContig - -Function: Find a contiguous group of allocation blocks. If the - minimum cannot be satisfied, nothing is returned. The - caller guarantees that there are enough free blocks - (though they may not be contiguous, in which case this - call will fail). 
- -Inputs: - vcb Pointer to volume where space is to be allocated - startingBlock Preferred first block for allocation - minBlocks Minimum number of contiguous blocks to allocate - maxBlocks Maximum number of contiguous blocks to allocate - flags - -Outputs: - actualStartBlock First block of range allocated, or 0 if error - actualNumBlocks Number of blocks allocated, or 0 if error -_______________________________________________________________________ -*/ -static OSErr BlockFindContig( - ExtendedVCB *vcb, - u_int32_t startingBlock, - u_int32_t minBlocks, - u_int32_t maxBlocks, - hfs_block_alloc_flags_t flags, - u_int32_t *actualStartBlock, - u_int32_t *actualNumBlocks) -{ - OSErr retval = noErr; - uint32_t currentStart = startingBlock; - - uint32_t foundStart = 0; // values to emit to caller - uint32_t foundCount = 0; - - uint32_t collision_start = 0; // if we have to re-allocate a recently deleted extent, use this - uint32_t collision_count = 0; - - int err; - int allowReuse = (flags & HFS_ALLOC_FLUSHTXN); - Boolean useMetaZone = (flags & HFS_ALLOC_METAZONE); - - int recently_deleted = 0; - struct hfsmount *hfsmp = VCBTOHFS(vcb); - - if (hfs_kdebug_allocation & HFSDBG_ALLOC_ENABLED) - KERNEL_DEBUG_CONSTANT(HFSDBG_FIND_CONTIG_BITMAP | DBG_FUNC_START, startingBlock, minBlocks, maxBlocks, useMetaZone, 0); - - while ((retval == noErr) && (foundStart == 0) && (foundCount == 0)) { - - /* Try and find something that works. */ - - /* - * NOTE: If the only contiguous free extent of at least minBlocks - * crosses startingBlock (i.e. starts before, ends after), then we - * won't find it. Earlier versions *did* find this case by letting - * the second search look past startingBlock by minBlocks. But - * with the free extent cache, this can lead to duplicate entries - * in the cache, causing the same blocks to be allocated twice. 
- */ - retval = BlockFindContiguous(vcb, currentStart, vcb->allocLimit, minBlocks, - maxBlocks, useMetaZone, true, &foundStart, &foundCount, flags); - - if (retval == dskFulErr && currentStart != 0) { - /* - * We constrain the endingBlock so we don't bother looking for ranges - * that would overlap those found in the previous call, if the summary bitmap - * is not on for this volume. If it is, then we assume that it was not trust - * -worthy and do a full scan. - */ - if (hfsmp->hfs_flags & HFS_SUMMARY_TABLE) { - retval = BlockFindContiguous(vcb, 1, vcb->allocLimit, minBlocks, - maxBlocks, useMetaZone, false, &foundStart, &foundCount, flags); - } - else { - retval = BlockFindContiguous(vcb, 1, currentStart, minBlocks, - maxBlocks, useMetaZone, false, &foundStart, &foundCount, flags); - } - } - - if (retval != noErr) { - goto bailout; - } - - /* Do we overlap with the recently found collision extent? */ - if (collision_start) { - if (extents_overlap (foundStart, foundCount, collision_start, collision_count)) { - /* - * We've looped around, and the only thing we could use was the collision extent. - * Since we are allowed to use it, go ahead and do so now. - */ - if(allowReuse) { - /* - * then we couldn't find anything except values which might have been - * recently deallocated. just return our cached value if we are allowed to. - */ - foundStart = collision_start; - foundCount = collision_count; - goto bailout; - } - else { - /* Otherwise, we looped around and couldn't find anything that wouldn't require a journal flush. */ - retval = dskFulErr; - goto bailout; - } - } - } - - /* OK, we know we must not have collided . See if this one is recently deleted */ - if (hfsmp->jnl) { - recently_deleted = 0; - uint32_t nextStart; - err = CheckUnmappedBytes (hfsmp, (uint64_t)foundStart, - (uint64_t) foundCount, &recently_deleted, &nextStart); - if (err == 0) { - if(recently_deleted != 0) { - /* - * these blocks were recently deleted/deallocated. 
Cache the extent, but - * but keep searching to see if we can find one that won't collide here. - */ - if (collision_start == 0) { - collision_start = foundStart; - collision_count = foundCount; - } - recently_deleted = 0; - - /* - * advance currentStart to the point just past the overlap we just found. Note that - * we will automatically loop around to start of the bitmap as needed. - */ - currentStart = nextStart; - /* Unset foundStart/Count to allow it to loop around again. */ - foundStart = 0; - foundCount = 0; - } - } - } // end jnl/deleted case - - /* - * If we found something good, we'd break out of the loop at the top; foundCount - * and foundStart should be set. - */ - - } // end while loop. - -bailout: - - if (retval == noErr) { - *actualStartBlock = foundStart; - *actualNumBlocks = foundCount; - } - - if (hfs_kdebug_allocation & HFSDBG_ALLOC_ENABLED) - KERNEL_DEBUG_CONSTANT(HFSDBG_FIND_CONTIG_BITMAP | DBG_FUNC_END, foundStart, foundCount, retval, 0, 0); - - return retval; - -} - - -/* -_______________________________________________________________________ - -Routine: BlockFindAny - -Function: Find one or more allocation blocks and may return fewer than - requested. The caller guarantees that there is at least one - free block. 
- -Inputs: - vcb Pointer to volume where space is to be allocated - startingBlock Preferred first block for allocation - endingBlock Last block to check + 1 - maxBlocks Maximum number of contiguous blocks to allocate - useMetaZone - -Outputs: - actualStartBlock First block of range allocated, or 0 if error - actualNumBlocks Number of blocks allocated, or 0 if error -_______________________________________________________________________ -*/ - -static OSErr BlockFindAny( - ExtendedVCB *vcb, - u_int32_t startingBlock, - register u_int32_t endingBlock, - u_int32_t maxBlocks, - hfs_block_alloc_flags_t flags, - Boolean trustSummary, - u_int32_t *actualStartBlock, - u_int32_t *actualNumBlocks) -{ - - /* - * If it is enabled, scan through the summary table to find the first free block. - * - * If it reports that there are not any free blocks, we could have a false - * positive, so in that case, use the input arguments as a pass through. - */ - uint32_t start_blk = startingBlock; - uint32_t end_blk = endingBlock; - struct hfsmount *hfsmp; - OSErr err; - - hfsmp = (struct hfsmount*)vcb; - if (hfsmp->hfs_flags & HFS_SUMMARY_TABLE) { - uint32_t suggested_start; - - /* - * If the summary table is enabled, scan through it to find the first free - * block. If there was an error, or we couldn't find anything free in the - * summary table, then just leave the start_blk fields unmodified. We wouldn't - * have gotten to this point if the mount point made it look like there was possibly - * free space in the FS. - */ - err = hfs_find_summary_free (hfsmp, startingBlock, &suggested_start); - if (err == 0) { - start_blk = suggested_start; - } - else { - /* Differentiate between ENOSPC and a more esoteric error in the above call. */ - if ((err == ENOSPC) && (trustSummary)) { - /* - * The 'trustSummary' argument is for doing a full scan if we really - * really, need the space and we think it's somewhere but can't find it in the - * summary table. 
If it's true, then we trust the summary table and return - * dskFulErr if we couldn't find it above. - */ - return dskFulErr; - } - /* - * If either trustSummary was false or we got a different errno, then we - * want to fall through to the real bitmap single i/o code... - */ - } - } - - err = BlockFindAnyBitmap(vcb, start_blk, end_blk, maxBlocks, - flags, actualStartBlock, actualNumBlocks); - - return err; -} - - -/* - * BlockFindAnyBitmap finds free ranges by scanning the bitmap to - * figure out where the free allocation blocks are. Inputs and - * outputs are the same as for BlockFindAny. - */ - -static OSErr BlockFindAnyBitmap( - ExtendedVCB *vcb, - u_int32_t startingBlock, - register u_int32_t endingBlock, - u_int32_t maxBlocks, - hfs_block_alloc_flags_t flags, - u_int32_t *actualStartBlock, - u_int32_t *actualNumBlocks) -{ - OSErr err; - register u_int32_t block = 0; // current block number - register u_int32_t currentWord; // Pointer to current word within bitmap block - register u_int32_t bitMask; // Word with given bits already set (ready to OR in) - register u_int32_t wordsLeft; // Number of words left in this bitmap block - u_int32_t *buffer = NULL; - u_int32_t *currCache = NULL; - uintptr_t blockRef = 0; - u_int32_t bitsPerBlock; - u_int32_t wordsPerBlock; - Boolean dirty = false; - struct hfsmount *hfsmp = VCBTOHFS(vcb); - Boolean useMetaZone = (flags & HFS_ALLOC_METAZONE); - Boolean forceFlush = (flags & HFS_ALLOC_FLUSHTXN); - - if (hfs_kdebug_allocation & HFSDBG_ALLOC_ENABLED) - KERNEL_DEBUG_CONSTANT(HFSDBG_ALLOC_ANY_BITMAP | DBG_FUNC_START, startingBlock, endingBlock, maxBlocks, useMetaZone, 0); - -restartSearchAny: - - /* - * When we're skipping the metadata zone and the start/end - * range overlaps with the metadata zone then adjust the - * start to be outside of the metadata zone. If the range - * is entirely inside the metadata zone then we can deny the - * request (dskFulErr). 
- */ - if (!useMetaZone && (vcb->hfs_flags & HFS_METADATA_ZONE)) { - if (startingBlock <= vcb->hfs_metazone_end) { - if (endingBlock > (vcb->hfs_metazone_end + 2)) - startingBlock = vcb->hfs_metazone_end + 1; - else { - err = dskFulErr; - goto Exit; - } - } - } - - // Since this routine doesn't wrap around - if (maxBlocks > (endingBlock - startingBlock)) { - maxBlocks = endingBlock - startingBlock; - } - - // - // Pre-read the first bitmap block - // - err = ReadBitmapBlock(vcb, startingBlock, &currCache, &blockRef, flags); - if (err != noErr) goto Exit; - buffer = currCache; - - // - // Set up the current position within the block - // - { - u_int32_t wordIndexInBlock; - - bitsPerBlock = vcb->vcbVBMIOSize * kBitsPerByte; - wordsPerBlock = vcb->vcbVBMIOSize / kBytesPerWord; - - wordIndexInBlock = (startingBlock & (bitsPerBlock-1)) / kBitsPerWord; - buffer += wordIndexInBlock; - wordsLeft = wordsPerBlock - wordIndexInBlock; - currentWord = SWAP_BE32 (*buffer); - bitMask = kHighBitInWordMask >> (startingBlock & kBitsWithinWordMask); - } - - /* - * While loop 1: - * Find the first unallocated block starting at 'block' - */ - uint32_t summary_block_scan = 0; - - block=startingBlock; - while (block < endingBlock) { - if ((currentWord & bitMask) == 0) - break; - - // Next bit - ++block; - bitMask >>= 1; - if (bitMask == 0) { - // Next word - bitMask = kHighBitInWordMask; - ++buffer; - - if (--wordsLeft == 0) { - // Next block - buffer = currCache = NULL; - if (hfsmp->hfs_flags & HFS_SUMMARY_TABLE) { - /* - * If summary_block_scan is non-zero, then we must have - * pulled a bitmap file block into core, and scanned through - * the entire thing. Because we're in this loop, we are - * implicitly trusting that the bitmap didn't have any knowledge - * about this particular block. As a result, update the bitmap - * (lazily, now that we've scanned it) with our findings that - * this particular block is completely used up. 
- */ - if (summary_block_scan != 0) { - uint32_t summary_bit; - (void) hfs_get_summary_index (hfsmp, summary_block_scan, &summary_bit); - hfs_set_summary (hfsmp, summary_bit, 1); - summary_block_scan = 0; - } - } - - err = ReleaseBitmapBlock(vcb, blockRef, false); - if (err != noErr) goto Exit; - - /* - * Skip over metadata blocks. - */ - if (!useMetaZone) { - block = NextBitmapBlock(vcb, block); - } - if (block >= endingBlock) { - err = dskFulErr; - goto Exit; - } - - err = ReadBitmapBlock(vcb, block, &currCache, &blockRef, flags); - if (err != noErr) goto Exit; - buffer = currCache; - summary_block_scan = block; - wordsLeft = wordsPerBlock; - } - currentWord = SWAP_BE32 (*buffer); - } - } - - // Did we get to the end of the bitmap before finding a free block? - // If so, then couldn't allocate anything. - if (block >= endingBlock) { - err = dskFulErr; - goto Exit; - } - - - /* - * Don't move forward just yet. Verify that either one of the following - * two conditions is true: - * 1) journaling is not enabled - * 2) block is not currently on any pending TRIM list. - */ - if (hfsmp->jnl != NULL && (forceFlush == false)) { - int recently_deleted = 0; - uint32_t nextblk; - err = CheckUnmappedBytes (hfsmp, (uint64_t) block, 1, &recently_deleted, &nextblk); - if ((err == 0) && (recently_deleted)) { - - /* release the bitmap block & unset currCache. we may jump past it. */ - err = ReleaseBitmapBlock(vcb, blockRef, false); - currCache = NULL; - if (err != noErr) { - goto Exit; - } - /* set our start to nextblk, and re-do the search. */ - startingBlock = nextblk; - goto restartSearchAny; - } - } - - - // Return the first block in the allocated range - *actualStartBlock = block; - dirty = true; - - // If we could get the desired number of blocks before hitting endingBlock, - // then adjust endingBlock so we won't keep looking. Ideally, the comparison - // would be (block + maxBlocks) < endingBlock, but that could overflow. 
The - // comparison below yields identical results, but without overflow. - if (block < (endingBlock-maxBlocks)) { - endingBlock = block + maxBlocks; // if we get this far, we've found enough - } - - /* - * While loop 2: - * Scan the bitmap, starting at 'currentWord' in the current - * bitmap block. Continue iterating through the bitmap until - * either we hit an allocated block, or until we have accumuluated - * maxBlocks worth of bitmap. - */ - - /* Continue until we see an allocated block */ - while ((currentWord & bitMask) == 0) { - // Move to the next block. If no more, then exit. - ++block; - if (block == endingBlock) { - break; - } - - // Next bit - bitMask >>= 1; - if (bitMask == 0) { - // Next word - bitMask = kHighBitInWordMask; - ++buffer; - - if (--wordsLeft == 0) { - // Next block - buffer = currCache = NULL; - - /* We're only reading the bitmap here, so mark it as clean */ - err = ReleaseBitmapBlock(vcb, blockRef, false); - if (err != noErr) { - goto Exit; - } - - /* - * Skip over metadata blocks. 
- */ - if (!useMetaZone) { - u_int32_t nextBlock; - nextBlock = NextBitmapBlock(vcb, block); - if (nextBlock != block) { - goto Exit; /* allocation gap, so stop */ - } - } - - if (block >= endingBlock) { - goto Exit; - } - - err = ReadBitmapBlock(vcb, block, &currCache, &blockRef, flags); - if (err != noErr) { - goto Exit; - } - buffer = currCache; - wordsLeft = wordsPerBlock; - } - currentWord = SWAP_BE32 (*buffer); - } - } - -Exit: - if (currCache) { - /* Release the bitmap reference prior to marking bits in-use */ - (void) ReleaseBitmapBlock(vcb, blockRef, false); - currCache = NULL; - } - - if (err == noErr) { - *actualNumBlocks = block - *actualStartBlock; - - // sanity check - if ((*actualStartBlock + *actualNumBlocks) > vcb->allocLimit) { - panic("hfs: BlockFindAnyBitmap: allocation overflow on \"%s\"", vcb->vcbVN); - } - } - else { - *actualStartBlock = 0; - *actualNumBlocks = 0; - } - - if (hfs_kdebug_allocation & HFSDBG_ALLOC_ENABLED) - KERNEL_DEBUG_CONSTANT(HFSDBG_ALLOC_ANY_BITMAP | DBG_FUNC_END, err, *actualStartBlock, *actualNumBlocks, 0, 0); - - return err; -} - - -/* -_______________________________________________________________________ - -Routine: BlockFindKnown - -Function: Return a potential extent from the free extent cache. The - returned extent *must* be marked allocated and removed - from the cache by the *caller*. 
- -Inputs: - vcb Pointer to volume where space is to be allocated - maxBlocks Maximum number of contiguous blocks to allocate - -Outputs: - actualStartBlock First block of range allocated, or 0 if error - actualNumBlocks Number of blocks allocated, or 0 if error - -Returns: - dskFulErr Free extent cache is empty -_______________________________________________________________________ -*/ - -static OSErr BlockFindKnown( - ExtendedVCB *vcb, - u_int32_t maxBlocks, - u_int32_t *actualStartBlock, - u_int32_t *actualNumBlocks) -{ - OSErr err; - u_int32_t foundBlocks; - struct hfsmount *hfsmp = VCBTOHFS(vcb); - - if (hfs_kdebug_allocation & HFSDBG_ALLOC_ENABLED) - KERNEL_DEBUG_CONSTANT(HFSDBG_ALLOC_FIND_KNOWN | DBG_FUNC_START, 0, 0, maxBlocks, 0, 0); - - hfs_lock_mount (hfsmp); - lck_spin_lock(&vcb->vcbFreeExtLock); - if ( vcb->vcbFreeExtCnt == 0 || - vcb->vcbFreeExt[0].blockCount == 0) { - lck_spin_unlock(&vcb->vcbFreeExtLock); - hfs_unlock_mount(hfsmp); - if (hfs_kdebug_allocation & HFSDBG_ALLOC_ENABLED) - KERNEL_DEBUG_CONSTANT(HFSDBG_ALLOC_FIND_KNOWN | DBG_FUNC_END, dskFulErr, *actualStartBlock, *actualNumBlocks, 0, 0); - return dskFulErr; - } - lck_spin_unlock(&vcb->vcbFreeExtLock); - hfs_unlock_mount(hfsmp); - - lck_spin_lock(&vcb->vcbFreeExtLock); - - // Just grab up to maxBlocks of the first (largest) free exent. 
- *actualStartBlock = vcb->vcbFreeExt[0].startBlock; - foundBlocks = vcb->vcbFreeExt[0].blockCount; - if (foundBlocks > maxBlocks) - foundBlocks = maxBlocks; - *actualNumBlocks = foundBlocks; - - lck_spin_unlock(&vcb->vcbFreeExtLock); - - // sanity check - if ((*actualStartBlock + *actualNumBlocks) > vcb->allocLimit) - { - printf ("hfs: BlockAllocateKnown() found allocation overflow on \"%s\"", vcb->vcbVN); - hfs_mark_inconsistent(vcb, HFS_INCONSISTENCY_DETECTED); - err = EIO; - } else - err = 0; - - if (hfs_kdebug_allocation & HFSDBG_ALLOC_ENABLED) - KERNEL_DEBUG_CONSTANT(HFSDBG_ALLOC_FIND_KNOWN | DBG_FUNC_END, err, *actualStartBlock, *actualNumBlocks, 0, 0); - - return err; -} - -/* - * BlockMarkAllocated - * - * This is a wrapper function around the internal calls which will actually mark the blocks - * as in-use. It will mark the blocks in the red-black tree if appropriate. We need to do - * this logic here to avoid callers having to deal with whether or not the red-black tree - * is enabled. - */ - -OSErr BlockMarkAllocated( - ExtendedVCB *vcb, - u_int32_t startingBlock, - register u_int32_t numBlocks) -{ - struct hfsmount *hfsmp; - - hfsmp = VCBTOHFS(vcb); - - return BlockMarkAllocatedInternal(vcb, startingBlock, numBlocks, 0); - -} - - -/* -_______________________________________________________________________ - -Routine: BlockMarkAllocatedInternal - -Function: Mark a contiguous group of blocks as allocated (set in the - bitmap). It assumes those bits are currently marked - deallocated (clear in the bitmap). Note that this function - must be called regardless of whether or not the bitmap or - tree-based allocator is used, as all allocations must correctly - be marked on-disk. If the tree-based approach is running, then - this will be done before the node is removed from the tree. 
- -Inputs: - vcb Pointer to volume where space is to be allocated - startingBlock First block number to mark as allocated - numBlocks Number of blocks to mark as allocated -_______________________________________________________________________ -*/ -static -OSErr BlockMarkAllocatedInternal ( - ExtendedVCB *vcb, - u_int32_t startingBlock, - u_int32_t numBlocks, - hfs_block_alloc_flags_t flags) -{ - OSErr err; - register u_int32_t *currentWord; // Pointer to current word within bitmap block - register u_int32_t wordsLeft; // Number of words left in this bitmap block - register u_int32_t bitMask; // Word with given bits already set (ready to OR in) - u_int32_t firstBit; // Bit index within word of first bit to allocate - u_int32_t numBits; // Number of bits in word to allocate - u_int32_t *buffer = NULL; - uintptr_t blockRef = 0; - u_int32_t bitsPerBlock; - u_int32_t wordsPerBlock; - // XXXdbg - struct hfsmount *hfsmp = VCBTOHFS(vcb); - - if (hfs_kdebug_allocation & HFSDBG_BITMAP_ENABLED) - KERNEL_DEBUG_CONSTANT(HFSDBG_MARK_ALLOC_BITMAP | DBG_FUNC_START, startingBlock, numBlocks, flags, 0, 0); - -#if DEBUG - - struct rl_entry *range; - TAILQ_FOREACH(range, &hfsmp->hfs_reserved_ranges[HFS_LOCKED_BLOCKS], rl_link) { - assert(rl_overlap(range, startingBlock, - startingBlock + numBlocks - 1) == RL_NOOVERLAP); - } - -#endif - - int force_flush = 0; - /* - * Since we are about to mark these bits as in-use - * in the bitmap, decide if we need to alert the caller - * that a journal flush might be appropriate. It's safe to - * poke at the journal pointer here since we MUST have - * called start_transaction by the time this function is invoked. - * If the journal is enabled, then it will have taken the requisite - * journal locks. If it is not enabled, then we have taken - * a shared lock on the global lock. 
- */ - if (hfsmp->jnl) { - uint32_t ignore; - err = CheckUnmappedBytes (hfsmp, (uint64_t) startingBlock, (uint64_t)numBlocks, &force_flush, &ignore); - if ((err == 0) && (force_flush)) { - journal_request_immediate_flush (hfsmp->jnl); - } - } - - hfs_unmap_alloc_extent(vcb, startingBlock, numBlocks); - - /* - * Don't make changes to the disk if we're just reserving. Note that - * we could do better in the tentative case because we could, in theory, - * avoid the journal flush above. However, that would mean that we would - * need to catch the callback to stop it incorrectly addding the extent - * to our free cache. - */ - if (ISSET(flags, HFS_ALLOC_LOCKED | HFS_ALLOC_TENTATIVE)) { - err = 0; - goto Exit; - } - - // - // Pre-read the bitmap block containing the first word of allocation - // - - err = ReadBitmapBlock(vcb, startingBlock, &buffer, &blockRef, - HFS_ALLOC_IGNORE_RESERVED); - if (err != noErr) goto Exit; - // - // Initialize currentWord, and wordsLeft. - // - { - u_int32_t wordIndexInBlock; - - bitsPerBlock = vcb->vcbVBMIOSize * kBitsPerByte; - wordsPerBlock = vcb->vcbVBMIOSize / kBytesPerWord; - - wordIndexInBlock = (startingBlock & (bitsPerBlock-1)) / kBitsPerWord; - currentWord = buffer + wordIndexInBlock; - wordsLeft = wordsPerBlock - wordIndexInBlock; - } - - // XXXdbg - if (hfsmp->jnl) { - journal_modify_block_start(hfsmp->jnl, (struct buf *)blockRef); - } - - // - // If the first block to allocate doesn't start on a word - // boundary in the bitmap, then treat that first word - // specially. 
- // - - firstBit = startingBlock % kBitsPerWord; - if (firstBit != 0) { - bitMask = kAllBitsSetInWord >> firstBit; // turn off all bits before firstBit - numBits = kBitsPerWord - firstBit; // number of remaining bits in this word - if (numBits > numBlocks) { - numBits = numBlocks; // entire allocation is inside this one word - bitMask &= ~(kAllBitsSetInWord >> (firstBit + numBits)); // turn off bits after last - } -#if DEBUG - if ((*currentWord & SWAP_BE32 (bitMask)) != 0) { - panic("hfs: BlockMarkAllocatedInternal: blocks already allocated!"); - } -#endif - *currentWord |= SWAP_BE32 (bitMask); // set the bits in the bitmap - numBlocks -= numBits; // adjust number of blocks left to allocate - - ++currentWord; // move to next word - --wordsLeft; // one less word left in this block - } - - // - // Allocate whole words (32 blocks) at a time. - // - - bitMask = kAllBitsSetInWord; // put this in a register for 68K - while (numBlocks >= kBitsPerWord) { - if (wordsLeft == 0) { - // Read in the next bitmap block - startingBlock += bitsPerBlock; // generate a block number in the next bitmap block - - buffer = NULL; - err = ReleaseBitmapBlock(vcb, blockRef, true); - if (err != noErr) goto Exit; - - err = ReadBitmapBlock(vcb, startingBlock, &buffer, &blockRef, - HFS_ALLOC_IGNORE_RESERVED); - if (err != noErr) goto Exit; - - // XXXdbg - if (hfsmp->jnl) { - journal_modify_block_start(hfsmp->jnl, (struct buf *)blockRef); - } - - // Readjust currentWord and wordsLeft - currentWord = buffer; - wordsLeft = wordsPerBlock; - } -#if DEBUG - if (*currentWord != 0) { - panic("hfs: BlockMarkAllocatedInternal: blocks already allocated!"); - } -#endif - *currentWord = SWAP_BE32 (bitMask); - numBlocks -= kBitsPerWord; - - ++currentWord; // move to next word - --wordsLeft; // one less word left in this block - } - - // - // Allocate any remaining blocks. 
- // - - if (numBlocks != 0) { - bitMask = ~(kAllBitsSetInWord >> numBlocks); // set first numBlocks bits - if (wordsLeft == 0) { - // Read in the next bitmap block - startingBlock += bitsPerBlock; // generate a block number in the next bitmap block - - buffer = NULL; - err = ReleaseBitmapBlock(vcb, blockRef, true); - if (err != noErr) goto Exit; - - err = ReadBitmapBlock(vcb, startingBlock, &buffer, &blockRef, - HFS_ALLOC_IGNORE_RESERVED); - if (err != noErr) goto Exit; - - // XXXdbg - if (hfsmp->jnl) { - journal_modify_block_start(hfsmp->jnl, (struct buf *)blockRef); - } - - // Readjust currentWord and wordsLeft - currentWord = buffer; - wordsLeft = wordsPerBlock; - } -#if DEBUG - if ((*currentWord & SWAP_BE32 (bitMask)) != 0) { - panic("hfs: BlockMarkAllocatedInternal: blocks already allocated!"); - } -#endif - *currentWord |= SWAP_BE32 (bitMask); // set the bits in the bitmap - - // No need to update currentWord or wordsLeft - } - -Exit: - - if (buffer) - (void)ReleaseBitmapBlock(vcb, blockRef, true); - - if (hfs_kdebug_allocation & HFSDBG_BITMAP_ENABLED) - KERNEL_DEBUG_CONSTANT(HFSDBG_MARK_ALLOC_BITMAP | DBG_FUNC_END, err, 0, 0, 0, 0); - - return err; -} - - -/* - * BlockMarkFree - * - * This is a wrapper function around the internal calls which will actually mark the blocks - * as freed. It will mark the blocks in the red-black tree if appropriate. We need to do - * this logic here to avoid callers having to deal with whether or not the red-black tree - * is enabled. - * - */ -OSErr BlockMarkFree( - ExtendedVCB *vcb, - u_int32_t startingBlock, - register u_int32_t numBlocks) -{ - struct hfsmount *hfsmp; - hfsmp = VCBTOHFS(vcb); - - return BlockMarkFreeInternal(vcb, startingBlock, numBlocks, true); -} - - -/* - * BlockMarkFreeUnused - * - * Scan the bitmap block beyond end of current file system for bits - * that are marked as used. If any of the bits are marked as used, - * this function marks them free. 
- * - * Note: This was specifically written to mark all bits beyond - * end of current file system during hfs_extendfs(), which makes - * sure that all the new blocks added to the file system are - * marked as free. We expect that all the blocks beyond end of - * current file system are always marked as free, but there might - * be cases where are marked as used. This function assumes that - * the number of blocks marked as used incorrectly are relatively - * small, otherwise this can overflow journal transaction size - * on certain file system configurations (example, large unused - * bitmap with relatively small journal). - * - * Input: - * startingBlock: First block of the range to mark unused - * numBlocks: Number of blocks in the range to mark unused - * - * Returns: zero on success, non-zero on error. - */ -OSErr BlockMarkFreeUnused(ExtendedVCB *vcb, u_int32_t startingBlock, register u_int32_t numBlocks) -{ - int error = 0; - struct hfsmount *hfsmp = VCBTOHFS(vcb); - u_int32_t curNumBlocks; - u_int32_t bitsPerBlock; - u_int32_t lastBit; - - /* Use the optimal bitmap I/O size instead of bitmap block size */ - bitsPerBlock = hfsmp->vcbVBMIOSize * kBitsPerByte; - - /* - * First clear any non bitmap allocation block aligned bits - * - * Calculate the first bit in the bitmap block next to - * the bitmap block containing the bit for startingBlock. - * Using this value, we calculate the total number of - * bits to be marked unused from startingBlock to the - * end of bitmap block containing startingBlock. - */ - lastBit = ((startingBlock + (bitsPerBlock - 1))/bitsPerBlock) * bitsPerBlock; - curNumBlocks = lastBit - startingBlock; - if (curNumBlocks > numBlocks) { - curNumBlocks = numBlocks; - } - error = BlockMarkFreeInternal(vcb, startingBlock, curNumBlocks, false); - if (error) { - return error; - } - startingBlock += curNumBlocks; - numBlocks -= curNumBlocks; - - /* - * Check a full bitmap block for any 'used' bit. 
If any bit is used, - * mark all the bits only in that bitmap block as free. This ensures - * that we do not write unmodified bitmap blocks and do not - * overwhelm the journal. - * - * The code starts by checking full bitmap block at a time, and - * marks entire bitmap block as free only if any bit in that bitmap - * block is marked as used. In the end, it handles the last bitmap - * block which might be partially full by only checking till the - * caller-specified last bit and if any bit is set, only mark that - * range as free. - */ - while (numBlocks) { - if (numBlocks >= bitsPerBlock) { - curNumBlocks = bitsPerBlock; - } else { - curNumBlocks = numBlocks; - } - if (hfs_isallocated(hfsmp, startingBlock, curNumBlocks) == true) { - error = BlockMarkFreeInternal(vcb, startingBlock, curNumBlocks, false); - if (error) { - return error; - } - } - startingBlock += curNumBlocks; - numBlocks -= curNumBlocks; - } - - return error; -} - -/* -_______________________________________________________________________ - -Routine: BlockMarkFreeInternal - -Function: Mark a contiguous group of blocks as free (clear in the - bitmap). It assumes those bits are currently marked - allocated (set in the bitmap). - -Inputs: - vcb Pointer to volume where space is to be freed - startingBlock First block number to mark as freed - numBlocks Number of blocks to mark as freed - do_validate If true, validate that the blocks being - deallocated to check if they are within totalBlocks - for current volume and whether they were allocated - before they are marked free. 
-_______________________________________________________________________ -*/ -static -OSErr BlockMarkFreeInternal( - ExtendedVCB *vcb, - u_int32_t startingBlock_in, - register u_int32_t numBlocks_in, - Boolean do_validate) -{ - OSErr err; - u_int32_t startingBlock = startingBlock_in; - u_int32_t numBlocks = numBlocks_in; - uint32_t unmapStart = startingBlock_in; - uint32_t unmapCount = numBlocks_in; - uint32_t wordIndexInBlock; - u_int32_t *currentWord; // Pointer to current word within bitmap block - u_int32_t wordsLeft; // Number of words left in this bitmap block - u_int32_t bitMask; // Word with given bits already set (ready to OR in) - u_int32_t currentBit; // Bit index within word of current bit to allocate - u_int32_t numBits; // Number of bits in word to allocate - u_int32_t *buffer = NULL; - uintptr_t blockRef = 0; - u_int32_t bitsPerBlock; - u_int32_t wordsPerBlock; - // XXXdbg - struct hfsmount *hfsmp = VCBTOHFS(vcb); - - if (hfs_kdebug_allocation & HFSDBG_BITMAP_ENABLED) - KERNEL_DEBUG_CONSTANT(HFSDBG_MARK_FREE_BITMAP | DBG_FUNC_START, startingBlock_in, numBlocks_in, do_validate, 0, 0); - - /* - * NOTE: We use vcb->totalBlocks instead of vcb->allocLimit because we - * need to be able to free blocks being relocated during hfs_truncatefs. 
- */ - if ((do_validate == true) && - (startingBlock + numBlocks > vcb->totalBlocks)) { -#if ALLOC_DEBUG || DEBUG - panic ("BlockMarkFreeInternal() free non-existent blocks at %u (numBlock=%u) on vol %s\n", startingBlock, numBlocks, vcb->vcbVN); - __builtin_unreachable(); -#else - printf ("hfs: BlockMarkFreeInternal() trying to free non-existent blocks starting at %u (numBlock=%u) on volume %s\n", startingBlock, numBlocks, vcb->vcbVN); - hfs_mark_inconsistent(vcb, HFS_INCONSISTENCY_DETECTED); - err = EIO; - goto Exit; -#endif - } - - // - // Pre-read the bitmap block containing the first word of allocation - // - - err = ReadBitmapBlock(vcb, startingBlock, &buffer, &blockRef, - HFS_ALLOC_IGNORE_RESERVED); - if (err != noErr) goto Exit; - // XXXdbg - if (hfsmp->jnl) { - journal_modify_block_start(hfsmp->jnl, (struct buf *)blockRef); - } - - uint32_t min_unmap = 0, max_unmap = UINT32_MAX; - - // Work out the bounds of any unmap we can send down - struct rl_entry *range; - for (int i = 0; i < 2; ++i) { - TAILQ_FOREACH(range, &hfsmp->hfs_reserved_ranges[i], rl_link) { - if (range->rl_start < startingBlock - && range->rl_end >= min_unmap) { - min_unmap = range->rl_end + 1; - } - if (range->rl_end >= startingBlock + numBlocks - && range->rl_start < max_unmap) { - max_unmap = range->rl_start; - } - } - } - - // - // Figure out how many bits and words per bitmap block. - // - bitsPerBlock = vcb->vcbVBMIOSize * kBitsPerByte; - wordsPerBlock = vcb->vcbVBMIOSize / kBytesPerWord; - wordIndexInBlock = (startingBlock & (bitsPerBlock-1)) / kBitsPerWord; - - // - // Look for a range of free blocks immediately before startingBlock - // (up to the start of the current bitmap block). Set unmapStart to - // the first free block. 
- // - currentWord = buffer + wordIndexInBlock; - currentBit = startingBlock % kBitsPerWord; - bitMask = kHighBitInWordMask >> currentBit; - while (unmapStart > min_unmap) { - // Move currentWord/bitMask back by one bit - bitMask <<= 1; - if (bitMask == 0) { - if (--currentWord < buffer) - break; - bitMask = kLowBitInWordMask; - } - - if (*currentWord & SWAP_BE32(bitMask)) - break; // Found an allocated block. Stop searching. - --unmapStart; - ++unmapCount; - } - - // - // If the first block to free doesn't start on a word - // boundary in the bitmap, then treat that first word - // specially. - // - - currentWord = buffer + wordIndexInBlock; - wordsLeft = wordsPerBlock - wordIndexInBlock; - currentBit = startingBlock % kBitsPerWord; - if (currentBit != 0) { - bitMask = kAllBitsSetInWord >> currentBit; // turn off all bits before currentBit - numBits = kBitsPerWord - currentBit; // number of remaining bits in this word - if (numBits > numBlocks) { - numBits = numBlocks; // entire allocation is inside this one word - bitMask &= ~(kAllBitsSetInWord >> (currentBit + numBits)); // turn off bits after last - } - if ((do_validate == true) && - (*currentWord & SWAP_BE32 (bitMask)) != SWAP_BE32 (bitMask)) { - goto Corruption; - } - *currentWord &= SWAP_BE32 (~bitMask); // clear the bits in the bitmap - numBlocks -= numBits; // adjust number of blocks left to free - - ++currentWord; // move to next word - --wordsLeft; // one less word left in this block - } - - // - // Free whole words (32 blocks) at a time. 
- // - - while (numBlocks >= kBitsPerWord) { - if (wordsLeft == 0) { - // Read in the next bitmap block - startingBlock += bitsPerBlock; // generate a block number in the next bitmap block - - buffer = NULL; - err = ReleaseBitmapBlock(vcb, blockRef, true); - if (err != noErr) goto Exit; - - err = ReadBitmapBlock(vcb, startingBlock, &buffer, &blockRef, - HFS_ALLOC_IGNORE_RESERVED); - if (err != noErr) goto Exit; - - // XXXdbg - if (hfsmp->jnl) { - journal_modify_block_start(hfsmp->jnl, (struct buf *)blockRef); - } - - // Readjust currentWord and wordsLeft - currentWord = buffer; - wordsLeft = wordsPerBlock; - } - if ((do_validate == true) && - (*currentWord != SWAP_BE32 (kAllBitsSetInWord))) { - goto Corruption; - } - *currentWord = 0; // clear the entire word - numBlocks -= kBitsPerWord; - - ++currentWord; // move to next word - --wordsLeft; // one less word left in this block - } - - // - // Free any remaining blocks. - // - - if (numBlocks != 0) { - bitMask = ~(kAllBitsSetInWord >> numBlocks); // set first numBlocks bits - if (wordsLeft == 0) { - // Read in the next bitmap block - startingBlock += bitsPerBlock; // generate a block number in the next bitmap block - - buffer = NULL; - err = ReleaseBitmapBlock(vcb, blockRef, true); - if (err != noErr) goto Exit; - - err = ReadBitmapBlock(vcb, startingBlock, &buffer, &blockRef, - HFS_ALLOC_IGNORE_RESERVED); - if (err != noErr) goto Exit; - - // XXXdbg - if (hfsmp->jnl) { - journal_modify_block_start(hfsmp->jnl, (struct buf *)blockRef); - } - - // Readjust currentWord and wordsLeft - currentWord = buffer; - wordsLeft = wordsPerBlock; - } - if ((do_validate == true) && - (*currentWord & SWAP_BE32 (bitMask)) != SWAP_BE32 (bitMask)) { - goto Corruption; - } - *currentWord &= SWAP_BE32 (~bitMask); // clear the bits in the bitmap - - // No need to update currentWord or wordsLeft - } - - // - // Look for a range of free blocks immediately after the range we just freed - // (up to the end of the current bitmap block). 
- // - wordIndexInBlock = ((startingBlock_in + numBlocks_in - 1) & (bitsPerBlock-1)) / kBitsPerWord; - wordsLeft = wordsPerBlock - wordIndexInBlock; - currentWord = buffer + wordIndexInBlock; - currentBit = (startingBlock_in + numBlocks_in - 1) % kBitsPerWord; - bitMask = kHighBitInWordMask >> currentBit; - while (unmapStart + unmapCount < max_unmap) { - // Move currentWord/bitMask/wordsLeft forward one bit - bitMask >>= 1; - if (bitMask == 0) { - if (--wordsLeft == 0) - break; - ++currentWord; - bitMask = kHighBitInWordMask; - } - - if (*currentWord & SWAP_BE32(bitMask)) - break; // Found an allocated block. Stop searching. - ++unmapCount; - } - -Exit: - - if (buffer) - (void)ReleaseBitmapBlock(vcb, blockRef, true); - - if (err == noErr) { - hfs_unmap_free_extent(vcb, unmapStart, unmapCount); - } - - if (hfs_kdebug_allocation & HFSDBG_BITMAP_ENABLED) - KERNEL_DEBUG_CONSTANT(HFSDBG_MARK_FREE_BITMAP | DBG_FUNC_END, err, 0, 0, 0, 0); - - return err; - -Corruption: -#if DEBUG - panic("hfs: BlockMarkFreeInternal: blocks not allocated!"); - __builtin_unreachable(); -#else - printf ("hfs: BlockMarkFreeInternal() trying to free unallocated blocks on volume %s <%u, %u>\n", - vcb->vcbVN, startingBlock_in, numBlocks_in); - hfs_mark_inconsistent(vcb, HFS_INCONSISTENCY_DETECTED); - err = EIO; - goto Exit; -#endif -} - - -/* -_______________________________________________________________________ - -Routine: BlockFindContiguous - -Function: Find a contiguous range of blocks that are free (bits - clear in the bitmap). If a contiguous range of the - minimum size can't be found, an error will be returned. - This is only needed to support the bitmap-scanning logic, - as the red-black tree should be able to do this by internally - searching its tree. - -Inputs: - vcb Pointer to volume where space is to be allocated - startingBlock Preferred first block of range - endingBlock Last possible block in range + 1 - minBlocks Minimum number of blocks needed. Must be > 0. 
- maxBlocks Maximum (ideal) number of blocks desired - useMetaZone OK to dip into metadata allocation zone - -Outputs: - actualStartBlock First block of range found, or 0 if error - actualNumBlocks Number of blocks found, or 0 if error - -Returns: - noErr Found at least minBlocks contiguous - dskFulErr No contiguous space found, or all less than minBlocks -_______________________________________________________________________ -*/ - -static OSErr BlockFindContiguous( - ExtendedVCB *vcb, - u_int32_t startingBlock, - u_int32_t endingBlock, - u_int32_t minBlocks, - u_int32_t maxBlocks, - Boolean useMetaZone, - Boolean trustSummary, - u_int32_t *actualStartBlock, - u_int32_t *actualNumBlocks, - hfs_block_alloc_flags_t flags) -{ - OSErr err; - register u_int32_t currentBlock; // Block we're currently looking at. - u_int32_t firstBlock; // First free block in current extent. - u_int32_t stopBlock; // If we get to this block, stop searching for first free block. - u_int32_t foundBlocks; // Number of contiguous free blocks in current extent. - u_int32_t *buffer = NULL; - register u_int32_t *currentWord; - register u_int32_t bitMask; - register u_int32_t wordsLeft; - register u_int32_t tempWord; - uintptr_t blockRef = 0; - u_int32_t wordsPerBlock; - u_int32_t updated_free_extent = 0; - struct hfsmount *hfsmp = (struct hfsmount*) vcb; - HFSPlusExtentDescriptor best = { 0, 0 }; - - if (hfs_kdebug_allocation & HFSDBG_ALLOC_ENABLED) - KERNEL_DEBUG_CONSTANT(HFSDBG_BLOCK_FIND_CONTIG | DBG_FUNC_START, startingBlock, endingBlock, minBlocks, maxBlocks, 0); - - /* - * When we're skipping the metadata zone and the start/end - * range overlaps with the metadata zone then adjust the - * start to be outside of the metadata zone. If the range - * is entirely inside the metadata zone then we can deny the - * request (dskFulErr). 
- */ - if (!useMetaZone && (vcb->hfs_flags & HFS_METADATA_ZONE)) { - if (startingBlock <= vcb->hfs_metazone_end) { - if (endingBlock > (vcb->hfs_metazone_end + 2)) - startingBlock = vcb->hfs_metazone_end + 1; - else - goto DiskFull; - } - } - - if ((endingBlock - startingBlock) < minBlocks) - { - // The set of blocks we're checking is smaller than the minimum number - // of blocks, so we couldn't possibly find a good range. - goto DiskFull; - } - - stopBlock = endingBlock - minBlocks + 1; - currentBlock = startingBlock; - firstBlock = 0; - - /* - * Skip over metadata blocks. - */ - if (!useMetaZone) - currentBlock = NextBitmapBlock(vcb, currentBlock); - - /* - * Use the summary table if we can. Skip over any totally - * allocated blocks. currentBlock should now point to the first - * block beyond the metadata zone if the metazone allocations are not - * allowed in this invocation. - */ - if ((trustSummary) && (hfsmp->hfs_flags & HFS_SUMMARY_TABLE)) { - uint32_t suggestion; - err = hfs_find_summary_free (hfsmp, currentBlock, &suggestion); - if (err && err != ENOSPC) - goto ErrorExit; - if (err == ENOSPC || suggestion >= stopBlock) - goto DiskFull; - currentBlock = suggestion; - } - - - // - // Pre-read the first bitmap block. - // - err = ReadBitmapBlock(vcb, currentBlock, &buffer, &blockRef, flags); - if ( err != noErr ) goto ErrorExit; - - // - // Figure out where currentBlock is within the buffer. - // - wordsPerBlock = vcb->vcbVBMIOSize / kBytesPerWord; - - wordsLeft = (currentBlock / kBitsPerWord) & (wordsPerBlock-1); // Current index into buffer - currentWord = buffer + wordsLeft; - wordsLeft = wordsPerBlock - wordsLeft; - - uint32_t remaining = (hfsmp->freeBlocks - hfsmp->lockedBlocks - - (ISSET(flags, HFS_ALLOC_IGNORE_TENTATIVE) - ? 0 : hfsmp->tentativeBlocks)); - - /* - * This outer do-while loop is the main body of this function. 
Its job is - * to search through the blocks (until we hit 'stopBlock'), and iterate - * through swaths of allocated bitmap until it finds free regions. - */ - - do - { - foundBlocks = 0; - /* - * We will try and update the summary table as we search - * below. Note that we will never update the summary table - * for the first and last blocks that the summary table - * covers. Ideally, we should, but the benefits probably - * aren't that significant so we leave things alone for now. - */ - uint32_t summary_block_scan = 0; - /* - * Inner while loop 1: - * Look for free blocks, skipping over allocated ones. - * - * Initialization starts with checking the initial partial word - * if applicable. - */ - bitMask = currentBlock & kBitsWithinWordMask; - if (bitMask) - { - tempWord = SWAP_BE32(*currentWord); // Fetch the current word only once - bitMask = kHighBitInWordMask >> bitMask; - while (tempWord & bitMask) - { - bitMask >>= 1; - ++currentBlock; - } - - // Did we find an unused bit (bitMask != 0), or run out of bits (bitMask == 0)? - if (bitMask) - goto FoundUnused; - - // Didn't find any unused bits, so we're done with this word. - ++currentWord; - --wordsLeft; - } - - // - // Check whole words - // - while (currentBlock < stopBlock) - { - // See if it's time to read another block. - if (wordsLeft == 0) - { - buffer = NULL; - if (hfsmp->hfs_flags & HFS_SUMMARY_TABLE) { - /* - * If summary_block_scan is non-zero, then we must have - * pulled a bitmap file block into core, and scanned through - * the entire thing. Because we're in this loop, we are - * implicitly trusting that the bitmap didn't have any knowledge - * about this particular block. As a result, update the bitmap - * (lazily, now that we've scanned it) with our findings that - * this particular block is completely used up. 
- */ - if (summary_block_scan != 0) { - uint32_t summary_bit; - (void) hfs_get_summary_index (hfsmp, summary_block_scan, &summary_bit); - hfs_set_summary (hfsmp, summary_bit, 1); - summary_block_scan = 0; - } - } - err = ReleaseBitmapBlock(vcb, blockRef, false); - if (err != noErr) goto ErrorExit; - - /* - * Skip over metadata blocks. - */ - if (!useMetaZone) { - currentBlock = NextBitmapBlock(vcb, currentBlock); - if (currentBlock >= stopBlock) { - goto LoopExit; - } - } - - /* Skip over fully allocated bitmap blocks if we can */ - if ((trustSummary) && (hfsmp->hfs_flags & HFS_SUMMARY_TABLE)) { - uint32_t suggestion; - err = hfs_find_summary_free (hfsmp, currentBlock, &suggestion); - if (err && err != ENOSPC) - goto ErrorExit; - if (err == ENOSPC || suggestion >= stopBlock) - goto LoopExit; - currentBlock = suggestion; - } - - err = ReadBitmapBlock(vcb, currentBlock, &buffer, &blockRef, flags); - if ( err != noErr ) goto ErrorExit; - - /* - * Set summary_block_scan to be the block we just read into the block cache. - * - * At this point, we've just read an allocation block worth of bitmap file - * into the buffer above, but we don't know if it is completely allocated or not. - * If we find that it is completely allocated/full then we will jump - * through this loop again and set the appropriate summary bit as fully allocated. - */ - summary_block_scan = currentBlock; - currentWord = buffer; - wordsLeft = wordsPerBlock; - } - - // See if any of the bits are clear - if ((tempWord = SWAP_BE32(*currentWord)) + 1) // non-zero if any bits were clear - { - // Figure out which bit is clear - bitMask = kHighBitInWordMask; - while (tempWord & bitMask) - { - bitMask >>= 1; - ++currentBlock; - } - - break; // Found the free bit; break out to FoundUnused. 
- } - - // Keep looking at the next word - currentBlock += kBitsPerWord; - ++currentWord; - --wordsLeft; - } - -FoundUnused: - // Make sure the unused bit is early enough to use - if (currentBlock >= stopBlock) - { - break; - } - - // Remember the start of the extent - firstBlock = currentBlock; - - - /* - * Inner while loop 2: - * We get here if we find a free block. Count the number - * of contiguous free blocks observed. - * - * Initialization starts with checking the initial partial word - * if applicable. - */ - bitMask = currentBlock & kBitsWithinWordMask; - if (bitMask) - { - tempWord = SWAP_BE32(*currentWord); // Fetch the current word only once - bitMask = kHighBitInWordMask >> bitMask; - while (bitMask && !(tempWord & bitMask)) - { - bitMask >>= 1; - ++currentBlock; - } - - // Did we find a used bit (bitMask != 0), or run out of bits (bitMask == 0)? - if (bitMask) - goto FoundUsed; - - // Didn't find any used bits, so we're done with this word. - ++currentWord; - --wordsLeft; - } - - // - // Check whole words - // - while (currentBlock < endingBlock) - { - // See if it's time to read another block. - if (wordsLeft == 0) - { - buffer = NULL; - err = ReleaseBitmapBlock(vcb, blockRef, false); - if (err != noErr) goto ErrorExit; - - /* - * Skip over metadata blocks. - */ - if (!useMetaZone) { - u_int32_t nextBlock; - - nextBlock = NextBitmapBlock(vcb, currentBlock); - if (nextBlock != currentBlock) { - goto LoopExit; /* allocation gap, so stop */ - } - } - - err = ReadBitmapBlock(vcb, currentBlock, &buffer, &blockRef, flags); - if ( err != noErr ) goto ErrorExit; - - currentWord = buffer; - wordsLeft = wordsPerBlock; - } - - // See if any of the bits are set - if ((tempWord = SWAP_BE32(*currentWord)) != 0) - { - // Figure out which bit is set - bitMask = kHighBitInWordMask; - while (!(tempWord & bitMask)) - { - bitMask >>= 1; - ++currentBlock; - } - - break; // Found the used bit; break out to FoundUsed. 
- } - - // Keep looking at the next word - currentBlock += kBitsPerWord; - ++currentWord; - --wordsLeft; - - // If we found at least maxBlocks, we can quit early. - if ((currentBlock - firstBlock) >= maxBlocks) - break; - } - -FoundUsed: - // Make sure we didn't run out of bitmap looking for a used block. - // If so, pin to the end of the bitmap. - if (currentBlock > endingBlock) - currentBlock = endingBlock; - - // Figure out how many contiguous free blocks there were. - // Pin the answer to maxBlocks. - foundBlocks = currentBlock - firstBlock; - if (foundBlocks > maxBlocks) - foundBlocks = maxBlocks; - - if (remaining) { - if (foundBlocks > remaining) { -#if DEBUG || DEVELOPMENT - printf("hfs: found more blocks than are indicated free!\n"); -#endif - remaining = UINT32_MAX; - } else - remaining -= foundBlocks; - } - - if (ISSET(flags, HFS_ALLOC_TRY_HARD)) { - if (foundBlocks > best.blockCount) { - best.startBlock = firstBlock; - best.blockCount = foundBlocks; - } - - if (foundBlocks >= maxBlocks || best.blockCount >= remaining) - break; - - /* - * Note that we will go ahead and add this free extent to our - * cache below but that's OK because we'll remove it again if we - * decide to use this extent. - */ - } else if (foundBlocks >= minBlocks) - break; // Found what we needed! - - /* - * We did not find the total blocks we were looking for, but - * add this free block run to our free extent cache list, if possible. 
- */ - - // If we're ignoring tentative ranges, we need to account for them here - if (ISSET(flags, HFS_ALLOC_IGNORE_TENTATIVE)) { - struct rl_entry free_extent = rl_make(firstBlock, firstBlock + foundBlocks - 1); - struct rl_entry *range;; - TAILQ_FOREACH(range, &hfsmp->hfs_reserved_ranges[HFS_TENTATIVE_BLOCKS], rl_link) { - rl_subtract(&free_extent, range); - if (rl_len(range) == 0) - break; - } - firstBlock = free_extent.rl_start; - foundBlocks = rl_len(&free_extent); - } - - if (foundBlocks) { - if (hfsmp->jnl == NULL) { - /* If there is no journal, go ahead and add to the free ext cache. */ - updated_free_extent = add_free_extent_cache(vcb, firstBlock, foundBlocks); - } - else { - /* - * If journaled, only add to the free extent cache if this block is not - * waiting for a TRIM to complete; that implies that the transaction that freed it - * has not yet been committed to stable storage. - */ - int recently_deleted = 0; - uint32_t nextblock; - err = CheckUnmappedBytes(hfsmp, (uint64_t)firstBlock, - (uint64_t)foundBlocks, &recently_deleted, &nextblock); - if ((err) || (recently_deleted == 0)) { - /* if we hit an error, or the blocks not recently freed, go ahead and insert it */ - updated_free_extent = add_free_extent_cache(vcb, firstBlock, foundBlocks); - } - err = 0; - } - } - } while (currentBlock < stopBlock); -LoopExit: - - if (ISSET(flags, HFS_ALLOC_TRY_HARD)) { - firstBlock = best.startBlock; - foundBlocks = best.blockCount; - } - - // Return the outputs. 
- if (foundBlocks < minBlocks) - { -DiskFull: - err = dskFulErr; -ErrorExit: - *actualStartBlock = 0; - *actualNumBlocks = 0; - } - else - { - err = noErr; - *actualStartBlock = firstBlock; - *actualNumBlocks = foundBlocks; - /* - * Sanity check for overflow - */ - if ((firstBlock + foundBlocks) > vcb->allocLimit) { - panic("hfs: blk allocation overflow on \"%s\" sb:0x%08x eb:0x%08x cb:0x%08x fb:0x%08x stop:0x%08x min:0x%08x found:0x%08x", - vcb->vcbVN, startingBlock, endingBlock, currentBlock, - firstBlock, stopBlock, minBlocks, foundBlocks); - } - } - - if (updated_free_extent && (vcb->hfs_flags & HFS_HAS_SPARSE_DEVICE)) { - int i; - u_int32_t min_start = vcb->totalBlocks; - - // set the nextAllocation pointer to the smallest free block number - // we've seen so on the next mount we won't rescan unnecessarily - lck_spin_lock(&vcb->vcbFreeExtLock); - for(i=0; i < (int)vcb->vcbFreeExtCnt; i++) { - if (vcb->vcbFreeExt[i].startBlock < min_start) { - min_start = vcb->vcbFreeExt[i].startBlock; - } - } - lck_spin_unlock(&vcb->vcbFreeExtLock); - if (min_start != vcb->totalBlocks) { - if (min_start < vcb->nextAllocation) { - vcb->nextAllocation = min_start; - } - if (min_start < vcb->sparseAllocation) { - vcb->sparseAllocation = min_start; - } - } - } - - if (buffer) - (void) ReleaseBitmapBlock(vcb, blockRef, false); - - if (hfs_kdebug_allocation & HFSDBG_ALLOC_ENABLED) - KERNEL_DEBUG_CONSTANT(HFSDBG_BLOCK_FIND_CONTIG | DBG_FUNC_END, err, *actualStartBlock, *actualNumBlocks, 0, 0); - - return err; -} - - -/* - * Count number of bits set in the given 32-bit unsigned number - * - * Returns: - * Number of bits set - */ -static int num_bits_set(u_int32_t num) -{ - int count; - - for (count = 0; num; count++) { - num &= num - 1; - } - - return count; -} - -/* - * For a given range of blocks, find the total number of blocks - * allocated. If 'stop_on_first' is true, it stops as soon as it - * encounters the first allocated block. 
This option is useful - * to determine if any block is allocated or not. - * - * Inputs: - * startingBlock First allocation block number of the range to be scanned. - * numBlocks Total number of blocks that need to be scanned. - * stop_on_first Stop the search after the first allocated block is found. - * - * Output: - * allocCount Total number of allocation blocks allocated in the given range. - * - * On error, it is the number of allocated blocks found - * before the function got an error. - * - * If 'stop_on_first' is set, - * allocCount = 1 if any allocated block was found. - * allocCount = 0 if no allocated block was found. - * - * Returns: - * 0 on success, non-zero on failure. - */ -static int -hfs_isallocated_internal(struct hfsmount *hfsmp, u_int32_t startingBlock, - u_int32_t numBlocks, Boolean stop_on_first, u_int32_t *allocCount) -{ - u_int32_t *currentWord; // Pointer to current word within bitmap block - u_int32_t wordsLeft; // Number of words left in this bitmap block - u_int32_t bitMask; // Word with given bits already set (ready to test) - u_int32_t firstBit; // Bit index within word of first bit to allocate - u_int32_t numBits; // Number of bits in word to allocate - u_int32_t *buffer = NULL; - uintptr_t blockRef; - u_int32_t bitsPerBlock; - u_int32_t wordsPerBlock; - u_int32_t blockCount = 0; - int error; - - if (hfs_kdebug_allocation & HFSDBG_BITMAP_ENABLED) - KERNEL_DEBUG_CONSTANT(HFSDBG_IS_ALLOCATED | DBG_FUNC_START, startingBlock, numBlocks, stop_on_first, 0, 0); - - /* - * Pre-read the bitmap block containing the first word of allocation - */ - error = ReadBitmapBlock(hfsmp, startingBlock, &buffer, &blockRef, - HFS_ALLOC_IGNORE_TENTATIVE); - if (error) - goto JustReturn; - - /* - * Initialize currentWord, and wordsLeft. 
- */ - { - u_int32_t wordIndexInBlock; - - bitsPerBlock = hfsmp->vcbVBMIOSize * kBitsPerByte; - wordsPerBlock = hfsmp->vcbVBMIOSize / kBytesPerWord; - - wordIndexInBlock = (startingBlock & (bitsPerBlock-1)) / kBitsPerWord; - currentWord = buffer + wordIndexInBlock; - wordsLeft = wordsPerBlock - wordIndexInBlock; - } - - /* - * First test any non word aligned bits. - */ - firstBit = startingBlock % kBitsPerWord; - if (firstBit != 0) { - bitMask = kAllBitsSetInWord >> firstBit; - numBits = kBitsPerWord - firstBit; - if (numBits > numBlocks) { - numBits = numBlocks; - bitMask &= ~(kAllBitsSetInWord >> (firstBit + numBits)); - } - if ((*currentWord & SWAP_BE32 (bitMask)) != 0) { - if (stop_on_first) { - blockCount = 1; - goto Exit; - } - blockCount += num_bits_set(*currentWord & SWAP_BE32 (bitMask)); - } - numBlocks -= numBits; - ++currentWord; - --wordsLeft; - } - - /* - * Test whole words (32 blocks) at a time. - */ - while (numBlocks >= kBitsPerWord) { - if (wordsLeft == 0) { - /* Read in the next bitmap block. */ - startingBlock += bitsPerBlock; - - buffer = NULL; - error = ReleaseBitmapBlock(hfsmp, blockRef, false); - if (error) goto Exit; - - error = ReadBitmapBlock(hfsmp, startingBlock, &buffer, &blockRef, - HFS_ALLOC_IGNORE_TENTATIVE); - if (error) goto Exit; - - /* Readjust currentWord and wordsLeft. */ - currentWord = buffer; - wordsLeft = wordsPerBlock; - } - if (*currentWord != 0) { - if (stop_on_first) { - blockCount = 1; - goto Exit; - } - blockCount += num_bits_set(*currentWord); - } - numBlocks -= kBitsPerWord; - ++currentWord; - --wordsLeft; - } - - /* - * Test any remaining blocks. 
- */ - if (numBlocks != 0) { - bitMask = ~(kAllBitsSetInWord >> numBlocks); - if (wordsLeft == 0) { - /* Read in the next bitmap block */ - startingBlock += bitsPerBlock; - - buffer = NULL; - error = ReleaseBitmapBlock(hfsmp, blockRef, false); - if (error) goto Exit; - - error = ReadBitmapBlock(hfsmp, startingBlock, &buffer, &blockRef, - HFS_ALLOC_IGNORE_TENTATIVE); - if (error) goto Exit; - - currentWord = buffer; - wordsLeft = wordsPerBlock; - } - if ((*currentWord & SWAP_BE32 (bitMask)) != 0) { - if (stop_on_first) { - blockCount = 1; - goto Exit; - } - blockCount += num_bits_set(*currentWord & SWAP_BE32 (bitMask)); - } - } -Exit: - if (buffer) { - (void)ReleaseBitmapBlock(hfsmp, blockRef, false); - } - if (allocCount) { - *allocCount = blockCount; - } - -JustReturn: - if (hfs_kdebug_allocation & HFSDBG_BITMAP_ENABLED) - KERNEL_DEBUG_CONSTANT(HFSDBG_IS_ALLOCATED | DBG_FUNC_END, error, 0, blockCount, 0, 0); - - return (error); -} - -/* - * Count total number of blocks that are allocated in the given - * range from the bitmap. This is used to preflight total blocks - * that need to be relocated during volume resize. - * - * The journal or allocation file lock must be held. - * - * Returns: - * 0 on success, non-zero on failure. - * On failure, allocCount is zero. - */ - int -hfs_count_allocated(struct hfsmount *hfsmp, u_int32_t startBlock, - u_int32_t numBlocks, u_int32_t *allocCount) -{ - return hfs_isallocated_internal(hfsmp, startBlock, numBlocks, false, allocCount); -} - -/* - * Test to see if any blocks in a range are allocated. - * - * Note: On error, this function returns 1, which means that - * one or more blocks in the range are allocated. This function - * is primarily used for volume resize and we do not want - * to report to the caller that the blocks are free when we - * were not able to deterministically find it out. So on error, - * we always report that the blocks are allocated. - * - * The journal or allocation file lock must be held. 
- * - * Returns - * 0 if all blocks in the range are free. - * 1 if blocks in the range are allocated, or there was an error. - */ - int -hfs_isallocated(struct hfsmount *hfsmp, u_int32_t startingBlock, u_int32_t numBlocks) -{ - int error; - u_int32_t allocCount; - - error = hfs_isallocated_internal(hfsmp, startingBlock, numBlocks, true, &allocCount); - if (error) { - /* On error, we always say that the blocks are allocated - * so that volume resize does not return false success. - */ - return 1; - } else { - /* The function was deterministically able to find out - * if there was any block allocated or not. In that case, - * the value in allocCount is good enough to be returned - * back to the caller. - */ - return allocCount; - } -} - -/* - * CONFIG_HFS_RBTREE - * Check to see if the red-black tree is live. Allocation file lock must be held - * shared or exclusive to call this function. Note that we may call this even if - * HFS is built without activating the red-black tree code. - */ -__private_extern__ -int -hfs_isrbtree_active(struct hfsmount *hfsmp){ - -#pragma unused (hfsmp) - - /* Just return 0 for now */ - return 0; -} - - - -/* Summary Table Functions */ -/* - * hfs_check_summary: - * - * This function should be used to query the summary table to see if we can - * bypass a bitmap block or not when we're trying to find a free allocation block. - * - * - * Inputs: - * allocblock - allocation block number. Will be used to infer the correct summary bit. - * hfsmp -- filesystem in question. - * - * Output Arg: - * *freeblocks - set to 1 if we believe at least one free blocks in this vcbVBMIOSize - * page of bitmap file. 
- * - * - * Returns: - * 0 on success - * EINVAL on error - * - */ - -static int hfs_check_summary (struct hfsmount *hfsmp, uint32_t allocblock, uint32_t *freeblocks) { - - int err = EINVAL; - if (hfsmp->vcbVBMIOSize) { - if (hfsmp->hfs_flags & HFS_SUMMARY_TABLE) { - uint32_t index; - if (hfs_get_summary_index (hfsmp, allocblock, &index)) { - *freeblocks = 0; - return EINVAL; - } - - /* Ok, now that we have the bit index into the array, what byte is it in ? */ - uint32_t byteindex = index / kBitsPerByte; - uint8_t current_byte = hfsmp->hfs_summary_table[byteindex]; - uint8_t bit_in_byte = index % kBitsPerByte; - - if (current_byte & (1 << bit_in_byte)) { - /* - * We do not believe there is anything free in the - * entire vcbVBMIOSize'd block. - */ - *freeblocks = 0; - } - else { - /* Looks like there might be a free block here... */ - *freeblocks = 1; - } - } - err = 0; - } - - return err; -} - - -#if 0 -/* - * hfs_get_next_summary - * - * From a given allocation block, jump to the allocation block at the start of the - * next vcbVBMIOSize boundary. This is useful when trying to quickly skip over - * large swaths of bitmap once we have determined that the bitmap is relatively full. - * - * Inputs: hfsmount, starting allocation block number - * Output Arg: *newblock will contain the allocation block number to start - * querying. - * - * Returns: - * 0 on success - * EINVAL if the block argument is too large to be used, or the summary table not live. 
- * EFBIG if there are no more summary bits to be queried - */ -static int -hfs_get_next_summary (struct hfsmount *hfsmp, uint32_t block, uint32_t *newblock) { - - u_int32_t bits_per_iosize = hfsmp->vcbVBMIOSize * kBitsPerByte; - u_int32_t start_offset; - u_int32_t next_offset; - int err = EINVAL; - - if (hfsmp->hfs_flags & HFS_SUMMARY_TABLE) { - if ((err = hfs_get_summary_index(hfsmp, block, &start_offset))) { - return err; - } - - next_offset = start_offset++; - - if ((start_offset >= hfsmp->hfs_summary_size) || (next_offset >= hfsmp->hfs_summary_size)) { - /* Can't jump to the next summary bit. */ - return EINVAL; - } - - /* Otherwise, compute and return */ - *newblock = next_offset * bits_per_iosize; - if (*newblock >= hfsmp->totalBlocks) { - return EINVAL; - } - err = 0; - } - - return err; -} - -#endif - -/* - * hfs_release_summary - * - * Given an extent that is about to be de-allocated on-disk, determine the number - * of summary bitmap bits that need to be marked as 'potentially available'. - * Then go ahead and mark them as free. - * - * Inputs: - * hfsmp - hfs mount - * block - starting allocation block. - * length - length of the extent. - * - * Returns: - * EINVAL upon any errors. 
- */ -static int hfs_release_summary(struct hfsmount *hfsmp, uint32_t start_blk, uint32_t length) { - int err = EINVAL; - uint32_t end_blk = (start_blk + length) - 1; - - if (hfsmp->hfs_flags & HFS_SUMMARY_TABLE) { - /* Figure out what the starting / ending block's summary bits are */ - uint32_t start_bit; - uint32_t end_bit; - uint32_t current_bit; - - err = hfs_get_summary_index (hfsmp, start_blk, &start_bit); - if (err) { - goto release_err; - } - err = hfs_get_summary_index (hfsmp, end_blk, &end_bit); - if (err) { - goto release_err; - } - - if (ALLOC_DEBUG) { - if (start_bit > end_bit) { - panic ("HFS: start > end!, %d %d ", start_bit, end_bit); - } - } - current_bit = start_bit; - while (current_bit <= end_bit) { - err = hfs_set_summary (hfsmp, current_bit, 0); - current_bit++; - } - } - -release_err: - return err; -} - -/* - * hfs_find_summary_free - * - * Given a allocation block as input, returns an allocation block number as output as a - * suggestion for where to start scanning the bitmap in order to find free blocks. It will - * determine the vcbVBMIOsize of the input allocation block, convert that into a summary - * bit, then keep iterating over the summary bits in order to find the first free one. 
- * - * Inputs: - * hfsmp - hfs mount - * block - starting allocation block - * newblock - output block as suggestion - * - * Returns: - * 0 on success - * ENOSPC if we could not find a free block - */ - -int hfs_find_summary_free (struct hfsmount *hfsmp, uint32_t block, uint32_t *newblock) { - - int err = ENOSPC; - uint32_t bit_index = 0; - uint32_t maybe_has_blocks = 0; - - if (hfsmp->hfs_flags & HFS_SUMMARY_TABLE) { - uint32_t byte_index; - uint8_t curbyte; - uint8_t bit_in_byte; - uint32_t summary_cap; - - /* - * We generate a cap for the summary search because the summary table - * always represents a full summary of the bitmap FILE, which may - * be way more bits than are necessary for the actual filesystem - * whose allocations are mapped by the bitmap. - * - * Compute how much of hfs_summary_size is useable for the given number - * of allocation blocks eligible on this FS. - */ - err = hfs_get_summary_index (hfsmp, hfsmp->allocLimit - 1, &summary_cap); - if (err) { - goto summary_exit; - } - - /* Check the starting block first */ - err = hfs_check_summary (hfsmp, block, &maybe_has_blocks); - if (err) { - goto summary_exit; - } - - if (maybe_has_blocks) { - /* - * It looks like the initial start block could have something. - * Short-circuit and just use that. - */ - *newblock = block; - goto summary_exit; - } - - /* - * OK, now we know that the first block was useless. - * Get the starting summary bit, and find it in the array - */ - maybe_has_blocks = 0; - err = hfs_get_summary_index (hfsmp, block, &bit_index); - if (err) { - goto summary_exit; - } - - /* Iterate until we find something. */ - while (bit_index <= summary_cap) { - byte_index = bit_index / kBitsPerByte; - curbyte = hfsmp->hfs_summary_table[byte_index]; - bit_in_byte = bit_index % kBitsPerByte; - - if (curbyte & (1 << bit_in_byte)) { - /* nothing here. increment and move on */ - bit_index++; - } - else { - /* - * found something! convert bit_index back into - * an allocation block for use. 
'newblock' will now - * contain the proper allocation block # based on the bit - * index. - */ - err = hfs_get_summary_allocblock (hfsmp, bit_index, newblock); - if (err) { - goto summary_exit; - } - maybe_has_blocks = 1; - break; - } - } - - /* If our loop didn't find anything, set err to ENOSPC */ - if (maybe_has_blocks == 0) { - err = ENOSPC; - } - } - - /* If the summary table is not active for this mount, we'll just return ENOSPC */ -summary_exit: - if (maybe_has_blocks) { - err = 0; - } - - return err; -} - -/* - * hfs_get_summary_allocblock - * - * Convert a summary bit into an allocation block number to use to start searching for free blocks. - * - * Inputs: - * hfsmp - hfs mount - * summarybit - summmary bit index - * *alloc - allocation block number in the bitmap file. - * - * Output: - * 0 on success - * EINVAL on failure - */ -int hfs_get_summary_allocblock (struct hfsmount *hfsmp, uint32_t - summarybit, uint32_t *alloc) { - uint32_t bits_per_iosize = hfsmp->vcbVBMIOSize * kBitsPerByte; - uint32_t allocblk; - - allocblk = summarybit * bits_per_iosize; - - if (allocblk >= hfsmp->totalBlocks) { - return EINVAL; - } - else { - *alloc = allocblk; - } - - return 0; -} - - -/* - * hfs_set_summary: - * - * This function should be used to manipulate the summary table - * - * The argument 'inuse' will set the value of the bit in question to one or zero - * depending on its value. - * - * Inputs: - * hfsmp - hfs mount - * summarybit - the bit index into the summary table to set/unset. - * inuse - the value to assign to the bit. 
- * - * Returns: - * 0 on success - * EINVAL on error - * - */ - -static int hfs_set_summary (struct hfsmount *hfsmp, uint32_t summarybit, uint32_t inuse) { - - int err = EINVAL; - if (hfsmp->vcbVBMIOSize) { - if (hfsmp->hfs_flags & HFS_SUMMARY_TABLE) { - - if (ALLOC_DEBUG) { - if (hfsmp->hfs_summary_table == NULL) { - panic ("hfs_set_summary: no table for %p ", hfsmp); - } - } - - /* Ok, now that we have the bit index into the array, what byte is it in ? */ - uint32_t byte_index = summarybit / kBitsPerByte; - uint8_t current_byte = hfsmp->hfs_summary_table[byte_index]; - uint8_t bit_in_byte = summarybit % kBitsPerByte; - - if (inuse) { - current_byte = (current_byte | (1 << bit_in_byte)); - } - else { - current_byte = (current_byte & ~(1 << bit_in_byte)); - } - - hfsmp->hfs_summary_table[byte_index] = current_byte; - } - err = 0; - } - - return err; -} - - -/* - * hfs_get_summary_index: - * - * This is a helper function which determines what summary bit represents the vcbVBMIOSize worth - * of IO against the bitmap file. - * - * Returns: - * 0 on success - * EINVAL on failure - */ -static int hfs_get_summary_index (struct hfsmount *hfsmp, uint32_t block, uint32_t* index) { - uint32_t summary_bit; - uint32_t bits_per_iosize; - int err = EINVAL; - - if (hfsmp->hfs_flags & HFS_SUMMARY_TABLE) { - /* Is the input block bigger than the total number of blocks? */ - if (block >= hfsmp->totalBlocks) { - return EINVAL; - } - - /* Is there even a vbmIOSize set? */ - if (hfsmp->vcbVBMIOSize == 0) { - return EINVAL; - } - - bits_per_iosize = hfsmp->vcbVBMIOSize * kBitsPerByte; - - summary_bit = block / bits_per_iosize; - - *index = summary_bit; - err = 0; - } - - return err; -} - -/* - * hfs_init_summary - * - * From a given mount structure, compute how big the summary table should be for the given - * filesystem, then allocate and bzero the memory. 
- * - * Returns: - * 0 on success - * EINVAL on failure - */ -int -hfs_init_summary (struct hfsmount *hfsmp) { - - uint32_t summary_size; - uint32_t summary_size_bytes; - uint8_t *summary_table; - - if (hfsmp->hfs_allocation_cp == NULL) { - if (ALLOC_DEBUG) { - printf("hfs: summary table cannot progress without a bitmap cnode! \n"); - } - return EINVAL; - } - /* - * The practical maximum size of the summary table is 16KB: - * - * (512MB maximum bitmap size / (4k -- min alloc block size)) / 8 bits/byte. - * - * HFS+ will allow filesystems with allocation block sizes smaller than 4k, but - * the end result is that we'll start to issue I/O in 2k or 1k sized chunks, which makes - * supporting this much worse. The math would instead look like this: - * (512MB / 2k) / 8 == 32k. - * - * So, we will disallow the summary table if the allocation block size is < 4k. - */ - - if (hfsmp->blockSize < HFS_MIN_SUMMARY_BLOCKSIZE) { - printf("hfs: summary table not allowed on FS with block size of %d\n", hfsmp->blockSize); - return EINVAL; - } - - summary_size = hfsmp->hfs_allocation_cp->c_blocks; - - if (ALLOC_DEBUG) { - printf("HFS Summary Table Initialization: Bitmap %u blocks\n", - hfsmp->hfs_allocation_cp->c_blocks); - } - - /* - * If the bitmap IO size is not the same as the allocation block size then - * then re-compute the number of summary bits necessary. Note that above, the - * the default size is the number of allocation blocks in the bitmap *FILE* - * (not the number of bits in the bitmap itself). If the allocation block size - * is large enough though, we may need to increase this. 
- */ - if (hfsmp->blockSize != hfsmp->vcbVBMIOSize) { - uint64_t lrg_size = (uint64_t) hfsmp->hfs_allocation_cp->c_blocks * (uint64_t) hfsmp->blockSize; - lrg_size = lrg_size / (uint64_t)hfsmp->vcbVBMIOSize; - - /* With a full bitmap and 64k-capped iosize chunks, this would be 64k */ - summary_size = (uint32_t) lrg_size; - } - - /* - * If the block size is the same as the IO Size, then the total number of blocks - * is already equal to the number of IO units, which is our number of summary bits. - */ - - summary_size_bytes = summary_size / kBitsPerByte; - /* Always add one byte, just in case we have a dangling number of bits */ - summary_size_bytes++; - - if (ALLOC_DEBUG) { - printf("HFS Summary Table: vcbVBMIOSize %d summary bits %d \n", hfsmp->vcbVBMIOSize, summary_size); - printf("HFS Summary Table Size (in bytes) %d \n", summary_size_bytes); - } - - /* Store the field in the mount point, and then MALLOC/bzero the memory */ - hfsmp->hfs_summary_size = summary_size; - hfsmp->hfs_summary_bytes = summary_size_bytes; - - MALLOC (summary_table, uint8_t*, summary_size_bytes, M_TEMP, M_WAITOK); - if (summary_table == NULL) { - return ENOMEM; - } - bzero (summary_table, summary_size_bytes); - - /* enable the summary table */ - hfsmp->hfs_flags |= HFS_SUMMARY_TABLE; - hfsmp->hfs_summary_table = summary_table; - - if (ALLOC_DEBUG) { - if (hfsmp->hfs_summary_table == NULL) { - panic ("HFS Summary Init: no table for %p\n", hfsmp); - } - } - return 0; -} - -/* - * hfs_rebuild_summary - * - * This function should be used to allocate a new hunk of memory for use as a summary - * table, then copy the existing data into it. We use it whenever the filesystem's size - * changes. When a resize is in progress, you can still use the extant summary - * table if it is active. - * - * Inputs: - * hfsmp -- FS in question - * newlength -- new length of the FS in allocation blocks. - * - * Outputs: - * 0 on success, EINVAL on failure. 
If this function fails, the summary table - * will be disabled for future use. - * - */ -static int hfs_rebuild_summary (struct hfsmount *hfsmp) { - - uint32_t new_summary_size; - - new_summary_size = hfsmp->hfs_allocation_cp->c_blocks; - - - if (ALLOC_DEBUG) { - printf("HFS Summary Table Re-init: bitmap %u blocks\n", new_summary_size); - } - - /* - * If the bitmap IO size is not the same as the allocation block size, then re-compute - * the number of summary bits necessary. Note that above, the default size is the number - * of allocation blocks in the bitmap *FILE* (not the number of bits that the bitmap manages). - * If the allocation block size is large enough though, we may need to increase this, as - * bitmap IO is capped at 64k per IO - */ - if (hfsmp->blockSize != hfsmp->vcbVBMIOSize) { - uint64_t lrg_size = (uint64_t) hfsmp->hfs_allocation_cp->c_blocks * (uint64_t) hfsmp->blockSize; - lrg_size = lrg_size / (uint64_t)hfsmp->vcbVBMIOSize; - - /* With a full bitmap and 64k-capped iosize chunks, this would be 64k */ - new_summary_size = (uint32_t) lrg_size; - } - - /* - * Ok, we have the new summary bitmap theoretical max size. See if it's the same as - * what we've got already... - */ - if (new_summary_size != hfsmp->hfs_summary_size) { - uint32_t summarybytes = new_summary_size / kBitsPerByte; - uint32_t copysize; - uint8_t *newtable; - /* Add one byte for slop */ - summarybytes++; - - if (ALLOC_DEBUG) { - printf("HFS Summary Table: vcbVBMIOSize %d summary bits %d \n", hfsmp->vcbVBMIOSize, new_summary_size); - printf("HFS Summary Table Size (in bytes) %d \n", summarybytes); - } - - /* Attempt to MALLOC the memory */ - MALLOC (newtable, uint8_t*, summarybytes, M_TEMP, M_WAITOK); - if (newtable == NULL) { - /* - * ERROR! 
We need to disable the table now - */ - FREE (hfsmp->hfs_summary_table, M_TEMP); - hfsmp->hfs_summary_table = NULL; - hfsmp->hfs_flags &= ~HFS_SUMMARY_TABLE; - return EINVAL; - } - bzero (newtable, summarybytes); - - /* - * The new table may be smaller than the old one. If this is true, then - * we can't copy the full size of the existing summary table into the new - * one. - * - * The converse is not an issue since we bzeroed the table above. - */ - copysize = hfsmp->hfs_summary_bytes; - if (summarybytes < hfsmp->hfs_summary_bytes) { - copysize = summarybytes; - } - memcpy (newtable, hfsmp->hfs_summary_table, copysize); - - /* We're all good. Destroy the old copy and update ptrs */ - FREE (hfsmp->hfs_summary_table, M_TEMP); - - hfsmp->hfs_summary_table = newtable; - hfsmp->hfs_summary_size = new_summary_size; - hfsmp->hfs_summary_bytes = summarybytes; - } - - return 0; -} - - -#if ALLOC_DEBUG -/* - * hfs_validate_summary - * - * Validation routine for the summary table. Debug-only function. - * - * Bitmap lock must be held. - * - */ -void hfs_validate_summary (struct hfsmount *hfsmp) { - uint32_t i; - int err; - - /* - * Iterate over all of the bits in the summary table, and verify if - * there really are free blocks in the pages that we believe may - * may contain free blocks. - */ - - if (hfsmp->hfs_summary_table == NULL) { - panic ("HFS Summary: No HFS summary table!"); - } - - /* 131072 bits == 16384 bytes. This is the theoretical max size of the summary table. we add 1 byte for slop */ - if (hfsmp->hfs_summary_size == 0 || hfsmp->hfs_summary_size > 131080) { - panic("HFS Summary: Size is bad! 
%d", hfsmp->hfs_summary_size); - } - - if (hfsmp->vcbVBMIOSize == 0) { - panic("HFS Summary: no VCB VBM IO Size !"); - } - - printf("hfs: summary validation beginning on %s\n", hfsmp->vcbVN); - printf("hfs: summary validation %d summary bits, %d summary blocks\n", hfsmp->hfs_summary_size, hfsmp->totalBlocks); - - - /* iterate through all possible summary bits */ - for (i = 0; i < hfsmp->hfs_summary_size ; i++) { - - uint32_t bits_per_iosize = hfsmp->vcbVBMIOSize * kBitsPerByte; - uint32_t byte_offset = hfsmp->vcbVBMIOSize * i; - - /* Compute the corresponding allocation block for the summary bit. */ - uint32_t alloc_block = i * bits_per_iosize; - - /* - * We use a uint32_t pointer here because it will speed up - * access to the real bitmap data on disk. - */ - uint32_t *block_data; - struct buf *bp; - int counter; - int counter_max; - int saw_free_bits = 0; - - /* Get the block */ - if ((err = ReadBitmapRange (hfsmp, byte_offset, hfsmp->vcbVBMIOSize, &block_data, &bp))) { - panic ("HFS Summary: error (%d) in ReadBitmapRange!", err); - } - - /* Query the status of the bit and then make sure we match */ - uint32_t maybe_has_free_blocks; - err = hfs_check_summary (hfsmp, alloc_block, &maybe_has_free_blocks); - if (err) { - panic ("HFS Summary: hfs_check_summary returned error (%d) ", err); - } - counter_max = hfsmp->vcbVBMIOSize / kBytesPerWord; - - for (counter = 0; counter < counter_max; counter++) { - uint32_t word = block_data[counter]; - - /* We assume that we'll not find any free bits here. */ - if (word != kAllBitsSetInWord) { - if (maybe_has_free_blocks) { - /* All done */ - saw_free_bits = 1; - break; - } - else { - panic ("HFS Summary: hfs_check_summary saw free bits!"); - } - } - } - - if (maybe_has_free_blocks && (saw_free_bits == 0)) { - panic ("HFS Summary: did not see free bits !"); - } - - /* Release the block. 
*/ - if ((err = ReleaseScanBitmapRange (bp))) { - panic ("HFS Summary: Error (%d) in ReleaseScanBitmapRange", err); - } - } - - printf("hfs: summary validation completed successfully on %s\n", hfsmp->vcbVN); - - return; -} -#endif - -/* - * hfs_alloc_scan_range: - * - * This function should be used to scan large ranges of the allocation bitmap - * at one time. It makes two key assumptions: - * - * 1) Bitmap lock is held during the duration of the call (exclusive) - * 2) There are no pages in the buffer cache for any of the bitmap - * blocks that we may encounter. It *MUST* be completely empty. - * - * The expected use case is when we are scanning the bitmap in full while we are - * still mounting the filesystem in order to issue TRIMs or build up the summary - * table for the mount point. It should be done after any potential journal replays - * are completed and their I/Os fully issued. - * - * The key reason for assumption (2) above is that this function will try to issue - * I/O against the bitmap file in chunks as large a possible -- essentially as - * much as the buffer layer will handle (1MB). Because the size of these I/Os - * is larger than what would be expected during normal runtime we must invalidate - * the buffers as soon as we are done with them so that they do not persist in - * the buffer cache for other threads to find, as they'll typically be doing - * allocation-block size I/Os instead. - * - * Input Args: - * hfsmp - hfs mount data structure - * startbit - allocation block # to start our scan. It must be aligned - * on a vcbVBMIOsize boundary. - * list - journal trim list data structure for issuing TRIMs - * - * Output Args: - * bitToScan - Return the next bit to scan if this function is called again. - * Caller will supply this into the next invocation - * of this call as 'startbit'. 
- */ - -static int hfs_alloc_scan_range(struct hfsmount *hfsmp, u_int32_t startbit, - u_int32_t *bitToScan, struct jnl_trim_list *list) { - - int error; - int readwrite = 1; - u_int32_t curAllocBlock; - struct buf *blockRef = NULL; - u_int32_t *buffer = NULL; - u_int32_t free_offset = 0; //tracks the start of the current free range - u_int32_t size = 0; // tracks the length of the current free range. - u_int32_t iosize = 0; //how much io we should generate against the bitmap - u_int32_t byte_off; // byte offset into the bitmap file. - u_int32_t completed_size; // how much io was actually completed - u_int32_t last_bitmap_block; - u_int32_t current_word; - u_int32_t word_index = 0; - - /* summary table building */ - uint32_t summary_bit = 0; - uint32_t saw_free_blocks = 0; - uint32_t last_marked = 0; - - if (hfsmp->hfs_flags & HFS_READ_ONLY) { - readwrite = 0; - } - - /* - * Compute how much I/O we should generate here. - * hfs_scan_range_size will validate that the start bit - * converted into a byte offset into the bitmap file, - * is aligned on a VBMIOSize boundary. - */ - error = hfs_scan_range_size (hfsmp, startbit, &iosize); - if (error) { - if (ALLOC_DEBUG) { - panic ("hfs_alloc_scan_range: hfs_scan_range_size error %d\n", error); - } - return error; - } - - if (iosize < hfsmp->vcbVBMIOSize) { - if (ALLOC_DEBUG) { - panic ("hfs_alloc_scan_range: iosize too small! (iosize %d)\n", iosize); - } - return EINVAL; - } - - /* hfs_scan_range_size should have verified startbit. Convert it to bytes */ - byte_off = startbit / kBitsPerByte; - - /* - * When the journal replays blocks, it does so by writing directly to the disk - * device (bypassing any filesystem vnodes and such). When it finishes its I/Os - * it also immediately re-reads and invalidates the range covered by the bp so - * it does not leave anything lingering in the cache (for iosize reasons). - * - * As such, it is safe to do large I/Os here with ReadBitmapRange. 
- * - * NOTE: It is not recommended, but it is possible to call the function below - * on sections of the bitmap that may be in core already as long as the pages are not - * dirty. In that case, we'd notice that something starting at that - * logical block of the bitmap exists in the metadata cache, and we'd check - * if the iosize requested is the same as what was already allocated for it. - * Odds are pretty good we're going to request something larger. In that case, - * we just free the existing memory associated with the buf and reallocate a - * larger range. This function should immediately invalidate it as soon as we're - * done scanning, so this shouldn't cause any coherency issues. - */ - - error = ReadBitmapRange(hfsmp, byte_off, iosize, &buffer, &blockRef); - if (error) { - if (ALLOC_DEBUG) { - panic ("hfs_alloc_scan_range: start %d iosize %d ReadBitmapRange error %d\n", startbit, iosize, error); - } - return error; - } - - /* - * At this point, we have a giant wired buffer that represents some portion of - * the bitmap file that we want to analyze. We may not have gotten all 'iosize' - * bytes though, so clip our ending bit to what we actually read in. - */ - completed_size = buf_count(blockRef); - last_bitmap_block = completed_size * kBitsPerByte; - last_bitmap_block = last_bitmap_block + startbit; - - /* Cap the last block to the total number of blocks if required */ - if (last_bitmap_block > hfsmp->totalBlocks) { - last_bitmap_block = hfsmp->totalBlocks; - } - - /* curAllocBlock represents the logical block we're analyzing. 
*/ - curAllocBlock = startbit; - word_index = 0; - size = 0; - - if (hfsmp->hfs_flags & HFS_SUMMARY_TABLE) { - if (hfs_get_summary_index (hfsmp, startbit, &summary_bit)) { - error = EINVAL; - if (ALLOC_DEBUG) { - panic ("hfs_alloc_scan_range: Could not acquire summary index for %u", startbit); - } - return error; - } - /* - * summary_bit should now be set to the summary bit corresponding to - * the allocation block of the first bit that we're supposed to scan - */ - } - saw_free_blocks = 0; - - while (curAllocBlock < last_bitmap_block) { - u_int32_t bit; - - /* Update the summary table as needed */ - if (hfsmp->hfs_flags & HFS_SUMMARY_TABLE) { - if (ALLOC_DEBUG) { - if (hfsmp->hfs_summary_table == NULL) { - panic ("hfs_alloc_scan_range: no summary table!"); - } - } - - uint32_t temp_summary; - error = hfs_get_summary_index (hfsmp, curAllocBlock, &temp_summary); - if (error) { - if (ALLOC_DEBUG) { - panic ("hfs_alloc_scan_range: could not get summary index for %u", curAllocBlock); - } - return EINVAL; - } - - if (ALLOC_DEBUG) { - if (temp_summary < summary_bit) { - panic ("hfs_alloc_scan_range: backwards summary bit?\n"); - } - } - - /* - * If temp_summary is greater than summary_bit, then this - * means that the next allocation block crosses a vcbVBMIOSize boundary - * and we should treat this range of on-disk data as part of a new summary - * bit. - */ - if (temp_summary > summary_bit) { - if (saw_free_blocks == 0) { - /* Mark the bit as totally consumed in the summary table */ - hfs_set_summary (hfsmp, summary_bit, 1); - } - else { - /* Mark the bit as potentially free in summary table */ - hfs_set_summary (hfsmp, summary_bit, 0); - } - last_marked = summary_bit; - /* - * Any time we set the summary table, update our counter which tracks - * what the last bit that was fully marked in the summary table. - * - * Then reset our marker which says we haven't seen a free bit yet. 
- */ - saw_free_blocks = 0; - summary_bit = temp_summary; - } - } /* End summary table conditions */ - - current_word = SWAP_BE32(buffer[word_index]); - /* Iterate through the word 1 bit at a time... */ - for (bit = 0 ; bit < kBitsPerWord ; bit++, curAllocBlock++) { - if (curAllocBlock >= last_bitmap_block) { - break; - } - u_int32_t allocated = (current_word & (kHighBitInWordMask >> bit)); - - if (allocated) { - if (size != 0) { - if (readwrite) { - /* Insert the previously tracked range of free blocks to the trim list */ - hfs_track_unmap_blocks (hfsmp, free_offset, size, list); - } - add_free_extent_cache (hfsmp, free_offset, size); - size = 0; - free_offset = 0; - } - } - else { - /* Not allocated */ - size++; - if (free_offset == 0) { - /* Start a new run of free spcae at curAllocBlock */ - free_offset = curAllocBlock; - } - if (saw_free_blocks == 0) { - saw_free_blocks = 1; - } - } - } /* end for loop iterating through the word */ - - if (curAllocBlock < last_bitmap_block) { - word_index++; - } - - } /* End while loop (iterates through last_bitmap_block) */ - - - /* - * We've (potentially) completed our pass through this region of bitmap, - * but one thing we may not have done is updated that last summary bit for - * the last page we scanned, because we would have never transitioned across - * a vcbVBMIOSize boundary again. Check for that and update the last bit - * as needed. - * - * Note that 'last_bitmap_block' is *not* inclusive WRT the very last bit in the bitmap - * for the region of bitmap on-disk that we were scanning. (it is one greater). 
- */ - if ((curAllocBlock >= last_bitmap_block) && - (hfsmp->hfs_flags & HFS_SUMMARY_TABLE)) { - uint32_t temp_summary; - /* temp_block should be INSIDE the region we just scanned, so subtract 1 */ - uint32_t temp_block = last_bitmap_block - 1; - error = hfs_get_summary_index (hfsmp, temp_block, &temp_summary); - if (error) { - if (ALLOC_DEBUG) { - panic ("hfs_alloc_scan_range: end bit curAllocBlock %u, last_bitmap_block %u", curAllocBlock, last_bitmap_block); - } - return EINVAL; - } - - /* Did we already update this in the table? */ - if (temp_summary > last_marked) { - if (saw_free_blocks == 0) { - hfs_set_summary (hfsmp, temp_summary, 1); - } - else { - hfs_set_summary (hfsmp, temp_summary, 0); - } - } - } - - /* - * We may have been tracking a range of free blocks that hasn't been inserted yet. - * Keep the logic for the TRIM and free extent separate from that of the summary - * table management even though they are closely linked. - */ - if (size != 0) { - if (readwrite) { - hfs_track_unmap_blocks (hfsmp, free_offset, size, list); - } - add_free_extent_cache (hfsmp, free_offset, size); - } - - /* - * curAllocBlock represents the next block we need to scan when we return - * to this function. - */ - *bitToScan = curAllocBlock; - ReleaseScanBitmapRange(blockRef); - - return 0; - -} - - - -/* - * Compute the maximum I/O size to generate against the bitmap file - * Will attempt to generate at LEAST VBMIOsize I/Os for interior ranges of the bitmap. - * - * Inputs: - * hfsmp -- hfsmount to look at - * bitmap_off -- bit offset into the bitmap file - * - * Outputs: - * iosize -- iosize to generate. - * - * Returns: - * 0 on success; EINVAL otherwise - */ -static int hfs_scan_range_size (struct hfsmount *hfsmp, uint32_t bitmap_st, uint32_t *iosize) { - - /* - * The maximum bitmap size is 512MB regardless of ABN size, so we can get away - * with 32 bit math in this function. 
- */ - - uint32_t bitmap_len; - uint32_t remaining_bitmap; - uint32_t target_iosize; - uint32_t bitmap_off; - - /* Is this bit index not word aligned? If so, immediately fail. */ - if (bitmap_st % kBitsPerWord) { - if (ALLOC_DEBUG) { - panic ("hfs_scan_range_size unaligned start bit! bitmap_st %d \n", bitmap_st); - } - return EINVAL; - } - - /* bitmap_off is in bytes, not allocation blocks/bits */ - bitmap_off = bitmap_st / kBitsPerByte; - - if ((hfsmp->totalBlocks <= bitmap_st) || (bitmap_off > (512 * 1024 * 1024))) { - if (ALLOC_DEBUG) { - panic ("hfs_scan_range_size: invalid start! bitmap_st %d, bitmap_off %d\n", bitmap_st, bitmap_off); - } - return EINVAL; - } - - /* - * Also invalid if it's not at least aligned to HFS bitmap logical - * block boundaries. We don't have to emit an iosize that's an - * exact multiple of the VBMIOSize, but it must start on such - * a boundary. - * - * The vcbVBMIOSize may be SMALLER than the allocation block size - * on a FS with giant allocation blocks, but it will never be - * greater than it, so it should be safe to start I/O - * aligned on a VBMIOsize boundary. - */ - if (bitmap_off & (hfsmp->vcbVBMIOSize - 1)) { - if (ALLOC_DEBUG) { - panic ("hfs_scan_range_size: unaligned start! bitmap_off %d\n", bitmap_off); - } - return EINVAL; - } - - /* - * Generate the total bitmap file length in bytes, then round up - * that value to the end of the last allocation block, if needed (It - * will probably be needed). We won't scan past the last actual - * allocation block. - * - * Unless we're completing the bitmap scan (or bitmap < 1MB), we - * have to complete the I/O on VBMIOSize boundaries, but we can only read - * up until the end of the bitmap file. - */ - bitmap_len = roundup(hfsmp->totalBlocks, hfsmp->blockSize * 8) / 8; - - remaining_bitmap = bitmap_len - bitmap_off; - - /* - * io size is the MIN of the maximum I/O we can generate or the - * remaining amount of bitmap. 
- */ - target_iosize = MIN((MAXBSIZE), remaining_bitmap); - *iosize = target_iosize; - - return 0; -} - - - - -/* - * This function is basically the same as hfs_isallocated, except it's designed for - * use with the red-black tree validation code. It assumes we're only checking whether - * one bit is active, and that we're going to pass in the buf to use, since GenerateTree - * calls ReadBitmapBlock and will have that buf locked down for the duration of its operation. - * - * This should not be called in general purpose scanning code. - */ -int hfs_isallocated_scan(struct hfsmount *hfsmp, u_int32_t startingBlock, u_int32_t *bp_buf) { - - u_int32_t *currentWord; // Pointer to current word within bitmap block - u_int32_t bitMask; // Word with given bits already set (ready to test) - u_int32_t firstBit; // Bit index within word of first bit to allocate - u_int32_t numBits; // Number of bits in word to allocate - u_int32_t bitsPerBlock; - uintptr_t blockRef = 0; - u_int32_t wordsPerBlock; - u_int32_t numBlocks = 1; - u_int32_t *buffer = NULL; - - int inuse = 0; - int error; - - - if (bp_buf) { - /* just use passed-in buffer if avail. */ - buffer = bp_buf; - } - else { - /* - * Pre-read the bitmap block containing the first word of allocation - */ - error = ReadBitmapBlock(hfsmp, startingBlock, &buffer, &blockRef, - HFS_ALLOC_IGNORE_TENTATIVE); - if (error) - return (error); - } - - /* - * Initialize currentWord, and wordsLeft. - */ - u_int32_t wordIndexInBlock; - - bitsPerBlock = hfsmp->vcbVBMIOSize * kBitsPerByte; - wordsPerBlock = hfsmp->vcbVBMIOSize / kBytesPerWord; - - wordIndexInBlock = (startingBlock & (bitsPerBlock-1)) / kBitsPerWord; - currentWord = buffer + wordIndexInBlock; - - /* - * First test any non word aligned bits. 
- */ - firstBit = startingBlock % kBitsPerWord; - bitMask = kAllBitsSetInWord >> firstBit; - numBits = kBitsPerWord - firstBit; - if (numBits > numBlocks) { - numBits = numBlocks; - bitMask &= ~(kAllBitsSetInWord >> (firstBit + numBits)); - } - if ((*currentWord & SWAP_BE32 (bitMask)) != 0) { - inuse = 1; - goto Exit; - } - numBlocks -= numBits; - ++currentWord; - -Exit: - if(bp_buf == NULL) { - if (buffer) { - (void)ReleaseBitmapBlock(hfsmp, blockRef, false); - } - } - return (inuse); - - - -} - -/* - * This function resets all of the data structures relevant to the - * free extent cache stored in the hfsmount struct. - * - * If we are using the red-black tree code then we need to account for the fact that - * we may encounter situations where we need to jettison the tree. If that is the - * case, then we fail-over to the bitmap scanning logic, but we need to ensure that - * the free ext cache is zeroed before we start using it. - * - * We also reset and disable the cache when allocLimit is updated... which - * is when a volume is being resized (via hfs_truncatefs() or hfs_extendfs()). - * It is independent of the type of allocator being used currently. 
- */ -void ResetVCBFreeExtCache(struct hfsmount *hfsmp) -{ - int bytes; - void *freeExt; - - if (hfs_kdebug_allocation & HFSDBG_EXT_CACHE_ENABLED) - KERNEL_DEBUG_CONSTANT(HFSDBG_RESET_EXTENT_CACHE | DBG_FUNC_START, 0, 0, 0, 0, 0); - - lck_spin_lock(&hfsmp->vcbFreeExtLock); - - /* reset Free Extent Count */ - hfsmp->vcbFreeExtCnt = 0; - - /* reset the actual array */ - bytes = kMaxFreeExtents * sizeof(HFSPlusExtentDescriptor); - freeExt = (void*)(hfsmp->vcbFreeExt); - - bzero (freeExt, bytes); - - lck_spin_unlock(&hfsmp->vcbFreeExtLock); - - if (hfs_kdebug_allocation & HFSDBG_EXT_CACHE_ENABLED) - KERNEL_DEBUG_CONSTANT(HFSDBG_RESET_EXTENT_CACHE | DBG_FUNC_END, 0, 0, 0, 0, 0); - - return; -} - -/* - * This function is used to inform the allocator if we have to effectively shrink - * or grow the total number of allocation blocks via hfs_truncatefs or hfs_extendfs. - * - * The bitmap lock must be held when calling this function. This function also modifies the - * allocLimit field in the hfs mount point structure in the general case. - * - * In the shrinking case, we'll have to remove all free extents from the red-black - * tree past the specified offset new_end_block. In the growth case, we'll have to force - * a re-scan of the new allocation blocks from our current allocLimit to the new end block. - * - * new_end_block represents the total number of blocks available for allocation in the resized - * filesystem. Block #new_end_block should not be allocatable in the resized filesystem since it - * will be out of the (0, n-1) range that are indexable in the bitmap. - * - * Returns 0 on success - * errno on failure - */ -__private_extern__ -u_int32_t UpdateAllocLimit (struct hfsmount *hfsmp, u_int32_t new_end_block) { - - /* - * Update allocLimit to the argument specified - */ - hfsmp->allocLimit = new_end_block; - - /* Invalidate the free extent cache completely so that - * it does not have any extents beyond end of current - * volume. 
- */ - ResetVCBFreeExtCache(hfsmp); - - /* Force a rebuild of the summary table. */ - (void) hfs_rebuild_summary (hfsmp); - - // Delete any tentative ranges that are in the area we're shrinking - struct rl_entry *range, *next_range; - TAILQ_FOREACH_SAFE(range, &hfsmp->hfs_reserved_ranges[HFS_TENTATIVE_BLOCKS], - rl_link, next_range) { - if (rl_overlap(range, new_end_block, RL_INFINITY) != RL_NOOVERLAP) - hfs_release_reserved(hfsmp, range, HFS_TENTATIVE_BLOCKS); - } - - return 0; -} - -/* - * Remove an extent from the list of free extents. - * - * This is a low-level routine. It does not handle overlaps or splitting; - * that is the responsibility of the caller. The input extent must exactly - * match an extent already in the list; it will be removed, and any following - * extents in the list will be shifted up. - * - * Inputs: - * startBlock - Start of extent to remove - * blockCount - Number of blocks in extent to remove - * - * Result: - * The index of the extent that was removed. - */ -static void remove_free_extent_list(struct hfsmount *hfsmp, int index) -{ - if (index < 0 || (uint32_t)index >= hfsmp->vcbFreeExtCnt) { - if (ALLOC_DEBUG) - panic("hfs: remove_free_extent_list: %p: index (%d) out of range (0, %u)", hfsmp, index, hfsmp->vcbFreeExtCnt); - else - printf("hfs: remove_free_extent_list: %p: index (%d) out of range (0, %u)", hfsmp, index, hfsmp->vcbFreeExtCnt); - return; - } - int shift_count = hfsmp->vcbFreeExtCnt - index - 1; - if (shift_count > 0) { - memmove(&hfsmp->vcbFreeExt[index], &hfsmp->vcbFreeExt[index+1], shift_count * sizeof(hfsmp->vcbFreeExt[0])); - } - hfsmp->vcbFreeExtCnt--; -} - - -/* - * Add an extent to the list of free extents. - * - * This is a low-level routine. It does not handle overlaps or coalescing; - * that is the responsibility of the caller. This routine *does* make - * sure that the extent it is adding is inserted in the correct location. 
- * If the list is full, this routine will handle either removing the last - * extent in the list to make room for the new extent, or ignoring the - * new extent if it is "worse" than the last extent in the list. - * - * Inputs: - * startBlock - Start of extent to add - * blockCount - Number of blocks in extent to add - * - * Result: - * The index where the extent that was inserted, or kMaxFreeExtents - * if the extent was not inserted (the list was full, and the extent - * being added was "worse" than everything in the list). - */ -static int add_free_extent_list(struct hfsmount *hfsmp, u_int32_t startBlock, u_int32_t blockCount) -{ - uint32_t i; - - /* ALLOC_DEBUG: Make sure no extents in the list overlap or are contiguous with the input extent. */ - if (ALLOC_DEBUG) { - uint32_t endBlock = startBlock + blockCount; - for (i = 0; i < hfsmp->vcbFreeExtCnt; ++i) { - if (endBlock < hfsmp->vcbFreeExt[i].startBlock || - startBlock > (hfsmp->vcbFreeExt[i].startBlock + hfsmp->vcbFreeExt[i].blockCount)) { - continue; - } - panic("hfs: add_free_extent_list: %p: extent(%u %u) overlaps existing extent (%u %u) at index %d", - hfsmp, startBlock, blockCount, hfsmp->vcbFreeExt[i].startBlock, hfsmp->vcbFreeExt[i].blockCount, i); - } - } - - /* Figure out what index the new extent should be inserted at. */ - for (i = 0; i < hfsmp->vcbFreeExtCnt; ++i) { - if (hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) { - /* The list is sorted by increasing offset. */ - if (startBlock < hfsmp->vcbFreeExt[i].startBlock) { - break; - } - } else { - /* The list is sorted by decreasing size. */ - if (blockCount > hfsmp->vcbFreeExt[i].blockCount) { - break; - } - } - } - - /* When we get here, i is the index where the extent should be inserted. */ - if (i == kMaxFreeExtents) { - /* - * The new extent is worse than anything already in the list, - * and the list is full, so just ignore the extent to be added. - */ - return i; - } - - /* - * Grow the list (if possible) to make room for an insert. 
- */ - if (hfsmp->vcbFreeExtCnt < kMaxFreeExtents) - hfsmp->vcbFreeExtCnt++; - - /* - * If we'll be keeping any extents after the insert position, then shift them. - */ - int shift_count = hfsmp->vcbFreeExtCnt - i - 1; - if (shift_count > 0) { - memmove(&hfsmp->vcbFreeExt[i+1], &hfsmp->vcbFreeExt[i], shift_count * sizeof(hfsmp->vcbFreeExt[0])); - } - - /* Finally, store the new extent at its correct position. */ - hfsmp->vcbFreeExt[i].startBlock = startBlock; - hfsmp->vcbFreeExt[i].blockCount = blockCount; - return i; -} - - -/* - * Remove an entry from free extent cache after it has been allocated. - * - * This is a high-level routine. It handles removing a portion of a - * cached extent, potentially splitting it into two (if the cache was - * already full, throwing away the extent that would sort last). It - * also handles removing an extent that overlaps multiple extents in - * the cache. - * - * Inputs: - * hfsmp - mount point structure - * startBlock - starting block of the extent to be removed. - * blockCount - number of blocks of the extent to be removed. - */ -static void remove_free_extent_cache(struct hfsmount *hfsmp, u_int32_t startBlock, u_int32_t blockCount) -{ - u_int32_t i, insertedIndex; - u_int32_t currentStart, currentEnd, endBlock; - int extentsRemoved = 0; - - if (hfs_kdebug_allocation & HFSDBG_EXT_CACHE_ENABLED) - KERNEL_DEBUG_CONSTANT(HFSDBG_REMOVE_EXTENT_CACHE | DBG_FUNC_START, startBlock, blockCount, 0, 0, 0); - - endBlock = startBlock + blockCount; - - lck_spin_lock(&hfsmp->vcbFreeExtLock); - - /* - * Iterate over all of the extents in the free extent cache, removing or - * updating any entries that overlap with the input extent. - */ - for (i = 0; i < hfsmp->vcbFreeExtCnt; ++i) { - currentStart = hfsmp->vcbFreeExt[i].startBlock; - currentEnd = currentStart + hfsmp->vcbFreeExt[i].blockCount; - - /* - * If the current extent is entirely before or entirely after the - * the extent to be removed, then we keep it as-is. 
- */ - if (currentEnd <= startBlock || currentStart >= endBlock) { - continue; - } - - /* - * If the extent being removed entirely contains the current extent, - * then remove the current extent. - */ - if (startBlock <= currentStart && endBlock >= currentEnd) { - remove_free_extent_list(hfsmp, i); - - /* - * We just removed the extent at index i. The extent at - * index i+1 just got shifted to index i. So decrement i - * to undo the loop's "++i", and the next iteration will - * examine index i again, which contains the next extent - * in the list. - */ - --i; - ++extentsRemoved; - continue; - } - - /* - * If the extent being removed is strictly "in the middle" of the - * current extent, then we need to split the current extent into - * two discontiguous extents (the "head" and "tail"). The good - * news is that we don't need to examine any other extents in - * the list. - */ - if (startBlock > currentStart && endBlock < currentEnd) { - remove_free_extent_list(hfsmp, i); - add_free_extent_list(hfsmp, currentStart, startBlock - currentStart); - add_free_extent_list(hfsmp, endBlock, currentEnd - endBlock); - break; - } - - /* - * The only remaining possibility is that the extent to be removed - * overlaps the start or end (but not both!) of the current extent. - * So we need to replace the current extent with a shorter one. - * - * The only tricky part is that the updated extent might be at a - * different index than the original extent. If the updated extent - * was inserted after the current extent, then we need to re-examine - * the entry at index i, since it now contains the extent that was - * previously at index i+1. If the updated extent was inserted - * before or at the same index as the removed extent, then the - * following extents haven't changed position. - */ - remove_free_extent_list(hfsmp, i); - if (startBlock > currentStart) { - /* Remove the tail of the current extent. 
*/ - insertedIndex = add_free_extent_list(hfsmp, currentStart, startBlock - currentStart); - } else { - /* Remove the head of the current extent. */ - insertedIndex = add_free_extent_list(hfsmp, endBlock, currentEnd - endBlock); - } - if (insertedIndex > i) { - --i; /* Undo the "++i" in the loop, so we examine the entry at index i again. */ - } - } - - lck_spin_unlock(&hfsmp->vcbFreeExtLock); - - sanity_check_free_ext(hfsmp, 0); - - if (hfs_kdebug_allocation & HFSDBG_EXT_CACHE_ENABLED) - KERNEL_DEBUG_CONSTANT(HFSDBG_REMOVE_EXTENT_CACHE | DBG_FUNC_END, 0, 0, 0, extentsRemoved, 0); - - return; -} - - -/* - * Add an entry to free extent cache after it has been deallocated. - * - * This is a high-level routine. It will merge overlapping or contiguous - * extents into a single, larger extent. - * - * If the extent provided has blocks beyond current allocLimit, it is - * clipped to allocLimit (so that we won't accidentally find and allocate - * space beyond allocLimit). - * - * Inputs: - * hfsmp - mount point structure - * startBlock - starting block of the extent to be removed. - * blockCount - number of blocks of the extent to be removed. - * - * Returns: - * true - if the extent was added successfully to the list - * false - if the extent was not added to the list, maybe because - * the extent was beyond allocLimit, or is not best - * candidate to be put in the cache. 
- */ -static Boolean add_free_extent_cache(struct hfsmount *hfsmp, u_int32_t startBlock, u_int32_t blockCount) -{ - Boolean retval = false; - uint32_t endBlock; - uint32_t currentEnd; - uint32_t i; - - if (hfs_kdebug_allocation & HFSDBG_EXT_CACHE_ENABLED) - KERNEL_DEBUG_CONSTANT(HFSDBG_ADD_EXTENT_CACHE | DBG_FUNC_START, startBlock, blockCount, 0, 0, 0); - -#if DEBUG - for (i = 0; i < 2; ++i) { - struct rl_entry *range; - TAILQ_FOREACH(range, &hfsmp->hfs_reserved_ranges[i], rl_link) { - assert(rl_overlap(range, startBlock, - startBlock + blockCount - 1) == RL_NOOVERLAP); - } - } -#endif - - /* No need to add extent that is beyond current allocLimit */ - if (startBlock >= hfsmp->allocLimit) { - goto out_not_locked; - } - - /* If end of the free extent is beyond current allocLimit, clip the extent */ - if ((startBlock + blockCount) > hfsmp->allocLimit) { - blockCount = hfsmp->allocLimit - startBlock; - } - - lck_spin_lock(&hfsmp->vcbFreeExtLock); - - /* - * Make a pass through the free extent cache, looking for known extents that - * overlap or are contiguous with the extent to be added. We'll remove those - * extents from the cache, and incorporate them into the new extent to be added. - */ - endBlock = startBlock + blockCount; - for (i=0; i < hfsmp->vcbFreeExtCnt; ++i) { - currentEnd = hfsmp->vcbFreeExt[i].startBlock + hfsmp->vcbFreeExt[i].blockCount; - if (hfsmp->vcbFreeExt[i].startBlock > endBlock || currentEnd < startBlock) { - /* Extent i does not overlap and is not contiguous, so keep it. */ - continue; - } else { - /* We need to remove extent i and combine it with the input extent. */ - if (hfsmp->vcbFreeExt[i].startBlock < startBlock) - startBlock = hfsmp->vcbFreeExt[i].startBlock; - if (currentEnd > endBlock) - endBlock = currentEnd; - - remove_free_extent_list(hfsmp, i); - /* - * We just removed the extent at index i. The extent at - * index i+1 just got shifted to index i. 
So decrement i - * to undo the loop's "++i", and the next iteration will - * examine index i again, which contains the next extent - * in the list. - */ - --i; - } - } - add_free_extent_list(hfsmp, startBlock, endBlock - startBlock); - - lck_spin_unlock(&hfsmp->vcbFreeExtLock); - -out_not_locked: - sanity_check_free_ext(hfsmp, 0); - - if (hfs_kdebug_allocation & HFSDBG_EXT_CACHE_ENABLED) - KERNEL_DEBUG_CONSTANT(HFSDBG_ADD_EXTENT_CACHE | DBG_FUNC_END, 0, 0, 0, retval, 0); - - return retval; -} - -/* Debug function to check if the free extent cache is good or not */ -static void sanity_check_free_ext(struct hfsmount *hfsmp, int check_allocated) -{ - u_int32_t i, j; - - /* Do not do anything if debug is not on */ - if (ALLOC_DEBUG == 0) { - return; - } - - lck_spin_lock(&hfsmp->vcbFreeExtLock); - - if (hfsmp->vcbFreeExtCnt > kMaxFreeExtents) - panic("hfs: %p: free extent count (%u) is too large", hfsmp, hfsmp->vcbFreeExtCnt); - - /* - * Iterate the Free extent cache and ensure no entries are bogus or refer to - * allocated blocks. - */ - for(i=0; i < hfsmp->vcbFreeExtCnt; i++) { - u_int32_t start, nblocks; - - start = hfsmp->vcbFreeExt[i].startBlock; - nblocks = hfsmp->vcbFreeExt[i].blockCount; - - /* Check if any of the blocks in free extent cache are allocated. - * This should not be enabled always because it might take - * very long for large extents that get added to the list. - * - * We have to drop vcbFreeExtLock while we call hfs_isallocated - * because it is going to do I/O. Note that the free extent - * cache could change. That's a risk we take when using this - * debugging code. (Another alternative would be to try to - * detect when the free extent cache changed, and perhaps - * restart if the list changed while we dropped the lock.) 
- */ - if (check_allocated) { - lck_spin_unlock(&hfsmp->vcbFreeExtLock); - if (hfs_isallocated(hfsmp, start, nblocks)) { - panic("hfs: %p: slot %d:(%u,%u) in the free extent array is allocated\n", - hfsmp, i, start, nblocks); - } - lck_spin_lock(&hfsmp->vcbFreeExtLock); - } - - /* Check if any part of the extent is beyond allocLimit */ - if ((start > hfsmp->allocLimit) || ((start + nblocks) > hfsmp->allocLimit)) { - panic ("hfs: %p: slot %d:(%u,%u) in the free extent array is beyond allocLimit=%u\n", - hfsmp, i, start, nblocks, hfsmp->allocLimit); - } - - /* Check if there are any duplicate start blocks */ - for(j=i+1; j < hfsmp->vcbFreeExtCnt; j++) { - if (start == hfsmp->vcbFreeExt[j].startBlock) { - panic("hfs: %p: slot %d:(%u,%u) and %d:(%u,%u) are duplicate\n", - hfsmp, i, start, nblocks, j, hfsmp->vcbFreeExt[j].startBlock, - hfsmp->vcbFreeExt[j].blockCount); - } - } - - /* Check if the entries are out of order */ - if ((i+1) != hfsmp->vcbFreeExtCnt) { - if (hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) { - /* sparse devices are sorted by starting block number (ascending) */ - if (hfsmp->vcbFreeExt[i].startBlock > hfsmp->vcbFreeExt[i+1].startBlock) { - panic ("hfs: %p: SPARSE %d:(%u,%u) and %d:(%u,%u) are out of order\n", - hfsmp, i, start, nblocks, i+1, hfsmp->vcbFreeExt[i+1].startBlock, - hfsmp->vcbFreeExt[i+1].blockCount); - } - } else { - /* normally sorted by block count (descending) */ - if (hfsmp->vcbFreeExt[i].blockCount < hfsmp->vcbFreeExt[i+1].blockCount) { - panic ("hfs: %p: %d:(%u,%u) and %d:(%u,%u) are out of order\n", - hfsmp, i, start, nblocks, i+1, hfsmp->vcbFreeExt[i+1].startBlock, - hfsmp->vcbFreeExt[i+1].blockCount); - } - } - } - } - lck_spin_unlock(&hfsmp->vcbFreeExtLock); -} - -#define BIT_RIGHT_MASK(bit) (0xffffffffffffffffull >> (bit)) -#define kHighBitInDoubleWordMask 0x8000000000000000ull - -static int clzll(uint64_t x) -{ - if (x == 0) - return 64; - else - return __builtin_clzll(x); -} - -#if !HFS_ALLOC_TEST - -static errno_t 
get_more_bits(bitmap_context_t *bitmap_ctx) -{ - uint32_t start_bit; - uint32_t iosize = 0; - uint32_t byte_offset; - uint32_t last_bitmap_block; - int error; - struct hfsmount *hfsmp = bitmap_ctx->hfsmp; -#if !HFS_ALLOC_TEST - uint64_t lock_elapsed; -#endif - - - if (bitmap_ctx->bp) - ReleaseScanBitmapRange(bitmap_ctx->bp); - - if (msleep(NULL, NULL, PINOD | PCATCH, - "hfs_fsinfo", NULL) == EINTR) { - return EINTR; - } - -#if !HFS_ALLOC_TEST - /* - * Let someone else use the allocation map after we've processed over HFS_FSINFO_MAX_LOCKHELD_TIME . - * lock_start is initialized in hfs_find_free_extents(). - */ - absolutetime_to_nanoseconds(mach_absolute_time() - bitmap_ctx->lock_start, &lock_elapsed); - - if (lock_elapsed >= HFS_FSINFO_MAX_LOCKHELD_TIME) { - - hfs_systemfile_unlock(hfsmp, bitmap_ctx->lockflags); - - /* add tsleep here to force context switch and fairness */ - tsleep((caddr_t)get_more_bits, PRIBIO, "hfs_fsinfo", 1); - - hfs_journal_lock(hfsmp); - - /* Flush the journal and wait for all I/Os to finish up */ - error = hfs_flush(hfsmp, HFS_FLUSH_JOURNAL_META); - if (error) { - hfs_journal_unlock(hfsmp); - return error; - } - - /* - * Take bitmap lock to ensure it is not being modified while journal is still held. - * Since we are reading larger than normal blocks from the bitmap, which - * might confuse other parts of the bitmap code using normal blocks, we - * take exclusive lock here. - */ - bitmap_ctx->lockflags = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_EXCLUSIVE_LOCK); - - bitmap_ctx->lock_start = mach_absolute_time(); - - /* Release the journal lock */ - hfs_journal_unlock(hfsmp); - - /* - * Bitmap is read in large block size (up to 1MB), - * unlike the runtime which reads the bitmap in the - * 4K block size. If the bitmap is read by both ways - * at the same time, it can result in multiple buf_t with - * different sizes and potentially case data corruption. 
- * To avoid this, we invalidate all the existing buffers - * associated with the bitmap vnode. - */ - error = buf_invalidateblks(hfsmp->hfs_allocation_vp, 0, 0, 0); - if (error) { - /* hfs_systemfile_unlock will be called in the caller */ - return error; - } - } -#endif - - start_bit = bitmap_ctx->run_offset; - - if (start_bit >= bitmap_ctx->hfsmp->totalBlocks) { - bitmap_ctx->chunk_end = 0; - bitmap_ctx->bp = NULL; - bitmap_ctx->bitmap = NULL; - return 0; - } - - assert(start_bit % 8 == 0); - - /* - * Compute how much I/O we should generate here. - * hfs_scan_range_size will validate that the start bit - * converted into a byte offset into the bitmap file, - * is aligned on a VBMIOSize boundary. - */ - error = hfs_scan_range_size (bitmap_ctx->hfsmp, start_bit, &iosize); - if (error) - return error; - - assert(iosize != 0); - - /* hfs_scan_range_size should have verified startbit. Convert it to bytes */ - byte_offset = start_bit / kBitsPerByte; - - /* - * When the journal replays blocks, it does so by writing directly to the disk - * device (bypassing any filesystem vnodes and such). When it finishes its I/Os - * it also immediately re-reads and invalidates the range covered by the bp so - * it does not leave anything lingering in the cache (for iosize reasons). - * - * As such, it is safe to do large I/Os here with ReadBitmapRange. - * - * NOTE: It is not recommended, but it is possible to call the function below - * on sections of the bitmap that may be in core already as long as the pages are not - * dirty. In that case, we'd notice that something starting at that - * logical block of the bitmap exists in the metadata cache, and we'd check - * if the iosize requested is the same as what was already allocated for it. - * Odds are pretty good we're going to request something larger. In that case, - * we just free the existing memory associated with the buf and reallocate a - * larger range. 
This function should immediately invalidate it as soon as we're - * done scanning, so this shouldn't cause any coherency issues. - */ - error = ReadBitmapRange(bitmap_ctx->hfsmp, byte_offset, iosize, (uint32_t **)&bitmap_ctx->bitmap, &bitmap_ctx->bp); - if (error) - return error; - - /* - * At this point, we have a giant wired buffer that represents some portion of - * the bitmap file that we want to analyze. We may not have gotten all 'iosize' - * bytes though, so clip our ending bit to what we actually read in. - */ - last_bitmap_block = start_bit + buf_count(bitmap_ctx->bp) * kBitsPerByte; - - /* Cap the last block to the total number of blocks if required */ - if (last_bitmap_block > bitmap_ctx->hfsmp->totalBlocks) - last_bitmap_block = bitmap_ctx->hfsmp->totalBlocks; - - bitmap_ctx->chunk_current = 0; // new chunk of bitmap - bitmap_ctx->chunk_end = last_bitmap_block - start_bit; - - return 0; -} - -#endif // !HFS_ALLOC_TEST - -// Returns number of contiguous bits set at start -static int bit_count_set(void *bitmap, int start, int end) -{ - if (start == end) - return 0; - - assert(end > start); - - const int start_bit = start & 63; - const int end_bit = end & 63; - - uint64_t *p = (uint64_t *)bitmap + start / 64; - uint64_t x = ~OSSwapBigToHostInt64(*p); - - if ((start & ~63) == (end & ~63)) { - // Start and end in same 64 bits - x = (x & BIT_RIGHT_MASK(start_bit)) | BIT_RIGHT_MASK(end_bit); - return clzll(x) - start_bit; - } - - // Deal with initial unaligned bit - x &= BIT_RIGHT_MASK(start_bit); - - if (x) - return clzll(x) - start_bit; - - // Go fast - ++p; - int count = 64 - start_bit; - int nquads = (end - end_bit - start - 1) / 64; - - while (nquads--) { - if (*p != 0xffffffffffffffffull) { - x = ~OSSwapBigToHostInt64(*p); - return count + clzll(x); - } - ++p; - count += 64; - } - - if (end_bit) { - x = ~OSSwapBigToHostInt64(*p) | BIT_RIGHT_MASK(end_bit); - count += clzll(x); - } - - return count; -} - -/* Returns the number of a run of cleared bits: - * 
bitmap is a single chunk of memory being examined - * start: the start bit relative to the current buffer to be examined; start is inclusive. - * end: the end bit relative to the current buffer to be examined; end is not inclusive. - */ -static int bit_count_clr(void *bitmap, int start, int end) -{ - if (start == end) - return 0; - - assert(end > start); - - const int start_bit = start & 63; - const int end_bit = end & 63; - - uint64_t *p = (uint64_t *)bitmap + start / 64; - uint64_t x = OSSwapBigToHostInt64(*p); - - if ((start & ~63) == (end & ~63)) { - // Start and end in same 64 bits - x = (x & BIT_RIGHT_MASK(start_bit)) | BIT_RIGHT_MASK(end_bit); - - return clzll(x) - start_bit; - } - - // Deal with initial unaligned bit - x &= BIT_RIGHT_MASK(start_bit); - - if (x) - return clzll(x) - start_bit; - - // Go fast - ++p; - int count = 64 - start_bit; - int nquads = (end - end_bit - start - 1) / 64; - - while (nquads--) { - if (*p) { - x = OSSwapBigToHostInt64(*p); - return count + clzll(x); - } - ++p; - count += 64; - } - - if (end_bit) { - x = OSSwapBigToHostInt64(*p) | BIT_RIGHT_MASK(end_bit); - - count += clzll(x); - } - - return count; -} - -#if !HFS_ALLOC_TEST -static errno_t update_summary_table(bitmap_context_t *bitmap_ctx, uint32_t start, uint32_t count, bool set) -{ - uint32_t end, start_summary_bit, end_summary_bit; - errno_t error = 0; - - if (count == 0) - goto out; - - if (!ISSET(bitmap_ctx->hfsmp->hfs_flags, HFS_SUMMARY_TABLE)) - return 0; - - if (hfs_get_summary_index (bitmap_ctx->hfsmp, start, &start_summary_bit)) { - error = EINVAL; - goto out; - } - - end = start + count - 1; - if (hfs_get_summary_index (bitmap_ctx->hfsmp, end, &end_summary_bit)) { - error = EINVAL; - goto out; - } - - // if summary table bit has been updated with free block previously, leave it. 
- if ((start_summary_bit == bitmap_ctx->last_free_summary_bit) && set) - start_summary_bit++; - - for (uint32_t summary_bit = start_summary_bit; summary_bit <= end_summary_bit; summary_bit++) - hfs_set_summary (bitmap_ctx->hfsmp, summary_bit, set); - - if (!set) - bitmap_ctx->last_free_summary_bit = end_summary_bit; - -out: - return error; - -} -#endif //!HFS_ALLOC_TEST - -/* - * Read in chunks of the bitmap into memory, and find a run of cleared/set bits; - * the run can extend across chunk boundaries. - * bit_count_clr can be passed to get a run of cleared bits. - * bit_count_set can be passed to get a run of set bits. - */ -static errno_t hfs_bit_count(bitmap_context_t *bitmap_ctx, int (*fn)(void *, int ,int), uint32_t *bit_count) -{ - int count; - errno_t error = 0; - - *bit_count = 0; - - do { - if (bitmap_ctx->run_offset == 0 || bitmap_ctx->chunk_current == bitmap_ctx->chunk_end) { - if ((error = get_more_bits(bitmap_ctx)) != 0) - goto out; - } - - if (bitmap_ctx->chunk_end == 0) - break; - - count = fn(bitmap_ctx->bitmap, bitmap_ctx->chunk_current, bitmap_ctx->chunk_end); - - bitmap_ctx->run_offset += count; - bitmap_ctx->chunk_current += count; - *bit_count += count; - - } while (bitmap_ctx->chunk_current >= bitmap_ctx->chunk_end && count); - -out: - return error; - -} - -// Returns count of number of bits clear -static errno_t hfs_bit_count_clr(bitmap_context_t *bitmap_ctx, uint32_t *count) -{ - return hfs_bit_count(bitmap_ctx, bit_count_clr, count); -} - -// Returns count of number of bits set -static errno_t hfs_bit_count_set(bitmap_context_t *bitmap_ctx, uint32_t *count) -{ - return hfs_bit_count(bitmap_ctx, bit_count_set, count); -} - -static uint32_t hfs_bit_offset(bitmap_context_t *bitmap_ctx) -{ - return bitmap_ctx->run_offset; -} - -/* - * Perform a full scan of the bitmap file. - * Note: during the scan of bitmap file, it may drop and reacquire the - * bitmap lock to let someone else use the bitmap for fairness. 
- * Currently it is used by HFS_GET_FSINFO statistic gathing, which - * is run while other processes might perform HFS operations. - */ - -errno_t hfs_find_free_extents(struct hfsmount *hfsmp, - void (*callback)(void *data, off_t free_extent_size), void *callback_arg) -{ - struct bitmap_context bitmap_ctx; - uint32_t count; - errno_t error = 0; - - if ((hfsmp->hfs_flags & HFS_SUMMARY_TABLE) == 0) { - error = hfs_init_summary(hfsmp); - if (error) - return error; - } - - bzero(&bitmap_ctx, sizeof(struct bitmap_context)); - - /* - * The journal maintains list of recently deallocated blocks to - * issue DKIOCUNMAPs when the corresponding journal transaction is - * flushed to the disk. To avoid any race conditions, we only - * want one active trim list. Therefore we make sure that the - * journal trim list is sync'ed, empty, and not modifiable for - * the duration of our scan. - * - * Take the journal lock before flushing the journal to the disk. - * We will keep on holding the journal lock till we don't get the - * bitmap lock to make sure that no new journal transactions can - * start. This will make sure that the journal trim list is not - * modified after the journal flush and before getting bitmap lock. - * We can release the journal lock after we acquire the bitmap - * lock as it will prevent any further block deallocations. - */ - hfs_journal_lock(hfsmp); - - /* Flush the journal and wait for all I/Os to finish up */ - error = hfs_flush(hfsmp, HFS_FLUSH_JOURNAL_META); - if (error) { - hfs_journal_unlock(hfsmp); - return error; - } - - /* - * Take bitmap lock to ensure it is not being modified. - * Since we are reading larger than normal blocks from the bitmap, which - * might confuse other parts of the bitmap code using normal blocks, we - * take exclusive lock here. 
- */ - bitmap_ctx.lockflags = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_EXCLUSIVE_LOCK); - -#if !HFS_ALLOC_TEST - bitmap_ctx.lock_start = mach_absolute_time(); -#endif - - /* Release the journal lock */ - hfs_journal_unlock(hfsmp); - - /* - * Bitmap is read in large block size (up to 1MB), - * unlike the runtime which reads the bitmap in the - * 4K block size. If the bitmap is read by both ways - * at the same time, it can result in multiple buf_t with - * different sizes and potentially case data corruption. - * To avoid this, we invalidate all the existing buffers - * associated with the bitmap vnode. - */ - error = buf_invalidateblks(hfsmp->hfs_allocation_vp, 0, 0, 0); - if (error) - goto out; - - /* - * Get the list of all free extent ranges. hfs_alloc_scan_range() - * will call hfs_fsinfo_data_add() to account for all the free - * extent ranges found during scan. - */ - bitmap_ctx.hfsmp = hfsmp; - bitmap_ctx.run_offset = 0; - - while (bitmap_ctx.run_offset < hfsmp->totalBlocks) { - - uint32_t start = hfs_bit_offset(&bitmap_ctx); - - if ((error = hfs_bit_count_clr(&bitmap_ctx, &count)) != 0) - goto out; - - if (count) - callback(callback_arg, hfs_blk_to_bytes(count, hfsmp->blockSize)); - - if ((error = update_summary_table(&bitmap_ctx, start, count, false)) != 0) - goto out; - - start = hfs_bit_offset(&bitmap_ctx); - - if ((error = hfs_bit_count_set(&bitmap_ctx, &count)) != 0) - goto out; - - if ((error = update_summary_table(&bitmap_ctx, start, count, true)) != 0) - goto out; - } - -out: - if (bitmap_ctx.lockflags) { - hfs_systemfile_unlock(hfsmp, bitmap_ctx.lockflags); - } - - return error; -} - diff --git a/bsd/hfs/hfscommon/Unicode/UCStringCompareData.h b/bsd/hfs/hfscommon/Unicode/UCStringCompareData.h deleted file mode 100644 index c00d15117..000000000 --- a/bsd/hfs/hfscommon/Unicode/UCStringCompareData.h +++ /dev/null @@ -1,329 +0,0 @@ -/* - * Copyright (c) 2000-2002, 2005 Apple Computer, Inc. All rights reserved. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - File: UCStringCompareData.h - - Contains: xxx put contents here xxx - - Version: HFS Plus 1.0 - - Copyright: � 1997-1999 by Apple Computer, Inc., all rights reserved. - - File Ownership: - - DRI: Mark Day - - Other Contact: xxx put other contact here xxx - - Technology: xxx put technology here xxx - - Writers: - - (djb) Don Brady - (msd) Mark Day - - Change History (most recent first): - - 11/16/97 djb msd. Updated lower case table with ignorable mappings and less - aggressive case folding. Added a trailing comma to make the - StreamEdit script work right. Removed Unicode decomposition - tables. 
Case folding tables convert u+0000 to 0xFFFF so that the - NUL character can appear in names, while still allowing a zero - value to be a sentinel. (From Andy Daniels, 11/10/97) - 8/26/97 djb Tweak gLowerCaseTable to make it faster. - 8/14/97 djb Add RelString compare table... - 4/24/97 djb first checked in - 2/27/97 msd first checked in -*/ - -#ifndef _UCSTRINGCOMPAREDATA_ -#define _UCSTRINGCOMPAREDATA_ - -#include - -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE -/* - * For better performance, the case folding table for basic latin - * is seperate from the others. This eliminates the extra lookup - * to get the offset to this table. - * - * Note: 0x0000 now maps to 0 so that it will be ignored - */ -u_int16_t gLatinCaseFold[] = { - /* 0 */ 0xFFFF, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x000E, 0x000F, - /* 1 */ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x001A, 0x001B, 0x001C, 0x001D, 0x001E, 0x001F, - /* 2 */ 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F, - /* 3 */ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F, - /* 4 */ 0x0040, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F, - /* 5 */ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007A, 0x005B, 0x005C, 0x005D, 0x005E, 0x005F, - /* 6 */ 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F, - /* 7 */ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D, 0x007E, 0x007F, - /* 8 */ 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088, 0x0089, 0x008A, 0x008B, 0x008C, 0x008D, 0x008E, 0x008F, - /* 9 */ 0x0090, 
0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009A, 0x009B, 0x009C, 0x009D, 0x009E, 0x009F, - /* A */ 0x00A0, 0x00A1, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7, 0x00A8, 0x00A9, 0x00AA, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00AF, - /* B */ 0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7, 0x00B8, 0x00B9, 0x00BA, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0x00BF, - /* C */ 0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00E6, 0x00C7, 0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF, - /* D */ 0x00F0, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x00D7, 0x00F8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x00DD, 0x00FE, 0x00DF, - /* E */ 0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7, 0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF, - /* F */ 0x00F0, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x00F7, 0x00F8, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x00FD, 0x00FE, 0x00FF, -}; - -/* The lower case table consists of a 256-entry high-byte table followed by some number of - 256-entry subtables. The high-byte table contains either an offset to the subtable for - characters with that high byte or zero, which means that there are no case mappings or - ignored characters in that block. Ignored characters are mapped to zero. 
- */ - -u_int16_t gLowerCaseTable[] = { - - /* High-byte indices ( == 0 iff no case mapping and no ignorables ) */ - - /* 0 */ 0x0000, 0x0100, 0x0000, 0x0200, 0x0300, 0x0400, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - /* 1 */ 0x0500, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - /* 2 */ 0x0600, 0x0700, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - /* 3 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - /* 4 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - /* 5 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - /* 6 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - /* 7 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - /* 8 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - /* 9 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - /* A */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - /* B */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - /* C */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - /* D */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - /* E */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - /* F */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0800, 0x0900, - - /* Table 1 (for high byte 0x01) */ - - /* 0 */ 0x0100, 0x0101, 0x0102, 0x0103, 0x0104, 0x0105, 0x0106, 0x0107, 0x0108, 0x0109, 0x010A, 0x010B, 0x010C, 0x010D, 0x010E, 0x010F, - /* 1 */ 0x0111, 0x0111, 0x0112, 0x0113, 0x0114, 0x0115, 0x0116, 0x0117, 0x0118, 0x0119, 0x011A, 0x011B, 0x011C, 0x011D, 0x011E, 0x011F, - /* 2 */ 0x0120, 0x0121, 0x0122, 0x0123, 0x0124, 0x0125, 0x0127, 0x0127, 0x0128, 0x0129, 0x012A, 0x012B, 0x012C, 0x012D, 0x012E, 0x012F, - /* 3 */ 0x0130, 0x0131, 0x0133, 0x0133, 0x0134, 0x0135, 0x0136, 0x0137, 0x0138, 0x0139, 0x013A, 0x013B, 0x013C, 0x013D, 0x013E, 0x0140, - /* 4 */ 0x0140, 0x0142, 0x0142, 0x0143, 0x0144, 0x0145, 0x0146, 0x0147, 0x0148, 0x0149, 0x014B, 0x014B, 0x014C, 0x014D, 0x014E, 0x014F, - /* 5 */ 0x0150, 0x0151, 0x0153, 0x0153, 0x0154, 0x0155, 0x0156, 0x0157, 0x0158, 0x0159, 0x015A, 0x015B, 0x015C, 0x015D, 0x015E, 0x015F, - /* 6 */ 0x0160, 0x0161, 0x0162, 0x0163, 0x0164, 0x0165, 0x0167, 0x0167, 0x0168, 0x0169, 0x016A, 0x016B, 0x016C, 0x016D, 0x016E, 0x016F, - /* 7 */ 0x0170, 0x0171, 0x0172, 0x0173, 0x0174, 0x0175, 0x0176, 0x0177, 0x0178, 0x0179, 0x017A, 0x017B, 0x017C, 0x017D, 0x017E, 0x017F, - /* 8 */ 0x0180, 0x0253, 0x0183, 0x0183, 0x0185, 0x0185, 0x0254, 0x0188, 0x0188, 0x0256, 0x0257, 0x018C, 0x018C, 0x018D, 0x01DD, 0x0259, - /* 9 */ 0x025B, 0x0192, 0x0192, 0x0260, 0x0263, 0x0195, 0x0269, 0x0268, 0x0199, 0x0199, 0x019A, 0x019B, 0x026F, 0x0272, 0x019E, 0x0275, - /* A */ 0x01A0, 0x01A1, 0x01A3, 0x01A3, 0x01A5, 0x01A5, 0x01A6, 0x01A8, 0x01A8, 0x0283, 0x01AA, 0x01AB, 0x01AD, 0x01AD, 0x0288, 0x01AF, - /* B */ 0x01B0, 0x028A, 0x028B, 0x01B4, 0x01B4, 0x01B6, 0x01B6, 0x0292, 0x01B9, 0x01B9, 0x01BA, 0x01BB, 
0x01BD, 0x01BD, 0x01BE, 0x01BF, - /* C */ 0x01C0, 0x01C1, 0x01C2, 0x01C3, 0x01C6, 0x01C6, 0x01C6, 0x01C9, 0x01C9, 0x01C9, 0x01CC, 0x01CC, 0x01CC, 0x01CD, 0x01CE, 0x01CF, - /* D */ 0x01D0, 0x01D1, 0x01D2, 0x01D3, 0x01D4, 0x01D5, 0x01D6, 0x01D7, 0x01D8, 0x01D9, 0x01DA, 0x01DB, 0x01DC, 0x01DD, 0x01DE, 0x01DF, - /* E */ 0x01E0, 0x01E1, 0x01E2, 0x01E3, 0x01E5, 0x01E5, 0x01E6, 0x01E7, 0x01E8, 0x01E9, 0x01EA, 0x01EB, 0x01EC, 0x01ED, 0x01EE, 0x01EF, - /* F */ 0x01F0, 0x01F3, 0x01F3, 0x01F3, 0x01F4, 0x01F5, 0x01F6, 0x01F7, 0x01F8, 0x01F9, 0x01FA, 0x01FB, 0x01FC, 0x01FD, 0x01FE, 0x01FF, - - /* Table 2 (for high byte 0x03) */ - - /* 0 */ 0x0300, 0x0301, 0x0302, 0x0303, 0x0304, 0x0305, 0x0306, 0x0307, 0x0308, 0x0309, 0x030A, 0x030B, 0x030C, 0x030D, 0x030E, 0x030F, - /* 1 */ 0x0310, 0x0311, 0x0312, 0x0313, 0x0314, 0x0315, 0x0316, 0x0317, 0x0318, 0x0319, 0x031A, 0x031B, 0x031C, 0x031D, 0x031E, 0x031F, - /* 2 */ 0x0320, 0x0321, 0x0322, 0x0323, 0x0324, 0x0325, 0x0326, 0x0327, 0x0328, 0x0329, 0x032A, 0x032B, 0x032C, 0x032D, 0x032E, 0x032F, - /* 3 */ 0x0330, 0x0331, 0x0332, 0x0333, 0x0334, 0x0335, 0x0336, 0x0337, 0x0338, 0x0339, 0x033A, 0x033B, 0x033C, 0x033D, 0x033E, 0x033F, - /* 4 */ 0x0340, 0x0341, 0x0342, 0x0343, 0x0344, 0x0345, 0x0346, 0x0347, 0x0348, 0x0349, 0x034A, 0x034B, 0x034C, 0x034D, 0x034E, 0x034F, - /* 5 */ 0x0350, 0x0351, 0x0352, 0x0353, 0x0354, 0x0355, 0x0356, 0x0357, 0x0358, 0x0359, 0x035A, 0x035B, 0x035C, 0x035D, 0x035E, 0x035F, - /* 6 */ 0x0360, 0x0361, 0x0362, 0x0363, 0x0364, 0x0365, 0x0366, 0x0367, 0x0368, 0x0369, 0x036A, 0x036B, 0x036C, 0x036D, 0x036E, 0x036F, - /* 7 */ 0x0370, 0x0371, 0x0372, 0x0373, 0x0374, 0x0375, 0x0376, 0x0377, 0x0378, 0x0379, 0x037A, 0x037B, 0x037C, 0x037D, 0x037E, 0x037F, - /* 8 */ 0x0380, 0x0381, 0x0382, 0x0383, 0x0384, 0x0385, 0x0386, 0x0387, 0x0388, 0x0389, 0x038A, 0x038B, 0x038C, 0x038D, 0x038E, 0x038F, - /* 9 */ 0x0390, 0x03B1, 0x03B2, 0x03B3, 0x03B4, 0x03B5, 0x03B6, 0x03B7, 0x03B8, 0x03B9, 0x03BA, 0x03BB, 0x03BC, 0x03BD, 0x03BE, 
0x03BF, - /* A */ 0x03C0, 0x03C1, 0x03A2, 0x03C3, 0x03C4, 0x03C5, 0x03C6, 0x03C7, 0x03C8, 0x03C9, 0x03AA, 0x03AB, 0x03AC, 0x03AD, 0x03AE, 0x03AF, - /* B */ 0x03B0, 0x03B1, 0x03B2, 0x03B3, 0x03B4, 0x03B5, 0x03B6, 0x03B7, 0x03B8, 0x03B9, 0x03BA, 0x03BB, 0x03BC, 0x03BD, 0x03BE, 0x03BF, - /* C */ 0x03C0, 0x03C1, 0x03C2, 0x03C3, 0x03C4, 0x03C5, 0x03C6, 0x03C7, 0x03C8, 0x03C9, 0x03CA, 0x03CB, 0x03CC, 0x03CD, 0x03CE, 0x03CF, - /* D */ 0x03D0, 0x03D1, 0x03D2, 0x03D3, 0x03D4, 0x03D5, 0x03D6, 0x03D7, 0x03D8, 0x03D9, 0x03DA, 0x03DB, 0x03DC, 0x03DD, 0x03DE, 0x03DF, - /* E */ 0x03E0, 0x03E1, 0x03E3, 0x03E3, 0x03E5, 0x03E5, 0x03E7, 0x03E7, 0x03E9, 0x03E9, 0x03EB, 0x03EB, 0x03ED, 0x03ED, 0x03EF, 0x03EF, - /* F */ 0x03F0, 0x03F1, 0x03F2, 0x03F3, 0x03F4, 0x03F5, 0x03F6, 0x03F7, 0x03F8, 0x03F9, 0x03FA, 0x03FB, 0x03FC, 0x03FD, 0x03FE, 0x03FF, - - /* Table 3 (for high byte 0x04) */ - - /* 0 */ 0x0400, 0x0401, 0x0452, 0x0403, 0x0454, 0x0455, 0x0456, 0x0407, 0x0458, 0x0459, 0x045A, 0x045B, 0x040C, 0x040D, 0x040E, 0x045F, - /* 1 */ 0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437, 0x0438, 0x0419, 0x043A, 0x043B, 0x043C, 0x043D, 0x043E, 0x043F, - /* 2 */ 0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447, 0x0448, 0x0449, 0x044A, 0x044B, 0x044C, 0x044D, 0x044E, 0x044F, - /* 3 */ 0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437, 0x0438, 0x0439, 0x043A, 0x043B, 0x043C, 0x043D, 0x043E, 0x043F, - /* 4 */ 0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447, 0x0448, 0x0449, 0x044A, 0x044B, 0x044C, 0x044D, 0x044E, 0x044F, - /* 5 */ 0x0450, 0x0451, 0x0452, 0x0453, 0x0454, 0x0455, 0x0456, 0x0457, 0x0458, 0x0459, 0x045A, 0x045B, 0x045C, 0x045D, 0x045E, 0x045F, - /* 6 */ 0x0461, 0x0461, 0x0463, 0x0463, 0x0465, 0x0465, 0x0467, 0x0467, 0x0469, 0x0469, 0x046B, 0x046B, 0x046D, 0x046D, 0x046F, 0x046F, - /* 7 */ 0x0471, 0x0471, 0x0473, 0x0473, 0x0475, 0x0475, 0x0476, 0x0477, 0x0479, 0x0479, 0x047B, 0x047B, 0x047D, 0x047D, 0x047F, 0x047F, - /* 8 */ 0x0481, 
0x0481, 0x0482, 0x0483, 0x0484, 0x0485, 0x0486, 0x0487, 0x0488, 0x0489, 0x048A, 0x048B, 0x048C, 0x048D, 0x048E, 0x048F, - /* 9 */ 0x0491, 0x0491, 0x0493, 0x0493, 0x0495, 0x0495, 0x0497, 0x0497, 0x0499, 0x0499, 0x049B, 0x049B, 0x049D, 0x049D, 0x049F, 0x049F, - /* A */ 0x04A1, 0x04A1, 0x04A3, 0x04A3, 0x04A5, 0x04A5, 0x04A7, 0x04A7, 0x04A9, 0x04A9, 0x04AB, 0x04AB, 0x04AD, 0x04AD, 0x04AF, 0x04AF, - /* B */ 0x04B1, 0x04B1, 0x04B3, 0x04B3, 0x04B5, 0x04B5, 0x04B7, 0x04B7, 0x04B9, 0x04B9, 0x04BB, 0x04BB, 0x04BD, 0x04BD, 0x04BF, 0x04BF, - /* C */ 0x04C0, 0x04C1, 0x04C2, 0x04C4, 0x04C4, 0x04C5, 0x04C6, 0x04C8, 0x04C8, 0x04C9, 0x04CA, 0x04CC, 0x04CC, 0x04CD, 0x04CE, 0x04CF, - /* D */ 0x04D0, 0x04D1, 0x04D2, 0x04D3, 0x04D4, 0x04D5, 0x04D6, 0x04D7, 0x04D8, 0x04D9, 0x04DA, 0x04DB, 0x04DC, 0x04DD, 0x04DE, 0x04DF, - /* E */ 0x04E0, 0x04E1, 0x04E2, 0x04E3, 0x04E4, 0x04E5, 0x04E6, 0x04E7, 0x04E8, 0x04E9, 0x04EA, 0x04EB, 0x04EC, 0x04ED, 0x04EE, 0x04EF, - /* F */ 0x04F0, 0x04F1, 0x04F2, 0x04F3, 0x04F4, 0x04F5, 0x04F6, 0x04F7, 0x04F8, 0x04F9, 0x04FA, 0x04FB, 0x04FC, 0x04FD, 0x04FE, 0x04FF, - - /* Table 4 (for high byte 0x05) */ - - /* 0 */ 0x0500, 0x0501, 0x0502, 0x0503, 0x0504, 0x0505, 0x0506, 0x0507, 0x0508, 0x0509, 0x050A, 0x050B, 0x050C, 0x050D, 0x050E, 0x050F, - /* 1 */ 0x0510, 0x0511, 0x0512, 0x0513, 0x0514, 0x0515, 0x0516, 0x0517, 0x0518, 0x0519, 0x051A, 0x051B, 0x051C, 0x051D, 0x051E, 0x051F, - /* 2 */ 0x0520, 0x0521, 0x0522, 0x0523, 0x0524, 0x0525, 0x0526, 0x0527, 0x0528, 0x0529, 0x052A, 0x052B, 0x052C, 0x052D, 0x052E, 0x052F, - /* 3 */ 0x0530, 0x0561, 0x0562, 0x0563, 0x0564, 0x0565, 0x0566, 0x0567, 0x0568, 0x0569, 0x056A, 0x056B, 0x056C, 0x056D, 0x056E, 0x056F, - /* 4 */ 0x0570, 0x0571, 0x0572, 0x0573, 0x0574, 0x0575, 0x0576, 0x0577, 0x0578, 0x0579, 0x057A, 0x057B, 0x057C, 0x057D, 0x057E, 0x057F, - /* 5 */ 0x0580, 0x0581, 0x0582, 0x0583, 0x0584, 0x0585, 0x0586, 0x0557, 0x0558, 0x0559, 0x055A, 0x055B, 0x055C, 0x055D, 0x055E, 0x055F, - /* 6 */ 0x0560, 0x0561, 0x0562, 0x0563, 
0x0564, 0x0565, 0x0566, 0x0567, 0x0568, 0x0569, 0x056A, 0x056B, 0x056C, 0x056D, 0x056E, 0x056F, - /* 7 */ 0x0570, 0x0571, 0x0572, 0x0573, 0x0574, 0x0575, 0x0576, 0x0577, 0x0578, 0x0579, 0x057A, 0x057B, 0x057C, 0x057D, 0x057E, 0x057F, - /* 8 */ 0x0580, 0x0581, 0x0582, 0x0583, 0x0584, 0x0585, 0x0586, 0x0587, 0x0588, 0x0589, 0x058A, 0x058B, 0x058C, 0x058D, 0x058E, 0x058F, - /* 9 */ 0x0590, 0x0591, 0x0592, 0x0593, 0x0594, 0x0595, 0x0596, 0x0597, 0x0598, 0x0599, 0x059A, 0x059B, 0x059C, 0x059D, 0x059E, 0x059F, - /* A */ 0x05A0, 0x05A1, 0x05A2, 0x05A3, 0x05A4, 0x05A5, 0x05A6, 0x05A7, 0x05A8, 0x05A9, 0x05AA, 0x05AB, 0x05AC, 0x05AD, 0x05AE, 0x05AF, - /* B */ 0x05B0, 0x05B1, 0x05B2, 0x05B3, 0x05B4, 0x05B5, 0x05B6, 0x05B7, 0x05B8, 0x05B9, 0x05BA, 0x05BB, 0x05BC, 0x05BD, 0x05BE, 0x05BF, - /* C */ 0x05C0, 0x05C1, 0x05C2, 0x05C3, 0x05C4, 0x05C5, 0x05C6, 0x05C7, 0x05C8, 0x05C9, 0x05CA, 0x05CB, 0x05CC, 0x05CD, 0x05CE, 0x05CF, - /* D */ 0x05D0, 0x05D1, 0x05D2, 0x05D3, 0x05D4, 0x05D5, 0x05D6, 0x05D7, 0x05D8, 0x05D9, 0x05DA, 0x05DB, 0x05DC, 0x05DD, 0x05DE, 0x05DF, - /* E */ 0x05E0, 0x05E1, 0x05E2, 0x05E3, 0x05E4, 0x05E5, 0x05E6, 0x05E7, 0x05E8, 0x05E9, 0x05EA, 0x05EB, 0x05EC, 0x05ED, 0x05EE, 0x05EF, - /* F */ 0x05F0, 0x05F1, 0x05F2, 0x05F3, 0x05F4, 0x05F5, 0x05F6, 0x05F7, 0x05F8, 0x05F9, 0x05FA, 0x05FB, 0x05FC, 0x05FD, 0x05FE, 0x05FF, - - /* Table 5 (for high byte 0x10) */ - - /* 0 */ 0x1000, 0x1001, 0x1002, 0x1003, 0x1004, 0x1005, 0x1006, 0x1007, 0x1008, 0x1009, 0x100A, 0x100B, 0x100C, 0x100D, 0x100E, 0x100F, - /* 1 */ 0x1010, 0x1011, 0x1012, 0x1013, 0x1014, 0x1015, 0x1016, 0x1017, 0x1018, 0x1019, 0x101A, 0x101B, 0x101C, 0x101D, 0x101E, 0x101F, - /* 2 */ 0x1020, 0x1021, 0x1022, 0x1023, 0x1024, 0x1025, 0x1026, 0x1027, 0x1028, 0x1029, 0x102A, 0x102B, 0x102C, 0x102D, 0x102E, 0x102F, - /* 3 */ 0x1030, 0x1031, 0x1032, 0x1033, 0x1034, 0x1035, 0x1036, 0x1037, 0x1038, 0x1039, 0x103A, 0x103B, 0x103C, 0x103D, 0x103E, 0x103F, - /* 4 */ 0x1040, 0x1041, 0x1042, 0x1043, 0x1044, 0x1045, 0x1046, 
0x1047, 0x1048, 0x1049, 0x104A, 0x104B, 0x104C, 0x104D, 0x104E, 0x104F, - /* 5 */ 0x1050, 0x1051, 0x1052, 0x1053, 0x1054, 0x1055, 0x1056, 0x1057, 0x1058, 0x1059, 0x105A, 0x105B, 0x105C, 0x105D, 0x105E, 0x105F, - /* 6 */ 0x1060, 0x1061, 0x1062, 0x1063, 0x1064, 0x1065, 0x1066, 0x1067, 0x1068, 0x1069, 0x106A, 0x106B, 0x106C, 0x106D, 0x106E, 0x106F, - /* 7 */ 0x1070, 0x1071, 0x1072, 0x1073, 0x1074, 0x1075, 0x1076, 0x1077, 0x1078, 0x1079, 0x107A, 0x107B, 0x107C, 0x107D, 0x107E, 0x107F, - /* 8 */ 0x1080, 0x1081, 0x1082, 0x1083, 0x1084, 0x1085, 0x1086, 0x1087, 0x1088, 0x1089, 0x108A, 0x108B, 0x108C, 0x108D, 0x108E, 0x108F, - /* 9 */ 0x1090, 0x1091, 0x1092, 0x1093, 0x1094, 0x1095, 0x1096, 0x1097, 0x1098, 0x1099, 0x109A, 0x109B, 0x109C, 0x109D, 0x109E, 0x109F, - /* A */ 0x10D0, 0x10D1, 0x10D2, 0x10D3, 0x10D4, 0x10D5, 0x10D6, 0x10D7, 0x10D8, 0x10D9, 0x10DA, 0x10DB, 0x10DC, 0x10DD, 0x10DE, 0x10DF, - /* B */ 0x10E0, 0x10E1, 0x10E2, 0x10E3, 0x10E4, 0x10E5, 0x10E6, 0x10E7, 0x10E8, 0x10E9, 0x10EA, 0x10EB, 0x10EC, 0x10ED, 0x10EE, 0x10EF, - /* C */ 0x10F0, 0x10F1, 0x10F2, 0x10F3, 0x10F4, 0x10F5, 0x10C6, 0x10C7, 0x10C8, 0x10C9, 0x10CA, 0x10CB, 0x10CC, 0x10CD, 0x10CE, 0x10CF, - /* D */ 0x10D0, 0x10D1, 0x10D2, 0x10D3, 0x10D4, 0x10D5, 0x10D6, 0x10D7, 0x10D8, 0x10D9, 0x10DA, 0x10DB, 0x10DC, 0x10DD, 0x10DE, 0x10DF, - /* E */ 0x10E0, 0x10E1, 0x10E2, 0x10E3, 0x10E4, 0x10E5, 0x10E6, 0x10E7, 0x10E8, 0x10E9, 0x10EA, 0x10EB, 0x10EC, 0x10ED, 0x10EE, 0x10EF, - /* F */ 0x10F0, 0x10F1, 0x10F2, 0x10F3, 0x10F4, 0x10F5, 0x10F6, 0x10F7, 0x10F8, 0x10F9, 0x10FA, 0x10FB, 0x10FC, 0x10FD, 0x10FE, 0x10FF, - - /* Table 6 (for high byte 0x20) */ - - /* 0 */ 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x200B, 0x0000, 0x0000, 0x0000, 0x0000, - /* 1 */ 0x2010, 0x2011, 0x2012, 0x2013, 0x2014, 0x2015, 0x2016, 0x2017, 0x2018, 0x2019, 0x201A, 0x201B, 0x201C, 0x201D, 0x201E, 0x201F, - /* 2 */ 0x2020, 0x2021, 0x2022, 0x2023, 0x2024, 0x2025, 0x2026, 0x2027, 0x2028, 0x2029, 
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x202F, - /* 3 */ 0x2030, 0x2031, 0x2032, 0x2033, 0x2034, 0x2035, 0x2036, 0x2037, 0x2038, 0x2039, 0x203A, 0x203B, 0x203C, 0x203D, 0x203E, 0x203F, - /* 4 */ 0x2040, 0x2041, 0x2042, 0x2043, 0x2044, 0x2045, 0x2046, 0x2047, 0x2048, 0x2049, 0x204A, 0x204B, 0x204C, 0x204D, 0x204E, 0x204F, - /* 5 */ 0x2050, 0x2051, 0x2052, 0x2053, 0x2054, 0x2055, 0x2056, 0x2057, 0x2058, 0x2059, 0x205A, 0x205B, 0x205C, 0x205D, 0x205E, 0x205F, - /* 6 */ 0x2060, 0x2061, 0x2062, 0x2063, 0x2064, 0x2065, 0x2066, 0x2067, 0x2068, 0x2069, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - /* 7 */ 0x2070, 0x2071, 0x2072, 0x2073, 0x2074, 0x2075, 0x2076, 0x2077, 0x2078, 0x2079, 0x207A, 0x207B, 0x207C, 0x207D, 0x207E, 0x207F, - /* 8 */ 0x2080, 0x2081, 0x2082, 0x2083, 0x2084, 0x2085, 0x2086, 0x2087, 0x2088, 0x2089, 0x208A, 0x208B, 0x208C, 0x208D, 0x208E, 0x208F, - /* 9 */ 0x2090, 0x2091, 0x2092, 0x2093, 0x2094, 0x2095, 0x2096, 0x2097, 0x2098, 0x2099, 0x209A, 0x209B, 0x209C, 0x209D, 0x209E, 0x209F, - /* A */ 0x20A0, 0x20A1, 0x20A2, 0x20A3, 0x20A4, 0x20A5, 0x20A6, 0x20A7, 0x20A8, 0x20A9, 0x20AA, 0x20AB, 0x20AC, 0x20AD, 0x20AE, 0x20AF, - /* B */ 0x20B0, 0x20B1, 0x20B2, 0x20B3, 0x20B4, 0x20B5, 0x20B6, 0x20B7, 0x20B8, 0x20B9, 0x20BA, 0x20BB, 0x20BC, 0x20BD, 0x20BE, 0x20BF, - /* C */ 0x20C0, 0x20C1, 0x20C2, 0x20C3, 0x20C4, 0x20C5, 0x20C6, 0x20C7, 0x20C8, 0x20C9, 0x20CA, 0x20CB, 0x20CC, 0x20CD, 0x20CE, 0x20CF, - /* D */ 0x20D0, 0x20D1, 0x20D2, 0x20D3, 0x20D4, 0x20D5, 0x20D6, 0x20D7, 0x20D8, 0x20D9, 0x20DA, 0x20DB, 0x20DC, 0x20DD, 0x20DE, 0x20DF, - /* E */ 0x20E0, 0x20E1, 0x20E2, 0x20E3, 0x20E4, 0x20E5, 0x20E6, 0x20E7, 0x20E8, 0x20E9, 0x20EA, 0x20EB, 0x20EC, 0x20ED, 0x20EE, 0x20EF, - /* F */ 0x20F0, 0x20F1, 0x20F2, 0x20F3, 0x20F4, 0x20F5, 0x20F6, 0x20F7, 0x20F8, 0x20F9, 0x20FA, 0x20FB, 0x20FC, 0x20FD, 0x20FE, 0x20FF, - - /* Table 7 (for high byte 0x21) */ - - /* 0 */ 0x2100, 0x2101, 0x2102, 0x2103, 0x2104, 0x2105, 0x2106, 0x2107, 0x2108, 0x2109, 0x210A, 0x210B, 0x210C, 
0x210D, 0x210E, 0x210F, - /* 1 */ 0x2110, 0x2111, 0x2112, 0x2113, 0x2114, 0x2115, 0x2116, 0x2117, 0x2118, 0x2119, 0x211A, 0x211B, 0x211C, 0x211D, 0x211E, 0x211F, - /* 2 */ 0x2120, 0x2121, 0x2122, 0x2123, 0x2124, 0x2125, 0x2126, 0x2127, 0x2128, 0x2129, 0x212A, 0x212B, 0x212C, 0x212D, 0x212E, 0x212F, - /* 3 */ 0x2130, 0x2131, 0x2132, 0x2133, 0x2134, 0x2135, 0x2136, 0x2137, 0x2138, 0x2139, 0x213A, 0x213B, 0x213C, 0x213D, 0x213E, 0x213F, - /* 4 */ 0x2140, 0x2141, 0x2142, 0x2143, 0x2144, 0x2145, 0x2146, 0x2147, 0x2148, 0x2149, 0x214A, 0x214B, 0x214C, 0x214D, 0x214E, 0x214F, - /* 5 */ 0x2150, 0x2151, 0x2152, 0x2153, 0x2154, 0x2155, 0x2156, 0x2157, 0x2158, 0x2159, 0x215A, 0x215B, 0x215C, 0x215D, 0x215E, 0x215F, - /* 6 */ 0x2170, 0x2171, 0x2172, 0x2173, 0x2174, 0x2175, 0x2176, 0x2177, 0x2178, 0x2179, 0x217A, 0x217B, 0x217C, 0x217D, 0x217E, 0x217F, - /* 7 */ 0x2170, 0x2171, 0x2172, 0x2173, 0x2174, 0x2175, 0x2176, 0x2177, 0x2178, 0x2179, 0x217A, 0x217B, 0x217C, 0x217D, 0x217E, 0x217F, - /* 8 */ 0x2180, 0x2181, 0x2182, 0x2183, 0x2184, 0x2185, 0x2186, 0x2187, 0x2188, 0x2189, 0x218A, 0x218B, 0x218C, 0x218D, 0x218E, 0x218F, - /* 9 */ 0x2190, 0x2191, 0x2192, 0x2193, 0x2194, 0x2195, 0x2196, 0x2197, 0x2198, 0x2199, 0x219A, 0x219B, 0x219C, 0x219D, 0x219E, 0x219F, - /* A */ 0x21A0, 0x21A1, 0x21A2, 0x21A3, 0x21A4, 0x21A5, 0x21A6, 0x21A7, 0x21A8, 0x21A9, 0x21AA, 0x21AB, 0x21AC, 0x21AD, 0x21AE, 0x21AF, - /* B */ 0x21B0, 0x21B1, 0x21B2, 0x21B3, 0x21B4, 0x21B5, 0x21B6, 0x21B7, 0x21B8, 0x21B9, 0x21BA, 0x21BB, 0x21BC, 0x21BD, 0x21BE, 0x21BF, - /* C */ 0x21C0, 0x21C1, 0x21C2, 0x21C3, 0x21C4, 0x21C5, 0x21C6, 0x21C7, 0x21C8, 0x21C9, 0x21CA, 0x21CB, 0x21CC, 0x21CD, 0x21CE, 0x21CF, - /* D */ 0x21D0, 0x21D1, 0x21D2, 0x21D3, 0x21D4, 0x21D5, 0x21D6, 0x21D7, 0x21D8, 0x21D9, 0x21DA, 0x21DB, 0x21DC, 0x21DD, 0x21DE, 0x21DF, - /* E */ 0x21E0, 0x21E1, 0x21E2, 0x21E3, 0x21E4, 0x21E5, 0x21E6, 0x21E7, 0x21E8, 0x21E9, 0x21EA, 0x21EB, 0x21EC, 0x21ED, 0x21EE, 0x21EF, - /* F */ 0x21F0, 0x21F1, 0x21F2, 0x21F3, 
0x21F4, 0x21F5, 0x21F6, 0x21F7, 0x21F8, 0x21F9, 0x21FA, 0x21FB, 0x21FC, 0x21FD, 0x21FE, 0x21FF, - - /* Table 8 (for high byte 0xFE) */ - - /* 0 */ 0xFE00, 0xFE01, 0xFE02, 0xFE03, 0xFE04, 0xFE05, 0xFE06, 0xFE07, 0xFE08, 0xFE09, 0xFE0A, 0xFE0B, 0xFE0C, 0xFE0D, 0xFE0E, 0xFE0F, - /* 1 */ 0xFE10, 0xFE11, 0xFE12, 0xFE13, 0xFE14, 0xFE15, 0xFE16, 0xFE17, 0xFE18, 0xFE19, 0xFE1A, 0xFE1B, 0xFE1C, 0xFE1D, 0xFE1E, 0xFE1F, - /* 2 */ 0xFE20, 0xFE21, 0xFE22, 0xFE23, 0xFE24, 0xFE25, 0xFE26, 0xFE27, 0xFE28, 0xFE29, 0xFE2A, 0xFE2B, 0xFE2C, 0xFE2D, 0xFE2E, 0xFE2F, - /* 3 */ 0xFE30, 0xFE31, 0xFE32, 0xFE33, 0xFE34, 0xFE35, 0xFE36, 0xFE37, 0xFE38, 0xFE39, 0xFE3A, 0xFE3B, 0xFE3C, 0xFE3D, 0xFE3E, 0xFE3F, - /* 4 */ 0xFE40, 0xFE41, 0xFE42, 0xFE43, 0xFE44, 0xFE45, 0xFE46, 0xFE47, 0xFE48, 0xFE49, 0xFE4A, 0xFE4B, 0xFE4C, 0xFE4D, 0xFE4E, 0xFE4F, - /* 5 */ 0xFE50, 0xFE51, 0xFE52, 0xFE53, 0xFE54, 0xFE55, 0xFE56, 0xFE57, 0xFE58, 0xFE59, 0xFE5A, 0xFE5B, 0xFE5C, 0xFE5D, 0xFE5E, 0xFE5F, - /* 6 */ 0xFE60, 0xFE61, 0xFE62, 0xFE63, 0xFE64, 0xFE65, 0xFE66, 0xFE67, 0xFE68, 0xFE69, 0xFE6A, 0xFE6B, 0xFE6C, 0xFE6D, 0xFE6E, 0xFE6F, - /* 7 */ 0xFE70, 0xFE71, 0xFE72, 0xFE73, 0xFE74, 0xFE75, 0xFE76, 0xFE77, 0xFE78, 0xFE79, 0xFE7A, 0xFE7B, 0xFE7C, 0xFE7D, 0xFE7E, 0xFE7F, - /* 8 */ 0xFE80, 0xFE81, 0xFE82, 0xFE83, 0xFE84, 0xFE85, 0xFE86, 0xFE87, 0xFE88, 0xFE89, 0xFE8A, 0xFE8B, 0xFE8C, 0xFE8D, 0xFE8E, 0xFE8F, - /* 9 */ 0xFE90, 0xFE91, 0xFE92, 0xFE93, 0xFE94, 0xFE95, 0xFE96, 0xFE97, 0xFE98, 0xFE99, 0xFE9A, 0xFE9B, 0xFE9C, 0xFE9D, 0xFE9E, 0xFE9F, - /* A */ 0xFEA0, 0xFEA1, 0xFEA2, 0xFEA3, 0xFEA4, 0xFEA5, 0xFEA6, 0xFEA7, 0xFEA8, 0xFEA9, 0xFEAA, 0xFEAB, 0xFEAC, 0xFEAD, 0xFEAE, 0xFEAF, - /* B */ 0xFEB0, 0xFEB1, 0xFEB2, 0xFEB3, 0xFEB4, 0xFEB5, 0xFEB6, 0xFEB7, 0xFEB8, 0xFEB9, 0xFEBA, 0xFEBB, 0xFEBC, 0xFEBD, 0xFEBE, 0xFEBF, - /* C */ 0xFEC0, 0xFEC1, 0xFEC2, 0xFEC3, 0xFEC4, 0xFEC5, 0xFEC6, 0xFEC7, 0xFEC8, 0xFEC9, 0xFECA, 0xFECB, 0xFECC, 0xFECD, 0xFECE, 0xFECF, - /* D */ 0xFED0, 0xFED1, 0xFED2, 0xFED3, 0xFED4, 0xFED5, 0xFED6, 
0xFED7, 0xFED8, 0xFED9, 0xFEDA, 0xFEDB, 0xFEDC, 0xFEDD, 0xFEDE, 0xFEDF, - /* E */ 0xFEE0, 0xFEE1, 0xFEE2, 0xFEE3, 0xFEE4, 0xFEE5, 0xFEE6, 0xFEE7, 0xFEE8, 0xFEE9, 0xFEEA, 0xFEEB, 0xFEEC, 0xFEED, 0xFEEE, 0xFEEF, - /* F */ 0xFEF0, 0xFEF1, 0xFEF2, 0xFEF3, 0xFEF4, 0xFEF5, 0xFEF6, 0xFEF7, 0xFEF8, 0xFEF9, 0xFEFA, 0xFEFB, 0xFEFC, 0xFEFD, 0xFEFE, 0x0000, - - /* Table 9 (for high byte 0xFF) */ - - /* 0 */ 0xFF00, 0xFF01, 0xFF02, 0xFF03, 0xFF04, 0xFF05, 0xFF06, 0xFF07, 0xFF08, 0xFF09, 0xFF0A, 0xFF0B, 0xFF0C, 0xFF0D, 0xFF0E, 0xFF0F, - /* 1 */ 0xFF10, 0xFF11, 0xFF12, 0xFF13, 0xFF14, 0xFF15, 0xFF16, 0xFF17, 0xFF18, 0xFF19, 0xFF1A, 0xFF1B, 0xFF1C, 0xFF1D, 0xFF1E, 0xFF1F, - /* 2 */ 0xFF20, 0xFF41, 0xFF42, 0xFF43, 0xFF44, 0xFF45, 0xFF46, 0xFF47, 0xFF48, 0xFF49, 0xFF4A, 0xFF4B, 0xFF4C, 0xFF4D, 0xFF4E, 0xFF4F, - /* 3 */ 0xFF50, 0xFF51, 0xFF52, 0xFF53, 0xFF54, 0xFF55, 0xFF56, 0xFF57, 0xFF58, 0xFF59, 0xFF5A, 0xFF3B, 0xFF3C, 0xFF3D, 0xFF3E, 0xFF3F, - /* 4 */ 0xFF40, 0xFF41, 0xFF42, 0xFF43, 0xFF44, 0xFF45, 0xFF46, 0xFF47, 0xFF48, 0xFF49, 0xFF4A, 0xFF4B, 0xFF4C, 0xFF4D, 0xFF4E, 0xFF4F, - /* 5 */ 0xFF50, 0xFF51, 0xFF52, 0xFF53, 0xFF54, 0xFF55, 0xFF56, 0xFF57, 0xFF58, 0xFF59, 0xFF5A, 0xFF5B, 0xFF5C, 0xFF5D, 0xFF5E, 0xFF5F, - /* 6 */ 0xFF60, 0xFF61, 0xFF62, 0xFF63, 0xFF64, 0xFF65, 0xFF66, 0xFF67, 0xFF68, 0xFF69, 0xFF6A, 0xFF6B, 0xFF6C, 0xFF6D, 0xFF6E, 0xFF6F, - /* 7 */ 0xFF70, 0xFF71, 0xFF72, 0xFF73, 0xFF74, 0xFF75, 0xFF76, 0xFF77, 0xFF78, 0xFF79, 0xFF7A, 0xFF7B, 0xFF7C, 0xFF7D, 0xFF7E, 0xFF7F, - /* 8 */ 0xFF80, 0xFF81, 0xFF82, 0xFF83, 0xFF84, 0xFF85, 0xFF86, 0xFF87, 0xFF88, 0xFF89, 0xFF8A, 0xFF8B, 0xFF8C, 0xFF8D, 0xFF8E, 0xFF8F, - /* 9 */ 0xFF90, 0xFF91, 0xFF92, 0xFF93, 0xFF94, 0xFF95, 0xFF96, 0xFF97, 0xFF98, 0xFF99, 0xFF9A, 0xFF9B, 0xFF9C, 0xFF9D, 0xFF9E, 0xFF9F, - /* A */ 0xFFA0, 0xFFA1, 0xFFA2, 0xFFA3, 0xFFA4, 0xFFA5, 0xFFA6, 0xFFA7, 0xFFA8, 0xFFA9, 0xFFAA, 0xFFAB, 0xFFAC, 0xFFAD, 0xFFAE, 0xFFAF, - /* B */ 0xFFB0, 0xFFB1, 0xFFB2, 0xFFB3, 0xFFB4, 0xFFB5, 0xFFB6, 0xFFB7, 0xFFB8, 0xFFB9, 
0xFFBA, 0xFFBB, 0xFFBC, 0xFFBD, 0xFFBE, 0xFFBF, - /* C */ 0xFFC0, 0xFFC1, 0xFFC2, 0xFFC3, 0xFFC4, 0xFFC5, 0xFFC6, 0xFFC7, 0xFFC8, 0xFFC9, 0xFFCA, 0xFFCB, 0xFFCC, 0xFFCD, 0xFFCE, 0xFFCF, - /* D */ 0xFFD0, 0xFFD1, 0xFFD2, 0xFFD3, 0xFFD4, 0xFFD5, 0xFFD6, 0xFFD7, 0xFFD8, 0xFFD9, 0xFFDA, 0xFFDB, 0xFFDC, 0xFFDD, 0xFFDE, 0xFFDF, - /* E */ 0xFFE0, 0xFFE1, 0xFFE2, 0xFFE3, 0xFFE4, 0xFFE5, 0xFFE6, 0xFFE7, 0xFFE8, 0xFFE9, 0xFFEA, 0xFFEB, 0xFFEC, 0xFFED, 0xFFEE, 0xFFEF, - /* F */ 0xFFF0, 0xFFF1, 0xFFF2, 0xFFF3, 0xFFF4, 0xFFF5, 0xFFF6, 0xFFF7, 0xFFF8, 0xFFF9, 0xFFFA, 0xFFFB, 0xFFFC, 0xFFFD, 0xFFFE, 0xFFFF, -}; - - -/* RelString case folding table */ - -unsigned short gCompareTable[] = { - - /* 0 */ 0x0000, 0x0100, 0x0200, 0x0300, 0x0400, 0x0500, 0x0600, 0x0700, 0x0800, 0x0900, 0x0A00, 0x0B00, 0x0C00, 0x0D00, 0x0E00, 0x0F00, - /* 1 */ 0x1000, 0x1100, 0x1200, 0x1300, 0x1400, 0x1500, 0x1600, 0x1700, 0x1800, 0x1900, 0x1A00, 0x1B00, 0x1C00, 0x1D00, 0x1E00, 0x1F00, - /* 2 */ 0x2000, 0x2100, 0x2200, 0x2300, 0x2400, 0x2500, 0x2600, 0x2700, 0x2800, 0x2900, 0x2A00, 0x2B00, 0x2C00, 0x2D00, 0x2E00, 0x2F00, - /* 3 */ 0x3000, 0x3100, 0x3200, 0x3300, 0x3400, 0x3500, 0x3600, 0x3700, 0x3800, 0x3900, 0x3A00, 0x3B00, 0x3C00, 0x3D00, 0x3E00, 0x3F00, - /* 4 */ 0x4000, 0x4100, 0x4200, 0x4300, 0x4400, 0x4500, 0x4600, 0x4700, 0x4800, 0x4900, 0x4A00, 0x4B00, 0x4C00, 0x4D00, 0x4E00, 0x4F00, - /* 5 */ 0x5000, 0x5100, 0x5200, 0x5300, 0x5400, 0x5500, 0x5600, 0x5700, 0x5800, 0x5900, 0x5A00, 0x5B00, 0x5C00, 0x5D00, 0x5E00, 0x5F00, - - // 0x60 maps to 'a' - // range 0x61 to 0x7a ('a' to 'z') map to upper case - - /* 6 */ 0x4180, 0x4100, 0x4200, 0x4300, 0x4400, 0x4500, 0x4600, 0x4700, 0x4800, 0x4900, 0x4A00, 0x4B00, 0x4C00, 0x4D00, 0x4E00, 0x4F00, - /* 7 */ 0x5000, 0x5100, 0x5200, 0x5300, 0x5400, 0x5500, 0x5600, 0x5700, 0x5800, 0x5900, 0x5A00, 0x7B00, 0x7C00, 0x7D00, 0x7E00, 0x7F00, - - // range 0x80 to 0xd8 gets mapped... 
- - /* 8 */ 0x4108, 0x410C, 0x4310, 0x4502, 0x4E0A, 0x4F08, 0x5508, 0x4182, 0x4104, 0x4186, 0x4108, 0x410A, 0x410C, 0x4310, 0x4502, 0x4584, - /* 9 */ 0x4586, 0x4588, 0x4982, 0x4984, 0x4986, 0x4988, 0x4E0A, 0x4F82, 0x4F84, 0x4F86, 0x4F08, 0x4F0A, 0x5582, 0x5584, 0x5586, 0x5508, - /* A */ 0xA000, 0xA100, 0xA200, 0xA300, 0xA400, 0xA500, 0xA600, 0x5382, 0xA800, 0xA900, 0xAA00, 0xAB00, 0xAC00, 0xAD00, 0x4114, 0x4F0E, - /* B */ 0xB000, 0xB100, 0xB200, 0xB300, 0xB400, 0xB500, 0xB600, 0xB700, 0xB800, 0xB900, 0xBA00, 0x4192, 0x4F92, 0xBD00, 0x4114, 0x4F0E, - /* C */ 0xC000, 0xC100, 0xC200, 0xC300, 0xC400, 0xC500, 0xC600, 0x2206, 0x2208, 0xC900, 0x2000, 0x4104, 0x410A, 0x4F0A, 0x4F14, 0x4F14, - /* D */ 0xD000, 0xD100, 0x2202, 0x2204, 0x2702, 0x2704, 0xD600, 0xD700, 0x5988, 0xD900, 0xDA00, 0xDB00, 0xDC00, 0xDD00, 0xDE00, 0xDF00, - - /* E */ 0xE000, 0xE100, 0xE200, 0xE300, 0xE400, 0xE500, 0xE600, 0xE700, 0xE800, 0xE900, 0xEA00, 0xEB00, 0xEC00, 0xED00, 0xEE00, 0xEF00, - /* F */ 0xF000, 0xF100, 0xF200, 0xF300, 0xF400, 0xF500, 0xF600, 0xF700, 0xF800, 0xF900, 0xFA00, 0xFB00, 0xFC00, 0xFD00, 0xFE00, 0xFF00, - -}; -#endif /* __APPLE_API_PRIVATE */ -#endif /* KERNEL */ -#endif /* _UCSTRINGCOMPAREDATA_ */ diff --git a/bsd/hfs/hfscommon/Unicode/UnicodeWrappers.c b/bsd/hfs/hfscommon/Unicode/UnicodeWrappers.c deleted file mode 100644 index 2d1c22900..000000000 --- a/bsd/hfs/hfscommon/Unicode/UnicodeWrappers.c +++ /dev/null @@ -1,529 +0,0 @@ -/* - * Copyright (c) 2000-2013 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - File: UnicodeWrappers.c - - Contains: Wrapper routines for Unicode conversion and comparison. 
- -*/ - -#if HFS -#include -#include - -#include "../../hfs_macos_defs.h" -#include "UCStringCompareData.h" - -#include "../headers/FileMgrInternal.h" -#include "../headers/HFSUnicodeWrappers.h" - -enum { - kMinFileExtensionChars = 1, /* does not include dot */ - kMaxFileExtensionChars = 5 /* does not include dot */ -}; - - -#define EXTENSIONCHAR(c) (((c) >= 0x61 && (c) <= 0x7A) || \ - ((c) >= 0x41 && (c) <= 0x5A) || \ - ((c) >= 0x30 && (c) <= 0x39)) - - -#define IsHexDigit(c) (((c) >= (u_int8_t) '0' && (c) <= (u_int8_t) '9') || \ - ((c) >= (u_int8_t) 'A' && (c) <= (u_int8_t) 'F')) - - -static void GetFilenameExtension( ItemCount length, ConstUniCharArrayPtr unicodeStr, char* extStr ); - - -static u_int32_t HexStringToInteger( u_int32_t length, const u_int8_t *hexStr ); - - -/* - * Get filename extension (if any) as a C string - */ -static void -GetFilenameExtension(ItemCount length, ConstUniCharArrayPtr unicodeStr, char * extStr) -{ - u_int32_t i; - UniChar c; - u_int16_t extChars; /* number of extension chars (excluding dot) */ - u_int16_t maxExtChars; - Boolean foundExtension; - - extStr[0] = '\0'; /* assume there's no extension */ - - if ( length < 3 ) - return; /* "x.y" is smallest possible extension */ - - if ( length < (kMaxFileExtensionChars + 2) ) - maxExtChars = length - 2; /* save room for prefix + dot */ - else - maxExtChars = kMaxFileExtensionChars; - - i = length; - extChars = 0; - foundExtension = false; - - while ( extChars <= maxExtChars ) { - c = unicodeStr[--i]; - - /* look for leading dot */ - if ( c == (UniChar) '.' 
) { - if ( extChars > 0 ) /* cannot end with a dot */ - foundExtension = true; - break; - } - - if ( EXTENSIONCHAR(c) ) - ++extChars; - else - break; - } - - /* if we found one then copy it */ - if ( foundExtension ) { - u_int8_t *extStrPtr = (u_int8_t *)extStr; - const UniChar *unicodeStrPtr = &unicodeStr[i]; - - for ( i = 0; i <= extChars; ++i ) - *(extStrPtr++) = (u_int8_t) *(unicodeStrPtr++); - extStr[extChars + 1] = '\0'; /* terminate extension + dot */ - } -} - - - -/* - * Count filename extension characters (if any) - */ -__private_extern__ u_int32_t -CountFilenameExtensionChars( const unsigned char * filename, u_int32_t length ) -{ - u_int32_t i; - UniChar c; - u_int32_t extChars; /* number of extension chars (excluding dot) */ - u_int16_t maxExtChars; - Boolean foundExtension; - - if ( length < 3 ) - return 0; /* "x.y" is smallest possible extension */ - - if ( length < (kMaxFileExtensionChars + 2) ) - maxExtChars = length - 2; /* save room for prefix + dot */ - else - maxExtChars = kMaxFileExtensionChars; - - extChars = 0; /* assume there's no extension */ - i = length - 1; /* index to last ascii character */ - foundExtension = false; - - while ( extChars <= maxExtChars ) { - c = filename[i--]; - - /* look for leading dot */ - if ( c == (u_int8_t) '.' ) { - if ( extChars > 0 ) /* cannot end with a dot */ - return (extChars); - - break; - } - - if ( EXTENSIONCHAR(c) ) - ++extChars; - else - break; - } - - return 0; -} - - -/* - * extract the file id from a mangled name - */ -HFSCatalogNodeID -GetEmbeddedFileID(const unsigned char * filename, u_int32_t length, u_int32_t *prefixLength) -{ - short extChars; - short i; - u_int8_t c; - - *prefixLength = 0; - - if ( filename == NULL ) - return 0; - - if ( length < 28 ) - return 0; /* too small to have been mangled */ - - /* big enough for a file ID (#10) and an extension (.x) ? 
*/ - if ( length > 5 ) - extChars = CountFilenameExtensionChars(filename, length); - else - extChars = 0; - - /* skip over dot plus extension characters */ - if ( extChars > 0 ) - length -= (extChars + 1); - - /* scan for file id digits */ - for ( i = length - 1; i >= 0; --i) { - c = filename[i]; - - /* look for file ID marker */ - if ( c == '#' ) { - if ( (length - i) < 3 ) - break; /* too small to be a file ID */ - - *prefixLength = i; - return HexStringToInteger(length - i - 1, &filename[i+1]); - } - - if ( !IsHexDigit(c) ) - break; /* file ID string must have hex digits */ - } - - return 0; -} - - - -static u_int32_t -HexStringToInteger(u_int32_t length, const u_int8_t *hexStr) -{ - u_int32_t value; - u_int32_t i; - u_int8_t c; - const u_int8_t *p; - - value = 0; - p = hexStr; - - for ( i = 0; i < length; ++i ) { - c = *p++; - - if (c >= '0' && c <= '9') { - value = value << 4; - value += (u_int32_t) c - (u_int32_t) '0'; - } else if (c >= 'A' && c <= 'F') { - value = value << 4; - value += 10 + ((unsigned int) c - (unsigned int) 'A'); - } else { - return 0; /* bad character */ - } - } - - return value; -} - - -/* - * Routine: FastRelString - * - * Output: returns -1 if str1 < str2 - * returns 1 if str1 > str2 - * return 0 if equal - * - */ -int32_t FastRelString( ConstStr255Param str1, ConstStr255Param str2 ) -{ - u_int16_t* compareTable; - int32_t bestGuess; - u_int8_t length, length2; - u_int8_t delta; - - delta = 0; - length = *(str1++); - length2 = *(str2++); - - if (length == length2) - bestGuess = 0; - else if (length < length2) - { - bestGuess = -1; - delta = length2 - length; - } - else - { - bestGuess = 1; - length = length2; - } - - compareTable = (u_int16_t*) gCompareTable; - - while (length--) - { - u_int8_t aChar, bChar; - - aChar = *(str1++); - bChar = *(str2++); - - if (aChar != bChar) // If they don't match exacly, do case conversion - { - u_int16_t aSortWord, bSortWord; - - aSortWord = compareTable[aChar]; - bSortWord = compareTable[bChar]; - - 
if (aSortWord > bSortWord) - return 1; - - if (aSortWord < bSortWord) - return -1; - } - - // If characters match exactly, then go on to next character immediately without - // doing any extra work. - } - - // if you got to here, then return bestGuess - return bestGuess; -} - - - -// -// FastUnicodeCompare - Compare two Unicode strings; produce a relative ordering -// -// IF RESULT -// -------------------------- -// str1 < str2 => -1 -// str1 = str2 => 0 -// str1 > str2 => +1 -// -// The lower case table starts with 256 entries (one for each of the upper bytes -// of the original Unicode char). If that entry is zero, then all characters with -// that upper byte are already case folded. If the entry is non-zero, then it is -// the _index_ (not byte offset) of the start of the sub-table for the characters -// with that upper byte. All ignorable characters are folded to the value zero. -// -// In pseudocode: -// -// Let c = source Unicode character -// Let table[] = lower case table -// -// lower = table[highbyte(c)] -// if (lower == 0) -// lower = c -// else -// lower = table[lower+lowbyte(c)] -// -// if (lower == 0) -// ignore this character -// -// To handle ignorable characters, we now need a loop to find the next valid character. -// Also, we can't pre-compute the number of characters to compare; the string length might -// be larger than the number of non-ignorable characters. Further, we must be able to handle -// ignorable characters at any point in the string, including as the first or last characters. -// We use a zero value as a sentinel to detect both end-of-string and ignorable characters. -// Since the File Manager doesn't prevent the NUL character (value zero) as part of a filename, -// the case mapping table is assumed to map u+0000 to some non-zero value (like 0xFFFF, which is -// an invalid Unicode character). 
-// -// Pseudocode: -// -// while (1) { -// c1 = GetNextValidChar(str1) // returns zero if at end of string -// c2 = GetNextValidChar(str2) -// -// if (c1 != c2) break // found a difference -// -// if (c1 == 0) // reached end of string on both strings at once? -// return 0; // yes, so strings are equal -// } -// -// // When we get here, c1 != c2. So, we just need to determine which one is less. -// if (c1 < c2) -// return -1; -// else -// return 1; -// - -int32_t FastUnicodeCompare ( register ConstUniCharArrayPtr str1, register ItemCount length1, - register ConstUniCharArrayPtr str2, register ItemCount length2) -{ - register u_int16_t c1,c2; - register u_int16_t temp; - register u_int16_t* lowerCaseTable; - - lowerCaseTable = (u_int16_t*) gLowerCaseTable; - - while (1) { - /* Set default values for c1, c2 in case there are no more valid chars */ - c1 = 0; - c2 = 0; - - /* Find next non-ignorable char from str1, or zero if no more */ - while (length1 && c1 == 0) { - c1 = *(str1++); - --length1; - /* check for basic latin first */ - if (c1 < 0x0100) { - c1 = gLatinCaseFold[c1]; - break; - } - /* case fold if neccessary */ - if ((temp = lowerCaseTable[c1>>8]) != 0) - c1 = lowerCaseTable[temp + (c1 & 0x00FF)]; - } - - - /* Find next non-ignorable char from str2, or zero if no more */ - while (length2 && c2 == 0) { - c2 = *(str2++); - --length2; - /* check for basic latin first */ - if (c2 < 0x0100) { - c2 = gLatinCaseFold[c2]; - break; - } - /* case fold if neccessary */ - if ((temp = lowerCaseTable[c2>>8]) != 0) - c2 = lowerCaseTable[temp + (c2 & 0x00FF)]; - } - - if (c1 != c2) // found a difference, so stop looping - break; - - if (c1 == 0) // did we reach the end of both strings at the same time? - return 0; // yes, so strings are equal - } - - if (c1 < c2) - return -1; - else - return 1; -} - -/* - * UnicodeBinaryCompare - * Compare two UTF-16 strings and perform case-sensitive (binary) matching against them. 
- * - * Results are emitted like FastUnicodeCompare: - * - * - * IF RESULT - * -------------------------- - * str1 < str2 => -1 - * str1 = str2 => 0 - * str1 > str2 => +1 - * - * The case matching source code is greatly simplified due to the lack of case-folding - * in this comparison routine. We compare, in order: the lengths, then do character-by- - * character comparisons. - * - */ -int32_t UnicodeBinaryCompare (register ConstUniCharArrayPtr str1, register ItemCount len1, - register ConstUniCharArrayPtr str2, register ItemCount len2) { - uint16_t c1; - uint16_t c2; - int string_length; - int32_t result = 0; - - /* Set default values for the two character pointers */ - c1 = 0; - c2 = 0; - - /* First generate the string length (for comparison purposes) */ - if (len1 < len2) { - string_length = len1; - --result; - } - else if (len1 > len2) { - string_length = len2; - ++result; - } - else { - string_length = len1; - } - - /* now compare the two string pointers */ - while (string_length--) { - c1 = *(str1++); - c2 = *(str2++); - - if (c1 > c2) { - result = 1; - break; - } - - if (c1 < c2) { - result = -1; - break; - } - /* If equal, iterate to the next two respective chars */ - } - - return result; -} - - -OSErr -ConvertUnicodeToUTF8Mangled(ByteCount srcLen, ConstUniCharArrayPtr srcStr, ByteCount maxDstLen, - ByteCount *actualDstLen, unsigned char* dstStr, HFSCatalogNodeID cnid) -{ - ByteCount subMaxLen; - size_t utf8len; - char fileIDStr[15]; - char extStr[15]; - - snprintf(fileIDStr, sizeof(fileIDStr), "#%X", cnid); - GetFilenameExtension(srcLen/sizeof(UniChar), srcStr, extStr); - - /* remove extension chars from source */ - srcLen -= strlen(extStr) * sizeof(UniChar); - subMaxLen = maxDstLen - (strlen(extStr) + strlen(fileIDStr)); - - (void) utf8_encodestr(srcStr, srcLen, dstStr, &utf8len, subMaxLen, ':', 0); - - strlcat((char *)dstStr, fileIDStr, maxDstLen); - strlcat((char *)dstStr, extStr, maxDstLen); - *actualDstLen = utf8len + (strlen(extStr) + 
strlen(fileIDStr)); - - return noErr; -} - -#else /* not HFS - temp workaround until 4277828 is fixed */ -/* stubs for exported routines that aren't present when we build kernel without HFS */ - -#include - -int32_t FastUnicodeCompare( void * str1, u_int32_t length1, void * str2, u_int32_t length2 ); - - -int32_t FastUnicodeCompare( __unused void * str1, - __unused u_int32_t length1, - __unused void * str2, - __unused u_int32_t length2 ) -{ - return( 0 ); -} - - -#endif /* HFS */ - diff --git a/bsd/hfs/hfscommon/headers/BTreeScanner.h b/bsd/hfs/hfscommon/headers/BTreeScanner.h deleted file mode 100644 index a806102e6..000000000 --- a/bsd/hfs/hfscommon/headers/BTreeScanner.h +++ /dev/null @@ -1,122 +0,0 @@ -/* - * Copyright (c) 1996-2004 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
- * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - * - * @(#)BTreeScanner.h - */ - -#ifndef _BTREESCANNER_H_ -#define _BTREESCANNER_H_ - -#include - -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE -#include - -#include "FileMgrInternal.h" -#include "BTreesPrivate.h" - -// amount of time we are allowed to process a catalog search (in � secs) -// NOTE - code assumes kMaxMicroSecsInKernel is less than 1,000,000 -enum { kMaxMicroSecsInKernel = (1000 * 100) }; // 1 tenth of a second - -// btree node scanner buffer size. at 32K we get 8 nodes. this is the size used -// in Mac OS 9 -enum { kCatSearchBufferSize = (32 * 1024) }; - - -/* - * ============ W A R N I N G ! ============ - * DO NOT INCREASE THE SIZE OF THIS STRUCT! - * It must be less than or equal to the size of - * the opaque searchstate struct (in sys/attr.h). - */ -/* Private description used in hfs_search */ -struct CatPosition -{ - u_int32_t writeCount; /* The BTree's write count (to see if the catalog writeCount */ - /* changed since the last search). If 0, the rest */ - /* of the record is invalid, start from beginning. */ - u_int32_t nextNode; /* node number to resume search */ - u_int32_t nextRecord; /* record number to resume search */ - u_int32_t recordsFound; /* number of leaf records seen so far */ -}; -typedef struct CatPosition CatPosition; - - -/* - BTScanState - This structure is used to keep track of the current state - of a BTree scan. It contains both the dynamic state information (like - the current node number and record number) and information that is static - for the duration of a scan (such as buffer pointers). - - NOTE: recordNum may equal or exceed the number of records in the node - number nodeNum. If so, then the next attempt to get a record will move - to a new node number. -*/ -struct BTScanState -{ - // The following fields are set up once at initialization time. 
- // They are not changed during a scan. - u_int32_t bufferSize; - struct buf * bufferPtr; - BTreeControlBlock * btcb; - - // The following fields are the dynamic state of the current scan. - u_int32_t nodeNum; // zero is first node - u_int32_t recordNum; // zero is first record - BTNodeDescriptor * currentNodePtr; // points to current node within buffer - u_int32_t nodesLeftInBuffer; // number of valid nodes still in the buffer - u_int32_t recordsFound; // number of leaf records seen so far - struct timeval startTime; // time we started catalog search -}; -typedef struct BTScanState BTScanState; - - -/* *********************** PROTOTYPES *********************** */ - -int BTScanInitialize( const FCB * btreeFile, - u_int32_t startingNode, - u_int32_t startingRecord, - u_int32_t recordsFound, - u_int32_t bufferSize, - BTScanState * scanState ); - -int BTScanNextRecord( BTScanState * scanState, - Boolean avoidIO, - void * * key, - void * * data, - u_int32_t * dataSize ); - -int BTScanTerminate( BTScanState * scanState, - u_int32_t * startingNode, - u_int32_t * startingRecord, - u_int32_t * recordsFound ); - -#endif /* __APPLE_API_PRIVATE */ -#endif /* KERNEL */ -#endif /* !_BTREESCANNER_H_ */ diff --git a/bsd/hfs/hfscommon/headers/BTreesInternal.h b/bsd/hfs/hfscommon/headers/BTreesInternal.h deleted file mode 100644 index f3c4e37d2..000000000 --- a/bsd/hfs/hfscommon/headers/BTreesInternal.h +++ /dev/null @@ -1,368 +0,0 @@ -/* - * Copyright (c) 2000-2014 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - File: BTreesInternal.h - - Contains: IPI to File Manager B-tree - - Version: HFS Plus 1.0 - - Copyright: � 1996-1998 by Apple Computer, Inc., all rights reserved. - - File Ownership: - - DRI: Don Brady - - Other Contact: Mark Day - - Technology: File Systems - - Writers: - - (msd) Mark Day - (DSH) Deric Horn - (djb) Don Brady - - Change History (most recent first): - - 9/22/99 ser Added prototypes for BTGetLastSync and BTSetLastSync - 6/22/98 djb Add ERR_BASE to btree error codes to make them negative (for MacOS X only). - - 7/28/97 msd Add enum for fsBTTimeOutErr. - 7/25/97 DSH Added heuristicHint as parameter to BTSearchRecord. - 7/24/97 djb Add blockReadFromDisk flag to BlockDescriptor. Callbacks now use - a file refNum instead of an FCB. - 7/16/97 DSH FilesInternal.i renamed FileMgrInternal.i to avoid name - collision - 6/2/97 DSH Added SetEndOfForkProc() prototype, so Attributes.c can call it - directly. - 5/19/97 djb kMaxKeyLength is now 520. 
- 4/28/97 djb first checked in - - 3/17/97 DSH Remove Key Comparison prototype, already in FilesInternal.h. - 2/19/97 djb Add SetBlockSizeProcPtr. Add blockSize field to BlockDescriptor. - Remove E_ type error enums. - 1/27/97 djb Include Types.h and FilesInternal.h. - 1/13/97 djb Added kBTreeCurrentRecord for BTIterateRecord. - 1/3/97 djb Added support for large keys. - 12/19/96 djb first checked in - -*/ - -#ifndef __BTREESINTERNAL__ -#define __BTREESINTERNAL__ - -#include - -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE - -#ifndef __FILEMGRINTERNAL__ -#include "FileMgrInternal.h" -#endif - -enum { - fsBTInvalidHeaderErr = btBadHdr, - fsBTBadRotateErr = dsBadRotate, - fsBTInvalidNodeErr = btBadNode, - fsBTRecordTooLargeErr = btNoFit, - fsBTRecordNotFoundErr = btNotFound, - fsBTDuplicateRecordErr = btExists, - fsBTFullErr = btNoSpaceAvail, - - fsBTInvalidFileErr = ERR_BASE + 0x0302, /* no BTreeCB has been allocated for fork*/ - fsBTrFileAlreadyOpenErr = ERR_BASE + 0x0303, - fsBTInvalidIteratorErr = ERR_BASE + 0x0308, - fsBTEmptyErr = ERR_BASE + 0x030A, - fsBTNoMoreMapNodesErr = ERR_BASE + 0x030B, - fsBTBadNodeSize = ERR_BASE + 0x030C, - fsBTBadNodeType = ERR_BASE + 0x030D, - fsBTInvalidKeyLengthErr = ERR_BASE + 0x030E, - fsBTStartOfIterationErr = ERR_BASE + 0x0353, - fsBTEndOfIterationErr = ERR_BASE + 0x0354, - fsBTUnknownVersionErr = ERR_BASE + 0x0355, - fsBTTreeTooDeepErr = ERR_BASE + 0x0357, - fsIteratorExitedScopeErr = ERR_BASE + 0x0A02, /* iterator exited the scope*/ - fsIteratorScopeExceptionErr = ERR_BASE + 0x0A03, /* iterator is undefined due to error or movement of scope locality*/ - fsUnknownIteratorMovementErr = ERR_BASE + 0x0A04, /* iterator movement is not defined*/ - fsInvalidIterationMovmentErr = ERR_BASE + 0x0A05, /* iterator movement is invalid in current context*/ - fsClientIDMismatchErr = ERR_BASE + 0x0A06, /* wrong client process ID*/ - fsEndOfIterationErr = ERR_BASE + 0x0A07, /* there were no objects left to return on iteration*/ - fsBTTimeOutErr 
= ERR_BASE + 0x0A08 /* BTree scan interrupted -- no time left for physical I/O */ -}; - -struct BlockDescriptor{ - void *buffer; - void *blockHeader; - daddr64_t blockNum; /* logical block number (used by hfs_swap_BTNode) */ - ByteCount blockSize; - Boolean blockReadFromDisk; - Byte isModified; // XXXdbg - for journaling - Byte reserved[2]; -}; -typedef struct BlockDescriptor BlockDescriptor; -typedef BlockDescriptor *BlockDescPtr; - - -struct FSBufferDescriptor { - void * bufferAddress; - ByteCount itemSize; - ItemCount itemCount; -}; -typedef struct FSBufferDescriptor FSBufferDescriptor; - -typedef FSBufferDescriptor *FSBufferDescriptorPtr; - - -/* - Fork Level Access Method Block get options -*/ -enum { - kGetBlock = 0x00000000, - kGetBlockHint = 0x00000001, // if set, the block is being looked up using hint - kForceReadBlock = 0x00000002, //�� how does this relate to Read/Verify? Do we need this? - kGetEmptyBlock = 0x00000008 -}; -typedef u_int32_t GetBlockOptions; - -/* - Fork Level Access Method Block release options -*/ -enum { - kReleaseBlock = 0x00000000, - kForceWriteBlock = 0x00000001, - kMarkBlockDirty = 0x00000002, - kTrashBlock = 0x00000004, - kLockTransaction = 0x00000100 -}; -typedef u_int32_t ReleaseBlockOptions; - -typedef u_int64_t FSSize; -typedef u_int32_t ForkBlockNumber; - -/*============================================================================ - Fork Level Buffered I/O Access Method -============================================================================*/ - -typedef OSStatus (* GetBlockProcPtr) (FileReference fileRefNum, - u_int32_t blockNum, - GetBlockOptions options, - BlockDescriptor *block ); - - -typedef OSStatus (* ReleaseBlockProcPtr) (FileReference fileRefNum, - BlockDescPtr blockPtr, - ReleaseBlockOptions options ); - -typedef OSStatus (* SetEndOfForkProcPtr) (FileReference fileRefNum, - FSSize minEOF, - FSSize maxEOF ); - -typedef OSStatus (* SetBlockSizeProcPtr) (FileReference fileRefNum, - ByteCount blockSize, - 
ItemCount minBlockCount ); - -OSStatus SetEndOfForkProc ( FileReference fileRefNum, FSSize minEOF, FSSize maxEOF ); - - -/* - B*Tree Information Version -*/ - -enum BTreeInformationVersion{ - kBTreeInfoVersion = 0 -}; - -/* - B*Tree Iteration Operation Constants -*/ - -enum BTreeIterationOperations{ - kBTreeFirstRecord, - kBTreeNextRecord, - kBTreePrevRecord, - kBTreeLastRecord, - kBTreeCurrentRecord -}; -typedef u_int16_t BTreeIterationOperation; - - -/* - Btree types: 0 is HFS CAT/EXT file, 1~127 are AppleShare B*Tree files, 128~254 unused - hfsBtreeType EQU 0 ; control file - validBTType EQU $80 ; user btree type starts from 128 - userBT1Type EQU $FF ; 255 is our Btree type. Used by BTInit and BTPatch -*/ - -enum BTreeTypes{ - kHFSBTreeType = 0, // control file - kUserBTreeType = 128, // user btree type starts from 128 - kReservedBTreeType = 255 // -}; - -#define kBTreeHeaderUserBytes 128 - - -typedef BTreeKey *BTreeKeyPtr; - - -/* - BTreeInfoRec Structure - for BTGetInformation -*/ -struct BTreeInfoRec{ - u_int16_t version; - u_int16_t nodeSize; - u_int16_t maxKeyLength; - u_int16_t treeDepth; - u_int32_t lastfsync; /* Last time that this was fsynced */ - ItemCount numRecords; - ItemCount numNodes; - ItemCount numFreeNodes; - u_int8_t keyCompareType; - u_int8_t reserved[3]; -}; -typedef struct BTreeInfoRec BTreeInfoRec; -typedef BTreeInfoRec *BTreeInfoPtr; - -/* - BTreeHint can never be exported to the outside. Use u_int32_t BTreeHint[4], - u_int8_t BTreeHint[16], etc. 
- */ -struct BTreeHint{ - ItemCount writeCount; - u_int32_t nodeNum; // node the key was last seen in - u_int16_t index; // index then key was last seen at - u_int16_t reserved1; - u_int32_t reserved2; -}; -typedef struct BTreeHint BTreeHint; -typedef BTreeHint *BTreeHintPtr; - -/* - BTree Iterator -*/ -struct BTreeIterator{ - BTreeHint hint; - u_int16_t version; - u_int16_t reserved; - u_int32_t hitCount; // Total number of leaf records hit - u_int32_t maxLeafRecs; // Max leaf records over iteration - BTreeKey key; -}; -typedef struct BTreeIterator BTreeIterator; -typedef BTreeIterator *BTreeIteratorPtr; - - -/*============================================================================ - B*Tree SPI -============================================================================*/ - -/* - Key Comparison Function ProcPtr Type - for BTOpenPath -*/ -//typedef int32_t (* KeyCompareProcPtr)(BTreeKeyPtr a, BTreeKeyPtr b); - - -typedef int32_t (* IterateCallBackProcPtr)(BTreeKeyPtr key, void * record, void * state); - - -extern OSStatus BTOpenPath(FCB *filePtr, KeyCompareProcPtr keyCompareProc); - -extern OSStatus BTClosePath (FCB *filePtr ); - - -extern OSStatus BTSearchRecord (FCB *filePtr, - BTreeIterator *searchIterator, - FSBufferDescriptor *btRecord, - u_int16_t *recordLen, - BTreeIterator *resultIterator ); - -extern OSStatus BTIterateRecord (FCB *filePtr, - BTreeIterationOperation operation, - BTreeIterator *iterator, - FSBufferDescriptor *btRecord, - u_int16_t *recordLen ); - - -extern OSStatus BTIterateRecords(FCB *filePtr, BTreeIterationOperation operation, BTreeIterator *iterator, - IterateCallBackProcPtr callBackProc, void * callBackState); - -extern OSStatus BTInsertRecord (FCB *filePtr, - BTreeIterator *iterator, - FSBufferDescriptor *btrecord, - u_int16_t recordLen ); - -extern OSStatus BTReplaceRecord (FCB *filePtr, - BTreeIterator *iterator, - FSBufferDescriptor *btRecord, - u_int16_t recordLen ); - -extern OSStatus BTUpdateRecord (FCB *filePtr, - 
BTreeIterator *iterator, - IterateCallBackProcPtr callBackProc, - void *callBackState ); - -extern OSStatus BTDeleteRecord (FCB *filePtr, - BTreeIterator *iterator ); - -extern OSStatus BTGetInformation (FCB *filePtr, - u_int16_t vers, - BTreeInfoRec *info ); - -extern OSStatus BTIsDirty(FCB *filePtr); - -extern OSStatus BTFlushPath (FCB *filePtr ); - -extern OSStatus BTReloadData (FCB *filePtr); - -extern OSStatus BTInvalidateHint (BTreeIterator *iterator ); - -extern OSStatus BTGetLastSync (FCB *filePtr, - u_int32_t *lastfsync ); - -extern OSStatus BTSetLastSync (FCB *filePtr, - u_int32_t lastfsync ); - -extern OSStatus BTHasContiguousNodes(FCB *filePtr); - -extern OSStatus BTGetUserData(FCB *filePtr, void * dataPtr, int dataSize); - -extern OSStatus BTSetUserData(FCB *filePtr, void * dataPtr, int dataSize); - -/* B-tree node reserve routines. */ -extern void BTReserveSetup(void); - -extern int BTReserveSpace(FCB *file, int operations, void * data); - -extern int BTReleaseReserve(FCB *file, void * data); - -extern int BTZeroUnusedNodes(FCB *file); - -#endif /* __APPLE_API_PRIVATE */ -#endif /* KERNEL */ -#endif // __BTREESINTERNAL__ diff --git a/bsd/hfs/hfscommon/headers/BTreesPrivate.h b/bsd/hfs/hfscommon/headers/BTreesPrivate.h deleted file mode 100644 index 07f06afb8..000000000 --- a/bsd/hfs/hfscommon/headers/BTreesPrivate.h +++ /dev/null @@ -1,516 +0,0 @@ -/* - * Copyright (c) 2000-2014 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - File: BTreesPrivate.h - - Contains: Private interface file for the BTree Module. - - Version: xxx put the technology version here xxx - - Written by: Gordon Sheridan and Bill Bruffey - - Copyright: � 1992-1999 by Apple Computer, Inc., all rights reserved. - - File Ownership: - - DRI: Don Brady - - Other Contact: Mark Day - - Technology: File Systems - - Writers: - - (msd) Mark Day - (DSH) Deric Horn - (djb) Don Brady - (ser) Scott Roberts - (dkh) Dave Heller - - Change History (most recent first): - 3/19/99 djb Disable MoveRecordsLeft/Right macros since bcopy is broken. - - 8/10/98 djb Removed unused BTreeIterator from BTreeControlBlock, fixed alignment. - - 9/4/97 djb Convert MoveRecordsLeft and GetLeftSiblingNode to macros. - 7/24/97 djb Add macro for GetRecordAddress (was a function before). - 7/21/97 msd GetRecordByIndex now returns an OSStatus. 
- 7/16/97 DSH FilesInternal.i renamed FileMgrInternal.i to avoid name - collision - 4/23/97 djb first checked in - - 3/17/97 DSH Added a refCon field to BTreeControlBlock, for DFA use, to point - to additional data. Fixed Panic macros for use with SC. - 2/19/97 djb Add InsertKey struct. Moved on-disk definitions to - HFSBTreesPriv.h - 1/27/97 djb InsertTree and DeleteTree are now recursive and support variable - sized index keys. - 1/15/97 djb Move GetFileRefNumFromFCB macro to FilesInternal.h. Added - kBTVariableIndexKeysMask. - 1/3/97 djb Added support for large keys. - 12/19/96 djb first checked in - - History applicable to original Scarecrow Design: - - <7> 10/25/96 ser Changing for new VFPI - <6> 10/18/96 ser Converting over VFPI changes - <5> 9/17/96 dkh More BTree statistics - <4> 9/16/96 dkh Revised BTree statistics - <3> 6/20/96 dkh Radar #1358740. Switch from using Pools to debug MemAllocators. - <2> 12/7/95 dkh D10E2 build. Changed usage of Ref data type to LogicalAddress. - <1> 10/18/95 rst Moved from Scarecrow project. - - <19> 11/22/94 djb Add prototype for GetMapNode - <18> 11/16/94 prp Add IsItAHint routine prototype. - <17> 9/30/94 prp Get in sync with D2 interface changes. - <16> 7/25/94 wjk Eliminate usage of BytePtr in favor of UInt8 *. - <15> 7/22/94 wjk Convert to the new set of header files. - <14> 5/31/94 srs Moved Btree types to public interface - <13> 12/9/93 wjk Add 68k alignment pragma's around persistent structures. - <12> 11/30/93 wjk Move from Makefiles to BuildFiles. Fit into the ModernOS and - NRCmds environments. - <11> 11/23/93 wjk Changes required to compile on the RS6000. - <10> 8/30/93 CH Removed the M_ExitOnError and M_ReturnErrorIf macros which were - already defined in FileSystemPriv.h (included here). - <9> 8/30/93 CH Added parens around the M_ReturnErrorIf macro. - <8> 5/21/93 gs Add kBadClose flag. Add some prototypes for internal routines. - <7> 5/10/93 gs Change Ptr to BytePtr. Move BTreeTypes to BTree.h. 
Add - DeleteTree prototype. - <6> 3/23/93 gs Remove mysterious "flags" field from HeaderRec structure. Move - prototypes of private functions to top of respective source - files. - <5> 2/8/93 gs Update to use FSAgent.h Get/Release/SetEOF/SetBlockSize - procPtrs. Add UpdateNode routine. - <4> 12/10/92 gs Add Key Descriptor function declarations. - <3> 12/8/92 gs Add HeaderRec structure and incorporate review feedback. - <2> 12/2/92 gs Add GetNode and ReleaseNode callback procptrs to BTree CB, and - add internal function declarations. - <1> 11/15/92 gs first checked in - -*/ - -#ifndef __BTREESPRIVATE__ -#define __BTREESPRIVATE__ - -#include - -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE - -#include "../../hfs_macos_defs.h" - -#ifndef __FILEMGRINTERNAL__ -#include "FileMgrInternal.h" -#endif - -#ifndef __BTREESINTERNAL__ -#include "BTreesInternal.h" -#endif - - -/////////////////////////////////// Constants /////////////////////////////////// - -#define kBTreeVersion 1 -#define kMaxTreeDepth 16 - - -#define kHeaderNodeNum 0 -#define kKeyDescRecord 1 - - -// Header Node Record Offsets -enum { - kHeaderRecOffset = 0x000E, - kKeyDescRecOffset = 0x0078, - kHeaderMapRecOffset = 0x00F8 -}; - -#define kMinNodeSize 512 - -#define kMinRecordSize 6 - // where is minimum record size enforced? 
- -// miscellaneous BTree constants -enum { - kOffsetSize = 2 -}; - -// Insert Operations -typedef enum { - kInsertRecord = 0, - kReplaceRecord = 1 -} InsertType; - -// illegal string attribute bits set in mask -#define kBadStrAttribMask 0xCF - - - -//////////////////////////////////// Macros ///////////////////////////////////// - -#define M_NodesInMap(mapSize) ((mapSize) << 3) - -#define M_ClearBitNum(integer,bitNumber) ((integer) &= (~(1<<(bitNumber)))) -#define M_SetBitNum(integer,bitNumber) ((integer) |= (1<<(bitNumber))) -#define M_IsOdd(integer) (((integer) & 1) != 0) -#define M_IsEven(integer) (((integer) & 1) == 0) - -#define M_MapRecordSize(nodeSize) (nodeSize - sizeof (BTNodeDescriptor) - 6) -#define M_HeaderMapRecordSize(nodeSize) (nodeSize - sizeof(BTNodeDescriptor) - sizeof(BTHeaderRec) - 128 - 8) - -#define M_SWAP_BE16_ClearBitNum(integer,bitNumber) ((integer) &= SWAP_BE16(~(1<<(bitNumber)))) -#define M_SWAP_BE16_SetBitNum(integer,bitNumber) ((integer) |= SWAP_BE16(1<<(bitNumber))) - -///////////////////////////////////// Types ///////////////////////////////////// - -typedef struct BTreeControlBlock { // fields specific to BTree CBs - - u_int8_t keyCompareType; /* Key string Comparison Type */ - u_int8_t btreeType; - u_int16_t treeDepth; - FileReference fileRefNum; // refNum of btree file - KeyCompareProcPtr keyCompareProc; - u_int32_t rootNode; - u_int32_t leafRecords; - u_int32_t firstLeafNode; - u_int32_t lastLeafNode; - u_int16_t nodeSize; - u_int16_t maxKeyLength; - u_int32_t totalNodes; - u_int32_t freeNodes; - - u_int16_t reserved3; // 4-byte alignment - - // new fields - int16_t version; - u_int32_t flags; // dynamic flags - u_int32_t attributes; // persistent flags - u_int32_t writeCount; - u_int32_t lastfsync; /* Last time that this was fsynced */ - - GetBlockProcPtr getBlockProc; - ReleaseBlockProcPtr releaseBlockProc; - SetEndOfForkProcPtr setEndOfForkProc; - - // statistical information - u_int32_t numGetNodes; - u_int32_t 
numGetNewNodes; - u_int32_t numReleaseNodes; - u_int32_t numUpdateNodes; - u_int32_t numMapNodesRead; // map nodes beyond header node - u_int32_t numHintChecks; - u_int32_t numPossibleHints; // Looks like a formated hint - u_int32_t numValidHints; // Hint used to find correct record. - u_int32_t reservedNodes; - BTreeIterator iterator; // useable when holding exclusive b-tree lock - -#if DEBUG - void *madeDirtyBy[2]; -#endif -} BTreeControlBlock, *BTreeControlBlockPtr; - -u_int32_t CalcKeySize(const BTreeControlBlock *btcb, const BTreeKey *key); -#define CalcKeySize(btcb, key) ( ((btcb)->attributes & kBTBigKeysMask) ? ((key)->length16 + 2) : ((key)->length8 + 1) ) - -u_int32_t KeyLength(const BTreeControlBlock *btcb, const BTreeKey *key); -#define KeyLength(btcb, key) ( ((btcb)->attributes & kBTBigKeysMask) ? (key)->length16 : (key)->length8 ) - - - -typedef enum { - kBTHeaderDirty = 0x00000001 -} BTreeFlags; - -static inline void M_BTreeHeaderDirty(BTreeControlBlock *bt) { -#if DEBUG - bt->madeDirtyBy[0] = __builtin_return_address(0); - bt->madeDirtyBy[1] = __builtin_return_address(1); -#endif - bt->flags |= kBTHeaderDirty; -} - -typedef int8_t *NodeBuffer; -typedef BlockDescriptor NodeRec, *NodePtr; //�� remove this someday... 
- - - - -//// Tree Path Table - constructed by SearchTree, used by InsertTree and DeleteTree - -typedef struct { - u_int32_t node; // node number - u_int16_t index; - u_int16_t reserved; // align size to a power of 2 -} TreePathRecord, *TreePathRecordPtr; - -typedef TreePathRecord TreePathTable [kMaxTreeDepth]; - - -//// InsertKey - used by InsertTree, InsertLevel and InsertNode - -struct InsertKey { - BTreeKeyPtr keyPtr; - u_int8_t * recPtr; - u_int16_t keyLength; - u_int16_t recSize; - Boolean replacingKey; - Boolean skipRotate; -}; - -typedef struct InsertKey InsertKey; - - -//// For Notational Convenience - -typedef BTNodeDescriptor* NodeDescPtr; -typedef u_int8_t *RecordPtr; -typedef BTreeKeyPtr KeyPtr; - - -//////////////////////////////////// Globals //////////////////////////////////// - - -//////////////////////////////////// Macros ///////////////////////////////////// - -#if DEBUG_BUILD - #define Panic( message ) DebugStr( message ) - #define PanicIf( condition, message ) do { if ( condition != 0 ) DebugStr( message ); } while(0) -#else - #define Panic( message ) do { } while(0) - #define PanicIf( condition, message ) do { } while(0) -#endif - -// Exit function on error -#define M_ExitOnError( result ) do { if ( ( result ) != noErr ) goto ErrorExit; } while(0) - -// Test for passed condition and return if true -#define M_ReturnErrorIf( condition, error ) do { if ( condition ) return( error ); } while(0) - -//////////////////////////////// Key Operations ///////////////////////////////// - -int32_t CompareKeys (BTreeControlBlockPtr btreePtr, - KeyPtr searchKey, - KeyPtr trialKey ); - -//////////////////////////////// Map Operations ///////////////////////////////// - -OSStatus AllocateNode (BTreeControlBlockPtr btreePtr, - u_int32_t *nodeNum); - -OSStatus FreeNode (BTreeControlBlockPtr btreePtr, - u_int32_t nodeNum); - -OSStatus ExtendBTree (BTreeControlBlockPtr btreePtr, - u_int32_t nodes ); - -u_int32_t CalcMapBits (BTreeControlBlockPtr btreePtr); - - 
-void BTUpdateReserve (BTreeControlBlockPtr btreePtr, - int nodes); - -//////////////////////////////// Misc Operations //////////////////////////////// - -u_int16_t CalcKeyRecordSize (u_int16_t keySize, - u_int16_t recSize ); - -OSStatus VerifyHeader (FCB *filePtr, - BTHeaderRec *header ); - -OSStatus UpdateHeader (BTreeControlBlockPtr btreePtr, - Boolean forceWrite ); - -OSStatus FindIteratorPosition (BTreeControlBlockPtr btreePtr, - BTreeIteratorPtr iterator, - BlockDescriptor *left, - BlockDescriptor *middle, - BlockDescriptor *right, - u_int32_t *nodeNum, - u_int16_t *index, - Boolean *foundRecord ); - -OSStatus CheckInsertParams (FCB *filePtr, - BTreeIterator *iterator, - FSBufferDescriptor *record, - u_int16_t recordLen ); - -OSStatus TrySimpleReplace (BTreeControlBlockPtr btreePtr, - NodeDescPtr nodePtr, - BTreeIterator *iterator, - FSBufferDescriptor *record, - u_int16_t recordLen, - Boolean *recordInserted ); - -OSStatus IsItAHint (BTreeControlBlockPtr btreePtr, - BTreeIterator *iterator, - Boolean *answer ); - -extern OSStatus TreeIsDirty(BTreeControlBlockPtr btreePtr); - -//////////////////////////////// Node Operations //////////////////////////////// - -//// Node Operations - -OSStatus GetNode (BTreeControlBlockPtr btreePtr, - u_int32_t nodeNum, - u_int32_t flags, - NodeRec *returnNodePtr ); - -/* Flags for GetNode() */ -#define kGetNodeHint 0x1 /* If set, the node is being looked up using a hint */ - -OSStatus GetLeftSiblingNode (BTreeControlBlockPtr btreePtr, - NodeDescPtr node, - NodeRec *left ); - -#define GetLeftSiblingNode(btree,node,left) GetNode ((btree), ((NodeDescPtr)(node))->bLink, 0, (left)) - -OSStatus GetRightSiblingNode (BTreeControlBlockPtr btreePtr, - NodeDescPtr node, - NodeRec *right ); - -#define GetRightSiblingNode(btree,node,right) GetNode ((btree), ((NodeDescPtr)(node))->fLink, 0, (right)) - - -OSStatus GetNewNode (BTreeControlBlockPtr btreePtr, - u_int32_t nodeNum, - NodeRec *returnNodePtr ); - -OSStatus ReleaseNode 
(BTreeControlBlockPtr btreePtr, - NodePtr nodePtr ); - -OSStatus TrashNode (BTreeControlBlockPtr btreePtr, - NodePtr nodePtr ); - -OSStatus UpdateNode (BTreeControlBlockPtr btreePtr, - NodePtr nodePtr, - u_int32_t transactionID, - u_int32_t flags ); - -//// Node Buffer Operations - -void ClearNode (BTreeControlBlockPtr btreePtr, - NodeDescPtr node ); - -u_int16_t GetNodeDataSize (BTreeControlBlockPtr btreePtr, - NodeDescPtr node ); - -u_int16_t GetNodeFreeSize (BTreeControlBlockPtr btreePtr, - NodeDescPtr node ); - - -//// Record Operations - -Boolean InsertRecord (BTreeControlBlockPtr btreePtr, - NodeDescPtr node, - u_int16_t index, - RecordPtr recPtr, - u_int16_t recSize ); - -Boolean InsertKeyRecord (BTreeControlBlockPtr btreePtr, - NodeDescPtr node, - u_int16_t index, - KeyPtr keyPtr, - u_int16_t keyLength, - RecordPtr recPtr, - u_int16_t recSize ); - -void DeleteRecord (BTreeControlBlockPtr btree, - NodeDescPtr node, - u_int16_t index ); - - -Boolean SearchNode (BTreeControlBlockPtr btree, - NodeDescPtr node, - KeyPtr searchKey, - u_int16_t *index ); - -OSStatus GetRecordByIndex (BTreeControlBlockPtr btree, - NodeDescPtr node, - u_int16_t index, - KeyPtr *keyPtr, - u_int8_t * *dataPtr, - u_int16_t *dataSize ); - -u_int8_t * GetRecordAddress (BTreeControlBlockPtr btree, - NodeDescPtr node, - u_int16_t index ); - -#define GetRecordAddress(btreePtr,node,index) ((u_int8_t *)(node) + (*(short *) ((u_int8_t *)(node) + (btreePtr)->nodeSize - ((index) << 1) - kOffsetSize))) - - -u_int16_t GetRecordSize (BTreeControlBlockPtr btree, - NodeDescPtr node, - u_int16_t index ); - -u_int32_t GetChildNodeNum (BTreeControlBlockPtr btreePtr, - NodeDescPtr nodePtr, - u_int16_t index ); - -void MoveRecordsLeft (u_int8_t * src, - u_int8_t * dst, - u_int16_t bytesToMove ); - -#define MoveRecordsLeft(src,dst,bytes) bcopy((src),(dst),(bytes)) - -void MoveRecordsRight (u_int8_t * src, - u_int8_t * dst, - u_int16_t bytesToMove ); - -#define MoveRecordsRight(src,dst,bytes) 
bcopy((src),(dst),(bytes)) - - -//////////////////////////////// Tree Operations //////////////////////////////// - -OSStatus SearchTree (BTreeControlBlockPtr btreePtr, - BTreeKeyPtr keyPtr, - TreePathTable treePathTable, - u_int32_t *nodeNum, - BlockDescriptor *nodePtr, - u_int16_t *index ); - -OSStatus InsertTree (BTreeControlBlockPtr btreePtr, - TreePathTable treePathTable, - KeyPtr keyPtr, - u_int8_t * recPtr, - u_int16_t recSize, - BlockDescriptor *targetNode, - u_int16_t index, - u_int16_t level, - Boolean replacingKey, - u_int32_t *insertNode ); - -OSStatus DeleteTree (BTreeControlBlockPtr btreePtr, - TreePathTable treePathTable, - BlockDescriptor *targetNode, - u_int16_t index, - u_int16_t level ); - -#endif /* __APPLE_API_PRIVATE */ -#endif /* KERNEL */ -#endif //__BTREESPRIVATE__ diff --git a/bsd/hfs/hfscommon/headers/CatalogPrivate.h b/bsd/hfs/hfscommon/headers/CatalogPrivate.h deleted file mode 100644 index bd3f00ddd..000000000 --- a/bsd/hfs/hfscommon/headers/CatalogPrivate.h +++ /dev/null @@ -1,129 +0,0 @@ -/* - * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - File: CatalogPrivate.h - - Contains: Private Catalog Manager interfaces. - - Version: HFS Plus 1.0 - - Copyright: � 1997-1998 by Apple Computer, Inc., all rights reserved. - - File Ownership: - - DRI: Don Brady - - Other Contact: xxx put other contact here xxx - - Technology: xxx put technology here xxx - - Writers: - - (JL) Jim Luther - (msd) Mark Day - (DSH) Deric Horn - (djb) Don Brady - - Change History (most recent first): - 11/10/98 djb Remove obsolete PrepareInputName prototype; - 4/6/98 djb Added lock data stuctures and ReleaseCatalogIterator prototype; - 4/6/98 djb Removed CatalogDataCache since its no longer used. - 4/2/98 djb InvalidateCatalogNodeCache does nothing under MacOS X. - 3/31/98 djb Sync up with final HFSVolumes.h header file. - - 11/20/97 djb Radar #2002357. Fixing retry mechanism. - 11/17/97 djb PrepareInputName routine now returns an error. - 11/13/97 djb Radar #1683572. Move CatalogIterator to this file from - FileMgrInternal.i. Double size of short unicode name. - 10/31/97 JL #2000184 - Changed prototypes for CreateFileThreadID and - ExchangeFiles. - 10/17/97 msd In CatalogCacheGlobals, add room for a single UniStr255 so - catalog iterators can step over long Unicode names. - 10/17/97 djb Add ConvertInputNameToUnicode for Catalog Create/Rename. - 10/1/97 djb Change catalog iterator implementation. 
- 7/16/97 DSH FilesInternal.i renamed FileMgrInternal.i to avoid name - collision - 6/24/97 djb Add LocateCatalogNodeByMangledName routine. - 6/24/97 djb first checked in -*/ - -#ifndef __CATALOGPRIVATE__ -#define __CATALOGPRIVATE__ - -#include - -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE - -#include "../../hfs_format.h" - -#include "FileMgrInternal.h" -#include "BTreesInternal.h" - -// -// Private Catalog Manager Routines (for use only by Catalog Manager, CatSearch and FileID Services) -// - - -extern OSErr LocateCatalogNodeByKey ( const ExtendedVCB *volume, u_int32_t hint, CatalogKey *keyPtr, - CatalogRecord *dataPtr, u_int32_t *newHint ); - -extern OSErr LocateCatalogRecord( const ExtendedVCB *volume, HFSCatalogNodeID folderID, const CatalogName *name, - u_int32_t hint, CatalogKey *keyPtr, CatalogRecord *dataPtr, u_int32_t *newHint); - -extern OSErr LocateCatalogNodeWithRetry ( const ExtendedVCB *volume, HFSCatalogNodeID folderID, ConstStr31Param pascalName, - CatalogName *unicodeName, u_int32_t hint, CatalogKey *keyPtr, CatalogRecord *dataPtr, - u_int32_t *newHint ); -extern OSErr FlushCatalog( ExtendedVCB *volume); - - -extern void ConvertInputNameToUnicode(ConstStr31Param name, TextEncoding encodingHint, - TextEncoding *actualEncoding, CatalogName *catalogName); - -extern void BuildCatalogKey( HFSCatalogNodeID parentID, const CatalogName *name, Boolean isHFSPlus, - CatalogKey *key); - -extern OSErr BuildCatalogKeyUTF8(ExtendedVCB *volume, HFSCatalogNodeID parentID, const unsigned char *name, - u_int32_t length, CatalogKey *key, u_int32_t *textEncoding); - -extern void CopyCatalogName( const CatalogName *srcName, CatalogName *dstName, Boolean isHFSPLus); - -extern OSErr ResolveFileID( ExtendedVCB *vcb, HFSCatalogNodeID fileID, HFSCatalogNodeID *parentID, Str31 name ); - -#if 0 -extern OSErr CreateFileThreadID( FIDParam *filePB, WDCBRecPtr *wdcbPtr ); - -extern OSErr ExchangeFiles( FIDParam *filePB, WDCBRecPtr *wdcbPtr ); -#endif - -extern void 
UpdateCatalogName( ConstStr31Param srcName, Str31 destName ); - - -#endif /* __APPLE_API_PRIVATE */ -#endif /* KERNEL */ -#endif //__CATALOGPRIVATE__ diff --git a/bsd/hfs/hfscommon/headers/FileMgrInternal.h b/bsd/hfs/hfscommon/headers/FileMgrInternal.h deleted file mode 100644 index 20d38dd93..000000000 --- a/bsd/hfs/hfscommon/headers/FileMgrInternal.h +++ /dev/null @@ -1,397 +0,0 @@ -/* - * Copyright (c) 2000-2014 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - File: FilesInternal.h - - Contains: IPI for File Manager (HFS Plus) - - Version: HFS Plus 1.0 - - Copyright: � 1996-2001 by Apple Computer, Inc., all rights reserved. 
- -*/ -#ifndef __FILEMGRINTERNAL__ -#define __FILEMGRINTERNAL__ - -#include - -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE - -#include -#include - -#if !HFS_ALLOC_TEST - -#include "../../hfs.h" -#include "../../hfs_macos_defs.h" -#include "../../hfs_format.h" -#include "../../hfs_cnode.h" - -#endif - -#ifdef __cplusplus -extern "C" { -#endif - -/* CatalogNodeID is used to track catalog objects */ -typedef u_int32_t HFSCatalogNodeID; - -/* internal error codes*/ - -#if TARGET_API_MACOS_X - #define ERR_BASE -32767 -#else - #define ERR_BASE 0 -#endif - -enum { - /* FXM errors*/ - fxRangeErr = ERR_BASE + 16, /* file position beyond mapped range*/ - fxOvFlErr = ERR_BASE + 17, /* extents file overflow*/ - /* Unicode errors*/ - uniTooLongErr = ERR_BASE + 24, /* Unicode string too long to convert to Str31*/ - uniBufferTooSmallErr = ERR_BASE + 25, /* Unicode output buffer too small*/ - uniNotMappableErr = ERR_BASE + 26, /* Unicode string can't be mapped to given script*/ - /* BTree Manager errors*/ - btNotFound = ERR_BASE + 32, /* record not found*/ - btExists = ERR_BASE + 33, /* record already exists*/ - btNoSpaceAvail = ERR_BASE + 34, /* no available space*/ - btNoFit = ERR_BASE + 35, /* record doesn't fit in node */ - btBadNode = ERR_BASE + 36, /* bad node detected*/ - btBadHdr = ERR_BASE + 37, /* bad BTree header record detected*/ - dsBadRotate = ERR_BASE + 64, /* bad BTree rotate*/ - /* Catalog Manager errors*/ - cmNotFound = ERR_BASE + 48, /* CNode not found*/ - cmExists = ERR_BASE + 49, /* CNode already exists*/ - cmNotEmpty = ERR_BASE + 50, /* directory CNode not empty (valence = 0)*/ - cmRootCN = ERR_BASE + 51, /* invalid reference to root CNode*/ - cmBadNews = ERR_BASE + 52, /* detected bad catalog structure*/ - cmFThdDirErr = ERR_BASE + 53, /* thread belongs to a directory not a file*/ - cmFThdGone = ERR_BASE + 54, /* file thread doesn't exist*/ - cmParentNotFound = ERR_BASE + 55, /* CNode for parent ID does not exist*/ - /* TFS internal errors*/ - fsDSIntErr = 
-127 /* Internal file system error*/ -}; - - -/* internal flags*/ - -enum { - kEFAllMask = 0x01, /* allocate all requested bytes or none */ - kEFContigMask = 0x02, /* force contiguous allocation */ - kEFReserveMask = 0x04, /* keep block reserve */ - kEFDeferMask = 0x08, /* defer file block allocations */ - kEFNoClumpMask = 0x10, /* don't round up to clump size */ - kEFMetadataMask = 0x20, /* metadata allocation */ - - kTFTrunExtBit = 0, /* truncate to the extent containing new PEOF*/ - kTFTrunExtMask = 1 -}; - -enum { - kUndefinedStrLen = 0, /* Unknown string length */ - kNoHint = 0, - - /* FileIDs variables*/ - kNumExtentsToCache = 4 /* just guessing for ExchangeFiles*/ -}; - - -/* Universal Extent Key */ - -union ExtentKey { - HFSExtentKey hfs; - HFSPlusExtentKey hfsPlus; -}; -typedef union ExtentKey ExtentKey; -/* Universal extent descriptor */ - -union ExtentDescriptor { - HFSExtentDescriptor hfs; - HFSPlusExtentDescriptor hfsPlus; -}; -typedef union ExtentDescriptor ExtentDescriptor; -/* Universal extent record */ - -union ExtentRecord { - HFSExtentRecord hfs; - HFSPlusExtentRecord hfsPlus; -}; -typedef union ExtentRecord ExtentRecord; - - -enum { - CMMaxCName = kHFSMaxFileNameChars -}; - - - -/* Universal catalog name*/ - -union CatalogName { - Str31 pstr; - HFSUniStr255 ustr; -}; -typedef union CatalogName CatalogName; - - -/* - * MacOS accessor routines - */ -#define GetFileControlBlock(fref) VTOF((fref)) -#define GetFileRefNumFromFCB(fcb) FTOV((fcb)) - -/* Test for error and return if error occurred*/ -EXTERN_API_C( void ) -ReturnIfError (OSErr result); - -#define ReturnIfError(result) do { if ( (result) != noErr ) return (result); } while(0) - -/* Exit function on error*/ -EXTERN_API_C( void ) -ExitOnError (OSErr result); - -#define ExitOnError( result ) do { if ( ( result ) != noErr ) goto ErrorExit; } while(0) - - - -/* Catalog Manager Routines (IPI)*/ - -EXTERN_API_C( OSErr ) -ExchangeFileIDs (ExtendedVCB * volume, - ConstUTF8Param srcName, - 
ConstUTF8Param destName, - HFSCatalogNodeID srcID, - HFSCatalogNodeID destID, - u_int32_t srcHint, - u_int32_t destHint ); - -EXTERN_API_C( OSErr ) -MoveData( ExtendedVCB *vcb, HFSCatalogNodeID srcID, HFSCatalogNodeID destID, int rsrc); - -/* BTree Manager Routines*/ - -typedef CALLBACK_API_C( int32_t , KeyCompareProcPtr )(void *a, void *b); - - -EXTERN_API_C( OSErr ) -ReplaceBTreeRecord (FileReference refNum, - const void * key, - u_int32_t hint, - void * newData, - u_int16_t dataSize, - u_int32_t * newHint); - - -/* Prototypes for exported routines in VolumeAllocation.c*/ - -/* - * Flags for BlockAllocate(), BlockDeallocate() and hfs_block_alloc. - * Some of these are for internal use only. See the comment at the - * top of hfs_alloc_int for more details on the semantics of these - * flags. - */ -#define HFS_ALLOC_FORCECONTIG 0x001 //force contiguous block allocation; minblocks must be allocated -#define HFS_ALLOC_METAZONE 0x002 //can use metazone blocks -#define HFS_ALLOC_SKIPFREEBLKS 0x004 //skip checking/updating freeblocks during alloc/dealloc -#define HFS_ALLOC_FLUSHTXN 0x008 //pick best fit for allocation, even if a jnl flush is req'd -#define HFS_ALLOC_TENTATIVE 0x010 //reserved allocation that can be claimed back -#define HFS_ALLOC_LOCKED 0x020 //reserved allocation that can't be claimed back -#define HFS_ALLOC_IGNORE_TENTATIVE 0x040 //Steal tentative blocks if necessary -#define HFS_ALLOC_IGNORE_RESERVED 0x080 //Ignore tentative/committed blocks -#define HFS_ALLOC_USE_TENTATIVE 0x100 //Use the supplied tentative range (if possible) -#define HFS_ALLOC_COMMIT 0x200 //Commit the supplied extent to disk -#define HFS_ALLOC_TRY_HARD 0x400 //Search hard to try and get maxBlocks; implies HFS_ALLOC_FLUSHTXN -#define HFS_ALLOC_ROLL_BACK 0x800 //Reallocate blocks that were just deallocated -#define HFS_ALLOC_FAST_DEV 0x1000 //Prefer fast device for allocation - -typedef uint32_t hfs_block_alloc_flags_t; - -struct rl_entry; -EXTERN_API_C( OSErr ) -BlockAllocate 
(ExtendedVCB * vcb, - u_int32_t startingBlock, - u_int32_t minBlocks, - u_int32_t maxBlocks, - hfs_block_alloc_flags_t flags, - u_int32_t * startBlock, - u_int32_t * actualBlocks); - -typedef struct hfs_alloc_extra_args { - // Used with HFS_ALLOC_TRY_HARD and HFS_ALLOC_FORCECONTIG - uint32_t max_blocks; - - // Used with with HFS_ALLOC_USE_TENTATIVE & HFS_ALLOC_COMMIT - struct rl_entry **reservation_in; - - // Used with HFS_ALLOC_TENTATIVE & HFS_ALLOC_LOCKED - struct rl_entry **reservation_out; - - /* - * If the maximum cannot be returned, the allocation will be - * trimmed to the specified alignment after taking - * @alignment_offset into account. @alignment and - * @alignment_offset are both in terms of blocks, *not* bytes. - * The result will be such that: - * - * (block_count + @alignment_offset) % @alignment == 0 - * - * Alignment is *not* guaranteed. - * - * One example where alignment might be useful is in the case - * where the page size is greater than the allocation block size - * and I/O is being performed in multiples of the page size. - */ - int alignment; - int alignment_offset; -} hfs_alloc_extra_args_t; - -/* - * Same as BlockAllocate but slightly different API. - * @extent.startBlock is a hint for where to start searching and - * @extent.blockCount is the minimum number of blocks acceptable. - * Additional arguments can be passed in @extra_args and use will - * depend on @flags. See comment at top of hfs_block_alloc_int for - * more information. 
- */ -errno_t hfs_block_alloc(hfsmount_t *hfsmp, - HFSPlusExtentDescriptor *extent, - hfs_block_alloc_flags_t flags, - hfs_alloc_extra_args_t *extra_args); - -EXTERN_API_C( OSErr ) -BlockDeallocate (ExtendedVCB * vcb, - u_int32_t firstBlock, - u_int32_t numBlocks, - hfs_block_alloc_flags_t flags); - -EXTERN_API_C ( void ) -ResetVCBFreeExtCache(struct hfsmount *hfsmp); - -EXTERN_API_C( OSErr ) -BlockMarkAllocated(ExtendedVCB *vcb, u_int32_t startingBlock, u_int32_t numBlocks); - -EXTERN_API_C( OSErr ) -BlockMarkFree( ExtendedVCB *vcb, u_int32_t startingBlock, u_int32_t numBlocks); - -EXTERN_API_C( OSErr ) -BlockMarkFreeUnused( ExtendedVCB *vcb, u_int32_t startingBlock, u_int32_t numBlocks); - -EXTERN_API_C( u_int32_t ) -MetaZoneFreeBlocks(ExtendedVCB *vcb); - -EXTERN_API_C( u_int32_t ) -UpdateAllocLimit (struct hfsmount *hfsmp, u_int32_t new_end_block); - -EXTERN_API_C( u_int32_t ) -ScanUnmapBlocks(struct hfsmount *hfsmp); - -EXTERN_API_C( int ) -hfs_init_summary (struct hfsmount *hfsmp); - -errno_t hfs_find_free_extents(struct hfsmount *hfsmp, - void (*callback)(void *data, off_t), void *callback_arg); - -void hfs_free_tentative(hfsmount_t *hfsmp, struct rl_entry **reservation); -void hfs_free_locked(hfsmount_t *hfsmp, struct rl_entry **reservation); - -/* File Extent Mapping routines*/ -EXTERN_API_C( OSErr ) -FlushExtentFile (ExtendedVCB * vcb); - -#if CONFIG_HFS_STD -EXTERN_API_C( int32_t ) -CompareExtentKeys (const HFSExtentKey * searchKey, - const HFSExtentKey * trialKey); -#endif - -EXTERN_API_C( int32_t ) -CompareExtentKeysPlus (const HFSPlusExtentKey *searchKey, - const HFSPlusExtentKey *trialKey); - -OSErr SearchExtentFile(ExtendedVCB *vcb, - const FCB *fcb, - int64_t filePosition, - HFSPlusExtentKey *foundExtentKey, - HFSPlusExtentRecord foundExtentData, - u_int32_t *foundExtentDataIndex, - u_int32_t *extentBTreeHint, - u_int32_t *endingFABNPlusOne ); - -EXTERN_API_C( OSErr ) -TruncateFileC (ExtendedVCB *vcb, FCB *fcb, int64_t peof, int deleted, - int 
rsrc, uint32_t fileid, Boolean truncateToExtent); - -EXTERN_API_C( OSErr ) -ExtendFileC (ExtendedVCB * vcb, - FCB * fcb, - int64_t bytesToAdd, - u_int32_t blockHint, - u_int32_t flags, - int64_t * actualBytesAdded); - -EXTERN_API_C( OSErr ) -MapFileBlockC (ExtendedVCB * vcb, - FCB * fcb, - size_t numberOfBytes, - off_t offset, - daddr64_t * startBlock, - size_t * availableBytes); - -OSErr HeadTruncateFile(ExtendedVCB *vcb, FCB *fcb, u_int32_t headblks); - -EXTERN_API_C( int ) -AddFileExtent (ExtendedVCB *vcb, FCB *fcb, u_int32_t startBlock, u_int32_t blockCount); - -#if TARGET_API_MACOS_X -EXTERN_API_C( Boolean ) -NodesAreContiguous (ExtendedVCB * vcb, - FCB * fcb, - u_int32_t nodeSize); -#endif - -/* Get the current time in UTC (GMT)*/ -EXTERN_API_C( u_int32_t ) -GetTimeUTC (void); - -EXTERN_API_C( u_int32_t ) -LocalToUTC (u_int32_t localTime); - -EXTERN_API_C( u_int32_t ) -UTCToLocal (u_int32_t utcTime); - - -#ifdef __cplusplus -} -#endif - -#endif /* __APPLE_API_PRIVATE */ -#endif /* KERNEL */ -#endif /* __FILEMGRINTERNAL__ */ - diff --git a/bsd/hfs/hfscommon/headers/HFSUnicodeWrappers.h b/bsd/hfs/hfscommon/headers/HFSUnicodeWrappers.h deleted file mode 100644 index 50ae8d87e..000000000 --- a/bsd/hfs/hfscommon/headers/HFSUnicodeWrappers.h +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright (c) 2000-2003, 2005-2013 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - File: HFSUnicodeWrappers.h - - Contains: IPI to Unicode routines used by File Manager. - - Version: HFS Plus 1.0 - - Written by: Mark Day - - Copyright: � 1996-1997 by Apple Computer, Inc., all rights reserved. - - File Ownership: - - DRI: xxx put dri here xxx - - Other Contact: xxx put other contact here xxx - - Technology: xxx put technology here xxx - - Writers: - - (DSH) Deric Horn - (msd) Mark Day - (djb) Don Brady - - Change History (most recent first): - - 11/16/97 djb Change Unicode.h to UnicodeConverter.h. - 11/7/97 msd Remove prototype for CompareUnicodeNames(). Add prototype for - FastUnicodeCompare(). - 10/13/97 djb Add encoding/index macros and add prototypes for new Get/Set - encodding routines. - 9/15/97 djb InitUnicodeConverter now takes a boolean. - 9/10/97 msd Add prototype for InitializeEncodingContext. - 6/26/97 DSH Include "MockConverter" prototype for DFA usage. 
- 6/25/97 DSH Removed Prototype definitions, and checked in Unicode.h and - TextCommon.h from Julio Gonzales into InternalInterfaces. - 6/25/97 msd Add prototypes for some new Unicode routines that haven't - appeared in MasterInterfaces yet. - 6/18/97 djb Add more ConversionContexts routines. - 6/13/97 djb Switched to ConvertUnicodeToHFSName, ConvertHFSNameToUnicode, & - CompareUnicodeNames. - 4/28/97 djb first checked in - 12/12/96 msd first checked in - -*/ -#ifndef _HFSUNICODEWRAPPERS_ -#define _HFSUNICODEWRAPPERS_ - -#include - -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE - -#include "../../hfs_macos_defs.h" -#include "../../hfs_format.h" - - -extern OSErr ConvertUnicodeToUTF8Mangled ( ByteCount srcLen, - ConstUniCharArrayPtr srcStr, - ByteCount maxDstLen, - ByteCount *actualDstLen, - unsigned char* dstStr , - HFSCatalogNodeID cnid); - -/* - This routine compares two Unicode names based on an ordering defined by the HFS Plus B-tree. - This ordering must stay fixed for all time. - - Output: - -n name1 < name2 (i.e. name 1 sorts before name 2) - 0 name1 = name2 - +n name1 > name2 - - NOTE: You should not depend on the magnitude of the result, just its sign. That is, when name1 < name2, then any - negative number may be returned. 
-*/ - -extern int32_t FastUnicodeCompare(register ConstUniCharArrayPtr str1, register ItemCount length1, - register ConstUniCharArrayPtr str2, register ItemCount length2); - -extern int32_t UnicodeBinaryCompare (register ConstUniCharArrayPtr str1, register ItemCount length1, - register ConstUniCharArrayPtr str2, register ItemCount length2); - -extern int32_t FastRelString( ConstStr255Param str1, ConstStr255Param str2 ); - - -extern HFSCatalogNodeID GetEmbeddedFileID( ConstStr31Param filename, u_int32_t length, u_int32_t *prefixLength ); -extern u_int32_t CountFilenameExtensionChars( const unsigned char * filename, u_int32_t length ); - -#endif /* __APPLE_API_PRIVATE */ -#endif /* KERNEL */ -#endif /* _HFSUNICODEWRAPPERS_ */ diff --git a/bsd/hfs/rangelist.c b/bsd/hfs/rangelist.c deleted file mode 100644 index 81b384c48..000000000 --- a/bsd/hfs/rangelist.c +++ /dev/null @@ -1,470 +0,0 @@ -/* - * Copyright (c) 2001-2014 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#if HFS - -#include -#include -#include -#include - -#if !RANGELIST_TEST -#include -#endif - -#include "rangelist.h" - -static enum rl_overlaptype rl_scan_from(struct rl_head *rangelist, off_t start, off_t end, struct rl_entry **overlap, struct rl_entry *range); -static void rl_collapse_forwards(struct rl_head *rangelist, struct rl_entry *range); -static void rl_collapse_backwards(struct rl_head *rangelist, struct rl_entry *range); -static void rl_collapse_neighbors(struct rl_head *rangelist, struct rl_entry *range); - - -#ifdef RL_DIAGNOSTIC -static void -rl_verify(struct rl_head *rangelist) { - struct rl_entry *entry; - struct rl_entry *next; - off_t limit = 0; - - TAILQ_FOREACH_SAFE(rangelist, entry, rl_link, next) { - if ((limit > 0) && (entry->rl_start <= limit)) panic("hfs: rl_verify: bad entry start?!"); - if (entry->rl_end < entry->rl_start) panic("hfs: rl_verify: bad entry end?!"); - limit = entry->rl_end; - }; -} -#endif - - - -/* - * Initialize a range list head - */ -void -rl_init(struct rl_head *rangelist) -{ - TAILQ_INIT(rangelist); -} - -/* - * Add a range to the list - */ -void -rl_add(off_t start, off_t end, struct rl_head *rangelist) -{ - struct rl_entry *range; - struct rl_entry *overlap; - enum rl_overlaptype ovcase; - -#ifdef RL_DIAGNOSTIC - if (end < start) panic("hfs: rl_add: end < start?!"); -#endif - - ovcase = rl_scan(rangelist, start, end, &overlap); - - /* - * Six cases: - * 0) no overlap - * 1) overlap == range - * 
2) overlap contains range - * 3) range contains overlap - * 4) overlap starts before range - * 5) overlap ends after range - */ - switch (ovcase) { - case RL_NOOVERLAP: /* 0: no overlap */ - /* - * overlap points to the entry we should insert before, or - * if NULL, we should insert at the end. - */ - MALLOC(range, struct rl_entry *, sizeof(*range), M_TEMP, M_WAITOK); - range->rl_start = start; - range->rl_end = end; - - /* Link in the new range: */ - if (overlap) { - TAILQ_INSERT_BEFORE(overlap, range, rl_link); - } else { - TAILQ_INSERT_TAIL(rangelist, range, rl_link); - } - - /* Check to see if any ranges can be combined (possibly including the immediately - preceding range entry) - */ - rl_collapse_neighbors(rangelist, range); - break; - - case RL_MATCHINGOVERLAP: /* 1: overlap == range */ - case RL_OVERLAPCONTAINSRANGE: /* 2: overlap contains range */ - range = overlap; /* for debug output below */ - break; - - case RL_OVERLAPISCONTAINED: /* 3: range contains overlap */ - /* - * Replace the overlap with the new, larger range: - */ - overlap->rl_start = start; - overlap->rl_end = end; - rl_collapse_neighbors(rangelist, overlap); - range = overlap; /* for debug output below */ - break; - - case RL_OVERLAPSTARTSBEFORE: /* 4: overlap starts before range */ - /* - * Expand the overlap area to cover the new range: - */ - overlap->rl_end = end; - rl_collapse_forwards(rangelist, overlap); - range = overlap; /* for debug output below */ - break; - - case RL_OVERLAPENDSAFTER: /* 5: overlap ends after range */ - /* - * Expand the overlap area to cover the new range: - */ - overlap->rl_start = start; - rl_collapse_backwards(rangelist, overlap); - range = overlap; /* for debug output below */ - break; - } - -#ifdef RL_DIAGNOSTIC - rl_verify(rangelist); -#endif -} - - - -/* - * Remove a range from a range list. - * - * Generally, find the range (or an overlap to that range) - * and remove it (or shrink it), then wakeup anyone we can. 
- */ -void -rl_remove(off_t start, off_t end, struct rl_head *rangelist) -{ - struct rl_entry *range, *next_range, *overlap, *splitrange; - int ovcase; - -#ifdef RL_DIAGNOSTIC - if (end < start) panic("hfs: rl_remove: end < start?!"); -#endif - - if (TAILQ_EMPTY(rangelist)) { - return; - }; - - range = TAILQ_FIRST(rangelist); - while ((ovcase = rl_scan_from(rangelist, start, end, &overlap, range))) { - switch (ovcase) { - - case RL_MATCHINGOVERLAP: /* 1: overlap == range */ - TAILQ_REMOVE(rangelist, overlap, rl_link); - FREE(overlap, M_TEMP); - break; - - case RL_OVERLAPCONTAINSRANGE: /* 2: overlap contains range: split it */ - if (overlap->rl_start == start) { - overlap->rl_start = end + 1; - break; - }; - - if (overlap->rl_end == end) { - overlap->rl_end = start - 1; - break; - }; - - /* - * Make a new range consisting of the last part of the encompassing range - */ - MALLOC(splitrange, struct rl_entry *, sizeof *splitrange, M_TEMP, M_WAITOK); - splitrange->rl_start = end + 1; - splitrange->rl_end = overlap->rl_end; - overlap->rl_end = start - 1; - - /* - * Now link the new entry into the range list after the range from which it was split: - */ - TAILQ_INSERT_AFTER(rangelist, overlap, splitrange, rl_link); - break; - - case RL_OVERLAPISCONTAINED: /* 3: range contains overlap */ - /* Check before discarding overlap entry */ - next_range = TAILQ_NEXT(overlap, rl_link); - TAILQ_REMOVE(rangelist, overlap, rl_link); - FREE(overlap, M_TEMP); - if (next_range) { - range = next_range; - continue; - }; - break; - - case RL_OVERLAPSTARTSBEFORE: /* 4: overlap starts before range */ - overlap->rl_end = start - 1; - range = TAILQ_NEXT(overlap, rl_link); - if (range) { - continue; - } - break; - - case RL_OVERLAPENDSAFTER: /* 5: overlap ends after range */ - overlap->rl_start = (end == RL_INFINITY ? 
RL_INFINITY : end + 1); - break; - } - break; - } - -#ifdef RL_DIAGNOSTIC - rl_verify(rangelist); -#endif -} - - - -/* - * Scan a range list for an entry in a specified range (if any): - * - * NOTE: this returns only the FIRST overlapping range. - * There may be more than one. - */ - -enum rl_overlaptype -rl_scan(struct rl_head *rangelist, - off_t start, - off_t end, - struct rl_entry **overlap) { - - return rl_scan_from(rangelist, start, end, overlap, TAILQ_FIRST(rangelist)); -} - -enum rl_overlaptype -rl_overlap(const struct rl_entry *range, off_t start, off_t end) -{ - /* - * OK, check for overlap - * - * Six cases: - * 0) no overlap (RL_NOOVERLAP) - * 1) overlap == range (RL_MATCHINGOVERLAP) - * 2) overlap contains range (RL_OVERLAPCONTAINSRANGE) - * 3) range contains overlap (RL_OVERLAPISCONTAINED) - * 4) overlap starts before range (RL_OVERLAPSTARTSBEFORE) - * 5) overlap ends after range (RL_OVERLAPENDSAFTER) - */ - if (start > range->rl_end || range->rl_start > end) { - /* Case 0 (RL_NOOVERLAP) */ - return RL_NOOVERLAP; - } - - if (range->rl_start == start && range->rl_end == end) { - /* Case 1 (RL_MATCHINGOVERLAP) */ - return RL_MATCHINGOVERLAP; - } - - if (range->rl_start <= start && range->rl_end >= end) { - /* Case 2 (RL_OVERLAPCONTAINSRANGE) */ - return RL_OVERLAPCONTAINSRANGE; - } - - if (start <= range->rl_start && end >= range->rl_end) { - /* Case 3 (RL_OVERLAPISCONTAINED) */ - return RL_OVERLAPISCONTAINED; - } - - if (range->rl_start < start && range->rl_end < end) { - /* Case 4 (RL_OVERLAPSTARTSBEFORE) */ - return RL_OVERLAPSTARTSBEFORE; - } - - /* Case 5 (RL_OVERLAPENDSAFTER) */ - // range->rl_start > start && range->rl_end > end - return RL_OVERLAPENDSAFTER; -} - -/* - * Walk the list of ranges for an entry to - * find an overlapping range (if any). - * - * NOTE: this returns only the FIRST overlapping range. - * There may be more than one. 
- */ -static enum rl_overlaptype -rl_scan_from(struct rl_head *rangelist __unused, - off_t start, - off_t end, - struct rl_entry **overlap, - struct rl_entry *range) -{ -#ifdef RL_DIAGNOSTIC - rl_verify(rangelist); -#endif - - while (range) { - enum rl_overlaptype ot = rl_overlap(range, start, end); - - if (ot != RL_NOOVERLAP || range->rl_start > end) { - *overlap = range; - return ot; - } - - range = TAILQ_NEXT(range, rl_link); - } - - *overlap = NULL; - return RL_NOOVERLAP; -} - - -static void -rl_collapse_forwards(struct rl_head *rangelist, struct rl_entry *range) { - struct rl_entry *next_range; - - while ((next_range = TAILQ_NEXT(range, rl_link))) { - if ((range->rl_end != RL_INFINITY) && (range->rl_end < next_range->rl_start - 1)) return; - - /* Expand this range to include the next range: */ - range->rl_end = next_range->rl_end; - - /* Remove the now covered range from the list: */ - TAILQ_REMOVE(rangelist, next_range, rl_link); - FREE(next_range, M_TEMP); - -#ifdef RL_DIAGNOSTIC - rl_verify(rangelist); -#endif - }; -} - - - -static void -rl_collapse_backwards(struct rl_head *rangelist, struct rl_entry *range) { - struct rl_entry *prev_range; - - while ((prev_range = TAILQ_PREV(range, rl_head, rl_link))) { - if (prev_range->rl_end < range->rl_start -1) { -#ifdef RL_DIAGNOSTIC - rl_verify(rangelist); -#endif - return; - }; - - /* Expand this range to include the previous range: */ - range->rl_start = prev_range->rl_start; - - /* Remove the now covered range from the list: */ - TAILQ_REMOVE(rangelist, prev_range, rl_link); - FREE(prev_range, M_TEMP); - }; -} - - - -static void -rl_collapse_neighbors(struct rl_head *rangelist, struct rl_entry *range) -{ - rl_collapse_forwards(rangelist, range); - rl_collapse_backwards(rangelist, range); -} - -void rl_remove_all(struct rl_head *rangelist) -{ - struct rl_entry *r, *nextr; - TAILQ_FOREACH_SAFE(r, rangelist, rl_link, nextr) - FREE(r, M_TEMP); - TAILQ_INIT(rangelist); -} - -/* - * In the case where b is contained by 
a, we return the the largest part - * remaining. The result is stored in a. - */ -void rl_subtract(struct rl_entry *a, const struct rl_entry *b) -{ - switch (rl_overlap(b, a->rl_start, a->rl_end)) { - case RL_MATCHINGOVERLAP: - case RL_OVERLAPCONTAINSRANGE: - a->rl_end = a->rl_start - 1; - break; - case RL_OVERLAPISCONTAINED: - // Keep the bigger part - if (b->rl_start - a->rl_start >= a->rl_end - b->rl_end) { - // Keep left - a->rl_end = b->rl_start - 1; - } else { - // Keep right - a->rl_start = b->rl_end + 1; - } - break; - case RL_OVERLAPSTARTSBEFORE: - a->rl_start = b->rl_end + 1; - break; - case RL_OVERLAPENDSAFTER: - a->rl_end = b->rl_start - 1; - break; - case RL_NOOVERLAP: - break; - } -} - -#else /* not HFS - temp workaround until 4277828 is fixed */ -/* stubs for exported routines that aren't present when we build kernel without HFS */ - -#include - -void rl_add(off_t start, off_t end, void *rangelist); -void rl_init(void *rangelist); -void rl_remove(off_t start, off_t end, void *rangelist); -int rl_scan(void *rangelist, off_t start, off_t end, void **overlap); - -void rl_add(__unused off_t start, __unused off_t end, __unused void *rangelist) -{ - return; -} - -void rl_init(__unused void *rangelist) -{ - return; -} - -void rl_remove(__unused off_t start, __unused off_t end, __unused void *rangelist) -{ - return; -} - -int rl_scan(__unused void *rangelist, __unused off_t start, __unused off_t end, __unused void **overlap) -{ - return(0); -} - -void rl_remove_all(struct rl_head *rangelist) -{ -} - -#endif /* HFS */ diff --git a/bsd/hfs/rangelist.h b/bsd/hfs/rangelist.h deleted file mode 100644 index 41708be5d..000000000 --- a/bsd/hfs/rangelist.h +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright (c) 2001-2014 Apple Computer, Inc. All rights reserved. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -#ifndef _HFS_RANGELIST_H_ -#define _HFS_RANGELIST_H_ - -#include - -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE -#include -#include - -enum rl_overlaptype { - RL_NOOVERLAP = 0, /* 0 */ - RL_MATCHINGOVERLAP, /* 1 */ - RL_OVERLAPCONTAINSRANGE, /* 2 */ - RL_OVERLAPISCONTAINED, /* 3 */ - RL_OVERLAPSTARTSBEFORE, /* 4 */ - RL_OVERLAPENDSAFTER /* 5 */ -}; - -#define RL_INFINITY INT64_MAX - -TAILQ_HEAD(rl_head, rl_entry); - -struct rl_entry { - TAILQ_ENTRY(rl_entry) rl_link; - off_t rl_start; - off_t rl_end; -}; - -__BEGIN_DECLS -void rl_init(struct rl_head *rangelist); -void rl_add(off_t start, off_t end, struct rl_head *rangelist); -void rl_remove(off_t start, off_t end, struct rl_head *rangelist); -void rl_remove_all(struct rl_head *rangelist); -enum rl_overlaptype rl_scan(struct rl_head *rangelist, - off_t start, - off_t end, - struct rl_entry **overlap); -enum rl_overlaptype rl_overlap(const struct rl_entry *range, - off_t start, off_t end); - -static __attribute__((pure)) inline -off_t rl_len(const struct rl_entry *range) -{ - return range->rl_end - range->rl_start + 1; -} - -void rl_subtract(struct rl_entry *a, const struct rl_entry *b); - -static inline struct rl_entry rl_make(off_t start, off_t end) -{ - return (struct rl_entry){ .rl_start = start, .rl_end = end }; -} - -__END_DECLS - -#endif /* __APPLE_API_PRIVATE */ -#endif /* KERNEL */ -#endif /* ! 
_HFS_RANGELIST_H_ */ diff --git a/bsd/i386/Makefile b/bsd/i386/Makefile index 6c5370018..5763410f8 100644 --- a/bsd/i386/Makefile +++ b/bsd/i386/Makefile @@ -3,7 +3,6 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - include $(MakeInc_cmd) include $(MakeInc_def) @@ -22,7 +21,6 @@ KERNELFILES = \ types.h vmparam.h _types.h _param.h \ _mcontext.h - INSTALL_MD_LIST = ${DATAFILES} INSTALL_MD_LCL_LIST = ${PRIVATE_DATAFILES} @@ -34,5 +32,3 @@ EXPORT_MD_DIR = i386 include $(MakeInc_rule) include $(MakeInc_dir) - - diff --git a/bsd/i386/param.h b/bsd/i386/param.h index c4c906bdb..221318fe1 100644 --- a/bsd/i386/param.h +++ b/bsd/i386/param.h @@ -165,7 +165,7 @@ #define DELAY(n) delay(n) #else /* defined(KERNEL) || defined(STANDALONE) */ -#define DELAY(n) { register int N = (n); while (--N > 0); } +#define DELAY(n) { int N = (n); while (--N > 0); } #endif /* defined(KERNEL) || defined(STANDALONE) */ #endif /* _I386_PARAM_H_ */ diff --git a/bsd/i386/types.h b/bsd/i386/types.h index eec91fb3e..30f0bc591 100644 --- a/bsd/i386/types.h +++ b/bsd/i386/types.h @@ -78,10 +78,10 @@ #include #include -typedef unsigned char u_int8_t; -typedef unsigned short u_int16_t; -typedef unsigned int u_int32_t; -typedef unsigned long long u_int64_t; +#include +#include +#include +#include #if __LP64__ typedef int64_t register_t; diff --git a/bsd/kern/Makefile b/bsd/kern/Makefile deleted file mode 100644 index 32c7ab729..000000000 --- a/bsd/kern/Makefile +++ /dev/null @@ -1,23 +0,0 @@ -export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd -export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def -export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule -export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - -include $(MakeInc_cmd) -include $(MakeInc_def) - -INSTALL_SHARE_MISC_LIST = \ - trace.codes - -INSTALL_SHARE_MISC_FILES = \ - $(addprefix $(DSTROOT)/$(INSTALL_SHARE_MISC_DIR)/, 
$(INSTALL_SHARE_MISC_LIST)) - -$(INSTALL_SHARE_MISC_FILES): $(DSTROOT)/$(INSTALL_SHARE_MISC_DIR)/% : % - $(_v)$(MKDIR) $(DSTROOT)/$(INSTALL_SHARE_MISC_DIR) - @echo INSTALL $(@F) - $(_v)$(INSTALL) $(INSTALL_FLAGS) $< $@ - -do_textfiles_install:: $(INSTALL_SHARE_MISC_FILES) - -include $(MakeInc_rule) -include $(MakeInc_dir) diff --git a/bsd/kern/ast.h b/bsd/kern/ast.h index 94a326584..00a8fa199 100644 --- a/bsd/kern/ast.h +++ b/bsd/kern/ast.h @@ -39,4 +39,8 @@ extern void act_set_astbsd(thread_t); extern void bsd_ast(thread_t); +#if CONFIG_DTRACE +extern void ast_dtrace_on(void); +#endif + #endif /* _KERN_AST_H_ */ diff --git a/bsd/kern/bsd_init.c b/bsd/kern/bsd_init.c index 8cf33e20a..50270ee4c 100644 --- a/bsd/kern/bsd_init.c +++ b/bsd/kern/bsd_init.c @@ -119,6 +119,7 @@ #include #include /* for pseudo_inits */ #include +#include #include #include @@ -191,6 +192,7 @@ #include #endif + #include #include #include @@ -238,7 +240,7 @@ int hostnamelen; char domainname[MAXDOMNAMELEN]; int domainnamelen; -char rootdevice[16]; /* hfs device names have at least 9 chars */ +char rootdevice[16]; /* device names have at least 9 chars */ #if KMEMSTATS struct kmemstats kmemstats[M_LAST]; @@ -252,6 +254,10 @@ int minimalboot = 0; __private_extern__ int proc_ref_tracking_disabled = 0; /* disable panics on leaked proc refs across syscall boundary */ #endif +#if OS_REASON_DEBUG +__private_extern__ int os_reason_debug_disabled = 0; /* disable asserts for when we fail to allocate OS reasons */ +#endif + extern kern_return_t IOFindBSDRoot(char *, unsigned int, dev_t *, u_int32_t *); extern void IOSecureBSDRoot(const char * rootName); extern kern_return_t IOKitBSDInit(void ); @@ -259,8 +265,8 @@ extern void kminit(void); extern void file_lock_init(void); extern void kmeminit(void); extern void bsd_bufferinit(void); +extern void oslog_setsize(int size); extern void throttle_init(void); -extern void macx_init(void); extern void acct_init(void); extern int serverperfmode; @@ -290,16 
+296,23 @@ __private_extern__ int bootarg_no_vnode_jetsam = 0; __private_extern__ int bootarg_disable_aslr = 0; #endif +/* + * Allow an alternate dyld to be used for testing. + */ + +#if DEVELOPMENT || DEBUG +char dyld_alt_path[MAXPATHLEN]; +int use_alt_dyld = 0; +#endif + int cmask = CMASK; extern int customnbuf; -void bsd_init(void); kern_return_t bsd_autoconf(void); void bsd_utaskbootstrap(void); static void parse_bsd_args(void); extern task_t bsd_init_task; -extern boolean_t init_task_died; #if CONFIG_DEV_KMEM extern void dev_kmem_init(void); #endif @@ -319,6 +332,8 @@ extern void sysv_sem_lock_init(void); extern void sysv_msg_lock_init(void); #endif +extern void ulock_initialize(void); + #if CONFIG_MACF #if defined (__i386__) || defined (__x86_64__) /* MACF policy_check configuration flags; see policy_check.c for details */ @@ -380,6 +395,17 @@ extern lck_mtx_t * execargs_cache_lock; void (*mountroot_post_hook)(void); void (*unmountroot_pre_hook)(void); +/* + * This function is called before IOKit initialization, so that globals + * like the sysctl tree are initialized before kernel extensions + * are started (since they may want to register sysctls + */ +void +bsd_early_init(void) +{ + sysctl_early_init(); +} + /* * This function is called very early on in the Mach startup, from the * function start_kernel_threads() in osfmk/kern/startup.c. It's called @@ -513,6 +539,8 @@ bsd_init(void) /* Initialize System Override call */ init_system_override(); + ulock_initialize(); + /* * Create process 0. */ @@ -556,7 +584,11 @@ bsd_init(void) kernproc->p_flag = P_SYSTEM; kernproc->p_lflag = 0; kernproc->p_ladvflag = 0; - + +#if defined(__LP64__) + kernproc->p_flag |= P_LP64; +#endif + #if DEVELOPMENT || DEBUG if (bootarg_disable_aslr) kernproc->p_flag |= P_DISABLE_ASLR; @@ -687,12 +719,6 @@ bsd_init(void) bsd_init_kprintf("calling ubc_init\n"); ubc_init(); - /* - * Initialize device-switches. 
- */ - bsd_init_kprintf("calling devsw_init() \n"); - devsw_init(); - /* Initialize the file systems. */ bsd_init_kprintf("calling vfsinit\n"); vfsinit(); @@ -769,10 +795,6 @@ bsd_init(void) * Initialize protocols. Block reception of incoming packets * until everything is ready. */ - bsd_init_kprintf("calling sysctl_register_fixed\n"); - sysctl_register_fixed(); - bsd_init_kprintf("calling sysctl_mib_init\n"); - sysctl_mib_init(); #if NETWORKING bsd_init_kprintf("calling dlil_init\n"); dlil_init(); @@ -808,9 +830,6 @@ bsd_init(void) memorystatus_init(); #endif /* CONFIG_MEMORYSTATUS */ - bsd_init_kprintf("calling macx_init\n"); - macx_init(); - bsd_init_kprintf("calling acct_init\n"); acct_init(); @@ -819,9 +838,15 @@ bsd_init(void) kmstartup(); #endif + bsd_init_kprintf("calling sysctl_mib_init\n"); + sysctl_mib_init(); + bsd_init_kprintf("calling bsd_autoconf\n"); bsd_autoconf(); + bsd_init_kprintf("calling os_reason_init\n"); + os_reason_init(); + #if CONFIG_DTRACE dtrace_postinit(); #endif @@ -991,10 +1016,6 @@ bsd_init(void) bsd_init_kprintf("calling bsd_utaskbootstrap\n"); bsd_utaskbootstrap(); -#if defined(__LP64__) - kernproc->p_flag |= P_LP64; -#endif - pal_kernel_announce(); bsd_init_kprintf("calling mountroot_post_hook\n"); @@ -1004,7 +1025,7 @@ bsd_init(void) mountroot_post_hook(); #if 0 /* not yet */ - consider_zone_gc(FALSE); + consider_zone_gc(); #endif @@ -1032,7 +1053,6 @@ bsdinit_task(void) ut = (uthread_t)get_bsdthread_info(thread); bsd_init_task = get_threadtask(thread); - init_task_died = FALSE; #if CONFIG_MACF mac_cred_label_associate_user(p->p_ucred); @@ -1171,6 +1191,7 @@ parse_bsd_args(void) if (PE_parse_boot_argn("msgbuf", &msgbuf, sizeof (msgbuf))) { log_setsize(msgbuf); + oslog_setsize(msgbuf); } if (PE_parse_boot_argn("-novfscache", namep, sizeof(namep))) { @@ -1190,7 +1211,27 @@ parse_bsd_args(void) } #endif +#if OS_REASON_DEBUG + if (PE_parse_boot_argn("-disable_osreason_debug", namep, sizeof(namep))) { + os_reason_debug_disabled = 1; 
+ } +#endif + PE_parse_boot_argn("sigrestrict", &sigrestrict_arg, sizeof(sigrestrict_arg)); + +#if DEVELOPMENT|| DEBUG + if (PE_parse_boot_argn("-no_sigsys", namep, sizeof(namep))) { + send_sigsys = false; + } +#endif + +#if (DEVELOPMENT|| DEBUG) + if (PE_parse_boot_argn("alt-dyld", dyld_alt_path, sizeof(dyld_alt_path))) { + if (strlen(dyld_alt_path) > 0) { + use_alt_dyld = 1; + } + } +#endif } void diff --git a/bsd/kern/bsd_stubs.c b/bsd/kern/bsd_stubs.c index f941c0128..85931c3e9 100644 --- a/bsd/kern/bsd_stubs.c +++ b/bsd/kern/bsd_stubs.c @@ -160,11 +160,14 @@ bdevsw_isfree(int index) int bdevsw_add(int index, struct bdevsw * bsw) { + lck_mtx_lock_spin(&devsw_lock_list_mtx); index = bdevsw_isfree(index); if (index < 0) { - return (-1); + index = -1; + } else { + bdevsw[index] = *bsw; } - bdevsw[index] = *bsw; + lck_mtx_unlock(&devsw_lock_list_mtx); return (index); } /* @@ -180,10 +183,13 @@ bdevsw_remove(int index, struct bdevsw * bsw) return (-1); devsw = &bdevsw[index]; + lck_mtx_lock_spin(&devsw_lock_list_mtx); if ((memcmp((char *)devsw, (char *)bsw, sizeof(struct bdevsw)) != 0)) { - return (-1); + index = -1; + } else { + bdevsw[index] = nobdev; } - bdevsw[index] = nobdev; + lck_mtx_unlock(&devsw_lock_list_mtx); return (index); } @@ -240,11 +246,14 @@ cdevsw_isfree(int index) int cdevsw_add(int index, struct cdevsw * csw) { + lck_mtx_lock_spin(&devsw_lock_list_mtx); index = cdevsw_isfree(index); if (index < 0) { - return (-1); + index = -1; + } else { + cdevsw[index] = *csw; } - cdevsw[index] = *csw; + lck_mtx_unlock(&devsw_lock_list_mtx); return (index); } /* @@ -260,11 +269,14 @@ cdevsw_remove(int index, struct cdevsw * csw) return (-1); devsw = &cdevsw[index]; + lck_mtx_lock_spin(&devsw_lock_list_mtx); if ((memcmp((char *)devsw, (char *)csw, sizeof(struct cdevsw)) != 0)) { - return (-1); + index = -1; + } else { + cdevsw[index] = nocdev; + cdevsw_flags[index] = 0; } - cdevsw[index] = nocdev; - cdevsw_flags[index] = 0; + 
lck_mtx_unlock(&devsw_lock_list_mtx); return (index); } diff --git a/bsd/kern/decmpfs.c b/bsd/kern/decmpfs.c index 5c71793ad..bb4b8c2ff 100644 --- a/bsd/kern/decmpfs.c +++ b/bsd/kern/decmpfs.c @@ -25,11 +25,36 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -#if !HFS_COMPRESSION -/* we need these symbols even though compression is turned off */ -char register_decmpfs_decompressor; -char unregister_decmpfs_decompressor; -#else /* HFS_COMPRESSION */ +#if !FS_COMPRESSION + +/* We need these symbols even though compression is turned off */ + +#define UNUSED_SYMBOL(x) asm(".global _" #x "\n.set _" #x ", 0\n"); + +UNUSED_SYMBOL(register_decmpfs_decompressor) +UNUSED_SYMBOL(unregister_decmpfs_decompressor) +UNUSED_SYMBOL(decmpfs_init) +UNUSED_SYMBOL(decmpfs_read_compressed) +UNUSED_SYMBOL(decmpfs_cnode_cmp_type) +UNUSED_SYMBOL(decmpfs_cnode_get_vnode_state) +UNUSED_SYMBOL(decmpfs_cnode_get_vnode_cached_size) +UNUSED_SYMBOL(decmpfs_lock_compressed_data) +UNUSED_SYMBOL(decmpfs_cnode_free) +UNUSED_SYMBOL(decmpfs_cnode_alloc) +UNUSED_SYMBOL(decmpfs_cnode_destroy) +UNUSED_SYMBOL(decmpfs_decompress_file) +UNUSED_SYMBOL(decmpfs_unlock_compressed_data) +UNUSED_SYMBOL(decmpfs_cnode_init) +UNUSED_SYMBOL(decmpfs_cnode_set_vnode_state) +UNUSED_SYMBOL(decmpfs_hides_xattr) +UNUSED_SYMBOL(decmpfs_ctx) +UNUSED_SYMBOL(decmpfs_file_is_compressed) +UNUSED_SYMBOL(decmpfs_update_attributes) +UNUSED_SYMBOL(decmpfs_hides_rsrc) +UNUSED_SYMBOL(decmpfs_pagein_compressed) +UNUSED_SYMBOL(decmpfs_validate_compressed_file) + +#else /* FS_COMPRESSION */ #include #include #include @@ -289,6 +314,18 @@ vnsize(vnode_t vp, uint64_t *size) #pragma mark --- cnode routines --- +decmpfs_cnode *decmpfs_cnode_alloc(void) +{ + decmpfs_cnode *dp; + MALLOC_ZONE(dp, decmpfs_cnode *, sizeof(decmpfs_cnode), M_DECMPFS_CNODE, M_WAITOK); + return dp; +} + +void decmpfs_cnode_free(decmpfs_cnode *dp) +{ + FREE_ZONE(dp, sizeof(*dp), M_DECMPFS_CNODE); +} + void decmpfs_cnode_init(decmpfs_cnode *cp) { @@ -302,16 +339,16 @@ 
decmpfs_cnode_destroy(decmpfs_cnode *cp) lck_rw_destroy(&cp->compressed_data_lock, decmpfs_lockgrp); } -boolean_t +bool decmpfs_trylock_compressed_data(decmpfs_cnode *cp, int exclusive) { void *thread = current_thread(); - boolean_t retval = FALSE; + bool retval = false; if (cp->lockowner == thread) { /* this thread is already holding an exclusive lock, so bump the count */ cp->lockcount++; - retval = TRUE; + retval = true; } else if (exclusive) { if ((retval = lck_rw_try_lock_exclusive(&cp->compressed_data_lock))) { cp->lockowner = thread; @@ -432,6 +469,11 @@ decmpfs_cnode_set_decompression_flags(decmpfs_cnode *cp, uint64_t flags) } } +uint32_t decmpfs_cnode_cmp_type(decmpfs_cnode *cp) +{ + return cp->cmp_type; +} + #pragma mark --- decmpfs state routines --- static int @@ -1082,7 +1124,6 @@ decmpfs_pagein_compressed(struct vnop_pagein_args *ap, int *is_compressed, decmp user_ssize_t uplSize = 0; void *data = NULL; decmpfs_header *hdr = NULL; - int abort_pagein = 0; uint64_t cachedSize = 0; int cmpdata_locked = 0; @@ -1118,6 +1159,7 @@ decmpfs_pagein_compressed(struct vnop_pagein_args *ap, int *is_compressed, decmp kern_return_t kr = ubc_upl_map(pl, (vm_offset_t*)&data); if ((kr != KERN_SUCCESS) || (data == NULL)) { err = ENOSPC; + data = NULL; #if CONFIG_IOSCHED upl_unmark_decmp(pl); #endif /* CONFIG_IOSCHED */ @@ -1168,7 +1210,6 @@ decmpfs_pagein_compressed(struct vnop_pagein_args *ap, int *is_compressed, decmp if (cmp_state == FILE_IS_NOT_COMPRESSED) { DebugLogWithPath("cmp_state == FILE_IS_NOT_COMPRESSED\n"); /* the file was decompressed after we started reading it */ - abort_pagein = 1; /* we're not going to commit our data */ *is_compressed = 0; /* instruct caller to fall back to its normal path */ } } @@ -1187,7 +1228,7 @@ decmpfs_pagein_compressed(struct vnop_pagein_args *ap, int *is_compressed, decmp if (kr != KERN_SUCCESS) ErrorLogWithPath("ubc_upl_unmap error %d\n", (int)kr); else { - if (!abort_pagein) { + if (!err) { /* commit our pages */ kr = 
commit_upl(pl, pl_offset, total_size, UPL_COMMIT_FREE_ON_EMPTY, 0); } @@ -1198,14 +1239,15 @@ decmpfs_pagein_compressed(struct vnop_pagein_args *ap, int *is_compressed, decmp if (hdr) FREE(hdr, M_TEMP); if (cmpdata_locked) decmpfs_unlock_compressed_data(cp, 0); if (err) { -#if DEVELOPMENT || DEBUG - char *path; - MALLOC(path, char *, PATH_MAX, M_TEMP, M_WAITOK); - panic("%s: decmpfs_pagein_compressed: err %d", vnpath(vp, path, PATH_MAX), err); - FREE(path, M_TEMP); -#else +#if 0 + if (err != ENXIO && err != ENOSPC) { + char *path; + MALLOC(path, char *, PATH_MAX, M_TEMP, M_WAITOK); + panic("%s: decmpfs_pagein_compressed: err %d", vnpath(vp, path, PATH_MAX), err); + FREE(path, M_TEMP); + } +#endif /* 0 */ ErrorLogWithPath("err %d\n", err); -#endif } return err; } @@ -1351,14 +1393,14 @@ decmpfs_read_compressed(struct vnop_read_args *ap, int *is_compressed, decmpfs_c if (kr != KERN_SUCCESS) { commit_upl(upl, 0, curUplSize, UPL_ABORT_FREE_ON_EMPTY, 1); -#if DEVELOPMENT || DEBUG +#if 0 char *path; MALLOC(path, char *, PATH_MAX, M_TEMP, M_WAITOK); panic("%s: decmpfs_read_compressed: ubc_upl_map error %d", vnpath(vp, path, PATH_MAX), (int)kr); FREE(path, M_TEMP); -#else - ErrorLogWithPath("ubc_upl_map error %d\n", (int)kr); -#endif +#else /* 0 */ + ErrorLogWithPath("ubc_upl_map kr=0x%x\n", (int)kr); +#endif /* 0 */ err = EINVAL; goto out; } @@ -1821,6 +1863,7 @@ void decmpfs_init() lck_grp_attr_t *attr = lck_grp_attr_alloc_init(); decmpfs_lockgrp = lck_grp_alloc_init("VFSCOMP", attr); + lck_grp_attr_free(attr); decompressorsLock = lck_rw_alloc_init(decmpfs_lockgrp, NULL); decompress_channel_mtx = lck_mtx_alloc_init(decmpfs_lockgrp, NULL); @@ -1828,4 +1871,4 @@ void decmpfs_init() done = 1; } -#endif /* HFS_COMPRESSION */ +#endif /* FS_COMPRESSION */ diff --git a/bsd/kern/kdebug.c b/bsd/kern/kdebug.c index 556e0d0bb..6d7fc0902 100644 --- a/bsd/kern/kdebug.c +++ b/bsd/kern/kdebug.c @@ -1,14 +1,14 @@ /* - * Copyright (c) 2000-2013 Apple Inc. All rights reserved. 
+ * Copyright (c) 2000-2016 Apple Inc. All rights reserved. * * @Apple_LICENSE_HEADER_START@ - * + * * The contents of this file constitute Original Code as defined in and * are subject to the Apple Public Source License Version 1.1 (the * "License"). You may not use this file except in compliance with the * License. Please obtain a copy of the License at * http://www.apple.com/publicsource and read it before using this file. - * + * * This Original Code and all software distributed under the License are * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -16,11 +16,10 @@ * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the * License for the specific language governing rights and limitations * under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ - #include #include @@ -30,17 +29,21 @@ #include #include #include +#include +#include #include #include #include -#include -#define HZ 100 #include #include #include +#include #include +#include +#include + #if defined(__i386__) || defined(__x86_64__) #include #include @@ -59,10 +62,11 @@ #include #include #include +#include +#include #include #include -#include #include #include @@ -75,14 +79,7 @@ #include #include - -extern boolean_t kdebug_serial; -#if KDEBUG_MOJO_TRACE -#include -static void kdebug_serial_print( /* forward */ - uint32_t, uint32_t, uint64_t, - uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t); -#endif +#include /* * IOP(s) @@ -120,37 +117,161 @@ typedef struct kd_iop { static kd_iop_t* kd_iops = NULL; -/* XXX should have prototypes, but Mach does not provide one */ +/* + * Typefilter(s) + * + * A typefilter is a 8KB bitmap that is used to selectively filter events + * being recorded. It is able to individually address every class & subclass. + * + * There is a shared typefilter in the kernel which is lazily allocated. 
Once + * allocated, the shared typefilter is never deallocated. The shared typefilter + * is also mapped on demand into userspace processes that invoke kdebug_trace + * API from Libsyscall. When mapped into a userspace process, the memory is + * read only, and does not have a fixed address. + * + * It is a requirement that the kernel's shared typefilter always pass DBG_TRACE + * events. This is enforced automatically, by having the needed bits set any + * time the shared typefilter is mutated. + */ + +typedef uint8_t* typefilter_t; + +static typefilter_t kdbg_typefilter; +static mach_port_t kdbg_typefilter_memory_entry; + +/* + * There are 3 combinations of page sizes: + * + * 4KB / 4KB + * 4KB / 16KB + * 16KB / 16KB + * + * The typefilter is exactly 8KB. In the first two scenarios, we would like + * to use 2 pages exactly; in the third scenario we must make certain that + * a full page is allocated so we do not inadvertently share 8KB of random + * data to userspace. The round_page_32 macro rounds to kernel page size. 
+ */ +#define TYPEFILTER_ALLOC_SIZE MAX(round_page_32(KDBG_TYPEFILTER_BITMAP_SIZE), KDBG_TYPEFILTER_BITMAP_SIZE) + +static typefilter_t typefilter_create(void) +{ + typefilter_t tf; + if (KERN_SUCCESS == kmem_alloc(kernel_map, (vm_offset_t*)&tf, TYPEFILTER_ALLOC_SIZE, VM_KERN_MEMORY_DIAG)) { + memset(&tf[KDBG_TYPEFILTER_BITMAP_SIZE], 0, TYPEFILTER_ALLOC_SIZE - KDBG_TYPEFILTER_BITMAP_SIZE); + return tf; + } + return NULL; +} + +static void typefilter_deallocate(typefilter_t tf) +{ + assert(tf); + assert(tf != kdbg_typefilter); + kmem_free(kernel_map, (vm_offset_t)tf, TYPEFILTER_ALLOC_SIZE); +} + +static void typefilter_copy(typefilter_t dst, typefilter_t src) +{ + assert(src); + assert(dst); + memcpy(dst, src, KDBG_TYPEFILTER_BITMAP_SIZE); +} + +static void typefilter_reject_all(typefilter_t tf) +{ + assert(tf); + memset(tf, 0, KDBG_TYPEFILTER_BITMAP_SIZE); +} + +static void typefilter_allow_class(typefilter_t tf, uint8_t class) +{ + assert(tf); + const uint32_t BYTES_PER_CLASS = 256 / 8; // 256 subclasses, 1 bit each + memset(&tf[class * BYTES_PER_CLASS], 0xFF, BYTES_PER_CLASS); +} + +static void typefilter_allow_csc(typefilter_t tf, uint16_t csc) +{ + assert(tf); + setbit(tf, csc); +} + +static boolean_t typefilter_is_debugid_allowed(typefilter_t tf, uint32_t id) +{ + assert(tf); + return isset(tf, KDBG_EXTRACT_CSC(id)); +} + +static mach_port_t typefilter_create_memory_entry(typefilter_t tf) +{ + assert(tf); + + mach_port_t memory_entry = MACH_PORT_NULL; + memory_object_size_t size = TYPEFILTER_ALLOC_SIZE; + + mach_make_memory_entry_64(kernel_map, + &size, + (memory_object_offset_t)tf, + VM_PROT_READ, + &memory_entry, + MACH_PORT_NULL); + + return memory_entry; +} + +static int kdbg_copyin_typefilter(user_addr_t addr, size_t size); +static void kdbg_enable_typefilter(void); +static void kdbg_disable_typefilter(void); + +/* + * External prototypes + */ + void task_act_iterate_wth_args(task_t, void(*)(thread_t, void *), void *); int cpu_number(void); /* XXX include 
path broken */ -void commpage_update_kdebug_enable(void); /* XXX sign */ +void commpage_update_kdebug_state(void); /* XXX sign */ + +extern int log_leaks; +extern boolean_t kdebug_serial; + +#if KDEBUG_MOJO_TRACE +#include +static void kdebug_serial_print( /* forward */ + uint32_t, uint32_t, uint64_t, + uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t); +#endif -/* XXX should probably be static, but it's debugging code... */ -int kdbg_read(user_addr_t, size_t *, vnode_t, vfs_context_t, uint32_t); -void kdbg_control_chud(int, void *); int kdbg_control(int *, u_int, user_addr_t, size_t *); -int kdbg_readcpumap(user_addr_t, size_t *); -int kdbg_readcurcpumap(user_addr_t, size_t *); -int kdbg_readthrmap(user_addr_t, size_t *, vnode_t, vfs_context_t); -int kdbg_readthrmap_v3(user_addr_t, size_t *, int); -int kdbg_readcurthrmap(user_addr_t, size_t *); -int kdbg_setreg(kd_regtype *); -int kdbg_setrtcdec(kd_regtype *); -int kdbg_setpidex(kd_regtype *); -int kdbg_setpid(kd_regtype *); -void kdbg_thrmap_init(void); -int kdbg_reinit(boolean_t); -int kdbg_bootstrap(boolean_t); + +static int kdbg_read(user_addr_t, size_t *, vnode_t, vfs_context_t, uint32_t); +static int kdbg_readcpumap(user_addr_t, size_t *); +static int kdbg_readthrmap_v3(user_addr_t, size_t, int); +static int kdbg_readcurthrmap(user_addr_t, size_t *); +static int kdbg_setreg(kd_regtype *); +static int kdbg_setpidex(kd_regtype *); +static int kdbg_setpid(kd_regtype *); +static void kdbg_thrmap_init(void); +static int kdbg_reinit(boolean_t); +static int kdbg_bootstrap(boolean_t); +static int kdbg_test(void); + +static int kdbg_write_v1_header(boolean_t write_thread_map, vnode_t vp, vfs_context_t ctx); +static int kdbg_write_thread_map(vnode_t vp, vfs_context_t ctx); +static int kdbg_copyout_thread_map(user_addr_t buffer, size_t *buffer_size); +static void kdbg_clear_thread_map(void); + +static boolean_t kdbg_wait(uint64_t timeout_ms, boolean_t locked_wait); +static void kdbg_wakeup(void); int 
kdbg_cpumap_init_internal(kd_iop_t* iops, uint32_t cpu_count, uint8_t** cpumap, uint32_t* cpumap_size); -kd_threadmap* kdbg_thrmap_init_internal(unsigned int count, - unsigned int *mapsize, - unsigned int *mapcount); +static kd_threadmap *kdbg_thrmap_init_internal(unsigned int count, + unsigned int *mapsize, + unsigned int *mapcount); static boolean_t kdebug_current_proc_enabled(uint32_t debugid); -static boolean_t kdebug_debugid_enabled(uint32_t debugid); +boolean_t kdebug_debugid_enabled(uint32_t debugid); static errno_t kdebug_check_trace_string(uint32_t debugid, uint64_t str_id); int kdbg_write_v3_header(user_addr_t, size_t *, int); @@ -162,14 +283,14 @@ user_addr_t kdbg_write_v3_event_chunk_header(user_addr_t buffer, uint32_t tag, uint64_t length, vnode_t vp, vfs_context_t ctx); -static int kdbg_enable_typefilter(void); -static int kdbg_disable_typefilter(void); -static int kdbg_allocate_typefilter(void); -static int kdbg_deallocate_typefilter(void); +// Helper functions static int create_buffers(boolean_t); static void delete_buffers(void); +extern int tasks_count; +extern int threads_count; +extern char *proc_best_name(proc_t p); extern void IOSleep(int); /* trace enable status */ @@ -181,9 +302,8 @@ static kd_buf kd_early_buffer[KD_EARLY_BUFFER_MAX]; static int kd_early_index = 0; static boolean_t kd_early_overflow = FALSE; -#define SLOW_NOLOG 0x01 -#define SLOW_CHECKS 0x02 -#define SLOW_CHUD 0x08 +#define SLOW_NOLOG 0x01 +#define SLOW_CHECKS 0x02 #define EVENTS_PER_STORAGE_UNIT 2048 #define MIN_STORAGE_UNITS_PER_CPU 4 @@ -246,6 +366,7 @@ struct kd_ctrl_page_t { int kds_inuse_count; uint32_t kdebug_flags; uint32_t kdebug_slowcheck; + uint64_t oldest_time; /* * The number of kd_bufinfo structs allocated may not match the current * number of active cpus. 
We capture the iops list head at initialization @@ -255,7 +376,11 @@ struct kd_ctrl_page_t { */ kd_iop_t* kdebug_iops; uint32_t kdebug_cpus; -} kd_ctrl_page = { .kds_free_list = {.raw = KDS_PTR_NULL}, .kdebug_slowcheck = SLOW_NOLOG }; +} kd_ctrl_page = { + .kds_free_list = {.raw = KDS_PTR_NULL}, + .kdebug_slowcheck = SLOW_NOLOG, + .oldest_time = 0 +}; #pragma pack() @@ -269,10 +394,6 @@ struct kd_bufinfo *kdbip = NULL; kd_buf *kdcopybuf = NULL; -boolean_t kdlog_bg_trace = FALSE; -boolean_t kdlog_bg_trace_running = FALSE; -unsigned int bg_nkdbufs = 0; - unsigned int nkdbufs = 0; unsigned int kdlog_beg=0; unsigned int kdlog_end=0; @@ -283,24 +404,6 @@ unsigned int kdlog_value4=0; static lck_spin_t * kdw_spin_lock; static lck_spin_t * kds_spin_lock; -static lck_mtx_t * kd_trace_mtx_sysctl; -static lck_grp_t * kd_trace_mtx_sysctl_grp; -static lck_attr_t * kd_trace_mtx_sysctl_attr; -static lck_grp_attr_t *kd_trace_mtx_sysctl_grp_attr; - -extern kern_return_t stack_snapshot2(int pid, user_addr_t tracebuf, uint32_t tracebuf_size, uint32_t flags, int32_t *retval); - -#if CONFIG_TELEMETRY -extern kern_return_t stack_microstackshot(user_addr_t tracebuf, uint32_t tracebuf_size, uint32_t flags, int32_t *retval); -#endif /* CONFIG_TELEMETRY */ - -extern kern_return_t kern_stack_snapshot_with_reason(char* reason); - -extern kern_return_t kern_stack_snapshot_internal(int stackshot_config_version, void *stackshot_config, size_t stackshot_config_size, boolean_t stackshot_from_user); - -extern kern_return_t stack_snapshot_from_kernel_internal(int pid, void *buf, uint32_t size, uint32_t flags, unsigned *bytes_traced); - -int stack_snapshot_from_kernel(pid_t pid, void *buf, uint32_t size, uint32_t flags, unsigned *bytes_traced); kd_threadmap *kd_mapptr = 0; unsigned int kd_mapsize = 0; @@ -311,8 +414,6 @@ int RAW_file_written = 0; #define RAW_FLUSH_SIZE (2 * 1024 * 1024) -pid_t global_state_pid = -1; /* Used to control exclusive use of kd_buffer */ - /* * A globally increasing counter 
for identifying strings in trace. Starts at * 1 because 0 is a reserved return value. @@ -355,46 +456,10 @@ struct krt typedef struct krt krt_t; -/* This is for the CHUD toolkit call */ -typedef void (*kd_chudhook_fn) (uint32_t debugid, uintptr_t arg1, - uintptr_t arg2, uintptr_t arg3, - uintptr_t arg4, uintptr_t arg5); - -volatile kd_chudhook_fn kdebug_chudhook = 0; /* pointer to CHUD toolkit function */ - -static uint8_t *type_filter_bitmap; - -/* - * This allows kperf to swap out the global state pid when kperf ownership is - * passed from one process to another. It checks the old global state pid so - * that kperf can't accidentally steal control of trace when a non-kperf trace user has - * control of trace. - */ -void -kdbg_swap_global_state_pid(pid_t old_pid, pid_t new_pid); - -void -kdbg_swap_global_state_pid(pid_t old_pid, pid_t new_pid) -{ - if (!(kd_ctrl_page.kdebug_flags & KDBG_LOCKINIT)) - return; - - lck_mtx_lock(kd_trace_mtx_sysctl); - - if (old_pid == global_state_pid) - global_state_pid = new_pid; - - lck_mtx_unlock(kd_trace_mtx_sysctl); -} - static uint32_t kdbg_cpu_count(boolean_t early_trace) { if (early_trace) { - /* - * we've started tracing before the IOKit has even - * started running... 
just use the static max value - */ return max_ncpus; } @@ -426,12 +491,12 @@ kdbg_set_tracing_enabled(boolean_t enabled, uint32_t trace_type) kdebug_enable |= trace_type; kd_ctrl_page.kdebug_slowcheck &= ~SLOW_NOLOG; kd_ctrl_page.enabled = 1; - commpage_update_kdebug_enable(); + commpage_update_kdebug_state(); } else { kdebug_enable &= ~(KDEBUG_ENABLE_TRACE|KDEBUG_ENABLE_PPT); kd_ctrl_page.kdebug_slowcheck |= SLOW_NOLOG; kd_ctrl_page.enabled = 0; - commpage_update_kdebug_enable(); + commpage_update_kdebug_state(); } lck_spin_unlock(kds_spin_lock); ml_set_interrupts_enabled(s); @@ -467,20 +532,27 @@ kdbg_set_flags(int slowflag, int enableflag, boolean_t enabled) ml_set_interrupts_enabled(s); } -void +/* + * Disable wrapping and return true if trace wrapped, false otherwise. + */ +boolean_t disable_wrap(uint32_t *old_slowcheck, uint32_t *old_flags) { + boolean_t wrapped; int s = ml_set_interrupts_enabled(FALSE); lck_spin_lock(kds_spin_lock); *old_slowcheck = kd_ctrl_page.kdebug_slowcheck; *old_flags = kd_ctrl_page.kdebug_flags; + wrapped = kd_ctrl_page.kdebug_flags & KDBG_WRAPPED; kd_ctrl_page.kdebug_flags &= ~KDBG_WRAPPED; kd_ctrl_page.kdebug_flags |= KDBG_NOWRAP; lck_spin_unlock(kds_spin_lock); ml_set_interrupts_enabled(s); + + return wrapped; } void @@ -504,11 +576,11 @@ enable_wrap(uint32_t old_slowcheck, boolean_t lostevents) static int create_buffers(boolean_t early_trace) { - int i; - int p_buffer_size; - int f_buffer_size; - int f_buffers; - int error = 0; + int i; + int p_buffer_size; + int f_buffer_size; + int f_buffers; + int error = 0; /* * For the duration of this allocation, trace code will only reference @@ -696,13 +768,13 @@ release_storage_unit(int cpu, uint32_t kdsp_raw) boolean_t allocate_storage_unit(int cpu) { - union kds_ptr kdsp; - struct kd_storage *kdsp_actual, *kdsp_next_actual; - struct kd_bufinfo *kdbp, *kdbp_vict, *kdbp_try; - uint64_t oldest_ts, ts; - boolean_t retval = TRUE; - int s = 0; - + union kds_ptr kdsp; + struct kd_storage 
*kdsp_actual, *kdsp_next_actual; + struct kd_bufinfo *kdbp, *kdbp_vict, *kdbp_try; + uint64_t oldest_ts, ts; + boolean_t retval = TRUE; + int s = 0; + s = ml_set_interrupts_enabled(FALSE); lck_spin_lock(kds_spin_lock); @@ -729,7 +801,7 @@ allocate_storage_unit(int cpu) goto out; } kdbp_vict = NULL; - oldest_ts = (uint64_t)-1; + oldest_ts = UINT64_MAX; for (kdbp_try = &kdbip[0]; kdbp_try < &kdbip[kd_ctrl_page.kdebug_cpus]; kdbp_try++) { @@ -751,14 +823,17 @@ allocate_storage_unit(int cpu) */ continue; } - ts = kdbg_get_timestamp(&kdsp_actual->kds_records[0]); + /* + * When wrapping, steal the storage unit with the + * earliest timestamp on its last event, instead of the + * earliest timestamp on the first event. This allows a + * storage unit with more recent events to be preserved, + * even if the storage unit contains events that are + * older than those found in other CPUs. + */ + ts = kdbg_get_timestamp(&kdsp_actual->kds_records[EVENTS_PER_STORAGE_UNIT - 1]); if (ts < oldest_ts) { - /* - * when 'wrapping', we want to steal the - * storage unit that has the 'earliest' time - * associated with it (first event time) - */ oldest_ts = ts; kdbp_vict = kdbp_try; } @@ -766,7 +841,7 @@ allocate_storage_unit(int cpu) if (kdbp_vict == NULL) { kdebug_enable = 0; kd_ctrl_page.enabled = 0; - commpage_update_kdebug_enable(); + commpage_update_kdebug_state(); retval = FALSE; goto out; } @@ -780,6 +855,7 @@ allocate_storage_unit(int cpu) } else kdbp_vict->kd_lostevents = TRUE; + kd_ctrl_page.oldest_time = oldest_ts; kd_ctrl_page.kdebug_flags |= KDBG_WRAPPED; } kdsp_actual->kds_timestamp = mach_absolute_time(); @@ -882,18 +958,8 @@ kernel_debug_enter( goto out1; if (kd_ctrl_page.kdebug_flags & KDBG_TYPEFILTER_CHECK) { - /* - * Recheck if TYPEFILTER is being used, and if so, - * dereference bitmap. If the trace facility is being - * disabled, we have ~100ms of preemption-free CPU - * usage to access the bitmap. 
- */ - disable_preemption(); - if (kd_ctrl_page.kdebug_flags & KDBG_TYPEFILTER_CHECK) { - if (isset(type_filter_bitmap, KDBG_EXTRACT_CSC(debugid))) - goto record_event_preempt_disabled; - } - enable_preemption(); + if (typefilter_is_debugid_allowed(kdbg_typefilter, debugid)) + goto record_event; goto out1; } else if (kd_ctrl_page.kdebug_flags & KDBG_RANGECHECK) { @@ -909,12 +975,17 @@ kernel_debug_enter( goto out1; } } - + + if (kd_ctrl_page.kdebug_flags & KDBG_WRAPPED) { + if (timestamp < kd_ctrl_page.oldest_time) { + goto out1; + } + } + record_event: disable_preemption(); -record_event_preempt_disabled: if (kd_ctrl_page.enabled == 0) goto out; @@ -969,78 +1040,36 @@ kernel_debug_enter( enable_preemption(); out1: if ((kds_waiter && kd_ctrl_page.kds_inuse_count >= n_storage_threshold)) { - boolean_t need_kds_wakeup = FALSE; - int s; - - /* - * try to take the lock here to synchronize with the - * waiter entering the blocked state... use the try - * mode to prevent deadlocks caused by re-entering this - * routine due to various trace points triggered in the - * lck_spin_sleep_xxxx routines used to actually enter - * our wait condition... 
no problem if we fail, - * there will be lots of additional events coming in that - * will eventually succeed in grabbing this lock - */ - s = ml_set_interrupts_enabled(FALSE); - - if (lck_spin_try_lock(kdw_spin_lock)) { - - if (kds_waiter && kd_ctrl_page.kds_inuse_count >= n_storage_threshold) { - kds_waiter = 0; - need_kds_wakeup = TRUE; - } - lck_spin_unlock(kdw_spin_lock); - } - - ml_set_interrupts_enabled(s); - - if (need_kds_wakeup == TRUE) - wakeup(&kds_waiter); + kdbg_wakeup(); } } - - static void kernel_debug_internal( - uint32_t debugid, - uintptr_t arg1, - uintptr_t arg2, - uintptr_t arg3, - uintptr_t arg4, - uintptr_t arg5) + boolean_t only_filter, + uint32_t debugid, + uintptr_t arg1, + uintptr_t arg2, + uintptr_t arg3, + uintptr_t arg4, + uintptr_t arg5) { - struct proc *curproc; + struct proc *curproc; uint64_t now; uint32_t bindx; - boolean_t s; kd_buf *kd; int cpu; struct kd_bufinfo *kdbp; struct kd_storage *kdsp_actual; union kds_ptr kds_raw; - - if (kd_ctrl_page.kdebug_slowcheck) { - - if (kdebug_enable & KDEBUG_ENABLE_CHUD) { - kd_chudhook_fn chudhook; - /* - * Mask interrupts to minimize the interval across - * which the driver providing the hook could be - * unloaded. 
- */ - s = ml_set_interrupts_enabled(FALSE); - chudhook = kdebug_chudhook; - if (chudhook) - chudhook(debugid, arg1, arg2, arg3, arg4, arg5); - ml_set_interrupts_enabled(s); - } - if ( (kd_ctrl_page.kdebug_slowcheck & SLOW_NOLOG) || !(kdebug_enable & (KDEBUG_ENABLE_TRACE|KDEBUG_ENABLE_PPT))) + if ((kd_ctrl_page.kdebug_slowcheck & SLOW_NOLOG) || + !(kdebug_enable & (KDEBUG_ENABLE_TRACE | KDEBUG_ENABLE_PPT))) + { goto out1; - + } + if ( !ml_at_interrupt_context()) { if (kd_ctrl_page.kdebug_flags & KDBG_PIDCHECK) { /* @@ -1067,22 +1096,11 @@ kernel_debug_internal( } if (kd_ctrl_page.kdebug_flags & KDBG_TYPEFILTER_CHECK) { - /* Always record trace system info */ - if (KDBG_EXTRACT_CLASS(debugid) == DBG_TRACE) + if (typefilter_is_debugid_allowed(kdbg_typefilter, debugid)) goto record_event; - /* - * Recheck if TYPEFILTER is being used, and if so, - * dereference bitmap. If the trace facility is being - * disabled, we have ~100ms of preemption-free CPU - * usage to access the bitmap. - */ - disable_preemption(); - if (kd_ctrl_page.kdebug_flags & KDBG_TYPEFILTER_CHECK) { - if (isset(type_filter_bitmap, KDBG_EXTRACT_CSC(debugid))) - goto record_event_preempt_disabled; - } - enable_preemption(); + goto out1; + } else if (only_filter == TRUE) { goto out1; } else if (kd_ctrl_page.kdebug_flags & KDBG_RANGECHECK) { @@ -1104,11 +1122,13 @@ kernel_debug_internal( (debugid & KDBG_EVENTID_MASK) != kdlog_value4) goto out1; } + } else if (only_filter == TRUE) { + goto out1; } + record_event: disable_preemption(); -record_event_preempt_disabled: if (kd_ctrl_page.enabled == 0) goto out; @@ -1154,49 +1174,27 @@ kernel_debug_internal( kd->arg3 = arg3; kd->arg4 = arg4; kd->arg5 = arg5; - + kdbg_set_timestamp_and_cpu(kd, now, cpu); OSAddAtomic(1, &kdsp_actual->kds_bufcnt); + +#if KPERF + kperf_kdebug_callback(debugid, __builtin_frame_address(0)); +#endif out: enable_preemption(); out1: if (kds_waiter && kd_ctrl_page.kds_inuse_count >= n_storage_threshold) { uint32_t etype; uint32_t stype; 
- + etype = debugid & KDBG_EVENTID_MASK; stype = debugid & KDBG_CSC_MASK; if (etype == INTERRUPT || etype == MACH_vmfault || stype == BSC_SysCall || stype == MACH_SysCall) { - - boolean_t need_kds_wakeup = FALSE; - - /* - * try to take the lock here to synchronize with the - * waiter entering the blocked state... use the try - * mode to prevent deadlocks caused by re-entering this - * routine due to various trace points triggered in the - * lck_spin_sleep_xxxx routines used to actually enter - * one of our 2 wait conditions... no problem if we fail, - * there will be lots of additional events coming in that - * will eventually succeed in grabbing this lock - */ - s = ml_set_interrupts_enabled(FALSE); - - if (lck_spin_try_lock(kdw_spin_lock)) { - - if (kds_waiter && kd_ctrl_page.kds_inuse_count >= n_storage_threshold) { - kds_waiter = 0; - need_kds_wakeup = TRUE; - } - lck_spin_unlock(kdw_spin_lock); - } - ml_set_interrupts_enabled(s); - - if (need_kds_wakeup == TRUE) - wakeup(&kds_waiter); + kdbg_wakeup(); } } } @@ -1210,7 +1208,8 @@ kernel_debug( uintptr_t arg4, __unused uintptr_t arg5) { - kernel_debug_internal(debugid, arg1, arg2, arg3, arg4, (uintptr_t)thread_tid(current_thread())); + kernel_debug_internal(FALSE, debugid, arg1, arg2, arg3, arg4, + (uintptr_t)thread_tid(current_thread())); } void @@ -1222,21 +1221,72 @@ kernel_debug1( uintptr_t arg4, uintptr_t arg5) { - kernel_debug_internal(debugid, arg1, arg2, arg3, arg4, arg5); + kernel_debug_internal(FALSE, debugid, arg1, arg2, arg3, arg4, arg5); +} + +void +kernel_debug_filtered( + uint32_t debugid, + uintptr_t arg1, + uintptr_t arg2, + uintptr_t arg3, + uintptr_t arg4) +{ + kernel_debug_internal(TRUE, debugid, arg1, arg2, arg3, arg4, + (uintptr_t)thread_tid(current_thread())); } void -kernel_debug_string_simple(const char *message) +kernel_debug_string_early(const char *message) { uintptr_t arg[4] = {0, 0, 0, 0}; /* Stuff the message string in the args and log it. 
*/ - strncpy((char *)arg, message, MIN(sizeof(arg), strlen(message))); + strncpy((char *)arg, message, MIN(sizeof(arg), strlen(message))); KERNEL_DEBUG_EARLY( TRACE_INFO_STRING, arg[0], arg[1], arg[2], arg[3]); } +#define SIMPLE_STR_LEN (64) +static_assert(SIMPLE_STR_LEN % sizeof(uintptr_t) == 0); + +void +kernel_debug_string_simple(uint32_t eventid, const char *str) +{ + /* array of uintptr_ts simplifies emitting the string as arguments */ + uintptr_t str_buf[(SIMPLE_STR_LEN / sizeof(uintptr_t)) + 1] = { 0 }; + size_t len = strlcpy((char *)str_buf, str, SIMPLE_STR_LEN + 1); + + uintptr_t thread_id = (uintptr_t)thread_tid(current_thread()); + uint32_t debugid = eventid | DBG_FUNC_START; + + /* string can fit in a single tracepoint */ + if (len <= (4 * sizeof(uintptr_t))) { + debugid |= DBG_FUNC_END; + } + + kernel_debug_internal(FALSE, debugid, str_buf[0], + str_buf[1], + str_buf[2], + str_buf[3], thread_id); + + debugid &= KDBG_EVENTID_MASK; + int i = 4; + size_t written = 4 * sizeof(uintptr_t); + + for (; written < len; i += 4, written += 4 * sizeof(uintptr_t)) { + /* if this is the last tracepoint to be emitted */ + if ((written + (4 * sizeof(uintptr_t))) >= len) { + debugid |= DBG_FUNC_END; + } + kernel_debug_internal(FALSE, debugid, str_buf[i], + str_buf[i + 1], + str_buf[i + 2], + str_buf[i + 3], thread_id); + } +} + extern int master_cpu; /* MACH_KERNEL_PRIVATE */ /* * Used prior to start_kern_tracing() being called. @@ -1313,7 +1363,15 @@ kernel_debug_early_end(void) TRACE_LOST_EVENTS, 0, 0, 0, 0, 0); /* This trace marks the start of kernel tracing */ - kernel_debug_string_simple("early trace done"); + kernel_debug_string_early("early trace done"); +} + +void +kernel_debug_disable(void) +{ + if (kdebug_enable) { + kdbg_set_tracing_enabled(FALSE, 0); + } } /* @@ -1333,6 +1391,83 @@ kdebug_validate_debugid(uint32_t debugid) return 0; } +/* + * Support syscall SYS_kdebug_typefilter. 
+ */ +int +kdebug_typefilter(__unused struct proc* p, + struct kdebug_typefilter_args* uap, + __unused int *retval) +{ + int ret = KERN_SUCCESS; + + if (uap->addr == USER_ADDR_NULL || + uap->size == USER_ADDR_NULL) { + return EINVAL; + } + + /* + * The atomic load is to close a race window with setting the typefilter + * and memory entry values. A description follows: + * + * Thread 1 (writer) + * + * Allocate Typefilter + * Allocate MemoryEntry + * Write Global MemoryEntry Ptr + * Atomic Store (Release) Global Typefilter Ptr + * + * Thread 2 (reader, AKA us) + * + * if ((Atomic Load (Acquire) Global Typefilter Ptr) == NULL) + * return; + * + * Without the atomic store, it isn't guaranteed that the write of + * Global MemoryEntry Ptr is visible before we can see the write of + * Global Typefilter Ptr. + * + * Without the atomic load, it isn't guaranteed that the loads of + * Global MemoryEntry Ptr aren't speculated. + * + * The global pointers transition from NULL -> valid once and only once, + * and never change after becoming valid. This means that having passed + * the first atomic load test of Global Typefilter Ptr, this function + * can then safely use the remaining global state without atomic checks. + */ + if (!__c11_atomic_load((_Atomic typefilter_t *)&kdbg_typefilter, memory_order_acquire)) { + return EINVAL; + } + + assert(kdbg_typefilter_memory_entry); + + mach_vm_offset_t user_addr = 0; + vm_map_t user_map = current_map(); + + ret = mach_to_bsd_errno( + mach_vm_map(user_map, // target map + &user_addr, // [in, out] target address + TYPEFILTER_ALLOC_SIZE, // initial size + 0, // mask (alignment?) + VM_FLAGS_ANYWHERE, // flags + kdbg_typefilter_memory_entry, // port (memory entry!) + 0, // offset (in memory entry) + FALSE, // should copy + VM_PROT_READ, // cur_prot + VM_PROT_READ, // max_prot + VM_INHERIT_SHARE)); // inherit behavior on fork + + if (ret == KERN_SUCCESS) { + vm_size_t user_ptr_size = vm_map_is_64bit(user_map) ? 
8 : 4; + ret = copyout(CAST_DOWN(void *, &user_addr), uap->addr, user_ptr_size ); + + if (ret != KERN_SUCCESS) { + mach_vm_deallocate(user_map, user_addr, TYPEFILTER_ALLOC_SIZE); + } + } + + return ret; +} + /* * Support syscall SYS_kdebug_trace. U64->K32 args may get truncated in kdebug_trace64 */ @@ -1351,20 +1486,30 @@ kdebug_trace(struct proc *p, struct kdebug_trace_args *uap, int32_t *retval) } /* - * Support syscall SYS_kdebug_trace64. 64-bit args on K32 will get truncated to fit in 32-bit record format. + * Support syscall SYS_kdebug_trace64. 64-bit args on K32 will get truncated + * to fit in 32-bit record format. + * + * It is intentional that error conditions are not checked until kdebug is + * enabled. This is to match the userspace wrapper behavior, which is optimizing + * for non-error case performance. */ int kdebug_trace64(__unused struct proc *p, struct kdebug_trace64_args *uap, __unused int32_t *retval) { int err; + if ( __probable(kdebug_enable == 0) ) + return(0); + if ((err = kdebug_validate_debugid(uap->code)) != 0) { return err; } - if ( __probable(kdebug_enable == 0) ) - return(0); - - kernel_debug_internal(uap->code, (uintptr_t)uap->arg1, (uintptr_t)uap->arg2, (uintptr_t)uap->arg3, (uintptr_t)uap->arg4, (uintptr_t)thread_tid(current_thread())); + kernel_debug_internal(FALSE, uap->code, + (uintptr_t)uap->arg1, + (uintptr_t)uap->arg2, + (uintptr_t)uap->arg3, + (uintptr_t)uap->arg4, + (uintptr_t)thread_tid(current_thread())); return(0); } @@ -1409,7 +1554,7 @@ kernel_debug_string_internal(uint32_t debugid, uint64_t str_id, void *vstr, /* if the ID is being invalidated, just emit that */ if (str_id != 0 && str_len == 0) { - kernel_debug_internal(trace_debugid | DBG_FUNC_START | DBG_FUNC_END, + kernel_debug_internal(FALSE, trace_debugid | DBG_FUNC_START | DBG_FUNC_END, (uintptr_t)debugid, (uintptr_t)str_id, 0, 0, thread_id); return str_id; @@ -1427,7 +1572,7 @@ kernel_debug_string_internal(uint32_t debugid, uint64_t str_id, void *vstr, 
trace_debugid |= DBG_FUNC_END; } - kernel_debug_internal(trace_debugid, (uintptr_t)debugid, + kernel_debug_internal(FALSE, trace_debugid, (uintptr_t)debugid, (uintptr_t)str_id, str[0], str[1], thread_id); @@ -1439,10 +1584,10 @@ kernel_debug_string_internal(uint32_t debugid, uint64_t str_id, void *vstr, if ((written + (4 * sizeof(uintptr_t))) >= str_len) { trace_debugid |= DBG_FUNC_END; } - kernel_debug_internal(trace_debugid, str[i], - str[i + 1], - str[i + 2], - str[i + 3], thread_id); + kernel_debug_internal(FALSE, trace_debugid, str[i], + str[i + 1], + str[i + 2], + str[i + 3], thread_id); } return str_id; @@ -1488,44 +1633,29 @@ kdebug_current_proc_enabled(uint32_t debugid) } /* - * Returns true if the debugid is disabled by filters, and false if the + * Returns false if the debugid is disabled by filters, and true if the * debugid is allowed to be traced. A debugid may not be traced if the * typefilter disables its class and subclass, it's outside a range * check, or if it's not an allowed debugid in a value check. Trace * system events bypass this check. */ -static boolean_t +boolean_t kdebug_debugid_enabled(uint32_t debugid) { - boolean_t is_enabled = TRUE; - /* if no filtering is enabled */ if (!kd_ctrl_page.kdebug_slowcheck) { return TRUE; } - if (KDBG_EXTRACT_CLASS(debugid) == DBG_TRACE) { + if (kd_ctrl_page.kdebug_flags & KDBG_TYPEFILTER_CHECK) { + return typefilter_is_debugid_allowed(kdbg_typefilter, debugid); + } else if (KDBG_EXTRACT_CLASS(debugid) == DBG_TRACE) { return TRUE; } - if (kd_ctrl_page.kdebug_flags & KDBG_TYPEFILTER_CHECK) { - disable_preemption(); - - /* - * Recheck if typefilter is still being used. If tracing is being - * disabled, there's a 100ms sleep on the other end to keep the - * bitmap around for this check. 
- */ - if (kd_ctrl_page.kdebug_flags & KDBG_TYPEFILTER_CHECK) { - if (!(isset(type_filter_bitmap, KDBG_EXTRACT_CSC(debugid)))) { - is_enabled = FALSE; - } - } - - enable_preemption(); - } else if (kd_ctrl_page.kdebug_flags & KDBG_RANGECHECK) { + if (kd_ctrl_page.kdebug_flags & KDBG_RANGECHECK) { if (debugid < kdlog_beg || debugid > kdlog_end) { - is_enabled = FALSE; + return FALSE; } } else if (kd_ctrl_page.kdebug_flags & KDBG_VALCHECK) { if ((debugid & KDBG_EVENTID_MASK) != kdlog_value1 && @@ -1533,11 +1663,11 @@ kdebug_debugid_enabled(uint32_t debugid) (debugid & KDBG_EVENTID_MASK) != kdlog_value3 && (debugid & KDBG_EVENTID_MASK) != kdlog_value4) { - is_enabled = FALSE; + return FALSE; } } - return is_enabled; + return TRUE; } /* @@ -1571,7 +1701,7 @@ kernel_debug_string(uint32_t debugid, uint64_t *str_id, const char *str) { /* arguments to tracepoints must be word-aligned */ __attribute__((aligned(sizeof(uintptr_t)))) char str_buf[STR_BUF_SIZE]; - assert_static(sizeof(str_buf) > MAX_STR_LEN); + static_assert(sizeof(str_buf) > MAX_STR_LEN); vm_size_t len_copied; int err; @@ -1618,7 +1748,7 @@ kdebug_trace_string(__unused struct proc *p, uint64_t *retval) { __attribute__((aligned(sizeof(uintptr_t)))) char str_buf[STR_BUF_SIZE]; - assert_static(sizeof(str_buf) > MAX_STR_LEN); + static_assert(sizeof(str_buf) > MAX_STR_LEN); size_t len_copied; int err; @@ -1673,36 +1803,29 @@ kdebug_trace_string(__unused struct proc *p, static void kdbg_lock_init(void) { - if (kd_ctrl_page.kdebug_flags & KDBG_LOCKINIT) + static lck_grp_attr_t *kdebug_lck_grp_attr = NULL; + static lck_grp_t *kdebug_lck_grp = NULL; + static lck_attr_t *kdebug_lck_attr = NULL; + + if (kd_ctrl_page.kdebug_flags & KDBG_LOCKINIT) { return; - - /* - * allocate lock group attribute and group - */ - kd_trace_mtx_sysctl_grp_attr = lck_grp_attr_alloc_init(); - kd_trace_mtx_sysctl_grp = lck_grp_alloc_init("kdebug", kd_trace_mtx_sysctl_grp_attr); - - /* - * allocate the lock attribute - */ - 
kd_trace_mtx_sysctl_attr = lck_attr_alloc_init(); + } + assert(kdebug_lck_grp_attr == NULL); + kdebug_lck_grp_attr = lck_grp_attr_alloc_init(); + kdebug_lck_grp = lck_grp_alloc_init("kdebug", kdebug_lck_grp_attr); + kdebug_lck_attr = lck_attr_alloc_init(); - /* - * allocate and initialize mutex's - */ - kd_trace_mtx_sysctl = lck_mtx_alloc_init(kd_trace_mtx_sysctl_grp, kd_trace_mtx_sysctl_attr); - kds_spin_lock = lck_spin_alloc_init(kd_trace_mtx_sysctl_grp, kd_trace_mtx_sysctl_attr); - kdw_spin_lock = lck_spin_alloc_init(kd_trace_mtx_sysctl_grp, kd_trace_mtx_sysctl_attr); + kds_spin_lock = lck_spin_alloc_init(kdebug_lck_grp, kdebug_lck_attr); + kdw_spin_lock = lck_spin_alloc_init(kdebug_lck_grp, kdebug_lck_attr); kd_ctrl_page.kdebug_flags |= KDBG_LOCKINIT; } - int kdbg_bootstrap(boolean_t early_trace) { - kd_ctrl_page.kdebug_flags &= ~KDBG_WRAPPED; + kd_ctrl_page.kdebug_flags &= ~KDBG_WRAPPED; return (create_buffers(early_trace)); } @@ -1717,7 +1840,7 @@ kdbg_reinit(boolean_t early_trace) * First make sure we're not in * the middle of cutting a trace */ - kdbg_set_tracing_enabled(FALSE, KDEBUG_ENABLE_TRACE); + kernel_debug_disable(); /* * make sure the SLOW_NOLOG is seen @@ -1728,13 +1851,7 @@ kdbg_reinit(boolean_t early_trace) delete_buffers(); - if ((kd_ctrl_page.kdebug_flags & KDBG_MAPINIT) && kd_mapsize && kd_mapptr) { - kmem_free(kernel_map, (vm_offset_t)kd_mapptr, kd_mapsize); - kd_ctrl_page.kdebug_flags &= ~KDBG_MAPINIT; - kd_mapsize = 0; - kd_mapptr = NULL; - kd_mapcount = 0; - } + kdbg_clear_thread_map(); ret = kdbg_bootstrap(early_trace); RAW_file_offset = 0; @@ -1831,7 +1948,6 @@ kdbg_resolve_map(thread_t th_act, void *opaque) * * We may be reporting data from "now", or from the "past". * - * The "now" data would be for something like kdbg_readcurcpumap(). * The "past" data would be for kdbg_readcpumap(). 
* * If we do not pass both iops and cpu_count, and iops is NULL, this function @@ -1855,6 +1971,7 @@ kdbg_cpumap_init_internal(kd_iop_t* iops, uint32_t cpu_count, uint8_t** cpumap, if (kmem_alloc(kernel_map, (vm_offset_t*)cpumap, (vm_size_t)*cpumap_size, VM_KERN_MEMORY_DIAG) != KERN_SUCCESS) { return ENOMEM; } + bzero(*cpumap, *cpumap_size); } else if (bytes_available < bytes_needed) { return EINVAL; } @@ -1870,7 +1987,6 @@ kdbg_cpumap_init_internal(kd_iop_t* iops, uint32_t cpu_count, uint8_t** cpumap, while (iops) { cpus[index].cpu_id = iops->cpu_id; cpus[index].flags = KDBG_CPUMAP_IS_IOP; - bzero(cpus[index].name, sizeof(cpus->name)); strlcpy(cpus[index].name, iops->callback.iop_name, sizeof(cpus->name)); iops = iops->next; @@ -1880,7 +1996,6 @@ kdbg_cpumap_init_internal(kd_iop_t* iops, uint32_t cpu_count, uint8_t** cpumap, while (index >= 0) { cpus[index].cpu_id = index; cpus[index].flags = 0; - bzero(cpus[index].name, sizeof(cpus->name)); strlcpy(cpus[index].name, "AP", sizeof(cpus->name)); index--; @@ -1892,40 +2007,35 @@ kdbg_cpumap_init_internal(kd_iop_t* iops, uint32_t cpu_count, uint8_t** cpumap, void kdbg_thrmap_init(void) { - if (kd_ctrl_page.kdebug_flags & KDBG_MAPINIT) + lck_mtx_assert(ktrace_lock, LCK_MTX_ASSERT_OWNED); + + if (kd_ctrl_page.kdebug_flags & KDBG_MAPINIT) { return; + } kd_mapptr = kdbg_thrmap_init_internal(0, &kd_mapsize, &kd_mapcount); - if (kd_mapptr) + if (kd_mapptr) { kd_ctrl_page.kdebug_flags |= KDBG_MAPINIT; + } } - -kd_threadmap* kdbg_thrmap_init_internal(unsigned int count, unsigned int *mapsize, unsigned int *mapcount) +static kd_threadmap * +kdbg_thrmap_init_internal(unsigned int count, unsigned int *mapsize, unsigned int *mapcount) { - kd_threadmap *mapptr; - struct proc *p; - struct krt akrt; - int tts_count; /* number of task-to-string structures */ - struct tts *tts_mapptr; - unsigned int tts_mapsize = 0; - int i; - vm_offset_t kaddr; + kd_threadmap *mapptr; + proc_t p; + struct krt akrt; + int tts_count = 0; /* number of 
task-to-string structures */ + struct tts *tts_mapptr; + unsigned int tts_mapsize = 0; + vm_offset_t kaddr; - /* - * need to use PROC_SCANPROCLIST with proc_iterate - */ - proc_list_lock(); + assert(mapsize != NULL); + assert(mapcount != NULL); - /* - * Calculate the sizes of map buffers - */ - for (p = allproc.lh_first, *mapcount=0, tts_count=0; p; p = p->p_list.le_next) { - *mapcount += get_task_numacts((task_t)p->task); - tts_count++; - } - proc_list_unlock(); + *mapcount = threads_count; + tts_count = tasks_count; /* * The proc count could change during buffer allocation, @@ -1933,19 +2043,21 @@ kd_threadmap* kdbg_thrmap_init_internal(unsigned int count, unsigned int *mapsiz * buffer sizes. This gives new tasks some chance of * making into the tables. Bump up by 25%. */ - *mapcount += *mapcount/4; - tts_count += tts_count/4; + *mapcount += *mapcount / 4; + tts_count += tts_count / 4; *mapsize = *mapcount * sizeof(kd_threadmap); - if (count && count < *mapcount) - return (0); + if (count && count < *mapcount) { + return 0; + } if ((kmem_alloc(kernel_map, &kaddr, (vm_size_t)*mapsize, VM_KERN_MEMORY_DIAG) == KERN_SUCCESS)) { bzero((void *)kaddr, *mapsize); mapptr = (kd_threadmap *)kaddr; - } else - return (0); + } else { + return 0; + } tts_mapsize = tts_count * sizeof(struct tts); @@ -1955,28 +2067,28 @@ kd_threadmap* kdbg_thrmap_init_internal(unsigned int count, unsigned int *mapsiz } else { kmem_free(kernel_map, (vm_offset_t)mapptr, *mapsize); - return (0); + return 0; } - /* - * We need to save the procs command string - * and take a reference for each task associated - * with a valid process - */ - - proc_list_lock(); /* - * should use proc_iterate + * Save the proc's name and take a reference for each task associated + * with a valid process. 
*/ - for (p = allproc.lh_first, i=0; p && i < tts_count; p = p->p_list.le_next) { - if (p->p_lflag & P_LEXIT) - continue; + proc_list_lock(); + int i = 0; + ALLPROC_FOREACH(p) { + if (i >= tts_count) { + break; + } + if (p->p_lflag & P_LEXIT) { + continue; + } if (p->task) { task_reference(p->task); tts_mapptr[i].task = p->task; - tts_mapptr[i].pid = p->p_pid; - (void)strlcpy(tts_mapptr[i].task_comm, p->p_comm, sizeof(tts_mapptr[i].task_comm)); + tts_mapptr[i].pid = p->p_pid; + (void)strlcpy(tts_mapptr[i].task_comm, proc_best_name(p), sizeof(tts_mapptr[i].task_comm)); i++; } } @@ -1990,17 +2102,17 @@ kd_threadmap* kdbg_thrmap_init_internal(unsigned int count, unsigned int *mapsiz akrt.map = mapptr; akrt.count = 0; akrt.maxcount = *mapcount; - + for (i = 0; i < tts_count; i++) { akrt.atts = &tts_mapptr[i]; task_act_iterate_wth_args(tts_mapptr[i].task, kdbg_resolve_map, &akrt); - task_deallocate((task_t) tts_mapptr[i].task); + task_deallocate((task_t)tts_mapptr[i].task); } kmem_free(kernel_map, (vm_offset_t)tts_mapptr, tts_mapsize); *mapcount = akrt.count; - return (mapptr); + return mapptr; } static void @@ -2011,7 +2123,7 @@ kdbg_clear(void) * First make sure we're not in * the middle of cutting a trace */ - kdbg_set_tracing_enabled(FALSE, KDEBUG_ENABLE_TRACE); + kernel_debug_disable(); kdbg_disable_typefilter(); /* @@ -2021,28 +2133,46 @@ kdbg_clear(void) */ IOSleep(100); - global_state_pid = -1; + /* reset kdebug state for each process */ + if (kd_ctrl_page.kdebug_flags & (KDBG_PIDCHECK | KDBG_PIDEXCLUDE)) { + proc_list_lock(); + proc_t p; + ALLPROC_FOREACH(p) { + p->p_kdebug = 0; + } + proc_list_unlock(); + } + kd_ctrl_page.kdebug_flags &= (unsigned int)~KDBG_CKTYPES; kd_ctrl_page.kdebug_flags &= ~(KDBG_NOWRAP | KDBG_RANGECHECK | KDBG_VALCHECK); kd_ctrl_page.kdebug_flags &= ~(KDBG_PIDCHECK | KDBG_PIDEXCLUDE); - - kdbg_deallocate_typefilter(); + + kd_ctrl_page.oldest_time = 0; + delete_buffers(); nkdbufs = 0; /* Clean up the thread map buffer */ - 
kd_ctrl_page.kdebug_flags &= ~KDBG_MAPINIT; - if (kd_mapptr) { - kmem_free(kernel_map, (vm_offset_t)kd_mapptr, kd_mapsize); - kd_mapptr = (kd_threadmap *) 0; - } - kd_mapsize = 0; - kd_mapcount = 0; + kdbg_clear_thread_map(); RAW_file_offset = 0; RAW_file_written = 0; } +void +kdebug_reset(void) +{ + lck_mtx_assert(ktrace_lock, LCK_MTX_ASSERT_OWNED); + + kdbg_lock_init(); + + kdbg_clear(); + if (kdbg_typefilter) { + typefilter_reject_all(kdbg_typefilter); + typefilter_allow_class(kdbg_typefilter, DBG_TRACE); + } +} + int kdbg_setpid(kd_regtype *kdr) { @@ -2053,7 +2183,7 @@ kdbg_setpid(kd_regtype *kdr) pid = (pid_t)kdr->value1; flag = (int)kdr->value2; - if (pid > 0) { + if (pid >= 0) { if ((p = proc_find(pid)) == NULL) ret = ESRCH; else { @@ -2095,7 +2225,7 @@ kdbg_setpidex(kd_regtype *kdr) pid = (pid_t)kdr->value1; flag = (int)kdr->value2; - if (pid > 0) { + if (pid >= 0) { if ((p = proc_find(pid)) == NULL) ret = ESRCH; else { @@ -2126,98 +2256,130 @@ kdbg_setpidex(kd_regtype *kdr) return(ret); } +/* + * The following functions all operate on the "global" typefilter singleton. + */ /* - * This is for setting a maximum decrementer value + * The tf param is optional, you may pass either a valid typefilter or NULL. + * If you pass a valid typefilter, you release ownership of that typefilter. 
*/ -int -kdbg_setrtcdec(kd_regtype *kdr) +static int +kdbg_initialize_typefilter(typefilter_t tf) { - int ret = 0; - natural_t decval; + lck_mtx_assert(ktrace_lock, LCK_MTX_ASSERT_OWNED); + assert(!kdbg_typefilter); + assert(!kdbg_typefilter_memory_entry); + typefilter_t deallocate_tf = NULL; - decval = (natural_t)kdr->value1; + if (!tf && ((tf = deallocate_tf = typefilter_create()) == NULL)) { + return ENOMEM; + } - if (decval && decval < KDBG_MINRTCDEC) - ret = EINVAL; - else - ret = ENOTSUP; + if ((kdbg_typefilter_memory_entry = typefilter_create_memory_entry(tf)) == MACH_PORT_NULL) { + if (deallocate_tf) { + typefilter_deallocate(deallocate_tf); + } + return ENOMEM; + } - return(ret); + /* + * The atomic store closes a race window with + * the kdebug_typefilter syscall, which assumes + * that any non-null kdbg_typefilter means a + * valid memory_entry is available. + */ + __c11_atomic_store(((_Atomic typefilter_t*)&kdbg_typefilter), tf, memory_order_release); + + return KERN_SUCCESS; } -int -kdbg_enable_typefilter(void) +static int +kdbg_copyin_typefilter(user_addr_t addr, size_t size) { - int ret; + int ret = ENOMEM; + typefilter_t tf; - /* Allocate memory for bitmap if not already allocated */ - ret = kdbg_allocate_typefilter(); - if (ret) { - return ret; + lck_mtx_assert(ktrace_lock, LCK_MTX_ASSERT_OWNED); + + if (size != KDBG_TYPEFILTER_BITMAP_SIZE) { + return EINVAL; + } + + if ((tf = typefilter_create())) { + if ((ret = copyin(addr, tf, KDBG_TYPEFILTER_BITMAP_SIZE)) == 0) { + /* The kernel typefilter must always allow DBG_TRACE */ + typefilter_allow_class(tf, DBG_TRACE); + + /* + * If this is the first typefilter; claim it. + * Otherwise copy and deallocate. + * + * Allocating a typefilter for the copyin allows + * the kernel to hold the invariant that DBG_TRACE + * must always be allowed. 
+ */ + if (!kdbg_typefilter) { + if ((ret = kdbg_initialize_typefilter(tf))) { + return ret; + } + tf = NULL; + } else { + typefilter_copy(kdbg_typefilter, tf); + } + + kdbg_enable_typefilter(); + kdbg_iop_list_callback(kd_ctrl_page.kdebug_iops, KD_CALLBACK_TYPEFILTER_CHANGED, kdbg_typefilter); + } + + if (tf) + typefilter_deallocate(tf); } - /* Turn off range and value checks */ + return ret; +} + +/* + * Enable the flags in the control page for the typefilter. Assumes that + * kdbg_typefilter has already been allocated, so events being written + * don't see a bad typefilter. + */ +static void +kdbg_enable_typefilter(void) +{ + assert(kdbg_typefilter); kd_ctrl_page.kdebug_flags &= ~(KDBG_RANGECHECK | KDBG_VALCHECK); - - /* Enable filter checking */ kd_ctrl_page.kdebug_flags |= KDBG_TYPEFILTER_CHECK; kdbg_set_flags(SLOW_CHECKS, 0, TRUE); - return 0; + commpage_update_kdebug_state(); } -int +/* + * Disable the flags in the control page for the typefilter. The typefilter + * may be safely deallocated shortly after this function returns. 
+ */ +static void kdbg_disable_typefilter(void) { - /* Disable filter checking */ kd_ctrl_page.kdebug_flags &= ~KDBG_TYPEFILTER_CHECK; - /* Turn off slow checks unless pid checks are using them */ - if ( (kd_ctrl_page.kdebug_flags & (KDBG_PIDCHECK | KDBG_PIDEXCLUDE)) ) + if ((kd_ctrl_page.kdebug_flags & (KDBG_PIDCHECK | KDBG_PIDEXCLUDE))) { kdbg_set_flags(SLOW_CHECKS, 0, TRUE); - else - kdbg_set_flags(SLOW_CHECKS, 0, FALSE); - - /* typefilter bitmap will be deallocated later */ - - return 0; -} - -static int -kdbg_allocate_typefilter(void) -{ - if (type_filter_bitmap == NULL) { - vm_offset_t bitmap = 0; - - if (kmem_alloc(kernel_map, &bitmap, KDBG_TYPEFILTER_BITMAP_SIZE, VM_KERN_MEMORY_DIAG) != KERN_SUCCESS) { - return ENOSPC; - } - - bzero((void *)bitmap, KDBG_TYPEFILTER_BITMAP_SIZE); - - if (!OSCompareAndSwapPtr(NULL, (void *)bitmap, &type_filter_bitmap)) { - kmem_free(kernel_map, bitmap, KDBG_TYPEFILTER_BITMAP_SIZE); - return 0; /* someone assigned a buffer */ - } } else { - bzero(type_filter_bitmap, KDBG_TYPEFILTER_BITMAP_SIZE); + kdbg_set_flags(SLOW_CHECKS, 0, FALSE); } - - return 0; + commpage_update_kdebug_state(); } -static int -kdbg_deallocate_typefilter(void) +uint32_t +kdebug_commpage_state(void) { - if(type_filter_bitmap) { - vm_offset_t bitmap = (vm_offset_t)type_filter_bitmap; - - if (OSCompareAndSwapPtr((void *)bitmap, NULL, &type_filter_bitmap)) { - kmem_free(kernel_map, bitmap, KDBG_TYPEFILTER_BITMAP_SIZE); - return 0; - } else { - /* already swapped */ + if (kdebug_enable) { + if (kd_ctrl_page.kdebug_flags & KDBG_TYPEFILTER_CHECK) { + return KDEBUG_COMMPAGE_ENABLE_TYPEFILTER | KDEBUG_COMMPAGE_ENABLE_TRACE; } + + return KDEBUG_COMMPAGE_ENABLE_TRACE; } return 0; @@ -2300,11 +2462,11 @@ int kdbg_write_v3_chunk_header(user_addr_t buffer, uint32_t tag, uint32_t sub_tag, uint64_t length, vnode_t vp, vfs_context_t ctx) { int ret = KERN_SUCCESS; - kd_chunk_header_v3 header; - - header.tag = tag; - header.sub_tag = sub_tag; - header.length = length; + 
kd_chunk_header_v3 header = { + .tag = tag, + .sub_tag = sub_tag, + .length = length, + }; // Check that only one of them is valid assert(!buffer ^ !vp); @@ -2333,11 +2495,11 @@ kdbg_write_v3_chunk_header(user_addr_t buffer, uint32_t tag, uint32_t sub_tag, u int kdbg_write_v3_chunk_header_to_buffer(void * buffer, uint32_t tag, uint32_t sub_tag, uint64_t length) { - kd_chunk_header_v3 header; - - header.tag = tag; - header.sub_tag = sub_tag; - header.length = length; + kd_chunk_header_v3 header = { + .tag = tag, + .sub_tag = sub_tag, + .length = length, + }; if (!buffer) { return 0; @@ -2377,7 +2539,11 @@ kdbg_write_v3_chunk_to_fd(uint32_t tag, uint32_t sub_tag, uint64_t length, void if ( (vnode_getwithref(vp)) == 0 ) { RAW_file_offset = fp->f_fglob->fg_offset; - kd_chunk_header_v3 chunk_header = { .tag = tag, .sub_tag = sub_tag, .length = length }; + kd_chunk_header_v3 chunk_header = { + .tag = tag, + .sub_tag = sub_tag, + .length = length, + }; int ret = kdbg_write_to_vnode((caddr_t) &chunk_header, sizeof(kd_chunk_header_v3), vp, &context, RAW_file_offset); if (!ret) { @@ -2436,7 +2602,6 @@ int kdbg_write_v3_header(user_addr_t user_header, size_t *user_header_size, int fd) { int ret = KERN_SUCCESS; - kd_header_v3 header; uint8_t* cpumap = 0; uint32_t cpumap_size = 0; @@ -2471,36 +2636,32 @@ kdbg_write_v3_header(user_addr_t user_header, size_t *user_header_size, int fd) } thrmap_size = kd_mapcount * sizeof(kd_threadmap); - // Setup the header. - // See v3 header description in sys/kdebug.h for more inforamtion. 
- - header.tag = RAW_VERSION3; - header.sub_tag = V3_HEADER_VERSION; - header.length = ( sizeof(kd_header_v3) + cpumap_size - sizeof(kd_cpumap_header)); - mach_timebase_info_data_t timebase = {0, 0}; clock_timebase_info(&timebase); - header.timebase_numer = timebase.numer; - header.timebase_denom = timebase.denom; - header.timestamp = 0; - header.walltime_secs = 0; - header.walltime_usecs = 0; - header.timezone_minuteswest = 0; - header.timezone_dst = 0; - -#if defined __LP64__ - header.flags = 1; + + // Setup the header. + // See v3 header description in sys/kdebug.h for more inforamtion. + kd_header_v3 header = { + .tag = RAW_VERSION3, + .sub_tag = V3_HEADER_VERSION, + .length = (sizeof(kd_header_v3) + cpumap_size - sizeof(kd_cpumap_header)), + .timebase_numer = timebase.numer, + .timebase_denom = timebase.denom, + .timestamp = 0, /* FIXME rdar://problem/22053009 */ + .walltime_secs = 0, + .walltime_usecs = 0, + .timezone_minuteswest = 0, + .timezone_dst = 0, +#if defined(__LP64__) + .flags = 1, #else - header.flags = 0; + .flags = 0, #endif + }; // If its a buffer, check if we have enough space to copy the header and the maps. 
if (user_header) { bytes_needed = header.length + thrmap_size + (2 * sizeof(kd_chunk_header_v3)); - if ( !user_header_size ) { - ret = EINVAL; - goto bail; - } if (*user_header_size < bytes_needed) { ret = EINVAL; goto bail; @@ -2632,32 +2793,44 @@ kdbg_readcurthrmap(user_addr_t buffer, size_t *bufsize) } static int -kdbg_write_v1_plus_header(uint32_t count, vnode_t vp, vfs_context_t ctx) +kdbg_write_v1_header(boolean_t write_thread_map, vnode_t vp, vfs_context_t ctx) { int ret = 0; - RAW_header header; - clock_sec_t secs; - clock_usec_t usecs; - char *pad_buf; + RAW_header header; + clock_sec_t secs; + clock_usec_t usecs; + char *pad_buf; uint32_t pad_size; uint32_t extra_thread_count = 0; uint32_t cpumap_size; - unsigned int mapsize = kd_mapcount * sizeof(kd_threadmap); + size_t map_size = 0; + size_t map_count = 0; + + if (write_thread_map) { + assert(kd_ctrl_page.kdebug_flags & KDBG_MAPINIT); + map_count = kd_mapcount; + map_size = map_count * sizeof(kd_threadmap); + } + + /* + * Without the buffers initialized, we cannot construct a CPU map or a + * thread map, and cannot write a header. + */ + if (!(kd_ctrl_page.kdebug_flags & KDBG_BUFINIT)) { + return EINVAL; + } /* - * To write a RAW_VERSION1+ file, we - * must embed a cpumap in the "padding" - * used to page align the events following - * the threadmap. If the threadmap happens - * to not require enough padding, we - * artificially increase its footprint - * until it needs enough padding. + * To write a RAW_VERSION1+ file, we must embed a cpumap in the + * "padding" used to page align the events following the threadmap. If + * the threadmap happens to not require enough padding, we artificially + * increase its footprint until it needs enough padding. 
*/ - assert(vp); - assert(ctx); + assert(vp); + assert(ctx); - pad_size = PAGE_16KB - ((sizeof(RAW_header) + (count * sizeof(kd_threadmap))) & PAGE_MASK_64); + pad_size = PAGE_16KB - ((sizeof(RAW_header) + map_size) & PAGE_MASK_64); cpumap_size = sizeof(kd_cpumap_header) + kd_ctrl_page.kdebug_cpus * sizeof(kd_cpumap); if (cpumap_size > pad_size) { @@ -2665,7 +2838,8 @@ kdbg_write_v1_plus_header(uint32_t count, vnode_t vp, vfs_context_t ctx) * we increase the pad_size by 16K. We do this so that the event * data is always available on a page aligned boundary for both * 4k and 16k systems. We enforce this alignment for the event - * data so that we can take advantage of optimized file/disk writes.*/ + * data so that we can take advantage of optimized file/disk writes. + */ pad_size += PAGE_16KB; } @@ -2680,28 +2854,36 @@ kdbg_write_v1_plus_header(uint32_t count, vnode_t vp, vfs_context_t ctx) extra_thread_count = (pad_size / sizeof(kd_threadmap)) + 1; } + memset(&header, 0, sizeof(header)); header.version_no = RAW_VERSION1; - header.thread_count = count + extra_thread_count; + header.thread_count = map_count + extra_thread_count; clock_get_calendar_microtime(&secs, &usecs); header.TOD_secs = secs; header.TOD_usecs = usecs; ret = vn_rdwr(UIO_WRITE, vp, (caddr_t)&header, sizeof(RAW_header), RAW_file_offset, - UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, vfs_context_ucred(ctx), (int *) 0, vfs_context_proc(ctx)); - if (ret) + UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, vfs_context_ucred(ctx), (int *) 0, vfs_context_proc(ctx)); + if (ret) { goto write_error; + } RAW_file_offset += sizeof(RAW_header); + RAW_file_written += sizeof(RAW_header); - ret = vn_rdwr(UIO_WRITE, vp, (caddr_t)kd_mapptr, mapsize, RAW_file_offset, - UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, vfs_context_ucred(ctx), (int *) 0, vfs_context_proc(ctx)); - if (ret) - goto write_error; - RAW_file_offset += mapsize; + if (write_thread_map) { + ret = vn_rdwr(UIO_WRITE, vp, (caddr_t)kd_mapptr, map_size, RAW_file_offset, + 
UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, vfs_context_ucred(ctx), (int *) 0, vfs_context_proc(ctx)); + if (ret) { + goto write_error; + } + + RAW_file_offset += map_size; + RAW_file_written += map_size; + } if (extra_thread_count) { pad_size = extra_thread_count * sizeof(kd_threadmap); - pad_buf = (char *)kalloc(pad_size); + pad_buf = kalloc(pad_size); if (!pad_buf) { ret = ENOMEM; goto write_error; @@ -2709,13 +2891,14 @@ kdbg_write_v1_plus_header(uint32_t count, vnode_t vp, vfs_context_t ctx) memset(pad_buf, 0, pad_size); ret = vn_rdwr(UIO_WRITE, vp, (caddr_t)pad_buf, pad_size, RAW_file_offset, - UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, vfs_context_ucred(ctx), (int *) 0, vfs_context_proc(ctx)); + UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, vfs_context_ucred(ctx), (int *) 0, vfs_context_proc(ctx)); kfree(pad_buf, pad_size); - - if (ret) + if (ret) { goto write_error; - RAW_file_offset += pad_size; + } + RAW_file_offset += pad_size; + RAW_file_written += pad_size; } pad_size = PAGE_SIZE - (RAW_file_offset & PAGE_MASK_64); @@ -2740,201 +2923,244 @@ kdbg_write_v1_plus_header(uint32_t count, vnode_t vp, vfs_context_t ctx) ret = vn_rdwr(UIO_WRITE, vp, (caddr_t)pad_buf, pad_size, RAW_file_offset, UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, vfs_context_ucred(ctx), (int *) 0, vfs_context_proc(ctx)); kfree(pad_buf, pad_size); - - if (ret) + if (ret) { goto write_error; + } + RAW_file_offset += pad_size; + RAW_file_written += pad_size; } - RAW_file_written += sizeof(RAW_header) + mapsize + pad_size; write_error: return ret; } -int -kdbg_readthrmap(user_addr_t buffer, size_t *number, vnode_t vp, vfs_context_t ctx) +static void +kdbg_clear_thread_map(void) { + lck_mtx_assert(ktrace_lock, LCK_MTX_ASSERT_OWNED); + + if (kd_ctrl_page.kdebug_flags & KDBG_MAPINIT) { + assert(kd_mapptr != NULL); + kmem_free(kernel_map, (vm_offset_t)kd_mapptr, kd_mapsize); + kd_mapptr = NULL; + kd_mapsize = 0; + kd_mapcount = 0; + kd_ctrl_page.kdebug_flags &= ~KDBG_MAPINIT; + } +} - int avail = 0; +/* + * Write out a 
version 1 header and the thread map, if it is initialized, to a + * vnode. Used by KDWRITEMAP and kdbg_dump_trace_to_file. + * + * Returns write errors from vn_rdwr if a write fails. Returns ENODATA if the + * thread map has not been initialized, but the header will still be written. + * Returns ENOMEM if padding could not be allocated. Returns 0 otherwise. + */ +static int +kdbg_write_thread_map(vnode_t vp, vfs_context_t ctx) +{ int ret = 0; - uint32_t count = 0; - unsigned int mapsize; + boolean_t map_initialized; - if ((!vp && !buffer) || (vp && buffer)) { - return EINVAL; + lck_mtx_assert(ktrace_lock, LCK_MTX_ASSERT_OWNED); + assert(ctx != NULL); + + map_initialized = (kd_ctrl_page.kdebug_flags & KDBG_MAPINIT); + + ret = kdbg_write_v1_header(map_initialized, vp, ctx); + if (ret == 0) { + if (map_initialized) { + kdbg_clear_thread_map(); + } else { + ret = ENODATA; + } } - assert(number); - assert((vp == NULL) || (ctx != NULL)); + return ret; +} - avail = *number; - count = avail/sizeof (kd_threadmap); - mapsize = kd_mapcount * sizeof(kd_threadmap); +/* + * Copy out the thread map to a user space buffer. Used by KDTHRMAP. + * + * Returns copyout errors if the copyout fails. Returns ENODATA if the thread + * map has not been initialized. Returns EINVAL if the buffer provided is not + * large enough for the entire thread map. Returns 0 otherwise. 
+ */ +static int +kdbg_copyout_thread_map(user_addr_t buffer, size_t *buffer_size) +{ + boolean_t map_initialized; + size_t map_size; + int ret = 0; - if (count && (count <= kd_mapcount)) { - if ((kd_ctrl_page.kdebug_flags & KDBG_MAPINIT) && kd_mapsize && kd_mapptr) { - if (*number < mapsize) - ret = EINVAL; - else { - if (vp) { - ret = kdbg_write_v1_plus_header(count, vp, ctx); - if (ret) - goto write_error; - } - else { - if (copyout(kd_mapptr, buffer, mapsize)) - ret = EINVAL; - } - } - } - else - ret = EINVAL; + lck_mtx_assert(ktrace_lock, LCK_MTX_ASSERT_OWNED); + assert(buffer_size != NULL); + + map_initialized = (kd_ctrl_page.kdebug_flags & KDBG_MAPINIT); + if (!map_initialized) { + return ENODATA; } - else - ret = EINVAL; - if (ret && vp) - { - count = 0; + map_size = kd_mapcount * sizeof(kd_threadmap); + if (*buffer_size < map_size) { + return EINVAL; + } - ret = kdbg_write_to_vnode((caddr_t)&count, sizeof(uint32_t), vp, ctx, RAW_file_offset); - if (!ret) { - RAW_file_offset += sizeof(uint32_t); - RAW_file_written += sizeof(uint32_t); - } + ret = copyout(kd_mapptr, buffer, map_size); + if (ret == 0) { + kdbg_clear_thread_map(); } -write_error: - if ((kd_ctrl_page.kdebug_flags & KDBG_MAPINIT) && kd_mapsize && kd_mapptr) - { - kmem_free(kernel_map, (vm_offset_t)kd_mapptr, kd_mapsize); - kd_ctrl_page.kdebug_flags &= ~KDBG_MAPINIT; - kd_mapsize = 0; - kd_mapptr = (kd_threadmap *) 0; - kd_mapcount = 0; - } - return(ret); + + return ret; } int -kdbg_readthrmap_v3(user_addr_t buffer, size_t *number, int fd) +kdbg_readthrmap_v3(user_addr_t buffer, size_t buffer_size, int fd) { - int avail = 0; int ret = 0; - uint32_t count = 0; - unsigned int mapsize; + boolean_t map_initialized; + size_t map_size; + + lck_mtx_assert(ktrace_lock, LCK_MTX_ASSERT_OWNED); if ((!fd && !buffer) || (fd && buffer)) { return EINVAL; } - assert(number); + map_initialized = (kd_ctrl_page.kdebug_flags & KDBG_MAPINIT); + map_size = kd_mapcount * sizeof(kd_threadmap); - avail = *number; - count 
= avail/sizeof (kd_threadmap); - mapsize = kd_mapcount * sizeof(kd_threadmap); + if (map_initialized && (buffer_size >= map_size)) + { + ret = kdbg_write_v3_header(buffer, &buffer_size, fd); - if (count && (count <= kd_mapcount)) { - if ((kd_ctrl_page.kdebug_flags & KDBG_MAPINIT) && kd_mapsize && kd_mapptr) { - if (*number < mapsize) { - ret = EINVAL; - } - else { - ret = kdbg_write_v3_header(buffer, number, fd); - if (ret) { - goto write_error; - } - } - } - else { - ret = EINVAL; + if (ret == 0) { + kdbg_clear_thread_map(); } - } - else { + } else { ret = EINVAL; } -write_error: - if ((kd_ctrl_page.kdebug_flags & KDBG_MAPINIT) && kd_mapsize && kd_mapptr) { - kmem_free(kernel_map, (vm_offset_t)kd_mapptr, kd_mapsize); - kd_ctrl_page.kdebug_flags &= ~KDBG_MAPINIT; - kd_mapsize = 0; - kd_mapptr = (kd_threadmap *) 0; - kd_mapcount = 0; - } - return(ret); -} + return ret; +} -static int +static void kdbg_set_nkdbufs(unsigned int value) { - /* - * We allow a maximum buffer size of 50% of either ram or max mapped address, whichever is smaller - * 'value' is the desired number of trace entries + /* + * We allow a maximum buffer size of 50% of either ram or max mapped + * address, whichever is smaller 'value' is the desired number of trace + * entries */ - unsigned int max_entries = (sane_size/2) / sizeof(kd_buf); + unsigned int max_entries = (sane_size / 2) / sizeof(kd_buf); - if (value <= max_entries) - return (value); - else - return (max_entries); + if (value <= max_entries) { + nkdbufs = value; + } else { + nkdbufs = max_entries; + } } - -static int -kdbg_enable_bg_trace(void) +/* + * Block until there are `n_storage_threshold` storage units filled with + * events or `timeout_ms` milliseconds have passed. If `locked_wait` is true, + * `ktrace_lock` is held while waiting. This is necessary while waiting to + * write events out of the buffers. + * + * Returns true if the threshold was reached and false otherwise. 
+ * + * Called with `ktrace_lock` locked and interrupts enabled. + */ +static boolean_t +kdbg_wait(uint64_t timeout_ms, boolean_t locked_wait) { - int ret = 0; + int wait_result = THREAD_AWAKENED; + uint64_t abstime = 0; - if (kdlog_bg_trace == TRUE && kdlog_bg_trace_running == FALSE && n_storage_buffers == 0) { - nkdbufs = bg_nkdbufs; - ret = kdbg_reinit(FALSE); - if (0 == ret) { - kdbg_set_tracing_enabled(TRUE, KDEBUG_ENABLE_TRACE); - kdlog_bg_trace_running = TRUE; - } - wakeup(&kdlog_bg_trace); + if (timeout_ms != 0) { + uint64_t ns = timeout_ms * NSEC_PER_MSEC; + nanoseconds_to_absolutetime(ns, &abstime); + clock_absolutetime_interval_to_deadline(abstime, &abstime); } - return ret; -} -static void -kdbg_disable_bg_trace(void) -{ - if (kdlog_bg_trace_running == TRUE) { - kdlog_bg_trace_running = FALSE; - kdbg_clear(); + boolean_t s = ml_set_interrupts_enabled(FALSE); + if (!s) { + panic("kdbg_wait() called with interrupts disabled"); } -} + lck_spin_lock(kdw_spin_lock); + if (!locked_wait) { + /* drop the mutex to allow others to access trace */ + lck_mtx_unlock(ktrace_lock); + } + while (wait_result == THREAD_AWAKENED && + kd_ctrl_page.kds_inuse_count < n_storage_threshold) + { + kds_waiter = 1; -/* - * This function is provided for the CHUD toolkit only. 
- * int val: - * zero disables kdebug_chudhook function call - * non-zero enables kdebug_chudhook function call - * char *fn: - * address of the enabled kdebug_chudhook function -*/ + if (abstime) { + wait_result = lck_spin_sleep_deadline(kdw_spin_lock, 0, &kds_waiter, THREAD_ABORTSAFE, abstime); + } else { + wait_result = lck_spin_sleep(kdw_spin_lock, 0, &kds_waiter, THREAD_ABORTSAFE); + } -void -kdbg_control_chud(int val, void *fn) + kds_waiter = 0; + } + + /* check the count under the spinlock */ + boolean_t threshold_exceeded = (kd_ctrl_page.kds_inuse_count >= n_storage_threshold); + + lck_spin_unlock(kdw_spin_lock); + ml_set_interrupts_enabled(s); + + if (!locked_wait) { + /* pick the mutex back up again */ + lck_mtx_lock(ktrace_lock); + } + + /* write out whether we've exceeded the threshold */ + return threshold_exceeded; +} + +/* + * Wakeup a thread waiting using `kdbg_wait` if there are at least + * `n_storage_threshold` storage units in use. + */ +static void +kdbg_wakeup(void) { - kdbg_lock_init(); - - if (val) { - /* enable chudhook */ - kdebug_chudhook = fn; - kdbg_set_flags(SLOW_CHUD, KDEBUG_ENABLE_CHUD, TRUE); + boolean_t need_kds_wakeup = FALSE; + + /* + * Try to take the lock here to synchronize with the waiter entering + * the blocked state. Use the try mode to prevent deadlocks caused by + * re-entering this routine due to various trace points triggered in the + * lck_spin_sleep_xxxx routines used to actually enter one of our 2 wait + * conditions. No problem if we fail, there will be lots of additional + * events coming in that will eventually succeed in grabbing this lock. 
+ */ + boolean_t s = ml_set_interrupts_enabled(FALSE); + + if (lck_spin_try_lock(kdw_spin_lock)) { + if (kds_waiter && + (kd_ctrl_page.kds_inuse_count >= n_storage_threshold)) + { + kds_waiter = 0; + need_kds_wakeup = TRUE; + } + lck_spin_unlock(kdw_spin_lock); } - else { - /* disable chudhook */ - kdbg_set_flags(SLOW_CHUD, KDEBUG_ENABLE_CHUD, FALSE); - kdebug_chudhook = 0; + + ml_set_interrupts_enabled(s); + + if (need_kds_wakeup == TRUE) { + wakeup(&kds_waiter); } } - int kdbg_control(int *name, u_int namelen, user_addr_t where, size_t *sizep) { @@ -2943,8 +3169,7 @@ kdbg_control(int *name, u_int namelen, user_addr_t where, size_t *sizep) unsigned int value = 0; kd_regtype kd_Reg; kbufinfo_t kd_bufinfo; - pid_t curpid; - proc_t p, curproc; + proc_t p; if (name[0] == KERN_KDGETENTROPY || name[0] == KERN_KDWRITETR || @@ -2954,49 +3179,68 @@ kdbg_control(int *name, u_int namelen, user_addr_t where, size_t *sizep) name[0] == KERN_KDEFLAGS || name[0] == KERN_KDDFLAGS || name[0] == KERN_KDENABLE || - name[0] == KERN_KDENABLE_BG_TRACE || - name[0] == KERN_KDSETBUF) { - - if ( namelen < 2 ) - return(EINVAL); + name[0] == KERN_KDSETBUF) + { + if (namelen < 2) { + return EINVAL; + } value = name[1]; } - + kdbg_lock_init(); + assert(kd_ctrl_page.kdebug_flags & KDBG_LOCKINIT); - if ( !(kd_ctrl_page.kdebug_flags & KDBG_LOCKINIT)) - return(ENOSPC); + lck_mtx_lock(ktrace_lock); - lck_mtx_lock(kd_trace_mtx_sysctl); + /* + * Some requests only require "read" access to kdebug trace. Regardless, + * tell ktrace that a configuration or read is occurring (and see if it's + * allowed). + */ + if (name[0] != KERN_KDGETBUF && + name[0] != KERN_KDGETREG && + name[0] != KERN_KDREADCURTHRMAP && + name[0] != KERN_KDGETENTROPY) + { + if ((ret = ktrace_configure(KTRACE_KDEBUG))) { + goto out; + } + } else { + if ((ret = ktrace_read_check())) { + goto out; + } + } switch(name[0]) { case KERN_KDGETBUF: - /* - * Does not alter the global_state_pid - * This is a passive request. 
- */ if (size < sizeof(kd_bufinfo.nkdbufs)) { /* * There is not enough room to return even * the first element of the info structure. */ ret = EINVAL; - goto out; + break; } + + memset(&kd_bufinfo, 0, sizeof(kd_bufinfo)); + kd_bufinfo.nkdbufs = nkdbufs; kd_bufinfo.nkdthreads = kd_mapcount; - + if ( (kd_ctrl_page.kdebug_slowcheck & SLOW_NOLOG) ) kd_bufinfo.nolog = 1; else kd_bufinfo.nolog = 0; - + kd_bufinfo.flags = kd_ctrl_page.kdebug_flags; #if defined(__LP64__) kd_bufinfo.flags |= KDBG_LP64; #endif - kd_bufinfo.bufid = global_state_pid; - + { + int pid = ktrace_get_owning_pid(); + kd_bufinfo.bufid = (pid == 0 ? -1 : pid); + } + if (size >= sizeof(kd_bufinfo)) { /* * Provide all the info we have @@ -3011,103 +3255,31 @@ kdbg_control(int *name, u_int namelen, user_addr_t where, size_t *sizep) if (copyout(&kd_bufinfo, where, size)) ret = EINVAL; } - goto out; + break; + case KERN_KDGETENTROPY: { /* Obsolescent - just fake with a random buffer */ char *buffer = (char *) kalloc(size); read_frandom((void *) buffer, size); ret = copyout(buffer, where, size); kfree(buffer, size); - goto out; + break; } - - case KERN_KDENABLE_BG_TRACE: - bg_nkdbufs = kdbg_set_nkdbufs(value); - kdlog_bg_trace = TRUE; - ret = kdbg_enable_bg_trace(); - goto out; - - case KERN_KDDISABLE_BG_TRACE: - kdlog_bg_trace = FALSE; - kdbg_disable_bg_trace(); - goto out; - - case KERN_KDWAIT_BG_TRACE_RESET: - if (!kdlog_bg_trace){ - ret = EINVAL; - goto out; - } - wait_result_t wait_result = assert_wait(&kdlog_bg_trace, THREAD_ABORTSAFE); - lck_mtx_unlock(kd_trace_mtx_sysctl); - if (wait_result == THREAD_WAITING) - wait_result = thread_block(THREAD_CONTINUE_NULL); - if (wait_result == THREAD_INTERRUPTED) - ret = EINTR; - lck_mtx_lock(kd_trace_mtx_sysctl); - goto out; - case KERN_KDSET_BG_TYPEFILTER: - if (!kdlog_bg_trace || !kdlog_bg_trace_running){ - ret = EINVAL; - goto out; - } - - if (size != KDBG_TYPEFILTER_BITMAP_SIZE) { - ret = EINVAL; - goto out; - } - - if ((kd_ctrl_page.kdebug_flags & 
KDBG_TYPEFILTER_CHECK) == 0){ - if ((ret = kdbg_enable_typefilter())) - goto out; - } - - if (copyin(where, type_filter_bitmap, KDBG_TYPEFILTER_BITMAP_SIZE)) { - ret = EINVAL; - goto out; - } - kdbg_iop_list_callback(kd_ctrl_page.kdebug_iops, KD_CALLBACK_TYPEFILTER_CHANGED, type_filter_bitmap); - goto out; - } - - if ((curproc = current_proc()) != NULL) - curpid = curproc->p_pid; - else { - ret = ESRCH; - goto out; - } - if (global_state_pid == -1) - global_state_pid = curpid; - else if (global_state_pid != curpid) { - if ((p = proc_find(global_state_pid)) == NULL) { - /* - * The global pid no longer exists - */ - global_state_pid = curpid; - } else { - /* - * The global pid exists, deny this request - */ - proc_rele(p); - - ret = EBUSY; - goto out; - } - } + case KERN_KDREADCURTHRMAP: + ret = kdbg_readcurthrmap(where, sizep); + break; - switch(name[0]) { case KERN_KDEFLAGS: - kdbg_disable_bg_trace(); - value &= KDBG_USERFLAGS; kd_ctrl_page.kdebug_flags |= value; break; - case KERN_KDDFLAGS: - kdbg_disable_bg_trace(); + case KERN_KDDFLAGS: value &= KDBG_USERFLAGS; kd_ctrl_page.kdebug_flags &= ~value; break; + case KERN_KDENABLE: /* * Enable tracing mechanism. 
Two types: @@ -3130,23 +3302,26 @@ kdbg_control(int *name, u_int namelen, user_addr_t where, size_t *sizep) } else { - kdbg_set_tracing_enabled(FALSE, 0); + if (!kdebug_enable) { + break; + } + + kernel_debug_disable(); } break; - case KERN_KDSETBUF: - kdbg_disable_bg_trace(); - nkdbufs = kdbg_set_nkdbufs(value); + case KERN_KDSETBUF: + kdbg_set_nkdbufs(value); break; - case KERN_KDSETUP: - kdbg_disable_bg_trace(); + case KERN_KDSETUP: ret = kdbg_reinit(FALSE); break; + case KERN_KDREMOVE: - kdbg_clear(); - ret = kdbg_enable_bg_trace(); + ktrace_reset(KTRACE_KDEBUG); break; + case KERN_KDSETREG: if(size < sizeof(kd_regtype)) { ret = EINVAL; @@ -3156,17 +3331,18 @@ kdbg_control(int *name, u_int namelen, user_addr_t where, size_t *sizep) ret = EINVAL; break; } - kdbg_disable_bg_trace(); ret = kdbg_setreg(&kd_Reg); break; + case KERN_KDGETREG: - kdbg_disable_bg_trace(); ret = EINVAL; break; + case KERN_KDREADTR: ret = kdbg_read(where, sizep, NULL, NULL, RAW_VERSION1); break; + case KERN_KDWRITETR: case KERN_KDWRITETR_V3: case KERN_KDWRITEMAP: @@ -3179,34 +3355,7 @@ kdbg_control(int *name, u_int namelen, user_addr_t where, size_t *sizep) int fd; if (name[0] == KERN_KDWRITETR || name[0] == KERN_KDWRITETR_V3) { - int s; - int wait_result = THREAD_AWAKENED; - u_int64_t abstime; - u_int64_t ns; - - if (*sizep) { - ns = ((u_int64_t)*sizep) * (u_int64_t)(1000 * 1000); - nanoseconds_to_absolutetime(ns, &abstime ); - clock_absolutetime_interval_to_deadline( abstime, &abstime ); - } else - abstime = 0; - - s = ml_set_interrupts_enabled(FALSE); - lck_spin_lock(kdw_spin_lock); - - while (wait_result == THREAD_AWAKENED && kd_ctrl_page.kds_inuse_count < n_storage_threshold) { - - kds_waiter = 1; - - if (abstime) - wait_result = lck_spin_sleep_deadline(kdw_spin_lock, 0, &kds_waiter, THREAD_ABORTSAFE, abstime); - else - wait_result = lck_spin_sleep(kdw_spin_lock, 0, &kds_waiter, THREAD_ABORTSAFE); - - kds_waiter = 0; - } - lck_spin_unlock(kdw_spin_lock); - 
ml_set_interrupts_enabled(s); + (void)kdbg_wait(size, TRUE); } p = current_proc(); fd = value; @@ -3244,10 +3393,11 @@ kdbg_control(int *name, u_int namelen, user_addr_t where, size_t *sizep) *sizep = number; } else { number = kd_mapcount * sizeof(kd_threadmap); - if (name[0] == KERN_KDWRITEMAP_V3) - kdbg_readthrmap_v3(0, &number, fd); - else - kdbg_readthrmap(0, &number, vp, &context); + if (name[0] == KERN_KDWRITEMAP_V3) { + ret = kdbg_readthrmap_v3(0, number, fd); + } else { + ret = kdbg_write_thread_map(vp, &context); + } } fp->f_fglob->fg_offset = RAW_file_offset; vnode_put(vp); @@ -3257,60 +3407,9 @@ kdbg_control(int *name, u_int namelen, user_addr_t where, size_t *sizep) break; } case KERN_KDBUFWAIT: - { - /* WRITETR lite -- just block until there's data */ - int s; - int wait_result = THREAD_AWAKENED; - u_int64_t abstime; - u_int64_t ns; - size_t number = 0; - - kdbg_disable_bg_trace(); - - - if (*sizep) { - ns = ((u_int64_t)*sizep) * (u_int64_t)(1000 * 1000); - nanoseconds_to_absolutetime(ns, &abstime ); - clock_absolutetime_interval_to_deadline( abstime, &abstime ); - } else - abstime = 0; - - s = ml_set_interrupts_enabled(FALSE); - if( !s ) - panic("trying to wait with interrupts off"); - lck_spin_lock(kdw_spin_lock); - - /* drop the mutex so don't exclude others from - * accessing trace - */ - lck_mtx_unlock(kd_trace_mtx_sysctl); - - while (wait_result == THREAD_AWAKENED && - kd_ctrl_page.kds_inuse_count < n_storage_threshold) { - - kds_waiter = 1; - - if (abstime) - wait_result = lck_spin_sleep_deadline(kdw_spin_lock, 0, &kds_waiter, THREAD_ABORTSAFE, abstime); - else - wait_result = lck_spin_sleep(kdw_spin_lock, 0, &kds_waiter, THREAD_ABORTSAFE); - - kds_waiter = 0; - } - - /* check the count under the spinlock */ - number = (kd_ctrl_page.kds_inuse_count >= n_storage_threshold); - - lck_spin_unlock(kdw_spin_lock); - ml_set_interrupts_enabled(s); - - /* pick the mutex back up again */ - lck_mtx_lock(kd_trace_mtx_sysctl); - - /* write out whether we've 
exceeded the threshold */ - *sizep = number; + *sizep = kdbg_wait(size, FALSE); break; - } + case KERN_KDPIDTR: if (size < sizeof(kd_regtype)) { ret = EINVAL; @@ -3320,10 +3419,10 @@ kdbg_control(int *name, u_int namelen, user_addr_t where, size_t *sizep) ret = EINVAL; break; } - kdbg_disable_bg_trace(); ret = kdbg_setpid(&kd_Reg); break; + case KERN_KDPIDEX: if (size < sizeof(kd_regtype)) { ret = EINVAL; @@ -3333,56 +3432,33 @@ kdbg_control(int *name, u_int namelen, user_addr_t where, size_t *sizep) ret = EINVAL; break; } - kdbg_disable_bg_trace(); ret = kdbg_setpidex(&kd_Reg); break; + case KERN_KDCPUMAP: ret = kdbg_readcpumap(where, sizep); break; + case KERN_KDTHRMAP: - ret = kdbg_readthrmap(where, sizep, NULL, NULL); - break; - case KERN_KDREADCURTHRMAP: - ret = kdbg_readcurthrmap(where, sizep); + ret = kdbg_copyout_thread_map(where, sizep); break; - case KERN_KDSETRTCDEC: - if (size < sizeof(kd_regtype)) { - ret = EINVAL; - break; - } - if (copyin(where, &kd_Reg, sizeof(kd_regtype))) { - ret = EINVAL; - break; - } - kdbg_disable_bg_trace(); - ret = kdbg_setrtcdec(&kd_Reg); + case KERN_KDSET_TYPEFILTER: { + ret = kdbg_copyin_typefilter(where, size); break; - case KERN_KDSET_TYPEFILTER: - kdbg_disable_bg_trace(); - - if (size != KDBG_TYPEFILTER_BITMAP_SIZE) { - ret = EINVAL; - break; - } - - if ((kd_ctrl_page.kdebug_flags & KDBG_TYPEFILTER_CHECK) == 0){ - if ((ret = kdbg_enable_typefilter())) - break; - } + } - if (copyin(where, type_filter_bitmap, KDBG_TYPEFILTER_BITMAP_SIZE)) { - ret = EINVAL; - break; - } - kdbg_iop_list_callback(kd_ctrl_page.kdebug_iops, KD_CALLBACK_TYPEFILTER_CHANGED, type_filter_bitmap); + case KERN_KDTEST: + ret = kdbg_test(); break; + default: ret = EINVAL; + break; } out: - lck_mtx_unlock(kd_trace_mtx_sysctl); + lck_mtx_unlock(ktrace_lock); return(ret); } @@ -3399,7 +3475,7 @@ kdbg_read(user_addr_t buffer, size_t *number, vnode_t vp, vfs_context_t ctx, uin { unsigned int count; unsigned int cpu, min_cpu; - uint64_t mintime, t, barrier 
= 0; + uint64_t barrier_min = 0, barrier_max = 0, t, earliest_time; int error = 0; kd_buf *tempbuf; uint32_t rcursor; @@ -3414,64 +3490,99 @@ kdbg_read(user_addr_t buffer, size_t *number, vnode_t vp, vfs_context_t ctx, uin uint32_t old_kdebug_slowcheck; boolean_t lostevents = FALSE; boolean_t out_of_events = FALSE; + boolean_t wrapped = FALSE; - assert(number); + assert(number); count = *number/sizeof(kd_buf); *number = 0; if (count == 0 || !(kd_ctrl_page.kdebug_flags & KDBG_BUFINIT) || kdcopybuf == 0) return EINVAL; + thread_set_eager_preempt(current_thread()); + memset(&lostevent, 0, sizeof(lostevent)); lostevent.debugid = TRACE_LOST_EVENTS; - /* Capture timestamp. Only sort events that have occured before the timestamp. - * Since the iop is being flushed here, its possible that events occur on the AP - * while running live tracing. If we are disabled, no new events should - * occur on the AP. - */ - - if (kd_ctrl_page.enabled) - { - // timestamp is non-zero value - barrier = mach_absolute_time() & KDBG_TIMESTAMP_MASK; + /* + * Capture the current time. Only sort events that have occurred + * before now. Since the IOPs are being flushed here, it is possible + * that events occur on the AP while running live tracing. If we are + * disabled, no new events should occur on the AP. + */ + if (kd_ctrl_page.enabled) { + barrier_max = mach_absolute_time() & KDBG_TIMESTAMP_MASK; } - - // Request each IOP to provide us with up to date entries before merging buffers together. - kdbg_iop_list_callback(kd_ctrl_page.kdebug_iops, KD_CALLBACK_SYNC_FLUSH, NULL); /* - * because we hold kd_trace_mtx_sysctl, no other control threads can - * be playing with kdebug_flags... the code that cuts new events could - * be running, but it grabs kds_spin_lock if it needs to acquire a new - * storage chunk which is where it examines kdebug_flags... it its adding - * to the same chunk we're reading from, no problem... 
+ * Request each IOP to provide us with up to date entries before merging + * buffers together. */ + kdbg_iop_list_callback(kd_ctrl_page.kdebug_iops, KD_CALLBACK_SYNC_FLUSH, NULL); - disable_wrap(&old_kdebug_slowcheck, &old_kdebug_flags); + /* + * Disable wrap so storage units cannot be stolen out from underneath us + * while merging events. + * + * Because we hold ktrace_lock, no other control threads can be playing + * with kdebug_flags. The code that emits new events could be running, + * but it grabs kds_spin_lock if it needs to acquire a new storage + * chunk, which is where it examines kdebug_flags. If it is adding to + * the same chunk we're reading from, check for that below. + */ + wrapped = disable_wrap(&old_kdebug_slowcheck, &old_kdebug_flags); if (count > nkdbufs) count = nkdbufs; - if ((tempbuf_count = count) > KDCOPYBUF_COUNT) - tempbuf_count = KDCOPYBUF_COUNT; + if ((tempbuf_count = count) > KDCOPYBUF_COUNT) { + tempbuf_count = KDCOPYBUF_COUNT; + } + + /* + * If the buffers have wrapped, capture the earliest time where there + * are events for all CPUs and do not emit additional lost events for + * oldest storage units. + */ + if (wrapped) { + barrier_min = kd_ctrl_page.oldest_time; + kd_ctrl_page.kdebug_flags &= ~KDBG_WRAPPED; + kd_ctrl_page.oldest_time = 0; + + for (cpu = 0, kdbp = &kdbip[0]; cpu < kd_ctrl_page.kdebug_cpus; cpu++, kdbp++) { + if ((kdsp = kdbp->kd_list_head).raw == KDS_PTR_NULL) { + continue; + } + kdsp_actual = POINTER_FROM_KDS_PTR(kdsp); + kdsp_actual->kds_lostevents = FALSE; + } + } while (count) { tempbuf = kdcopybuf; tempbuf_number = 0; - // While space + if (wrapped) { + /* Trace a single lost events event for wrapping. */ + kdbg_set_timestamp_and_cpu(&lostevent, barrier_min, 0); + *tempbuf = lostevent; + wrapped = FALSE; + goto nextevent; + } + + /* While space left in merged events scratch buffer. 
*/ while (tempbuf_count) { - mintime = 0xffffffffffffffffULL; + earliest_time = UINT64_MAX; min_kdbp = NULL; min_cpu = 0; - // Check all CPUs + /* Check each CPU's buffers. */ for (cpu = 0, kdbp = &kdbip[0]; cpu < kd_ctrl_page.kdebug_cpus; cpu++, kdbp++) { - - // Find one with raw data - if ((kdsp = kdbp->kd_list_head).raw == KDS_PTR_NULL) - continue; + /* Skip CPUs without data. */ + if ((kdsp = kdbp->kd_list_head).raw == KDS_PTR_NULL) { +next_cpu: + continue; + } /* Debugging aid: maintain a copy of the "kdsp" * index. */ @@ -3479,29 +3590,50 @@ kdbg_read(user_addr_t buffer, size_t *number, vnode_t vp, vfs_context_t ctx, uin kdsp_shadow = kdsp; - // Get from cpu data to buffer header to buffer + /* From CPU data to buffer header to buffer. */ kdsp_actual = POINTER_FROM_KDS_PTR(kdsp); volatile struct kd_storage *kdsp_actual_shadow; kdsp_actual_shadow = kdsp_actual; - // See if there are actual data left in this buffer + /* Skip buffer if there are no events left. */ rcursor = kdsp_actual->kds_readlast; - if (rcursor == kdsp_actual->kds_bufindx) + if (rcursor == kdsp_actual->kds_bufindx) { continue; + } t = kdbg_get_timestamp(&kdsp_actual->kds_records[rcursor]); - if ((t > barrier) && (barrier > 0)) { - /* - * Need to wait to flush iop again before we - * sort any more data from the buffers - */ + /* Ignore events that have aged out due to wrapping. */ + while (t < barrier_min) { + rcursor = ++kdsp_actual->kds_readlast; + + if (rcursor >= EVENTS_PER_STORAGE_UNIT) { + release_storage_unit(cpu, kdsp.raw); + + if ((kdsp = kdbp->kd_list_head).raw == KDS_PTR_NULL) { + goto next_cpu; + } + kdsp_shadow = kdsp; + kdsp_actual = POINTER_FROM_KDS_PTR(kdsp); + kdsp_actual_shadow = kdsp_actual; + rcursor = kdsp_actual->kds_readlast; + } + + t = kdbg_get_timestamp(&kdsp_actual->kds_records[rcursor]); + } + + if ((t > barrier_max) && (barrier_max > 0)) { + /* + * Need to flush IOPs again before we + * can sort any more data from the + * buffers. 
+ */ out_of_events = TRUE; break; - } + } if (t < kdsp_actual->kds_timestamp) { /* * indicates we've not yet completed filling @@ -3517,8 +3649,8 @@ kdbg_read(user_addr_t buffer, size_t *number, vnode_t vp, vfs_context_t ctx, uin out_of_events = TRUE; break; } - if (t < mintime) { - mintime = t; + if (t < earliest_time) { + earliest_time = t; min_kdbp = kdbp; min_cpu = cpu; } @@ -3531,21 +3663,10 @@ kdbg_read(user_addr_t buffer, size_t *number, vnode_t vp, vfs_context_t ctx, uin break; } - // Get data kdsp = min_kdbp->kd_list_head; kdsp_actual = POINTER_FROM_KDS_PTR(kdsp); - if (kdsp_actual->kds_lostevents == TRUE) { - kdbg_set_timestamp_and_cpu(&lostevent, kdsp_actual->kds_records[kdsp_actual->kds_readlast].timestamp, min_cpu); - *tempbuf = lostevent; - - kdsp_actual->kds_lostevents = FALSE; - lostevents = TRUE; - - goto nextevent; - } - - // Copy into buffer + /* Copy earliest event into merged events scratch buffer. */ *tempbuf = kdsp_actual->kds_records[kdsp_actual->kds_readlast++]; if (kdsp_actual->kds_readlast == EVENTS_PER_STORAGE_UNIT) @@ -3553,15 +3674,15 @@ kdbg_read(user_addr_t buffer, size_t *number, vnode_t vp, vfs_context_t ctx, uin /* * Watch for out of order timestamps - */ - if (mintime < min_kdbp->kd_prev_timebase) { + */ + if (earliest_time < min_kdbp->kd_prev_timebase) { /* * if so, use the previous timestamp + 1 cycle */ min_kdbp->kd_prev_timebase++; kdbg_set_timestamp_and_cpu(tempbuf, min_kdbp->kd_prev_timebase, kdbg_get_cpu(tempbuf)); } else - min_kdbp->kd_prev_timebase = mintime; + min_kdbp->kd_prev_timebase = earliest_time; nextevent: tempbuf_count--; tempbuf_number++; @@ -3619,326 +3740,256 @@ kdbg_read(user_addr_t buffer, size_t *number, vnode_t vp, vfs_context_t ctx, uin if ( !(old_kdebug_flags & KDBG_NOWRAP)) { enable_wrap(old_kdebug_slowcheck, lostevents); } + thread_clear_eager_preempt(current_thread()); return (error); } +static int +kdbg_test(void) +{ +#define KDEBUG_TEST_CODE(code) BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, (code)) + int 
code = 0; + + KDBG(KDEBUG_TEST_CODE(code)); code++; + KDBG(KDEBUG_TEST_CODE(code), 1); code++; + KDBG(KDEBUG_TEST_CODE(code), 1, 2); code++; + KDBG(KDEBUG_TEST_CODE(code), 1, 2, 3); code++; + KDBG(KDEBUG_TEST_CODE(code), 1, 2, 3, 4); code++; + + KDBG_RELEASE(KDEBUG_TEST_CODE(code)); code++; + KDBG_RELEASE(KDEBUG_TEST_CODE(code), 1); code++; + KDBG_RELEASE(KDEBUG_TEST_CODE(code), 1, 2); code++; + KDBG_RELEASE(KDEBUG_TEST_CODE(code), 1, 2, 3); code++; + KDBG_RELEASE(KDEBUG_TEST_CODE(code), 1, 2, 3, 4); code++; + + KDBG_FILTERED(KDEBUG_TEST_CODE(code)); code++; + KDBG_FILTERED(KDEBUG_TEST_CODE(code), 1); code++; + KDBG_FILTERED(KDEBUG_TEST_CODE(code), 1, 2); code++; + KDBG_FILTERED(KDEBUG_TEST_CODE(code), 1, 2, 3); code++; + KDBG_FILTERED(KDEBUG_TEST_CODE(code), 1, 2, 3, 4); code++; + + KDBG_DEBUG(KDEBUG_TEST_CODE(code)); code++; + KDBG_DEBUG(KDEBUG_TEST_CODE(code), 1); code++; + KDBG_DEBUG(KDEBUG_TEST_CODE(code), 1, 2); code++; + KDBG_DEBUG(KDEBUG_TEST_CODE(code), 1, 2, 3); code++; + KDBG_DEBUG(KDEBUG_TEST_CODE(code), 1, 2, 3, 4); code++; -unsigned char *getProcName(struct proc *proc); -unsigned char *getProcName(struct proc *proc) { - - return (unsigned char *) &proc->p_comm; /* Return pointer to the proc name */ - + return 0; +#undef KDEBUG_TEST_CODE } -static int -stackshot_kern_return_to_bsd_error(kern_return_t kr) +void +kdebug_boot_trace(unsigned int n_events, char *filter_desc) { - switch (kr) { - case KERN_SUCCESS: - return 0; - case KERN_RESOURCE_SHORTAGE: - return ENOMEM; - case KERN_NO_SPACE: - return ENOSPC; - case KERN_NO_ACCESS: - return EPERM; - case KERN_MEMORY_PRESENT: - return EEXIST; - case KERN_NOT_SUPPORTED: - return ENOTSUP; - case KERN_NOT_IN_SET: - return ENOENT; - default: - return EINVAL; + assert(filter_desc != NULL); + +#if (defined(__i386__) || defined(__x86_64__)) + /* only trace MACH events when outputting kdebug to serial */ + if (kdebug_serial) { + n_events = 1; + if (filter_desc[0] == '\0') { + filter_desc[0] = 'C'; + filter_desc[1] 
= '1'; + filter_desc[2] = '\0'; + } } -} - - -/* - * DEPRECATION WARNING: THIS SYSCALL IS BEING REPLACED WITH SYS_stack_snapshot_with_config and SYS_microstackshot. - * - * stack_snapshot: Obtains a coherent set of stack traces for all threads - * on the system, tracing both kernel and user stacks - * where available. Uses machine specific trace routines - * for ppc, ppc64 and x86. - * Inputs: uap->pid - process id of process to be traced, or -1 - * for the entire system - * uap->tracebuf - address of the user space destination - * buffer - * uap->tracebuf_size - size of the user space trace buffer - * uap->options - various options, including the maximum - * number of frames to trace. - * Outputs: EPERM if the caller is not privileged - * EINVAL if the supplied trace buffer isn't sanely sized - * ENOMEM if we don't have enough memory to satisfy the - * request - * ENOENT if the target pid isn't found - * ENOSPC if the supplied buffer is insufficient - * *retval contains the number of bytes traced, if successful - * and -1 otherwise. If the request failed due to - * tracebuffer exhaustion, we copyout as much as possible. - */ -int -stack_snapshot(struct proc *p, register struct stack_snapshot_args *uap, int32_t *retval) { - int error = 0; - kern_return_t kr; +#endif - if ((error = suser(kauth_cred_get(), &p->p_acflag))) - return(error); + if (log_leaks && n_events == 0) { + n_events = 200000; + } - kr = stack_snapshot2(uap->pid, uap->tracebuf, uap->tracebuf_size, uap->flags, retval); - return stackshot_kern_return_to_bsd_error(kr); + kdebug_trace_start(n_events, filter_desc, FALSE); } -/* - * stack_snapshot_with_config: Obtains a coherent set of stack traces for specified threads on the sysem, - * tracing both kernel and user stacks where available. Allocates a buffer from the - * kernel and maps the buffer into the calling task's address space. 
- * - * Inputs: uap->stackshot_config_version - version of the stackshot config that is being passed - * uap->stackshot_config - pointer to the stackshot config - * uap->stackshot_config_size- size of the stackshot config being passed - * Outputs: EINVAL if there is a problem with the arguments - * EFAULT if we failed to copy in the arguments succesfully - * EPERM if the caller is not privileged - * ENOTSUP if the caller is passing a version of arguments that is not supported by the kernel - * (indicates libsyscall:kernel mismatch) or if the caller is requesting unsupported flags - * ENOENT if the caller is requesting an existing buffer that doesn't exist or if the - * requested PID isn't found - * ENOMEM if the kernel is unable to allocate enough memory to serve the request - * ENOSPC if there isn't enough space in the caller's address space to remap the buffer - * ESRCH if the target PID isn't found - * returns KERN_SUCCESS on success - */ -int -stack_snapshot_with_config(struct proc *p, struct stack_snapshot_with_config_args *uap, __unused int *retval) +static void +kdbg_set_typefilter_string(const char *filter_desc) { - int error = 0; - kern_return_t kr; + char *end = NULL; - if ((error = suser(kauth_cred_get(), &p->p_acflag))) - return(error); + lck_mtx_assert(ktrace_lock, LCK_MTX_ASSERT_OWNED); - if((void*)uap->stackshot_config == NULL) { - return EINVAL; - } + assert(filter_desc != NULL); - switch (uap->stackshot_config_version) { - case STACKSHOT_CONFIG_TYPE: - if (uap->stackshot_config_size != sizeof(stackshot_config_t)) { - return EINVAL; - } - stackshot_config_t config; - error = copyin(uap->stackshot_config, &config, sizeof(stackshot_config_t)); - if (error != KERN_SUCCESS) - { - return EFAULT; - } - kr = kern_stack_snapshot_internal(uap->stackshot_config_version, &config, sizeof(stackshot_config_t), TRUE); - return stackshot_kern_return_to_bsd_error(kr); - default: - return ENOTSUP; + typefilter_reject_all(kdbg_typefilter); + 
typefilter_allow_class(kdbg_typefilter, DBG_TRACE); + + /* if the filter description starts with a number, assume it's a csc */ + if (filter_desc[0] >= '0' && filter_desc[0] <= '9'){ + unsigned long csc = strtoul(filter_desc, NULL, 0); + if (filter_desc != end && csc <= KDBG_CSC_MAX) { + typefilter_allow_csc(kdbg_typefilter, csc); + } + return; } -} -#if CONFIG_TELEMETRY -/* - * microstackshot: Catch all system call for microstackshot related operations, including - * enabling/disabling both global and windowed microstackshots as well - * as retrieving windowed or global stackshots and the boot profile. - * Inputs: uap->tracebuf - address of the user space destination - * buffer - * uap->tracebuf_size - size of the user space trace buffer - * uap->flags - various flags - * Outputs: EPERM if the caller is not privileged - * EINVAL if the supplied mss_args is NULL, mss_args.tracebuf is NULL or mss_args.tracebuf_size is not sane - * ENOMEM if we don't have enough memory to satisfy the request - * *retval contains the number of bytes traced, if successful - * and -1 otherwise. - */ -int -microstackshot(struct proc *p, struct microstackshot_args *uap, int32_t *retval) -{ - int error = 0; - kern_return_t kr; + while (filter_desc[0] != '\0') { + unsigned long allow_value; - if ((error = suser(kauth_cred_get(), &p->p_acflag))) - return(error); + char filter_type = filter_desc[0]; + if (filter_type != 'C' && filter_type != 'S') { + return; + } + filter_desc++; - kr = stack_microstackshot(uap->tracebuf, uap->tracebuf_size, uap->flags, retval); - return stackshot_kern_return_to_bsd_error(kr); -} -#endif /* CONFIG_TELEMETRY */ + allow_value = strtoul(filter_desc, &end, 0); + if (filter_desc == end) { + /* cannot parse as integer */ + return; + } -/* - * kern_stack_snapshot_with_reason: Obtains a coherent set of stack traces for specified threads on the sysem, - * tracing both kernel and user stacks where available. 
Allocates a buffer from the - * kernel and stores the address of this buffer. - * - * Inputs: reason - the reason for triggering a stackshot (unused at the moment, but in the - * future will be saved in the stackshot) - * Outputs: EINVAL/ENOTSUP if there is a problem with the arguments - * EPERM if the caller doesn't pass at least one KERNEL stackshot flag - * ENOMEM if the kernel is unable to allocate enough memory to serve the request - * ESRCH if the target PID isn't found - * returns KERN_SUCCESS on success - */ -int -kern_stack_snapshot_with_reason(__unused char *reason) -{ - stackshot_config_t config; - kern_return_t kr; - - config.sc_pid = -1; - config.sc_flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS | STACKSHOT_SAVE_IN_KERNEL_BUFFER | - STACKSHOT_KCDATA_FORMAT); - config.sc_since_timestamp = 0; - config.sc_out_buffer_addr = 0; - config.sc_out_size_addr = 0; - - kr = kern_stack_snapshot_internal(STACKSHOT_CONFIG_TYPE, &config, sizeof(stackshot_config_t), FALSE); - return stackshot_kern_return_to_bsd_error(kr); + switch (filter_type) { + case 'C': + if (allow_value <= KDBG_CLASS_MAX) { + typefilter_allow_class(kdbg_typefilter, allow_value); + } else { + /* illegal class */ + return; + } + break; + case 'S': + if (allow_value <= KDBG_CSC_MAX) { + typefilter_allow_csc(kdbg_typefilter, allow_value); + } else { + /* illegal class subclass */ + return; + } + break; + default: + return; + } + + /* advance to next filter entry */ + filter_desc = end; + if (filter_desc[0] == ',') { + filter_desc++; + } + } } /* - * stack_snapshot_from_kernel: Stackshot function for kernel consumers who have their own buffer. 
- * - * Inputs: pid - the PID to be traced or -1 for the whole system - * buf - a pointer to the buffer where the stackshot should be written - * size - the size of the buffer - * flags - flags to be passed to the stackshot - * *bytes_traced - a pointer to be filled with the length of the stackshot - * Outputs: -1 if there is a problem with the arguments - * the error returned by the stackshot code otherwise + * This function is meant to be called from the bootstrap thread or coming out + * of acpi_idle_kernel. */ -int -stack_snapshot_from_kernel(pid_t pid, void *buf, uint32_t size, uint32_t flags, unsigned *bytes_traced) +void +kdebug_trace_start(unsigned int n_events, const char *filter_desc, + boolean_t need_map) { - kern_return_t kr; + uint32_t old1, old2; - kr = stack_snapshot_from_kernel_internal(pid, buf, size, flags, bytes_traced); - if (kr == KERN_FAILURE) { - return -1; + if (!n_events) { + return; } - return kr; -} + lck_mtx_lock(ktrace_lock); -void -start_kern_tracing(unsigned int new_nkdbufs, boolean_t need_map) -{ - - if (!new_nkdbufs) - return; - nkdbufs = kdbg_set_nkdbufs(new_nkdbufs); kdbg_lock_init(); - kernel_debug_string_simple("start_kern_tracing"); + ktrace_kernel_configure(KTRACE_KDEBUG); - if (0 == kdbg_reinit(TRUE)) { + kdbg_set_nkdbufs(n_events); - if (need_map == TRUE) { - uint32_t old1, old2; + kernel_debug_string_early("start_kern_tracing"); - kdbg_thrmap_init(); + if (kdbg_reinit(TRUE)) { + printf("error from kdbg_reinit, kernel tracing not started\n"); + goto out; + } - disable_wrap(&old1, &old2); + /* + * Wrapping is disabled because boot and wake tracing is interested in + * the earliest events, at the expense of later ones. 
+ */ + (void)disable_wrap(&old1, &old2); + + if (filter_desc && filter_desc[0] != '\0') { + if (kdbg_initialize_typefilter(NULL) == KERN_SUCCESS) { + kdbg_set_typefilter_string(filter_desc); + kdbg_enable_typefilter(); } + } - /* Hold off interrupts until the early traces are cut */ - boolean_t s = ml_set_interrupts_enabled(FALSE); + /* + * Hold off interrupts between getting a thread map and enabling trace + * and until the early traces are recorded. + */ + boolean_t s = ml_set_interrupts_enabled(FALSE); - kdbg_set_tracing_enabled( - TRUE, - kdebug_serial ? - (KDEBUG_ENABLE_TRACE | KDEBUG_ENABLE_SERIAL) : - KDEBUG_ENABLE_TRACE); + if (need_map == TRUE) { + kdbg_thrmap_init(); + } - /* - * Transfer all very early events from the static buffer - * into the real buffers. - */ - kernel_debug_early_end(); - - ml_set_interrupts_enabled(s); + kdbg_set_tracing_enabled(TRUE, kdebug_serial ? + (KDEBUG_ENABLE_TRACE | KDEBUG_ENABLE_SERIAL) : + KDEBUG_ENABLE_TRACE); - printf("kernel tracing started\n"); -#if KDEBUG_MOJO_TRACE - if (kdebug_serial) { - printf("serial output enabled with %lu named events\n", - sizeof(kd_events)/sizeof(kd_event_t)); - } -#endif - } else { - printf("error from kdbg_reinit, kernel tracing not started\n"); - } -} + /* + * Transfer all very early events from the static buffer into the real + * buffers. 
+ */ + kernel_debug_early_end(); -void -start_kern_tracing_with_typefilter(unsigned int new_nkdbufs, - boolean_t need_map, - unsigned int typefilter) -{ - /* startup tracing */ - start_kern_tracing(new_nkdbufs, need_map); + ml_set_interrupts_enabled(s); - /* check that tracing was actually enabled */ - if (!(kdebug_enable & KDEBUG_ENABLE_TRACE)) - return; + printf("kernel tracing started with %u events\n", n_events); + +#if KDEBUG_MOJO_TRACE + if (kdebug_serial) { + printf("serial output enabled with %lu named events\n", + sizeof(kd_events)/sizeof(kd_event_t)); + } +#endif - /* setup the typefiltering */ - if (0 == kdbg_enable_typefilter()) - setbit(type_filter_bitmap, - typefilter & (KDBG_CSC_MASK >> KDBG_CSC_OFFSET)); +out: + lck_mtx_unlock(ktrace_lock); } void kdbg_dump_trace_to_file(const char *filename) { - vfs_context_t ctx; - vnode_t vp; - int error; - size_t number; + vfs_context_t ctx; + vnode_t vp; + size_t write_size; + lck_mtx_lock(ktrace_lock); - if ( !(kdebug_enable & KDEBUG_ENABLE_TRACE)) - return; + if (!(kdebug_enable & KDEBUG_ENABLE_TRACE)) { + goto out; + } - if (global_state_pid != -1) { - if ((proc_find(global_state_pid)) != NULL) { - /* - * The global pid exists, we're running - * due to fs_usage, latency, etc... - * don't cut the panic/shutdown trace file - * Disable tracing from this point to avoid - * perturbing state. - */ - kdebug_enable = 0; - kd_ctrl_page.enabled = 0; - commpage_update_kdebug_enable(); - return; - } + if (ktrace_get_owning_pid() != 0) { + /* + * Another process owns ktrace and is still active, disable tracing to + * capture whatever was being recorded. 
+ */ + kdebug_enable = 0; + kd_ctrl_page.enabled = 0; + commpage_update_kdebug_state(); + goto out; } + KERNEL_DEBUG_CONSTANT(TRACE_PANIC | DBG_FUNC_NONE, 0, 0, 0, 0, 0); kdebug_enable = 0; kd_ctrl_page.enabled = 0; - commpage_update_kdebug_enable(); + commpage_update_kdebug_state(); ctx = vfs_context_kernel(); - if ((error = vnode_open(filename, (O_CREAT | FWRITE | O_NOFOLLOW), 0600, 0, &vp, ctx))) - return; + if (vnode_open(filename, (O_CREAT | FWRITE | O_NOFOLLOW), 0600, 0, &vp, ctx)) { + goto out; + } - number = kd_mapcount * sizeof(kd_threadmap); - kdbg_readthrmap(0, &number, vp, ctx); + kdbg_write_thread_map(vp, ctx); - number = nkdbufs*sizeof(kd_buf); - kdbg_read(0, &number, vp, ctx, RAW_VERSION1); - - vnode_close(vp, FWRITE, ctx); + write_size = nkdbufs * sizeof(kd_buf); + kdbg_read(0, &write_size, vp, ctx, RAW_VERSION1); + vnode_close(vp, FWRITE, ctx); sync(current_proc(), (void *)NULL, (int *)NULL); + +out: + lck_mtx_unlock(ktrace_lock); } /* Helper function for filling in the BSD name for an address space @@ -4103,4 +4154,5 @@ kdebug_serial_print( kprintf("%s", kprintf_line); kd_last_timstamp = timestamp; } + #endif diff --git a/bsd/kern/kern_aio.c b/bsd/kern/kern_aio.c index 44c956e9b..3869ad669 100644 --- a/bsd/kern/kern_aio.c +++ b/bsd/kern/kern_aio.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2014 Apple Inc. All rights reserved. + * Copyright (c) 2003-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -304,7 +304,7 @@ aio_workq_init(aio_workq_t wq) TAILQ_INIT(&wq->aioq_entries); wq->aioq_count = 0; lck_mtx_init(&wq->aioq_mtx, aio_queue_lock_grp, aio_lock_attr); - waitq_init(&wq->aioq_waitq, SYNC_POLICY_FIFO|SYNC_POLICY_DISABLE_IRQ); + waitq_init(&wq->aioq_waitq, SYNC_POLICY_FIFO); } @@ -1654,8 +1654,9 @@ lio_listio(proc_t p, struct lio_listio_args *uap, int *retval ) * we get a wake up call on sleep channel &aio_anchor.aio_async_workq * after new work is queued up. 
*/ +__attribute__((noreturn)) static void -aio_work_thread( void ) +aio_work_thread(void) { aio_workq_entry *entryp; int error; diff --git a/bsd/kern/kern_authorization.c b/bsd/kern/kern_authorization.c index 91c2305c8..630a4c100 100644 --- a/bsd/kern/kern_authorization.c +++ b/bsd/kern/kern_authorization.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2011 Apple Inc. All rights reserved. + * Copyright (c) 2004-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -596,7 +596,6 @@ kauth_authorize_generic_callback(kauth_cred_t credential, __unused void *idata, /* XXX == 0 ? */ return((kauth_cred_getuid(credential) == 0) ? KAUTH_RESULT_ALLOW : KAUTH_RESULT_DENY); - break; } /* no explicit result, so defer to others in the chain */ @@ -814,21 +813,8 @@ kauth_acl_inherit(vnode_t dvp, kauth_acl_t initial, kauth_acl_t *product, int is KAUTH_DEBUG(" ERROR - could not get parent directory ACL for inheritance"); return(error); } - if (VATTR_IS_SUPPORTED(&dva, va_acl)) { + if (VATTR_IS_SUPPORTED(&dva, va_acl)) inherit = dva.va_acl; - /* - * If there is an ACL on the parent directory, then - * there are potentially inheritable ACE entries, but - * if the flags on the directory ACL say not to - * inherit, then we don't inherit. This allows for - * per directory rerooting of the inheritable ACL - * hierarchy. - */ - if (inherit != NULL && inherit->acl_flags & KAUTH_ACL_NO_INHERIT) { - kauth_acl_free(inherit); - inherit = NULL; - } - } } /* diff --git a/bsd/kern/kern_backtrace.c b/bsd/kern/kern_backtrace.c new file mode 100644 index 000000000..9b175b009 --- /dev/null +++ b/bsd/kern/kern_backtrace.c @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2016 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). 
You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include + +#define MAX_BACKTRACE (128) + +#define BACKTRACE_USER (0) + +static int backtrace_sysctl SYSCTL_HANDLER_ARGS; + +SYSCTL_NODE(_kern, OID_AUTO, backtrace, CTLFLAG_RW | CTLFLAG_LOCKED, 0, + "backtrace"); + +SYSCTL_PROC(_kern_backtrace, OID_AUTO, user, + CTLFLAG_RW | CTLFLAG_LOCKED, (void *)BACKTRACE_USER, + sizeof(uint64_t), backtrace_sysctl, "O", "take user backtrace of current thread"); + +static int +backtrace_sysctl SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg2) + uintptr_t *bt; + uint32_t bt_len, bt_filled; + uintptr_t type = (uintptr_t)arg1; + bool user_64; + int err = 0; + + if (type != BACKTRACE_USER) { + return EINVAL; + } + + if (req->oldptr == USER_ADDR_NULL || req->oldlen == 0) { + return EFAULT; + } + + bt_len = req->oldlen > MAX_BACKTRACE ? 
MAX_BACKTRACE : req->oldlen; + bt = kalloc(sizeof(uintptr_t) * bt_len); + if (!bt) { + return ENOBUFS; + } + + err = backtrace_user(bt, bt_len, &bt_filled, &user_64); + if (err) { + goto out; + } + + err = copyout(bt, req->oldptr, bt_filled * sizeof(uint64_t)); + if (err) { + goto out; + } + req->oldidx = bt_filled; + +out: + kfree(bt, sizeof(uintptr_t) * bt_len); + return err; +} diff --git a/bsd/kern/kern_control.c b/bsd/kern/kern_control.c index ebda4203d..f7ca33348 100644 --- a/bsd/kern/kern_control.c +++ b/bsd/kern/kern_control.c @@ -225,6 +225,7 @@ SYSCTL_INT(_net_systm_kctl, OID_AUTO, debug, static uintptr_t kctl_tbl_size = 0; static u_int32_t kctl_tbl_growing = 0; +static u_int32_t kctl_tbl_growing_waiting = 0; static uintptr_t kctl_tbl_count = 0; static struct kctl **kctl_table = NULL; static uintptr_t kctl_ref_gencnt = 0; @@ -461,9 +462,10 @@ ctl_connect(struct socket *so, struct sockaddr *nam, struct proc *p) error = soreserve(so, sendbufsize, recvbufsize); if (error) { - printf("%s - soreserve(%llx, %u, %u) error %d\n", __func__, - (uint64_t)VM_KERNEL_ADDRPERM(so), - sendbufsize, recvbufsize, error); + if (ctl_debug) + printf("%s - soreserve(%llx, %u, %u) error %d\n", + __func__, (uint64_t)VM_KERNEL_ADDRPERM(so), + sendbufsize, recvbufsize, error); goto done; } soisconnecting(so); @@ -478,6 +480,15 @@ ctl_connect(struct socket *so, struct sockaddr *nam, struct proc *p) end: if (error && kctl->disconnect) { + /* + * XXX Make sure we Don't check the return value + * of disconnect here. + * ipsec/utun_ctl_disconnect will return error when + * disconnect gets called after connect failure. + * However if we decide to check for disconnect return + * value here. Please make sure to revisit + * ipsec/utun_ctl_disconnect. 
+ */ socket_unlock(so, 0); (*kctl->disconnect)(kctl->kctlref, kcb->unit, kcb->userdata); socket_lock(so, 0); @@ -857,7 +868,7 @@ ctl_enqueuembuf_list(void *kctlref, u_int32_t unit, struct mbuf *m_list, for (m = m_list; m != NULL; m = nextpkt) { nextpkt = m->m_nextpkt; - if (m->m_pkthdr.len == 0) + if (m->m_pkthdr.len == 0 && ctl_debug) printf("%s: %llx m_pkthdr.len is 0", __func__, (uint64_t)VM_KERNEL_ADDRPERM(m)); @@ -953,8 +964,10 @@ ctl_enqueuedata(void *kctlref, u_int32_t unit, void *data, size_t len, num_needed = 1; m = m_allocpacket_internal(&num_needed, len, NULL, M_NOWAIT, 1, 0); if (m == NULL) { - printf("ctl_enqueuedata: m_allocpacket_internal(%lu) failed\n", - len); + kctlstat.kcs_enqdata_mb_alloc_fail++; + if (ctl_debug) + printf("%s: m_allocpacket_internal(%lu) failed\n", + __func__, len); error = ENOMEM; goto bye; } @@ -977,6 +990,7 @@ ctl_enqueuedata(void *kctlref, u_int32_t unit, void *data, size_t len, if ((flags & CTL_DATA_NOWAKEUP) == 0) sorwakeup(so); } else { + kctlstat.kcs_enqdata_sbappend_fail++; error = ENOBUFS; OSIncrementAtomic64((SInt64 *)&kctlstat.kcs_enqueue_fullsock); } @@ -1214,10 +1228,15 @@ kctl_tbl_grow() lck_mtx_assert(ctl_mtx, LCK_MTX_ASSERT_OWNED); - while (kctl_tbl_growing) { + if (kctl_tbl_growing) { /* Another thread is allocating */ - (void) msleep((caddr_t) &kctl_tbl_growing, ctl_mtx, - PSOCK | PCATCH, "kctl_tbl_growing", 0); + kctl_tbl_growing_waiting++; + + do { + (void) msleep((caddr_t) &kctl_tbl_growing, ctl_mtx, + PSOCK | PCATCH, "kctl_tbl_growing", 0); + } while (kctl_tbl_growing); + kctl_tbl_growing_waiting--; } /* Another thread grew the table */ if (kctl_table != NULL && kctl_tbl_count < kctl_tbl_size) @@ -1225,8 +1244,10 @@ kctl_tbl_grow() /* Verify we have a sane size */ if (kctl_tbl_size + KCTL_TBL_INC >= UINT16_MAX) { - printf("%s kctl_tbl_size %lu too big\n", - __func__, kctl_tbl_size); + kctlstat.kcs_tbl_size_too_big++; + if (ctl_debug) + printf("%s kctl_tbl_size %lu too big\n", + __func__, kctl_tbl_size); 
return; } kctl_tbl_growing = 1; @@ -1250,6 +1271,10 @@ kctl_tbl_grow() } kctl_tbl_growing = 0; + + if (kctl_tbl_growing_waiting) { + wakeup(&kctl_tbl_growing); + } } #define KCTLREF_INDEX_MASK 0x0000FFFF @@ -1760,13 +1785,13 @@ ctl_unlock(struct socket *so, int refcount, void *lr) else lr_saved = lr; -#ifdef MORE_KCTLLOCK_DEBUG +#if (MORE_KCTLLOCK_DEBUG && (DEVELOPMENT || DEBUG)) printf("ctl_unlock: so=%llx sopcb=%x lock=%llx ref=%u lr=%llx\n", (uint64_t)VM_KERNEL_ADDRPERM(so), (uint64_t)VM_KERNEL_ADDRPERM(so->so_pcb, (uint64_t)VM_KERNEL_ADDRPERM(((struct ctl_cb *)so->so_pcb)->mtx), so->so_usecount, (uint64_t)VM_KERNEL_ADDRPERM(lr_saved)); -#endif +#endif /* (MORE_KCTLLOCK_DEBUG && (DEVELOPMENT || DEBUG)) */ if (refcount) so->so_usecount--; diff --git a/bsd/kern/kern_core.c b/bsd/kern/kern_core.c index 9477378ef..5cb6e4fa2 100644 --- a/bsd/kern/kern_core.c +++ b/bsd/kern/kern_core.c @@ -32,6 +32,7 @@ * This file contains machine independent code for performing core dumps. * */ +#if CONFIG_COREDUMP #include #include @@ -65,6 +66,11 @@ #include +#if CONFIG_CSR +#include +#include +#endif + typedef struct { int flavor; /* the number for this flavor */ mach_msg_type_number_t count; /* count of ints in this flavor */ @@ -90,19 +96,13 @@ typedef struct { int flavor_count; } tir_t; -/* XXX should be static */ -void collectth_state(thread_t th_act, void *tirp); - extern int freespace_mb(vnode_t vp); /* XXX not in a Mach header anywhere */ -kern_return_t thread_getstatus(register thread_t act, int flavor, +kern_return_t thread_getstatus(thread_t act, int flavor, thread_state_t tstate, mach_msg_type_number_t *count); void task_act_iterate_wth_args(task_t, void(*)(thread_t, void *), void *); -static cpu_type_t process_cpu_type(proc_t proc); -static cpu_type_t process_cpu_subtype(proc_t proc); - #ifdef SECURE_KERNEL __XNU_PRIVATE_EXTERN int do_coredump = 0; /* default: don't dump cores */ #else @@ -142,7 +142,7 @@ process_cpu_subtype(proc_t core_proc) return what_we_think; } 
-void +static void collectth_state(thread_t th_act, void *tirp) { vm_offset_t header; @@ -181,7 +181,6 @@ collectth_state(thread_t th_act, void *tirp) t->hoffset = hoffset; } - /* * coredump * @@ -256,6 +255,20 @@ coredump(proc_t core_proc, uint32_t reserve_mb, int coredump_flags) return (EFAULT); } +#if CONFIG_CSR + /* If the process is restricted, CSR isn't configured to allow + * restricted processes to be debugged, and CSR isn't configured in + * AppleInternal mode, then don't dump core. */ + if (cs_restricted(core_proc) && + csr_check(CSR_ALLOW_TASK_FOR_PID) && + csr_check(CSR_ALLOW_APPLE_INTERNAL)) { +#if CONFIG_AUDIT + audit_proc_coredump(core_proc, NULL, EFAULT); +#endif + return (EFAULT); + } +#endif + if (IS_64BIT_PROCESS(core_proc)) { is_64 = 1; mach_header_sz = sizeof(struct mach_header_64); @@ -507,3 +520,11 @@ coredump(proc_t core_proc, uint32_t reserve_mb, int coredump_flags) return (error); } + +#else /* CONFIG_COREDUMP */ + +/* When core dumps aren't needed, no need to compile this file at all */ + +#error assertion failed: this section is not compiled + +#endif /* CONFIG_COREDUMP */ diff --git a/bsd/kern/kern_credential.c b/bsd/kern/kern_credential.c index 0558ce449..1376ff3c5 100644 --- a/bsd/kern/kern_credential.c +++ b/bsd/kern/kern_credential.c @@ -1337,6 +1337,11 @@ kauth_identity_updatecache(struct kauth_identity_extlookup *elp, struct kauth_id if ((kip->ki_valid & KI_VALID_UID) && (kip->ki_uid == elp->el_uid)) { if (elp->el_flags & KAUTH_EXTLOOKUP_VALID_SUPGRPS) { assert(elp->el_sup_grp_cnt <= NGROUPS); + if (elp->el_sup_grp_cnt > NGROUPS) { + KAUTH_DEBUG("CACHE - invalid sup_grp_cnt provided (%d), truncating to %d", + elp->el_sup_grp_cnt, NGROUPS); + elp->el_sup_grp_cnt = NGROUPS; + } kip->ki_supgrpcnt = elp->el_sup_grp_cnt; memcpy(kip->ki_supgrps, elp->el_sup_groups, sizeof(elp->el_sup_groups[0]) * kip->ki_supgrpcnt); kip->ki_valid |= KI_VALID_GROUPS; @@ -2313,6 +2318,45 @@ kauth_cred_guid2gid(guid_t *guidp, gid_t *gidp) 
return(kauth_cred_cache_lookup(KI_VALID_GUID, KI_VALID_GID, guidp, gidp)); } +/* + * kauth_cred_nfs4domain2dsnode + * + * Description: Fetch dsnode from nfs4domain + * + * Parameters: nfs4domain Pointer to a string nfs4 domain + * dsnode Pointer to buffer for dsnode + * + * Returns: 0 Success + * ENOENT For now just a stub that always fails + * + * Implicit returns: + * *dsnode Modified, if successuful + */ +int +kauth_cred_nfs4domain2dsnode(__unused char *nfs4domain, __unused char *dsnode) +{ + return(ENOENT); +} + +/* + * kauth_cred_dsnode2nfs4domain + * + * Description: Fetch nfs4domain from dsnode + * + * Parameters: nfs4domain Pointer to string dsnode + * dsnode Pointer to buffer for nfs4domain + * + * Returns: 0 Success + * ENOENT For now just a stub that always fails + * + * Implicit returns: + * *nfs4domain Modified, if successuful + */ +int +kauth_cred_dsnode2nfs4domain(__unused char *dsnode, __unused char *nfs4domain) +{ + return(ENOENT); +} /* * kauth_cred_ntsid2uid @@ -2757,6 +2801,11 @@ kauth_cred_cache_lookup(int from, int to, void *src, void *dst) * changing access to server file system objects on each * expiration. */ + if (ki.ki_supgrpcnt > NGROUPS) { + panic("kauth data structure corrupted. 
kauth identity 0x%p with %d groups, greater than max of %d", + &ki, ki.ki_supgrpcnt, NGROUPS); + } + el.el_sup_grp_cnt = ki.ki_supgrpcnt; memcpy(el.el_sup_groups, ki.ki_supgrps, sizeof (el.el_sup_groups[0]) * ki.ki_supgrpcnt); diff --git a/bsd/kern/kern_cs.c b/bsd/kern/kern_cs.c index c15dd4f11..6ff8c458b 100644 --- a/bsd/kern/kern_cs.c +++ b/bsd/kern/kern_cs.c @@ -150,6 +150,7 @@ cs_init(void) lck_grp_attr_t *attr = lck_grp_attr_alloc_init(); cs_lockgrp = lck_grp_alloc_init("KERNCS", attr); + lck_grp_attr_free(attr); } int @@ -186,8 +187,7 @@ cs_allow_invalid(struct proc *p) } int -cs_invalid_page( - addr64_t vaddr) +cs_invalid_page(addr64_t vaddr, boolean_t *cs_killed) { struct proc *p; int send_kill = 0, retval = 0, verbose = cs_debug; @@ -209,25 +209,12 @@ cs_invalid_page( /* CS_KILL triggers a kill signal, and no you can't have the page. Nothing else. */ if (p->p_csflags & CS_KILL) { - if (panic_on_cs_killed && - vaddr >= SHARED_REGION_BASE && - vaddr < SHARED_REGION_BASE + SHARED_REGION_SIZE) { - panic(" cs_invalid_page(va=0x%llx): killing p=%p\n", (uint64_t) vaddr, p); - } p->p_csflags |= CS_KILLED; cs_procs_killed++; send_kill = 1; retval = 1; } -#if __x86_64__ - if (panic_on_cs_killed && - vaddr >= SHARED_REGION_BASE && - vaddr < SHARED_REGION_BASE + SHARED_REGION_SIZE) { - panic(" cs_invalid_page(va=0x%llx): cs error p=%p\n", (uint64_t) vaddr, p); - } -#endif /* __x86_64__ */ - /* CS_HARD means fail the mapping operation so the process stays valid. */ if (p->p_csflags & CS_HARD) { retval = 1; @@ -248,8 +235,15 @@ cs_invalid_page( retval ? "denying" : "allowing (remove VALID)", send_kill ? 
" sending SIGKILL" : ""); - if (send_kill) - threadsignal(current_thread(), SIGKILL, EXC_BAD_ACCESS); + if (send_kill) { + /* We will set the exit reason for the thread later */ + threadsignal(current_thread(), SIGKILL, EXC_BAD_ACCESS, FALSE); + if (cs_killed) { + *cs_killed = TRUE; + } + } else if (cs_killed) { + *cs_killed = FALSE; + } return retval; @@ -275,6 +269,22 @@ cs_enforcement(struct proc *p) return 0; } +/* + * Returns whether a given process is still valid. + */ +int +cs_valid(struct proc *p) +{ + + if (p == NULL) + p = current_proc(); + + if (p != NULL && (p->p_csflags & CS_VALID)) + return 1; + + return 0; +} + /* * Library validation functions */ @@ -294,6 +304,53 @@ cs_require_lv(struct proc *p) return 0; } +/* + * added to allow system level library + * validation check at mac_cred_label_update_execve time + */ +int +cs_system_require_lv(void) +{ + return cs_library_val_enable ? 1 : 0; +} + +/* + * Function: csblob_get_base_offset + * + * Description: This function returns the base offset into the Mach-O binary + * for a given blob. +*/ + +off_t +csblob_get_base_offset(struct cs_blob *blob) +{ + return blob->csb_base_offset; +} + +/* + * Function: csblob_get_size + * + * Description: This function returns the size of a given blob. +*/ + +vm_size_t +csblob_get_size(struct cs_blob *blob) +{ + return blob->csb_mem_size; +} + +/* + * Function: csblob_get_addr + * + * Description: This function returns the address of a given blob. 
+*/ + +vm_address_t +csblob_get_addr(struct cs_blob *blob) +{ + return blob->csb_mem_kaddr; +} + /* * Function: csblob_get_platform_binary * @@ -318,9 +375,7 @@ csblob_get_platform_binary(struct cs_blob *blob) unsigned int csblob_get_flags(struct cs_blob *blob) { - if (blob) - return blob->csb_flags; - return 0; + return blob->csb_flags; } /* @@ -398,6 +453,22 @@ csblob_get_cdhash(struct cs_blob *csblob) return csblob->csb_cdhash; } +void * +csblob_entitlements_dictionary_copy(struct cs_blob *csblob) +{ + if (!csblob->csb_entitlements) return NULL; + osobject_retain(csblob->csb_entitlements); + return csblob->csb_entitlements; +} + +void +csblob_entitlements_dictionary_set(struct cs_blob *csblob, void * entitlements) +{ + assert(csblob->csb_entitlements == NULL); + if (entitlements) osobject_retain(entitlements); + csblob->csb_entitlements = entitlements; +} + /* * Function: csproc_get_teamid * diff --git a/bsd/kern/kern_descrip.c b/bsd/kern/kern_descrip.c index e089c5363..7bc3c62a9 100644 --- a/bsd/kern/kern_descrip.c +++ b/bsd/kern/kern_descrip.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2000-2015 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* Copyright (c) 1995, 1997 Apple Computer, Inc. All Rights Reserved */ @@ -106,6 +106,7 @@ #include #include #include +#include #include #include #include @@ -119,8 +120,6 @@ #include #include -#include - kern_return_t ipc_object_copyin(ipc_space_t, mach_port_name_t, mach_msg_type_name_t, ipc_port_t *); void ipc_port_release_send(ipc_port_t); @@ -211,7 +210,7 @@ check_file_seek_range(struct flock *fl, off_t cur_file_offset) return EINVAL; } /* Check if end marker is beyond LLONG_MAX. */ - if ((fl->l_len > 0) && (CHECK_ADD_OVERFLOW_INT64L(fl->l_start + + if ((fl->l_len > 0) && (CHECK_ADD_OVERFLOW_INT64L(fl->l_start + cur_file_offset, fl->l_len - 1))) { return EOVERFLOW; } @@ -226,7 +225,7 @@ check_file_seek_range(struct flock *fl, off_t cur_file_offset) return EINVAL; } /* Check if the end marker is beyond LLONG_MAX. 
*/ - if ((fl->l_len > 0) && + if ((fl->l_len > 0) && CHECK_ADD_OVERFLOW_INT64L(fl->l_start, fl->l_len - 1)) { return EOVERFLOW; } @@ -367,7 +366,7 @@ procfdtbl_releasefd(struct proc * p, int fd, struct fileproc * fp) } } -void +void procfdtbl_waitfd(struct proc * p, int fd) { p->p_fd->fd_ofileflags[fd] |= UF_RESVWAIT; @@ -381,7 +380,7 @@ procfdtbl_clearfd(struct proc * p, int fd) int waiting; waiting = (p->p_fd->fd_ofileflags[fd] & UF_RESVWAIT); - p->p_fd->fd_ofiles[fd] = NULL; + p->p_fd->fd_ofiles[fd] = NULL; p->p_fd->fd_ofileflags[fd] = 0; if ( waiting == UF_RESVWAIT) { wakeup(&p->p_fd); @@ -458,12 +457,12 @@ fd_rdwr( error = EBADF; goto out; } - + if (rw == UIO_READ && !(fp->f_flag & FREAD)) { error = EBADF; goto out; } - + context.vc_ucred = fp->f_fglob->fg_cred; if (UIO_SEG_IS_USER_SPACE(segflg)) @@ -1223,7 +1222,7 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) /* now set the space allocated to 0 */ alloc_struct.fst_bytesalloc = 0; - + /* * Do some simple parameter checking */ @@ -1231,7 +1230,7 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) /* set up the flags */ alloc_flags |= PREALLOCATE; - + if (alloc_struct.fst_flags & F_ALLOCATECONTIG) alloc_flags |= ALLOCATECONTIG; @@ -1244,7 +1243,7 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) */ switch (alloc_struct.fst_posmode) { - + case F_PEOFPOSMODE: if (alloc_struct.fst_offset != 0) { error = EINVAL; @@ -1283,7 +1282,7 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) error = error2; } goto outdrop; - + } case F_SETSIZE: if (fp->f_type != DTYPE_VNODE) { @@ -1312,7 +1311,7 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) #endif /* * Make sure that we are root. 
Growing a file - * without zero filling the data is a security hole + * without zero filling the data is a security hole * root would have access anyway so we'll allow it */ if (!kauth_cred_issuser(kauth_cred_get())) { @@ -1323,6 +1322,11 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) */ error = vnode_setsize(vp, offset, IO_NOZEROFILL, &context); + +#if CONFIG_MACF + if (error == 0) + mac_vnode_notify_truncate(&context, fp->f_fglob->fg_cred, vp); +#endif } (void)vnode_put(vp); @@ -1502,7 +1506,7 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) } else { a_size = devBlockSize; } - + error = VNOP_BLOCKMAP(vp, offset, a_size, &bn, &run, NULL, 0, &context); (void)vnode_put(vp); @@ -1627,7 +1631,7 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) error = ENOENT; goto outdrop; } - + /* Only valid for directories */ if (vp->v_type != VDIR) { vnode_put(vp); @@ -1687,7 +1691,7 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) error = ENOENT; goto outdrop; } - + /* Only valid for directories */ if (vp->v_type != VDIR) { vnode_put(vp); @@ -1704,7 +1708,7 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) /* Start the lookup relative to the file descriptor's vnode. 
*/ error = unlink1(&context, vp, pathname, UIO_USERSPACE, 0); - + vnode_put(vp); break; @@ -1766,7 +1770,11 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) { /* If this is for dyld_sim revalidate the blob */ if (uap->cmd == F_ADDFILESIGS_FOR_DYLD_SIM) { - error = ubc_cs_blob_revalidate(vp, blob, blob_add_flags); + error = ubc_cs_blob_revalidate(vp, blob, NULL, blob_add_flags); + if (error) { + vnode_put(vp); + goto outdrop; + } } } else { @@ -1816,10 +1824,10 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) memset((void *)(kernel_blob_addr + (kernel_blob_size - resid)), 0x0, resid); } } - + if (error) { ubc_cs_blob_deallocate(kernel_blob_addr, - kernel_blob_size); + kernel_blob_size); vnode_put(vp); goto outdrop; } @@ -1828,15 +1836,21 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) error = ubc_cs_blob_add(vp, CPU_TYPE_ANY, /* not for a specific architecture */ fs.fs_file_start, - kernel_blob_addr, + &kernel_blob_addr, kernel_blob_size, + NULL, blob_add_flags, &blob); + + /* ubc_blob_add() has consumed "kernel_blob_addr" if it is zeroed */ if (error) { - ubc_cs_blob_deallocate(kernel_blob_addr, - kernel_blob_size); + if (kernel_blob_addr) { + ubc_cs_blob_deallocate(kernel_blob_addr, + kernel_blob_size); + } + vnode_put(vp); + goto outdrop; } else { - /* ubc_blob_add() has consumed "kernel_blob_addr" */ #if CHECK_CS_VALIDATION_BITMAP ubc_cs_validation_bitmap_allocate( vp ); #endif @@ -1858,14 +1872,44 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) (void) vnode_put(vp); break; } + case F_GETCODEDIR: case F_FINDSIGS: { error = ENOTSUP; goto out; } + case F_CHECK_LV: { + struct fileglob *fg; + fchecklv_t lv; + + if (fp->f_type != DTYPE_VNODE) { + error = EBADF; + goto out; + } + fg = fp->f_fglob; + proc_fdunlock(p); + + if (IS_64BIT_PROCESS(p)) { + error = copyin(argp, &lv, sizeof (lv)); + } else { + struct user32_fchecklv lv32; + + error = 
copyin(argp, &lv32, sizeof (lv32)); + lv.lv_file_start = lv32.lv_file_start; + lv.lv_error_message = CAST_USER_ADDR_T(lv32.lv_error_message); + lv.lv_error_message_size = lv32.lv_error_message_size; /* length of the user buffer, not its address */ + } + if (error) + goto outdrop; + +#if CONFIG_MACF + error = mac_file_check_library_validation(p, fg, lv.lv_file_start, + lv.lv_error_message, lv.lv_error_message_size); +#endif + + break; + } #if CONFIG_PROTECT case F_GETPROTECTIONCLASS: { - int class = 0; - if (fp->f_type != DTYPE_VNODE) { error = EBADF; goto out; @@ -1878,20 +1922,27 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) error = ENOENT; goto outdrop; } - - error = cp_vnode_getclass (vp, &class); - if (error == 0) { - *retval = class; + + struct vnode_attr va; + + VATTR_INIT(&va); + VATTR_WANTED(&va, va_dataprotect_class); + error = VNOP_GETATTR(vp, &va, &context); + if (!error) { + if (VATTR_IS_SUPPORTED(&va, va_dataprotect_class)) + *retval = va.va_dataprotect_class; + else + error = ENOTSUP; } vnode_put(vp); break; } - + case F_SETPROTECTIONCLASS: { /* tmp must be a valid PROTECTION_CLASS_* */ tmp = CAST_DOWN_EXPLICIT(uint32_t, uap->arg); - + if (fp->f_type != DTYPE_VNODE) { error = EBADF; goto out; @@ -1899,12 +1950,12 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) vp = (struct vnode *)fp->f_data; proc_fdunlock(p); - + if (vnode_getwithref(vp)) { error = ENOENT; goto outdrop; - } - + } + /* Only go forward if you have write access */ vfs_context_t ctx = vfs_context_current(); if(vnode_authorize(vp, NULLVP, (KAUTH_VNODE_ACCESS | KAUTH_VNODE_WRITE_DATA), ctx) != 0) { @@ -1912,53 +1963,55 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) error = EBADF; goto outdrop; } - error = cp_vnode_setclass (vp, tmp); + + struct vnode_attr va; + + VATTR_INIT(&va); + VATTR_SET(&va, va_dataprotect_class, tmp); + + error = VNOP_SETATTR(vp, &va, ctx); + vnode_put(vp); break; - } + } case F_TRANSCODEKEY: { - - char *backup_keyp = NULL; -
unsigned backup_key_len = CP_MAX_WRAPPEDKEYSIZE; - if (fp->f_type != DTYPE_VNODE) { error = EBADF; goto out; } - + vp = (struct vnode *)fp->f_data; proc_fdunlock(p); if (vnode_getwithref(vp)) { error = ENOENT; goto outdrop; - } - - MALLOC(backup_keyp, char *, backup_key_len, M_TEMP, M_WAITOK); - if (backup_keyp == NULL) { - error = ENOMEM; - goto outdrop; } - error = cp_vnode_transcode (vp, backup_keyp, &backup_key_len); + cp_key_t k = { + .len = CP_MAX_WRAPPEDKEYSIZE, + }; + + MALLOC(k.key, char *, k.len, M_TEMP, M_WAITOK); + + error = VNOP_IOCTL(vp, F_TRANSCODEKEY, (caddr_t)&k, 1, &context); + vnode_put(vp); if (error == 0) { - error = copyout((caddr_t)backup_keyp, argp, backup_key_len); - *retval = backup_key_len; + error = copyout(k.key, argp, k.len); + *retval = k.len; } - FREE(backup_keyp, M_TEMP); + FREE(k.key, M_TEMP); break; - } + } case F_GETPROTECTIONLEVEL: { - uint32_t cp_version = 0; - if (fp->f_type != DTYPE_VNODE) { - error = EBADF; + error = EBADF; goto out; } @@ -1970,23 +2023,15 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) goto outdrop; } - /* - * if cp_get_major_vers fails, error will be set to proper errno - * and cp_version will still be 0. - */ - - error = cp_get_root_major_vers (vp, &cp_version); - *retval = cp_version; + error = VNOP_IOCTL(vp, F_GETPROTECTIONLEVEL, (caddr_t)retval, 0, &context); vnode_put (vp); break; } case F_GETDEFAULTPROTLEVEL: { - uint32_t cp_default = 0; - if (fp->f_type != DTYPE_VNODE) { - error = EBADF; + error = EBADF; goto out; } @@ -1999,18 +2044,16 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) } /* - * if cp_get_major_vers fails, error will be set to proper errno + * if cp_get_major_vers fails, error will be set to proper errno * and cp_version will still be 0. 
*/ - error = cp_get_default_level(vp, &cp_default); - *retval = cp_default; + error = VNOP_IOCTL(vp, F_GETDEFAULTPROTLEVEL, (caddr_t)retval, 0, &context); vnode_put (vp); break; } - #endif /* CONFIG_PROTECT */ case F_MOVEDATAEXTENTS: { @@ -2037,7 +2080,7 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) } /* - * Get the references before we start acquiring iocounts on the vnodes, + * Get the references before we start acquiring iocounts on the vnodes, * while we still hold the proc fd lock */ if ( (error = fp_lookup(p, fd2, &fp2, 1)) ) { @@ -2073,15 +2116,15 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) fp_drop(p, fd2, fp2, 0); error = ENOENT; goto outdrop; - } + } if (vnode_getwithref(dst_vp)) { vnode_put (src_vp); fp_drop(p, fd2, fp2, 0); error = ENOENT; goto outdrop; - } - - /* + } + + /* * Basic asserts; validate they are not the same and that * both live on the same filesystem. */ @@ -2091,7 +2134,7 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) fp_drop (p, fd2, fp2, 0); error = EINVAL; goto outdrop; - } + } if (dst_vp->v_mount != src_vp->v_mount) { vnode_put (src_vp); @@ -2104,7 +2147,7 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) /* Now we have a legit pair of FDs. 
Go to work */ /* Now check for write access to the target files */ - if(vnode_authorize(src_vp, NULLVP, + if(vnode_authorize(src_vp, NULLVP, (KAUTH_VNODE_ACCESS | KAUTH_VNODE_WRITE_DATA), &context) != 0) { vnode_put(src_vp); vnode_put(dst_vp); @@ -2112,8 +2155,8 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) error = EBADF; goto outdrop; } - - if(vnode_authorize(dst_vp, NULLVP, + + if(vnode_authorize(dst_vp, NULLVP, (KAUTH_VNODE_ACCESS | KAUTH_VNODE_WRITE_DATA), &context) != 0) { vnode_put(src_vp); vnode_put(dst_vp); @@ -2121,7 +2164,7 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) error = EBADF; goto outdrop; } - + /* Verify that both vps point to files and not directories */ if ( !vnode_isreg(src_vp) || !vnode_isreg(dst_vp)) { error = EINVAL; @@ -2131,27 +2174,27 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) goto outdrop; } - /* + /* * The exchangedata syscall handler passes in 0 for the flags to VNOP_EXCHANGE. * We'll pass in our special bit indicating that the new behavior is expected */ - + error = VNOP_EXCHANGE(src_vp, dst_vp, FSOPT_EXCHANGE_DATA_ONLY, &context); - + vnode_put (src_vp); vnode_put (dst_vp); fp_drop(p, fd2, fp2, 0); break; } - - /* + + /* * SPI for making a file compressed. 
*/ case F_MAKECOMPRESSED: { uint32_t gcounter = CAST_DOWN_EXPLICIT(uint32_t, uap->arg); if (fp->f_type != DTYPE_VNODE) { - error = EBADF; + error = EBADF; goto out; } @@ -2172,7 +2215,7 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) } /* invoke ioctl to pass off to FS */ - /* Only go forward if you have write access */ + /* Only go forward if you have write access */ vfs_context_t ctx = vfs_context_current(); if(vnode_authorize(vp, NULLVP, (KAUTH_VNODE_ACCESS | KAUTH_VNODE_WRITE_DATA), ctx) != 0) { vnode_put(vp); @@ -2183,9 +2226,9 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) error = VNOP_IOCTL(vp, uap->cmd, (caddr_t)&gcounter, 0, &context); vnode_put (vp); - break; + break; } - + /* * SPI (private) for indicating to a filesystem that subsequent writes to * the open FD will written to the Fastflow. @@ -2229,7 +2272,7 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) error = VNOP_IOCTL(vp, uap->cmd, ioctl_arg, 0, &context); (void)vnode_put(vp); - + break; } @@ -2239,7 +2282,7 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) * or other flavors that may be necessary. */ case F_SETIOTYPE: { - caddr_t param_ptr; + caddr_t param_ptr; uint32_t param; if (uap->arg) { @@ -2252,9 +2295,9 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) error = EINVAL; goto out; } - - /* - * Validate the different types of flags that can be specified: + + /* + * Validate the different types of flags that can be specified: * all of them are mutually exclusive for now. 
*/ switch (param) { @@ -2294,123 +2337,7 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) break; } - /* - * Extract the CodeDirectory of the vnode associated with - * the file descriptor and copy it back to user space - */ - case F_GETCODEDIR: { - struct user_fcodeblobs args; - - if (fp->f_type != DTYPE_VNODE) { - error = EBADF; - goto out; - } - - vp = (struct vnode *)fp->f_data; - proc_fdunlock(p); - - if ((fp->f_flag & FREAD) == 0) { - error = EBADF; - goto outdrop; - } - - if (IS_64BIT_PROCESS(p)) { - struct user64_fcodeblobs args64; - - error = copyin(argp, &args64, sizeof(args64)); - if (error) - goto outdrop; - - args.f_cd_hash = args64.f_cd_hash; - args.f_hash_size = args64.f_hash_size; - args.f_cd_buffer = args64.f_cd_buffer; - args.f_cd_size = args64.f_cd_size; - args.f_out_size = args64.f_out_size; - args.f_arch = args64.f_arch; - } else { - struct user32_fcodeblobs args32; - - error = copyin(argp, &args32, sizeof(args32)); - if (error) - goto outdrop; - - args.f_cd_hash = CAST_USER_ADDR_T(args32.f_cd_hash); - args.f_hash_size = args32.f_hash_size; - args.f_cd_buffer = CAST_USER_ADDR_T(args32.f_cd_buffer); - args.f_cd_size = args32.f_cd_size; - args.f_out_size = CAST_USER_ADDR_T(args32.f_out_size); - args.f_arch = args32.f_arch; - } - - if (vp->v_ubcinfo == NULL) { - error = EINVAL; - goto outdrop; - } - - struct cs_blob *t_blob = vp->v_ubcinfo->cs_blobs; - - /* - * This call fails if there is no cs_blob corresponding to the - * vnode, or if there are multiple cs_blobs present, and the caller - * did not specify which cpu_type they want the cs_blob for - */ - if (t_blob == NULL) { - error = ENOENT; /* there is no codesigning blob for this process */ - goto outdrop; - } else if (args.f_arch == 0 && t_blob->csb_next != NULL) { - error = ENOENT; /* too many architectures and none specified */ - goto outdrop; - } - - /* If the user specified an architecture, find the right blob */ - if (args.f_arch != 0) { - while (t_blob) { - if 
(t_blob->csb_cpu_type == args.f_arch) - break; - t_blob = t_blob->csb_next; - } - /* The cpu_type the user requested could not be found */ - if (t_blob == NULL) { - error = ENOENT; - goto outdrop; - } - } - - const CS_CodeDirectory *cd = t_blob->csb_cd; - if (cd == NULL) { - error = ENOENT; - goto outdrop; - } - - uint64_t buffer_size = ntohl(cd->length); - - if (buffer_size > UINT_MAX) { - error = ERANGE; - goto outdrop; - } - - error = copyout(&buffer_size, args.f_out_size, sizeof(unsigned int)); - if (error) - goto outdrop; - - if (sizeof(t_blob->csb_cdhash) > args.f_hash_size || - buffer_size > args.f_cd_size) { - error = ERANGE; - goto outdrop; - } - - error = copyout(t_blob->csb_cdhash, args.f_cd_hash, sizeof(t_blob->csb_cdhash)); - if (error) - goto outdrop; - error = copyout(cd, args.f_cd_buffer, buffer_size); - if (error) - goto outdrop; - - break; - } - - /* * Set the vnode pointed to by 'fd' * and tag it as the (potentially future) backing store * for another filesystem @@ -2420,7 +2347,7 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) error = EBADF; goto out; } - + vp = (struct vnode *)fp->f_data; if (vp->v_tag != VT_HFS) { @@ -2433,7 +2360,7 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) error = ENOENT; goto outdrop; } - + /* only proceed if you have write access */ vfs_context_t ctx = vfs_context_current(); if(vnode_authorize(vp, NULLVP, (KAUTH_VNODE_ACCESS | KAUTH_VNODE_WRITE_DATA), ctx) != 0) { @@ -2442,7 +2369,7 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) goto outdrop; } - + /* If arg != 0, set, otherwise unset */ if (uap->arg) { error = VNOP_IOCTL (vp, uap->cmd, (caddr_t)1, 0, &context); @@ -2450,12 +2377,12 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) else { error = VNOP_IOCTL (vp, uap->cmd, (caddr_t)NULL, 0, &context); } - + vnode_put(vp); break; } - /* + /* * like F_GETPATH, but special semantics for * the mobile time 
machine handler. */ @@ -2478,7 +2405,7 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) } if ( (error = vnode_getwithref(vp)) == 0 ) { int backingstore = 0; - + /* Check for error from vn_getpath before moving on */ if ((error = vn_getpath(vp, pathbufp, &pathlen)) == 0) { if (vp->v_tag == VT_HFS) { @@ -2490,7 +2417,7 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) error = copyout((caddr_t)pathbufp, argp, pathlen); } if (error == 0) { - /* + /* * If the copyout was successful, now check to ensure * that this vnode is not a BACKINGSTORE vnode. mtmd * wants the path regardless. @@ -2530,7 +2457,7 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) error = EINVAL; goto out; } - + /* Catch any now-invalid fcntl() selectors */ switch (uap->cmd) { case F_MARKDEPENDENCY: @@ -2580,7 +2507,7 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) } else { data = &stkbuf[0]; } - + if (uap->cmd & IOC_IN) { if (size) { /* structure */ @@ -2623,7 +2550,7 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) (void)vnode_put(vp); /* Copy any output data to user */ - if (error == 0 && (uap->cmd & IOC_OUT) && size) + if (error == 0 && (uap->cmd & IOC_OUT) && size) error = copyout(data, argp, size); if (memp) kfree(memp, size); @@ -2838,14 +2765,14 @@ close_internal_locked(proc_t p, int fd, struct fileproc *fp, int flags) if ( (fp->f_type == DTYPE_VNODE) && kauth_authorize_fileop_has_listeners() ) { /* - * call out to allow 3rd party notification of close. + * call out to allow 3rd party notification of close. * Ignore result of kauth_authorize_fileop call. 
*/ if (vnode_getwithref((vnode_t)fp->f_data) == 0) { u_int fileop_flags = 0; if ((fp->f_flags & FP_WRITTEN) != 0) fileop_flags |= KAUTH_FILEOP_CLOSE_MODIFIED; - kauth_authorize_fileop(fp->f_fglob->fg_cred, KAUTH_FILEOP_CLOSE, + kauth_authorize_fileop(fp->f_fglob->fg_cred, KAUTH_FILEOP_CLOSE, (uintptr_t)fp->f_data, (uintptr_t)fileop_flags); vnode_put((vnode_t)fp->f_data); } @@ -2860,9 +2787,9 @@ close_internal_locked(proc_t p, int fd, struct fileproc *fp, int flags) } if (fd < fdp->fd_knlistsize) - knote_fdclose(p, fd); + knote_fdclose(p, fd, FALSE); - if (fp->f_flags & FP_WAITEVENT) + if (fp->f_flags & FP_WAITEVENT) (void)waitevent_close(p, fp); fileproc_drain(p, fp); @@ -2884,7 +2811,7 @@ close_internal_locked(proc_t p, int fd, struct fileproc *fp, int flags) proc_fdunlock(p); - fileproc_free(fp); + fileproc_free(fp); proc_fdlock(p); @@ -3018,11 +2945,11 @@ fstat1(proc_t p, int fd, user_addr_t ub, user_addr_t xsecurity, user_addr_t xsec source.sb64.st_qspare[1] = 0LL; if (IS_64BIT_PROCESS(current_proc())) { - munge_user64_stat64(&source.sb64, &dest.user64_sb64); + munge_user64_stat64(&source.sb64, &dest.user64_sb64); my_size = sizeof(dest.user64_sb64); sbp = (caddr_t)&dest.user64_sb64; } else { - munge_user32_stat64(&source.sb64, &dest.user32_sb64); + munge_user32_stat64(&source.sb64, &dest.user32_sb64); my_size = sizeof(dest.user32_sb64); sbp = (caddr_t)&dest.user32_sb64; } @@ -3031,11 +2958,11 @@ fstat1(proc_t p, int fd, user_addr_t ub, user_addr_t xsecurity, user_addr_t xsec source.sb.st_qspare[0] = 0LL; source.sb.st_qspare[1] = 0LL; if (IS_64BIT_PROCESS(current_proc())) { - munge_user64_stat(&source.sb, &dest.user64_sb); + munge_user64_stat(&source.sb, &dest.user64_sb); my_size = sizeof(dest.user64_sb); sbp = (caddr_t)&dest.user64_sb; } else { - munge_user32_stat(&source.sb, &dest.user32_sb); + munge_user32_stat(&source.sb, &dest.user32_sb); my_size = sizeof(dest.user32_sb); sbp = (caddr_t)&dest.user32_sb; } @@ -3097,7 +3024,7 @@ fstat_extended(proc_t p, 
struct fstat_extended_args *uap, __unused int32_t *retv { return(fstat1(p, uap->fd, uap->ub, uap->xsecurity, uap->xsecurity_size, 0)); } - + /* * fstat @@ -3112,7 +3039,7 @@ fstat_extended(proc_t p, struct fstat_extended_args *uap, __unused int32_t *retv * !0 Errno (see fstat1) */ int -fstat(proc_t p, register struct fstat_args *uap, __unused int32_t *retval) +fstat(proc_t p, struct fstat_args *uap, __unused int32_t *retval) { return(fstat1(p, uap->fd, uap->ub, 0, 0, 0)); } @@ -3139,7 +3066,7 @@ fstat64_extended(proc_t p, struct fstat64_extended_args *uap, __unused int32_t * { return(fstat1(p, uap->fd, uap->ub, uap->xsecurity, uap->xsecurity_size, 1)); } - + /* * fstat64 @@ -3155,7 +3082,7 @@ fstat64_extended(proc_t p, struct fstat64_extended_args *uap, __unused int32_t * * !0 Errno (see fstat1) */ int -fstat64(proc_t p, register struct fstat64_args *uap, __unused int32_t *retval) +fstat64(proc_t p, struct fstat64_args *uap, __unused int32_t *retval) { return(fstat1(p, uap->fd, uap->ub, 0, 0, 1)); } @@ -3849,6 +3776,7 @@ fp_getfpipe(proc_t p, int fd, struct fileproc **resultfp, return (0); } + /* * fp_lookup * @@ -3894,14 +3822,14 @@ fp_lookup(proc_t p, int fd, struct fileproc **resultfp, int locked) *resultfp = fp; if (!locked) proc_fdunlock(p); - + return (0); } /* * fp_tryswap - * + * * Description: Swap the fileproc pointer for a given fd with a new * fileproc pointer in the per-process open file table of * the specified process. The fdlock must be held at entry. 
@@ -3992,11 +3920,11 @@ fp_drop_written(proc_t p, int fd, struct fileproc *fp) proc_fdlock_spin(p); fp->f_flags |= FP_WRITTEN; - + error = fp_drop(p, fd, fp, 1); proc_fdunlock(p); - + return (error); } @@ -4028,11 +3956,11 @@ fp_drop_event(proc_t p, int fd, struct fileproc *fp) proc_fdlock_spin(p); fp->f_flags |= FP_WAITEVENT; - + error = fp_drop(p, fd, fp, 1); proc_fdunlock(p); - + return (error); } @@ -4091,7 +4019,7 @@ fp_drop(proc_t p, int fd, struct fileproc *fp, int locked) proc_fdunlock(p); if (needwakeup) wakeup(&p->p_fpdrainwait); - + return (0); } @@ -4137,7 +4065,7 @@ file_vnode(int fd, struct vnode **vpp) proc_t p = current_proc(); struct fileproc *fp; int error; - + proc_fdlock_spin(p); if ( (error = fp_lookup(p, fd, &fp, 1)) ) { proc_fdunlock(p); @@ -4199,7 +4127,7 @@ file_vnode_withvid(int fd, struct vnode **vpp, uint32_t * vidp) struct fileproc *fp; vnode_t vp; int error; - + proc_fdlock_spin(p); if ( (error = fp_lookup(p, fd, &fp, 1)) ) { proc_fdunlock(p); @@ -4211,10 +4139,10 @@ file_vnode_withvid(int fd, struct vnode **vpp, uint32_t * vidp) return(EINVAL); } vp = (struct vnode *)fp->f_data; - if (vpp != NULL) + if (vpp != NULL) *vpp = vp; - if ((vidp != NULL) && (vp != NULLVP)) + if ((vidp != NULL) && (vp != NULLVP)) *vidp = (uint32_t)vp->v_id; proc_fdunlock(p); @@ -4263,7 +4191,7 @@ file_socket(int fd, struct socket **sp) proc_t p = current_proc(); struct fileproc *fp; int error; - + proc_fdlock_spin(p); if ( (error = fp_lookup(p, fd, &fp, 1)) ) { proc_fdunlock(p); @@ -4311,7 +4239,7 @@ file_flags(int fd, int *flags) proc_t p = current_proc(); struct fileproc *fp; int error; - + proc_fdlock_spin(p); if ( (error = fp_lookup(p, fd, &fp, 1)) ) { proc_fdunlock(p); @@ -4361,7 +4289,7 @@ file_flags(int fd, int *flags) * * Use of this function is discouraged. 
*/ -int +int file_drop(int fd) { struct fileproc *fp; @@ -4681,8 +4609,8 @@ fdexec(proc_t p, short flags) || (fp && mac_file_check_inherit(proc_ucred(p), fp->f_fglob)) #endif ) { - if (i < fdp->fd_knlistsize) - knote_fdclose(p, i); + if (i < fdp->fd_knlistsize) + knote_fdclose(p, i, TRUE); procfdtbl_clearfd(p, i); if (i == fdp->fd_lastfile && i > 0) fdp->fd_lastfile--; @@ -4969,17 +4897,20 @@ fdfree(proc_t p) panic("filedesc0"); if (fdp->fd_nfiles > 0 && fdp->fd_ofiles) { - for (i = fdp->fd_lastfile; i >= 0; i--) { + for (i = fdp->fd_lastfile; i >= 0; i--) { + + /* May still have knotes for fd without open file */ + if (i < fdp->fd_knlistsize) + knote_fdclose(p, i, TRUE); + if ((fp = fdp->fd_ofiles[i]) != NULL) { - + if (fdp->fd_ofileflags[i] & UF_RESERVED) panic("fdfree: found fp with UF_RESERVED"); procfdtbl_reservefd(p, i); - if (i < fdp->fd_knlistsize) - knote_fdclose(p, i); - if (fp->f_flags & FP_WAITEVENT) + if (fp->f_flags & FP_WAITEVENT) (void)waitevent_close(p, fp); (void) closef_locked(fp, fp->f_fglob, p); fileproc_free(fp); @@ -4988,10 +4919,10 @@ fdfree(proc_t p) FREE_ZONE(fdp->fd_ofiles, fdp->fd_nfiles * OFILESIZE, M_OFILETABL); fdp->fd_ofiles = NULL; fdp->fd_nfiles = 0; - } + } proc_fdunlock(p); - + if (fdp->fd_cdir) vnode_rele(fdp->fd_cdir); if (fdp->fd_rdir) @@ -5093,12 +5024,12 @@ closef_locked(struct fileproc *fp, struct fileglob *fg, proc_t p) if (p) proc_fdunlock(p); - /* Since we ensure that fg->fg_ops is always initialized, + /* Since we ensure that fg->fg_ops is always initialized, * it is safe to invoke fo_close on the fg */ error = fo_close(fg, &context); fg_free(fg); - + if (p) proc_fdlock(p); @@ -5207,7 +5138,7 @@ fp_free(proc_t p, int fd, struct fileproc * fp) * attempted * uap->how (Un)Lock bits, including type * retval Pointer to the call return area - * + * * Returns: 0 Success * fp_getfvp:EBADF Bad file descriptor * fp_getfvp:ENOTSUP fd does not refer to a vnode @@ -5262,12 +5193,11 @@ flock(proc_t p, struct flock_args *uap, __unused 
int32_t *retval) if (error) goto out; #endif - fp->f_flag |= FHASLOCK; - if (how & LOCK_NB) { - error = VNOP_ADVLOCK(vp, (caddr_t)fp->f_fglob, F_SETLK, &lf, F_FLOCK, ctx, NULL); - goto out; - } - error = VNOP_ADVLOCK(vp, (caddr_t)fp->f_fglob, F_SETLK, &lf, F_FLOCK|F_WAIT, ctx, NULL); + error = VNOP_ADVLOCK(vp, (caddr_t)fp->f_fglob, F_SETLK, &lf, + (how & LOCK_NB ? F_FLOCK : F_FLOCK | F_WAIT), + ctx, NULL); + if (!error) + fp->f_flag |= FHASLOCK; out: (void)vnode_put(vp); out1: @@ -5292,7 +5222,7 @@ flock(proc_t p, struct flock_args *uap, __unused int32_t *retval) * EAGAIN Resource shortage. * * Implicit returns: - * On success, name of send right is stored at user-specified address. + * On success, name of send right is stored at user-specified address. */ int fileport_makeport(proc_t p, struct fileport_makeport_args *uap, @@ -5335,14 +5265,14 @@ fileport_makeport(proc_t p, struct fileport_makeport_args *uap, fg_drop(fp); goto out; } - + /* Add an entry. Deallocates port on failure. */ name = ipc_port_copyout_send(fileport, get_task_ipcspace(p->task)); if (!MACH_PORT_VALID(name)) { err = EINVAL; goto out; - } - + } + err = copyout(&name, user_portaddr, sizeof(mach_port_name_t)); if (err != 0) { goto out; @@ -5447,7 +5377,7 @@ fileport_makefd(proc_t p, struct fileport_makefd_args *uap, int32_t *retval) out: if ((fp != NULL) && (0 != err)) { fileproc_free(fp); - } + } if (IPC_PORT_NULL != port) { ipc_port_release_send(port); @@ -5815,7 +5745,7 @@ fo_write(struct fileproc *fp, struct uio *uio, int flags, vfs_context_t ctx) * function, it will need to revalidate/reacquire any cached * protected data obtained prior to the call. 
*/ -int +int fo_ioctl(struct fileproc *fp, u_long com, caddr_t data, vfs_context_t ctx) { int error; @@ -5824,7 +5754,7 @@ fo_ioctl(struct fileproc *fp, u_long com, caddr_t data, vfs_context_t ctx) error = (*fp->f_ops->fo_ioctl)(fp, com, data, ctx); proc_fdlock(vfs_context_proc(ctx)); return(error); -} +} /* @@ -5843,7 +5773,7 @@ fo_ioctl(struct fileproc *fp, u_long com, caddr_t data, vfs_context_t ctx) */ int fo_select(struct fileproc *fp, int which, void *wql, vfs_context_t ctx) -{ +{ return((*fp->f_ops->fo_select)(fp, which, wql, ctx)); } @@ -5863,7 +5793,7 @@ fo_select(struct fileproc *fp, int which, void *wql, vfs_context_t ctx) */ int fo_close(struct fileglob *fg, vfs_context_t ctx) -{ +{ return((*fg->fg_ops->fo_close)(fg, ctx)); } @@ -5878,8 +5808,9 @@ fo_close(struct fileglob *fg, vfs_context_t ctx) * kn pointer to knote to filter on * ctx VFS context for operation * - * Returns: 0 Success - * !0 Errno from kqueue filter + * Returns: (kn->kn_flags & EV_ERROR) error in kn->kn_data + * 0 Filter is not active + * !0 Filter is active */ int fo_kqfilter(struct fileproc *fp, struct knote *kn, vfs_context_t ctx) @@ -5892,7 +5823,7 @@ fo_kqfilter(struct fileproc *fp, struct knote *kn, vfs_context_t ctx) * process is opt-in by file type. */ boolean_t -file_issendable(proc_t p, struct fileproc *fp) +file_issendable(proc_t p, struct fileproc *fp) { proc_fdlock_assert(p, LCK_MTX_ASSERT_OWNED); diff --git a/bsd/kern/kern_event.c b/bsd/kern/kern_event.c index 6bc84137c..dd0390022 100644 --- a/bsd/kern/kern_event.c +++ b/bsd/kern/kern_event.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2015 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -86,6 +86,7 @@ #include #include +#include #include #include #include @@ -93,19 +94,24 @@ #include #include +#include + #include #include "net/net_str_id.h" #include -#if VM_PRESSURE_EVENTS -#include -#endif - #if CONFIG_MEMORYSTATUS #include #endif +/* + * JMM - this typedef needs to be unified with pthread_priority_t + * and mach_msg_priority_t. It also needs to be the same type + * everywhere. + */ +typedef int32_t qos_t; + MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system"); #define KQ_EVENT NO_EVENT64 @@ -114,11 +120,10 @@ static inline void kqlock(struct kqueue *kq); static inline void kqunlock(struct kqueue *kq); static int kqlock2knoteuse(struct kqueue *kq, struct knote *kn); -static int kqlock2knoteusewait(struct kqueue *kq, struct knote *kn); static int kqlock2knotedrop(struct kqueue *kq, struct knote *kn); -static int knoteuse2kqlock(struct kqueue *kq, struct knote *kn); +static int kqlock2knotedetach(struct kqueue *kq, struct knote *kn); +static int knoteuse2kqlock(struct kqueue *kq, struct knote *kn, int defer_drop); -static void kqueue_wakeup(struct kqueue *kq, int closed); static int kqueue_read(struct fileproc *fp, struct uio *uio, int flags, vfs_context_t ctx); static int kqueue_write(struct fileproc *fp, struct uio *uio, @@ -146,7 +151,7 @@ static const struct fileops kqueueops = { static int kevent_internal(struct proc *p, int fd, user_addr_t changelist, int nchanges, user_addr_t eventlist, int nevents, - user_addr_t data_out, user_size_t *data_available, + user_addr_t data_out, uint64_t data_available, unsigned int flags, user_addr_t utimeout, kqueue_continue_t continuation, int32_t *retval); @@ -156,27 +161,66 @@ static int kevent_copyout(struct kevent_internal_s *kevp, user_addr_t *addrp, struct proc *p, unsigned int flags); char * kevent_description(struct kevent_internal_s *kevp, char *s, size_t n); +static void kqueue_interrupt(struct kqueue *kq); static int kevent_callback(struct 
kqueue *kq, struct kevent_internal_s *kevp, void *data); static void kevent_continue(struct kqueue *kq, void *data, int error); static void kqueue_scan_continue(void *contp, wait_result_t wait_result); -static int kqueue_process(struct kqueue *kq, kevent_callback_t callback, - void *data, int *countp, struct proc *p); -static int kqueue_begin_processing(struct kqueue *kq); -static void kqueue_end_processing(struct kqueue *kq); -static int knote_process(struct knote *kn, kevent_callback_t callback, - void *data, struct kqtailq *inprocessp, struct proc *p); +static int kqueue_process(struct kqueue *kq, kevent_callback_t callback, void *callback_data, + struct filt_process_s *process_data, kq_index_t servicer_qos_index, + int *countp, struct proc *p); +static int kqueue_begin_processing(struct kqueue *kq, kq_index_t qos_index, unsigned int flags); +static void kqueue_end_processing(struct kqueue *kq, kq_index_t qos_index, unsigned int flags); +static struct kqtailq *kqueue_get_base_queue(struct kqueue *kq, kq_index_t qos_index); +static struct kqtailq *kqueue_get_high_queue(struct kqueue *kq, kq_index_t qos_index); +static int kqueue_queue_empty(struct kqueue *kq, kq_index_t qos_index); + +static struct kqtailq *kqueue_get_suppressed_queue(struct kqueue *kq, kq_index_t qos_index); + +static void kqworkq_request_thread(struct kqworkq *kqwq, kq_index_t qos_index); +static void kqworkq_request_help(struct kqworkq *kqwq, kq_index_t qos_index, uint32_t type); +static void kqworkq_update_override(struct kqworkq *kqwq, kq_index_t qos_index, kq_index_t override_index); +static void kqworkq_bind_thread(struct kqworkq *kqwq, kq_index_t qos_index, thread_t thread, unsigned int flags); +static void kqworkq_unbind_thread(struct kqworkq *kqwq, kq_index_t qos_index, thread_t thread, unsigned int flags); +static struct kqrequest *kqworkq_get_request(struct kqworkq *kqwq, kq_index_t qos_index); + + +static int knote_process(struct knote *kn, kevent_callback_t callback, void 
*callback_data, + struct filt_process_s *process_data, struct proc *p); +#if 0 static void knote_put(struct knote *kn); -static int knote_fdpattach(struct knote *kn, struct filedesc *fdp, - struct proc *p); +#endif + +static int knote_fdadd(struct knote *kn, struct proc *p); +static void knote_fdremove(struct knote *kn, struct proc *p); +static struct knote *knote_fdfind(struct kqueue *kq, struct kevent_internal_s *kev, struct proc *p); + static void knote_drop(struct knote *kn, struct proc *p); -static void knote_activate(struct knote *kn, int); -static void knote_deactivate(struct knote *kn); -static void knote_enqueue(struct knote *kn); -static void knote_dequeue(struct knote *kn); static struct knote *knote_alloc(void); static void knote_free(struct knote *kn); +static void knote_activate(struct knote *kn); +static void knote_deactivate(struct knote *kn); + +static void knote_enable(struct knote *kn); +static void knote_disable(struct knote *kn); + +static int knote_enqueue(struct knote *kn); +static void knote_dequeue(struct knote *kn); + +static void knote_suppress(struct knote *kn); +static void knote_unsuppress(struct knote *kn); +static void knote_wakeup(struct knote *kn); + +static kq_index_t knote_get_queue_index(struct knote *kn); +static struct kqtailq *knote_get_queue(struct knote *kn); +static struct kqtailq *knote_get_suppressed_queue(struct knote *kn); +static kq_index_t knote_get_req_index(struct knote *kn); +static kq_index_t knote_get_qos_index(struct knote *kn); +static void knote_set_qos_index(struct knote *kn, kq_index_t qos_index); +static kq_index_t knote_get_qos_override_index(struct knote *kn); +static void knote_set_qos_override_index(struct knote *kn, kq_index_t qos_index); + static int filt_fileattach(struct knote *kn); static struct filterops file_filtops = { .f_isfd = 1, @@ -185,10 +229,14 @@ static struct filterops file_filtops = { static void filt_kqdetach(struct knote *kn); static int filt_kqueue(struct knote *kn, long hint); 
+static int filt_kqtouch(struct knote *kn, struct kevent_internal_s *kev); +static int filt_kqprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev); static struct filterops kqread_filtops = { .f_isfd = 1, .f_detach = filt_kqdetach, .f_event = filt_kqueue, + .f_touch = filt_kqtouch, + .f_process = filt_kqprocess, }; /* placeholder for not-yet-implemented filters */ @@ -200,23 +248,16 @@ static struct filterops bad_filtops = { static int filt_procattach(struct knote *kn); static void filt_procdetach(struct knote *kn); static int filt_proc(struct knote *kn, long hint); +static int filt_proctouch(struct knote *kn, struct kevent_internal_s *kev); +static int filt_procprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev); static struct filterops proc_filtops = { .f_attach = filt_procattach, .f_detach = filt_procdetach, .f_event = filt_proc, + .f_touch = filt_proctouch, + .f_process = filt_procprocess, }; -#if VM_PRESSURE_EVENTS -static int filt_vmattach(struct knote *kn); -static void filt_vmdetach(struct knote *kn); -static int filt_vm(struct knote *kn, long hint); -static struct filterops vm_filtops = { - .f_attach = filt_vmattach, - .f_detach = filt_vmdetach, - .f_event = filt_vm, -}; -#endif /* VM_PRESSURE_EVENTS */ - #if CONFIG_MEMORYSTATUS extern struct filterops memorystatus_filtops; #endif /* CONFIG_MEMORYSTATUS */ @@ -229,19 +270,20 @@ extern struct filterops sig_filtops; static int filt_timerattach(struct knote *kn); static void filt_timerdetach(struct knote *kn); static int filt_timer(struct knote *kn, long hint); -static void filt_timertouch(struct knote *kn, struct kevent_internal_s *kev, - long type); +static int filt_timertouch(struct knote *kn, struct kevent_internal_s *kev); +static int filt_timerprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev); static struct filterops timer_filtops = { .f_attach = filt_timerattach, .f_detach = filt_timerdetach, .f_event 
= filt_timer, .f_touch = filt_timertouch, + .f_process = filt_timerprocess, }; /* Helpers */ static void filt_timerexpire(void *knx, void *param1); static int filt_timervalidate(struct knote *kn); -static void filt_timerupdate(struct knote *kn); +static void filt_timerupdate(struct knote *kn, int num_fired); static void filt_timercancel(struct knote *kn); #define TIMER_RUNNING 0x1 @@ -252,6 +294,8 @@ static void filt_timerlock(void); static void filt_timerunlock(void); static zone_t knote_zone; +static zone_t kqfile_zone; +static zone_t kqworkq_zone; #define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask)) @@ -266,53 +310,186 @@ extern struct filterops machport_filtops; static int filt_userattach(struct knote *kn); static void filt_userdetach(struct knote *kn); static int filt_user(struct knote *kn, long hint); -static void filt_usertouch(struct knote *kn, struct kevent_internal_s *kev, - long type); +static int filt_usertouch(struct knote *kn, struct kevent_internal_s *kev); +static int filt_userprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev); static struct filterops user_filtops = { .f_attach = filt_userattach, .f_detach = filt_userdetach, .f_event = filt_user, .f_touch = filt_usertouch, + .f_process = filt_userprocess, }; +static lck_spin_t _filt_userlock; +static void filt_userlock(void); +static void filt_userunlock(void); + +extern struct filterops pipe_rfiltops; +extern struct filterops pipe_wfiltops; +extern struct filterops ptsd_kqops; +extern struct filterops soread_filtops; +extern struct filterops sowrite_filtops; +extern struct filterops sock_filtops; +extern struct filterops soexcept_filtops; +extern struct filterops spec_filtops; +extern struct filterops bpfread_filtops; +extern struct filterops necp_fd_rfiltops; +extern struct filterops skywalk_channel_rfiltops; +extern struct filterops skywalk_channel_wfiltops; +extern struct filterops fsevent_filtops; +extern struct filterops vnode_filtops; + /* - * Table 
for all system-defined filters. + * + * Rules for adding new filters to the system: + * Public filters: + * - Add a new "EVFILT_" option value to bsd/sys/event.h (typically a negative value) + * in the exported section of the header + * - Update the EVFILT_SYSCOUNT value to reflect the new addition + * - Add a filterops to the sysfilt_ops array. Public filters should be added at the end + * of the Public Filters section in the array. + * Private filters: + * - Add a new "EVFILT_" value to bsd/sys/event.h (typically a positive value) + * in the XNU_KERNEL_PRIVATE section of the header + * - Update the EVFILTID_MAX value to reflect the new addition + * - Add a filterops to the sysfilt_ops. Private filters should be added at the end of + * the Private filters section of the array. */ -static struct filterops *sysfilt_ops[] = { - &file_filtops, /* EVFILT_READ */ - &file_filtops, /* EVFILT_WRITE */ -#if 0 - &aio_filtops, /* EVFILT_AIO */ -#else - &bad_filtops, /* EVFILT_AIO */ -#endif - &file_filtops, /* EVFILT_VNODE */ - &proc_filtops, /* EVFILT_PROC */ - &sig_filtops, /* EVFILT_SIGNAL */ - &timer_filtops, /* EVFILT_TIMER */ - &machport_filtops, /* EVFILT_MACHPORT */ - &fs_filtops, /* EVFILT_FS */ - &user_filtops, /* EVFILT_USER */ - &bad_filtops, /* unused */ -#if VM_PRESSURE_EVENTS - &vm_filtops, /* EVFILT_VM */ -#else - &bad_filtops, /* EVFILT_VM */ -#endif - &file_filtops, /* EVFILT_SOCK */ +static struct filterops *sysfilt_ops[EVFILTID_MAX] = { + /* Public Filters */ + [~EVFILT_READ] = &file_filtops, + [~EVFILT_WRITE] = &file_filtops, + [~EVFILT_AIO] = &bad_filtops, + [~EVFILT_VNODE] = &file_filtops, + [~EVFILT_PROC] = &proc_filtops, + [~EVFILT_SIGNAL] = &sig_filtops, + [~EVFILT_TIMER] = &timer_filtops, + [~EVFILT_MACHPORT] = &machport_filtops, + [~EVFILT_FS] = &fs_filtops, + [~EVFILT_USER] = &user_filtops, + &bad_filtops, + &bad_filtops, + [~EVFILT_SOCK] = &file_filtops, #if CONFIG_MEMORYSTATUS - &memorystatus_filtops, /* EVFILT_MEMORYSTATUS */ + 
[~EVFILT_MEMORYSTATUS] = &memorystatus_filtops, #else - &bad_filtops, /* EVFILT_MEMORYSTATUS */ + [~EVFILT_MEMORYSTATUS] = &bad_filtops, #endif + [~EVFILT_EXCEPT] = &file_filtops, + + /* Private filters */ + [EVFILTID_KQREAD] = &kqread_filtops, + [EVFILTID_PIPE_R] = &pipe_rfiltops, + [EVFILTID_PIPE_W] = &pipe_wfiltops, + [EVFILTID_PTSD] = &ptsd_kqops, + [EVFILTID_SOREAD] = &soread_filtops, + [EVFILTID_SOWRITE] = &sowrite_filtops, + [EVFILTID_SCK] = &sock_filtops, + [EVFILTID_SOEXCEPT] = &soexcept_filtops, + [EVFILTID_SPEC] = &spec_filtops, + [EVFILTID_BPFREAD] = &bpfread_filtops, + [EVFILTID_NECP_FD] = &necp_fd_rfiltops, + [EVFILTID_FSEVENT] = &fsevent_filtops, + [EVFILTID_VN] = &vnode_filtops }; +/* waitq prepost callback */ +void waitq_set__CALLING_PREPOST_HOOK__(void *kq_hook, void *knote_hook, int qos); + +#ifndef _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG +#define _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG 0x02000000 /* pthread event manager bit */ +#endif +#ifndef _PTHREAD_PRIORITY_OVERCOMMIT_FLAG +#define _PTHREAD_PRIORITY_OVERCOMMIT_FLAG 0x80000000 /* request overcommit threads */ +#endif +#ifndef _PTHREAD_PRIORITY_QOS_CLASS_MASK +#define _PTHREAD_PRIORITY_QOS_CLASS_MASK 0x003fff00 /* QoS class mask */ +#endif +#ifndef _PTHREAD_PRIORITY_QOS_CLASS_SHIFT_32 +#define _PTHREAD_PRIORITY_QOS_CLASS_SHIFT_32 8 +#endif + +static inline +qos_t canonicalize_kevent_qos(qos_t qos) +{ + unsigned long canonical; + + /* preserve manager and overcommit flags in this case */ + canonical = pthread_priority_canonicalize(qos, FALSE); + return (qos_t)canonical; +} + +static inline +kq_index_t qos_index_from_qos(qos_t qos, boolean_t propagation) +{ + kq_index_t qos_index; + unsigned long flags = 0; + + qos_index = (kq_index_t)thread_qos_from_pthread_priority( + (unsigned long)qos, &flags); + + if (!propagation && (flags & _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG)) + return KQWQ_QOS_MANAGER; + + return qos_index; +} + +static inline +qos_t qos_from_qos_index(kq_index_t qos_index) +{ + if 
(qos_index == KQWQ_QOS_MANAGER) + return _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG; + + if (qos_index == 0) + return 0; /* Unspecified */ + + /* Should have support from pthread kext support */ + return (1 << (qos_index - 1 + + _PTHREAD_PRIORITY_QOS_CLASS_SHIFT_32)); +} + +static inline +kq_index_t qos_index_for_servicer(int qos_class, thread_t thread, int flags) +{ + kq_index_t qos_index; + + if (flags & KEVENT_FLAG_WORKQ_MANAGER) + return KQWQ_QOS_MANAGER; + + /* + * If the caller didn't pass in a class (legacy pthread kext) + * the we use the thread policy QoS of the current thread. + */ + assert(qos_class != -1); + if (qos_class == -1) + qos_index = proc_get_thread_policy(thread, + TASK_POLICY_ATTRIBUTE, + TASK_POLICY_QOS); + else + qos_index = (kq_index_t)qos_class; + + assert(qos_index > 0 && qos_index < KQWQ_NQOS); + + return qos_index; +} + /* - * kqueue/note lock attributes and implementations + * kqueue/note lock implementations + * + * The kqueue lock guards the kq state, the state of its queues, + * and the kqueue-aware status and use counts of individual knotes. * - * kqueues have locks, while knotes have use counts - * Most of the knote state is guarded by the object lock. - * the knote "inuse" count and status use the kqueue lock. + * The kqueue workq lock is used to protect state guarding the + * interaction of the kqueue with the workq. This state cannot + * be guarded by the kq lock - as it needs to be taken when we + * already have the waitq set lock held (during the waitq hook + * callback). It might be better to use the waitq lock itself + * for this, but the IRQ requirements make that difficult). + * + * Knote flags, filter flags, and associated data are protected + * by the underlying object lock - and are only ever looked at + * by calling the filter to get a [consistent] snapshot of that + * data. 
*/ lck_grp_attr_t * kq_lck_grp_attr; lck_grp_t * kq_lck_grp; @@ -330,79 +507,143 @@ kqunlock(struct kqueue *kq) lck_spin_unlock(&kq->kq_lock); } + /* * Convert a kq lock to a knote use referece. * - * If the knote is being dropped, we can't get - * a use reference, so just return with it - * still locked. + * If the knote is being dropped, or has + * vanished, we can't get a use reference. + * Just return with it still locked. + * * - kq locked at entry * - unlock on exit if we get the use reference */ static int kqlock2knoteuse(struct kqueue *kq, struct knote *kn) { - if (kn->kn_status & KN_DROPPING) + if (kn->kn_status & (KN_DROPPING | KN_VANISHED)) return (0); - kn->kn_inuse++; - kqunlock(kq); - return (1); -} -/* - * Convert a kq lock to a knote use referece, - * but wait for attach and drop events to complete. - * - * If the knote is being dropped, we can't get - * a use reference, so just return with it - * still locked. - * - kq locked at entry - * - kq always unlocked on exit - */ -static int -kqlock2knoteusewait(struct kqueue *kq, struct knote *kn) -{ - if ((kn->kn_status & (KN_DROPPING | KN_ATTACHING)) != 0) { - kn->kn_status |= KN_USEWAIT; - waitq_assert_wait64((struct waitq *)kq->kq_wqs, - CAST_EVENT64_T(&kn->kn_status), - THREAD_UNINT, TIMEOUT_WAIT_FOREVER); - kqunlock(kq); - thread_block(THREAD_CONTINUE_NULL); - return (0); - } + assert(kn->kn_status & KN_ATTACHED); kn->kn_inuse++; kqunlock(kq); return (1); } + /* * Convert from a knote use reference back to kq lock. * * Drop a use reference and wake any waiters if * this is the last one. * - * The exit return indicates if the knote is - * still alive - but the kqueue lock is taken - * unconditionally. + * If someone is trying to drop the knote, but the + * caller has events they must deliver, take + * responsibility for the drop later - and wake the + * other attempted dropper in a manner that informs + * him of the transfer of responsibility. 
+ * + * The exit return indicates if the knote is still alive + * (or if not, the other dropper has been given the green + * light to drop it). + * + * The kqueue lock is re-taken unconditionally. */ static int -knoteuse2kqlock(struct kqueue *kq, struct knote *kn) +knoteuse2kqlock(struct kqueue *kq, struct knote *kn, int steal_drop) { + int dropped = 0; + kqlock(kq); if (--kn->kn_inuse == 0) { + if ((kn->kn_status & KN_ATTACHING) != 0) { kn->kn_status &= ~KN_ATTACHING; } + if ((kn->kn_status & KN_USEWAIT) != 0) { + wait_result_t result; + + /* If we need to, try and steal the drop */ + if (kn->kn_status & KN_DROPPING) { + if (steal_drop && !(kn->kn_status & KN_STOLENDROP)) { + kn->kn_status |= KN_STOLENDROP; + } else { + dropped = 1; + } + } + + /* wakeup indicating if ANY USE stole the drop */ + result = (kn->kn_status & KN_STOLENDROP) ? + THREAD_RESTART : THREAD_AWAKENED; + kn->kn_status &= ~KN_USEWAIT; - waitq_wakeup64_all((struct waitq *)kq->kq_wqs, + waitq_wakeup64_all((struct waitq *)&kq->kq_wqs, CAST_EVENT64_T(&kn->kn_status), - THREAD_AWAKENED, + result, WAITQ_ALL_PRIORITIES); + } else { + /* should have seen use-wait if dropping with use refs */ + assert((kn->kn_status & (KN_DROPPING|KN_STOLENDROP)) == 0); + } + + } else if (kn->kn_status & KN_DROPPING) { + /* not the last ref but want to steal a drop if present */ + if (steal_drop && ((kn->kn_status & KN_STOLENDROP) == 0)) { + kn->kn_status |= KN_STOLENDROP; + + /* but we now have to wait to be the last ref */ + kn->kn_status |= KN_USEWAIT; + waitq_assert_wait64((struct waitq *)&kq->kq_wqs, + CAST_EVENT64_T(&kn->kn_status), + THREAD_UNINT, TIMEOUT_WAIT_FOREVER); + kqunlock(kq); + thread_block(THREAD_CONTINUE_NULL); + kqlock(kq); + } else { + dropped = 1; } } - return ((kn->kn_status & KN_DROPPING) == 0); + + return (!dropped); +} + +/* + * Convert a kq lock to a knote use reference + * (for the purpose of detaching AND vanishing it). 
+ * + * If the knote is being dropped, we can't get + * a detach reference, so wait for the knote to + * finish dropping before returning. + * + * If the knote is being used for other purposes, + * we cannot detach it until those uses are done + * as well. Again, just wait for them to finish + * (caller will start over at lookup). + * + * - kq locked at entry + * - unlocked on exit + */ +static int +kqlock2knotedetach(struct kqueue *kq, struct knote *kn) +{ + if ((kn->kn_status & KN_DROPPING) || kn->kn_inuse) { + /* have to wait for dropper or current uses to go away */ + kn->kn_status |= KN_USEWAIT; + waitq_assert_wait64((struct waitq *)&kq->kq_wqs, + CAST_EVENT64_T(&kn->kn_status), + THREAD_UNINT, TIMEOUT_WAIT_FOREVER); + kqunlock(kq); + thread_block(THREAD_CONTINUE_NULL); + return (0); + } + assert((kn->kn_status & KN_VANISHED) == 0); + assert(kn->kn_status & KN_ATTACHED); + kn->kn_status &= ~KN_ATTACHED; + kn->kn_status |= KN_VANISHED; + kn->kn_inuse++; + kqunlock(kq); + return (1); } /* @@ -423,10 +664,13 @@ static int kqlock2knotedrop(struct kqueue *kq, struct knote *kn) { int oktodrop; + wait_result_t result; oktodrop = ((kn->kn_status & (KN_DROPPING | KN_ATTACHING)) == 0); - kn->kn_status &= ~KN_STAYQUEUED; + /* if another thread is attaching, they will become the dropping thread */ kn->kn_status |= KN_DROPPING; + knote_unsuppress(kn); + knote_dequeue(kn); if (oktodrop) { if (kn->kn_inuse == 0) { kqunlock(kq); @@ -434,27 +678,29 @@ kqlock2knotedrop(struct kqueue *kq, struct knote *kn) } } kn->kn_status |= KN_USEWAIT; - waitq_assert_wait64((struct waitq *)kq->kq_wqs, + waitq_assert_wait64((struct waitq *)&kq->kq_wqs, CAST_EVENT64_T(&kn->kn_status), THREAD_UNINT, TIMEOUT_WAIT_FOREVER); kqunlock(kq); - thread_block(THREAD_CONTINUE_NULL); - return (oktodrop); + result = thread_block(THREAD_CONTINUE_NULL); + /* THREAD_RESTART == another thread stole the knote drop */ + return (result == THREAD_AWAKENED); } +#if 0 /* * Release a knote use count reference. 
*/ static void knote_put(struct knote *kn) { - struct kqueue *kq = kn->kn_kq; + struct kqueue *kq = knote_get_kq(kn); kqlock(kq); if (--kn->kn_inuse == 0) { if ((kn->kn_status & KN_USEWAIT) != 0) { kn->kn_status &= ~KN_USEWAIT; - waitq_wakeup64_all((struct waitq *)kq->kq_wqs, + waitq_wakeup64_all((struct waitq *)&kq->kq_wqs, CAST_EVENT64_T(&kn->kn_status), THREAD_AWAKENED, WAITQ_ALL_PRIORITIES); @@ -462,6 +708,7 @@ knote_put(struct knote *kn) } kqunlock(kq); } +#endif static int filt_fileattach(struct knote *kn) @@ -479,10 +726,11 @@ filt_fileattach(struct knote *kn) static void filt_kqdetach(struct knote *kn) { - struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data; + struct kqfile *kqf = (struct kqfile *)kn->kn_fp->f_data; + struct kqueue *kq = &kqf->kqf_kqueue; kqlock(kq); - KNOTE_DETACH(&kq->kq_sel.si_note, kn); + KNOTE_DETACH(&kqf->kqf_sel.si_note, kn); kqunlock(kq); } @@ -491,9 +739,48 @@ static int filt_kqueue(struct knote *kn, __unused long hint) { struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data; + int count; + + count = kq->kq_count; + return (count > 0); +} + +static int +filt_kqtouch(struct knote *kn, struct kevent_internal_s *kev) +{ +#pragma unused(kev) + struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data; + int res; + + kqlock(kq); + kn->kn_data = kq->kq_count; + if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0) + kn->kn_udata = kev->udata; + res = (kn->kn_data > 0); + + kqunlock(kq); + + return res; +} + +static int +filt_kqprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev) +{ +#pragma unused(data) + struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data; + int res; + kqlock(kq); kn->kn_data = kq->kq_count; - return (kn->kn_data > 0); + res = (kn->kn_data > 0); + if (res) { + *kev = kn->kn_kevent; + if (kn->kn_flags & EV_CLEAR) + kn->kn_data = 0; + } + kqunlock(kq); + + return res; } static int @@ -503,12 +790,17 @@ filt_procattach(struct knote *kn) assert(PID_MAX < NOTE_PDATAMASK); - if ((kn->kn_sfflags & 
(NOTE_TRACK | NOTE_TRACKERR | NOTE_CHILD)) != 0) - return (ENOTSUP); + if ((kn->kn_sfflags & (NOTE_TRACK | NOTE_TRACKERR | NOTE_CHILD)) != 0) { + kn->kn_flags = EV_ERROR; + kn->kn_data = ENOTSUP; + return 0; + } p = proc_find(kn->kn_id); if (p == NULL) { - return (ESRCH); + kn->kn_flags = EV_ERROR; + kn->kn_data = ESRCH; + return 0; } const int NoteExitStatusBits = NOTE_EXIT | NOTE_EXITSTATUS; @@ -525,12 +817,13 @@ filt_procattach(struct knote *kn) break; /* parent-in-waiting => ok */ proc_rele(p); - return (EACCES); + kn->kn_flags = EV_ERROR; + kn->kn_data = EACCES; + return 0; } while (0); proc_klist_lock(); - kn->kn_flags |= EV_CLEAR; /* automatically set */ kn->kn_ptr.p_proc = p; /* store the proc handle */ KNOTE_ATTACH(&p->p_klist, kn); @@ -539,9 +832,14 @@ filt_procattach(struct knote *kn) proc_rele(p); + /* + * only captures edge-triggered events after this point + * so it can't already be fired. + */ return (0); } + /* * The knote may be attached to a different process, which may exit, * leaving nothing for the knote to be attached to. In that case, @@ -566,151 +864,150 @@ filt_procdetach(struct knote *kn) static int filt_proc(struct knote *kn, long hint) { + u_int event; + + /* ALWAYS CALLED WITH proc_klist_lock */ + /* * Note: a lot of bits in hint may be obtained from the knote * To free some of those bits, see Freeing up * bits in hint for filt_proc + * + * mask off extra data */ - /* hint is 0 when called from above */ - if (hint != 0) { - u_int event; - - /* ALWAYS CALLED WITH proc_klist_lock when (hint != 0) */ - - /* - * mask off extra data - */ - event = (u_int)hint & NOTE_PCTRLMASK; + event = (u_int)hint & NOTE_PCTRLMASK; - /* - * termination lifecycle events can happen while a debugger - * has reparented a process, in which case notifications - * should be quashed except to the tracing parent. 
When - * the debugger reaps the child (either via wait4(2) or - * process exit), the child will be reparented to the original - * parent and these knotes re-fired. - */ - if (event & NOTE_EXIT) { - if ((kn->kn_ptr.p_proc->p_oppid != 0) - && (kn->kn_kq->kq_p->p_pid != kn->kn_ptr.p_proc->p_ppid)) { - /* - * This knote is not for the current ptrace(2) parent, ignore. - */ - return 0; - } - } + /* + * termination lifecycle events can happen while a debugger + * has reparented a process, in which case notifications + * should be quashed except to the tracing parent. When + * the debugger reaps the child (either via wait4(2) or + * process exit), the child will be reparented to the original + * parent and these knotes re-fired. + */ + if (event & NOTE_EXIT) { + if ((kn->kn_ptr.p_proc->p_oppid != 0) + && (knote_get_kq(kn)->kq_p->p_pid != kn->kn_ptr.p_proc->p_ppid)) { + /* + * This knote is not for the current ptrace(2) parent, ignore. + */ + return 0; + } + } - /* - * if the user is interested in this event, record it. - */ - if (kn->kn_sfflags & event) - kn->kn_fflags |= event; + /* + * if the user is interested in this event, record it. + */ + if (kn->kn_sfflags & event) + kn->kn_fflags |= event; #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wdeprecated-declarations" - if ((event == NOTE_REAP) || ((event == NOTE_EXIT) && !(kn->kn_sfflags & NOTE_REAP))) { - kn->kn_flags |= (EV_EOF | EV_ONESHOT); - } + if ((event == NOTE_REAP) || ((event == NOTE_EXIT) && !(kn->kn_sfflags & NOTE_REAP))) { + kn->kn_flags |= (EV_EOF | EV_ONESHOT); + } #pragma clang diagnostic pop - /* - * The kernel has a wrapper in place that returns the same data - * as is collected here, in kn_data. Any changes to how - * NOTE_EXITSTATUS and NOTE_EXIT_DETAIL are collected - * should also be reflected in the proc_pidnoteexit() wrapper. 
- */ - if (event == NOTE_EXIT) { - kn->kn_data = 0; - if ((kn->kn_sfflags & NOTE_EXITSTATUS) != 0) { - kn->kn_fflags |= NOTE_EXITSTATUS; - kn->kn_data |= (hint & NOTE_PDATAMASK); + /* + * The kernel has a wrapper in place that returns the same data + * as is collected here, in kn_data. Any changes to how + * NOTE_EXITSTATUS and NOTE_EXIT_DETAIL are collected + * should also be reflected in the proc_pidnoteexit() wrapper. + */ + if (event == NOTE_EXIT) { + kn->kn_data = 0; + if ((kn->kn_sfflags & NOTE_EXITSTATUS) != 0) { + kn->kn_fflags |= NOTE_EXITSTATUS; + kn->kn_data |= (hint & NOTE_PDATAMASK); + } + if ((kn->kn_sfflags & NOTE_EXIT_DETAIL) != 0) { + kn->kn_fflags |= NOTE_EXIT_DETAIL; + if ((kn->kn_ptr.p_proc->p_lflag & + P_LTERM_DECRYPTFAIL) != 0) { + kn->kn_data |= NOTE_EXIT_DECRYPTFAIL; } - if ((kn->kn_sfflags & NOTE_EXIT_DETAIL) != 0) { - kn->kn_fflags |= NOTE_EXIT_DETAIL; - if ((kn->kn_ptr.p_proc->p_lflag & - P_LTERM_DECRYPTFAIL) != 0) { - kn->kn_data |= NOTE_EXIT_DECRYPTFAIL; - } - if ((kn->kn_ptr.p_proc->p_lflag & - P_LTERM_JETSAM) != 0) { - kn->kn_data |= NOTE_EXIT_MEMORY; - switch (kn->kn_ptr.p_proc->p_lflag & - P_JETSAM_MASK) { - case P_JETSAM_VMPAGESHORTAGE: - kn->kn_data |= NOTE_EXIT_MEMORY_VMPAGESHORTAGE; - break; - case P_JETSAM_VMTHRASHING: - kn->kn_data |= NOTE_EXIT_MEMORY_VMTHRASHING; - break; - case P_JETSAM_FCTHRASHING: - kn->kn_data |= NOTE_EXIT_MEMORY_FCTHRASHING; - break; - case P_JETSAM_VNODE: - kn->kn_data |= NOTE_EXIT_MEMORY_VNODE; - break; - case P_JETSAM_HIWAT: - kn->kn_data |= NOTE_EXIT_MEMORY_HIWAT; - break; - case P_JETSAM_PID: - kn->kn_data |= NOTE_EXIT_MEMORY_PID; - break; - case P_JETSAM_IDLEEXIT: - kn->kn_data |= NOTE_EXIT_MEMORY_IDLE; - break; - } - } - if ((kn->kn_ptr.p_proc->p_csflags & - CS_KILLED) != 0) { - kn->kn_data |= NOTE_EXIT_CSERROR; + if ((kn->kn_ptr.p_proc->p_lflag & + P_LTERM_JETSAM) != 0) { + kn->kn_data |= NOTE_EXIT_MEMORY; + switch (kn->kn_ptr.p_proc->p_lflag & P_JETSAM_MASK) { + case P_JETSAM_VMPAGESHORTAGE: + 
kn->kn_data |= NOTE_EXIT_MEMORY_VMPAGESHORTAGE; + break; + case P_JETSAM_VMTHRASHING: + kn->kn_data |= NOTE_EXIT_MEMORY_VMTHRASHING; + break; + case P_JETSAM_FCTHRASHING: + kn->kn_data |= NOTE_EXIT_MEMORY_FCTHRASHING; + break; + case P_JETSAM_VNODE: + kn->kn_data |= NOTE_EXIT_MEMORY_VNODE; + break; + case P_JETSAM_HIWAT: + kn->kn_data |= NOTE_EXIT_MEMORY_HIWAT; + break; + case P_JETSAM_PID: + kn->kn_data |= NOTE_EXIT_MEMORY_PID; + break; + case P_JETSAM_IDLEEXIT: + kn->kn_data |= NOTE_EXIT_MEMORY_IDLE; + break; } } + if ((kn->kn_ptr.p_proc->p_csflags & + CS_KILLED) != 0) { + kn->kn_data |= NOTE_EXIT_CSERROR; + } } } - /* atomic check, no locking need when called from above */ + /* if we have any matching state, activate the knote */ return (kn->kn_fflags != 0); } -#if VM_PRESSURE_EVENTS -/* - * Virtual memory kevents - * - * author: Matt Jacobson [matthew_jacobson@apple.com] - */ - static int -filt_vmattach(struct knote *kn) +filt_proctouch(struct knote *kn, struct kevent_internal_s *kev) { + int res; + + proc_klist_lock(); + + /* accept new filter flags and mask off output events no long interesting */ + kn->kn_sfflags = kev->fflags; + if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0) + kn->kn_udata = kev->udata; + + /* restrict the current results to the (smaller?) set of new interest */ /* - * The note will be cleared once the information has been flushed to - * the client. If there is still pressure, we will be re-alerted. + * For compatibility with previous implementations, we leave kn_fflags + * as they were before. */ - kn->kn_flags |= EV_CLEAR; - return (vm_knote_register(kn)); -} + //kn->kn_fflags &= kn->kn_sfflags; -static void -filt_vmdetach(struct knote *kn) -{ - vm_knote_unregister(kn); + res = (kn->kn_fflags != 0); + + proc_klist_unlock(); + + return res; } static int -filt_vm(struct knote *kn, long hint) +filt_procprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev) { - /* hint == 0 means this is just an alive? 
check (always true) */ - if (hint != 0) { - const pid_t pid = (pid_t)hint; - if ((kn->kn_sfflags & NOTE_VM_PRESSURE) && - (kn->kn_kq->kq_p->p_pid == pid)) { - kn->kn_fflags |= NOTE_VM_PRESSURE; - } - } +#pragma unused(data) + int res; - return (kn->kn_fflags != 0); + proc_klist_lock(); + res = (kn->kn_fflags != 0); + if (res) { + *kev = kn->kn_kevent; + kn->kn_flags |= EV_CLEAR; /* automatically set */ + kn->kn_fflags = 0; + kn->kn_data = 0; + } + proc_klist_unlock(); + return res; } -#endif /* VM_PRESSURE_EVENTS */ /* * filt_timervalidate - process data from user @@ -774,13 +1071,17 @@ filt_timervalidate(struct knote *kn) nanoseconds_to_absolutetime((uint64_t)seconds * NSEC_PER_SEC + nanoseconds, &now); - if (raw < now) { - /* time has already passed */ - kn->kn_ext[0] = 0; - } else { + /* if time is in the future */ + if (now < raw) { raw -= now; - clock_absolutetime_interval_to_deadline(raw, - &kn->kn_ext[0]); + + if (kn->kn_sfflags & NOTE_MACH_CONTINUOUS_TIME) { + clock_continuoustime_interval_to_deadline(raw, + &kn->kn_ext[0]); + } else { + clock_absolutetime_interval_to_deadline(raw, + &kn->kn_ext[0]); + } } } else { kn->kn_sdata = raw; @@ -801,16 +1102,24 @@ filt_timervalidate(struct knote *kn) * Timer filter lock is held. 
*/ static void -filt_timerupdate(struct knote *kn) +filt_timerupdate(struct knote *kn, int num_fired) { + assert(num_fired > 0); + /* if there's no interval, deadline is just in kn_ext[0] */ if (kn->kn_sdata == 0) return; /* if timer hasn't fired before, fire in interval nsecs */ if (kn->kn_ext[0] == 0) { - clock_absolutetime_interval_to_deadline(kn->kn_sdata, - &kn->kn_ext[0]); + assert(num_fired == 1); + if (kn->kn_sfflags & NOTE_MACH_CONTINUOUS_TIME) { + clock_continuoustime_interval_to_deadline(kn->kn_sdata, + &kn->kn_ext[0]); + } else { + clock_absolutetime_interval_to_deadline(kn->kn_sdata, + &kn->kn_ext[0]); + } } else { /* * If timer has fired before, schedule the next pop @@ -818,8 +1127,11 @@ filt_timerupdate(struct knote *kn) * * We could check for whether the deadline has expired, * but the thread call layer can handle that. + * + * Go forward an additional number of periods, in the case the + * timer fired multiple times while the system was asleep. */ - kn->kn_ext[0] += kn->kn_sdata; + kn->kn_ext[0] += (kn->kn_sdata * num_fired); } } @@ -849,8 +1161,8 @@ filt_timerexpire(void *knx, __unused void *spare) /* if someone is waiting for timer to pop */ if (kn->kn_hookid & TIMER_CANCELWAIT) { - struct kqueue *kq = kn->kn_kq; - waitq_wakeup64_all((struct waitq *)kq->kq_wqs, + struct kqueue *kq = knote_get_kq(kn); + waitq_wakeup64_all((struct waitq *)&kq->kq_wqs, CAST_EVENT64_T(&kn->kn_hook), THREAD_AWAKENED, WAITQ_ALL_PRIORITIES); @@ -866,7 +1178,7 @@ filt_timerexpire(void *knx, __unused void *spare) static void filt_timercancel(struct knote *kn) { - struct kqueue *kq = kn->kn_kq; + struct kqueue *kq = knote_get_kq(kn); thread_call_t callout = kn->kn_hook; boolean_t cancelled; @@ -878,7 +1190,7 @@ filt_timercancel(struct knote *kn) } else { /* we have to wait for the expire routine. 
*/ kn->kn_hookid |= TIMER_CANCELWAIT; - waitq_assert_wait64((struct waitq *)kq->kq_wqs, + waitq_assert_wait64((struct waitq *)&kq->kq_wqs, CAST_EVENT64_T(&kn->kn_hook), THREAD_UNINT, TIMEOUT_WAIT_FOREVER); filt_timerunlock(); @@ -897,17 +1209,23 @@ filt_timerattach(struct knote *kn) { thread_call_t callout; int error; + int res; callout = thread_call_allocate(filt_timerexpire, kn); - if (NULL == callout) - return (ENOMEM); + if (NULL == callout) { + kn->kn_flags = EV_ERROR; + kn->kn_data = ENOMEM; + return 0; + } filt_timerlock(); error = filt_timervalidate(kn); if (error != 0) { filt_timerunlock(); thread_call_free(callout); - return (error); + kn->kn_flags = EV_ERROR; + kn->kn_data = error; + return 0; } kn->kn_hook = (void*)callout; @@ -917,7 +1235,7 @@ filt_timerattach(struct knote *kn) if (kn->kn_sfflags & NOTE_ABSOLUTE) kn->kn_flags |= EV_ONESHOT; - filt_timerupdate(kn); + filt_timerupdate(kn, 1); if (kn->kn_ext[0]) { kn->kn_flags |= EV_CLEAR; unsigned int timer_flags = 0; @@ -930,6 +1248,8 @@ filt_timerattach(struct knote *kn) if (kn->kn_sfflags & NOTE_LEEWAY) timer_flags |= THREAD_CALL_DELAY_LEEWAY; + if (kn->kn_sfflags & NOTE_MACH_CONTINUOUS_TIME) + timer_flags |= THREAD_CALL_CONTINUOUS; thread_call_enter_delayed_with_leeway(callout, NULL, kn->kn_ext[0], kn->kn_ext[1], timer_flags); @@ -940,8 +1260,11 @@ filt_timerattach(struct knote *kn) kn->kn_data = 1; } + res = (kn->kn_data > 0); + filt_timerunlock(); - return (0); + + return res; } /* @@ -963,94 +1286,68 @@ filt_timerdetach(struct knote *kn) } - -static int -filt_timer(struct knote *kn, long hint) +static int filt_timer_num_fired(struct knote *kn) { - int result; - - if (hint) { - /* real timer pop -- timer lock held by filt_timerexpire */ - kn->kn_data++; - - if (((kn->kn_hookid & TIMER_CANCELWAIT) == 0) && - ((kn->kn_flags & EV_ONESHOT) == 0)) { - - /* evaluate next time to fire */ - filt_timerupdate(kn); + /* by default we fire a timer once */ + int num_fired = 1; - if (kn->kn_ext[0]) { - unsigned 
int timer_flags = 0; - - /* keep the callout and re-arm */ - if (kn->kn_sfflags & NOTE_CRITICAL) - timer_flags |= THREAD_CALL_DELAY_USER_CRITICAL; - else if (kn->kn_sfflags & NOTE_BACKGROUND) - timer_flags |= THREAD_CALL_DELAY_USER_BACKGROUND; - else - timer_flags |= THREAD_CALL_DELAY_USER_NORMAL; - - if (kn->kn_sfflags & NOTE_LEEWAY) - timer_flags |= THREAD_CALL_DELAY_LEEWAY; - - thread_call_enter_delayed_with_leeway(kn->kn_hook, NULL, - kn->kn_ext[0], kn->kn_ext[1], timer_flags); - - kn->kn_hookid |= TIMER_RUNNING; - } + /* + * When the time base is mach_continuous_time, we have to calculate + * the number of times the timer fired while we were asleep. + */ + if ((kn->kn_sfflags & NOTE_MACH_CONTINUOUS_TIME) && + (kn->kn_sdata != 0) && + (kn->kn_ext[0] != 0)) + { + const uint64_t now = mach_continuous_time(); + // time for timer to fire (right now) is kn_ext[0] + // kn_sdata is period for timer to fire + assert(now >= kn->kn_ext[0]); + assert(kn->kn_sdata > 0); + + const uint64_t overrun_ticks = now - kn->kn_ext[0]; + const uint64_t kn_sdata = kn->kn_sdata; + + if (overrun_ticks < kn_sdata) { + num_fired = 1; + } else if (overrun_ticks < (kn_sdata << 1)) { + num_fired = 2; + } else { + num_fired = (overrun_ticks / kn_sdata) + 1; } - - return (1); } - /* user-query */ - filt_timerlock(); - - result = (kn->kn_data != 0); - - filt_timerunlock(); - - return (result); + return num_fired; } - /* - * filt_timertouch - update knote with new user input + * filt_timer - post events to a timer knote * - * Cancel and restart the timer based on new user data. When - * the user picks up a knote, clear the count of how many timer - * pops have gone off (in kn_data). + * Count the timer fire and re-arm as requested. + * This always crosses the threshold of interest, + * so always return an indication that the knote + * should be activated (if not already). 
*/ -static void -filt_timertouch(struct knote *kn, struct kevent_internal_s *kev, long type) +static int +filt_timer( + struct knote *kn, + long hint) { - int error; - filt_timerlock(); - - switch (type) { - case EVENT_REGISTER: - /* cancel current call */ - filt_timercancel(kn); +#pragma unused(hint) - /* recalculate deadline */ - kn->kn_sdata = kev->data; - kn->kn_sfflags = kev->fflags; - kn->kn_ext[0] = kev->ext[0]; - kn->kn_ext[1] = kev->ext[1]; - - error = filt_timervalidate(kn); - if (error) { - /* no way to report error, so mark it in the knote */ - kn->kn_flags |= EV_ERROR; - kn->kn_data = error; - break; - } + /* real timer pop -- timer lock held by filt_timerexpire */ + int num_fired = filt_timer_num_fired(kn); + kn->kn_data += num_fired; - /* start timer if necessary */ - filt_timerupdate(kn); + if (((kn->kn_hookid & TIMER_CANCELWAIT) == 0) && + ((kn->kn_flags & EV_ONESHOT) == 0)) { + /* evaluate next time to fire */ + filt_timerupdate(kn, num_fired); if (kn->kn_ext[0]) { unsigned int timer_flags = 0; + + /* keep the callout and re-arm */ if (kn->kn_sfflags & NOTE_CRITICAL) timer_flags |= THREAD_CALL_DELAY_USER_CRITICAL; else if (kn->kn_sfflags & NOTE_BACKGROUND) @@ -1065,27 +1362,123 @@ filt_timertouch(struct knote *kn, struct kevent_internal_s *kev, long type) kn->kn_ext[0], kn->kn_ext[1], timer_flags); kn->kn_hookid |= TIMER_RUNNING; - } else { - /* pretend the timer has fired */ - kn->kn_data = 1; } + } + return (1); +} - break; - case EVENT_PROCESS: - /* reset the timer pop count in kn_data */ - *kev = kn->kn_kevent; - kev->ext[0] = 0; - kn->kn_data = 0; - if (kn->kn_flags & EV_CLEAR) - kn->kn_fflags = 0; - break; - default: - panic("%s: - invalid type (%ld)", __func__, type); - break; + +/* + * filt_timertouch - update timer knote with new user input + * + * Cancel and restart the timer based on new user data. When + * the user picks up a knote, clear the count of how many timer + * pops have gone off (in kn_data). 
+ */ +static int +filt_timertouch( + struct knote *kn, + struct kevent_internal_s *kev) +{ + int error; + int res; + + filt_timerlock(); + + /* cancel current call */ + filt_timercancel(kn); + + /* capture the new values used to compute deadline */ + kn->kn_sdata = kev->data; + kn->kn_sfflags = kev->fflags; + kn->kn_ext[0] = kev->ext[0]; + kn->kn_ext[1] = kev->ext[1]; + + if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0) + kn->kn_udata = kev->udata; + + /* recalculate deadline */ + error = filt_timervalidate(kn); + if (error) { + /* no way to report error, so mark it in the knote */ + filt_timerunlock(); + kn->kn_flags |= EV_ERROR; + kn->kn_data = error; + return 1; + } + + /* start timer if necessary */ + filt_timerupdate(kn, 1); + + if (kn->kn_ext[0]) { + unsigned int timer_flags = 0; + if (kn->kn_sfflags & NOTE_CRITICAL) + timer_flags |= THREAD_CALL_DELAY_USER_CRITICAL; + else if (kn->kn_sfflags & NOTE_BACKGROUND) + timer_flags |= THREAD_CALL_DELAY_USER_BACKGROUND; + else + timer_flags |= THREAD_CALL_DELAY_USER_NORMAL; + + if (kn->kn_sfflags & NOTE_LEEWAY) + timer_flags |= THREAD_CALL_DELAY_LEEWAY; + + thread_call_enter_delayed_with_leeway(kn->kn_hook, NULL, + kn->kn_ext[0], kn->kn_ext[1], timer_flags); + + kn->kn_hookid |= TIMER_RUNNING; + } else { + /* pretend the timer has fired */ + kn->kn_data = 1; + } + + /* capture if already fired */ + res = (kn->kn_data > 0); + + filt_timerunlock(); + + return res; +} + +/* + * filt_timerprocess - query state of knote and snapshot event data + * + * Determine if the timer has fired in the past, snapshot the state + * of the kevent for returning to user-space, and clear pending event + * counters for the next time. 
+ */ +static int +filt_timerprocess( + struct knote *kn, + __unused struct filt_process_s *data, + struct kevent_internal_s *kev) +{ + filt_timerlock(); + + /* user-query */ + if (kn->kn_data == 0) { + filt_timerunlock(); + return 0; } + /* + * Copy out the interesting kevent state, + * but don't leak out the raw time calculations. + */ + *kev = kn->kn_kevent; + kev->ext[0] = 0; + /* kev->ext[1] = 0; JMM - shouldn't we hide this too? */ + + /* + * reset the timer pop count in kn_data + * and (optionally) clear the fflags. + */ + kn->kn_data = 0; + if (kn->kn_flags & EV_CLEAR) + kn->kn_fflags = 0; + filt_timerunlock(); + return 1; } static void @@ -1100,17 +1493,30 @@ filt_timerunlock(void) lck_mtx_unlock(&_filt_timerlock); } +static void +filt_userlock(void) +{ + lck_spin_lock(&_filt_userlock); +} + +static void +filt_userunlock(void) +{ + lck_spin_unlock(&_filt_userlock); +} + static int filt_userattach(struct knote *kn) { /* EVFILT_USER knotes are not attached to anything in the kernel */ + /* Cant discover this knote until after attach - so no lock needed */ kn->kn_hook = NULL; if (kn->kn_fflags & NOTE_TRIGGER) { kn->kn_hookid = 1; } else { kn->kn_hookid = 0; } - return (0); + return (kn->kn_hookid); } static void @@ -1120,52 +1526,79 @@ filt_userdetach(__unused struct knote *kn) } static int -filt_user(struct knote *kn, __unused long hint) +filt_user( + __unused struct knote *kn, + __unused long hint) { - return (kn->kn_hookid); + panic("filt_user"); + return 0; } -static void -filt_usertouch(struct knote *kn, struct kevent_internal_s *kev, long type) +static int +filt_usertouch( + struct knote *kn, + struct kevent_internal_s *kev) { uint32_t ffctrl; - switch (type) { - case EVENT_REGISTER: - if (kev->fflags & NOTE_TRIGGER) { - kn->kn_hookid = 1; - } + int fflags; + int active; - ffctrl = kev->fflags & NOTE_FFCTRLMASK; - kev->fflags &= NOTE_FFLAGSMASK; - switch (ffctrl) { - case NOTE_FFNOP: - break; - case NOTE_FFAND: - OSBitAndAtomic(kev->fflags, 
&kn->kn_sfflags); - break; - case NOTE_FFOR: - OSBitOrAtomic(kev->fflags, &kn->kn_sfflags); - break; - case NOTE_FFCOPY: - kn->kn_sfflags = kev->fflags; - break; - } - kn->kn_sdata = kev->data; + filt_userlock(); + + ffctrl = kev->fflags & NOTE_FFCTRLMASK; + fflags = kev->fflags & NOTE_FFLAGSMASK; + switch (ffctrl) { + case NOTE_FFNOP: break; - case EVENT_PROCESS: - *kev = kn->kn_kevent; - kev->fflags = (volatile UInt32)kn->kn_sfflags; - kev->data = kn->kn_sdata; - if (kn->kn_flags & EV_CLEAR) { - kn->kn_hookid = 0; - kn->kn_data = 0; - kn->kn_fflags = 0; - } + case NOTE_FFAND: + kn->kn_sfflags &= fflags; break; - default: - panic("%s: - invalid type (%ld)", __func__, type); + case NOTE_FFOR: + kn->kn_sfflags |= fflags; + break; + case NOTE_FFCOPY: + kn->kn_sfflags = fflags; break; } + kn->kn_sdata = kev->data; + + if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0) + kn->kn_udata = kev->udata; + + if (kev->fflags & NOTE_TRIGGER) { + kn->kn_hookid = 1; + } + active = kn->kn_hookid; + + filt_userunlock(); + + return (active); +} + +static int +filt_userprocess( + struct knote *kn, + __unused struct filt_process_s *data, + struct kevent_internal_s *kev) +{ + filt_userlock(); + + if (kn->kn_hookid == 0) { + filt_userunlock(); + return 0; + } + + *kev = kn->kn_kevent; + kev->fflags = (volatile UInt32)kn->kn_sfflags; + kev->data = kn->kn_sdata; + if (kn->kn_flags & EV_CLEAR) { + kn->kn_hookid = 0; + kn->kn_data = 0; + kn->kn_fflags = 0; + } + filt_userunlock(); + + return 1; } /* @@ -1174,33 +1607,65 @@ filt_usertouch(struct knote *kn, struct kevent_internal_s *kev, long type) static int filt_badattach(__unused struct knote *kn) { - return (ENOTSUP); + kn->kn_flags |= EV_ERROR; + kn->kn_data = ENOTSUP; + return 0; } struct kqueue * -kqueue_alloc(struct proc *p) +kqueue_alloc(struct proc *p, unsigned int flags) { struct filedesc *fdp = p->p_fd; - struct kqueue *kq; + struct kqueue *kq = NULL; + int policy; + void *hook; + uint64_t kq_addr_offset; - MALLOC_ZONE(kq, struct 
kqueue *, sizeof (struct kqueue), M_KQUEUE, - M_WAITOK); - if (kq != NULL) { - struct waitq_set *wqs; - - wqs = waitq_set_alloc(SYNC_POLICY_FIFO | SYNC_POLICY_PREPOST | SYNC_POLICY_DISABLE_IRQ); - if (wqs != NULL) { - bzero(kq, sizeof (struct kqueue)); - lck_spin_init(&kq->kq_lock, kq_lck_grp, kq_lck_attr); - TAILQ_INIT(&kq->kq_head); - kq->kq_wqs = wqs; - kq->kq_p = p; - } else { - FREE_ZONE(kq, sizeof (struct kqueue), M_KQUEUE); - kq = NULL; + if (flags & KEVENT_FLAG_WORKQ) { + struct kqworkq *kqwq; + int i; + + kqwq = (struct kqworkq *)zalloc(kqworkq_zone); + if (kqwq == NULL) + return NULL; + + kq = &kqwq->kqwq_kqueue; + bzero(kqwq, sizeof (struct kqworkq)); + + kqwq->kqwq_state = KQ_WORKQ; + + for (i = 0; i < KQWQ_NBUCKETS; i++) { + TAILQ_INIT(&kq->kq_queue[i]); } + for (i = 0; i < KQWQ_NQOS; i++) { + TAILQ_INIT(&kqwq->kqwq_request[i].kqr_suppressed); + } + + lck_spin_init(&kqwq->kqwq_reqlock, kq_lck_grp, kq_lck_attr); + policy = SYNC_POLICY_FIFO; + hook = (void *)kqwq; + + } else { + struct kqfile *kqf; + + kqf = (struct kqfile *)zalloc(kqfile_zone); + if (kqf == NULL) + return NULL; + + kq = &kqf->kqf_kqueue; + bzero(kqf, sizeof (struct kqfile)); + TAILQ_INIT(&kq->kq_queue[0]); + TAILQ_INIT(&kqf->kqf_suppressed); + + policy = SYNC_POLICY_FIFO | SYNC_POLICY_PREPOST; + hook = NULL; + } + waitq_set_init(&kq->kq_wqs, policy, NULL, hook); + lck_spin_init(&kq->kq_lock, kq_lck_grp, kq_lck_attr); + kq->kq_p = p; + if (fdp->fd_knlistsize < 0) { proc_fdlock(p); if (fdp->fd_knlistsize < 0) @@ -1208,6 +1673,9 @@ kqueue_alloc(struct proc *p) proc_fdunlock(p); } + kq_addr_offset = ((uintptr_t)kq - (uintptr_t)VM_MIN_KERNEL_AND_KEXT_ADDRESS); + /* Assert that the address can be pointer compacted for use with knote */ + assert(kq_addr_offset < (uint64_t)(1ull << KNOTE_KQ_BITSIZE)); return (kq); } @@ -1243,12 +1711,11 @@ kqueue_dealloc(struct kqueue *kq) for (i = 0; i < fdp->fd_knlistsize; i++) { kn = SLIST_FIRST(&fdp->fd_knlist[i]); while (kn != NULL) { - if (kq == 
kn->kn_kq) { + if (kq == knote_get_kq(kn)) { kqlock(kq); proc_fdunlock(p); /* drop it ourselves or wait */ if (kqlock2knotedrop(kq, kn)) { - kn->kn_fop->f_detach(kn); knote_drop(kn, p); } proc_fdlock(p); @@ -1263,12 +1730,11 @@ kqueue_dealloc(struct kqueue *kq) for (i = 0; i < (int)fdp->fd_knhashmask + 1; i++) { kn = SLIST_FIRST(&fdp->fd_knhash[i]); while (kn != NULL) { - if (kq == kn->kn_kq) { + if (kq == knote_get_kq(kn)) { kqlock(kq); proc_fdunlock(p); /* drop it ourselves or wait */ if (kqlock2knotedrop(kq, kn)) { - kn->kn_fop->f_detach(kn); knote_drop(kn, p); } proc_fdlock(p); @@ -1283,13 +1749,22 @@ kqueue_dealloc(struct kqueue *kq) proc_fdunlock(p); /* - * waitq_set_free() clears all preposts and also remove the KQ's - * waitq set from any select sets to which it may belong. + * waitq_set_deinit() remove the KQ's waitq set from + * any select sets to which it may belong. */ - waitq_set_free(kq->kq_wqs); - kq->kq_wqs = NULL; + waitq_set_deinit(&kq->kq_wqs); lck_spin_destroy(&kq->kq_lock, kq_lck_grp); - FREE_ZONE(kq, sizeof (struct kqueue), M_KQUEUE); + + if (kq->kq_state & KQ_WORKQ) { + struct kqworkq *kqwq = (struct kqworkq *)kq; + + lck_spin_destroy(&kqwq->kqwq_reqlock, kq_lck_grp); + zfree(kqworkq_zone, kqwq); + } else { + struct kqfile *kqf = (struct kqfile *)kq; + + zfree(kqfile_zone, kqf); + } } int @@ -1305,7 +1780,7 @@ kqueue_body(struct proc *p, fp_allocfn_t fp_zalloc, void *cra, int32_t *retval) return (error); } - kq = kqueue_alloc(p); + kq = kqueue_alloc(p, 0); if (kq == NULL) { fp_free(p, fd, fp); return (ENOMEM); @@ -1398,11 +1873,15 @@ kevent_copyin(user_addr_t *addrp, struct kevent_internal_s *kevp, struct proc *p kevp->ident = kevqos.ident; kevp->filter = kevqos.filter; kevp->flags = kevqos.flags; + kevp->qos = kevqos.qos; +// kevp->xflags = kevqos.xflags; kevp->udata = kevqos.udata; kevp->fflags = kevqos.fflags; kevp->data = kevqos.data; kevp->ext[0] = kevqos.ext[0]; kevp->ext[1] = kevqos.ext[1]; + kevp->ext[2] = kevqos.ext[2]; + 
kevp->ext[3] = kevqos.ext[3]; } if (!error) *addrp += advance; @@ -1417,12 +1896,21 @@ kevent_copyout(struct kevent_internal_s *kevp, user_addr_t *addrp, struct proc * int advance; int error; + /* + * fully initialize the differnt output event structure + * types from the internal kevent (and some universal + * defaults for fields not represented in the internal + * form). + */ if (flags & KEVENT_FLAG_LEGACY32) { assert((flags & KEVENT_FLAG_STACK_EVENTS) == 0); if (IS_64BIT_PROCESS(p)) { struct user64_kevent kev64; + advance = sizeof (kev64); + bzero(&kev64, advance); + /* * deal with the special case of a user-supplied * value of (uintptr_t)-1. @@ -1435,18 +1923,18 @@ kevent_copyout(struct kevent_internal_s *kevp, user_addr_t *addrp, struct proc * kev64.fflags = kevp->fflags; kev64.data = (int64_t) kevp->data; kev64.udata = kevp->udata; - advance = sizeof (kev64); error = copyout((caddr_t)&kev64, addr, advance); } else { struct user32_kevent kev32; + advance = sizeof (kev32); + bzero(&kev32, advance); kev32.ident = (uint32_t)kevp->ident; kev32.filter = kevp->filter; kev32.flags = kevp->flags; kev32.fflags = kevp->fflags; kev32.data = (int32_t)kevp->data; kev32.udata = kevp->udata; - advance = sizeof (kev32); error = copyout((caddr_t)&kev32, addr, advance); } } else if (flags & KEVENT_FLAG_LEGACY64) { @@ -1456,6 +1944,7 @@ kevent_copyout(struct kevent_internal_s *kevp, user_addr_t *addrp, struct proc * if (flags & KEVENT_FLAG_STACK_EVENTS) { addr -= advance; } + bzero(&kev64, advance); kev64.ident = kevp->ident; kev64.filter = kevp->filter; kev64.flags = kevp->flags; @@ -1467,20 +1956,24 @@ kevent_copyout(struct kevent_internal_s *kevp, user_addr_t *addrp, struct proc * error = copyout((caddr_t)&kev64, addr, advance); } else { struct kevent_qos_s kevqos; - - bzero(&kevqos, sizeof (struct kevent_qos_s)); + advance = sizeof (struct kevent_qos_s); if (flags & KEVENT_FLAG_STACK_EVENTS) { addr -= advance; } + bzero(&kevqos, advance); kevqos.ident = kevp->ident; 
kevqos.filter = kevp->filter; kevqos.flags = kevp->flags; + kevqos.qos = kevp->qos; + kevqos.udata = kevp->udata; kevqos.fflags = kevp->fflags; + kevqos.xflags = 0; kevqos.data = (int64_t) kevp->data; - kevqos.udata = kevp->udata; kevqos.ext[0] = kevp->ext[0]; kevqos.ext[1] = kevp->ext[1]; + kevqos.ext[2] = kevp->ext[2]; + kevqos.ext[3] = kevp->ext[3]; error = copyout((caddr_t)&kevqos, addr, advance); } if (!error) { @@ -1492,23 +1985,84 @@ kevent_copyout(struct kevent_internal_s *kevp, user_addr_t *addrp, struct proc * return (error); } +static int +kevent_get_data_size(struct proc *p, + uint64_t data_available, + unsigned int flags, + user_size_t *residp) +{ + user_size_t resid; + int error = 0; + + if (data_available != USER_ADDR_NULL) { + if (flags & KEVENT_FLAG_KERNEL) { + resid = *(user_size_t *)(uintptr_t)data_available; + } else if (IS_64BIT_PROCESS(p)) { + user64_size_t usize; + error = copyin((user_addr_t)data_available, &usize, sizeof(usize)); + resid = (user_size_t)usize; + } else { + user32_size_t usize; + error = copyin((user_addr_t)data_available, &usize, sizeof(usize)); + resid = (user_size_t)usize; + } + if (error) + return(error); + } else { + resid = 0; + } + *residp = resid; + return 0; +} + +static int +kevent_put_data_size(struct proc *p, + uint64_t data_available, + unsigned int flags, + user_size_t resid) +{ + int error = 0; + + if (data_available) { + if (flags & KEVENT_FLAG_KERNEL) { + *(user_size_t *)(uintptr_t)data_available = resid; + } else if (IS_64BIT_PROCESS(p)) { + user64_size_t usize = (user64_size_t)resid; + error = copyout(&usize, (user_addr_t)data_available, sizeof(usize)); + } else { + user32_size_t usize = (user32_size_t)resid; + error = copyout(&usize, (user_addr_t)data_available, sizeof(usize)); + } + } + return error; +} + /* * kevent_continue - continue a kevent syscall after blocking * * assume we inherit a use count on the kq fileglob. 
*/ +__attribute__((noreturn)) static void kevent_continue(__unused struct kqueue *kq, void *data, int error) { struct _kevent *cont_args; struct fileproc *fp; + uint64_t data_available; + user_size_t data_size; + user_size_t data_resid; + unsigned int flags; int32_t *retval; int noutputs; int fd; struct proc *p = current_proc(); cont_args = (struct _kevent *)data; + data_available = cont_args->data_available; + flags = cont_args->process_data.fp_flags; + data_size = cont_args->process_data.fp_data_size; + data_resid = cont_args->process_data.fp_data_resid; noutputs = cont_args->eventout; retval = cont_args->retval; fd = cont_args->fd; @@ -1517,6 +2071,11 @@ kevent_continue(__unused struct kqueue *kq, void *data, int error) if (fp != NULL) fp_drop(p, fd, fp, 0); + /* don't abandon other output just because of residual copyout failures */ + if (error == 0 && data_available && data_resid != data_size) { + (void)kevent_put_data_size(p, data_available, flags, data_resid); + } + /* don't restart after signals... 
*/ if (error == ERESTART) error = EINTR; @@ -1537,14 +2096,14 @@ kevent(struct proc *p, struct kevent_args *uap, int32_t *retval) unsigned int flags = KEVENT_FLAG_LEGACY32; return kevent_internal(p, - uap->fd, - uap->changelist, uap->nchanges, - uap->eventlist, uap->nevents, - 0ULL, 0ULL, - flags, - uap->timeout, - kevent_continue, - retval); + uap->fd, + uap->changelist, uap->nchanges, + uap->eventlist, uap->nevents, + 0ULL, 0ULL, + flags, + uap->timeout, + kevent_continue, + retval); } int @@ -1557,64 +2116,31 @@ kevent64(struct proc *p, struct kevent64_args *uap, int32_t *retval) flags |= KEVENT_FLAG_LEGACY64; return kevent_internal(p, - uap->fd, - uap->changelist, uap->nchanges, - uap->eventlist, uap->nevents, - 0ULL, 0ULL, - flags, - uap->timeout, - kevent_continue, - retval); + uap->fd, + uap->changelist, uap->nchanges, + uap->eventlist, uap->nevents, + 0ULL, 0ULL, + flags, + uap->timeout, + kevent_continue, + retval); } int kevent_qos(struct proc *p, struct kevent_qos_args *uap, int32_t *retval) { - user_size_t usize = 0; - user_size_t ssize; - int error; - /* restrict to user flags */ uap->flags &= KEVENT_FLAG_USER; - if (uap->data_available) { - if (!IS_64BIT_PROCESS(p)) { - uint32_t csize; - - error = copyin(uap->data_available, (caddr_t)&csize, sizeof(csize)); - if (error) - return error; - usize = csize; - } else { - uint64_t csize; - error = copyin(uap->data_available, (caddr_t)&csize, sizeof(csize)); - if (error) - return error; - usize = csize; - } - } - ssize = usize; - - error = kevent_internal(p, - uap->fd, - uap->changelist, uap->nchanges, - uap->eventlist, uap->nevents, - uap->data_out, &usize, - uap->flags, - 0ULL, - kevent_continue, - retval); - - if (error == 0 && uap->data_available && usize != ssize) { - if (!IS_64BIT_PROCESS(p)) { - uint32_t csize = (uint32_t)usize; - - error = copyout((caddr_t)&csize, uap->data_available, sizeof(csize)); - } else { - error = copyout((caddr_t)&usize, uap->data_available, sizeof(usize)); - } - } - return 
error; + return kevent_internal(p, + uap->fd, + uap->changelist, uap->nchanges, + uap->eventlist, uap->nevents, + uap->data_out, (uint64_t)uap->data_available, + uap->flags, + 0ULL, + kevent_continue, + retval); } int @@ -1626,57 +2152,33 @@ kevent_qos_internal(struct proc *p, int fd, int32_t *retval) { return kevent_internal(p, - fd, - changelist, nchanges, - eventlist, nevents, - data_out, data_available, - flags, - 0ULL, - NULL, - retval); + fd, + changelist, nchanges, + eventlist, nevents, + data_out, (uint64_t)data_available, + (flags | KEVENT_FLAG_KERNEL), + 0ULL, + NULL, + retval); } static int -kevent_internal(struct proc *p, - int fd, - user_addr_t changelist, int nchanges, - user_addr_t ueventlist, int nevents, - user_addr_t data_out, user_size_t *data_available, - unsigned int flags, - user_addr_t utimeout, - kqueue_continue_t continuation, - int32_t *retval) +kevent_get_timeout(struct proc *p, + user_addr_t utimeout, + unsigned int flags, + struct timeval *atvp) { - struct _kevent *cont_args; - uthread_t ut; - struct kqueue *kq; - struct fileproc *fp = NULL; - struct kevent_internal_s kev; - int error = 0, noutputs; struct timeval atv; + int error = 0; -#if 1 - /* temporarily ignore these fields */ - (void)data_out; - (void)data_available; -#endif - - /* prepare to deal with stack-wise allocation of out events */ - if (flags & KEVENT_FLAG_STACK_EVENTS) { - int scale = ((flags & KEVENT_FLAG_LEGACY32) ? - (IS_64BIT_PROCESS(p) ? sizeof(struct user64_kevent) : - sizeof(struct user32_kevent)) : - ((flags & KEVENT_FLAG_LEGACY64) ? 
sizeof(struct kevent64_s) : - sizeof(struct kevent_qos_s))); - ueventlist += nevents * scale; - } - - /* convert timeout to absolute - if we have one (and not immediate) */ if (flags & KEVENT_FLAG_IMMEDIATE) { getmicrouptime(&atv); } else if (utimeout != USER_ADDR_NULL) { struct timeval rtv; - if (IS_64BIT_PROCESS(p)) { + if (flags & KEVENT_FLAG_KERNEL) { + struct timespec *tsp = (struct timespec *)utimeout; + TIMESPEC_TO_TIMEVAL(&rtv, tsp); + } else if (IS_64BIT_PROCESS(p)) { struct user64_timespec ts; error = copyin(utimeout, &ts, sizeof(ts)); if ((ts.tv_sec & 0xFFFFFFFF00000000ull) != 0) @@ -1699,6 +2201,41 @@ kevent_internal(struct proc *p, atv.tv_sec = 0; atv.tv_usec = 0; } + *atvp = atv; + return 0; +} + +static int +kevent_set_kq_mode(struct kqueue *kq, unsigned int flags) +{ + /* each kq should only be used for events of one type */ + kqlock(kq); + if (kq->kq_state & (KQ_KEV32 | KQ_KEV64 | KQ_KEV_QOS)) { + if (flags & KEVENT_FLAG_LEGACY32) { + if ((kq->kq_state & KQ_KEV32) == 0) { + kqunlock(kq); + return EINVAL; + } + } else if (kq->kq_state & KQ_KEV32) { + kqunlock(kq); + return EINVAL; + } + } else if (flags & KEVENT_FLAG_LEGACY32) { + kq->kq_state |= KQ_KEV32; + } else { + /* JMM - set KQ_KEVQOS when we are ready for exclusive */ + kq->kq_state |= KQ_KEV64; + } + kqunlock(kq); + return 0; +} + +static int +kevent_get_kq(struct proc *p, int fd, unsigned int flags, struct fileproc **fpp, struct kqueue **kqp) +{ + struct fileproc *fp = NULL; + struct kqueue *kq; + int error; if (flags & KEVENT_FLAG_WORKQ) { /* @@ -1709,18 +2246,12 @@ kevent_internal(struct proc *p, */ kq = p->p_wqkqueue; if (kq == NULL) { - struct kqueue *alloc_kq = kqueue_alloc(p); + struct kqueue *alloc_kq = kqueue_alloc(p, KEVENT_FLAG_WORKQ); if (alloc_kq == NULL) return ENOMEM; proc_fdlock(p); if (p->p_wqkqueue == NULL) { - /* - * The kq is marked as special - - * with unique interactions with - * the workq for this process. 
- */ - alloc_kq->kq_state |= KQ_WORKQ; kq = p->p_wqkqueue = alloc_kq; proc_fdunlock(p); } else { @@ -1734,28 +2265,69 @@ kevent_internal(struct proc *p, if ((error = fp_getfkq(p, fd, &fp, &kq)) != 0) return (error); } + if ((error = kevent_set_kq_mode(kq, flags)) != 0) { + /* drop the usecount */ + if (fp != NULL) + fp_drop(p, fd, fp, 0); + return error; + } + + *fpp = fp; + *kqp = kq; + return 0; +} - /* each kq should only be used for events of one type */ - kqlock(kq); - if (kq->kq_state & (KQ_KEV32 | KQ_KEV64 | KQ_KEV_QOS)) { - if (flags & KEVENT_FLAG_LEGACY32) { - if ((kq->kq_state & KQ_KEV32) == 0) { - error = EINVAL; - kqunlock(kq); - goto errorout; - } - } else if (kq->kq_state & KQ_KEV32) { - error = EINVAL; - kqunlock(kq); - goto errorout; - } - } else if (flags & KEVENT_FLAG_LEGACY32) { - kq->kq_state |= KQ_KEV32; - } else { - /* JMM - set KQ_KEVQOS when we are ready for exclusive */ - kq->kq_state |= KQ_KEV64; + +static int +kevent_internal(struct proc *p, + int fd, + user_addr_t changelist, int nchanges, + user_addr_t ueventlist, int nevents, + user_addr_t data_out, uint64_t data_available, + unsigned int flags, + user_addr_t utimeout, + kqueue_continue_t continuation, + int32_t *retval) +{ + struct _kevent *cont_args; + uthread_t ut; + struct kqueue *kq; + struct fileproc *fp = NULL; + struct kevent_internal_s kev; + int error, noutputs; + struct timeval atv; + user_size_t data_size; + user_size_t data_resid; + + /* Don't allow user-space threads to process output events from the workq kq */ + if ((flags & (KEVENT_FLAG_WORKQ | KEVENT_FLAG_KERNEL)) == KEVENT_FLAG_WORKQ && + !(flags & KEVENT_FLAG_ERROR_EVENTS) && nevents > 0) + return EINVAL; + + /* prepare to deal with stack-wise allocation of out events */ + if (flags & KEVENT_FLAG_STACK_EVENTS) { + int scale = ((flags & KEVENT_FLAG_LEGACY32) ? + (IS_64BIT_PROCESS(p) ? sizeof(struct user64_kevent) : + sizeof(struct user32_kevent)) : + ((flags & KEVENT_FLAG_LEGACY64) ? 
sizeof(struct kevent64_s) : + sizeof(struct kevent_qos_s))); + ueventlist += nevents * scale; } - kqunlock(kq); + + /* convert timeout to absolute - if we have one (and not immediate) */ + error = kevent_get_timeout(p, utimeout, flags, &atv); + if (error) + return error; + + /* copyin initial value of data residual from data_available */ + error = kevent_get_data_size(p, data_available, flags, &data_size); + if (error) + return error; + + /* get the kq we are going to be working on */ + error = kevent_get_kq(p, fd, flags, &fp, &kq); + if (error) + return error; /* register all the change requests the user provided... */ noutputs = 0; @@ -1764,24 +2336,33 @@ kevent_internal(struct proc *p, if (error) break; + /* Make sure user doesn't pass in any system flags */ kev.flags &= ~EV_SYSFLAGS; - error = kevent_register(kq, &kev, p); - if ((error || (kev.flags & EV_RECEIPT)) && nevents > 0) { - kev.flags = EV_ERROR; - kev.data = error; + + kevent_register(kq, &kev, p); + + if (nevents > 0 && + ((kev.flags & EV_ERROR) || (kev.flags & EV_RECEIPT))) { + if (kev.flags & EV_RECEIPT) { + kev.flags |= EV_ERROR; + kev.data = 0; + } error = kevent_copyout(&kev, &ueventlist, p, flags); if (error == 0) { nevents--; noutputs++; } + } else if (kev.flags & EV_ERROR) { + error = kev.data; } nchanges--; } /* short-circuit the scan if we only want error events */ - if (flags & KEVENT_FLAG_ERROR_EVENTS) + if (flags & KEVENT_FLAG_ERROR_EVENTS) nevents = 0; + /* process pending events */ if (nevents > 0 && noutputs == 0 && error == 0) { /* store the continuation/completion data in the uthread */ @@ -1793,13 +2374,27 @@ kevent_internal(struct proc *p, cont_args->eventlist = ueventlist; cont_args->eventcount = nevents; cont_args->eventout = noutputs; - cont_args->eventflags = flags; + cont_args->data_available = data_available; + cont_args->process_data.fp_fd = fd; + cont_args->process_data.fp_flags = flags; + cont_args->process_data.fp_data_out = data_out; + 
cont_args->process_data.fp_data_size = data_size; + cont_args->process_data.fp_data_resid = data_size; error = kqueue_scan(kq, kevent_callback, continuation, cont_args, + &cont_args->process_data, &atv, p); + /* process remaining outputs */ noutputs = cont_args->eventout; + data_resid = cont_args->process_data.fp_data_resid; + + /* copyout residual data size value (if it needs to be copied out) */ + /* don't abandon other output just because of residual copyout failures */ + if (error == 0 && data_available && data_resid != data_size) { + (void)kevent_put_data_size(p, data_available, flags, data_resid); + } } /* don't restart after signals... */ @@ -1809,7 +2404,6 @@ kevent_internal(struct proc *p, error = 0; if (error == 0) *retval = noutputs; -errorout: if (fp != NULL) fp_drop(p, fd, fp, 0); return (error); @@ -1836,7 +2430,7 @@ kevent_callback(__unused struct kqueue *kq, struct kevent_internal_s *kevp, * Copy out the appropriate amount of event data for this user. */ error = kevent_copyout(kevp, &cont_args->eventlist, current_proc(), - cont_args->eventflags); + cont_args->process_data.fp_flags); /* * If there isn't space for additional events, return @@ -1885,250 +2479,287 @@ kevent_description(struct kevent_internal_s *kevp, char *s, size_t n) * caller holds a reference on the kqueue */ -int +void kevent_register(struct kqueue *kq, struct kevent_internal_s *kev, __unused struct proc *ctxp) { struct proc *p = kq->kq_p; - struct filedesc *fdp = p->p_fd; struct filterops *fops; - struct fileproc *fp = NULL; struct knote *kn = NULL; - struct klist *list; + int result = 0; int error = 0; if (kev->filter < 0) { - if (kev->filter + EVFILT_SYSCOUNT < 0) - return (EINVAL); + if (kev->filter + EVFILT_SYSCOUNT < 0) { + error = EINVAL; + goto out; + } fops = sysfilt_ops[~kev->filter]; /* to 0-base index */ } else { - return (EINVAL); + error = EINVAL; + goto out; } + /* restrict EV_VANISHED to adding udata-specific dispatch kevents */ + if ((kev->flags & EV_VANISHED) && + 
(kev->flags & (EV_ADD | EV_DISPATCH2)) != (EV_ADD | EV_DISPATCH2)) { + error = EINVAL; + goto out; + } + + /* Simplify the flags - delete and disable overrule */ + if (kev->flags & EV_DELETE) + kev->flags &= ~EV_ADD; + if (kev->flags & EV_DISABLE) + kev->flags &= ~EV_ENABLE; + restart: - /* this iocount needs to be dropped if it is not registered */ - list = NULL; + proc_fdlock(p); - /* - * determine where to look for the knote - */ - if (fops->f_isfd) { - if ((error = fp_lookup(p, kev->ident, &fp, 1)) != 0) { - proc_fdunlock(p); - return (error); - } - /* fd-based knotes are linked off the fd table */ - if (kev->ident < (u_int)fdp->fd_knlistsize) { - list = &fdp->fd_knlist[kev->ident]; - } - } else if (fdp->fd_knhashmask != 0) { - /* hash non-fd knotes here too */ - list = &fdp->fd_knhash[KN_HASH((u_long)kev->ident, fdp->fd_knhashmask)]; - } + /* find the matching knote from the fd tables/hashes */ + kn = knote_fdfind(kq, kev, p); - /* - * scan the selected list looking for a match - */ - if (list != NULL) { - SLIST_FOREACH(kn, list, kn_link) { - if (kq == kn->kn_kq && - kev->ident == kn->kn_id && - kev->filter == kn->kn_filter) { - if (kev->flags & EV_UDATA_SPECIFIC) { - if ((kn->kn_flags & EV_UDATA_SPECIFIC) && - kev->udata == kn->kn_udata) { - break; /* matching udata-specific knote */ - } - } else if ((kn->kn_flags & EV_UDATA_SPECIFIC) == 0) { - break; /* matching non-udata-specific knote */ + if (kn == NULL) { + if (kev->flags & EV_ADD) { + struct fileproc *fp = NULL; + + /* grab a file reference for the new knote */ + if (fops->f_isfd) { + if ((error = fp_lookup(p, kev->ident, &fp, 1)) != 0) { + proc_fdunlock(p); + goto out; } } - } - } - /* - * kn now contains the matching knote, or NULL if no match - */ - if (kn == NULL) { - if ((kev->flags & (EV_ADD|EV_DELETE)) == EV_ADD) { kn = knote_alloc(); if (kn == NULL) { proc_fdunlock(p); error = ENOMEM; - goto done; + if (fp != NULL) + fp_drop(p, kev->ident, fp, 0); + goto out; } + kn->kn_fp = fp; - kn->kn_kq = 
kq; - kn->kn_tq = &kq->kq_head; - kn->kn_fop = fops; + knote_set_kq(kn,kq); + kn->kn_filtid = ~kev->filter; + kn->kn_inuse = 1; /* for f_attach() */ + kn->kn_status = KN_ATTACHING | KN_ATTACHED; + + /* was vanish support requested */ + if (kev->flags & EV_VANISHED) { + kev->flags &= ~EV_VANISHED; + kn->kn_status |= KN_REQVANISH; + } + + /* snapshot matching/dispatching protcol flags into knote */ + if (kev->flags & EV_DISPATCH) + kn->kn_status |= KN_DISPATCH; + if (kev->flags & EV_UDATA_SPECIFIC) + kn->kn_status |= KN_UDATA_SPECIFIC; + + /* + * copy the kevent state into knote + * protocol is that fflags and data + * are saved off, and cleared before + * calling the attach routine. + */ + kn->kn_kevent = *kev; kn->kn_sfflags = kev->fflags; kn->kn_sdata = kev->data; - kev->fflags = 0; - kev->data = 0; - kn->kn_kevent = *kev; - kn->kn_inuse = 1; /* for f_attach() */ - kn->kn_status = KN_ATTACHING; + kn->kn_fflags = 0; + kn->kn_data = 0; + + /* invoke pthread kext to convert kevent qos to thread qos */ + if (kq->kq_state & KQ_WORKQ) { + kn->kn_qos = canonicalize_kevent_qos(kn->kn_qos); + knote_set_qos_index(kn, qos_index_from_qos(kn->kn_qos, FALSE)); + knote_set_qos_override_index(kn, QOS_INDEX_KQFILE); + assert(knote_get_qos_index(kn) < KQWQ_NQOS); + } else { + knote_set_qos_index(kn, QOS_INDEX_KQFILE); + knote_set_qos_override_index(kn, QOS_INDEX_KQFILE); + } /* before anyone can find it */ if (kev->flags & EV_DISABLE) - kn->kn_status |= KN_DISABLED; + knote_disable(kn); - error = knote_fdpattach(kn, fdp, p); + /* Add the knote for lookup thru the fd table */ + error = knote_fdadd(kn, p); proc_fdunlock(p); if (error) { knote_free(kn); - goto done; + if (fp != NULL) + fp_drop(p, kev->ident, fp, 0); + goto out; } - /* - * apply reference count to knote structure, and - * do not release it at the end of this routine. 
- */ - fp = NULL; + /* fp reference count now applies to knote */ - error = fops->f_attach(kn); + /* call filter attach routine */ + result = fops->f_attach(kn); - kqlock(kq); + /* + * Trade knote use count for kq lock. + * Cannot be dropped because we held + * KN_ATTACHING throughout. + */ + knoteuse2kqlock(kq, kn, 1); - if (error != 0) { + if (kn->kn_flags & EV_ERROR) { /* * Failed to attach correctly, so drop. * All other possible users/droppers - * have deferred to us. + * have deferred to us. Save the error + * to return to our caller. */ + kn->kn_status &= ~KN_ATTACHED; kn->kn_status |= KN_DROPPING; + error = kn->kn_data; kqunlock(kq); knote_drop(kn, p); - goto done; - } else if (kn->kn_status & KN_DROPPING) { + goto out; + } + + /* end "attaching" phase - now just attached */ + kn->kn_status &= ~KN_ATTACHING; + + if (kn->kn_status & KN_DROPPING) { /* * Attach succeeded, but someone else * deferred their drop - now we have - * to do it for them (after detaching). + * to do it for them. */ kqunlock(kq); - kn->kn_fop->f_detach(kn); knote_drop(kn, p); - goto done; + goto out; } - kn->kn_status &= ~KN_ATTACHING; - kqunlock(kq); + + /* + * If the attach routine indicated that an + * event is already fired, activate the knote. + */ + if (result) + knote_activate(kn); + } else { proc_fdunlock(p); error = ENOENT; - goto done; + goto out; } + } else { /* existing knote - get kqueue lock */ kqlock(kq); proc_fdunlock(p); + if ((kn->kn_status & (KN_DROPPING | KN_ATTACHING)) != 0) { + /* + * The knote is not in a stable state, wait for that + * transition to complete and then redrive the lookup. 
+ */ + kn->kn_status |= KN_USEWAIT; + waitq_assert_wait64((struct waitq *)&kq->kq_wqs, + CAST_EVENT64_T(&kn->kn_status), + THREAD_UNINT, TIMEOUT_WAIT_FOREVER); + kqunlock(kq); + thread_block(THREAD_CONTINUE_NULL); + goto restart; + } + if (kev->flags & EV_DELETE) { + + /* + * If attempting to delete a disabled dispatch2 knote, + * we must wait for the knote to be re-enabled (unless + * it is being re-enabled atomically here). + */ if ((kev->flags & EV_ENABLE) == 0 && - (kev->flags & EV_DISPATCH2) == EV_DISPATCH2 && - (kn->kn_status & KN_DISABLED) == KN_DISABLED) { - /* mark for deferred drop */ - kn->kn_status |= KN_DEFERDROP; + (kn->kn_status & (KN_DISPATCH2 | KN_DISABLED)) == + (KN_DISPATCH2 | KN_DISABLED)) { + kn->kn_status |= KN_DEFERDELETE; kqunlock(kq); error = EINPROGRESS; + } else if (kqlock2knotedrop(kq, kn)) { + knote_drop(kn, p); } else { - knote_dequeue(kn); - kn->kn_status |= KN_DISABLED; - if (kqlock2knotedrop(kq, kn)) { - kn->kn_fop->f_detach(kn); - knote_drop(kn, p); - } else { - /* pretend we didn't find it */ - error = ENOENT; - } - } - goto done; - } - - /* update status flags for existing knote */ - if (kev->flags & EV_DISABLE) { - knote_dequeue(kn); - kn->kn_status |= KN_DISABLED; - - } else if ((kev->flags & EV_ENABLE) && - (kn->kn_status & KN_DISABLED)) { - kn->kn_status &= ~KN_DISABLED; - - /* handle deferred drop */ - if (kn->kn_status & KN_DEFERDROP) { - kn->kn_status &= ~KN_DEFERDROP; - kn->kn_flags |= (EV_DELETE | EV_ONESHOT); - knote_activate(kn, 0); - kqunlock(kq); - goto done; - } - - if (kn->kn_status & KN_ACTIVE) { - /* force re-activate if previously active */ - knote_activate(kn, 1); + /* + * The kqueue is unlocked, it's not being + * dropped, and kqlock2knotedrop returned 0: + * this means that someone stole the drop of + * the knote from us. 
+ */ + error = EINPROGRESS; } + goto out; } /* - * The user may change some filter values after the - * initial EV_ADD, but doing so will not reset any - * filter which have already been triggered. + * If we are re-enabling a deferred-delete knote, + * just enable it now and avoid calling the + * filter touch routine (it has delivered its + * last event already). */ - kn->kn_kevent.udata = kev->udata; - if (fops->f_isfd || fops->f_touch == NULL) { - kn->kn_sfflags = kev->fflags; - kn->kn_sdata = kev->data; + if ((kev->flags & EV_ENABLE) && + (kn->kn_status & KN_DEFERDELETE)) { + assert(kn->kn_status & KN_DISABLED); + knote_activate(kn); + knote_enable(kn); + kqunlock(kq); + goto out; } /* - * If somebody is in the middle of dropping this - * knote - go find/insert a new one. But we have - * wait for this one to go away first. Attaches - * running in parallel may also drop/modify the - * knote. Wait for those to complete as well and - * then start over if we encounter one. + * If we are disabling, do it before unlocking and + * calling the touch routine (so no processing can + * see the new kevent state before the disable is + * applied). */ - if (!kqlock2knoteusewait(kq, kn)) { - /* kqueue, proc_fdlock both unlocked */ - goto restart; - } + if (kev->flags & EV_DISABLE) + knote_disable(kn); /* - * Call touch routine to notify filter of changes - * in filter values. + * Convert the kqlock to a use reference on the + * knote so we can call the filter touch routine. */ - if (!fops->f_isfd && fops->f_touch != NULL) - fops->f_touch(kn, kev, EVENT_REGISTER); - } - /* still have use ref on knote */ + if (kqlock2knoteuse(kq, kn)) { + + /* + * Call touch routine to notify filter of changes + * in filter values (and to re-determine if any + * events are fired). + */ + result = knote_fops(kn)->f_touch(kn, kev); + + /* Get the kq lock back (don't defer droppers). 
*/ + if (!knoteuse2kqlock(kq, kn, 0)) { + kqunlock(kq); + goto out; + } + + /* Activate it if the touch routine said to */ + if (result) + knote_activate(kn); + } + + /* Enable the knote if called for */ + if (kev->flags & EV_ENABLE) + knote_enable(kn); - /* - * Invoke the filter routine to see if it should be enqueued now. - */ -#if 0 - if (kn->kn_fop->f_event(kn, 0)) { -#else - /* - * JMM - temporary workaround until rdar://problem/19986199 - * This potentially results in extra wakeups for KN_STAYQUEUED event types, - * but waking up only truly active ones (yet trying below to determine - * active status, by invoking the filter routine, is having side-effects). - */ - if ((kn->kn_status & KN_STAYQUEUED) || kn->kn_fop->f_event(kn, 0)) { -#endif - if (knoteuse2kqlock(kq, kn)) - knote_activate(kn, (kn->kn_status & KN_STAYQUEUED)); - kqunlock(kq); - } else { - knote_put(kn); } -done: - if (fp != NULL) - fp_drop(p, kev->ident, fp, 0); - return (error); + /* still have kqlock held and knote is valid */ + kqunlock(kq); + + out: + /* output local errors through the kevent */ + if (error) { + kev->flags |= EV_ERROR; + kev->data = error; + } } @@ -2138,168 +2769,217 @@ kevent_register(struct kqueue *kq, struct kevent_internal_s *kev, * Validate that it is really still a triggered event * by calling the filter routines (if necessary). Hold * a use reference on the knote to avoid it being detached. - * If it is still considered triggered, invoke the callback - * routine provided and move it to the provided inprocess - * queue. + * + * If it is still considered triggered, we will have taken + * a copy of the state under the filter lock. We use that + * snapshot to dispatch the knote for future processing (or + * not, if this was a lost event). + * + * Our caller assures us that nobody else can be processing + * events from this knote during the whole operation. But + * others can be touching or posting events to the knote + * interspersed with our processing it. 
* * caller holds a reference on the kqueue. * kqueue locked on entry and exit - but may be dropped */ static int -knote_process(struct knote *kn, - kevent_callback_t callback, - void *data, - struct kqtailq *inprocessp, - struct proc *p) +knote_process(struct knote *kn, + kevent_callback_t callback, + void *callback_data, + struct filt_process_s *process_data, + struct proc *p) { - struct kqueue *kq = kn->kn_kq; struct kevent_internal_s kev; - int touch; - int result; - int error; + struct kqueue *kq = knote_get_kq(kn); + int result = 0; + int error = 0; + + bzero(&kev, sizeof(kev)); /* - * Determine the kevent state we want to return. - * - * Some event states need to be revalidated before returning - * them, others we take the snapshot at the time the event - * was enqueued. - * - * Events with non-NULL f_touch operations must be touched. - * Triggered events must fill in kev for the callback. - * - * Convert our lock to a use-count and call the event's - * filter routine(s) to update. + * Must be active or stayactive + * Must be queued and not disabled/suppressed */ - if ((kn->kn_status & KN_DISABLED) != 0) { - result = 0; - touch = 0; - } else { - int revalidate; + assert(kn->kn_status & KN_QUEUED); + assert(kn->kn_status & (KN_ACTIVE|KN_STAYACTIVE)); + assert(!(kn->kn_status & (KN_DISABLED|KN_SUPPRESSED|KN_DROPPING))); + /* + * For deferred-drop or vanished events, we just create a fake + * event to acknowledge end-of-life. Otherwise, we call the + * filter's process routine to snapshot the kevent state under + * the filter's locking protocol. + */ + if (kn->kn_status & (KN_DEFERDELETE | KN_VANISHED)) { + /* create fake event */ + kev.filter = kn->kn_filter; + kev.ident = kn->kn_id; + kev.qos = kn->kn_qos; + kev.flags = (kn->kn_status & KN_DEFERDELETE) ? 
+ EV_DELETE : EV_VANISHED; + kev.flags |= (EV_DISPATCH2 | EV_ONESHOT); + kev.udata = kn->kn_udata; result = 1; - revalidate = ((kn->kn_status & KN_STAYQUEUED) != 0 || - (kn->kn_flags & EV_ONESHOT) == 0); - touch = (!kn->kn_fop->f_isfd && kn->kn_fop->f_touch != NULL); - if (revalidate || touch) { - if (revalidate) - knote_deactivate(kn); + knote_suppress(kn); + } else { - /* call the filter/touch routines with just a ref */ - if (kqlock2knoteuse(kq, kn)) { - /* if we have to revalidate, call the filter */ - if (revalidate) { - result = kn->kn_fop->f_event(kn, 0); - } + /* deactivate - so new activations indicate a wakeup */ + knote_deactivate(kn); - /* - * capture the kevent data - using touch if - * specified - */ - if (result && touch) { - kn->kn_fop->f_touch(kn, &kev, - EVENT_PROCESS); - } - if (result && (kn->kn_status & KN_TOUCH)) - kn->kn_fop->f_touch(kn, &kev, - EVENT_PROCESS); + /* suppress knotes to avoid returning the same event multiple times in a single call. */ + knote_suppress(kn); - /* - * convert back to a kqlock - bail if the knote - * went away - */ - if (!knoteuse2kqlock(kq, kn)) { - return (EJUSTRETURN); - } else if (result) { - /* - * if revalidated as alive, make sure - * it's active - */ - knote_activate(kn, 0); + /* convert lock to a knote use reference */ + if (!kqlock2knoteuse(kq, kn)) + panic("dropping knote found on queue\n"); - /* - * capture all events that occurred - * during filter - */ - if (!touch) { - kev = kn->kn_kevent; - } + /* call out to the filter to process with just a ref */ + result = knote_fops(kn)->f_process(kn, process_data, &kev); + + /* + * convert our reference back to a lock. accept drop + * responsibility from others if we've committed to + * delivering event data. + */ + if (!knoteuse2kqlock(kq, kn, result)) { + /* knote dropped */ + kn = NULL; + } + } + + if (kn != NULL) { + /* + * Determine how to dispatch the knote for future event handling. + * not-fired: just return (do not callout, leave deactivated). 
+ * One-shot: If dispatch2, enter deferred-delete mode (unless this is + * is the deferred delete event delivery itself). Otherwise, + * drop it. + * stolendrop:We took responsibility for someone else's drop attempt. + * treat this just like one-shot and prepare to turn it back + * into a deferred delete if required. + * Dispatch: don't clear state, just mark it disabled. + * Cleared: just leave it deactivated. + * Others: re-activate as there may be more events to handle. + * This will not wake up more handlers right now, but + * at the completion of handling events it may trigger + * more handler threads (TODO: optimize based on more than + * just this one event being detected by the filter). + */ + + if (result == 0) + return (EJUSTRETURN); + + if ((kev.flags & EV_ONESHOT) || (kn->kn_status & KN_STOLENDROP)) { + if ((kn->kn_status & (KN_DISPATCH2 | KN_DEFERDELETE)) == KN_DISPATCH2) { + /* defer dropping non-delete oneshot dispatch2 events */ + kn->kn_status |= KN_DEFERDELETE; + knote_disable(kn); - } else if ((kn->kn_status & KN_STAYQUEUED) == 0) { + /* if we took over another's drop clear those flags here */ + if (kn->kn_status & KN_STOLENDROP) { + assert(kn->kn_status & KN_DROPPING); /* - * was already dequeued, so just bail on - * this one + * the knote will be dropped when the + * deferred deletion occurs */ - return (EJUSTRETURN); + kn->kn_status &= ~(KN_DROPPING|KN_STOLENDROP); } - } else { - return (EJUSTRETURN); + } else if (kn->kn_status & KN_STOLENDROP) { + /* We now own the drop of the knote. 
*/ + assert(kn->kn_status & KN_DROPPING); + knote_unsuppress(kn); + kqunlock(kq); + knote_drop(kn, p); + kqlock(kq); + } else if (kqlock2knotedrop(kq, kn)) { + /* just EV_ONESHOT, _not_ DISPATCH2 */ + knote_drop(kn, p); + kqlock(kq); } - } else { - kev = kn->kn_kevent; + } else if (kn->kn_status & KN_DISPATCH) { + /* disable all dispatch knotes */ + knote_disable(kn); + } else if ((kev.flags & EV_CLEAR) == 0) { + /* re-activate in case there are more events */ + knote_activate(kn); } } - /* move knote onto inprocess queue */ - assert(kn->kn_tq == &kq->kq_head); - TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); - kn->kn_tq = inprocessp; - TAILQ_INSERT_TAIL(inprocessp, kn, kn_tqe); - /* - * Determine how to dispatch the knote for future event handling. - * not-fired: just return (do not callout). - * One-shot: If dispatch2, enter deferred-delete mode (unless this is - * is the deferred delete event delivery itself). Otherwise, - * deactivate and drop it. - * Clear: deactivate and clear the state. - * Dispatch: don't clear state, just deactivate it and mark it disabled. - * All others: just leave where they are. + * callback to handle each event as we find it. + * If we have to detach and drop the knote, do + * it while we have the kq unlocked. 
*/ + if (result) { + kqunlock(kq); + error = (callback)(kq, &kev, callback_data); + kqlock(kq); + } + return (error); +} - if (result == 0) { - return (EJUSTRETURN); - } else if ((kn->kn_flags & EV_ONESHOT) != 0) { - knote_deactivate(kn); - if ((kn->kn_flags & (EV_DISPATCH2|EV_DELETE)) == EV_DISPATCH2) { - /* defer dropping non-delete oneshot dispatch2 events */ - kn->kn_status |= (KN_DISABLED | KN_DEFERDROP); - kqunlock(kq); - } else if (kqlock2knotedrop(kq, kn)) { - kn->kn_fop->f_detach(kn); - knote_drop(kn, p); - } - } else if ((kn->kn_flags & (EV_CLEAR | EV_DISPATCH)) != 0) { - if ((kn->kn_flags & EV_DISPATCH) != 0) { - /* deactivate and disable all dispatch knotes */ - knote_deactivate(kn); - kn->kn_status |= KN_DISABLED; - } else if (!touch || kn->kn_fflags == 0) { - /* only deactivate if nothing since the touch */ - knote_deactivate(kn); - } - if (!touch && (kn->kn_flags & EV_CLEAR) != 0) { - /* manually clear non-touch knotes */ - kn->kn_data = 0; - kn->kn_fflags = 0; + +/* + * Return 0 to indicate that processing should proceed, + * -1 if there is nothing to process. + * + * Called with kqueue locked and returns the same way, + * but may drop lock temporarily. 
+ */ +static int +kqworkq_begin_processing(struct kqworkq *kqwq, kq_index_t qos_index, int flags) +{ + struct kqrequest *kqr; + thread_t self = current_thread(); + __assert_only struct uthread *ut = get_bsdthread_info(self); + thread_t thread; + + assert(kqwq->kqwq_state & KQ_WORKQ); + assert(qos_index < KQWQ_NQOS); + + kqwq_req_lock(kqwq); + kqr = kqworkq_get_request(kqwq, qos_index); + + thread = kqr->kqr_thread; + + /* manager skips buckets that haven't ask for its help */ + if (flags & KEVENT_FLAG_WORKQ_MANAGER) { + + /* If nothing for manager to do, just return */ + if ((kqr->kqr_state & KQWQ_THMANAGER) == 0) { + assert(kqr->kqr_thread != self); + kqwq_req_unlock(kqwq); + return -1; } - kqunlock(kq); + + /* bind manager thread from this time on */ + kqworkq_bind_thread(kqwq, qos_index, self, flags); + } else { - /* - * leave on inprocess queue. We'll - * move all the remaining ones back - * the kq queue and wakeup any - * waiters when we are done. - */ - kqunlock(kq); + /* must have been bound by now */ + assert(thread == self); + assert(ut->uu_kqueue_bound == qos_index); + assert((ut->uu_kqueue_flags & flags) == ut->uu_kqueue_flags); } - /* callback to handle each event as we find it */ - error = (callback)(kq, &kev, data); + /* nobody else should still be processing */ + assert(kqr->kqr_state & KQWQ_THREQUESTED); + assert((kqr->kqr_state & KQWQ_PROCESSING) == 0); + + /* anything left to process? */ + if (kqueue_queue_empty(&kqwq->kqwq_kqueue, qos_index)) { + kqwq_req_unlock(kqwq); + return -1; + } - kqlock(kq); - return (error); + /* convert to processing mode */ + /* reset workq triggers and thread requests - maybe processing */ + kqr->kqr_state &= ~(KQWQ_HOOKCALLED | KQWQ_WAKEUP); + kqr->kqr_state |= KQWQ_PROCESSING; + kqwq_req_unlock(kqwq); + return 0; } /* @@ -2308,45 +2988,378 @@ knote_process(struct knote *kn, * * Called with kqueue locked and returns the same way, * but may drop lock temporarily. + * May block. 
*/ static int -kqueue_begin_processing(struct kqueue *kq) +kqueue_begin_processing(struct kqueue *kq, kq_index_t qos_index, unsigned int flags) { + struct kqtailq *suppressq; + + if (kq->kq_state & KQ_WORKQ) + return kqworkq_begin_processing((struct kqworkq *)kq, qos_index, flags); + + assert(qos_index == QOS_INDEX_KQFILE); + + /* wait to become the exclusive processing thread */ for (;;) { - if (kq->kq_count == 0) { - return (-1); - } + if (kq->kq_state & KQ_DRAIN) + return -1; + + if ((kq->kq_state & KQ_PROCESSING) == 0) + break; /* if someone else is processing the queue, wait */ - if (kq->kq_nprocess != 0) { - waitq_assert_wait64((struct waitq *)kq->kq_wqs, - CAST_EVENT64_T(&kq->kq_nprocess), - THREAD_UNINT, TIMEOUT_WAIT_FOREVER); - kq->kq_state |= KQ_PROCWAIT; - kqunlock(kq); - thread_block(THREAD_CONTINUE_NULL); - kqlock(kq); + kq->kq_state |= KQ_PROCWAIT; + suppressq = kqueue_get_suppressed_queue(kq, qos_index); + waitq_assert_wait64((struct waitq *)&kq->kq_wqs, + CAST_EVENT64_T(suppressq), + THREAD_UNINT, TIMEOUT_WAIT_FOREVER); + + kqunlock(kq); + thread_block(THREAD_CONTINUE_NULL); + kqlock(kq); + } + + /* Nobody else processing */ + + /* clear pre-posts and KQ_WAKEUP now, in case we bail early */ + waitq_set_clear_preposts(&kq->kq_wqs); + kq->kq_state &= ~KQ_WAKEUP; + + /* anything left to process? */ + if (kqueue_queue_empty(kq, qos_index)) + return -1; + + /* convert to processing mode */ + kq->kq_state |= KQ_PROCESSING; + + return 0; +} + +/* + * kqworkq_end_processing - Complete the processing of a workq kqueue + * + * We may have to request new threads. + * This can happen there are no waiting processing threads and: + * - there were active events we never got to (count > 0) + * - we pended waitq hook callouts during processing + * - we pended wakeups while processing (or unsuppressing) + * + * Called with kqueue lock held. 
+ */ +static void +kqworkq_end_processing(struct kqworkq *kqwq, kq_index_t qos_index, int flags) +{ +#pragma unused(flags) + + struct kqueue *kq = &kqwq->kqwq_kqueue; + struct kqtailq *suppressq = kqueue_get_suppressed_queue(kq, qos_index); + + thread_t self = current_thread(); + __assert_only struct uthread *ut = get_bsdthread_info(self); + struct knote *kn; + struct kqrequest *kqr; + int queued_events; + uint16_t pended; + thread_t thread; + + assert(kqwq->kqwq_state & KQ_WORKQ); + assert(qos_index < KQWQ_NQOS); + + /* leave early if we are not even processing */ + kqwq_req_lock(kqwq); + kqr = kqworkq_get_request(kqwq, qos_index); + thread = kqr->kqr_thread; + + if (flags & KEVENT_FLAG_WORKQ_MANAGER) { + assert(ut->uu_kqueue_bound == KQWQ_QOS_MANAGER); + assert(ut->uu_kqueue_flags & KEVENT_FLAG_WORKQ_MANAGER); + + /* if this bucket didn't need manager help, bail */ + if ((kqr->kqr_state & KQWQ_THMANAGER) == 0) { + assert(thread != self); + kqwq_req_unlock(kqwq); + return; + } + + assert(kqr->kqr_state & KQWQ_THREQUESTED); + + /* unbound bucket - see if still needs servicing */ + if (thread == THREAD_NULL) { + assert((kqr->kqr_state & KQWQ_PROCESSING) == 0); + assert(TAILQ_EMPTY(suppressq)); } else { - kq->kq_nprocess = 1; - return (0); + assert(thread == self); } + + } else { + assert(thread == self); + assert(ut->uu_kqueue_bound == qos_index); + assert((ut->uu_kqueue_flags & KEVENT_FLAG_WORKQ_MANAGER) == 0); } + + kqwq_req_unlock(kqwq); + + /* Any events queued before we put suppressed ones back? */ + queued_events = !kqueue_queue_empty(kq, qos_index); + + /* + * Return suppressed knotes to their original state. + * For workq kqueues, suppressed ones that are still + * truly active (not just forced into the queue) will + * set flags we check below to see if anything got + * woken up. 
+ */ + while ((kn = TAILQ_FIRST(suppressq)) != NULL) { + assert(kn->kn_status & KN_SUPPRESSED); + knote_unsuppress(kn); + } + + kqwq_req_lock(kqwq); + + /* Determine if wakeup-type events were pended during servicing */ + pended = (kqr->kqr_state & (KQWQ_HOOKCALLED | KQWQ_WAKEUP)); + + /* unbind thread thread */ + kqworkq_unbind_thread(kqwq, qos_index, self, flags); + + /* Indicate that we are done processing */ + kqr->kqr_state &= ~(KQWQ_PROCESSING | \ + KQWQ_THREQUESTED | KQWQ_THMANAGER); + + /* + * request a new thread if events have happened + * (not just putting stay-active events back). + */ + if ((queued_events || pended) && + !kqueue_queue_empty(kq, qos_index)) { + kqworkq_request_thread(kqwq, qos_index); + } + + kqwq_req_unlock(kqwq); } /* * Called with kqueue lock held. */ static void -kqueue_end_processing(struct kqueue *kq) +kqueue_end_processing(struct kqueue *kq, kq_index_t qos_index, unsigned int flags) +{ + struct knote *kn; + struct kqtailq *suppressq; + int procwait; + + if (kq->kq_state & KQ_WORKQ) { + kqworkq_end_processing((struct kqworkq *)kq, qos_index, flags); + return; + } + + assert(qos_index == QOS_INDEX_KQFILE); + + /* + * Return suppressed knotes to their original state. + * For workq kqueues, suppressed ones that are still + * truly active (not just forced into the queue) will + * set flags we check below to see if anything got + * woken up. 
+ */ + suppressq = kqueue_get_suppressed_queue(kq, qos_index); + while ((kn = TAILQ_FIRST(suppressq)) != NULL) { + assert(kn->kn_status & KN_SUPPRESSED); + knote_unsuppress(kn); + } + + procwait = (kq->kq_state & KQ_PROCWAIT); + kq->kq_state &= ~(KQ_PROCESSING | KQ_PROCWAIT); + + if (procwait) { + /* first wake up any thread already waiting to process */ + waitq_wakeup64_all((struct waitq *)&kq->kq_wqs, + CAST_EVENT64_T(suppressq), + THREAD_AWAKENED, + WAITQ_ALL_PRIORITIES); + } +} + +/* + * kevent_qos_internal_bind - bind thread to processing kqueue + * + * Indicates that the provided thread will be responsible for + * servicing the particular QoS class index specified in the + * parameters. Once the binding is done, any overrides that may + * be associated with the cooresponding events can be applied. + * + * This should be called as soon as the thread identity is known, + * preferably while still at high priority during creation. + * + * - caller holds a reference on the kqueue. + * - the thread MUST call kevent_qos_internal after being bound + * or the bucket of events may never be delivered. + * - Nothing locked (may take mutex or block). 
+ */ + +int +kevent_qos_internal_bind( + struct proc *p, + int qos_class, + thread_t thread, + unsigned int flags) +{ + struct fileproc *fp = NULL; + struct kqueue *kq = NULL; + struct kqworkq *kqwq; + struct kqrequest *kqr; + struct uthread *ut; + kq_index_t qos_index; + int res = 0; + + assert(thread != THREAD_NULL); + assert(flags & KEVENT_FLAG_WORKQ); + + if (thread == THREAD_NULL || + (flags & KEVENT_FLAG_WORKQ) == 0) { + return EINVAL; + } + + ut = get_bsdthread_info(thread); + + /* find the kqueue */ + res = kevent_get_kq(p, -1, flags, &fp, &kq); + assert(fp == NULL); + if (res) + return res; + + /* get the qos index we're going to service */ + qos_index = qos_index_for_servicer(qos_class, thread, flags); + + /* No need to bind the manager thread to any bucket */ + if (qos_index == KQWQ_QOS_MANAGER) { + assert(ut->uu_kqueue_bound == 0); + ut->uu_kqueue_bound = qos_index; + ut->uu_kqueue_flags = flags; + return 0; + } + + kqlock(kq); + assert(kq->kq_state & KQ_WORKQ); + + kqwq = (struct kqworkq *)kq; + kqr = kqworkq_get_request(kqwq, qos_index); + + kqwq_req_lock(kqwq); + + /* + * A (non-emergency) request should have been made + * and nobody should already be servicing this bucket. + */ + assert(kqr->kqr_state & KQWQ_THREQUESTED); + assert((kqr->kqr_state & KQWQ_THMANAGER) == 0); + assert((kqr->kqr_state & KQWQ_PROCESSING) == 0); + + /* Is this is an extraneous bind? 
*/ + if (thread == kqr->kqr_thread) { + assert(ut->uu_kqueue_bound == qos_index); + goto out; + } + + /* nobody else bound and we're not bound elsewhere */ + assert(ut->uu_kqueue_bound == 0); + assert(ut->uu_kqueue_flags == 0); + assert(kqr->kqr_thread == THREAD_NULL); + + /* Don't bind if there is a conflict */ + if (kqr->kqr_thread != THREAD_NULL || + (kqr->kqr_state & KQWQ_THMANAGER)) { + res = EINPROGRESS; + goto out; + } + + /* finally bind the thread */ + kqr->kqr_thread = thread; + ut->uu_kqueue_bound = qos_index; + ut->uu_kqueue_flags = flags; + + /* add any pending overrides to the thread */ + if (kqr->kqr_override_delta) { + thread_add_ipc_override(thread, qos_index + kqr->kqr_override_delta); + } + +out: + kqwq_req_unlock(kqwq); + kqunlock(kq); + + return res; +} + +/* + * kevent_qos_internal_unbind - unbind thread from processing kqueue + * + * End processing the per-QoS bucket of events and allow other threads + * to be requested for future servicing. + * + * caller holds a reference on the kqueue. + * thread is the current thread. 
+ */ + +int +kevent_qos_internal_unbind( + struct proc *p, + int qos_class, + thread_t thread, + unsigned int flags) { - kq->kq_nprocess = 0; - if (kq->kq_state & KQ_PROCWAIT) { - kq->kq_state &= ~KQ_PROCWAIT; - waitq_wakeup64_all((struct waitq *)kq->kq_wqs, - CAST_EVENT64_T(&kq->kq_nprocess), - THREAD_AWAKENED, - WAITQ_ALL_PRIORITIES); + struct kqueue *kq; + struct uthread *ut; + struct fileproc *fp = NULL; + kq_index_t qos_index; + kq_index_t end_index; + int res; + + assert(flags & KEVENT_FLAG_WORKQ); + assert(thread == current_thread()); + + if (thread == THREAD_NULL || + (flags & KEVENT_FLAG_WORKQ) == 0) + return EINVAL; + + /* get the kq */ + res = kevent_get_kq(p, -1, flags, &fp, &kq); + assert(fp == NULL); + if (res) + return res; + + assert(kq->kq_state & KQ_WORKQ); + + /* get the index we have been servicing */ + qos_index = qos_index_for_servicer(qos_class, thread, flags); + + ut = get_bsdthread_info(thread); + + /* early out if we were already unbound - or never bound */ + if (ut->uu_kqueue_bound != qos_index) { + __assert_only struct kqworkq *kqwq = (struct kqworkq *)kq; + __assert_only struct kqrequest *kqr = kqworkq_get_request(kqwq, qos_index); + + assert(ut->uu_kqueue_bound == 0); + assert(ut->uu_kqueue_flags == 0); + assert(kqr->kqr_thread != thread); + return EALREADY; } + + /* unbind from all the buckets we might own */ + end_index = (qos_index == KQWQ_QOS_MANAGER) ? 
+ 0 : qos_index; + kqlock(kq); + do { + kqueue_end_processing(kq, qos_index, flags); + } while (qos_index-- > end_index); + kqunlock(kq); + + /* indicate that we are done processing in the uthread */ + ut->uu_kqueue_bound = 0; + ut->uu_kqueue_flags = 0; + + return 0; } /* @@ -2367,65 +3380,74 @@ kqueue_end_processing(struct kqueue *kq) static int kqueue_process(struct kqueue *kq, kevent_callback_t callback, - void *data, + void *callback_data, + struct filt_process_s *process_data, + kq_index_t servicer_qos_index, int *countp, struct proc *p) { - struct kqtailq inprocess; + unsigned int flags = process_data ? process_data->fp_flags : 0; + kq_index_t start_index, end_index, i; struct knote *kn; - int nevents; - int error; - - TAILQ_INIT(&inprocess); - - if (kqueue_begin_processing(kq) == -1) { - *countp = 0; - /* Nothing to process */ - return (0); - } + int nevents = 0; + int error = 0; /* - * Clear any pre-posted status from previous runs, so we - * only detect events that occur during this run. + * Based on the native QoS of the servicer, + * determine the range of QoSes that need checking */ - waitq_set_clear_preposts(kq->kq_wqs); + start_index = servicer_qos_index; + end_index = (start_index == KQWQ_QOS_MANAGER) ? 0 : start_index; + + i = start_index; - /* - * loop through the enqueued knotes, processing each one and - * revalidating those that need it. As they are processed, - * they get moved to the inprocess queue (so the loop can end). - */ - error = 0; - nevents = 0; + do { + if (kqueue_begin_processing(kq, i, flags) == -1) { + *countp = 0; + /* Nothing to process */ + continue; + } - while (error == 0 && - (kn = TAILQ_FIRST(&kq->kq_head)) != NULL) { - error = knote_process(kn, callback, data, &inprocess, p); - if (error == EJUSTRETURN) - error = 0; - else - nevents++; - } + /* + * loop through the enqueued knotes, processing each one and + * revalidating those that need it. 
As they are processed, + * they get moved to the inprocess queue (so the loop can end). + */ + error = 0; - /* - * With the kqueue still locked, move any knotes - * remaining on the inprocess queue back to the - * kq's queue and wake up any waiters. - */ - while ((kn = TAILQ_FIRST(&inprocess)) != NULL) { - assert(kn->kn_tq == &inprocess); - TAILQ_REMOVE(&inprocess, kn, kn_tqe); - kn->kn_tq = &kq->kq_head; - TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); - } + struct kqtailq *base_queue = kqueue_get_base_queue(kq, i); + struct kqtailq *queue = kqueue_get_high_queue(kq, i); + do { + while (error == 0 && + (kn = TAILQ_FIRST(queue)) != NULL) { + /* Process the knote */ + error = knote_process(kn, callback, callback_data, process_data, p); + if (error == EJUSTRETURN) + error = 0; + else + nevents++; + + /* break out if no more space for additional events */ + if (error == EWOULDBLOCK) { + if ((kq->kq_state & KQ_WORKQ) == 0) + kqueue_end_processing(kq, i, flags); + error = 0; + goto out; + } + } + } while (error == 0 && queue-- > base_queue); + + /* let somebody else process events if we're not in workq mode */ + if ((kq->kq_state & KQ_WORKQ) == 0) + kqueue_end_processing(kq, i, flags); - kqueue_end_processing(kq); + } while (i-- > end_index); +out: *countp = nevents; return (error); } - static void kqueue_scan_continue(void *data, wait_result_t wait_result) { @@ -2433,17 +3455,22 @@ kqueue_scan_continue(void *data, wait_result_t wait_result) uthread_t ut = (uthread_t)get_bsdthread_info(self); struct _kqueue_scan * cont_args = &ut->uu_kevent.ss_kqueue_scan; struct kqueue *kq = (struct kqueue *)data; + struct filt_process_s *process_data = cont_args->process_data; int error; int count; /* convert the (previous) wait_result to a proper error */ switch (wait_result) { - case THREAD_AWAKENED: + case THREAD_AWAKENED: { kqlock(kq); - error = kqueue_process(kq, cont_args->call, cont_args, &count, - current_proc()); + retry: + error = kqueue_process(kq, cont_args->call, 
cont_args->data, + process_data, cont_args->servicer_qos_index, + &count, current_proc()); if (error == 0 && count == 0) { - waitq_assert_wait64((struct waitq *)kq->kq_wqs, + if (kq->kq_state & KQ_WAKEUP) + goto retry; + waitq_assert_wait64((struct waitq *)&kq->kq_wqs, KQ_EVENT, THREAD_ABORTSAFE, cont_args->deadline); kq->kq_state |= KQ_SLEEP; @@ -2452,13 +3479,16 @@ kqueue_scan_continue(void *data, wait_result_t wait_result) /* NOTREACHED */ } kqunlock(kq); - break; + } break; case THREAD_TIMED_OUT: error = EWOULDBLOCK; break; case THREAD_INTERRUPTED: error = EINTR; break; + case THREAD_RESTART: + error = EBADF; + break; default: panic("%s: - invalid wait_result (%d)", __func__, wait_result); @@ -2489,17 +3519,30 @@ int kqueue_scan(struct kqueue *kq, kevent_callback_t callback, kqueue_continue_t continuation, - void *data, + void *callback_data, + struct filt_process_s *process_data, struct timeval *atvp, struct proc *p) { thread_continue_t cont = THREAD_CONTINUE_NULL; + kq_index_t servicer_qos_index; + unsigned int flags; uint64_t deadline; int error; int first; + int fd; assert(callback != NULL); + /* + * Determine which QoS index we are servicing + */ + flags = (process_data) ? process_data->fp_flags : 0; + fd = (process_data) ? process_data->fp_fd : -1; + servicer_qos_index = (kq->kq_state & KQ_WORKQ) ? + qos_index_for_servicer(fd, current_thread(), flags) : + QOS_INDEX_KQFILE; + first = 1; for (;;) { wait_result_t wait_result; @@ -2510,7 +3553,9 @@ kqueue_scan(struct kqueue *kq, * triggered. 
*/ kqlock(kq); - error = kqueue_process(kq, callback, data, &count, p); + error = kqueue_process(kq, callback, callback_data, + process_data, servicer_qos_index, + &count, p); if (error || count) break; /* lock still held */ @@ -2543,13 +3588,21 @@ kqueue_scan(struct kqueue *kq, cont_args->call = callback; cont_args->cont = continuation; cont_args->deadline = deadline; - cont_args->data = data; + cont_args->data = callback_data; + cont_args->process_data = process_data; + cont_args->servicer_qos_index = servicer_qos_index; cont = kqueue_scan_continue; } } + /* If awakened during processing, try again */ + if (kq->kq_state & KQ_WAKEUP) { + kqunlock(kq); + continue; + } + /* go ahead and wait */ - waitq_assert_wait64_leeway((struct waitq *)kq->kq_wqs, + waitq_assert_wait64_leeway((struct waitq *)&kq->kq_wqs, KQ_EVENT, THREAD_ABORTSAFE, TIMEOUT_URGENCY_USER_NORMAL, deadline, TIMEOUT_NO_LEEWAY); @@ -2562,9 +3615,11 @@ kqueue_scan(struct kqueue *kq, case THREAD_AWAKENED: continue; case THREAD_TIMED_OUT: - return (EWOULDBLOCK); + return EWOULDBLOCK; case THREAD_INTERRUPTED: - return (EINTR); + return EINTR; + case THREAD_RESTART: + return EBADF; default: panic("%s: - bad wait_result (%d)", __func__, wait_result); @@ -2616,16 +3671,18 @@ kqueue_select(struct fileproc *fp, int which, void *wq_link_id, __unused vfs_context_t ctx) { struct kqueue *kq = (struct kqueue *)fp->f_data; + struct kqtailq *queue; + struct kqtailq *suppressq; struct knote *kn; - struct kqtailq inprocessq; int retnum = 0; if (which != FREAD) return (0); - TAILQ_INIT(&inprocessq); - kqlock(kq); + + assert((kq->kq_state & KQ_WORKQ) == 0); + /* * If this is the first pass, link the wait queue associated with the * the kqueue onto the wait queue set for the select(). 
Normally we @@ -2639,7 +3696,7 @@ kqueue_select(struct fileproc *fp, int which, void *wq_link_id, struct uthread * ut = get_bsdthread_info(cur_act); kq->kq_state |= KQ_SEL; - waitq_link((struct waitq *)kq->kq_wqs, ut->uu_wqset, + waitq_link((struct waitq *)&kq->kq_wqs, ut->uu_wqset, WAITQ_SHOULD_LOCK, (uint64_t *)wq_link_id); /* always consume the reserved link object */ @@ -2655,56 +3712,64 @@ kqueue_select(struct fileproc *fp, int which, void *wq_link_id, * memcpy here because the pointer may not be properly aligned * on 32-bit systems. */ - memcpy(wq_link_id, (void *)&(kq->kq_wqs), sizeof(void *)); + void *wqptr = &kq->kq_wqs; + memcpy(wq_link_id, (void *)&wqptr, sizeof(void *)); } - if (kqueue_begin_processing(kq) == -1) { + if (kqueue_begin_processing(kq, QOS_INDEX_KQFILE, 0) == -1) { kqunlock(kq); return (0); } - if (kq->kq_count != 0) { + queue = kqueue_get_base_queue(kq, QOS_INDEX_KQFILE); + if (!TAILQ_EMPTY(queue)) { /* * there is something queued - but it might be a - * KN_STAYQUEUED knote, which may or may not have - * any events pending. So, we have to walk the - * list of knotes to see, and peek at the stay- - * queued ones to be really sure. + * KN_STAYACTIVE knote, which may or may not have + * any events pending. Otherwise, we have to walk + * the list of knotes to see, and peek at the + * (non-vanished) stay-active ones to be really sure. */ - while ((kn = (struct knote *)TAILQ_FIRST(&kq->kq_head)) != NULL) { - if ((kn->kn_status & KN_STAYQUEUED) == 0) { + while ((kn = (struct knote *)TAILQ_FIRST(queue)) != NULL) { + if (kn->kn_status & KN_ACTIVE) { retnum = 1; goto out; } + assert(kn->kn_status & KN_STAYACTIVE); + knote_suppress(kn); + } - TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); - TAILQ_INSERT_TAIL(&inprocessq, kn, kn_tqe); + /* + * There were no regular events on the queue, so take + * a deeper look at the stay-queued ones we suppressed. 
+ */ + suppressq = kqueue_get_suppressed_queue(kq, QOS_INDEX_KQFILE); + while ((kn = (struct knote *)TAILQ_FIRST(suppressq)) != NULL) { + unsigned peek = 1; + /* If didn't vanish while suppressed - peek at it */ if (kqlock2knoteuse(kq, kn)) { - unsigned peek; - peek = kn->kn_fop->f_peek(kn); - if (knoteuse2kqlock(kq, kn)) { - if (peek > 0) { - retnum = 1; - goto out; - } - } else { - retnum = 0; - } + peek = knote_fops(kn)->f_peek(kn); + + /* if it dropped while getting lock - move on */ + if (!knoteuse2kqlock(kq, kn, 0)) + continue; + } + + /* unsuppress it */ + knote_unsuppress(kn); + + /* has data or it has to report a vanish */ + if (peek > 0) { + retnum = 1; + goto out; } } } out: - /* Return knotes to active queue */ - while ((kn = TAILQ_FIRST(&inprocessq)) != NULL) { - TAILQ_REMOVE(&inprocessq, kn, kn_tqe); - kn->kn_tq = &kq->kq_head; - TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); - } - - kqueue_end_processing(kq); + kqueue_end_processing(kq, QOS_INDEX_KQFILE, 0); kqunlock(kq); return (retnum); } @@ -2716,9 +3781,10 @@ kqueue_select(struct fileproc *fp, int which, void *wq_link_id, static int kqueue_close(struct fileglob *fg, __unused vfs_context_t ctx) { - struct kqueue *kq = (struct kqueue *)fg->fg_data; + struct kqfile *kqf = (struct kqfile *)fg->fg_data; - kqueue_dealloc(kq); + assert((kqf->kqf_state & KQ_WORKQ) == 0); + kqueue_dealloc(&kqf->kqf_kqueue); fg->fg_data = NULL; return (0); } @@ -2732,12 +3798,18 @@ kqueue_close(struct fileglob *fg, __unused vfs_context_t ctx) static int kqueue_kqfilter(__unused struct fileproc *fp, struct knote *kn, __unused vfs_context_t ctx) { - struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data; - struct kqueue *parentkq = kn->kn_kq; + struct kqfile *kqf = (struct kqfile *)kn->kn_fp->f_data; + struct kqueue *kq = &kqf->kqf_kqueue; + struct kqueue *parentkq = knote_get_kq(kn); + + assert((kqf->kqf_state & KQ_WORKQ) == 0); if (parentkq == kq || - kn->kn_filter != EVFILT_READ) - return (1); + kn->kn_filter != 
EVFILT_READ) { + kn->kn_flags = EV_ERROR; + kn->kn_data = EINVAL; + return 0; + } /* * We have to avoid creating a cycle when nesting kqueues @@ -2755,7 +3827,9 @@ kqueue_kqfilter(__unused struct fileproc *fp, struct knote *kn, __unused vfs_con parentkq->kq_level < kq->kq_level) { kqunlock(parentkq); - return (1); + kn->kn_flags = EV_ERROR; + kn->kn_data = EINVAL; + return 0; } else { /* set parent level appropriately */ if (parentkq->kq_level == 0) @@ -2764,14 +3838,16 @@ kqueue_kqfilter(__unused struct fileproc *fp, struct knote *kn, __unused vfs_con parentkq->kq_level = kq->kq_level + 1; kqunlock(parentkq); - kn->kn_fop = &kqread_filtops; + kn->kn_filtid = EVFILTID_KQREAD; kqlock(kq); - KNOTE_ATTACH(&kq->kq_sel.si_note, kn); + KNOTE_ATTACH(&kqf->kqf_sel.si_note, kn); /* indicate nesting in child, if needed */ if (kq->kq_level == 0) kq->kq_level = 1; + + int count = kq->kq_count; kqunlock(kq); - return (0); + return (count > 0); } } @@ -2783,8 +3859,12 @@ static int kqueue_drain(struct fileproc *fp, __unused vfs_context_t ctx) { struct kqueue *kq = (struct kqueue *)fp->f_fglob->fg_data; + + assert((kq->kq_state & KQ_WORKQ) == 0); + kqlock(kq); - kqueue_wakeup(kq, 1); + kq->kq_state |= KQ_DRAIN; + kqueue_interrupt(kq); kqunlock(kq); return (0); } @@ -2793,6 +3873,8 @@ kqueue_drain(struct fileproc *fp, __unused vfs_context_t ctx) int kqueue_stat(struct kqueue *kq, void *ub, int isstat64, proc_t p) { + assert((kq->kq_state & KQ_WORKQ) == 0); + kqlock(kq); if (isstat64 != 0) { struct stat64 *sb64 = (struct stat64 *)ub; @@ -2809,58 +3891,549 @@ kqueue_stat(struct kqueue *kq, void *ub, int isstat64, proc_t p) sb64->st_blksize = sizeof(struct user32_kevent); sb64->st_mode = S_IFIFO; } else { - struct stat *sb = (struct stat *)ub; + struct stat *sb = (struct stat *)ub; + + bzero((void *)sb, sizeof(*sb)); + sb->st_size = kq->kq_count; + if (kq->kq_state & KQ_KEV_QOS) + sb->st_blksize = sizeof(struct kevent_qos_s); + else if (kq->kq_state & KQ_KEV64) + sb->st_blksize = 
sizeof(struct kevent64_s); + else if (IS_64BIT_PROCESS(p)) + sb->st_blksize = sizeof(struct user64_kevent); + else + sb->st_blksize = sizeof(struct user32_kevent); + sb->st_mode = S_IFIFO; + } + kqunlock(kq); + return (0); +} + + +/* + * Interact with the pthread kext to request a servicing there. + * Eventually, this will request threads at specific QoS levels. + * For now, it only requests a dispatch-manager-QoS thread, and + * only one-at-a-time. + * + * - Caller holds the workq request lock + * + * - May be called with the kqueue's wait queue set locked, + * so cannot do anything that could recurse on that. + */ +static void +kqworkq_request_thread( + struct kqworkq *kqwq, + kq_index_t qos_index) +{ + struct kqrequest *kqr; + + assert(kqwq->kqwq_state & KQ_WORKQ); + assert(qos_index < KQWQ_NQOS); + + kqr = kqworkq_get_request(kqwq, qos_index); + + /* + * If we have already requested a thread, and it hasn't + * started processing yet, there's no use hammering away + * on the pthread kext. + */ + if (kqr->kqr_state & KQWQ_THREQUESTED) + return; + + assert(kqr->kqr_thread == THREAD_NULL); + + /* request additional workq threads if appropriate */ + if (pthread_functions != NULL && + pthread_functions->workq_reqthreads != NULL) { + unsigned int flags = KEVENT_FLAG_WORKQ; + + /* Compute a priority based on qos_index. */ + struct workq_reqthreads_req_s request = { + .priority = qos_from_qos_index(qos_index), + .count = 1 + }; + + thread_t wqthread; + wqthread = (*pthread_functions->workq_reqthreads)(kqwq->kqwq_p, 1, &request); + kqr->kqr_state |= KQWQ_THREQUESTED; + + /* Have we been switched to the emergency/manager thread? 
*/ + if (wqthread == (thread_t)-1) { + flags |= KEVENT_FLAG_WORKQ_MANAGER; + wqthread = THREAD_NULL; + } else if (qos_index == KQWQ_QOS_MANAGER) + flags |= KEVENT_FLAG_WORKQ_MANAGER; + + /* bind the thread */ + kqworkq_bind_thread(kqwq, qos_index, wqthread, flags); + } +} + +/* + * If we aren't already busy processing events [for this QoS], + * request workq thread support as appropriate. + * + * TBD - for now, we don't segregate out processing by QoS. + * + * - May be called with the kqueue's wait queue set locked, + * so cannot do anything that could recurse on that. + */ +static void +kqworkq_request_help( + struct kqworkq *kqwq, + kq_index_t qos_index, + uint32_t type) +{ + struct kqrequest *kqr; + + /* convert to thread qos value */ + assert(qos_index < KQWQ_NQOS); + + kqwq_req_lock(kqwq); + kqr = kqworkq_get_request(kqwq, qos_index); + + /* + * If someone is processing the queue, just mark what type + * of attempt this was (from a kq wakeup or from a waitq hook). + * They'll be noticed at the end of servicing and a new thread + * will be requested at that point. + */ + if (kqr->kqr_state & KQWQ_PROCESSING) { + kqr->kqr_state |= type; + kqwq_req_unlock(kqwq); + return; + } + + kqworkq_request_thread(kqwq, qos_index); + kqwq_req_unlock(kqwq); +} + +/* + * These arrays described the low and high qindexes for a given qos_index. + * The values come from the chart in (must stay in sync). 
+ */ +static kq_index_t _kq_base_index[KQWQ_NQOS] = {0, 0, 6, 11, 15, 18, 20, 21}; +static kq_index_t _kq_high_index[KQWQ_NQOS] = {0, 5, 10, 14, 17, 19, 20, 21}; + +static struct kqtailq * +kqueue_get_base_queue(struct kqueue *kq, kq_index_t qos_index) +{ + assert(qos_index < KQWQ_NQOS); + return &kq->kq_queue[_kq_base_index[qos_index]]; +} + +static struct kqtailq * +kqueue_get_high_queue(struct kqueue *kq, kq_index_t qos_index) +{ + assert(qos_index < KQWQ_NQOS); + return &kq->kq_queue[_kq_high_index[qos_index]]; +} + +static int +kqueue_queue_empty(struct kqueue *kq, kq_index_t qos_index) +{ + struct kqtailq *base_queue = kqueue_get_base_queue(kq, qos_index); + struct kqtailq *queue = kqueue_get_high_queue(kq, qos_index); + + do { + if (!TAILQ_EMPTY(queue)) + return 0; + } while (queue-- > base_queue); + return 1; +} + +static struct kqtailq * +kqueue_get_suppressed_queue(struct kqueue *kq, kq_index_t qos_index) +{ + if (kq->kq_state & KQ_WORKQ) { + struct kqworkq *kqwq = (struct kqworkq *)kq; + struct kqrequest *kqr; + + kqr = kqworkq_get_request(kqwq, qos_index); + return &kqr->kqr_suppressed; + } else { + struct kqfile *kqf = (struct kqfile *)kq; + return &kqf->kqf_suppressed; + } +} + +static kq_index_t +knote_get_queue_index(struct knote *kn) +{ + kq_index_t override_index = knote_get_qos_override_index(kn); + kq_index_t qos_index = knote_get_qos_index(kn); + struct kqueue *kq = knote_get_kq(kn); + kq_index_t res; + + if ((kq->kq_state & KQ_WORKQ) == 0) { + assert(qos_index == 0); + assert(override_index == 0); + } + res = _kq_base_index[qos_index]; + if (override_index > qos_index) + res += override_index - qos_index; + + assert(res <= _kq_high_index[qos_index]); + return res; +} + +static struct kqtailq * +knote_get_queue(struct knote *kn) +{ + kq_index_t qindex = knote_get_queue_index(kn); + + return &(knote_get_kq(kn))->kq_queue[qindex]; +} + +static struct kqtailq * +knote_get_suppressed_queue(struct knote *kn) +{ + kq_index_t qos_index = 
knote_get_qos_index(kn); + struct kqueue *kq = knote_get_kq(kn); + + return kqueue_get_suppressed_queue(kq, qos_index); +} + +static kq_index_t +knote_get_req_index(struct knote *kn) +{ + return kn->kn_req_index; +} + +static kq_index_t +knote_get_qos_index(struct knote *kn) +{ + return kn->kn_qos_index; +} + +static void +knote_set_qos_index(struct knote *kn, kq_index_t qos_index) +{ + struct kqueue *kq = knote_get_kq(kn); + + assert(qos_index < KQWQ_NQOS); + assert((kn->kn_status & KN_QUEUED) == 0); + + if (kq->kq_state & KQ_WORKQ) + assert(qos_index > QOS_INDEX_KQFILE); + else + assert(qos_index == QOS_INDEX_KQFILE); + + /* always set requested */ + kn->kn_req_index = qos_index; + + /* only adjust in-use qos index when not suppressed */ + if ((kn->kn_status & KN_SUPPRESSED) == 0) + kn->kn_qos_index = qos_index; +} + +static kq_index_t +knote_get_qos_override_index(struct knote *kn) +{ + return kn->kn_qos_override; +} + +static void +knote_set_qos_override_index(struct knote *kn, kq_index_t override_index) +{ + struct kqueue *kq = knote_get_kq(kn); + kq_index_t qos_index = knote_get_qos_index(kn); + + assert((kn->kn_status & KN_QUEUED) == 0); + + if (override_index == KQWQ_QOS_MANAGER) + assert(qos_index == KQWQ_QOS_MANAGER); + else + assert(override_index < KQWQ_QOS_MANAGER); + + kn->kn_qos_override = override_index; + + /* + * If this is a workq kqueue, apply the override to the + * workq servicing thread. + */ + if (kq->kq_state & KQ_WORKQ) { + struct kqworkq *kqwq = (struct kqworkq *)kq; + + assert(qos_index > QOS_INDEX_KQFILE); + kqworkq_update_override(kqwq, qos_index, override_index); + } +} + +static void +kqworkq_update_override(struct kqworkq *kqwq, kq_index_t qos_index, kq_index_t override_index) +{ + struct kqrequest *kqr; + kq_index_t new_delta; + kq_index_t old_delta; + + new_delta = (override_index > qos_index) ? 
+ override_index - qos_index : 0; + + kqr = kqworkq_get_request(kqwq, qos_index); + + kqwq_req_lock(kqwq); + old_delta = kqr->kqr_override_delta; + + if (new_delta > old_delta) { + thread_t wqthread = kqr->kqr_thread; + + /* store the new override delta */ + kqr->kqr_override_delta = new_delta; + + /* apply the override to [incoming?] servicing thread */ + if (wqthread) { + /* only apply if non-manager */ + if ((kqr->kqr_state & KQWQ_THMANAGER) == 0) { + if (old_delta) + thread_update_ipc_override(wqthread, override_index); + else + thread_add_ipc_override(wqthread, override_index); + } + } + } + kqwq_req_unlock(kqwq); +} + +/* called with the kqworkq lock held */ +static void +kqworkq_bind_thread( + struct kqworkq *kqwq, + kq_index_t qos_index, + thread_t thread, + unsigned int flags) +{ + struct kqrequest *kqr = kqworkq_get_request(kqwq, qos_index); + thread_t old_thread = kqr->kqr_thread; + struct uthread *ut; + + assert(kqr->kqr_state & KQWQ_THREQUESTED); + + /* If no identity yet, just set flags as needed */ + if (thread == THREAD_NULL) { + assert(old_thread == THREAD_NULL); + + /* emergency or unindetified */ + if (flags & KEVENT_FLAG_WORKQ_MANAGER) { + assert((kqr->kqr_state & KQWQ_THMANAGER) == 0); + kqr->kqr_state |= KQWQ_THMANAGER; + } + return; + } + + /* Known thread identity */ + ut = get_bsdthread_info(thread); + + /* + * If this is a manager, and the manager request bit is + * not set, assure no other thread is bound. If the bit + * is set, make sure the old thread is us (or not set). 
+ */ + if (flags & KEVENT_FLAG_WORKQ_MANAGER) { + if ((kqr->kqr_state & KQWQ_THMANAGER) == 0) { + assert(old_thread == THREAD_NULL); + kqr->kqr_state |= KQWQ_THMANAGER; + } else if (old_thread == THREAD_NULL) { + kqr->kqr_thread = thread; + ut->uu_kqueue_bound = KQWQ_QOS_MANAGER; + ut->uu_kqueue_flags = (KEVENT_FLAG_WORKQ | + KEVENT_FLAG_WORKQ_MANAGER); + } else { + assert(thread == old_thread); + assert(ut->uu_kqueue_bound == KQWQ_QOS_MANAGER); + assert(ut->uu_kqueue_flags & KEVENT_FLAG_WORKQ_MANAGER); + } + return; + } + + /* Just a normal one-queue servicing thread */ + assert(old_thread == THREAD_NULL); + assert((kqr->kqr_state & KQWQ_THMANAGER) == 0); + + kqr->kqr_thread = thread; + + /* apply an ipc QoS override if one is needed */ + if (kqr->kqr_override_delta) + thread_add_ipc_override(thread, qos_index + kqr->kqr_override_delta); + + /* indicate that we are processing in the uthread */ + ut->uu_kqueue_bound = qos_index; + ut->uu_kqueue_flags = flags; +} + +/* called with the kqworkq lock held */ +static void +kqworkq_unbind_thread( + struct kqworkq *kqwq, + kq_index_t qos_index, + thread_t thread, + __unused unsigned int flags) +{ + struct kqrequest *kqr = kqworkq_get_request(kqwq, qos_index); + kq_index_t override = 0; + + assert(thread == current_thread()); + + /* + * If there is an override, drop it from the current thread + * and then we are free to recompute (a potentially lower) + * minimum override to apply to the next thread request. 
+ */ + if (kqr->kqr_override_delta) { + struct kqtailq *base_queue = kqueue_get_base_queue(&kqwq->kqwq_kqueue, qos_index); + struct kqtailq *queue = kqueue_get_high_queue(&kqwq->kqwq_kqueue, qos_index); + + /* if not bound to a manager thread, drop the current ipc override */ + if ((kqr->kqr_state & KQWQ_THMANAGER) == 0) { + assert(thread == kqr->kqr_thread); + thread_drop_ipc_override(thread); + } + + /* recompute the new override */ + do { + if (!TAILQ_EMPTY(queue)) { + override = queue - base_queue; + break; + } + } while (queue-- > base_queue); + } + + /* unbind the thread and apply the new override */ + kqr->kqr_thread = THREAD_NULL; + kqr->kqr_override_delta = override; +} + +struct kqrequest * +kqworkq_get_request(struct kqworkq *kqwq, kq_index_t qos_index) +{ + assert(qos_index < KQWQ_NQOS); + return &kqwq->kqwq_request[qos_index]; +} + +void +knote_adjust_qos(struct knote *kn, qos_t new_qos, qos_t new_override) +{ + if (knote_get_kq(kn)->kq_state & KQ_WORKQ) { + kq_index_t new_qos_index; + kq_index_t new_override_index; + kq_index_t servicer_qos_index; + + new_qos_index = qos_index_from_qos(new_qos, FALSE); + new_override_index = qos_index_from_qos(new_override, TRUE); + + /* make sure the servicer qos acts as a floor */ + servicer_qos_index = qos_index_from_qos(kn->kn_qos, FALSE); + if (servicer_qos_index > new_qos_index) + new_qos_index = servicer_qos_index; + if (servicer_qos_index > new_override_index) + new_override_index = servicer_qos_index; + + kqlock(knote_get_kq(kn)); + if (new_qos_index != knote_get_req_index(kn) || + new_override_index != knote_get_qos_override_index(kn)) { + if (kn->kn_status & KN_QUEUED) { + knote_dequeue(kn); + knote_set_qos_index(kn, new_qos_index); + knote_set_qos_override_index(kn, new_override_index); + knote_enqueue(kn); + knote_wakeup(kn); + } else { + knote_set_qos_index(kn, new_qos_index); + knote_set_qos_override_index(kn, new_override_index); + } + } + kqunlock(knote_get_kq(kn)); + } +} + +static void 
+knote_wakeup(struct knote *kn) +{ + struct kqueue *kq = knote_get_kq(kn); + + if (kq->kq_state & KQ_WORKQ) { + /* request a servicing thread */ + struct kqworkq *kqwq = (struct kqworkq *)kq; + kq_index_t qos_index = knote_get_qos_index(kn); + + kqworkq_request_help(kqwq, qos_index, KQWQ_WAKEUP); + + } else { + struct kqfile *kqf = (struct kqfile *)kq; + + /* flag wakeups during processing */ + if (kq->kq_state & KQ_PROCESSING) + kq->kq_state |= KQ_WAKEUP; + + /* wakeup a thread waiting on this queue */ + if (kq->kq_state & (KQ_SLEEP | KQ_SEL)) { + kq->kq_state &= ~(KQ_SLEEP | KQ_SEL); + waitq_wakeup64_all((struct waitq *)&kq->kq_wqs, + KQ_EVENT, + THREAD_AWAKENED, + WAITQ_ALL_PRIORITIES); + } - bzero((void *)sb, sizeof(*sb)); - sb->st_size = kq->kq_count; - if (kq->kq_state & KQ_KEV_QOS) - sb->st_blksize = sizeof(struct kevent_qos_s); - else if (kq->kq_state & KQ_KEV64) - sb->st_blksize = sizeof(struct kevent64_s); - else if (IS_64BIT_PROCESS(p)) - sb->st_blksize = sizeof(struct user64_kevent); - else - sb->st_blksize = sizeof(struct user32_kevent); - sb->st_mode = S_IFIFO; + /* wakeup other kqueues/select sets we're inside */ + KNOTE(&kqf->kqf_sel.si_note, 0); } - kqunlock(kq); - return (0); } - + /* * Called with the kqueue locked */ static void -kqueue_wakeup(struct kqueue *kq, int closed) +kqueue_interrupt(struct kqueue *kq) { - wait_result_t res = THREAD_NOT_WAITING; + assert((kq->kq_state & KQ_WORKQ) == 0); - if ((kq->kq_state & (KQ_SLEEP | KQ_SEL)) != 0 || kq->kq_nprocess > 0) { + /* wakeup sleeping threads */ + if ((kq->kq_state & (KQ_SLEEP | KQ_SEL)) != 0) { kq->kq_state &= ~(KQ_SLEEP | KQ_SEL); - res = waitq_wakeup64_all((struct waitq *)kq->kq_wqs, KQ_EVENT, - (closed) ? 
THREAD_INTERRUPTED : THREAD_AWAKENED, - WAITQ_ALL_PRIORITIES); + (void)waitq_wakeup64_all((struct waitq *)&kq->kq_wqs, + KQ_EVENT, + THREAD_RESTART, + WAITQ_ALL_PRIORITIES); } - /* request additional workq threads if appropriate */ - if (res == THREAD_NOT_WAITING && (kq->kq_state & KQ_WORKQ) && - pthread_functions != NULL && pthread_functions->workq_reqthreads != NULL) { - /* - * The special workq kq should be accumulating the counts of - * queued sources on a pthread_priority_t basis and we should - * be providing that here. For now, just hard-code a single - * entry request at a fixed (default) QOS. - */ - struct workq_reqthreads_req_s request = { - .priority = 0x020004ff, /* legacy event manager */ - .count = kq->kq_count }; - thread_t wqthread; + /* wakeup threads waiting their turn to process */ + if (kq->kq_state & KQ_PROCWAIT) { + struct kqtailq *suppressq; - wqthread = (*pthread_functions->workq_reqthreads)(kq->kq_p, 1, &request); - assert(wqthread == THREAD_NULL); + assert(kq->kq_state & KQ_PROCESSING); + + kq->kq_state &= ~KQ_PROCWAIT; + suppressq = kqueue_get_suppressed_queue(kq, QOS_INDEX_KQFILE); + (void)waitq_wakeup64_all((struct waitq *)&kq->kq_wqs, + CAST_EVENT64_T(suppressq), + THREAD_RESTART, + WAITQ_ALL_PRIORITIES); } } +/* + * Called back from waitq code when no threads waiting and the hook was set. + * + * Interrupts are likely disabled and spin locks are held - minimal work + * can be done in this context!!! + * + * JMM - in the future, this will try to determine which knotes match the + * wait queue wakeup and apply these wakeups against those knotes themselves. + * For now, all the events dispatched this way are dispatch-manager handled, + * so hard-code that for now. 
+ */ +void +waitq_set__CALLING_PREPOST_HOOK__(void *kq_hook, void *knote_hook, int qos) +{ +#pragma unused(knote_hook, qos) + + struct kqworkq *kqwq = (struct kqworkq *)kq_hook; + + assert(kqwq->kqwq_state & KQ_WORKQ); + kqworkq_request_help(kqwq, KQWQ_QOS_MANAGER, KQWQ_HOOKCALLED); +} + void klist_init(struct klist *list) { @@ -2878,7 +4451,7 @@ klist_init(struct klist *list) * * The object lock should also hold off pending * detach/drop operations. But we'll prevent it here - * too - just in case. + * too (by taking a use reference) - just in case. */ void knote(struct klist *list, long hint) @@ -2886,19 +4459,21 @@ knote(struct klist *list, long hint) struct knote *kn; SLIST_FOREACH(kn, list, kn_selnext) { - struct kqueue *kq = kn->kn_kq; + struct kqueue *kq = knote_get_kq(kn); kqlock(kq); + + /* If we can get a use reference - deliver event */ if (kqlock2knoteuse(kq, kn)) { int result; /* call the event with only a use count */ - result = kn->kn_fop->f_event(kn, hint); + result = knote_fops(kn)->f_event(kn, hint); /* if its not going away and triggered */ - if (knoteuse2kqlock(kq, kn) && result) - knote_activate(kn, 0); - /* lock held again */ + if (knoteuse2kqlock(kq, kn, 0) && result) + knote_activate(kn); + /* kq lock held */ } kqunlock(kq); } @@ -2927,6 +4502,53 @@ knote_detach(struct klist *list, struct knote *kn) return (SLIST_EMPTY(list)); } +/* + * knote_vanish - Indicate that the source has vanished + * + * If the knote has requested EV_VANISHED delivery, + * arrange for that. Otherwise, deliver a NOTE_REVOKE + * event for backward compatibility. + * + * The knote is marked as having vanished, but is not + * actually detached from the source in this instance. + * The actual detach is deferred until the knote drop. + * + * Our caller already has the object lock held. Calling + * the detach routine would try to take that lock + * recursively - which likely is not supported. 
+ */ +void +knote_vanish(struct klist *list) +{ + struct knote *kn; + struct knote *kn_next; + + SLIST_FOREACH_SAFE(kn, list, kn_selnext, kn_next) { + struct kqueue *kq = knote_get_kq(kn); + int result; + + kqlock(kq); + if ((kn->kn_status & KN_DROPPING) == 0) { + + /* If EV_VANISH supported - prepare to deliver one */ + if (kn->kn_status & KN_REQVANISH) { + kn->kn_status |= KN_VANISHED; + knote_activate(kn); + + } else if (kqlock2knoteuse(kq, kn)) { + /* call the event with only a use count */ + result = knote_fops(kn)->f_event(kn, NOTE_REVOKE); + + /* if its not going away and triggered */ + if (knoteuse2kqlock(kq, kn, 0) && result) + knote_activate(kn); + /* lock held again */ + } + } + kqunlock(kq); + } +} + /* * For a given knote, link a provided wait queue directly with the kqueue. * Wakeups will happen via recursive wait queue support. But nothing will move @@ -2934,18 +4556,19 @@ knote_detach(struct klist *list, struct knote *kn) * we permanently enqueue them here. * * kqueue and knote references are held by caller. + * waitq locked by caller. * * caller provides the wait queue link structure. */ int knote_link_waitq(struct knote *kn, struct waitq *wq, uint64_t *reserved_link) { - struct kqueue *kq = kn->kn_kq; + struct kqueue *kq = knote_get_kq(kn); kern_return_t kr; - kr = waitq_link(wq, kq->kq_wqs, WAITQ_SHOULD_LOCK, reserved_link); + kr = waitq_link(wq, &kq->kq_wqs, WAITQ_ALREADY_LOCKED, reserved_link); if (kr == KERN_SUCCESS) { - knote_markstayqueued(kn); + knote_markstayactive(kn); return (0); } else { return (EINVAL); @@ -2964,11 +4587,11 @@ knote_link_waitq(struct knote *kn, struct waitq *wq, uint64_t *reserved_link) int knote_unlink_waitq(struct knote *kn, struct waitq *wq) { - struct kqueue *kq = kn->kn_kq; + struct kqueue *kq = knote_get_kq(kn); kern_return_t kr; - kr = waitq_unlink(wq, kq->kq_wqs); - knote_clearstayqueued(kn); + kr = waitq_unlink(wq, &kq->kq_wqs); + knote_clearstayactive(kn); return ((kr != KERN_SUCCESS) ? 
EINVAL : 0); } @@ -2982,49 +4605,90 @@ knote_unlink_waitq(struct knote *kn, struct waitq *wq) * It returns the same way, but may drop it temporarily. */ void -knote_fdclose(struct proc *p, int fd) +knote_fdclose(struct proc *p, int fd, int force) { - struct filedesc *fdp = p->p_fd; struct klist *list; struct knote *kn; - list = &fdp->fd_knlist[fd]; - while ((kn = SLIST_FIRST(list)) != NULL) { - struct kqueue *kq = kn->kn_kq; +restart: + list = &p->p_fd->fd_knlist[fd]; + SLIST_FOREACH(kn, list, kn_link) { + struct kqueue *kq = knote_get_kq(kn); + + kqlock(kq); if (kq->kq_p != p) panic("%s: proc mismatch (kq->kq_p=%p != p=%p)", __func__, kq->kq_p, p); - kqlock(kq); + /* + * If the knote supports EV_VANISHED delivery, + * transition it to vanished mode (or skip over + * it if already vanished). + */ + if (!force && (kn->kn_status & KN_REQVANISH)) { + + if ((kn->kn_status & KN_VANISHED) == 0) { + proc_fdunlock(p); + + /* get detach reference (also marks vanished) */ + if (kqlock2knotedetach(kq, kn)) { + + /* detach knote and drop fp use reference */ + knote_fops(kn)->f_detach(kn); + if (knote_fops(kn)->f_isfd) + fp_drop(p, kn->kn_id, kn->kn_fp, 0); + + /* activate it if it's still in existence */ + if (knoteuse2kqlock(kq, kn, 0)) { + knote_activate(kn); + } + kqunlock(kq); + } + proc_fdlock(p); + goto restart; + } else { + kqunlock(kq); + continue; + } + } + proc_fdunlock(p); /* - * Convert the lock to a drop ref. + * Convert the kq lock to a drop ref. * If we get it, go ahead and drop it. - * Otherwise, we waited for it to - * be dropped by the other guy, so - * it is safe to move on in the list. + * Otherwise, we waited for the blocking + * condition to complete. Either way, + * we dropped the fdlock so start over. 
*/ if (kqlock2knotedrop(kq, kn)) { - kn->kn_fop->f_detach(kn); knote_drop(kn, p); } proc_fdlock(p); - - /* the fd tables may have changed - start over */ - list = &fdp->fd_knlist[fd]; + goto restart; } } -/* proc_fdlock held on entry (and exit) */ +/* + * knote_fdadd - Add knote to the fd table for process + * + * All file-based filters associate a list of knotes by file + * descriptor index. All other filters hash the knote by ident. + * + * May have to grow the table of knote lists to cover the + * file descriptor index presented. + * + * proc_fdlock held on entry (and exit) + */ static int -knote_fdpattach(struct knote *kn, struct filedesc *fdp, struct proc *p) +knote_fdadd(struct knote *kn, struct proc *p) { + struct filedesc *fdp = p->p_fd; struct klist *list = NULL; - if (! kn->kn_fop->f_isfd) { + if (! knote_fops(kn)->f_isfd) { if (fdp->fd_knhashmask == 0) fdp->fd_knhash = hashinit(CONFIG_KN_HASHSIZE, M_KQUEUE, &fdp->fd_knhashmask); @@ -3065,41 +4729,142 @@ knote_fdpattach(struct knote *kn, struct filedesc *fdp, struct proc *p) return (0); } +/* + * knote_fdremove - remove a knote from the fd table for process + * + * If the filter is file-based, remove based on fd index. + * Otherwise remove from the hash based on the ident. + * + * proc_fdlock held on entry (and exit) + */ +static void +knote_fdremove(struct knote *kn, struct proc *p) +{ + struct filedesc *fdp = p->p_fd; + struct klist *list = NULL; + + if (knote_fops(kn)->f_isfd) { + assert ((u_int)fdp->fd_knlistsize > kn->kn_id); + list = &fdp->fd_knlist[kn->kn_id]; + } else { + list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)]; + } + SLIST_REMOVE(list, kn, knote, kn_link); +} + +/* + * knote_fdfind - lookup a knote in the fd table for process + * + * If the filter is file-based, lookup based on fd index. + * Otherwise use a hash based on the ident. + * + * Matching is based on kq, filter, and ident. 
Optionally, + * it may also be based on the udata field in the kevent - + * allowing multiple event registration for the file object + * per kqueue. + * + * proc_fdlock held on entry (and exit) + */ +static struct knote * +knote_fdfind(struct kqueue *kq, + struct kevent_internal_s *kev, + struct proc *p) +{ + struct filedesc *fdp = p->p_fd; + struct klist *list = NULL; + struct knote *kn = NULL; + struct filterops *fops; + + fops = sysfilt_ops[~kev->filter]; /* to 0-base index */ + + /* + * determine where to look for the knote + */ + if (fops->f_isfd) { + /* fd-based knotes are linked off the fd table */ + if (kev->ident < (u_int)fdp->fd_knlistsize) { + list = &fdp->fd_knlist[kev->ident]; + } + } else if (fdp->fd_knhashmask != 0) { + /* hash non-fd knotes here too */ + list = &fdp->fd_knhash[KN_HASH((u_long)kev->ident, fdp->fd_knhashmask)]; + } + /* + * scan the selected list looking for a match + */ + if (list != NULL) { + SLIST_FOREACH(kn, list, kn_link) { + if (kq == knote_get_kq(kn) && + kev->ident == kn->kn_id && + kev->filter == kn->kn_filter) { + if (kev->flags & EV_UDATA_SPECIFIC) { + if ((kn->kn_status & KN_UDATA_SPECIFIC) && + kev->udata == kn->kn_udata) { + break; /* matching udata-specific knote */ + } + } else if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0) { + break; /* matching non-udata-specific knote */ + } + } + } + } + return kn; +} /* + * knote_drop - disconnect and drop the knote + * + * Called with the kqueue unlocked and holding a + * "drop reference" on the knote in question. + * This reference is most often aquired thru a call + * to kqlock2knotedrop(). But it can also be acquired + * through stealing a drop reference via a call to + * knoteuse2knotedrop() or during the initial attach + * of the knote. + * + * The knote may have already been detached from + * (or not yet attached to) its source object. + * * should be called at spl == 0, since we don't want to hold spl * while calling fdrop and free. 
*/ static void knote_drop(struct knote *kn, __unused struct proc *ctxp) { - struct kqueue *kq = kn->kn_kq; + struct kqueue *kq = knote_get_kq(kn); struct proc *p = kq->kq_p; - struct filedesc *fdp = p->p_fd; - struct klist *list; int needswakeup; + /* We have to have a dropping reference on the knote */ + assert(kn->kn_status & KN_DROPPING); + + /* If we are attached, disconnect from the source first */ + if (kn->kn_status & KN_ATTACHED) { + knote_fops(kn)->f_detach(kn); + } + proc_fdlock(p); - if (kn->kn_fop->f_isfd) - list = &fdp->fd_knlist[kn->kn_id]; - else - list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)]; - SLIST_REMOVE(list, kn, knote, kn_link); + /* Remove the source from the appropriate hash */ + knote_fdremove(kn, p); + + /* trade fdlock for kq lock */ kqlock(kq); - knote_dequeue(kn); + proc_fdunlock(p); + + /* determine if anyone needs to know about the drop */ + assert((kn->kn_status & (KN_SUPPRESSED | KN_QUEUED)) == 0); needswakeup = (kn->kn_status & KN_USEWAIT); kqunlock(kq); - proc_fdunlock(p); if (needswakeup) - waitq_wakeup64_all((struct waitq *)kq->kq_wqs, + waitq_wakeup64_all((struct waitq *)&kq->kq_wqs, CAST_EVENT64_T(&kn->kn_status), - THREAD_AWAKENED, + THREAD_RESTART, WAITQ_ALL_PRIORITIES); - if (kn->kn_fop->f_isfd) + if (knote_fops(kn)->f_isfd && ((kn->kn_status & KN_VANISHED) == 0)) fp_drop(p, kn->kn_id, kn->kn_fp, 0); knote_free(kn); @@ -3107,19 +4872,14 @@ knote_drop(struct knote *kn, __unused struct proc *ctxp) /* called with kqueue lock held */ static void -knote_activate(struct knote *kn, int force) +knote_activate(struct knote *kn) { - struct kqueue *kq = kn->kn_kq; - - if (!force && (kn->kn_status & KN_ACTIVE)) + if (kn->kn_status & KN_ACTIVE) return; kn->kn_status |= KN_ACTIVE; - knote_enqueue(kn); - kqueue_wakeup(kq, 0); - - /* wake up the parent kq, too */ - KNOTE(&kq->kq_sel.si_note, 0); + if (knote_enqueue(kn)) + knote_wakeup(kn); } /* called with kqueue lock held */ @@ -3127,45 +4887,118 @@ static void 
knote_deactivate(struct knote *kn) { kn->kn_status &= ~KN_ACTIVE; + if ((kn->kn_status & KN_STAYACTIVE) == 0) + knote_dequeue(kn); +} + +/* called with kqueue lock held */ +static void +knote_enable(struct knote *kn) +{ + if ((kn->kn_status & KN_DISABLED) == 0) + return; + + kn->kn_status &= ~KN_DISABLED; + if (knote_enqueue(kn)) + knote_wakeup(kn); +} + +/* called with kqueue lock held */ +static void +knote_disable(struct knote *kn) +{ + if (kn->kn_status & KN_DISABLED) + return; + + kn->kn_status |= KN_DISABLED; + knote_dequeue(kn); +} + +/* called with kqueue lock held */ +static void +knote_suppress(struct knote *kn) +{ + struct kqtailq *suppressq; + + if (kn->kn_status & KN_SUPPRESSED) + return; + knote_dequeue(kn); + kn->kn_status |= KN_SUPPRESSED; + suppressq = knote_get_suppressed_queue(kn); + TAILQ_INSERT_TAIL(suppressq, kn, kn_tqe); } /* called with kqueue lock held */ static void +knote_unsuppress(struct knote *kn) +{ + struct kqtailq *suppressq; + + if ((kn->kn_status & KN_SUPPRESSED) == 0) + return; + + kn->kn_status &= ~KN_SUPPRESSED; + suppressq = knote_get_suppressed_queue(kn); + TAILQ_REMOVE(suppressq, kn, kn_tqe); + + /* update in-use qos to equal requested qos */ + kn->kn_qos_index = kn->kn_req_index; + + /* don't wakeup if unsuppressing just a stay-active knote */ + if (knote_enqueue(kn) && + (kn->kn_status & KN_ACTIVE)) + knote_wakeup(kn); +} + +/* called with kqueue lock held */ +static int knote_enqueue(struct knote *kn) { - if ((kn->kn_status & (KN_QUEUED | KN_STAYQUEUED)) == KN_STAYQUEUED || - (kn->kn_status & (KN_QUEUED | KN_STAYQUEUED | KN_DISABLED)) == 0) { - struct kqtailq *tq = kn->kn_tq; - struct kqueue *kq = kn->kn_kq; + if ((kn->kn_status & (KN_ACTIVE | KN_STAYACTIVE)) == 0 || + (kn->kn_status & (KN_DISABLED | KN_SUPPRESSED | KN_DROPPING))) + return 0; - TAILQ_INSERT_TAIL(tq, kn, kn_tqe); + if ((kn->kn_status & KN_QUEUED) == 0) { + struct kqtailq *queue = knote_get_queue(kn); + struct kqueue *kq = knote_get_kq(kn); + +
TAILQ_INSERT_TAIL(queue, kn, kn_tqe); kn->kn_status |= KN_QUEUED; kq->kq_count++; + return 1; } + return ((kn->kn_status & KN_STAYACTIVE) != 0); } + /* called with kqueue lock held */ static void knote_dequeue(struct knote *kn) { - struct kqueue *kq = kn->kn_kq; + struct kqueue *kq = knote_get_kq(kn); + struct kqtailq *queue; - if ((kn->kn_status & (KN_QUEUED | KN_STAYQUEUED)) == KN_QUEUED) { - struct kqtailq *tq = kn->kn_tq; + if ((kn->kn_status & KN_QUEUED) == 0) + return; - TAILQ_REMOVE(tq, kn, kn_tqe); - kn->kn_tq = &kq->kq_head; - kn->kn_status &= ~KN_QUEUED; - kq->kq_count--; - } + queue = knote_get_queue(kn); + TAILQ_REMOVE(queue, kn, kn_tqe); + kn->kn_status &= ~KN_QUEUED; + kq->kq_count--; } void knote_init(void) { knote_zone = zinit(sizeof(struct knote), 8192*sizeof(struct knote), - 8192, "knote zone"); + 8192, "knote zone"); + + kqfile_zone = zinit(sizeof(struct kqfile), 8192*sizeof(struct kqfile), + 8192, "kqueue file zone"); + + kqworkq_zone = zinit(sizeof(struct kqworkq), 8192*sizeof(struct kqworkq), + 8192, "kqueue workq zone"); /* allocate kq lock group attribute and group */ kq_lck_grp_attr = lck_grp_attr_alloc_init(); @@ -3178,10 +5011,8 @@ knote_init(void) /* Initialize the timer filter lock */ lck_mtx_init(&_filt_timerlock, kq_lck_grp, kq_lck_attr); -#if VM_PRESSURE_EVENTS - /* Initialize the vm pressure list lock */ - vm_pressure_init(kq_lck_grp, kq_lck_attr); -#endif + /* Initialize the user filter lock */ + lck_spin_init(&_filt_userlock, kq_lck_grp, kq_lck_attr); #if CONFIG_MEMORYSTATUS /* Initialize the memorystatus list lock */ @@ -3190,6 +5021,12 @@ knote_init(void) } SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL) +struct filterops * +knote_fops(struct knote *kn) +{ + return sysfilt_ops[kn->kn_filtid]; +} + static struct knote * knote_alloc(void) { @@ -3640,7 +5477,7 @@ kev_post_msg(struct kev_msg *event_msg) * unsafe to use "m2" */ so_inc_recv_data_stat(ev_pcb->evp_socket, - 1, m->m_len, SO_TC_BE); + 1, m->m_len, 
MBUF_TC_BE); sorwakeup(ev_pcb->evp_socket); OSIncrementAtomic64((SInt64 *)&kevtstat.kes_posted); @@ -3857,38 +5694,43 @@ fill_kqueueinfo(struct kqueue *kq, struct kqueue_info * kinfo) void -knote_markstayqueued(struct knote *kn) +knote_markstayactive(struct knote *kn) { - kqlock(kn->kn_kq); - kn->kn_status |= KN_STAYQUEUED; - knote_enqueue(kn); - kqunlock(kn->kn_kq); + kqlock(knote_get_kq(kn)); + kn->kn_status |= KN_STAYACTIVE; + + /* handle all stayactive knotes on the manager */ + if (knote_get_kq(kn)->kq_state & KQ_WORKQ) + knote_set_qos_index(kn, KQWQ_QOS_MANAGER); + + knote_activate(kn); + kqunlock(knote_get_kq(kn)); } void -knote_clearstayqueued(struct knote *kn) +knote_clearstayactive(struct knote *kn) { - kqlock(kn->kn_kq); - kn->kn_status &= ~KN_STAYQUEUED; - knote_dequeue(kn); - kqunlock(kn->kn_kq); + kqlock(knote_get_kq(kn)); + kn->kn_status &= ~KN_STAYACTIVE; + knote_deactivate(kn); + kqunlock(knote_get_kq(kn)); } static unsigned long kevent_extinfo_emit(struct kqueue *kq, struct knote *kn, struct kevent_extinfo *buf, unsigned long buflen, unsigned long nknotes) { - struct kevent_qos_s kevqos; struct kevent_internal_s *kevp; for (; kn; kn = SLIST_NEXT(kn, kn_link)) { - if (kq == kn->kn_kq) { + if (kq == knote_get_kq(kn)) { if (nknotes < buflen) { struct kevent_extinfo *info = &buf[nknotes]; + struct kevent_qos_s kevqos; kqlock(kq); - bzero(&kevqos, sizeof(kevqos)); kevp = &(kn->kn_kevent); + bzero(&kevqos, sizeof(kevqos)); kevqos.ident = kevp->ident; kevqos.filter = kevp->filter; kevqos.flags = kevp->flags; @@ -3900,10 +5742,7 @@ kevent_extinfo_emit(struct kqueue *kq, struct knote *kn, struct kevent_extinfo * memcpy(&info->kqext_kev, &kevqos, sizeof(info->kqext_kev)); info->kqext_sdata = kn->kn_sdata; - - /* status flags exported to userspace/libproc */ -#define KQEXT_STATUS_MASK (KN_ACTIVE|KN_QUEUED|KN_DISABLED|KN_STAYQUEUED) - info->kqext_status = kn->kn_status & KQEXT_STATUS_MASK; + info->kqext_status = kn->kn_status; info->kqext_sfflags = 
kn->kn_sfflags; kqunlock(kq); @@ -3929,6 +5768,9 @@ pid_kqueue_extinfo(proc_t p, struct kqueue *kq, user_addr_t ubuf, unsigned long buflen = bufsize / sizeof(struct kevent_extinfo); struct kevent_extinfo *kqext = NULL; + /* arbitrary upper limit to cap kernel memory usage, copyout size, etc. */ + buflen = min(buflen, PROC_PIDFDKQUEUE_KNOTES_MAX); + kqext = kalloc(buflen * sizeof(struct kevent_extinfo)); if (kqext == NULL) { err = ENOMEM; @@ -3961,7 +5803,59 @@ pid_kqueue_extinfo(proc_t p, struct kqueue *kq, user_addr_t ubuf, kqext = NULL; } - if (!err) - *retval = nknotes; + if (!err) { + *retval = min(nknotes, PROC_PIDFDKQUEUE_KNOTES_MAX); + } return err; } + +static unsigned long +kevent_udatainfo_emit(struct kqueue *kq, struct knote *kn, uint64_t *buf, + unsigned long buflen, unsigned long nknotes) +{ + struct kevent_internal_s *kevp; + for (; kn; kn = SLIST_NEXT(kn, kn_link)) { + if (kq == knote_get_kq(kn)) { + if (nknotes < buflen) { + kqlock(kq); + kevp = &(kn->kn_kevent); + buf[nknotes] = kevp->udata; + kqunlock(kq); + } + + /* we return total number of knotes, which may be more than requested */ + nknotes++; + } + } + + return nknotes; +} + +int +pid_kqueue_udatainfo(proc_t p, struct kqueue *kq, uint64_t *buf, + uint32_t bufsize) +{ + struct knote *kn; + int i; + struct filedesc *fdp = p->p_fd; + unsigned long nknotes = 0; + unsigned long buflen = bufsize / sizeof(uint64_t); + + proc_fdlock(p); + + for (i = 0; i < fdp->fd_knlistsize; i++) { + kn = SLIST_FIRST(&fdp->fd_knlist[i]); + nknotes = kevent_udatainfo_emit(kq, kn, buf, buflen, nknotes); + } + + if (fdp->fd_knhashmask != 0) { + for (i = 0; i < (int)fdp->fd_knhashmask + 1; i++) { + kn = SLIST_FIRST(&fdp->fd_knhash[i]); + nknotes = kevent_udatainfo_emit(kq, kn, buf, buflen, nknotes); + } + } + + proc_fdunlock(p); + return (int)nknotes; +} + diff --git a/bsd/kern/kern_exec.c b/bsd/kern/kern_exec.c index 20b1f0317..295c001c5 100644 --- a/bsd/kern/kern_exec.c +++ b/bsd/kern/kern_exec.c @@ -1,5 +1,5 @@ /* - 
* Copyright (c) 2000-2011 Apple Inc. All rights reserved. + * Copyright (c) 2000-2013 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -103,6 +103,7 @@ #include #include #include +#include #if SYSV_SHM #include /* shmexec() */ #endif @@ -111,6 +112,7 @@ #include #include #include +#include #include #include @@ -133,6 +135,7 @@ #include #include #include +#include #include #if CONFIG_MACF @@ -159,10 +162,8 @@ #if CONFIG_DTRACE /* Do not include dtrace.h, it redefines kmem_[alloc/free] */ -extern void (*dtrace_fasttrap_exec_ptr)(proc_t); +extern void dtrace_proc_exec(proc_t); extern void (*dtrace_proc_waitfor_exec_ptr)(proc_t); -extern void (*dtrace_helpers_cleanup)(proc_t); -extern void dtrace_lazy_dofs_destroy(proc_t); /* * Since dtrace_proc_waitfor_exec_ptr can be added/removed in dtrace_subr.c, @@ -197,7 +198,6 @@ void task_importance_update_owner_info(task_t); #endif extern struct savearea *get_user_regs(thread_t); -extern kern_return_t machine_thread_neon_state_initialize(thread_t thread); __attribute__((noinline)) int __EXEC_WAITING_ON_TASKGATED_CODE_SIGNATURE_UPCALL__(mach_port_t task_access_port, int32_t new_pid); @@ -242,7 +242,7 @@ static int execargs_alloc(struct image_params *imgp); static int execargs_free(struct image_params *imgp); static int exec_check_permissions(struct image_params *imgp); static int exec_extract_strings(struct image_params *imgp); -static int exec_add_apple_strings(struct image_params *imgp); +static int exec_add_apple_strings(struct image_params *imgp, const load_result_t *load_result); static int exec_handle_sugid(struct image_params *imgp); static int sugid_scripts = 0; SYSCTL_INT (_kern, OID_AUTO, sugid_scripts, CTLFLAG_RW | CTLFLAG_LOCKED, &sugid_scripts, 0, ""); @@ -251,7 +251,7 @@ static int copyoutptr(user_addr_t ua, user_addr_t ptr, int ptr_size); static void exec_resettextvp(proc_t, struct image_params *); static int check_for_signature(proc_t, struct image_params *); static void 
exec_prefault_data(proc_t, struct image_params *, load_result_t *); -static errno_t exec_handle_port_actions(struct image_params *imgp, short psa_flags, boolean_t * portwatch_present, ipc_port_t * portwatch_ports); +static errno_t exec_handle_port_actions(struct image_params *imgp, boolean_t * portwatch_present, ipc_port_t * portwatch_ports); static errno_t exec_handle_spawnattr_policy(proc_t p, int psa_apptype, uint64_t psa_qos_clamp, uint64_t psa_darwin_role, ipc_port_t * portwatch_ports, int portwatch_count); @@ -698,16 +698,24 @@ exec_fat_imgact(struct image_params *imgp) } static int -activate_thread_state(thread_t thread, load_result_t *result) +activate_exec_state(task_t task, proc_t p, thread_t thread, load_result_t *result) { int ret; + task_set_dyld_info(task, MACH_VM_MIN_ADDRESS, 0); + if (result->is64bit) { + task_set_64bit(task, TRUE); + OSBitOrAtomic(P_LP64, &p->p_flag); + } else { + task_set_64bit(task, FALSE); + OSBitAndAtomic(~((uint32_t)P_LP64), &p->p_flag); + } + ret = thread_state_initialize(thread); if (ret != KERN_SUCCESS) { return ret; } - if (result->threadstate) { uint32_t *ts = result->threadstate; uint32_t total_size = result->threadstate_sz; @@ -731,6 +739,31 @@ activate_thread_state(thread_t thread, load_result_t *result) } +/* + * Set p->p_comm and p->p_name to the name passed to exec + */ +static void +set_proc_name(struct image_params *imgp, proc_t p) +{ + int p_name_len = sizeof(p->p_name) - 1; + + if (imgp->ip_ndp->ni_cnd.cn_namelen > p_name_len) { + imgp->ip_ndp->ni_cnd.cn_namelen = p_name_len; + } + + bcopy((caddr_t)imgp->ip_ndp->ni_cnd.cn_nameptr, (caddr_t)p->p_name, + (unsigned)imgp->ip_ndp->ni_cnd.cn_namelen); + p->p_name[imgp->ip_ndp->ni_cnd.cn_namelen] = '\0'; + + if (imgp->ip_ndp->ni_cnd.cn_namelen > MAXCOMLEN) { + imgp->ip_ndp->ni_cnd.cn_namelen = MAXCOMLEN; + } + + bcopy((caddr_t)imgp->ip_ndp->ni_cnd.cn_nameptr, (caddr_t)p->p_comm, + (unsigned)imgp->ip_ndp->ni_cnd.cn_namelen); + p->p_comm[imgp->ip_ndp->ni_cnd.cn_namelen] 
= '\0'; +} + /* * exec_mach_imgact * @@ -771,7 +804,7 @@ exec_mach_imgact(struct image_params *imgp) struct _posix_spawnattr *psa = NULL; int spawn = (imgp->ip_flags & IMGPF_SPAWN); int vfexec = (imgp->ip_flags & IMGPF_VFORK_EXEC); - int p_name_len; + os_reason_t exec_failure_reason = OS_REASON_NULL; /* * make sure it's a Mach-O 1.0 or Mach-O 2.0 binary; the difference @@ -851,10 +884,6 @@ exec_mach_imgact(struct image_params *imgp) if (error) goto bad; - error = exec_add_apple_strings(imgp); - if (error) - goto bad; - AUDIT_ARG(argv, imgp->ip_startargv, imgp->ip_argc, imgp->ip_endargv - imgp->ip_startargv); AUDIT_ARG(envv, imgp->ip_endargv, imgp->ip_envc, @@ -885,19 +914,6 @@ exec_mach_imgact(struct image_params *imgp) map = VM_MAP_NULL; } - /* - * We set these flags here; this is OK, since if we fail after - * this point, we have already destroyed the parent process anyway. - */ - task_set_dyld_info(task, MACH_VM_MIN_ADDRESS, 0); - if (imgp->ip_flags & IMGPF_IS_64BIT) { - task_set_64bit(task, TRUE); - OSBitOrAtomic(P_LP64, &p->p_flag); - } else { - task_set_64bit(task, FALSE); - OSBitAndAtomic(~((uint32_t)P_LP64), &p->p_flag); - } - /* * Load the Mach-O file. * @@ -914,9 +930,22 @@ exec_mach_imgact(struct image_params *imgp) * Actually load the image file we previously decided to load. 
*/ lret = load_machfile(imgp, mach_header, thread, &map, &load_result); - if (lret != LOAD_SUCCESS) { error = load_return_to_errno(lret); + + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) | DBG_FUNC_NONE, + p->p_pid, OS_REASON_EXEC, EXEC_EXIT_REASON_BAD_MACHO, 0, 0); + if (lret == LOAD_BADMACHO_UPX) { + /* set anything that might be useful in the crash report */ + set_proc_name(imgp, p); + + exec_failure_reason = os_reason_create(OS_REASON_EXEC, EXEC_EXIT_REASON_UPX); + exec_failure_reason->osr_flags |= OS_REASON_FLAG_GENERATE_CRASH_REPORT; + exec_failure_reason->osr_flags |= OS_REASON_FLAG_CONSISTENT_FAILURE; + } else { + exec_failure_reason = os_reason_create(OS_REASON_EXEC, EXEC_EXIT_REASON_BAD_MACHO); + } + goto badtoolate; } @@ -933,8 +962,10 @@ exec_mach_imgact(struct image_params *imgp) */ if (load_result.csflags & CS_VALID) { imgp->ip_csflags |= load_result.csflags & - (CS_VALID| - CS_HARD|CS_KILL|CS_RESTRICT|CS_ENFORCEMENT|CS_REQUIRE_LV|CS_DYLD_PLATFORM| + (CS_VALID|CS_SIGNED| + CS_HARD|CS_KILL|CS_RESTRICT|CS_ENFORCEMENT|CS_REQUIRE_LV| + CS_ENTITLEMENTS_VALIDATED|CS_DYLD_PLATFORM| + CS_ENTITLEMENT_FLAGS| CS_EXEC_SET_HARD|CS_EXEC_SET_KILL|CS_EXEC_SET_ENFORCEMENT); } else { imgp->ip_csflags &= ~CS_VALID; @@ -952,7 +983,7 @@ exec_mach_imgact(struct image_params *imgp) /* * Set up the system reserved areas in the new address space. */ - vm_map_exec(map, task, (void *)p->p_fd->fd_rdir, cpu_type()); + vm_map_exec(map, task, load_result.is64bit, (void *)p->p_fd->fd_rdir, cpu_type()); /* * Close file descriptors which specify close-on-exec. 
@@ -967,6 +998,10 @@ exec_mach_imgact(struct image_params *imgp) if (spawn || !vfexec) { vm_map_deallocate(map); } + + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) | DBG_FUNC_NONE, + p->p_pid, OS_REASON_EXEC, EXEC_EXIT_REASON_SUGID_FAILURE, 0, 0); + exec_failure_reason = os_reason_create(OS_REASON_EXEC, EXEC_EXIT_REASON_SUGID_FAILURE); goto badtoolate; } @@ -982,8 +1017,12 @@ exec_mach_imgact(struct image_params *imgp) vm_map_deallocate(old_map); } - lret = activate_thread_state(thread, &load_result); + lret = activate_exec_state(task, p, thread, &load_result); if (lret != KERN_SUCCESS) { + + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) | DBG_FUNC_NONE, + p->p_pid, OS_REASON_EXEC, EXEC_EXIT_REASON_ACTV_THREADSTATE, 0, 0); + exec_failure_reason = os_reason_create(OS_REASON_EXEC, EXEC_EXIT_REASON_ACTV_THREADSTATE); goto badtoolate; } @@ -1002,6 +1041,19 @@ exec_mach_imgact(struct image_params *imgp) &load_result, p) != KERN_SUCCESS) { error = load_return_to_errno(LOAD_NOSPACE); + + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) | DBG_FUNC_NONE, + p->p_pid, OS_REASON_EXEC, EXEC_EXIT_REASON_STACK_ALLOC, 0, 0); + exec_failure_reason = os_reason_create(OS_REASON_EXEC, EXEC_EXIT_REASON_STACK_ALLOC); + goto badtoolate; + } + + error = exec_add_apple_strings(imgp, &load_result); + if (error) { + + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) | DBG_FUNC_NONE, + p->p_pid, OS_REASON_EXEC, EXEC_EXIT_REASON_APPLE_STRING_INIT, 0, 0); + exec_failure_reason = os_reason_create(OS_REASON_EXEC, EXEC_EXIT_REASON_APPLE_STRING_INIT); goto badtoolate; } @@ -1021,6 +1073,10 @@ exec_mach_imgact(struct image_params *imgp) if (error) { if (vfexec || spawn) vm_map_switch(old_map); + + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) | DBG_FUNC_NONE, + p->p_pid, OS_REASON_EXEC, EXEC_EXIT_REASON_COPYOUT_STRINGS, 0, 0); + exec_failure_reason = 
os_reason_create(OS_REASON_EXEC, EXEC_EXIT_REASON_COPYOUT_STRINGS); goto badtoolate; } /* Set the stack */ @@ -1038,6 +1094,10 @@ exec_mach_imgact(struct image_params *imgp) if (error) { if (vfexec || spawn) vm_map_switch(old_map); + + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) | DBG_FUNC_NONE, + p->p_pid, OS_REASON_EXEC, EXEC_EXIT_REASON_COPYOUT_DYNLINKER, 0, 0); + exec_failure_reason = os_reason_create(OS_REASON_EXEC, EXEC_EXIT_REASON_COPYOUT_DYNLINKER); goto badtoolate; } task_set_dyld_info(task, load_result.all_image_info_addr, @@ -1080,21 +1140,32 @@ exec_mach_imgact(struct image_params *imgp) */ p->p_acflag &= ~AFORK; - /* - * Set p->p_comm and p->p_name to the name passed to exec - */ - p_name_len = sizeof(p->p_name) - 1; - if(imgp->ip_ndp->ni_cnd.cn_namelen > p_name_len) - imgp->ip_ndp->ni_cnd.cn_namelen = p_name_len; - bcopy((caddr_t)imgp->ip_ndp->ni_cnd.cn_nameptr, (caddr_t)p->p_name, - (unsigned)imgp->ip_ndp->ni_cnd.cn_namelen); - p->p_name[imgp->ip_ndp->ni_cnd.cn_namelen] = '\0'; - - if (imgp->ip_ndp->ni_cnd.cn_namelen > MAXCOMLEN) - imgp->ip_ndp->ni_cnd.cn_namelen = MAXCOMLEN; - bcopy((caddr_t)imgp->ip_ndp->ni_cnd.cn_nameptr, (caddr_t)p->p_comm, - (unsigned)imgp->ip_ndp->ni_cnd.cn_namelen); - p->p_comm[imgp->ip_ndp->ni_cnd.cn_namelen] = '\0'; + set_proc_name(imgp, p); + +#if CONFIG_SECLUDED_MEMORY + if (secluded_for_apps) { + if (strncmp(p->p_name, + "Camera", + sizeof (p->p_name)) == 0 || +#if 00 + strncmp(p->p_name, + "camerad", + sizeof (p->p_name)) == 0 || +#endif + strncmp(p->p_name, + "testCamera", + sizeof (p->p_name)) == 0) { + task_set_could_use_secluded_mem(p->task, TRUE); + } else { + task_set_could_use_secluded_mem(p->task, FALSE); + } + if (strncmp(p->p_name, + "mediaserverd", + sizeof (p->p_name)) == 0) { + task_set_could_also_use_secluded_mem(p->task, TRUE); + } + } +#endif /* CONFIG_SECLUDED_MEMORY */ pal_dbg_set_task_name( p->task ); @@ -1107,41 +1178,8 @@ exec_mach_imgact(struct image_params *imgp) 
memcpy(&p->p_uuid[0], &load_result.uuid[0], sizeof(p->p_uuid)); -// dtrace code cleanup needed #if CONFIG_DTRACE - /* - * Invalidate any predicate evaluation already cached for this thread by DTrace. - * That's because we've just stored to p_comm and DTrace refers to that when it - * evaluates the "execname" special variable. uid and gid may have changed as well. - */ - dtrace_set_thread_predcache(current_thread(), 0); - - /* - * Free any outstanding lazy dof entries. It is imperative we - * always call dtrace_lazy_dofs_destroy, rather than null check - * and call if !NULL. If we NULL test, during lazy dof faulting - * we can race with the faulting code and proceed from here to - * beyond the helpers cleanup. The lazy dof faulting will then - * install new helpers which no longer belong to this process! - */ - dtrace_lazy_dofs_destroy(p); - - - /* - * Clean up any DTrace helpers for the process. - */ - if (p->p_dtrace_helpers != NULL && dtrace_helpers_cleanup) { - (*dtrace_helpers_cleanup)(p); - } - - /* - * Cleanup the DTrace provider associated with this process. 
- */ - proc_lock(p); - if (p->p_dtrace_probes && dtrace_fasttrap_exec_ptr) { - (*dtrace_fasttrap_exec_ptr)(p); - } - proc_unlock(p); + dtrace_proc_exec(p); #endif if (kdebug_enable) { @@ -1206,13 +1244,20 @@ exec_mach_imgact(struct image_params *imgp) /* Don't allow child process to execute any instructions */ if (!spawn) { if (vfexec) { - psignal_vfork(p, new_task, thread, SIGKILL); + assert(exec_failure_reason != OS_REASON_NULL); + psignal_vfork_with_reason(p, new_task, thread, SIGKILL, exec_failure_reason); + exec_failure_reason = OS_REASON_NULL; } else { - psignal(p, SIGKILL); + assert(exec_failure_reason != OS_REASON_NULL); + psignal_with_reason(p, SIGKILL, exec_failure_reason); + exec_failure_reason = OS_REASON_NULL; } /* We can't stop this system call at this point, so just pretend we succeeded */ error = 0; + } else { + os_reason_free(exec_failure_reason); + exec_failure_reason = OS_REASON_NULL; } done: @@ -1234,6 +1279,8 @@ exec_mach_imgact(struct image_params *imgp) } bad: + /* If we hit this, we likely would have leaked an exit reason */ + assert(exec_failure_reason == OS_REASON_NULL); return(error); } @@ -1547,14 +1594,14 @@ exec_handle_spawnattr_policy(proc_t p, int psa_apptype, uint64_t psa_qos_clamp, * and/or audit_session_spawnjoin for the current task. 
* * Parameters: struct image_params * Image parameter block - * short psa_flags posix spawn attribute flags * * Returns: 0 Success * EINVAL Failure * ENOTSUP Illegal posix_spawn attr flag was set */ static errno_t -exec_handle_port_actions(struct image_params *imgp, short psa_flags, boolean_t * portwatch_present, ipc_port_t * portwatch_ports) +exec_handle_port_actions(struct image_params *imgp, boolean_t * portwatch_present, + ipc_port_t * portwatch_ports) { _posix_spawn_port_actions_t pacts = imgp->ip_px_spa; proc_t p = vfs_context_proc(imgp->ip_vfs_context); @@ -1563,36 +1610,39 @@ exec_handle_port_actions(struct image_params *imgp, short psa_flags, boolean_t * ipc_port_t port = NULL; errno_t ret = 0; int i; + kern_return_t kr; *portwatch_present = FALSE; for (i = 0; i < pacts->pspa_count; i++) { act = &pacts->pspa_actions[i]; - if (ipc_object_copyin(get_task_ipcspace(current_task()), - act->new_port, MACH_MSG_TYPE_COPY_SEND, - (ipc_object_t *) &port) != KERN_SUCCESS) { - ret = EINVAL; - goto done; + if (MACH_PORT_VALID(act->new_port)) { + kr = ipc_object_copyin(get_task_ipcspace(current_task()), + act->new_port, MACH_MSG_TYPE_COPY_SEND, + (ipc_object_t *) &port); + + if (kr != KERN_SUCCESS) { + ret = EINVAL; + goto done; + } + } else { + /* it's NULL or DEAD */ + port = CAST_MACH_NAME_TO_PORT(act->new_port); } switch (act->port_type) { case PSPA_SPECIAL: - /* Only allowed when not under vfork */ - if (!(psa_flags & POSIX_SPAWN_SETEXEC)) - ret = ENOTSUP; - else if (task_set_special_port(task, - act->which, port) != KERN_SUCCESS) + kr = task_set_special_port(task, act->which, port); + + if (kr != KERN_SUCCESS) ret = EINVAL; break; case PSPA_EXCEPTION: - /* Only allowed when not under vfork */ - if (!(psa_flags & POSIX_SPAWN_SETEXEC)) - ret = ENOTSUP; - else if (task_set_exception_ports(task, - act->mask, port, act->behavior, - act->flavor) != KERN_SUCCESS) + kr = task_set_exception_ports(task, act->mask, port, + act->behavior, act->flavor); + if (kr != 
KERN_SUCCESS) ret = EINVAL; break; #if CONFIG_AUDIT @@ -1601,22 +1651,22 @@ exec_handle_port_actions(struct image_params *imgp, short psa_flags, boolean_t * break; #endif case PSPA_IMP_WATCHPORTS: - if (portwatch_ports != NULL) { + if (portwatch_ports != NULL && IPC_PORT_VALID(port)) { *portwatch_present = TRUE; /* hold on to this till end of spawn */ portwatch_ports[i] = port; - ret = 0; - } else + } else { ipc_port_release_send(port); + } + break; default: ret = EINVAL; break; } - /* action failed, so release port resources */ - - if (ret) { + if (ret) { + /* action failed, so release port resources */ ipc_port_release_send(port); break; } @@ -2197,6 +2247,7 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval) imgp->ip_seg = (is_64 ? UIO_USERSPACE64 : UIO_USERSPACE32); imgp->ip_mac_return = 0; imgp->ip_px_persona = NULL; + imgp->ip_cs_error = OS_REASON_NULL; if (uap->adesc != USER_ADDR_NULL) { if(is_64) { @@ -2540,8 +2591,7 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval) portwatch_ports = NULL; } - if ((error = exec_handle_port_actions(imgp, - imgp->ip_px_sa != NULL ? px_sa.psa_flags : 0, &portwatch_present, portwatch_ports)) != 0) + if ((error = exec_handle_port_actions(imgp, &portwatch_present, portwatch_ports)) != 0) goto bad; if (portwatch_present == FALSE && portwatch_ports != NULL) { @@ -2578,18 +2628,35 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval) * image will take precedence over the spawn attributes * (re)setting them. * - * The use of p_ucred is safe, since we are acting on the - * new process, and it has no threads other than the one - * we are creating for it. + * Modifications to p_ucred must be guarded using the + * proc's ucred lock. This prevents others from accessing + * a garbage credential. 
*/ - if (px_sa.psa_flags & POSIX_SPAWN_RESETIDS) { - kauth_cred_t my_cred = p->p_ucred; + while (px_sa.psa_flags & POSIX_SPAWN_RESETIDS) { + kauth_cred_t my_cred = kauth_cred_proc_ref(p); kauth_cred_t my_new_cred = kauth_cred_setuidgid(my_cred, kauth_cred_getruid(my_cred), kauth_cred_getrgid(my_cred)); - if (my_new_cred != my_cred) { - p->p_ucred = my_new_cred; - /* update cred on proc */ - PROC_UPDATE_CREDS_ONPROC(p); + + if (my_cred == my_new_cred) { + kauth_cred_unref(&my_cred); + break; + } + + /* update cred on proc */ + proc_ucred_lock(p); + + if (p->p_ucred != my_cred) { + proc_ucred_unlock(p); + kauth_cred_unref(&my_new_cred); + continue; } + + /* donate cred reference on my_new_cred to p->p_ucred */ + p->p_ucred = my_new_cred; + PROC_UPDATE_CREDS_ONPROC(p); + proc_ucred_unlock(p); + + /* drop additional reference that was taken on the previous cred */ + kauth_cred_unref(&my_cred); } #if CONFIG_PERSONAS @@ -2712,9 +2779,9 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval) vec.sa_tramp = 0; vec.sa_mask = 0; vec.sa_flags = 0; - for (sig = 0; sig < NSIG; sig++) - if (px_sa.psa_sigdefault & (1 << sig)) { - error = setsigvec(p, child_thread, sig + 1, &vec, spawn_no_exec); + for (sig = 1; sig < NSIG; sig++) + if (px_sa.psa_sigdefault & (1 << (sig-1))) { + error = setsigvec(p, child_thread, sig, &vec, spawn_no_exec); } } @@ -2765,7 +2832,7 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval) } exec_resettextvp(p, imgp); -#if CONFIG_MEMORYSTATUS && CONFIG_JETSAM +#if CONFIG_MEMORYSTATUS /* Has jetsam attributes? 
*/ if (imgp->ip_px_sa != NULL && (px_sa.psa_jetsam_flags & POSIX_SPAWN_JETSAM_SET)) { /* @@ -2797,7 +2864,7 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval) } } -#endif /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM*/ +#endif /* CONFIG_MEMORYSTATUS */ } /* @@ -2901,6 +2968,10 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval) if (imgp != NULL && spawn_no_exec && (p->p_lflag & P_LTRACED)) { psignal_vfork(p, p->task, imgp->ip_new_thread, SIGTRAP); } + + if (error == 0 && !spawn_no_exec) + KDBG(BSDDBG_CODE(DBG_BSD_PROC,BSD_PROC_EXEC), + p->p_pid); } @@ -2926,6 +2997,10 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval) mac_cred_label_free(imgp->ip_execlabelp); if (imgp->ip_scriptlabelp) mac_vnode_label_free(imgp->ip_scriptlabelp); + if (imgp->ip_cs_error != OS_REASON_NULL) { + os_reason_free(imgp->ip_cs_error); + imgp->ip_cs_error = OS_REASON_NULL; + } #endif } @@ -3128,6 +3203,7 @@ __mac_execve(proc_t p, struct __mac_execve_args *uap, int32_t *retval) imgp->ip_flags = (is_64 ? IMGPF_WAS_64BIT : IMGPF_NONE) | ((p->p_flag & P_DISABLE_ASLR) ? IMGPF_DISABLE_ASLR : IMGPF_NONE); imgp->ip_seg = (is_64 ? UIO_USERSPACE64 : UIO_USERSPACE32); imgp->ip_mac_return = 0; + imgp->ip_cs_error = OS_REASON_NULL; uthread = get_bsdthread_info(current_thread()); if (uthread->uu_flag & UT_VFORK) { @@ -3168,6 +3244,10 @@ __mac_execve(proc_t p, struct __mac_execve_args *uap, int32_t *retval) if (imgp->ip_scriptlabelp) mac_vnode_label_free(imgp->ip_scriptlabelp); #endif + if (imgp->ip_cs_error != OS_REASON_NULL) { + os_reason_free(imgp->ip_cs_error); + imgp->ip_cs_error = OS_REASON_NULL; + } if (!error) { /* @@ -3754,47 +3834,6 @@ exec_extract_strings(struct image_params *imgp) return error; } -static char * -random_hex_str(char *str, int len, boolean_t embedNUL) -{ - uint64_t low, high, value; - int idx; - char digit; - - /* A 64-bit value will only take 16 characters, plus '0x' and NULL. 
*/ - if (len > 19) - len = 19; - - /* We need enough room for at least 1 digit */ - if (len < 4) - return (NULL); - - low = random(); - high = random(); - value = high << 32 | low; - - if (embedNUL) { - /* - * Zero a byte to protect against C string vulnerabilities - * e.g. for userland __stack_chk_guard. - */ - value &= ~(0xffull << 8); - } - - str[0] = '0'; - str[1] = 'x'; - for (idx = 2; idx < len - 1; idx++) { - digit = value & 0xf; - value = value >> 4; - if (digit < 10) - str[idx] = '0' + digit; - else - str[idx] = 'a' + (digit - 10); - } - str[idx] = '\0'; - return (str); -} - /* * Libc has an 8-element array set up for stack guard values. It only fills * in one of those entries, and both gcc and llvm seem to use only a single @@ -3818,49 +3857,81 @@ random_hex_str(char *str, int len, boolean_t embedNUL) #define PFZ_KEY "pfz=" extern user32_addr_t commpage_text32_location; extern user64_addr_t commpage_text64_location; -/* - * Build up the contents of the apple[] string vector - */ + +#define MAIN_STACK_VALUES 4 +#define MAIN_STACK_KEY "main_stack=" + +#define HEX_STR_LEN 18 // 64-bit hex value "0x0123456701234567" + static int -exec_add_apple_strings(struct image_params *imgp) +exec_add_entropy_key(struct image_params *imgp, + const char *key, + int values, + boolean_t embedNUL) { - int i, error; - int new_ptr_size=4; - char guard[19]; - char guard_vec[strlen(GUARD_KEY) + 19 * GUARD_VALUES + 1]; + const int limit = 8; + uint64_t entropy[limit]; + char str[strlen(key) + (HEX_STR_LEN + 1) * limit + 1]; + if (values > limit) { + values = limit; + } - char entropy[19]; - char entropy_vec[strlen(ENTROPY_KEY) + 19 * ENTROPY_VALUES + 1]; + read_random(entropy, sizeof(entropy[0]) * values); - char pfz_string[strlen(PFZ_KEY) + 16 + 4 +1]; - - if( imgp->ip_flags & IMGPF_IS_64BIT) { - new_ptr_size = 8; - snprintf(pfz_string, sizeof(pfz_string),PFZ_KEY "0x%llx",commpage_text64_location); - } else { - snprintf(pfz_string, sizeof(pfz_string),PFZ_KEY 
"0x%x",commpage_text32_location); + if (embedNUL) { + entropy[0] &= ~(0xffull << 8); + } + + int len = snprintf(str, sizeof(str), "%s0x%llx", key, entropy[0]); + int remaining = sizeof(str) - len; + for (int i = 1; i < values && remaining > 0; ++i) { + int start = sizeof(str) - remaining; + len = snprintf(&str[start], remaining, ",0x%llx", entropy[i]); + remaining -= len; } + return exec_add_user_string(imgp, CAST_USER_ADDR_T(str), UIO_SYSSPACE, FALSE); +} + +/* + * Build up the contents of the apple[] string vector + */ +static int +exec_add_apple_strings(struct image_params *imgp, + const load_result_t *load_result) +{ + int error; + int img_ptr_size = (imgp->ip_flags & IMGPF_IS_64BIT) ? 8 : 4; + /* exec_save_path stored the first string */ imgp->ip_applec = 1; /* adding the pfz string */ - error = exec_add_user_string(imgp, CAST_USER_ADDR_T(pfz_string),UIO_SYSSPACE,FALSE); - if(error) - goto bad; - imgp->ip_applec++; + { + char pfz_string[strlen(PFZ_KEY) + HEX_STR_LEN + 1]; + + if (img_ptr_size == 8) { + snprintf(pfz_string, sizeof(pfz_string), PFZ_KEY "0x%llx", commpage_text64_location); + } else { + snprintf(pfz_string, sizeof(pfz_string), PFZ_KEY "0x%x", commpage_text32_location); + } + error = exec_add_user_string(imgp, CAST_USER_ADDR_T(pfz_string), UIO_SYSSPACE, FALSE); + if (error) { + goto bad; + } + imgp->ip_applec++; + } /* adding the NANO_ENGAGE_KEY key */ if (imgp->ip_px_sa) { int proc_flags = (((struct _posix_spawnattr *) imgp->ip_px_sa)->psa_flags); if ((proc_flags & _POSIX_SPAWN_NANO_ALLOCATOR) == _POSIX_SPAWN_NANO_ALLOCATOR) { - char uiapp_string[strlen(NANO_ENGAGE_KEY) + 1]; - - snprintf(uiapp_string, sizeof(uiapp_string), NANO_ENGAGE_KEY); - error = exec_add_user_string(imgp, CAST_USER_ADDR_T(uiapp_string),UIO_SYSSPACE,FALSE); - if (error) + const char *nano_string = NANO_ENGAGE_KEY; + error = exec_add_user_string(imgp, CAST_USER_ADDR_T(nano_string), UIO_SYSSPACE, FALSE); + if (error){ goto bad; + } imgp->ip_applec++; } } @@ -3872,37 +3943,45 @@ 
exec_add_apple_strings(struct image_params *imgp) * (The first random string always contains an embedded NUL so that * __stack_chk_guard also protects against C string vulnerabilities) */ - (void)strlcpy(guard_vec, GUARD_KEY, sizeof (guard_vec)); - for (i = 0; i < GUARD_VALUES; i++) { - random_hex_str(guard, sizeof (guard), i == 0); - if (i) - (void)strlcat(guard_vec, ",", sizeof (guard_vec)); - (void)strlcat(guard_vec, guard, sizeof (guard_vec)); - } - - error = exec_add_user_string(imgp, CAST_USER_ADDR_T(guard_vec), UIO_SYSSPACE, FALSE); - if (error) + error = exec_add_entropy_key(imgp, GUARD_KEY, GUARD_VALUES, TRUE); + if (error) { goto bad; + } imgp->ip_applec++; /* * Supply libc with entropy for system malloc. */ - (void)strlcpy(entropy_vec, ENTROPY_KEY, sizeof(entropy_vec)); - for (i = 0; i < ENTROPY_VALUES; i++) { - random_hex_str(entropy, sizeof (entropy), FALSE); - if (i) - (void)strlcat(entropy_vec, ",", sizeof (entropy_vec)); - (void)strlcat(entropy_vec, entropy, sizeof (entropy_vec)); - } - - error = exec_add_user_string(imgp, CAST_USER_ADDR_T(entropy_vec), UIO_SYSSPACE, FALSE); - if (error) + error = exec_add_entropy_key(imgp, ENTROPY_KEY, ENTROPY_VALUES, FALSE); + if (error) { goto bad; + } imgp->ip_applec++; + /* + * Add MAIN_STACK_KEY: Supplies the address and size of the main thread's + * stack if it was allocated by the kernel. + * + * The guard page is not included in this stack size as libpthread + * expects to add it back in after receiving this value. 
+ */ + if (load_result->unixproc) { + char stack_string[strlen(MAIN_STACK_KEY) + (HEX_STR_LEN + 1) * MAIN_STACK_VALUES + 1]; + snprintf(stack_string, sizeof(stack_string), + MAIN_STACK_KEY "0x%llx,0x%llx,0x%llx,0x%llx", + (uint64_t)load_result->user_stack, + (uint64_t)load_result->user_stack_size, + (uint64_t)load_result->user_stack_alloc, + (uint64_t)load_result->user_stack_alloc_size); + error = exec_add_user_string(imgp, CAST_USER_ADDR_T(stack_string), UIO_SYSSPACE, FALSE); + if (error) { + goto bad; + } + imgp->ip_applec++; + } + /* Align the tail of the combined applev area */ - while (imgp->ip_strspace % new_ptr_size != 0) { + while (imgp->ip_strspace % img_ptr_size != 0) { *imgp->ip_strendp++ = '\0'; imgp->ip_strspace--; } @@ -4040,8 +4119,9 @@ exec_check_permissions(struct image_params *imgp) static int exec_handle_sugid(struct image_params *imgp) { - kauth_cred_t cred = vfs_context_ucred(imgp->ip_vfs_context); proc_t p = vfs_context_proc(imgp->ip_vfs_context); + kauth_cred_t cred = vfs_context_ucred(imgp->ip_vfs_context); + kauth_cred_t my_cred, my_new_cred; int i; int leave_sugid_clear = 0; int mac_reset_ipc = 0; @@ -4112,16 +4192,67 @@ exec_handle_sugid(struct image_params *imgp) * membership resolution, then dropping their * effective privilege to that of the desired * final credential state. + * + * Modifications to p_ucred must be guarded using the + * proc's ucred lock. This prevents others from accessing + * a garbage credential. 
*/ - if (imgp->ip_origvattr->va_mode & VSUID) { - p->p_ucred = kauth_cred_setresuid(p->p_ucred, KAUTH_UID_NONE, imgp->ip_origvattr->va_uid, imgp->ip_origvattr->va_uid, KAUTH_UID_NONE); + while (imgp->ip_origvattr->va_mode & VSUID) { + my_cred = kauth_cred_proc_ref(p); + my_new_cred = kauth_cred_setresuid(my_cred, KAUTH_UID_NONE, imgp->ip_origvattr->va_uid, imgp->ip_origvattr->va_uid, KAUTH_UID_NONE); + + if (my_new_cred == my_cred) { + kauth_cred_unref(&my_cred); + break; + } + /* update cred on proc */ + proc_ucred_lock(p); + + if (p->p_ucred != my_cred) { + proc_ucred_unlock(p); + kauth_cred_unref(&my_new_cred); + continue; + } + + /* donate cred reference on my_new_cred to p->p_ucred */ + p->p_ucred = my_new_cred; PROC_UPDATE_CREDS_ONPROC(p); + proc_ucred_unlock(p); + + /* drop additional reference that was taken on the previous cred */ + kauth_cred_unref(&my_cred); + + break; } - if (imgp->ip_origvattr->va_mode & VSGID) { - p->p_ucred = kauth_cred_setresgid(p->p_ucred, KAUTH_GID_NONE, imgp->ip_origvattr->va_gid, imgp->ip_origvattr->va_gid); + + while (imgp->ip_origvattr->va_mode & VSGID) { + my_cred = kauth_cred_proc_ref(p); + my_new_cred = kauth_cred_setresgid(my_cred, KAUTH_GID_NONE, imgp->ip_origvattr->va_gid, imgp->ip_origvattr->va_gid); + + if (my_new_cred == my_cred) { + kauth_cred_unref(&my_cred); + break; + } + /* update cred on proc */ + proc_ucred_lock(p); + + if (p->p_ucred != my_cred) { + proc_ucred_unlock(p); + kauth_cred_unref(&my_new_cred); + continue; + } + + /* donate cred reference on my_new_cred to p->p_ucred */ + p->p_ucred = my_new_cred; PROC_UPDATE_CREDS_ONPROC(p); + proc_ucred_unlock(p); + + /* drop additional reference that was taken on the previous cred */ + kauth_cred_unref(&my_cred); + + break; } #if CONFIG_MACF @@ -4237,6 +4368,7 @@ exec_handle_sugid(struct image_params *imgp) MALLOC(ndp, struct nameidata *, sizeof(*ndp), M_TEMP, M_WAITOK | M_ZERO); if (ndp == NULL) { + fp_free(p, indx, fp); error = ENOMEM; break; } @@ -4247,6 
+4379,7 @@ exec_handle_sugid(struct image_params *imgp) if ((error = vn_open(ndp, flag, 0)) != 0) { fp_free(p, indx, fp); + FREE(ndp, M_TEMP); break; } @@ -4287,11 +4420,41 @@ exec_handle_sugid(struct image_params *imgp) /* * Implement the semantic where the effective user and group become * the saved user and group in exec'ed programs. + * + * Modifications to p_ucred must be guarded using the + * proc's ucred lock. This prevents others from accessing + * a garbage credential. */ - p->p_ucred = kauth_cred_setsvuidgid(p->p_ucred, kauth_cred_getuid(p->p_ucred), kauth_cred_getgid(p->p_ucred)); - /* update cred on proc */ - PROC_UPDATE_CREDS_ONPROC(p); - + for (;;) { + my_cred = kauth_cred_proc_ref(p); + my_new_cred = kauth_cred_setsvuidgid(my_cred, kauth_cred_getuid(my_cred), kauth_cred_getgid(my_cred)); + + if (my_new_cred == my_cred) { + kauth_cred_unref(&my_cred); + break; + } + + /* update cred on proc */ + proc_ucred_lock(p); + + if (p->p_ucred != my_cred) { + proc_ucred_unlock(p); + kauth_cred_unref(&my_new_cred); + continue; + } + + /* donate cred reference on my_new_cred to p->p_ucred */ + p->p_ucred = my_new_cred; + PROC_UPDATE_CREDS_ONPROC(p); + proc_ucred_unlock(p); + + /* drop additional reference that was taken on the previous cred */ + kauth_cred_unref(&my_cred); + + break; + } + + /* Update the process' identity version and set the security token */ p->p_idversion++; set_security_token(p); @@ -4331,7 +4494,7 @@ create_unix_stack(vm_map_t map, load_result_t* load_result, p->user_stack = user_stack; proc_unlock(p); - if (!load_result->prog_allocated_stack) { + if (load_result->user_stack_alloc_size > 0) { /* * Allocate enough space for the maximum stack size we * will ever authorize and an extra page to act as @@ -4339,22 +4502,22 @@ create_unix_stack(vm_map_t map, load_result_t* load_result, * vm_initial_limit_stack takes care of the extra guard page. * Otherwise we must allocate it ourselves. 
*/ - - size = mach_vm_round_page(load_result->user_stack_size); - if (load_result->prog_stack_size) - size += PAGE_SIZE; + if (mach_vm_round_page_overflow(load_result->user_stack_alloc_size, &size)) { + return KERN_INVALID_ARGUMENT; + } addr = mach_vm_trunc_page(load_result->user_stack - size); kr = mach_vm_allocate(map, &addr, size, - VM_MAKE_TAG(VM_MEMORY_STACK) | - VM_FLAGS_FIXED); + VM_MAKE_TAG(VM_MEMORY_STACK) | + VM_FLAGS_FIXED); if (kr != KERN_SUCCESS) { - /* If can't allocate at default location, try anywhere */ + // Can't allocate at default location, try anywhere addr = 0; kr = mach_vm_allocate(map, &addr, size, - VM_MAKE_TAG(VM_MEMORY_STACK) | - VM_FLAGS_ANYWHERE); - if (kr != KERN_SUCCESS) + VM_MAKE_TAG(VM_MEMORY_STACK) | + VM_FLAGS_ANYWHERE); + if (kr != KERN_SUCCESS) { return kr; + } user_stack = addr + size; load_result->user_stack = user_stack; @@ -4364,22 +4527,27 @@ create_unix_stack(vm_map_t map, load_result_t* load_result, proc_unlock(p); } + load_result->user_stack_alloc = addr; + /* * And prevent access to what's above the current stack * size limit for this process. */ - prot_addr = addr; - if (load_result->prog_stack_size) + if (load_result->user_stack_size == 0) { + load_result->user_stack_size = unix_stack_size(p); + prot_size = mach_vm_trunc_page(size - load_result->user_stack_size); + } else { prot_size = PAGE_SIZE; - else - prot_size = mach_vm_trunc_page(size - unix_stack_size(p)); + } + + prot_addr = addr; kr = mach_vm_protect(map, - prot_addr, - prot_size, - FALSE, - VM_PROT_NONE); + prot_addr, + prot_size, + FALSE, + VM_PROT_NONE); if (kr != KERN_SUCCESS) { - (void) mach_vm_deallocate(map, addr, size); + (void)mach_vm_deallocate(map, addr, size); return kr; } } @@ -4407,17 +4575,17 @@ create_unix_stack(vm_map_t map, load_result_t* load_result, * for the first time. This is done to ensure that bsd_init() * has run to completion. * - * The address map of the first manufactured process is 32 bit. 
- * WHEN this becomes 64b, this code will fail; it needs to be - * made 64b capable. + * The address map of the first manufactured process matches the + * word width of the kernel. Once the self-exec completes, the + * initproc might be different. */ static int load_init_program_at_path(proc_t p, user_addr_t scratch_addr, const char* path) { - uint32_t argv[3]; - uint32_t argc = 0; int retval[2]; + int error; struct execve_args init_exec_args; + user_addr_t argv0 = USER_ADDR_NULL, argv1 = USER_ADDR_NULL; /* * Validate inputs and pre-conditions @@ -4426,18 +4594,16 @@ load_init_program_at_path(proc_t p, user_addr_t scratch_addr, const char* path) assert(scratch_addr); assert(path); - if (IS_64BIT_PROCESS(p)) { - panic("Init against 64b primordial proc not implemented"); - } - /* * Copy out program name. */ size_t path_length = strlen(path) + 1; - (void) copyout(path, scratch_addr, path_length); + argv0 = scratch_addr; + error = copyout(path, argv0, path_length); + if (error) + return error; - argv[argc++] = (uint32_t)scratch_addr; - scratch_addr = USER_ADDR_ALIGN(scratch_addr + path_length, 16); + scratch_addr = USER_ADDR_ALIGN(scratch_addr + path_length, sizeof(user_addr_t)); /* * Put out first (and only) argument, similarly. @@ -4447,26 +4613,40 @@ load_init_program_at_path(proc_t p, user_addr_t scratch_addr, const char* path) const char *init_args = "-s"; size_t init_args_length = strlen(init_args)+1; - copyout(init_args, scratch_addr, init_args_length); + argv1 = scratch_addr; + error = copyout(init_args, argv1, init_args_length); + if (error) + return error; - argv[argc++] = (uint32_t)scratch_addr; - scratch_addr = USER_ADDR_ALIGN(scratch_addr + init_args_length, 16); + scratch_addr = USER_ADDR_ALIGN(scratch_addr + init_args_length, sizeof(user_addr_t)); } - /* - * Null-end the argument list - */ - argv[argc] = 0; - - /* - * Copy out the argument list. 
- */ - (void) copyout(argv, scratch_addr, sizeof(argv)); + if (proc_is64bit(p)) { + user64_addr_t argv64bit[3]; + + argv64bit[0] = argv0; + argv64bit[1] = argv1; + argv64bit[2] = USER_ADDR_NULL; + + error = copyout(argv64bit, scratch_addr, sizeof(argv64bit)); + if (error) + return error; + } else { + user32_addr_t argv32bit[3]; + + argv32bit[0] = (user32_addr_t)argv0; + argv32bit[1] = (user32_addr_t)argv1; + argv32bit[2] = USER_ADDR_NULL; + + error = copyout(argv32bit, scratch_addr, sizeof(argv32bit)); + if (error) + return error; + } /* * Set up argument block for fake call to execve. */ - init_exec_args.fname = CAST_USER_ADDR_T(argv[0]); + init_exec_args.fname = argv0; init_exec_args.argp = scratch_addr; init_exec_args.envp = USER_ADDR_NULL; @@ -4483,7 +4663,6 @@ static const char * init_programs[] = { "/usr/local/sbin/launchd.debug", #endif #if DEVELOPMENT || DEBUG - /* Remove DEBUG conditional when is fixed */ "/usr/local/sbin/launchd.development", #endif "/sbin/launchd", @@ -4509,9 +4688,6 @@ static const char * init_programs[] = { * the kcsuffix boot-arg, setting launchdsuffix to "" or "release" * will force /sbin/launchd to be selected. * - * The DEBUG kernel will continue to check for a .development - * version until is fixed. 
- * * Search order by build: * * DEBUG DEVELOPMENT RELEASE PATH @@ -4526,9 +4702,11 @@ load_init_program(proc_t p) { uint32_t i; int error; - vm_offset_t scratch_addr = VM_MIN_ADDRESS; + vm_map_t map = current_map(); + mach_vm_offset_t scratch_addr = 0; + mach_vm_size_t map_page_size = vm_map_page_size(map); - (void) vm_allocate(current_map(), &scratch_addr, PAGE_SIZE, VM_FLAGS_ANYWHERE); + (void) mach_vm_allocate(map, &scratch_addr, map_page_size, VM_FLAGS_ANYWHERE); #if CONFIG_MEMORYSTATUS && CONFIG_JETSAM (void) memorystatus_init_at_boot_snapshot(); #endif /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */ @@ -4542,7 +4720,7 @@ load_init_program(proc_t p) (strcmp(launchd_suffix, "release") == 0)); if (is_release_suffix) { - error = load_init_program_at_path(p, CAST_USER_ADDR_T(scratch_addr), "/sbin/launchd"); + error = load_init_program_at_path(p, (user_addr_t)scratch_addr, "/sbin/launchd"); if (!error) return; @@ -4553,7 +4731,7 @@ load_init_program(proc_t p) /* All the error data is lost in the loop below, don't * attempt to save it. 
*/ - if (!load_init_program_at_path(p, CAST_USER_ADDR_T(scratch_addr), launchd_path)) { + if (!load_init_program_at_path(p, (user_addr_t)scratch_addr, launchd_path)) { return; } } @@ -4562,7 +4740,7 @@ load_init_program(proc_t p) error = ENOENT; for (i = 0; i < sizeof(init_programs)/sizeof(init_programs[0]); i++) { - error = load_init_program_at_path(p, CAST_USER_ADDR_T(scratch_addr), init_programs[i]); + error = load_init_program_at_path(p, (user_addr_t)scratch_addr, init_programs[i]); if (!error) return; } @@ -4598,6 +4776,7 @@ load_return_to_errno(load_return_t lrtn) case LOAD_BADARCH: return EBADARCH; case LOAD_BADMACHO: + case LOAD_BADMACHO_UPX: return EBADMACHO; case LOAD_SHLIB: return ESHLIBVERS; @@ -4923,6 +5102,7 @@ check_for_signature(proc_t p, struct image_params *imgp) boolean_t require_success = FALSE; int spawn = (imgp->ip_flags & IMGPF_SPAWN); int vfexec = (imgp->ip_flags & IMGPF_VFORK_EXEC); + os_reason_t signature_failure_reason = OS_REASON_NULL; /* * Override inherited code signing flags with the @@ -4944,11 +5124,22 @@ check_for_signature(proc_t p, struct image_params *imgp) * approve of exec, kill and return immediately. 
*/ if (imgp->ip_mac_return != 0) { + + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) | DBG_FUNC_NONE, + p->p_pid, OS_REASON_EXEC, EXEC_EXIT_REASON_SECURITY_POLICY, 0, 0); + signature_failure_reason = os_reason_create(OS_REASON_EXEC, EXEC_EXIT_REASON_SECURITY_POLICY); error = imgp->ip_mac_return; unexpected_failure = TRUE; goto done; } + if (imgp->ip_cs_error != OS_REASON_NULL) { + signature_failure_reason = imgp->ip_cs_error; + imgp->ip_cs_error = OS_REASON_NULL; + error = EACCES; + goto done; + } + /* check if callout to taskgated is needed */ if (!taskgated_required(p, &require_success)) { error = 0; @@ -4958,8 +5149,12 @@ check_for_signature(proc_t p, struct image_params *imgp) kr = task_get_task_access_port(p->task, &port); if (KERN_SUCCESS != kr || !IPC_PORT_VALID(port)) { error = 0; - if (require_success) + if (require_success) { + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) | DBG_FUNC_NONE, + p->p_pid, OS_REASON_CODESIGNING, CODESIGNING_EXIT_REASON_TASK_ACCESS_PORT, 0, 0); + signature_failure_reason = os_reason_create(OS_REASON_CODESIGNING, CODESIGNING_EXIT_REASON_TASK_ACCESS_PORT); error = EACCES; + } goto done; } @@ -4978,9 +5173,17 @@ check_for_signature(proc_t p, struct image_params *imgp) break; case KERN_FAILURE: error = EACCES; + + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) | DBG_FUNC_NONE, + p->p_pid, OS_REASON_CODESIGNING, CODESIGNING_EXIT_REASON_TASKGATED_INVALID_SIG, 0, 0); + signature_failure_reason = os_reason_create(OS_REASON_CODESIGNING, CODESIGNING_EXIT_REASON_TASKGATED_INVALID_SIG); goto done; default: error = EACCES; + + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) | DBG_FUNC_NONE, + p->p_pid, OS_REASON_EXEC, EXEC_EXIT_REASON_TASKGATED_OTHER, 0, 0); + signature_failure_reason = os_reason_create(OS_REASON_EXEC, EXEC_EXIT_REASON_TASKGATED_OTHER); unexpected_failure = TRUE; goto done; } @@ -5004,12 +5207,20 @@ 
check_for_signature(proc_t p, struct image_params *imgp) p->p_csflags |= CS_KILLED; /* make very sure execution fails */ if (vfexec || spawn) { - psignal_vfork(p, p->task, imgp->ip_new_thread, SIGKILL); + assert(signature_failure_reason != OS_REASON_NULL); + psignal_vfork_with_reason(p, p->task, imgp->ip_new_thread, + SIGKILL, signature_failure_reason); + signature_failure_reason = OS_REASON_NULL; error = 0; } else { - psignal(p, SIGKILL); + assert(signature_failure_reason != OS_REASON_NULL); + psignal_with_reason(p, SIGKILL, signature_failure_reason); + signature_failure_reason = OS_REASON_NULL; } } + + /* If we hit this, we likely would have leaked an exit reason */ + assert(signature_failure_reason == OS_REASON_NULL); return error; } diff --git a/bsd/kern/kern_exit.c b/bsd/kern/kern_exit.c index f9739accd..6cf36945d 100644 --- a/bsd/kern/kern_exit.c +++ b/bsd/kern/kern_exit.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2011, 2015 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -95,6 +95,7 @@ #include #include #include +#include #include #include #include @@ -114,6 +115,7 @@ #include #include +#include #include #include @@ -123,11 +125,9 @@ #include #include #include -#include +#include -#if VM_PRESSURE_EVENTS -#include -#endif +#include #if CONFIG_MEMORYSTATUS #include @@ -135,15 +135,14 @@ #if CONFIG_DTRACE /* Do not include dtrace.h, it redefines kmem_[alloc/free] */ -extern void (*dtrace_fasttrap_exit_ptr)(proc_t); -extern void (*dtrace_helpers_cleanup)(proc_t); -extern void dtrace_lazy_dofs_destroy(proc_t); +void dtrace_proc_exit(proc_t p); #include #endif #if CONFIG_MACF #include +#include #include #endif @@ -155,18 +154,25 @@ extern void dtrace_lazy_dofs_destroy(proc_t); #include -extern boolean_t init_task_died; void proc_prepareexit(proc_t p, int rv, boolean_t perf_notify); +void gather_populate_corpse_crashinfo(proc_t p, void *crash_info_ptr, mach_exception_data_type_t code, mach_exception_data_type_t subcode, uint64_t *udata_buffer, int num_udata); +mach_exception_data_type_t proc_encode_exit_exception_code(proc_t p); void vfork_exit(proc_t p, int rv); void vproc_exit(proc_t p); __private_extern__ void munge_user64_rusage(struct rusage *a_rusage_p, struct user64_rusage *a_user_rusage_p); __private_extern__ void munge_user32_rusage(struct rusage *a_rusage_p, struct user32_rusage *a_user_rusage_p); static int reap_child_locked(proc_t parent, proc_t child, int deadparent, int reparentedtoinit, int locked, int droplock); -static void populate_corpse_crashinfo(proc_t p, void *crash_info_ptr, struct rusage_superset *rup, mach_exception_data_type_t code, mach_exception_data_type_t subcode); -extern int proc_pidpathinfo(proc_t p, uint64_t arg, user_addr_t buffer, uint32_t buffersize, int32_t *retval); +static void populate_corpse_crashinfo(proc_t p, void *crash_info_ptr, struct rusage_superset *rup, mach_exception_data_type_t code, mach_exception_data_type_t subcode, uint64_t 
*udata_buffer, int num_udata); +static void proc_update_corpse_exception_codes(proc_t p, mach_exception_data_type_t *code, mach_exception_data_type_t *subcode); +extern int proc_pidpathinfo_internal(proc_t p, uint64_t arg, char *buffer, uint32_t buffersize, int32_t *retval); +static void abort_with_payload_internal(proc_t p, uint32_t reason_namespace, uint64_t reason_code, user_addr_t payload, + uint32_t payload_size, user_addr_t reason_string, uint64_t reason_flags); static __attribute__((noinline)) void launchd_crashed_panic(proc_t p, int rv); extern void proc_piduniqidentifierinfo(proc_t p, struct proc_uniqidentifierinfo *p_uniqidinfo); +extern void task_coalition_ids(task_t task, uint64_t ids[COALITION_NUM_TYPES]); +extern uint64_t get_task_phys_footprint_limit(task_t); +int proc_list_uptrs(void *p, uint64_t *udata_buffer, int size); /* @@ -233,7 +239,59 @@ copyoutsiginfo(user_siginfo_t *native, boolean_t is64, user_addr_t uaddr) } } -static void populate_corpse_crashinfo(proc_t p, void *crash_info_ptr, struct rusage_superset *rup, mach_exception_data_type_t code, mach_exception_data_type_t subcode) +void gather_populate_corpse_crashinfo(proc_t p, void *crash_info_ptr, mach_exception_data_type_t code, mach_exception_data_type_t subcode, uint64_t *udata_buffer, int num_udata) +{ + struct rusage_superset rup; + + gather_rusage_info(p, &rup.ri, RUSAGE_INFO_CURRENT); + rup.ri.ri_phys_footprint = 0; + populate_corpse_crashinfo(p, crash_info_ptr, &rup, code, subcode, udata_buffer, num_udata); +} + +static void proc_update_corpse_exception_codes(proc_t p, mach_exception_data_type_t *code, mach_exception_data_type_t *subcode) +{ + mach_exception_data_type_t code_update = *code; + mach_exception_data_type_t subcode_update = *subcode; + if (p->p_exit_reason == OS_REASON_NULL) { + return; + } + + switch (p->p_exit_reason->osr_namespace) { + case OS_REASON_JETSAM: + if (p->p_exit_reason->osr_code == JETSAM_REASON_MEMORY_PERPROCESSLIMIT) { + /* Update the code with 
EXC_RESOURCE code for high memory watermark */ + EXC_RESOURCE_ENCODE_TYPE(code_update, RESOURCE_TYPE_MEMORY); + EXC_RESOURCE_ENCODE_FLAVOR(code_update, FLAVOR_HIGH_WATERMARK); + EXC_RESOURCE_HWM_ENCODE_LIMIT(code_update, ((get_task_phys_footprint_limit(p->task)) >> 20)); + subcode_update = 0; + break; + } + + break; + default: + break; + } + + *code = code_update; + *subcode = subcode_update; + return; +} + +mach_exception_data_type_t proc_encode_exit_exception_code(proc_t p) +{ + uint64_t subcode = 0; + + if (p->p_exit_reason == OS_REASON_NULL) { + return 0; + } + + /* Embed first 32 bits of osr_namespace and osr_code in exception code */ + ENCODE_OSR_NAMESPACE_TO_MACH_EXCEPTION_CODE(subcode, p->p_exit_reason->osr_namespace); + ENCODE_OSR_CODE_TO_MACH_EXCEPTION_CODE(subcode, p->p_exit_reason->osr_code); + return (mach_exception_data_type_t)subcode; +} + +static void populate_corpse_crashinfo(proc_t p, void *crash_info_ptr, struct rusage_superset *rup, mach_exception_data_type_t code, mach_exception_data_type_t subcode, uint64_t *udata_buffer, int num_udata) { mach_vm_address_t uaddr = 0; mach_exception_data_type_t exc_codes[EXCEPTION_CODE_MAX]; @@ -245,83 +303,87 @@ static void populate_corpse_crashinfo(proc_t p, void *crash_info_ptr, struct rus int retval = 0; uint64_t crashed_threadid = thread_tid(current_thread()); unsigned int pflags = 0; + uint64_t max_footprint_mb; + uint64_t max_footprint; #if CONFIG_MEMORYSTATUS int memstat_dirty_flags = 0; #endif if (KERN_SUCCESS == kcdata_get_memory_addr(crash_info_ptr, TASK_CRASHINFO_EXCEPTION_CODES, sizeof(exc_codes), &uaddr)) { - copyout(exc_codes, uaddr, sizeof(exc_codes)); + kcdata_memcpy(crash_info_ptr, uaddr, exc_codes, sizeof(exc_codes)); } if (KERN_SUCCESS == kcdata_get_memory_addr(crash_info_ptr, TASK_CRASHINFO_PID, sizeof(p->p_pid), &uaddr)) { - copyout(&p->p_pid, uaddr, sizeof(p->p_pid)); + kcdata_memcpy(crash_info_ptr, uaddr, &p->p_pid, sizeof(p->p_pid)); } if (KERN_SUCCESS == 
kcdata_get_memory_addr(crash_info_ptr, TASK_CRASHINFO_PPID, sizeof(p->p_ppid), &uaddr)) { - copyout(&p->p_ppid, uaddr, sizeof(p->p_ppid)); + kcdata_memcpy(crash_info_ptr, uaddr, &p->p_ppid, sizeof(p->p_ppid)); } if (KERN_SUCCESS == kcdata_get_memory_addr(crash_info_ptr, TASK_CRASHINFO_CRASHED_THREADID, sizeof(uint64_t), &uaddr)) { - copyout(&crashed_threadid, uaddr, sizeof(uint64_t)); - } - - if (KERN_SUCCESS == kcdata_get_memory_addr(crash_info_ptr, TASK_CRASHINFO_RUSAGE, sizeof(struct rusage), &uaddr)) { - copyout(&rup->ru, uaddr, sizeof(struct rusage)); + kcdata_memcpy(crash_info_ptr, uaddr, &crashed_threadid, sizeof(uint64_t)); } if (KERN_SUCCESS == kcdata_get_memory_addr(crash_info_ptr, TASK_CRASHINFO_BSDINFOWITHUNIQID, sizeof(struct proc_uniqidentifierinfo), &uaddr)) { proc_piduniqidentifierinfo(p, &p_uniqidinfo); - copyout(&p_uniqidinfo, uaddr, sizeof(struct proc_uniqidentifierinfo)); + kcdata_memcpy(crash_info_ptr, uaddr, &p_uniqidinfo, sizeof(struct proc_uniqidentifierinfo)); } if (KERN_SUCCESS == kcdata_get_memory_addr(crash_info_ptr, TASK_CRASHINFO_RUSAGE_INFO, sizeof(rusage_info_current), &uaddr)) { - copyout(&rup->ri, uaddr, sizeof(rusage_info_current)); + kcdata_memcpy(crash_info_ptr, uaddr, &rup->ri, sizeof(rusage_info_current)); } if (KERN_SUCCESS == kcdata_get_memory_addr(crash_info_ptr, TASK_CRASHINFO_PROC_CSFLAGS, sizeof(p->p_csflags), &uaddr)) { - copyout(&p->p_csflags, uaddr, sizeof(p->p_csflags)); + kcdata_memcpy(crash_info_ptr, uaddr, &p->p_csflags, sizeof(p->p_csflags)); } if (KERN_SUCCESS == kcdata_get_memory_addr(crash_info_ptr, TASK_CRASHINFO_PROC_NAME, sizeof(p->p_comm), &uaddr)) { - copyout(&p->p_comm, uaddr, sizeof(p->p_comm)); + kcdata_memcpy(crash_info_ptr, uaddr, &p->p_comm, sizeof(p->p_comm)); } if (KERN_SUCCESS == kcdata_get_memory_addr(crash_info_ptr, TASK_CRASHINFO_PROC_STARTTIME, sizeof(p->p_start), &uaddr)) { struct timeval64 t64; t64.tv_sec = (int64_t)p->p_start.tv_sec; t64.tv_usec = (int64_t)p->p_start.tv_usec; - 
copyout(&t64, uaddr, sizeof(t64)); + kcdata_memcpy(crash_info_ptr, uaddr, &t64, sizeof(t64)); } if (KERN_SUCCESS == kcdata_get_memory_addr(crash_info_ptr, TASK_CRASHINFO_USERSTACK, sizeof(p->user_stack), &uaddr)) { - copyout(&p->user_stack, uaddr, sizeof(p->user_stack)); + kcdata_memcpy(crash_info_ptr, uaddr, &p->user_stack, sizeof(p->user_stack)); } if (KERN_SUCCESS == kcdata_get_memory_addr(crash_info_ptr, TASK_CRASHINFO_ARGSLEN, sizeof(p->p_argslen), &uaddr)) { - copyout(&p->p_argslen, uaddr, sizeof(p->p_argslen)); + kcdata_memcpy(crash_info_ptr, uaddr, &p->p_argslen, sizeof(p->p_argslen)); } if (KERN_SUCCESS == kcdata_get_memory_addr(crash_info_ptr, TASK_CRASHINFO_PROC_ARGC, sizeof(p->p_argc), &uaddr)) { - copyout(&p->p_argc, uaddr, sizeof(p->p_argc)); + kcdata_memcpy(crash_info_ptr, uaddr, &p->p_argc, sizeof(p->p_argc)); } if (KERN_SUCCESS == kcdata_get_memory_addr(crash_info_ptr, TASK_CRASHINFO_PROC_PATH, MAXPATHLEN, &uaddr)) { - proc_pidpathinfo(p, 0, uaddr, MAXPATHLEN, &retval); + char *buf = (char *) kalloc(MAXPATHLEN); + if (buf != NULL) { + bzero(buf, MAXPATHLEN); + proc_pidpathinfo_internal(p, 0, buf, MAXPATHLEN, &retval); + kcdata_memcpy(crash_info_ptr, uaddr, buf, MAXPATHLEN); + kfree(buf, MAXPATHLEN); + } } pflags = p->p_flag & (P_LP64 | P_SUGID); if (KERN_SUCCESS == kcdata_get_memory_addr(crash_info_ptr, TASK_CRASHINFO_PROC_FLAGS, sizeof(pflags), &uaddr)) { - copyout(&pflags, uaddr, sizeof(pflags)); + kcdata_memcpy(crash_info_ptr, uaddr, &pflags, sizeof(pflags)); } if (KERN_SUCCESS == kcdata_get_memory_addr(crash_info_ptr, TASK_CRASHINFO_UID, sizeof(p->p_uid), &uaddr)) { - copyout(&p->p_uid, uaddr, sizeof(p->p_uid)); + kcdata_memcpy(crash_info_ptr, uaddr, &p->p_uid, sizeof(p->p_uid)); } if (KERN_SUCCESS == kcdata_get_memory_addr(crash_info_ptr, TASK_CRASHINFO_GID, sizeof(p->p_gid), &uaddr)) { - copyout(&p->p_gid, uaddr, sizeof(p->p_gid)); + kcdata_memcpy(crash_info_ptr, uaddr, &p->p_gid, sizeof(p->p_gid)); } cputype = cpu_type() & ~CPU_ARCH_MASK; @@ 
-329,37 +391,126 @@ static void populate_corpse_crashinfo(proc_t p, void *crash_info_ptr, struct rus cputype |= CPU_ARCH_ABI64; if (KERN_SUCCESS == kcdata_get_memory_addr(crash_info_ptr, TASK_CRASHINFO_CPUTYPE, sizeof(cpu_type_t), &uaddr)) { - copyout(&cputype, uaddr, sizeof(cpu_type_t)); + kcdata_memcpy(crash_info_ptr, uaddr, &cputype, sizeof(cpu_type_t)); + } + + if (KERN_SUCCESS == kcdata_get_memory_addr(crash_info_ptr, TASK_CRASHINFO_MEMORY_LIMIT, sizeof(max_footprint_mb), &uaddr)) { + max_footprint = get_task_phys_footprint_limit(p->task); + max_footprint_mb = max_footprint >> 20; + kcdata_memcpy(crash_info_ptr, uaddr, &max_footprint_mb, sizeof(max_footprint_mb)); } bzero(&pwqinfo, sizeof(struct proc_workqueueinfo)); retval = fill_procworkqueue(p, &pwqinfo); if (retval == 0) { if (KERN_SUCCESS == kcdata_get_memory_addr(crash_info_ptr, TASK_CRASHINFO_WORKQUEUEINFO, sizeof(struct proc_workqueueinfo), &uaddr)) { - copyout(&pwqinfo, uaddr, sizeof(struct proc_workqueueinfo)); + kcdata_memcpy(crash_info_ptr, uaddr, &pwqinfo, sizeof(struct proc_workqueueinfo)); } } if (KERN_SUCCESS == kcdata_get_memory_addr(crash_info_ptr, TASK_CRASHINFO_RESPONSIBLE_PID, sizeof(p->p_responsible_pid), &uaddr)) { - copyout(&p->p_responsible_pid, uaddr, sizeof(p->p_responsible_pid)); + kcdata_memcpy(crash_info_ptr, uaddr, &p->p_responsible_pid, sizeof(p->p_responsible_pid)); + } + +#if CONFIG_COALITIONS + if (KERN_SUCCESS == kcdata_get_memory_addr_for_array(crash_info_ptr, TASK_CRASHINFO_COALITION_ID, sizeof(uint64_t), COALITION_NUM_TYPES, &uaddr)) { + uint64_t coalition_ids[COALITION_NUM_TYPES]; + task_coalition_ids(p->task, coalition_ids); + kcdata_memcpy(crash_info_ptr, uaddr, coalition_ids, sizeof(coalition_ids)); } +#endif /* CONFIG_COALITIONS */ #if CONFIG_MEMORYSTATUS memstat_dirty_flags = memorystatus_dirty_get(p); if (KERN_SUCCESS == kcdata_get_memory_addr(crash_info_ptr, TASK_CRASHINFO_DIRTY_FLAGS, sizeof(memstat_dirty_flags), &uaddr)) { - copyout(&memstat_dirty_flags, uaddr, 
sizeof(memstat_dirty_flags)); + kcdata_memcpy(crash_info_ptr, uaddr, &memstat_dirty_flags, sizeof(memstat_dirty_flags)); } #endif + if (p->p_exit_reason != OS_REASON_NULL) { + if (KERN_SUCCESS == kcdata_get_memory_addr(crash_info_ptr, EXIT_REASON_SNAPSHOT, sizeof(struct exit_reason_snapshot), &uaddr)) { + struct exit_reason_snapshot ers = { + .ers_namespace = p->p_exit_reason->osr_namespace, + .ers_code = p->p_exit_reason->osr_code, + .ers_flags = p->p_exit_reason->osr_flags + }; + + kcdata_memcpy(crash_info_ptr, uaddr, &ers, sizeof(ers)); + } + + if (p->p_exit_reason->osr_kcd_buf != 0) { + uint32_t reason_buf_size = kcdata_memory_get_used_bytes(&p->p_exit_reason->osr_kcd_descriptor); + assert(reason_buf_size != 0); + + if (KERN_SUCCESS == kcdata_get_memory_addr(crash_info_ptr, KCDATA_TYPE_NESTED_KCDATA, reason_buf_size, &uaddr)) { + kcdata_memcpy(crash_info_ptr, uaddr, p->p_exit_reason->osr_kcd_buf, reason_buf_size); + } + } + } + + if (num_udata > 0) { + if (KERN_SUCCESS == kcdata_get_memory_addr_for_array(crash_info_ptr, TASK_CRASHINFO_UDATA_PTRS, + sizeof(uint64_t), num_udata, &uaddr)) { + kcdata_memcpy(crash_info_ptr, uaddr, udata_buffer, sizeof(uint64_t) * num_udata); + } + } +} + +/* + * We only parse exit reason kcdata blobs for launchd when it dies + * and we're going to panic. + * + * Meant to be called immediately before panicking. 
+ */ +char * +launchd_exit_reason_get_string_desc(os_reason_t exit_reason) +{ + kcdata_iter_t iter; + + if (exit_reason == OS_REASON_NULL || exit_reason->osr_kcd_buf == NULL || + exit_reason->osr_bufsize == 0) { + return NULL; + } + + iter = kcdata_iter(exit_reason->osr_kcd_buf, exit_reason->osr_bufsize); + if (!kcdata_iter_valid(iter)) { +#if DEBUG || DEVELOPMENT + printf("launchd exit reason has invalid exit reason buffer\n"); +#endif + return NULL; + } + + if (kcdata_iter_type(iter) != KCDATA_BUFFER_BEGIN_OS_REASON) { +#if DEBUG || DEVELOPMENT + printf("launchd exit reason buffer type mismatch, expected %d got %d\n", + KCDATA_BUFFER_BEGIN_OS_REASON, kcdata_iter_type(iter)); +#endif + return NULL; + } + + iter = kcdata_iter_find_type(iter, EXIT_REASON_USER_DESC); + if (!kcdata_iter_valid(iter)) { + return NULL; + } + + return (char *)kcdata_iter_payload(iter); } static __attribute__((noinline)) void launchd_crashed_panic(proc_t p, int rv) { - printf("pid 1 exited (signal %d, exit %d)\n", - WTERMSIG(rv), WEXITSTATUS(rv)); + char *launchd_exit_reason_desc = launchd_exit_reason_get_string_desc(p->p_exit_reason); + + if (p->p_exit_reason == OS_REASON_NULL) { + printf("pid 1 exited -- no exit reason available -- (signal %d, exit %d)\n", + WTERMSIG(rv), WEXITSTATUS(rv)); + } else { + printf("pid 1 exited -- exit reason namespace %d subcode 0x%llx, description %s\n", + p->p_exit_reason->osr_namespace, p->p_exit_reason->osr_code, launchd_exit_reason_desc ? + launchd_exit_reason_desc : "none"); + } -#if (DEVELOPMENT || DEBUG) +#if (DEVELOPMENT || DEBUG) && CONFIG_COREDUMP /* * For debugging purposes, generate a core file of initproc before * panicking. 
Leave at least 300 MB free on the root volume, and ignore @@ -389,18 +540,59 @@ launchd_crashed_panic(proc_t p, int rv) printf("Generated initproc core file in %d.%03d seconds\n", (uint32_t)tv_sec, tv_msec); } -#endif +#endif /* (DEVELOPMENT || DEBUG) && CONFIG_COREDUMP */ sync(p, (void *)NULL, (int *)NULL); - panic_plain("%s exited (signal %d, exit status %d %s)", (p->p_name[0] != '\0' ? p->p_name : "initproc"), WTERMSIG(rv), - WEXITSTATUS(rv), ((p->p_csflags & CS_KILLED) ? "CS_KILLED" : "")); + if (p->p_exit_reason == OS_REASON_NULL) { + panic_plain(LAUNCHD_CRASHED_PREFIX " -- no exit reason available -- (signal %d, exit status %d %s)", + WTERMSIG(rv), WEXITSTATUS(rv), ((p->p_csflags & CS_KILLED) ? "CS_KILLED" : "")); + } else { + panic_plain(LAUNCHD_CRASHED_PREFIX " %s -- exit reason namespace %d subcode 0x%llx description: %." LAUNCHD_PANIC_REASON_STRING_MAXLEN "s", + ((p->p_csflags & CS_KILLED) ? "CS_KILLED" : ""), + p->p_exit_reason->osr_namespace, p->p_exit_reason->osr_code, + launchd_exit_reason_desc ? launchd_exit_reason_desc : "none"); + } +} + +static void +abort_with_payload_internal(proc_t p, uint32_t reason_namespace, uint64_t reason_code, user_addr_t payload, uint32_t payload_size, + user_addr_t reason_string, uint64_t reason_flags) +{ + os_reason_t exit_reason = OS_REASON_NULL; + + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) | DBG_FUNC_NONE, + p->p_pid, reason_namespace, + reason_code, 0, 0); + + exit_reason = build_userspace_exit_reason(reason_namespace, reason_code, payload, payload_size, reason_string, + reason_flags); + + /* + * We use SIGABRT (rather than calling exit directly from here) so that + * the debugger can catch abort_with_{reason,payload} calls. 
+ */ + psignal_try_thread_with_reason(p, current_thread(), SIGABRT, exit_reason); + + return; +} + +int +abort_with_payload(struct proc *cur_proc, struct abort_with_payload_args *args, + __unused void *retval) +{ + abort_with_payload_internal(cur_proc, args->reason_namespace, args->reason_code, args->payload, args->payload_size, + args->reason_string, args->reason_flags); + + return 0; } + /* * exit -- * Death of process. */ +__attribute__((noreturn)) void exit(proc_t p, struct exit_args *uap, int *retval) { @@ -427,6 +619,16 @@ exit1(proc_t p, int rv, int *retval) int exit1_internal(proc_t p, int rv, int *retval, boolean_t thread_can_terminate, boolean_t perf_notify, int jetsam_flags) +{ + return exit_with_reason(p, rv, retval, thread_can_terminate, perf_notify, jetsam_flags, OS_REASON_NULL); +} + +/* + * NOTE: exit_with_reason drops a reference on the passed exit_reason + */ +int +exit_with_reason(proc_t p, int rv, int *retval, boolean_t thread_can_terminate, boolean_t perf_notify, + int jetsam_flags, struct os_reason *exit_reason) { thread_t self = current_thread(); struct task *task = p->task; @@ -441,6 +643,7 @@ exit1_internal(proc_t p, int rv, int *retval, boolean_t thread_can_terminate, bo ut = get_bsdthread_info(self); if (ut->uu_flag & UT_VFORK) { + os_reason_free(exit_reason); if (!thread_can_terminate) { return EINVAL; } @@ -470,32 +673,38 @@ exit1_internal(proc_t p, int rv, int *retval, boolean_t thread_can_terminate, bo /* mark process is going to exit and pull out of DBG/disk throttle */ /* TODO: This should be done after becoming exit thread */ - proc_set_task_policy(p->task, THREAD_NULL, TASK_POLICY_ATTRIBUTE, + proc_set_task_policy(p->task, TASK_POLICY_ATTRIBUTE, TASK_POLICY_TERMINATED, TASK_POLICY_ENABLE); proc_lock(p); - error = proc_transstart(p, 1, (((jetsam_flags & P_JETSAM_MASK) == P_JETSAM_VNODE) ? 1 : 0)); + error = proc_transstart(p, 1, (jetsam_flags ? 
1 : 0)); if (error == EDEADLK) { - /* Temp: If deadlock error, then it implies multithreaded exec is - * in progress. Instread of letting exit continue and - * corrupting the freed memory, let the exit thread - * return. This will save corruption in remote case. + /* + * If proc_transstart() returns EDEADLK, then another thread + * is either exec'ing or exiting. Return an error and allow + * the other thread to continue. */ proc_unlock(p); + os_reason_free(exit_reason); if (current_proc() == p){ - if (p->exit_thread == self) + if (p->exit_thread == self) { printf("exit_thread failed to exit, leaving process %s[%d] in unkillable limbo\n", p->p_comm, p->p_pid); - thread_exception_return(); - } else { - /* external termination like jetsam */ - return(error); + } + + if (thread_can_terminate) { + thread_exception_return(); + } } + + return error; } while (p->exit_thread != self) { if (sig_try_locked(p) <= 0) { proc_transend(p, 1); + os_reason_free(exit_reason); + if (get_threadtask(self) != task) { proc_unlock(p); return(0); @@ -513,10 +722,15 @@ exit1_internal(proc_t p, int rv, int *retval, boolean_t thread_can_terminate, bo sig_lock_to_exit(p); } - if (p == initproc && current_proc() == p) { - init_task_died = TRUE; + if (exit_reason != OS_REASON_NULL) { + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_COMMIT) | DBG_FUNC_NONE, + p->p_pid, exit_reason->osr_namespace, + exit_reason->osr_code, 0, 0); } - + + assert(p->p_exit_reason == OS_REASON_NULL); + p->p_exit_reason = exit_reason; + p->p_lflag |= P_LEXIT; p->p_xstat = rv; p->p_lflag |= jetsam_flags; @@ -550,7 +764,9 @@ proc_prepareexit(proc_t p, int rv, boolean_t perf_notify) } /* If a core should be generated, notify crash reporter */ - if (hassigprop(WTERMSIG(rv), SA_CORE) || ((p->p_csflags & CS_KILLED) != 0)) { + if (hassigprop(WTERMSIG(rv), SA_CORE) || ((p->p_csflags & CS_KILLED) != 0) || + (p->p_exit_reason != OS_REASON_NULL && (p->p_exit_reason->osr_flags & + 
OS_REASON_FLAG_GENERATE_CRASH_REPORT))) { /* * Workaround for processes checking up on PT_DENY_ATTACH: * should be backed out post-Leopard (details in 5431025). @@ -597,9 +813,6 @@ proc_prepareexit(proc_t p, int rv, boolean_t perf_notify) printf("Process[%d] crashed: %s. Too many corpses being created.\n", p->p_pid, p->p_comm); } create_corpse = FALSE; - } else { - /* XXX: Need to sync ATM buffer before crash */ - kr = task_send_trace_memory(current_task(), p->p_pid, p->p_uniqueid); } } @@ -624,7 +837,27 @@ proc_prepareexit(proc_t p, int rv, boolean_t perf_notify) p->p_ru = rup; } if (create_corpse) { - populate_corpse_crashinfo(p, task_get_corpseinfo(current_task()), rup, code, subcode); + int est_knotes = 0, num_knotes = 0; + uint64_t *buffer = NULL; + int buf_size = 0; + + /* Get all the udata pointers from kqueue */ + est_knotes = proc_list_uptrs(p, NULL, 0); + if (est_knotes > 0) { + buf_size = (est_knotes + 32) * sizeof(uint64_t); + buffer = (uint64_t *) kalloc(buf_size); + num_knotes = proc_list_uptrs(p, buffer, buf_size); + if (num_knotes > est_knotes + 32) { + num_knotes = est_knotes + 32; + } + } + + /* Update the code, subcode based on exit reason */ + proc_update_corpse_exception_codes(p, &code, &subcode); + populate_corpse_crashinfo(p, task_get_corpseinfo(current_task()), rup, code, subcode, buffer, num_knotes); + if (buffer != NULL) { + kfree(buffer, buf_size); + } } /* * Remove proc from allproc queue and from pidhash chain. @@ -716,43 +949,11 @@ proc_exit(proc_t p) pid, exitval, 0, 0, 0); #if CONFIG_DTRACE - /* - * Free any outstanding lazy dof entries. It is imperative we - * always call dtrace_lazy_dofs_destroy, rather than null check - * and call if !NULL. If we NULL test, during lazy dof faulting - * we can race with the faulting code and proceed from here to - * beyond the helpers cleanup. The lazy dof faulting will then - * install new helpers which will never be cleaned up, and leak. 
- */ - dtrace_lazy_dofs_destroy(p); - - /* - * Clean up any DTrace helper actions or probes for the process. - */ - if (p->p_dtrace_helpers != NULL) { - (*dtrace_helpers_cleanup)(p); - } - - /* - * Clean up any DTrace probes associated with this process. - */ - /* - * APPLE NOTE: We release ptss pages/entries in dtrace_fasttrap_exit_ptr(), - * call this after dtrace_helpers_cleanup() - */ - proc_lock(p); - if (p->p_dtrace_probes && dtrace_fasttrap_exit_ptr) { - (*dtrace_fasttrap_exit_ptr)(p); - } - proc_unlock(p); + dtrace_proc_exit(p); #endif nspace_proc_exit(p); -#if VM_PRESSURE_EVENTS - vm_pressure_proc_cleanup(p); -#endif - /* * need to cancel async IO requests that can be cancelled and wait for those * already active. MAY BLOCK! @@ -978,7 +1179,7 @@ proc_exit(proc_t p) */ thread_resume(thread); clear_wait(thread, THREAD_INTERRUPTED); - threadsignal(thread, SIGKILL, 0); + threadsignal(thread, SIGKILL, 0, TRUE); } else { proc_unlock(q); } @@ -1033,21 +1234,7 @@ proc_exit(proc_t p) } } - proc_spinlock(p); - if (thread_call_cancel(p->p_rcall)) - p->p_ractive--; - - while (p->p_ractive > 0) { - proc_spinunlock(p); - - delay(1); - - proc_spinlock(p); - } - proc_spinunlock(p); - - thread_call_free(p->p_rcall); - p->p_rcall = NULL; + proc_free_realitimer(p); /* * Other substructures are freed from wait(). @@ -1328,6 +1515,8 @@ reap_child_locked(proc_t parent, proc_t child, int deadparent, int reparentedtoi #endif (void)chgproccnt(kauth_cred_getruid(child->p_ucred), -1); + os_reason_free(child->p_exit_reason); + /* * Free up credentials. 
*/ @@ -1386,7 +1575,6 @@ reap_child_locked(proc_t parent, proc_t child, int deadparent, int reparentedtoi #endif lck_spin_destroy(&child->p_slock, proc_lck_grp); #endif /* CONFIG_FINE_LOCK_GROUPS */ - workqueue_destroy_lock(child); FREE_ZONE(child, sizeof *child, M_PROC); if ((locked == 1) && (droplock == 0)) @@ -1447,7 +1635,7 @@ wait4_nocancel(proc_t q, struct wait4_nocancel_args *uap, int32_t *retval) nfound = 0; sibling_count = 0; - for (p = q->p_children.lh_first; p != 0; p = p->p_sibling.le_next) { + PCHILDREN_FOREACH(q, p) { if ( p->p_sibling.le_next != 0 ) sibling_count++; if (uap->pid != WAIT_ANY && @@ -1690,8 +1878,8 @@ waitid_nocancel(proc_t q, struct waitid_nocancel_args *uap, proc_list_lock(); loop1: nfound = 0; - for (p = q->p_children.lh_first; p != 0; p = p->p_sibling.le_next) { + PCHILDREN_FOREACH(q, p) { switch (uap->idtype) { case P_PID: /* child with process ID equal to... */ if (p->p_pid != (pid_t)uap->id) @@ -1997,21 +2185,7 @@ vfork_exit_internal(proc_t p, int rv, int forceexit) p->p_sigignore = ~0; proc_unlock(p); - proc_spinlock(p); - if (thread_call_cancel(p->p_rcall)) - p->p_ractive--; - - while (p->p_ractive > 0) { - proc_spinunlock(p); - - delay(1); - - proc_spinlock(p); - } - proc_spinunlock(p); - - thread_call_free(p->p_rcall); - p->p_rcall = NULL; + proc_free_realitimer(p); ut->uu_siglist = 0; @@ -2201,7 +2375,7 @@ vproc_exit(proc_t p) */ thread_resume(thread); clear_wait(thread, THREAD_INTERRUPTED); - threadsignal(thread, SIGKILL, 0); + threadsignal(thread, SIGKILL, 0, TRUE); } else { proc_unlock(q); } diff --git a/bsd/kern/kern_fork.c b/bsd/kern/kern_fork.c index 284752296..78e50cc5e 100644 --- a/bsd/kern/kern_fork.c +++ b/bsd/kern/kern_fork.c @@ -87,6 +87,7 @@ #include #include #include +#include #include #include #include @@ -96,12 +97,11 @@ #if CONFIG_PERSONAS #include #endif +#include #if CONFIG_DTRACE /* Do not include dtrace.h, it redefines kmem_[alloc/free] */ -extern void dtrace_fasttrap_fork(proc_t, proc_t); -extern void 
(*dtrace_helpers_fork)(proc_t, proc_t); extern void (*dtrace_proc_waitfor_exec_ptr)(proc_t); -extern void dtrace_lazy_dofs_duplicate(proc_t, proc_t); +extern void dtrace_proc_fork(proc_t, proc_t, int); /* * Since dtrace_proc_waitfor_exec_ptr can be added/removed in dtrace_subr.c, @@ -617,60 +617,8 @@ fork1(proc_t parent_proc, thread_t *child_threadp, int kind, coalition_t *coalit child_proc->p_acflag = AFORK; /* forked but not exec'ed */ -// dtrace code cleanup needed #if CONFIG_DTRACE - /* - * This code applies to new processes who are copying the task - * and thread state and address spaces of their parent process. - */ - if (!spawn) { -// call dtrace specific function here instead of all this... - /* - * APPLE NOTE: Solaris does a sprlock() and drops the - * proc_lock here. We're cheating a bit and only taking - * the p_dtrace_sprlock lock. A full sprlock would - * task_suspend the parent. - */ - lck_mtx_lock(&parent_proc->p_dtrace_sprlock); - - /* - * Remove all DTrace tracepoints from the child process. We - * need to do this _before_ duplicating USDT providers since - * any associated probes may be immediately enabled. - */ - if (parent_proc->p_dtrace_count > 0) { - dtrace_fasttrap_fork(parent_proc, child_proc); - } - - lck_mtx_unlock(&parent_proc->p_dtrace_sprlock); - - /* - * Duplicate any lazy dof(s). This must be done while NOT - * holding the parent sprlock! Lock ordering is - * dtrace_dof_mode_lock, then sprlock. It is imperative we - * always call dtrace_lazy_dofs_duplicate, rather than null - * check and call if !NULL. If we NULL test, during lazy dof - * faulting we can race with the faulting code and proceed - * from here to beyond the helpers copy. The lazy dof - * faulting will then fail to copy the helpers to the child - * process. - */ - dtrace_lazy_dofs_duplicate(parent_proc, child_proc); - - /* - * Duplicate any helper actions and providers. 
The SFORKING - * we set above informs the code to enable USDT probes that - * sprlock() may fail because the child is being forked. - */ - /* - * APPLE NOTE: As best I can tell, Apple's sprlock() equivalent - * never fails to find the child. We do not set SFORKING. - */ - if (parent_proc->p_dtrace_helpers != NULL && dtrace_helpers_fork) { - (*dtrace_helpers_fork)(parent_proc, child_proc); - } - - } + dtrace_proc_fork(parent_proc, child_proc, spawn); #endif /* CONFIG_DTRACE */ if (!spawn) { /* @@ -805,6 +753,7 @@ fork_create_child(task_t parent_task, coalition_t *parent_coalitions, proc_t chi parent_coalitions, inherit_memory, is64bit, + TF_NONE, &child_task); if (result != KERN_SUCCESS) { printf("%s: task_create_internal failed. Code: %d\n", @@ -990,7 +939,7 @@ cloneproc(task_t parent_task, coalition_t *parent_coalitions, proc_t parent_proc goto bad; } - child_thread = fork_create_child(parent_task, parent_coalitions, child_proc, inherit_memory, (parent_task == TASK_NULL) ? FALSE : (parent_proc->p_flag & P_LP64)); + child_thread = fork_create_child(parent_task, parent_coalitions, child_proc, inherit_memory, parent_proc->p_flag & P_LP64); if (child_thread == NULL) { /* @@ -1055,8 +1004,6 @@ forkproc_free(proc_t p) pth_proc_hashdelete(p); #endif /* PSYNCH */ - workqueue_destroy_lock(p); - /* We held signal and a transition locks; drop them */ proc_signalend(p, 0); proc_transend(p, 0); @@ -1381,12 +1328,6 @@ forkproc(proc_t parent_proc) /* Inherit the parent flags for code sign */ child_proc->p_csflags = (parent_proc->p_csflags & ~CS_KILLED); - /* - * All processes have work queue locks; cleaned up by - * reap_child_locked() - */ - workqueue_init_lock(child_proc); - /* * Copy work queue information * @@ -1401,7 +1342,6 @@ forkproc(proc_t parent_proc) child_proc->p_wqthread = parent_proc->p_wqthread; child_proc->p_threadstart = parent_proc->p_threadstart; child_proc->p_pthsize = parent_proc->p_pthsize; - child_proc->p_targconc = parent_proc->p_targconc; if 
((parent_proc->p_lflag & P_LREGISTER) != 0) { child_proc->p_lflag |= P_LREGISTER; } @@ -1424,11 +1364,16 @@ forkproc(proc_t parent_proc) #endif #if CONFIG_MEMORYSTATUS - /* Memorystatus + jetsam init */ + /* Memorystatus init */ child_proc->p_memstat_state = 0; child_proc->p_memstat_effectivepriority = JETSAM_PRIORITY_DEFAULT; child_proc->p_memstat_requestedpriority = JETSAM_PRIORITY_DEFAULT; - child_proc->p_memstat_userdata = 0; + child_proc->p_memstat_userdata = 0; + child_proc->p_memstat_idle_start = 0; + child_proc->p_memstat_idle_delta = 0; + child_proc->p_memstat_memlimit = 0; + child_proc->p_memstat_memlimit_active = 0; + child_proc->p_memstat_memlimit_inactive = 0; #if CONFIG_FREEZE child_proc->p_memstat_suspendedfootprint = 0; #endif @@ -1491,19 +1436,25 @@ proc_ucred_unlock(proc_t p) #include -struct zone *uthread_zone; -static int uthread_zone_inited = 0; +struct zone *uthread_zone = NULL; + +static lck_grp_t *rethrottle_lock_grp; +static lck_attr_t *rethrottle_lock_attr; +static lck_grp_attr_t *rethrottle_lock_grp_attr; static void uthread_zone_init(void) { - if (!uthread_zone_inited) { - uthread_zone = zinit(sizeof(struct uthread), - thread_max * sizeof(struct uthread), - THREAD_CHUNK * sizeof(struct uthread), - "uthreads"); - uthread_zone_inited = 1; - } + assert(uthread_zone == NULL); + + rethrottle_lock_grp_attr = lck_grp_attr_alloc_init(); + rethrottle_lock_grp = lck_grp_alloc_init("rethrottle", rethrottle_lock_grp_attr); + rethrottle_lock_attr = lck_attr_alloc_init(); + + uthread_zone = zinit(sizeof(struct uthread), + thread_max * sizeof(struct uthread), + THREAD_CHUNK * sizeof(struct uthread), + "uthreads"); } void * @@ -1514,7 +1465,7 @@ uthread_alloc(task_t task, thread_t thread, int noinherit) uthread_t uth_parent; void *ut; - if (!uthread_zone_inited) + if (uthread_zone == NULL) uthread_zone_init(); ut = (void *)zalloc(uthread_zone); @@ -1524,6 +1475,9 @@ uthread_alloc(task_t task, thread_t thread, int noinherit) uth = (uthread_t)ut; 
uth->uu_thread = thread; + lck_spin_init(&uth->uu_rethrottle_lock, rethrottle_lock_grp, + rethrottle_lock_attr); + /* * Thread inherits credential from the creating thread, if both * are in the same task. @@ -1583,9 +1537,7 @@ uthread_alloc(task_t task, thread_t thread, int noinherit) /* * This routine frees the thread name field of the uthread_t structure. Split out of - * uthread_cleanup() so it can be called separately on the threads of a corpse after - * the corpse notification has been sent, and the handler has had a chance to extract - * the thread names. + * uthread_cleanup() so thread name does not get deallocated while generating a corpse fork. */ void uthread_cleanup_name(void *uthread) @@ -1614,7 +1566,7 @@ uthread_cleanup_name(void *uthread) * It does not free the uthread structure as well */ void -uthread_cleanup(task_t task, void *uthread, void * bsd_info, boolean_t is_corpse) +uthread_cleanup(task_t task, void *uthread, void * bsd_info) { struct _select *sel; uthread_t uth = (uthread_t)uthread; @@ -1647,6 +1599,15 @@ uthread_cleanup(task_t task, void *uthread, void * bsd_info, boolean_t is_corpse */ assert(uth->uu_ar == NULL); + if (uth->uu_kqueue_bound) { + kevent_qos_internal_unbind(p, + uth->uu_kqueue_bound, + uth->uu_thread, + uth->uu_kqueue_flags); + uth->uu_kqueue_flags = 0; + uth->uu_kqueue_bound = 0; + } + sel = &uth->uu_select; /* cleanup the select bit space */ if (sel->nbytes) { @@ -1668,13 +1629,7 @@ uthread_cleanup(task_t task, void *uthread, void * bsd_info, boolean_t is_corpse uth->uu_wqstate_sz = 0; } - /* - * defer the removal of the thread name on process corpses until the corpse has - * been autopsied. 
- */ - if (!is_corpse) { - uthread_cleanup_name(uth); - } + os_reason_free(uth->uu_exit_reason); if ((task != kernel_task) && p) { @@ -1726,6 +1681,9 @@ uthread_zone_free(void *uthread) uth->t_tombstone = NULL; } + lck_spin_destroy(&uth->uu_rethrottle_lock, rethrottle_lock_grp); + + uthread_cleanup_name(uthread); /* and free the uthread itself */ zfree(uthread_zone, uthread); } diff --git a/bsd/kern/kern_guarded.c b/bsd/kern/kern_guarded.c index dad131eb4..a24987fc0 100644 --- a/bsd/kern/kern_guarded.c +++ b/bsd/kern/kern_guarded.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015 Apple Inc. All rights reserved. + * Copyright (c) 2015-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -45,9 +45,6 @@ #include #include #include -#if CONFIG_PROTECT -#include -#endif #define f_flag f_fglob->fg_flag @@ -959,3 +956,28 @@ guarded_writev_np(struct proc *p, struct guarded_writev_np_args *uap, user_ssize } return (error); } + +/* + * int falloc_guarded(struct proc *p, struct fileproc **fp, int *fd, + * vfs_context_t ctx, const guardid_t *guard, u_int attrs); + * + * This SPI is the guarded variant of falloc(). It borrows the same + * restrictions as those used by the rest of the guarded_* routines. 
+ */ +int +falloc_guarded(struct proc *p, struct fileproc **fp, int *fd, + vfs_context_t ctx, const guardid_t *guard, u_int attrs) +{ + struct gfp_crarg crarg; + + if (((attrs & GUARD_REQUIRED) != GUARD_REQUIRED) || + ((attrs & ~GUARD_ALL) != 0) || (*guard == 0)) + return (EINVAL); + + bzero(&crarg, sizeof (crarg)); + crarg.gca_guard = *guard; + crarg.gca_attrs = attrs; + + return (falloc_withalloc(p, fp, fd, ctx, guarded_fileproc_alloc_init, + &crarg)); +} diff --git a/bsd/kern/kern_kpc.c b/bsd/kern/kern_kpc.c index 3e3443fc3..38bc2abbd 100644 --- a/bsd/kern/kern_kpc.c +++ b/bsd/kern/kern_kpc.c @@ -36,6 +36,7 @@ #include #include +#include #include #include @@ -415,6 +416,8 @@ kpc_sysctl SYSCTL_HANDLER_ARGS if( !kpc_initted ) panic("kpc_init not called"); + lck_mtx_lock(ktrace_lock); + // Most sysctls require an access check, but a few are public. switch( (uintptr_t) arg1 ) { case REQ_CLASSES: @@ -426,13 +429,15 @@ kpc_sysctl SYSCTL_HANDLER_ARGS default: // Require kperf access to read or write anything else. // This is either root or the blessed pid. - ret = kperf_access_check(); - if (ret) { + if ((ret = ktrace_read_check())) { + lck_mtx_unlock(ktrace_lock); return ret; } break; } + lck_mtx_unlock(ktrace_lock); + lck_mtx_lock(&sysctl_buffer_lock); /* which request */ diff --git a/bsd/kern/kern_ktrace.c b/bsd/kern/kern_ktrace.c new file mode 100644 index 000000000..4b6546ea9 --- /dev/null +++ b/bsd/kern/kern_ktrace.c @@ -0,0 +1,519 @@ +/* + * Copyright (c) 2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * This file manages the ownership of ktrace and its subsystems, like kdebug + * and kperf, as well as the overall state of the system, whether it is in + * foreground or background mode. + * + * When unconfigured or in background mode, any root process can take ownership + * of ktrace and configure it, changing the state to foreground and, in the case + * of a transition out of background, resetting the background configuration. + * + * When in foreground mode, if the owning process is still running, only it may + * configure ktrace. If it exits, ktrace keeps running but any root process can + * change the configuration. When ktrace is reset, the state changes back to + * unconfigured and a notification is sent on the ktrace_background host special + * port. + * + * If a process has set itself as the background tool, using the init_background + * sysctl, it can configure ktrace only when ktrace is off or already in + * background mode. 
The first attempt to configure ktrace by the background pid + * when it is off results in the transition to background mode. + */ + +#include + +#include +#include +#include + +#include +#include +#include +char *proc_name_address(void *p); +#include +#include + +#include +#include + +#include +#include + +#include + +kern_return_t ktrace_background_available_notify_user(void); + +lck_mtx_t *ktrace_lock; + +/* + * The overall state of ktrace, whether it is unconfigured, in foreground mode, + * or in background mode. The state determines which processes can configure + * ktrace. + */ +static enum ktrace_state ktrace_state = KTRACE_STATE_OFF; + +/* The true owner of ktrace, checked by ktrace_access_check(). */ +static uint64_t ktrace_owning_unique_id = 0; +static pid_t ktrace_owning_pid = 0; + +/* + * The background pid of ktrace, automatically made the owner when + * transitioning to background mode. + */ +static uint64_t ktrace_bg_unique_id = 0; +static pid_t ktrace_bg_pid = 0; + +/* The name of the last process to configure ktrace. */ +static char ktrace_last_owner_execname[MAXCOMLEN + 1] = { 0 }; + +/* + * Which subsystems of ktrace (currently kdebug and kperf) are active. + */ +static uint32_t ktrace_active_mask = 0; + +/* + * At boot or when a daemon has been newly loaded, it's necessary to bootstrap + * user space background tools by sending a background available notification + * when the init_background sysctl is made. + * + * Background tools must be RunAtLoad daemons. + */ +static boolean_t should_notify_on_init = TRUE; + +/* Set the owning process of ktrace. */ +static void ktrace_set_owning_proc(proc_t p); + +/* Reset ktrace ownership back to unowned. */ +static void ktrace_release_ownership(void); + +/* Make the background tool the owner of ktrace. */ +static void ktrace_promote_background(void); + +/* + * If user space sets a pid manually (through kperf "blessing"), ktrace should + * not treat resets as releasing ownership. 
At that point, ownership is only + * released when the owner is set to an invalid pid. + * + * This is managed by the user space-oriented function ktrace_set_owning_pid + * and ktrace_unset_owning_pid. + */ +boolean_t ktrace_keep_ownership_on_reset = FALSE; + +/* Allow user space to unset the owning pid and potentially reset ktrace. */ +static void ktrace_set_invalid_owning_pid(void); + +/* + * This flag allows any root process to set a new ktrace owner. It is + * currently used by Instruments. + */ +int ktrace_root_set_owner_allowed = 0; + +void +ktrace_reset(uint32_t reset_mask) +{ + lck_mtx_assert(ktrace_lock, LCK_MTX_ASSERT_OWNED); + assert(reset_mask != 0); + + if (ktrace_active_mask == 0) { + if (!ktrace_keep_ownership_on_reset) { + assert(ktrace_state == KTRACE_STATE_OFF); + } + return; + } + + if (!ktrace_keep_ownership_on_reset) { + ktrace_active_mask &= ~reset_mask; + } + + if (reset_mask & KTRACE_KPERF) { + kperf_reset(); + } + if (reset_mask & KTRACE_KDEBUG) { + kdebug_reset(); + } + + if (ktrace_active_mask == 0) { + if (ktrace_state == KTRACE_STATE_FG) { + /* transition from foreground to background */ + ktrace_promote_background(); + } else if (ktrace_state == KTRACE_STATE_BG) { + /* background tool is resetting ktrace */ + should_notify_on_init = TRUE; + ktrace_release_ownership(); + ktrace_state = KTRACE_STATE_OFF; + } + } +} + +static void +ktrace_promote_background(void) +{ + lck_mtx_assert(ktrace_lock, LCK_MTX_ASSERT_OWNED); + assert(ktrace_state != KTRACE_STATE_BG); + + /* + * Remember to send a background available notification on the next init + * if the notification failed (meaning no task holds the receive right + * for the host special port). 
+	 */
+	if (ktrace_background_available_notify_user() == KERN_FAILURE) {
+		should_notify_on_init = TRUE;
+	} else {
+		should_notify_on_init = FALSE;
+	}
+
+	ktrace_release_ownership();
+	ktrace_state = KTRACE_STATE_OFF;
+}
+/*
+ * Report whether ktrace is currently in the background state
+ * (ktrace_state == KTRACE_STATE_BG).
+ *
+ * Asserts that the caller holds ktrace_lock.
+ */
+bool
+ktrace_background_active(void)
+{
+	lck_mtx_assert(ktrace_lock, LCK_MTX_ASSERT_OWNED);
+	return (ktrace_state == KTRACE_STATE_BG);
+}
+/*
+ * Check whether the calling process may read ktrace data.
+ *
+ * Returns 0 when the caller's unique id matches ktrace_owning_unique_id,
+ * or when the caller's credential passes kauth_cred_issuser(); returns
+ * EPERM otherwise.
+ *
+ * Asserts that the caller holds ktrace_lock.
+ */
+int
+ktrace_read_check(void)
+{
+	lck_mtx_assert(ktrace_lock, LCK_MTX_ASSERT_OWNED);
+
+	if (proc_uniqueid(current_proc()) == ktrace_owning_unique_id)
+	{
+		return 0;
+	}
+
+	return kauth_cred_issuser(kauth_cred_get()) ? 0 : EPERM;
+}
+
+/*
+ * If an owning process has exited, reset the ownership.
+ *
+ * The owner is looked up by ktrace_owning_pid.  Ownership is released
+ * when no such process exists, or when the process found has a unique id
+ * different from ktrace_owning_unique_id.  The reference taken by
+ * proc_find() is dropped with proc_rele() before returning.
+ *
+ * Asserts that the caller holds ktrace_lock.
+ */
+static void
+ktrace_ownership_maintenance(void)
+{
+	lck_mtx_assert(ktrace_lock, LCK_MTX_ASSERT_OWNED);
+
+	/* do nothing if ktrace is not owned */
+	if (ktrace_owning_unique_id == 0) {
+		return;
+	}
+
+	/* reset ownership if process cannot be found */
+
+	proc_t owning_proc = proc_find(ktrace_owning_pid);
+
+	if (owning_proc != NULL) {
+		/* make sure the pid was not recycled */
+		if (proc_uniqueid(owning_proc) != ktrace_owning_unique_id) {
+			ktrace_release_ownership();
+		}
+
+		proc_rele(owning_proc);
+	} else {
+		ktrace_release_ownership();
+	}
+}
+/*
+ * Ask for the calling process to configure the subsystems in config_mask.
+ *
+ * config_mask must be non-zero (asserted).
+ *
+ * Returns:
+ *   0      the request is allowed; config_mask has been OR-ed into
+ *          ktrace_active_mask (and, if the caller was not already the
+ *          owner, it has been made the owner)
+ *   EBUSY  the caller is the background tool while ktrace_state is
+ *          KTRACE_STATE_FG, or ktrace is owned by a different process
+ *   EPERM  ktrace is unowned or in the background state, but the
+ *          caller's credential does not pass kauth_cred_issuser()
+ *
+ * Asserts that the caller holds ktrace_lock.
+ */
+int
+ktrace_configure(uint32_t config_mask)
+{
+	lck_mtx_assert(ktrace_lock, LCK_MTX_ASSERT_OWNED);
+	assert(config_mask != 0);
+
+	proc_t p = current_proc();
+
+	/* if process clearly owns ktrace, allow */
+	if (proc_uniqueid(p) == ktrace_owning_unique_id) {
+		ktrace_active_mask |= config_mask;
+		return 0;
+	}
+
+	/* background configure while foreground is active is not allowed */
+	if (proc_uniqueid(p) == ktrace_bg_unique_id &&
+	    ktrace_state == KTRACE_STATE_FG)
+	{
+		return EBUSY;
+	}
+
+	/* drop a stale owner (exited or recycled pid) before deciding */
+	ktrace_ownership_maintenance();
+
+	/* allow process to gain control when unowned or background */
+	if (ktrace_owning_unique_id == 0 || ktrace_state == KTRACE_STATE_BG) {
+		if (!kauth_cred_issuser(kauth_cred_get())) {
+			return EPERM;
+		}
+
+		ktrace_set_owning_proc(p);
+		ktrace_active_mask |= config_mask;
+		return 0;
+	}
+
+	/* owned by an existing, different process */
+	return EBUSY;
+}
+/*
+ * Disable tracing, but only when ktrace is in the given state.
+ *
+ * When ktrace_state equals state_to_match, calls kernel_debug_disable()
+ * and kperf_sampling_disable(); otherwise does nothing.
+ *
+ * NOTE(review): unlike its neighbours this function does not assert
+ * ktrace_lock -- confirm whether callers are expected to hold it.
+ */
+void
+ktrace_disable(enum ktrace_state state_to_match)
+{
+	if (ktrace_state == state_to_match) {
+		kernel_debug_disable();
+		kperf_sampling_disable();
+	}
+}
+/*
+ * Return the pid recorded as the ktrace owner.
+ *
+ * Ownership maintenance runs first, so a stale owner is released and the
+ * value returned is then 0 (the value ktrace_release_ownership() stores).
+ *
+ * Asserts that the caller holds ktrace_lock.
+ */
+int
+ktrace_get_owning_pid(void)
+{
+	lck_mtx_assert(ktrace_lock, LCK_MTX_ASSERT_OWNED);
+
+	ktrace_ownership_maintenance();
+	return ktrace_owning_pid;
+}
+/*
+ * Configure ktrace on behalf of the kernel itself.
+ *
+ * If ktrace is not off, the subsystems flagged in the current
+ * ktrace_active_mask (KTRACE_KPERF, KTRACE_KDEBUG) are reset first.  The
+ * active mask is then replaced by config_mask, the state becomes
+ * KTRACE_STATE_FG, process ownership is released and "kernel_task" is
+ * recorded in ktrace_last_owner_execname.
+ *
+ * Asserts that the caller holds ktrace_lock.
+ */
+void
+ktrace_kernel_configure(uint32_t config_mask)
+{
+	lck_mtx_assert(ktrace_lock, LCK_MTX_ASSERT_OWNED);
+
+	if (ktrace_state != KTRACE_STATE_OFF) {
+		if (ktrace_active_mask & KTRACE_KPERF) {
+			kperf_reset();
+		}
+		if (ktrace_active_mask & KTRACE_KDEBUG) {
+			kdebug_reset();
+		}
+	}
+
+	ktrace_active_mask = config_mask;
+	ktrace_state = KTRACE_STATE_FG;
+
+	ktrace_release_ownership();
+	strlcpy(ktrace_last_owner_execname, "kernel_task",
+	        sizeof(ktrace_last_owner_execname));
+}
+/*
+ * Register the calling process as the background ktrace tool.
+ *
+ * Returns:
+ *   0       ktrace_bg_unique_id / ktrace_bg_pid now describe the caller;
+ *           if ktrace_state is KTRACE_STATE_BG the caller has also been
+ *           made the owner
+ *   EINVAL  a pending "background available" notification could not be
+ *           sent; the caller is not registered
+ *   other   the error returned by priv_check_cred() for
+ *           PRIV_KTRACE_BACKGROUND
+ *
+ * Asserts that the caller holds ktrace_lock.
+ */
+static errno_t
+ktrace_init_background(void)
+{
+	int err = 0;
+
+	lck_mtx_assert(ktrace_lock, LCK_MTX_ASSERT_OWNED);
+
+	if ((err = priv_check_cred(kauth_cred_get(), PRIV_KTRACE_BACKGROUND, 0))) {
+		return err;
+	}
+
+	/*
+	 * When a background tool first checks in, send a notification if ktrace
+	 * is available.
+	 */
+	if (should_notify_on_init) {
+		if (ktrace_state == KTRACE_STATE_OFF) {
+			/*
+			 * This notification can only fail if a process does not
+			 * hold the receive right for the host special port.
+			 * Return an error and don't make the current process
+			 * the background tool.
+			 */
+			if (ktrace_background_available_notify_user() == KERN_FAILURE) {
+				return EINVAL;
+			}
+		}
+		should_notify_on_init = FALSE;
+	}
+
+	proc_t p = current_proc();
+
+	ktrace_bg_unique_id = proc_uniqueid(p);
+	ktrace_bg_pid = proc_pid(p);
+
+	if (ktrace_state == KTRACE_STATE_BG) {
+		ktrace_set_owning_proc(p);
+	}
+
+	return 0;
+}
+/*
+ * Handle a request that did not name a usable owning process.
+ *
+ * If ktrace_keep_ownership_on_reset is set, calls
+ * ktrace_reset(ktrace_active_mask) and clears the flag; otherwise does
+ * nothing.
+ */
+void
+ktrace_set_invalid_owning_pid(void)
+{
+	if (ktrace_keep_ownership_on_reset) {
+		ktrace_reset(ktrace_active_mask);
+		ktrace_keep_ownership_on_reset = FALSE;
+	}
+}
+/*
+ * Make the process identified by pid the owner of ktrace.
+ *
+ * Returns:
+ *   0       pid named an existing process, which is now the owner (and
+ *           ktrace_keep_ownership_on_reset is set); also returned for
+ *           pid == -1, which only runs the invalid-pid handling
+ *   EINVAL  pid == 0
+ *   ESRCH   proc_find() found no process for pid
+ *
+ * Every non-owning path runs ktrace_set_invalid_owning_pid() first.
+ *
+ * Asserts that the caller holds ktrace_lock.
+ */
+int
+ktrace_set_owning_pid(int pid)
+{
+	lck_mtx_assert(ktrace_lock, LCK_MTX_ASSERT_OWNED);
+
+	/* allow user space to successfully unset owning pid */
+	if (pid == -1) {
+		ktrace_set_invalid_owning_pid();
+		return 0;
+	}
+
+	/* use ktrace_reset or ktrace_release_ownership, not this */
+	if (pid == 0) {
+		ktrace_set_invalid_owning_pid();
+		return EINVAL;
+	}
+
+	proc_t p = proc_find(pid);
+	if (!p) {
+		ktrace_set_invalid_owning_pid();
+		return ESRCH;
+	}
+
+	ktrace_keep_ownership_on_reset = TRUE;
+	ktrace_set_owning_proc(p);
+
+	/* drop the reference taken by proc_find() */
+	proc_rele(p);
+	return 0;
+}
+/*
+ * Record p as the owner of ktrace and update ktrace_state to match.
+ *
+ * When the state is not already KTRACE_STATE_FG:
+ *   - if p is the registered background tool (unique id equals
+ *     ktrace_bg_unique_id) the state becomes KTRACE_STATE_BG;
+ *   - otherwise the state becomes KTRACE_STATE_FG and
+ *     should_notify_on_init is cleared; if the state was
+ *     KTRACE_STATE_BG, the subsystems flagged in ktrace_active_mask are
+ *     reset and the mask is zeroed first.
+ *
+ * In every case the owner's unique id, pid and name are stored.
+ *
+ * p must be non-NULL (asserted).  Asserts that the caller holds
+ * ktrace_lock.
+ */
+static void
+ktrace_set_owning_proc(proc_t p)
+{
+	lck_mtx_assert(ktrace_lock, LCK_MTX_ASSERT_OWNED);
+	assert(p);
+
+	if (ktrace_state != KTRACE_STATE_FG) {
+		if (proc_uniqueid(p) == ktrace_bg_unique_id) {
+			ktrace_state = KTRACE_STATE_BG;
+		} else {
+			if (ktrace_state == KTRACE_STATE_BG) {
+				if (ktrace_active_mask & KTRACE_KPERF) {
+					kperf_reset();
+				}
+				if (ktrace_active_mask & KTRACE_KDEBUG) {
+					kdebug_reset();
+				}
+
+				ktrace_active_mask = 0;
+			}
+			ktrace_state = KTRACE_STATE_FG;
+			should_notify_on_init = FALSE;
+		}
+	}
+
+	ktrace_owning_unique_id = proc_uniqueid(p);
+	ktrace_owning_pid = proc_pid(p);
+	strlcpy(ktrace_last_owner_execname, proc_name_address(p),
+	        sizeof(ktrace_last_owner_execname));
+}
+/*
+ * Mark ktrace as unowned by zeroing the stored owner unique id and pid.
+ * ktrace_state and ktrace_active_mask are left untouched.
+ *
+ * Asserts that the caller holds ktrace_lock.
+ */
+static void
+ktrace_release_ownership(void)
+{
+	lck_mtx_assert(ktrace_lock, LCK_MTX_ASSERT_OWNED);
+
+	ktrace_owning_unique_id = 0;
+	ktrace_owning_pid = 0;
+}
+/*
+ * Operation selector handed to ktrace_sysctl() through arg1 by the
+ * ktrace.init_background sysctl registration.
+ */
+#define SYSCTL_INIT_BACKGROUND (1)
+
+static int ktrace_sysctl SYSCTL_HANDLER_ARGS; + +SYSCTL_NODE(, OID_AUTO, ktrace, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "ktrace"); + +SYSCTL_UINT(_ktrace, OID_AUTO, state, CTLFLAG_RD | CTLFLAG_LOCKED, + &ktrace_state, 0, + ""); + +SYSCTL_INT(_ktrace, OID_AUTO, owning_pid, CTLFLAG_RD | CTLFLAG_LOCKED, + &ktrace_owning_pid, 0, + "pid of the process that owns ktrace"); + +SYSCTL_INT(_ktrace, OID_AUTO, background_pid, CTLFLAG_RD | CTLFLAG_LOCKED, + &ktrace_bg_pid, 0, + "pid of the background ktrace tool"); + +SYSCTL_STRING(_ktrace, OID_AUTO, configured_by, CTLFLAG_RD | CTLFLAG_LOCKED, + ktrace_last_owner_execname, 0, + "execname of process that last configured ktrace"); + +SYSCTL_PROC(_ktrace, OID_AUTO, init_background, CTLFLAG_RW | CTLFLAG_LOCKED, + (void *)SYSCTL_INIT_BACKGROUND, sizeof(int), + ktrace_sysctl, "I", "initialize calling process as background"); + +static int +ktrace_sysctl SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg2) + int ret = 0; + uintptr_t type = (uintptr_t)arg1; + + lck_mtx_lock(ktrace_lock); + + if (!kauth_cred_issuser(kauth_cred_get())) { + ret = EPERM; + goto out; + } + + if (type == SYSCTL_INIT_BACKGROUND) { + if (req->newptr != USER_ADDR_NULL) { + ret = ktrace_init_background(); + goto out; + } else { + ret = EINVAL; + goto out; + } + } else { + ret = EINVAL; + goto out; + } + +out: + lck_mtx_unlock(ktrace_lock); + return ret; +} + +/* This should only be called from the bootstrap thread. 
 */
+void
+ktrace_init(void)
+{	/*
+	 * One-time creation of ktrace_lock.  The three statics persist
+	 * across calls; 'initialized' is the guard that makes repeat calls
+	 * return early.  The flag itself is read and written without any
+	 * locking here.
+	 */
+	static lck_grp_attr_t *lock_grp_attr = NULL;
+	static lck_grp_t *lock_grp = NULL;
+	static boolean_t initialized = FALSE;
+	/* nothing to do once the lock has been set up */
+	if (initialized) {
+		return;
+	}
+	/*
+	 * Build the "ktrace" lock group.  The attribute object is used only
+	 * for this call and is freed immediately afterwards.
+	 */
+	lock_grp_attr = lck_grp_attr_alloc_init();
+	lock_grp = lck_grp_alloc_init("ktrace", lock_grp_attr);
+	lck_grp_attr_free(lock_grp_attr);
+	/* allocate the mutex in that group, with default (NULL) attributes */
+	ktrace_lock = lck_mtx_alloc_init(lock_grp, LCK_ATTR_NULL);
+	assert(ktrace_lock);
+	initialized = TRUE;
+}
 diff --git a/bsd/kern/kern_lockf.c b/bsd/kern/kern_lockf.c index 46c4f2e77..0a3165cf2 100644 --- a/bsd/kern/kern_lockf.c +++ b/bsd/kern/kern_lockf.c @@ -76,7 +76,7 @@ #include #include #include -#include +#include #include @@ -1432,7 +1432,7 @@ lf_printlist(const char *tag, struct lockf *lock) static void lf_hold_assertion(task_t block_task, struct lockf *block) { - if (task_importance_hold_file_lock_assertion(block_task, 1)) { + if (task_importance_hold_file_lock_assertion(block_task, 1) == 0) { block->lf_boosted = LF_BOOSTED; LOCKF_DEBUG(LF_DBG_IMPINH, "lf: importance hold file lock assert on pid %d lock %p\n", diff --git a/bsd/kern/kern_malloc.c b/bsd/kern/kern_malloc.c index d1adaabec..bf2771a12 100644 --- a/bsd/kern/kern_malloc.c +++ b/bsd/kern/kern_malloc.c @@ -97,8 +97,7 @@ #include #include #include - -#include +#include #include @@ -107,8 +106,6 @@ #include #include -#include - #include #include @@ -120,10 +117,10 @@ void kmeminit(void); * Must be in synch with the #defines is sys/malloc.h * NOTE - the reason we pass null strings in some cases is to reduce of foot * print as much as possible for systems where a tiny kernel is needed. - * todo - We should probably redsign this and use enums for our types and only + * todo - We should probably redesign this and use enums for our types and only * include types needed for that configuration of the kernel. 
This can't be * done without some kind of kpi since several types are hardwired and exported - * (for example see types M_HFSMNT, M_UDFMNT, M_TEMP, etc in sys/malloc.h) + * (for example see types M_UDFMNT, M_TEMP, etc in sys/malloc.h) */ const char *memname[] = { "free", /* 0 M_FREE */ @@ -227,17 +224,11 @@ const char *memname[] = { "buf hdrs", /* 72 M_BUFHDR */ "ofile tabl", /* 73 M_OFILETABL */ "mbuf clust", /* 74 M_MCLUST */ -#if HFS - "HFS mount", /* 75 M_HFSMNT */ - "HFS node", /* 76 M_HFSNODE */ - "HFS fork", /* 77 M_HFSFORK */ -#else - "", /* 75 M_HFSMNT */ - "", /* 76 M_HFSNODE */ - "", /* 77 M_HFSFORK */ -#endif - "", /* 78 unused */ - "", /* 79 unused */ + "", /* 75 unused */ + "", /* 76 unused */ + "", /* 77 unused */ + "", /* 78 unused */ + "", /* 79 unused */ "temp", /* 80 M_TEMP */ "key mgmt", /* 81 M_SECA */ "DEVFS", /* 82 M_DEVFS */ @@ -255,21 +246,12 @@ const char *memname[] = { #endif "TCP Segment Q",/* 89 M_TSEGQ */ "IGMP state", /* 90 M_IGMP */ -#if JOURNALING - "Journal", /* 91 M_JNL_JNL */ - "Transaction", /* 92 M_JNL_TR */ -#else - "", /* 91 M_JNL_JNL */ - "", /* 92 M_JNL_TR */ -#endif + "", /* 91 unused */ + "", /* 92 unused */ "specinfo", /* 93 M_SPECINFO */ "kqueue", /* 94 M_KQUEUE */ -#if HFS - "HFS dirhint", /* 95 M_HFSDIRHINT */ -#else - "", /* 95 M_HFSDIRHINT */ -#endif - "cluster_read", /* 96 M_CLRDAHEAD */ + "", /* 95 unused */ + "cluster_read", /* 96 M_CLRDAHEAD */ "cluster_write",/* 97 M_CLWRBEHIND */ "iov64", /* 98 M_IOV64 */ "fileglob", /* 99 M_FILEGLOB */ @@ -286,11 +268,11 @@ const char *memname[] = { #else "", /* 108 M_TRAFFIC_MGT */ #endif -#if HFS_COMPRESSION +#if FS_COMPRESSION "decmpfs_cnode",/* 109 M_DECMPFS_CNODE */ #else "", /* 109 M_DECMPFS_CNODE */ -#endif /* HFS_COMPRESSION */ +#endif /* FS_COMPRESSION */ "ipmfilter", /* 110 M_INMFILTER */ "ipmsource", /* 111 M_IPMSOURCE */ "in6mfilter", /* 112 M_IN6MFILTER */ @@ -438,15 +420,9 @@ struct kmzones { { (NDFILE * OFILESIZE), KMZ_CREATEZONE_ACCT, FALSE }, /* 73 
M_OFILETABL */ { MCLBYTES, KMZ_CREATEZONE, FALSE }, /* 74 M_MCLUST */ -#if HFS - { SOX(hfsmount),KMZ_LOOKUPZONE, FALSE }, /* 75 M_HFSMNT */ - { SOS(cnode), KMZ_CREATEZONE, TRUE }, /* 76 M_HFSNODE */ - { SOS(filefork),KMZ_CREATEZONE, TRUE }, /* 77 M_HFSFORK */ -#else - { 0, KMZ_MALLOC, FALSE }, /* 75 M_HFSMNT */ - { 0, KMZ_MALLOC, FALSE }, /* 76 M_HFSNODE */ - { 0, KMZ_MALLOC, FALSE }, /* 77 M_HFSFORK */ -#endif + { 0, KMZ_MALLOC, FALSE }, /* 75 unused */ + { 0, KMZ_MALLOC, FALSE }, /* 76 unused */ + { 0, KMZ_MALLOC, FALSE }, /* 77 unused */ { 0, KMZ_MALLOC, FALSE }, /* 78 unused */ { 0, KMZ_MALLOC, FALSE }, /* 79 unused */ { 0, KMZ_MALLOC, FALSE }, /* 80 M_TEMP */ @@ -460,20 +436,11 @@ struct kmzones { { 0, KMZ_MALLOC, FALSE }, /* 88 M_IP6MISC */ { 0, KMZ_MALLOC, FALSE }, /* 89 M_TSEGQ */ { 0, KMZ_MALLOC, FALSE }, /* 90 M_IGMP */ -#if JOURNALING - { SOS(journal), KMZ_CREATEZONE, FALSE }, /* 91 M_JNL_JNL */ - { SOS(transaction), KMZ_CREATEZONE, FALSE }, /* 92 M_JNL_TR */ -#else - { 0, KMZ_MALLOC, FALSE }, /* 91 M_JNL_JNL */ - { 0, KMZ_MALLOC, FALSE }, /* 92 M_JNL_TR */ -#endif + { 0, KMZ_MALLOC, FALSE }, /* 91 unused */ + { 0, KMZ_MALLOC, FALSE }, /* 92 unused */ { SOS(specinfo),KMZ_CREATEZONE, TRUE }, /* 93 M_SPECINFO */ { SOS(kqueue), KMZ_CREATEZONE, FALSE }, /* 94 M_KQUEUE */ -#if HFS - { SOS(directoryhint), KMZ_CREATEZONE, TRUE }, /* 95 M_HFSDIRHINT */ -#else - { 0, KMZ_MALLOC, FALSE }, /* 95 M_HFSDIRHINT */ -#endif + { 0, KMZ_MALLOC, FALSE }, /* 95 unused */ { SOS(cl_readahead), KMZ_CREATEZONE, TRUE }, /* 96 M_CLRDAHEAD */ { SOS(cl_writebehind),KMZ_CREATEZONE, TRUE }, /* 97 M_CLWRBEHIND */ { SOS(user64_iovec), KMZ_LOOKUPZONE, FALSE },/* 98 M_IOV64 */ @@ -487,11 +454,11 @@ struct kmzones { { 0, KMZ_MALLOC, FALSE }, /* 106 M_HFS_EXTATTR */ { 0, KMZ_MALLOC, FALSE }, /* 107 M_SELECT */ { 0, KMZ_MALLOC, FALSE }, /* 108 M_TRAFFIC_MGT */ -#if HFS_COMPRESSION +#if FS_COMPRESSION { SOS(decmpfs_cnode),KMZ_CREATEZONE , FALSE}, /* 109 M_DECMPFS_CNODE */ #else { 0, 
KMZ_MALLOC, FALSE }, /* 109 M_DECMPFS_CNODE */ -#endif /* HFS_COMPRESSION */ +#endif /* FS_COMPRESSION */ { 0, KMZ_MALLOC, FALSE }, /* 110 M_INMFILTER */ { 0, KMZ_MALLOC, FALSE }, /* 111 M_IPMSOURCE */ { 0, KMZ_MALLOC, FALSE }, /* 112 M_IN6MFILTER */ @@ -579,12 +546,6 @@ kmeminit(void) } } -struct _mhead { - size_t mlen; - char dat[0]; -}; - - void * _MALLOC_external( size_t size, @@ -607,8 +568,8 @@ __MALLOC( int flags, vm_allocation_site_t *site) { - struct _mhead *hdr = NULL; - size_t memsize = sizeof (*hdr) + size; + void *addr = NULL; + vm_size_t msize = size; if (type >= M_LAST) panic("_malloc TYPE"); @@ -616,25 +577,15 @@ __MALLOC( if (size == 0) return (NULL); + if (msize != size) { + panic("Requested size to __MALLOC is too large (%llx)!\n", (uint64_t)size); + } + if (flags & M_NOWAIT) { - if (size > memsize) /* overflow detected */ - return (NULL); - else - hdr = (void *)kalloc_canblock(memsize, FALSE, site); + addr = (void *)kalloc_canblock(&msize, FALSE, site); } else { - if (size > memsize) { - /* - * We get here when the caller told us to block, waiting for memory but an overflow - * has been detected. The caller isn't expecting a NULL return code so we panic - * with a descriptive message. - */ - panic("_MALLOC: overflow detected, size %llu ", (uint64_t) size); - } - else - hdr = (void *)kalloc_canblock(memsize, TRUE, site); - - if (hdr == NULL) { - + addr = (void *)kalloc_canblock(&msize, TRUE, site); + if (addr == NULL) { /* * We get here when the caller told us to block waiting for memory, but * kalloc said there's no memory left to get. 
Generally, this means there's a @@ -648,15 +599,13 @@ __MALLOC( panic("_MALLOC: kalloc returned NULL (potential leak), size %llu", (uint64_t) size); } } - if (!hdr) + if (!addr) return (0); - hdr->mlen = memsize; - if (flags & M_ZERO) - bzero(hdr->dat, size); + bzero(addr, size); - return (hdr->dat); + return (addr); } void @@ -664,16 +613,13 @@ _FREE( void *addr, int type) { - struct _mhead *hdr; - if (type >= M_LAST) panic("_free TYPE"); if (!addr) return; /* correct (convenient bsd kernel legacy) */ - hdr = addr; hdr--; - kfree(hdr, hdr->mlen); + kfree_addr(addr); } void * @@ -684,7 +630,6 @@ __REALLOC( int flags, vm_allocation_site_t *site) { - struct _mhead *hdr; void *newaddr; size_t alloc; @@ -692,14 +637,19 @@ __REALLOC( if (addr == NULL) return (__MALLOC(size, type, flags, site)); + alloc = kalloc_size(addr); + /* + * Find out the size of the bucket in which the new sized allocation + * would land. If it matches the bucket of the original allocation, + * simply return the address. 
+ */ + if (kalloc_bucket_size(size) == alloc) + return addr; + /* Allocate a new, bigger (or smaller) block */ if ((newaddr = __MALLOC(size, type, flags, site)) == NULL) return (NULL); - hdr = addr; - --hdr; - alloc = hdr->mlen - sizeof (*hdr); - /* Copy over original contents */ bcopy(addr, newaddr, MIN(size, alloc)); _FREE(addr, type); @@ -748,12 +698,16 @@ __MALLOC_ZONE( } else { elem = (void *)zalloc(kmz->kz_zalloczone); } - else - if (flags & M_NOWAIT) { - elem = (void *)kalloc_canblock(size, FALSE, site); + else { + vm_size_t kalloc_size = size; + if (size > kalloc_size) { + elem = NULL; + } else if (flags & M_NOWAIT) { + elem = (void *)kalloc_canblock(&kalloc_size, FALSE, site); } else { - elem = (void *)kalloc_canblock(size, TRUE, site); + elem = (void *)kalloc_canblock(&kalloc_size, TRUE, site); } + } return (elem); } diff --git a/bsd/kern/kern_memorystatus.c b/bsd/kern/kern_memorystatus.c index 22f7edbd3..236b02d5c 100644 --- a/bsd/kern/kern_memorystatus.c +++ b/bsd/kern/kern_memorystatus.c @@ -35,6 +35,10 @@ #include #include #include +#include + +#include + #include #include #include @@ -46,6 +50,7 @@ #include #include #include +#include #include #include #include @@ -62,7 +67,8 @@ #include -#if CONFIG_JETSAM +#include + /* For logging clarity */ static const char *jetsam_kill_cause_name[] = { "" , @@ -77,6 +83,7 @@ static const char *jetsam_kill_cause_name[] = { "idle-exit" , /* kMemorystatusKilledIdleExit */ }; +#if CONFIG_JETSAM /* Does cause indicate vm or fc thrashing? */ static boolean_t is_thrashing(unsigned cause) @@ -92,7 +99,7 @@ is_thrashing(unsigned cause) /* Callback into vm_compressor.c to signal that thrashing has been mitigated. 
*/ extern void vm_thrashing_jetsam_done(void); -#endif +#endif /* CONFIG_JETSAM */ /* These are very verbose printfs(), enable with * MEMORYSTATUS_DEBUG_LOG @@ -177,6 +184,7 @@ unsigned long critical_threshold_percentage = 5; unsigned long idle_offset_percentage = 5; unsigned long pressure_threshold_percentage = 15; unsigned long freeze_threshold_percentage = 50; +unsigned long policy_more_free_offset_percentage = 5; /* General memorystatus stuff */ @@ -186,7 +194,8 @@ static lck_mtx_t memorystatus_klist_mutex; static void memorystatus_klist_lock(void); static void memorystatus_klist_unlock(void); -static uint64_t memorystatus_idle_delay_time = 0; +static uint64_t memorystatus_sysprocs_idle_delay_time = 0; +static uint64_t memorystatus_apps_idle_delay_time = 0; /* * Memorystatus kevents @@ -195,22 +204,29 @@ static uint64_t memorystatus_idle_delay_time = 0; static int filt_memorystatusattach(struct knote *kn); static void filt_memorystatusdetach(struct knote *kn); static int filt_memorystatus(struct knote *kn, long hint); +static int filt_memorystatustouch(struct knote *kn, struct kevent_internal_s *kev); +static int filt_memorystatusprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev); struct filterops memorystatus_filtops = { .f_attach = filt_memorystatusattach, .f_detach = filt_memorystatusdetach, .f_event = filt_memorystatus, + .f_touch = filt_memorystatustouch, + .f_process = filt_memorystatusprocess, }; enum { kMemorystatusNoPressure = 0x1, kMemorystatusPressure = 0x2, - kMemorystatusLowSwap = 0x4 + kMemorystatusLowSwap = 0x4, + kMemorystatusProcLimitWarn = 0x8, + kMemorystatusProcLimitCritical = 0x10 }; /* Idle guard handling */ -static int32_t memorystatus_scheduled_idle_demotions = 0; +static int32_t memorystatus_scheduled_idle_demotions_sysprocs = 0; +static int32_t memorystatus_scheduled_idle_demotions_apps = 0; static thread_call_t memorystatus_idle_demotion_call; @@ -219,15 +235,17 @@ static void 
memorystatus_schedule_idle_demotion_locked(proc_t p, boolean_t set_s static void memorystatus_invalidate_idle_demotion_locked(proc_t p, boolean_t clean_state); static void memorystatus_reschedule_idle_demotion_locked(void); -static void memorystatus_update_priority_locked(proc_t p, int priority, boolean_t head_insert); +static void memorystatus_update_priority_locked(proc_t p, int priority, boolean_t head_insert, boolean_t skip_demotion_check); + +vm_pressure_level_t convert_internal_pressure_level_to_dispatch_level(vm_pressure_level_t); boolean_t is_knote_registered_modify_task_pressure_bits(struct knote*, int, task_t, vm_pressure_level_t, vm_pressure_level_t); +void memorystatus_klist_reset_all_for_level(vm_pressure_level_t pressure_level_to_clear); void memorystatus_send_low_swap_note(void); int memorystatus_wakeup = 0; unsigned int memorystatus_level = 0; -unsigned int memorystatus_early_boot_level = 0; static int memorystatus_list_count = 0; @@ -242,11 +260,212 @@ memstat_bucket_t memstat_bucket[MEMSTAT_BUCKET_COUNT]; uint64_t memstat_idle_demotion_deadline = 0; +int system_procs_aging_band = JETSAM_PRIORITY_AGING_BAND1; +int applications_aging_band = JETSAM_PRIORITY_IDLE; + +#define isProcessInAgingBands(p) ((isSysProc(p) && system_procs_aging_band && (p->p_memstat_effectivepriority == system_procs_aging_band)) || (isApp(p) && applications_aging_band && (p->p_memstat_effectivepriority == applications_aging_band))) +#define isApp(p) (! 
(p->p_memstat_dirty & P_DIRTY_TRACK)) +#define isSysProc(p) ((p->p_memstat_dirty & P_DIRTY_TRACK)) + +#define kJetsamAgingPolicyNone (0) +#define kJetsamAgingPolicyLegacy (1) +#define kJetsamAgingPolicySysProcsReclaimedFirst (2) +#define kJetsamAgingPolicyAppsReclaimedFirst (3) +#define kJetsamAgingPolicyMax kJetsamAgingPolicyAppsReclaimedFirst + +unsigned int jetsam_aging_policy = kJetsamAgingPolicyLegacy; + +extern int corpse_for_fatal_memkill; +extern unsigned long total_corpses_count; +extern void task_purge_all_corpses(void); + +#if 0 + +/* Keeping around for future use if we need a utility that can do this OR an app that needs a dynamic adjustment. */ + +static int +sysctl_set_jetsam_aging_policy SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + + int error = 0, val = 0; + memstat_bucket_t *old_bucket = 0; + int old_system_procs_aging_band = 0, new_system_procs_aging_band = 0; + int old_applications_aging_band = 0, new_applications_aging_band = 0; + proc_t p = NULL, next_proc = NULL; + + + error = sysctl_io_number(req, jetsam_aging_policy, sizeof(int), &val, NULL); + if (error || !req->newptr) { + return (error); + } + + if ((val < 0) || (val > kJetsamAgingPolicyMax)) { + printf("jetsam: ordering policy sysctl has invalid value - %d\n", val); + return EINVAL; + } + + /* + * We need to synchronize with any potential adding/removal from aging bands + * that might be in progress currently. We use the proc_list_lock() just for + * consistency with all the routines dealing with 'aging' processes. We need + * a lighterweight lock. 
+ */ + proc_list_lock(); + + old_system_procs_aging_band = system_procs_aging_band; + old_applications_aging_band = applications_aging_band; + + switch (val) { + + case kJetsamAgingPolicyNone: + new_system_procs_aging_band = JETSAM_PRIORITY_IDLE; + new_applications_aging_band = JETSAM_PRIORITY_IDLE; + break; + + case kJetsamAgingPolicyLegacy: + /* + * Legacy behavior where some daemons get a 10s protection once and only before the first clean->dirty->clean transition before going into IDLE band. + */ + new_system_procs_aging_band = JETSAM_PRIORITY_AGING_BAND1; + new_applications_aging_band = JETSAM_PRIORITY_IDLE; + break; + + case kJetsamAgingPolicySysProcsReclaimedFirst: + new_system_procs_aging_band = JETSAM_PRIORITY_AGING_BAND1; + new_applications_aging_band = JETSAM_PRIORITY_AGING_BAND2; + break; + + case kJetsamAgingPolicyAppsReclaimedFirst: + new_system_procs_aging_band = JETSAM_PRIORITY_AGING_BAND2; + new_applications_aging_band = JETSAM_PRIORITY_AGING_BAND1; + break; + + default: + break; + } + + if (old_system_procs_aging_band && (old_system_procs_aging_band != new_system_procs_aging_band)) { + + old_bucket = &memstat_bucket[old_system_procs_aging_band]; + p = TAILQ_FIRST(&old_bucket->list); + + while (p) { + + next_proc = TAILQ_NEXT(p, p_memstat_list); + + if (isSysProc(p)) { + if (new_system_procs_aging_band == JETSAM_PRIORITY_IDLE) { + memorystatus_invalidate_idle_demotion_locked(p, TRUE); + } + + memorystatus_update_priority_locked(p, new_system_procs_aging_band, false, true); + } + + p = next_proc; + continue; + } + } + + if (old_applications_aging_band && (old_applications_aging_band != new_applications_aging_band)) { + + old_bucket = &memstat_bucket[old_applications_aging_band]; + p = TAILQ_FIRST(&old_bucket->list); + + while (p) { + + next_proc = TAILQ_NEXT(p, p_memstat_list); + + if (isApp(p)) { + if (new_applications_aging_band == JETSAM_PRIORITY_IDLE) { + memorystatus_invalidate_idle_demotion_locked(p, TRUE); + } + + 
memorystatus_update_priority_locked(p, new_applications_aging_band, false, true); + } + + p = next_proc; + continue; + } + } + + jetsam_aging_policy = val; + system_procs_aging_band = new_system_procs_aging_band; + applications_aging_band = new_applications_aging_band; + + proc_list_unlock(); + + return (0); +} + +SYSCTL_PROC(_kern, OID_AUTO, set_jetsam_aging_policy, CTLTYPE_INT|CTLFLAG_RW, + 0, 0, sysctl_set_jetsam_aging_policy, "I", "Jetsam Aging Policy"); +#endif /*0*/ + +static int +sysctl_jetsam_set_sysprocs_idle_delay_time SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + + int error = 0, val = 0, old_time_in_secs = 0; + uint64_t old_time_in_ns = 0; + + absolutetime_to_nanoseconds(memorystatus_sysprocs_idle_delay_time, &old_time_in_ns); + old_time_in_secs = old_time_in_ns / NSEC_PER_SEC; + + error = sysctl_io_number(req, old_time_in_secs, sizeof(int), &val, NULL); + if (error || !req->newptr) { + return (error); + } + + if ((val < 0) || (val > INT32_MAX)) { + printf("jetsam: new idle delay interval has invalid value.\n"); + return EINVAL; + } + + nanoseconds_to_absolutetime((uint64_t)val * NSEC_PER_SEC, &memorystatus_sysprocs_idle_delay_time); + + return(0); +} + +SYSCTL_PROC(_kern, OID_AUTO, memorystatus_sysprocs_idle_delay_time, CTLTYPE_INT|CTLFLAG_RW, + 0, 0, sysctl_jetsam_set_sysprocs_idle_delay_time, "I", "Aging window for system processes"); + + +static int +sysctl_jetsam_set_apps_idle_delay_time SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + + int error = 0, val = 0, old_time_in_secs = 0; + uint64_t old_time_in_ns = 0; + + absolutetime_to_nanoseconds(memorystatus_apps_idle_delay_time, &old_time_in_ns); + old_time_in_secs = old_time_in_ns / NSEC_PER_SEC; + + error = sysctl_io_number(req, old_time_in_secs, sizeof(int), &val, NULL); + if (error || !req->newptr) { + return (error); + } + + if ((val < 0) || (val > INT32_MAX)) { + printf("jetsam: new idle delay interval has invalid value.\n"); + return EINVAL; + } + + 
nanoseconds_to_absolutetime((uint64_t)val * NSEC_PER_SEC, &memorystatus_apps_idle_delay_time); + + return(0); +} + +SYSCTL_PROC(_kern, OID_AUTO, memorystatus_apps_idle_delay_time, CTLTYPE_INT|CTLFLAG_RW, + 0, 0, sysctl_jetsam_set_apps_idle_delay_time, "I", "Aging window for applications"); + +SYSCTL_INT(_kern, OID_AUTO, jetsam_aging_policy, CTLTYPE_INT|CTLFLAG_RD, &jetsam_aging_policy, 0, ""); + static unsigned int memorystatus_dirty_count = 0; -#if CONFIG_JETSAM SYSCTL_INT(_kern, OID_AUTO, max_task_pmem, CTLFLAG_RD|CTLFLAG_LOCKED|CTLFLAG_MASKED, &max_task_footprint_mb, 0, ""); -#endif // CONFIG_JETSAM int @@ -268,6 +487,15 @@ static proc_t memorystatus_get_next_proc_locked(unsigned int *bucket_index, proc static void memorystatus_thread(void *param __unused, wait_result_t wr __unused); +/* Memory Limits */ + +static int memorystatus_highwater_enabled = 1; /* Update the cached memlimit data. */ + +static boolean_t proc_jetsam_state_is_active_locked(proc_t); +static boolean_t memorystatus_kill_specific_process(pid_t victim_pid, uint32_t cause, os_reason_t jetsam_reason); +static boolean_t memorystatus_kill_process_sync(pid_t victim_pid, uint32_t cause, os_reason_t jetsam_reason); + + /* Jetsam */ #if CONFIG_JETSAM @@ -280,17 +508,12 @@ static int memorystatus_set_memlimit_properties(pid_t pid, memorystatus_memlimit static int memorystatus_cmd_get_memlimit_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval); -static boolean_t proc_jetsam_state_is_active_locked(proc_t); +static int memorystatus_cmd_get_memlimit_excess_np(pid_t pid, uint32_t flags, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval); int proc_get_memstat_priority(proc_t, boolean_t); -/* Kill processes exceeding their limit either under memory pressure (1), or as soon as possible (0) */ -#define LEGACY_HIWATER 1 - static boolean_t memorystatus_idle_snapshot = 0; -static int memorystatus_highwater_enabled = 1; /* Update the cached memlimit data. 
This should be removed. */ - unsigned int memorystatus_delta = 0; static unsigned int memorystatus_available_pages_critical_base = 0; @@ -332,12 +555,10 @@ SYSCTL_UINT(_kern, OID_AUTO, memorystatus_jld_eval_aggressive_priority_band_max, #if DEVELOPMENT || DEBUG static unsigned int memorystatus_jetsam_panic_debug = 0; - -static unsigned int memorystatus_jetsam_policy = kPolicyDefault; static unsigned int memorystatus_jetsam_policy_offset_pages_diagnostic = 0; -static unsigned int memorystatus_debug_dump_this_bucket = 0; #endif +static unsigned int memorystatus_jetsam_policy = kPolicyDefault; static unsigned int memorystatus_thread_wasted_wakeup = 0; static uint32_t kill_under_pressure_cause = 0; @@ -358,21 +579,30 @@ static uint64_t memorystatus_jetsam_snapshot_timeout = 0; */ static memorystatus_jetsam_snapshot_t memorystatus_at_boot_snapshot; +static void memorystatus_init_jetsam_snapshot_locked(memorystatus_jetsam_snapshot_t *od_snapshot, uint32_t ods_list_count); +static boolean_t memorystatus_init_jetsam_snapshot_entry_locked(proc_t p, memorystatus_jetsam_snapshot_entry_t *entry, uint64_t gencount); +static void memorystatus_update_jetsam_snapshot_entry_locked(proc_t p, uint32_t kill_cause, uint64_t killtime); + static void memorystatus_clear_errors(void); static void memorystatus_get_task_page_counts(task_t task, uint32_t *footprint, uint32_t *max_footprint, uint32_t *max_footprint_lifetime, uint32_t *purgeable_pages); +static void memorystatus_get_task_phys_footprint_page_counts(task_t task, + uint64_t *internal_pages, uint64_t *internal_compressed_pages, + uint64_t *purgeable_nonvolatile_pages, uint64_t *purgeable_nonvolatile_compressed_pages, + uint64_t *alternate_accounting_pages, uint64_t *alternate_accounting_compressed_pages, + uint64_t *iokit_mapped_pages, uint64_t *page_table_pages); + +static void memorystatus_get_task_memory_region_count(task_t task, uint64_t *count); + static uint32_t memorystatus_build_state(proc_t p); static void 
memorystatus_update_levels_locked(boolean_t critical_only); //static boolean_t memorystatus_issue_pressure_kevent(boolean_t pressured); -static boolean_t memorystatus_kill_specific_process(pid_t victim_pid, uint32_t cause); -static boolean_t memorystatus_kill_top_process(boolean_t any, boolean_t sort_flag, uint32_t cause, int32_t *priority, uint32_t *errors); -static boolean_t memorystatus_kill_top_process_aggressive(boolean_t any, uint32_t cause, int aggr_count, int32_t priority_max, uint32_t *errors); -#if LEGACY_HIWATER +static boolean_t memorystatus_kill_top_process(boolean_t any, boolean_t sort_flag, uint32_t cause, os_reason_t jetsam_reason, int32_t *priority, uint32_t *errors); +static boolean_t memorystatus_kill_top_process_aggressive(boolean_t any, uint32_t cause, os_reason_t jetsam_reason, int aggr_count, int32_t priority_max, uint32_t *errors); +static boolean_t memorystatus_kill_elevated_process(uint32_t cause, os_reason_t jetsam_reason, int aggr_count, uint32_t *errors); static boolean_t memorystatus_kill_hiwat_proc(uint32_t *errors); -#endif static boolean_t memorystatus_kill_process_async(pid_t victim_pid, uint32_t cause); -static boolean_t memorystatus_kill_process_sync(pid_t victim_pid, uint32_t cause); /* Priority Band Sorting Routines */ static int memorystatus_sort_bucket(unsigned int bucket_index, int sort_order); @@ -395,12 +625,13 @@ extern unsigned int vm_page_inactive_count; extern unsigned int vm_page_throttled_count; extern unsigned int vm_page_purgeable_count; extern unsigned int vm_page_wire_count; +#if CONFIG_SECLUDED_MEMORY +extern unsigned int vm_page_secluded_count; +#endif /* CONFIG_SECLUDED_MEMORY */ #if VM_PRESSURE_EVENTS -#include "vm_pressure.h" - -extern boolean_t memorystatus_warn_process(pid_t pid, boolean_t critical); +boolean_t memorystatus_warn_process(pid_t pid, boolean_t exceeded); vm_pressure_level_t memorystatus_vm_pressure_level = kVMPressureNormal; @@ -410,6 +641,7 @@ unsigned int 
memorystatus_available_pages_pressure = 0; unsigned int memorystatus_available_pages_critical = 0; unsigned int memorystatus_frozen_count = 0; unsigned int memorystatus_suspended_count = 0; +unsigned int memorystatus_policy_more_free_offset_pages = 0; /* * We use this flag to signal if we have any HWM offenders @@ -430,6 +662,16 @@ static int memorystatus_send_note(int event_code, void *data, size_t data_length #endif /* VM_PRESSURE_EVENTS */ + +#if DEVELOPMENT || DEBUG + +lck_grp_attr_t *disconnect_page_mappings_lck_grp_attr; +lck_grp_t *disconnect_page_mappings_lck_grp; +static lck_mtx_t disconnect_page_mappings_mutex; + +#endif + + /* Freeze */ #if CONFIG_FREEZE @@ -468,7 +710,7 @@ static throttle_interval_t throttle_intervals[] = { static uint64_t memorystatus_freeze_throttle_count = 0; -static unsigned int memorystatus_suspended_footprint_total = 0; +static unsigned int memorystatus_suspended_footprint_total = 0; /* pages */ extern uint64_t vm_swap_get_free_space(void); @@ -482,14 +724,14 @@ extern struct knote *vm_find_knote_from_pid(pid_t, struct klist *); #if DEVELOPMENT || DEBUG -#if CONFIG_JETSAM +static unsigned int memorystatus_debug_dump_this_bucket = 0; static void memorystatus_debug_dump_bucket_locked (unsigned int bucket_index) { proc_t p = NULL; - uint32_t pages = 0; - uint32_t pages_in_mb = 0; + uint64_t bytes = 0; + int ledger_limit = 0; unsigned int b = bucket_index; boolean_t traverse_all_buckets = FALSE; @@ -502,25 +744,34 @@ memorystatus_debug_dump_bucket_locked (unsigned int bucket_index) } /* - * Missing from this dump is the value actually - * stored in the ledger... also, format could be better. 
+ * footprint reported in [pages / MB ] + * limits reported as: + * L-limit proc's Ledger limit + * C-limit proc's Cached limit, should match Ledger + * A-limit proc's Active limit + * IA-limit proc's Inactive limit + * F==Fatal, NF==NonFatal */ - printf("memorystatus_debug_dump ***START***\n"); - printf("bucket [pid] [pages/pages-mb] state [EP / RP] dirty deadline [C-limit / A-limit / IA-limit] name\n"); + + printf("memorystatus_debug_dump ***START*(PAGE_SIZE_64=%llu)**\n", PAGE_SIZE_64); + printf("bucket [pid] [pages / MB] [state] [EP / RP] dirty deadline [L-limit / C-limit / A-limit / IA-limit] name\n"); p = memorystatus_get_first_proc_locked(&b, traverse_all_buckets); while (p) { - memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL, NULL); - pages_in_mb = (pages * 4096) /1024 / 1024; - printf("%d [%d] [%d/%dMB] 0x%x [%d / %d] 0x%x %lld [%d%s / %d%s / %d%s] %s\n", - b, p->p_pid, pages, pages_in_mb, + bytes = get_task_phys_footprint(p->task); + task_get_phys_footprint_limit(p->task, &ledger_limit); + printf("%2d [%5d] [%5lld /%3lldMB] 0x%-8x [%2d / %2d] 0x%-3x %10lld [%3d / %3d%s / %3d%s / %3d%s] %s\n", + b, p->p_pid, + (bytes / PAGE_SIZE_64), /* task's footprint converted from bytes to pages */ + (bytes / (1024ULL * 1024ULL)), /* task's footprint converted from bytes to MB */ p->p_memstat_state, p->p_memstat_effectivepriority, p->p_memstat_requestedpriority, p->p_memstat_dirty, p->p_memstat_idledeadline, + ledger_limit, p->p_memstat_memlimit, (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT ? "F " : "NF"), p->p_memstat_memlimit_active, (p->p_memstat_state & P_MEMSTAT_MEMLIMIT_ACTIVE_FATAL ? "F " : "NF"), p->p_memstat_memlimit_inactive, (p->p_memstat_state & P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL ? "F " : "NF"), - (p->p_comm ? p->p_comm : "unknown")); + (*p->p_name ? 
p->p_name : "unknown")); p = memorystatus_get_next_proc_locked(&b, p, traverse_all_buckets); } printf("memorystatus_debug_dump ***END***\n"); @@ -635,14 +886,112 @@ sysctl_memorystatus_highwater_enable SYSCTL_HANDLER_ARGS } -SYSCTL_INT(_kern, OID_AUTO, memorystatus_idle_snapshot, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_idle_snapshot, 0, ""); - SYSCTL_PROC(_kern, OID_AUTO, memorystatus_highwater_enabled, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_highwater_enabled, 0, sysctl_memorystatus_highwater_enable, "I", ""); +#if VM_PRESSURE_EVENTS + +/* + * This routine is used for targeted notifications + * regardless of system memory pressure. + * "memnote" is the current user. + */ + +static int +sysctl_memorystatus_vm_pressure_send SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2) + + int error = 0, pid = 0; + struct knote *kn = NULL; + boolean_t found_knote = FALSE; + int fflags = 0; /* filter flags for EVFILT_MEMORYSTATUS */ + uint64_t value = 0; + + error = sysctl_handle_quad(oidp, &value, 0, req); + if (error || !req->newptr) + return (error); + + /* + * Find the pid in the low 32 bits of value passed in. + */ + pid = (int)(value & 0xFFFFFFFF); + + /* + * Find notification in the high 32 bits of the value passed in. + */ + fflags = (int)((value >> 32) & 0xFFFFFFFF); + + /* + * For backwards compatibility, when no notification is + * passed in, default to the NOTE_MEMORYSTATUS_PRESSURE_WARN + */ + if (fflags == 0) { + fflags = NOTE_MEMORYSTATUS_PRESSURE_WARN; + // printf("memorystatus_vm_pressure_send: using default notification [0x%x]\n", fflags); + } + + /* + * See event.h ... 
fflags for EVFILT_MEMORYSTATUS + */ + if (!((fflags == NOTE_MEMORYSTATUS_PRESSURE_NORMAL)|| + (fflags == NOTE_MEMORYSTATUS_PRESSURE_WARN) || + (fflags == NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) || + (fflags == NOTE_MEMORYSTATUS_LOW_SWAP) || + (fflags == NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) || + (fflags == NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL))) { + + printf("memorystatus_vm_pressure_send: notification [0x%x] not supported \n", fflags); + error = 1; + return (error); + } + + /* + * Forcibly send pid a memorystatus notification. + */ + + memorystatus_klist_lock(); + + SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) { + proc_t knote_proc = knote_get_kq(kn)->kq_p; + pid_t knote_pid = knote_proc->p_pid; + + if (knote_pid == pid) { + /* + * Forcibly send this pid a memorystatus notification. + */ + kn->kn_fflags = fflags; + found_knote = TRUE; + } + } + + if (found_knote) { + KNOTE(&memorystatus_klist, 0); + printf("memorystatus_vm_pressure_send: (value 0x%llx) notification [0x%x] sent to process [%d] \n", value, fflags, pid); + error = 0; + } else { + printf("memorystatus_vm_pressure_send: (value 0x%llx) notification [0x%x] not sent to process [%d] (none registered?)\n", value, fflags, pid); + error = 1; + } + + memorystatus_klist_unlock(); + + return (error); +} + +SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_send, CTLTYPE_QUAD|CTLFLAG_WR|CTLFLAG_LOCKED|CTLFLAG_MASKED, + 0, 0, &sysctl_memorystatus_vm_pressure_send, "Q", ""); + +#endif /* VM_PRESSURE_EVENTS */ + +#if CONFIG_JETSAM + +SYSCTL_INT(_kern, OID_AUTO, memorystatus_idle_snapshot, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_idle_snapshot, 0, ""); + SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages, CTLFLAG_RD|CTLFLAG_LOCKED, &memorystatus_available_pages, 0, ""); SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages_critical, CTLFLAG_RD|CTLFLAG_LOCKED, &memorystatus_available_pages_critical, 0, ""); SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages_critical_base, 
CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_available_pages_critical_base, 0, ""); SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages_critical_idle_offset, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_available_pages_critical_idle_offset, 0, ""); +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_policy_more_free_offset_pages, CTLFLAG_RW, &memorystatus_policy_more_free_offset_pages, 0, ""); /* Diagnostic code */ @@ -721,103 +1070,40 @@ SYSCTL_UINT(_kern, OID_AUTO, memorystatus_jetsam_policy_offset_pages_diagnostic, SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages_pressure, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_available_pages_pressure, 0, ""); +#endif /* VM_PRESSURE_EVENTS */ -/* - * This routine is used for targeted notifications - * regardless of system memory pressure. - * "memnote" is the current user. - */ +#endif /* CONFIG_JETSAM */ + +#if CONFIG_FREEZE + +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_daily_mb_max, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_daily_mb_max, 0, ""); + +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_threshold, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_threshold, 0, ""); + +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_pages_min, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_pages_min, 0, ""); +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_pages_max, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_pages_max, 0, ""); + +SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_freeze_count, CTLFLAG_RD|CTLFLAG_LOCKED, &memorystatus_freeze_count, ""); +SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_freeze_pageouts, CTLFLAG_RD|CTLFLAG_LOCKED, &memorystatus_freeze_pageouts, ""); +SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_freeze_throttle_count, CTLFLAG_RD|CTLFLAG_LOCKED, &memorystatus_freeze_throttle_count, ""); +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_min_processes, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_suspended_threshold, 0, ""); + +boolean_t memorystatus_freeze_throttle_enabled = TRUE; +SYSCTL_UINT(_kern, 
OID_AUTO, memorystatus_freeze_throttle_enabled, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_throttle_enabled, 0, ""); +/* + * Manual trigger of freeze and thaw for dev / debug kernels only. + */ static int -sysctl_memorystatus_vm_pressure_send SYSCTL_HANDLER_ARGS +sysctl_memorystatus_freeze SYSCTL_HANDLER_ARGS { #pragma unused(arg1, arg2) + int error, pid = 0; + proc_t p; - int error = 0, pid = 0; - int ret = 0; - struct knote *kn = NULL; - boolean_t found_knote = FALSE; - - error = sysctl_handle_int(oidp, &pid, 0, req); - if (error || !req->newptr) - return (error); - - /* - * We inspect 3 lists here for targeted notifications: - * - memorystatus_klist - * - vm_pressure_klist - * - vm_pressure_dormant_klist - * - * The vm_pressure_* lists are tied to the old VM_PRESSURE - * notification mechanism. We intend to stop using that - * mechanism and, in turn, get rid of the 2 lists and - * vm_dispatch_pressure_note_to_pid() too. - */ - - memorystatus_klist_lock(); - - SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) { - proc_t knote_proc = kn->kn_kq->kq_p; - pid_t knote_pid = knote_proc->p_pid; - - if (knote_pid == pid) { - /* - * Forcibly send this pid a "warning" memory pressure notification. 
- */ - kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_WARN; - found_knote = TRUE; - } - } - - if (found_knote) { - KNOTE(&memorystatus_klist, 0); - ret = 0; - } else { - ret = vm_dispatch_pressure_note_to_pid(pid, FALSE); - } - - memorystatus_klist_unlock(); - - return ret; -} - -SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_send, CTLTYPE_INT|CTLFLAG_WR|CTLFLAG_LOCKED|CTLFLAG_MASKED, - 0, 0, &sysctl_memorystatus_vm_pressure_send, "I", ""); - -#endif /* VM_PRESSURE_EVENTS */ - -#endif /* CONFIG_JETSAM */ - -#if CONFIG_FREEZE - -SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_daily_mb_max, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_daily_mb_max, 0, ""); - -SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_threshold, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_threshold, 0, ""); - -SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_pages_min, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_pages_min, 0, ""); -SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_pages_max, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_pages_max, 0, ""); - -SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_freeze_count, CTLFLAG_RD|CTLFLAG_LOCKED, &memorystatus_freeze_count, ""); -SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_freeze_pageouts, CTLFLAG_RD|CTLFLAG_LOCKED, &memorystatus_freeze_pageouts, ""); -SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_freeze_throttle_count, CTLFLAG_RD|CTLFLAG_LOCKED, &memorystatus_freeze_throttle_count, ""); -SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_min_processes, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_suspended_threshold, 0, ""); - -boolean_t memorystatus_freeze_throttle_enabled = TRUE; -SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_throttle_enabled, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_throttle_enabled, 0, ""); - -/* - * Manual trigger of freeze and thaw for dev / debug kernels only. 
- */ -static int -sysctl_memorystatus_freeze SYSCTL_HANDLER_ARGS -{ -#pragma unused(arg1, arg2) - int error, pid = 0; - proc_t p; - - if (memorystatus_freeze_enabled == FALSE) { - return ENOTSUP; - } + if (memorystatus_freeze_enabled == FALSE) { + return ENOTSUP; + } error = sysctl_handle_int(oidp, &pid, 0, req); if (error || !req->newptr) @@ -837,22 +1123,15 @@ sysctl_memorystatus_freeze SYSCTL_HANDLER_ARGS boolean_t shared; uint32_t max_pages = 0; - if (DEFAULT_FREEZER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_SWAPBACKED) { + if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) { unsigned int avail_swap_space = 0; /* in pages. */ - if (DEFAULT_FREEZER_IS_ACTIVE) { - /* - * Freezer backed by default pager and swap file(s). - */ - avail_swap_space = default_pager_swap_pages_free(); - } else { - /* - * Freezer backed by the compressor and swap file(s) - * while will hold compressed data. - */ - avail_swap_space = vm_swap_get_free_space() / PAGE_SIZE_64; - } + /* + * Freezer backed by the compressor and swap file(s) + * which will hold compressed data. 
+ */ + avail_swap_space = vm_swap_get_free_space() / PAGE_SIZE_64; max_pages = MIN(avail_swap_space, memorystatus_freeze_pages_max); @@ -921,6 +1200,48 @@ extern kern_return_t kernel_thread_start_priority(thread_continue_t continuation integer_t priority, thread_t *new_thread); +#if DEVELOPMENT || DEBUG + +static int +sysctl_memorystatus_disconnect_page_mappings SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2) + int error = 0, pid = 0; + proc_t p; + + error = sysctl_handle_int(oidp, &pid, 0, req); + if (error || !req->newptr) + return (error); + + lck_mtx_lock(&disconnect_page_mappings_mutex); + + if (pid == -1) { + vm_pageout_disconnect_all_pages(); + } else { + p = proc_find(pid); + + if (p != NULL) { + error = task_disconnect_page_mappings(p->task); + + proc_rele(p); + + if (error) + error = EIO; + } else + error = EINVAL; + } + lck_mtx_unlock(&disconnect_page_mappings_mutex); + + return error; +} + +SYSCTL_PROC(_kern, OID_AUTO, memorystatus_disconnect_page_mappings, CTLTYPE_INT|CTLFLAG_WR|CTLFLAG_LOCKED|CTLFLAG_MASKED, + 0, 0, &sysctl_memorystatus_disconnect_page_mappings, "I", ""); + +#endif /* DEVELOPMENT || DEBUG */ + + + #if CONFIG_JETSAM /* * Picks the sorting routine for a given jetsam priority band. 
@@ -1083,7 +1404,15 @@ memorystatus_init(void) memorystatus_freeze_pages_max = FREEZE_PAGES_MAX; #endif - nanoseconds_to_absolutetime((uint64_t)DEFERRED_IDLE_EXIT_TIME_SECS * NSEC_PER_SEC, &memorystatus_idle_delay_time); +#if DEVELOPMENT || DEBUG + disconnect_page_mappings_lck_grp_attr = lck_grp_attr_alloc_init(); + disconnect_page_mappings_lck_grp = lck_grp_alloc_init("disconnect_page_mappings", disconnect_page_mappings_lck_grp_attr); + + lck_mtx_init(&disconnect_page_mappings_mutex, disconnect_page_mappings_lck_grp, NULL); +#endif + + nanoseconds_to_absolutetime((uint64_t)DEFERRED_IDLE_EXIT_TIME_SECS * NSEC_PER_SEC, &memorystatus_sysprocs_idle_delay_time); + nanoseconds_to_absolutetime((uint64_t)DEFERRED_IDLE_EXIT_TIME_SECS * NSEC_PER_SEC, &memorystatus_apps_idle_delay_time); /* Init buckets */ for (i = 0; i < MEMSTAT_BUCKET_COUNT; i++) { @@ -1095,6 +1424,9 @@ memorystatus_init(void) /* Apply overrides */ PE_get_default("kern.jetsam_delta", &delta_percentage, sizeof(delta_percentage)); + if (delta_percentage == 0) { + delta_percentage = 5; + } assert(delta_percentage < 100); PE_get_default("kern.jetsam_critical_threshold", &critical_threshold_percentage, sizeof(critical_threshold_percentage)); assert(critical_threshold_percentage < 100); @@ -1105,13 +1437,71 @@ memorystatus_init(void) PE_get_default("kern.jetsam_freeze_threshold", &freeze_threshold_percentage, sizeof(freeze_threshold_percentage)); assert(freeze_threshold_percentage < 100); + if (!PE_parse_boot_argn("jetsam_aging_policy", &jetsam_aging_policy, + sizeof (jetsam_aging_policy))) { + + if (!PE_get_default("kern.jetsam_aging_policy", &jetsam_aging_policy, + sizeof(jetsam_aging_policy))) { + + jetsam_aging_policy = kJetsamAgingPolicyLegacy; + } + } + + if (jetsam_aging_policy > kJetsamAgingPolicyMax) { + jetsam_aging_policy = kJetsamAgingPolicyLegacy; + } + + switch (jetsam_aging_policy) { + + case kJetsamAgingPolicyNone: + system_procs_aging_band = JETSAM_PRIORITY_IDLE; + applications_aging_band = 
JETSAM_PRIORITY_IDLE; + break; + + case kJetsamAgingPolicyLegacy: + /* + * Legacy behavior where some daemons get a 10s protection once + * AND only before the first clean->dirty->clean transition before + * going into IDLE band. + */ + system_procs_aging_band = JETSAM_PRIORITY_AGING_BAND1; + applications_aging_band = JETSAM_PRIORITY_IDLE; + break; + + case kJetsamAgingPolicySysProcsReclaimedFirst: + system_procs_aging_band = JETSAM_PRIORITY_AGING_BAND1; + applications_aging_band = JETSAM_PRIORITY_AGING_BAND2; + break; + + case kJetsamAgingPolicyAppsReclaimedFirst: + system_procs_aging_band = JETSAM_PRIORITY_AGING_BAND2; + applications_aging_band = JETSAM_PRIORITY_AGING_BAND1; + break; + + default: + break; + } + + /* + * The aging bands cannot overlap with the JETSAM_PRIORITY_ELEVATED_INACTIVE + * band and must be below it in priority. This is so that we don't have to make + * our 'aging' code worry about a mix of processes, some of which need to age + * and some others that need to stay elevated in the jetsam bands. + */ + assert(JETSAM_PRIORITY_ELEVATED_INACTIVE > system_procs_aging_band); + assert(JETSAM_PRIORITY_ELEVATED_INACTIVE > applications_aging_band); + #if CONFIG_JETSAM - /* device tree can request to take snapshots for idle-exit kills by default */ - PE_get_default("kern.jetsam_idle_snapshot", &memorystatus_idle_snapshot, sizeof(memorystatus_idle_snapshot)); + /* Take snapshots for idle-exit kills by default? First check the boot-arg... 
*/ + if (!PE_parse_boot_argn("jetsam_idle_snapshot", &memorystatus_idle_snapshot, sizeof (memorystatus_idle_snapshot))) { + /* ...no boot-arg, so check the device tree */ + PE_get_default("kern.jetsam_idle_snapshot", &memorystatus_idle_snapshot, sizeof(memorystatus_idle_snapshot)); + } memorystatus_delta = delta_percentage * atop_64(max_mem) / 100; memorystatus_available_pages_critical_idle_offset = idle_offset_percentage * atop_64(max_mem) / 100; memorystatus_available_pages_critical_base = (critical_threshold_percentage / delta_percentage) * memorystatus_delta; + memorystatus_policy_more_free_offset_pages = (policy_more_free_offset_percentage / delta_percentage) * memorystatus_delta; memorystatus_jetsam_snapshot_max = maxproc; memorystatus_jetsam_snapshot = @@ -1152,7 +1542,7 @@ memorystatus_init(void) /* Centralised for the purposes of allowing panic-on-jetsam */ extern void -vm_wake_compactor_swapper(void); +vm_run_compactor(void); /* * The jetsam no frills kill call @@ -1160,9 +1550,9 @@ vm_wake_compactor_swapper(void); * error code on failure (EINVAL...) 
*/ static int -jetsam_do_kill(proc_t p, int jetsam_flags) { +jetsam_do_kill(proc_t p, int jetsam_flags, os_reason_t jetsam_reason) { int error = 0; - error = exit1_internal(p, W_EXITCODE(0, SIGKILL), (int *)NULL, FALSE, FALSE, jetsam_flags); + error = exit_with_reason(p, W_EXITCODE(0, SIGKILL), (int *)NULL, FALSE, FALSE, jetsam_flags, jetsam_reason); return(error); } @@ -1170,7 +1560,7 @@ jetsam_do_kill(proc_t p, int jetsam_flags) { * Wrapper for processes exiting with memorystatus details */ static boolean_t -memorystatus_do_kill(proc_t p, uint32_t cause) { +memorystatus_do_kill(proc_t p, uint32_t cause, os_reason_t jetsam_reason) { int error = 0; __unused pid_t victim_pid = p->p_pid; @@ -1178,6 +1568,7 @@ memorystatus_do_kill(proc_t p, uint32_t cause) { KERNEL_DEBUG_CONSTANT( (BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DO_KILL)) | DBG_FUNC_START, victim_pid, cause, vm_page_free_count, 0, 0); + DTRACE_MEMORYSTATUS3(memorystatus_do_kill, proc_t, p, os_reason_t, jetsam_reason, uint32_t, cause); #if CONFIG_JETSAM && (DEVELOPMENT || DEBUG) if (memorystatus_jetsam_panic_debug & (1 << cause)) { panic("memorystatus_do_kill(): jetsam debug panic (cause: %d)", cause); @@ -1195,12 +1586,12 @@ memorystatus_do_kill(proc_t p, uint32_t cause) { case kMemorystatusKilledPerProcessLimit: jetsam_flags |= P_JETSAM_PID; break; case kMemorystatusKilledIdleExit: jetsam_flags |= P_JETSAM_IDLEEXIT; break; } - error = jetsam_do_kill(p, jetsam_flags); + error = jetsam_do_kill(p, jetsam_flags, jetsam_reason); KERNEL_DEBUG_CONSTANT( (BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DO_KILL)) | DBG_FUNC_END, victim_pid, cause, vm_page_free_count, error, 0); - vm_wake_compactor_swapper(); + vm_run_compactor(); return (error == 0); } @@ -1217,11 +1608,90 @@ memorystatus_check_levels_locked(void) { #endif } +/* + * Pin a process to a particular jetsam band when it is in the background i.e. not doing active work. 
+ * For an application: that means no longer in the FG band + * For a daemon: that means no longer in its 'requested' jetsam priority band + */ + +int +memorystatus_update_inactive_jetsam_priority_band(pid_t pid, uint32_t op_flags, boolean_t effective_now) +{ + int error = 0; + boolean_t enable = FALSE; + proc_t p = NULL; + + if (op_flags == MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_ENABLE) { + enable = TRUE; + } else if (op_flags == MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_DISABLE) { + enable = FALSE; + } else { + return EINVAL; + } + + p = proc_find(pid); + if (p != NULL) { + + if ((enable && ((p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND) == P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND)) || + (!enable && ((p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND) == 0))) { + /* + * No change in state. + */ + + } else { + + proc_list_lock(); + + if (enable) { + p->p_memstat_state |= P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND; + memorystatus_invalidate_idle_demotion_locked(p, TRUE); + + if (effective_now) { + if (p->p_memstat_effectivepriority < JETSAM_PRIORITY_ELEVATED_INACTIVE) { + boolean_t trigger_exception; + CACHE_ACTIVE_LIMITS_LOCKED(p, trigger_exception); + task_set_phys_footprint_limit_internal(p->task, (p->p_memstat_memlimit > 0) ? 
p->p_memstat_memlimit : -1, NULL, trigger_exception); + memorystatus_update_priority_locked(p, JETSAM_PRIORITY_ELEVATED_INACTIVE, FALSE, FALSE); + } + } else { + if (isProcessInAgingBands(p)) { + memorystatus_update_priority_locked(p, JETSAM_PRIORITY_IDLE, FALSE, TRUE); + } + } + } else { + + p->p_memstat_state &= ~P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND; + memorystatus_invalidate_idle_demotion_locked(p, TRUE); + + if (effective_now) { + if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_ELEVATED_INACTIVE) { + memorystatus_update_priority_locked(p, JETSAM_PRIORITY_IDLE, FALSE, TRUE); + } + } else { + if (isProcessInAgingBands(p)) { + memorystatus_update_priority_locked(p, JETSAM_PRIORITY_IDLE, FALSE, TRUE); + } + } + } + + proc_list_unlock(); + } + proc_rele(p); + error = 0; + + } else { + error = ESRCH; + } + + return error; +} + static void memorystatus_perform_idle_demotion(__unused void *spare1, __unused void *spare2) { proc_t p; - uint64_t current_time; + uint64_t current_time = 0, idle_delay_time = 0; + int demote_prio_band = 0; memstat_bucket_t *demotion_bucket; MEMORYSTATUS_DEBUG(1, "memorystatus_perform_idle_demotion()\n"); @@ -1232,35 +1702,54 @@ memorystatus_perform_idle_demotion(__unused void *spare1, __unused void *spare2) proc_list_lock(); - demotion_bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE_DEFERRED]; - p = TAILQ_FIRST(&demotion_bucket->list); - - while (p) { - MEMORYSTATUS_DEBUG(1, "memorystatus_perform_idle_demotion() found %d\n", p->p_pid); - - assert(p->p_memstat_idledeadline); - assert(p->p_memstat_dirty & P_DIRTY_DEFER_IN_PROGRESS); - assert((p->p_memstat_dirty & (P_DIRTY_IDLE_EXIT_ENABLED|P_DIRTY_IS_DIRTY)) == P_DIRTY_IDLE_EXIT_ENABLED); - - if (current_time >= p->p_memstat_idledeadline) { -#if DEBUG || DEVELOPMENT - if (!(p->p_memstat_dirty & P_DIRTY_MARKED)) { - printf("memorystatus_perform_idle_demotion: moving process %d [%s] to idle band, but never dirtied (0x%x)!\n", - p->p_pid, (p->p_comm ? 
p->p_comm : "(unknown)"), p->p_memstat_dirty); - } -#endif - memorystatus_invalidate_idle_demotion_locked(p, TRUE); - memorystatus_update_priority_locked(p, JETSAM_PRIORITY_IDLE, false); - - // The prior process has moved out of the demotion bucket, so grab the new head and continue - p = TAILQ_FIRST(&demotion_bucket->list); + demote_prio_band = JETSAM_PRIORITY_IDLE + 1; + + for (; demote_prio_band < JETSAM_PRIORITY_MAX; demote_prio_band++) { + + if (demote_prio_band != system_procs_aging_band && demote_prio_band != applications_aging_band) continue; + + demotion_bucket = &memstat_bucket[demote_prio_band]; + p = TAILQ_FIRST(&demotion_bucket->list); + + while (p) { + MEMORYSTATUS_DEBUG(1, "memorystatus_perform_idle_demotion() found %d\n", p->p_pid); + + assert(p->p_memstat_idledeadline); + + assert(p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS); + + if (current_time >= p->p_memstat_idledeadline) { + + if ((isSysProc(p) && + ((p->p_memstat_dirty & (P_DIRTY_IDLE_EXIT_ENABLED|P_DIRTY_IS_DIRTY)) != P_DIRTY_IDLE_EXIT_ENABLED)) || /* system proc marked dirty*/ + task_has_assertions((struct task *)(p->task))) { /* has outstanding assertions which might indicate outstanding work too */ + idle_delay_time = (isSysProc(p)) ? 
memorystatus_sysprocs_idle_delay_time : memorystatus_apps_idle_delay_time; + + p->p_memstat_idledeadline += idle_delay_time; + p = TAILQ_NEXT(p, p_memstat_list); + + } else { + + proc_t next_proc = NULL; + + next_proc = TAILQ_NEXT(p, p_memstat_list); + memorystatus_invalidate_idle_demotion_locked(p, TRUE); + + memorystatus_update_priority_locked(p, JETSAM_PRIORITY_IDLE, false, true); + + p = next_proc; + continue; + + } + } else { + // No further candidates + break; + } } - - // No further candidates - break; + } - + memorystatus_reschedule_idle_demotion_locked(); proc_list_unlock(); @@ -1271,59 +1760,118 @@ memorystatus_perform_idle_demotion(__unused void *spare1, __unused void *spare2) static void memorystatus_schedule_idle_demotion_locked(proc_t p, boolean_t set_state) { - boolean_t present_in_deferred_bucket = FALSE; - - if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE_DEFERRED) { - present_in_deferred_bucket = TRUE; + boolean_t present_in_sysprocs_aging_bucket = FALSE; + boolean_t present_in_apps_aging_bucket = FALSE; + uint64_t idle_delay_time = 0; + + if (jetsam_aging_policy == kJetsamAgingPolicyNone) { + return; + } + + if (p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND) { + /* + * This process isn't going to be making the trip to the lower bands. 
+ */ + return; } + if (isProcessInAgingBands(p)){ + + if (jetsam_aging_policy != kJetsamAgingPolicyLegacy) { + assert((p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS) != P_DIRTY_AGING_IN_PROGRESS); + } + + if (isSysProc(p) && system_procs_aging_band) { + present_in_sysprocs_aging_bucket = TRUE; + + } else if (isApp(p) && applications_aging_band) { + present_in_apps_aging_bucket = TRUE; + } + } + + assert(!present_in_sysprocs_aging_bucket); + assert(!present_in_apps_aging_bucket); + MEMORYSTATUS_DEBUG(1, "memorystatus_schedule_idle_demotion_locked: scheduling demotion to idle band for pid %d (dirty:0x%x, set_state %d, demotions %d).\n", - p->p_pid, p->p_memstat_dirty, set_state, memorystatus_scheduled_idle_demotions); + p->p_pid, p->p_memstat_dirty, set_state, (memorystatus_scheduled_idle_demotions_sysprocs + memorystatus_scheduled_idle_demotions_apps)); + + if(isSysProc(p)) { + assert((p->p_memstat_dirty & P_DIRTY_IDLE_EXIT_ENABLED) == P_DIRTY_IDLE_EXIT_ENABLED); + } - assert((p->p_memstat_dirty & P_DIRTY_IDLE_EXIT_ENABLED) == P_DIRTY_IDLE_EXIT_ENABLED); + idle_delay_time = (isSysProc(p)) ? 
memorystatus_sysprocs_idle_delay_time : memorystatus_apps_idle_delay_time; if (set_state) { - assert(p->p_memstat_idledeadline == 0); - p->p_memstat_dirty |= P_DIRTY_DEFER_IN_PROGRESS; - p->p_memstat_idledeadline = mach_absolute_time() + memorystatus_idle_delay_time; + p->p_memstat_dirty |= P_DIRTY_AGING_IN_PROGRESS; + p->p_memstat_idledeadline = mach_absolute_time() + idle_delay_time; } assert(p->p_memstat_idledeadline); - if (present_in_deferred_bucket == FALSE) { - memorystatus_scheduled_idle_demotions++; + if (isSysProc(p) && present_in_sysprocs_aging_bucket == FALSE) { + memorystatus_scheduled_idle_demotions_sysprocs++; + + } else if (isApp(p) && present_in_apps_aging_bucket == FALSE) { + memorystatus_scheduled_idle_demotions_apps++; } } static void memorystatus_invalidate_idle_demotion_locked(proc_t p, boolean_t clear_state) { - boolean_t present_in_deferred_bucket = FALSE; - - if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE_DEFERRED) { - present_in_deferred_bucket = TRUE; - assert(p->p_memstat_idledeadline); + boolean_t present_in_sysprocs_aging_bucket = FALSE; + boolean_t present_in_apps_aging_bucket = FALSE; + + if (!system_procs_aging_band && !applications_aging_band) { + return; + } + + if ((p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS) == 0) { + return; + } + + if (isProcessInAgingBands(p)) { + + if (jetsam_aging_policy != kJetsamAgingPolicyLegacy) { + assert((p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS) == P_DIRTY_AGING_IN_PROGRESS); + } + + if (isSysProc(p) && system_procs_aging_band) { + assert(p->p_memstat_effectivepriority == system_procs_aging_band); + assert(p->p_memstat_idledeadline); + present_in_sysprocs_aging_bucket = TRUE; + + } else if (isApp(p) && applications_aging_band) { + assert(p->p_memstat_effectivepriority == applications_aging_band); + assert(p->p_memstat_idledeadline); + present_in_apps_aging_bucket = TRUE; + } } MEMORYSTATUS_DEBUG(1, "memorystatus_invalidate_idle_demotion(): invalidating demotion to idle band for 
pid %d (clear_state %d, demotions %d).\n", - p->p_pid, clear_state, memorystatus_scheduled_idle_demotions); + p->p_pid, clear_state, (memorystatus_scheduled_idle_demotions_sysprocs + memorystatus_scheduled_idle_demotions_apps)); if (clear_state) { p->p_memstat_idledeadline = 0; - p->p_memstat_dirty &= ~P_DIRTY_DEFER_IN_PROGRESS; + p->p_memstat_dirty &= ~P_DIRTY_AGING_IN_PROGRESS; } - if (present_in_deferred_bucket == TRUE) { - memorystatus_scheduled_idle_demotions--; + if (isSysProc(p) &&present_in_sysprocs_aging_bucket == TRUE) { + memorystatus_scheduled_idle_demotions_sysprocs--; + assert(memorystatus_scheduled_idle_demotions_sysprocs >= 0); + + } else if (isApp(p) && present_in_apps_aging_bucket == TRUE) { + memorystatus_scheduled_idle_demotions_apps--; + assert(memorystatus_scheduled_idle_demotions_apps >= 0); } - assert(memorystatus_scheduled_idle_demotions >= 0); + assert((memorystatus_scheduled_idle_demotions_sysprocs + memorystatus_scheduled_idle_demotions_apps) >= 0); } static void memorystatus_reschedule_idle_demotion_locked(void) { - if (0 == memorystatus_scheduled_idle_demotions) { + if (0 == (memorystatus_scheduled_idle_demotions_sysprocs + memorystatus_scheduled_idle_demotions_apps)) { if (memstat_idle_demotion_deadline) { /* Transitioned 1->0, so cancel next call */ thread_call_cancel(memorystatus_idle_demotion_call); @@ -1331,15 +1879,37 @@ memorystatus_reschedule_idle_demotion_locked(void) { } } else { memstat_bucket_t *demotion_bucket; - proc_t p; - demotion_bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE_DEFERRED]; - p = TAILQ_FIRST(&demotion_bucket->list); - - assert(p && p->p_memstat_idledeadline); - - if (memstat_idle_demotion_deadline != p->p_memstat_idledeadline){ - thread_call_enter_delayed(memorystatus_idle_demotion_call, p->p_memstat_idledeadline); - memstat_idle_demotion_deadline = p->p_memstat_idledeadline; + proc_t p = NULL, p1 = NULL, p2 = NULL; + + if (system_procs_aging_band) { + + demotion_bucket = 
&memstat_bucket[system_procs_aging_band]; + p1 = TAILQ_FIRST(&demotion_bucket->list); + + p = p1; + } + + if (applications_aging_band) { + + demotion_bucket = &memstat_bucket[applications_aging_band]; + p2 = TAILQ_FIRST(&demotion_bucket->list); + + if (p1 && p2) { + p = (p1->p_memstat_idledeadline > p2->p_memstat_idledeadline) ? p2 : p1; + } else { + p = (p1 == NULL) ? p2 : p1; + } + + } + + assert(p); + + if (p != NULL) { + assert(p && p->p_memstat_idledeadline); + if (memstat_idle_demotion_deadline != p->p_memstat_idledeadline){ + thread_call_enter_delayed(memorystatus_idle_demotion_call, p->p_memstat_idledeadline); + memstat_idle_demotion_deadline = p->p_memstat_idledeadline; + } } } } @@ -1354,11 +1924,13 @@ memorystatus_add(proc_t p, boolean_t locked) memstat_bucket_t *bucket; MEMORYSTATUS_DEBUG(1, "memorystatus_list_add(): adding pid %d with priority %d.\n", p->p_pid, p->p_memstat_effectivepriority); - + if (!locked) { proc_list_lock(); } - + + DTRACE_MEMORYSTATUS2(memorystatus_add, proc_t, p, int32_t, p->p_memstat_effectivepriority); + /* Processes marked internal do not have priority tracked */ if (p->p_memstat_state & P_MEMSTAT_INTERNAL) { goto exit; @@ -1366,8 +1938,18 @@ memorystatus_add(proc_t p, boolean_t locked) bucket = &memstat_bucket[p->p_memstat_effectivepriority]; - if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE_DEFERRED) { - assert(bucket->count == memorystatus_scheduled_idle_demotions); + if (isSysProc(p) && system_procs_aging_band && (p->p_memstat_effectivepriority == system_procs_aging_band)) { + assert(bucket->count == memorystatus_scheduled_idle_demotions_sysprocs - 1); + + } else if (isApp(p) && applications_aging_band && (p->p_memstat_effectivepriority == applications_aging_band)) { + assert(bucket->count == memorystatus_scheduled_idle_demotions_apps - 1); + + } else if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE) { + /* + * Entering the idle band. + * Record idle start time. 
+ */ + p->p_memstat_idle_start = mach_absolute_time(); } TAILQ_INSERT_TAIL(&bucket->list, p, p_memstat_list); @@ -1392,9 +1974,24 @@ memorystatus_add(proc_t p, boolean_t locked) * * Monitors transition between buckets and if necessary * will update cached memory limits accordingly. + * + * skip_demotion_check: + * - if the 'jetsam aging policy' is NOT 'legacy': + * When this flag is TRUE, it means we are going + * to age the ripe processes out of the aging bands and into the + * IDLE band and apply their inactive memory limits. + * + * - if the 'jetsam aging policy' is 'legacy': + * When this flag is TRUE, it might mean the above aging mechanism + * OR + * It might be that we have a process that has used up its 'idle deferral' + * stay that is given to it once per lifetime. And in this case, the process + * won't be going through any aging codepaths. But we still need to apply + * the right inactive limits and so we explicitly set this to TRUE if the + * new priority for the process is the IDLE band. */ -static void -memorystatus_update_priority_locked(proc_t p, int priority, boolean_t head_insert) +void +memorystatus_update_priority_locked(proc_t p, int priority, boolean_t head_insert, boolean_t skip_demotion_check) { memstat_bucket_t *old_bucket, *new_bucket; @@ -1404,18 +2001,71 @@ memorystatus_update_priority_locked(proc_t p, int priority, boolean_t head_inser if ((p->p_listflag & P_LIST_EXITED) != 0) { return; } - - MEMORYSTATUS_DEBUG(1, "memorystatus_update_priority_locked(): setting pid %d to priority %d, inserting at %s\n", - p->p_pid, priority, head_insert ? "head" : "tail"); + + MEMORYSTATUS_DEBUG(1, "memorystatus_update_priority_locked(): setting %s(%d) to priority %d, inserting at %s\n", + (*p->p_name ? p->p_name : "unknown"), p->p_pid, priority, head_insert ? 
"head" : "tail"); + + DTRACE_MEMORYSTATUS3(memorystatus_update_priority, proc_t, p, int32_t, p->p_memstat_effectivepriority, int, priority); + +#if DEVELOPMENT || DEBUG + if (priority == JETSAM_PRIORITY_IDLE && /* if the process is on its way into the IDLE band */ + skip_demotion_check == FALSE && /* and it isn't via the path that will set the INACTIVE memlimits */ + (p->p_memstat_dirty & P_DIRTY_TRACK) && /* and it has 'DIRTY' tracking enabled */ + ((p->p_memstat_memlimit != p->p_memstat_memlimit_inactive) || /* and we notice that the current limit isn't the right value (inactive) */ + ((p->p_memstat_state & P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL) ? ( ! (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT)) : (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT)))) /* OR type (fatal vs non-fatal) */ + panic("memorystatus_update_priority_locked: on %s with 0x%x, prio: %d and %d\n", p->p_name, p->p_memstat_state, priority, p->p_memstat_memlimit); /* then we must catch this */ +#endif /* DEVELOPMENT || DEBUG */ old_bucket = &memstat_bucket[p->p_memstat_effectivepriority]; - if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE_DEFERRED) { - assert(old_bucket->count == (memorystatus_scheduled_idle_demotions + 1)); + + if (skip_demotion_check == FALSE) { + + if (isSysProc(p)) { + /* + * For system processes, the memorystatus_dirty_* routines take care of adding/removing + * the processes from the aging bands and balancing the demotion counts. + * We can, however, override that if the process has an 'elevated inactive jetsam band' attribute. + */ + + if (priority <= JETSAM_PRIORITY_ELEVATED_INACTIVE && (p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND)) { + priority = JETSAM_PRIORITY_ELEVATED_INACTIVE; + + assert(! (p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS)); + } + } else if (isApp(p)) { + + /* + * Check to see if the application is being lowered in jetsam priority. 
If so, and: + * - it has an 'elevated inactive jetsam band' attribute, then put it in the JETSAM_PRIORITY_ELEVATED_INACTIVE band. + * - it is a normal application, then let it age in the aging band if that policy is in effect. + */ + + if (priority <= JETSAM_PRIORITY_ELEVATED_INACTIVE && (p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND)) { + priority = JETSAM_PRIORITY_ELEVATED_INACTIVE; + } else { + + if (applications_aging_band) { + if (p->p_memstat_effectivepriority == applications_aging_band) { + assert(old_bucket->count == (memorystatus_scheduled_idle_demotions_apps + 1)); + } + + if ((jetsam_aging_policy != kJetsamAgingPolicyLegacy) && (priority <= applications_aging_band)) { + assert(! (p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS)); + priority = applications_aging_band; + memorystatus_schedule_idle_demotion_locked(p, TRUE); + } + } + } + } + } + + if ((system_procs_aging_band && (priority == system_procs_aging_band)) || (applications_aging_band && (priority == applications_aging_band))) { + assert(p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS); } TAILQ_REMOVE(&old_bucket->list, p, p_memstat_list); old_bucket->count--; - + new_bucket = &memstat_bucket[priority]; if (head_insert) TAILQ_INSERT_HEAD(&new_bucket->list, p, p_memstat_list); @@ -1423,7 +2073,6 @@ memorystatus_update_priority_locked(proc_t p, int priority, boolean_t head_inser TAILQ_INSERT_TAIL(&new_bucket->list, p, p_memstat_list); new_bucket->count++; -#if CONFIG_JETSAM if (memorystatus_highwater_enabled) { boolean_t trigger_exception; @@ -1444,16 +2093,18 @@ memorystatus_update_priority_locked(proc_t p, int priority, boolean_t head_inser * but: * dirty <--> clean is ignored * - * We bypass processes that have opted into dirty tracking because + * We bypass non-idle processes that have opted into dirty tracking because * a move between buckets does not imply a transition between the * dirty <--> clean state. 
- * Setting limits on processes opted into dirty tracking is handled - * in memorystatus_dirty_set() where the transition is very clear. */ if (p->p_memstat_dirty & P_DIRTY_TRACK) { - ledger_update_needed = FALSE; + if (skip_demotion_check == TRUE && priority == JETSAM_PRIORITY_IDLE) { + CACHE_INACTIVE_LIMITS_LOCKED(p, trigger_exception); + } else { + ledger_update_needed = FALSE; + } } else if ((priority >= JETSAM_PRIORITY_FOREGROUND) && (p->p_memstat_effectivepriority < JETSAM_PRIORITY_FOREGROUND)) { /* @@ -1493,9 +2144,43 @@ memorystatus_update_priority_locked(proc_t p, int priority, boolean_t head_inser } } -#endif /* CONFIG_JETSAM */ - + /* + * Record idle start or idle delta. + */ + if (p->p_memstat_effectivepriority == priority) { + /* + * This process is not transitioning between + * jetsam priority buckets. Do nothing. + */ + } else if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE) { + uint64_t now; + /* + * Transitioning out of the idle priority bucket. + * Record idle delta. + */ + assert(p->p_memstat_idle_start != 0); + now = mach_absolute_time(); + if (now > p->p_memstat_idle_start) { + p->p_memstat_idle_delta = now - p->p_memstat_idle_start; + } + } else if (priority == JETSAM_PRIORITY_IDLE) { + /* + * Transitioning into the idle priority bucket. + * Record idle start. 
+ */ + p->p_memstat_idle_start = mach_absolute_time(); + } + p->p_memstat_effectivepriority = priority; + +#if CONFIG_SECLUDED_MEMORY + if (secluded_for_apps && + task_could_use_secluded_mem(p->task)) { + task_set_can_use_secluded_mem( + p->task, + (priority >= JETSAM_PRIORITY_FOREGROUND)); + } +#endif /* CONFIG_SECLUDED_MEMORY */ memorystatus_check_levels_locked(); } @@ -1542,21 +2227,16 @@ memorystatus_update(proc_t p, int priority, uint64_t user_data, boolean_t effect { int ret; boolean_t head_insert = false; - -#if !CONFIG_JETSAM -#pragma unused(update_memlimit, memlimit_active, memlimit_inactive) -#pragma unused(memlimit_active_is_fatal, memlimit_inactive_is_fatal) -#endif /* !CONFIG_JETSAM */ - MEMORYSTATUS_DEBUG(1, "memorystatus_update: changing pid %d: priority %d, user_data 0x%llx\n", p->p_pid, priority, user_data); + MEMORYSTATUS_DEBUG(1, "memorystatus_update: changing (%s) pid %d: priority %d, user_data 0x%llx\n", (*p->p_name ? p->p_name : "unknown"), p->p_pid, priority, user_data); KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_UPDATE) | DBG_FUNC_START, p->p_pid, priority, user_data, effective, 0); if (priority == -1) { /* Use as shorthand for default priority */ priority = JETSAM_PRIORITY_DEFAULT; - } else if (priority == JETSAM_PRIORITY_IDLE_DEFERRED) { - /* JETSAM_PRIORITY_IDLE_DEFERRED is reserved for internal use; if requested, adjust to JETSAM_PRIORITY_IDLE. */ + } else if ((priority == system_procs_aging_band) || (priority == applications_aging_band)) { + /* Both the aging bands are reserved for internal use; if requested, adjust to JETSAM_PRIORITY_IDLE. 
*/ priority = JETSAM_PRIORITY_IDLE; } else if (priority == JETSAM_PRIORITY_IDLE_HEAD) { /* JETSAM_PRIORITY_IDLE_HEAD inserts at the head of the idle queue */ @@ -1591,8 +2271,7 @@ memorystatus_update(proc_t p, int priority, uint64_t user_data, boolean_t effect p->p_memstat_state |= P_MEMSTAT_PRIORITYUPDATED; p->p_memstat_userdata = user_data; p->p_memstat_requestedpriority = priority; - -#if CONFIG_JETSAM + if (update_memlimit) { boolean_t trigger_exception; @@ -1618,7 +2297,7 @@ memorystatus_update(proc_t p, int priority, uint64_t user_data, boolean_t effect #if DEVELOPMENT || DEBUG printf("memorystatus_update: WARNING %s[%d] set unused flag P_MEMSTAT_MEMLIMIT_BACKGROUND [A==%dMB %s] [IA==%dMB %s]\n", - (p->p_comm ? p->p_comm : "unknown"), p->p_pid, + (*p->p_name ? p->p_name : "unknown"), p->p_pid, memlimit_active, (memlimit_active_is_fatal ? "F " : "NF"), memlimit_inactive, (memlimit_inactive_is_fatal ? "F " : "NF")); #endif /* DEVELOPMENT || DEBUG */ @@ -1692,21 +2371,33 @@ memorystatus_update(proc_t p, int priority, uint64_t user_data, boolean_t effect (p->p_memstat_dirty ? ((p->p_memstat_dirty & P_DIRTY) ? "isdirty" : "isclean") : "")); } } -#endif /* CONFIG_JETSAM */ /* - * We can't add to the JETSAM_PRIORITY_IDLE_DEFERRED bucket here. - * But, we could be removing it from the bucket. + * We can't add to the aging bands buckets here. + * But, we could be removing it from those buckets. * Check and take appropriate steps if so. */ - if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE_DEFERRED) { + if (isProcessInAgingBands(p)) { memorystatus_invalidate_idle_demotion_locked(p, TRUE); + memorystatus_update_priority_locked(p, JETSAM_PRIORITY_IDLE, FALSE, TRUE); + } else { + if (jetsam_aging_policy == kJetsamAgingPolicyLegacy && priority == JETSAM_PRIORITY_IDLE) { + /* + * Daemons with 'inactive' limits will go through the dirty tracking codepath. + * This path deals with apps that may have 'inactive' limits e.g. WebContent processes. 
+ * If this is the legacy aging policy we explicitly need to apply those limits. If it + * is any other aging policy, then we don't need to worry because all processes + * will go through the aging bands and then the demotion thread will take care to + * move them into the IDLE band and apply the required limits. + */ + memorystatus_update_priority_locked(p, priority, head_insert, TRUE); + } } - - memorystatus_update_priority_locked(p, priority, head_insert); - + + memorystatus_update_priority_locked(p, priority, head_insert, FALSE); + proc_list_unlock(); ret = 0; @@ -1721,6 +2412,7 @@ memorystatus_remove(proc_t p, boolean_t locked) { int ret; memstat_bucket_t *bucket; + boolean_t reschedule = FALSE; MEMORYSTATUS_DEBUG(1, "memorystatus_list_remove: removing pid %d\n", p->p_pid); @@ -1731,8 +2423,27 @@ memorystatus_remove(proc_t p, boolean_t locked) assert(!(p->p_memstat_state & P_MEMSTAT_INTERNAL)); bucket = &memstat_bucket[p->p_memstat_effectivepriority]; - if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE_DEFERRED) { - assert(bucket->count == memorystatus_scheduled_idle_demotions); + + if (isSysProc(p) && system_procs_aging_band && (p->p_memstat_effectivepriority == system_procs_aging_band)) { + + assert(bucket->count == memorystatus_scheduled_idle_demotions_sysprocs); + reschedule = TRUE; + + } else if (isApp(p) && applications_aging_band && (p->p_memstat_effectivepriority == applications_aging_band)) { + + assert(bucket->count == memorystatus_scheduled_idle_demotions_apps); + reschedule = TRUE; + } + + /* + * Record idle delta + */ + + if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE) { + uint64_t now = mach_absolute_time(); + if (now > p->p_memstat_idle_start) { + p->p_memstat_idle_delta = now - p->p_memstat_idle_start; + } } TAILQ_REMOVE(&bucket->list, p, p_memstat_list); @@ -1741,7 +2452,7 @@ memorystatus_remove(proc_t p, boolean_t locked) memorystatus_list_count--; /* If awaiting demotion to the idle band, clean up */ - if 
(p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE_DEFERRED) { + if (reschedule) { memorystatus_invalidate_idle_demotion_locked(p, TRUE); memorystatus_reschedule_idle_demotion_locked(); } @@ -1778,6 +2489,8 @@ memorystatus_remove(proc_t p, boolean_t locked) * Return: * 0 on success * non-0 on failure + * + * The proc_list_lock is held by the caller. */ static int @@ -1813,15 +2526,35 @@ memorystatus_update_idle_priority_locked(proc_t p) { int32_t priority; MEMORYSTATUS_DEBUG(1, "memorystatus_update_idle_priority_locked(): pid %d dirty 0x%X\n", p->p_pid, p->p_memstat_dirty); - + + assert(isSysProc(p)); + if ((p->p_memstat_dirty & (P_DIRTY_IDLE_EXIT_ENABLED|P_DIRTY_IS_DIRTY)) == P_DIRTY_IDLE_EXIT_ENABLED) { - priority = (p->p_memstat_dirty & P_DIRTY_DEFER_IN_PROGRESS) ? JETSAM_PRIORITY_IDLE_DEFERRED : JETSAM_PRIORITY_IDLE; + + priority = (p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS) ? system_procs_aging_band : JETSAM_PRIORITY_IDLE; } else { priority = p->p_memstat_requestedpriority; } if (priority != p->p_memstat_effectivepriority) { - memorystatus_update_priority_locked(p, priority, false); + + if ((jetsam_aging_policy == kJetsamAgingPolicyLegacy) && + (priority == JETSAM_PRIORITY_IDLE)) { + + /* + * This process is on its way into the IDLE band. The system is + * using 'legacy' jetsam aging policy. That means, this process + * has already used up its idle-deferral aging time that is given + * once per its lifetime. So we need to set the INACTIVE limits + * explicitly because it won't be going through the demotion paths + * that take care to apply the limits appropriately. + */ + assert((p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS) == 0); + memorystatus_update_priority_locked(p, priority, false, true); + + } else { + memorystatus_update_priority_locked(p, priority, false, false); + } } } @@ -1832,7 +2565,7 @@ memorystatus_update_idle_priority_locked(proc_t p) { * priority idle band when clean (and killed earlier, protecting higher priority procesess). 
* * If the deferral flag is set, then newly tracked processes will be protected for an initial period (as determined by - * memorystatus_idle_delay_time); if they go clean during this time, then they will be moved to a deferred-idle band + * memorystatus_sysprocs_idle_delay_time); if they go clean during this time, then they will be moved to a deferred-idle band * with a slightly higher priority, guarding against immediate termination under memory pressure and being unable to * make forward progress. Finally, when the guard expires, they will be moved to the standard, lowest-priority, idle * band. The deferral can be cleared early by clearing the appropriate flag. @@ -1888,10 +2621,11 @@ memorystatus_dirty_track(proc_t p, uint32_t pcontrol) { p->p_memstat_dirty |= P_DIRTY_LAUNCH_IN_PROGRESS; } - if (old_dirty & P_DIRTY_DEFER_IN_PROGRESS) { + if (old_dirty & P_DIRTY_AGING_IN_PROGRESS) { already_deferred = TRUE; } + /* This can be set and cleared exactly once. */ if (pcontrol & PROC_DIRTY_DEFER) { @@ -1910,24 +2644,45 @@ memorystatus_dirty_track(proc_t p, uint32_t pcontrol) { /* Kick off or invalidate the idle exit deferment if there's a state transition. */ if (!(p->p_memstat_dirty & P_DIRTY_IS_DIRTY)) { - if (((p->p_memstat_dirty & P_DIRTY_IDLE_EXIT_ENABLED) == P_DIRTY_IDLE_EXIT_ENABLED) && - defer_now && !already_deferred) { - - /* - * Request to defer a clean process that's idle-exit enabled - * and not already in the jetsam deferred band. - */ - memorystatus_schedule_idle_demotion_locked(p, TRUE); - reschedule = TRUE; + if ((p->p_memstat_dirty & P_DIRTY_IDLE_EXIT_ENABLED) == P_DIRTY_IDLE_EXIT_ENABLED) { - } else if (!defer_now && already_deferred) { + if (defer_now && !already_deferred) { + + /* + * Request to defer a clean process that's idle-exit enabled + * and not already in the jetsam deferred band. Most likely a + * new launch. 
+ */ + memorystatus_schedule_idle_demotion_locked(p, TRUE); + reschedule = TRUE; - /* - * Either the process is no longer idle-exit enabled OR - * there's a request to cancel a currently active deferral. - */ - memorystatus_invalidate_idle_demotion_locked(p, TRUE); - reschedule = TRUE; + } else if (!defer_now) { + + /* + * The process isn't asking for the 'aging' facility. + * Could be that it is: + */ + + if (already_deferred) { + /* + * already in the aging bands. Traditionally, + * some processes have tried to use this to + * opt out of the 'aging' facility. + */ + + memorystatus_invalidate_idle_demotion_locked(p, TRUE); + } else { + /* + * agnostic to the 'aging' facility. In that case, + * we'll go ahead and opt it in because this is likely + * a new launch (clean process, dirty tracking enabled) + */ + + memorystatus_schedule_idle_demotion_locked(p, TRUE); + } + + reschedule = TRUE; + } } } else { @@ -1937,13 +2692,13 @@ memorystatus_dirty_track(proc_t p, uint32_t pcontrol) { * deferred state or not? * * This could be a legal request like: - * - this process had opted into the JETSAM_DEFERRED band + * - this process had opted into the 'aging' band * - but it's now dirty and requests to opt out. * In this case, we remove the process from the band and reset its * state too. It'll opt back in properly when needed. * * OR, this request could be a user-space bug. E.g.: - * - this process had opted into the JETSAM_DEFERRED band when clean + * - this process had opted into the 'aging' band when clean * - and, then issues another request to again put it into the band except * this time the process is dirty. * The process going dirty, as a transition in memorystatus_dirty_set(), will pull the process out of @@ -1951,14 +2706,17 @@ memorystatus_dirty_track(proc_t p, uint32_t pcontrol) { * But we do it here anyways for coverage. * * memorystatus_update_idle_priority_locked() - * single-mindedly treats a dirty process as "cannot be in the deferred band". 
+ * single-mindedly treats a dirty process as "cannot be in the aging band". */ if (!defer_now && already_deferred) { memorystatus_invalidate_idle_demotion_locked(p, TRUE); reschedule = TRUE; } else { - memorystatus_invalidate_idle_demotion_locked(p, FALSE); + + boolean_t reset_state = (jetsam_aging_policy != kJetsamAgingPolicyLegacy) ? TRUE : FALSE; + + memorystatus_invalidate_idle_demotion_locked(p, reset_state); reschedule = TRUE; } } @@ -1986,7 +2744,6 @@ memorystatus_dirty_set(proc_t p, boolean_t self, uint32_t pcontrol) { boolean_t now_dirty = FALSE; MEMORYSTATUS_DEBUG(1, "memorystatus_dirty_set(): %d %d 0x%x 0x%x\n", self, p->p_pid, pcontrol, p->p_memstat_dirty); - KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DIRTY_SET), p->p_pid, self, pcontrol, 0, 0); proc_list_lock(); @@ -2052,62 +2809,65 @@ memorystatus_dirty_set(proc_t p, boolean_t self, uint32_t pcontrol) { (was_dirty == FALSE && now_dirty == TRUE)) { /* Manage idle exit deferral, if applied */ - if ((p->p_memstat_dirty & (P_DIRTY_IDLE_EXIT_ENABLED|P_DIRTY_DEFER_IN_PROGRESS)) == - (P_DIRTY_IDLE_EXIT_ENABLED|P_DIRTY_DEFER_IN_PROGRESS)) { + if ((p->p_memstat_dirty & P_DIRTY_IDLE_EXIT_ENABLED) == P_DIRTY_IDLE_EXIT_ENABLED) { /* - * P_DIRTY_DEFER_IN_PROGRESS means the process is in the deferred band OR it might be heading back - * there once it's clean again and has some protection window left. + * Legacy mode: P_DIRTY_AGING_IN_PROGRESS means the process is in the aging band OR it might be heading back + * there once it's clean again. For the legacy case, this only applies if it has some protection window left. + * + * Non-Legacy mode: P_DIRTY_AGING_IN_PROGRESS means the process is in the aging band. It will always stop over + * in that band on it's way to IDLE. */ if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) { /* * New dirty process i.e. "was_dirty == FALSE && now_dirty == TRUE" * - * The process will move from the deferred band to its higher requested - * jetsam band. 
But we don't clear its state i.e. we want to remember that - * this process was part of the "deferred" band and will return to it. - * - * This way, we don't let it age beyond the protection - * window when it returns to "clean". All the while giving - * it a chance to perform its work while "dirty". - * + * The process will move from its aging band to its higher requested + * jetsam band. */ - memorystatus_invalidate_idle_demotion_locked(p, FALSE); + boolean_t reset_state = (jetsam_aging_policy != kJetsamAgingPolicyLegacy) ? TRUE : FALSE; + + memorystatus_invalidate_idle_demotion_locked(p, reset_state); reschedule = TRUE; } else { /* * Process is back from "dirty" to "clean". - * - * Is its timer up OR does it still have some protection - * window left? */ - if (mach_absolute_time() >= p->p_memstat_idledeadline) { - /* - * The process' deadline has expired. It currently - * does not reside in the DEFERRED bucket. - * - * It's on its way to the JETSAM_PRIORITY_IDLE - * bucket via memorystatus_update_idle_priority_locked() - * below. - - * So all we need to do is reset all the state on the - * process that's related to the DEFERRED bucket i.e. - * the DIRTY_DEFER_IN_PROGRESS flag and the timer deadline. - * - */ + if (jetsam_aging_policy == kJetsamAgingPolicyLegacy) { + if (mach_absolute_time() >= p->p_memstat_idledeadline) { + /* + * The process' deadline has expired. It currently + * does not reside in any of the aging buckets. + * + * It's on its way to the JETSAM_PRIORITY_IDLE + * bucket via memorystatus_update_idle_priority_locked() + * below. + + * So all we need to do is reset all the state on the + * process that's related to the aging bucket i.e. + * the AGING_IN_PROGRESS flag and the timer deadline. 
+ */ - memorystatus_invalidate_idle_demotion_locked(p, TRUE); - reschedule = TRUE; + memorystatus_invalidate_idle_demotion_locked(p, TRUE); + reschedule = TRUE; + } else { + /* + * It still has some protection window left and so + * we just re-arm the timer without modifying any + * state on the process iff it still wants into that band. + */ + + if (p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS) { + memorystatus_schedule_idle_demotion_locked(p, FALSE); + reschedule = TRUE; + } + } } else { - /* - * It still has some protection window left and so - * we just re-arm the timer without modifying any - * state on the process. - */ - memorystatus_schedule_idle_demotion_locked(p, FALSE); + + memorystatus_schedule_idle_demotion_locked(p, TRUE); reschedule = TRUE; } } @@ -2115,9 +2875,8 @@ memorystatus_dirty_set(proc_t p, boolean_t self, uint32_t pcontrol) { memorystatus_update_idle_priority_locked(p); -#if CONFIG_JETSAM if (memorystatus_highwater_enabled) { - boolean_t trigger_exception; + boolean_t trigger_exception = FALSE, ledger_update_needed = TRUE; /* * We are in this path because this process transitioned between * dirty <--> clean state. Update the cached memory limits. @@ -2128,11 +2887,21 @@ memorystatus_dirty_set(proc_t p, boolean_t self, uint32_t pcontrol) { * process is dirty */ CACHE_ACTIVE_LIMITS_LOCKED(p, trigger_exception); + ledger_update_needed = TRUE; } else { /* - * process is clean + * process is clean...but if it has opted into pressured-exit + * we don't apply the INACTIVE limit till the process has aged + * out and is entering the IDLE band. + * See memorystatus_update_priority_locked() for that. 
*/ - CACHE_INACTIVE_LIMITS_LOCKED(p, trigger_exception); + + if (p->p_memstat_dirty & P_DIRTY_ALLOW_IDLE_EXIT) { + ledger_update_needed = FALSE; + } else { + CACHE_INACTIVE_LIMITS_LOCKED(p, trigger_exception); + ledger_update_needed = TRUE; + } } /* @@ -2144,7 +2913,7 @@ memorystatus_dirty_set(proc_t p, boolean_t self, uint32_t pcontrol) { * See rdar://21394491. */ - if (proc_ref_locked(p) == p) { + if (ledger_update_needed && proc_ref_locked(p) == p) { int ledger_limit; if (p->p_memstat_memlimit > 0) { ledger_limit = p->p_memstat_memlimit; @@ -2163,7 +2932,6 @@ memorystatus_dirty_set(proc_t p, boolean_t self, uint32_t pcontrol) { } } -#endif /* CONFIG_JETSAM */ /* If the deferral state changed, reschedule the demotion timer */ if (reschedule) { @@ -2352,6 +3120,9 @@ memorystatus_on_inactivity(proc_t p) #endif } +/* + * The proc_list_lock is held by the caller. +*/ static uint32_t memorystatus_build_state(proc_t p) { uint32_t snapshot_state = 0; @@ -2390,10 +3161,16 @@ kill_idle_exit_proc(void) uint64_t current_time; boolean_t killed = FALSE; unsigned int i = 0; + os_reason_t jetsam_reason = OS_REASON_NULL; /* Pick next idle exit victim. */ current_time = mach_absolute_time(); + jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_IDLE_EXIT); + if (jetsam_reason == OS_REASON_NULL) { + printf("kill_idle_exit_proc: failed to allocate jetsam reason\n"); + } + proc_list_lock(); p = memorystatus_get_first_proc_locked(&i, FALSE); @@ -2417,9 +3194,11 @@ kill_idle_exit_proc(void) proc_list_unlock(); if (victim_p) { - printf("memorystatus_thread: idle exiting pid %d [%s]\n", victim_p->p_pid, (victim_p->p_comm ? victim_p->p_comm : "(unknown)")); - killed = memorystatus_do_kill(victim_p, kMemorystatusKilledIdleExit); + printf("memorystatus_thread: idle exiting pid %d [%s]\n", victim_p->p_pid, (*victim_p->p_name ? 
victim_p->p_name : "(unknown)")); + killed = memorystatus_do_kill(victim_p, kMemorystatusKilledIdleExit, jetsam_reason); proc_rele(victim_p); + } else { + os_reason_free(jetsam_reason); } return killed; @@ -2457,12 +3236,14 @@ memorystatus_thread(void *param __unused, wait_result_t wr __unused) uint32_t errors = 0; uint32_t hwm_kill = 0; boolean_t sort_flag = TRUE; + boolean_t corpse_list_purged = FALSE; /* Jetsam Loop Detection - locals */ memstat_bucket_t *bucket; int jld_bucket_count = 0; struct timeval jld_now_tstamp = {0,0}; uint64_t jld_now_msecs = 0; + int elevated_bucket_count = 0; /* Jetsam Loop Detection - statics */ static uint64_t jld_timestamp_msecs = 0; @@ -2508,14 +3289,25 @@ memorystatus_thread(void *param __unused, wait_result_t wr __unused) boolean_t killed; int32_t priority; uint32_t cause; + uint64_t jetsam_reason_code = JETSAM_REASON_INVALID; + os_reason_t jetsam_reason = OS_REASON_NULL; - if (kill_under_pressure_cause) { - cause = kill_under_pressure_cause; - } else { - cause = kMemorystatusKilledVMPageShortage; + cause = kill_under_pressure_cause; + switch (cause) { + case kMemorystatusKilledFCThrashing: + jetsam_reason_code = JETSAM_REASON_MEMORY_FCTHRASHING; + break; + case kMemorystatusKilledVMThrashing: + jetsam_reason_code = JETSAM_REASON_MEMORY_VMTHRASHING; + break; + case kMemorystatusKilledVMPageShortage: + /* falls through */ + default: + jetsam_reason_code = JETSAM_REASON_MEMORY_VMPAGESHORTAGE; + cause = kMemorystatusKilledVMPageShortage; + break; } -#if LEGACY_HIWATER /* Highwater */ killed = memorystatus_kill_hiwat_proc(&errors); if (killed) { @@ -2542,7 +3334,12 @@ memorystatus_thread(void *param __unused, wait_result_t wr __unused) break; } -#endif + + jetsam_reason = os_reason_create(OS_REASON_JETSAM, jetsam_reason_code); + if (jetsam_reason == OS_REASON_NULL) { + printf("memorystatus_thread: failed to allocate jetsam reason\n"); + } + if (memorystatus_jld_enabled == TRUE) { /* @@ -2559,10 +3356,32 @@ memorystatus_thread(void 
*param __unused, wait_result_t wr __unused) jld_now_msecs = (jld_now_tstamp.tv_sec * 1000); proc_list_lock(); - bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE]; - jld_bucket_count = bucket->count; - bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE_DEFERRED]; - jld_bucket_count += bucket->count; + switch (jetsam_aging_policy) { + case kJetsamAgingPolicyLegacy: + bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE]; + jld_bucket_count = bucket->count; + bucket = &memstat_bucket[JETSAM_PRIORITY_AGING_BAND1]; + jld_bucket_count += bucket->count; + break; + case kJetsamAgingPolicySysProcsReclaimedFirst: + case kJetsamAgingPolicyAppsReclaimedFirst: + bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE]; + jld_bucket_count = bucket->count; + bucket = &memstat_bucket[system_procs_aging_band]; + jld_bucket_count += bucket->count; + bucket = &memstat_bucket[applications_aging_band]; + jld_bucket_count += bucket->count; + break; + case kJetsamAgingPolicyNone: + default: + bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE]; + jld_bucket_count = bucket->count; + break; + } + + bucket = &memstat_bucket[JETSAM_PRIORITY_ELEVATED_INACTIVE]; + elevated_bucket_count = bucket->count; + proc_list_unlock(); /* @@ -2585,7 +3404,29 @@ memorystatus_thread(void *param __unused, wait_result_t wr __unused) if (jld_idle_kills > jld_idle_kill_candidates) { jld_eval_aggressive_count++; - if (jld_eval_aggressive_count > memorystatus_jld_eval_aggressive_count) { + +#if DEVELOPMENT || DEBUG + printf("memorystatus: aggressive%d: beginning of window: %lld ms, : timestamp now: %lld ms\n", + jld_eval_aggressive_count, + jld_timestamp_msecs, + jld_now_msecs); + printf("memorystatus: aggressive%d: idle candidates: %d, idle kills: %d\n", + jld_eval_aggressive_count, + jld_idle_kill_candidates, + jld_idle_kills); +#endif /* DEVELOPMENT || DEBUG */ + + if ((jld_eval_aggressive_count == memorystatus_jld_eval_aggressive_count) && + (total_corpses_count > 0) && (corpse_list_purged == FALSE)) { + /* + * If we reach this aggressive 
cycle, corpses might be causing memory pressure. + * So, in an effort to avoid jetsams in the FG band, we will attempt to purge + * corpse memory prior to this final march through JETSAM_PRIORITY_UI_SUPPORT. + */ + task_purge_all_corpses(); + corpse_list_purged = TRUE; + } + else if (jld_eval_aggressive_count > memorystatus_jld_eval_aggressive_count) { /* * Bump up the jetsam priority limit (eg: the bucket index) * Enforce bucket index sanity. @@ -2600,24 +3441,76 @@ memorystatus_thread(void *param __unused, wait_result_t wr __unused) } } + /* Visit elevated processes first */ + while (elevated_bucket_count) { + + elevated_bucket_count--; + + /* + * memorystatus_kill_elevated_process() drops a reference, + * so take another one so we can continue to use this exit reason + * even after it returns. + */ + + os_reason_ref(jetsam_reason); + killed = memorystatus_kill_elevated_process( + kMemorystatusKilledVMThrashing, + jetsam_reason, + jld_eval_aggressive_count, + &errors); + + if (killed) { + post_snapshot = TRUE; + if (memorystatus_available_pages <= memorystatus_available_pages_pressure) { + /* + * Still under pressure. + * Find another pinned processes. + */ + continue; + } else { + goto done; + } + } else { + /* + * No pinned processes left to kill. + * Abandon elevated band. 
+ */ + break; + } + } + + /* + * memorystatus_kill_top_process_aggressive() drops a reference, + * so take another one so we can continue to use this exit reason + * even after it returns + */ + os_reason_ref(jetsam_reason); killed = memorystatus_kill_top_process_aggressive( TRUE, kMemorystatusKilledVMThrashing, + jetsam_reason, jld_eval_aggressive_count, jld_priority_band_max, &errors); - if (killed) { /* Always generate logs after aggressive kill */ post_snapshot = TRUE; + jld_idle_kills = 0; goto done; } } } - + + /* + * memorystatus_kill_top_process() drops a reference, + * so take another one so we can continue to use this exit reason + * even after it returns + */ + os_reason_ref(jetsam_reason); + /* LRU */ - killed = memorystatus_kill_top_process(TRUE, sort_flag, cause, &priority, &errors); + killed = memorystatus_kill_top_process(TRUE, sort_flag, cause, jetsam_reason, &priority, &errors); sort_flag = FALSE; if (killed) { @@ -2632,7 +3525,7 @@ memorystatus_thread(void *param __unused, wait_result_t wr __unused) /* Jetsam Loop Detection */ if (memorystatus_jld_enabled == TRUE) { - if ((priority == JETSAM_PRIORITY_IDLE) || (priority == JETSAM_PRIORITY_IDLE_DEFERRED)) { + if ((priority == JETSAM_PRIORITY_IDLE) || (priority == system_procs_aging_band) || (priority == applications_aging_band)) { jld_idle_kills++; } else { /* @@ -2641,12 +3534,33 @@ memorystatus_thread(void *param __unused, wait_result_t wr __unused) */ } } + + if ((priority >= JETSAM_PRIORITY_UI_SUPPORT) && (total_corpses_count > 0) && (corpse_list_purged == FALSE)) { + /* + * If we have jetsammed a process in or above JETSAM_PRIORITY_UI_SUPPORT + * then we attempt to relieve pressure by purging corpse memory. + */ + task_purge_all_corpses(); + corpse_list_purged = TRUE; + } goto done; } if (memorystatus_available_pages <= memorystatus_available_pages_critical) { - /* Under pressure and unable to kill a process - panic */ - panic("memorystatus_jetsam_thread: no victim! 
available pages:%d\n", memorystatus_available_pages); + /* + * Still under pressure and unable to kill a process - purge corpse memory + */ + if (total_corpses_count > 0) { + task_purge_all_corpses(); + corpse_list_purged = TRUE; + } + + if (memorystatus_available_pages <= memorystatus_available_pages_critical) { + /* + * Still under pressure and unable to kill a process - panic + */ + panic("memorystatus_jetsam_thread: no victim! available pages:%d\n", memorystatus_available_pages); + } } done: @@ -2660,6 +3574,8 @@ memorystatus_thread(void *param __unused, wait_result_t wr __unused) kill_under_pressure_cause = 0; vm_thrashing_jetsam_done(); } + + os_reason_free(jetsam_reason); } kill_under_pressure_cause = 0; @@ -2680,18 +3596,23 @@ memorystatus_thread(void *param __unused, wait_result_t wr __unused) #endif if (post_snapshot) { + proc_list_lock(); size_t snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) + sizeof(memorystatus_jetsam_snapshot_entry_t) * (memorystatus_jetsam_snapshot_count); uint64_t timestamp_now = mach_absolute_time(); memorystatus_jetsam_snapshot->notification_time = timestamp_now; + memorystatus_jetsam_snapshot->js_gencount++; if (memorystatus_jetsam_snapshot_last_timestamp == 0 || timestamp_now > memorystatus_jetsam_snapshot_last_timestamp + memorystatus_jetsam_snapshot_timeout) { + proc_list_unlock(); int ret = memorystatus_send_note(kMemorystatusSnapshotNote, &snapshot_size, sizeof(snapshot_size)); if (!ret) { proc_list_lock(); memorystatus_jetsam_snapshot_last_timestamp = timestamp_now; proc_list_unlock(); } + } else { + proc_list_unlock(); } } @@ -2722,64 +3643,97 @@ boolean_t memorystatus_idle_exit_from_VM(void) { } #endif /* !CONFIG_JETSAM */ -#if CONFIG_JETSAM - /* - * Callback invoked when allowable physical memory footprint exceeded - * (dirty pages + IOKit mappings) - * - * This is invoked for both advisory, non-fatal per-task high watermarks, - * as well as the fatal task memory limits. 
+ * Returns TRUE: + * when exceeding ledger footprint is fatal. + * Returns FALSE: + * when exceeding ledger footprint is non fatal. */ -void -memorystatus_on_ledger_footprint_exceeded(boolean_t warning, const int max_footprint_mb) +boolean_t +memorystatus_turnoff_exception_and_get_fatalness(boolean_t warning, const int max_footprint_mb) { - boolean_t is_active; - boolean_t is_fatal; - proc_t p = current_proc(); + boolean_t is_fatal; proc_list_lock(); - is_active = proc_jetsam_state_is_active_locked(p); is_fatal = (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT); if (warning == FALSE) { + boolean_t is_active; + boolean_t state_changed = FALSE; + /* - * We only want the EXC_RESOURCE to trigger once per lifetime - * of the active/inactive limit state. So, here, we detect the - * active/inactive state of the process and mark the - * state as exception has been triggered. + * We are here because a process has exceeded its ledger limit. + * That is, the process is no longer in the limit warning range. + * + * When a process exceeds its ledger limit, we want an EXC_RESOURCE + * to trigger, but only once per process per limit. We enforce that + * here, by identifying the active/inactive limit type. We then turn + * off the exception state by marking the limit as exception triggered. 
*/ + + is_active = proc_jetsam_state_is_active_locked(p); + if (is_active == TRUE) { /* * turn off exceptions for active state */ - p->p_memstat_state |= P_MEMSTAT_MEMLIMIT_ACTIVE_EXC_TRIGGERED; + if (!(p->p_memstat_state & P_MEMSTAT_MEMLIMIT_ACTIVE_EXC_TRIGGERED)) { + p->p_memstat_state |= P_MEMSTAT_MEMLIMIT_ACTIVE_EXC_TRIGGERED; + state_changed = TRUE; + } } else { /* * turn off exceptions for inactive state */ - p->p_memstat_state |= P_MEMSTAT_MEMLIMIT_INACTIVE_EXC_TRIGGERED; + if (!(p->p_memstat_state & P_MEMSTAT_MEMLIMIT_INACTIVE_EXC_TRIGGERED)) { + p->p_memstat_state |= P_MEMSTAT_MEMLIMIT_INACTIVE_EXC_TRIGGERED; + state_changed = TRUE; + } } /* + * The limit violation is logged here, but only once per process per limit. + * This avoids excessive logging when a process consistently exceeds a soft limit. * Soft memory limit is a non-fatal high-water-mark * Hard memory limit is a fatal custom-task-limit or system-wide per-task memory limit. */ - printf("process %d (%s) exceeded physical memory footprint, the %s%sMemoryLimit of %d MB\n", - p->p_pid, p->p_comm, (is_active ? "Active" : "Inactive"), - (is_fatal ? "Hard" : "Soft"), max_footprint_mb); + if(state_changed) { + printf("process %d (%s) exceeded physical memory footprint, the %s%sMemoryLimit of %d MB\n", + p->p_pid, (*p->p_name ? p->p_name : "unknown"), (is_active ? "Active" : "Inactive"), + (is_fatal ? "Hard" : "Soft"), max_footprint_mb); + } } - proc_list_unlock(); + return is_fatal; +} + +/* + * Callback invoked when allowable physical memory footprint exceeded + * (dirty pages + IOKit mappings) + * + * This is invoked for both advisory, non-fatal per-task high watermarks, + * as well as the fatal task memory limits. + */ +void +memorystatus_on_ledger_footprint_exceeded(boolean_t warning, boolean_t is_fatal) +{ + os_reason_t jetsam_reason = OS_REASON_NULL; + + proc_t p = current_proc(); + #if VM_PRESSURE_EVENTS if (warning == TRUE) { - if (memorystatus_warn_process(p->p_pid, TRUE /* critical? 
*/) != TRUE) { + /* + * This is a warning path which implies that the current process is close, but has + * not yet exceeded its per-process memory limit. + */ + if (memorystatus_warn_process(p->p_pid, FALSE /* not exceeded */) != TRUE) { /* Print warning, since it's possible that task has not registered for pressure notifications */ - printf("task_exceeded_footprint: failed to warn the current task (exiting, or no handler registered?).\n"); + printf("task_exceeded_footprint: failed to warn the current task (%d exiting, or no handler registered?).\n", p->p_pid); } return; } @@ -2790,7 +3744,15 @@ memorystatus_on_ledger_footprint_exceeded(boolean_t warning, const int max_footp * If this process has no high watermark or has a fatal task limit, then we have been invoked because the task * has violated either the system-wide per-task memory limit OR its own task limit. */ - if (memorystatus_kill_process_sync(p->p_pid, kMemorystatusKilledPerProcessLimit) != TRUE) { + jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_PERPROCESSLIMIT); + if (jetsam_reason == NULL) { + printf("task_exceeded footprint: failed to allocate jetsam reason\n"); + } else if (corpse_for_fatal_memkill != 0) { + /* Set OS_REASON_FLAG_GENERATE_CRASH_REPORT to generate corpse */ + jetsam_reason->osr_flags |= OS_REASON_FLAG_GENERATE_CRASH_REPORT; + } + + if (memorystatus_kill_process_sync(p->p_pid, kMemorystatusKilledPerProcessLimit, jetsam_reason) != TRUE) { printf("task_exceeded_footprint: failed to kill the current task (exiting?).\n"); } } else { @@ -2799,9 +3761,175 @@ memorystatus_on_ledger_footprint_exceeded(boolean_t warning, const int max_footp * See comment near its declaration for more details. */ memorystatus_hwm_candidates = TRUE; + +#if VM_PRESSURE_EVENTS + /* + * The current process is not in the warning path. + * This path implies the current process has exceeded a non-fatal (soft) memory limit. + * Failure to send note is ignored here. 
+ */ + (void)memorystatus_warn_process(p->p_pid, TRUE /* exceeded */); + +#endif /* VM_PRESSURE_EVENTS */ + } +} + +/* + * Description: + * Evaluates active vs. inactive process state. + * Processes that opt into dirty tracking are evaluated + * based on clean vs dirty state. + * dirty ==> active + * clean ==> inactive + * + * Processes that do not opt into dirty tracking are + * evaluated based on priority level. + * Foreground or above ==> active + * Below Foreground ==> inactive + * + * Return: TRUE if active + * False if inactive + */ + +static boolean_t +proc_jetsam_state_is_active_locked(proc_t p) { + + if (p->p_memstat_dirty & P_DIRTY_TRACK) { + /* + * process has opted into dirty tracking + * active state is based on dirty vs. clean + */ + if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) { + /* + * process is dirty + * implies active state + */ + return TRUE; + } else { + /* + * process is clean + * implies inactive state + */ + return FALSE; + } + } else if (p->p_memstat_effectivepriority >= JETSAM_PRIORITY_FOREGROUND) { + /* + * process is Foreground or higher + * implies active state + */ + return TRUE; + } else { + /* + * process found below Foreground + * implies inactive state + */ + return FALSE; + } +} + +static boolean_t +memorystatus_kill_process_sync(pid_t victim_pid, uint32_t cause, os_reason_t jetsam_reason) { + boolean_t res; + +#if CONFIG_JETSAM + uint32_t errors = 0; + + if (victim_pid == -1) { + /* No pid, so kill first process */ + res = memorystatus_kill_top_process(TRUE, TRUE, cause, jetsam_reason, NULL, &errors); + } else { + res = memorystatus_kill_specific_process(victim_pid, cause, jetsam_reason); + } + + if (errors) { + memorystatus_clear_errors(); + } + + if (res == TRUE) { + /* Fire off snapshot notification */ + proc_list_lock(); + size_t snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) + + sizeof(memorystatus_jetsam_snapshot_entry_t) * memorystatus_jetsam_snapshot_count; + uint64_t timestamp_now = mach_absolute_time(); + 
memorystatus_jetsam_snapshot->notification_time = timestamp_now; + if (memorystatus_jetsam_snapshot_last_timestamp == 0 || + timestamp_now > memorystatus_jetsam_snapshot_last_timestamp + memorystatus_jetsam_snapshot_timeout) { + proc_list_unlock(); + int ret = memorystatus_send_note(kMemorystatusSnapshotNote, &snapshot_size, sizeof(snapshot_size)); + if (!ret) { + proc_list_lock(); + memorystatus_jetsam_snapshot_last_timestamp = timestamp_now; + proc_list_unlock(); + } + } else { + proc_list_unlock(); + } + } +#else /* !CONFIG_JETSAM */ + + res = memorystatus_kill_specific_process(victim_pid, cause, jetsam_reason); + +#endif /* CONFIG_JETSAM */ + + return res; +} + +/* + * Jetsam a specific process. + */ +static boolean_t +memorystatus_kill_specific_process(pid_t victim_pid, uint32_t cause, os_reason_t jetsam_reason) { + boolean_t killed; + proc_t p; + uint64_t killtime = 0; + clock_sec_t tv_sec; + clock_usec_t tv_usec; + uint32_t tv_msec; + + /* TODO - add a victim queue and push this into the main jetsam thread */ + + p = proc_find(victim_pid); + if (!p) { + os_reason_free(jetsam_reason); + return FALSE; + } + + proc_list_lock(); + +#if CONFIG_JETSAM + if (memorystatus_jetsam_snapshot_count == 0) { + memorystatus_init_jetsam_snapshot_locked(NULL,0); } + + killtime = mach_absolute_time(); + absolutetime_to_microtime(killtime, &tv_sec, &tv_usec); + tv_msec = tv_usec / 1000; + + memorystatus_update_jetsam_snapshot_entry_locked(p, cause, killtime); + + proc_list_unlock(); + + printf("%lu.%02d memorystatus: specifically killing pid %d [%s] (%s %d) - memorystatus_available_pages: %d\n", + (unsigned long)tv_sec, tv_msec, victim_pid, (*p->p_name ? 
p->p_name : "(unknown)"), + jetsam_kill_cause_name[cause], p->p_memstat_effectivepriority, memorystatus_available_pages); +#else /* !CONFIG_JETSAM */ + proc_list_unlock(); + + killtime = mach_absolute_time(); + absolutetime_to_microtime(killtime, &tv_sec, &tv_usec); + tv_msec = tv_usec / 1000; + printf("%lu.%02d memorystatus: specifically killing pid %d [%s] (%s %d)\n", + (unsigned long)tv_sec, tv_msec, victim_pid, (*p->p_name ? p->p_name : "(unknown)"), + jetsam_kill_cause_name[cause], p->p_memstat_effectivepriority); +#endif /* CONFIG_JETSAM */ + + killed = memorystatus_do_kill(p, cause, jetsam_reason); + proc_rele(p); + + return killed; } + /* * Toggle the P_MEMSTAT_TERMINATED state. * Takes the proc_list_lock. @@ -2828,6 +3956,8 @@ proc_memstat_terminated(proc_t p, boolean_t set) return; } + +#if CONFIG_JETSAM /* * This is invoked when cpulimits have been exceeded while in fatal mode. * The jetsam_flags do not apply as those are for memory related kills. @@ -2840,54 +3970,270 @@ jetsam_on_ledger_cpulimit_exceeded(void) int retval = 0; int jetsam_flags = 0; /* make it obvious */ proc_t p = current_proc(); + os_reason_t jetsam_reason = OS_REASON_NULL; printf("task_exceeded_cpulimit: killing pid %d [%s]\n", - p->p_pid, (p->p_comm ? p->p_comm : "(unknown)")); + p->p_pid, (*p->p_name ? 
p->p_name : "(unknown)")); - retval = jetsam_do_kill(p, jetsam_flags); + jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_CPULIMIT); + if (jetsam_reason == OS_REASON_NULL) { + printf("task_exceeded_cpulimit: unable to allocate memory for jetsam reason\n"); + } + + retval = jetsam_do_kill(p, jetsam_flags, jetsam_reason); if (retval) { printf("task_exceeded_cpulimit: failed to kill current task (exiting?).\n"); } } +static void +memorystatus_get_task_memory_region_count(task_t task, uint64_t *count) +{ + assert(task); + assert(count); + + *count = get_task_memory_region_count(task); +} + static void memorystatus_get_task_page_counts(task_t task, uint32_t *footprint, uint32_t *max_footprint, uint32_t *max_footprint_lifetime, uint32_t *purgeable_pages) { assert(task); assert(footprint); - - *footprint = (uint32_t)(get_task_phys_footprint(task) / PAGE_SIZE_64); + + uint64_t pages; + + pages = (get_task_phys_footprint(task) / PAGE_SIZE_64); + assert(((uint32_t)pages) == pages); + *footprint = (uint32_t)pages; + if (max_footprint) { - *max_footprint = (uint32_t)(get_task_phys_footprint_max(task) / PAGE_SIZE_64); + pages = (get_task_phys_footprint_max(task) / PAGE_SIZE_64); + assert(((uint32_t)pages) == pages); + *max_footprint = (uint32_t)pages; } if (max_footprint_lifetime) { - *max_footprint_lifetime = (uint32_t)(get_task_resident_max(task) / PAGE_SIZE_64); + pages = (get_task_resident_max(task) / PAGE_SIZE_64); + assert(((uint32_t)pages) == pages); + *max_footprint_lifetime = (uint32_t)pages; } if (purgeable_pages) { - *purgeable_pages = (uint32_t)(get_task_purgeable_size(task) / PAGE_SIZE_64); + pages = (get_task_purgeable_size(task) / PAGE_SIZE_64); + assert(((uint32_t)pages) == pages); + *purgeable_pages = (uint32_t)pages; + } +} + +static void +memorystatus_get_task_phys_footprint_page_counts(task_t task, + uint64_t *internal_pages, uint64_t *internal_compressed_pages, + uint64_t *purgeable_nonvolatile_pages, uint64_t 
*purgeable_nonvolatile_compressed_pages, + uint64_t *alternate_accounting_pages, uint64_t *alternate_accounting_compressed_pages, + uint64_t *iokit_mapped_pages, uint64_t *page_table_pages) +{ + assert(task); + + if (internal_pages) { + *internal_pages = (get_task_internal(task) / PAGE_SIZE_64); + } + + if (internal_compressed_pages) { + *internal_compressed_pages = (get_task_internal_compressed(task) / PAGE_SIZE_64); + } + + if (purgeable_nonvolatile_pages) { + *purgeable_nonvolatile_pages = (get_task_purgeable_nonvolatile(task) / PAGE_SIZE_64); + } + + if (purgeable_nonvolatile_compressed_pages) { + *purgeable_nonvolatile_compressed_pages = (get_task_purgeable_nonvolatile_compressed(task) / PAGE_SIZE_64); + } + + if (alternate_accounting_pages) { + *alternate_accounting_pages = (get_task_alternate_accounting(task) / PAGE_SIZE_64); + } + + if (alternate_accounting_compressed_pages) { + *alternate_accounting_compressed_pages = (get_task_alternate_accounting_compressed(task) / PAGE_SIZE_64); + } + + if (iokit_mapped_pages) { + *iokit_mapped_pages = (get_task_iokit_mapped(task) / PAGE_SIZE_64); + } + + if (page_table_pages) { + *page_table_pages = (get_task_page_table(task) / PAGE_SIZE_64); } } +/* + * This routine only acts on the global jetsam event snapshot. + * Updating the process's entry can race when the memorystatus_thread + * has chosen to kill a process that is racing to exit on another core. + */ static void -memorystatus_update_jetsam_snapshot_entry_locked(proc_t p, uint32_t kill_cause) +memorystatus_update_jetsam_snapshot_entry_locked(proc_t p, uint32_t kill_cause, uint64_t killtime) { + memorystatus_jetsam_snapshot_entry_t *entry = NULL; + memorystatus_jetsam_snapshot_t *snapshot = NULL; + memorystatus_jetsam_snapshot_entry_t *snapshot_list = NULL; + unsigned int i; + if (memorystatus_jetsam_snapshot_count == 0) { + /* + * No active snapshot. + * Nothing to do. 
+ */ + return; + } + + /* + * Sanity check as this routine should only be called + * from a jetsam kill path. + */ + assert(kill_cause != 0 && killtime != 0); + + snapshot = memorystatus_jetsam_snapshot; + snapshot_list = memorystatus_jetsam_snapshot->entries; + for (i = 0; i < memorystatus_jetsam_snapshot_count; i++) { - if (memorystatus_jetsam_snapshot_list[i].pid == p->p_pid) { - /* Update if the priority has changed since the snapshot was taken */ - if (memorystatus_jetsam_snapshot_list[i].priority != p->p_memstat_effectivepriority) { - memorystatus_jetsam_snapshot_list[i].priority = p->p_memstat_effectivepriority; - strlcpy(memorystatus_jetsam_snapshot_list[i].name, p->p_comm, MAXCOMLEN+1); - memorystatus_jetsam_snapshot_list[i].state = memorystatus_build_state(p); - memorystatus_jetsam_snapshot_list[i].user_data = p->p_memstat_userdata; - memorystatus_jetsam_snapshot_list[i].fds = p->p_fd->fd_nfiles; + if (snapshot_list[i].pid == p->p_pid) { + + entry = &snapshot_list[i]; + + if (entry->killed || entry->jse_killtime) { + /* + * We apparently raced on the exit path + * for this process, as its snapshot entry + * has already recorded a kill. + */ + assert(entry->killed && entry->jse_killtime); + break; + } + + /* + * Update the entry we just found in the snapshot. + */ + + entry->killed = kill_cause; + entry->jse_killtime = killtime; + entry->jse_gencount = snapshot->js_gencount; + entry->jse_idle_delta = p->p_memstat_idle_delta; + + /* + * If a process has moved between bands since snapshot was + * initialized, then likely these fields changed too. + */ + if (entry->priority != p->p_memstat_effectivepriority) { + + strlcpy(entry->name, p->p_name, sizeof(entry->name)); + entry->priority = p->p_memstat_effectivepriority; + entry->state = memorystatus_build_state(p); + entry->user_data = p->p_memstat_userdata; + entry->fds = p->p_fd->fd_nfiles; + } + + /* + * Always update the page counts on a kill. 
+ */ + + uint32_t pages = 0; + uint32_t max_pages = 0; + uint32_t max_pages_lifetime = 0; + uint32_t purgeable_pages = 0; + + memorystatus_get_task_page_counts(p->task, &pages, &max_pages, &max_pages_lifetime, &purgeable_pages); + entry->pages = (uint64_t)pages; + entry->max_pages = (uint64_t)max_pages; + entry->max_pages_lifetime = (uint64_t)max_pages_lifetime; + entry->purgeable_pages = (uint64_t)purgeable_pages; + + uint64_t internal_pages = 0; + uint64_t internal_compressed_pages = 0; + uint64_t purgeable_nonvolatile_pages = 0; + uint64_t purgeable_nonvolatile_compressed_pages = 0; + uint64_t alternate_accounting_pages = 0; + uint64_t alternate_accounting_compressed_pages = 0; + uint64_t iokit_mapped_pages = 0; + uint64_t page_table_pages = 0; + + memorystatus_get_task_phys_footprint_page_counts(p->task, &internal_pages, &internal_compressed_pages, + &purgeable_nonvolatile_pages, &purgeable_nonvolatile_compressed_pages, + &alternate_accounting_pages, &alternate_accounting_compressed_pages, + &iokit_mapped_pages, &page_table_pages); + + entry->jse_internal_pages = internal_pages; + entry->jse_internal_compressed_pages = internal_compressed_pages; + entry->jse_purgeable_nonvolatile_pages = purgeable_nonvolatile_pages; + entry->jse_purgeable_nonvolatile_compressed_pages = purgeable_nonvolatile_compressed_pages; + entry->jse_alternate_accounting_pages = alternate_accounting_pages; + entry->jse_alternate_accounting_compressed_pages = alternate_accounting_compressed_pages; + entry->jse_iokit_mapped_pages = iokit_mapped_pages; + entry->jse_page_table_pages = page_table_pages; + + uint64_t region_count = 0; + memorystatus_get_task_memory_region_count(p->task, &region_count); + entry->jse_memory_region_count = region_count; + + goto exit; + } + } + + if (entry == NULL) { + /* + * The entry was not found in the snapshot, so the process must have + * launched after the snapshot was initialized. + * Let's try to append the new entry. 
+ */ + if (memorystatus_jetsam_snapshot_count < memorystatus_jetsam_snapshot_max) { + /* + * A populated snapshot buffer exists + * and there is room to init a new entry. + */ + assert(memorystatus_jetsam_snapshot_count == snapshot->entry_count); + + unsigned int next = memorystatus_jetsam_snapshot_count; + + if(memorystatus_init_jetsam_snapshot_entry_locked(p, &snapshot_list[next], (snapshot->js_gencount)) == TRUE) { + + entry = &snapshot_list[next]; + entry->killed = kill_cause; + entry->jse_killtime = killtime; + + snapshot->entry_count = ++next; + memorystatus_jetsam_snapshot_count = next; + + if (memorystatus_jetsam_snapshot_count >= memorystatus_jetsam_snapshot_max) { + /* + * We just used the last slot in the snapshot buffer. + * We only want to log it once... so we do it here + * when we notice we've hit the max. + */ + printf("memorystatus: WARNING snapshot buffer is full, count %d\n", + memorystatus_jetsam_snapshot_count); + } } - memorystatus_jetsam_snapshot_list[i].killed = kill_cause; - return; } } + +exit: + if (entry == NULL) { + /* + * If we reach here, the snapshot buffer could not be updated. + * Most likely, the buffer is full, in which case we would have + * logged a warning in the previous call. + * + * For now, we will stop appending snapshot entries. + * When the buffer is consumed, the snapshot state will reset. + */ + + MEMORYSTATUS_DEBUG(4, "memorystatus_update_jetsam_snapshot_entry_locked: failed to update pid %d, priority %d, count %d\n", + p->p_pid, p->p_memstat_effectivepriority, memorystatus_jetsam_snapshot_count); + } + + return; } void memorystatus_pages_update(unsigned int pages_avail) @@ -2922,33 +4268,87 @@ void memorystatus_pages_update(unsigned int pages_avail) || (memorystatus_available_pages >= (pages_avail + memorystatus_delta))) ? 
TRUE : FALSE; if (critical || delta) { - memorystatus_level = memorystatus_available_pages * 100 / atop_64(max_mem); + unsigned int total_pages; + + total_pages = (unsigned int) atop_64(max_mem); +#if CONFIG_SECLUDED_MEMORY + total_pages -= vm_page_secluded_count; +#endif /* CONFIG_SECLUDED_MEMORY */ + memorystatus_level = memorystatus_available_pages * 100 / total_pages; memorystatus_thread_wake(); } #endif /* VM_PRESSURE_EVENTS */ } static boolean_t -memorystatus_init_jetsam_snapshot_entry_locked(proc_t p, memorystatus_jetsam_snapshot_entry_t *entry) +memorystatus_init_jetsam_snapshot_entry_locked(proc_t p, memorystatus_jetsam_snapshot_entry_t *entry, uint64_t gencount) { clock_sec_t tv_sec; clock_usec_t tv_usec; + uint32_t pages = 0; + uint32_t max_pages = 0; + uint32_t max_pages_lifetime = 0; + uint32_t purgeable_pages = 0; + uint64_t internal_pages = 0; + uint64_t internal_compressed_pages = 0; + uint64_t purgeable_nonvolatile_pages = 0; + uint64_t purgeable_nonvolatile_compressed_pages = 0; + uint64_t alternate_accounting_pages = 0; + uint64_t alternate_accounting_compressed_pages = 0; + uint64_t iokit_mapped_pages = 0; + uint64_t page_table_pages =0; + uint64_t region_count = 0; + uint64_t cids[COALITION_NUM_TYPES]; memset(entry, 0, sizeof(memorystatus_jetsam_snapshot_entry_t)); - + entry->pid = p->p_pid; - strlcpy(&entry->name[0], p->p_comm, MAXCOMLEN+1); + strlcpy(&entry->name[0], p->p_name, sizeof(entry->name)); entry->priority = p->p_memstat_effectivepriority; - memorystatus_get_task_page_counts(p->task, &entry->pages, &entry->max_pages, &entry->max_pages_lifetime, &entry->purgeable_pages); - entry->state = memorystatus_build_state(p); + + memorystatus_get_task_page_counts(p->task, &pages, &max_pages, &max_pages_lifetime, &purgeable_pages); + entry->pages = (uint64_t)pages; + entry->max_pages = (uint64_t)max_pages; + entry->max_pages_lifetime = (uint64_t)max_pages_lifetime; + entry->purgeable_pages = (uint64_t)purgeable_pages; + + 
memorystatus_get_task_phys_footprint_page_counts(p->task, &internal_pages, &internal_compressed_pages, + &purgeable_nonvolatile_pages, &purgeable_nonvolatile_compressed_pages, + &alternate_accounting_pages, &alternate_accounting_compressed_pages, + &iokit_mapped_pages, &page_table_pages); + + entry->jse_internal_pages = internal_pages; + entry->jse_internal_compressed_pages = internal_compressed_pages; + entry->jse_purgeable_nonvolatile_pages = purgeable_nonvolatile_pages; + entry->jse_purgeable_nonvolatile_compressed_pages = purgeable_nonvolatile_compressed_pages; + entry->jse_alternate_accounting_pages = alternate_accounting_pages; + entry->jse_alternate_accounting_compressed_pages = alternate_accounting_compressed_pages; + entry->jse_iokit_mapped_pages = iokit_mapped_pages; + entry->jse_page_table_pages = page_table_pages; + + memorystatus_get_task_memory_region_count(p->task, &region_count); + entry->jse_memory_region_count = region_count; + + entry->state = memorystatus_build_state(p); entry->user_data = p->p_memstat_userdata; memcpy(&entry->uuid[0], &p->p_uuid[0], sizeof(p->p_uuid)); - entry->fds = p->p_fd->fd_nfiles; + entry->fds = p->p_fd->fd_nfiles; absolutetime_to_microtime(get_task_cpu_time(p->task), &tv_sec, &tv_usec); entry->cpu_time.tv_sec = tv_sec; entry->cpu_time.tv_usec = tv_usec; + assert(p->p_stats != NULL); + entry->jse_starttime = p->p_stats->ps_start; /* abstime process started */ + entry->jse_killtime = 0; /* abstime jetsam chose to kill process */ + entry->killed = 0; /* the jetsam kill cause */ + entry->jse_gencount = gencount; /* indicates a pass through jetsam thread, when process was targeted to be killed */ + + entry->jse_idle_delta = p->p_memstat_idle_delta; /* Most recent timespan spent in idle-band */ + + proc_coalitionids(p, cids); + entry->jse_coalition_jetsam_id = cids[COALITION_TYPE_JETSAM]; + return TRUE; } @@ -3019,14 +4419,20 @@ memorystatus_init_jetsam_snapshot_locked(memorystatus_jetsam_snapshot_t *od_snap snapshot_max = 
memorystatus_jetsam_snapshot_max; } + /* + * Init the snapshot header information + */ memorystatus_init_snapshot_vmstats(snapshot); + snapshot->snapshot_time = mach_absolute_time(); + snapshot->notification_time = 0; + snapshot->js_gencount = 0; next_p = memorystatus_get_first_proc_locked(&b, TRUE); while (next_p) { p = next_p; next_p = memorystatus_get_next_proc_locked(&b, p, TRUE); - if (FALSE == memorystatus_init_jetsam_snapshot_entry_locked(p, &snapshot_list[i])) { + if (FALSE == memorystatus_init_jetsam_snapshot_entry_locked(p, &snapshot_list[i], snapshot->js_gencount)) { continue; } @@ -3040,7 +4446,6 @@ memorystatus_init_jetsam_snapshot_locked(memorystatus_jetsam_snapshot_t *od_snap } } - snapshot->snapshot_time = mach_absolute_time(); snapshot->entry_count = i; if (!od_snapshot) { @@ -3099,47 +4504,14 @@ memorystatus_cmd_test_jetsam_sort(int priority, int sort_order) { return (error); } -#endif - -/* - * Jetsam a specific process. - */ -static boolean_t -memorystatus_kill_specific_process(pid_t victim_pid, uint32_t cause) { - boolean_t killed; - proc_t p; - - /* TODO - add a victim queue and push this into the main jetsam thread */ - p = proc_find(victim_pid); - if (!p) { - return FALSE; - } - - proc_list_lock(); - - if (memorystatus_jetsam_snapshot_count == 0) { - memorystatus_init_jetsam_snapshot_locked(NULL,0); - } - - memorystatus_update_jetsam_snapshot_entry_locked(p, cause); - proc_list_unlock(); - - printf("memorystatus: specifically killing pid %d [%s] (%s %d) - memorystatus_available_pages: %d\n", - victim_pid, (p->p_comm ? p->p_comm : "(unknown)"), - jetsam_kill_cause_name[cause], p->p_memstat_effectivepriority, memorystatus_available_pages); - - - killed = memorystatus_do_kill(p, cause); - proc_rele(p); - - return killed; -} +#endif /* DEVELOPMENT || DEBUG */ /* * Jetsam the first process in the queue. 
*/ static boolean_t -memorystatus_kill_top_process(boolean_t any, boolean_t sort_flag, uint32_t cause, int32_t *priority, uint32_t *errors) +memorystatus_kill_top_process(boolean_t any, boolean_t sort_flag, uint32_t cause, os_reason_t jetsam_reason, + int32_t *priority, uint32_t *errors) { pid_t aPid; proc_t p = PROC_NULL, next_p = PROC_NULL; @@ -3147,6 +4519,10 @@ memorystatus_kill_top_process(boolean_t any, boolean_t sort_flag, uint32_t cause int kill_count = 0; unsigned int i = 0; uint32_t aPid_ep; + uint64_t killtime = 0; + clock_sec_t tv_sec; + clock_usec_t tv_usec; + uint32_t tv_msec; #ifndef CONFIG_FREEZE #pragma unused(any) @@ -3181,7 +4557,7 @@ memorystatus_kill_top_process(boolean_t any, boolean_t sort_flag, uint32_t cause aPid_ep = p->p_memstat_effectivepriority; if (p->p_memstat_state & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED)) { - continue; + continue; /* with lock held */ } #if DEVELOPMENT || DEBUG @@ -3227,7 +4603,7 @@ memorystatus_kill_top_process(boolean_t any, boolean_t sort_flag, uint32_t cause * - the priority was requested *and* the targeted process is not at idle priority */ if ((memorystatus_jetsam_snapshot_count == 0) && - (memorystatus_idle_snapshot || ((!priority) || (priority && (*priority != JETSAM_PRIORITY_IDLE))))) { + (memorystatus_idle_snapshot || ((!priority) || (priority && (aPid_ep != JETSAM_PRIORITY_IDLE))))) { memorystatus_init_jetsam_snapshot_locked(NULL,0); new_snapshot = TRUE; } @@ -3239,12 +4615,16 @@ memorystatus_kill_top_process(boolean_t any, boolean_t sort_flag, uint32_t cause * acquisition of the proc lock. */ p->p_memstat_state |= P_MEMSTAT_TERMINATED; + + killtime = mach_absolute_time(); + absolutetime_to_microtime(killtime, &tv_sec, &tv_usec); + tv_msec = tv_usec / 1000; #if DEVELOPMENT || DEBUG if ((memorystatus_jetsam_policy & kPolicyDiagnoseActive) && activeProcess) { MEMORYSTATUS_DEBUG(1, "jetsam: suspending pid %d [%s] (active) for diagnosis - memory_status_level: %d\n", - aPid, (p->p_comm ? 
p->p_comm: "(unknown)"), memorystatus_level); - memorystatus_update_jetsam_snapshot_entry_locked(p, kMemorystatusKilledDiagnostic); + aPid, (*p->p_name ? p->p_name: "(unknown)"), memorystatus_level); + memorystatus_update_jetsam_snapshot_entry_locked(p, kMemorystatusKilledDiagnostic, killtime); p->p_memstat_state |= P_MEMSTAT_DIAG_SUSPENDED; if (memorystatus_jetsam_policy & kPolicyDiagnoseFirst) { jetsam_diagnostic_suspended_one_active_proc = 1; @@ -3267,17 +4647,24 @@ memorystatus_kill_top_process(boolean_t any, boolean_t sort_flag, uint32_t cause #endif /* DEVELOPMENT || DEBUG */ { /* Shift queue, update stats */ - memorystatus_update_jetsam_snapshot_entry_locked(p, cause); + memorystatus_update_jetsam_snapshot_entry_locked(p, cause, killtime); if (proc_ref_locked(p) == p) { proc_list_unlock(); - printf("memorystatus: %s %d [%s] (%s %d) - memorystatus_available_pages: %d\n", - ((aPid_ep == JETSAM_PRIORITY_IDLE) ? - "idle exiting pid" : "jetsam killing pid"), - aPid, (p->p_comm ? p->p_comm : "(unknown)"), + printf("%lu.%02d memorystatus: %s %d [%s] (%s %d) - memorystatus_available_pages: %d\n", + (unsigned long)tv_sec, tv_msec, + ((aPid_ep == JETSAM_PRIORITY_IDLE) ? "idle exiting pid" : "jetsam killing top process pid"), + aPid, (*p->p_name ? p->p_name : "(unknown)"), jetsam_kill_cause_name[cause], aPid_ep, memorystatus_available_pages); - killed = memorystatus_do_kill(p, cause); + /* + * memorystatus_do_kill() drops a reference, so take another one so we can + * continue to use this exit reason even after memorystatus_do_kill() + * returns. + */ + os_reason_ref(jetsam_reason); + + killed = memorystatus_do_kill(p, cause, jetsam_reason); /* Success? 
*/ if (killed) { @@ -3321,9 +4708,13 @@ memorystatus_kill_top_process(boolean_t any, boolean_t sort_flag, uint32_t cause proc_list_unlock(); exit: + os_reason_free(jetsam_reason); + /* Clear snapshot if freshly captured and no target was found */ if (new_snapshot && !killed) { - memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0; + proc_list_lock(); + memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0; + proc_list_unlock(); } KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_END, @@ -3336,8 +4727,8 @@ memorystatus_kill_top_process(boolean_t any, boolean_t sort_flag, uint32_t cause * Jetsam aggressively */ static boolean_t -memorystatus_kill_top_process_aggressive(boolean_t any, uint32_t cause, int aggr_count, int32_t priority_max, - uint32_t *errors) +memorystatus_kill_top_process_aggressive(boolean_t any, uint32_t cause, os_reason_t jetsam_reason, int aggr_count, + int32_t priority_max, uint32_t *errors) { pid_t aPid; proc_t p = PROC_NULL, next_p = PROC_NULL; @@ -3346,6 +4737,10 @@ memorystatus_kill_top_process_aggressive(boolean_t any, uint32_t cause, int aggr unsigned int i = 0; int32_t aPid_ep = 0; unsigned int memorystatus_level_snapshot = 0; + uint64_t killtime = 0; + clock_sec_t tv_sec; + clock_usec_t tv_usec; + uint32_t tv_msec; #pragma unused(any) @@ -3376,7 +4771,7 @@ memorystatus_kill_top_process_aggressive(boolean_t any, uint32_t cause, int aggr */ MEMORYSTATUS_DEBUG(1, "memorystatus: aggressive%d: rewinding %s moved from band %d --> %d\n", - aggr_count, next_p->p_comm, i, next_p->p_memstat_effectivepriority); + aggr_count, (*next_p->p_name ? next_p->p_name : "unknown"), i, next_p->p_memstat_effectivepriority); next_p = memorystatus_get_first_proc_locked(&i, TRUE); continue; @@ -3430,9 +4825,13 @@ memorystatus_kill_top_process_aggressive(boolean_t any, uint32_t cause, int aggr * acquisition of the proc lock. 
*/ p->p_memstat_state |= P_MEMSTAT_TERMINATED; + + killtime = mach_absolute_time(); + absolutetime_to_microtime(killtime, &tv_sec, &tv_usec); + tv_msec = tv_usec / 1000; /* Shift queue, update stats */ - memorystatus_update_jetsam_snapshot_entry_locked(p, cause); + memorystatus_update_jetsam_snapshot_entry_locked(p, cause, killtime); /* * In order to kill the target process, we will drop the proc_list_lock. @@ -3452,7 +4851,7 @@ memorystatus_kill_top_process_aggressive(boolean_t any, uint32_t cause, int aggr */ MEMORYSTATUS_DEBUG(1, "memorystatus: aggressive%d: skipping %d [%s] (exiting?)\n", - aggr_count, next_p->p_pid, (next_p->p_comm ? next_p->p_comm : "(unknown)")); + aggr_count, next_p->p_pid, (*next_p->p_name ? next_p->p_name : "(unknown)")); temp_p = next_p; next_p = memorystatus_get_next_proc_locked(&i, temp_p, TRUE); @@ -3460,16 +4859,22 @@ memorystatus_kill_top_process_aggressive(boolean_t any, uint32_t cause, int aggr } proc_list_unlock(); - printf("memorystatus: aggressive%d: %s %d [%s] (%s %d) - memorystatus_available_pages: %d\n", - aggr_count, + printf("%lu.%01d memorystatus: aggressive%d: %s %d [%s] (%s %d) - memorystatus_available_pages: %d\n", + (unsigned long)tv_sec, tv_msec, aggr_count, ((aPid_ep == JETSAM_PRIORITY_IDLE) ? "idle exiting pid" : "jetsam killing pid"), - aPid, (p->p_comm ? p->p_comm : "(unknown)"), + aPid, (*p->p_name ? p->p_name : "(unknown)"), jetsam_kill_cause_name[cause], aPid_ep, memorystatus_available_pages); memorystatus_level_snapshot = memorystatus_level; - killed = memorystatus_do_kill(p, cause); - + /* + * memorystatus_do_kill() drops a reference, so take another one so we can + * continue to use this exit reason even after memorystatus_do_kill() + * returns. + */ + os_reason_ref(jetsam_reason); + killed = memorystatus_do_kill(p, cause, jetsam_reason); + /* Success? 
*/ if (killed) { proc_rele(p); @@ -3532,6 +4937,8 @@ memorystatus_kill_top_process_aggressive(boolean_t any, uint32_t cause, int aggr proc_list_unlock(); exit: + os_reason_free(jetsam_reason); + /* Clear snapshot if freshly captured and no target was found */ if (new_snapshot && (kill_count == 0)) { memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0; @@ -3548,8 +4955,6 @@ memorystatus_kill_top_process_aggressive(boolean_t any, uint32_t cause, int aggr } } -#if LEGACY_HIWATER - static boolean_t memorystatus_kill_hiwat_proc(uint32_t *errors) { @@ -3559,16 +4964,26 @@ memorystatus_kill_hiwat_proc(uint32_t *errors) int kill_count = 0; unsigned int i = 0; uint32_t aPid_ep; - + uint64_t killtime = 0; + clock_sec_t tv_sec; + clock_usec_t tv_usec; + uint32_t tv_msec; + os_reason_t jetsam_reason = OS_REASON_NULL; KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM_HIWAT) | DBG_FUNC_START, memorystatus_available_pages, 0, 0, 0, 0); + jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_HIGHWATER); + if (jetsam_reason == OS_REASON_NULL) { + printf("memorystatus_kill_hiwat_proc: failed to allocate exit reason\n"); + } + proc_list_lock(); next_p = memorystatus_get_first_proc_locked(&i, TRUE); while (next_p) { - uint32_t footprint; - boolean_t skip; + uint64_t footprint_in_bytes = 0; + uint64_t memlimit_in_bytes = 0; + boolean_t skip = 0; p = next_p; next_p = memorystatus_get_next_proc_locked(&i, p, TRUE); @@ -3598,9 +5013,9 @@ memorystatus_kill_hiwat_proc(uint32_t *errors) continue; } #endif - - footprint = (uint32_t)(get_task_phys_footprint(p->task) / (1024 * 1024)); - skip = (((int32_t)footprint) <= p->p_memstat_memlimit); + footprint_in_bytes = get_task_phys_footprint(p->task); + memlimit_in_bytes = (((uint64_t)p->p_memstat_memlimit) * 1024ULL * 1024ULL); /* convert MB to bytes */ + skip = (footprint_in_bytes <= memlimit_in_bytes); #if DEVELOPMENT || DEBUG if (!skip && (memorystatus_jetsam_policy & 
kPolicyDiagnoseActive)) { @@ -3623,8 +5038,13 @@ memorystatus_kill_hiwat_proc(uint32_t *errors) if (skip) { continue; } else { - MEMORYSTATUS_DEBUG(1, "jetsam: %s pid %d [%s] - %d Mb > 1 (%d Mb)\n", - (memorystatus_jetsam_policy & kPolicyDiagnoseActive) ? "suspending": "killing", aPid, p->p_comm, footprint, p->p_memstat_memlimit); +#if DEVELOPMENT || DEBUG + MEMORYSTATUS_DEBUG(1, "jetsam: %s pid %d [%s] - %lld Mb > 1 (%d Mb)\n", + (memorystatus_jetsam_policy & kPolicyDiagnoseActive) ? "suspending": "killing", + aPid, (*p->p_name ? p->p_name : "unknown"), + (footprint_in_bytes / (1024ULL * 1024ULL)), /* converted bytes to MB */ + p->p_memstat_memlimit); +#endif /* DEVELOPMENT || DEBUG */ if (memorystatus_jetsam_snapshot_count == 0) { memorystatus_init_jetsam_snapshot_locked(NULL,0); @@ -3632,11 +5052,15 @@ memorystatus_kill_hiwat_proc(uint32_t *errors) } p->p_memstat_state |= P_MEMSTAT_TERMINATED; + + killtime = mach_absolute_time(); + absolutetime_to_microtime(killtime, &tv_sec, &tv_usec); + tv_msec = tv_usec / 1000; #if DEVELOPMENT || DEBUG if (memorystatus_jetsam_policy & kPolicyDiagnoseActive) { MEMORYSTATUS_DEBUG(1, "jetsam: pid %d suspended for diagnosis - memorystatus_available_pages: %d\n", aPid, memorystatus_available_pages); - memorystatus_update_jetsam_snapshot_entry_locked(p, kMemorystatusKilledDiagnostic); + memorystatus_update_jetsam_snapshot_entry_locked(p, kMemorystatusKilledDiagnostic, killtime); p->p_memstat_state |= P_MEMSTAT_DIAG_SUSPENDED; p = proc_ref_locked(p); @@ -3651,16 +5075,23 @@ memorystatus_kill_hiwat_proc(uint32_t *errors) } else #endif /* DEVELOPMENT || DEBUG */ { - memorystatus_update_jetsam_snapshot_entry_locked(p, kMemorystatusKilledHiwat); + memorystatus_update_jetsam_snapshot_entry_locked(p, kMemorystatusKilledHiwat, killtime); if (proc_ref_locked(p) == p) { proc_list_unlock(); - printf("memorystatus: jetsam killing pid %d [%s] (highwater %d) - memorystatus_available_pages: %d\n", - aPid, (p->p_comm ? 
p->p_comm : "(unknown)"), aPid_ep, memorystatus_available_pages); + printf("%lu.%02d memorystatus: jetsam killing pid %d [%s] (highwater %d) - memorystatus_available_pages: %d\n", + (unsigned long)tv_sec, tv_msec, aPid, (*p->p_name ? p->p_name : "(unknown)"), aPid_ep, memorystatus_available_pages); + + /* + * memorystatus_do_kill drops a reference, so take another one so we can + * continue to use this exit reason even after memorystatus_do_kill() + * returns + */ + os_reason_ref(jetsam_reason); + + killed = memorystatus_do_kill(p, kMemorystatusKilledHiwat, jetsam_reason); - killed = memorystatus_do_kill(p, kMemorystatusKilledHiwat); - /* Success? */ if (killed) { proc_rele(p); @@ -3700,22 +5131,170 @@ memorystatus_kill_hiwat_proc(uint32_t *errors) proc_list_unlock(); exit: + os_reason_free(jetsam_reason); + /* Clear snapshot if freshly captured and no target was found */ if (new_snapshot && !killed) { + proc_list_lock(); + memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0; + proc_list_unlock(); + } + + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM_HIWAT) | DBG_FUNC_END, + memorystatus_available_pages, killed ? aPid : 0, kill_count, 0, 0); + + return killed; +} + +/* + * Jetsam a process pinned in the elevated band. 
+ * + * Return: true -- at least one pinned process was jetsammed + * false -- no pinned process was jetsammed + */ +static boolean_t +memorystatus_kill_elevated_process(uint32_t cause, os_reason_t jetsam_reason, int aggr_count, uint32_t *errors) +{ + pid_t aPid = 0; + proc_t p = PROC_NULL, next_p = PROC_NULL; + boolean_t new_snapshot = FALSE, killed = FALSE; + int kill_count = 0; + unsigned int i = JETSAM_PRIORITY_ELEVATED_INACTIVE; + uint32_t aPid_ep; + uint64_t killtime = 0; + clock_sec_t tv_sec; + clock_usec_t tv_usec; + uint32_t tv_msec; + + + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_START, + memorystatus_available_pages, 0, 0, 0, 0); + + proc_list_lock(); + + next_p = memorystatus_get_first_proc_locked(&i, FALSE); + while (next_p) { + + p = next_p; + next_p = memorystatus_get_next_proc_locked(&i, p, FALSE); + + aPid = p->p_pid; + aPid_ep = p->p_memstat_effectivepriority; + + /* + * Only pick a process pinned in this elevated band + */ + if (!(p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND)) { + continue; + } + + if (p->p_memstat_state & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED)) { + continue; + } + +#if CONFIG_FREEZE + if (p->p_memstat_state & P_MEMSTAT_LOCKED) { + continue; + } +#endif + +#if DEVELOPMENT || DEBUG + MEMORYSTATUS_DEBUG(1, "jetsam: elevated%d process pid %d [%s] - memorystatus_available_pages: %d\n", + aggr_count, + aPid, (*p->p_name ? 
p->p_name : "unknown"), + memorystatus_available_pages); +#endif /* DEVELOPMENT || DEBUG */ + + if (memorystatus_jetsam_snapshot_count == 0) { + memorystatus_init_jetsam_snapshot_locked(NULL,0); + new_snapshot = TRUE; + } + + p->p_memstat_state |= P_MEMSTAT_TERMINATED; + + killtime = mach_absolute_time(); + absolutetime_to_microtime(killtime, &tv_sec, &tv_usec); + tv_msec = tv_usec / 1000; + + memorystatus_update_jetsam_snapshot_entry_locked(p, cause, killtime); + + if (proc_ref_locked(p) == p) { + + proc_list_unlock(); + + printf("%lu.%01d memorystatus: elevated%d: jetsam killing pid %d [%s] (%s %d) - memorystatus_available_pages: %d\n", + (unsigned long)tv_sec, tv_msec, + aggr_count, + aPid, (*p->p_name ? p->p_name : "(unknown)"), + jetsam_kill_cause_name[cause], aPid_ep, memorystatus_available_pages); + + /* + * memorystatus_do_kill drops a reference, so take another one so we can + * continue to use this exit reason even after memorystatus_do_kill() + * returns + */ + os_reason_ref(jetsam_reason); + killed = memorystatus_do_kill(p, cause, jetsam_reason); + + /* Success? */ + if (killed) { + proc_rele(p); + kill_count++; + goto exit; + } + + /* + * Failure - first unwind the state, + * then fall through to restart the search. + */ + proc_list_lock(); + proc_rele_locked(p); + p->p_memstat_state &= ~P_MEMSTAT_TERMINATED; + p->p_memstat_state |= P_MEMSTAT_ERROR; + *errors += 1; + } + + /* + * Failure - restart the search. + * + * We might have raced with "p" exiting on another core, resulting in no + * ref on "p". Or, we may have failed to kill "p". + * + * Either way, we fall thru to here, leaving the proc in the + * P_MEMSTAT_TERMINATED state or P_MEMSTAT_ERROR state. + * + * And, we hold the the proc_list_lock at this point. 
+ */ + + next_p = memorystatus_get_first_proc_locked(&i, FALSE); + } + + proc_list_unlock(); + +exit: + os_reason_free(jetsam_reason); + + /* Clear snapshot if freshly captured and no target was found */ + if (new_snapshot && (kill_count == 0)) { + proc_list_lock(); memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0; + proc_list_unlock(); } - - KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM_HIWAT) | DBG_FUNC_END, + + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_END, memorystatus_available_pages, killed ? aPid : 0, kill_count, 0, 0); - return killed; + return (killed); } -#endif /* LEGACY_HIWATER */ - static boolean_t memorystatus_kill_process_async(pid_t victim_pid, uint32_t cause) { - /* TODO: allow a general async path */ + /* + * TODO: allow a general async path + * + * NOTE: If a new async kill cause is added, make sure to update memorystatus_thread() to + * add the appropriate exit reason code mapping. 
+ */ if ((victim_pid != -1) || (cause != kMemorystatusKilledVMPageShortage && cause != kMemorystatusKilledVMThrashing && cause != kMemorystatusKilledFCThrashing)) { return FALSE; @@ -3726,48 +5305,17 @@ memorystatus_kill_process_async(pid_t victim_pid, uint32_t cause) { return TRUE; } -static boolean_t -memorystatus_kill_process_sync(pid_t victim_pid, uint32_t cause) { - boolean_t res; - uint32_t errors = 0; - - if (victim_pid == -1) { - /* No pid, so kill first process */ - res = memorystatus_kill_top_process(TRUE, TRUE, cause, NULL, &errors); - } else { - res = memorystatus_kill_specific_process(victim_pid, cause); - } - - if (errors) { - memorystatus_clear_errors(); - } - - if (res == TRUE) { - /* Fire off snapshot notification */ - size_t snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) + - sizeof(memorystatus_jetsam_snapshot_entry_t) * memorystatus_jetsam_snapshot_count; - uint64_t timestamp_now = mach_absolute_time(); - memorystatus_jetsam_snapshot->notification_time = timestamp_now; - if (memorystatus_jetsam_snapshot_last_timestamp == 0 || - timestamp_now > memorystatus_jetsam_snapshot_last_timestamp + memorystatus_jetsam_snapshot_timeout) { - int ret = memorystatus_send_note(kMemorystatusSnapshotNote, &snapshot_size, sizeof(snapshot_size)); - if (!ret) { - proc_list_lock(); - memorystatus_jetsam_snapshot_last_timestamp = timestamp_now; - proc_list_unlock(); - } - } - } - - return res; -} - boolean_t memorystatus_kill_on_VM_page_shortage(boolean_t async) { if (async) { return memorystatus_kill_process_async(-1, kMemorystatusKilledVMPageShortage); } else { - return memorystatus_kill_process_sync(-1, kMemorystatusKilledVMPageShortage); + os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_VMPAGESHORTAGE); + if (jetsam_reason == OS_REASON_NULL) { + printf("memorystatus_kill_on_VM_page_shortage -- sync: failed to allocate jetsam reason\n"); + } + + return memorystatus_kill_process_sync(-1, kMemorystatusKilledVMPageShortage, 
jetsam_reason); } } @@ -3776,22 +5324,39 @@ memorystatus_kill_on_VM_thrashing(boolean_t async) { if (async) { return memorystatus_kill_process_async(-1, kMemorystatusKilledVMThrashing); } else { - return memorystatus_kill_process_sync(-1, kMemorystatusKilledVMThrashing); + os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_VMTHRASHING); + if (jetsam_reason == OS_REASON_NULL) { + printf("memorystatus_kill_on_VM_thrashing -- sync: failed to allocate jetsam reason\n"); + } + + return memorystatus_kill_process_sync(-1, kMemorystatusKilledVMThrashing, jetsam_reason); } } boolean_t memorystatus_kill_on_FC_thrashing(boolean_t async) { + + if (async) { return memorystatus_kill_process_async(-1, kMemorystatusKilledFCThrashing); } else { - return memorystatus_kill_process_sync(-1, kMemorystatusKilledFCThrashing); + os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_FCTHRASHING); + if (jetsam_reason == OS_REASON_NULL) { + printf("memorystatus_kill_on_FC_thrashing -- sync: failed to allocate jetsam reason\n"); + } + + return memorystatus_kill_process_sync(-1, kMemorystatusKilledFCThrashing, jetsam_reason); } } boolean_t memorystatus_kill_on_vnode_limit(void) { - return memorystatus_kill_process_sync(-1, kMemorystatusKilledVnodes); + os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_VNODE); + if (jetsam_reason == OS_REASON_NULL) { + printf("memorystatus_kill_on_vnode_limit: failed to allocate jetsam reason\n"); + } + + return memorystatus_kill_process_sync(-1, kMemorystatusKilledVnodes, jetsam_reason); } #endif /* CONFIG_JETSAM */ @@ -3875,22 +5440,15 @@ memorystatus_freeze_process_sync(proc_t p) goto exit; } - if (DEFAULT_FREEZER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_SWAPBACKED) { + if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) { unsigned int avail_swap_space = 0; /* in pages. */ - if (DEFAULT_FREEZER_IS_ACTIVE) { - /* - * Freezer backed by default pager and swap file(s). 
- */ - avail_swap_space = default_pager_swap_pages_free(); - } else { - /* - * Freezer backed by the compressor and swap file(s) - * while will hold compressed data. - */ - avail_swap_space = vm_swap_get_free_space() / PAGE_SIZE_64; - } + /* + * Freezer backed by the compressor and swap file(s) + * while will hold compressed data. + */ + avail_swap_space = vm_swap_get_free_space() / PAGE_SIZE_64; max_pages = MIN(avail_swap_space, memorystatus_freeze_pages_max); @@ -3911,10 +5469,12 @@ memorystatus_freeze_process_sync(proc_t p) ret = task_freeze(p->task, &purgeable, &wired, &clean, &dirty, max_pages, &shared, FALSE); + DTRACE_MEMORYSTATUS6(memorystatus_freeze, proc_t, p, unsigned int, memorystatus_available_pages, boolean_t, purgeable, unsigned int, wired, uint32_t, clean, uint32_t, dirty); + MEMORYSTATUS_DEBUG(1, "memorystatus_freeze_process_sync: task_freeze %s for pid %d [%s] - " - "memorystatus_pages: %d, purgeable: %d, wired: %d, clean: %d, dirty: %d, shared %d, free swap: %d\n", - (ret == KERN_SUCCESS) ? "SUCCEEDED" : "FAILED", aPid, (p->p_comm ? p->p_comm : "(unknown)"), - memorystatus_available_pages, purgeable, wired, clean, dirty, shared, default_pager_swap_pages_free()); + "memorystatus_pages: %d, purgeable: %d, wired: %d, clean: %d, dirty: %d, max_pages %d, shared %d\n", + (ret == KERN_SUCCESS) ? "SUCCEEDED" : "FAILED", aPid, (*p->p_name ? p->p_name : "(unknown)"), + memorystatus_available_pages, purgeable, wired, clean, dirty, max_pages, shared); proc_list_lock(); p->p_memstat_state &= ~P_MEMSTAT_LOCKED; @@ -3926,7 +5486,7 @@ memorystatus_freeze_process_sync(proc_t p) p->p_memstat_state |= (P_MEMSTAT_FROZEN | (shared ? 
0: P_MEMSTAT_NORECLAIM)); - if (DEFAULT_FREEZER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_SWAPBACKED) { + if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) { /* Update stats */ for (i = 0; i < sizeof(throttle_intervals) / sizeof(struct throttle_interval_t); i++) { throttle_intervals[i].pageouts += dirty; @@ -3991,24 +5551,17 @@ memorystatus_freeze_top_process(boolean_t *memorystatus_freeze_swap_low) continue; // with lock held } - if (DEFAULT_FREEZER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_SWAPBACKED) { + if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) { /* Ensure there's enough free space to freeze this process. */ unsigned int avail_swap_space = 0; /* in pages. */ - if (DEFAULT_FREEZER_IS_ACTIVE) { - /* - * Freezer backed by default pager and swap file(s). - */ - avail_swap_space = default_pager_swap_pages_free(); - } else { - /* - * Freezer backed by the compressor and swap file(s) - * while will hold compressed data. - */ - avail_swap_space = vm_swap_get_free_space() / PAGE_SIZE_64; - } + /* + * Freezer backed by the compressor and swap file(s) + * while will hold compressed data. + */ + avail_swap_space = vm_swap_get_free_space() / PAGE_SIZE_64; max_pages = MIN(avail_swap_space, memorystatus_freeze_pages_max); @@ -4036,9 +5589,9 @@ memorystatus_freeze_top_process(boolean_t *memorystatus_freeze_swap_low) kr = task_freeze(p->task, &purgeable, &wired, &clean, &dirty, max_pages, &shared, FALSE); MEMORYSTATUS_DEBUG(1, "memorystatus_freeze_top_process: task_freeze %s for pid %d [%s] - " - "memorystatus_pages: %d, purgeable: %d, wired: %d, clean: %d, dirty: %d, shared %d, free swap: %d\n", - (kr == KERN_SUCCESS) ? "SUCCEEDED" : "FAILED", aPid, (p->p_comm ? p->p_comm : "(unknown)"), - memorystatus_available_pages, purgeable, wired, clean, dirty, shared, default_pager_swap_pages_free()); + "memorystatus_pages: %d, purgeable: %d, wired: %d, clean: %d, dirty: %d, max_pages %d, shared %d\n", + (kr == KERN_SUCCESS) ? "SUCCEEDED" : "FAILED", aPid, (*p->p_name ? 
p->p_name : "(unknown)"), + memorystatus_available_pages, purgeable, wired, clean, dirty, max_pages, shared); proc_list_lock(); p->p_memstat_state &= ~P_MEMSTAT_LOCKED; @@ -4050,8 +5603,8 @@ memorystatus_freeze_top_process(boolean_t *memorystatus_freeze_swap_low) memorystatus_frozen_count++; p->p_memstat_state |= (P_MEMSTAT_FROZEN | (shared ? 0: P_MEMSTAT_NORECLAIM)); - - if (DEFAULT_FREEZER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_SWAPBACKED) { + + if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) { /* Update stats */ for (i = 0; i < sizeof(throttle_intervals) / sizeof(struct throttle_interval_t); i++) { throttle_intervals[i].pageouts += dirty; @@ -4139,12 +5692,12 @@ memorystatus_can_freeze(boolean_t *memorystatus_freeze_swap_low) if (!memorystatus_can_freeze_processes()) { return FALSE; } + assert(VM_CONFIG_COMPRESSOR_IS_PRESENT); - if (COMPRESSED_PAGER_IS_SWAPLESS || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_SWAPLESS) { + if ( !VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) { /* * In-core compressor used for freezing WITHOUT on-disk swap support. */ - if (vm_compressor_low_on_space()) { if (*memorystatus_freeze_swap_low) { *memorystatus_freeze_swap_low = TRUE; @@ -4162,34 +5715,17 @@ memorystatus_can_freeze(boolean_t *memorystatus_freeze_swap_low) } else { /* * Freezing WITH on-disk swap support. + * + * In-core compressor fronts the swap. */ - - if (DEFAULT_FREEZER_COMPRESSED_PAGER_IS_SWAPBACKED) { - /* - * In-core compressor fronts the swap. - */ - if (vm_swap_low_on_space()) { - if (*memorystatus_freeze_swap_low) { - *memorystatus_freeze_swap_low = TRUE; - } - - can_freeze = FALSE; + if (vm_swap_low_on_space()) { + if (*memorystatus_freeze_swap_low) { + *memorystatus_freeze_swap_low = TRUE; } - } else if (DEFAULT_FREEZER_IS_ACTIVE) { - /* - * Legacy freeze mode with no compressor support. 
- */ - if (default_pager_swap_pages_free() < memorystatus_freeze_pages_min) { - if (*memorystatus_freeze_swap_low) { - *memorystatus_freeze_swap_low = TRUE; - } - - can_freeze = FALSE; - } - } else { - panic("Not a valid freeze configuration.\n"); + can_freeze = FALSE; } + } return can_freeze; @@ -4291,7 +5827,7 @@ static int memorystatus_send_note(int event_code, void *data, size_t data_length) { int ret; struct kev_msg ev_msg; - + ev_msg.vendor_code = KEV_VENDOR_APPLE; ev_msg.kev_class = KEV_SYSTEM_CLASS; ev_msg.kev_subclass = KEV_MEMORYSTATUS_SUBCLASS; @@ -4311,7 +5847,7 @@ memorystatus_send_note(int event_code, void *data, size_t data_length) { } boolean_t -memorystatus_warn_process(pid_t pid, boolean_t critical) { +memorystatus_warn_process(pid_t pid, boolean_t limit_exceeded) { boolean_t ret = FALSE; boolean_t found_knote = FALSE; @@ -4324,7 +5860,7 @@ memorystatus_warn_process(pid_t pid, boolean_t critical) { memorystatus_klist_lock(); SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) { - proc_t knote_proc = kn->kn_kq->kq_p; + proc_t knote_proc = knote_get_kq(kn)->kq_p; pid_t knote_pid = knote_proc->p_pid; if (knote_pid == pid) { @@ -4336,30 +5872,35 @@ memorystatus_warn_process(pid_t pid, boolean_t critical) { * system pressure snapshot evaluation in * filt_memorystatus(). */ - - if (critical) { - if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) { - kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_CRITICAL; - } else if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_WARN) { - kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_WARN; + + if (!limit_exceeded) { + + /* + * Processes on desktop are not expecting to handle a system-wide + * critical or system-wide warning notification from this path. + * Intentionally set only the unambiguous limit warning here. 
+ */ + + if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) { + kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN; + found_knote = TRUE; } + } else { - if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_WARN) { - kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_WARN; + /* + * Send this notification when a process has exceeded a soft limit. + */ + if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) { + kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL; + found_knote = TRUE; } } - - found_knote = TRUE; } } if (found_knote) { KNOTE(&memorystatus_klist, 0); ret = TRUE; - } else { - if (vm_dispatch_pressure_note_to_pid(pid, FALSE) == 0) { - ret = TRUE; - } } memorystatus_klist_unlock(); @@ -4437,6 +5978,21 @@ memorystatus_is_foreground_locked(proc_t p) { return ((p->p_memstat_effectivepriority == JETSAM_PRIORITY_FOREGROUND) || (p->p_memstat_effectivepriority == JETSAM_PRIORITY_FOREGROUND_SUPPORT)); } + +/* + * This is meant for stackshot and kperf -- it does not take the proc_list_lock + * to access the p_memstat_dirty field. + */ +boolean_t +memorystatus_proc_is_dirty_unsafe(void *v) +{ + if (!v) { + return FALSE; + } + proc_t p = (proc_t)v; + return (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) != 0; +} + #endif /* CONFIG_MEMORYSTATUS */ /* @@ -4456,15 +6012,21 @@ vm_pressure_level_t memorystatus_manual_testing_level = kVMPressureNormal; extern struct knote * vm_pressure_select_optimal_candidate_to_notify(struct klist *, int, boolean_t); -extern -kern_return_t vm_pressure_notification_without_levels(boolean_t); - -extern void vm_pressure_klist_lock(void); -extern void vm_pressure_klist_unlock(void); +/* + * This value is the threshold that a process must meet to be considered for scavenging. + */ +#define VM_PRESSURE_MINIMUM_RSIZE 10 /* MB */ -extern void vm_reset_active_list(void); +#define VM_PRESSURE_NOTIFY_WAIT_PERIOD 10000 /* milliseconds */ -extern void delay(int); +#if DEBUG +#define VM_PRESSURE_DEBUG(cond, format, ...) 
\ +do { \ + if (cond) { printf(format, ##__VA_ARGS__); } \ +} while(0) +#else +#define VM_PRESSURE_DEBUG(cond, format, ...) +#endif #define INTER_NOTIFICATION_DELAY (250000) /* .25 second */ @@ -4485,26 +6047,284 @@ void memorystatus_on_pageout_scan_end(void) { * */ -boolean_t -is_knote_registered_modify_task_pressure_bits(struct knote *kn_max, int knote_pressure_level, task_t task, vm_pressure_level_t pressure_level_to_clear, vm_pressure_level_t pressure_level_to_set) -{ - if (kn_max->kn_sfflags & knote_pressure_level) { +boolean_t +is_knote_registered_modify_task_pressure_bits(struct knote *kn_max, int knote_pressure_level, task_t task, vm_pressure_level_t pressure_level_to_clear, vm_pressure_level_t pressure_level_to_set) +{ + if (kn_max->kn_sfflags & knote_pressure_level) { + + if (pressure_level_to_clear && task_has_been_notified(task, pressure_level_to_clear) == TRUE) { + + task_clear_has_been_notified(task, pressure_level_to_clear); + } + + task_mark_has_been_notified(task, pressure_level_to_set); + return TRUE; + } + + return FALSE; +} + +void +memorystatus_klist_reset_all_for_level(vm_pressure_level_t pressure_level_to_clear) +{ + struct knote *kn = NULL; + + memorystatus_klist_lock(); + SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) { + + proc_t p = PROC_NULL; + struct task* t = TASK_NULL; + + p = knote_get_kq(kn)->kq_p; + proc_list_lock(); + if (p != proc_ref_locked(p)) { + p = PROC_NULL; + proc_list_unlock(); + continue; + } + proc_list_unlock(); + + t = (struct task *)(p->task); + + task_clear_has_been_notified(t, pressure_level_to_clear); + + proc_rele(p); + } + + memorystatus_klist_unlock(); +} + +extern kern_return_t vm_pressure_notify_dispatch_vm_clients(boolean_t target_foreground_process); + +struct knote * +vm_pressure_select_optimal_candidate_to_notify(struct klist *candidate_list, int level, boolean_t target_foreground_process); + +/* + * Used by the vm_pressure_thread which is + * signalled from within vm_pageout_scan(). 
+ */ +static void vm_dispatch_memory_pressure(void); +void consider_vm_pressure_events(void); + +void consider_vm_pressure_events(void) +{ + vm_dispatch_memory_pressure(); +} +static void vm_dispatch_memory_pressure(void) +{ + memorystatus_update_vm_pressure(FALSE); +} + +extern vm_pressure_level_t +convert_internal_pressure_level_to_dispatch_level(vm_pressure_level_t); + +struct knote * +vm_pressure_select_optimal_candidate_to_notify(struct klist *candidate_list, int level, boolean_t target_foreground_process) +{ + struct knote *kn = NULL, *kn_max = NULL; + uint64_t resident_max = 0; /* MB */ + struct timeval curr_tstamp = {0, 0}; + int elapsed_msecs = 0; + int selected_task_importance = 0; + static int pressure_snapshot = -1; + boolean_t pressure_increase = FALSE; + + if (pressure_snapshot == -1) { + /* + * Initial snapshot. + */ + pressure_snapshot = level; + pressure_increase = TRUE; + } else { + + if (level >= pressure_snapshot) { + pressure_increase = TRUE; + } else { + pressure_increase = FALSE; + } + + pressure_snapshot = level; + } + + if (pressure_increase == TRUE) { + /* + * We'll start by considering the largest + * unimportant task in our list. + */ + selected_task_importance = INT_MAX; + } else { + /* + * We'll start by considering the largest + * important task in our list. + */ + selected_task_importance = 0; + } + + microuptime(&curr_tstamp); + + SLIST_FOREACH(kn, candidate_list, kn_selnext) { + + uint64_t resident_size = 0; /* MB */ + proc_t p = PROC_NULL; + struct task* t = TASK_NULL; + int curr_task_importance = 0; + boolean_t consider_knote = FALSE; + boolean_t privileged_listener = FALSE; + + p = knote_get_kq(kn)->kq_p; + proc_list_lock(); + if (p != proc_ref_locked(p)) { + p = PROC_NULL; + proc_list_unlock(); + continue; + } + proc_list_unlock(); + +#if CONFIG_MEMORYSTATUS + if (target_foreground_process == TRUE && !memorystatus_is_foreground_locked(p)) { + /* + * Skip process not marked foreground. 
+ */ + proc_rele(p); + continue; + } +#endif /* CONFIG_MEMORYSTATUS */ + + t = (struct task *)(p->task); + + timevalsub(&curr_tstamp, &p->vm_pressure_last_notify_tstamp); + elapsed_msecs = curr_tstamp.tv_sec * 1000 + curr_tstamp.tv_usec / 1000; + + vm_pressure_level_t dispatch_level = convert_internal_pressure_level_to_dispatch_level(level); + + if ((kn->kn_sfflags & dispatch_level) == 0) { + proc_rele(p); + continue; + } + +#if CONFIG_MEMORYSTATUS + if (target_foreground_process == FALSE && !memorystatus_bg_pressure_eligible(p)) { + VM_PRESSURE_DEBUG(1, "[vm_pressure] skipping process %d\n", p->p_pid); + proc_rele(p); + continue; + } +#endif /* CONFIG_MEMORYSTATUS */ + + curr_task_importance = task_importance_estimate(t); + + /* + * Privileged listeners are only considered in the multi-level pressure scheme + * AND only if the pressure is increasing. + */ + if (level > 0) { + + if (task_has_been_notified(t, level) == FALSE) { + + /* + * Is this a privileged listener? + */ + if (task_low_mem_privileged_listener(t, FALSE, &privileged_listener) == 0) { + + if (privileged_listener) { + kn_max = kn; + proc_rele(p); + goto done_scanning; + } + } + } else { + proc_rele(p); + continue; + } + } else if (level == 0) { + + /* + * Task wasn't notified when the pressure was increasing and so + * no need to notify it that the pressure is decreasing. + */ + if ((task_has_been_notified(t, kVMPressureWarning) == FALSE) && (task_has_been_notified(t, kVMPressureCritical) == FALSE)) { + proc_rele(p); + continue; + } + } + + /* + * We don't want a small process to block large processes from + * being notified again. + */ + resident_size = (get_task_phys_footprint(t))/(1024*1024ULL); /* MB */ + + if (resident_size >= VM_PRESSURE_MINIMUM_RSIZE) { + + if (level > 0) { + /* + * Warning or Critical Pressure. 
+ */ + if (pressure_increase) { + if ((curr_task_importance < selected_task_importance) || + ((curr_task_importance == selected_task_importance) && (resident_size > resident_max))) { + + /* + * We have found a candidate process which is: + * a) at a lower importance than the current selected process + * OR + * b) has importance equal to that of the current selected process but is larger + */ + + consider_knote = TRUE; + } + } else { + if ((curr_task_importance > selected_task_importance) || + ((curr_task_importance == selected_task_importance) && (resident_size > resident_max))) { - if (task_has_been_notified(task, pressure_level_to_clear) == TRUE) { + /* + * We have found a candidate process which is: + * a) at a higher importance than the current selected process + * OR + * b) has importance equal to that of the current selected process but is larger + */ - task_clear_has_been_notified(task, pressure_level_to_clear); - } + consider_knote = TRUE; + } + } + } else if (level == 0) { + /* + * Pressure back to normal. 
+ */ + if ((curr_task_importance > selected_task_importance) || + ((curr_task_importance == selected_task_importance) && (resident_size > resident_max))) { - task_mark_has_been_notified(task, pressure_level_to_set); - return TRUE; + consider_knote = TRUE; + } + } + + if (consider_knote) { + resident_max = resident_size; + kn_max = kn; + selected_task_importance = curr_task_importance; + consider_knote = FALSE; /* reset for the next candidate */ + } + } else { + /* There was no candidate with enough resident memory to scavenge */ + VM_PRESSURE_DEBUG(0, "[vm_pressure] threshold failed for pid %d with %llu resident...\n", p->p_pid, resident_size); + } + proc_rele(p); + } + +done_scanning: + if (kn_max) { + VM_DEBUG_CONSTANT_EVENT(vm_pressure_event, VM_PRESSURE_EVENT, DBG_FUNC_NONE, knote_get_kq(kn_max)->kq_p->p_pid, resident_max, 0, 0); + VM_PRESSURE_DEBUG(1, "[vm_pressure] sending event to pid %d with %llu resident\n", knote_get_kq(kn_max)->kq_p->p_pid, resident_max); } - return FALSE; + return kn_max; } -extern kern_return_t vm_pressure_notify_dispatch_vm_clients(boolean_t target_foreground_process); - #define VM_PRESSURE_DECREASED_SMOOTHING_PERIOD 5000 /* milliseconds */ +#define WARNING_NOTIFICATION_RESTING_PERIOD 25 /* seconds */ +#define CRITICAL_NOTIFICATION_RESTING_PERIOD 25 /* seconds */ + +uint64_t next_warning_notification_sent_at_ts = 0; +uint64_t next_critical_notification_sent_at_ts = 0; kern_return_t memorystatus_update_vm_pressure(boolean_t target_foreground_process) @@ -4523,6 +6343,7 @@ memorystatus_update_vm_pressure(boolean_t target_foreground_process) struct timeval smoothing_window_start_tstamp = {0, 0}; struct timeval curr_tstamp = {0, 0}; int elapsed_msecs = 0; + uint64_t curr_ts = mach_absolute_time(); #if !CONFIG_JETSAM #define MAX_IDLE_KILLS 100 /* limit the number of idle kills allowed */ @@ -4553,6 +6374,31 @@ memorystatus_update_vm_pressure(boolean_t target_foreground_process) } #endif /* !CONFIG_JETSAM */ + if (level_snapshot != 
kVMPressureNormal) { + + /* + * Check to see if we are still in the 'resting' period + * after having notified all clients interested in + * a particular pressure level. + */ + + level_snapshot = memorystatus_vm_pressure_level; + + if (level_snapshot == kVMPressureWarning || level_snapshot == kVMPressureUrgent) { + + if (curr_ts < next_warning_notification_sent_at_ts) { + delay(INTER_NOTIFICATION_DELAY * 4 /* 1 sec */); + return KERN_SUCCESS; + } + } else if (level_snapshot == kVMPressureCritical) { + + if (curr_ts < next_critical_notification_sent_at_ts) { + delay(INTER_NOTIFICATION_DELAY * 4 /* 1 sec */); + return KERN_SUCCESS; + } + } + } + while (1) { /* @@ -4594,24 +6440,29 @@ memorystatus_update_vm_pressure(boolean_t target_foreground_process) /* * No more level-based clients to notify. - * Try the non-level based notification clients. - * - * However, these non-level clients don't understand - * the "return-to-normal" notification. - * - * So don't consider them for those notifications. Just - * return instead. * + * Start the 'resting' window within which clients will not be re-notified. 
*/ if (level_snapshot != kVMPressureNormal) { - goto try_dispatch_vm_clients; - } else { - return KERN_FAILURE; - } + if (level_snapshot == kVMPressureWarning || level_snapshot == kVMPressureUrgent) { + nanoseconds_to_absolutetime(WARNING_NOTIFICATION_RESTING_PERIOD * NSEC_PER_SEC, &curr_ts); + next_warning_notification_sent_at_ts = mach_absolute_time() + curr_ts; + + memorystatus_klist_reset_all_for_level(kVMPressureWarning); + } + + if (level_snapshot == kVMPressureCritical) { + nanoseconds_to_absolutetime(CRITICAL_NOTIFICATION_RESTING_PERIOD * NSEC_PER_SEC, &curr_ts); + next_critical_notification_sent_at_ts = mach_absolute_time() + curr_ts; + + memorystatus_klist_reset_all_for_level(kVMPressureCritical); + } + } + return KERN_FAILURE; } - target_proc = kn_max->kn_kq->kq_p; + target_proc = knote_get_kq(kn_max)->kq_p; proc_list_lock(); if (target_proc != proc_ref_locked(target_proc)) { @@ -4630,13 +6481,13 @@ memorystatus_update_vm_pressure(boolean_t target_foreground_process) if (level_snapshot == kVMPressureWarning || level_snapshot == kVMPressureUrgent) { - if (is_knote_registered_modify_task_pressure_bits(kn_max, NOTE_MEMORYSTATUS_PRESSURE_WARN, task, kVMPressureCritical, kVMPressureWarning) == TRUE) { + if (is_knote_registered_modify_task_pressure_bits(kn_max, NOTE_MEMORYSTATUS_PRESSURE_WARN, task, 0, kVMPressureWarning) == TRUE) { found_candidate = TRUE; } } else { if (level_snapshot == kVMPressureCritical) { - if (is_knote_registered_modify_task_pressure_bits(kn_max, NOTE_MEMORYSTATUS_PRESSURE_CRITICAL, task, kVMPressureWarning, kVMPressureCritical) == TRUE) { + if (is_knote_registered_modify_task_pressure_bits(kn_max, NOTE_MEMORYSTATUS_PRESSURE_CRITICAL, task, 0, kVMPressureCritical) == TRUE) { found_candidate = TRUE; } } @@ -4658,11 +6509,16 @@ memorystatus_update_vm_pressure(boolean_t target_foreground_process) } SLIST_FOREACH_SAFE(kn_cur, &memorystatus_klist, kn_selnext, kn_temp) { - proc_t knote_proc = kn_cur->kn_kq->kq_p; - pid_t knote_pid = 
knote_proc->p_pid; - if (knote_pid == target_pid) { - KNOTE_DETACH(&memorystatus_klist, kn_cur); - KNOTE_ATTACH(&dispatch_klist, kn_cur); + + int knote_pressure_level = convert_internal_pressure_level_to_dispatch_level(level_snapshot); + + if (is_knote_registered_modify_task_pressure_bits(kn_cur, knote_pressure_level, task, 0, level_snapshot) == TRUE) { + proc_t knote_proc = knote_get_kq(kn_cur)->kq_p; + pid_t knote_pid = knote_proc->p_pid; + if (knote_pid == target_pid) { + KNOTE_DETACH(&memorystatus_klist, kn_cur); + KNOTE_ATTACH(&dispatch_klist, kn_cur); + } } } @@ -4682,34 +6538,6 @@ memorystatus_update_vm_pressure(boolean_t target_foreground_process) break; } -try_dispatch_vm_clients: - if (kn_max == NULL && level_snapshot != kVMPressureNormal) { - /* - * We will exit this loop when we are done with - * notification clients (level and non-level based). - */ - if ((vm_pressure_notify_dispatch_vm_clients(target_foreground_process) == KERN_FAILURE) && (kn_max == NULL)) { - /* - * kn_max == NULL i.e. we didn't find any eligible clients for the level-based notifications - * AND - * we have failed to find any eligible clients for the non-level based notifications too. - * So, we are done. - */ - - return KERN_FAILURE; - } - } - - /* - * LD: This block of code below used to be invoked in the older memory notification scheme on embedded everytime - * a process was sent a memory pressure notification. The "memorystatus_klist" list was used to hold these - * privileged listeners. But now we have moved to the newer scheme and are trying to move away from the extra - * notifications. So the code is here in case we break compat. and need to send out notifications to the privileged - * apps. - */ -#if 0 -#endif /* 0 */ - if (memorystatus_manual_testing_on == TRUE) { /* * Testing out the pressure notification scheme. 
@@ -4745,9 +6573,6 @@ memorystatus_update_vm_pressure(boolean_t target_foreground_process) return KERN_SUCCESS; } -vm_pressure_level_t -convert_internal_pressure_level_to_dispatch_level(vm_pressure_level_t); - vm_pressure_level_t convert_internal_pressure_level_to_dispatch_level(vm_pressure_level_t internal_pressure_level) { @@ -4898,15 +6723,6 @@ sysctl_memorypressure_manual_trigger SYSCTL_HANDLER_ARGS if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_NORMAL) { memorystatus_manual_testing_on = FALSE; - - vm_pressure_klist_lock(); - vm_reset_active_list(); - vm_pressure_klist_unlock(); - } else { - - vm_pressure_klist_lock(); - vm_pressure_notification_without_levels(FALSE); - vm_pressure_klist_unlock(); } return 0; @@ -4967,7 +6783,6 @@ memorystatus_get_priority_list(memorystatus_priority_entry_t **list_ptr, size_t list_entry->pid = p->p_pid; list_entry->priority = p->p_memstat_effectivepriority; list_entry->user_data = p->p_memstat_userdata; -#if LEGACY_HIWATER /* * No need to consider P_MEMSTAT_MEMLIMIT_BACKGROUND anymore. @@ -4980,9 +6795,7 @@ memorystatus_get_priority_list(memorystatus_priority_entry_t **list_ptr, size_t } else { list_entry->limit = p->p_memstat_memlimit; } -#else - task_get_phys_footprint_limit(p->task, &list_entry->limit); -#endif + list_entry->state = memorystatus_build_state(p); list_entry++; @@ -5061,6 +6874,7 @@ memorystatus_update_levels_locked(boolean_t critical_only) { /* * If there's an entry in the first bucket, we have idle processes. 
*/ + memstat_bucket_t *first_bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE]; if (first_bucket->count) { memorystatus_available_pages_critical += memorystatus_available_pages_critical_idle_offset; @@ -5085,7 +6899,11 @@ memorystatus_update_levels_locked(boolean_t critical_only) { } } #endif - + + if (memorystatus_jetsam_policy & kPolicyMoreFree) { + memorystatus_available_pages_critical += memorystatus_policy_more_free_offset_pages; + } + if (critical_only) { return; } @@ -5100,6 +6918,50 @@ memorystatus_update_levels_locked(boolean_t critical_only) { #endif } +static int +sysctl_kern_memorystatus_policy_more_free SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2, oidp) + int error = 0, more_free = 0; + + /* + * TODO: Enable this privilege check? + * + * error = priv_check_cred(kauth_cred_get(), PRIV_VM_JETSAM, 0); + * if (error) + * return (error); + */ + + error = sysctl_handle_int(oidp, &more_free, 0, req); + if (error || !req->newptr) + return (error); + + if ((more_free && ((memorystatus_jetsam_policy & kPolicyMoreFree) == kPolicyMoreFree)) || + (!more_free && ((memorystatus_jetsam_policy & kPolicyMoreFree) == 0))) { + + /* + * No change in state. + */ + return 0; + } + + proc_list_lock(); + + if (more_free) { + memorystatus_jetsam_policy |= kPolicyMoreFree; + } else { + memorystatus_jetsam_policy &= ~kPolicyMoreFree; + } + + memorystatus_update_levels_locked(TRUE); + + proc_list_unlock(); + + return 0; +} +SYSCTL_PROC(_kern, OID_AUTO, memorystatus_policy_more_free, CTLTYPE_INT|CTLFLAG_WR|CTLFLAG_LOCKED|CTLFLAG_MASKED, + 0, 0, &sysctl_kern_memorystatus_policy_more_free, "I", ""); + /* * Get the at_boot snapshot */ @@ -5280,9 +7142,8 @@ memorystatus_cmd_get_jetsam_snapshot(int32_t flags, user_addr_t buffer, size_t b /* * The jetsam snapshot is never freed, its count is simply reset. 
*/ - snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0; - proc_list_lock(); + snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0; memorystatus_jetsam_snapshot_last_timestamp = 0; proc_list_unlock(); } @@ -5399,8 +7260,8 @@ memorystatus_cmd_grp_set_properties(int32_t flags, user_addr_t buffer, size_t bu if (entries[i].priority == -1) { /* Use as shorthand for default priority */ entries[i].priority = JETSAM_PRIORITY_DEFAULT; - } else if (entries[i].priority == JETSAM_PRIORITY_IDLE_DEFERRED) { - /* JETSAM_PRIORITY_IDLE_DEFERRED is reserved for internal use; + } else if ((entries[i].priority == system_procs_aging_band) || (entries[i].priority == applications_aging_band)) { + /* Both the aging bands are reserved for internal use; * if requested, adjust to JETSAM_PRIORITY_IDLE. */ entries[i].priority = JETSAM_PRIORITY_IDLE; } else if (entries[i].priority == JETSAM_PRIORITY_IDLE_HEAD) { @@ -5469,14 +7330,14 @@ memorystatus_cmd_grp_set_properties(int32_t flags, user_addr_t buffer, size_t bu } /* - * Take appropriate steps if moving proc out of the - * JETSAM_PRIORITY_IDLE_DEFERRED band. + * Take appropriate steps if moving proc out of + * either of the aging bands. */ - if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE_DEFERRED) { + if ((p->p_memstat_effectivepriority == system_procs_aging_band) || (p->p_memstat_effectivepriority == applications_aging_band)) { memorystatus_invalidate_idle_demotion_locked(p, TRUE); } - memorystatus_update_priority_locked(p, new_priority, head_insert); + memorystatus_update_priority_locked(p, new_priority, head_insert, false); } proc_list_unlock(); @@ -5613,6 +7474,61 @@ memorystatus_cmd_get_memlimit_properties(pid_t pid, user_addr_t buffer, size_t b } +/* + * SPI for kbd - pr24956468 + * This is a very simple snapshot that calculates how much a + * process's phys_footprint exceeds a specific memory limit. + * Only the inactive memory limit is supported for now. 
+ * The delta is returned as bytes in excess or zero. + */ +static int +memorystatus_cmd_get_memlimit_excess_np(pid_t pid, uint32_t flags, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval) { + int error = 0; + uint64_t footprint_in_bytes = 0; + uint64_t delta_in_bytes = 0; + int32_t memlimit_mb = 0; + uint64_t memlimit_bytes = 0; + + /* Validate inputs */ + if ((pid == 0) || (buffer == USER_ADDR_NULL) || (buffer_size != sizeof(uint64_t)) || (flags != 0)) { + return EINVAL; + } + + proc_t p = proc_find(pid); + if (!p) { + return ESRCH; + } + + /* + * Get the inactive limit. + * No locks taken since we hold a reference to the proc. + */ + + if (p->p_memstat_memlimit_inactive <= 0) { + task_convert_phys_footprint_limit(-1, &memlimit_mb); + } else { + memlimit_mb = p->p_memstat_memlimit_inactive; + } + + footprint_in_bytes = get_task_phys_footprint(p->task); + + proc_rele(p); + + memlimit_bytes = memlimit_mb * 1024 * 1024; /* MB to bytes */ + + /* + * Computed delta always returns >= 0 bytes + */ + if (footprint_in_bytes > memlimit_bytes) { + delta_in_bytes = footprint_in_bytes - memlimit_bytes; + } + + error = copyout(&delta_in_bytes, buffer, sizeof(delta_in_bytes)); + + return(error); +} + + static int memorystatus_cmd_get_pressure_status(int32_t *retval) { int error; @@ -5780,6 +7696,7 @@ memorystatus_set_memlimit_properties(pid_t pid, memorystatus_memlimit_properties p->p_pid, (p->p_memstat_memlimit > 0 ? p->p_memstat_memlimit : -1), (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT ? "F " : "NF"), p->p_memstat_effectivepriority, p->p_memstat_dirty, (p->p_memstat_dirty ? ((p->p_memstat_dirty & P_DIRTY) ? "isdirty" : "isclean") : "")); + DTRACE_MEMORYSTATUS2(memorystatus_set_memlimit, proc_t, p, int32_t, (p->p_memstat_memlimit > 0 ? p->p_memstat_memlimit : -1)); } proc_list_unlock(); @@ -5805,79 +7722,33 @@ proc_get_memstat_priority(proc_t p, boolean_t effective_priority) return 0; } -/* - * Description: - * Evaluates active vs. inactive process state. 
- * Processes that opt into dirty tracking are evaluated - * based on clean vs dirty state. - * dirty ==> active - * clean ==> inactive - * - * Process that do not opt into dirty tracking are - * evalulated based on priority level. - * Foreground or above ==> active - * Below Foreground ==> inactive - * - * Return: TRUE if active - * False if inactive - */ - -static boolean_t -proc_jetsam_state_is_active_locked(proc_t p) { - - if (p->p_memstat_dirty & P_DIRTY_TRACK) { - /* - * process has opted into dirty tracking - * active state is based on dirty vs. clean - */ - if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) { - /* - * process is dirty - * implies active state - */ - return TRUE; - } else { - /* - * process is clean - * implies inactive state - */ - return FALSE; - } - } else if (p->p_memstat_effectivepriority >= JETSAM_PRIORITY_FOREGROUND) { - /* - * process is Foreground or higher - * implies active state - */ - return TRUE; - } else { - /* - * process found below Foreground - * implies inactive state - */ - return FALSE; - } -} - #endif /* CONFIG_JETSAM */ int memorystatus_control(struct proc *p __unused, struct memorystatus_control_args *args, int *ret) { int error = EINVAL; + os_reason_t jetsam_reason = OS_REASON_NULL; #if !CONFIG_JETSAM #pragma unused(ret) + #pragma unused(jetsam_reason) #endif - /* Root only for now */ - if (!kauth_cred_issuser(kauth_cred_get())) { + /* Need to be root or have entitlement */ + if (!kauth_cred_issuser(kauth_cred_get()) && !IOTaskHasEntitlement(current_task(), MEMORYSTATUS_ENTITLEMENT)) { error = EPERM; goto out; } - - /* Sanity check */ - if (args->buffersize > MEMORYSTATUS_BUFFERSIZE_MAX) { - error = EINVAL; - goto out; + + /* + * Sanity check. + * Do not enforce it for snapshots. 
+ */ + if (args->command != MEMORYSTATUS_CMD_GET_JETSAM_SNAPSHOT) { + if (args->buffersize > MEMORYSTATUS_BUFFERSIZE_MAX) { + error = EINVAL; + goto out; + } } switch (args->command) { @@ -5894,6 +7765,9 @@ memorystatus_control(struct proc *p __unused, struct memorystatus_control_args * case MEMORYSTATUS_CMD_GET_MEMLIMIT_PROPERTIES: error = memorystatus_cmd_get_memlimit_properties(args->pid, args->buffer, args->buffersize, ret); break; + case MEMORYSTATUS_CMD_GET_MEMLIMIT_EXCESS: + error = memorystatus_cmd_get_memlimit_excess_np(args->pid, args->flags, args->buffer, args->buffersize, ret); + break; case MEMORYSTATUS_CMD_GRP_SET_PROPERTIES: error = memorystatus_cmd_grp_set_properties((int32_t)args->flags, args->buffer, args->buffersize, ret); break; @@ -5922,7 +7796,12 @@ memorystatus_control(struct proc *p __unused, struct memorystatus_control_args * /* Test commands */ #if DEVELOPMENT || DEBUG case MEMORYSTATUS_CMD_TEST_JETSAM: - error = memorystatus_kill_process_sync(args->pid, kMemorystatusKilled) ? 0 : EINVAL; + jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_GENERIC); + if (jetsam_reason == OS_REASON_NULL) { + printf("memorystatus_control: failed to allocate jetsam reason\n"); + } + + error = memorystatus_kill_process_sync(args->pid, kMemorystatusKilled, jetsam_reason) ? 
0 : EINVAL; break; case MEMORYSTATUS_CMD_TEST_JETSAM_SORT: error = memorystatus_cmd_test_jetsam_sort(args->pid, (int32_t)args->flags); @@ -5930,6 +7809,8 @@ memorystatus_control(struct proc *p __unused, struct memorystatus_control_args * case MEMORYSTATUS_CMD_SET_JETSAM_PANIC_BITS: error = memorystatus_cmd_set_panic_bits(args->buffer, args->buffersize); break; +#else /* DEVELOPMENT || DEBUG */ + #pragma unused(jetsam_reason) #endif /* DEVELOPMENT || DEBUG */ case MEMORYSTATUS_CMD_AGGRESSIVE_JETSAM_LENIENT_MODE_ENABLE: if (memorystatus_aggressive_jetsam_lenient_allowed == FALSE) { @@ -5939,6 +7820,7 @@ memorystatus_control(struct proc *p __unused, struct memorystatus_control_args * memorystatus_aggressive_jetsam_lenient_allowed = TRUE; memorystatus_aggressive_jetsam_lenient = TRUE; + error = 0; } break; case MEMORYSTATUS_CMD_AGGRESSIVE_JETSAM_LENIENT_MODE_DISABLE: @@ -5947,12 +7829,21 @@ memorystatus_control(struct proc *p __unused, struct memorystatus_control_args * #endif /* DEVELOPMENT || DEBUG */ memorystatus_aggressive_jetsam_lenient_allowed = FALSE; memorystatus_aggressive_jetsam_lenient = FALSE; + error = 0; break; #endif /* CONFIG_JETSAM */ case MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_ENABLE: case MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_DISABLE: error = memorystatus_low_mem_privileged_listener(args->command); break; + +#if CONFIG_JETSAM + case MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_ENABLE: + case MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_DISABLE: + error = memorystatus_update_inactive_jetsam_priority_band(args->pid, args->command, args->flags ? 
TRUE : FALSE); + break; +#endif /* CONFIG_JETSAM */ + default: break; } @@ -5965,8 +7856,15 @@ memorystatus_control(struct proc *p __unused, struct memorystatus_control_args * static int filt_memorystatusattach(struct knote *kn) { + int error; + kn->kn_flags |= EV_CLEAR; - return memorystatus_knote_register(kn); + error = memorystatus_knote_register(kn); + if (error) { + kn->kn_flags = EV_ERROR; + kn->kn_data = error; + } + return 0; } static void @@ -6002,6 +7900,19 @@ filt_memorystatus(struct knote *kn __unused, long hint) kn->kn_fflags = NOTE_MEMORYSTATUS_LOW_SWAP; } break; + + case kMemorystatusProcLimitWarn: + if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) { + kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN; + } + break; + + case kMemorystatusProcLimitCritical: + if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) { + kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL; + } + break; + default: break; } @@ -6010,6 +7921,55 @@ filt_memorystatus(struct knote *kn __unused, long hint) return (kn->kn_fflags != 0); } +static int +filt_memorystatustouch(struct knote *kn, struct kevent_internal_s *kev) +{ + int res; + + memorystatus_klist_lock(); + + /* + * copy in new kevent settings + * (saving the "desired" data and fflags). + */ + kn->kn_sfflags = kev->fflags; + + if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0) + kn->kn_udata = kev->udata; + + /* + * reset the output flags based on a + * combination of the old events and + * the new desired event list. 
+ */ + //kn->kn_fflags &= kn->kn_sfflags; + + res = (kn->kn_fflags != 0); + + memorystatus_klist_unlock(); + + return res; +} + +static int +filt_memorystatusprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev) +{ +#pragma unused(data) + int res; + + memorystatus_klist_lock(); + res = (kn->kn_fflags != 0); + if (res) { + *kev = kn->kn_kevent; + kn->kn_flags |= EV_CLEAR; /* automatic */ + kn->kn_fflags = 0; + kn->kn_data = 0; + } + memorystatus_klist_unlock(); + + return res; +} + static void memorystatus_klist_lock(void) { lck_mtx_lock(&memorystatus_klist_mutex); @@ -6032,7 +7992,9 @@ memorystatus_knote_register(struct knote *kn) { memorystatus_klist_lock(); - if (kn->kn_sfflags & (NOTE_MEMORYSTATUS_PRESSURE_NORMAL | NOTE_MEMORYSTATUS_PRESSURE_WARN | NOTE_MEMORYSTATUS_PRESSURE_CRITICAL | NOTE_MEMORYSTATUS_LOW_SWAP)) { + if (kn->kn_sfflags & (NOTE_MEMORYSTATUS_PRESSURE_NORMAL | NOTE_MEMORYSTATUS_PRESSURE_WARN | + NOTE_MEMORYSTATUS_PRESSURE_CRITICAL | NOTE_MEMORYSTATUS_LOW_SWAP | + NOTE_MEMORYSTATUS_PROC_LIMIT_WARN | NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL)) { KNOTE_ATTACH(&memorystatus_klist, kn); diff --git a/bsd/kern/kern_mib.c b/bsd/kern/kern_mib.c index 4157bfba5..9ef3a2479 100644 --- a/bsd/kern/kern_mib.c +++ b/bsd/kern/kern_mib.c @@ -383,7 +383,7 @@ SYSCTL_QUAD (_hw, HW_MEMSIZE, memsize, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LO SYSCTL_INT (_hw, OID_AUTO, packages, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &packages, 0, ""); /* - * Optional features can register nodes below hw.optional. + * Optional CPU features can register nodes below hw.optional. * * If the feature is not present, the node should either not be registered, * or it should return -1. 
If the feature is present, the node should return @@ -394,6 +394,11 @@ SYSCTL_NODE(_hw, OID_AUTO, optional, CTLFLAG_RW|CTLFLAG_LOCKED, NULL, "optional SYSCTL_INT(_hw_optional, OID_AUTO, floatingpoint, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (int *)NULL, 1, ""); /* always set */ +/* + * Optional device hardware features can be registered by drivers below hw.features + */ +SYSCTL_NODE(_hw, OID_AUTO, features, CTLFLAG_RD | CTLFLAG_LOCKED, NULL, "hardware features"); + /* * Deprecated variables. These are supported for backwards compatibility * purposes only. The MASKED flag requests that the variables not be diff --git a/bsd/kern/kern_mman.c b/bsd/kern/kern_mman.c index 1dd5bd3f0..318400dc9 100644 --- a/bsd/kern/kern_mman.c +++ b/bsd/kern/kern_mman.c @@ -95,9 +95,6 @@ #include #include #include -#if CONFIG_PROTECT -#include -#endif #include #include @@ -143,7 +140,7 @@ mmap(proc_t p, struct mmap_args *uap, user_addr_t *retval) * Map in special device (must be SHARED) or file */ struct fileproc *fp; - register struct vnode *vp; + struct vnode *vp; int flags; int prot; int err=0; @@ -224,15 +221,16 @@ mmap(proc_t p, struct mmap_args *uap, user_addr_t *retval) if ((flags & MAP_FIXED) || (flags & MAP_SHARED) || !(flags & MAP_ANON) || - (flags & MAP_RESILIENT_CODESIGN)) { + (flags & MAP_RESILIENT_CODESIGN) || + (flags & MAP_RESILIENT_MEDIA)) { return EINVAL; } } if ((flags & MAP_RESILIENT_CODESIGN) || (flags & MAP_RESILIENT_MEDIA)) { - assert(!(flags & MAP_JIT)); - if (flags & MAP_ANON) { + if ((flags & MAP_ANON) || + (flags & MAP_JIT)) { return EINVAL; } if (prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) { @@ -435,16 +433,6 @@ mmap(proc_t p, struct mmap_args *uap, user_addr_t *retval) goto bad; } #endif /* MAC */ - -#if CONFIG_PROTECT - { - error = cp_handle_vnop(vp, CP_READ_ACCESS | CP_WRITE_ACCESS, 0); - if (error) { - (void) vnode_put(vp); - goto bad; - } - } -#endif /* CONFIG_PROTECT */ } } @@ -778,7 +766,7 @@ munmap(__unused proc_t p, struct munmap_args *uap, __unused 
int32_t *retval) int mprotect(__unused proc_t p, struct mprotect_args *uap, __unused int32_t *retval) { - register vm_prot_t prot; + vm_prot_t prot; mach_vm_offset_t user_addr; mach_vm_size_t user_size; kern_return_t result; @@ -794,7 +782,7 @@ mprotect(__unused proc_t p, struct mprotect_args *uap, __unused int32_t *retval) user_map = current_map(); user_addr = (mach_vm_offset_t) uap->addr; user_size = (mach_vm_size_t) uap->len; - prot = (vm_prot_t)(uap->prot & (VM_PROT_ALL | VM_PROT_TRUSTED)); + prot = (vm_prot_t)(uap->prot & (VM_PROT_ALL | VM_PROT_TRUSTED | VM_PROT_STRIP_READ)); if (user_addr & vm_map_page_mask(user_map)) { /* UNIX SPEC: user address is not page-aligned, return EINVAL */ @@ -814,6 +802,7 @@ mprotect(__unused proc_t p, struct mprotect_args *uap, __unused int32_t *retval) prot |= VM_PROT_READ; #endif /* 3936456 */ + #if CONFIG_MACF /* * The MAC check for mprotect is of limited use for 2 reasons: @@ -881,7 +870,7 @@ minherit(__unused proc_t p, struct minherit_args *uap, __unused int32_t *retval) { mach_vm_offset_t addr; mach_vm_size_t size; - register vm_inherit_t inherit; + vm_inherit_t inherit; vm_map_t user_map; kern_return_t result; @@ -1112,6 +1101,8 @@ mlock(__unused proc_t p, struct mlock_args *uap, __unused int32_t *retvalval) if (result == KERN_RESOURCE_SHORTAGE) return EAGAIN; + else if (result == KERN_PROTECTION_FAILURE) + return EACCES; else if (result != KERN_SUCCESS) return ENOMEM; @@ -1243,12 +1234,16 @@ mremap_encrypted(__unused struct proc *p, struct mremap_encrypted_args *uap, __u .cputype = cputype, .cpusubtype = cpusubtype }; result = text_crypter_create(&crypt_info, cryptname, (void*)&crypt_data); -#if DEVELOPMENT || DEBUG - printf("APPLE_PROTECT: %d[%s] map %p [0x%llx:0x%llx] %s(%s) -> 0x%x\n", - p->p_pid, p->p_comm, - user_map, (uint64_t) user_addr, (uint64_t) (user_addr + user_size), - __FUNCTION__, vpath, result); -#endif /* DEVELOPMENT || DEBUG */ +#if VM_MAP_DEBUG_APPLE_PROTECT + if (vm_map_debug_apple_protect) { + 
printf("APPLE_PROTECT: %d[%s] map %p [0x%llx:0x%llx] %s(%s) -> 0x%x\n", + p->p_pid, p->p_comm, + user_map, + (uint64_t) user_addr, + (uint64_t) (user_addr + user_size), + __FUNCTION__, vpath, result); + } +#endif /* VM_MAP_DEBUG_APPLE_PROTECT */ FREE_ZONE(vpath, MAXPATHLEN, M_NAMEI); if(result) { diff --git a/bsd/kern/kern_newsysctl.c b/bsd/kern/kern_newsysctl.c index fdd86a948..6895674f9 100644 --- a/bsd/kern/kern_newsysctl.c +++ b/bsd/kern/kern_newsysctl.c @@ -78,6 +78,7 @@ #include #include +#include lck_grp_t * sysctl_lock_group = NULL; lck_rw_t * sysctl_geometry_lock = NULL; @@ -177,19 +178,6 @@ sysctl_register_oid(struct sysctl_oid *new_oidp) } } - if(sysctl_geometry_lock == NULL) - { - /* - * Initialise the geometry lock for reading/modifying the - * sysctl tree. This is done here because IOKit registers - * some sysctl's before bsd_init() calls - * sysctl_register_fixed(). - */ - - sysctl_lock_group = lck_grp_alloc_init("sysctl", NULL); - sysctl_geometry_lock = lck_rw_alloc_init(sysctl_lock_group, NULL); - sysctl_unlocked_node_lock = lck_mtx_alloc_init(sysctl_lock_group, NULL); - } /* Get the write lock to modify the geometry */ lck_rw_lock_exclusive(sysctl_geometry_lock); @@ -321,21 +309,35 @@ sysctl_unregister_set(const char *set) } } - /* - * Register the kernel's oids on startup. + * Exported in BSDKernel.exports, kept for binary compatibility */ - +#if defined(__x86_64__) void -sysctl_register_all() +sysctl_register_fixed(void) { - sysctl_register_set("__sysctl_set"); } +#endif + +/* + * Register the kernel's oids on startup. + */ void -sysctl_register_fixed(void) +sysctl_early_init(void) { - sysctl_register_all(); + /* + * Initialize the geometry lock for reading/modifying the + * sysctl tree. This is done here because IOKit registers + * some sysctl's before bsd_init() would otherwise perform + * subsystem initialization. 
+ */ + + sysctl_lock_group = lck_grp_alloc_init("sysctl", NULL); + sysctl_geometry_lock = lck_rw_alloc_init(sysctl_lock_group, NULL); + sysctl_unlocked_node_lock = lck_mtx_alloc_init(sysctl_lock_group, NULL); + + sysctl_register_set("__sysctl_set"); } /* diff --git a/bsd/kern/kern_proc.c b/bsd/kern/kern_proc.c index 549024e9d..0d719cb4d 100644 --- a/bsd/kern/kern_proc.c +++ b/bsd/kern/kern_proc.c @@ -153,18 +153,24 @@ extern int cs_debug; #if DEBUG #define __PROC_INTERNAL_DEBUG 1 #endif +#if CONFIG_COREDUMP /* Name to give to core files */ __XNU_PRIVATE_EXTERN char corefilename[MAXPATHLEN+1] = {"/cores/core.%P"}; +#endif #if PROC_REF_DEBUG -extern uint32_t fastbacktrace(uintptr_t* bt, uint32_t max_frames) __attribute__((noinline)); +#include #endif -static void orphanpg(struct pgrp *pg); -void proc_name_kdp(task_t t, char * buf, int size); -int proc_threadname_kdp(void *uth, char *buf, size_t size); -void proc_starttime_kdp(void *p, uint64_t *tv_sec, uint64_t *tv_usec); -char *proc_name_address(void *p); +static void orphanpg(struct pgrp * pg); +void proc_name_kdp(task_t t, char * buf, int size); +void * proc_get_uthread_uu_threadlist(void * uthread_v); +int proc_threadname_kdp(void * uth, char * buf, size_t size); +void proc_starttime_kdp(void * p, uint64_t * tv_sec, uint64_t * tv_usec, uint64_t * abstime); +char * proc_name_address(void * p); + +/* TODO: make a header that's exported and usable in osfmk */ +char* proc_best_name(proc_t p); static void pgrp_add(struct pgrp * pgrp, proc_t parent, proc_t child); static void pgrp_remove(proc_t p); @@ -182,6 +188,20 @@ struct fixjob_iterargs { int fixjob_callback(proc_t, void *); +uint64_t get_current_unique_pid(void); + + +uint64_t +get_current_unique_pid(void) +{ + proc_t p = current_proc(); + + if (p) + return p->p_uniqueid; + else + return 0; +} + /* * Initialize global process hashing structures. 
*/ @@ -410,7 +430,7 @@ record_procref(proc_t p, int count) { if (count == 1) { if (uth->uu_pindex < NUM_PROC_REFS_TO_TRACK) { - fastbacktrace((uintptr_t *) &uth->uu_proc_pcs[uth->uu_pindex], PROC_REF_STACK_DEPTH); + backtrace((uintptr_t *) &uth->uu_proc_pcs[uth->uu_pindex], PROC_REF_STACK_DEPTH); uth->uu_proc_ps[uth->uu_pindex] = p; uth->uu_pindex++; @@ -775,9 +795,8 @@ proc_name_kdp(task_t t, char * buf, int size) strlcpy(buf, &p->p_comm[0], MIN((int)sizeof(p->p_comm), size)); } - int -proc_threadname_kdp(void *uth, char *buf, size_t size) +proc_threadname_kdp(void * uth, char * buf, size_t size) { if (size < MAXTHREADNAMESIZE) { /* this is really just a protective measure for the future in @@ -798,7 +817,7 @@ proc_threadname_kdp(void *uth, char *buf, size_t size) * thus the input arguments will in general be unaligned. We have to handle * that here. */ void -proc_starttime_kdp(void *p, uint64_t *tv_sec, uint64_t *tv_usec) +proc_starttime_kdp(void *p, uint64_t *tv_sec, uint64_t *tv_usec, uint64_t *abstime) { proc_t pp = (proc_t)p; struct uint64p { @@ -810,6 +829,12 @@ proc_starttime_kdp(void *p, uint64_t *tv_sec, uint64_t *tv_usec) ((struct uint64p *)tv_sec)->val = pp->p_start.tv_sec; if (tv_usec != NULL) ((struct uint64p *)tv_usec)->val = pp->p_start.tv_usec; + if (abstime != NULL) { + if (pp->p_stats != NULL) + *abstime = pp->p_stats->ps_start; + else + *abstime = 0; + } } } @@ -819,6 +844,14 @@ proc_name_address(void *p) return &((proc_t)p)->p_comm[0]; } +char * +proc_best_name(proc_t p) +{ + if (p->p_name[0] != 0) + return (&p->p_name[0]); + return (&p->p_comm[0]); +} + void proc_selfname(char * buf, int size) { @@ -1554,75 +1587,95 @@ fixjobc(proc_t p, struct pgrp *pgrp, int entering) proc_childrenwalk(p, fixjob_callback, &fjarg); } -/* - * A process group has become orphaned; - * if there are any stopped processes in the group, - * hang-up all process in that group. 
+/* + * A process group has become orphaned; if there are any stopped processes in + * the group, hang-up all process in that group. */ static void -orphanpg(struct pgrp * pgrp) +orphanpg(struct pgrp *pgrp) { + pid_t *pid_list; proc_t p; - pid_t * pid_list; - int count, pidcount, i, alloc_count; + vm_size_t pid_list_size = 0; + vm_size_t pid_list_size_needed = 0; + int pid_count = 0; + int pid_count_available = 0; - if (pgrp == PGRP_NULL) - return; - count = 0; - pgrp_lock(pgrp); - for (p = pgrp->pg_members.lh_first; p != 0; p = p->p_pglist.le_next) { - if (p->p_stat == SSTOP) { - for (p = pgrp->pg_members.lh_first; p != 0; - p = p->p_pglist.le_next) - count++; - break; /* ??? stops after finding one.. */ + assert(pgrp != NULL); + + /* allocate outside of the pgrp_lock */ + for (;;) { + pgrp_lock(pgrp); + + boolean_t should_iterate = FALSE; + pid_count_available = 0; + + PGMEMBERS_FOREACH(pgrp, p) { + pid_count_available++; + + if (p->p_stat == SSTOP) { + should_iterate = TRUE; + } + } + + if (pid_count_available == 0 || !should_iterate) { + pgrp_unlock(pgrp); + return; + } + + pid_list_size_needed = pid_count_available * sizeof(pid_t); + if (pid_list_size >= pid_list_size_needed) { + break; + } + pgrp_unlock(pgrp); + + if (pid_list_size != 0) { + kfree(pid_list, pid_list_size); + } + pid_list = kalloc(pid_list_size_needed); + if (!pid_list) { + return; } + pid_list_size = pid_list_size_needed; } - pgrp_unlock(pgrp); - count += 20; - if (count > hard_maxproc) - count = hard_maxproc; - alloc_count = count * sizeof(pid_t); - pid_list = (pid_t *)kalloc(alloc_count); - bzero(pid_list, alloc_count); - - pidcount = 0; - pgrp_lock(pgrp); - for (p = pgrp->pg_members.lh_first; p != 0; - p = p->p_pglist.le_next) { - if (p->p_stat == SSTOP) { - for (p = pgrp->pg_members.lh_first; p != 0; - p = p->p_pglist.le_next) { - pid_list[pidcount] = p->p_pid; - pidcount++; - if (pidcount >= count) - break; - } - break; /* ??? stops after finding one.. 
*/ + /* no orphaned processes */ + if (pid_list_size == 0) { + pgrp_unlock(pgrp); + return; + } + + PGMEMBERS_FOREACH(pgrp, p) { + pid_list[pid_count++] = proc_pid(p); + if (pid_count >= pid_count_available) { + break; } } pgrp_unlock(pgrp); - - if (pidcount == 0) - goto out; + if (pid_count == 0) { + goto out; + } - for (i = 0; i< pidcount; i++) { - /* No handling or proc0 */ - if (pid_list[i] == 0) + for (int i = 0; i < pid_count; i++) { + /* do not handle kernproc */ + if (pid_list[i] == 0) { continue; + } p = proc_find(pid_list[i]); - if (p) { - proc_transwait(p, 0); - pt_setrunnable(p); - psignal(p, SIGHUP); - psignal(p, SIGCONT); - proc_rele(p); + if (!p) { + continue; } + + proc_transwait(p, 0); + pt_setrunnable(p); + psignal(p, SIGHUP); + psignal(p, SIGCONT); + proc_rele(p); } + out: - kfree(pid_list, alloc_count); + kfree(pid_list, pid_list_size); return; } @@ -1645,6 +1698,7 @@ proc_is_forcing_hfs_case_sensitivity(proc_t p) return (p->p_vfs_iopolicy & P_VFS_IOPOLICY_FORCE_HFS_CASE_SENSITIVITY) ? 1 : 0; } +#if CONFIG_COREDUMP /* * proc_core_name(name, uid, pid) * Expand the name described in corefilename, using name, uid, and pid. 
@@ -1687,6 +1741,8 @@ proc_core_name(const char *name, uid_t uid, pid_t pid, char *cf_name, snprintf(id_buf, sizeof(id_buf), "%u", uid); appendstr = id_buf; break; + case '\0': /* format string ended in % symbol */ + goto endofstring; default: appendstr = ""; log(LOG_ERR, @@ -1710,7 +1766,12 @@ proc_core_name(const char *name, uid_t uid, pid_t pid, char *cf_name, log(LOG_ERR, "pid %ld (%s), uid (%u): corename is too long\n", (long)pid, name, (uint32_t)uid); return (1); +endofstring: + log(LOG_ERR, "pid %ld (%s), uid (%u): unexpected end of string after %% token\n", + (long)pid, name, (uint32_t)uid); + return (1); } +#endif /* CONFIG_COREDUMP */ /* Code Signing related routines */ @@ -1816,6 +1877,7 @@ csops_internal(pid_t pid, int ops, user_addr_t uaddr, user_size_t usersize, user case CS_OPS_MARKKILL: case CS_OPS_MARKRESTRICT: case CS_OPS_SET_STATUS: + case CS_OPS_CLEARINSTALLER: if ((error = mac_proc_check_set_cs_info(current_proc(), pt, ops))) goto out; break; @@ -1836,6 +1898,8 @@ csops_internal(pid_t pid, int ops, user_addr_t uaddr, user_size_t usersize, user retflags |= CS_ENFORCEMENT; if (csproc_get_platform_binary(pt)) retflags |= CS_PLATFORM_BINARY; + if (csproc_get_platform_path(pt)) + retflags |= CS_PLATFORM_PATH; proc_unlock(pt); if (uaddr != USER_ADDR_NULL) @@ -1954,8 +2018,7 @@ csops_internal(pid_t pid, int ops, user_addr_t uaddr, user_size_t usersize, user CS_KILL | CS_EXEC_SET_KILL | CS_RESTRICT | CS_REQUIRE_LV | - CS_ENFORCEMENT | CS_EXEC_SET_ENFORCEMENT | - CS_ENTITLEMENTS_VALIDATED; + CS_ENFORCEMENT | CS_EXEC_SET_ENFORCEMENT; proc_lock(pt); if (pt->p_csflags & CS_VALID) @@ -2032,6 +2095,12 @@ csops_internal(pid_t pid, int ops, user_addr_t uaddr, user_size_t usersize, user break; } + case CS_OPS_CLEARINSTALLER: + proc_lock(pt); + pt->p_csflags &= ~(CS_INSTALLER | CS_EXEC_SET_INSTALLER); + proc_unlock(pt); + break; + default: error = EINVAL; break; @@ -2042,362 +2111,370 @@ csops_internal(pid_t pid, int ops, user_addr_t uaddr, user_size_t usersize, 
user } int -proc_iterate(flags, callout, arg, filterfn, filterarg) - int flags; - int (*callout)(proc_t, void *); - void * arg; - int (*filterfn)(proc_t, void *); - void * filterarg; -{ - proc_t p; - pid_t * pid_list; - int count, pidcount, alloc_count, i, retval; +proc_iterate( + unsigned int flags, + proc_iterate_fn_t callout, + void *arg, + proc_iterate_fn_t filterfn, + void *filterarg) +{ + pid_t *pid_list; + vm_size_t pid_list_size = 0; + vm_size_t pid_list_size_needed = 0; + int pid_count = 0; + int pid_count_available = 0; + + assert(callout != NULL); + + /* allocate outside of the proc_list_lock */ + for (;;) { + proc_list_lock(); - count = nprocs+ 10; - if (count > hard_maxproc) - count = hard_maxproc; - alloc_count = count * sizeof(pid_t); - pid_list = (pid_t *)kalloc(alloc_count); - bzero(pid_list, alloc_count); + pid_count_available = nprocs; + assert(pid_count_available > 0); + pid_list_size_needed = pid_count_available * sizeof(pid_t); + if (pid_list_size >= pid_list_size_needed) { + break; + } + proc_list_unlock(); - proc_list_lock(); + if (pid_list_size != 0) { + kfree(pid_list, pid_list_size); + } + pid_list = kalloc(pid_list_size_needed); + if (!pid_list) { + return 1; + } + pid_list_size = pid_list_size_needed; + } + /* filter pids into pid_list */ - pidcount = 0; if (flags & PROC_ALLPROCLIST) { - for (p = allproc.lh_first; (p != 0); p = p->p_list.le_next) { - if (p->p_stat == SIDL) + proc_t p; + ALLPROC_FOREACH(p) { + /* ignore processes that are being forked */ + if (p->p_stat == SIDL) { + continue; + } + if ((filterfn != NULL) && (filterfn(p, filterarg) == 0)) { continue; - if ( (filterfn == 0 ) || (filterfn(p, filterarg) != 0)) { - pid_list[pidcount] = p->p_pid; - pidcount++; - if (pidcount >= count) - break; + } + + pid_list[pid_count++] = proc_pid(p); + if (pid_count >= pid_count_available) { + break; } } } - if ((pidcount < count ) && (flags & PROC_ZOMBPROCLIST)) { - for (p = zombproc.lh_first; p != 0; p = p->p_list.le_next) { - if ( 
(filterfn == 0 ) || (filterfn(p, filterarg) != 0)) { - pid_list[pidcount] = p->p_pid; - pidcount++; - if (pidcount >= count) - break; + + if ((pid_count < pid_count_available) && + (flags & PROC_ZOMBPROCLIST)) + { + proc_t p; + ZOMBPROC_FOREACH(p) { + if ((filterfn != NULL) && (filterfn(p, filterarg) == 0)) { + continue; + } + + pid_list[pid_count++] = proc_pid(p); + if (pid_count >= pid_count_available) { + break; } } } - proc_list_unlock(); + /* call callout on processes in the pid_list */ - for (i = 0; i< pidcount; i++) { - p = proc_find(pid_list[i]); + for (int i = 0; i < pid_count; i++) { + proc_t p = proc_find(pid_list[i]); if (p) { - if ((flags & PROC_NOWAITTRANS) == 0) + if ((flags & PROC_NOWAITTRANS) == 0) { proc_transwait(p, 0); - retval = callout(p, arg); - - switch (retval) { - case PROC_RETURNED: - proc_rele(p); - break; - case PROC_RETURNED_DONE: - proc_rele(p); - goto out; - case PROC_CLAIMED_DONE: - goto out; - case PROC_CLAIMED: - default: - break; } - } else if (flags & PROC_ZOMBPROCLIST) { - p = proc_find_zombref(pid_list[i]); - if (p != PROC_NULL) { - retval = callout(p, arg); - - switch (retval) { - case PROC_RETURNED: - proc_drop_zombref(p); - break; - case PROC_RETURNED_DONE: - proc_drop_zombref(p); - goto out; - case PROC_CLAIMED_DONE: - goto out; - case PROC_CLAIMED: - default: - break; - } - } - } - } - -out: - kfree(pid_list, alloc_count); - return(0); - -} + int callout_ret = callout(p, arg); + switch (callout_ret) { + case PROC_RETURNED_DONE: + proc_rele(p); + /* FALLTHROUGH */ + case PROC_CLAIMED_DONE: + goto out; -#if 0 -/* This is for iteration in case of trivial non blocking callouts */ -int -proc_scanall(flags, callout, arg) - int flags; - int (*callout)(proc_t, void *); - void * arg; -{ - proc_t p; - int retval; + case PROC_RETURNED: + proc_rele(p); + /* FALLTHROUGH */ + case PROC_CLAIMED: + break; + default: + panic("proc_iterate: callout returned %d for pid %d", + callout_ret, pid_list[i]); + break; + } + } else if (flags & 
PROC_ZOMBPROCLIST) { + p = proc_find_zombref(pid_list[i]); + if (!p) { + continue; + } + int callout_ret = callout(p, arg); - proc_list_lock(); + switch (callout_ret) { + case PROC_RETURNED_DONE: + proc_drop_zombref(p); + /* FALLTHROUGH */ + case PROC_CLAIMED_DONE: + goto out; + case PROC_RETURNED: + proc_drop_zombref(p); + /* FALLTHROUGH */ + case PROC_CLAIMED: + break; - if (flags & PROC_ALLPROCLIST) { - for (p = allproc.lh_first; (p != 0); p = p->p_list.le_next) { - retval = callout(p, arg); - if (retval == PROC_RETURNED_DONE) - goto out; - } - } - if (flags & PROC_ZOMBPROCLIST) { - for (p = zombproc.lh_first; p != 0; p = p->p_list.le_next) { - retval = callout(p, arg); - if (retval == PROC_RETURNED_DONE) - goto out; + default: + panic("proc_iterate: callout returned %d for zombie pid %d", + callout_ret, pid_list[i]); + break; + } } } -out: - proc_list_unlock(); +out: + kfree(pid_list, pid_list_size); + return 0; - return(0); } -#endif - -int -proc_rebootscan(callout, arg, filterfn, filterarg) - int (*callout)(proc_t, void *); - void * arg; - int (*filterfn)(proc_t, void *); - void * filterarg; +void +proc_rebootscan( + proc_iterate_fn_t callout, + void *arg, + proc_iterate_fn_t filterfn, + void *filterarg) { proc_t p; - int lockheld = 0, retval; + + assert(callout != NULL); proc_shutdown_exitcount = 0; -ps_allprocscan: +restart_foreach: proc_list_lock(); - lockheld = 1; - - for (p = allproc.lh_first; (p != 0); p = p->p_list.le_next) { - if ( (filterfn == 0 ) || (filterfn(p, filterarg) != 0)) { - p = proc_ref_locked(p); + ALLPROC_FOREACH(p) { + if ((filterfn != NULL) && filterfn(p, filterarg) == 0) { + continue; + } + p = proc_ref_locked(p); + if (!p) { + continue; + } - proc_list_unlock(); - lockheld = 0; + proc_list_unlock(); - if (p) { - proc_transwait(p, 0); - retval = callout(p, arg); - proc_rele(p); - - switch (retval) { - case PROC_RETURNED_DONE: - case PROC_CLAIMED_DONE: - goto out; - } - } - goto ps_allprocscan; - } /* filter pass */ - } /* allproc walk 
thru */ + proc_transwait(p, 0); + (void)callout(p, arg); + proc_rele(p); - if (lockheld == 1) { - proc_list_unlock(); - lockheld = 0; + goto restart_foreach; } -out: - return(0); - + proc_list_unlock(); } - int -proc_childrenwalk(parent, callout, arg) - struct proc * parent; - int (*callout)(proc_t, void *); - void * arg; +proc_childrenwalk( + proc_t parent, + proc_iterate_fn_t callout, + void *arg) { - register struct proc *p; - pid_t * pid_list; - int count, pidcount, alloc_count, i, retval; + pid_t *pid_list; + vm_size_t pid_list_size = 0; + vm_size_t pid_list_size_needed = 0; + int pid_count = 0; + int pid_count_available = 0; - count = nprocs+ 10; - if (count > hard_maxproc) - count = hard_maxproc; - alloc_count = count * sizeof(pid_t); - pid_list = (pid_t *)kalloc(alloc_count); - bzero(pid_list, alloc_count); + assert(parent != NULL); + assert(callout != NULL); + for (;;) { + proc_list_lock(); - proc_list_lock(); + pid_count_available = parent->p_childrencnt; + if (pid_count_available == 0) { + proc_list_unlock(); + return 0; + } + pid_list_size_needed = pid_count_available * sizeof(pid_t); + if (pid_list_size >= pid_list_size_needed) { + break; + } + proc_list_unlock(); - pidcount = 0; - for (p = parent->p_children.lh_first; (p != 0); p = p->p_sibling.le_next) { - if (p->p_stat == SIDL) + if (pid_list_size != 0) { + kfree(pid_list, pid_list_size); + } + pid_list = kalloc(pid_list_size_needed); + if (!pid_list) { + return 1; + } + pid_list_size = pid_list_size_needed; + } + + proc_t p; + PCHILDREN_FOREACH(parent, p) { + if (p->p_stat == SIDL) { continue; - pid_list[pidcount] = p->p_pid; - pidcount++; - if (pidcount >= count) + } + + pid_list[pid_count++] = proc_pid(p); + if (pid_count >= pid_count_available) { break; + } } - proc_list_unlock(); + proc_list_unlock(); - for (i = 0; i< pidcount; i++) { + for (int i = 0; i < pid_count; i++) { p = proc_find(pid_list[i]); - if (p) { - proc_transwait(p, 0); - retval = callout(p, arg); - - switch (retval) { - case 
PROC_RETURNED: - case PROC_RETURNED_DONE: - proc_rele(p); - if (retval == PROC_RETURNED_DONE) { - goto out; - } - break; - - case PROC_CLAIMED_DONE: - goto out; - case PROC_CLAIMED: - default: - break; - } + if (!p) { + continue; } - } -out: - kfree(pid_list, alloc_count); - return(0); + int callout_ret = callout(p, arg); + switch (callout_ret) { + case PROC_RETURNED_DONE: + proc_rele(p); + /* FALLTHROUGH */ + case PROC_CLAIMED_DONE: + goto out; + + case PROC_RETURNED: + proc_rele(p); + /* FALLTHROUGH */ + case PROC_CLAIMED: + break; + default: + panic("proc_childrenwalk: callout returned %d for pid %d", + callout_ret, pid_list[i]); + break; + } + } + +out: + kfree(pid_list, pid_list_size); + return 0; } -/* - */ -/* PGRP_BLOCKITERATE is not implemented yet */ int -pgrp_iterate(pgrp, flags, callout, arg, filterfn, filterarg) - struct pgrp *pgrp; - int flags; - int (*callout)(proc_t, void *); - void * arg; - int (*filterfn)(proc_t, void *); - void * filterarg; -{ +pgrp_iterate( + struct pgrp *pgrp, + unsigned int flags, + proc_iterate_fn_t callout, + void * arg, + proc_iterate_fn_t filterfn, + void * filterarg) +{ + pid_t *pid_list; proc_t p; - pid_t * pid_list; - int count, pidcount, i, alloc_count; - int retval; + vm_size_t pid_list_size = 0; + vm_size_t pid_list_size_needed = 0; + int pid_count = 0; + int pid_count_available = 0; + pid_t pgid; - int dropref = flags & PGRP_DROPREF; -#if 0 - int serialize = flags & PGRP_BLOCKITERATE; -#else - int serialize = 0; -#endif - if (pgrp == 0) - return(0); - count = pgrp->pg_membercnt + 10; - if (count > hard_maxproc) - count = hard_maxproc; - alloc_count = count * sizeof(pid_t); - pid_list = (pid_t *)kalloc(alloc_count); - bzero(pid_list, alloc_count); - - pgrp_lock(pgrp); - if (serialize != 0) { - while ((pgrp->pg_listflags & PGRP_FLAG_ITERABEGIN) == PGRP_FLAG_ITERABEGIN) { - pgrp->pg_listflags |= PGRP_FLAG_ITERWAIT; - msleep(&pgrp->pg_listflags, &pgrp->pg_mlock, 0, "pgrp_iterate", 0); + assert(pgrp != NULL); + 
assert(callout != NULL); + + for (;;) { + pgrp_lock(pgrp); + + pid_count_available = pgrp->pg_membercnt; + if (pid_count_available == 0) { + pgrp_unlock(pgrp); + return 0; + } + + pid_list_size_needed = pid_count_available * sizeof(pid_t); + if (pid_list_size >= pid_list_size_needed) { + break; + } + pgrp_unlock(pgrp); + + if (pid_list_size != 0) { + kfree(pid_list, pid_list_size); + } + pid_list = kalloc(pid_list_size_needed); + if (!pid_list) { + return 1; } - pgrp->pg_listflags |= PGRP_FLAG_ITERABEGIN; + pid_list_size = pid_list_size_needed; } pgid = pgrp->pg_id; - pidcount = 0; - for (p = pgrp->pg_members.lh_first; p != 0; - p = p->p_pglist.le_next) { - if ( (filterfn == 0 ) || (filterfn(p, filterarg) != 0)) { - pid_list[pidcount] = p->p_pid; - pidcount++; - if (pidcount >= count) - break; + PGMEMBERS_FOREACH(pgrp, p) { + if ((filterfn != NULL) && (filterfn(p, filterarg) == 0)) { + continue;; + } + pid_list[pid_count++] = proc_pid(p); + if (pid_count >= pid_count_available) { + break; } } - pgrp_unlock(pgrp); - if ((serialize == 0) && (dropref != 0)) - pg_rele(pgrp); + if (flags & PGRP_DROPREF) { + pg_rele(pgrp); + } - for (i = 0; i< pidcount; i++) { - /* No handling or proc0 */ - if (pid_list[i] == 0) + for (int i = 0; i< pid_count; i++) { + /* do not handle kernproc */ + if (pid_list[i] == 0) { continue; + } p = proc_find(pid_list[i]); - if (p) { - if (p->p_pgrpid != pgid) { - proc_rele(p); - continue; - } - proc_transwait(p, 0); - retval = callout(p, arg); - - switch (retval) { - case PROC_RETURNED: - case PROC_RETURNED_DONE: - proc_rele(p); - if (retval == PROC_RETURNED_DONE) { - goto out; - } - break; - - case PROC_CLAIMED_DONE: - goto out; - case PROC_CLAIMED: - default: - break; - } + if (!p) { + continue; } - } -out: - if (serialize != 0) { - pgrp_lock(pgrp); - pgrp->pg_listflags &= ~PGRP_FLAG_ITERABEGIN; - if ((pgrp->pg_listflags & PGRP_FLAG_ITERWAIT) == PGRP_FLAG_ITERWAIT) { - pgrp->pg_listflags &= ~PGRP_FLAG_ITERWAIT; - wakeup(&pgrp->pg_listflags); + 
if (p->p_pgrpid != pgid) { + proc_rele(p); + continue; + } + + int callout_ret = callout(p, arg); + + switch (callout_ret) { + case PROC_RETURNED: + proc_rele(p); + /* FALLTHROUGH */ + case PROC_CLAIMED: + break; + + case PROC_RETURNED_DONE: + proc_rele(p); + /* FALLTHROUGH */ + case PROC_CLAIMED_DONE: + goto out; + + default: + panic("pgrp_iterate: callout returned %d for pid %d", + callout_ret, pid_list[i]); } - pgrp_unlock(pgrp); - if (dropref != 0) - pg_rele(pgrp); } - kfree(pid_list, alloc_count); - return(0); + +out: + kfree(pid_list, pid_list_size); + return 0; } static void @@ -3198,3 +3275,10 @@ proc_chrooted(proc_t p) return retval; } + +void * +proc_get_uthread_uu_threadlist(void * uthread_v) +{ + uthread_t uth = (uthread_t)uthread_v; + return (uth != NULL) ? uth->uu_threadlist : NULL; +} diff --git a/bsd/kern/kern_prot.c b/bsd/kern/kern_prot.c index 75980efdc..df5fea45a 100644 --- a/bsd/kern/kern_prot.c +++ b/bsd/kern/kern_prot.c @@ -580,7 +580,7 @@ setsid(proc_t p, __unused struct setsid_args *uap, int32_t *retval) * XXX: Belongs in kern_proc.c */ int -setpgid(proc_t curp, register struct setpgid_args *uap, __unused int32_t *retval) +setpgid(proc_t curp, struct setpgid_args *uap, __unused int32_t *retval) { proc_t targp = PROC_NULL; /* target process */ struct pgrp *pg = PGRP_NULL; /* target pgrp */ diff --git a/bsd/kern/kern_resource.c b/bsd/kern/kern_resource.c index a994b8bd6..625916715 100644 --- a/bsd/kern/kern_resource.c +++ b/bsd/kern/kern_resource.c @@ -98,6 +98,7 @@ #include #include /* for thread_policy_set( ) */ #include +#include #include #include /* for absolutetime_to_microtime() */ @@ -108,20 +109,19 @@ #include #include +#include #include int donice(struct proc *curp, struct proc *chgp, int n); int dosetrlimit(struct proc *p, u_int which, struct rlimit *limp); int uthread_get_background_state(uthread_t); static void do_background_socket(struct proc *p, thread_t thread); -static int do_background_thread(struct proc *curp, thread_t 
thread, int priority); +static int do_background_thread(thread_t thread, int priority); static int do_background_proc(struct proc *curp, struct proc *targetp, int priority); static int set_gpudeny_proc(struct proc *curp, struct proc *targetp, int priority); static int proc_set_darwin_role(proc_t curp, proc_t targetp, int priority); static int proc_get_darwin_role(proc_t curp, proc_t targetp, int *priority); static int get_background_proc(struct proc *curp, struct proc *targetp, int *priority); -void proc_apply_task_networkbg_internal(proc_t, thread_t); -void proc_restore_task_networkbg_internal(proc_t, thread_t); int proc_pid_rusage(int pid, int flavor, user_addr_t buf, int32_t *retval); void gather_rusage_info(proc_t p, rusage_info_current *ru, int flavor); int fill_task_rusage(task_t task, rusage_info_current *ri); @@ -212,7 +212,7 @@ getpriority(struct proc *curp, struct getpriority_args *uap, int32_t *retval) } /* No need for iteration as it is a simple scan */ pgrp_lock(pg); - for (p = pg->pg_members.lh_first; p != 0; p = p->p_pglist.le_next) { + PGMEMBERS_FOREACH(pg, p) { if (p->p_nice < low) low = p->p_nice; } @@ -244,7 +244,7 @@ getpriority(struct proc *curp, struct getpriority_args *uap, int32_t *retval) if (uap->who != 0) return (EINVAL); - low = proc_get_task_policy(current_task(), current_thread(), TASK_POLICY_INTERNAL, TASK_POLICY_DARWIN_BG); + low = proc_get_thread_policy(current_thread(), TASK_POLICY_INTERNAL, TASK_POLICY_DARWIN_BG); break; @@ -417,7 +417,7 @@ setpriority(struct proc *curp, struct setpriority_args *uap, int32_t *retval) if (uap->who != 0) return (EINVAL); - error = do_background_thread(curp, current_thread(), uap->prio); + error = do_background_thread(current_thread(), uap->prio); found++; break; } @@ -593,8 +593,10 @@ proc_set_darwin_role(proc_t curp, proc_t targetp, int priority) if (!kauth_cred_issuser(ucred) && kauth_cred_getruid(ucred) && kauth_cred_getuid(ucred) != kauth_cred_getuid(target_cred) && kauth_cred_getruid(ucred) != 
kauth_cred_getuid(target_cred)) { - error = EPERM; - goto out; + if (priv_check_cred(ucred, PRIV_SETPRIORITY_DARWIN_ROLE, 0) != 0) { + error = EPERM; + goto out; + } } if (curp != targetp) { @@ -615,8 +617,8 @@ proc_set_darwin_role(proc_t curp, proc_t targetp, int priority) if ((error = proc_darwin_role_to_task_role(priority, &role))) goto out; - proc_set_task_policy(proc_task(targetp), THREAD_NULL, - TASK_POLICY_ATTRIBUTE, TASK_POLICY_ROLE, role); + proc_set_task_policy(proc_task(targetp), TASK_POLICY_ATTRIBUTE, + TASK_POLICY_ROLE, role); out: kauth_cred_unref(&target_cred); @@ -648,8 +650,7 @@ proc_get_darwin_role(proc_t curp, proc_t targetp, int *priority) #endif } - role = proc_get_task_policy(proc_task(targetp), THREAD_NULL, - TASK_POLICY_ATTRIBUTE, TASK_POLICY_ROLE); + role = proc_get_task_policy(proc_task(targetp), TASK_POLICY_ATTRIBUTE, TASK_POLICY_ROLE); *priority = proc_task_role_to_darwin_role(role); @@ -678,7 +679,7 @@ get_background_proc(struct proc *curp, struct proc *targetp, int *priority) external = (curp == targetp) ? TASK_POLICY_INTERNAL : TASK_POLICY_EXTERNAL; - *priority = proc_get_task_policy(current_task(), THREAD_NULL, external, TASK_POLICY_DARWIN_BG); + *priority = proc_get_task_policy(current_task(), external, TASK_POLICY_DARWIN_BG); out: kauth_cred_unref(&target_cred); @@ -729,7 +730,7 @@ do_background_proc(struct proc *curp, struct proc *targetp, int priority) break; } - proc_set_task_policy(proc_task(targetp), THREAD_NULL, external, TASK_POLICY_DARWIN_BG, enable); + proc_set_task_policy(proc_task(targetp), external, TASK_POLICY_DARWIN_BG, enable); out: kauth_cred_unref(&target_cred); @@ -807,12 +808,15 @@ do_background_socket(struct proc *p, thread_t thread) /* * do_background_thread + * + * Requires: thread reference + * * Returns: 0 Success * EPERM Tried to background while in vfork * XXX - todo - does this need a MACF hook? 
*/ static int -do_background_thread(struct proc *curp, thread_t thread, int priority) +do_background_thread(thread_t thread, int priority) { struct uthread *ut; int enable, external; @@ -824,6 +828,7 @@ do_background_thread(struct proc *curp, thread_t thread, int priority) if ((ut->uu_flag & UT_VFORK) != 0) return(EPERM); + /* Backgrounding is unsupported for workq threads */ if (thread_is_static_param(thread)) { return(EPERM); } @@ -838,8 +843,7 @@ do_background_thread(struct proc *curp, thread_t thread, int priority) enable = (priority == PRIO_DARWIN_BG) ? TASK_POLICY_ENABLE : TASK_POLICY_DISABLE; external = (current_thread() == thread) ? TASK_POLICY_INTERNAL : TASK_POLICY_EXTERNAL; - proc_set_task_policy_thread(curp->task, thread_tid(thread), external, - TASK_POLICY_DARWIN_BG, enable); + proc_set_thread_policy(thread, external, TASK_POLICY_DARWIN_BG, enable); return rv; } @@ -1552,14 +1556,16 @@ iopolicysys_disk(struct proc *p __unused, int cmd, int scope, int policy, struct /* Perform command */ switch(cmd) { case IOPOL_CMD_SET: - proc_set_task_policy(current_task(), thread, - TASK_POLICY_INTERNAL, policy_flavor, - policy); + if (thread != THREAD_NULL) + proc_set_thread_policy(thread, TASK_POLICY_INTERNAL, policy_flavor, policy); + else + proc_set_task_policy(current_task(), TASK_POLICY_INTERNAL, policy_flavor, policy); break; case IOPOL_CMD_GET: - policy = proc_get_task_policy(current_task(), thread, - TASK_POLICY_INTERNAL, policy_flavor); - + if (thread != THREAD_NULL) + policy = proc_get_thread_policy(thread, TASK_POLICY_INTERNAL, policy_flavor); + else + policy = proc_get_task_policy(current_task(), TASK_POLICY_INTERNAL, policy_flavor); iop_param->iop_policy = policy; break; default: @@ -1640,9 +1646,7 @@ iopolicysys_vfs(struct proc *p, int cmd, int scope, int policy, struct _iopol_pa return (error); } -/* BSD call back function for task_policy */ -void proc_apply_task_networkbg(void * bsd_info, thread_t thread); - +/* BSD call back function for task_policy 
networking changes */ void proc_apply_task_networkbg(void * bsd_info, thread_t thread) { diff --git a/bsd/kern/kern_sfi.c b/bsd/kern/kern_sfi.c index a9787dfb9..f42fd4b1f 100644 --- a/bsd/kern/kern_sfi.c +++ b/bsd/kern/kern_sfi.c @@ -41,6 +41,7 @@ #include #include #include +#include /* * This file provides the syscall-based configuration facility @@ -173,10 +174,11 @@ static int proc_apply_sfi_managed(proc_t p, void * arg) } else { KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_PID_CLEAR_MANAGED) | DBG_FUNC_NONE, pid, 0, 0, 0, 0); } - - proc_set_task_policy(p->task, THREAD_NULL, - TASK_POLICY_ATTRIBUTE, TASK_POLICY_SFI_MANAGED, - managed_enabled ? TASK_POLICY_ENABLE : TASK_POLICY_DISABLE); + + proc_set_task_policy(p->task, + TASK_POLICY_ATTRIBUTE, TASK_POLICY_SFI_MANAGED, + managed_enabled ? TASK_POLICY_ENABLE : TASK_POLICY_DISABLE); + return PROC_RETURNED; } @@ -240,7 +242,7 @@ int sfi_pidctl(struct proc *p __unused, struct sfi_pidctl_args *uap, int32_t *re break; } - managed_enabled = proc_get_task_policy(targetp->task, THREAD_NULL, TASK_POLICY_ATTRIBUTE, TASK_POLICY_SFI_MANAGED); + managed_enabled = proc_get_task_policy(targetp->task, TASK_POLICY_ATTRIBUTE, TASK_POLICY_SFI_MANAGED); proc_rele(targetp); diff --git a/bsd/kern/kern_shutdown.c b/bsd/kern/kern_shutdown.c index 59a44295f..dedb8288c 100644 --- a/bsd/kern/kern_shutdown.c +++ b/bsd/kern/kern_shutdown.c @@ -107,9 +107,9 @@ static off_t sd_log_offset = 0; static int sd_filt1(proc_t, void *); static int sd_filt2(proc_t, void *); -static int sd_callback1(proc_t p, void * arg); -static int sd_callback2(proc_t p, void * arg); -static int sd_callback3(proc_t p, void * arg); +static int sd_callback1(proc_t p, void * arg); +static int sd_callback2(proc_t p, void * arg); +static int sd_callback3(proc_t p, void * arg); extern boolean_t panic_include_zprint; extern vm_offset_t panic_kext_memory_info; @@ -135,7 +135,7 @@ kernel_hwm_panic_info(void) return; } memory_info = (mach_memory_info_t 
*)panic_kext_memory_info; - vm_page_diagnose(memory_info, num_sites); + vm_page_diagnose(memory_info, num_sites, 0); return; } @@ -324,7 +324,7 @@ sd_filt1(proc_t p, void * args) } -static int +static int sd_callback1(proc_t p, void * args) { struct sd_iterargs * sd = (struct sd_iterargs *)args; @@ -346,9 +346,11 @@ sd_callback1(proc_t p, void * args) psignal(p, signo); if (countproc != 0) sd->activecount++; - } else + } else { proc_unlock(p); - return(PROC_RETURNED); + } + + return PROC_RETURNED; } static int @@ -369,7 +371,7 @@ sd_filt2(proc_t p, void * args) return(1); } -static int +static int sd_callback2(proc_t p, void * args) { struct sd_iterargs * sd = (struct sd_iterargs *)args; @@ -390,14 +392,14 @@ sd_callback2(proc_t p, void * args) psignal(p, signo); if (countproc != 0) sd->activecount++; - } else + } else { proc_unlock(p); + } - return(PROC_RETURNED); - + return PROC_RETURNED; } -static int +static int sd_callback3(proc_t p, void * args) { struct sd_iterargs * sd = (struct sd_iterargs *)args; @@ -431,10 +433,11 @@ sd_callback3(proc_t p, void * args) sd->activecount++; exit1(p, 1, (int *)NULL); } - } else + } else { proc_unlock(p); + } - return(PROC_RETURNED); + return PROC_RETURNED; } diff --git a/bsd/kern/kern_sig.c b/bsd/kern/kern_sig.c index bf5507903..7fa64636b 100644 --- a/bsd/kern/kern_sig.c +++ b/bsd/kern/kern_sig.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1995-2007 Apple Inc. All rights reserved. + * Copyright (c) 1995-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -88,6 +88,7 @@ #include #include #include +#include #include #include @@ -101,10 +102,13 @@ #include #include /* for coredump */ #include /* for APC support */ +#include #include /* extern void *get_bsdtask_info(task_t); */ #include #include #include +#include + #include #include #include @@ -133,8 +137,6 @@ extern void doexception(int exc, mach_exception_code_t code, static void stop(proc_t, proc_t); int cansignal(proc_t, kauth_cred_t, proc_t, int, int); int killpg1(proc_t, int, int, int, int); -static void psignal_uthread(thread_t, int); -static void psignal_try_thread(proc_t, thread_t, int signum); kern_return_t do_bsdexception(int, int, int); void __posix_sem_syscall_return(kern_return_t); char *proc_name_address(void *p); @@ -148,14 +150,15 @@ kern_return_t semaphore_wait_trap_internal(mach_port_name_t, void (*)(kern_retur static int filt_sigattach(struct knote *kn); static void filt_sigdetach(struct knote *kn); static int filt_signal(struct knote *kn, long hint); -static void filt_signaltouch(struct knote *kn, struct kevent_internal_s *kev, - long type); +static int filt_signaltouch(struct knote *kn, struct kevent_internal_s *kev); +static int filt_signalprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev); struct filterops sig_filtops = { .f_attach = filt_sigattach, .f_detach = filt_sigdetach, .f_event = filt_signal, .f_touch = filt_signaltouch, + .f_process = filt_signalprocess, }; /* structures and fns for killpg1 iterartion callback and filters */ @@ -187,8 +190,8 @@ static kern_return_t get_signalthread(proc_t, int, thread_t *); #define PSIG_THREAD 0x4 #define PSIG_TRY_THREAD 0x8 - -static void psignal_internal(proc_t p, task_t task, thread_t thread, int flavor, int signum); +static os_reason_t build_signal_reason(int signum, const char *procname); +static void psignal_internal(proc_t p, task_t task, thread_t thread, int flavor, int signum, os_reason_t signal_reason); /* * 
NOTE: Source and target may *NOT* overlap! (target is smaller) @@ -637,6 +640,8 @@ setsigvec(proc_t p, __unused thread_t thread, int signum, struct __kern_sigactio struct sigacts *ps = p->p_sigacts; int bit; + assert(signum < NSIG); + if ((signum == SIGKILL || signum == SIGSTOP) && sa->sa_handler != SIG_DFL) return(EINVAL); @@ -987,6 +992,7 @@ __pthread_canceled(__unused proc_t p, return(EINVAL); } +__attribute__((noreturn)) void __posix_sem_syscall_return(kern_return_t kern_result) { @@ -1489,6 +1495,200 @@ kill(proc_t cp, struct kill_args *uap, __unused int32_t *retval) /* NOTREACHED */ } +os_reason_t +build_userspace_exit_reason(uint32_t reason_namespace, uint64_t reason_code, user_addr_t payload, uint32_t payload_size, + user_addr_t reason_string, uint64_t reason_flags) +{ + os_reason_t exit_reason = OS_REASON_NULL; + + int error = 0; + int num_items_to_copy = 0; + uint32_t user_data_to_copy = 0; + char *reason_user_desc = NULL; + size_t reason_user_desc_len = 0; + + exit_reason = os_reason_create(reason_namespace, reason_code); + if (exit_reason == OS_REASON_NULL) { + printf("build_userspace_exit_reason: failed to allocate exit reason\n"); + return exit_reason; + } + + exit_reason->osr_flags |= OS_REASON_FLAG_FROM_USERSPACE; + + /* + * Only apply flags that are allowed to be passed from userspace. 
+ */ + exit_reason->osr_flags |= (reason_flags & OS_REASON_FLAG_MASK_ALLOWED_FROM_USER); + if ((reason_flags & OS_REASON_FLAG_MASK_ALLOWED_FROM_USER) != reason_flags) { + printf("build_userspace_exit_reason: illegal flags passed from userspace (some masked off) 0x%llx, ns: %u, code 0x%llx\n", + reason_flags, reason_namespace, reason_code); + } + + if (!(exit_reason->osr_flags & OS_REASON_FLAG_NO_CRASH_REPORT)) { + exit_reason->osr_flags |= OS_REASON_FLAG_GENERATE_CRASH_REPORT; + } + + if (payload != USER_ADDR_NULL) { + if (payload_size == 0) { + printf("build_userspace_exit_reason: exit reason with namespace %u, nonzero payload but zero length\n", + reason_namespace); + exit_reason->osr_flags |= OS_REASON_FLAG_BAD_PARAMS; + payload = USER_ADDR_NULL; + } else { + num_items_to_copy++; + + if (payload_size > EXIT_REASON_PAYLOAD_MAX_LEN) { + exit_reason->osr_flags |= OS_REASON_FLAG_PAYLOAD_TRUNCATED; + payload_size = EXIT_REASON_PAYLOAD_MAX_LEN; + } + + user_data_to_copy += payload_size; + } + } + + if (reason_string != USER_ADDR_NULL) { + reason_user_desc = (char *) kalloc(EXIT_REASON_USER_DESC_MAX_LEN); + + if (reason_user_desc != NULL) { + error = copyinstr(reason_string, (void *) reason_user_desc, + EXIT_REASON_USER_DESC_MAX_LEN, &reason_user_desc_len); + + if (error == 0) { + num_items_to_copy++; + user_data_to_copy += reason_user_desc_len; + } else if (error == ENAMETOOLONG) { + num_items_to_copy++; + reason_user_desc[EXIT_REASON_USER_DESC_MAX_LEN - 1] = '\0'; + user_data_to_copy += reason_user_desc_len; + } else { + exit_reason->osr_flags |= OS_REASON_FLAG_FAILED_DATA_COPYIN; + kfree(reason_user_desc, EXIT_REASON_USER_DESC_MAX_LEN); + reason_user_desc = NULL; + reason_user_desc_len = 0; + } + } + } + + if (num_items_to_copy != 0) { + uint32_t reason_buffer_size_estimate = 0; + mach_vm_address_t data_addr = 0; + + reason_buffer_size_estimate = kcdata_estimate_required_buffer_size(num_items_to_copy, user_data_to_copy); + + error = 
os_reason_alloc_buffer(exit_reason, reason_buffer_size_estimate); + if (error != 0) { + printf("build_userspace_exit_reason: failed to allocate signal reason buffer\n"); + goto out_failed_copyin; + } + + if (reason_user_desc != NULL && reason_user_desc_len != 0) { + if (KERN_SUCCESS == kcdata_get_memory_addr(&exit_reason->osr_kcd_descriptor, + EXIT_REASON_USER_DESC, + reason_user_desc_len, + &data_addr)) { + + kcdata_memcpy(&exit_reason->osr_kcd_descriptor, (mach_vm_address_t) data_addr, + reason_user_desc, reason_user_desc_len); + } else { + printf("build_userspace_exit_reason: failed to allocate space for reason string\n"); + goto out_failed_copyin; + } + } + + if (payload != USER_ADDR_NULL) { + if (KERN_SUCCESS == + kcdata_get_memory_addr(&exit_reason->osr_kcd_descriptor, + EXIT_REASON_USER_PAYLOAD, + payload_size, + &data_addr)) { + error = copyin(payload, (void *) data_addr, payload_size); + if (error) { + printf("build_userspace_exit_reason: failed to copy in payload data with error %d\n", error); + goto out_failed_copyin; + } + } else { + printf("build_userspace_exit_reason: failed to allocate space for payload data\n"); + goto out_failed_copyin; + } + } + } + + if (reason_user_desc != NULL) { + kfree(reason_user_desc, EXIT_REASON_USER_DESC_MAX_LEN); + reason_user_desc = NULL; + reason_user_desc_len = 0; + } + + return exit_reason; + +out_failed_copyin: + + if (reason_user_desc != NULL) { + kfree(reason_user_desc, EXIT_REASON_USER_DESC_MAX_LEN); + reason_user_desc = NULL; + reason_user_desc_len = 0; + } + + exit_reason->osr_flags |= OS_REASON_FLAG_FAILED_DATA_COPYIN; + os_reason_alloc_buffer(exit_reason, 0); + return exit_reason; +} + +static int +terminate_with_payload_internal(struct proc *cur_proc, int target_pid, uint32_t reason_namespace, + uint64_t reason_code, user_addr_t payload, uint32_t payload_size, + user_addr_t reason_string, uint64_t reason_flags) +{ + proc_t target_proc = PROC_NULL; + kauth_cred_t cur_cred = kauth_cred_get(); + int signum = 
SIGKILL; + + os_reason_t signal_reason = OS_REASON_NULL; + + AUDIT_ARG(pid, target_pid); + if ((target_pid <= 0) || (cur_proc->p_pid == target_pid)) { + return EINVAL; + } + + if (reason_namespace == OS_REASON_INVALID || + reason_namespace > OS_REASON_MAX_VALID_NAMESPACE) { + + return EINVAL; + } + + target_proc = proc_find(target_pid); + if (target_proc == PROC_NULL) { + return ESRCH; + } + + AUDIT_ARG(process, target_proc); + + if (!cansignal(cur_proc, cur_cred, target_proc, signum, 0)) { + proc_rele(target_proc); + return EPERM; + } + + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) | DBG_FUNC_NONE, + target_proc->p_pid, reason_namespace, + reason_code, 0, 0); + + signal_reason = build_userspace_exit_reason(reason_namespace, reason_code, payload, payload_size, + reason_string, reason_flags); + + psignal_with_reason(target_proc, signum, signal_reason); + proc_rele(target_proc); + + return 0; +} + +int +terminate_with_payload(struct proc *cur_proc, struct terminate_with_payload_args *args, + __unused int32_t *retval) +{ + return terminate_with_payload_internal(cur_proc, args->pid, args->reason_namespace, args->reason_code, args->payload, + args->payload_size, args->reason_string, args->reason_flags); +} + static int killpg1_filt(proc_t p, void * arg) { @@ -1608,7 +1808,7 @@ killpg1(proc_t cp, int signum, int pgid, int all, int posix) /* PGRP_DROPREF drops the pgrp refernce */ - pgrp_iterate(pgrp, PGRP_BLOCKITERATE | PGRP_DROPREF, killpg1_callback, &karg, + pgrp_iterate(pgrp, PGRP_DROPREF, killpg1_callback, &karg, killpg1_pgrpfilt, NULL); } error = (nfound ? 0 : (posix ? 
EPERM : ESRCH)); @@ -1663,7 +1863,7 @@ void pgsignal(struct pgrp *pgrp, int signum, int checkctty) { if (pgrp != PGRP_NULL) { - pgrp_iterate(pgrp, PGRP_BLOCKITERATE, pgsignal_callback, &signum, pgsignal_filt, &checkctty); + pgrp_iterate(pgrp, 0, pgsignal_callback, &signum, pgsignal_filt, &checkctty); } } @@ -1675,7 +1875,7 @@ tty_pgsignal(struct tty *tp, int signum, int checkctty) pg = tty_pgrp(tp); if (pg != PGRP_NULL) { - pgrp_iterate(pg, PGRP_BLOCKITERATE, pgsignal_callback, &signum, pgsignal_filt, &checkctty); + pgrp_iterate(pg, 0, pgsignal_callback, &signum, pgsignal_filt, &checkctty); pg_rele(pg); } } @@ -1683,7 +1883,7 @@ tty_pgsignal(struct tty *tp, int signum, int checkctty) * Send a signal caused by a trap to a specific thread. */ void -threadsignal(thread_t sig_actthread, int signum, mach_exception_code_t code) +threadsignal(thread_t sig_actthread, int signum, mach_exception_code_t code, boolean_t set_exitreason) { struct uthread *uth; struct task * sig_task; @@ -1711,12 +1911,63 @@ threadsignal(thread_t sig_actthread, int signum, mach_exception_code_t code) uth->uu_siglist |= mask; uth->uu_code = code; + + /* Attempt to establish whether the signal will be fatal (mirrors logic in psignal_internal()) */ + if (set_exitreason && ((p->p_lflag & P_LTRACED) || (!(uth->uu_sigwait & mask) + && !(uth->uu_sigmask & mask) && !(p->p_sigcatch & mask))) && + !(mask & stopsigmask) && !(mask & contsigmask)) { + + if (uth->uu_exit_reason == OS_REASON_NULL) { + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) | DBG_FUNC_NONE, + p->p_pid, OS_REASON_SIGNAL, signum, 0, 0); + + os_reason_t signal_reason = build_signal_reason(signum, "exc handler"); + + set_thread_exit_reason(sig_actthread, signal_reason, TRUE); + + /* We dropped/consumed the reference in set_thread_exit_reason() */ + signal_reason = OS_REASON_NULL; + } + } + proc_unlock(p); /* mark on process as well */ signal_setast(sig_actthread); } +void +set_thread_exit_reason(void *th, void 
*reason, boolean_t proc_locked) +{ + struct uthread *targ_uth = get_bsdthread_info(th); + struct task *targ_task = NULL; + proc_t targ_proc = NULL; + + os_reason_t exit_reason = (os_reason_t)reason; + + if (exit_reason == OS_REASON_NULL) + return; + + if (!proc_locked) { + targ_task = get_threadtask(th); + targ_proc = (proc_t)(get_bsdtask_info(targ_task)); + + proc_lock(targ_proc); + } + + if (targ_uth->uu_exit_reason == OS_REASON_NULL) { + targ_uth->uu_exit_reason = exit_reason; + } else { + /* The caller expects that we drop a reference on the exit reason */ + os_reason_free(exit_reason); + } + + if (!proc_locked) { + assert(targ_proc != NULL); + proc_unlock(targ_proc); + } +} + /* * get_signalthread * @@ -1766,11 +2017,71 @@ get_signalthread(proc_t p, int signum, thread_t * thr) return(KERN_FAILURE); } +static os_reason_t +build_signal_reason(int signum, const char *procname) +{ + os_reason_t signal_reason = OS_REASON_NULL; + proc_t sender_proc = current_proc(); + uint32_t reason_buffer_size_estimate = 0, proc_name_length = 0; + const char *default_sender_procname = "unknown"; + mach_vm_address_t data_addr; + int ret; + + signal_reason = os_reason_create(OS_REASON_SIGNAL, signum); + if (signal_reason == OS_REASON_NULL) { + printf("build_signal_reason: unable to allocate signal reason structure.\n"); + return signal_reason; + } + + reason_buffer_size_estimate = kcdata_estimate_required_buffer_size(2, sizeof(sender_proc->p_name) + + sizeof(sender_proc->p_pid)); + + ret = os_reason_alloc_buffer(signal_reason, reason_buffer_size_estimate); + if (ret != 0) { + printf("build_signal_reason: unable to allocate signal reason buffer.\n"); + return signal_reason; + } + + if (KERN_SUCCESS == kcdata_get_memory_addr(&signal_reason->osr_kcd_descriptor, KCDATA_TYPE_PID, + sizeof(sender_proc->p_pid), &data_addr)) { + kcdata_memcpy(&signal_reason->osr_kcd_descriptor, data_addr, &sender_proc->p_pid, + sizeof(sender_proc->p_pid)); + } else { + printf("build_signal_reason: exceeded 
space in signal reason buf, unable to log PID\n"); + } + + proc_name_length = sizeof(sender_proc->p_name); + if (KERN_SUCCESS == kcdata_get_memory_addr(&signal_reason->osr_kcd_descriptor, KCDATA_TYPE_PROCNAME, + proc_name_length, &data_addr)) { + if (procname) { + char truncated_procname[proc_name_length]; + strncpy((char *) &truncated_procname, procname, proc_name_length); + truncated_procname[proc_name_length - 1] = '\0'; + + kcdata_memcpy(&signal_reason->osr_kcd_descriptor, data_addr, truncated_procname, + strlen((char *) &truncated_procname)); + } else if (*sender_proc->p_name) { + kcdata_memcpy(&signal_reason->osr_kcd_descriptor, data_addr, &sender_proc->p_name, + sizeof(sender_proc->p_name)); + } else { + kcdata_memcpy(&signal_reason->osr_kcd_descriptor, data_addr, &default_sender_procname, + strlen(default_sender_procname) + 1); + } + } else { + printf("build_signal_reason: exceeded space in signal reason buf, unable to log procname\n"); + } + + return signal_reason; +} + /* * Send the signal to the process. If the signal has an action, the action * is usually performed by the target process rather than the caller; we add * the signal to the set of pending signals for the process. * + * Always drops a reference on a signal_reason if one is provided, whether via + * passing it to a thread or deallocating directly. + * * Exceptions: * o When a stop signal is sent to a sleeping process that takes the * default action, the process is stopped without awakening it. @@ -1780,19 +2091,21 @@ get_signalthread(proc_t p, int signum, thread_t * thr) * Other ignored signals are discarded immediately. 
*/ static void -psignal_internal(proc_t p, task_t task, thread_t thread, int flavor, int signum) +psignal_internal(proc_t p, task_t task, thread_t thread, int flavor, int signum, os_reason_t signal_reason) { int prop; user_addr_t action = USER_ADDR_NULL; - proc_t sig_proc; - thread_t sig_thread; - task_t sig_task; - int mask; - struct uthread *uth; - kern_return_t kret; - uid_t r_uid; - proc_t pp; - kauth_cred_t my_cred; + proc_t sig_proc; + thread_t sig_thread; + task_t sig_task; + int mask; + struct uthread *uth; + kern_return_t kret; + uid_t r_uid; + proc_t pp; + kauth_cred_t my_cred; + char *launchd_exit_reason_desc = NULL; + boolean_t update_thread_policy = FALSE; if ((u_int)signum >= NSIG || signum == 0) panic("psignal: bad signal number %d", signum); @@ -1807,10 +2120,20 @@ psignal_internal(proc_t p, task_t task, thread_t thread, int flavor, int signum) #endif /* SIGNAL_DEBUG */ /* catch unexpected initproc kills early for easier debuggging */ - if (signum == SIGKILL && p == initproc) - panic_plain("unexpected SIGKILL of %s %s", + if (signum == SIGKILL && p == initproc) { + if (signal_reason == NULL) { + panic_plain("unexpected SIGKILL of %s %s (no reason provided)", (p->p_name[0] != '\0' ? p->p_name : "initproc"), ((p->p_csflags & CS_KILLED) ? "(CS_KILLED)" : "")); + } else { + launchd_exit_reason_desc = launchd_exit_reason_get_string_desc(signal_reason); + panic_plain("unexpected SIGKILL of %s %s with reason -- namespace %d code 0x%llx description %." LAUNCHD_PANIC_REASON_STRING_MAXLEN "s", + (p->p_name[0] != '\0' ? p->p_name : "initproc"), + ((p->p_csflags & CS_KILLED) ? "(CS_KILLED)" : ""), + signal_reason->osr_namespace, signal_reason->osr_code, + launchd_exit_reason_desc ? launchd_exit_reason_desc : "none"); + } + } /* * We will need the task pointer later. 
Grab it now to @@ -1836,8 +2159,10 @@ psignal_internal(proc_t p, task_t task, thread_t thread, int flavor, int signum) sig_proc = p; } - if ((sig_task == TASK_NULL) || is_kerneltask(sig_task)) + if ((sig_task == TASK_NULL) || is_kerneltask(sig_task)) { + os_reason_free(signal_reason); return; + } /* * do not send signals to the process that has the thread @@ -1848,6 +2173,7 @@ psignal_internal(proc_t p, task_t task, thread_t thread, int flavor, int signum) */ if (ISSET(sig_proc->p_flag, P_REBOOT) || ISSET(sig_proc->p_lflag, P_LEXIT)) { DTRACE_PROC3(signal__discard, thread_t, sig_thread, proc_t, sig_proc, int, signum); + os_reason_free(signal_reason); return; } @@ -2061,6 +2387,7 @@ psignal_internal(proc_t p, task_t task, thread_t thread, int flavor, int signum) } else { /* Default action - varies */ if (mask & stopsigmask) { + assert(signal_reason == NULL); /* * These are the signals which by default * stop a process. @@ -2150,7 +2477,20 @@ psignal_internal(proc_t p, task_t task, thread_t thread, int flavor, int signum) * We would need to cover this approp down the line. */ act_set_astbsd(sig_thread); - thread_abort(sig_thread); + kret = thread_abort(sig_thread); + update_thread_policy = (kret == KERN_SUCCESS); + + if (uth->uu_exit_reason == OS_REASON_NULL) { + if (signal_reason == OS_REASON_NULL) { + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) | DBG_FUNC_NONE, + sig_proc->p_pid, OS_REASON_SIGNAL, signum, 0, 0); + + signal_reason = build_signal_reason(signum, NULL); + } + + os_reason_ref(signal_reason); + set_thread_exit_reason(sig_thread, signal_reason, TRUE); + } goto sigout_locked; @@ -2159,6 +2499,7 @@ psignal_internal(proc_t p, task_t task, thread_t thread, int flavor, int signum) * Let the process run. If it's sleeping on an * event, it remains so. 
*/ + assert(signal_reason == NULL); OSBitOrAtomic(P_CONTINUED, &sig_proc->p_flag); sig_proc->p_contproc = sig_proc->p_pid; @@ -2192,7 +2533,21 @@ psignal_internal(proc_t p, task_t task, thread_t thread, int flavor, int signum) */ if (((flavor & (PSIG_VFORK|PSIG_THREAD)) == 0) && (action == SIG_DFL) && (prop & SA_KILL)) { sig_proc->p_stat = SRUN; - thread_abort(sig_thread); + kret = thread_abort(sig_thread); + update_thread_policy = (kret == KERN_SUCCESS); + + if (uth->uu_exit_reason == OS_REASON_NULL) { + if (signal_reason == OS_REASON_NULL) { + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) | DBG_FUNC_NONE, + sig_proc->p_pid, OS_REASON_SIGNAL, signum, 0, 0); + + signal_reason = build_signal_reason(signum, NULL); + } + + os_reason_ref(signal_reason); + set_thread_exit_reason(sig_thread, signal_reason, TRUE); + } + goto sigout_locked; } @@ -2231,9 +2586,19 @@ psignal_internal(proc_t p, task_t task, thread_t thread, int flavor, int signum) } sigout_locked: + if (update_thread_policy) { + /* + * Update the thread policy to heading to terminate, increase priority if + * necessary. This needs to be done before we drop the proc lock because the + * thread can take the fatal signal once it's dropped. 
+ */ + proc_set_thread_policy(sig_thread, TASK_POLICY_ATTRIBUTE, TASK_POLICY_TERMINATED, TASK_POLICY_ENABLE); + } + proc_unlock(sig_proc); sigout_unlocked: + os_reason_free(signal_reason); if ((flavor & PSIG_LOCKED)== 0) { proc_signalend(sig_proc, 0); } @@ -2242,32 +2607,51 @@ psignal_internal(proc_t p, task_t task, thread_t thread, int flavor, int signum) void psignal(proc_t p, int signum) { - psignal_internal(p, NULL, NULL, 0, signum); + psignal_internal(p, NULL, NULL, 0, signum, NULL); +} + +void +psignal_with_reason(proc_t p, int signum, struct os_reason *signal_reason) +{ + psignal_internal(p, NULL, NULL, 0, signum, signal_reason); } void psignal_locked(proc_t p, int signum) { - psignal_internal(p, NULL, NULL, PSIG_LOCKED, signum); + psignal_internal(p, NULL, NULL, PSIG_LOCKED, signum, NULL); } +void +psignal_vfork_with_reason(proc_t p, task_t new_task, thread_t thread, int signum, struct os_reason *signal_reason) +{ + psignal_internal(p, new_task, thread, PSIG_VFORK, signum, signal_reason); +} + + void psignal_vfork(proc_t p, task_t new_task, thread_t thread, int signum) { - psignal_internal(p, new_task, thread, PSIG_VFORK, signum); + psignal_internal(p, new_task, thread, PSIG_VFORK, signum, NULL); } -static void +void psignal_uthread(thread_t thread, int signum) { - psignal_internal(PROC_NULL, TASK_NULL, thread, PSIG_THREAD, signum); + psignal_internal(PROC_NULL, TASK_NULL, thread, PSIG_THREAD, signum, NULL); } /* same as psignal(), but prefer delivery to 'thread' if possible */ -static void +void psignal_try_thread(proc_t p, thread_t thread, int signum) { - psignal_internal(p, NULL, thread, PSIG_TRY_THREAD, signum); + psignal_internal(p, NULL, thread, PSIG_TRY_THREAD, signum, NULL); +} + +void +psignal_try_thread_with_reason(proc_t p, thread_t thread, int signum, struct os_reason *signal_reason) +{ + psignal_internal(p, TASK_NULL, thread, PSIG_TRY_THREAD, signum, signal_reason); } /* @@ -2296,23 +2680,23 @@ issignal_locked(proc_t p) cur_act = 
current_thread(); #if SIGNAL_DEBUG - if(rdebug_proc && (p == rdebug_proc)) { - ram_printf(3); - } + if(rdebug_proc && (p == rdebug_proc)) { + ram_printf(3); + } #endif /* SIGNAL_DEBUG */ /* * Try to grab the signal lock. */ if (sig_try_locked(p) <= 0) { - return(0); + return 0; } proc_signalstart(p, 1); ut = get_bsdthread_info(cur_act); - for(;;) { - sigbits = ut->uu_siglist & ~ut->uu_sigmask; + for (;;) { + sigbits = ut->uu_siglist & ~ut->uu_sigmask; if (p->p_lflag & P_LPPWAIT) sigbits &= ~stopsigmask; @@ -2330,25 +2714,25 @@ issignal_locked(proc_t p) * only if P_LTRACED was on when they were posted. */ if (mask & p->p_sigignore && (p->p_lflag & P_LTRACED) == 0) { - ut->uu_siglist &= ~mask; /* take the signal! */ + ut->uu_siglist &= ~mask; continue; } + if (p->p_lflag & P_LTRACED && (p->p_lflag & P_LPPWAIT) == 0) { - task_t task; /* - * If traced, always stop, and stay - * stopped until released by the debugger. + * If traced, deliver the signal to the debugger, and wait to be + * released. */ - /* ptrace debugging */ + task_t task; p->p_xstat = signum; - + if (p->p_lflag & P_LSIGEXC) { p->sigwait = TRUE; p->sigwait_thread = cur_act; p->p_stat = SSTOP; OSBitAndAtomic(~((uint32_t)P_CONTINUED), &p->p_flag); p->p_lflag &= ~P_LWAITED; - ut->uu_siglist &= ~mask; /* clear the old signal */ + ut->uu_siglist &= ~mask; /* clear the current signal from the pending list */ proc_signalend(p, 1); proc_unlock(p); do_bsdexception(EXC_SOFTWARE, EXC_SOFT_SIGNAL, signum); @@ -2385,7 +2769,7 @@ issignal_locked(proc_t p) p->p_stat = SSTOP; OSBitAndAtomic(~((uint32_t)P_CONTINUED), &p->p_flag); p->p_lflag &= ~P_LWAITED; - ut->uu_siglist &= ~mask; /* clear the old signal */ + ut->uu_siglist &= ~mask; proc_signalend(p, 1); proc_unlock(p); @@ -2408,43 +2792,22 @@ issignal_locked(proc_t p) p->sigwait_thread = NULL; wakeup((caddr_t)&p->sigwait_thread); - /* - * This code is to detect when gdb is killed - * even as the traced program is attached. 
- * pgsignal would get the SIGKILL to traced program - * That's what we are trying to see (I hope) - */ - if (ut->uu_siglist & sigmask(SIGKILL)) { + if (signum == SIGKILL || ut->uu_siglist & sigmask(SIGKILL)) { /* - * Wait event may still be outstanding; - * clear it, since sig_lock_to_exit will - * wait. + * Deliver a pending sigkill even if it's not the current signal. + * Necessary for PT_KILL, which should not be delivered to the + * debugger, but we can't differentiate it from any other KILL. */ - clear_wait(current_thread(), THREAD_INTERRUPTED); - sig_lock_to_exit(p); - /* - * Since this thread will be resumed - * to allow the current syscall to - * be completed, must save u_qsave - * before calling exit(). (Since exit() - * calls closef() which can trash u_qsave.) - */ - proc_signalend(p, 1); - proc_unlock(p); - KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_FRCEXIT) | DBG_FUNC_NONE, - p->p_pid, W_EXITCODE(0, SIGKILL), 2, 0, 0); - exit1(p, W_EXITCODE(0, SIGKILL), (int *)NULL); - proc_lock(p); - return(0); + signum = SIGKILL; + goto deliver_sig; } - /* - * We may have to quit - */ + /* We may have to quit. */ if (thread_should_abort(current_thread())) { retval = 0; goto out; } + /* * If parent wants us to take the signal, * then it will leave it in p->p_xstat; @@ -2453,6 +2816,7 @@ issignal_locked(proc_t p) signum = p->p_xstat; if (signum == 0) continue; + /* * Put the new signal into p_siglist. If the * signal is being masked, look for other signals. 
@@ -2470,7 +2834,7 @@ issignal_locked(proc_t p) */ switch ((long)p->p_sigacts->ps_sigact[signum]) { - + case (long)SIG_DFL: /* * If there is a pending stop signal to process @@ -2489,13 +2853,12 @@ issignal_locked(proc_t p) prop & SA_TTYSTOP)) { proc_lock(p); pg_rele(pg); - break; /* == ignore */ + break; /* ignore signal */ } pg_rele(pg); if (p->p_stat != SSTOP) { proc_lock(p); p->p_xstat = signum; - p->p_stat = SSTOP; p->p_lflag &= ~P_LWAITED; proc_unlock(p); @@ -2526,16 +2889,11 @@ issignal_locked(proc_t p) * Except for SIGCONT, shouldn't get here. * Default action is to ignore; drop it. */ - break; /* == ignore */ + break; /* ignore signal */ } else { - ut->uu_siglist &= ~mask; /* take the signal! */ - retval = signum; - goto out; + goto deliver_sig; } - /*NOTREACHED*/ - break; - case (long)SIG_IGN: /* * Masking above should prevent us ever trying @@ -2545,23 +2903,27 @@ issignal_locked(proc_t p) if ((prop & SA_CONT) == 0 && (p->p_lflag & P_LTRACED) == 0) printf("issignal\n"); - break; /* == ignore */ + break; /* ignore signal */ default: - /* - * This signal has an action, let - * postsig() process it. - */ - ut->uu_siglist &= ~mask; /* take the signal! */ - retval = signum; - goto out; - } - ut->uu_siglist &= ~mask; /* take the signal! */ + /* This signal has an action - deliver it. */ + goto deliver_sig; } + + /* If we dropped through, the signal was ignored - remove it from pending list. 
*/ + ut->uu_siglist &= ~mask; + + } /* for(;;) */ + /* NOTREACHED */ + +deliver_sig: + ut->uu_siglist &= ~mask; + retval = signum; + out: proc_signalend(p, 1); - return(retval); + return retval; } /* called from _sleep */ @@ -2738,8 +3100,10 @@ postsig_locked(int signum) p->p_sigacts->ps_sig = signum; proc_signalend(p, 1); proc_unlock(p); +#if CONFIG_COREDUMP if (coredump(p, 0, 0) == 0) signum |= WCOREFLAG; +#endif } else { proc_signalend(p, 1); proc_unlock(p); @@ -2769,7 +3133,14 @@ postsig_locked(int signum) KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_FRCEXIT) | DBG_FUNC_NONE, p->p_pid, W_EXITCODE(0, signum), 3, 0, 0); - exit1(p, W_EXITCODE(0, signum), (int *)NULL); + + /* + * exit_with_reason() will consume a reference to the thread's exit reason, so we take another + * reference for the thread. This reference will be destroyed in uthread_cleanup(). + */ + os_reason_ref(ut->uu_exit_reason); + exit_with_reason(p, W_EXITCODE(0, signum), (int *)NULL, TRUE, TRUE, 0, ut->uu_exit_reason); + proc_lock(p); return; } else { @@ -2836,12 +3207,12 @@ filt_sigattach(struct knote *kn) proc_klist_lock(); kn->kn_ptr.p_proc = p; - kn->kn_flags |= EV_CLEAR; /* automatically set */ KNOTE_ATTACH(&p->p_klist, kn); proc_klist_unlock(); + /* edge-triggered events can't have fired before we attached */ return (0); } @@ -2886,27 +3257,54 @@ filt_signal(struct knote *kn, long hint) return (kn->kn_data != 0); } -static void -filt_signaltouch(struct knote *kn, struct kevent_internal_s *kev, long type) +static int +filt_signaltouch( + struct knote *kn, + struct kevent_internal_s *kev) { +#pragma unused(kev) + + int res; + proc_klist_lock(); - switch (type) { - case EVENT_REGISTER: - kn->kn_sfflags = kev->fflags; - kn->kn_sdata = kev->data; - break; - case EVENT_PROCESS: - *kev = kn->kn_kevent; - if (kn->kn_flags & EV_CLEAR) { - kn->kn_data = 0; - kn->kn_fflags = 0; - } - break; - default: - panic("filt_signaltouch() - invalid type (%ld)", type); - break; + + if ((kn->kn_status 
& KN_UDATA_SPECIFIC) == 0) + kn->kn_udata = kev->udata; + /* + * No data to save - + * just capture if it is already fired + */ + res = (kn->kn_data > 0); + + proc_klist_unlock(); + + return res; +} + +static int +filt_signalprocess( + struct knote *kn, + __unused struct filt_process_s *data, + struct kevent_internal_s *kev) +{ + proc_klist_lock(); + + if (kn->kn_data == 0) { + proc_klist_unlock(); + return 0; } + + /* + * Snapshot the event data. + * All signal events are EV_CLEAR, so + * add that and clear out the data field. + */ + *kev = kn->kn_kevent; + kev->flags |= EV_CLEAR; + kn->kn_data = 0; + proc_klist_unlock(); + return 1; } void diff --git a/bsd/kern/kern_symfile.c b/bsd/kern/kern_symfile.c index 9cd79fca4..edf864abf 100644 --- a/bsd/kern/kern_symfile.c +++ b/bsd/kern/kern_symfile.c @@ -139,7 +139,7 @@ kern_ioctl_file_extents(struct kern_direct_file_io_ref_t * ref, u_long theIoctl, (void) do_ioctl(p1, p2, _DKIOCCSUNPINEXTENT, (caddr_t)&pin); } - while (offset < end) + for (; offset < end; offset += filechunk) { if (ref->vp->v_type == VREG) { @@ -149,7 +149,8 @@ kern_ioctl_file_extents(struct kern_direct_file_io_ref_t * ref, u_long theIoctl, filechunk = (size_t)(end - offset); error = VNOP_BLOCKMAP(ref->vp, offset, filechunk, &blkno, &filechunk, NULL, VNODE_WRITE, NULL); - if (error) break; + if (error) break; + if (-1LL == blkno) continue; fileblk = blkno * ref->blksize; } else if ((ref->vp->v_type == VBLK) || (ref->vp->v_type == VCHR)) @@ -192,7 +193,6 @@ kern_ioctl_file_extents(struct kern_direct_file_io_ref_t * ref, u_long theIoctl, else error = EINVAL; if (error) break; - offset += filechunk; } return (error); } @@ -279,6 +279,7 @@ kern_open_file_for_direct_io(const char * name, VATTR_INIT(&va); VATTR_WANTED(&va, va_rdev); VATTR_WANTED(&va, va_fsid); + VATTR_WANTED(&va, va_devid); VATTR_WANTED(&va, va_data_size); VATTR_WANTED(&va, va_data_alloc); VATTR_WANTED(&va, va_nlink); @@ -295,7 +296,7 @@ kern_open_file_for_direct_io(const char * name, /* 
Don't dump files with links. */ if (va.va_nlink != 1) goto out; - device = va.va_fsid; + device = (VATTR_IS_SUPPORTED(&va, va_devid)) ? va.va_devid : va.va_fsid; ref->filelength = va.va_data_size; p1 = &device; @@ -367,7 +368,7 @@ kern_open_file_for_direct_io(const char * name, locked = TRUE; f_offset = 0; - while (f_offset < ref->filelength) + for (; f_offset < ref->filelength; f_offset += filechunk) { if (ref->vp->v_type == VREG) { @@ -377,7 +378,7 @@ kern_open_file_for_direct_io(const char * name, error = VNOP_BLOCKMAP(ref->vp, f_offset, filechunk, &blkno, &filechunk, NULL, VNODE_WRITE, NULL); if (error) goto out; - + if (-1LL == blkno) continue; fileblk = blkno * ref->blksize; } else if ((ref->vp->v_type == VBLK) || (ref->vp->v_type == VCHR)) @@ -416,7 +417,6 @@ kern_open_file_for_direct_io(const char * name, #endif physoffset += getphysreq.length; } - f_offset += filechunk; } callback(callback_ref, 0ULL, 0ULL); diff --git a/bsd/kern/kern_synch.c b/bsd/kern/kern_synch.c index b1c4eda1c..5f3b8546b 100644 --- a/bsd/kern/kern_synch.c +++ b/bsd/kern/kern_synch.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Computer, Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -54,14 +54,14 @@ #include #include #include +#include #include /* for unix_syscall_return() */ #include extern void compute_averunnable(void *); /* XXX */ - - +__attribute__((noreturn)) static void _sleep_continue( __unused void *parameter, wait_result_t wresult) { @@ -265,6 +265,7 @@ _sleep( error = EWOULDBLOCK; break; case THREAD_AWAKENED: + case THREAD_RESTART: /* * Posix implies any signal should be delivered * first, regardless of whether awakened due diff --git a/bsd/kern/kern_sysctl.c b/bsd/kern/kern_sysctl.c index c3ecad7d2..a99b1d43e 100644 --- a/bsd/kern/kern_sysctl.c +++ b/bsd/kern/kern_sysctl.c @@ -133,6 +133,7 @@ #include #include +#include #include #include @@ -280,8 +281,10 @@ STATIC int sysctl_imgsrcdev(struct sysctl_oid *oidp, void *arg1, int arg2, struc #endif STATIC int sysctl_usrstack(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); STATIC int sysctl_usrstack64(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +#if CONFIG_COREDUMP STATIC int sysctl_coredump(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); STATIC int sysctl_suid_coredump(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +#endif STATIC int sysctl_delayterm(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); STATIC int sysctl_rage_vnode(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); STATIC int sysctl_kern_check_openevt(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); @@ -318,12 +321,14 @@ fill_loadavg32(struct loadavg *la, struct user32_loadavg *la32) la32->fscale = (user32_long_t)la->fscale; } +#if CONFIG_COREDUMP /* * Attributes stored in the kernel. 
*/ extern char corefilename[MAXPATHLEN+1]; extern int do_coredump; extern int sugid_coredump; +#endif #if COUNT_SYSCALLS extern int do_count_syscalls; @@ -382,11 +387,16 @@ sysctl_handle_kern_threadname( __unused struct sysctl_oid *oidp, __unused void * ut->pth_name = (char*)kalloc( MAXTHREADNAMESIZE ); if(!ut->pth_name) return ENOMEM; + } else { + kernel_debug_string_simple(TRACE_STRING_THREADNAME_PREV, ut->pth_name); } bzero(ut->pth_name, MAXTHREADNAMESIZE); error = copyin(newp, ut->pth_name, newlen); - if(error) + if (error) { return error; + } + + kernel_debug_string_simple(TRACE_STRING_THREADNAME, ut->pth_name); } return 0; @@ -473,7 +483,7 @@ extern int get_kernel_symfile(proc_t, char **); #if COUNT_SYSCALLS #define KERN_COUNT_SYSCALLS (KERN_OSTYPE + 1000) -extern int nsysent; +extern unsigned int nsysent; extern int syscalls_log[]; extern const char *syscallnames[]; @@ -1121,23 +1131,11 @@ sysctl_kdebug_ops SYSCTL_HANDLER_ARGS // user_addr_t newp = req->newptr; /* user buffer copy in address */ // size_t newlen = req->newlen; /* user buffer copy in size */ - proc_t p = current_proc(); int ret=0; if (namelen == 0) return(ENOTSUP); - - ret = suser(kauth_cred_get(), &p->p_acflag); -#if KPERF - /* Non-root processes may be blessed by kperf to access data - * logged into trace. 
- */ - if (ret) - ret = kperf_access_check(); -#endif /* KPERF */ - if (ret) - return(ret); - + switch(name[0]) { case KERN_KDEFLAGS: case KERN_KDDFLAGS: @@ -1150,24 +1148,20 @@ sysctl_kdebug_ops SYSCTL_HANDLER_ARGS case KERN_KDREADTR: case KERN_KDWRITETR: case KERN_KDWRITEMAP: + case KERN_KDTEST: case KERN_KDPIDTR: case KERN_KDTHRMAP: case KERN_KDPIDEX: - case KERN_KDSETRTCDEC: case KERN_KDSETBUF: case KERN_KDGETENTROPY: - case KERN_KDENABLE_BG_TRACE: - case KERN_KDDISABLE_BG_TRACE: case KERN_KDREADCURTHRMAP: case KERN_KDSET_TYPEFILTER: case KERN_KDBUFWAIT: case KERN_KDCPUMAP: - case KERN_KDWAIT_BG_TRACE_RESET: - case KERN_KDSET_BG_TYPEFILTER: case KERN_KDWRITEMAP_V3: case KERN_KDWRITETR_V3: - ret = kdbg_control(name, namelen, oldp, oldlenp); - break; + ret = kdbg_control(name, namelen, oldp, oldlenp); + break; default: ret= ENOTSUP; break; @@ -1858,10 +1852,6 @@ SYSCTL_INT(_kern, OID_AUTO, ignore_is_ssd, CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, &ignore_is_ssd, 0, ""); -SYSCTL_INT(_kern, OID_AUTO, root_is_CF_drive, - CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, - &root_is_CF_drive, 0, ""); - SYSCTL_UINT(_kern, OID_AUTO, preheat_max_bytes, CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, &preheat_max_bytes, 0, ""); @@ -1914,18 +1904,19 @@ STATIC int sysctl_boottime (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { - time_t tv_sec = boottime_sec(); + struct timeval tv; + boottime_timeval(&tv); struct proc *p = req->p; if (proc_is64bit(p)) { struct user64_timeval t; - t.tv_sec = tv_sec; - t.tv_usec = 0; + t.tv_sec = tv.tv_sec; + t.tv_usec = tv.tv_usec; return sysctl_io_opaque(req, &t, sizeof(t), NULL); } else { struct user32_timeval t; - t.tv_sec = tv_sec; - t.tv_usec = 0; + t.tv_sec = tv.tv_sec; + t.tv_usec = tv.tv_usec; return sysctl_io_opaque(req, &t, sizeof(t), NULL); } } @@ -2170,6 +2161,8 @@ SYSCTL_PROC(_kern, KERN_USRSTACK64, usrstack64, CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, sysctl_usrstack64, "Q", 
""); +#if CONFIG_COREDUMP + SYSCTL_STRING(_kern, KERN_COREFILE, corefile, CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, corefilename, sizeof(corefilename), ""); @@ -2222,6 +2215,8 @@ SYSCTL_PROC(_kern, KERN_SUGID_COREDUMP, sugid_coredump, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, sysctl_suid_coredump, "I", ""); +#endif /* CONFIG_COREDUMP */ + STATIC int sysctl_delayterm (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) @@ -2444,6 +2439,7 @@ sysctl_vm_toggle_address_reuse(__unused struct sysctl_oid *oidp, __unused void * SYSCTL_PROC(_debug, OID_AUTO, toggle_address_reuse, CTLFLAG_ANYBODY | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, sysctl_vm_toggle_address_reuse,"I",""); + STATIC int sysctl_swapusage (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) @@ -2489,8 +2485,8 @@ sysctl_freeze_enabled SYSCTL_HANDLER_ARGS error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr) return (error); - - if (COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) { + + if (VM_CONFIG_COMPRESSOR_IS_ACTIVE) { //assert(req->newptr); printf("Failed attempt to set vm.freeze_enabled sysctl\n"); return EINVAL; @@ -2697,6 +2693,8 @@ extern uint32_t vm_compressor_minorcompact_threshold_divisor; extern uint32_t vm_compressor_majorcompact_threshold_divisor; extern uint32_t vm_compressor_unthrottle_threshold_divisor; extern uint32_t vm_compressor_catchup_threshold_divisor; +extern uint32_t vm_compressor_time_thread; +extern uint64_t vm_compressor_thread_runtime; SYSCTL_QUAD(_vm, OID_AUTO, compressor_input_bytes, CTLFLAG_RD | CTLFLAG_LOCKED, &c_segment_input_bytes, ""); SYSCTL_QUAD(_vm, OID_AUTO, compressor_compressed_bytes, CTLFLAG_RD | CTLFLAG_LOCKED, &c_segment_compressed_bytes, ""); @@ -2721,6 +2719,41 @@ SYSCTL_INT(_vm, OID_AUTO, compressor_catchup_threshold_divisor, CTLFLAG_RW | CTL SYSCTL_STRING(_vm, OID_AUTO, swapfileprefix, CTLFLAG_RW | 
CTLFLAG_KERN | CTLFLAG_LOCKED, swapfilename, sizeof(swapfilename) - SWAPFILENAME_INDEX_LEN, ""); +SYSCTL_INT(_vm, OID_AUTO, compressor_timing_enabled, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_compressor_time_thread, 0, ""); +SYSCTL_QUAD(_vm, OID_AUTO, compressor_thread_runtime, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_compressor_thread_runtime, ""); + +SYSCTL_QUAD(_vm, OID_AUTO, lz4_compressions, CTLFLAG_RD | CTLFLAG_LOCKED, &compressor_stats.lz4_compressions, ""); +SYSCTL_QUAD(_vm, OID_AUTO, lz4_compression_failures, CTLFLAG_RD | CTLFLAG_LOCKED, &compressor_stats.lz4_compression_failures, ""); +SYSCTL_QUAD(_vm, OID_AUTO, lz4_compressed_bytes, CTLFLAG_RD | CTLFLAG_LOCKED, &compressor_stats.lz4_compressed_bytes, ""); +SYSCTL_QUAD(_vm, OID_AUTO, lz4_wk_compression_delta, CTLFLAG_RD | CTLFLAG_LOCKED, &compressor_stats.lz4_wk_compression_delta, ""); +SYSCTL_QUAD(_vm, OID_AUTO, lz4_wk_compression_negative_delta, CTLFLAG_RD | CTLFLAG_LOCKED, &compressor_stats.lz4_wk_compression_negative_delta, ""); + +SYSCTL_QUAD(_vm, OID_AUTO, lz4_decompressions, CTLFLAG_RD | CTLFLAG_LOCKED, &compressor_stats.lz4_decompressions, ""); +SYSCTL_QUAD(_vm, OID_AUTO, lz4_decompressed_bytes, CTLFLAG_RD | CTLFLAG_LOCKED, &compressor_stats.lz4_decompressed_bytes, ""); + +SYSCTL_QUAD(_vm, OID_AUTO, uc_decompressions, CTLFLAG_RD | CTLFLAG_LOCKED, &compressor_stats.uc_decompressions, ""); + +SYSCTL_QUAD(_vm, OID_AUTO, wk_compressions, CTLFLAG_RD | CTLFLAG_LOCKED, &compressor_stats.wk_compressions, ""); +SYSCTL_QUAD(_vm, OID_AUTO, wk_compressions_exclusive, CTLFLAG_RD | CTLFLAG_LOCKED, &compressor_stats.wk_compressions_exclusive, ""); +SYSCTL_QUAD(_vm, OID_AUTO, wk_sv_compressions, CTLFLAG_RD | CTLFLAG_LOCKED, &compressor_stats.wk_sv_compressions, ""); +SYSCTL_QUAD(_vm, OID_AUTO, wk_mzv_compressions, CTLFLAG_RD | CTLFLAG_LOCKED, &compressor_stats.wk_mzv_compressions, ""); +SYSCTL_QUAD(_vm, OID_AUTO, wk_compression_failures, CTLFLAG_RD | CTLFLAG_LOCKED, &compressor_stats.wk_compression_failures, ""); 
+SYSCTL_QUAD(_vm, OID_AUTO, wk_compressed_bytes_exclusive, CTLFLAG_RD | CTLFLAG_LOCKED, &compressor_stats.wk_compressed_bytes_exclusive, ""); +SYSCTL_QUAD(_vm, OID_AUTO, wk_compressed_bytes_total, CTLFLAG_RD | CTLFLAG_LOCKED, &compressor_stats.wk_compressed_bytes_total, ""); + +SYSCTL_QUAD(_vm, OID_AUTO, wk_decompressions, CTLFLAG_RD | CTLFLAG_LOCKED, &compressor_stats.wk_decompressions, ""); +SYSCTL_QUAD(_vm, OID_AUTO, wk_decompressed_bytes, CTLFLAG_RD | CTLFLAG_LOCKED, &compressor_stats.wk_decompressed_bytes, ""); +SYSCTL_QUAD(_vm, OID_AUTO, wk_sv_decompressions, CTLFLAG_RD | CTLFLAG_LOCKED, &compressor_stats.wk_sv_decompressions, ""); + +SYSCTL_INT(_vm, OID_AUTO, lz4_threshold, CTLFLAG_RW | CTLFLAG_LOCKED, &vmctune.lz4_threshold, 0, ""); +SYSCTL_INT(_vm, OID_AUTO, wkdm_reeval_threshold, CTLFLAG_RW | CTLFLAG_LOCKED, &vmctune.wkdm_reeval_threshold, 0, ""); +SYSCTL_INT(_vm, OID_AUTO, lz4_max_failure_skips, CTLFLAG_RW | CTLFLAG_LOCKED, &vmctune.lz4_max_failure_skips, 0, ""); +SYSCTL_INT(_vm, OID_AUTO, lz4_max_failure_run_length, CTLFLAG_RW | CTLFLAG_LOCKED, &vmctune.lz4_max_failure_run_length, 0, ""); +SYSCTL_INT(_vm, OID_AUTO, lz4_max_preselects, CTLFLAG_RW | CTLFLAG_LOCKED, &vmctune.lz4_max_preselects, 0, ""); +SYSCTL_INT(_vm, OID_AUTO, lz4_run_preselection_threshold, CTLFLAG_RW | CTLFLAG_LOCKED, &vmctune.lz4_run_preselection_threshold, 0, ""); +SYSCTL_INT(_vm, OID_AUTO, lz4_run_continue_bytes, CTLFLAG_RW | CTLFLAG_LOCKED, &vmctune.lz4_run_continue_bytes, 0, ""); +SYSCTL_INT(_vm, OID_AUTO, lz4_profitable_bytes, CTLFLAG_RW | CTLFLAG_LOCKED, &vmctune.lz4_profitable_bytes, 0, ""); + #if CONFIG_PHANTOM_CACHE extern uint32_t phantom_cache_thrashing_threshold; extern uint32_t phantom_cache_eval_period_in_msecs; @@ -2732,6 +2765,37 @@ SYSCTL_INT(_vm, OID_AUTO, phantom_cache_thrashing_threshold, CTLFLAG_RW | CTLFLA SYSCTL_INT(_vm, OID_AUTO, phantom_cache_thrashing_threshold_ssd, CTLFLAG_RW | CTLFLAG_LOCKED, &phantom_cache_thrashing_threshold_ssd, 0, ""); #endif +#if 
CONFIG_BACKGROUND_QUEUE + +extern uint32_t vm_page_background_count; +extern uint32_t vm_page_background_limit; +extern uint32_t vm_page_background_target; +extern uint32_t vm_page_background_internal_count; +extern uint32_t vm_page_background_external_count; +extern uint32_t vm_page_background_mode; +extern uint32_t vm_page_background_exclude_external; +extern uint64_t vm_page_background_promoted_count; +extern uint64_t vm_pageout_considered_bq_internal; +extern uint64_t vm_pageout_considered_bq_external; +extern uint64_t vm_pageout_rejected_bq_internal; +extern uint64_t vm_pageout_rejected_bq_external; + +SYSCTL_INT(_vm, OID_AUTO, vm_page_background_mode, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_page_background_mode, 0, ""); +SYSCTL_INT(_vm, OID_AUTO, vm_page_background_exclude_external, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_page_background_exclude_external, 0, ""); +SYSCTL_INT(_vm, OID_AUTO, vm_page_background_limit, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_page_background_limit, 0, ""); +SYSCTL_INT(_vm, OID_AUTO, vm_page_background_target, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_page_background_target, 0, ""); +SYSCTL_INT(_vm, OID_AUTO, vm_page_background_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_background_count, 0, ""); +SYSCTL_INT(_vm, OID_AUTO, vm_page_background_internal_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_background_internal_count, 0, ""); +SYSCTL_INT(_vm, OID_AUTO, vm_page_background_external_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_background_external_count, 0, ""); + +SYSCTL_QUAD(_vm, OID_AUTO, vm_page_background_promoted_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_background_promoted_count, ""); +SYSCTL_QUAD(_vm, OID_AUTO, vm_pageout_considered_bq_internal, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_considered_bq_internal, ""); +SYSCTL_QUAD(_vm, OID_AUTO, vm_pageout_considered_bq_external, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_considered_bq_external, ""); +SYSCTL_QUAD(_vm, OID_AUTO, vm_pageout_rejected_bq_internal, CTLFLAG_RD | CTLFLAG_LOCKED, 
&vm_pageout_rejected_bq_internal, ""); +SYSCTL_QUAD(_vm, OID_AUTO, vm_pageout_rejected_bq_external, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_rejected_bq_external, ""); + +#endif + #if (DEVELOPMENT || DEBUG) SYSCTL_UINT(_vm, OID_AUTO, vm_page_creation_throttled_hard, @@ -2742,6 +2806,48 @@ SYSCTL_UINT(_vm, OID_AUTO, vm_page_creation_throttled_soft, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &vm_page_creation_throttled_soft, 0, ""); +extern uint32_t vm_pageout_memorystatus_fb_factor_nr; +extern uint32_t vm_pageout_memorystatus_fb_factor_dr; +SYSCTL_INT(_vm, OID_AUTO, vm_pageout_memorystatus_fb_factor_nr, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_memorystatus_fb_factor_nr, 0, ""); +SYSCTL_INT(_vm, OID_AUTO, vm_pageout_memorystatus_fb_factor_dr, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_memorystatus_fb_factor_dr, 0, ""); + +extern uint32_t vm_grab_anon_overrides; +extern uint32_t vm_grab_anon_nops; + +SYSCTL_INT(_vm, OID_AUTO, vm_grab_anon_overrides, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_grab_anon_overrides, 0, ""); +SYSCTL_INT(_vm, OID_AUTO, vm_grab_anon_nops, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_grab_anon_nops, 0, ""); + +/* log message counters for persistence mode */ +extern uint32_t oslog_p_total_msgcount; +extern uint32_t oslog_p_metadata_saved_msgcount; +extern uint32_t oslog_p_metadata_dropped_msgcount; +extern uint32_t oslog_p_error_count; +extern uint32_t oslog_p_saved_msgcount; +extern uint32_t oslog_p_dropped_msgcount; +extern uint32_t oslog_p_boot_dropped_msgcount; + +/* log message counters for streaming mode */ +extern uint32_t oslog_s_total_msgcount; +extern uint32_t oslog_s_metadata_msgcount; +extern uint32_t oslog_s_error_count; +extern uint32_t oslog_s_streamed_msgcount; +extern uint32_t oslog_s_dropped_msgcount; + +SYSCTL_UINT(_debug, OID_AUTO, oslog_p_total_msgcount, CTLFLAG_ANYBODY | CTLFLAG_RD | CTLFLAG_LOCKED, &oslog_p_total_msgcount, 0, ""); +SYSCTL_UINT(_debug, OID_AUTO, oslog_p_metadata_saved_msgcount, CTLFLAG_ANYBODY | CTLFLAG_RD | CTLFLAG_LOCKED, 
&oslog_p_metadata_saved_msgcount, 0, ""); +SYSCTL_UINT(_debug, OID_AUTO, oslog_p_metadata_dropped_msgcount, CTLFLAG_ANYBODY | CTLFLAG_RD | CTLFLAG_LOCKED, &oslog_p_metadata_dropped_msgcount, 0, ""); +SYSCTL_UINT(_debug, OID_AUTO, oslog_p_error_count, CTLFLAG_ANYBODY | CTLFLAG_RD | CTLFLAG_LOCKED, &oslog_p_error_count, 0, ""); +SYSCTL_UINT(_debug, OID_AUTO, oslog_p_saved_msgcount, CTLFLAG_ANYBODY | CTLFLAG_RD | CTLFLAG_LOCKED, &oslog_p_saved_msgcount, 0, ""); +SYSCTL_UINT(_debug, OID_AUTO, oslog_p_dropped_msgcount, CTLFLAG_ANYBODY | CTLFLAG_RD | CTLFLAG_LOCKED, &oslog_p_dropped_msgcount, 0, ""); +SYSCTL_UINT(_debug, OID_AUTO, oslog_p_boot_dropped_msgcount, CTLFLAG_ANYBODY | CTLFLAG_RD | CTLFLAG_LOCKED, &oslog_p_boot_dropped_msgcount, 0, ""); + +SYSCTL_UINT(_debug, OID_AUTO, oslog_s_total_msgcount, CTLFLAG_ANYBODY | CTLFLAG_RD | CTLFLAG_LOCKED, &oslog_s_total_msgcount, 0, ""); +SYSCTL_UINT(_debug, OID_AUTO, oslog_s_metadata_msgcount, CTLFLAG_ANYBODY | CTLFLAG_RD | CTLFLAG_LOCKED, &oslog_s_metadata_msgcount, 0, ""); +SYSCTL_UINT(_debug, OID_AUTO, oslog_s_error_count, CTLFLAG_ANYBODY | CTLFLAG_RD | CTLFLAG_LOCKED, &oslog_s_error_count, 0, ""); +SYSCTL_UINT(_debug, OID_AUTO, oslog_s_streamed_msgcount, CTLFLAG_ANYBODY | CTLFLAG_RD | CTLFLAG_LOCKED, &oslog_s_streamed_msgcount, 0, ""); +SYSCTL_UINT(_debug, OID_AUTO, oslog_s_dropped_msgcount, CTLFLAG_ANYBODY | CTLFLAG_RD | CTLFLAG_LOCKED, &oslog_s_dropped_msgcount, 0, ""); + + #endif /* DEVELOPMENT || DEBUG */ /* @@ -2965,3 +3071,65 @@ SYSCTL_INT(_kern, OID_AUTO, hv_support, #endif +/* + * This is set by core audio to tell tailspin (ie background tracing) how long + * its smallest buffer is. Background tracing can then try to make a reasonable + * decisions to try to avoid introducing so much latency that the buffers will + * underflow. 
+ */ + +int min_audio_buffer_usec; + +STATIC int +sysctl_audio_buffer SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + int err = 0, value = 0, changed = 0; + err = sysctl_io_number(req, min_audio_buffer_usec, sizeof(int), &value, &changed); + if (err) goto exit; + + if (changed) { + /* writing is protected by an entitlement */ + if (priv_check_cred(kauth_cred_get(), PRIV_AUDIO_LATENCY, 0) != 0) { + err = EPERM; + goto exit; + } + min_audio_buffer_usec = value; + } +exit: + return err; +} + +SYSCTL_PROC(_kern, OID_AUTO, min_audio_buffer_usec, CTLFLAG_RW | CTLFLAG_ANYBODY, 0, 0, sysctl_audio_buffer, "I", "Minimum audio buffer size, in microseconds"); + +#if DEVELOPMENT || DEBUG +#include +/* This should result in a fatal exception, verifying that "sysent" is + * write-protected. + */ +static int +kern_sysent_write(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { + uint64_t new_value = 0, old_value = 0; + int changed = 0, error; + + error = sysctl_io_number(req, old_value, sizeof(uint64_t), &new_value, &changed); + if ((error == 0) && changed) { + volatile uint32_t *wraddr = (uint32_t *) &sysent[0]; + *wraddr = 0; + printf("sysent[0] write succeeded\n"); + } + return error; +} + +SYSCTL_PROC(_kern, OID_AUTO, sysent_const_check, + CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, + 0, 0, + kern_sysent_write, "I", "Attempt sysent[0] write"); + +#endif + +#if DEVELOPMENT || DEBUG +SYSCTL_COMPAT_INT(_kern, OID_AUTO, development, CTLFLAG_RD | CTLFLAG_MASKED, NULL, 1, ""); +#else +SYSCTL_COMPAT_INT(_kern, OID_AUTO, development, CTLFLAG_RD | CTLFLAG_MASKED, NULL, 0, ""); +#endif diff --git a/bsd/kern/kern_time.c b/bsd/kern/kern_time.c index 5da44690a..af55a09ed 100644 --- a/bsd/kern/kern_time.c +++ b/bsd/kern/kern_time.c @@ -102,7 +102,7 @@ static void setthetime( void time_zone_slock_init(void); -/* +/* * Time of day and interval timer support. 
* * These routines provide the kernel entry points to get and set @@ -114,31 +114,51 @@ void time_zone_slock_init(void); /* ARGSUSED */ int gettimeofday( -__unused struct proc *p, - struct gettimeofday_args *uap, - int32_t *retval) + struct proc *p, + struct gettimeofday_args *uap, + __unused int32_t *retval) { int error = 0; struct timezone ltz; /* local copy */ + clock_sec_t secs; + clock_usec_t usecs; + uint64_t mach_time; - if (uap->tp) { - clock_sec_t secs; - clock_usec_t usecs; + if (uap->tp || uap->mach_absolute_time) { + clock_gettimeofday_and_absolute_time(&secs, &usecs, &mach_time); + } - clock_gettimeofday(&secs, &usecs); - retval[0] = secs; - retval[1] = usecs; + if (uap->tp) { + /* Casting secs through a uint32_t to match arm64 commpage */ + if (IS_64BIT_PROCESS(p)) { + struct user64_timeval user_atv = {}; + user_atv.tv_sec = (uint32_t)secs; + user_atv.tv_usec = usecs; + error = copyout(&user_atv, uap->tp, sizeof(user_atv)); + } else { + struct user32_timeval user_atv = {}; + user_atv.tv_sec = (uint32_t)secs; + user_atv.tv_usec = usecs; + error = copyout(&user_atv, uap->tp, sizeof(user_atv)); + } + if (error) { + return error; + } } - + if (uap->tzp) { lck_spin_lock(tz_slock); ltz = tz; lck_spin_unlock(tz_slock); - error = copyout((caddr_t)<z, CAST_USER_ADDR_T(uap->tzp), sizeof (tz)); + error = copyout((caddr_t)<z, CAST_USER_ADDR_T(uap->tzp), sizeof(tz)); + } + + if (error == 0 && uap->mach_absolute_time) { + error = copyout(&mach_time, uap->mach_absolute_time, sizeof(mach_time)); } - return (error); + return error; } /* @@ -292,6 +312,18 @@ boottime_sec(void) return (secs); } +void +boottime_timeval(struct timeval *tv) +{ + clock_sec_t secs; + clock_usec_t microsecs; + + clock_get_boottime_microtime(&secs, µsecs); + + tv->tv_sec = secs; + tv->tv_usec = microsecs; +} + /* * Get value of an interval timer. 
The process virtual and * profiling virtual time timers are kept internally in the @@ -481,24 +513,34 @@ setitimer(struct proc *p, struct setitimer_args *uap, int32_t *retval) */ void realitexpire( - struct proc *p) + struct proc *p) { struct proc *r; - struct timeval t; + struct timeval t; r = proc_find(p->p_pid); proc_spinlock(p); + assert(p->p_ractive > 0); + if (--p->p_ractive > 0 || r != p) { + /* + * bail, because either proc is exiting + * or there's another active thread call + */ proc_spinunlock(p); if (r != NULL) proc_rele(r); return; } - + if (!timerisset(&p->p_realtimer.it_interval)) { + /* + * p_realtimer was cleared while this call was pending, + * send one last SIGALRM, but don't re-arm + */ timerclear(&p->p_rtime); proc_spinunlock(p); @@ -507,8 +549,29 @@ realitexpire( return; } + proc_spinunlock(p); + + /* + * Send the signal before re-arming the next thread call, + * so in case psignal blocks, we won't create yet another thread call. + */ + + psignal(p, SIGALRM); + + proc_spinlock(p); + + /* Should we still re-arm the next thread call? 
*/ + if (!timerisset(&p->p_realtimer.it_interval)) { + timerclear(&p->p_rtime); + proc_spinunlock(p); + + proc_rele(p); + return; + } + microuptime(&t); timevaladd(&p->p_rtime, &p->p_realtimer.it_interval); + if (timercmp(&p->p_rtime, &t, <=)) { if ((p->p_rtime.tv_sec + 2) >= t.tv_sec) { for (;;) { @@ -516,21 +579,62 @@ realitexpire( if (timercmp(&p->p_rtime, &t, >)) break; } - } - else { + } else { p->p_rtime = p->p_realtimer.it_interval; timevaladd(&p->p_rtime, &t); } } - if (!thread_call_enter_delayed(p->p_rcall, tvtoabstime(&p->p_rtime))) + assert(p->p_rcall != NULL); + + if (!thread_call_enter_delayed_with_leeway(p->p_rcall, NULL, tvtoabstime(&p->p_rtime), 0, + THREAD_CALL_DELAY_USER_NORMAL)) { p->p_ractive++; + } + proc_spinunlock(p); - psignal(p, SIGALRM); proc_rele(p); } +/* + * Called once in proc_exit to clean up after an armed or pending realitexpire + * + * This will only be called after the proc refcount is drained, + * so realitexpire cannot be currently holding a proc ref. + * i.e. it will/has gotten PROC_NULL from proc_find. + */ +void +proc_free_realitimer(proc_t p) +{ + proc_spinlock(p); + + assert(p->p_rcall != NULL); + assert(p->p_refcount == 0); + + timerclear(&p->p_realtimer.it_interval); + + if (thread_call_cancel(p->p_rcall)) { + assert(p->p_ractive > 0); + p->p_ractive--; + } + + while (p->p_ractive > 0) { + proc_spinunlock(p); + + delay(1); + + proc_spinlock(p); + } + + thread_call_t call = p->p_rcall; + p->p_rcall = NULL; + + proc_spinunlock(p); + + thread_call_free(call); +} + /* * Check that a proposed value to load into the .it_value or * .it_interval part of an interval timer is acceptable. 
diff --git a/bsd/kern/kern_xxx.c b/bsd/kern/kern_xxx.c index 0fe1cfa30..a4e96162e 100644 --- a/bsd/kern/kern_xxx.c +++ b/bsd/kern/kern_xxx.c @@ -89,7 +89,7 @@ int pshm_cache_purge_all(proc_t p); int psem_cache_purge_all(proc_t p); int -reboot(struct proc *p, register struct reboot_args *uap, __unused int32_t *retval) +reboot(struct proc *p, struct reboot_args *uap, __unused int32_t *retval) { char message[128]; int error=0; @@ -132,7 +132,7 @@ reboot(struct proc *p, register struct reboot_args *uap, __unused int32_t *retva } int -usrctl(struct proc *p, __unused register struct usrctl_args *uap, __unused int32_t *retval) +usrctl(struct proc *p, __unused struct usrctl_args *uap, __unused int32_t *retval) { if (p != initproc) { return EPERM; diff --git a/bsd/kern/kpi_mbuf.c b/bsd/kern/kpi_mbuf.c index c40ca8189..587394e49 100644 --- a/bsd/kern/kpi_mbuf.c +++ b/bsd/kern/kpi_mbuf.c @@ -2,7 +2,7 @@ * Copyright (c) 2004-2015 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,12 +22,11 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. 
- * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -#define __KPI__ -//#include +#define __KPI__ #include #include @@ -37,6 +36,7 @@ #include #include #include +#include #include #include @@ -50,80 +50,170 @@ static const mbuf_flags_t mbuf_flags_mask = (MBUF_EXT | MBUF_PKTHDR | MBUF_EOR | /* Unalterable mbuf flags */ static const mbuf_flags_t mbuf_cflags_mask = (MBUF_EXT); -void* mbuf_data(mbuf_t mbuf) +#define MAX_MBUF_TX_COMPL_FUNC 32 +mbuf_tx_compl_func +mbuf_tx_compl_table[MAX_MBUF_TX_COMPL_FUNC]; +extern lck_rw_t *mbuf_tx_compl_tbl_lock; +u_int32_t mbuf_tx_compl_index = 0; + +#if (DEVELOPMENT || DEBUG) +int mbuf_tx_compl_debug = 0; +SInt64 mbuf_tx_compl_outstanding __attribute__((aligned(8))) = 0; +u_int64_t mbuf_tx_compl_aborted __attribute__((aligned(8))) = 0; + +SYSCTL_DECL(_kern_ipc); +SYSCTL_NODE(_kern_ipc, OID_AUTO, mbtxcf, + CTLFLAG_RW | CTLFLAG_LOCKED, 0, ""); +SYSCTL_INT(_kern_ipc_mbtxcf, OID_AUTO, debug, + CTLFLAG_RW | CTLFLAG_LOCKED, &mbuf_tx_compl_debug, 0, ""); +SYSCTL_INT(_kern_ipc_mbtxcf, OID_AUTO, index, + CTLFLAG_RD | CTLFLAG_LOCKED, &mbuf_tx_compl_index, 0, ""); +SYSCTL_QUAD(_kern_ipc_mbtxcf, OID_AUTO, oustanding, + CTLFLAG_RD | CTLFLAG_LOCKED, &mbuf_tx_compl_outstanding, ""); +SYSCTL_QUAD(_kern_ipc_mbtxcf, OID_AUTO, aborted, + CTLFLAG_RD | CTLFLAG_LOCKED, &mbuf_tx_compl_aborted, ""); +#endif /* (DEBUG || DEVELOPMENT) */ + +void * +mbuf_data(mbuf_t mbuf) { - return mbuf->m_data; + return (mbuf->m_data); } -void* mbuf_datastart(mbuf_t mbuf) +void * +mbuf_datastart(mbuf_t mbuf) { if (mbuf->m_flags & M_EXT) - return mbuf->m_ext.ext_buf; + return (mbuf->m_ext.ext_buf); if (mbuf->m_flags & M_PKTHDR) - return mbuf->m_pktdat; - return mbuf->m_dat; + return (mbuf->m_pktdat); + return (mbuf->m_dat); } -errno_t mbuf_setdata(mbuf_t mbuf, void* data, size_t len) +errno_t +mbuf_setdata(mbuf_t mbuf, void *data, size_t len) { - size_t start = (size_t)((char*)mbuf_datastart(mbuf)); + size_t start = (size_t)((char *)mbuf_datastart(mbuf)); size_t maxlen = 
mbuf_maxlen(mbuf); - + if ((size_t)data < start || ((size_t)data) + len > start + maxlen) - return EINVAL; + return (EINVAL); mbuf->m_data = data; mbuf->m_len = len; - - return 0; + + return (0); } -errno_t mbuf_align_32(mbuf_t mbuf, size_t len) +errno_t +mbuf_align_32(mbuf_t mbuf, size_t len) { if ((mbuf->m_flags & M_EXT) != 0 && m_mclhasreference(mbuf)) - return ENOTSUP; + return (ENOTSUP); mbuf->m_data = mbuf_datastart(mbuf); - mbuf->m_data += ((mbuf_trailingspace(mbuf) - len) &~ (sizeof(u_int32_t) - 1)); - - return 0; + mbuf->m_data += + ((mbuf_trailingspace(mbuf) - len) &~ (sizeof(u_int32_t) - 1)); + + return (0); } -/* This function is used to provide mcl_to_paddr via symbol indirection, - * please avoid any change in behavior or remove the indirection in +/* + * This function is used to provide mcl_to_paddr via symbol indirection, + * please avoid any change in behavior or remove the indirection in * config/Unsupported* */ -addr64_t mbuf_data_to_physical(void* ptr) +addr64_t +mbuf_data_to_physical(void *ptr) { return ((addr64_t)mcl_to_paddr(ptr)); } -errno_t mbuf_get(mbuf_how_t how, mbuf_type_t type, mbuf_t *mbuf) +errno_t +mbuf_get(mbuf_how_t how, mbuf_type_t type, mbuf_t *mbuf) { /* Must set *mbuf to NULL in failure case */ *mbuf = m_get(how, type); - - return (*mbuf == NULL) ? ENOMEM : 0; + + return (*mbuf == NULL ? ENOMEM : 0); } -errno_t mbuf_gethdr(mbuf_how_t how, mbuf_type_t type, mbuf_t *mbuf) +errno_t +mbuf_gethdr(mbuf_how_t how, mbuf_type_t type, mbuf_t *mbuf) { /* Must set *mbuf to NULL in failure case */ *mbuf = m_gethdr(how, type); - - return (*mbuf == NULL) ? ENOMEM : 0; + + return (*mbuf == NULL ? 
ENOMEM : 0); } errno_t mbuf_attachcluster(mbuf_how_t how, mbuf_type_t type, mbuf_t *mbuf, - caddr_t extbuf, void (*extfree)(caddr_t , u_int, caddr_t), + caddr_t extbuf, void (*extfree)(caddr_t, u_int, caddr_t), size_t extsize, caddr_t extarg) { if (mbuf == NULL || extbuf == NULL || extfree == NULL || extsize == 0) return (EINVAL); if ((*mbuf = m_clattach(*mbuf, type, extbuf, - extfree, extsize, extarg, how)) == NULL) + extfree, extsize, extarg, how, 0)) == NULL) + return (ENOMEM); + + return (0); +} + +errno_t +mbuf_ring_cluster_alloc(mbuf_how_t how, mbuf_type_t type, mbuf_t *mbuf, + void (*extfree)(caddr_t, u_int, caddr_t), size_t *size) +{ + caddr_t extbuf = NULL; + errno_t err; + + if (mbuf == NULL || extfree == NULL || size == NULL || *size == 0) + return (EINVAL); + + if ((err = mbuf_alloccluster(how, size, &extbuf)) != 0) + return (err); + + if ((*mbuf = m_clattach(*mbuf, type, extbuf, + extfree, *size, NULL, how, 1)) == NULL) { + mbuf_freecluster(extbuf, *size); return (ENOMEM); + } + + return (0); +} + +int +mbuf_ring_cluster_is_active(mbuf_t mbuf) +{ + return (m_ext_paired_is_active(mbuf)); +} + +errno_t +mbuf_ring_cluster_activate(mbuf_t mbuf) +{ + if (mbuf_ring_cluster_is_active(mbuf)) + return (EBUSY); + + m_ext_paired_activate(mbuf); + return (0); +} + +errno_t +mbuf_cluster_set_prop(mbuf_t mbuf, u_int32_t oldprop, u_int32_t newprop) +{ + if (mbuf == NULL || !(mbuf->m_flags & M_EXT)) + return (EINVAL); + + return (m_ext_set_prop(mbuf, oldprop, newprop) ? 
0 : EBUSY); +} + +errno_t +mbuf_cluster_get_prop(mbuf_t mbuf, u_int32_t *prop) +{ + if (mbuf == NULL || prop == NULL || !(mbuf->m_flags & M_EXT)) + return (EINVAL); + *prop = m_ext_get_prop(mbuf); return (0); } @@ -174,18 +264,18 @@ mbuf_freecluster(caddr_t addr, size_t size) } errno_t -mbuf_getcluster(mbuf_how_t how, mbuf_type_t type, size_t size, mbuf_t* mbuf) +mbuf_getcluster(mbuf_how_t how, mbuf_type_t type, size_t size, mbuf_t *mbuf) { /* Must set *mbuf to NULL in failure case */ errno_t error = 0; int created = 0; if (mbuf == NULL) - return EINVAL; + return (EINVAL); if (*mbuf == NULL) { *mbuf = m_get(how, type); if (*mbuf == NULL) - return ENOMEM; + return (ENOMEM); created = 1; } /* @@ -215,167 +305,186 @@ mbuf_getcluster(mbuf_how_t how, mbuf_type_t type, size_t size, mbuf_t* mbuf) mbuf_free(*mbuf); *mbuf = NULL; } - return error; + return (error); } -errno_t mbuf_mclget(mbuf_how_t how, mbuf_type_t type, mbuf_t *mbuf) +errno_t +mbuf_mclget(mbuf_how_t how, mbuf_type_t type, mbuf_t *mbuf) { /* Must set *mbuf to NULL in failure case */ errno_t error = 0; int created = 0; - if (mbuf == NULL) return EINVAL; + if (mbuf == NULL) + return (EINVAL); if (*mbuf == NULL) { error = mbuf_get(how, type, mbuf); if (error) - return error; + return (error); created = 1; } - + /* * At the time this code was written, m_mclget would always * return the same value that was passed in to it. 
*/ *mbuf = m_mclget(*mbuf, how); - + if (created && ((*mbuf)->m_flags & M_EXT) == 0) { mbuf_free(*mbuf); *mbuf = NULL; } if (*mbuf == NULL || ((*mbuf)->m_flags & M_EXT) == 0) error = ENOMEM; - return error; + return (error); } -errno_t mbuf_getpacket(mbuf_how_t how, mbuf_t *mbuf) +errno_t +mbuf_getpacket(mbuf_how_t how, mbuf_t *mbuf) { /* Must set *mbuf to NULL in failure case */ errno_t error = 0; - + *mbuf = m_getpacket_how(how); - + if (*mbuf == NULL) { if (how == MBUF_WAITOK) error = ENOMEM; else error = EWOULDBLOCK; } - - return error; + + return (error); } -/* This function is used to provide m_free via symbol indirection, please avoid +/* + * This function is used to provide m_free via symbol indirection, please avoid * any change in behavior or remove the indirection in config/Unsupported* */ -mbuf_t mbuf_free(mbuf_t mbuf) +mbuf_t +mbuf_free(mbuf_t mbuf) { - return m_free(mbuf); + return (m_free(mbuf)); } -/* This function is used to provide m_freem via symbol indirection, please avoid +/* + * This function is used to provide m_freem via symbol indirection, please avoid * any change in behavior or remove the indirection in config/Unsupported* */ -void mbuf_freem(mbuf_t mbuf) +void +mbuf_freem(mbuf_t mbuf) { m_freem(mbuf); } -int mbuf_freem_list(mbuf_t mbuf) +int +mbuf_freem_list(mbuf_t mbuf) { - return m_freem_list(mbuf); + return (m_freem_list(mbuf)); } -size_t mbuf_leadingspace(const mbuf_t mbuf) +size_t +mbuf_leadingspace(const mbuf_t mbuf) { - return m_leadingspace(mbuf); + return (m_leadingspace(mbuf)); } -/* This function is used to provide m_trailingspace via symbol indirection, - * please avoid any change in behavior or remove the indirection in +/* + * This function is used to provide m_trailingspace via symbol indirection, + * please avoid any change in behavior or remove the indirection in * config/Unsupported* */ -size_t mbuf_trailingspace(const mbuf_t mbuf) +size_t +mbuf_trailingspace(const mbuf_t mbuf) { - return m_trailingspace(mbuf); + 
return (m_trailingspace(mbuf)); } /* Manipulation */ -errno_t mbuf_copym(const mbuf_t src, size_t offset, size_t len, - mbuf_how_t how, mbuf_t *new_mbuf) +errno_t +mbuf_copym(const mbuf_t src, size_t offset, size_t len, + mbuf_how_t how, mbuf_t *new_mbuf) { /* Must set *mbuf to NULL in failure case */ *new_mbuf = m_copym(src, offset, len, how); - - return (*new_mbuf == NULL) ? ENOMEM : 0; + + return (*new_mbuf == NULL ? ENOMEM : 0); } -errno_t mbuf_dup(const mbuf_t src, mbuf_how_t how, mbuf_t *new_mbuf) +errno_t +mbuf_dup(const mbuf_t src, mbuf_how_t how, mbuf_t *new_mbuf) { /* Must set *new_mbuf to NULL in failure case */ *new_mbuf = m_dup(src, how); - - return (*new_mbuf == NULL) ? ENOMEM : 0; + + return (*new_mbuf == NULL ? ENOMEM : 0); } -errno_t mbuf_prepend(mbuf_t *orig, size_t len, mbuf_how_t how) +errno_t +mbuf_prepend(mbuf_t *orig, size_t len, mbuf_how_t how) { /* Must set *orig to NULL in failure case */ *orig = m_prepend_2(*orig, len, how, 0); - - return (*orig == NULL) ? ENOMEM : 0; + + return (*orig == NULL ? ENOMEM : 0); } -errno_t mbuf_split(mbuf_t src, size_t offset, +errno_t +mbuf_split(mbuf_t src, size_t offset, mbuf_how_t how, mbuf_t *new_mbuf) { /* Must set *new_mbuf to NULL in failure case */ *new_mbuf = m_split(src, offset, how); - - return (*new_mbuf == NULL) ? ENOMEM : 0; + + return (*new_mbuf == NULL ? ENOMEM : 0); } -errno_t mbuf_pullup(mbuf_t *mbuf, size_t len) +errno_t +mbuf_pullup(mbuf_t *mbuf, size_t len) { /* Must set *mbuf to NULL in failure case */ *mbuf = m_pullup(*mbuf, len); - - return (*mbuf == NULL) ? ENOMEM : 0; + + return (*mbuf == NULL ? ENOMEM : 0); } -errno_t mbuf_pulldown(mbuf_t src, size_t *offset, size_t len, mbuf_t *location) +errno_t +mbuf_pulldown(mbuf_t src, size_t *offset, size_t len, mbuf_t *location) { /* Must set *location to NULL in failure case */ int new_offset; *location = m_pulldown(src, *offset, len, &new_offset); *offset = new_offset; - - return (*location == NULL) ? 
ENOMEM : 0; + + return (*location == NULL ? ENOMEM : 0); } -/* This function is used to provide m_adj via symbol indirection, please avoid +/* + * This function is used to provide m_adj via symbol indirection, please avoid * any change in behavior or remove the indirection in config/Unsupported* */ -void mbuf_adj(mbuf_t mbuf, int len) +void +mbuf_adj(mbuf_t mbuf, int len) { m_adj(mbuf, len); } -errno_t mbuf_adjustlen(mbuf_t m, int amount) +errno_t +mbuf_adjustlen(mbuf_t m, int amount) { /* Verify m_len will be valid after adding amount */ if (amount > 0) { - int used = (size_t)mbuf_data(m) - (size_t)mbuf_datastart(m) + - m->m_len; - + int used = (size_t)mbuf_data(m) - (size_t)mbuf_datastart(m) + + m->m_len; + if ((size_t)(amount + used) > mbuf_maxlen(m)) - return EINVAL; - } - else if (-amount > m->m_len) { - return EINVAL; + return (EINVAL); + } else if (-amount > m->m_len) { + return (EINVAL); } - + m->m_len += amount; - return 0; + return (0); } mbuf_t @@ -389,7 +498,8 @@ mbuf_concatenate(mbuf_t dst, mbuf_t src) /* return dst as is in the current implementation */ return (dst); } -errno_t mbuf_copydata(const mbuf_t m0, size_t off, size_t len, void* out_data) +errno_t +mbuf_copydata(const mbuf_t m0, size_t off, size_t len, void *out_data) { /* Copied m_copydata, added error handling (don't just panic) */ int count; @@ -397,7 +507,7 @@ errno_t mbuf_copydata(const mbuf_t m0, size_t off, size_t len, void* out_data) while (off > 0) { if (m == 0) - return EINVAL; + return (EINVAL); if (off < (size_t)m->m_len) break; off -= m->m_len; @@ -405,81 +515,93 @@ errno_t mbuf_copydata(const mbuf_t m0, size_t off, size_t len, void* out_data) } while (len > 0) { if (m == 0) - return EINVAL; + return (EINVAL); count = m->m_len - off > len ? 
len : m->m_len - off; bcopy(mtod(m, caddr_t) + off, out_data, count); len -= count; - out_data = ((char*)out_data) + count; + out_data = ((char *)out_data) + count; off = 0; m = m->m_next; } - - return 0; + + return (0); } -int mbuf_mclhasreference(mbuf_t mbuf) +int +mbuf_mclhasreference(mbuf_t mbuf) { if ((mbuf->m_flags & M_EXT)) - return m_mclhasreference(mbuf); + return (m_mclhasreference(mbuf)); else - return 0; + return (0); } /* mbuf header */ -mbuf_t mbuf_next(const mbuf_t mbuf) +mbuf_t +mbuf_next(const mbuf_t mbuf) { - return mbuf->m_next; + return (mbuf->m_next); } -errno_t mbuf_setnext(mbuf_t mbuf, mbuf_t next) +errno_t +mbuf_setnext(mbuf_t mbuf, mbuf_t next) { if (next && ((next)->m_nextpkt != NULL || - (next)->m_type == MT_FREE)) return EINVAL; + (next)->m_type == MT_FREE)) + return (EINVAL); mbuf->m_next = next; - - return 0; + + return (0); } -mbuf_t mbuf_nextpkt(const mbuf_t mbuf) +mbuf_t +mbuf_nextpkt(const mbuf_t mbuf) { - return mbuf->m_nextpkt; + return (mbuf->m_nextpkt); } -void mbuf_setnextpkt(mbuf_t mbuf, mbuf_t nextpkt) +void +mbuf_setnextpkt(mbuf_t mbuf, mbuf_t nextpkt) { mbuf->m_nextpkt = nextpkt; } -size_t mbuf_len(const mbuf_t mbuf) +size_t +mbuf_len(const mbuf_t mbuf) { - return mbuf->m_len; + return (mbuf->m_len); } -void mbuf_setlen(mbuf_t mbuf, size_t len) +void +mbuf_setlen(mbuf_t mbuf, size_t len) { mbuf->m_len = len; } -size_t mbuf_maxlen(const mbuf_t mbuf) +size_t +mbuf_maxlen(const mbuf_t mbuf) { if (mbuf->m_flags & M_EXT) - return mbuf->m_ext.ext_size; - return &mbuf->m_dat[MLEN] - ((char*)mbuf_datastart(mbuf)); + return (mbuf->m_ext.ext_size); + return (&mbuf->m_dat[MLEN] - ((char *)mbuf_datastart(mbuf))); } -mbuf_type_t mbuf_type(const mbuf_t mbuf) +mbuf_type_t +mbuf_type(const mbuf_t mbuf) { - return mbuf->m_type; + return (mbuf->m_type); } -errno_t mbuf_settype(mbuf_t mbuf, mbuf_type_t new_type) +errno_t +mbuf_settype(mbuf_t mbuf, mbuf_type_t new_type) { - if (new_type == MBUF_TYPE_FREE) return EINVAL; - + if (new_type == 
MBUF_TYPE_FREE) + return (EINVAL); + m_mchtype(mbuf, new_type); - - return 0; + + return (0); } mbuf_flags_t @@ -499,9 +621,9 @@ mbuf_setflags(mbuf_t mbuf, mbuf_flags_t flags) * in flags argument. * 2. Return error if bits other than public flags are set in passed * flags argument. - * Please note that private flag bits must be passed as reset by kexts, - * as they must use mbuf_flags KPI to get current set of mbuf flags - * and mbuf_flags KPI does not expose private flags. + * Please note that private flag bits must be passed as reset by + * kexts, as they must use mbuf_flags KPI to get current set of + * mbuf flags and mbuf_flags KPI does not expose private flags. */ if ((flags ^ oflags) & mbuf_cflags_mask) { ret = EINVAL; @@ -530,7 +652,7 @@ mbuf_setflags_mask(mbuf_t mbuf, mbuf_flags_t flags, mbuf_flags_t mask) errno_t ret = 0; if (mask & (~mbuf_flags_mask | mbuf_cflags_mask)) { - ret = EINVAL; + ret = EINVAL; } else { mbuf_flags_t oflags = mbuf->m_flags; mbuf->m_flags = (flags & mask) | (mbuf->m_flags & ~mask); @@ -549,22 +671,25 @@ mbuf_setflags_mask(mbuf_t mbuf, mbuf_flags_t flags, mbuf_flags_t mask) return (ret); } -errno_t mbuf_copy_pkthdr(mbuf_t dest, const mbuf_t src) +errno_t +mbuf_copy_pkthdr(mbuf_t dest, const mbuf_t src) { if (((src)->m_flags & M_PKTHDR) == 0) - return EINVAL; - + return (EINVAL); + m_copy_pkthdr(dest, src); - - return 0; + + return (0); } -size_t mbuf_pkthdr_len(const mbuf_t mbuf) +size_t +mbuf_pkthdr_len(const mbuf_t mbuf) { - return mbuf->m_pkthdr.len; + return (mbuf->m_pkthdr.len); } -__private_extern__ size_t mbuf_pkthdr_maxlen(mbuf_t m) +__private_extern__ size_t +mbuf_pkthdr_maxlen(mbuf_t m) { size_t maxlen = 0; mbuf_t n = m; @@ -576,35 +701,44 @@ __private_extern__ size_t mbuf_pkthdr_maxlen(mbuf_t m) return (maxlen); } -void mbuf_pkthdr_setlen(mbuf_t mbuf, size_t len) +void +mbuf_pkthdr_setlen(mbuf_t mbuf, size_t len) { mbuf->m_pkthdr.len = len; } -void mbuf_pkthdr_adjustlen(mbuf_t mbuf, int amount) +void 
+mbuf_pkthdr_adjustlen(mbuf_t mbuf, int amount) { mbuf->m_pkthdr.len += amount; } -ifnet_t mbuf_pkthdr_rcvif(const mbuf_t mbuf) +ifnet_t +mbuf_pkthdr_rcvif(const mbuf_t mbuf) { - // If we reference count ifnets, we should take a reference here before returning - return mbuf->m_pkthdr.rcvif; + /* + * If we reference count ifnets, we should take a reference here + * before returning + */ + return (mbuf->m_pkthdr.rcvif); } -errno_t mbuf_pkthdr_setrcvif(mbuf_t mbuf, ifnet_t ifnet) +errno_t +mbuf_pkthdr_setrcvif(mbuf_t mbuf, ifnet_t ifnet) { /* May want to walk ifnet list to determine if interface is valid */ - mbuf->m_pkthdr.rcvif = (struct ifnet*)ifnet; - return 0; + mbuf->m_pkthdr.rcvif = (struct ifnet *)ifnet; + return (0); } -void* mbuf_pkthdr_header(const mbuf_t mbuf) +void* +mbuf_pkthdr_header(const mbuf_t mbuf) { - return mbuf->m_pkthdr.pkt_hdr; + return (mbuf->m_pkthdr.pkt_hdr); } -void mbuf_pkthdr_setheader(mbuf_t mbuf, void *header) +void +mbuf_pkthdr_setheader(mbuf_t mbuf, void *header) { mbuf->m_pkthdr.pkt_hdr = (void*)header; } @@ -648,8 +782,8 @@ mbuf_set_vlan_tag( { mbuf->m_pkthdr.csum_flags |= CSUM_VLAN_TAG_VALID; mbuf->m_pkthdr.vlan_tag = vlan; - - return 0; + + return (0); } errno_t @@ -658,11 +792,11 @@ mbuf_get_vlan_tag( u_int16_t *vlan) { if ((mbuf->m_pkthdr.csum_flags & CSUM_VLAN_TAG_VALID) == 0) - return ENXIO; // No vlan tag set - + return (ENXIO); // No vlan tag set + *vlan = mbuf->m_pkthdr.vlan_tag; - - return 0; + + return (0); } errno_t @@ -671,13 +805,13 @@ mbuf_clear_vlan_tag( { mbuf->m_pkthdr.csum_flags &= ~CSUM_VLAN_TAG_VALID; mbuf->m_pkthdr.vlan_tag = 0; - - return 0; + + return (0); } -static const mbuf_csum_request_flags_t mbuf_valid_csum_request_flags = - MBUF_CSUM_REQ_IP | MBUF_CSUM_REQ_TCP | MBUF_CSUM_REQ_UDP | - MBUF_CSUM_PARTIAL | MBUF_CSUM_REQ_TCPIPV6 | MBUF_CSUM_REQ_UDPIPV6; +static const mbuf_csum_request_flags_t mbuf_valid_csum_request_flags = + MBUF_CSUM_REQ_IP | MBUF_CSUM_REQ_TCP | MBUF_CSUM_REQ_UDP | + MBUF_CSUM_PARTIAL | 
MBUF_CSUM_REQ_TCPIPV6 | MBUF_CSUM_REQ_UDPIPV6; errno_t mbuf_set_csum_requested( @@ -686,13 +820,14 @@ mbuf_set_csum_requested( u_int32_t value) { request &= mbuf_valid_csum_request_flags; - mbuf->m_pkthdr.csum_flags = (mbuf->m_pkthdr.csum_flags & 0xffff0000) | request; + mbuf->m_pkthdr.csum_flags = + (mbuf->m_pkthdr.csum_flags & 0xffff0000) | request; mbuf->m_pkthdr.csum_data = value; - - return 0; + + return (0); } -static const mbuf_tso_request_flags_t mbuf_valid_tso_request_flags = +static const mbuf_tso_request_flags_t mbuf_valid_tso_request_flags = MBUF_TSO_IPV4 | MBUF_TSO_IPV6; errno_t @@ -703,14 +838,14 @@ mbuf_get_tso_requested( { if (mbuf == NULL || (mbuf->m_flags & M_PKTHDR) == 0 || request == NULL || value == NULL) - return EINVAL; + return (EINVAL); *request = mbuf->m_pkthdr.csum_flags; *request &= mbuf_valid_tso_request_flags; - if (*request && value != NULL) + if (*request && value != NULL) *value = mbuf->m_pkthdr.tso_segsz; - - return 0; + + return (0); } errno_t @@ -724,8 +859,8 @@ mbuf_get_csum_requested( if (value != NULL) { *value = mbuf->m_pkthdr.csum_data; } - - return 0; + + return (0); } errno_t @@ -734,11 +869,11 @@ mbuf_clear_csum_requested( { mbuf->m_pkthdr.csum_flags &= 0xffff0000; mbuf->m_pkthdr.csum_data = 0; - - return 0; + + return (0); } -static const mbuf_csum_performed_flags_t mbuf_valid_csum_performed_flags = +static const mbuf_csum_performed_flags_t mbuf_valid_csum_performed_flags = MBUF_CSUM_DID_IP | MBUF_CSUM_IP_GOOD | MBUF_CSUM_DID_DATA | MBUF_CSUM_PSEUDO_HDR | MBUF_CSUM_PARTIAL; @@ -749,10 +884,11 @@ mbuf_set_csum_performed( u_int32_t value) { performed &= mbuf_valid_csum_performed_flags; - mbuf->m_pkthdr.csum_flags = (mbuf->m_pkthdr.csum_flags & 0xffff0000) | performed; + mbuf->m_pkthdr.csum_flags = + (mbuf->m_pkthdr.csum_flags & 0xffff0000) | performed; mbuf->m_pkthdr.csum_data = value; - - return 0; + + return (0); } errno_t @@ -761,10 +897,11 @@ mbuf_get_csum_performed( mbuf_csum_performed_flags_t *performed, u_int32_t 
*value) { - *performed = mbuf->m_pkthdr.csum_flags & mbuf_valid_csum_performed_flags; + *performed = + mbuf->m_pkthdr.csum_flags & mbuf_valid_csum_performed_flags; *value = mbuf->m_pkthdr.csum_data; - - return 0; + + return (0); } errno_t @@ -773,8 +910,8 @@ mbuf_clear_csum_performed( { mbuf->m_pkthdr.csum_flags &= 0xffff0000; mbuf->m_pkthdr.csum_data = 0; - - return 0; + + return (0); } errno_t @@ -782,7 +919,7 @@ mbuf_inet_cksum(mbuf_t mbuf, int protocol, u_int32_t offset, u_int32_t length, u_int16_t *csum) { if (mbuf == NULL || length == 0 || csum == NULL || - (u_int32_t)mbuf->m_pkthdr.len < (offset + length)) + (u_int32_t)mbuf->m_pkthdr.len < (offset + length)) return (EINVAL); *csum = inet_cksum(mbuf, protocol, offset, length); @@ -795,7 +932,7 @@ mbuf_inet6_cksum(mbuf_t mbuf, int protocol, u_int32_t offset, u_int32_t length, u_int16_t *csum) { if (mbuf == NULL || length == 0 || csum == NULL || - (u_int32_t)mbuf->m_pkthdr.len < (offset + length)) + (u_int32_t)mbuf->m_pkthdr.len < (offset + length)) return (EINVAL); *csum = inet6_cksum(mbuf, protocol, offset, length); @@ -845,14 +982,14 @@ nd6_storelladdr(void) * Mbuf tag KPIs */ -#define MTAG_FIRST_ID FIRST_KPI_STR_ID +#define MTAG_FIRST_ID FIRST_KPI_STR_ID errno_t mbuf_tag_id_find( const char *string, mbuf_tag_id_t *out_id) { - return net_str_id_find_internal(string, out_id, NSI_MBUF_TAG, 1); + return (net_str_id_find_internal(string, out_id, NSI_MBUF_TAG, 1)); } errno_t @@ -866,71 +1003,74 @@ mbuf_tag_allocate( { struct m_tag *tag; u_int32_t mtag_id_first, mtag_id_last; - + if (data_p != NULL) *data_p = NULL; - + /* Sanity check parameters */ - (void) net_str_id_first_last(&mtag_id_first, &mtag_id_last, NSI_MBUF_TAG); - if (mbuf == NULL || (mbuf->m_flags & M_PKTHDR) == 0 || id < mtag_id_first || - id > mtag_id_last || length < 1 || (length & 0xffff0000) != 0 || - data_p == NULL) { - return EINVAL; + (void) net_str_id_first_last(&mtag_id_first, &mtag_id_last, + NSI_MBUF_TAG); + if (mbuf == NULL || 
(mbuf->m_flags & M_PKTHDR) == 0 || + id < mtag_id_first || id > mtag_id_last || length < 1 || + (length & 0xffff0000) != 0 || data_p == NULL) { + return (EINVAL); } - + /* Make sure this mtag hasn't already been allocated */ tag = m_tag_locate(mbuf, id, type, NULL); if (tag != NULL) { - return EEXIST; + return (EEXIST); } - + /* Allocate an mtag */ tag = m_tag_create(id, type, length, how, mbuf); if (tag == NULL) { - return how == M_WAITOK ? ENOMEM : EWOULDBLOCK; + return (how == M_WAITOK ? ENOMEM : EWOULDBLOCK); } - + /* Attach the mtag and set *data_p */ m_tag_prepend(mbuf, tag); *data_p = tag + 1; - - return 0; + + return (0); } errno_t mbuf_tag_find( - mbuf_t mbuf, - mbuf_tag_id_t id, - mbuf_tag_type_t type, - size_t* length, - void** data_p) + mbuf_t mbuf, + mbuf_tag_id_t id, + mbuf_tag_type_t type, + size_t *length, + void **data_p) { struct m_tag *tag; u_int32_t mtag_id_first, mtag_id_last; - + if (length != NULL) *length = 0; if (data_p != NULL) *data_p = NULL; - + /* Sanity check parameters */ - (void) net_str_id_first_last(&mtag_id_first, &mtag_id_last, NSI_MBUF_TAG); - if (mbuf == NULL || (mbuf->m_flags & M_PKTHDR) == 0 || id < mtag_id_first || - id > mtag_id_last || length == NULL || data_p == NULL) { - return EINVAL; + (void) net_str_id_first_last(&mtag_id_first, &mtag_id_last, + NSI_MBUF_TAG); + if (mbuf == NULL || (mbuf->m_flags & M_PKTHDR) == 0 || + id < mtag_id_first || id > mtag_id_last || length == NULL || + data_p == NULL) { + return (EINVAL); } - + /* Locate an mtag */ tag = m_tag_locate(mbuf, id, type, NULL); if (tag == NULL) { - return ENOENT; + return (ENOENT); } - + /* Copy out the pointer to the data and the lenght value */ *length = tag->m_tag_len; *data_p = tag + 1; - - return 0; + + return (0); } void @@ -941,20 +1081,20 @@ mbuf_tag_free( { struct m_tag *tag; u_int32_t mtag_id_first, mtag_id_last; - + /* Sanity check parameters */ - (void) net_str_id_first_last(&mtag_id_first, &mtag_id_last, NSI_MBUF_TAG); - if (mbuf == NULL || 
(mbuf->m_flags & M_PKTHDR) == 0 || id < mtag_id_first || - id > mtag_id_last) + (void) net_str_id_first_last(&mtag_id_first, &mtag_id_last, + NSI_MBUF_TAG); + if (mbuf == NULL || (mbuf->m_flags & M_PKTHDR) == 0 || + id < mtag_id_first || id > mtag_id_last) return; - + tag = m_tag_locate(mbuf, id, type, NULL); if (tag == NULL) { return; } - + m_tag_delete(mbuf, tag); - return; } /* @@ -1053,7 +1193,8 @@ mbuf_del_drvaux(mbuf_t mbuf) } /* mbuf stats */ -void mbuf_stats(struct mbuf_stat *stats) +void +mbuf_stats(struct mbuf_stat *stats) { stats->mbufs = mbstat.m_mbufs; stats->clusters = mbstat.m_clusters; @@ -1075,7 +1216,8 @@ void mbuf_stats(struct mbuf_stat *stats) } errno_t -mbuf_allocpacket(mbuf_how_t how, size_t packetlen, unsigned int *maxchunks, mbuf_t *mbuf) +mbuf_allocpacket(mbuf_how_t how, size_t packetlen, unsigned int *maxchunks, + mbuf_t *mbuf) { errno_t error; struct mbuf *m; @@ -1086,7 +1228,8 @@ mbuf_allocpacket(mbuf_how_t how, size_t packetlen, unsigned int *maxchunks, mbuf error = EINVAL; goto out; } - m = m_allocpacket_internal(&numpkts, packetlen, maxchunks ? &numchunks : NULL, how, 1, 0); + m = m_allocpacket_internal(&numpkts, packetlen, + maxchunks ? &numchunks : NULL, how, 1, 0); if (m == 0) { if (maxchunks && *maxchunks && numchunks > *maxchunks) error = ENOBUFS; @@ -1099,11 +1242,12 @@ mbuf_allocpacket(mbuf_how_t how, size_t packetlen, unsigned int *maxchunks, mbuf *mbuf = m; } out: - return error; + return (error); } errno_t -mbuf_allocpacket_list(unsigned int numpkts, mbuf_how_t how, size_t packetlen, unsigned int *maxchunks, mbuf_t *mbuf) +mbuf_allocpacket_list(unsigned int numpkts, mbuf_how_t how, size_t packetlen, + unsigned int *maxchunks, mbuf_t *mbuf) { errno_t error; struct mbuf *m; @@ -1117,7 +1261,8 @@ mbuf_allocpacket_list(unsigned int numpkts, mbuf_how_t how, size_t packetlen, un error = EINVAL; goto out; } - m = m_allocpacket_internal(&numpkts, packetlen, maxchunks ? 
&numchunks : NULL, how, 1, 0); + m = m_allocpacket_internal(&numpkts, packetlen, + maxchunks ? &numchunks : NULL, how, 1, 0); if (m == 0) { if (maxchunks && *maxchunks && numchunks > *maxchunks) error = ENOBUFS; @@ -1130,7 +1275,7 @@ mbuf_allocpacket_list(unsigned int numpkts, mbuf_how_t how, size_t packetlen, un *mbuf = m; } out: - return error; + return (error); } __private_extern__ size_t @@ -1182,8 +1327,8 @@ mbuf_copyback( const char *cp = data; if (m == NULL || len == 0 || data == NULL) - return EINVAL; - + return (EINVAL); + while (off > (mlen = m->m_len)) { off -= mlen; totlen += mlen; @@ -1198,15 +1343,16 @@ mbuf_copyback( } m = m->m_next; } - + while (len > 0) { mlen = MIN(m->m_len - off, len); - if (mlen < len && m->m_next == NULL && mbuf_trailingspace(m) > 0) { + if (mlen < len && m->m_next == NULL && + mbuf_trailingspace(m) > 0) { size_t grow = MIN(mbuf_trailingspace(m), len - mlen); mlen += grow; m->m_len += grow; } - bcopy(cp, off + (char*)mbuf_data(m), (unsigned)mlen); + bcopy(cp, off + (char *)mbuf_data(m), (unsigned)mlen); cp += mlen; len -= mlen; mlen += off; @@ -1221,7 +1367,10 @@ mbuf_copyback( goto out; } if (len > MINCLSIZE) { - /* cluter allocation failure is okay, we can grow chain */ + /* + * cluster allocation failure is okay, + * we can grow chain + */ mbuf_mclget(how, m->m_type, &n); } n->m_len = MIN(mbuf_maxlen(n), len); @@ -1229,12 +1378,12 @@ mbuf_copyback( } m = m->m_next; } - + out: if ((m_start->m_flags & M_PKTHDR) && (m_start->m_pkthdr.len < totlen)) m_start->m_pkthdr.len = totlen; - - return result; + + return (result); } u_int32_t @@ -1379,6 +1528,357 @@ mbuf_get_unsent_data_bytes(const mbuf_t m, u_int32_t *unsent_data) if (!(m->m_pkthdr.pkt_flags & PKTF_VALID_UNSENT_DATA)) return (EINVAL); - *unsent_data = m->m_pkthdr.pkt_unsent_databytes; + *unsent_data = m->m_pkthdr.bufstatus_if + + m->m_pkthdr.bufstatus_sndbuf; return (0); } + +errno_t +mbuf_get_buffer_status(const mbuf_t m, mbuf_buffer_status_t *buf_status) +{ + if (m == 
NULL || buf_status == NULL || !(m->m_flags & M_PKTHDR) || + !(m->m_pkthdr.pkt_flags & PKTF_VALID_UNSENT_DATA)) + return (EINVAL); + + buf_status->buf_interface = m->m_pkthdr.bufstatus_if; + buf_status->buf_sndbuf = m->m_pkthdr.bufstatus_sndbuf; + return (0); +} + +errno_t +mbuf_pkt_new_flow(const mbuf_t m, u_int32_t *retval) +{ + if (m == NULL || retval == NULL || !(m->m_flags & M_PKTHDR)) + return (EINVAL); + if (m->m_pkthdr.pkt_flags & PKTF_NEW_FLOW) + *retval = 1; + else + *retval = 0; + return (0); +} + +errno_t +mbuf_last_pkt(const mbuf_t m, u_int32_t *retval) +{ + if (m == NULL || retval == NULL || !(m->m_flags & M_PKTHDR)) + return (EINVAL); + if (m->m_pkthdr.pkt_flags & PKTF_LAST_PKT) + *retval = 1; + else + *retval = 0; + return (0); +} + +errno_t +mbuf_get_timestamp(mbuf_t m, u_int64_t *ts, boolean_t *valid) +{ + if (m == NULL || !(m->m_flags & M_PKTHDR) || ts == NULL || + valid == NULL) + return (EINVAL); + + if ((m->m_pkthdr.pkt_flags & PKTF_DRV_TS_VALID) == 0) { + *valid = FALSE; + *ts = 0; + } else { + *valid = TRUE; + *ts = m->m_pkthdr.pkt_timestamp; + } + return (0); +} + +errno_t +mbuf_set_timestamp(mbuf_t m, u_int64_t ts, boolean_t valid) +{ + if (m == NULL || !(m->m_flags & M_PKTHDR)) + return (EINVAL); + + if (valid == FALSE) { + m->m_pkthdr.pkt_flags &= ~PKTF_DRV_TS_VALID; + m->m_pkthdr.pkt_timestamp = 0; + } else { + m->m_pkthdr.pkt_flags |= PKTF_DRV_TS_VALID; + m->m_pkthdr.pkt_timestamp = ts; + } + return (0); +} + +errno_t +mbuf_get_status(mbuf_t m, kern_return_t *status) +{ + if (m == NULL || !(m->m_flags & M_PKTHDR) || status == NULL) + return (EINVAL); + + if ((m->m_pkthdr.pkt_flags & PKTF_DRIVER_MTAG) == 0) { + *status = 0; + } else { + *status = m->m_pkthdr.drv_tx_status; + } + return (0); +} + +static void +driver_mtag_init(mbuf_t m) +{ + if ((m->m_pkthdr.pkt_flags & PKTF_DRIVER_MTAG) == 0) { + m->m_pkthdr.pkt_flags |= PKTF_DRIVER_MTAG; + bzero(&m->m_pkthdr.driver_mtag, + sizeof(m->m_pkthdr.driver_mtag)); + } +} + +errno_t 
+mbuf_set_status(mbuf_t m, kern_return_t status) +{ + if (m == NULL || !(m->m_flags & M_PKTHDR)) + return (EINVAL); + + driver_mtag_init(m); + + m->m_pkthdr.drv_tx_status = status; + + return (0); +} + +errno_t +mbuf_get_flowid(mbuf_t m, u_int16_t *flowid) +{ + if (m == NULL || !(m->m_flags & M_PKTHDR) || flowid == NULL) + return (EINVAL); + + if ((m->m_pkthdr.pkt_flags & PKTF_DRIVER_MTAG) == 0) { + *flowid = 0; + } else { + *flowid = m->m_pkthdr.drv_flowid; + } + return (0); +} + +errno_t +mbuf_set_flowid(mbuf_t m, u_int16_t flowid) +{ + if (m == NULL || !(m->m_flags & M_PKTHDR)) + return (EINVAL); + + driver_mtag_init(m); + + m->m_pkthdr.drv_flowid = flowid; + + return (0); +} + +errno_t +mbuf_get_tx_compl_data(mbuf_t m, uintptr_t *arg, uintptr_t *data) +{ + if (m == NULL || !(m->m_flags & M_PKTHDR) || arg == NULL || + data == NULL) + return (EINVAL); + + if ((m->m_pkthdr.pkt_flags & PKTF_DRIVER_MTAG) == 0) { + *arg = 0; + *data = 0; + } else { + *arg = m->m_pkthdr.drv_tx_compl_arg; + *data = m->m_pkthdr.drv_tx_compl_data; + } + return (0); +} + +errno_t +mbuf_set_tx_compl_data(mbuf_t m, uintptr_t arg, uintptr_t data) +{ + if (m == NULL || !(m->m_flags & M_PKTHDR)) + return (EINVAL); + + driver_mtag_init(m); + + m->m_pkthdr.drv_tx_compl_arg = arg; + m->m_pkthdr.drv_tx_compl_data = data; + + return (0); +} + +static u_int32_t +get_tx_compl_callback_index_locked(mbuf_tx_compl_func callback) +{ + u_int32_t i; + + for (i = 0; i < MAX_MBUF_TX_COMPL_FUNC; i++) { + if (mbuf_tx_compl_table[i] == callback) { + return (i); + } + } + return (UINT32_MAX); +} + +static u_int32_t +get_tx_compl_callback_index(mbuf_tx_compl_func callback) +{ + u_int32_t i; + + lck_rw_lock_shared(mbuf_tx_compl_tbl_lock); + + i = get_tx_compl_callback_index_locked(callback); + + lck_rw_unlock_shared(mbuf_tx_compl_tbl_lock); + + return (i); +} + +errno_t +mbuf_register_tx_compl_callback(mbuf_tx_compl_func callback) +{ + int i; + errno_t error; + + if (callback == NULL) + return (EINVAL); + + 
lck_rw_lock_exclusive(mbuf_tx_compl_tbl_lock); + + i = get_tx_compl_callback_index_locked(callback); + if (i != -1) { + error = EEXIST; + goto unlock; + } + + /* assume the worst */ + error = ENOSPC; + for (i = 0; i < MAX_MBUF_TX_COMPL_FUNC; i++) { + if (mbuf_tx_compl_table[i] == NULL) { + mbuf_tx_compl_table[i] = callback; + error = 0; + goto unlock; + } + } +unlock: + lck_rw_unlock_exclusive(mbuf_tx_compl_tbl_lock); + + return (error); +} + +errno_t +mbuf_unregister_tx_compl_callback(mbuf_tx_compl_func callback) +{ + int i; + errno_t error; + + if (callback == NULL) + return (EINVAL); + + lck_rw_lock_exclusive(mbuf_tx_compl_tbl_lock); + + /* assume the worst */ + error = ENOENT; + for (i = 0; i < MAX_MBUF_TX_COMPL_FUNC; i++) { + if (mbuf_tx_compl_table[i] == callback) { + mbuf_tx_compl_table[i] = NULL; + error = 0; + goto unlock; + } + } +unlock: + lck_rw_unlock_exclusive(mbuf_tx_compl_tbl_lock); + + return (error); +} + +errno_t +mbuf_get_timestamp_requested(mbuf_t m, boolean_t *requested) +{ + if (m == NULL || !(m->m_flags & M_PKTHDR)) + return (EINVAL); + + if ((m->m_pkthdr.pkt_flags & PKTF_TX_COMPL_TS_REQ) == 0) { + *requested = FALSE; + } else { + *requested = TRUE; + } + return (0); +} + +errno_t +mbuf_set_timestamp_requested(mbuf_t m, uintptr_t *pktid, + mbuf_tx_compl_func callback) +{ + size_t i; + + if (m == NULL || !(m->m_flags & M_PKTHDR) || callback == NULL || + pktid == NULL) + return (EINVAL); + + i = get_tx_compl_callback_index(callback); + if (i == UINT32_MAX) + return (ENOENT); + +#if (DEBUG || DEVELOPMENT) + VERIFY(i < sizeof(m->m_pkthdr.pkt_compl_callbacks)); +#endif /* (DEBUG || DEVELOPMENT) */ + + if ((m->m_pkthdr.pkt_flags & PKTF_TX_COMPL_TS_REQ) == 0) { + m->m_pkthdr.pkt_compl_callbacks = 0; + m->m_pkthdr.pkt_flags |= PKTF_TX_COMPL_TS_REQ; + m->m_pkthdr.pkt_compl_context = + atomic_add_32_ov(&mbuf_tx_compl_index, 1); + +#if (DEBUG || DEVELOPMENT) + if (mbuf_tx_compl_debug != 0) { + OSIncrementAtomic64(&mbuf_tx_compl_outstanding); + } 
+#endif /* (DEBUG || DEVELOPMENT) */ + } + m->m_pkthdr.pkt_compl_callbacks |= (1 << i); + *pktid = m->m_pkthdr.pkt_compl_context; + + return (0); +} + +void +m_do_tx_compl_callback(struct mbuf *m, struct ifnet *ifp) +{ + int i; + + if (m == NULL) + return; + + if ((m->m_pkthdr.pkt_flags & PKTF_TX_COMPL_TS_REQ) == 0) + return; + +#if (DEBUG || DEVELOPMENT) + if (mbuf_tx_compl_debug != 0 && ifp != NULL && + (ifp->if_xflags & IFXF_TIMESTAMP_ENABLED) != 0 && + (m->m_pkthdr.pkt_flags & PKTF_DRV_TS_VALID) == 0) { + struct timespec now; + + nanouptime(&now); + net_timernsec(&now, &m->m_pkthdr.pkt_timestamp); + } +#endif /* (DEBUG || DEVELOPMENT) */ + + for (i = 0; i < MAX_MBUF_TX_COMPL_FUNC; i++) { + mbuf_tx_compl_func callback; + + if ((m->m_pkthdr.pkt_compl_callbacks & (1 << i)) == 0) + continue; + + lck_rw_lock_shared(mbuf_tx_compl_tbl_lock); + callback = mbuf_tx_compl_table[i]; + lck_rw_unlock_shared(mbuf_tx_compl_tbl_lock); + + if (callback != NULL) { + callback(m->m_pkthdr.pkt_compl_context, + ifp, m->m_pkthdr.pkt_timestamp, + m->m_pkthdr.drv_tx_compl_arg, + m->m_pkthdr.drv_tx_compl_data, + m->m_pkthdr.drv_tx_status); + } + } + m->m_pkthdr.pkt_compl_callbacks = 0; + +#if (DEBUG || DEVELOPMENT) + if (mbuf_tx_compl_debug != 0) { + OSDecrementAtomic64(&mbuf_tx_compl_outstanding); + if (ifp == NULL) + atomic_add_64(&mbuf_tx_compl_aborted, 1); + } +#endif /* (DEBUG || DEVELOPMENT) */ +} diff --git a/bsd/kern/kpi_socket.c b/bsd/kern/kpi_socket.c index 82a647586..2251c3f6d 100644 --- a/bsd/kern/kpi_socket.c +++ b/bsd/kern/kpi_socket.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2012 Apple Inc. All rights reserved. + * Copyright (c) 2003-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -495,7 +495,7 @@ so_tc_from_dscp(u_int8_t dscp) else if (dscp >= 0x20 && dscp <= 0x2f) tc = SO_TC_VI; else if (dscp >= 0x08 && dscp <= 0x17) - tc = SO_TC_BK; + tc = SO_TC_BK_SYS; else tc = SO_TC_BE; @@ -1046,7 +1046,15 @@ sock_set_tcp_stream_priority(socket_t sock) void socket_set_traffic_mgt_flags_locked(socket_t sock, u_int8_t flags) { - (void) OSBitOrAtomic8(flags, &sock->so_traffic_mgt_flags); + u_int32_t soflags1 = 0; + + if ((flags & TRAFFIC_MGT_SO_BACKGROUND)) + soflags1 |= SOF1_TRAFFIC_MGT_SO_BACKGROUND; + if ((flags & TRAFFIC_MGT_TCP_RECVBG)) + soflags1 |= SOF1_TRAFFIC_MGT_TCP_RECVBG; + + (void) OSBitOrAtomic(soflags1, &sock->so_flags1); + sock_set_tcp_stream_priority(sock); } @@ -1064,7 +1072,15 @@ socket_set_traffic_mgt_flags(socket_t sock, u_int8_t flags) void socket_clear_traffic_mgt_flags_locked(socket_t sock, u_int8_t flags) { - (void) OSBitAndAtomic8(~flags, &sock->so_traffic_mgt_flags); + u_int32_t soflags1 = 0; + + if ((flags & TRAFFIC_MGT_SO_BACKGROUND)) + soflags1 |= SOF1_TRAFFIC_MGT_SO_BACKGROUND; + if ((flags & TRAFFIC_MGT_TCP_RECVBG)) + soflags1 |= SOF1_TRAFFIC_MGT_TCP_RECVBG; + + (void) OSBitAndAtomic(~soflags1, &sock->so_flags1); + sock_set_tcp_stream_priority(sock); } diff --git a/bsd/kern/kpi_socketfilter.c b/bsd/kern/kpi_socketfilter.c index 1f35405f4..e4179c09e 100644 --- a/bsd/kern/kpi_socketfilter.c +++ b/bsd/kern/kpi_socketfilter.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2013 Apple Inc. All rights reserved. + * Copyright (c) 2003-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -181,6 +181,7 @@ sflt_entry_release(struct socket_filter_entry *entry) } } +__attribute__((noreturn)) static void sflt_cleanup_thread(void *blah, wait_result_t blah2) { diff --git a/bsd/kern/mach_loader.c b/bsd/kern/mach_loader.c index d4f084e31..225550f2a 100644 --- a/bsd/kern/mach_loader.c +++ b/bsd/kern/mach_loader.c @@ -50,6 +50,9 @@ #include #include #include +#include +#include +#include #include #include /* vm_allocate() */ @@ -98,21 +101,22 @@ extern kern_return_t memory_object_signed(memory_object_control_t control, boolean_t is_signed); /* An empty load_result_t */ -static load_result_t load_result_null = { +static const load_result_t load_result_null = { .mach_header = MACH_VM_MIN_ADDRESS, .entry_point = MACH_VM_MIN_ADDRESS, .user_stack = MACH_VM_MIN_ADDRESS, .user_stack_size = 0, + .user_stack_alloc = MACH_VM_MIN_ADDRESS, + .user_stack_alloc_size = 0, .all_image_info_addr = MACH_VM_MIN_ADDRESS, .all_image_info_size = 0, .thread_count = 0, .unixproc = 0, .dynlinker = 0, .needs_dynlinker = 0, - .prog_allocated_stack = 0, - .prog_stack_size = 0, .validentry = 0, .using_lcmain = 0, + .is64bit = 0, .csflags = 0, .has_pagezero = 0, .uuid = { 0 }, @@ -136,8 +140,10 @@ parse_machfile( off_t macho_size, int depth, int64_t slide, - int64_t dyld_slide, - load_result_t *result + int64_t dyld_slide, + load_result_t *result, + load_result_t *binresult, + struct image_params *imgp ); static load_return_t @@ -167,7 +173,8 @@ load_code_signature( off_t macho_offset, off_t macho_size, cpu_type_t cputype, - load_result_t *result); + load_result_t *result, + struct image_params *imgp); #if CONFIG_CODE_DECRYPTION static load_return_t @@ -210,10 +217,11 @@ load_threadstate( static load_return_t load_threadstack( thread_t thread, - uint32_t *ts, - uint32_t total_size, + uint32_t *ts, + uint32_t total_size, mach_vm_offset_t *user_stack, - int *customstack + int *customstack, + load_result_t *result ); static load_return_t @@ 
-228,11 +236,12 @@ static load_return_t load_dylinker( struct dylinker_command *lcp, integer_t archbits, - vm_map_t map, - thread_t thread, - int depth, + vm_map_t map, + thread_t thread, + int depth, int64_t slide, - load_result_t *result + load_result_t *result, + struct image_params *imgp ); struct macho_data; @@ -329,6 +338,8 @@ load_machfile( create_map = TRUE; } + result->is64bit = ((imgp->ip_flags & IMGPF_IS_64BIT) == IMGPF_IS_64BIT); + /* * If we are spawning, we have created backing objects for the process * already, which include non-lazily creating the task map. So we @@ -348,11 +359,10 @@ load_machfile( } pmap = pmap_create(get_task_ledger(ledger_task), (vm_map_size_t) 0, - ((imgp->ip_flags & IMGPF_IS_64BIT) != 0)); - pal_switch_pmap(thread, pmap, imgp->ip_flags & IMGPF_IS_64BIT); + result->is64bit); map = vm_map_create(pmap, 0, - vm_compute_max_offset(((imgp->ip_flags & IMGPF_IS_64BIT) == IMGPF_IS_64BIT)), + vm_compute_max_offset(result->is64bit), TRUE); } else map = new_map; @@ -399,8 +409,14 @@ load_machfile( *result = load_result_null; + /* + * re-set the bitness on the load result since we cleared the load result above. + */ + result->is64bit = ((imgp->ip_flags & IMGPF_IS_64BIT) == IMGPF_IS_64BIT); + lret = parse_machfile(vp, map, thread, header, file_offset, macho_size, - 0, (int64_t)aslr_offset, (int64_t)dyld_aslr_offset, result); + 0, (int64_t)aslr_offset, (int64_t)dyld_aslr_offset, result, + NULL, imgp); if (lret != LOAD_SUCCESS) { if (create_map) { @@ -413,7 +429,7 @@ load_machfile( /* * On x86, for compatibility, don't enforce the hard page-zero restriction for 32-bit binaries. 
*/ - if ((imgp->ip_flags & IMGPF_IS_64BIT) == 0) { + if (!result->is64bit) { enforce_hard_pagezero = FALSE; } #endif @@ -430,6 +446,8 @@ load_machfile( } } + vm_commit_pagezero_status(map); + if (create_map) { /* * If this is an exec, then we are going to destroy the old @@ -504,7 +522,9 @@ parse_machfile( int depth, int64_t aslr_offset, int64_t dyld_aslr_offset, - load_result_t *result + load_result_t *result, + load_result_t *binresult, + struct image_params *imgp ) { uint32_t ncmds; @@ -525,7 +545,12 @@ parse_machfile( size_t mach_header_sz = sizeof(struct mach_header); boolean_t abi64; boolean_t got_code_signatures = FALSE; + boolean_t found_header_segment = FALSE; + boolean_t found_xhdr = FALSE; int64_t slide = 0; + boolean_t dyld_no_load_addr = FALSE; + boolean_t is_dyld = FALSE; + vm_map_offset_t effective_page_mask = MAX(PAGE_MASK, vm_map_page_mask(map)); if (header->magic == MH_MAGIC_64 || header->magic == MH_CIGAM_64) { @@ -563,6 +588,7 @@ parse_machfile( if (depth != 2) { return (LOAD_FAILURE); } + is_dyld = TRUE; break; default: @@ -616,11 +642,11 @@ parse_machfile( /* * For PIE and dyld, slide everything by the ASLR offset. */ - if ((header->flags & MH_PIE) || (header->filetype == MH_DYLINKER)) { + if ((header->flags & MH_PIE) || is_dyld) { slide = aslr_offset; } - /* + /* * Scan through the commands, processing each one as necessary. * We parse in three passes through the headers: * 0: determine if TEXT and DATA boundary can be page-aligned @@ -628,14 +654,25 @@ parse_machfile( * 2: segments * 3: dyld, encryption, check entry point */ - + + boolean_t slide_realign = FALSE; + for (pass = 0; pass <= 3; pass++) { - if (pass == 0) { - /* see if we need to adjust the slide to re-align... 
*/ - /* no re-alignment needed on X86_64 or ARM32 kernel */ + if (pass == 0 && !slide_realign && !is_dyld) { + /* if we dont need to realign the slide or determine dyld's load + * address, pass 0 can be skipped */ continue; } else if (pass == 1) { + + if (dyld_no_load_addr && binresult) { + /* + * The dyld Mach-O does not specify a load address. Try to locate + * it right after the main binary. If binresult == NULL, load + * directly to the given slide. + */ + slide = vm_map_round_page(slide + binresult->max_vm_addr, effective_page_mask); + } } /* @@ -647,6 +684,15 @@ parse_machfile( break; } + /* + * Check that some segment maps the start of the mach-o file, which is + * needed by the dynamic loader to read the mach headers, etc. + */ + if ((pass == 3) && (found_header_segment == FALSE)) { + ret = LOAD_BADMACHO; + break; + } + /* * Loop through each of the load_commands indicated by the * Mach-O header; if an absurd value is provided, we just @@ -684,11 +730,25 @@ parse_machfile( * intervention is required. */ switch(lcp->cmd) { - case LC_SEGMENT: + case LC_SEGMENT: { + struct segment_command *scp = (struct segment_command *) lcp; + if (pass == 0) { + if (is_dyld && scp->vmaddr == 0 && scp->fileoff == 0) { + dyld_no_load_addr = TRUE; + if (!slide_realign) { + /* got what we need, bail early on pass 0 */ + continue; + } + } + break; } + if (pass == 1 && !strncmp(scp->segname, "__XHDR", sizeof(scp->segname))) { + found_xhdr = TRUE; + } + if (pass != 2) break; @@ -710,8 +770,37 @@ parse_machfile( map, slide, result); + + if (ret == LOAD_SUCCESS && scp->fileoff == 0 && scp->filesize > 0) { + /* Enforce a single segment mapping offset zero, with R+X + * protection. 
*/ + if (found_header_segment || + ((scp->initprot & (VM_PROT_READ|VM_PROT_EXECUTE)) != (VM_PROT_READ|VM_PROT_EXECUTE))) { + ret = LOAD_BADMACHO; + break; + } + found_header_segment = TRUE; + } + break; - case LC_SEGMENT_64: + } + case LC_SEGMENT_64: { + struct segment_command_64 *scp64 = (struct segment_command_64 *) lcp; + + if (pass == 0) { + if (is_dyld && scp64->vmaddr == 0 && scp64->fileoff == 0) { + dyld_no_load_addr = TRUE; + if (!slide_realign) { + /* got what we need, bail early on pass 0 */ + continue; + } + } + } + + if (pass == 1 && !strncmp(scp64->segname, "__XHDR", sizeof(scp64->segname))) { + found_xhdr = TRUE; + } + if (pass != 2) break; @@ -733,7 +822,20 @@ parse_machfile( map, slide, result); + + if (ret == LOAD_SUCCESS && scp64->fileoff == 0 && scp64->filesize > 0) { + /* Enforce a single segment mapping offset zero, with R+X + * protection. */ + if (found_header_segment || + ((scp64->initprot & (VM_PROT_READ|VM_PROT_EXECUTE)) != (VM_PROT_READ|VM_PROT_EXECUTE))) { + ret = LOAD_BADMACHO; + break; + } + found_header_segment = TRUE; + } + break; + } case LC_UNIXTHREAD: if (pass != 1) break; @@ -785,7 +887,8 @@ parse_machfile( file_offset, macho_size, header->cputype, - result); + result, + imgp); if (ret != LOAD_SUCCESS) { printf("proc %d: load code signature error %d " "for file \"%s\"\n", @@ -803,22 +906,21 @@ parse_machfile( if (got_code_signatures) { unsigned tainted = CS_VALIDATE_TAINTED; boolean_t valid = FALSE; - struct cs_blob *blobs; vm_size_t off = 0; if (cs_debug > 10) printf("validating initial pages of %s\n", vp->v_name); - blobs = ubc_get_cs_blobs(vp); while (off < size && ret == LOAD_SUCCESS) { tainted = CS_VALIDATE_TAINTED; - valid = cs_validate_page(blobs, - NULL, - file_offset + off, - addr + off, - &tainted); + valid = cs_validate_range(vp, + NULL, + file_offset + off, + addr + off, + PAGE_SIZE, + &tainted); if (!valid || (tainted & CS_VALIDATE_TAINTED)) { if (cs_debug) printf("CODE SIGNING: %s[%d]: invalid initial page at offset 
%lld validated:%d tainted:%d csflags:0x%x\n", @@ -844,6 +946,7 @@ parse_machfile( addr, map, slide, vp, file_offset, header->cputype, header->cpusubtype); if (ret != LOAD_SUCCESS) { + os_reason_t load_failure_reason = OS_REASON_NULL; printf("proc %d: set_code_unprotect() error %d " "for file \"%s\"\n", p->p_pid, ret, vp->v_name); @@ -858,8 +961,19 @@ parse_machfile( proc_lock(p); p->p_lflag |= P_LTERM_DECRYPTFAIL; proc_unlock(p); + + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) | DBG_FUNC_NONE, + p->p_pid, OS_REASON_EXEC, EXEC_EXIT_REASON_FAIRPLAY_DECRYPT, 0, 0); + load_failure_reason = os_reason_create(OS_REASON_EXEC, EXEC_EXIT_REASON_FAIRPLAY_DECRYPT); + } else { + + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) | DBG_FUNC_NONE, + p->p_pid, OS_REASON_EXEC, EXEC_EXIT_REASON_DECRYPT, 0, 0); + load_failure_reason = os_reason_create(OS_REASON_EXEC, EXEC_EXIT_REASON_DECRYPT); } - psignal(p, SIGKILL); + + assert(load_failure_reason != OS_REASON_NULL); + psignal_with_reason(p, SIGKILL, load_failure_reason); } break; #endif @@ -891,7 +1005,7 @@ parse_machfile( if (blob != NULL) { unsigned int cs_flag_data = blob->csb_flags; if(0 != ubc_cs_generation_check(vp)) { - if (0 != ubc_cs_blob_revalidate(vp, blob, 0)) { + if (0 != ubc_cs_blob_revalidate(vp, blob, imgp, 0)) { /* clear out the flag data if revalidation fails */ cs_flag_data = 0; result->csflags &= ~CS_VALID; @@ -914,18 +1028,23 @@ parse_machfile( * offset regardless of the PIE-ness of the main binary. 
*/ ret = load_dylinker(dlp, dlarchbits, map, thread, depth, - dyld_aslr_offset, result); + dyld_aslr_offset, result, imgp); } - - if((ret == LOAD_SUCCESS) && (depth == 1)) { + + if ((ret == LOAD_SUCCESS) && (depth == 1)) { if (result->thread_count == 0) { ret = LOAD_FAILURE; } } } - if (kl_addr ) + if (ret == LOAD_BADMACHO && found_xhdr) { + ret = LOAD_BADMACHO_UPX; + } + + if (kl_addr) { kfree(kl_addr, kl_size); + } return(ret); } @@ -977,14 +1096,18 @@ unprotect_dsmos_segment( crypt_info.crypt_ops = (void *)0x2e69cf40; vm_map_offset_t crypto_backing_offset; crypto_backing_offset = -1; /* i.e. use map entry's offset */ -#if DEVELOPMENT || DEBUG - struct proc *p; - p = current_proc(); - printf("APPLE_PROTECT: %d[%s] map %p [0x%llx:0x%llx] %s(%s)\n", - p->p_pid, p->p_comm, map, - (uint64_t) map_addr, (uint64_t) (map_addr + map_size), - __FUNCTION__, vp->v_name); -#endif /* DEVELOPMENT || DEBUG */ +#if VM_MAP_DEBUG_APPLE_PROTECT + if (vm_map_debug_apple_protect) { + struct proc *p; + p = current_proc(); + printf("APPLE_PROTECT: %d[%s] map %p " + "[0x%llx:0x%llx] %s(%s)\n", + p->p_pid, p->p_comm, map, + (uint64_t) map_addr, + (uint64_t) (map_addr + map_size), + __FUNCTION__, vp->v_name); + } +#endif /* VM_MAP_DEBUG_APPLE_PROTECT */ /* The DSMOS pager can only be used by apple signed code */ struct cs_blob * blob = csvnode_get_blob(vp, file_off); @@ -1497,18 +1620,30 @@ load_main( if (thread == THREAD_NULL) return (LOAD_SUCCESS); - /* LC_MAIN specifies stack size but not location */ + /* + * LC_MAIN specifies stack size but not location. + * Add guard page to allocation size (MAXSSIZ includes guard page). 
+ */ if (epc->stacksize) { - result->prog_stack_size = 1; + if (os_add_overflow(epc->stacksize, 4*PAGE_SIZE, &result->user_stack_size)) { + /* + * We are going to immediately throw away this result, but we want + * to make sure we aren't loading a dangerously close to + * overflowing value, since this will have a guard page added to it + * and be rounded to page boundaries + */ + return LOAD_BADMACHO; + } result->user_stack_size = epc->stacksize; + if (os_add_overflow(epc->stacksize, PAGE_SIZE, &result->user_stack_alloc_size)) { + return LOAD_BADMACHO; + } } else { - result->prog_stack_size = 0; - result->user_stack_size = MAXSSIZ; + result->user_stack_alloc_size = MAXSSIZ; } - result->prog_allocated_stack = 0; /* use default location for stack */ - ret = thread_userstackdefault(thread, &addr); + ret = thread_userstackdefault(&addr, result->is64bit); if (ret != KERN_SUCCESS) return(LOAD_FAILURE); @@ -1524,6 +1659,12 @@ load_main( /* kernel does *not* use entryoff from LC_MAIN. Dyld uses it. 
*/ result->needs_dynlinker = TRUE; result->using_lcmain = TRUE; + + ret = thread_state_initialize( thread ); + if (ret != KERN_SUCCESS) { + return(LOAD_FAILURE); + } + result->unixproc = TRUE; result->thread_count++; @@ -1557,20 +1698,14 @@ load_unixthread( (uint32_t *)(((vm_offset_t)tcp) + sizeof(struct thread_command)), tcp->cmdsize - sizeof(struct thread_command), - &addr, - &customstack); + &addr, &customstack, result); if (ret != LOAD_SUCCESS) return(ret); /* LC_UNIXTHREAD optionally specifies stack size and location */ - if (customstack) { - result->prog_stack_size = 0; /* unknown */ - result->prog_allocated_stack = 1; - } else { - result->prog_allocated_stack = 0; - result->prog_stack_size = 0; - result->user_stack_size = MAXSSIZ; + if (!customstack) { + result->user_stack_alloc_size = MAXSSIZ; } /* The stack slides down from the default location */ @@ -1644,8 +1779,7 @@ load_threadstate( flavor = *ts++; size = *ts++; - if (os_add_overflow(size, UINT32_C(2), &thread_size) || - os_mul_overflow(thread_size, (uint32_t)sizeof(uint32_t), &thread_size) || + if (os_add_and_mul_overflow(size, 2, sizeof(uint32_t), &thread_size) || os_sub_overflow(total_size, thread_size, &total_size)) { ret = LOAD_BADMACHO; goto bad; @@ -1668,11 +1802,12 @@ load_threadstate( static load_return_t load_threadstack( - thread_t thread, - uint32_t *ts, - uint32_t total_size, + thread_t thread, + uint32_t *ts, + uint32_t total_size, mach_vm_offset_t *user_stack, - int *customstack + int *customstack, + load_result_t *result ) { kern_return_t ret; @@ -1696,7 +1831,7 @@ load_threadstack( * to the appropriate type in thread_userstack() based on * the value of flavor. 
*/ - ret = thread_userstack(thread, flavor, (thread_state_t)ts, size, user_stack, customstack); + ret = thread_userstack(thread, flavor, (thread_state_t)ts, size, user_stack, customstack, result->is64bit); if (ret != KERN_SUCCESS) { return(LOAD_FAILURE); } @@ -1758,6 +1893,11 @@ struct macho_data { #define DEFAULT_DYLD_PATH "/usr/lib/dyld" +#if (DEVELOPMENT || DEBUG) +extern char dyld_alt_path[]; +extern int use_alt_dyld; +#endif + static load_return_t load_dylinker( struct dylinker_command *lcp, @@ -1766,7 +1906,8 @@ load_dylinker( thread_t thread, int depth, int64_t slide, - load_result_t *result + load_result_t *result, + struct image_params *imgp ) { char *name; @@ -1788,6 +1929,7 @@ load_dylinker( return (LOAD_BADMACHO); name = (char *)lcp + lcp->name.offset; + /* * Check for a proper null terminated string. */ @@ -1797,6 +1939,29 @@ load_dylinker( return(LOAD_BADMACHO); } while (*p++); +#if (DEVELOPMENT || DEBUG) + + /* + * rdar://23680808 + * If an alternate dyld has been specified via boot args, check + * to see if PROC_UUID_ALT_DYLD_POLICY has been set on this + * executable and redirect the kernel to load that linker. + */ + + if (use_alt_dyld) { + int policy_error; + uint32_t policy_flags = 0; + int32_t policy_gencount = 0; + + policy_error = proc_uuid_policy_lookup(result->uuid, &policy_flags, &policy_gencount); + if (policy_error == 0) { + if (policy_flags & PROC_UUID_ALT_DYLD_POLICY) { + name = dyld_alt_path; + } + } + } +#endif + #if !(DEVELOPMENT || DEBUG) if (0 != strcmp(name, DEFAULT_DYLD_PATH)) { return (LOAD_BADMACHO); @@ -1816,78 +1981,10 @@ load_dylinker( goto novp_out; *myresult = load_result_null; - - /* - * First try to map dyld in directly. This should work most of - * the time since there shouldn't normally be something already - * mapped to its address. 
- */ + myresult->is64bit = result->is64bit; ret = parse_machfile(vp, map, thread, header, file_offset, - macho_size, depth, slide, 0, myresult); - - /* - * If it turned out something was in the way, then we'll take - * take this longer path to preflight dyld's vm ranges, then - * map it at a free location in the address space. - */ - - if (ret == LOAD_NOSPACE) { - mach_vm_offset_t dyl_start, map_addr; - mach_vm_size_t dyl_length; - int64_t slide_amount; - - *myresult = load_result_null; - - /* - * Preflight parsing the Mach-O file with a NULL - * map, which will return the ranges needed for a - * subsequent map attempt (with a slide) in "myresult" - */ - ret = parse_machfile(vp, VM_MAP_NULL, THREAD_NULL, header, - file_offset, macho_size, depth, - 0 /* slide */, 0, myresult); - - if (ret != LOAD_SUCCESS) { - goto out; - } - - dyl_start = myresult->min_vm_addr; - dyl_length = myresult->max_vm_addr - myresult->min_vm_addr; - - dyl_length += slide; - - /* To find an appropriate load address, do a quick allocation */ - map_addr = dyl_start; - ret = mach_vm_allocate(map, &map_addr, dyl_length, VM_FLAGS_ANYWHERE); - if (ret != KERN_SUCCESS) { - ret = LOAD_NOSPACE; - goto out; - } - - ret = mach_vm_deallocate(map, map_addr, dyl_length); - if (ret != KERN_SUCCESS) { - ret = LOAD_NOSPACE; - goto out; - } - - if (map_addr < dyl_start) - slide_amount = -(int64_t)(dyl_start - map_addr); - else - slide_amount = (int64_t)(map_addr - dyl_start); - - slide_amount += slide; - - *myresult = load_result_null; - - ret = parse_machfile(vp, map, thread, header, - file_offset, macho_size, depth, - slide_amount, 0, myresult); - - if (ret) { - goto out; - } - } + macho_size, depth, slide, 0, myresult, result, imgp); if (ret == LOAD_SUCCESS) { if (result->threadstate) { @@ -1906,7 +2003,7 @@ load_dylinker( result->csflags |= CS_DYLD_PLATFORM; } } -out: + vnode_put(vp); novp_out: FREE(dyld_data, M_TEMP); @@ -1921,7 +2018,8 @@ load_code_signature( off_t macho_offset, off_t macho_size, 
cpu_type_t cputype, - load_result_t *result) + load_result_t *result, + struct image_params *imgp) { int ret; kern_return_t kr; @@ -1948,7 +2046,7 @@ load_code_signature( blob->csb_mem_size == lcp->datasize) { /* it matches the blob we want here, lets verify the version */ if(0 != ubc_cs_generation_check(vp)) { - if (0 != ubc_cs_blob_revalidate(vp, blob, 0)) { + if (0 != ubc_cs_blob_revalidate(vp, blob, imgp, 0)) { ret = LOAD_FAILURE; /* set error same as from ubc_cs_blob_add */ goto out; } @@ -1987,10 +2085,14 @@ load_code_signature( if (ubc_cs_blob_add(vp, cputype, macho_offset, - addr, + &addr, lcp->datasize, + imgp, 0, &blob)) { + if (addr) { + ubc_cs_blob_deallocate(addr, blob_size); + } ret = LOAD_FAILURE; goto out; } else { @@ -2083,12 +2185,14 @@ set_code_unprotect( .cputype = cputype, .cpusubtype = cpusubtype}; kr=text_crypter_create(&crypt_info, cryptname, (void*)&crypt_data); -#if DEVELOPMENT || DEBUG - struct proc *p; - p = current_proc(); - printf("APPLE_PROTECT: %d[%s] map %p %s(%s) -> 0x%x\n", - p->p_pid, p->p_comm, map, __FUNCTION__, vpath, kr); -#endif /* DEVELOPMENT || DEBUG */ +#if VM_MAP_DEBUG_APPLE_PROTECT + if (vm_map_debug_apple_protect) { + struct proc *p; + p = current_proc(); + printf("APPLE_PROTECT: %d[%s] map %p %s(%s) -> 0x%x\n", + p->p_pid, p->p_comm, map, __FUNCTION__, vpath, kr); + } +#endif /* VM_MAP_DEBUG_APPLE_PROTECT */ FREE_ZONE(vpath, MAXPATHLEN, M_NAMEI); if(kr) { diff --git a/bsd/kern/mach_loader.h b/bsd/kern/mach_loader.h index bcef8baa3..760ea45d9 100644 --- a/bsd/kern/mach_loader.h +++ b/bsd/kern/mach_loader.h @@ -52,8 +52,15 @@ typedef int load_return_t; typedef struct _load_result { user_addr_t mach_header; user_addr_t entry_point; + + // The user stack pointer and addressable user stack size. user_addr_t user_stack; mach_vm_size_t user_stack_size; + + // The allocation containing the stack and guard area. 
+ user_addr_t user_stack_alloc; + mach_vm_size_t user_stack_alloc_size; + mach_vm_address_t all_image_info_addr; mach_vm_size_t all_image_info_size; int thread_count; @@ -61,11 +68,10 @@ typedef struct _load_result { /* boolean_t */ unixproc :1, needs_dynlinker : 1, dynlinker :1, - prog_allocated_stack :1, - prog_stack_size : 1, validentry :1, has_pagezero :1, using_lcmain :1, + is64bit :1, :0; unsigned int csflags; unsigned char uuid[16]; @@ -96,5 +102,6 @@ load_return_t load_machfile( #define LOAD_ENOENT 8 /* resource not found */ #define LOAD_IOERROR 9 /* IO error */ #define LOAD_DECRYPTFAIL 10 /* FP decrypt failure */ +#define LOAD_BADMACHO_UPX 11 /* malformed mach-o file */ #endif /* _BSD_KERN_MACH_LOADER_H_ */ diff --git a/bsd/kern/makesyscalls.sh b/bsd/kern/makesyscalls.sh index 7317f55b8..29dd74b74 100755 --- a/bsd/kern/makesyscalls.sh +++ b/bsd/kern/makesyscalls.sh @@ -34,6 +34,9 @@ output_sysprotofile=0 output_syshdrfile=0 output_syscalltablefile=0 output_auditevfile=0 +output_tracecodes=0 + +use_stdout=0 # output files: syscallnamesfile="syscalls.c" @@ -46,6 +49,7 @@ auditevfile="audit_kevents.c" syscallprefix="SYS_" switchname="sysent" namesname="syscallnames" +tracecodename="syscall.codes" # tmp files: syslegal="sysent.syslegal.$$" @@ -56,14 +60,15 @@ sysprotoend="sysprotoend.$$" syscallnamestempfile="syscallnamesfile.$$" syshdrtempfile="syshdrtempfile.$$" audittempfile="audittempfile.$$" +tracecodetempfile="tracecodetempfile.$$" -trap "rm $syslegal $sysent $sysinc $sysarg $sysprotoend $syscallnamestempfile $syshdrtempfile $audittempfile" 0 +trap "rm $syslegal $sysent $sysinc $sysarg $sysprotoend $syscallnamestempfile $syshdrtempfile $audittempfile $tracecodetempfile" 0 -touch $syslegal $sysent $sysinc $sysarg $sysprotoend $syscallnamestempfile $syshdrtempfile $audittempfile +touch $syslegal $sysent $sysinc $sysarg $sysprotoend $syscallnamestempfile $syshdrtempfile $audittempfile $tracecodetempfile case $# in 0) - echo "usage: $0 input-file [ []]" 1>&2 
+ echo "usage: $0 input-file [ []]" 1>&2 exit 1 ;; esac @@ -88,6 +93,10 @@ if [ -n "$1" ]; then audit) output_auditevfile=1 ;; + trace) + output_tracecodes=1 + use_stdout=1 + ;; esac shift; else @@ -96,6 +105,7 @@ else output_syshdrfile=1 output_syscalltablefile=1 output_auditevfile=1 + output_tracecodes=1 fi if [ -n "$1" -a -f "$1" ]; then @@ -132,6 +142,7 @@ s/\$//g syshdrfile = \"$syshdrfile\" syshdrtempfile = \"$syshdrtempfile\" audittempfile = \"$audittempfile\" + tracecodetempfile = \"$tracecodetempfile\" syscallprefix = \"$syscallprefix\" switchname = \"$switchname\" namesname = \"$namesname\" @@ -290,6 +301,7 @@ s/\$//g argc = 0 argssize = "0" additional_comments = " " + obs_comments = "_" # find start and end of call name and arguments if ($current_field != "{") @@ -347,9 +359,15 @@ s/\$//g current_field = comments_start + 1 while (current_field < comments_end) { additional_comments = additional_comments $current_field " " + obs_comments = obs_comments $current_field "_" current_field++ } } + sub(/old/, "obs", obs_comments) + obs_comments = substr(obs_comments, 1, length(obs_comments)-1) + if (obs_comments == "_") { + obs_comments = "" + } # get function return type current_field = args_start + 1 @@ -625,13 +643,6 @@ s/\$//g linesize = length(syscallprefix) + length(tempname) + 12 align_comment(linesize, 30, syshdrtempfile) printf("%d\n", syscall_num) > syshdrtempfile - # special case for gettimeofday on ppc - cctools project uses old name - if (tempname == "ppc_gettimeofday") { - printf("#define\t%s%s", syscallprefix, "gettimeofday") > syshdrtempfile - linesize = length(syscallprefix) + length(tempname) + 12 - align_comment(linesize, 30, syshdrtempfile) - printf("%d\n", syscall_num) > syshdrtempfile - } } else if (skip_for_header == 0) { printf("\t\t\t/* %d %s*/\n", syscall_num, additional_comments) > syshdrtempfile @@ -653,7 +664,21 @@ s/\$//g # output to audit_kevents.c printf("\t%s,\t\t", auditev) > audittempfile printf("/* %d = %s%s*/\n", syscall_num, 
tempname, additional_comments) > audittempfile - + + tempname = funcname + if (skip_for_header == 0) { + if (tempname == "nosys" || tempname == "enosys") { + if (obs_comments == "") { + printf("0x40c%04x\tBSC_#%d%s\n", (syscall_num*4), syscall_num, obs_comments) > tracecodetempfile + } else { + printf("0x40c%04x\tBSC%s\n", (syscall_num*4), obs_comments) > tracecodetempfile + } + } else { + sub(/^_+/, "", tempname) + printf("0x40c%04x\tBSC_%s\n", (syscall_num*4), tempname) > tracecodetempfile + } + } + syscall_num++ next } @@ -671,13 +696,13 @@ s/\$//g printf("\n#endif /* !%s */\n", sysproto_h) > sysprotoend printf("};\n") > sysent - printf("int nsysent = sizeof(sysent) / sizeof(sysent[0]);\n") > sysent - printf("/* Verify that NUM_SYSENT reflects the latest syscall count */\n") > sysent - printf("_Static_assert(((sizeof(sysent) / sizeof(sysent[0])) == NUM_SYSENT), \"NUM_SYSENT needs to be updated to match syscall count\");\n") > sysent + printf("unsigned int nsysent = sizeof(sysent) / sizeof(sysent[0]);\n") > sysent printf("};\n") > syscallnamestempfile printf("#define\t%sMAXSYSCALL\t%d\n", syscallprefix, syscall_num) \ > syshdrtempfile + printf("#define\t%sinvalid\t%d\n", syscallprefix, 63) \ + > syshdrtempfile printf("\n#endif /* __APPLE_API_PRIVATE */\n") > syshdrtempfile printf("#endif /* !%s */\n", syscall_h) > syshdrtempfile printf("};\n\n") > audittempfile @@ -707,3 +732,11 @@ fi if [ $output_auditevfile -eq 1 ]; then cat $syslegal $audittempfile > $auditevfile fi + +if [ $output_tracecodes -eq 1 ]; then + if [ $use_stdout -eq 1 ]; then + cat $tracecodetempfile + else + cat $tracecodetempfile > $tracecodename + fi +fi diff --git a/bsd/kern/policy_check.c b/bsd/kern/policy_check.c index cc4288b05..86b5049c3 100644 --- a/bsd/kern/policy_check.c +++ b/bsd/kern/policy_check.c @@ -118,7 +118,7 @@ common_hook(void) return rv; } -#if (MAC_POLICY_OPS_VERSION != 39) +#if (MAC_POLICY_OPS_VERSION != 45) # error "struct mac_policy_ops doesn't match definition in 
mac_policy.h" #endif /* @@ -201,15 +201,16 @@ static struct mac_policy_ops policy_ops = { CHECK_SET_HOOK(ipq_label_init) CHECK_SET_HOOK(ipq_label_update) - .mpo_reserved1 = (mpo_reserved_hook_t *)common_hook, - .mpo_reserved2 = (mpo_reserved_hook_t *)common_hook, - .mpo_reserved3 = (mpo_reserved_hook_t *)common_hook, - .mpo_reserved4 = (mpo_reserved_hook_t *)common_hook, - .mpo_reserved5 = (mpo_reserved_hook_t *)common_hook, - .mpo_reserved6 = (mpo_reserved_hook_t *)common_hook, - .mpo_reserved7 = (mpo_reserved_hook_t *)common_hook, - .mpo_reserved8 = (mpo_reserved_hook_t *)common_hook, - .mpo_reserved9 = (mpo_reserved_hook_t *)common_hook, + CHECK_SET_HOOK(file_check_library_validation) + + CHECK_SET_HOOK(vnode_notify_setacl) + CHECK_SET_HOOK(vnode_notify_setattrlist) + CHECK_SET_HOOK(vnode_notify_setextattr) + CHECK_SET_HOOK(vnode_notify_setflags) + CHECK_SET_HOOK(vnode_notify_setmode) + CHECK_SET_HOOK(vnode_notify_setowner) + CHECK_SET_HOOK(vnode_notify_setutimes) + CHECK_SET_HOOK(vnode_notify_truncate) CHECK_SET_HOOK(mbuf_label_associate_bpfdesc) CHECK_SET_HOOK(mbuf_label_associate_ifnet) @@ -272,12 +273,13 @@ static struct mac_policy_ops policy_ops = { CHECK_SET_HOOK(proc_check_expose_task) CHECK_SET_HOOK(proc_check_set_host_special_port) CHECK_SET_HOOK(proc_check_set_host_exception_port) - .mpo_reserved11 = (mpo_reserved_hook_t *)common_hook, - .mpo_reserved12 = (mpo_reserved_hook_t *)common_hook, - .mpo_reserved13 = (mpo_reserved_hook_t *)common_hook, - .mpo_reserved14 = (mpo_reserved_hook_t *)common_hook, - .mpo_reserved15 = (mpo_reserved_hook_t *)common_hook, - .mpo_reserved16 = (mpo_reserved_hook_t *)common_hook, + CHECK_SET_HOOK(exc_action_check_exception_send) + CHECK_SET_HOOK(exc_action_label_associate) + CHECK_SET_HOOK(exc_action_label_copy) + CHECK_SET_HOOK(exc_action_label_destroy) + CHECK_SET_HOOK(exc_action_label_init) + CHECK_SET_HOOK(exc_action_label_update) + .mpo_reserved17 = (mpo_reserved_hook_t *)common_hook, .mpo_reserved18 = 
(mpo_reserved_hook_t *)common_hook, .mpo_reserved19 = (mpo_reserved_hook_t *)common_hook, @@ -397,9 +399,9 @@ static struct mac_policy_ops policy_ops = { .mpo_reserved23 = (mpo_reserved_hook_t *)common_hook, .mpo_reserved24 = (mpo_reserved_hook_t *)common_hook, .mpo_reserved25 = (mpo_reserved_hook_t *)common_hook, - .mpo_reserved26 = (mpo_reserved_hook_t *)common_hook, - .mpo_reserved27 = (mpo_reserved_hook_t *)common_hook, - .mpo_reserved28 = (mpo_reserved_hook_t *)common_hook, + CHECK_SET_HOOK(mount_check_snapshot_create) + CHECK_SET_HOOK(mount_check_snapshot_delete) + CHECK_SET_HOOK(vnode_check_clone) CHECK_SET_HOOK(proc_check_get_cs_info) CHECK_SET_HOOK(proc_check_set_cs_info) @@ -485,8 +487,9 @@ static struct mac_policy_ops policy_ops = { CHECK_SET_HOOK(vnode_notify_rename) - .mpo_reserved32 = (mpo_reserved_hook_t *)common_hook, - .mpo_reserved33 = (mpo_reserved_hook_t *)common_hook, + CHECK_SET_HOOK(vnode_check_setacl) + + CHECK_SET_HOOK(vnode_notify_deleteextattr) CHECK_SET_HOOK(system_check_kas_info) diff --git a/bsd/kern/posix_sem.c b/bsd/kern/posix_sem.c index 1b166106c..54b92f059 100644 --- a/bsd/kern/posix_sem.c +++ b/bsd/kern/posix_sem.c @@ -184,14 +184,14 @@ static int psem_unlink_internal(struct pseminfo *pinfo, struct psemcache *pcache static int psem_kqfilter (struct fileproc *fp, struct knote *kn, vfs_context_t ctx); static const struct fileops psemops = { - DTYPE_PSXSEM, - psem_read, - psem_write, - psem_ioctl, - psem_select, - psem_closefile, - psem_kqfilter, - NULL + .fo_type = DTYPE_PSXSEM, + .fo_read = psem_read, + .fo_write = psem_write, + .fo_ioctl = psem_ioctl, + .fo_select = psem_select, + .fo_close = psem_closefile, + .fo_kqfilter = psem_kqfilter, + .fo_drain = NULL, }; static lck_grp_t *psx_sem_subsys_lck_grp; @@ -201,7 +201,7 @@ static lck_mtx_t psx_sem_subsys_mutex; #define PSEM_SUBSYS_LOCK() lck_mtx_lock(& psx_sem_subsys_mutex) #define PSEM_SUBSYS_UNLOCK() lck_mtx_unlock(& psx_sem_subsys_mutex) -#define PSEM_SUBSYS_ASSERT_HELD() 
lck_mtx_assert(&psx_sem_subsys_mutex, LCK_MTX_ASSERT_OWNED) +#define PSEM_SUBSYS_ASSERT_HELD() LCK_MTX_ASSERT(&psx_sem_subsys_mutex, LCK_MTX_ASSERT_OWNED) static int psem_cache_add(struct pseminfo *psemp, struct psemname *pnp, struct psemcache *pcp); @@ -1121,10 +1121,12 @@ psem_select(__unused struct fileproc *fp, __unused int which, } static int -psem_kqfilter(__unused struct fileproc *fp, __unused struct knote *kn, +psem_kqfilter(__unused struct fileproc *fp, struct knote *kn, __unused vfs_context_t ctx) { - return (ENOTSUP); + kn->kn_flags = EV_ERROR; + kn->kn_data = ENOTSUP; + return 0; } int diff --git a/bsd/kern/posix_shm.c b/bsd/kern/posix_shm.c index 38faf2939..2ddd346bc 100644 --- a/bsd/kern/posix_shm.c +++ b/bsd/kern/posix_shm.c @@ -197,14 +197,14 @@ static int pshm_cache_search(struct pshminfo **pshmp, struct pshmname *pnp, static int pshm_unlink_internal(struct pshminfo *pinfo, struct pshmcache *pcache); static const struct fileops pshmops = { - DTYPE_PSXSHM, - pshm_read, - pshm_write, - pshm_ioctl, - pshm_select, - pshm_closefile, - pshm_kqfilter, - 0 + .fo_type = DTYPE_PSXSHM, + .fo_read = pshm_read, + .fo_write = pshm_write, + .fo_ioctl = pshm_ioctl, + .fo_select = pshm_select, + .fo_close = pshm_closefile, + .fo_kqfilter = pshm_kqfilter, + .fo_drain = NULL, }; static lck_grp_t *psx_shm_subsys_lck_grp; @@ -214,7 +214,7 @@ static lck_mtx_t psx_shm_subsys_mutex; #define PSHM_SUBSYS_LOCK() lck_mtx_lock(& psx_shm_subsys_mutex) #define PSHM_SUBSYS_UNLOCK() lck_mtx_unlock(& psx_shm_subsys_mutex) -#define PSHM_SUBSYS_ASSERT_HELD() lck_mtx_assert(&psx_shm_subsys_mutex, LCK_MTX_ASSERT_OWNED) +#define PSHM_SUBSYS_ASSERT_HELD() LCK_MTX_ASSERT(&psx_shm_subsys_mutex, LCK_MTX_ASSERT_OWNED) /* Initialize the mutex governing access to the posix shm subsystem */ @@ -1283,10 +1283,12 @@ pshm_select(__unused struct fileproc *fp, __unused int which, __unused void *wql } static int -pshm_kqfilter(__unused struct fileproc *fp, __unused struct knote *kn, 
+pshm_kqfilter(__unused struct fileproc *fp, struct knote *kn, __unused vfs_context_t ctx) { - return(ENOTSUP); + kn->kn_flags = EV_ERROR; + kn->kn_data = ENOTSUP; + return 0; } int diff --git a/bsd/kern/proc_info.c b/bsd/kern/proc_info.c index e26393c52..6f5333f47 100644 --- a/bsd/kern/proc_info.c +++ b/bsd/kern/proc_info.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2005-2013 Apple Inc. All rights reserved. + * Copyright (c) 2005-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include @@ -58,6 +59,8 @@ #include #include #include +#include + #include #include #include @@ -161,13 +164,14 @@ void __attribute__ ((noinline)) proc_piduniqidentifierinfo(proc_t p, struct proc void __attribute__ ((noinline)) proc_archinfo(proc_t p, struct proc_archinfo *pai); void __attribute__ ((noinline)) proc_pidcoalitioninfo(proc_t p, struct proc_pidcoalitioninfo *pci); int __attribute__ ((noinline)) proc_pidnoteexit(proc_t p, uint64_t arg, uint32_t *data); +int __attribute__ ((noinline)) proc_pidexitreasoninfo(proc_t p, struct proc_exitreasoninfo *peri, struct proc_exitreasonbasicinfo *pberi); int __attribute__ ((noinline)) proc_pidoriginatorpid_uuid(uuid_t uuid, uint32_t buffersize, pid_t *pid); /* protos for proc_pidfdinfo calls */ int __attribute__ ((noinline)) pid_vnodeinfo(vnode_t vp, uint32_t vid, struct fileproc * fp,proc_t proc, int fd, user_addr_t buffer, uint32_t buffersize, int32_t * retval); int __attribute__ ((noinline)) pid_vnodeinfopath(vnode_t vp, uint32_t vid, struct fileproc * fp,proc_t proc, int fd, user_addr_t buffer, uint32_t buffersize, int32_t * retval); -int __attribute__ ((noinline)) pid_socketinfo(socket_t so, struct fileproc *fp,proc_t proc, int fd, user_addr_t buffer, uint32_t buffersize, int32_t * retval); +int __attribute__ ((noinline)) pid_socketinfo(socket_t so, struct fileproc *fp,proc_t proc, int fd, user_addr_t buffer, uint32_t buffersize, int32_t 
* retval); int __attribute__ ((noinline)) pid_pseminfo(struct psemnode * psem, struct fileproc * fp, proc_t proc, int fd, user_addr_t buffer, uint32_t buffersize, int32_t * retval); int __attribute__ ((noinline)) pid_pshminfo(struct pshmnode * pshm, struct fileproc * fp, proc_t proc, int fd, user_addr_t buffer, uint32_t buffersize, int32_t * retval); int __attribute__ ((noinline)) pid_pipeinfo(struct pipe * p, struct fileproc * fp, proc_t proc, int fd, user_addr_t buffer, uint32_t buffersize, int32_t * retval); @@ -182,6 +186,10 @@ void fill_fileinfo(struct fileproc * fp, proc_t proc, int fd, struct proc_filei int proc_security_policy(proc_t targetp, int callnum, int flavor, boolean_t check_same_user); static void munge_vinfo_stat(struct stat64 *sbp, struct vinfo_stat *vsbp); static int proc_piduuidinfo(pid_t pid, uuid_t uuid_buf, uint32_t buffersize); +int proc_pidpathinfo_internal(proc_t p, __unused uint64_t arg, char *buf, uint32_t buffersize, __unused int32_t *retval); +int proc_listfd_kqueue(proc_t p, int32_t *fdlist, int len); +int proc_kqueue_udata_info(proc_t p, int32_t fd, uint64_t *buffer, int bufsize); +int proc_list_uptrs(proc_t p, uint64_t *udata_buffer, int size); extern int cansignal(struct proc *, kauth_cred_t, struct proc *, int, int); extern int proc_get_rusage(proc_t proc, int flavor, user_addr_t buffer, int is_zombie); @@ -556,7 +564,7 @@ proc_pidfileportlist(proc_t p, int proc_pidbsdinfo(proc_t p, struct proc_bsdinfo * pbsd, int zombie) { - register struct tty *tp; + struct tty *tp; struct session *sessionp = NULL; struct pgrp * pg; kauth_cred_t my_cred; @@ -761,6 +769,19 @@ proc_pidthreadinfo(proc_t p, uint64_t arg, int thuniqueid, struct proc_threadin } +boolean_t +bsd_hasthreadname(void *uth) +{ + struct uthread *ut = (struct uthread*)uth; + + /* This doesn't check for the empty string; do we care? 
*/ + if (ut->pth_name) { + return TRUE; + } else { + return FALSE; + } +} + void bsd_getthreadname(void *uth, char *buffer) { @@ -769,6 +790,52 @@ bsd_getthreadname(void *uth, char *buffer) bcopy(ut->pth_name,buffer,MAXTHREADNAMESIZE); } +/* + * This is known to race with regards to the contents of the thread name; concurrent + * callers may result in a garbled name. + */ +void +bsd_setthreadname(void *uth, const char *name) { + struct uthread *ut = (struct uthread *)uth; + char * name_buf = NULL; + + if (!ut->pth_name) { + /* If there is no existing thread name, allocate a buffer for one. */ + name_buf = kalloc(MAXTHREADNAMESIZE); + assert(name_buf); + bzero(name_buf, MAXTHREADNAMESIZE); + + /* Someone could conceivably have named the thread at the same time we did. */ + if (!OSCompareAndSwapPtr(NULL, name_buf, &ut->pth_name)) { + kfree(name_buf, MAXTHREADNAMESIZE); + } + } else { + kernel_debug_string_simple(TRACE_STRING_THREADNAME_PREV, ut->pth_name); + } + + strncpy(ut->pth_name, name, MAXTHREADNAMESIZE - 1); + kernel_debug_string_simple(TRACE_STRING_THREADNAME, ut->pth_name); +} + +void +bsd_copythreadname(void *dst_uth, void *src_uth) +{ + struct uthread *dst_ut = (struct uthread *)dst_uth; + struct uthread *src_ut = (struct uthread *)src_uth; + + if (src_ut->pth_name == NULL) + return; + + if (dst_ut->pth_name == NULL) { + dst_ut->pth_name = (char *)kalloc(MAXTHREADNAMESIZE); + if (dst_ut->pth_name == NULL) + return; + } + + bcopy(src_ut->pth_name, dst_ut->pth_name, MAXTHREADNAMESIZE); + return; +} + void bsd_threadcdir(void * uth, void *vptr, int *vidp) { @@ -1068,9 +1135,8 @@ proc_pidvnodepathinfo(proc_t p, __unused uint64_t arg, user_addr_t buffer, __unu int proc_pidpathinfo(proc_t p, __unused uint64_t arg, user_addr_t buffer, uint32_t buffersize, __unused int32_t *retval) { - int vid, error; + int error; vnode_t tvp; - vnode_t nvp = NULLVP; int len = buffersize; char * buf; @@ -1083,6 +1149,26 @@ proc_pidpathinfo(proc_t p, __unused uint64_t arg, 
user_addr_t buffer, uint32_t b if (buf == NULL) return(ENOMEM); + error = proc_pidpathinfo_internal(p, arg, buf, buffersize, retval); + if (error == 0) { + error = copyout(buf, buffer, len); + } + kfree(buf, buffersize); + return(error); +} + +int +proc_pidpathinfo_internal(proc_t p, __unused uint64_t arg, char *buf, uint32_t buffersize, __unused int32_t *retval) +{ + int vid, error; + vnode_t tvp; + vnode_t nvp = NULLVP; + int len = buffersize; + + tvp = p->p_textvp; + + if (tvp == NULLVP) + return(ESRCH); vid = vnode_vid(tvp); error = vnode_getwithvid(tvp, vid); @@ -1093,12 +1179,8 @@ proc_pidpathinfo(proc_t p, __unused uint64_t arg, user_addr_t buffer, uint32_t b error = vnode_lookup(buf, 0, &nvp, vfs_context_current()); if ((error == 0) && ( nvp != NULLVP)) vnode_put(nvp); - if (error == 0) { - error = copyout(buf, buffer, len); - } } } - kfree(buf, buffersize); return(error); } @@ -1243,16 +1325,13 @@ proc_pidoriginatorinfo(int pid, int flavor, user_addr_t buffer, uint32_t buffer case PROC_PIDORIGINATOR_PID_UUID: { struct proc_originatorinfo originator_info; + bzero(&originator_info, sizeof(originator_info)); error = proc_pidoriginatorpid_uuid(originator_info.originator_uuid, sizeof(uuid_t), &originator_info.originator_pid); if (error != 0) goto out; - originator_info.p_reserve2 = 0; - originator_info.p_reserve3 = 0; - originator_info.p_reserve4 = 0; - error = copyout(&originator_info, buffer, size); if (error == 0) *retval = size; @@ -1637,6 +1716,14 @@ proc_pidinfo(int pid, int flavor, uint64_t arg, user_addr_t buffer, uint32_t bu size = PROC_PIDNOTEEXIT_SIZE; findzomb = 1; break; + case PROC_PIDEXITREASONINFO: + size = PROC_PIDEXITREASONINFO_SIZE; + findzomb = 1; + break; + case PROC_PIDEXITREASONBASICINFO: + size = PROC_PIDEXITREASONBASICINFOSIZE; + findzomb = 1; + break; case PROC_PIDREGIONPATHINFO2: size = PROC_PIDREGIONPATHINFO2_SIZE; break; @@ -1698,7 +1785,7 @@ proc_pidinfo(int pid, int flavor, uint64_t arg, user_addr_t buffer, uint32_t bu case 
PROC_PIDUNIQIDENTIFIERINFO: { struct proc_uniqidentifierinfo p_uniqidinfo; - + bzero(&p_uniqidinfo, sizeof(p_uniqidinfo)); proc_piduniqidentifierinfo(p, &p_uniqidinfo); error = copyout(&p_uniqidinfo, buffer, sizeof(struct proc_uniqidentifierinfo)); if (error == 0) @@ -1708,25 +1795,26 @@ proc_pidinfo(int pid, int flavor, uint64_t arg, user_addr_t buffer, uint32_t bu case PROC_PIDT_SHORTBSDINFO: shortversion = 1; - case PROC_PIDT_BSDINFOWITHUNIQID: + case PROC_PIDT_BSDINFOWITHUNIQID: case PROC_PIDTBSDINFO: { struct proc_bsdinfo pbsd; struct proc_bsdshortinfo pbsd_short; struct proc_bsdinfowithuniqid pbsd_uniqid; - + if (flavor == PROC_PIDT_BSDINFOWITHUNIQID) uniqidversion = 1; - + if (shortversion != 0) { error = proc_pidshortbsdinfo(p, &pbsd_short, zombie); } else { error = proc_pidbsdinfo(p, &pbsd, zombie); - if (uniqidversion != 0) { + if (uniqidversion != 0) { + bzero(&pbsd_uniqid, sizeof(pbsd_uniqid)); proc_piduniqidentifierinfo(p, &pbsd_uniqid.p_uniqidentifier); pbsd_uniqid.pbsd = pbsd; } } - + if (error == 0) { if (shortversion != 0) { error = copyout(&pbsd_short, buffer, sizeof(struct proc_bsdshortinfo)); @@ -1758,15 +1846,15 @@ proc_pidinfo(int pid, int flavor, uint64_t arg, user_addr_t buffer, uint32_t bu break; case PROC_PIDTASKALLINFO: { - struct proc_taskallinfo pall; - + struct proc_taskallinfo pall; + bzero(&pall, sizeof(pall)); error = proc_pidbsdinfo(p, &pall.pbsd, 0); error = proc_pidtaskinfo(p, &pall.ptinfo); if (error == 0) { error = copyout(&pall, buffer, sizeof(struct proc_taskallinfo)); if (error == 0) *retval = sizeof(struct proc_taskallinfo); - } + } } break; @@ -1817,13 +1905,13 @@ proc_pidinfo(int pid, int flavor, uint64_t arg, user_addr_t buffer, uint32_t bu case PROC_PIDTHREADPATHINFO:{ - struct proc_threadwithpathinfo pinfo; + struct proc_threadwithpathinfo pinfo; error = proc_pidthreadpathinfo(p, arg, &pinfo); if (error == 0) { error = copyout((caddr_t)&pinfo, buffer, sizeof(struct proc_threadwithpathinfo)); if (error == 0) - *retval = 
sizeof(struct proc_threadwithpathinfo); + *retval = sizeof(struct proc_threadwithpathinfo); } } break; @@ -1835,25 +1923,25 @@ proc_pidinfo(int pid, int flavor, uint64_t arg, user_addr_t buffer, uint32_t bu case PROC_PIDWORKQUEUEINFO:{ - struct proc_workqueueinfo pwqinfo; + struct proc_workqueueinfo pwqinfo; error = proc_pidworkqueueinfo(p, &pwqinfo); if (error == 0) { error = copyout(&pwqinfo, buffer, sizeof(struct proc_workqueueinfo)); if (error == 0) *retval = sizeof(struct proc_workqueueinfo); - } + } } break; case PROC_PIDLISTFILEPORTS: { - error = proc_pidfileportlist(p, buffer, buffersize, - retval); + error = proc_pidfileportlist(p, buffer, buffersize, retval); } break; - case PROC_PIDARCHINFO: { + case PROC_PIDARCHINFO: { struct proc_archinfo pai; + bzero(&pai, sizeof(pai)); proc_archinfo(p, &pai); error = copyout(&pai, buffer, sizeof(struct proc_archinfo)); if (error == 0) { @@ -1884,6 +1972,39 @@ proc_pidinfo(int pid, int flavor, uint64_t arg, user_addr_t buffer, uint32_t bu } break; + case PROC_PIDEXITREASONINFO: { + struct proc_exitreasoninfo eri; + + error = copyin(buffer, &eri, sizeof(eri)); + if (error != 0) { + break; + } + + error = proc_pidexitreasoninfo(p, &eri, NULL); + if (error == 0) { + error = copyout(&eri, buffer, sizeof(eri)); + if (error == 0) { + *retval = sizeof(eri); + } + } + } + break; + + case PROC_PIDEXITREASONBASICINFO: { + struct proc_exitreasonbasicinfo beri; + + bzero(&beri, sizeof(struct proc_exitreasonbasicinfo)); + + error = proc_pidexitreasoninfo(p, NULL, &beri); + if (error == 0) { + error = copyout(&beri, buffer, sizeof(beri)); + if (error == 0) { + *retval = sizeof(beri); + } + } + } + break; + default: error = ENOTSUP; } @@ -2034,7 +2155,7 @@ pid_pseminfo(struct psemnode *psem, struct fileproc *fp, proc_t proc, int fd, us if ((error = fill_pseminfo(psem, &pseminfo.pseminfo)) == 0) { if ((error = copyout(&pseminfo, buffer, sizeof(struct psem_fdinfo))) == 0) - *retval = sizeof(struct psem_fdinfo); + *retval = 
sizeof(struct psem_fdinfo); } return(error); @@ -2051,7 +2172,7 @@ pid_pshminfo(struct pshmnode *pshm, struct fileproc *fp, proc_t proc, int fd, us if ((error = fill_pshminfo(pshm, &pshminfo.pshminfo)) == 0) { if ((error = copyout(&pshminfo, buffer, sizeof(struct pshm_fdinfo))) == 0) - *retval = sizeof(struct pshm_fdinfo); + *retval = sizeof(struct pshm_fdinfo); } return(error); @@ -2067,7 +2188,7 @@ pid_pipeinfo(struct pipe * p, struct fileproc *fp, proc_t proc, int fd, user_ad fill_fileinfo(fp, proc, fd, &pipeinfo.pfi); if ((error = fill_pipeinfo(p, &pipeinfo.pipeinfo)) == 0) { if ((error = copyout(&pipeinfo, buffer, sizeof(struct pipe_fdinfo))) == 0) - *retval = sizeof(struct pipe_fdinfo); + *retval = sizeof(struct pipe_fdinfo); } return(error); @@ -2089,7 +2210,7 @@ pid_kqueueinfo(struct kqueue * kq, struct fileproc *fp, proc_t proc, int fd, use if ((error = fill_kqueueinfo(kq, &kqinfo.kqueueinfo)) == 0) { if ((error = copyout(&kqinfo, buffer, sizeof(struct kqueue_fdinfo))) == 0) - *retval = sizeof(struct kqueue_fdinfo); + *retval = sizeof(struct kqueue_fdinfo); } return(error); @@ -2102,7 +2223,6 @@ pid_atalkinfo(__unused struct atalk * at, __unused struct fileproc *fp, __unuse } - /************************** proc_pidfdinfo routine ***************************/ int proc_pidfdinfo(int pid, int flavor, int fd, user_addr_t buffer, uint32_t buffersize, int32_t * retval) @@ -2279,6 +2399,121 @@ out1 : return(error); } +int +proc_listfd_kqueue(proc_t p, int32_t *fdlist, int len) +{ + int numfds; + struct fileproc * fp; + int n; + int count = 0; + + numfds = p->p_fd->fd_nfiles; + if (len < numfds) { + return -1; + } + + proc_fdlock(p); + for (n = 0; ((n < numfds) && (n < p->p_fd->fd_nfiles)); n++) { + if (((fp = p->p_fd->fd_ofiles[n]) != 0) + && ((p->p_fd->fd_ofileflags[n] & UF_RESERVED) == 0) + && (FILEGLOB_DTYPE(fp->f_fglob) == PROX_FDTYPE_KQUEUE)) { + fdlist[count++] = n; + } + } + proc_fdunlock(p); + return count; +} + +int +proc_kqueue_udata_info(proc_t p, 
int32_t fd, uint64_t *buffer, int bufsize) +{ + struct kqueue *kq; + struct fileproc * fp = NULL; + int retval; + + if (fd == -1) { + /* wqkqueue is initialized on-demand */ + if ((kq = p->p_wqkqueue) == NULL) { + return 0; + } + } else { + int error = fp_getfkq(p, fd, &fp, &kq); + if (error != 0) { + return 0; + } + } + + retval = pid_kqueue_udatainfo(p, kq, buffer, bufsize); + if (fp) { + fp_drop(p, fd, fp , 0); + } + + return retval; +} + +int +proc_list_uptrs(proc_t p, uint64_t *udata_buffer, int size) +{ + int32_t *fdlist = NULL; + int nfds; + int i; + int count = 0; + int ret; + int knote_max = 4096; + uint64_t *buffer; + int bufsize = knote_max * sizeof(uint64_t); + + fdlist = (int32_t *)kalloc((OPEN_MAX + 1) * sizeof(int32_t)); + if (!fdlist) { + return -1; + } + + nfds = proc_listfd_kqueue(p, &fdlist[1], OPEN_MAX); + if (nfds < 0 || nfds > OPEN_MAX) { + kfree(fdlist, (OPEN_MAX + 1) * sizeof(int32_t)); + return 0; + } + + /* Add FD -1, the implicit workq kqueue */ + fdlist[0] = -1; + nfds++; + + if (size == 0) { + bufsize = 0; + buffer = NULL; + } else { + bufsize = knote_max * sizeof(uint64_t); + buffer = (uint64_t *)kalloc(bufsize); + } + + for (i = 0; i < nfds; i++) { +again: + ret = proc_kqueue_udata_info(p, fdlist[i], buffer, bufsize); + if (bufsize != 0 && ret > knote_max) { + kfree(buffer, bufsize); + knote_max = ret + 32; + bufsize = knote_max * sizeof(uint64_t); + buffer = kalloc(bufsize); + goto again; + } + + if (ret == 0) + continue; + + /* Copy the udata ptrs */ + if (size >= (int)((count + ret) * sizeof(uint64_t))) { + memcpy(&udata_buffer[count], buffer, ret * sizeof(uint64_t)); + } + count = count + ret; + } + + kfree(fdlist, (OPEN_MAX + 1) * sizeof(int32_t)); + if (buffer) { + kfree(buffer, bufsize); + } + return count; +} + /* * Helper function for proc_pidfileportinfo */ @@ -2467,7 +2702,7 @@ proc_setcontrol(int pid, int flavor, uint64_t arg, user_addr_t buffer, uint32_t int error = 0; uint32_t pcontrol = (uint32_t)arg; struct uthread *ut 
= NULL; - + char name_buf[MAXTHREADNAMESIZE]; pself = current_proc(); if (pid != pself->p_pid) @@ -2491,19 +2726,24 @@ proc_setcontrol(int pid, int flavor, uint64_t arg, user_addr_t buffer, uint32_t break; case PROC_SELFSET_THREADNAME: { - /* PROC_SELFSET_THREADNAME_SIZE = (MAXTHREADNAMESIZE -1) */ - if(buffersize > PROC_SELFSET_THREADNAME_SIZE) + /* + * This is a bit ugly, as it copies the name into the kernel, and then + * invokes bsd_setthreadname again to copy it into the uthread name + * buffer. Hopefully this isn't such a hot codepath that an additional + * MAXTHREADNAMESIZE copy is a big issue. + */ + if (buffersize > (MAXTHREADNAMESIZE - 1)) { return ENAMETOOLONG; + } + ut = current_uthread(); - if(!ut->pth_name) - { - ut->pth_name = (char*)kalloc(MAXTHREADNAMESIZE ); - if(!ut->pth_name) - return ENOMEM; + bzero(name_buf, MAXTHREADNAMESIZE); + error = copyin(buffer, name_buf, buffersize); + + if (!error) { + bsd_setthreadname(ut, name_buf); } - bzero(ut->pth_name, MAXTHREADNAMESIZE); - error = copyin(buffer, ut->pth_name, buffersize); } break; @@ -2694,7 +2934,7 @@ proc_terminate(int pid, int32_t *retval) sig = SIGTERM; #endif - proc_set_task_policy(p->task, THREAD_NULL, TASK_POLICY_ATTRIBUTE, + proc_set_task_policy(p->task, TASK_POLICY_ATTRIBUTE, TASK_POLICY_TERMINATED, TASK_POLICY_ENABLE); psignal(p, sig); @@ -2783,7 +3023,71 @@ proc_pidcoalitioninfo(proc_t p, struct proc_pidcoalitioninfo *ppci) proc_coalitionids(p, ppci->coalition_id); } +int +proc_pidexitreasoninfo(proc_t p, struct proc_exitreasoninfo *peri, struct proc_exitreasonbasicinfo *pberi) +{ + uint32_t reason_data_size = 0; + int error = 0; + pid_t selfpid = proc_selfpid(); + proc_lock(p); + + /* + * One (and only one) of peri and pberi must be non-NULL. + */ + assert((peri != NULL) || (pberi != NULL)); + assert((peri == NULL) || (pberi == NULL)); + + /* + * Allow access to the parent of the exiting + * child or the parent debugger only. 
+ */ + do { + if (p->p_ppid == selfpid) + break; /* parent => ok */ + + if ((p->p_lflag & P_LTRACED) != 0 && + (p->p_oppid == selfpid)) + break; /* parent-in-waiting => ok */ + + proc_unlock(p); + return EACCES; + } while (0); + + if (p->p_exit_reason == OS_REASON_NULL) { + proc_unlock(p); + return ENOENT; + } + + if (p->p_exit_reason->osr_kcd_buf != NULL) { + reason_data_size = kcdata_memory_get_used_bytes(&p->p_exit_reason->osr_kcd_descriptor); + } + + if (peri != NULL) { + peri->eri_namespace = p->p_exit_reason->osr_namespace; + peri->eri_code = p->p_exit_reason->osr_code; + peri->eri_flags = p->p_exit_reason->osr_flags; + + if ((peri->eri_kcd_buf == 0) || (peri->eri_reason_buf_size < reason_data_size)) { + proc_unlock(p); + return ENOMEM; + } + + peri->eri_reason_buf_size = reason_data_size; + if (reason_data_size != 0) { + error = copyout(p->p_exit_reason->osr_kcd_buf, peri->eri_kcd_buf, reason_data_size); + } + } else { + pberi->beri_namespace = p->p_exit_reason->osr_namespace; + pberi->beri_code = p->p_exit_reason->osr_code; + pberi->beri_flags = p->p_exit_reason->osr_flags; + pberi->beri_reason_buf_size = reason_data_size; + } + + proc_unlock(p); + + return error; +} /* * Wrapper to provide NOTE_EXIT_DETAIL and NOTE_EXITSTATUS diff --git a/bsd/kern/proc_uuid_policy.c b/bsd/kern/proc_uuid_policy.c index bc930ad14..b9e96efea 100644 --- a/bsd/kern/proc_uuid_policy.c +++ b/bsd/kern/proc_uuid_policy.c @@ -367,6 +367,7 @@ int proc_uuid_policy(struct proc *p __unused, struct proc_uuid_policy_args *uap, { int error = 0; uuid_t uuid; + memcpy(uuid, UUID_NULL, sizeof(uuid_t)); /* Need privilege for policy changes */ error = priv_check_cred(kauth_cred_get(), PRIV_PROC_UUID_POLICY, 0); diff --git a/bsd/kern/process_policy.c b/bsd/kern/process_policy.c index ded6d7215..ffbd70e98 100644 --- a/bsd/kern/process_policy.c +++ b/bsd/kern/process_policy.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2005, 2010 Apple Computer, Inc. All rights reserved. 
+ * Copyright (c) 2005-2016 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -51,6 +51,8 @@ #include #include #include +#include + #include #include #include @@ -71,7 +73,7 @@ #include static int handle_lowresource(int scope, int action, int policy, int policy_subtype, user_addr_t attrp, proc_t proc, uint64_t target_threadid); -static int handle_resourceuse(int scope, int action, int policy, int policy_subtype, user_addr_t attrp, proc_t proc, uint64_t target_threadid); +static int handle_cpuuse(int action, user_addr_t attrp, proc_t proc, uint64_t target_threadid); static int handle_apptype(int scope, int action, int policy, int policy_subtype, user_addr_t attrp, proc_t proc, uint64_t target_threadid); static int handle_boost(int scope, int action, int policy, int policy_subtype, user_addr_t attrp, proc_t proc, uint64_t target_threadid); @@ -155,7 +157,21 @@ process_policy(__unused struct proc *p, struct process_policy_args * uap, __unus error = handle_lowresource(scope, action, policy, policy_subtype, attrp, target_proc, target_threadid); break; case PROC_POLICY_RESOURCE_USAGE: - error = handle_resourceuse(scope, action, policy, policy_subtype, attrp, target_proc, target_threadid); + switch(policy_subtype) { + case PROC_POLICY_RUSAGE_NONE: + case PROC_POLICY_RUSAGE_WIREDMEM: + case PROC_POLICY_RUSAGE_VIRTMEM: + case PROC_POLICY_RUSAGE_DISK: + case PROC_POLICY_RUSAGE_NETWORK: + case PROC_POLICY_RUSAGE_POWER: + return(ENOTSUP); + default: + return(EINVAL); + case PROC_POLICY_RUSAGE_CPU: + break; + } + + error = handle_cpuuse(action, attrp, target_proc, target_threadid); break; case PROC_POLICY_APPTYPE: error = handle_apptype(scope, action, policy, policy_subtype, attrp, target_proc, target_threadid); @@ -196,45 +212,35 @@ handle_lowresource(__unused int scope, int action, __unused int policy, int poli static int -handle_resourceuse(__unused int scope, __unused int action, __unused int policy, int policy_subtype, user_addr_t 
attrp, proc_t proc, __unused uint64_t target_threadid) +handle_cpuuse(int action, user_addr_t attrp, proc_t proc, __unused uint64_t target_threadid) { proc_policy_cpuusage_attr_t cpuattr; #if CONFIG_MACF proc_t curp = current_proc(); #endif - int entitled = TRUE; + int entitled = FALSE; + Boolean canEnable = FALSE; uint64_t interval = -1ULL; int error = 0; uint8_t percentage; - switch(policy_subtype) { - case PROC_POLICY_RUSAGE_NONE: - case PROC_POLICY_RUSAGE_WIREDMEM: - case PROC_POLICY_RUSAGE_VIRTMEM: - case PROC_POLICY_RUSAGE_DISK: - case PROC_POLICY_RUSAGE_NETWORK: - case PROC_POLICY_RUSAGE_POWER: - return(ENOTSUP); - break; - default: - return(EINVAL); - case PROC_POLICY_RUSAGE_CPU: - break; - } - #if CONFIG_MACF - if (curp != proc) { - /* the cpumon entitlement manages messing with CPU limits on self */ - error = mac_proc_check_sched(curp, proc); - if (error) - return error; - } - /* - * Allow a process to change CPU usage monitor parameters, unless a MAC policy - * overrides it with an entitlement check. + * iOS only allows processes to override their own CPU usage monitor + * parameters if they have com.apple.private.kernel.override-cpumon. + * + * Until rdar://24799462 improves our scheme, we are also using the + * same entitlement to indicate which processes can resume monitoring + * when they otherwise wouldn't be able to. */ entitled = (mac_proc_check_cpumon(curp) == 0) ? TRUE : FALSE; + canEnable = (entitled && action == PROC_POLICY_ACTION_ENABLE); + + if (!canEnable && curp != proc) { + /* can the current process change scheduling parameters? 
*/ + error = mac_proc_check_sched(curp, proc); + if (error) return error; + } #endif switch (action) { @@ -275,10 +281,18 @@ handle_resourceuse(__unused int scope, __unused int action, __unused int policy, entitled); break; + /* restore process to prior state */ case PROC_POLICY_ACTION_RESTORE: error = proc_clear_task_ruse_cpu(proc->task, entitled); break; + /* re-enable suspended monitor */ + case PROC_POLICY_ACTION_ENABLE: + error = task_resume_cpumon(proc->task); + break; + + case PROC_POLICY_ACTION_REMOVE: + default: error = EINVAL; break; @@ -356,19 +370,18 @@ handle_apptype( int scope, switch (action) { case PROC_POLICY_ACTION_ENABLE: /* PROCESS ENABLE APPTYPE TAL */ - proc_set_task_policy(target_proc->task, THREAD_NULL, - TASK_POLICY_ATTRIBUTE, TASK_POLICY_TAL, - TASK_POLICY_ENABLE); + proc_set_task_policy(target_proc->task, + TASK_POLICY_ATTRIBUTE, TASK_POLICY_TAL, + TASK_POLICY_ENABLE); break; case PROC_POLICY_ACTION_DISABLE: /* PROCESS DISABLE APPTYPE TAL */ - proc_set_task_policy(target_proc->task, THREAD_NULL, - TASK_POLICY_ATTRIBUTE, TASK_POLICY_TAL, - TASK_POLICY_DISABLE); + proc_set_task_policy(target_proc->task, + TASK_POLICY_ATTRIBUTE, TASK_POLICY_TAL, + TASK_POLICY_DISABLE); break; default: return (EINVAL); - break; } return(0); diff --git a/bsd/kern/pthread_shims.c b/bsd/kern/pthread_shims.c index 2d5b931e3..4d55f7ef7 100644 --- a/bsd/kern/pthread_shims.c +++ b/bsd/kern/pthread_shims.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012 Apple Inc. All rights reserved. + * Copyright (c) 2012-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -35,6 +35,8 @@ #include #include #include +#include + #include #include #include @@ -45,6 +47,7 @@ #include #include #include +#include /* version number of the in-kernel shims given to pthread.kext */ #define PTHREAD_SHIMS_VERSION 1 @@ -53,8 +56,8 @@ #define PTHREAD_CALLBACK_MEMBER ml_get_max_cpus /* compile time asserts to check the length of structures in pthread_shims.h */ -char pthread_functions_size_compile_assert[(sizeof(struct pthread_functions_s) - offsetof(struct pthread_functions_s, psynch_rw_yieldwrlock) - sizeof(void*)) == (sizeof(void*) * 100) ? 1 : -1]; -char pthread_callbacks_size_compile_assert[(sizeof(struct pthread_callbacks_s) - offsetof(struct pthread_callbacks_s, PTHREAD_CALLBACK_MEMBER) - sizeof(void*)) == (sizeof(void*) * 100) ? 1 : -1]; +static_assert((sizeof(struct pthread_functions_s) - offsetof(struct pthread_functions_s, psynch_rw_yieldwrlock) - sizeof(void*)) == (sizeof(void*) * 100)); +static_assert((sizeof(struct pthread_callbacks_s) - offsetof(struct pthread_callbacks_s, PTHREAD_CALLBACK_MEMBER) - sizeof(void*)) == (sizeof(void*) * 100)); /* old pthread code had definitions for these as they don't exist in headers */ extern kern_return_t mach_port_deallocate(ipc_space_t, mach_port_name_t); @@ -69,23 +72,64 @@ extern kern_return_t semaphore_signal_internal_trap(mach_port_name_t); set(structtype x, rettype y) { \ (x)->member = y; \ } - + PTHREAD_STRUCT_ACCESSOR(proc_get_threadstart, proc_set_threadstart, user_addr_t, struct proc*, p_threadstart); PTHREAD_STRUCT_ACCESSOR(proc_get_pthsize, proc_set_pthsize, int, struct proc*, p_pthsize); PTHREAD_STRUCT_ACCESSOR(proc_get_wqthread, proc_set_wqthread, user_addr_t, struct proc*, p_wqthread); -PTHREAD_STRUCT_ACCESSOR(proc_get_targconc, proc_set_targconc, user_addr_t, struct proc*, p_targconc); PTHREAD_STRUCT_ACCESSOR(proc_get_stack_addr_hint, proc_set_stack_addr_hint, user_addr_t, struct proc *, p_stack_addr_hint); 
PTHREAD_STRUCT_ACCESSOR(proc_get_dispatchqueue_offset, proc_set_dispatchqueue_offset, uint64_t, struct proc*, p_dispatchqueue_offset); PTHREAD_STRUCT_ACCESSOR(proc_get_dispatchqueue_serialno_offset, proc_set_dispatchqueue_serialno_offset, uint64_t, struct proc*, p_dispatchqueue_serialno_offset); PTHREAD_STRUCT_ACCESSOR(proc_get_pthread_tsd_offset, proc_set_pthread_tsd_offset, uint32_t, struct proc *, p_pth_tsd_offset); -PTHREAD_STRUCT_ACCESSOR(proc_get_wqptr, proc_set_wqptr, void*, struct proc*, p_wqptr); -PTHREAD_STRUCT_ACCESSOR(proc_get_wqsize, proc_set_wqsize, int, struct proc*, p_wqsize); PTHREAD_STRUCT_ACCESSOR(proc_get_pthhash, proc_set_pthhash, void*, struct proc*, p_pthhash); PTHREAD_STRUCT_ACCESSOR(uthread_get_threadlist, uthread_set_threadlist, void*, struct uthread*, uu_threadlist); PTHREAD_STRUCT_ACCESSOR(uthread_get_sigmask, uthread_set_sigmask, sigset_t, struct uthread*, uu_sigmask); PTHREAD_STRUCT_ACCESSOR(uthread_get_returnval, uthread_set_returnval, int, struct uthread*, uu_rval[0]); +#define WQPTR_IS_INITING_VALUE ((void *)~(uintptr_t)0) + +static void * +proc_get_wqptr(struct proc *p) { + void *wqptr = p->p_wqptr; + return (wqptr == WQPTR_IS_INITING_VALUE) ? 
NULL : wqptr; +} +static void +proc_set_wqptr(struct proc *p, void *y) { + proc_lock(p); + + assert(y == NULL || p->p_wqptr == WQPTR_IS_INITING_VALUE); + + p->p_wqptr = y; + + if (y != NULL){ + wakeup(&p->p_wqptr); + } + + proc_unlock(p); +} +static boolean_t +proc_init_wqptr_or_wait(struct proc *p) { + proc_lock(p); + + if (p->p_wqptr == NULL){ + p->p_wqptr = WQPTR_IS_INITING_VALUE; + proc_unlock(p); + + return TRUE; + } else if (p->p_wqptr == WQPTR_IS_INITING_VALUE){ + assert_wait(&p->p_wqptr, THREAD_UNINT); + proc_unlock(p); + thread_block(THREAD_CONTINUE_NULL); + + return FALSE; + } else { + proc_unlock(p); + + return FALSE; + } +} + +__attribute__((noreturn)) static void pthread_returning_to_userspace(void) { @@ -102,16 +146,6 @@ proc_get_task(struct proc *p) { return p->task; } -static lck_spin_t* -proc_get_wqlockptr(struct proc *p) { - return &(p->p_wqlock); -} - -static boolean_t* -proc_get_wqinitingptr(struct proc *p) { - return &(p->p_wqiniting); -} - static uint64_t proc_get_register(struct proc *p) { return (p->p_lflag & P_LREGISTER); @@ -149,15 +183,15 @@ qos_main_thread_active(void) static int proc_usynch_get_requested_thread_qos(struct uthread *uth) { - task_t task = current_task(); thread_t thread = uth ? uth->uu_thread : current_thread(); int requested_qos; - requested_qos = proc_get_task_policy(task, thread, TASK_POLICY_ATTRIBUTE, TASK_POLICY_QOS); + requested_qos = proc_get_thread_policy(thread, TASK_POLICY_ATTRIBUTE, TASK_POLICY_QOS); /* - * For the purposes of userspace synchronization, it doesn't make sense to place an override of UNSPECIFIED - * on another thread, if the current thread doesn't have any QoS set. In these cases, upgrade to + * For the purposes of userspace synchronization, it doesn't make sense to + * place an override of UNSPECIFIED on another thread, if the current thread + * doesn't have any QoS set. In these cases, upgrade to * THREAD_QOS_USER_INTERACTIVE. 
*/ if (requested_qos == THREAD_QOS_UNSPECIFIED) { @@ -167,43 +201,53 @@ static int proc_usynch_get_requested_thread_qos(struct uthread *uth) return requested_qos; } -static boolean_t proc_usynch_thread_qos_add_override(struct uthread *uth, uint64_t tid, int override_qos, boolean_t first_override_for_resource) +static int +proc_usynch_thread_qos_add_override_for_resource_check_owner(thread_t thread, + int override_qos, boolean_t first_override_for_resource, + user_addr_t resource, int resource_type, + user_addr_t user_lock_addr, mach_port_name_t user_lock_owner) { - task_t task = current_task(); - thread_t thread = uth ? uth->uu_thread : THREAD_NULL; - - return proc_thread_qos_add_override(task, thread, tid, override_qos, first_override_for_resource, USER_ADDR_NULL, THREAD_QOS_OVERRIDE_TYPE_UNKNOWN); + return proc_thread_qos_add_override_check_owner(thread, override_qos, + first_override_for_resource, resource, resource_type, + user_lock_addr, user_lock_owner); } -static boolean_t proc_usynch_thread_qos_remove_override(struct uthread *uth, uint64_t tid) +static boolean_t +proc_usynch_thread_qos_add_override_for_resource(task_t task, struct uthread *uth, + uint64_t tid, int override_qos, boolean_t first_override_for_resource, + user_addr_t resource, int resource_type) { - task_t task = current_task(); thread_t thread = uth ? uth->uu_thread : THREAD_NULL; - return proc_thread_qos_remove_override(task, thread, tid, USER_ADDR_NULL, THREAD_QOS_OVERRIDE_TYPE_UNKNOWN); -} - -static boolean_t proc_usynch_thread_qos_add_override_for_resource(task_t task, struct uthread *uth, uint64_t tid, int override_qos, boolean_t first_override_for_resource, user_addr_t resource, int resource_type) -{ - thread_t thread = uth ? 
uth->uu_thread : THREAD_NULL; - - return proc_thread_qos_add_override(task, thread, tid, override_qos, first_override_for_resource, resource, resource_type); + return proc_thread_qos_add_override(task, thread, tid, override_qos, + first_override_for_resource, resource, resource_type); } -static boolean_t proc_usynch_thread_qos_remove_override_for_resource(task_t task, struct uthread *uth, uint64_t tid, user_addr_t resource, int resource_type) +static boolean_t +proc_usynch_thread_qos_remove_override_for_resource(task_t task, + struct uthread *uth, uint64_t tid, user_addr_t resource, int resource_type) { thread_t thread = uth ? uth->uu_thread : THREAD_NULL; return proc_thread_qos_remove_override(task, thread, tid, resource, resource_type); } -static boolean_t proc_usynch_thread_qos_reset_override_for_resource(task_t task, struct uthread *uth, uint64_t tid, user_addr_t resource, int resource_type) +static boolean_t +proc_usynch_thread_qos_reset_override_for_resource(task_t task, + struct uthread *uth, uint64_t tid, user_addr_t resource, int resource_type) { thread_t thread = uth ? uth->uu_thread : THREAD_NULL; return proc_thread_qos_reset_override(task, thread, tid, resource, resource_type); } +static boolean_t +proc_usynch_thread_qos_squash_override_for_resource(thread_t thread, + user_addr_t resource, int resource_type) +{ + return proc_thread_qos_squash_override(thread, resource, resource_type); +} + /* kernel (core) to kext shims */ void @@ -215,22 +259,54 @@ pthread_init(void) pthread_functions->pthread_init(); } -int +int fill_procworkqueue(proc_t p, struct proc_workqueueinfo * pwqinfo) { return pthread_functions->fill_procworkqueue(p, pwqinfo); } -void -workqueue_init_lock(proc_t p) +/* + * Returns true if the workqueue flags are available, and will fill + * in exceeded_total and exceeded_constrained. 
+ */ +boolean_t +workqueue_get_pwq_exceeded(void *v, boolean_t *exceeded_total, + boolean_t *exceeded_constrained) { - pthread_functions->workqueue_init_lock(p); + proc_t p = v; + struct proc_workqueueinfo pwqinfo; + int err; + + assert(p != NULL); + assert(exceeded_total != NULL); + assert(exceeded_constrained != NULL); + + err = fill_procworkqueue(p, &pwqinfo); + if (err) { + return FALSE; + } + if (!(pwqinfo.pwq_state & WQ_FLAGS_AVAILABLE)) { + return FALSE; + } + + *exceeded_total = (pwqinfo.pwq_state & WQ_EXCEEDED_TOTAL_THREAD_LIMIT); + *exceeded_constrained = (pwqinfo.pwq_state & WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT); + + return TRUE; } -void -workqueue_destroy_lock(proc_t p) +uint32_t +workqueue_get_pwq_state_kdp(void * v) { - pthread_functions->workqueue_destroy_lock(p); + static_assert((WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT << 17) == kTaskWqExceededConstrainedThreadLimit); + static_assert((WQ_EXCEEDED_TOTAL_THREAD_LIMIT << 17) == kTaskWqExceededTotalThreadLimit); + static_assert((WQ_FLAGS_AVAILABLE << 17) == kTaskWqFlagsAvailable); + static_assert((WQ_FLAGS_AVAILABLE | WQ_EXCEEDED_TOTAL_THREAD_LIMIT | WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT) == 0x7); + proc_t p = v; + if (pthread_functions == NULL || pthread_functions->get_pwq_state_kdp == NULL) + return 0; + else + return pthread_functions->get_pwq_state_kdp(p); } void @@ -412,13 +488,29 @@ psynch_rw_downgrade(__unused proc_t p, __unused struct psynch_rw_downgrade_args return 0; } +int +thread_qos_from_pthread_priority(unsigned long priority, unsigned long *flags) +{ + return pthread_functions->thread_qos_from_pthread_priority(priority, flags); +} + +unsigned long +pthread_priority_canonicalize(unsigned long priority, boolean_t propagation) +{ + if (pthread_functions->pthread_priority_canonicalize2) { + return pthread_functions->pthread_priority_canonicalize2(priority, propagation); + } else { + return pthread_functions->pthread_priority_canonicalize(priority); + } +} + /* * The callbacks structure (defined 
in pthread_shims.h) contains a collection * of kernel functions that were not deemed sensible to expose as a KPI to all * kernel extensions. So the kext is given them in the form of a structure of * function pointers. */ -static struct pthread_callbacks_s pthread_callbacks = { +static const struct pthread_callbacks_s pthread_callbacks = { .version = PTHREAD_SHIMS_VERSION, .config_thread_max = CONFIG_THREAD_MAX, .get_task_threadmax = get_task_threadmax, @@ -429,21 +521,15 @@ static struct pthread_callbacks_s pthread_callbacks = { .proc_set_pthsize = proc_set_pthsize, .proc_get_wqthread = proc_get_wqthread, .proc_set_wqthread = proc_set_wqthread, - .proc_get_targconc = proc_get_targconc, - .proc_set_targconc = proc_set_targconc, .proc_get_dispatchqueue_offset = proc_get_dispatchqueue_offset, .proc_set_dispatchqueue_offset = proc_set_dispatchqueue_offset, .proc_get_wqptr = proc_get_wqptr, .proc_set_wqptr = proc_set_wqptr, - .proc_get_wqsize = proc_get_wqsize, - .proc_set_wqsize = proc_set_wqsize, - .proc_get_wqlockptr = proc_get_wqlockptr, - .proc_get_wqinitingptr = proc_get_wqinitingptr, - .proc_get_pthhash = proc_get_pthhash, + .proc_get_pthhash = proc_get_pthhash, .proc_set_pthhash = proc_set_pthhash, .proc_get_task = proc_get_task, .proc_lock = proc_lock, - .proc_unlock = proc_unlock, + .proc_unlock = proc_unlock, .proc_get_register = proc_get_register, .proc_set_register = proc_set_register, @@ -463,15 +549,15 @@ static struct pthread_callbacks_s pthread_callbacks = { .uthread_get_returnval = uthread_get_returnval, .uthread_set_returnval = uthread_set_returnval, .uthread_is_cancelled = uthread_is_cancelled, - + .thread_exception_return = pthread_returning_to_userspace, .thread_bootstrap_return = thread_bootstrap_return, .unix_syscall_return = unix_syscall_return, .absolutetime_to_microtime = absolutetime_to_microtime, - .proc_restore_workq_bgthreadpolicy = proc_restore_workq_bgthreadpolicy, - .proc_apply_workq_bgthreadpolicy = proc_apply_workq_bgthreadpolicy, + 
.thread_set_workq_pri = thread_set_workq_pri, + .thread_set_workq_qos = thread_set_workq_qos, .get_bsdthread_info = (void*)get_bsdthread_info, .thread_sched_call = thread_sched_call, @@ -494,7 +580,7 @@ static struct pthread_callbacks_s pthread_callbacks = { .current_map = _current_map, .thread_create = thread_create, .thread_resume = thread_resume, - + .convert_thread_to_port = convert_thread_to_port, .ml_get_max_cpus = (void*)ml_get_max_cpus, @@ -510,14 +596,22 @@ static struct pthread_callbacks_s pthread_callbacks = { .thread_set_tsd_base = thread_set_tsd_base, .proc_usynch_get_requested_thread_qos = proc_usynch_get_requested_thread_qos, - .proc_usynch_thread_qos_add_override = proc_usynch_thread_qos_add_override, - .proc_usynch_thread_qos_remove_override = proc_usynch_thread_qos_remove_override, .qos_main_thread_active = qos_main_thread_active, + .proc_usynch_thread_qos_add_override_for_resource_check_owner = proc_usynch_thread_qos_add_override_for_resource_check_owner, .proc_usynch_thread_qos_add_override_for_resource = proc_usynch_thread_qos_add_override_for_resource, .proc_usynch_thread_qos_remove_override_for_resource = proc_usynch_thread_qos_remove_override_for_resource, .proc_usynch_thread_qos_reset_override_for_resource = proc_usynch_thread_qos_reset_override_for_resource, + + .proc_init_wqptr_or_wait = proc_init_wqptr_or_wait, + + .thread_set_tag = thread_set_tag, + .thread_get_tag = thread_get_tag, + + .proc_usynch_thread_qos_squash_override_for_resource = proc_usynch_thread_qos_squash_override_for_resource, + .task_get_default_manager_qos = task_get_default_manager_qos, + .thread_create_workq_waiting = thread_create_workq_waiting, }; pthread_callbacks_t pthread_kern = &pthread_callbacks; @@ -535,13 +629,13 @@ pthread_kext_register(pthread_functions_t fns, pthread_callbacks_t *callbacks) if (pthread_functions != NULL) { panic("Re-initialisation of pthread kext callbacks."); } - + if (callbacks != NULL) { *callbacks = &pthread_callbacks; } else { 
panic("pthread_kext_register called without callbacks pointer."); } - + if (fns) { pthread_functions = fns; } diff --git a/bsd/kern/stackshot.c b/bsd/kern/stackshot.c new file mode 100644 index 000000000..5910e059f --- /dev/null +++ b/bsd/kern/stackshot.c @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. + * + * @Apple_LICENSE_HEADER_START@ + * + * The contents of this file constitute Original Code as defined in and + * are subject to the Apple Public Source License Version 1.1 (the + * "License"). You may not use this file except in compliance with the + * License. Please obtain a copy of the License at + * http://www.apple.com/publicsource and read it before using this file. + * + * This Original Code and all software distributed under the License are + * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include +#include + +/* + * Stackshot system calls + */ + +#if CONFIG_TELEMETRY +extern kern_return_t stack_microstackshot(user_addr_t tracebuf, uint32_t tracebuf_size, uint32_t flags, int32_t *retval); +#endif /* CONFIG_TELEMETRY */ +extern kern_return_t kern_stack_snapshot_with_reason(char* reason); +extern kern_return_t kern_stack_snapshot_internal(int stackshot_config_version, void *stackshot_config, size_t stackshot_config_size, boolean_t stackshot_from_user); + +static int +stackshot_kern_return_to_bsd_error(kern_return_t kr) +{ + switch (kr) { + case KERN_SUCCESS: + return 0; + case KERN_RESOURCE_SHORTAGE: + /* could not allocate memory, or stackshot is actually bigger than + * SANE_TRACEBUF_SIZE */ + return ENOMEM; + case KERN_INSUFFICIENT_BUFFER_SIZE: + case KERN_NO_SPACE: + /* ran out of buffer to write the stackshot. Normally this error + * causes a larger buffer to be allocated in-kernel, rather than + * being returned to the user. */ + return ENOSPC; + case KERN_NO_ACCESS: + return EPERM; + case KERN_MEMORY_PRESENT: + return EEXIST; + case KERN_NOT_SUPPORTED: + return ENOTSUP; + case KERN_NOT_IN_SET: + /* requested existing buffer, but there isn't one. */ + return ENOENT; + case KERN_ABORTED: + /* kdp did not report an error, but also did not produce any data */ + return EINTR; + case KERN_FAILURE: + /* stackshot came across inconsistent data and needed to bail out */ + return EBUSY; + case KERN_OPERATION_TIMED_OUT: + /* debugger synchronization timed out */ + return ETIMEDOUT; + default: + return EINVAL; + } +} + +/* + * stack_snapshot_with_config: Obtains a coherent set of stack traces for specified threads on the sysem, + * tracing both kernel and user stacks where available. Allocates a buffer from the + * kernel and maps the buffer into the calling task's address space. 
+ * + * Inputs: uap->stackshot_config_version - version of the stackshot config that is being passed + * uap->stackshot_config - pointer to the stackshot config + * uap->stackshot_config_size- size of the stackshot config being passed + * Outputs: EINVAL if there is a problem with the arguments + * EFAULT if we failed to copy in the arguments succesfully + * EPERM if the caller is not privileged + * ENOTSUP if the caller is passing a version of arguments that is not supported by the kernel + * (indicates libsyscall:kernel mismatch) or if the caller is requesting unsupported flags + * ENOENT if the caller is requesting an existing buffer that doesn't exist or if the + * requested PID isn't found + * ENOMEM if the kernel is unable to allocate enough memory to serve the request + * ENOSPC if there isn't enough space in the caller's address space to remap the buffer + * ESRCH if the target PID isn't found + * returns KERN_SUCCESS on success + */ +int +stack_snapshot_with_config(struct proc *p, struct stack_snapshot_with_config_args *uap, __unused int *retval) +{ + int error = 0; + kern_return_t kr; + + if ((error = suser(kauth_cred_get(), &p->p_acflag))) + return(error); + + if((void*)uap->stackshot_config == NULL) { + return EINVAL; + } + + switch (uap->stackshot_config_version) { + case STACKSHOT_CONFIG_TYPE: + if (uap->stackshot_config_size != sizeof(stackshot_config_t)) { + return EINVAL; + } + stackshot_config_t config; + error = copyin(uap->stackshot_config, &config, sizeof(stackshot_config_t)); + if (error != KERN_SUCCESS) + { + return EFAULT; + } + kr = kern_stack_snapshot_internal(uap->stackshot_config_version, &config, sizeof(stackshot_config_t), TRUE); + return stackshot_kern_return_to_bsd_error(kr); + default: + return ENOTSUP; + } +} + +#if CONFIG_TELEMETRY +/* + * microstackshot: Catch all system call for microstackshot related operations, including + * enabling/disabling both global and windowed microstackshots as well + * as retrieving windowed or 
global stackshots and the boot profile.
+ * Inputs:	uap->tracebuf - address of the user space destination
+ *		buffer
+ *		uap->tracebuf_size - size of the user space trace buffer
+ *		uap->flags - various flags
+ * Outputs:	EPERM if the caller is not privileged
+ *		EINVAL if the supplied mss_args is NULL, mss_args.tracebuf is NULL or mss_args.tracebuf_size is not sane
+ *		ENOMEM if we don't have enough memory to satisfy the request
+ *		*retval contains the number of bytes traced, if successful
+ *		and -1 otherwise.
+ */
+int
+microstackshot(struct proc *p, struct microstackshot_args *uap, int32_t *retval)
+{
+	kern_return_t kr;
+	int error;
+
+	/* privilege check: a non-zero result from suser() is returned unchanged */
+	error = suser(kauth_cred_get(), &p->p_acflag);
+	if (error) {
+		return(error);
+	}
+
+	kr = stack_microstackshot(uap->tracebuf, uap->tracebuf_size, uap->flags, retval);
+
+	/* callers of the syscall expect a BSD errno, not a Mach status */
+	return stackshot_kern_return_to_bsd_error(kr);
+}
+#endif /* CONFIG_TELEMETRY */
+
+/*
+ * kern_stack_snapshot_with_reason:	Obtains a coherent set of stack traces for specified threads on the system,
+ *					tracing both kernel and user stacks where available. Allocates a buffer from the
+ *					kernel and stores the address of this buffer.
+ *
+ * Inputs:	reason - the reason for triggering a stackshot (unused at the moment, but in the
+ *			future will be saved in the stackshot)
+ * Outputs:	EINVAL/ENOTSUP if there is a problem with the arguments
+ *		EPERM if the caller doesn't pass at least one KERNEL stackshot flag
+ *		ENOMEM if the kernel is unable to allocate enough memory to serve the request
+ *		returns 0 on success; every result is the BSD errno that
+ *		stackshot_kern_return_to_bsd_error() derives from kern_stack_snapshot_internal()
+ *
+ * NOTE(review): earlier revisions of this comment listed ESRCH for a missing target PID, but the
+ *		translation used below never yields ESRCH - confirm the intended errno.
+ */
+int
+kern_stack_snapshot_with_reason(__unused char *reason)
+{
+	/*
+	 * Designated initializers leave every member that is not named here zeroed,
+	 * so no uninitialized stack bytes are handed to kern_stack_snapshot_internal().
+	 */
+	stackshot_config_t config = {
+		.sc_pid = -1,
+		.sc_flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS | STACKSHOT_SAVE_IN_KERNEL_BUFFER |
+				STACKSHOT_KCDATA_FORMAT | STACKSHOT_ENABLE_UUID_FAULTING),
+		.sc_delta_timestamp = 0,
+		.sc_out_buffer_addr = 0,
+		.sc_out_size_addr = 0,
+	};
+	kern_return_t kr;
+
+	/* FALSE: this request is made by the kernel, not on behalf of user space */
+	kr = kern_stack_snapshot_internal(STACKSHOT_CONFIG_TYPE, &config, sizeof(stackshot_config_t), FALSE);
+	return stackshot_kern_return_to_bsd_error(kr);
+}
diff --git a/bsd/kern/subr_log.c b/bsd/kern/subr_log.c index 0f8c00b36..c301e2797 100644 --- a/bsd/kern/subr_log.c +++ b/bsd/kern/subr_log.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2000-2010 Apple, Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ @@ -69,6 +69,12 @@ #include #include #include +#include +#include +#include +#include + +#include #include #include #include @@ -76,15 +82,33 @@ #include #include #include +#include +#include #include #include #include #include +#include #include #include +#include +#include +#include +#include +#include +#include /* XXX should be in a common header somewhere */ extern void logwakeup(void); +extern void oslogwakeup(void); +extern void oslog_streamwakeup(void); +static void oslog_streamwakeup_locked(void); +vm_offset_t kernel_firehose_addr = 0; + +/* log message counters for streaming mode */ +uint32_t oslog_s_streamed_msgcount = 0; +uint32_t oslog_s_dropped_msgcount = 0; +extern uint32_t oslog_s_error_count; #define LOG_RDPRI (PZERO + 1) @@ -104,13 +128,65 @@ struct logsoftc { int log_open; /* also used in log() */ char smsg_bufc[CONFIG_MSG_BSIZE]; /* static buffer */ +char oslog_stream_bufc[FIREHOSE_BUFFER_CHUNK_SIZE]; /* static buffer */ +struct firehose_buffer_chunk_s __attribute__((aligned(8))) oslog_boot_buf = { + .fbc_pos = { + .fbc_next_entry_offs = offsetof(struct firehose_buffer_chunk_s, fbc_data), + .fbc_private_offs = FIREHOSE_BUFFER_CHUNK_SIZE, + .fbc_refcnt = 1, // indicate that there is a writer to this chunk + .fbc_stream = firehose_stream_persist, + .fbc_flag_io = 1, // for now, lets assume this is coming from the io bank + }, +}; /* static buffer */ +firehose_buffer_chunk_t firehose_boot_chunk = &oslog_boot_buf; struct 
msgbuf msgbuf = {MSG_MAGIC,sizeof(smsg_bufc),0,0,smsg_bufc}; +struct msgbuf oslog_stream_buf = {MSG_MAGIC,0,0,0,NULL}; struct msgbuf *msgbufp __attribute__((used)) = &msgbuf; +struct msgbuf *oslog_streambufp __attribute__((used)) = &oslog_stream_buf; + +// List entries for keeping track of the streaming buffer +static oslog_stream_buf_entry_t oslog_stream_buf_entries; + +#define OSLOG_NUM_STREAM_ENTRIES 64 +#define OSLOG_STREAM_BUF_SIZE 4096 + +int oslog_open = 0; +int os_log_wakeup = 0; +int oslog_stream_open = 0; +int oslog_stream_buf_size = OSLOG_STREAM_BUF_SIZE; +int oslog_stream_num_entries = OSLOG_NUM_STREAM_ENTRIES; + +/* oslogsoftc only valid while oslog_open=1 */ +struct oslogsoftc { + int sc_state; /* see above for possibilities */ + struct selinfo sc_selp; /* thread waiting for select */ + int sc_pgid; /* process/group for async I/O */ +} oslogsoftc; + +struct oslog_streamsoftc { + int sc_state; /* see above for possibilities */ + struct selinfo sc_selp; /* thread waiting for select */ + int sc_pgid; /* process/group for async I/O */ +}oslog_streamsoftc; + +STAILQ_HEAD(, oslog_stream_buf_entry_s) oslog_stream_free_head = + STAILQ_HEAD_INITIALIZER(oslog_stream_free_head); +STAILQ_HEAD(, oslog_stream_buf_entry_s) oslog_stream_buf_head = + STAILQ_HEAD_INITIALIZER(oslog_stream_buf_head); -/* the following are implemented in osfmk/kern/printf.c */ +/* defined in osfmk/kern/printf.c */ +extern void oslog_lock_init(void); extern void bsd_log_lock(void); extern void bsd_log_unlock(void); -extern void bsd_log_init(void); + +/* defined for osfmk/kern/printf.c */ +void bsd_log_init(void); + +/* + * Ideally this file would define this lock, but bsd doesn't have the definition + * for lock groups. 
+ */ +decl_lck_spin_data(extern, oslog_stream_lock) /* XXX wants a linker set so these can be static */ extern d_open_t logopen; @@ -119,6 +195,27 @@ extern d_read_t logread; extern d_ioctl_t logioctl; extern d_select_t logselect; +/* XXX wants a linker set so these can be static */ +extern d_open_t oslogopen; +extern d_close_t oslogclose; +extern d_select_t oslogselect; +extern d_ioctl_t oslogioctl; + +/* XXX wants a linker set so these can be static */ +extern d_open_t oslog_streamopen; +extern d_close_t oslog_streamclose; +extern d_read_t oslog_streamread; +extern d_ioctl_t oslog_streamioctl; +extern d_select_t oslog_streamselect; + +void oslog_init(void); +void oslog_setsize(int size); +void oslog_streamwrite_locked(firehose_tracepoint_id_u ftid, + uint64_t stamp, const void *pubdata, size_t publen); +void oslog_streamwrite_metadata_locked(oslog_stream_buf_entry_t m_entry); +static oslog_stream_buf_entry_t oslog_stream_find_free_buf_entry_locked(void); +static void oslog_streamwrite_append_bytes(const char *buffer, int buflen); + /* * Serialize log access. 
Note that the log can be written at interrupt level, * so any log manipulations that can be done from, or affect, another processor @@ -161,6 +258,7 @@ int logclose(__unused dev_t dev, __unused int flag, __unused int devtype, __unused struct proc *p) { LOG_LOCK(); + logsoftc.sc_state &= ~(LOG_NBIO | LOG_ASYNC); selwakeup(&logsoftc.sc_selp); selthreadclear(&logsoftc.sc_selp); log_open = 0; @@ -168,6 +266,141 @@ logclose(__unused dev_t dev, __unused int flag, __unused int devtype, __unused s return (0); }
+
+/*
+ * oslogopen: open handler for the oslog device. Only one opener is allowed at a
+ * time: returns EBUSY while oslog_open is set, otherwise records the opener's
+ * pid in oslogsoftc and marks the device open. All state is touched under LOG_LOCK.
+ */
+int
+oslogopen(__unused dev_t dev, __unused int flags, __unused int mode, struct proc *p)
+{
+	LOG_LOCK();
+	if (oslog_open) {
+		LOG_UNLOCK();
+		return(EBUSY);
+	}
+	oslogsoftc.sc_pgid = p->p_pid;	/* signal process only */
+	oslog_open = 1;
+
+	LOG_UNLOCK();
+	return (0);
+}
+
+/*
+ * oslogclose: close handler for the oslog device. Clears the non-blocking/async
+ * state bits, wakes and detaches any selecting thread, and marks the device
+ * closed. Always returns 0.
+ */
+int
+oslogclose(__unused dev_t dev, __unused int flag, __unused int devtype, __unused struct proc *p)
+{
+	LOG_LOCK();
+	oslogsoftc.sc_state &= ~(LOG_NBIO | LOG_ASYNC);
+	selwakeup(&oslogsoftc.sc_selp);
+	selthreadclear(&oslogsoftc.sc_selp);
+	oslog_open = 0;
+	LOG_UNLOCK();
+	return (0);
+}
+
+/*
+ * oslog_streamopen: open handler for the oslog stream device.
+ *
+ * Allocates the stream ring buffer (oslog_stream_buf_size bytes) and an array of
+ * oslog_stream_num_entries list entries, installs them in oslog_streambufp /
+ * oslog_stream_buf_entries, puts every entry on the free list and marks the
+ * stream open.
+ *
+ * Returns 0 on success, EBUSY if the stream is already open, ENOMEM if either
+ * allocation fails.
+ */
+int
+oslog_streamopen(__unused dev_t dev, __unused int flags, __unused int mode, struct proc *p)
+{
+	char *oslog_stream_msg_bufc = NULL;
+	oslog_stream_buf_entry_t entries = NULL;
+
+	/* cheap early-out before allocating anything */
+	lck_spin_lock(&oslog_stream_lock);
+	if (oslog_stream_open) {
+		lck_spin_unlock(&oslog_stream_lock);
+		return EBUSY;
+	}
+	/* the spin lock is dropped here, so both kalloc() calls below run unlocked */
+	lck_spin_unlock(&oslog_stream_lock);
+
+	// Allocate the stream buffer
+	oslog_stream_msg_bufc = kalloc(oslog_stream_buf_size);
+	if (!oslog_stream_msg_bufc) {
+		return ENOMEM;
+	}
+
+	/* entries to support kernel logging in stream mode */
+	entries = kalloc(oslog_stream_num_entries * sizeof(struct oslog_stream_buf_entry_s));
+	if (!entries) {
+		kfree(oslog_stream_msg_bufc, oslog_stream_buf_size);
+		return ENOMEM;
+	}
+
+	/*
+	 * Re-check under the lock: another opener may have won the race while the
+	 * lock was dropped for the allocations. If so, undo our allocations.
+	 */
+	lck_spin_lock(&oslog_stream_lock);
+	if (oslog_stream_open) {
+		lck_spin_unlock(&oslog_stream_lock);
+		kfree(oslog_stream_msg_bufc, oslog_stream_buf_size);
+		kfree(entries, oslog_stream_num_entries * sizeof(struct oslog_stream_buf_entry_s));
+		return EBUSY;
+	}
+
+	assert(oslog_streambufp->msg_bufc == NULL);
+	oslog_streambufp->msg_bufc = oslog_stream_msg_bufc;
+	oslog_streambufp->msg_size = oslog_stream_buf_size;
+
+	oslog_stream_buf_entries = entries;
+
+	STAILQ_INIT(&oslog_stream_free_head);
+	STAILQ_INIT(&oslog_stream_buf_head);
+
+	/* every entry starts out blank and on the free list */
+	for (int i = 0; i < oslog_stream_num_entries; i++) {
+		oslog_stream_buf_entries[i].type = oslog_stream_link_type_log;
+		oslog_stream_buf_entries[i].offset = 0;
+		oslog_stream_buf_entries[i].size = 0;
+		oslog_stream_buf_entries[i].timestamp = 0;
+		STAILQ_INSERT_TAIL(&oslog_stream_free_head, &oslog_stream_buf_entries[i], buf_entries);
+	}
+
+	/* there should be no pending entries in the stream */
+	assert(STAILQ_EMPTY(&oslog_stream_buf_head));
+	assert(oslog_streambufp->msg_bufx == 0);
+	assert(oslog_streambufp->msg_bufr == 0);
+
+	/* reset the ring buffer write (bufx) and read (bufr) indices */
+	oslog_streambufp->msg_bufx = 0;
+	oslog_streambufp->msg_bufr = 0;
+	oslog_streamsoftc.sc_pgid = p->p_pid; /* signal process only */
+	oslog_stream_open = 1;
+	lck_spin_unlock(&oslog_stream_lock);
+
+	return 0;
+}
+
+int +oslog_streamclose(__unused dev_t dev, __unused int flag, __unused int devtype, __unused struct proc *p) +{ + oslog_stream_buf_entry_t next_entry = NULL; + char *oslog_stream_msg_bufc = NULL; + oslog_stream_buf_entry_t entries = NULL; + + lck_spin_lock(&oslog_stream_lock); + + if (oslog_stream_open == 0) { + lck_spin_unlock(&oslog_stream_lock); + return EBADF; + } + + // Consume all log lines + while (!STAILQ_EMPTY(&oslog_stream_buf_head)) { + next_entry = STAILQ_FIRST(&oslog_stream_buf_head); + STAILQ_REMOVE_HEAD(&oslog_stream_buf_head, buf_entries); + } + oslog_streamwakeup_locked(); + oslog_streamsoftc.sc_state &= ~(LOG_NBIO | LOG_ASYNC); + selwakeup(&oslog_streamsoftc.sc_selp); + selthreadclear(&oslog_streamsoftc.sc_selp); + oslog_stream_open = 0; + oslog_streambufp->msg_bufr = 0; + oslog_streambufp->msg_bufx = 0; + oslog_stream_msg_bufc = 
oslog_streambufp->msg_bufc; + oslog_streambufp->msg_bufc = NULL; + entries = oslog_stream_buf_entries; + oslog_stream_buf_entries = NULL; + oslog_streambufp->msg_size = 0; + + lck_spin_unlock(&oslog_stream_lock); + + // Free the stream buffer + kfree(oslog_stream_msg_bufc, oslog_stream_buf_size); + // Free the list entries + kfree(entries, oslog_stream_num_entries * sizeof(struct oslog_stream_buf_entry_s)); + + return 0; +} + /*ARGSUSED*/ int logread(__unused dev_t dev, struct uio *uio, int flag) @@ -227,6 +460,129 @@ logread(__unused dev_t dev, struct uio *uio, int flag) return (error); } +/*ARGSUSED*/ +int +oslog_streamread(__unused dev_t dev, struct uio *uio, int flag) +{ + int error = 0; + int copy_size = 0; + static char logline[FIREHOSE_BUFFER_CHUNK_SIZE]; + + lck_spin_lock(&oslog_stream_lock); + + if (!oslog_stream_open) { + lck_spin_unlock(&oslog_stream_lock); + return EBADF; + } + + while (STAILQ_EMPTY(&oslog_stream_buf_head)) { + if (flag & IO_NDELAY || oslog_streamsoftc.sc_state & LOG_NBIO) { + lck_spin_unlock(&oslog_stream_lock); + return EWOULDBLOCK; + } + + oslog_streamsoftc.sc_state |= LOG_RDWAIT; + wait_result_t wr = assert_wait((event_t)oslog_streambufp, + THREAD_INTERRUPTIBLE); + if (wr == THREAD_WAITING) { + lck_spin_unlock(&oslog_stream_lock); + wr = thread_block(THREAD_CONTINUE_NULL); + lck_spin_lock(&oslog_stream_lock); + } + + switch (wr) { + case THREAD_AWAKENED: + case THREAD_TIMED_OUT: + break; + default: + lck_spin_unlock(&oslog_stream_lock); + return EINTR; + } + } + + if (!oslog_stream_open) { + lck_spin_unlock(&oslog_stream_lock); + return EBADF; + } + + int logpos = 0; + oslog_stream_buf_entry_t read_entry = NULL; + uint16_t rec_length; + + read_entry = STAILQ_FIRST(&oslog_stream_buf_head); + assert(read_entry != NULL); + STAILQ_REMOVE_HEAD(&oslog_stream_buf_head, buf_entries); + + // Copy the timestamp first + memcpy(logline + logpos, &read_entry->timestamp, sizeof(uint64_t)); + logpos += sizeof(uint64_t); + + switch 
(read_entry->type) { + /* Handle metadata messages */ + case oslog_stream_link_type_metadata: + { + memcpy(logline + logpos, + (read_entry->metadata), read_entry->size); + logpos += read_entry->size; + + lck_spin_unlock(&oslog_stream_lock); + + // Free the list entry + kfree(read_entry, (sizeof(struct oslog_stream_buf_entry_s) + read_entry->size)); + break; + } + /* Handle log messages */ + case oslog_stream_link_type_log: + { + /* ensure that the correct read entry was dequeued */ + assert(read_entry->offset == oslog_streambufp->msg_bufr); + rec_length = read_entry->size; + + // If the next log line is contiguous in the buffer, copy it out. + if(read_entry->offset + rec_length <= oslog_streambufp->msg_size) { + memcpy(logline + logpos, + oslog_streambufp->msg_bufc + read_entry->offset, rec_length); + + oslog_streambufp->msg_bufr += rec_length; + if (oslog_streambufp->msg_bufr == oslog_streambufp->msg_size) { + oslog_streambufp->msg_bufr = 0; + } + logpos += rec_length; + } else { + // Otherwise, copy until the end of the buffer, and + // copy the remaining bytes starting at index 0. 
+ int bytes_left = oslog_streambufp->msg_size - read_entry->offset; + memcpy(logline + logpos, + oslog_streambufp->msg_bufc + read_entry->offset, bytes_left); + logpos += bytes_left; + rec_length -= bytes_left; + + memcpy(logline + logpos, (const void *)oslog_streambufp->msg_bufc, + rec_length); + oslog_streambufp->msg_bufr = rec_length; + logpos += rec_length; + } + assert(oslog_streambufp->msg_bufr < oslog_streambufp->msg_size); + STAILQ_INSERT_TAIL(&oslog_stream_free_head, read_entry, buf_entries); + + lck_spin_unlock(&oslog_stream_lock); + break; + } + default: + { + panic("Got unexpected log entry type: %hhu\n", read_entry->type); + } + } + + copy_size = min(logpos, uio_resid(uio)); + if (copy_size != 0) { + error = uiomove((caddr_t)logline, copy_size, uio); + } + (void)hw_atomic_add(&oslog_s_streamed_msgcount, 1); + + return error; +} + /*ARGSUSED*/ int logselect(__unused dev_t dev, int rw, void * wql, struct proc *p) @@ -246,11 +602,55 @@ logselect(__unused dev_t dev, int rw, void * wql, struct proc *p) return (0); } +int +oslogselect(__unused dev_t dev, int rw, void * wql, struct proc *p) +{ + switch (rw) { + + case FREAD: + LOG_LOCK(); + if (os_log_wakeup) { + LOG_UNLOCK(); + return (1); + } + selrecord(p, &oslogsoftc.sc_selp, wql); + LOG_UNLOCK(); + break; + } + return (0); +} + +int +oslog_streamselect(__unused dev_t dev, int rw, void * wql, struct proc *p) +{ + int ret = 0; + + lck_spin_lock(&oslog_stream_lock); + + switch (rw) { + case FREAD: + if (STAILQ_EMPTY(&oslog_stream_buf_head)) { + selrecord(p, &oslog_streamsoftc.sc_selp, wql); + } else { + ret = 1; + } + break; + } + + lck_spin_unlock(&oslog_stream_lock); + return ret; +} + void logwakeup(void) { int pgid; + /* cf. 
r24974766 & r25201228*/ + if (oslog_is_safe() == FALSE) { + return; + } + LOG_LOCK(); if (!log_open) { LOG_UNLOCK(); @@ -273,6 +673,45 @@ logwakeup(void) LOG_UNLOCK(); } +void +oslogwakeup(void) +{ + LOG_LOCK(); + if (!oslog_open) { + LOG_UNLOCK(); + return; + } + selwakeup(&oslogsoftc.sc_selp); + os_log_wakeup = 1; + LOG_UNLOCK(); +} + +static void +oslog_streamwakeup_locked(void) +{ + lck_spin_assert(&oslog_stream_lock, LCK_ASSERT_OWNED); + if (!oslog_stream_open) { + return; + } + selwakeup(&oslog_streamsoftc.sc_selp); + if (oslog_streamsoftc.sc_state & LOG_RDWAIT) { + wakeup((caddr_t)oslog_streambufp); + oslog_streamsoftc.sc_state &= ~LOG_RDWAIT; + } +} + +void +oslog_streamwakeup(void) +{ + /* cf. r24974766 & r25201228*/ + if (oslog_is_safe() == FALSE) { + return; + } + + lck_spin_lock(&oslog_stream_lock); + oslog_streamwakeup_locked(); + lck_spin_unlock(&oslog_stream_lock); +} /*ARGSUSED*/ int @@ -321,12 +760,119 @@ logioctl(__unused dev_t dev, u_long com, caddr_t data, __unused int flag, __unus return (0); }
+/*
+ * oslogioctl: ioctl handler for the oslog device.
+ *
+ * LOGBUFFERMAP	creates a memory entry for the kernel firehose buffer, maps it
+ *		read-only into the calling task and writes the resulting address
+ *		and size to `data' as a firehose_buffer_map_info_t.
+ * LOGFLUSHED	clears os_log_wakeup under LOG_LOCK and passes the
+ *		firehose_push_reply_t found in `data' to __firehose_merge_updates().
+ *
+ * Returns 0 on success, ENOMEM if the buffer could not be mapped for
+ * LOGBUFFERMAP, and -1 for an unrecognized command.
+ */
+/*ARGSUSED*/
+int
+oslogioctl(__unused dev_t dev, u_long com, caddr_t data, __unused int flag, __unused struct proc *p)
+{
+	int ret = 0;
+	mach_vm_size_t buffer_size = (FIREHOSE_BUFFER_KERNEL_CHUNK_COUNT * FIREHOSE_BUFFER_CHUNK_SIZE);
+	firehose_buffer_map_info_t map_info = {0, 0};
+	firehose_buffer_t kernel_firehose_buffer = NULL;
+	mach_vm_address_t user_addr = 0;
+	mach_port_t mem_entry_ptr = MACH_PORT_NULL;
+
+	switch (com) {
+
+	case LOGBUFFERMAP:
+		/* kernel_firehose_addr is an integer vm_offset_t; convert it explicitly */
+		kernel_firehose_buffer = (firehose_buffer_t)kernel_firehose_addr;
+
+		ret = mach_make_memory_entry_64(kernel_map,
+				&buffer_size,
+				(mach_vm_offset_t) kernel_firehose_buffer,
+				( MAP_MEM_VM_SHARE | VM_PROT_READ ),
+				&mem_entry_ptr,
+				MACH_PORT_NULL);
+		if (ret == KERN_SUCCESS) {
+			ret = mach_vm_map(get_task_map(current_task()),
+					&user_addr,
+					buffer_size,
+					0, /* mask */
+					VM_FLAGS_ANYWHERE,
+					mem_entry_ptr,
+					0, /* offset */
+					FALSE, /* copy */
+					VM_PROT_READ,
+					VM_PROT_READ,
+					VM_INHERIT_SHARE);
+		}
+
+		/*
+		 * NOTE(review): mem_entry_ptr is not released on any path here -
+		 * confirm whether the entry's send right should be dropped once
+		 * the mapping attempt is done.
+		 */
+		if (ret != KERN_SUCCESS) {
+			/* nothing was mapped: report the failure instead of returning 0 */
+			return (ENOMEM);
+		}
+
+		map_info.fbmi_addr = (uint64_t) (user_addr);
+		map_info.fbmi_size = buffer_size;
+		bcopy(&map_info, data, sizeof(firehose_buffer_map_info_t));
+		break;
+	case LOGFLUSHED:
+		LOG_LOCK();
+		os_log_wakeup = 0;
+		LOG_UNLOCK();
+		__firehose_merge_updates(*(firehose_push_reply_t *)(data));
+		break;
+	default:
+		return (-1);
+	}
+	return (0);
+}
+
+/*ARGSUSED*/ +int +oslog_streamioctl(__unused dev_t dev, u_long com, caddr_t data, __unused int flag, __unused struct proc *p) +{ + int err = 0; + + lck_spin_lock(&oslog_stream_lock); + + switch (com) { + case FIONBIO: + if (data && *(int *)data) + oslog_streamsoftc.sc_state |= LOG_NBIO; + else + oslog_streamsoftc.sc_state &= ~LOG_NBIO; + break; + case FIOASYNC: + if (data && *(int *)data) + oslog_streamsoftc.sc_state |= LOG_ASYNC; + else + oslog_streamsoftc.sc_state &= ~LOG_ASYNC; + break; + default: + err = -1; + break; + } + + lck_spin_unlock(&oslog_stream_lock); + return err; +} + void bsd_log_init(void) { /* After this point, we must be ready to accept characters */ } +void +oslog_init(void) +{ + kern_return_t kr; + vm_size_t size = FIREHOSE_BUFFER_KERNEL_CHUNK_COUNT * FIREHOSE_BUFFER_CHUNK_SIZE; + + oslog_lock_init(); + + kr = kmem_alloc_flags(kernel_map, &kernel_firehose_addr, + size + (2 * PAGE_SIZE), VM_KERN_MEMORY_LOG, + KMA_GUARD_FIRST | KMA_GUARD_LAST); + if (kr != KERN_SUCCESS) { + panic("Failed to allocate memory for firehose logging buffer"); + } + kernel_firehose_addr += PAGE_SIZE; + bzero(kernel_firehose_addr, size); + /* register buffer with firehose */ + kernel_firehose_addr = __firehose_buffer_create((size_t *) &size); + + kprintf("oslog_init completed\n"); +} /* * log_putc_locked @@ -354,6 +900,173 @@ log_putc_locked(char c) mbp->msg_bufx = 0; } +static oslog_stream_buf_entry_t +oslog_stream_find_free_buf_entry_locked(void) +{ + struct msgbuf *mbp; + oslog_stream_buf_entry_t buf_entry = NULL; + + lck_spin_assert(&oslog_stream_lock, 
LCK_ASSERT_OWNED); + + mbp = oslog_streambufp; + + buf_entry = STAILQ_FIRST(&oslog_stream_free_head); + if (buf_entry) { + STAILQ_REMOVE_HEAD(&oslog_stream_free_head, buf_entries); + } + else { + // If no list elements are available in the free-list, + // consume the next log line so we can free up its list element + oslog_stream_buf_entry_t prev_entry = NULL; + + buf_entry = STAILQ_FIRST(&oslog_stream_buf_head); + while (buf_entry->type == oslog_stream_link_type_metadata) { + prev_entry = buf_entry; + buf_entry = STAILQ_NEXT(buf_entry, buf_entries); + } + + if (prev_entry == NULL) { + STAILQ_REMOVE_HEAD(&oslog_stream_buf_head, buf_entries); + } + else { + STAILQ_REMOVE_AFTER(&oslog_stream_buf_head, prev_entry, buf_entries); + } + + mbp->msg_bufr += buf_entry->size; + oslog_s_dropped_msgcount++; + if (mbp->msg_bufr >= mbp->msg_size) { + mbp->msg_bufr = (mbp->msg_bufr % mbp->msg_size); + } + } + + return buf_entry; +} + +void +oslog_streamwrite_metadata_locked(oslog_stream_buf_entry_t m_entry) +{ + lck_spin_assert(&oslog_stream_lock, LCK_ASSERT_OWNED); + STAILQ_INSERT_TAIL(&oslog_stream_buf_head, m_entry, buf_entries); + + return; +} + +static void oslog_streamwrite_append_bytes(const char *buffer, int buflen) +{ + struct msgbuf *mbp; + + lck_spin_assert(&oslog_stream_lock, LCK_ASSERT_OWNED); + + mbp = oslog_streambufp; + // Check if we have enough space in the stream buffer to write the data + if (mbp->msg_bufx + buflen <= mbp->msg_size) { + memcpy((void *)(mbp->msg_bufc + mbp->msg_bufx), buffer, buflen); + + mbp->msg_bufx += buflen; + if (mbp->msg_bufx == mbp->msg_size) { + mbp->msg_bufx = 0; + } + } else { + // Copy part of the data until the end of the stream + int bytes_left = mbp->msg_size - mbp->msg_bufx; + memcpy((void *)(mbp->msg_bufc + mbp->msg_bufx), buffer, bytes_left); + + buflen -= bytes_left; + buffer += bytes_left; + + // Copy the remainder of the data from the beginning of stream + memcpy((void *)mbp->msg_bufc, buffer, buflen); + mbp->msg_bufx = 
buflen; + } + return; +} + + +void +oslog_streamwrite_locked(firehose_tracepoint_id_u ftid, + uint64_t stamp, const void *pubdata, size_t publen) +{ + struct msgbuf *mbp; + int available_space = 0; + oslog_stream_buf_entry_t buf_entry = NULL; + oslog_stream_buf_entry_t next_entry = NULL; + + uint16_t ft_size = offsetof(struct firehose_tracepoint_s, ft_data); + int ft_length = ft_size + publen; + + lck_spin_assert(&oslog_stream_lock, LCK_ASSERT_OWNED); + + mbp = oslog_streambufp; + if (ft_length > mbp->msg_size) { + (void)hw_atomic_add(&oslog_s_error_count, 1); + return; + } + + // Ensure that we have a list element for this record + buf_entry = oslog_stream_find_free_buf_entry_locked(); + + assert(buf_entry != NULL); + + // Ensure that we have space in the ring buffer for the current logline + if (mbp->msg_bufr > mbp->msg_bufx) { + available_space = mbp->msg_bufr - mbp->msg_bufx; + } else { + available_space = mbp->msg_size - mbp->msg_bufx + mbp->msg_bufr; + } + while(ft_length > available_space) { + oslog_stream_buf_entry_t prev_entry = NULL; + + next_entry = STAILQ_FIRST(&oslog_stream_buf_head); + assert(next_entry != NULL); + while (next_entry->type == oslog_stream_link_type_metadata) { + prev_entry = next_entry; + next_entry = STAILQ_NEXT(next_entry, buf_entries); + } + + if (prev_entry == NULL) { + STAILQ_REMOVE_HEAD(&oslog_stream_buf_head, buf_entries); + } + else { + STAILQ_REMOVE_AFTER(&oslog_stream_buf_head, prev_entry, buf_entries); + } + + mbp->msg_bufr += next_entry->size; + if (mbp->msg_bufr >= mbp->msg_size) { + mbp->msg_bufr = (mbp->msg_bufr % mbp->msg_size); + } + + oslog_s_dropped_msgcount++; + available_space += next_entry->size; + + STAILQ_INSERT_TAIL(&oslog_stream_free_head, next_entry, buf_entries); + } + + assert(ft_length <= available_space); + + // Write the log line and update the list entry for this record + buf_entry->offset = mbp->msg_bufx; + buf_entry->size = ft_length; + buf_entry->timestamp = stamp; + buf_entry->type = 
oslog_stream_link_type_log; + + // Construct a tracepoint + struct firehose_tracepoint_s fs = { + .ft_thread = thread_tid(current_thread()), + .ft_id.ftid_value = ftid.ftid_value, + .ft_length = publen + }; + + oslog_streamwrite_append_bytes((char *)&fs, sizeof(fs)); + oslog_streamwrite_append_bytes(pubdata, publen); + + assert(mbp->msg_bufr < mbp->msg_size); + // Insert the element to the buffer data list + STAILQ_INSERT_TAIL(&oslog_stream_buf_head, buf_entry, buf_entries); + + return; +} + + /* * log_putc @@ -479,6 +1192,22 @@ log_setsize(int size) { return 0; }
+/*
+ * oslog_setsize: Grows the oslog stream buffer configuration.
+ *
+ * Sizes at or below OSLOG_STREAM_BUF_SIZE are ignored. Otherwise the stream
+ * buffer size becomes `size' and the number of list entries is scaled by
+ * size / OSLOG_STREAM_BUF_SIZE (integer division) from OSLOG_NUM_STREAM_ENTRIES.
+ *
+ * NOTE(review): oslog_streamopen()/oslog_streamclose() size their kalloc()/kfree()
+ * calls from these two globals, so changing them while the stream device is open
+ * would make the sizes disagree - confirm this is only called before the device
+ * can be opened.
+ */
+void
+oslog_setsize(int size)
+{
+	int scale = 0;
+
+	// If the size is not larger than the default stream buffer
+	// do nothing
+	if (size <= OSLOG_STREAM_BUF_SIZE) {
+		return;
+	}
+
+	/*
+	 * Keep the quotient in an int: a uint16_t would wrap for sizes of
+	 * 65536 * OSLOG_STREAM_BUF_SIZE and above, possibly yielding zero entries.
+	 */
+	scale = size / OSLOG_STREAM_BUF_SIZE;
+
+	oslog_stream_buf_size = size;
+	oslog_stream_num_entries = scale * OSLOG_NUM_STREAM_ENTRIES;
+	printf("oslog_setsize: new buffer size = %d, new num entries= %d\n", oslog_stream_buf_size, oslog_stream_num_entries);
+}
+
 SYSCTL_PROC(_kern, OID_AUTO, msgbuf, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, sysctl_kern_msgbuf, "I", ""); static int sysctl_kern_msgbuf(struct sysctl_oid *oidp __unused, @@ -582,3 +1311,4 @@ log_dmesg(user_addr_t buffer, uint32_t buffersize, int32_t * retval) { kfree(localbuff, localbuff_size); return (error); } + diff --git a/bsd/kern/subr_prf.c b/bsd/kern/subr_prf.c index d14302bec..ebca1b666 100644 --- a/bsd/kern/subr_prf.c +++ b/bsd/kern/subr_prf.c @@ -100,6 +100,7 @@ #include /* for cpu_number() */ #include #include +#include /* for vaddlog(): the following are implemented in osfmk/kern/printf.c */ extern void bsd_log_lock(void); @@ -214,8 +215,7 @@ void tprintf(tpr_t tpr, const char *fmt, ...) { struct session *sess = (struct session *)tpr; - struct tty *tp = TTY_NULL; - int flags = TOLOG; + struct tty *tp; va_list ap; struct putchar_args pca; @@ -225,25 +225,27 @@ tprintf(tpr_t tpr, const char *fmt, ...) 
/* ttycheckoutq(), tputchar() require a locked tp */ tty_lock(tp); if(ttycheckoutq(tp, 0)) { - flags |= TOTTY; + pca.flags = TOTTY; /* going to the tty; leave locked */ - } else { - /* not going to the tty... */ - tty_unlock(tp); - tp = TTY_NULL; + pca.tty = tp; + va_start(ap, fmt); + __doprnt(fmt, ap, putchar, &pca, 10, FALSE); + va_end(ap); } + tty_unlock(tp); } - - pca.flags = flags; - pca.tty = tp; + + pca.flags = TOLOG; + pca.tty = TTY_NULL; va_start(ap, fmt); - __doprnt(fmt, ap, putchar, &pca, 10, FALSE); + __doprnt(fmt, ap, putchar, &pca, 10, TRUE); va_end(ap); - if (tp != NULL) - tty_unlock(tp); /* lock/unlock is guarded by tp, above */ - logwakeup(); + + va_start(ap, fmt); + os_log_with_args(OS_LOG_DEFAULT, OS_LOG_TYPE_DEFAULT, fmt, ap, __builtin_return_address(0)); + va_end(ap); } /* @@ -437,11 +439,11 @@ putchar(int c, void *arg) } int -vprintf(const char *fmt, va_list ap) +vprintf_log_locked(const char *fmt, va_list ap) { struct putchar_args pca; - pca.flags = TOLOG | TOCONS; + pca.flags = TOLOGLOCKED; pca.tty = NULL; __doprnt(fmt, ap, putchar, &pca, 10, TRUE); return 0; diff --git a/bsd/kern/subr_xxx.c b/bsd/kern/subr_xxx.c index 946f938e3..2c574f295 100644 --- a/bsd/kern/subr_xxx.c +++ b/bsd/kern/subr_xxx.c @@ -63,6 +63,7 @@ #include #include +#include #include #include #include @@ -76,6 +77,11 @@ #include #endif +#if DEVELOPMENT || DEBUG +bool send_sigsys = true; +#else +#define send_sigsys true +#endif /* * Unsupported device function (e.g. writing to read-only device). 
@@ -178,9 +184,11 @@ nullsys(void) */ /* ARGSUSED */ int -nosys(struct proc *p, __unused struct nosys_args *args, __unused int32_t *retval) +nosys(__unused struct proc *p, __unused struct nosys_args *args, __unused int32_t *retval) { - psignal(p, SIGSYS); + if (send_sigsys) { + psignal_uthread(current_thread(), SIGSYS); + } return (ENOSYS); } diff --git a/bsd/kern/sys_generic.c b/bsd/kern/sys_generic.c index 41e3d8b69..8692d514d 100644 --- a/bsd/kern/sys_generic.c +++ b/bsd/kern/sys_generic.c @@ -146,7 +146,6 @@ void evpipefree(struct pipe *); void postpipeevent(struct pipe *, int); void postevent(struct socket *, struct sockbuf *, int); extern kern_return_t IOBSDGetPlatformUUID(__darwin_uuid_t uuid, mach_timespec_t timeoutp); -extern void delay(int); int rd_uio(struct proc *p, int fdes, uio_t uio, user_ssize_t *retval); int wr_uio(struct proc *p, struct fileproc *fp, uio_t uio, user_ssize_t *retval); @@ -171,7 +170,7 @@ void select_waitq_init(void); void select_waitq_init(void) { - waitq_init(&select_conflict_queue, SYNC_POLICY_FIFO | SYNC_POLICY_DISABLE_IRQ); + waitq_init(&select_conflict_queue, SYNC_POLICY_FIFO); } #define f_flag f_fglob->fg_flag @@ -1204,12 +1203,12 @@ select_internal(struct proc *p, struct select_nocancel_args *uap, uint64_t timeo panic("can't allocate %ld bytes for wqstate buffer", uth->uu_wqstate_sz); waitq_set_init(uth->uu_wqset, - SYNC_POLICY_FIFO|SYNC_POLICY_PREPOST|SYNC_POLICY_DISABLE_IRQ, NULL); + SYNC_POLICY_FIFO|SYNC_POLICY_PREPOST, NULL, NULL); } if (!waitq_set_is_valid(uth->uu_wqset)) waitq_set_init(uth->uu_wqset, - SYNC_POLICY_FIFO|SYNC_POLICY_PREPOST|SYNC_POLICY_DISABLE_IRQ, NULL); + SYNC_POLICY_FIFO|SYNC_POLICY_PREPOST, NULL, NULL); /* the last chunk of our buffer is an array of waitq pointers */ seldata->wqp = (uint64_t *)((char *)(uth->uu_wqset) + ALIGN(sizeof(struct waitq_set))); @@ -1691,7 +1690,7 @@ poll_nocancel(struct proc *p, struct poll_nocancel_args *uap, int32_t *retval) (nfds > p->p_rlimit[RLIMIT_NOFILE].rlim_cur && 
(proc_suser(p) || nfds > FD_SETSIZE))) return (EINVAL); - kq = kqueue_alloc(p); + kq = kqueue_alloc(p, 0); if (kq == NULL) return (EAGAIN); @@ -1728,7 +1727,6 @@ poll_nocancel(struct proc *p, struct poll_nocancel_args *uap, int32_t *retval) OSBitOrAtomic(P_SELECT, &p->p_flag); for (i = 0; i < nfds; i++) { short events = fds[i].events; - int kerror = 0; /* per spec, ignore fd values below zero */ if (fds[i].fd < 0) { @@ -1747,19 +1745,19 @@ poll_nocancel(struct proc *p, struct poll_nocancel_args *uap, int32_t *retval) kev.filter = EVFILT_READ; if (events & ( POLLPRI | POLLRDBAND )) kev.flags |= EV_OOBAND; - kerror = kevent_register(kq, &kev, p); + kevent_register(kq, &kev, p); } /* Handle output events */ - if (kerror == 0 && - events & ( POLLOUT | POLLWRNORM | POLLWRBAND )) { + if ((kev.flags & EV_ERROR) == 0 && + (events & ( POLLOUT | POLLWRNORM | POLLWRBAND ))) { kev.filter = EVFILT_WRITE; - kerror = kevent_register(kq, &kev, p); + kevent_register(kq, &kev, p); } /* Handle BSD extension vnode events */ - if (kerror == 0 && - events & ( POLLEXTEND | POLLATTRIB | POLLNLINK | POLLWRITE )) { + if ((kev.flags & EV_ERROR) == 0 && + (events & ( POLLEXTEND | POLLATTRIB | POLLNLINK | POLLWRITE ))) { kev.filter = EVFILT_VNODE; kev.fflags = 0; if (events & POLLEXTEND) @@ -1770,10 +1768,10 @@ poll_nocancel(struct proc *p, struct poll_nocancel_args *uap, int32_t *retval) kev.fflags |= NOTE_LINK; if (events & POLLWRITE) kev.fflags |= NOTE_WRITE; - kerror = kevent_register(kq, &kev, p); + kevent_register(kq, &kev, p); } - if (kerror != 0) { + if (kev.flags & EV_ERROR) { fds[i].revents = POLLNVAL; rfds++; } else @@ -1781,14 +1779,14 @@ poll_nocancel(struct proc *p, struct poll_nocancel_args *uap, int32_t *retval) } /* Did we have any trouble registering? 
*/ - if (rfds > 0) + if (rfds == nfds) goto done; /* scan for, and possibly wait for, the kevents to trigger */ cont->pca_fds = uap->fds; cont->pca_nfds = nfds; cont->pca_rfds = rfds; - error = kqueue_scan(kq, poll_callback, NULL, cont, &atv, p); + error = kqueue_scan(kq, poll_callback, NULL, cont, NULL, &atv, p); rfds = cont->pca_rfds; done: @@ -2104,7 +2102,7 @@ selrecord(__unused struct proc *selector, struct selinfo *sip, void *s_data) return; if ((sip->si_flags & SI_INITED) == 0) { - waitq_init(&sip->si_waitq, SYNC_POLICY_FIFO | SYNC_POLICY_DISABLE_IRQ); + waitq_init(&sip->si_waitq, SYNC_POLICY_FIFO); sip->si_flags |= SI_INITED; sip->si_flags &= ~SI_CLEAR; } @@ -3216,7 +3214,7 @@ ledger(struct proc *p, struct ledger_args *args, __unused int32_t *retval) rval = ledger_get_task_entry_info_multiple(task, &buf, &len); proc_rele(proc); - if ((rval == 0) && (len > 0)) { + if ((rval == 0) && (len >= 0)) { sz = len * sizeof (struct ledger_entry_info); rval = copyout(buf, args->arg2, sz); kfree(buf, sz); @@ -3231,7 +3229,7 @@ ledger(struct proc *p, struct ledger_args *args, __unused int32_t *retval) int sz; rval = ledger_template_info(&buf, &len); - if ((rval == 0) && (len > 0)) { + if ((rval == 0) && (len >= 0)) { sz = len * sizeof (struct ledger_template_info); rval = copyout(buf, args->arg1, sz); kfree(buf, sz); @@ -3303,7 +3301,7 @@ static inline struct waitq_set *sysctl_get_wqset(int idx) gwqs = (struct g_wqset *)kalloc(sizeof(*gwqs)); assert(gwqs != NULL); - gwqs->wqset = waitq_set_alloc(SYNC_POLICY_FIFO|SYNC_POLICY_PREPOST|SYNC_POLICY_DISABLE_IRQ); + gwqs->wqset = waitq_set_alloc(SYNC_POLICY_FIFO|SYNC_POLICY_PREPOST, NULL); enqueue_tail(&g_wqset_list, &gwqs->link); printf("[WQ]: created new waitq set 0x%llx\n", wqset_id(gwqs->wqset)); @@ -3322,7 +3320,7 @@ static inline struct waitq *global_test_waitq(int idx) if (!g_wq_init) { g_wq_init = 1; for (int i = 0; i < MAX_GLOBAL_TEST_QUEUES; i++) - waitq_init(&g_wq[i], SYNC_POLICY_FIFO|SYNC_POLICY_DISABLE_IRQ); + 
waitq_init(&g_wq[i], SYNC_POLICY_FIFO); } return &g_wq[idx % MAX_GLOBAL_TEST_QUEUES]; diff --git a/bsd/kern/sys_pipe.c b/bsd/kern/sys_pipe.c index 1e64ce737..f6adf702a 100644 --- a/bsd/kern/sys_pipe.c +++ b/bsd/kern/sys_pipe.c @@ -171,30 +171,40 @@ static int pipe_ioctl(struct fileproc *fp, u_long cmd, caddr_t data, static int pipe_drain(struct fileproc *fp,vfs_context_t ctx); static const struct fileops pipeops = { - DTYPE_PIPE, - pipe_read, - pipe_write, - pipe_ioctl, - pipe_select, - pipe_close, - pipe_kqfilter, - pipe_drain + .fo_type = DTYPE_PIPE, + .fo_read = pipe_read, + .fo_write = pipe_write, + .fo_ioctl = pipe_ioctl, + .fo_select = pipe_select, + .fo_close = pipe_close, + .fo_kqfilter = pipe_kqfilter, + .fo_drain = pipe_drain, }; -static void filt_pipedetach(struct knote *kn); -static int filt_piperead(struct knote *kn, long hint); -static int filt_pipewrite(struct knote *kn, long hint); +static void filt_pipedetach(struct knote *kn); -static struct filterops pipe_rfiltops = { +static int filt_piperead(struct knote *kn, long hint); +static int filt_pipereadtouch(struct knote *kn, struct kevent_internal_s *kev); +static int filt_pipereadprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev); + +static int filt_pipewrite(struct knote *kn, long hint); +static int filt_pipewritetouch(struct knote *kn, struct kevent_internal_s *kev); +static int filt_pipewriteprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev); + +struct filterops pipe_rfiltops = { .f_isfd = 1, .f_detach = filt_pipedetach, .f_event = filt_piperead, + .f_touch = filt_pipereadtouch, + .f_process = filt_pipereadprocess, }; -static struct filterops pipe_wfiltops = { +struct filterops pipe_wfiltops = { .f_isfd = 1, .f_detach = filt_pipedetach, .f_event = filt_pipewrite, + .f_touch = filt_pipewritetouch, + .f_process = filt_pipewriteprocess, }; static int nbigpipe; /* for compatibility sake. 
no longer used */ @@ -1359,11 +1369,177 @@ pipeclose(struct pipe *cpipe) /*ARGSUSED*/ static int -pipe_kqfilter(__unused struct fileproc *fp, struct knote *kn, __unused vfs_context_t ctx) +filt_piperead_common(struct knote *kn, struct pipe *rpipe) { - struct pipe *cpipe; + struct pipe *wpipe; + int retval; + + /* + * we're being called back via the KNOTE post + * we made in pipeselwakeup, and we already hold the mutex... + */ + + wpipe = rpipe->pipe_peer; + kn->kn_data = rpipe->pipe_buffer.cnt; + if ((rpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) || + (wpipe == NULL) || (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF))) { + kn->kn_flags |= EV_EOF; + retval = 1; + } else { + int64_t lowwat = 1; + if (kn->kn_sfflags & NOTE_LOWAT) { + if (rpipe->pipe_buffer.size && kn->kn_sdata > MAX_PIPESIZE(rpipe)) + lowwat = MAX_PIPESIZE(rpipe); + else if (kn->kn_sdata > lowwat) + lowwat = kn->kn_sdata; + } + retval = kn->kn_data >= lowwat; + } + return (retval); +} + +static int +filt_piperead(struct knote *kn, long hint) +{ +#pragma unused(hint) + struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data; + + return filt_piperead_common(kn, rpipe); +} + +static int +filt_pipereadtouch(struct knote *kn, struct kevent_internal_s *kev) +{ + struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data; + int retval; + + PIPE_LOCK(rpipe); + + /* accept new inputs (and save the low water threshold and flag) */ + kn->kn_sdata = kev->data; + kn->kn_sfflags = kev->fflags; + if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0) + kn->kn_udata = kev->udata; + + /* identify if any events are now fired */ + retval = filt_piperead_common(kn, rpipe); + + PIPE_UNLOCK(rpipe); + + return retval; +} + +static int +filt_pipereadprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev) +{ +#pragma unused(data) + struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data; + int retval; + + PIPE_LOCK(rpipe); + retval = filt_piperead_common(kn, rpipe); + if (retval) { + *kev = kn->kn_kevent; + if 
(kn->kn_flags & EV_CLEAR) { + kn->kn_fflags = 0; + kn->kn_data = 0; + } + } + PIPE_UNLOCK(rpipe); + + return (retval); +} + +/*ARGSUSED*/ +static int +filt_pipewrite_common(struct knote *kn, struct pipe *rpipe) +{ + struct pipe *wpipe; + + /* + * we're being called back via the KNOTE post + * we made in pipeselwakeup, and we already hold the mutex... + */ + wpipe = rpipe->pipe_peer; + + if ((wpipe == NULL) || (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF))) { + kn->kn_data = 0; + kn->kn_flags |= EV_EOF; + return (1); + } + kn->kn_data = MAX_PIPESIZE(wpipe) - wpipe->pipe_buffer.cnt; - cpipe = (struct pipe *)kn->kn_fp->f_data; + int64_t lowwat = PIPE_BUF; + if (kn->kn_sfflags & NOTE_LOWAT) { + if (wpipe->pipe_buffer.size && kn->kn_sdata > MAX_PIPESIZE(wpipe)) + lowwat = MAX_PIPESIZE(wpipe); + else if (kn->kn_sdata > lowwat) + lowwat = kn->kn_sdata; + } + + return (kn->kn_data >= lowwat); +} + +/*ARGSUSED*/ +static int +filt_pipewrite(struct knote *kn, long hint) +{ +#pragma unused(hint) + struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data; + + return filt_pipewrite_common(kn, rpipe); +} + + +static int +filt_pipewritetouch(struct knote *kn, struct kevent_internal_s *kev) +{ + struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data; + int res; + + PIPE_LOCK(rpipe); + + /* accept new kevent data (and save off lowat threshold and flag) */ + kn->kn_sfflags = kev->fflags; + kn->kn_sdata = kev->data; + if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0) + kn->kn_udata = kev->udata; + + /* determine if any event is now deemed fired */ + res = filt_pipewrite_common(kn, rpipe); + + PIPE_UNLOCK(rpipe); + + return res; +} + +static int +filt_pipewriteprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev) +{ +#pragma unused(data) + struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data; + int res; + + PIPE_LOCK(rpipe); + res = filt_pipewrite_common(kn, rpipe); + if (res) { + *kev = kn->kn_kevent; + if (kn->kn_flags & EV_CLEAR) { + kn->kn_fflags = 0; + 
kn->kn_data = 0; + } + } + PIPE_UNLOCK(rpipe); + + return res; +} + +/*ARGSUSED*/ +static int +pipe_kqfilter(__unused struct fileproc *fp, struct knote *kn, __unused vfs_context_t ctx) +{ + struct pipe *cpipe = (struct pipe *)kn->kn_fp->f_data; + int res; PIPE_LOCK(cpipe); #if CONFIG_MACF @@ -1374,38 +1550,50 @@ pipe_kqfilter(__unused struct fileproc *fp, struct knote *kn, __unused vfs_conte */ if (mac_pipe_check_kqfilter(vfs_context_ucred(ctx), kn, cpipe) != 0) { PIPE_UNLOCK(cpipe); - return (1); + kn->kn_flags = EV_ERROR; + kn->kn_data = EPERM; + return 0; } #endif switch (kn->kn_filter) { case EVFILT_READ: - kn->kn_fop = &pipe_rfiltops; + kn->kn_filtid = EVFILTID_PIPE_R; + /* determine initial state */ + res = filt_piperead_common(kn, cpipe); break; + case EVFILT_WRITE: - kn->kn_fop = &pipe_wfiltops; + kn->kn_filtid = EVFILTID_PIPE_W; if (cpipe->pipe_peer == NULL) { /* * other end of pipe has been closed */ PIPE_UNLOCK(cpipe); - return (EPIPE); + kn->kn_flags = EV_ERROR; + kn->kn_data = EPIPE; + return 0; } if (cpipe->pipe_peer) cpipe = cpipe->pipe_peer; + + /* determine inital state */ + res = filt_pipewrite_common(kn, cpipe); break; default: PIPE_UNLOCK(cpipe); - return (1); + kn->kn_flags = EV_ERROR; + kn->kn_data = EINVAL; + return 0; } if (KNOTE_ATTACH(&cpipe->pipe_sel.si_note, kn)) cpipe->pipe_state |= PIPE_KNOTE; PIPE_UNLOCK(cpipe); - return (0); + return res; } static void @@ -1429,88 +1617,6 @@ filt_pipedetach(struct knote *kn) PIPE_UNLOCK(cpipe); } -/*ARGSUSED*/ -static int -filt_piperead(struct knote *kn, long hint) -{ - struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data; - struct pipe *wpipe; - int retval; - - /* - * if hint == 0, then we've been called from the kevent - * world directly and do not currently hold the pipe mutex... - * if hint == 1, we're being called back via the KNOTE post - * we made in pipeselwakeup, and we already hold the mutex... 
- */ - if (hint == 0) - PIPE_LOCK(rpipe); - - wpipe = rpipe->pipe_peer; - kn->kn_data = rpipe->pipe_buffer.cnt; - if ((rpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) || - (wpipe == NULL) || (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF))) { - kn->kn_flags |= EV_EOF; - retval = 1; - } else { - int64_t lowwat = 1; - if (kn->kn_sfflags & NOTE_LOWAT) { - if (rpipe->pipe_buffer.size && kn->kn_sdata > MAX_PIPESIZE(rpipe)) - lowwat = MAX_PIPESIZE(rpipe); - else if (kn->kn_sdata > lowwat) - lowwat = kn->kn_sdata; - } - retval = kn->kn_data >= lowwat; - } - - if (hint == 0) - PIPE_UNLOCK(rpipe); - - return (retval); -} - -/*ARGSUSED*/ -static int -filt_pipewrite(struct knote *kn, long hint) -{ - struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data; - struct pipe *wpipe; - - /* - * if hint == 0, then we've been called from the kevent - * world directly and do not currently hold the pipe mutex... - * if hint == 1, we're being called back via the KNOTE post - * we made in pipeselwakeup, and we already hold the mutex... - */ - if (hint == 0) - PIPE_LOCK(rpipe); - - wpipe = rpipe->pipe_peer; - - if ((wpipe == NULL) || (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF))) { - kn->kn_data = 0; - kn->kn_flags |= EV_EOF; - - if (hint == 0) - PIPE_UNLOCK(rpipe); - return (1); - } - kn->kn_data = MAX_PIPESIZE(wpipe) - wpipe->pipe_buffer.cnt; - - int64_t lowwat = PIPE_BUF; - if (kn->kn_sfflags & NOTE_LOWAT) { - if (wpipe->pipe_buffer.size && kn->kn_sdata > MAX_PIPESIZE(wpipe)) - lowwat = MAX_PIPESIZE(wpipe); - else if (kn->kn_sdata > lowwat) - lowwat = kn->kn_sdata; - } - - if (hint == 0) - PIPE_UNLOCK(rpipe); - - return (kn->kn_data >= lowwat); -} - int fill_pipeinfo(struct pipe * cpipe, struct pipe_info * pinfo) { diff --git a/bsd/kern/sys_reason.c b/bsd/kern/sys_reason.c new file mode 100644 index 000000000..3404d199b --- /dev/null +++ b/bsd/kern/sys_reason.c @@ -0,0 +1,285 @@ +/* + * Copyright (c) 2015 Apple Inc. All rights reserved. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if OS_REASON_DEBUG +#include + +extern int os_reason_debug_disabled; +#endif + +extern int maxproc; + +/* + * Lock group attributes for os_reason subsystem + */ +lck_grp_attr_t *os_reason_lock_grp_attr; +lck_grp_t *os_reason_lock_grp; +lck_attr_t *os_reason_lock_attr; + +#define OS_REASON_RESERVE_COUNT 100 +#define OS_REASON_MAX_COUNT (maxproc + 100) + +static struct zone *os_reason_zone; + +void +os_reason_init() +{ + int reasons_allocated = 0; + + /* + * Initialize OS reason group and lock attributes + */ + os_reason_lock_grp_attr = lck_grp_attr_alloc_init(); + os_reason_lock_grp = lck_grp_alloc_init("os_reason_lock", os_reason_lock_grp_attr); + os_reason_lock_attr = lck_attr_alloc_init(); + + /* + * Create OS reason zone. + */ + os_reason_zone = zinit(sizeof(struct os_reason), OS_REASON_MAX_COUNT * sizeof(struct os_reason), + OS_REASON_MAX_COUNT, "os reasons"); + if (os_reason_zone == NULL) { + panic("failed to initialize os_reason_zone"); + } + + /* + * We pre-fill the OS reason zone to reduce the likelihood that + * the jetsam thread and others block when they create an exit + * reason. This pre-filled memory is not-collectable since it's + * foreign memory crammed in as part of zfill(). + */ + reasons_allocated = zfill(os_reason_zone, OS_REASON_RESERVE_COUNT); + assert(reasons_allocated > 0); +} + +/* + * Creates a new reason and initializes it with the provided reason + * namespace and code. Also sets up the buffer and kcdata_descriptor + * associated with the reason. Returns a pointer to the newly created + * reason. 
+ * + * Returns: + * REASON_NULL if unable to allocate a reason or initialize the nested buffer + * a pointer to the reason otherwise + */ +os_reason_t +os_reason_create(uint32_t osr_namespace, uint64_t osr_code) +{ + os_reason_t new_reason = OS_REASON_NULL; + + new_reason = (os_reason_t) zalloc(os_reason_zone); + if (new_reason == OS_REASON_NULL) { +#if OS_REASON_DEBUG + /* + * We rely on OS reasons to communicate important things such + * as process exit reason information, we should be aware + * when issues prevent us from allocating them. + */ + if (os_reason_debug_disabled) { + kprintf("os_reason_create: failed to allocate reason with namespace: %u, code : %llu\n", + osr_namespace, osr_code); + } else { + panic("os_reason_create: failed to allocate reason with namespace: %u, code: %llu\n", + osr_namespace, osr_code); + } +#endif + return new_reason; + } + + bzero(new_reason, sizeof(*new_reason)); + + new_reason->osr_namespace = osr_namespace; + new_reason->osr_code = osr_code; + new_reason->osr_flags = 0; + new_reason->osr_bufsize = 0; + new_reason->osr_kcd_buf = NULL; + + lck_mtx_init(&new_reason->osr_lock, os_reason_lock_grp, os_reason_lock_attr); + new_reason->osr_refcount = 1; + + return new_reason; +} + +static void +os_reason_dealloc_buffer(os_reason_t cur_reason) +{ + assert(cur_reason != OS_REASON_NULL); + LCK_MTX_ASSERT(&cur_reason->osr_lock, LCK_MTX_ASSERT_OWNED); + + if (cur_reason->osr_kcd_buf != NULL && cur_reason->osr_bufsize != 0) { + kfree(cur_reason->osr_kcd_buf, cur_reason->osr_bufsize); + } + + cur_reason->osr_bufsize = 0; + cur_reason->osr_kcd_buf = NULL; + bzero(&cur_reason->osr_kcd_descriptor, sizeof(cur_reason->osr_kcd_descriptor)); + + return; +} + +/* + * Allocates and initializes a buffer of specified size for the reason. Also + * initializes the kcdata descriptor accordingly. If there is an existing + * buffer, we dealloc the buffer before allocating a new one and + * clear the associated kcdata descriptor. 
If osr_bufsize is passed as 0, + * we deallocate the existing buffer and then return. + * + * Returns: + * 0 on success + * EINVAL if the passed reason pointer is invalid or the requested size is + * larger than REASON_BUFFER_MAX_SIZE + * ENOMEM if unable to allocate memory for the buffer + * EIO if we fail to initialize the kcdata buffer + */ +int +os_reason_alloc_buffer(os_reason_t cur_reason, uint32_t osr_bufsize) +{ + if (cur_reason == OS_REASON_NULL) { + return EINVAL; + } + + if (osr_bufsize > OS_REASON_BUFFER_MAX_SIZE) { + return EINVAL; + } + + lck_mtx_lock(&cur_reason->osr_lock); + + os_reason_dealloc_buffer(cur_reason); + + if (osr_bufsize == 0) { + lck_mtx_unlock(&cur_reason->osr_lock); + return 0; + } + + /* + * We don't want to block trying to acquire a reason buffer and hold + * up important things trying to clean up the system (i.e. jetsam). + */ + cur_reason->osr_kcd_buf = kalloc_noblock_tag(osr_bufsize, VM_KERN_MEMORY_REASON); + if (cur_reason->osr_kcd_buf == NULL) { + lck_mtx_unlock(&cur_reason->osr_lock); + return ENOMEM; + } + + bzero(cur_reason->osr_kcd_buf, osr_bufsize); + + cur_reason->osr_bufsize = osr_bufsize; + + if (kcdata_memory_static_init(&cur_reason->osr_kcd_descriptor, (mach_vm_address_t) cur_reason->osr_kcd_buf, + KCDATA_BUFFER_BEGIN_OS_REASON, osr_bufsize, KCFLAG_USE_MEMCOPY) != KERN_SUCCESS) { + os_reason_dealloc_buffer(cur_reason); + + lck_mtx_unlock(&cur_reason->osr_lock); + return EIO; + } + + lck_mtx_unlock(&cur_reason->osr_lock); + + return 0; +} + +/* + * Returns a pointer to the kcdata descriptor associated with the specified + * reason if there is a buffer allocated. 
+ */ +struct kcdata_descriptor * +os_reason_get_kcdata_descriptor(os_reason_t cur_reason) +{ + if (cur_reason == OS_REASON_NULL) { + return NULL; + } + + if (cur_reason->osr_kcd_buf == NULL) { + return NULL; + } + + assert(cur_reason->osr_kcd_descriptor.kcd_addr_begin == (mach_vm_address_t) cur_reason->osr_kcd_buf); + if (cur_reason->osr_kcd_descriptor.kcd_addr_begin != (mach_vm_address_t) cur_reason->osr_kcd_buf) { + return NULL; + } + + return &cur_reason->osr_kcd_descriptor; +} + +/* + * Takes a reference on the passed reason. + */ +void +os_reason_ref(os_reason_t cur_reason) +{ + if (cur_reason == OS_REASON_NULL) { + return; + } + + lck_mtx_lock(&cur_reason->osr_lock); + + assert(cur_reason->osr_refcount > 0); + cur_reason->osr_refcount++; + + lck_mtx_unlock(&cur_reason->osr_lock); + + return; +} + +/* + * Drops a reference on the passed reason, deallocates + * the reason if no references remain. + */ +void +os_reason_free(os_reason_t cur_reason) +{ + if (cur_reason == OS_REASON_NULL) { + return; + } + + lck_mtx_lock(&cur_reason->osr_lock); + + assert(cur_reason->osr_refcount > 0); + + cur_reason->osr_refcount--; + if (cur_reason->osr_refcount != 0) { + lck_mtx_unlock(&cur_reason->osr_lock); + return; + } + + os_reason_dealloc_buffer(cur_reason); + + lck_mtx_unlock(&cur_reason->osr_lock); + lck_mtx_destroy(&cur_reason->osr_lock, os_reason_lock_grp); + + zfree(os_reason_zone, cur_reason); +} diff --git a/bsd/kern/sys_socket.c b/bsd/kern/sys_socket.c index 11df996b6..7b9e78b6f 100644 --- a/bsd/kern/sys_socket.c +++ b/bsd/kern/sys_socket.c @@ -98,14 +98,14 @@ static int soo_close(struct fileglob *, vfs_context_t ctx); static int soo_drain(struct fileproc *, vfs_context_t ctx); const struct fileops socketops = { - DTYPE_SOCKET, - soo_read, - soo_write, - soo_ioctl, - soo_select, - soo_close, - soo_kqfilter, - soo_drain + .fo_type = DTYPE_SOCKET, + .fo_read = soo_read, + .fo_write = soo_write, + .fo_ioctl = soo_ioctl, + .fo_select = soo_select, + .fo_close = 
soo_close, + .fo_kqfilter = soo_kqfilter, + .fo_drain = soo_drain, }; /* ARGSUSED */ diff --git a/bsd/kern/sys_ulock.c b/bsd/kern/sys_ulock.c new file mode 100644 index 000000000..d245fcbcc --- /dev/null +++ b/bsd/kern/sys_ulock.c @@ -0,0 +1,810 @@ +/* + * Copyright (c) 2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define XNU_TEST_BITMAP +#include + +#include + +/* + * How ulock promotion works: + * + * There’s a requested policy field on every thread called ‘promotions’, which + * expresses which ulock promotions are happening to this thread. + * The promotion priority saturates until the promotion count goes to 0. + * + * We also track effective promotion qos, which is the qos before clamping. + * This value is used for promoting a thread that another thread is waiting on, + * so that the lock owner reinflates to the right priority after unclamping. + * + * This also works for non-QoS threads, which can donate base priority to QoS + * and non-QoS threads alike. + * + * ulock wait applies a promotion to the owner communicated through + * UL_UNFAIR_LOCK as waiters block, and that promotion is saturated as long as + * there is still an owner. In ulock wake, if the waker is still the owner, + * then it clears its ownership and drops the boost. It does NOT transfer + * ownership/priority boost to the new thread. Instead, it selects the + * waiting thread with the highest base priority to be woken next, and + * relies on that thread to carry the torch for the other waiting threads. 
+ */ + +static lck_grp_t *ull_lck_grp; +static lck_mtx_t ull_table_lock; + +#define ull_global_lock() lck_mtx_lock(&ull_table_lock) +#define ull_global_unlock() lck_mtx_unlock(&ull_table_lock) + +#define ull_lock(ull) lck_mtx_lock(&ull->ull_lock) +#define ull_unlock(ull) lck_mtx_unlock(&ull->ull_lock) +#define ull_assert_owned(ull) LCK_MTX_ASSERT(&ull->ull_lock, LCK_MTX_ASSERT_OWNED) + +typedef struct __attribute__((packed)) { + user_addr_t ulk_addr; + pid_t ulk_pid; +} ulk_t; + +inline static bool +ull_key_match(ulk_t *a, ulk_t *b) +{ + return ((a->ulk_pid == b->ulk_pid) && + (a->ulk_addr == b->ulk_addr)); +} + +typedef struct ull { + /* + * ull_owner is the most recent known value for the owner of this ulock + * i.e. it may be out of date WRT the real value in userspace. + */ + thread_t ull_owner; /* holds +1 thread reference */ + ulk_t ull_key; + ulk_t ull_saved_key; + lck_mtx_t ull_lock; + int32_t ull_nwaiters; + int32_t ull_max_nwaiters; + int32_t ull_refcount; + struct promote_token ull_promote_token; + queue_chain_t ull_hash_link; + uint8_t ull_opcode; +} ull_t; + +static const bool ull_debug = false; + +extern void ulock_initialize(void); + +#define ULL_MUST_EXIST 0x0001 +static ull_t *ull_get(ulk_t *, uint32_t); +static void ull_put(ull_t *); + +static thread_t ull_promote_owner_locked(ull_t* ull, thread_t thread); + +#if DEVELOPMENT || DEBUG +static int ull_simulate_copyin_fault = 0; +static int ull_panic_on_corruption = 0; + +static void +ull_dump(ull_t *ull) +{ + kprintf("ull\t%p\n", ull); + kprintf("ull_key.ulk_pid\t%d\n", ull->ull_key.ulk_pid); + kprintf("ull_key.ulk_addr\t%p\n", (void *)(ull->ull_key.ulk_addr)); + kprintf("ull_saved_key.ulk_pid\t%d\n", ull->ull_saved_key.ulk_pid); + kprintf("ull_saved_key.ulk_addr\t%p\n", (void *)(ull->ull_saved_key.ulk_addr)); + kprintf("ull_nwaiters\t%d\n", ull->ull_nwaiters); + kprintf("ull_max_nwaiters\t%d\n", ull->ull_max_nwaiters); + kprintf("ull_refcount\t%d\n", ull->ull_refcount); + 
kprintf("ull_opcode\t%d\n\n", ull->ull_opcode); + kprintf("ull_owner\t0x%llx\n\n", thread_tid(ull->ull_owner)); + kprintf("ull_promote_token\t%d, %d\n\n", ull->ull_promote_token.pt_basepri, ull->ull_promote_token.pt_qos); +} +#endif + +static int ull_hash_buckets; +static queue_head_t *ull_bucket; +static uint32_t ull_nzalloc = 0; +static zone_t ull_zone; + +static __inline__ uint32_t +ull_hash_index(char *key, size_t length) +{ + uint32_t hash = jenkins_hash(key, length); + + hash &= (ull_hash_buckets - 1); + + return hash; +} + +/* Ensure that the key structure is packed, + * so that no undefined memory is passed to + * ull_hash_index() + */ +static_assert(sizeof(ulk_t) == sizeof(user_addr_t) + sizeof(pid_t)); + +#define ULL_INDEX(keyp) ull_hash_index((char *)keyp, sizeof *keyp) + +void +ulock_initialize(void) +{ + ull_lck_grp = lck_grp_alloc_init("ulocks", NULL); + lck_mtx_init(&ull_table_lock, ull_lck_grp, NULL); + + assert(thread_max > 16); + /* Size ull_hash_buckets based on thread_max. + * Round up to nearest power of 2, then divide by 4 + */ + ull_hash_buckets = (1 << (bit_ceiling(thread_max) - 2)); + + kprintf("%s>thread_max=%d, ull_hash_buckets=%d\n", __FUNCTION__, thread_max, ull_hash_buckets); + assert(ull_hash_buckets >= thread_max/4); + + ull_bucket = (queue_head_t *)kalloc(sizeof(queue_head_t) * ull_hash_buckets); + assert(ull_bucket != NULL); + + for (int i = 0; i < ull_hash_buckets; i++) { + queue_init(&ull_bucket[i]); + } + + ull_zone = zinit(sizeof(ull_t), + thread_max * sizeof(ull_t), + 0, "ulocks"); + + zone_change(ull_zone, Z_NOENCRYPT, TRUE); + +#if DEVELOPMENT || DEBUG + if (!PE_parse_boot_argn("ulock_panic_on_corruption", + &ull_panic_on_corruption, sizeof(ull_panic_on_corruption))) { + ull_panic_on_corruption = 0; + } +#endif +} + +#if DEVELOPMENT || DEBUG +/* Count the number of hash entries for a given pid. + * if pid==0, dump the whole table. 
+ */ +static int +ull_hash_dump(pid_t pid) +{ + int count = 0; + ull_global_lock(); + if (pid == 0) { + kprintf("%s>total number of ull_t allocated %d\n", __FUNCTION__, ull_nzalloc); + kprintf("%s>BEGIN\n", __FUNCTION__); + } + for (int i = 0; i < ull_hash_buckets; i++) { + if (!queue_empty(&ull_bucket[i])) { + ull_t *elem; + if (pid == 0) { + kprintf("%s>index %d:\n", __FUNCTION__, i); + } + qe_foreach_element(elem, &ull_bucket[i], ull_hash_link) { + if ((pid == 0) || (pid == elem->ull_key.ulk_pid)) { + ull_dump(elem); + count++; + } + } + } + } + if (pid == 0) { + kprintf("%s>END\n", __FUNCTION__); + ull_nzalloc = 0; + } + ull_global_unlock(); + return count; +} +#endif + +static ull_t * +ull_alloc(ulk_t *key) +{ + ull_t *ull = (ull_t *)zalloc(ull_zone); + assert(ull != NULL); + + ull->ull_refcount = 1; + ull->ull_key = *key; + ull->ull_saved_key = *key; + ull->ull_nwaiters = 0; + ull->ull_max_nwaiters = 0; + ull->ull_opcode = 0; + + ull->ull_owner = THREAD_NULL; + ull->ull_promote_token = PROMOTE_TOKEN_INIT; + + lck_mtx_init(&ull->ull_lock, ull_lck_grp, NULL); + + ull_nzalloc++; + return ull; +} + +static void +ull_free(ull_t *ull) +{ + assert(ull->ull_owner == THREAD_NULL); + + lck_mtx_assert(&ull->ull_lock, LCK_ASSERT_NOTOWNED); + + lck_mtx_destroy(&ull->ull_lock, ull_lck_grp); + + zfree(ull_zone, ull); +} + +/* Finds an existing ulock structure (ull_t), or creates a new one. + * If MUST_EXIST flag is set, returns NULL instead of creating a new one. 
+ * The ulock structure is returned with ull_lock locked + * + * TODO: Per-bucket lock to reduce contention on global lock + */ +static ull_t * +ull_get(ulk_t *key, uint32_t flags) +{ + ull_t *ull = NULL; + uint i = ULL_INDEX(key); + ull_t *elem; + ull_global_lock(); + qe_foreach_element(elem, &ull_bucket[i], ull_hash_link) { + ull_lock(elem); + if (ull_key_match(&elem->ull_key, key)) { + ull = elem; + break; + } else { + ull_unlock(elem); + } + } + if (ull == NULL) { + if (flags & ULL_MUST_EXIST) { + /* Must already exist (called from wake) */ + ull_global_unlock(); + return NULL; + } + + /* NRG maybe drop the ull_global_lock before the kalloc, + * then take the lock and check again for a key match + * and either use the new ull_t or free it. + */ + + ull = ull_alloc(key); + + if (ull == NULL) { + ull_global_unlock(); + return NULL; + } + + ull_lock(ull); + + enqueue(&ull_bucket[i], &ull->ull_hash_link); + } + + ull->ull_refcount++; + + ull_global_unlock(); + + return ull; /* still locked */ +} + +/* + * Must be called with ull_lock held + */ +static void +ull_put(ull_t *ull) +{ + ull_assert_owned(ull); + int refcount = --ull->ull_refcount; + assert(refcount == 0 ? 
(ull->ull_key.ulk_pid == 0 && ull->ull_key.ulk_addr == 0) : 1); + ull_unlock(ull); + + if (refcount > 0) { + return; + } + + ull_global_lock(); + remqueue(&ull->ull_hash_link); + ull_global_unlock(); + +#if DEVELOPMENT || DEBUG + if (ull_debug) { + kprintf("%s>", __FUNCTION__); + ull_dump(ull); + } +#endif + ull_free(ull); +} + +int +ulock_wait(struct proc *p, struct ulock_wait_args *args, int32_t *retval) +{ + uint opcode = args->operation & UL_OPCODE_MASK; + uint flags = args->operation & UL_FLAGS_MASK; + int ret = 0; + thread_t self = current_thread(); + int id = thread_tid(self); + ulk_t key; + + /* involved threads - each variable holds +1 ref if not null */ + thread_t owner_thread = THREAD_NULL; + thread_t old_owner = THREAD_NULL; + thread_t old_lingering_owner = THREAD_NULL; + sched_call_t workq_callback = NULL; + + if (ull_debug) { + kprintf("[%d]%s>ENTER opcode %d addr %llx value %llx timeout %d flags %x\n", id, __FUNCTION__, opcode, (unsigned long long)(args->addr), args->value, args->timeout, flags); + } + + if ((flags & ULF_WAIT_MASK) != flags) { + ret = EINVAL; + goto munge_retval; + } + + boolean_t set_owner = FALSE; + + switch (opcode) { + case UL_UNFAIR_LOCK: + set_owner = TRUE; + break; + case UL_COMPARE_AND_WAIT: + break; + default: + if (ull_debug) { + kprintf("[%d]%s>EINVAL opcode %d addr 0x%llx flags 0x%x\n", + id, __FUNCTION__, opcode, + (unsigned long long)(args->addr), flags); + } + ret = EINVAL; + goto munge_retval; + } + + /* 32-bit lock type for UL_COMPARE_AND_WAIT and UL_UNFAIR_LOCK */ + uint32_t value = 0; + + if ((args->addr == 0) || (args->addr % _Alignof(_Atomic(typeof(value))))) { + ret = EINVAL; + goto munge_retval; + } + + key.ulk_pid = p->p_pid; + key.ulk_addr = args->addr; + + if (flags & ULF_WAIT_WORKQ_DATA_CONTENTION) { + workq_callback = workqueue_get_sched_callback(); + workq_callback = thread_disable_sched_call(self, workq_callback); + } + + ull_t *ull = ull_get(&key, 0); + if (ull == NULL) { + ret = ENOMEM; + goto 
munge_retval; + } + /* ull is locked */ + + ull->ull_nwaiters++; + + if (ull->ull_nwaiters > ull->ull_max_nwaiters) { + ull->ull_max_nwaiters = ull->ull_nwaiters; + } + + if (ull->ull_opcode == 0) { + ull->ull_opcode = opcode; + } else if (ull->ull_opcode != opcode) { + ull_unlock(ull); + ret = EDOM; + goto out; + } + + /* + * We don't want this copyin to get wedged behind VM operations, + * but we have to read the userspace value under the ull lock for correctness. + * + * Until exists, + * fake it by disabling preemption across copyin, which forces any + * vm_fault we encounter to fail. + */ + uint64_t val64; /* copyin_word always zero-extends to 64-bits */ + + disable_preemption(); + int copy_ret = copyin_word(args->addr, &val64, sizeof(value)); + enable_preemption(); + + value = (uint32_t)val64; + +#if DEVELOPMENT || DEBUG + /* Occasionally simulate copyin finding the user address paged out */ + if (((ull_simulate_copyin_fault == p->p_pid) || (ull_simulate_copyin_fault == 1)) && (copy_ret == 0)) { + static _Atomic int fault_inject = 0; + if (__c11_atomic_fetch_add(&fault_inject, 1, __ATOMIC_RELAXED) % 73 == 0) { + copy_ret = EFAULT; + } + } +#endif + if (copy_ret != 0) { + ull_unlock(ull); + + /* copyin() will return an error if the access to the user addr would have faulted, + * so just return and let the user level code fault it in. 
+ */ + ret = copy_ret; + goto out; + } + + if (value != args->value) { + /* Lock value has changed from expected so bail out */ + ull_unlock(ull); + if (ull_debug) { + kprintf("[%d]%s>Lock value %d has changed from expected %d so bail out\n", + id, __FUNCTION__, value, (uint32_t)(args->value)); + } + goto out; + } + + if (set_owner) { + mach_port_name_t owner_name = ulock_owner_value_to_port_name(args->value); + owner_thread = port_name_to_thread_for_ulock(owner_name); + + /* HACK: don't bail on MACH_PORT_DEAD, to avoid blowing up the no-tsd pthread lock */ + if (owner_name != MACH_PORT_DEAD && owner_thread == THREAD_NULL) { +#if DEBUG || DEVELOPMENT + if (ull_panic_on_corruption) { + if (flags & ULF_NO_ERRNO) { + // ULF_NO_ERRNO is used by libplatform ulocks, but not libdispatch ones. + // Don't panic on libdispatch ulock corruptions; the userspace likely + // mismanaged a dispatch queue. + panic("ulock_wait: ulock is corrupted; value=0x%x, ull=%p", + (uint32_t)(args->value), ull); + } + } +#endif + /* + * Translation failed - even though the lock value is up to date, + * whatever was stored in the lock wasn't actually a thread port. + */ + ull_unlock(ull); + ret = EOWNERDEAD; + goto out; + } + /* owner_thread has a +1 reference */ + + /* + * At this point, I know: + * a) owner_thread is definitely the current owner, because I just read the value + * b) owner_thread is either: + * i) holding the user lock or + * ii) has just unlocked the user lock after I looked + * and is heading toward the kernel to call ull_wake. + * If so, it's going to have to wait for the ull mutex. + * + * Therefore, I can promote its priority to match mine, and I can rely on it to + * come by later to issue the wakeup and lose its promotion. 
+ */ + + old_owner = ull_promote_owner_locked(ull, owner_thread); + } + + wait_result_t wr; + uint32_t timeout = args->timeout; + if (timeout) { + wr = assert_wait_timeout((event_t)ull, THREAD_ABORTSAFE, timeout, NSEC_PER_USEC); + } else { + wr = assert_wait((event_t)ull, THREAD_ABORTSAFE); + } + + ull_unlock(ull); + + if (ull_debug) { + kprintf("[%d]%s>after assert_wait() returned %d\n", id, __FUNCTION__, wr); + } + + if (set_owner && owner_thread != THREAD_NULL && wr == THREAD_WAITING) { + wr = thread_handoff(owner_thread); + /* owner_thread ref is consumed */ + owner_thread = THREAD_NULL; + } else { + /* NRG At some point this should be a continuation based block, so that we can avoid saving the full kernel context. */ + wr = thread_block(NULL); + } + if (ull_debug) { + kprintf("[%d]%s>thread_block() returned %d\n", id, __FUNCTION__, wr); + } + switch (wr) { + case THREAD_AWAKENED: + break; + case THREAD_TIMED_OUT: + ret = ETIMEDOUT; + break; + case THREAD_INTERRUPTED: + case THREAD_RESTART: + default: + ret = EINTR; + break; + } + +out: + ull_lock(ull); + *retval = --ull->ull_nwaiters; + if (ull->ull_nwaiters == 0) { + /* + * If the wait was canceled early, we might need to + * clear out the lingering owner reference before + * freeing the ull. 
+ */ + if (ull->ull_owner != THREAD_NULL) { + old_lingering_owner = ull_promote_owner_locked(ull, THREAD_NULL); + } + + assert(ull->ull_owner == THREAD_NULL); + + ull->ull_key.ulk_pid = 0; + ull->ull_key.ulk_addr = 0; + ull->ull_refcount--; + assert(ull->ull_refcount > 0); + } + ull_put(ull); + + if (owner_thread != THREAD_NULL) { + thread_deallocate(owner_thread); + } + + if (old_owner != THREAD_NULL) { + thread_deallocate(old_owner); + } + + if (old_lingering_owner != THREAD_NULL) { + thread_deallocate(old_lingering_owner); + } + + assert(*retval >= 0); + +munge_retval: + if (workq_callback) { + thread_reenable_sched_call(self, workq_callback); + } + + if ((flags & ULF_NO_ERRNO) && (ret != 0)) { + *retval = -ret; + ret = 0; + } + return ret; +} + +int +ulock_wake(struct proc *p, struct ulock_wake_args *args, __unused int32_t *retval) +{ + uint opcode = args->operation & UL_OPCODE_MASK; + uint flags = args->operation & UL_FLAGS_MASK; + int ret = 0; + int id = thread_tid(current_thread()); + ulk_t key; + + /* involved threads - each variable holds +1 ref if not null */ + thread_t wake_thread = THREAD_NULL; + thread_t old_owner = THREAD_NULL; + + if (ull_debug) { + kprintf("[%d]%s>ENTER opcode %d addr %llx flags %x\n", + id, __FUNCTION__, opcode, (unsigned long long)(args->addr), flags); + } + + if ((flags & ULF_WAKE_MASK) != flags) { + ret = EINVAL; + goto munge_retval; + } + +#if DEVELOPMENT || DEBUG + if (opcode == UL_DEBUG_HASH_DUMP_PID) { + *retval = ull_hash_dump(p->p_pid); + return ret; + } else if (opcode == UL_DEBUG_HASH_DUMP_ALL) { + *retval = ull_hash_dump(0); + return ret; + } else if (opcode == UL_DEBUG_SIMULATE_COPYIN_FAULT) { + ull_simulate_copyin_fault = (int)(args->wake_value); + return ret; + } +#endif + + if (args->addr == 0) { + ret = EINVAL; + goto munge_retval; + } + + if (flags & ULF_WAKE_THREAD) { + if (flags & ULF_WAKE_ALL) { + ret = EINVAL; + goto munge_retval; + } + mach_port_name_t wake_thread_name = (mach_port_name_t)(args->wake_value); 
+ wake_thread = port_name_to_thread_for_ulock(wake_thread_name); + if (wake_thread == THREAD_NULL) { + ret = ESRCH; + goto munge_retval; + } + } + + key.ulk_pid = p->p_pid; + key.ulk_addr = args->addr; + + ull_t *ull = ull_get(&key, ULL_MUST_EXIST); + if (ull == NULL) { + if (wake_thread != THREAD_NULL) { + thread_deallocate(wake_thread); + } + ret = ENOENT; + goto munge_retval; + } + /* ull is locked */ + + boolean_t clear_owner = FALSE; /* need to reset owner */ + + switch (opcode) { + case UL_UNFAIR_LOCK: + clear_owner = TRUE; + break; + case UL_COMPARE_AND_WAIT: + break; + default: + if (ull_debug) { + kprintf("[%d]%s>EINVAL opcode %d addr 0x%llx flags 0x%x\n", + id, __FUNCTION__, opcode, (unsigned long long)(args->addr), flags); + } + ret = EINVAL; + goto out_locked; + } + + if (opcode != ull->ull_opcode) { + if (ull_debug) { + kprintf("[%d]%s>EDOM - opcode mismatch - opcode %d addr 0x%llx flags 0x%x\n", + id, __FUNCTION__, opcode, (unsigned long long)(args->addr), flags); + } + ret = EDOM; + goto out_locked; + } + + if (!clear_owner) { + assert(ull->ull_owner == THREAD_NULL); + } + + if (flags & ULF_WAKE_ALL) { + thread_wakeup((event_t)ull); + } else if (flags & ULF_WAKE_THREAD) { + kern_return_t kr = thread_wakeup_thread((event_t)ull, wake_thread); + if (kr != KERN_SUCCESS) { + assert(kr == KERN_NOT_WAITING); + ret = EALREADY; + } + } else { + /* + * TODO: WAITQ_SELECT_MAX_PRI forces a linear scan of the (hashed) global waitq. + * Move to a ulock-private, priority sorted waitq to avoid that. + * + * TODO: 'owner is not current_thread (or null)' likely means we can avoid this wakeup + * + */ + thread_wakeup_one_with_pri((event_t)ull, WAITQ_SELECT_MAX_PRI); + } + + /* + * Reaching this point means I previously moved the lock to 'unowned' state in userspace. + * Therefore I need to relinquish my promotion. 
+ * + * However, someone else could have locked it after I unlocked, and then had a third thread + * block on the lock, causing a promotion of some other owner. + * + * I don't want to stomp over that, so only remove the promotion if I'm the current owner. + */ + + if (ull->ull_owner == current_thread()) { + old_owner = ull_promote_owner_locked(ull, THREAD_NULL); + } + +out_locked: + ull_put(ull); + + if (wake_thread != THREAD_NULL) { + thread_deallocate(wake_thread); + } + + if (old_owner != THREAD_NULL) { + thread_deallocate(old_owner); + } + +munge_retval: + if ((flags & ULF_NO_ERRNO) && (ret != 0)) { + *retval = -ret; + ret = 0; + } + return ret; +} + +/* + * Change ull_owner to be new_owner, and update it with the properties + * of the current thread. + * + * Records the highest current promotion value in ull_promote_token, and applies that + * to any new owner. + * + * Returns +1 ref to the old ull_owner if it is going away. + */ +static thread_t +ull_promote_owner_locked(ull_t* ull, + thread_t new_owner) +{ + if (new_owner != THREAD_NULL && ull->ull_owner == new_owner) { + thread_user_promotion_update(new_owner, current_thread(), &ull->ull_promote_token); + return THREAD_NULL; + } + + thread_t old_owner = ull->ull_owner; + ull->ull_owner = THREAD_NULL; + + if (new_owner != THREAD_NULL) { + /* The ull_owner field now owns a +1 ref on thread */ + thread_reference(new_owner); + ull->ull_owner = new_owner; + + thread_user_promotion_add(new_owner, current_thread(), &ull->ull_promote_token); + } else { + /* No new owner - clear the saturated promotion value */ + ull->ull_promote_token = PROMOTE_TOKEN_INIT; + } + + if (old_owner != THREAD_NULL) { + thread_user_promotion_drop(old_owner); + } + + /* Return the +1 ref from the ull_owner field */ + return old_owner; +} + diff --git a/bsd/kern/sys_work_interval.c b/bsd/kern/sys_work_interval.c index 45b36c717..53d4a2930 100644 --- a/bsd/kern/sys_work_interval.c +++ b/bsd/kern/sys_work_interval.c @@ -33,6 +33,8 @@ 
#include #include #include +#include + #include int diff --git a/bsd/kern/syscalls.master b/bsd/kern/syscalls.master index faf5af9d4..7c2bd0f69 100644 --- a/bsd/kern/syscalls.master +++ b/bsd/kern/syscalls.master @@ -31,9 +31,6 @@ ; N.B.: makesyscalls.sh and createsyscalls.pl must be updated to account ; for any new argument types. -; If you add a new syscall number to the end of this file, you need to -; increment the value of NUM_SYSENT in bsd/sys/sysent.h. - #include #include #include @@ -113,7 +110,7 @@ 60 AUE_UMASK ALL { int umask(int newmask); } 61 AUE_CHROOT ALL { int chroot(user_addr_t path); } 62 AUE_NULL ALL { int nosys(void); } { old fstat } -63 AUE_NULL ALL { int nosys(void); } { used internally, reserved } +63 AUE_NULL ALL { int nosys(void); } { used internally and reserved } 64 AUE_NULL ALL { int nosys(void); } { old getpagesize } 65 AUE_MSYNC ALL { int msync(caddr_t addr, size_t len, int flags) NO_SYSCALL_STUB; } 66 AUE_VFORK ALL { int vfork(void); } @@ -182,7 +179,7 @@ 114 AUE_NULL ALL { int nosys(void); } #endif /* SOCKETS */ 115 AUE_NULL ALL { int nosys(void); } { old vtrace } -116 AUE_GETTIMEOFDAY ALL { int gettimeofday(struct timeval *tp, struct timezone *tzp) NO_SYSCALL_STUB; } +116 AUE_GETTIMEOFDAY ALL { int gettimeofday(struct timeval *tp, struct timezone *tzp, uint64_t *mach_absolute_time) NO_SYSCALL_STUB; } 117 AUE_GETRUSAGE ALL { int getrusage(int who, struct rusage *rusage); } #if SOCKETS 118 AUE_GETSOCKOPT ALL { int getsockopt(int s, int level, int name, caddr_t val, socklen_t *avalsize); } @@ -265,7 +262,7 @@ 174 AUE_NULL ALL { int nosys(void); } { old getdents } 175 AUE_NULL ALL { int nosys(void); } { old gc_control } 176 AUE_NULL ALL { int nosys(void); } { old add_profil } -177 AUE_NULL ALL { int nosys(void); } +177 AUE_KDEBUGTRACE ALL { int kdebug_typefilter(void** addr, size_t* size) NO_SYSCALL_STUB; } 178 AUE_KDEBUGTRACE ALL { uint64_t kdebug_trace_string(uint32_t debugid, uint64_t str_id, const char *str) NO_SYSCALL_STUB; } 179 
AUE_KDEBUGTRACE ALL { int kdebug_trace64(uint32_t code, uint64_t arg1, uint64_t arg2, uint64_t arg3, uint64_t arg4) NO_SYSCALL_STUB; } 180 AUE_KDEBUGTRACE ALL { int kdebug_trace(uint32_t code, u_long arg1, u_long arg2, u_long arg3, u_long arg4) NO_SYSCALL_STUB; } @@ -273,7 +270,7 @@ 182 AUE_SETEGID ALL { int setegid(gid_t egid); } 183 AUE_SETEUID ALL { int seteuid(uid_t euid); } 184 AUE_SIGRETURN ALL { int sigreturn(struct ucontext *uctx, int infostyle) NO_SYSCALL_STUB; } -185 AUE_CHUD ALL { int chud(uint64_t code, uint64_t arg1, uint64_t arg2, uint64_t arg3, uint64_t arg4, uint64_t arg5) NO_SYSCALL_STUB; } +185 AUE_NULL ALL { int enosys(void); } { old chud } 186 AUE_NULL ALL { int nosys(void); } 187 AUE_FDATASYNC ALL { int fdatasync(int fd); } 188 AUE_STAT ALL { int stat(user_addr_t path, user_addr_t ub); } @@ -281,12 +278,12 @@ 190 AUE_LSTAT ALL { int lstat(user_addr_t path, user_addr_t ub); } 191 AUE_PATHCONF ALL { int pathconf(char *path, int name); } 192 AUE_FPATHCONF ALL { int fpathconf(int fd, int name); } -193 AUE_NULL ALL { int nosys(void); } +193 AUE_NULL ALL { int nosys(void); } { old getfsstat } 194 AUE_GETRLIMIT ALL { int getrlimit(u_int which, struct rlimit *rlp) NO_SYSCALL_STUB; } 195 AUE_SETRLIMIT ALL { int setrlimit(u_int which, struct rlimit *rlp) NO_SYSCALL_STUB; } 196 AUE_GETDIRENTRIES ALL { int getdirentries(int fd, char *buf, u_int count, long *basep); } 197 AUE_MMAP ALL { user_addr_t mmap(caddr_t addr, size_t len, int prot, int flags, int fd, off_t pos) NO_SYSCALL_STUB; } -198 AUE_NULL ALL { int nosys(void); } { __syscall } +198 AUE_NULL ALL { int nosys(void); } { old __syscall } 199 AUE_LSEEK ALL { off_t lseek(int fd, off_t offset, int whence); } 200 AUE_TRUNCATE ALL { int truncate(char *path, off_t length); } 201 AUE_FTRUNCATE ALL { int ftruncate(int fd, off_t length); } @@ -317,14 +314,14 @@ ; 216-> 219 used to be mkcomplex and {f,l}statv variants. They are gone now. 
216 AUE_NULL ALL { int open_dprotected_np(user_addr_t path, int flags, int class, int dpflags, int mode) NO_SYSCALL_STUB; } -217 AUE_NULL ALL { int nosys(void); } -218 AUE_NULL ALL { int nosys(void); } -219 AUE_NULL ALL { int nosys(void); } +217 AUE_NULL ALL { int nosys(void); } { old statv } +218 AUE_NULL ALL { int nosys(void); } { old lstatv } +219 AUE_NULL ALL { int nosys(void); } { old fstatv } 220 AUE_GETATTRLIST ALL { int getattrlist(const char *path, struct attrlist *alist, void *attributeBuffer, size_t bufferSize, u_long options) NO_SYSCALL_STUB; } 221 AUE_SETATTRLIST ALL { int setattrlist(const char *path, struct attrlist *alist, void *attributeBuffer, size_t bufferSize, u_long options) NO_SYSCALL_STUB; } 222 AUE_GETDIRENTRIESATTR ALL { int getdirentriesattr(int fd, struct attrlist *alist, void *buffer, size_t buffersize, u_long *count, u_long *basep, u_long *newstate, u_long options); } 223 AUE_EXCHANGEDATA ALL { int exchangedata(const char *path1, const char *path2, u_long options); } -224 AUE_NULL ALL { int nosys(void); } { old checkuseraccess / fsgetpath (which moved to 427) } +224 AUE_NULL ALL { int nosys(void); } { old checkuseraccess or fsgetpath } 225 AUE_SEARCHFS ALL { int searchfs(const char *path, struct fssearchblock *searchblock, uint32_t *nummatches, uint32_t scriptcode, uint32_t options, struct searchstate *state); } 226 AUE_DELETE ALL { int delete(user_addr_t path) NO_SYSCALL_STUB; } { private delete (Carbon semantics) } 227 AUE_COPYFILE ALL { int copyfile(char *from, char *to, int mode, int flags) NO_SYSCALL_STUB; } @@ -380,7 +377,7 @@ 254 AUE_SEMCTL ALL { int semctl(int semid, int semnum, int cmd, semun_t arg) NO_SYSCALL_STUB; } 255 AUE_SEMGET ALL { int semget(key_t key, int nsems, int semflg); } 256 AUE_SEMOP ALL { int semop(int semid, struct sembuf *sops, int nsops); } -257 AUE_NULL ALL { int nosys(void); } +257 AUE_NULL ALL { int nosys(void); } { old semconfig } #else 254 AUE_NULL ALL { int nosys(void); } 255 AUE_NULL ALL { int 
nosys(void); } @@ -547,7 +544,7 @@ 362 AUE_KQUEUE ALL { int kqueue(void); } 363 AUE_NULL ALL { int kevent(int fd, const struct kevent *changelist, int nchanges, struct kevent *eventlist, int nevents, const struct timespec *timeout); } 364 AUE_LCHOWN ALL { int lchown(user_addr_t path, uid_t owner, gid_t group) NO_SYSCALL_STUB; } -365 AUE_STACKSNAPSHOT ALL { int stack_snapshot(pid_t pid, user_addr_t tracebuf, uint32_t tracebuf_size, uint32_t flags, uint32_t dispatch_offset) NO_SYSCALL_STUB; } +365 AUE_NULL ALL { int nosys(void); } { old stack_snapshot } #if CONFIG_WORKQUEUE 366 AUE_NULL ALL { int bsdthread_register(user_addr_t threadstart, user_addr_t wqthread, uint32_t flags, user_addr_t stack_addr_hint, user_addr_t targetconc_ptr, uint32_t dispatchqueue_offset, uint32_t tsd_offset) NO_SYSCALL_STUB; } 367 AUE_WORKQOPEN ALL { int workq_open(void) NO_SYSCALL_STUB; } @@ -661,7 +658,7 @@ 425 AUE_MAC_GET_MOUNT ALL { int nosys(void); } #endif 426 AUE_MAC_GETFSSTAT ALL { int __mac_getfsstat(user_addr_t buf, int bufsize, user_addr_t mac, int macsize, int flags); } -427 AUE_FSGETPATH ALL { user_ssize_t fsgetpath(user_addr_t buf, size_t bufsize, user_addr_t fsid, uint64_t objid) NO_SYSCALL_STUB; } { private fsgetpath (File Manager SPI) } +427 AUE_FSGETPATH ALL { user_ssize_t fsgetpath(user_addr_t buf, size_t bufsize, user_addr_t fsid, uint64_t objid); } { private fsgetpath (File Manager SPI) } 428 AUE_NULL ALL { mach_port_name_t audit_session_self(void); } 429 AUE_NULL ALL { int audit_session_join(mach_port_name_t port); } 430 AUE_NULL ALL { int fileport_makeport(int fd, user_addr_t portnamep); } @@ -719,12 +716,12 @@ 459 AUE_NULL ALL { int enosys(void); } #endif /* COALITIONS */ #if NECP -460 AUE_NULL ALL { int necp_match_policy(uint8_t *parameters, size_t parameters_size, struct necp_aggregate_result *returned_result); } +460 AUE_NECP ALL { int necp_match_policy(uint8_t *parameters, size_t parameters_size, struct necp_aggregate_result *returned_result); } #else 460 AUE_NULL 
ALL { int nosys(void); } #endif /* NECP */ 461 AUE_GETATTRLISTBULK ALL { int getattrlistbulk(int dirfd, struct attrlist *alist, void *attributeBuffer, size_t bufferSize, uint64_t options); } -462 AUE_NULL ALL { int enosys(void); } /* PLACEHOLDER for CLONEFILE */ +462 AUE_CLONEFILEAT ALL { int clonefileat(int src_dirfd, user_addr_t src, int dst_dirfd, user_addr_t dst, uint32_t flags); } 463 AUE_OPENAT_RWTC ALL { int openat(int fd, user_addr_t path, int flags, int mode) NO_SYSCALL_STUB; } 464 AUE_OPENAT_RWTC ALL { int openat_nocancel(int fd, user_addr_t path, int flags, int mode) NO_SYSCALL_STUB; } 465 AUE_RENAMEAT ALL { int renameat(int fromfd, char *from, int tofd, char *to) NO_SYSCALL_STUB; } @@ -759,18 +756,14 @@ 485 AUE_NULL ALL { user_ssize_t guarded_write_np(int fd, const guardid_t *guard, user_addr_t cbuf, user_size_t nbyte); } 486 AUE_PWRITE ALL { user_ssize_t guarded_pwrite_np(int fd, const guardid_t *guard, user_addr_t buf, user_size_t nbyte, off_t offset); } 487 AUE_WRITEV ALL { user_ssize_t guarded_writev_np(int fd, const guardid_t *guard, struct iovec *iovp, int iovcnt); } -#if CONFIG_SECLUDED_RENAME -488 AUE_RENAME ALL { int rename_ext(char *from, char *to, u_int flags) NO_SYSCALL_STUB; } -#else -488 AUE_NULL ALL { int enosys(void); } -#endif +488 AUE_RENAMEAT ALL { int renameatx_np(int fromfd, char *from, int tofd, char *to, u_int flags) NO_SYSCALL_STUB; } #if CONFIG_CODE_DECRYPTION 489 AUE_MPROTECT ALL { int mremap_encrypted(caddr_t addr, size_t len, uint32_t cryptid, uint32_t cputype, uint32_t cpusubtype); } #else 489 AUE_NULL ALL { int enosys(void); } #endif #if NETWORKING -490 AUE_NULL ALL { int netagent_trigger(uuid_t agent_uuid, size_t agent_uuidlen); } +490 AUE_NETAGENT ALL { int netagent_trigger(uuid_t agent_uuid, size_t agent_uuidlen); } #else 490 AUE_NULL ALL { int nosys(void); } #endif /* NETWORKING */ @@ -788,6 +781,33 @@ #endif 495 AUE_NULL ALL { int enosys(void); } 496 AUE_NULL ALL { int enosys(void); } -497 AUE_NULL ALL { int 
enosys(void); } +497 AUE_NULL ALL { int enosys(void); } 498 AUE_NULL ALL { int enosys(void); } 499 AUE_NULL ALL { int work_interval_ctl(uint32_t operation, uint64_t work_interval_id, void *arg, size_t len) NO_SYSCALL_STUB; } +500 AUE_NULL ALL { int getentropy(void *buffer, size_t size); } +#if NECP +501 AUE_NECP ALL { int necp_open(int flags); } } +502 AUE_NECP ALL { int necp_client_action(int necp_fd, uint32_t action, uuid_t client_id, size_t client_id_len, uint8_t *buffer, size_t buffer_size); } +#else +501 AUE_NULL ALL { int enosys(void); } +502 AUE_NULL ALL { int enosys(void); } +#endif /* NECP */ +503 AUE_NULL ALL { int enosys(void); } +504 AUE_NULL ALL { int enosys(void); } +505 AUE_NULL ALL { int enosys(void); } +506 AUE_NULL ALL { int enosys(void); } +507 AUE_NULL ALL { int enosys(void); } +508 AUE_NULL ALL { int enosys(void); } +509 AUE_NULL ALL { int enosys(void); } +510 AUE_NULL ALL { int enosys(void); } +511 AUE_NULL ALL { int enosys(void); } +512 AUE_NULL ALL { int enosys(void); } +513 AUE_NULL ALL { int enosys(void); } +514 AUE_NULL ALL { int enosys(void); } +515 AUE_NULL ALL { int ulock_wait(uint32_t operation, void *addr, uint64_t value, uint32_t timeout) NO_SYSCALL_STUB; } +516 AUE_NULL ALL { int ulock_wake(uint32_t operation, void *addr, uint64_t wake_value) NO_SYSCALL_STUB; } +517 AUE_FCLONEFILEAT ALL { int fclonefileat(int src_fd, int dst_dirfd, user_addr_t dst, uint32_t flags); } +518 AUE_NULL ALL { int fs_snapshot(uint32_t op, int dirfd, user_addr_t name1, user_addr_t name2, user_addr_t data, uint32_t flags) NO_SYSCALL_STUB; } +519 AUE_NULL ALL { int enosys(void); } +520 AUE_KILL ALL { int terminate_with_payload(int pid, uint32_t reason_namespace, uint64_t reason_code, void *payload, uint32_t payload_size, const char *reason_string, uint64_t reason_flags) NO_SYSCALL_STUB; } +521 AUE_EXIT ALL { void abort_with_payload(uint32_t reason_namespace, uint64_t reason_code, void *payload, uint32_t payload_size, const char *reason_string, uint64_t 
reason_flags) NO_SYSCALL_STUB; } diff --git a/bsd/kern/sysv_msg.c b/bsd/kern/sysv_msg.c index cc35787f8..56c670424 100644 --- a/bsd/kern/sysv_msg.c +++ b/bsd/kern/sysv_msg.c @@ -216,7 +216,7 @@ static int msginit(__unused void *dummy) { static int initted = 0; - register int i; + int i; /* Lazy initialization on first system call; we don't have SYSINIT(). */ if (initted) diff --git a/bsd/kern/sysv_sem.c b/bsd/kern/sysv_sem.c index 418a4c6e0..55459ab16 100644 --- a/bsd/kern/sysv_sem.c +++ b/bsd/kern/sysv_sem.c @@ -265,8 +265,8 @@ semsys(struct proc *p, struct semsys_args *uap, int32_t *retval) static int grow_semu_array(int newSize) { - register int i; - register struct sem_undo *newSemu; + int i; + struct sem_undo *newSemu; if (newSize <= seminfo.semmnu) return 1; @@ -326,8 +326,8 @@ grow_semu_array(int newSize) static int grow_sema_array(int newSize) { - register struct semid_kernel *newSema; - register int i; + struct semid_kernel *newSema; + int i; if (newSize <= seminfo.semmni) return 0; @@ -472,8 +472,8 @@ grow_sem_pool(int new_pool_size) static int semu_alloc(struct proc *p) { - register int i; - register struct sem_undo *suptr; + int i; + struct sem_undo *suptr; int *supidx; int attempt; @@ -550,9 +550,9 @@ static int semundo_adjust(struct proc *p, int *supidx, int semid, int semnum, int adjval) { - register struct sem_undo *suptr; + struct sem_undo *suptr; int suidx; - register struct undo *sueptr, **suepptr, *new_sueptr; + struct undo *sueptr, **suepptr, *new_sueptr; int i; /* @@ -1063,9 +1063,9 @@ semop(struct proc *p, struct semop_args *uap, int32_t *retval) int semid = uap->semid; int nsops = uap->nsops; struct sembuf sops[seminfo.semopm]; - register struct semid_kernel *semakptr; - register struct sembuf *sopptr = NULL; /* protected by 'semptr' */ - register struct sem *semptr = NULL; /* protected by 'if' */ + struct semid_kernel *semakptr; + struct sembuf *sopptr = NULL; /* protected by 'semptr' */ + struct sem *semptr = NULL; /* protected by 'if' */ 
int supidx = -1; int i, j, eval; int do_wakeup, do_undos; @@ -1392,7 +1392,7 @@ semop(struct proc *p, struct semop_args *uap, int32_t *retval) void semexit(struct proc *p) { - register struct sem_undo *suptr = NULL; + struct sem_undo *suptr = NULL; int suidx; int *supidx; int did_something; diff --git a/bsd/kern/sysv_shm.c b/bsd/kern/sysv_shm.c index 41a6bb873..c397788c2 100644 --- a/bsd/kern/sysv_shm.c +++ b/bsd/kern/sysv_shm.c @@ -96,6 +96,7 @@ #include #include +#include /* Uncomment this line to see MAC debugging output. */ /* #define MAC_DEBUG */ @@ -106,10 +107,7 @@ #endif #if SYSV_SHM -static void shminit(void *); -#if 0 -SYSINIT(sysv_shm, SI_SUB_SYSV_SHM, SI_ORDER_FIRST, shminit, NULL) -#endif +static int shminit(void); static lck_grp_t *sysv_shm_subsys_lck_grp; static lck_grp_attr_t *sysv_shm_subsys_lck_grp_attr; @@ -171,13 +169,18 @@ static int shm_delete_mapping(struct proc *, struct shmmap_state *, int); #define DEFAULT_SHMSEG 8 #define DEFAULT_SHMALL 1024 -struct shminfo shminfo = { - DEFAULT_SHMMAX, - DEFAULT_SHMMIN, - DEFAULT_SHMMNI, +struct shminfo shminfo = { + DEFAULT_SHMMAX, + DEFAULT_SHMMIN, + DEFAULT_SHMMNI, DEFAULT_SHMSEG, DEFAULT_SHMALL }; + +#define SHMID_IS_VALID(x) ((x) >= 0) +#define SHMID_UNALLOCATED (-1) +#define SHMID_SENTINEL (-2) + #endif /* __APPLE_API_PRIVATE */ void sysv_shm_lock_init(void); @@ -299,7 +302,7 @@ shm_delete_mapping(__unused struct proc *p, struct shmmap_state *shmmap_s, if (result != KERN_SUCCESS) return EINVAL; } - shmmap_s->shmid = -1; + shmmap_s->shmid = SHMID_UNALLOCATED; shmseg->u.shm_dtime = sysv_shmtime(); if ((--shmseg->u.shm_nattch <= 0) && (shmseg->u.shm_perm.mode & SHMSEG_REMOVED)) { @@ -323,23 +326,28 @@ shmdt(struct proc *p, struct shmdt_args *uap, int32_t *retval) SYSV_SHM_SUBSYS_LOCK(); - if (!shm_inited) { - shminit(NULL); + if ((shmdtret = shminit())) { + goto shmdt_out; } + shmmap_s = (struct shmmap_state *)p->vm_shm; if (shmmap_s == NULL) { shmdtret = EINVAL; goto shmdt_out; } - for (i = 0; i < 
shminfo.shmseg; i++, shmmap_s++) - if (shmmap_s->shmid != -1 && - shmmap_s->va == (mach_vm_offset_t)uap->shmaddr) + for (; shmmap_s->shmid != SHMID_SENTINEL; shmmap_s++) { + if (SHMID_IS_VALID(shmmap_s->shmid) && + shmmap_s->va == (mach_vm_offset_t)uap->shmaddr) { break; - if (i == shminfo.shmseg) { + } + } + + if (!SHMID_IS_VALID(shmmap_s->shmid)) { shmdtret = EINVAL; goto shmdt_out; } + #if CONFIG_MACF /* * XXX: It might be useful to move this into the shm_delete_mapping @@ -383,28 +391,41 @@ shmat(struct proc *p, struct shmat_args *uap, user_addr_t *retval) SYSV_SHM_SUBSYS_LOCK(); - if (!shm_inited) { - shminit(NULL); + if ((shmat_ret = shminit())) { + goto shmat_out; } shmmap_s = (struct shmmap_state *)p->vm_shm; - if (shmmap_s == NULL) { - size = shminfo.shmseg * sizeof(struct shmmap_state); - if (size == 0 || size / shminfo.shmseg != sizeof(struct shmmap_state)) { - /* overflow */ + /* lazily allocate the shm map */ + + int nsegs = shminfo.shmseg; + if (nsegs <= 0) { + shmat_ret = EMFILE; + goto shmat_out; + } + + /* +1 for the sentinel */ + if (os_add_and_mul_overflow(nsegs, 1, sizeof(struct shmmap_state), &size)) { shmat_ret = ENOMEM; goto shmat_out; - } + } + MALLOC(shmmap_s, struct shmmap_state *, size, M_SHM, M_WAITOK); if (shmmap_s == NULL) { shmat_ret = ENOMEM; goto shmat_out; } - for (i = 0; i < shminfo.shmseg; i++) - shmmap_s[i].shmid = -1; + + /* initialize the entries */ + for (i = 0; i < nsegs; i++) { + shmmap_s[i].shmid = SHMID_UNALLOCATED; + } + shmmap_s[i].shmid = SHMID_SENTINEL; + p->vm_shm = (caddr_t)shmmap_s; } + shmseg = shm_find_segment_by_shmid(uap->shmid); if (shmseg == NULL) { shmat_ret = EINVAL; @@ -426,12 +447,13 @@ shmat(struct proc *p, struct shmat_args *uap, user_addr_t *retval) goto shmat_out; } #endif - for (i = 0; i < shminfo.shmseg; i++) { - if (shmmap_s->shmid == -1) - break; + + /* find a free shmid */ + while (SHMID_IS_VALID(shmmap_s->shmid)) { shmmap_s++; } - if (i >= shminfo.shmseg) { + if (shmmap_s->shmid != 
SHMID_UNALLOCATED) { + /* no free shmids */ shmat_ret = EMFILE; goto shmat_out; } @@ -561,8 +583,8 @@ shmctl(__unused struct proc *p, struct shmctl_args *uap, int32_t *retval) SYSV_SHM_SUBSYS_LOCK(); - if (!shm_inited) { - shminit(NULL); + if ((shmctl_ret = shminit())) { + goto shmctl_out; } shmseg = shm_find_segment_by_shmid(uap->shmid); @@ -726,12 +748,16 @@ shmget_allocate_segment(struct proc *p, struct shmget_args *uap, int mode, void * mem_object; struct shm_handle *shm_handle_next, **shm_handle_next_p; - if (uap->size < (user_size_t)shminfo.shmmin || - uap->size > (user_size_t)shminfo.shmmax) + if (uap->size <= 0 || + uap->size < (user_size_t)shminfo.shmmin || + uap->size > (user_size_t)shminfo.shmmax) { return EINVAL; + } if (shm_nused >= shminfo.shmmni) /* any shmids left? */ return ENOSPC; - total_size = mach_vm_round_page(uap->size); + if (mach_vm_round_page_overflow(uap->size, &total_size)) { + return EINVAL; + } if ((user_ssize_t)(shm_committed + btoc(total_size)) > shminfo.shmall) return ENOMEM; if (shm_last_free < 0) { @@ -845,13 +871,13 @@ shmget(struct proc *p, struct shmget_args *uap, int32_t *retval) { int segnum, mode, error; int shmget_ret = 0; - + /* Auditing is actually done in shmget_allocate_segment() */ SYSV_SHM_SUBSYS_LOCK(); - if (!shm_inited) { - shminit(NULL); + if ((shmget_ret = shminit())) { + goto shmget_out; } mode = uap->shmflg & ACCESSPERMS; @@ -874,8 +900,6 @@ shmget(struct proc *p, struct shmget_args *uap, int32_t *retval) shmget_out: SYSV_SHM_SUBSYS_UNLOCK(); return shmget_ret; - /*NOTREACHED*/ - } /* @@ -917,59 +941,77 @@ int shmfork(struct proc *p1, struct proc *p2) { struct shmmap_state *shmmap_s; - size_t size; - int i; - int shmfork_ret = 0; + size_t size; + int nsegs = 0; + int ret = 0; SYSV_SHM_SUBSYS_LOCK(); - if (!shm_inited) { - shminit(NULL); + if (shminit()) { + ret = 1; + goto shmfork_out; + } + + struct shmmap_state *src = (struct shmmap_state *)p1->vm_shm; + assert(src); + + /* count number of shmid entries in 
src */ + for (struct shmmap_state *s = src; s->shmid != SHMID_SENTINEL; s++) { + nsegs++; + } + + if (os_add_and_mul_overflow(nsegs, 1, sizeof(struct shmmap_state), &size)) { + ret = 1; + goto shmfork_out; } - size = shminfo.shmseg * sizeof(struct shmmap_state); - if (size == 0 || size / shminfo.shmseg != sizeof(struct shmmap_state)) { - /* overflow */ - shmfork_ret = 1; - goto shmfork_out; - } MALLOC(shmmap_s, struct shmmap_state *, size, M_SHM, M_WAITOK); - if (shmmap_s != NULL) { - bcopy((caddr_t)p1->vm_shm, (caddr_t)shmmap_s, size); - p2->vm_shm = (caddr_t)shmmap_s; - for (i = 0; i < shminfo.shmseg; i++, shmmap_s++) - if (shmmap_s->shmid != -1) - shmsegs[IPCID_TO_IX(shmmap_s->shmid)].u.shm_nattch++; - shmfork_ret = 0; + if (shmmap_s == NULL) { + ret = 1; goto shmfork_out; } - shmfork_ret = 1; /* failed to copy to child - ENOMEM */ + bcopy(src, (caddr_t)shmmap_s, size); + p2->vm_shm = (caddr_t)shmmap_s; + for (; shmmap_s->shmid != SHMID_SENTINEL; shmmap_s++) { + if (SHMID_IS_VALID(shmmap_s->shmid)) { + shmsegs[IPCID_TO_IX(shmmap_s->shmid)].u.shm_nattch++; + } + } + shmfork_out: SYSV_SHM_SUBSYS_UNLOCK(); - return shmfork_ret; + return ret; } -void -shmexit(struct proc *p) +static void +shmcleanup(struct proc *p, int deallocate) { struct shmmap_state *shmmap_s; - int i; - - shmmap_s = (struct shmmap_state *)p->vm_shm; SYSV_SHM_SUBSYS_LOCK(); - for (i = 0; i < shminfo.shmseg; i++, shmmap_s++) - if (shmmap_s->shmid != -1) + + shmmap_s = (struct shmmap_state *)p->vm_shm; + for (; shmmap_s->shmid != SHMID_SENTINEL; shmmap_s++) { + if (SHMID_IS_VALID(shmmap_s->shmid)) { /* * XXX: Should the MAC framework enforce * check here as well. 
*/ - shm_delete_mapping(p, shmmap_s, 1); + shm_delete_mapping(p, shmmap_s, deallocate); + } + } + FREE((caddr_t)p->vm_shm, M_SHM); p->vm_shm = NULL; SYSV_SHM_SUBSYS_UNLOCK(); } +void +shmexit(struct proc *p) +{ + shmcleanup(p, 1); +} + /* * shmexec() is like shmexit(), only it doesn't delete the mappings, * since the old address space has already been destroyed and the new @@ -979,24 +1021,14 @@ shmexit(struct proc *p) __private_extern__ void shmexec(struct proc *p) { - struct shmmap_state *shmmap_s; - int i; - - shmmap_s = (struct shmmap_state *)p->vm_shm; - SYSV_SHM_SUBSYS_LOCK(); - for (i = 0; i < shminfo.shmseg; i++, shmmap_s++) - if (shmmap_s->shmid != -1) - shm_delete_mapping(p, shmmap_s, 0); - FREE((caddr_t)p->vm_shm, M_SHM); - p->vm_shm = NULL; - SYSV_SHM_SUBSYS_UNLOCK(); + shmcleanup(p, 0); } -void -shminit(__unused void *dummy) +int +shminit(void) { + size_t sz; int i; - int s; if (!shm_inited) { /* @@ -1006,12 +1038,13 @@ shminit(__unused void *dummy) * dictates this filed be a size_t, which is 64 bits when * running 64 bit binaries. 
*/ - s = sizeof(struct shmid_kernel) * shminfo.shmmni; + if (os_mul_overflow(shminfo.shmmni, sizeof(struct shmid_kernel), &sz)) { + return ENOMEM; + } - MALLOC(shmsegs, struct shmid_kernel *, s, M_SHM, M_WAITOK); + MALLOC(shmsegs, struct shmid_kernel *, sz, M_SHM, M_WAITOK); if (shmsegs == NULL) { - /* XXX fail safely: leave shared memory uninited */ - return; + return ENOMEM; } for (i = 0; i < shminfo.shmmni; i++) { shmsegs[i].u.shm_perm.mode = SHMSEG_FREE; @@ -1025,7 +1058,10 @@ shminit(__unused void *dummy) shm_committed = 0; shm_inited = 1; } + + return 0; } + /* Initialize the mutex governing access to the SysV shm subsystem */ __private_extern__ void sysv_shm_lock_init( void ) @@ -1047,10 +1083,11 @@ sysctl_shminfo(__unused struct sysctl_oid *oidp, void *arg1, { int error = 0; int sysctl_shminfo_ret = 0; - uint64_t saved_shmmax; - uint64_t saved_shmseg; - uint64_t saved_shmmni; - uint64_t saved_shmall; + int64_t saved_shmmax; + int64_t saved_shmmin; + int64_t saved_shmseg; + int64_t saved_shmmni; + int64_t saved_shmall; error = SYSCTL_OUT(req, arg1, sizeof(int64_t)); if (error || req->newptr == USER_ADDR_NULL) @@ -1064,6 +1101,7 @@ sysctl_shminfo(__unused struct sysctl_oid *oidp, void *arg1, goto sysctl_shminfo_out; } saved_shmmax = shminfo.shmmax; + saved_shmmin = shminfo.shmmin; saved_shmseg = shminfo.shmseg; saved_shmmni = shminfo.shmmni; saved_shmall = shminfo.shmall; @@ -1075,12 +1113,19 @@ sysctl_shminfo(__unused struct sysctl_oid *oidp, void *arg1, if (arg1 == &shminfo.shmmax) { /* shmmax needs to be page-aligned */ - if (shminfo.shmmax & PAGE_MASK_64) { + if (shminfo.shmmax & PAGE_MASK_64 || shminfo.shmmax < 0) { shminfo.shmmax = saved_shmmax; sysctl_shminfo_ret = EINVAL; goto sysctl_shminfo_out; } } + else if (arg1 == &shminfo.shmmin) { + if (shminfo.shmmin < 0) { + shminfo.shmmin = saved_shmmin; + sysctl_shminfo_ret = EINVAL; + goto sysctl_shminfo_out; + } + } else if (arg1 == &shminfo.shmseg) { /* add a sanity check - 20847256 */ if (shminfo.shmseg 
> INT32_MAX || shminfo.shmseg < 0) { @@ -1130,8 +1175,8 @@ IPCS_shm_sysctl(__unused struct sysctl_oid *oidp, __unused void *arg1, SYSV_SHM_SUBSYS_LOCK(); - if (!shm_inited) { - shminit(NULL); + if ((error = shminit())) { + goto ipcs_shm_sysctl_out; } if (!IS_64BIT_PROCESS(p)) { diff --git a/bsd/kern/trace.codes b/bsd/kern/trace_codes similarity index 82% rename from bsd/kern/trace.codes rename to bsd/kern/trace_codes index c03b66bc5..3b70a29c9 100644 --- a/bsd/kern/trace.codes +++ b/bsd/kern/trace_codes @@ -100,7 +100,7 @@ 0x10c00a0 MSC_kern_invalid_#40 0x10c00a4 MSC_mach_port_guard_trap 0x10c00a8 MSC_mach_port_unguard_trap -0x10c00ac MSC_kern_invalid_#43 +0x10c00ac MSC_mach_generate_activity_id 0x10c00b0 MSC_task_name_for_pid 0x10c00b4 MSC_task_for_pid 0x10c00b8 MSC_pid_for_task @@ -127,9 +127,9 @@ 0x10c010c MSC_kern_invalid_#67 0x10c0110 MSC_kern_invalid_#68 0x10c0114 MSC_kern_invalid_#69 -0x10c0118 MSC_kern_invalid_#70 +0x10c0118 MSC_host_create_mach_voucher_trap 0x10c011c MSC_kern_invalid_#71 -0x10c0120 MSC_kern_invalid_#72 +0x10c0120 MSC_mach_voucher_extract_attr_recipe_trap 0x10c0124 MSC_kern_invalid_#73 0x10c0128 MSC_kern_invalid_#74 0x10c012c MSC_kern_invalid_#75 @@ -195,6 +195,17 @@ 0x120001c MACH_IPC_voucher_create 0x1200020 MACH_IPC_voucher_create_attr_data 0x1200024 MACH_IPC_voucher_destroy +0x1200028 MACH_IPC_kmsg_info +0x120002c MACH_IPC_kmsg_link +0x1250008 MACH_RMON_CPUUSAGE_VIOLATED +0x1250010 MACH_RMON_CPUUSAGE_VIOLATED_K32A +0x1250014 MACH_RMON_CPUUSAGE_VIOLATED_K32B +0x1250048 MACH_RMON_CPUWAKES_VIOLATED +0x1250050 MACH_RMON_CPUWAKES_VIOLATED_K32A +0x1250054 MACH_RMON_CPUWAKES_VIOLATED_K32B +0x1250088 MACH_RMON_LOGWRITES_VIOLATED +0x1250090 MACH_RMON_LOGWRITES_VIOLATED_K32A +0x1250094 MACH_RMON_LOGWRITES_VIOLATED_K32B 0x1300004 MACH_Pageout 0x1300008 MACH_vmfault 0x1300100 MACH_purgable_token_add @@ -209,7 +220,7 @@ 0x1300400 MACH_vm_check_zf_delay 0x1300404 MACH_vm_cow_delay 0x1300408 MACH_vm_zf_delay -0x130040c MACH_vm_compressor_delay
+0x130040c MACH_vm_compressor_delay 0x1300410 MACH_vm_pageout_scan 0x1300414 MACH_vm_pageout_balanceQ 0x1300418 MACH_vm_pageout_freelist @@ -223,7 +234,13 @@ 0x130048C MACH_vm_page_sleep 0x1300490 MACH_vm_page_expedite 0x13004c0 MACH_vm_pressure_event -0x1300500 MACH_vm_data_write +0x1300500 MACH_vm_data_write +0x1320000 vm_disconnect_all_page_mappings +0x1320004 vm_disconnect_task_page_mappings +0x1320008 RealFaultAddressInternal +0x132000c RealFaultAddressPurgeable +0x1320010 RealFaultAddressExternal +0x1320014 RealFaultAddressSharedCache 0x1400000 MACH_SCHED 0x1400004 MACH_STKATTACH 0x1400008 MACH_STKHANDOFF @@ -464,6 +481,8 @@ 0x301017c Cl_wait_IO 0x3010180 Vnode_Pagein 0x3010184 throttle_lowpri_io +0x3010198 rethrottle_wakeup +0x301019c rethrottle_noted 0x3010200 Vnode_Pageout 0x3010280 Vnode_WaitForWrites 0x3010300 PageoutThrottle @@ -554,6 +573,7 @@ 0x3011028 UPL_map_remove_upl 0x301102c UPL_commit_range_speculative 0x3018000 HFS_update +0x3018004 HFS_modify_block_end 0x3020000 P_WrData 0x3020004 P_WrDataDone 0x3020008 P_RdData @@ -819,6 +839,9 @@ 0x3CF0000 CP_OFFSET_IO 0x4010004 proc_exit 0x4010008 force_exit +0x401000C proc_exec +0x4010010 exit_reason_create +0x4010014 exit_reason_commit 0x4020004 MEMSTAT_scan 0x4020008 MEMSTAT_jetsam 0x402000C MEMSTAT_jetsam_hiwat @@ -832,484 +855,6 @@ 0x402002C MEMSTAT_dirty_clear 0x4020030 MEMSTAT_grp_set_properties 0x4020034 MEMSTAT_do_kill -0x40c0000 BSC_SysCall -0x40c0004 BSC_exit -0x40c0008 BSC_fork -0x40c000c BSC_read -0x40c0010 BSC_write -0x40c0014 BSC_open -0x40c0018 BSC_close -0x40c001c BSC_wait4 -0x40c0020 BSC_obs_creat -0x40c0024 BSC_link -0x40c0028 BSC_unlink -0x40c002c BSC_obs_execv -0x40c0030 BSC_chdir -0x40c0034 BSC_fchdir -0x40c0038 BSC_mknod -0x40c003c BSC_chmod -0x40c0040 BSC_chown -0x40c0044 BSC_obs_break -0x40c0048 BSC_getfsstat -0x40c004c BSC_obs_lseek -0x40c0050 BSC_getpid -0x40c0054 BSC_obs_mount -0x40c0058 BSC_obs_unmount -0x40c005c BSC_setuid -0x40c0060 BSC_getuid -0x40c0064 BSC_geteuid 
-0x40c0068 BSC_ptrace -0x40c006c BSC_recvmsg -0x40c0070 BSC_sendmsg -0x40c0074 BSC_recvfrom -0x40c0078 BSC_accept -0x40c007c BSC_getpeername -0x40c0080 BSC_getsockname -0x40c0084 BSC_access -0x40c0088 BSC_chflags -0x40c008c BSC_fchflags -0x40c0090 BSC_sync -0x40c0094 BSC_kill -0x40c0098 BSC_obs_stat -0x40c009c BSC_getppid -0x40c00a0 BSC_obs_lstat -0x40c00a4 BSC_dup -0x40c00a8 BSC_pipe -0x40c00ac BSC_getegid -0x40c00b0 BSC_profil -0x40c00b4 BSC_obs_ktrace -0x40c00b8 BSC_sigaction -0x40c00bc BSC_getgid -0x40c00c0 BSC_sigprocmask -0x40c00c4 BSC_getlogin -0x40c00c8 BSC_setlogin -0x40c00cc BSC_acct -0x40c00d0 BSC_sigpending -0x40c00d4 BSC_sigaltstack -0x40c00d8 BSC_ioctl -0x40c00dc BSC_reboot -0x40c00e0 BSC_revoke -0x40c00e4 BSC_symlink -0x40c00e8 BSC_readlink -0x40c00ec BSC_execve -0x40c00f0 BSC_umask -0x40c00f4 BSC_chroot -0x40c00f8 BSC_obs_fstat -0x40c00fc BSC_#63 -0x40c0100 BSC_obs_getpagesize -0x40c0104 BSC_msync -0x40c0108 BSC_vfork -0x40c010c BSC_obs_vread -0x40c0110 BSC_obs_vwrite -0x40c0114 BSC_obs_sbrk -0x40c0118 BSC_obs_sstk -0x40c011c BSC_obs_mmap -0x40c0120 BSC_obs_vadvise -0x40c0124 BSC_munmap -0x40c0128 BSC_mprotect -0x40c012c BSC_madvise -0x40c0130 BSC_obs_vhangup -0x40c0134 BSC_obs_vlimit -0x40c0138 BSC_mincore -0x40c013c BSC_getgroups -0x40c0140 BSC_setgroups -0x40c0144 BSC_getpgrp -0x40c0148 BSC_setpgid -0x40c014c BSC_setitimer -0x40c0150 BSC_obs_wait -0x40c0154 BSC_swapon -0x40c0158 BSC_getitimer -0x40c015c BSC_obs_gethostname -0x40c0160 BSC_obs_sethostname -0x40c0164 BSC_getdtablesize -0x40c0168 BSC_dup2 -0x40c016c BSC_obs_getdopt -0x40c0170 BSC_fcntl -0x40c0174 BSC_select -0x40c0178 BSC_obs_setdopt -0x40c017c BSC_fsync -0x40c0180 BSC_setpriority -0x40c0184 BSC_socket -0x40c0188 BSC_connect -0x40c018c BSC_obs_accept -0x40c0190 BSC_getpriority -0x40c0194 BSC_obs_send -0x40c0198 BSC_obs_recv -0x40c019c BSC_obs_sigreturn -0x40c01a0 BSC_bind -0x40c01a4 BSC_setsockopt -0x40c01a8 BSC_listen -0x40c01ac BSC_obs_vtimes -0x40c01b0 BSC_obs_sigvec -0x40c01b4 
BSC_obs_sigblock -0x40c01b8 BSC_obs_sigsetmask -0x40c01bc BSC_sigsuspend -0x40c01c0 BSC_obs_sigstack -0x40c01c4 BSC_obs_recvmsg -0x40c01c8 BSC_obs_sendmsg -0x40c01cc BSC_obs_vtrace -0x40c01d0 BSC_gettimeofday -0x40c01d4 BSC_getrusage -0x40c01d8 BSC_getsockopt -0x40c01dc BSC_obs_resuba -0x40c01e0 BSC_readv -0x40c01e4 BSC_writev -0x40c01e8 BSC_settimeofday -0x40c01ec BSC_fchown -0x40c01f0 BSC_fchmod -0x40c01f4 BSC_obs_recvfrom -0x40c01f8 BSC_setreuid -0x40c01fc BSC_setregid -0x40c0200 BSC_rename -0x40c0204 BSC_obs_truncate -0x40c0208 BSC_obs_ftruncate -0x40c020c BSC_flock -0x40c0210 BSC_mkfifo -0x40c0214 BSC_sendto -0x40c0218 BSC_shutdown -0x40c021c BSC_socketpair -0x40c0220 BSC_mkdir -0x40c0224 BSC_rmdir -0x40c0228 BSC_utimes -0x40c022c BSC_futimes -0x40c0230 BSC_adjtime -0x40c0234 BSC_obs_getpeername -0x40c0238 BSC_gethostuuid -0x40c023c BSC_obs_sethostid -0x40c0240 BSC_obs_getrlimit -0x40c0244 BSC_obs_setrlimit -0x40c0248 BSC_obs_killpg -0x40c024c BSC_setsid -0x40c0250 BSC_obs_setquota -0x40c0254 BSC_obs_qquota -0x40c0258 BSC_obs_getsockname -0x40c025c BSC_getpgid -0x40c0260 BSC_setprivexec -0x40c0264 BSC_pread -0x40c0268 BSC_pwrite -0x40c026c BSC_nfssvc -0x40c0270 BSC_obs_getdirentries -0x40c0274 BSC_statfs -0x40c0278 BSC_fstatfs -0x40c027c BSC_unmount -0x40c0280 BSC_obs_async_daemon -0x40c0284 BSC_getfh -0x40c0288 BSC_obs_getdomainname -0x40c028c BSC_obs_setdomainname -0x40c0290 BSC_#164 -0x40c0294 BSC_quotactl -0x40c0298 BSC_obs_exportfs -0x40c029c BSC_mount -0x40c02a0 BSC_obs_ustat -0x40c02a4 BSC_csops -0x40c02a8 BSC_obs_table -0x40c02ac BSC_obs_wait3 -0x40c02b0 BSC_obs_rpause -0x40c02b4 BSC_waitid -0x40c02b8 BSC_obs_getdents -0x40c02bc BSC_obs_gc_control -0x40c02c0 BSC_add_profil -0x40c02c4 BSC_#177 -0x40c02c8 BSC_kdebug_trace_string -0x40c02cc BSC_kdebug_trace64 -0x40c02d0 BSC_kdebug_trace -0x40c02d4 BSC_setgid -0x40c02d8 BSC_setegid -0x40c02dc BSC_seteuid -0x40c02e0 BSC_sigreturn -0x40c02e4 BSC_chud -0x40c02e8 BSC_#186 -0x40c02ec BSC_fdatasync -0x40c02f0 
BSC_stat -0x40c02f4 BSC_fstat -0x40c02f8 BSC_lstat -0x40c02fc BSC_pathconf -0x40c0300 BSC_fpathconf -0x40c0304 BSC_obs_getfsstat -0x40c0308 BSC_getrlimit -0x40c030c BSC_setrlimit -0x40c0310 BSC_getdirentries -0x40c0314 BSC_mmap -0x40c0318 BSC_obs__syscall -0x40c031c BSC_lseek -0x40c0320 BSC_truncate -0x40c0324 BSC_ftruncate -0x40c0328 BSC_sysctl -0x40c032c BSC_mlock -0x40c0330 BSC_munlock -0x40c0334 BSC_undelete -0x40c0338 BSC_ATsocket -0x40c033c BSC_ATgetmsg -0x40c0340 BSC_ATputmsg -0x40c0344 BSC_ATPsndreq -0x40c0348 BSC_ATPsndrsp -0x40c034c BSC_ATPgetreq -0x40c0350 BSC_ATPgetrsp -0x40c0354 BSC_#213 -0x40c0358 BSC_#214 -0x40c035c BSC_#215 -0x40c0360 BSC_open_dprotected_np -0x40c0364 BSC_obs_statv -0x40c0368 BSC_obs_lstatv -0x40c036c BSC_obs_fstatv -0x40c0370 BSC_getattrlist -0x40c0374 BSC_setattrlist -0x40c0378 BSC_getdirentriesattr -0x40c037c BSC_exchangedata -0x40c0380 BSC_checkuseraccess -0x40c0384 BSC_searchfs -0x40c0388 BSC_delete_Carbon -0x40c038c BSC_copyfile -0x40c0390 BSC_fgetattrlist -0x40c0394 BSC_fsetattrlist -0x40c0398 BSC_poll -0x40c039c BSC_watchevent -0x40c03a0 BSC_waitevent -0x40c03a4 BSC_modwatch -0x40c03a8 BSC_getxattr -0x40c03ac BSC_fgetxattr -0x40c03b0 BSC_setxattr -0x40c03b4 BSC_fsetxattr -0x40c03b8 BSC_removexattr -0x40c03bc BSC_fremovexattr -0x40c03c0 BSC_listxattr -0x40c03c4 BSC_flistxattr -0x40c03c8 BSC_fsctl -0x40c03cc BSC_initgroups -0x40c03d0 BSC_posix_spawn -0x40c03d4 BSC_ffsctl -0x40c03d8 BSC_#246 -0x40c03dc BSC_nfsclnt -0x40c03e0 BSC_fhopen -0x40c03e4 BSC_#249 -0x40c03e8 BSC_minherit -0x40c03ec BSC_semsys -0x40c03f0 BSC_msgsys -0x40c03f4 BSC_shmsys -0x40c03f8 BSC_semctl -0x40c03fc BSC_semget -0x40c0400 BSC_semop -0x40c0404 BSC_semconfig -0x40c0408 BSC_msgctl -0x40c040c BSC_msgget -0x40c0410 BSC_msgsnd -0x40c0414 BSC_msgrcv -0x40c0418 BSC_shmat -0x40c041c BSC_shmctl -0x40c0420 BSC_shmdt -0x40c0424 BSC_shmget -0x40c0428 BSC_shm_open -0x40c042c BSC_shm_unlink -0x40c0430 BSC_sem_open -0x40c0434 BSC_sem_close -0x40c0438 BSC_sem_unlink 
-0x40c043c BSC_sem_wait -0x40c0440 BSC_sem_trywait -0x40c0444 BSC_sem_post -0x40c0448 BSC_sysctlbyname -0x40c0454 BSC_open_extended -0x40c0458 BSC_umask_extended -0x40c045c BSC_stat_extended -0x40c0460 BSC_lstat_extended -0x40c0464 BSC_fstat_extended -0x40c0468 BSC_chmod_extended -0x40c046c BSC_fchmod_extended -0x40c0470 BSC_access_extended -0x40c0474 BSC_settid -0x40c0478 BSC_gettid -0x40c047c BSC_setsgroups -0x40c0480 BSC_getsgroups -0x40c0484 BSC_setwgroups -0x40c0488 BSC_getwgroups -0x40c048c BSC_mkfifo_extended -0x40c0490 BSC_mkdir_extended -0x40c0494 BSC_identitysvc -0x40c0498 BSC_shared_region_chk_np -0x40c049c BSC_shared_region_map_np -0x40c04a0 BSC_vm_pressure_monitor -0x40c04a4 BSC_psynch_rw_longrdlock -0x40c04a8 BSC_psynch_rw_yieldwrlock -0x40c04ac BSC_psynch_rw_downgrade -0x40c04b0 BSC_psynch_rw_upgrade -0x40c04b4 BSC_psynch_mutexwait -0x40c04b8 BSC_psynch_mutexdrop -0x40c04bc BSC_psynch_cvbroad -0x40c04c0 BSC_psynch_cvsignal -0x40c04c4 BSC_psynch_cvwait -0x40c04c8 BSC_psynch_rw_rdlock -0x40c04cc BSC_psynch_rw_wrlock -0x40c04d0 BSC_psynch_rw_unlock -0x40c04d4 BSC_psynch_rw_unlock2 -0x40c04d8 BSC_getsid -0x40c04dc BSC_settid_with_pid -0x40c04e0 BSC_psynch_cvclrprepost -0x40c04e4 BSC_aio_fsync -0x40c04e8 BSC_aio_return -0x40c04ec BSC_aio_suspend -0x40c04f0 BSC_aio_cancel -0x40c04f4 BSC_aio_error -0x40c04f8 BSC_aio_read -0x40c04fc BSC_aio_write -0x40c0500 BSC_lio_listio -0x40c0504 BSC_obs_pthread_cond_wait -0x40c0508 BSC_iopolicysys -0x40c050c BSC_process_policy -0x40c0510 BSC_mlockall -0x40c0514 BSC_munlockall -0x40c0518 BSC_#326 -0x40c051c BSC_issetugid -0x40c0520 BSC_pthread_kill -0x40c0524 BSC_pthread_sigmask -0x40c0528 BSC_sigwait -0x40c052c BSC_disable_threadsignal -0x40c0530 BSC_pthread_markcancel -0x40c0534 BSC_pthread_canceled -0x40c0538 BSC_semwait_signal -0x40c053c BSC_obs_utrace -0x40c0540 BSC_proc_info -0x40c0544 BSC_sendfile -0x40c0548 BSC_stat64 -0x40c054c BSC_fstat64 -0x40c0550 BSC_lstat64 -0x40c0554 BSC_stat64_extended -0x40c0558 
BSC_lstat64_extended -0x40c055c BSC_fstat64_extended -0x40c0560 BSC_getdirentries64 -0x40c0564 BSC_statfs64 -0x40c0568 BSC_fstatfs64 -0x40c056c BSC_getfsstat64 -0x40c0570 BSC_pthread_chdir -0x40c0574 BSC_pthread_fchdir -0x40c0578 BSC_audit -0x40c057c BSC_auditon -0x40c0580 BSC_#352 -0x40c0584 BSC_getauid -0x40c0588 BSC_setauid -0x40c058c BSC_getaudit -0x40c0590 BSC_setaudit -0x40c0594 BSC_getaudit_addr -0x40c0598 BSC_setaudit_addr -0x40c059c BSC_auditctl -0x40c05a0 BSC_bsdthread_create -0x40c05a4 BSC_bsdthread_terminate -0x40c05a8 BSC_kqueue -0x40c05ac BSC_kevent -0x40c05b0 BSC_lchown -0x40c05b4 BSC_stack_snapshot -0x40c05b8 BSC_bsdthread_register -0x40c05bc BSC_workq_open -0x40c05c0 BSC_workq_kernreturn -0x40c05c4 BSC_kevent64 -0x40c05c8 BSC_obs_semwait_signal -0x40c05cc BSC_obs_semwait_signal_nocancel -0x40c05d0 BSC_thread_selfid -0x40c05d4 BSC_ledger -0x40c05d8 BSC_#374 -0x40c05dc BSC_#375 -0x40c05e0 BSC_#376 -0x40c05e4 BSC_#377 -0x40c05e8 BSC_#378 -0x40c05ec BSC_#379 -0x40c05f0 BSC_mac_execve -0x40c05f4 BSC_mac_syscall -0x40c05f8 BSC_mac_get_file -0x40c0600 BSC_mac_get_link -0x40c0604 BSC_mac_set_link -0x40c0608 BSC_mac_get_proc -0x40c060c BSC_mac_set_proc -0x40c0610 BSC_mac_get_fd -0x40c0614 BSC_mac_set_fd -0x40c0618 BSC_mac_get_pid -0x40c061c BSC_#391 -0x40c0620 BSC_#392 -0x40c0624 BSC_#393 -0x40c0628 BSC_#394 -0x40c062c BSC_#395 -0x40c0630 BSC_read_nocancel -0x40c0634 BSC_write_nocancel -0x40c0638 BSC_open_nocancel -0x40c063c BSC_close_nocancel -0x40c0640 BSC_wait4_nocancel -0x40c0644 BSC_recvmsg_nocancel -0x40c0648 BSC_sendmsg_nocancel -0x40c064c BSC_recvfrom_nocancel -0x40c0650 BSC_accept_nocancel -0x40c0654 BSC_msync_nocancel -0x40c0658 BSC_fcntl_nocancel -0x40c065c BSC_select_nocancel -0x40c0660 BSC_fsync_nocancel -0x40c0664 BSC_connect_nocancel -0x40c0668 BSC_sigsuspend_nocancel -0x40c066c BSC_readv_nocancel -0x40c0670 BSC_writev_nocancel -0x40c0674 BSC_sendto_nocancel -0x40c0678 BSC_pread_nocancel -0x40c067c BSC_pwrite_nocancel -0x40c0680 
BSC_waitid_nocancel -0x40c0684 BSC_poll_nocancel -0x40c0688 BSC_msgsnd_nocancel -0x40c068c BSC_msgrcv_nocancel -0x40c0690 BSC_sem_wait_nocancel -0x40c0694 BSC_aio_suspend_nocancel -0x40c0698 BSC_sigwait_nocancel -0x40c069c BSC_semwait_signal_nocancel -0x40c06a0 BSC_mac_mount -0x40c06a4 BSC_mac_get_mount -0x40c06a8 BSC_mac_getfsstat -0x40c06ac BSC_fsgetpath -0x40c06b0 BSC_audit_session -0x40c06b4 BSC_audit_session_join -0x40c06b8 BSC_fileport_makeport -0x40c06bc BSC_fileport_makefd -0x40c06c0 BSC_audit_session_port -0x40c06c4 BSC_pid_suspend -0x40c06c8 BSC_pid_resume -0x40c06cc BSC_pid_hibernate -0x40c06d0 BSC_pid_shutdown_sockets -0x40c06d4 BSC_shared_region_slide_np -0x40c06d8 BSC_shared_region_map_and_slide_np -0x40c06dc BSC_kas_info -0x40c06e0 BSC_memorystatus_control -0x40c06e4 BSC_guarded_open_np -0x40c06e8 BSC_guarded_close_np -0x40c06ec BSC_guarded_kqueue_np -0x40c06f0 BSC_change_fdguard_np -0x40c06f4 BSC___proc_suppress -0x40c06f8 BSC_proc_rlimit_control -0x40c06fc BSC_connectx -0x40c0700 BSC_disconnectx -0x40c0704 BSC_peeloff -0x40c0708 BSC_socket_delegate -0x40c070c BSC_telemetry -0x40c0710 BSC_proc_uuid_policy -0x40c0714 BSC_memorystatus_get_level -0x40c0718 BSC_system_override -0x40c071c BSC_vfs_purge -0x40c0720 BSC_sfi_ctl -0x40c0724 BSC_sfi_pidctl -0x40c0728 BSC_coalition -0x40c072c BSC_coalition_info -0x40c0734 BSC_getattrlistbulk -0x40c073c BSC_openat -0x40c0740 BSC_openat_nocancel -0x40c0744 BSC_renameat -0x40c074c BSC_chmodat -0x40c0750 BSC_chownat -0x40c0754 BSC_fstatat -0x40c0758 BSC_fstatat64 -0x40c075c BSC_linkat -0x40c0760 BSC_unlinkat -0x40c0764 BSC_readlinkat -0x40c0768 BSC_symlinkat -0x40c076c BSC_mkdirat -0x40c0770 BSC_getattrlistat -0x40c0778 BSC_bsdthread_ctl -0x40c0780 BSC_recvmsg_x -0x40c0784 BSC_sendmsg_x -0x40c0788 BSC_thread_selfusage -0x40c07a4 BSC_mremap_encrypted -0x40c07b8 BSC_persona -0x40c07cc BSC_work_interval_ctl 0x40e0104 BSC_msync_extended_info 0x40e0264 BSC_pread_extended_info 0x40e0268 BSC_pwrite_extended_info @@ 
-1384,7 +929,14 @@ 0x50700c0 PM_ClientNotify 0x50700c4 PM_AppNotify 0x50700d4 PM_IdleCancel -0x50700d8 PM_SystemTracePoint +0x50700d8 PM_SleepWakeTracePoint +0x50700dc PM_QuiescePowerTree +0x50700e0 PM_ComponentWakeProgress +0x50700e4 PM_UserActiveState +0x50700e8 PM_AppResponseDelay +0x50700ec PM_DriverResponseDelay +0x50700f0 PM_PCIDevChangeStart +0x50700f4 PM_PCIDevChangeDone 0x5080004 IOSERVICE_BUSY 0x5080008 IOSERVICE_NONBUSY 0x508000c IOSERVICE_MODULESTALL @@ -1585,9 +1137,12 @@ 0x7000004 TRACE_DATA_NEWTHREAD 0x7000008 TRACE_DATA_EXEC 0x700000c TRACE_DATA_THREAD_TERMINATE +0x7000010 TRACE_DATA_THREAD_TERMINATE_PID 0x7010000 TRACE_STRING_GLOBAL 0x7010004 TRACE_STRING_NEWTHREAD 0x7010008 TRACE_STRING_EXEC +0x701000c TRACE_STRING_PROC_EXIT +0x7010010 TRACE_STRING_THREADNAME 0x7020000 TRACE_PANIC 0x7020004 TRACE_TIMESTAMPS 0x7020008 TRACE_LOST_EVENTS @@ -1844,6 +1399,21 @@ 0x1f040010 DYLD_map_bundle_image 0x1f040014 DYLD_load_dependent_libraries 0x1f040018 DYLD_notify_prebinding_agent +0x1f050000 DYLD_uuid_map_a +0x1f050004 DYLD_uuid_map_b +0x1f050008 DYLD_uuid_map_32_a +0x1f05000c DYLD_uuid_map_32_b +0x1f050010 DYLD_uuid_map_32_c +0x1f050014 DYLD_uuid_unmap_a +0x1f050018 DYLD_uuid_unmap_b +0x1f05001c DYLD_uuid_unmap_32_a +0x1f050020 DYLD_uuid_unmap_32_b +0x1f050024 DYLD_uuid_unmap_32_c +0x1f050028 DYLD_uuid_shared_cache_a +0x1f05002c DYLD_uuid_shared_cache_b +0x1f050030 DYLD_uuid_shared_cache_32_a +0x1f050034 DYLD_uuid_shared_cache_32_b +0x1f050038 DYLD_uuid_shared_cache_32_c 0x1ff10000 SCROLL_BEGIN_obs 0x1ff10100 SCROLL_END_obs 0x1ff20000 BOOT_BEGIN_obs @@ -1953,7 +1523,6 @@ 0x21090004 PHD_DAEMON_FINISH 0x21090010 PHD_SYNCNOW_START 0x21090014 PHD_SYNCNOW_FINISH -0x210a0fac DISPATCH_voucher_transport 0x210b0000 TAL_APP_LAUNCH_START 0x210b0004 TAL_APP_LAUNCH_UNSUSPENDED 0x210b0008 TAL_APP_LAUNCH_UNTHROTTLED @@ -1963,19 +1532,6 @@ 0x210c0000 NSAPPLICATION_RECEIVED_KEYEVENT 0x210c0004 NSWINDOW_FLUSHED 0x210c0008 NSTEXTVIEW_PROCESSED_KEYEVENT -0x22000004 
LAUNCHD_starting -0x22000008 LAUNCHD_exiting -0x2200000c LAUNCHD_finding_stray_pg -0x22000010 LAUNCHD_finding_all_strays -0x22000014 LAUNCHD_finding_execless -0x22000018 LAUNCHD_finding_weird_uids -0x2200001c LAUNCHD_data_pack -0x22000020 LAUNCHD_data_unpack -0x22000024 LAUNCHD_bug -0x22000028 LAUNCHD_mach_ipc -0x2200002c LAUNCHD_bsd_kevent -0x22000030 LAUNCHD_vproc_trans_incr -0x22000034 LAUNCHD_vproc_trans_decr 0x25000000 PERF_Event 0x25010000 PERF_THD_Sample 0x25010004 PERF_THD_Data @@ -1983,6 +1539,17 @@ 0x2501000c PERF_THD_XPend 0x25010010 PERF_THD_XData 0x25010014 PERF_THD_CSwitch +0x25010018 PERF_THD_Sched_Sample +0x2501001c PERF_THD_Sched_Data +0x25010020 PERF_THD_Snap_Sample +0x25010024 PERF_THD_Snap_Data +0x25010028 PERF_THD_Disp_Sample +0x2501002c PERF_THD_Disp_Data +0x25010030 PERF_THD_Disp_Pend +0x25010034 PERF_THD_Snap_Data_32 +0x25010038 PERF_THD_Disp_Data_32 +0x2501003c PERF_THD_Sched_Data1_32 +0x25010040 PERF_THD_Sched_Data2_32 0x25020000 PERF_STK_KSample 0x25020004 PERF_STK_USched 0x25020008 PERF_STK_USample @@ -1990,6 +1557,9 @@ 0x25020010 PERF_STK_UData 0x25020014 PERF_STK_KHdr 0x25020018 PERF_STK_UHdr +0x2502001c PERF_STK_Error +0x25020020 PERF_STK_Backtrace +0x25020024 PERF_STK_Log 0x25030000 PERF_TMR_AllSched 0x25030004 PERF_TMR_Schedule 0x25030008 PERF_TMR_Handler @@ -1999,6 +1569,10 @@ 0x2504000c PERF_ATS_Pause 0x25040010 PERF_ATS_Idle 0x25040014 PERF_ATS_Sample +0x25040018 PERF_PET_Sched +0x2504001c PERF_PET_End +0x25040020 PERF_PET_Sample_Task +0x25040024 PERF_PET_Sample_Thread 0x25050000 PERF_AST_Handler 0x25050004 PERF_AST_Error 0x25060000 PERF_KPC_Handler @@ -2011,9 +1585,14 @@ 0x2506001c PERF_KPC_ConfReg32 0x25060020 PERF_KPC_Data_Thread 0x25060024 PERF_KPC_Data_Thread32 +0x25060028 PERF_KPC_CPU_Sample +0x2506002c PERF_KPC_Thd_Sample 0x25070000 PERF_KDBG_Handler -0x25080000 PERF_CS_Handler -0x25090000 PERF_SP_Handler +0x25080000 PERF_TK_Snap_Sample +0x25080004 PERF_TK_Snap_Data1 +0x25080008 PERF_TK_Snap_Data2 +0x2508000c 
PERF_TK_Snap_Data1_32 +0x25080010 PERF_TK_Snap_Data2_32 0x250a0000 PERF_MI_Sample 0x250a0004 PERF_MI_Data 0x26100008 imp_assertion_hold diff --git a/bsd/kern/tty.c b/bsd/kern/tty.c index b863e27ff..580852a0c 100644 --- a/bsd/kern/tty.c +++ b/bsd/kern/tty.c @@ -2426,7 +2426,9 @@ ttwrite(struct tty *tp, struct uio *uio, int flag) i = b_to_q((u_char *)cp, ce, &tp->t_outq); ce -= i; tp->t_column += ce; - cp += ce, cc -= ce, tk_nout += ce; + cp += ce; + cc -= ce; + tk_nout += ce; tp->t_outcc += ce; if (i > 0) { /* out of space */ diff --git a/bsd/kern/tty_compat.c b/bsd/kern/tty_compat.c index 6cab324ac..4348bd133 100644 --- a/bsd/kern/tty_compat.c +++ b/bsd/kern/tty_compat.c @@ -224,7 +224,7 @@ ttsetcompat(struct tty *tp, u_long *com, caddr_t data, struct termios *term) * pending input is not discarded. */ { - register struct sgttyb *sg = (struct sgttyb *)data; + struct sgttyb *sg = (struct sgttyb *)data; int speed; if ((speed = sg->sg_ispeed) > MAX_SPEED || speed < 0) @@ -253,7 +253,7 @@ ttsetcompat(struct tty *tp, u_long *com, caddr_t data, struct termios *term) */ { struct tchars *tc = (struct tchars *)data; - register cc_t *cc; + cc_t *cc; cc = term->c_cc; cc[VINTR] = tc->t_intrc; @@ -274,7 +274,7 @@ ttsetcompat(struct tty *tp, u_long *com, caddr_t data, struct termios *term) */ { struct ltchars *ltc = (struct ltchars *)data; - register cc_t *cc; + cc_t *cc; cc = term->c_cc; cc[VSUSP] = ltc->t_suspc; @@ -387,8 +387,8 @@ ttcompat(struct tty *tp, u_long com, caddr_t data, int flag, struct proc *p) * flags, into the structure pointed to by 'data'. 
*/ { - register struct sgttyb *sg = (struct sgttyb *)data; - register cc_t *cc = tp->t_cc; + struct sgttyb *sg = (struct sgttyb *)data; + cc_t *cc = tp->t_cc; sg->sg_ospeed = ttcompatspeedtab(tp->t_ospeed, compatspeeds); if (tp->t_ispeed == 0) @@ -407,7 +407,7 @@ ttcompat(struct tty *tp, u_long com, caddr_t data, int flag, struct proc *p) */ { struct tchars *tc = (struct tchars *)data; - register cc_t *cc = tp->t_cc; + cc_t *cc = tp->t_cc; tc->t_intrc = cc[VINTR]; tc->t_quitc = cc[VQUIT]; @@ -424,7 +424,7 @@ ttcompat(struct tty *tp, u_long com, caddr_t data, int flag, struct proc *p) */ { struct ltchars *ltc = (struct ltchars *)data; - register cc_t *cc = tp->t_cc; + cc_t *cc = tp->t_cc; ltc->t_suspc = cc[VSUSP]; ltc->t_dsuspc = cc[VDSUSP]; @@ -517,11 +517,11 @@ ttcompat(struct tty *tp, u_long com, caddr_t data, int flag, struct proc *p) static int ttcompatgetflags(struct tty *tp) { - register tcflag_t iflag = tp->t_iflag; - register tcflag_t lflag = tp->t_lflag; - register tcflag_t oflag = tp->t_oflag; - register tcflag_t cflag = tp->t_cflag; - register int flags = 0; + tcflag_t iflag = tp->t_iflag; + tcflag_t lflag = tp->t_lflag; + tcflag_t oflag = tp->t_oflag; + tcflag_t cflag = tp->t_cflag; + int flags = 0; if (iflag&IXOFF) flags |= TANDEM; @@ -589,11 +589,11 @@ ttcompatgetflags(struct tty *tp) static void ttcompatsetflags(struct tty *tp, struct termios *t) { - register int flags = tp->t_flags; - register tcflag_t iflag = t->c_iflag; - register tcflag_t oflag = t->c_oflag; - register tcflag_t lflag = t->c_lflag; - register tcflag_t cflag = t->c_cflag; + int flags = tp->t_flags; + tcflag_t iflag = t->c_iflag; + tcflag_t oflag = t->c_oflag; + tcflag_t lflag = t->c_lflag; + tcflag_t cflag = t->c_cflag; if (flags & RAW) { iflag = IGNBRK; @@ -680,11 +680,11 @@ ttcompatsetflags(struct tty *tp, struct termios *t) static void ttcompatsetlflags(struct tty *tp, struct termios *t) { - register int flags = tp->t_flags; - register tcflag_t iflag = t->c_iflag; - register 
tcflag_t oflag = t->c_oflag; - register tcflag_t lflag = t->c_lflag; - register tcflag_t cflag = t->c_cflag; + int flags = tp->t_flags; + tcflag_t iflag = t->c_iflag; + tcflag_t oflag = t->c_oflag; + tcflag_t lflag = t->c_lflag; + tcflag_t cflag = t->c_cflag; iflag &= ~(PARMRK|IGNPAR|IGNCR|INLCR); if (flags&CRTERA) diff --git a/bsd/kern/tty_ptmx.c b/bsd/kern/tty_ptmx.c index 6f1c71c62..43db005d0 100644 --- a/bsd/kern/tty_ptmx.c +++ b/bsd/kern/tty_ptmx.c @@ -414,6 +414,8 @@ ptmx_free_ioctl(int minor, int open_flag) if (!(_state.pis_ioctl_list[minor]->pt_flags & (PF_OPEN_M|PF_OPEN_S))) { /* Mark as free so it can be reallocated later */ old_ptmx_ioctl = _state.pis_ioctl_list[ minor]; + _state.pis_ioctl_list[minor] = NULL; + _state.pis_free++; } DEVFS_UNLOCK(); @@ -429,12 +431,6 @@ ptmx_free_ioctl(int minor, int open_flag) devfs_remove(old_ptmx_ioctl->pt_devhandle); ttyfree(old_ptmx_ioctl->pt_tty); FREE(old_ptmx_ioctl, M_TTYS); - - /* Don't remove the entry until the devfs slot is free */ - DEVFS_LOCK(); - _state.pis_ioctl_list[minor] = NULL; - _state.pis_free++; - DEVFS_UNLOCK(); } return (0); /* Success */ @@ -505,11 +501,15 @@ ptmx_clone(__unused dev_t dev, int action) int ptsd_kqfilter(dev_t, struct knote *); static void ptsd_kqops_detach(struct knote *); static int ptsd_kqops_event(struct knote *, long); +static int ptsd_kqops_touch(struct knote *kn, struct kevent_internal_s *kev); +static int ptsd_kqops_process(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev); -static struct filterops ptsd_kqops = { +struct filterops ptsd_kqops = { .f_isfd = 1, .f_detach = ptsd_kqops_detach, .f_event = ptsd_kqops_event, + .f_touch = ptsd_kqops_touch, + .f_process = ptsd_kqops_process, }; #define PTSD_KNOTE_VALID NULL @@ -550,15 +550,12 @@ ptsd_kqops_detach(struct knote *kn) } static int -ptsd_kqops_event(struct knote *kn, long hint) +ptsd_kqops_common(struct knote *kn, dev_t dev, long hint) { struct ptmx_ioctl *pti; struct tty *tp; - dev_t dev = 
(dev_t)kn->kn_hookid; int retval = 0; - ptsd_kevent_mtx_lock(minor(dev)); - do { if (kn->kn_hook != PTSD_KNOTE_VALID ) { /* We were revoked */ @@ -601,12 +598,67 @@ ptsd_kqops_event(struct knote *kn, long hint) if (hint == 0) tty_unlock(tp); - } while (0); - ptsd_kevent_mtx_unlock(minor(dev)); + } while (0); return (retval); } + +static int +ptsd_kqops_event(struct knote *kn, long hint) +{ + dev_t dev = (dev_t)kn->kn_hookid; + int res; + + ptsd_kevent_mtx_lock(minor(dev)); + res = ptsd_kqops_common(kn, dev, hint); + ptsd_kevent_mtx_unlock(minor(dev)); + return res; +} + + +static int +ptsd_kqops_touch(struct knote *kn, struct kevent_internal_s *kev) +{ + dev_t dev = (dev_t)kn->kn_hookid; + int res; + + ptsd_kevent_mtx_lock(minor(dev)); + + /* accept new kevent state */ + kn->kn_sfflags = kev->fflags; + kn->kn_sdata = kev->data; + if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0) + kn->kn_udata = kev->udata; + + /* recapture fired state of knote */ + res = ptsd_kqops_common(kn, dev, 0); + + ptsd_kevent_mtx_unlock(minor(dev)); + + return res; +} + +static int +ptsd_kqops_process(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev) +{ +#pragma unused(data) + dev_t dev = (dev_t)kn->kn_hookid; + int res; + + ptsd_kevent_mtx_lock(minor(dev)); + res = ptsd_kqops_common(kn, dev, 0); + if (res) { + *kev = kn->kn_kevent; + if (kn->kn_flags & EV_CLEAR) { + kn->kn_fflags = 0; + kn->kn_data = 0; + } + } + ptsd_kevent_mtx_unlock(minor(dev)); + return res; +} + int ptsd_kqfilter(dev_t dev, struct knote *kn) { @@ -616,11 +668,15 @@ ptsd_kqfilter(dev_t dev, struct knote *kn) /* make sure we're talking about the right device type */ if (cdevsw[major(dev)].d_open != ptsopen) { - return (EINVAL); + kn->kn_flags = EV_ERROR; + kn->kn_data = EINVAL; + return 0; } if ((pti = ptmx_get_ioctl(minor(dev), 0)) == NULL) { - return (ENXIO); + kn->kn_flags = EV_ERROR; + kn->kn_data = ENXIO; + return 0; } tp = pti->pt_tty; @@ -628,7 +684,7 @@ ptsd_kqfilter(dev_t dev, struct 
knote *kn) kn->kn_hookid = dev; kn->kn_hook = PTSD_KNOTE_VALID; - kn->kn_fop = &ptsd_kqops; + kn->kn_filtid = EVFILTID_PTSD; switch (kn->kn_filter) { case EVFILT_READ: @@ -638,11 +694,20 @@ ptsd_kqfilter(dev_t dev, struct knote *kn) KNOTE_ATTACH(&tp->t_wsel.si_note, kn); break; default: - retval = EINVAL; + kn->kn_flags = EV_ERROR; + kn->kn_data = EINVAL; break; } tty_unlock(tp); + + ptsd_kevent_mtx_lock(minor(dev)); + + /* capture current event state */ + retval = ptsd_kqops_common(kn, dev, 0); + + ptsd_kevent_mtx_unlock(minor(dev)); + return (retval); } diff --git a/bsd/kern/tty_subr.c b/bsd/kern/tty_subr.c index 89bc09fe0..bfac6579b 100644 --- a/bsd/kern/tty_subr.c +++ b/bsd/kern/tty_subr.c @@ -203,8 +203,8 @@ int ndqb(struct clist *clp, int flag) { int count = 0; - register int i; - register int cc; + int i; + int cc; if ((cc = clp->c_cc) == 0) goto out; @@ -271,7 +271,7 @@ ndflush(struct clist *clp, int count) int putc(int c, struct clist *clp) { - register int i; + int i; if (clp->c_cc == 0) { if (!clp->c_cs) { @@ -320,7 +320,7 @@ void clrbits(u_char *cp, int off, int len) { int sby, sbi, eby, ebi; - register int i; + int i; u_char mask; if(len==1) { diff --git a/bsd/kern/ubc_subr.c b/bsd/kern/ubc_subr.c index 3e4353a97..27a8cc511 100644 --- a/bsd/kern/ubc_subr.c +++ b/bsd/kern/ubc_subr.c @@ -110,6 +110,9 @@ static int ubc_umcallback(vnode_t, void *); static int ubc_msync_internal(vnode_t, off_t, off_t, off_t *, int, int *); static void ubc_cs_free(struct ubc_info *uip); +static boolean_t ubc_cs_supports_multilevel_hash(struct cs_blob *blob); +static void ubc_cs_convert_to_multilevel_hash(struct cs_blob *blob); + struct zone *ubc_info_zone; static uint32_t cs_blob_generation_count = 1; @@ -121,9 +124,6 @@ static uint32_t cs_blob_generation_count = 1; extern int cs_debug; #define PAGE_SHIFT_4K (12) -#define PAGE_SIZE_4K ((1<magic) != CSMAGIC_CODEDIRECTORY) return EBADEXEC; - if (cd->pageSize != PAGE_SHIFT_4K) + if (cd->pageSize < PAGE_SHIFT_4K || cd->pageSize 
> PAGE_SHIFT) return EBADEXEC; hashtype = cs_find_md(cd->hashType); if (hashtype == NULL) @@ -469,12 +469,14 @@ cs_validate_blob(const CS_GenericBlob *blob, size_t length) static int cs_validate_csblob(const uint8_t *addr, size_t length, - const CS_CodeDirectory **rcd) + const CS_CodeDirectory **rcd, + const CS_GenericBlob **rentitlements) { const CS_GenericBlob *blob = (const CS_GenericBlob *)(const void *)addr; int error; *rcd = NULL; + *rentitlements = NULL; error = cs_validate_blob(blob, length); if (error) @@ -526,15 +528,25 @@ cs_validate_csblob(const uint8_t *addr, size_t length, if (best_cd == NULL || rank > best_rank) { best_cd = candidate; best_rank = rank; + + if (cs_debug > 2) + printf("using CodeDirectory type %d (rank %d)\n", (int)best_cd->hashType, best_rank); + *rcd = best_cd; } else if (best_cd != NULL && rank == best_rank) { /* repeat of a hash type (1:1 mapped to ranks), illegal and suspicious */ - printf("multiple hash=%d CodeDirectories in signature; rejecting", best_cd->hashType); + printf("multiple hash=%d CodeDirectories in signature; rejecting\n", best_cd->hashType); + return EBADEXEC; + } + } else if (type == CSSLOT_ENTITLEMENTS) { + if (ntohl(subBlob->magic) != CSMAGIC_EMBEDDED_ENTITLEMENTS) { + return EBADEXEC; + } + if (*rentitlements != NULL) { + printf("multiple entitlements blobs\n"); return EBADEXEC; } + *rentitlements = subBlob; } - if (best_cd && cs_debug > 2) - printf("using CodeDirectory type %d (rank %d)\n", (int)best_cd->hashType, best_rank); - *rcd = best_cd; } } else if (ntohl(blob->magic) == CSMAGIC_CODEDIRECTORY) { @@ -636,7 +648,11 @@ csblob_get_entitlements(struct cs_blob *csblob, void **out_start, size_t *out_le code_dir = csblob->csb_cd; - entitlements = csblob_find_blob(csblob, CSSLOT_ENTITLEMENTS, CSMAGIC_EMBEDDED_ENTITLEMENTS); + if ((csblob->csb_flags & CS_VALID) == 0) { + entitlements = NULL; + } else { + entitlements = csblob->csb_entitlements_blob; + } embedded_hash = find_special_slot(code_dir, 
csblob->csb_hashtype->cs_size, CSSLOT_ENTITLEMENTS); if (embedded_hash == NULL) { @@ -765,7 +781,7 @@ ubc_info_init_withsize(struct vnode *vp, off_t filesize) static int ubc_info_init_internal(vnode_t vp, int withfsize, off_t filesize) { - register struct ubc_info *uip; + struct ubc_info *uip; void * pager; int error = 0; kern_return_t kret; @@ -1766,8 +1782,19 @@ ubc_map(vnode_t vp, int flags) error = VNOP_MMAP(vp, flags, vfs_context_current()); - if (error != EPERM) - error = 0; + /* + * rdar://problem/22587101 required that we stop propagating + * EPERM up the stack. Otherwise, we would have to funnel up + * the error at all the call sites for memory_object_map(). + * The risk is in having to undo the map/object/entry state at + * all these call sites. It would also affect more than just mmap() + * e.g. vm_remap(). + * + * if (error != EPERM) + * error = 0; + */ + + error = 0; vnode_lock_spin(vp); @@ -1790,8 +1817,13 @@ ubc_map(vnode_t vp, int flags) if (need_wakeup) wakeup(&uip->ui_flags); - if (need_ref) - vnode_ref(vp); + if (need_ref) { + /* + * Make sure we get a ref as we can't unwind from here + */ + if (vnode_ref_ext(vp, 0, VNODE_REF_FORCE)) + panic("%s : VNODE_REF_FORCE failed\n", __FUNCTION__); + } } return (error); } @@ -2682,7 +2714,6 @@ boolean_t ubc_is_mapped_writable(const struct vnode *vp) /* * CODE SIGNING */ -#define CS_BLOB_PAGEABLE 0 static volatile SInt32 cs_blob_size = 0; static volatile SInt32 cs_blob_count = 0; static SInt32 cs_blob_size_peak = 0; @@ -2735,17 +2766,12 @@ ubc_cs_blob_allocate( { kern_return_t kr; -#if CS_BLOB_PAGEABLE - *blob_size_p = round_page(*blob_size_p); - kr = kmem_alloc(kernel_map, blob_addr_p, *blob_size_p, VM_KERN_MEMORY_SECURITY); -#else /* CS_BLOB_PAGEABLE */ *blob_addr_p = (vm_offset_t) kalloc_tag(*blob_size_p, VM_KERN_MEMORY_SECURITY); if (*blob_addr_p == 0) { kr = KERN_NO_SPACE; } else { kr = KERN_SUCCESS; } -#endif /* CS_BLOB_PAGEABLE */ return kr; } @@ -2754,11 +2780,255 @@ ubc_cs_blob_deallocate( 
vm_offset_t blob_addr, vm_size_t blob_size) { -#if CS_BLOB_PAGEABLE - kmem_free(kernel_map, blob_addr, blob_size); -#else /* CS_BLOB_PAGEABLE */ kfree((void *) blob_addr, blob_size); -#endif /* CS_BLOB_PAGEABLE */ +} + +/* + * Some codesigned files use a lowest common denominator page size of + * 4KiB, but can be used on systems that have a runtime page size of + * 16KiB. Since faults will only occur on 16KiB ranges in + * cs_validate_range(), we can convert the original Code Directory to + * a multi-level scheme where groups of 4 hashes are combined to form + * a new hash, which represents 16KiB in the on-disk file. This can + * reduce the wired memory requirement for the Code Directory by + * 75%. Care must be taken for binaries that use the "fourk" VM pager + * for unaligned access, which may still attempt to validate on + * non-16KiB multiples for compatibility with 3rd party binaries. + */ +static boolean_t +ubc_cs_supports_multilevel_hash(struct cs_blob *blob) +{ + const CS_CodeDirectory *cd; + + /* + * Only applies to binaries that ship as part of the OS, + * primarily the shared cache. + */ + if (!blob->csb_platform_binary || blob->csb_teamid != NULL) { + return FALSE; + } + + /* + * If the runtime page size matches the code signing page + * size, there is no work to do. 
+ */ + if (PAGE_SHIFT <= blob->csb_hash_pageshift) { + return FALSE; + } + + cd = blob->csb_cd; + + /* + * There must be a valid integral multiple of hashes + */ + if (ntohl(cd->nCodeSlots) & (PAGE_MASK >> blob->csb_hash_pageshift)) { + return FALSE; + } + + /* + * Scatter lists must also have ranges that have an integral number of hashes + */ + if ((ntohl(cd->version) >= CS_SUPPORTSSCATTER) && (ntohl(cd->scatterOffset))) { + + const SC_Scatter *scatter = (const SC_Scatter*) + ((const char*)cd + ntohl(cd->scatterOffset)); + /* iterate all scatter structs to make sure they are all aligned */ + do { + uint32_t sbase = ntohl(scatter->base); + uint32_t scount = ntohl(scatter->count); + + /* last scatter? */ + if (scount == 0) { + break; + } + + if (sbase & (PAGE_MASK >> blob->csb_hash_pageshift)) { + return FALSE; + } + + if (scount & (PAGE_MASK >> blob->csb_hash_pageshift)) { + return FALSE; + } + + scatter++; + } while(1); + } + + /* Covered range must be a multiple of the new page size */ + if (ntohl(cd->codeLimit) & PAGE_MASK) { + return FALSE; + } + + /* All checks pass */ + return TRUE; +} + +/* + * All state and preconditions were checked before, so this + * function cannot fail. 
+ */ +static void +ubc_cs_convert_to_multilevel_hash(struct cs_blob *blob) +{ + const CS_CodeDirectory *old_cd, *cd; + CS_CodeDirectory *new_cd; + const CS_GenericBlob *entitlements; + vm_offset_t new_blob_addr; + vm_size_t new_blob_size; + vm_size_t new_cdsize; + kern_return_t kr; + int error; + + uint32_t hashes_per_new_hash_shift = (uint32_t)(PAGE_SHIFT - blob->csb_hash_pageshift); + + if (cs_debug > 1) { + printf("CODE SIGNING: Attempting to convert Code Directory for %lu -> %lu page shift\n", + (unsigned long)blob->csb_hash_pageshift, (unsigned long)PAGE_SHIFT); + } + + old_cd = blob->csb_cd; + + /* Up to the hashes, we can copy all data */ + new_cdsize = ntohl(old_cd->hashOffset); + new_cdsize += (ntohl(old_cd->nCodeSlots) >> hashes_per_new_hash_shift) * old_cd->hashSize; + + new_blob_size = sizeof(CS_SuperBlob); + new_blob_size += sizeof(CS_BlobIndex); + new_blob_size += new_cdsize; + + if (blob->csb_entitlements_blob) { + /* We need to add a slot for the entitlements */ + new_blob_size += sizeof(CS_BlobIndex); + new_blob_size += ntohl(blob->csb_entitlements_blob->length); + } + + kr = ubc_cs_blob_allocate(&new_blob_addr, &new_blob_size); + if (kr != KERN_SUCCESS) { + if (cs_debug > 1) { + printf("CODE SIGNING: Failed to allocate memory for new Code Signing Blob: %d\n", + kr); + } + return; + } + + CS_SuperBlob *new_superblob; + + new_superblob = (CS_SuperBlob *)new_blob_addr; + new_superblob->magic = htonl(CSMAGIC_EMBEDDED_SIGNATURE); + new_superblob->length = htonl((uint32_t)new_blob_size); + if (blob->csb_entitlements_blob) { + vm_size_t ent_offset, cd_offset; + + cd_offset = sizeof(CS_SuperBlob) + 2 * sizeof(CS_BlobIndex); + ent_offset = cd_offset + new_cdsize; + + new_superblob->count = htonl(2); + new_superblob->index[0].type = htonl(CSSLOT_CODEDIRECTORY); + new_superblob->index[0].offset = htonl((uint32_t)cd_offset); + new_superblob->index[1].type = htonl(CSSLOT_ENTITLEMENTS); + new_superblob->index[1].offset = htonl((uint32_t)ent_offset); + + 
memcpy((void *)(new_blob_addr + ent_offset), blob->csb_entitlements_blob, ntohl(blob->csb_entitlements_blob->length)); + + new_cd = (CS_CodeDirectory *)(new_blob_addr + cd_offset); + } else { + vm_size_t cd_offset; + + cd_offset = sizeof(CS_SuperBlob) + 1 * sizeof(CS_BlobIndex); + + new_superblob->count = htonl(1); + new_superblob->index[0].type = htonl(CSSLOT_CODEDIRECTORY); + new_superblob->index[0].offset = htonl((uint32_t)cd_offset); + + new_cd = (CS_CodeDirectory *)new_blob_addr; + } + + memcpy(new_cd, old_cd, ntohl(old_cd->hashOffset)); + + /* Update fields in the Code Directory structure */ + new_cd->length = htonl((uint32_t)new_cdsize); + + uint32_t nCodeSlots = ntohl(new_cd->nCodeSlots); + nCodeSlots >>= hashes_per_new_hash_shift; + new_cd->nCodeSlots = htonl(nCodeSlots); + + new_cd->pageSize = PAGE_SHIFT; /* Not byte-swapped */ + + if ((ntohl(new_cd->version) >= CS_SUPPORTSSCATTER) && (ntohl(new_cd->scatterOffset))) { + SC_Scatter *scatter = (SC_Scatter*) + ((char *)new_cd + ntohl(new_cd->scatterOffset)); + /* iterate all scatter structs to scale their counts */ + do { + uint32_t scount = ntohl(scatter->count); + uint32_t sbase = ntohl(scatter->base); + + /* last scatter? 
*/ + if (scount == 0) { + break; + } + + scount >>= hashes_per_new_hash_shift; + scatter->count = htonl(scount); + + sbase >>= hashes_per_new_hash_shift; + scatter->base = htonl(sbase); + + scatter++; + } while(1); + } + + /* For each group of hashes, hash them together */ + const unsigned char *src_base = (const unsigned char *)old_cd + ntohl(old_cd->hashOffset); + unsigned char *dst_base = (unsigned char *)new_cd + ntohl(new_cd->hashOffset); + + uint32_t hash_index; + for (hash_index = 0; hash_index < nCodeSlots; hash_index++) { + union cs_hash_union mdctx; + + uint32_t source_hash_len = old_cd->hashSize << hashes_per_new_hash_shift; + const unsigned char *src = src_base + hash_index * source_hash_len; + unsigned char *dst = dst_base + hash_index * new_cd->hashSize; + + blob->csb_hashtype->cs_init(&mdctx); + blob->csb_hashtype->cs_update(&mdctx, src, source_hash_len); + blob->csb_hashtype->cs_final(dst, &mdctx); + } + + error = cs_validate_csblob((const uint8_t *)new_blob_addr, new_blob_size, &cd, &entitlements); + if (error) { + + if (cs_debug > 1) { + printf("CODE SIGNING: Failed to validate new Code Signing Blob: %d\n", + error); + } + + ubc_cs_blob_deallocate(new_blob_addr, new_blob_size); + return; + } + + /* New Code Directory is ready for use, swap it out in the blob structure */ + ubc_cs_blob_deallocate(blob->csb_mem_kaddr, blob->csb_mem_size); + + blob->csb_mem_size = new_blob_size; + blob->csb_mem_kaddr = new_blob_addr; + blob->csb_cd = cd; + blob->csb_entitlements_blob = entitlements; + + /* The blob has some cached attributes of the Code Directory, so update those */ + + blob->csb_hash_firstlevel_pagesize = blob->csb_hash_pagesize; /* Save the original page size */ + + blob->csb_hash_pagesize = PAGE_SIZE; + blob->csb_hash_pagemask = PAGE_MASK; + blob->csb_hash_pageshift = PAGE_SHIFT; + blob->csb_end_offset = ntohl(cd->codeLimit); + if((ntohl(cd->version) >= CS_SUPPORTSSCATTER) && (ntohl(cd->scatterOffset))) { + const SC_Scatter *scatter = (const 
SC_Scatter*) + ((const char*)cd + ntohl(cd->scatterOffset)); + blob->csb_start_offset = ((off_t)ntohl(scatter->base)) * PAGE_SIZE; + } else { + blob->csb_start_offset = 0; + } } int @@ -2766,8 +3036,9 @@ ubc_cs_blob_add( struct vnode *vp, cpu_type_t cputype, off_t base_offset, - vm_address_t addr, + vm_address_t *addr, vm_size_t size, + struct image_params *imgp, __unused int flags, struct cs_blob **ret_blob) { @@ -2775,69 +3046,42 @@ ubc_cs_blob_add( struct ubc_info *uip; struct cs_blob *blob, *oblob; int error; - ipc_port_t blob_handle; - memory_object_size_t blob_size; const CS_CodeDirectory *cd; + const CS_GenericBlob *entitlements; off_t blob_start_offset, blob_end_offset; union cs_hash_union mdctx; boolean_t record_mtime; - int cs_flags; record_mtime = FALSE; - cs_flags = 0; if (ret_blob) *ret_blob = NULL; - blob_handle = IPC_PORT_NULL; - blob = (struct cs_blob *) kalloc(sizeof (struct cs_blob)); if (blob == NULL) { return ENOMEM; } -#if CS_BLOB_PAGEABLE - /* get a memory entry on the blob */ - blob_size = (memory_object_size_t) size; - kr = mach_make_memory_entry_64(kernel_map, - &blob_size, - addr, - VM_PROT_READ, - &blob_handle, - IPC_PORT_NULL); - if (kr != KERN_SUCCESS) { - error = ENOMEM; - goto out; - } - if (memory_object_round_page(blob_size) != - (memory_object_size_t) round_page(size)) { - printf("ubc_cs_blob_add: size mismatch 0x%llx 0x%lx !?\n", - blob_size, (size_t)size); - panic("XXX FBDP size mismatch 0x%llx 0x%lx\n", blob_size, (size_t)size); - error = EINVAL; - goto out; - } -#else - blob_size = (memory_object_size_t) size; - blob_handle = IPC_PORT_NULL; -#endif - /* fill in the new blob */ blob->csb_cpu_type = cputype; blob->csb_base_offset = base_offset; blob->csb_mem_size = size; blob->csb_mem_offset = 0; - blob->csb_mem_handle = blob_handle; - blob->csb_mem_kaddr = addr; + blob->csb_mem_kaddr = *addr; blob->csb_flags = 0; blob->csb_platform_binary = 0; blob->csb_platform_path = 0; blob->csb_teamid = NULL; + blob->csb_entitlements_blob = 
NULL; + blob->csb_entitlements = NULL; + /* Transfer ownership. Even on error, this function will deallocate */ + *addr = 0; + /* * Validate the blob's contents */ - error = cs_validate_csblob((const uint8_t *)addr, size, &cd); + error = cs_validate_csblob((const uint8_t *)blob->csb_mem_kaddr, size, &cd, &entitlements); if (error) { if (cs_debug) @@ -2850,20 +3094,22 @@ ubc_cs_blob_add( uint8_t hash[CS_HASH_MAX_SIZE]; int md_size; -#if CS_BLOB_PAGEABLE -#error "cd might move under CS_BLOB_PAGEABLE; reconsider this code" -#endif blob->csb_cd = cd; + blob->csb_entitlements_blob = entitlements; /* may be NULL, not yet validated */ blob->csb_hashtype = cs_find_md(cd->hashType); if (blob->csb_hashtype == NULL || blob->csb_hashtype->cs_digest_size > sizeof(hash)) panic("validated CodeDirectory but unsupported type"); - + + blob->csb_hash_pageshift = cd->pageSize; + blob->csb_hash_pagesize = (1U << cd->pageSize); + blob->csb_hash_pagemask = blob->csb_hash_pagesize - 1; + blob->csb_hash_firstlevel_pagesize = 0; blob->csb_flags = (ntohl(cd->flags) & CS_ALLOWED_MACHO) | CS_VALID; - blob->csb_end_offset = round_page_4K(ntohl(cd->codeLimit)); + blob->csb_end_offset = (((vm_offset_t)ntohl(cd->codeLimit) + blob->csb_hash_pagemask) & ~((vm_offset_t)blob->csb_hash_pagemask)); if((ntohl(cd->version) >= CS_SUPPORTSSCATTER) && (ntohl(cd->scatterOffset))) { const SC_Scatter *scatter = (const SC_Scatter*) ((const char*)cd + ntohl(cd->scatterOffset)); - blob->csb_start_offset = ntohl(scatter->base) * PAGE_SIZE_4K; + blob->csb_start_offset = ((off_t)ntohl(scatter->base)) * blob->csb_hash_pagesize; } else { blob->csb_start_offset = 0; } @@ -2882,17 +3128,16 @@ ubc_cs_blob_add( * Let policy module check whether the blob's signature is accepted. 
*/ #if CONFIG_MACF - error = mac_vnode_check_signature(vp, - base_offset, - blob->csb_cdhash, - (const void*)addr, size, - flags, &cs_flags); + unsigned int cs_flags = blob->csb_flags; + error = mac_vnode_check_signature(vp, blob, imgp, &cs_flags, flags); + blob->csb_flags = cs_flags; + if (error) { if (cs_debug) printf("check_signature[pid: %d], error = %d\n", current_proc()->p_pid, error); goto out; } - if ((flags & MAC_VNODE_CHECK_DYLD_SIM) && !(cs_flags & CS_PLATFORM_BINARY)) { + if ((flags & MAC_VNODE_CHECK_DYLD_SIM) && !(blob->csb_flags & CS_PLATFORM_BINARY)) { if (cs_debug) printf("check_signature[pid: %d], is not apple signed\n", current_proc()->p_pid); error = EPERM; @@ -2900,11 +3145,11 @@ ubc_cs_blob_add( } #endif - if (cs_flags & CS_PLATFORM_BINARY) { + if (blob->csb_flags & CS_PLATFORM_BINARY) { if (cs_debug > 1) printf("check_signature[pid: %d]: platform binary\n", current_proc()->p_pid); blob->csb_platform_binary = 1; - blob->csb_platform_path = !!(cs_flags & CS_PLATFORM_PATH); + blob->csb_platform_path = !!(blob->csb_flags & CS_PLATFORM_PATH); } else { blob->csb_platform_binary = 0; blob->csb_platform_path = 0; @@ -2916,7 +3161,7 @@ ubc_cs_blob_add( printf("check_signature[pid: %d]: no team-id\n", current_proc()->p_pid); } } - + /* * Validate the blob's coverage */ @@ -2931,6 +3176,10 @@ ubc_cs_blob_add( goto out; } + if (ubc_cs_supports_multilevel_hash(blob)) { + ubc_cs_convert_to_multilevel_hash(blob); + } + vnode_lock(vp); if (! 
UBCINFOEXISTS(vp)) { vnode_unlock(vp); @@ -3090,13 +3339,17 @@ ubc_cs_blob_add( /* we failed; release what we allocated */ if (blob) { + if (blob->csb_mem_kaddr) { + ubc_cs_blob_deallocate(blob->csb_mem_kaddr, blob->csb_mem_size); + blob->csb_mem_kaddr = 0; + } + if (blob->csb_entitlements != NULL) { + osobject_release(blob->csb_entitlements); + blob->csb_entitlements = NULL; + } kfree(blob, sizeof (*blob)); blob = NULL; } - if (blob_handle != IPC_PORT_NULL) { - mach_memory_entry_port_release(blob_handle); - blob_handle = IPC_PORT_NULL; - } } if (error == EAGAIN) { @@ -3106,10 +3359,6 @@ ubc_cs_blob_add( * blob and we want to return success. */ error = 0; - /* - * Since we're not failing, consume the data we received. - */ - ubc_cs_blob_deallocate(addr, size); } return error; @@ -3206,10 +3455,10 @@ ubc_cs_free( blob->csb_mem_size); blob->csb_mem_kaddr = 0; } - if (blob->csb_mem_handle != IPC_PORT_NULL) { - mach_memory_entry_port_release(blob->csb_mem_handle); + if (blob->csb_entitlements != NULL) { + osobject_release(blob->csb_entitlements); + blob->csb_entitlements = NULL; } - blob->csb_mem_handle = IPC_PORT_NULL; OSAddAtomic(-1, &cs_blob_count); OSAddAtomic((SInt32) -blob->csb_mem_size, &cs_blob_size); kfree(blob, sizeof (*blob)); @@ -3245,19 +3494,17 @@ int ubc_cs_blob_revalidate( struct vnode *vp, struct cs_blob *blob, - __unused int flags + struct image_params *imgp, + int flags ) { int error = 0; -#if CONFIG_MACF - int cs_flags = 0; -#endif const CS_CodeDirectory *cd = NULL; - + const CS_GenericBlob *entitlements = NULL; assert(vp != NULL); assert(blob != NULL); - error = cs_validate_csblob((const uint8_t *)blob->csb_mem_kaddr, blob->csb_mem_size, &cd); + error = cs_validate_csblob((const uint8_t *)blob->csb_mem_kaddr, blob->csb_mem_size, &cd, &entitlements); if (error) { if (cs_debug) { printf("CODESIGNING: csblob invalid: %d\n", error); @@ -3265,18 +3512,21 @@ ubc_cs_blob_revalidate( goto out; } + unsigned int cs_flags = (ntohl(cd->flags) & 
CS_ALLOWED_MACHO) | CS_VALID; + /* callout to mac_vnode_check_signature */ #if CONFIG_MACF - error = mac_vnode_check_signature(vp, blob->csb_base_offset, blob->csb_cdhash, - (const void*)blob->csb_mem_kaddr, (int)blob->csb_mem_size, - flags, &cs_flags); + error = mac_vnode_check_signature(vp, blob, imgp, &cs_flags, flags); if (cs_debug && error) { printf("revalidate: check_signature[pid: %d], error = %d\n", current_proc()->p_pid, error); } +#else + (void)flags; #endif /* update generation number if success */ vnode_lock_spin(vp); + blob->csb_flags = cs_flags; if (UBCINFOEXISTS(vp)) { if (error == 0) vp->v_ubcinfo->cs_add_gen = cs_blob_generation_count; @@ -3352,12 +3602,13 @@ ubc_get_cs_mtime( unsigned long cs_validate_page_no_hash = 0; unsigned long cs_validate_page_bad_hash = 0; -boolean_t -cs_validate_page( - void *_blobs, +static boolean_t +cs_validate_hash( + struct cs_blob *blobs, memory_object_t pager, memory_object_offset_t page_offset, const void *data, + vm_size_t *bytes_processed, unsigned *tainted) { union cs_hash_union mdctx; @@ -3365,7 +3616,7 @@ cs_validate_page( unsigned char actual_hash[CS_HASH_MAX_SIZE]; unsigned char expected_hash[CS_HASH_MAX_SIZE]; boolean_t found_hash; - struct cs_blob *blobs, *blob; + struct cs_blob *blob; const CS_CodeDirectory *cd; const unsigned char *hash; boolean_t validated; @@ -3374,14 +3625,9 @@ cs_validate_page( off_t codeLimit = 0; const char *lower_bound, *upper_bound; vm_offset_t kaddr, blob_addr; - vm_size_t ksize; - kern_return_t kr; - - offset = page_offset; /* retrieve the expected hash */ found_hash = FALSE; - blobs = (struct cs_blob *) _blobs; for (blob = blobs; blob != NULL; @@ -3393,29 +3639,10 @@ cs_validate_page( continue; } - /* map the blob in the kernel address space */ + /* blob data has been released */ kaddr = blob->csb_mem_kaddr; if (kaddr == 0) { - ksize = (vm_size_t) (blob->csb_mem_size + - blob->csb_mem_offset); - kr = vm_map(kernel_map, - &kaddr, - ksize, - 0, - VM_FLAGS_ANYWHERE, - 
blob->csb_mem_handle, - 0, - TRUE, - VM_PROT_READ, - VM_PROT_READ, - VM_INHERIT_NONE); - if (kr != KERN_SUCCESS) { - /* XXX FBDP what to do !? */ - printf("cs_validate_page: failed to map blob, " - "size=0x%lx kr=0x%x\n", - (size_t)blob->csb_mem_size, kr); - break; - } + continue; } blob_addr = kaddr + blob->csb_mem_offset; @@ -3426,22 +3653,17 @@ cs_validate_page( if (cd != NULL) { /* all CD's that have been injected is already validated */ - offset = page_offset - blob->csb_base_offset; - if (offset < blob->csb_start_offset || - offset >= blob->csb_end_offset) { - /* our page is not covered by this blob */ - continue; - } - hashtype = blob->csb_hashtype; if (hashtype == NULL) panic("unknown hash type ?"); if (hashtype->cs_digest_size > sizeof(actual_hash)) panic("hash size too large"); + if (offset & blob->csb_hash_pagemask) + panic("offset not aligned to cshash boundary"); codeLimit = ntohl(cd->codeLimit); - hash = hashes(cd, (uint32_t)(offset>>PAGE_SHIFT_4K), + hash = hashes(cd, (uint32_t)(offset>>blob->csb_hash_pageshift), hashtype->cs_size, lower_bound, upper_bound); if (hash != NULL) { @@ -3474,17 +3696,39 @@ cs_validate_page( *tainted = 0; - size = PAGE_SIZE_4K; + size = blob->csb_hash_pagesize; + *bytes_processed = size; + const uint32_t *asha1, *esha1; if ((off_t)(offset + size) > codeLimit) { /* partial page at end of segment */ assert(offset < codeLimit); - size = (size_t) (codeLimit & PAGE_MASK_4K); + size = (size_t) (codeLimit & blob->csb_hash_pagemask); *tainted |= CS_VALIDATE_NX; } hashtype->cs_init(&mdctx); - hashtype->cs_update(&mdctx, data, size); + + if (blob->csb_hash_firstlevel_pagesize) { + const unsigned char *partial_data = (const unsigned char *)data; + size_t i; + for (i=0; i < size;) { + union cs_hash_union partialctx; + unsigned char partial_digest[CS_HASH_MAX_SIZE]; + size_t partial_size = MIN(size-i, blob->csb_hash_firstlevel_pagesize); + + hashtype->cs_init(&partialctx); + hashtype->cs_update(&partialctx, partial_data, partial_size); 
+ hashtype->cs_final(partial_digest, &partialctx); + + /* Update cumulative multi-level hash */ + hashtype->cs_update(&mdctx, partial_digest, hashtype->cs_size); + partial_data = partial_data + partial_size; + i += partial_size; + } + } else { + hashtype->cs_update(&mdctx, data, size); + } hashtype->cs_final(actual_hash, &mdctx); asha1 = (const uint32_t *) actual_hash; @@ -3518,6 +3762,53 @@ cs_validate_page( return validated; } +boolean_t +cs_validate_range( + struct vnode *vp, + memory_object_t pager, + memory_object_offset_t page_offset, + const void *data, + vm_size_t dsize, + unsigned *tainted) +{ + vm_size_t offset_in_range; + boolean_t all_subranges_validated = TRUE; /* turn false if any subrange fails */ + + struct cs_blob *blobs = ubc_get_cs_blobs(vp); + + *tainted = 0; + + for (offset_in_range = 0; + offset_in_range < dsize; + /* offset_in_range updated based on bytes processed */) { + unsigned subrange_tainted = 0; + boolean_t subrange_validated; + vm_size_t bytes_processed = 0; + + subrange_validated = cs_validate_hash(blobs, + pager, + page_offset + offset_in_range, + (const void *)((const char *)data + offset_in_range), + &bytes_processed, + &subrange_tainted); + + *tainted |= subrange_tainted; + + if (bytes_processed == 0) { + /* Cannote make forward progress, so return an error */ + all_subranges_validated = FALSE; + break; + } else if (subrange_validated == FALSE) { + all_subranges_validated = FALSE; + /* Keep going to detect other types of failures in subranges */ + } + + offset_in_range += bytes_processed; + } + + return all_subranges_validated; +} + int ubc_cs_getcdhash( vnode_t vp, @@ -3557,6 +3848,52 @@ ubc_cs_getcdhash( return ret; } +boolean_t +ubc_cs_is_range_codesigned( + vnode_t vp, + mach_vm_offset_t start, + mach_vm_size_t size) +{ + struct cs_blob *csblob; + mach_vm_offset_t blob_start; + mach_vm_offset_t blob_end; + + if (vp == NULL) { + /* no file: no code signature */ + return FALSE; + } + if (size == 0) { + /* no range: no code 
signature */ + return FALSE; + } + if (start + size < start) { + /* overflow */ + return FALSE; + } + + csblob = ubc_cs_blob_get(vp, -1, start); + if (csblob == NULL) { + return FALSE; + } + + /* + * We currently check if the range is covered by a single blob, + * which should always be the case for the dyld shared cache. + * If we ever want to make this routine handle other cases, we + * would have to iterate if the blob does not cover the full range. + */ + blob_start = (mach_vm_offset_t) (csblob->csb_base_offset + + csblob->csb_start_offset); + blob_end = (mach_vm_offset_t) (csblob->csb_base_offset + + csblob->csb_end_offset); + if (blob_start > start || blob_end < (start + size)) { + /* range not fully covered by this code-signing blob */ + return FALSE; + } + + return TRUE; +} + #if CHECK_CS_VALIDATION_BITMAP #define stob(s) ((atop_64((s)) + 07) >> 3) extern boolean_t root_fs_upgrade_try; diff --git a/bsd/kern/uipc_mbuf.c b/bsd/kern/uipc_mbuf.c index be9cded69..3884b2c94 100644 --- a/bsd/kern/uipc_mbuf.c +++ b/bsd/kern/uipc_mbuf.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2014 Apple Inc. All rights reserved. + * Copyright (c) 1998-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -85,6 +85,7 @@ #include #include #include +#include #include #include @@ -314,7 +315,7 @@ static lck_grp_t *mbuf_mlock_grp; static lck_grp_attr_t *mbuf_mlock_grp_attr; /* Back-end (common) layer */ -static void *mbuf_worker_run; /* wait channel for worker thread */ +static boolean_t mbuf_worker_needs_wakeup; /* wait channel for mbuf worker */ static int mbuf_worker_ready; /* worker thread is runnable */ static int mbuf_expand_mcl; /* number of cluster creation requets */ static int mbuf_expand_big; /* number of big cluster creation requests */ @@ -558,6 +559,13 @@ static lck_attr_t *mleak_lock_attr; static lck_grp_t *mleak_lock_grp; static lck_grp_attr_t *mleak_lock_grp_attr; +/* Lock to protect the completion callback table */ +static lck_grp_attr_t *mbuf_tx_compl_tbl_lck_grp_attr = NULL; +static lck_attr_t *mbuf_tx_compl_tbl_lck_attr = NULL; +static lck_grp_t *mbuf_tx_compl_tbl_lck_grp = NULL; +decl_lck_rw_data(, mbuf_tx_compl_tbl_lck_rw_data); +lck_rw_t *mbuf_tx_compl_tbl_lock = &mbuf_tx_compl_tbl_lck_rw_data; + extern u_int32_t high_sb_max; /* The minimum number of objects that are allocated, to start. */ @@ -679,7 +687,7 @@ static int mleak_table_sysctl SYSCTL_HANDLER_ARGS; static char *mbuf_dump(void); static void mbuf_table_init(void); static inline void m_incref(struct mbuf *); -static inline u_int32_t m_decref(struct mbuf *); +static inline u_int16_t m_decref(struct mbuf *); static int m_clalloc(const u_int32_t, const int, const u_int32_t); static void mbuf_worker_thread_init(void); static mcache_obj_t *slab_alloc(mbuf_class_t, int); @@ -766,19 +774,43 @@ static boolean_t mbuf_report_usage(mbuf_class_t); * cleared. */ #define EXTF_READONLY 0x2 -#define EXTF_MASK (EXTF_COMPOSITE | EXTF_READONLY) +/* + * This flag indicates that the external cluster is paired with the mbuf. 
+ * Pairing implies an external free routine defined which will be invoked + * when the reference count drops to the minimum at m_free time. This + * flag is never cleared. + */ +#define EXTF_PAIRED 0x4 + +#define EXTF_MASK \ + (EXTF_COMPOSITE | EXTF_READONLY | EXTF_PAIRED) #define MEXT_RFA(m) ((m)->m_ext.ext_refflags) +#define MEXT_MINREF(m) (MEXT_RFA(m)->minref) #define MEXT_REF(m) (MEXT_RFA(m)->refcnt) +#define MEXT_PREF(m) (MEXT_RFA(m)->prefcnt) #define MEXT_FLAGS(m) (MEXT_RFA(m)->flags) -#define MBUF_IS_COMPOSITE(m) \ - (MEXT_REF(m) == 0 && (MEXT_FLAGS(m) & EXTF_MASK) == EXTF_COMPOSITE) +#define MEXT_PRIV(m) (MEXT_RFA(m)->priv) +#define MEXT_PMBUF(m) (MEXT_RFA(m)->paired) +#define MBUF_IS_COMPOSITE(m) \ + (MEXT_REF(m) == MEXT_MINREF(m) && \ + (MEXT_FLAGS(m) & EXTF_MASK) == EXTF_COMPOSITE) +/* + * This macro can be used to test if the mbuf is paired to an external + * cluster. The test for MEXT_PMBUF being equal to the mbuf in subject + * is important, as EXTF_PAIRED alone is insufficient since it is immutable, + * and thus survives calls to m_free_paired. + */ +#define MBUF_IS_PAIRED(m) \ + (((m)->m_flags & M_EXT) && \ + (MEXT_FLAGS(m) & EXTF_MASK) == EXTF_PAIRED && \ + MEXT_PMBUF(m) == (m)) /* * Macros used to verify the integrity of the mbuf. 
*/ #define _MCHECK(m) { \ - if ((m)->m_type != MT_FREE) { \ + if ((m)->m_type != MT_FREE && !MBUF_IS_PAIRED(m)) { \ if (mclaudit == NULL) \ panic("MCHECK: m_type=%d m=%p", \ (u_int16_t)(m)->m_type, m); \ @@ -856,27 +888,33 @@ static boolean_t mbuf_report_usage(mbuf_class_t); } \ } -#define MEXT_INIT(m, buf, size, free, arg, rfa, ref, flag) { \ +#define MEXT_INIT(m, buf, size, free, arg, rfa, min, ref, pref, flag, \ + priv, pm) { \ (m)->m_data = (m)->m_ext.ext_buf = (buf); \ (m)->m_flags |= M_EXT; \ (m)->m_ext.ext_size = (size); \ (m)->m_ext.ext_free = (free); \ (m)->m_ext.ext_arg = (arg); \ - (m)->m_ext.ext_refs.forward = (m)->m_ext.ext_refs.backward = \ - &(m)->m_ext.ext_refs; \ MEXT_RFA(m) = (rfa); \ + MEXT_MINREF(m) = (min); \ MEXT_REF(m) = (ref); \ + MEXT_PREF(m) = (pref); \ MEXT_FLAGS(m) = (flag); \ + MEXT_PRIV(m) = (priv); \ + MEXT_PMBUF(m) = (pm); \ } #define MBUF_CL_INIT(m, buf, rfa, ref, flag) \ - MEXT_INIT(m, buf, m_maxsize(MC_CL), NULL, NULL, rfa, ref, flag) + MEXT_INIT(m, buf, m_maxsize(MC_CL), NULL, NULL, rfa, 0, \ + ref, 0, flag, 0, NULL) #define MBUF_BIGCL_INIT(m, buf, rfa, ref, flag) \ - MEXT_INIT(m, buf, m_maxsize(MC_BIGCL), m_bigfree, NULL, rfa, ref, flag) + MEXT_INIT(m, buf, m_maxsize(MC_BIGCL), m_bigfree, NULL, rfa, 0, \ + ref, 0, flag, 0, NULL) #define MBUF_16KCL_INIT(m, buf, rfa, ref, flag) \ - MEXT_INIT(m, buf, m_maxsize(MC_16KCL), m_16kfree, NULL, rfa, ref, flag) + MEXT_INIT(m, buf, m_maxsize(MC_16KCL), m_16kfree, NULL, rfa, 0, \ + ref, 0, flag, 0, NULL) /* * Macro to convert BSD malloc sleep flag to mcache's @@ -1149,35 +1187,35 @@ mleak_table_sysctl SYSCTL_HANDLER_ARGS static inline void m_incref(struct mbuf *m) { - UInt32 old, new; - volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m); + UInt16 old, new; + volatile UInt16 *addr = (volatile UInt16 *)&MEXT_REF(m); do { old = *addr; new = old + 1; ASSERT(new != 0); - } while (!OSCompareAndSwap(old, new, addr)); + } while (!OSCompareAndSwap16(old, new, addr)); /* * If cluster is shared, 
mark it with (sticky) EXTF_READONLY; - * we don't clear the flag when the refcount goes back to 1 - * to simplify code calling m_mclhasreference(). + * we don't clear the flag when the refcount goes back to the + * minimum, to simplify code calling m_mclhasreference(). */ - if (new > 1 && !(MEXT_FLAGS(m) & EXTF_READONLY)) - (void) OSBitOrAtomic(EXTF_READONLY, &MEXT_FLAGS(m)); + if (new > (MEXT_MINREF(m) + 1) && !(MEXT_FLAGS(m) & EXTF_READONLY)) + (void) OSBitOrAtomic16(EXTF_READONLY, &MEXT_FLAGS(m)); } -static inline u_int32_t +static inline u_int16_t m_decref(struct mbuf *m) { - UInt32 old, new; - volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m); + UInt16 old, new; + volatile UInt16 *addr = (volatile UInt16 *)&MEXT_REF(m); do { old = *addr; new = old - 1; ASSERT(old != 0); - } while (!OSCompareAndSwap(old, new, addr)); + } while (!OSCompareAndSwap16(old, new, addr)); return (new); } @@ -1667,6 +1705,27 @@ mbinit(void) (nmbclusters << MCLSHIFT) >> MBSHIFT, (nclusters << MCLSHIFT) >> MBSHIFT, (njcl << MCLSHIFT) >> MBSHIFT); + + /* initialize lock form tx completion callback table */ + mbuf_tx_compl_tbl_lck_grp_attr = lck_grp_attr_alloc_init(); + if (mbuf_tx_compl_tbl_lck_grp_attr == NULL) { + panic("%s: lck_grp_attr_alloc_init failed", __func__); + /* NOTREACHED */ + } + mbuf_tx_compl_tbl_lck_grp = lck_grp_alloc_init("mbuf_tx_compl_tbl", + mbuf_tx_compl_tbl_lck_grp_attr); + if (mbuf_tx_compl_tbl_lck_grp == NULL) { + panic("%s: lck_grp_alloc_init failed", __func__); + /* NOTREACHED */ + } + mbuf_tx_compl_tbl_lck_attr = lck_attr_alloc_init(); + if (mbuf_tx_compl_tbl_lck_attr == NULL) { + panic("%s: lck_attr_alloc_init failed", __func__); + /* NOTREACHED */ + } + lck_rw_init(mbuf_tx_compl_tbl_lock, mbuf_tx_compl_tbl_lck_grp, + mbuf_tx_compl_tbl_lck_attr); + } /* @@ -1744,7 +1803,7 @@ slab_alloc(mbuf_class_t class, int wait) * A 4K cluster slab can have NBCLPG references. 
*/ VERIFY(sp->sl_refcnt >= 1 && sp->sl_chunks == NBCLPG && - sp->sl_len == PAGE_SIZE && + sp->sl_len == PAGE_SIZE && (sp->sl_refcnt < NBCLPG || sp->sl_head == NULL)); } else if (class == MC_16KCL) { mcl_slab_t *nsp; @@ -2321,7 +2380,7 @@ cslab_free(mbuf_class_t class, mcache_obj_t *list, int purged) VERIFY(clsp->sl_refcnt >= 1 && clsp->sl_refcnt <= NCLPG); } else { - VERIFY(clsp->sl_refcnt >= 1 && + VERIFY(clsp->sl_refcnt >= 1 && clsp->sl_refcnt <= NBCLPG); } if (cl_class == MC_16KCL) { @@ -2345,8 +2404,12 @@ cslab_free(mbuf_class_t class, mcache_obj_t *list, int purged) if (mclaudit != NULL) mcl_audit_restore_mbuf(m, mca, TRUE); + MEXT_MINREF(m) = 0; MEXT_REF(m) = 0; + MEXT_PREF(m) = 0; MEXT_FLAGS(m) = 0; + MEXT_PRIV(m) = 0; + MEXT_PMBUF(m) = NULL; rfa = (mcache_obj_t *)(void *)MEXT_RFA(m); rfa->obj_next = ref_list; @@ -2815,8 +2878,8 @@ m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize) } else { /* * if multiple 4K pages are being used for a - * 16K cluster - */ + * 16K cluster + */ needed = numpages / NSLABSP16KB; } @@ -2884,7 +2947,7 @@ m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize) union m16kcluster *m16kcl = (union m16kcluster *)page; mcl_slab_t *nsp; int k; - + /* One for the entire 16KB */ sp = slab_get(m16kcl); if (mclaudit != NULL) @@ -2944,6 +3007,10 @@ m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize) * When non-blocking we kick a thread if we have to grow the * pool or if the number of free clusters is less than requested. 
*/ + if (i > 0 && mbuf_worker_ready && mbuf_worker_needs_wakeup) { + wakeup((caddr_t)&mbuf_worker_needs_wakeup); + mbuf_worker_needs_wakeup = FALSE; + } if (class == MC_BIGCL) { if (i > 0) { /* @@ -2953,11 +3020,8 @@ m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize) i += m_total(MC_BIGCL); if (i > mbuf_expand_big) { mbuf_expand_big = i; - if (mbuf_worker_ready) - wakeup((caddr_t)&mbuf_worker_run); } } - if (m_infree(MC_BIGCL) >= num) return (1); } else { @@ -2969,11 +3033,8 @@ m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize) i += m_total(MC_16KCL); if (i > mbuf_expand_16k) { mbuf_expand_16k = i; - if (mbuf_worker_ready) - wakeup((caddr_t)&mbuf_worker_run); } } - if (m_infree(MC_16KCL) >= num) return (1); } @@ -3045,7 +3106,7 @@ freelist_populate(mbuf_class_t class, unsigned int num, int wait) * MC_CL, verify that the reference count will match that * assumption */ - VERIFY(sp->sl_refcnt == 1 && slab_is_detached(sp)); + VERIFY(sp->sl_refcnt == 1 && slab_is_detached(sp)); VERIFY((sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED); /* * Make sure that the cluster is unmolested @@ -3400,6 +3461,78 @@ m_getclr(int wait, int type) return (m); } +static int +m_free_paired(struct mbuf *m) +{ + VERIFY((m->m_flags & M_EXT) && (MEXT_FLAGS(m) & EXTF_PAIRED)); + + membar_sync(); + if (MEXT_PMBUF(m) == m) { + volatile UInt16 *addr = (volatile UInt16 *)&MEXT_PREF(m); + int16_t oprefcnt, prefcnt; + + /* + * Paired ref count might be negative in case we lose + * against another thread clearing MEXT_PMBUF, in the + * event it occurs after the above memory barrier sync. + * In that case just ignore as things have been unpaired. 
+ */ + do { + oprefcnt = *addr; + prefcnt = oprefcnt - 1; + } while (!OSCompareAndSwap16(oprefcnt, prefcnt, addr)); + + if (prefcnt > 1) { + return (1); + } else if (prefcnt == 1) { + (*(m->m_ext.ext_free))(m->m_ext.ext_buf, + m->m_ext.ext_size, m->m_ext.ext_arg); + return (1); + } else if (prefcnt == 0) { + VERIFY(MBUF_IS_PAIRED(m)); + + /* + * Restore minref to its natural value, so that + * the caller will be able to free the cluster + * as appropriate. + */ + MEXT_MINREF(m) = 0; + + /* + * Clear MEXT_PMBUF, but leave EXTF_PAIRED intact + * as it is immutable. atomic_set_ptr also causes + * memory barrier sync. + */ + atomic_set_ptr(&MEXT_PMBUF(m), NULL); + + switch (m->m_ext.ext_size) { + case MCLBYTES: + m->m_ext.ext_free = NULL; + break; + + case MBIGCLBYTES: + m->m_ext.ext_free = m_bigfree; + break; + + case M16KCLBYTES: + m->m_ext.ext_free = m_16kfree; + break; + + default: + VERIFY(0); + /* NOTREACHED */ + } + } + } + + /* + * Tell caller the unpair has occurred, and that the reference + * count on the external cluster held for the paired mbuf should + * now be dropped. 
+ */ + return (0); +} + struct mbuf * m_free(struct mbuf *m) { @@ -3413,15 +3546,21 @@ m_free(struct mbuf *m) m_redzone_verify(m); /* Free the aux data and tags if there is any */ m_tag_delete_chain(m, NULL); + + m_do_tx_compl_callback(m, NULL); } if (m->m_flags & M_EXT) { - u_int32_t refcnt; + u_int16_t refcnt; u_int32_t composite; + if (MBUF_IS_PAIRED(m) && m_free_paired(m)) + return (n); + refcnt = m_decref(m); composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE); - if (refcnt == 0 && !composite) { + + if (refcnt == MEXT_MINREF(m) && !composite) { if (m->m_ext.ext_free == NULL) { mcache_free(m_cache(MC_CL), m->m_ext.ext_buf); } else if (m->m_ext.ext_free == m_bigfree) { @@ -3436,7 +3575,8 @@ m_free(struct mbuf *m) } mcache_free(ref_cache, MEXT_RFA(m)); MEXT_RFA(m) = NULL; - } else if (refcnt == 0 && composite) { + } else if (refcnt == MEXT_MINREF(m) && composite) { + VERIFY(!(MEXT_FLAGS(m) & EXTF_PAIRED)); VERIFY(m->m_type != MT_FREE); mtype_stat_dec(m->m_type); @@ -3479,20 +3619,27 @@ m_free(struct mbuf *m) __private_extern__ struct mbuf * m_clattach(struct mbuf *m, int type, caddr_t extbuf, void (*extfree)(caddr_t, u_int, caddr_t), u_int extsize, caddr_t extarg, - int wait) + int wait, int pair) { struct ext_ref *rfa = NULL; - if (m == NULL && (m = _M_GETHDR(wait, type)) == NULL) + /* + * If pairing is requested and an existing mbuf is provided, reject + * it if it's already been paired to another cluster. Otherwise, + * allocate a new one or free any existing below. 
+ */ + if ((m != NULL && MBUF_IS_PAIRED(m)) || + (m == NULL && (m = _M_GETHDR(wait, type)) == NULL)) return (NULL); if (m->m_flags & M_EXT) { - u_int32_t refcnt; + u_int16_t refcnt; u_int32_t composite; refcnt = m_decref(m); composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE); - if (refcnt == 0 && !composite) { + VERIFY(!(MEXT_FLAGS(m) & EXTF_PAIRED) && MEXT_PMBUF(m) == NULL); + if (refcnt == MEXT_MINREF(m) && !composite) { if (m->m_ext.ext_free == NULL) { mcache_free(m_cache(MC_CL), m->m_ext.ext_buf); } else if (m->m_ext.ext_free == m_bigfree) { @@ -3507,7 +3654,7 @@ m_clattach(struct mbuf *m, int type, caddr_t extbuf, } /* Re-use the reference structure */ rfa = MEXT_RFA(m); - } else if (refcnt == 0 && composite) { + } else if (refcnt == MEXT_MINREF(m) && composite) { VERIFY(m->m_type != MT_FREE); mtype_stat_dec(m->m_type); @@ -3544,7 +3691,13 @@ m_clattach(struct mbuf *m, int type, caddr_t extbuf, return (NULL); } - MEXT_INIT(m, extbuf, extsize, extfree, extarg, rfa, 1, 0); + if (!pair) { + MEXT_INIT(m, extbuf, extsize, extfree, extarg, rfa, + 0, 1, 0, 0, 0, NULL); + } else { + MEXT_INIT(m, extbuf, extsize, extfree, (caddr_t)m, rfa, + 1, 1, 1, EXTF_PAIRED, 0, m); + } return (m); } @@ -3566,7 +3719,7 @@ m_getcl(int wait, int type, int flags) m = mcache_alloc(m_cache(MC_MBUF_CL), mcflags); if (m != NULL) { - u_int32_t flag; + u_int16_t flag; struct ext_ref *rfa; void *cl; @@ -3778,10 +3931,10 @@ m_dup_pkthdr(struct mbuf *to, struct mbuf *from, int how) void m_copy_pftag(struct mbuf *to, struct mbuf *from) { - to->m_pkthdr.pf_mtag = from->m_pkthdr.pf_mtag; + memcpy(m_pftag(to), m_pftag(from), sizeof(struct pf_mtag)); #if PF_ECN - to->m_pkthdr.pf_mtag.pftag_hdr = NULL; - to->m_pkthdr.pf_mtag.pftag_flags &= ~(PF_TAG_HDR_INET|PF_TAG_HDR_INET6); + m_pftag(to)->pftag_hdr = NULL; + m_pftag(to)->pftag_flags &= ~(PF_TAG_HDR_INET|PF_TAG_HDR_INET6); #endif /* PF_ECN */ } @@ -3802,7 +3955,7 @@ m_classifier_init(struct mbuf *m, uint32_t pktf_mask) #if MEASURE_BW m->m_pkthdr.pkt_bwseq 
= 0; #endif /* MEASURE_BW */ - m->m_pkthdr.pkt_enqueue_ts = 0; + m->m_pkthdr.pkt_timestamp = 0; } void @@ -3839,7 +3992,7 @@ m_getpackets_internal(unsigned int *num_needed, int num_with_pkthdrs, unsigned int pnum, needed = *num_needed; mcache_obj_t *mp_list = NULL; int mcflags = MSLEEPF(wait); - u_int32_t flag; + u_int16_t flag; struct ext_ref *rfa; mcache_t *cp; void *cl; @@ -3933,7 +4086,7 @@ m_getpackets_internal(unsigned int *num_needed, int num_with_pkthdrs, printf("%s: File a radar related to . \ needed = %u, pnum = %u, num_needed = %u \n", __func__, needed, pnum, *num_needed); - } + } *num_needed = pnum; return (top); @@ -4183,7 +4336,7 @@ m_allocpacket_internal(unsigned int *numlist, size_t packetlen, for (;;) { struct mbuf *m; - u_int32_t flag; + u_int16_t flag; struct ext_ref *rfa; void *cl; int pkthdr; @@ -4361,14 +4514,12 @@ m_freem_list(struct mbuf *m) while (m != NULL) { struct mbuf *next = m->m_next; mcache_obj_t *o, *rfa; - u_int32_t refcnt, composite; + u_int32_t composite; + u_int16_t refcnt; if (m->m_type == MT_FREE) panic("m_free: freeing an already freed mbuf"); - if (m->m_type != MT_FREE) - mt_free++; - if (m->m_flags & M_PKTHDR) { /* Check for scratch area overflow */ m_redzone_verify(m); @@ -4376,13 +4527,23 @@ m_freem_list(struct mbuf *m) m_tag_delete_chain(m, NULL); } - if (!(m->m_flags & M_EXT)) + if (!(m->m_flags & M_EXT)) { + mt_free++; goto simple_free; + } + + if (MBUF_IS_PAIRED(m) && m_free_paired(m)) { + m = next; + continue; + } + + mt_free++; o = (mcache_obj_t *)(void *)m->m_ext.ext_buf; refcnt = m_decref(m); composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE); - if (refcnt == 0 && !composite) { + + if (refcnt == MEXT_MINREF(m) && !composite) { if (m->m_ext.ext_free == NULL) { o->obj_next = mcl_list; mcl_list = o; @@ -4401,7 +4562,8 @@ m_freem_list(struct mbuf *m) rfa->obj_next = ref_list; ref_list = rfa; MEXT_RFA(m) = NULL; - } else if (refcnt == 0 && composite) { + } else if (refcnt == MEXT_MINREF(m) && composite) { + 
VERIFY(!(MEXT_FLAGS(m) & EXTF_PAIRED)); VERIFY(m->m_type != MT_FREE); /* * Amortize the costs of atomic operations @@ -4685,7 +4847,7 @@ m_copym_mode(struct mbuf *m, int off0, int len, int wait, uint32_t mode) n->m_len = MIN(n->m_len, MLEN); if (MTOD(n, char *) + n->m_len > ((char *)n) + MSIZE) - panic("%s n %p copy overflow", + panic("%s n %p copy overflow", __func__, n); bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t), @@ -4737,7 +4899,7 @@ m_copym_with_hdrs(struct mbuf *m0, int off0, int len0, int wait, if (off == 0 && (m->m_flags & M_PKTHDR)) copyhdr = 1; - + if (m_lastm != NULL && *m_lastm != NULL) { m = *m_lastm; off = *m_off; @@ -4817,7 +4979,7 @@ m_copym_with_hdrs(struct mbuf *m0, int off0, int len0, int wait, n->m_flags |= M_EXT; } else { if (MTOD(n, char *) + n->m_len > ((char *)n) + MSIZE) - panic("%s n %p copy overflow", + panic("%s n %p copy overflow", __func__, n); bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t), @@ -5727,7 +5889,7 @@ m_dup(struct mbuf *m, int how) } while (m != NULL) { #if BLUE_DEBUG - kprintf("<%x: %x, %x, %x\n", m, m->m_flags, m->m_len, + printf("<%x: %x, %x, %x\n", m, m->m_flags, m->m_len, m->m_data); #endif if (copyhdr) @@ -5766,7 +5928,7 @@ m_dup(struct mbuf *m, int how) m = m->m_next; np = &n->m_next; #if BLUE_DEBUG - kprintf(">%x: %x, %x, %x\n", n, n->m_flags, n->m_len, + printf(">%x: %x, %x, %x\n", n, n->m_flags, n->m_len, n->m_data); #endif } @@ -5904,7 +6066,8 @@ m_append(struct mbuf *m0, int len, caddr_t cp) space = remainder; bcopy(cp, mtod(m, caddr_t) + m->m_len, space); m->m_len += space; - cp += space, remainder -= space; + cp += space; + remainder -= space; } while (remainder > 0) { /* @@ -6241,6 +6404,7 @@ mbuf_sleep(mbuf_class_t class, unsigned int num, int wait) return (mcache_retry); } +__attribute__((noreturn)) static void mbuf_worker_thread(void) { @@ -6248,7 +6412,6 @@ mbuf_worker_thread(void) while (1) { lck_mtx_lock(mbuf_mlock); - mbuf_expand = 0; if (mbuf_expand_mcl) { int n; @@ -6304,13 +6467,15 @@ 
mbuf_worker_thread(void) } } + mbuf_worker_needs_wakeup = TRUE; + assert_wait((caddr_t)&mbuf_worker_needs_wakeup, + THREAD_UNINT); lck_mtx_unlock(mbuf_mlock); - - assert_wait(&mbuf_worker_run, THREAD_UNINT); (void) thread_block((thread_continue_t)mbuf_worker_thread); } } +__attribute__((noreturn)) static void mbuf_worker_thread_init(void) { @@ -6332,7 +6497,7 @@ slab_get(void *buf) if ((slg = slabstbl[ix]) == NULL) { /* - * In the current implementation, we never shrink the slabs + * In the current implementation, we never shrink the slabs * table; if we attempt to reallocate a cluster group when * it's already allocated, panic since this is a sign of a * memory corruption (slabstbl[ix] got nullified). @@ -6557,7 +6722,7 @@ mcl_audit_free(void *buf, unsigned int num) ix = MTOPG(buf); VERIFY(ix < maxclaudit); - + if (mclaudit[ix].cl_audit[0] != NULL) { mca_list = mclaudit[ix].cl_audit[0]; for (i = 0; i < num; i++) { @@ -6823,7 +6988,7 @@ mleak_logger(u_int32_t num, mcache_obj_t *addr, boolean_t alloc) if ((temp % mleak_table.mleak_sample_factor) == 0 && addr != NULL) { uintptr_t bt[MLEAK_STACK_DEPTH]; - int logged = fastbacktrace(bt, MLEAK_STACK_DEPTH); + int logged = backtrace(bt, MLEAK_STACK_DEPTH); mleak_log(bt, addr, logged, num); } } @@ -7293,6 +7458,59 @@ m_reinit(struct mbuf *m, int hdr) return (ret); } +int +m_ext_set_prop(struct mbuf *m, uint32_t o, uint32_t n) +{ + ASSERT(m->m_flags & M_EXT); + return (atomic_test_set_32(&MEXT_PRIV(m), o, n)); +} + +uint32_t +m_ext_get_prop(struct mbuf *m) +{ + ASSERT(m->m_flags & M_EXT); + return (MEXT_PRIV(m)); +} + +int +m_ext_paired_is_active(struct mbuf *m) +{ + return (MBUF_IS_PAIRED(m) ? 
(MEXT_PREF(m) > MEXT_MINREF(m)) : 1); +} + +void +m_ext_paired_activate(struct mbuf *m) +{ + struct ext_ref *rfa; + int hdr, type; + caddr_t extbuf; + void *extfree; + u_int extsize; + + VERIFY(MBUF_IS_PAIRED(m)); + VERIFY(MEXT_REF(m) == MEXT_MINREF(m)); + VERIFY(MEXT_PREF(m) == MEXT_MINREF(m)); + + hdr = (m->m_flags & M_PKTHDR); + type = m->m_type; + extbuf = m->m_ext.ext_buf; + extfree = m->m_ext.ext_free; + extsize = m->m_ext.ext_size; + rfa = MEXT_RFA(m); + + VERIFY(extbuf != NULL && rfa != NULL); + + /* + * Safe to reinitialize packet header tags, since it's + * already taken care of at m_free() time. Similar to + * what's done in m_clattach() for the cluster. Bump + * up MEXT_PREF to indicate activation. + */ + MBUF_INIT(m, hdr, type); + MEXT_INIT(m, extbuf, extsize, extfree, (caddr_t)m, rfa, + 1, 1, 2, EXTF_PAIRED, MEXT_PRIV(m), m); +} + void m_scratch_init(struct mbuf *m) { @@ -7378,7 +7596,7 @@ m_redzone_verify(struct mbuf *m) * * The values 6% and 3% are chosen so that we can do simple arithmetic * with shift operations. 
- */ + */ static boolean_t mbuf_report_usage(mbuf_class_t cl) { @@ -7396,7 +7614,7 @@ mbuf_report_usage(mbuf_class_t cl) __private_extern__ void mbuf_report_peak_usage(void) { - int i = 0; + int i = 0; u_int64_t uptime; struct nstat_sysinfo_data ns_data; uint32_t memreleased = 0; @@ -7405,7 +7623,7 @@ mbuf_report_peak_usage(void) lck_mtx_lock(mbuf_mlock); /* Generate an initial report after 1 week of uptime */ - if (!mb_peak_firstreport && + if (!mb_peak_firstreport && uptime > MBUF_PEAK_FIRST_REPORT_THRESHOLD) { mb_peak_newreport = TRUE; mb_peak_firstreport = TRUE; @@ -7417,7 +7635,7 @@ mbuf_report_peak_usage(void) } /* - * Since a report is being generated before 1 week, + * Since a report is being generated before 1 week, * we do not need to force another one later */ if (uptime < MBUF_PEAK_FIRST_REPORT_THRESHOLD) @@ -7441,8 +7659,17 @@ mbuf_report_peak_usage(void) ns_data.u.mb_stats.sb_atmbuflimit = sbmb_limreached; ns_data.u.mb_stats.draincnt = mbstat.m_drain; ns_data.u.mb_stats.memreleased = memreleased; + ns_data.u.mb_stats.sbmb_floor = total_sbmb_cnt_floor; nstat_sysinfo_send_data(&ns_data); + + /* + * Reset the floor whenever we report a new + * peak to track the trend (increase peek usage + * is not a leak if mbufs get released + * between reports and the floor stays low) + */ + total_sbmb_cnt_floor = total_sbmb_cnt_peak; } /* @@ -7454,11 +7681,11 @@ m_drain(void) mbuf_class_t mc; mcl_slab_t *sp, *sp_tmp, *nsp; unsigned int num, k, interval, released = 0; - unsigned int total_mem = 0, use_mem = 0; + unsigned long total_mem = 0, use_mem = 0; boolean_t ret, purge_caches = FALSE; ppnum_t offset; mcache_obj_t *obj; - float per; + unsigned long per; static uint64_t last_drain = 0; static unsigned char scratch[32]; static ppnum_t scratch_pa = 0; @@ -7488,11 +7715,11 @@ m_drain(void) lck_mtx_unlock(mbuf_mlock); return; } - interval = net_uptime() - last_drain; + interval = net_uptime() - last_drain; if (interval <= mb_drain_maxint) { lck_mtx_unlock(mbuf_mlock); 
return; - } + } if (interval <= mb_drain_maxint * 5) purge_caches = TRUE; last_drain = net_uptime(); @@ -7503,8 +7730,8 @@ m_drain(void) total_mem += m_total(mc) * m_maxsize(mc); use_mem += m_active(mc) * m_maxsize(mc); } - per = (float)use_mem / (float)total_mem; - if (per >= 0.6) { + per = (use_mem * 100) / total_mem; + if (per >= 60) { lck_mtx_unlock(mbuf_mlock); return; } @@ -7529,7 +7756,7 @@ m_drain(void) * total in the freelist. */ for (mc = 0; mc < NELEM(mbuf_table); mc++) { - while (m_cobjlist(mc) && + while (m_cobjlist(mc) && m_total(mc) < m_avgtotal(mc) && m_infree(mc) > 0.1 * m_avgtotal(mc) + m_minlimit(mc)) { obj = m_cobjlist(mc); @@ -7586,7 +7813,7 @@ m_drain(void) m_total(mc)--; for (nsp = sp, k = 1; k < NSLABSP16KB; k++) { nsp = nsp->sl_next; - VERIFY(nsp->sl_refcnt == 0 && + VERIFY(nsp->sl_refcnt == 0 && nsp->sl_base != NULL && nsp->sl_len == 0); slab_init(nsp, 0, 0, NULL, NULL, 0, 0, @@ -7616,7 +7843,7 @@ m_drain(void) */ IOMapperInsertPage(mcl_paddr_base, offset, scratch_pa); mcl_paddr[offset] = 0; - kmem_free(mb_map, (vm_offset_t)sp->sl_base, + kmem_free(mb_map, (vm_offset_t)sp->sl_base, sp->sl_len); slab_init(sp, 0, 0, NULL, NULL, 0, 0, 0); sp->sl_flags = 0; @@ -7636,7 +7863,7 @@ m_drain_force_sysctl SYSCTL_HANDLER_ARGS { #pragma unused(arg1, arg2) int val = 0, err; - + err = sysctl_handle_int(oidp, &val, 0, req); if (err != 0 || req->newptr == USER_ADDR_NULL) return (err); @@ -7666,7 +7893,7 @@ SYSCTL_INT(_kern_ipc, OID_AUTO, mb_normalized, SYSCTL_INT(_kern_ipc, OID_AUTO, mb_watchdog, CTLFLAG_RW | CTLFLAG_LOCKED, &mb_watchdog, 0, ""); SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_drain_force, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, NULL, 0, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, NULL, 0, m_drain_force_sysctl, "I", "Forces the mbuf garbage collection to run"); SYSCTL_INT(_kern_ipc, OID_AUTO, mb_drain_maxint, diff --git a/bsd/kern/uipc_mbuf2.c b/bsd/kern/uipc_mbuf2.c index 2d6f23f08..a56ffa990 100644 --- a/bsd/kern/uipc_mbuf2.c +++ 
b/bsd/kern/uipc_mbuf2.c @@ -664,7 +664,7 @@ m_tag_init(struct mbuf *m, int all) * (e.g. m_dup_pkthdr), don't zero them out. */ if (all) { - bzero(&m->m_pkthdr.pf_mtag, sizeof (m->m_pkthdr.pf_mtag)); + bzero(m_pftag(m), sizeof (struct pf_mtag)); bzero(&m->m_pkthdr.proto_mtag, sizeof (m->m_pkthdr.proto_mtag)); bzero(&m->m_pkthdr.necp_mtag, sizeof (m->m_pkthdr.necp_mtag)); } diff --git a/bsd/kern/uipc_socket.c b/bsd/kern/uipc_socket.c index 29568b48a..c8fc70ca3 100644 --- a/bsd/kern/uipc_socket.c +++ b/bsd/kern/uipc_socket.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2015 Apple Inc. All rights reserved. + * Copyright (c) 1998-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -102,6 +102,8 @@ #include #include #include +#include +#include #include #include #include @@ -112,6 +114,8 @@ #include #include #include +#include + #include #include #include @@ -136,6 +140,7 @@ /* TODO: this should be in a header file somewhere */ extern char *proc_name_address(void *p); +extern char *proc_best_name(proc_t); static u_int32_t so_cache_hw; /* High water mark for socache */ static u_int32_t so_cache_timeouts; /* number of timeouts */ @@ -154,35 +159,61 @@ static lck_mtx_t *so_cache_mtx; #include +static int filt_sorattach(struct knote *kn); static void filt_sordetach(struct knote *kn); static int filt_soread(struct knote *kn, long hint); +static int filt_sortouch(struct knote *kn, struct kevent_internal_s *kev); +static int filt_sorprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev); + +static int filt_sowattach(struct knote *kn); static void filt_sowdetach(struct knote *kn); static int filt_sowrite(struct knote *kn, long hint); +static int filt_sowtouch(struct knote *kn, struct kevent_internal_s *kev); +static int filt_sowprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev); + +static int filt_sockattach(struct knote *kn); static void filt_sockdetach(struct knote *kn); static 
int filt_sockev(struct knote *kn, long hint); -static void filt_socktouch(struct knote *kn, struct kevent_internal_s *kev, - long type); +static int filt_socktouch(struct knote *kn, struct kevent_internal_s *kev); +static int filt_sockprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev); static int sooptcopyin_timeval(struct sockopt *, struct timeval *); static int sooptcopyout_timeval(struct sockopt *, const struct timeval *); -static struct filterops soread_filtops = { +struct filterops soread_filtops = { .f_isfd = 1, + .f_attach = filt_sorattach, .f_detach = filt_sordetach, .f_event = filt_soread, + .f_touch = filt_sortouch, + .f_process = filt_sorprocess, }; -static struct filterops sowrite_filtops = { +struct filterops sowrite_filtops = { .f_isfd = 1, + .f_attach = filt_sowattach, .f_detach = filt_sowdetach, .f_event = filt_sowrite, + .f_touch = filt_sowtouch, + .f_process = filt_sowprocess, }; -static struct filterops sock_filtops = { +struct filterops sock_filtops = { .f_isfd = 1, + .f_attach = filt_sockattach, .f_detach = filt_sockdetach, .f_event = filt_sockev, .f_touch = filt_socktouch, + .f_process = filt_sockprocess, +}; + +struct filterops soexcept_filtops = { + .f_isfd = 1, + .f_attach = filt_sorattach, + .f_detach = filt_sordetach, + .f_event = filt_soread, + .f_touch = filt_sortouch, + .f_process = filt_sorprocess, }; SYSCTL_DECL(_kern_ipc); @@ -193,6 +224,10 @@ int socket_debug = 0; SYSCTL_INT(_kern_ipc, OID_AUTO, socket_debug, CTLFLAG_RW | CTLFLAG_LOCKED, &socket_debug, 0, ""); +static unsigned long sodefunct_calls = 0; +SYSCTL_LONG(_kern_ipc, OID_AUTO, sodefunct_calls, CTLFLAG_LOCKED, + &sodefunct_calls, ""); + static int socket_zone = M_SOCKET; so_gen_t so_gencnt; /* generation count for sockets */ @@ -280,6 +315,12 @@ int soreserveheadroom = 1; SYSCTL_INT(_kern_ipc, OID_AUTO, soreserveheadroom, CTLFLAG_RW | CTLFLAG_LOCKED, &soreserveheadroom, 0, "To allocate contiguous datagram buffers"); +#if (DEBUG || 
DEVELOPMENT) +int so_notsent_lowat_check = 1; +SYSCTL_INT(_kern_ipc, OID_AUTO, notsent_lowat, CTLFLAG_RW|CTLFLAG_LOCKED, + &so_notsent_lowat_check, 0, "enable/disable notsnet lowat check"); +#endif /* DEBUG || DEVELOPMENT */ + extern struct inpcbinfo tcbinfo; /* TODO: these should be in header file */ @@ -332,7 +373,7 @@ int so_set_extended_bk_idle(struct socket *, int); * setting the DSCP code on the packet based on the service class; see * for details. */ -__private_extern__ u_int32_t sotcdb = SOTCDB_NO_DSCP; +__private_extern__ u_int32_t sotcdb = 0; SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED, &sotcdb, 0, ""); @@ -855,9 +896,10 @@ sobindlock(struct socket *so, struct sockaddr *nam, int dolock) */ if (so->so_flags & SOF_DEFUNCT) { error = EINVAL; - SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n", - __func__, proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so), - SOCK_DOM(so), SOCK_TYPE(so), error)); + SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n", + __func__, proc_pid(p), proc_best_name(p), + (uint64_t)DEBUG_KERNEL_ADDRPERM(so), + SOCK_DOM(so), SOCK_TYPE(so), error); goto out; } @@ -961,10 +1003,11 @@ solisten(struct socket *so, int backlog) (so->so_flags & SOF_DEFUNCT)) { error = EINVAL; if (so->so_flags & SOF_DEFUNCT) { - SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] " + SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] " "(%d)\n", __func__, proc_pid(p), + proc_best_name(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so), - SOCK_DOM(so), SOCK_TYPE(so), error)); + SOCK_DOM(so), SOCK_TYPE(so), error); } goto out; } @@ -1062,7 +1105,7 @@ sofreelastref(struct socket *so, int dealloc) /* 3932268: disable upcall */ so->so_rcv.sb_flags &= ~SB_UPCALL; - so->so_snd.sb_flags &= ~SB_UPCALL; + so->so_snd.sb_flags &= ~(SB_UPCALL|SB_SNDBYTE_CNT); so->so_event = sonullevent; if (dealloc) @@ -1456,10 +1499,11 @@ soconnectlock(struct socket *so, struct sockaddr *nam, int dolock) if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & 
SOF_DEFUNCT)) { error = EOPNOTSUPP; if (so->so_flags & SOF_DEFUNCT) { - SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] " + SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] " "(%d)\n", __func__, proc_pid(p), + proc_best_name(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so), - SOCK_DOM(so), SOCK_TYPE(so), error)); + SOCK_DOM(so), SOCK_TYPE(so), error); } if (dolock) socket_unlock(so, 1); @@ -1550,10 +1594,11 @@ soconnectxlocked(struct socket *so, struct sockaddr_list **src_sl, if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) { error = EOPNOTSUPP; if (so->so_flags & SOF_DEFUNCT) { - SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] " + SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] " "(%d)\n", __func__, proc_pid(p), + proc_best_name(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so), - SOCK_DOM(so), SOCK_TYPE(so), error)); + SOCK_DOM(so), SOCK_TYPE(so), error); } return (error); } @@ -1718,10 +1763,10 @@ sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid, if (so->so_flags & SOF_DEFUNCT) { defunct: error = EPIPE; - SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n", - __func__, proc_selfpid(), + SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n", + __func__, proc_selfpid(), proc_best_name(current_proc()), (uint64_t)DEBUG_KERNEL_ADDRPERM(so), - SOCK_DOM(so), SOCK_TYPE(so), error)); + SOCK_DOM(so), SOCK_TYPE(so), error); return (error); } @@ -2137,7 +2182,7 @@ sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, top == NULL && headroom > 0) { freelist->m_data += headroom; } - + /* * Fall back to regular mbufs without * reserving the socket headroom @@ -2475,7 +2520,7 @@ sosend_list(struct socket *so, struct uio **uioarray, u_int uiocnt, int flags) /* * Allocate buffer large enough to include headroom space for * network and link header - * + * */ bytes_to_alloc = maxpktlen + headroom; @@ -2500,7 +2545,7 @@ sosend_list(struct socket *so, struct uio **uioarray, u_int uiocnt, int flags) (unsigned int *)&num_needed, 
bytes_to_alloc, NULL, M_WAIT, 1, 0); } - + if (freelist == NULL) { socket_lock(so, 0); error = ENOMEM; @@ -2992,9 +3037,10 @@ soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, struct sockbuf *sb = &so->so_rcv; error = ENOTCONN; - SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n", - __func__, proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so), - SOCK_DOM(so), SOCK_TYPE(so), error)); + SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n", + __func__, proc_pid(p), proc_best_name(p), + (uint64_t)DEBUG_KERNEL_ADDRPERM(so), + SOCK_DOM(so), SOCK_TYPE(so), error); /* * This socket should have been disconnected and flushed * prior to being returned from sodefunct(); there should @@ -3852,9 +3898,10 @@ soreceive_list(struct socket *so, struct recv_msg_elem *msgarray, u_int uiocnt, struct sockbuf *sb = &so->so_rcv; error = ENOTCONN; - SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n", - __func__, proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so), - SOCK_DOM(so), SOCK_TYPE(so), error)); + SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n", + __func__, proc_pid(p), proc_best_name(p), + (uint64_t)DEBUG_KERNEL_ADDRPERM(so), + SOCK_DOM(so), SOCK_TYPE(so), error); /* * This socket should have been disconnected and flushed * prior to being returned from sodefunct(); there should @@ -4300,20 +4347,6 @@ void sowflush(struct socket *so) { struct sockbuf *sb = &so->so_snd; -#ifdef notyet - lck_mtx_t *mutex_held; - /* - * XXX: This code is currently commented out, because we may get here - * as part of sofreelastref(), and at that time, pr_getlock() may no - * longer be able to return us the lock; this will be fixed in future. - */ - if (so->so_proto->pr_getlock != NULL) - mutex_held = (*so->so_proto->pr_getlock)(so, 0); - else - mutex_held = so->so_proto->pr_domain->dom_mtx; - - lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED); -#endif /* notyet */ /* * Obtain lock on the socket buffer (SB_LOCK). 
This is required @@ -4533,6 +4566,27 @@ sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p) return (0); } +static int +soopt_cred_check(struct socket *so, int priv) +{ + kauth_cred_t cred = NULL; + proc_t ep = PROC_NULL; + int error; + + if (so->so_flags & SOF_DELEGATED) { + ep = proc_find(so->e_pid); + if (ep) + cred = kauth_cred_proc_ref(ep); + } + error = priv_check_cred(cred ? cred : so->so_cred, priv, 0); + if (cred) + kauth_cred_unref(&cred); + if (ep != PROC_NULL) + proc_rele(ep); + + return (error); +} + /* * Returns: 0 Success * EINVAL @@ -4633,6 +4687,7 @@ sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock) case SO_WANTMORE: case SO_WANTOOBFLAG: case SO_NOWAKEFROMSLEEP: + case SO_NOAPNFALLBK: error = sooptcopyin(sopt, &optval, sizeof (optval), sizeof (optval)); if (error != 0) @@ -4812,27 +4867,32 @@ sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock) if (error != 0) goto out; if (optval != 0) { - kauth_cred_t cred = NULL; - proc_t ep = PROC_NULL; - - if (so->so_flags & SOF_DELEGATED) { - ep = proc_find(so->e_pid); - if (ep) - cred = kauth_cred_proc_ref(ep); - } - error = priv_check_cred( - cred ? 
cred : so->so_cred, - PRIV_NET_RESTRICTED_AWDL, 0); + error = soopt_cred_check(so, + PRIV_NET_RESTRICTED_AWDL); if (error == 0) inp_set_awdl_unrestricted( sotoinpcb(so)); - if (cred) - kauth_cred_unref(&cred); - if (ep != PROC_NULL) - proc_rele(ep); } else inp_clear_awdl_unrestricted(sotoinpcb(so)); break; + case SO_INTCOPROC_ALLOW: + if (SOCK_DOM(so) != PF_INET6) { + error = EOPNOTSUPP; + goto out; + } + error = sooptcopyin(sopt, &optval, sizeof(optval), + sizeof(optval)); + if (error != 0) + goto out; + if (optval != 0) { + error = soopt_cred_check(so, + PRIV_NET_RESTRICTED_INTCOPROC); + if (error == 0) + inp_set_intcoproc_allowed( + sotoinpcb(so)); + } else + inp_clear_intcoproc_allowed(sotoinpcb(so)); + break; case SO_LABEL: #if CONFIG_MACF_SOCKET @@ -4897,9 +4957,16 @@ sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock) sizeof (optval)); if (error != 0) goto out; + if (optval >= SO_TC_NET_SERVICE_OFFSET) { + int netsvc = optval - SO_TC_NET_SERVICE_OFFSET; + error = so_set_net_service_type(so, netsvc); + goto out; + } error = so_set_traffic_class(so, optval); if (error != 0) goto out; + so->so_flags1 &= ~SOF1_TC_NET_SERV_TYPE; + so->so_netsvctype = _NET_SERVICE_TYPE_UNSPEC; break; } @@ -4915,6 +4982,7 @@ sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock) break; } +#if (DEVELOPMENT || DEBUG) case SO_TRAFFIC_CLASS_DBG: { struct so_tcdbg so_tcdbg; @@ -4927,6 +4995,7 @@ sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock) goto out; break; } +#endif /* (DEVELOPMENT || DEBUG) */ case SO_PRIVILEGED_TRAFFIC_CLASS: error = priv_check_cred(kauth_cred_get(), @@ -4972,9 +5041,11 @@ sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock) char d[MAX_IPv6_STR_LEN]; struct inpcb *inp = sotoinpcb(so); - SODEFUNCTLOG(("%s[%d]: so 0x%llx [%s %s:%d -> " - "%s:%d] is now marked as %seligible for " + SODEFUNCTLOG("%s[%d, %s]: so 0x%llx " + "[%s %s:%d -> %s:%d] is now marked " + "as %seligible for " "defunct\n", __func__, 
proc_selfpid(), + proc_best_name(current_proc()), (uint64_t)DEBUG_KERNEL_ADDRPERM(so), (SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP", inet_ntop(SOCK_DOM(so), @@ -4988,15 +5059,17 @@ sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock) (void *)&inp->in6p_faddr, d, sizeof (d)), ntohs(inp->in6p_fport), (so->so_flags & SOF_NODEFUNCT) ? - "not " : "")); + "not " : ""); } else { - SODEFUNCTLOG(("%s[%d]: so 0x%llx [%d,%d] is " - "now marked as %seligible for defunct\n", + SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] " + "is now marked as %seligible for " + "defunct\n", __func__, proc_selfpid(), + proc_best_name(current_proc()), (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so), SOCK_TYPE(so), (so->so_flags & SOF_NODEFUNCT) ? - "not " : "")); + "not " : ""); } break; @@ -5103,6 +5176,31 @@ sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock) else so->so_flags1 |= SOF1_CELLFALLBACK; break; + + case SO_NET_SERVICE_TYPE: { + error = sooptcopyin(sopt, &optval, sizeof(optval), + sizeof(optval)); + if (error != 0) + goto out; + error = so_set_net_service_type(so, optval); + break; + } + + case SO_QOSMARKING_POLICY_OVERRIDE: + error = priv_check_cred(kauth_cred_get(), + PRIV_NET_QOSMARKING_POLICY_OVERRIDE, 0); + if (error != 0) + goto out; + error = sooptcopyin(sopt, &optval, sizeof(optval), + sizeof(optval)); + if (error != 0) + goto out; + if (optval == 0) + so->so_flags1 &= ~SOF1_QOSMARKING_POLICY_OVERRIDE; + else + so->so_flags1 |= SOF1_QOSMARKING_POLICY_OVERRIDE; + break; + default: error = ENOPROTOOPT; break; @@ -5253,6 +5351,7 @@ sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock) case SO_WANTMORE: case SO_WANTOOBFLAG: case SO_NOWAKEFROMSLEEP: + case SO_NOAPNFALLBK: optval = so->so_options & sopt->sopt_name; integer: error = sooptcopyout(sopt, &optval, sizeof (optval)); @@ -5376,6 +5475,15 @@ sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock) error = EOPNOTSUPP; break; + case SO_INTCOPROC_ALLOW: + if (SOCK_DOM(so) == 
PF_INET6) { + optval = inp_get_intcoproc_allowed( + sotoinpcb(so)); + goto integer; + } else + error = EOPNOTSUPP; + break; + case SO_LABEL: #if CONFIG_MACF_SOCKET if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac), @@ -5438,9 +5546,11 @@ sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock) sizeof (so->so_tc_stats)); break; +#if (DEVELOPMENT || DEBUG) case SO_TRAFFIC_CLASS_DBG: error = sogetopt_tcdbg(so, sopt); break; +#endif /* (DEVELOPMENT || DEBUG) */ case SO_PRIVILEGED_TRAFFIC_CLASS: optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS); @@ -5519,6 +5629,17 @@ sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock) optval = ((so->so_flags1 & SOF1_CELLFALLBACK) > 0) ? 1 : 0; goto integer; + case SO_NET_SERVICE_TYPE: { + if ((so->so_flags1 & SOF1_TC_NET_SERV_TYPE)) + optval = so->so_netsvctype; + else + optval = NET_SERVICE_TYPE_BE; + goto integer; + } + case SO_NETSVC_MARKING_LEVEL: + optval = so_get_netsvc_marking_level(so); + goto integer; + default: error = ENOPROTOOPT; break; @@ -5666,6 +5787,10 @@ sohasoutofband(struct socket *so) else if (so->so_pgid > 0) proc_signal(so->so_pgid, SIGURG); selwakeup(&so->so_rcv.sb_sel); + if (so->so_rcv.sb_flags & SB_KNOTE) { + KNOTE(&so->so_rcv.sb_sel.si_note, + (NOTE_OOB | SO_FILT_HINT_LOCKED)); + } } int @@ -5723,7 +5848,7 @@ soo_kqfilter(struct fileproc *fp, struct knote *kn, vfs_context_t ctx) #pragma unused(ctx) #endif /* MAC_SOCKET */ struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data; - struct klist *skl; + int result; socket_lock(so, 1); so_update_last_owner_locked(so, PROC_NULL); @@ -5733,84 +5858,48 @@ soo_kqfilter(struct fileproc *fp, struct knote *kn, vfs_context_t ctx) if (mac_socket_check_kqfilter(proc_ucred(vfs_context_proc(ctx)), kn, so) != 0) { socket_unlock(so, 1); - return (1); + kn->kn_flags = EV_ERROR; + kn->kn_data = EPERM; + return 0; } #endif /* MAC_SOCKET */ switch (kn->kn_filter) { case EVFILT_READ: - kn->kn_fop = &soread_filtops; - /* - * If the caller 
explicitly asked for OOB results (e.g. poll()), - * save that off in the hookid field and reserve the kn_flags - * EV_OOBAND bit for output only. - */ - if (kn->kn_flags & EV_OOBAND) { - kn->kn_flags &= ~EV_OOBAND; - kn->kn_hookid = EV_OOBAND; - } else { - kn->kn_hookid = 0; - } - skl = &so->so_rcv.sb_sel.si_note; + kn->kn_filtid = EVFILTID_SOREAD; break; case EVFILT_WRITE: - kn->kn_fop = &sowrite_filtops; - skl = &so->so_snd.sb_sel.si_note; + kn->kn_filtid = EVFILTID_SOWRITE; break; case EVFILT_SOCK: - kn->kn_fop = &sock_filtops; - skl = &so->so_klist; - kn->kn_hookid = 0; - kn->kn_status |= KN_TOUCH; + kn->kn_filtid = EVFILTID_SCK; + break; + case EVFILT_EXCEPT: + kn->kn_filtid = EVFILTID_SOEXCEPT; break; default: socket_unlock(so, 1); - return (1); - } - - if (KNOTE_ATTACH(skl, kn)) { - switch (kn->kn_filter) { - case EVFILT_READ: - so->so_rcv.sb_flags |= SB_KNOTE; - break; - case EVFILT_WRITE: - so->so_snd.sb_flags |= SB_KNOTE; - break; - case EVFILT_SOCK: - so->so_flags |= SOF_KNOTE; - break; - default: - socket_unlock(so, 1); - return (1); - } + kn->kn_flags = EV_ERROR; + kn->kn_data = EINVAL; + return 0; } - socket_unlock(so, 1); - return (0); -} -static void -filt_sordetach(struct knote *kn) -{ - struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data; + /* + * call the appropriate sub-filter attach + * with the socket still locked + */ + result = knote_fops(kn)->f_attach(kn); - socket_lock(so, 1); - if (so->so_rcv.sb_flags & SB_KNOTE) - if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn)) - so->so_rcv.sb_flags &= ~SB_KNOTE; socket_unlock(so, 1); + + return result; } -/*ARGSUSED*/ static int -filt_soread(struct knote *kn, long hint) +filt_soread_common(struct knote *kn, struct socket *so) { - struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data; - - if ((hint & SO_FILT_HINT_LOCKED) == 0) - socket_lock(so, 1); - if (so->so_options & SO_ACCEPTCONN) { - int isempty; + int is_not_empty; /* * Radar 6615193 handle the listen case dynamically @@ 
-5819,12 +5908,9 @@ filt_soread(struct knote *kn, long hint) */ kn->kn_data = so->so_qlen; - isempty = ! TAILQ_EMPTY(&so->so_comp); + is_not_empty = ! TAILQ_EMPTY(&so->so_comp); - if ((hint & SO_FILT_HINT_LOCKED) == 0) - socket_unlock(so, 1); - - return (isempty); + return (is_not_empty); } /* socket isn't a listener */ @@ -5835,26 +5921,10 @@ filt_soread(struct knote *kn, long hint) */ kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl; - /* - * Clear out EV_OOBAND that filt_soread may have set in the - * past. - */ - kn->kn_flags &= ~EV_OOBAND; - if ((so->so_oobmark) || (so->so_state & SS_RCVATMARK)) { - kn->kn_flags |= EV_OOBAND; - /* - * If caller registered explicit interest in OOB data, - * return immediately (data == amount beyond mark, for - * legacy reasons - that should be changed later). - */ - if (kn->kn_hookid == EV_OOBAND) { - /* - * When so_state is SS_RCVATMARK, so_oobmark - * is 0. - */ + if (kn->kn_sfflags & NOTE_OOB) { + if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) { + kn->kn_fflags |= NOTE_OOB; kn->kn_data -= so->so_oobmark; - if ((hint & SO_FILT_HINT_LOCKED) == 0) - socket_unlock(so, 1); return (1); } } @@ -5866,14 +5936,10 @@ filt_soread(struct knote *kn, long hint) ) { kn->kn_flags |= EV_EOF; kn->kn_fflags = so->so_error; - if ((hint & SO_FILT_HINT_LOCKED) == 0) - socket_unlock(so, 1); return (1); } if (so->so_error) { /* temporary udp error */ - if ((hint & SO_FILT_HINT_LOCKED) == 0) - socket_unlock(so, 1); return (1); } @@ -5890,9 +5956,6 @@ filt_soread(struct knote *kn, long hint) lowwat = kn->kn_sdata; } - if ((hint & SO_FILT_HINT_LOCKED) == 0) - socket_unlock(so, 1); - /* * The order below is important. 
Since NOTE_LOWAT * overrides sb_lowat, check for NOTE_LOWAT case @@ -5904,16 +5967,103 @@ filt_soread(struct knote *kn, long hint) return (so->so_rcv.sb_cc >= lowwat); } +static int +filt_sorattach(struct knote *kn) +{ + struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data; + + /* socket locked */ + + /* + * If the caller explicitly asked for OOB results (e.g. poll()) + * from EVFILT_READ, then save that off in the hookid field + * and reserve the kn_flags EV_OOBAND bit for output only. + */ + if (kn->kn_filter == EVFILT_READ && + kn->kn_flags & EV_OOBAND) { + kn->kn_flags &= ~EV_OOBAND; + kn->kn_hookid = EV_OOBAND; + } else { + kn->kn_hookid = 0; + } + if (KNOTE_ATTACH(&so->so_rcv.sb_sel.si_note, kn)) + so->so_rcv.sb_flags |= SB_KNOTE; + + /* indicate if event is already fired */ + return filt_soread_common(kn, so); +} + static void -filt_sowdetach(struct knote *kn) +filt_sordetach(struct knote *kn) { struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data; + socket_lock(so, 1); + if (so->so_rcv.sb_flags & SB_KNOTE) + if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn)) + so->so_rcv.sb_flags &= ~SB_KNOTE; + socket_unlock(so, 1); +} + +/*ARGSUSED*/ +static int +filt_soread(struct knote *kn, long hint) +{ + struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data; + int retval; + + if ((hint & SO_FILT_HINT_LOCKED) == 0) + socket_lock(so, 1); + + retval = filt_soread_common(kn, so); + + if ((hint & SO_FILT_HINT_LOCKED) == 0) + socket_unlock(so, 1); + + return retval; +} + +static int +filt_sortouch(struct knote *kn, struct kevent_internal_s *kev) +{ + struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data; + int retval; + + socket_lock(so, 1); + + /* save off the new input fflags and data */ + kn->kn_sfflags = kev->fflags; + kn->kn_sdata = kev->data; + if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0) + kn->kn_udata = kev->udata; + + /* determine if changes result in fired events */ + retval = filt_soread_common(kn, so); - if 
(so->so_snd.sb_flags & SB_KNOTE) - if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn)) - so->so_snd.sb_flags &= ~SB_KNOTE; socket_unlock(so, 1); + + return retval; +} + +static int +filt_sorprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev) +{ +#pragma unused(data) + struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data; + int retval; + + socket_lock(so, 1); + retval = filt_soread_common(kn, so); + if (retval) { + *kev = kn->kn_kevent; + if (kn->kn_flags & EV_CLEAR) { + kn->kn_fflags = 0; + kn->kn_data = 0; + } + } + socket_unlock(so, 1); + + return retval; } int @@ -5928,34 +6078,25 @@ so_wait_for_if_feedback(struct socket *so) return (0); } -/*ARGSUSED*/ static int -filt_sowrite(struct knote *kn, long hint) +filt_sowrite_common(struct knote *kn, struct socket *so) { - struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data; int ret = 0; - if ((hint & SO_FILT_HINT_LOCKED) == 0) - socket_lock(so, 1); - kn->kn_data = sbspace(&so->so_snd); if (so->so_state & SS_CANTSENDMORE) { kn->kn_flags |= EV_EOF; kn->kn_fflags = so->so_error; - ret = 1; - goto out; + return 1; } if (so->so_error) { /* temporary udp error */ - ret = 1; - goto out; + return 1; } if (!socanwrite(so)) { - ret = 0; - goto out; + return 0; } if (so->so_flags1 & SOF1_PRECONNECT_DATA) { - ret = 1; - goto out; + return 1; } int64_t lowwat = so->so_snd.sb_lowat; if (kn->kn_sfflags & NOTE_LOWAT) { @@ -5965,10 +6106,14 @@ filt_sowrite(struct knote *kn, long hint) lowwat = kn->kn_sdata; } if (kn->kn_data >= lowwat) { - if (so->so_flags & SOF_NOTSENT_LOWAT) { - if ((SOCK_DOM(so) == PF_INET - || SOCK_DOM(so) == PF_INET6) - && so->so_type == SOCK_STREAM) { + if ((so->so_flags & SOF_NOTSENT_LOWAT) +#if (DEBUG || DEVELOPMENT) + && so_notsent_lowat_check == 1 +#endif /* DEBUG || DEVELOPMENT */ + ) { + if ((SOCK_DOM(so) == PF_INET || + SOCK_DOM(so) == PF_INET6) && + so->so_type == SOCK_STREAM) { ret = tcp_notsent_lowat_check(so); } #if MPTCP @@ -5978,8 +6123,7 @@ 
filt_sowrite(struct knote *kn, long hint) } #endif else { - ret = 1; - goto out; + return 1; } } else { ret = 1; @@ -5987,36 +6131,99 @@ filt_sowrite(struct knote *kn, long hint) } if (so_wait_for_if_feedback(so)) ret = 0; -out: - if ((hint & SO_FILT_HINT_LOCKED) == 0) - socket_unlock(so, 1); return (ret); } +static int +filt_sowattach(struct knote *kn) +{ + struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data; + + /* socket locked */ + if (KNOTE_ATTACH(&so->so_snd.sb_sel.si_note, kn)) + so->so_snd.sb_flags |= SB_KNOTE; + + /* determine if its already fired */ + return filt_sowrite_common(kn, so); +} + static void -filt_sockdetach(struct knote *kn) +filt_sowdetach(struct knote *kn) { struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data; socket_lock(so, 1); - if ((so->so_flags & SOF_KNOTE) != 0) - if (KNOTE_DETACH(&so->so_klist, kn)) - so->so_flags &= ~SOF_KNOTE; + if (so->so_snd.sb_flags & SB_KNOTE) + if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn)) + so->so_snd.sb_flags &= ~SB_KNOTE; socket_unlock(so, 1); } +/*ARGSUSED*/ static int -filt_sockev(struct knote *kn, long hint) +filt_sowrite(struct knote *kn, long hint) { - int ret = 0, locked = 0; struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data; - long ev_hint = (hint & SO_FILT_HINT_EV); - uint32_t level_trigger = 0; + int ret; - if ((hint & SO_FILT_HINT_LOCKED) == 0) { + if ((hint & SO_FILT_HINT_LOCKED) == 0) socket_lock(so, 1); - locked = 1; + + ret = filt_sowrite_common(kn, so); + + if ((hint & SO_FILT_HINT_LOCKED) == 0) + socket_unlock(so, 1); + + return ret; +} + +static int +filt_sowtouch(struct knote *kn, struct kevent_internal_s *kev) +{ + struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data; + int ret; + + socket_lock(so, 1); + + /*save off the new input fflags and data */ + kn->kn_sfflags = kev->fflags; + kn->kn_sdata = kev->data; + if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0) + kn->kn_udata = kev->udata; + + /* determine if these changes result in a 
triggered event */ + ret = filt_sowrite_common(kn, so); + + socket_unlock(so, 1); + + return ret; +} + +static int +filt_sowprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev) +{ +#pragma unused(data) + struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data; + int ret; + + socket_lock(so, 1); + ret = filt_sowrite_common(kn, so); + if (ret) { + *kev = kn->kn_kevent; + if (kn->kn_flags & EV_CLEAR) { + kn->kn_fflags = 0; + kn->kn_data = 0; + } } + socket_unlock(so, 1); + return ret; +} + +static int +filt_sockev_common(struct knote *kn, struct socket *so, long ev_hint) +{ + int ret = 0; + uint32_t level_trigger = 0; if (ev_hint & SO_FILT_HINT_CONNRESET) { kn->kn_fflags |= NOTE_CONNRESET; @@ -6055,6 +6262,11 @@ filt_sockev(struct knote *kn, long hint) kn->kn_fflags |= NOTE_CONNINFO_UPDATED; } + if ((ev_hint & SO_FILT_HINT_NOTIFY_ACK) || + tcp_notify_ack_active(so)) { + kn->kn_fflags |= NOTE_NOTIFY_ACK; + } + if ((so->so_state & SS_CANTRCVMORE) #if CONTENT_FILTER && cfil_sock_data_pending(&so->so_rcv) == 0 @@ -6111,32 +6323,123 @@ filt_sockev(struct knote *kn, long hint) if ((kn->kn_fflags & ~level_trigger) != 0) ret = 1; - if (locked) - socket_unlock(so, 1); - return (ret); } +static int +filt_sockattach(struct knote *kn) +{ + struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data; + + /* socket locked */ + kn->kn_hookid = 0; + if (KNOTE_ATTACH(&so->so_klist, kn)) + so->so_flags |= SOF_KNOTE; + + /* determine if event already fired */ + return filt_sockev_common(kn, so, 0); +} + static void -filt_socktouch(struct knote *kn, struct kevent_internal_s *kev, long type) +filt_sockdetach(struct knote *kn) { -#pragma unused(kev) - switch (type) { - case EVENT_REGISTER: - { - uint32_t changed_flags; - changed_flags = (kn->kn_sfflags ^ kn->kn_hookid); + struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data; + socket_lock(so, 1); - /* - * Since we keep track of events that are already - * delivered, if any of those 
events are not requested - * anymore the state related to them can be reset - */ - kn->kn_hookid &= - ~(changed_flags & EVFILT_SOCK_LEVEL_TRIGGER_MASK); - break; + if ((so->so_flags & SOF_KNOTE) != 0) + if (KNOTE_DETACH(&so->so_klist, kn)) + so->so_flags &= ~SOF_KNOTE; + socket_unlock(so, 1); +} + +static int +filt_sockev(struct knote *kn, long hint) +{ + int ret = 0, locked = 0; + struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data; + long ev_hint = (hint & SO_FILT_HINT_EV); + + if ((hint & SO_FILT_HINT_LOCKED) == 0) { + socket_lock(so, 1); + locked = 1; } - case EVENT_PROCESS: + + ret = filt_sockev_common(kn, so, ev_hint); + + if (locked) + socket_unlock(so, 1); + + return ret; +} + + + +/* + * filt_socktouch - update event state + */ +static int +filt_socktouch( + struct knote *kn, + struct kevent_internal_s *kev) +{ + struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data; + uint32_t changed_flags; + int ret; + + socket_lock(so, 1); + + /* save off the [result] data and fflags */ + changed_flags = (kn->kn_sfflags ^ kn->kn_hookid); + + /* save off the new input fflags and data */ + kn->kn_sfflags = kev->fflags; + kn->kn_sdata = kev->data; + if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0) + kn->kn_udata = kev->udata; + + /* restrict the current results to the (smaller?) set of new interest */ + /* + * For compatibility with previous implementations, we leave kn_fflags + * as they were before. 
+ */ + //kn->kn_fflags &= kev->fflags; + + /* + * Since we keep track of events that are already + * delivered, if any of those events are not requested + * anymore the state related to them can be reset + */ + kn->kn_hookid &= + ~(changed_flags & EVFILT_SOCK_LEVEL_TRIGGER_MASK); + + /* determine if we have events to deliver */ + ret = filt_sockev_common(kn, so, 0); + + socket_unlock(so, 1); + + return ret; +} + +/* + * filt_sockprocess - query event fired state and return data + */ +static int +filt_sockprocess( + struct knote *kn, + struct filt_process_s *data, + struct kevent_internal_s *kev) +{ +#pragma unused(data) + + struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data; + int ret = 0; + + socket_lock(so, 1); + + ret = filt_sockev_common(kn, so, 0); + if (ret) { + *kev = kn->kn_kevent; + /* * Store the state of the events being delivered. This * state can be used to deliver level triggered events @@ -6145,7 +6448,7 @@ filt_socktouch(struct knote *kn, struct kevent_internal_s *kev, long type) */ if (kn->kn_fflags != 0) kn->kn_hookid |= (kn->kn_fflags & - EVFILT_SOCK_LEVEL_TRIGGER_MASK); + EVFILT_SOCK_LEVEL_TRIGGER_MASK); /* * NOTE_RESUME and NOTE_SUSPEND are an exception, deliver @@ -6156,10 +6459,16 @@ filt_socktouch(struct knote *kn, struct kevent_internal_s *kev, long type) kn->kn_hookid &= ~NOTE_RESUME; if (kn->kn_fflags & NOTE_RESUME) kn->kn_hookid &= ~NOTE_SUSPEND; - break; - default: - break; + + if (kn->kn_flags & EV_CLEAR) { + kn->kn_data = 0; + kn->kn_fflags = 0; + } } + + socket_unlock(so, 1); + + return ret; } void @@ -6167,6 +6476,13 @@ get_sockev_state(struct socket *so, u_int32_t *statep) { u_int32_t state = *(statep); + /* + * If the state variable is already used by a previous event, + * reset it. 
+ */ + if (state != 0) + return; + if (so->so_state & SS_ISCONNECTED) state |= SOCKEV_CONNECTED; else @@ -6349,18 +6665,23 @@ sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce) if (so->so_flags & SOF_NODEFUNCT) { if (noforce) { err = EOPNOTSUPP; - SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) " - "so 0x%llx [%d,%d] is not eligible for defunct " - "(%d)\n", __func__, proc_selfpid(), proc_pid(p), - level, (uint64_t)DEBUG_KERNEL_ADDRPERM(so), - SOCK_DOM(so), SOCK_TYPE(so), err)); + SODEFUNCTLOG("%s[%d, %s]: (target pid %d " + "name %s level %d) so 0x%llx [%d,%d] " + "is not eligible for defunct " + "(%d)\n", __func__, proc_selfpid(), + proc_best_name(current_proc()), proc_pid(p), + proc_best_name(p), level, + (uint64_t)DEBUG_KERNEL_ADDRPERM(so), + SOCK_DOM(so), SOCK_TYPE(so), err); return (err); } so->so_flags &= ~SOF_NODEFUNCT; - SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so 0x%llx " - "[%d,%d] defunct by force\n", __func__, proc_selfpid(), - proc_pid(p), level, (uint64_t)DEBUG_KERNEL_ADDRPERM(so), - SOCK_DOM(so), SOCK_TYPE(so))); + SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) " + "so 0x%llx [%d,%d] defunct by force\n", __func__, + proc_selfpid(), proc_best_name(current_proc()), + proc_pid(p), proc_best_name(p), level, + (uint64_t)DEBUG_KERNEL_ADDRPERM(so), + SOCK_DOM(so), SOCK_TYPE(so)); } else if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) { struct inpcb *inp = (struct inpcb *)so->so_pcb; struct ifnet *ifp = inp->inp_last_outifp; @@ -6373,20 +6694,22 @@ sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce) OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notime); } else if (noforce) { OSIncrementAtomic(&soextbkidlestat.so_xbkidle_active); - + so->so_flags1 |= SOF1_EXTEND_BK_IDLE_INPROG; so->so_extended_bk_start = net_uptime(); OSBitOrAtomic(P_LXBKIDLEINPROG, &p->p_ladvflag); - + inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY); - + err = EOPNOTSUPP; - SODEFUNCTLOG(("%s[%d]: (target pid %d 
level %d) " - "extend bk idle " - "so 0x%llx rcv hw %d cc %d\n", - __func__, proc_selfpid(), proc_pid(p), - level, (uint64_t)DEBUG_KERNEL_ADDRPERM(so), - so->so_rcv.sb_hiwat, so->so_rcv.sb_cc)); + SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s " + "level %d) extend bk idle so 0x%llx rcv hw %d " + "cc %d\n", + __func__, proc_selfpid(), + proc_best_name(current_proc()), proc_pid(p), + proc_best_name(p), level, + (uint64_t)DEBUG_KERNEL_ADDRPERM(so), + so->so_rcv.sb_hiwat, so->so_rcv.sb_cc); return (err); } else { OSIncrementAtomic(&soextbkidlestat.so_xbkidle_forced); @@ -6412,11 +6735,12 @@ sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce) } done: - SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so 0x%llx [%d,%d] %s " - "defunct%s\n", __func__, proc_selfpid(), proc_pid(p), level, - (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so), SOCK_TYPE(so), - defunct ? "is already" : "marked as", - (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ? " extbkidle" : "")); + SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) " + "so 0x%llx [%d,%d] %s defunct%s\n", __func__, proc_selfpid(), + proc_best_name(current_proc()), proc_pid(p), proc_best_name(p), + level, (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so), + SOCK_TYPE(so), defunct ? "is already" : "marked as", + (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ? 
" extbkidle" : ""); return (err); } @@ -6441,10 +6765,12 @@ sodefunct(struct proc *p, struct socket *so, int level) char d[MAX_IPv6_STR_LEN]; struct inpcb *inp = sotoinpcb(so); - SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so 0x%llx [%s " - "%s:%d -> %s:%d] is now defunct [rcv_si 0x%x, snd_si 0x%x, " - "rcv_fl 0x%x, snd_fl 0x%x]\n", __func__, proc_selfpid(), - proc_pid(p), level, (uint64_t)DEBUG_KERNEL_ADDRPERM(so), + SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) " + "so 0x%llx [%s %s:%d -> %s:%d] is now defunct " + "[rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n", + __func__, proc_selfpid(), proc_best_name(current_proc()), + proc_pid(p), proc_best_name(p), level, + (uint64_t)DEBUG_KERNEL_ADDRPERM(so), (SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP", inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ? (void *)&inp->inp_laddr.s_addr : (void *)&inp->in6p_laddr), @@ -6454,15 +6780,18 @@ sodefunct(struct proc *p, struct socket *so, int level) d, sizeof (d)), ntohs(inp->in6p_fport), (uint32_t)rcv->sb_sel.si_flags, (uint32_t)snd->sb_sel.si_flags, - rcv->sb_flags, snd->sb_flags)); + rcv->sb_flags, snd->sb_flags); } else { - SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so 0x%llx " - "[%d,%d] is now defunct [rcv_si 0x%x, snd_si 0x%x, " - "rcv_fl 0x%x, snd_fl 0x%x]\n", __func__, proc_selfpid(), - proc_pid(p), level, (uint64_t)DEBUG_KERNEL_ADDRPERM(so), - SOCK_DOM(so), SOCK_TYPE(so), (uint32_t)rcv->sb_sel.si_flags, + SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) " + "so 0x%llx [%d,%d] is now defunct [rcv_si 0x%x, " + "snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n", __func__, + proc_selfpid(), proc_best_name(current_proc()), + proc_pid(p), proc_best_name(p), level, + (uint64_t)DEBUG_KERNEL_ADDRPERM(so), + SOCK_DOM(so), SOCK_TYPE(so), + (uint32_t)rcv->sb_sel.si_flags, (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags, - snd->sb_flags)); + snd->sb_flags); } /* @@ -6508,6 +6837,7 @@ sodefunct(struct proc *p, struct socket *so, int level) 
sbrelease(snd); } so->so_state |= SS_DEFUNCT; + OSIncrementAtomicLong((volatile long *)&sodefunct_calls); done: return (0); @@ -6520,11 +6850,12 @@ soresume(struct proc *p, struct socket *so, int locked) socket_lock(so, 1); if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) { - SODEFUNCTLOG(("%s[%d]: )target pid %d) so 0x%llx [%d,%d] " - "resumed from bk idle\n", - __func__, proc_selfpid(), proc_pid(p), + SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s) so 0x%llx " + "[%d,%d] resumed from bk idle\n", + __func__, proc_selfpid(), proc_best_name(current_proc()), + proc_pid(p), proc_best_name(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so), - SOCK_DOM(so), SOCK_TYPE(so))); + SOCK_DOM(so), SOCK_TYPE(so)); so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG; so->so_extended_bk_start = 0; @@ -6592,13 +6923,13 @@ so_set_extended_bk_idle(struct socket *so, int optval) so->so_flags1 |= SOF1_EXTEND_BK_IDLE_WANTED; OSIncrementAtomic(&soextbkidlestat.so_xbkidle_wantok); } - SODEFUNCTLOG(("%s[%d]: so 0x%llx [%d,%d] " + SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] " "%s marked for extended bk idle\n", - __func__, proc_selfpid(), + __func__, proc_selfpid(), proc_best_name(current_proc()), (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so), SOCK_TYPE(so), (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ? 
- "is" : "not")); + "is" : "not"); proc_fdunlock(p); } @@ -6649,10 +6980,10 @@ so_check_extended_bk_idle_time(struct socket *so) int ret = 1; if ((so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) { - SODEFUNCTLOG(("%s[%d]: so 0x%llx [%d,%d]\n", - __func__, proc_selfpid(), + SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d]\n", + __func__, proc_selfpid(), proc_best_name(current_proc()), (uint64_t)DEBUG_KERNEL_ADDRPERM(so), - SOCK_DOM(so), SOCK_TYPE(so))); + SOCK_DOM(so), SOCK_TYPE(so)); if (net_uptime() - so->so_extended_bk_start > soextbkidlestat.so_xbkidle_time) { so_stop_extended_bk_idle(so); @@ -6667,7 +6998,7 @@ so_check_extended_bk_idle_time(struct socket *so) OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resched); } } - + return (ret); } @@ -6685,7 +7016,7 @@ resume_proc_sockets(proc_t p) struct socket *so; fp = fdp->fd_ofiles[i]; - if (fp == NULL || + if (fp == NULL || (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 || FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET) continue; diff --git a/bsd/kern/uipc_socket2.c b/bsd/kern/uipc_socket2.c index 40e7f1919..264246cb2 100644 --- a/bsd/kern/uipc_socket2.c +++ b/bsd/kern/uipc_socket2.c @@ -105,6 +105,8 @@ #define DBG_FNC_SBDROP NETDBG_CODE(DBG_NETSOCK, 4) #define DBG_FNC_SBAPPEND NETDBG_CODE(DBG_NETSOCK, 5) +extern char *proc_best_name(proc_t p); + SYSCTL_DECL(_kern_ipc); __private_extern__ u_int32_t net_io_policy_throttle_best_effort = 0; @@ -136,8 +138,8 @@ u_int32_t high_sb_max = SB_MAX; static u_int32_t sb_efficiency = 8; /* parameter for sbreserve() */ int32_t total_sbmb_cnt __attribute__((aligned(8))) = 0; +int32_t total_sbmb_cnt_floor __attribute__((aligned(8))) = 0; int32_t total_sbmb_cnt_peak __attribute__((aligned(8))) = 0; -int32_t total_snd_byte_count __attribute__((aligned(8))) = 0; int64_t sbmb_limreached __attribute__((aligned(8))) = 0; /* Control whether to throttle sockets eligible to be throttled */ @@ -377,8 +379,8 @@ sonewconn_internal(struct socket *head, int connstatus) #endif /* inherit traffic management 
properties of listener */ - so->so_traffic_mgt_flags = - head->so_traffic_mgt_flags & (TRAFFIC_MGT_SO_BACKGROUND); + so->so_flags1 |= + head->so_flags1 & (SOF1_TRAFFIC_MGT_SO_BACKGROUND); so->so_background_thread = head->so_background_thread; so->so_traffic_class = head->so_traffic_class; @@ -514,6 +516,18 @@ sbwait(struct sockbuf *sb) /* NOTREACHED */ } + if ((so->so_state & SS_DRAINING) || (so->so_flags & SOF_DEFUNCT)) { + error = EBADF; + if (so->so_flags & SOF_DEFUNCT) { + SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] " + "(%d)\n", __func__, proc_selfpid(), + proc_best_name(current_proc()), + (uint64_t)VM_KERNEL_ADDRPERM(so), + SOCK_DOM(so), SOCK_TYPE(so), error); + } + return (error); + } + if (so->so_proto->pr_getlock != NULL) mutex_held = (*so->so_proto->pr_getlock)(so, 0); else @@ -544,10 +558,11 @@ sbwait(struct sockbuf *sb) if ((so->so_state & SS_DRAINING) || (so->so_flags & SOF_DEFUNCT)) { error = EBADF; if (so->so_flags & SOF_DEFUNCT) { - SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] " + SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] " "(%d)\n", __func__, proc_selfpid(), + proc_best_name(current_proc()), (uint64_t)VM_KERNEL_ADDRPERM(so), - SOCK_DOM(so), SOCK_TYPE(so), error)); + SOCK_DOM(so), SOCK_TYPE(so), error); } } @@ -570,11 +585,12 @@ void sowakeup(struct socket *so, struct sockbuf *sb) { if (so->so_flags & SOF_DEFUNCT) { - SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] si 0x%x, " + SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] si 0x%x, " "fl 0x%x [%s]\n", __func__, proc_selfpid(), + proc_best_name(current_proc()), (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so), SOCK_TYPE(so), (uint32_t)sb->sb_sel.si_flags, sb->sb_flags, - (sb->sb_flags & SB_RECV) ? "rcv" : "snd")); + (sb->sb_flags & SB_RECV) ? 
"rcv" : "snd"); } sb->sb_flags &= ~SB_SEL; @@ -1691,6 +1707,14 @@ sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n) /* XXX: Probably don't need */ sb->sb_ctl += m->m_len; } + + /* update send byte count */ + if (sb->sb_flags & SB_SNDBYTE_CNT) { + inp_incr_sndbytes_total(sb->sb_so, + m->m_len); + inp_incr_sndbytes_unsent(sb->sb_so, + m->m_len); + } m = m_free(m); continue; } @@ -1749,9 +1773,6 @@ sbflush(struct sockbuf *sb) { void *lr_saved = __builtin_return_address(0); struct socket *so = sb->sb_so; -#ifdef notyet - lck_mtx_t *mutex_held; -#endif u_int32_t i; /* so_usecount may be 0 if we get here from sofreelastref() */ @@ -1765,19 +1786,6 @@ sbflush(struct sockbuf *sb) so->so_usecount, lr_saved, solockhistory_nr(so)); /* NOTREACHED */ } -#ifdef notyet - /* - * XXX: This code is currently commented out, because we may get here - * as part of sofreelastref(), and at that time, pr_getlock() may no - * longer be able to return us the lock; this will be fixed in future. - */ - if (so->so_proto->pr_getlock != NULL) - mutex_held = (*so->so_proto->pr_getlock)(so, 0); - else - mutex_held = so->so_proto->pr_domain->dom_mtx; - - lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED); -#endif /* * Obtain lock on the socket buffer (SB_LOCK). 
This is required @@ -1848,8 +1856,8 @@ sbdrop(struct sockbuf *sb, int len) ml = (struct mbuf *)0; while (len > 0) { - if (m == 0) { - if (next == 0) { + if (m == NULL) { + if (next == NULL) { /* * temporarily replacing this panic with printf * because it occurs occasionally when closing @@ -1881,6 +1889,9 @@ sbdrop(struct sockbuf *sb, int len) m->m_len -= len; m->m_data += len; sb->sb_cc -= len; + /* update the send byte count */ + if (sb->sb_flags & SB_SNDBYTE_CNT) + inp_decr_sndbytes_total(sb->sb_so, len); if (m->m_type != MT_DATA && m->m_type != MT_HEADER && m->m_type != MT_OOBDATA) sb->sb_ctl -= len; @@ -2197,7 +2208,7 @@ pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr, } int -pru_soreceive_list_notsupp(struct socket *so, +pru_soreceive_list_notsupp(struct socket *so, struct recv_msg_elem *recv_msg_array, u_int uiocnt, int *flagsp) { #pragma unused(so, recv_msg_array, uiocnt, flagsp) @@ -2426,11 +2437,13 @@ sballoc(struct sockbuf *sb, struct mbuf *m) total_sbmb_cnt_peak = total_sbmb_cnt; /* - * If data is being appended to the send socket buffer, + * If data is being added to the send socket buffer, * update the send byte count */ - if (!(sb->sb_flags & SB_RECV)) - OSAddAtomic(cnt, &total_snd_byte_count); + if (sb->sb_flags & SB_SNDBYTE_CNT) { + inp_incr_sndbytes_total(sb->sb_so, m->m_len); + inp_incr_sndbytes_unsent(sb->sb_so, m->m_len); + } } /* adjust counters in sb reflecting freeing of m */ @@ -2450,14 +2463,15 @@ sbfree(struct sockbuf *sb, struct mbuf *m) } OSAddAtomic(cnt, &total_sbmb_cnt); VERIFY(total_sbmb_cnt >= 0); + if (total_sbmb_cnt < total_sbmb_cnt_floor) + total_sbmb_cnt_floor = total_sbmb_cnt; /* * If data is being removed from the send socket buffer, * update the send byte count */ - if (!(sb->sb_flags & SB_RECV)) { - OSAddAtomic(cnt, &total_snd_byte_count); - } + if (sb->sb_flags & SB_SNDBYTE_CNT) + inp_decr_sndbytes_total(sb->sb_so, m->m_len); } /* @@ -2550,10 +2564,11 @@ sblock(struct sockbuf *sb, uint32_t flags) if (error 
== 0 && (so->so_flags & SOF_DEFUNCT) && !(flags & SBL_IGNDEFUNCT)) { error = EBADF; - SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] " + SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] " "(%d)\n", __func__, proc_selfpid(), + proc_best_name(current_proc()), (uint64_t)VM_KERNEL_ADDRPERM(so), - SOCK_DOM(so), SOCK_TYPE(so), error)); + SOCK_DOM(so), SOCK_TYPE(so), error); } if (error != 0) @@ -2863,7 +2878,7 @@ soisthrottled(struct socket *so) * application, as we're missing the system wide "decision maker" */ return ( - (so->so_traffic_mgt_flags & TRAFFIC_MGT_SO_BACKGROUND)); + (so->so_flags1 & SOF1_TRAFFIC_MGT_SO_BACKGROUND)); } inline int @@ -2875,7 +2890,7 @@ soisprivilegedtraffic(struct socket *so) inline int soissrcbackground(struct socket *so) { - return ((so->so_traffic_mgt_flags & TRAFFIC_MGT_SO_BACKGROUND) || + return ((so->so_flags1 & SOF1_TRAFFIC_MGT_SO_BACKGROUND) || IS_SO_TC_BACKGROUND(so->so_traffic_class)); } @@ -2940,8 +2955,8 @@ sysctl_io_policy_throttled SYSCTL_HANDLER_ARGS return (err); if (i != net_io_policy_throttled) - SOTHROTTLELOG(("throttle: network IO policy throttling is " - "now %s\n", i ? "ON" : "OFF")); + SOTHROTTLELOG("throttle: network IO policy throttling is " + "now %s\n", i ? 
"ON" : "OFF"); net_io_policy_throttled = i; @@ -2971,6 +2986,16 @@ SYSCTL_INT(_kern_ipc, KIPC_SOQLIMITCOMPAT, soqlimitcompat, SYSCTL_INT(_kern_ipc, OID_AUTO, soqlencomp, CTLFLAG_RW | CTLFLAG_LOCKED, &soqlencomp, 0, "Listen backlog represents only complete queue"); +SYSCTL_INT(_kern_ipc, OID_AUTO, sbmb_cnt, CTLFLAG_RD | CTLFLAG_LOCKED, + &total_sbmb_cnt, 0, ""); +SYSCTL_INT(_kern_ipc, OID_AUTO, sbmb_cnt_peak, CTLFLAG_RD | CTLFLAG_LOCKED, + &total_sbmb_cnt_peak, 0, ""); +SYSCTL_INT(_kern_ipc, OID_AUTO, sbmb_cnt_floor, CTLFLAG_RD | CTLFLAG_LOCKED, + &total_sbmb_cnt_floor, 0, ""); +SYSCTL_QUAD(_kern_ipc, OID_AUTO, sbmb_limreached, CTLFLAG_RD | CTLFLAG_LOCKED, + &sbmb_limreached, ""); + + SYSCTL_NODE(_kern_ipc, OID_AUTO, io_policy, CTLFLAG_RW, 0, "network IO policy"); SYSCTL_PROC(_kern_ipc_io_policy, OID_AUTO, throttled, diff --git a/bsd/kern/uipc_usrreq.c b/bsd/kern/uipc_usrreq.c index f94c740fd..8ae71f6e7 100644 --- a/bsd/kern/uipc_usrreq.c +++ b/bsd/kern/uipc_usrreq.c @@ -1131,7 +1131,7 @@ unp_connect(struct socket *so, struct sockaddr *nam, __unused proc_t p) } #if CONFIG_MACF_SOCKET_SUBSET - error = mac_vnode_check_uipc_connect(ctx, vp); + error = mac_vnode_check_uipc_connect(ctx, vp, so); if (error) { socket_lock(so, 0); goto out; diff --git a/bsd/kern/vm_pressure.c b/bsd/kern/vm_pressure.c deleted file mode 100644 index 028411c9a..000000000 --- a/bsd/kern/vm_pressure.c +++ /dev/null @@ -1,729 +0,0 @@ -/* - * Copyright (c) 2009-2010 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#if CONFIG_MEMORYSTATUS -#include -#endif - -/* - * This value is the threshold that a process must meet to be considered for scavenging. - */ -#define VM_PRESSURE_MINIMUM_RSIZE 10 /* MB */ - -#define VM_PRESSURE_NOTIFY_WAIT_PERIOD 10000 /* milliseconds */ - -void vm_pressure_klist_lock(void); -void vm_pressure_klist_unlock(void); - -static void vm_dispatch_memory_pressure(void); -void vm_reset_active_list(void); - -#if CONFIG_MEMORYSTATUS -static kern_return_t vm_try_pressure_candidates(boolean_t target_foreground_process); -#endif - -static lck_mtx_t vm_pressure_klist_mutex; - -struct klist vm_pressure_klist; -struct klist vm_pressure_klist_dormant; - -#if DEBUG -#define VM_PRESSURE_DEBUG(cond, format, ...) 
\ -do { \ - if (cond) { printf(format, ##__VA_ARGS__); } \ -} while(0) -#else -#define VM_PRESSURE_DEBUG(cond, format, ...) -#endif - -void vm_pressure_init(lck_grp_t *grp, lck_attr_t *attr) { - lck_mtx_init(&vm_pressure_klist_mutex, grp, attr); -} - -void vm_pressure_klist_lock(void) { - lck_mtx_lock(&vm_pressure_klist_mutex); -} - -void vm_pressure_klist_unlock(void) { - lck_mtx_unlock(&vm_pressure_klist_mutex); -} - -int vm_knote_register(struct knote *kn) { - int rv = 0; - - vm_pressure_klist_lock(); - - if ((kn->kn_sfflags) & (NOTE_VM_PRESSURE)) { - KNOTE_ATTACH(&vm_pressure_klist, kn); - } else { - rv = ENOTSUP; - } - - vm_pressure_klist_unlock(); - - return rv; -} - -void vm_knote_unregister(struct knote *kn) { - struct knote *kn_temp; - - vm_pressure_klist_lock(); - - VM_PRESSURE_DEBUG(0, "[vm_pressure] process %d cancelling pressure notification\n", kn->kn_kq->kq_p->p_pid); - - SLIST_FOREACH(kn_temp, &vm_pressure_klist, kn_selnext) { - if (kn_temp == kn) { - KNOTE_DETACH(&vm_pressure_klist, kn); - vm_pressure_klist_unlock(); - return; - } - } - - SLIST_FOREACH(kn_temp, &vm_pressure_klist_dormant, kn_selnext) { - if (kn_temp == kn) { - KNOTE_DETACH(&vm_pressure_klist_dormant, kn); - vm_pressure_klist_unlock(); - return; - } - } - - vm_pressure_klist_unlock(); -} - -void vm_pressure_proc_cleanup(proc_t p) -{ - struct knote *kn = NULL; - - vm_pressure_klist_lock(); - - VM_PRESSURE_DEBUG(0, "[vm_pressure] process %d exiting pressure notification\n", p->p_pid); - - SLIST_FOREACH(kn, &vm_pressure_klist, kn_selnext) { - if (kn->kn_kq->kq_p == p) { - KNOTE_DETACH(&vm_pressure_klist, kn); - vm_pressure_klist_unlock(); - return; - } - } - - SLIST_FOREACH(kn, &vm_pressure_klist_dormant, kn_selnext) { - if (kn->kn_kq->kq_p == p) { - KNOTE_DETACH(&vm_pressure_klist_dormant, kn); - vm_pressure_klist_unlock(); - return; - } - } - - vm_pressure_klist_unlock(); -} - -/* - * Used by the vm_pressure_thread which is - * signalled from within vm_pageout_scan(). 
- */ -void consider_vm_pressure_events(void) -{ - vm_dispatch_memory_pressure(); -} - -#if CONFIG_MEMORYSTATUS - -/* Jetsam aware version. Called with lock held */ - -struct knote *vm_find_knote_from_pid(pid_t, struct klist *); - -struct knote *vm_find_knote_from_pid(pid_t pid, struct klist *list) { - struct knote *kn = NULL; - - SLIST_FOREACH(kn, list, kn_selnext) { - struct proc *p; - pid_t current_pid; - - p = kn->kn_kq->kq_p; - current_pid = p->p_pid; - - if (current_pid == pid) { - break; - } - } - - return kn; -} - -int vm_dispatch_pressure_note_to_pid(pid_t pid, boolean_t locked) { - int ret = EINVAL; - struct knote *kn; - - VM_PRESSURE_DEBUG(1, "vm_dispatch_pressure_note_to_pid(): pid %d\n", pid); - - if (!locked) { - vm_pressure_klist_lock(); - } - - /* - * Because we're specifically targeting a process here, we don't care - * if a warning has already been sent and it's moved to the dormant - * list; check that too. - */ - kn = vm_find_knote_from_pid(pid, &vm_pressure_klist); - if (kn) { - KNOTE(&vm_pressure_klist, pid); - ret = 0; - } else { - kn = vm_find_knote_from_pid(pid, &vm_pressure_klist_dormant); - if (kn) { - KNOTE(&vm_pressure_klist_dormant, pid); - ret = 0; - } - } - - if (!locked) { - vm_pressure_klist_unlock(); - } - - return ret; -} - -void vm_find_pressure_foreground_candidates(void) -{ - struct knote *kn, *kn_tmp; - struct klist dispatch_klist = { NULL }; - - vm_pressure_klist_lock(); - proc_list_lock(); - - /* Find the foreground processes. 
*/ - SLIST_FOREACH_SAFE(kn, &vm_pressure_klist, kn_selnext, kn_tmp) { - proc_t p = kn->kn_kq->kq_p; - - if (memorystatus_is_foreground_locked(p)) { - KNOTE_DETACH(&vm_pressure_klist, kn); - KNOTE_ATTACH(&dispatch_klist, kn); - } - } - - SLIST_FOREACH_SAFE(kn, &vm_pressure_klist_dormant, kn_selnext, kn_tmp) { - proc_t p = kn->kn_kq->kq_p; - - if (memorystatus_is_foreground_locked(p)) { - KNOTE_DETACH(&vm_pressure_klist_dormant, kn); - KNOTE_ATTACH(&dispatch_klist, kn); - } - } - - proc_list_unlock(); - - /* Dispatch pressure notifications accordingly */ - SLIST_FOREACH_SAFE(kn, &dispatch_klist, kn_selnext, kn_tmp) { - proc_t p = kn->kn_kq->kq_p; - - proc_list_lock(); - if (p != proc_ref_locked(p)) { - proc_list_unlock(); - KNOTE_DETACH(&dispatch_klist, kn); - KNOTE_ATTACH(&vm_pressure_klist_dormant, kn); - continue; - } - proc_list_unlock(); - - VM_PRESSURE_DEBUG(1, "[vm_pressure] sending event to pid %d\n", kn->kn_kq->kq_p->p_pid); - KNOTE(&dispatch_klist, p->p_pid); - KNOTE_DETACH(&dispatch_klist, kn); - KNOTE_ATTACH(&vm_pressure_klist_dormant, kn); - microuptime(&p->vm_pressure_last_notify_tstamp); - memorystatus_send_pressure_note(p->p_pid); - proc_rele(p); - } - - vm_pressure_klist_unlock(); -} - -void vm_find_pressure_candidate(void) -{ - struct knote *kn = NULL, *kn_max = NULL; - unsigned int resident_max = 0; - pid_t target_pid = -1; - struct klist dispatch_klist = { NULL }; - struct timeval curr_tstamp = {0, 0}; - int elapsed_msecs = 0; - proc_t target_proc = PROC_NULL; - kern_return_t kr = KERN_SUCCESS; - - microuptime(&curr_tstamp); - - vm_pressure_klist_lock(); - - SLIST_FOREACH(kn, &vm_pressure_klist, kn_selnext) {\ - struct mach_task_basic_info basic_info; - mach_msg_type_number_t size = MACH_TASK_BASIC_INFO_COUNT; - unsigned int resident_size = 0; - proc_t p = PROC_NULL; - struct task* t = TASK_NULL; - - p = kn->kn_kq->kq_p; - proc_list_lock(); - if (p != proc_ref_locked(p)) { - p = PROC_NULL; - proc_list_unlock(); - continue; - } - 
proc_list_unlock(); - - t = (struct task *)(p->task); - - timevalsub(&curr_tstamp, &p->vm_pressure_last_notify_tstamp); - elapsed_msecs = curr_tstamp.tv_sec * 1000 + curr_tstamp.tv_usec / 1000; - - if (elapsed_msecs < VM_PRESSURE_NOTIFY_WAIT_PERIOD) { - proc_rele(p); - continue; - } - - if (!memorystatus_bg_pressure_eligible(p)) { - VM_PRESSURE_DEBUG(1, "[vm_pressure] skipping process %d\n", p->p_pid); - proc_rele(p); - continue; - } - - if( ( kr = task_info(t, MACH_TASK_BASIC_INFO, (task_info_t)(&basic_info), &size)) != KERN_SUCCESS ) { - VM_PRESSURE_DEBUG(1, "[vm_pressure] task_info for pid %d failed\n", p->p_pid); - proc_rele(p); - continue; - } - - /* - * We don't want a small process to block large processes from - * being notified again. - */ - resident_size = (basic_info.resident_size)/(1024 * 1024); - if (resident_size >= VM_PRESSURE_MINIMUM_RSIZE) { - if (resident_size > resident_max) { - resident_max = resident_size; - kn_max = kn; - target_pid = p->p_pid; - target_proc = p; - } - } else { - /* There was no candidate with enough resident memory to scavenge */ - VM_PRESSURE_DEBUG(1, "[vm_pressure] threshold failed for pid %d with %u resident...\n", p->p_pid, resident_size); - } - proc_rele(p); - } - - if (kn_max == NULL || target_pid == -1) { - VM_PRESSURE_DEBUG(1, "[vm_pressure] - no target found!\n"); - goto exit; - } - - VM_DEBUG_CONSTANT_EVENT(vm_pressure_event, VM_PRESSURE_EVENT, DBG_FUNC_NONE, target_pid, resident_max, 0, 0); - VM_PRESSURE_DEBUG(1, "[vm_pressure] sending event to pid %d with %u resident\n", kn_max->kn_kq->kq_p->p_pid, resident_max); - - KNOTE_DETACH(&vm_pressure_klist, kn_max); - - target_proc = proc_find(target_pid); - if (target_proc != PROC_NULL) { - KNOTE_ATTACH(&dispatch_klist, kn_max); - KNOTE(&dispatch_klist, target_pid); - KNOTE_ATTACH(&vm_pressure_klist_dormant, kn_max); - memorystatus_send_pressure_note(target_pid); - microuptime(&target_proc->vm_pressure_last_notify_tstamp); - proc_rele(target_proc); - } - -exit: - 
vm_pressure_klist_unlock(); -} -#endif /* CONFIG_MEMORYSTATUS */ - - -struct knote * -vm_pressure_select_optimal_candidate_to_notify(struct klist *candidate_list, int level, boolean_t target_foreground_process); - -kern_return_t vm_pressure_notification_without_levels(boolean_t target_foreground_process); -kern_return_t vm_pressure_notify_dispatch_vm_clients(boolean_t target_foreground_process); - -kern_return_t -vm_pressure_notify_dispatch_vm_clients(boolean_t target_foreground_process) -{ - vm_pressure_klist_lock(); - - if (SLIST_EMPTY(&vm_pressure_klist)) { - vm_reset_active_list(); - } - - if (!SLIST_EMPTY(&vm_pressure_klist)) { - - VM_PRESSURE_DEBUG(1, "[vm_pressure] vm_dispatch_memory_pressure\n"); - - if (KERN_SUCCESS == vm_try_pressure_candidates(target_foreground_process)) { - vm_pressure_klist_unlock(); - return KERN_SUCCESS; - } - } - - VM_PRESSURE_DEBUG(1, "[vm_pressure] could not find suitable event candidate\n"); - - vm_pressure_klist_unlock(); - - return KERN_FAILURE; -} - -static void vm_dispatch_memory_pressure(void) -{ - memorystatus_update_vm_pressure(FALSE); -} - -extern vm_pressure_level_t -convert_internal_pressure_level_to_dispatch_level(vm_pressure_level_t); - -struct knote * -vm_pressure_select_optimal_candidate_to_notify(struct klist *candidate_list, int level, boolean_t target_foreground_process) -{ - struct knote *kn = NULL, *kn_max = NULL; - unsigned int resident_max = 0; - struct timeval curr_tstamp = {0, 0}; - int elapsed_msecs = 0; - int selected_task_importance = 0; - static int pressure_snapshot = -1; - boolean_t pressure_increase = FALSE; - - if (level != -1) { - - if (pressure_snapshot == -1) { - /* - * Initial snapshot. 
- */ - pressure_snapshot = level; - pressure_increase = TRUE; - } else { - - if (level >= pressure_snapshot) { - pressure_increase = TRUE; - } else { - pressure_increase = FALSE; - } - - pressure_snapshot = level; - } - } - - if ((level > 0) && (pressure_increase) == TRUE) { - /* - * We'll start by considering the largest - * unimportant task in our list. - */ - selected_task_importance = INT_MAX; - } else { - /* - * We'll start by considering the largest - * important task in our list. - */ - selected_task_importance = 0; - } - - microuptime(&curr_tstamp); - - SLIST_FOREACH(kn, candidate_list, kn_selnext) { - - unsigned int resident_size = 0; - proc_t p = PROC_NULL; - struct task* t = TASK_NULL; - int curr_task_importance = 0; - boolean_t consider_knote = FALSE; - boolean_t privileged_listener = FALSE; - - p = kn->kn_kq->kq_p; - proc_list_lock(); - if (p != proc_ref_locked(p)) { - p = PROC_NULL; - proc_list_unlock(); - continue; - } - proc_list_unlock(); - -#if CONFIG_MEMORYSTATUS - if (target_foreground_process == TRUE && !memorystatus_is_foreground_locked(p)) { - /* - * Skip process not marked foreground. - */ - proc_rele(p); - continue; - } -#endif /* CONFIG_MEMORYSTATUS */ - - t = (struct task *)(p->task); - - timevalsub(&curr_tstamp, &p->vm_pressure_last_notify_tstamp); - elapsed_msecs = curr_tstamp.tv_sec * 1000 + curr_tstamp.tv_usec / 1000; - - if ((level == -1) && (elapsed_msecs < VM_PRESSURE_NOTIFY_WAIT_PERIOD)) { - proc_rele(p); - continue; - } - - if (level != -1) { - /* - * For the level based notifications, check and see if this knote is - * registered for the current level. 
- */ - vm_pressure_level_t dispatch_level = convert_internal_pressure_level_to_dispatch_level(level); - - if ((kn->kn_sfflags & dispatch_level) == 0) { - proc_rele(p); - continue; - } - } - -#if CONFIG_MEMORYSTATUS - if (target_foreground_process == FALSE && !memorystatus_bg_pressure_eligible(p)) { - VM_PRESSURE_DEBUG(1, "[vm_pressure] skipping process %d\n", p->p_pid); - proc_rele(p); - continue; - } -#endif /* CONFIG_MEMORYSTATUS */ - - curr_task_importance = task_importance_estimate(t); - - /* - * Privileged listeners are only considered in the multi-level pressure scheme - * AND only if the pressure is increasing. - */ - if (level > 0) { - - if (task_has_been_notified(t, level) == FALSE) { - - /* - * Is this a privileged listener? - */ - if (task_low_mem_privileged_listener(t, FALSE, &privileged_listener) == 0) { - - if (privileged_listener) { - kn_max = kn; - proc_rele(p); - goto done_scanning; - } - } - } else { - proc_rele(p); - continue; - } - } else if (level == 0) { - - /* - * Task wasn't notified when the pressure was increasing and so - * no need to notify it that the pressure is decreasing. - */ - if ((task_has_been_notified(t, kVMPressureWarning) == FALSE) && (task_has_been_notified(t, kVMPressureCritical) == FALSE)) { - proc_rele(p); - continue; - } - } - - /* - * We don't want a small process to block large processes from - * being notified again. - */ - resident_size = (get_task_phys_footprint(t))/(1024*1024ULL); //(MB); - - if (resident_size >= VM_PRESSURE_MINIMUM_RSIZE) { - - if (level > 0) { - /* - * Warning or Critical Pressure. 
- */ - if (pressure_increase) { - if ((curr_task_importance < selected_task_importance) || - ((curr_task_importance == selected_task_importance) && (resident_size > resident_max))) { - - /* - * We have found a candidate process which is: - * a) at a lower importance than the current selected process - * OR - * b) has importance equal to that of the current selected process but is larger - */ - - consider_knote = TRUE; - } - } else { - if ((curr_task_importance > selected_task_importance) || - ((curr_task_importance == selected_task_importance) && (resident_size > resident_max))) { - - /* - * We have found a candidate process which is: - * a) at a higher importance than the current selected process - * OR - * b) has importance equal to that of the current selected process but is larger - */ - - consider_knote = TRUE; - } - } - } else if (level == 0) { - /* - * Pressure back to normal. - */ - if ((curr_task_importance > selected_task_importance) || - ((curr_task_importance == selected_task_importance) && (resident_size > resident_max))) { - - consider_knote = TRUE; - } - } else if (level == -1) { - - /* - * Simple (importance and level)-free behavior based solely on RSIZE. 
- */ - if (resident_size > resident_max) { - consider_knote = TRUE; - } - } - - - if (consider_knote) { - resident_max = resident_size; - kn_max = kn; - selected_task_importance = curr_task_importance; - consider_knote = FALSE; /* reset for the next candidate */ - } - } else { - /* There was no candidate with enough resident memory to scavenge */ - VM_PRESSURE_DEBUG(0, "[vm_pressure] threshold failed for pid %d with %u resident...\n", p->p_pid, resident_size); - } - proc_rele(p); - } - -done_scanning: - if (kn_max) { - VM_DEBUG_CONSTANT_EVENT(vm_pressure_event, VM_PRESSURE_EVENT, DBG_FUNC_NONE, kn_max->kn_kq->kq_p->p_pid, resident_max, 0, 0); - VM_PRESSURE_DEBUG(1, "[vm_pressure] sending event to pid %d with %u resident\n", kn_max->kn_kq->kq_p->p_pid, resident_max); - } - - return kn_max; -} - -/* - * vm_pressure_klist_lock is held for this routine. - */ -kern_return_t vm_pressure_notification_without_levels(boolean_t target_foreground_process) -{ - struct knote *kn_max = NULL; - pid_t target_pid = -1; - struct klist dispatch_klist = { NULL }; - proc_t target_proc = PROC_NULL; - struct klist *candidate_list = NULL; - - candidate_list = &vm_pressure_klist; - - kn_max = vm_pressure_select_optimal_candidate_to_notify(candidate_list, -1, target_foreground_process); - - if (kn_max == NULL) { - if (target_foreground_process) { - /* - * Doesn't matter if the process had been notified earlier on. - * This is a very specific request. Deliver it. 
- */ - candidate_list = &vm_pressure_klist_dormant; - kn_max = vm_pressure_select_optimal_candidate_to_notify(candidate_list, -1, target_foreground_process); - } - - if (kn_max == NULL) { - return KERN_FAILURE; - } - } - - target_proc = kn_max->kn_kq->kq_p; - - KNOTE_DETACH(candidate_list, kn_max); - - if (target_proc != PROC_NULL) { - - target_pid = target_proc->p_pid; - - memoryshot(VM_PRESSURE_EVENT, DBG_FUNC_NONE); - - KNOTE_ATTACH(&dispatch_klist, kn_max); - KNOTE(&dispatch_klist, target_pid); - KNOTE_ATTACH(&vm_pressure_klist_dormant, kn_max); - -#if CONFIG_MEMORYSTATUS - memorystatus_send_pressure_note(target_pid); -#endif /* CONFIG_MEMORYSTATUS */ - - microuptime(&target_proc->vm_pressure_last_notify_tstamp); - } - - return KERN_SUCCESS; -} - -static kern_return_t vm_try_pressure_candidates(boolean_t target_foreground_process) -{ - /* - * This takes care of candidates that use NOTE_VM_PRESSURE. - * It's a notification without indication of the level - * of memory pressure. - */ - return (vm_pressure_notification_without_levels(target_foreground_process)); -} - -/* - * Remove all elements from the dormant list and place them on the active list. - * Called with klist lock held. 
- */ -void vm_reset_active_list(void) { - /* Re-charge the main list from the dormant list if possible */ - if (!SLIST_EMPTY(&vm_pressure_klist_dormant)) { - struct knote *kn; - - VM_PRESSURE_DEBUG(1, "[vm_pressure] recharging main list from dormant list\n"); - - while (!SLIST_EMPTY(&vm_pressure_klist_dormant)) { - kn = SLIST_FIRST(&vm_pressure_klist_dormant); - SLIST_REMOVE_HEAD(&vm_pressure_klist_dormant, kn_selnext); - SLIST_INSERT_HEAD(&vm_pressure_klist, kn, kn_selnext); - } - } -} diff --git a/bsd/libkern/Makefile b/bsd/libkern/Makefile index 65e206405..6b0060acd 100644 --- a/bsd/libkern/Makefile +++ b/bsd/libkern/Makefile @@ -3,20 +3,15 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - include $(MakeInc_cmd) include $(MakeInc_def) - KERNELFILES = \ libkern.h - EXPORT_MI_LIST = ${KERNELFILES} EXPORT_MI_DIR = libkern include $(MakeInc_rule) include $(MakeInc_dir) - - diff --git a/bsd/libkern/crc16.c b/bsd/libkern/crc16.c new file mode 100644 index 000000000..d0358dd0d --- /dev/null +++ b/bsd/libkern/crc16.c @@ -0,0 +1,64 @@ +/* + * Copyright (C) 2016 Apple Inc. All rights reserved. + * + * This document is the property of Apple Inc. + * It is considered confidential and proprietary. + * + * This document may not be reproduced or transmitted in any form, + * in whole or in part, without the express written permission of + * Apple Inc. + * + * CRC-16-ANSI (aka CRC-16-IBM) Polynomial: x^16 + x^15 + x^2 + 1 + * Derived from Craig Marciniak's "Craig's Portable CRC16 Library." 
+ */ + +#include + +static uint16_t crc16_tab[256] = { + 0x0000, 0xc0c1, 0xc181, 0x0140, 0xc301, 0x03c0, 0x0280, 0xc241, + 0xc601, 0x06c0, 0x0780, 0xc741, 0x0500, 0xc5c1, 0xc481, 0x0440, + 0xcc01, 0x0cc0, 0x0d80, 0xcd41, 0x0f00, 0xcfc1, 0xce81, 0x0e40, + 0x0a00, 0xcac1, 0xcb81, 0x0b40, 0xc901, 0x09c0, 0x0880, 0xc841, + 0xd801, 0x18c0, 0x1980, 0xd941, 0x1b00, 0xdbc1, 0xda81, 0x1a40, + 0x1e00, 0xdec1, 0xdf81, 0x1f40, 0xdd01, 0x1dc0, 0x1c80, 0xdc41, + 0x1400, 0xd4c1, 0xd581, 0x1540, 0xd701, 0x17c0, 0x1680, 0xd641, + 0xd201, 0x12c0, 0x1380, 0xd341, 0x1100, 0xd1c1, 0xd081, 0x1040, + 0xf001, 0x30c0, 0x3180, 0xf141, 0x3300, 0xf3c1, 0xf281, 0x3240, + 0x3600, 0xf6c1, 0xf781, 0x3740, 0xf501, 0x35c0, 0x3480, 0xf441, + 0x3c00, 0xfcc1, 0xfd81, 0x3d40, 0xff01, 0x3fc0, 0x3e80, 0xfe41, + 0xfa01, 0x3ac0, 0x3b80, 0xfb41, 0x3900, 0xf9c1, 0xf881, 0x3840, + 0x2800, 0xe8c1, 0xe981, 0x2940, 0xeb01, 0x2bc0, 0x2a80, 0xea41, + 0xee01, 0x2ec0, 0x2f80, 0xef41, 0x2d00, 0xedc1, 0xec81, 0x2c40, + 0xe401, 0x24c0, 0x2580, 0xe541, 0x2700, 0xe7c1, 0xe681, 0x2640, + 0x2200, 0xe2c1, 0xe381, 0x2340, 0xe101, 0x21c0, 0x2080, 0xe041, + 0xa001, 0x60c0, 0x6180, 0xa141, 0x6300, 0xa3c1, 0xa281, 0x6240, + 0x6600, 0xa6c1, 0xa781, 0x6740, 0xa501, 0x65c0, 0x6480, 0xa441, + 0x6c00, 0xacc1, 0xad81, 0x6d40, 0xaf01, 0x6fc0, 0x6e80, 0xae41, + 0xaa01, 0x6ac0, 0x6b80, 0xab41, 0x6900, 0xa9c1, 0xa881, 0x6840, + 0x7800, 0xb8c1, 0xb981, 0x7940, 0xbb01, 0x7bc0, 0x7a80, 0xba41, + 0xbe01, 0x7ec0, 0x7f80, 0xbf41, 0x7d00, 0xbdc1, 0xbc81, 0x7c40, + 0xb401, 0x74c0, 0x7580, 0xb541, 0x7700, 0xb7c1, 0xb681, 0x7640, + 0x7200, 0xb2c1, 0xb381, 0x7340, 0xb101, 0x71c0, 0x7080, 0xb041, + 0x5000, 0x90c1, 0x9181, 0x5140, 0x9301, 0x53c0, 0x5280, 0x9241, + 0x9601, 0x56c0, 0x5780, 0x9741, 0x5500, 0x95c1, 0x9481, 0x5440, + 0x9c01, 0x5cc0, 0x5d80, 0x9d41, 0x5f00, 0x9fc1, 0x9e81, 0x5e40, + 0x5a00, 0x9ac1, 0x9b81, 0x5b40, 0x9901, 0x59c0, 0x5880, 0x9841, + 0x8801, 0x48c0, 0x4980, 0x8941, 0x4b00, 0x8bc1, 0x8a81, 0x4a40, + 0x4e00, 0x8ec1, 0x8f81, 
0x4f40, 0x8d01, 0x4dc0, 0x4c80, 0x8c41, + 0x4400, 0x84c1, 0x8581, 0x4540, 0x8701, 0x47c0, 0x4680, 0x8641, + 0x8201, 0x42c0, 0x4380, 0x8341, 0x4100, 0x81c1, 0x8081, 0x4040 +}; + + +uint16_t +crc16(uint16_t crc, const void *buf, size_t size) +{ + const uint8_t *p; + + p = buf; + + while (size--) + crc = crc16_tab[(crc ^ (*p++)) & 0xFF] ^ (crc >> 8); + + return crc; +} diff --git a/bsd/libkern/libkern.h b/bsd/libkern/libkern.h index 651a01390..4e6606007 100644 --- a/bsd/libkern/libkern.h +++ b/bsd/libkern/libkern.h @@ -73,6 +73,7 @@ #include #include /* for platform-specific va_list */ #include +#include #include #include #include @@ -168,6 +169,7 @@ int _consume_printf_args(int, ...); #endif #endif +uint16_t crc16(uint16_t crc, const void *bufp, size_t len); uint32_t crc32(uint32_t crc, const void *bufp, size_t len); int copystr(const void *kfaddr, void *kdaddr, size_t len, size_t *done); @@ -175,12 +177,21 @@ int copyinstr(const user_addr_t uaddr, void *kaddr, size_t len, size_t *done); int copyoutstr(const void *kaddr, user_addr_t udaddr, size_t len, size_t *done); int copyin(const user_addr_t uaddr, void *kaddr, size_t len); int copyout(const void *kaddr, user_addr_t udaddr, size_t len); +#if XNU_KERNEL_PRIVATE +extern int copyin_word(const user_addr_t user_addr, uint64_t *kernel_addr, vm_size_t nbytes); +#endif int vsscanf(const char *, char const *, va_list); extern int vprintf(const char *, va_list); extern int vsnprintf(char *, size_t, const char *, va_list); +#if XNU_KERNEL_PRIVATE +extern int vprintf_log_locked(const char *, va_list); +extern void osobject_retain(void * object); +extern void osobject_release(void * object); +#endif + /* vsprintf() is being deprecated. Please use vsnprintf() instead. */ extern int vsprintf(char *bufp, const char *, va_list) __deprecated; @@ -195,18 +206,13 @@ extern void invalidate_icache64(addr64_t, unsigned, int); extern void flush_dcache64(addr64_t, unsigned, int); -/* - * assembly versions of clz... 
ideally we would just call - * __builtin_clz(num), unfortunately this one is ill defined - * by gcc for num=0 - */ -static __inline__ unsigned int +static inline int clz(unsigned int num) { - return num?__builtin_clz(num):__builtin_clz(0); + // On Intel, clz(0) is undefined + return num ? __builtin_clz(num) : sizeof(num) * CHAR_BIT; } - __END_DECLS #endif /* _LIBKERN_LIBKERN_H_ */ diff --git a/bsd/libkern/skpc.c b/bsd/libkern/skpc.c index 78d886bb1..cfe1de92d 100644 --- a/bsd/libkern/skpc.c +++ b/bsd/libkern/skpc.c @@ -65,7 +65,7 @@ int skpc(int mask0, int size, char *cp0) { - register u_char *cp, *end, mask; + u_char *cp, *end, mask; mask = mask0; cp = (u_char *)cp0; diff --git a/bsd/libkern/strsep.c b/bsd/libkern/strsep.c index 75ef7cf1b..6e0942254 100644 --- a/bsd/libkern/strsep.c +++ b/bsd/libkern/strsep.c @@ -1,3 +1,31 @@ +/* + * Copyright (c) 2016 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + /*- * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. @@ -48,9 +76,7 @@ static char sccsid[] = "@(#)strsep.c 8.1 (Berkeley) 6/4/93"; * If *stringp is NULL, strsep returns NULL. */ char * -strsep(stringp, delim) - char **stringp; - const char *delim; +strsep(char **stringp, const char *delim) { char *s; const char *spanp; diff --git a/bsd/machine/Makefile b/bsd/machine/Makefile index 556aeb506..5b190e442 100644 --- a/bsd/machine/Makefile +++ b/bsd/machine/Makefile @@ -3,11 +3,9 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - include $(MakeInc_cmd) include $(MakeInc_def) - DATAFILES = \ byte_order.h endian.h fasttrap_isa.h \ limits.h param.h profile.h \ @@ -26,7 +24,6 @@ KERNELFILES = \ vmparam.h _types.h _limits.h _param.h \ _mcontext.h - INSTALL_MI_LIST = ${DATAFILES} INSTALL_MI_LCL_LIST = ${PRIVATE_DATAFILES} @@ -38,5 +35,3 @@ EXPORT_MI_DIR = machine include $(MakeInc_rule) include $(MakeInc_dir) - - diff --git a/bsd/man/Makefile b/bsd/man/Makefile index 45da1d7e2..34575d2e8 100644 --- a/bsd/man/Makefile +++ b/bsd/man/Makefile @@ -3,7 +3,6 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - include $(MakeInc_cmd) include $(MakeInc_def) @@ -12,9 +11,8 @@ INSTTEXTFILES_SUBDIRS = \ man3 
\ man4 \ man5 \ - man8 \ + man7 \ man9 include $(MakeInc_rule) include $(MakeInc_dir) - diff --git a/bsd/man/man2/Makefile b/bsd/man/man2/Makefile index 047f85de3..53d932f6d 100644 --- a/bsd/man/man2/Makefile +++ b/bsd/man/man2/Makefile @@ -33,6 +33,8 @@ DATAFILES = \ chmod.2 \ chown.2 \ chroot.2 \ + clonefile.2 \ + clonefileat.2 \ close.2 \ connect.2 \ connectx.2 \ @@ -48,6 +50,7 @@ DATAFILES = \ fchmodat.2 \ fchown.2 \ fchownat.2 \ + fclonefileat.2 \ fcntl.2 \ fgetattrlist.2 \ fsetattrlist.2 \ @@ -77,6 +80,7 @@ DATAFILES = \ getdirentriesattr.2 \ getdtablesize.2 \ getegid.2 \ + getentropy.2 \ geteuid.2 \ getfh.2 \ getfsstat.2 \ @@ -136,7 +140,7 @@ DATAFILES = \ openat.2 \ pathconf.2 \ pipe.2 \ - poll.2 \ + poll.2 \ posix_madvise.2 \ pread.2 \ profil.2 \ @@ -155,6 +159,8 @@ DATAFILES = \ recvmsg.2 \ rename.2 \ renameat.2 \ + renamex_np.2 \ + renameatx_np.2 \ removexattr.2 \ revoke.2 \ rmdir.2 \ @@ -215,7 +221,6 @@ DATAFILES = \ symlink.2 \ symlinkat.2 \ sync.2 \ - syscall.2 \ truncate.2 \ umask.2 \ undelete.2 \ @@ -242,4 +247,3 @@ INSTALL_MAN_DIR = man2 include $(MakeInc_rule) include $(MakeInc_dir) - diff --git a/bsd/man/man2/access.2 b/bsd/man/man2/access.2 index 2c89f7a11..c4b2f92f6 100644 --- a/bsd/man/man2/access.2 +++ b/bsd/man/man2/access.2 @@ -1,5 +1,3 @@ -.\" $NetBSD: access.2,v 1.7 1995/02/27 12:31:44 cgd Exp $ -.\" .\" Copyright (c) 1980, 1991, 1993 .\" The Regents of the University of California. All rights reserved. .\" @@ -11,10 +9,6 @@ .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. -.\" 3. All advertising materials mentioning features or use of this software -.\" must display the following acknowledgement: -.\" This product includes software developed by the University of -.\" California, Berkeley and its contributors. .\" 4. 
Neither the name of the University nor the names of its contributors .\" may be used to endorse or promote products derived from this software .\" without specific prior written permission. @@ -32,51 +26,57 @@ .\" SUCH DAMAGE. .\" .\" @(#)access.2 8.2 (Berkeley) 4/1/94 +.\" $FreeBSD$ .\" -.Dd April 1, 1994 +.Dd September 15, 2014 .Dt ACCESS 2 -.Os BSD 4 +.Os .Sh NAME .Nm access , .Nm faccessat -.Nd check access permissions of a file or pathname +.Nd check accessibility of a file .Sh SYNOPSIS -.Fd #include +.In unistd.h .Ft int -.Fo access -.Fa "const char *path" -.Fa "int amode" -.Fc +.Fn access "const char *path" "int mode" .Ft int .Fn faccessat "int fd" "const char *path" "int mode" "int flag" .Sh DESCRIPTION The .Fn access -function checks the accessibility of the +system call checks the accessibility of the file named by +the .Fa path +argument for the access permissions indicated by -.Fa amode . +the +.Fa mode +argument. The value of -.Fa amode -is the bitwise inclusive OR of the access permissions to be +.Fa mode +is either the bitwise-inclusive OR of the access permissions to be checked -.Pf ( Dv R_OK +.Dv ( R_OK for read permission, .Dv W_OK -for write permission and +for write permission, and .Dv X_OK -for execute/search permission) or the existence test, -.Dv F_OK . -All components of the pathname -.Fa path -are checked for access permissions (including -.Dv F_OK ) . +for execute/search permission), +or the existence test +.Pq Dv F_OK . +.Pp +For additional information, see the +.Sx "File Access Permission" +section of +.Xr intro 2 . .Pp -The real user ID is used in place of the effective user ID -and the real group access list -(including the real group ID) are -used in place of the effective ID for verifying permission. +The +.Fn access +system call uses +the real user ID in place of the effective user ID, +the real group ID in place of the effective group ID, +and the rest of the group access list. 
.Pp The .Fn faccessat @@ -118,62 +118,45 @@ Likewise for and .Dv W_OK . .Sh RETURN VALUES -If -.Fa path -cannot be found -or if any of the desired access modes would not be granted, -then a -1 value is returned and the global integer variable -.Va errno -is set to indicate the error. -Otherwise, a 0 value is returned. +.Rv -std .Sh ERRORS -Access to the file is denied if: +.Fn access +or +.Fn faccessat +will fail if: .Bl -tag -width Er -.\" ========== -.It Bq Er EACCES -Permission bits of the file mode do not permit the requested access, -or search permission is denied on a component of the path prefix. -.Pp -The owner of a file has permission checked -with respect to the ``owner'' read, write, and execute mode bits, -members of the file's group other than the owner have permission checked -with respect to the ``group'' mode bits, -and all others have permissions checked -with respect to the ``other'' mode bits. -.\" -.\" ========== -.It Bq Er EFAULT -.Fa Path -points outside the process's allocated address space. .It Bq Er EINVAL -An invalid value was specified for -.Ar amode . -.\" ========== -.It Bq Er EIO -An I/O error occurred while reading from or writing to the file system. -.\" ========== -.It Bq Er ELOOP -Too many symbolic links were encountered in translating the pathname. -.\" ========== +The value of the +.Fa mode +argument is invalid. +.It Bq Er ENOTDIR +A component of the path prefix is not a directory. .It Bq Er ENAMETOOLONG A component of a pathname exceeded .Dv {NAME_MAX} characters, or an entire path name exceeded .Dv {PATH_MAX} characters. -.\" ========== .It Bq Er ENOENT The named file does not exist. -.\" ========== -.It Bq Er ENOTDIR -A component of the path prefix is not a directory. -.\" ========== +.It Bq Er ELOOP +Too many symbolic links were encountered in translating the pathname. .It Bq Er EROFS Write access is requested for a file on a read-only file system. 
-.\" ========== .It Bq Er ETXTBSY Write access is requested for a pure procedure (shared text) -file that is presently being executed. +file presently being executed. +.It Bq Er EACCES +Permission bits of the file mode do not permit the requested +access, or search permission is denied on a component of the +path prefix. +.It Bq Er EFAULT +The +.Fa path +argument +points outside the process's allocated address space. +.It Bq Er EIO +An I/O error occurred while reading from or writing to the file system. .El .Pp Also, the @@ -204,16 +187,38 @@ nor a file descriptor associated with a directory. .El .Sh SEE ALSO .Xr chmod 2 , +.Xr intro 2 , .Xr stat 2 .Sh STANDARDS The .Fn access -function conforms to +system call is expected to conform to .St -p1003.1-90 . The .Fn faccessat system call is expected to conform to POSIX.1-2008 . -.Sh CAVEAT +.Sh HISTORY +The +.Fn access +function appeared in +.At v7 . +.Sh SECURITY CONSIDERATIONS +The result of +.Fn access +should not be used to make an actual access control decision, since its +response, even if correct at the moment it is formed, may be outdated at the +time you act on it. +.Fn access +results should only be used to pre-flight, such as when configuring user +interface elements or for optimization purposes. The actual access control +decision should be made by attempting to execute the relevant system call while +holding the applicable credentials, and properly handling any resulting errors; +and this must be done even though +.Fn access +may have predicted success. +.Pp +Additionally, set-user-ID and set-group-ID applications should restore the +effective user or group ID, +and perform actions directly rather than use .Fn access -is a potential security hole and -should never be used. +to simulate access checks for the real user or group ID. 
diff --git a/bsd/man/man2/adjtime.2 b/bsd/man/man2/adjtime.2 index 845d88254..51f4b18a3 100644 --- a/bsd/man/man2/adjtime.2 +++ b/bsd/man/man2/adjtime.2 @@ -44,7 +44,7 @@ .Ft int .Fn adjtime "const struct timeval *delta" "struct timeval *olddelta" .Sh DESCRIPTION -.Fn Adjtime +.Fn adjtime makes small adjustments to the system time, as returned by .Xr gettimeofday 2 , advancing or retarding it @@ -88,7 +88,7 @@ A return value of -1 indicates that an error occurred, and in this case an error code is stored in the global variable .Va errno . .Sh ERRORS -.Fn Adjtime +.Fn adjtime will fail if: .Bl -tag -width Er .It Bq Er EFAULT diff --git a/bsd/man/man2/chdir.2 b/bsd/man/man2/chdir.2 index 3d688be23..858b22d4b 100644 --- a/bsd/man/man2/chdir.2 +++ b/bsd/man/man2/chdir.2 @@ -83,7 +83,7 @@ is set to indicate the error. .Sh ERRORS The -.Fn Chdir +.Fn chdir system call will fail and the current working directory will be unchanged if one or more of the following are true: @@ -118,7 +118,7 @@ The named directory does not exist. A component of the path prefix is not a directory. .El .Pp -.Fn Fchdir +.Fn fchdir will fail and the current working directory will be unchanged if one or more of the following are true: .Bl -tag -width Er @@ -133,7 +133,7 @@ The argument is not a valid file descriptor. .\" ========== .It Bq Er EINTR -.Fn Fchdir +.Fn fchdir was interrupted by a signal. .\" ========== .It Bq Er EIO diff --git a/bsd/man/man2/chflags.2 b/bsd/man/man2/chflags.2 index 8df343407..973a27817 100644 --- a/bsd/man/man2/chflags.2 +++ b/bsd/man/man2/chflags.2 @@ -104,7 +104,7 @@ Otherwise, -1 is returned and the global variable .Va errno is set to indicate the error. .Sh ERRORS -.Fn Chflags +.Fn chflags will fail if: .Bl -tag -width Er .It Bq Er ENOTDIR @@ -137,7 +137,7 @@ error occurred while reading from or writing to the file system. The operation isn't supported by the filesystem. 
.El .Pp -.Fn Fchflags +.Fn fchflags will fail if: .Bl -tag -width Er .It Bq Er EBADF diff --git a/bsd/man/man2/chmod.2 b/bsd/man/man2/chmod.2 index b49c28075..52e1a7af5 100644 --- a/bsd/man/man2/chmod.2 +++ b/bsd/man/man2/chmod.2 @@ -155,7 +155,7 @@ files of other users in that directory. The sticky bit may be set by any user on a directory which the user owns or has appropriate permissions. For more details of the properties of the sticky bit, see -.Xr sticky 8 . +.Xr sticky 7 . .Pp Writing or changing the owner of a file turns off the set-user-id and set-group-id bits @@ -283,7 +283,7 @@ is necessary. .Xr open 2 , .Xr stat 2 , .Xr compat 5 , -.Xr sticky 8 +.Xr sticky 7 .Sh STANDARDS The .Fn chmod diff --git a/bsd/man/man2/chown.2 b/bsd/man/man2/chown.2 index 6734cf17d..8d867fa41 100644 --- a/bsd/man/man2/chown.2 +++ b/bsd/man/man2/chown.2 @@ -82,12 +82,7 @@ capability is restricted to the super-user. .Pp The .Fn chown -system call -clears the set-user-id and set-group-id bits -on the file -to prevent accidental or mischievous creation of -set-user-id and set-group-id programs if not executed -by the super-user. +system call clears the set-user-id and set-group-id bits on the file. The .Fn chown system call diff --git a/bsd/man/man2/chroot.2 b/bsd/man/man2/chroot.2 index 2bdb61d11..001185360 100644 --- a/bsd/man/man2/chroot.2 +++ b/bsd/man/man2/chroot.2 @@ -46,7 +46,7 @@ .Sh DESCRIPTION .Fa Dirname is the address of the pathname of a directory, terminated by an ASCII NUL. -.Fn Chroot +.Fn chroot causes .Fa dirname to become the root directory, @@ -74,7 +74,7 @@ a value of -1 is returned and .Va errno is set to indicate an error. 
.Sh ERRORS -.Fn Chroot +.Fn chroot will fail and the root directory will be unchanged if: .Bl -tag -width Er .It Bq Er ENOTDIR diff --git a/bsd/man/man2/clonefile.2 b/bsd/man/man2/clonefile.2 new file mode 100644 index 000000000..09558f405 --- /dev/null +++ b/bsd/man/man2/clonefile.2 @@ -0,0 +1,272 @@ +.\" Copyright (c) 2015 Apple Computer, Inc. All rights reserved. +.\" +.\" The contents of this file constitute Original Code as defined in and +.\" are subject to the Apple Public Source License Version 1.1 (the +.\" "License"). You may not use this file except in compliance with the +.\" License. Please obtain a copy of the License at +.\" http://www.apple.com/publicsource and read it before using this file. +.\" +.\" This Original Code and all software distributed under the License are +.\" distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER +.\" EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, +.\" INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, +.\" FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the +.\" License for the specific language governing rights and limitations +.\" under the License. +.\" +.\" @(#)clonefile.2 +. +.Dd December 04, 2015 +.Dt CLONEFILE 2 +.Os Darwin +.Sh NAME +.Nm clonefile +.Nd create copy on write clones of files +.Sh SYNOPSIS +.Fd #include +.Fd #include +.Pp +.Ft int +.Fn clonefile "const char * src" "const char * dst" "int flags" +. +.Fn clonefileat "int src_dirfd" "const char * src" "int dst_dirfd" "const char * dst" "int flags" +. +.Fn fclonefileat "int srcfd" "int dst_dirfd" "const char * dst" "int flags" +. +.Sh DESCRIPTION +The +.Fn clonefile +function causes the named file +.Fa src +to be cloned to the named file +.Fa dst . +The cloned file +.Fa dst +shares its data blocks with the +.Fa src +file but has its own copy of attributes, extended attributes and ACL's which are identical to +those of the named file +.Fa src +with the exceptions listed below +.Pp +. 
+.Bl -enum +. +.It +ownership information and timestamps are set as they would be if +.Fa dst +was created by +.Xr openat 2 +. +.It +setuid and setgid bits are turned off in the mode bits for regular files. +.El +.Pp +Subsequent writes to either the original or cloned file are private to the file being modified (copy-on-write). +The named file +.Fa dst +must not exist for the call to be successful. Since the clonefile() system call might not +allocate new storage for data blocks, it is possible for a subsequent overwrite of an existing data block to +return ENOSPC. If +.Fa src +names a directory, the directory hierarchy is cloned as if each item was cloned individually. However, the use of +.Xr copyfile 3 +is more appropriate for copying large directory hierarchies instead of +.Xr clonefile 2 +.Pp +The +.Fn clonefileat +function is equivalent to +.Fn clonefile +except in the case where either +.Fa src +or +.Fa dst +specifies a relative path. If src is a relative path, the file to be cloned is located relative to the directory associated +with the file descriptor +.Fa src_dirfd +instead of the current working directory. If +.Fa dst +is a relative path, the same happens only relative to the directory associated with +.Fa dst_dirfd . +If +.Fn clonefileat +is passed the special value +.Dv AT_FDCWD +in either the +.Fa src_dirfd +or +.Fa dst_dirfd +parameters, the current working directory is used in the determination of the file for +the respective path parameter. +.Pp +The +.Fn fclonefileat +function is similar to +.Fn clonefileat +except that the source is identified by file descriptor +.Fa srcfd +rather than a path (as in +.Fn clonefile +or +.Fn clonefileat ) +.Pp +The +.Fa flags +parameter specifies the options that can be passed. Options are specified in the +.Fa flags +argument by or'ing the following values: +. +.Bl -tag -width CLONE_NOFOLLOW +. 
+.It CLONE_NOFOLLOW +Don't follow the src file if it is a symbolic link (applicable only if the source is not a directory). +The symbolic link is itself cloned if +.Fa src +names a symbolic link. +. +.El +.Pp +The +.Fn clonefile , +.Fn clonefileat +and +.Fn fclonefileat +functions are expected to be atomic i.e. the system call will result in all new objects being created +successfully or no new objects will be created. POSIX conforming applications cannot use +.Fn clonefile . +. +.Sh RETURN VALUES +Upon successful completion, +.Fn clonefile +returns 0. Otherwise, a value of -1 is returned and errno is set to indicate the error. +.Pp +.Sh COMPATIBILITY +Not all volumes support +.Fn clonefile . +A volume can be tested for +.Fn clonefile +support by using +.Xr getattrlist 2 +to get the volume capabilities attribute ATTR_VOL_CAPABILITIES, and then testing the VOL_CAP_INT_CLONE flag. +.Pp +.Sh ERRORS +The +.Fn clonefile +function will fail if: +.Bl -tag -width Er +. +.It Bq Er EACCES +Read permissions are denied on the source or write permissions are denied on the destination parent. +. +.It Bq Er ENOTSUP +The underlying filesystem does not support this call. +. +.It Bq Er EEXIST +The named file +.Fa dst +exists. +. +.It Bq Er EXDEV +.Fa src +and +.Fa dst +are not on the same filesystem. +. +.It Bq Er EINVAL +The value of the +.Fa flags +parameter is invalid. +. +.It Bq Er ENOSPC +There is no free space remaining on the file system containing the file. +. +.It Bq Er EIO +An I/O error occurred while reading from or writing to the file system. +. +.It Bq Er EPERM +The calling process does not have appropriate privileges. +. +.It Bq Er EPERM +.Fa src +is the root of the Filesystem. +. +.It Bq Er ELOOP +A loop exists in symbolic links encountered during resolution +of the +.Fa src +or +.Fa dst +path arguments. +. +.It Bq Er EROFS +The requested operation requires writing in a directory on a read-only file system. +. 
+.It Bq Er ENAMETOOLONG +The length of a component of a pathname is longer than {NAME_MAX}. +. +.It Bq Er ENOENT +A component of path +.Fa src +or the path +.Fa dst +does not name an existing file or path is an empty string. +. +.It Bq Er ENOTDIR +A component of path prefix of either +.Fa src +or +.Fa dst +names an existing file that is neither a directory nor a symbolic link to a directory, +or the path argument contains at least one non-slash character and ends with one or +more trailing slash characters and the last pathname component names an existing file that +is neither a directory nor a symbolic link to a directory. +.El +.Pp +In addition, the +.Fn clonefileat +or +.Fn fclonefileat +functions may fail with the following errors +.Bl -tag -width Er +.It Bq Er EBADF +The +.Fa src +or +.Fa dst +argument does not specify an absolute path and the +.Fa src_dirfd +or +.Fa dst_dirfd +argument is neither +.Dv AT_FDCWD +nor a valid file descriptor open for searching. +. +.It Bq Er ENOTDIR +The +.Fa src +or +.Fa dst +argument is not an absolute path and +.Fa src_dirfd +or +.Fa dst_dirfd +is neither +.Dv AT_FDCWD +nor a file descriptor associated with a directory. +.El +. +.Pp +. +.Sh SEE ALSO +. +.Xr copyfile 3 +. +.Sh HISTORY +The +.Fn clonefile , +.Fn clonefileat +and +.Fn fclonefileat +function calls appeared in OS X version 10.12 +. diff --git a/bsd/man/man2/clonefileat.2 b/bsd/man/man2/clonefileat.2 new file mode 100644 index 000000000..5e5a0d811 --- /dev/null +++ b/bsd/man/man2/clonefileat.2 @@ -0,0 +1 @@ +.so man2/clonefile.2 diff --git a/bsd/man/man2/close.2 b/bsd/man/man2/close.2 index 0aa08b400..078965e9a 100644 --- a/bsd/man/man2/close.2 +++ b/bsd/man/man2/close.2 @@ -127,6 +127,6 @@ encountered an input/output error. .Xr socket 2 , .Xr socketpair 2 .Sh STANDARDS -.Fn Close +.Fn close conforms to .St -p1003.1-88 . 
diff --git a/bsd/man/man2/dup.2 b/bsd/man/man2/dup.2 index 897966a52..ec0f84d32 100644 --- a/bsd/man/man2/dup.2 +++ b/bsd/man/man2/dup.2 @@ -52,7 +52,7 @@ .Fa "int fildes2" .Fc .Sh DESCRIPTION -.Fn Dup +.Fn dup duplicates an existing object descriptor and returns its value to the calling process .Fa ( fildes2 @@ -157,7 +157,7 @@ is negative or greater than the maximum allowable number (see getdtablesize(2)). .Xr socket 2 , .Xr socketpair 2 .Sh STANDARDS -.Fn Dup +.Fn dup and .Fn dup2 are expected to conform to diff --git a/bsd/man/man2/execve.2 b/bsd/man/man2/execve.2 index 6f7e4f336..94f939a17 100644 --- a/bsd/man/man2/execve.2 +++ b/bsd/man/man2/execve.2 @@ -48,7 +48,7 @@ .Fa "char *const envp[]" .Fc .Sh DESCRIPTION -.Fn Execve +.Fn execve transforms the calling process into a new process. The new process is constructed from an ordinary file, whose name is pointed to by @@ -196,7 +196,7 @@ the return value will be -1 and the global variable .Va errno is set to indicate the error. .Sh ERRORS -.Fn Execve +.Fn execve will fail and return to the calling process if: .Bl -tag -width Er .\" ========== diff --git a/bsd/man/man2/fclonefileat.2 b/bsd/man/man2/fclonefileat.2 new file mode 100644 index 000000000..5e5a0d811 --- /dev/null +++ b/bsd/man/man2/fclonefileat.2 @@ -0,0 +1 @@ +.so man2/clonefile.2 diff --git a/bsd/man/man2/fcntl.2 b/bsd/man/man2/fcntl.2 index ce2daefe5..828c3e191 100644 --- a/bsd/man/man2/fcntl.2 +++ b/bsd/man/man2/fcntl.2 @@ -71,7 +71,7 @@ .Fa "..." .Fc .Sh DESCRIPTION -.Fn Fcntl +.Fn fcntl provides for control over descriptors. 
The argument .Fa fildes diff --git a/bsd/man/man2/flock.2 b/bsd/man/man2/flock.2 index e72823d8d..5b0b60a22 100644 --- a/bsd/man/man2/flock.2 +++ b/bsd/man/man2/flock.2 @@ -48,7 +48,7 @@ .Ft int .Fn flock "int fd" "int operation" .Sh DESCRIPTION -.Fn Flock +.Fn flock applies or removes an .Em advisory lock on the file associated with the file descriptor diff --git a/bsd/man/man2/fork.2 b/bsd/man/man2/fork.2 index 52aea6389..b5bb420e0 100644 --- a/bsd/man/man2/fork.2 +++ b/bsd/man/man2/fork.2 @@ -46,7 +46,7 @@ .Fa void .Fc .Sh DESCRIPTION -.Fn Fork +.Fn fork causes creation of a new process. The new process (child process) is an exact copy of the calling process (parent process) except for the following: @@ -85,7 +85,7 @@ no child process is created, and the global variable .Va errno is set to indicate the error. .Sh ERRORS -.Fn Fork +.Fn fork will fail and no child process will be created if: .Bl -tag -width Er .\" ========== diff --git a/bsd/man/man2/fsync.2 b/bsd/man/man2/fsync.2 index ad72f78de..c772a497e 100644 --- a/bsd/man/man2/fsync.2 +++ b/bsd/man/man2/fsync.2 @@ -46,7 +46,7 @@ .Fa "int fildes" .Fc .Sh DESCRIPTION -.Fn Fsync +.Fn fsync causes all modified data and attributes of .Fa fildes to be moved to a permanent storage device. diff --git a/bsd/man/man2/getattrlist.2 b/bsd/man/man2/getattrlist.2 index 124e695c9..9f24d9c30 100644 --- a/bsd/man/man2/getattrlist.2 +++ b/bsd/man/man2/getattrlist.2 @@ -433,6 +433,8 @@ An structure that uniquely identifies the file system object within a mounted volume for the duration of it's mount; this identifier is not guaranteed to be persistent for the volume and may change every time the volume is mounted. +If the VOL_CAP_FMT_64BIT_OBJECT_IDS capability is set, this is instead a 64-bit +object identifier. .Pp On HFS+ volumes, the ATTR_CMN_OBJID of a file system object is distinct from the ATTR_CMN_OBJID of any hard link to that file system object. 
Although the @@ -446,6 +448,8 @@ An structure that uniquely and persistently identifies the file system object within its volume; persistence implies that this attribute is unaffected by mount/unmount operations on the volume. +If the VOL_CAP_FMT_64BIT_OBJECT_IDS capability is set, this is instead a 64-bit +object identifier. .Pp Some file systems can not return this attribute when the volume is mounted read-only and will fail the request with error @@ -461,6 +465,8 @@ structure that uniquely identifies the parent directory of the file system object within a mounted volume, for the duration of the volume mount; this identifier is not guaranteed to be persistent for the volume and may change every time the volume is mounted. +If the VOL_CAP_FMT_64BIT_OBJECT_IDS capability is set, this is instead a 64-bit +object identifier. .Pp . If a file system object is hard linked from multiple directories, the parent @@ -499,7 +505,7 @@ structure returned by .Xr stat 2 . . .It ATTR_CMN_CHGTIME -(read/write) A +A .Vt timespec structure containing the time that the file system object's attributes were last modified. @@ -1082,19 +1088,26 @@ The following fork attributes are defined. .Bl -tag -width ATTR_VOL_ALLOCATIONCLUMP . .It ATTR_FORK_TOTALSIZE +Deprecated. An .Vt off_t containing the length of the fork in bytes (the logical size). . .It ATTR_FORK_ALLOCSIZE +Deprecated. An .Vt off_t containing a count of the bytes on disk used by the fork (the physical size). . +.It ATTR_FORK_RESERVED +Reserved. +You must set this to 0. +. .El .Pp . -Fork attributes are not properly implemented by any current Mac OS X +Fork attributes are deprecated and all bits are reserved. +They are not properly implemented by any current Mac OS X volume format implementation. We strongly recommend that client programs do not request fork attributes. If you are implementing a volume format, you should not support these attributes. @@ -1327,10 +1340,10 @@ Introduced with Darwin 10.0 (Mac OS X version 10.6). 
. .It VOL_CAP_FMT_64BIT_OBJECT_IDS If this bit is set, the volume format uses object IDs that are 64-bit. -This means that ATTR_CMN_FILEID and ATTR_CMN_PARENTID are the only -legitimate attributes for obtaining object IDs from this volume and the -32-bit fid_objno fields of the fsobj_id_t returned by ATTR_CMN_OBJID, -ATTR_CMN_OBJPERMANENTID, and ATTR_CMN_PAROBJID are undefined. +This means that ATTR_CMN_FILEID and ATTR_CMN_PARENTID are the primary means of +obtaining object IDs from this volume. The values returned by ATTR_CMN_OBJID, +ATTR_CMN_OBJPERMANENTID, and ATTR_CMN_PAROBJID can be interpreted as 64-bit +object IDs instead of fsobj_id_t. . .El .Pp @@ -1440,12 +1453,24 @@ AFP-style mandatory byte range locks via .It VOL_CAP_INT_EXTENDED_ATTR If this bit is set, the volume format implementation supports native extended attributes (see -.Xr setxattr 2 ). +.Xr setxattr 2 Ns ). . .It VOL_CAP_INT_NAMEDSTREAMS If this bit is set, the volume format implementation supports native named streams. . +.It VOL_CAP_INT_RENAME_SWAP +If this bit is set, the file system supports swapping file system +objects. See +.Xr rename 2 +for more details. +. +.It VOL_CAP_INT_RENAME_EXCL +If this bit is set, the file system supports an exclusive rename +operation. See +.Xr rename 2 +for more details. +. .El .Pp . diff --git a/bsd/man/man2/getdirentries.2 b/bsd/man/man2/getdirentries.2 index a513ea8e8..7e04d0df5 100644 --- a/bsd/man/man2/getdirentries.2 +++ b/bsd/man/man2/getdirentries.2 @@ -46,7 +46,7 @@ .Ft int .Fn getdirentries "int fd" "char *buf" "int nbytes" "long *basep" .Sh DESCRIPTION -.Fn Getdirentries +.Fn getdirentries reads directory entries from the directory referenced by the file descriptor .Fa fd @@ -129,7 +129,7 @@ The pointer may not advance by the number of bytes returned by A value of zero is returned when the end of the directory has been reached. 
.Pp -.Fn Getdirentries +.Fn getdirentries writes the position of the block read into the location pointed to by .Fa basep . Alternatively, the current position pointer may be set and retrieved by @@ -171,7 +171,7 @@ Otherwise, -1 is returned and the global variable .Va errno is set to indicate the error. .Sh ERRORS -.Fn Getdirentries +.Fn getdirentries will fail if: .Bl -tag -width Er .It Bq Er EBADF diff --git a/bsd/man/man2/getentropy.2 b/bsd/man/man2/getentropy.2 new file mode 100644 index 000000000..860e942ba --- /dev/null +++ b/bsd/man/man2/getentropy.2 @@ -0,0 +1,87 @@ +.\" Copyright (c) 2014 Theo de Raadt +.\" Copyright (c) 2015 Apple Inc. All rights reserved. +.\" +.\" Permission to use, copy, modify, and distribute this software for any +.\" purpose with or without fee is hereby granted, provided that the above +.\" copyright notice and this permission notice appear in all copies. +.\" +.\" THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +.\" WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +.\" MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +.\" ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +.\" WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +.\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +.\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +.\" +.Dd October 2 2015 +.Dt GETENTROPY 2 +.Os +.Sh NAME +.Nm getentropy +.Nd get entropy +.Sh SYNOPSIS +.In sys/random.h +.Ft int +.Fn getentropy "void *buf" "size_t buflen" +.Sh DESCRIPTION +.Fn getentropy +fills a buffer with random data, which can be used +as input for process-context pseudorandom generators like +.Xr arc4random 3 . +.Pp +The maximum buffer size permitted is 256 bytes. +If +.Fa buflen +exceeds this, an error of +.Er EIO +will be indicated. 
+.Pp +.Fn getentropy +should be used as a replacement for +.Xr random 4 +when random data derived directly from the kernel random byte generator is required. +Unlike the +.Xr random 4 +pseudo-devices, it is not vulnerable to file descriptor exhaustion attacks +and is available when sandboxed or in a chroot, making it more reliable for security-critical applications. +.Pp +However, it should be noted that +.Fn getentropy +is primarily intended for use in the construction and seeding of userspace PRNGs like +.Xr arc4random 3 +or +.Xr CC_crypto 3 . +Clients who simply require random data should use +.Xr arc4random 3 , +.Fn CCRandomGenerateBytes +from +.Xr CC_crypto 3 , +or +.Fn SecRandomCopyBytes +from the Security framework instead of +.Fn getentropy +or +.Xr random 4 . +.Sh RETURN VALUES +.Rv -std +.Sh ERRORS +.Fn getentropy +will succeed unless: +.Bl -tag -width Er +.It Bq Er EINVAL +The +.Fa buf +parameter points to an +invalid address. +.It Bq Er EIO +Too many bytes requested, or some other fatal error occurred. +.El +.Sh SEE ALSO +.Xr arc4random 3 , +.Xr CC_crypto 3 , +.Xr random 4 +.Sh HISTORY +The +.Fn getentropy +function appeared in +OS X 10.12. diff --git a/bsd/man/man2/getfh.2 b/bsd/man/man2/getfh.2 index c5152fbdd..e9ba4b605 100644 --- a/bsd/man/man2/getfh.2 +++ b/bsd/man/man2/getfh.2 @@ -67,7 +67,7 @@ .Ft int .Fn getfh "const char *path" "fhandle_t *fhp" .Sh DESCRIPTION -.Fn Getfh +.Fn getfh returns a file handle for the specified file or directory in the file handle pointed to by .Fa fhp . @@ -81,7 +81,7 @@ is returned and the global variable .Va errno is set to indicate the error. .Sh ERRORS -.Fn Getfh +.Fn getfh fails if one or more of the following are true: .Bl -tag -width Er .It Bq ENOTDIR diff --git a/bsd/man/man2/getfsstat.2 b/bsd/man/man2/getfsstat.2 index 99e2abaf6..22c6edef9 100644 --- a/bsd/man/man2/getfsstat.2 +++ b/bsd/man/man2/getfsstat.2 @@ -105,7 +105,7 @@ Otherwise, -1 is returned and the global variable .Va errno is set to indicate the error.
.Sh ERRORS -.Fn Getfsstat +.Fn getfsstat fails if one or more of the following are true: .Bl -tag -width Er .It Bq Er EFAULT diff --git a/bsd/man/man2/getgid.2 b/bsd/man/man2/getgid.2 index 9b8a31834..396f406f5 100644 --- a/bsd/man/man2/getgid.2 +++ b/bsd/man/man2/getgid.2 @@ -85,7 +85,7 @@ is necessary. .Xr setregid 2 , .Xr setgid 3 .Sh STANDARDS -.Fn Getgid +.Fn getgid and .Fn getegid conform to diff --git a/bsd/man/man2/getitimer.2 b/bsd/man/man2/getitimer.2 index fb11f6cc3..031374c42 100644 --- a/bsd/man/man2/getitimer.2 +++ b/bsd/man/man2/getitimer.2 @@ -150,7 +150,7 @@ Otherwise, a value of -1 is returned and the global integer variable .Va errno is set to indicate the error. .Sh ERRORS -.Fn Getitimer +.Fn getitimer and .Fn setitimer will fail if: diff --git a/bsd/man/man2/getlogin.2 b/bsd/man/man2/getlogin.2 index cf7562b74..be2b7e5bf 100644 --- a/bsd/man/man2/getlogin.2 +++ b/bsd/man/man2/getlogin.2 @@ -65,7 +65,7 @@ for example when .Xr su 1 is used.) .Pp -.Fn Setlogin +.Fn setlogin sets the login name of the user associated with the current session to .Fa name . This call is restricted to the super-user, and diff --git a/bsd/man/man2/getpid.2 b/bsd/man/man2/getpid.2 index 102889356..974dbf45d 100644 --- a/bsd/man/man2/getpid.2 +++ b/bsd/man/man2/getpid.2 @@ -51,14 +51,14 @@ .Fa void .Fc .Sh DESCRIPTION -.Fn Getpid +.Fn getpid returns the process ID of the calling process. The ID is guaranteed to be unique and is useful for constructing temporary file names. .Pp -.Fn Getppid +.Fn getppid returns the process ID of the parent of the calling process. .Sh ERRORS @@ -79,7 +79,7 @@ is necessary. .Xr gethostid 2 , .Xr compat 5 .Sh STANDARDS -.Fn Getpid +.Fn getpid and .Fn getppid conform to diff --git a/bsd/man/man2/getpriority.2 b/bsd/man/man2/getpriority.2 index 6e6d89635..71140276c 100644 --- a/bsd/man/man2/getpriority.2 +++ b/bsd/man/man2/getpriority.2 @@ -130,7 +130,7 @@ The call returns 0 if there is no error, or -1 if there is. 
.Sh ERRORS -.Fn Getpriority +.Fn getpriority and .Fn setpriority will fail if: diff --git a/bsd/man/man2/getrusage.2 b/bsd/man/man2/getrusage.2 index c243f97e1..642097593 100644 --- a/bsd/man/man2/getrusage.2 +++ b/bsd/man/man2/getrusage.2 @@ -50,7 +50,7 @@ .Fa "struct rusage *r_usage" .Fc .Sh DESCRIPTION -.Fn Getrusage +.Fn getrusage returns information describing the resources utilized by the current process, or all its terminated child processes. The diff --git a/bsd/man/man2/getsockopt.2 b/bsd/man/man2/getsockopt.2 index 0ec25a000..f94a06216 100644 --- a/bsd/man/man2/getsockopt.2 +++ b/bsd/man/man2/getsockopt.2 @@ -59,7 +59,7 @@ .Fa "socklen_t option_len" .Fc .Sh DESCRIPTION -.Fn Getsockopt +.Fn getsockopt and .Fn setsockopt manipulate the diff --git a/bsd/man/man2/getuid.2 b/bsd/man/man2/getuid.2 index e8be23ce2..ffe34d0ef 100644 --- a/bsd/man/man2/getuid.2 +++ b/bsd/man/man2/getuid.2 @@ -86,7 +86,7 @@ is necessary for both functions. .Xr setreuid 2 , .Xr compat 5 .Sh STANDARDS -.Fn Geteuid +.Fn geteuid and .Fn getuid functions conform to diff --git a/bsd/man/man2/ioctl.2 b/bsd/man/man2/ioctl.2 index 9d6343bf2..8830e3d6a 100644 --- a/bsd/man/man2/ioctl.2 +++ b/bsd/man/man2/ioctl.2 @@ -79,7 +79,7 @@ If an error has occurred, a value of -1 is returned and .Va errno is set to indicate the error. .Sh ERRORS -.Fn Ioctl +.Fn ioctl will fail if: .Bl -tag -width Er .\" ========== diff --git a/bsd/man/man2/kill.2 b/bsd/man/man2/kill.2 index 92b09e4c2..5d3bd6f64 100644 --- a/bsd/man/man2/kill.2 +++ b/bsd/man/man2/kill.2 @@ -106,7 +106,7 @@ Otherwise, a value of -1 is returned and .Va errno is set to indicate the error. .Sh ERRORS -.Fn Kill +.Fn kill will fail and no signal will be sent if: .Bl -tag -width Er .\" ========== diff --git a/bsd/man/man2/kqueue.2 b/bsd/man/man2/kqueue.2 index 5e8702457..6f1ce36ef 100644 --- a/bsd/man/man2/kqueue.2 +++ b/bsd/man/man2/kqueue.2 @@ -1,5 +1,5 @@ .\" -.\" Copyright (c) 2008-2015 Apple Inc. All rights reserved. 
+.\" Copyright (c) 2008-2016 Apple Inc. All rights reserved. .\" .\" @APPLE_LICENSE_HEADER_START@ .\" @@ -435,6 +435,20 @@ field contains 1 on input, those devices will attach - but cannot be relied upon to provide an accurate count of bytes to be read on output. .El +.It Dv EVFILT_EXCEPT +Takes a descriptor as the identifier, and returns whenever one of the +specified exceptional conditions has occurred on the descriptor. Conditions +are specified in +.Va fflags . +Currently, this filter can be used to monitor the arrival of +out-of-band data on a socket descriptor using the filter flag +.Dv NOTE_OOB . +.Pp +If the read direction of the socket has shut down, then the filter +also sets EV_EOF in +.Va flags , +and returns the socket error (if any) in +.Va fflags . .It EVFILT_WRITE Takes a file descriptor as the identifier, and returns whenever it is possible to write to the descriptor. For sockets, pipes @@ -501,6 +515,11 @@ The file referenced by the descriptor was renamed. Access to the file was revoked via .Xr revoke 2 or the underlying fileystem was unmounted. +.It NOTE_FUNLOCK +The file was unlocked by calling +.Xr flock 2 +or +.Xr close 2 . .El .Pp On return, diff --git a/bsd/man/man2/link.2 b/bsd/man/man2/link.2 index 35010cab7..bfefff7a7 100644 --- a/bsd/man/man2/link.2 +++ b/bsd/man/man2/link.2 @@ -170,7 +170,7 @@ a value of -1 is returned and .Va errno is set to indicate the error. .Sh ERRORS -.Fn Link +.Fn link will fail and no link will be created if: .Bl -tag -width Er .\" ========== diff --git a/bsd/man/man2/listen.2 b/bsd/man/man2/listen.2 index 3949b41a5..d17bdac0f 100644 --- a/bsd/man/man2/listen.2 +++ b/bsd/man/man2/listen.2 @@ -77,7 +77,7 @@ the request may be ignored so that retries may succeed.
.Sh RETURN VALUES .Rv -std listen .Sh ERRORS -.Fn Listen +.Fn listen will fail if: .Bl -tag -width Er .\" ========== diff --git a/bsd/man/man2/lseek.2 b/bsd/man/man2/lseek.2 index 0487277c6..6d29d53b0 100644 --- a/bsd/man/man2/lseek.2 +++ b/bsd/man/man2/lseek.2 @@ -61,7 +61,7 @@ The argument .Fa fildes must be an open file descriptor. -.Fn Lseek +.Fn lseek repositions the file pointer .Fa fildes as follows: @@ -113,7 +113,7 @@ a value of -1 is returned and is set to indicate the error. .Sh ERRORS -.Fn Lseek +.Fn lseek will fail and the file pointer will remain unchanged if: .Bl -tag -width Er .\" ========== diff --git a/bsd/man/man2/mkdir.2 b/bsd/man/man2/mkdir.2 index 11f934240..249662258 100644 --- a/bsd/man/man2/mkdir.2 +++ b/bsd/man/man2/mkdir.2 @@ -98,7 +98,7 @@ A 0 return value indicates success. A -1 return value indicates an error, and an error code is stored in .Va errno . .Sh ERRORS -.Fn Mkdir +.Fn mkdir will fail and no directory will be created if: .Bl -tag -width Er .\" ========== @@ -131,6 +131,9 @@ or allocating the inode. .It Bq Er EIO An I/O error occurred while reading from or writing to the file system. .\" ========== +.It Bq Er EISDIR +The named file is the root directory. +.\" ========== .It Bq Er ELOOP Too many symbolic links were encountered in translating the pathname. This is taken to be indicative of a looping symbolic link. diff --git a/bsd/man/man2/mkfifo.2 b/bsd/man/man2/mkfifo.2 index 7d843f2c5..69ecfd616 100644 --- a/bsd/man/man2/mkfifo.2 +++ b/bsd/man/man2/mkfifo.2 @@ -45,7 +45,7 @@ .Ft int .Fn mkfifo "const char *path" "mode_t mode" .Sh DESCRIPTION -.Fn Mkfifo +.Fn mkfifo creates a new fifo file with name .Fa path . The access permissions are @@ -63,7 +63,7 @@ A 0 return value indicates success. A -1 return value indicates an error, and an error code is stored in .Va errno . 
.Sh ERRORS -.Fn Mkfifo +.Fn mkfifo will fail and no fifo will be created if: .Bl -tag -width Er .It Bq Er ENOTSUP diff --git a/bsd/man/man2/mknod.2 b/bsd/man/man2/mknod.2 index 017bb2508..d01a23fa4 100644 --- a/bsd/man/man2/mknod.2 +++ b/bsd/man/man2/mknod.2 @@ -70,7 +70,7 @@ does not indicate a block special or character special device, .Fa dev is ignored. .Pp -.Fn Mknod +.Fn mknod requires super-user privileges. .Sh RETURN VALUES Upon successful completion, a value of 0 is returned. @@ -78,7 +78,7 @@ Otherwise, a value of -1 is returned and .Va errno is set to indicate the error. .Sh ERRORS -.Fn Mknod +.Fn mknod will fail and the file will be not created if: .Bl -tag -width Er .\" ========== diff --git a/bsd/man/man2/mlock.2 b/bsd/man/man2/mlock.2 index 94584a7b0..64d8226ee 100644 --- a/bsd/man/man2/mlock.2 +++ b/bsd/man/man2/mlock.2 @@ -119,7 +119,7 @@ In this case, the global location .Va errno is set to indicate the error. .Sh ERRORS -.Fn Mlock +.Fn mlock and .Fn munlock will fail if: @@ -133,7 +133,7 @@ Part or all of the specified address range is not mapped to the process. .El .Pp -.Fn Mlock +.Fn mlock will fail if: .Bl -tag -width Er .\" =========== @@ -146,7 +146,7 @@ Some portion of the indicated address range is not allocated. There was an error faulting/mapping a page. .El .Pp -.Fn Munlock +.Fn munlock will fail if: .Bl -tag -width Er .\" =========== diff --git a/bsd/man/man2/mmap.2 b/bsd/man/man2/mmap.2 index 5707d6c2d..9cbe1aa2a 100644 --- a/bsd/man/man2/mmap.2 +++ b/bsd/man/man2/mmap.2 @@ -144,22 +144,14 @@ to .Nm mmap are: .Pp -VM_FLAGS_PURGABLE to create Mach purgable (i.e. volatile) memory +VM_FLAGS_PURGABLE to create Mach purgable (i.e. volatile) memory. .Pp -VM_MAKE_TAG(tag) to associate an 8-bit tag with the region +VM_MAKE_TAG(tag) to associate an 8-bit tag with the region. .br defines some preset tags (with a VM_MEMORY_ prefix). Users are encouraged to use tags between 240 and 255. 
Tags are used by tools such as vmmap(1) to help identify specific memory regions. .Pp -VM_FLAGS_SUPERPAGE_SIZE_* to use superpages for the allocation. -See for supported architectures and sizes (or use -VM_FLAGS_SUPERPAGE_SIZE_ANY to have the kernel choose a size). -The specified size must be divisible by the superpage size (except for -VM_FLAGS_SUPERPAGE_SIZE_ANY), and if you use MAP_FIXED, the specified address -must be properly aligned. If the system cannot satisfy the request with superpages, -the call will fail. Note that currently, superpages are always wired and not -inherited by children of the process. .It Dv MAP_FILE Mapped from a regular file. (This is the default mapping type, and need not be specified.) diff --git a/bsd/man/man2/mount.2 b/bsd/man/man2/mount.2 index a40dc3112..e3245d9f3 100644 --- a/bsd/man/man2/mount.2 +++ b/bsd/man/man2/mount.2 @@ -150,7 +150,7 @@ and the variable .Va errno is set to indicate the error. .Sh ERRORS -.Fn Mount +.Fn mount will fail when one of the following occurs: .Bl -tag -width [ENAMETOOLONG] .It Bq Er EPERM @@ -221,7 +221,7 @@ points outside the process's allocated address space. .Sh BUGS Some of the error codes need translation to more obvious messages. .Sh HISTORY -.Fn Mount +.Fn mount and .Fn unmount function calls appeared in diff --git a/bsd/man/man2/munmap.2 b/bsd/man/man2/munmap.2 index 73a929e08..79722ad81 100644 --- a/bsd/man/man2/munmap.2 +++ b/bsd/man/man2/munmap.2 @@ -110,7 +110,7 @@ Otherwise, a value of -1 is returned and .Va errno is set to indicate the error. .Sh ERRORS -.Fn Munmap +.Fn munmap will fail if: .Bl -tag -width Er .\" =========== diff --git a/bsd/man/man2/pathconf.2 b/bsd/man/man2/pathconf.2 index 9743384ce..630d79172 100644 --- a/bsd/man/man2/pathconf.2 +++ b/bsd/man/man2/pathconf.2 @@ -147,7 +147,7 @@ The implementation does not support an association of the variable name with the associated file. 
.El .Pp -.Fn Pathconf +.Fn pathconf will fail if: .Bl -tag -width ENAMETOOLONGAA .\" =========== @@ -173,7 +173,7 @@ A component of the path prefix is not a directory. .El .Pp .Bl -tag -width ENAMETOOLONGAA -.Fn Fpathconf +.Fn fpathconf will fail if: .\" =========== .It Bq Er EBADF diff --git a/bsd/man/man2/posix_spawn.2 b/bsd/man/man2/posix_spawn.2 index 6a940d5c1..aa339f665 100644 --- a/bsd/man/man2/posix_spawn.2 +++ b/bsd/man/man2/posix_spawn.2 @@ -325,6 +325,10 @@ A component of the path prefix is not a directory. The new process file is a pure procedure (shared text) file that is currently open for writing or reading by some process. .El +.Pp +Additionally, they may fail for any of the reasons listed in +.Xr fork 2 or +.Xr exec 3 . .Sh CAVEAT If a program is .Em setuid diff --git a/bsd/man/man2/read.2 b/bsd/man/man2/read.2 index 65886dc03..d8df563cc 100644 --- a/bsd/man/man2/read.2 +++ b/bsd/man/man2/read.2 @@ -66,21 +66,21 @@ .Fa "int iovcnt" .Fc .Sh DESCRIPTION -.Fn Read +.Fn read attempts to read .Fa nbyte bytes of data from the object referenced by the descriptor .Fa fildes into the buffer pointed to by .Fa buf . -.Fn Readv +.Fn readv performs the same action, but scatters the input data into the .Fa iovcnt buffers specified by the members of the .Fa iov array: iov[0], iov[1], ..., iov[iovcnt\|\-\|1]. -.Fn Pread +.Fn pread performs the same function, but reads from the specified position in the file without modifying the file pointer. @@ -102,7 +102,7 @@ Each .Fa iovec entry specifies the base address and length of an area in memory where data should be placed. -.Fn Readv +.Fn readv will always fill an area completely before proceeding to the next. 
.Pp diff --git a/bsd/man/man2/readlink.2 b/bsd/man/man2/readlink.2 index 8d02c1e0d..375eca319 100644 --- a/bsd/man/man2/readlink.2 +++ b/bsd/man/man2/readlink.2 @@ -53,7 +53,7 @@ .Fa "int fd" "const char *restrict path" "char *restrict buf" "size_t bufsize" .Fc .Sh DESCRIPTION -.Fn Readlink +.Fn readlink places the contents of the symbolic link .Fa path in the buffer @@ -91,7 +91,7 @@ if it succeeds, or a -1 if an error occurs, placing the error code in the global variable .Va errno . .Sh ERRORS -.Fn Readlink +.Fn readlink will fail if: .Bl -tag -width Er .\" =========== diff --git a/bsd/man/man2/reboot.2 b/bsd/man/man2/reboot.2 index 38791c085..f41f940fb 100644 --- a/bsd/man/man2/reboot.2 +++ b/bsd/man/man2/reboot.2 @@ -46,7 +46,7 @@ .Ft int .Fn reboot "int howto" .Sh DESCRIPTION -.Fn Reboot +.Fn reboot reboots the system. Only the super-user may reboot a machine on demand. However, a reboot may be invoked automatically in the event of unrecoverable system failures. Programs other than diff --git a/bsd/man/man2/removexattr.2 b/bsd/man/man2/removexattr.2 index bcb455474..016a084b0 100644 --- a/bsd/man/man2/removexattr.2 +++ b/bsd/man/man2/removexattr.2 @@ -38,7 +38,7 @@ Extended attributes extend the basic attributes associated with files and directories in the file system. They are stored as name:data pairs associated with file system objects (files, directories, symlinks, etc). 
.Pp -.Fn Removexattr +.Fn removexattr deletes the extended attribute .Fa name associated with diff --git a/bsd/man/man2/rename.2 b/bsd/man/man2/rename.2 index f07b1ab3b..49a562912 100644 --- a/bsd/man/man2/rename.2 +++ b/bsd/man/man2/rename.2 @@ -38,7 +38,9 @@ .Os BSD 4.2 .Sh NAME .Nm rename , -.Nm renameat +.Nm renameat , +.Nm renamex_np , +.Nm renameatx_np .Nd change the name of a file .Sh SYNOPSIS .Fd #include @@ -49,6 +51,10 @@ .Fc .Ft int .Fn renameat "int fromfd" "const char *from" "int tofd" "const char *to" +.Ft int +.Fn renamex_np "const char *from" "const char *to" "unsigned int flags" +.Ft int +.Fn renameatx_np "int fromfd" "const char *from" "int tofd" "const char *to" "unsigned int flags" .Sh DESCRIPTION The .Fn rename @@ -110,6 +116,35 @@ or .Fa tofd parameter, the current working directory is used in the determination of the file for the respective path parameter. +.Pp +The +.Fn renamex_np +and +.Fn renameatx_np +system calls are similar to their counterparts except that they take a +.Fa flags +argument. +Values for +.Fa flags +are constructed with the following bits set: +.Bl -tag -offset indent +.It Dv RENAME_SWAP +On file systems that support it (see +.Xr getattrlist 2 +.Dv VOL_CAP_INT_RENAME_SWAP Ns ), +it will cause the source and target to be atomically swapped. Source and target need not be of +the same type, i.e. it is possible to swap a file with a directory. +EINVAL is returned in case of bitwise-inclusive OR with +.Dv RENAME_EXCL . +.It Dv RENAME_EXCL +On file systems that support it (see +.Xr getattrlist 2 +.Dv VOL_CAP_INT_RENAME_EXCL Ns ), +it will cause +.Dv EEXIST +to be returned if the destination already exists. EINVAL is returned in case of bitwise-inclusive OR with +.Dv RENAME_SWAP . +.El .Sh CAVEATS The system can deadlock if a loop is present in the file system graph.
This loop takes the form of an entry in directory @@ -175,6 +210,14 @@ is being placed cannot be extended because the user's quota of disk blocks on the file system containing the directory has been exhausted. .\" =========== +.It Bq Er EEXIST +.Fa flags +has +.Dv RENAME_EXCL +set but +.Fa new +already exists. +.\" =========== .It Bq Er EFAULT .Em Path points outside the process's allocated address space. @@ -187,6 +230,23 @@ or an attempt is made to rename .Ql \&. or .Ql \&.. . +If +.Dv RENAME_SWAP +is used, then +.Dv EINVAL +will also be returned if +.Fa new +is a parent directory of +.Fa old . +If both RENAME_SWAP and RENAME_EXCL bits are set in +.Fa flags , +then +.Dv EINVAL +will be returned. +.\" =========== +.It Bq Er EINVAL +.Fa flags +has an invalid value. .\" =========== .It Bq Er EIO An I/O error occurs while making or updating a directory entry. @@ -216,6 +276,14 @@ or a path prefix of .Fa new does not exist. .\" =========== +.It Bq Er ENOENT +.Fa flags +has +.Dv RENAME_SWAP +set but +.Fa new +does not exist. +.\" =========== .It Bq Er ENOSPC The directory in which the entry for the new name is being placed cannot be extended because there is no space left on the file @@ -234,6 +302,10 @@ is not a directory. .Fa New is a directory and is not empty. .\" =========== +.It Bq Er ENOTSUP +.Fa flags +has a value that is not supported by the file system. +.\" =========== .It Bq Er EPERM The directory containing .Fa old @@ -267,11 +339,11 @@ Note that this error code will not be returned if the implementation permits cross-device links. .El .Pp -In addition to the errors returned by the -.Fn rename , -the +The .Fn renameat -may fail if: +and +.Fn renameatx_np +calls may also fail with: .Bl -tag -width Er .It Bq Er EBADF The @@ -301,6 +373,7 @@ argument is not an absolute path and is neither .Dv AT_FDCWD nor a file descriptor associated with a directory. 
+.El .Sh CONFORMANCE The restriction on renaming a directory whose permissions disallow writing is based on the fact that UFS directories contain a ".." entry. diff --git a/bsd/man/man2/renameatx_np.2 b/bsd/man/man2/renameatx_np.2 new file mode 100644 index 000000000..9b74442c8 --- /dev/null +++ b/bsd/man/man2/renameatx_np.2 @@ -0,0 +1 @@ +.so man2/rename.2 diff --git a/bsd/man/man2/renamex_np.2 b/bsd/man/man2/renamex_np.2 new file mode 100644 index 000000000..9b74442c8 --- /dev/null +++ b/bsd/man/man2/renamex_np.2 @@ -0,0 +1 @@ +.so man2/rename.2 diff --git a/bsd/man/man2/rmdir.2 b/bsd/man/man2/rmdir.2 index 5a47ed7b7..56d1b68e9 100644 --- a/bsd/man/man2/rmdir.2 +++ b/bsd/man/man2/rmdir.2 @@ -46,7 +46,7 @@ .Fa "const char *path" .Fc .Sh DESCRIPTION -.Fn Rmdir +.Fn rmdir removes a directory file whose name is given by .Fa path . diff --git a/bsd/man/man2/send.2 b/bsd/man/man2/send.2 index 3a4e1e652..17f3be243 100644 --- a/bsd/man/man2/send.2 +++ b/bsd/man/man2/send.2 @@ -66,12 +66,12 @@ .Fa "socklen_t dest_len" .Fc .Sh DESCRIPTION -.Fn Send , +.Fn send , .Fn sendto , and .Fn sendmsg are used to transmit a message to another socket. -.Fn Send +.Fn send may be used only when the socket is in a .Em connected state, while diff --git a/bsd/man/man2/setgroups.2 b/bsd/man/man2/setgroups.2 index 0ec3c3086..889c652f7 100644 --- a/bsd/man/man2/setgroups.2 +++ b/bsd/man/man2/setgroups.2 @@ -45,7 +45,7 @@ .Ft int .Fn setgroups "int ngroups" "const gid_t *gidset" .Sh DESCRIPTION -.Fn Setgroups +.Fn setgroups sets the group access list of the current user process according to the array .Fa gidset . 
diff --git a/bsd/man/man2/setpgid.2 b/bsd/man/man2/setpgid.2 index 63ab9783a..07c37c78b 100644 --- a/bsd/man/man2/setpgid.2 +++ b/bsd/man/man2/setpgid.2 @@ -52,7 +52,7 @@ .Fa void .Fc .Sh DESCRIPTION -.Fn Setpgid +.Fn setpgid sets the process group of the specified process .Ar pid to the specified @@ -66,13 +66,13 @@ must have the same effective user-id as the invoker or be a descendant of the invoking process. .Pp .Sh RETURN VALUES -.Fn Setpgid +.Fn setpgid returns 0 when the operation was successful. If the request failed, -1 is returned and the global variable .Va errno indicates the reason. .Sh ERRORS -.Fn Setpgid +.Fn setpgid will fail and the process group will not be altered if: .Bl -tag -width Er .\" =========== diff --git a/bsd/man/man2/sigaltstack.2 b/bsd/man/man2/sigaltstack.2 index 3ca8c96f3..de20d8faa 100644 --- a/bsd/man/man2/sigaltstack.2 +++ b/bsd/man/man2/sigaltstack.2 @@ -47,7 +47,7 @@ .Fa "stack_t *restrict oss" .Fc .Sh DESCRIPTION -.Fn Sigaltstack +.Fn sigaltstack allows users to define an alternate stack on which signals are to be processed. If @@ -133,7 +133,7 @@ Otherwise, a value of -1 is returned and .Va errno is set to indicate the error. .Sh ERRORS -.Fn Sigaltstack +.Fn sigaltstack will fail and the signal stack context will remain unchanged if one of the following occurs. .Bl -tag -width [ENOMEM] diff --git a/bsd/man/man2/sigsuspend.2 b/bsd/man/man2/sigsuspend.2 index 29bb42891..ec4e99ae8 100644 --- a/bsd/man/man2/sigsuspend.2 +++ b/bsd/man/man2/sigsuspend.2 @@ -46,7 +46,7 @@ .Fa "const sigset_t *sigmask" .Fc .Sh DESCRIPTION -.Fn Sigsuspend +.Fn sigsuspend temporarily changes the blocked signal mask to the set to which .Fa sigmask points, diff --git a/bsd/man/man2/statfs.2 b/bsd/man/man2/statfs.2 index 7e2b9ad27..d7233d4e1 100644 --- a/bsd/man/man2/statfs.2 +++ b/bsd/man/man2/statfs.2 @@ -271,6 +271,9 @@ The structure used by these deprecated routines is the same as the .Fa statfs structure when 64-bit inodes are in effect (see above). 
+.Sh SEE ALSO +.Xr stat 2 , +.Xr getfsstat 2 .Sh HISTORY The .Fn statfs diff --git a/bsd/man/man2/sync.2 b/bsd/man/man2/sync.2 index 7440b25ee..51cac1588 100644 --- a/bsd/man/man2/sync.2 +++ b/bsd/man/man2/sync.2 @@ -69,7 +69,7 @@ attributes. .Xr sync 8 , .Xr update 8 .Sh BUGS -.Fn Sync +.Fn sync may return before the buffers are completely flushed. .Sh HISTORY A diff --git a/bsd/man/man2/syscall.2 b/bsd/man/man2/syscall.2 index be9b5dd17..3da6103ca 100644 --- a/bsd/man/man2/syscall.2 +++ b/bsd/man/man2/syscall.2 @@ -45,7 +45,7 @@ .Ft int .Fn syscall "int number" "..." .Sh DESCRIPTION -.Fn Syscall +.Fn syscall performs the system call whose assembly language interface has the specified .Fa number diff --git a/bsd/man/man2/vfork.2 b/bsd/man/man2/vfork.2 index c512eae01..3a07e9965 100644 --- a/bsd/man/man2/vfork.2 +++ b/bsd/man/man2/vfork.2 @@ -46,14 +46,14 @@ .Fa void .Fc .Sh DESCRIPTION -.Fn Vfork +.Fn vfork can be used to create new processes without fully copying the address space of the old process, which is horrendously inefficient in a paged environment. It is useful when the purpose of .Xr fork 2 would have been to create a new system context for an .Xr execve . -.Fn Vfork +.Fn vfork differs from .Xr fork in that the child borrows the parent's memory and thread of @@ -64,11 +64,11 @@ or an exit (either by a call to or abnormally.) The parent process is suspended while the child is using its resources. .Pp -.Fn Vfork +.Fn vfork returns 0 in the child's context and (later) the pid of the child in the parent's context. .Pp -.Fn Vfork +.Fn vfork can normally be used just like .Xr fork . 
It does not work, however, to return while running in the childs context diff --git a/bsd/man/man3/Makefile b/bsd/man/man3/Makefile index d2efa1a7b..fb44344cb 100644 --- a/bsd/man/man3/Makefile +++ b/bsd/man/man3/Makefile @@ -61,4 +61,3 @@ INSTALL_MAN_DIR = man3 include $(MakeInc_rule) include $(MakeInc_dir) - diff --git a/bsd/man/man3/getiopolicy_np.3 b/bsd/man/man3/getiopolicy_np.3 index 087072411..43bc02499 100644 --- a/bsd/man/man3/getiopolicy_np.3 +++ b/bsd/man/man3/getiopolicy_np.3 @@ -25,14 +25,17 @@ can be get or set for the given .Pp The I/O type is specified in the argument .Fa iotype . -The currently supported I/O type is +The only currently supported I/O type is .Dv IOPOL_TYPE_DISK , -which means the I/O policy for I/Os to local disks can be get or set. I/Os to -local disks are I/Os sent to the media without going through a network, +which can mean either the I/O policy for I/Os to local disks or to +remote volumes. +I/Os to local disks are I/Os sent to the media without going through a network, including I/Os to internal and external hard drives, optical media in internal and external drives, flash drives, floppy disks, ram disks, and mounted disk -images which reside on these media, but not including remote volumes mounted -through networks (AFP, SMB, NFS, etc) or disk images residing on remote volumes. +images which reside on these media. +I/Os to remote volumes are I/Os that require network activity to complete the +operation. +This is currently only supported for remote volumes mounted by SMB or AFP. .Pp The scope that the I/O policy takes effect is specified in the argument .Fa scope @@ -109,7 +112,7 @@ call returns 0 if there is no error, or -1 if there is an error. When error happens, the error code is stored in the external variable .Fa errno . 
.Sh ERRORS -.Fn Getiopolicy_np +.Fn getiopolicy_np and .Fn setiopolicy_np will fail if: diff --git a/bsd/man/man3/posix_spawnattr_setflags.3 b/bsd/man/man3/posix_spawnattr_setflags.3 index 3359497ec..21666ad8d 100644 --- a/bsd/man/man3/posix_spawnattr_setflags.3 +++ b/bsd/man/man3/posix_spawnattr_setflags.3 @@ -1,8 +1,8 @@ .\" -.\" Copyright (c) 2000-2010 Apple Inc. All rights reserved. +.\" Copyright (c) 2000-2016 Apple Inc. All rights reserved. .\" .\" @APPLE_OSREFERENCE_LICENSE_HEADER_START@ -.\" +.\" .\" This file contains Original Code and/or Modifications of Original Code .\" as defined in and that are subject to the Apple Public Source License .\" Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ .\" unlawful or unlicensed copies of an Apple operating system, or to .\" circumvent, violate, or enable the circumvention or violation of, any .\" terms of an Apple operating system software license agreement. -.\" +.\" .\" Please obtain a copy of the License at .\" http://www.opensource.apple.com/apsl/ and read it before using this file. -.\" +.\" .\" The Original Code and all software distributed under the License are .\" distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER .\" EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ .\" FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. .\" Please see the License for the specific language governing rights and .\" limitations under the License. -.\" +.\" .\" @APPLE_OSREFERENCE_LICENSE_HEADER_END@ .\" .\" @(#)posix_spawnattr_setflags.3 @@ -83,7 +83,7 @@ parameter equal to the value of the spawn-pgroup value of the as set by .Xr posix_spawnattr_setpgroup 3 .It Dv POSIX_SPAWN_SETSIGDEF -Signals set to to either be caught or to the default action in the +Signals set to either be caught or to the default action in the parent process will also be set to the default action in the child process. 
Signals set to be ignored in the parent process will be ignored in the child. However, if this bit is set, then signals in @@ -113,29 +113,33 @@ will behave as a more featureful .Xr execve 2 . .It Dv POSIX_SPAWN_START_SUSPENDED .Em Apple Extension : -If this bit is set, then the child process will be created with its task -suspended, permitting debuggers, profilers, and other programs to -manipulate the process before it begins execution in user space. This -permits, for example, obtaining exact instruction counts, or debugging -very early in +If this bit is set, then the child process will be created as if it immediately +received a +.Li SIGSTOP +signal, permitting debuggers, profilers, and other programs to manipulate the +process before it begins execution in user space. This permits, for example, +obtaining exact instruction counts, or debugging very early in .Xr dyld 1 . +To resume the child process, it must be sent a +.Li SIGCONT +signal. .It Dv POSIX_SPAWN_CLOEXEC_DEFAULT .Em Apple Extension : If this bit is set, then only file descriptors explicitly described by the .Fa file_actions -argument are available in the spawned process; all -of the other file descriptors are -automatically closed in the spawned process. +argument are available in the spawned process; all of the other file descriptors +are automatically closed in the spawned process. .El .Sh RETURN VALUES -On success, these functions return 0; on failure they return an error -number from +On success, these functions return 0; on failure they return an error number +from .In errno.h . The .Fn posix_spawnattr_getflags -additionally, upon successful completion, modifies the value pointed to be the -.Fa attr -argument by making it equal to the +function additionally, upon successful completion, makes the value pointed to by +the +.Fa flags +argument equal to the .Em spawn-flags attribute of the .Em posix_spawnattr_t . @@ -163,6 +167,7 @@ is invalid. 
.Xr posix_spawnattr_setsigmask 3 , .Xr posix_spawn_file_actions_init 3 , .Xr setpgid 2 , +.Xr signal 3 , .Xr execve 2 , .Xr dyld 1 .Sh STANDARDS diff --git a/bsd/man/man4/Makefile b/bsd/man/man4/Makefile index f385d9990..18dca587c 100644 --- a/bsd/man/man4/Makefile +++ b/bsd/man/man4/Makefile @@ -23,7 +23,6 @@ DATAFILES = \ inet6.4 \ ip.4 \ ip6.4 \ - ipfirewall.4 \ ipl.4 \ ipsec.4 \ lo.4 \ @@ -50,4 +49,3 @@ INSTALL_MAN_DIR = man4 include $(MakeInc_rule) include $(MakeInc_dir) - diff --git a/bsd/man/man4/dummynet.4 b/bsd/man/man4/dummynet.4 index fbd317bf4..80856106e 100644 --- a/bsd/man/man4/dummynet.4 +++ b/bsd/man/man4/dummynet.4 @@ -17,41 +17,17 @@ management policies, and emulating delays and losses. The user interface for .Em dummynet is implemented by the -.Nm ipfw +.Nm dnctl program, so the reader is referred to the -.Xr ipfw 8 +.Xr dnctl 8 manpage for a complete description of the capabilities of .Nm and on how to use it. -.Sh KERNEL OPTIONS -The following options in the kernel configuration file are related to -.Nm -operation: -.Bd -literal - IPFIREWALL - enable ipfirewall (required for dummynet). - IPFIREWALL_VERBOSE - enable firewall output. - IPFIREWALL_VERBOSE_LIMIT - limit firewall output. - DUMMYNET - enable dummynet operation. - NMBCLUSTERS - set the amount of network packet buffers - HZ - sets the timer granularity -.Ed -.Pp -Generally, the following options are required: -.Bd -literal - options IPFIREWALL - options DUMMYNET - options HZ=1000 # strongly recommended -.Ed -.Pp -additionally, one may want to increase the number -of mbuf clusters (used to store network packets) according to the -sum of the bandwidth-delay products and queue sizes of all configured -pipes. 
.Sh SEE ALSO +.Xr dnctl 8 , .Xr setsockopt 2 , .Xr bridge 4 , .Xr ip 4 , -.Xr ipfw 8 , .Xr sysctl 8 .Sh HISTORY .Nm diff --git a/bsd/man/man4/ipfirewall.4 b/bsd/man/man4/ipfirewall.4 deleted file mode 100644 index 9dd42b23a..000000000 --- a/bsd/man/man4/ipfirewall.4 +++ /dev/null @@ -1,246 +0,0 @@ -.Dd June 22, 1997 -.Dt IPFIREWALL 4 -.Os Darwin -.Sh NAME -.Nm ipfirewall -.Nd IP packet filter and traffic accounting -.Sh SYNOPSIS -.Fd #include -.Fd #include -.Fd #include -.Fd #include -.Ft int -.Fn setsockopt raw_socket IPPROTO_IP "ipfw option" "struct ipfw" size -.\"-------------------------------------------------------------------------------------------- -.Sh DESCRIPTION -.\"-------------------------------------------------------------------------------------------- -IPFirewall (sometimes referred to as "ipfw") is a system facility which allows filtering, -redirecting, and other operations on IP packets travelling through network interfaces. Packets -are matched by applying an ordered list of pattern rules against each packet until a match is -found, at which point the corresponding action is taken. Rules are numbered from 1 to 65534; -multiple rules may share the same number. -.Pp -There is one rule that always exists, rule number 65535. This rule normally causes all packets -to be dropped. Hence, any packet which does not match a lower numbered rule will be dropped. -However, the kernel compile time option -.Dv IPFIREWALL_DEFAULT_TO_ACCEPT -allows the administrator to change this fixed rule to permit everything. -.Pp -The buffer passed down via the socket-option call should contain a "struct ip_fw" that is -initialized with the required parameters for the firewall command being invoked. This -structure is consistently required for every firewall command, even though in some cases -the majority of its fields will go unused. The reason for this is the API versioning that -the firewall supports for the sake of backward compatibility. 
The -.Dv version -field of this -structure should always be set to -.Dv IP_FW_CURRENT_API_VERSION -or an EINVAL error will be returned. -.Ss Commands -The following socket options are used to manage the rule list: -.Bl -tag -width "IP_FW_FLUSH" -.It Dv IP_FW_ADD -inserts the rule into the rule list -.It Dv IP_FW_DEL -deletes all rules having the matching rule number -.It Dv IP_FW_GET -returns the (first) rule having the matching rule number -.It Dv IP_FW_ZERO -zeros the statistics associated with all rules having the -matching rule number. -If the rule number is zero, all rules are zeroed. -.It Dv IP_FW_FLUSH -removes all rules (except 65535). -.El -.Pp -When the kernel security level is greater than 2, only -.Dv IP_FW_GET -is allowed. -.\"-------------------------------------------------------------------------------------------- -.Ss Rule Structure -.\"-------------------------------------------------------------------------------------------- -Rules are described by the following structure: -.Bd -literal -/* One ipfw rule */ -struct ip_fw { - u_int32_t version; /* Version of this structure. Should always be */ - /* set to IP_FW_CURRENT_API_VERSION by clients. */ - void *context; /* Context that is usable by user processes to */ - /* identify this rule. 
*/ - u_int64_t fw_pcnt,fw_bcnt; /* Packet and byte counters */ - struct in_addr fw_src, fw_dst; /* Source and destination IP addr */ - struct in_addr fw_smsk, fw_dmsk; /* Mask for src and dest IP addr */ - u_short fw_number; /* Rule number */ - u_int fw_flg; /* Flags word */ -#define IP_FW_MAX_PORTS 10 /* A reasonable maximum */ - union { - u_short fw_pts[IP_FW_MAX_PORTS]; /* Array of port numbers to match */ -#define IP_FW_ICMPTYPES_MAX 128 -#define IP_FW_ICMPTYPES_DIM (IP_FW_ICMPTYPES_MAX / (sizeof(unsigned) * 8)) - unsigned fw_icmptypes[IP_FW_ICMPTYPES_DIM]; /* ICMP types bitmap */ - } fw_uar; - u_int fw_ipflg; /* IP flags word */ - u_char fw_ipopt,fw_ipnopt; /* IP options set/unset */ - u_char fw_tcpopt,fw_tcpnopt; /* TCP options set/unset */ - u_char fw_tcpf,fw_tcpnf; /* TCP flags set/unset */ - long timestamp; /* timestamp (tv_sec) of last match */ - union ip_fw_if fw_in_if, fw_out_if; /* Incoming and outgoing interfaces */ - union { - u_short fu_divert_port; /* Divert/tee port (options IPDIVERT) */ - u_short fu_pipe_nr; /* queue number (option DUMMYNET) */ - u_short fu_skipto_rule; /* SKIPTO command rule number */ - u_short fu_reject_code; /* REJECT response code */ - struct sockaddr_in fu_fwd_ip; - } fw_un; - u_char fw_prot; /* IP protocol */ - /* - * N'of src ports and # of dst ports in ports array (dst ports - * follow src ports; max of 10 ports in all; count of 0 means - * match all ports) - */ - u_char fw_nports; - void *pipe_ptr; /* flow_set ptr for dummynet pipe */ - void *next_rule_ptr ; /* next rule in case of match */ - uid_t fw_uid; /* uid to match */ - int fw_logamount; /* amount to log */ - u_int64_t fw_loghighest; /* highest number packet to log */ -}; - -The ip_fw.h header also contains macros for setting the fw_ports field and various -flags and constants for setting other fields. 
-.Ed -.\"-------------------------------------------------------------------------------------------- -.Ss Rule Actions -.\"-------------------------------------------------------------------------------------------- -Each rule has an action described by the IP_FW_F_COMMAND bits in the flags word: -.Bl -tag -width "IP_FW_F_DIVERT" -.It Dv IP_FW_F_DENY -drop packet -.It Dv IP_FW_F_REJECT -drop packet; send rejection via ICMP or TCP -.It Dv IP_FW_F_ACCEPT -accept packet -.It Dv IP_FW_F_COUNT -increment counters; continue matching -.It Dv IP_FW_F_DIVERT -divert packet to a -.Xr divert 4 -socket -.It Dv IP_FW_F_TEE -copy packet to a -.Xr divert 4 -socket; continue -.It Dv IP_FW_F_SKIPTO -skip to rule number -.Va fu_skipto_rule -.El -.Pp -In the case of -.Dv IP_FW_F_REJECT , -if the -.Va fu_reject_code -is a number from 0 to 255, then an ICMP unreachable packet is sent back to the -original packet's source IP address, with the corresponding code. Otherwise, the -value must be 256 and the protocol -.Dv IPPROTO_TCP , -in which case a TCP reset packet is sent instead. -.Pp -With -.Dv IP_FW_F_SKIPTO , -all succeeding rules having rule number less than -.Va fu_skipto_rule -are skipped. -.Ss Kernel Options -Options in the kernel configuration file: -.Bl -tag -width "options IPFIREWALL_VERBOSE_LIMIT" -.It Cd options IPFIREWALL -enable -.Nm -.It Cd options IPFIREWALL_VERBOSE -enable firewall logging -.It Cd options IPFIREWALL_VERBOSE_LIMIT -limit firewall logging -.It Cd options IPDIVERT -enable -.Xr divert 4 -sockets -.El -.Pp -When packets match a rule with the -.Dv IP_FW_F_PRN -bit set, and if -.Dv IPFIREWALL_VERBOSE -has been enabled,a message is written to -.Pa /dev/klog -with the -.Dv LOG_SECURITY -facility -(see -.Xr syslog 3 ) -for further logging by -.Xr syslogd 8 ; -.Dv IPFIREWALL_VERBOSE_LIMIT -limits the maximum number of times each rule can cause a log message. These variables are also -available via the -.Xr sysctl 3 -interface. 
-.\"-------------------------------------------------------------------------------------------- -.Sh RETURN VALUES -.\"-------------------------------------------------------------------------------------------- -The -.Fn setsockopt -function returns 0 on success. Otherwise, -1 is returned and the global variable -.Va errno -is set to indicate the error. -.\"-------------------------------------------------------------------------------------------- -.Sh ERRORS -.\"-------------------------------------------------------------------------------------------- -The -.Fn setsockopt -function will fail if: -.Bl -tag -width Er -.It Bq Er EINVAL -The IP option field was improperly formed; -an option field was shorter than the minimum value -or longer than the option buffer provided. -.It Bq Er EINVAL -A structural error in ip_fw structure occurred -(n_src_p+n_dst_p too big, ports set for ALL/ICMP protocols etc.). -.It Bq Er EINVAL -The version field of the ip_fw structure was set to a value not supported by the -currently-installed -.Dv IPFirewall, -or no ip_fw structure was passed to it at all. -.It Bq Er EINVAL -An invalid rule number was used. -.El -.\"-------------------------------------------------------------------------------------------- -.Sh SEE ALSO -.\"-------------------------------------------------------------------------------------------- -.Xr setsockopt 2 , -.Xr divert 4 , -.Xr ip 4 , -.Xr ipfw 8 , -.Xr sysctl 8 , -.Xr syslogd 8 -.\"-------------------------------------------------------------------------------------------- -.Sh BUGS -.\"-------------------------------------------------------------------------------------------- -The ``tee'' rule is not yet implemented (currently it has no effect). -.Pp -This man page still needs work. 
-.\"-------------------------------------------------------------------------------------------- -.Sh HISTORY -.\"-------------------------------------------------------------------------------------------- -The ipfw facility was initially written as package to BSDI by -.An Daniel Boulet -.Aq danny@BouletFermat.ab.ca . -It has been heavily modified and ported to -.Fx -by -.An Ugen J.S. Antsilevich -.Aq ugen@NetVision.net.il . -.Pp -Several enhancements added by -.An Archie Cobbs -.Aq archie@FreeBSD.org . diff --git a/bsd/man/man4/random.4 b/bsd/man/man4/random.4 index ed72fa315..3c36e6317 100644 --- a/bsd/man/man4/random.4 +++ b/bsd/man/man4/random.4 @@ -18,9 +18,17 @@ To obtain random bytes, open .Nm /dev/random for reading and read from it. .Pp -To add entropy to the random generation system, open +The same random data is also available from +.Xr getentropy 2 . +Using the +.Xr getentropy 2 +system call interface will provide resiliency to file descriptor exhaustion, chroot, or sandboxing which can make .Nm /dev/random -for writing and write data that you believe to be somehow random. +unavailable. Additionally, the +.Xr arc4random 3 +API provides a fast userspace random number generator built on the +.Nm +data source and is preferred over directly accessing the system's random device. .Pp .Nm /dev/urandom is a compatibility nod to Linux. On Linux, @@ -30,40 +38,13 @@ will produce lower quality output if the entropy pool drains, while will prefer to block and wait for additional entropy to be collected. With Yarrow, this choice and distinction is not necessary, and the two devices behave identically. You may use either. -.Sh OPERATION +.Pp The .Nm device implements the .Nm Yarrow pseudo random number generator algorithm and maintains its entropy pool. -Additional entropy is fed to the generator regularly by the -.Nm SecurityServer -daemon from random jitter measurements of the kernel. 
-.Nm SecurityServer -is also responsible for periodically saving some entropy to disk -and reloading it during startup to provide entropy in early system -operation. -.Pp -You may feed additional entropy to the generator by writing it to the -.Nm -device, though this is not required in a normal operating environment. -.Sh LIMITATIONS AND WARNINGS -.Nm Yarrow -is a fairly resilient algorithm, and is believed -to be resistant to non-root. -The quality of its output is however dependent on regular addition -of appropriate entropy. If the -.Nm SecurityServer -system daemon fails for any reason, output quality will suffer -over time without any explicit indication from the -.Nm -device itself. -.Pp -Paranoid programmers can counteract this risk somewhat by collecting -entropy of their choice (e.g. from keystroke or mouse timings) -and seeding it into -.Nm -directly before obtaining important random numbers. +The kernel automatically seeds the algorithm with additional entropy during normal execution. 
.Sh FILES .Bl -tag -width /dev/urandom -compact .It Pa /dev/random diff --git a/bsd/man/man5/Makefile b/bsd/man/man5/Makefile index bf6093b3f..f9f84a1ae 100644 --- a/bsd/man/man5/Makefile +++ b/bsd/man/man5/Makefile @@ -19,4 +19,3 @@ INSTALL_MAN_DIR = man5 include $(MakeInc_rule) include $(MakeInc_dir) - diff --git a/bsd/man/man8/Makefile b/bsd/man/man7/Makefile similarity index 91% rename from bsd/man/man8/Makefile rename to bsd/man/man7/Makefile index 4c985959c..c393b40b9 100644 --- a/bsd/man/man8/Makefile +++ b/bsd/man/man7/Makefile @@ -7,12 +7,11 @@ include $(MakeInc_cmd) include $(MakeInc_def) DATAFILES = \ - sticky.8 + sticky.7 INSTALL_MAN_LIST = ${DATAFILES} -INSTALL_MAN_DIR = man8 +INSTALL_MAN_DIR = man7 include $(MakeInc_rule) include $(MakeInc_dir) - diff --git a/bsd/man/man8/sticky.8 b/bsd/man/man7/sticky.7 similarity index 78% rename from bsd/man/man8/sticky.8 rename to bsd/man/man7/sticky.7 index a595cb059..18f80ce66 100644 --- a/bsd/man/man8/sticky.8 +++ b/bsd/man/man7/sticky.7 @@ -1,5 +1,3 @@ -.\" $NetBSD: sticky.8,v 1.3 1994/11/30 19:36:27 jtc Exp $ -.\" .\" Copyright (c) 1980, 1991, 1993 .\" The Regents of the University of California. All rights reserved. .\" @@ -11,11 +9,7 @@ .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. -.\" 3. All advertising materials mentioning features or use of this software -.\" must display the following acknowledgement: -.\" This product includes software developed by the University of -.\" California, Berkeley and its contributors. -.\" 4. Neither the name of the University nor the names of its contributors +.\" 3. Neither the name of the University nor the names of its contributors .\" may be used to endorse or promote products derived from this software .\" without specific prior written permission. .\" @@ -32,10 +26,11 @@ .\" SUCH DAMAGE. 
.\" .\" @(#)sticky.8 8.1 (Berkeley) 6/5/93 +.\" $FreeBSD$ .\" .Dd June 5, 1993 -.Dt STICKY 8 -.Os BSD 4 +.Dt STICKY 7 +.Os .Sh NAME .Nm sticky .Nd sticky text and append-only directories @@ -44,17 +39,14 @@ A special file mode, called the .Em sticky bit (mode S_ISVTX), is used to indicate special treatment -for shareable executable files and directories. +for directories. +It is ignored for regular files. See .Xr chmod 2 or the file -.Pa /usr/include/sys/stat.h +.In sys/stat.h for an explanation of file modes. -.Sh STICKY TEXT EXECUTABLE FILES -The sticky bit has no effect on executable files. All optimization on -whether text images remain resident in memory is handled by the -kernel's virtual memory system. .Sh STICKY DIRECTORIES A directory whose `sticky bit' is set becomes an append-only directory, or, more accurately, @@ -73,13 +65,14 @@ Any user may create a sticky directory. See .Xr chmod 1 for details about modifying file modes. +.Sh HISTORY +A +.Nm +command appeared in +.At 32v . .Sh BUGS Neither .Xr open 2 nor .Xr mkdir 2 will create a file with the sticky bit set. -.Sh HISTORY -A -.Nm -command appeared in Version 32V AT&T UNIX. 
diff --git a/bsd/man/man9/Makefile b/bsd/man/man9/Makefile index 769918fec..0baa7c4c6 100644 --- a/bsd/man/man9/Makefile +++ b/bsd/man/man9/Makefile @@ -35,4 +35,3 @@ INSTALL_MAN_DIR = man9 include $(MakeInc_rule) include $(MakeInc_dir) - diff --git a/bsd/miscfs/Makefile b/bsd/miscfs/Makefile index d052d38c7..fccfd3a14 100644 --- a/bsd/miscfs/Makefile +++ b/bsd/miscfs/Makefile @@ -3,7 +3,6 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - include $(MakeInc_cmd) include $(MakeInc_def) @@ -12,16 +11,16 @@ INSTINC_SUBDIRS = \ fifofs \ routefs \ specfs \ - union + union \ + nullfs EXPINC_SUBDIRS = \ devfs \ fifofs \ routefs \ specfs \ - union + union \ + nullfs include $(MakeInc_rule) include $(MakeInc_dir) - - diff --git a/bsd/miscfs/devfs/Makefile b/bsd/miscfs/devfs/Makefile index 163dea893..07501d297 100644 --- a/bsd/miscfs/devfs/Makefile +++ b/bsd/miscfs/devfs/Makefile @@ -3,12 +3,11 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - include $(MakeInc_cmd) include $(MakeInc_def) DATAFILES = \ - devfs.h + devfs.h KERNELFILES = \ devfs.h \ @@ -30,5 +29,3 @@ EXPORT_MI_DIR = miscfs/devfs include $(MakeInc_rule) include $(MakeInc_dir) - - diff --git a/bsd/miscfs/devfs/devfs_vfsops.c b/bsd/miscfs/devfs/devfs_vfsops.c index f8406a251..604cf1b10 100644 --- a/bsd/miscfs/devfs/devfs_vfsops.c +++ b/bsd/miscfs/devfs/devfs_vfsops.c @@ -75,8 +75,9 @@ #include #include #include - +#include #include +#include #if CONFIG_MACF #include @@ -125,8 +126,27 @@ devfs_init(__unused struct vfsconf *vfsp) UID_ROOT, GID_WHEEL, 0666, "null"); devfs_make_node(makedev(3, 3), DEVFS_CHAR, UID_ROOT, GID_WHEEL, 0666, "zero"); - devfs_make_node(makedev(6, 0), DEVFS_CHAR, + uint32_t logging_config = atm_get_diagnostic_config(); + + if ( logging_config & ATM_ENABLE_LEGACY_LOGGING ) { + 
devfs_make_node(makedev(6, 0), DEVFS_CHAR, UID_ROOT, GID_WHEEL, 0600, "klog"); + } + + if ( !(logging_config & ATM_TRACE_DISABLE) ) { + devfs_make_node(makedev(7, 0), DEVFS_CHAR, + UID_ROOT, GID_WHEEL, 0600, "oslog"); + if (cdevsw_setkqueueok(7, (&(cdevsw[7])), 0) == -1) { + return (ENOTSUP); + } + + devfs_make_node(makedev(8, 0), DEVFS_CHAR, + UID_ROOT, GID_WHEEL, 0600, "oslog_stream"); + if (cdevsw_setkqueueok(8, (&(cdevsw[8])), 0) == -1) { + return (ENOTSUP); + } + } + #if FDESC devfs_fdesc_init(); @@ -484,18 +504,16 @@ devfs_kernel_mount(char * mntname) } struct vfsops devfs_vfsops = { - devfs_mount, - devfs_start, - devfs_unmount, - devfs_root, - NULL, /* quotactl */ - devfs_vfs_getattr, - devfs_sync, - devfs_vget, - devfs_fhtovp, - devfs_vptofh, - devfs_init, - devfs_sysctl, - NULL, - {NULL} + .vfs_mount = devfs_mount, + .vfs_start = devfs_start, + .vfs_unmount = devfs_unmount, + .vfs_root = devfs_root, + .vfs_getattr = devfs_vfs_getattr, + .vfs_sync = devfs_sync, + .vfs_vget = devfs_vget, + .vfs_fhtovp = devfs_fhtovp, + .vfs_vptofh = devfs_vptofh, + .vfs_init = devfs_init, + .vfs_sysctl = devfs_sysctl, + // There are other VFS ops that we do not support }; diff --git a/bsd/miscfs/devfs/devfs_vnops.c b/bsd/miscfs/devfs/devfs_vnops.c index 207a50c01..ebf0f0c14 100644 --- a/bsd/miscfs/devfs/devfs_vnops.c +++ b/bsd/miscfs/devfs/devfs_vnops.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -618,10 +618,8 @@ devfs_read(struct vnop_read_args *ap) default: { printf("devfs_read(): bad file type %d", ap->a_vp->v_type); return(EINVAL); - break; } } - return (0); /* not reached */ } static int @@ -633,7 +631,7 @@ devfs_close(struct vnop_close_args *ap) } */ { struct vnode * vp = ap->a_vp; - register devnode_t * dnp; + devnode_t * dnp; if (vnode_isinuse(vp, 1)) { DEVFS_LOCK(); @@ -653,7 +651,7 @@ devfsspec_close(struct vnop_close_args *ap) } */ { struct vnode * vp = ap->a_vp; - register devnode_t * dnp; + devnode_t * dnp; if (vnode_isinuse(vp, 0)) { DEVFS_LOCK(); @@ -725,7 +723,7 @@ devfsspec_read(struct vnop_read_args *ap) kauth_cred_t a_cred; } */ { - register devnode_t * dnp = VTODN(ap->a_vp); + devnode_t * dnp = VTODN(ap->a_vp); devfs_consider_time_update(dnp, DEVFS_UPDATE_ACCESS); @@ -741,7 +739,7 @@ devfsspec_write(struct vnop_write_args *ap) vfs_context_t a_context; } */ { - register devnode_t * dnp = VTODN(ap->a_vp); + devnode_t * dnp = VTODN(ap->a_vp); devfs_consider_time_update(dnp, DEVFS_UPDATE_CHANGE | DEVFS_UPDATE_MOD); @@ -767,7 +765,6 @@ devfs_write(struct vnop_write_args *ap) printf("devfs_write(): bad file type %d", ap->a_vp->v_type); return (EINVAL); } - return 0; /* not reached */ } /* diff --git a/bsd/miscfs/fifofs/Makefile b/bsd/miscfs/fifofs/Makefile index 14f1db889..e67582c32 100644 --- a/bsd/miscfs/fifofs/Makefile +++ b/bsd/miscfs/fifofs/Makefile @@ -3,7 +3,6 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - include $(MakeInc_cmd) include $(MakeInc_def) @@ -16,5 +15,3 @@ EXPORT_MI_DIR = miscfs/fifofs include $(MakeInc_rule) include $(MakeInc_dir) - - diff --git a/bsd/miscfs/fifofs/fifo_vnops.c b/bsd/miscfs/fifofs/fifo_vnops.c index 62cc11fed..0b7bc1d3d 100644 --- a/bsd/miscfs/fifofs/fifo_vnops.c +++ b/bsd/miscfs/fifofs/fifo_vnops.c @@ -326,9 +326,9 @@ fifo_read(struct 
vnop_read_args *ap) /* skip soreceive to avoid blocking when we have no writers */ if (error != EWOULDBLOCK) { error = soreceive(rso, (struct sockaddr **)0, uio, (struct mbuf **)0, - (struct mbuf **)0, &rflags); - if (error == 0 && ap->a_vp->v_knotes.slh_first != NULL) - KNOTE(&ap->a_vp->v_knotes, 0); + (struct mbuf **)0, &rflags); + if (error == 0) + lock_vnode_and_post(ap->a_vp, 0); } else { /* clear EWOULDBLOCK and return EOF (zero) */ @@ -360,8 +360,8 @@ fifo_write(struct vnop_write_args *ap) #endif error = sosend(wso, (struct sockaddr *)0, ap->a_uio, NULL, (struct mbuf *)0, (ap->a_ioflag & IO_NDELAY) ? MSG_NBIO : 0); - if (error == 0 && ap->a_vp->v_knotes.slh_first != NULL) - KNOTE(&ap->a_vp->v_knotes, 0); + if (error == 0) + lock_vnode_and_post(ap->a_vp, 0); return (error); } diff --git a/bsd/miscfs/mockfs/mockfs_vfsops.c b/bsd/miscfs/mockfs/mockfs_vfsops.c index 3aefc8ad6..6116e53aa 100644 --- a/bsd/miscfs/mockfs/mockfs_vfsops.c +++ b/bsd/miscfs/mockfs/mockfs_vfsops.c @@ -241,19 +241,8 @@ int mockfs_init(__unused struct vfsconf * vfsc) } struct vfsops mockfs_vfsops = { - NULL, /* mount */ - NULL, /* start */ - mockfs_unmount, /* unmount */ - mockfs_root, /* root */ - NULL, /* quotactl */ - NULL, /* getattr */ - mockfs_sync, /* sync */ - NULL, /* vget */ - NULL, /* fhtovp */ - NULL, /* vptofh */ - mockfs_init, /* init */ - NULL, /* sysctl */ - NULL, /* setattr */ - {NULL} + .vfs_unmount = mockfs_unmount, + .vfs_root = mockfs_root, + .vfs_sync = mockfs_sync, + .vfs_init = mockfs_init, }; - diff --git a/bsd/miscfs/nullfs/Makefile b/bsd/miscfs/nullfs/Makefile new file mode 100644 index 000000000..898bbe9bf --- /dev/null +++ b/bsd/miscfs/nullfs/Makefile @@ -0,0 +1,22 @@ +export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd +export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def +export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule +export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir + +include $(MakeInc_cmd) +include $(MakeInc_def) + +DATAFILES = + +PRIVATE_KERNELFILES 
= nullfs.h + +INSTALL_MI_LIST = + +INSTALL_MI_DIR = miscfs/nullfs + +EXPORT_MI_LIST = + +EXPORT_MI_DIR = miscfs/nullfs + +include $(MakeInc_rule) +include $(MakeInc_dir) diff --git a/bsd/miscfs/nullfs/null_subr.c b/bsd/miscfs/nullfs/null_subr.c new file mode 100644 index 000000000..79e9d208d --- /dev/null +++ b/bsd/miscfs/nullfs/null_subr.c @@ -0,0 +1,405 @@ +/* + * Copyright (c) 2016 Apple Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +/*- + * Portions Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software donated to Berkeley by + * Jan-Simon Pendry. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)null_subr.c 8.7 (Berkeley) 5/14/95 + * + * $FreeBSD$ + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "nullfs.h" + +/* + * Null layer cache: + * Each cache entry holds a reference to the lower vnode + * along with a pointer to the alias vnode. When an + * entry is added the lower vnode is VREF'd. When the + * alias is removed the lower vnode is vrele'd. 
+ */ + +#define NULL_HASH_SIZE (desiredvnodes / 10) + +/* osx doesn't really have the functionality freebsd uses here..gonna try this + * hacked hash...*/ +#define NULL_NHASH(vp) (&null_node_hashtbl[((((uintptr_t)vp) >> vnsz2log) + (uintptr_t)vnode_mount(vp)) & null_hash_mask]) + +static LIST_HEAD(null_node_hashhead, null_node) * null_node_hashtbl; +static lck_mtx_t null_hashmtx; +static lck_attr_t * null_hashlck_attr; +static lck_grp_t * null_hashlck_grp; +static lck_grp_attr_t * null_hashlck_grp_attr; +static u_long null_hash_mask; + +/* os x doesn't have hashes built into vnode. gonna try doing what freebsd does + anyway + Don't want to create a dependency on vnode_internal.h and the real struct + vnode. + 9 is an eyeball of the log 2 size of vnode */ +static int vnsz2log = 9; + +static int null_hashins(struct mount *, struct null_node *, struct vnode **); + +int +nullfs_init_lck(lck_mtx_t * lck) +{ + int error = 1; + if (lck && null_hashlck_grp && null_hashlck_attr) { + lck_mtx_init(lck, null_hashlck_grp, null_hashlck_attr); + error = 0; + } + return error; +} + +int +nullfs_destroy_lck(lck_mtx_t * lck) +{ + int error = 1; + if (lck && null_hashlck_grp) { + lck_mtx_destroy(lck, null_hashlck_grp); + error = 0; + } + return error; +} + +/* + * Initialise cache headers + */ +int +nullfs_init(__unused struct vfsconf * vfsp) +{ + NULLFSDEBUG("%s\n", __FUNCTION__); + + /* assuming for now that this happens immediately and by default after fs + * installation */ + null_hashlck_grp_attr = lck_grp_attr_alloc_init(); + if (null_hashlck_grp_attr == NULL) { + goto error; + } + null_hashlck_grp = lck_grp_alloc_init("com.apple.filesystems.nullfs", null_hashlck_grp_attr); + if (null_hashlck_grp == NULL) { + goto error; + } + null_hashlck_attr = lck_attr_alloc_init(); + if (null_hashlck_attr == NULL) { + goto error; + } + + lck_mtx_init(&null_hashmtx, null_hashlck_grp, null_hashlck_attr); + null_node_hashtbl = hashinit(NULL_HASH_SIZE, M_TEMP, &null_hash_mask); + 
NULLFSDEBUG("%s finished\n", __FUNCTION__); + return (0); +error: + printf("NULLFS: failed to get lock element\n"); + if (null_hashlck_grp_attr) { + lck_grp_attr_free(null_hashlck_grp_attr); + null_hashlck_grp_attr = NULL; + } + if (null_hashlck_grp) { + lck_grp_free(null_hashlck_grp); + null_hashlck_grp = NULL; + } + if (null_hashlck_attr) { + lck_attr_free(null_hashlck_attr); + null_hashlck_attr = NULL; + } + return KERN_FAILURE; +} + +int +nullfs_uninit() +{ + /* This gets called when the fs is uninstalled, there wasn't an exact + * equivalent in vfsops */ + lck_mtx_destroy(&null_hashmtx, null_hashlck_grp); + FREE(null_node_hashtbl, M_TEMP); + if (null_hashlck_grp_attr) { + lck_grp_attr_free(null_hashlck_grp_attr); + null_hashlck_grp_attr = NULL; + } + if (null_hashlck_grp) { + lck_grp_free(null_hashlck_grp); + null_hashlck_grp = NULL; + } + if (null_hashlck_attr) { + lck_attr_free(null_hashlck_attr); + null_hashlck_attr = NULL; + } + return (0); +} + +/* + * Find the nullfs vnode mapped to lowervp. Return it in *vpp with an iocount if found. + * Return 0 on success. On failure *vpp will be null and a non-zero error code will be returned. + */ +int +null_hashget(struct mount * mp, struct vnode * lowervp, struct vnode ** vpp) +{ + struct null_node_hashhead * hd; + struct null_node * a; + struct vnode * vp; + int error = ENOENT; + + /* + * Find hash base, and then search the (two-way) linked + * list looking for a null_node structure which is referencing + * the lower vnode. 
We only give up our reference at reclaim so + * just check whether the lowervp has gotten pulled from under us + */ + hd = NULL_NHASH(lowervp); + lck_mtx_lock(&null_hashmtx); + LIST_FOREACH(a, hd, null_hash) + { + if (a->null_lowervp == lowervp && vnode_mount(NULLTOV(a)) == mp) { + vp = NULLTOV(a); + if (a->null_lowervid != vnode_vid(lowervp)) { + /*lowervp has reved */ + error = EIO; + } else { + /* if we found something then get an iocount on it */ + error = vnode_getwithvid(vp, a->null_myvid); + if (error == 0) { + *vpp = vp; + } + } + break; + } + } + lck_mtx_unlock(&null_hashmtx); + return error; +} + +/* + * Act like null_hashget, but add passed null_node to hash if no existing + * node found. + */ +static int +null_hashins(struct mount * mp, struct null_node * xp, struct vnode ** vpp) +{ + struct null_node_hashhead * hd; + struct null_node * oxp; + struct vnode * ovp; + int error = 0; + + hd = NULL_NHASH(xp->null_lowervp); + lck_mtx_lock(&null_hashmtx); + LIST_FOREACH(oxp, hd, null_hash) + { + if (oxp->null_lowervp == xp->null_lowervp && vnode_mount(NULLTOV(oxp)) == mp) { + /* + * See null_hashget for a description of this + * operation. + */ + ovp = NULLTOV(oxp); + if (oxp->null_lowervid != vnode_vid(oxp->null_lowervp)) { + /*vp doesn't exist so return null (not sure we are actually gonna catch + recycle right now + This is an exceptional case right now, it suggests the vnode we are + trying to add has been recycled + don't add it.*/ + error = EIO; + goto end; + } + /* if we found something in the hash map then grab an iocount */ + error = vnode_getwithvid(ovp, oxp->null_myvid); + if (error == 0) { + *vpp = ovp; + } + goto end; + } + } + /* if it wasn't in the hash map then the vnode pointed to by xp already has a + * iocount so don't bother */ + LIST_INSERT_HEAD(hd, xp, null_hash); + xp->null_flags |= NULL_FLAG_HASHED; +end: + lck_mtx_unlock(&null_hashmtx); + return error; +} + +/* + * Remove node from hash. 
+ */ +void +null_hashrem(struct null_node * xp) +{ + lck_mtx_lock(&null_hashmtx); + LIST_REMOVE(xp, null_hash); + lck_mtx_unlock(&null_hashmtx); +} + +static struct null_node * +null_nodecreate(struct vnode * lowervp) +{ + struct null_node * xp; + + MALLOC(xp, struct null_node *, sizeof(struct null_node), M_TEMP, M_WAITOK | M_ZERO); + if (xp != NULL) { + if (lowervp) { + xp->null_lowervp = lowervp; + xp->null_lowervid = vnode_vid(lowervp); + } + } + return xp; +} + +/* assumption is that vnode has iocount on it after vnode create */ +int +null_getnewvnode( + struct mount * mp, struct vnode * lowervp, struct vnode * dvp, struct vnode ** vpp, struct componentname * cnp, int root) +{ + struct vnode_fsparam vnfs_param; + int error = 0; + enum vtype type = VDIR; + struct null_node * xp = null_nodecreate(lowervp); + + if (xp == NULL) { + return ENOMEM; + } + + if (lowervp) { + type = vnode_vtype(lowervp); + } + + vnfs_param.vnfs_mp = mp; + vnfs_param.vnfs_vtype = type; + vnfs_param.vnfs_str = "nullfs"; + vnfs_param.vnfs_dvp = dvp; + vnfs_param.vnfs_fsnode = (void *)xp; + vnfs_param.vnfs_vops = nullfs_vnodeop_p; + vnfs_param.vnfs_markroot = root; + vnfs_param.vnfs_marksystem = 0; + vnfs_param.vnfs_rdev = 0; + vnfs_param.vnfs_filesize = 0; // set this to 0 since we should only be shadowing non-regular files + vnfs_param.vnfs_cnp = cnp; + vnfs_param.vnfs_flags = VNFS_ADDFSREF; + + error = vnode_create(VNCREATE_FLAVOR, VCREATESIZE, &vnfs_param, vpp); + if (error == 0) { + xp->null_vnode = *vpp; + xp->null_myvid = vnode_vid(*vpp); + vnode_settag(*vpp, VT_NULL); + } else { + FREE(xp, M_TEMP); + } + return error; +} + +/* + * Make a new or get existing nullfs node. + * Vp is the alias vnode, lowervp is the lower vnode. 
+ * + * lowervp is assumed to have an iocount on it from the caller + */ +int +null_nodeget( + struct mount * mp, struct vnode * lowervp, struct vnode * dvp, struct vnode ** vpp, struct componentname * cnp, int root) +{ + struct vnode * vp; + int error; + + /* Lookup the hash firstly. */ + error = null_hashget(mp, lowervp, vpp); + /* ENOENT means it wasn't found, EIO is a failure we should bail from, 0 is it + * was found */ + if (error != ENOENT) { + /* null_hashget checked the vid, so if we got something here its legit to + * the best of our knowledge*/ + /* if we found something then there is an iocount on vpp, + if we didn't find something then vpp shouldn't be used by the caller */ + return error; + } + + /* + * We do not serialize vnode creation, instead we will check for + * duplicates later, when adding new vnode to hash. + */ + error = vnode_ref(lowervp); // take a ref on lowervp so we let the system know we care about it + if(error) + { + // Failed to get a reference on the lower vp so bail. Lowervp may be gone already. + return error; + } + + error = null_getnewvnode(mp, lowervp, dvp, &vp, cnp, root); + + if (error) { + vnode_rele(lowervp); + return (error); + } + + /* + * Atomically insert our new node into the hash or vget existing + * if someone else has beaten us to it. + */ + error = null_hashins(mp, VTONULL(vp), vpp); + if (error || *vpp != NULL) { + /* recycle will call reclaim which will get rid of the internals */ + vnode_recycle(vp); + vnode_put(vp); + /* if we found vpp, then null_hashins put an iocount on it */ + return error; + } + + /* vp has an iocount from null_getnewvnode */ + *vpp = vp; + + return (0); +} diff --git a/bsd/miscfs/nullfs/null_vfsops.c b/bsd/miscfs/nullfs/null_vfsops.c new file mode 100644 index 000000000..5a191d2e1 --- /dev/null +++ b/bsd/miscfs/nullfs/null_vfsops.c @@ -0,0 +1,557 @@ +/* + * Copyright (c) 2016 Apple Inc. All rights reserved. 
+ * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +/*- + * Portions Copyright (c) 1992, 1993, 1995 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software donated to Berkeley by + * Jan-Simon Pendry. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)null_vfsops.c 8.2 (Berkeley) 1/21/94 + * + * @(#)lofs_vfsops.c 1.2 (Berkeley) 6/18/92 + * $FreeBSD$ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include "nullfs.h" + +#define NULLFS_ENTITLEMENT "com.apple.private.nullfs_allow" + +#define SIZEOF_MEMBER(type, member) (sizeof(((type *)0)->member)) +#define MAX_MNT_FROM_LENGTH (SIZEOF_MEMBER(struct vfsstatfs, f_mntfromname)) + +static int +nullfs_vfs_getlowerattr(mount_t mp, struct vfs_attr * vfap, vfs_context_t ctx) +{ + memset(vfap, 0, sizeof(*vfap)); + VFSATTR_INIT(vfap); + VFSATTR_WANTED(vfap, f_bsize); + VFSATTR_WANTED(vfap, f_iosize); + VFSATTR_WANTED(vfap, f_blocks); + VFSATTR_WANTED(vfap, f_bfree); + VFSATTR_WANTED(vfap, f_bavail); + VFSATTR_WANTED(vfap, f_bused); + VFSATTR_WANTED(vfap, f_files); + VFSATTR_WANTED(vfap, f_ffree); + VFSATTR_WANTED(vfap, f_capabilities); + + return vfs_getattr(mp, vfap, ctx); +} + +/* + * Mount null layer + */ +static int +nullfs_mount(struct mount * mp, __unused vnode_t devvp, user_addr_t user_data, vfs_context_t ctx) +{ + int error = 0; + struct vnode *lowerrootvp = NULL, *vp = NULL; + 
struct vfsstatfs * sp = NULL; + struct null_mount * xmp = NULL; + char data[MAXPATHLEN]; + size_t count; + struct vfs_attr vfa; + /* set defaults (arbitrary since this file system is readonly) */ + uint32_t bsize = BLKDEV_IOSIZE; + size_t iosize = BLKDEV_IOSIZE; + uint64_t blocks = 4711 * 4711; + uint64_t bfree = 0; + uint64_t bavail = 0; + uint64_t bused = 4711; + uint64_t files = 4711; + uint64_t ffree = 0; + + kauth_cred_t cred = vfs_context_ucred(ctx); + + NULLFSDEBUG("nullfs_mount(mp = %p) %llx\n", (void *)mp, vfs_flags(mp)); + + if (vfs_flags(mp) & MNT_ROOTFS) + return (EOPNOTSUPP); + + /* + * Update is a no-op + */ + if (vfs_isupdate(mp)) { + return ENOTSUP; + } + + /* check entitlement */ + if (!IOTaskHasEntitlement(current_task(), NULLFS_ENTITLEMENT)) { + return EPERM; + } + + /* + * Get argument + */ + error = copyinstr(user_data, data, MAXPATHLEN - 1, &count); + if (error) { + NULLFSDEBUG("nullfs: error copying data form user %d\n", error); + goto error; + } + + /* This could happen if the system is configured for 32 bit inodes instead of + * 64 bit */ + if (count > MAX_MNT_FROM_LENGTH) { + error = EINVAL; + NULLFSDEBUG("nullfs: path to translocate too large for this system %d vs %d\n", count, MAX_MNT_FROM_LENGTH); + goto error; + } + + error = vnode_lookup(data, 0, &lowerrootvp, ctx); + if (error) { + NULLFSDEBUG("lookup %s -> %d\n", data, error); + goto error; + } + + /* lowervrootvp has an iocount after vnode_lookup, drop that for a usecount. + Keep this to signal what we want to keep around the thing we are mirroring. + Drop it in unmount.*/ + error = vnode_ref(lowerrootvp); + vnode_put(lowerrootvp); + if (error) + { + // If vnode_ref failed, then null it out so it can't be used anymore in cleanup. 
+ lowerrootvp = NULL; + goto error; + } + + NULLFSDEBUG("mount %s\n", data); + + MALLOC(xmp, struct null_mount *, sizeof(*xmp), M_TEMP, M_WAITOK | M_ZERO); + if (xmp == NULL) { + error = ENOMEM; + goto error; + } + + /* + * Save reference to underlying FS + */ + xmp->nullm_lowerrootvp = lowerrootvp; + xmp->nullm_lowerrootvid = vnode_vid(lowerrootvp); + + error = null_getnewvnode(mp, NULL, NULL, &vp, NULL, 1); + if (error) { + goto error; + } + + /* vp has an iocount on it from vnode_create. drop that for a usecount. This + * is our root vnode so we drop the ref in unmount + * + * Assuming for now that because we created this vnode and we aren't finished mounting we can get a ref*/ + vnode_ref(vp); + vnode_put(vp); + + error = nullfs_init_lck(&xmp->nullm_lock); + if (error) { + goto error; + } + + xmp->nullm_rootvp = vp; + + /* read the flags the user set, but then ignore some of them, we will only + allow them if they are set on the lower file system */ + uint64_t flags = vfs_flags(mp) & (~(MNT_IGNORE_OWNERSHIP | MNT_LOCAL)); + uint64_t lowerflags = vfs_flags(vnode_mount(lowerrootvp)) & (MNT_LOCAL | MNT_QUARANTINE | MNT_IGNORE_OWNERSHIP | MNT_NOEXEC); + + if (lowerflags) { + flags |= lowerflags; + } + + /* force these flags */ + flags |= (MNT_DONTBROWSE | MNT_MULTILABEL | MNT_NOSUID | MNT_RDONLY); + vfs_setflags(mp, flags); + + vfs_setfsprivate(mp, xmp); + vfs_getnewfsid(mp); + vfs_setlocklocal(mp); + + /* fill in the stat block */ + sp = vfs_statfs(mp); + strlcpy(sp->f_mntfromname, data, MAX_MNT_FROM_LENGTH); + + sp->f_flags = flags; + + xmp->nullm_flags = NULLM_CASEINSENSITIVE; /* default to case insensitive */ + + error = nullfs_vfs_getlowerattr(vnode_mount(lowerrootvp), &vfa, ctx); + if (error == 0) { + if (VFSATTR_IS_SUPPORTED(&vfa, f_bsize)) { + bsize = vfa.f_bsize; + } + if (VFSATTR_IS_SUPPORTED(&vfa, f_iosize)) { + iosize = vfa.f_iosize; + } + if (VFSATTR_IS_SUPPORTED(&vfa, f_blocks)) { + blocks = vfa.f_blocks; + } + if (VFSATTR_IS_SUPPORTED(&vfa, f_bfree)) 
{ + bfree = vfa.f_bfree; + } + if (VFSATTR_IS_SUPPORTED(&vfa, f_bavail)) { + bavail = vfa.f_bavail; + } + if (VFSATTR_IS_SUPPORTED(&vfa, f_bused)) { + bused = vfa.f_bused; + } + if (VFSATTR_IS_SUPPORTED(&vfa, f_files)) { + files = vfa.f_files; + } + if (VFSATTR_IS_SUPPORTED(&vfa, f_ffree)) { + ffree = vfa.f_ffree; + } + if (VFSATTR_IS_SUPPORTED(&vfa, f_capabilities)) { + if ((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & (VOL_CAP_FMT_CASE_SENSITIVE)) && + (vfa.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & (VOL_CAP_FMT_CASE_SENSITIVE))) { + xmp->nullm_flags &= ~NULLM_CASEINSENSITIVE; + } + } + } else { + goto error; + } + + sp->f_bsize = bsize; + sp->f_iosize = iosize; + sp->f_blocks = blocks; + sp->f_bfree = bfree; + sp->f_bavail = bavail; + sp->f_bused = bused; + sp->f_files = files; + sp->f_ffree = ffree; + + /* Associate the mac label information from the mirrored filesystem with the + * mirror */ + MAC_PERFORM(mount_label_associate, cred, vnode_mount(lowerrootvp), vfs_mntlabel(mp)); + + NULLFSDEBUG("nullfs_mount: lower %s, alias at %s\n", sp->f_mntfromname, sp->f_mntonname); + return (0); + +error: + if (xmp) { + FREE(xmp, M_TEMP); + } + if (lowerrootvp) { + vnode_getwithref(lowerrootvp); + vnode_rele(lowerrootvp); + vnode_put(lowerrootvp); + } + if (vp) { + /* we made the root vnode but the mount is failed, so clean it up */ + vnode_getwithref(vp); + vnode_rele(vp); + /* give vp back */ + vnode_recycle(vp); + vnode_put(vp); + } + return error; +} + +/* + * Free reference to null layer + */ +static int +nullfs_unmount(struct mount * mp, int mntflags, __unused vfs_context_t ctx) +{ + struct null_mount * mntdata; + struct vnode * vp; + int error, flags; + + NULLFSDEBUG("nullfs_unmount: mp = %p\n", (void *)mp); + + /* check entitlement or superuser*/ + if (!IOTaskHasEntitlement(current_task(), NULLFS_ENTITLEMENT) && + vfs_context_suser(ctx) != 0) { + return EPERM; + } + + if (mntflags & MNT_FORCE) { + flags = FORCECLOSE; + } else { + flags = 0; + } 
+ + mntdata = MOUNTTONULLMOUNT(mp); + vp = mntdata->nullm_rootvp; + + // release our reference on the root before flushing. + // it will get pulled out of the mount structure by reclaim + vnode_getalways(vp); + + error = vflush(mp, vp, flags); + if (error) + { + vnode_put(vp); + return (error); + } + + if (vnode_isinuse(vp,1) && flags == 0) + { + vnode_put(vp); + return EBUSY; + } + + vnode_rele(vp); // Drop reference taken by nullfs_mount + vnode_put(vp); // Drop ref taken above + + //Force close to get rid of the last vnode + (void)vflush(mp, NULL, FORCECLOSE); + + /* no more vnodes, so tear down the mountpoint */ + + lck_mtx_lock(&mntdata->nullm_lock); + + vfs_setfsprivate(mp, NULL); + + vnode_getalways(mntdata->nullm_lowerrootvp); + vnode_rele(mntdata->nullm_lowerrootvp); + vnode_put(mntdata->nullm_lowerrootvp); + + lck_mtx_unlock(&mntdata->nullm_lock); + + nullfs_destroy_lck(&mntdata->nullm_lock); + + FREE(mntdata, M_TEMP); + + uint64_t vflags = vfs_flags(mp); + vfs_setflags(mp, vflags & ~MNT_LOCAL); + + return (0); +} + +static int +nullfs_root(struct mount * mp, struct vnode ** vpp, __unused vfs_context_t ctx) +{ + struct vnode * vp; + int error; + + NULLFSDEBUG("nullfs_root(mp = %p, vp = %p)\n", (void *)mp, (void *)MOUNTTONULLMOUNT(mp)->nullm_rootvp); + + /* + * Return locked reference to root. 
+ */ + vp = MOUNTTONULLMOUNT(mp)->nullm_rootvp; + + error = vnode_get(vp); + if (error) + return error; + + *vpp = vp; + return 0; +} + +static int +nullfs_vfs_getattr(struct mount * mp, struct vfs_attr * vfap, vfs_context_t ctx) +{ + struct vnode * coveredvp = NULL; + struct vfs_attr vfa; + struct null_mount * null_mp = MOUNTTONULLMOUNT(mp); + vol_capabilities_attr_t capabilities; + struct vfsstatfs * sp = vfs_statfs(mp); + + struct timespec tzero = {0, 0}; + + NULLFSDEBUG("%s\n", __FUNCTION__); + + /* Set default capabilities in case the lower file system is gone */ + memset(&capabilities, 0, sizeof(capabilities)); + capabilities.capabilities[VOL_CAPABILITIES_FORMAT] = VOL_CAP_FMT_FAST_STATFS | VOL_CAP_FMT_HIDDEN_FILES; + capabilities.valid[VOL_CAPABILITIES_FORMAT] = VOL_CAP_FMT_FAST_STATFS | VOL_CAP_FMT_HIDDEN_FILES; + + if (nullfs_vfs_getlowerattr(vnode_mount(null_mp->nullm_lowerrootvp), &vfa, ctx) == 0) { + if (VFSATTR_IS_SUPPORTED(&vfa, f_capabilities)) { + memcpy(&capabilities, &vfa.f_capabilities, sizeof(capabilities)); + /* don't support vget */ + capabilities.capabilities[VOL_CAPABILITIES_FORMAT] &= ~(VOL_CAP_FMT_PERSISTENTOBJECTIDS | VOL_CAP_FMT_PATH_FROM_ID); + + capabilities.capabilities[VOL_CAPABILITIES_FORMAT] |= VOL_CAP_FMT_HIDDEN_FILES; /* Always support UF_HIDDEN */ + + capabilities.valid[VOL_CAPABILITIES_FORMAT] &= ~(VOL_CAP_FMT_PERSISTENTOBJECTIDS | VOL_CAP_FMT_PATH_FROM_ID); + + capabilities.valid[VOL_CAPABILITIES_FORMAT] |= VOL_CAP_FMT_HIDDEN_FILES; /* Always support UF_HIDDEN */ + + /* dont' support interfaces that only make sense on a writable file system + * or one with specific vnops implemented */ + capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] = 0; + + capabilities.valid[VOL_CAPABILITIES_INTERFACES] &= + ~(VOL_CAP_INT_SEARCHFS | VOL_CAP_INT_ATTRLIST | VOL_CAP_INT_READDIRATTR | VOL_CAP_INT_EXCHANGEDATA | + VOL_CAP_INT_COPYFILE | VOL_CAP_INT_ALLOCATE | VOL_CAP_INT_VOL_RENAME | VOL_CAP_INT_ADVLOCK | VOL_CAP_INT_FLOCK); + } + } + + 
if (VFSATTR_IS_ACTIVE(vfap, f_create_time)) + VFSATTR_RETURN(vfap, f_create_time, tzero); + + if (VFSATTR_IS_ACTIVE(vfap, f_modify_time)) + VFSATTR_RETURN(vfap, f_modify_time, tzero); + + if (VFSATTR_IS_ACTIVE(vfap, f_access_time)) + VFSATTR_RETURN(vfap, f_access_time, tzero); + + if (VFSATTR_IS_ACTIVE(vfap, f_bsize)) + VFSATTR_RETURN(vfap, f_bsize, sp->f_bsize); + + if (VFSATTR_IS_ACTIVE(vfap, f_iosize)) + VFSATTR_RETURN(vfap, f_iosize, sp->f_iosize); + + if (VFSATTR_IS_ACTIVE(vfap, f_owner)) + VFSATTR_RETURN(vfap, f_owner, 0); + + if (VFSATTR_IS_ACTIVE(vfap, f_blocks)) + VFSATTR_RETURN(vfap, f_blocks, sp->f_blocks); + + if (VFSATTR_IS_ACTIVE(vfap, f_bfree)) + VFSATTR_RETURN(vfap, f_bfree, sp->f_bfree); + + if (VFSATTR_IS_ACTIVE(vfap, f_bavail)) + VFSATTR_RETURN(vfap, f_bavail, sp->f_bavail); + + if (VFSATTR_IS_ACTIVE(vfap, f_bused)) + VFSATTR_RETURN(vfap, f_bused, sp->f_bused); + + if (VFSATTR_IS_ACTIVE(vfap, f_files)) + VFSATTR_RETURN(vfap, f_files, sp->f_files); + + if (VFSATTR_IS_ACTIVE(vfap, f_ffree)) + VFSATTR_RETURN(vfap, f_ffree, sp->f_ffree); + + if (VFSATTR_IS_ACTIVE(vfap, f_fssubtype)) + VFSATTR_RETURN(vfap, f_fssubtype, 0); + + if (VFSATTR_IS_ACTIVE(vfap, f_capabilities)) { + memcpy(&vfap->f_capabilities, &capabilities, sizeof(vol_capabilities_attr_t)); + + VFSATTR_SET_SUPPORTED(vfap, f_capabilities); + } + + if (VFSATTR_IS_ACTIVE(vfap, f_attributes)) { + vol_attributes_attr_t * volattr = &vfap->f_attributes; + + volattr->validattr.commonattr = 0; + volattr->validattr.volattr = ATTR_VOL_NAME | ATTR_VOL_CAPABILITIES | ATTR_VOL_ATTRIBUTES; + volattr->validattr.dirattr = 0; + volattr->validattr.fileattr = 0; + volattr->validattr.forkattr = 0; + + volattr->nativeattr.commonattr = 0; + volattr->nativeattr.volattr = ATTR_VOL_NAME | ATTR_VOL_CAPABILITIES | ATTR_VOL_ATTRIBUTES; + volattr->nativeattr.dirattr = 0; + volattr->nativeattr.fileattr = 0; + volattr->nativeattr.forkattr = 0; + + VFSATTR_SET_SUPPORTED(vfap, f_attributes); + } + + if 
(VFSATTR_IS_ACTIVE(vfap, f_vol_name)) { + /* The name of the volume is the same as the directory we mounted on */ + coveredvp = vfs_vnodecovered(mp); + if (coveredvp) { + const char * name = vnode_getname_printable(coveredvp); + strlcpy(vfap->f_vol_name, name, MAXPATHLEN); + vnode_putname_printable(name); + + VFSATTR_SET_SUPPORTED(vfap, f_vol_name); + vnode_put(coveredvp); + } + } + + return 0; +} + +static int +nullfs_sync(__unused struct mount * mp, __unused int waitfor, __unused vfs_context_t ctx) +{ + /* + * XXX - Assumes no data cached at null layer. + */ + return (0); +} + + + +static int +nullfs_vfs_start(__unused struct mount * mp, __unused int flags, __unused vfs_context_t ctx) +{ + NULLFSDEBUG("%s\n", __FUNCTION__); + return 0; +} + +extern struct vnodeopv_desc nullfs_vnodeop_opv_desc; + +struct vnodeopv_desc * nullfs_vnodeopv_descs[] = { + &nullfs_vnodeop_opv_desc, +}; + +struct vfsops nullfs_vfsops = { + .vfs_mount = nullfs_mount, + .vfs_unmount = nullfs_unmount, + .vfs_start = nullfs_vfs_start, + .vfs_root = nullfs_root, + .vfs_getattr = nullfs_vfs_getattr, + .vfs_sync = nullfs_sync, + .vfs_init = nullfs_init, + .vfs_sysctl = NULL, + .vfs_setattr = NULL, +}; + diff --git a/bsd/miscfs/nullfs/null_vnops.c b/bsd/miscfs/nullfs/null_vnops.c new file mode 100644 index 000000000..389adb7e4 --- /dev/null +++ b/bsd/miscfs/nullfs/null_vnops.c @@ -0,0 +1,1037 @@ +/* + * Copyright (c) 2016 Apple Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +/*- + * Portions Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * John Heidemann of the UCLA Ficus project. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)null_vnops.c 8.6 (Berkeley) 5/27/95 + * + * Ancestors: + * @(#)lofs_vnops.c 1.2 (Berkeley) 6/18/92 + * ...and... + * @(#)null_vnodeops.c 1.20 92/07/07 UCLA Ficus project + * + * $FreeBSD$ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "nullfs.h" + +#define NULL_ROOT_INO 2 +#define NULL_SECOND_INO 3 +#define NULL_THIRD_INO 4 + +vop_t * nullfs_vnodeop_p = NULL; + +/* the mountpoint lock should be held going into this function */ +static int +nullfs_isspecialvp(struct vnode * vp) +{ + struct null_mount * null_mp; + + null_mp = MOUNTTONULLMOUNT(vnode_mount(vp)); + + /* only check for root and second here, third is special in a different way, + * related only to lookup and readdir */ + if (vp && (vp == null_mp->nullm_rootvp || vp == null_mp->nullm_secondvp)) { + return 1; + } + return 0; +} + +/* helper function to handle locking where possible */ +static int +nullfs_checkspecialvp(struct vnode* vp) +{ + int result = 0; + struct null_mount * null_mp; + + null_mp = MOUNTTONULLMOUNT(vnode_mount(vp)); + + lck_mtx_lock(&null_mp->nullm_lock); + result = (nullfs_isspecialvp(vp)); + lck_mtx_unlock(&null_mp->nullm_lock); + + return result; +} + +static int +nullfs_default(__unused struct vnop_generic_args * args) +{ + NULLFSDEBUG("%s (default)\n", ((struct vnodeop_desc_fake *)args->a_desc)->vdesc_name); + return ENOTSUP; 
+} + +static int +nullfs_special_getattr(struct vnop_getattr_args * args) +{ + mount_t mp = vnode_mount(args->a_vp); + struct null_mount * null_mp = MOUNTTONULLMOUNT(mp); + + ino_t ino = NULL_ROOT_INO; + struct vnode_attr covered_rootattr; + vnode_t checkvp = null_mp->nullm_lowerrootvp; + + VATTR_INIT(&covered_rootattr); + VATTR_WANTED(&covered_rootattr, va_uid); + VATTR_WANTED(&covered_rootattr, va_gid); + VATTR_WANTED(&covered_rootattr, va_create_time); + VATTR_WANTED(&covered_rootattr, va_modify_time); + VATTR_WANTED(&covered_rootattr, va_access_time); + + /* prefer to get this from the lower root vp, but if not (i.e. forced unmount + * of lower fs) try the mount point covered vnode */ + if (vnode_getwithvid(checkvp, null_mp->nullm_lowerrootvid)) { + checkvp = vfs_vnodecovered(mp); + if (checkvp == NULL) { + return EIO; + } + } + + int error = vnode_getattr(checkvp, &covered_rootattr, args->a_context); + + vnode_put(checkvp); + if (error) { + /* we should have been able to get attributes fore one of the two choices so + * fail if we didn't */ + return error; + } + + /* we got the attributes of the vnode we cover so plow ahead */ + if (args->a_vp == null_mp->nullm_secondvp) { + ino = NULL_SECOND_INO; + } + + VATTR_RETURN(args->a_vap, va_type, vnode_vtype(args->a_vp)); + VATTR_RETURN(args->a_vap, va_rdev, 0); + VATTR_RETURN(args->a_vap, va_nlink, 3); /* always just ., .., and the child */ + VATTR_RETURN(args->a_vap, va_total_size, 0); // hoping this is ok + + VATTR_RETURN(args->a_vap, va_data_size, 0); // hoping this is ok + VATTR_RETURN(args->a_vap, va_data_alloc, 0); + VATTR_RETURN(args->a_vap, va_iosize, vfs_statfs(mp)->f_iosize); + VATTR_RETURN(args->a_vap, va_fileid, ino); + VATTR_RETURN(args->a_vap, va_linkid, ino); + VATTR_RETURN(args->a_vap, va_fsid, vfs_statfs(mp)->f_fsid.val[0]); // return the fsid of the mount point + VATTR_RETURN(args->a_vap, va_filerev, 0); + VATTR_RETURN(args->a_vap, va_gen, 0); + VATTR_RETURN(args->a_vap, va_flags, UF_HIDDEN); /* 
mark our fake directories as hidden. People + shouldn't be enocouraged to poke around in them */ + + if (ino == NULL_SECOND_INO) { + VATTR_RETURN(args->a_vap, va_parentid, NULL_ROOT_INO); /* no parent at the root, so + the only other vnode that + goes through this path is + second and its parent is + 1.*/ + } + + if (VATTR_IS_ACTIVE(args->a_vap, va_mode)) { + /* force dr_xr_xr_x */ + VATTR_RETURN(args->a_vap, va_mode, S_IFDIR | S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH); + } + if (VATTR_IS_ACTIVE(args->a_vap, va_uid)) { + VATTR_RETURN(args->a_vap, va_uid, covered_rootattr.va_uid); + } + if (VATTR_IS_ACTIVE(args->a_vap, va_gid)) { + VATTR_RETURN(args->a_vap, va_gid, covered_rootattr.va_gid); + } + + if (VATTR_IS_ACTIVE(args->a_vap, va_create_time)) { + VATTR_SET_SUPPORTED(args->a_vap, va_create_time); + args->a_vap->va_create_time.tv_sec = covered_rootattr.va_create_time.tv_sec; + args->a_vap->va_create_time.tv_nsec = covered_rootattr.va_create_time.tv_nsec; + } + if (VATTR_IS_ACTIVE(args->a_vap, va_modify_time)) { + VATTR_SET_SUPPORTED(args->a_vap, va_modify_time); + args->a_vap->va_modify_time.tv_sec = covered_rootattr.va_modify_time.tv_sec; + args->a_vap->va_modify_time.tv_nsec = covered_rootattr.va_modify_time.tv_nsec; + } + if (VATTR_IS_ACTIVE(args->a_vap, va_access_time)) { + VATTR_SET_SUPPORTED(args->a_vap, va_access_time); + args->a_vap->va_modify_time.tv_sec = covered_rootattr.va_access_time.tv_sec; + args->a_vap->va_modify_time.tv_nsec = covered_rootattr.va_access_time.tv_nsec; + } + + return 0; +} + +static int +nullfs_getattr(struct vnop_getattr_args * args) +{ + int error; + struct null_mount * null_mp = MOUNTTONULLMOUNT(vnode_mount(args->a_vp)); + NULLFSDEBUG("%s %p\n", __FUNCTION__, args->a_vp); + + lck_mtx_lock(&null_mp->nullm_lock); + if (nullfs_isspecialvp(args->a_vp)) { + error = nullfs_special_getattr(args); + lck_mtx_unlock(&null_mp->nullm_lock); + return error; + } + lck_mtx_unlock(&null_mp->nullm_lock); + + /* this will return 
a different inode for third than read dir will */ + struct vnode * lowervp = NULLVPTOLOWERVP(args->a_vp); + + error = vnode_getwithref(lowervp); + if (error == 0) { + error = VNOP_GETATTR(lowervp, args->a_vap, args->a_context); + vnode_put(lowervp); + + if (error == 0) { + /* fix up fsid so it doesn't say the underlying fs*/ + VATTR_RETURN(args->a_vap, va_fsid, vfs_statfs(vnode_mount(args->a_vp))->f_fsid.val[0]); + } + } + + return error; +} + +static int +nullfs_open(struct vnop_open_args * args) +{ + int error; + struct vnode *vp, *lvp; + + NULLFSDEBUG("%s %p\n", __FUNCTION__, args->a_vp); + + if (nullfs_checkspecialvp(args->a_vp)) { + return 0; /* nothing extra needed */ + } + + vp = args->a_vp; + lvp = NULLVPTOLOWERVP(vp); + error = vnode_getwithref(lvp); + if (error == 0) { + error = VNOP_OPEN(lvp, args->a_mode, args->a_context); + vnode_put(lvp); + } + + return error; +} + +static int +nullfs_close(struct vnop_close_args * args) +{ + int error; + struct vnode *vp, *lvp; + + NULLFSDEBUG("%s %p\n", __FUNCTION__, args->a_vp); + + if (nullfs_checkspecialvp(args->a_vp)) { + return 0; /* nothing extra needed */ + } + + vp = args->a_vp; + lvp = NULLVPTOLOWERVP(vp); + + error = vnode_getwithref(lvp); + if (error == 0) { + error = VNOP_CLOSE(lvp, args->a_fflag, args->a_context); + vnode_put(lvp); + } + return error; +} + +/* get lvp's parent, if possible, even if it isn't set. + + lvp is expected to have an iocount before and after this call. + + if a dvpp is populated the returned vnode has an iocount. 
*/ +static int +null_get_lowerparent(vnode_t lvp, vnode_t * dvpp, vfs_context_t ctx) +{ + int error = 0; + struct vnode_attr va; + mount_t mp = vnode_mount(lvp); + vnode_t dvp = vnode_parent(lvp); + + if (dvp) { + error = vnode_get(dvp); + goto end; + } + + error = ENOENT; + if (!(mp->mnt_kern_flag & MNTK_PATH_FROM_ID)) { + goto end; + } + + VATTR_INIT(&va); + VATTR_WANTED(&va, va_parentid); + + error = vnode_getattr(lvp, &va, ctx); + + if (error || !VATTR_IS_SUPPORTED(&va, va_parentid)) { + goto end; + } + + error = VFS_VGET(mp, (ino64_t)va.va_parentid, &dvp, ctx); + +end: + if (error == 0) { + *dvpp = dvp; + } + return error; +} + +/* the mountpoint lock should be held going into this function */ +static int +null_special_lookup(struct vnop_lookup_args * ap) +{ + struct componentname * cnp = ap->a_cnp; + struct vnode * dvp = ap->a_dvp; + struct vnode * ldvp = NULL; + struct vnode * lvp = NULL; + struct vnode * vp = NULL; + struct mount * mp = vnode_mount(dvp); + struct null_mount * null_mp = MOUNTTONULLMOUNT(mp); + int error = ENOENT; + + if (dvp == null_mp->nullm_rootvp) { + /* handle . and .. */ + if (cnp->cn_nameptr[0] == '.') { + if (cnp->cn_namelen == 1 || (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.')) { + /* this is the root so both . and .. give back the root */ + vp = dvp; + error = vnode_get(vp); + goto end; + } + } + + /* our virtual wrapper directory should be d but D is acceptable if the + * lower file system is case insensitive */ + if (cnp->cn_namelen == 1 && + (cnp->cn_nameptr[0] == 'd' || (null_mp->nullm_flags & NULLM_CASEINSENSITIVE ? cnp->cn_nameptr[0] == 'D' : 0))) { + error = 0; + if (null_mp->nullm_secondvp == NULL) { + error = null_getnewvnode(mp, NULL, dvp, &vp, cnp, 0); + if (error) { + goto end; + } + + null_mp->nullm_secondvp = vp; + } else { + vp = null_mp->nullm_secondvp; + error = vnode_get(vp); + } + } + + } else if (dvp == null_mp->nullm_secondvp) { + /* handle . and .. 
*/ + if (cnp->cn_nameptr[0] == '.') { + if (cnp->cn_namelen == 1) { + vp = dvp; + error = vnode_get(vp); + goto end; + } else if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { + /* parent here is the root vp */ + vp = null_mp->nullm_rootvp; + error = vnode_get(vp); + goto end; + } + } + /* nullmp->nullm_lowerrootvp was set at mount time so don't need to lock to + * access it */ + /* v_name should be null terminated but cn_nameptr is not necessarily. + cn_namelen is the number of characters before the null in either case */ + error = vnode_getwithvid(null_mp->nullm_lowerrootvp, null_mp->nullm_lowerrootvid); + if (error) { + goto end; + } + + /* We don't want to mess with case insensitivity and unicode, so the plan to + check here is + 1. try to get the lower root's parent + 2. If we get a parent, then perform a lookup on the lower file system + using the parent and the passed in cnp + 3. If that worked and we got a vp, then see if the vp is lowerrootvp. If + so we got a match + 4. Anything else results in ENOENT. + */ + error = null_get_lowerparent(null_mp->nullm_lowerrootvp, &ldvp, ap->a_context); + + if (error == 0) { + error = VNOP_LOOKUP(ldvp, &lvp, cnp, ap->a_context); + vnode_put(ldvp); + + if (error == 0) { + if (lvp == null_mp->nullm_lowerrootvp) { + /* always check the hashmap for a vnode for this, the root of the + * mirrored system */ + error = null_nodeget(mp, lvp, dvp, &vp, cnp, 0); + + if (error == 0 && null_mp->nullm_thirdcovervp == NULL) { + /* if nodeget succeeded then vp has an iocount*/ + null_mp->nullm_thirdcovervp = vp; + } + } else { + error = ENOENT; + } + vnode_put(lvp); + } + } + vnode_put(null_mp->nullm_lowerrootvp); + } + +end: + if (error == 0) { + *ap->a_vpp = vp; + } + return error; +} + +/* + * We have to carry on the locking protocol on the null layer vnodes + * as we progress through the tree. We also have to enforce read-only + * if this layer is mounted read-only. 
+ */ +/* + * Lookup entry point. Synthetic directories are delegated to + * null_special_lookup under the mount lock, "." and ".." are answered + * locally, and any other name is looked up in the lower directory and the + * result is wrapped through null_nodeget. + */ static int +null_lookup(struct vnop_lookup_args * ap) +{ + struct componentname * cnp = ap->a_cnp; + struct vnode * dvp = ap->a_dvp; + struct vnode *vp, *ldvp, *lvp; + struct mount * mp; + struct null_mount * null_mp; + int error; + + NULLFSDEBUG("%s parent: %p component: %.*s\n", __FUNCTION__, ap->a_dvp, cnp->cn_namelen, cnp->cn_nameptr); + + mp = vnode_mount(dvp); + /* create, rename and delete are not allowed. this is a read only file system */ + if (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME || cnp->cn_nameiop == CREATE) { + return (EROFS); + } + null_mp = MOUNTTONULLMOUNT(mp); + + lck_mtx_lock(&null_mp->nullm_lock); /* null_special_lookup expects the mount lock to be held */ + if (nullfs_isspecialvp(dvp)) { + error = null_special_lookup(ap); + lck_mtx_unlock(&null_mp->nullm_lock); + return error; + } + lck_mtx_unlock(&null_mp->nullm_lock); + + // . and .. handling + if (cnp->cn_nameptr[0] == '.') { + if (cnp->cn_namelen == 1) { + vp = dvp; + } else if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { + /* mount point crossing is handled in null_special_lookup */ + vp = vnode_parent(dvp); + } else { + goto notdot; /* other names that merely start with '.' are ordinary lookups */ + } + + error = vp ? vnode_get(vp) : ENOENT; /* vnode_parent() may have returned NULL */ + + if (error == 0) { + *ap->a_vpp = vp; + } + + return error; + } + +notdot: + ldvp = NULLVPTOLOWERVP(dvp); + vp = lvp = NULL; + + /* + * Hold ldvp. The reference on it, owned by dvp, is lost in + * case of dvp reclamation. 
+ */ + error = vnode_getwithref(ldvp); + if (error) { + return error; + } + + error = VNOP_LOOKUP(ldvp, &lvp, cnp, ap->a_context); + + vnode_put(ldvp); + + if ((error == 0 || error == EJUSTRETURN) && lvp != NULL) { + if (ldvp == lvp) { + vp = dvp; /* the lower lookup returned the directory itself, so answer with dvp */ + error = vnode_get(vp); + } else { + error = null_nodeget(mp, lvp, dvp, &vp, cnp, 0); /* get the nullfs vnode that shadows lvp */ + } + if (error == 0) { + *ap->a_vpp = vp; + } + } + + /* if we got lvp, drop the iocount from VNOP_LOOKUP */ + if (lvp != NULL) { + vnode_put(lvp); + } + + return (error); +} + +/* + * Nothing to tear down when a nullfs vnode goes inactive. ap is only + * referenced by NULLFSDEBUG, which compiles away unless NULLFS_DEBUG is + * defined, hence the __unused. + */ +static int +null_inactive(__unused struct vnop_inactive_args * ap) +{ + NULLFSDEBUG("%s %p\n", __FUNCTION__, ap->a_vp); + + return (0); +} + +/* + * Reclaim a nullfs vnode. Under the mount lock: drop the fs reference, + * unhash the node and release its lower vnode (if it has one), and clear + * any mount-level pointer to vp. Then purge the name cache, detach the + * null_node from the vnode and free it. + */ static int +null_reclaim(struct vnop_reclaim_args * ap) +{ + struct vnode * vp; + struct null_node * xp; + struct vnode * lowervp; + struct null_mount * null_mp = MOUNTTONULLMOUNT(vnode_mount(ap->a_vp)); + + NULLFSDEBUG("%s %p\n", __FUNCTION__, ap->a_vp); + + vp = ap->a_vp; + + xp = VTONULL(vp); + lowervp = xp->null_lowervp; + + lck_mtx_lock(&null_mp->nullm_lock); + + vnode_removefsref(vp); + + if (lowervp != NULL) { + /* root and second don't have a lowervp, so nothing to release and nothing + * got hashed */ + if (xp->null_flags & NULL_FLAG_HASHED) { + /* only call this if we actually made it into the hash list. 
reclaim gets + called also to + clean up a vnode that got created when it didn't need to under race + conditions */ + null_hashrem(xp); + } + vnode_getwithref(lowervp); /* NOTE(review): return value is ignored; the rele/put below assume it succeeded */ + vnode_rele(lowervp); /* drop the reference this node held on lowervp */ + vnode_put(lowervp); + } + + if (vp == null_mp->nullm_rootvp) { /* forget any mount-level pointer to the vnode being reclaimed */ + null_mp->nullm_rootvp = NULL; + } else if (vp == null_mp->nullm_secondvp) { + null_mp->nullm_secondvp = NULL; + } else if (vp == null_mp->nullm_thirdcovervp) { + null_mp->nullm_thirdcovervp = NULL; + } + + lck_mtx_unlock(&null_mp->nullm_lock); + + cache_purge(vp); + vnode_clearfsnode(vp); + + FREE(xp, M_TEMP); + + return 0; +} + +#define DIRENT_SZ(dp) ((sizeof(struct dirent) - NAME_MAX) + (((dp)->d_namlen + 1 + 3) & ~3)) /* dirent size without the full-size name buffer, plus d_namlen + NUL rounded up to a multiple of 4 */ + +/* + * Copy one DT_DIR directory entry {ino, name} out through uio. + * Returns EINVAL when name (plus NUL) does not fit in NAME_MAX, EMSGSIZE + * when uio has no room left for the record, otherwise the result of + * uiomove(). + */ static int +store_entry_special(ino_t ino, const char * name, struct uio * uio) +{ + struct dirent e; + size_t namelen = strlen(name); + int error = EINVAL; + + if (namelen + 1 <= NAME_MAX) { + memset(&e, 0, sizeof(e)); + + e.d_ino = ino; + e.d_type = DT_DIR; + + e.d_namlen = namelen; /* don't include NUL */ + e.d_reclen = DIRENT_SZ(&e); + if (uio_resid(uio) >= e.d_reclen) { + strlcpy(e.d_name, name, NAME_MAX); + error = uiomove((caddr_t)&e, e.d_reclen, uio); + } else { + error = EMSGSIZE; + } + } + return error; +} + +/* + * Synthesize the listing of the two virtual directories. The uio offset is + * used as an entry index: 0 is ".", 1 is "..", 2 is the single child ("d" + * under the root, the lower root's name under the second directory). + * Running out of buffer space (EMSGSIZE) is reported as success with the + * offset and item count reflecting only the entries actually stored. + * NOTE(review): for offset >= 3 none of the cases run and the initial ERANGE + * is returned - confirm that this is the intended end-of-directory result. + */ static int +nullfs_special_readdir(struct vnop_readdir_args * ap) +{ + struct vnode * vp = ap->a_vp; + struct uio * uio = ap->a_uio; + struct null_mount * null_mp = MOUNTTONULLMOUNT(vnode_mount(vp)); + off_t offset = uio_offset(uio); + int error = ERANGE; + int items = 0; + ino_t ino = 0; + const char * name = NULL; + + if (ap->a_flags & (VNODE_READDIR_EXTENDED | VNODE_READDIR_REQSEEKOFF)) + return (EINVAL); + + if (offset == 0) { + /* . case */ + if (vp == null_mp->nullm_rootvp) { + ino = NULL_ROOT_INO; + } else /* only get here if vp matches nullm_rootvp or nullm_secondvp */ + { + ino = NULL_SECOND_INO; + } + error = store_entry_special(ino, ".", uio); + if (error) { + goto out; + } + offset++; + items++; + } + if (offset == 1) { + /* .. 
case */ + /* only get here if vp matches nullm_rootvp or nullm_secondvp, so the parent is always reported as the root */ + ino = NULL_ROOT_INO; + + error = store_entry_special(ino, "..", uio); + if (error) { + goto out; + } + offset++; + items++; + } + if (offset == 2) { + /* the directory case */ + if (vp == null_mp->nullm_rootvp) { + ino = NULL_SECOND_INO; + name = "d"; + } else /* only get here if vp matches nullm_rootvp or nullm_secondvp */ + { + ino = NULL_THIRD_INO; + if (vnode_getwithvid(null_mp->nullm_lowerrootvp, null_mp->nullm_lowerrootvid)) { + /* In this case the lower file system has been ripped out from under us, + but we don't want to error out + Instead we just want d to look empty. */ + error = 0; + goto out; + } + name = vnode_getname_printable(null_mp->nullm_lowerrootvp); + } + error = store_entry_special(ino, name, uio); + + if (ino == NULL_THIRD_INO) { /* only this case took a name and a vnode_getwithvid hold above */ + vnode_putname_printable(name); + vnode_put(null_mp->nullm_lowerrootvp); + } + + if (error) { + goto out; + } + offset++; + items++; + } + +out: + if (error == EMSGSIZE) { + error = 0; /* return success if we ran out of space, but we wanted to make + sure that we didn't update offset and items incorrectly */ + } + uio_setoffset(uio, offset); + if (ap->a_numdirent) { + *ap->a_numdirent = items; + } + return error; +} + +/* + * readdir entry point: the special vnodes are listed by + * nullfs_special_readdir under the mount lock, everything else is + * forwarded to the lower vnode. + */ static int +nullfs_readdir(struct vnop_readdir_args * ap) +{ + struct vnode *vp, *lvp; + int error; + struct null_mount * null_mp = MOUNTTONULLMOUNT(vnode_mount(ap->a_vp)); + + NULLFSDEBUG("%s %p\n", __FUNCTION__, ap->a_vp); + /* assumption is that any vp that comes through here had to go through lookup + */ + + lck_mtx_lock(&null_mp->nullm_lock); + if (nullfs_isspecialvp(ap->a_vp)) { + error = nullfs_special_readdir(ap); + lck_mtx_unlock(&null_mp->nullm_lock); + return error; + } + lck_mtx_unlock(&null_mp->nullm_lock); + + vp = ap->a_vp; + lvp = NULLVPTOLOWERVP(vp); + error = vnode_getwithref(lvp); + if (error == 0) { + error = VNOP_READDIR(lvp, ap->a_uio, ap->a_flags, ap->a_eofflag, ap->a_numdirent, ap->a_context); + 
vnode_put(lvp); + } + + return error; +} + +static int +nullfs_readlink(struct vnop_readlink_args * ap) +{ + NULLFSDEBUG("%s %p\n", __FUNCTION__, ap->a_vp); + int error; + struct vnode *vp, *lvp; + + if (nullfs_checkspecialvp(ap->a_vp)) { + return ENOTSUP; /* the special vnodes aren't links */ + } + + vp = ap->a_vp; + lvp = NULLVPTOLOWERVP(vp); + + error = vnode_getwithref(lvp); + if (error == 0) { + error = VNOP_READLINK(lvp, ap->a_uio, ap->a_context); + vnode_put(lvp); + + if (error) { + NULLFSDEBUG("readlink failed: %d\n", error); + } + } + + return error; +} + +static int +nullfs_pathconf(__unused struct vnop_pathconf_args * args) +{ + NULLFSDEBUG("%s %p\n", __FUNCTION__, args->a_vp); + return EINVAL; +} + +static int +nullfs_fsync(__unused struct vnop_fsync_args * args) +{ + NULLFSDEBUG("%s %p\n", __FUNCTION__, args->a_vp); + return 0; +} + +static int +nullfs_mmap(struct vnop_mmap_args * args) +{ + int error; + struct vnode *vp, *lvp; + + NULLFSDEBUG("%s %p\n", __FUNCTION__, args->a_vp); + + if (nullfs_checkspecialvp(args->a_vp)) { + return 0; /* nothing extra needed */ + } + + vp = args->a_vp; + lvp = NULLVPTOLOWERVP(vp); + error = vnode_getwithref(lvp); + if (error == 0) { + error = VNOP_MMAP(lvp, args->a_fflags, args->a_context); + vnode_put(lvp); + } + + return error; +} + +static int +nullfs_mnomap(struct vnop_mnomap_args * args) +{ + int error; + struct vnode *vp, *lvp; + + NULLFSDEBUG("%s %p\n", __FUNCTION__, args->a_vp); + + if (nullfs_checkspecialvp(args->a_vp)) { + return 0; /* nothing extra needed */ + } + + vp = args->a_vp; + lvp = NULLVPTOLOWERVP(vp); + error = vnode_getwithref(lvp); + if (error == 0) { + error = VNOP_MNOMAP(lvp, args->a_context); + vnode_put(lvp); + } + + return error; +} + +static int +nullfs_getxattr(struct vnop_getxattr_args * args) +{ + int error; + struct vnode *vp, *lvp; + + NULLFSDEBUG("%s %p\n", __FUNCTION__, args->a_vp); + + if (nullfs_checkspecialvp(args->a_vp)) { + return 0; /* nothing extra needed */ + } + + vp = 
args->a_vp; + lvp = NULLVPTOLOWERVP(vp); + error = vnode_getwithref(lvp); + if (error == 0) { + error = VNOP_GETXATTR(lvp, args->a_name, args->a_uio, args->a_size, args->a_options, args->a_context); + vnode_put(lvp); + } + + return error; +} + +static int +nullfs_listxattr(struct vnop_listxattr_args * args) +{ + int error; + struct vnode *vp, *lvp; + + NULLFSDEBUG("%s %p\n", __FUNCTION__, args->a_vp); + + if (nullfs_checkspecialvp(args->a_vp)) { + return 0; /* nothing extra needed */ + } + + vp = args->a_vp; + lvp = NULLVPTOLOWERVP(vp); + error = vnode_getwithref(lvp); + if (error == 0) { + error = VNOP_LISTXATTR(lvp, args->a_uio, args->a_size, args->a_options, args->a_context); + vnode_put(lvp); + } + + return error; +} + +/* relies on v1 paging */ +static int +nullfs_pagein(struct vnop_pagein_args * ap) +{ + int error = EIO; + struct vnode *vp, *lvp; + + NULLFSDEBUG("%s %p\n", __FUNCTION__, ap->a_vp); + + vp = ap->a_vp; + lvp = NULLVPTOLOWERVP(vp); + + if (vnode_vtype(vp) != VREG) { + return ENOTSUP; + } + + /* + * Ask VM/UBC/VFS to do our bidding + */ + if (vnode_getwithvid(lvp, NULLVPTOLOWERVID(vp)) == 0) { + vm_offset_t ioaddr; + uio_t auio; + kern_return_t kret; + off_t bytes_to_commit; + off_t lowersize; + upl_t upl = ap->a_pl; + user_ssize_t bytes_remaining = 0; + + auio = uio_create(1, ap->a_f_offset, UIO_SYSSPACE, UIO_READ); + if (auio == NULL) { + error = EIO; + goto exit_no_unmap; + } + + kret = ubc_upl_map(upl, &ioaddr); + if (KERN_SUCCESS != kret) { + panic("nullfs_pagein: ubc_upl_map() failed with (%d)", kret); + } + + ioaddr += ap->a_pl_offset; + + error = uio_addiov(auio, (user_addr_t)ioaddr, ap->a_size); + if (error) { + goto exit; + } + + lowersize = ubc_getsize(lvp); + if (lowersize != ubc_getsize(vp)) { + (void)ubc_setsize(vp, lowersize); /* ignore failures, nothing can be done */ + } + + error = VNOP_READ(lvp, auio, ((ap->a_flags & UPL_IOSYNC) ? 
IO_SYNC : 0), ap->a_context); + + bytes_remaining = uio_resid(auio); + if (bytes_remaining > 0 && bytes_remaining <= (user_ssize_t)ap->a_size) + { + /* zero bytes that weren't read in to the upl */ + bzero((void*)((uintptr_t)(ioaddr + ap->a_size - bytes_remaining)), (size_t) bytes_remaining); + } + + exit: + kret = ubc_upl_unmap(upl); + if (KERN_SUCCESS != kret) { + panic("nullfs_pagein: ubc_upl_unmap() failed with (%d)", kret); + } + + if (auio != NULL) { + uio_free(auio); + } + + exit_no_unmap: + if ((ap->a_flags & UPL_NOCOMMIT) == 0) { /* we own the commit/abort of the pages unless the caller asked for UPL_NOCOMMIT */ + if (!error && (bytes_remaining >= 0) && (bytes_remaining <= (user_ssize_t)ap->a_size)) { + /* only commit what was read in (page aligned) */ + bytes_to_commit = ap->a_size - bytes_remaining; + if (bytes_to_commit) + { + /* need to make sure bytes_to_commit and bytes_remaining are page aligned before calling ubc_upl_commit_range */ + if (bytes_to_commit & PAGE_MASK) + { + bytes_to_commit = (bytes_to_commit & (~PAGE_MASK)) + (PAGE_MASK + 1); /* round up to the next page boundary */ + assert(bytes_to_commit <= (off_t)ap->a_size); + + bytes_remaining = ap->a_size - bytes_to_commit; + } + ubc_upl_commit_range(upl, ap->a_pl_offset, (upl_size_t)bytes_to_commit, UPL_COMMIT_FREE_ON_EMPTY); + } + + /* abort anything that's left */ + if (bytes_remaining) { + ubc_upl_abort_range(upl, ap->a_pl_offset + bytes_to_commit, (upl_size_t)bytes_remaining, UPL_ABORT_ERROR | UPL_ABORT_FREE_ON_EMPTY); + } + } else { + ubc_upl_abort_range(upl, ap->a_pl_offset, (upl_size_t)ap->a_size, UPL_ABORT_ERROR | UPL_ABORT_FREE_ON_EMPTY); + } + } + vnode_put(lvp); + } else if((ap->a_flags & UPL_NOCOMMIT) == 0) { /* could not get the lower vnode: abort the whole range, error stays EIO */ + ubc_upl_abort_range(ap->a_pl, ap->a_pl_offset, (upl_size_t)ap->a_size, UPL_ABORT_ERROR | UPL_ABORT_FREE_ON_EMPTY); + } + return error; +} + +/* + * Forward read to the lower vnode. The special vnodes are rejected with + * ENOTSUP, a lower vnode that is neither a regular file nor a symlink with + * EPERM, and EIO is returned when the lower vnode cannot be obtained. + */ static int +nullfs_read(struct vnop_read_args * ap) +{ + int error = EIO; + + struct vnode *vp, *lvp; + + NULLFSDEBUG("%s %p\n", __FUNCTION__, ap->a_vp); + + if (nullfs_checkspecialvp(ap->a_vp)) { + return ENOTSUP; /* the special vnodes can't be read */ + } + + 
vp = ap->a_vp; + lvp = NULLVPTOLOWERVP(vp); + + /* + * First some house keeping + */ + if (vnode_getwithvid(lvp, NULLVPTOLOWERVID(vp)) == 0) { + if (!vnode_isreg(lvp) && !vnode_islnk(lvp)) { + error = EPERM; + goto end; + } + + if (uio_resid(ap->a_uio) == 0) { /* nothing requested: succeed without calling down */ + error = 0; + goto end; + } + + /* + * Now ask VM/UBC/VFS to do our bidding + */ + + error = VNOP_READ(lvp, ap->a_uio, ap->a_ioflag, ap->a_context); + if (error) { + NULLFSDEBUG("VNOP_READ failed: %d\n", error); + } + end: + vnode_put(lvp); + } + return error; +} + +/* + * Global vfs data structures: the vnode operation table for nullfs. + * vnop_default_desc is paired with nullfs_default; the list is terminated + * by the {NULL, NULL} entry. + */ + +static struct vnodeopv_entry_desc nullfs_vnodeop_entries[] = { + {&vnop_default_desc, (vop_t)nullfs_default}, {&vnop_getattr_desc, (vop_t)nullfs_getattr}, + {&vnop_open_desc, (vop_t)nullfs_open}, {&vnop_close_desc, (vop_t)nullfs_close}, + {&vnop_inactive_desc, (vop_t)null_inactive}, {&vnop_reclaim_desc, (vop_t)null_reclaim}, + {&vnop_lookup_desc, (vop_t)null_lookup}, {&vnop_readdir_desc, (vop_t)nullfs_readdir}, + {&vnop_readlink_desc, (vop_t)nullfs_readlink}, {&vnop_pathconf_desc, (vop_t)nullfs_pathconf}, + {&vnop_fsync_desc, (vop_t)nullfs_fsync}, {&vnop_mmap_desc, (vop_t)nullfs_mmap}, + {&vnop_mnomap_desc, (vop_t)nullfs_mnomap}, {&vnop_getxattr_desc, (vop_t)nullfs_getxattr}, + {&vnop_pagein_desc, (vop_t)nullfs_pagein}, {&vnop_read_desc, (vop_t)nullfs_read}, + {&vnop_listxattr_desc, (vop_t)nullfs_listxattr}, {NULL, NULL}, +}; + +struct vnodeopv_desc nullfs_vnodeop_opv_desc = {&nullfs_vnodeop_p, nullfs_vnodeop_entries}; diff --git a/bsd/miscfs/nullfs/nullfs.h b/bsd/miscfs/nullfs/nullfs.h new file mode 100644 index 000000000..5d55e2c8b --- /dev/null +++ b/bsd/miscfs/nullfs/nullfs.h @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2016 Apple Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). 
You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +/*- + * Portions Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software donated to Berkeley by + * Jan-Simon Pendry. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)null.h 8.3 (Berkeley) 8/20/94 + * + * $FreeBSD$ + */ + +#ifndef FS_NULL_H +#define FS_NULL_H + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#if KERNEL +#include +#else +#include +#endif + +//#define NULLFS_DEBUG 0 + +#define NULLM_CACHE 0x0001 +#define NULLM_CASEINSENSITIVE 0x0000000000000002 + +typedef int (*vop_t)(void *); + +struct null_mount { + struct vnode * nullm_rootvp; /* Reference to root null_node (inode 1) */ + struct vnode * nullm_secondvp; /* Reference to virtual directory vnode to wrap app + bundles (inode 2) */ + struct vnode * nullm_thirdcovervp; /* Reference to vnode that covers + lowerrootvp (inode 3) */ + struct vnode * nullm_lowerrootvp; /* reference to the root of the tree we are + relocating (in the other file system) */ + uint32_t nullm_lowerrootvid; /* store the lower root vid so we can check + before we build the shadow vnode lazily*/ + lck_mtx_t nullm_lock; /* lock to protect vps above */ + uint64_t nullm_flags; +}; + +#ifdef KERNEL + +#define NULL_FLAG_HASHED 0x000000001 + +/* + * A cache of vnode references + */ +struct null_node { + LIST_ENTRY(null_node) null_hash; /* Hash list */ + struct vnode * null_lowervp; /* VREFed once */ + struct vnode * null_vnode; /* Back pointer */ + uint32_t null_lowervid; /* vid for lowervp to detect lowervp getting recycled out + from under us */ + uint32_t null_myvid; + uint32_t null_flags; +}; + +struct 
vnodeop_desc_fake { + int vdesc_offset; + const char * vdesc_name; + /* other stuff */ +}; + +#define NULLV_NOUNLOCK 0x0001 +#define NULLV_DROP 0x0002 + +#define MOUNTTONULLMOUNT(mp) ((struct null_mount *)(vfs_fsprivate(mp))) +#define VTONULL(vp) ((struct null_node *)vnode_fsnode(vp)) +#define NULLTOV(xp) ((xp)->null_vnode) + +__BEGIN_DECLS + +int nullfs_init(struct vfsconf * vfsp); +int nullfs_init_lck(lck_mtx_t * lck); +int nullfs_destroy_lck(lck_mtx_t * lck); +int nullfs_uninit(void); +int null_nodeget( + struct mount * mp, struct vnode * lowervp, struct vnode * dvp, struct vnode ** vpp, struct componentname * cnp, int root); +int null_hashget(struct mount * mp, struct vnode * lowervp, struct vnode ** vpp); +int null_getnewvnode( + struct mount * mp, struct vnode * lowervp, struct vnode * dvp, struct vnode ** vpp, struct componentname * cnp, int root); +void null_hashrem(struct null_node * xp); + +#define NULLVPTOLOWERVP(vp) (VTONULL(vp)->null_lowervp) +#define NULLVPTOLOWERVID(vp) (VTONULL(vp)->null_lowervid) +#define NULLVPTOMYVID(vp) (VTONULL(vp)->null_myvid) + +extern struct vnodeopv_desc nullfs_vnodeop_opv_desc; + +extern vop_t * nullfs_vnodeop_p; + +// int nullfs_install_filesys(void); +// int nullfs_uninstall_filesys(void); + +__END_DECLS + +#ifdef NULLFS_DEBUG +#define NULLFSDEBUG(format, args...) printf(format, ##args) +#else +#define NULLFSDEBUG(format, args...) 
+#endif /* NULLFS_DEBUG */ + +#endif /* KERNEL */ + +#endif \ No newline at end of file diff --git a/bsd/miscfs/routefs/Makefile b/bsd/miscfs/routefs/Makefile index 1076c57a3..1a4f5095c 100644 --- a/bsd/miscfs/routefs/Makefile +++ b/bsd/miscfs/routefs/Makefile @@ -3,11 +3,10 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - include $(MakeInc_cmd) include $(MakeInc_def) -DATAFILES = +DATAFILES = KERNELFILES = \ routefs.h @@ -18,13 +17,11 @@ INSTALL_MI_DIR = miscfs/routefs INSTALL_KF_MI_LIST = ${DATAFILES} -INSTALL_KF_MI_LCL_LIST = +INSTALL_KF_MI_LCL_LIST = -EXPORT_MI_LIST = +EXPORT_MI_LIST = EXPORT_MI_DIR = miscfs/routefs include $(MakeInc_rule) include $(MakeInc_dir) - - diff --git a/bsd/miscfs/specfs/Makefile b/bsd/miscfs/specfs/Makefile index 109c5fc29..0956ae01b 100644 --- a/bsd/miscfs/specfs/Makefile +++ b/bsd/miscfs/specfs/Makefile @@ -3,7 +3,6 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - include $(MakeInc_cmd) include $(MakeInc_def) @@ -23,5 +22,3 @@ EXPORT_MI_DIR = miscfs/specfs include $(MakeInc_rule) include $(MakeInc_dir) - - diff --git a/bsd/miscfs/specfs/spec_vnops.c b/bsd/miscfs/specfs/spec_vnops.c index fd79c99fa..adddc10d8 100644 --- a/bsd/miscfs/specfs/spec_vnops.c +++ b/bsd/miscfs/specfs/spec_vnops.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2012 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Computer, Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -81,10 +81,16 @@ #include #include #include +#include #include #include + #include #include +#include +#include +#include + #include #include @@ -160,8 +166,8 @@ static void set_blocksize(vnode_t, dev_t); #define LOWPRI_TIER2_WINDOW_MSECS 100 #define LOWPRI_TIER3_WINDOW_MSECS 500 -#define LOWPRI_TIER1_IO_PERIOD_MSECS 15 -#define LOWPRI_TIER2_IO_PERIOD_MSECS 50 +#define LOWPRI_TIER1_IO_PERIOD_MSECS 40 +#define LOWPRI_TIER2_IO_PERIOD_MSECS 85 #define LOWPRI_TIER3_IO_PERIOD_MSECS 200 #define LOWPRI_TIER1_IO_PERIOD_SSD_MSECS 5 @@ -198,10 +204,11 @@ struct _throttle_io_info_t { struct timeval throttle_last_write_timestamp; struct timeval throttle_min_timer_deadline; - struct timeval throttle_window_start_timestamp[THROTTLE_LEVEL_END + 1]; + struct timeval throttle_window_start_timestamp[THROTTLE_LEVEL_END + 1]; /* window starts at both the beginning and completion of an I/O */ struct timeval throttle_last_IO_timestamp[THROTTLE_LEVEL_END + 1]; pid_t throttle_last_IO_pid[THROTTLE_LEVEL_END + 1]; struct timeval throttle_start_IO_period_timestamp[THROTTLE_LEVEL_END + 1]; + int32_t throttle_inflight_count[THROTTLE_LEVEL_END + 1]; TAILQ_HEAD( , uthread) throttle_uthlist[THROTTLE_LEVEL_END + 1]; /* Lists of throttled uthreads */ int throttle_next_wake_level; @@ -227,8 +234,8 @@ struct _throttle_io_info_t _throttle_io_info[LOWPRI_MAX_NUM_DEV]; int lowpri_throttle_enabled = 1; - -static void throttle_info_update_internal(struct _throttle_io_info_t *info, uthread_t ut, int flags, boolean_t isssd); +static void throttle_info_end_io_internal(struct _throttle_io_info_t *info, int throttle_level); +static int throttle_info_update_internal(struct _throttle_io_info_t *info, uthread_t ut, int flags, boolean_t isssd, boolean_t inflight, struct bufattr *bap); static int throttle_get_thread_throttle_level(uthread_t ut); /* @@ -463,16 +470,22 @@ spec_read(struct vnop_read_args *ap) switch (vp->v_type) { case VCHR: + { + struct 
_throttle_io_info_t *throttle_info = NULL; + int thread_throttle_level; if (cdevsw[major(vp->v_rdev)].d_type == D_DISK && vp->v_un.vu_specinfo->si_throttleable) { - struct _throttle_io_info_t *throttle_info; - throttle_info = &_throttle_io_info[vp->v_un.vu_specinfo->si_devbsdunit]; - throttle_info_update_internal(throttle_info, NULL, 0, vp->v_un.vu_specinfo->si_isssd); + thread_throttle_level = throttle_info_update_internal(throttle_info, NULL, 0, vp->v_un.vu_specinfo->si_isssd, TRUE, NULL); } error = (*cdevsw[major(vp->v_rdev)].d_read) (vp->v_rdev, uio, ap->a_ioflag); + if (throttle_info) { + throttle_info_end_io_internal(throttle_info, thread_throttle_level); + } + return (error); + } case VBLK: if (uio->uio_offset < 0) @@ -555,19 +568,25 @@ spec_write(struct vnop_write_args *ap) switch (vp->v_type) { case VCHR: + { + struct _throttle_io_info_t *throttle_info = NULL; + int thread_throttle_level; if (cdevsw[major(vp->v_rdev)].d_type == D_DISK && vp->v_un.vu_specinfo->si_throttleable) { - struct _throttle_io_info_t *throttle_info; - throttle_info = &_throttle_io_info[vp->v_un.vu_specinfo->si_devbsdunit]; - throttle_info_update_internal(throttle_info, NULL, 0, vp->v_un.vu_specinfo->si_isssd); + thread_throttle_level = throttle_info_update_internal(throttle_info, NULL, 0, vp->v_un.vu_specinfo->si_isssd, TRUE, NULL); microuptime(&throttle_info->throttle_last_write_timestamp); } error = (*cdevsw[major(vp->v_rdev)].d_write) (vp->v_rdev, uio, ap->a_ioflag); + if (throttle_info) { + throttle_info_end_io_internal(throttle_info, thread_throttle_level); + } + return (error); + } case VBLK: if (uio_resid(uio) == 0) @@ -672,27 +691,9 @@ spec_ioctl(struct vnop_ioctl_args *ap) break; case VBLK: - if (kdebug_enable) { - if (ap->a_command == DKIOCUNMAP) { - dk_unmap_t *unmap; - dk_extent_t *extent; - uint32_t i; - - unmap = (dk_unmap_t *)ap->a_data; - extent = unmap->extents; - - for (i = 0; i < unmap->extentsCount; i++, extent++) { - KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_IOCTL, 1) 
| DBG_FUNC_NONE, dev, - extent->offset/ap->a_vp->v_specsize, extent->length, 0, 0); - } - } else if (ap->a_command == DKIOCSYNCHRONIZE) { - dk_synchronize_t *synch; - synch = (dk_synchronize_t *)ap->a_data; - KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_IOCTL, 1) | DBG_FUNC_NONE, dev, ap->a_command, - synch->options, 0, 0); - } - } retval = (*bdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data, ap->a_fflag, p); + if (!retval && ap->a_command == DKIOCSETBLOCKSIZE) + ap->a_vp->v_specsize = *(uint32_t *)ap->a_data; break; default: @@ -728,22 +729,32 @@ int spec_kqfilter(vnode_t vp, struct knote *kn) { dev_t dev; - int err; assert(vnode_ischr(vp)); dev = vnode_specrdev(vp); #if NETWORKING - /* Try a bpf device, as defined in bsd/net/bpf.c */ - if ((err = bpfkqfilter(dev, kn)) == 0) { - return err; + /* + * Try a bpf device, as defined in bsd/net/bpf.c + * If it doesn't error out the attach, then it + * claimed it. Otherwise, fall through and try + * a regular spec attach. + */ + int32_t tmp_flags = kn->kn_flags; + int64_t tmp_data = kn->kn_data; + int res; + + res = bpfkqfilter(dev, kn); + if ((kn->kn_flags & EV_ERROR) == 0) { + return res; } + kn->kn_flags = tmp_flags; + kn->kn_data = tmp_data; #endif - /* Try to attach to other char special devices */ - err = filt_specattach(kn); - return err; + /* Try to attach to other char special devices */ + return filt_specattach(kn); } /* @@ -802,9 +813,9 @@ SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier3_io_period_ssd_msecs, CTLFLAG_ SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_enabled, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_throttle_enabled, 0, ""); -static lck_grp_t *throttle_mtx_grp; -static lck_attr_t *throttle_mtx_attr; -static lck_grp_attr_t *throttle_mtx_grp_attr; +static lck_grp_t *throttle_lock_grp; +static lck_attr_t *throttle_lock_attr; +static lck_grp_attr_t *throttle_lock_grp_attr; /* @@ -854,7 +865,7 @@ throttle_info_rel(struct _throttle_io_info_t *info) if ((info->throttle_refcnt == 0) && 
(info->throttle_alloc)) { DEBUG_ALLOC_THROTTLE_INFO("Freeing info = %p\n", info); - lck_mtx_destroy(&info->throttle_lock, throttle_mtx_grp); + lck_mtx_destroy(&info->throttle_lock, throttle_lock_grp); FREE(info, M_TEMP); } return oldValue; @@ -930,7 +941,7 @@ throttle_timer_start(struct _throttle_io_info_t *info, boolean_t update_io_count if (!TAILQ_EMPTY(&info->throttle_uthlist[level])) { - if (elapsed_msecs < (uint64_t)throttle_windows_msecs[level]) { + if (elapsed_msecs < (uint64_t)throttle_windows_msecs[level] || info->throttle_inflight_count[level]) { /* * we had an I/O occur at a higher priority tier within * this tier's throttle window @@ -1098,6 +1109,7 @@ throttle_timer(struct _throttle_io_info_t *info) ut = (uthread_t)TAILQ_FIRST(&info->throttle_uthlist[wake_level]); TAILQ_REMOVE(&info->throttle_uthlist[wake_level], ut, uu_throttlelist); ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE; + ut->uu_is_throttled = FALSE; wake_address = (caddr_t)&ut->uu_on_throttlelist; } @@ -1115,6 +1127,7 @@ throttle_timer(struct _throttle_io_info_t *info) TAILQ_REMOVE(&info->throttle_uthlist[level], ut, uu_throttlelist); ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE; + ut->uu_is_throttled = FALSE; wakeup(&ut->uu_on_throttlelist); } @@ -1255,8 +1268,8 @@ throttle_init(void) /* * allocate lock group attribute and group */ - throttle_mtx_grp_attr = lck_grp_attr_alloc_init(); - throttle_mtx_grp = lck_grp_alloc_init("throttle I/O", throttle_mtx_grp_attr); + throttle_lock_grp_attr = lck_grp_attr_alloc_init(); + throttle_lock_grp = lck_grp_alloc_init("throttle I/O", throttle_lock_grp_attr); /* Update throttle parameters based on device tree configuration */ throttle_init_throttle_window(); @@ -1264,17 +1277,18 @@ throttle_init(void) /* * allocate the lock attribute */ - throttle_mtx_attr = lck_attr_alloc_init(); + throttle_lock_attr = lck_attr_alloc_init(); for (i = 0; i < LOWPRI_MAX_NUM_DEV; i++) { info = &_throttle_io_info[i]; - lck_mtx_init(&info->throttle_lock, throttle_mtx_grp, 
throttle_mtx_attr); + lck_mtx_init(&info->throttle_lock, throttle_lock_grp, throttle_lock_attr); info->throttle_timer_call = thread_call_allocate((thread_call_func_t)throttle_timer, (thread_call_param_t)info); for (level = 0; level <= THROTTLE_LEVEL_END; level++) { TAILQ_INIT(&info->throttle_uthlist[level]); info->throttle_last_IO_pid[level] = 0; + info->throttle_inflight_count[level] = 0; } info->throttle_next_wake_level = THROTTLE_LEVEL_END; info->throttle_disabled = 0; @@ -1301,55 +1315,65 @@ sys_override_io_throttle(int flag) lowpri_throttle_enabled = 0; } -int rethrottle_removed_from_list = 0; -int rethrottle_moved_to_new_list = 0; +int rethrottle_wakeups = 0; /* - * move a throttled thread to the appropriate state based - * on it's new throttle level... throttle_add_to_list will - * reset the timer deadline if necessary... it may also - * leave the thread off of the queue if we're already outside - * the throttle window for the new level - * takes a valid uthread (which may or may not be on the - * throttle queue) as input + * the uu_rethrottle_lock is used to synchronize this function + * with "throttle_lowpri_io" which is where a throttled thread + * will block... that function will grab this lock before beginning + * it's decision making process concerning the need to block, and + * hold it through the assert_wait. When that thread is awakened + * for any reason (timer or rethrottle), it will reacquire the + * uu_rethrottle_lock before determining if it really is ok for + * it to now run. This is the point at which the thread could + * enter a different throttling queue and reblock or return from + * the throttle w/o having waited out it's entire throttle if + * the rethrottle has now moved it out of any currently + * active throttle window. * - * NOTE: This is called with the task lock held. + * + * NOTES: + * 1 - This may be called with the task lock held. 
+ * 2 - This may be called with preemption and interrupts disabled + * in the kqueue wakeup path so we can't take the throttle_lock which is a mutex + * 3 - This cannot safely dereference uu_throttle_info, as it may + * get deallocated out from under us */ void rethrottle_thread(uthread_t ut) { - struct _throttle_io_info_t *info; - int my_new_level; - - if ((info = ut->uu_throttle_info) == NULL) + /* + * If uthread doesn't have throttle state, then there's no chance + * of it needing a rethrottle. + */ + if (ut->uu_throttle_info == NULL) return; - lck_mtx_lock(&info->throttle_lock); - - if (ut->uu_on_throttlelist >= THROTTLE_LEVEL_THROTTLED) { + boolean_t s = ml_set_interrupts_enabled(FALSE); + lck_spin_lock(&ut->uu_rethrottle_lock); - my_new_level = throttle_get_thread_throttle_level(ut); + if (ut->uu_is_throttled == FALSE) + ut->uu_was_rethrottled = TRUE; + else { + int my_new_level = throttle_get_thread_throttle_level(ut); if (my_new_level != ut->uu_on_throttlelist) { + /* + * ut is currently blocked (as indicated by + * ut->uu_is_throttled == TRUE) + * and we're changing it's throttle level, so + * we need to wake it up. 
+ */ + ut->uu_is_throttled = FALSE; + wakeup(&ut->uu_on_throttlelist); - TAILQ_REMOVE(&info->throttle_uthlist[ut->uu_on_throttlelist], ut, uu_throttlelist); - ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE; - - if (my_new_level >= THROTTLE_LEVEL_THROTTLED) { - throttle_add_to_list(info, ut, my_new_level, TRUE); - rethrottle_moved_to_new_list++; - } - - /* Thread no longer in window, need to wake it up */ - if (ut->uu_on_throttlelist == THROTTLE_LEVEL_NONE) { - wakeup(&ut->uu_on_throttlelist); - rethrottle_removed_from_list++; - } + rethrottle_wakeups++; + KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 102)), thread_tid(ut->uu_thread), ut->uu_on_throttlelist, my_new_level, 0, 0); } } - - lck_mtx_unlock(&info->throttle_lock); + lck_spin_unlock(&ut->uu_rethrottle_lock); + ml_set_interrupts_enabled(s); } @@ -1374,7 +1398,7 @@ throttle_info_create(void) DEBUG_ALLOC_THROTTLE_INFO("Creating info = %p\n", info, info ); info->throttle_alloc = TRUE; - lck_mtx_init(&info->throttle_lock, throttle_mtx_grp, throttle_mtx_attr); + lck_mtx_init(&info->throttle_lock, throttle_lock_grp, throttle_lock_attr); info->throttle_timer_call = thread_call_allocate((thread_call_func_t)throttle_timer, (thread_call_param_t)info); for (level = 0; level <= THROTTLE_LEVEL_END; level++) { @@ -1546,12 +1570,19 @@ throttle_get_thread_throttle_level(uthread_t ut) return (thread_throttle_level); } - +/* + * I/O will be throttled if either of the following are true: + * - Higher tiers have in-flight I/O + * - The time delta since the last start/completion of a higher tier is within the throttle window interval + * + * In-flight I/O is bookended by throttle_info_update_internal/throttle_info_end_io_internal + */ static int throttle_io_will_be_throttled_internal(void * throttle_info, int * mylevel, int * throttling_level) { struct _throttle_io_info_t *info = throttle_info; struct timeval elapsed; + struct timeval now; uint64_t elapsed_msecs; int thread_throttle_level; int throttle_level; @@ -1559,9 +1590,13 @@ 
throttle_io_will_be_throttled_internal(void * throttle_info, int * mylevel, int if ((thread_throttle_level = throttle_get_thread_throttle_level(NULL)) < THROTTLE_LEVEL_THROTTLED) return (THROTTLE_DISENGAGED); - for (throttle_level = THROTTLE_LEVEL_START; throttle_level < thread_throttle_level; throttle_level++) { + microuptime(&now); - microuptime(&elapsed); + for (throttle_level = THROTTLE_LEVEL_START; throttle_level < thread_throttle_level; throttle_level++) { + if (info->throttle_inflight_count[throttle_level]) { + break; + } + elapsed = now; timevalsub(&elapsed, &info->throttle_window_start_timestamp[throttle_level]); elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000); @@ -1664,6 +1699,7 @@ throttle_lowpri_io(int sleep_amount) int sleep_cnt = 0; uint32_t throttle_io_period_num = 0; boolean_t insert_tail = TRUE; + boolean_t s; ut = get_bsdthread_info(current_thread()); @@ -1677,8 +1713,8 @@ throttle_lowpri_io(int sleep_amount) ut->uu_lowpri_window = 0; return (0); } - lck_mtx_lock(&info->throttle_lock); + assert(ut->uu_on_throttlelist < THROTTLE_LEVEL_THROTTLED); if (sleep_amount == 0) goto done; @@ -1688,6 +1724,8 @@ throttle_lowpri_io(int sleep_amount) throttle_io_period_num = info->throttle_io_period_num; + ut->uu_was_rethrottled = FALSE; + while ( (throttle_type = throttle_io_will_be_throttled_internal(info, &mylevel, &throttling_level)) ) { if (throttle_type == THROTTLE_ENGAGED) { @@ -1698,21 +1736,71 @@ throttle_lowpri_io(int sleep_amount) if ((info->throttle_io_period_num - throttle_io_period_num) >= (uint32_t)sleep_amount) break; } + /* + * keep the same position in the list if "rethrottle_thread" changes our throttle level and + * then puts us back to the original level before we get a chance to run + */ + if (ut->uu_on_throttlelist >= THROTTLE_LEVEL_THROTTLED && ut->uu_on_throttlelist != mylevel) { + /* + * must have been awakened via "rethrottle_thread" (the timer pulls us off the list) + * and we've changed our 
throttling level, so pull ourselves off of the appropriate list + * and make sure we get put on the tail of the new list since we're starting anew w/r to + * the throttling engine + */ + TAILQ_REMOVE(&info->throttle_uthlist[ut->uu_on_throttlelist], ut, uu_throttlelist); + ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE; + insert_tail = TRUE; + } if (ut->uu_on_throttlelist < THROTTLE_LEVEL_THROTTLED) { if (throttle_add_to_list(info, ut, mylevel, insert_tail) == THROTTLE_LEVEL_END) goto done; } assert(throttling_level >= THROTTLE_LEVEL_START && throttling_level <= THROTTLE_LEVEL_END); + + s = ml_set_interrupts_enabled(FALSE); + lck_spin_lock(&ut->uu_rethrottle_lock); + + /* + * this is the critical section w/r to our interaction + * with "rethrottle_thread" + */ + if (ut->uu_was_rethrottled == TRUE) { + + lck_spin_unlock(&ut->uu_rethrottle_lock); + ml_set_interrupts_enabled(s); + lck_mtx_yield(&info->throttle_lock); + + KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 103)), thread_tid(ut->uu_thread), ut->uu_on_throttlelist, 0, 0, 0); + + ut->uu_was_rethrottled = FALSE; + continue; + } KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_THROTTLE, PROCESS_THROTTLED)) | DBG_FUNC_NONE, info->throttle_last_IO_pid[throttling_level], throttling_level, proc_selfpid(), mylevel, 0); - if (sleep_cnt == 0) { KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START, throttle_windows_msecs[mylevel], info->throttle_io_periods[mylevel], info->throttle_io_count, 0, 0); throttled_count[mylevel]++; } - msleep((caddr_t)&ut->uu_on_throttlelist, &info->throttle_lock, PRIBIO + 1, "throttle_lowpri_io", NULL); + ut->uu_wmesg = "throttle_lowpri_io"; + + assert_wait((caddr_t)&ut->uu_on_throttlelist, THREAD_UNINT); + + ut->uu_is_throttled = TRUE; + lck_spin_unlock(&ut->uu_rethrottle_lock); + ml_set_interrupts_enabled(s); + + lck_mtx_unlock(&info->throttle_lock); + + thread_block(THREAD_CONTINUE_NULL); + + ut->uu_wmesg = NULL; + + ut->uu_is_throttled = FALSE; + ut->uu_was_rethrottled = FALSE; + + 
lck_mtx_lock(&info->throttle_lock); sleep_cnt++; @@ -1729,7 +1817,6 @@ throttle_lowpri_io(int sleep_amount) TAILQ_REMOVE(&info->throttle_uthlist[ut->uu_on_throttlelist], ut, uu_throttlelist); ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE; } - lck_mtx_unlock(&info->throttle_lock); if (sleep_cnt) { @@ -1744,12 +1831,12 @@ throttle_lowpri_io(int sleep_amount) throttle_update_proc_stats(info->throttle_last_IO_pid[throttling_level], sleep_cnt); } - throttle_info_rel(info); - ut->uu_throttle_info = NULL; ut->uu_throttle_bc = FALSE; ut->uu_lowpri_window = 0; + throttle_info_rel(info); + return (sleep_cnt); } @@ -1763,12 +1850,9 @@ throttle_lowpri_io(int sleep_amount) */ void throttle_set_thread_io_policy(int policy) { - proc_set_task_policy(current_task(), current_thread(), - TASK_POLICY_INTERNAL, TASK_POLICY_IOPOL, - policy); + proc_set_thread_policy(current_thread(), TASK_POLICY_INTERNAL, TASK_POLICY_IOPOL, policy); } - void throttle_info_reset_window(uthread_t ut) { struct _throttle_io_info_t *info; @@ -1805,24 +1889,79 @@ void throttle_info_set_initial_window(uthread_t ut, struct _throttle_io_info_t * } } +/* + * Update inflight IO count and throttling window + * Should be called when an IO is done + * + * Only affects IO that was sent through spec_strategy + */ +void throttle_info_end_io(buf_t bp) { + mount_t mp; + struct bufattr *bap; + struct _throttle_io_info_t *info; + + bap = &bp->b_attr; + if (!ISSET(bap->ba_flags, BA_STRATEGY_TRACKED_IO)) { + return; + } + CLR(bap->ba_flags, BA_STRATEGY_TRACKED_IO); + + mp = buf_vnode(bp)->v_mount; + if (mp != NULL) { + info = &_throttle_io_info[mp->mnt_devbsdunit]; + } else { + info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1]; + } + + throttle_info_end_io_internal(info, GET_BUFATTR_IO_TIER(bap)); +} + +/* + * Decrement inflight count initially incremented by throttle_info_update_internal + */ +static +void throttle_info_end_io_internal(struct _throttle_io_info_t *info, int throttle_level) { + if (throttle_level == 
THROTTLE_LEVEL_NONE) { + return; + } + microuptime(&info->throttle_window_start_timestamp[throttle_level]); + OSDecrementAtomic(&info->throttle_inflight_count[throttle_level]); + assert(info->throttle_inflight_count[throttle_level] >= 0); +} + +/* + * If inflight is TRUE and bap is NULL then the caller is responsible for calling + * throttle_info_end_io_internal to avoid leaking in-flight I/O. + */ static -void throttle_info_update_internal(struct _throttle_io_info_t *info, uthread_t ut, int flags, boolean_t isssd) +int throttle_info_update_internal(struct _throttle_io_info_t *info, uthread_t ut, int flags, boolean_t isssd, boolean_t inflight, struct bufattr *bap) { int thread_throttle_level; if (lowpri_throttle_enabled == 0 || info->throttle_disabled) - return; + return THROTTLE_LEVEL_NONE; if (ut == NULL) ut = get_bsdthread_info(current_thread()); - thread_throttle_level = throttle_get_thread_throttle_level(ut); + if (bap && inflight && !ut->uu_throttle_bc) { + thread_throttle_level = GET_BUFATTR_IO_TIER(bap); + } else { + thread_throttle_level = throttle_get_thread_throttle_level(ut); + } if (thread_throttle_level != THROTTLE_LEVEL_NONE) { - if(!ISSET(flags, B_PASSIVE)) { - microuptime(&info->throttle_window_start_timestamp[thread_throttle_level]); + if(!ISSET(flags, B_PASSIVE)) { info->throttle_last_IO_pid[thread_throttle_level] = proc_selfpid(); + if (inflight && !ut->uu_throttle_bc) { + if (NULL != bap) { + SET(bap->ba_flags, BA_STRATEGY_TRACKED_IO); + } + OSIncrementAtomic(&info->throttle_inflight_count[thread_throttle_level]); + } else { + microuptime(&info->throttle_window_start_timestamp[thread_throttle_level]); + } KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_THROTTLE, OPEN_THROTTLE_WINDOW)) | DBG_FUNC_NONE, current_proc()->p_pid, thread_throttle_level, 0, 0, 0); } @@ -1845,6 +1984,8 @@ void throttle_info_update_internal(struct _throttle_io_info_t *info, uthread_t u throttle_info_set_initial_window(ut, info, FALSE, isssd); } + + return thread_throttle_level; } 
void *throttle_info_update_by_mount(mount_t mp) @@ -1878,7 +2019,7 @@ void *throttle_info_update_by_mount(mount_t mp) void throttle_info_update(void *throttle_info, int flags) { if (throttle_info) - throttle_info_update_internal(throttle_info, NULL, flags, FALSE); + throttle_info_update_internal(throttle_info, NULL, flags, FALSE, FALSE, NULL); } /* @@ -1953,6 +2094,9 @@ int throttle_info_io_will_be_throttled(void * throttle_info, int policy) break; } for (throttle_level = THROTTLE_LEVEL_START; throttle_level < thread_throttle_level; throttle_level++) { + if (info->throttle_inflight_count[throttle_level]) { + break; + } microuptime(&elapsed); timevalsub(&elapsed, &info->throttle_window_start_timestamp[throttle_level]); @@ -1974,6 +2118,12 @@ int throttle_info_io_will_be_throttled(void * throttle_info, int policy) return (THROTTLE_ENGAGED); } +int throttle_lowpri_window(void) +{ + struct uthread *ut = get_bsdthread_info(current_thread()); + return ut->uu_lowpri_window; +} + int spec_strategy(struct vnop_strategy_args *ap) { @@ -1988,6 +2138,7 @@ spec_strategy(struct vnop_strategy_args *ap) int strategy_ret; struct _throttle_io_info_t *throttle_info; boolean_t isssd = FALSE; + boolean_t inflight = FALSE; int code = 0; proc_t curproc = current_proc(); @@ -2044,7 +2195,8 @@ spec_strategy(struct vnop_strategy_args *ap) code |= DKIO_READ; if (bflags & B_ASYNC) code |= DKIO_ASYNC; - if (bflags & B_META) + + if (bap->ba_flags & BA_META) code |= DKIO_META; else if (bflags & B_PAGEIO) code |= DKIO_PAGING; @@ -2070,11 +2222,20 @@ spec_strategy(struct vnop_strategy_args *ap) if (mp != NULL) { if ((mp->mnt_kern_flag & MNTK_SSD) && !ignore_is_ssd) isssd = TRUE; + /* + * Partially initialized mounts don't have a final devbsdunit and should not be tracked. 
+ * Verify that devbsdunit is initialized (non-zero) or that 0 is the correct initialized value + * (mnt_throttle_mask is initialized and num_trailing_0 would be 0) + */ + if (mp->mnt_devbsdunit || (mp->mnt_throttle_mask != LOWPRI_MAX_NUM_DEV - 1 && mp->mnt_throttle_mask & 0x1)) { + inflight = TRUE; + } throttle_info = &_throttle_io_info[mp->mnt_devbsdunit]; - } else + + } else throttle_info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1]; - throttle_info_update_internal(throttle_info, ut, bflags, isssd); + throttle_info_update_internal(throttle_info, ut, bflags, isssd, inflight, bap); if ((bflags & B_READ) == 0) { microuptime(&throttle_info->throttle_last_write_timestamp); @@ -2347,6 +2508,8 @@ spec_offtoblk(struct vnop_offtoblk_args *ap) static void filt_specdetach(struct knote *kn); static int filt_spec(struct knote *kn, long hint); +static int filt_spectouch(struct knote *kn, struct kevent_internal_s *kev); +static int filt_specprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev); static unsigned filt_specpeek(struct knote *kn); struct filterops spec_filtops = { @@ -2354,6 +2517,8 @@ struct filterops spec_filtops = { .f_attach = filt_specattach, .f_detach = filt_specdetach, .f_event = filt_spec, + .f_touch = filt_spectouch, + .f_process = filt_specprocess, .f_peek = filt_specpeek }; @@ -2365,7 +2530,6 @@ filter_to_seltype(int16_t filter) return FREAD; case EVFILT_WRITE: return FWRITE; - break; default: panic("filt_to_seltype(): invalid filter %d\n", filter); return 0; @@ -2385,7 +2549,9 @@ filt_specattach(struct knote *kn) dev = vnode_specrdev(vp); if (major(dev) > nchrdev) { - return ENXIO; + kn->kn_flags |= EV_ERROR; + kn->kn_data = ENXIO; + return 0; } /* @@ -2398,15 +2564,17 @@ filt_specattach(struct knote *kn) if ((cdevsw_flags[major(dev)] & CDEVSW_SELECT_KQUEUE) == 0 && ((kn->kn_sfflags & NOTE_LOWAT) == 0 || kn->kn_sdata != 1)) { - return EINVAL; + kn->kn_flags |= EV_ERROR; + kn->kn_data = EINVAL; + return 0; } 
kn->kn_hook_data = 0; - kn->kn_fop = &spec_filtops; + kn->kn_filtid = EVFILTID_SPEC; kn->kn_hookid = vnode_vid(vp); - knote_markstayqueued(kn); + knote_markstayactive(kn); return 0; } @@ -2414,7 +2582,7 @@ filt_specattach(struct knote *kn) static void filt_specdetach(struct knote *kn) { - knote_clearstayqueued(kn); + knote_clearstayactive(kn); /* * This is potentially tricky: the device's selinfo waitq that was @@ -2429,18 +2597,41 @@ filt_specdetach(struct knote *kn) * waitq API invoked here. */ if (kn->kn_hook_data) { - waitq_unlink_by_prepost_id(kn->kn_hook_data, kn->kn_kq->kq_wqs); + waitq_unlink_by_prepost_id(kn->kn_hook_data, &(knote_get_kq(kn)->kq_wqs)); kn->kn_hook_data = 0; } } static int -filt_spec(struct knote *kn, long hint) +filt_spec(__unused struct knote *kn, __unused long hint) { + panic("filt_spec()"); + return 0; +} + + + +static int +filt_spectouch(struct knote *kn, struct kevent_internal_s *kev) +{ + kn->kn_sdata = kev->data; + kn->kn_sfflags = kev->fflags; + if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0) + kn->kn_udata = kev->udata; + + /* stayqueued knotes don't need hints from touch */ + return 0; +} + +static int +filt_specprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev) +{ +#pragma unused(data) vnode_t vp; uthread_t uth; struct waitq_set *old_wqs; vfs_context_t ctx; + int res; int selres; int error; int use_offset; @@ -2449,17 +2640,16 @@ filt_spec(struct knote *kn, long hint) uint64_t rsvd, rsvd_arg; uint64_t *rlptr = NULL; - if (hint != 0) { - panic("filt_spec(): nonzero hint?"); - } - uth = get_bsdthread_info(current_thread()); ctx = vfs_context_current(); vp = (vnode_t)kn->kn_fp->f_fglob->fg_data; + /* JMM - locking against touches? 
*/ + error = vnode_getwithvid(vp, kn->kn_hookid); if (error != 0) { kn->kn_flags |= (EV_EOF | EV_ONESHOT); + *kev = kn->kn_kevent; return 1; } @@ -2490,7 +2680,7 @@ filt_spec(struct knote *kn, long hint) * set into device's selinfo wait queue */ old_wqs = uth->uu_wqset; - uth->uu_wqset = kn->kn_kq->kq_wqs; + uth->uu_wqset = &(knote_get_kq(kn)->kq_wqs); selres = VNOP_SELECT(vp, filter_to_seltype(kn->kn_filter), 0, rlptr, ctx); uth->uu_wqset = old_wqs; @@ -2532,10 +2722,18 @@ filt_spec(struct knote *kn, long hint) vnode_put(vp); - if ((kn->kn_sfflags & NOTE_LOWAT) != 0) - return (kn->kn_data >= kn->kn_sdata); + res = ((kn->kn_sfflags & NOTE_LOWAT) != 0) ? + (kn->kn_data >= kn->kn_sdata) : kn->kn_data; + + if (res) { + *kev = kn->kn_kevent; + if (kn->kn_flags & EV_CLEAR) { + kn->kn_fflags = 0; + kn->kn_data = 0; + } + } - return (kn->kn_data != 0); + return res; } static unsigned @@ -2568,7 +2766,7 @@ filt_specpeek(struct knote *kn) rlptr = (void *)&rsvd_arg; old_wqs = uth->uu_wqset; - uth->uu_wqset = kn->kn_kq->kq_wqs; + uth->uu_wqset = &(knote_get_kq(kn)->kq_wqs); selres = VNOP_SELECT(vp, filter_to_seltype(kn->kn_filter), 0, (void *)rlptr, ctx); uth->uu_wqset = old_wqs; diff --git a/bsd/miscfs/union/Makefile b/bsd/miscfs/union/Makefile index 773b2cd00..3e0757713 100644 --- a/bsd/miscfs/union/Makefile +++ b/bsd/miscfs/union/Makefile @@ -3,7 +3,6 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - include $(MakeInc_cmd) include $(MakeInc_def) @@ -21,8 +20,5 @@ EXPORT_MI_LIST = ${KERNELFILES} EXPORT_MI_DIR = miscfs/union - include $(MakeInc_rule) include $(MakeInc_dir) - - diff --git a/bsd/net/Makefile b/bsd/net/Makefile index 93855776e..234c2a15b 100644 --- a/bsd/net/Makefile +++ b/bsd/net/Makefile @@ -3,7 +3,6 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export 
MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - include $(MakeInc_cmd) include $(MakeInc_def) @@ -26,7 +25,8 @@ DATAFILES= \ kext_net.h \ ndrv.h \ pfkeyv2.h \ - route.h + route.h \ + net_kev.h KERNELFILES= \ kpi_interface.h kpi_interfacefilter.h kpi_protocol.h \ @@ -48,9 +48,6 @@ PRIVATE_DATAFILES = \ if_pflog.h \ if_ppp.h \ if_utun.h \ - if_utun_crypto.h \ - if_utun_crypto_ipsec.h \ - if_utun_crypto_dtls.h \ if_var.h \ if_vlan_var.h \ iptap.h \ @@ -68,7 +65,8 @@ PRIVATE_DATAFILES = \ radix.h \ raw_cb.h \ route.h \ - net_perf.h + net_perf.h \ + net_kev.h PRIVATE_KERNELFILES = $(filter-out radix.h,${KERNELFILES}) \ bpfdesc.h ppp_comp.h \ diff --git a/bsd/net/bpf.c b/bsd/net/bpf.c index f98100d2b..4eb349adb 100644 --- a/bsd/net/bpf.c +++ b/bsd/net/bpf.c @@ -2196,73 +2196,22 @@ bpfselect(dev_t dev, int which, void * wql, struct proc *p) int bpfkqfilter(dev_t dev, struct knote *kn); static void filt_bpfdetach(struct knote *); static int filt_bpfread(struct knote *, long); +static int filt_bpftouch(struct knote *kn, struct kevent_internal_s *kev); +static int filt_bpfprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev); -static struct filterops bpfread_filtops = { +struct filterops bpfread_filtops = { .f_isfd = 1, .f_detach = filt_bpfdetach, .f_event = filt_bpfread, + .f_touch = filt_bpftouch, + .f_process = filt_bpfprocess, }; -int -bpfkqfilter(dev_t dev, struct knote *kn) -{ - struct bpf_d *d; - - /* - * Is this device a bpf? 
- */ - if (major(dev) != CDEV_MAJOR) { - return (EINVAL); - } - - if (kn->kn_filter != EVFILT_READ) { - return (EINVAL); - } - - lck_mtx_lock(bpf_mlock); - - d = bpf_dtab[minor(dev)]; - if (d == 0 || d == (void *)1 || (d->bd_flags & BPF_CLOSING) != 0) { - lck_mtx_unlock(bpf_mlock); - return (ENXIO); - } - - if (d->bd_bif == NULL) { - lck_mtx_unlock(bpf_mlock); - return (ENXIO); - } - - kn->kn_hook = d; - kn->kn_fop = &bpfread_filtops; - KNOTE_ATTACH(&d->bd_sel.si_note, kn); - d->bd_flags |= BPF_KNOTE; - - lck_mtx_unlock(bpf_mlock); - return (0); -} - -static void -filt_bpfdetach(struct knote *kn) -{ - struct bpf_d *d = (struct bpf_d *)kn->kn_hook; - - lck_mtx_lock(bpf_mlock); - if (d->bd_flags & BPF_KNOTE) { - KNOTE_DETACH(&d->bd_sel.si_note, kn); - d->bd_flags &= ~BPF_KNOTE; - } - lck_mtx_unlock(bpf_mlock); -} - static int -filt_bpfread(struct knote *kn, long hint) +filt_bpfread_common(struct knote *kn, struct bpf_d *d) { - struct bpf_d *d = (struct bpf_d *)kn->kn_hook; int ready = 0; - if (hint == 0) - lck_mtx_lock(bpf_mlock); - if (d->bd_immediate) { /* * If there's data in the hold buffer, it's the @@ -2312,11 +2261,113 @@ filt_bpfread(struct knote *kn, long hint) if (!ready) bpf_start_timer(d); - if (hint == 0) - lck_mtx_unlock(bpf_mlock); return (ready); } +int +bpfkqfilter(dev_t dev, struct knote *kn) +{ + struct bpf_d *d; + int res; + + /* + * Is this device a bpf? 
+ */ + if (major(dev) != CDEV_MAJOR || + kn->kn_filter != EVFILT_READ) { + kn->kn_flags = EV_ERROR; + kn->kn_data = EINVAL; + return 0; + } + + lck_mtx_lock(bpf_mlock); + + d = bpf_dtab[minor(dev)]; + + if (d == 0 || + d == (void *)1 || + d->bd_bif == NULL || + (d->bd_flags & BPF_CLOSING) != 0) { + lck_mtx_unlock(bpf_mlock); + kn->kn_flags = EV_ERROR; + kn->kn_data = ENXIO; + return 0; + } + + kn->kn_hook = d; + kn->kn_filtid = EVFILTID_BPFREAD; + KNOTE_ATTACH(&d->bd_sel.si_note, kn); + d->bd_flags |= BPF_KNOTE; + + /* capture the current state */ + res = filt_bpfread_common(kn, d); + + lck_mtx_unlock(bpf_mlock); + + return (res); +} + +static void +filt_bpfdetach(struct knote *kn) +{ + struct bpf_d *d = (struct bpf_d *)kn->kn_hook; + + lck_mtx_lock(bpf_mlock); + if (d->bd_flags & BPF_KNOTE) { + KNOTE_DETACH(&d->bd_sel.si_note, kn); + d->bd_flags &= ~BPF_KNOTE; + } + lck_mtx_unlock(bpf_mlock); +} + +static int +filt_bpfread(struct knote *kn, long hint) +{ +#pragma unused(hint) + struct bpf_d *d = (struct bpf_d *)kn->kn_hook; + + return filt_bpfread_common(kn, d); +} + +static int +filt_bpftouch(struct knote *kn, struct kevent_internal_s *kev) +{ + struct bpf_d *d = (struct bpf_d *)kn->kn_hook; + int res; + + lck_mtx_lock(bpf_mlock); + + /* save off the lowat threshold and flag */ + kn->kn_sdata = kev->data; + kn->kn_sfflags = kev->fflags; + if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0) + kn->kn_udata = kev->udata; + + /* output data will be re-generated here */ + res = filt_bpfread_common(kn, d); + + lck_mtx_unlock(bpf_mlock); + + return res; +} + +static int +filt_bpfprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev) +{ +#pragma unused(data) + struct bpf_d *d = (struct bpf_d *)kn->kn_hook; + int res; + + lck_mtx_lock(bpf_mlock); + res = filt_bpfread_common(kn, d); + if (res) { + *kev = kn->kn_kevent; + } + lck_mtx_unlock(bpf_mlock); + + return res; +} + /* * Copy data from an mbuf chain into a buffer. 
This code is derived * from m_copydata in sys/uipc_mbuf.c. @@ -2564,6 +2615,18 @@ catchpacket(struct bpf_d *d, u_char *pkt, struct mbuf *m, u_int pktlen, } ehp->bh_svc = so_svc2tc(m->m_pkthdr.pkt_svc); ehp->bh_flags |= BPF_HDR_EXT_FLAGS_DIR_OUT; + if (m->m_pkthdr.pkt_flags & PKTF_TCP_REXMT) + ehp->bh_pktflags |= BPF_PKTFLAGS_TCP_REXMT; + if (m->m_pkthdr.pkt_flags & PKTF_START_SEQ) + ehp->bh_pktflags |= BPF_PKTFLAGS_START_SEQ; + if (m->m_pkthdr.pkt_flags & PKTF_LAST_PKT) + ehp->bh_pktflags |= BPF_PKTFLAGS_LAST_PKT; + if (m->m_pkthdr.pkt_flags & PKTF_VALID_UNSENT_DATA) { + ehp->bh_unsent_bytes = + m->m_pkthdr.bufstatus_if; + ehp->bh_unsent_snd = + m->m_pkthdr.bufstatus_sndbuf; + } } else ehp->bh_flags |= BPF_HDR_EXT_FLAGS_DIR_IN; payload = (u_char *)ehp + hdrlen; diff --git a/bsd/net/bpf.h b/bsd/net/bpf.h index 20293abd9..edd79c7f7 100644 --- a/bsd/net/bpf.h +++ b/bsd/net/bpf.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2014 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -246,10 +246,16 @@ struct bpf_hdr_ext { #define BPF_HDR_EXT_FLAGS_DIR_OUT 0x0001 pid_t bh_pid; /* process PID */ char bh_comm[MAXCOMLEN+1]; /* process command */ - u_char _bh_pad2[2]; + u_char _bh_pad2[1]; + u_char bh_pktflags; +#define BPF_PKTFLAGS_TCP_REXMT 0x0001 +#define BPF_PKTFLAGS_START_SEQ 0x0002 +#define BPF_PKTFLAGS_LAST_PKT 0x0004 u_char bh_proto; /* kernel reserved; 0 in userland */ bpf_u_int32 bh_svc; /* service class */ bpf_u_int32 bh_flowid; /* kernel reserved; 0 in userland */ + bpf_u_int32 bh_unsent_bytes; /* unsent bytes at interface */ + bpf_u_int32 bh_unsent_snd; /* unsent bytes at socket buffer */ }; #define BPF_CONTROL_NAME "com.apple.net.bpf" @@ -1331,7 +1337,7 @@ typedef u_int32_t bpf_tap_mode; link type are specified. The callback is responsible for releasing the mbuf whether or not it returns an error. @param interface The interface the packet is being sent on. 
- @param dlt The data link type the bpf device is attached to. + @param data_link_type The data link type the bpf device is attached to. @param packet The packet to be sent. */ typedef errno_t (*bpf_send_func)(ifnet_t interface, u_int32_t data_link_type, @@ -1349,7 +1355,7 @@ typedef errno_t (*bpf_send_func)(ifnet_t interface, u_int32_t data_link_type, decreasing (tap in or out is stopping), the error will be ignored. @param interface The interface being tapped. - @param dlt The data link type being tapped. + @param data_link_type The data link type being tapped. @param direction The direction of the tap. */ typedef errno_t (*bpf_tap_func)(ifnet_t interface, u_int32_t data_link_type, @@ -1399,7 +1405,7 @@ extern errno_t bpf_attach(ifnet_t interface, u_int32_t data_link_type, @param dlt The data link type of the packet. @param packet The packet received. @param header An optional pointer to a header that will be prepended. - @param headerlen If the header was specified, the length of the header. + @param header_len If the header was specified, the length of the header. */ extern void bpf_tap_in(ifnet_t interface, u_int32_t dlt, mbuf_t packet, void *header, size_t header_len); @@ -1413,7 +1419,7 @@ extern void bpf_tap_in(ifnet_t interface, u_int32_t dlt, mbuf_t packet, @param dlt The data link type of the packet. @param packet The packet received. @param header An optional pointer to a header that will be prepended. - @param headerlen If the header was specified, the length of the header. + @param header_len If the header was specified, the length of the header. 
*/ extern void bpf_tap_out(ifnet_t interface, u_int32_t dlt, mbuf_t packet, void *header, size_t header_len); diff --git a/bsd/net/bpf_filter.c b/bsd/net/bpf_filter.c index 47e26fc19..362472a95 100644 --- a/bsd/net/bpf_filter.c +++ b/bsd/net/bpf_filter.c @@ -102,7 +102,7 @@ #ifdef KERNEL #define MINDEX(m, k) \ { \ - register unsigned int len = m->m_len; \ + unsigned int len = m->m_len; \ \ while (k >= len) { \ k -= len; \ @@ -121,9 +121,9 @@ static u_int32_t m_xword(struct mbuf *m, bpf_u_int32 k, int *err); static u_int32_t m_xword(struct mbuf *m, bpf_u_int32 k, int *err) { - register size_t len; - register u_char *cp, *np; - register struct mbuf *m0; + size_t len; + u_char *cp, *np; + struct mbuf *m0; len = m->m_len; while (k >= len) { @@ -174,9 +174,9 @@ m_xword(struct mbuf *m, bpf_u_int32 k, int *err) static u_int16_t m_xhalf(struct mbuf *m, bpf_u_int32 k, int *err) { - register size_t len; - register u_char *cp; - register struct mbuf *m0; + size_t len; + u_char *cp; + struct mbuf *m0; len = m->m_len; while (k >= len) { @@ -210,8 +210,8 @@ m_xhalf(struct mbuf *m, bpf_u_int32 k, int *err) u_int bpf_filter(const struct bpf_insn *pc, u_char *p, u_int wirelen, u_int buflen) { - register u_int32_t A = 0, X = 0; - register bpf_u_int32 k; + u_int32_t A = 0, X = 0; + bpf_u_int32 k; int32_t mem[BPF_MEMWORDS]; bzero(mem, sizeof(mem)); @@ -284,7 +284,7 @@ bpf_filter(const struct bpf_insn *pc, u_char *p, u_int wirelen, u_int buflen) k = pc->k; if (k >= buflen) { #ifdef KERNEL - register struct mbuf *m; + struct mbuf *m; if (buflen != 0) return 0; @@ -356,7 +356,7 @@ bpf_filter(const struct bpf_insn *pc, u_char *p, u_int wirelen, u_int buflen) k = X + pc->k; if (pc->k >= buflen || X >= buflen - pc->k) { #ifdef KERNEL - register struct mbuf *m; + struct mbuf *m; if (buflen != 0) return 0; @@ -375,7 +375,7 @@ bpf_filter(const struct bpf_insn *pc, u_char *p, u_int wirelen, u_int buflen) k = pc->k; if (k >= buflen) { #ifdef KERNEL - register struct mbuf *m; + struct mbuf *m; if 
(buflen != 0) return 0; diff --git a/bsd/net/classq/Makefile b/bsd/net/classq/Makefile index a02432ac6..f78f60ab2 100644 --- a/bsd/net/classq/Makefile +++ b/bsd/net/classq/Makefile @@ -12,7 +12,7 @@ KERNELFILES= \ PRIVATE_DATAFILES = \ classq.h classq_blue.h classq_red.h classq_rio.h classq_sfb.h \ - if_classq.h + if_classq.h classq_fq_codel.h PRIVATE_KERNELFILES = ${KERNELFILES} diff --git a/bsd/net/classq/classq.c b/bsd/net/classq/classq.c index 625876773..67c1f44ea 100644 --- a/bsd/net/classq/classq.c +++ b/bsd/net/classq/classq.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2012 Apple Inc. All rights reserved. + * Copyright (c) 2007-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -136,15 +136,16 @@ _getq(class_queue_t *q) return (m); } -/* get a packet of a specific flow beginning from the head of the queue */ -struct mbuf * -_getq_flow(class_queue_t *q, u_int32_t flow) +static struct mbuf * +_getq_flow_or_scidx(class_queue_t *q, u_int32_t val, boolean_t isflowid) { struct mbuf *m, *m_tmp; MBUFQ_FOREACH_SAFE(m, &q->mbufq, m_tmp) { - if (flow == 0 || ((m->m_flags & M_PKTHDR) && - m->m_pkthdr.pkt_flowid == flow)) { + if ((isflowid && (val == 0 || ((m->m_flags & M_PKTHDR) && + m->m_pkthdr.pkt_flowid == val))) || + (!isflowid && + MBUF_SCIDX(mbuf_get_service_class(m)) < val)) { /* remove it from the class queue */ MBUFQ_REMOVE(&q->mbufq, m); MBUFQ_NEXT(m) = NULL; @@ -166,16 +167,40 @@ _getq_flow(class_queue_t *q, u_int32_t flow) } return (m); + +} + +/* get a packet of a specific flow beginning from the head of the queue */ +struct mbuf * +_getq_flow(class_queue_t *q, u_int32_t flow) +{ + return (_getq_flow_or_scidx(q, flow, TRUE)); +} + +/* Get a packet whose MBUF_SCIDX() < scidx from head of queue */ +struct mbuf * +_getq_scidx_lt(class_queue_t *q, u_int32_t scidx) +{ + return (_getq_flow_or_scidx(q, scidx, FALSE)); } /* get all packets starting from the head of the queue */ struct mbuf * -_getq_all(class_queue_t *q) 
+_getq_all(class_queue_t *q, struct mbuf **last, u_int32_t *qlenp, + u_int64_t *qsizep) { struct mbuf *m; m = MBUFQ_FIRST(&q->mbufq); + if (last != NULL) + *last = MBUFQ_LAST(&q->mbufq); MBUFQ_INIT(&q->mbufq); + + if (qlenp != NULL) + *qlenp = qlen(q); + if (qsizep != NULL) + *qsizep = qsize(q); + qlen(q) = 0; qsize(q) = 0; @@ -212,7 +237,7 @@ _getq_tail(class_queue_t *q) qsize(q) = 0; if (qempty(q)) { - VERIFY(MBUFQ_EMPTY(head)); + VERIFY(m == MBUFQ_FIRST(head)); MBUFQ_INIT(head); } else { VERIFY(n != NULL); diff --git a/bsd/net/classq/classq.h b/bsd/net/classq/classq.h index e6edbac70..750ce5452 100644 --- a/bsd/net/classq/classq.h +++ b/bsd/net/classq/classq.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2012 Apple Inc. All rights reserved. + * Copyright (c) 2007-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -94,6 +94,9 @@ typedef enum classq_state { #define DEFAULT_QLIMIT 128 /* default */ +#define CLASSQ_DEQUEUE_MAX_PKT_LIMIT 2048 +#define CLASSQ_DEQUEUE_MAX_BYTE_LIMIT (1024 * 1024) + /* * generic packet counter */ @@ -159,10 +162,12 @@ extern void _addq(class_queue_t *, struct mbuf *); extern void _addq_multi(class_queue_t *, struct mbuf *, struct mbuf *, u_int32_t, u_int32_t); extern struct mbuf *_getq(class_queue_t *); -extern struct mbuf *_getq_all(class_queue_t *); +extern struct mbuf *_getq_all(class_queue_t *, struct mbuf **, + u_int32_t *, u_int64_t *); extern struct mbuf *_getq_tail(class_queue_t *); extern struct mbuf *_getq_random(class_queue_t *); extern struct mbuf *_getq_flow(class_queue_t *, u_int32_t); +extern struct mbuf *_getq_scidx_lt(class_queue_t *, u_int32_t); extern void _removeq(class_queue_t *, struct mbuf *); extern void _flushq(class_queue_t *); extern void _flushq_flow(class_queue_t *, u_int32_t, u_int32_t *, u_int32_t *); diff --git a/bsd/net/classq/classq_fq_codel.c b/bsd/net/classq/classq_fq_codel.c new file mode 100644 index 000000000..f78da89d6 --- /dev/null +++ 
b/bsd/net/classq/classq_fq_codel.c @@ -0,0 +1,351 @@ +/* + * Copyright (c) 2016 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License.
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +static struct zone *flowq_zone = NULL; +static size_t flowq_size; + +#define FQ_ZONE_MAX (32 * 1024) /* across all interfaces */ +#define FQ_SEQ_LT(a,b) ((int)((a)-(b)) < 0) +#define FQ_SEQ_GT(a,b) ((int)((a)-(b)) > 0) + +void +fq_codel_init(void) +{ + if (flowq_zone != NULL) + return; + + flowq_size = sizeof (fq_t); + flowq_zone = zinit(flowq_size, FQ_ZONE_MAX * flowq_size, + 0, "flowq_zone"); + if (flowq_zone == NULL) { + panic("%s: failed to allocate flowq_zone", __func__); + /* NOTREACHED */ + } + zone_change(flowq_zone, Z_EXPAND, TRUE); + zone_change(flowq_zone, Z_CALLERACCT, TRUE); +} + +/* + * Allocate a flow queue from flowq_zone: zalloc() when "how" is + * M_WAITOK, zalloc_noblock() for any other value. The entry is zeroed + * and its mbuf queue initialized; on failure log and return NULL. + */ +fq_t * +fq_alloc(int how) +{ + fq_t *fq = NULL; + fq = (how == M_WAITOK) ? zalloc(flowq_zone) : + zalloc_noblock(flowq_zone); + if (fq == NULL) { + log(LOG_ERR, "%s: unable to allocate from flowq_zone\n", + __func__); + return (NULL); + } + + bzero(fq, flowq_size); + MBUFQ_INIT(&fq->fq_mbufq); + return (fq); +} + +void +fq_destroy(fq_t *fq) +{ + VERIFY(MBUFQ_EMPTY(&fq->fq_mbufq)); + VERIFY(!(fq->fq_flags & (FQF_NEW_FLOW | FQF_OLD_FLOW))); + bzero(fq, flowq_size); + zfree(flowq_zone, fq); +} + +static void +fq_detect_dequeue_stall(fq_if_t *fqs, fq_t *flowq, fq_if_classq_t *fq_cl, + u_int64_t *now) +{ + u_int64_t maxgetqtime; + if (FQ_IS_DELAYHIGH(flowq) || flowq->fq_getqtime == 0 || + MBUFQ_EMPTY(&flowq->fq_mbufq) || + flowq->fq_bytes < FQ_MIN_FC_THRESHOLD_BYTES) + return; + maxgetqtime = flowq->fq_getqtime + fqs->fqs_update_interval; + if ((*now) > maxgetqtime) { + /* + * there was no dequeue in an update interval worth of + * time. It means that the queue is stalled.
+ */ + FQ_SET_DELAY_HIGH(flowq); + fq_cl->fcl_stat.fcl_dequeue_stall++; + } +} + +void +fq_head_drop(fq_if_t *fqs, fq_t *fq) +{ + struct mbuf *m = NULL; + struct ifclassq *ifq = fqs->fqs_ifq; + + m = fq_getq_flow(fqs, fq); + if (m == NULL) + return; + + IFCQ_DROP_ADD(ifq, 1, m_length(m)); + IFCQ_CONVERT_LOCK(ifq); + m_freem(m); +} + +int +fq_addq(fq_if_t *fqs, struct mbuf *m, fq_if_classq_t *fq_cl) +{ + struct pkthdr *pkt = &m->m_pkthdr; + int droptype = DTYPE_NODROP, fc_adv = 0, ret = CLASSQEQ_SUCCESS; + u_int64_t now; + fq_t *fq = NULL; + + VERIFY(!(pkt->pkt_flags & PKTF_PRIV_GUARDED)); + pkt->pkt_flags |= PKTF_PRIV_GUARDED; + + if (pkt->pkt_timestamp > 0) { + now = pkt->pkt_timestamp; + } else { + now = mach_absolute_time(); + pkt->pkt_timestamp = now; + } + + /* find the flowq for this packet */ + fq = fq_if_hash_pkt(fqs, pkt->pkt_flowid, m_get_service_class(m), + now, TRUE); + if (fq == NULL) { + /* drop the packet if we could not allocate a flow queue */ + fq_cl->fcl_stat.fcl_drop_memfailure++; + IFCQ_CONVERT_LOCK(fqs->fqs_ifq); + m_freem(m); + return (CLASSQEQ_DROPPED); + } + + VERIFY(fq_cl->fcl_service_class == + (u_int32_t)mbuf_get_service_class(m)); + + fq_detect_dequeue_stall(fqs, fq, fq_cl, &now); + + if (FQ_IS_DELAYHIGH(fq)) { + if ((fq->fq_flags & FQF_FLOWCTL_CAPABLE) && + (pkt->pkt_flags & PKTF_FLOW_ADV)) { + fc_adv = 1; + /* + * If the flow is suspended or it is not + * TCP, drop the packet + */ + if (pkt->pkt_proto != IPPROTO_TCP) { + droptype = DTYPE_EARLY; + fq_cl->fcl_stat.fcl_drop_early++; + } + } else { + /* + * Need to drop a packet, instead of dropping this + * one, try to drop from the head of the queue + */ + if (!MBUFQ_EMPTY(&fq->fq_mbufq)) { + fq_head_drop(fqs, fq); + droptype = DTYPE_NODROP; + } else { + droptype = DTYPE_EARLY; + } + fq_cl->fcl_stat.fcl_drop_early++; + } + + } + + /* + * check if this packet is a retransmission of another pkt already + * in the queue + */ + if ((pkt->pkt_flags & (PKTF_TCP_REXMT|PKTF_START_SEQ)) == + 
(PKTF_TCP_REXMT|PKTF_START_SEQ) && fq->fq_dequeue_seq != 0) { + if (FQ_SEQ_GT(pkt->tx_start_seq, fq->fq_dequeue_seq)) { + fq_cl->fcl_stat.fcl_dup_rexmts++; + droptype = DTYPE_FORCED; + } + } + + /* Set the return code correctly */ + if (fc_adv == 1 && droptype != DTYPE_FORCED) { + if (fq_if_add_fcentry(fqs, pkt, fq_cl)) { + fq->fq_flags |= FQF_FLOWCTL_ON; + /* deliver flow control advisory error */ + if (droptype == DTYPE_NODROP) { + ret = CLASSQEQ_SUCCESS_FC; + } else { + /* dropped due to flow control */ + ret = CLASSQEQ_DROPPED_FC; + } + } else { + /* + * if we could not flow control the flow, it is + * better to drop + */ + droptype = DTYPE_FORCED; + ret = CLASSQEQ_DROPPED_FC; + fq_cl->fcl_stat.fcl_flow_control_fail++; + } + } + + /* + * If the queue length hits the queue limit, drop a packet from the + * front of the queue for a flow with maximum number of bytes. This + * will penalize heavy and unresponsive flows. It will also avoid a + * tail drop. + */ + if (droptype == DTYPE_NODROP && fq_if_at_drop_limit(fqs)) { + fq_if_drop_packet(fqs); + } + + if (droptype == DTYPE_NODROP) { + MBUFQ_ENQUEUE(&fq->fq_mbufq, m); + fq->fq_bytes += m_length(m); + fq_cl->fcl_stat.fcl_byte_cnt += m_length(m); + fq_cl->fcl_stat.fcl_pkt_cnt++; + + /* + * check if this queue will qualify to be the next + * victim queue + */ + fq_if_is_flow_heavy(fqs, fq); + } else { + IFCQ_CONVERT_LOCK(fqs->fqs_ifq); + m_freem(m); + return ((ret != CLASSQEQ_SUCCESS) ? ret : CLASSQEQ_DROPPED); + } + + /* + * If the queue is not currently active, add it to the end of new + * flows list for that service class. 
+ */ + if ((fq->fq_flags & (FQF_NEW_FLOW|FQF_OLD_FLOW)) == 0) { + VERIFY(STAILQ_NEXT(fq, fq_actlink) == NULL); + STAILQ_INSERT_TAIL(&fq_cl->fcl_new_flows, fq, fq_actlink); + fq->fq_flags |= FQF_NEW_FLOW; + + fq_cl->fcl_stat.fcl_newflows_cnt++; + + fq->fq_deficit = fq_cl->fcl_quantum; + } + return (ret); +} + +struct mbuf * +fq_getq_flow(fq_if_t *fqs, fq_t *fq) +{ + struct mbuf *m = NULL; + struct ifclassq *ifq = fqs->fqs_ifq; + fq_if_classq_t *fq_cl; + u_int64_t now; + int64_t qdelay; + struct pkthdr *pkt; + u_int32_t mlen; + + MBUFQ_DEQUEUE(&fq->fq_mbufq, m); + if (m == NULL) + return (NULL); + + mlen = m_length(m); + + VERIFY(fq->fq_bytes >= mlen); + fq->fq_bytes -= mlen; + + fq_cl = &fqs->fqs_classq[fq->fq_sc_index]; + fq_cl->fcl_stat.fcl_byte_cnt -= mlen; + fq_cl->fcl_stat.fcl_pkt_cnt--; + IFCQ_DEC_LEN(ifq); + IFCQ_DEC_BYTES(ifq, mlen); + + pkt = &m->m_pkthdr; + now = mach_absolute_time(); + + /* this will compute qdelay in nanoseconds */ + qdelay = now - pkt->pkt_timestamp; + + if (fq->fq_min_qdelay == 0 || + (qdelay > 0 && (u_int64_t)qdelay < fq->fq_min_qdelay)) + fq->fq_min_qdelay = qdelay; + if (now >= fq->fq_updatetime || MBUFQ_EMPTY(&fq->fq_mbufq)) { + if (fq->fq_min_qdelay >= fqs->fqs_target_qdelay) { + if (!FQ_IS_DELAYHIGH(fq)) + FQ_SET_DELAY_HIGH(fq); + } + + if (!FQ_IS_DELAYHIGH(fq) || MBUFQ_EMPTY(&fq->fq_mbufq)) { + FQ_CLEAR_DELAY_HIGH(fq); + if (fq->fq_flags & FQF_FLOWCTL_ON) { + fq_if_flow_feedback(fqs, fq, fq_cl); + } + } + + /* Reset measured queue delay and update time */ + fq->fq_updatetime = now + fqs->fqs_update_interval; + fq->fq_min_qdelay = 0; + } + + if ((pkt->pkt_flags & PKTF_START_SEQ) && (fq->fq_dequeue_seq == 0 || + (FQ_SEQ_LT(fq->fq_dequeue_seq, pkt->tx_start_seq)))) + fq->fq_dequeue_seq = pkt->tx_start_seq; + + pkt->pkt_timestamp = 0; + pkt->pkt_flags &= ~PKTF_PRIV_GUARDED; + + if (MBUFQ_EMPTY(&fq->fq_mbufq)) { + /* + * Remove from large_flow field, if this happened to be + * the one that is tagged. 
+ */ + if (fqs->fqs_large_flow == fq) + fqs->fqs_large_flow = NULL; + + /* Reset getqtime so that we don't count idle times */ + fq->fq_getqtime = 0; + } else { + fq->fq_getqtime = now; + } + + return (m); +} diff --git a/bsd/net/classq/classq_fq_codel.h b/bsd/net/classq/classq_fq_codel.h new file mode 100644 index 000000000..35f8341b2 --- /dev/null +++ b/bsd/net/classq/classq_fq_codel.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2016 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _NET_CLASSQ_CLASSQ_FQ_CODEL_H +#define _NET_CLASSQ_CLASSQ_FQ_CODEL_H +#ifdef PRIVATE +#ifdef BSD_KERNEL_PRIVATE +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define FQ_MIN_FC_THRESHOLD_BYTES 7500 +#define FQ_IS_DELAYHIGH(_fq_) ((_fq_)->fq_flags & FQF_DELAY_HIGH) +#define FQ_SET_DELAY_HIGH(_fq_) do { \ + (_fq_)->fq_flags |= FQF_DELAY_HIGH; \ +} while (0) +#define FQ_CLEAR_DELAY_HIGH(_fq_) do { \ + (_fq_)->fq_flags &= ~FQF_DELAY_HIGH; \ +} while (0) + +typedef struct flowq { + MBUFQ_HEAD(pktq_head) fq_mbufq; /* Packet queue */ +#define FQF_FLOWCTL_CAPABLE 0x01 /* Use flow control instead of drop */ +#define FQF_DELAY_HIGH 0x02 /* Min delay is greater than target */ +#define FQF_NEW_FLOW 0x04 /* Currently on new flows queue */ +#define FQF_OLD_FLOW 0x08 /* Currently on old flows queue */ +#define FQF_FLOWCTL_ON 0x10 /* Currently flow controlled */ + u_int8_t fq_flags; /* flags */ + u_int8_t fq_sc_index; /* service_class index */ + int16_t fq_deficit; /* Deficit for scheduling */ + u_int32_t fq_bytes; /* Number of bytes in the queue */ + u_int64_t fq_min_qdelay; /* min queue delay for Codel */ + u_int64_t fq_updatetime; /* next update interval */ + u_int64_t fq_getqtime; /* last dequeue time */ + SLIST_ENTRY(flowq) fq_hashlink; /* for flow queue hash table */ + STAILQ_ENTRY(flowq) fq_actlink; /* for new/old flow queues */ + u_int32_t fq_flowhash; /* Flow hash */ + u_int32_t fq_dequeue_seq; /* Last dequeue seq */ +} fq_t; + +struct fq_codel_sched_data; +struct fq_if_classq; + +/* Function definitions */ +extern void fq_codel_init(void); +extern fq_t *fq_alloc(int); +extern void fq_destroy(fq_t *); +extern int fq_addq(struct fq_codel_sched_data *, struct mbuf *, + struct fq_if_classq *); +extern struct mbuf *fq_getq_flow(struct fq_codel_sched_data *, fq_t *); +extern void fq_head_drop(struct fq_codel_sched_data *, fq_t *); + +#ifdef __cplusplus +} +#endif +#endif /* 
BSD_KERNEL_PRIVATE */ +#endif /* PRIVATE */ +#endif /* _NET_CLASSQ_CLASSQ_FQ_CODEL_H */ diff --git a/bsd/net/classq/classq_sfb.c b/bsd/net/classq/classq_sfb.c index 5831f968f..c679ca43f 100644 --- a/bsd/net/classq/classq_sfb.c +++ b/bsd/net/classq/classq_sfb.c @@ -136,12 +136,12 @@ * large enough to induce this much delay and nothing more than that. */ #define TARGET_QDELAY_BASE (10ULL * 1000 * 1000) /* 10ms */ -#define TARGET_QDELAY_MIN (10ULL * 1000) /* 10us */ -#define TARGET_QDELAY_MAX (20ULL * 1000 * 1000 * 1000) /* 20s */ +#define TARGET_QDELAY_MIN (10ULL * 1000) /* 10us */ +#define TARGET_QDELAY_MAX (20ULL * 1000 * 1000 * 1000) /* 20s */ /* * Update interval for checking the extra delay added by the queue. This - * should be 90-95 percentile of RTT experienced by any TCP connection + * should be 90-95 percentile of RTT experienced by any TCP connection * so that it will take care of the burst traffic. */ #define UPDATE_INTERVAL_BASE (100ULL * 1000 * 1000) /* 100ms */ @@ -188,14 +188,14 @@ /* Minimum nuber of bytes in queue to get flow controlled */ #define SFB_MIN_FC_THRESHOLD_BYTES 7500 -#define SFB_SET_DELAY_HIGH(_sp_, _q_) do { \ +#define SFB_SET_DELAY_HIGH(_sp_, _q_) do { \ (_sp_)->sfb_flags |= SFBF_DELAYHIGH; \ (_sp_)->sfb_fc_threshold = max(SFB_MIN_FC_THRESHOLD_BYTES, \ (qsize((_q_)) >> 3)); \ } while (0) #define SFB_QUEUE_DELAYBASED(_sp_) ((_sp_)->sfb_flags & SFBF_DELAYBASED) -#define SFB_IS_DELAYHIGH(_sp_) ((_sp_)->sfb_flags & SFBF_DELAYHIGH) +#define SFB_IS_DELAYHIGH(_sp_) ((_sp_)->sfb_flags & SFBF_DELAYHIGH) #define SFB_QUEUE_DELAYBASED_MAXSIZE 2048 /* max pkts */ #define HINTERVAL_MIN (10) /* 10 seconds */ @@ -243,7 +243,6 @@ static void sfb_resetq(struct sfb *, cqev_t); static void sfb_calc_holdtime(struct sfb *, u_int64_t); static void sfb_calc_pboxtime(struct sfb *, u_int64_t); static void sfb_calc_hinterval(struct sfb *, u_int64_t *); -static void sfb_calc_target_qdelay(struct sfb *, u_int64_t); static void sfb_calc_update_interval(struct sfb 
*, u_int64_t); static void sfb_swap_bins(struct sfb *, u_int32_t); static inline int sfb_pcheck(struct sfb *, struct pkthdr *); @@ -280,14 +279,6 @@ static u_int64_t sfb_hinterval; SYSCTL_QUAD(_net_classq_sfb, OID_AUTO, hinterval, CTLFLAG_RW|CTLFLAG_LOCKED, &sfb_hinterval, "SFB hash interval in nanoseconds"); -static u_int64_t sfb_target_qdelay = 0; -SYSCTL_QUAD(_net_classq_sfb, OID_AUTO, target_qdelay, CTLFLAG_RW|CTLFLAG_LOCKED, - &sfb_target_qdelay, "SFB target queue delay in nanoseconds"); - -static u_int64_t sfb_update_interval; -SYSCTL_QUAD(_net_classq_sfb, OID_AUTO, update_interval, - CTLFLAG_RW|CTLFLAG_LOCKED, &sfb_update_interval, "SFB update interval"); - static u_int32_t sfb_increment = SFB_INCREMENT; SYSCTL_UINT(_net_classq_sfb, OID_AUTO, increment, CTLFLAG_RW|CTLFLAG_LOCKED, &sfb_increment, SFB_INCREMENT, "SFB increment [d1]"); @@ -439,53 +430,12 @@ sfb_calc_hinterval(struct sfb *sp, u_int64_t *t) net_timeradd(&now, &sp->sfb_hinterval, &sp->sfb_nextreset); } -static void -sfb_calc_target_qdelay(struct sfb *sp, u_int64_t out_bw) -{ -#pragma unused(out_bw) - u_int64_t target_qdelay = 0; - struct ifnet *ifp = sp->sfb_ifp; - - target_qdelay = IFCQ_TARGET_QDELAY(&ifp->if_snd); - - if (sfb_target_qdelay != 0) - target_qdelay = sfb_target_qdelay; - - /* - * If we do not know the effective bandwidth, use the default - * target queue delay. - */ - if (target_qdelay == 0) - target_qdelay = IFQ_TARGET_DELAY; - - /* - * If a delay has been added to ifnet start callback for - * coalescing, we have to add that to the pre-set target delay - * because the packets can be in the queue longer. 
- */ - if ((ifp->if_eflags & IFEF_ENQUEUE_MULTI) && - ifp->if_start_delay_timeout > 0) - target_qdelay += ifp->if_start_delay_timeout; - - sp->sfb_target_qdelay = target_qdelay; -} - static void sfb_calc_update_interval(struct sfb *sp, u_int64_t out_bw) { #pragma unused(out_bw) u_int64_t update_interval = 0; - - /* If the system-level override is set, use it */ - if (sfb_update_interval != 0) - update_interval = sfb_update_interval; - /* - * If we do not know the effective bandwidth, use the default - * update interval. - */ - if (update_interval == 0) - update_interval = IFQ_UPDATE_INTERVAL; - + ifclassq_calc_update_interval(&update_interval); net_nsectimer(&update_interval, &sp->sfb_update_interval); } @@ -518,7 +468,7 @@ sfb_alloc(struct ifnet *ifp, u_int32_t qid, u_int32_t qlim, u_int32_t flags) log(LOG_ERR, "%s: SFB unable to allocate flow control lists\n", if_name(ifp)); sfb_destroy(sp); - return(NULL); + return (NULL); } bzero(sp->sfb_fc_lists, sfb_fcl_size); @@ -609,11 +559,11 @@ sfb_resetq(struct sfb *sp, cqev_t ev) sfb_calc_holdtime(sp, eff_rate); sfb_calc_pboxtime(sp, eff_rate); sfb_calc_hinterval(sp, NULL); - sfb_calc_target_qdelay(sp, eff_rate); + ifclassq_calc_target_qdelay(ifp, &sp->sfb_target_qdelay); sfb_calc_update_interval(sp, eff_rate); if (ev == CLASSQ_EV_LINK_DOWN || - ev == CLASSQ_EV_LINK_UP) + ev == CLASSQ_EV_LINK_UP) sfb_fclists_clean(sp); bzero(sp->sfb_bins, sizeof (*sp->sfb_bins)); @@ -1165,11 +1115,11 @@ sfb_addq(struct sfb *sp, class_queue_t *q, struct mbuf *m, struct pf_mtag *t) VERIFY(!(pkt->pkt_flags & PKTF_PRIV_GUARDED)); pkt->pkt_flags |= PKTF_PRIV_GUARDED; - if (pkt->pkt_enqueue_ts > 0) { - net_nsectimer(&pkt->pkt_enqueue_ts, &now); + if (pkt->pkt_timestamp > 0) { + net_nsectimer(&pkt->pkt_timestamp, &now); } else { nanouptime(&now); - net_timernsec(&now, &pkt->pkt_enqueue_ts); + net_timernsec(&now, &pkt->pkt_timestamp); } /* time to swap the bins? 
*/ @@ -1356,8 +1306,8 @@ sfb_getq_flow(struct sfb *sp, class_queue_t *q, u_int32_t flow, boolean_t purge) if (!purge && SFB_QUEUE_DELAYBASED(sp)) { u_int64_t dequeue_ns, queue_delay = 0; net_timernsec(&now, &dequeue_ns); - if (dequeue_ns > pkt->pkt_enqueue_ts) - queue_delay = dequeue_ns - pkt->pkt_enqueue_ts; + if (dequeue_ns > pkt->pkt_timestamp) + queue_delay = dequeue_ns - pkt->pkt_timestamp; if (sp->sfb_min_qdelay == 0 || (queue_delay > 0 && queue_delay < sp->sfb_min_qdelay)) @@ -1369,14 +1319,14 @@ sfb_getq_flow(struct sfb *sp, class_queue_t *q, u_int32_t flow, boolean_t purge) } else { sp->sfb_flags &= ~(SFBF_DELAYHIGH); sp->sfb_fc_threshold = 0; - + } net_timeradd(&now, &sp->sfb_update_interval, &sp->sfb_update_time); sp->sfb_min_qdelay = 0; } } - pkt->pkt_enqueue_ts = 0; + pkt->pkt_timestamp = 0; /* * Clearpkts are the ones which were in the queue when the hash @@ -1464,7 +1414,7 @@ sfb_updateq(struct sfb *sp, cqev_t ev) } sfb_calc_holdtime(sp, eff_rate); sfb_calc_pboxtime(sp, eff_rate); - sfb_calc_target_qdelay(sp, eff_rate); + ifclassq_calc_target_qdelay(ifp, &sp->sfb_target_qdelay); sfb_calc_update_interval(sp, eff_rate); break; } diff --git a/bsd/net/classq/classq_subr.c b/bsd/net/classq/classq_subr.c index 98c007bd9..55f42daf6 100644 --- a/bsd/net/classq/classq_subr.c +++ b/bsd/net/classq/classq_subr.c @@ -39,6 +39,7 @@ #include #include #include +#include #if CLASSQ_RED #include #endif /* CLASSQ_RED */ @@ -50,6 +51,7 @@ #endif /* CLASSQ_BLUE */ #include #include +#include #include @@ -58,13 +60,24 @@ #endif /* PF_ALTQ */ static errno_t ifclassq_dequeue_common(struct ifclassq *, mbuf_svc_class_t, - u_int32_t, struct mbuf **, struct mbuf **, u_int32_t *, u_int32_t *, - boolean_t); + u_int32_t, u_int32_t, struct mbuf **, struct mbuf **, u_int32_t *, + u_int32_t *, boolean_t); static struct mbuf *ifclassq_poll_common(struct ifclassq *, mbuf_svc_class_t, boolean_t); static struct mbuf *ifclassq_tbr_dequeue_common(struct ifclassq *, int, mbuf_svc_class_t, 
boolean_t); +static u_int64_t ifclassq_target_qdelay = 0; +SYSCTL_QUAD(_net_classq, OID_AUTO, target_qdelay, CTLFLAG_RW|CTLFLAG_LOCKED, + &ifclassq_target_qdelay, "target queue delay in nanoseconds"); + +static u_int64_t ifclassq_update_interval = 0; +SYSCTL_QUAD(_net_classq, OID_AUTO, update_interval, + CTLFLAG_RW|CTLFLAG_LOCKED, &ifclassq_update_interval, + "update interval in nanoseconds"); + +static int32_t ifclassq_sched_fq_codel; + void classq_init(void) { @@ -82,6 +95,11 @@ classq_init(void) blue_init(); #endif /* CLASSQ_BLUE */ sfb_init(); + fq_codel_scheduler_init(); + + if (!PE_parse_boot_argn("fq_codel", &ifclassq_sched_fq_codel, + sizeof (ifclassq_sched_fq_codel))) + ifclassq_sched_fq_codel = 0; } int @@ -221,9 +239,18 @@ ifclassq_pktsched_setup(struct ifclassq *ifq) break; case IFNET_SCHED_MODEL_NORMAL: - err = pktsched_setup(ifq, PKTSCHEDT_QFQ, ifq->ifcq_sflags); + if (ifclassq_sched_fq_codel != 0) { + err = pktsched_setup(ifq, PKTSCHEDT_FQ_CODEL, + ifq->ifcq_sflags); + } else { + err = pktsched_setup(ifq, PKTSCHEDT_QFQ, + ifq->ifcq_sflags); + } + break; + case IFNET_SCHED_MODEL_FQ_CODEL: + err = pktsched_setup(ifq, PKTSCHEDT_FQ_CODEL, + ifq->ifcq_sflags); break; - default: VERIFY(0); /* NOTREACHED */ @@ -294,26 +321,27 @@ ifclassq_enqueue(struct ifclassq *ifq, struct mbuf *m) } errno_t -ifclassq_dequeue(struct ifclassq *ifq, u_int32_t limit, struct mbuf **head, +ifclassq_dequeue(struct ifclassq *ifq, u_int32_t pkt_limit, + u_int32_t byte_limit, struct mbuf **head, struct mbuf **tail, u_int32_t *cnt, u_int32_t *len) { - return (ifclassq_dequeue_common(ifq, MBUF_SC_UNSPEC, limit, head, tail, - cnt, len, FALSE)); + return (ifclassq_dequeue_common(ifq, MBUF_SC_UNSPEC, pkt_limit, + byte_limit, head, tail, cnt, len, FALSE)); } errno_t ifclassq_dequeue_sc(struct ifclassq *ifq, mbuf_svc_class_t sc, - u_int32_t limit, struct mbuf **head, struct mbuf **tail, u_int32_t *cnt, - u_int32_t *len) + u_int32_t pkt_limit, struct mbuf **head, struct mbuf **tail, + 
u_int32_t *cnt, u_int32_t *len) { - return (ifclassq_dequeue_common(ifq, sc, limit, head, tail, - cnt, len, TRUE)); + return (ifclassq_dequeue_common(ifq, sc, pkt_limit, + CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, head, tail, cnt, len, TRUE)); } static errno_t ifclassq_dequeue_common(struct ifclassq *ifq, mbuf_svc_class_t sc, - u_int32_t limit, struct mbuf **head, struct mbuf **tail, u_int32_t *cnt, - u_int32_t *len, boolean_t drvmgt) + u_int32_t pkt_limit, u_int32_t byte_limit, struct mbuf **head, + struct mbuf **tail, u_int32_t *cnt, u_int32_t *len, boolean_t drvmgt) { struct ifnet *ifp = ifq->ifcq_ifp; u_int32_t i = 0, l = 0; @@ -325,14 +353,30 @@ ifclassq_dequeue_common(struct ifclassq *ifq, mbuf_svc_class_t sc, VERIFY(!drvmgt || MBUF_VALID_SC(sc)); + /* + * If the scheduler support dequeueing multiple packets at the + * same time, call that one instead. + */ + + if (ifq->ifcq_dequeue_multi != NULL) { + int err; + IFCQ_LOCK_SPIN(ifq); + err = ifq->ifcq_dequeue_multi(ifq, CLASSQDQ_REMOVE, + pkt_limit, byte_limit, head, tail, cnt, len); + IFCQ_UNLOCK(ifq); + + if (err == 0 && (*head) == NULL) + err = EAGAIN; + return (err); + } + *head = NULL; first = &(*head); last = NULL; - ifq = &ifp->if_snd; IFCQ_LOCK_SPIN(ifq); - while (i < limit) { + while (i < pkt_limit && l < byte_limit) { #if PF_ALTQ u_int32_t qlen; @@ -391,9 +435,8 @@ ifclassq_dequeue_common(struct ifclassq *ifq, mbuf_svc_class_t sc, #endif /* MEASURE_BW */ if (IFNET_IS_CELLULAR(ifp)) { (*head)->m_pkthdr.pkt_flags |= PKTF_VALID_UNSENT_DATA; - (*head)->m_pkthdr.pkt_unsent_databytes = - (total_snd_byte_count << MSIZESHIFT) + - ifq->ifcq_bytes; + (*head)->m_pkthdr.bufstatus_if = IFCQ_BYTES(ifq); + (*head)->m_pkthdr.bufstatus_sndbuf = ifp->if_sndbyte_unsent; } head = &(*head)->m_nextpkt; i++; @@ -487,7 +530,8 @@ ifclassq_update(struct ifclassq *ifq, cqev_t ev) int ifclassq_attach(struct ifclassq *ifq, u_int32_t type, void *discipline, ifclassq_enq_func enqueue, ifclassq_deq_func dequeue, - ifclassq_deq_sc_func 
dequeue_sc, ifclassq_req_func request) + ifclassq_deq_sc_func dequeue_sc, ifclassq_deq_multi_func dequeue_multi, + ifclassq_req_func request) { IFCQ_LOCK_ASSERT_HELD(ifq); @@ -501,6 +545,7 @@ ifclassq_attach(struct ifclassq *ifq, u_int32_t type, void *discipline, ifq->ifcq_enqueue = enqueue; ifq->ifcq_dequeue = dequeue; ifq->ifcq_dequeue_sc = dequeue_sc; + ifq->ifcq_dequeue_multi = dequeue_multi; ifq->ifcq_request = request; return (0); @@ -825,3 +870,47 @@ ifclassq_tbr_set(struct ifclassq *ifq, struct tb_profile *profile, return (0); } + +void +ifclassq_calc_target_qdelay(struct ifnet *ifp, u_int64_t *if_target_qdelay) +{ + u_int64_t target_qdelay = 0; + target_qdelay = IFCQ_TARGET_QDELAY(&ifp->if_snd); + + if (ifclassq_target_qdelay != 0) + target_qdelay = ifclassq_target_qdelay; + + /* + * If we do not know the effective bandwidth, use the default + * target queue delay. + */ + if (target_qdelay == 0) + target_qdelay = IFQ_TARGET_DELAY; + + /* + * If a delay has been added to ifnet start callback for + * coalescing, we have to add that to the pre-set target delay + * because the packets can be in the queue longer. 
+ */ + if ((ifp->if_eflags & IFEF_ENQUEUE_MULTI) && + ifp->if_start_delay_timeout > 0) + target_qdelay += ifp->if_start_delay_timeout; + + *(if_target_qdelay) = target_qdelay; +} + +void +ifclassq_calc_update_interval(u_int64_t *update_interval) +{ + u_int64_t uint = 0; + + /* If the system level override is set, use it */ + if (ifclassq_update_interval != 0) + uint = ifclassq_update_interval; + + /* Otherwise use the default value */ + if (uint == 0) + uint = IFQ_UPDATE_INTERVAL; + + *update_interval = uint; +} diff --git a/bsd/net/classq/if_classq.h b/bsd/net/classq/if_classq.h index bc8f4191e..de8ddc60c 100644 --- a/bsd/net/classq/if_classq.h +++ b/bsd/net/classq/if_classq.h @@ -111,6 +111,9 @@ typedef int (*ifclassq_enq_func)(struct ifclassq *, struct mbuf *); typedef struct mbuf *(*ifclassq_deq_func)(struct ifclassq *, enum cqdq_op); typedef struct mbuf *(*ifclassq_deq_sc_func)(struct ifclassq *, mbuf_svc_class_t, enum cqdq_op); +typedef int (*ifclassq_deq_multi_func)(struct ifclassq *, enum cqdq_op, + u_int32_t, u_int32_t, struct mbuf **, struct mbuf **, u_int32_t *, + u_int32_t *); typedef int (*ifclassq_req_func)(struct ifclassq *, enum cqrq, void *); /* @@ -158,6 +161,7 @@ struct ifclassq { ifclassq_enq_func ifcq_enqueue; ifclassq_deq_func ifcq_dequeue; ifclassq_deq_sc_func ifcq_dequeue_sc; + ifclassq_deq_multi_func ifcq_dequeue_multi; ifclassq_req_func ifcq_request; /* token bucket regulator */ @@ -184,11 +188,11 @@ struct ifclassq { #define IFCQ_TBR_IS_ENABLED(_ifcq) ((_ifcq)->ifcq_flags & IFCQF_TBR) /* classq enqueue return value */ -#define CLASSQEQ_DROPPED (-1) /* packet dropped (freed) */ -#define CLASSQEQ_SUCCESS 0 /* success, packet enqueued */ -#define CLASSQEQ_SUCCESS_FC 1 /* packet enqueued; */ +#define CLASSQEQ_DROPPED (-1) /* packet dropped (freed) */ +#define CLASSQEQ_SUCCESS 0 /* success, packet enqueued */ +#define CLASSQEQ_SUCCESS_FC 1 /* packet enqueued; */ /* give flow control feedback */ -#define CLASSQEQ_DROPPED_FC 2 /* packet dropped; 
*/ +#define CLASSQEQ_DROPPED_FC 2 /* packet dropped; */ /* give flow control feedback */ #define CLASSQEQ_DROPPED_SP 3 /* packet dropped due to suspension; */ /* give flow control feedback */ @@ -209,11 +213,11 @@ typedef enum cqev { #include #include #include +#include #ifdef __cplusplus extern "C" { #endif - struct if_ifclassq_stats { u_int32_t ifqs_len; u_int32_t ifqs_maxlen; @@ -227,6 +231,7 @@ struct if_ifclassq_stats { struct cbq_classstats ifqs_cbq_stats; struct hfsc_classstats ifqs_hfsc_stats; struct qfq_classstats ifqs_qfq_stats; + struct fq_codel_classstats ifqs_fq_codel_stats; }; } __attribute__((aligned(8))); @@ -342,10 +347,12 @@ struct if_ifclassq_stats { #define IFCQ_DEC_LEN(_ifcq) (IFCQ_LEN(_ifcq)--) #define IFCQ_MAXLEN(_ifcq) ((_ifcq)->ifcq_maxlen) #define IFCQ_SET_MAXLEN(_ifcq, _len) ((_ifcq)->ifcq_maxlen = (_len)) -#define IFCQ_TARGET_QDELAY(_ifcq) ((_ifcq)->ifcq_target_qdelay) +#define IFCQ_TARGET_QDELAY(_ifcq) ((_ifcq)->ifcq_target_qdelay) #define IFCQ_BYTES(_ifcq) ((_ifcq)->ifcq_bytes) -#define IFCQ_INC_BYTES(_ifcq, _len) (IFCQ_BYTES(_ifcq) + _len) -#define IFCQ_DEC_BYTES(_ifcq, _len) (IFCQ_BYTES(_ifcq) - _len) +#define IFCQ_INC_BYTES(_ifcq, _len) \ + ((_ifcq)->ifcq_bytes = (_ifcq)->ifcq_bytes + (_len)) +#define IFCQ_DEC_BYTES(_ifcq, _len) \ + ((_ifcq)->ifcq_bytes = (_ifcq)->ifcq_bytes - (_len)) #define IFCQ_XMIT_ADD(_ifcq, _pkt, _len) do { \ PKTCNTR_ADD(&(_ifcq)->ifcq_xmitcnt, _pkt, _len); \ @@ -363,8 +370,8 @@ extern u_int32_t ifclassq_get_maxlen(struct ifclassq *); extern int ifclassq_get_len(struct ifclassq *, mbuf_svc_class_t, u_int32_t *, u_int32_t *); extern errno_t ifclassq_enqueue(struct ifclassq *, struct mbuf *); -extern errno_t ifclassq_dequeue(struct ifclassq *, u_int32_t, struct mbuf **, - struct mbuf **, u_int32_t *, u_int32_t *); +extern errno_t ifclassq_dequeue(struct ifclassq *, u_int32_t, u_int32_t, + struct mbuf **, struct mbuf **, u_int32_t *, u_int32_t *); extern errno_t ifclassq_dequeue_sc(struct ifclassq *, 
mbuf_svc_class_t, u_int32_t, struct mbuf **, struct mbuf **, u_int32_t *, u_int32_t *); extern struct mbuf *ifclassq_poll(struct ifclassq *); @@ -372,7 +379,7 @@ extern struct mbuf *ifclassq_poll_sc(struct ifclassq *, mbuf_svc_class_t); extern void ifclassq_update(struct ifclassq *, cqev_t); extern int ifclassq_attach(struct ifclassq *, u_int32_t, void *, ifclassq_enq_func, ifclassq_deq_func, ifclassq_deq_sc_func, - ifclassq_req_func); + ifclassq_deq_multi_func, ifclassq_req_func); extern int ifclassq_detach(struct ifclassq *); extern int ifclassq_getqstats(struct ifclassq *, u_int32_t, void *, u_int32_t *); @@ -381,6 +388,10 @@ extern int ifclassq_tbr_set(struct ifclassq *, struct tb_profile *, boolean_t); extern struct mbuf *ifclassq_tbr_dequeue(struct ifclassq *, int); extern struct mbuf *ifclassq_tbr_dequeue_sc(struct ifclassq *, int, mbuf_svc_class_t); +extern void ifclassq_calc_target_qdelay(struct ifnet *ifp, + u_int64_t *if_target_qdelay); +extern void ifclassq_calc_update_interval(u_int64_t *update_interval); + #endif /* BSD_KERNEL_PRIVATE */ #endif /* PRIVATE */ #endif /* _NET_CLASSQ_IF_CLASSQ_H_ */ diff --git a/bsd/net/content_filter.c b/bsd/net/content_filter.c index 9975c99dc..c01c3d078 100644 --- a/bsd/net/content_filter.c +++ b/bsd/net/content_filter.c @@ -3201,11 +3201,13 @@ cfil_action_drop(struct socket *so, uint32_t kcunit) * (forcing fixed along with rdar://19391339) */ error = sosetdefunct(p, so, - SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL, FALSE); + SHUTDOWN_SOCKET_LEVEL_CONTENT_FILTER | SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL, + FALSE); /* Flush the socket buffer and disconnect */ if (error == 0) - error = sodefunct(p, so, SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL); + error = sodefunct(p, so, + SHUTDOWN_SOCKET_LEVEL_CONTENT_FILTER | SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL); /* The filter is done, mark as detached */ entry->cfe_flags |= CFEF_CFIL_DETACHED; diff --git a/bsd/net/dlil.c b/bsd/net/dlil.c index b7fa46e15..7505ec207 100644 --- a/bsd/net/dlil.c +++ 
b/bsd/net/dlil.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1999-2015 Apple Inc. All rights reserved. + * Copyright (c) 1999-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -84,6 +84,7 @@ #include #include #include +#include #endif /* INET */ #if INET6 @@ -117,20 +118,24 @@ #endif /* PF_ALTQ */ #include -#define DBG_LAYER_BEG DLILDBG_CODE(DBG_DLIL_STATIC, 0) -#define DBG_LAYER_END DLILDBG_CODE(DBG_DLIL_STATIC, 2) -#define DBG_FNC_DLIL_INPUT DLILDBG_CODE(DBG_DLIL_STATIC, (1 << 8)) -#define DBG_FNC_DLIL_OUTPUT DLILDBG_CODE(DBG_DLIL_STATIC, (2 << 8)) -#define DBG_FNC_DLIL_IFOUT DLILDBG_CODE(DBG_DLIL_STATIC, (3 << 8)) +#if NECP +#include +#endif /* NECP */ -#define MAX_FRAME_TYPE_SIZE 4 /* LONGWORDS */ -#define MAX_LINKADDR 4 /* LONGWORDS */ -#define M_NKE M_IFADDR +#define DBG_LAYER_BEG DLILDBG_CODE(DBG_DLIL_STATIC, 0) +#define DBG_LAYER_END DLILDBG_CODE(DBG_DLIL_STATIC, 2) +#define DBG_FNC_DLIL_INPUT DLILDBG_CODE(DBG_DLIL_STATIC, (1 << 8)) +#define DBG_FNC_DLIL_OUTPUT DLILDBG_CODE(DBG_DLIL_STATIC, (2 << 8)) +#define DBG_FNC_DLIL_IFOUT DLILDBG_CODE(DBG_DLIL_STATIC, (3 << 8)) + +#define MAX_FRAME_TYPE_SIZE 4 /* LONGWORDS */ +#define MAX_LINKADDR 4 /* LONGWORDS */ +#define M_NKE M_IFADDR #if 1 -#define DLIL_PRINTF printf +#define DLIL_PRINTF printf #else -#define DLIL_PRINTF kprintf +#define DLIL_PRINTF kprintf #endif #define IF_DATA_REQUIRE_ALIGNED_64(f) \ @@ -292,33 +297,27 @@ static struct zone *dlif_proto_zone; /* zone for if_proto */ #define DLIF_PROTO_ZONE_MAX (DLIF_ZONE_MAX*2) /* maximum elements in zone */ #define DLIF_PROTO_ZONE_NAME "ifnet_proto" /* zone name */ -static unsigned int dlif_tcpstat_size; /* size of tcpstat_local to allocate */ -static unsigned int dlif_tcpstat_bufsize; /* size of dlif_tcpstat_size + headroom */ +static unsigned int dlif_tcpstat_size; /* size of tcpstat_local to allocate */ +static unsigned int dlif_tcpstat_bufsize; /* size of dlif_tcpstat_size + headroom */ static struct zone *dlif_tcpstat_zone; 
/* zone for tcpstat_local */ #define DLIF_TCPSTAT_ZONE_MAX 1 /* maximum elements in zone */ #define DLIF_TCPSTAT_ZONE_NAME "ifnet_tcpstat" /* zone name */ -static unsigned int dlif_udpstat_size; /* size of udpstat_local to allocate */ +static unsigned int dlif_udpstat_size; /* size of udpstat_local to allocate */ static unsigned int dlif_udpstat_bufsize; /* size of dlif_udpstat_size + headroom */ static struct zone *dlif_udpstat_zone; /* zone for udpstat_local */ #define DLIF_UDPSTAT_ZONE_MAX 1 /* maximum elements in zone */ #define DLIF_UDPSTAT_ZONE_NAME "ifnet_udpstat" /* zone name */ -/* - * Updating this variable should be done by first acquiring the global - * radix node head (rnh_lock), in tandem with settting/clearing the - * PR_AGGDRAIN for routedomain. - */ -u_int32_t ifnet_aggressive_drainers; static u_int32_t net_rtref; static struct dlil_main_threading_info dlil_main_input_thread_info; __private_extern__ struct dlil_threading_info *dlil_main_input_thread = (struct dlil_threading_info *)&dlil_main_input_thread_info; -static int dlil_event_internal(struct ifnet *ifp, struct kev_msg *msg); +static int dlil_event_internal(struct ifnet *ifp, struct kev_msg *msg, bool update_generation); static int dlil_detach_filter_internal(interface_filter_t filter, int detached); static void dlil_if_trace(struct dlil_ifnet *, int); static void if_proto_ref(struct if_proto *); @@ -356,6 +355,9 @@ static errno_t ifproto_media_send_arp(struct ifnet *, u_short, static errno_t ifp_if_output(struct ifnet *, struct mbuf *); static void ifp_if_start(struct ifnet *); +static errno_t ifp_if_input(struct ifnet *ifp, struct mbuf *m_head, + struct mbuf *m_tail, const struct ifnet_stat_increment_param *s, + boolean_t poll, struct thread *tp); static void ifp_if_input_poll(struct ifnet *, u_int32_t, u_int32_t, struct mbuf **, struct mbuf **, u_int32_t *, u_int32_t *); static errno_t ifp_if_ctl(struct ifnet *, ifnet_ctl_cmd_t, u_int32_t, void *); @@ -476,12 +478,12 @@ static struct zone 
*ifnet_fc_zone; /* ifnet_fc_entry zone */ #define IFNET_FC_ZONE_NAME "ifnet_fc_zone" #define IFNET_FC_ZONE_MAX 32 -extern void bpfdetach(struct ifnet*); +extern void bpfdetach(struct ifnet *); extern void proto_input_run(void); -extern uint32_t udp_count_opportunistic(unsigned int ifindex, +extern uint32_t udp_count_opportunistic(unsigned int ifindex, u_int32_t flags); -extern uint32_t tcp_count_opportunistic(unsigned int ifindex, +extern uint32_t tcp_count_opportunistic(unsigned int ifindex, u_int32_t flags); __private_extern__ void link_rtrequest(int, struct rtentry *, struct sockaddr *); @@ -520,7 +522,7 @@ SYSCTL_PROC(_net_link_generic_system, OID_AUTO, sndq_maxlen, sysctl_sndq_maxlen, "I", "Default transmit queue max length"); #define IF_RCVQ_MINLEN 32 -#define IF_RCVQ_MAXLEN 256 +#define IF_RCVQ_MAXLEN 256 u_int32_t if_rcvq_maxlen = IF_RCVQ_MAXLEN; SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rcvq_maxlen, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rcvq_maxlen, IFQ_MAXLEN, @@ -596,12 +598,12 @@ SYSCTL_INT(_net_link_generic_system, OID_AUTO, if_bw_measure_size, static u_int32_t cur_dlil_input_threads = 0; SYSCTL_UINT(_net_link_generic_system, OID_AUTO, dlil_input_threads, - CTLFLAG_RD | CTLFLAG_LOCKED, &cur_dlil_input_threads , 0, + CTLFLAG_RD | CTLFLAG_LOCKED, &cur_dlil_input_threads, 0, "Current number of DLIL input threads"); #if IFNET_INPUT_SANITY_CHK SYSCTL_UINT(_net_link_generic_system, OID_AUTO, dlil_input_sanity_check, - CTLFLAG_RW | CTLFLAG_LOCKED, &dlil_input_sanity_check , 0, + CTLFLAG_RW | CTLFLAG_LOCKED, &dlil_input_sanity_check, 0, "Turn on sanity checking in DLIL input"); #endif /* IFNET_INPUT_SANITY_CHK */ @@ -716,12 +718,17 @@ SYSCTL_PROC(_net_link_generic_system, OID_AUTO, tx_chain_len_stats, uint32_t tx_chain_len_count = 0; SYSCTL_UINT(_net_link_generic_system, OID_AUTO, tx_chain_len_count, - CTLFLAG_RW | CTLFLAG_LOCKED, &tx_chain_len_count, 0, - ""); + CTLFLAG_RW | CTLFLAG_LOCKED, &tx_chain_len_count, 0, ""); 
SYSCTL_NODE(_net_link_generic_system, OID_AUTO, get_ports_used, CTLFLAG_RD | CTLFLAG_LOCKED, sysctl_get_ports_used, ""); +#if (DEVELOPMENT || DEBUG) +static int sysctl_get_kao_frames SYSCTL_HANDLER_ARGS; +SYSCTL_NODE(_net_link_generic_system, OID_AUTO, get_kao_frames, + CTLFLAG_RD | CTLFLAG_LOCKED, sysctl_get_kao_frames, ""); +#endif /* DEVELOPMENT || DEBUG */ + unsigned int net_rxpoll = 1; unsigned int net_affinity = 1; static kern_return_t dlil_affinity_set(struct thread *, u_int32_t); @@ -778,7 +785,7 @@ proto_hash_value(u_int32_t protocol_family) * the hash bucket index and the protocol family defined * here; future changes must be applied there as well. */ - switch(protocol_family) { + switch (protocol_family) { case PF_INET: return (0); case PF_INET6: @@ -863,7 +870,7 @@ if_proto_free(struct if_proto *proto) dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_DETACHED, (struct net_event_data *)&ev_pr_data, - sizeof(struct kev_dl_proto_data)); + sizeof (struct kev_dl_proto_data)); zfree(dlif_proto_zone, proto); } @@ -976,11 +983,17 @@ ifnet_head_done(void) lck_rw_done(&ifnet_head_lock); } +__private_extern__ void +ifnet_head_assert_exclusive(void) +{ + lck_rw_assert(&ifnet_head_lock, LCK_RW_ASSERT_EXCLUSIVE); +} + /* * Caller must already be holding ifnet lock. 
*/ static int -dlil_ifp_proto_count(struct ifnet * ifp) +dlil_ifp_proto_count(struct ifnet *ifp) { int i, count = 0; @@ -1021,18 +1034,23 @@ dlil_post_msg(struct ifnet *ifp, u_int32_t event_subclass, if (event_data == NULL) { event_data = &ev_data; - event_data_len = sizeof(struct net_event_data); + event_data_len = sizeof (struct net_event_data); } strlcpy(&event_data->if_name[0], ifp->if_name, IFNAMSIZ); event_data->if_family = ifp->if_family; - event_data->if_unit = (u_int32_t) ifp->if_unit; + event_data->if_unit = (u_int32_t)ifp->if_unit; ev_msg.dv[0].data_length = event_data_len; ev_msg.dv[0].data_ptr = event_data; ev_msg.dv[1].data_length = 0; - dlil_event_internal(ifp, &ev_msg); + /* Don't update interface generation for quality and RRC state changess */ + bool update_generation = (event_subclass != KEV_DL_SUBCLASS || + (event_code != KEV_DL_LINK_QUALITY_METRIC_CHANGED && + event_code != KEV_DL_RRC_STATE_CHANGED)); + + dlil_event_internal(ifp, &ev_msg, update_generation); } __private_extern__ int @@ -1097,7 +1115,7 @@ dlil_alloc_local_stats(struct ifnet *ifp) if (ifp->if_ipv4_stat == NULL) { MALLOC(ifp->if_ipv4_stat, struct if_tcp_ecn_stat *, - sizeof(struct if_tcp_ecn_stat), M_TEMP, M_WAITOK|M_ZERO); + sizeof (struct if_tcp_ecn_stat), M_TEMP, M_WAITOK|M_ZERO); if (ifp->if_ipv4_stat == NULL) { ret = ENOMEM; goto end; @@ -1106,7 +1124,7 @@ dlil_alloc_local_stats(struct ifnet *ifp) if (ifp->if_ipv6_stat == NULL) { MALLOC(ifp->if_ipv6_stat, struct if_tcp_ecn_stat *, - sizeof(struct if_tcp_ecn_stat), M_TEMP, M_WAITOK|M_ZERO); + sizeof (struct if_tcp_ecn_stat), M_TEMP, M_WAITOK|M_ZERO); if (ifp->if_ipv6_stat == NULL) { ret = ENOMEM; goto end; @@ -1309,7 +1327,7 @@ dlil_init(void) * The following fields must be 64-bit aligned for atomic operations. 
*/ IF_DATA_REQUIRE_ALIGNED_64(ifi_ipackets); - IF_DATA_REQUIRE_ALIGNED_64(ifi_ierrors) + IF_DATA_REQUIRE_ALIGNED_64(ifi_ierrors); IF_DATA_REQUIRE_ALIGNED_64(ifi_opackets); IF_DATA_REQUIRE_ALIGNED_64(ifi_oerrors); IF_DATA_REQUIRE_ALIGNED_64(ifi_collisions); @@ -1325,7 +1343,7 @@ dlil_init(void) IF_DATA_REQUIRE_ALIGNED_64(ifi_fbytes); IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ipackets); - IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ierrors) + IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ierrors); IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_opackets); IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_oerrors); IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_collisions); @@ -1409,6 +1427,7 @@ dlil_init(void) _CASSERT(IFRTYPE_SUBFAMILY_WIFI == IFNET_SUBFAMILY_WIFI); _CASSERT(IFRTYPE_SUBFAMILY_THUNDERBOLT == IFNET_SUBFAMILY_THUNDERBOLT); _CASSERT(IFRTYPE_SUBFAMILY_RESERVED == IFNET_SUBFAMILY_RESERVED); + _CASSERT(IFRTYPE_SUBFAMILY_INTCOPROC == IFNET_SUBFAMILY_INTCOPROC); _CASSERT(DLIL_MODIDLEN == IFNET_MODIDLEN); _CASSERT(DLIL_MODARGLEN == IFNET_MODARGLEN); @@ -1509,6 +1528,7 @@ dlil_init(void) TAILQ_INIT(&dlil_ifnet_head); TAILQ_INIT(&ifnet_head); TAILQ_INIT(&ifnet_detaching_head); + TAILQ_INIT(&ifnet_ordered_head); /* Setup the lock groups we will use */ dlil_grp_attributes = lck_grp_attr_alloc_init(); @@ -1567,6 +1587,9 @@ dlil_init(void) /* Initialize the pktap virtual interface */ pktap_init(); + /* Initialize the service class to dscp map */ + net_qos_map_init(); + #if DEBUG /* Run self-tests */ dlil_verify_sum16(); @@ -1758,9 +1781,6 @@ dlil_detach_filter_internal(interface_filter_t filter, int detached) if (filter->filt_detached) filter->filt_detached(filter->filt_cookie, filter->filt_ifp); - /* Free the filter */ - zfree(dlif_filt_zone, filter); - /* * Decrease filter count and route_generation ID to let TCP * know it should reevalute doing TSO or not @@ -1769,11 +1789,16 @@ dlil_detach_filter_internal(interface_filter_t filter, int detached) OSAddAtomic(-1, &dlil_filter_disable_tso_count); routegenid_update(); 
} + + /* Free the filter */ + zfree(dlif_filt_zone, filter); + filter = NULL; done: - if (retval != 0) { + if (retval != 0 && filter != NULL) { DLIL_PRINTF("failed to detach %s filter (err=%d)\n", filter->filt_name, retval); } + return (retval); } @@ -1795,6 +1820,7 @@ dlil_detach_filter(interface_filter_t filter) * c) protocol registrations * d) packet injections */ +__attribute__((noreturn)) static void dlil_main_input_thread_func(void *v, wait_result_t w) { @@ -1831,21 +1857,21 @@ dlil_main_input_thread_func(void *v, wait_result_t w) /* Packets for non-dedicated interfaces other than lo0 */ m_cnt = qlen(&inp->rcvq_pkts); - m = _getq_all(&inp->rcvq_pkts); + m = _getq_all(&inp->rcvq_pkts, NULL, NULL, NULL); /* Packets exclusive to lo0 */ m_cnt_loop = qlen(&inpm->lo_rcvq_pkts); - m_loop = _getq_all(&inpm->lo_rcvq_pkts); + m_loop = _getq_all(&inpm->lo_rcvq_pkts, NULL, NULL, NULL); inp->wtot = 0; lck_mtx_unlock(&inp->input_lck); /* - * NOTE warning %%% attention !!!! - * We should think about putting some thread starvation - * safeguards if we deal with long chains of packets. - */ + * NOTE warning %%% attention !!!! + * We should think about putting some thread starvation + * safeguards if we deal with long chains of packets. + */ if (m_loop != NULL) dlil_input_packet_list_extended(lo_ifp, m_loop, m_cnt_loop, inp->mode); @@ -1869,9 +1895,15 @@ static void dlil_input_thread_func(void *v, wait_result_t w) { #pragma unused(w) + char thread_name[MAXTHREADNAMESIZE]; struct dlil_threading_info *inp = v; struct ifnet *ifp = inp->ifp; + /* Construct the name for this thread, and then apply it. 
*/ + bzero(thread_name, sizeof(thread_name)); + snprintf(thread_name, sizeof(thread_name), "dlil_input_%s", ifp->if_xname); + thread_set_thread_name(inp->input_thr, thread_name); + VERIFY(inp != dlil_main_input_thread); VERIFY(ifp != NULL); VERIFY(!(ifp->if_eflags & IFEF_RXPOLL) || !net_rxpoll); @@ -1905,7 +1937,7 @@ dlil_input_thread_func(void *v, wait_result_t w) /* Packets for this interface */ m_cnt = qlen(&inp->rcvq_pkts); - m = _getq_all(&inp->rcvq_pkts); + m = _getq_all(&inp->rcvq_pkts, NULL, NULL, NULL); if (inp->input_waiting & DLIL_INPUT_TERMINATE) { lck_mtx_unlock(&inp->input_lck); @@ -1926,10 +1958,10 @@ dlil_input_thread_func(void *v, wait_result_t w) lck_mtx_unlock(&inp->input_lck); /* - * NOTE warning %%% attention !!!! - * We should think about putting some thread starvation - * safeguards if we deal with long chains of packets. - */ + * NOTE warning %%% attention !!!! + * We should think about putting some thread starvation + * safeguards if we deal with long chains of packets. + */ if (m != NULL) dlil_input_packet_list_extended(NULL, m, m_cnt, inp->mode); @@ -2011,7 +2043,7 @@ dlil_rxpoll_input_thread_func(void *v, wait_result_t w) m_size = qsize(&inp->rcvq_pkts); /* Packets for this interface */ - m = _getq_all(&inp->rcvq_pkts); + m = _getq_all(&inp->rcvq_pkts, NULL, NULL, NULL); VERIFY(m != NULL || m_cnt == 0); nanouptime(&now); @@ -2172,10 +2204,10 @@ dlil_rxpoll_input_thread_func(void *v, wait_result_t w) } /* - * NOTE warning %%% attention !!!! - * We should think about putting some thread starvation - * safeguards if we deal with long chains of packets. - */ + * NOTE warning %%% attention !!!! + * We should think about putting some thread starvation + * safeguards if we deal with long chains of packets. 
+ */ if (m != NULL) dlil_input_packet_list_extended(NULL, m, m_cnt, mode); } @@ -2343,10 +2375,11 @@ static errno_t ifnet_input_common(struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail, const struct ifnet_stat_increment_param *s, boolean_t ext, boolean_t poll) { - struct thread *tp = current_thread(); - struct mbuf *last; - struct dlil_threading_info *inp; + ifnet_input_handler_func handler_func; + struct ifnet_stat_increment_param _s; u_int32_t m_cnt = 0, m_size = 0; + struct mbuf *last; + errno_t err = 0; if ((m_head == NULL && !poll) || (s == NULL && ext)) { if (m_head != NULL) @@ -2369,6 +2402,9 @@ ifnet_input_common(struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail, return (EINVAL); } + handler_func = ifp->if_input_handler; + VERIFY(handler_func != NULL); + if (m_tail == NULL) { last = m_head; while (m_head != NULL) { @@ -2426,6 +2462,66 @@ ifnet_input_common(struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail, s->packets_in, m_cnt); } + if (s == NULL) { + bzero(&_s, sizeof (_s)); + s = &_s; + } else { + _s = *s; + } + _s.packets_in = m_cnt; + _s.bytes_in = m_size; + + err = (*handler_func)(ifp, m_head, m_tail, s, poll, current_thread()); + + if (ifp != lo_ifp) { + /* Release the IO refcnt */ + ifnet_decr_iorefcnt(ifp); + } + + return (err); +} + +errno_t +ifnet_set_input_handler(struct ifnet *ifp, ifnet_input_handler_func fn) +{ + return (atomic_test_set_ptr(&ifp->if_input_handler, + dlil_input_handler, fn) ? 0 : EBUSY); +} + +void +ifnet_reset_input_handler(struct ifnet *ifp) +{ + atomic_set_ptr(&ifp->if_input_handler, dlil_input_handler); +} + +errno_t +ifnet_set_output_handler(struct ifnet *ifp, ifnet_output_handler_func fn) +{ + return (atomic_test_set_ptr(&ifp->if_output_handler, + dlil_output_handler, fn) ? 
0 : EBUSY); +} + +void +ifnet_reset_output_handler(struct ifnet *ifp) +{ + atomic_set_ptr(&ifp->if_output_handler, dlil_output_handler); +} + +errno_t +dlil_output_handler(struct ifnet *ifp, struct mbuf *m) +{ + return (ifp->if_output(ifp, m)); +} + +errno_t +dlil_input_handler(struct ifnet *ifp, struct mbuf *m_head, + struct mbuf *m_tail, const struct ifnet_stat_increment_param *s, + boolean_t poll, struct thread *tp) +{ + struct dlil_threading_info *inp; + u_int32_t m_cnt = s->packets_in; + u_int32_t m_size = s->bytes_in; + if ((inp = ifp->if_inp) == NULL) inp = dlil_main_input_thread; @@ -2435,7 +2531,7 @@ ifnet_input_common(struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail, * will only do this once. */ lck_mtx_lock_spin(&inp->input_lck); - if (inp != dlil_main_input_thread && inp->net_affinity && + if (inp != dlil_main_input_thread && inp->net_affinity && tp != NULL && ((!poll && inp->wloop_thr == THREAD_NULL) || (poll && inp->poll_thr == THREAD_NULL))) { u_int32_t tag = inp->tag; @@ -2463,7 +2559,7 @@ ifnet_input_common(struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail, VERIFY(m_head != NULL || (m_tail == NULL && m_cnt == 0)); - /* + /* * Because of loopbacked multicast we cannot stuff the ifp in * the rcvif of the packet header: loopback (lo0) packets use a * dedicated list so that we can later associate them with lo_ifp @@ -2501,17 +2597,15 @@ ifnet_input_common(struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail, } #endif /* IFNET_INPUT_SANITY_CHK */ - if (s != NULL) { - dlil_input_stats_add(s, inp, poll); - /* - * If we're using the main input thread, synchronize the - * stats now since we have the interface context. All - * other cases involving dedicated input threads will - * have their stats synchronized there. 
- */ - if (inp == dlil_main_input_thread) - dlil_input_stats_sync(ifp, inp); - } + dlil_input_stats_add(s, inp, poll); + /* + * If we're using the main input thread, synchronize the + * stats now since we have the interface context. All + * other cases involving dedicated input threads will + * have their stats synchronized there. + */ + if (inp == dlil_main_input_thread) + dlil_input_stats_sync(ifp, inp); inp->input_waiting |= DLIL_INPUT_WAITING; if (!(inp->input_waiting & DLIL_INPUT_RUNNING)) { @@ -2520,11 +2614,6 @@ ifnet_input_common(struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail, } lck_mtx_unlock(&inp->input_lck); - if (ifp != lo_ifp) { - /* Release the IO refcnt */ - ifnet_decr_iorefcnt(ifp); - } - return (0); } @@ -2549,8 +2638,8 @@ ifnet_start_common(struct ifnet *ifp, int resetfc) ifp->if_start_req++; if (!ifp->if_start_active && ifp->if_start_thread != THREAD_NULL && (resetfc || !(ifp->if_eflags & IFEF_ENQUEUE_MULTI) || - IFCQ_LEN(&ifp->if_snd) >= ifp->if_start_delay_qlen - || ifp->if_start_delayed == 0)) { + IFCQ_LEN(&ifp->if_snd) >= ifp->if_start_delay_qlen || + ifp->if_start_delayed == 0)) { wakeup_one((caddr_t)&ifp->if_start_thread); } lck_mtx_unlock(&ifp->if_start_lock); @@ -2568,10 +2657,16 @@ ifnet_start_thread_fn(void *v, wait_result_t w) #pragma unused(w) struct ifnet *ifp = v; char ifname[IFNAMSIZ + 1]; + char thread_name[MAXTHREADNAMESIZE]; struct timespec *ts = NULL; struct ifclassq *ifq = &ifp->if_snd; struct timespec delay_start_ts; + /* Construct the name for this thread, and then apply it. 
*/ + bzero(thread_name, sizeof(thread_name)); + snprintf(thread_name, sizeof(thread_name), "ifnet_start_%s", ifp->if_xname); + thread_set_thread_name(ifp->if_start_thread, thread_name); + /* * Treat the dedicated starter thread for lo0 as equivalent to * the driver workloop thread; if net_affinity is enabled for @@ -2606,7 +2701,8 @@ ifnet_start_thread_fn(void *v, wait_result_t w) for (;;) { if (ifp->if_start_thread != NULL) - (void) msleep(&ifp->if_start_thread, &ifp->if_start_lock, + (void) msleep(&ifp->if_start_thread, + &ifp->if_start_lock, (PZERO - 1) | PSPIN, ifname, ts); /* interface is detached? */ @@ -2915,8 +3011,7 @@ ifnet_set_output_sched_model(struct ifnet *ifp, u_int32_t model) u_int32_t omodel; errno_t err; - if (ifp == NULL || (model != IFNET_SCHED_MODEL_DRIVER_MANAGED && - model != IFNET_SCHED_MODEL_NORMAL)) + if (ifp == NULL || model >= IFNET_SCHED_MODEL_MAX) return (EINVAL); else if (!(ifp->if_eflags & IFEF_TXSTART)) return (ENXIO); @@ -3055,7 +3150,8 @@ ifnet_enqueue(struct ifnet *ifp, struct mbuf *m) nanouptime(&now); net_timernsec(&now, &now_nsec); - m->m_pkthdr.pkt_enqueue_ts = now_nsec; + m->m_pkthdr.pkt_timestamp = now_nsec; + m->m_pkthdr.pkt_flags &= ~PKTF_DRV_TS_VALID; if (ifp->if_eflags & IFEF_ENQUEUE_MULTI) { /* @@ -3071,7 +3167,7 @@ ifnet_enqueue(struct ifnet *ifp, struct mbuf *m) * 3. If the time elapsed since last enqueue is more * than 200ms we disable delaying start callback. This is * is to take idle time into account. 
- */ + */ u_int64_t dwin = (ifp->if_start_delay_timeout << 1); if (ifp->if_start_delay_swin > 0) { if ((ifp->if_start_delay_swin + dwin) > now_nsec) { @@ -3098,7 +3194,7 @@ ifnet_enqueue(struct ifnet *ifp, struct mbuf *m) } else { ifp->if_start_delay_idle++; } - } + } ifp->if_start_delay_swin = now_nsec; ifp->if_start_delay_cnt = 1; } @@ -3134,11 +3230,12 @@ ifnet_dequeue(struct ifnet *ifp, struct mbuf **mp) if (ifp == NULL || mp == NULL) return (EINVAL); else if (!(ifp->if_eflags & IFEF_TXSTART) || - (ifp->if_output_sched_model != IFNET_SCHED_MODEL_NORMAL)) + ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) return (ENXIO); if (!ifnet_is_attached(ifp, 1)) return (ENXIO); - rc = ifclassq_dequeue(&ifp->if_snd, 1, mp, NULL, NULL, NULL); + rc = ifclassq_dequeue(&ifp->if_snd, 1, CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, + mp, NULL, NULL, NULL); ifnet_decr_iorefcnt(ifp); return (rc); @@ -3152,48 +3249,69 @@ ifnet_dequeue_service_class(struct ifnet *ifp, mbuf_svc_class_t sc, if (ifp == NULL || mp == NULL || !MBUF_VALID_SC(sc)) return (EINVAL); else if (!(ifp->if_eflags & IFEF_TXSTART) || - (ifp->if_output_sched_model != IFNET_SCHED_MODEL_DRIVER_MANAGED)) + ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) return (ENXIO); if (!ifnet_is_attached(ifp, 1)) return (ENXIO); - + rc = ifclassq_dequeue_sc(&ifp->if_snd, sc, 1, mp, NULL, NULL, NULL); ifnet_decr_iorefcnt(ifp); return (rc); } errno_t -ifnet_dequeue_multi(struct ifnet *ifp, u_int32_t limit, struct mbuf **head, - struct mbuf **tail, u_int32_t *cnt, u_int32_t *len) +ifnet_dequeue_multi(struct ifnet *ifp, u_int32_t pkt_limit, + struct mbuf **head, struct mbuf **tail, u_int32_t *cnt, u_int32_t *len) { errno_t rc; - if (ifp == NULL || head == NULL || limit < 1) + if (ifp == NULL || head == NULL || pkt_limit < 1) return (EINVAL); else if (!(ifp->if_eflags & IFEF_TXSTART) || - (ifp->if_output_sched_model != IFNET_SCHED_MODEL_NORMAL)) + ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) return (ENXIO); if 
(!ifnet_is_attached(ifp, 1)) return (ENXIO); - - rc = ifclassq_dequeue(&ifp->if_snd, limit, head, tail, cnt, len); + + rc = ifclassq_dequeue(&ifp->if_snd, pkt_limit, + CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, head, tail, cnt, len); + ifnet_decr_iorefcnt(ifp); + return (rc); +} + +errno_t +ifnet_dequeue_multi_bytes(struct ifnet *ifp, u_int32_t byte_limit, + struct mbuf **head, struct mbuf **tail, u_int32_t *cnt, u_int32_t *len) +{ + errno_t rc; + if (ifp == NULL || head == NULL || byte_limit < 1) + return (EINVAL); + else if (!(ifp->if_eflags & IFEF_TXSTART) || + ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) + return (ENXIO); + if (!ifnet_is_attached(ifp, 1)) + return (ENXIO); + + rc = ifclassq_dequeue(&ifp->if_snd, CLASSQ_DEQUEUE_MAX_PKT_LIMIT, + byte_limit, head, tail, cnt, len); ifnet_decr_iorefcnt(ifp); return (rc); } errno_t ifnet_dequeue_service_class_multi(struct ifnet *ifp, mbuf_svc_class_t sc, - u_int32_t limit, struct mbuf **head, struct mbuf **tail, u_int32_t *cnt, + u_int32_t pkt_limit, struct mbuf **head, struct mbuf **tail, u_int32_t *cnt, u_int32_t *len) { errno_t rc; - if (ifp == NULL || head == NULL || limit < 1 || !MBUF_VALID_SC(sc)) + if (ifp == NULL || head == NULL || pkt_limit < 1 || + !MBUF_VALID_SC(sc)) return (EINVAL); else if (!(ifp->if_eflags & IFEF_TXSTART) || - (ifp->if_output_sched_model != IFNET_SCHED_MODEL_DRIVER_MANAGED)) + ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) return (ENXIO); if (!ifnet_is_attached(ifp, 1)) return (ENXIO); - rc = ifclassq_dequeue_sc(&ifp->if_snd, sc, limit, head, + rc = ifclassq_dequeue_sc(&ifp->if_snd, sc, pkt_limit, head, tail, cnt, len); ifnet_decr_iorefcnt(ifp); return (rc); @@ -3325,7 +3443,6 @@ dlil_ifproto_input(struct if_proto * ifproto, mbuf_t m) if (error != 0 && error != EJUSTRETURN) m_freem_list(m); } - return; } static void @@ -3407,6 +3524,8 @@ dlil_input_stats_sync(struct ifnet *ifp, struct dlil_threading_info *inp) (ifp->if_ibytes + ifp->if_obytes) - ifp->if_dt_bytes > 
ifp->if_data_threshold) { ifp->if_dt_bytes = ifp->if_ibytes + ifp->if_obytes; + + lck_mtx_convert_spin(&inp->input_lck); nstat_ifnet_threshold_reached(ifp->if_index); } /* @@ -3451,7 +3570,7 @@ dlil_input_packet_list_common(struct ifnet *ifp_param, struct mbuf *m, mbuf_t * pkt_next = NULL; u_int32_t poll_thresh = 0, poll_ival = 0; - KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_START,0,0,0,0,0); + KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_START, 0, 0, 0, 0, 0); if (ext && mode == IFNET_MODEL_INPUT_POLL_ON && cnt > 1 && (poll_ival = if_rxpoll_interval_pkts) > 0) @@ -3563,7 +3682,7 @@ dlil_input_packet_list_common(struct ifnet *ifp_param, struct mbuf *m, goto next; } } - if (error != 0 || ((m->m_flags & M_PROMISC) != 0) ) { + if (error != 0 || ((m->m_flags & M_PROMISC) != 0)) { m_freem(m); goto next; } @@ -3628,7 +3747,7 @@ dlil_input_packet_list_common(struct ifnet *ifp_param, struct mbuf *m, ifnet_decr_iorefcnt(ifp); } - KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_END,0,0,0,0,0); + KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0); } errno_t @@ -3648,10 +3767,24 @@ if_mcasts_update(struct ifnet *ifp) return (0); } - -#define TMP_IF_PROTO_ARR_SIZE 10 +/* If ifp is set, we will increment the generation for the interface */ +int +dlil_post_complete_msg(struct ifnet *ifp, struct kev_msg *event) +{ + if (ifp != NULL) { + ifnet_increment_generation(ifp); + } + +#if NECP + necp_update_all_clients(); +#endif /* NECP */ + + return (kev_post_msg(event)); +} + +#define TMP_IF_PROTO_ARR_SIZE 10 static int -dlil_event_internal(struct ifnet *ifp, struct kev_msg *event) +dlil_event_internal(struct ifnet *ifp, struct kev_msg *event, bool update_generation) { struct ifnet_filter *filter = NULL; struct if_proto *proto = NULL; @@ -3736,7 +3869,7 @@ dlil_event_internal(struct ifnet *ifp, struct kev_msg *event) if_proto_free(proto); } -cleanup: +cleanup: if (tmp_malloc) { FREE(tmp_ifproto_arr, M_TEMP); } @@ -3748,28 +3881,28 @@ dlil_event_internal(struct ifnet *ifp, struct 
kev_msg *event) /* Release the io ref count */ ifnet_decr_iorefcnt(ifp); done: - return (kev_post_msg(event)); + return (dlil_post_complete_msg(update_generation ? ifp : NULL, event)); } errno_t ifnet_event(ifnet_t ifp, struct kern_event_msg *event) { - struct kev_msg kev_msg; + struct kev_msg kev_msg; int result = 0; if (ifp == NULL || event == NULL) return (EINVAL); bzero(&kev_msg, sizeof (kev_msg)); - kev_msg.vendor_code = event->vendor_code; - kev_msg.kev_class = event->kev_class; - kev_msg.kev_subclass = event->kev_subclass; - kev_msg.event_code = event->event_code; + kev_msg.vendor_code = event->vendor_code; + kev_msg.kev_class = event->kev_class; + kev_msg.kev_subclass = event->kev_subclass; + kev_msg.event_code = event->event_code; kev_msg.dv[0].data_ptr = &event->event_data[0]; kev_msg.dv[0].data_length = event->total_size - KEV_MSG_HEADER_SIZE; kev_msg.dv[1].data_length = 0; - result = dlil_event_internal(ifp, &kev_msg); + result = dlil_event_internal(ifp, &kev_msg, TRUE); return (result); } @@ -3944,6 +4077,7 @@ errno_t dlil_output(ifnet_t ifp, protocol_family_t proto_family, mbuf_t packetlist, void *route, const struct sockaddr *dest, int raw, struct flowadv *adv) { + ifnet_output_handler_func handler_func; char *frame_type = NULL; char *dst_linkaddr = NULL; int retval = 0; @@ -3960,14 +4094,19 @@ dlil_output(ifnet_t ifp, protocol_family_t proto_family, mbuf_t packetlist, KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_START, 0, 0, 0, 0, 0); - /* Get an io refcnt if the interface is attached to prevent ifnet_detach - * from happening while this operation is in progress */ + /* + * Get an io refcnt if the interface is attached to prevent ifnet_detach + * from happening while this operation is in progress + */ if (!ifnet_is_attached(ifp, 1)) { retval = ENXIO; goto cleanup; } iorefcnt = 1; + handler_func = ifp->if_output_handler; + VERIFY(handler_func != NULL); + /* update the driver's multicast filter, if needed */ if (ifp->if_updatemcasts > 0 && 
if_mcasts_update(ifp) == 0) ifp->if_updatemcasts = 0; @@ -4024,16 +4163,16 @@ dlil_output(ifnet_t ifp, protocol_family_t proto_family, mbuf_t packetlist, do { #if CONFIG_DTRACE if (!raw && proto_family == PF_INET) { - struct ip *ip = mtod(m, struct ip*); - DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL, + struct ip *ip = mtod(m, struct ip *); + DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL, struct ip *, ip, struct ifnet *, ifp, struct ip *, ip, struct ip6_hdr *, NULL); } else if (!raw && proto_family == PF_INET6) { - struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr*); - DTRACE_IP6(send, struct mbuf*, m, struct inpcb *, NULL, - struct ip6_hdr *, ip6, struct ifnet*, ifp, - struct ip*, NULL, struct ip6_hdr *, ip6); + struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); + DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL, + struct ip6_hdr *, ip6, struct ifnet *, ifp, + struct ip *, NULL, struct ip6_hdr *, ip6); } #endif /* CONFIG_DTRACE */ @@ -4175,7 +4314,7 @@ dlil_output(ifnet_t ifp, protocol_family_t proto_family, mbuf_t packetlist, } KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START, 0, 0, 0, 0, 0); - retval = (*ifp->if_output)(ifp, m); + retval = (*handler_func)(ifp, m); if (retval == EQFULL || retval == EQSUSPENDED) { if (adv != NULL && adv->code == FADV_SUCCESS) { adv->code = (retval == EQFULL ? @@ -4210,7 +4349,7 @@ dlil_output(ifnet_t ifp, protocol_family_t proto_family, mbuf_t packetlist, KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START, 0, 0, 0, 0, 0); if (ifp->if_eflags & IFEF_SENDLIST) { - retval = (*ifp->if_output)(ifp, send_head); + retval = (*handler_func)(ifp, send_head); if (retval == EQFULL || retval == EQSUSPENDED) { if (adv != NULL) { adv->code = (retval == EQFULL ? 
@@ -4235,7 +4374,7 @@ dlil_output(ifnet_t ifp, protocol_family_t proto_family, mbuf_t packetlist, send_m = send_head; send_head = send_m->m_nextpkt; send_m->m_nextpkt = NULL; - retval = (*ifp->if_output)(ifp, send_m); + retval = (*handler_func)(ifp, send_m); if (retval == EQFULL || retval == EQSUSPENDED) { if (adv != NULL) { adv->code = (retval == EQFULL ? @@ -4250,7 +4389,8 @@ dlil_output(ifnet_t ifp, protocol_family_t proto_family, mbuf_t packetlist, fpkts++; } if (retval != 0 && dlil_verbose) { - printf("%s: output error on %s retval = %d\n", + printf("%s: output error on %s " + "retval = %d\n", __func__, if_name(ifp), retval); } } @@ -4296,7 +4436,8 @@ ifnet_ioctl(ifnet_t ifp, protocol_family_t proto_fam, u_long ioctl_code, if (!ifnet_is_attached(ifp, 1)) return (EOPNOTSUPP); - /* Run the interface filters first. + /* + * Run the interface filters first. * We want to run all filters before calling the protocol, * interface family, or interface. */ @@ -4398,7 +4539,7 @@ dlil_set_bpf_tap(ifnet_t ifp, bpf_tap_mode mode, bpf_packet_func callback) if (ifp->if_set_bpf_tap) { /* Get an io reference on the interface if it is attached */ if (!ifnet_is_attached(ifp, 1)) - return ENXIO; + return (ENXIO); error = ifp->if_set_bpf_tap(ifp, mode, callback); ifnet_decr_iorefcnt(ifp); } @@ -4415,7 +4556,7 @@ dlil_resolve_multi(struct ifnet *ifp, const struct sockaddr *proto_addr, proto_media_resolve_multi resolvep; if (!ifnet_is_attached(ifp, 1)) - return result; + return (result); bzero(ll_addr, ll_len); @@ -4428,7 +4569,7 @@ dlil_resolve_multi(struct ifnet *ifp, const struct sockaddr *proto_addr, proto->kpi.v1.resolve_multi : proto->kpi.v2.resolve_multi); if (resolvep != NULL) result = resolvep(ifp, proto_addr, - (struct sockaddr_dl*)(void *)ll_addr, ll_len); + (struct sockaddr_dl *)(void *)ll_addr, ll_len); if_proto_free(proto); } @@ -4447,8 +4588,8 @@ dlil_resolve_multi(struct ifnet *ifp, const struct sockaddr *proto_addr, __private_extern__ errno_t 
dlil_send_arp_internal(ifnet_t ifp, u_short arpop, - const struct sockaddr_dl* sender_hw, const struct sockaddr* sender_proto, - const struct sockaddr_dl* target_hw, const struct sockaddr* target_proto) + const struct sockaddr_dl *sender_hw, const struct sockaddr *sender_proto, + const struct sockaddr_dl *target_hw, const struct sockaddr *target_proto) { struct if_proto *proto; errno_t result = 0; @@ -4489,7 +4630,7 @@ struct net_thread_marks { }; static const struct net_thread_marks net_thread_marks_base = { }; __private_extern__ const net_thread_marks_t net_thread_marks_none = - &net_thread_marks_base; + &net_thread_marks_base; __private_extern__ net_thread_marks_t net_thread_marks_push(u_int32_t push) @@ -4590,9 +4731,9 @@ _is_announcement(const struct sockaddr_in * sender_sin, } __private_extern__ errno_t -dlil_send_arp(ifnet_t ifp, u_short arpop, const struct sockaddr_dl* sender_hw, - const struct sockaddr* sender_proto, const struct sockaddr_dl* target_hw, - const struct sockaddr* target_proto0, u_int32_t rtflags) +dlil_send_arp(ifnet_t ifp, u_short arpop, const struct sockaddr_dl *sender_hw, + const struct sockaddr *sender_proto, const struct sockaddr_dl *target_hw, + const struct sockaddr *target_proto0, u_int32_t rtflags) { errno_t result = 0; const struct sockaddr_in * sender_sin; @@ -4714,10 +4855,11 @@ ifnet_lookup(struct ifnet *ifp) } return (_ifp != NULL); } + /* * Caller has to pass a non-zero refio argument to get a * IO reference count. This will prevent ifnet_detach from - * being called when there are outstanding io reference counts. + * being called when there are outstanding io reference counts. */ int ifnet_is_attached(struct ifnet *ifp, int refio) @@ -4735,6 +4877,22 @@ ifnet_is_attached(struct ifnet *ifp, int refio) return (ret); } +/* + * Caller must ensure the interface is attached; the assumption is that + * there is at least an outstanding IO reference count held already. + * Most callers would call ifnet_is_attached() instead. 
+ */ +void +ifnet_incr_iorefcnt(struct ifnet *ifp) +{ + lck_mtx_lock_spin(&ifp->if_ref_lock); + VERIFY((ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING)) == + IFRF_ATTACHED); + VERIFY(ifp->if_refio > 0); + ifp->if_refio++; + lck_mtx_unlock(&ifp->if_ref_lock); +} + void ifnet_decr_iorefcnt(struct ifnet *ifp) { @@ -4743,10 +4901,11 @@ ifnet_decr_iorefcnt(struct ifnet *ifp) VERIFY((ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING)) != 0); ifp->if_refio--; - /* if there are no more outstanding io references, wakeup the + /* + * if there are no more outstanding io references, wakeup the * ifnet_detach thread if detaching flag is set. */ - if (ifp->if_refio == 0 && + if (ifp->if_refio == 0 && (ifp->if_refflags & IFRF_DETACHING) != 0) { wakeup(&(ifp->if_refio)); } @@ -4931,7 +5090,7 @@ ifnet_attach_protocol(ifnet_t ifp, protocol_family_t protocol, if_name(ifp), protocol, retval); } ifnet_head_done(); - if (retval != 0 && ifproto != NULL) + if (retval != 0 && ifproto != NULL) zfree(dlif_proto_zone, ifproto); return (retval); } @@ -5021,7 +5180,7 @@ ifnet_detach_protocol(ifnet_t ifp, protocol_family_t proto_family) if (proto->proto_kpi == kProtoKPI_v1) { proto->kpi.v1.input = ifproto_media_input_v1; - proto->kpi.v1.pre_output= ifproto_media_preout; + proto->kpi.v1.pre_output = ifproto_media_preout; proto->kpi.v1.event = ifproto_media_event; proto->kpi.v1.ioctl = ifproto_media_ioctl; proto->kpi.v1.resolve_multi = ifproto_media_resolve_multi; @@ -5207,9 +5366,6 @@ ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr) VERIFY(ifp->if_flt_waiters == 0); lck_mtx_unlock(&ifp->if_flt_lock); - VERIFY(TAILQ_EMPTY(&ifp->if_prefixhead)); - TAILQ_INIT(&ifp->if_prefixhead); - if (!(dl_if->dl_if_flags & DLIF_REUSE)) { VERIFY(LIST_EMPTY(&ifp->if_multiaddrs)); LIST_INIT(&ifp->if_multiaddrs); @@ -5281,7 +5437,8 @@ ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr) ifnet_touch_lastchange(ifp); VERIFY(ifp->if_output_sched_model == IFNET_SCHED_MODEL_NORMAL || - 
ifp->if_output_sched_model == IFNET_SCHED_MODEL_DRIVER_MANAGED); + ifp->if_output_sched_model == IFNET_SCHED_MODEL_DRIVER_MANAGED || + ifp->if_output_sched_model == IFNET_SCHED_MODEL_FQ_CODEL); /* By default, use SFB and enable flow advisory */ sflags = PKTSCHEDF_QALG_SFB; @@ -5423,11 +5580,12 @@ ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr) VERIFY(ifp->if_delegated.subfamily == 0); VERIFY(ifp->if_delegated.expensive == 0); - bzero(&ifp->if_agentids, sizeof(ifp->if_agentids)); + VERIFY(ifp->if_agentids == NULL); + VERIFY(ifp->if_agentcount == 0); /* Reset interface state */ bzero(&ifp->if_interface_state, sizeof(ifp->if_interface_state)); - ifp->if_interface_state.valid_bitmask |= + ifp->if_interface_state.valid_bitmask |= IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID; ifp->if_interface_state.interface_availability = IF_INTERFACE_STATE_INTERFACE_AVAILABLE; @@ -5450,6 +5608,26 @@ ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr) ifp->if_eflags &= ~IFEF_ECN_DISABLE; } + /* + * Built-in Cyclops always on policy for WiFi infra + */ + if (IFNET_IS_WIFI_INFRA(ifp) && net_qos_policy_wifi_enabled != 0) { + errno_t error; + + error = if_set_qosmarking_mode(ifp, + IFRTYPE_QOSMARKING_FASTLANE); + if (error != 0) { + printf("%s if_set_qosmarking_mode(%s) error %d\n", + __func__, ifp->if_xname, error); + } else { + ifp->if_eflags |= IFEF_QOSMARKING_ENABLED; +#if (DEVELOPMENT || DEBUG) + printf("%s fastlane enabled on %s\n", + __func__, ifp->if_xname); +#endif /* (DEVELOPMENT || DEBUG) */ + } + } + ifnet_lock_done(ifp); ifnet_head_done(); @@ -5560,9 +5738,10 @@ dlil_alloc_lladdr(struct ifnet *ifp, const struct sockaddr_dl *ll_addr) namelen = snprintf(workbuf, sizeof (workbuf), "%s", if_name(ifp)); - masklen = offsetof(struct sockaddr_dl, sdl_data[0]) + namelen; + masklen = offsetof(struct sockaddr_dl, sdl_data[0]) + + ((namelen > 0) ? 
namelen : 0); socksize = masklen + ifp->if_addrlen; -#define ROUNDUP(a) (1 + (((a) - 1) | (sizeof (u_int32_t) - 1))) +#define ROUNDUP(a) (1 + (((a) - 1) | (sizeof (u_int32_t) - 1))) if ((u_int32_t)socksize < sizeof (struct sockaddr_dl)) socksize = sizeof(struct sockaddr_dl); socksize = ROUNDUP(socksize); @@ -5625,8 +5804,13 @@ dlil_alloc_lladdr(struct ifnet *ifp, const struct sockaddr_dl *ll_addr) ifa->ifa_addr = (struct sockaddr *)asdl; asdl->sdl_len = socksize; asdl->sdl_family = AF_LINK; - bcopy(workbuf, asdl->sdl_data, namelen); - asdl->sdl_nlen = namelen; + if (namelen > 0) { + bcopy(workbuf, asdl->sdl_data, min(namelen, + sizeof (asdl->sdl_data))); + asdl->sdl_nlen = namelen; + } else { + asdl->sdl_nlen = 0; + } asdl->sdl_index = ifp->if_index; asdl->sdl_type = ifp->if_type; if (ll_addr != NULL) { @@ -5635,9 +5819,9 @@ dlil_alloc_lladdr(struct ifnet *ifp, const struct sockaddr_dl *ll_addr) } else { asdl->sdl_alen = 0; } - ifa->ifa_netmask = (struct sockaddr*)msdl; + ifa->ifa_netmask = (struct sockaddr *)msdl; msdl->sdl_len = masklen; - while (namelen != 0) + while (namelen > 0) msdl->sdl_data[--namelen] = 0xff; IFA_UNLOCK(ifa); @@ -5662,10 +5846,15 @@ errno_t ifnet_detach(ifnet_t ifp) { struct ifnet *delegated_ifp; + struct nd_ifinfo *ndi = NULL; if (ifp == NULL) return (EINVAL); + ndi = ND_IFINFO(ifp); + if (NULL != ndi) + ndi->cga_initialized = FALSE; + lck_mtx_lock(rnh_lock); ifnet_head_lock_exclusive(); ifnet_lock_exclusive(ifp); @@ -5679,7 +5868,7 @@ ifnet_detach(ifnet_t ifp) (void) ifnet_set_idle_flags_locked(ifp, 0, ~0); lck_mtx_lock_spin(&ifp->if_ref_lock); - if (!(ifp->if_refflags & IFRF_ATTACHED)) { + if (!(ifp->if_refflags & IFRF_ATTACHED)) { lck_mtx_unlock(&ifp->if_ref_lock); ifnet_lock_done(ifp); ifnet_head_done(); @@ -5713,6 +5902,10 @@ ifnet_detach(ifnet_t ifp) TAILQ_REMOVE(&ifnet_head, ifp, if_link); ifp->if_link.tqe_next = NULL; ifp->if_link.tqe_prev = NULL; + if (ifp->if_ordered_link.tqe_next != NULL || + ifp->if_ordered_link.tqe_prev != 
NULL) { + ifnet_remove_from_ordered_list(ifp); + } ifindex2ifnet[ifp->if_index] = NULL; /* 18717626 - reset IFEF_IPV4_ROUTER and IFEF_IPV6_ROUTER */ @@ -5765,6 +5958,14 @@ ifnet_detach(ifnet_t ifp) ifp->if_link_status = NULL; } + /* Clear agent IDs */ + if (ifp->if_agentids != NULL) { + FREE(ifp->if_agentids, M_NETAGENT); + ifp->if_agentids = NULL; + } + ifp->if_agentcount = 0; + + /* Let BPF know we're detaching */ bpfdetach(ifp); @@ -5854,8 +6055,6 @@ ifnet_detacher_thread_cont(int err) dlil_if_lock(); } } - /* NOTREACHED */ - return (0); } static void @@ -5963,9 +6162,8 @@ ifnet_detach_final(struct ifnet *ifp) VERIFY(ifp->if_link.tqe_prev == NULL); VERIFY(ifp->if_detaching_link.tqe_next == NULL); VERIFY(ifp->if_detaching_link.tqe_prev == NULL); - - /* Prefix list should be empty by now */ - VERIFY(TAILQ_EMPTY(&ifp->if_prefixhead)); + VERIFY(ifp->if_ordered_link.tqe_next == NULL); + VERIFY(ifp->if_ordered_link.tqe_prev == NULL); /* The slot should have been emptied */ VERIFY(ifindex2ifnet[ifp->if_index] == NULL); @@ -6049,10 +6247,12 @@ ifnet_detach_final(struct ifnet *ifp) /* The driver might unload, so point these to ourselves */ if_free = ifp->if_free; + ifp->if_output_handler = ifp_if_output; ifp->if_output = ifp_if_output; ifp->if_pre_enqueue = ifp_if_output; ifp->if_start = ifp_if_start; ifp->if_output_ctl = ifp_if_ctl; + ifp->if_input_handler = ifp_if_input; ifp->if_input_poll = ifp_if_input_poll; ifp->if_input_ctl = ifp_if_ctl; ifp->if_ioctl = ifp_if_ioctl; @@ -6079,6 +6279,10 @@ ifnet_detach_final(struct ifnet *ifp) VERIFY(ifp->if_delegated.subfamily == 0); VERIFY(ifp->if_delegated.expensive == 0); + /* QoS marking get cleared */ + ifp->if_eflags &= ~IFEF_QOSMARKING_ENABLED; + if_set_qosmarking_mode(ifp, IFRTYPE_QOSMARKING_MODE_NONE); + ifnet_lock_done(ifp); #if PF @@ -6115,9 +6319,6 @@ ifnet_detach_final(struct ifnet *ifp) dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHED, NULL, 0); - if (if_free != NULL) - if_free(ifp); - /* * Finally, mark this 
ifnet as detached. */ @@ -6129,6 +6330,8 @@ ifnet_detach_final(struct ifnet *ifp) } ifp->if_refflags &= ~IFRF_DETACHING; lck_mtx_unlock(&ifp->if_ref_lock); + if (if_free != NULL) + if_free(ifp); if (dlil_verbose) printf("%s: detached\n", if_name(ifp)); @@ -6141,7 +6344,7 @@ static errno_t ifp_if_output(struct ifnet *ifp, struct mbuf *m) { #pragma unused(ifp) - m_freem(m); + m_freem_list(m); return (0); } @@ -6151,6 +6354,16 @@ ifp_if_start(struct ifnet *ifp) ifnet_purge(ifp); } +static errno_t +ifp_if_input(struct ifnet *ifp, struct mbuf *m_head, + struct mbuf *m_tail, const struct ifnet_stat_increment_param *s, + boolean_t poll, struct thread *tp) +{ +#pragma unused(ifp, m_tail, s, poll, tp) + m_freem_list(m_head); + return (ENXIO); +} + static void ifp_if_input_poll(struct ifnet *ifp, u_int32_t flags, u_int32_t max_cnt, struct mbuf **m_head, struct mbuf **m_tail, u_int32_t *cnt, u_int32_t *len) @@ -6274,7 +6487,7 @@ int dlil_if_acquire(u_int32_t family, const void *uniqueid, lck_mtx_lock(&dlifp1->dl_if_lock); /* same uniqueid and same len or no unique id specified */ if ((uniqueid_len == dlifp1->dl_if_uniqueid_len) && - !bcmp(uniqueid, dlifp1->dl_if_uniqueid, uniqueid_len)) { + bcmp(uniqueid, dlifp1->dl_if_uniqueid, uniqueid_len) == 0) { /* check for matching interface in use */ if (dlifp1->dl_if_flags & DLIF_INUSE) { if (uniqueid_len) { @@ -6408,16 +6621,16 @@ dlil_if_release(ifnet_t ifp) ifp->if_name = dlifp->dl_if_namestorage; /* Reset external name (name + unit) */ ifp->if_xname = dlifp->dl_if_xnamestorage; - snprintf(__DECONST(char *, ifp->if_xname), IFXNAMSIZ, + snprintf(__DECONST(char *, ifp->if_xname), IFXNAMSIZ, "%s?", ifp->if_name); lck_mtx_unlock(&dlifp->dl_if_lock); #if CONFIG_MACF_NET /* - * We can either recycle the MAC label here or in dlil_if_acquire(). - * It seems logical to do it here but this means that anything that - * still has a handle on ifp will now see it as unlabeled. - * Since the interface is "dead" that may be OK. Revisit later. 
- */ + * We can either recycle the MAC label here or in dlil_if_acquire(). + * It seems logical to do it here but this means that anything that + * still has a handle on ifp will now see it as unlabeled. + * Since the interface is "dead" that may be OK. Revisit later. + */ mac_ifnet_label_recycle(ifp); #endif ifnet_lock_done(ifp); @@ -6550,7 +6763,7 @@ ifnet_cached_rtlookup_inet(struct ifnet *ifp, struct in_addr src_ip) } #if INET6 -struct rtentry* +struct rtentry * ifnet_cached_rtlookup_inet6(struct ifnet *ifp, struct in6_addr *src_ip6) { struct route_in6 src_rt; @@ -6610,7 +6823,7 @@ if_lqm_update(struct ifnet *ifp, int lqm, int locked) ifnet_lock_exclusive(ifp); if (lqm == ifp->if_interface_state.lqm_state && - (ifp->if_interface_state.valid_bitmask & + (ifp->if_interface_state.valid_bitmask & IF_INTERFACE_STATE_LQM_STATE_VALID)) { /* * Release the lock if was not held by the caller @@ -6645,7 +6858,7 @@ static void if_rrc_state_update(struct ifnet *ifp, unsigned int rrc_state) { struct kev_dl_rrc_state kev; - + if (rrc_state == ifp->if_interface_state.rrc_state && (ifp->if_interface_state.valid_bitmask & IF_INTERFACE_STATE_RRC_STATE_VALID)) @@ -6672,7 +6885,7 @@ if_rrc_state_update(struct ifnet *ifp, unsigned int rrc_state) errno_t if_state_update(struct ifnet *ifp, - struct if_interface_state* if_interface_state) + struct if_interface_state *if_interface_state) { u_short if_index_available = 0; @@ -6737,7 +6950,7 @@ if_state_update(struct ifnet *ifp, void if_get_state(struct ifnet *ifp, - struct if_interface_state* if_interface_state) + struct if_interface_state *if_interface_state) { ifnet_lock_shared(ifp); @@ -7136,7 +7349,7 @@ ifnet_getset_opportunistic(ifnet_t ifp, u_long cmd, struct ifreq *ifr, uint32_t flags = 0; flags |= (cmd == SIOCSIFOPPORTUNISTIC) ? INPCB_OPPORTUNISTIC_SETCMD : 0; - flags |= (level == IFNET_THROTTLE_OPPORTUNISTIC) ? + flags |= (level == IFNET_THROTTLE_OPPORTUNISTIC) ? 
INPCB_OPPORTUNISTIC_THROTTLEON : 0; ifr->ifr_opportunistic.ifo_inuse = udp_count_opportunistic(ifp->if_index, flags) + @@ -7786,7 +7999,7 @@ dlil_input_cksum_dbg(struct ifnet *ifp, struct mbuf *m, char *frame_header, } return; } - rxoff -=hlen; + rxoff -= hlen; if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED)) { /* @@ -7905,9 +8118,9 @@ sysctl_tx_chain_len_stats SYSCTL_HANDLER_ARGS { #pragma unused(oidp, arg1, arg2) int err; - + if (req->oldptr == USER_ADDR_NULL) { - + } if (req->newptr != USER_ADDR_NULL) { return (EPERM); @@ -8104,7 +8317,7 @@ sysctl_get_ports_used SYSCTL_HANDLER_ARGS ifnet_t ifp = NULL; u_int8_t *bitfield = NULL; - if (req->newptr) { + if (req->newptr != USER_ADDR_NULL) { error = EPERM; goto done; } @@ -8121,12 +8334,11 @@ sysctl_get_ports_used SYSCTL_HANDLER_ARGS error = ENOMEM; goto done; } - + idx = name[0]; protocol = name[1]; flags = name[2]; - - + ifnet_head_lock_shared(); if (idx > if_index) { ifnet_head_done(); @@ -8135,7 +8347,7 @@ sysctl_get_ports_used SYSCTL_HANDLER_ARGS } ifp = ifindex2ifnet[idx]; ifnet_head_done(); - + bitfield = _MALLOC(bitstr_size(65536), M_TEMP, M_WAITOK); if (bitfield == NULL) { error = ENOMEM; @@ -8154,3 +8366,100 @@ sysctl_get_ports_used SYSCTL_HANDLER_ARGS return (error); } +#if (DEVELOPMENT || DEBUG) +/* + * The sysctl variable name contains the input parameters of + * ifnet_get_keepalive_offload_frames() + * ifp (interface index): name[0] + * frames_array_count: name[1] + * frame_data_offset: name[2] + * The return length gives used_frames_count + */ +static int +sysctl_get_kao_frames SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp) + int *name = (int *)arg1; + u_int namelen = arg2; + int idx; + ifnet_t ifp = NULL; + u_int32_t frames_array_count; + size_t frame_data_offset; + u_int32_t used_frames_count; + struct ifnet_keepalive_offload_frame *frames_array = NULL; + int error = 0; + u_int32_t i; + + /* + * Only root can get look at other people TCP frames + */ + error = proc_suser(current_proc()); + if 
(error != 0) + goto done; + /* + * Validate the input parameters + */ + if (req->newptr != USER_ADDR_NULL) { + error = EPERM; + goto done; + } + if (namelen != 3) { + error = EINVAL; + goto done; + } + if (req->oldptr == USER_ADDR_NULL) { + error = EINVAL; + goto done; + } + if (req->oldlen == 0) { + error = EINVAL; + goto done; + } + idx = name[0]; + frames_array_count = name[1]; + frame_data_offset = name[2]; + + /* Make sure the passed buffer is large enough */ + if (frames_array_count * sizeof(struct ifnet_keepalive_offload_frame) > + req->oldlen) { + error = ENOMEM; + goto done; + } + + ifnet_head_lock_shared(); + if (idx > if_index) { + ifnet_head_done(); + error = ENOENT; + goto done; + } + ifp = ifindex2ifnet[idx]; + ifnet_head_done(); + + frames_array = _MALLOC(frames_array_count * + sizeof(struct ifnet_keepalive_offload_frame), M_TEMP, M_WAITOK); + if (frames_array == NULL) { + error = ENOMEM; + goto done; + } + + error = ifnet_get_keepalive_offload_frames(ifp, frames_array, + frames_array_count, frame_data_offset, &used_frames_count); + if (error != 0) { + printf("%s: ifnet_get_keepalive_offload_frames error %d\n", + __func__, error); + goto done; + } + + for (i = 0; i < used_frames_count; i++) { + error = SYSCTL_OUT(req, frames_array + i, + sizeof(struct ifnet_keepalive_offload_frame)); + if (error != 0) { + goto done; + } + } +done: + if (frames_array != NULL) + _FREE(frames_array, M_TEMP); + return (error); +} +#endif /* DEVELOPMENT || DEBUG */ diff --git a/bsd/net/dlil.h b/bsd/net/dlil.h index f2fb7161f..a412d25fc 100644 --- a/bsd/net/dlil.h +++ b/bsd/net/dlil.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1999-2013 Apple Inc. All rights reserved. + * Copyright (c) 1999-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -100,7 +100,7 @@ enum { #define net_timernsec(tvp, nsp) do { \ *(nsp) = (tvp)->tv_nsec; \ if ((tvp)->tv_sec > 0) \ - *(nsp) += ((tvp)->tv_sec * (integer_t)NSEC_PER_SEC); \ + *(nsp) += ((tvp)->tv_sec * NSEC_PER_SEC); \ } while (0) #if defined(__x86_64__) || defined(__arm64__) @@ -321,6 +321,8 @@ extern void dlil_proto_unplumb_all(ifnet_t); extern void dlil_post_msg(struct ifnet *, u_int32_t, u_int32_t, struct net_event_data *, u_int32_t); +extern int dlil_post_complete_msg(struct ifnet *, struct kev_msg *); + extern int dlil_alloc_local_stats(struct ifnet *); /* @@ -333,8 +335,6 @@ extern int dlil_if_acquire(u_int32_t, const void *, size_t, struct ifnet **); */ extern void dlil_if_release(struct ifnet *ifp); -extern u_int32_t ifnet_aggressive_drainers; - extern errno_t dlil_if_ref(struct ifnet *); extern errno_t dlil_if_free(struct ifnet *); @@ -359,6 +359,11 @@ extern errno_t dlil_rxpoll_set_params(struct ifnet *, extern errno_t dlil_rxpoll_get_params(struct ifnet *, struct ifnet_poll_params *); +extern errno_t dlil_output_handler(struct ifnet *, struct mbuf *); +extern errno_t dlil_input_handler(struct ifnet *, struct mbuf *, + struct mbuf *, const struct ifnet_stat_increment_param *, + boolean_t, struct thread *); + #endif /* BSD_KERNEL_PRIVATE */ #endif /* KERNEL_PRIVATE */ #endif /* KERNEL */ diff --git a/bsd/net/ethernet.h b/bsd/net/ethernet.h index aea52bc20..5f29b6e71 100644 --- a/bsd/net/ethernet.h +++ b/bsd/net/ethernet.h @@ -105,6 +105,7 @@ struct ether_addr { #define ETHERTYPE_IPV6 0x86dd /* IPv6 */ #define ETHERTYPE_PAE 0x888e /* EAPOL PAE/802.1x */ #define ETHERTYPE_RSN_PREAUTH 0x88c7 /* 802.11i / RSN Pre-Authentication */ +#define ETHERTYPE_PTP 0x88f7 /* IEEE 1588 Precision Time Protocol */ #define ETHERTYPE_LOOPBACK 0x9000 /* used to test interfaces */ /* XXX - add more useful types here */ diff --git a/bsd/net/flowadv.c b/bsd/net/flowadv.c index 99e4d2cad..14ad67758 100644 --- a/bsd/net/flowadv.c +++ 
b/bsd/net/flowadv.c @@ -179,6 +179,18 @@ flowadv_add(struct flowadv_fclist *fcl) lck_mtx_unlock(&fadv_lock); } +void +flowadv_add_entry(struct flowadv_fcentry *fce) { + lck_mtx_lock_spin(&fadv_lock); + STAILQ_INSERT_HEAD(&fadv_list, fce, fce_link); + VERIFY(!STAILQ_EMPTY(&fadv_list)); + + if (!fadv_active && fadv_thread != THREAD_NULL) + wakeup_one((caddr_t)&fadv_list); + + lck_mtx_unlock(&fadv_lock); +} + static int flowadv_thread_cont(int err) { diff --git a/bsd/net/flowadv.h b/bsd/net/flowadv.h index 44c9e0868..f56eb4b8b 100644 --- a/bsd/net/flowadv.h +++ b/bsd/net/flowadv.h @@ -56,6 +56,7 @@ extern void flowadv_init(void); extern struct flowadv_fcentry *flowadv_alloc_entry(int); extern void flowadv_free_entry(struct flowadv_fcentry *); extern void flowadv_add(struct flowadv_fclist *); +extern void flowadv_add_entry(struct flowadv_fcentry *); __END_DECLS diff --git a/bsd/net/if.c b/bsd/net/if.c index a58ce458c..a43d31f54 100644 --- a/bsd/net/if.c +++ b/bsd/net/if.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2015 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -85,6 +85,7 @@ #include #include #include +#include #include @@ -106,9 +107,9 @@ #include #if INET || INET6 -/*XXX*/ #include #include +#include #include #include #include @@ -147,6 +148,7 @@ static int ifioctl_ifreq(struct socket *, u_long, struct ifreq *, struct proc *); static int ifioctl_ifconf(u_long, caddr_t); static int ifioctl_ifclone(u_long, caddr_t); +static int ifioctl_iforder(u_long, caddr_t); static int ifioctl_ifdesc(struct ifnet *, u_long, caddr_t, struct proc *); static int ifioctl_linkparams(struct ifnet *, u_long, caddr_t, struct proc *); static int ifioctl_qstats(struct ifnet *, u_long, caddr_t); @@ -160,6 +162,7 @@ static int if_addmulti_common(struct ifnet *, const struct sockaddr *, struct ifmultiaddr **, int); static int if_delmulti_common(struct ifmultiaddr *, struct ifnet *, const struct sockaddr *, int); +static struct ifnet *ifunit_common(const char *, boolean_t); static int if_rtmtu(struct radix_node *, void *); static void if_rtmtu_update(struct ifnet *); @@ -170,6 +173,10 @@ MALLOC_DEFINE(M_IFADDR, "ifaddr", "interface address"); struct ifnethead ifnet_head = TAILQ_HEAD_INITIALIZER(ifnet_head); +/* ifnet_ordered_head and if_ordered_count are protected by the ifnet_head lock */ +struct ifnethead ifnet_ordered_head = TAILQ_HEAD_INITIALIZER(ifnet_ordered_head); +static u_int32_t if_ordered_count = 0; + static int if_cloners_count; LIST_HEAD(, if_clone) if_cloners = LIST_HEAD_INITIALIZER(if_cloners); @@ -379,13 +386,13 @@ if_detach_ifa_common(struct ifnet *ifp, struct ifaddr *ifa, int link) (*ifa->ifa_detached)(ifa); } -#define INITIAL_IF_INDEXLIM 8 +#define INITIAL_IF_INDEXLIM 8 /* * Function: if_next_index * Purpose: - * Return the next available interface index. - * Grow the ifnet_addrs[] and ifindex2ifnet[] arrays to accomodate the + * Return the next available interface index. + * Grow the ifnet_addrs[] and ifindex2ifnet[] arrays to accomodate the * added entry when necessary. 
* * Note: @@ -422,18 +429,18 @@ if_next_index(void) new_ifnet_addrs = _MALLOC(n, M_IFADDR, M_WAITOK | M_ZERO); if (new_ifnet_addrs == NULL) { --if_index; - return -1; + return (-1); } - new_ifindex2ifnet = new_ifnet_addrs + new_ifindex2ifnet = new_ifnet_addrs + new_if_indexlim * sizeof(caddr_t); if (ifnet_addrs != NULL) { /* copy the existing data */ bcopy((caddr_t)ifnet_addrs, new_ifnet_addrs, - if_indexlim * sizeof(caddr_t)); + if_indexlim * sizeof(caddr_t)); bcopy((caddr_t)ifindex2ifnet, - new_ifindex2ifnet, - (if_indexlim + 1) * sizeof(caddr_t)); + new_ifindex2ifnet, + (if_indexlim + 1) * sizeof(caddr_t)); } /* switch to the new tables and size */ @@ -475,8 +482,8 @@ if_clone_create(char *name, int len, void *params) * Find a free unit if none was given. */ if (wildcard) { - while ((bytoff < ifc->ifc_bmlen) - && (ifc->ifc_units[bytoff] == 0xff)) + while ((bytoff < ifc->ifc_bmlen) && + (ifc->ifc_units[bytoff] == 0xff)) bytoff++; if (bytoff >= ifc->ifc_bmlen) return (ENOSPC); @@ -572,20 +579,20 @@ if_clone_lookup(const char *name, u_int32_t *unitp) const char *cp; u_int32_t i; - for (ifc = LIST_FIRST(&if_cloners); ifc != NULL;) { + for (ifc = LIST_FIRST(&if_cloners); ifc != NULL; ) { for (cp = name, i = 0; i < ifc->ifc_namelen; i++, cp++) { if (ifc->ifc_name[i] != *cp) goto next_ifc; } goto found_name; - next_ifc: +next_ifc: ifc = LIST_NEXT(ifc, ifc_list); } /* No match. 
*/ return ((struct if_clone *)NULL); - found_name: +found_name: if (*cp == '\0') { i = UINT32_MAX; } else { @@ -627,7 +634,7 @@ if_clone_attach(struct if_clone *ifc) len++; ifc->ifc_units = _MALLOC(len, M_CLONE, M_WAITOK | M_ZERO); if (ifc->ifc_units == NULL) - return ENOBUFS; + return (ENOBUFS); ifc->ifc_bmlen = len; LIST_INSERT_HEAD(&if_cloners, ifc, ifc_list); @@ -645,7 +652,7 @@ if_clone_attach(struct if_clone *ifc) ifc->ifc_units[bytoff] |= (1 << bitoff); } - return 0; + return (0); } /* @@ -681,7 +688,7 @@ if_clone_list(int count, int *ret_total, user_addr_t dst) count = (if_cloners_count < count) ? if_cloners_count : count; for (ifc = LIST_FIRST(&if_cloners); ifc != NULL && count != 0; - ifc = LIST_NEXT(ifc, ifc_list), count--, dst += IFNAMSIZ) { + ifc = LIST_NEXT(ifc, ifc_list), count--, dst += IFNAMSIZ) { bzero(outbuf, sizeof(outbuf)); strlcpy(outbuf, ifc->ifc_name, min(strlen(ifc->ifc_name), IFNAMSIZ)); @@ -694,25 +701,32 @@ if_clone_list(int count, int *ret_total, user_addr_t dst) } u_int32_t -if_functional_type(struct ifnet *ifp) +if_functional_type(struct ifnet *ifp, bool exclude_delegate) { u_int32_t ret = IFRTYPE_FUNCTIONAL_UNKNOWN; if (ifp != NULL) { if (ifp->if_flags & IFF_LOOPBACK) { ret = IFRTYPE_FUNCTIONAL_LOOPBACK; - } else if (IFNET_IS_WIFI(ifp)) { + } else if ((exclude_delegate && + (ifp->if_subfamily == IFNET_SUBFAMILY_WIFI)) || + (!exclude_delegate && IFNET_IS_WIFI(ifp))) { if (ifp->if_eflags & IFEF_AWDL) ret = IFRTYPE_FUNCTIONAL_WIFI_AWDL; else ret = IFRTYPE_FUNCTIONAL_WIFI_INFRA; - } else if (IFNET_IS_CELLULAR(ifp)) { + } else if ((exclude_delegate && + (ifp->if_type == IFT_CELLULAR)) || + (!exclude_delegate && IFNET_IS_CELLULAR(ifp))) { ret = IFRTYPE_FUNCTIONAL_CELLULAR; - } else if (IFNET_IS_WIRED(ifp)) { + } else if ((exclude_delegate && + (ifp->if_family == IFNET_FAMILY_ETHERNET || + ifp->if_family == IFNET_FAMILY_FIREWIRE)) || + (!exclude_delegate && IFNET_IS_WIRED(ifp))) { ret = IFRTYPE_FUNCTIONAL_WIRED; } } - return ret; + return 
(ret); } /* @@ -805,22 +819,28 @@ ifa_ifpgetprimary(struct ifnet *ifp, int family) return (ifa); } +static inline int +ifa_equal(const struct sockaddr *sa1, const struct sockaddr *sa2) +{ + + if (!sa1 || !sa2) + return 0; + if (sa1->sa_len != sa2->sa_len) + return 0; + + return (bcmp(sa1, sa2, sa1->sa_len) == 0); +} + /* * Locate an interface based on a complete address. */ -/*ARGSUSED*/ struct ifaddr * -ifa_ifwithaddr(const struct sockaddr *addr) +ifa_ifwithaddr_locked(const struct sockaddr *addr) { struct ifnet *ifp; struct ifaddr *ifa; struct ifaddr *result = NULL; -#define equal(a1, a2) \ - (bcmp((const void*)(a1), (const void*)(a2), \ - ((const struct sockaddr *)(a1))->sa_len) == 0) - - ifnet_head_lock_shared(); for (ifp = ifnet_head.tqh_first; ifp && !result; ifp = ifp->if_link.tqe_next) { ifnet_lock_shared(ifp); @@ -831,7 +851,7 @@ ifa_ifwithaddr(const struct sockaddr *addr) IFA_UNLOCK(ifa); continue; } - if (equal(addr, ifa->ifa_addr)) { + if (ifa_equal(addr, ifa->ifa_addr)) { result = ifa; IFA_ADDREF_LOCKED(ifa); /* for caller */ IFA_UNLOCK(ifa); @@ -841,7 +861,7 @@ ifa_ifwithaddr(const struct sockaddr *addr) ifa->ifa_broadaddr != NULL && /* IP6 doesn't have broadcast */ ifa->ifa_broadaddr->sa_len != 0 && - equal(ifa->ifa_broadaddr, addr)) { + ifa_equal(ifa->ifa_broadaddr, addr)) { result = ifa; IFA_ADDREF_LOCKED(ifa); /* for caller */ IFA_UNLOCK(ifa); @@ -851,6 +871,19 @@ ifa_ifwithaddr(const struct sockaddr *addr) } ifnet_lock_done(ifp); } + + return (result); +} + +struct ifaddr * +ifa_ifwithaddr(const struct sockaddr *addr) +{ + struct ifaddr *result = NULL; + + ifnet_head_lock_shared(); + + result = ifa_ifwithaddr_locked(addr); + ifnet_head_done(); return (result); @@ -879,8 +912,7 @@ ifa_ifwithdstaddr(const struct sockaddr *addr) IFA_UNLOCK(ifa); continue; } - if (ifa->ifa_dstaddr && - equal(addr, ifa->ifa_dstaddr)) { + if (ifa_equal(addr, ifa->ifa_dstaddr)) { result = ifa; IFA_ADDREF_LOCKED(ifa); /* for caller */ IFA_UNLOCK(ifa); @@ -899,17 +931,15 
@@ ifa_ifwithdstaddr(const struct sockaddr *addr) * Locate the source address of an interface based on a complete address. */ struct ifaddr * -ifa_ifwithaddr_scoped(const struct sockaddr *addr, unsigned int ifscope) +ifa_ifwithaddr_scoped_locked(const struct sockaddr *addr, unsigned int ifscope) { struct ifaddr *result = NULL; struct ifnet *ifp; if (ifscope == IFSCOPE_NONE) - return (ifa_ifwithaddr(addr)); + return (ifa_ifwithaddr_locked(addr)); - ifnet_head_lock_shared(); if (ifscope > (unsigned int)if_index) { - ifnet_head_done(); return (NULL); } @@ -930,7 +960,7 @@ ifa_ifwithaddr_scoped(const struct sockaddr *addr, unsigned int ifscope) IFA_UNLOCK(ifa); continue; } - if (equal(addr, ifa->ifa_addr)) { + if (ifa_equal(addr, ifa->ifa_addr)) { result = ifa; IFA_ADDREF_LOCKED(ifa); /* for caller */ IFA_UNLOCK(ifa); @@ -940,7 +970,7 @@ ifa_ifwithaddr_scoped(const struct sockaddr *addr, unsigned int ifscope) ifa->ifa_broadaddr != NULL && /* IP6 doesn't have broadcast */ ifa->ifa_broadaddr->sa_len != 0 && - equal(ifa->ifa_broadaddr, addr)) { + ifa_equal(ifa->ifa_broadaddr, addr)) { result = ifa; IFA_ADDREF_LOCKED(ifa); /* for caller */ IFA_UNLOCK(ifa); @@ -950,6 +980,19 @@ ifa_ifwithaddr_scoped(const struct sockaddr *addr, unsigned int ifscope) } ifnet_lock_done(ifp); } + + return (result); +} + +struct ifaddr * +ifa_ifwithaddr_scoped(const struct sockaddr *addr, unsigned int ifscope) +{ + struct ifaddr *result = NULL; + + ifnet_head_lock_shared(); + + result = ifa_ifwithaddr_scoped_locked(addr, ifscope); + ifnet_head_done(); return (result); @@ -981,11 +1024,9 @@ ifa_ifwithnet_common(const struct sockaddr *addr, unsigned int ifscope) const char *addr_data = addr->sa_data, *cplim; #if INET6 - if ((af != AF_INET && af != AF_INET6) || - (af == AF_INET && !ip_doscopedroute) || - (af == AF_INET6 && !ip6_doscopedroute)) + if (af != AF_INET && af != AF_INET6) #else - if (af != AF_INET || !ip_doscopedroute) + if (af != AF_INET) #endif /* !INET6 */ ifscope = IFSCOPE_NONE; @@ 
-1118,8 +1159,8 @@ ifaof_ifpforaddr(const struct sockaddr *addr, struct ifnet *ifp) ifa_maybe = ifa; } if (ifa->ifa_netmask == 0) { - if (equal(addr, ifa->ifa_addr) || (ifa->ifa_dstaddr && - equal(addr, ifa->ifa_dstaddr))) { + if (ifa_equal(addr, ifa->ifa_addr) || + ifa_equal(addr, ifa->ifa_dstaddr)) { IFA_ADDREF_LOCKED(ifa); /* for caller */ IFA_UNLOCK(ifa); break; @@ -1128,13 +1169,13 @@ ifaof_ifpforaddr(const struct sockaddr *addr, struct ifnet *ifp) continue; } if (ifp->if_flags & IFF_POINTOPOINT) { - if (ifa->ifa_dstaddr && equal(addr, ifa->ifa_dstaddr)) { + if (ifa_equal(addr, ifa->ifa_dstaddr)) { IFA_ADDREF_LOCKED(ifa); /* for caller */ IFA_UNLOCK(ifa); break; } } else { - if (equal(addr, ifa->ifa_addr)) { + if (ifa_equal(addr, ifa->ifa_addr)) { /* exact match */ IFA_ADDREF_LOCKED(ifa); /* for caller */ IFA_UNLOCK(ifa); @@ -1259,8 +1300,7 @@ if_updown( /* Mark interface up or down */ if (up) { ifp->if_flags |= IFF_UP; - } - else { + } else { ifp->if_flags &= ~IFF_UP; } @@ -1377,54 +1417,102 @@ if_qflush_sc(struct ifnet *ifp, mbuf_svc_class_t sc, u_int32_t flow, } /* - * Map interface name to - * interface structure pointer. + * Extracts interface unit number and name from string, returns -1 upon failure. + * Upon success, returns extracted unit number, and interface name in dst. 
*/ -struct ifnet * -ifunit(const char *name) +int +ifunit_extract(const char *src, char *dst, size_t dstlen, int *unit) { - char namebuf[IFNAMSIZ + 1]; const char *cp; - struct ifnet *ifp; - int unit; - unsigned len, m; + size_t len, m; char c; + int u; - len = strlen(name); - if (len < 2 || len > IFNAMSIZ) - return (NULL); - cp = name + len - 1; + if (src == NULL || dst == NULL || dstlen == 0 || unit == NULL) + return (-1); + + len = strlen(src); + if (len < 2 || len > dstlen) + return (-1); + cp = src + len - 1; c = *cp; if (c < '0' || c > '9') - return (NULL); /* trailing garbage */ - unit = 0; + return (-1); /* trailing garbage */ + u = 0; m = 1; do { - if (cp == name) - return (NULL); /* no interface name */ - unit += (c - '0') * m; - if (unit > 1000000) - return (NULL); /* number is unreasonable */ + if (cp == src) + return (-1); /* no interface name */ + u += (c - '0') * m; + if (u > 1000000) + return (-1); /* number is unreasonable */ m *= 10; c = *--cp; } while (c >= '0' && c <= '9'); - len = cp - name + 1; - bcopy(name, namebuf, len); - namebuf[len] = '\0'; + len = cp - src + 1; + bcopy(src, dst, len); + dst[len] = '\0'; + *unit = u; + + return (0); +} + +/* + * Map interface name to + * interface structure pointer. + */ +static struct ifnet * +ifunit_common(const char *name, boolean_t hold) +{ + char namebuf[IFNAMSIZ + 1]; + struct ifnet *ifp; + int unit; + + if (ifunit_extract(name, namebuf, sizeof (namebuf), &unit) < 0) + return (NULL); + + /* for safety, since we use strcmp() below */ + namebuf[sizeof (namebuf) - 1] = '\0'; + /* * Now search all the interfaces for this name/number */ ifnet_head_lock_shared(); TAILQ_FOREACH(ifp, &ifnet_head, if_link) { - if (strncmp(ifp->if_name, namebuf, len)) + /* + * Use strcmp() rather than strncmp() here, + * since we want to match the entire string. 
+ */ + if (strcmp(ifp->if_name, namebuf)) continue; if (unit == ifp->if_unit) break; } + + /* if called from ifunit_ref() and ifnet is not attached, bail */ + if (hold && ifp != NULL && !ifnet_is_attached(ifp, 1)) + ifp = NULL; + ifnet_head_done(); return (ifp); } +struct ifnet * +ifunit(const char *name) +{ + return (ifunit_common(name, FALSE)); +} + +/* + * Similar to ifunit(), except that we hold an I/O reference count on an + * attached interface, which must later be released via ifnet_decr_iorefcnt(). + * Will return NULL unless interface exists and is fully attached. + */ +struct ifnet * +ifunit_ref(const char *name) +{ + return (ifunit_common(name, TRUE)); +} /* * Map interface name in a sockaddr_dl to @@ -1436,8 +1524,8 @@ if_withname(struct sockaddr *sa) char ifname[IFNAMSIZ+1]; struct sockaddr_dl *sdl = (struct sockaddr_dl *)(void *)sa; - if ( (sa->sa_family != AF_LINK) || (sdl->sdl_nlen == 0) || - (sdl->sdl_nlen > IFNAMSIZ) ) + if ((sa->sa_family != AF_LINK) || (sdl->sdl_nlen == 0) || + (sdl->sdl_nlen > IFNAMSIZ)) return (NULL); /* @@ -1529,7 +1617,7 @@ ifioctl_ifdesc(struct ifnet *ifp, u_long cmd, caddr_t data, struct proc *p) switch (cmd) { case SIOCSIFDESC: { /* struct if_descreq */ if ((error = proc_suser(p)) != 0) - break; + break; ifnet_lock_exclusive(ifp); bcopy(&ifdr->ifdr_len, &ifdr_len, sizeof (ifdr_len)); @@ -1585,7 +1673,7 @@ ifioctl_linkparams(struct ifnet *ifp, u_long cmd, caddr_t data, struct proc *p) struct tb_profile tb = { 0, 0, 0 }; if ((error = proc_suser(p)) != 0) - break; + break; IFCQ_LOCK(ifq); if (!IFCQ_IS_READY(ifq)) { @@ -1697,7 +1785,7 @@ ifioctl_throttle(struct ifnet *ifp, u_long cmd, caddr_t data, struct proc *p) * XXX: Use priv_check_cred() instead of root check? 
*/ if ((error = proc_suser(p)) != 0) - break; + break; bcopy(&ifthr->ifthr_level, &ifthr_level, sizeof (ifthr_level)); error = ifnet_set_throttle(ifp, ifthr_level); @@ -1726,20 +1814,26 @@ static int ifioctl_getnetagents(struct ifnet *ifp, u_int32_t *count, user_addr_t uuid_p) { int error = 0; - int index = 0; + u_int32_t index = 0; u_int32_t valid_netagent_count = 0; *count = 0; - for (index = 0; index < IF_MAXAGENTS; index++) { - uuid_t *netagent_uuid = &(ifp->if_agentids[index]); - if (!uuid_is_null(*netagent_uuid)) { - if (uuid_p != USER_ADDR_NULL) { - if ((error = copyout(netagent_uuid, - uuid_p + sizeof(uuid_t) * valid_netagent_count, - sizeof(uuid_t))) != 0) { - return (error); + + ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_SHARED); + + if (ifp->if_agentids != NULL) { + for (index = 0; index < ifp->if_agentcount; index++) { + uuid_t *netagent_uuid = &(ifp->if_agentids[index]); + if (!uuid_is_null(*netagent_uuid)) { + if (uuid_p != USER_ADDR_NULL) { + error = copyout(netagent_uuid, + uuid_p + sizeof(uuid_t) * valid_netagent_count, + sizeof(uuid_t)); + if (error != 0) { + return (error); + } } + valid_netagent_count++; } - valid_netagent_count++; } } *count = valid_netagent_count; @@ -1747,6 +1841,8 @@ ifioctl_getnetagents(struct ifnet *ifp, u_int32_t *count, user_addr_t uuid_p) return (0); } +#define IF_MAXAGENTS 64 +#define IF_AGENT_INCREMENT 8 static __attribute__((noinline)) int ifioctl_netagent(struct ifnet *ifp, u_long cmd, caddr_t data, struct proc *p) { @@ -1756,10 +1852,22 @@ ifioctl_netagent(struct ifnet *ifp, u_long cmd, caddr_t data, struct proc *p) struct if_agentidsreq64 s64; } u; int error = 0; - int index = 0; + u_int32_t index = 0; VERIFY(ifp != NULL); + /* Get an io ref count if the interface is attached */ + if (!ifnet_is_attached(ifp, 1)) { + return (EOPNOTSUPP); + } + + if (cmd == SIOCAIFAGENTID || + cmd == SIOCDIFAGENTID) { + ifnet_lock_exclusive(ifp); + } else { + ifnet_lock_shared(ifp); + } + switch (cmd) { case SIOCAIFAGENTID: { /* struct 
if_agentidreq */ uuid_t *first_empty_slot = NULL; @@ -1767,21 +1875,54 @@ ifioctl_netagent(struct ifnet *ifp, u_long cmd, caddr_t data, struct proc *p) if ((error = proc_suser(p)) != 0) { break; } - for (index = 0; index < IF_MAXAGENTS; index++) { - uuid_t *netagent_uuid = &(ifp->if_agentids[index]); - if (uuid_compare(*netagent_uuid, ifar->ifar_uuid) == 0) { - /* Already present, ignore */ - break; - } - if (first_empty_slot == NULL && - uuid_is_null(*netagent_uuid)) { - first_empty_slot = netagent_uuid; + bool already_added = FALSE; + if (ifp->if_agentids != NULL) { + for (index = 0; index < ifp->if_agentcount; index++) { + uuid_t *netagent_uuid = &(ifp->if_agentids[index]); + if (uuid_compare(*netagent_uuid, ifar->ifar_uuid) == 0) { + /* Already present, ignore */ + already_added = TRUE; + break; + } + if (first_empty_slot == NULL && + uuid_is_null(*netagent_uuid)) { + first_empty_slot = netagent_uuid; + } } } - if (first_empty_slot == NULL) { - error = ENOMEM; /* No empty slot for a netagent UUID, bail */ + if (already_added) { + /* Already added agent, don't return an error */ break; } + if (first_empty_slot == NULL) { + if (ifp->if_agentcount >= IF_MAXAGENTS) { + /* No room for another netagent UUID, bail */ + error = ENOMEM; + break; + } else { + /* Calculate new array size */ + u_int32_t new_agent_count = + MIN(ifp->if_agentcount + IF_AGENT_INCREMENT, IF_MAXAGENTS); + + /* Reallocate array */ + uuid_t *new_agent_array = _REALLOC(ifp->if_agentids, + sizeof(uuid_t) * new_agent_count, M_NETAGENT, + M_WAITOK | M_ZERO); + if (new_agent_array == NULL) { + error = ENOMEM; + break; + } + + /* Save new array */ + ifp->if_agentids = new_agent_array; + + /* Set first empty slot */ + first_empty_slot = &(ifp->if_agentids[ifp->if_agentcount]); + + /* Save new array length */ + ifp->if_agentcount = new_agent_count; + } + } uuid_copy(*first_empty_slot, ifar->ifar_uuid); netagent_post_updated_interfaces(ifar->ifar_uuid); break; @@ -1792,12 +1933,14 @@ 
ifioctl_netagent(struct ifnet *ifp, u_long cmd, caddr_t data, struct proc *p) if ((error = proc_suser(p)) != 0) { break; } - for (index = 0; index < IF_MAXAGENTS; index++) { - uuid_t *netagent_uuid = &(ifp->if_agentids[index]); - if (uuid_compare(*netagent_uuid, ifar->ifar_uuid) == 0) { - uuid_clear(*netagent_uuid); - removed_agent_id = TRUE; - break; + if (ifp->if_agentids != NULL) { + for (index = 0; index < ifp->if_agentcount; index++) { + uuid_t *netagent_uuid = &(ifp->if_agentids[index]); + if (uuid_compare(*netagent_uuid, ifar->ifar_uuid) == 0) { + uuid_clear(*netagent_uuid); + removed_agent_id = TRUE; + break; + } } } if (removed_agent_id) { @@ -1805,17 +1948,19 @@ ifioctl_netagent(struct ifnet *ifp, u_long cmd, caddr_t data, struct proc *p) } break; } - case SIOCGIFAGENTIDS32: { /* struct if_agentidsreq32 */ + case SIOCGIFAGENTIDS32: { /* struct if_agentidsreq32 */ bcopy(data, &u.s32, sizeof(u.s32)); - error = ifioctl_getnetagents(ifp, &u.s32.ifar_count, u.s32.ifar_uuids); + error = ifioctl_getnetagents(ifp, &u.s32.ifar_count, + u.s32.ifar_uuids); if (error == 0) { bcopy(&u.s32, data, sizeof(u.s32)); } break; } - case SIOCGIFAGENTIDS64: { /* struct if_agentidsreq64 */ + case SIOCGIFAGENTIDS64: { /* struct if_agentidsreq64 */ bcopy(data, &u.s64, sizeof(u.s64)); - error = ifioctl_getnetagents(ifp, &u.s64.ifar_count, u.s64.ifar_uuids); + error = ifioctl_getnetagents(ifp, &u.s64.ifar_count, + u.s64.ifar_uuids); if (error == 0) { bcopy(&u.s64, data, sizeof(u.s64)); } @@ -1826,6 +1971,9 @@ ifioctl_netagent(struct ifnet *ifp, u_long cmd, caddr_t data, struct proc *p) /* NOTREACHED */ } + ifnet_lock_done(ifp); + ifnet_decr_iorefcnt(ifp); + return (error); } @@ -1833,24 +1981,218 @@ void ifnet_clear_netagent(uuid_t netagent_uuid) { struct ifnet *ifp = NULL; - int index = 0; - bool removed_agent_id = FALSE; + u_int32_t index = 0; ifnet_head_lock_shared(); TAILQ_FOREACH(ifp, &ifnet_head, if_link) { - for (index = 0; index < IF_MAXAGENTS; index++) { - uuid_t 
*ifp_netagent_uuid = &(ifp->if_agentids[index]); - if (uuid_compare(*ifp_netagent_uuid, netagent_uuid) == 0) { - uuid_clear(*ifp_netagent_uuid); - removed_agent_id = TRUE; + ifnet_lock_shared(ifp); + if (ifp->if_agentids != NULL) { + for (index = 0; index < ifp->if_agentcount; index++) { + uuid_t *ifp_netagent_uuid = &(ifp->if_agentids[index]); + if (uuid_compare(*ifp_netagent_uuid, netagent_uuid) == 0) { + uuid_clear(*ifp_netagent_uuid); + } } } + ifnet_lock_done(ifp); } ifnet_head_done(); } +void +ifnet_increment_generation(ifnet_t interface) +{ + OSIncrementAtomic(&interface->if_generation); +} + +u_int32_t +ifnet_get_generation(ifnet_t interface) +{ + return (interface->if_generation); +} + +void +ifnet_remove_from_ordered_list(struct ifnet *ifp) +{ + ifnet_head_assert_exclusive(); + + // Remove from list + TAILQ_REMOVE(&ifnet_ordered_head, ifp, if_ordered_link); + ifp->if_ordered_link.tqe_next = NULL; + ifp->if_ordered_link.tqe_prev = NULL; + + // Update ordered count + VERIFY(if_ordered_count > 0); + if_ordered_count--; +} + +static int +ifnet_reset_order(u_int32_t *ordered_indices, u_int32_t count) +{ + struct ifnet *ifp = NULL; + int error = 0; + + ifnet_head_lock_exclusive(); + + // Flush current ordered list + for (ifp = TAILQ_FIRST(&ifnet_ordered_head); ifp != NULL; + ifp = TAILQ_FIRST(&ifnet_ordered_head)) { + ifnet_lock_exclusive(ifp); + ifnet_remove_from_ordered_list(ifp); + ifnet_lock_done(ifp); + } + + VERIFY(if_ordered_count == 0); + + for (u_int32_t order_index = 0; order_index < count; order_index++) { + u_int32_t interface_index = ordered_indices[order_index]; + if (interface_index == IFSCOPE_NONE || + (int)interface_index > if_index) { + break; + } + ifp = ifindex2ifnet[interface_index]; + if (ifp == NULL) { + continue; + } + ifnet_lock_exclusive(ifp); + TAILQ_INSERT_TAIL(&ifnet_ordered_head, ifp, if_ordered_link); + ifnet_lock_done(ifp); + if_ordered_count++; + } + + ifnet_head_done(); + + necp_update_all_clients(); + + return (error); +} + 
+int +if_set_qosmarking_mode(struct ifnet *ifp, u_int32_t mode) +{ + int error = 0; + u_int32_t old_mode = ifp->if_qosmarking_mode; + + switch (mode) { + case IFRTYPE_QOSMARKING_MODE_NONE: + ifp->if_qosmarking_mode = IFRTYPE_QOSMARKING_MODE_NONE; + ifp->if_eflags &= ~IFEF_QOSMARKING_CAPABLE; + break; + case IFRTYPE_QOSMARKING_FASTLANE: + ifp->if_qosmarking_mode = IFRTYPE_QOSMARKING_FASTLANE; + ifp->if_eflags |= IFEF_QOSMARKING_CAPABLE; + if (net_qos_policy_capable_enabled != 0) + ifp->if_eflags |= IFEF_QOSMARKING_ENABLED; + break; + default: + error = EINVAL; + break; + } + if (error == 0 && old_mode != ifp->if_qosmarking_mode) { + dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_QOS_MODE_CHANGED, + NULL, sizeof(struct kev_dl_rrc_state)); + + } + return (error); +} + +static __attribute__((noinline)) int +ifioctl_iforder(u_long cmd, caddr_t data) +{ + int error = 0; + u_int32_t *ordered_indices = NULL; + + if (data == NULL) { + return (EINVAL); + } + + switch (cmd) { + case SIOCSIFORDER: { /* struct if_order */ + struct if_order *ifo = (struct if_order *)(void *)data; + + if ((int)ifo->ifo_count > if_index) { + error = EINVAL; + break; + } + + size_t length = (ifo->ifo_count * sizeof(u_int32_t)); + if (length > 0) { + if (ifo->ifo_ordered_indices == USER_ADDR_NULL) { + error = EINVAL; + break; + } + ordered_indices = _MALLOC(length, M_NECP, M_WAITOK); + if (ordered_indices == NULL) { + error = ENOMEM; + break; + } + + error = copyin(ifo->ifo_ordered_indices, + ordered_indices, length); + if (error != 0) { + break; + } + } + + error = ifnet_reset_order(ordered_indices, ifo->ifo_count); + break; + } + + case SIOCGIFORDER: { /* struct if_order */ + struct if_order *ifo = (struct if_order *)(void *)data; + + u_int32_t ordered_count = if_ordered_count; + + if (ifo->ifo_count == 0 || + ordered_count == 0) { + ifo->ifo_count = ordered_count; + } else if (ifo->ifo_ordered_indices != USER_ADDR_NULL) { + u_int32_t count_to_copy = + MIN(ordered_count, ifo->ifo_count); + size_t 
length = (count_to_copy * sizeof(u_int32_t)); + struct ifnet *ifp = NULL; + u_int32_t cursor = 0; + + ordered_indices = _MALLOC(length, M_NECP, M_WAITOK); + if (ordered_indices == NULL) { + error = ENOMEM; + break; + } + + ifnet_head_lock_shared(); + TAILQ_FOREACH(ifp, &ifnet_ordered_head, if_ordered_link) { + if (cursor > count_to_copy) { + break; + } + ordered_indices[cursor] = ifp->if_index; + cursor++; + } + ifnet_head_done(); + + ifo->ifo_count = count_to_copy; + error = copyout(ordered_indices, + ifo->ifo_ordered_indices, length); + } else { + error = EINVAL; + } + break; + } + + default: { + VERIFY(0); + /* NOTREACHED */ + } + } + + if (ordered_indices != NULL) { + _FREE(ordered_indices, M_NECP); + } + + return (error); +} + static __attribute__((noinline)) int ifioctl_netsignature(struct ifnet *ifp, u_long cmd, caddr_t data) { @@ -1925,9 +2267,16 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p) case SIOCGIFAGENTDATA32: /* struct netagent_req32 */ case SIOCGIFAGENTDATA64: /* struct netagent_req64 */ + case SIOCGIFAGENTLIST32: /* struct netagentlist_req32 */ + case SIOCGIFAGENTLIST64: /* struct netagentlist_req64 */ error = netagent_ioctl(cmd, data); goto done; + case SIOCSIFORDER: /* struct if_order */ + case SIOCGIFORDER: /* struct if_order */ + error = ifioctl_iforder(cmd, data); + goto done; + case SIOCSIFDSTADDR: /* struct ifreq */ case SIOCSIFADDR: /* struct ifreq */ case SIOCSIFBRDADDR: /* struct ifreq */ @@ -1995,6 +2344,12 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p) case SIOCSIFPROBECONNECTIVITY: /* struct ifreq */ case SIOCGIFPROBECONNECTIVITY: /* struct ifreq */ case SIOCGSTARTDELAY: /* struct ifreq */ + case SIOCSIFTIMESTAMPENABLE: /* struct ifreq */ + case SIOCSIFTIMESTAMPDISABLE: /* struct ifreq */ + case SIOCGIFTIMESTAMPENABLED: /* struct ifreq */ +#if (DEBUG || DEVELOPMENT) + case SIOCSIFDISABLEOUTPUT: /* struct ifreq */ +#endif /* (DEBUG || DEVELOPMENT) */ case SIOCGECNMODE: /* struct ifreq 
*/ case SIOCSECNMODE: { /* struct ifreq */ struct ifreq ifr; @@ -2005,6 +2360,19 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p) bcopy(&ifr, data, sizeof (ifr)); goto done; } + case SIOCSQOSMARKINGMODE: /* struct ifreq */ + case SIOCSQOSMARKINGENABLED: /* struct ifreq */ + case SIOCGQOSMARKINGMODE: /* struct ifreq */ + case SIOCGQOSMARKINGENABLED: /* struct ifreq */ + { /* struct ifreq */ + struct ifreq ifr; + bcopy(data, &ifr, sizeof (ifr)); + ifr.ifr_name[IFNAMSIZ - 1] = '\0'; + bcopy(&ifr.ifr_name, ifname, IFNAMSIZ); + error = ifioctl_ifreq(so, cmd, &ifr, p); + bcopy(&ifr, data, sizeof (ifr)); + goto done; + } } /* @@ -2090,14 +2458,14 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p) case SIOCGIFAGENTIDS32: /* struct if_agentidsreq32 */ case SIOCGIFAGENTIDS64: /* struct if_agentidsreq64 */ bcopy(((struct if_agentidreq *)(void *)data)->ifar_name, - ifname, IFNAMSIZ); + ifname, IFNAMSIZ); ifp = ifunit(ifname); break; case SIOCSIFNETSIGNATURE: /* struct if_nsreq */ case SIOCGIFNETSIGNATURE: /* struct if_nsreq */ bcopy(((struct if_nsreq *)(void *)data)->ifnsr_name, - ifname, IFNAMSIZ); + ifname, IFNAMSIZ); ifp = ifunit(ifname); break; @@ -2244,10 +2612,10 @@ ifioctl_ifreq(struct socket *so, u_long cmd, struct ifreq *ifr, struct proc *p) switch (cmd) { case SIOCIFCREATE: case SIOCIFCREATE2: - error = proc_suser(p); - if (error) - return (error); - return (if_clone_create(ifr->ifr_name, sizeof(ifr->ifr_name), + error = proc_suser(p); + if (error) + return (error); + return (if_clone_create(ifr->ifr_name, sizeof(ifr->ifr_name), cmd == SIOCIFCREATE2 ? 
ifr->ifr_data : NULL)); case SIOCIFDESTROY: error = proc_suser(p); @@ -2344,7 +2712,7 @@ ifioctl_ifreq(struct socket *so, u_long cmd, struct ifreq *ifr, struct proc *p) ev_msg.dv[0].data_length = sizeof(struct net_event_data); ev_msg.dv[0].data_ptr = &ev_data; ev_msg.dv[1].data_length = 0; - kev_post_msg(&ev_msg); + dlil_post_complete_msg(ifp, &ev_msg); ifnet_touch_lastchange(ifp); break; @@ -2382,7 +2750,7 @@ ifioctl_ifreq(struct socket *so, u_long cmd, struct ifreq *ifr, struct proc *p) ev_msg.dv[0].data_ptr = &ev_data; ev_msg.dv[1].data_length = 0; - kev_post_msg(&ev_msg); + dlil_post_complete_msg(ifp, &ev_msg); ifnet_touch_lastchange(ifp); break; @@ -2407,7 +2775,7 @@ ifioctl_ifreq(struct socket *so, u_long cmd, struct ifreq *ifr, struct proc *p) ev_msg.dv[0].data_length = sizeof(struct net_event_data); ev_msg.dv[0].data_ptr = &ev_data; ev_msg.dv[1].data_length = 0; - kev_post_msg(&ev_msg); + dlil_post_complete_msg(ifp, &ev_msg); ifnet_touch_lastchange(ifp); break; @@ -2443,7 +2811,7 @@ ifioctl_ifreq(struct socket *so, u_long cmd, struct ifreq *ifr, struct proc *p) ev_msg.dv[0].data_length = sizeof(struct net_event_data); ev_msg.dv[0].data_ptr = &ev_data; ev_msg.dv[1].data_length = 0; - kev_post_msg(&ev_msg); + dlil_post_complete_msg(ifp, &ev_msg); ifnet_touch_lastchange(ifp); rt_ifmsg(ifp); @@ -2516,7 +2884,7 @@ ifioctl_ifreq(struct socket *so, u_long cmd, struct ifreq *ifr, struct proc *p) ev_msg.dv[0].data_length = sizeof(struct net_event_data); ev_msg.dv[0].data_ptr = &ev_data; ev_msg.dv[1].data_length = 0; - kev_post_msg(&ev_msg); + dlil_post_complete_msg(ifp, &ev_msg); ifnet_touch_lastchange(ifp); break; @@ -2567,7 +2935,7 @@ ifioctl_ifreq(struct socket *so, u_long cmd, struct ifreq *ifr, struct proc *p) break; case SIOCGIFFUNCTIONALTYPE: - ifr->ifr_functional_type = if_functional_type(ifp); + ifr->ifr_functional_type = if_functional_type(ifp, FALSE); break; case SIOCGIFPSRCADDR: @@ -2598,10 +2966,10 @@ ifioctl_ifreq(struct socket *so, u_long cmd, struct 
ifreq *ifr, struct proc *p) case SIOCGIFLINKQUALITYMETRIC: ifnet_lock_shared(ifp); - if ((ifp->if_interface_state.valid_bitmask & + if ((ifp->if_interface_state.valid_bitmask & IF_INTERFACE_STATE_LQM_STATE_VALID)) ifr->ifr_link_quality_metric = - ifp->if_interface_state.lqm_state; + ifp->if_interface_state.lqm_state; else if ((ifp->if_refflags & IFRF_ATTACHED)) { ifr->ifr_link_quality_metric = IFNET_LQM_THRESH_UNKNOWN; @@ -2628,7 +2996,7 @@ ifioctl_ifreq(struct socket *so, u_long cmd, struct ifreq *ifr, struct proc *p) ifnet_lock_shared(ifp); if (ifp->if_eflags & IFEF_EXPENSIVE) ifr->ifr_expensive = 1; - else + else ifr->ifr_expensive = 0; ifnet_lock_done(ifp); break; @@ -2643,7 +3011,7 @@ ifioctl_ifreq(struct socket *so, u_long cmd, struct ifreq *ifr, struct proc *p) ifnet_lock_exclusive(ifp); if (ifr->ifr_expensive) ifp->if_eflags |= IFEF_EXPENSIVE; - else + else ifp->if_eflags &= ~IFEF_EXPENSIVE; ifnet_lock_done(ifp); /* @@ -2654,7 +3022,7 @@ ifioctl_ifreq(struct socket *so, u_long cmd, struct ifreq *ifr, struct proc *p) TAILQ_FOREACH(difp, &ifnet_head, if_link) { ifnet_lock_exclusive(difp); if (difp->if_delegated.ifp == ifp) { - difp->if_delegated.expensive = + difp->if_delegated.expensive = ifp->if_eflags & IFEF_EXPENSIVE ? 
1 : 0; } @@ -2758,7 +3126,7 @@ ifioctl_ifreq(struct socket *so, u_long cmd, struct ifreq *ifr, struct proc *p) case SIOCGIFINTERFACESTATE: if_get_state(ifp, &ifr->ifr_interface_state); - + break; case SIOCSIFINTERFACESTATE: if ((error = priv_check_cred(kauth_cred_get(), @@ -2795,6 +3163,9 @@ ifioctl_ifreq(struct socket *so, u_long cmd, struct ifreq *ifr, struct proc *p) ifr->ifr_ecn_mode = IFRTYPE_ECN_DEFAULT; break; case SIOCSECNMODE: + if ((error = priv_check_cred(kauth_cred_get(), + PRIV_NET_INTERFACE_CONTROL, 0)) != 0) + return (error); if (ifr->ifr_ecn_mode == IFRTYPE_ECN_DEFAULT) { ifp->if_eflags &= ~(IFEF_ECN_ENABLE|IFEF_ECN_DISABLE); } else if (ifr->ifr_ecn_mode == IFRTYPE_ECN_ENABLE) { @@ -2806,6 +3177,81 @@ ifioctl_ifreq(struct socket *so, u_long cmd, struct ifreq *ifr, struct proc *p) } else error = EINVAL; break; + case SIOCSIFTIMESTAMPENABLE: + case SIOCSIFTIMESTAMPDISABLE: + error = proc_suser(p); + if (error != 0) + break; + + ifnet_lock_exclusive(ifp); + if ((cmd == SIOCSIFTIMESTAMPENABLE && + (ifp->if_xflags & IFXF_TIMESTAMP_ENABLED) != 0) || + (cmd == SIOCSIFTIMESTAMPDISABLE && + (ifp->if_xflags & IFXF_TIMESTAMP_ENABLED) == 0)) { + ifnet_lock_done(ifp); + break; + } + if (cmd == SIOCSIFTIMESTAMPENABLE) + ifp->if_xflags |= IFXF_TIMESTAMP_ENABLED; + else + ifp->if_xflags &= ~IFXF_TIMESTAMP_ENABLED; + ifnet_lock_done(ifp); + /* + * Pass the setting to the interface if it supports either + * software or hardware time stamping + */ + if (ifp->if_capabilities & (IFCAP_HW_TIMESTAMP | + IFCAP_SW_TIMESTAMP)) { + error = ifnet_ioctl(ifp, SOCK_DOM(so), cmd, + (caddr_t)ifr); + } + break; + case SIOCGIFTIMESTAMPENABLED: { + if ((ifp->if_xflags & IFXF_TIMESTAMP_ENABLED) != 0) + ifr->ifr_intval = 1; + else + ifr->ifr_intval = 0; + break; + } + case SIOCSQOSMARKINGMODE: + if ((error = priv_check_cred(kauth_cred_get(), + PRIV_NET_INTERFACE_CONTROL, 0)) != 0) + return (error); + error = if_set_qosmarking_mode(ifp, ifr->ifr_qosmarking_mode); + break; + + case 
SIOCGQOSMARKINGMODE: + ifr->ifr_qosmarking_mode = ifp->if_qosmarking_mode; + break; + + case SIOCSQOSMARKINGENABLED: + if ((error = priv_check_cred(kauth_cred_get(), + PRIV_NET_INTERFACE_CONTROL, 0)) != 0) + return (error); + if (ifr->ifr_qosmarking_enabled != 0) + ifp->if_eflags |= IFEF_QOSMARKING_ENABLED; + else + ifp->if_eflags &= ~IFEF_QOSMARKING_ENABLED; + break; + + case SIOCGQOSMARKINGENABLED: + ifr->ifr_qosmarking_enabled = + (ifp->if_eflags & IFEF_QOSMARKING_ENABLED) ? 1 : 0; + break; + + case SIOCSIFDISABLEOUTPUT: +#if (DEBUG || DEVELOPMENT) + if (ifr->ifr_disable_output == 1) { + error = ifnet_disable_output(ifp); + } else if (ifr->ifr_disable_output == 0) { + error = ifnet_enable_output(ifp); + } else { + error = EINVAL; + } +#else + error = EINVAL; +#endif /* (DEBUG || DEVELOPMENT) */ + break; default: VERIFY(0); /* NOTREACHED */ @@ -2822,9 +3268,9 @@ ifioctllocked(struct socket *so, u_long cmd, caddr_t data, struct proc *p) socket_unlock(so, 0); error = ifioctl(so, cmd, data, p); socket_lock(so, 0); - return(error); + return (error); } - + /* * Set/clear promiscuous mode on interface ifp based on the truth value * of pswitch. The calls are reference counted so that only the first @@ -2843,15 +3289,15 @@ ifnet_set_promiscuous( ifnet_lock_exclusive(ifp); oldflags = ifp->if_flags; ifp->if_pcount += pswitch ? 1 : -1; - + if (ifp->if_pcount > 0) ifp->if_flags |= IFF_PROMISC; else ifp->if_flags &= ~IFF_PROMISC; - + newflags = ifp->if_flags; ifnet_lock_done(ifp); - + if (newflags != oldflags && (newflags & IFF_UP) != 0) { error = ifnet_ioctl(ifp, 0, SIOCSIFFLAGS, NULL); if (error == 0) { @@ -2867,14 +3313,14 @@ ifnet_set_promiscuous( ifnet_lock_done(ifp); } } - + if (newflags != oldflags) { log(LOG_INFO, "%s: promiscuous mode %s%s\n", if_name(ifp), (newflags & IFF_PROMISC) != 0 ? "enable" : "disable", error != 0 ? 
" failed" : " succeeded"); } - return error; + return (error); } /* @@ -3037,7 +3483,7 @@ if_allmulti(struct ifnet *ifp, int onswitch) if (error == 0) rt_ifmsg(ifp); - return error; + return (error); } static struct ifmultiaddr * @@ -3293,7 +3739,7 @@ if_detach_ifma(struct ifnet *ifp, struct ifmultiaddr *ifma, int anon) } /* - * Find an ifmultiaddr that matches a socket address on an interface. + * Find an ifmultiaddr that matches a socket address on an interface. * * Caller is responsible for holding the ifnet_lock while calling * this function. @@ -3305,9 +3751,9 @@ if_addmulti_doesexist(struct ifnet *ifp, const struct sockaddr *sa, struct ifmultiaddr *ifma; for (ifma = LIST_FIRST(&ifp->if_multiaddrs); ifma != NULL; - ifma = LIST_NEXT(ifma, ifma_link)) { + ifma = LIST_NEXT(ifma, ifma_link)) { IFMA_LOCK_SPIN(ifma); - if (!equal(sa, ifma->ifma_addr)) { + if (!ifa_equal(sa, ifma->ifma_addr)) { IFMA_UNLOCK(ifma); continue; } @@ -3339,7 +3785,7 @@ if_addmulti_doesexist(struct ifnet *ifp, const struct sockaddr *sa, /* * Radar 3642395, make sure all multicasts are in a standard format. 
*/ -static struct sockaddr* +static struct sockaddr * copy_and_normalize(const struct sockaddr *original) { int alen = 0; @@ -3351,7 +3797,7 @@ copy_and_normalize(const struct sockaddr *original) if (original->sa_family != AF_LINK && original->sa_family != AF_UNSPEC) { /* Just make a copy */ - MALLOC(copy, struct sockaddr*, original->sa_len, + MALLOC(copy, struct sockaddr *, original->sa_len, M_IFADDR, M_WAITOK); if (copy != NULL) bcopy(original, copy, original->sa_len); @@ -3361,7 +3807,7 @@ copy_and_normalize(const struct sockaddr *original) switch (original->sa_family) { case AF_LINK: { const struct sockaddr_dl *sdl_original = - (struct sockaddr_dl*)(uintptr_t)(size_t)original; + (struct sockaddr_dl *)(uintptr_t)(size_t)original; if (sdl_original->sdl_nlen + sdl_original->sdl_alen + sdl_original->sdl_slen + @@ -3381,7 +3827,7 @@ copy_and_normalize(const struct sockaddr *original) } alen = ETHER_ADDR_LEN; - aptr = (const u_char*)original->sa_data; + aptr = (const u_char *)original->sa_data; } break; } @@ -3390,7 +3836,7 @@ copy_and_normalize(const struct sockaddr *original) return (NULL); len = alen + offsetof(struct sockaddr_dl, sdl_data); - MALLOC(sdl_new, struct sockaddr_dl*, len, M_IFADDR, M_WAITOK); + MALLOC(sdl_new, struct sockaddr_dl *, len, M_IFADDR, M_WAITOK); if (sdl_new != NULL) { bzero(sdl_new, len); @@ -3400,7 +3846,7 @@ copy_and_normalize(const struct sockaddr *original) bcopy(aptr, LLADDR(sdl_new), alen); } - return ((struct sockaddr*)sdl_new); + return ((struct sockaddr *)sdl_new); } /* @@ -3665,9 +4111,9 @@ if_delmulti_common(struct ifmultiaddr *ifma, struct ifnet *ifp, ifnet_lock_exclusive(ifp); if (ifma == NULL) { for (ifma = LIST_FIRST(&ifp->if_multiaddrs); ifma != NULL; - ifma = LIST_NEXT(ifma, ifma_link)) { + ifma = LIST_NEXT(ifma, ifma_link)) { IFMA_LOCK(ifma); - if (!equal(sa, ifma->ifma_addr) || + if (!ifa_equal(sa, ifma->ifma_addr) || (anon && !(ifma->ifma_flags & IFMAF_ANONYMOUS))) { VERIFY(!(ifma->ifma_flags & IFMAF_ANONYMOUS) || 
ifma->ifma_anoncnt != 0); @@ -3747,12 +4193,12 @@ if_down_all(void) ifnet_list_free(ifp); } - return 0; + return (0); } /* * Delete Routes for a Network Interface - * + * * Called for each routing entry via the rnh->rnh_walktree() call above * to delete all route entries referencing a detaching network interface. * @@ -3840,8 +4286,8 @@ if_rtmtu(struct radix_node *rn, void *arg) * associated with a particular interface; this is called when the * MTU of that interface has changed. */ -static -void if_rtmtu_update(struct ifnet *ifp) +static void +if_rtmtu_update(struct ifnet *ifp) { struct radix_node_head *rnh; int p; @@ -3862,8 +4308,8 @@ if_data_internal_to_if_data(struct ifnet *ifp, const struct if_data_internal *if_data_int, struct if_data *if_data) { #pragma unused(ifp) -#define COPYFIELD(fld) if_data->fld = if_data_int->fld -#define COPYFIELD32(fld) if_data->fld = (u_int32_t)(if_data_int->fld) +#define COPYFIELD(fld) if_data->fld = if_data_int->fld +#define COPYFIELD32(fld) if_data->fld = (u_int32_t)(if_data_int->fld) /* compiler will cast down to 32-bit */ #define COPYFIELD32_ATOMIC(fld) do { \ atomic_get_64(if_data->fld, \ @@ -3921,8 +4367,8 @@ if_data_internal_to_if_data64(struct ifnet *ifp, struct if_data64 *if_data64) { #pragma unused(ifp) -#define COPYFIELD64(fld) if_data64->fld = if_data_int->fld -#define COPYFIELD64_ATOMIC(fld) do { \ +#define COPYFIELD64(fld) if_data64->fld = if_data_int->fld +#define COPYFIELD64_ATOMIC(fld) do { \ atomic_get_64(if_data64->fld, \ (u_int64_t *)(void *)(uintptr_t)&if_data_int->fld); \ } while (0) @@ -3951,8 +4397,9 @@ if_data_internal_to_if_data64(struct ifnet *ifp, COPYFIELD64_ATOMIC(ifi_iqdrops); COPYFIELD64_ATOMIC(ifi_noproto); - /* Note these two fields are actually 32 bit, so doing COPYFIELD64_ATOMIC will - * cause them to be misaligned + /* + * Note these two fields are actually 32 bit, so doing + * COPYFIELD64_ATOMIC will cause them to be misaligned */ COPYFIELD64(ifi_recvtiming); COPYFIELD64(ifi_xmittiming); @@ 
-3969,7 +4416,7 @@ __private_extern__ void if_copy_traffic_class(struct ifnet *ifp, struct if_traffic_class *if_tc) { -#define COPY_IF_TC_FIELD64_ATOMIC(fld) do { \ +#define COPY_IF_TC_FIELD64_ATOMIC(fld) do { \ atomic_get_64(if_tc->fld, \ (u_int64_t *)(void *)(uintptr_t)&ifp->if_tc.fld); \ } while (0) @@ -4002,7 +4449,7 @@ if_copy_traffic_class(struct ifnet *ifp, void if_copy_data_extended(struct ifnet *ifp, struct if_data_extended *if_de) { -#define COPY_IF_DE_FIELD64_ATOMIC(fld) do { \ +#define COPY_IF_DE_FIELD64_ATOMIC(fld) do { \ atomic_get_64(if_de->fld, \ (u_int64_t *)(void *)(uintptr_t)&ifp->if_data.fld); \ } while (0) @@ -4019,12 +4466,12 @@ if_copy_data_extended(struct ifnet *ifp, struct if_data_extended *if_de) void if_copy_packet_stats(struct ifnet *ifp, struct if_packet_stats *if_ps) { -#define COPY_IF_PS_TCP_FIELD64_ATOMIC(fld) do { \ +#define COPY_IF_PS_TCP_FIELD64_ATOMIC(fld) do { \ atomic_get_64(if_ps->ifi_tcp_##fld, \ (u_int64_t *)(void *)(uintptr_t)&ifp->if_tcp_stat->fld); \ } while (0) -#define COPY_IF_PS_UDP_FIELD64_ATOMIC(fld) do { \ +#define COPY_IF_PS_UDP_FIELD64_ATOMIC(fld) do { \ atomic_get_64(if_ps->ifi_udp_##fld, \ (u_int64_t *)(void *)(uintptr_t)&ifp->if_udp_stat->fld); \ } while (0) @@ -4227,6 +4674,8 @@ ifioctl_cassert(void) case SIOCSETROUTERMODE_IN6: case SIOCLL_CGASTART_32: case SIOCLL_CGASTART_64: + case SIOCGIFCGAPREP_IN6: + case SIOCSIFCGAPREP_IN6: #endif /* INET6 */ /* bsd/sys/sockio.h */ @@ -4246,6 +4695,7 @@ ifioctl_cassert(void) case SIOCSIFMETRIC: case SIOCDIFADDR: case SIOCAIFADDR: + case SIOCGIFADDR: case SIOCGIFDSTADDR: case SIOCGIFBRDADDR: @@ -4255,6 +4705,7 @@ ifioctl_cassert(void) case SIOCAUTOADDR: case SIOCAUTONETMASK: case SIOCARPIPLL: + case SIOCADDMULTI: case SIOCDELMULTI: case SIOCGIFMTU: @@ -4267,32 +4718,41 @@ ifioctl_cassert(void) case SIOCSIFGENERIC: case SIOCGIFGENERIC: case SIOCRSLVMULTI: + case SIOCSIFLLADDR: case SIOCGIFSTATUS: case SIOCSIFPHYADDR: case SIOCGIFPSRCADDR: case SIOCGIFPDSTADDR: case 
SIOCDIFPHYADDR: + case SIOCGIFDEVMTU: case SIOCSIFALTMTU: case SIOCGIFALTMTU: case SIOCSIFBOND: case SIOCGIFBOND: + case SIOCPROTOATTACH: case SIOCPROTODETACH: + case SIOCSIFCAP: case SIOCGIFCAP: + case SIOCIFCREATE: case SIOCIFDESTROY: case SIOCIFCREATE2: + case SIOCSDRVSPEC32: case SIOCGDRVSPEC32: case SIOCSDRVSPEC64: case SIOCGDRVSPEC64: + case SIOCSIFVLAN: case SIOCGIFVLAN: + case SIOCIFGCLONERS32: case SIOCIFGCLONERS64: + case SIOCGIFASYNCMAP: case SIOCSIFASYNCMAP: #if CONFIG_MACF_NET @@ -4301,7 +4761,9 @@ ifioctl_cassert(void) #endif /* CONFIG_MACF_NET */ case SIOCSIFKPI: case SIOCGIFKPI: + case SIOCGIFWAKEFLAGS: + case SIOCGIFGETRTREFCNT: case SIOCGIFLINKQUALITYMETRIC: case SIOCSIFOPPORTUNISTIC: @@ -4315,24 +4777,53 @@ ifioctl_cassert(void) case SIOCGIFQUEUESTATS: case SIOCSIFTHROTTLE: case SIOCGIFTHROTTLE: + + case SIOCGASSOCIDS32: + case SIOCGASSOCIDS64: + case SIOCGCONNIDS32: + case SIOCGCONNIDS64: + case SIOCGCONNINFO32: + case SIOCGCONNINFO64: + case SIOCSCONNORDER: + case SIOCGCONNORDER: + case SIOCSIFLOG: case SIOCGIFLOG: case SIOCGIFDELEGATE: case SIOCGIFLLADDR: case SIOCGIFTYPE: - case SIOCGIFFUNCTIONALTYPE: + case SIOCGIFEXPENSIVE: + case SIOCSIFEXPENSIVE: + case SIOCGIF2KCL: + case SIOCSIF2KCL: + case SIOCGSTARTDELAY: + case SIOCAIFAGENTID: case SIOCDIFAGENTID: case SIOCGIFAGENTIDS32: case SIOCGIFAGENTIDS64: case SIOCGIFAGENTDATA32: case SIOCGIFAGENTDATA64: + case SIOCGIFAGENTLIST32: + case SIOCGIFAGENTLIST64: + + case SIOCSIFINTERFACESTATE: case SIOCGIFINTERFACESTATE: case SIOCSIFPROBECONNECTIVITY: case SIOCGIFPROBECONNECTIVITY: + + case SIOCGIFFUNCTIONALTYPE: + case SIOCSIFNETSIGNATURE: + case SIOCGIFNETSIGNATURE: + case SIOCGECNMODE: case SIOCSECNMODE: + + case SIOCSQOSMARKINGMODE: + case SIOCSQOSMARKINGENABLED: + case SIOCGQOSMARKINGMODE: + case SIOCGQOSMARKINGENABLED: ; } } diff --git a/bsd/net/if.h b/bsd/net/if.h index c2c99314b..6954fc003 100644 --- a/bsd/net/if.h +++ b/bsd/net/if.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2015 Apple Inc. 
All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -64,47 +64,13 @@ #define _NET_IF_H_ #include +#include #define IF_NAMESIZE 16 #if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) #include #ifdef __APPLE__ -/* - * Define Data-Link event subclass, and associated - * events. - */ - -#define KEV_DL_SUBCLASS 2 - -#define KEV_DL_SIFFLAGS 1 -#define KEV_DL_SIFMETRICS 2 -#define KEV_DL_SIFMTU 3 -#define KEV_DL_SIFPHYS 4 -#define KEV_DL_SIFMEDIA 5 -#define KEV_DL_SIFGENERIC 6 -#define KEV_DL_ADDMULTI 7 -#define KEV_DL_DELMULTI 8 -#define KEV_DL_IF_ATTACHED 9 -#define KEV_DL_IF_DETACHING 10 -#define KEV_DL_IF_DETACHED 11 -#define KEV_DL_LINK_OFF 12 -#define KEV_DL_LINK_ON 13 -#define KEV_DL_PROTO_ATTACHED 14 -#define KEV_DL_PROTO_DETACHED 15 -#define KEV_DL_LINK_ADDRESS_CHANGED 16 -#define KEV_DL_WAKEFLAGS_CHANGED 17 -#define KEV_DL_IF_IDLE_ROUTE_REFCNT 18 -#define KEV_DL_IFCAP_CHANGED 19 -#define KEV_DL_LINK_QUALITY_METRIC_CHANGED 20 -#define KEV_DL_NODE_PRESENCE 21 -#define KEV_DL_NODE_ABSENCE 22 -#define KEV_DL_MASTER_ELECTED 23 -#define KEV_DL_ISSUES 24 -#define KEV_DL_IFDELEGATE_CHANGED 25 -#define KEV_DL_AWDL_RESTRICTED 26 -#define KEV_DL_AWDL_UNRESTRICTED 27 -#define KEV_DL_RRC_STATE_CHANGED 28 #include #include @@ -113,7 +79,8 @@ #ifdef PRIVATE #include #include -#endif +#include +#endif /* PRIVATE */ #endif struct if_clonereq { @@ -162,6 +129,7 @@ struct if_clonereq32 { #define IFEF_ENQUEUE_MULTI 0x00000002 /* enqueue multiple packets at once */ #define IFEF_DELAY_START 0x00000004 /* delay start callback */ #define IFEF_PROBE_CONNECTIVITY 0x00000008 /* Probe connections going over this interface */ +#define IFEF_QOSMARKING_CAPABLE 0x00000010 /* XXX Obsolete, to be removed */ #define IFEF_IPV6_DISABLED 0x00000020 /* coupled to ND6_IFF_IFDISABLED */ #define IFEF_ACCEPT_RTADV 0x00000040 /* accepts IPv6 RA on the interface */ #define IFEF_TXSTART 0x00000080 /* has start callback */ 
@@ -169,7 +137,13 @@ struct if_clonereq32 { #define IFEF_VLAN 0x00000200 /* interface has one or more vlans */ #define IFEF_BOND 0x00000400 /* interface is part of bond */ #define IFEF_ARPLL 0x00000800 /* ARP for IPv4LL addresses */ -#define IFEF_NOWINDOWSCALE 0x00001000 /* Don't scale TCP window on iface */ +/* #define IFEF_NOWINDOWSCALE 0x00001000 */ /* Don't scale TCP window on iface */ +/* + * XXX IFEF_NOAUTOIPV6LL is deprecated and should be done away with. + * Configd pretty much manages the interface configuration. + * Rather than looking at the flag we check if a specific LLA + * has to be configured or the IID has to be generated by kernel. + */ #define IFEF_NOAUTOIPV6LL 0x00002000 /* Need explicit IPv6 LL address */ #define IFEF_EXPENSIVE 0x00004000 /* Data access has a cost */ #define IFEF_IPV4_ROUTER 0x00008000 /* interior when in IPv4 router mode */ @@ -184,11 +158,20 @@ struct if_clonereq32 { #define IFEF_2KCL 0x00800000 /* prefers 2K cluster (socket based tunnel) */ #define IFEF_ECN_ENABLE 0x01000000 /* use ECN for TCP connections on the interface */ #define IFEF_ECN_DISABLE 0x02000000 /* do not use ECN for TCP connections on the interface */ +#define IFEF_SKYWALK_NATIVE 0x04000000 /* Native Skywalk support */ +#define IFEF_3CA 0x08000000 /* Capable of 3CA */ #define IFEF_SENDLIST 0x10000000 /* Supports tx packet lists */ #define IFEF_DIRECTLINK 0x20000000 /* point-to-point topology */ -#define _IFEF_INUSE 0x40000000 /* deprecated */ +#define IFEF_QOSMARKING_ENABLED 0x40000000 /* QoS marking is enabled */ #define IFEF_UPDOWNCHANGE 0x80000000 /* up/down state is changing */ + #ifdef XNU_KERNEL_PRIVATE +/* + * Extra flags + */ +#define IFXF_WAKE_ON_MAGIC_PACKET 0x00000001 /* wake on magic packet */ +#define IFXF_TIMESTAMP_ENABLED 0x00000002 /* time stamping enabled */ + /* * Current requirements for an AWDL interface. Setting/clearing IFEF_AWDL * will also trigger the setting/clearing of the rest of the flags.
Once @@ -247,12 +230,16 @@ struct if_clonereq32 { #define IFCAP_LRO 0x00080 /* can do Large Receive Offload */ #define IFCAP_AV 0x00100 /* can do 802.1 AV Bridging */ #define IFCAP_TXSTATUS 0x00200 /* can return linklevel xmit status */ +#define IFCAP_SKYWALK 0x00400 /* Skywalk mode supported/enabled */ +#define IFCAP_HW_TIMESTAMP 0x00800 /* Time stamping in hardware */ +#define IFCAP_SW_TIMESTAMP 0x01000 /* Time stamping in software */ #define IFCAP_HWCSUM (IFCAP_RXCSUM | IFCAP_TXCSUM) #define IFCAP_TSO (IFCAP_TSO4 | IFCAP_TSO6) #define IFCAP_VALID (IFCAP_HWCSUM | IFCAP_TSO | IFCAP_LRO | IFCAP_VLAN_MTU | \ - IFCAP_VLAN_HWTAGGING | IFCAP_JUMBO_MTU | IFCAP_AV | IFCAP_TXSTATUS) + IFCAP_VLAN_HWTAGGING | IFCAP_JUMBO_MTU | IFCAP_AV | IFCAP_TXSTATUS | \ + IFCAP_SKYWALK | IFCAP_SW_TIMESTAMP | IFCAP_HW_TIMESTAMP) #define IFQ_MAXLEN 128 #define IFNET_SLOWHZ 1 /* granularity is 1 second */ @@ -471,6 +458,7 @@ struct ifreq { #define IFRTYPE_SUBFAMILY_WIFI 3 #define IFRTYPE_SUBFAMILY_THUNDERBOLT 4 #define IFRTYPE_SUBFAMILY_RESERVED 5 +#define IFRTYPE_SUBFAMILY_INTCOPROC 6 } ifru_type; u_int32_t ifru_functional_type; #define IFRTYPE_FUNCTIONAL_UNKNOWN 0 @@ -490,8 +478,13 @@ struct ifreq { u_int32_t ifru_probe_connectivity; u_int32_t ifru_ecn_mode; #define IFRTYPE_ECN_DEFAULT 0 -#define IFRTYPE_ECN_ENABLE 1 -#define IFRTYPE_ECN_DISABLE 2 +#define IFRTYPE_ECN_ENABLE 1 +#define IFRTYPE_ECN_DISABLE 2 + u_int32_t ifru_qosmarking_mode; +#define IFRTYPE_QOSMARKING_MODE_NONE 0 +#define IFRTYPE_QOSMARKING_FASTLANE 1 + u_int32_t ifru_qosmarking_enabled; + u_int32_t ifru_disable_output; #endif /* PRIVATE */ } ifr_ifru; #define ifr_addr ifr_ifru.ifru_addr /* address */ @@ -535,6 +528,12 @@ struct ifreq { #define ifr_interface_state ifr_ifru.ifru_interface_state #define ifr_probe_connectivity ifr_ifru.ifru_probe_connectivity #define ifr_ecn_mode ifr_ifru.ifru_ecn_mode +#define ifr_qosmarking_mode ifr_ifru.ifru_qosmarking_mode +#define ifr_fastlane_capable ifr_qosmarking_mode +#define 
ifr_qosmarking_enabled ifr_ifru.ifru_qosmarking_enabled +#define ifr_fastlane_enabled ifr_qosmarking_enabled +#define ifr_disable_output ifr_ifru.ifru_disable_output + #endif /* PRIVATE */ }; @@ -741,8 +740,9 @@ struct if_descreq { enum { IFNET_SCHED_MODEL_NORMAL = 0, IFNET_SCHED_MODEL_DRIVER_MANAGED = 1, + IFNET_SCHED_MODEL_FQ_CODEL = 2, #ifdef XNU_KERNEL_PRIVATE - IFNET_SCHED_MODEL_MAX = 2, + IFNET_SCHED_MODEL_MAX = 3, #endif /* XNU_KERNEL_PRIVATE */ }; @@ -912,6 +912,47 @@ struct if_nsreq { u_int16_t ifnsr_flags; /* for future */ u_int8_t ifnsr_data[IFNET_SIGNATURELEN]; }; + +/* + * Structure for SIOC[S/G]IFORDER + * + * When setting, ifo_count is the number of u_int32_t interface indices + * in the ifo_ordered_indices array. + * + * When getting, if ifo_count is 0, the length of the ordered list will + * be returned. If the ifo_count is non-0, it is the number of u_int32_t + * interface indices allocated. Upon return, ifo_count will contain the number + * of indices copied into the array. + */ +struct if_order { + u_int32_t ifo_count; + u_int32_t ifo_reserved; + mach_vm_address_t ifo_ordered_indices; /* array of u_int32_t */ +}; + +/* + * Struct for traffic class to DSCP mapping + */ +struct if_tdmreq { + char iftdm_name[IFNAMSIZ]; + u_int32_t iftdm_len; /* byte length of the table */ + struct netsvctype_dscp_map *iftdm_table; +}; + +#ifdef BSD_KERNEL_PRIVATE +struct if_tdmreq32 { + char iftdm_name[IFNAMSIZ]; + u_int32_t iftdm_len; /* byte length of the table */ + user32_addr_t iftdm_table; +}; + +struct if_tdmreq64 { + char iftdm_name[IFNAMSIZ]; + u_int32_t iftdm_len; /* byte length of the table */ + user64_addr_t iftdm_table __attribute__((aligned(8))); +}; +#endif + #endif /* PRIVATE */ #ifdef KERNEL diff --git a/bsd/net/if_arp.h b/bsd/net/if_arp.h index 5ea113253..45bb05088 100644 --- a/bsd/net/if_arp.h +++ b/bsd/net/if_arp.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2012 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. 
All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -143,6 +143,7 @@ struct arpstat { /* General statistics */ uint32_t inuse; /* # of ARP entries in routing table */ uint32_t txurequests; /* # of ARP requests sent (unicast) */ + uint32_t held; /* # of packets held waiting for a reply */ }; #ifdef BSD_KERNEL_PRIVATE diff --git a/bsd/net/if_ipsec.c b/bsd/net/if_ipsec.c index f8953609d..9e98a05b9 100644 --- a/bsd/net/if_ipsec.c +++ b/bsd/net/if_ipsec.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012-2014 Apple Inc. All rights reserved. + * Copyright (c) 2012-2015 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -36,8 +36,6 @@ #include #include #include -#include -#include #include #include #include @@ -50,6 +48,11 @@ #include #include #include +#include +#include + +extern int net_qos_policy_restricted; +extern int net_qos_policy_restrict_avapps; /* Kernel Control functions */ static errno_t ipsec_ctl_connect(kern_ctl_ref kctlref, struct sockaddr_ctl *sac, @@ -85,42 +88,15 @@ static errno_t ipsec_proto_pre_output(ifnet_t interface, protocol_family_t proto static kern_ctl_ref ipsec_kctlref; static u_int32_t ipsec_family; -static OSMallocTag ipsec_malloc_tag; -static SInt32 ipsec_ifcount = 0; #define IPSECQ_MAXLEN 256 -/* Prepend length */ -static void* -ipsec_alloc(size_t size) -{ - size_t *mem = OSMalloc(size + sizeof(size_t), ipsec_malloc_tag); - - if (mem) { - *mem = size + sizeof(size_t); - mem++; - } - - return (void*)mem; -} - -static void -ipsec_free(void *ptr) -{ - size_t *size = ptr; - size--; - OSFree(size, *size, ipsec_malloc_tag); -} - errno_t ipsec_register_control(void) { struct kern_ctl_reg kern_ctl; errno_t result = 0; - /* Create a tag to allocate memory */ - ipsec_malloc_tag = OSMalloc_Tagalloc(IPSEC_CONTROL_NAME, OSMT_DEFAULT); - /* Find a unique value for our interface family */ result = mbuf_tag_id_find(IPSEC_CONTROL_NAME, &ipsec_family); if (result != 0) { @@ -202,12 +178,9 @@ 
ipsec_ctl_connect(kern_ctl_ref kctlref, struct ifnet_stats_param stats; /* kernel control allocates, interface frees */ - pcb = ipsec_alloc(sizeof(*pcb)); - if (pcb == NULL) - return ENOMEM; - + MALLOC(pcb, struct ipsec_pcb *, sizeof(*pcb), M_DEVBUF, M_WAITOK | M_ZERO); + /* Setup the protocol control block */ - bzero(pcb, sizeof(*pcb)); *unitinfo = pcb; pcb->ipsec_ctlref = kctlref; pcb->ipsec_unit = sac->sc_unit; @@ -234,10 +207,10 @@ ipsec_ctl_connect(kern_ctl_ref kctlref, result = ifnet_allocate_extended(&ipsec_init, &pcb->ipsec_ifp); if (result != 0) { printf("ipsec_ctl_connect - ifnet_allocate failed: %d\n", result); - ipsec_free(pcb); + *unitinfo = NULL; + FREE(pcb, M_DEVBUF); return result; } - OSIncrementAtomic(&ipsec_ifcount); /* Set flags and additional information. */ ifnet_set_mtu(pcb->ipsec_ifp, 1500); @@ -257,16 +230,15 @@ ipsec_ctl_connect(kern_ctl_ref kctlref, if (result != 0) { printf("ipsec_ctl_connect - ifnet_allocate failed: %d\n", result); ifnet_release(pcb->ipsec_ifp); - ipsec_free(pcb); - } - - /* Attach to bpf */ - if (result == 0) + *unitinfo = NULL; + FREE(pcb, M_DEVBUF); + } else { + /* Attach to bpf */ bpfattach(pcb->ipsec_ifp, DLT_NULL, 4); - /* The interfaces resoures allocated, mark it as running */ - if (result == 0) + /* The interfaces resoures allocated, mark it as running */ ifnet_set_flags(pcb->ipsec_ifp, IFF_RUNNING, IFF_RUNNING); + } return result; } @@ -426,9 +398,14 @@ ipsec_ctl_disconnect(__unused kern_ctl_ref kctlref, void *unitinfo) { struct ipsec_pcb *pcb = unitinfo; - ifnet_t ifp = pcb->ipsec_ifp; + ifnet_t ifp = NULL; errno_t result = 0; - + + if (pcb == NULL) + return EINVAL; + + ifp = pcb->ipsec_ifp; + VERIFY(ifp != NULL); pcb->ipsec_ctlref = NULL; pcb->ipsec_unit = 0; @@ -438,7 +415,7 @@ ipsec_ctl_disconnect(__unused kern_ctl_ref kctlref, * addresses and detach the protocols. Finally, we can remove and * release the interface. 
*/ - key_delsp_for_ipsec_if(ifp); + key_delsp_for_ipsec_if(ifp); ipsec_cleanup_family(ifp, AF_INET); ipsec_cleanup_family(ifp, AF_INET6); @@ -536,6 +513,10 @@ ipsec_ctl_setopt(__unused kern_ctl_ref kctlref, result = ifnet_find_by_name(name, &del_ifp); } if (result == 0) { + printf("%s IPSEC_OPT_SET_DELEGATE_INTERFACE %s to %s\n", + __func__, pcb->ipsec_ifp->if_xname, + del_ifp->if_xname); + result = ifnet_set_delegate(pcb->ipsec_ifp, del_ifp); if (del_ifp) ifnet_release(del_ifp); @@ -554,6 +535,9 @@ ipsec_ctl_setopt(__unused kern_ctl_ref kctlref, } else { pcb->ipsec_output_service_class = output_service_class; } + printf("%s IPSEC_OPT_OUTPUT_TRAFFIC_CLASS %s svc %d\n", + __func__, pcb->ipsec_ifp->if_xname, + pcb->ipsec_output_service_class); break; } @@ -696,6 +680,7 @@ ipsec_output(ifnet_t interface, ipoa.ipoa_boundif = ipsec_state.outgoing_if; ipoa.ipoa_flags |= IPOAF_BOUND_IF; } + ipsec_set_ipoa_for_interface(pcb->ipsec_ifp, &ipoa); adv = &ipoa.ipoa_flowadv; @@ -752,11 +737,12 @@ ipsec_output(ifnet_t interface, bzero(&ip6oa, sizeof(ip6oa)); ip6oa.ip6oa_flowadv.code = 0; - ip6oa.ip6oa_flags = IPOAF_SELECT_SRCIF | IPOAF_BOUND_SRCADDR; + ip6oa.ip6oa_flags = IP6OAF_SELECT_SRCIF | IP6OAF_BOUND_SRCADDR; if (ipsec_state.outgoing_if) { ip6oa.ip6oa_boundif = ipsec_state.outgoing_if; - ip6oa.ip6oa_flags |= IPOAF_BOUND_IF; + ip6oa.ip6oa_flags |= IP6OAF_BOUND_IF; } + ipsec_set_ip6oa_for_interface(pcb->ipsec_ifp, &ip6oa); adv = &ip6oa.ip6oa_flowadv; @@ -886,9 +872,6 @@ ipsec_detached( struct ipsec_pcb *pcb = ifnet_softc(interface); ifnet_release(pcb->ipsec_ifp); - ipsec_free(pcb); - - OSDecrementAtomic(&ipsec_ifcount); } /* Protocol Handlers */ @@ -909,7 +892,8 @@ ipsec_proto_input(ifnet_t interface, mbuf_pkthdr_setrcvif(m, interface); bpf_tap_in(interface, DLT_NULL, m, &af, sizeof(af)); - + pktap_input(interface, protocol, m, NULL); + if (proto_input(protocol, m) != 0) { ifnet_stat_increment_in(interface, 0, 0, 1); m_freem(m); @@ -988,3 +972,45 @@ 
ipsec_set_pkthdr_for_interface(ifnet_t interface, mbuf_t packet, int family) } } } + +void +ipsec_set_ipoa_for_interface(ifnet_t interface, struct ip_out_args *ipoa) +{ /* Set the QoS marking flag and traffic class of *ipoa from the interface's output service class; no-op if either argument is NULL. */ + struct ipsec_pcb *pcb; + + if (interface == NULL || ipoa == NULL) + return; + pcb = ifnet_softc(interface); + + if (net_qos_policy_restricted == 0) { /* unrestricted policy: allow marking, class mapped via so_svc2tc() */ + ipoa->ipoa_flags |= IPOAF_QOSMARKING_ALLOWED; + ipoa->ipoa_sotc = so_svc2tc(pcb->ipsec_output_service_class); + } else if (pcb->ipsec_output_service_class != MBUF_SC_VO || + net_qos_policy_restrict_avapps != 0) { + ipoa->ipoa_flags &= ~IPOAF_QOSMARKING_ALLOWED; + } else { /* restricted policy, MBUF_SC_VO class, AV apps not restricted; ip_out_args takes IPOAF_* flags */ + ipoa->ipoa_flags |= IPOAF_QOSMARKING_ALLOWED; + ipoa->ipoa_sotc = SO_TC_VO; + } +} + +void +ipsec_set_ip6oa_for_interface(ifnet_t interface, struct ip6_out_args *ip6oa) +{ /* IPv6 counterpart: set the QoS marking flag and traffic class of *ip6oa; ip6_out_args takes IP6OAF_* flags. */ + struct ipsec_pcb *pcb; + + if (interface == NULL || ip6oa == NULL) + return; + pcb = ifnet_softc(interface); + + if (net_qos_policy_restricted == 0) { /* unrestricted policy: allow marking, class mapped via so_svc2tc() */ + ip6oa->ip6oa_flags |= IP6OAF_QOSMARKING_ALLOWED; + ip6oa->ip6oa_sotc = so_svc2tc(pcb->ipsec_output_service_class); + } else if (pcb->ipsec_output_service_class != MBUF_SC_VO || + net_qos_policy_restrict_avapps != 0) { + ip6oa->ip6oa_flags &= ~IP6OAF_QOSMARKING_ALLOWED; + } else { /* restricted policy, MBUF_SC_VO class, AV apps not restricted */ + ip6oa->ip6oa_flags |= IP6OAF_QOSMARKING_ALLOWED; + ip6oa->ip6oa_sotc = SO_TC_VO; + } +} diff --git a/bsd/net/if_ipsec.h b/bsd/net/if_ipsec.h index e665f5b21..31195e7e4 100644 --- a/bsd/net/if_ipsec.h +++ b/bsd/net/if_ipsec.h @@ -54,6 +54,11 @@ errno_t ipsec_inject_inbound_packet(ifnet_t interface, mbuf_t packet); void ipsec_set_pkthdr_for_interface(ifnet_t interface, mbuf_t packet, int family); +void ipsec_set_ipoa_for_interface(ifnet_t interface, struct ip_out_args *ipoa); + +struct ip6_out_args; +void ipsec_set_ip6oa_for_interface(ifnet_t interface, struct ip6_out_args *ip6oa); + #endif /* diff --git a/bsd/net/if_llreach.h b/bsd/net/if_llreach.h index 4b1f5ff3a..c27ff9ded 100644 --- a/bsd/net/if_llreach.h +++ b/bsd/net/if_llreach.h @@ -66,15 +66,6 @@ struct if_llreach_info {
#include #endif /* INET6 */ -/* - * Link-layer reachability is based off node constants in RFC4861. - */ -#if INET6 -#define LL_BASE_REACHABLE REACHABLE_TIME -#else -#define LL_BASE_REACHABLE 30000 /* msec */ -#endif /* !INET6 */ - /* * Per-interface link-layer reachability. (Currently only for ARP/NDP/Ethernet.) */ diff --git a/bsd/net/if_loop.c b/bsd/net/if_loop.c index a22c68cf0..3baa27434 100644 --- a/bsd/net/if_loop.c +++ b/bsd/net/if_loop.c @@ -254,6 +254,36 @@ lo_del_proto(struct ifnet *ifp, protocol_family_t protocol) return (0); } +static void +lo_tx_compl(struct ifnet *ifp, struct mbuf *m) +{ + errno_t error; + + if ((ifp->if_xflags & IFXF_TIMESTAMP_ENABLED) != 0) { + boolean_t requested; + + error = mbuf_get_timestamp_requested(m, &requested); + if (requested) { + struct timespec now; + u_int64_t ts; + + nanouptime(&now); + net_timernsec(&now, &ts); + + error = mbuf_set_timestamp(m, ts, TRUE); + if (error != 0) + printf("%s: mbuf_set_timestamp() failed %d\n", + __func__, error); + } + } + error = mbuf_set_status(m, KERN_SUCCESS); + if (error != 0) + printf("%s: mbuf_set_status() failed %d\n", + __func__, error); + + ifnet_tx_compl(ifp, m); +} + /* * Output callback. 
* @@ -296,6 +326,7 @@ lo_output(struct ifnet *ifp, struct mbuf *m_list) if (m->m_nextpkt == NULL) { m_tail = m; } + lo_tx_compl(ifp, m); } s.packets_in = cnt; @@ -404,6 +435,7 @@ lo_start(struct ifnet *ifp) if (cnt >= if_bw_measure_size) ifnet_transmit_burst_end(ifp, m_tail); } + lo_tx_compl(ifp, m); /* stats are required for extended variant */ s.packets_in = cnt; @@ -457,6 +489,21 @@ static errno_t lo_input(struct ifnet *ifp, protocol_family_t protocol_family, struct mbuf *m) { #pragma unused(ifp, protocol_family) + + if ((ifp->if_xflags & IFXF_TIMESTAMP_ENABLED) != 0) { + errno_t error; + struct timespec now; + u_int64_t ts; + + nanouptime(&now); + net_timernsec(&now, &ts); + + error = mbuf_set_timestamp(m, ts, TRUE); + if (error != 0) + printf("%s: mbuf_set_timestamp() failed %d\n", + __func__, error); + } + if (proto_input(protocol_family, m) != 0) m_freem(m); return (0); @@ -536,6 +583,8 @@ lo_ioctl(struct ifnet *ifp, u_long cmd, void *data) } case SIOCSIFFLAGS: /* struct ifreq */ + case SIOCSIFTIMESTAMPENABLE: + case SIOCSIFTIMESTAMPDISABLE: break; default: @@ -659,7 +708,8 @@ loopattach(void) ifnet_set_offload(lo_ifp, IFNET_CSUM_IP | IFNET_CSUM_TCP | IFNET_CSUM_UDP | IFNET_CSUM_TCPIPV6 | IFNET_CSUM_UDPIPV6 | IFNET_IPV6_FRAGMENT | - IFNET_CSUM_FRAGMENT | IFNET_IP_FRAGMENT | IFNET_MULTIPAGES); + IFNET_CSUM_FRAGMENT | IFNET_IP_FRAGMENT | IFNET_MULTIPAGES | + IFNET_TX_STATUS | IFNET_SW_TIMESTAMP); ifnet_set_hdrlen(lo_ifp, sizeof (struct loopback_header)); ifnet_set_eflags(lo_ifp, IFEF_SENDLIST, IFEF_SENDLIST); @@ -715,6 +765,7 @@ sysctl_sched_model SYSCTL_HANDLER_ARGS switch (i) { case IFNET_SCHED_MODEL_NORMAL: case IFNET_SCHED_MODEL_DRIVER_MANAGED: + case IFNET_SCHED_MODEL_FQ_CODEL: break; default: diff --git a/bsd/net/if_stf.c b/bsd/net/if_stf.c index 72abbef09..62f662115 100644 --- a/bsd/net/if_stf.c +++ b/bsd/net/if_stf.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2013 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. 
All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -524,7 +524,8 @@ stf_pre_output( struct in6_ifaddr *ia6; struct sockaddr_in *dst4; struct ip_out_args ipoa = - { IFSCOPE_NONE, { 0 }, IPOAF_SELECT_SRCIF, 0 }; + { IFSCOPE_NONE, { 0 }, IPOAF_SELECT_SRCIF, 0, + SO_TC_UNSPEC, _NET_SERVICE_TYPE_UNSPEC }; errno_t result = 0; sc = ifnet_softc(ifp); diff --git a/bsd/net/if_utun.c b/bsd/net/if_utun.c index 4261be968..35868bbd2 100644 --- a/bsd/net/if_utun.c +++ b/bsd/net/if_utun.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008-2014 Apple Inc. All rights reserved. + * Copyright (c) 2008-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -45,8 +45,6 @@ This kernel control will register an interface for every client that connects. #include #include #include -#include -#include #include #include #include @@ -55,6 +53,7 @@ This kernel control will register an interface for every client that connects. #include + /* Kernel Control functions */ static errno_t utun_ctl_connect(kern_ctl_ref kctlref, struct sockaddr_ctl *sac, void **unitinfo); @@ -91,34 +90,15 @@ static errno_t utun_proto_input(ifnet_t interface, protocol_family_t protocol, static errno_t utun_proto_pre_output(ifnet_t interface, protocol_family_t protocol, mbuf_t *packet, const struct sockaddr *dest, void *route, char *frame_type, char *link_layer_dest); -__private_extern__ errno_t utun_pkt_input (struct utun_pcb *pcb, mbuf_t m); +static errno_t utun_pkt_input (struct utun_pcb *pcb, mbuf_t m); + + +#define UTUN_DEFAULT_MTU 1500 +#define UTUN_HEADER_SIZE(_pcb) (sizeof(u_int32_t) + (((_pcb)->utun_flags & UTUN_FLAGS_ENABLE_PROC_UUID) ? 
sizeof(uuid_t) : 0)) static kern_ctl_ref utun_kctlref; static u_int32_t utun_family; -static OSMallocTag utun_malloc_tag; -static SInt32 utun_ifcount = 0; - -/* Prepend length */ -void* -utun_alloc(size_t size) -{ - size_t *mem = OSMalloc(size + sizeof(size_t), utun_malloc_tag); - - if (mem) { - *mem = size + sizeof(size_t); - mem++; - } - - return (void*)mem; -} -void -utun_free(void *ptr) -{ - size_t *size = ptr; - size--; - OSFree(size, *size, utun_malloc_tag); -} errno_t utun_register_control(void) @@ -126,9 +106,6 @@ utun_register_control(void) struct kern_ctl_reg kern_ctl; errno_t result = 0; - /* Create a tag to allocate memory */ - utun_malloc_tag = OSMalloc_Tagalloc(UTUN_CONTROL_NAME, OSMT_DEFAULT); - /* Find a unique value for our interface family */ result = mbuf_tag_id_find(UTUN_CONTROL_NAME, &utun_family); if (result != 0) { @@ -149,8 +126,6 @@ utun_register_control(void) kern_ctl.ctl_getopt = utun_ctl_getopt; kern_ctl.ctl_rcvd = utun_ctl_rcvd; - utun_ctl_init_crypto(); - result = ctl_register(&kern_ctl, &utun_kctlref); if (result != 0) { printf("utun_register_control - ctl_register failed: %d\n", result); @@ -175,6 +150,7 @@ utun_register_control(void) utun_family, result); return result; } + return 0; } @@ -193,12 +169,8 @@ utun_ctl_connect( struct ifnet_stats_param stats; /* kernel control allocates, interface frees */ - pcb = utun_alloc(sizeof(*pcb)); - if (pcb == NULL) - return ENOMEM; - - /* Setup the protocol control block */ - bzero(pcb, sizeof(*pcb)); + MALLOC(pcb, struct utun_pcb *, sizeof(*pcb), M_DEVBUF, M_WAITOK | M_ZERO); + *unitinfo = pcb; pcb->utun_ctlref = kctlref; pcb->utun_unit = sac->sc_unit; @@ -214,6 +186,7 @@ utun_ctl_connect( utun_init.start = utun_start; utun_init.unit = pcb->utun_unit - 1; utun_init.family = utun_family; + utun_init.subfamily = IFNET_SUBFAMILY_UTUN; utun_init.type = IFT_OTHER; utun_init.demux = utun_demux; utun_init.framer_extended = utun_framer; @@ -222,17 +195,21 @@ utun_ctl_connect( utun_init.softc = pcb; 
utun_init.ioctl = utun_ioctl; utun_init.detach = utun_detached; - + + /* + * Upon success, this holds an ifnet reference which we will + * release via ifnet_release() at final detach time. + */ result = ifnet_allocate_extended(&utun_init, &pcb->utun_ifp); if (result != 0) { printf("utun_ctl_connect - ifnet_allocate failed: %d\n", result); - utun_free(pcb); + *unitinfo = NULL; + FREE(pcb, M_DEVBUF); return result; } - OSIncrementAtomic(&utun_ifcount); /* Set flags and additional information. */ - ifnet_set_mtu(pcb->utun_ifp, 1500); + ifnet_set_mtu(pcb->utun_ifp, UTUN_DEFAULT_MTU); ifnet_set_flags(pcb->utun_ifp, IFF_UP | IFF_MULTICAST | IFF_POINTOPOINT, 0xffff); /* The interface must generate its own IPv6 LinkLocal address, @@ -248,18 +225,16 @@ utun_ctl_connect( result = ifnet_attach(pcb->utun_ifp, NULL); if (result != 0) { printf("utun_ctl_connect - ifnet_allocate failed: %d\n", result); + /* Release reference now since attach failed */ ifnet_release(pcb->utun_ifp); - utun_free(pcb); - } - - /* Attach to bpf */ - if (result == 0) - bpfattach(pcb->utun_ifp, DLT_NULL, 4); - - /* The interfaces resoures allocated, mark it as running */ - if (result == 0) + *unitinfo = NULL; + FREE(pcb, M_DEVBUF); + } else { + /* Attach to bpf */ + bpfattach(pcb->utun_ifp, DLT_NULL, UTUN_HEADER_SIZE(pcb)); + /* The interfaces resoures allocated, mark it as running */ ifnet_set_flags(pcb->utun_ifp, IFF_RUNNING, IFF_RUNNING); - + } return result; } @@ -422,14 +397,18 @@ utun_ctl_disconnect( void *unitinfo) { struct utun_pcb *pcb = unitinfo; - ifnet_t ifp = pcb->utun_ifp; + ifnet_t ifp = NULL; errno_t result = 0; - utun_cleanup_crypto(pcb); + if (pcb == NULL) + return EINVAL; + + ifp = pcb->utun_ifp; + VERIFY(ifp != NULL); pcb->utun_ctlref = NULL; pcb->utun_unit = 0; - + /* * We want to do everything in our power to ensure that the interface * really goes away when the socket is closed. 
We must remove IP/IPv6 @@ -438,17 +417,18 @@ utun_ctl_disconnect( */ utun_cleanup_family(ifp, AF_INET); utun_cleanup_family(ifp, AF_INET6); - + + /* + * Detach now; utun_detach() will be called asynchronously once + * the I/O reference count drops to 0. There we will invoke + * ifnet_release(). + */ if ((result = ifnet_detach(ifp)) != 0) { printf("utun_ctl_disconnect - ifnet_detach failed: %d\n", result); } - if ((result = ifnet_release(ifp)) != 0) { - printf("utun_ctl_disconnect - ifnet_release failed: %d\n", result); - } - return 0; -} +} static errno_t utun_ctl_send( @@ -462,10 +442,11 @@ utun_ctl_send( * The userland ABI requires the first four bytes have the protocol family * in network byte order: swap them */ - if (m_pktlen(m) >= 4) + if (m_pktlen(m) >= (int32_t)UTUN_HEADER_SIZE((struct utun_pcb *)unitinfo)) { *(protocol_family_t *)mbuf_data(m) = ntohl(*(protocol_family_t *)mbuf_data(m)); - else + } else { printf("%s - unexpected short mbuf pkt len %d\n", __func__, m_pktlen(m) ); + } return utun_pkt_input((struct utun_pcb *)unitinfo, m); } @@ -481,7 +462,6 @@ utun_ctl_setopt( { struct utun_pcb *pcb = unitinfo; errno_t result = 0; - /* check for privileges for privileged options */ switch (opt) { case UTUN_OPT_FLAGS: @@ -495,42 +475,18 @@ utun_ctl_setopt( switch (opt) { case UTUN_OPT_FLAGS: - if (len != sizeof(u_int32_t)) + if (len != sizeof(u_int32_t)) { result = EMSGSIZE; - else + } else { + u_int32_t old_flags = pcb->utun_flags; pcb->utun_flags = *(u_int32_t *)data; - break; - case UTUN_OPT_ENABLE_CRYPTO: - result = utun_ctl_enable_crypto(kctlref, unit, unitinfo, opt, data, len); - break; - - case UTUN_OPT_CONFIG_CRYPTO_KEYS: - result = utun_ctl_config_crypto_keys(kctlref, unit, unitinfo, opt, data, len); - break; - - case UTUN_OPT_UNCONFIG_CRYPTO_KEYS: - result = utun_ctl_unconfig_crypto_keys(kctlref, unit, unitinfo, opt, data, len); - break; - - case UTUN_OPT_DISABLE_CRYPTO: - result = utun_ctl_disable_crypto(kctlref, unit, unitinfo, opt, data, len); - 
break; - - case UTUN_OPT_STOP_CRYPTO_DATA_TRAFFIC: - result = utun_ctl_stop_crypto_data_traffic(kctlref, unit, unitinfo, opt, data, len); - break; - - case UTUN_OPT_START_CRYPTO_DATA_TRAFFIC: - result = utun_ctl_start_crypto_data_traffic(kctlref, unit, unitinfo, opt, data, len); - break; - - case UTUN_OPT_CONFIG_CRYPTO_FRAMER: - result = utun_ctl_config_crypto_framer(kctlref, unit, unitinfo, opt, data, len); - break; - - case UTUN_OPT_UNCONFIG_CRYPTO_FRAMER: - result = utun_ctl_unconfig_crypto_framer(kctlref, unit, unitinfo, opt, data, len); + if (((old_flags ^ pcb->utun_flags) & UTUN_FLAGS_ENABLE_PROC_UUID)) { + // If UTUN_FLAGS_ENABLE_PROC_UUID flag changed, update bpf + bpfdetach(pcb->utun_ifp); + bpfattach(pcb->utun_ifp, DLT_NULL, UTUN_HEADER_SIZE(pcb)); + } + } break; case UTUN_OPT_EXT_IFDATA_STATS: @@ -635,14 +591,13 @@ utun_ctl_getopt( *len = snprintf(data, *len, "%s%d", ifnet_name(pcb->utun_ifp), ifnet_unit(pcb->utun_ifp)) + 1; break; - case UTUN_OPT_GENERATE_CRYPTO_KEYS_IDX: - result = utun_ctl_generate_crypto_keys_idx(kctlref, unit, unitinfo, opt, data, len); - break; case UTUN_OPT_MAX_PENDING_PACKETS: { *len = sizeof(u_int32_t); *((u_int32_t *)data) = pcb->utun_max_pending_packets; break; } + + default: result = ENOPROTOOPT; break; @@ -687,7 +642,11 @@ static void utun_start(ifnet_t interface) { mbuf_t data; - struct utun_pcb*pcb = ifnet_softc(interface); + struct utun_pcb *pcb = ifnet_softc(interface); + + VERIFY(pcb != NULL); + + for (;;) { bool can_accept_packets = true; ifnet_lock_shared(pcb->utun_ifp); @@ -727,14 +686,15 @@ utun_start(ifnet_t interface) } static errno_t -utun_output( - ifnet_t interface, - mbuf_t data) +utun_output(ifnet_t interface, + mbuf_t data) { struct utun_pcb *pcb = ifnet_softc(interface); errno_t result; + + VERIFY(interface == pcb->utun_ifp); - if (m_pktlen(data) >= 4) { + if (m_pktlen(data) >= (int32_t)UTUN_HEADER_SIZE(pcb)) { bpf_tap_out(pcb->utun_ifp, DLT_NULL, data, 0, 0); } @@ -748,18 +708,12 @@ utun_output( if 
(pcb->utun_ctlref) { int length; - // only pass packets to utun-crypto if crypto is enabled and 'suspend data traffic' is not. - if ((pcb->utun_flags & (UTUN_FLAGS_CRYPTO | UTUN_FLAGS_CRYPTO_STOP_DATA_TRAFFIC)) == UTUN_FLAGS_CRYPTO) { - if (utun_pkt_crypto_output(pcb, &data) == 0) { - return 0; - } - } - /* * The ABI requires the protocol in network byte order */ - if (m_pktlen(data) >= 4) + if (m_pktlen(data) >= (int32_t)UTUN_HEADER_SIZE(pcb)) { *(u_int32_t *)mbuf_data(data) = htonl(*(u_int32_t *)mbuf_data(data)); + } length = mbuf_pkthdr_len(data); result = ctl_enqueuembuf(pcb->utun_ctlref, pcb->utun_unit, data, CTL_DATA_EOR); @@ -809,7 +763,11 @@ utun_framer( u_int32_t *prepend_len, u_int32_t *postpend_len) { - if (mbuf_prepend(packet, sizeof(protocol_family_t), MBUF_DONTWAIT) != 0) { + struct utun_pcb *pcb = ifnet_softc(interface); + VERIFY(interface == pcb->utun_ifp); + + u_int32_t header_length = UTUN_HEADER_SIZE(pcb); + if (mbuf_prepend(packet, header_length, MBUF_DONTWAIT) != 0) { printf("utun_framer - ifnet_output prepend failed\n"); ifnet_stat_increment_out(interface, 0, 0, 1); @@ -818,13 +776,14 @@ utun_framer( return EJUSTRETURN; } if (prepend_len != NULL) - *prepend_len = sizeof(protocol_family_t); + *prepend_len = header_length; if (postpend_len != NULL) *postpend_len = 0; // place protocol number at the beginning of the mbuf *(protocol_family_t *)mbuf_data(*packet) = *(protocol_family_t *)(uintptr_t)(size_t)frame_type; - + + return 0; } @@ -885,26 +844,28 @@ utun_detached( { struct utun_pcb *pcb = ifnet_softc(interface); - utun_free(pcb); - - OSDecrementAtomic(&utun_ifcount); + FREE(pcb, M_DEVBUF); + /* Release reference acquired via ifnet_allocate_extended() */ + (void) ifnet_release(interface); } /* Protocol Handlers */ static errno_t utun_proto_input( - __unused ifnet_t interface, + ifnet_t interface, protocol_family_t protocol, mbuf_t m, __unused char *frame_header) { // remove protocol family first - mbuf_adj(m, sizeof(u_int32_t)); + struct 
utun_pcb *pcb = ifnet_softc(interface); + mbuf_adj(m, UTUN_HEADER_SIZE(pcb)); - if (proto_input(protocol, m) != 0) + if (proto_input(protocol, m) != 0) { m_freem(m); + } return 0; } @@ -916,12 +877,11 @@ utun_proto_pre_output( __unused mbuf_t *packet, __unused const struct sockaddr *dest, __unused void *route, - __unused char *frame_type, + char *frame_type, __unused char *link_layer_dest) { - *(protocol_family_t *)(void *)frame_type = protocol; - return 0; + return 0; } static errno_t @@ -945,7 +905,7 @@ utun_attach_proto( return result; } -errno_t +static errno_t utun_pkt_input (struct utun_pcb *pcb, mbuf_t m) { errno_t result; @@ -953,7 +913,7 @@ utun_pkt_input (struct utun_pcb *pcb, mbuf_t m) mbuf_pkthdr_setrcvif(m, pcb->utun_ifp); - if (m_pktlen(m) >= 4) { + if (m_pktlen(m) >= (int32_t)UTUN_HEADER_SIZE(pcb)) { protocol = *(u_int32_t *)mbuf_data(m); bpf_tap_in(pcb->utun_ifp, DLT_NULL, m, 0, 0); @@ -964,15 +924,6 @@ utun_pkt_input (struct utun_pcb *pcb, mbuf_t m) return 0; } - // quick exit for keepalive packets - if (protocol == AF_UTUN && pcb->utun_flags & UTUN_FLAGS_CRYPTO) { - if (utun_pkt_crypto_output(pcb, &m) == 0) { - return 0; - } - printf("%s: utun_pkt_crypto_output failed, flags %x\n", __FUNCTION__, pcb->utun_flags); - return EINVAL; - } - if (!pcb->utun_ext_ifdata_stats) { struct ifnet_stat_increment_param incs; @@ -992,3 +943,34 @@ utun_pkt_input (struct utun_pcb *pcb, mbuf_t m) return 0; } + + + +/* + * These are place holders until coreTLS kext stops caling them + */ +errno_t utun_ctl_register_dtls (void *reg); +int utun_pkt_dtls_input(struct utun_pcb *pcb, mbuf_t *pkt, protocol_family_t family); +void utun_ctl_disable_crypto_dtls(struct utun_pcb *pcb); + +errno_t +utun_ctl_register_dtls (void *reg) +{ +#pragma unused(reg) + return 0; +} + +int +utun_pkt_dtls_input(struct utun_pcb *pcb, mbuf_t *pkt, protocol_family_t family) +{ +#pragma unused(pcb) +#pragma unused(pkt) +#pragma unused(family) + return 0; +} + +void 
+utun_ctl_disable_crypto_dtls(struct utun_pcb *pcb) +{ +#pragma unused(pcb) +} diff --git a/bsd/net/if_utun.h b/bsd/net/if_utun.h index 2ffd72ee1..b75476582 100644 --- a/bsd/net/if_utun.h +++ b/bsd/net/if_utun.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008-2013 Apple Inc. All rights reserved. + * Copyright (c) 2008-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -30,10 +30,6 @@ #ifndef _NET_IF_UTUN_H_ #define _NET_IF_UTUN_H_ -#ifdef PRIVATE -#include -#endif /* PRIVATE */ - #ifdef KERNEL_PRIVATE #include @@ -46,7 +42,10 @@ struct utun_pcb { u_int32_t utun_flags; int utun_ext_ifdata_stats; u_int32_t utun_max_pending_packets; - utun_crypto_ctx_t utun_crypto_ctx[UTUN_CRYPTO_CTX_NUM_DIRS]; + int utun_channel_enabled; + uuid_t utun_channel_uuid; + void * utun_channel_rxring; + u_int32_t utun_channel_max_pktlen; }; void* utun_alloc(size_t size); @@ -67,34 +66,19 @@ errno_t utun_register_control(void); #define UTUN_OPT_IFNAME 2 #define UTUN_OPT_EXT_IFDATA_STATS 3 /* get|set (type int) */ #define UTUN_OPT_INC_IFDATA_STATS_IN 4 /* set to increment stat counters (type struct utun_stats_param) */ -#define UTUN_OPT_INC_IFDATA_STATS_OUT 5 /* set to increment stat counters (type struct utun_stats_param) */ - -#ifdef PRIVATE -#define UTUN_OPT_ENABLE_CRYPTO 6 -#define UTUN_OPT_CONFIG_CRYPTO_KEYS 7 -#define UTUN_OPT_UNCONFIG_CRYPTO_KEYS 8 -#define UTUN_OPT_GENERATE_CRYPTO_KEYS_IDX 9 -#define UTUN_OPT_DISABLE_CRYPTO 10 -#define UTUN_OPT_STOP_CRYPTO_DATA_TRAFFIC 11 -#define UTUN_OPT_START_CRYPTO_DATA_TRAFFIC 12 -#define UTUN_OPT_CONFIG_CRYPTO_FRAMER 13 -#define UTUN_OPT_UNCONFIG_CRYPTO_FRAMER 14 -#endif /* PRIVATE */ +#define UTUN_OPT_INC_IFDATA_STATS_OUT 5 /* set to increment stat counters (type struct utun_stats_param) */ #define UTUN_OPT_SET_DELEGATE_INTERFACE 15 /* set the delegate interface (char[]) */ #define UTUN_OPT_MAX_PENDING_PACKETS 16 /* the number of packets that can be waiting to be read from the control socket at a time */ - +#define 
UTUN_OPT_ENABLE_CHANNEL 17 +#define UTUN_OPT_GET_CHANNEL_UUID 18 /* * Flags for by UTUN_OPT_FLAGS */ #define UTUN_FLAGS_NO_OUTPUT 0x0001 #define UTUN_FLAGS_NO_INPUT 0x0002 - -#ifdef PRIVATE -#define UTUN_FLAGS_CRYPTO 0x0004 -#define UTUN_FLAGS_CRYPTO_STOP_DATA_TRAFFIC 0x0008 -#endif /* PRIVATE */ +#define UTUN_FLAGS_ENABLE_PROC_UUID 0x0004 /* * utun stats parameter structure diff --git a/bsd/net/if_utun_crypto.c b/bsd/net/if_utun_crypto.c deleted file mode 100644 index 553d4874c..000000000 --- a/bsd/net/if_utun_crypto.c +++ /dev/null @@ -1,722 +0,0 @@ -/* - * Copyright (c) 2011 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - - - -#include -#include -#include -#include -#include -#include -#include -#include - -void -utun_ctl_init_crypto (void) -{ - utun_ctl_init_crypto_dtls(); -} - -void -utun_cleanup_crypto (struct utun_pcb *pcb) -{ -#if IPSEC - utun_cleanup_all_crypto_ipsec(pcb); -#endif - utun_cleanup_all_crypto_dtls(pcb); - pcb->utun_flags &= ~UTUN_FLAGS_CRYPTO; -} - -errno_t -utun_ctl_enable_crypto (__unused kern_ctl_ref kctlref, - __unused u_int32_t unit, - __unused void *unitinfo, - __unused int opt, - void *data, - size_t len) -{ - struct utun_pcb *pcb = unitinfo; - - /* - * - verify the crypto context args passed from user-land. - * - check the size of the argument buffer. - * - check the direction (IN or OUT) - * - check the type (IPSec or DTLS) - * - ensure that the crypto context is *not* already valid (don't recreate already valid context). - * - we have only one context per direction and type. - * - any error should be equivalent to noop. - */ - if (len < UTUN_CRYPTO_ARGS_HDR_SIZE) { - return EMSGSIZE; - } else { - int idx; - utun_crypto_args_t *crypto_args = (__typeof__(crypto_args))data; - utun_crypto_ctx_t *crypto_ctx; - - if (crypto_args->ver == 0 || crypto_args->ver >= UTUN_CRYPTO_ARGS_VER_MAX) { - printf("%s: ver check failed %d\n", __FUNCTION__, crypto_args->ver); - return EINVAL; - } - if (crypto_args->type == 0 || crypto_args->type >= UTUN_CRYPTO_TYPE_MAX) { - printf("%s: type check failed %d\n", __FUNCTION__, crypto_args->type); - return EINVAL; - } - if (len < UTUN_CRYPTO_ARGS_TOTAL_SIZE(crypto_args)) { - printf("%s: vlen check failed (%d,%d)\n", __FUNCTION__, - (int)len, (int)UTUN_CRYPTO_ARGS_TOTAL_SIZE(crypto_args)); - return EINVAL; - } - if (crypto_args->args_ulen != sizeof(crypto_args->u)) { - printf("%s: compatibility mode\n", __FUNCTION__); - } - -#if IPSEC - if (crypto_args->type == UTUN_CRYPTO_TYPE_IPSEC) { - utun_ctl_enable_crypto_ipsec(pcb, crypto_args); - } else -#endif - if (crypto_args->type == 
UTUN_CRYPTO_TYPE_DTLS) { - utun_ctl_enable_crypto_dtls(pcb, crypto_args); - } else { - // unsupported - return EPROTONOSUPPORT; - } - for (idx = 0; idx < UTUN_CRYPTO_DIR_TO_IDX(UTUN_CRYPTO_DIR_MAX); idx++) { - crypto_ctx = &pcb->utun_crypto_ctx[idx]; - if (crypto_ctx->valid) { - return EBADF; - } - - crypto_ctx->type = crypto_args->type; - LIST_INIT(&crypto_ctx->keys_listhead); - LIST_INIT(&crypto_ctx->framer_listheads[UTUN_CRYPTO_INNER_TYPE_TO_IDX(UTUN_CRYPTO_INNER_TYPE_IPv4)]); - LIST_INIT(&crypto_ctx->framer_listheads[UTUN_CRYPTO_INNER_TYPE_TO_IDX(UTUN_CRYPTO_INNER_TYPE_IPv6)]); - crypto_ctx->valid = 1; - printf("%s: initialized framer lists\n", __FUNCTION__); - } - // data traffic is stopped by default - pcb->utun_flags |= (UTUN_FLAGS_CRYPTO | UTUN_FLAGS_CRYPTO_STOP_DATA_TRAFFIC); - return 0; - } -} - -errno_t -utun_ctl_disable_crypto (__unused kern_ctl_ref kctlref, - __unused u_int32_t unit, - __unused void *unitinfo, - __unused int opt, - void *data, - size_t len) -{ - struct utun_pcb *pcb = unitinfo; - - /* - * - verify the crypto context args passed from user-land. - * - check the size of the argument buffer. - * - check the direction (IN or OUT) - * - check the type (IPSec or DTLS) - * - ensure that the crypto context *is* already valid (don't release invalid context). - * - we have only one context per direction and type. - * - ensure that the crypto context has no crypto material. - * - any error should be equivalent to noop. 
- */ - if (len < UTUN_CRYPTO_ARGS_HDR_SIZE) { - return EMSGSIZE; - } else { - utun_crypto_args_t *crypto_args = (__typeof__(crypto_args))data; - - if (crypto_args->ver == 0 || crypto_args->ver >= UTUN_CRYPTO_ARGS_VER_MAX) { - printf("%s: ver check failed %d\n", __FUNCTION__, crypto_args->ver); - return EINVAL; - } - if (crypto_args->type == 0 || crypto_args->type >= UTUN_CRYPTO_TYPE_MAX) { - printf("%s: type check failed %d\n", __FUNCTION__, crypto_args->type); - return EINVAL; - } - if (len < UTUN_CRYPTO_ARGS_TOTAL_SIZE(crypto_args)) { - printf("%s: vlen check failed (%d,%d)\n", __FUNCTION__, - (int)len, (int)UTUN_CRYPTO_ARGS_TOTAL_SIZE(crypto_args)); - return EINVAL; - } - if (crypto_args->args_ulen != sizeof(crypto_args->u)) { - printf("%s: compatibility mode\n", __FUNCTION__); - } - -#if IPSEC - if (crypto_args->type == UTUN_CRYPTO_TYPE_IPSEC) { - utun_ctl_disable_crypto_ipsec(pcb); - } else -#endif - if (crypto_args->type == UTUN_CRYPTO_TYPE_DTLS) { - utun_ctl_disable_crypto_dtls(pcb); - } else { - // unsupported - return EPROTONOSUPPORT; - } - } - pcb->utun_flags &= ~(UTUN_FLAGS_CRYPTO | UTUN_FLAGS_CRYPTO_STOP_DATA_TRAFFIC); - return 0; -} - -errno_t -utun_ctl_config_crypto_keys (__unused kern_ctl_ref kctlref, - __unused u_int32_t unit, - __unused void *unitinfo, - __unused int opt, - void *data, - size_t len) -{ - struct utun_pcb *pcb = unitinfo; - - /* - * - verify the crypto material args passed from user-land. - * - check the size of the argument buffer. - * - check the direction (IN or OUT) - * - check the type (IPSec only) - * - crypto material direction and type must match the associated crypto context's. - * - we can have a list of crypto materials per context. - * - ensure that the crypto context is already valid (don't add crypto material to invalid context). - * - any error should be equivalent to noop. 
- */ - if (len < UTUN_CRYPTO_KEYS_ARGS_HDR_SIZE) { - return EMSGSIZE; - } else { - int idx; - utun_crypto_keys_args_t *crypto_keys_args = (__typeof__(crypto_keys_args))data; - utun_crypto_ctx_t *crypto_ctx; - utun_crypto_keys_t *crypto_keys = NULL; - - if (crypto_keys_args->ver == 0 || crypto_keys_args->ver >= UTUN_CRYPTO_KEYS_ARGS_VER_MAX) { - printf("%s: ver check failed %d\n", __FUNCTION__, crypto_keys_args->ver); - return EINVAL; - } - if (crypto_keys_args->dir == 0 || crypto_keys_args->dir >= UTUN_CRYPTO_DIR_MAX) { - printf("%s: dir check failed %d\n", __FUNCTION__, crypto_keys_args->dir); - return EINVAL; - } - if (crypto_keys_args->type == 0 || crypto_keys_args->type >= UTUN_CRYPTO_TYPE_MAX) { - printf("%s: type check failed %d\n", __FUNCTION__, crypto_keys_args->type); - return EINVAL; - } - if (len < UTUN_CRYPTO_KEYS_ARGS_TOTAL_SIZE(crypto_keys_args)) { - printf("%s: vlen check failed (%d,%d)\n", __FUNCTION__, - (int)len, (int)UTUN_CRYPTO_KEYS_ARGS_TOTAL_SIZE(crypto_keys_args)); - return EINVAL; - } - idx = UTUN_CRYPTO_DIR_TO_IDX(crypto_keys_args->dir); - crypto_ctx = &pcb->utun_crypto_ctx[idx]; - if (!crypto_ctx->valid) { - return EBADF; - } - if (crypto_keys_args->type != crypto_ctx->type) { - // can't add keymat to context with different crypto type - return ENOENT; - } - crypto_keys = utun_alloc(sizeof(*crypto_keys)); - if (!crypto_keys) { - return ENOBUFS; - } - bzero(crypto_keys, sizeof(*crypto_keys)); - if (crypto_keys_args->args_ulen != sizeof(crypto_keys_args->u)) { - printf("%s: compatibility mode\n", __FUNCTION__); - } - - // branch-off for ipsec vs. 
dtls -#if IPSEC - if (crypto_keys_args->type == UTUN_CRYPTO_TYPE_IPSEC) { - errno_t err; - if ((err = utun_ctl_config_crypto_keys_ipsec(pcb, crypto_keys_args, crypto_keys))) { - utun_free(crypto_keys); - return err; - } - } else -#endif - { - // unsupported - utun_free(crypto_keys); - return EPROTONOSUPPORT; - } - crypto_keys->type = crypto_keys_args->type; - LIST_INSERT_HEAD(&crypto_ctx->keys_listhead, crypto_keys, chain); - crypto_keys->valid = 1; - } - - return 0; -} - -errno_t -utun_ctl_unconfig_crypto_keys (__unused kern_ctl_ref kctlref, - __unused u_int32_t unit, - __unused void *unitinfo, - __unused int opt, - void *data, - size_t len) -{ - struct utun_pcb *pcb = unitinfo; - - /* - * - verify the crypto material args passed from user-land. - * - check the size of the argument buffer. - * - check the direction (IN or OUT) - * - check the type (IPSec only) - * - crypto material direction and type must match the associated crypto context's. - * - we can have a list of crypto materials per context. - * - ensure that the crypto context is already valid (don't add crypto material to invalid context). - * - any error should be equivalent to noop. 
- */ - if (len < UTUN_CRYPTO_KEYS_ARGS_HDR_SIZE) { - return EMSGSIZE; - } else { - int idx; - utun_crypto_keys_args_t *crypto_keys_args = (__typeof__(crypto_keys_args))data; - utun_crypto_ctx_t *crypto_ctx; - utun_crypto_keys_t *cur_crypto_keys, *nxt_crypto_keys; - - if (crypto_keys_args->ver == 0 || crypto_keys_args->ver >= UTUN_CRYPTO_KEYS_ARGS_VER_MAX) { - printf("%s: ver check failed %d\n", __FUNCTION__, crypto_keys_args->ver); - return EINVAL; - } - if (crypto_keys_args->dir == 0 || crypto_keys_args->dir >= UTUN_CRYPTO_DIR_MAX) { - printf("%s: dir check failed %d\n", __FUNCTION__, crypto_keys_args->dir); - return EINVAL; - } - if (crypto_keys_args->type == 0 || crypto_keys_args->type >= UTUN_CRYPTO_TYPE_MAX) { - printf("%s: type check failed %d\n", __FUNCTION__, crypto_keys_args->type); - return EINVAL; - } - if (len < UTUN_CRYPTO_KEYS_ARGS_TOTAL_SIZE(crypto_keys_args)) { - printf("%s: vlen check failed (%d,%d)\n", __FUNCTION__, - (int)len, (int)UTUN_CRYPTO_KEYS_ARGS_TOTAL_SIZE(crypto_keys_args)); - return EINVAL; - } - idx = UTUN_CRYPTO_DIR_TO_IDX(crypto_keys_args->dir); - crypto_ctx = &pcb->utun_crypto_ctx[idx]; - if (!crypto_ctx->valid) { - return EBADF; - } - if (crypto_keys_args->type != crypto_ctx->type) { - // can't add keymat to context with different crypto type - return ENOENT; - } - if (crypto_keys_args->args_ulen != sizeof(crypto_keys_args->u)) { - printf("%s: compatibility mode\n", __FUNCTION__); - } - - // traverse crypto materials looking for the right one - for (cur_crypto_keys = (__typeof__(cur_crypto_keys))LIST_FIRST(&crypto_ctx->keys_listhead); - cur_crypto_keys != NULL; - cur_crypto_keys = nxt_crypto_keys) { - nxt_crypto_keys = (__typeof__(nxt_crypto_keys))LIST_NEXT(cur_crypto_keys, chain); - // branch-off for ipsec vs. 
dtls -#if IPSEC - if (crypto_keys_args->type == UTUN_CRYPTO_TYPE_IPSEC) { - if (crypto_keys_args->u.ipsec_v1.spi == cur_crypto_keys->state.u.ipsec.spi) { - errno_t err; - if ((err = utun_ctl_unconfig_crypto_keys_ipsec(crypto_keys_args, cur_crypto_keys))) { - return err; - } - LIST_REMOVE(cur_crypto_keys, chain); - bzero(cur_crypto_keys, sizeof(*cur_crypto_keys)); - utun_free(cur_crypto_keys); - return 0; - } - } else -#endif - { - // unsupported - return EPROTONOSUPPORT; - } - } - // TODO: if there is no SA left, ensure utun can't decrypt/encrypt packets directly. it should rely on the vpnplugin for that. - } - - return 0; -} - -errno_t -utun_ctl_config_crypto_framer (__unused kern_ctl_ref kctlref, - __unused u_int32_t unit, - __unused void *unitinfo, - __unused int opt, - void *data, - size_t len) -{ - struct utun_pcb *pcb = unitinfo; - - /* - * - verify the crypto material args passed from user-land. - * - check the size of the argument buffer. - * - check the direction (IN or OUT) - * - check the type (DTLS only) - * - crypto material direction and type must match the associated crypto context's. - * - we can have a list of crypto materials per context. - * - ensure that the crypto context is already valid (don't add crypto material to invalid context). - * - any error should be equivalent to noop. 
- */ - if (len < UTUN_CRYPTO_FRAMER_ARGS_HDR_SIZE) { - return EMSGSIZE; - } else { - int idx; - utun_crypto_framer_args_t *framer_args = (__typeof__(framer_args))data; - utun_crypto_ctx_t *crypto_ctx; - - if (framer_args->ver == 0 || framer_args->ver >= UTUN_CRYPTO_FRAMER_ARGS_VER_MAX) { - printf("%s: ver check failed %d\n", __FUNCTION__, (int)framer_args->ver); - return EINVAL; - } - if (framer_args->dir == 0 || framer_args->dir >= UTUN_CRYPTO_DIR_MAX) { - printf("%s: dir check failed %d\n", __FUNCTION__, (int)framer_args->dir); - return EINVAL; - } - if (framer_args->type == 0 || framer_args->type >= UTUN_CRYPTO_TYPE_MAX) { - printf("%s: type check failed %d\n", __FUNCTION__, (int)framer_args->type); - return EINVAL; - } - if (len < UTUN_CRYPTO_FRAMER_ARGS_TOTAL_SIZE(framer_args)) { - printf("%s: vlen check failed (%d,%d)\n", __FUNCTION__, - (int)len, (int)UTUN_CRYPTO_FRAMER_ARGS_TOTAL_SIZE(framer_args)); - return EINVAL; - } - idx = UTUN_CRYPTO_DIR_TO_IDX(framer_args->dir); - crypto_ctx = &pcb->utun_crypto_ctx[idx]; - if (!crypto_ctx->valid) { - return EBADF; - } - if (framer_args->type != crypto_ctx->type) { - // can't add keymat to context with different crypto type - return ENOENT; - } - if (framer_args->args_ulen != sizeof(framer_args->u)) { - printf("%s: compatibility mode\n", __FUNCTION__); - // TODO: - } - - // branch-off for ipsec vs. dtls - if (framer_args->type == UTUN_CRYPTO_TYPE_DTLS) { - errno_t err; - if ((err = utun_ctl_config_crypto_dtls_framer(crypto_ctx, framer_args))) { - return err; - } - } else { - // unsupported - return EPROTONOSUPPORT; - } - } - - return 0; -} - -errno_t -utun_ctl_unconfig_crypto_framer (__unused kern_ctl_ref kctlref, - __unused u_int32_t unit, - __unused void *unitinfo, - __unused int opt, - void *data, - size_t len) -{ - struct utun_pcb *pcb = unitinfo; - - /* - * - verify the crypto material args passed from user-land. - * - check the size of the argument buffer. 
- * - check the direction (IN or OUT) - * - check the type (DTLS only) - * - crypto material direction and type must match the associated crypto context's. - * - we can have a list of crypto materials per context. - * - ensure that the crypto context is already valid (don't add crypto material to invalid context). - * - any error should be equivalent to noop. - */ - if (len < UTUN_CRYPTO_FRAMER_ARGS_HDR_SIZE) { - return EMSGSIZE; - } else { - int idx; - utun_crypto_framer_args_t *framer_args = (__typeof__(framer_args))data; - utun_crypto_ctx_t *crypto_ctx; - - if (framer_args->ver == 0 || framer_args->ver >= UTUN_CRYPTO_FRAMER_ARGS_VER_MAX) { - printf("%s: ver check failed %d\n", __FUNCTION__, (int)framer_args->ver); - return EINVAL; - } - if (framer_args->dir == 0 || framer_args->dir >= UTUN_CRYPTO_DIR_MAX) { - printf("%s: dir check failed %d\n", __FUNCTION__, (int)framer_args->dir); - return EINVAL; - } - if (framer_args->type == 0 || framer_args->type >= UTUN_CRYPTO_TYPE_MAX) { - printf("%s: type check failed %d\n", __FUNCTION__, (int)framer_args->type); - return EINVAL; - } - if (len < UTUN_CRYPTO_FRAMER_ARGS_TOTAL_SIZE(framer_args)) { - printf("%s: vlen check failed (%d,%d)\n", __FUNCTION__, - (int)len, (int)UTUN_CRYPTO_FRAMER_ARGS_TOTAL_SIZE(framer_args)); - return EINVAL; - } - idx = UTUN_CRYPTO_DIR_TO_IDX(framer_args->dir); - crypto_ctx = &pcb->utun_crypto_ctx[idx]; - if (!crypto_ctx->valid) { - return EBADF; - } - if (framer_args->type != crypto_ctx->type) { - // can't add keymat to context with different crypto type - return ENOENT; - } - if (framer_args->args_ulen != sizeof(framer_args->u)) { - printf("%s: compatibility mode\n", __FUNCTION__); - } - - // branch-off for ipsec vs. 
dtls - if (framer_args->type == UTUN_CRYPTO_TYPE_DTLS) { - errno_t err; - if ((err = utun_ctl_unconfig_crypto_dtls_framer(crypto_ctx, framer_args))) { - return err; - } - } else { - // unsupported - return EPROTONOSUPPORT; - } - } - - return 0; -} - -errno_t -utun_ctl_generate_crypto_keys_idx (__unused kern_ctl_ref kctlref, - __unused u_int32_t unit, - __unused void *unitinfo, - __unused int opt, - void *data, - size_t *len) -{ - struct utun_pcb *pcb = unitinfo; - - /* - * - verify the crypto material index args passed from user-land. - * - check the size of the argument buffer. - * - check the direction (IN or OUT) - * - check the type (IPSec only) - * - crypto material direction and type must match the associated crypto context's. - * - we can have a list of crypto materials per context. - * - any error should be equivalent to noop. - */ - if (*len < UTUN_CRYPTO_KEYS_IDX_ARGS_HDR_SIZE) { - return EMSGSIZE; - } else { - int idx; - utun_crypto_keys_idx_args_t *crypto_keys_idx_args = (__typeof__(crypto_keys_idx_args))data; - utun_crypto_ctx_t *crypto_ctx; - - if (crypto_keys_idx_args->ver == 0 || crypto_keys_idx_args->ver >= UTUN_CRYPTO_KEYS_ARGS_VER_MAX) { - printf("%s: ver check failed %d\n", __FUNCTION__, crypto_keys_idx_args->ver); - return EINVAL; - } - if (crypto_keys_idx_args->dir == 0 || crypto_keys_idx_args->dir >= UTUN_CRYPTO_DIR_MAX) { - printf("%s: dir check failed %d\n", __FUNCTION__, crypto_keys_idx_args->dir); - return EINVAL; - } - if (crypto_keys_idx_args->type == 0 || crypto_keys_idx_args->type >= UTUN_CRYPTO_TYPE_MAX) { - printf("%s: type check failed %d\n", __FUNCTION__, crypto_keys_idx_args->type); - return EINVAL; - } - if (*len < UTUN_CRYPTO_KEYS_IDX_ARGS_TOTAL_SIZE(crypto_keys_idx_args)) { - printf("%s: vlen check failed (%d,%d)\n", __FUNCTION__, - (int)*len, (int)UTUN_CRYPTO_KEYS_IDX_ARGS_TOTAL_SIZE(crypto_keys_idx_args)); - return EINVAL; - } - idx = UTUN_CRYPTO_DIR_TO_IDX(crypto_keys_idx_args->dir); - crypto_ctx = 
&pcb->utun_crypto_ctx[idx]; - if (!crypto_ctx->valid) { - return EBADF; - } - if (crypto_keys_idx_args->type != crypto_ctx->type) { - // can't add keymat to context with different crypto type - return ENOENT; - } - if (crypto_keys_idx_args->args_ulen != sizeof(crypto_keys_idx_args->u)) { - printf("%s: compatibility mode\n", __FUNCTION__); - } - - // traverse crypto materials looking for the right one - // branch-off for ipsec vs. dtls -#if IPSEC - if (crypto_keys_idx_args->type == UTUN_CRYPTO_TYPE_IPSEC) { - errno_t err; - if ((err = utun_ctl_generate_crypto_keys_idx_ipsec(crypto_keys_idx_args))) { - return err; - } - } else -#endif - { - // unsupported - return EPROTONOSUPPORT; - } - } - - return 0; -} - -errno_t -utun_ctl_stop_crypto_data_traffic (__unused kern_ctl_ref kctlref, - __unused u_int32_t unit, - __unused void *unitinfo, - __unused int opt, - void *data, - size_t len) -{ - struct utun_pcb *pcb = unitinfo; - - /* - * - verify the crypto context args passed from user-land. - * - check the size of the argument buffer. - * - check the direction (IN or OUT) - * - check the type (IPSec or DTLS) - * - ensure that the crypto context *is* already valid (don't release invalid context). - * - we have only one context per direction and type. - * - ensure that the crypto context has no crypto material. - * - any error should be equivalent to noop. 
- */ - if (len < UTUN_CRYPTO_ARGS_HDR_SIZE) { - return EMSGSIZE; - } else { - utun_crypto_args_t *crypto_args = (__typeof__(crypto_args))data; - - if (crypto_args->ver == 0 || crypto_args->ver >= UTUN_CRYPTO_ARGS_VER_MAX) { - printf("%s: ver check failed %d\n", __FUNCTION__, crypto_args->ver); - return EINVAL; - } - if (crypto_args->type == 0 || crypto_args->type >= UTUN_CRYPTO_TYPE_MAX) { - printf("%s: type check failed %d\n", __FUNCTION__, crypto_args->type); - return EINVAL; - } - if (len < UTUN_CRYPTO_ARGS_TOTAL_SIZE(crypto_args)) { - printf("%s: vlen check failed (%d,%d)\n", __FUNCTION__, - (int)len, (int)UTUN_CRYPTO_ARGS_TOTAL_SIZE(crypto_args)); - return EINVAL; - } - if (crypto_args->args_ulen != sizeof(crypto_args->u)) { - printf("%s: compatibility mode\n", __FUNCTION__); - } - - if ((pcb->utun_flags & UTUN_FLAGS_CRYPTO) == 0) { - printf("%s: crypto is already disabled\n", __FUNCTION__); - return EINVAL; - } - - if (crypto_args->type == UTUN_CRYPTO_TYPE_IPSEC) { - // nothing - } else if (crypto_args->type == UTUN_CRYPTO_TYPE_DTLS) { - utun_ctl_stop_datatraffic_crypto_dtls(pcb); - } else { - // unsupported - return EPROTONOSUPPORT; - } - } - pcb->utun_flags |= UTUN_FLAGS_CRYPTO_STOP_DATA_TRAFFIC; - return 0; -} - -errno_t -utun_ctl_start_crypto_data_traffic (__unused kern_ctl_ref kctlref, - __unused u_int32_t unit, - __unused void *unitinfo, - __unused int opt, - void *data, - size_t len) -{ - struct utun_pcb *pcb = unitinfo; - - /* - * - verify the crypto context args passed from user-land. - * - check the size of the argument buffer. - * - check the direction (IN or OUT) - * - check the type (IPSec or DTLS) - * - ensure that the crypto context *is* already valid (don't release invalid context). - * - we have only one context per direction and type. - * - ensure that the crypto context has no crypto material. - * - any error should be equivalent to noop. 
- */ - if (len < UTUN_CRYPTO_ARGS_HDR_SIZE) { - return EMSGSIZE; - } else { - utun_crypto_args_t *crypto_args = (__typeof__(crypto_args))data; - - if (crypto_args->ver == 0 || crypto_args->ver >= UTUN_CRYPTO_ARGS_VER_MAX) { - printf("%s: ver check failed %d\n", __FUNCTION__, crypto_args->ver); - return EINVAL; - } - if (crypto_args->type == 0 || crypto_args->type >= UTUN_CRYPTO_TYPE_MAX) { - printf("%s: type check failed %d\n", __FUNCTION__, crypto_args->type); - return EINVAL; - } - if (len < UTUN_CRYPTO_ARGS_TOTAL_SIZE(crypto_args)) { - printf("%s: vlen check failed (%d,%d)\n", __FUNCTION__, - (int)len, (int)UTUN_CRYPTO_ARGS_TOTAL_SIZE(crypto_args)); - return EINVAL; - } - if (crypto_args->args_ulen != sizeof(crypto_args->u)) { - printf("%s: compatibility mode\n", __FUNCTION__); - } - - if ((pcb->utun_flags & UTUN_FLAGS_CRYPTO) == 0) { - printf("%s: crypto is already disabled\n", __FUNCTION__); - return EINVAL; - } - - if (crypto_args->type == UTUN_CRYPTO_TYPE_IPSEC) { - // nothing - } else if (crypto_args->type == UTUN_CRYPTO_TYPE_DTLS) { - utun_ctl_start_datatraffic_crypto_dtls(pcb); - } else { - // unsupported - return EPROTONOSUPPORT; - } - } - pcb->utun_flags &= ~UTUN_FLAGS_CRYPTO_STOP_DATA_TRAFFIC; - return 0; -} - -int -utun_pkt_crypto_output (struct utun_pcb *pcb, mbuf_t *m) -{ - int idx = UTUN_CRYPTO_DIR_TO_IDX(UTUN_CRYPTO_DIR_OUT); - if (!pcb->utun_crypto_ctx[idx].valid) { - printf("%s: context is invalid %d\n", __FUNCTION__, pcb->utun_crypto_ctx[idx].valid); - return -1; - } -#if IPSEC - if (pcb->utun_crypto_ctx[idx].type == UTUN_CRYPTO_TYPE_IPSEC) { - return(utun_pkt_ipsec_output(pcb, m)); - } else -#endif - if (pcb->utun_crypto_ctx[idx].type == UTUN_CRYPTO_TYPE_DTLS) { - return(utun_pkt_dtls_output(pcb, m)); - } else { - // unsupported - printf("%s: type is invalid %d\n", __FUNCTION__, pcb->utun_crypto_ctx[idx].type); - } - return -1; -} diff --git a/bsd/net/if_utun_crypto.h b/bsd/net/if_utun_crypto.h deleted file mode 100644 index 
14e92594d..000000000 --- a/bsd/net/if_utun_crypto.h +++ /dev/null @@ -1,529 +0,0 @@ -/* - * Copyright (c) 2011 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#ifndef _NET_IF_UTUN_CRYPTO_H_ -#define _NET_IF_UTUN_CRYPTO_H_ - -// constants used in configuring the crypto context -typedef enum utun_crypto_ver { - UTUN_CRYPTO_VER_1 = 1, - UTUN_CRYPTO_VER_MAX, -} utun_crypto_ver_t; - -#define UTUN_CRYPTO_KEYS_IPSEC_VER_1 UTUN_CRYPTO_VER_1 -#define UTUN_CRYPTO_IPSEC_VER_1 UTUN_CRYPTO_VER_1 -#define UTUN_CRYPTO_DTLS_VER_1 UTUN_CRYPTO_VER_1 - -#define UTUN_CRYPTO_ARGS_VER_MAX UTUN_CRYPTO_VER_MAX -#define UTUN_CRYPTO_KEYS_ARGS_VER_MAX UTUN_CRYPTO_VER_MAX -#define UTUN_CRYPTO_FRAMER_ARGS_VER_MAX UTUN_CRYPTO_VER_MAX - -typedef enum utun_crypto_dir { - UTUN_CRYPTO_DIR_IN = 1, - UTUN_CRYPTO_DIR_OUT, - UTUN_CRYPTO_DIR_MAX, -} utun_crypto_dir_t; - -#define UTUN_CRYPTO_CTX_NUM_DIRS 2 - -#define BITSTOBYTES(n) (n >> 3) -#define BYTESTOBITS(n) (n << 3) - -#define MAX_KEY_AUTH_LEN_BITS 512 // corresponds to SHA512 -#define MAX_KEY_AUTH_LEN_BYTES (BITSTOBYTES(MAX_KEY_AUTH_LEN_BITS)) -#define MAX_KEY_ENC_LEN_BITS 256 // corresponds to AES256 -#define MAX_KEY_ENC_LEN_BYTES (BITSTOBYTES(MAX_KEY_ENC_LEN_BITS)) - -typedef enum utun_crypto_type { - UTUN_CRYPTO_TYPE_IPSEC = 1, - UTUN_CRYPTO_TYPE_DTLS, - UTUN_CRYPTO_TYPE_MAX, -} utun_crypto_type_t; - -typedef enum if_utun_crypto_ipsec_mode { - IF_UTUN_CRYPTO_IPSEC_MODE_NONE = 0, - IF_UTUN_CRYPTO_IPSEC_MODE_TRANSPORT, - IF_UTUN_CRYPTO_IPSEC_MODE_TUNNEL, - IF_UTUN_CRYPTO_IPSEC_MODE_MAX, -} if_utun_crypto_ipsec_mode_t; - -typedef enum if_utun_crypto_ipsec_proto { - IF_UTUN_CRYPTO_IPSEC_PROTO_NONE = 0, - IF_UTUN_CRYPTO_IPSEC_PROTO_ESP, - IF_UTUN_CRYPTO_IPSEC_PROTO_AH, - IF_UTUN_CRYPTO_IPSEC_PROTO_MAX, -} if_utun_crypto_ipsec_proto_t; - -typedef enum if_utun_crypto_ipsec_auth { - IF_UTUN_CRYPTO_IPSEC_AUTH_NONE = 0, - IF_UTUN_CRYPTO_IPSEC_AUTH_MD5, - IF_UTUN_CRYPTO_IPSEC_AUTH_SHA1, - IF_UTUN_CRYPTO_IPSEC_AUTH_SHA256, - IF_UTUN_CRYPTO_IPSEC_AUTH_SHA384, - IF_UTUN_CRYPTO_IPSEC_AUTH_SHA512, - IF_UTUN_CRYPTO_IPSEC_AUTH_MAX, -} if_utun_crypto_ipsec_auth_t; 
- -typedef enum if_utun_crypto_ipsec_enc { - IF_UTUN_CRYPTO_IPSEC_ENC_NONE = 0, - IF_UTUN_CRYPTO_IPSEC_ENC_DES, - IF_UTUN_CRYPTO_IPSEC_ENC_3DES, - IF_UTUN_CRYPTO_IPSEC_ENC_AES128, - IF_UTUN_CRYPTO_IPSEC_ENC_AES256, - IF_UTUN_CRYPTO_IPSEC_ENC_MAX, -} if_utun_crypto_ipsec_enc_t; - -typedef enum if_utun_crypto_ipsec_keepalive { - IF_UTUN_CRYPTO_IPSEC_KEEPALIVE_NONE = 0, - IF_UTUN_CRYPTO_IPSEC_KEEPALIVE_NATT, - IF_UTUN_CRYPTO_IPSEC_KEEPALIVE_ESP, - IF_UTUN_CRYPTO_IPSEC_KEEPALIVE_MAX, -} if_utun_crypto_ipsec_keepalive_t; - -typedef enum if_utun_crypto_ipsec_natd { - IF_UTUN_CRYPTO_IPSEC_NATD_NONE = 0, - IF_UTUN_CRYPTO_IPSEC_NATD_MINE, - IF_UTUN_CRYPTO_IPSEC_NATD_PEER, - IF_UTUN_CRYPTO_IPSEC_NATD_BOTH, - IF_UTUN_CRYPTO_IPSEC_NATD_MAX, -} if_utun_crypto_ipsec_natd_t; - -// structures used for storing the App's keying index arguments -typedef struct utun_crypto_keys_idx_ipsec_args_v1 { - struct sockaddr_storage src_addr; // v4 or v6 socket address (ignore port numbers) - struct sockaddr_storage dst_addr; // v4 or v6 socket address (ignore port numbers) - if_utun_crypto_ipsec_proto_t proto; - if_utun_crypto_ipsec_mode_t mode; - u_int32_t reqid; // policy's reqid, default to 0 for now since we are avoiding policies. 
- u_int32_t spi; // 0 when requesting the index, otherwise it contains the resulting index - u_int32_t spirange_min; // default to 0 - u_int32_t spirange_max; // default to 0xffffffff -} __attribute__((packed)) utun_crypto_keys_idx_ipsec_args_v1_t; - -typedef struct utun_crypto_keys_idx_dtls_args_v1 { - // stub for DTLS keying index arguments - u_int32_t unused; // place holder -} __attribute__((packed)) utun_crypto_keys_idx_dtls_args_v1_t; - -// App's parent structure for sending/storing keying index arguments -typedef struct utun_crypto_keys_idx_args { - utun_crypto_ver_t ver; - utun_crypto_type_t type; - utun_crypto_dir_t dir; - u_int32_t args_ulen; - u_int32_t varargs_buflen; - union { - // don't change the order, number, or size of elements above this line (in this struct). otherwise UTUN_CRYPTO_CTX_IDX_ARGS_HDR_SIZE breaks backwards compatibility - utun_crypto_keys_idx_ipsec_args_v1_t ipsec_v1; - utun_crypto_keys_idx_dtls_args_v1_t dtls_v1; - // future (additional) versions of the arguments may be placed here - } u; - u_int8_t varargs_buf[0]; -} __attribute__((aligned(4), packed)) utun_crypto_keys_idx_args_t; - -// structures used for storing the App's keying material arguments -typedef struct utun_crypto_keys_ipsec_args_v1 { - struct sockaddr_storage src_addr; // v4 or v6 socket address (ignore port numbers) - struct sockaddr_storage dst_addr; // v4 or v6 socket address (ignore port numbers) - if_utun_crypto_ipsec_proto_t proto; - if_utun_crypto_ipsec_mode_t mode; - if_utun_crypto_ipsec_auth_t alg_auth; - if_utun_crypto_ipsec_enc_t alg_enc; - if_utun_crypto_ipsec_keepalive_t keepalive; - if_utun_crypto_ipsec_natd_t natd; - u_int8_t replay; // window size default to 4 - u_int8_t punt_rx_keepalive; - u_int16_t interval_tx_keepalive; - u_int16_t key_auth_len; // 128 or 160 or 192 or 256 or 384 or 512 - u_int16_t key_enc_len; // 64 or 128 or 192 or 256 - u_int16_t natt_port; // if non-zero flags will be set to include SADB_X_EXT_NATT - u_int16_t unused; - 
u_int32_t seq; // default to 0 - u_int32_t spi; - u_int32_t pid; // vpnagent's process id - u_int32_t reqid; // policy's reqid, default to 0 for now since we are avoiding policies. - u_int64_t lifetime_hard; // value in seconds - u_int64_t lifetime_soft; // value in seconds - // key_auth and key_enc will actually be stored in utun_crypto_KEYS_args_t.varargs_buf -} __attribute__((packed)) utun_crypto_keys_ipsec_args_v1_t; - -typedef struct utun_crypto_keys_dtls_args_v1 { - // stub for DTLS keying material arguments - u_int32_t unused; // place holder -} __attribute__((packed)) utun_crypto_keys_dtls_args_v1_t; - -// App's parent structure for sending/storing keying material arguments -typedef struct utun_crypto_keys_args { - utun_crypto_ver_t ver; - utun_crypto_type_t type; - utun_crypto_dir_t dir; - u_int32_t args_ulen; - u_int32_t varargs_buflen; - union { - // don't change the order, number, or size of elements above this line (in this struct). otherwise UTUN_CRYPTO_KEYS_ARGS_HDR_SIZE breaks backwards compatibility - utun_crypto_keys_ipsec_args_v1_t ipsec_v1; - utun_crypto_keys_dtls_args_v1_t dtls_v1; - // future (additional) versions of the arguments may be placed here - } u; - u_int8_t varargs_buf[0]; -} __attribute__((aligned(4), packed)) utun_crypto_keys_args_t; - -// structures used for storing the App's crypto arguments -typedef struct utun_crypto_ipsec_args_v1 { - // stub for IPSec crypto context arguments - u_int32_t unused; // place holder -} __attribute__((packed)) utun_crypto_ipsec_args_v1_t; - -typedef struct utun_crypto_dtls_args_v1 { - // stub for DTLS crypto context arguments - int kpi_handle; -} __attribute__((packed)) utun_crypto_dtls_args_v1_t; - -// App's parent structure for starting/stopping crypto -typedef struct utun_crypto_args { - utun_crypto_ver_t ver; - utun_crypto_type_t type; - u_int32_t stop_data_traffic; - u_int32_t args_ulen; - u_int32_t varargs_buflen; - union { - // don't change the order, number, or size of elements above this 
line (in this struct). otherwise UTUN_CRYPTO_ARGS_HDR_SIZE breaks backwards compatibility - utun_crypto_ipsec_args_v1_t ipsec_v1; - utun_crypto_dtls_args_v1_t dtls_v1; - // future (additional) versions of the arguments may be placed here - } u; - u_int8_t varargs_buf[0]; // must be at the end of this struct -} __attribute__((aligned(4), packed)) utun_crypto_args_t; - -typedef enum { - UTUN_CRYPTO_INNER_TYPE_IPv4 = 1, - UTUN_CRYPTO_INNER_TYPE_IPv6, - UTUN_CRYPTO_INNER_TYPE_MAX, -} utun_crypto_framer_inner_type_t; - -typedef struct utun_crypto_framer_ipsec_args_v1 { - // stub for IPSec framer arguments - u_int32_t unused; // place holder -} __attribute__((packed)) utun_crypto_framer_ipsec_args_v1_t; - -typedef struct utun_crypto_framer_dtls_in_args_v1 { - int in_pattern_len; - int in_pattern_mask_len; - int in_data_offset; - // in_pattern, in_pattern_mask will actually be stored in utun_crypto_framer_args_t.varargs_buf -} __attribute__((packed)) utun_crypto_framer_dtls_in_args_v1_t; - -typedef struct utun_crypto_framer_dtls_out_args_v1 { - int out_pattern_len; - u_int32_t len_field_mask; // 0 means unconfigured - int len_field_offset; - int len_field_extra; - u_int32_t sequence_field; - u_int32_t sequence_field_mask; // 0 means unconfigured - int sequence_field_offset; - // out_pattern will actually be stored in utun_crypto_framer_args_t.varargs_buf -} __attribute__((packed)) utun_crypto_framer_dtls_out_args_v1_t; - -typedef struct utun_crypto_framer_dtls_args_v1 { - // the following depend on utun_crypto_framer_args_t.dir - union { - // don't change the order, number, or size of elements above this line (in this struct). 
otherwise UTUN_CRYPTO_KEYS_ARGS_HDR_SIZE breaks backwards compatibility - utun_crypto_framer_dtls_in_args_v1_t in; - utun_crypto_framer_dtls_out_args_v1_t out; - // future (additional) versions of the arguments may be placed here - } u; -} __attribute__((packed)) utun_crypto_framer_dtls_args_v1_t; - -// App's parent structure for sending/storing framer arguments -typedef struct utun_crypto_framer_args { - utun_crypto_ver_t ver; - utun_crypto_type_t type; - utun_crypto_dir_t dir; - utun_crypto_framer_inner_type_t inner_type; - u_int32_t args_ulen; - u_int32_t varargs_buflen; - union { - // don't change the order, number, or size of elements above this line (in this struct). otherwise UTUN_CRYPTO_KEYS_ARGS_HDR_SIZE breaks backwards compatibility - utun_crypto_framer_ipsec_args_v1_t ipsec_v1; - utun_crypto_framer_dtls_args_v1_t dtls_v1; - // future (additional) versions of the arguments may be placed here - } u; - u_int8_t varargs_buf[0]; -} __attribute__((aligned(4), packed)) utun_crypto_framer_args_t; - -#define utun_crypto_framer_args_dtls_in(framer) framer->u.dtls_v1.u.in -#define utun_crypto_framer_args_dtls_out(framer) framer->u.dtls_v1.u.out - -#ifdef KERNEL_PRIVATE - -#include -#include -#include -#include -#include -#include -#include - -struct utun_pcb; - -// structures used for storing kernel's keying material runtime state -typedef struct utun_crypto_keys_ipsec_state { - // kernel's ipsec keying material state - u_int32_t spi; - struct secashead *sah; - struct secasvar *sav; - u_int8_t proto; - u_int8_t ifamily; - u_int8_t mode; - u_int8_t unused; -} __attribute__((packed)) utun_crypto_keys_ipsec_state_t; - -typedef struct utun_crypto_keys_dtls_state { - // stub for kernel's DTLS keying material state - u_int32_t unused; // place holder -} __attribute__((packed)) utun_crypto_keys_dtls_state_t; - -// kernel's parent structure for keying material state -typedef struct utun_crypto_keys_state { - union { - utun_crypto_keys_ipsec_state_t ipsec; - 
utun_crypto_keys_dtls_state_t dtls; - } u; -} __attribute__((aligned(4), packed)) utun_crypto_keys_state_t; - -// kernel's parent structure for keying material -typedef struct utun_crypto_keys { - int valid; // is valid? - utun_crypto_type_t type; - u_int16_t unused; - utun_crypto_keys_state_t state; // runtime state - LIST_ENTRY(utun_crypto_keys) chain; -} __attribute__((aligned(4), packed)) utun_crypto_keys_t; - -// structures used for storing kernel's framer runtime state -typedef struct utun_crypto_framer_ipsec_state { - // stub for kernel's IPSec framer state - u_int32_t unused; // place holder -} __attribute__((packed)) utun_crypto_framer_ipsec_state_t; - -typedef struct utun_crypto_framer_dtls_in_state { - u_int8_t *in_pattern; - int in_pattern_len; - u_int8_t *in_pattern_mask; - u_int8_t *in_pattern_masked; - int in_data_offset; - struct bpf_program in_pattern_filter; -} __attribute__((packed)) utun_crypto_framer_dtls_in_state_t; - -typedef struct utun_crypto_framer_dtls_out_state { - u_int8_t *out_pattern; - int out_pattern_len; - u_int32_t len_field_mask; // 0 means unconfigured - int len_field_offset; - int len_field_extra; - u_int32_t sequence_field; - u_int32_t sequence_field_initval; - u_int32_t sequence_field_mask; // 0 means unconfigured - int sequence_field_offset; -} __attribute__((packed)) utun_crypto_framer_dtls_out_state_t; - -typedef struct utun_crypto_framer_dtls_state { - union { - // don't change the order, number, or size of elements above this line (in this struct). 
otherwise UTUN_CRYPTO_KEYS_ARGS_HDR_SIZE breaks backwards compatibility - utun_crypto_framer_dtls_in_state_t in; - utun_crypto_framer_dtls_out_state_t out; - // future (additional) versions of the arguments may be placed here - } u; -} __attribute__((packed)) utun_crypto_framer_dtls_state_t; - -// kernel's parent structure for framer state -typedef struct utun_crypto_framer_state { - union { - utun_crypto_framer_ipsec_state_t ipsec; - utun_crypto_framer_dtls_state_t dtls; - } u; -} __attribute__((aligned(4), packed)) utun_crypto_framer_state_t; - -// kernel's parent structure for the framer -typedef struct utun_crypto_framer { - int valid; // is valid? - utun_crypto_type_t type; - utun_crypto_dir_t dir; - utun_crypto_framer_inner_type_t inner_type; - protocol_family_t inner_protocol_family; - utun_crypto_framer_state_t state; // runtime state - LIST_ENTRY(utun_crypto_framer) framer_chain; -} __attribute__((aligned(4), packed)) utun_crypto_framer_t; - -#define UTUN_CRYPTO_INNER_TYPE_TO_IDX(type) (type - 1) -#define UTUN_CRYPTO_IDX_TO_INNER_TYPE(idx) (idx + 1) -#define UTUN_CRYPTO_INNER_TYPE_IDX_MAX UTUN_CRYPTO_INNER_TYPE_TO_IDX(UTUN_CRYPTO_INNER_TYPE_MAX) - -#define UTUN_CRYPTO_DIR_TO_IDX(dir) (dir - 1) -#define UTUN_CRYPTO_IDX_TO_DIR(idx) (idx + 1) -#define UTUN_CRYPTO_DIR_IDX_MAX UTUN_CRYPTO_DIR_TO_IDX(UTUN_CRYPTO_DIR_MAX) - -#define utun_crypto_framer_state_dtls_in(framer) framer->state.u.dtls.u.in -#define utun_crypto_framer_state_dtls_out(framer) framer->state.u.dtls.u.out - -// kernel's parent structure for all crypto stuff -typedef struct utun_crypto_ctx { - int valid; - utun_crypto_type_t type; - u_int16_t unused; - LIST_HEAD(chain, utun_crypto_keys) keys_listhead; - LIST_HEAD(framer_chain, utun_crypto_framer) framer_listheads[UTUN_CRYPTO_INNER_TYPE_IDX_MAX]; - int num_framers; - int kpi_handle; - caddr_t kpi_ref; - int kpi_refcnt; -} __attribute__((aligned(4), packed)) utun_crypto_ctx_t; - -#define UTUN_CRYPTO_KEYS_IDX_ARGS_HDR_SIZE 
((size_t)(&((utun_crypto_keys_idx_args_t *)0)->u)) -#define UTUN_CRYPTO_KEYS_IDX_ARGS_VARARGS_BUF(args) ((u_int8_t *)args + UTUN_CRYPTO_KEYS_IDX_ARGS_HDR_SIZE + args->args_ulen) -#define UTUN_CRYPTO_KEYS_IDX_ARGS_TOTAL_SIZE(args) ((size_t)(UTUN_CRYPTO_KEYS_IDX_ARGS_HDR_SIZE + args->args_ulen + args->varargs_buflen)) - -#define UTUN_CRYPTO_KEYS_ARGS_HDR_SIZE ((size_t)(&((utun_crypto_keys_args_t *)0)->u)) -#define UTUN_CRYPTO_KEYS_ARGS_VARARGS_BUF(args) ((u_int8_t *)args + UTUN_CRYPTO_KEYS_ARGS_HDR_SIZE + args->args_ulen) -#define UTUN_CRYPTO_KEYS_ARGS_TOTAL_SIZE(args) ((size_t)(UTUN_CRYPTO_KEYS_ARGS_HDR_SIZE + args->args_ulen + args->varargs_buflen)) - -#define UTUN_CRYPTO_FRAMER_ARGS_HDR_SIZE ((size_t)(&((utun_crypto_framer_args_t *)0)->u)) -#define UTUN_CRYPTO_FRAMER_ARGS_VARARGS_BUF(args) ((u_int8_t *)args + UTUN_CRYPTO_FRAMER_ARGS_HDR_SIZE + args->args_ulen) -#define UTUN_CRYPTO_FRAMER_ARGS_TOTAL_SIZE(args) ((size_t)(UTUN_CRYPTO_FRAMER_ARGS_HDR_SIZE + args->args_ulen + args->varargs_buflen)) - -#define UTUN_CRYPTO_ARGS_HDR_SIZE ((size_t)(&((utun_crypto_args_t *)0)->u)) -#define UTUN_CRYPTO_ARGS_VARARGS_BUF(args) ((u_int8_t *)args + UTUN_CRYPTO_ARGS_HDR_SIZE + args->args_ulen) -#define UTUN_CRYPTO_ARGS_TOTAL_SIZE(args) ((size_t)(UTUN_CRYPTO_ARGS_HDR_SIZE + args->args_ulen + args->varargs_buflen)) - -typedef caddr_t (*utun_crypto_kpi_connect_func)(int kpi_handle, struct utun_pcb *utun_ref); - -typedef errno_t (*utun_crypto_kpi_send_func)(caddr_t ref, mbuf_t *pkt); - -typedef struct utun_crypto_kpi_reg { - /* Dispatch functions */ - utun_crypto_type_t crypto_kpi_type; - u_int32_t crypto_kpi_flags; - utun_crypto_kpi_connect_func crypto_kpi_connect; - utun_crypto_kpi_send_func crypto_kpi_send; -} utun_crypto_kpi_reg_t; - -typedef struct utun_crypto_kpi_reg_list { - utun_crypto_kpi_reg_t reg; - struct utun_crypto_kpi_reg_list *next; -} utun_crypto_kpi_reg_list_t; - -void -utun_ctl_init_crypto(void); - -/* - * Summary: registers the crypto KPI's Kext routines with 
UTUN... so that UTUN can make calls into it (e.g. DTLS) - */ -errno_t -utun_crypto_kpi_register(utun_crypto_kpi_reg_t *reg); - -void -utun_cleanup_crypto(struct utun_pcb *pcb); - -errno_t -utun_ctl_enable_crypto(__unused kern_ctl_ref kctlref, - __unused u_int32_t unit, - __unused void *unitinfo, - __unused int opt, - void *data, - size_t len); - -errno_t -utun_ctl_disable_crypto(__unused kern_ctl_ref kctlref, - __unused u_int32_t unit, - __unused void *unitinfo, - __unused int opt, - void *data, - size_t len); - -errno_t -utun_ctl_config_crypto_keys(__unused kern_ctl_ref kctlref, - __unused u_int32_t unit, - __unused void *unitinfo, - __unused int opt, - void *data, - size_t len); - -errno_t -utun_ctl_unconfig_crypto_keys(__unused kern_ctl_ref kctlref, - __unused u_int32_t unit, - __unused void *unitinfo, - __unused int opt, - void *data, - size_t len); - -errno_t -utun_ctl_config_crypto_framer(__unused kern_ctl_ref kctlref, - __unused u_int32_t unit, - __unused void *unitinfo, - __unused int opt, - void *data, - size_t len); - -errno_t -utun_ctl_unconfig_crypto_framer(__unused kern_ctl_ref kctlref, - __unused u_int32_t unit, - __unused void *unitinfo, - __unused int opt, - void *data, - size_t len); - -errno_t -utun_ctl_generate_crypto_keys_idx(__unused kern_ctl_ref kctlref, - __unused u_int32_t unit, - __unused void *unitinfo, - __unused int opt, - void *data, - size_t *len); - -errno_t -utun_ctl_stop_crypto_data_traffic(__unused kern_ctl_ref kctlref, - __unused u_int32_t unit, - __unused void *unitinfo, - __unused int opt, - void *data, - size_t len); - -errno_t -utun_ctl_start_crypto_data_traffic(__unused kern_ctl_ref kctlref, - __unused u_int32_t unit, - __unused void *unitinfo, - __unused int opt, - void *data, - size_t len); - -int -utun_pkt_crypto_output(struct utun_pcb *pcb, mbuf_t *m); - -#endif // KERNEL_PRIVATE - -#endif // _NET_IF_UTUN_CRYPTO_H_ diff --git a/bsd/net/if_utun_crypto_dtls.c b/bsd/net/if_utun_crypto_dtls.c deleted file mode 100644 index 
3565c4feb..000000000 --- a/bsd/net/if_utun_crypto_dtls.c +++ /dev/null @@ -1,1045 +0,0 @@ -/* - * Copyright (c) 2012-2013 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -extern errno_t utun_pkt_input (struct utun_pcb *pcb, mbuf_t m); - -static UInt32 dtls_kpi_callbacks_inited = FALSE; -static unsigned int dtls_kpi_flags = 0; -static utun_crypto_kpi_connect_func dtls_kpi_connect = (__typeof__(dtls_kpi_connect))NULL; -static utun_crypto_kpi_send_func dtls_kpi_send = (__typeof__(dtls_kpi_send))NULL; - -// convert this mutex to shared lock -static UInt32 dtls_ctl_mutex_inited = FALSE; -static lck_grp_t *dtls_ctl_mutex_grp = NULL; -static lck_grp_attr_t *dtls_ctl_mutex_grp_attr = NULL; -static lck_attr_t *dtls_ctl_mutex_attr = NULL; -static lck_mtx_t dtls_ctl_mutex; - -#define utun_ctl_get_first_framer(ctx, inner_type) (utun_crypto_framer_t *)LIST_FIRST(&ctx->framer_listheads[UTUN_CRYPTO_INNER_TYPE_TO_IDX(inner_type)]) -#define utun_get_framer_listhead(ctx, inner_type) &ctx->framer_listheads[UTUN_CRYPTO_INNER_TYPE_TO_IDX(inner_type)] - -static void -utun_ctl_clr_dtls_framer (utun_crypto_framer_t *rem_framer) -{ - if (!rem_framer) return; - - // TOFIX: switch to BPF - LIST_REMOVE(rem_framer, framer_chain); // unchain the framer - if (rem_framer->dir == UTUN_CRYPTO_DIR_IN) { - if (utun_crypto_framer_state_dtls_in(rem_framer).in_pattern) { - utun_free(utun_crypto_framer_state_dtls_in(rem_framer).in_pattern); - } - if (utun_crypto_framer_state_dtls_in(rem_framer).in_pattern_mask) { - utun_free(utun_crypto_framer_state_dtls_in(rem_framer).in_pattern_mask); - } - if (utun_crypto_framer_state_dtls_in(rem_framer).in_pattern_masked) { - utun_free(utun_crypto_framer_state_dtls_in(rem_framer).in_pattern_masked); - } - } else { - if (utun_crypto_framer_state_dtls_out(rem_framer).out_pattern) { - utun_free(utun_crypto_framer_state_dtls_out(rem_framer).out_pattern); - } - } - utun_free(rem_framer); - - return; -} - -static void 
-utun_ctl_clr_dtls_framers (utun_crypto_framer_t *first_framer) -{ - utun_crypto_framer_t *cur_framer, *nxt_framer; - - // check framer->state.u.dtls.u.in.listhead for duplicates; - for (cur_framer = first_framer; - cur_framer != NULL; - cur_framer = nxt_framer) { - nxt_framer = (__typeof__(nxt_framer))LIST_NEXT(cur_framer, framer_chain); - utun_ctl_clr_dtls_framer(cur_framer); - } - - return; -} - -static void -utun_ctl_clr_dtls_all_framers (utun_crypto_ctx_t *crypto_ctx) -{ - utun_ctl_clr_dtls_framers(utun_ctl_get_first_framer(crypto_ctx, UTUN_CRYPTO_INNER_TYPE_IPv4)); - utun_ctl_clr_dtls_framers(utun_ctl_get_first_framer(crypto_ctx, UTUN_CRYPTO_INNER_TYPE_IPv6)); - crypto_ctx->num_framers = 0; -} - -static void -utun_ctl_restart_dtls_framers (utun_crypto_framer_t *first_framer) -{ - utun_crypto_framer_t *cur_framer; - - // check framer->state.u.dtls.u.in.listhead for duplicates; - for (cur_framer = first_framer; - cur_framer != NULL; - cur_framer = (__typeof__(cur_framer))LIST_NEXT(cur_framer, framer_chain)) { - utun_crypto_framer_state_dtls_out(cur_framer).sequence_field = utun_crypto_framer_state_dtls_out(cur_framer).sequence_field_initval; - } - - return; -} - -static void -utun_ctl_restart_dtls_all_framers (utun_crypto_ctx_t *crypto_ctx) -{ - utun_ctl_restart_dtls_framers(utun_ctl_get_first_framer(crypto_ctx, UTUN_CRYPTO_INNER_TYPE_IPv4)); - utun_ctl_restart_dtls_framers(utun_ctl_get_first_framer(crypto_ctx, UTUN_CRYPTO_INNER_TYPE_IPv6)); -} - -static int -is_pattern_all_zeroes (u_int8_t *pattern, - int pattern_len) -{ - int i; - - if (!pattern || !pattern_len) return FALSE; // false if args are NULL - - for (i = 0; i < pattern_len; i++) { - if (pattern[i] != 0) return FALSE; - } - return TRUE; -} - -static int -is_pattern_masked_all_zeroes (u_int8_t *pattern, - u_int8_t *pattern_mask, - int pattern_len) -{ - int i; - - if (!pattern || !pattern_mask || !pattern_len) return FALSE; // false if args are NULL - - for (i = 0; i < pattern_len; i++) { - if 
((pattern[i] & pattern_mask[i])) return FALSE; - } - return TRUE; -} - -static void -utun_ctl_calc_dtls_framer_pattern_and_mask (u_int8_t *pattern_masked, u_int8_t *pattern, u_int8_t *mask, int len) -{ - int i; - for (i = 0; i < len; i++) { - pattern_masked[i] = (pattern[i] & mask[i]); - } -} - -static Boolean -utun_ctl_did_dtls_framer_pattern_match (u_int8_t *input, u_int8_t *pattern_masked, int len) -{ - int i; - for (i = 0; i < len; i++) { - if ((input[i] & pattern_masked[i]) != pattern_masked[i]) return FALSE; - } - return TRUE; -} - -static Boolean -utun_pkt_dtls_input_frame_is_data(utun_crypto_ctx_t *crypto_ctx, - mbuf_t *pkt, - protocol_family_t family, - int *striplen) -{ - u_int8_t *p; - utun_crypto_framer_t *cur_framer; - - p = mtod(*pkt, __typeof__(p)); - for (cur_framer = utun_ctl_get_first_framer(crypto_ctx, utun_crypto_framer_protocol_family_to_inner_type(family)); - cur_framer != NULL; - cur_framer = (__typeof__(cur_framer))LIST_NEXT(cur_framer, framer_chain)) { - if (m_pktlen(*pkt) < utun_crypto_framer_state_dtls_in(cur_framer).in_pattern_len) { - continue; - } - if ((*pkt)->m_len < utun_crypto_framer_state_dtls_in(cur_framer).in_pattern_len) { - *pkt = m_pullup(*pkt, utun_crypto_framer_state_dtls_in(cur_framer).in_pattern_len); - if (!*pkt || - (*pkt)->m_len < utun_crypto_framer_state_dtls_in(cur_framer).in_pattern_len) { - return FALSE; - } - p = mtod(*pkt, __typeof__(p)); - } - // TOFIX: switch to BPF - if (utun_ctl_did_dtls_framer_pattern_match(p, - utun_crypto_framer_state_dtls_in(cur_framer).in_pattern_masked, - utun_crypto_framer_state_dtls_in(cur_framer).in_pattern_len)) { - *striplen = utun_crypto_framer_state_dtls_in(cur_framer).in_data_offset; - return TRUE; - } - } - return FALSE; -} - -#define GETLONG(l, cp) { \ - (l) = *(cp)++ << 8; \ - (l) |= *(cp)++; (l) <<= 8; \ - (l) |= *(cp)++; (l) <<= 8; \ - (l) |= *(cp)++; \ - } -#define PUTLONG(l, cp) { \ - *(cp)++ = (u_char) ((l) >> 24); \ - *(cp)++ = (u_char) ((l) >> 16); \ - *(cp)++ = 
(u_char) ((l) >> 8); \ - *(cp)++ = (u_char) (l); \ - } - -static int -utun_pkt_dtls_output_frame_encapsulate (utun_crypto_ctx_t *crypto_ctx, - mbuf_t *pkt, - protocol_family_t proto) -{ - u_int8_t *p; - utun_crypto_framer_t *cur_framer; - u_int32_t pkt_len; - - // TOFIX: switch to BPF - - if (!crypto_ctx->num_framers) { - return 0; - } - if (proto != AF_INET && proto != AF_INET6) { - printf("%s: unsupported proto %d\n", __FUNCTION__, proto); - return EINVAL; - } - - for (cur_framer = utun_ctl_get_first_framer(crypto_ctx, utun_crypto_framer_protocol_family_to_inner_type(proto)); - cur_framer != NULL && !utun_crypto_framer_state_dtls_out(cur_framer).out_pattern; - cur_framer = (__typeof__(cur_framer))LIST_NEXT(cur_framer, framer_chain)); - if (!cur_framer || - !utun_crypto_framer_state_dtls_out(cur_framer).out_pattern_len) { - return 0; - } - - pkt_len = m_pktlen(*pkt); - - // prepend/encapsulate the output pattern - if (mbuf_prepend(pkt, utun_crypto_framer_state_dtls_out(cur_framer).out_pattern_len, MBUF_DONTWAIT) != 0) { - printf("%s - ifnet_output prepend failed\n", __FUNCTION__); - return ENOBUFS; - } - - p = mtod(*pkt, __typeof__(p)); - memcpy(p, - utun_crypto_framer_state_dtls_out(cur_framer).out_pattern, - utun_crypto_framer_state_dtls_out(cur_framer).out_pattern_len); - // fill a "length" field... if configured - if (utun_crypto_framer_state_dtls_out(cur_framer).len_field_mask) { - u_int32_t tmp; - u_int8_t *q = p + utun_crypto_framer_state_dtls_out(cur_framer).len_field_offset; - GETLONG(tmp, q); - tmp &= ((pkt_len + utun_crypto_framer_state_dtls_out(cur_framer).len_field_extra) & utun_crypto_framer_state_dtls_out(cur_framer).len_field_mask); - q = p + utun_crypto_framer_state_dtls_out(cur_framer).len_field_offset; - PUTLONG(tmp, q); - } - // fill a "sequence" field... 
if configured - if (utun_crypto_framer_state_dtls_out(cur_framer).sequence_field_mask) { - u_int32_t tmp = (utun_crypto_framer_state_dtls_out(cur_framer).sequence_field & utun_crypto_framer_state_dtls_out(cur_framer).sequence_field_mask); - u_int8_t *q = p + utun_crypto_framer_state_dtls_out(cur_framer).sequence_field_offset; - GETLONG(tmp, q); - tmp &= (utun_crypto_framer_state_dtls_out(cur_framer).sequence_field & utun_crypto_framer_state_dtls_out(cur_framer).sequence_field_mask); - q = p + utun_crypto_framer_state_dtls_out(cur_framer).sequence_field_offset; - PUTLONG(tmp, q); - utun_crypto_framer_state_dtls_out(cur_framer).sequence_field++; - } - return 0; -} - -void -utun_ctl_init_crypto_dtls (void) -{ - if (OSCompareAndSwap(FALSE, TRUE, &dtls_ctl_mutex_inited)) { - if (!dtls_ctl_mutex_grp_attr) - dtls_ctl_mutex_grp_attr = lck_grp_attr_alloc_init(); - if (!dtls_ctl_mutex_grp) - dtls_ctl_mutex_grp = lck_grp_alloc_init("utun-crypto", dtls_ctl_mutex_grp_attr); - if (!dtls_ctl_mutex_attr) - dtls_ctl_mutex_attr = lck_attr_alloc_init(); - - lck_mtx_init(&dtls_ctl_mutex, dtls_ctl_mutex_grp, dtls_ctl_mutex_attr); - } -} - -/* - * Summary: registers the DTLS Kext routines with UTUN... 
so that UTUN can make calls into DTLS - */ -errno_t -utun_ctl_register_dtls (utun_crypto_kpi_reg_t *reg) -{ - //printf("%s: entering\n", __FUNCTION__); - if (!reg) return EINVAL; - - //printf("%s: type %d\n", __FUNCTION__, reg->crypto_kpi_type); - if (reg->crypto_kpi_type != UTUN_CRYPTO_TYPE_DTLS) { - return EINVAL; - } - - if (!reg->crypto_kpi_connect) { - return EINVAL; - } - - if (!reg->crypto_kpi_send) { - return EINVAL; - } - - // printf("%s: pre-value of dtls_kpi_callbacks_inited %lu\n", __FUNCTION__, - // dtls_kpi_callbacks_inited); - if (OSCompareAndSwap(FALSE, TRUE, &dtls_kpi_callbacks_inited)) { - dtls_kpi_flags = reg->crypto_kpi_flags; - dtls_kpi_connect = reg->crypto_kpi_connect; - dtls_kpi_send = reg->crypto_kpi_send; - } - //printf("%s: post-value of dtls_kpi_callbacks_inited %lu\n", __FUNCTION__, - // dtls_kpi_callbacks_inited); - return 0; -} - -/* - * Summary: enables dtls crypto info for the specified utun. dtls ref is passed into args. - */ -void -utun_ctl_enable_crypto_dtls(struct utun_pcb *pcb, utun_crypto_args_t *args) -{ - utun_crypto_ctx_t *crypto_ctx; - - lck_mtx_lock(&dtls_ctl_mutex); - - //printf("%s: entering, flags %x, kpi-handle %x, kpi-ref %p, kpi-refcnt %d\n", __FUNCTION__, pcb->utun_flags, crypto_ctx->kpi_handle, crypto_ctx->kpi_ref, crypto_ctx->kpi_refcnt); - - crypto_ctx = &pcb->utun_crypto_ctx[UTUN_CRYPTO_DIR_TO_IDX(UTUN_CRYPTO_DIR_IN)]; - if (crypto_ctx->valid) { - printf("%s: dtls already enabled (prev %u, now %u)\n", __FUNCTION__, - crypto_ctx->kpi_handle, args->u.dtls_v1.kpi_handle); - lck_mtx_unlock(&dtls_ctl_mutex); - return; - } - - crypto_ctx = &pcb->utun_crypto_ctx[UTUN_CRYPTO_DIR_TO_IDX(UTUN_CRYPTO_DIR_OUT)]; - if (!crypto_ctx->valid) { - crypto_ctx->kpi_handle = args->u.dtls_v1.kpi_handle; - } else { - printf("%s: dtls already enabled for egress (prev %u, now %u)\n", __FUNCTION__, - crypto_ctx->kpi_handle, args->u.dtls_v1.kpi_handle); - lck_mtx_unlock(&dtls_ctl_mutex); - return; - } - // crypto_ctx->valid will be set 
in utun_ctl_enable_crypto - lck_mtx_unlock(&dtls_ctl_mutex); - return; -} - -/* - * Summary: disables dtls crypto info for the specified utun. - */ -void -utun_ctl_disable_crypto_dtls(struct utun_pcb *pcb) -{ - utun_crypto_ctx_t *crypto_ctx; - - lck_mtx_lock(&dtls_ctl_mutex); - - //printf("%s: entering, flags %x, kpi-handle %d, kpi-ref %p, kpi-refcnt %d\n", __FUNCTION__, pcb->utun_flags, crypto_ctx->kpi_handle, crypto_ctx->kpi_ref, crypto_ctx->kpi_refcnt); - - crypto_ctx = &pcb->utun_crypto_ctx[UTUN_CRYPTO_DIR_TO_IDX(UTUN_CRYPTO_DIR_IN)]; - if (crypto_ctx->valid && - crypto_ctx->type == UTUN_CRYPTO_TYPE_DTLS) { - utun_ctl_clr_dtls_all_framers(crypto_ctx); - } - - crypto_ctx = &pcb->utun_crypto_ctx[UTUN_CRYPTO_DIR_TO_IDX(UTUN_CRYPTO_DIR_OUT)]; - if (!crypto_ctx->valid || - crypto_ctx->type != UTUN_CRYPTO_TYPE_DTLS) { - lck_mtx_unlock(&dtls_ctl_mutex); - return; - } - if (crypto_ctx->kpi_ref) { - if (dtls_kpi_connect) { - (void)dtls_kpi_connect(crypto_ctx->kpi_handle, NULL); - if (--crypto_ctx->kpi_refcnt == 0) { - crypto_ctx->kpi_ref = (__typeof__(crypto_ctx->kpi_ref))NULL; - crypto_ctx->kpi_handle = UTUN_CRYPTO_DTLS_HANDLE_INVALID; - } else { - // printf("%s: ### dtls_kpi_refcnt %d not yet zero\n", - // __FUNCTION__, crypto_ctx->kpi_refcnt); - } - } else { - printf("%s: ### dtls_ctl_connect unavailable\n", __FUNCTION__); - lck_mtx_unlock(&dtls_ctl_mutex); - return; - } - } else { - if (crypto_ctx->kpi_handle < 0) { - printf("%s: dtls already disabled\n", __FUNCTION__); - lck_mtx_unlock(&dtls_ctl_mutex); - return; - } - crypto_ctx->kpi_handle = UTUN_CRYPTO_DTLS_HANDLE_INVALID; - } - utun_ctl_clr_dtls_all_framers(crypto_ctx); - lck_mtx_unlock(&dtls_ctl_mutex); - return; -} - -static utun_crypto_framer_t * -utun_ctl_get_dtls_in_framer (utun_crypto_framer_t *first_framer, - u_int8_t *in_pattern, - int in_pattern_len, - u_int8_t *in_pattern_mask, - int in_pattern_mask_len) -{ - utun_crypto_framer_t *cur_framer; - - // check framer->u.listhead for duplicates; - for 
(cur_framer = first_framer; - cur_framer != NULL; - cur_framer = (__typeof__(cur_framer))LIST_NEXT(cur_framer, framer_chain)) { - // TOFIX: use in_pattern_masked - if (utun_crypto_framer_state_dtls_in(cur_framer).in_pattern_len == in_pattern_len && - memcmp(utun_crypto_framer_state_dtls_in(cur_framer).in_pattern, - in_pattern, - in_pattern_len) == 0 && - utun_crypto_framer_state_dtls_in(cur_framer).in_pattern_len == in_pattern_mask_len && - memcmp(utun_crypto_framer_state_dtls_in(cur_framer).in_pattern_mask, - in_pattern_mask, - in_pattern_mask_len) == 0) { - // found - return cur_framer; - } - } - - return NULL; -} - -errno_t -utun_ctl_config_crypto_dtls_framer (utun_crypto_ctx_t *crypto_ctx, - utun_crypto_framer_args_t *args) -{ - utun_crypto_framer_t *framer, *new_framer = NULL, *dup_framer; - - if (args->ver != UTUN_CRYPTO_DTLS_VER_1) { - return EINVAL; - } - if (!args->type || args->type >= UTUN_CRYPTO_INNER_TYPE_MAX) { - return EINVAL; - } - - lck_mtx_lock(&dtls_ctl_mutex); - - if (args->dir == UTUN_CRYPTO_DIR_IN) { - // Input framer (for tunnel hdr detection and decapsulation). there can be several pattern that identify data (vs. control) packets. 
- - // First, the args need to be verified for errors/inconsistencies - // pattern and mask have to be configured - if (!utun_crypto_framer_args_dtls_in(args).in_pattern_len || - !utun_crypto_framer_args_dtls_in(args).in_pattern_mask_len) { - lck_mtx_unlock(&dtls_ctl_mutex); - printf("%s: invalid dtls in-pattern %d mask %d\n", __FUNCTION__, - utun_crypto_framer_args_dtls_in(args).in_pattern_len, - utun_crypto_framer_args_dtls_in(args).in_pattern_mask_len); - return EINVAL; - } - // pattern and mask lengths have to match - if (utun_crypto_framer_args_dtls_in(args).in_pattern_len != utun_crypto_framer_args_dtls_in(args).in_pattern_mask_len) { - lck_mtx_unlock(&dtls_ctl_mutex); - printf("%s: inconsistent dtls in-pattern %d mask %d\n",__FUNCTION__, - utun_crypto_framer_args_dtls_in(args).in_pattern_len, - utun_crypto_framer_args_dtls_in(args).in_pattern_mask_len); - return EINVAL; - } - // check for len inconsistencies - if ((u_int32_t)utun_crypto_framer_args_dtls_in(args).in_pattern_len + (u_int32_t)utun_crypto_framer_args_dtls_in(args).in_pattern_mask_len != args->varargs_buflen) { - lck_mtx_unlock(&dtls_ctl_mutex); - printf("%s: inconsistent dtls in-pattern %d mask %d, total %d\n",__FUNCTION__, - utun_crypto_framer_args_dtls_in(args).in_pattern_len, - utun_crypto_framer_args_dtls_in(args).in_pattern_mask_len, - args->varargs_buflen); - return EINVAL; - } - // utun_crypto_framer_args_dtls_in(args).in_pattern should not be all zeros - if (is_pattern_all_zeroes(&args->varargs_buf[0], - utun_crypto_framer_args_dtls_in(args).in_pattern_len)) { - lck_mtx_unlock(&dtls_ctl_mutex); - printf("%s: in-pattern is all zeros, len %d\n",__FUNCTION__, - utun_crypto_framer_args_dtls_in(args).in_pattern_len); - return EINVAL; - } - // utun_crypto_framer_args_dtls_in(args).in_pattern_mask should not be all zeros - if (is_pattern_all_zeroes(&args->varargs_buf[utun_crypto_framer_args_dtls_in(args).in_pattern_len], - utun_crypto_framer_args_dtls_in(args).in_pattern_mask_len)) { - 
lck_mtx_unlock(&dtls_ctl_mutex); - printf("%s: in-pattern-mask is all zeros, len %d\n",__FUNCTION__, - utun_crypto_framer_args_dtls_in(args).in_pattern_mask_len); - return EINVAL; - } - // utun_crypto_framer_args_dtls_in(args).in_pattern & utun_crypto_framer_args_dtls_in(args).in_pattern_mask should not be zeros - if (is_pattern_masked_all_zeroes(&args->varargs_buf[0], - &args->varargs_buf[utun_crypto_framer_args_dtls_in(args).in_pattern_len], - utun_crypto_framer_args_dtls_in(args).in_pattern_len)) { - lck_mtx_unlock(&dtls_ctl_mutex); - printf("%s: in-pattern-masked is all zeros, len %d\n",__FUNCTION__, - utun_crypto_framer_args_dtls_in(args).in_pattern_mask_len); - return EINVAL; - } - - // Secondly, we need to be careful about existing framer configs - if (!(framer = utun_ctl_get_first_framer(crypto_ctx, args->inner_type))) { - // no framers configured - if (!(framer = utun_alloc(sizeof(*framer)))) { - lck_mtx_unlock(&dtls_ctl_mutex); - return ENOBUFS; - } - bzero(framer, sizeof(*framer)); - // fall through to fill-in the 1st framer - } else { - // at least one framer configured... 
check framer->u.listhead for duplicates; - if ((dup_framer = utun_ctl_get_dtls_in_framer(framer /* could be a list */, - &args->varargs_buf[0], - utun_crypto_framer_args_dtls_in(args).in_pattern_len, - &args->varargs_buf[utun_crypto_framer_args_dtls_in(args).in_pattern_len], - utun_crypto_framer_args_dtls_in(args).in_pattern_mask_len))) { - // duplicate - lck_mtx_unlock(&dtls_ctl_mutex); - printf("%s: ignoring duplicate framer for type %d\n",__FUNCTION__, - args->inner_type); - return 0; - } - - if (!(new_framer = utun_alloc(sizeof(*new_framer)))) { - lck_mtx_unlock(&dtls_ctl_mutex); - return ENOBUFS; - } - bzero(new_framer, sizeof(*new_framer)); - framer = new_framer; - // fall through to fill-in additional framer - } - LIST_INSERT_HEAD(utun_get_framer_listhead(crypto_ctx, args->inner_type), - new_framer, - framer_chain); - - framer->inner_type = args->inner_type; - framer->inner_protocol_family = utun_crypto_framer_inner_type_to_protocol_family(args->inner_type); - // allocate and fill the pattern - if (!(utun_crypto_framer_state_dtls_in(framer).in_pattern = utun_alloc(utun_crypto_framer_args_dtls_in(args).in_pattern_len))) { - utun_ctl_clr_dtls_framer(framer); - lck_mtx_unlock(&dtls_ctl_mutex); - return ENOBUFS; - } - memcpy(utun_crypto_framer_state_dtls_in(framer).in_pattern, - &args->varargs_buf[0], - utun_crypto_framer_args_dtls_in(args).in_pattern_len); - utun_crypto_framer_state_dtls_in(framer).in_pattern_len = utun_crypto_framer_args_dtls_in(args).in_pattern_len; - - // allocate and fill the pattern-mask - if (!(utun_crypto_framer_state_dtls_in(framer).in_pattern_mask = utun_alloc(utun_crypto_framer_args_dtls_in(args).in_pattern_mask_len))) { - utun_ctl_clr_dtls_framer(framer); - lck_mtx_unlock(&dtls_ctl_mutex); - return ENOBUFS; - } - memcpy(utun_crypto_framer_state_dtls_in(framer).in_pattern_mask, - &args->varargs_buf[utun_crypto_framer_args_dtls_in(args).in_pattern_len], - utun_crypto_framer_args_dtls_in(args).in_pattern_mask_len); - 
utun_crypto_framer_state_dtls_in(framer).in_data_offset = utun_crypto_framer_args_dtls_in(args).in_data_offset; - - if (!(utun_crypto_framer_state_dtls_in(framer).in_pattern_masked = utun_alloc(utun_crypto_framer_args_dtls_in(args).in_pattern_len))) { - utun_ctl_clr_dtls_framer(framer); - lck_mtx_unlock(&dtls_ctl_mutex); - return ENOBUFS; - } - utun_ctl_calc_dtls_framer_pattern_and_mask(utun_crypto_framer_state_dtls_in(framer).in_pattern_masked, - utun_crypto_framer_state_dtls_in(framer).in_pattern, - utun_crypto_framer_state_dtls_in(framer).in_pattern_mask, - utun_crypto_framer_state_dtls_in(framer).in_pattern_len); - // TOFIX: switch to BPF - crypto_ctx->num_framers++; - } else { - // Output Framer (for tunnel hdr encapsulation)... there can only be one for each type of traffic (see caller of this function) - - // pattern and mask have to be configured - if (!utun_crypto_framer_args_dtls_out(args).out_pattern_len) { - lck_mtx_unlock(&dtls_ctl_mutex); - printf("%s: invalid output framer, len %d\n",__FUNCTION__, - utun_crypto_framer_args_dtls_out(args).out_pattern_len); - return EINVAL; - } - // utun_crypto_framer_args_dtls_out(args).out_pattern should not be all zeros; - if (is_pattern_all_zeroes(&args->varargs_buf[0], - utun_crypto_framer_args_dtls_out(args).out_pattern_len)) { - lck_mtx_unlock(&dtls_ctl_mutex); - printf("%s: zeroed output framer, len %d\n",__FUNCTION__, - utun_crypto_framer_args_dtls_out(args).out_pattern_len); - return EINVAL; - } - - // can't have the offset/extra configured while the mask is cleared - if ((utun_crypto_framer_args_dtls_out(args).len_field_offset || utun_crypto_framer_args_dtls_out(args).len_field_extra) && !utun_crypto_framer_args_dtls_out(args).len_field_mask) { - lck_mtx_unlock(&dtls_ctl_mutex); - printf("%s: output framer has invalid length-field %d,%d,%x\n",__FUNCTION__, - (int)utun_crypto_framer_args_dtls_out(args).len_field_offset, - (int)utun_crypto_framer_args_dtls_out(args).len_field_extra, - 
utun_crypto_framer_args_dtls_out(args).len_field_mask); - return EINVAL; - } - // any length field should be within the bounds of the out-pattern - if (utun_crypto_framer_args_dtls_out(args).len_field_offset >= utun_crypto_framer_args_dtls_out(args).out_pattern_len) { - lck_mtx_unlock(&dtls_ctl_mutex); - return EINVAL; - } - - // can't have the offset configured while the mask is cleared - if ((utun_crypto_framer_args_dtls_out(args).sequence_field || utun_crypto_framer_args_dtls_out(args).sequence_field_offset) && !utun_crypto_framer_args_dtls_out(args).sequence_field_mask) { - lck_mtx_unlock(&dtls_ctl_mutex); - printf("%s: output framer has invalid sequence-field %d,%d,%x\n",__FUNCTION__, - (int)utun_crypto_framer_args_dtls_out(args).sequence_field, - (int)utun_crypto_framer_args_dtls_out(args).sequence_field_offset, - utun_crypto_framer_args_dtls_out(args).sequence_field_mask); - return EINVAL; - } - // any sequence field should be within the bounds of the out-pattern - if (utun_crypto_framer_args_dtls_out(args).sequence_field_offset >= utun_crypto_framer_args_dtls_out(args).out_pattern_len) { - lck_mtx_unlock(&dtls_ctl_mutex); - return EINVAL; - } - - // check for len inconsistencies - if ((u_int32_t)utun_crypto_framer_args_dtls_out(args).out_pattern_len != args->varargs_buflen) { - lck_mtx_unlock(&dtls_ctl_mutex); - return EINVAL; - } - - if (!(framer = utun_ctl_get_first_framer(crypto_ctx, args->inner_type))) { - if (!(framer = utun_alloc(sizeof(*framer)))) { - lck_mtx_unlock(&dtls_ctl_mutex); - return ENOBUFS; - } - bzero(framer, sizeof(*framer)); - LIST_INSERT_HEAD(utun_get_framer_listhead(crypto_ctx, args->inner_type), - new_framer, - framer_chain); - // fall through to fill-in 1st framer - } else { - // only one outbound framer may be configured.. is it a dup? 
- if (framer->inner_type == args->inner_type && - utun_crypto_framer_state_dtls_out(framer).out_pattern_len == utun_crypto_framer_args_dtls_out(args).out_pattern_len && - utun_crypto_framer_state_dtls_out(framer).out_pattern && - memcmp(utun_crypto_framer_state_dtls_out(framer).out_pattern, - &args->varargs_buf[0], - utun_crypto_framer_args_dtls_out(args).out_pattern_len) == 0) { - // found - lck_mtx_unlock(&dtls_ctl_mutex); - return 0; - } - - // overwrite the previous one - if (utun_crypto_framer_state_dtls_out(framer).out_pattern) { - utun_free(utun_crypto_framer_state_dtls_out(framer).out_pattern); - } - // fall through to fill-in additional framer - } - - framer->inner_type = args->inner_type; - framer->inner_protocol_family = utun_crypto_framer_inner_type_to_protocol_family(args->inner_type); - - // alloc and fill in the out-pattern - if (!(utun_crypto_framer_state_dtls_out(framer).out_pattern = utun_alloc(utun_crypto_framer_args_dtls_out(args).out_pattern_len))) { - utun_ctl_clr_dtls_framer(framer); - lck_mtx_unlock(&dtls_ctl_mutex); - return ENOBUFS; - } - memcpy(utun_crypto_framer_state_dtls_out(framer).out_pattern, - &args->varargs_buf[0], - utun_crypto_framer_args_dtls_out(args).out_pattern_len); - utun_crypto_framer_state_dtls_out(framer).out_pattern_len = utun_crypto_framer_args_dtls_out(args).out_pattern_len; - - utun_crypto_framer_state_dtls_out(framer).len_field_mask = utun_crypto_framer_args_dtls_out(args).len_field_mask; - utun_crypto_framer_state_dtls_out(framer).len_field_offset = utun_crypto_framer_args_dtls_out(args).len_field_offset; - utun_crypto_framer_state_dtls_out(framer).len_field_extra = utun_crypto_framer_args_dtls_out(args).len_field_extra; - utun_crypto_framer_state_dtls_out(framer).sequence_field_initval = utun_crypto_framer_args_dtls_out(args).sequence_field; - utun_crypto_framer_state_dtls_out(framer).sequence_field_mask = utun_crypto_framer_args_dtls_out(args).sequence_field_mask; - 
utun_crypto_framer_state_dtls_out(framer).sequence_field_offset = utun_crypto_framer_args_dtls_out(args).sequence_field_offset; - crypto_ctx->num_framers = 1; - } - framer->type = args->type; - framer->dir = args->dir; - framer->valid = 1; - - lck_mtx_unlock(&dtls_ctl_mutex); - return 0; -} - -int -utun_ctl_unconfig_crypto_dtls_framer (utun_crypto_ctx_t *crypto_ctx, - utun_crypto_framer_args_t *args) -{ - utun_crypto_framer_t *framer, *rem_framer; - - if (args->ver != UTUN_CRYPTO_DTLS_VER_1) { - return EINVAL; - } - if (!args->type || args->type >= UTUN_CRYPTO_INNER_TYPE_MAX) { - return EINVAL; - } - - lck_mtx_lock(&dtls_ctl_mutex); - - if (args->dir == UTUN_CRYPTO_DIR_IN) { - if (!utun_crypto_framer_args_dtls_in(args).in_pattern_len) { - // no pattern means... clear all - utun_ctl_clr_dtls_framers(utun_ctl_get_first_framer(crypto_ctx, args->inner_type)); - lck_mtx_unlock(&dtls_ctl_mutex); - return 0; - } - - // when both specified, pattern and mask lengths have to match - if (utun_crypto_framer_args_dtls_in(args).in_pattern_mask_len && - utun_crypto_framer_args_dtls_in(args).in_pattern_len != utun_crypto_framer_args_dtls_in(args).in_pattern_mask_len) { - lck_mtx_unlock(&dtls_ctl_mutex); - return EINVAL; - } - // check for len inconsistencies - if ((u_int32_t)utun_crypto_framer_args_dtls_in(args).in_pattern_len + (u_int32_t)utun_crypto_framer_args_dtls_in(args).in_pattern_mask_len != args->varargs_buflen) { - lck_mtx_unlock(&dtls_ctl_mutex); - return EINVAL; - } - // utun_crypto_framer_args_dtls_in(args).in_pattern should not be all zeros - if (is_pattern_all_zeroes(&args->varargs_buf[0], - utun_crypto_framer_args_dtls_in(args).in_pattern_len)) { - lck_mtx_unlock(&dtls_ctl_mutex); - return EINVAL; - } - // when specified, utun_crypto_framer_args_dtls_in(args).in_pattern_mask should not be all zeros - if (utun_crypto_framer_args_dtls_in(args).in_pattern_mask_len && - is_pattern_all_zeroes(&args->varargs_buf[utun_crypto_framer_args_dtls_in(args).in_pattern_len], - 
utun_crypto_framer_args_dtls_in(args).in_pattern_mask_len)) { - lck_mtx_unlock(&dtls_ctl_mutex); - return EINVAL; - } - // utun_crypto_framer_args_dtls_in(args).in_pattern & utun_crypto_framer_args_dtls_in(args).in_pattern_mask should not be zeros - if (is_pattern_masked_all_zeroes(&args->varargs_buf[0], - &args->varargs_buf[utun_crypto_framer_args_dtls_in(args).in_pattern_len], - utun_crypto_framer_args_dtls_in(args).in_pattern_len)) { - lck_mtx_unlock(&dtls_ctl_mutex); - return EINVAL; - } - - if ((u_int32_t)utun_crypto_framer_args_dtls_in(args).in_pattern_len + (u_int32_t)utun_crypto_framer_args_dtls_in(args).in_pattern_mask_len != args->varargs_buflen) { - lck_mtx_unlock(&dtls_ctl_mutex); - return EINVAL; - } - - if (!(framer = utun_ctl_get_first_framer(crypto_ctx, args->inner_type))) { - // no framers - printf("%s: no framers configured\n", __FUNCTION__); - lck_mtx_unlock(&dtls_ctl_mutex); - return 0; - } else { - if ((rem_framer = utun_ctl_get_dtls_in_framer(framer, - &args->varargs_buf[0], - utun_crypto_framer_args_dtls_in(args).in_pattern_len, - &args->varargs_buf[utun_crypto_framer_args_dtls_in(args).in_pattern_len], - utun_crypto_framer_args_dtls_in(args).in_pattern_mask_len))) { - utun_ctl_clr_dtls_framer(rem_framer); - if (crypto_ctx->num_framers) crypto_ctx->num_framers--; - } else { - printf("%s: no matching ingress framer\n", __FUNCTION__); - } - lck_mtx_unlock(&dtls_ctl_mutex); - return 0; - } - } else { - framer = utun_ctl_get_first_framer(crypto_ctx, args->inner_type); - // overwrite the previous one - if (framer) { - if (framer->inner_type != args->inner_type || - (utun_crypto_framer_args_dtls_out(args).out_pattern_len && - utun_crypto_framer_state_dtls_out(framer).out_pattern_len != utun_crypto_framer_args_dtls_out(args).out_pattern_len) || - (utun_crypto_framer_args_dtls_out(args).out_pattern_len && - memcmp(utun_crypto_framer_state_dtls_out(framer).out_pattern, - &args->varargs_buf[0], - 
utun_crypto_framer_args_dtls_out(args).out_pattern_len))) { - printf("%s: no matching egress framer\n", __FUNCTION__); - lck_mtx_unlock(&dtls_ctl_mutex); - return EBADF; - } - utun_ctl_clr_dtls_framer(framer); - if (crypto_ctx->num_framers) crypto_ctx->num_framers--; - } - } - - lck_mtx_unlock(&dtls_ctl_mutex); - return 0; -} - -/* - * Summary: enables handling of data traffic - */ -void -utun_ctl_start_datatraffic_crypto_dtls(struct utun_pcb *pcb) -{ - utun_crypto_ctx_t *crypto_ctx; - - lck_mtx_lock(&dtls_ctl_mutex); - - //printf("%s: entering, flags %x, kpi-handle %d, kpi-ref %p, kpi-refcnt %d\n", __FUNCTION__, pcb->utun_flags, crypto_ctx->kpi_handle, crypto_ctx->kpi_ref, crypto_ctx->kpi_refcnt); - - crypto_ctx = &pcb->utun_crypto_ctx[UTUN_CRYPTO_DIR_TO_IDX(UTUN_CRYPTO_DIR_OUT)]; - - if (crypto_ctx->kpi_handle < 0) { - printf("%s: dtls disabled\n", __FUNCTION__); - lck_mtx_unlock(&dtls_ctl_mutex); - return; - } - - if (!crypto_ctx->kpi_ref) { - if (dtls_kpi_connect) { - crypto_ctx->kpi_ref = dtls_kpi_connect(crypto_ctx->kpi_handle, pcb); - if (!crypto_ctx->kpi_ref) { - printf("%s: ### dtls_kpi_connect failed\n", __FUNCTION__); - lck_mtx_unlock(&dtls_ctl_mutex); - return; - } - crypto_ctx->kpi_refcnt++; - } else { - printf("%s: ### dtls_kpi_connect unavailable\n", __FUNCTION__); - lck_mtx_unlock(&dtls_ctl_mutex); - return; - } - } else { - printf("%s: dtls already stitched\n", __FUNCTION__); - lck_mtx_unlock(&dtls_ctl_mutex); - return; - } - utun_ctl_restart_dtls_all_framers(crypto_ctx); // for dynamic egress hdrs - - //printf("%s: leaving, flags %x, kpi-handle %d, kpi-ref %p, kpi-refcnt %d\n", __FUNCTION__, pcb->utun_flags, crypto_ctx->kpi_handle, crypto_ctx->kpi_ref, crypto_ctx->kpi_refcnt); - lck_mtx_unlock(&dtls_ctl_mutex); - return; -} - -/* - * Summary: disables handling of data traffic - */ -void -utun_ctl_stop_datatraffic_crypto_dtls(struct utun_pcb *pcb) -{ - utun_crypto_ctx_t *crypto_ctx; - - lck_mtx_lock(&dtls_ctl_mutex); - - //printf("%s: entering, 
flags %x, kpi-ref %p, kpi-refcnt %d\n", __FUNCTION__, pcb->utun_flags, crypto_ctx->kpi_ref, crypto_ctx->kpi_refcnt); - - crypto_ctx = &pcb->utun_crypto_ctx[UTUN_CRYPTO_DIR_TO_IDX(UTUN_CRYPTO_DIR_OUT)]; - - if (crypto_ctx->kpi_ref) { - if (dtls_kpi_connect) { - (void)dtls_kpi_connect(crypto_ctx->kpi_handle, NULL); - if (--crypto_ctx->kpi_refcnt == 0) { - crypto_ctx->kpi_ref = (__typeof__(crypto_ctx->kpi_ref))NULL; - crypto_ctx->kpi_handle = UTUN_CRYPTO_DTLS_HANDLE_INVALID; - } else { - // printf("%s: ### dtls_kpi_refcnt %d not yet zero\n", - // __FUNCTION__, crypto_ctx->kpi_refcnt); - } - } else { - printf("%s: dtls_kpi_connect unavailable\n", __FUNCTION__); - lck_mtx_unlock(&dtls_ctl_mutex); - return; - } - } else { - printf("%s: dtls already not-stitched\n", __FUNCTION__); - lck_mtx_unlock(&dtls_ctl_mutex); - return; - } - lck_mtx_unlock(&dtls_ctl_mutex); - return; -} - -#define utun_pkt_dtls_prepend_proto(pkt, pf) do { \ - if (mbuf_prepend(pkt, sizeof(protocol_family_t), MBUF_DONTWAIT) != 0) { \ - printf("%s - ifnet_output prepend failed\n", __FUNCTION__); \ - lck_mtx_unlock(&dtls_ctl_mutex); \ - return EBADF; \ - } \ - *(protocol_family_t *)mbuf_data(*pkt) = pf; \ - } while(0); - -#define utun_pkt_dtls_puntup(pcb, pkt, errstr, rc) do { \ - *(protocol_family_t *)mbuf_data(*pkt) = htonl(*(protocol_family_t *)mbuf_data(*pkt)); \ - rc = ctl_enqueuembuf(pcb->utun_ctlref, pcb->utun_unit, *pkt, CTL_DATA_EOR); \ - if (rc != 0) { \ - printf("%s: - ctl_enqueuembuf failed (rc %d) for %s:\n", __FUNCTION__, rc, errstr); \ - mbuf_freem(*pkt); \ - ifnet_stat_increment_out(pcb->utun_ifp, 0, 0, 1); \ - lck_mtx_unlock(&dtls_ctl_mutex); \ - return 0; \ - } \ - *pkt = NULL; \ - } while(0); - -int -utun_pkt_dtls_output(struct utun_pcb *pcb, mbuf_t *pkt) -{ - errno_t rc = ENETUNREACH; - int len; - utun_crypto_ctx_t *crypto_ctx; - protocol_family_t proto; - - //printf("%s: entering, flags %x, ifp %p\n", __FUNCTION__, pcb->utun_flags, pcb->utun_ifp); - - if (!(pcb->utun_flags & 
UTUN_FLAGS_CRYPTO)) { - printf("%s - crypto disabled\n", __FUNCTION__); - return EINVAL; - } - - if (!pcb->utun_ifp) { - printf("%s - utun ifp cleared\n", __FUNCTION__); - return EINVAL; - } - - proto = *(mtod(*pkt, protocol_family_t *)); - - lck_mtx_lock(&dtls_ctl_mutex); - - len = mbuf_pkthdr_len(*pkt); - - crypto_ctx = &pcb->utun_crypto_ctx[UTUN_CRYPTO_DIR_TO_IDX(UTUN_CRYPTO_DIR_OUT)]; - - //printf("%s: entering, kpi-handle %d, kpi-ref %p, kpi-refcnt %d\n", __FUNCTION__, crypto_ctx->kpi_handle, crypto_ctx->kpi_ref, crypto_ctx->kpi_refcnt); - - if (dtls_kpi_send && (crypto_ctx->kpi_handle >= 0) && crypto_ctx->kpi_ref) { - m_adj(*pkt, sizeof(protocol_family_t)); - - if (!(rc = utun_pkt_dtls_output_frame_encapsulate(crypto_ctx, pkt, proto))) { - rc = dtls_kpi_send(crypto_ctx->kpi_ref, pkt); - if (rc) { - printf("%s: DTLS failed to send pkt %d\n", __FUNCTION__, rc); - // - // dtls_kpi_send (by way of so_inject_data_out) frees mbuf during certain error cases, - ifnet_stat_increment_out(pcb->utun_ifp, 0, 0, 1); // increment errors - lck_mtx_unlock(&dtls_ctl_mutex); - return 0; // and drop packet - } - } else if (rc == EINVAL) { - // unsupported proto... fall through and punt (but 1st undo the protocol strip) - utun_pkt_dtls_prepend_proto(pkt, proto); - utun_pkt_dtls_puntup(pcb, pkt, "unsupported proto", rc); - } else { - // mbuf_prepend failure... 
mbuf will be already freed - printf("%s: failed to encrypsulate and send pkt %d\n", __FUNCTION__,rc); - ifnet_stat_increment_out(pcb->utun_ifp, 0, 0, 1); // increment errors - lck_mtx_unlock(&dtls_ctl_mutex); - return 0; // and drop packet - } - } else { - utun_pkt_dtls_puntup(pcb, pkt, "slowpath", rc); - } - - if (!rc) - ifnet_stat_increment_out(pcb->utun_ifp, 1, len, 0); - - lck_mtx_unlock(&dtls_ctl_mutex); - return rc; -} - -int -utun_pkt_dtls_input(struct utun_pcb *pcb, mbuf_t *pkt, __unused protocol_family_t family) -{ - utun_crypto_ctx_t *crypto_ctx; - int striplen = 0; - - //printf("%s: got pkt %d\n", __FUNCTION__,family); - if (!(pcb->utun_flags & UTUN_FLAGS_CRYPTO)) { - printf("%s - crypto disabled\n", __FUNCTION__); - return EINVAL; - } - - if (!pcb->utun_ifp) { - printf("%s - utun ifp cleared\n", __FUNCTION__); - return EINVAL; - } - - lck_mtx_lock(&dtls_ctl_mutex); - - /* - * make sure that family matches what the UTUN was configured for (punt those that don't... along with all that fail to match the data pattern. 
- */ - crypto_ctx = &pcb->utun_crypto_ctx[UTUN_CRYPTO_DIR_TO_IDX(UTUN_CRYPTO_DIR_IN)]; - if (crypto_ctx->num_framers && - !utun_pkt_dtls_input_frame_is_data(crypto_ctx, pkt, AF_INET, &striplen) && - !utun_pkt_dtls_input_frame_is_data(crypto_ctx, pkt, AF_INET6, &striplen)) { - // control or unknown traffic, so punt up to the plugin - errno_t rc; - - utun_pkt_dtls_prepend_proto(pkt, family); - *(protocol_family_t *)mbuf_data(*pkt) = htonl(*(protocol_family_t *)mbuf_data(*pkt)); - rc = ctl_enqueuembuf(pcb->utun_ctlref, pcb->utun_unit, *pkt, CTL_DATA_EOR); - if (rc != 0) { - // drop packet - printf("%s: - ctl_enqueuembuf failed: %d\n", __FUNCTION__, rc); - mbuf_freem(*pkt); - lck_mtx_unlock(&dtls_ctl_mutex); - return rc; - } - printf("%s: - ctl_enqueuembuf punted a packet up to UTUN ctrl sock: %d\n", __FUNCTION__, rc); - ifnet_stat_increment_in(pcb->utun_ifp, 1, mbuf_pkthdr_len(*pkt), 0); - - *pkt = NULL; - lck_mtx_unlock(&dtls_ctl_mutex); - return 0; - } - if (striplen) { - //printf("%s: - about to strip tunneled hdr of len %d\n", __FUNCTION__, striplen); - m_adj(*pkt, striplen); - } - - utun_pkt_dtls_prepend_proto(pkt, family); - - ifnet_stat_increment_in(pcb->utun_ifp, 1, mbuf_pkthdr_len(*pkt), 0); - - (void)utun_pkt_input(pcb, *pkt); - lck_mtx_unlock(&dtls_ctl_mutex); - return 0; -} diff --git a/bsd/net/if_utun_crypto_dtls.h b/bsd/net/if_utun_crypto_dtls.h deleted file mode 100644 index f5de675f3..000000000 --- a/bsd/net/if_utun_crypto_dtls.h +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Copyright (c) 2012 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#ifndef _NET_IF_UTUN_CRYPTO_DTLS_H_ -#define _NET_IF_UTUN_CRYPTO_DTLS_H_ - -#define UTUN_CRYPTO_DTLS_HANDLE_INVALID -1 - -#ifdef KERNEL_PRIVATE - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define utun_cleanup_all_crypto_dtls(pcb) utun_ctl_disable_crypto_dtls(pcb) - -/* - * Summary: initializes global vars needed for any utun crypto based on dtls - */ -void -utun_ctl_init_crypto_dtls(void); - -errno_t -utun_ctl_register_dtls (utun_crypto_kpi_reg_t *reg); - -/* - * Summary: disables all crypto DTLS in one shot - */ -void -utun_cleanup_all_crypto_dtls (struct utun_pcb *pcb); - -/* - * Summary: enables dtls crypto info for the specified utun. dtls ref is passed into args. - */ -void -utun_ctl_enable_crypto_dtls(struct utun_pcb *pcb, utun_crypto_args_t *args); - -/* - * Summary: disables ipsec crypto info for the specified utun. 
- */ -void -utun_ctl_disable_crypto_dtls(struct utun_pcb *pcb); - -int -utun_ctl_config_crypto_dtls_framer(utun_crypto_ctx_t *crypto_ctx, utun_crypto_framer_args_t *args); - -int -utun_ctl_unconfig_crypto_dtls_framer(utun_crypto_ctx_t *crypto_ctx, utun_crypto_framer_args_t *args); - -/* - * Summary: enables handling of data traffic - */ -void -utun_ctl_start_datatraffic_crypto_dtls(struct utun_pcb *pcb); - -/* - * Summary: disables handling of data traffic - */ -void -utun_ctl_stop_datatraffic_crypto_dtls(struct utun_pcb *pcb); - -int -utun_pkt_dtls_output(struct utun_pcb *pcb, mbuf_t *pkt); - -int -utun_pkt_dtls_input(struct utun_pcb *pcb, mbuf_t *pkt, protocol_family_t family); - -static inline protocol_family_t -utun_crypto_framer_inner_type_to_protocol_family (utun_crypto_framer_inner_type_t type) -{ - if (type == UTUN_CRYPTO_INNER_TYPE_IPv4) { - return PF_INET; - } else { - return PF_INET6; - } -} - -static inline utun_crypto_framer_inner_type_t -utun_crypto_framer_protocol_family_to_inner_type (protocol_family_t family) -{ - if (family == PF_INET) { - return UTUN_CRYPTO_INNER_TYPE_IPv4; - } else { - return UTUN_CRYPTO_INNER_TYPE_IPv6; - } -} - -#endif // KERNEL_PRIVATE - -#endif // _NET_IF_UTUN_CRYPTO_DTLS_H_ diff --git a/bsd/net/if_utun_crypto_ipsec.c b/bsd/net/if_utun_crypto_ipsec.c deleted file mode 100644 index df1c4711c..000000000 --- a/bsd/net/if_utun_crypto_ipsec.c +++ /dev/null @@ -1,1073 +0,0 @@ -/* - * Copyright (c) 2011-2013 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#if IPSEC - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -extern lck_mtx_t *sadb_mutex; -extern int esp_udp_encap_port; // udp encap listening port -extern int ipsec_policy_count; -extern int ipsec_bypass; -extern int natt_keepalive_interval; - -static int utun_punt_rx_keepalive = 0; // optional global control - -extern errno_t utun_pkt_input (struct utun_pcb *pcb, mbuf_t m); - -static u_int8_t -utun_ipsec_mode_to_sadb_mode (if_utun_crypto_ipsec_mode_t mode) -{ - switch (mode) { - case IF_UTUN_CRYPTO_IPSEC_MODE_TRANSPORT: - return IPSEC_MODE_TRANSPORT; - case IF_UTUN_CRYPTO_IPSEC_MODE_TUNNEL: - return IPSEC_MODE_TUNNEL; - default: - return 0; - } -} - -static u_int16_t -utun_ipsec_proto_to_sadb_proto (if_utun_crypto_ipsec_proto_t proto) -{ - switch (proto) { - case IF_UTUN_CRYPTO_IPSEC_PROTO_ESP: - return IPPROTO_ESP; - case IF_UTUN_CRYPTO_IPSEC_PROTO_AH: - 
return IPPROTO_AH; - default: - return 0; - } -} - -static u_int8_t -utun_ipsec_proto_to_sadb_satype (if_utun_crypto_ipsec_proto_t proto) -{ - switch (proto) { - case IF_UTUN_CRYPTO_IPSEC_PROTO_ESP: - return SADB_SATYPE_ESP; - case IF_UTUN_CRYPTO_IPSEC_PROTO_AH: - return SADB_SATYPE_AH; - default: - return 0; - } -} - -static u_int8_t -utun_ipsec_auth_to_sadb_aalg (if_utun_crypto_ipsec_auth_t auth) -{ - switch (auth) { - case IF_UTUN_CRYPTO_IPSEC_AUTH_MD5: - return SADB_AALG_MD5HMAC; - case IF_UTUN_CRYPTO_IPSEC_AUTH_SHA1: - return SADB_AALG_SHA1HMAC; - case IF_UTUN_CRYPTO_IPSEC_AUTH_SHA256: - return SADB_X_AALG_SHA2_256; - case IF_UTUN_CRYPTO_IPSEC_AUTH_SHA384: - return SADB_X_AALG_SHA2_384; - case IF_UTUN_CRYPTO_IPSEC_AUTH_SHA512: - return SADB_X_AALG_SHA2_512; - default: - return 0; - } -} - -static u_int8_t -utun_ipsec_enc_to_sadb_ealg (if_utun_crypto_ipsec_enc_t enc) -{ - switch (enc) { - case IF_UTUN_CRYPTO_IPSEC_ENC_DES: - return SADB_EALG_DESCBC; - case IF_UTUN_CRYPTO_IPSEC_ENC_3DES: - return SADB_EALG_3DESCBC; - case IF_UTUN_CRYPTO_IPSEC_ENC_AES128: - case IF_UTUN_CRYPTO_IPSEC_ENC_AES256: - return SADB_X_EALG_AESCBC; - default: - return 0; - } -} - -static u_int32_t -utun_ipsec_keepalive_and_nat_info_to_sadb_flags (if_utun_crypto_ipsec_keepalive_t keepalive, - int punt_rx_keepalive, - if_utun_crypto_ipsec_natd_t natd, - u_int16_t natt_port) -{ - u_int32_t flags = 0; - - if (natt_port && natt_port != 500) { - flags |= SADB_X_EXT_NATT; - - switch (keepalive) { - case IF_UTUN_CRYPTO_IPSEC_KEEPALIVE_NATT: - flags |= SADB_X_EXT_NATT_KEEPALIVE; // normal keepalive packet - break; - case IF_UTUN_CRYPTO_IPSEC_KEEPALIVE_ESP: - flags |= (SADB_X_EXT_ESP_KEEPALIVE | SADB_X_EXT_PUNT_RX_KEEPALIVE); // use an EMPTY ESP as a keepalive - break; - default: - break; - } - - switch (natd) { - case IF_UTUN_CRYPTO_IPSEC_NATD_PEER: - flags |= SADB_X_EXT_NATT_DETECTED_PEER; - break; - default: - break; - } - } - - if (punt_rx_keepalive) { - flags |= SADB_X_EXT_PUNT_RX_KEEPALIVE; - 
} - - return flags; -} - -static errno_t -utun_ipsec_set_sah (struct secashead **sah, - u_int8_t dir, - u_int16_t proto, - u_int8_t mode, - u_int32_t reqid, - struct sockaddr_storage *src_addr, - struct sockaddr_storage *dst_addr) -{ - struct secasindex saidx; - - // currently only support tunnel mode and ESP - if (proto != IPPROTO_ESP || - mode != IPSEC_MODE_TUNNEL) { - return EINVAL; - } - if ((((struct sockaddr *)src_addr)->sa_family != AF_INET && - ((struct sockaddr *)src_addr)->sa_family != AF_INET6) || - (((struct sockaddr *)dst_addr)->sa_family != AF_INET && - ((struct sockaddr *)dst_addr)->sa_family != AF_INET6)) { - return EINVAL; - } - - bzero(&saidx, sizeof(saidx)); - saidx.proto = proto; - saidx.mode = mode; - saidx.reqid = reqid; - bcopy(src_addr, &saidx.src, sizeof(saidx.src)); - bcopy(dst_addr, &saidx.dst, sizeof(saidx.dst)); - - lck_mtx_lock(sadb_mutex); - // TODO: add sah and policy (collision) check and prevention. ensure that there is no conflicting policy. - // TDDO: ensure that key_spdaddxxx doesn't add a policy that's conflicting with any of our sahs. 
- *sah = key_newsah2(&saidx, dir); - lck_mtx_unlock(sadb_mutex); - return 0; -} - -static int -utun_ipsec_clr_sahs (struct secashead **sah) -{ - struct secasvar *sav; - struct secasvar *nextsav; - u_int state; - - lck_mtx_lock(sadb_mutex); - for (state = 0; state < SADB_SASTATE_MAX; state++) { - for (sav = LIST_FIRST(&(*sah)->savtree[state]); - sav != NULL; - sav = nextsav) { - nextsav = LIST_NEXT(sav, chain); - if (sav->state == SADB_SASTATE_LARVAL || - sav->state == SADB_SASTATE_DEAD) { - continue; - } - - if (sav->utun_pcb) { - sav->utun_pcb = NULL; - sav->utun_is_keepalive_fn = NULL; - sav->utun_in_fn = NULL; - sav->refcnt--; // unlinked from pcb - } else { - printf("%s: SAV inconsistency\n", __FUNCTION__); - } - - key_sa_chgstate(sav, SADB_SASTATE_DEAD); - key_freesav(sav, KEY_SADB_LOCKED); - } - } - - // clear the rest of the SAs - key_delsah(*sah); - lck_mtx_unlock(sadb_mutex); - return 0; -} - -static void -utun_ipsec_set_udp_encap_listen_port (utun_crypto_dir_t dir, - u_int16_t natt_port) -{ - if (dir == UTUN_CRYPTO_DIR_IN) { - if (natt_port && natt_port != 500) { - esp_udp_encap_port = natt_port; - } - } -} - -static void -utun_set_lifetime (struct sadb_lifetime *lfh, - int type, - u_int64_t l_time) -{ - lfh->sadb_lifetime_len = (sizeof(*lfh) >> 3); // convert to words - lfh->sadb_lifetime_exttype = type; - lfh->sadb_lifetime_allocations = 0; - lfh->sadb_lifetime_bytes = 0; - lfh->sadb_lifetime_addtime = l_time; - lfh->sadb_lifetime_usetime = l_time; -} - -static struct sadb_key * -utun_ipsec_set_keybuf (u_int16_t type, - u_int8_t *key, - u_int16_t key_len) -{ - struct sadb_key *new; - int len = sizeof(*new) + BITSTOBYTES(key_len); - - lck_mtx_lock(sadb_mutex); - new = utun_alloc(len); - if (new == NULL) { - return NULL; - } - lck_mtx_unlock(sadb_mutex); - bzero(new, len); - new->sadb_key_len = BITSTOBYTES(key_len); - new->sadb_key_exttype = type; - new->sadb_key_bits = key_len; - bcopy(key, &new[1], new->sadb_key_len); - return new; -} - -static errno_t 
-utun_ipsec_alloc_sav (struct secashead *sah, - struct secasvar **sav, - struct utun_pcb *pcb, - u_int8_t satype, - u_int8_t alg_auth, - u_int8_t alg_enc, - u_int32_t flags, - u_int8_t replay, - u_int8_t *key_auth, - u_int16_t key_auth_len, - u_int8_t *key_enc, - u_int16_t key_enc_len, - u_int16_t natt_port, - u_int32_t seq, - u_int32_t spi, - u_int32_t pid, - u_int64_t lifetime_hard, - u_int64_t lifetime_soft) -{ - struct sadb_key *keye, *keya; - struct sadb_lifetime lfh, lfs; - - if (*sav) { - return EINVAL; - } - - bzero(&lfh, sizeof(lfh)); - utun_set_lifetime(&lfh, SADB_EXT_LIFETIME_HARD, lifetime_hard); - bzero(&lfs, sizeof(lfs)); - utun_set_lifetime(&lfs, SADB_EXT_LIFETIME_SOFT, lifetime_soft); - - if ((keya = utun_ipsec_set_keybuf(SADB_EXT_KEY_AUTH, key_auth, key_auth_len)) == NULL) { - return ENOBUFS; - } - if ((keye = utun_ipsec_set_keybuf(SADB_EXT_KEY_ENCRYPT, key_enc, key_enc_len)) == NULL) { - utun_free(keya); - return ENOBUFS; - } - - lck_mtx_lock(sadb_mutex); - if ((*sav = key_newsav2(sah, - satype, - alg_auth, - alg_enc, - flags, - replay, - keya, - key_auth_len, - keye, - key_enc_len, - natt_port, - seq, - spi, - pid, - &lfh, - &lfs)) == NULL) { - lck_mtx_unlock(sadb_mutex); - utun_free(keya); - utun_free(keye); - return ENOBUFS; - } - (*sav)->utun_pcb = (__typeof__((*sav)->utun_pcb))pcb; - (*sav)->utun_is_keepalive_fn = (__typeof__((*sav)->utun_is_keepalive_fn))utun_pkt_is_ipsec_keepalive; - (*sav)->utun_in_fn = (__typeof__((*sav)->utun_in_fn))utun_pkt_ipsec_input; - (*sav)->refcnt++; // for the pcb - lck_mtx_unlock(sadb_mutex); - utun_free(keya); - utun_free(keye); - return 0; -} - -static int -utun_ipsec_free_sav (struct secasvar **sav) -{ - lck_mtx_lock(sadb_mutex); - if ((*sav)->utun_pcb) { - (*sav)->utun_pcb = NULL; - (*sav)->utun_is_keepalive_fn = NULL; - (*sav)->utun_in_fn = NULL; - } - (*sav)->refcnt--; // unlinked from pcb - key_sa_chgstate(*sav, SADB_SASTATE_DEAD); - key_freesav(*sav, KEY_SADB_LOCKED); - lck_mtx_unlock(sadb_mutex); - *sav 
= NULL; - return 0; -} - -static int -utun_ipsec_num_savs (struct secashead **sah) -{ - struct secasvar *sav; - struct secasvar *nextsav; - u_int state; - int n = 0; - - lck_mtx_lock(sadb_mutex); - for (state = 0; state < SADB_SASTATE_MAX; state++) { - for (sav = LIST_FIRST(&(*sah)->savtree[state]); - sav != NULL; - sav = nextsav) { - nextsav = LIST_NEXT(sav, chain); - if (sav->state == SADB_SASTATE_LARVAL || - sav->state == SADB_SASTATE_DYING || - sav->state == SADB_SASTATE_DEAD) { - continue; - } - - if (sav->utun_pcb) { - n++; - } else { - printf("%s: SAV inconsistency\n", __FUNCTION__); - } - } - } - lck_mtx_unlock(sadb_mutex); - - return n; -} - -static errno_t -utun_ctl_config_crypto_keys_ipsec_v1 (struct utun_pcb *pcb, - utun_crypto_keys_args_t *args, - utun_crypto_keys_t *crypto_keys) -{ - utun_crypto_keys_ipsec_args_v1_t *args_ipsec_v1 = &args->u.ipsec_v1; - u_int8_t *varargs_buf = UTUN_CRYPTO_KEYS_ARGS_VARARGS_BUF(args); - errno_t err; - struct secashead *sah; - u_int16_t proto; - u_int8_t mode; - u_int8_t satype, aalg, ealg; - u_int32_t flags; - - if (args_ipsec_v1->key_auth_len > MAX_KEY_AUTH_LEN_BITS) { - printf("%s: invalid auth key len %d, max %d\n", __FUNCTION__, - args_ipsec_v1->key_auth_len, MAX_KEY_AUTH_LEN_BITS); - return EINVAL; - } - if (args_ipsec_v1->key_enc_len > MAX_KEY_ENC_LEN_BITS) { - printf("%s: invalid enc key len %d, max %d\n", __FUNCTION__, - args_ipsec_v1->key_enc_len, MAX_KEY_ENC_LEN_BITS); - return EINVAL; - } - if (args->varargs_buflen != (__typeof__(args->varargs_buflen))((BITSTOBYTES(args_ipsec_v1->key_auth_len) + - BITSTOBYTES(args_ipsec_v1->key_enc_len)))) { - printf("%s: len check failed (%d,%d, %d)\n", __FUNCTION__, - args->varargs_buflen, args_ipsec_v1->key_auth_len, args_ipsec_v1->key_enc_len); - return EINVAL; - } - sah = IF_UTUN_GET_CRYPTO_KEYS_IPSEC_SAH(crypto_keys); - if (!sah) { - // TODO: make sure we pass through this once - proto = utun_ipsec_proto_to_sadb_proto(args_ipsec_v1->proto); - mode = 
utun_ipsec_mode_to_sadb_mode(args_ipsec_v1->mode); - - if ((err = utun_ipsec_set_sah(&IF_UTUN_GET_CRYPTO_KEYS_IPSEC_SAH(crypto_keys), - UTUN_CRYPTO_DIR_TO_IPSEC_DIR(args->dir), - proto, - mode, - args_ipsec_v1->reqid, - &args_ipsec_v1->src_addr, - &args_ipsec_v1->dst_addr))) { - return err; - } - sah = IF_UTUN_GET_CRYPTO_KEYS_IPSEC_SAH(crypto_keys); - if (!sah) { - return EBADF; - } - } - - satype = utun_ipsec_proto_to_sadb_satype(args_ipsec_v1->proto); - aalg = utun_ipsec_auth_to_sadb_aalg(args_ipsec_v1->alg_auth); - ealg = utun_ipsec_enc_to_sadb_ealg(args_ipsec_v1->alg_enc); - flags = utun_ipsec_keepalive_and_nat_info_to_sadb_flags(args_ipsec_v1->keepalive, - args_ipsec_v1->punt_rx_keepalive, - args_ipsec_v1->natd, - args_ipsec_v1->natt_port); - - if ((err = utun_ipsec_alloc_sav(sah, - &IF_UTUN_GET_CRYPTO_KEYS_IPSEC_SAV(crypto_keys), - pcb, - satype, - aalg, - ealg, - flags, - args_ipsec_v1->replay, - varargs_buf, - args_ipsec_v1->key_auth_len, - (varargs_buf + BITSTOBYTES(args_ipsec_v1->key_auth_len)), - args_ipsec_v1->key_enc_len, - args_ipsec_v1->natt_port, - args_ipsec_v1->seq, - args_ipsec_v1->spi, - args_ipsec_v1->pid, - args_ipsec_v1->lifetime_hard, - args_ipsec_v1->lifetime_soft))) { - return err; - } - crypto_keys->state.u.ipsec.proto = sah->saidx.proto; - crypto_keys->state.u.ipsec.mode = sah->saidx.mode; - if (((struct sockaddr *)&sah->saidx.src)->sa_family == AF_INET) { - crypto_keys->state.u.ipsec.ifamily = IPPROTO_IPV4; - } else { - crypto_keys->state.u.ipsec.ifamily = IPPROTO_IPV6; - } - crypto_keys->state.u.ipsec.spi = args_ipsec_v1->spi; - utun_ipsec_set_udp_encap_listen_port(args->dir, args_ipsec_v1->natt_port); - return 0; -} - -static errno_t -utun_ctl_unconfig_crypto_keys_ipsec_v1 (utun_crypto_keys_t *crypto_keys) -{ - if (!IF_UTUN_GET_CRYPTO_KEYS_IPSEC_SAH(crypto_keys)) { - return EBADF; - } - if (!IF_UTUN_GET_CRYPTO_KEYS_IPSEC_SAV(crypto_keys)) { - return EBADF; - } - if (utun_ipsec_free_sav(&IF_UTUN_GET_CRYPTO_KEYS_IPSEC_SAV(crypto_keys))) 
{ - return EADDRNOTAVAIL; - } - if (!utun_ipsec_num_savs(&IF_UTUN_GET_CRYPTO_KEYS_IPSEC_SAH(crypto_keys))) { - (void)utun_ipsec_clr_sahs(&IF_UTUN_GET_CRYPTO_KEYS_IPSEC_SAH(crypto_keys)); - - // release sah - IF_UTUN_GET_CRYPTO_KEYS_IPSEC_SAH(crypto_keys) = NULL; - } - - return 0; -} - -static void -utun_set_spirange (struct sadb_spirange *spirange, - u_int32_t spirange_min, - u_int32_t spirange_max) -{ - spirange->sadb_spirange_min = spirange_min; - spirange->sadb_spirange_max = spirange_max; -} - -static u_int32_t -utun_ipsec_get_spi (struct sockaddr_storage *src_addr, - struct sockaddr_storage *dst_addr, - u_int16_t proto, - u_int8_t mode, - u_int32_t reqid, - u_int32_t spirange_min, - u_int32_t spirange_max) -{ - struct sadb_spirange spirange; - utun_set_spirange(&spirange, spirange_min, spirange_max); - // TODO: should this allocate an SAH? - return key_getspi2((struct sockaddr *)src_addr, - (struct sockaddr *)dst_addr, - proto, - mode, - reqid, - &spirange); -} - -static errno_t -utun_ctl_generate_crypto_keys_idx_ipsec_v1 (utun_crypto_keys_idx_args_t *args) -{ - utun_crypto_keys_idx_ipsec_args_v1_t *args_ipsec_v1 = &args->u.ipsec_v1; - u_int16_t proto; - u_int8_t mode; - - proto = utun_ipsec_proto_to_sadb_proto(args_ipsec_v1->proto); - mode = utun_ipsec_mode_to_sadb_mode(args_ipsec_v1->mode); - - args_ipsec_v1->spi = 0; - if ((args_ipsec_v1->spi = utun_ipsec_get_spi(&args_ipsec_v1->src_addr, - &args_ipsec_v1->dst_addr, - proto, - mode, - args_ipsec_v1->reqid, - args_ipsec_v1->spirange_min, - args_ipsec_v1->spirange_max)) == 0) { - return ENOBUFS; - } - return 0; -} - -void -utun_cleanup_all_crypto_ipsec (struct utun_pcb *pcb) -{ - int idx; - utun_crypto_ctx_t *crypto_ctx; - utun_crypto_keys_t *cur_crypto_keys, *nxt_crypto_keys; - - for (idx = 0; idx < UTUN_CRYPTO_DIR_TO_IDX(UTUN_CRYPTO_DIR_MAX); idx++) { - crypto_ctx = &pcb->utun_crypto_ctx[idx]; - if (!crypto_ctx->valid || - crypto_ctx->type != UTUN_CRYPTO_TYPE_IPSEC) { - continue; - } - - // flush all crypto 
materials - for (cur_crypto_keys = (__typeof__(cur_crypto_keys))LIST_FIRST(&crypto_ctx->keys_listhead); - cur_crypto_keys != NULL; - cur_crypto_keys = nxt_crypto_keys) { - nxt_crypto_keys = (__typeof__(nxt_crypto_keys))LIST_NEXT(cur_crypto_keys, chain); - - if (!cur_crypto_keys->valid) { - continue; - } - - if (IF_UTUN_GET_CRYPTO_KEYS_IPSEC_SAV(cur_crypto_keys)) { - (void)utun_ipsec_free_sav(&IF_UTUN_GET_CRYPTO_KEYS_IPSEC_SAV(cur_crypto_keys)); - } - - if (IF_UTUN_GET_CRYPTO_KEYS_IPSEC_SAH(cur_crypto_keys)) { - (void)utun_ipsec_clr_sahs(&IF_UTUN_GET_CRYPTO_KEYS_IPSEC_SAH(cur_crypto_keys)); - } - - LIST_REMOVE(cur_crypto_keys, chain); - bzero(cur_crypto_keys, sizeof(*cur_crypto_keys)); - utun_free(cur_crypto_keys); - } - - bzero(crypto_ctx, sizeof(*crypto_ctx)); - } -} - -static errno_t -utun_ctl_enable_crypto_ipsec_v1 (__unused utun_crypto_args_t *args) -{ - return 0; -} - -/* - * Summary: enables ipsec crypto info for the specified utun. - */ -void -utun_ctl_enable_crypto_ipsec(__unused struct utun_pcb *pcb, - utun_crypto_args_t *args) -{ - lck_mtx_lock(sadb_mutex); - /* Turn off the ipsec bypass, if already on */ - if (ipsec_bypass) { - ipsec_bypass = 0; - } - if (args->ver == UTUN_CRYPTO_KEYS_IPSEC_VER_1) { - (void)utun_ctl_enable_crypto_ipsec_v1(args); - } - lck_mtx_unlock(sadb_mutex); -} - -/* - * Summary: disables ipsec crypto info for the specified utun. 
- */ -void -utun_ctl_disable_crypto_ipsec(__unused struct utun_pcb *pcb) -{ - utun_cleanup_all_crypto_ipsec(pcb); - lck_mtx_lock(sadb_mutex); - /* Turn on the ipsec bypass, if there are no other policies */ - if (!ipsec_policy_count && !ipsec_bypass) // TODO: ipsec_policy_count may be 1 by default - ipsec_bypass = 1; - utun_punt_rx_keepalive = 0; - lck_mtx_unlock(sadb_mutex); -} - -errno_t -utun_ctl_config_crypto_keys_ipsec (struct utun_pcb *pcb, - utun_crypto_keys_args_t *args, - utun_crypto_keys_t *crypto_keys) -{ - if (args->ver == UTUN_CRYPTO_KEYS_IPSEC_VER_1) { - return(utun_ctl_config_crypto_keys_ipsec_v1(pcb, args, crypto_keys)); - } else { - printf("%s: ver unsupported (%d, %d)\n", __FUNCTION__, args->ver, UTUN_CRYPTO_KEYS_IPSEC_VER_1); - return EINVAL; - } -} - -errno_t -utun_ctl_unconfig_crypto_keys_ipsec (utun_crypto_keys_args_t *args, - utun_crypto_keys_t *crypto_keys) -{ - if (args->ver == UTUN_CRYPTO_KEYS_IPSEC_VER_1) { - return(utun_ctl_unconfig_crypto_keys_ipsec_v1(crypto_keys)); - } else { - printf("%s: ver unsupported (%d, %d)\n", __FUNCTION__, args->ver, UTUN_CRYPTO_KEYS_IPSEC_VER_1); - return EINVAL; - } -} - -errno_t -utun_ctl_generate_crypto_keys_idx_ipsec (utun_crypto_keys_idx_args_t *args) -{ - if (args->ver == UTUN_CRYPTO_KEYS_IPSEC_VER_1) { - return(utun_ctl_generate_crypto_keys_idx_ipsec_v1(args)); - } else { - printf("%s: ver unsupported (%d, %d)\n", __FUNCTION__, args->ver, UTUN_CRYPTO_KEYS_IPSEC_VER_1); - return EINVAL; - } -} - -int -utun_pkt_ipsec_output (struct utun_pcb *pcb, mbuf_t *pkt) -{ - utun_crypto_keys_t *crypto_keys = IF_UTUN_GET_TX_CRYPTO_KEYS(pcb); - struct secasvar *sav; - protocol_family_t proto; - mbuf_t new; - int err; - struct route *ro = NULL; - struct route ro_copy; - struct ip_out_args ipoa = - { IFSCOPE_NONE, { 0 }, IPOAF_SELECT_SRCIF, 0 }; - - if (crypto_keys && - crypto_keys->state.u.ipsec.proto == IPPROTO_ESP && - (sav = IF_UTUN_GET_CRYPTO_KEYS_IPSEC_SAV(crypto_keys)) && - sav->state == SADB_SASTATE_MATURE) { 
- // TODO: update stats to increment outgoing packets - // TODO: allow empty packets thru - - proto = *(mtod(*pkt, protocol_family_t *)); - m_adj(*pkt, sizeof(protocol_family_t)); - - bzero(&ro_copy, sizeof(ro_copy)); - - if ((proto == AF_UTUN || proto == AF_INET) && crypto_keys->state.u.ipsec.ifamily == IPPROTO_IPV4) { - struct ip *ip; - struct sockaddr_in *dst4; - - if (proto == AF_INET) { - if ((*pkt)->m_len < (__typeof__((*pkt)->m_len))sizeof(*ip)) { - if (!(*pkt = m_pullup(*pkt, sizeof(*ip)))) { - printf("%s: m_pullup failed\n", __FUNCTION__); - return 0; - } - } - - // split the mbuf chain to put the ip header and payloads in separate mbufs - new = ipsec4_splithdr(*pkt); - if (!new) { - printf("%s: ipsec4_splithdr(1) failed\n", __FUNCTION__); - ROUTE_RELEASE(&ro_copy); - *pkt = NULL; - return 0; - } - *pkt = new; - - // encapsulate with the outer header - if ((err = ipsec4_encapsulate(new, sav))) { - printf("%s: ipsec4_encapsulate failed (%d)\n", __FUNCTION__, err); - *pkt = NULL; - return 0; - } - - } else { - // otherwise it's AF_UTUN which will be a keepalive packet to be encapsulated, encrypted and sent - // encapsulate with the outer header - if ((err = ipsec4_encapsulate_utun_esp_keepalive(pkt, sav))) { - printf("%s: ipsec4_encapsulate failed (%d)\n", __FUNCTION__, err); - return 0; - } - new = *pkt; - } - - ip = mtod(new, __typeof__(ip)); - // grab sadb_mutex, to update sah's route cache and get a local copy of it - lck_mtx_lock(sadb_mutex); - ro = &sav->sah->sa_route; - dst4 = (struct sockaddr_in *)(void *)&ro->ro_dst; - if (ro->ro_rt) { - RT_LOCK(ro->ro_rt); - } - if (ROUTE_UNUSABLE(ro) || - dst4->sin_addr.s_addr != ip->ip_dst.s_addr) { - if (ro->ro_rt != NULL) - RT_UNLOCK(ro->ro_rt); - ROUTE_RELEASE(ro); - } - if (ro->ro_rt == NULL) { - dst4->sin_family = AF_INET; - dst4->sin_len = sizeof(*dst4); - dst4->sin_addr = ip->ip_dst; - rtalloc(ro); - if (ro->ro_rt) { - RT_LOCK(ro->ro_rt); - } else { - printf("%s: rtalloc(1) failed\n", __FUNCTION__); - 
mbuf_freem(new); - *pkt = NULL; - return 0; - } - } - if (ro->ro_rt->rt_flags & RTF_GATEWAY) { - dst4 = (struct sockaddr_in *)(void *)ro->ro_rt->rt_gateway; - } - RT_UNLOCK(ro->ro_rt); - route_copyout(&ro_copy, ro, sizeof(ro_copy)); - // release sadb_mutex, after updating sah's route cache and getting a local copy - lck_mtx_unlock(sadb_mutex); - - // split the mbuf chain to put the ip header and payloads in separate mbufs - new = ipsec4_splithdr(*pkt); - if (!new) { - printf("%s: ipsec4_splithdr(2) failed\n", __FUNCTION__); - ROUTE_RELEASE(&ro_copy); - *pkt = NULL; - return 0; - } - *pkt = new; - - if ((err = esp4_output(new, sav))) { - printf("%s: esp4_output failed (%d)\n", __FUNCTION__, err); - ROUTE_RELEASE(&ro_copy); - *pkt = NULL; - return 0; // drop - } - - ip = mtod(new, __typeof__(ip)); - ip->ip_len = ntohs(ip->ip_len); /* flip len field before calling ip_output */ - } else if ((proto == AF_UTUN || proto == AF_INET6) && crypto_keys->state.u.ipsec.ifamily == IPPROTO_IPV6) { - int plen; - struct ip6_hdr *ip6; - struct sockaddr_in6 *dst6; - - if (proto == AF_INET6) { - // split the mbuf chain to put the ip header and payloads in separate mbufs - new = ipsec6_splithdr(*pkt); - if (!new) { - printf("%s: ipsec6_splithdr(1) failed\n", __FUNCTION__); - ROUTE_RELEASE(&ro_copy); - *pkt = NULL; - return 0; - } - *pkt = new; - - // encapsulate with the outer header - if ((err = ipsec6_encapsulate(new, sav))) { - printf("%s: ipsec6_encapsulate failed (%d)\n", __FUNCTION__, err); - *pkt = NULL; - return 0; - } - - } else { - // otherwise it's AF_UTUN which will be a keepalive packet to be encapsulated, encrypted and sent - // encapsulate with the outer header - if ((err = ipsec6_encapsulate_utun_esp_keepalive(pkt, sav))) { - printf("%s: ipsec6_encapsulate failed (%d)\n", __FUNCTION__, err); - return 0; - } - new = *pkt; - } - - ip6 = mtod(new, __typeof__(ip6)); - // grab sadb_mutex, before updating sah's route cache - lck_mtx_lock(sadb_mutex); - ro = 
&sav->sah->sa_route; - dst6 = (struct sockaddr_in6 *)(void *)&ro->ro_dst; - if (ro->ro_rt) { - RT_LOCK(ro->ro_rt); - } - if (ROUTE_UNUSABLE(ro) || - !IN6_ARE_ADDR_EQUAL(&dst6->sin6_addr, &ip6->ip6_dst)) { - if (ro->ro_rt != NULL) - RT_UNLOCK(ro->ro_rt); - ROUTE_RELEASE(ro); - } - if (ro->ro_rt == NULL) { - bzero(dst6, sizeof(*dst6)); - dst6->sin6_family = AF_INET6; - dst6->sin6_len = sizeof(*dst6); - dst6->sin6_addr = ip6->ip6_dst; - rtalloc(ro); - if (ro->ro_rt) { - RT_LOCK(ro->ro_rt); - } else { - printf("%s: rtalloc(2) failed\n", __FUNCTION__); - mbuf_freem(new); - *pkt = NULL; - return 0; - } - } - if (ro->ro_rt->rt_flags & RTF_GATEWAY) { - dst6 = (struct sockaddr_in6 *)(void *)ro->ro_rt->rt_gateway; - } - RT_UNLOCK(ro->ro_rt); - route_copyout(&ro_copy, ro, sizeof(ro_copy)); - // release sadb_mutex, after updating sah's route cache and getting a local copy - lck_mtx_unlock(sadb_mutex); - - // split the mbuf chain to put the ip header and payloads in separate mbufs - new = ipsec6_splithdr(*pkt); - if (!new) { - printf("%s: ipsec6_splithdr failed\n", __FUNCTION__); - ROUTE_RELEASE(&ro_copy); - *pkt = NULL; - return 0; - } - *pkt = new; - - if ((err = esp6_output(new, mtod(new, u_char *), new->m_next, sav))) { - printf("%s: esp6_output failed (%d)\n", __FUNCTION__, err); - ROUTE_RELEASE(&ro_copy); - *pkt = NULL; - return 0; // drop - } - - plen = new->m_pkthdr.len - sizeof(struct ip6_hdr); - if (plen > IPV6_MAXPACKET) { - printf("%s: esp6_output failed due to invalid len (%d)\n", __FUNCTION__, plen); - ROUTE_RELEASE(&ro_copy); - mbuf_freem(new); - *pkt = NULL; - return 0; - } - ip6 = mtod(new, __typeof__(ip6)); - ip6->ip6_plen = ntohs(ip6->ip6_plen); /* flip len field before calling ip_output */ - } else { - printf("%s: packet's proto (%d) mismatched the context's proto (%d)\n", __FUNCTION__, - proto, crypto_keys->state.u.ipsec.ifamily); - mbuf_freem(*pkt); - *pkt = NULL; - return 0; - } - - if (pcb->utun_ifp) { - ifnet_stat_increment_out(pcb->utun_ifp, 1, 
mbuf_pkthdr_len(new), 0); - } - - if ((err = ip_output(new, NULL, &ro_copy, - (IP_OUTARGS | IP_NOIPSEC), NULL, &ipoa))) { - printf("%s: ip_output failed (%d)\n", __FUNCTION__, err); - } - lck_mtx_lock(sadb_mutex); - route_copyin(&ro_copy, ro, sizeof(*ro)); - lck_mtx_unlock(sadb_mutex); - return 0; - } else { - printf("%s: no suitable crypto-mat\n", __FUNCTION__); - } - return -1; -} - -// returns 0 if false, 1 if true, and -1 if there was a failure -int -utun_pkt_is_ipsec_keepalive (struct utun_pcb *pcb, mbuf_t *pkt, u_int16_t nxt, u_int32_t flags, size_t offs) -{ - int result; - u_int8_t *data; - int size_diff; - - if (!pcb->utun_ctlref) { - printf("%s - utun ctlref cleared\n", __FUNCTION__); - return 0; - } - - if (!(pcb->utun_flags & UTUN_FLAGS_CRYPTO)) { - printf("%s - crypto disabled\n", __FUNCTION__); - return 0; - } - - if ((*pkt)->m_pkthdr.len < 0) { - printf("%s - invalid hdr len, len %d, offs %lu\n", __FUNCTION__, (*pkt)->m_pkthdr.len, offs); - return 0; - } - - if ((size_t)(*pkt)->m_pkthdr.len <= offs) { - printf("%s - invalid offset, len %d, offs %lu\n", __FUNCTION__, (*pkt)->m_pkthdr.len, offs); - return 0; - } - - if ((*pkt)->m_len < 0) { - printf("%s - invalid len, len %d, offs %lu\n", __FUNCTION__, (*pkt)->m_len, offs); - return 0; - } - - // pullup offs + 1 bytes - if ((size_t)(*pkt)->m_len < (offs + 1)) { - if ((*pkt = m_pullup(*pkt, (offs + 1))) == NULL) { - printf("%s: m_pullup failed\n", __FUNCTION__); - return -1; - } - } - - if (pcb->utun_ifp) { - ifnet_stat_increment_in(pcb->utun_ifp, 1, mbuf_pkthdr_len(*pkt), 0); - } - - size_diff = (*pkt)->m_pkthdr.len - offs; - data = mtod(*pkt, __typeof(data)); - data += offs; - - // ESP keepalive meets all these conditions: ESP trailer's next proto indicates IP, the decrypted packet only has one zero'd byte in it. 
- if (flags & SADB_X_EXT_ESP_KEEPALIVE && - nxt == IPPROTO_IPV4 && - size_diff == 1 && - *data == 0) { - // TODO: update stats to increment keepalives and current timestamp - if (utun_punt_rx_keepalive || - flags & SADB_X_EXT_PUNT_RX_KEEPALIVE) { - - // strip all headers - if ((size_t)(*pkt)->m_len >= (offs + size_diff)) { - ovbcopy((caddr_t)data, (data + offs), size_diff); - (*pkt)->m_data += offs; - (*pkt)->m_len -= offs; - (*pkt)->m_pkthdr.len -= offs; - } else { - struct mbuf *n; - - n = m_split(*pkt, offs, M_DONTWAIT); - if (n == NULL) { - /* *pkt is retained by m_split */ - mbuf_freem(*pkt); - *pkt = NULL; - return -1; - } - m_adj(n, offs); - mbuf_freem(*pkt); - *pkt = n; - } - - // keepalive is being punted up to the control socket, prepend with a special packet type (PF_UTUN) - if (mbuf_prepend(pkt, sizeof(protocol_family_t), MBUF_DONTWAIT) != 0) { - printf("%s - ifnet_output prepend failed\n", __FUNCTION__); - return -1; - } - if ((size_t)(*pkt)->m_len < (sizeof(protocol_family_t) + size_diff)) { - if ((*pkt = m_pullup(*pkt, (sizeof(protocol_family_t) + size_diff))) == NULL) { - printf("%s: m_pullup failed\n", __FUNCTION__); - return -1; - } - } - - // mark UTUN/Keepalive packet - *(protocol_family_t *)mbuf_data(*pkt) = htonl(PF_UTUN); - - result = ctl_enqueuembuf(pcb->utun_ctlref, pcb->utun_unit, *pkt, CTL_DATA_EOR); - if (result != 0) { - printf("%s: - ctl_enqueuembuf failed: %d\n", __FUNCTION__, result); - mbuf_freem(*pkt); - return -1; - } - *pkt = NULL; - } - return 1; - } - return 0; -} - -int -utun_pkt_ipsec_input (struct utun_pcb *pcb, mbuf_t *pkt, protocol_family_t family) -{ - if (!m_tag_locate(*pkt, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_IPSEC, NULL)) { - return EINVAL; - } - - if (!(pcb->utun_flags & UTUN_FLAGS_CRYPTO)) { - printf("%s - crypto disabled\n", __FUNCTION__); - return EINVAL; - } - - if (!pcb->utun_ifp) { - printf("%s - utun ifp cleared\n", __FUNCTION__); - return EINVAL; - } - - // place protocol number at the beginning of the mbuf 
- if (mbuf_prepend(pkt, sizeof(protocol_family_t), MBUF_DONTWAIT) != 0) { - printf("%s - ifnet_output prepend failed\n", __FUNCTION__); - return ENOBUFS; - } - *(protocol_family_t *)mbuf_data(*pkt) = family; - - (void)utun_pkt_input(pcb, *pkt); - return 0; -} - -#endif /* IPSEC */ diff --git a/bsd/net/if_utun_crypto_ipsec.h b/bsd/net/if_utun_crypto_ipsec.h deleted file mode 100644 index 7a4c5f210..000000000 --- a/bsd/net/if_utun_crypto_ipsec.h +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Copyright (c) 2011 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#ifndef _NET_IF_UTUN_CRYPTO_IPSEC_H_ -#define _NET_IF_UTUN_CRYPTO_IPSEC_H_ - -#ifdef KERNEL_PRIVATE - -struct utun_pcb; - -#define UTUN_CRYPTO_DIR_TO_IPSEC_DIR(dir) (dir == UTUN_CRYPTO_DIR_IN)? IPSEC_DIR_INBOUND : IPSEC_DIR_OUTBOUND -#define IF_UTUN_GET_TX_CRYPTO_KEYS(pcb) LIST_FIRST(&pcb->utun_crypto_ctx[UTUN_CRYPTO_DIR_TO_IDX(UTUN_CRYPTO_DIR_OUT)].keys_listhead) -#define IF_UTUN_GET_CRYPTO_KEYS_IPSEC_SAH(keys) keys->state.u.ipsec.sah -#define IF_UTUN_GET_CRYPTO_KEYS_IPSEC_SAV(keys) keys->state.u.ipsec.sav - -/* - * Summary: cleans up all crypto info for the specified utun. - */ -void -utun_cleanup_all_crypto_ipsec(struct utun_pcb *pcb); - -/* - * Summary: enables ipsec crypto info for the specified utun. - */ -void -utun_ctl_enable_crypto_ipsec(struct utun_pcb *pcb, utun_crypto_args_t *args); - -/* - * Summary: disables ipsec crypto info for the specified utun. - */ -void -utun_ctl_disable_crypto_ipsec(struct utun_pcb *pcb); - -/* - * Summary: configures an ipsec crypto context for the specified utun, with keying material - * (needed for traffic encrypt/decrypt). - * Args: - * pcb - the specified utun state info - * args - the ipsec crypto context keying arguments as passed down from userland. - * crypto_ctx_mat - the ipsec crypto context's keying material to be filled. - * Returns: 0 if successful, otherwise returns an appropriate errno. - */ -errno_t -utun_ctl_config_crypto_keys_ipsec(struct utun_pcb *pcb, - utun_crypto_keys_args_t *args, - utun_crypto_keys_t *crypto_ctx_mat); - -/* - * Summary: unconfigures the keying material in an ipsec crypto context for the specified utun. - * Args: - * args - the ipsec crypto context keying arguments as passed down from userland. - * crypto_ctx_mat - the ipsec crypto context's keying material to be filled. - * Returns: 0 if successful, otherwise returns an appropriate errno. 
- */ -errno_t -utun_ctl_unconfig_crypto_keys_ipsec(utun_crypto_keys_args_t *args, - utun_crypto_keys_t *crypto_ctx_mat); - -/* - * Summary: generates an SPI/index to be using by keying material in an ipsec crypto context - * for the specified utun. - * Args: - * args - the ipsec crypto context key index arguments as passed down from userland. - * Returns: 0 if successful, otherwise returns an appropriate errno. - */ -errno_t -utun_ctl_generate_crypto_keys_idx_ipsec(utun_crypto_keys_idx_args_t *args); - -int -utun_pkt_ipsec_output(struct utun_pcb *pcb, mbuf_t *pkt); - -int -utun_pkt_is_ipsec_keepalive(struct utun_pcb *pcb, mbuf_t *pkt, u_int16_t nxt, u_int32_t flags, size_t off); - -int -utun_pkt_ipsec_input(struct utun_pcb *pcb, mbuf_t *pkt, protocol_family_t family); - -#endif // KERNEL_PRIVATE - -#endif // _NET_IF_UTUN_CRYPTO_IPSEC_H_ diff --git a/bsd/net/if_var.h b/bsd/net/if_var.h index 21066e652..ff5f8d428 100644 --- a/bsd/net/if_var.h +++ b/bsd/net/if_var.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2015 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -364,6 +364,8 @@ struct if_tcp_ecn_stat { u_int64_t ecn_fallback_ce; u_int64_t ecn_off_conn; u_int64_t ecn_total_conn; + u_int64_t ecn_fallback_droprst; + u_int64_t ecn_fallback_droprxmt; struct if_tcp_ecn_perf_stat ecn_on; struct if_tcp_ecn_perf_stat ecn_off; }; @@ -407,6 +409,8 @@ struct if_cellular_status_v1 { #define IF_CELL_DL_MAX_BANDWIDTH_VALID 0x1000 #define IF_CELL_CONFIG_INACTIVITY_TIME_VALID 0x2000 #define IF_CELL_CONFIG_BACKOFF_TIME_VALID 0x4000 +#define IF_CELL_UL_MSS_RECOMMENDED_VALID 0x8000 + u_int32_t link_quality_metric; u_int32_t ul_effective_bandwidth; /* Measured uplink bandwidth based on current activity (bps) */ u_int32_t ul_max_bandwidth; /* Maximum supported uplink bandwidth (bps) */ @@ -426,11 +430,16 @@ struct if_cellular_status_v1 { u_int32_t dl_max_bandwidth; /* Maximum supported downlink bandwidth (bps) */ u_int32_t config_inactivity_time; /* ms */ u_int32_t config_backoff_time; /* new connections backoff time in ms */ - u_int64_t reserved_1; - u_int64_t reserved_2; +#define IF_CELL_UL_MSS_RECOMMENDED_NONE 0x0 /* Use default */ +#define IF_CELL_UL_MSS_RECOMMENDED_MEDIUM 0x1 /* 1200 byte MSS */ +#define IF_CELL_UL_MSS_RECOMMENDED_LOW 0x2 /* 512 byte MSS */ + u_int16_t mss_recommended; + u_int16_t reserved_1; + u_int32_t reserved_2; u_int64_t reserved_3; u_int64_t reserved_4; u_int64_t reserved_5; + u_int64_t reserved_6; } __attribute__((packed)); struct if_cellular_status { @@ -550,7 +559,7 @@ struct if_interface_state { /* * Indicate if the underlying link is currently - * available + * available */ u_int8_t interface_availability; #define IF_INTERFACE_STATE_INTERFACE_AVAILABLE 0x0 @@ -703,7 +712,6 @@ struct pfi_kif; /* we use TAILQs so that the order of instantiation is preserved in the list */ TAILQ_HEAD(ifnethead, ifnet); TAILQ_HEAD(ifaddrhead, ifaddr); -TAILQ_HEAD(ifprefixhead, ifprefix); LIST_HEAD(ifmultihead, ifmultiaddr); TAILQ_HEAD(tailq_head, tqdummy); 
TAILQ_HEAD(ifnet_filter_head, ifnet_filter); @@ -776,6 +784,7 @@ struct ifnet { struct if_description if_desc; /* extended description */ TAILQ_ENTRY(ifnet) if_link; /* all struct ifnets are chained */ TAILQ_ENTRY(ifnet) if_detaching_link; /* list of detaching ifnets */ + TAILQ_ENTRY(ifnet) if_ordered_link; /* list of ordered ifnets */ decl_lck_mtx_data(, if_ref_lock) u_int32_t if_refflags; /* see IFRF flags below */ @@ -786,6 +795,8 @@ struct ifnet { #define if_addrlist if_addrhead struct ifaddr *if_lladdr; /* link address (first/permanent) */ + u_int32_t if_qosmarking_mode; /* generation to use with NECP clients */ + int if_pcount; /* number of promiscuous listeners */ struct bpf_if *if_bpf; /* packet filter structure */ u_short if_index; /* numeric abbreviation for this if */ @@ -793,6 +804,7 @@ struct ifnet { short if_timer; /* time 'til if_watchdog called */ short if_flags; /* up/down, broadcast, etc. */ u_int32_t if_eflags; /* see */ + u_int32_t if_xflags; /* see */ int if_capabilities; /* interface features & capabilities */ int if_capenable; /* enabled features & capabilities */ @@ -805,10 +817,12 @@ struct ifnet { ifnet_family_t if_family; /* value assigned by Apple */ ifnet_subfamily_t if_subfamily; /* value assigned by Apple */ uintptr_t if_family_cookie; + ifnet_output_handler_func if_output_handler; ifnet_output_func if_output; ifnet_pre_enqueue_func if_pre_enqueue; ifnet_start_func if_start; ifnet_ctl_func if_output_ctl; + ifnet_input_handler_func if_input_handler; ifnet_input_poll_func if_input_poll; ifnet_ctl_func if_input_ctl; ifnet_ioctl_func if_ioctl; @@ -868,7 +882,6 @@ struct ifnet { struct dlil_threading_info *if_inp; - struct ifprefixhead if_prefixhead; /* list of prefixes per if */ struct { u_int32_t length; union { @@ -880,7 +893,6 @@ struct ifnet { struct label *if_label; /* interface MAC label */ #endif - u_int32_t if_wake_properties; #if PF struct pfi_kif *if_pf_kif; #endif /* PF */ @@ -901,6 +913,7 @@ struct ifnet { u_int32_t 
if_idle_new_flags; /* temporary idle flags */ u_int32_t if_idle_new_flags_mask; /* temporary mask */ u_int32_t if_route_refcnt; /* idle: route ref count */ + u_int32_t if_rt_sendts; /* last of a real time packet */ struct if_traffic_class if_tc __attribute__((aligned(8))); #if INET @@ -931,12 +944,20 @@ struct ifnet { uint32_t expensive:1; /* delegated i/f expensive? */ } if_delegated; -#define IF_MAXAGENTS 8 - uuid_t if_agentids[IF_MAXAGENTS]; + uuid_t *if_agentids; /* network agents attached to interface */ + u_int32_t if_agentcount; - u_int64_t if_data_threshold; + u_int32_t if_generation; /* generation to use with NECP clients */ u_int32_t if_fg_sendts; /* last send on a fg socket in seconds */ - u_int32_t if_rt_sendts; /* last of a real time packet */ + + u_int64_t if_data_threshold; + + /* Total bytes in send socket buffer */ + int64_t if_sndbyte_total __attribute__ ((aligned(8))); + /* Total unsent bytes in send socket buffer */ + int64_t if_sndbyte_unsent __attribute__ ((aligned(8))); + /* count of times, when there was data to send when sleep is impending */ + uint32_t if_unsent_data_cnt; #if INET decl_lck_rw_data(, if_inetdata_lock); @@ -1144,20 +1165,6 @@ struct ifaddr { #define IFA_REMREF_LOCKED(_ifa) \ ifa_remref(_ifa, 1) -/* - * The prefix structure contains information about one prefix - * of an interface. They are maintained by the different address families, - * are allocated and attached when an prefix or an address is set, - * and are linked together so all prefixes for an interface can be located. - */ -struct ifprefix { - struct sockaddr *ifpr_prefix; /* prefix of interface */ - struct ifnet *ifpr_ifp; /* back-pointer to interface */ - TAILQ_ENTRY(ifprefix) ifpr_list; /* queue macro glue */ - u_char ifpr_plen; /* prefix length in bits */ - u_char ifpr_type; /* protocol dependent prefix type */ -}; - /* * Multicast address structure. This is analogous to the ifaddr * structure except that it keeps track of multicast addresses. 
@@ -1272,10 +1279,19 @@ struct ifmultiaddr { (_ifp)->if_family == IFNET_FAMILY_FIREWIRE || \ (_ifp)->if_delegated.family == IFNET_FAMILY_FIREWIRE) +/* + * Indicate whether or not the immediate WiFi interface is on an infrastructure + * network + */ +#define IFNET_IS_WIFI_INFRA(_ifp) \ + ((_ifp)->if_family == IFNET_FAMILY_ETHERNET && \ + (_ifp)->if_subfamily == IFNET_SUBFAMILY_WIFI && \ + !((_ifp)->if_eflags & IFEF_AWDL)) + /* * Indicate whether or not the immediate interface, or the interface delegated - * by it, is marked as expensive. The delegated interface is set/cleared - * along with the delegated ifp; we cache the flag for performance to avoid + * by it, is marked as expensive. The delegated interface is set/cleared + * along with the delegated ifp; we cache the flag for performance to avoid * dereferencing delegated ifp each time. * * Note that this is meant to be used only for policy purposes. @@ -1291,8 +1307,12 @@ struct ifmultiaddr { (((_ifp)->if_eflags & (IFEF_AWDL|IFEF_AWDL_RESTRICTED)) == \ (IFEF_AWDL|IFEF_AWDL_RESTRICTED)) +#define IFNET_IS_INTCOPROC(_ifp) \ + ((_ifp)->if_family == IFNET_FAMILY_ETHERNET && \ + (_ifp)->if_subfamily == IFNET_SUBFAMILY_INTCOPROC) extern struct ifnethead ifnet_head; +extern struct ifnethead ifnet_ordered_head; extern struct ifnet **ifindex2ifnet; extern u_int32_t if_sndq_maxlen; extern u_int32_t if_rcvq_maxlen; @@ -1321,6 +1341,8 @@ __private_extern__ void if_updown(struct ifnet *ifp, int up); extern int ifioctl(struct socket *, u_long, caddr_t, struct proc *); extern int ifioctllocked(struct socket *, u_long, caddr_t, struct proc *); extern struct ifnet *ifunit(const char *); +extern struct ifnet *ifunit_ref(const char *); +extern int ifunit_extract(const char *src, char *dst, size_t dstlen, int *unit); extern struct ifnet *if_withname(struct sockaddr *); extern void if_qflush(struct ifnet *, int); extern void if_qflush_sc(struct ifnet *, mbuf_svc_class_t, u_int32_t, @@ -1330,10 +1352,9 @@ extern struct if_clone 
*if_clone_lookup(const char *, u_int32_t *); extern int if_clone_attach(struct if_clone *); extern void if_clone_detach(struct if_clone *); -extern u_int32_t if_functional_type(struct ifnet *); +extern u_int32_t if_functional_type(struct ifnet *, bool); extern errno_t if_mcasts_update(struct ifnet *); -extern int32_t total_snd_byte_count; typedef enum { IFNET_LCK_ASSERT_EXCLUSIVE, /* RW: held as writer */ @@ -1365,10 +1386,12 @@ __private_extern__ void if_inet6data_lock_done(struct ifnet *ifp); __private_extern__ void ifnet_head_lock_shared(void); __private_extern__ void ifnet_head_lock_exclusive(void); __private_extern__ void ifnet_head_done(void); +__private_extern__ void ifnet_head_assert_exclusive(void); __private_extern__ errno_t ifnet_set_idle_flags_locked(ifnet_t, u_int32_t, u_int32_t); __private_extern__ int ifnet_is_attached(struct ifnet *, int refio); +__private_extern__ void ifnet_incr_iorefcnt(struct ifnet *); __private_extern__ void ifnet_decr_iorefcnt(struct ifnet *); __private_extern__ void ifnet_set_start_cycle(struct ifnet *, struct timespec *); @@ -1385,8 +1408,11 @@ __private_extern__ void dlil_if_unlock(void); __private_extern__ void dlil_if_lock_assert(void); extern struct ifaddr *ifa_ifwithaddr(const struct sockaddr *); +extern struct ifaddr *ifa_ifwithaddr_locked(const struct sockaddr *); extern struct ifaddr *ifa_ifwithaddr_scoped(const struct sockaddr *, unsigned int); +extern struct ifaddr *ifa_ifwithaddr_scoped_locked(const struct sockaddr *, + unsigned int); extern struct ifaddr *ifa_ifwithdstaddr(const struct sockaddr *); extern struct ifaddr *ifa_ifwithnet(const struct sockaddr *); extern struct ifaddr *ifa_ifwithnet_scoped(const struct sockaddr *, @@ -1482,6 +1508,14 @@ __private_extern__ int ifnet_set_netsignature(struct ifnet *, uint8_t, __private_extern__ int ifnet_get_netsignature(struct ifnet *, uint8_t, uint8_t *, uint16_t *, uint8_t *); +/* Required exclusive ifnet_head lock */ +__private_extern__ void 
ifnet_remove_from_ordered_list(struct ifnet *); + +__private_extern__ void ifnet_increment_generation(struct ifnet *); +__private_extern__ u_int32_t ifnet_get_generation(struct ifnet *); + +extern int if_set_qosmarking_mode(struct ifnet *, u_int32_t); + __private_extern__ errno_t ifnet_framer_stub(struct ifnet *, struct mbuf **, const struct sockaddr *, const char *, const char *, u_int32_t *, u_int32_t *); diff --git a/bsd/net/if_vlan.c b/bsd/net/if_vlan.c index ded46afbf..2dabd32b2 100644 --- a/bsd/net/if_vlan.c +++ b/bsd/net/if_vlan.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2014 Apple Inc. All rights reserved. + * Copyright (c) 2003-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -894,9 +894,9 @@ vlan_parent_remove_all_vlans(struct ifnet * p) vlan_parent_retain(vlp); vlan_parent_wait(vlp, "vlan_parent_remove_all_vlans"); need_vlp_release++; - vlp = parent_list_lookup(p); + /* check again */ - if (vlp == NULL) { + if (parent_list_lookup(p) != vlp) { goto signal_done; } @@ -1272,7 +1272,6 @@ vlan_input(ifnet_t p, __unused protocol_family_t protocol, ifnet_type(p)); m_freem(m); return 0; - break; } } if (tag != 0) { diff --git a/bsd/net/kpi_interface.c b/bsd/net/kpi_interface.c index a64dd0f03..d28af82ac 100644 --- a/bsd/net/kpi_interface.c +++ b/bsd/net/kpi_interface.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2015 Apple Inc. All rights reserved. + * Copyright (c) 2004-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -69,6 +69,7 @@ #ifdef INET6 #include #endif +#include #include "net/net_str_id.h" @@ -154,6 +155,7 @@ ifnet_allocate_extended(const struct ifnet_init_eparams *einit0, { struct ifnet_init_eparams einit; struct ifnet *ifp = NULL; + char if_xname[IFXNAMSIZ] = {0}; int error; einit = *einit0; @@ -194,6 +196,14 @@ ifnet_allocate_extended(const struct ifnet_init_eparams *einit0, } } + if (einit.uniqueid == NULL) { + /* Initialize external name (name + unit) */ + snprintf(if_xname, IFXNAMSIZ, + "%s%d", einit.name, einit.unit); + einit.uniqueid = if_xname; + einit.uniqueid_len = strlen(if_xname); + } + error = dlil_if_acquire(einit.family, einit.uniqueid, einit.uniqueid_len, &ifp); @@ -317,6 +327,9 @@ ifnet_allocate_extended(const struct ifnet_init_eparams *einit0, else ifp->if_eflags &= ~IFEF_RXPOLL; + ifp->if_output_handler = dlil_output_handler; + ifp->if_input_handler = dlil_input_handler; + VERIFY(!(einit.flags & IFNET_INIT_LEGACY) || (ifp->if_pre_enqueue == NULL && ifp->if_start == NULL && ifp->if_output_ctl == NULL && ifp->if_input_poll == NULL && @@ -584,7 +597,7 @@ ifnet_set_eflags(ifnet_t interface, u_int32_t new_flags, u_int32_t mask) ev_msg.dv[0].data_length = sizeof(struct net_event_data); ev_msg.dv[0].data_ptr = &ev_data; ev_msg.dv[1].data_length = 0; - kev_post_msg(&ev_msg); + dlil_post_complete_msg(interface, &ev_msg); } return (0); @@ -627,12 +640,8 @@ ifnet_set_idle_flags_locked(ifnet_t ifp, u_int32_t new_flags, u_int32_t mask) if ((after - before) < 0 && ifp->if_idle_flags == 0 && ifp->if_want_aggressive_drain != 0) { ifp->if_want_aggressive_drain = 0; - if (ifnet_aggressive_drainers == 0) - panic("%s: ifp=%p negative aggdrain!", __func__, ifp); } else if ((after - before) > 0 && ifp->if_want_aggressive_drain == 0) { ifp->if_want_aggressive_drain++; - if (++ifnet_aggressive_drainers == 0) - panic("%s: ifp=%p wraparound aggdrain!", __func__, ifp); } return (0); @@ -825,7 +834,7 @@ 
ifnet_set_capabilities_enabled(ifnet_t ifp, u_int32_t new_caps, ev_msg.dv[0].data_length = sizeof (struct net_event_data); ev_msg.dv[0].data_ptr = &ev_data; ev_msg.dv[1].data_length = 0; - kev_post_msg(&ev_msg); + dlil_post_complete_msg(ifp, &ev_msg); return (error); } @@ -841,11 +850,9 @@ static const ifnet_offload_t offload_mask = IFNET_IP_FRAGMENT | IFNET_CSUM_TCPIPV6 | IFNET_CSUM_UDPIPV6 | IFNET_IPV6_FRAGMENT | IFNET_CSUM_PARTIAL | IFNET_VLAN_TAGGING | IFNET_VLAN_MTU | IFNET_MULTIPAGES | IFNET_TSO_IPV4 | IFNET_TSO_IPV6 | - IFNET_TX_STATUS); + IFNET_TX_STATUS | IFNET_HW_TIMESTAMP | IFNET_SW_TIMESTAMP); -static const ifnet_offload_t any_offload_csum = - (IFNET_CSUM_IP | IFNET_CSUM_TCP | IFNET_CSUM_UDP | IFNET_CSUM_FRAGMENT | - IFNET_CSUM_TCPIPV6 | IFNET_CSUM_UDPIPV6 | IFNET_CSUM_PARTIAL); +static const ifnet_offload_t any_offload_csum = IFNET_CHECKSUMF; errno_t ifnet_set_offload(ifnet_t interface, ifnet_offload_t offload) @@ -885,7 +892,11 @@ ifnet_set_offload(ifnet_t interface, ifnet_offload_t offload) if ((offload & IFNET_VLAN_TAGGING)) ifcaps |= IFCAP_VLAN_HWTAGGING; if ((offload & IFNET_TX_STATUS)) - ifcaps |= IFNET_TX_STATUS; + ifcaps |= IFCAP_TXSTATUS; + if ((offload & IFNET_HW_TIMESTAMP)) + ifcaps |= IFCAP_HW_TIMESTAMP; + if ((offload & IFNET_SW_TIMESTAMP)) + ifcaps |= IFCAP_SW_TIMESTAMP; if (ifcaps != 0) { (void) ifnet_set_capabilities_supported(interface, ifcaps, IFCAP_VALID); @@ -983,8 +994,12 @@ ifnet_set_wake_flags(ifnet_t interface, u_int32_t properties, u_int32_t mask) ifnet_lock_exclusive(interface); - interface->if_wake_properties = - (properties & mask) | (interface->if_wake_properties & ~mask); + if (mask & IF_WAKE_ON_MAGIC_PACKET) { + if (properties & IF_WAKE_ON_MAGIC_PACKET) + interface->if_xflags |= IFXF_WAKE_ON_MAGIC_PACKET; + else + interface->if_xflags &= ~IFXF_WAKE_ON_MAGIC_PACKET; + } ifnet_lock_done(interface); @@ -1002,7 +1017,7 @@ ifnet_set_wake_flags(ifnet_t interface, u_int32_t properties, u_int32_t mask) ev_msg.dv[0].data_length = 
sizeof (struct net_event_data); ev_msg.dv[0].data_ptr = &ev_data; ev_msg.dv[1].data_length = 0; - kev_post_msg(&ev_msg); + dlil_post_complete_msg(interface, &ev_msg); return (0); } @@ -1010,7 +1025,15 @@ ifnet_set_wake_flags(ifnet_t interface, u_int32_t properties, u_int32_t mask) u_int32_t ifnet_get_wake_flags(ifnet_t interface) { - return ((interface == NULL) ? 0 : interface->if_wake_properties); + u_int32_t flags = 0; + + if (interface == NULL) + return (0); + + if (interface->if_xflags & IFXF_WAKE_ON_MAGIC_PACKET) + flags |= IF_WAKE_ON_MAGIC_PACKET; + + return (flags); } /* @@ -2194,7 +2217,7 @@ ifnet_transmit_burst_start(ifnet_t ifp, mbuf_t pkt) ifp->if_bw.start_seq = pkt->m_pkthdr.pkt_bwseq; ifp->if_bw.start_ts = mach_absolute_time(); -#else /*!MEASURE_BW */ +#else /* !MEASURE_BW */ #pragma unused(ifp, pkt) #endif /* !MEASURE_BW */ } @@ -2267,9 +2290,9 @@ ifnet_transmit_burst_end(ifnet_t ifp, mbuf_t pkt) #endif /* !MEASURE_BW */ } -/****************************************************************************/ -/* ifaddr_t accessors */ -/****************************************************************************/ +/*************************************************************************/ +/* ifaddr_t accessors */ +/*************************************************************************/ errno_t ifaddr_reference(ifaddr_t ifa) @@ -2513,9 +2536,9 @@ ifmaddr_ifnet(ifmultiaddr_t ifma) return ((ifma == NULL) ? 
NULL : ifma->ifma_ifp); } -/******************************************************************************/ -/* interface cloner */ -/******************************************************************************/ +/**************************************************************************/ +/* interface cloner */ +/**************************************************************************/ errno_t ifnet_clone_attach(struct ifnet_clone_params *cloner_params, @@ -2593,9 +2616,9 @@ ifnet_clone_detach(if_clone_t ifcloner) return (error); } -/******************************************************************************/ -/* misc */ -/******************************************************************************/ +/**************************************************************************/ +/* misc */ +/**************************************************************************/ errno_t ifnet_get_local_ports_extended(ifnet_t ifp, protocol_family_t protocol, @@ -2645,12 +2668,12 @@ errno_t ifnet_get_local_ports(ifnet_t ifp, u_int8_t *bitfield) { u_int32_t flags = IFNET_GET_LOCAL_PORTS_WILDCARDOK; - return (ifnet_get_local_ports_extended(ifp, PF_UNSPEC, flags, + return (ifnet_get_local_ports_extended(ifp, PF_UNSPEC, flags, bitfield)); } errno_t -ifnet_notice_node_presence(ifnet_t ifp, struct sockaddr* sa, int32_t rssi, +ifnet_notice_node_presence(ifnet_t ifp, struct sockaddr *sa, int32_t rssi, int lqm, int npm, u_int8_t srvinfo[48]) { if (ifp == NULL || sa == NULL || srvinfo == NULL) @@ -2665,7 +2688,7 @@ ifnet_notice_node_presence(ifnet_t ifp, struct sockaddr* sa, int32_t rssi, } errno_t -ifnet_notice_node_absence(ifnet_t ifp, struct sockaddr* sa) +ifnet_notice_node_absence(ifnet_t ifp, struct sockaddr *sa) { if (ifp == NULL || sa == NULL) return (EINVAL); @@ -2691,8 +2714,18 @@ ifnet_notice_master_elected(ifnet_t ifp) errno_t ifnet_tx_compl_status(ifnet_t ifp, mbuf_t m, tx_compl_val_t val) { -#pragma unused(ifp, m, val) - /* Dummy function to be implemented XXX */ 
+#pragma unused(val) + + m_do_tx_compl_callback(m, ifp); + + return (0); +} + +errno_t +ifnet_tx_compl(ifnet_t ifp, mbuf_t m) +{ + m_do_tx_compl_callback(m, ifp); + return (0); } @@ -2724,6 +2757,17 @@ ifnet_set_delegate(ifnet_t ifp, ifnet_t delegated_ifp) ifnet_lock_done(ifp); goto done; } + // Test if this delegate interface would cause a loop + ifnet_t delegate_check_ifp = delegated_ifp; + while (delegate_check_ifp != NULL) { + if (delegate_check_ifp == ifp) { + printf("%s: delegating to %s would cause a loop\n", + ifp->if_xname, delegated_ifp->if_xname); + ifnet_lock_done(ifp); + goto done; + } + delegate_check_ifp = delegate_check_ifp->if_delegated.ifp; + } bzero(&ifp->if_delegated, sizeof (ifp->if_delegated)); if (delegated_ifp != NULL && ifp != delegated_ifp) { ifp->if_delegated.ifp = delegated_ifp; @@ -2787,35 +2831,33 @@ ifnet_get_delegate(ifnet_t ifp, ifnet_t *pdelegated_ifp) return (0); } -extern u_int32_t -key_fill_offload_frames_for_savs(ifnet_t ifp, - struct ifnet_keepalive_offload_frame *frames_array, - u_int32_t frames_array_count, size_t frame_data_offset); - -extern void -udp_fill_keepalive_offload_frames(ifnet_t ifp, - struct ifnet_keepalive_offload_frame *frames_array, - u_int32_t frames_array_count, size_t frame_data_offset, - u_int32_t *used_frames_count); - errno_t ifnet_get_keepalive_offload_frames(ifnet_t ifp, struct ifnet_keepalive_offload_frame *frames_array, u_int32_t frames_array_count, size_t frame_data_offset, u_int32_t *used_frames_count) { - if (frames_array == NULL || used_frames_count == NULL) + u_int32_t i; + + if (frames_array == NULL || used_frames_count == NULL || + frame_data_offset >= IFNET_KEEPALIVE_OFFLOAD_FRAME_DATA_SIZE) return (EINVAL); /* frame_data_offset should be 32-bit aligned */ - if (P2ROUNDUP(frame_data_offset, sizeof(u_int32_t)) - != frame_data_offset) + if (P2ROUNDUP(frame_data_offset, sizeof(u_int32_t)) != + frame_data_offset) return (EINVAL); *used_frames_count = 0; if (frames_array_count == 0) return (0); + 
for (i = 0; i < frames_array_count; i++) { + struct ifnet_keepalive_offload_frame *frame = frames_array + i; + + bzero(frame, sizeof(struct ifnet_keepalive_offload_frame)); + } + /* First collect IPSec related keep-alive frames */ *used_frames_count = key_fill_offload_frames_for_savs(ifp, frames_array, frames_array_count, frame_data_offset); @@ -2826,7 +2868,14 @@ ifnet_get_keepalive_offload_frames(ifnet_t ifp, frames_array_count, frame_data_offset, used_frames_count); + /* If there is more room, collect other TCP keep-alive frames */ + if (*used_frames_count < frames_array_count) + tcp_fill_keepalive_offload_frames(ifp, frames_array, + frames_array_count, frame_data_offset, + used_frames_count); + VERIFY(*used_frames_count <= frames_array_count); + return (0); } @@ -2888,10 +2937,22 @@ ifnet_link_status_report(ifnet_t ifp, const void *buffer, if_cell_sr = &ifp->if_link_status->ifsr_u.ifsr_cell.if_cell_u.if_status_v1; new_cell_sr = &ifsr->ifsr_u.ifsr_cell.if_cell_u.if_status_v1; + /* Check if we need to act on any new notifications */ + if ((new_cell_sr->valid_bitmask & + IF_CELL_UL_MSS_RECOMMENDED_VALID) && + new_cell_sr->mss_recommended != + if_cell_sr->mss_recommended) { + atomic_bitset_32(&tcbinfo.ipi_flags, + INPCBINFO_UPDATE_MSS); + inpcb_timer_sched(&tcbinfo, INPCB_TIMER_FAST); + } + + /* Finally copy the new information */ ifp->if_link_status->ifsr_version = ifsr->ifsr_version; ifp->if_link_status->ifsr_len = ifsr->ifsr_len; if_cell_sr->valid_bitmask = 0; bcopy(new_cell_sr, if_cell_sr, sizeof(*if_cell_sr)); + } else if (ifp->if_subfamily == IFNET_SUBFAMILY_WIFI) { struct if_wifi_status_v1 *if_wifi_sr, *new_wifi_sr; @@ -2961,7 +3022,7 @@ ifnet_link_status_report(ifnet_t ifp, const void *buffer, } /*************************************************************************/ -/* Packet preamble */ +/* Packet preamble */ /*************************************************************************/ #define MAX_IF_PACKET_PREAMBLE_LEN 32 @@ -2991,3 +3052,102 @@ 
ifnet_maxpacketpreamblelen(void) { return (MAX_IF_PACKET_PREAMBLE_LEN); } + + +/*************************************************************************/ +/* Fastlane QoS Ca */ +/*************************************************************************/ + +errno_t +ifnet_set_fastlane_capable(ifnet_t interface, boolean_t capable) +{ + if (interface == NULL) + return (EINVAL); + + if_set_qosmarking_mode(interface, + capable ? IFRTYPE_QOSMARKING_FASTLANE : IFRTYPE_QOSMARKING_MODE_NONE); + + return (0); +} + +errno_t +ifnet_get_fastlane_capable(ifnet_t interface, boolean_t *capable) +{ + if (interface == NULL || capable == NULL) + return (EINVAL); + if (interface->if_eflags & IFEF_QOSMARKING_CAPABLE) + *capable = true; + else + *capable = false; + return (0); +} + +errno_t +ifnet_get_unsent_bytes(ifnet_t interface, int64_t *unsent_bytes) +{ + int64_t bytes; + + if (interface == NULL || unsent_bytes == NULL) + return (EINVAL); + + bytes = *unsent_bytes = 0; + + if ((interface->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING)) != + IFRF_ATTACHED) + return (ENXIO); + + bytes = interface->if_sndbyte_unsent; + + if (interface->if_eflags & IFEF_TXSTART) + bytes += IFCQ_BYTES(&interface->if_snd); + *unsent_bytes = bytes; + + return (0); +} + +errno_t +ifnet_get_buffer_status(const ifnet_t ifp, ifnet_buffer_status_t *buf_status) +{ + if (ifp == NULL || buf_status == NULL) + return (EINVAL); + + bzero(buf_status, sizeof (*buf_status)); + + if ((ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING)) != + IFRF_ATTACHED) + return (ENXIO); + + buf_status->buf_sndbuf = ifp->if_sndbyte_unsent; + + if (ifp->if_eflags & IFEF_TXSTART) + buf_status->buf_interface = IFCQ_BYTES(&ifp->if_snd); + + return (0); +} + +void +ifnet_normalise_unsent_data(void) +{ + struct ifnet *ifp; + + ifnet_head_lock_shared(); + TAILQ_FOREACH(ifp, &ifnet_head, if_link) { + ifnet_lock_exclusive(ifp); + if ((ifp->if_refflags & (IFRF_ATTACHED|IFRF_DETACHING)) != + IFRF_ATTACHED) { + ifnet_lock_done(ifp); + continue; 
+ } + if (!(ifp->if_eflags & IFEF_TXSTART)) { + ifnet_lock_done(ifp); + continue; + } + + if (ifp->if_sndbyte_total > 0 || + IFCQ_BYTES(&ifp->if_snd) > 0) + ifp->if_unsent_data_cnt++; + + ifnet_lock_done(ifp); + } + ifnet_head_done(); +} diff --git a/bsd/net/kpi_interface.h b/bsd/net/kpi_interface.h index 2c6e8bbe9..c94f294f0 100644 --- a/bsd/net/kpi_interface.h +++ b/bsd/net/kpi_interface.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2015 Apple Inc. All rights reserved. + * Copyright (c) 2004-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -64,7 +64,6 @@ struct ifnet_demux_desc; /*! @enum Interface Families @abstract Constants defining interface families. - @discussion @constant IFNET_FAMILY_ANY Match interface of any family type. @constant IFNET_FAMILY_LOOPBACK A software loopback interface. @constant IFNET_FAMILY_ETHERNET An Ethernet interface. @@ -123,6 +122,8 @@ enum { IFNET_SUBFAMILY_WIFI = 3, IFNET_SUBFAMILY_THUNDERBOLT = 4, IFNET_SUBFAMILY_RESERVED = 5, + IFNET_SUBFAMILY_INTCOPROC = 6, + IFNET_SUBFAMILY_UTUN = 7, }; /* @@ -137,7 +138,6 @@ typedef u_int32_t ifnet_subfamily_t; /*! @enum BPF tap mode @abstract Constants defining interface families. - @discussion @constant BPF_MODE_DISABLED Disable bpf. @constant BPF_MODE_INPUT Enable input only. @constant BPF_MODE_OUTPUT Enable output only. @@ -166,7 +166,6 @@ typedef u_int32_t protocol_family_t; /*! @enum Interface Abilities @abstract Constants defining interface offload support. - @discussion @constant IFNET_CSUM_IP Hardware will calculate IPv4 checksums. @constant IFNET_CSUM_TCP Hardware will calculate TCP checksums. @constant IFNET_CSUM_UDP Hardware will calculate UDP checksums. @@ -226,7 +225,9 @@ enum { IFNET_MULTIPAGES = 0x00100000, IFNET_TSO_IPV4 = 0x00200000, IFNET_TSO_IPV6 = 0x00400000, - IFNET_TX_STATUS = 0x00800000 + IFNET_TX_STATUS = 0x00800000, + IFNET_HW_TIMESTAMP = 0x01000000, + IFNET_SW_TIMESTAMP = 0x02000000 }; /*! 
@typedef ifnet_offload_t @@ -239,7 +240,15 @@ typedef u_int32_t ifnet_offload_t; "\020\1CSUM_IP\2CSUM_TCP\3CSUM_UDP\4CSUM_IP_FRAGS\5IP_FRAGMENT" \ "\6CSUM_TCPIPV6\7CSUM_UDPIPV6\10IPV6_FRAGMENT\15CSUM_PARTIAL" \ "\20VLAN_TAGGING\21VLAN_MTU\25MULTIPAGES\26TSO_IPV4\27TSO_IPV6" \ - "\30TXSTATUS" + "\30TXSTATUS\31HW_TIMESTAMP\32SW_TIMESTAMP" + +#define IFNET_CHECKSUMF \ + (IFNET_CSUM_IP | IFNET_CSUM_TCP | IFNET_CSUM_UDP | \ + IFNET_CSUM_FRAGMENT | IFNET_CSUM_TCPIPV6 | IFNET_CSUM_UDPIPV6 | \ + IFNET_CSUM_PARTIAL) + +#define IFNET_TSOF \ + (IFNET_TSO_IPV4 | IFNET_TSO_IPV6) #endif /* KERNEL_PRIVATE */ /* @@ -283,8 +292,6 @@ typedef errno_t (*ifnet_output_func)(ifnet_t interface, mbuf_t data); you need to communicate with your kext using an ioctl, please use SIOCSIFKPI and SIOCGIFKPI. @param interface The interface the ioctl is being sent to. - @param proto_family The protocol family to handle the ioctl, may be - zero for no protocol_family. @param cmd The ioctl command. @param data A pointer to any data related to the ioctl. */ @@ -345,8 +352,6 @@ typedef errno_t (*ifnet_demux_func)(ifnet_t interface, mbuf_t packet, @discussion ifnet_event_func is called when an event occurs on a specific interface. @param interface The interface the event occurred on. - @param event_ptr Pointer to a kern_event structure describing the - event. */ typedef void (*ifnet_event_func)(ifnet_t interface, const struct kev_msg *msg); @@ -365,9 +370,9 @@ typedef void (*ifnet_event_func)(ifnet_t interface, const struct kev_msg *msg); protocol's pre-output function. @param frame_type The frame type as determined by the protocol's pre-output function. - @param prepend_len The length of prepended bytes to the mbuf. + @discussion prepend_len The length of prepended bytes to the mbuf. (ONLY used if KPI_INTERFACE_EMBEDDED is defined to 1) - @param postpend_len The length of the postpended bytes to the mbuf. + @discussion postpend_len The length of the postpended bytes to the mbuf. 
(ONLY used if KPI_INTERFACE_EMBEDDED is defined to 1) @result If the result is zero, processing will continue normally. @@ -437,7 +442,7 @@ typedef errno_t (*ifnet_del_proto_func)(ifnet_t interface, To prevent an address from being added to your multicast list, return EADDRNOTAVAIL. If you don't know how to parse/translate the address, return EOPNOTSUPP. - @param The interface. + @param interface The interface. @param mcast The multicast address. @result Zero upon success, EADDRNOTAVAIL on invalid multicast, @@ -452,7 +457,7 @@ typedef errno_t (*ifnet_check_multi)(ifnet_t interface, a specific protocol on a specific interface. This function is registered on an interface using ifnet_attach_protocol. @param ifp The interface the packet was received on. - @param protocol_family The protocol of the packet received. + @param protocol The protocol of the packet received. @param packet The packet being input. @param header The frame header. @result @@ -474,7 +479,7 @@ typedef errno_t (*proto_media_input)(ifnet_t ifp, protocol_family_t protocol, individual packet. The frame header can be retrieved using mbuf_pkthdr_header. @param ifp The interface the packet was received on. - @param protocol_family The protocol of the packet received. + @param protocol The protocol of the packet received. @param packet The packet being input. @result If the result is zero, the caller will assume the packets were @@ -492,7 +497,7 @@ typedef errno_t (*proto_media_input_v2)(ifnet_t ifp, protocol_family_t protocol, opportunity to specify the media specific frame type and destination. @param ifp The interface the packet will be sent on. - @param protocol_family The protocol of the packet being sent + @param protocol The protocol of the packet being sent (PF_INET/etc...). @param packet The packet being sent. @param dest The protocol level destination address. 
@@ -514,8 +519,8 @@ typedef errno_t (*proto_media_preout)(ifnet_t ifp, protocol_family_t protocol, @discussion proto_media_event is called to notify this layer of interface specific events. @param ifp The interface. - @param protocol_family The protocol family. - @param kev_msg The event. + @param protocol The protocol family. + @param event The event. */ typedef void (*proto_media_event)(ifnet_t ifp, protocol_family_t protocol, const struct kev_msg *event); @@ -535,7 +540,7 @@ typedef void (*proto_media_event)(ifnet_t ifp, protocol_family_t protocol, you need to communicate with your kext using an ioctl, please use SIOCSIFKPI and SIOCGIFKPI. @param ifp The interface. - @param protocol_family The protocol family. + @param protocol The protocol family. @param command The ioctl command. @param argument The argument to the ioctl. @result @@ -549,7 +554,7 @@ typedef errno_t (*proto_media_ioctl)(ifnet_t ifp, protocol_family_t protocol, @discussion proto_media_detached notifies you that your protocol has been detached. @param ifp The interface. - @param protocol_family The protocol family. + @param protocol The protocol family. @result See the discussion. */ @@ -577,8 +582,6 @@ typedef errno_t (*proto_media_resolve_multi)(ifnet_t ifp, function should inspect the parameters and transmit an arp packet using the information passed in. @param ifp The interface the arp packet should be sent on. - @param protocol_family The protocol family of the addresses - (PF_INET). @param arpop The arp operation (usually ARPOP_REQUEST or ARPOP_REPLY). @param sender_hw The value to use for the sender hardware @@ -711,15 +714,17 @@ typedef errno_t (*ifnet_pre_enqueue_func)(ifnet_t interface, mbuf_t data); @typedef ifnet_start_func @discussion ifnet_start_func is used to indicate to the driver that one or more packets may be dequeued by calling ifnet_dequeue() - or ifnet_dequeue_multi(). 
This routine gets invoked when - ifnet_start() is called; the ifnet_start_func callback will - be executed within the context of a dedicated kernel thread, - hence it is guaranteed to be single threaded. The driver must - employ additional serializations if this callback routine is - to be called directly from another context, in order to prevent - race condition related issues (e.g. out-of-order packets.) - The dequeued packets will be fully formed packets (including - frame headers). The packets must be freed by the driver. + or ifnet_dequeue_multi() or ifnet_dequeue_multi_bytes(). + This routine gets invoked when ifnet_start() is called; + the ifnet_start_func callback will be executed within the + context of a dedicated kernel thread, hence it is + guaranteed to be single threaded. The driver must employ + additional serializations if this callback routine is + to be called directly from another context, in order to + prevent race condition related issues (e.g. out-of-order + packets.) The dequeued packets will be fully formed + packets (including frame headers). The packets must be + freed by the driver. @param interface The interface being sent on. 
*/ typedef void (*ifnet_start_func)(ifnet_t interface); @@ -742,6 +747,20 @@ typedef void (*ifnet_input_poll_func)(ifnet_t interface, u_int32_t flags, u_int32_t max_count, mbuf_t *first_packet, mbuf_t *last_packet, u_int32_t *cnt, u_int32_t *len); +#ifdef BSD_KERNEL_PRIVATE +struct thread; +typedef errno_t (*ifnet_input_handler_func)(ifnet_t ifp, mbuf_t m_head, + mbuf_t m_tail, const struct ifnet_stat_increment_param *s, + boolean_t poll, struct thread *tp); +typedef errno_t (*ifnet_output_handler_func)(ifnet_t interface, mbuf_t data); + +extern errno_t ifnet_set_input_handler(struct ifnet *ifp, + ifnet_input_handler_func fn); +extern errno_t ifnet_set_output_handler(struct ifnet *ifp, + ifnet_output_handler_func fn); +extern void ifnet_reset_input_handler(struct ifnet *ifp); +extern void ifnet_reset_output_handler(struct ifnet *ifp); +#endif /* BSD_KERNEL_PRIVATE */ /* @enum Interface control commands @abstract Constants defining control commands. @@ -991,7 +1010,7 @@ typedef errno_t (*ifnet_ctl_func)(ifnet_t interface, ifnet_ctl_cmd_t cmd, (in nanosecond.) @field start_delay_qlen The maximum length of output queue for delaying start callback to the driver. This is an - optimization for coalescing output packets. + optimization for coalescing output packets. @field start_delay_timeout The timeout in microseconds to delay start callback. If start_delay_qlen number of packets are not in the output queue when the timer fires, the start @@ -1228,8 +1247,8 @@ extern errno_t ifnet_allocate(const struct ifnet_init_params *init, @param init The initial values for the interface. These values can not be changed after the interface has been allocated. @param interface The interface allocated upon success. 
- @result May return ENOMEM if there is insufficient memory or EEXIST - if an interface with the same uniqueid and family has already + @result May return ENOMEM if there is insufficient memory or EBUSY + if an interface with the same uniqueid/(name + unit) and family has already been allocated and is in use. */ extern errno_t ifnet_allocate_extended(const struct ifnet_init_eparams *init, @@ -1320,6 +1339,36 @@ extern errno_t ifnet_dequeue_service_class(ifnet_t interface, extern errno_t ifnet_dequeue_multi(ifnet_t interface, u_int32_t max, mbuf_t *first_packet, mbuf_t *last_packet, u_int32_t *cnt, u_int32_t *len); +/* + @function ifnet_dequeue_multi_bytes + @discussion Dequeue one or more packets from the output queue of + an interface which implements the new driver output model, + where the scheduling model is set to + IFNET_SCHED_MODEL_NORMAL. The limit is specified in terms + of maximum number of bytes to return. The number of bytes + returned can be slightly higher than the limit so that + packet boundaries can be preserved. + @param interface The interface to dequeue the packets from + @param max_bytes The maximum number of bytes in the packet chain + that may be returned to the caller; this needs to be a + non-zero value for any packet to be returned. + @param first_packet Pointer to the first packet being dequeued + @param last_packet Pointer to the last packet being dequeued + @param cnt Pointer to a storage for the number of bytes dequeued. + Caller may supply NULL if not interested in this value + @param len Pointer to a storage for the total length (in bytes) + of the dequeued packets. Caller may supply NULL if not + interested in this value. 
+ @result May return EINVAL if the parameters are invalid, ENXIO if + the interface doesn't implement the new driver output + model or the output scheduling model isn't + IFNET_SCHED_MODEL_NORMAL, or EAGAIN if there is currently + no packet available to be dequeued + */ +extern errno_t ifnet_dequeue_multi_bytes(ifnet_t interface, + u_int32_t max_bytes, mbuf_t *first_packet, mbuf_t *last_packet, + u_int32_t *cnt, u_int32_t *len); + /* @function ifnet_dequeue_service_class_multi @discussion Dequeue one or more packets of a particular service class @@ -2085,7 +2134,6 @@ extern errno_t ifnet_get_tso_mtu(ifnet_t interface, sa_family_t family, /*! @enum Interface wake properties @abstract Constants defining Interface wake properties. - @discussion @constant IFNET_WAKE_ON_MAGIC_PACKET Wake on Magic Packet. */ enum { @@ -2548,7 +2596,7 @@ u_int32_t packets_out, u_int32_t bytes_out, u_int32_t errors_out); The one exception would be the case where a kext wants to zero all of the counters. @param interface The interface. - @param counts The new stats values. + @param stats The new stats values. @result 0 on success otherwise the errno error. */ extern errno_t ifnet_set_stat(ifnet_t interface, @@ -2710,11 +2758,10 @@ extern void *ifnet_lladdr(ifnet_t interface); @param interface The interface. @param addr A buffer to copy the broadcast address in to. @param bufferlen The length of the buffer at addr. - @param addr_len On return, the length of the broadcast address. - @param lladdr_len The length, in bytes, of the link layer address. + @param out_len On return, the length of the broadcast address. */ extern errno_t ifnet_llbroadcast_copy_bytes(ifnet_t interface, void *addr, - size_t bufferlen, size_t *addr_len); + size_t bufferlen, size_t *out_len); #ifdef KERNEL_PRIVATE /*! @@ -2726,7 +2773,7 @@ extern errno_t ifnet_llbroadcast_copy_bytes(ifnet_t interface, void *addr, changed on. 
@param lladdr A pointer to the raw link layer address (pointer to the 6 byte ethernet address for ethernet). - @param lladdr_len The length, in bytes, of the link layer address. + @param length The length, in bytes, of the link layer address. @param type The link-layer address type. */ extern errno_t ifnet_set_lladdr_and_type(ifnet_t interface, const void *lladdr, @@ -2738,7 +2785,7 @@ extern errno_t ifnet_set_lladdr_and_type(ifnet_t interface, const void *lladdr, @discussion Resolves a multicast address for an attached protocol to a link-layer address. If a link-layer address is passed in, the interface will verify that it is a valid multicast address. - @param interface The interface. + @param ifp The interface. @param proto_addr A protocol address to be converted to a link-layer address. @param ll_addr Storage for the resulting link-layer address. @@ -2802,7 +2849,7 @@ extern errno_t ifnet_remove_multicast(ifmultiaddr_t multicast); ifnet_free_multicast_list will decrement the reference counts and free the array. @param interface The interface. - @param multicasts A pointer to a NULL terminated array of references + @param addresses A pointer to a NULL terminated array of references to the multicast addresses. @result 0 on success otherwise the errno error. */ @@ -2815,7 +2862,6 @@ extern errno_t ifnet_get_multicast_list(ifnet_t interface, ifnet_get_multicast_list. Decrements the refcount on each multicast address and frees the array. @param multicasts An array of references to the multicast addresses. - @result 0 on success otherwise the errno error. */ extern void ifnet_free_multicast_list(ifmultiaddr_t *multicasts); @@ -2824,7 +2870,7 @@ extern void ifnet_free_multicast_list(ifmultiaddr_t *multicasts); @discussion Find an interface by the name including the unit number. Caller must call ifnet_release on any non-null interface return value. 
- @param name The name of the interface, including any unit number + @param ifname The name of the interface, including any unit number (i.e. "en0"). @param interface A pointer to an interface reference. This will be filled in if a matching interface is found. @@ -3173,16 +3219,16 @@ extern errno_t ifnet_get_local_ports(ifnet_t ifp, u_int8_t *bitfield); means all protocols, otherwise PF_INET or PF_INET6. @param flags A bitwise of the following flags: IFNET_GET_LOCAL_PORTS_WILDCARDOK: When bit is set, - the list of local ports should include those that are + the list of local ports should include those that are used by sockets that aren't bound to any local address. IFNET_GET_LOCAL_PORTS_NOWAKEUPOK: When bit is - set, the list of local ports should return all sockets - including the ones that do not need a wakeup from sleep. - Sockets that do not want to wake from sleep are marked + set, the list of local ports should return all sockets + including the ones that do not need a wakeup from sleep. + Sockets that do not want to wake from sleep are marked with a socket option. - IFNET_GET_LOCAL_PORTS_TCPONLY: When bit is set, the list + IFNET_GET_LOCAL_PORTS_TCPONLY: When bit is set, the list of local ports should return the ports used by TCP sockets. - IFNET_GET_LOCAL_PORTS_UDPONLY: When bit is set, the list + IFNET_GET_LOCAL_PORTS_UDPONLY: When bit is set, the list of local ports should return the ports used by UDP sockets. only. IFNET_GET_LOCAL_PORTS_RECVANYIFONLY: When bit is set, the @@ -3258,6 +3304,14 @@ typedef u_int32_t tx_compl_val_t; */ extern errno_t ifnet_tx_compl_status(ifnet_t ifp, mbuf_t m, tx_compl_val_t val); +/* + @function ifnet_tx_compl + @discussion Used to indicate that the packet has been transmitted.
+ @param ifp The interface to which the mbuf was sent + @param m The mbuf that was transmitted +*/ +extern errno_t ifnet_tx_compl(ifnet_t ifp, mbuf_t m); + /******************************************************************************/ /* for interfaces that support dynamic node absence/presence events */ /******************************************************************************/ @@ -3351,26 +3405,80 @@ ifnet_get_delegate(ifnet_t ifp, ifnet_t *pdelegated_ifp); /* for interface keep alive offload support */ /*************************************************************************/ -#define IFNET_KEEPALIVE_OFFLOAD_FRAME_DATA_SIZE 128 +/* + @struct ifnet_keepalive_offload_frame + @discussion This structure is used to define various opportunistic + polling parameters for an interface. + For IPSec and AirPlay UDP keep alive only a subset of the + fields are relevant. + An incoming TCP keep alive probe has the sequence number + in the TCP header equal to "remote_seq" and the + acknowledgment number field is equal to "local_seq". + An incoming TCP keep alive probe has the sequence number + equal to "remote_seq" minus 1 and the acknowledgment number + field is equal to "local_seq". + Note that remote_seq is in network byte order so the value to + match may have to be converted to host byte order when + subtracting 1. + For TCP, the field "interval" corresponds to the socket option + TCP_KEEPALIVE, the field "keep_retry" to TCP_KEEPINTVL and + the field "keep_cnt" to TCP_KEEPCNT. + @field data Keep alive probe to be sent.
+ @field type The type of keep alive frame + @field length The length of the frame in the data field + @field interval Keep alive interval between probes in seconds + @field ether_type Tells whether the protocol is IPv4 or IPv6 + @field keep_cnt Maximum number of times to retry probes (TCP only) + @field keep_retry Interval before retrying if previous probe was not answered (TCP only) + @field reply_length The length of the frame in the reply_data field (TCP only) + @field addr_length Length in bytes of local_addr and remote_addr (TCP only) + @field reply_data Keep alive reply to be sent to incoming probe (TCP only) + @field local_addr Local address: 4 bytes IPv4 or 16 bytes IPv6 address (TCP only) + @field remote_addr Remote address: 4 bytes IPv4 or 16 bytes IPv6 address (TCP only) + @field local_port Local port (TCP only) + @field remote_port Remote port (TCP only) + @field local_seq Local sequence number for matching incoming replies (TCP only) + @field remote_seq Remote sequence number for matching incoming probes or replies (TCP only) +*/ + +#define IFNET_KEEPALIVE_OFFLOAD_FRAME_DATA_SIZE 128 +#define IFNET_KEEPALIVE_OFFLOAD_MAX_ADDR_SIZE 16 + struct ifnet_keepalive_offload_frame { u_int8_t data[IFNET_KEEPALIVE_OFFLOAD_FRAME_DATA_SIZE]; /* data bytes */ -#define IFNET_KEEPALIVE_OFFLOAD_FRAME_IPSEC 0x0 -#define IFNET_KEEPALIVE_OFFLOAD_FRAME_AIRPLAY 0x1 +#define IFNET_KEEPALIVE_OFFLOAD_FRAME_IPSEC 0x0 +#define IFNET_KEEPALIVE_OFFLOAD_FRAME_AIRPLAY 0x1 +#define IFNET_KEEPALIVE_OFFLOAD_FRAME_TCP 0x2 u_int8_t type; /* type of application */ u_int8_t length; /* Number of valid data bytes including offset */ u_int16_t interval; /* Keep alive interval in seconds */ #define IFNET_KEEPALIVE_OFFLOAD_FRAME_ETHERTYPE_IPV4 0x0 #define IFNET_KEEPALIVE_OFFLOAD_FRAME_ETHERTYPE_IPV6 0x1 u_int8_t ether_type; /* Ether type IPv4 or IPv6 */ - u_int8_t __reserved[3]; /* For future */ + u_int8_t keep_cnt; /* max number of times to retry probes */ + u_int16_t keep_retry; /* interval
before retrying if previous probe was not answered */ + u_int8_t reply_length; /* Length of valid reply_data bytes including offset */ + u_int8_t addr_length; /* Length of valid bytes in local_addr and remote_addr */ + u_int8_t reserved[2]; + u_int8_t reply_data[IFNET_KEEPALIVE_OFFLOAD_FRAME_DATA_SIZE]; /* Response packet */ + u_int8_t local_addr[IFNET_KEEPALIVE_OFFLOAD_MAX_ADDR_SIZE]; /* in network byte order */ + u_int8_t remote_addr[IFNET_KEEPALIVE_OFFLOAD_MAX_ADDR_SIZE]; /* in network byte order */ + u_int16_t local_port; /* in host byte order */ + u_int16_t remote_port; /* in host byte order */ + u_int32_t local_seq; /* in host byte order */ + u_int32_t remote_seq; /* in host byte order */ }; /* @function ifnet_get_keepalive_offload_frames @discussion Fills out frames_array with IP packets to send at periodic intervals as Keep-alive or heartbeat messages. - These are UDP datagrams. This can be used to offload - IPSec keep alives. + This can be used to offload keep alives for UDP or TCP. + Note: The frames are returned in this order: first the IPSec + frames, then the AirPlay frames and finally the TCP frames. + If a device does not support one kind of keep alive frame, + it should provide a frames_array large enough to accommodate + the other frames. @param ifp The interface to send frames out on. This is used to select which sockets or IPSec SAs should generate the packets. @@ -3401,7 +3509,7 @@ extern errno_t ifnet_get_keepalive_offload_frames(ifnet_t ifp, length provided by the driver. The contents of the buffer will be read but will not be modified. @param ifp The interface that is generating the report - @param buffer Buffer containing the link specific information + @param buffer Buffer containing the link specific information for this interface. It is the caller's responsibility to free this buffer.
@param buffer_len Valid length of the buffer provided by the caller @@ -3450,10 +3558,63 @@ extern u_int32_t ifnet_packetpreamblelen(ifnet_t interface); */ extern u_int32_t ifnet_maxpacketpreamblelen(void); +/*************************************************************************/ +/* QoS Fastlane */ +/*************************************************************************/ +/*! + @function ifnet_set_fastlane_capable + @param interface The interface. + @param capable Set the truth value that the interface is attached to + a network that is capable of Fastlane QoS marking. + @result Returns 0 on success, error number otherwise. + */ +extern errno_t ifnet_set_fastlane_capable(ifnet_t interface, boolean_t capable); + +/*! + @function ifnet_get_fastlane_capable + @param interface The interface. + @param capable On output contains the truth value that the interface + is attached to a network that is capable of Fastlane QoS marking. + @result Returns 0 on success, error number otherwise. + */ +extern errno_t ifnet_get_fastlane_capable(ifnet_t interface, boolean_t *capable); + +/*! + @function ifnet_get_unsent_bytes + @param interface The interface + @param unsent_bytes An out parameter that contains unsent bytes for + an interface + @result Returns 0 on success, error otherwise. + */ +extern errno_t ifnet_get_unsent_bytes(ifnet_t interface, int64_t *unsent_bytes); + +typedef struct { + int32_t buf_interface; /* data to send at interface */ + int32_t buf_sndbuf; /* data to send at socket buffer */ +} ifnet_buffer_status_t; + +/*! + @function ifnet_get_buffer_status + @param interface The interface + @param buf_status An out parameter that contains unsent bytes + for an interface + @result Returns 0 on success, EINVAL if any of the arguments is + NULL, ENXIO if the interface pointer is invalid + */ +extern errno_t ifnet_get_buffer_status(const ifnet_t interface, + ifnet_buffer_status_t *buf_status); + +/*!
+ @function ifnet_normalise_unsent_data + @discussion + Gathers the unsent bytes on all the interfaces. + This data will be reported to NetworkStatistics. + + */ +extern void ifnet_normalise_unsent_data(void); #endif /* KERNEL_PRIVATE */ __END_DECLS #endif /* __KPI_INTERFACE__ */ - diff --git a/bsd/net/kpi_protocol.h b/bsd/net/kpi_protocol.h index 222696502..b2546d4e3 100644 --- a/bsd/net/kpi_protocol.h +++ b/bsd/net/kpi_protocol.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008-2012 Apple Inc. All rights reserved. + * Copyright (c) 2008-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -90,10 +90,6 @@ extern errno_t proto_register_input(protocol_family_t protocol, to unload until the proto_detached_handler is called. @param protocol The protocol family these functions will receive packets for. - @param input The function called when a packet is input. - @param inject The function to called when a packet is injected (not - on the normal input path). - @result A errno error on failure. */ extern void proto_unregister_input(protocol_family_t protocol); #endif /* BSD_KERNEL_PRIVATE */ @@ -132,7 +128,7 @@ extern errno_t proto_inject(protocol_family_t protocol, mbuf_t packet); interface. A typical protocol plumb function would fill out an ifnet_attach_proto_param and call ifnet_attach_protocol. @param ifp The interface the protocol should be attached to. - @param protocol_family The protocol that should be attached to the + @param protocol The protocol that should be attached to the interface. @result A non-zero value of the attach failed. @@ -145,7 +141,7 @@ typedef errno_t (*proto_plumb_handler)(ifnet_t ifp, protocol_family_t protocol); from an interface. A typical unplumb function would call ifnet_detach_protocol and perform any necessary cleanup. @param ifp The interface the protocol should be detached from. 
- @param protocol_family The protocol that should be detached from the + @param protocol The protocol that should be detached from the interface. */ typedef void (*proto_unplumb_handler)(ifnet_t ifp, protocol_family_t protocol); diff --git a/bsd/net/ndrv.c b/bsd/net/ndrv.c index 34d7504b9..e171b48f3 100644 --- a/bsd/net/ndrv.c +++ b/bsd/net/ndrv.c @@ -2,7 +2,7 @@ * Copyright (c) 1997-2014 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* @@ -134,7 +134,7 @@ ndrv_output(struct mbuf *m, struct socket *so) int result = 0; #if NDRV_DEBUG - kprintf("NDRV output: %x, %x, %x\n", m, so, np); + printf("NDRV output: %x, %x, %x\n", m, so, np); #endif /* @@ -145,15 +145,15 @@ ndrv_output(struct mbuf *m, struct socket *so) /* Unlock before calling ifnet_output */ socket_unlock(so, 0); - + /* * Call DLIL if we can. DLIL is much safer than calling the * ifp directly. 
*/ result = ifnet_output_raw(ifp, np->nd_proto_family, m); - + socket_lock(so, 0); - + return (result); } @@ -217,7 +217,7 @@ ndrv_attach(struct socket *so, int proto, __unused struct proc *p) return(EPERM); #if NDRV_DEBUG - kprintf("NDRV attach: %x, %x, %x\n", so, proto, np); + printf("NDRV attach: %x, %x, %x\n", so, proto, np); #endif if ((error = soreserve(so, ndrv_sendspace, ndrv_recvspace))) @@ -229,7 +229,7 @@ ndrv_attach(struct socket *so, int proto, __unused struct proc *p) so->so_pcb = (caddr_t)np; bzero(np, sizeof(*np)); #if NDRV_DEBUG - kprintf("NDRV attach: %x, %x, %x\n", so, proto, np); + printf("NDRV attach: %x, %x, %x\n", so, proto, np); #endif TAILQ_INIT(&np->nd_dlist); np->nd_signature = NDRV_SIGNATURE; @@ -279,13 +279,13 @@ ndrv_connect(struct socket *so, struct sockaddr *nam, __unused struct proc *p) if (np->nd_faddr) return EISCONN; - + /* Allocate memory to store the remote address */ MALLOC(np->nd_faddr, struct sockaddr_ndrv*, nam->sa_len, M_IFADDR, M_WAITOK); if (np->nd_faddr == NULL) return ENOMEM; - + bcopy((caddr_t) nam, (caddr_t) np->nd_faddr, nam->sa_len); soisconnected(so); return 0; @@ -340,7 +340,7 @@ ndrv_bind(struct socket *so, struct sockaddr *nam, __unused struct proc *p) if (*dname == '\0') return(EINVAL); #if NDRV_DEBUG - kprintf("NDRV bind: %x, %x, %s\n", so, np, dname); + printf("NDRV bind: %x, %x, %s\n", so, np, dname); #endif /* Track down the driver and its ifnet structure. * There's no internal call for this so we have to dup the code @@ -355,7 +355,7 @@ ndrv_bind(struct socket *so, struct sockaddr *nam, __unused struct proc *p) if (ifp == NULL) return(EADDRNOTAVAIL); - + // PPP doesn't support PF_NDRV. 
if (ifnet_family(ifp) != APPLE_IF_FAM_PPP) { @@ -364,7 +364,7 @@ ndrv_bind(struct socket *so, struct sockaddr *nam, __unused struct proc *p) result = 0; bzero(&ndrv_proto, sizeof(ndrv_proto)); ndrv_proto.event = ndrv_event; - + /* We aren't worried about double attaching, that should just return an error */ socket_unlock(so, 0); result = ifnet_attach_protocol(ifp, PF_NDRV, &ndrv_proto); @@ -377,11 +377,11 @@ ndrv_bind(struct socket *so, struct sockaddr *nam, __unused struct proc *p) else { np->nd_proto_family = 0; } - + np->nd_if = ifp; np->nd_family = ifnet_family(ifp); np->nd_unit = ifnet_unit(ifp); - + return(0); } @@ -495,7 +495,7 @@ ndrv_ctloutput(struct socket *so, struct sockopt *sopt) { struct ndrv_cb *np = sotondrvcb(so); int error = 0; - + switch(sopt->sopt_name) { case NDRV_DELDMXSPEC: /* Delete current spec */ @@ -537,7 +537,7 @@ ndrv_do_detach(struct ndrv_cb *np) struct ifnet * ifp; #if NDRV_DEBUG - kprintf("NDRV detach: %x, %x\n", so, np); + printf("NDRV detach: %x, %x\n", so, np); #endif ndrv_remove_all_multicast(np); @@ -552,7 +552,7 @@ ndrv_do_detach(struct ndrv_cb *np) ifnet_detach_protocol(ifp, proto_family); socket_lock(so, 0); } - + /* Check if this is the last socket attached to this interface */ TAILQ_FOREACH(cur_np, &ndrvl, nd_next) { if (cur_np->nd_family == np->nd_family && @@ -560,7 +560,7 @@ ndrv_do_detach(struct ndrv_cb *np) break; } } - + /* If there are no other interfaces, detach PF_NDRV from the interface */ if (cur_np == NULL) { socket_unlock(so, 0); @@ -584,7 +584,7 @@ ndrv_do_disconnect(struct ndrv_cb *np) { struct socket * so = np->nd_socket; #if NDRV_DEBUG - kprintf("NDRV disconnect: %x\n", np); + printf("NDRV disconnect: %x\n", np); #endif if (np->nd_faddr) { @@ -633,7 +633,7 @@ static int name_cmp(struct ifnet *ifp, char *q) r += len; sprint_d(ifnet_unit(ifp), r, IFNAMSIZ-(r-buf)); #if NDRV_DEBUG - kprintf("Comparing %s, %s\n", buf, q); + printf("Comparing %s, %s\n", buf, q); #endif return(strncmp(buf, q, IFNAMSIZ)); } @@ 
-657,7 +657,7 @@ ndrv_flushq(struct ifqueue *q) m_freem(m); } } -#endif +#endif int ndrv_setspec(struct ndrv_cb *np, struct sockopt *sopt) @@ -666,9 +666,9 @@ ndrv_setspec(struct ndrv_cb *np, struct sockopt *sopt) struct ndrv_protocol_desc ndrvSpec; struct ndrv_demux_desc* ndrvDemux = NULL; int error = 0; - struct socket * so = np->nd_socket; + struct socket * so = np->nd_socket; user_addr_t user_addr; - + /* Sanity checking */ if (np->nd_proto_family != PF_NDRV) return EBUSY; @@ -681,7 +681,7 @@ ndrv_setspec(struct ndrv_cb *np, struct sockopt *sopt) if (sopt->sopt_valsize != sizeof(ndrvSpec64)) return EINVAL; - + error = sooptcopyin(sopt, &ndrvSpec64, sizeof(ndrvSpec64), sizeof(ndrvSpec64)); if (error != 0) return error; @@ -697,7 +697,7 @@ ndrv_setspec(struct ndrv_cb *np, struct sockopt *sopt) if (sopt->sopt_valsize != sizeof(ndrvSpec32)) return EINVAL; - + error = sooptcopyin(sopt, &ndrvSpec32, sizeof(ndrvSpec32), sizeof(ndrvSpec32)); if (error != 0) return error; @@ -708,7 +708,7 @@ ndrv_setspec(struct ndrv_cb *np, struct sockopt *sopt) user_addr = CAST_USER_ADDR_T(ndrvSpec32.demux_list); } - + /* Verify the parameter */ if (ndrvSpec.version > NDRV_PROTOCOL_DESC_VERS) return ENOTSUP; // version is too new! 
@@ -716,23 +716,23 @@ ndrv_setspec(struct ndrv_cb *np, struct sockopt *sopt) return EINVAL; // version is not valid else if (ndrvSpec.demux_count > NDRV_PROTODEMUX_COUNT || ndrvSpec.demux_count == 0) return EINVAL; // demux_count is not valid - + bzero(&proto_param, sizeof(proto_param)); proto_param.demux_count = ndrvSpec.demux_count; - + /* Allocate storage for demux array */ MALLOC(ndrvDemux, struct ndrv_demux_desc*, proto_param.demux_count * sizeof(struct ndrv_demux_desc), M_TEMP, M_WAITOK); if (ndrvDemux == NULL) return ENOMEM; - + /* Allocate enough ifnet_demux_descs */ MALLOC(proto_param.demux_array, struct ifnet_demux_desc*, sizeof(*proto_param.demux_array) * ndrvSpec.demux_count, M_TEMP, M_WAITOK); if (proto_param.demux_array == NULL) error = ENOMEM; - + if (error == 0) { /* Copy the ndrv demux array from userland */ @@ -740,16 +740,16 @@ ndrv_setspec(struct ndrv_cb *np, struct sockopt *sopt) ndrvSpec.demux_count * sizeof(struct ndrv_demux_desc)); ndrvSpec.demux_list = ndrvDemux; } - + if (error == 0) { /* At this point, we've at least got enough bytes to start looking around */ u_int32_t demuxOn = 0; - + proto_param.demux_count = ndrvSpec.demux_count; proto_param.input = ndrv_input; proto_param.event = ndrv_event; - + for (demuxOn = 0; demuxOn < ndrvSpec.demux_count; demuxOn++) { /* Convert an ndrv_demux_desc to a ifnet_demux_desc */ @@ -759,7 +759,7 @@ ndrv_setspec(struct ndrv_cb *np, struct sockopt *sopt) break; } } - + if (error == 0) { /* We've got all our ducks lined up...lets attach! 
*/ @@ -770,13 +770,13 @@ ndrv_setspec(struct ndrv_cb *np, struct sockopt *sopt) if (error == 0) np->nd_proto_family = ndrvSpec.protocol_family; } - + /* Free any memory we've allocated */ if (proto_param.demux_array) FREE(proto_param.demux_array, M_TEMP); if (ndrvDemux) FREE(ndrvDemux, M_TEMP); - + return error; } @@ -785,22 +785,22 @@ int ndrv_to_ifnet_demux(struct ndrv_demux_desc* ndrv, struct ifnet_demux_desc* ifdemux) { bzero(ifdemux, sizeof(*ifdemux)); - + if (ndrv->type < DLIL_DESC_ETYPE2) { /* using old "type", not supported */ return ENOTSUP; } - + if (ndrv->length > 28) { return EINVAL; } - + ifdemux->type = ndrv->type; ifdemux->data = ndrv->data.other; ifdemux->datalen = ndrv->length; - + return 0; } @@ -808,15 +808,15 @@ int ndrv_delspec(struct ndrv_cb *np) { int result = 0; - + if (np->nd_proto_family == PF_NDRV || np->nd_proto_family == 0) return EINVAL; - + /* Detach the protocol */ result = ifnet_detach_protocol(np->nd_if, np->nd_proto_family); np->nd_proto_family = PF_NDRV; - + return result; } @@ -824,16 +824,16 @@ struct ndrv_cb * ndrv_find_inbound(struct ifnet *ifp, u_int32_t protocol) { struct ndrv_cb* np; - + if (protocol == PF_NDRV) return NULL; - + TAILQ_FOREACH(np, &ndrvl, nd_next) { if (np->nd_proto_family == protocol && np->nd_if == ifp) { return np; } } - + return NULL; } @@ -843,7 +843,7 @@ ndrv_handle_ifp_detach(u_int32_t family, short unit) struct ndrv_cb* np; struct ifnet *ifp = NULL; struct socket *so; - + /* Find all sockets using this interface. 
*/ TAILQ_FOREACH(np, &ndrvl, nd_next) { if (np->nd_family == family && @@ -854,23 +854,23 @@ ndrv_handle_ifp_detach(u_int32_t family, short unit) ifp = np->nd_if; if (np->nd_proto_family != 0) ndrv_delspec(np); - + /* Delete the multicasts first */ ndrv_remove_all_multicast(np); - + /* Disavow all knowledge of the ifp */ np->nd_if = NULL; np->nd_unit = 0; np->nd_family = 0; - - so = np->nd_socket; + + so = np->nd_socket; /* Make sure sending returns an error */ lck_mtx_assert(ndrvdomain->dom_mtx, LCK_MTX_ASSERT_OWNED); socantsendmore(so); socantrcvmore(so); } } - + /* Unregister our protocol */ if (ifp) { ifnet_detach_protocol(ifp, PF_NDRV); @@ -882,7 +882,7 @@ ndrv_do_add_multicast(struct ndrv_cb *np, struct sockopt *sopt) { struct ndrv_multiaddr* ndrv_multi; int result; - + if (sopt->sopt_val == 0 || sopt->sopt_valsize < 2 || sopt->sopt_level != SOL_NDRVPROTO || sopt->sopt_valsize > SOCK_MAXADDRLEN) return EINVAL; @@ -890,30 +890,30 @@ ndrv_do_add_multicast(struct ndrv_cb *np, struct sockopt *sopt) return ENXIO; if (!(np->nd_dlist_cnt < ndrv_multi_max_count)) return EPERM; - + // Allocate storage MALLOC(ndrv_multi, struct ndrv_multiaddr*, sizeof(struct ndrv_multiaddr) - sizeof(struct sockaddr) + sopt->sopt_valsize, M_IFADDR, M_WAITOK); if (ndrv_multi == NULL) return ENOMEM; - + // Copy in the address result = copyin(sopt->sopt_val, &ndrv_multi->addr, sopt->sopt_valsize); - + // Validate the sockaddr if (result == 0 && sopt->sopt_valsize != ndrv_multi->addr.sa_len) result = EINVAL; - + if (result == 0 && ndrv_have_multicast(np, &ndrv_multi->addr)) result = EEXIST; - + if (result == 0) { // Try adding the multicast result = ifnet_add_multicast(np->nd_if, &ndrv_multi->addr, &ndrv_multi->ifma); } - + if (result == 0) { // Add to our linked list @@ -926,7 +926,7 @@ ndrv_do_add_multicast(struct ndrv_cb *np, struct sockopt *sopt) // Free up the memory, something went wrong FREE(ndrv_multi, M_IFADDR); } - + return result; } @@ -936,48 +936,48 @@ 
ndrv_do_remove_multicast(struct ndrv_cb *np, struct sockopt *sopt) struct sockaddr* multi_addr; struct ndrv_multiaddr* ndrv_entry = NULL; int result; - + if (sopt->sopt_val == 0 || sopt->sopt_valsize < 2 || sopt->sopt_level != SOL_NDRVPROTO) return EINVAL; if (np->nd_if == NULL || np->nd_dlist_cnt == 0) return ENXIO; - + // Allocate storage MALLOC(multi_addr, struct sockaddr*, sopt->sopt_valsize, M_TEMP, M_WAITOK); if (multi_addr == NULL) return ENOMEM; - + // Copy in the address result = copyin(sopt->sopt_val, multi_addr, sopt->sopt_valsize); - + // Validate the sockaddr if (result == 0 && sopt->sopt_valsize != multi_addr->sa_len) result = EINVAL; - + if (result == 0) { /* Find the old entry */ ndrv_entry = ndrv_have_multicast(np, multi_addr); - + if (ndrv_entry == NULL) result = ENOENT; } - + if (result == 0) { // Try deleting the multicast result = ifnet_remove_multicast(ndrv_entry->ifma); } - + if (result == 0) { // Remove from our linked list struct ndrv_multiaddr* cur = np->nd_multiaddrs; - + ifmaddr_release(ndrv_entry->ifma); - + if (cur == ndrv_entry) { np->nd_multiaddrs = cur->next; @@ -993,14 +993,14 @@ ndrv_do_remove_multicast(struct ndrv_cb *np, struct sockopt *sopt) } } } - + np->nd_dlist_cnt--; - + // Free the memory FREE(ndrv_entry, M_IFADDR); } FREE(multi_addr, M_TEMP); - + return result; } @@ -1010,7 +1010,7 @@ ndrv_have_multicast(struct ndrv_cb *np, struct sockaddr* inAddr) struct ndrv_multiaddr* cur; for (cur = np->nd_multiaddrs; cur != NULL; cur = cur->next) { - + if ((inAddr->sa_len == cur->addr.sa_len) && (bcmp(&cur->addr, inAddr, inAddr->sa_len) == 0)) { @@ -1018,7 +1018,7 @@ ndrv_have_multicast(struct ndrv_cb *np, struct sockaddr* inAddr) return cur; } } - + return NULL; } @@ -1026,14 +1026,14 @@ static void ndrv_remove_all_multicast(struct ndrv_cb* np) { struct ndrv_multiaddr* cur; - + if (np->nd_if != NULL) { while (np->nd_multiaddrs != NULL) { cur = np->nd_multiaddrs; np->nd_multiaddrs = cur->next; - + ifnet_remove_multicast(cur->ifma); 
ifmaddr_release(cur->ifma); FREE(cur, M_IFADDR); diff --git a/bsd/net/necp.c b/bsd/net/necp.c index 6da23d1a3..9ada6fe8a 100644 --- a/bsd/net/necp.c +++ b/bsd/net/necp.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2015 Apple Inc. All rights reserved. + * Copyright (c) 2013-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -29,7 +29,6 @@ #include #include #include -#include #include #include #include @@ -49,6 +48,7 @@ #include #include #include +#include #include #include #include @@ -57,6 +57,7 @@ #include #include #include +#include #include #include @@ -105,7 +106,7 @@ * list is traversed to create optimized sub-lists ("Maps") which are used during * data-path evaluation. IP policies are sent into necp_kernel_ip_output_policies_map, * which hashes incoming packets based on marked socket-layer policies, and removes - * duplicate or overlapping polcies. Socket policies are sent into two maps, + * duplicate or overlapping policies. Socket policies are sent into two maps, * necp_kernel_socket_policies_map and necp_kernel_socket_policies_app_layer_map. * The app layer map is used for policy checks coming in from user space, and is one * list with duplicate and overlapping policies removed. The socket map hashes based @@ -137,14 +138,6 @@ u_int32_t necp_debug = 0; // 0=None, 1=Basic, 2=EveryMatch u_int32_t necp_session_count = 0; -#define NECPLOG(level, format, ...) do { \ - log((level > LOG_NOTICE ? LOG_NOTICE : level), "%s: " format "\n", __FUNCTION__, __VA_ARGS__); \ -} while (0) - -#define NECPLOG0(level, msg) do { \ - log((level > LOG_NOTICE ? 
LOG_NOTICE : level), "%s: %s\n", __FUNCTION__, msg); \ -} while (0) - #define LIST_INSERT_SORTED_ASCENDING(head, elm, field, sortfield, tmpelm) do { \ if (LIST_EMPTY((head)) || (LIST_FIRST(head)->sortfield >= (elm)->sortfield)) { \ LIST_INSERT_HEAD((head), elm, field); \ @@ -184,25 +177,32 @@ u_int32_t necp_session_count = 0; } \ } while (0) -#define NECP_KERNEL_CONDITION_ALL_INTERFACES 0x00001 -#define NECP_KERNEL_CONDITION_BOUND_INTERFACE 0x00002 -#define NECP_KERNEL_CONDITION_PROTOCOL 0x00004 -#define NECP_KERNEL_CONDITION_LOCAL_START 0x00008 -#define NECP_KERNEL_CONDITION_LOCAL_END 0x00010 -#define NECP_KERNEL_CONDITION_LOCAL_PREFIX 0x00020 -#define NECP_KERNEL_CONDITION_REMOTE_START 0x00040 -#define NECP_KERNEL_CONDITION_REMOTE_END 0x00080 -#define NECP_KERNEL_CONDITION_REMOTE_PREFIX 0x00100 -#define NECP_KERNEL_CONDITION_APP_ID 0x00200 -#define NECP_KERNEL_CONDITION_REAL_APP_ID 0x00400 -#define NECP_KERNEL_CONDITION_DOMAIN 0x00800 -#define NECP_KERNEL_CONDITION_ACCOUNT_ID 0x01000 -#define NECP_KERNEL_CONDITION_POLICY_ID 0x02000 -#define NECP_KERNEL_CONDITION_PID 0x04000 -#define NECP_KERNEL_CONDITION_UID 0x08000 -#define NECP_KERNEL_CONDITION_LAST_INTERFACE 0x10000 // Only set from packets looping between interfaces -#define NECP_KERNEL_CONDITION_TRAFFIC_CLASS 0x20000 -#define NECP_KERNEL_CONDITION_ENTITLEMENT 0x40000 +#define IS_NECP_ROUTE_RULE_ALLOW_OR_DENY(x) ((x) == NECP_ROUTE_RULE_DENY_INTERFACE || (x) == NECP_ROUTE_RULE_ALLOW_INTERFACE) + +#define NECP_KERNEL_CONDITION_ALL_INTERFACES 0x00001 +#define NECP_KERNEL_CONDITION_BOUND_INTERFACE 0x00002 +#define NECP_KERNEL_CONDITION_PROTOCOL 0x00004 +#define NECP_KERNEL_CONDITION_LOCAL_START 0x00008 +#define NECP_KERNEL_CONDITION_LOCAL_END 0x00010 +#define NECP_KERNEL_CONDITION_LOCAL_PREFIX 0x00020 +#define NECP_KERNEL_CONDITION_REMOTE_START 0x00040 +#define NECP_KERNEL_CONDITION_REMOTE_END 0x00080 +#define NECP_KERNEL_CONDITION_REMOTE_PREFIX 0x00100 +#define NECP_KERNEL_CONDITION_APP_ID 0x00200 +#define 
NECP_KERNEL_CONDITION_REAL_APP_ID 0x00400 +#define NECP_KERNEL_CONDITION_DOMAIN 0x00800 +#define NECP_KERNEL_CONDITION_ACCOUNT_ID 0x01000 +#define NECP_KERNEL_CONDITION_POLICY_ID 0x02000 +#define NECP_KERNEL_CONDITION_PID 0x04000 +#define NECP_KERNEL_CONDITION_UID 0x08000 +#define NECP_KERNEL_CONDITION_LAST_INTERFACE 0x10000 // Only set from packets looping between interfaces +#define NECP_KERNEL_CONDITION_TRAFFIC_CLASS 0x20000 +#define NECP_KERNEL_CONDITION_ENTITLEMENT 0x40000 +#define NECP_KERNEL_CONDITION_CUSTOM_ENTITLEMENT 0x80000 + +#define NECP_MAX_POLICY_RESULT_SIZE 512 +#define NECP_MAX_ROUTE_RULES_ARRAY_SIZE 1024 +#define NECP_MAX_CONDITIONS_ARRAY_SIZE 4096 struct necp_service_registration { LIST_ENTRY(necp_service_registration) session_chain; @@ -307,11 +307,15 @@ static void necp_handle_policy_delete(struct necp_session *session, u_int32_t me static void necp_handle_policy_apply_all(struct necp_session *session, u_int32_t message_id, mbuf_t packet, int offset); static void necp_handle_policy_list_all(struct necp_session *session, u_int32_t message_id, mbuf_t packet, int offset); static void necp_handle_policy_delete_all(struct necp_session *session, u_int32_t message_id, mbuf_t packet, int offset); +static void necp_handle_policy_dump_all(struct necp_session *session, u_int32_t message_id, mbuf_t packet, int offset); static void necp_handle_set_session_priority(struct necp_session *session, u_int32_t message_id, mbuf_t packet, int offset); static void necp_handle_lock_session_to_proc(struct necp_session *session, u_int32_t message_id, mbuf_t packet, int offset); static void necp_handle_register_service(struct necp_session *session, u_int32_t message_id, mbuf_t packet, int offset); static void necp_handle_unregister_service(struct necp_session *session, u_int32_t message_id, mbuf_t packet, int offset); +#define MAX_RESULT_STRING_LEN 64 +static inline const char * necp_get_result_description(char *result_string, necp_kernel_policy_result result, 
necp_kernel_policy_result_parameter result_parameter); + static struct necp_session_policy *necp_policy_create(struct necp_session *session, necp_policy_order order, u_int8_t *conditions_array, u_int32_t conditions_array_size, u_int8_t *route_rules_array, u_int32_t route_rules_array_size, u_int8_t *result, u_int32_t result_size); static struct necp_session_policy *necp_policy_find(struct necp_session *session, necp_policy_id policy_id); static bool necp_policy_mark_for_deletion(struct necp_session *session, struct necp_session_policy *policy); @@ -319,11 +323,11 @@ static bool necp_policy_mark_all_for_deletion(struct necp_session *session); static bool necp_policy_delete(struct necp_session *session, struct necp_session_policy *policy); static void necp_policy_apply_all(struct necp_session *session); -static necp_kernel_policy_id necp_kernel_socket_policy_add(necp_policy_id parent_policy_id, necp_policy_order order, u_int32_t session_order, int session_pid, u_int32_t condition_mask, u_int32_t condition_negated_mask, necp_app_id cond_app_id, necp_app_id cond_real_app_id, u_int32_t cond_account_id, char *domain, pid_t cond_pid, uid_t cond_uid, ifnet_t cond_bound_interface, struct necp_policy_condition_tc_range cond_traffic_class, u_int16_t cond_protocol, union necp_sockaddr_union *cond_local_start, union necp_sockaddr_union *cond_local_end, u_int8_t cond_local_prefix, union necp_sockaddr_union *cond_remote_start, union necp_sockaddr_union *cond_remote_end, u_int8_t cond_remote_prefix, necp_kernel_policy_result result, necp_kernel_policy_result_parameter result_parameter); +static necp_kernel_policy_id necp_kernel_socket_policy_add(necp_policy_id parent_policy_id, necp_policy_order order, u_int32_t session_order, int session_pid, u_int32_t condition_mask, u_int32_t condition_negated_mask, necp_app_id cond_app_id, necp_app_id cond_real_app_id, char *cond_custom_entitlement, u_int32_t cond_account_id, char *domain, pid_t cond_pid, uid_t cond_uid, ifnet_t 
cond_bound_interface, struct necp_policy_condition_tc_range cond_traffic_class, u_int16_t cond_protocol, union necp_sockaddr_union *cond_local_start, union necp_sockaddr_union *cond_local_end, u_int8_t cond_local_prefix, union necp_sockaddr_union *cond_remote_start, union necp_sockaddr_union *cond_remote_end, u_int8_t cond_remote_prefix, necp_kernel_policy_result result, necp_kernel_policy_result_parameter result_parameter); static bool necp_kernel_socket_policy_delete(necp_kernel_policy_id policy_id); static bool necp_kernel_socket_policies_reprocess(void); static bool necp_kernel_socket_policies_update_uuid_table(void); -static inline struct necp_kernel_socket_policy *necp_socket_find_policy_match_with_info_locked(struct necp_kernel_socket_policy **policy_search_array, struct necp_socket_info *info, necp_kernel_policy_filter *return_filter, u_int32_t *return_route_rule_id, necp_kernel_policy_result *return_service_action, necp_kernel_policy_service *return_service, u_int32_t *return_netagent_array, size_t netagent_array_count); +static inline struct necp_kernel_socket_policy *necp_socket_find_policy_match_with_info_locked(struct necp_kernel_socket_policy **policy_search_array, struct necp_socket_info *info, necp_kernel_policy_filter *return_filter, u_int32_t *return_route_rule_id, necp_kernel_policy_result *return_service_action, necp_kernel_policy_service *return_service, u_int32_t *return_netagent_array, size_t netagent_array_count, proc_t proc); static necp_kernel_policy_id necp_kernel_ip_output_policy_add(necp_policy_id parent_policy_id, necp_policy_order order, necp_policy_order suborder, u_int32_t session_order, int session_pid, u_int32_t condition_mask, u_int32_t condition_negated_mask, necp_kernel_policy_id cond_policy_id, ifnet_t cond_bound_interface, u_int32_t cond_last_interface_index, u_int16_t cond_protocol, union necp_sockaddr_union *cond_local_start, union necp_sockaddr_union *cond_local_end, u_int8_t cond_local_prefix, union necp_sockaddr_union 
*cond_remote_start, union necp_sockaddr_union *cond_remote_end, u_int8_t cond_remote_prefix, necp_kernel_policy_result result, necp_kernel_policy_result_parameter result_parameter); static bool necp_kernel_ip_output_policy_delete(necp_kernel_policy_id policy_id); @@ -352,6 +356,7 @@ static LIST_HEAD(necp_uuid_id_mapping_head, necp_uuid_id_mapping) *necp_uuid_app #define APPUUIDHASH(uuid) (&necp_uuid_app_id_hashtbl[uuid[0] & necp_uuid_app_id_hash_mask]) // Assume first byte of UUIDs are evenly distributed static u_int32_t necp_create_uuid_app_id_mapping(uuid_t uuid, bool *allocated_mapping, bool uuid_policy_table); static bool necp_remove_uuid_app_id_mapping(uuid_t uuid, bool *removed_mapping, bool uuid_policy_table); +static struct necp_uuid_id_mapping *necp_uuid_lookup_uuid_with_app_id_locked(u_int32_t local_id); static struct necp_uuid_id_mapping *necp_uuid_lookup_service_id_locked(uuid_t uuid); static struct necp_uuid_id_mapping *necp_uuid_lookup_uuid_with_service_id_locked(u_int32_t local_id); @@ -367,12 +372,15 @@ struct necp_string_id_mapping { static LIST_HEAD(necp_string_id_mapping_list, necp_string_id_mapping) necp_account_id_list; static u_int32_t necp_create_string_to_id_mapping(struct necp_string_id_mapping_list *list, char *domain); static bool necp_remove_string_to_id_mapping(struct necp_string_id_mapping_list *list, char *domain); +static struct necp_string_id_mapping *necp_lookup_string_with_id_locked(struct necp_string_id_mapping_list *list, u_int32_t local_id); static LIST_HEAD(_necp_kernel_service_list, necp_service_registration) necp_registered_service_list; static char *necp_create_trimmed_domain(char *string, size_t length); static inline int necp_count_dots(char *string, size_t length); +static char *necp_copy_string(char *string, size_t length); + #define ROUTE_RULE_IS_AGGREGATE(ruleid) (ruleid > UINT16_MAX) #define MAX_ROUTE_RULE_INTERFACES 10 @@ -391,7 +399,7 @@ struct necp_route_rule { static LIST_HEAD(necp_route_rule_list, 
necp_route_rule) necp_route_rules; static u_int32_t necp_create_route_rule(struct necp_route_rule_list *list, u_int8_t *route_rules_array, u_int32_t route_rules_array_size); static bool necp_remove_route_rule(struct necp_route_rule_list *list, u_int32_t route_rule_id); -static bool necp_route_is_allowed(struct rtentry *route, ifnet_t interface, u_int32_t route_rule_id, bool *cellular_denied); +static bool necp_route_is_allowed(struct rtentry *route, ifnet_t interface, u_int32_t route_rule_id, u_int32_t *interface_type_denied); static struct necp_route_rule *necp_lookup_route_rule_locked(struct necp_route_rule_list *list, u_int32_t route_rule_id); #define MAX_AGGREGATE_ROUTE_RULES 16 @@ -453,7 +461,6 @@ sysctl_handle_necp_level SYSCTL_HANDLER_ARGS return (error); } - // Kernel Control functions static errno_t necp_register_control(void); static errno_t necp_ctl_connect(kern_ctl_ref kctlref, struct sockaddr_ctl *sac, void **unitinfo); @@ -521,6 +528,8 @@ necp_init(void) lck_rw_init(&necp_route_rule_lock, necp_route_rule_mtx_grp, necp_route_rule_mtx_attr); + necp_client_init(); + LIST_INIT(&necp_kernel_socket_policies); LIST_INIT(&necp_kernel_ip_output_policies); @@ -723,7 +732,7 @@ necp_packet_get_tlv_at_offset(mbuf_t packet, int tlv_offset, u_int32_t buff_len, u_int32_t length; if (tlv_offset < 0) { - return (error); + return (EINVAL); } error = mbuf_copydata(packet, tlv_offset + sizeof(u_int8_t), sizeof(length), &length); @@ -776,7 +785,32 @@ necp_buffer_write_packet_header(u_int8_t *buffer, u_int8_t packet_type, u_int8_t return (buffer + sizeof(struct necp_packet_header)); } -static u_int8_t * + +u_int8_t * +necp_buffer_write_tlv_if_different(u_int8_t *buffer, const u_int8_t *max, u_int8_t type, + u_int32_t length, const void *value, bool *updated) +{ + u_int8_t *next_tlv = (u_int8_t *)(buffer + sizeof(type) + sizeof(length) + length); + if (next_tlv <= max) { + if (*updated || *(u_int8_t *)(buffer) != type) { + *(u_int8_t *)(buffer) = type; + *updated = TRUE; + } 
+ if (*updated || *(u_int32_t *)(void *)(buffer + sizeof(type)) != length) { + *(u_int32_t *)(void *)(buffer + sizeof(type)) = length; + *updated = TRUE; + } + if (length > 0) { + if (*updated || memcmp((u_int8_t *)(buffer + sizeof(type) + sizeof(length)), value, length) != 0) { + memcpy((u_int8_t *)(buffer + sizeof(type) + sizeof(length)), value, length); + *updated = TRUE; + } + } + } + return (next_tlv); +} + +u_int8_t * necp_buffer_write_tlv(u_int8_t *buffer, u_int8_t type, u_int32_t length, const void *value) { *(u_int8_t *)(buffer) = type; @@ -788,7 +822,7 @@ necp_buffer_write_tlv(u_int8_t *buffer, u_int8_t type, u_int32_t length, const v return ((u_int8_t *)(buffer + sizeof(type) + sizeof(length) + length)); } -static u_int8_t +u_int8_t necp_buffer_get_tlv_type(u_int8_t *buffer, int tlv_offset) { u_int8_t *type = NULL; @@ -801,7 +835,7 @@ necp_buffer_get_tlv_type(u_int8_t *buffer, int tlv_offset) return (type ? *type : 0); } -static u_int32_t +u_int32_t necp_buffer_get_tlv_length(u_int8_t *buffer, int tlv_offset) { u_int32_t *length = NULL; @@ -814,7 +848,7 @@ necp_buffer_get_tlv_length(u_int8_t *buffer, int tlv_offset) return (length ? 
*length : 0); } -static u_int8_t * +u_int8_t * necp_buffer_get_tlv_value(u_int8_t *buffer, int tlv_offset, u_int32_t *value_size) { u_int8_t *value = NULL; @@ -831,7 +865,7 @@ necp_buffer_get_tlv_value(u_int8_t *buffer, int tlv_offset, u_int32_t *value_siz return (value); } -static int +int necp_buffer_find_tlv(u_int8_t *buffer, u_int32_t buffer_length, int offset, u_int8_t type, int next) { if (offset < 0) { @@ -1015,6 +1049,10 @@ necp_ctl_send(kern_ctl_ref kctlref, u_int32_t unit, void *unitinfo, mbuf_t packe necp_handle_policy_delete_all(session, header.message_id, packet, sizeof(header)); break; } + case NECP_PACKET_TYPE_POLICY_DUMP_ALL: { + necp_handle_policy_dump_all(session, header.message_id, packet, sizeof(header)); + break; + } case NECP_PACKET_TYPE_SET_SESSION_PRIORITY: { necp_handle_set_session_priority(session, header.message_id, packet, sizeof(header)); break; @@ -1117,6 +1155,7 @@ necp_delete_session(struct necp_session *session) } // Session Policy Management + static inline u_int8_t necp_policy_result_get_type_from_buffer(u_int8_t *buffer, u_int32_t length) { @@ -1145,6 +1184,18 @@ necp_policy_result_requires_route_rules(u_int8_t *buffer, u_int32_t length) return (FALSE); } +static inline bool +necp_address_is_valid(struct sockaddr *address) +{ + if (address->sa_family == AF_INET) { + return (address->sa_len == sizeof(struct sockaddr_in)); + } else if (address->sa_family == AF_INET6) { + return (address->sa_len == sizeof(struct sockaddr_in6)); + } else { + return (FALSE); + } +} + static bool necp_policy_result_is_valid(u_int8_t *buffer, u_int32_t length) { @@ -1257,8 +1308,7 @@ static inline bool necp_policy_condition_requires_application(u_int8_t *buffer, u_int32_t length) { u_int8_t type = necp_policy_condition_get_type_from_buffer(buffer, length); - return (type == NECP_POLICY_CONDITION_REAL_APPLICATION || - type == NECP_POLICY_CONDITION_ENTITLEMENT); + return (type == NECP_POLICY_CONDITION_REAL_APPLICATION); } static bool @@ -1333,14 +1383,17 
@@ necp_policy_condition_is_valid(u_int8_t *buffer, u_int32_t length, u_int8_t poli } case NECP_POLICY_CONDITION_LOCAL_ADDR: case NECP_POLICY_CONDITION_REMOTE_ADDR: { - if (!result_cannot_have_ip_layer && condition_length >= sizeof(struct necp_policy_condition_addr)) { + if (!result_cannot_have_ip_layer && condition_length >= sizeof(struct necp_policy_condition_addr) && + necp_address_is_valid(&((struct necp_policy_condition_addr *)(void *)condition_value)->address.sa)) { validated = TRUE; } break; } case NECP_POLICY_CONDITION_LOCAL_ADDR_RANGE: case NECP_POLICY_CONDITION_REMOTE_ADDR_RANGE: { - if (!result_cannot_have_ip_layer && condition_length >= sizeof(struct necp_policy_condition_addr_range)) { + if (!result_cannot_have_ip_layer && condition_length >= sizeof(struct necp_policy_condition_addr_range) && + necp_address_is_valid(&((struct necp_policy_condition_addr_range *)(void *)condition_value)->start_address.sa) && + necp_address_is_valid(&((struct necp_policy_condition_addr_range *)(void *)condition_value)->end_address.sa)) { validated = TRUE; } break; @@ -1379,6 +1432,10 @@ necp_policy_route_rule_is_valid(u_int8_t *buffer, u_int32_t length) validated = TRUE; break; } + case NECP_ROUTE_RULE_QOS_MARKING: { + validated = TRUE; + break; + } default: { validated = FALSE; break; @@ -1488,7 +1545,7 @@ necp_handle_register_service(struct necp_session *session, u_int32_t message_id, response_error = NECP_ERROR_INTERNAL; goto fail; } - + lck_rw_lock_exclusive(&necp_kernel_policy_lock); memset(new_service, 0, sizeof(*new_service)); new_service->service_id = necp_create_uuid_service_id_mapping(service_uuid); @@ -1589,6 +1646,11 @@ necp_handle_policy_add(struct necp_session *session, u_int32_t message_id, mbuf_ response_error = NECP_ERROR_INVALID_TLV; goto fail; } + if (policy_result_size > NECP_MAX_POLICY_RESULT_SIZE) { + NECPLOG(LOG_ERR, "Policy result length too large: %u", policy_result_size); + response_error = NECP_ERROR_INVALID_TLV; + goto fail; + } 
MALLOC(policy_result, u_int8_t *, policy_result_size, M_NECP, M_WAITOK); if (policy_result == NULL) { NECPLOG(LOG_ERR, "Failed to allocate a policy result buffer (size %d)", policy_result_size); @@ -1624,7 +1686,11 @@ necp_handle_policy_add(struct necp_session *session, u_int32_t message_id, mbuf_ response_error = NECP_ERROR_INVALID_TLV; goto fail; } - + if (route_rules_array_size > NECP_MAX_ROUTE_RULES_ARRAY_SIZE) { + NECPLOG(LOG_ERR, "Route rules length too large: %u", route_rules_array_size); + response_error = NECP_ERROR_INVALID_TLV; + goto fail; + } MALLOC(route_rules_array, u_int8_t *, route_rules_array_size, M_NECP, M_WAITOK); if (route_rules_array == NULL) { NECPLOG(LOG_ERR, "Failed to allocate a policy route rules array (size %d)", route_rules_array_size); @@ -1688,6 +1754,11 @@ necp_handle_policy_add(struct necp_session *session, u_int32_t message_id, mbuf_ response_error = NECP_ERROR_INVALID_TLV; goto fail; } + if (conditions_array_size > NECP_MAX_CONDITIONS_ARRAY_SIZE) { + NECPLOG(LOG_ERR, "Conditions length too large: %u", conditions_array_size); + response_error = NECP_ERROR_INVALID_TLV; + goto fail; + } MALLOC(conditions_array, u_int8_t *, conditions_array_size, M_NECP, M_WAITOK); if (conditions_array == NULL) { NECPLOG(LOG_ERR, "Failed to allocate a policy conditions array (size %d)", conditions_array_size); @@ -1729,11 +1800,11 @@ necp_handle_policy_add(struct necp_session *session, u_int32_t message_id, mbuf_ response_error = NECP_ERROR_POLICY_CONDITIONS_INVALID; goto fail; } - + if (necp_policy_condition_is_application((conditions_array + conditions_array_cursor), condition_size)) { has_application_condition = TRUE; } - + if (necp_policy_condition_requires_application((conditions_array + conditions_array_cursor), condition_size)) { requires_application_condition = TRUE; } @@ -1741,7 +1812,7 @@ necp_handle_policy_add(struct necp_session *session, u_int32_t message_id, mbuf_ conditions_array_cursor += condition_size; } } - + if 
(requires_application_condition && !has_application_condition) { NECPLOG0(LOG_ERR, "Failed to validate conditions; did not contain application condition"); response_error = NECP_ERROR_POLICY_CONDITIONS_INVALID; @@ -1946,6 +2017,435 @@ necp_policy_get_new_id(void) return (newid); } +/* + * For the policy dump response this is the structure: + * + * + * { + * type : NECP_TLV_POLICY_DUMP + * length : ... + * value : + * { + * { + * type : NECP_TLV_POLICY_ID + * len : ... + * value : ... + * } + * { + * type : NECP_TLV_POLICY_ORDER + * len : ... + * value : ... + * } + * { + * type : NECP_TLV_POLICY_RESULT_STRING + * len : ... + * value : ... + * } + * { + * type : NECP_TLV_POLICY_OWNER + * len : ... + * value : ... + * } + * { + * type : NECP_TLV_POLICY_CONDITION + * len : ... + * value : + * { + * { + * type : NECP_POLICY_CONDITION_ALL_INTERFACES + * len : ... + * value : ... + * } + * { + * type : NECP_POLICY_CONDITION_BOUND_INTERFACES + * len : ... + * value : ... + * } + * ... + * } + * } + * } + * } + * { + * type : NECP_TLV_POLICY_DUMP + * length : ... + * value : + * { + * { + * type : NECP_TLV_POLICY_ID + * len : ... + * value : ... + * } + * { + * type : NECP_TLV_POLICY_ORDER + * len : ... + * value : ... + * } + * { + * type : NECP_TLV_POLICY_RESULT_STRING + * len : ... + * value : ... + * } + * { + * type : NECP_TLV_POLICY_OWNER + * len : ... + * value : ... + * } + * { + * type : NECP_TLV_POLICY_CONDITION + * len : ... + * value : + * { + * { + * type : NECP_POLICY_CONDITION_ALL_INTERFACES + * len : ... + * value : ... + * } + * { + * type : NECP_POLICY_CONDITION_BOUND_INTERFACES + * len : ... + * value : ... + * } + * ... + * } + * } + * } + * } + * ... 
+ */ +static void +necp_handle_policy_dump_all(struct necp_session *session, u_int32_t message_id, mbuf_t packet, int offset) +{ +#pragma unused(packet, offset) + struct necp_kernel_socket_policy *policy = NULL; + int policy_i; + int policy_count = 0; + u_int8_t **tlv_buffer_pointers = NULL; + u_int32_t *tlv_buffer_lengths = NULL; + int total_tlv_len = 0; + u_int8_t *result_buf = NULL; + u_int8_t *result_buf_cursor = result_buf; + char result_string[MAX_RESULT_STRING_LEN]; + char proc_name_string[MAXCOMLEN + 1]; + + bool error_occured = false; + u_int32_t response_error = NECP_ERROR_INTERNAL; + +#define REPORT_ERROR(error) error_occured = true; \ + response_error = error; \ + goto done + +#define UNLOCK_AND_REPORT_ERROR(lock, error) lck_rw_done(lock); \ + REPORT_ERROR(error) + + errno_t cred_result = priv_check_cred(kauth_cred_get(), PRIV_NET_PRIVILEGED_NECP_POLICIES, 0); + if (cred_result != 0) { + NECPLOG0(LOG_ERR, "Session does not hold the necessary entitlement to get Network Extension Policy information"); + REPORT_ERROR(NECP_ERROR_INTERNAL); + } + + // LOCK + lck_rw_lock_shared(&necp_kernel_policy_lock); + + NECPLOG0(LOG_DEBUG, "Gathering policies"); + + policy_count = necp_kernel_application_policies_count; + + MALLOC(tlv_buffer_pointers, u_int8_t **, sizeof(u_int8_t *) * policy_count, M_NECP, M_NOWAIT | M_ZERO); + if (tlv_buffer_pointers == NULL) { + NECPLOG(LOG_DEBUG, "Failed to allocate tlv_buffer_pointers (%u bytes)", sizeof(u_int8_t *) * policy_count); + UNLOCK_AND_REPORT_ERROR(&necp_kernel_policy_lock, NECP_ERROR_INTERNAL); + } + + MALLOC(tlv_buffer_lengths, u_int32_t *, sizeof(u_int32_t) * policy_count, M_NECP, M_NOWAIT | M_ZERO); + if (tlv_buffer_lengths == NULL) { + NECPLOG(LOG_DEBUG, "Failed to allocate tlv_buffer_lengths (%u bytes)", sizeof(u_int32_t) * policy_count); + UNLOCK_AND_REPORT_ERROR(&necp_kernel_policy_lock, NECP_ERROR_INTERNAL); + } + + for (policy_i = 0; necp_kernel_socket_policies_app_layer_map != NULL && 
necp_kernel_socket_policies_app_layer_map[policy_i] != NULL; policy_i++) { + policy = necp_kernel_socket_policies_app_layer_map[policy_i]; + + memset(result_string, 0, MAX_RESULT_STRING_LEN); + memset(proc_name_string, 0, MAXCOMLEN + 1); + + necp_get_result_description(result_string, policy->result, policy->result_parameter); + proc_name(policy->session_pid, proc_name_string, MAXCOMLEN); + + u_int16_t proc_name_len = strlen(proc_name_string) + 1; + u_int16_t result_string_len = strlen(result_string) + 1; + + NECPLOG(LOG_DEBUG, "Policy: process: %s, result: %s", proc_name_string, result_string); + + u_int32_t total_allocated_bytes = sizeof(u_int8_t) + sizeof(u_int32_t) + sizeof(policy->id) + // NECP_TLV_POLICY_ID + sizeof(u_int8_t) + sizeof(u_int32_t) + sizeof(policy->order) + // NECP_TLV_POLICY_ORDER + sizeof(u_int8_t) + sizeof(u_int32_t) + sizeof(policy->session_order) + // NECP_TLV_POLICY_SESSION_ORDER + sizeof(u_int8_t) + sizeof(u_int32_t) + result_string_len + // NECP_TLV_POLICY_RESULT_STRING + sizeof(u_int8_t) + sizeof(u_int32_t) + proc_name_len + // NECP_TLV_POLICY_OWNER + sizeof(u_int8_t) + sizeof(u_int32_t); // NECP_TLV_POLICY_CONDITION + + // We now traverse the condition_mask to see how much space we need to allocate + u_int32_t condition_mask = policy->condition_mask; + u_int8_t num_conditions = 0; + struct necp_string_id_mapping *account_id_entry = NULL; + char if_name[IFXNAMSIZ]; + u_int32_t condition_tlv_length = 0; + memset(if_name, 0, sizeof(if_name)); + + if (condition_mask == NECP_POLICY_CONDITION_DEFAULT) { + num_conditions++; + } else { + if (condition_mask & NECP_KERNEL_CONDITION_ALL_INTERFACES) { + num_conditions++; + } + if (condition_mask & NECP_KERNEL_CONDITION_BOUND_INTERFACE) { + snprintf(if_name, IFXNAMSIZ, "%s%d", ifnet_name(policy->cond_bound_interface), ifnet_unit(policy->cond_bound_interface)); + condition_tlv_length += strlen(if_name) + 1; + num_conditions++; + } + if (condition_mask & NECP_KERNEL_CONDITION_PROTOCOL) { + 
condition_tlv_length += sizeof(policy->cond_protocol); + num_conditions++; + } + if (condition_mask & NECP_KERNEL_CONDITION_APP_ID) { + condition_tlv_length += sizeof(uuid_t); + num_conditions++; + } + if (condition_mask & NECP_KERNEL_CONDITION_REAL_APP_ID) { + condition_tlv_length += sizeof(uuid_t); + num_conditions++; + } + if (condition_mask & NECP_KERNEL_CONDITION_DOMAIN) { + u_int32_t domain_len = strlen(policy->cond_domain) + 1; + condition_tlv_length += domain_len; + num_conditions++; + } + if (condition_mask & NECP_KERNEL_CONDITION_ACCOUNT_ID) { + account_id_entry = necp_lookup_string_with_id_locked(&necp_account_id_list, policy->cond_account_id); + u_int32_t account_id_len = 0; + if (account_id_entry) { + account_id_len = account_id_entry->string ? strlen(account_id_entry->string) + 1 : 0; + } + condition_tlv_length += account_id_len; + num_conditions++; + } + if (condition_mask & NECP_KERNEL_CONDITION_PID) { + condition_tlv_length += sizeof(pid_t); + num_conditions++; + } + if (condition_mask & NECP_KERNEL_CONDITION_UID) { + condition_tlv_length += sizeof(uid_t); + num_conditions++; + } + if (condition_mask & NECP_KERNEL_CONDITION_TRAFFIC_CLASS) { + condition_tlv_length += sizeof(struct necp_policy_condition_tc_range); + num_conditions++; + } + if (condition_mask & NECP_KERNEL_CONDITION_ENTITLEMENT) { + num_conditions++; + } + if (condition_mask & NECP_KERNEL_CONDITION_CUSTOM_ENTITLEMENT) { + u_int32_t entitlement_len = strlen(policy->cond_custom_entitlement) + 1; + condition_tlv_length += entitlement_len; + num_conditions++; + } + if (condition_mask & NECP_KERNEL_CONDITION_LOCAL_START) { + if (condition_mask & NECP_KERNEL_CONDITION_LOCAL_END) { + condition_tlv_length += sizeof(struct necp_policy_condition_addr_range); + } else { + condition_tlv_length += sizeof(struct necp_policy_condition_addr); + } + num_conditions++; + } + if (condition_mask & NECP_KERNEL_CONDITION_REMOTE_START) { + if (condition_mask & NECP_KERNEL_CONDITION_REMOTE_END) { + 
condition_tlv_length += sizeof(struct necp_policy_condition_addr_range); + } else { + condition_tlv_length += sizeof(struct necp_policy_condition_addr); + } + num_conditions++; + } + } + + condition_tlv_length += num_conditions * (sizeof(u_int8_t) + sizeof(u_int32_t)); // These are for the condition TLVs. The space for "value" is already accounted for above. + total_allocated_bytes += condition_tlv_length; + + u_int8_t *tlv_buffer; + MALLOC(tlv_buffer, u_int8_t *, total_allocated_bytes, M_NECP, M_NOWAIT | M_ZERO); + if (tlv_buffer == NULL) { + NECPLOG(LOG_DEBUG, "Failed to allocate tlv_buffer (%u bytes)", total_allocated_bytes); + continue; + } + + u_int8_t *cursor = tlv_buffer; + cursor = necp_buffer_write_tlv(cursor, NECP_TLV_POLICY_ID, sizeof(policy->id), &policy->id); + cursor = necp_buffer_write_tlv(cursor, NECP_TLV_POLICY_ORDER, sizeof(necp_policy_order), &policy->order); + cursor = necp_buffer_write_tlv(cursor, NECP_TLV_POLICY_SESSION_ORDER, sizeof(policy->session_order), &policy->session_order); + cursor = necp_buffer_write_tlv(cursor, NECP_TLV_POLICY_RESULT_STRING, result_string_len , result_string); + cursor = necp_buffer_write_tlv(cursor, NECP_TLV_POLICY_OWNER, proc_name_len , proc_name_string); + +#define N_QUICK 256 + u_int8_t q_cond_buf[N_QUICK]; // Minor optimization + + u_int8_t *cond_buf; // To be used for condition TLVs + if (condition_tlv_length <= N_QUICK) { + cond_buf = q_cond_buf; + } else { + MALLOC(cond_buf, u_int8_t *, condition_tlv_length, M_NECP, M_NOWAIT); + if (cond_buf == NULL) { + NECPLOG(LOG_DEBUG, "Failed to allocate cond_buffer (%u bytes)", condition_tlv_length); + FREE(tlv_buffer, M_NECP); + continue; + } + } + + memset(cond_buf, 0, condition_tlv_length); + u_int8_t *cond_buf_cursor = cond_buf; + if (condition_mask == NECP_POLICY_CONDITION_DEFAULT) { + cond_buf_cursor = necp_buffer_write_tlv(cond_buf_cursor, NECP_POLICY_CONDITION_DEFAULT, 0, ""); + } else { + if (condition_mask & NECP_KERNEL_CONDITION_ALL_INTERFACES) { + 
cond_buf_cursor = necp_buffer_write_tlv(cond_buf_cursor, NECP_POLICY_CONDITION_ALL_INTERFACES, 0, ""); + } + if (condition_mask & NECP_KERNEL_CONDITION_BOUND_INTERFACE) { + cond_buf_cursor = necp_buffer_write_tlv(cond_buf_cursor, NECP_POLICY_CONDITION_BOUND_INTERFACE, strlen(if_name) + 1, if_name); + } + if (condition_mask & NECP_KERNEL_CONDITION_PROTOCOL) { + cond_buf_cursor = necp_buffer_write_tlv(cond_buf_cursor, NECP_POLICY_CONDITION_IP_PROTOCOL, sizeof(policy->cond_protocol), &policy->cond_protocol); + } + if (condition_mask & NECP_KERNEL_CONDITION_APP_ID) { + struct necp_uuid_id_mapping *entry = necp_uuid_lookup_uuid_with_app_id_locked(policy->cond_app_id); + if (entry != NULL) { + cond_buf_cursor = necp_buffer_write_tlv(cond_buf_cursor, NECP_POLICY_CONDITION_APPLICATION, sizeof(entry->uuid), entry->uuid); + } + } + if (condition_mask & NECP_KERNEL_CONDITION_REAL_APP_ID) { + struct necp_uuid_id_mapping *entry = necp_uuid_lookup_uuid_with_app_id_locked(policy->cond_real_app_id); + if (entry != NULL) { + cond_buf_cursor = necp_buffer_write_tlv(cond_buf_cursor, NECP_POLICY_CONDITION_REAL_APPLICATION, sizeof(entry->uuid), entry->uuid); + } + } + if (condition_mask & NECP_KERNEL_CONDITION_DOMAIN) { + cond_buf_cursor = necp_buffer_write_tlv(cond_buf_cursor, NECP_POLICY_CONDITION_DOMAIN, strlen(policy->cond_domain) + 1, policy->cond_domain); + } + if (condition_mask & NECP_KERNEL_CONDITION_ACCOUNT_ID) { + if (account_id_entry != NULL) { + cond_buf_cursor = necp_buffer_write_tlv(cond_buf_cursor, NECP_POLICY_CONDITION_ACCOUNT, strlen(account_id_entry->string) + 1, account_id_entry->string); + } + } + if (condition_mask & NECP_KERNEL_CONDITION_PID) { + cond_buf_cursor = necp_buffer_write_tlv(cond_buf_cursor, NECP_POLICY_CONDITION_PID, sizeof(policy->cond_pid), &policy->cond_pid); + } + if (condition_mask & NECP_KERNEL_CONDITION_UID) { + cond_buf_cursor = necp_buffer_write_tlv(cond_buf_cursor, NECP_POLICY_CONDITION_UID, sizeof(policy->cond_uid), &policy->cond_uid); + } 
+ if (condition_mask & NECP_KERNEL_CONDITION_TRAFFIC_CLASS) { + cond_buf_cursor = necp_buffer_write_tlv(cond_buf_cursor, NECP_POLICY_CONDITION_TRAFFIC_CLASS, sizeof(policy->cond_traffic_class), &policy->cond_traffic_class); + } + if (condition_mask & NECP_KERNEL_CONDITION_ENTITLEMENT) { + cond_buf_cursor = necp_buffer_write_tlv(cond_buf_cursor, NECP_POLICY_CONDITION_ENTITLEMENT, 0, ""); + } + if (condition_mask & NECP_KERNEL_CONDITION_CUSTOM_ENTITLEMENT) { + cond_buf_cursor = necp_buffer_write_tlv(cond_buf_cursor, NECP_POLICY_CONDITION_ENTITLEMENT, strlen(policy->cond_custom_entitlement) + 1, policy->cond_custom_entitlement); + } + if (condition_mask & NECP_KERNEL_CONDITION_LOCAL_START) { + if (condition_mask & NECP_KERNEL_CONDITION_LOCAL_END) { + struct necp_policy_condition_addr_range range; + memcpy(&range.start_address, &policy->cond_local_start, sizeof(policy->cond_local_start)); + memcpy(&range.end_address, &policy->cond_local_end, sizeof(policy->cond_local_end)); + cond_buf_cursor = necp_buffer_write_tlv(cond_buf_cursor, NECP_POLICY_CONDITION_LOCAL_ADDR_RANGE, sizeof(range), &range); + } else { + struct necp_policy_condition_addr addr; + addr.prefix = policy->cond_local_prefix; + memcpy(&addr.address, &policy->cond_local_start, sizeof(policy->cond_local_start)); + cond_buf_cursor = necp_buffer_write_tlv(cond_buf_cursor, NECP_POLICY_CONDITION_LOCAL_ADDR, sizeof(addr), &addr); + } + } + if (condition_mask & NECP_KERNEL_CONDITION_REMOTE_START) { + if (condition_mask & NECP_KERNEL_CONDITION_REMOTE_END) { + struct necp_policy_condition_addr_range range; + memcpy(&range.start_address, &policy->cond_remote_start, sizeof(policy->cond_remote_start)); + memcpy(&range.end_address, &policy->cond_remote_end, sizeof(policy->cond_remote_end)); + cond_buf_cursor = necp_buffer_write_tlv(cond_buf_cursor, NECP_POLICY_CONDITION_REMOTE_ADDR_RANGE, sizeof(range), &range); + } else { + struct necp_policy_condition_addr addr; + addr.prefix = policy->cond_remote_prefix; + 
memcpy(&addr.address, &policy->cond_remote_start, sizeof(policy->cond_remote_start)); + cond_buf_cursor = necp_buffer_write_tlv(cond_buf_cursor, NECP_POLICY_CONDITION_REMOTE_ADDR, sizeof(addr), &addr); + } + } + } + + cursor = necp_buffer_write_tlv(cursor, NECP_TLV_POLICY_CONDITION, cond_buf_cursor - cond_buf, cond_buf); + if (cond_buf != q_cond_buf) { + FREE(cond_buf, M_NECP); + } + + tlv_buffer_pointers[policy_i] = tlv_buffer; + tlv_buffer_lengths[policy_i] = (cursor - tlv_buffer); + + // This is the length of the TLV for NECP_TLV_POLICY_DUMP + total_tlv_len += sizeof(u_int8_t) + sizeof(u_int32_t) + (cursor - tlv_buffer); + } + + // UNLOCK + lck_rw_done(&necp_kernel_policy_lock); + + u_int32_t total_result_length = sizeof(struct necp_packet_header) + total_tlv_len; + MALLOC(result_buf, u_int8_t *, total_result_length, M_NECP, M_NOWAIT | M_ZERO); + if (result_buf == NULL) { + NECPLOG(LOG_DEBUG, "Failed to allocate result_buffer (%u bytes)", total_result_length); + REPORT_ERROR(NECP_ERROR_INTERNAL); + } + + result_buf_cursor = result_buf; + result_buf_cursor = necp_buffer_write_packet_header(result_buf_cursor, NECP_PACKET_TYPE_POLICY_DUMP_ALL, NECP_PACKET_FLAGS_RESPONSE, message_id); + + for (int i = 0; i < policy_count; i++) { + if (tlv_buffer_pointers[i] != NULL) { + result_buf_cursor = necp_buffer_write_tlv(result_buf_cursor, NECP_TLV_POLICY_DUMP, tlv_buffer_lengths[i], tlv_buffer_pointers[i]); + } + } + + if (!necp_send_ctl_data(session, result_buf, result_buf_cursor - result_buf)) { + NECPLOG(LOG_ERR, "Failed to send response (%u bytes)", result_buf_cursor - result_buf); + } else { + NECPLOG(LOG_ERR, "Sent data worth %u bytes. 
Total result buffer length was %u bytes", result_buf_cursor - result_buf, total_result_length); + } + +done: + + if (error_occured) { + if(!necp_send_error_response(session, NECP_PACKET_TYPE_POLICY_DUMP_ALL, message_id, response_error)) { + NECPLOG0(LOG_ERR, "Failed to send error response"); + } else { + NECPLOG0(LOG_ERR, "Sent error response"); + } + } + + if (result_buf != NULL) { + FREE(result_buf, M_NECP); + } + + if (tlv_buffer_pointers != NULL) { + for (int i = 0; i < policy_count; i++) { + if (tlv_buffer_pointers[i] != NULL) { + FREE(tlv_buffer_pointers[i], M_NECP); + tlv_buffer_pointers[i] = NULL; + } + } + FREE(tlv_buffer_pointers, M_NECP); + } + + if (tlv_buffer_lengths != NULL) { + FREE(tlv_buffer_lengths, M_NECP); + } +#undef N_QUICK +#undef RESET_COND_BUF +#undef REPORT_ERROR +#undef UNLOCK_AND_REPORT_ERROR +} + static struct necp_session_policy * necp_policy_create(struct necp_session *session, necp_policy_order order, u_int8_t *conditions_array, u_int32_t conditions_array_size, u_int8_t *route_rules_array, u_int32_t route_rules_array_size, u_int8_t *result, u_int32_t result_size) { @@ -2079,6 +2579,11 @@ necp_policy_delete(struct necp_session *session, struct necp_session_policy *pol policy->conditions = NULL; } + if (policy->route_rules) { + FREE(policy->route_rules, M_NECP); + policy->route_rules = NULL; + } + FREE_ZONE(policy, sizeof(*policy), M_NECP_SESSION_POLICY); if (necp_debug) { @@ -2149,18 +2654,6 @@ necp_policy_unapply(struct necp_session_policy *policy) return (TRUE); } -static inline bool -necp_address_is_valid(struct sockaddr *address) -{ - if (address->sa_family == AF_INET) { - return (address->sa_len == sizeof(struct sockaddr_in)); - } else if (address->sa_family == AF_INET6) { - return (address->sa_len == sizeof(struct sockaddr_in6)); - } else { - return (FALSE); - } -} - #define NECP_KERNEL_POLICY_SUBORDER_ID_TUNNEL_CONDITION 0 #define NECP_KERNEL_POLICY_SUBORDER_NON_ID_TUNNEL_CONDITION 1 #define 
NECP_KERNEL_POLICY_SUBORDER_ID_CONDITION 2 @@ -2183,6 +2676,7 @@ necp_policy_apply(struct necp_session *session, struct necp_session_policy *poli bool socket_layer_non_id_conditions = FALSE; bool ip_output_layer_non_id_conditions = FALSE; + bool ip_output_layer_non_id_only = FALSE; bool ip_output_layer_id_condition = FALSE; bool ip_output_layer_tunnel_condition_from_id = FALSE; bool ip_output_layer_tunnel_condition_from_non_id = FALSE; @@ -2193,6 +2687,7 @@ necp_policy_apply(struct necp_session *session, struct necp_session_policy *poli ifnet_t cond_bound_interface = NULL; u_int32_t cond_account_id = 0; char *cond_domain = NULL; + char *cond_custom_entitlement = NULL; pid_t cond_pid = 0; uid_t cond_uid = 0; necp_app_id cond_app_id = 0; @@ -2243,8 +2738,18 @@ necp_policy_apply(struct necp_session *session, struct necp_session_policy *poli break; } case NECP_POLICY_CONDITION_ENTITLEMENT: { - master_condition_mask |= NECP_KERNEL_CONDITION_ENTITLEMENT; - socket_only_conditions = TRUE; + if (condition_length > 0) { + if (cond_custom_entitlement == NULL) { + cond_custom_entitlement = necp_copy_string((char *)condition_value, condition_length); + if (cond_custom_entitlement != NULL) { + master_condition_mask |= NECP_KERNEL_CONDITION_CUSTOM_ENTITLEMENT; + socket_only_conditions = TRUE; + } + } + } else { + master_condition_mask |= NECP_KERNEL_CONDITION_ENTITLEMENT; + socket_only_conditions = TRUE; + } break; } case NECP_POLICY_CONDITION_DOMAIN: { @@ -2480,6 +2985,7 @@ necp_policy_apply(struct necp_session *session, struct necp_session_policy *poli } else if (socket_ip_conditions) { socket_layer_non_id_conditions = TRUE; ip_output_layer_non_id_conditions = TRUE; + ip_output_layer_non_id_only = TRUE; // Only apply drop to packets that didn't go through socket layer } break; } @@ -2517,6 +3023,7 @@ necp_policy_apply(struct necp_session *session, struct necp_session_policy *poli tunnel_parameters.interface_name[tunnel_parameters_length - sizeof(u_int32_t) - 1] = 0; // Make 
sure the string is NULL terminated if (ifnet_find_by_name(tunnel_parameters.interface_name, &tunnel_interface) == 0) { ultimate_result_parameter.tunnel_interface_index = tunnel_interface->if_index; + ifnet_release(tunnel_interface); } secondary_result = tunnel_parameters.secondary_result; @@ -2587,8 +3094,10 @@ necp_policy_apply(struct necp_session *session, struct necp_session_policy *poli if (ifnet_find_by_name(interface_name, &scope_interface) == 0) { ultimate_result_parameter.scoped_interface_index = scope_interface->if_index; socket_layer_non_id_conditions = TRUE; + ifnet_release(scope_interface); } } + break; } case NECP_POLICY_RESULT_ROUTE_RULES: { if (policy->route_rules != NULL && policy->route_rules_size > 0) { @@ -2599,6 +3108,7 @@ necp_policy_apply(struct necp_session *session, struct necp_session_policy *poli socket_layer_non_id_conditions = TRUE; } } + break; } default: { break; @@ -2606,7 +3116,7 @@ necp_policy_apply(struct necp_session *session, struct necp_session_policy *poli } if (socket_layer_non_id_conditions) { - necp_kernel_policy_id policy_id = necp_kernel_socket_policy_add(policy->id, policy->order, session->session_order, session->proc_pid, master_condition_mask, master_condition_negated_mask, cond_app_id, cond_real_app_id, cond_account_id, cond_domain, cond_pid, cond_uid, cond_bound_interface, cond_traffic_class, cond_protocol, &cond_local_start, &cond_local_end, cond_local_prefix, &cond_remote_start, &cond_remote_end, cond_remote_prefix, ultimate_result, ultimate_result_parameter); + necp_kernel_policy_id policy_id = necp_kernel_socket_policy_add(policy->id, policy->order, session->session_order, session->proc_pid, master_condition_mask, master_condition_negated_mask, cond_app_id, cond_real_app_id, cond_custom_entitlement, cond_account_id, cond_domain, cond_pid, cond_uid, cond_bound_interface, cond_traffic_class, cond_protocol, &cond_local_start, &cond_local_end, cond_local_prefix, &cond_remote_start, &cond_remote_end, 
cond_remote_prefix, ultimate_result, ultimate_result_parameter); if (policy_id == 0) { NECPLOG0(LOG_DEBUG, "Error applying socket kernel policy"); @@ -2618,7 +3128,11 @@ necp_policy_apply(struct necp_session *session, struct necp_session_policy *poli } if (ip_output_layer_non_id_conditions) { - necp_kernel_policy_id policy_id = necp_kernel_ip_output_policy_add(policy->id, policy->order, NECP_KERNEL_POLICY_SUBORDER_NON_ID_CONDITIONS, session->session_order, session->proc_pid, master_condition_mask, master_condition_negated_mask, NECP_KERNEL_POLICY_ID_NONE, cond_bound_interface, 0, cond_protocol, &cond_local_start, &cond_local_end, cond_local_prefix, &cond_remote_start, &cond_remote_end, cond_remote_prefix, ultimate_result, ultimate_result_parameter); + u_int32_t condition_mask = master_condition_mask; + if (ip_output_layer_non_id_only) { + condition_mask |= NECP_KERNEL_CONDITION_POLICY_ID; + } + necp_kernel_policy_id policy_id = necp_kernel_ip_output_policy_add(policy->id, policy->order, NECP_KERNEL_POLICY_SUBORDER_NON_ID_CONDITIONS, session->session_order, session->proc_pid, condition_mask, master_condition_negated_mask, NECP_KERNEL_POLICY_ID_NONE, cond_bound_interface, 0, cond_protocol, &cond_local_start, &cond_local_end, cond_local_prefix, &cond_remote_start, &cond_remote_end, cond_remote_prefix, ultimate_result, ultimate_result_parameter); if (policy_id == 0) { NECPLOG0(LOG_DEBUG, "Error applying IP output kernel policy"); @@ -2708,6 +3222,7 @@ necp_policy_apply_all(struct necp_session *session) lck_rw_done(&necp_kernel_policy_lock); + necp_update_all_clients(); necp_post_change_event(&kev_data); if (necp_debug) { @@ -2739,9 +3254,9 @@ necp_kernel_policy_get_new_id(void) return (newid); } -#define NECP_KERNEL_VALID_SOCKET_CONDITIONS (NECP_KERNEL_CONDITION_APP_ID | NECP_KERNEL_CONDITION_REAL_APP_ID | NECP_KERNEL_CONDITION_DOMAIN | NECP_KERNEL_CONDITION_ACCOUNT_ID | NECP_KERNEL_CONDITION_PID | NECP_KERNEL_CONDITION_UID | NECP_KERNEL_CONDITION_ALL_INTERFACES | 
NECP_KERNEL_CONDITION_BOUND_INTERFACE | NECP_KERNEL_CONDITION_TRAFFIC_CLASS | NECP_KERNEL_CONDITION_PROTOCOL | NECP_KERNEL_CONDITION_LOCAL_START | NECP_KERNEL_CONDITION_LOCAL_END | NECP_KERNEL_CONDITION_LOCAL_PREFIX | NECP_KERNEL_CONDITION_REMOTE_START | NECP_KERNEL_CONDITION_REMOTE_END | NECP_KERNEL_CONDITION_REMOTE_PREFIX | NECP_KERNEL_CONDITION_ENTITLEMENT) +#define NECP_KERNEL_VALID_SOCKET_CONDITIONS (NECP_KERNEL_CONDITION_APP_ID | NECP_KERNEL_CONDITION_REAL_APP_ID | NECP_KERNEL_CONDITION_DOMAIN | NECP_KERNEL_CONDITION_ACCOUNT_ID | NECP_KERNEL_CONDITION_PID | NECP_KERNEL_CONDITION_UID | NECP_KERNEL_CONDITION_ALL_INTERFACES | NECP_KERNEL_CONDITION_BOUND_INTERFACE | NECP_KERNEL_CONDITION_TRAFFIC_CLASS | NECP_KERNEL_CONDITION_PROTOCOL | NECP_KERNEL_CONDITION_LOCAL_START | NECP_KERNEL_CONDITION_LOCAL_END | NECP_KERNEL_CONDITION_LOCAL_PREFIX | NECP_KERNEL_CONDITION_REMOTE_START | NECP_KERNEL_CONDITION_REMOTE_END | NECP_KERNEL_CONDITION_REMOTE_PREFIX | NECP_KERNEL_CONDITION_ENTITLEMENT | NECP_KERNEL_CONDITION_CUSTOM_ENTITLEMENT) static necp_kernel_policy_id -necp_kernel_socket_policy_add(necp_policy_id parent_policy_id, necp_policy_order order, u_int32_t session_order, int session_pid, u_int32_t condition_mask, u_int32_t condition_negated_mask, necp_app_id cond_app_id, necp_app_id cond_real_app_id, u_int32_t cond_account_id, char *cond_domain, pid_t cond_pid, uid_t cond_uid, ifnet_t cond_bound_interface, struct necp_policy_condition_tc_range cond_traffic_class, u_int16_t cond_protocol, union necp_sockaddr_union *cond_local_start, union necp_sockaddr_union *cond_local_end, u_int8_t cond_local_prefix, union necp_sockaddr_union *cond_remote_start, union necp_sockaddr_union *cond_remote_end, u_int8_t cond_remote_prefix, necp_kernel_policy_result result, necp_kernel_policy_result_parameter result_parameter) +necp_kernel_socket_policy_add(necp_policy_id parent_policy_id, necp_policy_order order, u_int32_t session_order, int session_pid, u_int32_t condition_mask, u_int32_t 
condition_negated_mask, necp_app_id cond_app_id, necp_app_id cond_real_app_id, char *cond_custom_entitlement, u_int32_t cond_account_id, char *cond_domain, pid_t cond_pid, uid_t cond_uid, ifnet_t cond_bound_interface, struct necp_policy_condition_tc_range cond_traffic_class, u_int16_t cond_protocol, union necp_sockaddr_union *cond_local_start, union necp_sockaddr_union *cond_local_end, u_int8_t cond_local_prefix, union necp_sockaddr_union *cond_remote_start, union necp_sockaddr_union *cond_remote_end, u_int8_t cond_remote_prefix, necp_kernel_policy_result result, necp_kernel_policy_result_parameter result_parameter) { struct necp_kernel_socket_policy *new_kernel_policy = NULL; struct necp_kernel_socket_policy *tmp_kernel_policy = NULL; @@ -2784,6 +3299,9 @@ necp_kernel_socket_policy_add(necp_policy_id parent_policy_id, necp_policy_order if (new_kernel_policy->condition_mask & NECP_KERNEL_CONDITION_REAL_APP_ID) { new_kernel_policy->cond_real_app_id = cond_real_app_id; } + if (new_kernel_policy->condition_mask & NECP_KERNEL_CONDITION_CUSTOM_ENTITLEMENT) { + new_kernel_policy->cond_custom_entitlement = cond_custom_entitlement; + } if (new_kernel_policy->condition_mask & NECP_KERNEL_CONDITION_ACCOUNT_ID) { new_kernel_policy->cond_account_id = cond_account_id; } @@ -2873,12 +3391,17 @@ necp_kernel_socket_policy_delete(necp_kernel_policy_id policy_id) ifnet_release(policy->cond_bound_interface); policy->cond_bound_interface = NULL; } - + if (policy->cond_domain) { FREE(policy->cond_domain, M_NECP); policy->cond_domain = NULL; } + if (policy->cond_custom_entitlement) { + FREE(policy->cond_custom_entitlement, M_NECP); + policy->cond_custom_entitlement = NULL; + } + FREE_ZONE(policy, sizeof(*policy), M_NECP_SOCKET_POLICY); return (TRUE); } @@ -2886,23 +3409,26 @@ necp_kernel_socket_policy_delete(necp_kernel_policy_id policy_id) return (FALSE); } -#define MAX_RESULT_STRING_LEN 64 static inline const char * necp_get_result_description(char *result_string, 
necp_kernel_policy_result result, necp_kernel_policy_result_parameter result_parameter) { uuid_string_t uuid_string; switch (result) { case NECP_KERNEL_POLICY_RESULT_NONE: { - return ("None"); + snprintf(result_string, MAX_RESULT_STRING_LEN, "None"); + break; } case NECP_KERNEL_POLICY_RESULT_PASS: { - return ("Pass"); + snprintf(result_string, MAX_RESULT_STRING_LEN, "Pass"); + break; } case NECP_KERNEL_POLICY_RESULT_SKIP: { - return ("Skip"); + snprintf(result_string, MAX_RESULT_STRING_LEN, "Skip (%u)", result_parameter.skip_policy_order); + break; } case NECP_KERNEL_POLICY_RESULT_DROP: { - return ("Drop"); + snprintf(result_string, MAX_RESULT_STRING_LEN, "Drop"); + break; } case NECP_KERNEL_POLICY_RESULT_SOCKET_DIVERT: { snprintf(result_string, MAX_RESULT_STRING_LEN, "SocketDivert (%d)", result_parameter.flow_divert_control_unit); @@ -2918,7 +3444,8 @@ necp_get_result_description(char *result_string, necp_kernel_policy_result resul break; } case NECP_KERNEL_POLICY_RESULT_IP_FILTER: { - return ("IPFilter"); + snprintf(result_string, MAX_RESULT_STRING_LEN, "IPFilter"); + break; } case NECP_KERNEL_POLICY_RESULT_SOCKET_SCOPED: { ifnet_t interface = ifindex2ifnet[result_parameter.scoped_interface_index]; @@ -2930,7 +3457,6 @@ necp_get_result_description(char *result_string, necp_kernel_policy_result resul char interface_names[IFXNAMSIZ][MAX_ROUTE_RULE_INTERFACES]; struct necp_route_rule *route_rule = necp_lookup_route_rule_locked(&necp_route_rules, result_parameter.route_rule_id); if (route_rule != NULL) { - bool default_drop = (route_rule->default_action == NECP_ROUTE_RULE_DENY_INTERFACE); for (index = 0; index < MAX_ROUTE_RULE_INTERFACES; index++) { if (route_rule->exception_if_indices[index] != 0) { ifnet_t interface = ifindex2ifnet[route_rule->exception_if_indices[index]]; @@ -2939,7 +3465,8 @@ necp_get_result_description(char *result_string, necp_kernel_policy_result resul memset(interface_names[index], 0, IFXNAMSIZ); } } - if (default_drop) { + switch 
(route_rule->default_action) { + case NECP_ROUTE_RULE_DENY_INTERFACE: snprintf(result_string, MAX_RESULT_STRING_LEN, "RouteRules (Only %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s)", (route_rule->cellular_action == NECP_ROUTE_RULE_ALLOW_INTERFACE) ? "Cell " : "", (route_rule->wifi_action == NECP_ROUTE_RULE_ALLOW_INTERFACE) ? "WiFi " : "", @@ -2964,7 +3491,8 @@ necp_get_result_description(char *result_string, necp_kernel_policy_result resul (route_rule->exception_if_actions[8] == NECP_ROUTE_RULE_ALLOW_INTERFACE) ? interface_names[8] : "", (route_rule->exception_if_actions[8] == NECP_ROUTE_RULE_ALLOW_INTERFACE) ? " " : "", (route_rule->exception_if_actions[9] == NECP_ROUTE_RULE_ALLOW_INTERFACE) ? interface_names[9] : ""); - } else { + break; + case NECP_ROUTE_RULE_ALLOW_INTERFACE: snprintf(result_string, MAX_RESULT_STRING_LEN, "RouteRules (%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s)", (route_rule->cellular_action == NECP_ROUTE_RULE_DENY_INTERFACE) ? "!Cell " : "", (route_rule->wifi_action == NECP_ROUTE_RULE_DENY_INTERFACE) ? "!WiFi " : "", @@ -2990,9 +3518,37 @@ necp_get_result_description(char *result_string, necp_kernel_policy_result resul (route_rule->exception_if_actions[8] == NECP_ROUTE_RULE_DENY_INTERFACE) ? interface_names[8] : "", (route_rule->exception_if_actions[9] == NECP_ROUTE_RULE_DENY_INTERFACE) ? "!" : "", (route_rule->exception_if_actions[9] == NECP_ROUTE_RULE_DENY_INTERFACE) ? interface_names[9] : ""); + break; + case NECP_ROUTE_RULE_QOS_MARKING: + snprintf(result_string, MAX_RESULT_STRING_LEN, "RouteRules (QoSMarking %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s)", + (route_rule->cellular_action == NECP_ROUTE_RULE_QOS_MARKING) ? "Cell " : "", + (route_rule->wifi_action == NECP_ROUTE_RULE_QOS_MARKING) ? "WiFi " : "", + (route_rule->wired_action == NECP_ROUTE_RULE_QOS_MARKING) ? "Wired " : "", + (route_rule->expensive_action == NECP_ROUTE_RULE_QOS_MARKING) ? "Exp " : "", + (route_rule->exception_if_actions[0] == NECP_ROUTE_RULE_QOS_MARKING) ? 
interface_names[0] : "", + (route_rule->exception_if_actions[0] == NECP_ROUTE_RULE_QOS_MARKING) ? " " : "", + (route_rule->exception_if_actions[1] == NECP_ROUTE_RULE_QOS_MARKING) ? interface_names[1] : "", + (route_rule->exception_if_actions[1] == NECP_ROUTE_RULE_QOS_MARKING) ? " " : "", + (route_rule->exception_if_actions[2] == NECP_ROUTE_RULE_QOS_MARKING) ? interface_names[2] : "", + (route_rule->exception_if_actions[2] == NECP_ROUTE_RULE_QOS_MARKING) ? " " : "", + (route_rule->exception_if_actions[3] == NECP_ROUTE_RULE_QOS_MARKING) ? interface_names[3] : "", + (route_rule->exception_if_actions[3] == NECP_ROUTE_RULE_QOS_MARKING) ? " " : "", + (route_rule->exception_if_actions[4] == NECP_ROUTE_RULE_QOS_MARKING) ? interface_names[4] : "", + (route_rule->exception_if_actions[4] == NECP_ROUTE_RULE_QOS_MARKING) ? " " : "", + (route_rule->exception_if_actions[5] == NECP_ROUTE_RULE_QOS_MARKING) ? interface_names[5] : "", + (route_rule->exception_if_actions[5] == NECP_ROUTE_RULE_QOS_MARKING) ? " " : "", + (route_rule->exception_if_actions[6] == NECP_ROUTE_RULE_QOS_MARKING) ? interface_names[6] : "", + (route_rule->exception_if_actions[6] == NECP_ROUTE_RULE_QOS_MARKING) ? " " : "", + (route_rule->exception_if_actions[7] == NECP_ROUTE_RULE_QOS_MARKING) ? interface_names[7] : "", + (route_rule->exception_if_actions[7] == NECP_ROUTE_RULE_QOS_MARKING) ? " " : "", + (route_rule->exception_if_actions[8] == NECP_ROUTE_RULE_QOS_MARKING) ? interface_names[8] : "", + (route_rule->exception_if_actions[8] == NECP_ROUTE_RULE_QOS_MARKING) ? " " : "", + (route_rule->exception_if_actions[9] == NECP_ROUTE_RULE_QOS_MARKING) ? 
interface_names[9] : ""); + break; + default: + snprintf(result_string, MAX_RESULT_STRING_LEN, "RouteRules (Unknown)"); + break; } - } else { - snprintf(result_string, MAX_RESULT_STRING_LEN, "RouteRules (Unknown)"); } break; } @@ -3197,6 +3753,11 @@ necp_kernel_socket_policy_is_unnecessary(struct necp_kernel_socket_policy *polic continue; } + if (compared_policy->condition_mask & NECP_KERNEL_CONDITION_CUSTOM_ENTITLEMENT && + strcmp(compared_policy->cond_custom_entitlement, policy->cond_custom_entitlement) != 0) { + continue; + } + if (compared_policy->condition_mask & NECP_KERNEL_CONDITION_ACCOUNT_ID && compared_policy->cond_account_id != policy->cond_account_id) { continue; @@ -3439,6 +4000,22 @@ necp_lookup_string_to_id_locked(struct necp_string_id_mapping_list *list, char * return (foundentry); } +static struct necp_string_id_mapping * +necp_lookup_string_with_id_locked(struct necp_string_id_mapping_list *list, u_int32_t local_id) +{ + struct necp_string_id_mapping *searchentry = NULL; + struct necp_string_id_mapping *foundentry = NULL; + + LIST_FOREACH(searchentry, list, chain) { + if (searchentry->id == local_id) { + foundentry = searchentry; + break; + } + } + + return (foundentry); +} + static u_int32_t necp_create_string_to_id_mapping(struct necp_string_id_mapping_list *list, char *string) { @@ -3668,13 +4245,15 @@ necp_create_route_rule(struct necp_route_rule_list *list, u_int8_t *route_rules_ continue; } - memcpy(interface_name, rule_value, rule_length); - interface_name[length - 1] = 0; // Make sure the string is NULL terminated - if (ifnet_find_by_name(interface_name, &rule_interface) == 0) { - if_actions[num_valid_indices] = rule_type; - if_indices[num_valid_indices++] = rule_interface->if_index; + if (rule_length <= IFXNAMSIZ) { + memcpy(interface_name, rule_value, rule_length); + interface_name[rule_length - 1] = 0; // Make sure the string is NULL terminated + if (ifnet_find_by_name(interface_name, &rule_interface) == 0) { + 
if_actions[num_valid_indices] = rule_type; + if_indices[num_valid_indices++] = rule_interface->if_index; + ifnet_release(rule_interface); + } } - offset += sizeof(u_int8_t) + sizeof(u_int32_t) + length; } @@ -3841,6 +4420,25 @@ necp_uuid_lookup_app_id_locked(uuid_t uuid) return (foundentry); } +static struct necp_uuid_id_mapping * +necp_uuid_lookup_uuid_with_app_id_locked(u_int32_t local_id) +{ + struct necp_uuid_id_mapping *searchentry = NULL; + struct necp_uuid_id_mapping *foundentry = NULL; + + struct necp_uuid_id_mapping_head *uuid_list_head = NULL; + for (uuid_list_head = &necp_uuid_app_id_hashtbl[necp_uuid_app_id_hash_num_buckets - 1]; uuid_list_head >= necp_uuid_app_id_hashtbl; uuid_list_head--) { + LIST_FOREACH(searchentry, uuid_list_head, chain) { + if (searchentry->id == local_id) { + foundentry = searchentry; + break; + } + } + } + + return (foundentry); +} + static u_int32_t necp_create_uuid_app_id_mapping(uuid_t uuid, bool *allocated_mapping, bool uuid_policy_table) { @@ -3921,7 +4519,7 @@ necp_uuid_get_null_service_id_mapping(void) static struct necp_uuid_id_mapping null_mapping; uuid_clear(null_mapping.uuid); null_mapping.id = NECP_NULL_SERVICE_ID; - + return (&null_mapping); } @@ -3930,11 +4528,11 @@ necp_uuid_lookup_service_id_locked(uuid_t uuid) { struct necp_uuid_id_mapping *searchentry = NULL; struct necp_uuid_id_mapping *foundentry = NULL; - + if (uuid_is_null(uuid)) { return necp_uuid_get_null_service_id_mapping(); } - + LIST_FOREACH(searchentry, &necp_uuid_service_id_list, chain) { if (uuid_compare(searchentry->uuid, uuid) == 0) { foundentry = searchentry; @@ -3950,11 +4548,11 @@ necp_uuid_lookup_uuid_with_service_id_locked(u_int32_t local_id) { struct necp_uuid_id_mapping *searchentry = NULL; struct necp_uuid_id_mapping *foundentry = NULL; - + if (local_id == NECP_NULL_SERVICE_ID) { return necp_uuid_get_null_service_id_mapping(); } - + LIST_FOREACH(searchentry, &necp_uuid_service_id_list, chain) { if (searchentry->id == local_id) { 
foundentry = searchentry; @@ -3970,11 +4568,11 @@ necp_create_uuid_service_id_mapping(uuid_t uuid) { u_int32_t local_id = 0; struct necp_uuid_id_mapping *existing_mapping = NULL; - + if (uuid_is_null(uuid)) { return (NECP_NULL_SERVICE_ID); } - + lck_rw_assert(&necp_kernel_policy_lock, LCK_RW_ASSERT_EXCLUSIVE); existing_mapping = necp_uuid_lookup_service_id_locked(uuid); @@ -4002,11 +4600,11 @@ static bool necp_remove_uuid_service_id_mapping(uuid_t uuid) { struct necp_uuid_id_mapping *existing_mapping = NULL; - + if (uuid_is_null(uuid)) { return (TRUE); } - + lck_rw_assert(&necp_kernel_policy_lock, LCK_RW_ASSERT_EXCLUSIVE); existing_mapping = necp_uuid_lookup_app_id_locked(uuid); @@ -4494,7 +5092,8 @@ necp_check_suffix(struct substring parent, struct substring suffix, bool require } } - return (memcmp(parent.string + length_difference, suffix.string, suffix.length) == 0); + // strncasecmp does case-insensitive check for all UTF-8 strings (ignores non-ASCII characters) + return (strncasecmp(parent.string + length_difference, suffix.string, suffix.length) == 0); } static bool @@ -4509,8 +5108,9 @@ necp_hostname_matches_domain(struct substring hostname_substring, u_int8_t hostn domain_substring.length = strlen(domain); if (hostname_dot_count == domain_dot_count) { + // strncasecmp does case-insensitive check for all UTF-8 strings (ignores non-ASCII characters) if (hostname_substring.length == domain_substring.length && - memcmp(hostname_substring.string, domain_substring.string, hostname_substring.length) == 0) { + strncasecmp(hostname_substring.string, domain_substring.string, hostname_substring.length) == 0) { return (TRUE); } } else if (domain_dot_count < hostname_dot_count) { @@ -4522,9 +5122,25 @@ necp_hostname_matches_domain(struct substring hostname_substring, u_int8_t hostn return (FALSE); } -#define NECP_KERNEL_ADDRESS_TYPE_CONDITIONS (NECP_KERNEL_CONDITION_LOCAL_START | NECP_KERNEL_CONDITION_LOCAL_END | NECP_KERNEL_CONDITION_LOCAL_PREFIX | 
NECP_KERNEL_CONDITION_REMOTE_START | NECP_KERNEL_CONDITION_REMOTE_END | NECP_KERNEL_CONDITION_REMOTE_PREFIX) -static void -necp_application_fillout_info_locked(uuid_t application_uuid, uuid_t real_application_uuid, char *account, char *domain, pid_t pid, uid_t uid, u_int16_t protocol, u_int32_t bound_interface_index, u_int32_t traffic_class, union necp_sockaddr_union *local_addr, union necp_sockaddr_union *remote_addr, struct necp_socket_info *info) +static char * +necp_copy_string(char *string, size_t length) +{ + char *copied_string = NULL; + + MALLOC(copied_string, char *, length + 1, M_NECP, M_WAITOK); + if (copied_string == NULL) { + return (NULL); + } + + memcpy(copied_string, string, length); + copied_string[length] = 0; + + return (copied_string); +} + +#define NECP_KERNEL_ADDRESS_TYPE_CONDITIONS (NECP_KERNEL_CONDITION_LOCAL_START | NECP_KERNEL_CONDITION_LOCAL_END | NECP_KERNEL_CONDITION_LOCAL_PREFIX | NECP_KERNEL_CONDITION_REMOTE_START | NECP_KERNEL_CONDITION_REMOTE_END | NECP_KERNEL_CONDITION_REMOTE_PREFIX) +static void +necp_application_fillout_info_locked(uuid_t application_uuid, uuid_t real_application_uuid, char *account, char *domain, pid_t pid, uid_t uid, u_int16_t protocol, u_int32_t bound_interface_index, u_int32_t traffic_class, union necp_sockaddr_union *local_addr, union necp_sockaddr_union *remote_addr, proc_t proc, struct necp_socket_info *info) { memset(info, 0, sizeof(struct necp_socket_info)); @@ -4533,7 +5149,10 @@ necp_application_fillout_info_locked(uuid_t application_uuid, uuid_t real_applic info->protocol = protocol; info->bound_interface_index = bound_interface_index; info->traffic_class = traffic_class; - info->cred_result = 0; // Don't check the entitlement here, only in the socket layer + + if (necp_kernel_application_policies_condition_mask & NECP_KERNEL_CONDITION_ENTITLEMENT && proc != NULL) { + info->cred_result = priv_check_cred(proc_ucred(proc), PRIV_NET_PRIVILEGED_NECP_MATCH, 0); + } if 
(necp_kernel_application_policies_condition_mask & NECP_KERNEL_CONDITION_APP_ID && !uuid_is_null(application_uuid)) { struct necp_uuid_id_mapping *existing_mapping = necp_uuid_lookup_app_id_locked(application_uuid); @@ -4575,7 +5194,7 @@ necp_application_fillout_info_locked(uuid_t application_uuid, uuid_t real_applic } static void -necp_send_application_cell_denied_event(pid_t pid, uuid_t proc_uuid) +necp_send_application_interface_denied_event(pid_t pid, uuid_t proc_uuid, u_int32_t if_functional_type) { struct kev_netpolicy_ifdenied ev_ifdenied; @@ -4583,12 +5202,31 @@ necp_send_application_cell_denied_event(pid_t pid, uuid_t proc_uuid) ev_ifdenied.ev_data.epid = pid; uuid_copy(ev_ifdenied.ev_data.euuid, proc_uuid); + ev_ifdenied.ev_if_functional_type = if_functional_type; netpolicy_post_msg(KEV_NETPOLICY_IFDENIED, &ev_ifdenied.ev_data, sizeof(ev_ifdenied)); } -static int -necp_application_find_policy_match_internal(u_int8_t *parameters, u_int32_t parameters_size, struct necp_aggregate_result *returned_result) +extern char *proc_name_address(void *p); + +#define NECP_VERIFY_DELEGATION_ENTITLEMENT(_p, _d) \ + if (!has_checked_delegation_entitlement) { \ + has_delegation_entitlement = (priv_check_cred(proc_ucred(_p), PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0) == 0); \ + has_checked_delegation_entitlement = TRUE; \ + } \ + if (!has_delegation_entitlement) { \ + NECPLOG(LOG_ERR, "%s(%d) does not hold the necessary entitlement to delegate network traffic for other processes by %s", \ + proc_name_address(_p), proc_pid(_p), _d); \ + break; \ + } + +int +necp_application_find_policy_match_internal(proc_t proc, + u_int8_t *parameters, + u_int32_t parameters_size, + struct necp_aggregate_result *returned_result, + u_int32_t *flags, + u_int required_interface_index) { int error = 0; size_t offset = 0; @@ -4600,21 +5238,28 @@ necp_application_find_policy_match_internal(u_int8_t *parameters, u_int32_t para necp_kernel_policy_result service_action = 0; necp_kernel_policy_service 
service = { 0, 0 }; - pid_t pid = 0; - uid_t uid = 0; u_int16_t protocol = 0; - u_int32_t bound_interface_index = 0; + u_int32_t bound_interface_index = required_interface_index; u_int32_t traffic_class = 0; union necp_sockaddr_union local_addr; union necp_sockaddr_union remote_addr; bool no_remote_addr = FALSE; + u_int8_t remote_family = 0; + bool no_local_addr = FALSE; memset(&local_addr, 0, sizeof(local_addr)); memset(&remote_addr, 0, sizeof(remote_addr)); + + // Initialize UID, PID, and UUIDs to the current process + uid_t uid = kauth_cred_getuid(proc_ucred(proc)); + pid_t pid = proc_pid(proc); uuid_t application_uuid; uuid_clear(application_uuid); uuid_t real_application_uuid; uuid_clear(real_application_uuid); + proc_getexecutableuuid(proc, real_application_uuid, sizeof(real_application_uuid)); + uuid_copy(application_uuid, real_application_uuid); + char *domain = NULL; char *account = NULL; @@ -4622,6 +5267,9 @@ necp_application_find_policy_match_internal(u_int8_t *parameters, u_int32_t para memset(&netagent_ids, 0, sizeof(netagent_ids)); int netagent_cursor; + bool has_checked_delegation_entitlement = FALSE; + bool has_delegation_entitlement = FALSE; + if (returned_result == NULL) { return (EINVAL); } @@ -4646,53 +5294,81 @@ necp_application_find_policy_match_internal(u_int8_t *parameters, u_int32_t para u_int8_t *value = necp_buffer_get_tlv_value(parameters, offset, NULL); if (value != NULL) { switch (type) { - case NECP_POLICY_CONDITION_APPLICATION: { + case NECP_CLIENT_PARAMETER_APPLICATION: { if (length >= sizeof(uuid_t)) { + if (uuid_compare(application_uuid, value) == 0) { + // No delegation + break; + } + + NECP_VERIFY_DELEGATION_ENTITLEMENT(proc, "euuid"); + uuid_copy(application_uuid, value); } break; } - case NECP_POLICY_CONDITION_REAL_APPLICATION: { + case NECP_CLIENT_PARAMETER_REAL_APPLICATION: { if (length >= sizeof(uuid_t)) { + if (uuid_compare(real_application_uuid, value) == 0) { + // No delegation + break; + } + + 
NECP_VERIFY_DELEGATION_ENTITLEMENT(proc, "uuid"); + uuid_copy(real_application_uuid, value); } break; } - case NECP_POLICY_CONDITION_DOMAIN: { + case NECP_CLIENT_PARAMETER_PID: { + if (length >= sizeof(pid_t)) { + if (memcmp(&pid, value, sizeof(pid_t)) == 0) { + // No delegation + break; + } + + NECP_VERIFY_DELEGATION_ENTITLEMENT(proc, "pid"); + + memcpy(&pid, value, sizeof(pid_t)); + } + break; + } + case NECP_CLIENT_PARAMETER_UID: { + if (length >= sizeof(uid_t)) { + if (memcmp(&uid, value, sizeof(uid_t)) == 0) { + // No delegation + break; + } + + NECP_VERIFY_DELEGATION_ENTITLEMENT(proc, "uid"); + + memcpy(&uid, value, sizeof(uid_t)); + } + break; + } + case NECP_CLIENT_PARAMETER_DOMAIN: { domain = (char *)value; domain[length - 1] = 0; break; } - case NECP_POLICY_CONDITION_ACCOUNT: { + case NECP_CLIENT_PARAMETER_ACCOUNT: { account = (char *)value; account[length - 1] = 0; break; } - case NECP_POLICY_CONDITION_TRAFFIC_CLASS: { + case NECP_CLIENT_PARAMETER_TRAFFIC_CLASS: { if (length >= sizeof(u_int32_t)) { memcpy(&traffic_class, value, sizeof(u_int32_t)); } break; } - case NECP_POLICY_CONDITION_PID: { - if (length >= sizeof(pid_t)) { - memcpy(&pid, value, sizeof(pid_t)); - } - break; - } - case NECP_POLICY_CONDITION_UID: { - if (length >= sizeof(uid_t)) { - memcpy(&uid, value, sizeof(uid_t)); - } - break; - } - case NECP_POLICY_CONDITION_IP_PROTOCOL: { + case NECP_CLIENT_PARAMETER_IP_PROTOCOL: { if (length >= sizeof(u_int16_t)) { memcpy(&protocol, value, sizeof(u_int16_t)); } break; } - case NECP_POLICY_CONDITION_BOUND_INTERFACE: { + case NECP_CLIENT_PARAMETER_BOUND_INTERFACE: { if (length <= IFXNAMSIZ && length > 0) { ifnet_t bound_interface = NULL; char interface_name[IFXNAMSIZ]; @@ -4700,21 +5376,26 @@ necp_application_find_policy_match_internal(u_int8_t *parameters, u_int32_t para interface_name[length - 1] = 0; // Make sure the string is NULL terminated if (ifnet_find_by_name(interface_name, &bound_interface) == 0) { bound_interface_index = 
bound_interface->if_index; + ifnet_release(bound_interface); } } break; } - case NECP_POLICY_CONDITION_LOCAL_ADDR: { + case NECP_CLIENT_PARAMETER_LOCAL_ADDRESS: { if (length >= sizeof(struct necp_policy_condition_addr)) { struct necp_policy_condition_addr *address_struct = (struct necp_policy_condition_addr *)(void *)value; - memcpy(&local_addr, &address_struct->address, sizeof(address_struct->address)); + if (necp_address_is_valid(&address_struct->address.sa)) { + memcpy(&local_addr, &address_struct->address, sizeof(address_struct->address)); + } } break; } - case NECP_POLICY_CONDITION_REMOTE_ADDR: { + case NECP_CLIENT_PARAMETER_REMOTE_ADDRESS: { if (length >= sizeof(struct necp_policy_condition_addr)) { struct necp_policy_condition_addr *address_struct = (struct necp_policy_condition_addr *)(void *)value; - memcpy(&remote_addr, &address_struct->address, sizeof(address_struct->address)); + if (necp_address_is_valid(&address_struct->address.sa)) { + memcpy(&remote_addr, &address_struct->address, sizeof(address_struct->address)); + } } break; } @@ -4731,8 +5412,8 @@ necp_application_find_policy_match_internal(u_int8_t *parameters, u_int32_t para // Lock lck_rw_lock_shared(&necp_kernel_policy_lock); - necp_application_fillout_info_locked(application_uuid, real_application_uuid, account, domain, pid, uid, protocol, bound_interface_index, traffic_class, &local_addr, &remote_addr, &info); - matched_policy = necp_socket_find_policy_match_with_info_locked(necp_kernel_socket_policies_app_layer_map, &info, &filter_control_unit, &route_rule_id, &service_action, &service, netagent_ids, NECP_MAX_NETAGENTS); + necp_application_fillout_info_locked(application_uuid, real_application_uuid, account, domain, pid, uid, protocol, bound_interface_index, traffic_class, &local_addr, &remote_addr, proc, &info); + matched_policy = necp_socket_find_policy_match_with_info_locked(necp_kernel_socket_policies_app_layer_map, &info, &filter_control_unit, &route_rule_id, &service_action, &service, 
netagent_ids, NECP_MAX_NETAGENTS, proc); if (matched_policy) { returned_result->policy_id = matched_policy->id; returned_result->routing_result = matched_policy->result; @@ -4787,17 +5468,37 @@ necp_application_find_policy_match_internal(u_int8_t *parameters, u_int32_t para output_bound_interface = returned_result->routing_result_parameter.tunnel_interface_index; } - if (remote_addr.sa.sa_len == 0) { + if (local_addr.sa.sa_len == 0 || + (local_addr.sa.sa_family == AF_INET && local_addr.sin.sin_addr.s_addr == 0) || + (local_addr.sa.sa_family == AF_INET6 && IN6_IS_ADDR_UNSPECIFIED(&local_addr.sin6.sin6_addr))) { + no_local_addr = TRUE; + } + + if (remote_addr.sa.sa_len == 0 || + (remote_addr.sa.sa_family == AF_INET && remote_addr.sin.sin_addr.s_addr == 0) || + (remote_addr.sa.sa_family == AF_INET6 && IN6_IS_ADDR_UNSPECIFIED(&remote_addr.sin6.sin6_addr))) { no_remote_addr = TRUE; - // Default to 0.0.0.0:0 - remote_addr.sa.sa_family = AF_INET; - remote_addr.sa.sa_len = sizeof(struct sockaddr_in); + remote_family = remote_addr.sa.sa_family; + } + + if (no_remote_addr) { + memset(&remote_addr, 0, sizeof(remote_addr)); + if (remote_family == AF_INET6) { + // Reset address to :: + remote_addr.sa.sa_family = AF_INET6; + remote_addr.sa.sa_len = sizeof(struct sockaddr_in6); + } else { + // Reset address to 0.0.0.0 + remote_addr.sa.sa_family = AF_INET; + remote_addr.sa.sa_len = sizeof(struct sockaddr_in); + } } struct rtentry *rt = NULL; - rt = rtalloc1_scoped((struct sockaddr *)&remote_addr, 0, 0, output_bound_interface); + rt = rtalloc1_scoped((struct sockaddr *)&remote_addr, 0, 0, + output_bound_interface); - if (no_remote_addr && + if (no_remote_addr && remote_family == 0 && (rt == NULL || rt->rt_ifp == NULL)) { // Route lookup for default IPv4 failed, try IPv6 @@ -4813,7 +5514,8 @@ necp_application_find_policy_match_internal(u_int8_t *parameters, u_int32_t para remote_addr.sa.sa_len = sizeof(struct sockaddr_in6); // Get route - rt = rtalloc1_scoped((struct sockaddr 
*)&remote_addr, 0, 0, output_bound_interface); + rt = rtalloc1_scoped((struct sockaddr *)&remote_addr, 0, 0, + output_bound_interface); } returned_result->routed_interface_index = 0; @@ -4846,15 +5548,114 @@ necp_application_find_policy_match_internal(u_int8_t *parameters, u_int32_t para } } - bool cellular_denied = FALSE; - bool route_is_allowed = necp_route_is_allowed(rt, NULL, route_rule_id, &cellular_denied); + if (returned_result->routed_interface_index != 0 && + returned_result->routed_interface_index != lo_ifp->if_index && // Loopback can accept any local address + !no_local_addr) { + + // Transform local_addr into the ifaddr form + // IPv6 Scope IDs are always embedded in the ifaddr list + struct sockaddr_storage local_address_sanitized; + u_int ifscope = IFSCOPE_NONE; + (void)sa_copy(&local_addr.sa, &local_address_sanitized, &ifscope); + SIN(&local_address_sanitized)->sin_port = 0; + if (local_address_sanitized.ss_family == AF_INET6) { + SIN6(&local_address_sanitized)->sin6_scope_id = 0; + } + + // Validate local address on routed interface + struct ifaddr *ifa = ifa_ifwithaddr_scoped((struct sockaddr *)&local_address_sanitized, returned_result->routed_interface_index); + if (ifa == NULL) { + // Interface address not found, reject route + returned_result->routed_interface_index = 0; + if (rt != NULL) { + rtfree(rt); + rt = NULL; + } + } else { + ifaddr_release(ifa); + ifa = NULL; + } + } + + if (flags != NULL) { + // Check for local/direct + bool is_local = FALSE; + if (rt != NULL && (rt->rt_flags & RTF_LOCAL)) { + is_local = TRUE; + } else if (returned_result->routed_interface_index != 0 && + !no_remote_addr) { + // Check if remote address is an interface address + struct ifaddr *ifa = ifa_ifwithaddr(&remote_addr.sa); + if (ifa != NULL && ifa->ifa_ifp != NULL) { + u_int if_index_for_remote_addr = ifa->ifa_ifp->if_index; + if (if_index_for_remote_addr == returned_result->routed_interface_index || + if_index_for_remote_addr == lo_ifp->if_index) { + is_local 
= TRUE; + } + } + if (ifa != NULL) { + ifaddr_release(ifa); + ifa = NULL; + } + } + + if (is_local) { + *flags |= (NECP_CLIENT_RESULT_FLAG_IS_LOCAL | NECP_CLIENT_RESULT_FLAG_IS_DIRECT); + } else { + if (rt != NULL && + !(rt->rt_flags & RTF_GATEWAY) && + (rt->rt_ifa && rt->rt_ifa->ifa_ifp && !(rt->rt_ifa->ifa_ifp->if_flags & IFF_POINTOPOINT))) { + // Route is directly accessible + *flags |= NECP_CLIENT_RESULT_FLAG_IS_DIRECT; + } + } + + if (returned_result->routed_interface_index != 0) { + union necp_sockaddr_union default_address; + struct rtentry *v4Route = NULL; + struct rtentry *v6Route = NULL; + + memset(&default_address, 0, sizeof(default_address)); + + // Reset address to 0.0.0.0 + default_address.sa.sa_family = AF_INET; + default_address.sa.sa_len = sizeof(struct sockaddr_in); + v4Route = rtalloc1_scoped((struct sockaddr *)&default_address, 0, 0, + returned_result->routed_interface_index); + + // Reset address to :: + default_address.sa.sa_family = AF_INET6; + default_address.sa.sa_len = sizeof(struct sockaddr_in6); + v6Route = rtalloc1_scoped((struct sockaddr *)&default_address, 0, 0, + returned_result->routed_interface_index); + + if (v4Route != NULL) { + if (v4Route->rt_ifp != NULL) { + *flags |= NECP_CLIENT_RESULT_FLAG_HAS_IPV4; + } + rtfree(v4Route); + v4Route = NULL; + } + + if (v6Route != NULL) { + if (v6Route->rt_ifp != NULL) { + *flags |= NECP_CLIENT_RESULT_FLAG_HAS_IPV6; + } + rtfree(v6Route); + v6Route = NULL; + } + } + } + + u_int32_t interface_type_denied = IFRTYPE_FUNCTIONAL_UNKNOWN; + bool route_is_allowed = necp_route_is_allowed(rt, NULL, route_rule_id, &interface_type_denied); if (!route_is_allowed) { // If the route is blocked, treat the lookup as a drop returned_result->routing_result = NECP_KERNEL_POLICY_RESULT_DROP; memset(&returned_result->routing_result_parameter, 0, sizeof(returned_result->routing_result_parameter)); - if (cellular_denied) { - necp_send_application_cell_denied_event(pid, application_uuid); + if (interface_type_denied 
!= IFRTYPE_FUNCTIONAL_UNKNOWN) { + necp_send_application_interface_denied_event(pid, application_uuid, interface_type_denied); } } @@ -4868,56 +5669,8 @@ necp_application_find_policy_match_internal(u_int8_t *parameters, u_int32_t para return (error); } -#define NECP_MAX_MATCH_POLICY_PARAMETER_SIZE 1024 - -int -necp_match_policy(struct proc *p, struct necp_match_policy_args *uap, int32_t *retval) -{ -#pragma unused(p, retval) - u_int8_t *parameters = NULL; - struct necp_aggregate_result returned_result; - int error = 0; - - if (uap == NULL) { - error = EINVAL; - goto done; - } - - if (uap->parameters == 0 || uap->parameters_size == 0 || uap->parameters_size > NECP_MAX_MATCH_POLICY_PARAMETER_SIZE || uap->returned_result == 0) { - error = EINVAL; - goto done; - } - - MALLOC(parameters, u_int8_t *, uap->parameters_size, M_NECP, M_WAITOK); - if (parameters == NULL) { - error = ENOMEM; - goto done; - } - // Copy parameters in - error = copyin(uap->parameters, parameters, uap->parameters_size); - if (error) { - goto done; - } - - error = necp_application_find_policy_match_internal(parameters, uap->parameters_size, &returned_result); - if (error) { - goto done; - } - - // Copy return value back - error = copyout(&returned_result, uap->returned_result, sizeof(struct necp_aggregate_result)); - if (error) { - goto done; - } -done: - if (parameters != NULL) { - FREE(parameters, M_NECP); - } - return (error); -} - static bool -necp_socket_check_policy(struct necp_kernel_socket_policy *kernel_policy, necp_app_id app_id, necp_app_id real_app_id, errno_t cred_result, u_int32_t account_id, struct substring domain, u_int8_t domain_dot_count, pid_t pid, uid_t uid, u_int32_t bound_interface_index, u_int32_t traffic_class, u_int16_t protocol, union necp_sockaddr_union *local, union necp_sockaddr_union *remote) +necp_socket_check_policy(struct necp_kernel_socket_policy *kernel_policy, necp_app_id app_id, necp_app_id real_app_id, errno_t cred_result, u_int32_t account_id, struct 
substring domain, u_int8_t domain_dot_count, pid_t pid, uid_t uid, u_int32_t bound_interface_index, u_int32_t traffic_class, u_int16_t protocol, union necp_sockaddr_union *local, union necp_sockaddr_union *remote, proc_t proc) { if (!(kernel_policy->condition_mask & NECP_KERNEL_CONDITION_ALL_INTERFACES)) { if (kernel_policy->condition_mask & NECP_KERNEL_CONDITION_BOUND_INTERFACE) { @@ -4972,7 +5725,7 @@ necp_socket_check_policy(struct necp_kernel_socket_policy *kernel_policy, necp_a } } } - + if (kernel_policy->condition_mask & NECP_KERNEL_CONDITION_ENTITLEMENT) { if (cred_result != 0) { // Process is missing entitlement @@ -4980,6 +5733,21 @@ necp_socket_check_policy(struct necp_kernel_socket_policy *kernel_policy, necp_a } } + if (kernel_policy->condition_mask & NECP_KERNEL_CONDITION_CUSTOM_ENTITLEMENT) { + if (kernel_policy->cond_custom_entitlement != NULL) { + if (proc == NULL) { + // No process found, cannot check entitlement + return (FALSE); + } + task_t task = proc_task(proc); + if (task == NULL || + !IOTaskHasEntitlement(task, kernel_policy->cond_custom_entitlement)) { + // Process is missing custom entitlement + return (FALSE); + } + } + } + if (kernel_policy->condition_mask & NECP_KERNEL_CONDITION_DOMAIN) { bool domain_matches = necp_hostname_matches_domain(domain, domain_dot_count, kernel_policy->cond_domain, kernel_policy->cond_domain_dot_count); if (kernel_policy->condition_negated_mask & NECP_KERNEL_CONDITION_DOMAIN) { @@ -5036,7 +5804,7 @@ necp_socket_check_policy(struct necp_kernel_socket_policy *kernel_policy, necp_a } } } - + if (kernel_policy->condition_mask & NECP_KERNEL_CONDITION_TRAFFIC_CLASS) { if (kernel_policy->condition_negated_mask & NECP_KERNEL_CONDITION_TRAFFIC_CLASS) { if (traffic_class >= kernel_policy->cond_traffic_class.start_tc && @@ -5144,7 +5912,7 @@ necp_socket_fillout_info_locked(struct inpcb *inp, struct sockaddr *override_loc if (necp_kernel_socket_policies_condition_mask & NECP_KERNEL_CONDITION_UID) { info->uid = 
kauth_cred_getuid(so->so_cred); } - + if (necp_kernel_socket_policies_condition_mask & NECP_KERNEL_CONDITION_TRAFFIC_CLASS) { info->traffic_class = so->so_traffic_class; } @@ -5171,7 +5939,7 @@ necp_socket_fillout_info_locked(struct inpcb *inp, struct sockaddr *override_loc info->real_application_id = real_existing_mapping->id; } } - + if (necp_kernel_socket_policies_condition_mask & NECP_KERNEL_CONDITION_ENTITLEMENT) { info->cred_result = priv_check_cred(so->so_cred, PRIV_NET_PRIVILEGED_NECP_MATCH, 0); } @@ -5246,7 +6014,7 @@ necp_socket_fillout_info_locked(struct inpcb *inp, struct sockaddr *override_loc } static inline struct necp_kernel_socket_policy * -necp_socket_find_policy_match_with_info_locked(struct necp_kernel_socket_policy **policy_search_array, struct necp_socket_info *info, necp_kernel_policy_filter *return_filter, u_int32_t *return_route_rule_id, necp_kernel_policy_result *return_service_action, necp_kernel_policy_service *return_service, u_int32_t *return_netagent_array, size_t netagent_array_count) +necp_socket_find_policy_match_with_info_locked(struct necp_kernel_socket_policy **policy_search_array, struct necp_socket_info *info, necp_kernel_policy_filter *return_filter, u_int32_t *return_route_rule_id, necp_kernel_policy_result *return_service_action, necp_kernel_policy_service *return_service, u_int32_t *return_netagent_array, size_t netagent_array_count, proc_t proc) { struct necp_kernel_socket_policy *matched_policy = NULL; u_int32_t skip_order = 0; @@ -5301,7 +6069,7 @@ necp_socket_find_policy_match_with_info_locked(struct necp_kernel_socket_policy // Skip this policy continue; } - if (necp_socket_check_policy(policy_search_array[i], info->application_id, info->real_application_id, info->cred_result, info->account_id, domain_substring, domain_dot_count, info->pid, info->uid, info->bound_interface_index, info->traffic_class, info->protocol, &info->local_addr, &info->remote_addr)) { + if (necp_socket_check_policy(policy_search_array[i], 
info->application_id, info->real_application_id, info->cred_result, info->account_id, domain_substring, domain_dot_count, info->pid, info->uid, info->bound_interface_index, info->traffic_class, info->protocol, &info->local_addr, &info->remote_addr, proc)) { if (policy_search_array[i]->result == NECP_KERNEL_POLICY_RESULT_SOCKET_FILTER) { if (return_filter && *return_filter == 0) { *return_filter = policy_search_array[i]->result_parameter.filter_control_unit; @@ -5345,13 +6113,15 @@ necp_socket_find_policy_match_with_info_locked(struct necp_kernel_socket_policy continue; } - // Passed all tests, found a match - matched_policy = policy_search_array[i]; + // Matched policy is a skip. Do skip and continue. if (policy_search_array[i]->result == NECP_KERNEL_POLICY_RESULT_SKIP) { skip_order = policy_search_array[i]->result_parameter.skip_policy_order; skip_session_order = policy_search_array[i]->session_order + 1; continue; } + + // Passed all tests, found a match + matched_policy = policy_search_array[i]; break; } } @@ -5375,23 +6145,23 @@ necp_socket_uses_interface(struct inpcb *inp, u_int32_t interface_index) int i; int family = AF_INET; ifnet_t interface = ifindex2ifnet[interface_index]; - + if (inp == NULL || interface == NULL) { return (FALSE); } - + if (inp->inp_vflag & INP_IPV4) { family = AF_INET; } else if (inp->inp_vflag & INP_IPV6) { family = AF_INET6; } - + result = ifnet_get_address_list_family(interface, &addresses, family); if (result != 0) { NECPLOG(LOG_ERR, "Failed to get address list for %s%d", ifnet_name(interface), ifnet_unit(interface)); return (FALSE); } - + for (i = 0; addresses[i] != NULL; i++) { if (ifaddr_address(addresses[i], &address_storage.sa, sizeof(address_storage)) == 0) { if (family == AF_INET) { @@ -5407,7 +6177,7 @@ necp_socket_uses_interface(struct inpcb *inp, u_int32_t interface_index) } } } - + done: ifnet_free_address_list(addresses); addresses = NULL; @@ -5441,6 +6211,16 @@ necp_socket_find_policy_match(struct inpcb *inp, struct 
sockaddr *override_local return (NECP_KERNEL_POLICY_ID_NONE); } + // Ignore invalid addresses + if (override_local_addr != NULL && + !necp_address_is_valid(override_local_addr)) { + override_local_addr = NULL; + } + if (override_remote_addr != NULL && + !necp_address_is_valid(override_remote_addr)) { + override_remote_addr = NULL; + } + so = inp->inp_socket; // Don't lock. Possible race condition, but we don't want the performance hit. @@ -5449,6 +6229,7 @@ necp_socket_find_policy_match(struct inpcb *inp, struct sockaddr *override_local if (necp_drop_all_order > 0) { inp->inp_policyresult.policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH; inp->inp_policyresult.policy_gencount = 0; + inp->inp_policyresult.app_id = 0; inp->inp_policyresult.flowhash = 0; inp->inp_policyresult.results.filter_control_unit = 0; inp->inp_policyresult.results.route_rule_id = 0; @@ -5461,13 +6242,14 @@ necp_socket_find_policy_match(struct inpcb *inp, struct sockaddr *override_local } return (NECP_KERNEL_POLICY_ID_NONE); } - + // Check for loopback exception if (necp_pass_loopback > 0 && necp_is_loopback(override_local_addr, override_remote_addr, inp, NULL)) { // Mark socket as a pass inp->inp_policyresult.policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH; inp->inp_policyresult.policy_gencount = 0; + inp->inp_policyresult.app_id = 0; inp->inp_policyresult.flowhash = 0; inp->inp_policyresult.results.filter_control_unit = 0; inp->inp_policyresult.results.route_rule_id = 0; @@ -5477,8 +6259,9 @@ necp_socket_find_policy_match(struct inpcb *inp, struct sockaddr *override_local // Lock lck_rw_lock_shared(&necp_kernel_policy_lock); - + necp_socket_fillout_info_locked(inp, override_local_addr, override_remote_addr, override_bound_interface, &info); + inp->inp_policyresult.app_id = info.application_id; // Check info u_int32_t flowhash = necp_socket_calc_flowhash_locked(&info); @@ -5494,7 +6277,7 @@ necp_socket_find_policy_match(struct inpcb *inp, struct sockaddr *override_local } // Match socket to policy - 
matched_policy = necp_socket_find_policy_match_with_info_locked(necp_kernel_socket_policies_map[NECP_SOCKET_MAP_APP_ID_TO_BUCKET(info.application_id)], &info, &filter_control_unit, &route_rule_id, &service_action, &service, netagent_ids, NECP_MAX_NETAGENTS); + matched_policy = necp_socket_find_policy_match_with_info_locked(necp_kernel_socket_policies_map[NECP_SOCKET_MAP_APP_ID_TO_BUCKET(info.application_id)], &info, &filter_control_unit, &route_rule_id, &service_action, &service, netagent_ids, NECP_MAX_NETAGENTS, current_proc()); // If the socket matched a scoped service policy, mark as Drop if not registered. // This covers the cases in which a service is required (on demand) but hasn't started yet. if ((service_action == NECP_KERNEL_POLICY_RESULT_TRIGGER_SCOPED || @@ -5585,7 +6368,7 @@ necp_socket_find_policy_match(struct inpcb *inp, struct sockaddr *override_local if (necp_debug) { NECPLOG(LOG_DEBUG, "Marking socket in state %d as defunct", so->so_state); } - sosetdefunct(current_proc(), so, SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL, TRUE); + sosetdefunct(current_proc(), so, SHUTDOWN_SOCKET_LEVEL_NECP | SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL, TRUE); } else if (necp_socket_is_connected(inp) && matched_policy->result == NECP_KERNEL_POLICY_RESULT_IP_TUNNEL && info.protocol == IPPROTO_TCP) { @@ -5806,7 +6589,7 @@ necp_ip_output_find_policy_match(struct mbuf *packet, int flags, struct ip_out_a if (packet == NULL) { return (NECP_KERNEL_POLICY_ID_NONE); } - + socket_policy_id = necp_get_policy_id_from_packet(packet); // Exit early for an empty list @@ -5829,7 +6612,7 @@ necp_ip_output_find_policy_match(struct mbuf *packet, int flags, struct ip_out_a return (matched_policy_id); } - + // Check for loopback exception if ((necp_pass_loopback > 0 && necp_is_loopback(NULL, NULL, NULL, packet)) || @@ -5841,7 +6624,7 @@ necp_ip_output_find_policy_match(struct mbuf *packet, int flags, struct ip_out_a } return (matched_policy_id); } - + last_interface_index = 
necp_get_last_interface_index_from_packet(packet); // Process packet to get relevant fields @@ -5950,7 +6733,7 @@ necp_ip6_output_find_policy_match(struct mbuf *packet, int flags, struct ip6_out } socket_policy_id = necp_get_policy_id_from_packet(packet); - + // Exit early for an empty list // Don't lock. Possible race condition, but we don't want the performance hit. if (necp_kernel_ip_output_policies_count == 0 || @@ -5971,7 +6754,7 @@ necp_ip6_output_find_policy_match(struct mbuf *packet, int flags, struct ip6_out return (matched_policy_id); } - + // Check for loopback exception if ((necp_pass_loopback > 0 && necp_is_loopback(NULL, NULL, NULL, packet)) || @@ -5983,7 +6766,7 @@ necp_ip6_output_find_policy_match(struct mbuf *packet, int flags, struct ip6_out } return (matched_policy_id); } - + last_interface_index = necp_get_last_interface_index_from_packet(packet); // Process packet to get relevant fields @@ -6262,164 +7045,131 @@ necp_buffer_compare_with_bit_prefix(u_int8_t *p1, u_int8_t *p2, u_int32_t bits) return (TRUE); } -// Socket operations -#define NECP_MAX_SOCKET_ATTRIBUTE_STRING_LENGTH 253 - static bool -necp_set_socket_attribute(u_int8_t *buffer, size_t buffer_length, u_int8_t type, char **buffer_p) +necp_socket_update_qos_marking_inner(struct ifnet *ifp, u_int32_t route_rule_id) { - int error = 0; - int cursor = 0; - size_t string_size = 0; - char *local_string = NULL; - u_int8_t *value = NULL; + bool qos_marking = FALSE; + int exception_index = 0; + struct necp_route_rule *route_rule = NULL; - cursor = necp_buffer_find_tlv(buffer, buffer_length, 0, type, 0); - if (cursor < 0) { - // This will clear out the parameter + route_rule = necp_lookup_route_rule_locked(&necp_route_rules, route_rule_id); + if (route_rule == NULL) { + qos_marking = FALSE; goto done; } - string_size = necp_buffer_get_tlv_length(buffer, cursor); - if (string_size == 0 || string_size > NECP_MAX_SOCKET_ATTRIBUTE_STRING_LENGTH) { - // This will clear out the parameter + qos_marking 
= (route_rule->default_action == NECP_ROUTE_RULE_QOS_MARKING) ? TRUE : FALSE; + + if (ifp == NULL) { goto done; } - MALLOC(local_string, char *, string_size + 1, M_NECP, M_WAITOK); - if (local_string == NULL) { - NECPLOG(LOG_ERR, "Failed to allocate a socket attribute buffer (size %d)", string_size); - goto fail; + for (exception_index = 0; exception_index < MAX_ROUTE_RULE_INTERFACES; exception_index++) { + if (route_rule->exception_if_indices[exception_index] == 0) { + break; + } + if (route_rule->exception_if_actions[exception_index] != NECP_ROUTE_RULE_QOS_MARKING) { + continue; + } + if (route_rule->exception_if_indices[exception_index] == ifp->if_index) { + qos_marking = TRUE; + if (necp_debug > 2) { + NECPLOG(LOG_DEBUG, "QoS Marking : Interface match %d for Rule %d Allowed %d", + route_rule->exception_if_indices[exception_index], route_rule_id, qos_marking); + } + goto done; + } } - value = necp_buffer_get_tlv_value(buffer, cursor, NULL); - if (value == NULL) { - NECPLOG0(LOG_ERR, "Failed to get socket attribute"); - goto fail; + if ((route_rule->cellular_action == NECP_ROUTE_RULE_QOS_MARKING && IFNET_IS_CELLULAR(ifp)) || + (route_rule->wifi_action == NECP_ROUTE_RULE_QOS_MARKING && IFNET_IS_WIFI(ifp)) || + (route_rule->wired_action == NECP_ROUTE_RULE_QOS_MARKING && IFNET_IS_WIRED(ifp)) || + (route_rule->expensive_action == NECP_ROUTE_RULE_QOS_MARKING && IFNET_IS_EXPENSIVE(ifp))) { + qos_marking = TRUE; + if (necp_debug > 2) { + NECPLOG(LOG_DEBUG, "QoS Marking: C:%d WF:%d W:%d E:%d for Rule %d Allowed %d", + route_rule->cellular_action, route_rule->wifi_action, route_rule->wired_action, + route_rule->expensive_action, route_rule_id, qos_marking); + } + goto done; } - - memcpy(local_string, value, string_size); - local_string[string_size] = 0; - done: - if (*buffer_p != NULL) { - FREE(*buffer_p, M_NECP); - *buffer_p = NULL; + if (necp_debug > 1) { + NECPLOG(LOG_DEBUG, "QoS Marking: Rule %d ifp %s Allowed %d", + route_rule_id, ifp ? 
ifp->if_xname : "", qos_marking); } - - *buffer_p = local_string; - return (0); -fail: - if (local_string != NULL) { - FREE(local_string, M_NECP); - } - return (error); + return (qos_marking); } -errno_t -necp_set_socket_attributes(struct socket *so, struct sockopt *sopt) +void +necp_socket_update_qos_marking(struct inpcb *inp, struct rtentry *route, struct ifnet *interface, u_int32_t route_rule_id) { - int error = 0; - u_int8_t *buffer = NULL; - struct inpcb *inp = NULL; - - if ((SOCK_DOM(so) != PF_INET -#if INET6 - && SOCK_DOM(so) != PF_INET6 -#endif - )) { - error = EINVAL; - goto done; - } - - inp = sotoinpcb(so); - - size_t valsize = sopt->sopt_valsize; - if (valsize == 0 || - valsize > ((sizeof(u_int8_t) + sizeof(u_int32_t) + NECP_MAX_SOCKET_ATTRIBUTE_STRING_LENGTH) * 2)) { - goto done; - } - - MALLOC(buffer, u_int8_t *, valsize, M_NECP, M_WAITOK); - if (buffer == NULL) { - goto done; - } + bool qos_marking = FALSE; + struct ifnet *ifp = interface = NULL; - error = sooptcopyin(sopt, buffer, valsize, 0); - if (error) { - goto done; - } + ASSERT(net_qos_policy_restricted != 0); - error = necp_set_socket_attribute(buffer, valsize, NECP_TLV_ATTRIBUTE_DOMAIN, &inp->inp_necp_attributes.inp_domain); - if (error) { - NECPLOG0(LOG_ERR, "Could not set domain TLV for socket attributes"); - goto done; - } - - error = necp_set_socket_attribute(buffer, valsize, NECP_TLV_ATTRIBUTE_ACCOUNT, &inp->inp_necp_attributes.inp_account); - if (error) { - NECPLOG0(LOG_ERR, "Could not set account TLV for socket attributes"); - goto done; + if (inp->inp_socket == NULL) { + return; } - - if (necp_debug) { - NECPLOG(LOG_DEBUG, "Set on socket: Domain %s, Account %s", inp->inp_necp_attributes.inp_domain, inp->inp_necp_attributes.inp_account); + if ((inp->inp_socket->so_flags1 & SOF1_QOSMARKING_POLICY_OVERRIDE)) { + return; } -done: - if (buffer != NULL) { - FREE(buffer, M_NECP); + /* + * This is racy but we do not need the performance hit of taking necp_kernel_policy_lock + */ + if 
(inp->inp_policyresult.results.qos_marking_gencount == necp_kernel_socket_policies_gencount) { + return; } - return (error); -} + lck_rw_lock_shared(&necp_kernel_policy_lock); -errno_t -necp_get_socket_attributes(struct socket *so, struct sockopt *sopt) -{ - int error = 0; - u_int8_t *buffer = NULL; - u_int8_t *cursor = NULL; - size_t valsize = 0; - struct inpcb *inp = sotoinpcb(so); - - if (inp->inp_necp_attributes.inp_domain != NULL) { - valsize += sizeof(u_int8_t) + sizeof(u_int32_t) + strlen(inp->inp_necp_attributes.inp_domain); - } - if (inp->inp_necp_attributes.inp_account != NULL) { - valsize += sizeof(u_int8_t) + sizeof(u_int32_t) + strlen(inp->inp_necp_attributes.inp_account); + if (ifp == NULL && route != NULL) { + ifp = route->rt_ifp; } - if (valsize == 0) { + /* + * By default, until we have a interface, do not mark and reevaluate the Qos marking policy + */ + if (ifp == NULL || route_rule_id == 0) { + qos_marking = FALSE; goto done; } - MALLOC(buffer, u_int8_t *, valsize, M_NECP, M_WAITOK); - if (buffer == NULL) { - goto done; - } - - cursor = buffer; - if (inp->inp_necp_attributes.inp_domain != NULL) { - cursor = necp_buffer_write_tlv(cursor, NECP_TLV_ATTRIBUTE_DOMAIN, strlen(inp->inp_necp_attributes.inp_domain), inp->inp_necp_attributes.inp_domain); - } - - if (inp->inp_necp_attributes.inp_account != NULL) { - cursor = necp_buffer_write_tlv(cursor, NECP_TLV_ATTRIBUTE_ACCOUNT, strlen(inp->inp_necp_attributes.inp_account), inp->inp_necp_attributes.inp_account); + if (ROUTE_RULE_IS_AGGREGATE(route_rule_id)) { + struct necp_aggregate_route_rule *aggregate_route_rule = necp_lookup_aggregate_route_rule_locked(route_rule_id); + if (aggregate_route_rule != NULL) { + int index = 0; + for (index = 0; index < MAX_AGGREGATE_ROUTE_RULES; index++) { + u_int32_t sub_route_rule_id = aggregate_route_rule->rule_ids[index]; + if (sub_route_rule_id == 0) { + break; + } + qos_marking = necp_socket_update_qos_marking_inner(ifp, sub_route_rule_id); + if (qos_marking == 
TRUE) { + break; + } + } + } + } else { + qos_marking = necp_socket_update_qos_marking_inner(ifp, route_rule_id); } + /* + * Now that we have an interface we remember the gencount + */ + inp->inp_policyresult.results.qos_marking_gencount = necp_kernel_socket_policies_gencount; - error = sooptcopyout(sopt, buffer, valsize); - if (error) { - goto done; - } done: - if (buffer != NULL) { - FREE(buffer, M_NECP); - } + lck_rw_done(&necp_kernel_policy_lock); - return (error); + if (qos_marking == TRUE) { + inp->inp_socket->so_flags1 |= SOF1_QOSMARKING_ALLOWED; + } else { + inp->inp_socket->so_flags1 &= ~SOF1_QOSMARKING_ALLOWED; + } } static bool -necp_route_is_allowed_inner(struct rtentry *route, struct ifnet *ifp, u_int32_t route_rule_id, bool *cellular_denied) +necp_route_is_allowed_inner(struct rtentry *route, struct ifnet *ifp, u_int32_t route_rule_id, u_int32_t *interface_type_denied) { bool default_is_allowed = TRUE; u_int8_t type_aggregate_action = NECP_ROUTE_RULE_NONE; @@ -6448,6 +7198,9 @@ necp_route_is_allowed_inner(struct rtentry *route, struct ifnet *ifp, u_int32_t if (route_rule->exception_if_indices[exception_index] == 0) { break; } + if (IS_NECP_ROUTE_RULE_ALLOW_OR_DENY(route_rule->exception_if_actions[exception_index]) == FALSE) { + continue; + } if (route_rule->exception_if_indices[exception_index] == ifp->if_index || (delegated_ifp != NULL && route_rule->exception_if_indices[exception_index] == delegated_ifp->if_index)) { if (necp_debug > 1) { @@ -6457,11 +7210,10 @@ necp_route_is_allowed_inner(struct rtentry *route, struct ifnet *ifp, u_int32_t } } - if (route_rule->cellular_action != NECP_ROUTE_RULE_NONE && + if (IS_NECP_ROUTE_RULE_ALLOW_OR_DENY(route_rule->cellular_action) && IFNET_IS_CELLULAR(ifp)) { - if (cellular_denied != NULL) { - // Let clients know that cellular was blocked - *cellular_denied = TRUE; + if (interface_type_denied != NULL) { + *interface_type_denied = IFRTYPE_FUNCTIONAL_CELLULAR; } if (type_aggregate_action == NECP_ROUTE_RULE_NONE 
|| (type_aggregate_action == NECP_ROUTE_RULE_ALLOW_INTERFACE && @@ -6471,8 +7223,11 @@ necp_route_is_allowed_inner(struct rtentry *route, struct ifnet *ifp, u_int32_t } } - if (route_rule->wifi_action != NECP_ROUTE_RULE_NONE && + if (IS_NECP_ROUTE_RULE_ALLOW_OR_DENY(route_rule->wifi_action) && IFNET_IS_WIFI(ifp)) { + if (interface_type_denied != NULL) { + *interface_type_denied = IFRTYPE_FUNCTIONAL_WIFI_INFRA; + } if (type_aggregate_action == NECP_ROUTE_RULE_NONE || (type_aggregate_action == NECP_ROUTE_RULE_ALLOW_INTERFACE && route_rule->wifi_action == NECP_ROUTE_RULE_DENY_INTERFACE)) { @@ -6481,8 +7236,11 @@ necp_route_is_allowed_inner(struct rtentry *route, struct ifnet *ifp, u_int32_t } } - if (route_rule->wired_action != NECP_ROUTE_RULE_NONE && + if (IS_NECP_ROUTE_RULE_ALLOW_OR_DENY(route_rule->wired_action) && IFNET_IS_WIRED(ifp)) { + if (interface_type_denied != NULL) { + *interface_type_denied = IFRTYPE_FUNCTIONAL_WIRED; + } if (type_aggregate_action == NECP_ROUTE_RULE_NONE || (type_aggregate_action == NECP_ROUTE_RULE_ALLOW_INTERFACE && route_rule->wired_action == NECP_ROUTE_RULE_DENY_INTERFACE)) { @@ -6491,7 +7249,7 @@ necp_route_is_allowed_inner(struct rtentry *route, struct ifnet *ifp, u_int32_t } } - if (route_rule->expensive_action != NECP_ROUTE_RULE_NONE && + if (IS_NECP_ROUTE_RULE_ALLOW_OR_DENY(route_rule->expensive_action) && IFNET_IS_EXPENSIVE(ifp)) { if (type_aggregate_action == NECP_ROUTE_RULE_NONE || (type_aggregate_action == NECP_ROUTE_RULE_ALLOW_INTERFACE && @@ -6515,7 +7273,7 @@ necp_route_is_allowed_inner(struct rtentry *route, struct ifnet *ifp, u_int32_t } static bool -necp_route_is_allowed(struct rtentry *route, struct ifnet *interface, u_int32_t route_rule_id, bool *cellular_denied) +necp_route_is_allowed(struct rtentry *route, struct ifnet *interface, u_int32_t route_rule_id, u_int32_t *interface_type_denied) { if ((route == NULL && interface == NULL) || route_rule_id == 0) { if (necp_debug > 1) { @@ -6533,13 +7291,13 @@ 
necp_route_is_allowed(struct rtentry *route, struct ifnet *interface, u_int32_t if (sub_route_rule_id == 0) { break; } - if (!necp_route_is_allowed_inner(route, interface, sub_route_rule_id, cellular_denied)) { + if (!necp_route_is_allowed_inner(route, interface, sub_route_rule_id, interface_type_denied)) { return (FALSE); } } } } else { - return (necp_route_is_allowed_inner(route, interface, route_rule_id, cellular_denied)); + return (necp_route_is_allowed_inner(route, interface, route_rule_id, interface_type_denied)); } return (TRUE); @@ -6596,7 +7354,7 @@ necp_socket_is_allowed_to_send_recv_internal(struct inpcb *inp, struct sockaddr necp_kernel_policy_service service = { 0, 0 }; u_int32_t route_rule_id = 0; struct rtentry *route = NULL; - bool cellular_denied = FALSE; + u_int32_t interface_type_denied = IFRTYPE_FUNCTIONAL_UNKNOWN; u_int32_t netagent_ids[NECP_MAX_NETAGENTS]; memset(&netagent_ids, 0, sizeof(netagent_ids)); @@ -6637,7 +7395,7 @@ necp_socket_is_allowed_to_send_recv_internal(struct inpcb *inp, struct sockaddr policies_have_changed = TRUE; } else { if (inp->inp_policyresult.results.route_rule_id != 0 && - !necp_route_is_allowed(route, interface, inp->inp_policyresult.results.route_rule_id, &cellular_denied)) { + !necp_route_is_allowed(route, interface, inp->inp_policyresult.results.route_rule_id, &interface_type_denied)) { route_allowed = FALSE; } } @@ -6682,7 +7440,7 @@ necp_socket_is_allowed_to_send_recv_internal(struct inpcb *inp, struct sockaddr (inp->inp_policyresult.results.result == NECP_KERNEL_POLICY_RESULT_IP_TUNNEL && interface && inp->inp_policyresult.results.result_parameter.tunnel_interface_index != verifyifindex) || (inp->inp_policyresult.results.route_rule_id != 0 && - !necp_route_is_allowed(route, interface, inp->inp_policyresult.results.route_rule_id, &cellular_denied))) { + !necp_route_is_allowed(route, interface, inp->inp_policyresult.results.route_rule_id, &interface_type_denied))) { allowed_to_receive = FALSE; } else { if 
(return_policy_id) { @@ -6696,7 +7454,7 @@ necp_socket_is_allowed_to_send_recv_internal(struct inpcb *inp, struct sockaddr goto done; } - struct necp_kernel_socket_policy *matched_policy = necp_socket_find_policy_match_with_info_locked(necp_kernel_socket_policies_map[NECP_SOCKET_MAP_APP_ID_TO_BUCKET(info.application_id)], &info, NULL, &route_rule_id, &service_action, &service, netagent_ids, NECP_MAX_NETAGENTS); + struct necp_kernel_socket_policy *matched_policy = necp_socket_find_policy_match_with_info_locked(necp_kernel_socket_policies_map[NECP_SOCKET_MAP_APP_ID_TO_BUCKET(info.application_id)], &info, NULL, &route_rule_id, &service_action, &service, netagent_ids, NECP_MAX_NETAGENTS, current_proc()); if (matched_policy != NULL) { if (matched_policy->result == NECP_KERNEL_POLICY_RESULT_DROP || matched_policy->result == NECP_KERNEL_POLICY_RESULT_SOCKET_DIVERT || @@ -6706,7 +7464,7 @@ necp_socket_is_allowed_to_send_recv_internal(struct inpcb *inp, struct sockaddr service_action == NECP_KERNEL_POLICY_RESULT_NO_TRIGGER_SCOPED) && service.identifier != 0 && service.identifier != NECP_NULL_SERVICE_ID) || (route_rule_id != 0 && - !necp_route_is_allowed(route, interface, route_rule_id, &cellular_denied)) || + !necp_route_is_allowed(route, interface, route_rule_id, &interface_type_denied)) || !necp_netagents_allow_traffic(netagent_ids, NECP_MAX_NETAGENTS)) { allowed_to_receive = FALSE; } else { @@ -6725,12 +7483,19 @@ necp_socket_is_allowed_to_send_recv_internal(struct inpcb *inp, struct sockaddr goto done; } else if (necp_drop_all_order > 0) { allowed_to_receive = FALSE; + } else { + if (return_policy_id) { + *return_policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH; + } + if (return_route_rule_id) { + *return_route_rule_id = route_rule_id; + } } lck_rw_done(&necp_kernel_policy_lock); done: - if (!allowed_to_receive && cellular_denied) { + if (!allowed_to_receive && interface_type_denied != IFRTYPE_FUNCTIONAL_UNKNOWN) { soevent(inp->inp_socket, (SO_FILT_HINT_LOCKED | 
SO_FILT_HINT_IFDENIED)); } @@ -6776,7 +7541,7 @@ necp_socket_is_allowed_to_send_recv(struct inpcb *inp, necp_kernel_policy_id *re int necp_mark_packet_from_socket(struct mbuf *packet, struct inpcb *inp, necp_kernel_policy_id policy_id, u_int32_t route_rule_id) { - if (packet == NULL || inp == NULL) { + if (packet == NULL || inp == NULL || !(packet->m_flags & M_PKTHDR)) { return (EINVAL); } @@ -6795,6 +7560,7 @@ necp_mark_packet_from_socket(struct mbuf *packet, struct inpcb *inp, necp_kernel } else { packet->m_pkthdr.necp_mtag.necp_route_rule_id = inp->inp_policyresult.results.route_rule_id; } + packet->m_pkthdr.necp_mtag.necp_app_id = inp->inp_policyresult.app_id; return (0); } @@ -6802,7 +7568,7 @@ necp_mark_packet_from_socket(struct mbuf *packet, struct inpcb *inp, necp_kernel int necp_mark_packet_from_ip(struct mbuf *packet, necp_kernel_policy_id policy_id) { - if (packet == NULL) { + if (packet == NULL || !(packet->m_flags & M_PKTHDR)) { return (EINVAL); } @@ -6819,7 +7585,7 @@ necp_mark_packet_from_ip(struct mbuf *packet, necp_kernel_policy_id policy_id) int necp_mark_packet_from_interface(struct mbuf *packet, ifnet_t interface) { - if (packet == NULL) { + if (packet == NULL || !(packet->m_flags & M_PKTHDR)) { return (EINVAL); } @@ -6834,33 +7600,33 @@ necp_mark_packet_from_interface(struct mbuf *packet, ifnet_t interface) int necp_mark_packet_as_keepalive(struct mbuf *packet, bool is_keepalive) { - if (packet == NULL) { + if (packet == NULL || !(packet->m_flags & M_PKTHDR)) { return (EINVAL); } - + if (is_keepalive) { packet->m_pkthdr.pkt_flags |= PKTF_KEEPALIVE; } else { packet->m_pkthdr.pkt_flags &= ~PKTF_KEEPALIVE; } - + return (0); } necp_kernel_policy_id necp_get_policy_id_from_packet(struct mbuf *packet) { - if (packet == NULL) { + if (packet == NULL || !(packet->m_flags & M_PKTHDR)) { return (NECP_KERNEL_POLICY_ID_NONE); } - + return (packet->m_pkthdr.necp_mtag.necp_policy_id); } u_int32_t necp_get_last_interface_index_from_packet(struct mbuf *packet) 
{ - if (packet == NULL) { + if (packet == NULL || !(packet->m_flags & M_PKTHDR)) { return (0); } @@ -6870,20 +7636,44 @@ necp_get_last_interface_index_from_packet(struct mbuf *packet) u_int32_t necp_get_route_rule_id_from_packet(struct mbuf *packet) { - if (packet == NULL) { + if (packet == NULL || !(packet->m_flags & M_PKTHDR)) { return (0); } return (packet->m_pkthdr.necp_mtag.necp_route_rule_id); } +int +necp_get_app_uuid_from_packet(struct mbuf *packet, + uuid_t app_uuid) +{ + if (packet == NULL || !(packet->m_flags & M_PKTHDR)) { + return (EINVAL); + } + + bool found_mapping = FALSE; + if (packet->m_pkthdr.necp_mtag.necp_app_id != 0) { + lck_rw_lock_shared(&necp_kernel_policy_lock); + struct necp_uuid_id_mapping *entry = necp_uuid_lookup_uuid_with_app_id_locked(packet->m_pkthdr.necp_mtag.necp_app_id); + if (entry != NULL) { + uuid_copy(app_uuid, entry->uuid); + found_mapping = true; + } + lck_rw_done(&necp_kernel_policy_lock); + } + if (!found_mapping) { + uuid_clear(app_uuid); + } + return (0); +} + bool necp_get_is_keepalive_from_packet(struct mbuf *packet) { - if (packet == NULL) { + if (packet == NULL || !(packet->m_flags & M_PKTHDR)) { return (FALSE); } - + return (packet->m_pkthdr.pkt_flags & PKTF_KEEPALIVE); } @@ -6891,7 +7681,7 @@ u_int32_t necp_socket_get_content_filter_control_unit(struct socket *so) { struct inpcb *inp = sotoinpcb(so); - + if (inp == NULL) { return (0); } @@ -6928,7 +7718,7 @@ necp_socket_should_rescope(struct inpcb *inp) if (inp == NULL) { return (FALSE); } - + return (inp->inp_policyresult.results.result == NECP_KERNEL_POLICY_RESULT_SOCKET_SCOPED); } @@ -6938,11 +7728,11 @@ necp_socket_get_rescope_if_index(struct inpcb *inp) if (inp == NULL) { return (0); } - + if (inp->inp_policyresult.results.result == NECP_KERNEL_POLICY_RESULT_SOCKET_SCOPED) { return (inp->inp_policyresult.results.result_parameter.scoped_interface_index); } - + return (0); } @@ -7072,13 +7862,13 @@ necp_addr_is_loopback(struct sockaddr *address) if (address == 
NULL) { return (FALSE); } - + if (address->sa_family == AF_INET) { return (ntohl(((struct sockaddr_in *)(void *)address)->sin_addr.s_addr) == INADDR_LOOPBACK); } else if (address->sa_family == AF_INET6) { return IN6_IS_ADDR_LOOPBACK(&((struct sockaddr_in6 *)(void *)address)->sin6_addr); } - + return (FALSE); } @@ -7093,11 +7883,11 @@ necp_is_loopback(struct sockaddr *local_addr, struct sockaddr *remote_addr, stru if (local_addr != NULL && necp_addr_is_loopback(local_addr)) { return (TRUE); } - + if (remote_addr != NULL && necp_addr_is_loopback(remote_addr)) { return (TRUE); } - + if (inp != NULL) { if ((inp->inp_flags & INP_BOUND_IF) && inp->inp_boundifp && (inp->inp_boundifp->if_flags & IFF_LOOPBACK)) { return (TRUE); @@ -7114,7 +7904,7 @@ necp_is_loopback(struct sockaddr *local_addr, struct sockaddr *remote_addr, stru } } } - + if (packet != NULL) { struct ip *ip = mtod(packet, struct ip *); if (ip->ip_v == 4) { @@ -7134,6 +7924,6 @@ necp_is_loopback(struct sockaddr *local_addr, struct sockaddr *remote_addr, stru } } } - + return (FALSE); } diff --git a/bsd/net/necp.h b/bsd/net/necp.h index 2aebe9c27..14609d65d 100644 --- a/bsd/net/necp.h +++ b/bsd/net/necp.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013, 2014 Apple Inc. All rights reserved. + * Copyright (c) 2013-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -29,8 +29,12 @@ #ifndef _NET_NECP_H_ #define _NET_NECP_H_ +#include +#ifdef PRIVATE + #include #include +#include /* * Name registered by the ipsec kernel control @@ -44,6 +48,10 @@ struct necp_packet_header { u_int8_t flags; u_int32_t message_id; }; + +/* + * Control message commands + */ #define NECP_PACKET_TYPE_POLICY_ADD 1 #define NECP_PACKET_TYPE_POLICY_GET 2 #define NECP_PACKET_TYPE_POLICY_DELETE 3 @@ -54,9 +62,16 @@ struct necp_packet_header { #define NECP_PACKET_TYPE_LOCK_SESSION_TO_PROC 8 #define NECP_PACKET_TYPE_REGISTER_SERVICE 9 #define NECP_PACKET_TYPE_UNREGISTER_SERVICE 10 +#define NECP_PACKET_TYPE_POLICY_DUMP_ALL 11 +/* + * Control message flags + */ #define NECP_PACKET_FLAGS_RESPONSE 0x01 // Used for acks, errors, and query responses +/* + * Control message TLV types + */ #define NECP_TLV_NIL 0 #define NECP_TLV_ERROR 1 // u_int32_t #define NECP_TLV_POLICY_ORDER 2 // u_int32_t @@ -69,17 +84,29 @@ struct necp_packet_header { #define NECP_TLV_SERVICE_UUID 9 // uuid_t #define NECP_TLV_ROUTE_RULE 10 +/* + * Control message TLV sent only by the kernel to userspace + */ +#define NECP_TLV_POLICY_OWNER 100 // char [] +#define NECP_TLV_POLICY_DUMP 101 +#define NECP_TLV_POLICY_RESULT_STRING 102 // char [] +#define NECP_TLV_POLICY_SESSION_ORDER 103 // u_int32_t + +/* + * Condition flags + */ #define NECP_POLICY_CONDITION_FLAGS_NEGATIVE 0x01 // Negative -// Conditions +/* + * Conditions + * Used for setting policies as well as passing parameters to necp_match_policy. + */ #define NECP_POLICY_CONDITION_DEFAULT 0 // N/A, not valid with any other conditions // Socket/Application conditions #define NECP_POLICY_CONDITION_APPLICATION 1 // uuid_t, uses effective UUID when possible #define NECP_POLICY_CONDITION_REAL_APPLICATION 2 // uuid_t, never uses effective UUID. 
Only valid with NECP_POLICY_CONDITION_APPLICATION -// Application-only Conditions #define NECP_POLICY_CONDITION_DOMAIN 3 // String, such as apple.com #define NECP_POLICY_CONDITION_ACCOUNT 4 // String -// Socket/Application condition #define NECP_POLICY_CONDITION_ENTITLEMENT 5 // String #define NECP_POLICY_CONDITION_PID 6 // pid_t #define NECP_POLICY_CONDITION_UID 7 // uid_t @@ -93,7 +120,9 @@ struct necp_packet_header { #define NECP_POLICY_CONDITION_LOCAL_ADDR_RANGE 14 // necp_policy_condition_addr_range #define NECP_POLICY_CONDITION_REMOTE_ADDR_RANGE 15 // necp_policy_condition_addr_range -// Results +/* + * Results + */ #define NECP_POLICY_RESULT_PASS 1 // N/A #define NECP_POLICY_RESULT_SKIP 2 // u_int32_t, policy order to skip to. 0 to skip all session policies. #define NECP_POLICY_RESULT_DROP 3 // N/A @@ -111,17 +140,23 @@ struct necp_packet_header { #define NECP_POLICY_RESULT_MAX NECP_POLICY_RESULT_USE_NETAGENT -// Route rule +/* + * Route Rules + * Detailed parameters for NECP_POLICY_RESULT_ROUTE_RULES. + */ #define NECP_ROUTE_RULE_NONE 0 // N/A #define NECP_ROUTE_RULE_DENY_INTERFACE 1 // String, or empty to match all #define NECP_ROUTE_RULE_ALLOW_INTERFACE 2 // String, or empty to match all +#define NECP_ROUTE_RULE_QOS_MARKING 3 // String, or empty to match all #define NECP_ROUTE_RULE_FLAG_CELLULAR 0x01 #define NECP_ROUTE_RULE_FLAG_WIFI 0x02 #define NECP_ROUTE_RULE_FLAG_WIRED 0x04 #define NECP_ROUTE_RULE_FLAG_EXPENSIVE 0x08 -// Errors +/* + * Error types + */ #define NECP_ERROR_INTERNAL 0 #define NECP_ERROR_UNKNOWN_PACKET_TYPE 1 #define NECP_ERROR_INVALID_TLV 2 @@ -199,8 +234,231 @@ struct necp_aggregate_result { u_int32_t netagent_flags[NECP_MAX_NETAGENTS]; }; -#define KEV_NECP_SUBCLASS 8 -#define KEV_NECP_POLICIES_CHANGED 1 +/* + * Statistics. It would be nice if the definitions in ntstat.h could be used, + * but they get entangled with #defines for v4 etc in pfvar.h and it may be better practice + * to have separate definitions here. 
+ */ +typedef struct necp_stat_counts +{ + /* Counters */ + u_int64_t necp_stat_rxpackets __attribute__((aligned(8))); + u_int64_t necp_stat_rxbytes __attribute__((aligned(8))); + u_int64_t necp_stat_txpackets __attribute__((aligned(8))); + u_int64_t necp_stat_txbytes __attribute__((aligned(8))); + + u_int32_t necp_stat_rxduplicatebytes; + u_int32_t necp_stat_rxoutoforderbytes; + u_int32_t necp_stat_txretransmit; + + u_int32_t necp_stat_connectattempts; + u_int32_t necp_stat_connectsuccesses; + + u_int32_t necp_stat_min_rtt; + u_int32_t necp_stat_avg_rtt; + u_int32_t necp_stat_var_rtt; + + u_int64_t necp_stat_cell_rxbytes __attribute__((aligned(8))); + u_int64_t necp_stat_cell_txbytes __attribute__((aligned(8))); + u_int64_t necp_stat_wifi_rxbytes __attribute__((aligned(8))); + u_int64_t necp_stat_wifi_txbytes __attribute__((aligned(8))); + u_int64_t necp_stat_wired_rxbytes __attribute__((aligned(8))); + u_int64_t necp_stat_wired_txbytes __attribute__((aligned(8))); +} necp_stat_counts; + +// Note, some metadata is implicit in the necp client itself: +// From the process itself : pid, upid, uuid, proc name. +// From the necp client parameters: local and remote addresses, euuid, traffic class, ifindex +// +// The following may well be supplied via future necp client parameters, +// but they are here so they don't get forgotten. +typedef struct necp_basic_metadata +{ + u_int32_t rcvbufsize; + u_int32_t rcvbufused; + + u_int64_t eupid; // Not currently used by NetworkStatistics, could skip. 
+ u_int32_t epid; + + uuid_t vuuid; // Effective UUID as given from voucher + uint16_t ifnet_properties; +} necp_basic_metadata; + +struct necp_tcp_probe_status { + unsigned int probe_activated : 1; + unsigned int write_probe_failed : 1; + unsigned int read_probe_failed : 1; + unsigned int conn_probe_failed : 1; +}; + +typedef struct necp_extra_tcp_metadata +{ + struct necp_tcp_probe_status probestatus; + + u_int32_t sndbufsize; + u_int32_t sndbufused; + u_int32_t txunacked; + u_int32_t txwindow; + u_int32_t txcwindow; + u_int32_t traffic_mgt_flags; + u_int32_t cc_alg_index; + u_int32_t state; +} necp_extra_tcp_metadata; + +typedef struct necp_stats_hdr { + u_int32_t necp_stats_type __attribute__((aligned(8))); + u_int32_t necp_stats_ver; + u_int64_t necp_stats_event; +} necp_stats_hdr; + +#define NECP_CLIENT_STATISTICS_TYPE_TCP 1 // Identifies use of necp_tcp_stats +#define NECP_CLIENT_STATISTICS_TYPE_UDP 2 // Identifies use of necp_udp_stats +#define NECP_CLIENT_STATISTICS_TYPE_TCP_VER_1 1 // Currently supported version for TCP +#define NECP_CLIENT_STATISTICS_TYPE_UDP_VER_1 1 // Currently supported version for UDP + +typedef struct necp_tcp_stats { + necp_stats_hdr necp_tcp_hdr; + necp_stat_counts necp_tcp_counts; + necp_basic_metadata necp_tcp_basic; + necp_extra_tcp_metadata necp_tcp_extra; +} necp_tcp_stats; + +typedef struct necp_udp_stats { + necp_stats_hdr necp_udp_hdr; + necp_stat_counts necp_udp_counts; + necp_basic_metadata necp_udp_basic; +} necp_udp_stats; + +typedef struct necp_all_stats { + union { + necp_tcp_stats tcp_stats; + necp_udp_stats udp_stats; + } all_stats_u; +} necp_all_stats; + +/* + * NECP Client definitions + */ +#define NECP_MAX_CLIENT_PARAMETERS_SIZE 1024 +#define NECP_MAX_CLIENT_RESULT_SIZE 512 + +#define NECP_OPEN_FLAG_OBSERVER 0x01 // Observers can query clients they don't own + +#define NECP_CLIENT_ACTION_ADD 1 // Register a new client. 
Input: parameters in buffer; Output: client_id +#define NECP_CLIENT_ACTION_REMOVE 2 // Unregister a client. Input: client_id +#define NECP_CLIENT_ACTION_COPY_PARAMETERS 3 // Copy client parameters. Input: client_id; Output: parameters in buffer +#define NECP_CLIENT_ACTION_COPY_RESULT 4 // Copy client result. Input: client_id; Output: result in buffer +#define NECP_CLIENT_ACTION_COPY_LIST 5 // Copy all client IDs. Output: struct necp_client_list in buffer +#define NECP_CLIENT_ACTION_REQUEST_NEXUS_INSTANCE 6 // Request a nexus instance from a nexus provider +#define NECP_CLIENT_ACTION_AGENT 7 // Interact with agent. Input: client_id, agent parameters +#define NECP_CLIENT_ACTION_COPY_AGENT 8 // Copy agent content. Input: agent UUID; Output: struct netagent +#define NECP_CLIENT_ACTION_COPY_INTERFACE 9 // Copy interface details. Input: ifindex cast to UUID; Output: struct necp_interface_details +#define NECP_CLIENT_ACTION_SET_STATISTICS 10 // Start/update/complete per-flow statistics. Input: client_id, statistics area + +#define NECP_CLIENT_PARAMETER_APPLICATION NECP_POLICY_CONDITION_APPLICATION // Requires entitlement +#define NECP_CLIENT_PARAMETER_REAL_APPLICATION NECP_POLICY_CONDITION_REAL_APPLICATION // Requires entitlement +#define NECP_CLIENT_PARAMETER_DOMAIN NECP_POLICY_CONDITION_DOMAIN +#define NECP_CLIENT_PARAMETER_ACCOUNT NECP_POLICY_CONDITION_ACCOUNT +#define NECP_CLIENT_PARAMETER_PID NECP_POLICY_CONDITION_PID // Requires entitlement +#define NECP_CLIENT_PARAMETER_UID NECP_POLICY_CONDITION_UID // Requires entitlement +#define NECP_CLIENT_PARAMETER_BOUND_INTERFACE NECP_POLICY_CONDITION_BOUND_INTERFACE +#define NECP_CLIENT_PARAMETER_TRAFFIC_CLASS NECP_POLICY_CONDITION_TRAFFIC_CLASS +#define NECP_CLIENT_PARAMETER_IP_PROTOCOL NECP_POLICY_CONDITION_IP_PROTOCOL +#define NECP_CLIENT_PARAMETER_LOCAL_ADDRESS NECP_POLICY_CONDITION_LOCAL_ADDR +#define NECP_CLIENT_PARAMETER_REMOTE_ADDRESS NECP_POLICY_CONDITION_REMOTE_ADDR + +// "Prohibit" will never choose an interface 
with that property +#define NECP_CLIENT_PARAMETER_PROHIBIT_INTERFACE 100 // String, interface name +#define NECP_CLIENT_PARAMETER_PROHIBIT_IF_TYPE 101 // u_int8_t, see ifru_functional_type in +#define NECP_CLIENT_PARAMETER_PROHIBIT_AGENT 102 // uuid_t, network agent UUID +#define NECP_CLIENT_PARAMETER_PROHIBIT_AGENT_TYPE 103 // struct necp_client_parameter_netagent_type + +// "Require" will choose an interface with that property, or none if not found +#define NECP_CLIENT_PARAMETER_REQUIRE_IF_TYPE 111 // u_int8_t, see ifru_functional_type in +#define NECP_CLIENT_PARAMETER_REQUIRE_AGENT 112 // uuid_t, network agent UUID +#define NECP_CLIENT_PARAMETER_REQUIRE_AGENT_TYPE 113 // struct necp_client_parameter_netagent_type + +// "Prefer" will choose an interface with that property, or best otherwise if not found +#define NECP_CLIENT_PARAMETER_PREFER_AGENT 122 // uuid_t, network agent UUID +#define NECP_CLIENT_PARAMETER_PREFER_AGENT_TYPE 123 // struct necp_client_parameter_netagent_type + +// Use actions with NECP_CLIENT_ACTION_AGENT +#define NECP_CLIENT_PARAMETER_TRIGGER_AGENT 130 // uuid_t, network agent UUID +#define NECP_CLIENT_PARAMETER_ASSERT_AGENT 131 // uuid_t, network agent UUID +#define NECP_CLIENT_PARAMETER_UNASSERT_AGENT 132 // uuid_t, network agent UUID + +#define NECP_CLIENT_PARAMETER_LOCAL_ENDPOINT 200 // struct necp_client_endpoint +#define NECP_CLIENT_PARAMETER_REMOTE_ENDPOINT 201 // struct necp_client_endpoint +#define NECP_CLIENT_PARAMETER_RESERVED_START 1000 // Parameters 1000 and higher are reserved for custom userspace options + +#define NECP_CLIENT_RESULT_CLIENT_ID 1 // uuid_t +#define NECP_CLIENT_RESULT_POLICY_RESULT 2 // u_int32_t +#define NECP_CLIENT_RESULT_POLICY_RESULT_PARAMETER 3 // u_int32_t +#define NECP_CLIENT_RESULT_FILTER_CONTROL_UNIT 4 // u_int32_t +#define NECP_CLIENT_RESULT_INTERFACE_INDEX 5 // u_int32_t +#define NECP_CLIENT_RESULT_NETAGENT 6 // struct necp_client_result_netagent +#define NECP_CLIENT_RESULT_FLAGS 7 // u_int32_t, see 
NECP_CLIENT_RESULT_FLAG_* values +#define NECP_CLIENT_RESULT_INTERFACE 8 // struct necp_client_result_interface + +#define NECP_CLIENT_RESULT_NEXUS_INSTANCE 100 // uuid_t +#define NECP_CLIENT_RESULT_NEXUS_PORT 101 // u_int16_t + +#define NECP_CLIENT_RESULT_LOCAL_ENDPOINT 200 // struct necp_client_endpoint +#define NECP_CLIENT_RESULT_REMOTE_ENDPOINT 201 // struct necp_client_endpoint + +#define NECP_CLIENT_RESULT_FLAG_IS_LOCAL 0x0001 // Routes to this device +#define NECP_CLIENT_RESULT_FLAG_IS_DIRECT 0x0002 // Routes to directly accessible peer +#define NECP_CLIENT_RESULT_FLAG_HAS_IPV4 0x0004 // Supports IPv4 +#define NECP_CLIENT_RESULT_FLAG_HAS_IPV6 0x0008 // Supports IPv6 + +struct necp_interface_details { + char name[IFXNAMSIZ]; + u_int32_t index; + u_int32_t generation; + u_int32_t functional_type; + u_int32_t delegate_index; + u_int32_t flags; // see NECP_INTERFACE_FLAG_* + u_int32_t mtu; + u_int8_t ipv4_signature[IFNET_SIGNATURELEN]; + u_int8_t ipv6_signature[IFNET_SIGNATURELEN]; +}; + +#define NECP_INTERFACE_FLAG_EXPENSIVE 0x0001 + +struct necp_client_parameter_netagent_type { + char netagent_domain[32]; + char netagent_type[32]; +}; + +struct necp_client_result_netagent { + u_int32_t generation; + uuid_t netagent_uuid; +}; + +struct necp_client_result_interface { + u_int32_t generation; + u_int32_t index; +}; + +struct necp_client_endpoint { + union { + struct sockaddr sa; + struct sockaddr_in sin; + struct sockaddr_in6 sin6; + struct { + u_int8_t endpoint_length; + u_int8_t endpoint_family; // Use AF_UNSPEC to target a name + u_int16_t endpoint_port; + u_int32_t endpoint_type; // Client-specific type + char endpoint_data[0]; // Type-specific endpoint value + } endpoint; + } u; +}; + +struct necp_client_list { + u_int32_t client_count; + uuid_t clients[0]; +}; struct kev_necp_policies_changed_data { u_int32_t changed_count; // Defaults to 0. 
@@ -212,6 +470,35 @@ struct kev_necp_policies_changed_data { #include #include #include +#include +#include +#include + +#define NECPLOG(level, format, ...) do { \ + log((level > LOG_NOTICE ? LOG_NOTICE : level), "%s: " format "\n", __FUNCTION__, __VA_ARGS__); \ +} while (0) + +#define NECPLOG0(level, msg) do { \ + log((level > LOG_NOTICE ? LOG_NOTICE : level), "%s: %s\n", __FUNCTION__, msg); \ +} while (0) + +extern errno_t necp_client_init(void); +extern int necp_application_find_policy_match_internal(proc_t proc, u_int8_t *parameters, u_int32_t parameters_size, + struct necp_aggregate_result *returned_result, + u_int32_t *flags, u_int required_interface_index); +/* + * TLV utilities + * + * Note that these functions (other than necp_buffer_find_tlv) do not check the length of the entire buffer, + * so the caller must be sure that the entire TLV is within bounds. + */ +extern u_int8_t *necp_buffer_write_tlv(u_int8_t *buffer, u_int8_t type, u_int32_t length, const void *value); +extern u_int8_t *necp_buffer_write_tlv_if_different(u_int8_t *buffer, const u_int8_t *max, u_int8_t type, + u_int32_t length, const void *value, bool *updated); +extern u_int8_t necp_buffer_get_tlv_type(u_int8_t *buffer, int tlv_offset); +extern u_int32_t necp_buffer_get_tlv_length(u_int8_t *buffer, int tlv_offset); +extern u_int8_t *necp_buffer_get_tlv_value(u_int8_t *buffer, int tlv_offset, u_int32_t *value_size); +extern int necp_buffer_find_tlv(u_int8_t *buffer, u_int32_t buffer_length, int offset, u_int8_t type, int next); #define NECPCTL_DROP_ALL_LEVEL 1 /* Drop all packets if no policy matches above this level */ #define NECPCTL_DEBUG 2 /* Log all kernel policy matches */ @@ -288,6 +575,7 @@ struct necp_kernel_socket_policy { necp_kernel_policy_id cond_policy_id; u_int32_t cond_app_id; // Locally assigned ID value stored u_int32_t cond_real_app_id; // Locally assigned ID value stored + char *cond_custom_entitlement; // String u_int32_t cond_account_id; // Locally assigned ID value 
stored char *cond_domain; // String u_int8_t cond_domain_dot_count; // Number of dots in cond_domain @@ -367,57 +655,90 @@ struct necp_aggregate_socket_result { necp_kernel_policy_result_parameter result_parameter; necp_kernel_policy_filter filter_control_unit; u_int32_t route_rule_id; + int32_t qos_marking_gencount; }; struct necp_inpcb_result { - char *application_layer_domain; - u_int32_t application_layer_account_id; + u_int32_t app_id; necp_kernel_policy_id policy_id; int32_t policy_gencount; u_int32_t flowhash; struct necp_aggregate_socket_result results; }; -errno_t necp_init(void); +extern errno_t necp_init(void); + +extern errno_t necp_set_socket_attributes(struct socket *so, struct sockopt *sopt); +extern errno_t necp_get_socket_attributes(struct socket *so, struct sockopt *sopt); +extern void necp_inpcb_dispose(struct inpcb *inp); + +extern u_int32_t necp_socket_get_content_filter_control_unit(struct socket *so); -errno_t necp_set_socket_attributes(struct socket *so, struct sockopt *sopt); -errno_t necp_get_socket_attributes(struct socket *so, struct sockopt *sopt); +extern bool necp_socket_should_use_flow_divert(struct inpcb *inp); +extern u_int32_t necp_socket_get_flow_divert_control_unit(struct inpcb *inp); -u_int32_t necp_socket_get_content_filter_control_unit(struct socket *so); +extern bool necp_socket_should_rescope(struct inpcb *inp); +extern u_int necp_socket_get_rescope_if_index(struct inpcb *inp); +extern u_int32_t necp_socket_get_effective_mtu(struct inpcb *inp, u_int32_t current_mtu); -bool necp_socket_should_use_flow_divert(struct inpcb *inp); -u_int32_t necp_socket_get_flow_divert_control_unit(struct inpcb *inp); +extern bool necp_socket_is_allowed_to_send_recv(struct inpcb *inp, necp_kernel_policy_id *return_policy_id, + u_int32_t *return_route_rule_id); +extern bool necp_socket_is_allowed_to_send_recv_v4(struct inpcb *inp, u_int16_t local_port, + u_int16_t remote_port, struct in_addr *local_addr, + struct in_addr *remote_addr, ifnet_t 
interface, + necp_kernel_policy_id *return_policy_id, u_int32_t *return_route_rule_id); +extern bool necp_socket_is_allowed_to_send_recv_v6(struct inpcb *inp, u_int16_t local_port, + u_int16_t remote_port, struct in6_addr *local_addr, + struct in6_addr *remote_addr, ifnet_t interface, + necp_kernel_policy_id *return_policy_id, u_int32_t *return_route_rule_id); +extern void necp_socket_update_qos_marking(struct inpcb *inp, struct rtentry *route, struct ifnet *interface, u_int32_t route_rule_id); +extern int necp_mark_packet_from_socket(struct mbuf *packet, struct inpcb *inp, necp_kernel_policy_id policy_id, + u_int32_t route_rule_id); +extern necp_kernel_policy_id necp_get_policy_id_from_packet(struct mbuf *packet); +extern u_int32_t necp_get_last_interface_index_from_packet(struct mbuf *packet); +extern u_int32_t necp_get_route_rule_id_from_packet(struct mbuf *packet); +extern int necp_get_app_uuid_from_packet(struct mbuf *packet, + uuid_t app_uuid); -bool necp_socket_should_rescope(struct inpcb *inp); -u_int necp_socket_get_rescope_if_index(struct inpcb *inp); -u_int32_t necp_socket_get_effective_mtu(struct inpcb *inp, u_int32_t current_mtu); +extern necp_kernel_policy_id necp_socket_find_policy_match(struct inpcb *inp, struct sockaddr *override_local_addr, + struct sockaddr *override_remote_addr, u_int32_t override_bound_interface); +extern necp_kernel_policy_id necp_ip_output_find_policy_match(struct mbuf *packet, int flags, struct ip_out_args *ipoa, + necp_kernel_policy_result *result, + necp_kernel_policy_result_parameter *result_parameter); +extern necp_kernel_policy_id necp_ip6_output_find_policy_match(struct mbuf *packet, int flags, struct ip6_out_args *ip6oa, + necp_kernel_policy_result *result, + necp_kernel_policy_result_parameter *result_parameter); -bool necp_socket_is_allowed_to_send_recv(struct inpcb *inp, necp_kernel_policy_id *return_policy_id, u_int32_t *return_route_rule_id); -bool necp_socket_is_allowed_to_send_recv_v4(struct inpcb *inp, 
u_int16_t local_port, u_int16_t remote_port, struct in_addr *local_addr, struct in_addr *remote_addr, ifnet_t interface, necp_kernel_policy_id *return_policy_id, u_int32_t *return_route_rule_id); -bool necp_socket_is_allowed_to_send_recv_v6(struct inpcb *inp, u_int16_t local_port, u_int16_t remote_port, struct in6_addr *local_addr, struct in6_addr *remote_addr, ifnet_t interface, necp_kernel_policy_id *return_policy_id, u_int32_t *return_route_rule_id); -int necp_mark_packet_from_socket(struct mbuf *packet, struct inpcb *inp, necp_kernel_policy_id policy_id, u_int32_t route_rule_id); -necp_kernel_policy_id necp_get_policy_id_from_packet(struct mbuf *packet); -u_int32_t necp_get_last_interface_index_from_packet(struct mbuf *packet); -u_int32_t necp_get_route_rule_id_from_packet(struct mbuf *packet); +extern int necp_mark_packet_from_ip(struct mbuf *packet, necp_kernel_policy_id policy_id); +extern int necp_mark_packet_from_interface(struct mbuf *packet, ifnet_t interface); -necp_kernel_policy_id necp_socket_find_policy_match(struct inpcb *inp, struct sockaddr *override_local_addr, struct sockaddr *override_remote_addr, u_int32_t override_bound_interface); -necp_kernel_policy_id necp_ip_output_find_policy_match(struct mbuf *packet, int flags, struct ip_out_args *ipoa, necp_kernel_policy_result *result, necp_kernel_policy_result_parameter *result_parameter); -necp_kernel_policy_id necp_ip6_output_find_policy_match(struct mbuf *packet, int flags, struct ip6_out_args *ip6oa, necp_kernel_policy_result *result, necp_kernel_policy_result_parameter *result_parameter); +extern ifnet_t necp_get_ifnet_from_result_parameter(necp_kernel_policy_result_parameter *result_parameter); +extern bool necp_packet_can_rebind_to_ifnet(struct mbuf *packet, struct ifnet *interface, struct route *new_route, int family); -int necp_mark_packet_from_ip(struct mbuf *packet, necp_kernel_policy_id policy_id); -int necp_mark_packet_from_interface(struct mbuf *packet, ifnet_t interface); +extern bool 
necp_packet_is_allowed_over_interface(struct mbuf *packet, struct ifnet *interface); -ifnet_t necp_get_ifnet_from_result_parameter(necp_kernel_policy_result_parameter *result_parameter); -bool necp_packet_can_rebind_to_ifnet(struct mbuf *packet, struct ifnet *interface, struct route *new_route, int family); +extern int necp_mark_packet_as_keepalive(struct mbuf *packet, bool is_keepalive); +extern bool necp_get_is_keepalive_from_packet(struct mbuf *packet); -bool necp_packet_is_allowed_over_interface(struct mbuf *packet, struct ifnet *interface); +extern void necp_update_all_clients(void); // Handle general re-evaluate event -int necp_mark_packet_as_keepalive(struct mbuf *packet, bool is_keepalive); -bool necp_get_is_keepalive_from_packet(struct mbuf *packet); +extern void necp_force_update_client(uuid_t client_id, uuid_t remove_netagent_uuid); // Cause a single client to get an update event + +extern int necp_assign_client_result(uuid_t netagent_uuid, uuid_t client_id, + u_int8_t *assigned_results, size_t assigned_results_length); #endif /* BSD_KERNEL_PRIVATE */ #ifndef KERNEL -int necp_match_policy(const uint8_t *parameters, size_t parameters_size, struct necp_aggregate_result *returned_result); + +extern int necp_match_policy(const uint8_t *parameters, size_t parameters_size, struct necp_aggregate_result *returned_result); + +extern int necp_open(int flags); + +extern int necp_client_action(int necp_fd, uint32_t action, uuid_t client_id, + size_t client_id_len, uint8_t *buffer, size_t buffer_size); + #endif /* !KERNEL */ +#endif /* PRIVATE */ + #endif diff --git a/bsd/net/necp_client.c b/bsd/net/necp_client.c new file mode 100644 index 000000000..999fab3ee --- /dev/null +++ b/bsd/net/necp_client.c @@ -0,0 +1,2900 @@ +/* + * Copyright (c) 2015-2016 Apple Inc. All rights reserved. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * NECP Client Architecture + * ------------------------------------------------ + * See for a discussion on NECP database architecture. + * + * Each client of NECP provides a set of parameters for a connection or network state + * evaluation, on which NECP policy evaluation is run. This produces a policy result + * which can be accessed by the originating process, along with events for when policies + * results have changed. 
+ * + * ------------------------------------------------ + * NECP Client FD + * ------------------------------------------------ + * A process opens an NECP file descriptor using necp_open(). This is a very simple + * file descriptor, upon which the process may do the following operations: + * - necp_client_action(...), to add/remove/query clients + * - kqueue, to watch for readable events + * - close(), to close the client session and release all clients + * + * Client objects are allocated structures that hang off of the file descriptor. Each + * client contains: + * - Client ID, a UUID that references the client across the system + * - Parameters, a buffer of TLVs that describe the client's connection parameters, + * such as the remote and local endpoints, interface requirements, etc. + * - Result, a buffer of TLVs containing the current policy evaluation for the client. + * This result will be updated whenever a network change occurs that impacts the + * policy result for that client. + * + * +--------------+ + * | NECP fd | + * +--------------+ + * || + * ================================== + * || || || + * +--------------+ +--------------+ +--------------+ + * | Client ID | | Client ID | | Client ID | + * | ---- | | ---- | | ---- | + * | Parameters | | Parameters | | Parameters | + * | ---- | | ---- | | ---- | + * | Result | | Result | | Result | + * +--------------+ +--------------+ +--------------+ + * + * ------------------------------------------------ + * Client Actions + * ------------------------------------------------ + * - Add. Input parameters as a buffer of TLVs, and output a client ID. Allocates a + * new client structure on the file descriptor. + * - Remove. Input a client ID. Removes a client structure from the file descriptor. + * - Copy Parameters. Input a client ID, and output parameter TLVs. + * - Copy Result. Input a client ID, and output result TLVs. Alternatively, input empty + * client ID and get next unread client result. 
+ * - Copy List. List all client IDs. + * + * ------------------------------------------------ + * Client Policy Evaluation + * ------------------------------------------------ + * Policies are evaluated for clients upon client creation, and upon update events, + * which are network/agent/policy changes coalesced by a timer. + * + * The policy evaluation goes through the following steps: + * 1. Parse client parameters. + * 2. Select a scoped interface if applicable. This involves using require/prohibit + * parameters, along with the local address, to select the most appropriate interface + * if not explicitly set by the client parameters. + * 3. Run NECP application-level policy evaluation. + * 4. Set policy result into client result buffer. + * + * ------------------------------------------------ + * Client Observers + * ------------------------------------------------ + * If necp_open() is called with the NECP_OPEN_FLAG_OBSERVER flag, and the process + * passes the necessary privilege check, the fd is allowed to use necp_client_action() + * to copy client state attached to the file descriptors of other processes, and to + * list all client IDs on the system.
+ */ + +extern u_int32_t necp_debug; + +static int noop_read(struct fileproc *, struct uio *, int, vfs_context_t); +static int noop_write(struct fileproc *, struct uio *, int, vfs_context_t); +static int noop_ioctl(struct fileproc *, unsigned long, caddr_t, + vfs_context_t); +static int necpop_select(struct fileproc *, int, void *, vfs_context_t); +static int necpop_close(struct fileglob *, vfs_context_t); +static int necpop_kqfilter(struct fileproc *, struct knote *, vfs_context_t); + +// Timer functions +static int necp_timeout_microseconds = 1000 * 100; // 100ms +static int necp_timeout_leeway_microseconds = 1000 * 500; // 500ms +extern int tvtohz(struct timeval *); + +// Parsed parameters +#define NECP_PARSED_PARAMETERS_FIELD_LOCAL_ADDR 0x0001 +#define NECP_PARSED_PARAMETERS_FIELD_REMOTE_ADDR 0x0002 +#define NECP_PARSED_PARAMETERS_FIELD_REQUIRED_IF 0x0004 +#define NECP_PARSED_PARAMETERS_FIELD_PROHIBITED_IF 0x0008 +#define NECP_PARSED_PARAMETERS_FIELD_REQUIRED_IFTYPE 0x0010 +#define NECP_PARSED_PARAMETERS_FIELD_PROHIBITED_IFTYPE 0x0020 +#define NECP_PARSED_PARAMETERS_FIELD_REQUIRED_AGENT 0x0040 +#define NECP_PARSED_PARAMETERS_FIELD_PROHIBITED_AGENT 0x0080 +#define NECP_PARSED_PARAMETERS_FIELD_PREFERRED_AGENT 0x0100 +#define NECP_PARSED_PARAMETERS_FIELD_REQUIRED_AGENT_TYPE 0x0200 +#define NECP_PARSED_PARAMETERS_FIELD_PROHIBITED_AGENT_TYPE 0x0400 +#define NECP_PARSED_PARAMETERS_FIELD_PREFERRED_AGENT_TYPE 0x0800 + +#define NECP_MAX_PARSED_PARAMETERS 16 +struct necp_client_parsed_parameters { + u_int32_t valid_fields; + union necp_sockaddr_union local_addr; + union necp_sockaddr_union remote_addr; + u_int32_t required_interface_index; + char prohibited_interfaces[IFXNAMSIZ][NECP_MAX_PARSED_PARAMETERS]; + u_int8_t required_interface_types[NECP_MAX_PARSED_PARAMETERS]; + u_int8_t prohibited_interface_types[NECP_MAX_PARSED_PARAMETERS]; + struct necp_client_parameter_netagent_type required_netagent_types[NECP_MAX_PARSED_PARAMETERS]; + struct 
necp_client_parameter_netagent_type prohibited_netagent_types[NECP_MAX_PARSED_PARAMETERS]; + struct necp_client_parameter_netagent_type preferred_netagent_types[NECP_MAX_PARSED_PARAMETERS]; + uuid_t required_netagents[NECP_MAX_PARSED_PARAMETERS]; + uuid_t prohibited_netagents[NECP_MAX_PARSED_PARAMETERS]; + uuid_t preferred_netagents[NECP_MAX_PARSED_PARAMETERS]; +}; + +static bool necp_find_matching_interface_index(struct necp_client_parsed_parameters *parsed_parameters, u_int *return_ifindex); + +static const struct fileops necp_fd_ops = { + .fo_type = DTYPE_NETPOLICY, + .fo_read = noop_read, + .fo_write = noop_write, + .fo_ioctl = noop_ioctl, + .fo_select = necpop_select, + .fo_close = necpop_close, + .fo_kqfilter = necpop_kqfilter, + .fo_drain = NULL, +}; + +struct necp_client_assertion { + LIST_ENTRY(necp_client_assertion) assertion_chain; + uuid_t asserted_netagent; +}; + +struct necp_client { + LIST_ENTRY(necp_client) chain; + + uuid_t client_id; + bool result_read; + bool assigned_result_read; + + size_t result_length; + u_int8_t result[NECP_MAX_CLIENT_RESULT_SIZE]; + + uuid_t nexus_agent; + size_t assigned_results_length; + u_int8_t *assigned_results; + + LIST_HEAD(_necp_client_assertion_list, necp_client_assertion) assertion_list; + + user_addr_t stats_uaddr; + user_size_t stats_ulen; + nstat_userland_context stats_handler_context; + necp_stats_hdr *stats_area; + + size_t parameters_length; + u_int8_t parameters[0]; +}; + +struct necp_fd_data { + LIST_ENTRY(necp_fd_data) chain; + LIST_HEAD(_clients, necp_client) clients; + int flags; + int proc_pid; + decl_lck_mtx_data(, fd_lock); + struct selinfo si; +}; + +static LIST_HEAD(_necp_fd_list, necp_fd_data) necp_fd_list; + +static lck_grp_attr_t *necp_fd_grp_attr = NULL; +static lck_attr_t *necp_fd_mtx_attr = NULL; +static lck_grp_t *necp_fd_mtx_grp = NULL; +decl_lck_rw_data(static, necp_fd_lock); + +static thread_call_t necp_client_tcall; + +/// NECP file descriptor functions + +static int +noop_read(struct 
fileproc *fp, struct uio *uio, int flags, vfs_context_t ctx) +{ +#pragma unused(fp, uio, flags, ctx) + return (ENXIO); +} + +static int +noop_write(struct fileproc *fp, struct uio *uio, int flags, + vfs_context_t ctx) +{ +#pragma unused(fp, uio, flags, ctx) + return (ENXIO); +} + +static int +noop_ioctl(struct fileproc *fp, unsigned long com, caddr_t data, + vfs_context_t ctx) +{ +#pragma unused(fp, com, data, ctx) + return (ENOTTY); +} + +static void +necp_fd_notify(struct necp_fd_data *fd_data, bool locked) +{ + struct selinfo *si = &fd_data->si; + + if (!locked) { + lck_mtx_lock(&fd_data->fd_lock); + } + + selwakeup(si); + + // use a non-zero hint to tell the notification from the + // call done in kqueue_scan() which uses 0 + KNOTE(&si->si_note, 1); // notification + + if (!locked) { + lck_mtx_unlock(&fd_data->fd_lock); + } +} + +static int +necp_fd_poll(struct necp_fd_data *fd_data, int events, void *wql, struct proc *p, int is_kevent) +{ +#pragma unused(wql, p, is_kevent) + u_int revents = 0; + struct necp_client *client = NULL; + bool has_unread_clients = FALSE; + + u_int want_rx = events & (POLLIN | POLLRDNORM); + if (want_rx) { + + LIST_FOREACH(client, &fd_data->clients, chain) { + if (!client->result_read || !client->assigned_result_read) { + has_unread_clients = TRUE; + break; + } + } + + if (has_unread_clients) { + revents |= want_rx; + } + } + + return (revents); +} + +static int +necpop_select(struct fileproc *fp, int which, void *wql, vfs_context_t ctx) +{ +#pragma unused(fp, which, wql, ctx) + return (0); + struct necp_fd_data *fd_data = NULL; + int revents = 0; + int events = 0; + proc_t procp; + + fd_data = (struct necp_fd_data *)fp->f_fglob->fg_data; + if (fd_data == NULL) { + return (0); + } + + procp = vfs_context_proc(ctx); + + switch (which) { + case FREAD: { + events = POLLIN; + break; + } + + default: { + return (1); + } + } + + lck_mtx_lock(&fd_data->fd_lock); + revents = necp_fd_poll(fd_data, events, wql, procp, 0); + 
lck_mtx_unlock(&fd_data->fd_lock); + + return ((events & revents) ? 1 : 0); +} + +static void +necp_fd_knrdetach(struct knote *kn) +{ + struct necp_fd_data *fd_data = (struct necp_fd_data *)kn->kn_hook; + struct selinfo *si = &fd_data->si; + + lck_mtx_lock(&fd_data->fd_lock); + KNOTE_DETACH(&si->si_note, kn); + lck_mtx_unlock(&fd_data->fd_lock); +} + +static int +necp_fd_knread(struct knote *kn, long hint) +{ +#pragma unused(kn, hint) + return 1; /* assume we are ready */ +} + +static int +necp_fd_knrprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev) +{ +#pragma unused(data) + struct necp_fd_data *fd_data; + int revents; + int res; + + fd_data = (struct necp_fd_data *)kn->kn_hook; + + lck_mtx_lock(&fd_data->fd_lock); + revents = necp_fd_poll(fd_data, POLLIN, NULL, current_proc(), 1); + res = ((revents & POLLIN) != 0); + if (res) { + *kev = kn->kn_kevent; + } + lck_mtx_unlock(&fd_data->fd_lock); + return (res); +} + +static int +necp_fd_knrtouch(struct knote *kn, struct kevent_internal_s *kev) +{ +#pragma unused(kev) + struct necp_fd_data *fd_data; + int revents; + + fd_data = (struct necp_fd_data *)kn->kn_hook; + + lck_mtx_lock(&fd_data->fd_lock); + if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0) + kn->kn_udata = kev->udata; + revents = necp_fd_poll(fd_data, POLLIN, NULL, current_proc(), 1); + lck_mtx_unlock(&fd_data->fd_lock); + + return ((revents & POLLIN) != 0); +} + +struct filterops necp_fd_rfiltops = { + .f_isfd = 1, + .f_detach = necp_fd_knrdetach, + .f_event = necp_fd_knread, + .f_touch = necp_fd_knrtouch, + .f_process = necp_fd_knrprocess, +}; + +static int +necpop_kqfilter(struct fileproc *fp, struct knote *kn, vfs_context_t ctx) +{ +#pragma unused(fp, ctx) + struct necp_fd_data *fd_data = NULL; + int revents; + + if (kn->kn_filter != EVFILT_READ) { + NECPLOG(LOG_ERR, "bad filter request %d", kn->kn_filter); + kn->kn_flags = EV_ERROR; + kn->kn_data = EINVAL; + return (0); + } + + fd_data = (struct necp_fd_data 
*)kn->kn_fp->f_fglob->fg_data; + if (fd_data == NULL) { + NECPLOG0(LOG_ERR, "No channel for kqfilter"); + kn->kn_flags = EV_ERROR; + kn->kn_data = ENOENT; + return (0); + } + + lck_mtx_lock(&fd_data->fd_lock); + kn->kn_filtid = EVFILTID_NECP_FD; + kn->kn_hook = fd_data; + KNOTE_ATTACH(&fd_data->si.si_note, kn); + + revents = necp_fd_poll(fd_data, POLLIN, NULL, current_proc(), 1); + + lck_mtx_unlock(&fd_data->fd_lock); + + return ((revents & POLLIN) != 0); +} + +static void +necp_destroy_client_stats(struct necp_client *client) +{ + if ((client->stats_area != NULL) && + (client->stats_handler_context != NULL) && + (client->stats_uaddr != 0)) { + // Close old stats if required. + int error = copyin(client->stats_uaddr, client->stats_area, client->stats_ulen); + if (error) { + NECPLOG(LOG_ERR, "necp_destroy_client_stats copyin error on close (%d)", error); + // Not much we can for an error on an obsolete address + } + ntstat_userland_stats_close(client->stats_handler_context); + FREE(client->stats_area, M_NECP); + client->stats_area = NULL; + client->stats_handler_context = NULL; + client->stats_uaddr = 0; + client->stats_ulen = 0; + } +} + +static void +necp_destroy_client(struct necp_client *client) +{ + // Remove from list + LIST_REMOVE(client, chain); + + // Remove nexus assignment + if (client->assigned_results != NULL) { + if (!uuid_is_null(client->nexus_agent)) { + int netagent_error = netagent_client_message(client->nexus_agent, client->client_id, + NETAGENT_MESSAGE_TYPE_CLOSE_NEXUS); + if (netagent_error != 0) { + NECPLOG(LOG_ERR, "necp_client_remove close nexus error (%d)", netagent_error); + } + } + FREE(client->assigned_results, M_NETAGENT); + } + + // Remove agent assertions + struct necp_client_assertion *search_assertion = NULL; + struct necp_client_assertion *temp_assertion = NULL; + LIST_FOREACH_SAFE(search_assertion, &client->assertion_list, assertion_chain, temp_assertion) { + int netagent_error = 
netagent_client_message(search_assertion->asserted_netagent, client->client_id, NETAGENT_MESSAGE_TYPE_CLIENT_UNASSERT); + if (netagent_error != 0) { + NECPLOG(LOG_ERR, "necp_client_remove unassert agent error (%d)", netagent_error); + } + LIST_REMOVE(search_assertion, assertion_chain); + FREE(search_assertion, M_NECP); + } + necp_destroy_client_stats(client); + + FREE(client, M_NECP); +} + +static int +necpop_close(struct fileglob *fg, vfs_context_t ctx) +{ +#pragma unused(fg, ctx) + struct necp_fd_data *fd_data = NULL; + int error = 0; + + fd_data = (struct necp_fd_data *)fg->fg_data; + fg->fg_data = NULL; + + if (fd_data != NULL) { + lck_rw_lock_exclusive(&necp_fd_lock); + + lck_mtx_lock(&fd_data->fd_lock); + struct necp_client *client = NULL; + struct necp_client *temp_client = NULL; + LIST_FOREACH_SAFE(client, &fd_data->clients, chain, temp_client) { + necp_destroy_client(client); + } + lck_mtx_unlock(&fd_data->fd_lock); + + selthreadclear(&fd_data->si); + + lck_mtx_destroy(&fd_data->fd_lock, necp_fd_mtx_grp); + + LIST_REMOVE(fd_data, chain); + + lck_rw_done(&necp_fd_lock); + + FREE(fd_data, M_NECP); + fd_data = NULL; + } + + return (error); +} + +/// NECP client utilities + +static int +necp_find_fd_data(int fd, struct necp_fd_data **fd_data) +{ + proc_t p = current_proc(); + struct fileproc *fp = NULL; + int error = 0; + + proc_fdlock_spin(p); + if ((error = fp_lookup(p, fd, &fp, 1)) != 0) { + goto done; + } + if (fp->f_fglob->fg_ops->fo_type != DTYPE_NETPOLICY) { + fp_drop(p, fd, fp, 1); + error = ENODEV; + goto done; + } + *fd_data = (struct necp_fd_data *)fp->f_fglob->fg_data; + +done: + proc_fdunlock(p); + return (error); +} + +static bool +necp_netagent_applies_to_client(__unused struct necp_client *client, struct necp_client_parsed_parameters *parameters, uuid_t netagent_uuid) +{ + bool applies = FALSE; + u_int32_t flags = netagent_get_flags(netagent_uuid); + if (!(flags & NETAGENT_FLAG_REGISTERED)) { + // Unregistered agents never apply + return 
(applies); + } + + if (flags & NETAGENT_FLAG_SPECIFIC_USE_ONLY) { + // Specific use agents only apply when required + bool required = FALSE; + if (parameters != NULL) { + // Check required agent UUIDs + for (int i = 0; i < NECP_MAX_PARSED_PARAMETERS; i++) { + if (uuid_is_null(parameters->required_netagents[i])) { + break; + } + if (uuid_compare(parameters->required_netagents[i], netagent_uuid) == 0) { + required = TRUE; + break; + } + } + + if (!required) { + // Check required agent types + bool fetched_type = FALSE; + char netagent_domain[NETAGENT_DOMAINSIZE]; + char netagent_type[NETAGENT_TYPESIZE]; + memset(&netagent_domain, 0, NETAGENT_DOMAINSIZE); + memset(&netagent_type, 0, NETAGENT_TYPESIZE); + + for (int i = 0; i < NECP_MAX_PARSED_PARAMETERS; i++) { + if (strlen(parameters->required_netagent_types[i].netagent_domain) == 0 || + strlen(parameters->required_netagent_types[i].netagent_type) == 0) { + break; + } + + if (!fetched_type) { + if (netagent_get_agent_domain_and_type(netagent_uuid, netagent_domain, netagent_type)) { + fetched_type = TRUE; + } else { + break; + } + } + + if ((strlen(parameters->required_netagent_types[i].netagent_domain) == 0 || + strncmp(netagent_domain, parameters->required_netagent_types[i].netagent_domain, NETAGENT_DOMAINSIZE) == 0) && + (strlen(parameters->required_netagent_types[i].netagent_type) == 0 || + strncmp(netagent_type, parameters->required_netagent_types[i].netagent_type, NETAGENT_TYPESIZE) == 0)) { + required = TRUE; + break; + } + } + } + } + + applies = required; + } else { + applies = TRUE; + } + + if (applies && + (flags & NETAGENT_FLAG_NEXUS_PROVIDER) && + uuid_is_null(client->nexus_agent)) { + uuid_copy(client->nexus_agent, netagent_uuid); + } + + return (applies); +} + +static int +necp_client_parse_parameters(u_int8_t *parameters, + u_int32_t parameters_size, + struct necp_client_parsed_parameters *parsed_parameters) +{ + int error = 0; + size_t offset = 0; + + u_int32_t num_prohibited_interfaces = 0; + 
u_int32_t num_required_interface_types = 0; + u_int32_t num_prohibited_interface_types = 0; + u_int32_t num_required_agents = 0; + u_int32_t num_prohibited_agents = 0; + u_int32_t num_preferred_agents = 0; + u_int32_t num_required_agent_types = 0; + u_int32_t num_prohibited_agent_types = 0; + u_int32_t num_preferred_agent_types = 0; + + if (parsed_parameters == NULL) { + return (EINVAL); + } + + memset(parsed_parameters, 0, sizeof(struct necp_client_parsed_parameters)); + + while ((offset + sizeof(u_int8_t) + sizeof(u_int32_t)) <= parameters_size) { + u_int8_t type = necp_buffer_get_tlv_type(parameters, offset); + u_int32_t length = necp_buffer_get_tlv_length(parameters, offset); + + if (length > 0 && (offset + sizeof(u_int8_t) + sizeof(u_int32_t) + length) <= parameters_size) { + u_int8_t *value = necp_buffer_get_tlv_value(parameters, offset, NULL); + if (value != NULL) { + switch (type) { + case NECP_CLIENT_PARAMETER_BOUND_INTERFACE: { + if (length <= IFXNAMSIZ && length > 0) { + ifnet_t bound_interface = NULL; + char interface_name[IFXNAMSIZ]; + memcpy(interface_name, value, length); + interface_name[length - 1] = 0; // Make sure the string is NULL terminated + if (ifnet_find_by_name(interface_name, &bound_interface) == 0) { + parsed_parameters->required_interface_index = bound_interface->if_index; + parsed_parameters->valid_fields |= NECP_PARSED_PARAMETERS_FIELD_REQUIRED_IF; + ifnet_release(bound_interface); + } + } + break; + } + case NECP_CLIENT_PARAMETER_LOCAL_ADDRESS: { + if (length >= sizeof(struct necp_policy_condition_addr)) { + struct necp_policy_condition_addr *address_struct = (struct necp_policy_condition_addr *)(void *)value; + if ((address_struct->address.sa.sa_family == AF_INET || + address_struct->address.sa.sa_family == AF_INET6) && + address_struct->address.sa.sa_len <= length) { + memcpy(&parsed_parameters->local_addr, &address_struct->address, sizeof(address_struct->address)); + parsed_parameters->valid_fields |= 
NECP_PARSED_PARAMETERS_FIELD_LOCAL_ADDR; + } + } + break; + } + case NECP_CLIENT_PARAMETER_LOCAL_ENDPOINT: { + if (length >= sizeof(struct necp_client_endpoint)) { + struct necp_client_endpoint *endpoint = (struct necp_client_endpoint *)(void *)value; + if ((endpoint->u.endpoint.endpoint_family == AF_INET || + endpoint->u.endpoint.endpoint_family == AF_INET6) && + endpoint->u.endpoint.endpoint_length <= length) { + memcpy(&parsed_parameters->local_addr, &endpoint->u.sa, sizeof(union necp_sockaddr_union)); + parsed_parameters->valid_fields |= NECP_PARSED_PARAMETERS_FIELD_LOCAL_ADDR; + } + } + break; + } + case NECP_CLIENT_PARAMETER_REMOTE_ADDRESS: { + if (length >= sizeof(struct necp_policy_condition_addr)) { + struct necp_policy_condition_addr *address_struct = (struct necp_policy_condition_addr *)(void *)value; + if ((address_struct->address.sa.sa_family == AF_INET || + address_struct->address.sa.sa_family == AF_INET6) && + address_struct->address.sa.sa_len <= length) { + memcpy(&parsed_parameters->remote_addr, &address_struct->address, sizeof(address_struct->address)); + parsed_parameters->valid_fields |= NECP_PARSED_PARAMETERS_FIELD_REMOTE_ADDR; + } + } + break; + } + case NECP_CLIENT_PARAMETER_REMOTE_ENDPOINT: { + if (length >= sizeof(struct necp_client_endpoint)) { + struct necp_client_endpoint *endpoint = (struct necp_client_endpoint *)(void *)value; + if ((endpoint->u.endpoint.endpoint_family == AF_INET || + endpoint->u.endpoint.endpoint_family == AF_INET6) && + endpoint->u.endpoint.endpoint_length <= length) { + memcpy(&parsed_parameters->remote_addr, &endpoint->u.sa, sizeof(union necp_sockaddr_union)); + parsed_parameters->valid_fields |= NECP_PARSED_PARAMETERS_FIELD_REMOTE_ADDR; + } + } + break; + } + case NECP_CLIENT_PARAMETER_PROHIBIT_INTERFACE: { + if (num_prohibited_interfaces >= NECP_MAX_PARSED_PARAMETERS) { + break; + } + if (length <= IFXNAMSIZ && length > 0) { + memcpy(parsed_parameters->prohibited_interfaces[num_prohibited_interfaces], value, 
length); + parsed_parameters->prohibited_interfaces[num_prohibited_interfaces][length - 1] = 0; // Make sure the string is NULL terminated + num_prohibited_interfaces++; + parsed_parameters->valid_fields |= NECP_PARSED_PARAMETERS_FIELD_PROHIBITED_IF; + } + break; + } + case NECP_CLIENT_PARAMETER_REQUIRE_IF_TYPE: { + if (num_required_interface_types >= NECP_MAX_PARSED_PARAMETERS) { + break; + } + if (length >= sizeof(u_int8_t)) { + memcpy(&parsed_parameters->required_interface_types[num_required_interface_types], value, sizeof(u_int8_t)); + num_required_interface_types++; + parsed_parameters->valid_fields |= NECP_PARSED_PARAMETERS_FIELD_REQUIRED_IFTYPE; + } + break; + } + case NECP_CLIENT_PARAMETER_PROHIBIT_IF_TYPE: { + if (num_prohibited_interface_types >= NECP_MAX_PARSED_PARAMETERS) { + break; + } + if (length >= sizeof(u_int8_t)) { + memcpy(&parsed_parameters->prohibited_interface_types[num_prohibited_interface_types], value, sizeof(u_int8_t)); + num_prohibited_interface_types++; + parsed_parameters->valid_fields |= NECP_PARSED_PARAMETERS_FIELD_PROHIBITED_IFTYPE; + } + break; + } + case NECP_CLIENT_PARAMETER_REQUIRE_AGENT: { + if (num_required_agents >= NECP_MAX_PARSED_PARAMETERS) { + break; + } + if (length >= sizeof(uuid_t)) { + memcpy(&parsed_parameters->required_netagents[num_required_agents], value, sizeof(uuid_t)); + num_required_agents++; + parsed_parameters->valid_fields |= NECP_PARSED_PARAMETERS_FIELD_REQUIRED_AGENT; + } + break; + } + case NECP_CLIENT_PARAMETER_PROHIBIT_AGENT: { + if (num_prohibited_agents >= NECP_MAX_PARSED_PARAMETERS) { + break; + } + if (length >= sizeof(uuid_t)) { + memcpy(&parsed_parameters->prohibited_netagents[num_prohibited_agents], value, sizeof(uuid_t)); + num_prohibited_agents++; + parsed_parameters->valid_fields |= NECP_PARSED_PARAMETERS_FIELD_PROHIBITED_AGENT; + } + break; + } + case NECP_CLIENT_PARAMETER_PREFER_AGENT: { + if (num_preferred_agents >= NECP_MAX_PARSED_PARAMETERS) { + break; + } + if (length >= sizeof(uuid_t)) 
{ + memcpy(&parsed_parameters->preferred_netagents[num_preferred_agents], value, sizeof(uuid_t)); + num_preferred_agents++; + parsed_parameters->valid_fields |= NECP_PARSED_PARAMETERS_FIELD_PREFERRED_AGENT; + } + break; + } + case NECP_CLIENT_PARAMETER_REQUIRE_AGENT_TYPE: { + if (num_required_agent_types >= NECP_MAX_PARSED_PARAMETERS) { + break; + } + if (length >= sizeof(struct necp_client_parameter_netagent_type)) { + memcpy(&parsed_parameters->required_netagent_types[num_required_agent_types], value, sizeof(struct necp_client_parameter_netagent_type)); + num_required_agent_types++; + parsed_parameters->valid_fields |= NECP_PARSED_PARAMETERS_FIELD_REQUIRED_AGENT_TYPE; + } + break; + } + case NECP_CLIENT_PARAMETER_PROHIBIT_AGENT_TYPE: { + if (num_prohibited_agent_types >= NECP_MAX_PARSED_PARAMETERS) { + break; + } + if (length >= sizeof(struct necp_client_parameter_netagent_type)) { + memcpy(&parsed_parameters->prohibited_netagent_types[num_prohibited_agent_types], value, sizeof(struct necp_client_parameter_netagent_type)); + num_prohibited_agent_types++; + parsed_parameters->valid_fields |= NECP_PARSED_PARAMETERS_FIELD_PROHIBITED_AGENT_TYPE; + } + break; + } + case NECP_CLIENT_PARAMETER_PREFER_AGENT_TYPE: { + if (num_preferred_agent_types >= NECP_MAX_PARSED_PARAMETERS) { + break; + } + if (length >= sizeof(struct necp_client_parameter_netagent_type)) { + memcpy(&parsed_parameters->preferred_netagent_types[num_preferred_agent_types], value, sizeof(struct necp_client_parameter_netagent_type)); + num_preferred_agent_types++; + parsed_parameters->valid_fields |= NECP_PARSED_PARAMETERS_FIELD_PREFERRED_AGENT_TYPE; + } + break; + } + default: { + break; + } + } + } + } + + offset += sizeof(u_int8_t) + sizeof(u_int32_t) + length; + } + + return (error); +} + +int +necp_assign_client_result(uuid_t netagent_uuid, uuid_t client_id, + u_int8_t *assigned_results, size_t assigned_results_length) +{ + int error = 0; + struct necp_fd_data *client_fd = NULL; + bool found_client = 
FALSE; + bool client_updated = FALSE; + + lck_rw_lock_shared(&necp_fd_lock); + + LIST_FOREACH(client_fd, &necp_fd_list, chain) { + struct necp_client *client = NULL; + lck_mtx_lock(&client_fd->fd_lock); + LIST_FOREACH(client, &client_fd->clients, chain) { + if (uuid_compare(client->client_id, client_id) == 0) { + // Found the right client! + found_client = TRUE; + + if (uuid_compare(client->nexus_agent, netagent_uuid) == 0) { + // Verify that the client nexus agent matches + if (client->assigned_results != NULL) { + // Release prior result + FREE(client->assigned_results, M_NETAGENT); + } + client->assigned_results = assigned_results; + client->assigned_results_length = assigned_results_length; + client->assigned_result_read = FALSE; + client_updated = TRUE; + } + } + } + if (client_updated) { + necp_fd_notify(client_fd, true); + } + lck_mtx_unlock(&client_fd->fd_lock); + + if (found_client) { + break; + } + } + + lck_rw_done(&necp_fd_lock); + + if (!found_client) { + error = ENOENT; + } else if (!client_updated) { + error = EINVAL; + } + + return (error); +} + +/// Client updating + +static bool +necp_update_client_result(proc_t proc, + struct necp_client *client) +{ + struct necp_client_result_netagent netagent; + struct necp_aggregate_result result; + struct necp_client_parsed_parameters parsed_parameters; + u_int32_t flags = 0; + + uuid_clear(client->nexus_agent); + + int error = necp_client_parse_parameters(client->parameters, (u_int32_t)client->parameters_length, &parsed_parameters); + if (error != 0) { + return (FALSE); + } + + // Check parameters to find best interface + u_int matching_if_index = 0; + if (necp_find_matching_interface_index(&parsed_parameters, &matching_if_index)) { + if (matching_if_index != 0) { + parsed_parameters.required_interface_index = matching_if_index; + } + // Interface found or not needed, match policy. 
+ error = necp_application_find_policy_match_internal(proc, client->parameters, (u_int32_t)client->parameters_length, &result, &flags, matching_if_index); + if (error != 0) { + return (FALSE); + } + } else { + // Interface not found. Clear out the whole result, make everything fail. + memset(&result, 0, sizeof(result)); + } + + // If the original request was scoped, and the policy result matches, make sure the result is scoped + if ((result.routing_result == NECP_KERNEL_POLICY_RESULT_NONE || + result.routing_result == NECP_KERNEL_POLICY_RESULT_PASS) && + result.routed_interface_index != IFSCOPE_NONE && + parsed_parameters.required_interface_index == result.routed_interface_index) { + result.routing_result = NECP_KERNEL_POLICY_RESULT_SOCKET_SCOPED; + result.routing_result_parameter.scoped_interface_index = result.routed_interface_index; + } + + bool updated = FALSE; + u_int8_t *cursor = client->result; + const u_int8_t *max = client->result + NECP_MAX_CLIENT_RESULT_SIZE; + cursor = necp_buffer_write_tlv_if_different(cursor, max, NECP_CLIENT_RESULT_CLIENT_ID, sizeof(uuid_t), client->client_id, &updated); + cursor = necp_buffer_write_tlv_if_different(cursor, max, NECP_CLIENT_RESULT_POLICY_RESULT, sizeof(result.routing_result), &result.routing_result, &updated); + if (result.routing_result_parameter.tunnel_interface_index != 0) { + cursor = necp_buffer_write_tlv_if_different(cursor, max, NECP_CLIENT_RESULT_POLICY_RESULT_PARAMETER, + sizeof(result.routing_result_parameter), &result.routing_result_parameter, &updated); + } + if (result.filter_control_unit != 0) { + cursor = necp_buffer_write_tlv_if_different(cursor, max, NECP_CLIENT_RESULT_FILTER_CONTROL_UNIT, + sizeof(result.filter_control_unit), &result.filter_control_unit, &updated); + } + if (result.routed_interface_index != 0) { + u_int routed_interface_index = result.routed_interface_index; + if (result.routing_result == NECP_KERNEL_POLICY_RESULT_IP_TUNNEL && + parsed_parameters.required_interface_index != 
IFSCOPE_NONE && + parsed_parameters.required_interface_index != result.routed_interface_index) { + routed_interface_index = parsed_parameters.required_interface_index; + } + + cursor = necp_buffer_write_tlv_if_different(cursor, max, NECP_CLIENT_RESULT_INTERFACE_INDEX, + sizeof(routed_interface_index), &routed_interface_index, &updated); + } + if (flags != 0) { + cursor = necp_buffer_write_tlv_if_different(cursor, max, NECP_CLIENT_RESULT_FLAGS, + sizeof(flags), &flags, &updated); + } + for (int i = 0; i < NECP_MAX_NETAGENTS; i++) { + if (uuid_is_null(result.netagents[i])) { + break; + } + uuid_copy(netagent.netagent_uuid, result.netagents[i]); + netagent.generation = netagent_get_generation(netagent.netagent_uuid); + if (necp_netagent_applies_to_client(client, &parsed_parameters, netagent.netagent_uuid)) { + cursor = necp_buffer_write_tlv_if_different(cursor, max, NECP_CLIENT_RESULT_NETAGENT, sizeof(netagent), &netagent, &updated); + } + } + + ifnet_head_lock_shared(); + ifnet_t direct_interface = NULL; + ifnet_t delegate_interface = NULL; + ifnet_t original_scoped_interface = NULL; + + if (result.routed_interface_index != IFSCOPE_NONE && (int)result.routed_interface_index <= if_index) { + direct_interface = ifindex2ifnet[result.routed_interface_index]; + } else if (parsed_parameters.required_interface_index != IFSCOPE_NONE && + (int)parsed_parameters.required_interface_index <= if_index) { + // If the request was scoped, but the route didn't match, still grab the agents + direct_interface = ifindex2ifnet[parsed_parameters.required_interface_index]; + } else if (result.routed_interface_index == IFSCOPE_NONE && + result.routing_result == NECP_KERNEL_POLICY_RESULT_SOCKET_SCOPED && + result.routing_result_parameter.scoped_interface_index != IFSCOPE_NONE) { + direct_interface = ifindex2ifnet[result.routing_result_parameter.scoped_interface_index]; + } + if (direct_interface != NULL) { + delegate_interface = direct_interface->if_delegated.ifp; + } + if 
(result.routing_result == NECP_KERNEL_POLICY_RESULT_IP_TUNNEL && + parsed_parameters.required_interface_index != IFSCOPE_NONE && + parsed_parameters.required_interface_index != result.routing_result_parameter.tunnel_interface_index && + (int)parsed_parameters.required_interface_index <= if_index) { + original_scoped_interface = ifindex2ifnet[parsed_parameters.required_interface_index]; + } + // Add interfaces + if (original_scoped_interface != NULL) { + struct necp_client_result_interface interface_struct; + interface_struct.index = original_scoped_interface->if_index; + interface_struct.generation = ifnet_get_generation(original_scoped_interface); + cursor = necp_buffer_write_tlv_if_different(cursor, max, NECP_CLIENT_RESULT_INTERFACE, sizeof(interface_struct), &interface_struct, &updated); + } + if (direct_interface != NULL) { + struct necp_client_result_interface interface_struct; + interface_struct.index = direct_interface->if_index; + interface_struct.generation = ifnet_get_generation(direct_interface); + cursor = necp_buffer_write_tlv_if_different(cursor, max, NECP_CLIENT_RESULT_INTERFACE, sizeof(interface_struct), &interface_struct, &updated); + } + if (delegate_interface != NULL) { + struct necp_client_result_interface interface_struct; + interface_struct.index = delegate_interface->if_index; + interface_struct.generation = ifnet_get_generation(delegate_interface); + cursor = necp_buffer_write_tlv_if_different(cursor, max, NECP_CLIENT_RESULT_INTERFACE, sizeof(interface_struct), &interface_struct, &updated); + } + // Add agents + if (original_scoped_interface != NULL) { + ifnet_lock_shared(original_scoped_interface); + if (original_scoped_interface->if_agentids != NULL) { + for (u_int32_t i = 0; i < original_scoped_interface->if_agentcount; i++) { + if (uuid_is_null(original_scoped_interface->if_agentids[i])) { + continue; + } + uuid_copy(netagent.netagent_uuid, original_scoped_interface->if_agentids[i]); + netagent.generation = 
netagent_get_generation(netagent.netagent_uuid); + if (necp_netagent_applies_to_client(client, &parsed_parameters, netagent.netagent_uuid)) { + cursor = necp_buffer_write_tlv_if_different(cursor, max, NECP_CLIENT_RESULT_NETAGENT, sizeof(netagent), &netagent, &updated); + } + } + } + ifnet_lock_done(original_scoped_interface); + } + if (direct_interface != NULL) { + ifnet_lock_shared(direct_interface); + if (direct_interface->if_agentids != NULL) { + for (u_int32_t i = 0; i < direct_interface->if_agentcount; i++) { + if (uuid_is_null(direct_interface->if_agentids[i])) { + continue; + } + uuid_copy(netagent.netagent_uuid, direct_interface->if_agentids[i]); + netagent.generation = netagent_get_generation(netagent.netagent_uuid); + if (necp_netagent_applies_to_client(client, &parsed_parameters, netagent.netagent_uuid)) { + cursor = necp_buffer_write_tlv_if_different(cursor, max, NECP_CLIENT_RESULT_NETAGENT, sizeof(netagent), &netagent, &updated); + } + } + } + ifnet_lock_done(direct_interface); + } + if (delegate_interface != NULL) { + ifnet_lock_shared(delegate_interface); + if (delegate_interface->if_agentids != NULL) { + for (u_int32_t i = 0; i < delegate_interface->if_agentcount; i++) { + if (uuid_is_null(delegate_interface->if_agentids[i])) { + continue; + } + uuid_copy(netagent.netagent_uuid, delegate_interface->if_agentids[i]); + netagent.generation = netagent_get_generation(netagent.netagent_uuid); + if (necp_netagent_applies_to_client(client, &parsed_parameters, netagent.netagent_uuid)) { + cursor = necp_buffer_write_tlv_if_different(cursor, max, NECP_CLIENT_RESULT_NETAGENT, sizeof(netagent), &netagent, &updated); + } + } + } + ifnet_lock_done(delegate_interface); + } + ifnet_head_done(); + + size_t new_result_length = (cursor - client->result); + if (new_result_length != client->result_length) { + client->result_length = new_result_length; + updated = TRUE; + } + if (updated) { + client->result_read = FALSE; + } + + return (updated); +} + +static void 
+necp_update_all_clients_callout(__unused thread_call_param_t dummy, + __unused thread_call_param_t arg) +{ +#pragma unused(arg) + struct necp_fd_data *client_fd = NULL; + + lck_rw_lock_shared(&necp_fd_lock); + + LIST_FOREACH(client_fd, &necp_fd_list, chain) { + bool updated_result = FALSE; + struct necp_client *client = NULL; + proc_t proc = proc_find(client_fd->proc_pid); + if (proc == NULL) { + continue; + } + + lck_mtx_lock(&client_fd->fd_lock); + LIST_FOREACH(client, &client_fd->clients, chain) { + if (necp_update_client_result(proc, client)) { + updated_result = TRUE; + } + } + if (updated_result) { + necp_fd_notify(client_fd, true); + } + lck_mtx_unlock(&client_fd->fd_lock); + + proc_rele(proc); + } + + lck_rw_done(&necp_fd_lock); +} + +void +necp_update_all_clients(void) +{ + if (necp_client_tcall == NULL) { + // Don't try to update clients if the module is not initialized + return; + } + + uint64_t deadline = 0; + uint64_t leeway = 0; + clock_interval_to_deadline(necp_timeout_microseconds, NSEC_PER_USEC, &deadline); + clock_interval_to_absolutetime_interval(necp_timeout_leeway_microseconds, NSEC_PER_USEC, &leeway); + + thread_call_enter_delayed_with_leeway(necp_client_tcall, NULL, + deadline, leeway, THREAD_CALL_DELAY_LEEWAY); +} + +static void +necp_client_remove_agent_from_result(struct necp_client *client, uuid_t netagent_uuid) +{ + size_t offset = 0; + + u_int8_t *result_buffer = client->result; + while ((offset + sizeof(u_int8_t) + sizeof(u_int32_t)) <= client->result_length) { + u_int8_t type = necp_buffer_get_tlv_type(result_buffer, offset); + u_int32_t length = necp_buffer_get_tlv_length(result_buffer, offset); + + size_t tlv_total_length = (sizeof(u_int8_t) + sizeof(u_int32_t) + length); + if (type == NECP_CLIENT_RESULT_NETAGENT && + length == sizeof(struct necp_client_result_netagent) && + (offset + tlv_total_length) <= client->result_length) { + struct necp_client_result_netagent *value = ((struct necp_client_result_netagent *)(void *) + 
necp_buffer_get_tlv_value(result_buffer, offset, NULL)); + if (uuid_compare(value->netagent_uuid, netagent_uuid) == 0) { + // Found a netagent to remove + // Shift bytes down to remove the tlv, and adjust total length + // Don't adjust the current offset + memmove(result_buffer + offset, + result_buffer + offset + tlv_total_length, + client->result_length - (offset + tlv_total_length)); + client->result_length -= tlv_total_length; + memset(result_buffer + client->result_length, 0, NECP_MAX_CLIENT_RESULT_SIZE - client->result_length); + continue; + } + } + + offset += tlv_total_length; + } +} + +void +necp_force_update_client(uuid_t client_id, uuid_t remove_netagent_uuid) +{ + struct necp_fd_data *client_fd = NULL; + + lck_rw_lock_shared(&necp_fd_lock); + + LIST_FOREACH(client_fd, &necp_fd_list, chain) { + bool updated_result = FALSE; + struct necp_client *client = NULL; + lck_mtx_lock(&client_fd->fd_lock); + LIST_FOREACH(client, &client_fd->clients, chain) { + if (uuid_compare(client->client_id, client_id) == 0) { + if (!uuid_is_null(remove_netagent_uuid)) { + necp_client_remove_agent_from_result(client, remove_netagent_uuid); + } + client->assigned_result_read = FALSE; + updated_result = TRUE; + // Found the client, break + break; + } + } + if (updated_result) { + necp_fd_notify(client_fd, true); + } + lck_mtx_unlock(&client_fd->fd_lock); + if (updated_result) { + // Found the client, break + break; + } + } + + lck_rw_done(&necp_fd_lock); +} + +/// Interface matching + +#define NECP_PARSED_PARAMETERS_INTERESTING_IFNET_FIELDS (NECP_PARSED_PARAMETERS_FIELD_LOCAL_ADDR | \ + NECP_PARSED_PARAMETERS_FIELD_PROHIBITED_IF | \ + NECP_PARSED_PARAMETERS_FIELD_REQUIRED_IFTYPE | \ + NECP_PARSED_PARAMETERS_FIELD_PROHIBITED_IFTYPE | \ + NECP_PARSED_PARAMETERS_FIELD_REQUIRED_AGENT | \ + NECP_PARSED_PARAMETERS_FIELD_PROHIBITED_AGENT | \ + NECP_PARSED_PARAMETERS_FIELD_PREFERRED_AGENT | \ + NECP_PARSED_PARAMETERS_FIELD_REQUIRED_AGENT_TYPE | \ + 
NECP_PARSED_PARAMETERS_FIELD_PROHIBITED_AGENT_TYPE | \ + NECP_PARSED_PARAMETERS_FIELD_PREFERRED_AGENT_TYPE) + +#define NECP_PARSED_PARAMETERS_SCOPED_IFNET_FIELDS (NECP_PARSED_PARAMETERS_FIELD_LOCAL_ADDR | \ + NECP_PARSED_PARAMETERS_FIELD_REQUIRED_IFTYPE | \ + NECP_PARSED_PARAMETERS_FIELD_REQUIRED_AGENT | \ + NECP_PARSED_PARAMETERS_FIELD_PREFERRED_AGENT | \ + NECP_PARSED_PARAMETERS_FIELD_REQUIRED_AGENT_TYPE | \ + NECP_PARSED_PARAMETERS_FIELD_PREFERRED_AGENT_TYPE) + +#define NECP_PARSED_PARAMETERS_PREFERRED_IFNET_FIELDS (NECP_PARSED_PARAMETERS_FIELD_PREFERRED_AGENT | \ + NECP_PARSED_PARAMETERS_FIELD_PREFERRED_AGENT_TYPE) + +static bool +necp_ifnet_matches_type(struct ifnet *ifp, u_int8_t interface_type, bool check_delegates) +{ + struct ifnet *check_ifp = ifp; + while (check_ifp) { + if (if_functional_type(check_ifp, TRUE) == interface_type) { + return (TRUE); + } + if (!check_delegates) { + break; + } + check_ifp = check_ifp->if_delegated.ifp; + + } + return (FALSE); +} + +static bool +necp_ifnet_matches_name(struct ifnet *ifp, const char *interface_name, bool check_delegates) +{ + struct ifnet *check_ifp = ifp; + while (check_ifp) { + if (strncmp(check_ifp->if_xname, interface_name, IFXNAMSIZ) == 0) { + return (TRUE); + } + if (!check_delegates) { + break; + } + check_ifp = check_ifp->if_delegated.ifp; + } + return (FALSE); +} + +static bool +necp_ifnet_matches_agent(struct ifnet *ifp, uuid_t *agent_uuid, bool check_delegates) +{ + struct ifnet *check_ifp = ifp; + + while (check_ifp != NULL) { + ifnet_lock_shared(check_ifp); + if (check_ifp->if_agentids != NULL) { + for (u_int32_t index = 0; index < check_ifp->if_agentcount; index++) { + if (uuid_compare(check_ifp->if_agentids[index], *agent_uuid) == 0) { + ifnet_lock_done(check_ifp); + return (TRUE); + } + } + } + ifnet_lock_done(check_ifp); + + if (!check_delegates) { + break; + } + check_ifp = check_ifp->if_delegated.ifp; + } + return (FALSE); +} + +static bool +necp_necp_ifnet_matches_agent_type(struct ifnet 
*ifp, const char *agent_domain, const char *agent_type, bool check_delegates) +{ + struct ifnet *check_ifp = ifp; + + while (check_ifp != NULL) { + ifnet_lock_shared(check_ifp); + if (check_ifp->if_agentids != NULL) { + for (u_int32_t index = 0; index < check_ifp->if_agentcount; index++) { + if (uuid_is_null(check_ifp->if_agentids[index])) { + continue; + } + + char if_agent_domain[NETAGENT_DOMAINSIZE] = { 0 }; + char if_agent_type[NETAGENT_TYPESIZE] = { 0 }; + + if (netagent_get_agent_domain_and_type(check_ifp->if_agentids[index], if_agent_domain, if_agent_type)) { + if ((strlen(agent_domain) == 0 || + strncmp(if_agent_domain, agent_domain, NETAGENT_DOMAINSIZE) == 0) && + (strlen(agent_type) == 0 || + strncmp(if_agent_type, agent_type, NETAGENT_TYPESIZE) == 0)) { + ifnet_lock_done(check_ifp); + return (TRUE); + } + } + } + } + ifnet_lock_done(check_ifp); + + if (!check_delegates) { + break; + } + check_ifp = check_ifp->if_delegated.ifp; + } + return (FALSE); +} + +static bool +necp_ifnet_matches_local_address(struct ifnet *ifp, struct sockaddr *sa) +{ + struct ifaddr *ifa = NULL; + bool matched_local_address = FALSE; + + // Transform sa into the ifaddr form + // IPv6 Scope IDs are always embedded in the ifaddr list + struct sockaddr_storage address; + u_int ifscope = IFSCOPE_NONE; + (void)sa_copy(sa, &address, &ifscope); + SIN(&address)->sin_port = 0; + if (address.ss_family == AF_INET6) { + SIN6(&address)->sin6_scope_id = 0; + } + + ifa = ifa_ifwithaddr_scoped_locked((struct sockaddr *)&address, ifp->if_index); + matched_local_address = (ifa != NULL); + + if (ifa) { + ifaddr_release(ifa); + } + + return (matched_local_address); +} + +static bool +necp_ifnet_matches_parameters(struct ifnet *ifp, + struct necp_client_parsed_parameters *parsed_parameters, + u_int32_t *preferred_count) +{ + if (preferred_count) { + *preferred_count = 0; + } + + if (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_LOCAL_ADDR) { + if 
(!necp_ifnet_matches_local_address(ifp, &parsed_parameters->local_addr.sa)) { + return (FALSE); + } + } + + if (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_REQUIRED_IFTYPE) { + for (int i = 0; i < NECP_MAX_PARSED_PARAMETERS; i++) { + if (parsed_parameters->required_interface_types[i] == 0) { + break; + } + + if (!necp_ifnet_matches_type(ifp, parsed_parameters->required_interface_types[i], FALSE)) { + return (FALSE); + } + } + } + + if (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_PROHIBITED_IFTYPE) { + for (int i = 0; i < NECP_MAX_PARSED_PARAMETERS; i++) { + if (parsed_parameters->prohibited_interface_types[i] == 0) { + break; + } + + if (necp_ifnet_matches_type(ifp, parsed_parameters->prohibited_interface_types[i], TRUE)) { + return (FALSE); + } + } + } + + if (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_PROHIBITED_IF) { + for (int i = 0; i < NECP_MAX_PARSED_PARAMETERS; i++) { + if (strlen(parsed_parameters->prohibited_interfaces[i]) == 0) { + break; + } + + if (necp_ifnet_matches_name(ifp, parsed_parameters->prohibited_interfaces[i], TRUE)) { + return (FALSE); + } + } + } + + if (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_REQUIRED_AGENT) { + for (int i = 0; i < NECP_MAX_PARSED_PARAMETERS; i++) { + if (uuid_is_null(parsed_parameters->required_netagents[i])) { + break; + } + + if (!necp_ifnet_matches_agent(ifp, &parsed_parameters->required_netagents[i], FALSE)) { + return (FALSE); + } + } + } + + if (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_PROHIBITED_AGENT) { + for (int i = 0; i < NECP_MAX_PARSED_PARAMETERS; i++) { + if (uuid_is_null(parsed_parameters->prohibited_netagents[i])) { + break; + } + + if (necp_ifnet_matches_agent(ifp, &parsed_parameters->prohibited_netagents[i], TRUE)) { + return (FALSE); + } + } + } + + if (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_REQUIRED_AGENT_TYPE) { + for (int i = 0; i < NECP_MAX_PARSED_PARAMETERS; i++) { + if 
(strlen(parsed_parameters->required_netagent_types[i].netagent_domain) == 0 && + strlen(parsed_parameters->required_netagent_types[i].netagent_type) == 0) { + break; + } + + if (!necp_necp_ifnet_matches_agent_type(ifp, parsed_parameters->required_netagent_types[i].netagent_domain, parsed_parameters->required_netagent_types[i].netagent_type, FALSE)) { + return (FALSE); + } + } + } + + if (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_PROHIBITED_AGENT_TYPE) { + for (int i = 0; i < NECP_MAX_PARSED_PARAMETERS; i++) { + if (strlen(parsed_parameters->prohibited_netagent_types[i].netagent_domain) == 0 && + strlen(parsed_parameters->prohibited_netagent_types[i].netagent_type) == 0) { + break; + } + + if (necp_necp_ifnet_matches_agent_type(ifp, parsed_parameters->prohibited_netagent_types[i].netagent_domain, parsed_parameters->prohibited_netagent_types[i].netagent_type, TRUE)) { + return (FALSE); + } + } + } + + // Checked preferred properties + if (preferred_count) { + if (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_PREFERRED_AGENT) { + for (int i = 0; i < NECP_MAX_PARSED_PARAMETERS; i++) { + if (uuid_is_null(parsed_parameters->preferred_netagents[i])) { + break; + } + + if (necp_ifnet_matches_agent(ifp, &parsed_parameters->preferred_netagents[i], TRUE)) { + (*preferred_count)++; + } + } + } + + if (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_PREFERRED_AGENT_TYPE) { + for (int i = 0; i < NECP_MAX_PARSED_PARAMETERS; i++) { + if (strlen(parsed_parameters->preferred_netagent_types[i].netagent_domain) == 0 && + strlen(parsed_parameters->preferred_netagent_types[i].netagent_type) == 0) { + break; + } + + if (necp_necp_ifnet_matches_agent_type(ifp, parsed_parameters->preferred_netagent_types[i].netagent_domain, parsed_parameters->preferred_netagent_types[i].netagent_type, TRUE)) { + (*preferred_count)++; + } + } + } + } + + return (TRUE); +} + +static bool +necp_find_matching_interface_index(struct 
necp_client_parsed_parameters *parsed_parameters, u_int *return_ifindex) +{ + struct ifnet *ifp = NULL; + u_int32_t best_preferred_count = 0; + bool has_preferred_fields = FALSE; + *return_ifindex = 0; + + if (parsed_parameters->required_interface_index != 0) { + *return_ifindex = parsed_parameters->required_interface_index; + return (TRUE); + } + + if (!(parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_INTERESTING_IFNET_FIELDS)) { + return (TRUE); + } + + has_preferred_fields = (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_PREFERRED_IFNET_FIELDS); + + // We have interesting parameters to parse and find a matching interface + ifnet_head_lock_shared(); + + if (!(parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_SCOPED_IFNET_FIELDS)) { + // We do have fields to match, but they are only prohibitory + // If the first interface in the list matches, we don't need to scope + ifp = TAILQ_FIRST(&ifnet_ordered_head); + if (ifp && necp_ifnet_matches_parameters(ifp, parsed_parameters, NULL)) { + // Don't set return_ifindex, so the client doesn't need to scope + ifnet_head_done(); + return (TRUE); + } + } + + // First check the ordered interface list + TAILQ_FOREACH(ifp, &ifnet_ordered_head, if_ordered_link) { + u_int32_t preferred_count = 0; + if (necp_ifnet_matches_parameters(ifp, parsed_parameters, &preferred_count)) { + if (preferred_count > best_preferred_count || + *return_ifindex == 0) { + + // Everything matched, and is most preferred. Return this interface. 
+ *return_ifindex = ifp->if_index; + best_preferred_count = preferred_count; + + if (!has_preferred_fields) { + break; + } + } + } + } + + // Then check the remaining interfaces + if ((parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_SCOPED_IFNET_FIELDS) && + !(parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_REQUIRED_IFTYPE) && + *return_ifindex == 0) { + TAILQ_FOREACH(ifp, &ifnet_head, if_link) { + u_int32_t preferred_count = 0; + if (ifp->if_ordered_link.tqe_next != NULL || + ifp->if_ordered_link.tqe_prev != NULL) { + // This interface was in the ordered list, skip + continue; + } + if (necp_ifnet_matches_parameters(ifp, parsed_parameters, &preferred_count)) { + if (preferred_count > best_preferred_count || + *return_ifindex == 0) { + + // Everything matched, and is most preferred. Return this interface. + *return_ifindex = ifp->if_index; + best_preferred_count = preferred_count; + + if (!has_preferred_fields) { + break; + } + } + } + } + } + + ifnet_head_done(); + + if ((parsed_parameters->valid_fields == (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_PREFERRED_IFNET_FIELDS)) && + best_preferred_count == 0) { + // If only has preferred fields, and nothing was found, clear the interface index and return TRUE + *return_ifindex = 0; + return (TRUE); + } + + return (*return_ifindex != 0); +} + +static void +necp_find_netstat_data(struct necp_client *client, union necp_sockaddr_union *local, union necp_sockaddr_union *remote, u_int32_t *ifindex, uuid_t euuid, u_int32_t *traffic_class) +{ + size_t offset = 0; + u_int8_t *parameters; + u_int32_t parameters_size; + + parameters = client->parameters; + parameters_size = (u_int32_t)client->parameters_length; + + while ((offset + sizeof(u_int8_t) + sizeof(u_int32_t)) <= parameters_size) { + u_int8_t type = necp_buffer_get_tlv_type(parameters, offset); + u_int32_t length = necp_buffer_get_tlv_length(parameters, offset); + + if (length > 0 && (offset + sizeof(u_int8_t) + sizeof(u_int32_t) + 
length) <= parameters_size) { + u_int8_t *value = necp_buffer_get_tlv_value(parameters, offset, NULL); + if (value != NULL) { + switch (type) { + case NECP_CLIENT_PARAMETER_REAL_APPLICATION: { + if (length >= sizeof(uuid_t)) { + uuid_copy(euuid, value); + } + break; + } + case NECP_CLIENT_PARAMETER_TRAFFIC_CLASS: { + if (length >= sizeof(u_int32_t)) { + memcpy(traffic_class, value, sizeof(u_int32_t)); + } + break; + } + case NECP_CLIENT_PARAMETER_BOUND_INTERFACE: { + if (length <= IFXNAMSIZ && length > 0) { + ifnet_t bound_interface = NULL; + char interface_name[IFXNAMSIZ]; + memcpy(interface_name, value, length); + interface_name[length - 1] = 0; // Make sure the string is NULL terminated + if (ifnet_find_by_name(interface_name, &bound_interface) == 0) { + *ifindex = bound_interface->if_index; + ifnet_release(bound_interface); + } + } + break; + } + case NECP_CLIENT_PARAMETER_LOCAL_ADDRESS: { + if (length >= sizeof(struct necp_policy_condition_addr)) { + struct necp_policy_condition_addr *address_struct = (struct necp_policy_condition_addr *)(void *)value; + memcpy(local, &address_struct->address, sizeof(address_struct->address)); + } + break; + } + case NECP_CLIENT_PARAMETER_REMOTE_ADDRESS: { + if (length >= sizeof(struct necp_policy_condition_addr)) { + struct necp_policy_condition_addr *address_struct = (struct necp_policy_condition_addr *)(void *)value; + memcpy(remote, &address_struct->address, sizeof(address_struct->address)); + } + break; + } + default: { + break; + } + } + } + } + offset += sizeof(u_int8_t) + sizeof(u_int32_t) + length; + } +} + +static void +necp_fillout_current_process_details(u_int32_t *pid, u_int64_t *upid, unsigned char *uuid, char *pname, size_t len) +{ + *pid = proc_selfpid(); + *upid = proc_uniqueid(current_proc()); + proc_selfname(pname, (int) len); + proc_getexecutableuuid(current_proc(), uuid, sizeof(uuid_t)); +} + +// Called from NetworkStatistics when it wishes to collect latest information for a TCP flow. 
+// It is a responsibility of NetworkStatistics to have previously zeroed any supplied memory. +static bool +necp_request_tcp_netstats(userland_stats_provider_context *ctx, + nstat_counts *countsp, + void *metadatap) +{ + if (ctx == NULL) { + return false; + } + + struct necp_client *client = (struct necp_client *)ctx; + struct necp_tcp_stats *tcpstats = (struct necp_tcp_stats *)client->stats_area; + if (tcpstats == NULL) { + return false; + } + + if (countsp) { + *countsp = *((struct nstat_counts *)&tcpstats->necp_tcp_counts); + } + + if (metadatap) { + nstat_tcp_descriptor *desc = (nstat_tcp_descriptor *)metadatap; + + // Metadata for the process + necp_fillout_current_process_details(&desc->pid, &desc->upid, desc->uuid, desc->pname, sizeof(desc->pname)); + + // Metadata that the necp client should have in TLV format. + necp_find_netstat_data(client, (union necp_sockaddr_union *)&desc->local, (union necp_sockaddr_union *)&desc->remote, &desc->ifindex, desc->euuid, &desc->traffic_class); + + // Basic metadata + desc->rcvbufsize = tcpstats->necp_tcp_basic.rcvbufsize; + desc->rcvbufused = tcpstats->necp_tcp_basic.rcvbufused; + desc->eupid = tcpstats->necp_tcp_basic.eupid; + desc->epid = tcpstats->necp_tcp_basic.epid; + memcpy(desc->vuuid, tcpstats->necp_tcp_basic.vuuid, sizeof(desc->vuuid)); + desc->ifnet_properties = tcpstats->necp_tcp_basic.ifnet_properties; + + // Additional TCP specific data + desc->sndbufsize = tcpstats->necp_tcp_extra.sndbufsize; + desc->sndbufused = tcpstats->necp_tcp_extra.sndbufused; + desc->txunacked = tcpstats->necp_tcp_extra.txunacked; + desc->txwindow = tcpstats->necp_tcp_extra.txwindow; + desc->txcwindow = tcpstats->necp_tcp_extra.txcwindow; + desc->traffic_mgt_flags = tcpstats->necp_tcp_extra.traffic_mgt_flags; + + if (tcpstats->necp_tcp_extra.cc_alg_index < TCP_CC_ALGO_COUNT) { + strlcpy(desc->cc_algo, tcp_cc_algo_list[tcpstats->necp_tcp_extra.cc_alg_index]->name, sizeof(desc->cc_algo)); + } else { + strlcpy(desc->cc_algo, "unknown", 
sizeof(desc->cc_algo)); + } + + desc->connstatus.write_probe_failed = tcpstats->necp_tcp_extra.probestatus.write_probe_failed; + desc->connstatus.read_probe_failed = tcpstats->necp_tcp_extra.probestatus.read_probe_failed; + desc->connstatus.conn_probe_failed = tcpstats->necp_tcp_extra.probestatus.conn_probe_failed; + } + return true; +} + +// Called from NetworkStatistics when it wishes to collect latest information for a UDP flow. +static bool +necp_request_udp_netstats(userland_stats_provider_context *ctx, + nstat_counts *countsp, + void *metadatap) +{ + if (ctx == NULL) { + return false; + } + + struct necp_client *client = (struct necp_client *)ctx; + struct necp_udp_stats *udpstats = (struct necp_udp_stats *)client->stats_area; + if (udpstats == NULL) { + return false; + } + + if (countsp) { + *countsp = *((struct nstat_counts *)&udpstats->necp_udp_counts); + } + + if (metadatap) { + nstat_udp_descriptor *desc = (nstat_udp_descriptor *)metadatap; + + // Metadata for the process + necp_fillout_current_process_details(&desc->pid, &desc->upid, desc->uuid, desc->pname, sizeof(desc->pname)); + + // Metadata that the necp client should have in TLV format. 
+ necp_find_netstat_data(client, (union necp_sockaddr_union *)&desc->local, (union necp_sockaddr_union *)&desc->remote, &desc->ifindex, desc->euuid, &desc->traffic_class); + + // Basic metadata is all that is required for UDP + desc->rcvbufsize = udpstats->necp_udp_basic.rcvbufsize; + desc->rcvbufused = udpstats->necp_udp_basic.rcvbufused; + desc->eupid = udpstats->necp_udp_basic.eupid; + desc->epid = udpstats->necp_udp_basic.epid; + memcpy(desc->vuuid, udpstats->necp_udp_basic.vuuid, sizeof(desc->euuid)); + desc->ifnet_properties = udpstats->necp_udp_basic.ifnet_properties; + } + return true; +} + +static int +necp_skywalk_priv_check_cred(proc_t p, kauth_cred_t cred) +{ +#pragma unused(p, cred) + return (0); +} + +/// System calls + +int +necp_open(struct proc *p, struct necp_open_args *uap, int *retval) +{ +#pragma unused(retval) + int error = 0; + struct necp_fd_data *fd_data = NULL; + struct fileproc *fp = NULL; + int fd = -1; + + if (uap->flags & NECP_OPEN_FLAG_OBSERVER) { + if (necp_skywalk_priv_check_cred(p, kauth_cred_get()) != 0 && + priv_check_cred(kauth_cred_get(), PRIV_NET_PRIVILEGED_NETWORK_STATISTICS, 0) != 0) { + NECPLOG0(LOG_ERR, "Client does not hold necessary entitlement to observe other NECP clients"); + error = EACCES; + goto done; + } + } + + error = falloc(p, &fp, &fd, vfs_context_current()); + if (error != 0) { + goto done; + } + + if ((fd_data = _MALLOC(sizeof(struct necp_fd_data), M_NECP, + M_WAITOK | M_ZERO)) == NULL) { + error = ENOMEM; + goto done; + } + + fd_data->flags = uap->flags; + LIST_INIT(&fd_data->clients); + lck_mtx_init(&fd_data->fd_lock, necp_fd_mtx_grp, necp_fd_mtx_attr); + klist_init(&fd_data->si.si_note); + fd_data->proc_pid = proc_pid(p); + + fp->f_fglob->fg_flag = FREAD; + fp->f_fglob->fg_ops = &necp_fd_ops; + fp->f_fglob->fg_data = fd_data; + + proc_fdlock(p); + + *fdflags(p, fd) |= (UF_EXCLOSE | UF_FORKCLOSE); + procfdtbl_releasefd(p, fd, NULL); + fp_drop(p, fd, fp, 1); + proc_fdunlock(p); + + *retval = fd; + + 
lck_rw_lock_exclusive(&necp_fd_lock); + LIST_INSERT_HEAD(&necp_fd_list, fd_data, chain); + lck_rw_done(&necp_fd_lock); + +done: + if (error != 0) { + if (fp != NULL) { + fp_free(p, fd, fp); + fp = NULL; + } + if (fd_data != NULL) { + FREE(fd_data, M_NECP); + fd_data = NULL; + } + } + + return (error); +} + +static int +necp_client_add(struct necp_fd_data *fd_data, struct necp_client_action_args *uap, int *retval) +{ + int error = 0; + struct necp_client *client = NULL; + + if (uap->client_id == 0 || uap->client_id_len != sizeof(uuid_t) || + uap->buffer_size == 0 || uap->buffer_size > NECP_MAX_CLIENT_PARAMETERS_SIZE || uap->buffer == 0) { + error = EINVAL; + goto done; + } + + if ((client = _MALLOC(sizeof(struct necp_client) + uap->buffer_size, M_NECP, + M_WAITOK | M_ZERO)) == NULL) { + error = ENOMEM; + goto done; + } + + error = copyin(uap->buffer, client->parameters, uap->buffer_size); + if (error) { + NECPLOG(LOG_ERR, "necp_client_add parameters copyin error (%d)", error); + goto done; + } + + client->parameters_length = uap->buffer_size; + + uuid_generate_random(client->client_id); + LIST_INIT(&client->assertion_list); + + error = copyout(client->client_id, uap->client_id, sizeof(uuid_t)); + if (error) { + NECPLOG(LOG_ERR, "necp_client_add client_id copyout error (%d)", error); + goto done; + } + + lck_mtx_lock(&fd_data->fd_lock); + LIST_INSERT_HEAD(&fd_data->clients, client, chain); + + // Prime the client result + (void)necp_update_client_result(current_proc(), client); + lck_mtx_unlock(&fd_data->fd_lock); +done: + if (error != 0) { + if (client != NULL) { + FREE(client, M_NECP); + client = NULL; + } + } + *retval = error; + + return (error); +} + +static int +necp_client_remove(struct necp_fd_data *fd_data, struct necp_client_action_args *uap, int *retval) +{ + int error = 0; + struct necp_client *client = NULL; + struct necp_client *temp_client = NULL; + uuid_t client_id; + + if (uap->client_id == 0 || uap->client_id_len != sizeof(uuid_t)) { + error = 
EINVAL; + goto done; + } + + error = copyin(uap->client_id, client_id, sizeof(uuid_t)); + if (error) { + NECPLOG(LOG_ERR, "necp_client_remove copyin client_id error (%d)", error); + goto done; + } + + lck_mtx_lock(&fd_data->fd_lock); + LIST_FOREACH_SAFE(client, &fd_data->clients, chain, temp_client) { + if (uuid_compare(client->client_id, client_id) == 0) { + necp_destroy_client(client); + } + } + lck_mtx_unlock(&fd_data->fd_lock); +done: + *retval = error; + + return (error); +} + +static int +necp_client_copy_internal(struct necp_client *client, bool client_is_observed, struct necp_client_action_args *uap, int *retval) +{ + int error = 0; + // Copy results out + if (uap->action == NECP_CLIENT_ACTION_COPY_PARAMETERS) { + if (uap->buffer_size < client->parameters_length) { + error = EINVAL; + goto done; + } + error = copyout(client->parameters, uap->buffer, client->parameters_length); + if (error) { + NECPLOG(LOG_ERR, "necp_client_copy parameters copyout error (%d)", error); + goto done; + } + *retval = client->parameters_length; + } else if (uap->action == NECP_CLIENT_ACTION_COPY_RESULT) { + if (uap->buffer_size < (client->result_length + client->assigned_results_length)) { + error = EINVAL; + goto done; + } + error = copyout(client->result, uap->buffer, client->result_length); + if (error) { + NECPLOG(LOG_ERR, "necp_client_copy result copyout error (%d)", error); + goto done; + } + if (client->assigned_results_length && client->assigned_results) { + error = copyout(client->assigned_results, uap->buffer + client->result_length, client->assigned_results_length); + if (error) { + NECPLOG(LOG_ERR, "necp_client_copy assigned results copyout error (%d)", error); + goto done; + } + *retval = client->result_length + client->assigned_results_length; + } else { + *retval = client->result_length; + } + + if (!client_is_observed) { + client->result_read = TRUE; + client->assigned_result_read = TRUE; + } + } + +done: + return (error); +} + +static int +necp_client_copy(struct 
necp_fd_data *fd_data, struct necp_client_action_args *uap, int *retval) +{ + int error = 0; + struct necp_client *find_client = NULL; + struct necp_client *client = NULL; + uuid_t client_id; + uuid_clear(client_id); + + *retval = 0; + + if (uap->buffer_size == 0 || uap->buffer == 0) { + error = EINVAL; + goto done; + } + + if (uap->action != NECP_CLIENT_ACTION_COPY_PARAMETERS && + uap->action != NECP_CLIENT_ACTION_COPY_RESULT) { + error = EINVAL; + goto done; + } + + if (uap->client_id) { + if (uap->client_id_len != sizeof(uuid_t)) { + NECPLOG(LOG_ERR, "Incorrect length (got %d, expected %d)", uap->client_id_len, sizeof(uuid_t)); + error = ERANGE; + goto done; + } + + error = copyin(uap->client_id, client_id, sizeof(uuid_t)); + if (error) { + NECPLOG(LOG_ERR, "necp_client_copy client_id copyin error (%d)", error); + goto done; + } + } + + lck_mtx_lock(&fd_data->fd_lock); + LIST_FOREACH(find_client, &fd_data->clients, chain) { + if (uap->action == NECP_CLIENT_ACTION_COPY_RESULT && + uuid_is_null(client_id)) { + if (!find_client->result_read || !find_client->assigned_result_read) { + client = find_client; + break; + } + } else if (uuid_compare(find_client->client_id, client_id) == 0) { + client = find_client; + break; + } + } + + if (client != NULL) { + error = necp_client_copy_internal(client, FALSE, uap, retval); + } + + // Unlock our own client before moving on or returning + lck_mtx_unlock(&fd_data->fd_lock); + + if (client == NULL) { + if (fd_data->flags & NECP_OPEN_FLAG_OBSERVER) { + // Observers are allowed to lookup clients on other fds + + // Lock list + lck_rw_lock_shared(&necp_fd_lock); + struct necp_fd_data *client_fd = NULL; + LIST_FOREACH(client_fd, &necp_fd_list, chain) { + // Lock client + lck_mtx_lock(&client_fd->fd_lock); + find_client = NULL; + LIST_FOREACH(find_client, &client_fd->clients, chain) { + if (uuid_compare(find_client->client_id, client_id) == 0) { + client = find_client; + break; + } + } + + if (client != NULL) { + // Matched, copy 
out data + error = necp_client_copy_internal(client, TRUE, uap, retval); + } + + // Unlock client + lck_mtx_unlock(&client_fd->fd_lock); + + if (client != NULL) { + break; + } + } + + // Unlock list + lck_rw_done(&necp_fd_lock); + + // No client found, fail + if (client == NULL) { + error = ENOENT; + goto done; + } + } else { + // No client found, and not allowed to search other fds, fail + error = ENOENT; + goto done; + } + } + +done: + return (error); +} + +static int +necp_client_list(struct necp_fd_data *fd_data, struct necp_client_action_args *uap, int *retval) +{ + int error = 0; + struct necp_client *find_client = NULL; + uuid_t *list = NULL; + u_int32_t requested_client_count = 0; + u_int32_t client_count = 0; + + if (uap->buffer_size < sizeof(requested_client_count) || uap->buffer == 0) { + error = EINVAL; + goto done; + } + + if (!(fd_data->flags & NECP_OPEN_FLAG_OBSERVER)) { + NECPLOG0(LOG_ERR, "Client does not hold necessary entitlement to list other NECP clients"); + error = EACCES; + goto done; + } + + error = copyin(uap->buffer, &requested_client_count, sizeof(requested_client_count)); + if (error) { + goto done; + } + + if (uap->buffer_size != (sizeof(requested_client_count) + requested_client_count * sizeof(uuid_t))) { + error = EINVAL; + goto done; + } + + if (requested_client_count > 0) { + if ((list = _MALLOC(requested_client_count * sizeof(uuid_t), M_NECP, M_WAITOK | M_ZERO)) == NULL) { + error = ENOMEM; + goto done; + } + } + + // Lock list + lck_rw_lock_shared(&necp_fd_lock); + struct necp_fd_data *client_fd = NULL; + LIST_FOREACH(client_fd, &necp_fd_list, chain) { + // Lock client + lck_mtx_lock(&client_fd->fd_lock); + find_client = NULL; + LIST_FOREACH(find_client, &client_fd->clients, chain) { + if (!uuid_is_null(find_client->client_id)) { + if (client_count < requested_client_count) { + uuid_copy(list[client_count], find_client->client_id); + } + client_count++; + } + } + lck_mtx_unlock(&client_fd->fd_lock); + } + + // Unlock list + 
lck_rw_done(&necp_fd_lock); + + error = copyout(&client_count, uap->buffer, sizeof(client_count)); + if (error) { + NECPLOG(LOG_ERR, "necp_client_list buffer copyout error (%d)", error); + goto done; + } + + if (requested_client_count > 0 && + client_count > 0 && + list != NULL) { + error = copyout(list, uap->buffer + sizeof(client_count), requested_client_count * sizeof(uuid_t)); + if (error) { + NECPLOG(LOG_ERR, "necp_client_list client count copyout error (%d)", error); + goto done; + } + } +done: + if (list != NULL) { + FREE(list, M_NECP); + } + *retval = error; + + return (error); +} + +static int +necp_client_request_nexus(struct necp_fd_data *fd_data, struct necp_client_action_args *uap, int *retval) +{ + int error = 0; + struct necp_client *client = NULL; + uuid_t client_id; + bool requested_nexus = FALSE; + + if (uap->client_id == 0 || uap->client_id_len != sizeof(uuid_t)) { + error = EINVAL; + goto done; + } + + error = copyin(uap->client_id, client_id, sizeof(uuid_t)); + if (error) { + NECPLOG(LOG_ERR, "necp_client_request_nexus copyin client_id error (%d)", error); + goto done; + } + + lck_mtx_lock(&fd_data->fd_lock); + LIST_FOREACH(client, &fd_data->clients, chain) { + if (uuid_compare(client->client_id, client_id) == 0) { + // Request from nexus agent + if (!uuid_is_null(client->nexus_agent)) { + error = netagent_client_message(client->nexus_agent, client->client_id, + NETAGENT_MESSAGE_TYPE_REQUEST_NEXUS); + if (error == 0) { + requested_nexus = TRUE; + } + } + break; + } + } + lck_mtx_unlock(&fd_data->fd_lock); + + if (!requested_nexus && + error == 0) { + error = ENOENT; + } +done: + *retval = error; + + return (error); +} + +static void +necp_client_add_assertion(struct necp_client *client, uuid_t netagent_uuid) +{ + struct necp_client_assertion *new_assertion = NULL; + + MALLOC(new_assertion, struct necp_client_assertion *, sizeof(*new_assertion), M_NECP, M_WAITOK); + if (new_assertion == NULL) { + NECPLOG0(LOG_ERR, "Failed to allocate 
assertion"); + return; + } + + uuid_copy(new_assertion->asserted_netagent, netagent_uuid); + + LIST_INSERT_HEAD(&client->assertion_list, new_assertion, assertion_chain); +} + +static bool +necp_client_remove_assertion(struct necp_client *client, uuid_t netagent_uuid) +{ + struct necp_client_assertion *found_assertion = NULL; + struct necp_client_assertion *search_assertion = NULL; + LIST_FOREACH(search_assertion, &client->assertion_list, assertion_chain) { + if (uuid_compare(search_assertion->asserted_netagent, netagent_uuid) == 0) { + found_assertion = search_assertion; + break; + } + } + + if (found_assertion == NULL) { + NECPLOG0(LOG_ERR, "Netagent uuid not previously asserted"); + return false; + } + + LIST_REMOVE(found_assertion, assertion_chain); + FREE(found_assertion, M_NECP); + return true; +} + +static int +necp_client_agent_action(struct necp_fd_data *fd_data, struct necp_client_action_args *uap, int *retval) +{ + int error = 0; + struct necp_client *matched_client = NULL; + struct necp_client *client = NULL; + uuid_t client_id; + bool acted_on_agent = FALSE; + u_int8_t *parameters = NULL; + size_t parameters_size = uap->buffer_size; + + if (uap->client_id == 0 || uap->client_id_len != sizeof(uuid_t) || + uap->buffer_size == 0 || uap->buffer == 0) { + error = EINVAL; + goto done; + } + + error = copyin(uap->client_id, client_id, sizeof(uuid_t)); + if (error) { + NECPLOG(LOG_ERR, "necp_client_agent_action copyin client_id error (%d)", error); + goto done; + } + + if ((parameters = _MALLOC(uap->buffer_size, M_NECP, M_WAITOK | M_ZERO)) == NULL) { + error = ENOMEM; + goto done; + } + + error = copyin(uap->buffer, parameters, uap->buffer_size); + if (error) { + NECPLOG(LOG_ERR, "necp_client_agent_action parameters copyin error (%d)", error); + goto done; + } + + lck_mtx_lock(&fd_data->fd_lock); + LIST_FOREACH(client, &fd_data->clients, chain) { + if (uuid_compare(client->client_id, client_id) == 0) { + matched_client = client; + break; + } + } + if 
(matched_client) { + size_t offset = 0; + while ((offset + sizeof(u_int8_t) + sizeof(u_int32_t)) <= parameters_size) { + u_int8_t type = necp_buffer_get_tlv_type(parameters, offset); + u_int32_t length = necp_buffer_get_tlv_length(parameters, offset); + + if (length > 0 && (offset + sizeof(u_int8_t) + sizeof(u_int32_t) + length) <= parameters_size) { + u_int8_t *value = necp_buffer_get_tlv_value(parameters, offset, NULL); + if (length >= sizeof(uuid_t) && + value != NULL && + (type == NECP_CLIENT_PARAMETER_TRIGGER_AGENT || + type == NECP_CLIENT_PARAMETER_ASSERT_AGENT || + type == NECP_CLIENT_PARAMETER_UNASSERT_AGENT)) { + + uuid_t agent_uuid; + uuid_copy(agent_uuid, value); + u_int8_t netagent_message_type = 0; + if (type == NECP_CLIENT_PARAMETER_TRIGGER_AGENT) { + netagent_message_type = NETAGENT_MESSAGE_TYPE_CLIENT_TRIGGER; + } else if (type == NECP_CLIENT_PARAMETER_ASSERT_AGENT) { + netagent_message_type = NETAGENT_MESSAGE_TYPE_CLIENT_ASSERT; + } else if (type == NECP_CLIENT_PARAMETER_UNASSERT_AGENT) { + netagent_message_type = NETAGENT_MESSAGE_TYPE_CLIENT_UNASSERT; + } + + // Before unasserting, verify that the assertion was already taken + if (type == NECP_CLIENT_PARAMETER_UNASSERT_AGENT) { + if (!necp_client_remove_assertion(client, agent_uuid)) { + error = ENOENT; + break; + } + } + + error = netagent_client_message(agent_uuid, client_id, + netagent_message_type); + if (error == 0) { + acted_on_agent = TRUE; + } else { + break; + } + + // Only save the assertion if the action succeeded + if (type == NECP_CLIENT_PARAMETER_ASSERT_AGENT) { + necp_client_add_assertion(client, agent_uuid); + } + } + } + + offset += sizeof(u_int8_t) + sizeof(u_int32_t) + length; + } + } + lck_mtx_unlock(&fd_data->fd_lock); + + if (!acted_on_agent && + error == 0) { + error = ENOENT; + } +done: + *retval = error; + if (parameters != NULL) { + FREE(parameters, M_NECP); + parameters = NULL; + } + + return (error); +} + +static int +necp_client_copy_agent(__unused struct necp_fd_data 
*fd_data, struct necp_client_action_args *uap, int *retval) +{ + int error = 0; + uuid_t agent_uuid; + + if (uap->client_id == 0 || uap->client_id_len != sizeof(uuid_t) || + uap->buffer_size == 0 || uap->buffer == 0) { + NECPLOG0(LOG_ERR, "necp_client_copy_agent bad input"); + error = EINVAL; + goto done; + } + + error = copyin(uap->client_id, agent_uuid, sizeof(uuid_t)); + if (error) { + NECPLOG(LOG_ERR, "necp_client_copy_agent copyin agent_uuid error (%d)", error); + goto done; + } + + error = netagent_copyout(agent_uuid, uap->buffer, uap->buffer_size); + if (error) { + NECPLOG(LOG_ERR, "necp_client_copy_agent netagent_copyout error (%d)", error); + goto done; + } +done: + *retval = error; + + return (error); +} + +static int +necp_client_copy_interface(__unused struct necp_fd_data *fd_data, struct necp_client_action_args *uap, int *retval) +{ + int error = 0; + u_int32_t interface_index = 0; + struct necp_interface_details interface_details; + + if (uap->client_id == 0 || uap->client_id_len != sizeof(u_int32_t) || + uap->buffer_size < sizeof(interface_details) || uap->buffer == 0) { + NECPLOG0(LOG_ERR, "necp_client_copy_interface bad input"); + error = EINVAL; + goto done; + } + + error = copyin(uap->client_id, &interface_index, sizeof(u_int32_t)); + if (error) { + NECPLOG(LOG_ERR, "necp_client_copy_interface copyin interface_index error (%d)", error); + goto done; + } + + if (interface_index == 0) { + error = ENOENT; + NECPLOG(LOG_ERR, "necp_client_copy_interface bad interface_index (%d)", interface_index); + goto done; + } + + memset(&interface_details, 0, sizeof(interface_details)); + + ifnet_head_lock_shared(); + ifnet_t interface = NULL; + if (interface_index != IFSCOPE_NONE && (int)interface_index <= if_index) { + interface = ifindex2ifnet[interface_index]; + } + + if (interface != NULL) { + if (interface->if_xname != NULL) { + strlcpy((char *)&interface_details.name, interface->if_xname, sizeof(interface_details.name)); + } + interface_details.index = 
interface->if_index; + interface_details.generation = ifnet_get_generation(interface); + if (interface->if_delegated.ifp != NULL) { + interface_details.delegate_index = interface->if_delegated.ifp->if_index; + } + interface_details.functional_type = if_functional_type(interface, TRUE); + if (IFNET_IS_EXPENSIVE(interface)) { + interface_details.flags |= NECP_INTERFACE_FLAG_EXPENSIVE; + } + interface_details.mtu = interface->if_mtu; + + u_int8_t ipv4_signature_len = sizeof(interface_details.ipv4_signature); + u_int16_t ipv4_signature_flags; + ifnet_get_netsignature(interface, AF_INET, &ipv4_signature_len, &ipv4_signature_flags, + (u_int8_t *)&interface_details.ipv4_signature); + + u_int8_t ipv6_signature_len = sizeof(interface_details.ipv6_signature); + u_int16_t ipv6_signature_flags; + ifnet_get_netsignature(interface, AF_INET6, &ipv6_signature_len, &ipv6_signature_flags, + (u_int8_t *)&interface_details.ipv6_signature); + } + + ifnet_head_done(); + + error = copyout(&interface_details, uap->buffer, sizeof(interface_details)); + if (error) { + NECPLOG(LOG_ERR, "necp_client_copy_interface copyout error (%d)", error); + goto done; + } +done: + *retval = error; + + return (error); +} + +static int +necp_client_stats_action(struct necp_client *client, user_addr_t buffer, user_size_t buffer_size) +{ + int error = 0; + struct necp_stats_hdr *stats_hdr = NULL; + + if (client->stats_area) { + // Close old stats if required. 
+ if ((client->stats_uaddr != buffer) || (client->stats_ulen != buffer_size)) { + necp_destroy_client_stats(client); + } + } + + if ((buffer == 0) || (buffer_size == 0)) { + goto done; + } + + if (client->stats_area) { + // An update + error = copyin(client->stats_uaddr, client->stats_area, client->stats_ulen); + if (error) { + NECPLOG(LOG_ERR, "necp_client_stats_action copyin error on update (%d)", error); + } else { + // Future use - check + stats_hdr = (necp_stats_hdr *)client->stats_area; + if (stats_hdr->necp_stats_event != 0) { + ntstat_userland_stats_event(client->stats_handler_context, (userland_stats_event_t)stats_hdr->necp_stats_event); + } + } + goto done; + } + + // A create + if ((buffer_size > sizeof(necp_all_stats)) || (buffer_size < sizeof(necp_stats_hdr))) { + error = EINVAL; + goto done; + } + + if ((stats_hdr = _MALLOC(buffer_size, M_NECP, M_WAITOK | M_ZERO)) == NULL) { + error = ENOMEM; + goto done; + } + + client->stats_handler_context = NULL; + client->stats_uaddr = buffer; + client->stats_ulen = buffer_size; + client->stats_area = stats_hdr; + error = copyin(client->stats_uaddr, client->stats_area, client->stats_ulen); + if (error) { + NECPLOG(LOG_ERR, "necp_client_stats_action copyin error on create (%d)", error); + goto done; + } + + switch (stats_hdr->necp_stats_type) { + case NECP_CLIENT_STATISTICS_TYPE_TCP: { + if (stats_hdr->necp_stats_ver == NECP_CLIENT_STATISTICS_TYPE_TCP_VER_1) { + client->stats_handler_context = ntstat_userland_stats_open((userland_stats_provider_context *)client, + NSTAT_PROVIDER_TCP_USERLAND, 0, necp_request_tcp_netstats); + if (client->stats_handler_context == NULL) { + error = EIO; + } + } else { + error = ENOTSUP; + } + break; + } + case NECP_CLIENT_STATISTICS_TYPE_UDP: { + if (stats_hdr->necp_stats_ver != NECP_CLIENT_STATISTICS_TYPE_UDP_VER_1) { + client->stats_handler_context = ntstat_userland_stats_open((userland_stats_provider_context *)client, + NSTAT_PROVIDER_UDP_USERLAND, 0, necp_request_udp_netstats); + 
if (client->stats_handler_context == NULL) { + error = EIO; + } + } else { + error = ENOTSUP; + } + break; + } + default: { + error = ENOTSUP; + break; + } + } +done: + if ((error) && (stats_hdr != NULL)) { + FREE(stats_hdr, M_NECP); + client->stats_area = NULL; + client->stats_handler_context = NULL; + client->stats_uaddr = 0; + client->stats_ulen = 0; + } + + return (error); +} + +static int +necp_client_set_statistics(__unused struct necp_fd_data *fd_data, struct necp_client_action_args *uap, int *retval) +{ + int error = 0; + struct necp_client *find_client = NULL; + struct necp_client *client = NULL; + uuid_t client_id; + + if (uap->client_id == 0 || uap->client_id_len != sizeof(uuid_t)) { + error = EINVAL; + goto done; + } + + error = copyin(uap->client_id, client_id, sizeof(uuid_t)); + if (error) { + NECPLOG(LOG_ERR, "necp_client_set_statistics copyin client_id error (%d)", error); + goto done; + } + + lck_mtx_lock(&fd_data->fd_lock); + LIST_FOREACH(find_client, &fd_data->clients, chain) { + if (uuid_compare(find_client->client_id, client_id) == 0) { + client = find_client; + break; + } + } + + if (client) { + error = necp_client_stats_action(client, uap->buffer, uap->buffer_size); + } else { + error = ENOENT; + } + lck_mtx_unlock(&fd_data->fd_lock); +done: + *retval = error; + return (error); +} + +int +necp_client_action(struct proc *p, struct necp_client_action_args *uap, int *retval) +{ +#pragma unused(p) + int error = 0; + int return_value = 0; + struct necp_fd_data *fd_data = NULL; + error = necp_find_fd_data(uap->necp_fd, &fd_data); + if (error != 0) { + NECPLOG(LOG_ERR, "necp_client_action find fd error (%d)", error); + return (error); + } + + u_int32_t action = uap->action; + switch (action) { + case NECP_CLIENT_ACTION_ADD: { + return_value = necp_client_add(fd_data, uap, retval); + break; + } + case NECP_CLIENT_ACTION_REMOVE: { + return_value = necp_client_remove(fd_data, uap, retval); + break; + } + case NECP_CLIENT_ACTION_COPY_PARAMETERS: + case 
NECP_CLIENT_ACTION_COPY_RESULT: { + return_value = necp_client_copy(fd_data, uap, retval); + break; + } + case NECP_CLIENT_ACTION_COPY_LIST: { + return_value = necp_client_list(fd_data, uap, retval); + break; + } + case NECP_CLIENT_ACTION_REQUEST_NEXUS_INSTANCE: { + return_value = necp_client_request_nexus(fd_data, uap, retval); + break; + } + case NECP_CLIENT_ACTION_AGENT: { + return_value = necp_client_agent_action(fd_data, uap, retval); + break; + } + case NECP_CLIENT_ACTION_COPY_AGENT: { + return_value = necp_client_copy_agent(fd_data, uap, retval); + break; + } + case NECP_CLIENT_ACTION_COPY_INTERFACE: { + return_value = necp_client_copy_interface(fd_data, uap, retval); + break; + } + case NECP_CLIENT_ACTION_SET_STATISTICS: { + return_value = necp_client_set_statistics(fd_data, uap, retval); + break; + } + default: { + NECPLOG(LOG_ERR, "necp_client_action unknown action (%u)", action); + return_value = EINVAL; + break; + } + } + + file_drop(uap->necp_fd); + + return (return_value); +} + +#define NECP_MAX_MATCH_POLICY_PARAMETER_SIZE 1024 + +int +necp_match_policy(struct proc *p, struct necp_match_policy_args *uap, int32_t *retval) +{ +#pragma unused(retval) + u_int8_t *parameters = NULL; + struct necp_aggregate_result returned_result; + int error = 0; + + if (uap == NULL) { + error = EINVAL; + goto done; + } + + if (uap->parameters == 0 || uap->parameters_size == 0 || uap->parameters_size > NECP_MAX_MATCH_POLICY_PARAMETER_SIZE || uap->returned_result == 0) { + error = EINVAL; + goto done; + } + + MALLOC(parameters, u_int8_t *, uap->parameters_size, M_NECP, M_WAITOK | M_ZERO); + if (parameters == NULL) { + error = ENOMEM; + goto done; + } + // Copy parameters in + error = copyin(uap->parameters, parameters, uap->parameters_size); + if (error) { + goto done; + } + + error = necp_application_find_policy_match_internal(p, parameters, uap->parameters_size, &returned_result, NULL, 0); + if (error) { + goto done; + } + + // Copy return value back + error = 
copyout(&returned_result, uap->returned_result, sizeof(struct necp_aggregate_result)); + if (error) { + goto done; + } +done: + if (parameters != NULL) { + FREE(parameters, M_NECP); + } + return (error); +} + +/// Socket operations +#define NECP_MAX_SOCKET_ATTRIBUTE_STRING_LENGTH 253 + +static bool +necp_set_socket_attribute(u_int8_t *buffer, size_t buffer_length, u_int8_t type, char **buffer_p) +{ + int error = 0; + int cursor = 0; + size_t string_size = 0; + char *local_string = NULL; + u_int8_t *value = NULL; + + cursor = necp_buffer_find_tlv(buffer, buffer_length, 0, type, 0); + if (cursor < 0) { + // This will clear out the parameter + goto done; + } + + string_size = necp_buffer_get_tlv_length(buffer, cursor); + if (string_size == 0 || string_size > NECP_MAX_SOCKET_ATTRIBUTE_STRING_LENGTH) { + // This will clear out the parameter + goto done; + } + + MALLOC(local_string, char *, string_size + 1, M_NECP, M_WAITOK | M_ZERO); + if (local_string == NULL) { + NECPLOG(LOG_ERR, "Failed to allocate a socket attribute buffer (size %d)", string_size); + goto fail; + } + + value = necp_buffer_get_tlv_value(buffer, cursor, NULL); + if (value == NULL) { + NECPLOG0(LOG_ERR, "Failed to get socket attribute"); + goto fail; + } + + memcpy(local_string, value, string_size); + local_string[string_size] = 0; + +done: + if (*buffer_p != NULL) { + FREE(*buffer_p, M_NECP); + *buffer_p = NULL; + } + + *buffer_p = local_string; + return (0); +fail: + if (local_string != NULL) { + FREE(local_string, M_NECP); + } + return (error); +} + +errno_t +necp_set_socket_attributes(struct socket *so, struct sockopt *sopt) +{ + int error = 0; + u_int8_t *buffer = NULL; + struct inpcb *inp = NULL; + + if ((SOCK_DOM(so) != PF_INET +#if INET6 + && SOCK_DOM(so) != PF_INET6 +#endif + )) { + error = EINVAL; + goto done; + } + + inp = sotoinpcb(so); + + size_t valsize = sopt->sopt_valsize; + if (valsize == 0 || + valsize > ((sizeof(u_int8_t) + sizeof(u_int32_t) + NECP_MAX_SOCKET_ATTRIBUTE_STRING_LENGTH) 
* 2)) { + goto done; + } + + MALLOC(buffer, u_int8_t *, valsize, M_NECP, M_WAITOK | M_ZERO); + if (buffer == NULL) { + goto done; + } + + error = sooptcopyin(sopt, buffer, valsize, 0); + if (error) { + goto done; + } + + error = necp_set_socket_attribute(buffer, valsize, NECP_TLV_ATTRIBUTE_DOMAIN, &inp->inp_necp_attributes.inp_domain); + if (error) { + NECPLOG0(LOG_ERR, "Could not set domain TLV for socket attributes"); + goto done; + } + + error = necp_set_socket_attribute(buffer, valsize, NECP_TLV_ATTRIBUTE_ACCOUNT, &inp->inp_necp_attributes.inp_account); + if (error) { + NECPLOG0(LOG_ERR, "Could not set account TLV for socket attributes"); + goto done; + } + + if (necp_debug) { + NECPLOG(LOG_DEBUG, "Set on socket: Domain %s, Account %s", inp->inp_necp_attributes.inp_domain, inp->inp_necp_attributes.inp_account); + } +done: + if (buffer != NULL) { + FREE(buffer, M_NECP); + } + + return (error); +} + +errno_t +necp_get_socket_attributes(struct socket *so, struct sockopt *sopt) +{ + int error = 0; + u_int8_t *buffer = NULL; + u_int8_t *cursor = NULL; + size_t valsize = 0; + struct inpcb *inp = sotoinpcb(so); + + if (inp->inp_necp_attributes.inp_domain != NULL) { + valsize += sizeof(u_int8_t) + sizeof(u_int32_t) + strlen(inp->inp_necp_attributes.inp_domain); + } + if (inp->inp_necp_attributes.inp_account != NULL) { + valsize += sizeof(u_int8_t) + sizeof(u_int32_t) + strlen(inp->inp_necp_attributes.inp_account); + } + if (valsize == 0) { + goto done; + } + + MALLOC(buffer, u_int8_t *, valsize, M_NECP, M_WAITOK | M_ZERO); + if (buffer == NULL) { + goto done; + } + + cursor = buffer; + if (inp->inp_necp_attributes.inp_domain != NULL) { + cursor = necp_buffer_write_tlv(cursor, NECP_TLV_ATTRIBUTE_DOMAIN, strlen(inp->inp_necp_attributes.inp_domain), inp->inp_necp_attributes.inp_domain); + } + + if (inp->inp_necp_attributes.inp_account != NULL) { + cursor = necp_buffer_write_tlv(cursor, NECP_TLV_ATTRIBUTE_ACCOUNT, strlen(inp->inp_necp_attributes.inp_account), 
inp->inp_necp_attributes.inp_account); + } + + error = sooptcopyout(sopt, buffer, valsize); + if (error) { + goto done; + } +done: + if (buffer != NULL) { + FREE(buffer, M_NECP); + } + + return (error); +} + +void +necp_inpcb_dispose(struct inpcb *inp) +{ + if (inp->inp_necp_attributes.inp_domain != NULL) { + FREE(inp->inp_necp_attributes.inp_domain, M_NECP); + inp->inp_necp_attributes.inp_domain = NULL; + } + if (inp->inp_necp_attributes.inp_account != NULL) { + FREE(inp->inp_necp_attributes.inp_account, M_NECP); + inp->inp_necp_attributes.inp_account = NULL; + } +} + +/// Module init + +errno_t +necp_client_init(void) +{ + errno_t result = 0; + + necp_fd_grp_attr = lck_grp_attr_alloc_init(); + if (necp_fd_grp_attr == NULL) { + NECPLOG0(LOG_ERR, "lck_grp_attr_alloc_init failed"); + result = ENOMEM; + goto done; + } + + necp_fd_mtx_grp = lck_grp_alloc_init("necp_fd", necp_fd_grp_attr); + if (necp_fd_mtx_grp == NULL) { + NECPLOG0(LOG_ERR, "lck_grp_alloc_init failed"); + result = ENOMEM; + goto done; + } + + necp_fd_mtx_attr = lck_attr_alloc_init(); + if (necp_fd_mtx_attr == NULL) { + NECPLOG0(LOG_ERR, "lck_attr_alloc_init failed"); + result = ENOMEM; + goto done; + } + + necp_client_tcall = thread_call_allocate(necp_update_all_clients_callout, NULL); + if (necp_client_tcall == NULL) { + NECPLOG0(LOG_ERR, "thread_call_allocate failed"); + result = ENOMEM; + goto done; + } + + lck_rw_init(&necp_fd_lock, necp_fd_mtx_grp, necp_fd_mtx_attr); + + LIST_INIT(&necp_fd_list); + +done: + if (result != 0) { + if (necp_fd_mtx_attr != NULL) { + lck_attr_free(necp_fd_mtx_attr); + necp_fd_mtx_attr = NULL; + } + if (necp_fd_mtx_grp != NULL) { + lck_grp_free(necp_fd_mtx_grp); + necp_fd_mtx_grp = NULL; + } + if (necp_fd_grp_attr != NULL) { + lck_grp_attr_free(necp_fd_grp_attr); + necp_fd_grp_attr = NULL; + } + } + return (result); +} diff --git a/bsd/net/net_kev.h b/bsd/net/net_kev.h new file mode 100644 index 000000000..ba1de1cbe --- /dev/null +++ b/bsd/net/net_kev.h @@ -0,0 +1,134 
@@ +/* + * Copyright (c) 2016 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _NET_NETKEV_H_ +#define _NET_NETKEV_H_ + +#if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) + +/* Kernel event subclass identifiers for KEV_NETWORK_CLASS */ +#define KEV_INET_SUBCLASS 1 /* inet subclass */ +/* KEV_INET_SUBCLASS event codes */ +#define KEV_INET_NEW_ADDR 1 /* Userland configured IP address */ +#define KEV_INET_CHANGED_ADDR 2 /* Address changed event */ +#define KEV_INET_ADDR_DELETED 3 /* IPv4 address was deleted */ +#define KEV_INET_SIFDSTADDR 4 /* Dest. 
address was set */ +#define KEV_INET_SIFBRDADDR 5 /* Broadcast address was set */ +#define KEV_INET_SIFNETMASK 6 /* Netmask was set */ +#define KEV_INET_ARPCOLLISION 7 /* ARP collision detected */ +#ifdef __APPLE_API_PRIVATE +#define KEV_INET_PORTINUSE 8 /* use kev_in_portinuse */ +#endif +#define KEV_INET_ARPRTRFAILURE 9 /* ARP resolution failed for router */ +#define KEV_INET_ARPRTRALIVE 10 /* ARP resolution succeeded for router */ + +#define KEV_DL_SUBCLASS 2 /* Data Link subclass */ +/* + * Define Data-Link event subclass, and associated + * events. + */ +#define KEV_DL_SIFFLAGS 1 +#define KEV_DL_SIFMETRICS 2 +#define KEV_DL_SIFMTU 3 +#define KEV_DL_SIFPHYS 4 +#define KEV_DL_SIFMEDIA 5 +#define KEV_DL_SIFGENERIC 6 +#define KEV_DL_ADDMULTI 7 +#define KEV_DL_DELMULTI 8 +#define KEV_DL_IF_ATTACHED 9 +#define KEV_DL_IF_DETACHING 10 +#define KEV_DL_IF_DETACHED 11 +#define KEV_DL_LINK_OFF 12 +#define KEV_DL_LINK_ON 13 +#define KEV_DL_PROTO_ATTACHED 14 +#define KEV_DL_PROTO_DETACHED 15 +#define KEV_DL_LINK_ADDRESS_CHANGED 16 +#define KEV_DL_WAKEFLAGS_CHANGED 17 +#define KEV_DL_IF_IDLE_ROUTE_REFCNT 18 +#define KEV_DL_IFCAP_CHANGED 19 +#define KEV_DL_LINK_QUALITY_METRIC_CHANGED 20 +#define KEV_DL_NODE_PRESENCE 21 +#define KEV_DL_NODE_ABSENCE 22 +#define KEV_DL_MASTER_ELECTED 23 +#define KEV_DL_ISSUES 24 +#define KEV_DL_IFDELEGATE_CHANGED 25 +#define KEV_DL_AWDL_RESTRICTED 26 +#define KEV_DL_AWDL_UNRESTRICTED 27 +#define KEV_DL_RRC_STATE_CHANGED 28 +#define KEV_DL_QOS_MODE_CHANGED 29 + +#ifdef PRIVATE +#define KEV_NETPOLICY_SUBCLASS 3 /* Network policy subclass */ +/* KEV_NETPOLICY_SUBCLASS event codes */ +#define KEV_NETPOLICY_IFDENIED 1 /* denied access to interface */ +#define KEV_NETPOLICY_IFFAILED 2 /* failed to bring up interface */ + +#define KEV_SOCKET_SUBCLASS 4 /* Socket subclass */ +/* KEV_SOCKET_SUBCLASS event codes */ +#define KEV_SOCKET_CLOSED 1 /* completely closed by protocol */ +#endif /* PRIVATE */ + +#define KEV_INET6_SUBCLASS 6 /* inet6 subclass */ 
+/* KEV_INET6_SUBCLASS event codes */ +#define KEV_INET6_NEW_USER_ADDR 1 /* Userland configured IPv6 address */ +#define KEV_INET6_CHANGED_ADDR 2 /* Address changed event (future) */ +#define KEV_INET6_ADDR_DELETED 3 /* IPv6 address was deleted */ +#define KEV_INET6_NEW_LL_ADDR 4 /* Autoconf LL address appeared */ +#define KEV_INET6_NEW_RTADV_ADDR 5 /* Autoconf address has appeared */ +#define KEV_INET6_DEFROUTER 6 /* Default router detected */ + +#ifdef PRIVATE +#define KEV_ND6_SUBCLASS 7 /* IPv6 NDP subclass */ +/* KEV_ND6_SUBCLASS event codes */ +#define KEV_ND6_RA 1 +#define KEV_ND6_NDFAILURE 2 /* IPv6 neighbor cache entry expiry */ +#define KEV_ND6_NDALIVE 3 /* IPv6 neighbor reachable */ + +#define KEV_NECP_SUBCLASS 8 /* NECP subclass */ +/* KEV_NECP_SUBCLASS event codes */ +#define KEV_NECP_POLICIES_CHANGED 1 + +#define KEV_NETAGENT_SUBCLASS 9 /* Net-Agent subclass */ +/* Network Agent kernel event codes */ +#define KEV_NETAGENT_REGISTERED 1 +#define KEV_NETAGENT_UNREGISTERED 2 +#define KEV_NETAGENT_UPDATED 3 +#define KEV_NETAGENT_UPDATED_INTERFACES 4 + +#define KEV_LOG_SUBCLASS 10 /* Log subclass */ +/* KEV_LOG_SUBCLASS event codes */ +#define IPFWLOGEVENT 0 + +#define KEV_NETEVENT_SUBCLASS 11 /* Generic Net events subclass */ +/* KEV_NETEVENT_SUBCLASS event codes */ +#define KEV_NETEVENT_APNFALLBACK 1 +#endif /* PRIVATE */ + +#endif /* (!_POSIX_C_SOURCE || _DARWIN_C_SOURCE) */ + +#endif /* _NET_NETKEV_H_ */ diff --git a/bsd/net/net_stubs.c b/bsd/net/net_stubs.c index 052ae2ffe..31ac0627a 100644 --- a/bsd/net/net_stubs.c +++ b/bsd/net/net_stubs.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2012-2015 Apple Inc. All rights reserved. + * Copyright (c) 2012-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). 
You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ @@ -30,12 +30,12 @@ #if !NETWORKING -#define STUB(name) \ +#define STUB(name) \ int name(void); \ int name(void) \ { \ panic("stub called in a config with no networking"); \ - return 0; \ + return (0); \ } STUB(bpf_attach); @@ -245,6 +245,18 @@ STUB(mbuf_find_drvaux); STUB(mbuf_del_drvaux); STUB(mbuf_trailingspace); STUB(mbuf_type); +STUB(mbuf_get_flowid); +STUB(mbuf_set_flowid); +STUB(mbuf_get_timestamp); +STUB(mbuf_set_timestamp); +STUB(mbuf_get_tx_compl_data); +STUB(mbuf_set_tx_compl_data); +STUB(mbuf_get_status); +STUB(mbuf_set_status); +STUB(mbuf_get_timestamp_requested); +STUB(mbuf_set_timestamp_requested); +STUB(mbuf_register_tx_compl_callback); +STUB(mbuf_unregister_tx_compl_callback); STUB(net_init_add); STUB(proto_inject); STUB(proto_input); @@ -293,6 +305,7 @@ STUB(ifnet_clone_attach); STUB(ifnet_clone_detach); STUB(ifnet_dequeue); STUB(ifnet_dequeue_multi); +STUB(ifnet_dequeue_multi_bytes); STUB(ifnet_dequeue_service_class); STUB(ifnet_dequeue_service_class_multi); STUB(ifnet_enqueue); @@ -327,6 +340,7 @@ STUB(ifnet_start); STUB(ifnet_transmit_burst_end); STUB(ifnet_transmit_burst_start); 
STUB(ifnet_tx_compl_status); +STUB(ifnet_tx_compl); STUB(ifnet_flowid); STUB(ifnet_enable_output); STUB(ifnet_disable_output); @@ -335,6 +349,11 @@ STUB(ifnet_link_status_report); STUB(ifnet_set_packetpreamblelen); STUB(ifnet_packetpreamblelen); STUB(ifnet_maxpacketpreamblelen); +STUB(ifnet_set_fastlane_capable); +STUB(ifnet_get_fastlane_capable); +STUB(ifnet_get_unsent_bytes); +STUB(ifnet_get_buffer_status); +STUB(ifnet_normalise_unsent_data); STUB(in6_localaddr); STUB(in_localaddr); STUB(in6addr_local); @@ -353,6 +372,9 @@ STUB(m_split); STUB(m_trailingspace); STUB(mbuf_get_driver_scratch); STUB(mbuf_get_unsent_data_bytes); +STUB(mbuf_get_buffer_status); +STUB(mbuf_pkt_new_flow); +STUB(mbuf_last_pkt); STUB(mbuf_get_priority); STUB(mbuf_get_service_class); STUB(mbuf_get_service_class_index); @@ -434,13 +456,13 @@ STUB(ip_gre_register_input); STUB(sock_iskernel); #undef STUB -/* +/* * Called from vm_pageout.c. Nothing to be done when there's no networking. */ void m_drain(void); void m_drain(void) { - return; + return; } #endif /* !NETWORKING */ diff --git a/bsd/net/network_agent.c b/bsd/net/network_agent.c index 685437908..f05d08cec 100644 --- a/bsd/net/network_agent.c +++ b/bsd/net/network_agent.c @@ -43,6 +43,7 @@ #include #include #include +#include u_int32_t netagent_debug = LOG_NOTICE; // 0=None, 1=Basic @@ -67,25 +68,38 @@ SYSCTL_INT(_net_netagent, OID_AUTO, active_count , CTLFLAG_RD | CTLFLAG_LOCKED, log((level > LOG_NOTICE ? 
LOG_NOTICE : level), "%s: %s\n", __FUNCTION__, msg); \ } while (0) -struct netagent_assertion { - LIST_ENTRY(netagent_assertion) assertion_chain; - uuid_t asserted_uuid; +struct netagent_client { + LIST_ENTRY(netagent_client) client_chain; + uuid_t client_id; + uuid_t client_proc_uuid; + pid_t client_pid; }; +LIST_HEAD(netagent_client_list_s, netagent_client); + struct netagent_wrapper { LIST_ENTRY(netagent_wrapper) master_chain; u_int32_t control_unit; + u_int32_t generation; + struct netagent_client_list_s pending_triggers_list; struct netagent netagent; }; struct netagent_session { u_int32_t control_unit; struct netagent_wrapper *wrapper; - LIST_HEAD(_netagent_assertion_list, netagent_assertion) assertion_list; }; +typedef enum { + kNetagentErrorDomainPOSIX = 0, + kNetagentErrorDomainUserDefined = 1, +} netagent_error_domain_t; + static LIST_HEAD(_netagent_list, netagent_wrapper) master_netagent_list; +// Protected by netagent_lock +static u_int32_t g_next_generation = 1; + static kern_ctl_ref netagent_kctlref; static u_int32_t netagent_family; static OSMallocTag netagent_malloc_tag; @@ -111,18 +125,32 @@ static int netagent_send_ctl_data(u_int32_t control_unit, u_int8_t *buffer, size static struct netagent_session *netagent_create_session(u_int32_t control_unit); static void netagent_delete_session(struct netagent_session *session); -static void netagent_handle_register(struct netagent_session *session, u_int32_t message_id, - u_int32_t payload_length, mbuf_t packet, int offset); -static void netagent_handle_unregister(struct netagent_session *session, u_int32_t message_id, - u_int32_t payload_length, mbuf_t packet, int offset); -static void netagent_handle_update(struct netagent_session *session, u_int32_t message_id, - u_int32_t payload_length, mbuf_t packet, int offset); +// Register +static void netagent_handle_register_message(struct netagent_session *session, u_int32_t message_id, + u_int32_t payload_length, mbuf_t packet, int offset); +static errno_t 
netagent_handle_register_setopt(struct netagent_session *session, u_int8_t *payload, + u_int32_t payload_length); + +// Unregister +static void netagent_handle_unregister_message(struct netagent_session *session, u_int32_t message_id, + u_int32_t payload_length, mbuf_t packet, int offset); +static errno_t netagent_handle_unregister_setopt(struct netagent_session *session, u_int8_t *payload, + u_int32_t payload_length); + +// Update +static void netagent_handle_update_message(struct netagent_session *session, u_int32_t message_id, + u_int32_t payload_length, mbuf_t packet, int offset); +static errno_t netagent_handle_update_setopt(struct netagent_session *session, u_int8_t *payload, + u_int32_t payload_length); + +// Assign nexus +static void netagent_handle_assign_nexus_message(struct netagent_session *session, u_int32_t message_id, + u_int32_t payload_length, mbuf_t packet, int offset); +static errno_t netagent_handle_assign_nexus_setopt(struct netagent_session *session, u_int8_t *payload, + u_int32_t payload_length); + static void netagent_handle_get(struct netagent_session *session, u_int32_t message_id, u_int32_t payload_length, mbuf_t packet, int offset); -static void netagent_handle_assert(struct netagent_session *session, u_int32_t message_id, - u_int32_t payload_length, mbuf_t packet, int offset); -static void netagent_handle_unassert(struct netagent_session *session, u_int32_t message_id, - u_int32_t payload_length, mbuf_t packet, int offset); static struct netagent_wrapper *netagent_find_agent_with_uuid(uuid_t uuid); @@ -248,8 +276,12 @@ netagent_ctl_disconnect(kern_ctl_ref kctlref, u_int32_t unit, void *unitinfo) // Kernel events static void -netagent_post_event(uuid_t agent_uuid, u_int32_t event_code) +netagent_post_event(uuid_t agent_uuid, u_int32_t event_code, bool update_necp) { + if (update_necp) { + necp_update_all_clients(); + } + struct kev_msg ev_msg; memset(&ev_msg, 0, sizeof(ev_msg)); @@ -323,6 +355,32 @@ netagent_send_trigger(struct 
netagent_wrapper *wrapper, struct proc *p, u_int32_ return (error); } +static int +netagent_send_client_message(struct netagent_wrapper *wrapper, uuid_t client_id, u_int8_t message_type) +{ + int error = 0; + struct netagent_client_message *client_message = NULL; + u_int8_t *message = NULL; + size_t message_size = sizeof(struct netagent_message_header) + sizeof(struct netagent_client_message); + + MALLOC(message, u_int8_t *, message_size, M_NETAGENT, M_WAITOK); + if (message == NULL) { + return (ENOMEM); + } + + (void)netagent_buffer_write_message_header(message, message_type, 0, 0, 0, sizeof(struct netagent_client_message)); + + client_message = (struct netagent_client_message *)(void *)(message + sizeof(struct netagent_message_header)); + uuid_copy(client_message->client_id, client_id); + + if ((error = netagent_send_ctl_data(wrapper->control_unit, (u_int8_t *)message, message_size))) { + NETAGENTLOG(LOG_ERR, "Failed to send client message %d on control unit %d", message_type, wrapper->control_unit); + } + + FREE(message, M_NETAGENT); + return (error); +} + static int netagent_send_success_response(struct netagent_session *session, u_int8_t message_type, u_int32_t message_id) { @@ -395,17 +453,17 @@ netagent_ctl_send(kern_ctl_ref kctlref, u_int32_t unit, void *unitinfo, mbuf_t p switch (header.message_type) { case NETAGENT_MESSAGE_TYPE_REGISTER: { - netagent_handle_register(session, header.message_id, header.message_payload_length, - packet, sizeof(header)); + netagent_handle_register_message(session, header.message_id, header.message_payload_length, + packet, sizeof(header)); break; } case NETAGENT_MESSAGE_TYPE_UNREGISTER: { - netagent_handle_unregister(session, header.message_id, header.message_payload_length, - packet, sizeof(header)); + netagent_handle_unregister_message(session, header.message_id, header.message_payload_length, + packet, sizeof(header)); break; } case NETAGENT_MESSAGE_TYPE_UPDATE: { - netagent_handle_update(session, header.message_id, 
header.message_payload_length, + netagent_handle_update_message(session, header.message_id, header.message_payload_length, packet, sizeof(header)); break; } @@ -415,13 +473,16 @@ netagent_ctl_send(kern_ctl_ref kctlref, u_int32_t unit, void *unitinfo, mbuf_t p break; } case NETAGENT_MESSAGE_TYPE_ASSERT: { - netagent_handle_assert(session, header.message_id, header.message_payload_length, - packet, sizeof(header)); + NETAGENTLOG0(LOG_ERR, "NETAGENT_MESSAGE_TYPE_ASSERT no longer supported"); break; } case NETAGENT_MESSAGE_TYPE_UNASSERT: { - netagent_handle_unassert(session, header.message_id, header.message_payload_length, - packet, sizeof(header)); + NETAGENTLOG0(LOG_ERR, "NETAGENT_MESSAGE_TYPE_UNASSERT no longer supported"); + break; + } + case NETAGENT_MESSAGE_TYPE_ASSIGN_NEXUS: { + netagent_handle_assign_nexus_message(session, header.message_id, header.message_payload_length, + packet, sizeof(header)); break; } default: { @@ -456,8 +517,45 @@ static errno_t netagent_ctl_setopt(kern_ctl_ref kctlref, u_int32_t unit, void *unitinfo, int opt, void *data, size_t len) { -#pragma unused(kctlref, unit, unitinfo, opt, data, len) - return (0); +#pragma unused(kctlref, unit) + struct netagent_session *session = (struct netagent_session *)unitinfo; + errno_t error; + + if (session == NULL) { + NETAGENTLOG0(LOG_ERR, "Received a NULL session"); + error = EINVAL; + goto done; + } + + switch (opt) { + case NETAGENT_OPTION_TYPE_REGISTER: { + NETAGENTLOG0(LOG_DEBUG, "Request for registration"); + error = netagent_handle_register_setopt(session, data, len); + } + break; + case NETAGENT_OPTION_TYPE_UPDATE: { + NETAGENTLOG0(LOG_DEBUG, "Request for update"); + error = netagent_handle_update_setopt(session, data, len); + } + break; + case NETAGENT_OPTION_TYPE_UNREGISTER: { + NETAGENTLOG0(LOG_DEBUG, "Request for unregistration"); + error = netagent_handle_unregister_setopt(session, data, len); + } + break; + case NETAGENT_OPTION_TYPE_ASSIGN_NEXUS: { + NETAGENTLOG0(LOG_DEBUG, "Request for 
assigning nexus"); + error = netagent_handle_assign_nexus_setopt(session, data, len); + } + break; + default: + NETAGENTLOG0(LOG_ERR, "Received unknown option"); + error = ENOPROTOOPT; + break; + } + +done: + return (error); } // Session Management @@ -473,12 +571,26 @@ netagent_create_session(u_int32_t control_unit) NETAGENTLOG(LOG_DEBUG, "Create agent session, control unit %d", control_unit); memset(new_session, 0, sizeof(*new_session)); new_session->control_unit = control_unit; - LIST_INIT(&new_session->assertion_list); new_session->wrapper = NULL; done: return (new_session); } +static void +netagent_free_wrapper(struct netagent_wrapper *wrapper) +{ + // Free any pending client triggers + struct netagent_client *search_client = NULL; + struct netagent_client *temp_client = NULL; + LIST_FOREACH_SAFE(search_client, &wrapper->pending_triggers_list, client_chain, temp_client) { + LIST_REMOVE(search_client, client_chain); + FREE(search_client, M_NETAGENT); + } + + // Free wrapper itself + FREE(wrapper, M_NETAGENT); +} + static void netagent_unregister_session_wrapper(struct netagent_session *session) { @@ -502,7 +614,7 @@ netagent_unregister_session_wrapper(struct netagent_session *session) unregistered = TRUE; uuid_copy(unregistered_uuid, session->wrapper->netagent.netagent_uuid); - FREE(wrapper, M_NETAGENT); + netagent_free_wrapper(session->wrapper); session->wrapper = NULL; NETAGENTLOG0(LOG_DEBUG, "Unregistered agent"); } @@ -510,8 +622,8 @@ netagent_unregister_session_wrapper(struct netagent_session *session) lck_rw_done(&netagent_lock); if (unregistered) { - netagent_post_event(unregistered_uuid, KEV_NETAGENT_UNREGISTERED); ifnet_clear_netagent(unregistered_uuid); + netagent_post_event(unregistered_uuid, KEV_NETAGENT_UNREGISTERED, TRUE); } } @@ -520,21 +632,6 @@ netagent_delete_session(struct netagent_session *session) { if (session != NULL) { netagent_unregister_session_wrapper(session); - - // Unassert any pending assertions - 
lck_rw_lock_shared(&netagent_lock); - struct netagent_assertion *search_assertion = NULL; - struct netagent_assertion *temp_assertion = NULL; - LIST_FOREACH_SAFE(search_assertion, &session->assertion_list, assertion_chain, temp_assertion) { - struct netagent_wrapper *wrapper = netagent_find_agent_with_uuid(search_assertion->asserted_uuid); - if (wrapper != NULL) { - netagent_send_trigger(wrapper, current_proc(), NETAGENT_TRIGGER_FLAG_USER, NETAGENT_MESSAGE_TYPE_TRIGGER_UNASSERT); - } - LIST_REMOVE(search_assertion, assertion_chain); - FREE(search_assertion, M_NETAGENT); - } - lck_rw_done(&netagent_lock); - FREE(session, M_NETAGENT); } } @@ -558,9 +655,97 @@ netagent_packet_get_netagent_data_size(mbuf_t packet, int offset, int *err) return (netagent_peek.netagent_data_size); } +static errno_t +netagent_handle_register_inner(struct netagent_session *session, struct netagent_wrapper *new_wrapper) +{ + lck_rw_lock_exclusive(&netagent_lock); + + new_wrapper->control_unit = session->control_unit; + new_wrapper->generation = g_next_generation++; + + session->wrapper = new_wrapper; + LIST_INSERT_HEAD(&master_netagent_list, new_wrapper, master_chain); + LIST_INIT(&new_wrapper->pending_triggers_list); + + new_wrapper->netagent.netagent_flags |= NETAGENT_FLAG_REGISTERED; + netagent_registered_count++; + if (new_wrapper->netagent.netagent_flags & NETAGENT_FLAG_ACTIVE) { + netagent_active_count++; + } + + lck_rw_done(&netagent_lock); + + return 0; +} + +static errno_t +netagent_handle_register_setopt(struct netagent_session *session, u_int8_t *payload, + u_int32_t payload_length) +{ + int data_size = 0; + struct netagent_wrapper *new_wrapper = NULL; + u_int32_t response_error = 0; + struct netagent *register_netagent = (struct netagent *)(void *)payload; + + if (session == NULL) { + NETAGENTLOG0(LOG_ERR, "Failed to find session"); + response_error = EINVAL; + goto done; + } + + if (payload == NULL) { + NETAGENTLOG0(LOG_ERR, "No payload received"); + response_error = EINVAL; + 
goto done; + } + + if (session->wrapper != NULL) { + NETAGENTLOG0(LOG_ERR, "Session already has a registered agent"); + response_error = EINVAL; + goto done; + } + + if (payload_length < sizeof(struct netagent)) { + NETAGENTLOG(LOG_ERR, "Register message size too small for agent: (%d < %d)", + payload_length, sizeof(struct netagent)); + response_error = EINVAL; + goto done; + } + + data_size = register_netagent->netagent_data_size; + if (data_size < 0 || data_size > NETAGENT_MAX_DATA_SIZE) { + NETAGENTLOG(LOG_ERR, "Register message size could not be read, data_size %d", + data_size); + response_error = EINVAL; + goto done; + } + + MALLOC(new_wrapper, struct netagent_wrapper *, sizeof(*new_wrapper) + data_size, M_NETAGENT, M_WAITOK); + if (new_wrapper == NULL) { + NETAGENTLOG0(LOG_ERR, "Failed to allocate agent"); + response_error = ENOMEM; + goto done; + } + + memset(new_wrapper, 0, sizeof(*new_wrapper) + data_size); + memcpy(&new_wrapper->netagent, register_netagent, sizeof(struct netagent) + data_size); + + response_error = netagent_handle_register_inner(session, new_wrapper); + if (response_error != 0) { + FREE(new_wrapper, M_NETAGENT); + goto done; + } + + NETAGENTLOG0(LOG_DEBUG, "Registered new agent"); + netagent_post_event(new_wrapper->netagent.netagent_uuid, KEV_NETAGENT_REGISTERED, TRUE); + +done: + return response_error; +} + static void -netagent_handle_register(struct netagent_session *session, u_int32_t message_id, - u_int32_t payload_length, mbuf_t packet, int offset) +netagent_handle_register_message(struct netagent_session *session, u_int32_t message_id, + u_int32_t payload_length, mbuf_t packet, int offset) { int error; int data_size = 0; @@ -614,32 +799,38 @@ netagent_handle_register(struct netagent_session *session, u_int32_t message_id, goto fail; } - lck_rw_lock_exclusive(&netagent_lock); - - new_wrapper->control_unit = session->control_unit; - - session->wrapper = new_wrapper; - LIST_INSERT_HEAD(&master_netagent_list, new_wrapper, 
master_chain); - - new_wrapper->netagent.netagent_flags |= NETAGENT_FLAG_REGISTERED; - netagent_registered_count++; - if (new_wrapper->netagent.netagent_flags & NETAGENT_FLAG_ACTIVE) { - netagent_active_count++; - } - - lck_rw_done(&netagent_lock); + (void)netagent_handle_register_inner(session, new_wrapper); NETAGENTLOG0(LOG_DEBUG, "Registered new agent"); netagent_send_success_response(session, NETAGENT_MESSAGE_TYPE_REGISTER, message_id); - netagent_post_event(new_wrapper->netagent.netagent_uuid, KEV_NETAGENT_REGISTERED); + netagent_post_event(new_wrapper->netagent.netagent_uuid, KEV_NETAGENT_REGISTERED, TRUE); return; fail: netagent_send_error_response(session, NETAGENT_MESSAGE_TYPE_REGISTER, message_id, response_error); } +static errno_t +netagent_handle_unregister_setopt(struct netagent_session *session, u_int8_t *payload, + u_int32_t payload_length) +{ +#pragma unused(payload, payload_length) + u_int32_t response_error = 0; + + if (session == NULL) { + NETAGENTLOG0(LOG_ERR, "Failed to find session"); + response_error = EINVAL; + goto done; + } + + netagent_unregister_session_wrapper(session); + +done: + return response_error; +} + static void -netagent_handle_unregister(struct netagent_session *session, u_int32_t message_id, - u_int32_t payload_length, mbuf_t packet, int offset) +netagent_handle_unregister_message(struct netagent_session *session, u_int32_t message_id, + u_int32_t payload_length, mbuf_t packet, int offset) { #pragma unused(payload_length, packet, offset) u_int32_t response_error = NETAGENT_MESSAGE_ERROR_INTERNAL; @@ -659,15 +850,182 @@ netagent_handle_unregister(struct netagent_session *session, u_int32_t message_i } static void -netagent_handle_update(struct netagent_session *session, u_int32_t message_id, - u_int32_t payload_length, mbuf_t packet, int offset) +netagent_send_cellular_failed_event(struct netagent_wrapper *wrapper, + pid_t pid, uuid_t proc_uuid) +{ + if (strncmp(wrapper->netagent.netagent_domain, "Cellular", 
NETAGENT_DOMAINSIZE) != 0) { + return; + } + + struct kev_netpolicy_ifdenied ev_ifdenied; + + bzero(&ev_ifdenied, sizeof(ev_ifdenied)); + + ev_ifdenied.ev_data.epid = pid; + uuid_copy(ev_ifdenied.ev_data.euuid, proc_uuid); + ev_ifdenied.ev_if_functional_type = IFRTYPE_FUNCTIONAL_CELLULAR; + + netpolicy_post_msg(KEV_NETPOLICY_IFFAILED, &ev_ifdenied.ev_data, sizeof(ev_ifdenied)); +} + +static errno_t +netagent_handle_update_inner(struct netagent_session *session, struct netagent_wrapper *new_wrapper, u_int32_t data_size, u_int8_t *agent_changed, netagent_error_domain_t error_domain) +{ + u_int32_t response_error = 0; + + if (agent_changed == NULL) { + NETAGENTLOG0(LOG_ERR, "Invalid argument: agent_changed"); + return EINVAL; + } + + lck_rw_lock_exclusive(&netagent_lock); + + if (uuid_compare(session->wrapper->netagent.netagent_uuid, new_wrapper->netagent.netagent_uuid) != 0 || + memcmp(&session->wrapper->netagent.netagent_domain, &new_wrapper->netagent.netagent_domain, + sizeof(new_wrapper->netagent.netagent_domain)) != 0 || + memcmp(&session->wrapper->netagent.netagent_type, &new_wrapper->netagent.netagent_type, + sizeof(new_wrapper->netagent.netagent_type)) != 0) { + lck_rw_done(&netagent_lock); + NETAGENTLOG0(LOG_ERR, "Basic agent parameters do not match, cannot update"); + if (error_domain == kNetagentErrorDomainPOSIX) { + response_error = EINVAL; + } else if (error_domain == kNetagentErrorDomainUserDefined) { + response_error = NETAGENT_MESSAGE_ERROR_CANNOT_UPDATE; + } + return response_error; + } + + new_wrapper->netagent.netagent_flags |= NETAGENT_FLAG_REGISTERED; + if (session->wrapper->netagent.netagent_data_size == new_wrapper->netagent.netagent_data_size && + memcmp(&session->wrapper->netagent, &new_wrapper->netagent, sizeof(struct netagent) + data_size) == 0) { + // Agent is exactly identical, don't increment the generation count + + // Make a copy of the list of pending clients, and clear the current list + struct netagent_client_list_s 
pending_triggers_list_copy; + LIST_INIT(&pending_triggers_list_copy); + struct netagent_client *search_client = NULL; + struct netagent_client *temp_client = NULL; + LIST_FOREACH_SAFE(search_client, &session->wrapper->pending_triggers_list, client_chain, temp_client) { + LIST_REMOVE(search_client, client_chain); + LIST_INSERT_HEAD(&pending_triggers_list_copy, search_client, client_chain); + } + lck_rw_done(&netagent_lock); + + // Update pending client triggers without holding a lock + search_client = NULL; + temp_client = NULL; + LIST_FOREACH_SAFE(search_client, &pending_triggers_list_copy, client_chain, temp_client) { + necp_force_update_client(search_client->client_id, session->wrapper->netagent.netagent_uuid); + netagent_send_cellular_failed_event(new_wrapper, search_client->client_pid, search_client->client_proc_uuid); + LIST_REMOVE(search_client, client_chain); + FREE(search_client, M_NETAGENT); + } + NETAGENTLOG0(LOG_DEBUG, "Updated agent (no changes)"); + *agent_changed = FALSE; + return response_error; + } + + new_wrapper->generation = g_next_generation++; + + if ((new_wrapper->netagent.netagent_flags & NETAGENT_FLAG_ACTIVE) && + !(session->wrapper->netagent.netagent_flags & NETAGENT_FLAG_ACTIVE)) { + netagent_active_count++; + } else if (!(new_wrapper->netagent.netagent_flags & NETAGENT_FLAG_ACTIVE) && + (session->wrapper->netagent.netagent_flags & NETAGENT_FLAG_ACTIVE) && + netagent_active_count > 0) { + netagent_active_count--; + } + + LIST_REMOVE(session->wrapper, master_chain); + netagent_free_wrapper(session->wrapper); + session->wrapper = new_wrapper; + new_wrapper->control_unit = session->control_unit; + LIST_INSERT_HEAD(&master_netagent_list, new_wrapper, master_chain); + LIST_INIT(&new_wrapper->pending_triggers_list); + + NETAGENTLOG0(LOG_DEBUG, "Updated agent"); + *agent_changed = TRUE; + + lck_rw_done(&netagent_lock); + + return response_error; +} + +static errno_t +netagent_handle_update_setopt(struct netagent_session *session, u_int8_t 
*payload, u_int32_t payload_length) +{ + u_int32_t data_size = 0; + struct netagent_wrapper *new_wrapper = NULL; + errno_t response_error = 0; + struct netagent *update_netagent = (struct netagent *)(void *)payload; + u_int8_t agent_changed; + + if (session == NULL) { + NETAGENTLOG0(LOG_ERR, "Failed to find session"); + response_error = EINVAL; + goto done; + } + + if (payload == NULL) { + NETAGENTLOG0(LOG_ERR, "No payload received"); + response_error = EINVAL; + goto done; + } + + if (session->wrapper == NULL) { + NETAGENTLOG0(LOG_ERR, "Session has no agent to update"); + response_error = ENOENT; + goto done; + } + + if (payload_length < sizeof(struct netagent)) { + NETAGENTLOG(LOG_ERR, "Update message size too small for agent: (%d < %d)", + payload_length, sizeof(struct netagent)); + response_error = EINVAL; + goto done; + } + + data_size = update_netagent->netagent_data_size; + if (data_size > NETAGENT_MAX_DATA_SIZE) { + NETAGENTLOG(LOG_ERR, "Update message size (%u > %u) too large", data_size, NETAGENT_MAX_DATA_SIZE); + response_error = EINVAL; + goto done; + } + + MALLOC(new_wrapper, struct netagent_wrapper *, sizeof(*new_wrapper) + data_size, M_NETAGENT, M_WAITOK); + if (new_wrapper == NULL) { + NETAGENTLOG0(LOG_ERR, "Failed to allocate agent"); + response_error = ENOMEM; + goto done; + } + + memset(new_wrapper, 0, sizeof(*new_wrapper) + data_size); + memcpy(&new_wrapper->netagent, update_netagent, sizeof(struct netagent) + data_size); + + response_error = netagent_handle_update_inner(session, new_wrapper, data_size, &agent_changed, kNetagentErrorDomainPOSIX); + if (response_error == 0) { + netagent_post_event(session->wrapper->netagent.netagent_uuid, KEV_NETAGENT_UPDATED, agent_changed); + if (agent_changed == FALSE) { + // The session wrapper does not need the "new_wrapper" as nothing changed + FREE(new_wrapper, M_NETAGENT); + } + } else { + FREE(new_wrapper, M_NETAGENT); + } + +done: + return response_error; +} + +static void 
+netagent_handle_update_message(struct netagent_session *session, u_int32_t message_id, + u_int32_t payload_length, mbuf_t packet, int offset) { int error; int data_size = 0; struct netagent_wrapper *new_wrapper = NULL; u_int32_t response_error = NETAGENT_MESSAGE_ERROR_INTERNAL; - uuid_t netagent_uuid; - uuid_clear(netagent_uuid); + u_int8_t agent_changed; if (session == NULL) { NETAGENTLOG0(LOG_ERR, "Failed to find session"); @@ -713,41 +1071,20 @@ netagent_handle_update(struct netagent_session *session, u_int32_t message_id, goto fail; } - lck_rw_lock_exclusive(&netagent_lock); - - if (uuid_compare(session->wrapper->netagent.netagent_uuid, new_wrapper->netagent.netagent_uuid) != 0 || - memcmp(&session->wrapper->netagent.netagent_domain, &new_wrapper->netagent.netagent_domain, - sizeof(new_wrapper->netagent.netagent_domain)) != 0 || - memcmp(&session->wrapper->netagent.netagent_type, &new_wrapper->netagent.netagent_type, - sizeof(new_wrapper->netagent.netagent_type)) != 0) { - NETAGENTLOG0(LOG_ERR, "Basic agent parameters do not match, cannot update"); + response_error = netagent_handle_update_inner(session, new_wrapper, data_size, &agent_changed , kNetagentErrorDomainUserDefined); + if (response_error != 0) { FREE(new_wrapper, M_NETAGENT); - response_error = NETAGENT_MESSAGE_ERROR_CANNOT_UPDATE; - lck_rw_done(&netagent_lock); goto fail; } - new_wrapper->netagent.netagent_flags |= NETAGENT_FLAG_REGISTERED; - if ((new_wrapper->netagent.netagent_flags & NETAGENT_FLAG_ACTIVE) && - !(session->wrapper->netagent.netagent_flags & NETAGENT_FLAG_ACTIVE)) { - netagent_active_count++; - } else if (!(new_wrapper->netagent.netagent_flags & NETAGENT_FLAG_ACTIVE) && - (session->wrapper->netagent.netagent_flags & NETAGENT_FLAG_ACTIVE) && - netagent_active_count > 0) { - netagent_active_count--; - } - - LIST_REMOVE(session->wrapper, master_chain); - FREE(session->wrapper, M_NETAGENT); - session->wrapper = new_wrapper; - new_wrapper->control_unit = session->control_unit; - 
LIST_INSERT_HEAD(&master_netagent_list, new_wrapper, master_chain); + netagent_send_success_response(session, NETAGENT_MESSAGE_TYPE_UPDATE, message_id); + netagent_post_event(session->wrapper->netagent.netagent_uuid, KEV_NETAGENT_UPDATED, agent_changed); - lck_rw_done(&netagent_lock); + if (agent_changed == FALSE) { + // The session wrapper does not need the "new_wrapper" as nothing changed + FREE(new_wrapper, M_NETAGENT); + } - NETAGENTLOG0(LOG_DEBUG, "Updated agent"); - netagent_send_success_response(session, NETAGENT_MESSAGE_TYPE_UPDATE, message_id); - netagent_post_event(new_wrapper->netagent.netagent_uuid, KEV_NETAGENT_UPDATED); return; fail: netagent_send_error_response(session, NETAGENT_MESSAGE_TYPE_UPDATE, message_id, response_error); @@ -801,83 +1138,77 @@ netagent_handle_get(struct netagent_session *session, u_int32_t message_id, netagent_send_error_response(session, NETAGENT_MESSAGE_TYPE_GET, message_id, response_error); } -static void -netagent_handle_assert(struct netagent_session *session, u_int32_t message_id, - u_int32_t payload_length, mbuf_t packet, int offset) +static errno_t +netagent_handle_assign_nexus_setopt(struct netagent_session *session, u_int8_t *payload, + u_int32_t payload_length) { - int error; - struct netagent_assertion *new_assertion = NULL; - u_int32_t response_error = NETAGENT_MESSAGE_ERROR_INTERNAL; - uuid_t netagent_uuid; - uuid_clear(netagent_uuid); + errno_t response_error = 0; + struct netagent_assign_nexus_message *assign_nexus_netagent = (struct netagent_assign_nexus_message *)(void *)payload; + uuid_t client_id; + u_int8_t *assigned_results = NULL; if (session == NULL) { NETAGENTLOG0(LOG_ERR, "Failed to find session"); - response_error = NETAGENT_MESSAGE_ERROR_INTERNAL; - goto fail; + response_error = ENOENT; + goto done; } - if (payload_length < sizeof(uuid_t)) { - NETAGENTLOG(LOG_ERR, "Assert message size too small for uuid: (%d < %d)", - payload_length, sizeof(uuid_t)); - response_error = 
NETAGENT_MESSAGE_ERROR_INVALID_DATA; - goto fail; + if (payload == NULL) { + NETAGENTLOG0(LOG_ERR, "No payload received"); + response_error = EINVAL; + goto done; } - error = mbuf_copydata(packet, offset, sizeof(uuid_t), &netagent_uuid); - if (error) { - NETAGENTLOG(LOG_ERR, "Failed to read uuid: %d", error); - response_error = NETAGENT_MESSAGE_ERROR_INTERNAL; - goto fail; + if (session->wrapper == NULL) { + NETAGENTLOG0(LOG_ERR, "Session has no agent to get"); + response_error = ENOENT; + goto done; } - MALLOC(new_assertion, struct netagent_assertion *, sizeof(*new_assertion), M_NETAGENT, M_WAITOK); - if (new_assertion == NULL) { - NETAGENTLOG0(LOG_ERR, "Failed to allocate assertion"); - response_error = NETAGENT_MESSAGE_ERROR_INTERNAL; - goto fail; + if (payload_length < sizeof(uuid_t)) { + NETAGENTLOG0(LOG_ERR, "Assign message is too short"); + response_error = EINVAL; + goto done; } - uuid_copy(new_assertion->asserted_uuid, netagent_uuid); - - lck_rw_lock_shared(&netagent_lock); + memcpy(client_id, assign_nexus_netagent->assign_client_id, sizeof(client_id)); + size_t assigned_results_length = (payload_length - sizeof(client_id)); - struct netagent_wrapper *wrapper = netagent_find_agent_with_uuid(netagent_uuid); - if (wrapper == NULL) { - lck_rw_done(&netagent_lock); - response_error = NETAGENT_MESSAGE_ERROR_NOT_REGISTERED; - FREE(new_assertion, M_NETAGENT); - goto fail; + if (assigned_results_length > 0) { + MALLOC(assigned_results, u_int8_t *, assigned_results_length, M_NETAGENT, M_WAITOK); + if (assigned_results == NULL) { + NETAGENTLOG(LOG_ERR, "Failed to allocate assign message (%lu bytes)", assigned_results_length); + response_error = ENOMEM; + goto done; + } + memcpy(assigned_results, assign_nexus_netagent->assign_necp_results, assigned_results_length); } - error = netagent_send_trigger(wrapper, current_proc(), NETAGENT_TRIGGER_FLAG_USER, NETAGENT_MESSAGE_TYPE_TRIGGER_ASSERT); - if (error) { - lck_rw_done(&netagent_lock); - NETAGENTLOG(LOG_ERR, "Failed to 
trigger assert agent: %d", error); - response_error = NETAGENT_MESSAGE_ERROR_INTERNAL; - FREE(new_assertion, M_NETAGENT); - goto fail; + // Note that if the error is 0, NECP has taken over our malloc'ed buffer + response_error = necp_assign_client_result(session->wrapper->netagent.netagent_uuid, client_id, assigned_results, assigned_results_length); + if (response_error) { + // necp_assign_client_result returns POSIX errors + if (assigned_results) { + FREE(assigned_results, M_NETAGENT); + } + NETAGENTLOG(LOG_ERR, "Client assignment failed: %d", response_error); + goto done; } - LIST_INSERT_HEAD(&session->assertion_list, new_assertion, assertion_chain); - - lck_rw_done(&netagent_lock); - - NETAGENTLOG0(LOG_DEBUG, "Asserted agent"); - netagent_send_success_response(session, NETAGENT_MESSAGE_TYPE_ASSERT, message_id); - return; -fail: - netagent_send_error_response(session, NETAGENT_MESSAGE_TYPE_ASSERT, message_id, response_error); + NETAGENTLOG0(LOG_DEBUG, "Agent assigned nexus properties to client"); +done: + return response_error; } + static void -netagent_handle_unassert(struct netagent_session *session, u_int32_t message_id, - u_int32_t payload_length, mbuf_t packet, int offset) +netagent_handle_assign_nexus_message(struct netagent_session *session, u_int32_t message_id, + u_int32_t payload_length, mbuf_t packet, int offset) { - int error; + int error = 0; u_int32_t response_error = NETAGENT_MESSAGE_ERROR_INTERNAL; - uuid_t netagent_uuid; - uuid_clear(netagent_uuid); + uuid_t client_id; + u_int8_t *assigned_results = NULL; if (session == NULL) { NETAGENTLOG0(LOG_ERR, "Failed to find session"); @@ -885,63 +1216,59 @@ netagent_handle_unassert(struct netagent_session *session, u_int32_t message_id, goto fail; } + if (session->wrapper == NULL) { + NETAGENTLOG0(LOG_ERR, "Session has no agent to get"); + response_error = NETAGENT_MESSAGE_ERROR_NOT_REGISTERED; + goto fail; + } + if (payload_length < sizeof(uuid_t)) { - NETAGENTLOG(LOG_ERR, "Unassert message size too 
small for uuid: (%d < %d)", - payload_length, sizeof(uuid_t)); + NETAGENTLOG0(LOG_ERR, "Assign message is too short"); response_error = NETAGENT_MESSAGE_ERROR_INVALID_DATA; goto fail; } - error = mbuf_copydata(packet, offset, sizeof(uuid_t), &netagent_uuid); + error = mbuf_copydata(packet, offset, sizeof(client_id), &client_id); if (error) { - NETAGENTLOG(LOG_ERR, "Failed to read uuid: %d", error); + NETAGENTLOG(LOG_ERR, "Failed to read uuid for assign message: %d", error); response_error = NETAGENT_MESSAGE_ERROR_INTERNAL; goto fail; } - struct netagent_assertion *found_assertion = NULL; - struct netagent_assertion *search_assertion = NULL; - LIST_FOREACH(search_assertion, &session->assertion_list, assertion_chain) { - if (uuid_compare(search_assertion->asserted_uuid, netagent_uuid) == 0) { - found_assertion = search_assertion; - break; + size_t assigned_results_length = (payload_length - sizeof(client_id)); + if (assigned_results_length > 0) { + MALLOC(assigned_results, u_int8_t *, assigned_results_length, M_NETAGENT, M_WAITOK); + if (assigned_results == NULL) { + NETAGENTLOG(LOG_ERR, "Failed to allocate assign message (%lu bytes)", assigned_results_length); + response_error = NETAGENT_MESSAGE_ERROR_INTERNAL; + goto fail; } - } - - if (found_assertion == NULL) { - NETAGENTLOG0(LOG_ERR, "Netagent uuid not previously asserted"); - response_error = NETAGENT_MESSAGE_ERROR_INVALID_DATA; - goto fail; - } - - LIST_REMOVE(found_assertion, assertion_chain); - FREE(found_assertion, M_NETAGENT); - found_assertion = NULL; - lck_rw_lock_shared(&netagent_lock); - - struct netagent_wrapper *wrapper = netagent_find_agent_with_uuid(netagent_uuid); - if (wrapper == NULL) { - lck_rw_done(&netagent_lock); - response_error = NETAGENT_MESSAGE_ERROR_NOT_REGISTERED; - goto fail; + error = mbuf_copydata(packet, offset + sizeof(client_id), assigned_results_length, assigned_results); + if (error) { + FREE(assigned_results, M_NETAGENT); + NETAGENTLOG(LOG_ERR, "Failed to read assign message: 
%d", error); + response_error = NETAGENT_MESSAGE_ERROR_INTERNAL; + goto fail; + } } - error = netagent_send_trigger(wrapper, current_proc(), NETAGENT_TRIGGER_FLAG_USER, NETAGENT_MESSAGE_TYPE_TRIGGER_UNASSERT); + // Note that if the error is 0, NECP has taken over our malloc'ed buffer + error = necp_assign_client_result(session->wrapper->netagent.netagent_uuid, client_id, assigned_results, assigned_results_length); if (error) { - lck_rw_done(&netagent_lock); - NETAGENTLOG(LOG_ERR, "Failed to trigger assert agent: %d", error); - response_error = NETAGENT_MESSAGE_ERROR_INTERNAL; + if (assigned_results) { + FREE(assigned_results, M_NETAGENT); + } + NETAGENTLOG(LOG_ERR, "Client assignment failed: %d", error); + response_error = NETAGENT_MESSAGE_ERROR_CANNOT_ASSIGN; goto fail; } - lck_rw_done(&netagent_lock); - - NETAGENTLOG0(LOG_DEBUG, "Unasserted agent"); - netagent_send_success_response(session, NETAGENT_MESSAGE_TYPE_UNASSERT, message_id); + NETAGENTLOG0(LOG_DEBUG, "Agent assigned nexus properties to client"); + netagent_send_success_response(session, NETAGENT_MESSAGE_TYPE_ASSIGN_NEXUS, message_id); return; fail: - netagent_send_error_response(session, NETAGENT_MESSAGE_TYPE_UNASSERT, message_id, response_error); + netagent_send_error_response(session, NETAGENT_MESSAGE_TYPE_ASSIGN_NEXUS, message_id, response_error); } static struct netagent_wrapper * @@ -967,7 +1294,7 @@ netagent_post_updated_interfaces(uuid_t uuid) lck_rw_done(&netagent_lock); if (wrapper != NULL) { - netagent_post_event(uuid, KEV_NETAGENT_UPDATED_INTERFACES); + netagent_post_event(uuid, KEV_NETAGENT_UPDATED_INTERFACES, TRUE); } else { NETAGENTLOG0(LOG_DEBUG, "Interface event with no associated agent"); } @@ -975,11 +1302,53 @@ netagent_post_updated_interfaces(uuid_t uuid) return; } +static u_int32_t +netagent_dump_get_data_size_locked() +{ + struct netagent_wrapper *search_netagent = NULL; + u_int32_t total_netagent_data_size = 0; + // Traverse the master list to know how much data the client needs 
to allocate to get the list of agent UUIDs + LIST_FOREACH(search_netagent, &master_netagent_list, master_chain) { + total_netagent_data_size += sizeof(search_netagent->netagent.netagent_uuid); + } + return total_netagent_data_size; +} + +static void +netagent_dump_copy_data_locked(u_int8_t *buffer, u_int32_t buffer_length) +{ + size_t response_size = 0; + u_int8_t *cursor = NULL; + struct netagent_wrapper *search_netagent = NULL; + + response_size = buffer_length; // We already know that buffer_length is the same as total_netagent_data_size. + cursor = buffer; + LIST_FOREACH(search_netagent, &master_netagent_list, master_chain) { + memcpy(cursor, search_netagent->netagent.netagent_uuid, sizeof(search_netagent->netagent.netagent_uuid)); + cursor += sizeof(search_netagent->netagent.netagent_uuid); + } +} + int netagent_ioctl(u_long cmd, caddr_t data) { int error = 0; + switch (cmd) { + case SIOCGIFAGENTLIST32: + case SIOCGIFAGENTLIST64: { + /* Check entitlement if the client requests agent dump */ + errno_t cred_result = priv_check_cred(kauth_cred_get(), PRIV_NET_PRIVILEGED_NECP_POLICIES, 0); + if (cred_result != 0) { + NETAGENTLOG0(LOG_ERR, "Client does not hold the necessary entitlement to get netagent information"); + return EINVAL; + } + break; + } + default: + break; + } + lck_rw_lock_shared(&netagent_lock); switch (cmd) { case SIOCGIFAGENTDATA32: { @@ -1030,6 +1399,54 @@ netagent_ioctl(u_long cmd, caddr_t data) } break; } + case SIOCGIFAGENTLIST32: { + struct netagentlist_req32 *ifsir32 = (struct netagentlist_req32 *)(void *)data; + if (ifsir32->data_size == 0) { + // First pass, client wants data size + ifsir32->data_size = netagent_dump_get_data_size_locked(); + } else if (ifsir32->data != USER_ADDR_NULL && + ifsir32->data_size > 0 && + ifsir32->data_size == netagent_dump_get_data_size_locked()) { + // Second pass, client wants data buffer filled out + u_int8_t *response = NULL; + MALLOC(response, u_int8_t *, ifsir32->data_size, M_NETAGENT, M_NOWAIT | 
M_ZERO); + if (response == NULL) { + error = ENOMEM; + break; + } + + netagent_dump_copy_data_locked(response, ifsir32->data_size); + error = copyout(response, ifsir32->data, ifsir32->data_size); + FREE(response, M_NETAGENT); + } else { + error = EINVAL; + } + break; + } + case SIOCGIFAGENTLIST64: { + struct netagentlist_req64 *ifsir64 = (struct netagentlist_req64 *)(void *)data; + if (ifsir64->data_size == 0) { + // First pass, client wants data size + ifsir64->data_size = netagent_dump_get_data_size_locked(); + } else if (ifsir64->data != USER_ADDR_NULL && + ifsir64->data_size > 0 && + ifsir64->data_size == netagent_dump_get_data_size_locked()) { + // Second pass, client wants data buffer filled out + u_int8_t *response = NULL; + MALLOC(response, u_int8_t *, ifsir64->data_size, M_NETAGENT, M_NOWAIT | M_ZERO); + if (response == NULL) { + error = ENOMEM; + break; + } + + netagent_dump_copy_data_locked(response, ifsir64->data_size); + error = copyout(response, ifsir64->data, ifsir64->data_size); + FREE(response, M_NETAGENT); + } else { + error = EINVAL; + } + break; + } default: { error = EINVAL; break; @@ -1055,6 +1472,45 @@ netagent_get_flags(uuid_t uuid) return (flags); } +u_int32_t +netagent_get_generation(uuid_t uuid) +{ + u_int32_t generation = 0; + lck_rw_lock_shared(&netagent_lock); + struct netagent_wrapper *wrapper = netagent_find_agent_with_uuid(uuid); + if (wrapper != NULL) { + generation = wrapper->generation; + } else { + NETAGENTLOG0(LOG_DEBUG, "Generation requested for invalid netagent"); + } + lck_rw_done(&netagent_lock); + + return (generation); +} + +bool +netagent_get_agent_domain_and_type(uuid_t uuid, char *domain, char *type) +{ + bool found = FALSE; + if (domain == NULL || type == NULL) { + NETAGENTLOG(LOG_ERR, "Invalid arguments for netagent_get_agent_domain_and_type %p %p", domain, type); + return (FALSE); + } + + lck_rw_lock_shared(&netagent_lock); + struct netagent_wrapper *wrapper = netagent_find_agent_with_uuid(uuid); + if (wrapper != 
NULL) { + found = TRUE; + memcpy(domain, wrapper->netagent.netagent_domain, NETAGENT_DOMAINSIZE); + memcpy(type, wrapper->netagent.netagent_type, NETAGENT_TYPESIZE); + } else { + NETAGENTLOG0(LOG_DEBUG, "Type requested for invalid netagent"); + } + lck_rw_done(&netagent_lock); + + return (found); +} + int netagent_kernel_trigger(uuid_t uuid) { @@ -1089,6 +1545,121 @@ netagent_kernel_trigger(uuid_t uuid) return (error); } +int +netagent_client_message(uuid_t agent_uuid, uuid_t necp_client_uuid, u_int8_t message_type) +{ + int error = 0; + + if (message_type != NETAGENT_MESSAGE_TYPE_CLIENT_TRIGGER && + message_type != NETAGENT_MESSAGE_TYPE_CLIENT_ASSERT && + message_type != NETAGENT_MESSAGE_TYPE_CLIENT_UNASSERT && + message_type != NETAGENT_MESSAGE_TYPE_REQUEST_NEXUS && + message_type != NETAGENT_MESSAGE_TYPE_CLOSE_NEXUS) { + NETAGENTLOG(LOG_ERR, "Client netagent message type (%d) is invalid", message_type); + return(EINVAL); + } + + lck_rw_lock_shared(&netagent_lock); + bool should_unlock = TRUE; + struct netagent_wrapper *wrapper = netagent_find_agent_with_uuid(agent_uuid); + if (wrapper == NULL) { + NETAGENTLOG0(LOG_ERR, "Requested netagent for nexus instance could not be found"); + error = ENOENT; + goto done; + } + + if (message_type == NETAGENT_MESSAGE_TYPE_CLIENT_TRIGGER) { + if ((wrapper->netagent.netagent_flags & NETAGENT_FLAG_USER_ACTIVATED) == 0) { + // Agent does not accept user triggers + // Don't log, since this is a common case used to trigger events that cellular data is blocked, etc. 
+ error = ENOTSUP; + + struct proc *p = current_proc(); + pid_t current_pid = 0; + uuid_t current_proc_uuid; + uuid_clear(current_proc_uuid); + if (p != NULL) { + current_pid = proc_pid(p); + proc_getexecutableuuid(p, current_proc_uuid, sizeof(current_proc_uuid)); + } + netagent_send_cellular_failed_event(wrapper, current_pid, current_proc_uuid); + goto done; + } + } else if (message_type == NETAGENT_MESSAGE_TYPE_REQUEST_NEXUS || + message_type == NETAGENT_MESSAGE_TYPE_CLOSE_NEXUS) { + if ((wrapper->netagent.netagent_flags & NETAGENT_FLAG_NEXUS_PROVIDER) == 0) { + NETAGENTLOG0(LOG_ERR, "Requested netagent for nexus instance is not a nexus provider"); + // Agent is not a nexus provider + error = EINVAL; + goto done; + } + + if ((wrapper->netagent.netagent_flags & NETAGENT_FLAG_ACTIVE) == 0) { + // Agent not active + NETAGENTLOG0(LOG_INFO, "Requested netagent for nexus instance is not active"); + error = EINVAL; + goto done; + } + } + + error = netagent_send_client_message(wrapper, necp_client_uuid, message_type); + if (error == 0 && message_type == NETAGENT_MESSAGE_TYPE_CLIENT_TRIGGER) { + if (lck_rw_lock_shared_to_exclusive(&netagent_lock)) { + // Grab the lock exclusively to add a pending client to the list + struct netagent_client *new_pending_client = NULL; + MALLOC(new_pending_client, struct netagent_client *, sizeof(*new_pending_client), M_NETAGENT, M_WAITOK); + if (new_pending_client == NULL) { + NETAGENTLOG0(LOG_ERR, "Failed to allocate client for trigger"); + } else { + uuid_copy(new_pending_client->client_id, necp_client_uuid); + struct proc *p = current_proc(); + if (p != NULL) { + new_pending_client->client_pid = proc_pid(p); + proc_getexecutableuuid(p, new_pending_client->client_proc_uuid, sizeof(new_pending_client->client_proc_uuid)); + } + LIST_INSERT_HEAD(&wrapper->pending_triggers_list, new_pending_client, client_chain); + } + } else { + // If lck_rw_lock_shared_to_exclusive fails, it unlocks automatically + should_unlock = FALSE; + } + } + 
NETAGENTLOG((error ? LOG_ERR : LOG_INFO), "Send message %d for client (error %d)", message_type, error); +done: + if (should_unlock) { + lck_rw_done(&netagent_lock); + } + return (error); +} + +int +netagent_copyout(uuid_t agent_uuid, user_addr_t user_addr, u_int32_t user_size) +{ + int error = 0; + + lck_rw_lock_shared(&netagent_lock); + struct netagent_wrapper *wrapper = netagent_find_agent_with_uuid(agent_uuid); + if (wrapper == NULL) { + NETAGENTLOG0(LOG_ERR, "Requested netagent for nexus instance could not be found"); + error = ENOENT; + goto done; + } + + u_int32_t total_size = (sizeof(struct netagent) + wrapper->netagent.netagent_data_size); + if (user_size < total_size) { + NETAGENTLOG(LOG_ERR, "Provided user buffer is too small (%u < %u)", user_size, total_size); + error = EINVAL; + goto done; + } + + error = copyout(&wrapper->netagent, user_addr, total_size); + + NETAGENTLOG((error ? LOG_ERR : LOG_DEBUG), "Copied agent content (error %d)", error); +done: + lck_rw_done(&netagent_lock); + return (error); +} + int netagent_trigger(struct proc *p, struct netagent_trigger_args *uap, int32_t *retval) { @@ -1131,7 +1702,7 @@ netagent_trigger(struct proc *p, struct netagent_trigger_args *uap, int32_t *ret if ((wrapper->netagent.netagent_flags & NETAGENT_FLAG_USER_ACTIVATED) == 0) { // Agent does not accept triggers NETAGENTLOG0(LOG_ERR, "Requested netagent UUID is not eligible for triggering"); - error = EINVAL; + error = ENOTSUP; goto done; } diff --git a/bsd/net/network_agent.h b/bsd/net/network_agent.h index 6fe55b96c..ce4bdf001 100644 --- a/bsd/net/network_agent.h +++ b/bsd/net/network_agent.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014, 2015 Apple Inc. All rights reserved. + * Copyright (c) 2014-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -28,6 +28,9 @@ #ifndef _NETAGENT_H_ #define _NETAGENT_H_ +#include + +#ifdef PRIVATE #include #include @@ -37,7 +40,6 @@ errno_t netagent_init(void); #endif -#ifdef PRIVATE /* * Name registered by the Network Agent kernel control */ @@ -57,15 +59,35 @@ struct netagent_trigger_message { uuid_t trigger_proc_uuid; }; +struct netagent_client_message { + uuid_t client_id; +}; + +struct netagent_assign_nexus_message { + uuid_t assign_client_id; + u_int8_t assign_necp_results[0]; +}; + #define NETAGENT_MESSAGE_TYPE_REGISTER 1 // Pass netagent to set, no return value #define NETAGENT_MESSAGE_TYPE_UNREGISTER 2 // No value, no return value #define NETAGENT_MESSAGE_TYPE_UPDATE 3 // Pass netagent to update, no return value -#define NETAGENT_MESSAGE_TYPE_GET 4 // No value, return netagent +#define NETAGENT_MESSAGE_TYPE_GET 4 // No value, return netagent #define NETAGENT_MESSAGE_TYPE_TRIGGER 5 // Kernel initiated, no reply expected -#define NETAGENT_MESSAGE_TYPE_ASSERT 6 // Pass uuid of netagent to assert -#define NETAGENT_MESSAGE_TYPE_UNASSERT 7 // Pass uuid of netagent to unassert +#define NETAGENT_MESSAGE_TYPE_ASSERT 6 // Deprecated +#define NETAGENT_MESSAGE_TYPE_UNASSERT 7 // Deprecated #define NETAGENT_MESSAGE_TYPE_TRIGGER_ASSERT 8 // Kernel initiated, no reply expected #define NETAGENT_MESSAGE_TYPE_TRIGGER_UNASSERT 9 // Kernel initiated, no reply expected +#define NETAGENT_MESSAGE_TYPE_REQUEST_NEXUS 10 // Kernel initiated, struct netagent_client_message +#define NETAGENT_MESSAGE_TYPE_ASSIGN_NEXUS 11 // Pass struct netagent_assign_nexus_message +#define NETAGENT_MESSAGE_TYPE_CLOSE_NEXUS 12 // Kernel initiated, struct netagent_client_message +#define NETAGENT_MESSAGE_TYPE_CLIENT_TRIGGER 13 // Kernel initiated, struct netagent_client_message +#define NETAGENT_MESSAGE_TYPE_CLIENT_ASSERT 14 // Kernel initiated, struct netagent_client_message +#define NETAGENT_MESSAGE_TYPE_CLIENT_UNASSERT 15 // Kernel initiated, struct 
netagent_client_message + +#define NETAGENT_OPTION_TYPE_REGISTER NETAGENT_MESSAGE_TYPE_REGISTER // Pass netagent to set, no return value +#define NETAGENT_OPTION_TYPE_UNREGISTER NETAGENT_MESSAGE_TYPE_UNREGISTER // No value, no return value +#define NETAGENT_OPTION_TYPE_UPDATE NETAGENT_MESSAGE_TYPE_UPDATE // Pass netagent to update, no return value +#define NETAGENT_OPTION_TYPE_ASSIGN_NEXUS NETAGENT_MESSAGE_TYPE_ASSIGN_NEXUS // Pass struct netagent_assign_nexus_message #define NETAGENT_MESSAGE_FLAGS_RESPONSE 0x01 // Used for acks, errors, and query responses @@ -76,12 +98,13 @@ struct netagent_trigger_message { #define NETAGENT_MESSAGE_ERROR_NOT_REGISTERED 4 #define NETAGENT_MESSAGE_ERROR_ALREADY_REGISTERED 5 #define NETAGENT_MESSAGE_ERROR_CANNOT_UPDATE 6 +#define NETAGENT_MESSAGE_ERROR_CANNOT_ASSIGN 7 #define NETAGENT_DOMAINSIZE 32 #define NETAGENT_TYPESIZE 32 #define NETAGENT_DESCSIZE 128 -#define NETAGENT_MAX_DATA_SIZE 1024 +#define NETAGENT_MAX_DATA_SIZE 4096 #define NETAGENT_FLAG_REGISTERED 0x0001 // Agent is registered #define NETAGENT_FLAG_ACTIVE 0x0002 // Agent is active @@ -89,16 +112,28 @@ struct netagent_trigger_message { #define NETAGENT_FLAG_USER_ACTIVATED 0x0008 // Agent can be activated by system call (netagent_trigger) #define NETAGENT_FLAG_VOLUNTARY 0x0010 // Use of agent is optional #define NETAGENT_FLAG_SPECIFIC_USE_ONLY 0x0020 // Agent should only be used and activated when specifically required +#define NETAGENT_FLAG_NETWORK_PROVIDER 0x0040 // Agent provides network access +#define NETAGENT_FLAG_NEXUS_PROVIDER 0x0080 // Agent provides a skywalk nexus + +#define NETAGENT_NEXUS_MAX_REQUEST_TYPES 16 +#define NETAGENT_NEXUS_MAX_RESOLUTION_TYPE_PAIRS 16 + +#define NETAGENT_NEXUS_FRAME_TYPE_UNKNOWN 0 +#define NETAGENT_NEXUS_FRAME_TYPE_LINK 1 +#define NETAGENT_NEXUS_FRAME_TYPE_INTERNET 2 +#define NETAGENT_NEXUS_FRAME_TYPE_TRANSPORT 3 +#define NETAGENT_NEXUS_FRAME_TYPE_APPLICATION 4 + +struct netagent_nexus { + u_int32_t frame_type; + u_int32_t 
endpoint_assignment_type; + u_int32_t endpoint_request_types[NETAGENT_NEXUS_MAX_REQUEST_TYPES]; + u_int32_t endpoint_resolution_type_pairs[NETAGENT_NEXUS_MAX_RESOLUTION_TYPE_PAIRS * 2]; +}; #define NETAGENT_TRIGGER_FLAG_USER 0x0001 // Userspace triggered agent #define NETAGENT_TRIGGER_FLAG_KERNEL 0x0002 // Kernel triggered agent -#define KEV_NETAGENT_SUBCLASS 9 -#define KEV_NETAGENT_REGISTERED 1 -#define KEV_NETAGENT_UNREGISTERED 2 -#define KEV_NETAGENT_UPDATED 3 -#define KEV_NETAGENT_UPDATED_INTERFACES 4 - struct kev_netagent_data { uuid_t netagent_uuid; }; @@ -124,6 +159,12 @@ struct netagent_req { u_int32_t netagent_data_size; u_int8_t *netagent_data; }; + +// To be used with SIOCGAGENTLIST +struct netagentlist_req { + u_int32_t data_size; + u_int8_t *data; +}; #ifdef BSD_KERNEL_PRIVATE int netagent_ioctl(u_long cmd, caddr_t data); @@ -145,19 +186,35 @@ struct netagent_req64 { u_int32_t netagent_data_size; user64_addr_t netagent_data __attribute__((aligned(8))); }; +struct netagentlist_req32 { + u_int32_t data_size; + user32_addr_t data; +}; +struct netagentlist_req64 { + u_int32_t data_size; + user64_addr_t data __attribute__((aligned(8))); +}; // Kernel accessors -void netagent_post_updated_interfaces(uuid_t uuid); // To be called from interface ioctls +extern void netagent_post_updated_interfaces(uuid_t uuid); // To be called from interface ioctls -u_int32_t netagent_get_flags(uuid_t uuid); +extern u_int32_t netagent_get_flags(uuid_t uuid); -int netagent_kernel_trigger(uuid_t uuid); -#endif /* BSD_KERNEL_PRIVATE */ +extern u_int32_t netagent_get_generation(uuid_t uuid); -#endif /* PRIVATE */ +extern bool netagent_get_agent_domain_and_type(uuid_t uuid, char *domain, char *type); + +extern int netagent_kernel_trigger(uuid_t uuid); + +extern int netagent_client_message(uuid_t agent_uuid, uuid_t necp_client_uuid, u_int8_t message_type); + +extern int netagent_copyout(uuid_t uuid, user_addr_t user_addr, u_int32_t user_size); +#endif /* BSD_KERNEL_PRIVATE */ 
#ifndef KERNEL -int netagent_trigger(uuid_t agent_uuid, size_t agent_uuidlen); +extern int netagent_trigger(uuid_t agent_uuid, size_t agent_uuidlen); #endif /* !KERNEL */ +#endif /* PRIVATE */ + #endif /* _NETAGENT_H_ */ diff --git a/bsd/net/ntstat.c b/bsd/net/ntstat.c index f742a560a..35a02c0b3 100644 --- a/bsd/net/ntstat.c +++ b/bsd/net/ntstat.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2010-2015 Apple Inc. All rights reserved. + * Copyright (c) 2010-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. 
- * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ @@ -101,21 +101,30 @@ enum #define QUERY_CONTINUATION_SRC_COUNT 100 +typedef struct nstat_provider_filter +{ + u_int64_t npf_flags; + u_int64_t npf_events; + pid_t npf_pid; + uuid_t npf_uuid; +} nstat_provider_filter; + + typedef struct nstat_control_state { struct nstat_control_state *ncs_next; - u_int32_t ncs_watching; + u_int32_t ncs_watching; decl_lck_mtx_data(, mtx); kern_ctl_ref ncs_kctl; - u_int32_t ncs_unit; + u_int32_t ncs_unit; nstat_src_ref_t ncs_next_srcref; struct nstat_src *ncs_srcs; - mbuf_t ncs_accumulated; - u_int32_t ncs_flags; - u_int64_t ncs_provider_filters[NSTAT_PROVIDER_COUNT]; + mbuf_t ncs_accumulated; + u_int32_t ncs_flags; + nstat_provider_filter ncs_provider_filters[NSTAT_PROVIDER_COUNT]; /* state maintained for partial query requests */ - u_int64_t ncs_context; - u_int64_t ncs_seq; + u_int64_t ncs_context; + u_int64_t ncs_seq; } nstat_control_state; typedef struct nstat_provider @@ -130,9 +139,14 @@ typedef struct nstat_provider void (*nstat_watcher_remove)(nstat_control_state *state); errno_t (*nstat_copy_descriptor)(nstat_provider_cookie_t cookie, void *data, u_int32_t len); void (*nstat_release)(nstat_provider_cookie_t cookie, boolean_t locked); - bool (*nstat_reporting_allowed)(nstat_provider_cookie_t cookie, uint64_t filter); + bool (*nstat_reporting_allowed)(nstat_provider_cookie_t cookie, nstat_provider_filter *filter); } nstat_provider; +typedef STAILQ_HEAD(, nstat_src) stailq_head_nstat_src; +typedef STAILQ_ENTRY(nstat_src) stailq_entry_nstat_src; + +typedef TAILQ_HEAD(, nstat_tu_shadow) tailq_head_tu_shadow; +typedef TAILQ_ENTRY(nstat_tu_shadow) tailq_entry_tu_shadow; typedef struct nstat_src { @@ -157,7 +171,9 @@ static u_int16_t nstat_control_end_query(nstat_control_state *state, nstat_src * static void nstat_ifnet_report_ecn_stats(void); static u_int32_t nstat_udp_watchers = 0; +static u_int32_t nstat_userland_udp_watchers = 0; static u_int32_t nstat_tcp_watchers = 0; 
+static u_int32_t nstat_userland_tcp_watchers = 0; static void nstat_control_register(void); @@ -184,7 +200,7 @@ nstat_copy_sa_out( int maxlen) { if (src->sa_len > maxlen) return; - + bcopy(src, dst, src->sa_len); if (src->sa_family == AF_INET6 && src->sa_len >= sizeof(struct sockaddr_in6)) @@ -208,7 +224,7 @@ nstat_ip_to_sockaddr( { if (maxlen < sizeof(struct sockaddr_in)) return; - + sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_port = port; @@ -224,7 +240,7 @@ nstat_ip6_to_sockaddr( { if (maxlen < sizeof(struct sockaddr_in6)) return; - + sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(*sin6); sin6->sin6_port = port; @@ -236,6 +252,47 @@ nstat_ip6_to_sockaddr( } } +static u_int16_t +nstat_ifnet_to_flags( + struct ifnet *ifp) +{ + u_int16_t flags = 0; + u_int32_t functional_type = if_functional_type(ifp, FALSE); + + /* Panic if someone adds a functional type without updating ntstat. */ + VERIFY(0 <= functional_type && functional_type <= IFRTYPE_FUNCTIONAL_LAST); + + switch (functional_type) + { + case IFRTYPE_FUNCTIONAL_UNKNOWN: + flags |= NSTAT_IFNET_IS_UNKNOWN_TYPE; + break; + case IFRTYPE_FUNCTIONAL_LOOPBACK: + flags |= NSTAT_IFNET_IS_LOOPBACK; + break; + case IFRTYPE_FUNCTIONAL_WIRED: + flags |= NSTAT_IFNET_IS_WIRED; + break; + case IFRTYPE_FUNCTIONAL_WIFI_INFRA: + flags |= NSTAT_IFNET_IS_WIFI; + break; + case IFRTYPE_FUNCTIONAL_WIFI_AWDL: + flags |= NSTAT_IFNET_IS_WIFI; + flags |= NSTAT_IFNET_IS_AWDL; + break; + case IFRTYPE_FUNCTIONAL_CELLULAR: + flags |= NSTAT_IFNET_IS_CELLULAR; + break; + } + + if (IFNET_IS_EXPENSIVE(ifp)) + { + flags |= NSTAT_IFNET_IS_EXPENSIVE; + } + + return flags; +} + static u_int16_t nstat_inpcb_to_flags( const struct inpcb *inp) @@ -245,41 +302,13 @@ nstat_inpcb_to_flags( if ((inp != NULL ) && (inp->inp_last_outifp != NULL)) { struct ifnet *ifp = inp->inp_last_outifp; + flags = nstat_ifnet_to_flags(ifp); - u_int32_t functional_type = if_functional_type(ifp); - - /* Panic if someone adds a functional type 
without updating ntstat. */ - VERIFY(0 <= functional_type && functional_type <= IFRTYPE_FUNCTIONAL_LAST); - - switch (functional_type) + if (flags & NSTAT_IFNET_IS_CELLULAR) { - case IFRTYPE_FUNCTIONAL_UNKNOWN: - flags |= NSTAT_IFNET_IS_UNKNOWN_TYPE; - break; - case IFRTYPE_FUNCTIONAL_LOOPBACK: - flags |= NSTAT_IFNET_IS_LOOPBACK; - break; - case IFRTYPE_FUNCTIONAL_WIRED: - flags |= NSTAT_IFNET_IS_WIRED; - break; - case IFRTYPE_FUNCTIONAL_WIFI_INFRA: - flags |= NSTAT_IFNET_IS_WIFI; - break; - case IFRTYPE_FUNCTIONAL_WIFI_AWDL: - flags |= NSTAT_IFNET_IS_WIFI; - flags |= NSTAT_IFNET_IS_AWDL; - break; - case IFRTYPE_FUNCTIONAL_CELLULAR: - flags |= NSTAT_IFNET_IS_CELLULAR; if (inp->inp_socket != NULL && (inp->inp_socket->so_flags1 & SOF1_CELLFALLBACK)) flags |= NSTAT_IFNET_VIA_CELLFALLBACK; - break; - } - - if (IFNET_IS_EXPENSIVE(ifp)) - { - flags |= NSTAT_IFNET_IS_EXPENSIVE; } } else @@ -300,13 +329,13 @@ nstat_find_provider_by_id( nstat_provider_id_t id) { struct nstat_provider *provider; - + for (provider = nstat_providers; provider != NULL; provider = provider->next) { if (provider->nstat_provider_id == id) break; } - + return provider; } @@ -323,20 +352,22 @@ nstat_lookup_entry( { return ENOENT; } - + return (*out_provider)->nstat_lookup(data, length, out_cookie); } static void nstat_init_route_provider(void); static void nstat_init_tcp_provider(void); +static void nstat_init_userland_tcp_provider(void); static void nstat_init_udp_provider(void); +static void nstat_init_userland_udp_provider(void); static void nstat_init_ifnet_provider(void); __private_extern__ void nstat_init(void) { if (nstat_malloc_tag != NULL) return; - + OSMallocTag tag = OSMalloc_Tagalloc(NET_STAT_CONTROL_NAME, OSMT_DEFAULT); if (!OSCompareAndSwapPtr(NULL, tag, &nstat_malloc_tag)) { @@ -348,7 +379,9 @@ nstat_init(void) // we need to initialize other things, we do it here as this code path will only be hit once; nstat_init_route_provider(); nstat_init_tcp_provider(); + 
nstat_init_userland_tcp_provider(); nstat_init_udp_provider(); + nstat_init_userland_udp_provider(); nstat_init_ifnet_provider(); nstat_control_register(); } @@ -370,17 +403,17 @@ nstat_malloc_aligned( { struct align_header *hdr = NULL; u_int32_t size = length + sizeof(*hdr) + alignment - 1; - + u_int8_t *buffer = OSMalloc(size, tag); if (buffer == NULL) return NULL; - + u_int8_t *aligned = buffer + sizeof(*hdr); aligned = (u_int8_t*)P2ROUNDUP(aligned, alignment); - + hdr = (struct align_header*)(void *)(aligned - sizeof(*hdr)); hdr->offset = aligned - buffer; hdr->length = size; - + return aligned; } @@ -410,22 +443,22 @@ nstat_route_lookup( struct sockaddr *sa; const struct sockaddr *const_sa; } dst, mask; - + const nstat_route_add_param *param = (const nstat_route_add_param*)data; *out_cookie = NULL; - + if (length < sizeof(*param)) { return EINVAL; } - + if (param->dst.v4.sin_family == 0 || param->dst.v4.sin_family > AF_MAX || (param->mask.v4.sin_family != 0 && param->mask.v4.sin_family != param->dst.v4.sin_family)) { return EINVAL; } - + if (param->dst.v4.sin_len > sizeof(param->dst) || (param->mask.v4.sin_family && param->mask.v4.sin_len > sizeof(param->mask.v4.sin_len))) { @@ -438,19 +471,19 @@ nstat_route_lookup( { return EINVAL; } - + dst.const_sa = (const struct sockaddr*)&param->dst; mask.const_sa = param->mask.v4.sin_family ? (const struct sockaddr*)&param->mask : NULL; - + struct radix_node_head *rnh = rt_tables[dst.sa->sa_family]; if (rnh == NULL) return EAFNOSUPPORT; - + lck_mtx_lock(rnh_lock); struct rtentry *rt = rt_lookup(TRUE, dst.sa, mask.sa, rnh, param->ifindex); lck_mtx_unlock(rnh_lock); - + if (rt) *out_cookie = (nstat_provider_cookie_t)rt; - + return rt ?
0 : ENOENT; } @@ -470,11 +503,11 @@ nstat_route_counts( { struct rtentry *rt = (struct rtentry*)cookie; struct nstat_counts *rt_stats = rt->rt_stats; - + if (out_gone) *out_gone = 0; - + if (out_gone && (rt->rt_flags & RTF_UP) == 0) *out_gone = 1; - + if (rt_stats) { atomic_get_64(out_counts->nstat_rxpackets, &rt_stats->nstat_rxpackets); @@ -495,7 +528,7 @@ nstat_route_counts( { bzero(out_counts, sizeof(*out_counts)); } - + return 0; } @@ -541,7 +574,7 @@ nstat_route_walktree_add( if (result != 0) rtfree_locked(rt); } - + return result; } @@ -552,14 +585,14 @@ nstat_route_add_watcher( int i; errno_t result = 0; OSIncrementAtomic(&nstat_route_watchers); - + lck_mtx_lock(rnh_lock); for (i = 1; i < AF_MAX; i++) { struct radix_node_head *rnh; rnh = rt_tables[i]; if (!rnh) continue; - + result = rnh->rnh_walktree(rnh, nstat_route_walktree_add, state); if (result != 0) { @@ -567,7 +600,7 @@ nstat_route_add_watcher( } } lck_mtx_unlock(rnh_lock); - + return result; } @@ -577,7 +610,7 @@ nstat_route_new_entry( { if (nstat_route_watchers == 0) return; - + lck_mtx_lock(&nstat_mtx); if ((rt->rt_flags & RTF_UP) != 0) { @@ -589,7 +622,7 @@ nstat_route_new_entry( // this client is watching routes // acquire a reference for the route RT_ADDREF(rt); - + // add the source, if that fails, release the reference if (nstat_control_source_add(0, state, &nstat_route_provider, rt) != 0) RT_REMREF(rt); @@ -618,34 +651,57 @@ nstat_route_copy_descriptor( return EINVAL; } bzero(desc, sizeof(*desc)); - + struct rtentry *rt = (struct rtentry*)cookie; desc->id = (uint64_t)VM_KERNEL_ADDRPERM(rt); desc->parent_id = (uint64_t)VM_KERNEL_ADDRPERM(rt->rt_parent); desc->gateway_id = (uint64_t)VM_KERNEL_ADDRPERM(rt->rt_gwroute); - + // key/dest struct sockaddr *sa; if ((sa = rt_key(rt))) nstat_copy_sa_out(sa, &desc->dst.sa, sizeof(desc->dst)); - + // mask if ((sa = rt_mask(rt)) && sa->sa_len <= sizeof(desc->mask)) memcpy(&desc->mask, sa, sa->sa_len); - + // gateway if ((sa = rt->rt_gateway)) 
nstat_copy_sa_out(sa, &desc->gateway.sa, sizeof(desc->gateway)); - + if (rt->rt_ifp) desc->ifindex = rt->rt_ifp->if_index; - + desc->flags = rt->rt_flags; - + return 0; } +static bool +nstat_route_reporting_allowed(nstat_provider_cookie_t cookie, nstat_provider_filter *filter) +{ + bool retval = true; + + if ((filter->npf_flags & NSTAT_FILTER_IFNET_FLAGS) != 0) + { + struct rtentry *rt = (struct rtentry*)cookie; + struct ifnet *ifp = rt->rt_ifp; + + if (ifp) + { + uint16_t interface_properties = nstat_ifnet_to_flags(ifp); + + if ((filter->npf_flags & interface_properties) == 0) + { + retval = false; + } + } + } + return retval; +} + static void nstat_init_route_provider(void) { @@ -659,6 +715,7 @@ nstat_init_route_provider(void) nstat_route_provider.nstat_watcher_add = nstat_route_add_watcher; nstat_route_provider.nstat_watcher_remove = nstat_route_remove_watcher; nstat_route_provider.nstat_copy_descriptor = nstat_route_copy_descriptor; + nstat_route_provider.nstat_reporting_allowed = nstat_route_reporting_allowed; nstat_route_provider.next = nstat_providers; nstat_providers = &nstat_route_provider; } @@ -671,20 +728,20 @@ nstat_route_attach( { struct nstat_counts *result = rte->rt_stats; if (result) return result; - + if (nstat_malloc_tag == NULL) nstat_init(); - + result = nstat_malloc_aligned(sizeof(*result), sizeof(u_int64_t), nstat_malloc_tag); if (!result) return result; - + bzero(result, sizeof(*result)); - + if (!OSCompareAndSwapPtr(NULL, result, &rte->rt_stats)) { nstat_free_aligned(result, nstat_malloc_tag); result = rte->rt_stats; } - + return result; } @@ -710,7 +767,7 @@ nstat_route_connect_attempt( { OSIncrementAtomic(&stats->nstat_connectattempts); } - + rte = rte->rt_parent; } } @@ -727,7 +784,7 @@ nstat_route_connect_success( { OSIncrementAtomic(&stats->nstat_connectsuccesses); } - + rte = rte->rt_parent; } } @@ -754,7 +811,7 @@ nstat_route_tx( OSAddAtomic64((SInt64)bytes, (SInt64*)&stats->nstat_txbytes); } } - + rte = rte->rt_parent; } } @@ -784,7 
+841,7 @@ nstat_route_rx( OSAddAtomic(bytes, &stats->nstat_rxduplicatebytes); } } - + rte = rte->rt_parent; } } @@ -796,7 +853,7 @@ nstat_route_rtt( u_int32_t rtt_var) { const int32_t factor = 8; - + while (rte) { struct nstat_counts* stats = nstat_route_attach(rte); @@ -804,7 +861,7 @@ nstat_route_rtt( { int32_t oldrtt; int32_t newrtt; - + // average do { @@ -819,7 +876,7 @@ nstat_route_rtt( } if (oldrtt == newrtt) break; } while (!OSCompareAndSwap(oldrtt, newrtt, &stats->nstat_avg_rtt)); - + // minimum do { @@ -829,7 +886,7 @@ nstat_route_rtt( break; } } while (!OSCompareAndSwap(oldrtt, rtt, &stats->nstat_min_rtt)); - + // variance do { @@ -845,13 +902,13 @@ nstat_route_rtt( if (oldrtt == newrtt) break; } while (!OSCompareAndSwap(oldrtt, newrtt, &stats->nstat_var_rtt)); } - + rte = rte->rt_parent; } } -#pragma mark -- TCP Provider -- +#pragma mark -- TCP Kernel Provider -- /* * Due to the way the kernel deallocates a process (the process structure @@ -978,15 +1035,15 @@ nstat_tcpudp_lookup( { return EINVAL; } - + // src and dst must match if (param->remote.v4.sin_family != 0 && param->remote.v4.sin_family != param->local.v4.sin_family) { return EINVAL; } - - + + switch (param->local.v4.sin_family) { case AF_INET: @@ -997,12 +1054,12 @@ nstat_tcpudp_lookup( { return EINVAL; } - + inp = in_pcblookup_hash(inpinfo, param->remote.v4.sin_addr, param->remote.v4.sin_port, param->local.v4.sin_addr, param->local.v4.sin_port, 1, NULL); } break; - + #if INET6 case AF_INET6: { @@ -1011,30 +1068,30 @@ nstat_tcpudp_lookup( const struct in6_addr *in6c; struct in6_addr *in6; } local, remote; - + if (param->local.v6.sin6_len != sizeof(param->local.v6) || (param->remote.v6.sin6_family != 0 && param->remote.v6.sin6_len != sizeof(param->remote.v6))) { return EINVAL; } - + local.in6c = &param->local.v6.sin6_addr; remote.in6c = &param->remote.v6.sin6_addr; - + inp = in6_pcblookup_hash(inpinfo, remote.in6, param->remote.v6.sin6_port, local.in6, param->local.v6.sin6_port, 1, NULL); } break;
#endif - + default: return EINVAL; } - + if (inp == NULL) return ENOENT; - + // At this point we have a ref to the inpcb *out_cookie = nstat_tucookie_alloc(inp); if (*out_cookie == NULL) @@ -1060,7 +1117,7 @@ nstat_tcp_gone( (struct nstat_tucookie *)cookie; struct inpcb *inp; struct tcpcb *tp; - + return (!(inp = tucookie->inp) || !(tp = intotcpcb(inp)) || inp->inp_state == INPCB_STATE_DEAD) ? 1 : 0; @@ -1077,19 +1134,19 @@ nstat_tcp_counts( struct inpcb *inp; bzero(out_counts, sizeof(*out_counts)); - + if (out_gone) *out_gone = 0; - + // if the pcb is in the dead state, we should stop using it if (nstat_tcp_gone(cookie)) { if (out_gone) *out_gone = 1; if (!(inp = tucookie->inp) || !intotcpcb(inp)) return EINVAL; - } + } inp = tucookie->inp; struct tcpcb *tp = intotcpcb(inp); - + atomic_get_64(out_counts->nstat_rxpackets, &inp->inp_stat->rxpackets); atomic_get_64(out_counts->nstat_rxbytes, &inp->inp_stat->rxbytes); atomic_get_64(out_counts->nstat_txpackets, &inp->inp_stat->txpackets); @@ -1110,7 +1167,7 @@ nstat_tcp_counts( atomic_get_64(out_counts->nstat_wifi_txbytes, &inp->inp_wstat->txbytes); atomic_get_64(out_counts->nstat_wired_rxbytes, &inp->inp_Wstat->rxbytes); atomic_get_64(out_counts->nstat_wired_txbytes, &inp->inp_Wstat->txbytes); - + return 0; } @@ -1130,9 +1187,9 @@ nstat_tcp_add_watcher( nstat_control_state *state) { OSIncrementAtomic(&nstat_tcp_watchers); - + lck_rw_lock_shared(tcbinfo.ipi_lock); - + // Add all current tcp inpcbs. 
Ignore those in timewait struct inpcb *inp; struct nstat_tucookie *cookie; @@ -1148,9 +1205,9 @@ nstat_tcp_add_watcher( break; } } - + lck_rw_done(tcbinfo.ipi_lock); - + return 0; } @@ -1175,7 +1232,7 @@ nstat_tcp_new_pcb( nstat_control_state *state; for (state = nstat_controls; state; state = state->ncs_next) { - if ((state->ncs_watching & (1 << NSTAT_PROVIDER_TCP)) != 0) + if ((state->ncs_watching & (1 << NSTAT_PROVIDER_TCP_KERNEL)) != 0) { // this client is watching tcp // acquire a reference for it @@ -1212,7 +1269,7 @@ nstat_pcb_detach(struct inpcb *inp) { lck_mtx_lock(&state->mtx); for (prevsrc = NULL, src = state->ncs_srcs; src; - prevsrc = src, src = src->next) + prevsrc = src, src = src->next) { tucookie = (struct nstat_tucookie *)src->cookie; if (tucookie->inp == inp) @@ -1222,12 +1279,12 @@ nstat_pcb_detach(struct inpcb *inp) if (src) { result = nstat_control_send_goodbye(state, src); - + if (prevsrc) prevsrc->next = src->next; else state->ncs_srcs = src->next; - + src->next = dead_list; dead_list = src; } @@ -1250,14 +1307,14 @@ nstat_pcb_cache(struct inpcb *inp) nstat_src *src; struct nstat_tucookie *tucookie; - if (inp == NULL || nstat_udp_watchers == 0 || + if (inp == NULL || nstat_udp_watchers == 0 || inp->inp_nstat_refcnt == 0) return; VERIFY(SOCK_PROTO(inp->inp_socket) == IPPROTO_UDP); lck_mtx_lock(&nstat_mtx); for (state = nstat_controls; state; state = state->ncs_next) { lck_mtx_lock(&state->mtx); - for (src = state->ncs_srcs; src; src = src->next) + for (src = state->ncs_srcs; src; src = src->next) { tucookie = (struct nstat_tucookie *)src->cookie; if (tucookie->inp == inp) @@ -1265,7 +1322,7 @@ nstat_pcb_cache(struct inpcb *inp) if (inp->inp_vflag & INP_IPV6) { nstat_ip6_to_sockaddr(&inp->in6p_laddr, - inp->inp_lport, + inp->inp_lport, &tucookie->local.v6, sizeof(tucookie->local)); nstat_ip6_to_sockaddr(&inp->in6p_faddr, @@ -1276,16 +1333,16 @@ nstat_pcb_cache(struct inpcb *inp) else if (inp->inp_vflag & INP_IPV4) { 
nstat_ip_to_sockaddr(&inp->inp_laddr, - inp->inp_lport, + inp->inp_lport, &tucookie->local.v4, sizeof(tucookie->local)); nstat_ip_to_sockaddr(&inp->inp_faddr, - inp->inp_fport, + inp->inp_fport, &tucookie->remote.v4, sizeof(tucookie->remote)); } if (inp->inp_last_outifp) - tucookie->if_index = + tucookie->if_index = inp->inp_last_outifp->if_index; tucookie->ifnet_properties = nstat_inpcb_to_flags(inp); @@ -1312,7 +1369,7 @@ nstat_pcb_invalidate_cache(struct inpcb *inp) lck_mtx_lock(&nstat_mtx); for (state = nstat_controls; state; state = state->ncs_next) { lck_mtx_lock(&state->mtx); - for (src = state->ncs_srcs; src; src = src->next) + for (src = state->ncs_srcs; src; src = src->next) { tucookie = (struct nstat_tucookie *)src->cookie; if (tucookie->inp == inp) @@ -1339,14 +1396,14 @@ nstat_tcp_copy_descriptor( if (nstat_tcp_gone(cookie)) return EINVAL; - + nstat_tcp_descriptor *desc = (nstat_tcp_descriptor*)data; struct nstat_tucookie *tucookie = (struct nstat_tucookie *)cookie; struct inpcb *inp = tucookie->inp; struct tcpcb *tp = intotcpcb(inp); bzero(desc, sizeof(*desc)); - + if (inp->inp_vflag & INP_IPV6) { nstat_ip6_to_sockaddr(&inp->in6p_laddr, inp->inp_lport, @@ -1361,11 +1418,11 @@ nstat_tcp_copy_descriptor( nstat_ip_to_sockaddr(&inp->inp_faddr, inp->inp_fport, &desc->remote.v4, sizeof(desc->remote)); } - + desc->state = intotcpcb(inp)->t_state; desc->ifindex = (inp->inp_last_outifp == NULL) ? 
0 : inp->inp_last_outifp->if_index; - + // danger - not locked, values could be bogus desc->txunacked = tp->snd_max - tp->snd_una; desc->txwindow = tp->snd_wnd; @@ -1375,7 +1432,7 @@ nstat_tcp_copy_descriptor( strlcpy(desc->cc_algo, CC_ALGO(tp)->name, sizeof(desc->cc_algo)); } - + struct socket *so = inp->inp_socket; if (so) { @@ -1384,7 +1441,10 @@ nstat_tcp_copy_descriptor( desc->upid = so->last_upid; desc->pid = so->last_pid; desc->traffic_class = so->so_traffic_class; - desc->traffic_mgt_flags = so->so_traffic_mgt_flags; + if ((so->so_flags1 & SOF1_TRAFFIC_MGT_SO_BACKGROUND)) + desc->traffic_mgt_flags |= TRAFFIC_MGT_SO_BACKGROUND; + if ((so->so_flags1 & SOF1_TRAFFIC_MGT_TCP_RECVBG)) + desc->traffic_mgt_flags |= TRAFFIC_MGT_TCP_RECVBG; proc_name(desc->pid, desc->pname, sizeof(desc->pname)); if (desc->pname[0] == 0) { @@ -1420,30 +1480,101 @@ nstat_tcp_copy_descriptor( } static bool -nstat_tcpudp_reporting_allowed(nstat_provider_cookie_t cookie, uint64_t filter) +nstat_tcpudp_reporting_allowed(nstat_provider_cookie_t cookie, nstat_provider_filter *filter, bool is_UDP) { bool retval = true; - /* Only apply interface filter if at least one is allowed. */ - if ((filter & NSTAT_FILTER_ACCEPT_ALL) != 0) + if ((filter->npf_flags & (NSTAT_FILTER_IFNET_FLAGS|NSTAT_FILTER_SPECIFIC_USER)) != 0) { struct nstat_tucookie *tucookie = (struct nstat_tucookie *)cookie; - struct inpcb *inp = tucookie->inp; + struct inpcb *inp = tucookie->inp; + + /* Only apply interface filter if at least one is allowed. */ + if ((filter->npf_flags & NSTAT_FILTER_IFNET_FLAGS) != 0) + { + uint16_t interface_properties = nstat_inpcb_to_flags(inp); + + if ((filter->npf_flags & interface_properties) == 0) + { + // For UDP, we could have an undefined interface and yet transfers may have occurred. + // We allow reporting if there have been transfers of the requested kind. + // This is imperfect as we cannot account for the expensive attribute over wifi. 
+ // We also assume that cellular is expensive and we have no way to select for AWDL + if (is_UDP) + { + do + { + if ((filter->npf_flags & (NSTAT_FILTER_ACCEPT_CELLULAR|NSTAT_FILTER_ACCEPT_EXPENSIVE)) && + (inp->inp_cstat->rxbytes || inp->inp_cstat->txbytes)) + { + break; + } + if ((filter->npf_flags & NSTAT_FILTER_ACCEPT_WIFI) && + (inp->inp_wstat->rxbytes || inp->inp_wstat->txbytes)) + { + break; + } + if ((filter->npf_flags & NSTAT_FILTER_ACCEPT_WIRED) && + (inp->inp_Wstat->rxbytes || inp->inp_Wstat->txbytes)) + { + break; + } + return false; + } while (0); + } + else + { + return false; + } + } + } - uint16_t interface_properties = nstat_inpcb_to_flags(inp); + if (((filter->npf_flags & NSTAT_FILTER_SPECIFIC_USER) != 0) && (retval)) + { + struct socket *so = inp->inp_socket; + retval = false; - /* For now, just check on interface type. */ - retval = ((filter & interface_properties) != 0); + if (so) + { + if (((filter->npf_flags & NSTAT_FILTER_SPECIFIC_USER_BY_PID) != 0) && + (filter->npf_pid == so->last_pid)) + { + retval = true; + } + else if (((filter->npf_flags & NSTAT_FILTER_SPECIFIC_USER_BY_EPID) != 0) && + (filter->npf_pid == ((so->so_flags & SOF_DELEGATED) ? so->e_pid : so->last_pid))) /* conditional parenthesized because '==' binds tighter than '?:'; compares against the effective pid (e_pid), not the 64-bit unique id (e_upid) */ + { + retval = true; + } + else if (((filter->npf_flags & NSTAT_FILTER_SPECIFIC_USER_BY_UUID) != 0) && + (memcmp(filter->npf_uuid, so->last_uuid, sizeof(so->last_uuid)) == 0)) + { + retval = true; + } + else if (((filter->npf_flags & NSTAT_FILTER_SPECIFIC_USER_BY_EUUID) != 0) && + (memcmp(filter->npf_uuid, (so->so_flags & SOF_DELEGATED)?
so->e_uuid : so->last_uuid, + sizeof(so->last_uuid)) == 0)) + { + retval = true; + } + } + } } return retval; } +static bool +nstat_tcp_reporting_allowed(nstat_provider_cookie_t cookie, nstat_provider_filter *filter) +{ + return nstat_tcpudp_reporting_allowed(cookie, filter, FALSE); +} + static void nstat_init_tcp_provider(void) { bzero(&nstat_tcp_provider, sizeof(nstat_tcp_provider)); nstat_tcp_provider.nstat_descriptor_length = sizeof(nstat_tcp_descriptor); - nstat_tcp_provider.nstat_provider_id = NSTAT_PROVIDER_TCP; + nstat_tcp_provider.nstat_provider_id = NSTAT_PROVIDER_TCP_KERNEL; nstat_tcp_provider.nstat_lookup = nstat_tcp_lookup; nstat_tcp_provider.nstat_gone = nstat_tcp_gone; nstat_tcp_provider.nstat_counts = nstat_tcp_counts; @@ -1451,7 +1582,7 @@ nstat_init_tcp_provider(void) nstat_tcp_provider.nstat_watcher_add = nstat_tcp_add_watcher; nstat_tcp_provider.nstat_watcher_remove = nstat_tcp_remove_watcher; nstat_tcp_provider.nstat_copy_descriptor = nstat_tcp_copy_descriptor; - nstat_tcp_provider.nstat_reporting_allowed = nstat_tcpudp_reporting_allowed; + nstat_tcp_provider.nstat_reporting_allowed = nstat_tcp_reporting_allowed; nstat_tcp_provider.next = nstat_providers; nstat_providers = &nstat_tcp_provider; } @@ -1489,9 +1620,9 @@ nstat_udp_counts( { struct nstat_tucookie *tucookie = (struct nstat_tucookie *)cookie; - + if (out_gone) *out_gone = 0; - + // if the pcb is in the dead state, we should stop using it if (nstat_udp_gone(cookie)) { @@ -1500,7 +1631,7 @@ nstat_udp_counts( return EINVAL; } struct inpcb *inp = tucookie->inp; - + atomic_get_64(out_counts->nstat_rxpackets, &inp->inp_stat->rxpackets); atomic_get_64(out_counts->nstat_rxbytes, &inp->inp_stat->rxbytes); atomic_get_64(out_counts->nstat_txpackets, &inp->inp_stat->txpackets); @@ -1511,7 +1642,7 @@ nstat_udp_counts( atomic_get_64(out_counts->nstat_wifi_txbytes, &inp->inp_wstat->txbytes); atomic_get_64(out_counts->nstat_wired_rxbytes, &inp->inp_Wstat->rxbytes); 
atomic_get_64(out_counts->nstat_wired_txbytes, &inp->inp_Wstat->txbytes); - + return 0; } @@ -1534,7 +1665,7 @@ nstat_udp_add_watcher( struct nstat_tucookie *cookie; OSIncrementAtomic(&nstat_udp_watchers); - + lck_rw_lock_shared(udbinfo.ipi_lock); // Add all current UDP inpcbs. LIST_FOREACH(inp, udbinfo.ipi_listhead, inp_list) @@ -1549,9 +1680,9 @@ nstat_udp_add_watcher( break; } } - + lck_rw_done(udbinfo.ipi_lock); - + return 0; } @@ -1570,13 +1701,13 @@ nstat_udp_new_pcb( if (nstat_udp_watchers == 0) return; - + socket_lock(inp->inp_socket, 0); lck_mtx_lock(&nstat_mtx); nstat_control_state *state; for (state = nstat_controls; state; state = state->ncs_next) { - if ((state->ncs_watching & (1 << NSTAT_PROVIDER_UDP)) != 0) + if ((state->ncs_watching & (1 << NSTAT_PROVIDER_UDP_KERNEL)) != 0) { // this client is watching tcp // acquire a reference for it @@ -1584,7 +1715,7 @@ nstat_udp_new_pcb( if (cookie == NULL) continue; // add the source, if that fails, release the reference - if (nstat_control_source_add(0, state, + if (nstat_control_source_add(0, state, &nstat_udp_provider, cookie) != 0) { nstat_tucookie_release_locked(cookie); @@ -1606,7 +1737,7 @@ nstat_udp_copy_descriptor( { return EINVAL; } - + if (nstat_udp_gone(cookie)) return EINVAL; @@ -1616,7 +1747,7 @@ nstat_udp_copy_descriptor( struct inpcb *inp = tucookie->inp; bzero(desc, sizeof(*desc)); - + if (tucookie->cached == false) { if (inp->inp_vflag & INP_IPV6) { @@ -1636,84 +1767,510 @@ nstat_udp_copy_descriptor( } else { - if (inp->inp_vflag & INP_IPV6) - { - memcpy(&desc->local.v6, &tucookie->local.v6, - sizeof(desc->local.v6)); - memcpy(&desc->remote.v6, &tucookie->remote.v6, - sizeof(desc->remote.v6)); - } - else if (inp->inp_vflag & INP_IPV4) + if (inp->inp_vflag & INP_IPV6) + { + memcpy(&desc->local.v6, &tucookie->local.v6, + sizeof(desc->local.v6)); + memcpy(&desc->remote.v6, &tucookie->remote.v6, + sizeof(desc->remote.v6)); + } + else if (inp->inp_vflag & INP_IPV4) + { + memcpy(&desc->local.v4, 
&tucookie->local.v4, + sizeof(desc->local.v4)); + memcpy(&desc->remote.v4, &tucookie->remote.v4, + sizeof(desc->remote.v4)); + } + desc->ifnet_properties = tucookie->ifnet_properties; + } + + if (inp->inp_last_outifp) + desc->ifindex = inp->inp_last_outifp->if_index; + else + desc->ifindex = tucookie->if_index; + + struct socket *so = inp->inp_socket; + if (so) + { + // TBD - take the socket lock around these to make sure + // they're in sync? + desc->upid = so->last_upid; + desc->pid = so->last_pid; + proc_name(desc->pid, desc->pname, sizeof(desc->pname)); + if (desc->pname[0] == 0) + { + strlcpy(desc->pname, tucookie->pname, + sizeof(desc->pname)); + } + else + { + desc->pname[sizeof(desc->pname) - 1] = 0; + strlcpy(tucookie->pname, desc->pname, + sizeof(tucookie->pname)); + } + memcpy(desc->uuid, so->last_uuid, sizeof(so->last_uuid)); + memcpy(desc->vuuid, so->so_vuuid, sizeof(so->so_vuuid)); + if (so->so_flags & SOF_DELEGATED) { + desc->eupid = so->e_upid; + desc->epid = so->e_pid; + memcpy(desc->euuid, so->e_uuid, sizeof(so->e_uuid)); + } else { + desc->eupid = desc->upid; + desc->epid = desc->pid; + memcpy(desc->euuid, desc->uuid, sizeof(desc->uuid)); + } + desc->rcvbufsize = so->so_rcv.sb_hiwat; + desc->rcvbufused = so->so_rcv.sb_cc; + desc->traffic_class = so->so_traffic_class; + } + + return 0; +} + +static bool +nstat_udp_reporting_allowed(nstat_provider_cookie_t cookie, nstat_provider_filter *filter) +{ + return nstat_tcpudp_reporting_allowed(cookie, filter, TRUE); +} + + +static void +nstat_init_udp_provider(void) +{ + bzero(&nstat_udp_provider, sizeof(nstat_udp_provider)); + nstat_udp_provider.nstat_provider_id = NSTAT_PROVIDER_UDP_KERNEL; + nstat_udp_provider.nstat_descriptor_length = sizeof(nstat_udp_descriptor); + nstat_udp_provider.nstat_lookup = nstat_udp_lookup; + nstat_udp_provider.nstat_gone = nstat_udp_gone; + nstat_udp_provider.nstat_counts = nstat_udp_counts; + nstat_udp_provider.nstat_watcher_add = nstat_udp_add_watcher; + 
nstat_udp_provider.nstat_watcher_remove = nstat_udp_remove_watcher; + nstat_udp_provider.nstat_copy_descriptor = nstat_udp_copy_descriptor; + nstat_udp_provider.nstat_release = nstat_udp_release; + nstat_udp_provider.nstat_reporting_allowed = nstat_udp_reporting_allowed; + nstat_udp_provider.next = nstat_providers; + nstat_providers = &nstat_udp_provider; +} + +#pragma mark -- TCP/UDP Userland + +// Almost all of this infrastucture is common to both TCP and UDP + +static nstat_provider nstat_userland_tcp_provider; +static nstat_provider nstat_userland_udp_provider; + + +struct nstat_tu_shadow { + tailq_entry_tu_shadow shad_link; + userland_stats_request_vals_fn *shad_getvals_fn; + userland_stats_provider_context *shad_provider_context; + u_int64_t shad_properties; + int shad_provider; + uint32_t shad_magic; +}; + +// Magic number checking should remain in place until the userland provider has been fully proven +#define TU_SHADOW_MAGIC 0xfeedf00d +#define TU_SHADOW_UNMAGIC 0xdeaddeed + +static tailq_head_tu_shadow nstat_userprot_shad_head = TAILQ_HEAD_INITIALIZER(nstat_userprot_shad_head); + +static errno_t +nstat_userland_tu_lookup( + __unused const void *data, + __unused u_int32_t length, + __unused nstat_provider_cookie_t *out_cookie) +{ + // Looking up a specific connection is not supported + return ENOTSUP; +} + +static int +nstat_userland_tu_gone( + __unused nstat_provider_cookie_t cookie) +{ + // Returns non-zero if the source has gone. + // We don't keep a source hanging around, so the answer is always 0 + return 0; +} + +static errno_t +nstat_userland_tu_counts( + nstat_provider_cookie_t cookie, + struct nstat_counts *out_counts, + int *out_gone) + { + struct nstat_tu_shadow *shad = (struct nstat_tu_shadow *)cookie; + assert(shad->shad_magic == TU_SHADOW_MAGIC); + + bool result = (*shad->shad_getvals_fn)(shad->shad_provider_context, out_counts, NULL); + + if (out_gone) *out_gone = 0; + + return (result)? 
0 : EIO; +} + + +static errno_t +nstat_userland_tu_copy_descriptor( + nstat_provider_cookie_t cookie, + void *data, + __unused u_int32_t len) +{ + struct nstat_tu_shadow *shad = (struct nstat_tu_shadow *)cookie; + assert(shad->shad_magic == TU_SHADOW_MAGIC); + + bool result = (*shad->shad_getvals_fn)(shad->shad_provider_context, NULL, data); + + return (result)? 0 : EIO; +} + +static void +nstat_userland_tu_release( + __unused nstat_provider_cookie_t cookie, + __unused int locked) +{ + // Called when a nstat_src is detached. + // We don't reference count or ask for delayed release so nothing to do here. +} + +static bool +check_reporting_for_user(nstat_provider_filter *filter, pid_t pid, pid_t epid, uuid_t *uuid, uuid_t *euuid) +{ + bool retval = true; + + if ((filter->npf_flags & NSTAT_FILTER_SPECIFIC_USER) != 0) + { + retval = false; + + if (((filter->npf_flags & NSTAT_FILTER_SPECIFIC_USER_BY_PID) != 0) && + (filter->npf_pid == pid)) + { + retval = true; + } + else if (((filter->npf_flags & NSTAT_FILTER_SPECIFIC_USER_BY_EPID) != 0) && + (filter->npf_pid == epid)) + { + retval = true; + } + else if (((filter->npf_flags & NSTAT_FILTER_SPECIFIC_USER_BY_UUID) != 0) && + (memcmp(filter->npf_uuid, uuid, sizeof(*uuid)) == 0)) + { + retval = true; + } + else if (((filter->npf_flags & NSTAT_FILTER_SPECIFIC_USER_BY_EUUID) != 0) && + (memcmp(filter->npf_uuid, euuid, sizeof(*euuid)) == 0)) + { + retval = true; + } + } + return retval; +} + +static bool +nstat_userland_tcp_reporting_allowed(nstat_provider_cookie_t cookie, nstat_provider_filter *filter) +{ + bool retval = true; + + if ((filter->npf_flags & (NSTAT_FILTER_IFNET_FLAGS|NSTAT_FILTER_SPECIFIC_USER)) != 0) + { + nstat_tcp_descriptor tcp_desc; // Stack allocation - OK or pushing the limits too far? 
+ struct nstat_tu_shadow *shad = (struct nstat_tu_shadow *)cookie; + + assert(shad->shad_magic == TU_SHADOW_MAGIC); + + if ((*shad->shad_getvals_fn)(shad->shad_provider_context, NULL, &tcp_desc)) + { + if ((filter->npf_flags & NSTAT_FILTER_IFNET_FLAGS) != 0) + { + if ((filter->npf_flags & tcp_desc.ifnet_properties) == 0) + { + return false; + } + } + if ((filter->npf_flags & NSTAT_FILTER_SPECIFIC_USER) != 0) + { + retval = check_reporting_for_user(filter, (pid_t)tcp_desc.pid, (pid_t)tcp_desc.epid, + &tcp_desc.uuid, &tcp_desc.euuid); + } + } + else + { + retval = false; // No further information, so might as well give up now. + } + } + return retval; +} + +static bool +nstat_userland_udp_reporting_allowed(nstat_provider_cookie_t cookie, nstat_provider_filter *filter) +{ + bool retval = true; + + if ((filter->npf_flags & (NSTAT_FILTER_IFNET_FLAGS|NSTAT_FILTER_SPECIFIC_USER)) != 0) + { + nstat_udp_descriptor udp_desc; // Stack allocation - OK or pushing the limits too far? + struct nstat_tu_shadow *shad = (struct nstat_tu_shadow *)cookie; + + assert(shad->shad_magic == TU_SHADOW_MAGIC); + + if ((*shad->shad_getvals_fn)(shad->shad_provider_context, NULL, &udp_desc)) + { + if ((filter->npf_flags & NSTAT_FILTER_IFNET_FLAGS) != 0) + { + if ((filter->npf_flags & udp_desc.ifnet_properties) == 0) + { + return false; + } + } + if ((filter->npf_flags & NSTAT_FILTER_SPECIFIC_USER) != 0) + { + retval = check_reporting_for_user(filter, (pid_t)udp_desc.pid, (pid_t)udp_desc.epid, + &udp_desc.uuid, &udp_desc.euuid); + } + } + else + { + retval = false; // No further information, so might as well give up now. 
+ } + } + return retval; +} + + + +static errno_t +nstat_userland_tcp_add_watcher( + nstat_control_state *state) +{ + struct nstat_tu_shadow *shad; + + OSIncrementAtomic(&nstat_userland_tcp_watchers); + + lck_mtx_lock(&nstat_mtx); + + TAILQ_FOREACH(shad, &nstat_userprot_shad_head, shad_link) { + assert(shad->shad_magic == TU_SHADOW_MAGIC); + + if (shad->shad_provider == NSTAT_PROVIDER_TCP_USERLAND) + { + int result = nstat_control_source_add(0, state, &nstat_userland_tcp_provider, shad); + if (result != 0) + { + printf("%s - nstat_control_source_add returned %d\n", __func__, result); + } + } + } + lck_mtx_unlock(&nstat_mtx); + + return 0; +} + +static errno_t +nstat_userland_udp_add_watcher( + nstat_control_state *state) +{ + struct nstat_tu_shadow *shad; + + OSIncrementAtomic(&nstat_userland_udp_watchers); + + lck_mtx_lock(&nstat_mtx); + + TAILQ_FOREACH(shad, &nstat_userprot_shad_head, shad_link) { + assert(shad->shad_magic == TU_SHADOW_MAGIC); + + if (shad->shad_provider == NSTAT_PROVIDER_UDP_USERLAND) + { + int result = nstat_control_source_add(0, state, &nstat_userland_udp_provider, shad); + if (result != 0) + { + printf("%s - nstat_control_source_add returned %d\n", __func__, result); + } + } + } + lck_mtx_unlock(&nstat_mtx); + + return 0; +} + + +static void +nstat_userland_tcp_remove_watcher( + __unused nstat_control_state *state) +{ + OSDecrementAtomic(&nstat_userland_tcp_watchers); +} + +static void +nstat_userland_udp_remove_watcher( + __unused nstat_control_state *state) +{ + OSDecrementAtomic(&nstat_userland_udp_watchers); +} + +static void +nstat_init_userland_tcp_provider(void) +{ + bzero(&nstat_userland_tcp_provider, sizeof(nstat_tcp_provider)); + nstat_userland_tcp_provider.nstat_descriptor_length = sizeof(nstat_tcp_descriptor); + nstat_userland_tcp_provider.nstat_provider_id = NSTAT_PROVIDER_TCP_USERLAND; + nstat_userland_tcp_provider.nstat_lookup = nstat_userland_tu_lookup; + nstat_userland_tcp_provider.nstat_gone = nstat_userland_tu_gone; + 
nstat_userland_tcp_provider.nstat_counts = nstat_userland_tu_counts; + nstat_userland_tcp_provider.nstat_release = nstat_userland_tu_release; + nstat_userland_tcp_provider.nstat_watcher_add = nstat_userland_tcp_add_watcher; + nstat_userland_tcp_provider.nstat_watcher_remove = nstat_userland_tcp_remove_watcher; + nstat_userland_tcp_provider.nstat_copy_descriptor = nstat_userland_tu_copy_descriptor; + nstat_userland_tcp_provider.nstat_reporting_allowed = nstat_userland_tcp_reporting_allowed; + nstat_userland_tcp_provider.next = nstat_providers; + nstat_providers = &nstat_userland_tcp_provider; +} + + +static void +nstat_init_userland_udp_provider(void) +{ + bzero(&nstat_userland_udp_provider, sizeof(nstat_udp_provider)); + nstat_userland_udp_provider.nstat_descriptor_length = sizeof(nstat_udp_descriptor); + nstat_userland_udp_provider.nstat_provider_id = NSTAT_PROVIDER_UDP_USERLAND; + nstat_userland_udp_provider.nstat_lookup = nstat_userland_tu_lookup; + nstat_userland_udp_provider.nstat_gone = nstat_userland_tu_gone; + nstat_userland_udp_provider.nstat_counts = nstat_userland_tu_counts; + nstat_userland_udp_provider.nstat_release = nstat_userland_tu_release; + nstat_userland_udp_provider.nstat_watcher_add = nstat_userland_udp_add_watcher; + nstat_userland_udp_provider.nstat_watcher_remove = nstat_userland_udp_remove_watcher; + nstat_userland_udp_provider.nstat_copy_descriptor = nstat_userland_tu_copy_descriptor; + nstat_userland_udp_provider.nstat_reporting_allowed = nstat_userland_udp_reporting_allowed; + nstat_userland_udp_provider.next = nstat_providers; + nstat_providers = &nstat_userland_udp_provider; +} + + + +// Things get started with a call to netstats to say that there’s a new connection: +__private_extern__ nstat_userland_context +ntstat_userland_stats_open(userland_stats_provider_context *ctx, + int provider_id, + u_int64_t properties, + userland_stats_request_vals_fn req_fn) +{ + struct nstat_tu_shadow *shad; + + if ((provider_id != 
NSTAT_PROVIDER_TCP_USERLAND) && (provider_id != NSTAT_PROVIDER_UDP_USERLAND)) + { + printf("%s - incorrect provider is supplied, %d\n", __func__, provider_id); + return NULL; + } + + shad = OSMalloc(sizeof(*shad), nstat_malloc_tag); + if (shad == NULL) + return NULL; + + shad->shad_getvals_fn = req_fn; + shad->shad_provider_context = ctx; + shad->shad_provider = provider_id; + shad->shad_properties = properties; + shad->shad_magic = TU_SHADOW_MAGIC; + + lck_mtx_lock(&nstat_mtx); + nstat_control_state *state; + + // Even if there are no watchers, we save the shadow structure + TAILQ_INSERT_HEAD(&nstat_userprot_shad_head, shad, shad_link); + + for (state = nstat_controls; state; state = state->ncs_next) + { + if ((state->ncs_watching & (1 << provider_id)) != 0) { - memcpy(&desc->local.v4, &tucookie->local.v4, - sizeof(desc->local.v4)); - memcpy(&desc->remote.v4, &tucookie->remote.v4, - sizeof(desc->remote.v4)); + // this client is watching tcp/udp userland + // Link to it. + int result = nstat_control_source_add(0, state, &nstat_userland_tcp_provider, shad); + if (result != 0) + { + printf("%s - nstat_control_source_add returned %d\n", __func__, result); + } } - desc->ifnet_properties = tucookie->ifnet_properties; } - - if (inp->inp_last_outifp) - desc->ifindex = inp->inp_last_outifp->if_index; - else - desc->ifindex = tucookie->if_index; - - struct socket *so = inp->inp_socket; - if (so) + lck_mtx_unlock(&nstat_mtx); + + return (nstat_userland_context)shad; +} + + +__private_extern__ void +ntstat_userland_stats_close(nstat_userland_context nstat_ctx) +{ + struct nstat_tu_shadow *shad = (struct nstat_tu_shadow *)nstat_ctx; + nstat_src *dead_list = NULL; + + if (shad == NULL) + return; + + assert(shad->shad_magic == TU_SHADOW_MAGIC); + + lck_mtx_lock(&nstat_mtx); + if (nstat_userland_udp_watchers != 0 || nstat_userland_tcp_watchers != 0) { - // TBD - take the socket lock around these to make sure - // they're in sync? 
- desc->upid = so->last_upid; - desc->pid = so->last_pid; - proc_name(desc->pid, desc->pname, sizeof(desc->pname)); - if (desc->pname[0] == 0) - { - strlcpy(desc->pname, tucookie->pname, - sizeof(desc->pname)); - } - else + nstat_control_state *state; + nstat_src *src, *prevsrc; + errno_t result; + + for (state = nstat_controls; state; state = state->ncs_next) { - desc->pname[sizeof(desc->pname) - 1] = 0; - strlcpy(tucookie->pname, desc->pname, - sizeof(tucookie->pname)); - } - memcpy(desc->uuid, so->last_uuid, sizeof(so->last_uuid)); - memcpy(desc->vuuid, so->so_vuuid, sizeof(so->so_vuuid)); - if (so->so_flags & SOF_DELEGATED) { - desc->eupid = so->e_upid; - desc->epid = so->e_pid; - memcpy(desc->euuid, so->e_uuid, sizeof(so->e_uuid)); - } else { - desc->eupid = desc->upid; - desc->epid = desc->pid; - memcpy(desc->euuid, desc->uuid, sizeof(desc->uuid)); + lck_mtx_lock(&state->mtx); + for (prevsrc = NULL, src = state->ncs_srcs; src; + prevsrc = src, src = src->next) + { + if (shad == (struct nstat_tu_shadow *)src->cookie) + break; + } + + if (src) + { + result = nstat_control_send_goodbye(state, src); + + if (prevsrc) + prevsrc->next = src->next; + else + state->ncs_srcs = src->next; + + src->next = dead_list; + dead_list = src; + } + lck_mtx_unlock(&state->mtx); } - desc->rcvbufsize = so->so_rcv.sb_hiwat; - desc->rcvbufused = so->so_rcv.sb_cc; - desc->traffic_class = so->so_traffic_class; } + TAILQ_REMOVE(&nstat_userprot_shad_head, shad, shad_link); - return 0; + lck_mtx_unlock(&nstat_mtx); + + while (dead_list) + { + nstat_src *src; + src = dead_list; + dead_list = src->next; + + nstat_control_cleanup_source(NULL, src, TRUE); + } + + shad->shad_magic = TU_SHADOW_UNMAGIC; + + OSFree(shad, sizeof(*shad), nstat_malloc_tag); } -static void -nstat_init_udp_provider(void) + +__private_extern__ void +ntstat_userland_stats_event( + __unused nstat_userland_context context, + __unused userland_stats_event_t event) { - bzero(&nstat_udp_provider, sizeof(nstat_udp_provider)); 
- nstat_udp_provider.nstat_provider_id = NSTAT_PROVIDER_UDP; - nstat_udp_provider.nstat_descriptor_length = sizeof(nstat_udp_descriptor); - nstat_udp_provider.nstat_lookup = nstat_udp_lookup; - nstat_udp_provider.nstat_gone = nstat_udp_gone; - nstat_udp_provider.nstat_counts = nstat_udp_counts; - nstat_udp_provider.nstat_watcher_add = nstat_udp_add_watcher; - nstat_udp_provider.nstat_watcher_remove = nstat_udp_remove_watcher; - nstat_udp_provider.nstat_copy_descriptor = nstat_udp_copy_descriptor; - nstat_udp_provider.nstat_release = nstat_udp_release; - nstat_udp_provider.nstat_reporting_allowed = nstat_tcpudp_reporting_allowed; - nstat_udp_provider.next = nstat_providers; - nstat_providers = &nstat_udp_provider; + // This is a dummy for when we hook up event reporting to NetworkStatistics. + // See NetworkStatistics should provide opt-in notifications } + + + #pragma mark -- ifnet Provider -- static nstat_provider nstat_ifnet_provider; @@ -1744,7 +2301,7 @@ nstat_ifnet_lookup( if (length < sizeof(*param) || param->threshold < 1024*1024) return EINVAL; if (nstat_privcheck != 0) { - errno_t result = priv_check_cred(kauth_cred_get(), + errno_t result = priv_check_cred(kauth_cred_get(), PRIV_NET_PRIVILEGED_NETWORK_STATISTICS, 0); if (result != 0) return result; @@ -1835,7 +2392,7 @@ nstat_ifnet_counts( struct ifnet *ifp = ifcookie->ifp; if (out_gone) *out_gone = 0; - + // if the ifnet is gone, we should stop using it if (nstat_ifnet_gone(cookie)) { @@ -1879,7 +2436,7 @@ nstat_ifnet_release( continue; ifcookie = (struct nstat_ifnet_cookie *)src->cookie; if (ifcookie->threshold < minthreshold) - minthreshold = ifcookie->threshold; + minthreshold = ifcookie->threshold; } lck_mtx_unlock(&state->mtx); } @@ -1896,7 +2453,7 @@ nstat_ifnet_release( else ifp->if_data_threshold = minthreshold; ifnet_lock_done(ifp); - ifnet_decr_iorefcnt(ifp); + ifnet_decr_iorefcnt(ifp); } ifnet_release(ifp); OSFree(ifcookie, sizeof(*ifcookie), nstat_malloc_tag); @@ -1996,7 +2553,10 @@ 
nstat_ifnet_copy_link_status( cell_status->valid_bitmask |= NSTAT_IFNET_DESC_CELL_CONFIG_BACKOFF_TIME_VALID; cell_status->config_backoff_time = if_cell_sr->config_backoff_time; } - + if (if_cell_sr->valid_bitmask & IF_CELL_UL_MSS_RECOMMENDED_VALID) { + cell_status->valid_bitmask |= NSTAT_IFNET_DESC_CELL_MSS_RECOMMENDED_VALID; + cell_status->mss_recommended = if_cell_sr->mss_recommended; + } } else if (ifp->if_subfamily == IFNET_SUBFAMILY_WIFI) { nstat_ifnet_desc_wifi_status *wifi_status = &link_status->u.wifi; @@ -2186,7 +2746,7 @@ nstat_ifnet_normalize_counter(struct if_tcp_ecn_stat *if_st) if_st->ecn_total_conn = ecn_off_conn + ecn_on_conn; } -void +static void nstat_ifnet_report_ecn_stats(void) { u_int64_t uptime, last_report_time; @@ -2226,7 +2786,7 @@ nstat_ifnet_report_ecn_stats(void) } else { st->ifnet_type = NSTAT_IFNET_ECN_TYPE_ETHERNET; } - + data.unsent_data_cnt = ifp->if_unsent_data_cnt; /* skip if there was no update since last report */ if (ifp->if_ipv4_stat->timestamp <= 0 || ifp->if_ipv4_stat->timestamp < last_report_time) @@ -2236,7 +2796,6 @@ nstat_ifnet_report_ecn_stats(void) nstat_ifnet_compute_percentages(&ifp->if_ipv4_stat->ecn_on); nstat_ifnet_compute_percentages(&ifp->if_ipv4_stat->ecn_off); nstat_ifnet_normalize_counter(ifp->if_ipv4_stat); - bcopy(ifp->if_ipv4_stat, &st->ecn_stat, sizeof(st->ecn_stat)); nstat_sysinfo_send_data(&data); @@ -2253,7 +2812,6 @@ nstat_ifnet_report_ecn_stats(void) nstat_ifnet_compute_percentages(&ifp->if_ipv6_stat->ecn_on); nstat_ifnet_compute_percentages(&ifp->if_ipv6_stat->ecn_off); nstat_ifnet_normalize_counter(ifp->if_ipv6_stat); - bcopy(ifp->if_ipv6_stat, &st->ecn_stat, sizeof(st->ecn_stat)); nstat_sysinfo_send_data(&data); @@ -2270,7 +2828,7 @@ nstat_ifnet_copy_descriptor( nstat_provider_cookie_t cookie, void *data, u_int32_t len) -{ +{ nstat_ifnet_descriptor *desc = (nstat_ifnet_descriptor *)data; struct nstat_ifnet_cookie *ifcookie = (struct nstat_ifnet_cookie *)cookie; @@ -2278,7 +2836,7 @@ 
nstat_ifnet_copy_descriptor( if (len < sizeof(nstat_ifnet_descriptor)) return EINVAL; - + if (nstat_ifnet_gone(cookie)) return EINVAL; @@ -2359,7 +2917,7 @@ nstat_sysinfo_send_data_internal( nstat_sysinfo_keyval *kv; errno_t result = 0; size_t i = 0; - + allocsize = offsetof(nstat_msg_sysinfo_counts, counts); countsize = offsetof(nstat_sysinfo_counts, nstat_sysinfo_keyvals); finalsize = allocsize; @@ -2381,6 +2939,9 @@ nstat_sysinfo_send_data_internal( /* Two more keys for ifnet type and proto */ nkeyvals += 2; + + /* One key for unsent data. */ + nkeyvals++; break; default: return; @@ -2392,7 +2953,7 @@ nstat_sysinfo_send_data_internal( if (syscnt == NULL) return; bzero(syscnt, allocsize); - + kv = (nstat_sysinfo_keyval *) &syscnt->counts.nstat_sysinfo_keyvals; switch (data->flags) { @@ -2422,6 +2983,9 @@ nstat_sysinfo_send_data_internal( nstat_set_keyval_scalar(&kv[i++], NSTAT_SYSINFO_MBUF_MEM_RELEASED, data->u.mb_stats.memreleased); + nstat_set_keyval_scalar(&kv[i++], + NSTAT_SYSINFO_KEY_SOCK_MBFLOOR, + data->u.mb_stats.sbmb_floor); VERIFY(i == nkeyvals); break; } @@ -2541,6 +3105,18 @@ nstat_sysinfo_send_data_internal( nstat_set_keyval_scalar(&kv[i++], NSTAT_SYSINFO_TFO_BLACKHOLE, data->u.tcp_stats.tfo_blackhole); + nstat_set_keyval_scalar(&kv[i++], + NSTAT_SYSINFO_TFO_COOKIE_WRONG, + data->u.tcp_stats.tfo_cookie_wrong); + nstat_set_keyval_scalar(&kv[i++], + NSTAT_SYSINFO_TFO_NO_COOKIE_RCV, + data->u.tcp_stats.tfo_no_cookie_rcv); + nstat_set_keyval_scalar(&kv[i++], + NSTAT_SYSINFO_TFO_HEURISTICS_DISABLE, + data->u.tcp_stats.tfo_heuristics_disable); + nstat_set_keyval_scalar(&kv[i++], + NSTAT_SYSINFO_TFO_SEND_BLACKHOLE, + data->u.tcp_stats.tfo_sndblackhole); VERIFY(i == nkeyvals); break; } @@ -2678,6 +3254,15 @@ nstat_sysinfo_send_data_internal( nstat_set_keyval_scalar(&kv[i++], NSTAT_SYSINFO_ECN_IFNET_TOTAL_CONN, data->u.ifnet_ecn_stats.ecn_stat.ecn_total_conn); + nstat_set_keyval_scalar(&kv[i++], + NSTAT_SYSINFO_IFNET_UNSENT_DATA, + data->unsent_data_cnt); + 
nstat_set_keyval_scalar(&kv[i++], + NSTAT_SYSINFO_ECN_IFNET_FALLBACK_DROPRST, + data->u.ifnet_ecn_stats.ecn_stat.ecn_fallback_droprst); + nstat_set_keyval_scalar(&kv[i++], + NSTAT_SYSINFO_ECN_IFNET_FALLBACK_DROPRXMT, + data->u.ifnet_ecn_stats.ecn_stat.ecn_fallback_droprxmt); break; } } @@ -2818,7 +3403,7 @@ nstat_control_send_goodbye( if (failed != 0) nstat_stats.nstat_control_send_goodbye_failures++; - + return result; } @@ -2854,9 +3439,9 @@ nstat_accumulate_msg( // Will send the current mbuf nstat_flush_accumulated_msgs(state); } - + errno_t result = 0; - + if (state->ncs_accumulated == NULL) { unsigned int one = 1; @@ -2871,14 +3456,14 @@ nstat_accumulate_msg( mbuf_setlen(state->ncs_accumulated, 0); } } - + if (result == 0) { hdr->length = length; result = mbuf_copyback(state->ncs_accumulated, mbuf_len(state->ncs_accumulated), length, hdr, MBUF_DONTWAIT); } - + if (result != 0) { nstat_flush_accumulated_msgs(state); @@ -2886,7 +3471,7 @@ nstat_accumulate_msg( printf("%s - resorting to ctl_enqueuedata\n", __func__); result = ctl_enqueuedata(state->ncs_kctl, state->ncs_unit, hdr, length, CTL_DATA_EOR); } - + if (result != 0) nstat_stats.nstat_accumulate_msg_failures++; @@ -2899,9 +3484,9 @@ nstat_idle_check( __unused thread_call_param_t p1) { lck_mtx_lock(&nstat_mtx); - + nstat_idle_time = 0; - + nstat_control_state *control; nstat_src *dead = NULL; nstat_src *dead_list = NULL; @@ -2909,7 +3494,7 @@ nstat_idle_check( { lck_mtx_lock(&control->mtx); nstat_src **srcpp = &control->ncs_srcs; - + if (!(control->ncs_flags & NSTAT_FLAG_REQCOUNTS)) { while(*srcpp != NULL) @@ -2917,13 +3502,13 @@ nstat_idle_check( if ((*srcpp)->provider->nstat_gone((*srcpp)->cookie)) { errno_t result; - + // Pull it off the list dead = *srcpp; *srcpp = (*srcpp)->next; - + result = nstat_control_send_goodbye(control, dead); - + // Put this on the list to release later dead->next = dead_list; dead_list = dead; @@ -2943,21 +3528,21 @@ nstat_idle_check( clock_interval_to_deadline(60, 
NSEC_PER_SEC, &nstat_idle_time); thread_call_func_delayed((thread_call_func_t)nstat_idle_check, NULL, nstat_idle_time); } - + lck_mtx_unlock(&nstat_mtx); - + /* Generate any system level reports, if needed */ nstat_sysinfo_generate_report(); - + // Release the sources now that we aren't holding lots of locks while (dead_list) { dead = dead_list; dead_list = dead->next; - + nstat_control_cleanup_source(NULL, dead, FALSE); } - + return NULL; } @@ -2969,12 +3554,12 @@ nstat_control_register(void) lck_grp_attr_setdefault(grp_attr); nstat_lck_grp = lck_grp_alloc_init("network statistics kctl", grp_attr); lck_grp_attr_free(grp_attr); - + lck_mtx_init(&nstat_mtx, nstat_lck_grp, NULL); - + // Register the control struct kern_ctl_reg nstat_control; - bzero(&nstat_control, sizeof(nstat_control)); + bzero(&nstat_control, sizeof(nstat_control)); strlcpy(nstat_control.ctl_name, NET_STAT_CONTROL_NAME, sizeof(nstat_control.ctl_name)); nstat_control.ctl_flags = CTL_FLAG_REG_EXTENDED | CTL_FLAG_REG_CRIT; nstat_control.ctl_sendsize = nstat_sendspace; @@ -2982,7 +3567,7 @@ nstat_control_register(void) nstat_control.ctl_connect = nstat_control_connect; nstat_control.ctl_disconnect = nstat_control_disconnect; nstat_control.ctl_send = nstat_control_send; - + ctl_register(&nstat_control, &nstat_ctlref); } @@ -2993,7 +3578,7 @@ nstat_control_cleanup_source( boolean_t locked) { errno_t result; - + if (state) { result = nstat_control_send_removed(state, src); @@ -3020,8 +3605,8 @@ nstat_control_reporting_allowed( return TRUE; return ( - src->provider->nstat_reporting_allowed( src->cookie, - state->ncs_provider_filters[src->provider->nstat_provider_id]) + src->provider->nstat_reporting_allowed(src->cookie, + &state->ncs_provider_filters[src->provider->nstat_provider_id]) ); } @@ -3034,26 +3619,26 @@ nstat_control_connect( { nstat_control_state *state = OSMalloc(sizeof(*state), nstat_malloc_tag); if (state == NULL) return ENOMEM; - + bzero(state, sizeof(*state)); lck_mtx_init(&state->mtx, 
nstat_lck_grp, NULL); state->ncs_kctl = kctl; state->ncs_unit = sac->sc_unit; state->ncs_flags = NSTAT_FLAG_REQCOUNTS; *uinfo = state; - + lck_mtx_lock(&nstat_mtx); state->ncs_next = nstat_controls; nstat_controls = state; - + if (nstat_idle_time == 0) { clock_interval_to_deadline(60, NSEC_PER_SEC, &nstat_idle_time); thread_call_func_delayed((thread_call_func_t)nstat_idle_check, NULL, nstat_idle_time); } - + lck_mtx_unlock(&nstat_mtx); - + return 0; } @@ -3065,7 +3650,7 @@ nstat_control_disconnect( { u_int32_t watching; nstat_control_state *state = (nstat_control_state*)uinfo; - + // pull it out of the global list of states lck_mtx_lock(&nstat_mtx); nstat_control_state **statepp; @@ -3078,7 +3663,7 @@ nstat_control_disconnect( } } lck_mtx_unlock(&nstat_mtx); - + lck_mtx_lock(&state->mtx); // Stop watching for sources nstat_provider *provider; @@ -3092,35 +3677,35 @@ nstat_control_disconnect( provider->nstat_watcher_remove(state); } } - + // set cleanup flags state->ncs_flags |= NSTAT_FLAG_CLEANUP; - + if (state->ncs_accumulated) { mbuf_freem(state->ncs_accumulated); state->ncs_accumulated = NULL; } - + // Copy out the list of sources nstat_src *srcs = state->ncs_srcs; state->ncs_srcs = NULL; lck_mtx_unlock(&state->mtx); - + while (srcs) { nstat_src *src; - + // pull it out of the list src = srcs; srcs = src->next; - + // clean it up nstat_control_cleanup_source(NULL, src, FALSE); } lck_mtx_destroy(&state->mtx, nstat_lck_grp); OSFree(state, sizeof(*state), nstat_malloc_tag); - + return 0; } @@ -3128,29 +3713,7 @@ static nstat_src_ref_t nstat_control_next_src_ref( nstat_control_state *state) { - int i = 0; - nstat_src_ref_t toReturn = NSTAT_SRC_REF_INVALID; - - for (i = 0; i < 1000 && toReturn == NSTAT_SRC_REF_INVALID; i++) - { - if (state->ncs_next_srcref == NSTAT_SRC_REF_INVALID || - state->ncs_next_srcref == NSTAT_SRC_REF_ALL) - { - state->ncs_next_srcref = 1; - } - - nstat_src *src; - for (src = state->ncs_srcs; src; src = src->next) - { - if (src->srcref == 
state->ncs_next_srcref) - break; - } - - if (src == NULL) toReturn = state->ncs_next_srcref; - state->ncs_next_srcref++; - } - - return toReturn; + return ++state->ncs_next_srcref; } static errno_t @@ -3174,11 +3737,12 @@ nstat_control_send_counts( counts.hdr.flags = hdr_flags; counts.hdr.context = context; counts.srcref = src->srcref; - + counts.event_flags = 0; + if (src->provider->nstat_counts(src->cookie, &counts.counts, gone) == 0) { if ((src->filter & NSTAT_FILTER_NOZEROBYTES) && - counts.counts.nstat_rxbytes == 0 && + counts.counts.nstat_rxbytes == 0 && counts.counts.nstat_txbytes == 0) { result = EAGAIN; @@ -3203,26 +3767,27 @@ nstat_control_append_counts( { /* Some providers may not have any counts to send */ if (!src->provider->nstat_counts) return 0; - + nstat_msg_src_counts counts; bzero(&counts, sizeof(counts)); counts.hdr.type = NSTAT_MSG_TYPE_SRC_COUNTS; counts.hdr.length = sizeof(counts); counts.srcref = src->srcref; - + counts.event_flags = 0; + errno_t result = 0; result = src->provider->nstat_counts(src->cookie, &counts.counts, gone); if (result != 0) { return result; } - + if ((src->filter & NSTAT_FILTER_NOZEROBYTES) == NSTAT_FILTER_NOZEROBYTES && counts.counts.nstat_rxbytes == 0 && counts.counts.nstat_txbytes == 0) { return EAGAIN; } - + return nstat_accumulate_msg(state, &counts.hdr, counts.hdr.length); } @@ -3268,6 +3833,7 @@ nstat_control_send_description( desc->hdr.length = size; desc->hdr.flags = hdr_flags; desc->srcref = src->srcref; + desc->event_flags = 0; desc->provider = src->provider->nstat_provider_id; result = ctl_enqueuembuf(state->ncs_kctl, state->ncs_unit, msg, CTL_DATA_EOR); @@ -3291,17 +3857,18 @@ nstat_control_append_description( { return EOPNOTSUPP; } - + // Fill out a buffer on the stack, we will copy to the mbuf later u_int64_t buffer[size/sizeof(u_int64_t) + 1]; // u_int64_t to ensure alignment bzero(buffer, size); - + nstat_msg_src_description *desc = (nstat_msg_src_description*)buffer; desc->hdr.type = 
NSTAT_MSG_TYPE_SRC_DESC; desc->hdr.length = size; desc->srcref = src->srcref; + desc->event_flags = 0; desc->provider = src->provider->nstat_provider_id; - + errno_t result = 0; // Fill in the description // Query the provider for the provider specific bits @@ -3311,7 +3878,7 @@ nstat_control_append_description( { return result; } - + return nstat_accumulate_msg(state, &desc->hdr, size); } @@ -3330,7 +3897,7 @@ nstat_control_send_update( { return EOPNOTSUPP; } - + // Allocate storage for the descriptor message mbuf_t msg; unsigned int one = 1; @@ -3340,7 +3907,7 @@ nstat_control_send_update( { return ENOMEM; } - + nstat_msg_src_update *desc = (nstat_msg_src_update*)mbuf_data(msg); bzero(desc, size); desc->hdr.context = context; @@ -3348,11 +3915,12 @@ nstat_control_send_update( desc->hdr.length = size; desc->hdr.flags = hdr_flags; desc->srcref = src->srcref; + desc->event_flags = 0; desc->provider = src->provider->nstat_provider_id; - + mbuf_setlen(msg, size); mbuf_pkthdr_setlen(msg, mbuf_len(msg)); - + errno_t result = 0; if (src->provider->nstat_descriptor_length != 0 && src->provider->nstat_copy_descriptor) { @@ -3365,7 +3933,7 @@ nstat_control_send_update( return result; } } - + if (src->provider->nstat_counts) { result = src->provider->nstat_counts(src->cookie, &desc->counts, gone); @@ -3382,7 +3950,7 @@ nstat_control_send_update( } } } - + if (result != 0) { nstat_stats.nstat_srcupatefailures += 1; @@ -3405,17 +3973,18 @@ nstat_control_append_update( { return EOPNOTSUPP; } - + // Fill out a buffer on the stack, we will copy to the mbuf later u_int64_t buffer[size/sizeof(u_int64_t) + 1]; // u_int64_t to ensure alignment bzero(buffer, size); - + nstat_msg_src_update *desc = (nstat_msg_src_update*)buffer; desc->hdr.type = NSTAT_MSG_TYPE_SRC_UPDATE; desc->hdr.length = size; desc->srcref = src->srcref; + desc->event_flags = 0; desc->provider = src->provider->nstat_provider_id; - + errno_t result = 0; // Fill in the description if 
(src->provider->nstat_descriptor_length != 0 && src->provider->nstat_copy_descriptor) @@ -3431,7 +4000,7 @@ nstat_control_append_update( return result; } } - + if (src->provider->nstat_counts) { result = src->provider->nstat_counts(src->cookie, &desc->counts, gone); @@ -3442,14 +4011,14 @@ nstat_control_append_update( printf("%s: src->provider->nstat_counts: %d\n", __func__, result); return result; } - + if ((src->filter & NSTAT_FILTER_NOZEROBYTES) == NSTAT_FILTER_NOZEROBYTES && desc->counts.nstat_rxbytes == 0 && desc->counts.nstat_txbytes == 0) { return EAGAIN; } } - + return nstat_accumulate_msg(state, &desc->hdr, size); } @@ -3486,14 +4055,14 @@ nstat_control_handle_add_request( { return EINVAL; } - + // Calculate the length of the parameter field int32_t paramlength = mbuf_pkthdr_len(m) - offsetof(nstat_msg_add_src_req, param); if (paramlength < 0 || paramlength > 2 * 1024) { return EINVAL; } - + nstat_provider *provider; nstat_provider_cookie_t cookie; nstat_msg_add_src_req *req = mbuf_data(m); @@ -3501,7 +4070,7 @@ nstat_control_handle_add_request( { // parameter is too large, we need to make a contiguous copy void *data = OSMalloc(paramlength, nstat_malloc_tag); - + if (!data) return ENOMEM; result = mbuf_copydata(m, offsetof(nstat_msg_add_src_req, param), paramlength, data); if (result == 0) @@ -3512,16 +4081,16 @@ nstat_control_handle_add_request( { result = nstat_lookup_entry(req->provider, (void*)&req->param, paramlength, &provider, &cookie); } - + if (result != 0) { return result; } - + result = nstat_control_source_add(req->hdr.context, state, provider, cookie); if (result != 0) provider->nstat_release(cookie, 0); - + return result; } @@ -3531,25 +4100,24 @@ nstat_control_handle_add_all( mbuf_t m) { errno_t result = 0; - + // Verify the header fits in the first mbuf if (mbuf_len(m) < sizeof(nstat_msg_add_all_srcs)) { return EINVAL; } - + nstat_msg_add_all_srcs *req = mbuf_data(m); if (req->provider > NSTAT_PROVIDER_LAST) return ENOENT; nstat_provider 
*provider = nstat_find_provider_by_id(req->provider); - u_int64_t filter = req->filter; if (!provider) return ENOENT; if (provider->nstat_watcher_add == NULL) return ENOTSUP; - + if (nstat_privcheck != 0) { - result = priv_check_cred(kauth_cred_get(), + result = priv_check_cred(kauth_cred_get(), PRIV_NET_PRIVILEGED_NETWORK_STATISTICS, 0); if (result != 0) return result; @@ -3563,19 +4131,28 @@ nstat_control_handle_add_all( lck_mtx_unlock(&state->mtx); if (result != 0) return result; - state->ncs_provider_filters[req->provider] = filter; + state->ncs_provider_filters[req->provider].npf_flags = req->filter; + state->ncs_provider_filters[req->provider].npf_events = req->events; + state->ncs_provider_filters[req->provider].npf_pid = req->target_pid; + memcpy(state->ncs_provider_filters[req->provider].npf_uuid, req->target_uuid, + sizeof(state->ncs_provider_filters[req->provider].npf_uuid)); result = provider->nstat_watcher_add(state); if (result != 0) { - state->ncs_provider_filters[req->provider] = 0; + state->ncs_provider_filters[req->provider].npf_flags = 0; + state->ncs_provider_filters[req->provider].npf_events = 0; + state->ncs_provider_filters[req->provider].npf_pid = 0; + bzero(state->ncs_provider_filters[req->provider].npf_uuid, + sizeof(state->ncs_provider_filters[req->provider].npf_uuid)); + lck_mtx_lock(&state->mtx); state->ncs_watching &= ~(1 << provider->nstat_provider_id); lck_mtx_unlock(&state->mtx); } if (result == 0) nstat_enqueue_success(req->hdr.context, state, 0); - + return result; } @@ -3590,22 +4167,22 @@ nstat_control_source_add( mbuf_t msg = NULL; nstat_src_ref_t *srcrefp = NULL; - u_int64_t provider_filters = - state->ncs_provider_filters[provider->nstat_provider_id]; + u_int64_t provider_filter_flagss = + state->ncs_provider_filters[provider->nstat_provider_id].npf_flags; boolean_t tell_user = - ((provider_filters & NSTAT_FILTER_SUPPRESS_SRC_ADDED) == 0); + ((provider_filter_flagss & NSTAT_FILTER_SUPPRESS_SRC_ADDED) == 0); u_int32_t 
src_filter = - (provider_filters & NSTAT_FILTER_PROVIDER_NOZEROBYTES) + (provider_filter_flagss & NSTAT_FILTER_PROVIDER_NOZEROBYTES) ? NSTAT_FILTER_NOZEROBYTES : 0; if (tell_user) { unsigned int one = 1; - + if (mbuf_allocpacket(MBUF_DONTWAIT, sizeof(nstat_msg_src_added), &one, &msg) != 0) return ENOMEM; - + mbuf_setlen(msg, sizeof(nstat_msg_src_added)); mbuf_pkthdr_setlen(msg, mbuf_len(msg)); nstat_msg_src_added *add = mbuf_data(msg); @@ -3616,7 +4193,7 @@ nstat_control_source_add( add->provider = provider->nstat_provider_id; srcrefp = &add->srcref; } - + // Allocate storage for the source nstat_src *src = OSMalloc(sizeof(*src), nstat_malloc_tag); if (src == NULL) @@ -3624,7 +4201,7 @@ nstat_control_source_add( if (msg) mbuf_freem(msg); return ENOMEM; } - + // Fill in the source, including picking an unused source ref lck_mtx_lock(&state->mtx); @@ -3660,9 +4237,9 @@ nstat_control_source_add( // Put the source in the list src->next = state->ncs_srcs; state->ncs_srcs = src; - + lck_mtx_unlock(&state->mtx); - + return 0; } @@ -3672,14 +4249,14 @@ nstat_control_handle_remove_request( mbuf_t m) { nstat_src_ref_t srcref = NSTAT_SRC_REF_INVALID; - + if (mbuf_copydata(m, offsetof(nstat_msg_rem_src_req, srcref), sizeof(srcref), &srcref) != 0) { return EINVAL; } - + lck_mtx_lock(&state->mtx); - + // Remove this source as we look for it nstat_src **nextp; nstat_src *src = NULL; @@ -3692,11 +4269,11 @@ nstat_control_handle_remove_request( break; } } - + lck_mtx_unlock(&state->mtx); - + if (src) nstat_control_cleanup_source(state, src, FALSE); - + return src ? 0 : ENOENT; } @@ -3706,7 +4283,7 @@ nstat_control_handle_query_request( mbuf_t m) { // TBD: handle this from another thread so we can enqueue a lot of data - // As written, if a client requests query all, this function will be + // As written, if a client requests query all, this function will be // called from their send of the request message. We will attempt to write // responses and succeed until the buffer fills up. 
Since the clients thread // is blocked on send, it won't be reading unless the client has two threads @@ -3723,7 +4300,7 @@ nstat_control_handle_query_request( } const boolean_t all_srcs = (req.srcref == NSTAT_SRC_REF_ALL); - + lck_mtx_lock(&state->mtx); if (all_srcs) @@ -3745,7 +4322,7 @@ nstat_control_handle_query_request( { nstat_src *src = NULL; int gone; - + src = *srcpp; gone = 0; // XXX ignore IFACE types? @@ -3788,7 +4365,7 @@ nstat_control_handle_query_request( } } } - + if (gone) { // send one last descriptor message so client may see last state @@ -3803,10 +4380,10 @@ nstat_control_handle_query_request( state->ncs_flags &= ~NSTAT_FLAG_REQCOUNTS; break; } - + // pull src out of the list *srcpp = src->next; - + src->next = dead_srcs; dead_srcs = src; } @@ -3814,7 +4391,7 @@ nstat_control_handle_query_request( { srcpp = &(*srcpp)->next; } - + if (!all_srcs && req.srcref == src->srcref) { break; @@ -3838,18 +4415,18 @@ nstat_control_handle_query_request( nstat_enqueue_success(req.hdr.context, state, flags); result = 0; } - + while (dead_srcs) { nstat_src *src; - + src = dead_srcs; dead_srcs = src->next; - + // release src and send notification nstat_control_cleanup_source(state, src, FALSE); } - + return result; } @@ -3914,7 +4491,7 @@ nstat_control_handle_get_src_description( src_count++; } } - + if (!all_srcs) { break; @@ -3938,7 +4515,7 @@ nstat_control_handle_get_src_description( nstat_enqueue_success(req.hdr.context, state, flags); result = 0; } - + return result; } @@ -4070,7 +4647,7 @@ nstat_control_handle_get_update( } lck_mtx_lock(&state->mtx); - + state->ncs_flags |= NSTAT_FLAG_SUPPORTS_UPDATES; errno_t result = ENOENT; @@ -4091,7 +4668,7 @@ nstat_control_handle_get_update( || src_count < QUERY_CONTINUATION_SRC_COUNT)) { int gone; - + gone = 0; src = *srcpp; if (nstat_control_reporting_allowed(state, src)) @@ -4126,7 +4703,7 @@ nstat_control_handle_get_update( result = nstat_control_send_update(state, src, req.hdr.context, 0, &gone); } } - + if 
(gone) { // pull src out of the list @@ -4139,7 +4716,7 @@ nstat_control_handle_get_update( { srcpp = &(*srcpp)->next; } - + if (req.srcref != NSTAT_SRC_REF_ALL && req.srcref == src->srcref) { break; @@ -4169,11 +4746,11 @@ nstat_control_handle_get_update( { src = dead_srcs; dead_srcs = src->next; - + // release src and send notification nstat_control_cleanup_source(state, src, FALSE); } - + return result; } @@ -4182,16 +4759,16 @@ nstat_control_handle_subscribe_sysinfo( nstat_control_state *state) { errno_t result = priv_check_cred(kauth_cred_get(), PRIV_NET_PRIVILEGED_NETWORK_STATISTICS, 0); - + if (result != 0) { return result; } - + lck_mtx_lock(&state->mtx); state->ncs_flags |= NSTAT_FLAG_SYSINFO_SUBSCRIBED; lck_mtx_unlock(&state->mtx); - + return 0; } @@ -4207,14 +4784,14 @@ nstat_control_send( struct nstat_msg_hdr *hdr; struct nstat_msg_hdr storage; errno_t result = 0; - + if (mbuf_pkthdr_len(m) < sizeof(*hdr)) { // Is this the right thing to do? mbuf_freem(m); return EINVAL; } - + if (mbuf_len(m) >= sizeof(*hdr)) { hdr = mbuf_data(m); @@ -4224,7 +4801,7 @@ nstat_control_send( mbuf_copydata(m, 0, sizeof(storage), &storage); hdr = &storage; } - + // Legacy clients may not set the length // Those clients are likely not setting the flags either // Fix everything up so old clients continue to work @@ -4237,56 +4814,56 @@ nstat_control_send( mbuf_copyback(m, 0, sizeof(*hdr), hdr, MBUF_DONTWAIT); } } - + switch (hdr->type) { case NSTAT_MSG_TYPE_ADD_SRC: result = nstat_control_handle_add_request(state, m); break; - + case NSTAT_MSG_TYPE_ADD_ALL_SRCS: result = nstat_control_handle_add_all(state, m); break; - + case NSTAT_MSG_TYPE_REM_SRC: result = nstat_control_handle_remove_request(state, m); break; - + case NSTAT_MSG_TYPE_QUERY_SRC: result = nstat_control_handle_query_request(state, m); break; - + case NSTAT_MSG_TYPE_GET_SRC_DESC: result = nstat_control_handle_get_src_description(state, m); break; - + case NSTAT_MSG_TYPE_SET_FILTER: result = 
nstat_control_handle_set_filter(state, m); break; - + case NSTAT_MSG_TYPE_GET_UPDATE: result = nstat_control_handle_get_update(state, m); break; - + case NSTAT_MSG_TYPE_SUBSCRIBE_SYSINFO: result = nstat_control_handle_subscribe_sysinfo(state); break; - + default: result = EINVAL; break; } - + if (result != 0) { struct nstat_msg_error err; - + bzero(&err, sizeof(err)); err.hdr.type = NSTAT_MSG_TYPE_ERROR; err.hdr.length = sizeof(err) + mbuf_pkthdr_len(m); err.hdr.context = hdr->context; err.error = result; - + if (mbuf_prepend(&m, sizeof(err), MBUF_DONTWAIT) == 0 && mbuf_copyback(m, 0, sizeof(err), &err, MBUF_DONTWAIT) == 0) { @@ -4297,7 +4874,7 @@ nstat_control_send( } m = NULL; } - + if (result != 0) { // Unable to prepend the error to the request - just send the error @@ -4309,8 +4886,8 @@ nstat_control_send( } nstat_stats.nstat_handle_msg_failures += 1; } - + if (m) mbuf_freem(m); - + return result; } diff --git a/bsd/net/ntstat.h b/bsd/net/ntstat.h index ae6cdf1a0..e686d877d 100644 --- a/bsd/net/ntstat.h +++ b/bsd/net/ntstat.h @@ -1,8 +1,8 @@ /* - * Copyright (c) 2010-2015 Apple Inc. All rights reserved. + * Copyright (c) 2010-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ #ifndef __NTSTAT_H__ @@ -36,10 +36,26 @@ #pragma pack(push, 4) #pragma mark -- Common Data Structures -- -#define __NSTAT_REVISION__ 7 +#define __NSTAT_REVISION__ 8 typedef u_int32_t nstat_provider_id_t; -typedef u_int32_t nstat_src_ref_t; +typedef u_int64_t nstat_src_ref_t; +typedef u_int64_t nstat_event_flags_t; + +// The following event definitions are very provisional.. +enum +{ + NSTAT_EVENT_SRC_ADDED = 0x00000001 + ,NSTAT_EVENT_SRC_REMOVED = 0x00000002 + ,NSTAT_EVENT_SRC_QUERIED = 0x00000004 + ,NSTAT_EVENT_SRC_QUERIED_ALL = 0x00000008 + ,NSTAT_EVENT_SRC_WILL_CHANGE_STATE = 0x00000010 + ,NSTAT_EVENT_SRC_DID_CHANGE_STATE = 0x00000020 + ,NSTAT_EVENT_SRC_WILL_CHANGE_OWNER = 0x00000040 + ,NSTAT_EVENT_SRC_DID_CHANGE_OWNER = 0x00000080 + ,NSTAT_EVENT_SRC_WILL_CHANGE_PROPERTY = 0x00000100 + ,NSTAT_EVENT_SRC_DID_CHANGE_PROPERTY = 0x00000200 +}; typedef struct nstat_counts { @@ -52,10 +68,10 @@ typedef struct nstat_counts u_int32_t nstat_rxduplicatebytes; u_int32_t nstat_rxoutoforderbytes; u_int32_t nstat_txretransmit; - + u_int32_t nstat_connectattempts; u_int32_t nstat_connectsuccesses; - + u_int32_t nstat_min_rtt; u_int32_t nstat_avg_rtt; u_int32_t nstat_var_rtt; @@ -184,8 +200,16 @@ enum ,NSTAT_SYSINFO_ECN_IFNET_OFF_TOTAL_OOPKTS = 89 ,NSTAT_SYSINFO_ECN_IFNET_OFF_DROP_RST = 90 ,NSTAT_SYSINFO_ECN_IFNET_TOTAL_CONN = 91 + ,NSTAT_SYSINFO_TFO_COOKIE_WRONG = 92 + ,NSTAT_SYSINFO_TFO_NO_COOKIE_RCV = 93 + ,NSTAT_SYSINFO_TFO_HEURISTICS_DISABLE = 94 + ,NSTAT_SYSINFO_TFO_SEND_BLACKHOLE = 95 + 
,NSTAT_SYSINFO_KEY_SOCK_MBFLOOR = 96 + ,NSTAT_SYSINFO_IFNET_UNSENT_DATA = 97 + ,NSTAT_SYSINFO_ECN_IFNET_FALLBACK_DROPRST = 98 + ,NSTAT_SYSINFO_ECN_IFNET_FALLBACK_DROPRXMT = 99 // NSTAT_SYSINFO_ENUM_VERSION must be updated any time a value is added -#define NSTAT_SYSINFO_ENUM_VERSION 20151208 +#define NSTAT_SYSINFO_ENUM_VERSION 20160715 }; #pragma mark -- Network Statistics Providers -- @@ -208,10 +232,12 @@ enum { NSTAT_PROVIDER_NONE = 0 ,NSTAT_PROVIDER_ROUTE = 1 - ,NSTAT_PROVIDER_TCP = 2 - ,NSTAT_PROVIDER_UDP = 3 - ,NSTAT_PROVIDER_IFNET = 4 - ,NSTAT_PROVIDER_SYSINFO = 5 + ,NSTAT_PROVIDER_TCP_KERNEL = 2 + ,NSTAT_PROVIDER_TCP_USERLAND = 3 + ,NSTAT_PROVIDER_UDP_KERNEL = 4 + ,NSTAT_PROVIDER_UDP_USERLAND = 5 + ,NSTAT_PROVIDER_IFNET = 6 + ,NSTAT_PROVIDER_SYSINFO = 7 }; #define NSTAT_PROVIDER_LAST NSTAT_PROVIDER_SYSINFO #define NSTAT_PROVIDER_COUNT (NSTAT_PROVIDER_LAST+1) @@ -252,17 +278,17 @@ typedef struct nstat_tcp_descriptor struct sockaddr_in v4; struct sockaddr_in6 v6; } local; - + union { struct sockaddr_in v4; struct sockaddr_in6 v6; } remote; - + u_int32_t ifindex; - + u_int32_t state; - + u_int32_t sndbufsize; u_int32_t sndbufused; u_int32_t rcvbufsize; @@ -273,16 +299,16 @@ typedef struct nstat_tcp_descriptor u_int32_t traffic_class; u_int32_t traffic_mgt_flags; char cc_algo[16]; - + u_int64_t upid; u_int32_t pid; char pname[64]; u_int64_t eupid; u_int32_t epid; - uint8_t uuid[16]; - uint8_t euuid[16]; - uint8_t vuuid[16]; + uuid_t uuid; + uuid_t euuid; + uuid_t vuuid; struct tcp_conn_status connstatus; uint16_t ifnet_properties __attribute__((aligned(4))); } nstat_tcp_descriptor; @@ -296,28 +322,28 @@ typedef struct nstat_udp_descriptor struct sockaddr_in v4; struct sockaddr_in6 v6; } local; - + union { struct sockaddr_in v4; struct sockaddr_in6 v6; } remote; - + u_int32_t ifindex; - + u_int32_t rcvbufsize; u_int32_t rcvbufused; u_int32_t traffic_class; - + u_int64_t upid; u_int32_t pid; char pname[64]; u_int64_t eupid; u_int32_t epid; - uint8_t uuid[16]; - 
uint8_t euuid[16]; - uint8_t vuuid[16]; + uuid_t uuid; + uuid_t euuid; + uuid_t vuuid; uint16_t ifnet_properties; } nstat_udp_descriptor; @@ -326,31 +352,31 @@ typedef struct nstat_route_descriptor u_int64_t id; u_int64_t parent_id; u_int64_t gateway_id; - + union { struct sockaddr_in v4; struct sockaddr_in6 v6; struct sockaddr sa; } dst; - + union { struct sockaddr_in v4; struct sockaddr_in6 v6; struct sockaddr sa; } mask; - + union { struct sockaddr_in v4; struct sockaddr_in6 v6; struct sockaddr sa; } gateway; - + u_int32_t ifindex; u_int32_t flags; - + } nstat_route_descriptor; typedef struct nstat_ifnet_add_param @@ -377,6 +403,7 @@ typedef struct nstat_ifnet_desc_cellular_status #define NSTAT_IFNET_DESC_CELL_DL_MAX_BANDWIDTH_VALID 0x1000 #define NSTAT_IFNET_DESC_CELL_CONFIG_INACTIVITY_TIME_VALID 0x2000 #define NSTAT_IFNET_DESC_CELL_CONFIG_BACKOFF_TIME_VALID 0x4000 +#define NSTAT_IFNET_DESC_CELL_MSS_RECOMMENDED_VALID 0x8000 u_int32_t link_quality_metric; u_int32_t ul_effective_bandwidth; /* Measured uplink bandwidth based on current activity (bps) */ @@ -405,6 +432,10 @@ typedef struct nstat_ifnet_desc_cellular_status (bps) */ u_int32_t config_inactivity_time; /* ms */ u_int32_t config_backoff_time; /* new connections backoff time in ms */ +#define NSTAT_IFNET_DESC_MSS_RECOMMENDED_NONE 0x0 +#define NSTAT_IFNET_DESC_MSS_RECOMMENDED_MEDIUM 0x1 +#define NSTAT_IFNET_DESC_MSS_RECOMMENDED_LOW 0x2 + u_int16_t mss_recommended; /* recommended MSS */ } nstat_ifnet_desc_cellular_status; typedef struct nstat_ifnet_desc_wifi_status { @@ -513,7 +544,7 @@ typedef struct nstat_sysinfo_add_param } nstat_sysinfo_add_param; #define NSTAT_SYSINFO_MBUF_STATS 0x0001 -#define NSTAT_SYSINFO_TCP_STATS 0x0002 +#define NSTAT_SYSINFO_TCP_STATS 0x0002 #define NSTAT_SYSINFO_IFNET_ECN_STATS 0x0003 #pragma mark -- Network Statistics User Client -- @@ -525,7 +556,7 @@ enum // generic response messages NSTAT_MSG_TYPE_SUCCESS = 0 ,NSTAT_MSG_TYPE_ERROR = 1 - + // Requests ,NSTAT_MSG_TYPE_ADD_SRC 
= 1001 ,NSTAT_MSG_TYPE_ADD_ALL_SRCS = 1002 @@ -535,7 +566,7 @@ enum ,NSTAT_MSG_TYPE_SET_FILTER = 1006 ,NSTAT_MSG_TYPE_GET_UPDATE = 1007 ,NSTAT_MSG_TYPE_SUBSCRIBE_SYSINFO = 1008 - + // Responses/Notfications ,NSTAT_MSG_TYPE_SRC_ADDED = 10001 ,NSTAT_MSG_TYPE_SRC_REMOVED = 10002 @@ -547,7 +578,7 @@ enum enum { - NSTAT_SRC_REF_ALL = 0xffffffff + NSTAT_SRC_REF_ALL = 0xffffffffffffffffULL ,NSTAT_SRC_REF_INVALID = 0 }; @@ -565,10 +596,10 @@ enum ,NSTAT_FILTER_ACCEPT_CELLULAR = 0x00000004 ,NSTAT_FILTER_ACCEPT_WIFI = 0x00000008 ,NSTAT_FILTER_ACCEPT_WIRED = 0x00000010 - ,NSTAT_FILTER_ACCEPT_ALL = 0x0000001F - ,NSTAT_FILTER_IFNET_FLAGS = 0x000000FF - - ,NSTAT_FILTER_PROVIDER_NOZEROBYTES = 0x00000100 + ,NSTAT_FILTER_ACCEPT_AWDL = 0x00000020 + ,NSTAT_FILTER_ACCEPT_EXPENSIVE = 0x00000040 + ,NSTAT_FILTER_ACCEPT_CELLFALLBACK = 0x00000100 + ,NSTAT_FILTER_IFNET_FLAGS = 0x00000FFF ,NSTAT_FILTER_TCP_NO_LISTENER = 0x00001000 ,NSTAT_FILTER_TCP_ONLY_LISTENER = 0x00002000 @@ -580,6 +611,13 @@ enum ,NSTAT_FILTER_SUPPRESS_SRC_ADDED = 0x00100000 ,NSTAT_FILTER_REQUIRE_SRC_ADDED = 0x00200000 + ,NSTAT_FILTER_PROVIDER_NOZEROBYTES = 0x00400000 + + ,NSTAT_FILTER_SPECIFIC_USER_BY_PID = 0x01000000 + ,NSTAT_FILTER_SPECIFIC_USER_BY_EPID = 0x02000000 + ,NSTAT_FILTER_SPECIFIC_USER_BY_UUID = 0x04000000 + ,NSTAT_FILTER_SPECIFIC_USER_BY_EUUID = 0x08000000 + ,NSTAT_FILTER_SPECIFIC_USER = 0x0F000000 }; enum @@ -614,7 +652,10 @@ typedef struct nstat_msg_add_all_srcs { nstat_msg_hdr hdr; nstat_provider_id_t provider; - u_int64_t filter; + u_int64_t filter; + nstat_event_flags_t events; + pid_t target_pid; + uuid_t target_uuid; } nstat_msg_add_all_srcs; typedef struct nstat_msg_src_added @@ -647,6 +688,7 @@ typedef struct nstat_msg_src_description { nstat_msg_hdr hdr; nstat_src_ref_t srcref; + nstat_event_flags_t event_flags; nstat_provider_id_t provider; u_int8_t data[]; } nstat_msg_src_description; @@ -661,6 +703,7 @@ typedef struct nstat_msg_src_counts { nstat_msg_hdr hdr; nstat_src_ref_t srcref; + 
nstat_event_flags_t event_flags; nstat_counts counts; } nstat_msg_src_counts; @@ -668,6 +711,7 @@ typedef struct nstat_msg_src_update { nstat_msg_hdr hdr; nstat_src_ref_t srcref; + nstat_event_flags_t event_flags; nstat_counts counts; nstat_provider_id_t provider; u_int8_t data[]; @@ -726,11 +770,12 @@ typedef struct nstat_sysinfo_mbuf_stats u_int32_t sb_atmbuflimit; /* Memory limit reached for socket buffer autoscaling */ u_int32_t draincnt; /* Number of times mbuf pool has been drained under memory pressure */ u_int32_t memreleased; /* Memory (bytes) released from mbuf pool to VM */ + u_int32_t sbmb_floor; /* Lowest mbufs in sock buffer pool */ } nstat_sysinfo_mbuf_stats; typedef struct nstat_sysinfo_tcp_stats { - u_int32_t ipv4_avgrtt; /* Average RTT for IPv4 */ + u_int32_t ipv4_avgrtt; /* Average RTT for IPv4 */ u_int32_t ipv6_avgrtt; /* Average RTT for IPv6 */ u_int32_t send_plr; /* Average uplink packet loss rate */ u_int32_t recv_plr; /* Average downlink packet loss rate */ @@ -768,6 +813,10 @@ typedef struct nstat_sysinfo_tcp_stats u_int32_t tfo_syn_data_acked;/* Number of times our SYN+data has been acknowledged */ u_int32_t tfo_syn_loss; /* Number of times SYN+TFO has been lost and we fallback */ u_int32_t tfo_blackhole; /* Number of times SYN+TFO has been lost and we fallback */ + u_int32_t tfo_cookie_wrong; /* TFO-cookie we sent was wrong */ + u_int32_t tfo_no_cookie_rcv; /* We asked for a cookie but didn't get one */ + u_int32_t tfo_heuristics_disable; /* TFO got disabled due to heuristics */ + u_int32_t tfo_sndblackhole; /* TFO got blackholed in the sending direction */ } nstat_sysinfo_tcp_stats; enum { @@ -795,6 +844,7 @@ typedef struct nstat_sysinfo_data nstat_sysinfo_tcp_stats tcp_stats; nstat_sysinfo_ifnet_ecn_stats ifnet_ecn_stats; } u; + uint32_t unsent_data_cnt; /* Before sleeping */ } nstat_sysinfo_data; #pragma mark -- Generic Network Statistics Provider -- @@ -842,6 +892,38 @@ void nstat_ifnet_threshold_reached(unsigned int ifindex); void 
nstat_sysinfo_send_data(struct nstat_sysinfo_data *); +// Userland stats reporting + +// Each side, NetworkStatistics and the kernel provider for userland, +// pass opaque references. +typedef void *userland_stats_provider_context; +typedef void *nstat_userland_context; + +// When things have been set up, Netstats can request a refresh of its data. +typedef bool (userland_stats_request_vals_fn)(userland_stats_provider_context *ctx, + nstat_counts *countsp, + void *metadatap); + +// Things get started with a call to netstats to say that there’s a new connection: +nstat_userland_context ntstat_userland_stats_open(userland_stats_provider_context *ctx, + int provider_id, + u_int64_t properties, + userland_stats_request_vals_fn req_fn); + +void ntstat_userland_stats_close(nstat_userland_context nstat_ctx); + + +// There may be other occasions where the stats have changed and NECP should push the new values. +// This is provisional, ahead of full implementation. + +typedef enum { + USERLAND_STATS_WILL_UPDATE, + USERLAND_STATS_DID_UPDATE +} userland_stats_event_t; + +void ntstat_userland_stats_event(nstat_userland_context nstat_ctx, userland_stats_event_t event); + + // locked_add_64 uses atomic operations on 32bit so the 64bit // value can be properly read. The values are only ever incremented // while under the socket lock, so on 64bit we don't actually need diff --git a/bsd/net/packet_mangler.c b/bsd/net/packet_mangler.c index fbdc502e1..5a1776d74 100644 --- a/bsd/net/packet_mangler.c +++ b/bsd/net/packet_mangler.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015 Apple Inc. All rights reserved. + * Copyright (c) 2015-2016 Apple Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -927,16 +927,12 @@ static errno_t pktmnglr_ipfilter_input(void *cookie, mbuf_t *data, int offset, u break; case IPPROTO_UDP: goto input_done; - break; case IPPROTO_ICMP: goto input_done; - break; case IPPROTO_ICMPV6: goto input_done; - break; default: goto input_done; - break; } /* XXX Do IP actions here */ diff --git a/bsd/net/pf.c b/bsd/net/pf.c index 92808d67e..70ea3cf33 100644 --- a/bsd/net/pf.c +++ b/bsd/net/pf.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2015 Apple Inc. All rights reserved. + * Copyright (c) 2007-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -128,7 +128,7 @@ #endif /* DUMMYNET */ /* - * For RandomULong(), to get a 32 bits random value + * For RandomULong(), to get a 32 bits random value * Note that random() returns a 31 bits value, see rdar://11159750 */ #include @@ -260,8 +260,8 @@ static int pf_test_rule(struct pf_rule **, struct pf_state **, void *, struct pf_pdesc *, struct pf_rule **, struct pf_ruleset **, struct ifqueue *); #if DUMMYNET -static int pf_test_dummynet(struct pf_rule **, int, - struct pfi_kif *, struct mbuf **, +static int pf_test_dummynet(struct pf_rule **, int, + struct pfi_kif *, struct mbuf **, struct pf_pdesc *, struct ip_fw_args *); #endif /* DUMMYNET */ static int pf_test_fragment(struct pf_rule **, int, @@ -685,24 +685,7 @@ static const char *pf_pptp_ctrl_type_name(u_int16_t code) #endif static const size_t PF_PPTP_CTRL_MSG_MINSIZE = - sizeof (struct pf_pptp_hdr) + - sizeof (struct pf_pptp_ctrl_hdr) + - MIN(sizeof (struct pf_pptp_ctrl_start_req), - MIN(sizeof (struct pf_pptp_ctrl_start_rpy), - MIN(sizeof (struct pf_pptp_ctrl_stop_req), - MIN(sizeof (struct pf_pptp_ctrl_stop_rpy), - MIN(sizeof (struct pf_pptp_ctrl_echo_req), - MIN(sizeof (struct pf_pptp_ctrl_echo_rpy), - MIN(sizeof (struct pf_pptp_ctrl_call_out_req), - MIN(sizeof (struct pf_pptp_ctrl_call_out_rpy), - MIN(sizeof (struct pf_pptp_ctrl_call_in_1st), - MIN(sizeof (struct 
pf_pptp_ctrl_call_in_2nd), - MIN(sizeof (struct pf_pptp_ctrl_call_in_3rd), - MIN(sizeof (struct pf_pptp_ctrl_call_clr), - MIN(sizeof (struct pf_pptp_ctrl_call_disc), - MIN(sizeof (struct pf_pptp_ctrl_error), - sizeof (struct pf_pptp_ctrl_set_linkinfo) - )))))))))))))); + sizeof (struct pf_pptp_hdr) + sizeof (struct pf_pptp_ctrl_hdr); union pf_pptp_ctrl_msg_union { struct pf_pptp_ctrl_start_req start_req; @@ -3591,7 +3574,7 @@ pf_match_translation(struct pf_pdesc *pd, struct mbuf *m, int off, return (rm); } -/* +/* * Get address translation information for NAT/BINAT/RDR * pd : pf packet descriptor * m : mbuf holding the packet @@ -3852,7 +3835,7 @@ pf_socket_lookup(int direction, struct pf_pdesc *pd) { struct pf_addr *saddr, *daddr; u_int16_t sport, dport; - struct inpcbinfo *pi; + struct inpcbinfo *pi; int inp = 0; if (pd == NULL) @@ -3927,7 +3910,7 @@ pf_socket_lookup(int direction, struct pf_pdesc *pd) #else if (inp == 0) { inp = in_pcblookup_hash_exists(pi, saddr->v4, sport, - daddr->v4, dport, INPLOOKUP_WILDCARD, + daddr->v4, dport, INPLOOKUP_WILDCARD, &pd->lookup.uid, &pd->lookup.gid, NULL); if (inp == 0) return (-1); @@ -3948,7 +3931,7 @@ pf_socket_lookup(int direction, struct pf_pdesc *pd) } break; #endif /* INET6 */ - + default: return (-1); } @@ -4532,7 +4515,7 @@ pf_nat64_ipv6(struct mbuf *m, int off, struct pf_pdesc *pd) int moff, hlen = sizeof(*ip4); if ((mp = m_pulldown(m, hlen, ICMP_MINLEN, &moff)) == NULL) - return (PF_NAT64); + return (PF_DROP); icmp = (struct icmp *)(void *)(mtod(mp, char *) + moff); icmp->icmp_cksum = 0; @@ -4571,13 +4554,24 @@ pf_nat64_ipv4(struct mbuf *m, int off, struct pf_pdesc *pd) int moff, hlen = sizeof(*ip6); if ((mp = m_pulldown(m, hlen, sizeof(*icmp6), &moff)) == NULL) - return (PF_NAT64); + return (PF_DROP); icmp6 = (struct icmp6_hdr *)(void *)(mtod(mp, char *) + moff); icmp6->icmp6_cksum = 0; icmp6->icmp6_cksum = inet6_cksum(m, IPPROTO_ICMPV6, hlen, ntohs(ip6->ip6_plen)); + } else if (pd->proto == IPPROTO_UDP) { + 
struct mbuf *mp; + struct udphdr *uh; + int moff, hlen = sizeof(*ip6); + if ((mp = m_pulldown(m, hlen, sizeof(*uh), &moff)) == NULL) + return (PF_DROP); + uh = (struct udphdr *)(void *)(mtod(mp, char *) + moff); + if (uh->uh_sum == 0) + uh->uh_sum = inet6_cksum(m, IPPROTO_UDP, hlen, + ntohs(ip6->ip6_plen)); } + ip6_input(m); return (PF_NAT64); } @@ -4964,14 +4958,14 @@ pf_test_rule(struct pf_rule **rm, struct pf_state **sm, int direction, (r->flagset & th->th_flags) != r->flags) r = TAILQ_NEXT(r, entries); /* tcp/udp only. uid.op always 0 in other cases */ - else if (r->uid.op && (pd->lookup.done || (pd->lookup.done = - pf_socket_lookup(direction, pd), 1)) && + else if (r->uid.op && (pd->lookup.done || ((void)(pd->lookup.done = + pf_socket_lookup(direction, pd)), 1)) && !pf_match_uid(r->uid.op, r->uid.uid[0], r->uid.uid[1], pd->lookup.uid)) r = TAILQ_NEXT(r, entries); /* tcp/udp only. gid.op always 0 in other cases */ - else if (r->gid.op && (pd->lookup.done || (pd->lookup.done = - pf_socket_lookup(direction, pd), 1)) && + else if (r->gid.op && (pd->lookup.done || ((void)(pd->lookup.done = + pf_socket_lookup(direction, pd)), 1)) && !pf_match_gid(r->gid.op, r->gid.gid[0], r->gid.gid[1], pd->lookup.gid)) r = TAILQ_NEXT(r, entries); @@ -5764,14 +5758,14 @@ pf_test_rule(struct pf_rule **rm, struct pf_state **sm, int direction, #if DUMMYNET /* - * When pf_test_dummynet() returns PF_PASS, the rule matching parameter "rm" + * When pf_test_dummynet() returns PF_PASS, the rule matching parameter "rm" * remains unchanged, meaning the packet did not match a dummynet rule. - * when the packet does match a dummynet rule, pf_test_dummynet() returns - * PF_PASS and zero out the mbuf rule as the packet is effectively siphoned + * when the packet does match a dummynet rule, pf_test_dummynet() returns + * PF_PASS and zero out the mbuf rule as the packet is effectively siphoned * out by dummynet. 
*/ static int -pf_test_dummynet(struct pf_rule **rm, int direction, struct pfi_kif *kif, +pf_test_dummynet(struct pf_rule **rm, int direction, struct pfi_kif *kif, struct mbuf **m0, struct pf_pdesc *pd, struct ip_fw_args *fwa) { struct mbuf *m = *m0; @@ -5797,16 +5791,16 @@ pf_test_dummynet(struct pf_rule **rm, int direction, struct pfi_kif *kif, if (!DUMMYNET_LOADED) return (PF_PASS); - + if (TAILQ_EMPTY(pf_main_ruleset.rules[PF_RULESET_DUMMYNET].active.ptr)) return (PF_PASS); - + bzero(&dnflow, sizeof(dnflow)); hdrlen = 0; /* Fragments don't gave protocol headers */ - if (!(pd->flags & PFDESC_IP_FRAG)) + if (!(pd->flags & PFDESC_IP_FRAG)) switch (pd->proto) { case IPPROTO_TCP: dnflow.fwa_id.flags = pd->hdr.tcp->th_flags; @@ -5862,7 +5856,7 @@ pf_test_dummynet(struct pf_rule **rm, int direction, struct pfi_kif *kif, r->src.neg, kif)) r = r->skip[PF_SKIP_SRC_ADDR].ptr; /* tcp/udp only. port_op always 0 in other cases */ - else if (r->proto == pd->proto && + else if (r->proto == pd->proto && (r->proto == IPPROTO_TCP || r->proto == IPPROTO_UDP) && ((pd->flags & PFDESC_IP_FRAG) || ((r->src.xport.range.op && @@ -5883,12 +5877,12 @@ pf_test_dummynet(struct pf_rule **rm, int direction, struct pfi_kif *kif, th->th_dport))) r = r->skip[PF_SKIP_DST_PORT].ptr; /* icmp only. type always 0 in other cases */ - else if (r->type && + else if (r->type && ((pd->flags & PFDESC_IP_FRAG) || r->type != icmptype + 1)) r = TAILQ_NEXT(r, entries); /* icmp only. 
type always 0 in other cases */ - else if (r->code && + else if (r->code && ((pd->flags & PFDESC_IP_FRAG) || r->code != icmpcode + 1)) r = TAILQ_NEXT(r, entries); @@ -5905,8 +5899,8 @@ pf_test_dummynet(struct pf_rule **rm, int direction, struct pfi_kif *kif, else if (r->match_tag && !pf_match_tag(m, r, pd->pf_mtag, &tag)) r = TAILQ_NEXT(r, entries); else { - /* - * Need to go past the previous dummynet matching rule + /* + * Need to go past the previous dummynet matching rule */ if (r->anchor == NULL) { if (found_prev_rule) { @@ -5949,7 +5943,7 @@ pf_test_dummynet(struct pf_rule **rm, int direction, struct pfi_kif *kif, if (r->action == PF_NODUMMYNET) { int dirndx = (direction == PF_OUT); - + r->packets[dirndx]++; r->bytes[dirndx] += pd->tot_len; @@ -5963,10 +5957,10 @@ pf_test_dummynet(struct pf_rule **rm, int direction, struct pfi_kif *kif, if (r->dnpipe && ip_dn_io_ptr != NULL) { int dirndx = (direction == PF_OUT); - + r->packets[dirndx]++; r->bytes[dirndx] += pd->tot_len; - + dnflow.fwa_cookie = r->dnpipe; dnflow.fwa_pf_rule = r; dnflow.fwa_id.proto = pd->proto; @@ -5988,8 +5982,8 @@ pf_test_dummynet(struct pf_rule **rm, int direction, struct pfi_kif *kif, dnflow.fwa_oif = fwa->fwa_oif; dnflow.fwa_oflags = fwa->fwa_oflags; /* - * Note that fwa_ro, fwa_dst and fwa_ipoa are - * actually in a union so the following does work + * Note that fwa_ro, fwa_dst and fwa_ipoa are + * actually in a union so the following does work * for both IPv4 and IPv6 */ dnflow.fwa_ro = fwa->fwa_ro; @@ -6002,29 +5996,29 @@ pf_test_dummynet(struct pf_rule **rm, int direction, struct pfi_kif *kif, dnflow.fwa_unfragpartlen = fwa->fwa_unfragpartlen; dnflow.fwa_exthdrs = fwa->fwa_exthdrs; } - + if (af == AF_INET) { struct ip *iphdr = mtod(m, struct ip *); NTOHS(iphdr->ip_len); NTOHS(iphdr->ip_off); } /* - * Don't need to unlock pf_lock as NET_THREAD_HELD_PF + * Don't need to unlock pf_lock as NET_THREAD_HELD_PF * allows for recursive behavior */ ip_dn_io_ptr(m, dnflow.fwa_cookie, - af == 
AF_INET ? + af == AF_INET ? direction == PF_IN ? DN_TO_IP_IN : DN_TO_IP_OUT : direction == PF_IN ? DN_TO_IP6_IN : DN_TO_IP6_OUT, &dnflow, DN_CLIENT_PF); - + /* - * The packet is siphoned out by dummynet so return a NULL + * The packet is siphoned out by dummynet so return a NULL * mbuf so the caller can still return success. */ *m0 = NULL; - + return (PF_PASS); } @@ -6135,7 +6129,7 @@ pf_pptp_handler(struct pf_state *s, int direction, int off, struct tcphdr *th; struct pf_pptp_state *pptps; struct pf_pptp_ctrl_msg cm; - size_t plen; + size_t plen, tlen; struct pf_state *gs; u_int16_t ct; u_int16_t *pac_call_id; @@ -6160,7 +6154,7 @@ pf_pptp_handler(struct pf_state *s, int direction, int off, plen = min(sizeof (cm), m->m_pkthdr.len - off); if (plen < PF_PPTP_CTRL_MSG_MINSIZE) return; - + tlen = plen - PF_PPTP_CTRL_MSG_MINSIZE; m_copydata(m, off, plen, &cm); if (ntohl(cm.hdr.magic) != PF_PPTP_MAGIC_NUMBER) @@ -6168,6 +6162,33 @@ pf_pptp_handler(struct pf_state *s, int direction, int off, if (ntohs(cm.hdr.type) != 1) return; +#define TYPE_LEN_CHECK(_type, _name) \ + case PF_PPTP_CTRL_TYPE_##_type: \ + if (tlen < sizeof(struct pf_pptp_ctrl_##_name)) \ + return; \ + break; + + switch (cm.ctrl.type) { + TYPE_LEN_CHECK(START_REQ, start_req); + TYPE_LEN_CHECK(START_RPY, start_rpy); + TYPE_LEN_CHECK(STOP_REQ, stop_req); + TYPE_LEN_CHECK(STOP_RPY, stop_rpy); + TYPE_LEN_CHECK(ECHO_REQ, echo_req); + TYPE_LEN_CHECK(ECHO_RPY, echo_rpy); + TYPE_LEN_CHECK(CALL_OUT_REQ, call_out_req); + TYPE_LEN_CHECK(CALL_OUT_RPY, call_out_rpy); + TYPE_LEN_CHECK(CALL_IN_1ST, call_in_1st); + TYPE_LEN_CHECK(CALL_IN_2ND, call_in_2nd); + TYPE_LEN_CHECK(CALL_IN_3RD, call_in_3rd); + TYPE_LEN_CHECK(CALL_CLR, call_clr); + TYPE_LEN_CHECK(CALL_DISC, call_disc); + TYPE_LEN_CHECK(ERROR, error); + TYPE_LEN_CHECK(SET_LINKINFO, set_linkinfo); + default: + return; + } +#undef TYPE_LEN_CHECK + if (!gs) { gs = pool_get(&pf_state_pl, PR_WAITOK); if (!gs) @@ -7790,7 +7811,6 @@ pf_test_state_icmp(struct pf_state 
**state, int direction, struct pfi_kif *kif, } return (PF_PASS); - break; } case IPPROTO_UDP: { struct udphdr uh; @@ -7990,7 +8010,6 @@ pf_test_state_icmp(struct pf_state **state, int direction, struct pfi_kif *kif, } return (PF_PASS); - break; } #if INET case IPPROTO_ICMP: { @@ -8046,7 +8065,6 @@ pf_test_state_icmp(struct pf_state **state, int direction, struct pfi_kif *kif, } return (PF_PASS); - break; } #endif /* INET */ #if INET6 @@ -8105,7 +8123,6 @@ pf_test_state_icmp(struct pf_state **state, int direction, struct pfi_kif *kif, } return (PF_PASS); - break; } #endif /* INET6 */ default: { @@ -8164,7 +8181,6 @@ pf_test_state_icmp(struct pf_state **state, int direction, struct pfi_kif *kif, } return (PF_PASS); - break; } } } diff --git a/bsd/net/pf_if.c b/bsd/net/pf_if.c index 75b52b626..05f265677 100644 --- a/bsd/net/pf_if.c +++ b/bsd/net/pf_if.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2013 Apple Inc. All rights reserved. + * Copyright (c) 2007-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -284,7 +284,6 @@ pfi_match_addr(struct pfi_dynaddr *dyn, struct pf_addr *a, sa_family_t af) default: return (pfr_match_addr(dyn->pfid_kt, a, AF_INET)); } - break; #endif /* INET */ #if INET6 case AF_INET6: @@ -297,7 +296,6 @@ pfi_match_addr(struct pfi_dynaddr *dyn, struct pf_addr *a, sa_family_t af) default: return (pfr_match_addr(dyn->pfid_kt, a, AF_INET6)); } - break; #endif /* INET6 */ default: return (0); diff --git a/bsd/net/pf_norm.c b/bsd/net/pf_norm.c index 3df62ef3c..547ff705f 100644 --- a/bsd/net/pf_norm.c +++ b/bsd/net/pf_norm.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2013 Apple Inc. All rights reserved. + * Copyright (c) 2007-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -1876,7 +1876,6 @@ pf_normalize_ip6(struct mbuf **m0, int dir, struct pfi_kif *kif, switch (proto) { case IPPROTO_FRAGMENT: goto fragment; - break; case IPPROTO_AH: case IPPROTO_ROUTING: case IPPROTO_DSTOPTS: diff --git a/bsd/net/pf_ruleset.c b/bsd/net/pf_ruleset.c index dc78423c3..f03f3297f 100644 --- a/bsd/net/pf_ruleset.c +++ b/bsd/net/pf_ruleset.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2015 Apple Inc. All rights reserved. + * Copyright (c) 2007-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -152,25 +152,20 @@ pf_get_ruleset_number(u_int8_t action) case PF_SCRUB: case PF_NOSCRUB: return (PF_RULESET_SCRUB); - break; case PF_PASS: case PF_DROP: return (PF_RULESET_FILTER); - break; case PF_NAT: case PF_NONAT: return (PF_RULESET_NAT); - break; case PF_BINAT: case PF_NOBINAT: return (PF_RULESET_BINAT); - break; case PF_RDR: case PF_NORDR: case PF_NAT64: case PF_NONAT64: return (PF_RULESET_RDR); - break; #if DUMMYNET case PF_DUMMYNET: case PF_NODUMMYNET: @@ -178,7 +173,6 @@ pf_get_ruleset_number(u_int8_t action) #endif /* DUMMYNET */ default: return (PF_RULESET_MAX); - break; } } diff --git a/bsd/net/pktap.c b/bsd/net/pktap.c index e02810ac4..2b5c5e0ce 100644 --- a/bsd/net/pktap.c +++ b/bsd/net/pktap.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012-2014 Apple Inc. All rights reserved. + * Copyright (c) 2012-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -231,6 +231,13 @@ pktap_clone_create(struct if_clone *ifc, u_int32_t unit, __unused void *params) pktap->pktp_filters[1].filter_op = PKTAP_FILTER_OP_PASS; pktap->pktp_filters[1].filter_param = PKTAP_FILTER_PARAM_IF_TYPE; pktap->pktp_filters[1].filter_param_if_type = IFT_IEEE1394; + +#if (DEVELOPMENT || DEBUG) + pktap->pktp_filters[2].filter_op = PKTAP_FILTER_OP_PASS; + pktap->pktp_filters[2].filter_param = PKTAP_FILTER_PARAM_IF_TYPE; + pktap->pktp_filters[2].filter_param_if_type = IFT_OTHER; +#endif /* DEVELOPMENT || DEBUG */ + /* * We do not use a set_bpf_tap() function as we rather rely on the more * accurate callback passed to bpf_attach() @@ -818,6 +825,8 @@ pktap_fill_proc_info(struct pktap_header *hdr, protocol_family_t proto, hdr->pth_ipproto = IPPROTO_RAW; else hdr->pth_ipproto = m->m_pkthdr.pkt_proto; + if (m->m_pkthdr.pkt_flags & PKTF_NEW_FLOW) + hdr->pth_flags |= PTH_FLAG_NEW_FLOW; } else if (outgoing == 0) { struct inpcb *inp = NULL; @@ -1083,6 +1092,10 @@ pktap_bpf_tap(struct ifnet *ifp, protocol_family_t proto, struct mbuf *m, hdr_size = sizeof(hdr_buffer); break; } + hdr->pth_dlt = DLT_NULL; + hdr_buffer.proto = proto; + hdr_size = sizeof(hdr_buffer); + break; default: if (pre == 0) hdr->pth_dlt = DLT_RAW; diff --git a/bsd/net/pktap.h b/bsd/net/pktap.h index d3406bce1..ef5ec8c7c 100644 --- a/bsd/net/pktap.h +++ b/bsd/net/pktap.h @@ -136,6 +136,7 @@ struct pktap_header { #define PTH_FLAG_DELAY_PKTAP 0x1000 /* Finalize pktap header on read */ #endif /* BSD_KERNEL_PRIVATE */ #define PTH_FLAG_TSTAMP 0x2000 /* Has time stamp */ +#define PTH_FLAG_NEW_FLOW 0x4000 /* Packet from a new flow */ #ifdef BSD_KERNEL_PRIVATE diff --git a/bsd/net/pktsched/Makefile b/bsd/net/pktsched/Makefile index f091673a3..884775f00 100644 --- a/bsd/net/pktsched/Makefile +++ b/bsd/net/pktsched/Makefile @@ -12,7 +12,8 @@ KERNELFILES= \ PRIVATE_DATAFILES = \ pktsched.h pktsched_cbq.h pktsched_fairq.h pktsched_hfsc.h \ - 
pktsched_priq.h pktsched_tcq.h pktsched_rmclass.h pktsched_qfq.h + pktsched_priq.h pktsched_tcq.h pktsched_rmclass.h pktsched_qfq.h \ + pktsched_fq_codel.h PRIVATE_KERNELFILES = ${KERNELFILES} diff --git a/bsd/net/pktsched/pktsched.c b/bsd/net/pktsched/pktsched.c index 4f7d32a75..451aa1709 100644 --- a/bsd/net/pktsched/pktsched.c +++ b/bsd/net/pktsched/pktsched.c @@ -45,6 +45,7 @@ #include #include #include +#include #if PKTSCHED_PRIQ #include #endif /* PKTSCHED_PRIQ */ @@ -185,7 +186,9 @@ pktsched_setup(struct ifclassq *ifq, u_int32_t scheduler, u_int32_t sflags) case PKTSCHEDT_QFQ: error = qfq_setup_ifclassq(ifq, sflags); break; - + case PKTSCHEDT_FQ_CODEL: + error = fq_if_setup_ifclassq(ifq, sflags); + break; default: error = ENXIO; break; @@ -227,6 +230,9 @@ pktsched_teardown(struct ifclassq *ifq) error = qfq_teardown_ifclassq(ifq); break; + case PKTSCHEDT_FQ_CODEL: + error = fq_if_teardown_ifclassq(ifq); + break; default: error = ENXIO; break; @@ -257,6 +263,9 @@ pktsched_getqstats(struct ifclassq *ifq, u_int32_t qid, error = qfq_getqstats_ifclassq(ifq, qid, ifqs); break; + case PKTSCHEDT_FQ_CODEL: + error = fq_if_getqstats_ifclassq(ifq, qid, ifqs); + break; default: error = ENXIO; break; diff --git a/bsd/net/pktsched/pktsched.h b/bsd/net/pktsched/pktsched.h index a1ecb25db..fb25dfcef 100644 --- a/bsd/net/pktsched/pktsched.h +++ b/bsd/net/pktsched/pktsched.h @@ -42,7 +42,8 @@ extern "C" { #define PKTSCHEDT_FAIRQ 4 /* fairq */ #define PKTSCHEDT_TCQ 5 /* traffic class queue */ #define PKTSCHEDT_QFQ 6 /* quick fair queueing */ -#define PKTSCHEDT_MAX 7 /* should be max sched type + 1 */ +#define PKTSCHEDT_FQ_CODEL 7 /* Flow queues with CoDel */ +#define PKTSCHEDT_MAX 8 /* should be max sched type + 1 */ #ifdef BSD_KERNEL_PRIVATE #include @@ -57,6 +58,7 @@ extern "C" { #define PKTSCHEDF_QALG_ECN 0x10 /* enable ECN */ #define PKTSCHEDF_QALG_FLOWCTL 0x20 /* enable flow control advisories */ #define PKTSCHEDF_QALG_DELAYBASED 0x40 /* Delay based queueing */ +#define 
PKTSCHEDF_QALG_FQ_CODEL 0x80 /* Flow queueing with Codel */ /* macro for timeout/untimeout */ /* use old-style timeout/untimeout */ diff --git a/bsd/net/pktsched/pktsched_fq_codel.c b/bsd/net/pktsched/pktsched_fq_codel.c new file mode 100644 index 000000000..f7eef8378 --- /dev/null +++ b/bsd/net/pktsched/pktsched_fq_codel.c @@ -0,0 +1,1006 @@ +/* + * Copyright (c) 2016 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include +#include +#include + + +static size_t fq_if_size; +static struct zone *fq_if_zone; + +static fq_if_t *fq_if_alloc(struct ifnet *ifp, int how); +static void fq_if_destroy(fq_if_t *fqs); +static void fq_if_classq_init(fq_if_t *fqs, u_int32_t priority, + u_int32_t quantum, u_int32_t drr_max, u_int32_t svc_class); +static int fq_if_enqueue_classq(struct ifclassq *ifq, struct mbuf *m); +static struct mbuf *fq_if_dequeue_classq(struct ifclassq *ifq, cqdq_op_t); +static int fq_if_dequeue_classq_multi(struct ifclassq *, cqdq_op_t, + u_int32_t, u_int32_t, struct mbuf **, struct mbuf **, u_int32_t *, + u_int32_t *); +static void fq_if_dequeue(fq_if_t *, fq_if_classq_t *, u_int32_t, + u_int32_t, struct mbuf **, struct mbuf **, u_int32_t *, u_int32_t *); +static int fq_if_request_classq(struct ifclassq *ifq, cqrq_t op, void *arg); +void fq_if_stat_sc(fq_if_t *fqs, cqrq_stat_sc_t *stat); +static void fq_if_purge(fq_if_t *); +static void fq_if_purge_classq(fq_if_t *, fq_if_classq_t *); +static void fq_if_purge_flow(fq_if_t *, fq_t *, u_int32_t *, u_int32_t *); +static void fq_if_empty_new_flow(fq_t *fq, fq_if_classq_t *fq_cl, + bool add_to_old); +static void fq_if_empty_old_flow(fq_if_t *fqs, fq_if_classq_t *fq_cl, + fq_t *fq, bool remove_hash); +static void fq_if_destroy_flow(fq_if_t *fqs, fq_if_classq_t *fq_cl, + fq_t *fq); + +#define FQ_IF_ZONE_MAX 32 /* Maximum elements in zone */ +#define FQ_IF_ZONE_NAME "pktsched_fq_if" /* zone for fq_if class */ + +#define FQ_IF_FLOW_HASH_ID(_flowid_) \ + (((_flowid_) >> FQ_IF_HASH_TAG_SHIFT) & FQ_IF_HASH_TAG_MASK) + +#define FQ_IF_CLASSQ_IDLE(_fcl_) \ + (STAILQ_EMPTY(&(_fcl_)->fcl_new_flows) && \ + STAILQ_EMPTY(&(_fcl_)->fcl_old_flows)) + +void +fq_codel_scheduler_init(void) +{ + /* Initialize the zone for flow queue structures */ + fq_codel_init(); + + fq_if_size = sizeof (fq_if_t); + fq_if_zone = zinit(fq_if_size, 
(FQ_IF_ZONE_MAX * fq_if_size), 0, + FQ_IF_ZONE_NAME); + if (fq_if_zone == NULL) { + panic("%s: failed allocating from %s", __func__, + (FQ_IF_ZONE_NAME)); + } + zone_change(fq_if_zone, Z_EXPAND, TRUE); + zone_change(fq_if_zone, Z_CALLERACCT, TRUE); + +} + +fq_if_t * +fq_if_alloc(struct ifnet *ifp, int how) +{ + fq_if_t *fqs; + fqs = (how == M_WAITOK) ? zalloc(fq_if_zone) : + zalloc_noblock(fq_if_zone); + if (fqs == NULL) + return (NULL); + + bzero(fqs, fq_if_size); + fqs->fqs_ifq = &ifp->if_snd; + + /* Calculate target queue delay */ + ifclassq_calc_target_qdelay(ifp, &fqs->fqs_target_qdelay); + + /* Calculate update interval */ + ifclassq_calc_update_interval(&fqs->fqs_update_interval); + fqs->fqs_pkt_droplimit = FQ_IF_MAX_PKT_LIMIT; + STAILQ_INIT(&fqs->fqs_fclist); + return (fqs); +} + +void +fq_if_destroy(fq_if_t *fqs) +{ + IFCQ_LOCK_ASSERT_HELD(fqs->fqs_ifq); + fq_if_purge(fqs); + fqs->fqs_ifq = NULL; + zfree(fq_if_zone, fqs); +} + +static inline u_int32_t +fq_if_service_to_priority(mbuf_svc_class_t svc) +{ + u_int32_t pri; + + switch (svc) { + case MBUF_SC_BK_SYS: + pri = FQ_IF_BK_SYS_INDEX; + break; + case MBUF_SC_BK: + pri = FQ_IF_BK_INDEX; + break; + case MBUF_SC_BE: + pri = FQ_IF_BE_INDEX; + break; + case MBUF_SC_RD: + pri = FQ_IF_RD_INDEX; + break; + case MBUF_SC_OAM: + pri = FQ_IF_OAM_INDEX; + break; + case MBUF_SC_AV: + pri = FQ_IF_AV_INDEX; + break; + case MBUF_SC_RV: + pri = FQ_IF_RV_INDEX; + break; + case MBUF_SC_VI: + pri = FQ_IF_VI_INDEX; + break; + case MBUF_SC_VO: + pri = FQ_IF_VO_INDEX; + break; + case MBUF_SC_CTL: + pri = FQ_IF_CTL_INDEX; + break; + default: + pri = FQ_IF_BE_INDEX; /* Use best effort by default */ + break; + } + return (pri); +} + +void +fq_if_classq_init(fq_if_t *fqs, u_int32_t pri, u_int32_t quantum, + u_int32_t drr_max, u_int32_t svc_class) +{ + fq_if_classq_t *fq_cl; + + fq_cl = &fqs->fqs_classq[pri]; + + VERIFY(pri >= 0 && pri < FQ_IF_MAX_CLASSES && + fq_cl->fcl_quantum == 0); + fq_cl->fcl_quantum = quantum; + 
fq_cl->fcl_pri = pri; + fq_cl->fcl_drr_max = drr_max; + fq_cl->fcl_service_class = svc_class; + STAILQ_INIT(&fq_cl->fcl_new_flows); + STAILQ_INIT(&fq_cl->fcl_old_flows); +} + +int +fq_if_enqueue_classq(struct ifclassq *ifq, struct mbuf *m) +{ + u_int32_t pri; + fq_if_t *fqs; + fq_if_classq_t *fq_cl; + int ret, len; + mbuf_svc_class_t svc; + + IFCQ_LOCK_ASSERT_HELD(ifq); + if (!(m->m_flags & M_PKTHDR)) { + IFCQ_CONVERT_LOCK(ifq); + m_freem(m); + return (ENOBUFS); + } + + fqs = (fq_if_t *)ifq->ifcq_disc; + svc = mbuf_get_service_class(m); + pri = fq_if_service_to_priority(svc); + VERIFY(pri >= 0 && pri < FQ_IF_MAX_CLASSES); + fq_cl = &fqs->fqs_classq[pri]; + + if (svc == MBUF_SC_BK_SYS && fqs->fqs_throttle == 1) { + /* BK_SYS is currently throttled */ + fq_cl->fcl_stat.fcl_throttle_drops++; + IFCQ_CONVERT_LOCK(ifq); + m_freem(m); + return (EQSUSPENDED); + } + + len = m_length(m); + ret = fq_addq(fqs, m, fq_cl); + if (!FQ_IF_CLASSQ_IDLE(fq_cl)) { + if (((fqs->fqs_bitmaps[FQ_IF_ER] | fqs->fqs_bitmaps[FQ_IF_EB]) & + (1 << pri)) == 0) { + /* + * this group is not in ER or EB groups, + * mark it as IB + */ + pktsched_bit_set(pri, &fqs->fqs_bitmaps[FQ_IF_IB]); + } + } + + if (ret != 0) { + if (ret == CLASSQEQ_SUCCESS_FC) { + /* packet enqueued, return advisory feedback */ + ret = EQFULL; + } else { + VERIFY(ret == CLASSQEQ_DROPPED || + ret == CLASSQEQ_DROPPED_FC || + ret == CLASSQEQ_DROPPED_SP); + switch (ret) { + case CLASSQEQ_DROPPED: + return (ENOBUFS); + case CLASSQEQ_DROPPED_FC: + return (EQFULL); + case CLASSQEQ_DROPPED_SP: + return (EQSUSPENDED); + } + } + } + IFCQ_INC_LEN(ifq); + IFCQ_INC_BYTES(ifq, len); + return (ret); +} + +struct mbuf * +fq_if_dequeue_classq(struct ifclassq *ifq, cqdq_op_t op) +{ + struct mbuf *top; + + (void) fq_if_dequeue_classq_multi(ifq, op, 1, + CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &top, NULL, NULL, NULL); + + return (top); +} + +int +fq_if_dequeue_classq_multi(struct ifclassq *ifq, cqdq_op_t op, + u_int32_t maxpktcnt, u_int32_t maxbytecnt, 
struct mbuf **first_packet, + struct mbuf **last_packet, u_int32_t *retpktcnt, u_int32_t *retbytecnt) +{ +#pragma unused(op) + struct mbuf *top = NULL, *tail = NULL, *first, *last; + u_int32_t pktcnt = 0, bytecnt = 0, total_pktcnt, total_bytecnt; + fq_if_t *fqs; + fq_if_classq_t *fq_cl; + int pri; + + IFCQ_LOCK_ASSERT_HELD(ifq); + + fqs = (fq_if_t *)ifq->ifcq_disc; + + first = last = NULL; + total_pktcnt = total_bytecnt = 0; + + for (;;) { + if (fqs->fqs_bitmaps[FQ_IF_ER] == 0 && + fqs->fqs_bitmaps[FQ_IF_EB] == 0) { + fqs->fqs_bitmaps[FQ_IF_EB] = fqs->fqs_bitmaps[FQ_IF_IB]; + fqs->fqs_bitmaps[FQ_IF_IB] = 0; + if (fqs->fqs_bitmaps[FQ_IF_EB] == 0) + break; + } + pri = pktsched_ffs(fqs->fqs_bitmaps[FQ_IF_ER]); + if (pri == 0) { + /* + * There are no ER flows, move the highest + * priority one from EB if there are any in that + * category + */ + pri = pktsched_ffs(fqs->fqs_bitmaps[FQ_IF_EB]); + VERIFY(pri > 0); + pktsched_bit_clr((pri - 1), + &fqs->fqs_bitmaps[FQ_IF_EB]); + pktsched_bit_set((pri - 1), + &fqs->fqs_bitmaps[FQ_IF_ER]); + } + pri--; /* index starts at 0 */ + fq_cl = &fqs->fqs_classq[pri]; + + if (fq_cl->fcl_budget <= 0) { + /* Update the budget */ + fq_cl->fcl_budget += (min(fq_cl->fcl_drr_max, + fq_cl->fcl_stat.fcl_flows_cnt) * + fq_cl->fcl_quantum); + if (fq_cl->fcl_budget <= 0) + goto state_change; + } + fq_if_dequeue(fqs, fq_cl, (maxpktcnt - total_pktcnt), + (maxbytecnt - total_bytecnt), &top, &tail, &pktcnt, + &bytecnt); + if (top != NULL) { + VERIFY(pktcnt > 0 && bytecnt > 0); + if (first == NULL) { + first = top; + last = tail; + total_pktcnt = pktcnt; + total_bytecnt = bytecnt; + } else { + last->m_nextpkt = top; + last = tail; + total_pktcnt += pktcnt; + total_bytecnt += bytecnt; + } + last->m_nextpkt = NULL; + fq_cl->fcl_budget -= bytecnt; + pktcnt = 0; + bytecnt = 0; + } + + /* + * If the class has exceeded the budget but still has data + * to send, move it to IB + */ +state_change: + if (!FQ_IF_CLASSQ_IDLE(fq_cl)) { + if (fq_cl->fcl_budget <= 
0) { + pktsched_bit_set(pri, + &fqs->fqs_bitmaps[FQ_IF_IB]); + pktsched_bit_clr(pri, + &fqs->fqs_bitmaps[FQ_IF_ER]); + } + } else { + pktsched_bit_clr(pri, &fqs->fqs_bitmaps[FQ_IF_ER]); + VERIFY(((fqs->fqs_bitmaps[FQ_IF_ER] | + fqs->fqs_bitmaps[FQ_IF_EB] | + fqs->fqs_bitmaps[FQ_IF_IB])&(1 << pri)) == 0); + fq_cl->fcl_budget = 0; + } + if (total_pktcnt >= maxpktcnt || total_bytecnt >= maxbytecnt) + break; + } + if (first != NULL) { + if (first_packet != NULL) + *first_packet = first; + if (last_packet != NULL) + *last_packet = last; + if (retpktcnt != NULL) + *retpktcnt = total_pktcnt; + if (retbytecnt != NULL) + *retbytecnt = total_bytecnt; + IFCQ_XMIT_ADD(ifq, total_pktcnt, total_bytecnt); + } else { + if (first_packet != NULL) + *first_packet = NULL; + if (last_packet != NULL) + *last_packet = NULL; + if (retpktcnt != NULL) + *retpktcnt = 0; + if (retbytecnt != NULL) + *retbytecnt = 0; + } + return (0); +} + +static void +fq_if_purge_flow(fq_if_t *fqs, fq_t *fq, u_int32_t *pktsp, + u_int32_t *bytesp) +{ + fq_if_classq_t *fq_cl; + u_int32_t pkts, bytes; + struct mbuf *m; + + fq_cl = &fqs->fqs_classq[fq->fq_sc_index]; + pkts = bytes = 0; + while ((m = fq_getq_flow(fqs, fq)) != NULL) { + pkts++; + bytes += m_length(m); + m_freem(m); + m = NULL; + } + IFCQ_DROP_ADD(fqs->fqs_ifq, pkts, bytes); + + if (fq->fq_flags & FQF_NEW_FLOW) { + fq_if_empty_new_flow(fq, fq_cl, false); + } else if (fq->fq_flags & FQF_OLD_FLOW) { + fq_if_empty_old_flow(fqs, fq_cl, fq, false); + } + + fq_if_destroy_flow(fqs, fq_cl, fq); + + if (FQ_IF_CLASSQ_IDLE(fq_cl)) { + int i; + for (i = FQ_IF_ER; i < FQ_IF_MAX_STATE; i++) { + pktsched_bit_clr(fq_cl->fcl_pri, + &fqs->fqs_bitmaps[i]); + } + } + if (pktsp != NULL) + *pktsp = pkts; + if (bytesp != NULL) + *bytesp = bytes; +} + +static void +fq_if_purge_classq(fq_if_t *fqs, fq_if_classq_t *fq_cl) +{ + fq_t *fq, *tfq; + /* + * Take each flow from new/old flow list and flush mbufs + * in that flow + */ + STAILQ_FOREACH_SAFE(fq, &fq_cl->fcl_new_flows, 
fq_actlink, tfq) { + fq_if_purge_flow(fqs, fq, NULL, NULL); + } + STAILQ_FOREACH_SAFE(fq, &fq_cl->fcl_old_flows, fq_actlink, tfq) { + fq_if_purge_flow(fqs, fq, NULL, NULL); + } + VERIFY(STAILQ_EMPTY(&fq_cl->fcl_new_flows)); + VERIFY(STAILQ_EMPTY(&fq_cl->fcl_old_flows)); + + STAILQ_INIT(&fq_cl->fcl_new_flows); + STAILQ_INIT(&fq_cl->fcl_old_flows); + fq_cl->fcl_budget = 0; +} + +static void +fq_if_purge(fq_if_t *fqs) +{ + int i; + + IFCQ_CONVERT_LOCK(fqs->fqs_ifq); + for (i = 0; i < FQ_IF_MAX_CLASSES; i++) { + fq_if_purge_classq(fqs, &fqs->fqs_classq[i]); + } + + VERIFY(STAILQ_EMPTY(&fqs->fqs_fclist)); + + fqs->fqs_large_flow = NULL; + for (i = 0; i < FQ_IF_HASH_TABLE_SIZE; i++) { + VERIFY(SLIST_EMPTY(&fqs->fqs_flows[i])); + } + + bzero(&fqs->fqs_bitmaps, sizeof (fqs->fqs_bitmaps)); + + IFCQ_LEN(fqs->fqs_ifq) = 0; + IFCQ_BYTES(fqs->fqs_ifq) = 0; +} + +static void +fq_if_purge_sc(fq_if_t *fqs, cqrq_purge_sc_t *req) +{ + fq_t *fq; + + IFCQ_LOCK_ASSERT_HELD(fqs->fqs_ifq); + req->packets = req->bytes = 0; + VERIFY(req->flow != 0); + + fq = fq_if_hash_pkt(fqs, req->flow, req->sc, 0, FALSE); + + if (fq != NULL) + fq_if_purge_flow(fqs, fq, &req->packets, &req->bytes); +} + +static void +fq_if_event(fq_if_t *fqs, cqev_t ev) +{ + IFCQ_LOCK_ASSERT_HELD(fqs->fqs_ifq); + + switch (ev) { + case CLASSQ_EV_LINK_UP: + case CLASSQ_EV_LINK_DOWN: + fq_if_purge(fqs); + break; + default: + break; + } +} + +static void +fq_if_classq_suspend(fq_if_t *fqs, fq_if_classq_t *fq_cl) +{ + fq_if_purge_classq(fqs, fq_cl); + fqs->fqs_throttle = 1; + fq_cl->fcl_stat.fcl_throttle_on++; +} + +static void +fq_if_classq_resume(fq_if_t *fqs, fq_if_classq_t *fq_cl) +{ + VERIFY(FQ_IF_CLASSQ_IDLE(fq_cl)); + fqs->fqs_throttle = 0; + fq_cl->fcl_stat.fcl_throttle_off++; +} + + +static int +fq_if_throttle(fq_if_t *fqs, cqrq_throttle_t *tr) +{ + struct ifclassq *ifq = fqs->fqs_ifq; + int index; + + IFCQ_LOCK_ASSERT_HELD(ifq); + + if (!tr->set) { + tr->level = fqs->fqs_throttle; + return (0); + } + + if 
(tr->level == fqs->fqs_throttle) + return (EALREADY); + + /* Throttling is allowed on BK_SYS class only */ + index = fq_if_service_to_priority(MBUF_SC_BK_SYS); + switch (tr->level) { + case IFNET_THROTTLE_OFF: + fq_if_classq_resume(fqs, &fqs->fqs_classq[index]); + break; + case IFNET_THROTTLE_OPPORTUNISTIC: + fq_if_classq_suspend(fqs, &fqs->fqs_classq[index]); + break; + default: + break; + } + return (0); +} + +void +fq_if_stat_sc(fq_if_t *fqs, cqrq_stat_sc_t *stat) +{ + u_int32_t pri; + fq_if_classq_t *fq_cl; + + if (stat == NULL) + return; + + pri = fq_if_service_to_priority(stat->sc); + fq_cl = &fqs->fqs_classq[pri]; + stat->packets = fq_cl->fcl_stat.fcl_pkt_cnt; + stat->bytes = fq_cl->fcl_stat.fcl_byte_cnt; +} + +int +fq_if_request_classq(struct ifclassq *ifq, cqrq_t rq, void *arg) +{ + int err = 0; + fq_if_t *fqs = (fq_if_t *)ifq->ifcq_disc; + + IFCQ_LOCK_ASSERT_HELD(ifq); + + /* + * These are usually slow operations, convert the lock ahead of time + */ + IFCQ_CONVERT_LOCK(fqs->fqs_ifq); + switch (rq) { + case CLASSQRQ_PURGE: + fq_if_purge(fqs); + break; + case CLASSQRQ_PURGE_SC: + fq_if_purge_sc(fqs, (cqrq_purge_sc_t *)arg); + break; + case CLASSQRQ_EVENT: + fq_if_event(fqs, (cqev_t)arg); + break; + case CLASSQRQ_THROTTLE: + fq_if_throttle(fqs, (cqrq_throttle_t *)arg); + break; + case CLASSQRQ_STAT_SC: + fq_if_stat_sc(fqs, (cqrq_stat_sc_t *)arg); + break; + } + return (err); +} + +int +fq_if_setup_ifclassq(struct ifclassq *ifq, u_int32_t flags) +{ +#pragma unused(flags) + struct ifnet *ifp = ifq->ifcq_ifp; + fq_if_t *fqs = NULL; + int err = 0; + + IFCQ_LOCK_ASSERT_HELD(ifq); + VERIFY(ifq->ifcq_disc == NULL); + VERIFY(ifq->ifcq_type == PKTSCHEDT_NONE); + + fqs = fq_if_alloc(ifp, M_WAITOK); + if (fqs == NULL) + return (ENOMEM); + + fq_if_classq_init(fqs, FQ_IF_BK_SYS_INDEX, 1500, 2, MBUF_SC_BK_SYS); + fq_if_classq_init(fqs, FQ_IF_BK_INDEX, 1500, 2, MBUF_SC_BK); + fq_if_classq_init(fqs, FQ_IF_BE_INDEX, 1500, 4, MBUF_SC_BE); + fq_if_classq_init(fqs, 
FQ_IF_RD_INDEX, 1500, 4, MBUF_SC_RD); + fq_if_classq_init(fqs, FQ_IF_OAM_INDEX, 1500, 4, MBUF_SC_OAM); + fq_if_classq_init(fqs, FQ_IF_AV_INDEX, 3000, 6, MBUF_SC_AV); + fq_if_classq_init(fqs, FQ_IF_RV_INDEX, 3000, 6, MBUF_SC_RV); + fq_if_classq_init(fqs, FQ_IF_VI_INDEX, 3000, 6, MBUF_SC_VI); + fq_if_classq_init(fqs, FQ_IF_VO_INDEX, 600, 8, MBUF_SC_VO); + fq_if_classq_init(fqs, FQ_IF_CTL_INDEX, 600, 8, MBUF_SC_CTL); + + err = ifclassq_attach(ifq, PKTSCHEDT_FQ_CODEL, fqs, + fq_if_enqueue_classq, fq_if_dequeue_classq, NULL, + fq_if_dequeue_classq_multi, fq_if_request_classq); + + if (err != 0) { + printf("%s: error from ifclassq_attach, " + "failed to attach fq_if: %d\n", __func__, err); + fq_if_destroy(fqs); + } + return (err); +} + +fq_t * +fq_if_hash_pkt(fq_if_t *fqs, u_int32_t flowid, mbuf_svc_class_t svc_class, + u_int64_t now, boolean_t create) +{ + fq_t *fq = NULL; + flowq_list_t *fq_list; + fq_if_classq_t *fq_cl; + u_int8_t fqs_hash_id; + u_int8_t scidx; + + scidx = fq_if_service_to_priority(svc_class); + + fqs_hash_id = FQ_IF_FLOW_HASH_ID(flowid); + + fq_list = &fqs->fqs_flows[fqs_hash_id]; + + SLIST_FOREACH(fq, fq_list, fq_hashlink) { + if (fq->fq_flowhash == flowid && + fq->fq_sc_index == scidx) + break; + } + if (fq == NULL && create == TRUE) { + /* If the flow is not already on the list, allocate it */ + IFCQ_CONVERT_LOCK(fqs->fqs_ifq); + fq = fq_alloc(M_WAITOK); + if (fq != NULL) { + fq->fq_flowhash = flowid; + fq->fq_sc_index = scidx; + fq->fq_updatetime = now + fqs->fqs_update_interval; + fq_cl = &fqs->fqs_classq[scidx]; + + fq->fq_flags = FQF_FLOWCTL_CAPABLE; + SLIST_INSERT_HEAD(fq_list, fq, fq_hashlink); + fq_cl->fcl_stat.fcl_flows_cnt++; + } + } + + /* + * If getq time is not set because this is the first packet or after + * idle time, set it now so that we can detect a stall. fq is NULL on a lookup miss without create or when fq_alloc() fails, so check it before dereferencing; callers receive NULL in that case. 
+ */ + if (fq != NULL && fq->fq_getqtime == 0) + fq->fq_getqtime = now; + + return (fq); +} + +static void +fq_if_destroy_flow(fq_if_t *fqs, fq_if_classq_t *fq_cl, fq_t *fq) +{ + u_int8_t hash_id; + hash_id = FQ_IF_FLOW_HASH_ID(fq->fq_flowhash); + SLIST_REMOVE(&fqs->fqs_flows[hash_id], fq, flowq, + fq_hashlink); + fq_cl->fcl_stat.fcl_flows_cnt--; + IFCQ_CONVERT_LOCK(fqs->fqs_ifq); + fq_destroy(fq); + +} + +inline boolean_t +fq_if_at_drop_limit(fq_if_t *fqs) +{ + return (((IFCQ_LEN(fqs->fqs_ifq) >= fqs->fqs_pkt_droplimit) ? + TRUE : FALSE)); +} + +static void +fq_if_empty_old_flow(fq_if_t *fqs, fq_if_classq_t *fq_cl, fq_t *fq, + bool remove_hash) +{ + /* + * Remove the flow queue if it is empty + * and delete it + */ + STAILQ_REMOVE(&fq_cl->fcl_old_flows, fq, flowq, + fq_actlink); + fq->fq_flags &= ~FQF_OLD_FLOW; + fq_cl->fcl_stat.fcl_oldflows_cnt--; + VERIFY(fq->fq_bytes == 0); + + if (remove_hash) { + /* Remove from the hash list */ + fq_if_destroy_flow(fqs, fq_cl, fq); + } +} + +static void +fq_if_empty_new_flow(fq_t *fq, fq_if_classq_t *fq_cl, bool add_to_old) +{ + /* Move to the end of old queue list */ + STAILQ_REMOVE(&fq_cl->fcl_new_flows, fq, + flowq, fq_actlink); + fq->fq_flags &= ~FQF_NEW_FLOW; + fq_cl->fcl_stat.fcl_newflows_cnt--; + + if (add_to_old) { + STAILQ_INSERT_TAIL(&fq_cl->fcl_old_flows, fq, + fq_actlink); + fq->fq_flags |= FQF_OLD_FLOW; + fq_cl->fcl_stat.fcl_oldflows_cnt++; + } +} + +inline void +fq_if_drop_packet(fq_if_t *fqs) +{ + fq_t *fq = fqs->fqs_large_flow; + struct mbuf *m; + fq_if_classq_t *fq_cl; + + if (fq == NULL) + return; + /* mbufq can not be empty on the largest flow */ + VERIFY(!MBUFQ_EMPTY(&fq->fq_mbufq)); + + fq_cl = &fqs->fqs_classq[fq->fq_sc_index]; + + m = fq_getq_flow(fqs, fq); + + IFCQ_CONVERT_LOCK(fqs->fqs_ifq); + if (MBUFQ_EMPTY(&fq->fq_mbufq)) { + if (fq->fq_flags & FQF_OLD_FLOW) { + fq_if_empty_old_flow(fqs, fq_cl, fq, true); + } else { + VERIFY(fq->fq_flags & FQF_NEW_FLOW); + fq_if_empty_new_flow(fq, fq_cl, true); + } + } + 
IFCQ_DROP_ADD(fqs->fqs_ifq, 1, m_length(m)); + + m_freem(m); + fq_cl->fcl_stat.fcl_drop_overflow++; +} + +inline void +fq_if_is_flow_heavy(fq_if_t *fqs, fq_t *fq) +{ + fq_t *prev_fq = fqs->fqs_large_flow; /* may be NULL, e.g. after fq_if_purge() resets it */ + if (prev_fq == NULL && !MBUFQ_EMPTY(&fq->fq_mbufq)) { + fqs->fqs_large_flow = fq; + return; + } else if (prev_fq != NULL && fq->fq_bytes > prev_fq->fq_bytes) { /* prev_fq is NULL here when fq itself is empty; nothing to compare against */ + fqs->fqs_large_flow = fq; + } +} + +boolean_t +fq_if_add_fcentry(fq_if_t *fqs, struct pkthdr *pkt, fq_if_classq_t *fq_cl) +{ + struct flowadv_fcentry *fce; + u_int32_t flowsrc, flowid; + + flowsrc = pkt->pkt_flowsrc; + flowid = pkt->pkt_flowid; + + STAILQ_FOREACH(fce, &fqs->fqs_fclist, fce_link) { + if (fce->fce_flowsrc == flowsrc && + fce->fce_flowid == flowid) { + /* Already on flowcontrol list */ + return (TRUE); + } + } + + IFCQ_CONVERT_LOCK(fqs->fqs_ifq); + fce = flowadv_alloc_entry(M_WAITOK); + if (fce != NULL) { + fce->fce_flowsrc = flowsrc; + fce->fce_flowid = flowid; + /* XXX Add number of bytes in the queue */ + STAILQ_INSERT_TAIL(&fqs->fqs_fclist, fce, fce_link); + fq_cl->fcl_stat.fcl_flow_control++; + } + return ((fce != NULL) ? 
TRUE : FALSE); +} + +void +fq_if_flow_feedback(fq_if_t *fqs, fq_t *fq, fq_if_classq_t *fq_cl) +{ + struct flowadv_fcentry *fce = NULL; + + IFCQ_CONVERT_LOCK(fqs->fqs_ifq); + STAILQ_FOREACH(fce, &fqs->fqs_fclist, fce_link) { + if (fce->fce_flowid == fq->fq_flowhash) + break; + } + if (fce != NULL) { + STAILQ_REMOVE(&fqs->fqs_fclist, fce, flowadv_fcentry, + fce_link); + STAILQ_NEXT(fce, fce_link) = NULL; + flowadv_add_entry(fce); + fq_cl->fcl_stat.fcl_flow_feedback++; + } + fq->fq_flags &= ~FQF_FLOWCTL_ON; +} + +void +fq_if_dequeue(fq_if_t *fqs, fq_if_classq_t *fq_cl, u_int32_t pktlimit, + u_int32_t bytelimit, struct mbuf **top, struct mbuf **tail, + u_int32_t *retpktcnt, u_int32_t *retbytecnt) +{ + fq_t *fq = NULL, *tfq = NULL; + struct mbuf *m = NULL, *last = NULL; + flowq_stailq_t temp_stailq; + u_int32_t pktcnt, bytecnt, mlen; + boolean_t limit_reached = FALSE; + + /* + * maximum byte limit should not be greater than the budget for + * this class + */ + if ((int32_t)bytelimit > fq_cl->fcl_budget) + bytelimit = fq_cl->fcl_budget; + + VERIFY(pktlimit > 0 && bytelimit > 0 && top != NULL); + + *top = NULL; + pktcnt = bytecnt = 0; + STAILQ_INIT(&temp_stailq); + + STAILQ_FOREACH_SAFE(fq, &fq_cl->fcl_new_flows, fq_actlink, tfq) { + VERIFY((fq->fq_flags & (FQF_NEW_FLOW|FQF_OLD_FLOW)) == + FQF_NEW_FLOW); + while (fq->fq_deficit > 0 && limit_reached == FALSE && + !MBUFQ_EMPTY(&fq->fq_mbufq)) { + + m = fq_getq_flow(fqs, fq); + m->m_pkthdr.pkt_flags |= PKTF_NEW_FLOW; + mlen = m_length(m); + fq->fq_deficit -= mlen; + + if (*top == NULL) { + *top = m; + } else { + last->m_nextpkt = m; + } + last = m; + last->m_nextpkt = NULL; + fq_cl->fcl_stat.fcl_dequeue++; + fq_cl->fcl_stat.fcl_dequeue_bytes += mlen; + + pktcnt++; + bytecnt += mlen; + + /* Check if the limit is reached */ + if (pktcnt >= pktlimit || bytecnt >= bytelimit) + limit_reached = TRUE; + } + + if (fq->fq_deficit <= 0 || MBUFQ_EMPTY(&fq->fq_mbufq)) { + fq_if_empty_new_flow(fq, fq_cl, true); + fq->fq_deficit += 
fq_cl->fcl_quantum; + } + if (limit_reached == TRUE) + goto done; + } + + STAILQ_FOREACH_SAFE(fq, &fq_cl->fcl_old_flows, fq_actlink, tfq) { + VERIFY((fq->fq_flags & (FQF_NEW_FLOW|FQF_OLD_FLOW)) == + FQF_OLD_FLOW); + while (fq->fq_deficit > 0 && !MBUFQ_EMPTY(&fq->fq_mbufq) && + limit_reached == FALSE) { + m = fq_getq_flow(fqs, fq); + mlen = m_length(m); + fq->fq_deficit -= mlen; + if (*top == NULL) { + *top = m; + } else { + last->m_nextpkt = m; + } + last = m; + last->m_nextpkt = NULL; + fq_cl->fcl_stat.fcl_dequeue++; + fq_cl->fcl_stat.fcl_dequeue_bytes += mlen; + + pktcnt++; + bytecnt += mlen; + + /* Check if the limit is reached */ + if (pktcnt >= pktlimit || bytecnt >= bytelimit) + limit_reached = TRUE; + } + + if (MBUFQ_EMPTY(&fq->fq_mbufq)) { + fq_if_empty_old_flow(fqs, fq_cl, fq, true); + } else if (fq->fq_deficit <= 0) { + STAILQ_REMOVE(&fq_cl->fcl_old_flows, fq, + flowq, fq_actlink); + /* + * Move to the end of the old queues list. We do not + * need to update the flow count since this flow + * will be added to the tail again + */ + STAILQ_INSERT_TAIL(&temp_stailq, fq, fq_actlink); + fq->fq_deficit += fq_cl->fcl_quantum; + } + + if (limit_reached == TRUE) + break; + } + +done: + if (!STAILQ_EMPTY(&fq_cl->fcl_old_flows)) { + STAILQ_CONCAT(&fq_cl->fcl_old_flows, &temp_stailq); + } else if (!STAILQ_EMPTY(&temp_stailq)) { + fq_cl->fcl_old_flows = temp_stailq; + } + + if (last != NULL) { + VERIFY(*top != NULL); + if (tail != NULL) + *tail = last; + if (retpktcnt != NULL) + *retpktcnt = pktcnt; + if (retbytecnt != NULL) + *retbytecnt = bytecnt; + } +} + +int +fq_if_teardown_ifclassq(struct ifclassq *ifq) +{ + fq_if_t *fqs = (fq_if_t *)ifq->ifcq_disc; + + IFCQ_LOCK_ASSERT_HELD(ifq); + VERIFY(fqs != NULL && ifq->ifcq_type == PKTSCHEDT_FQ_CODEL); + + fq_if_destroy(fqs); + ifq->ifcq_disc = NULL; + + return (ifclassq_detach(ifq)); +} + +int +fq_if_getqstats_ifclassq(struct ifclassq *ifq, u_int32_t qid, + struct if_ifclassq_stats *ifqs) +{ + struct fq_codel_classstats 
*fcls; + fq_if_classq_t *fq_cl; + fq_if_t *fqs; + + if (qid >= FQ_IF_MAX_CLASSES) + return (EINVAL); + + fqs = (fq_if_t *)ifq->ifcq_disc; + fcls = &ifqs->ifqs_fq_codel_stats; + + fq_cl = &fqs->fqs_classq[qid]; + + fcls->fcls_pri = fq_cl->fcl_pri; + fcls->fcls_service_class = fq_cl->fcl_service_class; + fcls->fcls_quantum = fq_cl->fcl_quantum; + fcls->fcls_drr_max = fq_cl->fcl_drr_max; + fcls->fcls_budget = fq_cl->fcl_budget; + fcls->fcls_target_qdelay = fqs->fqs_target_qdelay; + fcls->fcls_update_interval = fqs->fqs_update_interval; + fcls->fcls_flow_control = fq_cl->fcl_stat.fcl_flow_control; + fcls->fcls_flow_feedback = fq_cl->fcl_stat.fcl_flow_feedback; + fcls->fcls_dequeue_stall = fq_cl->fcl_stat.fcl_dequeue_stall; + fcls->fcls_drop_overflow = fq_cl->fcl_stat.fcl_drop_overflow; + fcls->fcls_drop_early = fq_cl->fcl_stat.fcl_drop_early; + fcls->fcls_drop_memfailure = fq_cl->fcl_stat.fcl_drop_memfailure; + fcls->fcls_flows_cnt = fq_cl->fcl_stat.fcl_flows_cnt; + fcls->fcls_newflows_cnt = fq_cl->fcl_stat.fcl_newflows_cnt; + fcls->fcls_oldflows_cnt = fq_cl->fcl_stat.fcl_oldflows_cnt; + fcls->fcls_pkt_cnt = fq_cl->fcl_stat.fcl_pkt_cnt; + fcls->fcls_flow_control_fail = fq_cl->fcl_stat.fcl_flow_control_fail; + fcls->fcls_flow_control_fail = fq_cl->fcl_stat.fcl_flow_control_fail; + fcls->fcls_dequeue = fq_cl->fcl_stat.fcl_dequeue; + fcls->fcls_dequeue_bytes = fq_cl->fcl_stat.fcl_dequeue_bytes; + fcls->fcls_byte_cnt = fq_cl->fcl_stat.fcl_byte_cnt; + fcls->fcls_throttle_on = fq_cl->fcl_stat.fcl_throttle_on; + fcls->fcls_throttle_off = fq_cl->fcl_stat.fcl_throttle_off; + fcls->fcls_throttle_drops = fq_cl->fcl_stat.fcl_throttle_drops; + fcls->fcls_dup_rexmts = fq_cl->fcl_stat.fcl_dup_rexmts; + + return (0); +} diff --git a/bsd/net/pktsched/pktsched_fq_codel.h b/bsd/net/pktsched/pktsched_fq_codel.h new file mode 100644 index 000000000..8d760a409 --- /dev/null +++ b/bsd/net/pktsched/pktsched_fq_codel.h @@ -0,0 +1,192 @@ +/* + * Copyright (c) 2016 Apple Inc. 
All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _NET_PKTSCHED_FQ_CODEL_H_ +#define _NET_PKTSCHED_FQ_CODEL_H_ + +#ifdef PRIVATE +#include +#include + +#ifdef BSD_KERNEL_PRIVATE +#include +#endif /* BSD_KERNEL_PRIVATE */ + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef BSD_KERNEL_PRIVATE +struct fcl_stat { + u_int32_t fcl_flow_control; + u_int32_t fcl_flow_feedback; + u_int32_t fcl_dequeue_stall; + u_int32_t fcl_flow_control_fail; + u_int64_t fcl_drop_overflow; + u_int64_t fcl_drop_early; + u_int32_t fcl_drop_memfailure; + u_int32_t fcl_flows_cnt; + u_int32_t fcl_newflows_cnt; + u_int32_t fcl_oldflows_cnt; + u_int64_t fcl_pkt_cnt; + u_int64_t fcl_dequeue; + u_int64_t fcl_dequeue_bytes; + u_int64_t fcl_byte_cnt; + u_int32_t fcl_throttle_on; + u_int32_t fcl_throttle_off; + u_int32_t fcl_throttle_drops; + u_int32_t fcl_dup_rexmts; +}; + +/* + * Use the top most 8 bits of flow id as the tag for set associative + * hashing + */ + +#define FQ_IF_HASH_TAG_SIZE 8 +#define FQ_IF_HASH_TAG_SHIFT 24 +#define FQ_IF_HASH_TAG_MASK 0xFF +#define FQ_IF_HASH_TABLE_SIZE (1 << FQ_IF_HASH_TAG_SIZE) + +/* maximum number of packets stored across all queues */ +#define FQ_IF_MAX_PKT_LIMIT 2048 + +/* Set the quantum to be one MTU */ +#define FQ_IF_DEFAULT_QUANTUM 1500 + +/* Max number of service classes currently supported */ +#define FQ_IF_MAX_CLASSES 10 + +struct flowq; +typedef u_int32_t pktsched_bitmap_t; +struct if_ifclassq_stats; + +enum fq_if_state { + FQ_IF_ER = 0, /* eligible, ready */ + FQ_IF_IR = 1, /* ineligible, ready */ + FQ_IF_EB = 2, /* eligible, blocked */ + FQ_IF_IB = 3, /* ineligible, blocked */ + FQ_IF_MAX_STATE +}; + +/* + * This priority index is used for QFQ state bitmaps, lower index gets + * higher priority + */ +#define FQ_IF_BK_SYS_INDEX 9 +#define FQ_IF_BK_INDEX 8 +#define FQ_IF_BE_INDEX 7 +#define FQ_IF_RD_INDEX 6 +#define FQ_IF_OAM_INDEX 5 +#define FQ_IF_AV_INDEX 4 +#define FQ_IF_RV_INDEX 3 +#define FQ_IF_VI_INDEX 2 +#define FQ_IF_VO_INDEX 1
+#define FQ_IF_CTL_INDEX 0 + +typedef SLIST_HEAD(, flowq) flowq_list_t; +typedef STAILQ_HEAD(, flowq) flowq_stailq_t; +typedef struct fq_if_classq { + u_int32_t fcl_pri; /* class priority, lower the better */ + u_int32_t fcl_service_class; /* service class */ + u_int32_t fcl_quantum; /* quantum in bytes */ + u_int32_t fcl_drr_max; /* max flows per class for DRR */ + int64_t fcl_budget; /* budget for this classq */ + flowq_stailq_t fcl_new_flows; /* List of new flows */ + flowq_stailq_t fcl_old_flows; /* List of old flows */ + struct fcl_stat fcl_stat; +} fq_if_classq_t; + +typedef struct fq_codel_sched_data { + struct ifclassq *fqs_ifq; /* back pointer to ifclassq */ + u_int64_t fqs_target_qdelay; /* Target queue delay (ns) */ + u_int64_t fqs_update_interval; /* update interval (ns) */ + flowq_list_t fqs_flows[FQ_IF_HASH_TABLE_SIZE]; /* flows table */ + pktsched_bitmap_t fqs_bitmaps[FQ_IF_MAX_STATE]; + u_int32_t fqs_pkt_droplimit; /* drop limit */ + u_int8_t fqs_throttle; /* throttle on or off */ + fq_if_classq_t fqs_classq[FQ_IF_MAX_CLASSES]; /* class queues */ + struct flowadv_fclist fqs_fclist; /* flow control state */ + struct flowq *fqs_large_flow; /* flow has highest number of bytes */ +} fq_if_t; + +#endif /* BSD_KERNEL_PRIVATE */ + +struct fq_codel_classstats { + u_int32_t fcls_pri; + u_int32_t fcls_service_class; + u_int32_t fcls_quantum; + u_int32_t fcls_drr_max; + int64_t fcls_budget; + u_int64_t fcls_target_qdelay; + u_int64_t fcls_update_interval; + u_int32_t fcls_flow_control; + u_int32_t fcls_flow_feedback; + u_int32_t fcls_dequeue_stall; + u_int32_t fcls_flow_control_fail; + u_int64_t fcls_drop_overflow; + u_int64_t fcls_drop_early; + u_int32_t fcls_drop_memfailure; + u_int32_t fcls_flows_cnt; + u_int32_t fcls_newflows_cnt; + u_int32_t fcls_oldflows_cnt; + u_int64_t fcls_pkt_cnt; + u_int64_t fcls_dequeue; + u_int64_t fcls_dequeue_bytes; + u_int64_t fcls_byte_cnt; + u_int32_t fcls_throttle_on; + u_int32_t fcls_throttle_off; + u_int32_t 
fcls_throttle_drops; + u_int32_t fcls_dup_rexmts; +}; + +#ifdef BSD_KERNEL_PRIVATE + +extern void fq_codel_scheduler_init(void); +extern struct flowq *fq_if_hash_pkt(fq_if_t *, u_int32_t, mbuf_svc_class_t, + u_int64_t, boolean_t); +extern boolean_t fq_if_at_drop_limit(fq_if_t *); +extern void fq_if_drop_packet(fq_if_t *); +extern void fq_if_is_flow_heavy(fq_if_t *, struct flowq *); +extern boolean_t fq_if_add_fcentry(fq_if_t *, struct pkthdr *, + fq_if_classq_t *); +extern void fq_if_flow_feedback(fq_if_t *, struct flowq *, fq_if_classq_t *); +extern int fq_if_setup_ifclassq(struct ifclassq *ifq, u_int32_t flags); +extern int fq_if_teardown_ifclassq(struct ifclassq *ifq); +extern int fq_if_getqstats_ifclassq(struct ifclassq *ifq, u_int32_t qid, + struct if_ifclassq_stats *ifqs); + + +#endif /* BSD_KERNEL_PRIVATE */ + +#ifdef __cplusplus +} +#endif + +#endif /* PRIVATE */ +#endif /* _NET_PKTSCHED_FQ_CODEL_H_ */ diff --git a/bsd/net/pktsched/pktsched_priq.c b/bsd/net/pktsched/pktsched_priq.c index 78da2f1b1..ffbf5cf28 100644 --- a/bsd/net/pktsched/pktsched_priq.c +++ b/bsd/net/pktsched/pktsched_priq.c @@ -1103,7 +1103,7 @@ priq_setup_ifclassq(struct ifclassq *ifq, u_int32_t flags) err = ifclassq_attach(ifq, PKTSCHEDT_PRIQ, pif, priq_enqueue_ifclassq, priq_dequeue_ifclassq, NULL, - priq_request_ifclassq); + NULL, priq_request_ifclassq); /* cache these for faster lookup */ if (err == 0) { diff --git a/bsd/net/pktsched/pktsched_qfq.c b/bsd/net/pktsched/pktsched_qfq.c index bc7cc2215..b1a88d435 100644 --- a/bsd/net/pktsched/pktsched_qfq.c +++ b/bsd/net/pktsched/pktsched_qfq.c @@ -1872,7 +1872,7 @@ qfq_setup_ifclassq(struct ifclassq *ifq, u_int32_t flags) err = ifclassq_attach(ifq, PKTSCHEDT_QFQ, qif, qfq_enqueue_ifclassq, qfq_dequeue_ifclassq, NULL, - qfq_request_ifclassq); + NULL, qfq_request_ifclassq); /* cache these for faster lookup */ if (err == 0) { diff --git a/bsd/net/pktsched/pktsched_tcq.c b/bsd/net/pktsched/pktsched_tcq.c index 5a57824e6..d3e64b5e1
100644 --- a/bsd/net/pktsched/pktsched_tcq.c +++ b/bsd/net/pktsched/pktsched_tcq.c @@ -1046,7 +1046,7 @@ tcq_setup_ifclassq(struct ifclassq *ifq, u_int32_t flags) err = ifclassq_attach(ifq, PKTSCHEDT_TCQ, tif, tcq_enqueue_ifclassq, NULL, tcq_dequeue_tc_ifclassq, - tcq_request_ifclassq); + NULL, tcq_request_ifclassq); /* cache these for faster lookup */ if (err == 0) { diff --git a/bsd/net/route.c b/bsd/net/route.c index d13a1994f..4e5dd5af7 100644 --- a/bsd/net/route.c +++ b/bsd/net/route.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2015 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -68,6 +68,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -378,11 +381,12 @@ static unsigned int primary6_ifscope = IFSCOPE_NONE; #define RN(r) ((struct radix_node *)r) #define RT_HOST(r) (RT(r)->rt_flags & RTF_HOST) +unsigned int rt_verbose = 0; +#if (DEVELOPMENT || DEBUG) SYSCTL_DECL(_net_route); - -unsigned int rt_verbose; /* verbosity level (0 to disable) */ SYSCTL_UINT(_net_route, OID_AUTO, verbose, CTLFLAG_RW | CTLFLAG_LOCKED, &rt_verbose, 0, ""); +#endif /* (DEVELOPMENT || DEBUG) */ static void rtable_init(void **table) @@ -905,11 +909,7 @@ rtalloc1_scoped_locked(struct sockaddr *dst, int report, uint32_t ignflags, return (rtalloc1_common_locked(dst, report, ignflags, ifscope)); } -/* - * Look up the route that matches the address given - * Or, at least try.. Create a cloned route if needed. 
- */ -static struct rtentry * +struct rtentry * rtalloc1_common_locked(struct sockaddr *dst, int report, uint32_t ignflags, unsigned int ifscope) { @@ -1007,6 +1007,7 @@ rtalloc1_common_locked(struct sockaddr *dst, int report, uint32_t ignflags, * Which basically means "cant get there from here" */ rtstat.rts_unreach++; + miss: if (report) { /* @@ -1358,10 +1359,9 @@ rtredirect(struct ifnet *ifp, struct sockaddr *dst, struct sockaddr *gateway, * comparison against rt_gateway below. */ #if INET6 - if ((af == AF_INET && ip_doscopedroute) || - (af == AF_INET6 && ip6_doscopedroute)) + if ((af == AF_INET) || (af == AF_INET6)) #else - if (af == AF_INET && ip_doscopedroute) + if (af == AF_INET) #endif /* !INET6 */ src = sa_copy(src, &ss, &ifscope); @@ -1551,19 +1551,19 @@ ifa_ifwithroute_common_locked(int flags, const struct sockaddr *dst, */ #if INET6 if (dst != NULL && - ((dst->sa_family == AF_INET && ip_doscopedroute) || - (dst->sa_family == AF_INET6 && ip6_doscopedroute))) + ((dst->sa_family == AF_INET) || + (dst->sa_family == AF_INET6))) #else - if (dst != NULL && dst->sa_family == AF_INET && ip_doscopedroute) + if (dst != NULL && dst->sa_family == AF_INET) #endif /* !INET6 */ dst = sa_copy(SA((uintptr_t)dst), &dst_ss, NULL); #if INET6 if (gw != NULL && - ((gw->sa_family == AF_INET && ip_doscopedroute) || - (gw->sa_family == AF_INET6 && ip6_doscopedroute))) + ((gw->sa_family == AF_INET) || + (gw->sa_family == AF_INET6))) #else - if (gw != NULL && gw->sa_family == AF_INET && ip_doscopedroute) + if (gw != NULL && gw->sa_family == AF_INET) #endif /* !INET6 */ gw = sa_copy(SA((uintptr_t)gw), &gw_ss, NULL); @@ -1749,11 +1749,9 @@ rtrequest_common_locked(int req, struct sockaddr *dst0, * routing socket request. 
*/ #if INET6 - if (req != RTM_RESOLVE && - ((af == AF_INET && ip_doscopedroute) || - (af == AF_INET6 && ip6_doscopedroute))) { + if (req != RTM_RESOLVE && ((af == AF_INET) || (af == AF_INET6))) { #else - if (req != RTM_RESOLVE && af == AF_INET && ip_doscopedroute) { + if (req != RTM_RESOLVE && af == AF_INET) { #endif /* !INET6 */ /* Transform dst into the internal routing table form */ dst = sa_copy(dst, &ss, &ifscope); @@ -1764,17 +1762,9 @@ rtrequest_common_locked(int req, struct sockaddr *dst0, if (ifscope != IFSCOPE_NONE) flags |= RTF_IFSCOPE; - } else { - if ((flags & RTF_IFSCOPE) && (af != AF_INET && af != AF_INET6)) - senderr(EINVAL); - -#if INET6 - if ((af == AF_INET && !ip_doscopedroute) || - (af == AF_INET6 && !ip6_doscopedroute)) -#else - if (af == AF_INET && !ip_doscopedroute) -#endif /* !INET6 */ - ifscope = IFSCOPE_NONE; + } else if ((flags & RTF_IFSCOPE) && + (af != AF_INET && af != AF_INET6)) { + senderr(EINVAL); } if (ifscope == IFSCOPE_NONE) @@ -1912,7 +1902,7 @@ rtrequest_common_locked(int req, struct sockaddr *dst0, senderr(EINVAL); /* * According to the UNIX conformance tests, we need to return - * ENETUNREACH when the parent route is RTF_REJECT. + * ENETUNREACH when the parent route is RTF_REJECT. * However, there isn't any point in cloning RTF_REJECT * routes, so we immediately return an error. 
*/ @@ -1943,11 +1933,9 @@ rtrequest_common_locked(int req, struct sockaddr *dst0, flags |= RTF_HOST; #if INET6 - if ((af != AF_INET && af != AF_INET6) || - (af == AF_INET && !ip_doscopedroute) || - (af == AF_INET6 && !ip6_doscopedroute)) + if (af != AF_INET && af != AF_INET6) #else - if (af != AF_INET || !ip_doscopedroute) + if (af != AF_INET) #endif /* !INET6 */ goto makeroute; @@ -2822,13 +2810,15 @@ static struct rtentry * rt_lookup_common(boolean_t lookup_only, boolean_t coarse, struct sockaddr *dst, struct sockaddr *netmask, struct radix_node_head *rnh, unsigned int ifscope) { - struct radix_node *rn0, *rn; - boolean_t dontcare; + struct radix_node *rn0, *rn = NULL; int af = dst->sa_family; - struct sockaddr_storage dst_ss, mask_ss; - char s_dst[MAX_IPv6_STR_LEN], s_netmask[MAX_IPv6_STR_LEN]; + struct sockaddr_storage dst_ss; + struct sockaddr_storage mask_ss; + boolean_t dontcare; +#if (DEVELOPMENT || DEBUG) char dbuf[MAX_SCOPE_ADDR_STR_LEN], gbuf[MAX_IPv6_STR_LEN]; - + char s_dst[MAX_IPv6_STR_LEN], s_netmask[MAX_IPv6_STR_LEN]; +#endif VERIFY(!coarse || ifscope == IFSCOPE_NONE); lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_OWNED); @@ -2847,11 +2837,9 @@ rt_lookup_common(boolean_t lookup_only, boolean_t coarse, struct sockaddr *dst, * Non-scoped route lookup. 
*/ #if INET6 - if ((af != AF_INET && af != AF_INET6) || - (af == AF_INET && !ip_doscopedroute) || - (af == AF_INET6 && !ip6_doscopedroute)) { + if (af != AF_INET && af != AF_INET6) { #else - if (af != AF_INET || !ip_doscopedroute) { + if (af != AF_INET) { #endif /* !INET6 */ rn = rnh->rnh_matchaddr(dst, rnh); @@ -2881,6 +2869,7 @@ rt_lookup_common(boolean_t lookup_only, boolean_t coarse, struct sockaddr *dst, netmask = ma_copy(af, netmask, &mask_ss, ifscope); dontcare = (ifscope == IFSCOPE_NONE); +#if (DEVELOPMENT || DEBUG) if (rt_verbose) { if (af == AF_INET) (void) inet_ntop(af, &SIN(dst)->sin_addr.s_addr, @@ -2900,6 +2889,7 @@ rt_lookup_common(boolean_t lookup_only, boolean_t coarse, struct sockaddr *dst, printf("%s (%d, %d, %s, %s, %u)\n", __func__, lookup_only, coarse, s_dst, s_netmask, ifscope); } +#endif /* * Scoped route lookup: @@ -2935,7 +2925,7 @@ rt_lookup_common(boolean_t lookup_only, boolean_t coarse, struct sockaddr *dst, */ if (rn != NULL) { struct rtentry *rt = RT(rn); - +#if (DEVELOPMENT || DEBUG) if (rt_verbose) { rt_str(rt, dbuf, sizeof (dbuf), gbuf, sizeof (gbuf)); printf("%s unscoped search %p to %s->%s->%s ifa_ifp %s\n", @@ -2945,7 +2935,9 @@ rt_lookup_common(boolean_t lookup_only, boolean_t coarse, struct sockaddr *dst, (rt->rt_ifa->ifa_ifp != NULL) ? rt->rt_ifa->ifa_ifp->if_xname : ""); } - if (!(rt->rt_ifp->if_flags & IFF_LOOPBACK)) { +#endif + if (!(rt->rt_ifp->if_flags & IFF_LOOPBACK) || + (rt->rt_flags & RTF_GATEWAY)) { if (rt->rt_ifp->if_index != ifscope) { /* * Wrong interface; keep the original result @@ -2983,7 +2975,7 @@ rt_lookup_common(boolean_t lookup_only, boolean_t coarse, struct sockaddr *dst, */ if (rn == NULL) { rn = node_lookup(dst, netmask, ifscope); - +#if (DEVELOPMENT || DEBUG) if (rt_verbose && rn != NULL) { struct rtentry *rt = RT(rn); @@ -2995,6 +2987,7 @@ rt_lookup_common(boolean_t lookup_only, boolean_t coarse, struct sockaddr *dst, (rt->rt_ifa->ifa_ifp != NULL) ? 
rt->rt_ifa->ifa_ifp->if_xname : ""); } +#endif } /* * Use the original result if either of the following is true: @@ -3039,7 +3032,7 @@ rt_lookup_common(boolean_t lookup_only, boolean_t coarse, struct sockaddr *dst, rn = NULL; } } - +#if (DEVELOPMENT || DEBUG) if (rt_verbose) { if (rn == NULL) printf("%s %u return NULL\n", __func__, ifscope); @@ -3056,7 +3049,7 @@ rt_lookup_common(boolean_t lookup_only, boolean_t coarse, struct sockaddr *dst, rt->rt_ifa->ifa_ifp->if_xname : ""); } } - +#endif return (RT(rn)); } @@ -3118,8 +3111,10 @@ rtinit_locked(struct ifaddr *ifa, int cmd, int flags) { struct radix_node_head *rnh; uint8_t nbuf[128]; /* long enough for IPv6 */ +#if (DEVELOPMENT || DEBUG) char dbuf[MAX_IPv6_STR_LEN], gbuf[MAX_IPv6_STR_LEN]; char abuf[MAX_IPv6_STR_LEN]; +#endif struct rtentry *rt = NULL; struct sockaddr *dst; struct sockaddr *netmask; @@ -3153,6 +3148,7 @@ rtinit_locked(struct ifaddr *ifa, int cmd, int flags) goto done; } +#if (DEVELOPMENT || DEBUG) if (dst->sa_family == AF_INET) { (void) inet_ntop(AF_INET, &SIN(dst)->sin_addr.s_addr, abuf, sizeof (abuf)); @@ -3163,6 +3159,7 @@ rtinit_locked(struct ifaddr *ifa, int cmd, int flags) abuf, sizeof (abuf)); } #endif /* INET6 */ +#endif /* (DEVELOPMENT || DEBUG) */ if ((rnh = rt_tables[dst->sa_family]) == NULL) { error = EINVAL; @@ -3194,7 +3191,9 @@ rtinit_locked(struct ifaddr *ifa, int cmd, int flags) */ rt = rt_lookup_coarse(TRUE, dst, NULL, rnh); if (rt != NULL) { +#if (DEVELOPMENT || DEBUG) rt_str(rt, dbuf, sizeof (dbuf), gbuf, sizeof (gbuf)); +#endif /* * Ok so we found the rtentry. it has an extra reference * for us at this stage. we won't need that so @@ -3209,6 +3208,7 @@ rtinit_locked(struct ifaddr *ifa, int cmd, int flags) * an error. This seems to be the only point * of this whole RTM_DELETE clause. 
*/ +#if (DEVELOPMENT || DEBUG) if (rt_verbose) { log(LOG_DEBUG, "%s: not removing " "route to %s->%s->%s, flags %b, " @@ -3221,6 +3221,7 @@ rtinit_locked(struct ifaddr *ifa, int cmd, int flags) rt->rt_ifa), (uint64_t)VM_KERNEL_ADDRPERM(ifa)); } +#endif /* (DEVELOPMENT || DEBUG) */ RT_REMREF_LOCKED(rt); RT_UNLOCK(rt); rt = NULL; @@ -3232,6 +3233,7 @@ rtinit_locked(struct ifaddr *ifa, int cmd, int flags) * Don't remove the subnet/prefix route if * this was manually added from above. */ +#if (DEVELOPMENT || DEBUG) if (rt_verbose) { log(LOG_DEBUG, "%s: not removing " "static route to %s->%s->%s, " @@ -3240,12 +3242,14 @@ rtinit_locked(struct ifaddr *ifa, int cmd, int flags) rt->rt_ifp->if_xname : ""), rt->rt_flags, RTF_BITS, abuf); } +#endif /* (DEVELOPMENT || DEBUG) */ RT_REMREF_LOCKED(rt); RT_UNLOCK(rt); rt = NULL; error = EBUSY; goto done; } +#if (DEVELOPMENT || DEBUG) if (rt_verbose) { log(LOG_DEBUG, "%s: removing route to " "%s->%s->%s, flags %b, ifaddr %s\n", @@ -3254,6 +3258,7 @@ rtinit_locked(struct ifaddr *ifa, int cmd, int flags) rt->rt_ifp->if_xname : ""), rt->rt_flags, RTF_BITS, abuf); } +#endif /* (DEVELOPMENT || DEBUG) */ RT_REMREF_LOCKED(rt); RT_UNLOCK(rt); rt = NULL; @@ -3267,9 +3272,9 @@ rtinit_locked(struct ifaddr *ifa, int cmd, int flags) goto done; VERIFY(rt != NULL); - +#if (DEVELOPMENT || DEBUG) rt_str(rt, dbuf, sizeof (dbuf), gbuf, sizeof (gbuf)); - +#endif /* (DEVELOPMENT || DEBUG) */ switch (cmd) { case RTM_DELETE: /* @@ -3280,12 +3285,14 @@ rtinit_locked(struct ifaddr *ifa, int cmd, int flags) RT_LOCK(rt); rt_newaddrmsg(cmd, ifa, error, rt); RT_UNLOCK(rt); +#if (DEVELOPMENT || DEBUG) if (rt_verbose) { log(LOG_DEBUG, "%s: removed route to %s->%s->%s, " "flags %b, ifaddr %s\n", __func__, dbuf, gbuf, ((rt->rt_ifp != NULL) ? 
rt->rt_ifp->if_xname : ""), rt->rt_flags, RTF_BITS, abuf); } +#endif /* (DEVELOPMENT || DEBUG) */ rtfree_locked(rt); break; @@ -3300,20 +3307,20 @@ rtinit_locked(struct ifaddr *ifa, int cmd, int flags) if (rt->rt_ifa != ifa) { void (*ifa_rtrequest) (int, struct rtentry *, struct sockaddr *); - - if (!(rt->rt_ifa->ifa_ifp->if_flags & - (IFF_POINTOPOINT|IFF_LOOPBACK))) { - log(LOG_ERR, "%s: %s route to %s->%s->%s, " - "flags %b, ifaddr %s, rt_ifa 0x%llx != " - "ifa 0x%llx\n", __func__, rtm2str(cmd), - dbuf, gbuf, ((rt->rt_ifp != NULL) ? - rt->rt_ifp->if_xname : ""), rt->rt_flags, - RTF_BITS, abuf, - (uint64_t)VM_KERNEL_ADDRPERM(rt->rt_ifa), - (uint64_t)VM_KERNEL_ADDRPERM(ifa)); - } - +#if (DEVELOPMENT || DEBUG) if (rt_verbose) { + if (!(rt->rt_ifa->ifa_ifp->if_flags & + (IFF_POINTOPOINT|IFF_LOOPBACK))) { + log(LOG_ERR, "%s: %s route to %s->%s->%s, " + "flags %b, ifaddr %s, rt_ifa 0x%llx != " + "ifa 0x%llx\n", __func__, rtm2str(cmd), + dbuf, gbuf, ((rt->rt_ifp != NULL) ? + rt->rt_ifp->if_xname : ""), rt->rt_flags, + RTF_BITS, abuf, + (uint64_t)VM_KERNEL_ADDRPERM(rt->rt_ifa), + (uint64_t)VM_KERNEL_ADDRPERM(ifa)); + } + log(LOG_DEBUG, "%s: %s route to %s->%s->%s, " "flags %b, ifaddr %s, rt_ifa was 0x%llx " "now 0x%llx\n", __func__, rtm2str(cmd), @@ -3323,6 +3330,7 @@ rtinit_locked(struct ifaddr *ifa, int cmd, int flags) (uint64_t)VM_KERNEL_ADDRPERM(rt->rt_ifa), (uint64_t)VM_KERNEL_ADDRPERM(ifa)); } +#endif /* (DEVELOPMENT || DEBUG) */ /* * Ask that the protocol in question @@ -3372,6 +3380,7 @@ rtinit_locked(struct ifaddr *ifa, int cmd, int flags) if (ifa_rtrequest != NULL) ifa_rtrequest(RTM_ADD, rt, NULL); } else { +#if (DEVELOPMENT || DEBUG) if (rt_verbose) { log(LOG_DEBUG, "%s: added route to %s->%s->%s, " "flags %b, ifaddr %s\n", __func__, dbuf, @@ -3379,6 +3388,7 @@ rtinit_locked(struct ifaddr *ifa, int cmd, int flags) rt->rt_ifp->if_xname : ""), rt->rt_flags, RTF_BITS, abuf); } +#endif /* (DEVELOPMENT || DEBUG) */ } /* * notify any listenning routing agents of the 
change @@ -3593,7 +3603,7 @@ rte_if_ref(struct ifnet *ifp, int cnt) ev_msg.dv[0].data_length = sizeof (struct net_event_data); ev_msg.dv[0].data_ptr = &ev_data; - kev_post_msg(&ev_msg); + dlil_post_complete_msg(NULL, &ev_msg); } } diff --git a/bsd/net/route.h b/bsd/net/route.h index 3ee46dfe0..4ceed51bf 100644 --- a/bsd/net/route.h +++ b/bsd/net/route.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2015 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -207,6 +207,7 @@ struct rtentry { uint64_t base_calendartime; /* calendar time upon entry creation */ uint64_t base_uptime; /* uptime upon entry creation */ u_int32_t rtt_hist[NRTT_HIST]; /* RTT history sample by TCP connections */ + u_int32_t rtt_min; /* minimum RTT computed from history */ u_int32_t rtt_expire_ts; /* RTT history expire timestamp */ u_int8_t rtt_index; /* Index into RTT history */ }; @@ -312,6 +313,11 @@ struct rt_msghdr2 { }; #ifdef PRIVATE +struct kev_netevent_apnfallbk_data { + pid_t epid; /* effective PID */ + uuid_t euuid; /* effective UUID */ +}; + /* * Route reachability info. 
*/ diff --git a/bsd/netinet/Makefile b/bsd/netinet/Makefile index 9ab2a3192..ea575bddc 100644 --- a/bsd/netinet/Makefile +++ b/bsd/netinet/Makefile @@ -3,7 +3,6 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - include $(MakeInc_cmd) include $(MakeInc_def) @@ -24,6 +23,7 @@ PRIVATE_DATAFILES = \ in.h \ in_gif.h \ in_pcb.h \ + in_tclass.h \ ip.h \ ip_compat.h \ ip_dummynet.h \ @@ -53,4 +53,3 @@ INSTALL_KF_MI_LCL_LIST = $(sort ${DATAFILES} ${PRIVATE_DATAFILES} ${PRIVATE_KERN include $(MakeInc_rule) include $(MakeInc_dir) - diff --git a/bsd/netinet/flow_divert.c b/bsd/netinet/flow_divert.c index d9f02927f..a918039f2 100644 --- a/bsd/netinet/flow_divert.c +++ b/bsd/netinet/flow_divert.c @@ -63,6 +63,7 @@ #include #include #include +#include #define FLOW_DIVERT_CONNECT_STARTED 0x00000001 #define FLOW_DIVERT_READ_CLOSED 0x00000002 @@ -72,17 +73,11 @@ #define FLOW_DIVERT_TRANSFERRED 0x00000020 #define FLOW_DIVERT_HAS_HMAC 0x00000040 -#define FDLOG(level, pcb, format, ...) do { \ - if (level <= (pcb)->log_level) { \ - log((level > LOG_NOTICE ? LOG_NOTICE : level), "%s (%u): " format "\n", __FUNCTION__, (pcb)->hash, __VA_ARGS__); \ - } \ -} while (0) +#define FDLOG(level, pcb, format, ...) \ + os_log_with_type(OS_LOG_DEFAULT, flow_divert_syslog_type_to_oslog_type(level), "(%u): " format "\n", (pcb)->hash, __VA_ARGS__) -#define FDLOG0(level, pcb, msg) do { \ - if (level <= (pcb)->log_level) { \ - log((level > LOG_NOTICE ? 
LOG_NOTICE : level), "%s (%u): %s\n", __FUNCTION__, (pcb)->hash, msg); \ - } \ -} while (0) +#define FDLOG0(level, pcb, msg) \ + os_log_with_type(OS_LOG_DEFAULT, flow_divert_syslog_type_to_oslog_type(level), "(%u): " msg "\n", (pcb)->hash) #define FDRETAIN(pcb) if ((pcb) != NULL) OSIncrementAtomic(&(pcb)->ref_count) #define FDRELEASE(pcb) \ @@ -95,7 +90,7 @@ #define FDLOCK(pcb) lck_mtx_lock(&(pcb)->mtx) #define FDUNLOCK(pcb) lck_mtx_unlock(&(pcb)->mtx) -#define FD_CTL_SENDBUFF_SIZE (2 * FLOW_DIVERT_CHUNK_SIZE) +#define FD_CTL_SENDBUFF_SIZE (128 * 1024) #define FD_CTL_RCVBUFF_SIZE (128 * 1024) #define GROUP_BIT_CTL_ENQUEUE_BLOCKED 0 @@ -105,29 +100,11 @@ #define FLOW_DIVERT_MAX_KEY_SIZE 1024 #define FLOW_DIVERT_MAX_TRIE_MEMORY (1024 * 1024) -#define DNS_SERVICE_GROUP_UNIT (GROUP_COUNT_MAX + 1) - struct flow_divert_trie_node { uint16_t start; uint16_t length; uint16_t child_map; - uint32_t group_unit; -}; - -struct flow_divert_trie -{ - struct flow_divert_trie_node *nodes; - uint16_t *child_maps; - uint8_t *bytes; - void *memory; - size_t nodes_count; - size_t child_maps_count; - size_t bytes_count; - size_t nodes_free_next; - size_t child_maps_free_next; - size_t bytes_free_next; - uint16_t root; }; #define CHILD_MAP_SIZE 256 @@ -141,7 +118,6 @@ static struct flow_divert_pcb nil_pcb; decl_lck_rw_data(static, g_flow_divert_group_lck); static struct flow_divert_group **g_flow_divert_groups = NULL; static uint32_t g_active_group_count = 0; -static struct flow_divert_trie g_signing_id_trie; static lck_grp_attr_t *flow_divert_grp_attr = NULL; static lck_attr_t *flow_divert_mtx_attr = NULL; @@ -187,6 +163,17 @@ flow_divert_has_pcb_local_address(const struct inpcb *inp); static void flow_divert_disconnect_socket(struct socket *so); +static inline uint8_t +flow_divert_syslog_type_to_oslog_type(int syslog_type) +{ + switch (syslog_type) { + case LOG_ERR: return OS_LOG_TYPE_ERROR; + case LOG_INFO: return OS_LOG_TYPE_INFO; + case LOG_DEBUG: return OS_LOG_TYPE_DEBUG; + default: 
return OS_LOG_TYPE_DEFAULT; + } +} + static inline int flow_divert_pcb_cmp(const struct flow_divert_pcb *pcb_a, const struct flow_divert_pcb *pcb_b) { @@ -212,8 +199,6 @@ flow_divert_packet_type2str(uint8_t packet_type) return "read notification"; case FLOW_DIVERT_PKT_PROPERTIES_UPDATE: return "properties update"; - case FLOW_DIVERT_PKT_APP_MAP_UPDATE: - return "app map update"; case FLOW_DIVERT_PKT_APP_MAP_CREATE: return "app map create"; default: @@ -419,7 +404,7 @@ flow_divert_packet_append_tlv(mbuf_t packet, uint8_t type, uint32_t length, cons error = mbuf_copyback(packet, mbuf_pkthdr_len(packet), sizeof(net_length), &net_length, MBUF_DONTWAIT); if (error) { - FDLOG(LOG_ERR, &nil_pcb, "failed to append the length (%lu)", length); + FDLOG(LOG_ERR, &nil_pcb, "failed to append the length (%u)", length); return error; } @@ -1006,6 +991,8 @@ flow_divert_create_connect_packet(struct flow_divert_pcb *fd_cb, struct sockaddr char *signing_id = NULL; int free_signing_id = 0; mbuf_t connect_packet = NULL; + proc_t src_proc = p; + int release_proc = 0; error = flow_divert_packet_init(fd_cb, FLOW_DIVERT_PKT_CONNECT, &connect_packet); if (error) { @@ -1028,69 +1015,63 @@ flow_divert_create_connect_packet(struct flow_divert_pcb *fd_cb, struct sockaddr } socket_unlock(so, 0); - if (g_signing_id_trie.root != NULL_TRIE_IDX) { - proc_t src_proc = p; - int release_proc = 0; - - if (signing_id == NULL) { - release_proc = flow_divert_get_src_proc(so, &src_proc); - if (src_proc != PROC_NULL) { - proc_lock(src_proc); - if (src_proc->p_csflags & CS_VALID) { - const char * cs_id; - cs_id = cs_identity_get(src_proc); - signing_id = __DECONST(char *, cs_id); - } else { - FDLOG0(LOG_WARNING, fd_cb, "Signature is invalid"); - } + + if (signing_id == NULL) { + release_proc = flow_divert_get_src_proc(so, &src_proc); + if (src_proc != PROC_NULL) { + proc_lock(src_proc); + if (src_proc->p_csflags & (CS_VALID|CS_DEBUGGED)) { + const char * cs_id; + cs_id = cs_identity_get(src_proc); + signing_id 
= __DECONST(char *, cs_id); } else { - FDLOG0(LOG_WARNING, fd_cb, "Failed to determine the current proc"); + FDLOG0(LOG_WARNING, fd_cb, "Signature is invalid"); } } else { - src_proc = PROC_NULL; + FDLOG0(LOG_WARNING, fd_cb, "Failed to determine the current proc"); } + } else { + src_proc = PROC_NULL; + } - if (signing_id != NULL) { - uint16_t result = NULL_TRIE_IDX; - lck_rw_lock_shared(&g_flow_divert_group_lck); - result = flow_divert_trie_search(&g_signing_id_trie, (uint8_t *)signing_id); - lck_rw_done(&g_flow_divert_group_lck); - if (result != NULL_TRIE_IDX) { - error = 0; - FDLOG(LOG_INFO, fd_cb, "%s matched", signing_id); + if (signing_id != NULL) { + uint16_t result = NULL_TRIE_IDX; + lck_rw_lock_shared(&fd_cb->group->lck); + result = flow_divert_trie_search(&fd_cb->group->signing_id_trie, (uint8_t *)signing_id); + lck_rw_done(&fd_cb->group->lck); + if (result != NULL_TRIE_IDX) { + error = 0; + FDLOG(LOG_INFO, fd_cb, "%s matched", signing_id); - error = flow_divert_packet_append_tlv(connect_packet, FLOW_DIVERT_TLV_SIGNING_ID, strlen(signing_id), signing_id); - if (error == 0) { - if (src_proc != PROC_NULL) { - unsigned char cdhash[SHA1_RESULTLEN]; - error = proc_getcdhash(src_proc, cdhash); - if (error == 0) { - error = flow_divert_packet_append_tlv(connect_packet, FLOW_DIVERT_TLV_CDHASH, sizeof(cdhash), cdhash); - if (error) { - FDLOG(LOG_ERR, fd_cb, "failed to append the cdhash: %d", error); - } - } else { - FDLOG(LOG_ERR, fd_cb, "failed to get the cdhash: %d", error); + error = flow_divert_packet_append_tlv(connect_packet, FLOW_DIVERT_TLV_SIGNING_ID, strlen(signing_id), signing_id); + if (error == 0) { + if (src_proc != PROC_NULL) { + unsigned char cdhash[SHA1_RESULTLEN]; + error = proc_getcdhash(src_proc, cdhash); + if (error == 0) { + error = flow_divert_packet_append_tlv(connect_packet, FLOW_DIVERT_TLV_CDHASH, sizeof(cdhash), cdhash); + if (error) { + FDLOG(LOG_ERR, fd_cb, "failed to append the cdhash: %d", error); } + } else { + FDLOG(LOG_ERR, fd_cb, 
"failed to get the cdhash: %d", error); } - } else { - FDLOG(LOG_ERR, fd_cb, "failed to append the signing ID: %d", error); } } else { - FDLOG(LOG_WARNING, fd_cb, "%s did not match", signing_id); + FDLOG(LOG_ERR, fd_cb, "failed to append the signing ID: %d", error); } } else { - FDLOG0(LOG_WARNING, fd_cb, "Failed to get the code signing identity"); + FDLOG(LOG_WARNING, fd_cb, "%s did not match", signing_id); } + } else { + FDLOG0(LOG_WARNING, fd_cb, "Failed to get the code signing identity"); + } - if (src_proc != PROC_NULL) { - proc_unlock(src_proc); - if (release_proc) { - proc_rele(src_proc); - } + if (src_proc != PROC_NULL) { + proc_unlock(src_proc); + if (release_proc) { + proc_rele(src_proc); } - } else { - FDLOG0(LOG_WARNING, fd_cb, "The signing ID trie is empty"); } socket_lock(so, 0); @@ -1466,7 +1447,7 @@ flow_divert_send_buffered_data(struct flow_divert_pcb *fd_cb, Boolean force) } } data_len = mbuf_pkthdr_len(m); - FDLOG(LOG_DEBUG, fd_cb, "mbuf_copym() data_len = %u", data_len); + FDLOG(LOG_DEBUG, fd_cb, "mbuf_copym() data_len = %lu", data_len); error = mbuf_copym(m, 0, data_len, MBUF_DONTWAIT, &data); if (error) { FDLOG(LOG_ERR, fd_cb, "mbuf_copym failed: %d", error); @@ -1573,7 +1554,7 @@ flow_divert_send_app_data(struct flow_divert_pcb *fd_cb, mbuf_t data, struct soc if (to_send) { error = flow_divert_send_data_packet(fd_cb, data, to_send, toaddr, FALSE); if (error) { - FDLOG(LOG_ERR, fd_cb, "flow_divert_send_data_packet failed. send data size = %u", to_send); + FDLOG(LOG_ERR, fd_cb, "flow_divert_send_data_packet failed. 
send data size = %lu", to_send); } else { fd_cb->send_window -= to_send; } @@ -1676,6 +1657,7 @@ flow_divert_handle_connect_result(struct flow_divert_pcb *fd_cb, mbuf_t packet, int out_if_index = 0; struct sockaddr_storage remote_address; uint32_t send_window; + uint32_t app_data_length = 0; memset(&local_address, 0, sizeof(local_address)); memset(&remote_address, 0, sizeof(remote_address)); @@ -1696,32 +1678,37 @@ flow_divert_handle_connect_result(struct flow_divert_pcb *fd_cb, mbuf_t packet, error = flow_divert_packet_get_tlv(packet, offset, FLOW_DIVERT_TLV_CTL_UNIT, sizeof(ctl_unit), &ctl_unit, NULL); if (error) { - FDLOG(LOG_ERR, fd_cb, "failed to get the control unit: %d", error); - return; + FDLOG0(LOG_INFO, fd_cb, "No control unit provided in the connect result"); } error = flow_divert_packet_get_tlv(packet, offset, FLOW_DIVERT_TLV_LOCAL_ADDR, sizeof(local_address), &local_address, NULL); if (error) { - FDLOG0(LOG_NOTICE, fd_cb, "No local address provided"); + FDLOG0(LOG_INFO, fd_cb, "No local address provided"); } error = flow_divert_packet_get_tlv(packet, offset, FLOW_DIVERT_TLV_REMOTE_ADDR, sizeof(remote_address), &remote_address, NULL); if (error) { - FDLOG0(LOG_NOTICE, fd_cb, "No remote address provided"); + FDLOG0(LOG_INFO, fd_cb, "No remote address provided"); } error = flow_divert_packet_get_tlv(packet, offset, FLOW_DIVERT_TLV_OUT_IF_INDEX, sizeof(out_if_index), &out_if_index, NULL); if (error) { - FDLOG0(LOG_NOTICE, fd_cb, "No output if index provided"); + FDLOG0(LOG_INFO, fd_cb, "No output if index provided"); + } + + error = flow_divert_packet_get_tlv(packet, offset, FLOW_DIVERT_TLV_APP_DATA, 0, NULL, &app_data_length); + if (error) { + FDLOG0(LOG_INFO, fd_cb, "No application data provided in connect result"); } + error = 0; connect_error = ntohl(connect_error); ctl_unit = ntohl(ctl_unit); lck_rw_lock_shared(&g_flow_divert_group_lck); - if (connect_error == 0) { - if (ctl_unit == 0 || ctl_unit >= GROUP_COUNT_MAX) { + if (connect_error == 0 && 
ctl_unit > 0) { + if (ctl_unit >= GROUP_COUNT_MAX) { FDLOG(LOG_ERR, fd_cb, "Connect result contains an invalid control unit: %u", ctl_unit); error = EINVAL; } else if (g_flow_divert_groups == NULL || g_active_group_count == 0) { @@ -1774,6 +1761,27 @@ flow_divert_handle_connect_result(struct flow_divert_pcb *fd_cb, mbuf_t packet, goto set_socket_state; } + if (app_data_length > 0) { + uint8_t *app_data = NULL; + MALLOC(app_data, uint8_t *, app_data_length, M_TEMP, M_WAITOK); + if (app_data != NULL) { + error = flow_divert_packet_get_tlv(packet, offset, FLOW_DIVERT_TLV_APP_DATA, app_data_length, app_data, NULL); + if (error == 0) { + FDLOG(LOG_INFO, fd_cb, "Got %u bytes of app data from the connect result", app_data_length); + if (fd_cb->app_data != NULL) { + FREE(fd_cb->app_data, M_TEMP); + } + fd_cb->app_data = app_data; + fd_cb->app_data_length = app_data_length; + } else { + FDLOG(LOG_ERR, fd_cb, "Failed to copy %u bytes of application data from the connect result packet", app_data_length); + FREE(app_data, M_TEMP); + } + } else { + FDLOG(LOG_ERR, fd_cb, "Failed to allocate a buffer of size %u to hold the application data from the connect result", app_data_length); + } + } + ifnet_head_lock_shared(); if (out_if_index > 0 && out_if_index <= if_index) { ifp = ifindex2ifnet[out_if_index]; @@ -1795,20 +1803,22 @@ flow_divert_handle_connect_result(struct flow_divert_pcb *fd_cb, mbuf_t packet, goto set_socket_state; } - old_group = fd_cb->group; + if (grp != NULL) { + old_group = fd_cb->group; - lck_rw_lock_exclusive(&old_group->lck); - lck_rw_lock_exclusive(&grp->lck); + lck_rw_lock_exclusive(&old_group->lck); + lck_rw_lock_exclusive(&grp->lck); - RB_REMOVE(fd_pcb_tree, &old_group->pcb_tree, fd_cb); - if (RB_INSERT(fd_pcb_tree, &grp->pcb_tree, fd_cb) != NULL) { - panic("group with unit %u already contains a connection with hash %u", grp->ctl_unit, fd_cb->hash); - } + RB_REMOVE(fd_pcb_tree, &old_group->pcb_tree, fd_cb); + if (RB_INSERT(fd_pcb_tree, &grp->pcb_tree, 
fd_cb) != NULL) { + panic("group with unit %u already contains a connection with hash %u", grp->ctl_unit, fd_cb->hash); + } - fd_cb->group = grp; + fd_cb->group = grp; - lck_rw_done(&grp->lck); - lck_rw_done(&old_group->lck); + lck_rw_done(&grp->lck); + lck_rw_done(&old_group->lck); + } fd_cb->send_window = ntohl(send_window); @@ -2039,7 +2049,7 @@ flow_divert_handle_group_init(struct flow_divert_group *group, mbuf_t packet, in } if (key_size == 0 || key_size > FLOW_DIVERT_MAX_KEY_SIZE) { - FDLOG(LOG_ERR, &nil_pcb, "Invalid key size: %lu", key_size); + FDLOG(LOG_ERR, &nil_pcb, "Invalid key size: %u", key_size); return; } @@ -2168,7 +2178,7 @@ flow_divert_handle_properties_update(struct flow_divert_pcb *fd_cb, mbuf_t packe } static void -flow_divert_handle_app_map_create(mbuf_t packet, int offset) +flow_divert_handle_app_map_create(struct flow_divert_group *group, mbuf_t packet, int offset) { size_t bytes_mem_size; size_t child_maps_mem_size; @@ -2181,14 +2191,14 @@ flow_divert_handle_app_map_create(mbuf_t packet, int offset) int signing_id_count = 0; size_t trie_memory_size = 0; - lck_rw_lock_exclusive(&g_flow_divert_group_lck); + lck_rw_lock_exclusive(&group->lck); /* Re-set the current trie */ - if (g_signing_id_trie.memory != NULL) { - FREE(g_signing_id_trie.memory, M_TEMP); + if (group->signing_id_trie.memory != NULL) { + FREE(group->signing_id_trie.memory, M_TEMP); } - memset(&g_signing_id_trie, 0, sizeof(g_signing_id_trie)); - g_signing_id_trie.root = NULL_TRIE_IDX; + memset(&group->signing_id_trie, 0, sizeof(group->signing_id_trie)); + group->signing_id_trie.root = NULL_TRIE_IDX; memset(&new_trie, 0, sizeof(new_trie)); @@ -2196,7 +2206,7 @@ flow_divert_handle_app_map_create(mbuf_t packet, int offset) flow_divert_packet_get_tlv(packet, offset, FLOW_DIVERT_TLV_PREFIX_COUNT, sizeof(prefix_count), &prefix_count, NULL); if (prefix_count < 0) { - lck_rw_done(&g_flow_divert_group_lck); + lck_rw_done(&group->lck); return; } @@ -2212,7 +2222,7 @@ 
flow_divert_handle_app_map_create(mbuf_t packet, int offset) } if (signing_id_count == 0) { - lck_rw_done(&g_flow_divert_group_lck); + lck_rw_done(&group->lck); return; } @@ -2228,8 +2238,8 @@ flow_divert_handle_app_map_create(mbuf_t packet, int offset) trie_memory_size = nodes_mem_size + child_maps_mem_size + bytes_mem_size; if (trie_memory_size > FLOW_DIVERT_MAX_TRIE_MEMORY) { - FDLOG(LOG_ERR, &nil_pcb, "Trie memory size (%u) is too big (maximum is %u)", trie_memory_size, FLOW_DIVERT_MAX_TRIE_MEMORY); - lck_rw_done(&g_flow_divert_group_lck); + FDLOG(LOG_ERR, &nil_pcb, "Trie memory size (%lu) is too big (maximum is %u)", trie_memory_size, FLOW_DIVERT_MAX_TRIE_MEMORY); + lck_rw_done(&group->lck); return; } @@ -2237,7 +2247,7 @@ flow_divert_handle_app_map_create(mbuf_t packet, int offset) if (new_trie.memory == NULL) { FDLOG(LOG_ERR, &nil_pcb, "Failed to allocate %lu bytes of memory for the signing ID trie", nodes_mem_size + child_maps_mem_size + bytes_mem_size); - lck_rw_done(&g_flow_divert_group_lck); + lck_rw_done(&group->lck); return; } @@ -2264,20 +2274,10 @@ flow_divert_handle_app_map_create(mbuf_t packet, int offset) uint32_t sid_size = 0; flow_divert_packet_get_tlv(packet, cursor, FLOW_DIVERT_TLV_SIGNING_ID, 0, NULL, &sid_size); if (new_trie.bytes_free_next + sid_size <= new_trie.bytes_count) { - boolean_t is_dns; uint16_t new_node_idx; flow_divert_packet_get_tlv(packet, cursor, FLOW_DIVERT_TLV_SIGNING_ID, sid_size, &TRIE_BYTE(&new_trie, new_trie.bytes_free_next), NULL); - is_dns = (sid_size == sizeof(FLOW_DIVERT_DNS_SERVICE_SIGNING_ID) - 1 && - !memcmp(&TRIE_BYTE(&new_trie, new_trie.bytes_free_next), - FLOW_DIVERT_DNS_SERVICE_SIGNING_ID, - sid_size)); new_node_idx = flow_divert_trie_insert(&new_trie, new_trie.bytes_free_next, sid_size); - if (new_node_idx != NULL_TRIE_IDX) { - if (is_dns) { - FDLOG(LOG_INFO, &nil_pcb, "Setting group unit for %s to %d", FLOW_DIVERT_DNS_SERVICE_SIGNING_ID, DNS_SERVICE_GROUP_UNIT); - TRIE_NODE(&new_trie, 
new_node_idx).group_unit = DNS_SERVICE_GROUP_UNIT; - } - } else { + if (new_node_idx == NULL_TRIE_IDX) { insert_error = EINVAL; break; } @@ -2289,72 +2289,12 @@ flow_divert_handle_app_map_create(mbuf_t packet, int offset) } if (!insert_error) { - g_signing_id_trie = new_trie; + group->signing_id_trie = new_trie; } else { FREE(new_trie.memory, M_TEMP); } - lck_rw_done(&g_flow_divert_group_lck); -} - -static void -flow_divert_handle_app_map_update(struct flow_divert_group *group, mbuf_t packet, int offset) -{ - int error = 0; - int cursor; - size_t max_size = 0; - uint8_t *signing_id; - uint32_t ctl_unit; - - lck_rw_lock_shared(&group->lck); - ctl_unit = group->ctl_unit; lck_rw_done(&group->lck); - - for (cursor = flow_divert_packet_find_tlv(packet, offset, FLOW_DIVERT_TLV_SIGNING_ID, &error, 0); - cursor >= 0; - cursor = flow_divert_packet_find_tlv(packet, cursor, FLOW_DIVERT_TLV_SIGNING_ID, &error, 1)) - { - uint32_t sid_size = 0; - flow_divert_packet_get_tlv(packet, cursor, FLOW_DIVERT_TLV_SIGNING_ID, 0, NULL, &sid_size); - if (sid_size > max_size) { - max_size = sid_size; - } - } - - MALLOC(signing_id, uint8_t *, max_size + 1, M_TEMP, M_WAITOK); - if (signing_id == NULL) { - FDLOG(LOG_ERR, &nil_pcb, "Failed to allocate a string to hold the signing ID (size %lu)", max_size); - return; - } - - for (cursor = flow_divert_packet_find_tlv(packet, offset, FLOW_DIVERT_TLV_SIGNING_ID, &error, 0); - cursor >= 0; - cursor = flow_divert_packet_find_tlv(packet, cursor, FLOW_DIVERT_TLV_SIGNING_ID, &error, 1)) - { - uint32_t signing_id_len = 0; - uint16_t node; - - flow_divert_packet_get_tlv(packet, - cursor, FLOW_DIVERT_TLV_SIGNING_ID, max_size, signing_id, &signing_id_len); - - signing_id[signing_id_len] = '\0'; - - lck_rw_lock_exclusive(&g_flow_divert_group_lck); - - node = flow_divert_trie_search(&g_signing_id_trie, signing_id); - if (node != NULL_TRIE_IDX) { - if (TRIE_NODE(&g_signing_id_trie, node).group_unit != DNS_SERVICE_GROUP_UNIT) { - FDLOG(LOG_INFO, &nil_pcb, 
"Setting %s to ctl unit %u", signing_id, group->ctl_unit); - TRIE_NODE(&g_signing_id_trie, node).group_unit = ctl_unit; - } - } else { - FDLOG(LOG_ERR, &nil_pcb, "Failed to find signing ID %s", signing_id); - } - - lck_rw_done(&g_flow_divert_group_lck); - } - - FREE(signing_id, M_TEMP); } static int @@ -2371,7 +2311,7 @@ flow_divert_input(mbuf_t packet, struct flow_divert_group *group) } if (mbuf_pkthdr_len(packet) > FD_CTL_RCVBUFF_SIZE) { - FDLOG(LOG_ERR, &nil_pcb, "got a bad packet, length (%lu) > %lu", mbuf_pkthdr_len(packet), FD_CTL_RCVBUFF_SIZE); + FDLOG(LOG_ERR, &nil_pcb, "got a bad packet, length (%lu) > %d", mbuf_pkthdr_len(packet), FD_CTL_RCVBUFF_SIZE); error = EINVAL; goto done; } @@ -2391,10 +2331,7 @@ flow_divert_input(mbuf_t packet, struct flow_divert_group *group) flow_divert_handle_group_init(group, packet, sizeof(hdr)); break; case FLOW_DIVERT_PKT_APP_MAP_CREATE: - flow_divert_handle_app_map_create(packet, sizeof(hdr)); - break; - case FLOW_DIVERT_PKT_APP_MAP_UPDATE: - flow_divert_handle_app_map_update(group, packet, sizeof(hdr)); + flow_divert_handle_app_map_create(group, packet, sizeof(hdr)); break; default: FDLOG(LOG_WARNING, &nil_pcb, "got an unknown message type: %d", hdr.packet_type); @@ -2467,6 +2404,7 @@ flow_divert_close_all(struct flow_divert_group *group) flow_divert_pcb_remove(fd_cb); flow_divert_update_closed_state(fd_cb, SHUT_RDWR, TRUE); fd_cb->so->so_error = ECONNABORTED; + flow_divert_disconnect_socket(fd_cb->so); socket_unlock(fd_cb->so, 0); } FDUNLOCK(fd_cb); @@ -3610,6 +3548,12 @@ flow_divert_token_get(struct socket *so, struct sockopt *sopt) goto done; } + if (sopt->sopt_val == USER_ADDR_NULL) { + /* If the caller passed NULL to getsockopt, just set the size of the token and return */ + sopt->sopt_valsize = mbuf_pkthdr_len(token); + goto done; + } + error = soopt_mcopyout(sopt, token); if (error) { token = NULL; /* For some reason, soopt_mcopyout() frees the mbuf if it fails */ @@ -3649,6 +3593,7 @@ 
flow_divert_kctl_connect(kern_ctl_ref kctlref __unused, struct sockaddr_ctl *sac RB_INIT(&new_group->pcb_tree); new_group->ctl_unit = sac->sc_unit; MBUFQ_INIT(&new_group->send_queue); + new_group->signing_id_trie.root = NULL_TRIE_IDX; lck_rw_lock_exclusive(&g_flow_divert_group_lck); @@ -3685,7 +3630,6 @@ flow_divert_kctl_disconnect(kern_ctl_ref kctlref __unused, uint32_t unit, void * { struct flow_divert_group *group = NULL; errno_t error = 0; - uint16_t node = 0; if (unit >= GROUP_COUNT_MAX) { return EINVAL; @@ -3714,6 +3658,14 @@ flow_divert_kctl_disconnect(kern_ctl_ref kctlref __unused, uint32_t unit, void * group->token_key = NULL; group->token_key_size = 0; } + + /* Re-set the current trie */ + if (group->signing_id_trie.memory != NULL) { + FREE(group->signing_id_trie.memory, M_TEMP); + } + memset(&group->signing_id_trie, 0, sizeof(group->signing_id_trie)); + group->signing_id_trie.root = NULL_TRIE_IDX; + FREE_ZONE(group, sizeof(*group), M_FLOW_DIVERT_GROUP); g_flow_divert_groups[unit] = NULL; g_active_group_count--; @@ -3726,13 +3678,6 @@ flow_divert_kctl_disconnect(kern_ctl_ref kctlref __unused, uint32_t unit, void * g_flow_divert_groups = NULL; } - /* Remove all signing IDs that point to this unit */ - for (node = 0; node < g_signing_id_trie.nodes_count; node++) { - if (TRIE_NODE(&g_signing_id_trie, node).group_unit == unit) { - TRIE_NODE(&g_signing_id_trie, node).group_unit = 0; - } - } - lck_rw_done(&g_flow_divert_group_lck); return error; @@ -3993,9 +3938,6 @@ flow_divert_init(void) lck_rw_init(&g_flow_divert_group_lck, flow_divert_mtx_grp, flow_divert_mtx_attr); - memset(&g_signing_id_trie, 0, sizeof(g_signing_id_trie)); - g_signing_id_trie.root = NULL_TRIE_IDX; - done: if (g_init_result != 0) { if (flow_divert_mtx_attr != NULL) { diff --git a/bsd/netinet/flow_divert.h b/bsd/netinet/flow_divert.h index 1af72b8be..47abceaa3 100644 --- a/bsd/netinet/flow_divert.h +++ b/bsd/netinet/flow_divert.h @@ -32,6 +32,7 @@ #include struct flow_divert_group; +struct 
flow_divert_trie_node; struct flow_divert_pcb { decl_lck_mtx_data(, mtx); @@ -60,6 +61,21 @@ struct flow_divert_pcb { RB_HEAD(fd_pcb_tree, flow_divert_pcb); +struct flow_divert_trie +{ + struct flow_divert_trie_node *nodes; + uint16_t *child_maps; + uint8_t *bytes; + void *memory; + size_t nodes_count; + size_t child_maps_count; + size_t bytes_count; + size_t nodes_free_next; + size_t child_maps_free_next; + size_t bytes_free_next; + uint16_t root; +}; + struct flow_divert_group { decl_lck_rw_data(, lck); struct fd_pcb_tree pcb_tree; @@ -68,6 +84,7 @@ struct flow_divert_group { MBUFQ_HEAD(send_queue_head) send_queue; uint8_t *token_key; size_t token_key_size; + struct flow_divert_trie signing_id_trie; }; void flow_divert_init(void); diff --git a/bsd/netinet/flow_divert_proto.h b/bsd/netinet/flow_divert_proto.h index a2b89bb8b..934746d01 100644 --- a/bsd/netinet/flow_divert_proto.h +++ b/bsd/netinet/flow_divert_proto.h @@ -40,7 +40,6 @@ #define FLOW_DIVERT_PKT_READ_NOTIFY 5 #define FLOW_DIVERT_PKT_GROUP_INIT 6 #define FLOW_DIVERT_PKT_PROPERTIES_UPDATE 7 -#define FLOW_DIVERT_PKT_APP_MAP_UPDATE 8 #define FLOW_DIVERT_PKT_APP_MAP_CREATE 9 #define FLOW_DIVERT_TLV_NIL 0 @@ -78,8 +77,6 @@ #define FLOW_DIVERT_TOKEN_GETOPT_MAX_SIZE 128 -#define FLOW_DIVERT_DNS_SERVICE_SIGNING_ID "com.apple.mDNSResponder" - #define FLOW_DIVERT_TOKEN_FLAG_VALIDATED 0x0000001 #define FLOW_DIVERT_TOKEN_FLAG_TFO 0x0000002 #define FLOW_DIVERT_TOKEN_FLAG_MPTCP 0x0000004 diff --git a/bsd/netinet/icmp6.h b/bsd/netinet/icmp6.h index 0dc3dda2f..52228f43c 100644 --- a/bsd/netinet/icmp6.h +++ b/bsd/netinet/icmp6.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2013 Apple Inc. All rights reserved. + * Copyright (c) 2000-2015 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -328,6 +328,7 @@ struct nd_opt_hdr { /* Neighbor discovery option header */ #define ND_OPT_PREFIX_INFORMATION 3 #define ND_OPT_REDIRECTED_HEADER 4 #define ND_OPT_MTU 5 +#define ND_OPT_NONCE 14 /* RFC 3971 */ #define ND_OPT_RDNSS 25 /* RFC 5006 */ #define ND_OPT_DNSSL 31 /* RFC 6106 */ @@ -347,6 +348,17 @@ struct nd_opt_prefix_info { /* prefix information */ #define ND_OPT_PI_FLAG_ONLINK 0x80 #define ND_OPT_PI_FLAG_AUTO 0x40 +#define ND_OPT_NONCE_LEN ((1 * 8) - 2) +#if ((ND_OPT_NONCE_LEN + 2) % 8) != 0 +#error "(ND_OPT_NONCE_LEN + 2) must be a multiple of 8." +#endif + +struct nd_opt_nonce { /* nonce option */ + u_int8_t nd_opt_nonce_type; + u_int8_t nd_opt_nonce_len; + u_int8_t nd_opt_nonce[ND_OPT_NONCE_LEN]; +} __attribute__((__packed__)); + struct nd_opt_rd_hdr { /* redirected header */ u_int8_t nd_opt_rh_type; u_int8_t nd_opt_rh_len; @@ -642,6 +654,7 @@ struct icmp6stat { u_quad_t icp6s_badrs; /* bad router advertisement */ u_quad_t icp6s_badra; /* bad router advertisement */ u_quad_t icp6s_badredirect; /* bad redirect message */ + u_quad_t icp6s_rfc6980_drop; /* NDP packet dropped based on RFC 6980 */ }; /* diff --git a/bsd/netinet/igmp.c b/bsd/netinet/igmp.c index 2b1859270..da146da81 100644 --- a/bsd/netinet/igmp.c +++ b/bsd/netinet/igmp.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2013 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -3958,22 +3958,16 @@ igmp_rec_type_to_str(const int type) switch (type) { case IGMP_CHANGE_TO_EXCLUDE_MODE: return "TO_EX"; - break; case IGMP_CHANGE_TO_INCLUDE_MODE: return "TO_IN"; - break; case IGMP_MODE_IS_EXCLUDE: return "MODE_EX"; - break; case IGMP_MODE_IS_INCLUDE: return "MODE_IN"; - break; case IGMP_ALLOW_NEW_SOURCES: return "ALLOW_NEW"; - break; case IGMP_BLOCK_OLD_SOURCES: return "BLOCK_OLD"; - break; default: break; } diff --git a/bsd/netinet/in.c b/bsd/netinet/in.c index 9f65560f8..267490025 100644 --- a/bsd/netinet/in.c +++ b/bsd/netinet/in.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2015 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -234,8 +234,9 @@ inaddr_local(struct in_addr in) /* * Return 1 if an internet address is for a ``local'' host * (one to which we have a connection). If subnetsarelocal - * is true, this includes other subnets of the local net. - * Otherwise, it includes only the directly-connected (sub)nets. + * is true, this includes other subnets of the local net, + * otherwise, it includes the directly-connected (sub)nets. + * The IPv4 link local prefix 169.254/16 is also included. 
*/ int in_localaddr(struct in_addr in) @@ -243,6 +244,9 @@ in_localaddr(struct in_addr in) u_int32_t i = ntohl(in.s_addr); struct in_ifaddr *ia; + if (IN_LINKLOCAL(i)) + return (1); + if (subnetsarelocal) { lck_rw_lock_shared(in_ifaddr_rwlock); for (ia = in_ifaddrhead.tqh_first; ia != NULL; @@ -725,7 +729,7 @@ inctl_ifaddr(struct ifnet *ifp, struct in_ifaddr *ia, u_long cmd, ev_msg.dv[0].data_length = sizeof (struct kev_in_data); ev_msg.dv[1].data_length = 0; - kev_post_msg(&ev_msg); + dlil_post_complete_msg(ifp, &ev_msg); } else { IFA_UNLOCK(&ia->ia_ifa); } @@ -827,7 +831,7 @@ inctl_ifaddr(struct ifnet *ifp, struct in_ifaddr *ia, u_long cmd, } /* Post the kernel event */ - kev_post_msg(&ev_msg); + dlil_post_complete_msg(ifp, &ev_msg); /* * See if there is any IPV4 address left and if so, @@ -946,7 +950,7 @@ inctl_ifdstaddr(struct ifnet *ifp, struct in_ifaddr *ia, u_long cmd, ev_msg.dv[0].data_length = sizeof (struct kev_in_data); ev_msg.dv[1].data_length = 0; - kev_post_msg(&ev_msg); + dlil_post_complete_msg(ifp, &ev_msg); lck_mtx_lock(rnh_lock); IFA_LOCK(&ia->ia_ifa); @@ -1041,7 +1045,7 @@ inctl_ifbrdaddr(struct ifnet *ifp, struct in_ifaddr *ia, u_long cmd, ev_msg.dv[0].data_length = sizeof (struct kev_in_data); ev_msg.dv[1].data_length = 0; - kev_post_msg(&ev_msg); + dlil_post_complete_msg(ifp, &ev_msg); break; default: @@ -1119,7 +1123,7 @@ inctl_ifnetmask(struct ifnet *ifp, struct in_ifaddr *ia, u_long cmd, ev_msg.dv[0].data_length = sizeof (struct kev_in_data); ev_msg.dv[1].data_length = 0; - kev_post_msg(&ev_msg); + dlil_post_complete_msg(ifp, &ev_msg); break; } diff --git a/bsd/netinet/in.h b/bsd/netinet/in.h index 497a03cea..f57b76805 100644 --- a/bsd/netinet/in.h +++ b/bsd/netinet/in.h @@ -505,7 +505,7 @@ struct ip_opts { #define MCAST_UNBLOCK_SOURCE 85 /* unblock a source */ #ifdef PRIVATE -#define IP_FORCE_OUT_IFP 69 /* deprecated; use IP_BOUND_IF instead */ +#define IP_FORCE_OUT_IFP 69 /* not implemented; use IP_BOUND_IF instead */ #define 
IP_NO_IFT_CELLULAR 6969 /* for internal use only */ #define IP_NO_IFT_PDP IP_NO_IFT_CELLULAR /* deprecated */ #define IP_OUT_IF 9696 /* for internal use only */ @@ -801,6 +801,42 @@ union sockaddr_in_4_6 { struct sockaddr_in sin; struct sockaddr_in6 sin6; }; + +/* + * Recommended DiffServ Code Point values + */ + +#define _DSCP_DF 0 /* RFC 2474 */ + +#define _DSCP_CS0 0 /* RFC 2474 */ +#define _DSCP_CS1 8 /* RFC 2474 */ +#define _DSCP_CS2 16 /* RFC 2474 */ +#define _DSCP_CS3 24 /* RFC 2474 */ +#define _DSCP_CS4 32 /* RFC 2474 */ +#define _DSCP_CS5 40 /* RFC 2474 */ +#define _DSCP_CS6 48 /* RFC 2474 */ +#define _DSCP_CS7 56 /* RFC 2474 */ + +#define _DSCP_EF 46 /* RFC 2474 */ +#define _DSCP_VA 44 /* RFC 5865 */ + +#define _DSCP_AF11 10 /* RFC 2597 */ +#define _DSCP_AF12 12 /* RFC 2597 */ +#define _DSCP_AF13 14 /* RFC 2597 */ +#define _DSCP_AF21 18 /* RFC 2597 */ +#define _DSCP_AF22 20 /* RFC 2597 */ +#define _DSCP_AF23 22 /* RFC 2597 */ +#define _DSCP_AF31 26 /* RFC 2597 */ +#define _DSCP_AF32 28 /* RFC 2597 */ +#define _DSCP_AF33 30 /* RFC 2597 */ +#define _DSCP_AF41 34 /* RFC 2597 */ +#define _DSCP_AF42 36 /* RFC 2597 */ +#define _DSCP_AF43 38 /* RFC 2597 */ + +#define _DSCP_52 52 /* Wi-Fi WMM Certification: Sigma */ + +#define _MAX_DSCP 63 /* coded on 6 bits */ + #endif /* PRIVATE */ #ifdef KERNEL diff --git a/bsd/netinet/in_arp.c b/bsd/netinet/in_arp.c index b889a8aaf..e7eafd51d 100644 --- a/bsd/netinet/in_arp.c +++ b/bsd/netinet/in_arp.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2015 Apple Inc. All rights reserved. + * Copyright (c) 2004-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -84,6 +84,9 @@ #include #include +#include +#include + #define CONST_LLADDR(s) ((const u_char*)((s)->sdl_data + (s)->sdl_nlen)) static const size_t MAX_HW_LEN = 10; @@ -104,7 +107,7 @@ static const size_t MAX_HW_LEN = 10; * * - Routing lock (rnh_lock) * - * la_hold, la_asked, la_llreach, la_lastused, la_flags + * la_holdq, la_asked, la_llreach, la_lastused, la_flags * * - Routing entry lock (rt_lock) * @@ -122,20 +125,28 @@ struct llinfo_arp { /* * The following are protected by rt_lock */ - struct mbuf *la_hold; /* last packet until resolved/timeout */ + class_queue_t la_holdq; /* packets awaiting resolution */ struct if_llreach *la_llreach; /* link-layer reachability record */ u_int64_t la_lastused; /* last used timestamp */ u_int32_t la_asked; /* # of requests sent */ u_int32_t la_maxtries; /* retry limit */ - uint32_t la_flags; + u_int64_t la_probeexp; /* probe deadline timestamp */ + u_int32_t la_flags; #define LLINFO_RTRFAIL_EVTSENT 0x1 /* sent an ARP event */ +#define LLINFO_PROBING 0x2 /* waiting for an ARP reply */ }; static LIST_HEAD(, llinfo_arp) llinfo_arp; +static thread_call_t arp_timeout_tcall; static int arp_timeout_run; /* arp_timeout is scheduled to run */ -static void arp_timeout(void *); +static void arp_timeout(thread_call_param_t arg0, thread_call_param_t arg1); static void arp_sched_timeout(struct timeval *); +static thread_call_t arp_probe_tcall; +static int arp_probe_run; /* arp_probe is scheduled to run */ +static void arp_probe(thread_call_param_t arg0, thread_call_param_t arg1); +static void arp_sched_probe(struct timeval *); + static void arptfree(struct llinfo_arp *, void *); static errno_t arp_lookup_route(const struct in_addr *, int, int, route_t *, unsigned int); @@ -143,6 +154,7 @@ static int arp_getstat SYSCTL_HANDLER_ARGS; static struct llinfo_arp *arp_llinfo_alloc(int); static void arp_llinfo_free(void *); +static uint32_t arp_llinfo_flushq(struct llinfo_arp *); static void 
arp_llinfo_purge(struct rtentry *); static void arp_llinfo_get_ri(struct rtentry *, struct rt_reach_info *); static void arp_llinfo_get_iflri(struct rtentry *, struct ifnet_llreach_info *); @@ -160,11 +172,15 @@ static int arpinit_done; SYSCTL_DECL(_net_link_ether); SYSCTL_NODE(_net_link_ether, PF_INET, inet, CTLFLAG_RW|CTLFLAG_LOCKED, 0, ""); -/* timer values */ static int arpt_prune = (5*60*1); /* walk list every 5 minutes */ SYSCTL_INT(_net_link_ether_inet, OID_AUTO, prune_intvl, CTLFLAG_RW | CTLFLAG_LOCKED, &arpt_prune, 0, ""); +#define ARP_PROBE_TIME 7 /* seconds */ +static u_int32_t arpt_probe = ARP_PROBE_TIME; +SYSCTL_UINT(_net_link_ether_inet, OID_AUTO, probe_intvl, + CTLFLAG_RW | CTLFLAG_LOCKED, &arpt_probe, 0, ""); + static int arpt_keep = (20*60); /* once resolved, good for 20 more minutes */ SYSCTL_INT(_net_link_ether_inet, OID_AUTO, max_age, CTLFLAG_RW | CTLFLAG_LOCKED, &arpt_keep, 0, ""); @@ -173,12 +189,12 @@ static int arpt_down = 20; /* once declared down, don't send for 20 sec */ SYSCTL_INT(_net_link_ether_inet, OID_AUTO, host_down_time, CTLFLAG_RW | CTLFLAG_LOCKED, &arpt_down, 0, ""); -static int arp_llreach_base = (LL_BASE_REACHABLE / 1000); /* seconds */ +static int arp_llreach_base = 120; /* seconds */ SYSCTL_INT(_net_link_ether_inet, OID_AUTO, arp_llreach_base, - CTLFLAG_RW | CTLFLAG_LOCKED, &arp_llreach_base, LL_BASE_REACHABLE, + CTLFLAG_RW | CTLFLAG_LOCKED, &arp_llreach_base, 0, "default ARP link-layer reachability max lifetime (in seconds)"); -#define ARP_UNICAST_LIMIT 5 /* # of probes until ARP refresh broadcast */ +#define ARP_UNICAST_LIMIT 3 /* # of probes until ARP refresh broadcast */ static u_int32_t arp_unicast_lim = ARP_UNICAST_LIMIT; SYSCTL_INT(_net_link_ether_inet, OID_AUTO, arp_unicast_lim, CTLFLAG_RW | CTLFLAG_LOCKED, &arp_unicast_lim, ARP_UNICAST_LIMIT, @@ -188,6 +204,10 @@ static u_int32_t arp_maxtries = 5; SYSCTL_INT(_net_link_ether_inet, OID_AUTO, maxtries, CTLFLAG_RW | CTLFLAG_LOCKED, &arp_maxtries, 0, ""); +static 
u_int32_t arp_maxhold = 16; +SYSCTL_UINT(_net_link_ether_inet, OID_AUTO, maxhold, + CTLFLAG_RW | CTLFLAG_LOCKED, &arp_maxhold, 0, ""); + static int useloopback = 1; /* use loopback interface for local traffic */ SYSCTL_INT(_net_link_ether_inet, OID_AUTO, useloopback, CTLFLAG_RW | CTLFLAG_LOCKED, &useloopback, 0, ""); @@ -222,19 +242,16 @@ static int arp_verbose; SYSCTL_INT(_net_link_ether_inet, OID_AUTO, verbose, CTLFLAG_RW | CTLFLAG_LOCKED, &arp_verbose, 0, ""); -struct arpstat arpstat; +/* + * Generally protected by rnh_lock; use atomic operations on fields + * that are also modified outside of that lock (if needed). + */ +struct arpstat arpstat __attribute__((aligned(sizeof (uint64_t)))); SYSCTL_PROC(_net_link_ether_inet, OID_AUTO, stats, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, arp_getstat, "S,arpstat", "ARP statistics (struct arpstat, net/if_arp.h)"); -/* these are deprecated (read-only); use net.link.generic.system node instead */ -SYSCTL_INT(_net_link_ether_inet, OID_AUTO, apple_hwcksum_tx, - CTLFLAG_RD | CTLFLAG_LOCKED, &hwcksum_tx, 0, ""); - -SYSCTL_INT(_net_link_ether_inet, OID_AUTO, apple_hwcksum_rx, - CTLFLAG_RD | CTLFLAG_LOCKED, &hwcksum_rx, 0, ""); - static struct zone *llinfo_arp_zone; #define LLINFO_ARP_ZONE_MAX 256 /* maximum elements in zone */ #define LLINFO_ARP_ZONE_NAME "llinfo_arp" /* name for zone */ @@ -265,8 +282,16 @@ arp_llinfo_alloc(int how) la = (how == M_WAITOK) ? zalloc(llinfo_arp_zone) : zalloc_noblock(llinfo_arp_zone); - if (la != NULL) + if (la != NULL) { bzero(la, sizeof (*la)); + /* + * The type of queue (Q_DROPHEAD) here is just a hint; + * the actual logic that works on this queue performs + * a head drop, details in arp_llinfo_addq(). + */ + _qinit(&la->la_holdq, Q_DROPHEAD, (arp_maxhold == 0) ? 
+ (uint32_t)-1 : arp_maxhold); + } return (la); } @@ -281,12 +306,8 @@ arp_llinfo_free(void *arg) /* NOTREACHED */ } - /* Just in case there's anything there, free it */ - if (la->la_hold != NULL) { - m_freem(la->la_hold); - la->la_hold = NULL; - arpstat.purged++; - } + /* Free any held packets */ + (void) arp_llinfo_flushq(la); /* Purge any link-layer info caching */ VERIFY(la->la_rt->rt_llinfo == la); @@ -296,6 +317,46 @@ arp_llinfo_free(void *arg) zfree(llinfo_arp_zone, la); } +static void +arp_llinfo_addq(struct llinfo_arp *la, struct mbuf *m) +{ + if (qlen(&la->la_holdq) >= qlimit(&la->la_holdq)) { + struct mbuf *_m; + /* prune less than CTL, else take what's at the head */ + _m = _getq_scidx_lt(&la->la_holdq, SCIDX_CTL); + if (_m == NULL) + _m = _getq(&la->la_holdq); + VERIFY(_m != NULL); + if (arp_verbose) { + log(LOG_DEBUG, "%s: dropping packet (scidx %u)\n", + __func__, MBUF_SCIDX(mbuf_get_service_class(_m))); + } + m_freem(_m); + atomic_add_32(&arpstat.dropped, 1); + atomic_add_32(&arpstat.held, -1); + } + _addq(&la->la_holdq, m); + atomic_add_32(&arpstat.held, 1); + if (arp_verbose) { + log(LOG_DEBUG, "%s: enqueued packet (scidx %u), qlen now %u\n", + __func__, MBUF_SCIDX(mbuf_get_service_class(m)), + qlen(&la->la_holdq)); + } +} + +static uint32_t +arp_llinfo_flushq(struct llinfo_arp *la) +{ + uint32_t held = qlen(&la->la_holdq); + + atomic_add_32(&arpstat.purged, held); + atomic_add_32(&arpstat.held, -held); + _flushq(&la->la_holdq); + VERIFY(qempty(&la->la_holdq)); + + return (held); +} + static void arp_llinfo_purge(struct rtentry *rt) { @@ -371,9 +432,8 @@ arp_llinfo_refresh(struct rtentry *rt) return; } - if (rt->rt_expire > timenow + arp_unicast_lim) { - rt->rt_expire = timenow + arp_unicast_lim; - } + if (rt->rt_expire > timenow) + rt->rt_expire = timenow; return; } @@ -514,10 +574,6 @@ arp_llreach_alloc(struct rtentry *rt, struct ifnet *ifp, void *addr, } } - /* Bump up retry ceiling to accomodate unicast retries */ - if (lr != NULL) - 
la->la_maxtries = arp_maxtries + arp_unicast_lim; - if (arp_verbose > 1 && lr != NULL && why != NULL) { char tmp[MAX_IPv4_STR_LEN]; @@ -529,11 +585,14 @@ arp_llreach_alloc(struct rtentry *rt, struct ifnet *ifp, void *addr, } struct arptf_arg { - int draining; + boolean_t draining; + boolean_t probing; uint32_t killed; uint32_t aging; uint32_t sticky; uint32_t found; + uint32_t qlen; + uint32_t qsize; }; /* @@ -544,6 +603,7 @@ arptfree(struct llinfo_arp *la, void *arg) { struct arptf_arg *ap = arg; struct rtentry *rt = la->la_rt; + uint64_t timenow; lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_OWNED); @@ -554,6 +614,20 @@ arptfree(struct llinfo_arp *la, void *arg) VERIFY(rt->rt_expire != 0 || rt->rt_rmx.rmx_expire == 0); ap->found++; + timenow = net_uptime(); + + /* If we're probing, flush out held packets upon probe expiration */ + if (ap->probing && (la->la_flags & LLINFO_PROBING) && + la->la_probeexp <= timenow) { + struct sockaddr_dl *sdl = SDL(rt->rt_gateway); + if (sdl != NULL) + sdl->sdl_alen = 0; + (void) arp_llinfo_flushq(la); + } + + ap->qlen += qlen(&la->la_holdq); + ap->qsize += qsize(&la->la_holdq); + if (rt->rt_expire == 0 || (rt->rt_flags & RTF_STATIC)) { ap->sticky++; /* ARP entry is permanent? */ @@ -564,7 +638,7 @@ arptfree(struct llinfo_arp *la, void *arg) } /* ARP entry hasn't expired and we're not draining? */ - if (!ap->draining && rt->rt_expire > net_uptime()) { + if (!ap->draining && rt->rt_expire > timenow) { RT_UNLOCK(rt); ap->aging++; return; @@ -576,7 +650,7 @@ arptfree(struct llinfo_arp *la, void *arg) * If we're not draining, force ARP query to be * generated next time this entry is used. 
*/ - if (!ap->draining) { + if (!ap->draining && !ap->probing) { struct sockaddr_dl *sdl = SDL(rt->rt_gateway); if (sdl != NULL) sdl->sdl_alen = 0; @@ -584,7 +658,7 @@ arptfree(struct llinfo_arp *la, void *arg) rt->rt_flags &= ~RTF_REJECT; } RT_UNLOCK(rt); - } else if (!(rt->rt_flags & RTF_STATIC)) { + } else if (!(rt->rt_flags & RTF_STATIC) && !ap->probing) { /* * ARP entry has no outstanding refcnt, and we're either * draining or it has expired; delete it from the routing @@ -616,14 +690,16 @@ in_arpdrain(void *arg) lck_mtx_lock(rnh_lock); la = llinfo_arp.lh_first; bzero(&farg, sizeof (farg)); - farg.draining = 1; + farg.draining = TRUE; while ((ola = la) != NULL) { la = la->la_le.le_next; arptfree(ola, &farg); } if (arp_verbose) { - log(LOG_DEBUG, "%s: found %u, aging %u, sticky %u, killed %u\n", - __func__, farg.found, farg.aging, farg.sticky, farg.killed); + log(LOG_DEBUG, "%s: found %u, aging %u, sticky %u, killed %u; " + "%u pkts held (%u bytes)\n", __func__, farg.found, + farg.aging, farg.sticky, farg.killed, farg.qlen, + farg.qsize); } lck_mtx_unlock(rnh_lock); } @@ -632,9 +708,9 @@ in_arpdrain(void *arg) * Timeout routine. Age arp_tab entries periodically. 
*/ static void -arp_timeout(void *arg) +arp_timeout(thread_call_param_t arg0, thread_call_param_t arg1) { -#pragma unused(arg) +#pragma unused(arg0, arg1) struct llinfo_arp *la, *ola; struct timeval atv; struct arptf_arg farg; @@ -647,11 +723,13 @@ arp_timeout(void *arg) arptfree(ola, &farg); } if (arp_verbose) { - log(LOG_DEBUG, "%s: found %u, aging %u, sticky %u, killed %u\n", - __func__, farg.found, farg.aging, farg.sticky, farg.killed); + log(LOG_DEBUG, "%s: found %u, aging %u, sticky %u, killed %u; " + "%u pkts held (%u bytes)\n", __func__, farg.found, + farg.aging, farg.sticky, farg.killed, farg.qlen, + farg.qsize); } atv.tv_usec = 0; - atv.tv_sec = arpt_prune; + atv.tv_sec = MAX(arpt_prune, 5); /* re-arm the timer if there's work to do */ arp_timeout_run = 0; if (farg.aging > 0) @@ -668,6 +746,13 @@ arp_sched_timeout(struct timeval *atv) if (!arp_timeout_run) { struct timeval tv; + uint64_t deadline = 0; + + if (arp_timeout_tcall == NULL) { + arp_timeout_tcall = + thread_call_allocate(arp_timeout, NULL); + VERIFY(arp_timeout_tcall != NULL); + } if (atv == NULL) { tv.tv_usec = 0; @@ -680,7 +765,79 @@ arp_sched_timeout(struct timeval *atv) (uint64_t)atv->tv_sec, (uint64_t)atv->tv_usec); } arp_timeout_run = 1; - timeout(arp_timeout, NULL, tvtohz(atv)); + + clock_deadline_for_periodic_event(atv->tv_sec * NSEC_PER_SEC, + mach_absolute_time(), &deadline); + (void) thread_call_enter_delayed(arp_timeout_tcall, deadline); + } +} + +/* + * Probe routine. 
+ */ +static void +arp_probe(thread_call_param_t arg0, thread_call_param_t arg1) +{ +#pragma unused(arg0, arg1) + struct llinfo_arp *la, *ola; + struct timeval atv; + struct arptf_arg farg; + + lck_mtx_lock(rnh_lock); + la = llinfo_arp.lh_first; + bzero(&farg, sizeof (farg)); + farg.probing = TRUE; + while ((ola = la) != NULL) { + la = la->la_le.le_next; + arptfree(ola, &farg); + } + if (arp_verbose) { + log(LOG_DEBUG, "%s: found %u, aging %u, sticky %u, killed %u; " + "%u pkts held (%u bytes)\n", __func__, farg.found, + farg.aging, farg.sticky, farg.killed, farg.qlen, + farg.qsize); + } + atv.tv_usec = 0; + atv.tv_sec = MAX(arpt_probe, ARP_PROBE_TIME); + /* re-arm the probe if there's work to do */ + arp_probe_run = 0; + if (farg.qlen > 0) + arp_sched_probe(&atv); + else if (arp_verbose) + log(LOG_DEBUG, "%s: not rescheduling probe\n", __func__); + lck_mtx_unlock(rnh_lock); +} + +static void +arp_sched_probe(struct timeval *atv) +{ + lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_OWNED); + + if (!arp_probe_run) { + struct timeval tv; + uint64_t deadline = 0; + + if (arp_probe_tcall == NULL) { + arp_probe_tcall = + thread_call_allocate(arp_probe, NULL); + VERIFY(arp_probe_tcall != NULL); + } + + if (atv == NULL) { + tv.tv_usec = 0; + tv.tv_sec = MAX(arpt_probe, ARP_PROBE_TIME); + atv = &tv; + } + if (arp_verbose) { + log(LOG_DEBUG, "%s: probe scheduled in " + "T+%llus.%lluu\n", __func__, + (uint64_t)atv->tv_sec, (uint64_t)atv->tv_usec); + } + arp_probe_run = 1; + + clock_deadline_for_periodic_event(atv->tv_sec * NSEC_PER_SEC, + mach_absolute_time(), &deadline); + (void) thread_call_enter_delayed(arp_probe_tcall, deadline); } } @@ -888,11 +1045,7 @@ arp_rtrequest(int req, struct rtentry *rt, struct sockaddr *sa) rt->rt_llinfo_purge(rt); rt->rt_flags &= ~RTF_LLINFO; - if (la->la_hold != NULL) { - m_freem(la->la_hold); - la->la_hold = NULL; - arpstat.purged++; - } + (void) arp_llinfo_flushq(la); } } @@ -1022,14 +1175,17 @@ arp_lookup_ip(ifnet_t ifp, const struct sockaddr_in 
*net_dest, errno_t result = 0; struct sockaddr_dl *gateway; struct llinfo_arp *llinfo = NULL; + boolean_t usable, probing = FALSE; uint64_t timenow; - int unreachable = 0; struct if_llreach *lr; struct ifaddr *rt_ifa; struct sockaddr *sa; uint32_t rtflags; struct sockaddr_dl sdl; + if (ifp == NULL || net_dest == NULL) + return (EINVAL); + if (net_dest->sin_family != AF_INET) return (EAFNOSUPPORT); @@ -1052,7 +1208,8 @@ arp_lookup_ip(ifnet_t ifp, const struct sockaddr_in *net_dest, RT_LOCK_ASSERT_HELD(route); } - if (packet->m_flags & M_BCAST) { + if ((packet != NULL && (packet->m_flags & M_BCAST)) || + in_broadcast(net_dest->sin_addr, ifp)) { size_t broadcast_len; bzero(ll_dest, ll_dest_len); result = ifnet_llbroadcast_copy_bytes(ifp, LLADDR(ll_dest), @@ -1065,7 +1222,9 @@ arp_lookup_ip(ifnet_t ifp, const struct sockaddr_in *net_dest, } goto release; } - if (packet->m_flags & M_MCAST) { + if ((packet != NULL && (packet->m_flags & M_MCAST)) || + ((ifp->if_flags & IFF_MULTICAST) && + IN_MULTICAST(ntohl(net_dest->sin_addr.s_addr)))) { if (route != NULL) RT_UNLOCK(route); result = dlil_resolve_multi(ifp, @@ -1123,26 +1282,48 @@ arp_lookup_ip(ifnet_t ifp, const struct sockaddr_in *net_dest, timenow = net_uptime(); VERIFY(route->rt_expire == 0 || route->rt_rmx.rmx_expire != 0); VERIFY(route->rt_expire != 0 || route->rt_rmx.rmx_expire == 0); - if ((route->rt_expire == 0 || - route->rt_expire > timenow) && gateway != NULL && - gateway->sdl_family == AF_LINK && gateway->sdl_alen != 0 && - !(unreachable = !arp_llreach_reachable(llinfo))) { + + usable = ((route->rt_expire == 0 || route->rt_expire > timenow) && + gateway != NULL && gateway->sdl_family == AF_LINK && + gateway->sdl_alen != 0); + + if (usable) { + boolean_t unreachable = !arp_llreach_reachable(llinfo); + + /* Entry is usable, so fill in info for caller */ bcopy(gateway, ll_dest, MIN(gateway->sdl_len, ll_dest_len)); result = 0; arp_llreach_use(llinfo); /* Mark use timestamp */ - /* - * Start the unicast probe 
right before the entry expires. - */ + lr = llinfo->la_llreach; if (lr == NULL) goto release; rt_ifa = route->rt_ifa; + /* Become a regular mutex, just in case */ RT_CONVERT_LOCK(route); IFLR_LOCK_SPIN(lr); - if (route->rt_expire <= timenow + arp_unicast_lim && - ifp->if_addrlen == IF_LLREACH_MAXLEN && - lr->lr_probes <= arp_unicast_lim) { + + if ((unreachable || (llinfo->la_flags & LLINFO_PROBING)) && + lr->lr_probes < arp_unicast_lim) { + /* + * Thus mark the entry with la_probeexp deadline to + * trigger the probe timer to be scheduled (if not + * already). This gets cleared the moment we get + * an ARP reply. + */ + probing = TRUE; + if (lr->lr_probes == 0) { + llinfo->la_probeexp = (timenow + arpt_probe); + llinfo->la_flags |= LLINFO_PROBING; + } + + /* + * Start the unicast probe and anticipate a reply; + * afterwards, return existing entry to caller and + * let it be used anyway. If peer is non-existent + * we'll broadcast ARP next time around. + */ lr->lr_probes++; bzero(&sdl, sizeof (sdl)); sdl.sdl_alen = ifp->if_addrlen; @@ -1160,14 +1341,19 @@ arp_lookup_ip(ifnet_t ifp, const struct sockaddr_in *net_dest, (const struct sockaddr *)net_dest, rtflags); IFA_REMREF(rt_ifa); RT_LOCK(route); - } else + goto release; + } else { IFLR_UNLOCK(lr); - goto release; - } else if (unreachable) { - /* - * Discard existing answer in case we need to probe. - */ - gateway->sdl_alen = 0; + if (!unreachable && + !(llinfo->la_flags & LLINFO_PROBING)) { + /* + * Normal case where peer is still reachable, + * we're not probing and if_addrlen is anything + * but IF_LLREACH_MAXLEN. + */ + goto release; + } + } } if (ifp->if_flags & IFF_NOARP) { @@ -1176,16 +1362,28 @@ arp_lookup_ip(ifnet_t ifp, const struct sockaddr_in *net_dest, } /* - * Route wasn't complete/valid. We need to arp. + * Route wasn't complete/valid; we need to send out ARP request. + * If we've exceeded the limit of la_holdq, drop from the head + * of queue and add this packet to the tail. 
If we end up with + * RTF_REJECT below, we'll dequeue this from tail and have the + * caller free the packet instead. It's safe to do that since + * we still hold the route's rt_lock. */ - if (packet != NULL) { - if (llinfo->la_hold != NULL) { - m_freem(llinfo->la_hold); - arpstat.dropped++; - } - llinfo->la_hold = packet; - } + if (packet != NULL) + arp_llinfo_addq(llinfo, packet); + /* + * Regardless of permanent vs. expirable entry, we need to + * avoid having packets sit in la_holdq forever; thus mark the + * entry with la_probeexp deadline to trigger the probe timer + * to be scheduled (if not already). This gets cleared the + * moment we get an ARP reply. + */ + probing = TRUE; + if (qlen(&llinfo->la_holdq) == 1) { + llinfo->la_probeexp = (timenow + arpt_probe); + llinfo->la_flags |= LLINFO_PROBING; + } if (route->rt_expire) { route->rt_flags &= ~RTF_REJECT; if (llinfo->la_asked == 0 || route->rt_expire != timenow) { @@ -1224,7 +1422,7 @@ arp_lookup_ip(ifnet_t ifp, const struct sockaddr_in *net_dest, IFA_REMREF(rt_ifa); if (sendkev) { bzero(&ev_msg, sizeof(ev_msg)); - bzero(&in_arpfailure, + bzero(&in_arpfailure, sizeof(in_arpfailure)); in_arpfailure.link_data.if_family = ifp->if_family; @@ -1240,8 +1438,8 @@ arp_lookup_ip(ifnet_t ifp, const struct sockaddr_in *net_dest, ev_msg.dv[0].data_ptr = &in_arpfailure; ev_msg.dv[0].data_length = sizeof(struct - kev_in_arpfailure); - kev_post_msg(&ev_msg); + kev_in_arpfailure); + dlil_post_complete_msg(NULL, &ev_msg); } result = EJUSTRETURN; RT_LOCK(route); @@ -1252,23 +1450,31 @@ arp_lookup_ip(ifnet_t ifp, const struct sockaddr_in *net_dest, route->rt_expire + arpt_down); llinfo->la_asked = 0; /* - * Clear la_hold; don't free the packet since - * we're not returning EJUSTRETURN; the caller - * will handle the freeing. + * Remove the packet that was just added above; + * don't free it since we're not returning + * EJUSTRETURN. The caller will handle the + * freeing. 
Since we haven't dropped rt_lock + * from the time of _addq() above, this packet + * must be at the tail. */ - llinfo->la_hold = NULL; + if (packet != NULL) { + struct mbuf *_m = + _getq_tail(&llinfo->la_holdq); + atomic_add_32(&arpstat.held, -1); + VERIFY(_m == packet); + } result = EHOSTUNREACH; goto release; } } } - /* The packet is now held inside la_hold (can "packet" be NULL?) */ + /* The packet is now held inside la_holdq */ result = EJUSTRETURN; release: if (result == EHOSTUNREACH) - arpstat.dropped++; + atomic_add_32(&arpstat.dropped, 1); if (route != NULL) { if (route == hint) { @@ -1279,6 +1485,12 @@ arp_lookup_ip(ifnet_t ifp, const struct sockaddr_in *net_dest, rtfree(route); } } + if (probing) { + /* Do this after we drop rt_lock to preserve ordering */ + lck_mtx_lock(rnh_lock); + arp_sched_probe(NULL); + lck_mtx_unlock(rnh_lock); + } return (result); } @@ -1301,6 +1513,11 @@ arp_ip_handle_input(ifnet_t ifp, u_short arpop, int created_announcement = 0; int bridged = 0, is_bridge = 0; + /* + * Here and other places within this routine where we don't hold + * rnh_lock, trade accuracy for speed for the common scenarios + * and avoid the use of atomic updates. 
+ */ arpstat.received++; /* Do not respond to requests for 0.0.0.0 */ @@ -1449,8 +1666,8 @@ arp_ip_handle_input(ifnet_t ifp, u_short arpop, ev_msg.dv[0].data_length = sizeof (struct kev_in_collision) + in_collision->hw_len; ev_msg.dv[1].data_length = 0; - kev_post_msg(&ev_msg); - arpstat.dupips++; + dlil_post_complete_msg(NULL, &ev_msg); + atomic_add_32(&arpstat.dupips, 1); goto respond; } @@ -1550,7 +1767,7 @@ arp_ip_handle_input(ifnet_t ifp, u_short arpop, (const struct sockaddr *)target_ip); IFA_REMREF(ifa); ifa = NULL; - arpstat.txconflicts++; + atomic_add_32(&arpstat.txconflicts, 1); } goto respond; } else if (keep_announcements != 0 && @@ -1715,7 +1932,7 @@ arp_ip_handle_input(ifnet_t ifp, u_short arpop, llinfo = route->rt_llinfo; /* send a notification that the route is back up */ if (ifp->if_addrlen == IF_LLREACH_MAXLEN && - route->rt_flags & RTF_ROUTER && + route->rt_flags & RTF_ROUTER && llinfo->la_flags & LLINFO_RTRFAIL_EVTSENT) { struct kev_msg ev_msg; struct kev_in_arpalive in_arpalive; @@ -1732,15 +1949,23 @@ arp_ip_handle_input(ifnet_t ifp, u_short arpop, ev_msg.kev_subclass = KEV_INET_SUBCLASS; ev_msg.event_code = KEV_INET_ARPRTRALIVE; ev_msg.dv[0].data_ptr = &in_arpalive; - ev_msg.dv[0].data_length = sizeof(struct kev_in_arpalive); - kev_post_msg(&ev_msg); + ev_msg.dv[0].data_length = sizeof(struct kev_in_arpalive); + dlil_post_complete_msg(NULL, &ev_msg); RT_LOCK(route); } - /* update the llinfo, send a queued packet if there is one */ + /* Update the llinfo, send out all queued packets at once */ llinfo->la_asked = 0; - if (llinfo->la_hold) { - struct mbuf *m0 = llinfo->la_hold; - llinfo->la_hold = NULL; + llinfo->la_flags &= ~LLINFO_PROBING; + if (!qempty(&llinfo->la_holdq)) { + uint32_t held; + struct mbuf *m0 = + _getq_all(&llinfo->la_holdq, NULL, &held, NULL); + if (arp_verbose) { + log(LOG_DEBUG, "%s: sending %u held packets\n", + __func__, held); + } + atomic_add_32(&arpstat.held, -held); + VERIFY(qempty(&llinfo->la_holdq)); 
RT_UNLOCK(route); dlil_output(ifp, PF_INET, m0, (caddr_t)route, rt_key(route), 0, NULL); @@ -1762,6 +1987,7 @@ arp_ip_handle_input(ifnet_t ifp, u_short arpop, if (arpop != ARPOP_REQUEST) goto done; + /* See comments at the beginning of this routine */ arpstat.rxrequests++; /* If we are not the target, check if we should proxy */ diff --git a/bsd/netinet/in_gif.c b/bsd/netinet/in_gif.c index 0cd7a2287..345440463 100644 --- a/bsd/netinet/in_gif.c +++ b/bsd/netinet/in_gif.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2013 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -111,7 +111,8 @@ in_gif_output( int proto, error; u_int8_t tos; struct ip_out_args ipoa = - { IFSCOPE_NONE, { 0 }, IPOAF_SELECT_SRCIF, 0 }; + { IFSCOPE_NONE, { 0 }, IPOAF_SELECT_SRCIF, 0, SO_TC_UNSPEC, + _NET_SERVICE_TYPE_UNSPEC }; GIF_LOCK_ASSERT(sc); @@ -232,9 +233,7 @@ in_gif_output( } void -in_gif_input(m, off) - struct mbuf *m; - int off; +in_gif_input(struct mbuf *m, int off) { struct ifnet *gifp = NULL; struct ip *ip; diff --git a/bsd/netinet/in_mcast.c b/bsd/netinet/in_mcast.c index 320c7394a..1d1b56563 100644 --- a/bsd/netinet/in_mcast.c +++ b/bsd/netinet/in_mcast.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010-2013 Apple Inc. All rights reserved. + * Copyright (c) 2010-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -1544,7 +1544,6 @@ inp_block_unblock_source(struct inpcb *inp, struct sockopt *sopt) IGMP_PRINTF(("%s: unknown sopt_name %d\n", __func__, sopt->sopt_name)); return (EOPNOTSUPP); - break; } if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) @@ -2182,7 +2181,6 @@ inp_join_group(struct inpcb *inp, struct sockopt *sopt) IGMP_PRINTF(("%s: unknown sopt_name %d\n", __func__, sopt->sopt_name)); return (EOPNOTSUPP); - break; } if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) @@ -2309,10 +2307,24 @@ inp_join_group(struct inpcb *inp, struct sockopt *sopt) /* * Begin state merge transaction at IGMP layer. */ - if (is_new) { + /* + * Unlock socket as we may end up calling ifnet_ioctl() to join (or leave) + * the multicast group and we run the risk of a lock ordering issue + * if the ifnet thread calls into the socket layer to acquire the pcb list + * lock while the input thread delivers multicast packets + */ + IMO_ADDREF_LOCKED(imo); + IMO_UNLOCK(imo); + socket_unlock(inp->inp_socket, 0); + VERIFY(inm == NULL); error = in_joingroup(ifp, &gsa->sin.sin_addr, imf, &inm); + + socket_lock(inp->inp_socket, 0); + IMO_REMREF(imo); + IMO_LOCK(imo); + VERIFY(inm != NULL || error != 0); if (error) goto out_imo_free; @@ -2484,7 +2496,6 @@ inp_leave_group(struct inpcb *inp, struct sockopt *sopt) IGMP_PRINTF(("%s: unknown sopt_name %d\n", __func__, sopt->sopt_name)); return (EOPNOTSUPP); - break; } if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) @@ -2548,6 +2559,7 @@ inp_leave_group(struct inpcb *inp, struct sockopt *sopt) * Begin state merge transaction at IGMP layer. */ + if (is_final) { /* * Give up the multicast address record to which @@ -2583,10 +2595,23 @@ inp_leave_group(struct inpcb *inp, struct sockopt *sopt) imf_reap(imf); if (is_final) { - /* Remove the gap in the membership and filter array. */ + /* Remove the gap in the membership array. 
*/ VERIFY(inm == imo->imo_membership[idx]); imo->imo_membership[idx] = NULL; + + /* + * See inp_join_group() for why we need to unlock + */ + IMO_ADDREF_LOCKED(imo); + IMO_UNLOCK(imo); + socket_unlock(inp->inp_socket, 0); + INM_REMREF(inm); + + socket_lock(inp->inp_socket, 0); + IMO_REMREF(imo); + IMO_LOCK(imo); + for (++idx; idx < imo->imo_num_memberships; ++idx) { imo->imo_membership[idx-1] = imo->imo_membership[idx]; imo->imo_mfilters[idx-1] = imo->imo_mfilters[idx]; diff --git a/bsd/netinet/in_pcb.c b/bsd/netinet/in_pcb.c index dce4177d4..4bb25be28 100644 --- a/bsd/netinet/in_pcb.c +++ b/bsd/netinet/in_pcb.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2015 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -78,6 +78,7 @@ #include #include #include +#include #include #include @@ -108,10 +109,16 @@ #include #include +#include + #if NECP #include #endif +#include +#include +#include + static lck_grp_t *inpcb_lock_grp; static lck_attr_t *inpcb_lock_attr; static lck_grp_attr_t *inpcb_lock_grp_attr; @@ -124,6 +131,7 @@ static u_int16_t inpcb_timeout_run = 0; /* INPCB timer is scheduled to run */ static boolean_t inpcb_garbage_collecting = FALSE; /* gc timer is scheduled */ static boolean_t inpcb_ticking = FALSE; /* "slow" timer is scheduled */ static boolean_t inpcb_fast_timer_on = FALSE; +static boolean_t intcoproc_unrestricted = FALSE; /* * If the total number of gc reqs is above a threshold, schedule @@ -132,11 +140,13 @@ static boolean_t inpcb_fast_timer_on = FALSE; static boolean_t inpcb_toomany_gcreq = FALSE; #define INPCB_GCREQ_THRESHOLD 50000 -#define INPCB_TOOMANY_GCREQ_TIMER (hz/10) /* 10 times a second */ -static void inpcb_sched_timeout(struct timeval *); -static void inpcb_timeout(void *); -int inpcb_timeout_lazy = 10; /* 10 seconds leeway for lazy timers */ +static thread_call_t inpcb_thread_call, inpcb_fast_thread_call; +static void inpcb_sched_timeout(void); 
+static void inpcb_sched_lazy_timeout(void); +static void _inpcb_sched_timeout(unsigned int); +static void inpcb_timeout(void *, void *); +const int inpcb_timeout_lazy = 10; /* 10 seconds leeway for lazy timers */ extern int tvtohz(struct timeval *); #if CONFIG_PROC_UUID_POLICY @@ -206,6 +216,11 @@ SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, CTLTYPE_INT|CTLFLAG_RW | CTLFLAG_LOCKED, &ipport_hilastauto, 0, &sysctl_net_ipport_check, "I", ""); +static uint32_t apn_fallbk_debug = 0; +#define apn_fallbk_log(x) do { if (apn_fallbk_debug >= 1) log x; } while (0) + +static boolean_t apn_fallbk_enabled = FALSE; + extern int udp_use_randomport; extern int tcp_use_randomport; @@ -272,6 +287,12 @@ in_pcbinit(void) inpcb_lock_attr = lck_attr_alloc_init(); lck_mtx_init(&inpcb_lock, inpcb_lock_grp, inpcb_lock_attr); lck_mtx_init(&inpcb_timeout_lock, inpcb_lock_grp, inpcb_lock_attr); + inpcb_thread_call = thread_call_allocate_with_priority(inpcb_timeout, + NULL, THREAD_CALL_PRIORITY_KERNEL); + inpcb_fast_thread_call = thread_call_allocate_with_priority( + inpcb_timeout, NULL, THREAD_CALL_PRIORITY_KERNEL); + if (inpcb_thread_call == NULL || inpcb_fast_thread_call == NULL) + panic("unable to alloc the inpcb thread call"); /* * Initialize data structures required to deliver @@ -282,23 +303,25 @@ in_pcbinit(void) RB_INIT(&inp_fc_tree); bzero(&key_inp, sizeof(key_inp)); lck_mtx_unlock(&inp_fc_lck); + + PE_parse_boot_argn("intcoproc_unrestricted", &intcoproc_unrestricted, + sizeof (intcoproc_unrestricted)); } #define INPCB_HAVE_TIMER_REQ(req) (((req).intimer_lazy > 0) || \ ((req).intimer_fast > 0) || ((req).intimer_nodelay > 0)) static void -inpcb_timeout(void *arg) +inpcb_timeout(void *arg0, void *arg1) { -#pragma unused(arg) +#pragma unused(arg0) struct inpcbinfo *ipi; boolean_t t, gc; struct intimercount gccnt, tmcnt; - struct timeval leeway; boolean_t toomany_gc = FALSE; - if (arg != NULL) { - VERIFY(arg == &inpcb_toomany_gcreq); - toomany_gc = *(boolean_t *)arg; + if (arg1 
!= NULL) { + VERIFY(arg1 == &inpcb_toomany_gcreq); + toomany_gc = *(boolean_t *)arg1; } /* @@ -368,58 +391,74 @@ inpcb_timeout(void *arg) VERIFY(inpcb_timeout_run >= 0 && inpcb_timeout_run < 2); } - bzero(&leeway, sizeof(leeway)); - leeway.tv_sec = inpcb_timeout_lazy; if (gccnt.intimer_nodelay > 0 || tmcnt.intimer_nodelay > 0) - inpcb_sched_timeout(NULL); + inpcb_sched_timeout(); else if ((gccnt.intimer_fast + tmcnt.intimer_fast) <= 5) /* be lazy when idle with little activity */ - inpcb_sched_timeout(&leeway); + inpcb_sched_lazy_timeout(); else - inpcb_sched_timeout(NULL); + inpcb_sched_timeout(); lck_mtx_unlock(&inpcb_timeout_lock); } static void -inpcb_sched_timeout(struct timeval *leeway) +inpcb_sched_timeout(void) { - lck_mtx_assert(&inpcb_timeout_lock, LCK_MTX_ASSERT_OWNED); + _inpcb_sched_timeout(0); +} + +static void +inpcb_sched_lazy_timeout(void) +{ + _inpcb_sched_timeout(inpcb_timeout_lazy); +} +static void +_inpcb_sched_timeout(unsigned int offset) +{ + uint64_t deadline, leeway; + + clock_interval_to_deadline(1, NSEC_PER_SEC, &deadline); + lck_mtx_assert(&inpcb_timeout_lock, LCK_MTX_ASSERT_OWNED); if (inpcb_timeout_run == 0 && - (inpcb_garbage_collecting || inpcb_ticking)) { + (inpcb_garbage_collecting || inpcb_ticking)) { lck_mtx_convert_spin(&inpcb_timeout_lock); inpcb_timeout_run++; - if (leeway == NULL) { + if (offset == 0) { inpcb_fast_timer_on = TRUE; - timeout(inpcb_timeout, NULL, hz); + thread_call_enter_delayed(inpcb_thread_call, + deadline); } else { inpcb_fast_timer_on = FALSE; - timeout_with_leeway(inpcb_timeout, NULL, hz, - tvtohz(leeway)); + clock_interval_to_absolutetime_interval(offset, + NSEC_PER_SEC, &leeway); + thread_call_enter_delayed_with_leeway( + inpcb_thread_call, NULL, deadline, leeway, + THREAD_CALL_DELAY_LEEWAY); } } else if (inpcb_timeout_run == 1 && - leeway == NULL && !inpcb_fast_timer_on) { + offset == 0 && !inpcb_fast_timer_on) { /* * Since the request was for a fast timer but the * scheduled timer is a lazy timer, try 
to schedule - * another instance of fast timer also + * another instance of fast timer also. */ lck_mtx_convert_spin(&inpcb_timeout_lock); inpcb_timeout_run++; inpcb_fast_timer_on = TRUE; - timeout(inpcb_timeout, NULL, hz); + thread_call_enter_delayed(inpcb_fast_thread_call, deadline); } } void inpcb_gc_sched(struct inpcbinfo *ipi, u_int32_t type) { - struct timeval leeway; u_int32_t gccnt; + uint64_t deadline; + lck_mtx_lock_spin(&inpcb_timeout_lock); inpcb_garbage_collecting = TRUE; - gccnt = ipi->ipi_gc_req.intimer_nodelay + ipi->ipi_gc_req.intimer_fast; @@ -432,24 +471,23 @@ inpcb_gc_sched(struct inpcbinfo *ipi, u_int32_t type) * the caller's request */ lck_mtx_convert_spin(&inpcb_timeout_lock); - timeout(inpcb_timeout, (void *)&inpcb_toomany_gcreq, - INPCB_TOOMANY_GCREQ_TIMER); + clock_interval_to_deadline(100, NSEC_PER_MSEC, &deadline); + thread_call_enter1_delayed(inpcb_thread_call, + &inpcb_toomany_gcreq, deadline); } switch (type) { case INPCB_TIMER_NODELAY: atomic_add_32(&ipi->ipi_gc_req.intimer_nodelay, 1); - inpcb_sched_timeout(NULL); + inpcb_sched_timeout(); break; case INPCB_TIMER_FAST: atomic_add_32(&ipi->ipi_gc_req.intimer_fast, 1); - inpcb_sched_timeout(NULL); + inpcb_sched_timeout(); break; default: atomic_add_32(&ipi->ipi_gc_req.intimer_lazy, 1); - leeway.tv_sec = inpcb_timeout_lazy; - leeway.tv_usec = 0; - inpcb_sched_timeout(&leeway); + inpcb_sched_lazy_timeout(); break; } lck_mtx_unlock(&inpcb_timeout_lock); @@ -458,23 +496,21 @@ inpcb_gc_sched(struct inpcbinfo *ipi, u_int32_t type) void inpcb_timer_sched(struct inpcbinfo *ipi, u_int32_t type) { - struct timeval leeway; + lck_mtx_lock_spin(&inpcb_timeout_lock); inpcb_ticking = TRUE; switch (type) { case INPCB_TIMER_NODELAY: atomic_add_32(&ipi->ipi_timer_req.intimer_nodelay, 1); - inpcb_sched_timeout(NULL); + inpcb_sched_timeout(); break; case INPCB_TIMER_FAST: atomic_add_32(&ipi->ipi_timer_req.intimer_fast, 1); - inpcb_sched_timeout(NULL); + inpcb_sched_timeout(); break; default: 
atomic_add_32(&ipi->ipi_timer_req.intimer_lazy, 1); - leeway.tv_sec = inpcb_timeout_lazy; - leeway.tv_usec = 0; - inpcb_sched_timeout(&leeway); + inpcb_sched_lazy_timeout(); break; } lck_mtx_unlock(&inpcb_timeout_lock); @@ -593,7 +629,7 @@ in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo, struct proc *p) panic("%s: insufficient space to align inp_Wstat", __func__); /* NOTREACHED */ } - + so->so_pcb = (caddr_t)inp; if (so->so_proto->pr_flags & PR_PCBLOCK) { @@ -608,6 +644,8 @@ in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo, struct proc *p) if (ip6_auto_flowlabel) inp->inp_flags |= IN6P_AUTOFLOWLABEL; #endif /* INET6 */ + if (intcoproc_unrestricted) + inp->inp_flags2 |= INP2_INTCOPROC_ALLOWED; (void) inp_update_policy(inp); @@ -678,7 +716,7 @@ in_pcb_conflict_post_msg(u_int16_t port) ev_msg.dv[0].data_ptr = &in_portinuse; ev_msg.dv[0].data_length = sizeof (struct kev_in_portinuse); ev_msg.dv[1].data_length = 0; - kev_post_msg(&ev_msg); + dlil_post_complete_msg(NULL, &ev_msg); } /* @@ -916,7 +954,7 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) *lastport = first; lport = htons(*lastport); } while (in_pcblookup_local_and_cleanup(pcbinfo, - ((laddr.s_addr != INADDR_ANY) ? laddr : + ((laddr.s_addr != INADDR_ANY) ? 
laddr : inp->inp_laddr), lport, wild)); } else { /* @@ -984,6 +1022,161 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) return (0); } +#define APN_FALLBACK_IP_FILTER(a) \ + (IN_LINKLOCAL(ntohl((a)->sin_addr.s_addr)) || \ + IN_LOOPBACK(ntohl((a)->sin_addr.s_addr)) || \ + IN_ZERONET(ntohl((a)->sin_addr.s_addr)) || \ + IN_MULTICAST(ntohl((a)->sin_addr.s_addr)) || \ + IN_PRIVATE(ntohl((a)->sin_addr.s_addr))) + +#define APN_FALLBACK_NOTIF_INTERVAL 2 /* Magic Number */ +static uint64_t last_apn_fallback = 0; + +static boolean_t +apn_fallback_required (proc_t proc, struct socket *so, struct sockaddr_in *p_dstv4) +{ + uint64_t timenow; + struct sockaddr_storage lookup_default_addr; + struct rtentry *rt = NULL; + + VERIFY(proc != NULL); + + if (apn_fallbk_enabled == FALSE) + return FALSE; + + if (proc == kernproc) + return FALSE; + + if (so && (so->so_options & SO_NOAPNFALLBK)) + return FALSE; + + timenow = net_uptime(); + if ((timenow - last_apn_fallback) < APN_FALLBACK_NOTIF_INTERVAL) { + apn_fallbk_log((LOG_INFO, "APN fallback notification throttled.\n")); + return FALSE; + } + + if (p_dstv4 && APN_FALLBACK_IP_FILTER(p_dstv4)) + return FALSE; + + /* Check if we have unscoped IPv6 default route through cellular */ + bzero(&lookup_default_addr, sizeof(lookup_default_addr)); + lookup_default_addr.ss_family = AF_INET6; + lookup_default_addr.ss_len = sizeof(struct sockaddr_in6); + + rt = rtalloc1((struct sockaddr *)&lookup_default_addr, 0, 0); + if (NULL == rt) { + apn_fallbk_log((LOG_INFO, "APN fallback notification could not find " + "unscoped default IPv6 route.\n")); + return FALSE; + } + + if (!IFNET_IS_CELLULAR(rt->rt_ifp)) { + rtfree(rt); + apn_fallbk_log((LOG_INFO, "APN fallback notification could not find " + "unscoped default IPv6 route through cellular interface.\n")); + return FALSE; + } + + /* + * We have a default IPv6 route, ensure that + * we do not have IPv4 default route before triggering + * the event + */ + rtfree(rt); + rt = NULL; + 
+ bzero(&lookup_default_addr, sizeof(lookup_default_addr)); + lookup_default_addr.ss_family = AF_INET; + lookup_default_addr.ss_len = sizeof(struct sockaddr_in); + + rt = rtalloc1((struct sockaddr *)&lookup_default_addr, 0, 0); + + if (rt) { + rtfree(rt); + rt = NULL; + apn_fallbk_log((LOG_INFO, "APN fallback notification found unscoped " + "IPv4 default route!\n")); + return FALSE; + } + + { + /* + * We disable APN fallback if the binary is not a third-party app. + * Note that platform daemons use their process name as a + * bundle ID so we filter out bundle IDs without dots. + */ + const char *bundle_id = cs_identity_get(proc); + if (bundle_id == NULL || + bundle_id[0] == '\0' || + strchr(bundle_id, '.') == NULL || + strncmp(bundle_id, "com.apple.", sizeof("com.apple.") - 1) == 0) { + apn_fallbk_log((LOG_INFO, "Abort: APN fallback notification found first-" + "party bundle ID \"%s\"!\n", (bundle_id ? bundle_id : "NULL"))); + return FALSE; + } + } + + { + /* + * The Apple App Store IPv6 requirement started on + * June 1st, 2016 at 12:00:00 AM PDT. + * We disable APN fallback if the binary is more recent than that. + * We check both atime and birthtime since birthtime is not always supported. + */ + static const long ipv6_start_date = 1464764400L; + vfs_context_t context; + struct stat64 sb; + int vn_stat_error; + + bzero(&sb, sizeof(struct stat64)); + context = vfs_context_create(NULL); + vn_stat_error = vn_stat(proc->p_textvp, &sb, NULL, 1, context); + (void)vfs_context_rele(context); + + if (vn_stat_error != 0 || + sb.st_atimespec.tv_sec >= ipv6_start_date || + sb.st_birthtimespec.tv_sec >= ipv6_start_date) { + apn_fallbk_log((LOG_INFO, "Abort: APN fallback notification found binary " + "too recent! 
(err %d atime %ld mtime %ld ctime %ld birthtime %ld)\n", + vn_stat_error, sb.st_atimespec.tv_sec, sb.st_mtimespec.tv_sec, + sb.st_ctimespec.tv_sec, sb.st_birthtimespec.tv_sec)); + return FALSE; + } + } + return TRUE; +} + +static void +apn_fallback_trigger(proc_t proc) +{ + pid_t pid = 0; + struct kev_msg ev_msg; + struct kev_netevent_apnfallbk_data apnfallbk_data; + + last_apn_fallback = net_uptime(); + pid = proc_pid(proc); + uuid_t application_uuid; + uuid_clear(application_uuid); + proc_getexecutableuuid(proc, application_uuid, + sizeof(application_uuid)); + + bzero(&ev_msg, sizeof (struct kev_msg)); + ev_msg.vendor_code = KEV_VENDOR_APPLE; + ev_msg.kev_class = KEV_NETWORK_CLASS; + ev_msg.kev_subclass = KEV_NETEVENT_SUBCLASS; + ev_msg.event_code = KEV_NETEVENT_APNFALLBACK; + + bzero(&apnfallbk_data, sizeof(apnfallbk_data)); + apnfallbk_data.epid = pid; + uuid_copy(apnfallbk_data.euuid, application_uuid); + + ev_msg.dv[0].data_ptr = &apnfallbk_data; + ev_msg.dv[0].data_length = sizeof(apnfallbk_data); + kev_post_msg(&ev_msg); + apn_fallbk_log((LOG_INFO, "APN fallback notification issued.\n")); +} + /* * Transform old in_pcbconnect() into an inner subroutine for new * in_pcbconnect(); do some validity-checking on the remote address @@ -1006,7 +1199,7 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) */ int in_pcbladdr(struct inpcb *inp, struct sockaddr *nam, struct in_addr *laddr, - unsigned int ifscope, struct ifnet **outif) + unsigned int ifscope, struct ifnet **outif, int raw) { struct route *ro = &inp->inp_route; struct in_ifaddr *ia = NULL; @@ -1020,7 +1213,7 @@ in_pcbladdr(struct inpcb *inp, struct sockaddr *nam, struct in_addr *laddr, return (EINVAL); if (SIN(nam)->sin_family != AF_INET) return (EAFNOSUPPORT); - if (SIN(nam)->sin_port == 0) + if (raw == 0 && SIN(nam)->sin_port == 0) return (EADDRNOTAVAIL); /* @@ -1030,8 +1223,8 @@ in_pcbladdr(struct inpcb *inp, struct sockaddr *nam, struct in_addr *laddr, * and the primary interface 
supports broadcast, * choose the broadcast address for that interface. */ - if (SIN(nam)->sin_addr.s_addr == INADDR_ANY || - SIN(nam)->sin_addr.s_addr == (u_int32_t)INADDR_BROADCAST) { + if (raw == 0 && (SIN(nam)->sin_addr.s_addr == INADDR_ANY || + SIN(nam)->sin_addr.s_addr == (u_int32_t)INADDR_BROADCAST)) { lck_rw_lock_shared(in_ifaddr_rwlock); if (!TAILQ_EMPTY(&in_ifaddrhead)) { ia = TAILQ_FIRST(&in_ifaddrhead); @@ -1103,11 +1296,18 @@ in_pcbladdr(struct inpcb *inp, struct sockaddr *nam, struct in_addr *laddr, * interface to take the source address from. */ if (ro->ro_rt == NULL) { + proc_t proc = current_proc(); + VERIFY(ia == NULL); ia = ifatoia(ifa_ifwithdstaddr(SA(&sin))); if (ia == NULL) ia = ifatoia(ifa_ifwithnet_scoped(SA(&sin), ifscope)); error = ((ia == NULL) ? ENETUNREACH : 0); + + if (apn_fallback_required(proc, inp->inp_socket, + (void *)nam)) + apn_fallback_trigger(proc); + goto done; } RT_LOCK_ASSERT_HELD(ro->ro_rt); @@ -1267,7 +1467,7 @@ in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct proc *p, /* * Call inner routine, to assign local interface address. */ - if ((error = in_pcbladdr(inp, nam, &laddr, ifscope, outif)) != 0) + if ((error = in_pcbladdr(inp, nam, &laddr, ifscope, outif, 0)) != 0) return (error); socket_unlock(so, 0); @@ -1377,18 +1577,18 @@ in_pcbdetach(struct inpcb *inp) inp, so, SOCK_PROTO(so)); /* NOTREACHED */ } - + #if IPSEC if (inp->inp_sp != NULL) { (void) ipsec4_delete_pcbpolicy(inp); } #endif /* IPSEC */ - + /* * Let NetworkStatistics know this PCB is going away * before we detach it. 
*/ - if (nstat_collect && + if (nstat_collect && (SOCK_PROTO(so) == IPPROTO_TCP || SOCK_PROTO(so) == IPPROTO_UDP)) nstat_pcb_detach(inp); @@ -1416,14 +1616,21 @@ in_pcbdetach(struct inpcb *inp) ROUTE_RELEASE(&inp->inp_route); imo = inp->inp_moptions; inp->inp_moptions = NULL; - if (imo != NULL) - IMO_REMREF(imo); sofreelastref(so, 0); inp->inp_state = INPCB_STATE_DEAD; /* makes sure we're not called twice from so_close */ so->so_flags |= SOF_PCBCLEARING; inpcb_gc_sched(inp->inp_pcbinfo, INPCB_TIMER_FAST); + + /* + * See inp_join_group() for why we need to unlock + */ + if (imo != NULL) { + socket_unlock(so, 0); + IMO_REMREF(imo); + socket_lock(so, 0); + } } } @@ -1488,6 +1695,9 @@ in_pcbdispose(struct inpcb *inp) #if CONFIG_MACF_NET mac_inpcb_label_destroy(inp); #endif /* CONFIG_MACF_NET */ +#if NECP + necp_inpcb_dispose(inp); +#endif /* NECP */ /* * In case there a route cached after a detach (possible * in the tcp case), make sure that it is freed before @@ -2141,12 +2351,12 @@ in_pcbinshash(struct inpcb *inp, int locked) if (!locked) lck_rw_done(pcbinfo->ipi_lock); - + #if NECP // This call catches the original setting of the local address inp_update_necp_policy(inp, NULL, NULL, 0); #endif /* NECP */ - + return (0); } @@ -2181,7 +2391,7 @@ in_pcbrehash(struct inpcb *inp) VERIFY(!(inp->inp_flags2 & INP2_INHASHLIST)); LIST_INSERT_HEAD(head, inp, inp_hash); inp->inp_flags2 |= INP2_INHASHLIST; - + #if NECP // This call catches updates to the remote addresses inp_update_necp_policy(inp, NULL, NULL, 0); @@ -2199,7 +2409,7 @@ in_pcbremlists(struct inpcb *inp) /* * Check if it's in hashlist -- an inp is placed in hashlist when - * it's local port gets assigned. So it should also be present + * it's local port gets assigned. So it should also be present * in the port list. 
*/ if (inp->inp_flags2 & INP2_INHASHLIST) { @@ -2289,7 +2499,6 @@ in_pcb_checkstate(struct inpcb *pcb, int mode, int locked) OSCompareAndSwap(origwant, newwant, wantcnt); } return (WNT_STOPUSING); - break; case WNT_ACQUIRE: /* @@ -2306,7 +2515,6 @@ in_pcb_checkstate(struct inpcb *pcb, int mode, int locked) newwant = origwant + 1; } while (!OSCompareAndSwap(origwant, newwant, wantcnt)); return (WNT_ACQUIRE); - break; case WNT_RELEASE: /* @@ -2343,7 +2551,6 @@ in_pcb_checkstate(struct inpcb *pcb, int mode, int locked) if (locked == 0) socket_unlock(pcb->inp_socket, 1); return (WNT_RELEASE); - break; default: panic("%s: so=%p not a valid state =%x\n", __func__, @@ -2463,7 +2670,7 @@ inp_route_copyin(struct inpcb *inp, struct route *src) } /* - * Handler for setting IP_FORCE_OUT_IFP/IP_BOUND_IF/IPV6_BOUND_IF socket option. + * Handler for setting IP_BOUND_IF/IPV6_BOUND_IF socket option. */ int inp_bindif(struct inpcb *inp, unsigned int ifscope, struct ifnet **pifp) @@ -2572,6 +2779,30 @@ inp_clear_awdl_unrestricted(struct inpcb *inp) ROUTE_RELEASE(&inp->inp_route); } +void +inp_set_intcoproc_allowed(struct inpcb *inp) +{ + inp->inp_flags2 |= INP2_INTCOPROC_ALLOWED; + + /* Blow away any cached route in the PCB */ + ROUTE_RELEASE(&inp->inp_route); +} + +boolean_t +inp_get_intcoproc_allowed(struct inpcb *inp) +{ + return (inp->inp_flags2 & INP2_INTCOPROC_ALLOWED) ? TRUE : FALSE; +} + +void +inp_clear_intcoproc_allowed(struct inpcb *inp) +{ + inp->inp_flags2 &= ~INP2_INTCOPROC_ALLOWED; + + /* Blow away any cached route in the PCB */ + ROUTE_RELEASE(&inp->inp_route); +} + #if NECP /* * Called when PROC_UUID_NECP_APP_POLICY is set. 
@@ -3054,13 +3285,19 @@ inp_update_policy(struct inpcb *inp) return (0); #endif /* !CONFIG_PROC_UUID_POLICY */ } + +static unsigned int log_restricted; +SYSCTL_DECL(_net_inet); +SYSCTL_INT(_net_inet, OID_AUTO, log_restricted, + CTLFLAG_RW | CTLFLAG_LOCKED, &log_restricted, 0, + "Log network restrictions"); /* * Called when we need to enforce policy restrictions in the input path. * * Returns TRUE if we're not allowed to receive data, otherwise FALSE. */ -boolean_t -inp_restricted_recv(struct inpcb *inp, struct ifnet *ifp) +static boolean_t +_inp_restricted_recv(struct inpcb *inp, struct ifnet *ifp) { VERIFY(inp != NULL); @@ -3081,7 +3318,7 @@ inp_restricted_recv(struct inpcb *inp, struct ifnet *ifp) if (IFNET_IS_AWDL_RESTRICTED(ifp) && !INP_AWDL_UNRESTRICTED(inp)) return (TRUE); - + if (!(ifp->if_eflags & IFEF_RESTRICTED_RECV)) return (FALSE); @@ -3091,16 +3328,32 @@ inp_restricted_recv(struct inpcb *inp, struct ifnet *ifp) if ((inp->inp_flags & INP_BOUND_IF) && inp->inp_boundifp == ifp) return (FALSE); + if (IFNET_IS_INTCOPROC(ifp) && !INP_INTCOPROC_ALLOWED(inp)) + return (TRUE); + return (TRUE); } +boolean_t +inp_restricted_recv(struct inpcb *inp, struct ifnet *ifp) +{ + boolean_t ret; + + ret = _inp_restricted_recv(inp, ifp); + if (ret == TRUE && log_restricted) { + printf("pid %d is unable to receive packets on %s\n", + current_proc()->p_pid, ifp->if_xname); + } + return (ret); +} + /* * Called when we need to enforce policy restrictions in the output path. * * Returns TRUE if we're not allowed to send data out, otherwise FALSE. 
*/ -boolean_t -inp_restricted_send(struct inpcb *inp, struct ifnet *ifp) +static boolean_t +_inp_restricted_send(struct inpcb *inp, struct ifnet *ifp) { VERIFY(inp != NULL); @@ -3122,5 +3375,111 @@ inp_restricted_send(struct inpcb *inp, struct ifnet *ifp) if (IFNET_IS_AWDL_RESTRICTED(ifp) && !INP_AWDL_UNRESTRICTED(inp)) return (TRUE); + if (IFNET_IS_INTCOPROC(ifp) && !INP_INTCOPROC_ALLOWED(inp)) + return (TRUE); + return (FALSE); } + +boolean_t +inp_restricted_send(struct inpcb *inp, struct ifnet *ifp) +{ + boolean_t ret; + + ret = _inp_restricted_send(inp, ifp); + if (ret == TRUE && log_restricted) { + printf("pid %d is unable to transmit packets on %s\n", + current_proc()->p_pid, ifp->if_xname); + } + return (ret); +} + +inline void +inp_count_sndbytes(struct inpcb *inp, u_int32_t th_ack) +{ + struct ifnet *ifp = inp->inp_last_outifp; + struct socket *so = inp->inp_socket; + if (ifp != NULL && !(so->so_flags & SOF_MP_SUBFLOW) && + (ifp->if_type == IFT_CELLULAR || + ifp->if_subfamily == IFNET_SUBFAMILY_WIFI)) { + int32_t unsent; + + so->so_snd.sb_flags |= SB_SNDBYTE_CNT; + + /* + * There can be data outstanding before the connection + * becomes established -- TFO case + */ + if (so->so_snd.sb_cc > 0) + inp_incr_sndbytes_total(so, so->so_snd.sb_cc); + + unsent = inp_get_sndbytes_allunsent(so, th_ack); + if (unsent > 0) + inp_incr_sndbytes_unsent(so, unsent); + } +} + +inline void +inp_incr_sndbytes_total(struct socket *so, int32_t len) +{ + struct inpcb *inp = (struct inpcb *)so->so_pcb; + struct ifnet *ifp = inp->inp_last_outifp; + + if (ifp != NULL) { + VERIFY(ifp->if_sndbyte_total >= 0); + OSAddAtomic64(len, &ifp->if_sndbyte_total); + } +} + +inline void +inp_decr_sndbytes_total(struct socket *so, int32_t len) +{ + struct inpcb *inp = (struct inpcb *)so->so_pcb; + struct ifnet *ifp = inp->inp_last_outifp; + + if (ifp != NULL) { + VERIFY(ifp->if_sndbyte_total >= len); + OSAddAtomic64(-len, &ifp->if_sndbyte_total); + } +} + +inline void 
+inp_incr_sndbytes_unsent(struct socket *so, int32_t len) +{ + struct inpcb *inp = (struct inpcb *)so->so_pcb; + struct ifnet *ifp = inp->inp_last_outifp; + + if (ifp != NULL) { + VERIFY(ifp->if_sndbyte_unsent >= 0); + OSAddAtomic64(len, &ifp->if_sndbyte_unsent); + } +} + +inline void +inp_decr_sndbytes_unsent(struct socket *so, int32_t len) +{ + struct inpcb *inp = (struct inpcb *)so->so_pcb; + struct ifnet *ifp = inp->inp_last_outifp; + + if (so == NULL || !(so->so_snd.sb_flags & SB_SNDBYTE_CNT)) + return; + + if (ifp != NULL) { + if (ifp->if_sndbyte_unsent >= len) + OSAddAtomic64(-len, &ifp->if_sndbyte_unsent); + else + ifp->if_sndbyte_unsent = 0; + } +} + +inline void +inp_decr_sndbytes_allunsent(struct socket *so, u_int32_t th_ack) +{ + int32_t len; + + if (so == NULL || !(so->so_snd.sb_flags & SB_SNDBYTE_CNT)) + return; + + len = inp_get_sndbytes_allunsent(so, th_ack); + inp_decr_sndbytes_unsent(so, len); +} diff --git a/bsd/netinet/in_pcb.h b/bsd/netinet/in_pcb.h index 861658176..05fd56a35 100644 --- a/bsd/netinet/in_pcb.h +++ b/bsd/netinet/in_pcb.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2014 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -594,6 +594,9 @@ struct inpcbinfo { lck_attr_t *ipi_lock_attr; lck_grp_t *ipi_lock_grp; lck_grp_attr_t *ipi_lock_grp_attr; + +#define INPCBINFO_UPDATE_MSS 0x1 + u_int32_t ipi_flags; }; #define INP_PCBHASH(faddr, lport, fport, mask) \ @@ -615,6 +618,8 @@ struct inpcbinfo { ((_inp)->inp_flags2 & INP2_NO_IFF_EXPENSIVE) #define INP_AWDL_UNRESTRICTED(_inp) \ ((_inp)->inp_flags2 & INP2_AWDL_UNRESTRICTED) +#define INP_INTCOPROC_ALLOWED(_inp) \ + ((_inp)->inp_flags2 & INP2_INTCOPROC_ALLOWED) #endif /* BSD_KERNEL_PRIVATE */ @@ -691,7 +696,8 @@ struct inpcbinfo { #define INP2_NO_IFF_EXPENSIVE 0x00000008 /* do not use expensive interface */ #define INP2_INHASHLIST 0x00000010 /* pcb is in inp_hash list */ #define INP2_AWDL_UNRESTRICTED 0x00000020 /* AWDL restricted mode allowed */ -#define INP2_KEEPALIVE_OFFLOAD 0x00000040 /* Enable UDP keepalive offload */ +#define INP2_KEEPALIVE_OFFLOAD 0x00000040 /* Enable UDP or TCP keepalive offload */ +#define INP2_INTCOPROC_ALLOWED 0x00000080 /* Allow communication via internal co-processor interfaces */ /* * Flags passed to in_pcblookup*() functions. 
@@ -748,7 +754,7 @@ extern void in_pcbdispose(struct inpcb *); extern void in_pcbdisconnect(struct inpcb *); extern int in_pcbinshash(struct inpcb *, int); extern int in_pcbladdr(struct inpcb *, struct sockaddr *, struct in_addr *, - unsigned int, struct ifnet **); + unsigned int, struct ifnet **, int); extern struct inpcb *in_pcblookup_local(struct inpcbinfo *, struct in_addr, u_int, int); extern struct inpcb *in_pcblookup_local_and_cleanup(struct inpcbinfo *, @@ -792,6 +798,9 @@ extern void inp_set_noexpensive(struct inpcb *); extern void inp_set_awdl_unrestricted(struct inpcb *); extern boolean_t inp_get_awdl_unrestricted(struct inpcb *); extern void inp_clear_awdl_unrestricted(struct inpcb *); +extern void inp_set_intcoproc_allowed(struct inpcb *); +extern boolean_t inp_get_intcoproc_allowed(struct inpcb *); +extern void inp_clear_intcoproc_allowed(struct inpcb *); #if NECP extern void inp_update_necp_policy(struct inpcb *, struct sockaddr *, struct sockaddr *, u_int); extern void inp_set_want_app_policy(struct inpcb *); @@ -808,6 +817,13 @@ extern void inp_get_soprocinfo(struct inpcb *, struct so_procinfo *); extern int inp_update_policy(struct inpcb *); extern boolean_t inp_restricted_recv(struct inpcb *, struct ifnet *); extern boolean_t inp_restricted_send(struct inpcb *, struct ifnet *); +extern void inp_incr_sndbytes_total(struct socket *, int); +extern void inp_decr_sndbytes_total(struct socket *, int); +extern void inp_count_sndbytes(struct inpcb *, u_int32_t); +extern void inp_incr_sndbytes_unsent(struct socket *, int32_t); +extern void inp_decr_sndbytes_unsent(struct socket *, int32_t); +extern int32_t inp_get_sndbytes_allunsent(struct socket *, u_int32_t); +extern void inp_decr_sndbytes_allunsent(struct socket *, u_int32_t); #endif /* BSD_KERNEL_PRIVATE */ #ifdef KERNEL_PRIVATE /* exported for PPP */ diff --git a/bsd/netinet/in_pcblist.c b/bsd/netinet/in_pcblist.c index e54d8b996..5e667775b 100644 --- a/bsd/netinet/in_pcblist.c +++ 
b/bsd/netinet/in_pcblist.c @@ -319,13 +319,21 @@ get_pcblist_n(short proto, struct sysctl_req *req, struct inpcbinfo *pcbinfo) goto done; } - for (inp = pcbinfo->ipi_listhead->lh_first, i = 0; inp && i < n; - inp = inp->inp_list.le_next) { - if (inp->inp_gencnt <= gencnt && - inp->inp_state != INPCB_STATE_DEAD) - inp_list[i++] = inp; + /* + * Special case TCP to include the connections in time wait + */ + if (proto == IPPROTO_TCP) { + n = get_tcp_inp_list(inp_list, n, gencnt); + } else { + for (inp = pcbinfo->ipi_listhead->lh_first, i = 0; inp && i < n; + inp = inp->inp_list.le_next) { + if (inp->inp_gencnt <= gencnt && + inp->inp_state != INPCB_STATE_DEAD) + inp_list[i++] = inp; + } + n = i; } - n = i; + error = 0; for (i = 0; i < n; i++) { @@ -548,12 +556,12 @@ inpcb_count_opportunistic(unsigned int ifindex, struct inpcbinfo *pcbinfo, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_RESUME)); } - SOTHROTTLELOG(("throttle[%d]: so 0x%llx " + SOTHROTTLELOG("throttle[%d]: so 0x%llx " "[%d,%d] %s\n", so->last_pid, (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so), SOCK_TYPE(so), (so->so_flags & SOF_SUSPENDED) ? - "SUSPENDED" : "RESUMED")); + "SUSPENDED" : "RESUMED"); socket_unlock(so, 1); } } diff --git a/bsd/netinet/in_proto.c b/bsd/netinet/in_proto.c index bb0fee864..4a9d19819 100644 --- a/bsd/netinet/in_proto.c +++ b/bsd/netinet/in_proto.c @@ -73,6 +73,7 @@ #include #include +#include #include #include @@ -302,6 +303,13 @@ in_dinit(struct domain *dp) inetdomain = dp; + /* + * Verify that the maximum possible tcp/ip header will still + * fit in a small mbuf because m_pullup only puls into 256 + * byte mbuf + */ + _CASSERT((sizeof(struct tcpiphdr) + TCP_MAXOLEN) <= _MHLEN); + /* * Attach first, then initialize; ip_init() needs raw IP handler. */ diff --git a/bsd/netinet/in_rmx.c b/bsd/netinet/in_rmx.c index 81ebd2641..fa4b580dc 100644 --- a/bsd/netinet/in_rmx.c +++ b/bsd/netinet/in_rmx.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2013 Apple Inc. All rights reserved. 
+ * Copyright (c) 2000-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * diff --git a/bsd/netinet/in_tclass.c b/bsd/netinet/in_tclass.c index 20a37fd9f..3fe179ce9 100644 --- a/bsd/netinet/in_tclass.c +++ b/bsd/netinet/in_tclass.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009-2014 Apple Inc. All rights reserved. + * Copyright (c) 2009-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -39,8 +39,10 @@ #include #include #include +#include #include +#include #include #include @@ -56,8 +58,204 @@ #include #include #include +#include -extern char *proc_name_address(void *p); +struct dcsp_msc_map { + u_int8_t dscp; + mbuf_svc_class_t msc; +}; +static inline int so_throttle_best_effort(struct socket *, struct ifnet *); +static void set_dscp_to_wifi_ac_map(const struct dcsp_msc_map *, int); +static errno_t dscp_msc_map_from_netsvctype_dscp_map(struct netsvctype_dscp_map *, size_t, + struct dcsp_msc_map *); + +static lck_grp_attr_t *tclass_lck_grp_attr = NULL; /* mutex group attributes */ +static lck_grp_t *tclass_lck_grp = NULL; /* mutex group definition */ +static lck_attr_t *tclass_lck_attr = NULL; /* mutex attributes */ +decl_lck_mtx_data(static, tclass_lock_data); +static lck_mtx_t *tclass_lock = &tclass_lock_data; + +SYSCTL_NODE(_net, OID_AUTO, qos, + CTLFLAG_RW|CTLFLAG_LOCKED, 0, "QoS"); + +static int sysctl_default_netsvctype_to_dscp_map SYSCTL_HANDLER_ARGS; +SYSCTL_PROC(_net_qos, OID_AUTO, default_netsvctype_to_dscp_map, + CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_LOCKED, + 0, 0, sysctl_default_netsvctype_to_dscp_map, "S", ""); + +static int sysctl_dscp_to_wifi_ac_map SYSCTL_HANDLER_ARGS; +SYSCTL_PROC(_net_qos, OID_AUTO, dscp_to_wifi_ac_map, + CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_LOCKED, + 0, 0, sysctl_dscp_to_wifi_ac_map, "S", ""); + +static int sysctl_reset_dscp_to_wifi_ac_map SYSCTL_HANDLER_ARGS; +SYSCTL_PROC(_net_qos, OID_AUTO, reset_dscp_to_wifi_ac_map, + CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED, + 
0, 0, sysctl_reset_dscp_to_wifi_ac_map, "I", ""); + +int net_qos_verbose = 0; +SYSCTL_INT(_net_qos, OID_AUTO, verbose, + CTLFLAG_RW | CTLFLAG_LOCKED, &net_qos_verbose, 0, ""); + +/* + * Fastlane QoS policy: + * By Default allow all apps to get traffic class to DSCP mapping + */ +SYSCTL_NODE(_net_qos, OID_AUTO, policy, + CTLFLAG_RW|CTLFLAG_LOCKED, 0, ""); + +int net_qos_policy_restricted = 0; +SYSCTL_INT(_net_qos_policy, OID_AUTO, restricted, + CTLFLAG_RW | CTLFLAG_LOCKED, &net_qos_policy_restricted, 0, ""); + +int net_qos_policy_restrict_avapps = 0; +SYSCTL_INT(_net_qos_policy, OID_AUTO, restrict_avapps, + CTLFLAG_RW | CTLFLAG_LOCKED, &net_qos_policy_restrict_avapps, 0, ""); + +int net_qos_policy_wifi_enabled = 0; +SYSCTL_INT(_net_qos_policy, OID_AUTO, wifi_enabled, + CTLFLAG_RW | CTLFLAG_LOCKED, &net_qos_policy_wifi_enabled, 0, ""); + +int net_qos_policy_capable_enabled = 0; +SYSCTL_INT(_net_qos_policy, OID_AUTO, capable_enabled, + CTLFLAG_RW | CTLFLAG_LOCKED, &net_qos_policy_capable_enabled, 0, ""); + +/* + * Socket traffic class from network service type + */ +const int sotc_by_netservicetype[_NET_SERVICE_TYPE_COUNT] = { + SO_TC_BE, /* NET_SERVICE_TYPE_BE */ + SO_TC_BK_SYS, /* NET_SERVICE_TYPE_BK */ + SO_TC_VI, /* NET_SERVICE_TYPE_SIG */ + SO_TC_VI, /* NET_SERVICE_TYPE_VI */ + SO_TC_VO, /* NET_SERVICE_TYPE_VO */ + SO_TC_RV, /* NET_SERVICE_TYPE_RV */ + SO_TC_AV, /* NET_SERVICE_TYPE_AV */ + SO_TC_OAM, /* NET_SERVICE_TYPE_OAM */ + SO_TC_RD /* NET_SERVICE_TYPE_RD */ +}; + +/* + * DSCP mappings for QoS Fastlane as based on network service types + */ +static const +struct netsvctype_dscp_map fastlane_netsvctype_dscp_map[_NET_SERVICE_TYPE_COUNT] = { + { NET_SERVICE_TYPE_BE, _DSCP_DF }, + { NET_SERVICE_TYPE_BK, _DSCP_AF11 }, + { NET_SERVICE_TYPE_SIG, _DSCP_CS3 }, + { NET_SERVICE_TYPE_VI, _DSCP_AF41 }, + { NET_SERVICE_TYPE_VO, _DSCP_EF }, + { NET_SERVICE_TYPE_RV, _DSCP_CS4 }, + { NET_SERVICE_TYPE_AV, _DSCP_AF31 }, + { NET_SERVICE_TYPE_OAM, _DSCP_CS2 }, + { 
NET_SERVICE_TYPE_RD, _DSCP_AF21 }, +}; + +static struct net_qos_dscp_map default_net_qos_dscp_map; + +/* + * The size is one more than the max because DSCP start at zero + */ +#define DSCP_ARRAY_SIZE (_MAX_DSCP + 1) + +/* + * The DSCP to UP mapping (via mbuf service class) for WiFi follows is the mapping + * that implemented at the 802.11 driver level when the mbuf service class is + * MBUF_SC_BE. + * + * This clashes with the recommended mapping documented by the IETF document + * draft-szigeti-tsvwg-ieee-802-11e-01.txt but we keep the mapping to maintain + * binary compatibility. Applications should use the network service type socket + * option instead to select L2 QoS marking instead of IP_TOS or IPV6_TCLASS. + */ +static const struct dcsp_msc_map default_dscp_to_wifi_ac_map[] = { + { _DSCP_DF, MBUF_SC_BE }, /* RFC 2474 Standard */ + { 1, MBUF_SC_BE }, /* */ + { 2, MBUF_SC_BE }, /* */ + { 3, MBUF_SC_BE }, /* */ + { 4, MBUF_SC_BE }, /* */ + { 5, MBUF_SC_BE }, /* */ + { 6, MBUF_SC_BE }, /* */ + { 7, MBUF_SC_BE }, /* */ + + { _DSCP_CS1, MBUF_SC_BK }, /* RFC 3662 Low-Priority Data */ + { 9, MBUF_SC_BK }, /* */ + { _DSCP_AF11, MBUF_SC_BK }, /* RFC 2597 High-Throughput Data */ + { 11, MBUF_SC_BK }, /* */ + { _DSCP_AF12, MBUF_SC_BK }, /* RFC 2597 High-Throughput Data */ + { 13, MBUF_SC_BK }, /* */ + { _DSCP_AF13, MBUF_SC_BK }, /* RFC 2597 High-Throughput Data */ + { 15, MBUF_SC_BK }, /* */ + + { _DSCP_CS2, MBUF_SC_BK }, /* RFC 4594 OAM */ + { 17, MBUF_SC_BK }, /* */ + { _DSCP_AF21, MBUF_SC_BK }, /* RFC 2597 Low-Latency Data */ + { 19, MBUF_SC_BK }, /* */ + { _DSCP_AF22, MBUF_SC_BK }, /* RFC 2597 Low-Latency Data */ + { 21, MBUF_SC_BK }, /* */ + { _DSCP_AF23, MBUF_SC_BK }, /* RFC 2597 Low-Latency Data */ + { 23, MBUF_SC_BK }, /* */ + + { _DSCP_CS3, MBUF_SC_BE }, /* RFC 2474 Broadcast Video */ + { 25, MBUF_SC_BE }, /* */ + { _DSCP_AF31, MBUF_SC_BE }, /* RFC 2597 Multimedia Streaming */ + { 27, MBUF_SC_BE }, /* */ + { _DSCP_AF32, MBUF_SC_BE }, /* RFC 2597 Multimedia 
Streaming */ + { 29, MBUF_SC_BE }, /* */ + { _DSCP_AF33, MBUF_SC_BE }, /* RFC 2597 Multimedia Streaming */ + { 31, MBUF_SC_BE }, /* */ + + { _DSCP_CS4, MBUF_SC_VI }, /* RFC 2474 Real-Time Interactive */ + { 33, MBUF_SC_VI }, /* */ + { _DSCP_AF41, MBUF_SC_VI }, /* RFC 2597 Multimedia Conferencing */ + { 35, MBUF_SC_VI }, /* */ + { _DSCP_AF42, MBUF_SC_VI }, /* RFC 2597 Multimedia Conferencing */ + { 37, MBUF_SC_VI }, /* */ + { _DSCP_AF43, MBUF_SC_VI }, /* RFC 2597 Multimedia Conferencing */ + { 39, MBUF_SC_VI }, /* */ + + { _DSCP_CS5, MBUF_SC_VI }, /* RFC 2474 Signaling */ + { 41, MBUF_SC_VI }, /* */ + { 42, MBUF_SC_VI }, /* */ + { 43, MBUF_SC_VI }, /* */ + { _DSCP_VA, MBUF_SC_VI }, /* RFC 5865 VOICE-ADMIT */ + { 45, MBUF_SC_VI }, /* */ + { _DSCP_EF, MBUF_SC_VI }, /* RFC 3246 Telephony */ + { 47, MBUF_SC_VI }, /* */ + + { _DSCP_CS6, MBUF_SC_VO }, /* Wi-Fi WMM Certification: Chariot */ + { 49, MBUF_SC_VO }, /* */ + { 50, MBUF_SC_VO }, /* */ + { 51, MBUF_SC_VO }, /* */ + { 52, MBUF_SC_VO }, /* Wi-Fi WMM Certification: Sigma */ + { 53, MBUF_SC_VO }, /* */ + { 54, MBUF_SC_VO }, /* */ + { 55, MBUF_SC_VO }, /* */ + + { _DSCP_CS7, MBUF_SC_VO }, /* Wi-Fi WMM Certification: Chariot */ + { 57, MBUF_SC_VO }, /* */ + { 58, MBUF_SC_VO }, /* */ + { 59, MBUF_SC_VO }, /* */ + { 60, MBUF_SC_VO }, /* */ + { 61, MBUF_SC_VO }, /* */ + { 62, MBUF_SC_VO }, /* */ + { 63, MBUF_SC_VO }, /* */ + + { 255, MBUF_SC_UNSPEC } /* invalid DSCP to mark last entry */ +}; + +mbuf_svc_class_t wifi_dscp_to_msc_array[DSCP_ARRAY_SIZE]; + +/* + * If there is no foreground activity on the interface for bg_switch_time + * seconds, the background connections can switch to foreground TCP + * congestion control. 
+ */ +#define TCP_BG_SWITCH_TIME 2 /* seconds */ + +#if (DEVELOPMENT || DEBUG) + +extern char *proc_best_name(proc_t p); static int tfp_count = 0; @@ -66,12 +264,12 @@ static TAILQ_HEAD(, tclass_for_proc) tfp_head = struct tclass_for_proc { TAILQ_ENTRY(tclass_for_proc) tfp_link; - int tfp_class; - pid_t tfp_pid; - char tfp_pname[MAXCOMLEN + 1]; + int tfp_class; + pid_t tfp_pid; + char tfp_pname[(2 * MAXCOMLEN) + 1]; + u_int32_t tfp_qos_mode; }; -static int dscp_code_from_mbuf_tclass(mbuf_traffic_class_t); static int get_pid_tclass(struct so_tcdbg *); static int get_pname_tclass(struct so_tcdbg *); static int set_pid_tclass(struct so_tcdbg *); @@ -79,21 +277,7 @@ static int set_pname_tclass(struct so_tcdbg *); static int flush_pid_tclass(struct so_tcdbg *); static int purge_tclass_for_proc(void); static int flush_tclass_for_proc(void); -int get_tclass_for_curr_proc(int *); -static inline int so_throttle_best_effort(struct socket* ,struct ifnet *); - -static lck_grp_attr_t *tclass_lck_grp_attr = NULL; /* mutex group attributes */ -static lck_grp_t *tclass_lck_grp = NULL; /* mutex group definition */ -static lck_attr_t *tclass_lck_attr = NULL; /* mutex attributes */ -decl_lck_mtx_data(static, tclass_lock_data); -static lck_mtx_t *tclass_lock = &tclass_lock_data; - -/* - * If there is no foreground activity on the interface for bg_switch_time - * seconds, the background connections can switch to foreground TCP - * congestion control. 
- */ -#define TCP_BG_SWITCH_TIME 2 /* seconds */ +static void set_tclass_for_curr_proc(struct socket *); /* * Must be called with tclass_lock held @@ -126,15 +310,13 @@ find_tfp_by_pname(const char *pname) return (tfp); } -__private_extern__ int -get_tclass_for_curr_proc(int *sotc) +__private_extern__ void +set_tclass_for_curr_proc(struct socket *so) { struct tclass_for_proc *tfp = NULL; proc_t p = current_proc(); /* Not ref counted */ pid_t pid = proc_pid(p); - char *pname = proc_name_address(p); - - *sotc = -1; + char *pname = proc_best_name(p); lck_mtx_lock(tclass_lock); @@ -142,14 +324,18 @@ get_tclass_for_curr_proc(int *sotc) if ((tfp->tfp_pid == pid) || (tfp->tfp_pid == -1 && strncmp(pname, tfp->tfp_pname, sizeof (tfp->tfp_pname)) == 0)) { - *sotc = tfp->tfp_class; + if (tfp->tfp_class != SO_TC_UNSPEC) + so->so_traffic_class = tfp->tfp_class; + + if (tfp->tfp_qos_mode == QOS_MODE_MARKING_POLICY_ENABLE) + so->so_flags1 |= SOF1_QOSMARKING_ALLOWED; + else if (tfp->tfp_qos_mode == QOS_MODE_MARKING_POLICY_DISABLE) + so->so_flags1 &= ~SOF1_QOSMARKING_ALLOWED; break; } } lck_mtx_unlock(tclass_lock); - - return ((tfp == NULL) ? 
0 : 1); } /* @@ -251,7 +437,7 @@ alloc_tclass_for_proc(pid_t pid, const char *pname) } /* - * -1 for tclass means to remove the entry + * SO_TC_UNSPEC for tclass means to remove the entry */ int set_pid_tclass(struct so_tcdbg *so_tcdbg) @@ -264,6 +450,7 @@ set_pid_tclass(struct so_tcdbg *so_tcdbg) int i; pid_t pid = so_tcdbg->so_tcdbg_pid; int tclass = so_tcdbg->so_tcdbg_tclass; + int netsvctype = so_tcdbg->so_tcdbg_netsvctype; p = proc_find(pid); if (p == NULL) { @@ -284,6 +471,7 @@ set_pid_tclass(struct so_tcdbg *so_tcdbg) } } tfp->tfp_class = tclass; + tfp->tfp_qos_mode = so_tcdbg->so_tcbbg_qos_mode; lck_mtx_unlock(tclass_lock); @@ -303,19 +491,21 @@ set_pid_tclass(struct so_tcdbg *so_tcdbg) so = (struct socket *)fp->f_fglob->fg_data; if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) continue; + socket_lock(so, 1); - if (tclass != -1) { - error = so_set_traffic_class(so, tclass); - if (error != 0) { - printf("%s: so_set_traffic_class" - "(so=0x%llx, fd=%d, tclass=%d) " - "failed %d\n", __func__, - (uint64_t)VM_KERNEL_ADDRPERM(so), - i, tclass, error); - error = 0; - } - } + if (tfp->tfp_qos_mode == QOS_MODE_MARKING_POLICY_ENABLE) + so->so_flags1 |= SOF1_QOSMARKING_ALLOWED; + else if (tfp->tfp_qos_mode == QOS_MODE_MARKING_POLICY_DISABLE) + so->so_flags1 &= ~SOF1_QOSMARKING_ALLOWED; socket_unlock(so, 1); + + if (netsvctype != _NET_SERVICE_TYPE_UNSPEC) + error = sock_setsockopt(so, SOL_SOCKET, + SO_NET_SERVICE_TYPE, &netsvctype, sizeof(int)); + if (tclass != SO_TC_UNSPEC) + error = sock_setsockopt(so, SOL_SOCKET, + SO_TRAFFIC_CLASS, &tclass, sizeof(int)); + } proc_fdunlock(p); @@ -347,6 +537,7 @@ set_pname_tclass(struct so_tcdbg *so_tcdbg) } } tfp->tfp_class = so_tcdbg->so_tcdbg_tclass; + tfp->tfp_qos_mode = so_tcdbg->so_tcbbg_qos_mode; lck_mtx_unlock(tclass_lock); @@ -413,8 +604,7 @@ get_pid_tclass(struct so_tcdbg *so_tcdbg) struct tclass_for_proc *tfp; pid_t pid = so_tcdbg->so_tcdbg_pid; - so_tcdbg->so_tcdbg_tclass = -1; /* Means not set */ - 
so_tcdbg->so_tcdbg_opportunistic = -1; /* Means not set */ + so_tcdbg->so_tcdbg_tclass = SO_TC_UNSPEC; /* Means not set */ p = proc_find(pid); if (p == NULL) { @@ -428,6 +618,7 @@ get_pid_tclass(struct so_tcdbg *so_tcdbg) tfp = find_tfp_by_pid(pid); if (tfp != NULL) { so_tcdbg->so_tcdbg_tclass = tfp->tfp_class; + so_tcdbg->so_tcbbg_qos_mode = tfp->tfp_qos_mode; error = 0; } lck_mtx_unlock(tclass_lock); @@ -444,8 +635,7 @@ get_pname_tclass(struct so_tcdbg *so_tcdbg) int error = EINVAL; struct tclass_for_proc *tfp; - so_tcdbg->so_tcdbg_tclass = -1; /* Means not set */ - so_tcdbg->so_tcdbg_opportunistic = -1; /* Means not set */ + so_tcdbg->so_tcdbg_tclass = SO_TC_UNSPEC; /* Means not set */ /* Need a tfp */ lck_mtx_lock(tclass_lock); @@ -453,6 +643,7 @@ get_pname_tclass(struct so_tcdbg *so_tcdbg) tfp = find_tfp_by_pname(so_tcdbg->so_tcdbg_pname); if (tfp != NULL) { so_tcdbg->so_tcdbg_tclass = tfp->tfp_class; + so_tcdbg->so_tcbbg_qos_mode = tfp->tfp_qos_mode; error = 0; } lck_mtx_unlock(tclass_lock); @@ -604,6 +795,7 @@ sogetopt_tcdbg(struct socket *so, struct sockopt *sopt) sizeof (ptr->so_tcdbg_pname)); } ptr->so_tcdbg_tclass = tfp->tfp_class; + ptr->so_tcbbg_qos_mode = tfp->tfp_qos_mode; ptr++; } @@ -630,6 +822,46 @@ sogetopt_tcdbg(struct socket *so, struct sockopt *sopt) return (error); } +#endif /* (DEVELOPMENT || DEBUG) */ + +int +so_get_netsvc_marking_level(struct socket *so) +{ + int marking_level = NETSVC_MRKNG_UNKNOWN; + struct ifnet *ifp = NULL; + + switch (SOCK_DOM(so)) { + case PF_INET: { + struct inpcb *inp = sotoinpcb(so); + + if (inp != NULL) + ifp = inp->inp_last_outifp; + break; + } + case PF_INET6: { + struct in6pcb *in6p = sotoin6pcb(so); + + if (in6p != NULL) + ifp = in6p->in6p_last_outifp; + break; + } + default: + break; + } + if (ifp != NULL) { + if ((ifp->if_eflags & + (IFEF_QOSMARKING_ENABLED | IFEF_QOSMARKING_CAPABLE)) == + (IFEF_QOSMARKING_ENABLED | IFEF_QOSMARKING_CAPABLE)) { + if ((so->so_flags1 & SOF1_QOSMARKING_ALLOWED)) + marking_level 
= NETSVC_MRKNG_LVL_L3L2_ALL; + else + marking_level = NETSVC_MRKNG_LVL_L3L2_BK; + } else { + marking_level = NETSVC_MRKNG_LVL_L2; + } + } + return (marking_level); +} __private_extern__ int so_set_traffic_class(struct socket *so, int optval) @@ -677,28 +909,49 @@ so_set_traffic_class(struct socket *so, int optval) if (oldval == SO_TC_BK_SYS) inp_reset_fc_state(so->so_pcb); - SOTHROTTLELOG(("throttle[%d]: so 0x%llx " + SOTHROTTLELOG("throttle[%d]: so 0x%llx " "[%d,%d] opportunistic %s\n", so->last_pid, (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so), SOCK_TYPE(so), - (optval == SO_TC_BK_SYS) ? "ON" : "OFF")); + (optval == SO_TC_BK_SYS) ? "ON" : "OFF"); } } } return (error); } +__private_extern__ int +so_set_net_service_type(struct socket *so, int netsvctype) +{ + int sotc; + int error; + + if (!IS_VALID_NET_SERVICE_TYPE(netsvctype)) + return (EINVAL); + + sotc = sotc_by_netservicetype[netsvctype]; + error = so_set_traffic_class(so, sotc); + if (error != 0) + return (error); + so->so_netsvctype = netsvctype; + so->so_flags1 |= SOF1_TC_NET_SERV_TYPE; + + return (0); +} + __private_extern__ void so_set_default_traffic_class(struct socket *so) { - int sotc = -1; - - if (tfp_count > 0 && - (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6)) { - get_tclass_for_curr_proc(&sotc); + so->so_traffic_class = SO_TC_BE; + + if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6)) { + if (net_qos_policy_restricted == 0) + so->so_flags1 |= SOF1_QOSMARKING_ALLOWED; +#if (DEVELOPMENT || DEBUG) + if (tfp_count > 0) + set_tclass_for_curr_proc(so); +#endif /* (DEVELOPMENT || DEBUG) */ } - - so->so_traffic_class = (sotc != -1) ? 
sotc : SO_TC_BE; } __private_extern__ int @@ -714,79 +967,79 @@ so_get_opportunistic(struct socket *so) return (so->so_traffic_class == SO_TC_BK_SYS); } -__private_extern__ mbuf_svc_class_t -mbuf_service_class_from_control(struct mbuf *control) +__private_extern__ int +so_tc_from_control(struct mbuf *control, int *out_netsvctype) { struct cmsghdr *cm; - mbuf_svc_class_t msc = MBUF_SC_UNSPEC; + int sotc = SO_TC_UNSPEC; + + *out_netsvctype = _NET_SERVICE_TYPE_UNSPEC; for (cm = M_FIRST_CMSGHDR(control); cm != NULL; cm = M_NXT_CMSGHDR(control, cm)) { - int tc; + int val; if (cm->cmsg_len < sizeof (struct cmsghdr)) break; - if (cm->cmsg_level != SOL_SOCKET || - cm->cmsg_type != SO_TRAFFIC_CLASS) - continue; - if (cm->cmsg_len != CMSG_LEN(sizeof (int))) - continue; - - tc = *(int *)(void *)CMSG_DATA(cm); - msc = so_tc2msc(tc); - if (MBUF_VALID_SC(msc)) - break; - } - - return (msc); -} - -__private_extern__ int -dscp_code_from_mbuf_tclass(mbuf_traffic_class_t mtc) -{ - int dscp_code; - - switch (mtc) { - default: - case MBUF_TC_BE: - dscp_code = 0; - break; - case MBUF_TC_BK: - dscp_code = 0x08; - break; - case MBUF_TC_VI: - dscp_code = 0x20; - break; - case MBUF_TC_VO: - dscp_code = 0x30; - break; + cm->cmsg_len != CMSG_LEN(sizeof(int))) + continue; + val = *(int *)(void *)CMSG_DATA(cm); + /* + * The first valid option wins + */ + switch (cm->cmsg_type) { + case SO_TRAFFIC_CLASS: + if (SO_VALID_TC(val)) { + sotc = val; + return (sotc); + /* NOT REACHED */ + } else if (val < SO_TC_NET_SERVICE_OFFSET) { + break; + } + /* + * Handle the case SO_NET_SERVICE_TYPE values are + * passed using SO_TRAFFIC_CLASS + */ + val = val - SO_TC_NET_SERVICE_OFFSET; + /* FALLTHROUGH */ + case SO_NET_SERVICE_TYPE: + if (!IS_VALID_NET_SERVICE_TYPE(val)) + break; + *out_netsvctype = val; + sotc = sotc_by_netservicetype[val]; + return (sotc); + /* NOT REACHED */ + default: + break; + } } - return (dscp_code); + return (sotc); } __private_extern__ void so_recv_data_stat(struct socket *so, 
struct mbuf *m, size_t off) { - uint32_t sotc = m_get_traffic_class(m); + uint32_t mtc = m_get_traffic_class(m); - if (sotc >= SO_TC_STATS_MAX) - sotc = SO_TC_BE; + if (mtc >= SO_TC_STATS_MAX) + mtc = MBUF_TC_BE; - so->so_tc_stats[sotc].rxpackets += 1; - so->so_tc_stats[sotc].rxbytes += + so->so_tc_stats[mtc].rxpackets += 1; + so->so_tc_stats[mtc].rxbytes += ((m->m_flags & M_PKTHDR) ? m->m_pkthdr.len : 0) + off; } __private_extern__ void -so_inc_recv_data_stat(struct socket *so, size_t pkts, size_t bytes, uint32_t tc) +so_inc_recv_data_stat(struct socket *so, size_t pkts, size_t bytes, + uint32_t mtc) { - if (tc >= SO_TC_STATS_MAX) - tc = SO_TC_BE; + if (mtc >= SO_TC_STATS_MAX) + mtc = MBUF_TC_BE; - so->so_tc_stats[tc].rxpackets += pkts; - so->so_tc_stats[tc].rxbytes +=bytes; + so->so_tc_stats[mtc].rxpackets += pkts; + so->so_tc_stats[mtc].rxbytes += bytes; } static inline int @@ -798,7 +1051,7 @@ so_throttle_best_effort(struct socket *so, struct ifnet *ifp) ifp->if_rt_sendts > 0 && (int)(uptime - ifp->if_rt_sendts) <= TCP_BG_SWITCH_TIME); } - + __private_extern__ void set_tcp_stream_priority(struct socket *so) { @@ -810,12 +1063,12 @@ set_tcp_stream_priority(struct socket *so) bool is_local = false, fg_active = false; u_int32_t uptime; - VERIFY((SOCK_CHECK_DOM(so, PF_INET) - || SOCK_CHECK_DOM(so, PF_INET6)) - && SOCK_CHECK_TYPE(so, SOCK_STREAM) - && SOCK_CHECK_PROTO(so, IPPROTO_TCP)); + VERIFY((SOCK_CHECK_DOM(so, PF_INET) || + SOCK_CHECK_DOM(so, PF_INET6)) && + SOCK_CHECK_TYPE(so, SOCK_STREAM) && + SOCK_CHECK_PROTO(so, IPPROTO_TCP)); - /* Return if the socket is in a terminal state */ + /* Return if the socket is in a terminal state */ if (inp->inp_state == INPCB_STATE_DEAD) return; @@ -836,13 +1089,13 @@ set_tcp_stream_priority(struct socket *so) if (outifp != NULL) { /* * If the traffic source is background, check if - * if it can be switched to foreground. This can + * if it can be switched to foreground. 
This can * happen when there is no indication of foreground * activity. */ - if (soissrcbackground(so) && + if (soissrcbackground(so) && ((outifp->if_fg_sendts > 0 && - (int)(uptime - outifp->if_fg_sendts) <= + (int)(uptime - outifp->if_fg_sendts) <= TCP_BG_SWITCH_TIME) || net_io_policy_throttled)) fg_active = true; @@ -853,7 +1106,7 @@ set_tcp_stream_priority(struct socket *so) * interface recently. If this is true, enable * algorithms that respond to increased latency * on best-effort traffic. - */ + */ if (so_throttle_best_effort(so, outifp)) fg_active = true; } @@ -870,10 +1123,10 @@ set_tcp_stream_priority(struct socket *so) * loopback, do not use background congestion * control algorithm. * - * If there has been recent foreground activity or if - * there was an indication that a foreground application + * If there has been recent foreground activity or if + * there was an indication that a foreground application * is going to use networking (net_io_policy_throttled), - * switch the backgroung streams to use background + * switch the backgroung streams to use background * congestion control algorithm. Otherwise, even background * flows can move into foreground. */ @@ -900,12 +1153,13 @@ set_tcp_stream_priority(struct socket *so) } if (old_cc != tp->tcp_cc_index || recvbg != IS_TCP_RECV_BG(so)) { - SOTHROTTLELOG(("throttle[%d]: so 0x%llx [%d,%d] TCP %s send; " - "%s recv\n", so->last_pid, (uint64_t)VM_KERNEL_ADDRPERM(so), - SOCK_DOM(so), SOCK_TYPE(so), - (tp->tcp_cc_index == TCP_CC_ALGO_BACKGROUND_INDEX) ? - "background" : "foreground", - IS_TCP_RECV_BG(so) ? "background" : "foreground")); + SOTHROTTLELOG("throttle[%d]: so 0x%llx [%d,%d] TCP %s send; " + "%s recv\n", so->last_pid, + (uint64_t)VM_KERNEL_ADDRPERM(so), + SOCK_DOM(so), SOCK_TYPE(so), + (tp->tcp_cc_index == TCP_CC_ALGO_BACKGROUND_INDEX) ? + "background" : "foreground", + IS_TCP_RECV_BG(so) ? 
"background" : "foreground"); } } @@ -916,15 +1170,10 @@ set_tcp_stream_priority(struct socket *so) */ __private_extern__ void set_packet_service_class(struct mbuf *m, struct socket *so, - mbuf_svc_class_t in_msc, u_int32_t flags) + int sotc, u_int32_t flags) { mbuf_svc_class_t msc = MBUF_SC_BE; /* Best effort by default */ struct inpcb *inp = sotoinpcb(so); /* in6pcb and inpcb are the same */ - struct ip *ip = mtod(m, struct ip *); -#if INET6 - struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); -#endif /* INET6 */ - int isipv6 = ((flags & PKT_SCF_IPV6) != 0) ? 1 : 0; if (!(m->m_flags & M_PKTHDR)) return; @@ -935,12 +1184,9 @@ set_packet_service_class(struct mbuf *m, struct socket *so, * 2) Traffic class passed via ancillary data to sendmsdg(2) * 3) Traffic class socket option last */ - if (in_msc != MBUF_SC_UNSPEC) { - if (in_msc >= MBUF_SC_BE && in_msc <= MBUF_SC_CTL) - msc = in_msc; - } else { - VERIFY(SO_VALID_TC(so->so_traffic_class)); - msc = so_tc2msc(so->so_traffic_class); + if (sotc != SO_TC_UNSPEC) { + VERIFY(SO_VALID_TC(sotc)); + msc = so_tc2msc(sotc); /* Assert because tc must have been valid */ VERIFY(MBUF_VALID_SC(msc)); } @@ -967,18 +1213,20 @@ set_packet_service_class(struct mbuf *m, struct socket *so, if (sotcdb & SOTCDB_NO_MTC) goto no_mbtc; - /* Elevate service class if the packet is a pure TCP ACK. + /* + * Elevate service class if the packet is a pure TCP ACK. * We can do this only when the flow is not a background - * flow and the outgoing interface supports + * flow and the outgoing interface supports * transmit-start model. */ - if (!IS_MBUF_SC_BACKGROUND(msc) && (flags & PKT_SCF_TCP_ACK)) + if (!IS_MBUF_SC_BACKGROUND(msc) && + (flags & (PKT_SCF_TCP_ACK | PKT_SCF_TCP_SYN)) != 0) msc = MBUF_SC_CTL; (void) m_set_service_class(m, msc); /* - * Set the privileged traffic auxiliary flag if applicable, + * Set the privileged traffic auxiliary flag if applicable, * or clear it. 
*/ if (!(sotcdb & SOTCDB_NO_PRIVILEGED) && soisprivilegedtraffic(so) && @@ -988,79 +1236,6 @@ set_packet_service_class(struct mbuf *m, struct socket *so, m->m_pkthdr.pkt_flags &= ~PKTF_PRIO_PRIVILEGED; no_mbtc: - /* - * Quick exit when best effort - */ - if (msc == MBUF_SC_BE) - goto no_dscp; - - /* - * The default behavior is for the networking stack to not set the - * DSCP code, based on SOTCDB_NO_DSCP being set. If the flag is - * cleared, set the DSCP code in IPv4 or IPv6 header only for local - * traffic, if it is not already set. - */ - if (sotcdb & SOTCDB_NO_DSCP) - goto no_dscp; - - /* - * Test if a IP TOS or IPV6 TCLASS has already been set - * on the socket or the raw packet. - */ - if (!(sotcdb & SOTCDB_NO_DSCPTST)) { -#if INET6 - if (isipv6) { - if ((so->so_type == SOCK_RAW && - (ip6->ip6_flow & htonl(0xff << 20)) != 0) || - (inp->in6p_outputopts && - inp->in6p_outputopts->ip6po_tclass != -1)) - goto no_dscp; - } else -#endif /* INET6 */ - if ((so->so_type == SOCK_RAW && - (inp->inp_flags & INP_HDRINCL)) || - inp->inp_ip_tos != 0) - goto no_dscp; - } - - /* - * Test if destination is local - */ - if (!(sotcdb & SOTCDB_NO_LCLTST)) { - int islocal = 0; - struct rtentry *rt = inp->inp_route.ro_rt; - - if (so->so_type == SOCK_STREAM) { - if (intotcpcb(inp)->t_flags & TF_LOCAL) - islocal = 1; - } else if (rt != NULL && - (rt->rt_gateway->sa_family == AF_LINK || - (rt->rt_ifp->if_flags & (IFF_LOOPBACK|IFF_POINTOPOINT)))) { - if (!(rt->rt_ifp->if_flags & IFF_POINTOPOINT)) - islocal = 1; - } else -#if INET6 - if (isipv6 && in6addr_local(&ip6->ip6_dst)) { - islocal = 1; - } else -#endif /* INET6 */ - if (inaddr_local(ip->ip_dst)) { - islocal = 1; - } - if (islocal == 0) - goto no_dscp; - } - -#if INET6 - if (isipv6) - ip6->ip6_flow |= htonl(dscp_code_from_mbuf_tclass( - m_get_traffic_class(m)) << 20); - else -#endif /* INET6 */ - ip->ip_tos |= dscp_code_from_mbuf_tclass( - m_get_traffic_class(m)) << 2; - -no_dscp: /* * For TCP with background traffic class 
switch CC algo based on sysctl */ @@ -1090,7 +1265,7 @@ so_tc_update_stats(struct mbuf *m, struct socket *so, mbuf_svc_class_t msc) __private_extern__ void socket_tclass_init(void) { - _CASSERT(_SO_TC_MAX == SO_TC_STATS_MAX); + _CASSERT(_SO_TC_MAX == SO_TC_STATS_MAX); tclass_lck_grp_attr = lck_grp_attr_alloc_init(); tclass_lck_grp = lck_grp_alloc_init("tclass", tclass_lck_grp_attr); @@ -1150,30 +1325,29 @@ __private_extern__ int so_svc2tc(mbuf_svc_class_t svc) { switch (svc) { - case MBUF_SC_UNSPEC: - return SO_TC_BE; case MBUF_SC_BK_SYS: - return SO_TC_BK_SYS; + return (SO_TC_BK_SYS); case MBUF_SC_BK: - return SO_TC_BK; + return (SO_TC_BK); case MBUF_SC_BE: - return SO_TC_BE; + return (SO_TC_BE); case MBUF_SC_RD: - return SO_TC_RD; + return (SO_TC_RD); case MBUF_SC_OAM: - return SO_TC_OAM; + return (SO_TC_OAM); case MBUF_SC_AV: - return SO_TC_AV; + return (SO_TC_AV); case MBUF_SC_RV: - return SO_TC_RV; + return (SO_TC_RV); case MBUF_SC_VI: - return SO_TC_VI; + return (SO_TC_VI); case MBUF_SC_VO: - return SO_TC_VO; + return (SO_TC_VO); case MBUF_SC_CTL: - return SO_TC_CTL; + return (SO_TC_CTL); + case MBUF_SC_UNSPEC: default: - return SO_TC_BE; + return (SO_TC_BE); } } @@ -1196,12 +1370,451 @@ so_set_lro(struct socket *so, int optval) if (tp && (tp->t_flagsext & TF_LRO_OFFLOADED)) { tcp_lro_remove_state(inp->inp_laddr, inp->inp_faddr, - inp->inp_lport, + inp->inp_lport, inp->inp_fport); - tp->t_flagsext &= ~TF_LRO_OFFLOADED; + tp->t_flagsext &= ~TF_LRO_OFFLOADED; } } } } } +static size_t +sotc_index(int sotc) +{ + switch (sotc) { + case SO_TC_BK_SYS: + return (SOTCIX_BK_SYS); + case _SO_TC_BK: + case SO_TC_BK: + return (SOTCIX_BK); + + case SO_TC_BE: + return (SOTCIX_BE); + case SO_TC_RD: + return (SOTCIX_RD); + case SO_TC_OAM: + return (SOTCIX_OAM); + + case SO_TC_AV: + return (SOTCIX_AV); + case SO_TC_RV: + return (SOTCIX_RV); + case _SO_TC_VI: + case SO_TC_VI: + return (SOTCIX_VI); + + case _SO_TC_VO: + case SO_TC_VO: + return (SOTCIX_VO); + case SO_TC_CTL: + 
return (SOTCIX_CTL); + + default: + break; + } + /* + * Unknown traffic class value + */ + return (SIZE_T_MAX); +} + +/* + * Pass NULL ifp for default map + */ +static errno_t +set_netsvctype_dscp_map(size_t in_count, + const struct netsvctype_dscp_map *netsvctype_dscp_map) +{ + size_t i; + struct net_qos_dscp_map *net_qos_dscp_map = NULL; + int netsvctype; + + /* + * Do not accept more that max number of distinct DSCPs + */ + if (in_count > _MAX_DSCP || netsvctype_dscp_map == NULL) + return (EINVAL); + + /* + * Validate input parameters + */ + for (i = 0; i < in_count; i++) { + if (!IS_VALID_NET_SERVICE_TYPE(netsvctype_dscp_map[i].netsvctype)) + return (EINVAL); + if (netsvctype_dscp_map[i].dscp > _MAX_DSCP) + return (EINVAL); + } + + net_qos_dscp_map = &default_net_qos_dscp_map; + + for (i = 0; i < in_count; i++) { + netsvctype = netsvctype_dscp_map[i].netsvctype; + + net_qos_dscp_map->netsvctype_to_dscp[netsvctype] = + netsvctype_dscp_map[i].dscp; + } + for (netsvctype = 0; netsvctype < _NET_SERVICE_TYPE_COUNT; netsvctype++) { + switch (netsvctype) { + case NET_SERVICE_TYPE_BE: + case NET_SERVICE_TYPE_BK: + case NET_SERVICE_TYPE_VI: + case NET_SERVICE_TYPE_VO: + case NET_SERVICE_TYPE_RV: + case NET_SERVICE_TYPE_AV: + case NET_SERVICE_TYPE_OAM: + case NET_SERVICE_TYPE_RD: { + int sotcix; + + sotcix = sotc_index(sotc_by_netservicetype[netsvctype]); + net_qos_dscp_map->sotc_to_dscp[sotcix] = + netsvctype_dscp_map[netsvctype].dscp; + break; + } + case NET_SERVICE_TYPE_SIG: + /* Signaling does not have its own traffic class */ + break; + default: + /* We should not be here */ + ASSERT(0); + } + } + /* Network control socket traffic class is always best effort */ + net_qos_dscp_map->sotc_to_dscp[SOTCIX_CTL] = _DSCP_DF; + + /* Backround socket traffic class DSCP same as backround system */ + net_qos_dscp_map->sotc_to_dscp[SOTCIX_BK] = + net_qos_dscp_map->sotc_to_dscp[SOTCIX_BK_SYS]; + + return (0); +} + +/* + * out_count is an input/ouput parameter + */ +static errno_t 
+get_netsvctype_dscp_map(size_t *out_count, + struct netsvctype_dscp_map *netsvctype_dscp_map) +{ + size_t i; + struct net_qos_dscp_map *net_qos_dscp_map = NULL; + + /* + * Do not accept more that max number of distinct DSCPs + */ + if (out_count == NULL || netsvctype_dscp_map == NULL) + return (EINVAL); + if (*out_count > _MAX_DSCP) + return (EINVAL); + + net_qos_dscp_map = &default_net_qos_dscp_map; + + for (i = 0; i < MIN(_NET_SERVICE_TYPE_COUNT, *out_count); i++) { + netsvctype_dscp_map[i].netsvctype = i; + netsvctype_dscp_map[i].dscp = net_qos_dscp_map->netsvctype_to_dscp[i]; + + } + *out_count = i; + + return (0); +} + +void +net_qos_map_init() +{ + errno_t error; + + /* + * By default use the Fastlane DSCP mappngs + */ + error = set_netsvctype_dscp_map(_NET_SERVICE_TYPE_COUNT, + fastlane_netsvctype_dscp_map); + ASSERT(error == 0); + + /* + * No DSCP mapping for network control + */ + default_net_qos_dscp_map.sotc_to_dscp[SOTCIX_CTL] = _DSCP_DF; + + set_dscp_to_wifi_ac_map(default_dscp_to_wifi_ac_map, 1); +} + +int +sysctl_default_netsvctype_to_dscp_map SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + int error = 0; + const size_t max_netsvctype_to_dscp_map_len = + _NET_SERVICE_TYPE_COUNT * sizeof(struct netsvctype_dscp_map); + size_t len; + struct netsvctype_dscp_map netsvctype_dscp_map[_NET_SERVICE_TYPE_COUNT]; + size_t count; + + if (req->oldptr == USER_ADDR_NULL) { + req->oldidx = + _NET_SERVICE_TYPE_COUNT * sizeof(struct netsvctype_dscp_map); + } else if (req->oldlen > 0) { + count = _NET_SERVICE_TYPE_COUNT; + error = get_netsvctype_dscp_map(&count, netsvctype_dscp_map); + if (error != 0) + goto done; + len = count * sizeof(struct netsvctype_dscp_map); + error = SYSCTL_OUT(req, netsvctype_dscp_map, + MIN(len, req->oldlen)); + if (error != 0) + goto done; + } + + if (req->newptr == USER_ADDR_NULL) + goto done; + + error = proc_suser(current_proc()); + if (error != 0) + goto done; + + /* + * Check input length + */ + if (req->newlen > 
max_netsvctype_to_dscp_map_len) { + error = EINVAL; + goto done; + } + /* + * Cap the number of entries to copy from input buffer + */ + error = SYSCTL_IN(req, netsvctype_dscp_map, req->newlen); + if (error != 0) + goto done; + + count = req->newlen / sizeof(struct netsvctype_dscp_map); + error = set_netsvctype_dscp_map(count, netsvctype_dscp_map); +done: + return (error); +} + +__private_extern__ errno_t +set_packet_qos(struct mbuf *m, struct ifnet *ifp, boolean_t qos_allowed, + int sotc, int netsvctype, u_int8_t *dscp_inout) +{ + if (ifp == NULL || dscp_inout == NULL) + return (EINVAL); + + if ((ifp->if_eflags & + (IFEF_QOSMARKING_ENABLED | IFEF_QOSMARKING_CAPABLE)) == + (IFEF_QOSMARKING_ENABLED | IFEF_QOSMARKING_CAPABLE)) { + u_int8_t dscp; + + /* + * When on a Fastlane network, IP_TOS/IPV6_TCLASS are no-ops + */ + dscp = _DSCP_DF; + + /* + * For DSCP use the network service type is specified, otherwise + * use the socket traffic class + * + * When not whitelisted by the policy, set DSCP only for best + * effort and background, and set the mbuf service class to + * best effort as well so the packet will be queued and + * scheduled at a lower priority. 
+ * We still want to prioritize control traffic on the interface + * so we do not change the mbuf service class for SO_TC_CTL + */ + if (netsvctype != _NET_SERVICE_TYPE_UNSPEC && + netsvctype != NET_SERVICE_TYPE_BE) { + dscp = default_net_qos_dscp_map.netsvctype_to_dscp[netsvctype]; + + if (qos_allowed == FALSE && + netsvctype != NET_SERVICE_TYPE_BE && + netsvctype != NET_SERVICE_TYPE_BK) { + dscp = _DSCP_DF; + if (sotc != SO_TC_CTL) + m_set_service_class(m, MBUF_SC_BE); + } + } else { + size_t sotcix = sotc_index(sotc); + + dscp = default_net_qos_dscp_map.sotc_to_dscp[sotcix]; + + if (qos_allowed == FALSE && sotc != SO_TC_BE && + sotc != SO_TC_BK && sotc != SO_TC_BK_SYS && + sotc != SO_TC_CTL) { + dscp = _DSCP_DF; + if (sotc != SO_TC_CTL) + m_set_service_class(m, MBUF_SC_BE); + } + } + if (net_qos_verbose != 0) + printf("%s qos_allowed %d sotc %u netsvctype %u dscp %u\n", + __func__, qos_allowed, sotc, netsvctype, dscp); + + if (*dscp_inout != dscp) { + *dscp_inout = dscp; + } + } else if (*dscp_inout != _DSCP_DF && IFNET_IS_WIFI_INFRA(ifp)) { + mbuf_svc_class_t msc = m_get_service_class(m); + + /* + * For WiFi infra, when the mbuf service class is best effort + * and the DSCP is not default, set the service class based + * on DSCP + */ + if (msc == MBUF_SC_BE) { + msc = wifi_dscp_to_msc_array[*dscp_inout]; + + if (msc != MBUF_SC_BE) { + m_set_service_class(m, msc); + + if (net_qos_verbose != 0) + printf("%s set msc %u for dscp %u\n", + __func__, msc, *dscp_inout); + } + } + } + + return (0); +} + +static void +set_dscp_to_wifi_ac_map(const struct dcsp_msc_map *map, int clear) +{ + int i; + + if (clear) + bzero(wifi_dscp_to_msc_array, sizeof(wifi_dscp_to_msc_array)); + + for (i = 0; i < DSCP_ARRAY_SIZE; i++) { + const struct dcsp_msc_map *elem = map + i; + + if (elem->dscp > _MAX_DSCP || elem->msc == MBUF_SC_UNSPEC) + break; + switch (elem->msc) { + case MBUF_SC_BK_SYS: + case MBUF_SC_BK: + wifi_dscp_to_msc_array[elem->dscp] = MBUF_SC_BK; + break; + default: + 
case MBUF_SC_BE: + case MBUF_SC_RD: + case MBUF_SC_OAM: + wifi_dscp_to_msc_array[elem->dscp] = MBUF_SC_BE; + break; + case MBUF_SC_AV: + case MBUF_SC_RV: + case MBUF_SC_VI: + wifi_dscp_to_msc_array[elem->dscp] = MBUF_SC_VI; + break; + case MBUF_SC_VO: + case MBUF_SC_CTL: + wifi_dscp_to_msc_array[elem->dscp] = MBUF_SC_VO; + break; + } + } +} + +static errno_t +dscp_msc_map_from_netsvctype_dscp_map(struct netsvctype_dscp_map *netsvctype_dscp_map, + size_t count, struct dcsp_msc_map *dcsp_msc_map) +{ + errno_t error = 0; + u_int32_t i; + + /* + * Validate input parameters + */ + for (i = 0; i < count; i++) { + if (!SO_VALID_TC(netsvctype_dscp_map[i].netsvctype)) { + error = EINVAL; + goto done; + } + if (netsvctype_dscp_map[i].dscp > _MAX_DSCP) { + error = EINVAL; + goto done; + } + } + + bzero(dcsp_msc_map, DSCP_ARRAY_SIZE * sizeof(struct dcsp_msc_map)); + + for (i = 0; i < count; i++) { + dcsp_msc_map[i].dscp = netsvctype_dscp_map[i].dscp; + dcsp_msc_map[i].msc = so_tc2msc(netsvctype_dscp_map[i].netsvctype); + } +done: + return (error); +} + +int +sysctl_dscp_to_wifi_ac_map SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + int error = 0; + size_t len = DSCP_ARRAY_SIZE * sizeof(struct netsvctype_dscp_map); + struct netsvctype_dscp_map netsvctype_dscp_map[DSCP_ARRAY_SIZE]; + struct dcsp_msc_map dcsp_msc_map[DSCP_ARRAY_SIZE]; + size_t count; + u_int32_t i; + + if (req->oldptr == USER_ADDR_NULL) { + req->oldidx = len; + } else if (req->oldlen > 0) { + for (i = 0; i < DSCP_ARRAY_SIZE; i++) { + netsvctype_dscp_map[i].dscp = i; + netsvctype_dscp_map[i].netsvctype = + so_svc2tc(wifi_dscp_to_msc_array[i]); + } + error = SYSCTL_OUT(req, netsvctype_dscp_map, + MIN(len, req->oldlen)); + if (error != 0) + goto done; + } + + if (req->newptr == USER_ADDR_NULL) + goto done; + + error = proc_suser(current_proc()); + if (error != 0) + goto done; + + /* + * Check input length + */ + if (req->newlen > len) { + error = EINVAL; + goto done; + } + /* + * Cap the number of entries 
to copy from input buffer + */ + if (len > req->newlen) + len = req->newlen; + error = SYSCTL_IN(req, netsvctype_dscp_map, len); + if (error != 0) { + goto done; + } + count = len / sizeof(struct netsvctype_dscp_map); + bzero(dcsp_msc_map, sizeof(dcsp_msc_map)); + error = dscp_msc_map_from_netsvctype_dscp_map(netsvctype_dscp_map, count, + dcsp_msc_map); + if (error != 0) { + goto done; + } + set_dscp_to_wifi_ac_map(dcsp_msc_map, 0); +done: + return (error); +} + +int +sysctl_reset_dscp_to_wifi_ac_map SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + int error = 0; + int val = 0; + + error = sysctl_handle_int(oidp, &val, 0, req); + if (error || !req->newptr) + return (error); + + set_dscp_to_wifi_ac_map(default_dscp_to_wifi_ac_map, 1); + + return (0); +} diff --git a/bsd/netinet/in_tclass.h b/bsd/netinet/in_tclass.h new file mode 100644 index 000000000..430de9f27 --- /dev/null +++ b/bsd/netinet/in_tclass.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2015-2016 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef __NETINET_IN_TCLASS_H__ +#define __NETINET_IN_TCLASS_H__ + +#ifdef PRIVATE + +#include +#include +#include +#include +#include +#include + +#define SO_TCDBG_PID 0x01 /* Set/get traffic class policy for PID */ +#define SO_TCDBG_PNAME 0x02 /* Set/get traffic class policy for processes of that name */ +#define SO_TCDBG_PURGE 0x04 /* Purge entries for unused PIDs */ +#define SO_TCDBG_FLUSH 0x08 /* Flush all entries */ +#define SO_TCDBG_COUNT 0x10 /* Get count of entries */ +#define SO_TCDBG_LIST 0x20 /* List entries */ +#define SO_TCDBG_DELETE 0x40 /* Delete a process entry */ +#define SO_TCDBG_TCFLUSH_PID 0x80 /* Flush traffic class for PID */ + +struct so_tcdbg { + u_int32_t so_tcdbg_cmd; + int32_t so_tcdbg_tclass; + int32_t so_tcdbg_netsvctype; + u_int32_t so_tcdbg_count; + pid_t so_tcdbg_pid; + u_int32_t so_tcbbg_qos_mode; + char so_tcdbg_pname[(2 * MAXCOMLEN) + 1]; +}; +#define QOS_MODE_MARKING_POLICY_DEFAULT 0 +#define QOS_MODE_MARKING_POLICY_ENABLE 1 +#define QOS_MODE_MARKING_POLICY_DISABLE 2 + +#define NET_QOS_MARKING_POLICY_DEFAULT QOS_MODE_MARKING_POLICY_DEFAULT /* obsolete, to be removed */ +#define NET_QOS_MARKING_POLICY_ENABLE QOS_MODE_MARKING_POLICY_ENABLE /* obsolete, to be removed */ +#define NET_QOS_MARKING_POLICY_DISABLE QOS_MODE_MARKING_POLICY_DISABLE /* obsolete, to be removed */ + +#ifdef BSD_KERNEL_PRIVATE + +extern int net_qos_policy_restricted; +extern int net_qos_policy_wifi_enabled; +extern int 
net_qos_policy_capable_enabled; + +extern void net_qos_map_init(void); +extern errno_t set_packet_qos(struct mbuf *, struct ifnet *, boolean_t, int, + int, u_int8_t *); +extern int so_get_netsvc_marking_level(struct socket *); + +#endif /* BSD_KERNEL_PRIVATE */ + +#endif /* PRIVATE */ + +#endif /* __NETINET_IN_TCLASS_H__ */ diff --git a/bsd/netinet/in_var.h b/bsd/netinet/in_var.h index 5b047a561..ed563523b 100644 --- a/bsd/netinet/in_var.h +++ b/bsd/netinet/in_var.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2015 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -66,6 +66,7 @@ #include #include +#include #ifdef BSD_KERNEL_PRIVATE #include @@ -143,25 +144,6 @@ struct kev_in_portinuse { }; #endif /* __APPLE_API_PRIVATE */ -/* - * Define inet event subclass and specific inet events. - */ -#define KEV_INET_SUBCLASS 1 /* inet subclass identifier */ - -#define KEV_INET_NEW_ADDR 1 /* Userland configured IP address */ -#define KEV_INET_CHANGED_ADDR 2 /* Address changed event */ -#define KEV_INET_ADDR_DELETED 3 /* IPv6 address was deleted */ -#define KEV_INET_SIFDSTADDR 4 /* Dest. address was set */ -#define KEV_INET_SIFBRDADDR 5 /* Broadcast address was set */ -#define KEV_INET_SIFNETMASK 6 /* Netmask was set */ -#define KEV_INET_ARPCOLLISION 7 /* ARP collision detected */ -#ifdef __APPLE_API_PRIVATE -#define KEV_INET_PORTINUSE 8 /* use ken_in_portinuse */ -#endif -#define KEV_INET_ARPRTRFAILURE 9 /* ARP resolution failed for router */ -#define KEV_INET_ARPRTRALIVE 10 /* ARP resolution succeeded for - router */ - #ifdef BSD_KERNEL_PRIVATE #include #include diff --git a/bsd/netinet/ip.h b/bsd/netinet/ip.h index 0f68a7080..0574f5c23 100644 --- a/bsd/netinet/ip.h +++ b/bsd/netinet/ip.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2012 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -127,6 +127,8 @@ struct ip { #define IPTOS_ECT 0x02 #endif +#define IPTOS_DSCP_SHIFT 2 + /* * ECN (Explicit Congestion Notification) codepoints in RFC3168 * mapped to the lower 2 bits of the TOS field. diff --git a/bsd/netinet/ip6.h b/bsd/netinet/ip6.h index 3982daf0d..a4c4ea92e 100644 --- a/bsd/netinet/ip6.h +++ b/bsd/netinet/ip6.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2013 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -139,6 +139,12 @@ struct ip6_hdr { #define IP6TOS_ECT 0x02 /* ECN-capable transport */ #endif +/* + * To access the 6 bits of the DSCP value in the 32 bits ip6_flow field + */ +#define IP6FLOW_DSCP_MASK 0x0fc00000 +#define IP6FLOW_DSCP_SHIFT 22 + /* * Extension Headers */ diff --git a/bsd/netinet/ip_divert.c b/bsd/netinet/ip_divert.c index 614c0c7c1..083b1484b 100644 --- a/bsd/netinet/ip_divert.c +++ b/bsd/netinet/ip_divert.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2013 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -327,14 +327,19 @@ div_output(struct socket *so, struct mbuf *m, struct sockaddr_in *sin, struct inpcb *const inp = sotoinpcb(so); struct ip *const ip = mtod(m, struct ip *); int error = 0; - mbuf_svc_class_t msc = MBUF_SC_UNSPEC; + int sotc = SO_TC_UNSPEC; if (control != NULL) { - msc = mbuf_service_class_from_control(control); + int ignored; + + (void) so_tc_from_control(contro, &sotc, &ignored); m_freem(control); /* XXX */ control = NULL; } + if (sotc == SO_TC_UNSPEC) + sotc = so->so_traffic_class; + /* Loopback avoidance and state recovery */ if (sin) { struct m_tag *mtag; @@ -370,7 +375,8 @@ div_output(struct socket *so, struct mbuf *m, struct sockaddr_in *sin, /* Reinject packet into the system as incoming or outgoing */ if (!sin || sin->sin_addr.s_addr == 0) { struct ip_out_args ipoa = - { IFSCOPE_NONE, { 0 }, IPOAF_SELECT_SRCIF, 0 }; + { IFSCOPE_NONE, { 0 }, IPOAF_SELECT_SRCIF, 0, SO_TC_UNSPEC, + _NET_SERVICE_TYPE_UNSPEC }; struct route ro; struct ip_moptions *imo; @@ -394,7 +400,11 @@ div_output(struct socket *so, struct mbuf *m, struct sockaddr_in *sin, /* Copy the cached route and take an extra reference */ inp_route_copyout(inp, &ro); - set_packet_service_class(m, so, msc, 0); + if (sotc != SO_TC_UNSPEC) { + ipoa.ipoa_flags |= IPOAF_QOSMARKING_ALLOWED; + ipoa.ipoa_sotc = sotc; + } + set_packet_service_class(m, so, sotc, 0); imo = inp->inp_moptions; if (imo != NULL) diff --git a/bsd/netinet/ip_dummynet.c b/bsd/netinet/ip_dummynet.c index c24935fd9..b40ca4612 100644 --- a/bsd/netinet/ip_dummynet.c +++ b/bsd/netinet/ip_dummynet.c @@ -2,7 +2,7 @@ * Copyright (c) 2000-2013 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). 
You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* @@ -116,7 +116,7 @@ static dn_key curr_time = 0 ; /* current simulation time */ /* this is for the timer that fires to call dummynet() - we only enable the timer when there are packets to process, otherwise it's disabled */ -static int timer_enabled = 0; +static int timer_enabled = 0; static int dn_hash_size = 64 ; /* default hash size */ @@ -155,7 +155,7 @@ static void ready_event(struct dn_flow_queue *q, struct mbuf **head, static void ready_event_wfq(struct dn_pipe *p, struct mbuf **head, struct mbuf **tail); -/* +/* * Packets are retrieved from queues in Dummynet in chains instead of * packet-by-packet. The entire list of packets is first dequeued and * sent out by the following function. 
@@ -186,7 +186,7 @@ SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, search_steps, SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire, CTLFLAG_RW | CTLFLAG_LOCKED, &pipe_expire, 0, "Expire queue if empty"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, max_chain_len, - CTLFLAG_RW | CTLFLAG_LOCKED, &dn_max_ratio, 0, + CTLFLAG_RW | CTLFLAG_LOCKED, &dn_max_ratio, 0, "Max ratio between dynamic queues and buckets"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth, CTLFLAG_RD | CTLFLAG_LOCKED, &red_lookup_depth, 0, "Depth of RED lookup table"); @@ -268,7 +268,7 @@ int cp_pipe_from_user_32( struct sockopt *sopt, struct dn_pipe *p ) { struct dn_pipe_32 user_pipe_32; int error=0; - + error = sooptcopyin(sopt, &user_pipe_32, sizeof(struct dn_pipe_32), sizeof(struct dn_pipe_32)); if ( !error ){ p->pipe_nr = user_pipe_32.pipe_nr; @@ -280,7 +280,7 @@ int cp_pipe_from_user_32( struct sockopt *sopt, struct dn_pipe *p ) p->sched_time = user_pipe_32.sched_time; bcopy( user_pipe_32.if_name, p->if_name, IFNAMSIZ); p->ready = user_pipe_32.ready; - + p->fs.fs_nr = user_pipe_32.fs.fs_nr; p->fs.flags_fs = user_pipe_32.fs.flags_fs; p->fs.parent_nr = user_pipe_32.fs.parent_nr; @@ -314,7 +314,7 @@ int cp_pipe_from_user_64( struct sockopt *sopt, struct dn_pipe *p ) { struct dn_pipe_64 user_pipe_64; int error=0; - + error = sooptcopyin(sopt, &user_pipe_64, sizeof(struct dn_pipe_64), sizeof(struct dn_pipe_64)); if ( !error ){ p->pipe_nr = user_pipe_64.pipe_nr; @@ -326,7 +326,7 @@ int cp_pipe_from_user_64( struct sockopt *sopt, struct dn_pipe *p ) p->sched_time = user_pipe_64.sched_time; bcopy( user_pipe_64.if_name, p->if_name, IFNAMSIZ); p->ready = user_pipe_64.ready; - + p->fs.fs_nr = user_pipe_64.fs.fs_nr; p->fs.flags_fs = user_pipe_64.fs.flags_fs; p->fs.parent_nr = user_pipe_64.fs.parent_nr; @@ -461,7 +461,7 @@ static char *cp_pipe_to_32_user(struct dn_pipe *p, struct dn_pipe_32 *pipe_bp) { char *bp; - + pipe_bp->pipe_nr = p->pipe_nr; pipe_bp->bandwidth = p->bandwidth; pipe_bp->delay 
= p->delay; @@ -478,10 +478,10 @@ char *cp_pipe_to_32_user(struct dn_pipe *p, struct dn_pipe_32 *pipe_bp) bcopy( p->if_name, pipe_bp->if_name, IFNAMSIZ); pipe_bp->ifp = CAST_DOWN_EXPLICIT(user32_addr_t, p->ifp); pipe_bp->ready = p->ready; - + cp_flow_set_to_32_user( &(p->fs), &(pipe_bp->fs)); - - pipe_bp->delay = (pipe_bp->delay * 1000) / (hz*10) ; + + pipe_bp->delay = (pipe_bp->delay * 1000) / (hz*10) ; /* * XXX the following is a hack based on ->next being the * first field in dn_pipe and dn_flow_set. The correct @@ -502,7 +502,7 @@ static char *cp_pipe_to_64_user(struct dn_pipe *p, struct dn_pipe_64 *pipe_bp) { char *bp; - + pipe_bp->pipe_nr = p->pipe_nr; pipe_bp->bandwidth = p->bandwidth; pipe_bp->delay = p->delay; @@ -519,10 +519,10 @@ char *cp_pipe_to_64_user(struct dn_pipe *p, struct dn_pipe_64 *pipe_bp) bcopy( p->if_name, pipe_bp->if_name, IFNAMSIZ); pipe_bp->ifp = CAST_DOWN(user64_addr_t, p->ifp); pipe_bp->ready = p->ready; - + cp_flow_set_to_64_user( &(p->fs), &(pipe_bp->fs)); - - pipe_bp->delay = (pipe_bp->delay * 1000) / (hz*10) ; + + pipe_bp->delay = (pipe_bp->delay * 1000) / (hz*10) ; /* * XXX the following is a hack based on ->next being the * first field in dn_pipe and dn_flow_set. The correct @@ -745,7 +745,7 @@ transmit_event(struct dn_pipe *pipe, struct mbuf **head, struct mbuf **tail) *head = m; *tail = m; } - + if (*tail != NULL) (*tail)->m_nextpkt = NULL; } @@ -768,11 +768,11 @@ transmit_event(struct dn_pipe *pipe, struct mbuf **head, struct mbuf **tail) * before being able to transmit a packet. The credit is taken from * either a pipe (WF2Q) or a flow_queue (per-flow queueing) */ - -/* hz is 100, which gives a granularity of 10ms in the old timer. + +/* hz is 100, which gives a granularity of 10ms in the old timer. * The timer has been changed to fire every 1ms, so the use of * hz has been modified here. 
All instances of hz have been left - * in place but adjusted by a factor of 10 so that hz is functionally + * in place but adjusted by a factor of 10 so that hz is functionally * equal to 1000. */ #define SET_TICKS(_m, q, p) \ @@ -818,7 +818,7 @@ ready_event(struct dn_flow_queue *q, struct mbuf **head, struct mbuf **tail) int p_was_empty ; lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED); - + if (p == NULL) { printf("dummynet: ready_event pipe is gone\n"); return ; @@ -985,7 +985,7 @@ ready_event_wfq(struct dn_pipe *p, struct mbuf **head, struct mbuf **tail) * queue on error hoping next time we are luckier. */ } - + /* Fit (adjust if necessary) 64bit result into 32bit variable. */ if (p_numbytes > INT_MAX) p->numbytes = INT_MAX; @@ -1024,9 +1024,9 @@ dummynet(__unused void * unused) heaps[2] = &extract_heap ; /* delay line */ lck_mtx_lock(dn_mutex); - - /* make all time measurements in milliseconds (ms) - - * here we convert secs and usecs to msecs (just divide the + + /* make all time measurements in milliseconds (ms) - + * here we convert secs and usecs to msecs (just divide the * usecs and take the closest whole number). 
*/ microuptime(&tv); @@ -1066,8 +1066,8 @@ dummynet(__unused void * unused) pe->sum -= q->fs->weight ; } - /* check the heaps to see if there's still stuff in there, and - * only set the timer if there are packets to process + /* check the heaps to see if there's still stuff in there, and + * only set the timer if there are packets to process */ timer_enabled = 0; for (i=0; i < 3 ; i++) { @@ -1080,10 +1080,10 @@ dummynet(__unused void * unused) break; } } - + if (head != NULL) serialize++; - + lck_mtx_unlock(dn_mutex); /* Send out the de-queued list of ready-to-send packets */ @@ -1136,7 +1136,7 @@ dummynet_send(struct mbuf *m) case DN_TO_IP6_IN: proto_inject(PF_INET6, m); break; -#endif /* INET6 */ +#endif /* INET6 */ default: printf("dummynet: bad switch %d!\n", pkt->dn_dir); m_freem(m); @@ -1146,7 +1146,7 @@ dummynet_send(struct mbuf *m) } - + /* * called by an interface when tx_rdy occurs. */ @@ -1156,9 +1156,9 @@ if_tx_rdy(struct ifnet *ifp) struct dn_pipe *p; struct mbuf *head = NULL, *tail = NULL; int i; - + lck_mtx_lock(dn_mutex); - + for (i = 0; i < HASHSIZE; i++) SLIST_FOREACH(p, &pipehash[i], next) if (p->ifp == ifp) @@ -1180,11 +1180,11 @@ if_tx_rdy(struct ifnet *ifp) p->numbytes = 0 ; /* mark ready for I/O */ ready_event_wfq(p, &head, &tail); } - + if (head != NULL) { serialize++; } - + lck_mtx_unlock(dn_mutex); /* Send out the de-queued list of ready-to-send packets */ @@ -1486,7 +1486,7 @@ locate_flowset(int fs_nr) SLIST_FOREACH(fs, &flowsethash[HASH(fs_nr)], next) if (fs->fs_nr == fs_nr) return fs ; - + return (NULL); } @@ -1561,13 +1561,13 @@ dummynet_io(struct mbuf *m, int pipe_nr, int dir, struct ip_fw_args *fwa, int cl lck_mtx_lock(dn_mutex); - /* make all time measurements in milliseconds (ms) - - * here we convert secs and usecs to msecs (just divide the + /* make all time measurements in milliseconds (ms) - + * here we convert secs and usecs to msecs (just divide the * usecs and take the closest whole number). 
*/ microuptime(&tv); curr_time = (tv.tv_sec * 1000) + (tv.tv_usec / 1000); - + /* * This is a dummynet rule, so we expect an O_PIPE or O_QUEUE rule. */ @@ -1577,8 +1577,8 @@ dummynet_io(struct mbuf *m, int pipe_nr, int dir, struct ip_fw_args *fwa, int cl fs = &(pipe->fs); } else fs = locate_flowset(pipe_nr); - - + + if (fs == NULL){ goto dropit ; /* this queue/pipe does not exist! */ } @@ -1649,7 +1649,7 @@ dummynet_io(struct mbuf *m, int pipe_nr, int dir, struct ip_fw_args *fwa, int cl if (fwa->fwa_dst) { if (fwa->fwa_dst == (struct sockaddr_in *)&fwa->fwa_ro->ro_dst) /* dst points into ro */ fwa->fwa_dst = (struct sockaddr_in *)&(pkt->dn_ro.ro_dst) ; - + bcopy (fwa->fwa_dst, &pkt->dn_dst, sizeof(pkt->dn_dst)); } } else if (dir == DN_TO_IP6_OUT) { @@ -1664,7 +1664,7 @@ dummynet_io(struct mbuf *m, int pipe_nr, int dir, struct ip_fw_args *fwa, int cl if (fwa->fwa_dst6) { if (fwa->fwa_dst6 == (struct sockaddr_in6 *)&fwa->fwa_ro6->ro_dst) /* dst points into ro */ fwa->fwa_dst6 = (struct sockaddr_in6 *)&(pkt->dn_ro6.ro_dst) ; - + bcopy (fwa->fwa_dst6, &pkt->dn_dst6, sizeof(pkt->dn_dst6)); } pkt->dn_origifp = fwa->fwa_origifp; @@ -1673,10 +1673,10 @@ dummynet_io(struct mbuf *m, int pipe_nr, int dir, struct ip_fw_args *fwa, int cl pkt->dn_unfragpartlen = fwa->fwa_unfragpartlen; if (fwa->fwa_exthdrs) { bcopy (fwa->fwa_exthdrs, &pkt->dn_exthdrs, sizeof(pkt->dn_exthdrs)); - /* + /* * Need to zero out the source structure so the mbufs * won't be freed by ip6_output() - */ + */ bzero(fwa->fwa_exthdrs, sizeof(struct ip6_exthdrs)); } } @@ -1774,7 +1774,7 @@ dummynet_io(struct mbuf *m, int pipe_nr, int dir, struct ip_fw_args *fwa, int cl } lck_mtx_unlock(dn_mutex); - + if (head != NULL) { dummynet_send(head); } @@ -2092,7 +2092,7 @@ config_pipe(struct dn_pipe *p) /* locate pipe */ b = locate_pipe(p->pipe_nr); - + if (b == NULL || b->pipe_nr != p->pipe_nr) { /* new pipe */ x = _MALLOC(sizeof(struct dn_pipe), M_DUMMYNET, M_DONTWAIT | M_ZERO) ; if (x == NULL) { @@ -2244,7 +2244,7 
@@ dummynet_drain(void) for (i = 0; i < HASHSIZE; i++) SLIST_FOREACH(p, &pipehash[i], next) { - purge_flow_set(&(p->fs), 0); + purge_flow_set(&(p->fs), 0); mnext = p->head; while ((m = mnext) != NULL) { @@ -2302,7 +2302,7 @@ delete_pipe(struct dn_pipe *p) pipe_remove_from_heap(&extract_heap, b); pipe_remove_from_heap(&wfq_ready_heap, b); lck_mtx_unlock(dn_mutex); - + FREE(b, M_DUMMYNET); } else { /* this is a WF2Q queue (dn_flow_set) */ struct dn_flow_set *b; @@ -2341,15 +2341,15 @@ delete_pipe(struct dn_pipe *p) /* * helper function used to copy data from kernel in DUMMYNET_GET */ -static +static char* dn_copy_set_32(struct dn_flow_set *set, char *bp) { int i, copied = 0 ; struct dn_flow_queue *q; struct dn_flow_queue_32 *qp = (struct dn_flow_queue_32 *)bp; - + lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED); - + for (i = 0 ; i <= set->rq_size ; i++) for (q = set->rq[i] ; q ; q = q->next, qp++ ) { if (q->hash_slot != i) @@ -2373,15 +2373,15 @@ char* dn_copy_set_32(struct dn_flow_set *set, char *bp) return (char *)qp ; } -static +static char* dn_copy_set_64(struct dn_flow_set *set, char *bp) { int i, copied = 0 ; struct dn_flow_queue *q; struct dn_flow_queue_64 *qp = (struct dn_flow_queue_64 *)bp; - + lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED); - + for (i = 0 ; i <= set->rq_size ; i++) for (q = set->rq[i] ; q ; q = q->next, qp++ ) { if (q->hash_slot != i) @@ -2585,16 +2585,17 @@ ip_dn_init(void) ip_dn_io_ptr = dummynet_io; bzero(&default_rule, sizeof default_rule); - +#if IPFIREWALL default_rule.act_ofs = 0; default_rule.rulenum = IPFW_DEFAULT_RULE; default_rule.cmd_len = 1; default_rule.set = RESVD_SET; default_rule.cmd[0].len = 1; - default_rule.cmd[0].opcode = + default_rule.cmd[0].opcode = #ifdef IPFIREWALL_DEFAULT_TO_ACCEPT (1) ? 
O_ACCEPT : #endif O_DENY; +#endif } diff --git a/bsd/netinet/ip_ecn.c b/bsd/netinet/ip_ecn.c index 70ce0cfb8..da773e9e7 100644 --- a/bsd/netinet/ip_ecn.c +++ b/bsd/netinet/ip_ecn.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2000, 2007, 2015 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* @@ -81,10 +81,7 @@ * modify outer ECN (TOS) field on ingress operation (tunnel encapsulation). */ void -ip_ecn_ingress(mode, outer, inner) - int mode; - u_int8_t *outer; - const u_int8_t *inner; +ip_ecn_ingress(int mode, u_int8_t *outer, const u_int8_t *inner) { if (!outer || !inner) panic("NULL pointer passed to ip_ecn_ingress"); @@ -105,10 +102,7 @@ ip_ecn_ingress(mode, outer, inner) * modify inner ECN (TOS) field on egress operation (tunnel decapsulation). 
*/ int -ip_ecn_egress(mode, outer, inner) - int mode; - const u_int8_t *outer; - u_int8_t *inner; +ip_ecn_egress(int mode, const u_int8_t *outer, u_int8_t *inner) { if (!outer || !inner) panic("NULL pointer passed to ip_ecn_egress"); @@ -133,10 +127,7 @@ ip_ecn_egress(mode, outer, inner) #if INET6 void -ip6_ecn_ingress(mode, outer, inner) - int mode; - u_int32_t *outer; - const u_int32_t *inner; +ip6_ecn_ingress(int mode, u_int32_t *outer, const u_int32_t *inner) { u_int8_t outer8, inner8; @@ -150,10 +141,7 @@ ip6_ecn_ingress(mode, outer, inner) } int -ip6_ecn_egress(mode, outer, inner) - int mode; - const u_int32_t *outer; - u_int32_t *inner; +ip6_ecn_egress(int mode, const u_int32_t *outer, u_int32_t *inner) { u_int8_t outer8, inner8; @@ -175,10 +163,7 @@ ip6_ecn_egress(mode, outer, inner) * on ingress operation (tunnel encapsulation). */ void -ip46_ecn_ingress(mode, outer, tos) - int mode; - u_int32_t *outer; - const u_int8_t *tos; +ip46_ecn_ingress(int mode, u_int32_t *outer, const u_int8_t *tos) { u_int8_t outer8; @@ -195,10 +180,7 @@ ip46_ecn_ingress(mode, outer, tos) * on egress operation (tunnel decapsulation). */ int -ip46_ecn_egress(mode, outer, tos) - int mode; - const u_int32_t *outer; - u_int8_t *tos; +ip46_ecn_egress(int mode, const u_int32_t *outer, u_int8_t *tos) { u_int8_t outer8; @@ -214,10 +196,7 @@ ip46_ecn_egress(mode, outer, tos) * on ingress operation (tunnel encapsulation). */ void -ip64_ecn_ingress(mode, outer, inner) - int mode; - u_int8_t *outer; - const u_int32_t *inner; +ip64_ecn_ingress(int mode, u_int8_t *outer, const u_int32_t *inner) { u_int8_t inner8; @@ -233,10 +212,7 @@ ip64_ecn_ingress(mode, outer, inner) * on egress operation (tunnel decapsulation). 
*/ int -ip64_ecn_egress(mode, outer, inner) - int mode; - const u_int8_t *outer; - u_int32_t *inner; +ip64_ecn_egress(int mode, const u_int8_t *outer, u_int32_t *inner) { u_int8_t inner8; diff --git a/bsd/netinet/ip_encap.c b/bsd/netinet/ip_encap.c index 6c4d33072..5276504a9 100644 --- a/bsd/netinet/ip_encap.c +++ b/bsd/netinet/ip_encap.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2000-2012 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. 
- * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* $FreeBSD: src/sys/netinet/ip_encap.c,v 1.1.2.2 2001/07/03 11:01:46 ume Exp $ */ @@ -169,9 +169,7 @@ encap6_init(struct ip6protosw *pp, struct domain *dp) #if INET void -encap4_input(m, off) - struct mbuf *m; - int off; +encap4_input(struct mbuf *m, int off) { int proto; struct ip *ip; @@ -333,10 +331,8 @@ encap6_input(struct mbuf **mp, int *offp, int proto) #endif static void -encap_add(ep) - struct encaptab *ep; +encap_add(struct encaptab *ep) { - LIST_INSERT_HEAD(&encaptab, ep, chain); } @@ -346,13 +342,9 @@ encap_add(ep) * Return value will be necessary as input (cookie) for encap_detach(). */ const struct encaptab * -encap_attach(af, proto, sp, sm, dp, dm, psw, arg) - int af; - int proto; - const struct sockaddr *sp, *sm; - const struct sockaddr *dp, *dm; - const struct protosw *psw; - void *arg; +encap_attach(int af, int proto, const struct sockaddr *sp, + const struct sockaddr *sm, const struct sockaddr *dp, + const struct sockaddr *dm, const struct protosw *psw, void *arg) { struct encaptab *ep; int error; @@ -415,12 +407,9 @@ encap_attach(af, proto, sp, sm, dp, dm, psw, arg) } const struct encaptab * -encap_attach_func(af, proto, func, psw, arg) - int af; - int proto; - int (*func)(const struct mbuf *, int, int, void *); - const struct protosw *psw; - void *arg; +encap_attach_func( int af, int proto, + int (*func)(const struct mbuf *, int, int, void *), + const struct protosw *psw, void *arg) { struct encaptab *ep; int error; @@ -453,8 +442,7 @@ encap_attach_func(af, proto, func, psw, arg) } int -encap_detach(cookie) - const struct encaptab *cookie; +encap_detach(const struct encaptab *cookie) { const struct encaptab *ep = cookie; struct encaptab *p; @@ -471,10 +459,8 @@ encap_detach(cookie) } static int -mask_match(ep, sp, dp) - const struct encaptab *ep; - const struct sockaddr *sp; - const struct sockaddr *dp; +mask_match(const struct encaptab *ep, const struct sockaddr *sp, + const struct sockaddr 
*dp) { struct sockaddr_storage s; struct sockaddr_storage d; @@ -546,8 +532,7 @@ encap_fillarg( } void * -encap_getarg(m) - struct mbuf *m; +encap_getarg(struct mbuf *m) { struct m_tag *tag; struct encaptabtag *et; diff --git a/bsd/netinet/ip_fw2.c b/bsd/netinet/ip_fw2.c index bfdc61964..63543fd6c 100644 --- a/bsd/netinet/ip_fw2.c +++ b/bsd/netinet/ip_fw2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2013 Apple Inc. All rights reserved. + * Copyright (c) 2004-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -83,6 +83,7 @@ #include #include +#include #include #include #include @@ -337,9 +338,6 @@ lck_mtx_t *ipfw_mutex = &ipfw_mutex_data; extern void ipfwsyslog( int level, const char *format,...); -#define KEV_LOG_SUBCLASS 10 -#define IPFWLOGEVENT 0 - #define ipfwstring "ipfw:" static size_t ipfwstringlen; @@ -470,7 +468,7 @@ is_icmp_query(struct ip *ip) #undef TT static int -Get32static_len() +Get32static_len(void) { int diff; int len = static_len_32; @@ -494,7 +492,7 @@ Get32static_len() } static int -Get64static_len() +Get64static_len(void) { int diff; int len = static_len_64; @@ -1104,7 +1102,7 @@ verify_rev_path(struct in_addr src, struct ifnet *ifp) dst->sin_len = sizeof(*dst); dst->sin_addr = src; - rtalloc_ign(&ro, RTF_CLONING|RTF_PRCLONING); + rtalloc_ign(&ro, RTF_CLONING|RTF_PRCLONING, false); } if (ro.ro_rt != NULL) { RT_LOCK_SPIN(ro.ro_rt); @@ -4089,4 +4087,3 @@ ipfw_init(void) } #endif /* IPFW2 */ - diff --git a/bsd/netinet/ip_icmp.c b/bsd/netinet/ip_icmp.c index cb0e43f7a..1e2877525 100644 --- a/bsd/netinet/ip_icmp.c +++ b/bsd/netinet/ip_icmp.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2015 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -182,9 +182,10 @@ static int icmpbmcastecho = 1; SYSCTL_INT(_net_inet_icmp, OID_AUTO, bmcastecho, CTLFLAG_RW | CTLFLAG_LOCKED, &icmpbmcastecho, 0, ""); - -#if ICMPPRINTFS -int icmpprintfs = 0; +#if (DEBUG | DEVELOPMENT) +static int icmpprintfs = 0; +SYSCTL_INT(_net_inet_icmp, OID_AUTO, verbose, CTLFLAG_RW | CTLFLAG_LOCKED, + &icmpprintfs, 0, ""); #endif static void icmp_reflect(struct mbuf *); @@ -213,8 +214,8 @@ icmp_error( oip = mtod(n, struct ip *); oiphlen = IP_VHL_HL(oip->ip_vhl) << 2; -#if ICMPPRINTFS - if (icmpprintfs) +#if (DEBUG | DEVELOPMENT) + if (icmpprintfs > 1) printf("icmp_error(0x%llx, %x, %d)\n", (uint64_t)VM_KERNEL_ADDRPERM(oip), type, code); #endif @@ -277,7 +278,7 @@ icmp_error( (oip->ip_len - oiphlen))); } else stdreply: icmpelen = max(ICMP_MINLEN, min(icmp_datalen, - (ntohs(oip->ip_len) - oiphlen))); + (oip->ip_len - oiphlen))); icmplen = min(oiphlen + icmpelen, min(nlen, oip->ip_len)); if (icmplen < sizeof(struct ip)) @@ -390,15 +391,15 @@ icmp_input(struct mbuf *m, int hlen) * Locate icmp structure in mbuf, and check * that not corrupted and of at least minimum length. 
*/ -#if ICMPPRINTFS - if (icmpprintfs) { - char buf[MAX_IPv4_STR_LEN]; - char ipv4str[MAX_IPv4_STR_LEN]; - - printf("icmp_input from %s to %s, len %d\n", - inet_ntop(AF_INET, &ip->ip_src, buf, sizeof(buf)), - inet_ntop(AF_INET, &ip->ip_dst, ipv4str, sizeof(ipv4str)), - icmplen); +#if (DEBUG | DEVELOPMENT) + if (icmpprintfs > 2) { + char src_str[MAX_IPv4_STR_LEN]; + char dst_str[MAX_IPv4_STR_LEN]; + + inet_ntop(AF_INET, &ip->ip_src, src_str, sizeof(src_str)); + inet_ntop(AF_INET, &ip->ip_dst, dst_str, sizeof(dst_str)); + printf("%s: from %s to %s, len %d\n", + __func__, src_str, dst_str, icmplen); } #endif if (icmplen < ICMP_MINLEN) { @@ -421,8 +422,8 @@ icmp_input(struct mbuf *m, int hlen) m->m_len += hlen; m->m_data -= hlen; -#if ICMPPRINTFS - if (icmpprintfs) +#if (DEBUG | DEVELOPMENT) + if (icmpprintfs > 2) printf("icmp_input, type %d code %d\n", icp->icmp_type, icp->icmp_code); #endif @@ -507,8 +508,9 @@ icmp_input(struct mbuf *m, int hlen) /* * Problem with datagram; advise higher level routines. */ - if (icmplen < ICMP_ADVLENMIN || icmplen < ICMP_ADVLEN(icp) || - IP_VHL_HL(icp->icmp_ip.ip_vhl) < (sizeof(struct ip) >> 2)) { + if (icmplen < ICMP_ADVLENMIN || icmplen < ICMP_ADVLEN(icp) + || IP_VHL_HL(icp->icmp_ip.ip_vhl) < + (sizeof(struct ip) >> 2)) { icmpstat.icps_badlen++; goto freeit; } @@ -520,14 +522,15 @@ icmp_input(struct mbuf *m, int hlen) /* Discard ICMP's in response to multicast packets */ if (IN_MULTICAST(ntohl(icp->icmp_ip.ip_dst.s_addr))) goto badcode; -#if ICMPPRINTFS - if (icmpprintfs) - printf("deliver to protocol %d\n", icp->icmp_ip.ip_p); +#if (DEBUG | DEVELOPMENT) + if (icmpprintfs > 2) + printf("deliver to protocol %d\n", + icp->icmp_ip.ip_p); #endif icmpsrc.sin_addr = icp->icmp_ip.ip_dst; /* - * XXX if the packet contains [IPv4 AH TCP], we can't make a + * if the packet contains [IPv4 AH TCP], we can't make a * notification to TCP layer. 
*/ ctlfunc = ip_protox[icp->icmp_ip.ip_p]->pr_ctlinput; @@ -541,11 +544,36 @@ icmp_input(struct mbuf *m, int hlen) break; case ICMP_ECHO: - if (!icmpbmcastecho - && (m->m_flags & (M_MCAST | M_BCAST)) != 0) { + if ((m->m_flags & (M_MCAST | M_BCAST))) { + if (icmpbmcastecho == 0) { + icmpstat.icps_bmcastecho++; + break; + } + } + + /* + * rdar://18644769 + * Do not reply when the destination is link local multicast or broadcast + * and the source is not from a directly connected subnet + */ + if ((IN_LOCAL_GROUP(ntohl(ip->ip_dst.s_addr)) || + in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) && + in_localaddr(ip->ip_src) == 0) { icmpstat.icps_bmcastecho++; +#if (DEBUG | DEVELOPMENT) + if (icmpprintfs > 0) { + char src_str[MAX_IPv4_STR_LEN]; + char dst_str[MAX_IPv4_STR_LEN]; + + inet_ntop(AF_INET, &ip->ip_src, src_str, sizeof(src_str)); + inet_ntop(AF_INET, &ip->ip_dst, dst_str, sizeof(dst_str)); + printf("%s: non local (B|M)CAST %s to %s, len %d\n", + __func__, src_str, dst_str, icmplen); + } +#endif break; } + icp->icmp_type = ICMP_ECHOREPLY; #if ICMP_BANDLIM if (badport_bandlim(BANDLIM_ICMP_ECHO) < 0) @@ -555,7 +583,6 @@ icmp_input(struct mbuf *m, int hlen) goto reflect; case ICMP_TSTAMP: - if (icmptimestamp == 0) break; @@ -659,14 +686,14 @@ icmp_input(struct mbuf *m, int hlen) */ icmpgw.sin_addr = ip->ip_src; icmpdst.sin_addr = icp->icmp_gwaddr; -#if ICMPPRINTFS - if (icmpprintfs) { - char buf[MAX_IPv4_STR_LEN]; - - printf("redirect dst %s to %s\n", - inet_ntop(AF_INET, &icp->icmp_ip.ip_dst, buf, sizeof(buf)), - inet_ntop(AF_INET, &icp->icmp_gwaddr, ipv4str, - sizeof(ipv4str))); +#if (DEBUG | DEVELOPMENT) + if (icmpprintfs > 0) { + char dst_str[MAX_IPv4_STR_LEN]; + char gw_str[MAX_IPv4_STR_LEN]; + + inet_ntop(AF_INET, &icp->icmp_ip.ip_dst, dst_str, sizeof(dst_str)); + inet_ntop(AF_INET, &icp->icmp_gwaddr, gw_str, sizeof(gw_str)); + printf("%s: redirect dst %s to %s\n", __func__, dst_str, gw_str); } #endif icmpsrc.sin_addr = icp->icmp_ip.ip_dst; @@ -806,8 +833,8 @@ 
icmp_reflect(struct mbuf *m) mtod(opts, struct in_addr *)->s_addr = 0; } if (opts) { -#if ICMPPRINTFS - if (icmpprintfs) +#if (DEBUG | DEVELOPMENT) + if (icmpprintfs > 1) printf("icmp_reflect optlen %d rt %d => ", optlen, opts->m_len); #endif @@ -844,8 +871,8 @@ icmp_reflect(struct mbuf *m) opts->m_len++; } } -#if ICMPPRINTFS - if (icmpprintfs) +#if (DEBUG | DEVELOPMENT) + if (icmpprintfs > 1) printf("%d\n", opts->m_len); #endif } @@ -881,7 +908,8 @@ icmp_send(struct mbuf *m, struct mbuf *opts) struct icmp *icp; struct route ro; struct ip_out_args ipoa = { IFSCOPE_NONE, { 0 }, - IPOAF_SELECT_SRCIF | IPOAF_BOUND_SRCADDR, 0 }; + IPOAF_SELECT_SRCIF | IPOAF_BOUND_SRCADDR, 0, + SO_TC_UNSPEC, _NET_SERVICE_TYPE_UNSPEC }; if (!(m->m_pkthdr.pkt_flags & PKTF_LOOP) && m->m_pkthdr.rcvif != NULL) { ipoa.ipoa_boundif = m->m_pkthdr.rcvif->if_index; @@ -899,14 +927,14 @@ icmp_send(struct mbuf *m, struct mbuf *opts) m->m_pkthdr.rcvif = NULL; m->m_pkthdr.csum_data = 0; m->m_pkthdr.csum_flags = 0; -#if ICMPPRINTFS - if (icmpprintfs) { - char buf[MAX_IPv4_STR_LEN]; - char ipv4str[MAX_IPv4_STR_LEN]; - - printf("icmp_send dst %s src %s\n", - inet_ntop(AF_INET, &ip->ip_dst, buf, sizeof(buf)), - inet_ntop(AF_INET, &ip->ip_src, ipv4str, sizeof(ipv4str))); +#if (DEBUG | DEVELOPMENT) + if (icmpprintfs > 2) { + char src_str[MAX_IPv4_STR_LEN]; + char dst_str[MAX_IPv4_STR_LEN]; + + inet_ntop(AF_INET, &ip->ip_src, src_str, sizeof(src_str)); + inet_ntop(AF_INET, &ip->ip_dst, dst_str, sizeof(dst_str)); + printf("%s: dst %s src %s\n", __func__, dst_str, src_str); } #endif bzero(&ro, sizeof ro); @@ -1138,9 +1166,6 @@ icmp_dgram_ctloutput(struct socket *so, struct sockopt *sopt) case IP_STRIPHDR: case IP_RECVTTL: case IP_BOUND_IF: -#if CONFIG_FORCE_OUT_IFP - case IP_FORCE_OUT_IFP: -#endif case IP_NO_IFT_CELLULAR: error = rip_ctloutput(so, sopt); break; diff --git a/bsd/netinet/ip_input.c b/bsd/netinet/ip_input.c index 994bbf8a3..35d7a9f20 100644 --- a/bsd/netinet/ip_input.c +++ 
b/bsd/netinet/ip_input.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2015 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -103,6 +103,7 @@ #include #include #include +#include #if PF #include #endif /* PF */ @@ -186,9 +187,12 @@ static u_int32_t ipq_count; /* current # of allocated ipq's */ static int sysctl_ipforwarding SYSCTL_HANDLER_ARGS; static int sysctl_maxnipq SYSCTL_HANDLER_ARGS; static int sysctl_maxfragsperpacket SYSCTL_HANDLER_ARGS; + +#if (DEBUG || DEVELOPMENT) static int sysctl_reset_ip_input_stats SYSCTL_HANDLER_ARGS; static int sysctl_ip_input_measure_bins SYSCTL_HANDLER_ARGS; static int sysctl_ip_input_getperf SYSCTL_HANDLER_ARGS; +#endif /* (DEBUG || DEVELOPMENT) */ int ipforwarding = 0; SYSCTL_PROC(_net_inet_ip, IPCTL_FORWARDING, forwarding, @@ -231,10 +235,6 @@ SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragsperpacket, sysctl_maxfragsperpacket, "I", "Maximum number of IPv4 fragments allowed per packet"); -int ip_doscopedroute = 1; -SYSCTL_INT(_net_inet_ip, OID_AUTO, scopedroute, CTLFLAG_RD | CTLFLAG_LOCKED, - &ip_doscopedroute, 0, "Enable IPv4 scoped routing"); - static uint32_t ip_adj_clear_hwcksum = 0; SYSCTL_UINT(_net_inet_ip, OID_AUTO, adj_clear_hwcksum, CTLFLAG_RW | CTLFLAG_LOCKED, &ip_adj_clear_hwcksum, 0, @@ -265,6 +265,7 @@ static int ip_chainsz = 6; SYSCTL_INT(_net_inet_ip, OID_AUTO, rx_chainsz, CTLFLAG_RW | CTLFLAG_LOCKED, &ip_chainsz, 1, "IP receive side max chaining"); +#if (DEBUG || DEVELOPMENT) static int ip_input_measure = 0; SYSCTL_PROC(_net_inet_ip, OID_AUTO, input_perf, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, @@ -281,6 +282,7 @@ SYSCTL_PROC(_net_inet_ip, OID_AUTO, input_perf_data, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, sysctl_ip_input_getperf, "S,net_perf", "IP input performance data (struct net_perf, net/net_perf.h)"); +#endif /* (DEBUG || DEVELOPMENT) */ #if DIAGNOSTIC static int ipprintfs = 0; @@ -423,6 +425,25 @@ 
SYSCTL_INT(_net_inet_ip, OID_AUTO, random_id, CTLFLAG_RW | CTLFLAG_LOCKED, */ static gre_input_func_t gre_input_func; +static void +ip_init_delayed(void) +{ + struct ifreq ifr; + int error; + struct sockaddr_in *sin; + + bzero(&ifr, sizeof(ifr)); + strlcpy(ifr.ifr_name, "lo0", sizeof(ifr.ifr_name)); + sin = (struct sockaddr_in *)(void *)&ifr.ifr_addr; + sin->sin_len = sizeof(struct sockaddr_in); + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK); + error = in_control(NULL, SIOCSIFADDR, (caddr_t)&ifr, lo_ifp, kernproc); + if (error) + printf("%s: failed to initialise lo0's address, error=%d\n", + __func__, error); +} + /* * IP initialization: fill in IP protocol switch table. * All protocols not implemented in kernel go to raw IP protocol handler. @@ -451,9 +472,6 @@ ip_init(struct protosw *pp, struct domain *dp) return; ip_initialized = 1; - PE_parse_boot_argn("net.inet.ip.scopedroute", - &ip_doscopedroute, sizeof (ip_doscopedroute)); - in_ifaddr_init(); in_ifaddr_rwlock_grp_attr = lck_grp_attr_alloc_init(); @@ -524,6 +542,7 @@ ip_init(struct protosw *pp, struct domain *dp) #endif arp_init(); + net_init_add(ip_init_delayed); } /* @@ -730,10 +749,10 @@ ip_input_second_pass_loop_tbl(pktchain_elm_t *tbl, struct ip_fw_in_args *args) ipstat.ips_rxc_chainsz_gt2++; if (tbl[i].pkte_npkts > 4) ipstat.ips_rxc_chainsz_gt4++; - +#if (DEBUG || DEVELOPMENT) if (ip_input_measure) net_perf_histogram(&net_perf, tbl[i].pkte_npkts); - +#endif /* (DEBUG || DEVELOPMENT) */ tbl[i].pkte_head = tbl[i].pkte_tail = NULL; tbl[i].pkte_npkts = 0; tbl[i].pkte_nbytes = 0; @@ -1655,15 +1674,20 @@ ip_input_process_list(struct mbuf *packet_list) int retval = 0; u_int32_t div_info = 0; int ours = 0; +#if (DEBUG || DEVELOPMENT) struct timeval start_tv; +#endif /* (DEBUG || DEVELOPMENT) */ int num_pkts = 0; int chain = 0; struct ip_fw_in_args args; if (ip_chaining == 0) { struct mbuf *m = packet_list; +#if (DEBUG || DEVELOPMENT) if (ip_input_measure) 
net_perf_start_time(&net_perf, &start_tv); +#endif /* (DEBUG || DEVELOPMENT) */ + while (m) { packet_list = mbuf_nextpkt(m); mbuf_setnextpkt(m, NULL); @@ -1671,12 +1695,16 @@ ip_input_process_list(struct mbuf *packet_list) m = packet_list; num_pkts++; } +#if (DEBUG || DEVELOPMENT) if (ip_input_measure) net_perf_measure_time(&net_perf, &start_tv, num_pkts); +#endif /* (DEBUG || DEVELOPMENT) */ return; } +#if (DEBUG || DEVELOPMENT) if (ip_input_measure) net_perf_start_time(&net_perf, &start_tv); +#endif /* (DEBUG || DEVELOPMENT) */ bzero(&pktchain_tbl, sizeof(pktchain_tbl)); restart_list_process: @@ -1726,9 +1754,10 @@ ip_input_process_list(struct mbuf *packet_list) * equivalent update in chaining case if performed in * ip_input_second_pass_loop_tbl(). */ +#if (DEBUG || DEVELOPMENT) if (ip_input_measure) net_perf_histogram(&net_perf, 1); - +#endif /* (DEBUG || DEVELOPMENT) */ ip_input_second_pass(packet, packet->m_pkthdr.rcvif, div_info, 1, packet->m_pkthdr.len, &args, ours); } @@ -1736,8 +1765,10 @@ ip_input_process_list(struct mbuf *packet_list) if (packet_list) goto restart_list_process; +#if (DEBUG || DEVELOPMENT) if (ip_input_measure) net_perf_measure_time(&net_perf, &start_tv, num_pkts); +#endif /* (DEBUG || DEVELOPMENT) */ } /* * Ip input routine. Checksum and byte swap header. 
If fragmented @@ -3736,7 +3767,8 @@ ip_forward(struct mbuf *m, int srcrt, struct sockaddr_in *next_hop) n_long dest; struct in_addr pkt_dst; u_int32_t nextmtu = 0, len; - struct ip_out_args ipoa = { IFSCOPE_NONE, { 0 }, 0, 0 }; + struct ip_out_args ipoa = { IFSCOPE_NONE, { 0 }, 0, 0, + SO_TC_UNSPEC, _NET_SERVICE_TYPE_UNSPEC }; struct ifnet *rcvifp = m->m_pkthdr.rcvif; #if IPSEC struct secpolicy *sp = NULL; @@ -4355,6 +4387,7 @@ ip_gre_register_input(gre_input_func_t fn) return (0); } +#if (DEBUG || DEVELOPMENT) static int sysctl_reset_ip_input_stats SYSCTL_HANDLER_ARGS { @@ -4408,4 +4441,4 @@ sysctl_ip_input_getperf SYSCTL_HANDLER_ARGS return (SYSCTL_OUT(req, &net_perf, MIN(sizeof (net_perf), req->oldlen))); } - +#endif /* (DEBUG || DEVELOPMENT) */ diff --git a/bsd/netinet/ip_output.c b/bsd/netinet/ip_output.c index 2788b0e79..741bde155 100644 --- a/bsd/netinet/ip_output.c +++ b/bsd/netinet/ip_output.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2014 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -104,6 +104,7 @@ #include #include #include +#include #if CONFIG_MACF_NET #include @@ -345,9 +346,14 @@ ip_output_list(struct mbuf *m0, int packetchain, struct mbuf *opt, uint32_t raw; } ipobf = { .raw = 0 }; +/* + * Here we check for restrictions when sending frames. + * N.B.: IPv4 over internal co-processor interfaces is not allowed. 
+ */ #define IP_CHECK_RESTRICTIONS(_ifp, _ipobf) \ (((_ipobf).nocell && IFNET_IS_CELLULAR(_ifp)) || \ ((_ipobf).noexpensive && IFNET_IS_EXPENSIVE(_ifp)) || \ + (IFNET_IS_INTCOPROC(_ifp)) || \ (!(_ipobf).awdl_unrestricted && IFNET_IS_AWDL_RESTRICTED(_ifp))) if (ip_output_measure) @@ -434,10 +440,10 @@ ip_output_list(struct mbuf *m0, int packetchain, struct mbuf *opt, } } #endif /* IPSEC */ - + VERIFY(ro != NULL); - if (ip_doscopedroute && (flags & IP_OUTARGS)) { + if (flags & IP_OUTARGS) { /* * In the forwarding case, only the ifscope value is used, * as source interface selection doesn't take place. @@ -484,7 +490,7 @@ ip_output_list(struct mbuf *m0, int packetchain, struct mbuf *opt, adv->code = FADV_SUCCESS; ipoa->ipoa_retflags = 0; } - + #if IPSEC if (ipsec_bypass == 0 && !(flags & IP_NOIPSEC)) { so = ipsec_getsocket(m); @@ -657,6 +663,7 @@ ip_output_list(struct mbuf *m0, int packetchain, struct mbuf *opt, if (ia == NULL) { OSAddAtomic(1, &ipstat.ips_noroute); error = ENETUNREACH; + /* XXX IPv6 APN fallback notification?? */ goto bad; } } @@ -704,11 +711,11 @@ ip_output_list(struct mbuf *m0, int packetchain, struct mbuf *opt, /* * If the source address belongs to a restricted - * interface and the caller forbids our using + * interface and the caller forbids our using * interfaces of such type, pretend that there is no * route. */ - if (ia0 != NULL && + if (ia0 != NULL && IP_CHECK_RESTRICTIONS(ia0->ifa_ifp, ipobf)) { IFA_REMREF(ia0); ia0 = NULL; @@ -801,7 +808,7 @@ ip_output_list(struct mbuf *m0, int packetchain, struct mbuf *opt, rtalloc_scoped_ign(ro, ign, ifscope); /* - * If the route points to a cellular/expensive interface + * If the route points to a cellular/expensive interface * and the caller forbids our using interfaces of such type, * pretend that there is no route. 
*/ @@ -1237,7 +1244,6 @@ ip_output_list(struct mbuf *m0, int packetchain, struct mbuf *opt, goto bad; } } - break; } default: break; @@ -1701,7 +1707,7 @@ ip_output_list(struct mbuf *m0, int packetchain, struct mbuf *opt, ROUTE_RELEASE(ro_fwd); bcopy(dst, &ro_fwd->ro_dst, sizeof (*dst)); - rtalloc_ign(ro_fwd, RTF_PRCLONING); + rtalloc_ign(ro_fwd, RTF_PRCLONING, false); if (ro_fwd->ro_rt == NULL) { OSAddAtomic(1, &ipstat.ips_noroute); @@ -1773,6 +1779,31 @@ ip_output_list(struct mbuf *m0, int packetchain, struct mbuf *opt, goto bad; } + if (ipoa != NULL) { + u_int8_t dscp = ip->ip_tos >> IPTOS_DSCP_SHIFT; + + error = set_packet_qos(m, ifp, + ipoa->ipoa_flags & IPOAF_QOSMARKING_ALLOWED ? TRUE : FALSE, + ipoa->ipoa_sotc, ipoa->ipoa_netsvctype, &dscp); + if (error == 0) { + ip->ip_tos &= IPTOS_ECN_MASK; + ip->ip_tos |= dscp << IPTOS_DSCP_SHIFT; + } else { + printf("%s if_dscp_for_mbuf() error %d\n", __func__, error); + error = 0; + } + } + + /* + * Some Wi-Fi AP implementations do not correctly handle multicast IP + * packets with DSCP bits set -- see radr://9331522 -- so as a + * workaround we clear the DSCP bits and set the service class to BE + */ + if (IN_MULTICAST(ntohl(pkt_dst.s_addr)) && IFNET_IS_WIFI_INFRA(ifp)) { + ip->ip_tos &= IPTOS_ECN_MASK; + mbuf_set_service_class(m, MBUF_SC_BE); + } + ip_output_checksum(ifp, m, (IP_VHL_HL(ip->ip_vhl) << 2), ip->ip_len, &sw_csum); @@ -2478,70 +2509,6 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt) } break; #undef OPTSET - -#if CONFIG_FORCE_OUT_IFP - /* - * Apple private interface, similar to IP_BOUND_IF, except - * that the parameter is a NULL-terminated string containing - * the name of the network interface; an emptry string means - * unbind. Applications are encouraged to use IP_BOUND_IF - * instead, as that is the current "official" API. 
- */ - case IP_FORCE_OUT_IFP: { - char ifname[IFNAMSIZ]; - unsigned int ifscope; - - /* This option is settable only for IPv4 */ - if (!(inp->inp_vflag & INP_IPV4)) { - error = EINVAL; - break; - } - - /* Verify interface name parameter is sane */ - if (sopt->sopt_valsize > sizeof (ifname)) { - error = EINVAL; - break; - } - - /* Copy the interface name */ - if (sopt->sopt_valsize != 0) { - error = sooptcopyin(sopt, ifname, - sizeof (ifname), sopt->sopt_valsize); - if (error) - break; - } - - if (sopt->sopt_valsize == 0 || ifname[0] == '\0') { - /* Unbind this socket from any interface */ - ifscope = IFSCOPE_NONE; - } else { - ifnet_t ifp; - - /* Verify name is NULL terminated */ - if (ifname[sopt->sopt_valsize - 1] != '\0') { - error = EINVAL; - break; - } - - /* Bail out if given bogus interface name */ - if (ifnet_find_by_name(ifname, &ifp) != 0) { - error = ENXIO; - break; - } - - /* Bind this socket to this interface */ - ifscope = ifp->if_index; - - /* - * Won't actually free; since we don't release - * this later, we should do it now. - */ - ifnet_release(ifp); - } - error = inp_bindif(inp, ifscope, NULL); - } - break; -#endif /* CONFIG_FORCE_OUT_IFP */ /* * Multicast socket options are processed by the in_mcast * module. @@ -2602,7 +2569,7 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt) int priv; struct mbuf *m; int optname; - + if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */ break; if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */ @@ -2796,11 +2763,10 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt) #if TRAFFIC_MGT case IP_TRAFFIC_MGT_BACKGROUND: { - unsigned background = (so->so_traffic_mgt_flags & - TRAFFIC_MGT_SO_BACKGROUND) ? 1 : 0; + unsigned background = (so->so_flags1 & + SOF1_TRAFFIC_MGT_SO_BACKGROUND) ? 
1 : 0; return (sooptcopyout(sopt, &background, sizeof (background))); - break; } #endif /* TRAFFIC_MGT */ @@ -3577,4 +3543,3 @@ sysctl_ip_output_getperf SYSCTL_HANDLER_ARGS return (SYSCTL_OUT(req, &net_perf, MIN(sizeof (net_perf), req->oldlen))); } - diff --git a/bsd/netinet/ip_var.h b/bsd/netinet/ip_var.h index 99982111d..cc54d85d8 100644 --- a/bsd/netinet/ip_var.h +++ b/bsd/netinet/ip_var.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2014 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -227,6 +227,7 @@ struct ipstat { u_int32_t ips_rxc_chainsz_gt2; /* rx chain size greater than 2 */ u_int32_t ips_rxc_chainsz_gt4; /* rx chain size greater than 4 */ u_int32_t ips_rxc_notlist; /* count of pkts through ip_input */ + u_int32_t ips_raw_sappend_fail; /* sock append failed */ }; @@ -294,8 +295,11 @@ struct ip_out_args { #define IPOAF_NO_EXPENSIVE 0x00000020 /* skip IFT_EXPENSIVE */ #define IPOAF_AWDL_UNRESTRICTED 0x00000040 /* can send over AWDL_RESTRICTED */ +#define IPOAF_QOSMARKING_ALLOWED 0x00000080 /* policy allows Fastlane DSCP marking */ u_int32_t ipoa_retflags; /* IPOARF return flags (see below) */ #define IPOARF_IFDENIED 0x00000001 /* denied access to interface */ + int ipoa_sotc; /* traffic class for Fastlane DSCP mapping */ + int ipoa_netsvctype; /* network service type */ }; extern struct ipstat ipstat; @@ -305,7 +309,6 @@ extern int ip_defttl; /* default IP ttl */ extern int ipforwarding; /* ip forwarding */ extern struct protosw *ip_protox[]; extern struct pr_usrreqs rip_usrreqs; -extern int ip_doscopedroute; extern void ip_moptions_init(void); extern struct ip_moptions *ip_allocmoptions(int); diff --git a/bsd/netinet/kpi_ipfilter.c b/bsd/netinet/kpi_ipfilter.c index 87250037e..a63d4a583 100644 --- a/bsd/netinet/kpi_ipfilter.c +++ b/bsd/netinet/kpi_ipfilter.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2014 Apple Inc. All rights reserved. 
+ * Copyright (c) 2004-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -293,7 +293,8 @@ ipf_injectv4_out(mbuf_t data, ipfilter_t filter_ref, ipf_pktopts_t options) errno_t error = 0; struct m_tag *mtag = NULL; struct ip_moptions *imo = NULL; - struct ip_out_args ipoa = { IFSCOPE_NONE, { 0 }, 0, 0 }; + struct ip_out_args ipoa = { IFSCOPE_NONE, { 0 }, 0, 0, + SO_TC_UNSPEC, _NET_SERVICE_TYPE_UNSPEC }; /* Make the IP header contiguous in the mbuf */ if ((size_t)m->m_len < sizeof (struct ip)) { @@ -369,7 +370,8 @@ ipf_injectv6_out(mbuf_t data, ipfilter_t filter_ref, ipf_pktopts_t options) errno_t error = 0; struct m_tag *mtag = NULL; struct ip6_moptions *im6o = NULL; - struct ip6_out_args ip6oa = { IFSCOPE_NONE, { 0 }, 0, 0 }; + struct ip6_out_args ip6oa = { IFSCOPE_NONE, { 0 }, 0, 0, + SO_TC_UNSPEC, _NET_SERVICE_TYPE_UNSPEC }; /* Make the IP header contiguous in the mbuf */ if ((size_t)m->m_len < sizeof(struct ip6_hdr)) { diff --git a/bsd/netinet/mptcp.c b/bsd/netinet/mptcp.c index 71ea9f4a8..264c9d7c1 100644 --- a/bsd/netinet/mptcp.c +++ b/bsd/netinet/mptcp.c @@ -222,10 +222,10 @@ mptcp_input(struct mptses *mpte, struct mbuf *m) * In the degraded fallback case, data is accepted without DSS map */ if (in_fallback) { -fallback: - /* - * assume degraded flow as this may be the first packet - * without DSS, and the subflow state is not updated yet. +fallback: + /* + * assume degraded flow as this may be the first packet + * without DSS, and the subflow state is not updated yet. 
*/ if (sbappendstream(&mp_so->so_rcv, m)) sorwakeup(mp_so); @@ -621,7 +621,7 @@ struct mptsub * mptcp_get_pending_subflow(struct mptses *mpte, struct mptsub *ignore) { struct mptsub *mpts = NULL; - + MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */ TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) { @@ -692,9 +692,6 @@ mptcp_state_to_str(mptcp_state_t state) case MPTCPS_TIME_WAIT: c = "MPTCPS_TIME_WAIT"; break; - case MPTCPS_FASTCLOSE_WAIT: - c = "MPTCPS_FASTCLOSE_WAIT"; - break; case MPTCPS_TERMINATE: c = "MPTCPS_TERMINATE"; break; @@ -708,7 +705,7 @@ mptcp_close_fsm(struct mptcb *mp_tp, uint32_t event) MPT_LOCK_ASSERT_HELD(mp_tp); mptcp_state_t old_state = mp_tp->mpt_state; - DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp, + DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp, uint32_t, event); switch (mp_tp->mpt_state) { @@ -721,27 +718,26 @@ mptcp_close_fsm(struct mptcb *mp_tp, uint32_t event) if (event == MPCE_CLOSE) { mp_tp->mpt_state = MPTCPS_FIN_WAIT_1; mp_tp->mpt_sndmax += 1; /* adjust for Data FIN */ - } - else if (event == MPCE_RECV_DATA_FIN) { + } else if (event == MPCE_RECV_DATA_FIN) { mp_tp->mpt_rcvnxt += 1; /* adj remote data FIN */ mp_tp->mpt_state = MPTCPS_CLOSE_WAIT; - } + } break; case MPTCPS_CLOSE_WAIT: if (event == MPCE_CLOSE) { mp_tp->mpt_state = MPTCPS_LAST_ACK; mp_tp->mpt_sndmax += 1; /* adjust for Data FIN */ - } + } break; case MPTCPS_FIN_WAIT_1: - if (event == MPCE_RECV_DATA_ACK) + if (event == MPCE_RECV_DATA_ACK) { mp_tp->mpt_state = MPTCPS_FIN_WAIT_2; - else if (event == MPCE_RECV_DATA_FIN) { + } else if (event == MPCE_RECV_DATA_FIN) { mp_tp->mpt_rcvnxt += 1; /* adj remote data FIN */ mp_tp->mpt_state = MPTCPS_CLOSING; - } + } break; case MPTCPS_CLOSING: @@ -758,25 +754,19 @@ mptcp_close_fsm(struct mptcb *mp_tp, uint32_t event) if (event == MPCE_RECV_DATA_FIN) { mp_tp->mpt_rcvnxt += 1; /* adj remote data FIN */ mp_tp->mpt_state = MPTCPS_TIME_WAIT; - } + } break; case MPTCPS_TIME_WAIT: break; - case MPTCPS_FASTCLOSE_WAIT: 
- if (event == MPCE_CLOSE) { - /* no need to adjust for data FIN */ - mp_tp->mpt_state = MPTCPS_TERMINATE; - } - break; case MPTCPS_TERMINATE: break; default: VERIFY(0); /* NOTREACHED */ } - DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp, + DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp, uint32_t, event); mptcplog((LOG_INFO, "MPTCP State: %s to %s on event %s\n", mptcp_state_to_str(old_state), @@ -805,7 +795,7 @@ mptcp_data_ack_rcvd(struct mptcb *mp_tp, struct tcpcb *tp, u_int64_t full_dack) /* bring back sndnxt to retransmit MPTCP data */ mp_tp->mpt_sndnxt = mp_tp->mpt_dsn_at_csum_fail; mp_tp->mpt_flags |= MPTCPF_POST_FALLBACK_SYNC; - tp->t_inpcb->inp_socket->so_flags1 |= + tp->t_inpcb->inp_socket->so_flags1 |= SOF1_POST_FALLBACK_SYNC; } } @@ -1021,7 +1011,7 @@ mptcp_input_csum(struct tcpcb *tp, struct mbuf *m, int off) if (tp->t_mpflags & TMPF_TCP_FALLBACK) return (0); - /* + /* * The remote side may send a packet with fewer bytes than the * claimed DSS checksum length. */ diff --git a/bsd/netinet/mptcp_opt.c b/bsd/netinet/mptcp_opt.c index 834a26e20..f8611236b 100644 --- a/bsd/netinet/mptcp_opt.c +++ b/bsd/netinet/mptcp_opt.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012-2014 Apple Inc. All rights reserved. + * Copyright (c) 2012-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -39,7 +39,7 @@ #include #include #include -#include +#include #include #include #include @@ -56,8 +56,8 @@ /* * SYSCTL for enforcing 64 bit dsn */ -int32_t force_64bit_dsn = 0; -SYSCTL_INT(_net_inet_mptcp, OID_AUTO, force_64bit_dsn, +int32_t force_64bit_dsn = 0; +SYSCTL_INT(_net_inet_mptcp, OID_AUTO, force_64bit_dsn, CTLFLAG_RW|CTLFLAG_LOCKED, &force_64bit_dsn, 0, "Force MPTCP 64bit dsn"); @@ -77,14 +77,21 @@ mptcp_setup_first_subflow_syn_opts(struct socket *so, int flags, u_char *opt, struct mptcb *mp_tp = NULL; mp_tp = tptomptp(tp); - if (!(so->so_flags & SOF_MP_SUBFLOW)) - return (optlen); - /* * Avoid retransmitting the MP_CAPABLE option. 
*/ - if (tp->t_rxtshift > mptcp_mpcap_retries) + if (tp->t_rxtshift > mptcp_mpcap_retries) { + if (!(mp_tp->mpt_flags & (MPTCPF_FALLBACK_HEURISTIC | MPTCPF_HEURISTIC_TRAC))) { + mp_tp->mpt_flags |= MPTCPF_HEURISTIC_TRAC; + tcp_heuristic_mptcp_loss(tp); + } return (optlen); + } + + if (!tcp_heuristic_do_mptcp(tp)) { + mp_tp->mpt_flags |= MPTCPF_FALLBACK_HEURISTIC; + return (optlen); + } if ((flags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) { struct mptcp_mpcapable_opt_rsp mptcp_opt; @@ -191,29 +198,28 @@ mptcp_setup_join_subflow_syn_opts(struct socket *so, int flags, u_char *opt, optlen += mpjoin_rsp.mmjo_len; } else { struct mptcp_mpjoin_opt_req mpjoin_req; + bzero(&mpjoin_req, sizeof (mpjoin_req)); mpjoin_req.mmjo_kind = TCPOPT_MULTIPATH; mpjoin_req.mmjo_len = sizeof (mpjoin_req); mpjoin_req.mmjo_subtype_bkp = MPO_JOIN << 4; - /* A secondary subflow is started off as backup */ - mpjoin_req.mmjo_subtype_bkp |= MPTCP_BACKUP; - tp->t_mpflags |= TMPF_BACKUP_PATH; + if (tp->t_mpflags & TMPF_BACKUP_PATH) + mpjoin_req.mmjo_subtype_bkp |= MPTCP_BACKUP; mpjoin_req.mmjo_addr_id = tp->t_local_aid; mpjoin_req.mmjo_peer_token = mptcp_get_remotetoken(tp->t_mptcb); if (mpjoin_req.mmjo_peer_token == 0) { mptcplog((LOG_DEBUG, "MPTCP Socket: %s: peer token 0", __func__), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR); - } + } mptcp_get_rands(tp->t_local_aid, tptomptp(tp), &mpjoin_req.mmjo_rand, NULL); memcpy(opt + optlen, &mpjoin_req, mpjoin_req.mmjo_len); optlen += mpjoin_req.mmjo_len; /* send an event up, if Fast Join is requested */ - if (mptcp_zerortt_fastjoin && + if (mptcp_zerortt_fastjoin && (so->so_flags & SOF_MPTCP_FASTJOIN)) { - soevent(so, - (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPFASTJ)); + soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPFASTJ)); } } return (optlen); @@ -248,11 +254,6 @@ mptcp_setup_syn_opts(struct socket *so, int flags, u_char *opt, unsigned optlen) { unsigned new_optlen; - if (mptcp_enable == 0) { - /* do nothing */ - return (optlen); - } - if (!(so->so_flags & 
SOF_MP_SEC_SUBFLOW)) { new_optlen = mptcp_setup_first_subflow_syn_opts(so, flags, opt, optlen); @@ -287,7 +288,7 @@ mptcp_send_mpfail(struct tcpcb *tp, u_char *opt, unsigned int optlen) if ((MAX_TCPOPTLEN - optlen) < sizeof (struct mptcp_mpfail_opt)) { tp->t_mpflags &= ~TMPF_SND_MPFAIL; return (optlen); - } + } MPT_LOCK(mp_tp); dsn = mp_tp->mpt_rcvnxt; @@ -302,7 +303,7 @@ mptcp_send_mpfail(struct tcpcb *tp, u_char *opt, unsigned int optlen) optlen += len; tp->t_mpflags &= ~TMPF_SND_MPFAIL; mptcplog((LOG_DEBUG, "MPTCP Socket: %s: %d \n", __func__, - tp->t_local_aid), (MPTCP_SOCKET_DBG | MPTCP_SENDER_DBG), + tp->t_local_aid), (MPTCP_SOCKET_DBG | MPTCP_SENDER_DBG), MPTCP_LOGLVL_LOG); return (optlen); } @@ -381,7 +382,7 @@ mptcp_send_infinite_mapping(struct tcpcb *tp, u_char *opt, unsigned int optlen) optlen += csum_len; } - mptcplog((LOG_DEBUG, "MPTCP Socket: %s: dsn = %x, seq = %x len = %x\n", + mptcplog((LOG_DEBUG, "MPTCP Socket: %s: dsn = %x, seq = %x len = %x\n", __func__, ntohl(infin_opt.mdss_dsn), ntohl(infin_opt.mdss_subflow_seqn), @@ -412,37 +413,6 @@ mptcp_ok_to_fin(struct tcpcb *tp, u_int64_t dsn, u_int32_t datalen) return (0); } - -/* Must be called from tcp_output to fill in the fast close option */ -static int -mptcp_send_fastclose(struct tcpcb *tp, u_char *opt, unsigned int optlen, - int flags) -{ - struct mptcp_fastclose_opt fastclose_opt; - struct mptcb *mp_tp = tptomptp(tp); - - /* Only ACK flag should be set */ - if (flags != TH_ACK) - return (optlen); - - if ((MAX_TCPOPTLEN - optlen) < - sizeof (struct mptcp_fastclose_opt)) { - return (optlen); - } - - bzero(&fastclose_opt, sizeof (struct mptcp_fastclose_opt)); - fastclose_opt.mfast_kind = TCPOPT_MULTIPATH; - fastclose_opt.mfast_len = sizeof (struct mptcp_fastclose_opt); - fastclose_opt.mfast_subtype = MPO_FASTCLOSE; - MPT_LOCK_SPIN(mp_tp); - fastclose_opt.mfast_key = mptcp_get_remotekey(mp_tp); - MPT_UNLOCK(mp_tp); - memcpy(opt + optlen, &fastclose_opt, fastclose_opt.mfast_len); - optlen += 
fastclose_opt.mfast_len; - - return (optlen); -} - unsigned int mptcp_setup_opts(struct tcpcb *tp, int32_t off, u_char *opt, unsigned int optlen, int flags, int datalen, @@ -486,12 +456,6 @@ mptcp_setup_opts(struct tcpcb *tp, int32_t off, u_char *opt, goto ret_optlen; } - if (tp->t_mpflags & TMPF_FASTCLOSE) { - optlen = mptcp_send_fastclose(tp, opt, optlen, flags); - VERIFY(datalen == 0); - goto ret_optlen; - } - if (tp->t_mpflags & TMPF_TCP_FALLBACK) { if (tp->t_mpflags & TMPF_SND_MPFAIL) optlen = mptcp_send_mpfail(tp, opt, optlen); @@ -500,11 +464,6 @@ mptcp_setup_opts(struct tcpcb *tp, int32_t off, u_char *opt, goto ret_optlen; } - if (tp->t_mpflags & TMPF_SND_MPPRIO) { - optlen = mptcp_snd_mpprio(tp, opt, optlen); - goto ret_optlen; - } - if (((tp->t_mpflags & TMPF_FASTJOINBY2_SEND) || (tp->t_mpflags & TMPF_FASTJOIN_SEND )) && (datalen > 0)) { @@ -548,18 +507,6 @@ mptcp_setup_opts(struct tcpcb *tp, int32_t off, u_char *opt, /* its a retransmission of the MP_CAPABLE ACK */ } goto ret_optlen; - } else if (tp->t_mpflags & TMPF_MPTCP_TRUE) { - if (tp->t_mpflags & TMPF_SND_REM_ADDR) { - int rem_opt_len = sizeof (struct mptcp_remaddr_opt); - if ((optlen + rem_opt_len) <= MAX_TCPOPTLEN) { - mptcp_send_remaddr_opt(tp, - (struct mptcp_remaddr_opt *)(opt + optlen)); - optlen += rem_opt_len; - goto ret_optlen; - } else { - tp->t_mpflags &= ~TMPF_SND_REM_ADDR; - } - } } if ((tp->t_mpflags & TMPF_JOINED_FLOW) && @@ -567,8 +514,8 @@ mptcp_setup_opts(struct tcpcb *tp, int32_t off, u_char *opt, (!(tp->t_mpflags & TMPF_RECVD_JOIN)) && (tp->t_mpflags & TMPF_SENT_JOIN) && (!(tp->t_mpflags & TMPF_MPTCP_TRUE))) { - MPT_LOCK(mp_tp); - if (mptcp_get_localkey(mp_tp) == 0) { + MPT_LOCK(mp_tp); + if (mptcp_get_localkey(mp_tp) == 0) { MPT_UNLOCK(mp_tp); goto ret_optlen; } @@ -587,18 +534,34 @@ mptcp_setup_opts(struct tcpcb *tp, int32_t off, u_char *opt, if (!(tp->t_mpflags & TMPF_MPTCP_TRUE)) goto ret_optlen; fastjoin_send: - /* - * From here on, all options are sent only if MPTCP_TRUE + 
/* + * From here on, all options are sent only if MPTCP_TRUE * or when data is sent early on as in Fast Join */ + if ((tp->t_mpflags & TMPF_MPTCP_TRUE) && + (tp->t_mpflags & TMPF_SND_REM_ADDR)) { + int rem_opt_len = sizeof (struct mptcp_remaddr_opt); + if ((optlen + rem_opt_len) <= MAX_TCPOPTLEN) { + mptcp_send_remaddr_opt(tp, + (struct mptcp_remaddr_opt *)(opt + optlen)); + optlen += rem_opt_len; + } else { + tp->t_mpflags &= ~TMPF_SND_REM_ADDR; + } + } + + if (tp->t_mpflags & TMPF_SND_MPPRIO) { + optlen = mptcp_snd_mpprio(tp, opt, optlen); + } + MPT_LOCK(mp_tp); if ((mp_tp->mpt_flags & MPTCPF_SND_64BITDSN) || force_64bit_dsn) { send_64bit_dsn = TRUE; } - if (mp_tp->mpt_flags & MPTCPF_SND_64BITACK) { + if (mp_tp->mpt_flags & MPTCPF_SND_64BITACK) send_64bit_ack = TRUE; - } + MPT_UNLOCK(mp_tp); #define CHECK_OPTLEN { \ @@ -617,7 +580,7 @@ mptcp_setup_opts(struct tcpcb *tp, int32_t off, u_char *opt, dsn_opt.mdss_copt.mdss_flags |= MDSS_F; \ *finp = opt + optlen + offsetof(struct mptcp_dss_copt, \ mdss_flags); \ - dsn_opt.mdss_data_len += 1; \ + dsn_opt.mdss_data_len += 1; \ } \ } @@ -706,7 +669,7 @@ mptcp_setup_opts(struct tcpcb *tp, int32_t off, u_char *opt, mptcp_ntoh64(dsn_ack_opt.mdss_dsn), mptcp_ntoh64(dsn_ack_opt.mdss_ack)), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG); - + tp->t_mpflags &= ~TMPF_MPTCP_ACKNOW; goto ret_optlen; } @@ -996,7 +959,7 @@ mptcp_setup_opts(struct tcpcb *tp, int32_t off, u_char *opt, * If none of the above mpflags were acted on by * this routine, reset these flags and set p_mptcp_acknow * to false. - * XXX The reset value of p_mptcp_acknow can be used + * XXX The reset value of p_mptcp_acknow can be used * to communicate tcp_output to NOT send a pure ack without any * MPTCP options as it will be treated as a dup ack. 
* Since the instances of mptcp_setup_opts not acting on @@ -1087,18 +1050,12 @@ static void mptcp_do_mpcapable_opt(struct tcpcb *tp, u_char *cp, struct tcphdr *th, int optlen) { - struct mptcp_mpcapable_opt_rsp1 *rsp1 = NULL; struct mptcp_mpcapable_opt_rsp *rsp = NULL; struct mptcb *mp_tp = tptomptp(tp); -#define MPTCP_OPT_ERROR_PATH(tp) { \ - tp->t_mpflags |= TMPF_RESET; \ - tcpstat.tcps_invalid_mpcap++; \ - if (tp->t_inpcb->inp_socket != NULL) { \ - soevent(tp->t_inpcb->inp_socket, \ - SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST); \ - } \ -} + /* Only valid on SYN/ACK */ + if ((th->th_flags & (TH_SYN | TH_ACK)) != (TH_SYN | TH_ACK)) + return; /* Validate the kind, len, flags */ if (mptcp_valid_mpcapable_common_opt(cp) != 1) { @@ -1106,141 +1063,53 @@ mptcp_do_mpcapable_opt(struct tcpcb *tp, u_char *cp, struct tcphdr *th, return; } - /* A SYN contains only the MP_CAPABLE option */ - if ((th->th_flags & (TH_SYN | TH_ACK)) == TH_SYN) { - /* XXX passive side not supported yet */ + /* Handle old duplicate SYN/ACK retransmission */ + if (SEQ_GT(tp->rcv_nxt, (tp->irs + 1))) return; - } else if ((th->th_flags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) { - - /* Handle old duplicate SYN/ACK retransmission */ - if (SEQ_GT(tp->rcv_nxt, (tp->irs + 1))) - return; - /* handle SYN/ACK retransmission by acknowledging with ACK */ - if (mp_tp->mpt_state >= MPTCPS_ESTABLISHED) { - tp->t_mpflags |= TMPF_MPCAP_RETRANSMIT; - return; - } + /* handle SYN/ACK retransmission by acknowledging with ACK */ + if (mp_tp->mpt_state >= MPTCPS_ESTABLISHED) { + tp->t_mpflags |= TMPF_MPCAP_RETRANSMIT; + return; + } - /* A SYN/ACK contains peer's key and flags */ - if (optlen != sizeof (struct mptcp_mpcapable_opt_rsp)) { - /* complain */ - mptcplog((LOG_ERR, "MPTCP Socket: " - "%s: SYN_ACK optlen = %d, sizeof mp opt = %lu \n", - __func__, optlen, - sizeof (struct mptcp_mpcapable_opt_rsp)), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR); - tcpstat.tcps_invalid_mpcap++; - return; - } + /* A SYN/ACK contains 
peer's key and flags */ + if (optlen != sizeof (struct mptcp_mpcapable_opt_rsp)) { + /* complain */ + mptcplog((LOG_ERR, "MPTCP Socket: " + "%s: SYN_ACK optlen = %d, sizeof mp opt = %lu \n", + __func__, optlen, + sizeof (struct mptcp_mpcapable_opt_rsp)), + MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR); + tcpstat.tcps_invalid_mpcap++; + return; + } - /* - * If checksum flag is set, enable MPTCP checksum, even if - * it was not negotiated on the first SYN. - */ - if (((struct mptcp_mpcapable_opt_common *)cp)->mmco_flags & - MPCAP_CHECKSUM_CBIT) - mp_tp->mpt_flags |= MPTCPF_CHECKSUM; + /* + * If checksum flag is set, enable MPTCP checksum, even if + * it was not negotiated on the first SYN. + */ + if (((struct mptcp_mpcapable_opt_common *)cp)->mmco_flags & + MPCAP_CHECKSUM_CBIT) + mp_tp->mpt_flags |= MPTCPF_CHECKSUM; - rsp = (struct mptcp_mpcapable_opt_rsp *)cp; - MPT_LOCK(mp_tp); - mp_tp->mpt_remotekey = rsp->mmc_localkey; - /* For now just downgrade to the peer's version */ - mp_tp->mpt_peer_version = rsp->mmc_common.mmco_version; - if (rsp->mmc_common.mmco_version < mp_tp->mpt_version) { - mp_tp->mpt_version = rsp->mmc_common.mmco_version; - tcpstat.tcps_mp_verdowngrade++; - } - if (mptcp_init_remote_parms(mp_tp) != 0) { - tcpstat.tcps_invalid_mpcap++; - MPT_UNLOCK(mp_tp); - return; - } + rsp = (struct mptcp_mpcapable_opt_rsp *)cp; + MPT_LOCK(mp_tp); + mp_tp->mpt_remotekey = rsp->mmc_localkey; + /* For now just downgrade to the peer's version */ + mp_tp->mpt_peer_version = rsp->mmc_common.mmco_version; + if (rsp->mmc_common.mmco_version < mp_tp->mpt_version) { + mp_tp->mpt_version = rsp->mmc_common.mmco_version; + tcpstat.tcps_mp_verdowngrade++; + } + if (mptcp_init_remote_parms(mp_tp) != 0) { + tcpstat.tcps_invalid_mpcap++; MPT_UNLOCK(mp_tp); - tp->t_mpflags |= TMPF_PREESTABLISHED; - - } else if ((th->th_flags & TH_ACK) && - (tp->t_mpflags & TMPF_PREESTABLISHED)) { - - /* - * Verify checksum flag is set, if we initially negotiated - * checksum. 
- */ - if ((mp_tp->mpt_flags & MPTCPF_CHECKSUM) && - !(((struct mptcp_mpcapable_opt_common *)cp)->mmco_flags & - MPCAP_CHECKSUM_CBIT)) { - mptcplog((LOG_ERR, "MPTCP Socket: " - "%s: checksum negotiation failure \n", __func__), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR); - MPTCP_OPT_ERROR_PATH(tp); - return; - } - - if (!(mp_tp->mpt_flags & MPTCPF_CHECKSUM) && - (((struct mptcp_mpcapable_opt_common *)cp)->mmco_flags & - MPCAP_CHECKSUM_CBIT)) { - mptcplog((LOG_ERR, "MPTCP Socket: " - "%s: checksum negotiation failure 2.\n", __func__), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR); - MPTCP_OPT_ERROR_PATH(tp); - return; - } - - /* - * The ACK of a three way handshake contains peer's key and - * flags. - */ - if (optlen != sizeof (struct mptcp_mpcapable_opt_rsp1)) { - /* complain */ - mptcplog((LOG_ERR, "MPTCP Socket: " - "%s: ACK optlen = %d , sizeof mp option = %lu, " - " state = %d \n", __func__, optlen, - sizeof (struct mptcp_mpcapable_opt_rsp1), - tp->t_state), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR); - MPTCP_OPT_ERROR_PATH(tp); - return; - } - - rsp1 = (struct mptcp_mpcapable_opt_rsp1 *)cp; - - /* Skipping MPT_LOCK for invariant key */ - if (rsp1->mmc_remotekey != *mp_tp->mpt_localkey) { - mptcplog((LOG_ERR, "MPTCP Socket: " - "%s: key mismatch locally stored key. " - "rsp = %llx local = %llx \n", __func__, - rsp1->mmc_remotekey, *mp_tp->mpt_localkey), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR); - MPTCP_OPT_ERROR_PATH(tp); - return; - } else { - /* We received both keys. 
Almost an MPTCP connection */ - /* Skipping MPT_LOCK for invariant key */ - if (mp_tp->mpt_remotekey != rsp1->mmc_localkey) { - mptcplog((LOG_ERR, "MPTCP Socket: " - "%s: keys don't match\n", __func__), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR); - tp->t_mpflags &= ~TMPF_PREESTABLISHED; - MPTCP_OPT_ERROR_PATH(tp); - return; - } - tp->t_mpflags &= ~TMPF_PREESTABLISHED; - tp->t_mpflags |= TMPF_MPTCP_RCVD_KEY; - tp->t_mpflags |= TMPF_MPTCP_TRUE; - tp->t_inpcb->inp_socket->so_flags |= SOF_MPTCP_TRUE; - MPT_LOCK(mp_tp); - DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp, - uint32_t, 0 /* event */); - mptcplog((LOG_DEBUG, "MPTCP State: " - "MPTCPS_ESTABLISHED \n"), - MPTCP_STATE_DBG, MPTCP_LOGLVL_LOG); - - mp_tp->mpt_state = MPTCPS_ESTABLISHED; - MPT_UNLOCK(mp_tp); - } - if (tp->t_mpuna) { - tp->t_mpuna = 0; - } + return; } + MPT_UNLOCK(mp_tp); + tcp_heuristic_mptcp_success(tp); + tp->t_mpflags |= TMPF_PREESTABLISHED; } @@ -1256,116 +1125,39 @@ mptcp_do_mpjoin_opt(struct tcpcb *tp, u_char *cp, struct tcphdr *th, int optlen) } \ } int error = 0; + struct mptcp_mpjoin_opt_rsp *join_rsp = + (struct mptcp_mpjoin_opt_rsp *)cp; - if ((th->th_flags & (TH_SYN | TH_ACK)) == TH_SYN) { - /* We won't accept join requests as an active opener */ - if (tp->t_inpcb->inp_socket->so_flags & SOF_MPTCP_CLIENT) { - MPTCP_JOPT_ERROR_PATH(tp); - return; - } - - if (optlen != sizeof (struct mptcp_mpjoin_opt_req)) { - mptcplog((LOG_ERR, "MPTCP Socket: " - "%s: SYN: unexpected optlen = %d, mp option" - "= %lu\n", __func__, optlen, - sizeof (struct mptcp_mpjoin_opt_req)), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR); - /* send RST and close */ - MPTCP_JOPT_ERROR_PATH(tp); - return; - } - /* not supported yet */ + /* Only valid on SYN/ACK */ + if ((th->th_flags & (TH_SYN | TH_ACK)) != (TH_SYN | TH_ACK)) return; -#ifdef MPTCP_NOTYET - struct mptcp_mpjoin_opt_req *join_req = - (struct mptcp_mpjoin_opt_req *)cp; - mp_so = mptcp_find_mpso(join_req->mmjo_peer_token); - if (!mp_so) { - mptcplog((LOG_ERR, "MPTCP 
Socket: " - "%s: cannot find mp_so token = %x\n", - __func__, join_req->mmjo_peer_token), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR); - /* send RST */ - MPTCP_JOPT_ERROR_PATH(tp); - return; - } - if (tp->t_mpflags & TMPF_PREESTABLISHED) { - return; - } - mp_so->ms_remote_addr_id = join_req->mmjo_addr_id; - mp_so->ms_remote_rand = join_req->mmjo_rand; - tp->t_mpflags |= TMPF_PREESTABLISHED | TMPF_JOINED_FLOW; - tp->t_mpflags |= TMPF_RECVD_JOIN; - tp->t_inpcb->inp_socket->so_flags |= SOF_MP_SEC_SUBFLOW; - if (join_req->mmjo_subtype & MPTCP_BACKUP) { - tp->t_mpflags |= TMPF_BACKUP_PATH; - } -#endif - } else if ((th->th_flags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) { - struct mptcp_mpjoin_opt_rsp *join_rsp = - (struct mptcp_mpjoin_opt_rsp *)cp; - - if (optlen != sizeof (struct mptcp_mpjoin_opt_rsp)) { - mptcplog((LOG_ERR, "MPTCP Socket: " - "SYN_ACK: unexpected optlen = %d mp " - "option = %lu\n", optlen, - sizeof (struct mptcp_mpjoin_opt_rsp)), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR); - tp->t_mpflags &= ~TMPF_PREESTABLISHED; - /* send RST and close */ - MPTCP_JOPT_ERROR_PATH(tp); - return; - } - - mptcp_set_raddr_rand(tp->t_local_aid, - tptomptp(tp), - join_rsp->mmjo_addr_id, join_rsp->mmjo_rand); - error = mptcp_validate_join_hmac(tp, - (u_char*)&join_rsp->mmjo_mac, SHA1_TRUNCATED); - if (error) { - mptcplog((LOG_ERR, "MPTCP Socket: %s: " - "SYN_ACK error = %d \n", __func__, error), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR); - tp->t_mpflags &= ~TMPF_PREESTABLISHED; - /* send RST and close */ - MPTCP_JOPT_ERROR_PATH(tp); - return; - } - tp->t_mpflags |= TMPF_SENT_JOIN; - } else if ((th->th_flags & TH_ACK) && - (tp->t_mpflags & TMPF_PREESTABLISHED)) { - struct mptcp_mpjoin_opt_rsp2 *join_rsp2 = - (struct mptcp_mpjoin_opt_rsp2 *)cp; - - if (optlen != sizeof (struct mptcp_mpjoin_opt_rsp2)) { - mptcplog((LOG_ERR, "MPTCP Socket: " - "ACK: unexpected optlen = %d mp option " - "= %lu \n", optlen, - sizeof (struct mptcp_mpjoin_opt_rsp2)), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR); - 
tp->t_mpflags &= ~TMPF_PREESTABLISHED; - /* send RST and close */ - MPTCP_JOPT_ERROR_PATH(tp); - return; - } + if (optlen != sizeof (struct mptcp_mpjoin_opt_rsp)) { + mptcplog((LOG_ERR, "MPTCP Socket: " + "SYN_ACK: unexpected optlen = %d mp " + "option = %lu\n", optlen, + sizeof (struct mptcp_mpjoin_opt_rsp)), + MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR); + tp->t_mpflags &= ~TMPF_PREESTABLISHED; + /* send RST and close */ + MPTCP_JOPT_ERROR_PATH(tp); + return; + } - error = mptcp_validate_join_hmac(tp, join_rsp2->mmjo_mac, - SHA1_RESULTLEN); - if (error) { - mptcplog((LOG_ERR, "MPTCP Socket: " - "%s: ACK error = %d\n", __func__, error), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR); - tp->t_mpflags &= ~TMPF_PREESTABLISHED; - MPTCP_JOPT_ERROR_PATH(tp); - return; - } - tp->t_mpflags |= TMPF_MPTCP_TRUE; + mptcp_set_raddr_rand(tp->t_local_aid, tptomptp(tp), + join_rsp->mmjo_addr_id, join_rsp->mmjo_rand); + error = mptcp_validate_join_hmac(tp, + (u_char*)&join_rsp->mmjo_mac, SHA1_TRUNCATED); + if (error) { + mptcplog((LOG_ERR, "MPTCP Socket: %s: " + "SYN_ACK error = %d \n", __func__, error), + MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR); tp->t_mpflags &= ~TMPF_PREESTABLISHED; - tp->t_flags |= TF_ACKNOW; - tp->t_mpflags |= TMPF_MPTCP_ACKNOW; - tp->t_inpcb->inp_socket->so_flags |= SOF_MPTCP_TRUE; + /* send RST and close */ + MPTCP_JOPT_ERROR_PATH(tp); + return; } + tp->t_mpflags |= TMPF_SENT_JOIN; } static int @@ -1382,6 +1174,17 @@ mptcp_validate_join_hmac(struct tcpcb *tp, u_char* hmac, int mac_len) MPT_LOCK(mp_tp); rem_key = mp_tp->mpt_remotekey; + + /* + * Can happen if the MPTCP-connection is about to be closed and we + * receive an MP_JOIN in-between the events are being handled by the + * worker thread. 
+ */ + if (mp_tp->mpt_localkey == NULL) { + MPT_UNLOCK(mp_tp); + return (-1); + } + loc_key = *mp_tp->mpt_localkey; MPT_UNLOCK(mp_tp); @@ -1408,6 +1211,8 @@ mptcp_do_dss_opt_ack_meat(u_int64_t full_dack, struct tcpcb *tp) struct mptcb *mp_tp = tptomptp(tp); int close_notify = 0; + tp->t_mpflags |= TMPF_RCVD_DACK; + MPT_LOCK(mp_tp); if (MPTCP_SEQ_LEQ(full_dack, mp_tp->mpt_sndmax) && MPTCP_SEQ_GEQ(full_dack, mp_tp->mpt_snduna)) { @@ -1623,7 +1428,7 @@ mptcp_do_dss_opt_meat(u_char *cp, struct tcpcb *tp) "%s: 32-bit M and 64-bit A present.\n", __func__), (MPTCP_SOCKET_DBG|MPTCP_RECEIVER_DBG), MPTCP_LOGLVL_LOG); - + full_dack = mptcp_ntoh64(dss32_ack64_opt->mdss_ack); mptcp_do_dss_opt_ack_meat(full_dack, tp); NTOHL(dss32_ack64_opt->mdss_dsn); @@ -1789,7 +1594,7 @@ mptcp_do_fastclose_opt(struct tcpcb *tp, u_char *cp, struct tcphdr *th) } /* Reset this flow */ - tp->t_mpflags |= TMPF_RESET; + tp->t_mpflags |= (TMPF_RESET | TMPF_FASTCLOSERCV); if (tp->t_inpcb->inp_socket != NULL) { soevent(tp->t_inpcb->inp_socket, diff --git a/bsd/netinet/mptcp_subr.c b/bsd/netinet/mptcp_subr.c index ac55c8a2d..a2ecbf4c0 100644 --- a/bsd/netinet/mptcp_subr.c +++ b/bsd/netinet/mptcp_subr.c @@ -66,6 +66,8 @@ #endif /* INET6 */ #include +extern char *proc_best_name(proc_t); + /* * Notes on MPTCP implementation. 
* @@ -131,11 +133,8 @@ static void mptcp_thread_destroy(struct mptses *); static void mptcp_key_pool_init(void); static void mptcp_attach_to_subf(struct socket *, struct mptcb *, uint8_t); static void mptcp_detach_mptcb_from_subf(struct mptcb *, struct socket *); -static void mptcp_conn_properties(struct mptcb *); static uint32_t mptcp_gc(struct mppcbinfo *); -static int mptcp_subflow_socreate(struct mptses *, struct mptsub *, - int, struct proc *, struct socket **); static int mptcp_subflow_soclose(struct mptsub *, struct socket *); static int mptcp_subflow_soconnectx(struct mptses *, struct mptsub *); static int mptcp_subflow_soreceive(struct socket *, struct sockaddr **, @@ -147,7 +146,7 @@ static void mptcp_subflow_eupcall(struct socket *, void *, uint32_t); static void mptcp_update_last_owner(struct mptsub *, struct socket *); static void mptcp_output_needed(struct mptses *mpte, struct mptsub *to_mpts); static void mptcp_get_rtt_measurement(struct mptsub *, struct mptses *); -static void mptcp_drop_tfo_data(struct mptses *, struct mptsub *); +static void mptcp_drop_tfo_data(struct mptses *, struct mptsub *, int *); /* * Possible return values for subflow event handlers. 
Note that success @@ -922,7 +921,7 @@ mptcp_subflow_soconnectx(struct mptses *mpte, struct mptsub *mpts) socket_lock(so, 0); mptcp_attach_to_subf(so, mpte->mpte_mptcb, mpte->mpte_addrid_last); - + /* connect the subflow socket */ error = soconnectxlocked(so, &mpts->mpts_src_sl, &mpts->mpts_dst_sl, mpts->mpts_mpcr.mpcr_proc, mpts->mpts_mpcr.mpcr_ifscope, @@ -995,9 +994,10 @@ mptcp_subflow_soreceive(struct socket *so, struct sockaddr **psa, struct sockbuf *sb = &so->so_rcv; error = ENOTCONN; - SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n", - __func__, proc_pid(p), (uint64_t)VM_KERNEL_ADDRPERM(so), - SOCK_DOM(so), SOCK_TYPE(so), error)); + SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n", + __func__, proc_pid(p), proc_best_name(p), + (uint64_t)VM_KERNEL_ADDRPERM(so), + SOCK_DOM(so), SOCK_TYPE(so), error); /* * This socket should have been disconnected and flushed * prior to being returned from sodefunct(); there should @@ -1160,7 +1160,7 @@ mptcp_subflow_sopeeloff(struct mptses *mpte, struct mptsub *mpts, * * This will increase the current 64k buffer size to whatever is best. 
*/ - if (!(so->so_rcv.sb_flags & SB_USRSIZE)) + if (!(so->so_rcv.sb_flags & SB_USRSIZE)) so->so_rcv.sb_flags |= SB_AUTOSIZE; if (!(so->so_snd.sb_flags & SB_USRSIZE)) so->so_snd.sb_flags |= SB_AUTOSIZE; @@ -1314,11 +1314,18 @@ mptcp_subflow_add(struct mptses *mpte, struct mptsub *mpts, VERIFY(mpts->mpts_outif != NULL); mpts->mpts_flags |= MPTSF_BOUND_IF; + if (IFNET_IS_EXPENSIVE(mpts->mpts_outif)) { + sototcpcb(so)->t_mpflags |= TMPF_BACKUP_PATH; + } else { + mpts->mpts_flags |= MPTSF_PREFERRED; + } + mptcplog((LOG_DEBUG, "MPTCP Socket: subflow_add mp_so 0x%llx " - "bindif %s[%d] cid d\n", + "bindif %s[%d] cid %d expensive %d\n", (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mpts->mpts_outif->if_xname, - ifscope, mpts->mpts_connid), + ifscope, mpts->mpts_connid, + IFNET_IS_EXPENSIVE(mpts->mpts_outif)), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE); socket_unlock(so, 0); } @@ -1509,7 +1516,7 @@ mptcp_subflow_del(struct mptses *mpte, struct mptsub *mpts, boolean_t close) MPTS_LOCK(mpts); so = mpts->mpts_socket; VERIFY(so != NULL); - + if (close && !((mpts->mpts_flags & MPTSF_DELETEOK) && (mpts->mpts_flags & MPTSF_USER_DISCONNECT))) { MPTS_UNLOCK(mpts); @@ -1550,7 +1557,7 @@ mptcp_subflow_del(struct mptses *mpte, struct mptsub *mpts, boolean_t close) (void) sock_catchevents(so, NULL, NULL, 0); mptcp_detach_mptcb_from_subf(mpte->mpte_mptcb, so); - + if (close) (void) mptcp_subflow_soclose(mpts, so); @@ -1648,9 +1655,9 @@ mptcp_subflow_rupcall(struct socket *so, void *arg, int waitf) struct mptsub *mpts = arg; struct mptses *mpte = mpts->mpts_mpte; - /* - * mpte should never be NULL, except in a race with - * mptcp_subflow_del + /* + * mpte should never be NULL, except in a race with + * mptcp_subflow_del */ if (mpte == NULL) return; @@ -1677,7 +1684,7 @@ mptcp_subflow_input(struct mptses *mpte, struct mptsub *mpts) MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */ MPTS_LOCK_ASSERT_HELD(mpts); - DTRACE_MPTCP2(subflow__input, struct mptses *, mpte, + 
DTRACE_MPTCP2(subflow__input, struct mptses *, mpte, struct mptsub *, mpts); if (!(mpts->mpts_flags & MPTSF_CONNECTED)) @@ -1709,7 +1716,7 @@ mptcp_subflow_input(struct mptses *mpte, struct mptsub *mpts) } } else { mptcplog((LOG_ERR, "MPTCP Receiver: %s: no alt" - " path for cid %d\n", __func__, + " path for cid %d\n", __func__, mpts->mpts_connid), MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_ERR); } @@ -1801,7 +1808,7 @@ mptcp_subflow_output(struct mptses *mpte, struct mptsub *mpts) struct socket *mp_so, *so; size_t sb_cc = 0, tot_sent = 0; struct mbuf *sb_mb; - int error = 0; + int error = 0, wakeup = 0; u_int64_t mpt_dsn = 0; struct mptcb *mp_tp = mpte->mpte_mptcb; struct mbuf *mpt_mbuf = NULL; @@ -1814,7 +1821,7 @@ mptcp_subflow_output(struct mptses *mpte, struct mptsub *mpts) mp_so = mpte->mpte_mppcb->mpp_socket; so = mpts->mpts_socket; - DTRACE_MPTCP2(subflow__output, struct mptses *, mpte, + DTRACE_MPTCP2(subflow__output, struct mptses *, mpte, struct mptsub *, mpts); /* subflow socket is suspended? 
*/ @@ -1848,7 +1855,7 @@ mptcp_subflow_output(struct mptses *mpte, struct mptsub *mpts) } if (mpts->mpts_flags & MPTSF_TFO_REQD) { - mptcp_drop_tfo_data(mpte, mpts); + mptcp_drop_tfo_data(mpte, mpts, &wakeup); } /* @@ -1899,6 +1906,7 @@ mptcp_subflow_output(struct mptses *mpte, struct mptsub *mpts) len = mp_tp->mpt_snduna - mpt_dsn; MPT_UNLOCK(mp_tp); sbdrop(&mp_so->so_snd, (int)len); + wakeup = 1; MPT_LOCK(mp_tp); } @@ -1918,6 +1926,7 @@ mptcp_subflow_output(struct mptses *mpte, struct mptsub *mpts) u_int64_t len = 0; len = mp_tp->mpt_sndnxt - mpt_dsn; sbdrop(&mp_so->so_snd, (int)len); + wakeup = 1; mp_tp->mpt_snduna = mp_tp->mpt_sndnxt; } @@ -2030,14 +2039,14 @@ mptcp_subflow_output(struct mptses *mpte, struct mptsub *mpts) if ((mpts->mpts_flags & MPTSF_TFO_REQD) && (tp->t_tfo_stats == 0)) { - tp->t_mpflags |= TMPF_TFO_REQUEST; + tp->t_mpflags |= TMPF_TFO_REQUEST; } else if (mpts->mpts_flags & MPTSF_FASTJ_SEND) { tp->t_mpflags |= TMPF_FASTJOIN_SEND; } error = sock_sendmbuf(so, NULL, head, 0, NULL); - DTRACE_MPTCP7(send, struct mbuf *, head, struct socket *, so, + DTRACE_MPTCP7(send, struct mbuf *, head, struct socket *, so, struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd, struct mptses *, mpte, struct mptsub *, mpts, @@ -2097,6 +2106,9 @@ mptcp_subflow_output(struct mptses *mpte, struct mptsub *mpts) MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR); } out: + if (wakeup) + sowwakeup(mp_so); + return (error); } @@ -2189,7 +2201,7 @@ mptcp_subflow_events(struct mptses *mpte, struct mptsub *mpts, if (events != 0 || ret < MPTS_EVRET_OK) { mptcplog((LOG_ERR, "MPTCP Events %s%s: cid %d evret %s (%d)" " unhandled events=%b\n", - (events != 0) && (ret == MPTS_EVRET_OK) ? "MPTCP_ERROR " : "", + (events != 0) && (ret == MPTS_EVRET_OK) ? 
"MPTCP_ERROR " : "", __func__, mpts->mpts_connid, mptcp_evret2str(ret), ret, events, SO_FILT_HINT_BITS), MPTCP_EVENTS_DBG, MPTCP_LOGLVL_ERR); @@ -2239,7 +2251,9 @@ mptcp_subflow_connreset_ev(struct mptses *mpte, struct mptsub *mpts, MPT_LOCK(mp_tp); if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) { mpts->mpts_soerror = mp_so->so_error = ECONNREFUSED; - } else if (mpte->mpte_nummpcapflows < 1) { + } else if (mpte->mpte_nummpcapflows < 1 || + ((mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && + (mpts->mpts_flags & MPTSF_ACTIVE))) { mpts->mpts_soerror = mp_so->so_error = ECONNRESET; *p_mpsofilt_hint |= SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNRESET; } @@ -2260,12 +2274,13 @@ static ev_ret_t mptcp_subflow_cantrcvmore_ev(struct mptses *mpte, struct mptsub *mpts, uint64_t *p_mpsofilt_hint) { -#pragma unused(p_mpsofilt_hint) + struct mptcb *mp_tp; struct socket *so; MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */ MPTS_LOCK_ASSERT_HELD(mpts); + mp_tp = mpte->mpte_mptcb; so = mpts->mpts_socket; mptcplog((LOG_DEBUG, "MPTCP Events: " @@ -2273,11 +2288,18 @@ mptcp_subflow_cantrcvmore_ev(struct mptses *mpte, struct mptsub *mpts, MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG); /* - * We got a FIN for this subflow connection. This subflow socket - * is no longer available for receiving data; - * The FIN may arrive with data. The data is handed up to the - * mptcp socket and the subflow is disconnected. - */ + * A FIN on a fallen back MPTCP-connection should be treated like a + * DATA_FIN. 
+ */ + MPT_LOCK(mp_tp); + if ((mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && + (mpts->mpts_flags & MPTSF_ACTIVE)) { + mptcp_close_fsm(mp_tp, MPCE_RECV_DATA_FIN); + if (mp_tp->mpt_state == MPTCPS_CLOSE_WAIT) { + *p_mpsofilt_hint |= SO_FILT_HINT_LOCKED | SO_FILT_HINT_CANTRCVMORE; + } + } + MPT_UNLOCK(mp_tp); return (MPTS_EVRET_OK); /* keep the subflow socket around */ } @@ -2445,17 +2467,17 @@ mptcp_subflow_mpcantrcvmore_ev(struct mptses *mpte, struct mptsub *mpts, mptcplog((LOG_DEBUG, "MPTCP Events: " "%s: cid %d\n", __func__, mpts->mpts_connid), MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG); - + /* - * We got a Data FIN for the MPTCP connection. + * We got a Data FIN for the MPTCP connection. * The FIN may arrive with data. The data is handed up to the * mptcp socket and the user is notified so that it may close * the socket if needed. */ MPT_LOCK(mp_tp); - if (mp_tp->mpt_state == MPTCPS_CLOSE_WAIT) { + if (mp_tp->mpt_state == MPTCPS_CLOSE_WAIT) *p_mpsofilt_hint |= SO_FILT_HINT_LOCKED | SO_FILT_HINT_CANTRCVMORE; - } + MPT_UNLOCK(mp_tp); return (MPTS_EVRET_OK); /* keep the subflow socket around */ } @@ -2498,7 +2520,7 @@ mptcp_subflow_failover_ev(struct mptses *mpte, struct mptsub *mpts, MPTS_LOCK(mpts_alt); (void) mptcp_subflow_soconnectx(mpte, mpts_alt); - MPTS_UNLOCK(mpts_alt); + MPTS_UNLOCK(mpts_alt); } } MPTS_LOCK(mpts); @@ -2920,11 +2942,16 @@ mptcp_subflow_connected_ev(struct mptses *mpte, struct mptsub *mpts, MPTCP_STATE_DBG, MPTCP_LOGLVL_LOG); mp_tp->mpt_state = MPTCPS_ESTABLISHED; mpte->mpte_associd = mpts->mpts_connid; - DTRACE_MPTCP2(state__change, - struct mptcb *, mp_tp, + DTRACE_MPTCP2(state__change, + struct mptcb *, mp_tp, uint32_t, 0 /* event */); - (void) mptcp_setconnorder(mpte, mpts->mpts_connid, 1); + if (mpts->mpts_outif && + IFNET_IS_EXPENSIVE(mpts->mpts_outif)) { + sototcpcb(so)->t_mpflags |= (TMPF_BACKUP_PATH | TMPF_SND_MPPRIO); + } else { + mpts->mpts_flags |= MPTSF_PREFERRED; + } soisconnected(mp_so); } MPTS_LOCK(mpts); @@ -2960,7 +2987,7 @@ 
mptcp_subflow_connected_ev(struct mptses *mpte, struct mptsub *mpts, if (mpts->mpts_sndnxt == 0) { mpts->mpts_sndnxt = mp_tp->mpt_snduna; mpts->mpts_rel_seq = 1; - } + } MPT_UNLOCK(mp_tp); mptcp_output_needed(mpte, mpts); } else { @@ -3132,7 +3159,7 @@ mptcp_subflow_mustrst_ev(struct mptses *mpte, struct mptsub *mpts, { struct socket *mp_so, *so; struct mptcb *mp_tp; - boolean_t linger; + boolean_t linger, is_fastclose; MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */ @@ -3157,6 +3184,8 @@ mptcp_subflow_mustrst_ev(struct mptses *mpte, struct mptsub *mpts, tp = intotcpcb(inp); so->so_error = ECONNABORTED; + is_fastclose = !!(tp->t_mpflags & TMPF_FASTCLOSERCV); + t_template = tcp_maketemplate(tp); if (t_template) { struct tcp_respond_args tra; @@ -3183,20 +3212,22 @@ mptcp_subflow_mustrst_ev(struct mptses *mpte, struct mptsub *mpts, *p_mpsofilt_hint |= (SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNINFO_UPDATED); - if (!(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP)) + MPT_LOCK(mp_tp); + + if (!(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && is_fastclose) { *p_mpsofilt_hint |= SO_FILT_HINT_CONNRESET; - MPT_LOCK(mp_tp); - if ((mp_tp->mpt_state < MPTCPS_ESTABLISHED) || - (mp_tp->mpt_state == MPTCPS_FASTCLOSE_WAIT)) { - mp_so->so_error = ECONNABORTED; + if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) + mp_so->so_error = ECONNABORTED; + else + mp_so->so_error = ECONNRESET; + + /* + * mptcp_drop is being called after processing the events, to fully + * close the MPTCP connection + */ } - /* - * Ideally there should be a state transition for when a FASTCLOSE - * is received. Right now we keep the connection in MPTCPS_ESTABLISHED - * state and only go to terminal state when the user level code calls - * close after processing the SO_FILT_HINT_CONNRESET event. 
- */ + if (mp_tp->mpt_gc_ticks == MPT_GC_TICKS) mp_tp->mpt_gc_ticks = MPT_GC_TICKS_FAST; MPT_UNLOCK(mp_tp); @@ -3216,7 +3247,7 @@ mptcp_fastjoin_ev(struct mptses *mpte, struct mptsub *mpts, MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */ MPTS_LOCK_ASSERT_HELD(mpts); VERIFY(mpte->mpte_mppcb != NULL); - + if (mpte->mpte_nummpcapflows == 0) { struct mptcb *mp_tp = mpte->mpte_mptcb; mptcplog((LOG_DEBUG,"MPTCP Events: %s: %llx %llx \n", @@ -3549,12 +3580,13 @@ mptcp_gc(struct mppcbinfo *mppi) mp_so->so_usecount, mp_so->so_retaincnt), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE); - DTRACE_MPTCP4(dispose, struct socket *, mp_so, + DTRACE_MPTCP4(dispose, struct socket *, mp_so, struct sockbuf *, &mp_so->so_rcv, struct sockbuf *, &mp_so->so_snd, struct mppcb *, mpp); mp_pcbdispose(mpp); + sodealloc(mp_so); } return (active); @@ -3574,7 +3606,7 @@ mptcp_drop(struct mptses *mpte, struct mptcb *mp_tp, int errno) mp_so = mpte->mpte_mppcb->mpp_socket; mp_tp->mpt_state = MPTCPS_TERMINATE; - DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp, + DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp, uint32_t, 0 /* event */); if (errno == ETIMEDOUT && mp_tp->mpt_softerror != 0) @@ -3744,6 +3776,19 @@ mptcp_thread_dowork(struct mptses *mpte) } if (mpsofilt_hint_mask) { + if (mpsofilt_hint_mask & SO_FILT_HINT_CANTRCVMORE) { + socantrcvmore(mp_so); + mpsofilt_hint_mask &= ~SO_FILT_HINT_CANTRCVMORE; + } + + if (mpsofilt_hint_mask & SO_FILT_HINT_CONNRESET) { + struct mptcb *mp_tp = mpte->mpte_mptcb; + + MPT_LOCK(mp_tp); + mptcp_drop(mpte, mp_tp, ECONNRESET); + MPT_UNLOCK(mp_tp); + } + soevent(mp_so, mpsofilt_hint_mask); } @@ -3773,7 +3818,7 @@ mptcp_thread_dowork(struct mptses *mpte) } if (mpts->mpts_flags & MPTSF_TFO_REQD) - mptcp_drop_tfo_data(mpte, mpts); + mptcp_drop_tfo_data(mpte, mpts, NULL); so = mpts->mpts_socket; @@ -4602,7 +4647,7 @@ mptcp_preproc_sbdrop(struct socket *so, struct mbuf *m, unsigned int len) m->m_pkthdr.mp_rlen, len), MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE); 
m->m_pkthdr.mp_rlen -= len; - return; + break; } } else { panic("%s: MPTCP tag not set", __func__); @@ -4610,6 +4655,16 @@ mptcp_preproc_sbdrop(struct socket *so, struct mbuf *m, unsigned int len) } m = m->m_next; } + + if (so->so_flags & SOF_MP_SUBFLOW && + !(sototcpcb(so)->t_mpflags & TMPF_TFO_REQUEST) && + !(sototcpcb(so)->t_mpflags & TMPF_RCVD_DACK)) { + /* + * Received an ack without receiving a DATA_ACK. + * Need to fallback to regular TCP (or destroy this subflow). + */ + mptcp_notify_mpfail(so); + } } /* Obtain the DSN mapping stored in the mbuf */ @@ -4903,14 +4958,14 @@ mptcp_adj_sendlen(struct socket *so, int32_t off, int32_t len) mptcp_output_getm_dsnmap64(so, off, (u_int32_t)len, &mdss_dsn, &mdss_subflow_seq, &mdss_data_len); - /* + /* * Special case handling for Fast Join. We want to send data right * after ACK of the 3-way handshake, but not piggyback the data * with the 3rd ACK of the 3WHS. TMPF_FASTJOINBY2_SEND and * mdss_data_len control this. */ struct tcpcb *tp = NULL; - tp = intotcpcb(sotoinpcb(so)); + tp = intotcpcb(sotoinpcb(so)); if ((tp->t_mpflags & TMPF_JOINED_FLOW) && (tp->t_mpflags & TMPF_PREESTABLISHED) && (!(tp->t_mpflags & TMPF_RECVD_JOIN)) && @@ -5075,7 +5130,7 @@ void mptcp_update_last_owner(struct mptsub *mpts, struct socket *parent_mpso) { struct socket *subflow_so = mpts->mpts_socket; - + MPTS_LOCK_ASSERT_HELD(mpts); socket_lock(subflow_so, 0); @@ -5106,7 +5161,7 @@ fill_mptcp_subflow(struct socket *so, mptcp_flow_t *flow, struct mptsub *mpts) SIN6(&flow->flow_dst)->sin6_port = inp->in6p_fport; SIN6(&flow->flow_src)->sin6_addr = inp->in6p_laddr; SIN6(&flow->flow_dst)->sin6_addr = inp->in6p_faddr; - } else + } else #endif if ((inp->inp_vflag & INP_IPV4) != 0) { flow->flow_src.ss_family = AF_INET; @@ -5150,7 +5205,7 @@ mptcp_pcblist SYSCTL_HANDLER_ARGS n = mtcbinfo.mppi_count; if (req->oldptr == USER_ADDR_NULL) { lck_mtx_unlock(&mtcbinfo.mppi_lock); - req->oldidx = (n + n/8) * sizeof(conninfo_mptcp_t) + + req->oldidx = (n + n/8) * 
sizeof(conninfo_mptcp_t) + 4 * (n + n/8) * sizeof(mptcp_flow_t); return (0); } @@ -5205,7 +5260,7 @@ mptcp_pcblist SYSCTL_HANDLER_ARGS } else { mptcpci.mptcpci_len = sizeof(mptcpci); error = SYSCTL_OUT(req, &mptcpci, sizeof(mptcpci)); - } + } if (error) { lck_mtx_unlock(&mpp->mpp_lock); FREE(flows, M_TEMP); @@ -5235,7 +5290,7 @@ mptcp_pcblist SYSCTL_HANDLER_ARGS } SYSCTL_PROC(_net_inet_mptcp, OID_AUTO, pcblist, CTLFLAG_RD | CTLFLAG_LOCKED, - 0, 0, mptcp_pcblist, "S,conninfo_mptcp_t", + 0, 0, mptcp_pcblist, "S,conninfo_mptcp_t", "List of active MPTCP connections"); /* @@ -5251,8 +5306,8 @@ mptcp_output_needed(struct mptses *mpte, struct mptsub *to_mpts) MPTE_LOCK_ASSERT_HELD(mpte); MPTS_UNLOCK(to_mpts); - - from_mpts = mpte->mpte_active_sub; + + from_mpts = mpte->mpte_active_sub; if (from_mpts == NULL) goto output_needed; @@ -5269,8 +5324,8 @@ mptcp_output_needed(struct mptses *mpte, struct mptsub *to_mpts) MPTS_LOCK(to_mpts); return; -output_needed: - mptcp_output(mpte); +output_needed: + mptcp_output(mpte); MPTS_LOCK(to_mpts); } @@ -5291,7 +5346,7 @@ mptcp_set_notsent_lowat(struct mptses *mpte, int optval) else error = EINVAL; - return error; + return error; } u_int32_t @@ -5308,7 +5363,7 @@ mptcp_get_notsent_lowat(struct mptses *mpte) return 0; } -int +int mptcp_notsent_lowat_check(struct socket *so) { struct mptses *mpte; struct mppcb *mpp; @@ -5352,10 +5407,10 @@ mptcp_notsent_lowat_check(struct socket *so) { struct socket *subf_so = mpts->mpts_socket; socket_lock(subf_so, 0); struct tcpcb *tp = intotcpcb(sotoinpcb(subf_so)); - + notsent = so->so_snd.sb_cc - (tp->snd_nxt - tp->snd_una); - + if ((tp->t_flags & TF_NODELAY) == 0 && notsent > 0 && (notsent <= (int)tp->t_maxseg)) { retval = 1; @@ -5523,8 +5578,8 @@ mptcp_use_symptoms_hints(struct mptsub* best, struct mptsub *second_best) return (NULL); /* - * There could be devices with more than one wifi interface or - * more than one wired or cell interfaces. 
+ * There could be devices with more than one wifi interface or + * more than one wired or cell interfaces. * TBD: SymptomsD is unavailable on such platforms as of now. * Try to prefer best when possible in general. * Also, SymptomsD sends notifications about wifi only when it @@ -5588,12 +5643,12 @@ mptcp_use_symptoms_hints(struct mptsub* best, struct mptsub *second_best) } /* little is known about the state of the network or wifi is good */ - return (NULL); + return (NULL); } /* If TFO data is succesfully acked, it must be dropped from the mptcp so */ static void -mptcp_drop_tfo_data(struct mptses *mpte, struct mptsub *mpts) +mptcp_drop_tfo_data(struct mptses *mpte, struct mptsub *mpts, int *wakeup) { struct socket *mp_so = mpte->mpte_mppcb->mpp_socket; struct socket *so = mpts->mpts_socket; @@ -5628,6 +5683,8 @@ mptcp_drop_tfo_data(struct mptses *mpte, struct mptsub *mpts) if (mp_droplen != 0) { VERIFY(mp_so->so_snd.sb_mb != NULL); sbdrop(&mp_so->so_snd, (int)mp_droplen); + if (wakeup) + *wakeup = 1; } mptcplog((LOG_ERR, "MPTCP Sender: %s mp_so 0x%llx cid %d " "TFO tcp len %d mptcp len %d\n", __func__, @@ -5636,4 +5693,3 @@ mptcp_drop_tfo_data(struct mptses *mpte, struct mptsub *mpts) MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG); } } - diff --git a/bsd/netinet/mptcp_timer.c b/bsd/netinet/mptcp_timer.c index 5469b8323..7ac605b74 100644 --- a/bsd/netinet/mptcp_timer.c +++ b/bsd/netinet/mptcp_timer.c @@ -152,7 +152,7 @@ mptcp_timer(struct mppcbinfo *mppi) VERIFY(mpte != NULL); MPTE_LOCK(mpte); VERIFY(mpp->mpp_flags & MPP_ATTACHED); - + if (mpp->mpp_flags & MPP_DEFUNCT) { MPTE_UNLOCK(mpte); continue; diff --git a/bsd/netinet/mptcp_usrreq.c b/bsd/netinet/mptcp_usrreq.c index e0b8fbcbc..a3118841a 100644 --- a/bsd/netinet/mptcp_usrreq.c +++ b/bsd/netinet/mptcp_usrreq.c @@ -107,18 +107,6 @@ struct pr_usrreqs mptcp_usrreqs = { .pru_preconnect = mptcp_usr_preconnect, }; -/* - * Sysctl for testing and tuning mptcp connectx with data api. - * Mirrors tcp_preconnect_sbspace for now. 
- */ -#define MPTCP_PRECONNECT_SBSZ_MAX 1460 -#define MPTCP_PRECONNECT_SBSZ_MIN (TCP_MSS) -#define MPTCP_PRECONNECT_SBSZ_DEF (TCP6_MSS) -static int mptcp_preconnect_sbspace = MPTCP_PRECONNECT_SBSZ_DEF; -SYSCTL_INT(_net_inet_mptcp, OID_AUTO, mp_preconn_sbsz, CTLFLAG_RW | CTLFLAG_LOCKED, - &mptcp_preconnect_sbspace, 0, "Maximum preconnect space"); - - /* * Attaches an MPTCP control block to a socket. */ @@ -180,8 +168,7 @@ mptcp_attach(struct socket *mp_so, struct proc *p) } if (mp_so->so_snd.sb_preconn_hiwat == 0) { - soreserve_preconnect(mp_so, imin(MPTCP_PRECONNECT_SBSZ_MAX, - imax(mptcp_preconnect_sbspace, MPTCP_PRECONNECT_SBSZ_MIN))); + soreserve_preconnect(mp_so, 2048); } /* @@ -635,7 +622,7 @@ mptcp_connorder_helper(struct mptsub *mpts) struct tcpcb *tp = NULL; socket_lock(so, 0); - + tp = intotcpcb(sotoinpcb(so)); tp->t_mpflags |= TMPF_SND_MPPRIO; if (mpts->mpts_flags & MPTSF_PREFERRED) @@ -857,7 +844,7 @@ mptcp_disconnectx(struct mptses *mpte, sae_associd_t aid, sae_connid_t cid) } else { bool disconnect_embryonic_subflows = false; struct socket *so = NULL; - + TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) { if (mpts->mpts_connid != cid) continue; @@ -1173,14 +1160,14 @@ mptcp_usr_send(struct socket *mp_so, int prus_flags, struct mbuf *m, error = mptcp_output(mpte); if (error != 0) goto out; - + if (mp_so->so_state & SS_ISCONNECTING) { if (mp_so->so_state & SS_NBIO) error = EWOULDBLOCK; else error = sbwait(&mp_so->so_snd); } - + out: if (error) { if (m != NULL) @@ -1511,6 +1498,7 @@ mptcp_usr_socheckopt(struct socket *mp_so, struct sockopt *sopt) case SO_FLUSH: /* MP + subflow */ case SO_MPTCP_FASTJOIN: /* MP + subflow */ case SO_NOWAKEFROMSLEEP: + case SO_NOAPNFALLBK: /* * Tell the caller that these options are to be processed; * these will also be recorded later by mptcp_setopt(). 
@@ -1689,6 +1677,7 @@ mptcp_setopt(struct mptses *mpte, struct sockopt *sopt) case SO_RESTRICTIONS: case SO_NOWAKEFROMSLEEP: case SO_MPTCP_FASTJOIN: + case SO_NOAPNFALLBK: /* record it */ break; case SO_FLUSH: @@ -2082,6 +2071,9 @@ mptcp_sopt2str(int level, int optname, char *dst, int size) case SO_MPTCP_FASTJOIN: o = "SO_MPTCP_FASTJOIN"; break; + case SO_NOAPNFALLBK: + o = "SO_NOAPNFALLBK"; + break; } break; case IPPROTO_TCP: diff --git a/bsd/netinet/mptcp_var.h b/bsd/netinet/mptcp_var.h index c4fdf2c7e..09dcac342 100644 --- a/bsd/netinet/mptcp_var.h +++ b/bsd/netinet/mptcp_var.h @@ -260,8 +260,7 @@ typedef enum mptcp_state { MPTCPS_LAST_ACK = 6, /* had DFIN and close; await DFIN ACK */ MPTCPS_FIN_WAIT_2 = 7, /* have closed, DFIN is acked */ MPTCPS_TIME_WAIT = 8, /* in 2*MSL quiet wait after close */ - MPTCPS_FASTCLOSE_WAIT = 9, /* sent MP_FASTCLOSE */ - MPTCPS_TERMINATE = 10, /* terminal state */ + MPTCPS_TERMINATE = 9, /* terminal state */ } mptcp_state_t; typedef u_int64_t mptcp_key_t; @@ -350,11 +349,14 @@ struct mptcb { #define MPTCPF_SND_64BITDSN 0x20 /* Send full 64-bit DSN */ #define MPTCPF_SND_64BITACK 0x40 /* Send 64-bit ACK response */ #define MPTCPF_RCVD_64BITACK 0x80 /* Received 64-bit Data ACK */ -#define MPTCPF_POST_FALLBACK_SYNC 0x100 /* Post fallback resend data */ +#define MPTCPF_POST_FALLBACK_SYNC 0x100 /* Post fallback resend data */ +#define MPTCPF_FALLBACK_HEURISTIC 0x200 /* Send SYN without MP_CAPABLE due to heuristic */ +#define MPTCPF_HEURISTIC_TRAC 0x400 /* Tracked this connection in the heuristics as a failure */ #define MPTCPF_BITS \ "\020\1CHECKSUM\2FALLBACK_TO_TCP\3JOIN_READY\4RECVD_MPFAIL\5PEEL_OFF" \ - "\6SND_64BITDSN\7SND_64BITACK\10RCVD_64BITACK\11POST_FALLBACK_SYNC" + "\6SND_64BITDSN\7SND_64BITACK\10RCVD_64BITACK\11POST_FALLBACK_SYNC" \ + "\12FALLBACK_HEURISTIC\13HEURISTIC_TRAC" /* valid values for mpt_timer_vals */ #define MPTT_REXMT 0x01 /* Starting Retransmit Timer */ @@ -527,9 +529,9 @@ extern int mptcp_rwnotify; /* Enable RW 
notification on resume */ extern uint32_t mptcp_dbg_level; /* Multipath TCP debugging level */ extern uint32_t mptcp_dbg_area; /* Multipath TCP debugging area */ -#define MPPCB_LIMIT 16 +#define MPPCB_LIMIT 32 extern uint32_t mptcp_socket_limit; /* max number of mptcp sockets allowed */ -extern uint32_t mptcp_delayed_subf_start; /* delayed cellular subflow start */ +extern uint32_t mptcp_delayed_subf_start; /* delayed cellular subflow start */ extern int tcp_jack_rxmt; /* Join ACK retransmission value in msecs */ __BEGIN_DECLS diff --git a/bsd/netinet/raw_ip.c b/bsd/netinet/raw_ip.c index c2b41a365..1f7ccb227 100644 --- a/bsd/netinet/raw_ip.c +++ b/bsd/netinet/raw_ip.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2000-2014 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. 
- * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* @@ -89,6 +89,7 @@ #define _IP_VHL #include #include +#include #include #include #include @@ -197,9 +198,7 @@ static struct sockaddr_in ripsrc = { sizeof(ripsrc), AF_INET , 0, {0}, {0,0,0,0, * mbuf chain. */ void -rip_input(m, iphlen) - struct mbuf *m; - int iphlen; +rip_input(struct mbuf *m, int iphlen) { struct ip *ip = mtod(m, struct ip *); struct inpcb *inp; @@ -276,7 +275,7 @@ rip_input(m, iphlen) } else { if (error) { /* should notify about lost packet */ - kprintf("rip_input can't append to socket\n"); + ipstat.ips_raw_sappend_fail++; } } opts = 0; @@ -312,7 +311,7 @@ rip_input(m, iphlen) if (ret != 0) { m_freem(m); m_freem(opts); - goto unlock; + goto unlock; } } if (last->inp_flags & INP_STRIPHDR) { @@ -325,7 +324,7 @@ rip_input(m, iphlen) (struct sockaddr *)&ripsrc, m, opts, NULL) != 0) { sorwakeup(last->inp_socket); } else { - kprintf("rip_input(2) can't append to socket\n"); + ipstat.ips_raw_sappend_fail++; } } else { m_freem(m); @@ -335,7 +334,7 @@ rip_input(m, iphlen) } unlock: /* - * Keep the list locked because socket filter may force the socket lock + * Keep the list locked because socket filter may force the socket lock * to be released when calling sbappendaddr() -- see rdar://7627704 */ lck_rw_done(ripcbinfo.ipi_lock); @@ -356,17 +355,22 @@ rip_output( struct inpcb *inp = sotoinpcb(so); int flags = (so->so_options & SO_DONTROUTE) | IP_ALLOWBROADCAST; struct ip_out_args ipoa = - { IFSCOPE_NONE, { 0 }, IPOAF_SELECT_SRCIF, 0 }; + { IFSCOPE_NONE, { 0 }, IPOAF_SELECT_SRCIF, 0, 0, 0 }; struct ip_moptions *imo; int error = 0; - mbuf_svc_class_t msc = MBUF_SC_UNSPEC; + int sotc = SO_TC_UNSPEC; + int netsvctype = _NET_SERVICE_TYPE_UNSPEC; if (control != NULL) { - msc = mbuf_service_class_from_control(control); + sotc = so_tc_from_control(control, &netsvctype); m_freem(control); control = NULL; } + if (sotc == SO_TC_UNSPEC) { + sotc = so->so_traffic_class; + netsvctype = so->so_netsvctype; + } if (inp 
== NULL #if NECP @@ -391,6 +395,8 @@ rip_output( ipoa.ipoa_flags |= IPOAF_NO_EXPENSIVE; if (INP_AWDL_UNRESTRICTED(inp)) ipoa.ipoa_flags |= IPOAF_AWDL_UNRESTRICTED; + ipoa.ipoa_sotc = sotc; + ipoa.ipoa_netsvctype = netsvctype; if (inp->inp_flowhash == 0) inp->inp_flowhash = inp_calc_flowhash(inp); @@ -444,6 +450,41 @@ rip_output( { necp_kernel_policy_id policy_id; u_int32_t route_rule_id; + + /* + * We need a route to perform NECP route rule checks + */ + if (net_qos_policy_restricted != 0 && + ROUTE_UNUSABLE(&inp->inp_route)) { + struct sockaddr_in to; + struct sockaddr_in from; + struct in_addr laddr = ip->ip_src; + + ROUTE_RELEASE(&inp->inp_route); + + bzero(&from, sizeof(struct sockaddr_in)); + from.sin_family = AF_INET; + from.sin_len = sizeof(struct sockaddr_in); + from.sin_addr = laddr; + + bzero(&to, sizeof(struct sockaddr_in)); + to.sin_family = AF_INET; + to.sin_len = sizeof(struct sockaddr_in); + to.sin_addr.s_addr = ip->ip_dst.s_addr; + + if ((error = in_pcbladdr(inp, (struct sockaddr *)&to, + &laddr, ipoa.ipoa_boundif, NULL, 1)) != 0) { + printf("%s in_pcbladdr(%p) error %d\n", + __func__, inp, error); + m_freem(m); + return (error); + } + + inp_update_necp_policy(inp, (struct sockaddr *)&from, + (struct sockaddr *)&to, ipoa.ipoa_boundif); + inp->inp_policyresult.results.qos_marking_gencount = 0; + } + if (!necp_socket_is_allowed_to_send_recv_v4(inp, 0, 0, &ip->ip_src, &ip->ip_dst, NULL, &policy_id, &route_rule_id)) { m_freem(m); @@ -451,8 +492,27 @@ rip_output( } necp_mark_packet_from_socket(m, inp, policy_id, route_rule_id); + + if (net_qos_policy_restricted != 0) { + struct ifnet *rt_ifp = NULL; + + if (inp->inp_route.ro_rt != NULL) + rt_ifp = inp->inp_route.ro_rt->rt_ifp; + + printf("%s inp %p last_pid %u inp_boundifp %d inp_last_outifp %d rt_ifp %d route_rule_id %u\n", + __func__, inp, + inp->inp_socket != NULL ? inp->inp_socket->last_pid : -1, + inp->inp_boundifp != NULL ? inp->inp_boundifp->if_index : -1, + inp->inp_last_outifp != NULL ? 
inp->inp_last_outifp->if_index : -1, + rt_ifp != NULL ? rt_ifp->if_index : -1, + route_rule_id); + necp_socket_update_qos_marking(inp, inp->inp_route.ro_rt, + NULL, route_rule_id); + } } #endif /* NECP */ + if ((so->so_flags1 & SOF1_QOSMARKING_ALLOWED)) + ipoa.ipoa_flags |= IPOAF_QOSMARKING_ALLOWED; #if IPSEC if (inp->inp_sp != NULL && ipsec_setsocket(m, so) != 0) { @@ -464,7 +524,7 @@ rip_output( if (ROUTE_UNUSABLE(&inp->inp_route)) ROUTE_RELEASE(&inp->inp_route); - set_packet_service_class(m, so, msc, 0); + set_packet_service_class(m, so, sotc, 0); m->m_pkthdr.pkt_flowsrc = FLOWSRC_INPCB; m->m_pkthdr.pkt_flowid = inp->inp_flowhash; m->m_pkthdr.pkt_flags |= (PKTF_FLOW_ID | PKTF_FLOW_LOCALSRC | @@ -533,15 +593,15 @@ int load_ipfw(void) { kern_return_t err; - + ipfw_init(); - + #if DUMMYNET if (!DUMMYNET_LOADED) ip_dn_init(); #endif /* DUMMYNET */ err = 0; - + return err == 0 && ip_fw_ctl_ptr == NULL ? -1 : err; } #endif /* IPFIREWALL */ @@ -550,9 +610,7 @@ load_ipfw(void) * Raw IP socket option processing. 
*/ int -rip_ctloutput(so, sopt) - struct socket *so; - struct sockopt *sopt; +rip_ctloutput(struct socket *so, struct sockopt *sopt) { struct inpcb *inp = sotoinpcb(so); int error, optval; @@ -965,7 +1023,7 @@ rip_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, } /* note: rip_unlock is called from different protos instead of the generic socket_unlock, - * it will handle the socket dealloc on last reference + * it will handle the socket dealloc on last reference * */ int rip_unlock(struct socket *so, int refcount, void *debug) @@ -1040,7 +1098,7 @@ rip_pcblist SYSCTL_HANDLER_ARGS */ gencnt = ripcbinfo.ipi_gencnt; n = ripcbinfo.ipi_count; - + bzero(&xig, sizeof(xig)); xig.xig_len = sizeof xig; xig.xig_count = n; @@ -1056,7 +1114,7 @@ rip_pcblist SYSCTL_HANDLER_ARGS */ if (n == 0) { lck_rw_done(ripcbinfo.ipi_lock); - return 0; + return 0; } inp_list = _MALLOC(n * sizeof *inp_list, M_TEMP, M_WAITOK); @@ -1064,7 +1122,7 @@ rip_pcblist SYSCTL_HANDLER_ARGS lck_rw_done(ripcbinfo.ipi_lock); return ENOMEM; } - + for (inp = ripcbinfo.ipi_listhead->lh_first, i = 0; inp && i < n; inp = inp->inp_list.le_next) { if (inp->inp_gencnt <= gencnt && inp->inp_state != INPCB_STATE_DEAD) diff --git a/bsd/netinet/tcp.h b/bsd/netinet/tcp.h index 9dcb06620..312f8c252 100644 --- a/bsd/netinet/tcp.h +++ b/bsd/netinet/tcp.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2014 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -244,10 +244,42 @@ struct tcphdr { #define TCP_NOTIMEWAIT 0x208 /* Avoid going into time-wait */ #define TCP_DISABLE_BLACKHOLE_DETECTION 0x209 /* disable PMTU blackhole detection */ #define TCP_ECN_MODE 0x210 /* fine grain control for A/B testing */ +#define TCP_KEEPALIVE_OFFLOAD 0x211 /* offload keep alive processing to firmware */ -#define ECN_MODE_DEFAULT 0x0 /* per interface or system wide default */ -#define ECN_MODE_ENABLE 0x1 /* force enable ECN on connection */ -#define ECN_MODE_DISABLE 0x2 /* force disable ECN on connection */ +/* + * TCP_ECN_MODE values + */ +#define ECN_MODE_DEFAULT 0x0 /* per interface or system wide default */ +#define ECN_MODE_ENABLE 0x1 /* force enable ECN on connection */ +#define ECN_MODE_DISABLE 0x2 /* force disable ECN on connection */ + +/* + * TCP_NOTIFY_ACKNOWLEDGEMENT + * + * Application can use this socket option to get a notification when + * data that is currently written to the socket is acknowledged. The input + * argument given to this socket option is a marker_id that will be used for + * returning the notification. The application can continue to write + * data after setting the marker. There can be multiple of these events + * outstanding on a socket at any time up to a max of TCP_MAX_NOTIFY_ACK. + * + * To get the completed notifications, getsockopt should be called with the + * TCP_NOTIFY_ACKNOWLEDGEMENT with the following tcp_notify_ack_complete + * structure as an out argument. At most TCP_MAX_NOTIFY_ACK ids will be + * returned if they have been successfully acknowledged in each call. 
+ */ + +#define TCP_MAX_NOTIFY_ACK 10 + +typedef u_int32_t tcp_notify_ack_id_t; + +struct tcp_notify_ack_complete { + u_int32_t notify_pending; /* still pending */ + u_int32_t notify_complete_count; + tcp_notify_ack_id_t notify_complete_id[TCP_MAX_NOTIFY_ACK]; +}; + +#define TCP_NOTIFY_ACKNOWLEDGEMENT 0x212 /* Notify when data is acknowledged */ /* * The TCP_INFO socket option is a private API and is subject to change @@ -260,6 +292,7 @@ struct tcphdr { #define TCPI_OPT_ECN 0x08 #define TCPI_FLAG_LOSSRECOVERY 0x01 /* Currently in loss recovery */ +#define TCPI_FLAG_STREAMING_ON 0x02 /* Streaming detection on */ struct tcp_conn_status { unsigned int probe_activated : 1; @@ -297,10 +330,10 @@ struct tcp_info { u_int32_t tcpi_snd_wnd; /* Advertised send window. */ u_int32_t tcpi_snd_nxt; /* Next egress seqno */ u_int32_t tcpi_rcv_nxt; /* Next ingress seqno */ - + int32_t tcpi_last_outif; /* if_index of interface used to send last */ u_int32_t tcpi_snd_sbbytes; /* bytes in snd buffer including data inflight */ - + u_int64_t tcpi_txpackets __attribute__((aligned(8))); /* total packets sent */ u_int64_t tcpi_txbytes __attribute__((aligned(8))); /* total bytes sent */ @@ -333,17 +366,21 @@ struct tcp_info { u_int64_t tcpi_wired_txbytes __attribute((aligned(8))); /* bytes transmitted over Wired */ struct tcp_conn_status tcpi_connstatus; /* status of connection probes */ - u_int16_t /* Client-side information */ + u_int16_t tcpi_tfo_cookie_req:1, /* Cookie requested? */ tcpi_tfo_cookie_rcv:1, /* Cookie received? */ tcpi_tfo_syn_loss:1, /* Fallback to reg. 
TCP after SYN-loss */ tcpi_tfo_syn_data_sent:1, /* SYN+data has been sent out */ tcpi_tfo_syn_data_acked:1, /* SYN+data has been fully acknowledged */ - /* And the following are for server-side information (must be set on the listener socket) */ tcpi_tfo_syn_data_rcv:1, /* Server received SYN+data with a valid cookie */ tcpi_tfo_cookie_req_rcv:1, /* Server received cookie-request */ tcpi_tfo_cookie_sent:1, /* Server announced cookie */ - tcpi_tfo_cookie_invalid:1; /* Server received an invalid cookie */ + tcpi_tfo_cookie_invalid:1, /* Server received an invalid cookie */ + tcpi_tfo_cookie_wrong:1, /* Our sent cookie was wrong */ + tcpi_tfo_no_cookie_rcv:1, /* We did not receive a cookie upon our request */ + tcpi_tfo_heuristics_disable:1, /* TFO-heuristics disabled it */ + tcpi_tfo_send_blackhole:1, /* A sending-blackhole got detected */ + tcpi_tfo_recv_blackhole:1; /* A receiver-blackhole got detected */ u_int16_t tcpi_ecn_client_setup:1, /* Attempted ECN setup from client side */ tcpi_ecn_server_setup:1, /* Attempted ECN setup from server side */ @@ -435,18 +472,21 @@ struct tcp_connection_info { u_int32_t tcpi_srtt; /* average RTT in ms */ u_int32_t tcpi_rttvar; /* RTT variance */ u_int32_t - /* Client-side information */ tcpi_tfo_cookie_req:1, /* Cookie requested? */ tcpi_tfo_cookie_rcv:1, /* Cookie received? */ tcpi_tfo_syn_loss:1, /* Fallback to reg. 
TCP after SYN-loss */ tcpi_tfo_syn_data_sent:1, /* SYN+data has been sent out */ tcpi_tfo_syn_data_acked:1, /* SYN+data has been fully acknowledged */ - /* And the following are for server-side information (must be set on the listener socket) */ tcpi_tfo_syn_data_rcv:1, /* Server received SYN+data with a valid cookie */ tcpi_tfo_cookie_req_rcv:1, /* Server received cookie-request */ tcpi_tfo_cookie_sent:1, /* Server announced cookie */ tcpi_tfo_cookie_invalid:1, /* Server received an invalid cookie */ - __pad2:23; + tcpi_tfo_cookie_wrong:1, /* Our sent cookie was wrong */ + tcpi_tfo_no_cookie_rcv:1, /* We did not receive a cookie upon our request */ + tcpi_tfo_heuristics_disable:1, /* TFO-heuristics disabled it */ + tcpi_tfo_send_blackhole:1, /* A sending-blackhole got detected */ + tcpi_tfo_recv_blackhole:1, /* A receiver-blackhole got detected */ + __pad2:18; u_int64_t tcpi_txpackets __attribute__((aligned(8))); u_int64_t tcpi_txbytes __attribute__((aligned(8))); u_int64_t tcpi_txretransmitbytes __attribute__((aligned(8))); diff --git a/bsd/netinet/tcp_cache.c b/bsd/netinet/tcp_cache.c index b872c7d32..ecd3ad590 100644 --- a/bsd/netinet/tcp_cache.c +++ b/bsd/netinet/tcp_cache.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015 Apple Inc. All rights reserved. + * Copyright (c) 2015-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -59,16 +59,21 @@ struct tcp_heuristic { char th_val_start[0]; /* Marker for memsetting to 0 */ u_int8_t th_tfo_cookie_loss; /* The number of times a SYN+cookie has been lost */ + u_int8_t th_mptcp_loss; /* The number of times a SYN+MP_CAPABLE has been lost */ u_int8_t th_ecn_loss; /* The number of times a SYN+ecn has been lost */ u_int8_t th_ecn_aggressive; /* The number of times we did an aggressive fallback */ + u_int8_t th_ecn_droprst; /* The number of times ECN connections received a RST after first data pkt */ + u_int8_t th_ecn_droprxmt; /* The number of times ECN connection is dropped after multiple retransmits */ u_int32_t th_tfo_fallback_trials; /* Number of times we did not try out TFO due to SYN-loss */ u_int32_t th_tfo_cookie_backoff; /* Time until when we should not try out TFO */ + u_int32_t th_mptcp_backoff; /* Time until when we should not try out MPTCP */ u_int32_t th_ecn_backoff; /* Time until when we should not try out ECN */ u_int8_t th_tfo_in_backoff:1, /* Are we avoiding TFO due to the backoff timer? */ th_tfo_aggressive_fallback:1, /* Aggressive fallback due to nasty middlebox */ th_tfo_snd_middlebox_supp:1, /* We are sure that the network supports TFO in upstream direction */ - th_tfo_rcv_middlebox_supp:1; /* We are sure that the network supports TFO in downstream direction*/ + th_tfo_rcv_middlebox_supp:1, /* We are sure that the network supports TFO in downstream direction*/ + th_mptcp_in_backoff:1; /* Are we avoiding MPTCP due to the backoff timer? 
*/ char th_val_end[0]; /* Marker for memsetting to 0 */ }; @@ -134,10 +139,38 @@ static lck_attr_t *tcp_heuristic_mtx_attr; static lck_grp_t *tcp_heuristic_mtx_grp; static lck_grp_attr_t *tcp_heuristic_mtx_grp_attr; -int tcp_ecn_timeout = 60; +static int tcp_ecn_timeout = 60; SYSCTL_INT(_net_inet_tcp, OID_AUTO, ecn_timeout, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_ecn_timeout, 0, "Initial minutes to wait before re-trying ECN"); +static int disable_tcp_heuristics = 0; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, disable_tcp_heuristics, CTLFLAG_RW | CTLFLAG_LOCKED, + &disable_tcp_heuristics, 0, "Set to 1, to disable all TCP heuristics (TFO, ECN, MPTCP)"); + +/* + * This number is coupled with tcp_ecn_timeout, because we want to prevent + * integer overflow. Need to find an inexpensive way to prevent integer overflow + * while still allowing a dynamic sysctl. + */ +#define TCP_CACHE_OVERFLOW_PROTECT 9 + +/* Number of SYN-losses we accept */ +#define TFO_MAX_COOKIE_LOSS 2 +#define ECN_MAX_SYN_LOSS 2 +#define MPTCP_MAX_SYN_LOSS 2 +#define ECN_MAX_DROPRST 2 +#define ECN_MAX_DROPRXMT 4 + +/* Flags for setting/unsetting loss-heuristics, limited to 1 byte */ +#define TCPCACHE_F_TFO 0x01 +#define TCPCACHE_F_ECN 0x02 +#define TCPCACHE_F_MPTCP 0x04 +#define TCPCACHE_F_ECN_DROPRST 0x08 +#define TCPCACHE_F_ECN_DROPRXMT 0x10 + +/* Always retry ECN after backing off to this level for some heuristics */ +#define ECN_RETRY_LIMIT 9 + /* * Round up to next higher power-of 2. See "Bit Twiddling Hacks".
* @@ -468,6 +501,7 @@ static struct tcp_heuristic *tcp_getheuristic_with_lock(struct tcpcb *tp, */ tpheur->th_ecn_backoff = tcp_now; tpheur->th_tfo_cookie_backoff = tcp_now; + tpheur->th_mptcp_backoff = tcp_now; memcpy(&tpheur->th_key, &key, sizeof(key)); } @@ -486,19 +520,47 @@ static struct tcp_heuristic *tcp_getheuristic_with_lock(struct tcpcb *tp, return (NULL); } -void tcp_heuristic_tfo_success(struct tcpcb *tp) +static void tcp_heuristic_reset_loss(struct tcpcb *tp, u_int8_t flags) { struct tcp_heuristics_head *head; + struct tcp_heuristic *tpheur; - struct tcp_heuristic *tpheur = tcp_getheuristic_with_lock(tp, 1, &head); + /* + * Don't attempt to create it! Keep the heuristics clean if the + * server does not support TFO. This reduces the lookup-cost on + * our side. + */ + tpheur = tcp_getheuristic_with_lock(tp, 0, &head); if (tpheur == NULL) return; - tpheur->th_tfo_cookie_loss = 0; + if (flags & TCPCACHE_F_TFO) + tpheur->th_tfo_cookie_loss = 0; + + if (flags & TCPCACHE_F_ECN) + tpheur->th_ecn_loss = 0; + + if (flags & TCPCACHE_F_MPTCP) + tpheur->th_mptcp_loss = 0; tcp_heuristic_unlock(head); } +void tcp_heuristic_tfo_success(struct tcpcb *tp) +{ + tcp_heuristic_reset_loss(tp, TCPCACHE_F_TFO); +} + +void tcp_heuristic_mptcp_success(struct tcpcb *tp) +{ + tcp_heuristic_reset_loss(tp, TCPCACHE_F_MPTCP); +} + +void tcp_heuristic_ecn_success(struct tcpcb *tp) +{ + tcp_heuristic_reset_loss(tp, TCPCACHE_F_ECN); +} + void tcp_heuristic_tfo_rcv_good(struct tcpcb *tp) { struct tcp_heuristics_head *head; @@ -529,7 +591,7 @@ void tcp_heuristic_tfo_snd_good(struct tcpcb *tp) tp->t_tfo_flags |= TFO_F_NO_SNDPROBING; } -void tcp_heuristic_inc_loss(struct tcpcb *tp, int tfo, int ecn) +static void tcp_heuristic_inc_loss(struct tcpcb *tp, u_int8_t flags) { struct tcp_heuristics_head *head; struct tcp_heuristic *tpheur; @@ -538,24 +600,87 @@ void tcp_heuristic_inc_loss(struct tcpcb *tp, int tfo, int ecn) if (tpheur == NULL) return; - /* Limit to 9 to prevent integer-overflow 
during exponential backoff */ - if (tfo && tpheur->th_tfo_cookie_loss < 9) + /* Limit to prevent integer-overflow during exponential backoff */ + if ((flags & TCPCACHE_F_TFO) && tpheur->th_tfo_cookie_loss < TCP_CACHE_OVERFLOW_PROTECT) tpheur->th_tfo_cookie_loss++; - if (ecn && tpheur->th_ecn_loss < 9) { + if ((flags & TCPCACHE_F_ECN) && tpheur->th_ecn_loss < TCP_CACHE_OVERFLOW_PROTECT) { tpheur->th_ecn_loss++; if (tpheur->th_ecn_loss >= ECN_MAX_SYN_LOSS) { tcpstat.tcps_ecn_fallback_synloss++; INP_INC_IFNET_STAT(tp->t_inpcb, ecn_fallback_synloss); tpheur->th_ecn_backoff = tcp_now + - ((tcp_ecn_timeout * 60 * TCP_RETRANSHZ) - << (tpheur->th_ecn_loss - ECN_MAX_SYN_LOSS)); + ((tcp_ecn_timeout * 60 * TCP_RETRANSHZ) << + (tpheur->th_ecn_loss - ECN_MAX_SYN_LOSS)); + } + } + + if ((flags & TCPCACHE_F_MPTCP) && + tpheur->th_mptcp_loss < TCP_CACHE_OVERFLOW_PROTECT) { + tpheur->th_mptcp_loss++; + if (tpheur->th_mptcp_loss >= MPTCP_MAX_SYN_LOSS) { + /* + * Yes, we take tcp_ecn_timeout, to avoid adding yet + * another sysctl that is just used for testing. 
+ */ + tpheur->th_mptcp_backoff = tcp_now + + ((tcp_ecn_timeout * 60 * TCP_RETRANSHZ) << + (tpheur->th_mptcp_loss - MPTCP_MAX_SYN_LOSS)); + } + } + + if ((flags & TCPCACHE_F_ECN_DROPRST) && + tpheur->th_ecn_droprst < TCP_CACHE_OVERFLOW_PROTECT) { + tpheur->th_ecn_droprst++; + if (tpheur->th_ecn_droprst >= ECN_MAX_DROPRST) { + tcpstat.tcps_ecn_fallback_droprst++; + INP_INC_IFNET_STAT(tp->t_inpcb, ecn_fallback_droprst); + tpheur->th_ecn_backoff = tcp_now + + ((tcp_ecn_timeout * 60 * TCP_RETRANSHZ) << + (tpheur->th_ecn_droprst - ECN_MAX_DROPRST)); + } } + if ((flags & TCPCACHE_F_ECN_DROPRXMT) && + tpheur->th_ecn_droprxmt < TCP_CACHE_OVERFLOW_PROTECT) { + tpheur->th_ecn_droprxmt++; + if (tpheur->th_ecn_droprxmt >= ECN_MAX_DROPRXMT) { + tcpstat.tcps_ecn_fallback_droprxmt++; + INP_INC_IFNET_STAT(tp->t_inpcb, ecn_fallback_droprxmt); + tpheur->th_ecn_backoff = tcp_now + + ((tcp_ecn_timeout * 60 * TCP_RETRANSHZ) << + (tpheur->th_ecn_droprxmt - ECN_MAX_DROPRXMT)); + } + } tcp_heuristic_unlock(head); } +void tcp_heuristic_tfo_loss(struct tcpcb *tp) +{ + tcp_heuristic_inc_loss(tp, TCPCACHE_F_TFO); +} + +void tcp_heuristic_mptcp_loss(struct tcpcb *tp) +{ + tcp_heuristic_inc_loss(tp, TCPCACHE_F_MPTCP); +} + +void tcp_heuristic_ecn_loss(struct tcpcb *tp) +{ + tcp_heuristic_inc_loss(tp, TCPCACHE_F_ECN); +} + +void tcp_heuristic_ecn_droprst(struct tcpcb *tp) +{ + tcp_heuristic_inc_loss(tp, TCPCACHE_F_ECN_DROPRST); +} + +void tcp_heuristic_ecn_droprxmt(struct tcpcb *tp) +{ + tcp_heuristic_inc_loss(tp, TCPCACHE_F_ECN_DROPRXMT); +} + void tcp_heuristic_tfo_middlebox(struct tcpcb *tp) { struct tcp_heuristics_head *head; @@ -584,52 +709,32 @@ void tcp_heuristic_ecn_aggressive(struct tcpcb *tp) ((tcp_ecn_timeout * 60 * TCP_RETRANSHZ) << (tpheur->th_ecn_aggressive)); /* - * Ugly way to prevent integer overflow... limit to 9 to prevent in + * Ugly way to prevent integer overflow... limit to prevent in + * overflow during exp. backoff.
*/ - if (tpheur->th_ecn_aggressive < 9) + if (tpheur->th_ecn_aggressive < TCP_CACHE_OVERFLOW_PROTECT) tpheur->th_ecn_aggressive++; tcp_heuristic_unlock(head); } -void tcp_heuristic_reset_loss(struct tcpcb *tp, int tfo, int ecn) -{ - struct tcp_heuristics_head *head; - struct tcp_heuristic *tpheur; - - /* - * Don't attempt to create it! Keep the heuristics clean if the - * server does not support TFO. This reduces the lookup-cost on - * our side. - */ - tpheur = tcp_getheuristic_with_lock(tp, 0, &head); - if (tpheur == NULL) - return; - - if (tfo) - tpheur->th_tfo_cookie_loss = 0; - - if (ecn) - tpheur->th_ecn_loss = 0; - - tcp_heuristic_unlock(head); -} - boolean_t tcp_heuristic_do_tfo(struct tcpcb *tp) { struct tcp_heuristics_head *head; struct tcp_heuristic *tpheur; + if (disable_tcp_heuristics) + return (TRUE); + /* Get the tcp-heuristic. */ tpheur = tcp_getheuristic_with_lock(tp, 0, &head); if (tpheur == NULL) - return (true); + return (TRUE); if (tpheur->th_tfo_aggressive_fallback) { /* Aggressive fallback - don't do TFO anymore... :'( */ tcp_heuristic_unlock(head); - return (false); + return (FALSE); } if (tpheur->th_tfo_cookie_loss >= TFO_MAX_COOKIE_LOSS && @@ -658,7 +763,7 @@ boolean_t tcp_heuristic_do_tfo(struct tcpcb *tp) } tcp_heuristic_unlock(head); - return (false); + return (FALSE); } /* @@ -675,22 +780,54 @@ boolean_t tcp_heuristic_do_tfo(struct tcpcb *tp) tcp_heuristic_unlock(head); - return (true); + return (TRUE); +} + +boolean_t tcp_heuristic_do_mptcp(struct tcpcb *tp) +{ + struct tcp_heuristics_head *head; + struct tcp_heuristic *tpheur; + boolean_t ret = TRUE; + + if (disable_tcp_heuristics) + return (TRUE); + + /* Get the tcp-heuristic. 
*/ + tpheur = tcp_getheuristic_with_lock(tp, 0, &head); + if (tpheur == NULL) + return ret; + + if (TSTMP_GT(tpheur->th_mptcp_backoff, tcp_now)) + ret = FALSE; + + tcp_heuristic_unlock(head); + + return (ret); } boolean_t tcp_heuristic_do_ecn(struct tcpcb *tp) { struct tcp_heuristics_head *head; struct tcp_heuristic *tpheur; - boolean_t ret = true; + boolean_t ret = TRUE; + + if (disable_tcp_heuristics) + return (TRUE); /* Get the tcp-heuristic. */ tpheur = tcp_getheuristic_with_lock(tp, 0, &head); if (tpheur == NULL) return ret; - if (TSTMP_GT(tpheur->th_ecn_backoff, tcp_now)) - ret = false; + if (TSTMP_GT(tpheur->th_ecn_backoff, tcp_now)) { + ret = FALSE; + } else { + /* Reset the following counters to start re-evaluating */ + if (tpheur->th_ecn_droprst >= ECN_RETRY_LIMIT) + tpheur->th_ecn_droprst = 0; + if (tpheur->th_ecn_droprxmt >= ECN_RETRY_LIMIT) + tpheur->th_ecn_droprxmt = 0; + } tcp_heuristic_unlock(head); diff --git a/bsd/netinet/tcp_cache.h b/bsd/netinet/tcp_cache.h index 4408fd5ff..4516d7578 100644 --- a/bsd/netinet/tcp_cache.h +++ b/bsd/netinet/tcp_cache.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015 Apple Inc. All rights reserved. + * Copyright (c) 2015-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -34,10 +34,6 @@ #include #include -/* Number of SYN-losses we accept */ -#define TFO_MAX_COOKIE_LOSS 2 -#define ECN_MAX_SYN_LOSS 2 - #define ECN_MIN_CE_PROBES 10 /* Probes are basically the number of incoming packets */ #define ECN_MAX_CE_RATIO 7 /* Ratio is the maximum number of CE-packets we accept per incoming "probe" */ @@ -45,15 +41,21 @@ extern void tcp_cache_set_cookie(struct tcpcb *tp, u_char *cookie, u_int8_t len) extern int tcp_cache_get_cookie(struct tcpcb *tp, u_char *cookie, u_int8_t *len); extern unsigned int tcp_cache_get_cookie_len(struct tcpcb *tp); -extern void tcp_heuristic_inc_loss(struct tcpcb *tp, int tfo, int ecn); +extern void tcp_heuristic_tfo_loss(struct tcpcb *tp); +extern void tcp_heuristic_mptcp_loss(struct tcpcb *tp); +extern void tcp_heuristic_ecn_loss(struct tcpcb *tp); extern void tcp_heuristic_tfo_snd_good(struct tcpcb *tp); extern void tcp_heuristic_tfo_rcv_good(struct tcpcb *tp); extern void tcp_heuristic_tfo_middlebox(struct tcpcb *tp); extern void tcp_heuristic_ecn_aggressive(struct tcpcb *tp); -extern void tcp_heuristic_reset_loss(struct tcpcb *tp, int tfo, int ecn); extern void tcp_heuristic_tfo_success(struct tcpcb *tp); +extern void tcp_heuristic_mptcp_success(struct tcpcb *tp); +extern void tcp_heuristic_ecn_success(struct tcpcb *tp); extern boolean_t tcp_heuristic_do_tfo(struct tcpcb *tp); +extern boolean_t tcp_heuristic_do_mptcp(struct tcpcb *tp); extern boolean_t tcp_heuristic_do_ecn(struct tcpcb *tp); +extern void tcp_heuristic_ecn_droprst(struct tcpcb *tp); +extern void tcp_heuristic_ecn_droprxmt(struct tcpcb *tp); extern void tcp_cache_init(void); diff --git a/bsd/netinet/tcp_cc.c b/bsd/netinet/tcp_cc.c index ade6b7d03..a15fd6a07 100644 --- a/bsd/netinet/tcp_cc.c +++ b/bsd/netinet/tcp_cc.c @@ -354,12 +354,6 @@ tcp_cc_cwnd_init_or_reset(struct tcpcb *tp) int tcp_cc_delay_ack(struct tcpcb *tp, struct tcphdr *th) { - /* If any flags other than TH_ACK is set, set 
"end-of-write" bit */ - if ((th->th_flags & ~TH_ACK)) - tp->t_flagsext |= TF_STREAMEOW; - else - tp->t_flagsext &= ~(TF_STREAMEOW); - switch (tcp_delack_enabled) { case 1: case 2: diff --git a/bsd/netinet/tcp_input.c b/bsd/netinet/tcp_input.c index 24450b276..72f5f611c 100644 --- a/bsd/netinet/tcp_input.c +++ b/bsd/netinet/tcp_input.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2000-2015 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. 
- * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* @@ -93,7 +93,7 @@ #include #include #include -#include /* for ICMP_BANDLIM */ +#include /* for ICMP_BANDLIM */ #include #include /* for ICMP_BANDLIM */ #include @@ -142,7 +142,7 @@ struct tcphdr tcp_savetcp; #if MPTCP #include #include -#include +#include #endif /* MPTCP */ #include @@ -154,6 +154,7 @@ struct tcphdr tcp_savetcp; #define TCP_RTT_HISTORY_EXPIRE_TIME (60 * TCP_RETRANSHZ) #define TCP_RECV_THROTTLE_WIN (5 * TCP_RETRANSHZ) +#define TCP_STRETCHACK_ENABLE_PKTCNT 2000 tcp_cc tcp_ccgen; @@ -246,6 +247,13 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, autorcvbufmax, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_autorcvbuf_max, 0, "Maximum receive socket buffer size"); +u_int32_t tcp_autorcvbuf_max_ca = 512 * 1024; +#if (DEBUG || DEVELOPMENT) +SYSCTL_INT(_net_inet_tcp, OID_AUTO, autorcvbufmaxca, + CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_autorcvbuf_max_ca, 0, + "Maximum receive socket buffer size"); +#endif /* (DEBUG || DEVELOPMENT) */ + int sw_lro = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, lro, CTLFLAG_RW | CTLFLAG_LOCKED, &sw_lro, 0, "Used to coalesce TCP packets"); @@ -285,13 +293,6 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, dsack_enable, "use DSACK TCP option to report duplicate segments"); #endif /* (DEVELOPMENT || DEBUG) */ -#if CONFIG_IFEF_NOWINDOWSCALE -int tcp_obey_ifef_nowindowscale = 0; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, obey_ifef_nowindowscale, - CTLFLAG_RW | CTLFLAG_LOCKED, - &tcp_obey_ifef_nowindowscale, 0, ""); -#endif - extern int tcp_TCPTV_MIN; extern int tcp_acc_iaj_high; extern int tcp_acc_iaj_react_limit; @@ -315,7 +316,7 @@ static int tcp_reass(struct tcpcb *, struct tcphdr *, int *, struct mbuf *, struct ifnet *); static void tcp_xmit_timer(struct tcpcb *, int, u_int32_t, tcp_seq); static inline unsigned int tcp_maxmtu(struct rtentry *); -static inline int tcp_stretch_ack_enable(struct tcpcb *tp); +static inline int tcp_stretch_ack_enable(struct tcpcb *tp, int thflags); static inline void 
tcp_adaptive_rwtimo_check(struct tcpcb *, int); #if TRAFFIC_MGT @@ -329,29 +330,28 @@ static void compute_iaj_meat(struct tcpcb *tp, uint32_t cur_iaj); static inline unsigned int tcp_maxmtu6(struct rtentry *); #endif -static void tcp_sbrcv_grow(struct tcpcb *tp, struct sockbuf *sb, - struct tcpopt *to, u_int32_t tlen); - +static void tcp_sbrcv_grow(struct tcpcb *tp, struct sockbuf *sb, + struct tcpopt *to, u_int32_t tlen, u_int32_t rcvbuf_max); void tcp_sbrcv_trim(struct tcpcb *tp, struct sockbuf *sb); static void tcp_sbsnd_trim(struct sockbuf *sbsnd); static inline void tcp_sbrcv_tstmp_check(struct tcpcb *tp); static inline void tcp_sbrcv_reserve(struct tcpcb *tp, struct sockbuf *sb, - u_int32_t newsize, u_int32_t idealsize); + u_int32_t newsize, u_int32_t idealsize, u_int32_t rcvbuf_max); static void tcp_bad_rexmt_restore_state(struct tcpcb *tp, struct tcphdr *th); -static void tcp_compute_rtt(struct tcpcb *tp, struct tcpopt *to, +static void tcp_compute_rtt(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th); static void tcp_early_rexmt_check(struct tcpcb *tp, struct tcphdr *th); static void tcp_bad_rexmt_check(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to); /* - * Constants used for resizing receive socket buffer - * when timestamps are not supported + * Constants used for resizing receive socket buffer + * when timestamps are not supported */ #define TCPTV_RCVNOTS_QUANTUM 100 #define TCP_RCVNOTS_BYTELEVEL 204800 -/* - * Constants used for limiting early retransmits +/* + * Constants used for limiting early retransmits * to 10 per minute. */ #define TCP_EARLY_REXMT_WIN (60 * TCP_RETRANSHZ) /* 60 seconds */ @@ -413,12 +413,13 @@ update_iaj_state(struct tcpcb *tp, uint32_t size, int rst_size) } } -/* For every 32 bit unsigned integer(v), this function will find the - * largest integer n such that (n*n <= v). This takes at most 16 iterations - * irrespective of the value of v and does not involve multiplications. 
+/* For every 32 bit unsigned integer(v), this function will find the + * largest integer n such that (n*n <= v). This takes at most 16 iterations + * irrespective of the value of v and does not involve multiplications. */ static inline int -isqrt(unsigned int val) { +isqrt(unsigned int val) +{ unsigned int sqrt_cache[11] = {0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 100}; unsigned int temp, g=0, b=0x8000, bshft=15; if ( val <= 100) { @@ -441,14 +442,14 @@ isqrt(unsigned int val) { } while ( b > 0 && val > 0); } return(g); -} +} -/* +/* * With LRO, roughly estimate the inter arrival time between * each sub coalesced packet as an average. Count the delay * cur_iaj to be the delay between the last packet received * and the first packet of the LRO stream. Due to round off errors -* cur_iaj may be the same as lro_delay_factor. Averaging has +* cur_iaj may be the same as lro_delay_factor. Averaging has * round off errors too. lro_delay_factor may be close to 0 * in steady state leading to lower values fed to compute_iaj_meat. 
*/ @@ -463,17 +464,17 @@ compute_iaj(struct tcpcb *tp, int nlropkts, int lro_delay_factor) } compute_iaj_meat(tp, cur_iaj); - + if (nlropkts <= 1) return; nlropkts--; - + timediff = lro_delay_factor/nlropkts; - while (nlropkts > 0) + while (nlropkts > 0) { - compute_iaj_meat(tp, timediff); + compute_iaj_meat(tp, timediff); nlropkts--; } } @@ -481,7 +482,7 @@ compute_iaj(struct tcpcb *tp, int nlropkts, int lro_delay_factor) static void compute_iaj_meat(struct tcpcb *tp, uint32_t cur_iaj) { - /* When accumulated IAJ reaches MAX_ACC_IAJ in milliseconds, + /* When accumulated IAJ reaches MAX_ACC_IAJ in milliseconds, * throttle the receive window to a minimum of MIN_IAJ_WIN packets */ #define MAX_ACC_IAJ (tcp_acc_iaj_high_thresh + tcp_acc_iaj_react_limit) @@ -493,20 +494,20 @@ void compute_iaj_meat(struct tcpcb *tp, uint32_t cur_iaj) uint32_t mean, temp; int32_t cur_iaj_dev; - cur_iaj_dev = (cur_iaj - tp->avg_iaj); - - /* Allow a jitter of "allowed_iaj" milliseconds. Some connections - * may have a constant jitter more than that. We detect this by + cur_iaj_dev = (cur_iaj - tp->avg_iaj); + + /* Allow a jitter of "allowed_iaj" milliseconds. Some connections + * may have a constant jitter more than that. We detect this by * using standard deviation. */ allowed_iaj = tp->avg_iaj + tp->std_dev_iaj; if (allowed_iaj < tcp_allowed_iaj) allowed_iaj = tcp_allowed_iaj; - /* Initially when the connection starts, the senders congestion - * window is small. During this period we avoid throttling a - * connection because we do not have a good starting point for - * allowed_iaj. IAJ_IGNORE_PKTCNT is used to quietly gloss over + /* Initially when the connection starts, the senders congestion + * window is small. During this period we avoid throttling a + * connection because we do not have a good starting point for + * allowed_iaj. IAJ_IGNORE_PKTCNT is used to quietly gloss over * the first few packets. 
*/ if (tp->iaj_pktcnt > IAJ_IGNORE_PKTCNT) { @@ -515,7 +516,7 @@ void compute_iaj_meat(struct tcpcb *tp, uint32_t cur_iaj) acc_iaj = tp->acc_iaj - 2; else acc_iaj = 0; - + } else { acc_iaj = tp->acc_iaj + (cur_iaj - allowed_iaj); } @@ -526,33 +527,33 @@ void compute_iaj_meat(struct tcpcb *tp, uint32_t cur_iaj) } /* Compute weighted average where the history has a weight of - * 15 out of 16 and the current value has a weight of 1 out of 16. + * 15 out of 16 and the current value has a weight of 1 out of 16. * This will make the short-term measurements have more weight. * - * The addition of 8 will help to round-up the value + * The addition of 8 will help to round-up the value * instead of round-down */ - tp->avg_iaj = (((tp->avg_iaj << IAJ_DIV_SHIFT) - tp->avg_iaj) + tp->avg_iaj = (((tp->avg_iaj << IAJ_DIV_SHIFT) - tp->avg_iaj) + cur_iaj + IAJ_ROUNDUP_CONST) >> IAJ_DIV_SHIFT; /* Compute Root-mean-square of deviation where mean is a weighted - * average as described above. + * average as described above. 
*/ temp = tp->std_dev_iaj * tp->std_dev_iaj; - mean = (((temp << IAJ_DIV_SHIFT) - temp) - + (cur_iaj_dev * cur_iaj_dev) + mean = (((temp << IAJ_DIV_SHIFT) - temp) + + (cur_iaj_dev * cur_iaj_dev) + IAJ_ROUNDUP_CONST) >> IAJ_DIV_SHIFT; - + tp->std_dev_iaj = isqrt(mean); - DTRACE_TCP3(iaj, struct tcpcb *, tp, uint32_t, cur_iaj, + DTRACE_TCP3(iaj, struct tcpcb *, tp, uint32_t, cur_iaj, uint32_t, allowed_iaj); return; } #endif /* TRAFFIC_MGT */ -/* Check if enough amount of data has been acknowledged since +/* Check if enough amount of data has been acknowledged since * bw measurement was started */ static void @@ -569,7 +570,7 @@ tcp_bwmeas_check(struct tcpcb *tp) bw = bytes / elapsed_time; if ( bw > 0) { if (tp->t_bwmeas->bw_sndbw > 0) { - tp->t_bwmeas->bw_sndbw = + tp->t_bwmeas->bw_sndbw = (((tp->t_bwmeas->bw_sndbw << 3) - tp->t_bwmeas->bw_sndbw) + bw) >> 3; } else { tp->t_bwmeas->bw_sndbw = bw; @@ -606,10 +607,10 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m, */ if (th == NULL) goto present; - + /* - * If the reassembly queue already has entries or if we are going - * to add a new one, then the connection has reached a loss state. + * If the reassembly queue already has entries or if we are going + * to add a new one, then the connection has reached a loss state. * Reset the stretch-ack algorithm at this point. */ tcp_reset_stretch_ack(tp); @@ -617,7 +618,7 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m, #if TRAFFIC_MGT if (tp->acc_iaj > 0) reset_acc_iaj(tp); -#endif /* TRAFFIC_MGT */ +#endif /* TRAFFIC_MGT */ /* * Limit the number of segments in the reassembly queue to prevent @@ -627,7 +628,7 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m, * process the missing segment. 
*/ qlimit = min(max(100, so->so_rcv.sb_hiwat >> 10), - tcp_autorcvbuf_max >> 10); + (TCP_AUTORCVBUF_MAX(ifp) >> 10)); if (th->th_seq != tp->rcv_nxt && (tp->t_reassqlen + 1) >= qlimit) { tcp_reass_overflows++; @@ -777,10 +778,10 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m, LIST_INSERT_AFTER(p, te, tqe_q); } - /* + /* * New out-of-order data exists, and is pointed to by - * queue entry te. Set copy_oodata to 1 so out-of-order data - * can be copied off to sockbuf after in-order data + * queue entry te. Set copy_oodata to 1 so out-of-order data + * can be copied off to sockbuf after in-order data * is copied off. */ if (!(so->so_state & SS_CANTRCVMORE)) @@ -799,11 +800,11 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m, if (tp->t_flagsext & TF_LRO_OFFLOADED) { tcp_lro_remove_state(inp->inp_laddr, inp->inp_faddr, th->th_dport, th->th_sport); - tp->t_flagsext &= ~TF_LRO_OFFLOADED; + tp->t_flagsext &= ~TF_LRO_OFFLOADED; } /* - * continue processing if out-of-order data + * continue processing if out-of-order data * can be delivered */ if (q && (so->so_flags & SOF_ENABLE_MSGS)) @@ -812,48 +813,57 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m, return (0); } + /* + * If there is already another thread doing reassembly for this + * connection, it is better to let it finish the job -- + * (radar 16316196) + */ + if (tp->t_flagsext & TF_REASS_INPROG) + return (0); + + tp->t_flagsext |= TF_REASS_INPROG; /* lost packet was recovered, so ooo data can be returned */ tcpstat.tcps_recovered_pkts++; do { tp->rcv_nxt += q->tqe_len; flags = q->tqe_th->th_flags & TH_FIN; - nq = LIST_NEXT(q, tqe_q); LIST_REMOVE(q, tqe_q); if (so->so_state & SS_CANTRCVMORE) { m_freem(q->tqe_m); } else { so_recv_data_stat(so, q->tqe_m, 0); /* XXXX */ if (so->so_flags & SOF_ENABLE_MSGS) { - /* - * Append the inorder data as a message to the - * receive socket buffer. 
Also check to see if - * the data we are about to deliver is the same - * data that we wanted to pass up to the user - * out of order. If so, reset copy_oodata -- + /* + * Append the inorder data as a message to the + * receive socket buffer. Also check to see if + * the data we are about to deliver is the same + * data that we wanted to pass up to the user + * out of order. If so, reset copy_oodata -- * the received data filled a gap, and * is now in order! */ if (q == te) copy_oodata = 0; } - if (sbappendstream_rcvdemux(so, q->tqe_m, + if (sbappendstream_rcvdemux(so, q->tqe_m, q->tqe_th->th_seq - (tp->irs + 1), 0)) dowakeup = 1; - if (tp->t_flagsext & TF_LRO_OFFLOADED) { - tcp_update_lro_seq(tp->rcv_nxt, + if (tp->t_flagsext & TF_LRO_OFFLOADED) { + tcp_update_lro_seq(tp->rcv_nxt, inp->inp_laddr, inp->inp_faddr, th->th_dport, th->th_sport); } } zfree(tcp_reass_zone, q); tp->t_reassqlen--; - q = nq; + q = LIST_FIRST(&tp->t_segq); } while (q && q->tqe_th->th_seq == tp->rcv_nxt); + tp->t_flagsext &= ~TF_REASS_INPROG; #if INET6 if ((inp->inp_vflag & INP_IPV6) != 0) { - + KERNEL_DEBUG(DBG_LAYER_BEG, ((inp->inp_fport << 16) | inp->inp_lport), (((inp->in6p_laddr.s6_addr16[0] & 0xffff) << 16) | @@ -873,13 +883,13 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m, msg_unordered_delivery: /* Deliver out-of-order data as a message */ if (te && (so->so_flags & SOF_ENABLE_MSGS) && copy_oodata && te->tqe_len) { - /* - * make a copy of the mbuf to be delivered up to + /* + * make a copy of the mbuf to be delivered up to * the user, and add it to the sockbuf */ oodata = m_copym(te->tqe_m, 0, M_COPYALL, M_DONTWAIT); if (oodata != NULL) { - if (sbappendmsgstream_rcv(&so->so_rcv, oodata, + if (sbappendmsgstream_rcv(&so->so_rcv, oodata, te->tqe_th->th_seq - (tp->irs + 1), 1)) { dowakeup = 1; tcpstat.tcps_msg_unopkts++; @@ -956,7 +966,7 @@ tcp_adaptive_rwtimo_check(struct tcpcb *tp, int tlen) inline void tcp_keepalive_reset(struct tcpcb *tp) { - 
tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, + tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, TCP_CONN_KEEPIDLE(tp)); tp->t_flagsext &= ~(TF_DETECT_READSTALL); tp->t_rtimo_probes = 0; @@ -971,7 +981,7 @@ int tcp6_input(struct mbuf **mp, int *offp, int proto) { #pragma unused(proto) - register struct mbuf *m = *mp; + struct mbuf *m = *mp; uint32_t ia6_flags; struct ifnet *ifp = m->m_pkthdr.rcvif; @@ -1005,7 +1015,7 @@ tcp6_input(struct mbuf **mp, int *offp, int proto) #endif /* Depending on the usage of mbuf space in the system, this function - * will return true or false. This is used to determine if a socket + * will return true or false. This is used to determine if a socket * buffer can take more memory from the system for auto-tuning or not. */ u_int8_t @@ -1013,7 +1023,7 @@ tcp_cansbgrow(struct sockbuf *sb) { /* Calculate the host level space limit in terms of MSIZE buffers. * We can use a maximum of half of the available mbuf space for - * socket buffers. + * socket buffers. */ u_int32_t mblim = ((nmbclusters >> 1) << (MCLSHIFT - MSIZESHIFT)); @@ -1034,14 +1044,13 @@ tcp_cansbgrow(struct sockbuf *sb) static void tcp_sbrcv_reserve(struct tcpcb *tp, struct sockbuf *sbrcv, - u_int32_t newsize, u_int32_t idealsize) + u_int32_t newsize, u_int32_t idealsize, u_int32_t rcvbuf_max) { - /* newsize should not exceed max */ - newsize = min(newsize, tcp_autorcvbuf_max); + newsize = min(newsize, rcvbuf_max); - /* The receive window scale negotiated at the - * beginning of the connection will also set a + /* The receive window scale negotiated at the + * beginning of the connection will also set a * limit on the socket buffer size */ newsize = min(newsize, TCP_MAXWIN << tp->rcv_scale); @@ -1049,44 +1058,43 @@ tcp_sbrcv_reserve(struct tcpcb *tp, struct sockbuf *sbrcv, /* Set new socket buffer size */ if (newsize > sbrcv->sb_hiwat && (sbreserve(sbrcv, newsize) == 1)) { - sbrcv->sb_idealsize = min(max(sbrcv->sb_idealsize, - (idealsize != 0) ? 
idealsize : newsize), - tcp_autorcvbuf_max); + sbrcv->sb_idealsize = min(max(sbrcv->sb_idealsize, + (idealsize != 0) ? idealsize : newsize), rcvbuf_max); - /* Again check the limit set by the advertised - * window scale + /* Again check the limit set by the advertised + * window scale */ - sbrcv->sb_idealsize = min(sbrcv->sb_idealsize, + sbrcv->sb_idealsize = min(sbrcv->sb_idealsize, TCP_MAXWIN << tp->rcv_scale); } } -/* +/* * This function is used to grow a receive socket buffer. It * will take into account system-level memory usage and the * bandwidth available on the link to make a decision. */ static void -tcp_sbrcv_grow(struct tcpcb *tp, struct sockbuf *sbrcv, - struct tcpopt *to, u_int32_t pktlen) +tcp_sbrcv_grow(struct tcpcb *tp, struct sockbuf *sbrcv, + struct tcpopt *to, u_int32_t pktlen, u_int32_t rcvbuf_max) { struct socket *so = sbrcv->sb_so; - + /* * Do not grow the receive socket buffer if * - auto resizing is disabled, globally or on this socket * - the high water mark already reached the maximum - * - the stream is in background and receive side is being + * - the stream is in background and receive side is being * throttled * - if there are segments in reassembly queue indicating loss, - * do not need to increase recv window during recovery as more + * do not need to increase recv window during recovery as more * data is not going to be sent. A duplicate ack sent during * recovery should not change the receive window */ if (tcp_do_autorcvbuf == 0 || (sbrcv->sb_flags & SB_AUTOSIZE) == 0 || tcp_cansbgrow(sbrcv) == 0 || - sbrcv->sb_hiwat >= tcp_autorcvbuf_max || + sbrcv->sb_hiwat >= rcvbuf_max || (tp->t_flagsext & TF_RECV_THROTTLE) || (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) || !LIST_EMPTY(&tp->t_segq)) { @@ -1109,11 +1117,12 @@ tcp_sbrcv_grow(struct tcpcb *tp, struct sockbuf *sbrcv, * the receive socket buffer needs to grow, increase * the high water mark. 
*/ - if (TSTMP_GEQ(tcp_now, + if (TSTMP_GEQ(tcp_now, tp->rfbuf_ts + TCPTV_RCVNOTS_QUANTUM)) { if (tp->rfbuf_cnt >= TCP_RCVNOTS_BYTELEVEL) { tcp_sbrcv_reserve(tp, sbrcv, - tcp_autorcvbuf_max, 0); + tcp_autorcvbuf_max, 0, + tcp_autorcvbuf_max); } goto out; } else { @@ -1137,12 +1146,12 @@ tcp_sbrcv_grow(struct tcpcb *tp, struct sockbuf *sbrcv, /* * Increment the receive window by a * multiple of maximum sized segments. - * This will prevent a connection from + * This will prevent a connection from * sending smaller segments on wire if it * is limited by the receive window. * * Set the ideal size based on current - * bandwidth measurements. We set the + * bandwidth measurements. We set the * ideal size on receive socket buffer to * be twice the bandwidth delay product. */ @@ -1155,13 +1164,13 @@ tcp_sbrcv_grow(struct tcpcb *tp, struct sockbuf *sbrcv, */ min_incr = tp->t_maxseg << tcp_autorcvbuf_inc_shift; if (rcvbuf_inc < min_incr) - rcvbuf_inc = min_incr; + rcvbuf_inc = min_incr; - rcvbuf_inc = + rcvbuf_inc = (rcvbuf_inc / tp->t_maxseg) * tp->t_maxseg; tcp_sbrcv_reserve(tp, sbrcv, - sbrcv->sb_hiwat + rcvbuf_inc, - (tp->rfbuf_cnt * 2)); + sbrcv->sb_hiwat + rcvbuf_inc, + (tp->rfbuf_cnt * 2), rcvbuf_max); } goto out; } else { @@ -1178,11 +1187,12 @@ tcp_sbrcv_grow(struct tcpcb *tp, struct sockbuf *sbrcv, /* This function will trim the excess space added to the socket buffer * to help a slow-reading app. The ideal-size of a socket buffer depends - * on the link bandwidth or it is set by an application and we aim to + * on the link bandwidth or it is set by an application and we aim to * reach that size. */ void -tcp_sbrcv_trim(struct tcpcb *tp, struct sockbuf *sbrcv) { +tcp_sbrcv_trim(struct tcpcb *tp, struct sockbuf *sbrcv) +{ if (tcp_do_autorcvbuf == 1 && sbrcv->sb_idealsize > 0 && sbrcv->sb_hiwat > sbrcv->sb_idealsize) { int32_t trim; @@ -1193,10 +1203,10 @@ tcp_sbrcv_trim(struct tcpcb *tp, struct sockbuf *sbrcv) { * this connection. 
*/ u_int32_t advwin = tp->rcv_adv - tp->rcv_nxt; - + /* How much can we trim the receive socket buffer? * 1. it can not be trimmed beyond the max rcv win advertised - * 2. if possible, leave 1/16 of bandwidth*delay to + * 2. if possible, leave 1/16 of bandwidth*delay to * avoid closing the win completely */ u_int32_t leave = max(advwin, (sbrcv->sb_idealsize >> 4)); @@ -1206,7 +1216,7 @@ tcp_sbrcv_trim(struct tcpcb *tp, struct sockbuf *sbrcv) { */ if (leave == 0) leave = tp->t_maxseg << tcp_autorcvbuf_inc_shift; - + trim = sbrcv->sb_hiwat - (sbrcv->sb_cc + leave); trim = imin(trim, (int32_t)diff); @@ -1222,9 +1232,10 @@ tcp_sbrcv_trim(struct tcpcb *tp, struct sockbuf *sbrcv) { * to hold more mbufs for that connection than what the cwnd will allow. */ void -tcp_sbsnd_trim(struct sockbuf *sbsnd) { - if (tcp_do_autosendbuf == 1 && - ((sbsnd->sb_flags & (SB_AUTOSIZE | SB_TRIM)) == +tcp_sbsnd_trim(struct sockbuf *sbsnd) +{ + if (tcp_do_autosendbuf == 1 && + ((sbsnd->sb_flags & (SB_AUTOSIZE | SB_TRIM)) == (SB_AUTOSIZE | SB_TRIM)) && (sbsnd->sb_idealsize > 0) && (sbsnd->sb_hiwat > sbsnd->sb_idealsize)) { @@ -1240,15 +1251,16 @@ tcp_sbsnd_trim(struct sockbuf *sbsnd) { sbsnd->sb_flags &= ~(SB_TRIM); } -/* +/* * If timestamp option was not negotiated on this connection * and this connection is on the receiving side of a stream * then we can not measure the delay on the link accurately. * Instead of enabling automatic receive socket buffer * resizing, just give more space to the receive socket buffer. 
*/ -static inline void -tcp_sbrcv_tstmp_check(struct tcpcb *tp) { +static inline void +tcp_sbrcv_tstmp_check(struct tcpcb *tp) +{ struct socket *so = tp->t_inpcb->inp_socket; u_int32_t newsize = 2 * tcp_recvspace; struct sockbuf *sbrcv = &so->so_rcv; @@ -1256,53 +1268,79 @@ tcp_sbrcv_tstmp_check(struct tcpcb *tp) { if ((tp->t_flags & (TF_REQ_TSTMP | TF_RCVD_TSTMP)) != (TF_REQ_TSTMP | TF_RCVD_TSTMP) && (sbrcv->sb_flags & SB_AUTOSIZE) != 0) { - tcp_sbrcv_reserve(tp, sbrcv, newsize, 0); + tcp_sbrcv_reserve(tp, sbrcv, newsize, 0, newsize); } } -/* A receiver will evaluate the flow of packets on a connection - * to see if it can reduce ack traffic. The receiver will start +/* A receiver will evaluate the flow of packets on a connection + * to see if it can reduce ack traffic. The receiver will start * stretching acks if all of the following conditions are met: * 1. tcp_delack_enabled is set to 3 * 2. If the bytes received in the last 100ms is greater than a threshold * defined by maxseg_unacked * 3. If the connection has not been idle for tcp_maxrcvidle period. - * 4. If the connection has seen enough packets to let the slow-start + * 4. If the connection has seen enough packets to let the slow-start * finish after connection establishment or after some packet loss. * * The receiver will stop stretching acks if there is congestion/reordering - * as indicated by packets on reassembly queue or an ECN. If the delayed-ack - * timer fires while stretching acks, it means that the packet flow has gone + * as indicated by packets on reassembly queue or an ECN. If the delayed-ack + * timer fires while stretching acks, it means that the packet flow has gone * below the threshold defined by maxseg_unacked and the receiver will stop - * stretching acks. The receiver gets no indication when slow-start is completed - * or when the connection reaches an idle state. That is why we use - * tcp_rcvsspktcnt to cover slow-start and tcp_maxrcvidle to identify idle + * stretching acks. 
The receiver gets no indication when slow-start is completed + * or when the connection reaches an idle state. That is why we use + * tcp_rcvsspktcnt to cover slow-start and tcp_maxrcvidle to identify idle * state. */ static inline int -tcp_stretch_ack_enable(struct tcpcb *tp) +tcp_stretch_ack_enable(struct tcpcb *tp, int thflags) { + if (tp->rcv_by_unackwin >= (maxseg_unacked * tp->t_maxseg) && + TSTMP_GEQ(tp->rcv_unackwin, tcp_now)) + tp->t_flags |= TF_STREAMING_ON; + else + tp->t_flags &= ~TF_STREAMING_ON; + + /* If there has been an idle time, reset streaming detection */ + if (TSTMP_GT(tcp_now, tp->rcv_unackwin + tcp_maxrcvidle)) + tp->t_flags &= ~TF_STREAMING_ON; + + /* + * If there are flags other than TH_ACK set, reset streaming + * detection + */ + if (thflags & ~TH_ACK) + tp->t_flags &= ~TF_STREAMING_ON; + + if (tp->t_flagsext & TF_DISABLE_STRETCHACK) { + if (tp->rcv_nostrack_pkts >= TCP_STRETCHACK_ENABLE_PKTCNT) { + tp->t_flagsext &= ~TF_DISABLE_STRETCHACK; + tp->rcv_nostrack_pkts = 0; + tp->rcv_nostrack_ts = 0; + } else { + tp->rcv_nostrack_pkts++; + } + } + if (!(tp->t_flagsext & (TF_NOSTRETCHACK|TF_DISABLE_STRETCHACK)) && - tp->rcv_by_unackwin >= (maxseg_unacked * tp->t_maxseg) && - TSTMP_GT(tp->rcv_unackwin + tcp_maxrcvidle, tcp_now) && - (!(tp->t_flagsext & TF_RCVUNACK_WAITSS) || - (tp->rcv_waitforss >= tcp_rcvsspktcnt))) { + (tp->t_flags & TF_STREAMING_ON) && + (!(tp->t_flagsext & TF_RCVUNACK_WAITSS) || + (tp->rcv_waitforss >= tcp_rcvsspktcnt))) { return(1); } - + return(0); } /* * Reset the state related to stretch-ack algorithm. This will make * the receiver generate an ack every other packet. The receiver - * will start re-evaluating the rate at which packets come to decide + * will start re-evaluating the rate at which packets come to decide * if it can benefit by lowering the ack traffic. 
*/ void tcp_reset_stretch_ack(struct tcpcb *tp) { - tp->t_flags &= ~(TF_STRETCHACK); + tp->t_flags &= ~(TF_STRETCHACK|TF_STREAMING_ON); tp->rcv_by_unackwin = 0; tp->rcv_unackwin = tcp_now + tcp_rcvunackwin; @@ -1310,26 +1348,26 @@ tcp_reset_stretch_ack(struct tcpcb *tp) * When there is packet loss or packet re-ordering or CWR due to * ECN, the sender's congestion window is reduced. In these states, * generate an ack for every other packet for some time to allow - * the sender's congestion window to grow. + * the sender's congestion window to grow. */ tp->t_flagsext |= TF_RCVUNACK_WAITSS; tp->rcv_waitforss = 0; } /* - * The last packet was a retransmission, check if this ack + * The last packet was a retransmission, check if this ack * indicates that the retransmission was spurious. - * + * * If the connection supports timestamps, we could use it to * detect if the last retransmit was not needed. Otherwise, - * we check if the ACK arrived within RTT/2 window, then it + * we check if the ACK arrived within RTT/2 window, then it * was a mistake to do the retransmit in the first place. * - * This function will return 1 if it is a spurious retransmit, - * 0 otherwise. + * This function will return 1 if it is a spurious retransmit, + * 0 otherwise. 
*/ int -tcp_detect_bad_rexmt(struct tcpcb *tp, struct tcphdr *th, +tcp_detect_bad_rexmt(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to, u_int32_t rxtime) { int32_t tdiff, bad_rexmt_win; @@ -1340,11 +1378,11 @@ tcp_detect_bad_rexmt(struct tcpcb *tp, struct tcphdr *th, return (0); if (TSTMP_SUPPORTED(tp)) { if (rxtime > 0 && (to->to_flags & TOF_TS) - && to->to_tsecr != 0 + && to->to_tsecr != 0 && TSTMP_LT(to->to_tsecr, rxtime)) return (1); } else { - if ((tp->t_rxtshift == 1 + if ((tp->t_rxtshift == 1 || (tp->t_flagsext & TF_SENT_TLPROBE)) && rxtime > 0) { tdiff = (int32_t)(tcp_now - rxtime); @@ -1379,7 +1417,7 @@ tcp_bad_rexmt_restore_state(struct tcpcb *tp, struct tcphdr *th) CC_ALGO(tp)->cwnd_init(tp); tp->snd_cwnd = fsize + min(acked, tp->snd_cwnd); - + } else { tp->snd_cwnd = tp->snd_cwnd_prev; tp->snd_ssthresh = tp->snd_ssthresh_prev; @@ -1392,14 +1430,12 @@ tcp_bad_rexmt_restore_state(struct tcpcb *tp, struct tcphdr *th) tp->snd_cwnd = max(tp->snd_cwnd, TCP_CC_CWND_INIT_BYTES); tp->snd_recover = tp->snd_recover_prev; tp->snd_nxt = tp->snd_max; - tp->t_rxtshift = 0; - tp->t_rxtstart = 0; /* Fix send socket buffer to reflect the change in cwnd */ tcp_bad_rexmt_fix_sndbuf(tp); /* - * This RTT might reflect the extra delay induced + * This RTT might reflect the extra delay induced * by the network. Skip using this sample for RTO * calculation and mark the connection so we can * recompute RTT when the next eligible sample is @@ -1445,7 +1481,7 @@ tcp_bad_rexmt_check(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to) } /* - * The tail loss probe recovered the last packet and + * The tail loss probe recovered the last packet and * we need to adjust the congestion window to take * this loss into account. 
*/ @@ -1478,7 +1514,7 @@ tcp_bad_rexmt_check(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to) */ if (tp->t_rxtshift > 0 && (tp->t_flags & TF_BLACKHOLE) && tp->t_pmtud_start_ts > 0 && TSTMP_SUPPORTED(tp)) { - if ((to->to_flags & TOF_TS) && to->to_tsecr != 0 + if ((to->to_flags & TOF_TS) && to->to_tsecr != 0 && TSTMP_LT(to->to_tsecr, tp->t_pmtud_start_ts)) { tcp_pmtud_revert_segment_size(tp); } @@ -1509,36 +1545,36 @@ tcp_early_rexmt_check (struct tcpcb *tp, struct tcphdr *th) if (early_rexmt && (SACK_ENABLED(tp) || tp->t_early_rexmt_count < TCP_EARLY_REXMT_LIMIT) && SEQ_GT(tp->snd_max, tp->snd_una) && - (tp->t_dupacks == 1 || - (SACK_ENABLED(tp) && + (tp->t_dupacks == 1 || + (SACK_ENABLED(tp) && !TAILQ_EMPTY(&tp->snd_holes)))) { /* - * If there are only a few outstanding + * If there are only a few outstanding * segments on the connection, we might need * to lower the retransmit threshold. This - * will allow us to do Early Retransmit as + * will allow us to do Early Retransmit as * described in RFC 5827. */ - if (SACK_ENABLED(tp) && + if (SACK_ENABLED(tp) && !TAILQ_EMPTY(&tp->snd_holes)) { obytes = (tp->snd_max - tp->snd_fack) + tp->sackhint.sack_bytes_rexmit; } else { - obytes = (tp->snd_max - tp->snd_una); + obytes = (tp->snd_max - tp->snd_una); } /* - * In order to lower retransmit threshold the + * In order to lower retransmit threshold the * following two conditions must be met. - * 1. the amount of outstanding data is less + * 1. the amount of outstanding data is less * than 4*SMSS bytes - * 2. there is no unsent data ready for - * transmission or the advertised window + * 2. there is no unsent data ready for + * transmission or the advertised window * will limit sending new segments. 
*/ snd_off = tp->snd_max - tp->snd_una; snd_len = min(so->so_snd.sb_cc, tp->snd_wnd) - snd_off; - if (obytes < (tp->t_maxseg << 2) && + if (obytes < (tp->t_maxseg << 2) && snd_len <= 0) { u_int32_t osegs; @@ -1546,21 +1582,21 @@ tcp_early_rexmt_check (struct tcpcb *tp, struct tcphdr *th) if ((osegs * tp->t_maxseg) < obytes) osegs++; - /* - * Since the connection might have already + /* + * Since the connection might have already * received some dupacks, we add them to * to the outstanding segments count to get * the correct retransmit threshold. * - * By checking for early retransmit after + * By checking for early retransmit after * receiving some duplicate acks when SACK - * is supported, the connection will - * enter fast recovery even if multiple + * is supported, the connection will + * enter fast recovery even if multiple * segments are lost in the same window. */ osegs += tp->t_dupacks; if (osegs < 4) { - tp->t_rexmtthresh = + tp->t_rexmtthresh = ((osegs - 1) > 1) ? (osegs - 1) : 1; tp->t_rexmtthresh = min(tp->t_rexmtthresh, tcprexmtthresh); @@ -1596,9 +1632,7 @@ tcp_early_rexmt_check (struct tcpcb *tp, struct tcphdr *th) } static boolean_t -tcp_tfo_syn(tp, to) - struct tcpcb *tp; - struct tcpopt *to; +tcp_tfo_syn(struct tcpcb *tp, struct tcpopt *to) { u_char out[CCAES_BLOCK_SIZE]; unsigned char len; @@ -1644,9 +1678,7 @@ tcp_tfo_syn(tp, to) } static void -tcp_tfo_synack(tp, to) - struct tcpcb *tp; - struct tcpopt *to; +tcp_tfo_synack(struct tcpcb *tp, struct tcpopt *to) { if (to->to_flags & TOF_TFO) { unsigned char len = *to->to_tfo - TCPOLEN_FASTOPEN_REQ; @@ -1664,6 +1696,10 @@ tcp_tfo_synack(tp, to) tp->t_tfo_stats |= TFO_S_COOKIE_RCV; tcpstat.tcps_tfo_cookie_rcv++; + if (tp->t_tfo_flags & TFO_F_COOKIE_SENT) { + tcpstat.tcps_tfo_cookie_wrong++; + tp->t_tfo_stats |= TFO_S_COOKIE_WRONG; + } } else { /* * Thus, no cookie in the response, but we either asked for one @@ -1671,10 +1707,19 @@ tcp_tfo_synack(tp, to) * rexmit the SYN. 
If that's the case, it's better to start * backing of TFO-cookie requests. */ - if (tp->t_tfo_flags & TFO_F_SYN_LOSS) - tcp_heuristic_inc_loss(tp, 1, 0); - else - tcp_heuristic_reset_loss(tp, 1, 0); + if (tp->t_tfo_flags & TFO_F_SYN_LOSS) { + tp->t_tfo_stats |= TFO_S_SYN_LOSS; + tcpstat.tcps_tfo_syn_loss++; + + tcp_heuristic_tfo_loss(tp); + } else { + if (tp->t_tfo_flags & TFO_F_COOKIE_REQ) { + tp->t_tfo_stats |= TFO_S_NO_COOKIE_RCV; + tcpstat.tcps_tfo_no_cookie_rcv++; + } + + tcp_heuristic_tfo_success(tp); + } } } @@ -1724,20 +1769,44 @@ tcp_tfo_rcv_ack(struct tcpcb *tp, struct tcphdr *th) } } +/* + * Update snd_wnd information. + */ +static inline bool +tcp_update_window(struct tcpcb *tp, int thflags, struct tcphdr * th, + u_int32_t tiwin, int tlen) +{ + /* Don't look at the window if there is no ACK flag */ + if ((thflags & TH_ACK) && + (SEQ_LT(tp->snd_wl1, th->th_seq) || + (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || + (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { + /* keep track of pure window updates */ + if (tlen == 0 && + tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) + tcpstat.tcps_rcvwinupd++; + tp->snd_wnd = tiwin; + tp->snd_wl1 = th->th_seq; + tp->snd_wl2 = th->th_ack; + if (tp->snd_wnd > tp->max_sndwnd) + tp->max_sndwnd = tp->snd_wnd; + return (true); + } + return (false); +} + void -tcp_input(m, off0) - struct mbuf *m; - int off0; +tcp_input(struct mbuf *m, int off0) { - register struct tcphdr *th; - register struct ip *ip = NULL; - register struct inpcb *inp; + struct tcphdr *th; + struct ip *ip = NULL; + struct inpcb *inp; u_char *optp = NULL; int optlen = 0; int tlen, off; int drop_hdrlen; - register struct tcpcb *tp = 0; - register int thflags; + struct tcpcb *tp = 0; + int thflags; struct socket *so = 0; int todrop, acked, ourfinisacked, needoutput = 0; struct in_addr laddr; @@ -1745,7 +1814,7 @@ tcp_input(m, off0) struct in6_addr laddr6; #endif int dropsocket = 0; - int iss = 0, nosock = 0; + int iss = 0, 
nosock = 0; u_int32_t tiwin, sack_bytes_acked = 0; struct tcpopt to; /* options in this segment */ #if TCPDEBUG @@ -1785,7 +1854,7 @@ tcp_input(m, off0) fwd_tag = NULL; } if (fwd_tag != NULL) { - struct ip_fwd_tag *ipfwd_tag = + struct ip_fwd_tag *ipfwd_tag = (struct ip_fwd_tag *)(fwd_tag+1); next_hop = ipfwd_tag->next_hop; @@ -1809,9 +1878,9 @@ tcp_input(m, off0) #if INET6 if (isipv6) { - /* - * Expect 32-bit aligned data pointer on - * strict-align platforms + /* + * Expect 32-bit aligned data pointer on + * strict-align platforms */ MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m); @@ -1840,7 +1909,7 @@ tcp_input(m, off0) goto dropnosock; } DTRACE_TCP5(receive, struct mbuf *, m, struct inpcb *, NULL, - struct ip6_hdr *, ip6, struct tcpcb *, NULL, + struct ip6_hdr *, ip6, struct tcpcb *, NULL, struct tcphdr *, th); ip_ecn = (ntohl(ip6->ip6_flow) >> 20) & IPTOS_ECN_MASK; @@ -1889,7 +1958,7 @@ tcp_input(m, off0) /* * Check that TCP offset makes sense, - * pull out TCP options and adjust length. XXX + * pull out TCP options and adjust length. */ off = th->th_off << 2; if (off < sizeof (struct tcphdr) || off > tlen) { @@ -1918,7 +1987,7 @@ tcp_input(m, off0) } optlen = off - sizeof (struct tcphdr); optp = (u_char *)(th + 1); - /* + /* * Do quick retrieval of timestamp options ("options * prediction?"). If timestamp is the only option and it's * formatted as recommended in RFC 1323 appendix A, we @@ -1961,7 +2030,7 @@ tcp_input(m, off0) * parameters to be unchanged. */ drop_hdrlen = off0 + off; - + /* Since this is an entry point for input processing of tcp packets, we * can update the tcp clock here. */ @@ -2006,12 +2075,12 @@ tcp_input(m, off0) ) { /* * Diverted. Pretend to be the destination. - * already got one like this? + * already got one like this? */ inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport, ip->ip_dst, th->th_dport, 0, m->m_pkthdr.rcvif); if (!inp) { - /* + /* * No, then it's new. 
Try find the ambushing socket */ if (!next_hop->sin_port) { @@ -2109,9 +2178,9 @@ tcp_input(m, off0) break; } } - if (blackhole) { + if (blackhole) { if (m->m_pkthdr.rcvif && m->m_pkthdr.rcvif->if_type != IFT_LOOP) - + switch (blackhole) { case 1: if (thflags & TH_SYN) @@ -2151,22 +2220,18 @@ tcp_input(m, off0) #if NECP #if INET6 if (isipv6) { - if (!necp_socket_is_allowed_to_send_recv_v6(inp, th->th_dport, - th->th_sport, - &ip6->ip6_dst, - &ip6->ip6_src, - ifp, NULL, NULL)) { + if (!necp_socket_is_allowed_to_send_recv_v6(inp, + th->th_dport, th->th_sport, &ip6->ip6_dst, + &ip6->ip6_src, ifp, NULL, NULL)) { IF_TCP_STATINC(ifp, badformatipsec); goto drop; } } else #endif { - if (!necp_socket_is_allowed_to_send_recv_v4(inp, th->th_dport, - th->th_sport, - &ip->ip_dst, - &ip->ip_src, - ifp, NULL, NULL)) { + if (!necp_socket_is_allowed_to_send_recv_v4(inp, + th->th_dport, th->th_sport, &ip->ip_dst, &ip->ip_src, + ifp, NULL, NULL)) { IF_TCP_STATINC(ifp, badformatipsec); goto drop; } @@ -2194,8 +2259,8 @@ tcp_input(m, off0) #endif /* Avoid processing packets while closing a listen socket */ - if (tp->t_state == TCPS_LISTEN && - (so->so_options & SO_ACCEPTCONN) == 0) + if (tp->t_state == TCPS_LISTEN && + (so->so_options & SO_ACCEPTCONN) == 0) goto drop; if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) { @@ -2213,7 +2278,7 @@ tcp_input(m, off0) } #endif if (so->so_options & SO_ACCEPTCONN) { - register struct tcpcb *tp0 = tp; + struct tcpcb *tp0 = tp; struct socket *so2; struct socket *oso; struct sockaddr_storage from; @@ -2222,7 +2287,8 @@ tcp_input(m, off0) #endif /* INET6 */ struct ifnet *head_ifscope; unsigned int head_nocell, head_recvanyif, - head_noexpensive, head_awdl_unrestricted; + head_noexpensive, head_awdl_unrestricted, + head_intcoproc_allowed; /* Get listener's bound-to-interface, if any */ head_ifscope = (inp->inp_flags & INP_BOUND_IF) ? 
@@ -2234,6 +2300,7 @@ tcp_input(m, off0) /* Get listener's no-expensive information, if any */ head_noexpensive = INP_NO_EXPENSIVE(inp); head_awdl_unrestricted = INP_AWDL_UNRESTRICTED(inp); + head_intcoproc_allowed = INP_INTCOPROC_ALLOWED(inp); /* * If the state is LISTEN then ignore segment if it contains an RST. @@ -2259,7 +2326,7 @@ tcp_input(m, off0) goto drop; } KERNEL_DEBUG(DBG_FNC_TCP_NEWCONN | DBG_FUNC_START,0,0,0,0,0); - if (th->th_dport == th->th_sport) { + if (th->th_dport == th->th_sport) { #if INET6 if (isipv6) { if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, @@ -2328,7 +2395,7 @@ tcp_input(m, off0) #if INET6 if (isipv6) { struct sockaddr_in6 *sin6 = (struct sockaddr_in6*)&from; - + sin6->sin6_len = sizeof(*sin6); sin6->sin6_family = AF_INET6; sin6->sin6_port = th->th_sport; @@ -2340,7 +2407,7 @@ tcp_input(m, off0) #endif { struct sockaddr_in *sin = (struct sockaddr_in*)&from; - + sin->sin_len = sizeof(*sin); sin->sin_family = AF_INET; sin->sin_port = th->th_sport; @@ -2358,7 +2425,7 @@ tcp_input(m, off0) else so2 = sonewconn(so, 0, NULL); } - if (!so2) + if (!so2) goto drop; } @@ -2404,6 +2471,8 @@ tcp_input(m, off0) inp_set_noexpensive(inp); if (head_awdl_unrestricted) inp_set_awdl_unrestricted(inp); + if (head_intcoproc_allowed) + inp_set_intcoproc_allowed(inp); /* * Inherit {IN,IN6}_RECV_ANYIF from listener. 
*/ @@ -2493,11 +2562,13 @@ tcp_input(m, off0) tp->t_inpcb->inp_ip_ttl = tp0->t_inpcb->inp_ip_ttl; if ((so->so_flags & SOF_NOTSENT_LOWAT) != 0) tp->t_notsent_lowat = tp0->t_notsent_lowat; + tp->t_inpcb->inp_flags2 |= + tp0->t_inpcb->inp_flags2 & INP2_KEEPALIVE_OFFLOAD; /* now drop the reference on the listener */ tcp_unlock(oso, 1, 0); - tcp_set_max_rwinscale(tp, so); + tcp_set_max_rwinscale(tp, so, TCP_AUTORCVBUF_MAX(ifp)); KERNEL_DEBUG(DBG_FNC_TCP_NEWCONN | DBG_FUNC_END,0,0,0,0,0); } @@ -2506,16 +2577,16 @@ tcp_input(m, off0) LCK_MTX_ASSERT_OWNED); if (tp->t_state == TCPS_ESTABLISHED && tlen > 0) { - /* - * Evaluate the rate of arrival of packets to see if the - * receiver can reduce the ack traffic. The algorithm to - * stretch acks will be enabled if the connection meets + /* + * Evaluate the rate of arrival of packets to see if the + * receiver can reduce the ack traffic. The algorithm to + * stretch acks will be enabled if the connection meets * certain criteria defined in tcp_stretch_ack_enable function. */ if ((tp->t_flagsext & TF_RCVUNACK_WAITSS) != 0) { TCP_INC_VAR(tp->rcv_waitforss, nlropkts); } - if (tcp_stretch_ack_enable(tp)) { + if (tcp_stretch_ack_enable(tp, thflags)) { tp->t_flags |= TF_STRETCHACK; tp->t_flagsext &= ~(TF_RCVUNACK_WAITSS); tp->rcv_waitforss = 0; @@ -2530,7 +2601,7 @@ tcp_input(m, off0) } } - /* + /* * Keep track of how many bytes were received in the LRO packet */ if ((pktf_sw_lro_pkt) && (nlropkts > 2)) { @@ -2567,8 +2638,8 @@ tcp_input(m, off0) tp->t_ecn_recv_cwr++; } - /* - * If we received an explicit notification of congestion in + /* + * If we received an explicit notification of congestion in * ip tos ecn bits or by the CWR bit in TCP header flags, reset * the ack-strteching state. We need to handle ECN notification if * an ECN setup SYN was sent even once. @@ -2604,9 +2675,9 @@ tcp_input(m, off0) } } - /* + /* * Try to determine if we are receiving a packet after a long time. 
- * Use our own approximation of idletime to roughly measure remote + * Use our own approximation of idletime to roughly measure remote * end's idle time. Since slowstart is used after an idle period * we want to avoid doing LRO if the remote end is not up to date * on initial window support and starts with 1 or 2 packets as its IW. @@ -2668,8 +2739,7 @@ tcp_input(m, off0) ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) && ((to.to_flags & TOF_TS) == 0 || TSTMP_GEQ(to.to_tsval, tp->ts_recent)) && - th->th_seq == tp->rcv_nxt && - LIST_EMPTY(&tp->t_segq)) { + th->th_seq == tp->rcv_nxt && LIST_EMPTY(&tp->t_segq)) { int seg_size = tlen; if (tp->iaj_pktcnt <= IAJ_IGNORE_PKTCNT) { TCP_INC_VAR(tp->iaj_pktcnt, nlropkts); @@ -2697,7 +2767,7 @@ tcp_input(m, off0) m->m_pkthdr.lro_elapsed); else compute_iaj(tp, 1, 0); - } + } if (seg_size < tp->iaj_size) { /* * There is a smaller packet in the stream. @@ -2709,7 +2779,7 @@ tcp_input(m, off0) * iaj_size, we try to learn the iaj_size * again. */ - TCP_INC_VAR(tp->iaj_small_pkt, nlropkts); + TCP_INC_VAR(tp->iaj_small_pkt, nlropkts); if (tp->iaj_small_pkt > RESET_IAJ_SIZE_THRESH) { update_iaj_state(tp, seg_size, 1); } else { @@ -2766,8 +2836,8 @@ tcp_input(m, off0) if (SEQ_GT(th->th_ack, tp->snd_una) && SEQ_LEQ(th->th_ack, tp->snd_max) && tp->snd_cwnd >= tp->snd_ssthresh && - (!IN_FASTRECOVERY(tp) && - ((!(SACK_ENABLED(tp)) && + (!IN_FASTRECOVERY(tp) && + ((!(SACK_ENABLED(tp)) && tp->t_dupacks < tp->t_rexmtthresh) || (SACK_ENABLED(tp) && to.to_nsacks == 0 && TAILQ_EMPTY(&tp->snd_holes))))) { @@ -2776,7 +2846,7 @@ tcp_input(m, off0) */ ++tcpstat.tcps_predack; - tcp_bad_rexmt_check(tp, th, &to), + tcp_bad_rexmt_check(tp, th, &to); /* Recalculate the RTT */ tcp_compute_rtt(tp, &to, th); @@ -2785,12 +2855,12 @@ tcp_input(m, off0) acked = BYTES_ACKED(th, tp); tcpstat.tcps_rcvackpack++; tcpstat.tcps_rcvackbyte += acked; - + /* * Handle an ack that is in sequence during * congestion avoidance phase. 
The * calculations in this function - * assume that snd_una is not updated yet. + * assume that snd_una is not updated yet. */ if (CC_ALGO(tp)->congestion_avd != NULL) CC_ALGO(tp)->congestion_avd(tp, th); @@ -2807,6 +2877,8 @@ tcp_input(m, off0) tp->snd_recover = th->th_ack - 1; tp->snd_una = th->th_ack; + TCP_RESET_REXMT_STATE(tp); + /* * pull snd_wl2 up to prevent seq wrap relative * to th_ack. @@ -2845,7 +2917,11 @@ tcp_input(m, off0) if ((tp->t_flagsext & TF_MEASURESNDBW) != 0 && tp->t_bwmeas != NULL) tcp_bwmeas_check(tp); + sowwakeup(so); /* has to be done with socket lock held */ + if (!SLIST_EMPTY(&tp->t_notify_ack)) + tcp_notify_acknowledgement(tp, so); + if ((so->so_snd.sb_cc) || (tp->t_flags & TF_ACKNOW)) { (void) tcp_output(tp); } @@ -2873,17 +2949,17 @@ tcp_input(m, off0) if (turnoff_lro) { tcp_lro_remove_state(tp->t_inpcb->inp_laddr, tp->t_inpcb->inp_faddr, - tp->t_inpcb->inp_lport, + tp->t_inpcb->inp_lport, tp->t_inpcb->inp_fport); tp->t_flagsext &= ~TF_LRO_OFFLOADED; tp->t_idleat = tp->rcv_nxt; } else if (sw_lro && !pktf_sw_lro_pkt && !isipv6 && - (so->so_flags & SOF_USELRO) && + (so->so_flags & SOF_USELRO) && !IFNET_IS_CELLULAR(m->m_pkthdr.rcvif) && (m->m_pkthdr.rcvif->if_type != IFT_LOOP) && - ((th->th_seq - tp->irs) > + ((th->th_seq - tp->irs) > (tp->t_maxseg << lro_start)) && - ((tp->t_idleat == 0) || ((th->th_seq - + ((tp->t_idleat == 0) || ((th->th_seq - tp->t_idleat) > (tp->t_maxseg << lro_start)))) { tp->t_flagsext |= TF_LRO_OFFLOADED; tcp_start_coalescing(ip, th, tlen); @@ -2909,7 +2985,7 @@ tcp_input(m, off0) tcpstat.tcps_rcvbyte += tlen; if (nstat_collect) { if (m->m_pkthdr.pkt_flags & PKTF_SW_LRO_PKT) { - INP_ADD_STAT(inp, cell, wifi, wired, + INP_ADD_STAT(inp, cell, wifi, wired, rxpackets, m->m_pkthdr.lro_npkts); } else { INP_ADD_STAT(inp, cell, wifi, wired, @@ -2920,43 +2996,43 @@ tcp_input(m, off0) } /* - * Calculate the RTT on the receiver only if the - * connection is in streaming mode and the last + * Calculate the RTT on the 
receiver only if the + * connection is in streaming mode and the last * packet was not an end-of-write */ - if ((tp->t_flags & TF_STRETCHACK) && - !(tp->t_flagsext & TF_STREAMEOW)) + if (tp->t_flags & TF_STREAMING_ON) tcp_compute_rtt(tp, &to, th); - tcp_sbrcv_grow(tp, &so->so_rcv, &to, tlen); - + tcp_sbrcv_grow(tp, &so->so_rcv, &to, tlen, + TCP_AUTORCVBUF_MAX(ifp)); + /* * Add data to socket buffer. */ so_recv_data_stat(so, m, 0); m_adj(m, drop_hdrlen); /* delayed header drop */ - + /* - * If message delivery (SOF_ENABLE_MSGS) is enabled on + * If message delivery (SOF_ENABLE_MSGS) is enabled on * this socket, deliver the packet received as an * in-order message with sequence number attached to it. */ - if (sbappendstream_rcvdemux(so, m, + if (sbappendstream_rcvdemux(so, m, th->th_seq - (tp->irs + 1), 0)) { sorwakeup(so); - } + } #if INET6 if (isipv6) { KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport), (((ip6->ip6_src.s6_addr16[0]) << 16) | (ip6->ip6_dst.s6_addr16[0])), - th->th_seq, th->th_ack, th->th_win); + th->th_seq, th->th_ack, th->th_win); } else -#endif +#endif { KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport), (((ip->ip_src.s_addr & 0xffff) << 16) | (ip->ip_dst.s_addr & 0xffff)), - th->th_seq, th->th_ack, th->th_win); + th->th_seq, th->th_ack, th->th_win); } TCP_INC_VAR(tp->t_unacksegs, nlropkts); if (DELAY_ACK(tp, th)) { @@ -3006,7 +3082,7 @@ tcp_input(m, off0) (mp_tp = tptomptp(tp))) { MPT_LOCK(mp_tp); if (tp->rcv_wnd > mp_tp->mpt_rcvwnd) { - tp->rcv_wnd = mp_tp->mpt_rcvwnd; + tp->rcv_wnd = imax(mp_tp->mpt_rcvwnd, (int)(tp->rcv_adv - tp->rcv_nxt)); tcpstat.tcps_mp_reducedwin++; } MPT_UNLOCK(mp_tp); @@ -3025,9 +3101,9 @@ tcp_input(m, off0) * segment in this state. 
*/ case TCPS_LISTEN: { - register struct sockaddr_in *sin; + struct sockaddr_in *sin; #if INET6 - register struct sockaddr_in6 *sin6; + struct sockaddr_in6 *sin6; #endif lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx, @@ -3106,7 +3182,7 @@ tcp_input(m, off0) DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, struct tcpcb *, tp, int32_t, TCPS_SYN_RECEIVED); tp->t_state = TCPS_SYN_RECEIVED; - tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, + tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, TCP_CONN_KEEPINIT(tp)); dropsocket = 0; /* committed to socket */ @@ -3130,13 +3206,6 @@ tcp_input(m, off0) tp->ecn_flags |= (TE_SETUPRECEIVED | TE_SENDIPECT); } -#if CONFIG_IFEF_NOWINDOWSCALE - if (tcp_obey_ifef_nowindowscale && m->m_pkthdr.rcvif != NULL && - (m->m_pkthdr.rcvif->if_eflags & IFEF_NOWINDOWSCALE)) { - /* Window scaling is not enabled on this interface */ - tp->t_flags &= ~TF_REQ_SCALE; - } -#endif goto trimthenstep6; } @@ -3195,7 +3264,7 @@ tcp_input(m, off0) goto drop; } #endif /* MPTCP */ - soevent(so, + soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNRESET)); tp = tcp_drop(tp, ECONNREFUSED); @@ -3211,28 +3280,28 @@ tcp_input(m, off0) tcp_rcvseqinit(tp); if (thflags & TH_ACK) { tcpstat.tcps_connects++; - + if ((thflags & (TH_ECE | TH_CWR)) == (TH_ECE)) { /* ECN-setup SYN-ACK */ tp->ecn_flags |= TE_SETUPRECEIVED; if (TCP_ECN_ENABLED(tp)) { - tcp_heuristic_reset_loss(tp, 0, 1); + tcp_heuristic_ecn_success(tp); tcpstat.tcps_ecn_client_success++; } } else { if (tp->ecn_flags & TE_SETUPSENT && tp->t_rxtshift == 0) { - tcp_heuristic_reset_loss(tp, 0, 1); + tcp_heuristic_ecn_success(tp); tcpstat.tcps_ecn_not_supported++; } if (tp->ecn_flags & TE_SETUPSENT && tp->t_rxtshift > 0) - tcp_heuristic_inc_loss(tp, 0, 1); + tcp_heuristic_ecn_loss(tp); /* non-ECN-setup SYN-ACK */ tp->ecn_flags &= ~TE_SENDIPECT; } - + #if CONFIG_MACF_NET && CONFIG_MACF_SOCKET /* XXXMAC: recursive lock: SOCK_LOCK(so); */ mac_socketpeer_label_associate_mbuf(m, so); @@ -3298,7 
+3367,13 @@ tcp_input(m, off0) TCP_CONN_KEEPIDLE(tp)); if (nstat_collect) nstat_route_connect_success( - tp->t_inpcb->inp_route.ro_rt); + inp->inp_route.ro_rt); + /* + * The SYN is acknowledged but una is not + * updated yet. So pass the value of + * ack to compute sndbytes correctly + */ + inp_count_sndbytes(inp, th->th_ack); } #if MPTCP /* @@ -3310,7 +3385,7 @@ tcp_input(m, off0) isconnected = FALSE; /* Start data xmit if fastjoin */ if (mptcp_fastjoin && (so->so_flags & SOF_MPTCP_FASTJOIN)) { - soevent(so, (SO_FILT_HINT_LOCKED | + soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPFASTJ)); } } else @@ -3396,7 +3471,7 @@ tcp_input(m, off0) /* Received a SYN while connection is already established. * This is a "half open connection and other anomalies" described * in RFC793 page 34, send an ACK so the remote reset the connection - * or recovers by adjusting its sequence numberering + * or recovers by adjusting its sequence numberering */ case TCPS_ESTABLISHED: if (thflags & TH_SYN) @@ -3475,8 +3550,8 @@ tcp_input(m, off0) if (thflags & TH_RST) { if ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) && SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) || - (tp->rcv_wnd == 0 && - ((tp->last_ack_sent == th->th_seq) || + (tp->rcv_wnd == 0 && + ((tp->last_ack_sent == th->th_seq) || ((tp->last_ack_sent -1) == th->th_seq)))) { switch (tp->t_state) { @@ -3490,6 +3565,16 @@ tcp_input(m, off0) tcpstat.tcps_badrst++; goto drop; } + if (TCP_ECN_ENABLED(tp) && + tp->snd_una == tp->iss + 1 && + SEQ_GT(tp->snd_max, tp->snd_una)) { + /* + * If the first data packet on an + * ECN connection, receives a RST + * increment the heuristic + */ + tcp_heuristic_ecn_droprst(tp); + } case TCPS_FIN_WAIT_1: case TCPS_CLOSE_WAIT: /* @@ -3562,7 +3647,7 @@ tcp_input(m, off0) } if (nstat_collect) { - nstat_route_rx(tp->t_inpcb->inp_route.ro_rt, + nstat_route_rx(tp->t_inpcb->inp_route.ro_rt, 1, tlen, NSTAT_RX_FLAG_DUPLICATE); INP_ADD_STAT(inp, cell, wifi, wired, rxpackets, 1); @@ -3624,7 +3709,7 @@ 
tcp_input(m, off0) } todrop = tlen; tcpstat.tcps_rcvduppack++; - tcpstat.tcps_rcvdupbyte += todrop; + tcpstat.tcps_rcvdupbyte += todrop; } else { tcpstat.tcps_rcvpartduppack++; tcpstat.tcps_rcvpartdupbyte += todrop; @@ -3640,7 +3725,7 @@ tcp_input(m, off0) tp->t_flags |= TF_ACKNOW; } if (nstat_collect) { - nstat_route_rx(tp->t_inpcb->inp_route.ro_rt, 1, + nstat_route_rx(tp->t_inpcb->inp_route.ro_rt, 1, todrop, NSTAT_RX_FLAG_DUPLICATE); INP_ADD_STAT(inp, cell, wifi, wired, rxpackets, 1); INP_ADD_STAT(inp, cell, wifi, wired, rxbytes, todrop); @@ -3723,14 +3808,14 @@ tcp_input(m, off0) /* * If last ACK falls within this segment's sequence numbers, * record its timestamp. - * NOTE: + * NOTE: * 1) That the test incorporates suggestions from the latest * proposal of the tcplw@cray.com list (Braden 1993/04/26). * 2) That updating only on newer timestamps interferes with * our earlier PAWS tests, so this check should be solely * predicated on the sequence space of this segment. - * 3) That we modify the segment boundary check to be - * Last.ACK.Sent <= SEG.SEQ + SEG.Len + * 3) That we modify the segment boundary check to be + * Last.ACK.Sent <= SEG.SEQ + SEG.Len * instead of RFC1323's * Last.ACK.Sent < SEG.SEQ + SEG.Len, * This modified check allows us to overcome RFC1323's @@ -3843,6 +3928,12 @@ tcp_input(m, off0) if (nstat_collect) nstat_route_connect_success( tp->t_inpcb->inp_route.ro_rt); + /* + * The SYN is acknowledged but una is not updated + * yet. So pass the value of ack to compute + * sndbytes correctly + */ + inp_count_sndbytes(inp, th->th_ack); } /* * If segment contains data or ACK, will call tcp_reass() @@ -3988,22 +4079,24 @@ tcp_input(m, off0) /* * If we have outstanding data (other than * a window probe), this is a completely - * duplicate ack (ie, window info didn't - * change) and the ack is the biggest we've seen. - */ + * duplicate ack and the ack is the biggest we've seen. 
+ * + * Need to accommodate a change in window on duplicate acks + * to allow operating systems that update window during + * recovery with SACK + */ if (SEQ_LEQ(th->th_ack, tp->snd_una)) { - if (tlen == 0 && tiwin == tp->snd_wnd) { + if (tlen == 0 && (tiwin == tp->snd_wnd || + (to.to_nsacks > 0 && sack_bytes_acked > 0))) { /* * If both ends send FIN at the same time, * then the ack will be a duplicate ack * but we have to process the FIN. Check * for this condition and process the FIN * instead of the dupack - */ + */ if ((thflags & TH_FIN) && - (tp->t_flags & TF_SENTFIN) && - !TCPS_HAVERCVDFIN(tp->t_state) && - (th->th_ack + 1) == tp->snd_max) + !TCPS_HAVERCVDFIN(tp->t_state)) break; process_dupack: #if MPTCP @@ -4018,7 +4111,7 @@ tcp_input(m, off0) if ((isconnected) && (tp->t_mpflags & TMPF_JOINED_FLOW)) { mptcplog((LOG_DEBUG, "MPTCP " "Sockets: bypass ack recovery\n"), - MPTCP_SOCKET_DBG, + MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE); break; } @@ -4046,7 +4139,7 @@ tcp_input(m, off0) TCP_EARLY_REXMT_WIN))) tp->t_early_rexmt_count = 0; - /* + /* * Is early retransmit needed? We check for * this when the connection is waiting for * duplicate acks to enter fast recovery. @@ -4055,7 +4148,7 @@ tcp_input(m, off0) tcp_early_rexmt_check(tp, th); /* - * If we've seen exactly rexmt threshold + * If we've seen exactly rexmt threshold * of duplicate acks, assume a packet * has been dropped and retransmit it. * Kludge snd_nxt & the congestion @@ -4089,7 +4182,7 @@ tcp_input(m, off0) * reordering and loss */ if (SACK_ENABLED(tp) && !IN_FASTRECOVERY(tp) && - (tp->t_flagsext & + (tp->t_flagsext & (TF_PKTS_REORDERED|TF_DELAY_RECOVERY)) == (TF_PKTS_REORDERED|TF_DELAY_RECOVERY)) { /* @@ -4099,14 +4192,14 @@ tcp_input(m, off0) break; } - if (SACK_ENABLED(tp) + if (SACK_ENABLED(tp) && IN_FASTRECOVERY(tp)) { int awnd; - + /* * Compute the amount of data in flight first. 
- * We can inject new data into the pipe iff - * we have less than 1/2 the original window's + * We can inject new data into the pipe iff + * we have less than 1/2 the original window's * worth of data in flight. */ awnd = (tp->snd_nxt - tp->snd_fack) + @@ -4116,12 +4209,19 @@ tcp_input(m, off0) if (tp->snd_cwnd > tp->snd_ssthresh) tp->snd_cwnd = tp->snd_ssthresh; } - } else + } else { tp->snd_cwnd += tp->t_maxseg; + } - tcp_ccdbg_trace(tp, th, TCP_CC_IN_FASTRECOVERY); + /* Process any window updates */ + if (tiwin > tp->snd_wnd) + tcp_update_window(tp, thflags, + th, tiwin, tlen); + tcp_ccdbg_trace(tp, th, + TCP_CC_IN_FASTRECOVERY); (void) tcp_output(tp); + goto drop; } else if (tp->t_dupacks == tp->t_rexmtthresh) { tcp_seq onxt = tp->snd_nxt; @@ -4180,7 +4280,7 @@ tcp_input(m, off0) tcp_rexmt_save_state(tp); /* - * If the current tcp cc module has + * If the current tcp cc module has * defined a hook for tasks to run * before entering FR, call it */ @@ -4198,6 +4298,13 @@ tcp_input(m, off0) tp->snd_cwnd = tp->t_maxseg; tp->t_flagsext &= ~TF_CWND_NONVALIDATED; + + /* Process any window updates */ + if (tiwin > tp->snd_wnd) + tcp_update_window( + tp, thflags, + th, tiwin, tlen); + tcp_ccdbg_trace(tp, th, TCP_CC_ENTER_FASTRECOVERY); (void) tcp_output(tp); @@ -4205,6 +4312,13 @@ tcp_input(m, off0) } tp->snd_nxt = th->th_ack; tp->snd_cwnd = tp->t_maxseg; + + /* Process any window updates */ + if (tiwin > tp->snd_wnd) + tcp_update_window(tp, + thflags, + th, tiwin, tlen); + (void) tcp_output(tp); if (tp->t_flagsext & TF_CWND_NONVALIDATED) { tcp_cc_adjust_nonvalidated_cwnd(tp); @@ -4214,12 +4328,13 @@ tcp_input(m, off0) } if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; + tcp_ccdbg_trace(tp, th, TCP_CC_ENTER_FASTRECOVERY); goto drop; - } else if (limited_txmt && + } else if (limited_txmt && ALLOW_LIMITED_TRANSMIT(tp) && - (!(SACK_ENABLED(tp)) || sack_bytes_acked > 0) && + (!(SACK_ENABLED(tp)) || sack_bytes_acked > 0) && (so->so_snd.sb_cc - (tp->snd_max - tp->snd_una)) 
> 0) { u_int32_t incr = (tp->t_maxseg * tp->t_dupacks); @@ -4231,13 +4346,10 @@ tcp_input(m, off0) (void) tcp_output(tp); tcp_ccdbg_trace(tp, th, TCP_CC_LIMITED_TRANSMIT); - + /* Reset snd_cwnd back to normal */ tp->snd_cwnd -= incr; } - } else { - tp->t_dupacks = 0; - tp->t_rexmtthresh = tcprexmtthresh; } break; } @@ -4248,7 +4360,7 @@ tcp_input(m, off0) if (IN_FASTRECOVERY(tp)) { if (SEQ_LT(th->th_ack, tp->snd_recover)) { /* - * If we received an ECE and entered + * If we received an ECE and entered * recovery, the subsequent ACKs should * not be treated as partial acks. */ @@ -4258,7 +4370,7 @@ tcp_input(m, off0) if (SACK_ENABLED(tp)) tcp_sack_partialack(tp, th); else - tcp_newreno_partial_ack(tp, th); + tcp_newreno_partial_ack(tp, th); tcp_ccdbg_trace(tp, th, TCP_CC_PARTIAL_ACK); } else { EXIT_FASTRECOVERY(tp); @@ -4269,7 +4381,7 @@ tcp_input(m, off0) tcp_ccdbg_trace(tp, th, TCP_CC_EXIT_FASTRECOVERY); } - } else if ((tp->t_flagsext & + } else if ((tp->t_flagsext & (TF_PKTS_REORDERED|TF_DELAY_RECOVERY)) == (TF_PKTS_REORDERED|TF_DELAY_RECOVERY)) { /* @@ -4344,6 +4456,10 @@ tcp_input(m, off0) * If there is more data to be acked, restart retransmit * timer, using current (possibly backed-off) value. 
*/ + TCP_RESET_REXMT_STATE(tp); + TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), + tp->t_rttmin, TCPTV_REXMTMAX, + TCP_ADD_REXMTSLOP(tp)); if (th->th_ack == tp->snd_max) { tp->t_timer[TCPT_REXMT] = 0; tp->t_timer[TCPT_PTO] = 0; @@ -4409,14 +4525,14 @@ tcp_input(m, off0) tp->snd_wnd -= so->so_snd.sb_cc; sbdrop(&so->so_snd, (int)so->so_snd.sb_cc); if (so->so_flags & SOF_ENABLE_MSGS) { - so->so_msg_state->msg_serial_bytes -= + so->so_msg_state->msg_serial_bytes -= (int)so->so_snd.sb_cc; } ourfinisacked = 1; } else { sbdrop(&so->so_snd, acked); if (so->so_flags & SOF_ENABLE_MSGS) { - so->so_msg_state->msg_serial_bytes -= + so->so_msg_state->msg_serial_bytes -= acked; } tcp_sbsnd_trim(&so->so_snd); @@ -4434,6 +4550,7 @@ tcp_input(m, off0) EXIT_FASTRECOVERY(tp); tp->snd_una = th->th_ack; + if (SACK_ENABLED(tp)) { if (SEQ_GT(tp->snd_una, tp->snd_recover)) tp->snd_recover = tp->snd_una; @@ -4449,11 +4566,15 @@ tcp_input(m, off0) tcp_bwmeas_check(tp); /* - * sowwakeup must happen after snd_una, et al. are updated so that - * the sequence numbers are in sync with so_snd + * sowwakeup must happen after snd_una, et al. 
are + * updated so that the sequence numbers are in sync with + * so_snd */ sowwakeup(so); + if (!SLIST_EMPTY(&tp->t_notify_ack)) + tcp_notify_acknowledgement(tp, so); + switch (tp->t_state) { /* @@ -4476,16 +4597,15 @@ tcp_input(m, off0) isconnected = FALSE; isdisconnected = TRUE; } - DTRACE_TCP4(state__change, void, NULL, + DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, - struct tcpcb *, tp, + struct tcpcb *, tp, int32_t, TCPS_FIN_WAIT_2); tp->t_state = TCPS_FIN_WAIT_2; - /* fall through and make sure we also recognize - * data ACKed with the FIN + /* fall through and make sure we also recognize + * data ACKed with the FIN */ } - tp->t_flags |= TF_ACKNOW; break; /* @@ -4496,9 +4616,9 @@ tcp_input(m, off0) */ case TCPS_CLOSING: if (ourfinisacked) { - DTRACE_TCP4(state__change, void, NULL, + DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, - struct tcpcb *, tp, + struct tcpcb *, tp, int32_t, TCPS_TIME_WAIT); tp->t_state = TCPS_TIME_WAIT; tcp_canceltimers(tp); @@ -4510,7 +4630,6 @@ tcp_input(m, off0) isconnected = FALSE; isdisconnected = TRUE; } - tp->t_flags |= TF_ACKNOW; break; /* @@ -4537,18 +4656,18 @@ tcp_input(m, off0) } /* - * If there is a SACK option on the ACK and we + * If there is a SACK option on the ACK and we * haven't seen any duplicate acks before, count * it as a duplicate ack even if the cumulative * ack is advanced. If the receiver delayed an * ack and detected loss afterwards, then the ack - * will advance cumulative ack and will also have + * will advance cumulative ack and will also have * a SACK option. So counting it as one duplicate * ack is ok. 
- */ + */ if (sack_ackadv == 1 && - tp->t_state == TCPS_ESTABLISHED && - SACK_ENABLED(tp) && sack_bytes_acked > 0 && + tp->t_state == TCPS_ESTABLISHED && + SACK_ENABLED(tp) && sack_bytes_acked > 0 && to.to_nsacks > 0 && tp->t_dupacks == 0 && SEQ_LEQ(th->th_ack, tp->snd_una) && tlen == 0 && !(tp->t_flagsext & TF_PKTS_REORDERED)) { @@ -4560,23 +4679,9 @@ tcp_input(m, off0) step6: /* * Update window information. - * Don't look at window if no ACK: TAC's send garbage on first SYN. */ - if ((thflags & TH_ACK) && - (SEQ_LT(tp->snd_wl1, th->th_seq) || - (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || - (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { - /* keep track of pure window updates */ - if (tlen == 0 && - tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) - tcpstat.tcps_rcvwinupd++; - tp->snd_wnd = tiwin; - tp->snd_wl1 = th->th_seq; - tp->snd_wl2 = th->th_ack; - if (tp->snd_wnd > tp->max_sndwnd) - tp->max_sndwnd = tp->snd_wnd; + if (tcp_update_window(tp, thflags, th, tiwin, tlen)) needoutput = 1; - } /* * Process segments with URG. @@ -4653,14 +4758,14 @@ tcp_input(m, off0) soisdisconnected(so); } - /* Let's check the state of pcb just to make sure that it did not get closed + /* Let's check the state of pcb just to make sure that it did not get closed * when we unlocked above */ if (inp->inp_state == INPCB_STATE_DEAD) { /* Just drop the packet that we are processing and return */ goto drop; } - + /* * Process the segment text, merging it into the TCP sequencing queue, * and arranging for acknowledgment of receipt if necessary. 
@@ -4695,22 +4800,21 @@ tcp_input(m, off0) if (th->th_seq == tp->rcv_nxt && LIST_EMPTY(&tp->t_segq)) { TCP_INC_VAR(tp->t_unacksegs, nlropkts); /* - * Calculate the RTT on the receiver only if the - * connection is in streaming mode and the last + * Calculate the RTT on the receiver only if the + * connection is in streaming mode and the last * packet was not an end-of-write */ - if ((tp->t_flags & TF_STRETCHACK) && - !(tp->t_flagsext & TF_STREAMEOW)) + if (tp->t_flags & TF_STREAMING_ON) tcp_compute_rtt(tp, &to, th); - - if (DELAY_ACK(tp, th) && + + if (DELAY_ACK(tp, th) && ((tp->t_flags & TF_ACKNOW) == 0) ) { if ((tp->t_flags & TF_DELACK) == 0) { tp->t_flags |= TF_DELACK; - tp->t_timer[TCPT_DELACK] = + tp->t_timer[TCPT_DELACK] = OFFSET_FROM_START(tp, tcp_delack); } - } + } else { tp->t_flags |= TF_ACKNOW; } @@ -4720,7 +4824,7 @@ tcp_input(m, off0) tcpstat.tcps_rcvbyte += tlen; if (nstat_collect) { if (m->m_pkthdr.pkt_flags & PKTF_SW_LRO_PKT) { - INP_ADD_STAT(inp, cell, wifi, wired, + INP_ADD_STAT(inp, cell, wifi, wired, rxpackets, m->m_pkthdr.lro_npkts); } else { INP_ADD_STAT(inp, cell, wifi, wired, @@ -4729,9 +4833,10 @@ tcp_input(m, off0) INP_ADD_STAT(inp, cell, wifi, wired, rxbytes, tlen); } - tcp_sbrcv_grow(tp, &so->so_rcv, &to, tlen); + tcp_sbrcv_grow(tp, &so->so_rcv, &to, tlen, + TCP_AUTORCVBUF_MAX(ifp)); so_recv_data_stat(so, m, drop_hdrlen); - + if (sbappendstream_rcvdemux(so, m, th->th_seq - (tp->irs + 1), 0)) { sorwakeup(so); @@ -4741,30 +4846,33 @@ tcp_input(m, off0) tp->t_flags |= TF_ACKNOW; } - if (tlen > 0 && SACK_ENABLED(tp)) + if ((tlen > 0 || (th->th_flags & TH_FIN)) && SACK_ENABLED(tp)) { + if (th->th_flags & TH_FIN) + save_end++; tcp_update_sack_list(tp, save_start, save_end); + } tcp_adaptive_rwtimo_check(tp, tlen); if (tlen > 0) tcp_tfo_rcv_data(tp); - if (tp->t_flags & TF_DELACK) + if (tp->t_flags & TF_DELACK) { #if INET6 if (isipv6) { KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport), (((ip6->ip6_src.s6_addr16[0]) << 16) | 
(ip6->ip6_dst.s6_addr16[0])), - th->th_seq, th->th_ack, th->th_win); + th->th_seq, th->th_ack, th->th_win); } else #endif { KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport), (((ip->ip_src.s_addr & 0xffff) << 16) | (ip->ip_dst.s_addr & 0xffff)), - th->th_seq, th->th_ack, th->th_win); + th->th_seq, th->th_ack, th->th_win); } - + } } else { m_freem(m); @@ -4827,9 +4935,9 @@ tcp_input(m, off0) * standard timers. */ case TCPS_FIN_WAIT_2: - DTRACE_TCP4(state__change, void, NULL, + DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, - struct tcpcb *, tp, + struct tcpcb *, tp, int32_t, TCPS_TIME_WAIT); tp->t_state = TCPS_TIME_WAIT; tcp_canceltimers(tp); @@ -4865,7 +4973,7 @@ tcp_input(m, off0) tcp_check_timer_state(tp); - + tcp_unlock(so, 1, 0); KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0); return; @@ -4930,7 +5038,7 @@ tcp_input(m, off0) goto drop; /* IPv6 anycast check is done at tcp6_input() */ - /* + /* * Perform bandwidth limiting. */ #if ICMP_BANDLIM @@ -4946,6 +5054,7 @@ tcp_input(m, off0) bzero(&tra, sizeof(tra)); tra.ifscope = ifscope; tra.awdl_unrestricted = 1; + tra.intcoproc_allowed = 1; if (thflags & TH_ACK) /* mtod() below is safe as long as hdr dropping is delayed */ tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, th->th_ack, @@ -4959,7 +5068,7 @@ tcp_input(m, off0) } /* destroy temporarily created socket */ if (dropsocket) { - (void) soabort(so); + (void) soabort(so); tcp_unlock(so, 1, 0); } else if ((inp != NULL) && (nosock == 0)) { tcp_unlock(so, 1, 0); @@ -4980,7 +5089,7 @@ tcp_input(m, off0) m_freem(m); /* destroy temporarily created socket */ if (dropsocket) { - (void) soabort(so); + (void) soabort(so); tcp_unlock(so, 1, 0); } else if (nosock == 0) { @@ -5129,13 +5238,11 @@ tcp_finalize_options(struct tcpcb *tp, struct tcpopt *to, unsigned int ifscope) * it doesn't appear in the user's data queue. * It is still reflected in the segment length for * sequencing purposes. 
+ * + * @param off delayed to be droped hdrlen */ static void -tcp_pulloutofband(so, th, m, off) - struct socket *so; - struct tcphdr *th; - register struct mbuf *m; - int off; /* delayed to be droped hdrlen */ +tcp_pulloutofband(struct socket *so, struct tcphdr *th, struct mbuf *m, int off) { int cnt = off + th->th_urp - 1; @@ -5161,20 +5268,10 @@ tcp_pulloutofband(so, th, m, off) } uint32_t -get_base_rtt(struct tcpcb *tp) +get_base_rtt(struct tcpcb *tp) { - uint32_t base_rtt = 0, i; struct rtentry *rt = tp->t_inpcb->inp_route.ro_rt; - - if (rt != NULL) { - for (i = 0; i < NRTT_HIST; ++i) { - if (rt->rtt_hist[i] != 0 && - (base_rtt == 0 || rt->rtt_hist[i] < base_rtt)) - base_rtt = rt->rtt_hist[i]; - } - } - - return (base_rtt); + return ((rt == NULL) ? 0 : rt->rtt_min); } /* Each value of RTT base represents the minimum RTT seen in a minute. @@ -5183,14 +5280,13 @@ get_base_rtt(struct tcpcb *tp) void update_base_rtt(struct tcpcb *tp, uint32_t rtt) { - u_int32_t base_rtt; + u_int32_t base_rtt, i; struct rtentry *rt; if ((rt = tp->t_inpcb->inp_route.ro_rt) == NULL) return; if (rt->rtt_expire_ts == 0) { RT_LOCK_SPIN(rt); - /* check again to avoid any race */ if (rt->rtt_expire_ts != 0) { RT_UNLOCK(rt); goto update; @@ -5198,6 +5294,7 @@ update_base_rtt(struct tcpcb *tp, uint32_t rtt) rt->rtt_expire_ts = tcp_now; rt->rtt_index = 0; rt->rtt_hist[0] = rtt; + rt->rtt_min = rtt; RT_UNLOCK(rt); return; } @@ -5210,7 +5307,7 @@ update_base_rtt(struct tcpcb *tp, uint32_t rtt) */ if ((tp->t_flagsext & TF_RECV_THROTTLE) && (int)(tcp_now - tp->t_recv_throttle_ts) >= TCP_RECV_THROTTLE_WIN) { - base_rtt = get_base_rtt(tp); + base_rtt = rt->rtt_min; if (tp->t_rttcur <= (base_rtt + target_qdelay)) { tp->t_flagsext &= ~TF_RECV_THROTTLE; tp->t_recv_throttle_ts = 0; @@ -5232,23 +5329,35 @@ update_base_rtt(struct tcpcb *tp, uint32_t rtt) rt->rtt_hist[rt->rtt_index] = min(rt->rtt_hist[rt->rtt_index], rtt); } + /* forget the old value and update minimum */ + rt->rtt_min = 0; + for (i = 0; 
i < NRTT_HIST; ++i) { + if (rt->rtt_hist[i] != 0 && + (rt->rtt_min == 0 || + rt->rtt_hist[i] < rt->rtt_min)) + rt->rtt_min = rt->rtt_hist[i]; + } RT_UNLOCK(rt); } else { rt->rtt_hist[rt->rtt_index] = min(rt->rtt_hist[rt->rtt_index], rtt); + if (rt->rtt_min == 0) + rt->rtt_min = rtt; + else + rt->rtt_min = min(rt->rtt_min, rtt); } } /* * If we have a timestamp reply, update smoothed RTT. If no timestamp is - * present but transmit timer is running and timed sequence number was - * acked, update smoothed RTT. + * present but transmit timer is running and timed sequence number was + * acked, update smoothed RTT. * * If timestamps are supported, a receiver can update RTT even if * there is no outstanding data. * * Some boxes send broken timestamp replies during the SYN+ACK phase, - * ignore timestamps of 0or we could calculate a huge RTT and blow up + * ignore timestamps of 0or we could calculate a huge RTT and blow up * the retransmit timer. */ static void @@ -5279,7 +5388,7 @@ tcp_compute_rtt(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th) /* start another measurement */ tp->t_rtttime = 0; } - if (((to->to_flags & TOF_TS) != 0) && + if (((to->to_flags & TOF_TS) != 0) && (to->to_tsecr != 0) && TSTMP_GEQ(tcp_now, to->to_tsecr)) { tcp_xmit_timer(tp, (tcp_now - to->to_tsecr), @@ -5294,10 +5403,10 @@ tcp_compute_rtt(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th) * and update averages and current timeout. */ static void -tcp_xmit_timer(register struct tcpcb *tp, int rtt, +tcp_xmit_timer(struct tcpcb *tp, int rtt, u_int32_t tsecr, tcp_seq th_ack) { - register int delta; + int delta; if (tp->t_flagsext & TF_RECOMPUTE_RTT) { if (SEQ_GT(th_ack, tp->snd_una) && @@ -5306,7 +5415,7 @@ tcp_xmit_timer(register struct tcpcb *tp, int rtt, TSTMP_GEQ(tsecr, tp->t_badrexmt_time))) { /* * We received a new ACk after a - * spurious timeout. Adapt retransmission + * spurious timeout. Adapt retransmission * timer as described in rfc 4015. 
*/ tp->t_flagsext &= ~(TF_RECOMPUTE_RTT); @@ -5341,9 +5450,9 @@ tcp_xmit_timer(register struct tcpcb *tp, int rtt, * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed * point). * - * Freebsd adjusts rtt to origin 0 by subtracting 1 - * from the provided rtt value. This was required because - * of the way t_rtttime was initiailised to 1 before. + * Freebsd adjusts rtt to origin 0 by subtracting 1 + * from the provided rtt value. This was required because + * of the way t_rtttime was initiailised to 1 before. * Since we changed t_rtttime to be based on * tcp_now, this extra adjustment is not needed. */ @@ -5368,7 +5477,7 @@ tcp_xmit_timer(register struct tcpcb *tp, int rtt, delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT); if ((tp->t_rttvar += delta) <= 0) tp->t_rttvar = 1; - if (tp->t_rttbest == 0 || + if (tp->t_rttbest == 0 || tp->t_rttbest > (tp->t_srtt + tp->t_rttvar)) tp->t_rttbest = tp->t_srtt + tp->t_rttvar; } else { @@ -5382,10 +5491,8 @@ tcp_xmit_timer(register struct tcpcb *tp, int rtt, } compute_rto: - nstat_route_rtt(tp->t_inpcb->inp_route.ro_rt, tp->t_srtt, + nstat_route_rtt(tp->t_inpcb->inp_route.ro_rt, tp->t_srtt, tp->t_rttvar); - tp->t_rxtshift = 0; - tp->t_rxtstart = 0; /* * the retransmit should happen at rtt + 4 * rttvar. @@ -5399,7 +5506,7 @@ tcp_xmit_timer(register struct tcpcb *tp, int rtt, * the minimum feasible timer (which is 2 ticks). */ TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), - max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX, + max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX, TCP_ADD_REXMTSLOP(tp)); /* @@ -5460,8 +5567,8 @@ tcp_maxmtu6(struct rtentry *rt) * size (usually 512 or the default IP max size, but no more than the mtu * of the interface), as we can't discover anything about intervening * gateways or networks. We also initialize the congestion/slow start - * window. While looking at the routing entry, we also initialize - * other path-dependent parameters from pre-set or cached values + * window. 
While looking at the routing entry, we also initialize + * other path-dependent parameters from pre-set or cached values * in the routing entry. * * Also take into account the space needed for options that we @@ -5475,10 +5582,7 @@ tcp_maxmtu6(struct rtentry *rt) * */ void -tcp_mss(tp, offer, input_ifscope) - struct tcpcb *tp; - int offer; - unsigned int input_ifscope; +tcp_mss(struct tcpcb *tp, int offer, unsigned int input_ifscope) { struct rtentry *rt; struct ifnet *ifp; @@ -5531,7 +5635,7 @@ tcp_mss(tp, offer, input_ifscope) * Excludes 9600bps as it is the default value adversized * by pseudo-devices over ppp. */ - if (ifp->if_type == IFT_PPP && slowlink_wsize > 0 && + if (ifp->if_type == IFT_PPP && slowlink_wsize > 0 && ifp->if_baudrate > 9600 && ifp->if_baudrate <= 128000) { tp->t_flags |= TF_SLOWLINK; } @@ -5628,7 +5732,7 @@ tcp_mss(tp, offer, input_ifscope) mss -= mptcp_adj_mss(tp, FALSE); #endif /* MPTCP */ tp->t_maxseg = mss; - + /* * Calculate corrected value for sb_max; ensure to upgrade the * numerator for large sb_max values else it will overflow. @@ -5657,6 +5761,12 @@ tcp_mss(tp, offer, input_ifscope) } tp->t_maxseg = mss; + /* + * Update MSS using recommendation from link status report. This is + * temporary + */ + tcp_update_mss_locked(so, ifp); + #if RTV_RPIPE bufsize = rt->rt_rmx.rmx_recvpipe; if (bufsize < so->so_rcv.sb_hiwat) @@ -5701,8 +5811,7 @@ tcp_mss(tp, offer, input_ifscope) * Determine the MSS option to send on an outgoing SYN. */ int -tcp_mssopt(tp) - struct tcpcb *tp; +tcp_mssopt(struct tcpcb *tp) { struct rtentry *rt; int mss; @@ -5738,7 +5847,7 @@ tcp_mssopt(tp) * believed to be on a serial modem (speed <128Kbps). Excludes 9600bps as * it is the default value adversized by pseudo-devices over ppp. 
*/ - if (rt->rt_ifp->if_type == IFT_PPP && slowlink_wsize > 0 && + if (rt->rt_ifp->if_type == IFT_PPP && slowlink_wsize > 0 && rt->rt_ifp->if_baudrate > 9600 && rt->rt_ifp->if_baudrate <= 128000) { tp->t_flags |= TF_SLOWLINK; } @@ -5766,9 +5875,7 @@ tcp_mssopt(tp) * be started again. */ static void -tcp_newreno_partial_ack(tp, th) - struct tcpcb *tp; - struct tcphdr *th; +tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th) { tcp_seq onxt = tp->snd_nxt; u_int32_t ocwnd = tp->snd_cwnd; @@ -5778,7 +5885,7 @@ tcp_newreno_partial_ack(tp, th) tp->snd_nxt = th->th_ack; /* * Set snd_cwnd to one segment beyond acknowledged offset - * (tp->snd_una has not yet been updated when this function + * (tp->snd_una has not yet been updated when this function * is called) */ tp->snd_cwnd = tp->t_maxseg + BYTES_ACKED(th, tp); @@ -5826,7 +5933,7 @@ tcp_dropdropablreq(struct socket *head) if (TAILQ_EMPTY(&head->so_incomp)) return (0); - /* + /* * Check if there is any socket in the incomp queue * that is closed because of a reset from the peer and is * waiting to be garbage collected. If so, pick that as @@ -5839,8 +5946,8 @@ tcp_dropdropablreq(struct socket *head) so->so_head != NULL && (so->so_state & (SS_INCOMP|SS_CANTSENDMORE|SS_CANTRCVMORE)) == (SS_INCOMP|SS_CANTSENDMORE|SS_CANTRCVMORE)) { - /* - * The listen socket is already locked but we + /* + * The listen socket is already locked but we * can lock this socket here without lock ordering * issues because it is in the incomp queue and * is not visible to others. 
@@ -5862,8 +5969,8 @@ tcp_dropdropablreq(struct socket *head) old_cnt = cur_cnt / i; cur_cnt = 0; } - - + + qlen = head->so_incqlen; if (rnd == 0) rnd = RandomULong(); @@ -5878,36 +5985,36 @@ tcp_dropdropablreq(struct socket *head) /* Find a connection that is not already closing (or being served) */ while (so) { inp = (struct inpcb *)so->so_pcb; - + sonext = TAILQ_NEXT(so, so_list); - if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) + if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) != WNT_STOPUSING) { - /* + /* * Avoid the issue of a socket being accepted - * by one input thread and being dropped by - * another input thread. If we can't get a hold - * on this mutex, then grab the next socket in + * by one input thread and being dropped by + * another input thread. If we can't get a hold + * on this mutex, then grab the next socket in * line. */ if (lck_mtx_try_lock(&inp->inpcb_mtx)) { so->so_usecount++; - if ((so->so_usecount == 2) && + if ((so->so_usecount == 2) && (so->so_state & SS_INCOMP) && !(so->so_flags & SOF_INCOMP_INPROGRESS)) { break; } else { - /* - * don't use if being accepted or + /* + * don't use if being accepted or * used in any other way */ in_pcb_checkstate(inp, WNT_RELEASE, 1); tcp_unlock(so, 1, 0); } } else { - /* - * do not try to lock the inp in - * in_pcb_checkstate because the lock + /* + * do not try to lock the inp in + * in_pcb_checkstate because the lock * is already held in some other thread. * Only drop the inp_wntcnt reference. */ @@ -5915,7 +6022,7 @@ tcp_dropdropablreq(struct socket *head) } } so = sonext; - + } if (so == NULL) { return (0); @@ -5945,7 +6052,7 @@ tcp_dropdropablreq(struct socket *head) tcp_close(tp); if (inp->inp_wantcnt > 0 && inp->inp_wantcnt != WNT_STOPUSING) { - /* + /* * Some one has a wantcnt on this pcb. Since WNT_ACQUIRE * doesn't require a lock, it could have happened while * we are holding the lock. 
This pcb will have to @@ -5955,10 +6062,10 @@ tcp_dropdropablreq(struct socket *head) so->so_usecount--; tcp_unlock(so, 1, 0); } else { - /* - * Unlock this socket and leave the reference on. - * We need to acquire the pcbinfo lock in order to - * fully dispose it off + /* + * Unlock this socket and leave the reference on. + * We need to acquire the pcbinfo lock in order to + * fully dispose it off */ tcp_unlock(so, 0, 0); @@ -5968,13 +6075,13 @@ tcp_dropdropablreq(struct socket *head) /* Release the reference held for so_incomp queue */ so->so_usecount--; - if (so->so_usecount != 1 || - (inp->inp_wantcnt > 0 && + if (so->so_usecount != 1 || + (inp->inp_wantcnt > 0 && inp->inp_wantcnt != WNT_STOPUSING)) { - /* - * There is an extra wantcount or usecount - * that must have been added when the socket - * was unlocked. This socket will have to be + /* + * There is an extra wantcount or usecount + * that must have been added when the socket + * was unlocked. This socket will have to be * garbage collected later */ tcp_unlock(so, 1, 0); @@ -6039,7 +6146,7 @@ void tcp_set_recv_bg(struct socket *so) { if (!IS_TCP_RECV_BG(so)) - so->so_traffic_mgt_flags |= TRAFFIC_MGT_TCP_RECVBG; + so->so_flags1 |= SOF1_TRAFFIC_MGT_TCP_RECVBG; /* Unset Large Receive Offload on background sockets */ so_set_lro(so, SO_TC_BK); @@ -6049,10 +6156,10 @@ void tcp_clear_recv_bg(struct socket *so) { if (IS_TCP_RECV_BG(so)) - so->so_traffic_mgt_flags &= ~(TRAFFIC_MGT_TCP_RECVBG); + so->so_flags1 &= ~(SOF1_TRAFFIC_MGT_TCP_RECVBG); - /* - * Set/unset use of Large Receive Offload depending on + /* + * Set/unset use of Large Receive Offload depending on * the traffic class */ so_set_lro(so, so->so_traffic_class); @@ -6078,12 +6185,11 @@ inp_fc_unthrottle_tcp(struct inpcb *inp) tp->t_bytes_acked = 0; /* Reset retransmit shift as we know that the reason - * for delay in sending a packet is due to flow + * for delay in sending a packet is due to flow * control on the outgoing interface. 
There is no need * to backoff retransmit timer. */ - tp->t_rxtshift = 0; - tp->t_rtttime = 0; + TCP_RESET_REXMT_STATE(tp); /* * Start the output stream again. Since we are @@ -6092,7 +6198,7 @@ inp_fc_unthrottle_tcp(struct inpcb *inp) */ tcp_output(tp); } - + static int tcp_getstat SYSCTL_HANDLER_ARGS { @@ -6309,7 +6415,7 @@ sysctl_rexmtthresh SYSCTL_HANDLER_ARGS /* * Constrain the number of duplicate ACKs - * to consider for TCP fast retransmit + * to consider for TCP fast retransmit * to either 2 or 3 */ @@ -6324,4 +6430,3 @@ sysctl_rexmtthresh SYSCTL_HANDLER_ARGS SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmt_thresh, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &tcprexmtthresh, 0, &sysctl_rexmtthresh, "I", "Duplicate ACK Threshold for Fast Retransmit"); - diff --git a/bsd/netinet/tcp_ledbat.c b/bsd/netinet/tcp_ledbat.c index 9e18bf9eb..457233d8d 100644 --- a/bsd/netinet/tcp_ledbat.c +++ b/bsd/netinet/tcp_ledbat.c @@ -249,8 +249,8 @@ tcp_ledbat_ack_rcvd(struct tcpcb *tp, struct tcphdr *th) { * greater than or equal to the congestion window. 
*/ - register u_int cw = tp->snd_cwnd; - register u_int incr = tp->t_maxseg; + u_int cw = tp->snd_cwnd; + u_int incr = tp->t_maxseg; int acked = 0; acked = BYTES_ACKED(th, tp); @@ -374,15 +374,8 @@ tcp_ledbat_after_timeout(struct tcpcb *tp) { int tcp_ledbat_delay_ack(struct tcpcb *tp, struct tcphdr *th) { - /* If any flag other than TH_ACK is set, set "end-of-write" bit */ - if (th->th_flags & ~TH_ACK) - tp->t_flagsext |= TF_STREAMEOW; - else - tp->t_flagsext &= ~(TF_STREAMEOW); - if ((tp->t_flags & TF_RXWIN0SENT) == 0 && - (th->th_flags & TH_PUSH) == 0 && - (tp->t_unacksegs == 1)) + (th->th_flags & TH_PUSH) == 0 && (tp->t_unacksegs == 1)) return(1); return(0); } diff --git a/bsd/netinet/tcp_lro.c b/bsd/netinet/tcp_lro.c index d2a2539c7..59ee6f445 100644 --- a/bsd/netinet/tcp_lro.c +++ b/bsd/netinet/tcp_lro.c @@ -95,8 +95,7 @@ static void lro_proto_input(struct mbuf *); static struct mbuf *lro_tcp_xsum_validate(struct mbuf*, struct ip *, struct tcphdr*); -static struct mbuf *tcp_lro_process_pkt(struct mbuf*, struct ip*, struct tcphdr*, - int); +static struct mbuf *tcp_lro_process_pkt(struct mbuf*, int); void tcp_lro_init(void) @@ -401,8 +400,7 @@ tcp_lro_insert_flow(struct mbuf *lro_mb, struct ip *ip_hdr, } struct mbuf* -tcp_lro_process_pkt(struct mbuf *lro_mb, struct ip *ip_hdr, - struct tcphdr *tcp_hdr, int drop_hdrlen) +tcp_lro_process_pkt(struct mbuf *lro_mb, int drop_hdrlen) { int flow_id = TCP_LRO_FLOW_UNINIT; int hash; @@ -418,18 +416,23 @@ tcp_lro_process_pkt(struct mbuf *lro_mb, struct ip *ip_hdr, int ret_response = TCP_LRO_CONSUMED; int coalesced = 0, tcpflags = 0, unknown_tcpopts = 0; u_int8_t ecn; + struct ip *ip_hdr; + struct tcphdr *tcp_hdr; - if (lro_mb->m_len < (int32_t)sizeof (struct tcpiphdr)) { - if ((lro_mb = m_pullup(lro_mb, sizeof(struct tcpiphdr))) == 0) { + if (lro_mb->m_len < drop_hdrlen) { + if ((lro_mb = m_pullup(lro_mb, drop_hdrlen)) == NULL) { tcpstat.tcps_rcvshort++; m_freem(lro_mb); if (lrodebug) { printf("tcp_lro_process_pkt:mbuf too 
short.\n"); } - return NULL; + return (NULL); } } - + + ip_hdr = mtod(lro_mb, struct ip*); + tcp_hdr = (struct tcphdr *)((caddr_t)ip_hdr + sizeof(struct ip)); + /* Just in case */ lro_mb->m_pkthdr.pkt_flags &= ~PKTF_SW_LRO_DID_CSUM; @@ -437,7 +440,7 @@ tcp_lro_process_pkt(struct mbuf *lro_mb, struct ip *ip_hdr, if (lrodebug) { printf("tcp_lro_process_pkt: TCP xsum failed.\n"); } - return NULL; + return (NULL); } /* Update stats */ @@ -585,7 +588,7 @@ tcp_lro_process_pkt(struct mbuf *lro_mb, struct ip *ip_hdr, if (ret_response == TCP_LRO_FLOW_NOTFOUND) { lro_proto_input(lro_mb); } - return NULL; + return (NULL); } static void @@ -674,7 +677,7 @@ tcp_lro(struct mbuf *m, unsigned int hlen) unsigned int off = 0; if (kipf_count != 0) - return m; + return (m); /* * Experiments on cellular show that the RTT is much higher @@ -686,29 +689,30 @@ tcp_lro(struct mbuf *m, unsigned int hlen) */ if (IFNET_IS_CELLULAR(m->m_pkthdr.rcvif) || (m->m_pkthdr.rcvif->if_type == IFT_LOOP)) { - return m; + return (m); } ip_hdr = mtod(m, struct ip*); /* don't deal with IP options */ - if (hlen > sizeof (struct ip)) + if (hlen != sizeof (struct ip)) return (m); /* only TCP is coalesced */ if (ip_hdr->ip_p != IPPROTO_TCP) { - return m; + return (m); } if (m->m_len < (int32_t) sizeof (struct tcpiphdr)) { if (lrodebug) printf("tcp_lro m_pullup \n"); - if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == 0) { + if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == NULL) { tcpstat.tcps_rcvshort++; if (lrodebug) { printf("ip_lro: rcvshort.\n"); } - return NULL; + return (NULL); } + ip_hdr = mtod(m, struct ip*); } tcp_hdr = (struct tcphdr *)((caddr_t)ip_hdr + hlen); @@ -722,10 +726,10 @@ tcp_lro(struct mbuf *m, unsigned int hlen) if (lrodebug) { printf("ip_lro: TCP off greater than TCP header.\n"); } - return m; + return (m); } - return (tcp_lro_process_pkt(m, ip_hdr, tcp_hdr, hlen + off)); + return (tcp_lro_process_pkt(m, hlen + off)); } static void diff --git a/bsd/netinet/tcp_newreno.c 
b/bsd/netinet/tcp_newreno.c index f2de1c010..c0def7ffa 100644 --- a/bsd/netinet/tcp_newreno.c +++ b/bsd/netinet/tcp_newreno.c @@ -173,8 +173,8 @@ tcp_newreno_ack_rcvd(struct tcpcb *tp, struct tcphdr *th) { * greater than or equal to the congestion window. */ - register u_int cw = tp->snd_cwnd; - register u_int incr = tp->t_maxseg; + u_int cw = tp->snd_cwnd; + u_int incr = tp->t_maxseg; int acked = 0; acked = BYTES_ACKED(th, tp); diff --git a/bsd/netinet/tcp_output.c b/bsd/netinet/tcp_output.c index 2dda8cb0f..5c29ff3a6 100644 --- a/bsd/netinet/tcp_output.c +++ b/bsd/netinet/tcp_output.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2000-2015 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. 
- * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* @@ -90,6 +90,7 @@ #include #include #include +#include #include #include #include @@ -154,6 +155,11 @@ int tcp_do_tso = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_do_tso, 0, "Enable TCP Segmentation Offload"); +int tcp_ecn_setup_percentage = 50; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, ecn_setup_percentage, + CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_ecn_setup_percentage, 0, + "Max ECN setup percentage"); + static int sysctl_change_ecn_setting SYSCTL_HANDLER_ARGS { @@ -275,7 +281,7 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, rtt_recvbg, "Use RTT for bg recv algorithm"); uint32_t tcp_recv_throttle_minwin = 16 * 1024; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, recv_throttle_minwin, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, recv_throttle_minwin, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_recv_throttle_minwin, 1, "Minimum recv win for throttling"); @@ -295,8 +301,8 @@ extern int ipsec_bypass; extern int slowlink_wsize; /* window correction for slow links */ #if IPFIREWALL -extern int fw_enable; /* firewall check for packet chaining */ -extern int fw_bypass; /* firewall check: disable packet chaining if there is rules */ +extern int fw_enable; /* firewall check for packet chaining */ +extern int fw_bypass; /* firewall check: disable packet chaining if there is rules */ #endif /* IPFIREWALL */ extern u_int32_t dlil_filter_disable_tso_count; @@ -317,8 +323,11 @@ static int32_t tcp_tfo_check(struct tcpcb *tp, int32_t len) if (tp->t_flags & TF_NOOPT) goto fallback; - if (!tcp_heuristic_do_tfo(tp)) + if (!tcp_heuristic_do_tfo(tp)) { + tp->t_tfo_stats |= TFO_S_HEURISTICS_DISABLE; + tcpstat.tcps_tfo_heuristics_disable++; goto fallback; + } optlen += TCPOLEN_MAXSEG; @@ -439,7 +448,6 @@ tcp_send_ecn_flags_on_syn(struct tcpcb *tp, struct socket *so) (tp->t_flagsext & TF_FASTOPEN))); } -#define TCP_ECN_SETUP_PERCENTAGE_MAX 5 void tcp_set_ecn(struct tcpcb *tp, struct ifnet *ifp) { @@ -500,7 +508,7 @@ tcp_set_ecn(struct tcpcb *tp, 
struct ifnet *ifp) * Use the random value in iss for randomizing * this selection */ - if ((tp->iss % 100) >= TCP_ECN_SETUP_PERCENTAGE_MAX) + if ((tp->iss % 100) >= tcp_ecn_setup_percentage) tp->ecn_flags &= ~TE_ENABLE_ECN; } } @@ -561,11 +569,13 @@ tcp_output(struct tcpcb *tp) struct mbuf *tp_inp_options = inp->inp_depend4.inp4_options; #if INET6 int isipv6 = inp->inp_vflag & INP_IPV6 ; +#else + int isipv6 = 0; #endif short packchain_listadd = 0; int so_options = so->so_options; struct rtentry *rt; - u_int32_t basertt, svc_flags = 0, allocated_len; + u_int32_t svc_flags = 0, allocated_len; u_int32_t lro_ackmore = (tp->t_lropktlen != 0) ? 1 : 0; struct mbuf *mnext = NULL; int sackoptlen = 0; @@ -581,6 +591,7 @@ tcp_output(struct tcpcb *tp) boolean_t wifi = FALSE; boolean_t wired = FALSE; boolean_t sack_rescue_rxt = FALSE; + int sotc = so->so_traffic_class; /* * Determine length of data that should be transmitted, @@ -619,11 +630,11 @@ tcp_output(struct tcpcb *tp) idle = 0; } } -#if MPTCP +#if MPTCP if (tp->t_mpflags & TMPF_RESET) { tcp_check_timer_state(tp); - /* - * Once a RST has been sent for an MPTCP subflow, + /* + * Once a RST has been sent for an MPTCP subflow, * the subflow socket stays around until deleted. * No packets such as FINs must be sent after RST. 
*/ @@ -702,9 +713,8 @@ tcp_output(struct tcpcb *tp) OFFSET_FROM_START(tp, tp->t_rxtcur); if (tp->t_timer[TCPT_PERSIST]) { tp->t_timer[TCPT_PERSIST] = 0; - tp->t_rxtshift = 0; tp->t_persist_stop = 0; - tp->t_rxtstart = 0; + TCP_RESET_REXMT_STATE(tp); } } @@ -713,7 +723,7 @@ tcp_output(struct tcpcb *tp) TCP_PKTLIST_CLEAR(tp); /* drop connection if source address isn't available */ - if (so->so_flags & SOF_NOADDRAVAIL) { + if (so->so_flags & SOF_NOADDRAVAIL) { tcp_drop(tp, EADDRNOTAVAIL); return(EADDRNOTAVAIL); } else { @@ -748,9 +758,9 @@ tcp_output(struct tcpcb *tp) * has been disabled) */ - if (!path_mtu_discovery || ((rt != NULL) && + if (!path_mtu_discovery || ((rt != NULL) && (!(rt->rt_flags & RTF_UP) || - (rt->rt_rmx.rmx_locks & RTV_MTU)))) + (rt->rt_rmx.rmx_locks & RTV_MTU)))) tp->t_flags &= ~TF_PMTUD; else tp->t_flags |= TF_PMTUD; @@ -797,7 +807,7 @@ tcp_output(struct tcpcb *tp) if (SACK_ENABLED(tp) && IN_FASTRECOVERY(tp) && (p = tcp_sack_output(tp, &sack_bytes_rxmt))) { int32_t cwin; - + cwin = min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt; if (cwin < 0) cwin = 0; @@ -825,7 +835,7 @@ tcp_output(struct tcpcb *tp) len = ((int32_t)min(cwin, p->end - p->rxmit)); } if (len > 0) { - off = p->rxmit - tp->snd_una; + off = p->rxmit - tp->snd_una; sack_rxmit = 1; sendalot = 1; tcpstat.tcps_sack_rexmits++; @@ -874,9 +884,8 @@ tcp_output(struct tcpcb *tp) sendwin = 1; } else { tp->t_timer[TCPT_PERSIST] = 0; - tp->t_rxtshift = 0; - tp->t_rxtstart = 0; tp->t_persist_stop = 0; + TCP_RESET_REXMT_STATE(tp); } } @@ -911,12 +920,12 @@ tcp_output(struct tcpcb *tp) * sending new data, having retransmitted all the * data possible in the scoreboard. */ - len = min(so->so_snd.sb_cc, tp->snd_wnd) + len = min(so->so_snd.sb_cc, tp->snd_wnd) - off; /* * Don't remove this (len > 0) check ! 
- * We explicitly check for len > 0 here (although it - * isn't really necessary), to work around a gcc + * We explicitly check for len > 0 here (although it + * isn't really necessary), to work around a gcc * optimization issue - to force gcc to compute * len above. Without this check, the computation * of len is bungled by the optimizer. @@ -980,7 +989,8 @@ tcp_output(struct tcpcb *tp) if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) { if (tp->t_state != TCPS_SYN_RECEIVED || tfo_enabled(tp)) flags &= ~TH_SYN; - off--, len++; + off--; + len++; if (len > 0 && tp->t_state == TCPS_SYN_SENT) { while (inp->inp_sndinprog_cnt == 0 && tp->t_pktlist_head != NULL) { @@ -993,18 +1003,12 @@ tcp_output(struct tcpcb *tp) packchain_listadd, tp_inp_options, (so_options & SO_DONTROUTE), (sack_rxmit | (sack_bytes_rxmt != 0)), 0, -#if INET6 isipv6); -#else /* INET6 */ - 0); -#endif /* !INET6 */ - - } /* * tcp was closed while we were in ip, - * resume close + * resume close */ if (inp->inp_sndinprog_cnt == 0 && (tp->t_flags & TF_CLOSING)) { @@ -1039,9 +1043,9 @@ tcp_output(struct tcpcb *tp) /* * The check here used to be (len < 0). Some times len is zero * when the congestion window is closed and we need to check - * if persist timer has to be set in that case. But don't set + * if persist timer has to be set in that case. But don't set * persist until connection is established. - */ + */ if (len <= 0 && !(flags & TH_SYN)) { /* * If FIN has been sent but not acked, @@ -1057,8 +1061,7 @@ tcp_output(struct tcpcb *tp) if (sendwin == 0) { tp->t_timer[TCPT_REXMT] = 0; tp->t_timer[TCPT_PTO] = 0; - tp->t_rxtshift = 0; - tp->t_rxtstart = 0; + TCP_RESET_REXMT_STATE(tp); tp->snd_nxt = tp->snd_una; off = 0; if (tp->t_timer[TCPT_PERSIST] == 0) @@ -1075,34 +1078,17 @@ tcp_output(struct tcpcb *tp) * 3. our send window (slow start and congestion controlled) is * larger than sent but unacknowledged data in send buffer. 
*/ - basertt = get_base_rtt(tp); if (tcp_do_autosendbuf == 1 && !INP_WAIT_FOR_IF_FEEDBACK(inp) && !IN_FASTRECOVERY(tp) && (so->so_snd.sb_flags & (SB_AUTOSIZE | SB_TRIM)) == SB_AUTOSIZE && tcp_cansbgrow(&so->so_snd)) { if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat && so->so_snd.sb_cc >= (so->so_snd.sb_hiwat / 8 * 7) && - sendwin >= (so->so_snd.sb_cc - - (tp->snd_nxt - tp->snd_una))) { - /* Also increase the send buffer only if the - * round-trip time is not increasing because we do - * not want to contribute to latency by filling - * buffers. - * We also do not want to hold onto application's - * old data for too long. Interactive applications - * would rather discard old data. - */ - if (tp->t_rttcur <= (basertt + 25)) { - if (sbreserve(&so->so_snd, - min(so->so_snd.sb_hiwat + tcp_autosndbuf_inc, - tcp_autosndbuf_max)) == 1) { - so->so_snd.sb_idealsize = so->so_snd.sb_hiwat; - } - } else { - so->so_snd.sb_idealsize = - max(tcp_sendspace, so->so_snd.sb_hiwat - - (2 * tcp_autosndbuf_inc)); - so->so_snd.sb_flags |= SB_TRIM; + sendwin >= (so->so_snd.sb_cc - (tp->snd_nxt - tp->snd_una))) { + if (sbreserve(&so->so_snd, + min(so->so_snd.sb_hiwat + tcp_autosndbuf_inc, + tcp_autosndbuf_max)) == 1) { + so->so_snd.sb_idealsize = so->so_snd.sb_hiwat; } } } @@ -1164,7 +1150,7 @@ tcp_output(struct tcpcb *tp) } #if MPTCP - if ((so->so_flags & SOF_MP_SUBFLOW) && + if ((so->so_flags & SOF_MP_SUBFLOW) && !(tp->t_mpflags & TMPF_TCP_FALLBACK)) { int newlen = len; if ((tp->t_state >= TCPS_ESTABLISHED) && @@ -1182,7 +1168,7 @@ tcp_output(struct tcpcb *tp) } /* * The contiguous bytes in the subflow socket buffer can be - * discontiguous at the MPTCP level. Since only one DSS + * discontiguous at the MPTCP level. Since only one DSS * option can be sent in one packet, reduce length to match * the contiguous MPTCP level. Set sendalot to send remainder. 
*/ @@ -1200,8 +1186,8 @@ tcp_output(struct tcpcb *tp) * pull the amount of data that can be sent from the * unordered priority queues to the serial queue in * the socket buffer. If bytes are not yet available - * in the highest priority message, we may not be able - * to send any new data. + * in the highest priority message, we may not be able + * to send any new data. */ if (so->so_flags & SOF_ENABLE_MSGS) { if ((off + len) > @@ -1209,7 +1195,7 @@ tcp_output(struct tcpcb *tp) sbpull_unordered_data(so, off, len); /* check if len needs to be modified */ - if ((off + len) > + if ((off + len) > so->so_msg_state->msg_serial_bytes) { len = so->so_msg_state->msg_serial_bytes - off; if (len <= 0) { @@ -1227,8 +1213,55 @@ tcp_output(struct tcpcb *tp) if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc)) flags &= ~TH_FIN; } - + /* + * Compare available window to amount of window + * known to peer (as advertised window less + * next expected input). If the difference is at least two + * max size segments, or at least 25% of the maximum possible + * window, then want to send a window update to peer. + * Skip this if the connection is in T/TCP half-open state. 
+ */ recwin = tcp_sbspace(tp); +#if MPTCP + if (so->so_flags & SOF_MP_SUBFLOW) { + struct mptcb *mp_tp = tptomptp(tp); + + if (mp_tp != NULL) { + MPT_LOCK(mp_tp); + recwin = imin(recwin, (int)mp_tp->mpt_rcvwnd); + MPT_UNLOCK(mp_tp); + } + } +#endif + + if (recwin < (int32_t)(so->so_rcv.sb_hiwat / 4) && + recwin < (int)tp->t_maxseg) + recwin = 0; + if (tp->t_flags & TF_SLOWLINK && slowlink_wsize > 0) { + if (recwin > (int32_t)slowlink_wsize) + recwin = slowlink_wsize; + } + +#if TRAFFIC_MGT + if (tcp_recv_bg == 1 || IS_TCP_RECV_BG(so)) { + if (tcp_recv_throttle(tp)) { + uint32_t min_iaj_win = + tcp_min_iaj_win * tp->t_maxseg; + if (tp->iaj_rwintop == 0 || + SEQ_LT(tp->iaj_rwintop, tp->rcv_adv)) + tp->iaj_rwintop = tp->rcv_adv; + if (SEQ_LT(tp->iaj_rwintop, + tp->rcv_nxt + min_iaj_win)) + tp->iaj_rwintop = tp->rcv_nxt + min_iaj_win; + recwin = min(tp->iaj_rwintop - tp->rcv_nxt, recwin); + } + } +#endif /* TRAFFIC_MGT */ + + if (recwin > (int32_t)(TCP_MAXWIN << tp->rcv_scale)) + recwin = (int32_t)(TCP_MAXWIN << tp->rcv_scale); + if (recwin < (int32_t)(tp->rcv_adv - tp->rcv_nxt)) + recwin = (int32_t)(tp->rcv_adv - tp->rcv_nxt); /* * Sender silly window avoidance. We transmit under the following @@ -1259,8 +1292,8 @@ tcp_output(struct tcpcb *tp) if (len >= tp->t_maxseg) goto send; if (!(tp->t_flags & TF_MORETOCOME) && - (idle || tp->t_flags & TF_NODELAY || - tp->t_flags & TF_MAXSEGSNT || + (idle || tp->t_flags & TF_NODELAY || + (tp->t_flags & TF_MAXSEGSNT) || ALLOW_LIMITED_TRANSMIT(tp)) && (tp->t_flags & TF_NOPUSH) == 0 && len + off >= so->so_snd.sb_cc) @@ -1272,14 +1305,6 @@ tcp_output(struct tcpcb *tp) } } - /* - * Compare available window to amount of window - * known to peer (as advertised window less - * next expected input). If the difference is at least two - * max size segments, or at least 25% of the maximum possible - * window, then want to send a window update to peer. - * Skip this if the connection is in T/TCP half-open state. 
- */ if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN)) { /* * "adv" is the amount we can increase the window, @@ -1297,34 +1322,35 @@ tcp_output(struct tcpcb *tp) /* * Update only if the resulting scaled value of * the window changed, or if there is a change in - * the sequence since the last ack. This avoids + * the sequence since the last ack. This avoids * what appears as dupe ACKS (see rdar://5640997) * * If streaming is detected avoid sending too many - * window updates. We will depend on the delack + * window updates. We will depend on the delack * timer to send a window update when needed. */ if (!(tp->t_flags & TF_STRETCHACK) && - (tp->last_ack_sent != tp->rcv_nxt || + (tp->last_ack_sent != tp->rcv_nxt || ((oldwin + adv) >> tp->rcv_scale) > (oldwin >> tp->rcv_scale))) { goto send; } - /* - * Make sure that the delayed ack timer is set if - * we delayed sending a window update because of - * streaming detection. - */ - if ((tp->t_flags & TF_STRETCHACK) && - !(tp->t_flags & TF_DELACK)) { - tp->t_flags |= TF_DELACK; - tp->t_timer[TCPT_DELACK] = - OFFSET_FROM_START(tp, tcp_delack); - } } - if (4 * adv >= (int32_t) so->so_rcv.sb_hiwat) - goto send; + if (4 * adv >= (int32_t) so->so_rcv.sb_hiwat) + goto send; + + /* + * Make sure that the delayed ack timer is set if + * we delayed sending a window update because of + * streaming detection. + */ + if ((tp->t_flags & TF_STRETCHACK) && + !(tp->t_flags & TF_DELACK)) { + tp->t_flags |= TF_DELACK; + tp->t_timer[TCPT_DELACK] = + OFFSET_FROM_START(tp, tcp_delack); + } } /* @@ -1354,14 +1380,14 @@ tcp_output(struct tcpcb *tp) * after the retransmission timer has been turned off. Make sure * that the retransmission timer is set. 
*/ - if (SACK_ENABLED(tp) && (tp->t_state >= TCPS_ESTABLISHED) && + if (SACK_ENABLED(tp) && (tp->t_state >= TCPS_ESTABLISHED) && SEQ_GT(tp->snd_max, tp->snd_una) && tp->t_timer[TCPT_REXMT] == 0 && tp->t_timer[TCPT_PERSIST] == 0) { tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur); goto just_return; - } + } /* * TCP window updates are not reliable, rather a polling protocol * using ``persist'' packets is used to insure receipt of window @@ -1386,8 +1412,7 @@ tcp_output(struct tcpcb *tp) */ if (so->so_snd.sb_cc && tp->t_timer[TCPT_REXMT] == 0 && tp->t_timer[TCPT_PERSIST] == 0) { - tp->t_rxtshift = 0; - tp->t_rxtstart = 0; + TCP_RESET_REXMT_STATE(tp); tcp_setpersist(tp); } just_return: @@ -1406,11 +1431,7 @@ tcp_output(struct tcpcb *tp) packchain_listadd, tp_inp_options, (so_options & SO_DONTROUTE), (sack_rxmit | (sack_bytes_rxmt != 0)), recwin, -#if INET6 isipv6); -#else /* INET6 */ - 0); -#endif /* !INET6 */ } /* tcp was closed while we were in ip; resume close */ if (inp->inp_sndinprog_cnt == 0 && @@ -1471,7 +1492,7 @@ tcp_output(struct tcpcb *tp) optlen += 4; } #if MPTCP - if (mptcp_enable) { + if (mptcp_enable && (so->so_flags & SOF_MP_SUBFLOW)) { optlen = mptcp_setup_syn_opts(so, flags, opt, optlen); } @@ -1502,7 +1523,7 @@ tcp_output(struct tcpcb *tp) tp->rfbuf_ts = tcp_now; if (SACK_ENABLED(tp) && ((tp->t_flags & TF_NOOPT) == 0)) { - /* + /* * Tack on the SACK permitted option *last*. * And do padding of options after tacking this on. 
* This is because of MSS, TS, WinScale and Signatures are @@ -1828,12 +1849,12 @@ tcp_output(struct tcpcb *tp) panic("tcphdr too big"); /* Check if there is enough data in the send socket - * buffer to start measuring bw + * buffer to start measuring bw */ if ((tp->t_flagsext & TF_MEASURESNDBW) != 0 && (tp->t_bwmeas != NULL) && (tp->t_flagsext & TF_BWMEAS_INPROGRESS) == 0 && - (so->so_snd.sb_cc - (tp->snd_max - tp->snd_una)) >= + (so->so_snd.sb_cc - (tp->snd_max - tp->snd_una)) >= tp->t_bwmeas->bw_minsize) { tp->t_bwmeas->bw_size = min( (so->so_snd.sb_cc - (tp->snd_max - tp->snd_una)), @@ -1869,13 +1890,14 @@ tcp_output(struct tcpcb *tp) } else { tcpstat.tcps_sndpack++; tcpstat.tcps_sndbyte += len; - + if (nstat_collect) { INP_ADD_STAT(inp, cell, wifi, wired, txpackets, 1); INP_ADD_STAT(inp, cell, wifi, wired, txbytes, len); } + inp_decr_sndbytes_unsent(so, len); } #if MPTCP if (tp->t_mpflags & TMPF_MPTCP_TRUE) { @@ -1884,9 +1906,9 @@ tcp_output(struct tcpcb *tp) } #endif /* MPTCP */ /* - * try to use the new interface that allocates all - * the necessary mbuf hdrs under 1 mbuf lock and - * avoids rescanning the socket mbuf list if + * try to use the new interface that allocates all + * the necessary mbuf hdrs under 1 mbuf lock and + * avoids rescanning the socket mbuf list if * certain conditions are met. This routine can't * be used in the following cases... * 1) the protocol headers exceed the capacity of @@ -1965,7 +1987,7 @@ tcp_output(struct tcpcb *tp) error = 0; /* should we return an error? 
*/ goto out; } - + /* * m_copym_with_hdrs will always return the * last mbuf pointer and the offset into it that @@ -2037,8 +2059,8 @@ tcp_output(struct tcpcb *tp) } svc_flags |= PKT_SCF_IPV6; #if PF_ECN - m->m_pkthdr.pf_mtag.pftag_hdr = (void *)ip6; - m->m_pkthdr.pf_mtag.pftag_flags |= PF_TAG_HDR_INET6; + m_pftag(m)->pftag_hdr = (void *)ip6; + m_pftag(m)->pftag_flags |= PF_TAG_HDR_INET6; #endif /* PF_ECN */ } else #endif /* INET6 */ @@ -2054,8 +2076,8 @@ tcp_output(struct tcpcb *tp) ip->ip_tos |= IPTOS_ECN_ECT0; } #if PF_ECN - m->m_pkthdr.pf_mtag.pftag_hdr = (void *)ip; - m->m_pkthdr.pf_mtag.pftag_flags |= PF_TAG_HDR_INET; + m_pftag(m)->pftag_hdr = (void *)ip; + m_pftag(m)->pftag_flags |= PF_TAG_HDR_INET; #endif /* PF_ECN */ } @@ -2087,6 +2109,10 @@ tcp_output(struct tcpcb *tp) if (len || (flags & (TH_SYN|TH_FIN)) || tp->t_timer[TCPT_PERSIST]) { th->th_seq = htonl(tp->snd_nxt); + if (len > 0) { + m->m_pkthdr.tx_start_seq = tp->snd_nxt; + m->m_pkthdr.pkt_flags |= PKTF_START_SEQ; + } if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { if (SACK_ENABLED(tp) && len > 1) { tcp_rxtseg_insert(tp, tp->snd_nxt, @@ -2101,11 +2127,14 @@ tcp_output(struct tcpcb *tp) } } else { th->th_seq = htonl(p->rxmit); + if (len > 0) { + m->m_pkthdr.pkt_flags |= + (PKTF_TCP_REXMT | PKTF_START_SEQ); + m->m_pkthdr.tx_start_seq = p->rxmit; + } tcp_rxtseg_insert(tp, p->rxmit, (p->rxmit + len - 1)); p->rxmit += len; tp->sackhint.sack_bytes_rexmit += len; - if (len > 0) - m->m_pkthdr.pkt_flags |= PKTF_TCP_REXMT; } th->th_ack = htonl(tp->rcv_nxt); tp->last_ack_sent = tp->rcv_nxt; @@ -2120,37 +2149,6 @@ tcp_output(struct tcpcb *tp) th->th_off = (sizeof (struct tcphdr) + optlen) >> 2; } th->th_flags = flags; - /* - * Calculate receive window. Don't shrink window, - * but avoid silly window syndrome. 
- */ - if (recwin < (int32_t)(so->so_rcv.sb_hiwat / 4) && recwin < (int)tp->t_maxseg) - recwin = 0; - if (recwin < (int32_t)(tp->rcv_adv - tp->rcv_nxt)) - recwin = (int32_t)(tp->rcv_adv - tp->rcv_nxt); - if (tp->t_flags & TF_SLOWLINK && slowlink_wsize > 0) { - if (recwin > (int32_t)slowlink_wsize) - recwin = slowlink_wsize; - } - -#if TRAFFIC_MGT - if (tcp_recv_bg == 1 || IS_TCP_RECV_BG(so)) { - if (tcp_recv_throttle(tp)) { - uint32_t min_iaj_win = - tcp_min_iaj_win * tp->t_maxseg; - if (tp->iaj_rwintop == 0 || - SEQ_LT(tp->iaj_rwintop, tp->rcv_adv)) - tp->iaj_rwintop = tp->rcv_adv; - if (SEQ_LT(tp->iaj_rwintop, - tp->rcv_nxt + min_iaj_win)) - tp->iaj_rwintop = tp->rcv_nxt + min_iaj_win; - recwin = min(tp->iaj_rwintop - tp->rcv_nxt, recwin); - } - } -#endif /* TRAFFIC_MGT */ - - if (recwin > (int32_t)(TCP_MAXWIN << tp->rcv_scale)) - recwin = (int32_t)(TCP_MAXWIN << tp->rcv_scale); th->th_win = htons((u_short) (recwin>>tp->rcv_scale)); /* @@ -2183,6 +2181,17 @@ tcp_output(struct tcpcb *tp) * checksum extended header and data. 
*/ m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ + + /* + * If this is potentially the last packet on the stream, then mark + * it in order to enable some optimizations in the underlying + * layers + */ + if (tp->t_state != TCPS_ESTABLISHED && + (tp->t_state == TCPS_CLOSING || tp->t_state == TCPS_TIME_WAIT + || tp->t_state == TCPS_LAST_ACK || (th->th_flags & TH_RST))) + m->m_pkthdr.pkt_flags |= PKTF_LAST_PKT; + #if INET6 if (isipv6) { /* @@ -2192,7 +2201,7 @@ tcp_output(struct tcpcb *tp) m->m_pkthdr.csum_flags = CSUM_TCPIPV6; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); if (len + optlen) - th->th_sum = in_addword(th->th_sum, + th->th_sum = in_addword(th->th_sum, htons((u_short)(optlen + len))); } else @@ -2201,7 +2210,7 @@ tcp_output(struct tcpcb *tp) m->m_pkthdr.csum_flags = CSUM_TCP; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); if (len + optlen) - th->th_sum = in_addword(th->th_sum, + th->th_sum = in_addword(th->th_sum, htons((u_short)(optlen + len))); } @@ -2236,7 +2245,7 @@ tcp_output(struct tcpcb *tp) if (flags & (TH_SYN|TH_FIN)) { if (flags & TH_SYN) tp->snd_nxt++; - if ((flags & TH_FIN) && + if ((flags & TH_FIN) && !(tp->t_flags & TF_SENTFIN)) { tp->snd_nxt++; tp->t_flags |= TF_SENTFIN; @@ -2277,9 +2286,8 @@ tcp_output(struct tcpcb *tp) tp->snd_nxt != tp->snd_una || (flags & TH_FIN))) { if (tp->t_timer[TCPT_PERSIST]) { tp->t_timer[TCPT_PERSIST] = 0; - tp->t_rxtshift = 0; - tp->t_rxtstart = 0; tp->t_persist_stop = 0; + TCP_RESET_REXMT_STATE(tp); } tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur); @@ -2303,7 +2311,7 @@ tcp_output(struct tcpcb *tp) /* * Using SRTT alone to set PTO can cause spurious * retransmissions on wireless networks where there - * is a lot of variance in RTT. Taking variance + * is a lot of variance in RTT. Taking variance * into account will avoid this. 
*/ srtt = tp->t_srtt >> TCP_RTT_SHIFT; @@ -2341,7 +2349,7 @@ tcp_output(struct tcpcb *tp) int xlen = len; if (flags & TH_SYN) ++xlen; - if ((flags & TH_FIN) && + if ((flags & TH_FIN) && !(tp->t_flags & TF_SENTFIN)) { ++xlen; tp->t_flags |= TF_SENTFIN; @@ -2421,8 +2429,12 @@ tcp_output(struct tcpcb *tp) error = EHOSTUNREACH; goto out; } - necp_mark_packet_from_socket(m, inp, policy_id, route_rule_id); + + if (net_qos_policy_restricted != 0) { + necp_socket_update_qos_marking(inp, inp->inp_route.ro_rt, + NULL, route_rule_id); + } } #endif /* NECP */ @@ -2456,19 +2468,28 @@ tcp_output(struct tcpcb *tp) !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) { /* Hint to prioritize this packet if * 1. if the packet has no data - * 2. the interface supports transmit-start model and did + * 2. the interface supports transmit-start model and did * not disable ACK prioritization. * 3. Only ACK flag is set. * 4. there is no outstanding data on this connection. */ if (tcp_prioritize_acks != 0 && len == 0 && - (inp->inp_last_outifp->if_eflags & - (IFEF_TXSTART | IFEF_NOACKPRI)) == IFEF_TXSTART && - th->th_flags == TH_ACK && tp->snd_una == tp->snd_max && - tp->t_timer[TCPT_REXMT] == 0) { - svc_flags |= PKT_SCF_TCP_ACK; - } - set_packet_service_class(m, so, MBUF_SC_UNSPEC, svc_flags); + (inp->inp_last_outifp->if_eflags & + (IFEF_TXSTART | IFEF_NOACKPRI)) == IFEF_TXSTART) { + if (th->th_flags == TH_ACK && + tp->snd_una == tp->snd_max && + tp->t_timer[TCPT_REXMT] == 0) + svc_flags |= PKT_SCF_TCP_ACK; + if (th->th_flags & TH_SYN) + svc_flags |= PKT_SCF_TCP_SYN; + } + set_packet_service_class(m, so, sotc, svc_flags); + } else { + /* + * Optimization for loopback just set the mbuf + * service class + */ + (void) m_set_service_class(m, so_tc2msc(sotc)); } tp->t_pktlist_sentlen += len; @@ -2497,7 +2518,7 @@ tcp_output(struct tcpcb *tp) if ((lro_ackmore) && (!sackoptlen) && (!tp->t_timer[TCPT_PERSIST]) && ((th->th_flags & TH_ACK) == TH_ACK) && (!len) && (tp->t_state == TCPS_ESTABLISHED)) { - 
/* For a pure ACK, see if you need to send more of them */ + /* For a pure ACK, see if you need to send more of them */ mnext = tcp_send_lroacks(tp, m, th); if (mnext) { tp->t_pktlist_tail->m_nextpkt = mnext; @@ -2536,11 +2557,7 @@ tcp_output(struct tcpcb *tp) packchain_listadd, tp_inp_options, (so_options & SO_DONTROUTE), (sack_rxmit | (sack_bytes_rxmt != 0)), recwin, -#if INET6 isipv6); -#else /* INET6 */ - 0); -#endif /* !INET6 */ if (error) { /* * Take into account the rest of unsent @@ -2609,7 +2626,7 @@ tcp_output(struct tcpcb *tp) if (error == ENOBUFS) { if (!tp->t_timer[TCPT_REXMT] && !tp->t_timer[TCPT_PERSIST]) - tp->t_timer[TCPT_REXMT] = + tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur); tp->snd_cwnd = tp->t_maxseg; tp->t_bytes_acked = 0; @@ -2646,7 +2663,7 @@ tcp_output(struct tcpcb *tp) * treat EHOSTUNREACH/ENETDOWN as a soft error. */ if ((error == EHOSTUNREACH || error == ENETDOWN) && - TCPS_HAVERCVDSYN(tp->t_state) && + TCPS_HAVERCVDSYN(tp->t_state) && !inp_restricted_send(inp, inp->inp_last_outifp)) { tp->t_softerror = error; error = 0; @@ -2677,12 +2694,14 @@ tcp_ip_output(struct socket *so, struct tcpcb *tp, struct mbuf *pkt, boolean_t ifdenied = FALSE; struct inpcb *inp = tp->t_inpcb; struct ip_out_args ipoa = - { IFSCOPE_NONE, { 0 }, IPOAF_SELECT_SRCIF|IPOAF_BOUND_SRCADDR, 0 }; + { IFSCOPE_NONE, { 0 }, IPOAF_SELECT_SRCIF|IPOAF_BOUND_SRCADDR, 0, + SO_TC_UNSPEC, _NET_SERVICE_TYPE_UNSPEC }; struct route ro; struct ifnet *outif = NULL; #if INET6 struct ip6_out_args ip6oa = - { IFSCOPE_NONE, { 0 }, IP6OAF_SELECT_SRCIF|IP6OAF_BOUND_SRCADDR, 0 }; + { IFSCOPE_NONE, { 0 }, IP6OAF_SELECT_SRCIF|IP6OAF_BOUND_SRCADDR, 0, + SO_TC_UNSPEC, _NET_SERVICE_TYPE_UNSPEC }; struct route_in6 ro6; struct flowadv *adv = (isipv6 ? 
&ip6oa.ip6oa_flowadv : &ipoa.ipoa_flowadv); @@ -2711,7 +2730,7 @@ tcp_ip_output(struct socket *so, struct tcpcb *tp, struct mbuf *pkt, else #endif /* INET6 */ ipoa.ipoa_flags |= IPOAF_NO_CELLULAR; - } + } if (INP_NO_EXPENSIVE(inp)) { #if INET6 if (isipv6) @@ -2719,7 +2738,7 @@ tcp_ip_output(struct socket *so, struct tcpcb *tp, struct mbuf *pkt, else #endif /* INET6 */ ipoa.ipoa_flags |= IPOAF_NO_EXPENSIVE; - + } if (INP_AWDL_UNRESTRICTED(inp)) { #if INET6 @@ -2728,7 +2747,28 @@ tcp_ip_output(struct socket *so, struct tcpcb *tp, struct mbuf *pkt, else #endif /* INET6 */ ipoa.ipoa_flags |= IPOAF_AWDL_UNRESTRICTED; - + + } +#if INET6 + if (INP_INTCOPROC_ALLOWED(inp) && isipv6) { + ip6oa.ip6oa_flags |= IP6OAF_INTCOPROC_ALLOWED; + } + if (isipv6) { + ip6oa.ip6oa_sotc = so->so_traffic_class; + ip6oa.ip6oa_netsvctype = so->so_netsvctype; + } else +#endif /* INET6 */ + { + ipoa.ipoa_sotc = so->so_traffic_class; + ipoa.ipoa_netsvctype = so->so_netsvctype; + } + if ((so->so_flags1 & SOF1_QOSMARKING_ALLOWED)) { +#if INET6 + if (isipv6) + ip6oa.ip6oa_flags |= IP6OAF_QOSMARKING_ALLOWED; + else +#endif /* INET6 */ + ipoa.ipoa_flags |= IPOAF_QOSMARKING_ALLOWED; } #if INET6 if (isipv6) @@ -2778,7 +2818,7 @@ tcp_ip_output(struct socket *so, struct tcpcb *tp, struct mbuf *pkt, unlocked = TRUE; socket_unlock(so, 0); } - + /* * Don't send down a chain of packets when: * - TCP chaining is disabled @@ -2843,27 +2883,27 @@ tcp_ip_output(struct socket *so, struct tcpcb *tp, struct mbuf *pkt, if (unlocked) socket_lock(so, 0); - /* + /* * Enter flow controlled state if the connection is established * and is not in recovery. * - * A connection will enter suspended state even if it is in + * A connection will enter suspended state even if it is in * recovery. 
*/ if (((adv->code == FADV_FLOW_CONTROLLED && !IN_FASTRECOVERY(tp)) || - adv->code == FADV_SUSPENDED) && + adv->code == FADV_SUSPENDED) && !(tp->t_flags & TF_CLOSING) && tp->t_state == TCPS_ESTABLISHED) { int rc; rc = inp_set_fc_state(inp, adv->code); - if (rc == 1) - tcp_ccdbg_trace(tp, NULL, + if (rc == 1) + tcp_ccdbg_trace(tp, NULL, ((adv->code == FADV_FLOW_CONTROLLED) ? TCP_CC_FLOW_CONTROL : TCP_CC_SUSPEND)); } - /* + /* * When an interface queue gets suspended, some of the * packets are dropped. Return ENOBUFS, to update the * pcb state. @@ -2877,18 +2917,27 @@ tcp_ip_output(struct socket *so, struct tcpcb *tp, struct mbuf *pkt, #if INET6 if (isipv6) { - if (ro6.ro_rt != NULL && (outif = ro6.ro_rt->rt_ifp) != - inp->in6p_last_outifp) - inp->in6p_last_outifp = outif; + if (ro6.ro_rt != NULL) + outif = ro6.ro_rt->rt_ifp; } else #endif /* INET6 */ - if (ro.ro_rt != NULL && (outif = ro.ro_rt->rt_ifp) != - inp->inp_last_outifp) - inp->inp_last_outifp = outif; + if (ro.ro_rt != NULL) + outif = ro.ro_rt->rt_ifp; + + if (outif != NULL && outif != inp->inp_last_outifp && + so->so_snd.sb_cc > 0) { + /* Update the send byte count */ + if (so->so_snd.sb_flags & SB_SNDBYTE_CNT) { + inp_decr_sndbytes_total(so, so->so_snd.sb_cc); + inp_decr_sndbytes_allunsent(so, tp->snd_una); + so->so_snd.sb_flags &= ~SB_SNDBYTE_CNT; + } + inp->inp_last_outifp = outif; + } - if (error != 0 && ifdenied && + if (error != 0 && ifdenied && (INP_NO_CELLULAR(inp) || INP_NO_EXPENSIVE(inp))) - soevent(inp->inp_socket, + soevent(so, (SO_FILT_HINT_LOCKED|SO_FILT_HINT_IFDENIED)); /* Synchronize cached PCB route & options */ @@ -2899,7 +2948,7 @@ tcp_ip_output(struct socket *so, struct tcpcb *tp, struct mbuf *pkt, #endif /* INET6 */ inp_route_copyin(inp, &ro); - if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift == 0 && + if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift == 0 && tp->t_inpcb->inp_route.ro_rt != NULL) { /* If we found the route and there is an rtt on it * reset the retransmit timer 
@@ -2911,8 +2960,7 @@ tcp_ip_output(struct socket *so, struct tcpcb *tp, struct mbuf *pkt, } void -tcp_setpersist(tp) - register struct tcpcb *tp; +tcp_setpersist(struct tcpcb *tp) { int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1; @@ -2951,8 +2999,8 @@ tcp_send_lroacks(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th) int count = 0; tcp_seq org_ack = ntohl(th->th_ack); tcp_seq prev_ack = 0; - int tack_offset = 28; /* XXX IPv6 and IP options not supported */ - int twin_offset = 34; /* XXX IPv6 and IP options not supported */ + int tack_offset = 28; /* IPv6 and IP options not supported */ + int twin_offset = 34; /* IPv6 and IP options not supported */ int ack_size = (tp->t_flags & TF_STRETCHACK) ? (maxseg_unacked * tp->t_maxseg) : (tp->t_maxseg << 1); int segs_acked = (tp->t_flags & TF_STRETCHACK) ? maxseg_unacked : 2; @@ -3047,12 +3095,12 @@ tcp_recv_throttle (struct tcpcb *tp) if (tcp_use_rtt_recvbg == 1 && TSTMP_SUPPORTED(tp)) { - /* + /* * Timestamps are supported on this connection. Use * RTT to look for an increase in latency. */ - /* + /* * If the connection is already being throttled, leave it * in that state until rtt comes closer to base rtt */ @@ -3060,7 +3108,7 @@ tcp_recv_throttle (struct tcpcb *tp) return (1); base_rtt = get_base_rtt(tp); - + if (base_rtt != 0 && tp->t_rttcur != 0) { /* * if latency increased on a background flow, @@ -3074,12 +3122,12 @@ tcp_recv_throttle (struct tcpcb *tp) * Reduce the recv socket buffer size to * minimize latecy. 
*/ - if (sbrcv->sb_idealsize > + if (sbrcv->sb_idealsize > tcp_recv_throttle_minwin) { newsize = sbrcv->sb_idealsize >> 1; /* Set a minimum of 16 K */ - newsize = - max(newsize, + newsize = + max(newsize, tcp_recv_throttle_minwin); sbrcv->sb_idealsize = newsize; } @@ -3096,6 +3144,6 @@ tcp_recv_throttle (struct tcpcb *tp) */ if (tp->acc_iaj > tcp_acc_iaj_react_limit) return (1); - + return (0); } diff --git a/bsd/netinet/tcp_sack.c b/bsd/netinet/tcp_sack.c index 5d0bf9130..4a68325db 100644 --- a/bsd/netinet/tcp_sack.c +++ b/bsd/netinet/tcp_sack.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2004-2015 Apple Inc. All rights reserved. + * Copyright (c) 2004-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. 
- * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* @@ -93,6 +93,7 @@ #include #include #include +#include #if TCPDEBUG #include #endif @@ -379,6 +380,16 @@ tcp_sack_detect_reordering(struct tcpcb *tp, struct sackhole *s, tcpstat.tcps_reordered_pkts++; tp->t_reordered_pkts++; + /* + * If reordering is seen on a connection wth ECN enabled, + * increment the heuristic + */ + if (TCP_ECN_ENABLED(tp)) { + INP_INC_IFNET_STAT(tp->t_inpcb, ecn_fallback_reorder); + tcpstat.tcps_ecn_fallback_reorder++; + tcp_heuristic_ecn_aggressive(tp); + } + VERIFY(SEQ_GEQ(snd_fack, s->rxmit)); if (s->rxmit_start > 0) { @@ -653,9 +664,7 @@ tcp_free_sackholes(struct tcpcb *tp) * of sack recovery. */ void -tcp_sack_partialack(tp, th) - struct tcpcb *tp; - struct tcphdr *th; +tcp_sack_partialack(struct tcpcb *tp, struct tcphdr *th) { int num_segs = 1; diff --git a/bsd/netinet/tcp_subr.c b/bsd/netinet/tcp_subr.c index 88d18875a..d7289624e 100644 --- a/bsd/netinet/tcp_subr.c +++ b/bsd/netinet/tcp_subr.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2000-2015 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* @@ -58,7 +58,6 @@ * SUCH DAMAGE. * * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95 - * $FreeBSD: src/sys/netinet/tcp_subr.c,v 1.73.2.22 2001/08/22 00:59:12 silby Exp $ */ /* * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce @@ -92,8 +91,8 @@ #include #include -#define tcp_minmssoverload fring -#define _IP_VHL +#define tcp_minmssoverload fring +#define _IP_VHL #include #include #include @@ -134,7 +133,7 @@ #if INET6 #include #endif -#endif /*IPSEC*/ +#endif /* IPSEC */ #if NECP #include @@ -154,7 +153,7 @@ #include -#define DBG_FNC_TCP_CLOSE NETDBG_CODE(DBG_NETTCP, ((5 << 8) | 2)) +#define DBG_FNC_TCP_CLOSE NETDBG_CODE(DBG_NETTCP, ((5 << 8) | 2)) extern int tcp_lq_overflow; @@ -163,38 +162,39 @@ extern struct tcptailq tcp_tw_tailq; int tcp_mssdflt = TCP_MSS; SYSCTL_INT(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW | CTLFLAG_LOCKED, - &tcp_mssdflt , 0, "Default TCP Maximum Segment Size"); + &tcp_mssdflt, 0, "Default TCP Maximum Segment Size"); #if INET6 int tcp_v6mssdflt = TCP6_MSS; SYSCTL_INT(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, - CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_v6mssdflt , 0, + CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_v6mssdflt, 0, "Default TCP Maximum Segment Size for IPv6"); #endif extern int tcp_do_autorcvbuf; -int tcp_sysctl_fastopenkey(struct sysctl_oid *, void *, int , +int tcp_sysctl_fastopenkey(struct sysctl_oid *, void *, int, struct sysctl_req *); -SYSCTL_PROC(_net_inet_tcp, OID_AUTO, fastopen_key, - CTLTYPE_STRING | CTLFLAG_WR, - 0 , 0, tcp_sysctl_fastopenkey, "S", "TCP 
Fastopen key"); +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, fastopen_key, CTLTYPE_STRING | CTLFLAG_WR, + 0, 0, tcp_sysctl_fastopenkey, "S", "TCP Fastopen key"); /* Current count of half-open TFO connections */ int tcp_tfo_halfcnt = 0; /* Maximum of half-open TFO connection backlog */ int tcp_tfo_backlog = 10; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, fastopen_backlog, CTLFLAG_RW | CTLFLAG_LOCKED, - &tcp_tfo_backlog, 0, "Backlog queue for half-open TFO connections"); +SYSCTL_INT(_net_inet_tcp, OID_AUTO, fastopen_backlog, + CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_tfo_backlog, 0, + "Backlog queue for half-open TFO connections"); int tcp_fastopen = TCP_FASTOPEN_CLIENT | TCP_FASTOPEN_SERVER; SYSCTL_INT(_net_inet_tcp, OID_AUTO, fastopen, CTLFLAG_RW | CTLFLAG_LOCKED, - &tcp_fastopen, 0, "Enable TCP Fastopen (RFC 7413)"); + &tcp_fastopen, 0, "Enable TCP Fastopen (RFC 7413)"); int tcp_tfo_fallback_min = 10; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, fastopen_fallback_min, CTLFLAG_RW | CTLFLAG_LOCKED, - &tcp_tfo_fallback_min, 0, "Mininum number of trials without TFO when in fallback mode"); +SYSCTL_INT(_net_inet_tcp, OID_AUTO, fastopen_fallback_min, + CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_tfo_fallback_min, 0, + "Mininum number of trials without TFO when in fallback mode"); /* * Minimum MSS we accept and use. 
This prevents DoS attacks where @@ -206,62 +206,64 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, fastopen_fallback_min, CTLFLAG_RW | CTLFLAG_ */ int tcp_minmss = TCP_MINMSS; SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_RW | CTLFLAG_LOCKED, - &tcp_minmss , 0, "Minmum TCP Maximum Segment Size"); + &tcp_minmss, 0, "Minmum TCP Maximum Segment Size"); int tcp_do_rfc1323 = 1; #if (DEVELOPMENT || DEBUG) SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, - CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_do_rfc1323 , 0, - "Enable rfc1323 (high performance TCP) extensions"); + CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_do_rfc1323, 0, + "Enable rfc1323 (high performance TCP) extensions"); #endif /* (DEVELOPMENT || DEBUG) */ // Not used static int tcp_do_rfc1644 = 0; -SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1644, rfc1644, CTLFLAG_RW | CTLFLAG_LOCKED, - &tcp_do_rfc1644 , 0, "Enable rfc1644 (TTCP) extensions"); +SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1644, rfc1644, + CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_do_rfc1644, 0, + "Enable rfc1644 (TTCP) extensions"); static int do_tcpdrain = 0; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW | CTLFLAG_LOCKED, &do_tcpdrain, 0, - "Enable tcp_drain routine for extra help when low on mbufs"); +SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW | CTLFLAG_LOCKED, + &do_tcpdrain, 0, + "Enable tcp_drain routine for extra help when low on mbufs"); -SYSCTL_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD | CTLFLAG_LOCKED, - &tcbinfo.ipi_count, 0, "Number of active PCBs"); +SYSCTL_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD | CTLFLAG_LOCKED, + &tcbinfo.ipi_count, 0, "Number of active PCBs"); -SYSCTL_INT(_net_inet_tcp, OID_AUTO, tw_pcbcount, - CTLFLAG_RD | CTLFLAG_LOCKED, - &tcbinfo.ipi_twcount, 0, "Number of pcbs in time-wait state"); +SYSCTL_INT(_net_inet_tcp, OID_AUTO, tw_pcbcount, CTLFLAG_RD | CTLFLAG_LOCKED, + &tcbinfo.ipi_twcount, 0, "Number of pcbs in time-wait state"); static int icmp_may_rst = 1; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, 
icmp_may_rst, CTLFLAG_RW | CTLFLAG_LOCKED, &icmp_may_rst, 0, - "Certain ICMP unreachable messages may abort connections in SYN_SENT"); +SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_RW | CTLFLAG_LOCKED, + &icmp_may_rst, 0, + "Certain ICMP unreachable messages may abort connections in SYN_SENT"); static int tcp_strict_rfc1948 = 0; static int tcp_isn_reseed_interval = 0; #if (DEVELOPMENT || DEBUG) -SYSCTL_INT(_net_inet_tcp, OID_AUTO, strict_rfc1948, - CTLFLAG_RW | CTLFLAG_LOCKED, - &tcp_strict_rfc1948, 0, "Determines if RFC1948 is followed exactly"); +SYSCTL_INT(_net_inet_tcp, OID_AUTO, strict_rfc1948, CTLFLAG_RW | CTLFLAG_LOCKED, + &tcp_strict_rfc1948, 0, "Determines if RFC1948 is followed exactly"); SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, - CTLFLAG_RW | CTLFLAG_LOCKED, - &tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret"); + CTLFLAG_RW | CTLFLAG_LOCKED, + &tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret"); #endif /* (DEVELOPMENT || DEBUG) */ int tcp_TCPTV_MIN = 100; /* 100ms minimum RTT */ SYSCTL_INT(_net_inet_tcp, OID_AUTO, rtt_min, CTLFLAG_RW | CTLFLAG_LOCKED, - &tcp_TCPTV_MIN, 0, "min rtt value allowed"); + &tcp_TCPTV_MIN, 0, "min rtt value allowed"); int tcp_rexmt_slop = TCPTV_REXMTSLOP; SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmt_slop, CTLFLAG_RW, &tcp_rexmt_slop, 0, "Slop added to retransmit timeout"); __private_extern__ int tcp_use_randomport = 0; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, randomize_ports, CTLFLAG_RW | CTLFLAG_LOCKED, - &tcp_use_randomport, 0, "Randomize TCP port numbers"); +SYSCTL_INT(_net_inet_tcp, OID_AUTO, randomize_ports, + CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_use_randomport, 0, + "Randomize TCP port numbers"); __private_extern__ int tcp_win_scale = 3; SYSCTL_INT(_net_inet_tcp, OID_AUTO, win_scale_factor, - CTLFLAG_RW | CTLFLAG_LOCKED, - &tcp_win_scale, 0, "Window scaling factor"); + CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_win_scale, 0, + "Window scaling factor"); static void 
tcp_cleartaocache(void); static void tcp_notify(struct inpcb *, int); @@ -274,12 +276,10 @@ struct zone *tcp_rxt_seg_zone; extern int slowlink_wsize; /* window correction for slow links */ extern int path_mtu_discovery; -extern u_int32_t tcp_autorcvbuf_max; -extern u_int32_t tcp_autorcvbuf_inc_shift; static void tcp_sbrcv_grow_rwin(struct tcpcb *tp, struct sockbuf *sb); -#define TCP_BWMEAS_BURST_MINSIZE 6 -#define TCP_BWMEAS_BURST_MAXSIZE 25 +#define TCP_BWMEAS_BURST_MINSIZE 6 +#define TCP_BWMEAS_BURST_MAXSIZE 25 static uint32_t bwmeas_elm_size; @@ -290,12 +290,12 @@ static uint32_t bwmeas_elm_size; * variable net.inet.tcp.tcbhashsize */ #ifndef TCBHASHSIZE -#define TCBHASHSIZE CONFIG_TCBHASHSIZE +#define TCBHASHSIZE CONFIG_TCBHASHSIZE #endif __private_extern__ int tcp_tcbhashsize = TCBHASHSIZE; SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RD | CTLFLAG_LOCKED, - &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable"); + &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable"); /* * This is the actual shape of what we allocate using the zone @@ -317,9 +317,9 @@ int get_tcp_str_size(void); static void tcpcb_to_otcpcb(struct tcpcb *, struct otcpcb *); -static lck_attr_t *tcp_uptime_mtx_attr = NULL; /* mutex attributes */ -static lck_grp_t *tcp_uptime_mtx_grp = NULL; /* mutex group definition */ -static lck_grp_attr_t *tcp_uptime_mtx_grp_attr = NULL; /* mutex group attributes */ +static lck_attr_t *tcp_uptime_mtx_attr = NULL; +static lck_grp_t *tcp_uptime_mtx_grp = NULL; +static lck_grp_attr_t *tcp_uptime_mtx_grp_attr = NULL; int tcp_notsent_lowat_check(struct socket *so); static aes_encrypt_ctx tfo_ctx; /* Crypto-context for TFO */ @@ -352,7 +352,10 @@ tcp_sysctl_fastopenkey(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { int error = 0; - /* TFO-key is expressed as a string in hex format (+1 to account for \0 char) */ + /* + * TFO-key is expressed as a string in hex format + * (+1 to account for \0 
char) + */ char keystring[TCP_FASTOPEN_KEYLEN * 2 + 1]; u_int32_t key[TCP_FASTOPEN_KEYLEN / sizeof(u_int32_t)]; int i; @@ -363,7 +366,8 @@ tcp_sysctl_fastopenkey(__unused struct sysctl_oid *oidp, __unused void *arg1, goto exit; } - /* sysctl_io_string copies keystring into the oldptr of the sysctl_req. + /* + * sysctl_io_string copies keystring into the oldptr of the sysctl_req. * Make sure everything is zero, to avoid putting garbage in there or * leaking the stack. */ @@ -374,7 +378,10 @@ tcp_sysctl_fastopenkey(__unused struct sysctl_oid *oidp, __unused void *arg1, goto exit; for (i = 0; i < (TCP_FASTOPEN_KEYLEN / sizeof(u_int32_t)); i++) { - /* We jump over the keystring in 8-character (4 byte in hex) steps */ + /* + * We jump over the keystring in 8-character (4 byte in hex) + * steps + */ if (sscanf(&keystring[i * 8], "%8x", &key[i]) != 1) { error = EINVAL; goto exit; @@ -387,14 +394,16 @@ tcp_sysctl_fastopenkey(__unused struct sysctl_oid *oidp, __unused void *arg1, return (error); } -int get_inpcb_str_size(void) +int +get_inpcb_str_size(void) { - return sizeof(struct inpcb); + return (sizeof(struct inpcb)); } -int get_tcp_str_size(void) +int +get_tcp_str_size(void) { - return sizeof(struct tcpcb); + return (sizeof(struct tcpcb)); } int tcp_freeq(struct tcpcb *tp); @@ -403,26 +412,27 @@ static int scale_to_powerof2(int size); /* * This helper routine returns one of the following scaled value of size: - * 1. Rounded down power of two value of size if the size value passed as + * 1. Rounded down power of two value of size if the size value passed as * argument is not a power of two and the rounded up value overflows. * OR - * 2. Rounded up power of two value of size if the size value passed as - * argument is not a power of two and the rounded up value does not overflow + * 2. Rounded up power of two value of size if the size value passed as + * argument is not a power of two and the rounded up value does not overflow * OR * 3. 
Same value as argument size if it is already a power of two. - */ -static int scale_to_powerof2(int size) { + */ +static int +scale_to_powerof2(int size) { /* Handle special case of size = 0 */ int ret = size ? size : 1; if (!powerof2(ret)) { - while(!powerof2(size)) { - /* + while (!powerof2(size)) { + /* * Clear out least significant * set bit till size is left with * its highest set bit at which point * it is rounded down power of two. - */ + */ size = size & (size -1); } @@ -434,11 +444,11 @@ static int scale_to_powerof2(int size) { } } - return ret; + return (ret); } static void -tcp_tfo_init() +tcp_tfo_init(void) { u_char key[TCP_FASTOPEN_KEYLEN]; @@ -454,7 +464,7 @@ tcp_init(struct protosw *pp, struct domain *dp) { #pragma unused(dp) static int tcp_initialized = 0; - vm_size_t str_size; + vm_size_t str_size; struct inpcbinfo *pcbinfo; VERIFY((pp->pr_flags & (PR_INITIALIZED|PR_ATTACHED)) == PR_ATTACHED); @@ -475,7 +485,9 @@ tcp_init(struct protosw *pp, struct domain *dp) microuptime(&tcp_uptime); read_random(&tcp_now, sizeof(tcp_now)); - tcp_now = tcp_now & 0x3fffffff; /* Starts tcp internal clock at a random value */ + + /* Starts tcp internal clock at a random value */ + tcp_now = tcp_now & 0x3fffffff; tcp_tfo_init(); @@ -487,7 +499,8 @@ tcp_init(struct protosw *pp, struct domain *dp) * allocate lock group attribute and group for tcp pcb mutexes */ pcbinfo->ipi_lock_grp_attr = lck_grp_attr_alloc_init(); - pcbinfo->ipi_lock_grp = lck_grp_alloc_init("tcppcb", pcbinfo->ipi_lock_grp_attr); + pcbinfo->ipi_lock_grp = lck_grp_alloc_init("tcppcb", + pcbinfo->ipi_lock_grp_attr); /* * allocate the lock attribute for tcp pcb mutexes @@ -518,7 +531,8 @@ tcp_init(struct protosw *pp, struct domain *dp) tcp_tcbhashsize); } - tcbinfo.ipi_hashbase = hashinit(tcp_tcbhashsize, M_PCB, &tcbinfo.ipi_hashmask); + tcbinfo.ipi_hashbase = hashinit(tcp_tcbhashsize, M_PCB, + &tcbinfo.ipi_hashmask); tcbinfo.ipi_porthashbase = hashinit(tcp_tcbhashsize, M_PCB, &tcbinfo.ipi_porthashmask); 
str_size = P2ROUNDUP(sizeof(struct inp_tp), sizeof(u_int64_t)); @@ -531,7 +545,8 @@ tcp_init(struct protosw *pp, struct domain *dp) in_pcbinfo_attach(&tcbinfo); str_size = P2ROUNDUP(sizeof(struct sackhole), sizeof(u_int64_t)); - sack_hole_zone = zinit(str_size, 120000*str_size, 8192, "sack_hole zone"); + sack_hole_zone = zinit(str_size, 120000*str_size, 8192, + "sack_hole zone"); zone_change(sack_hole_zone, Z_CALLERACCT, FALSE); zone_change(sack_hole_zone, Z_EXPAND, TRUE); @@ -546,7 +561,8 @@ tcp_init(struct protosw *pp, struct domain *dp) zone_change(tcp_reass_zone, Z_EXPAND, TRUE); bwmeas_elm_size = P2ROUNDUP(sizeof(struct bwmeas), sizeof(u_int64_t)); - tcp_bwmeas_zone = zinit(bwmeas_elm_size, (100 * bwmeas_elm_size), 0, "tcp_bwmeas_zone"); + tcp_bwmeas_zone = zinit(bwmeas_elm_size, (100 * bwmeas_elm_size), 0, + "tcp_bwmeas_zone"); if (tcp_bwmeas_zone == NULL) { panic("%s: failed allocating tcp_bwmeas_zone", __func__); /* NOTREACHED */ @@ -566,9 +582,9 @@ tcp_init(struct protosw *pp, struct domain *dp) zone_change(tcp_rxt_seg_zone, Z_EXPAND, TRUE); #if INET6 -#define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr)) +#define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr)) #else /* INET6 */ -#define TCP_MINPROTOHDR (sizeof(struct tcpiphdr)) +#define TCP_MINPROTOHDR (sizeof(struct tcpiphdr)) #endif /* INET6 */ if (max_protohdr < TCP_MINPROTOHDR) { _max_protohdr = TCP_MINPROTOHDR; @@ -584,25 +600,32 @@ tcp_init(struct protosw *pp, struct domain *dp) bzero(&tcp_timer_list, sizeof(tcp_timer_list)); LIST_INIT(&tcp_timer_list.lhead); /* - * allocate lock group attribute, group and attribute for the tcp timer list + * allocate lock group attribute, group and attribute for + * the tcp timer list */ tcp_timer_list.mtx_grp_attr = lck_grp_attr_alloc_init(); - tcp_timer_list.mtx_grp = lck_grp_alloc_init("tcptimerlist", tcp_timer_list.mtx_grp_attr); + tcp_timer_list.mtx_grp = lck_grp_alloc_init("tcptimerlist", + tcp_timer_list.mtx_grp_attr); 
tcp_timer_list.mtx_attr = lck_attr_alloc_init(); - if ((tcp_timer_list.mtx = lck_mtx_alloc_init(tcp_timer_list.mtx_grp, tcp_timer_list.mtx_attr)) == NULL) { + if ((tcp_timer_list.mtx = lck_mtx_alloc_init(tcp_timer_list.mtx_grp, + tcp_timer_list.mtx_attr)) == NULL) { panic("failed to allocate memory for tcp_timer_list.mtx\n"); }; - if ((tcp_timer_list.call = thread_call_allocate(tcp_run_timerlist, NULL)) == NULL) { + tcp_timer_list.call = thread_call_allocate(tcp_run_timerlist, NULL); + if (tcp_timer_list.call == NULL) { panic("failed to allocate call entry 1 in tcp_init\n"); } /* - * allocate lock group attribute, group and attribute for tcp_uptime_lock + * allocate lock group attribute, group and attribute for + * tcp_uptime_lock */ tcp_uptime_mtx_grp_attr = lck_grp_attr_alloc_init(); - tcp_uptime_mtx_grp = lck_grp_alloc_init("tcpuptime", tcp_uptime_mtx_grp_attr); + tcp_uptime_mtx_grp = lck_grp_alloc_init("tcpuptime", + tcp_uptime_mtx_grp_attr); tcp_uptime_mtx_attr = lck_attr_alloc_init(); - tcp_uptime_lock = lck_spin_alloc_init(tcp_uptime_mtx_grp, tcp_uptime_mtx_attr); + tcp_uptime_lock = lck_spin_alloc_init(tcp_uptime_mtx_grp, + tcp_uptime_mtx_attr); /* Initialize TCP LRO data structures */ tcp_lro_init(); @@ -617,6 +640,12 @@ tcp_init(struct protosw *pp, struct domain *dp) if (nmbclusters > 30720) { tcp_autorcvbuf_max = 1024 * 1024; tcp_autosndbuf_max = 1024 * 1024; + + /* + * Receive buffer max for cellular interfaces supporting + * Carrier Aggregation is higher + */ + tcp_autorcvbuf_max_ca = 2 * 1024 * 1024; } } @@ -626,10 +655,7 @@ tcp_init(struct protosw *pp, struct domain *dp) * of the tcpcb each time to conserve mbufs. 
*/ void -tcp_fillheaders(tp, ip_ptr, tcp_ptr) - struct tcpcb *tp; - void *ip_ptr; - void *tcp_ptr; +tcp_fillheaders(struct tcpcb *tp, void *ip_ptr, void *tcp_ptr) { struct inpcb *inp = tp->t_inpcb; struct tcphdr *tcp_hdr = (struct tcphdr *)tcp_ptr; @@ -643,8 +669,9 @@ tcp_fillheaders(tp, ip_ptr, tcp_ptr) (inp->inp_flow & IPV6_FLOWINFO_MASK); ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) | (IPV6_VERSION & IPV6_VERSION_MASK); + ip6->ip6_plen = htons(sizeof(struct tcphdr)); ip6->ip6_nxt = IPPROTO_TCP; - ip6->ip6_plen = sizeof(struct tcphdr); + ip6->ip6_hlim = 0; ip6->ip6_src = inp->in6p_laddr; ip6->ip6_dst = inp->in6p_faddr; tcp_hdr->th_sum = in6_pseudo(&inp->in6p_laddr, &inp->in6p_faddr, @@ -652,20 +679,21 @@ tcp_fillheaders(tp, ip_ptr, tcp_ptr) } else #endif { - struct ip *ip = (struct ip *) ip_ptr; - - ip->ip_vhl = IP_VHL_BORING; - ip->ip_tos = 0; - ip->ip_len = 0; - ip->ip_id = 0; - ip->ip_off = 0; - ip->ip_ttl = 0; - ip->ip_sum = 0; - ip->ip_p = IPPROTO_TCP; - ip->ip_src = inp->inp_laddr; - ip->ip_dst = inp->inp_faddr; - tcp_hdr->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, - htons(sizeof(struct tcphdr) + IPPROTO_TCP)); + struct ip *ip = (struct ip *) ip_ptr; + + ip->ip_vhl = IP_VHL_BORING; + ip->ip_tos = 0; + ip->ip_len = 0; + ip->ip_id = 0; + ip->ip_off = 0; + ip->ip_ttl = 0; + ip->ip_sum = 0; + ip->ip_p = IPPROTO_TCP; + ip->ip_src = inp->inp_laddr; + ip->ip_dst = inp->inp_faddr; + tcp_hdr->th_sum = + in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, + htons(sizeof(struct tcphdr) + IPPROTO_TCP)); } tcp_hdr->th_sport = inp->inp_lport; @@ -685,8 +713,7 @@ tcp_fillheaders(tp, ip_ptr, tcp_ptr) * use for this function is in keepalives, which use tcp_respond. 
*/ struct tcptemp * -tcp_maketemplate(tp) - struct tcpcb *tp; +tcp_maketemplate(struct tcpcb *tp) { struct mbuf *m; struct tcptemp *n; @@ -732,6 +759,7 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, int isipv6; #endif /* INET6 */ struct ifnet *outif; + int sotc = SO_TC_UNSPEC; #if INET6 isipv6 = IP_VHL_V(((struct ip *)ipgen)->ip_vhl) == 6; @@ -755,12 +783,12 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, #if INET6 if (isipv6) { ro6 = &sro6; - bzero(ro6, sizeof *ro6); + bzero(ro6, sizeof(*ro6)); } else #endif /* INET6 */ { ro = &sro; - bzero(ro, sizeof *ro); + bzero(ro, sizeof(*ro)); } } if (m == 0) { @@ -773,8 +801,8 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, if (isipv6) { VERIFY((MHLEN - max_linkhdr) >= (sizeof (*ip6) + sizeof (*nth))); - bcopy((caddr_t)ip6, mtod(m, caddr_t), - sizeof(struct ip6_hdr)); + bcopy((caddr_t)ip6, mtod(m, caddr_t), + sizeof(struct ip6_hdr)); ip6 = mtod(m, struct ip6_hdr *); nth = (struct tcphdr *)(void *)(ip6 + 1); } else @@ -799,7 +827,7 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, m->m_data = (caddr_t)ipgen; /* m_len is set later */ tlen = 0; -#define xchg(a,b,type) { type t; t=a; a=b; b=t; } +#define xchg(a, b, type) { type t; t = a; a = b; b = t; } #if INET6 if (isipv6) { /* Expect 32-bit aligned IP on strict-align platforms */ @@ -808,12 +836,12 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, nth = (struct tcphdr *)(void *)(ip6 + 1); } else #endif /* INET6 */ - { - /* Expect 32-bit aligned IP on strict-align platforms */ - IP_HDR_STRICT_ALIGNMENT_CHECK(ip); - xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, n_long); - nth = (struct tcphdr *)(void *)(ip + 1); - } + { + /* Expect 32-bit aligned IP on strict-align platforms */ + IP_HDR_STRICT_ALIGNMENT_CHECK(ip); + xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, n_long); + nth = (struct tcphdr *)(void *)(ip + 1); + } if (th 
!= nth) { /* * this is usually a case when an extension header @@ -833,11 +861,11 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, tlen += sizeof (struct ip6_hdr) + sizeof (struct tcphdr); } else #endif - { - tlen += sizeof (struct tcpiphdr); - ip->ip_len = tlen; - ip->ip_ttl = ip_defttl; - } + { + tlen += sizeof (struct tcpiphdr); + ip->ip_len = tlen; + ip->ip_ttl = ip_defttl; + } m->m_len = tlen; m->m_pkthdr.len = tlen; m->m_pkthdr.rcvif = 0; @@ -875,9 +903,7 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, m->m_pkthdr.csum_flags = CSUM_TCPIPV6; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); ip6->ip6_hlim = in6_selecthlim(tp ? tp->t_inpcb : NULL, - ro6 && ro6->ro_rt ? - ro6->ro_rt->rt_ifp : - NULL); + ro6 && ro6->ro_rt ? ro6->ro_rt->rt_ifp : NULL); } else #endif /* INET6 */ { @@ -908,8 +934,9 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, if (isipv6) { svc_flags |= PKT_SCF_IPV6; } + sotc = tp->t_inpcb->inp_socket->so_traffic_class; set_packet_service_class(m, tp->t_inpcb->inp_socket, - MBUF_SC_UNSPEC, svc_flags); + sotc, svc_flags); /* Embed flowhash and flow control flags */ m->m_pkthdr.pkt_flowsrc = FLOWSRC_INPCB; @@ -926,7 +953,8 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, #if INET6 if (isipv6) { struct ip6_out_args ip6oa = { tra->ifscope, { 0 }, - IP6OAF_SELECT_SRCIF | IP6OAF_BOUND_SRCADDR, 0 }; + IP6OAF_SELECT_SRCIF | IP6OAF_BOUND_SRCADDR, 0, + SO_TC_UNSPEC, _NET_SERVICE_TYPE_UNSPEC}; if (tra->ifscope != IFSCOPE_NONE) ip6oa.ip6oa_flags |= IP6OAF_BOUND_IF; @@ -936,7 +964,14 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, ip6oa.ip6oa_flags |= IP6OAF_NO_EXPENSIVE; if (tra->awdl_unrestricted) ip6oa.ip6oa_flags |= IP6OAF_AWDL_UNRESTRICTED; - + if (tra->intcoproc_allowed) + ip6oa.ip6oa_flags |= IP6OAF_INTCOPROC_ALLOWED; + ip6oa.ip6oa_sotc = sotc; + if (tp != NULL) { + if 
((tp->t_inpcb->inp_socket->so_flags1 & SOF1_QOSMARKING_ALLOWED)) + ip6oa.ip6oa_flags |= IP6OAF_QOSMARKING_ALLOWED; + ip6oa.ip6oa_netsvctype = tp->t_inpcb->inp_socket->so_netsvctype; + } (void) ip6_output(m, NULL, ro6, IPV6_OUTARGS, NULL, NULL, &ip6oa); @@ -951,7 +986,8 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, #endif /* INET6 */ { struct ip_out_args ipoa = { tra->ifscope, { 0 }, - IPOAF_SELECT_SRCIF | IPOAF_BOUND_SRCADDR, 0 }; + IPOAF_SELECT_SRCIF | IPOAF_BOUND_SRCADDR, 0, + SO_TC_UNSPEC, _NET_SERVICE_TYPE_UNSPEC }; if (tra->ifscope != IFSCOPE_NONE) ipoa.ipoa_flags |= IPOAF_BOUND_IF; @@ -961,7 +997,12 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, ipoa.ipoa_flags |= IPOAF_NO_EXPENSIVE; if (tra->awdl_unrestricted) ipoa.ipoa_flags |= IPOAF_AWDL_UNRESTRICTED; - + ipoa.ipoa_sotc = sotc; + if (tp != NULL) { + if ((tp->t_inpcb->inp_socket->so_flags1 & SOF1_QOSMARKING_ALLOWED)) + ipoa.ipoa_flags |= IPOAF_QOSMARKING_ALLOWED; + ipoa.ipoa_netsvctype = tp->t_inpcb->inp_socket->so_netsvctype; + } if (ro != &sro) { /* Copy the cached route and take an extra reference */ inp_route_copyout(tp->t_inpcb, &sro); @@ -992,12 +1033,11 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, * come from the zone allocator set up in tcp_init(). 
*/ struct tcpcb * -tcp_newtcpcb(inp) - struct inpcb *inp; +tcp_newtcpcb(struct inpcb *inp) { struct inp_tp *it; - register struct tcpcb *tp; - register struct socket *so = inp->inp_socket; + struct tcpcb *tp; + struct socket *so = inp->inp_socket; #if INET6 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; #endif /* INET6 */ @@ -1005,12 +1045,12 @@ tcp_newtcpcb(inp) calculate_tcp_clock(); if ((so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) == 0) { - it = (struct inp_tp *)(void *)inp; - tp = &it->tcb; + it = (struct inp_tp *)(void *)inp; + tp = &it->tcb; } else { - tp = (struct tcpcb *)(void *)inp->inp_saved_ppcb; + tp = (struct tcpcb *)(void *)inp->inp_saved_ppcb; } - + bzero((char *) tp, sizeof(struct tcpcb)); LIST_INIT(&tp->t_segq); tp->t_maxseg = tp->t_maxopd = @@ -1026,14 +1066,16 @@ tcp_newtcpcb(inp) TAILQ_INIT(&tp->snd_holes); SLIST_INIT(&tp->t_rxt_segments); - tp->t_inpcb = inp; /* XXX */ + SLIST_INIT(&tp->t_notify_ack); + tp->t_inpcb = inp; /* * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no * rtt estimate. Set rttvar so that srtt + 4 * rttvar gives * reasonable initial retransmit time. */ tp->t_srtt = TCPTV_SRTTBASE; - tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4; + tp->t_rttvar = + ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4; tp->t_rttmin = tcp_TCPTV_MIN; tp->t_rxtcur = TCPTV_RTOBASE; @@ -1078,9 +1120,7 @@ tcp_newtcpcb(inp) * then send a RST to peer. 
*/ struct tcpcb * -tcp_drop(tp, errno) - register struct tcpcb *tp; - int errno; +tcp_drop(struct tcpcb *tp, int errno) { struct socket *so = tp->t_inpcb->inp_socket; #if CONFIG_DTRACE @@ -1102,7 +1142,7 @@ tcp_drop(tp, errno) } void -tcp_getrt_rtt(struct tcpcb *tp, struct rtentry *rt) +tcp_getrt_rtt(struct tcpcb *tp, struct rtentry *rt) { u_int32_t rtt = rt->rt_rmx.rmx_rtt; int isnetlocal = (tp->t_flags & TF_LOCAL); @@ -1115,17 +1155,19 @@ tcp_getrt_rtt(struct tcpcb *tp, struct rtentry *rt) if (rt->rt_rmx.rmx_locks & RTV_RTT) tp->t_rttmin = rtt / (RTM_RTTUNIT / TCP_RETRANSHZ); else - tp->t_rttmin = isnetlocal ? tcp_TCPTV_MIN : TCPTV_REXMTMIN; - tp->t_srtt = rtt / (RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTT_SCALE)); + tp->t_rttmin = isnetlocal ? tcp_TCPTV_MIN : + TCPTV_REXMTMIN; + tp->t_srtt = + rtt / (RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTT_SCALE)); tcpstat.tcps_usedrtt++; if (rt->rt_rmx.rmx_rttvar) { tp->t_rttvar = rt->rt_rmx.rmx_rttvar / - (RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTTVAR_SCALE)); + (RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTTVAR_SCALE)); tcpstat.tcps_usedrttvar++; } else { /* default variation is +- 1 rtt */ tp->t_rttvar = - tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE; + tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE; } TCPT_RANGESET(tp->t_rxtcur, ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1, @@ -1176,7 +1218,6 @@ tcp_update_ecn_perf_stats(struct tcpcb *tp, if (inp->inp_socket->so_error == ECONNRESET) stat->rst_drop++; - return; } /* @@ -1186,8 +1227,7 @@ tcp_update_ecn_perf_stats(struct tcpcb *tp, * wake up any sleepers */ struct tcpcb * -tcp_close(tp) - register struct tcpcb *tp; +tcp_close(struct tcpcb *tp) { struct inpcb *inp = tp->t_inpcb; struct socket *so = inp->inp_socket; @@ -1199,11 +1239,11 @@ tcp_close(tp) int dosavessthresh; /* tcp_close was called previously, bail */ - if (inp->inp_ppcb == NULL) - return(NULL); + if (inp->inp_ppcb == NULL) + return (NULL); tcp_canceltimers(tp); - KERNEL_DEBUG(DBG_FNC_TCP_CLOSE | DBG_FUNC_START, tp,0,0,0,0); + 
KERNEL_DEBUG(DBG_FNC_TCP_CLOSE | DBG_FUNC_START, tp, 0, 0, 0, 0); /* * If another thread for this tcp is currently in ip (indicated by @@ -1247,7 +1287,7 @@ tcp_close(tp) * update anything that the user "locked". */ if (tp->t_rttupdated >= 16) { - register u_int32_t i = 0; + u_int32_t i = 0; #if INET6 if (isipv6) { @@ -1317,8 +1357,8 @@ tcp_close(tp) else dosavessthresh = (i < so->so_snd.sb_hiwat / 2); if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 && - i != 0 && rt->rt_rmx.rmx_ssthresh != 0) - || dosavessthresh) { + i != 0 && rt->rt_rmx.rmx_ssthresh != 0) || + dosavessthresh) { /* * convert the limit from user data bytes to * packets then to packet data bytes. @@ -1328,14 +1368,10 @@ tcp_close(tp) i = 2; i *= (u_int32_t)(tp->t_maxseg + #if INET6 - (isipv6 ? sizeof (struct ip6_hdr) + - sizeof (struct tcphdr) : -#endif - sizeof (struct tcpiphdr) -#if INET6 - ) -#endif - ); + isipv6 ? sizeof (struct ip6_hdr) + + sizeof (struct tcphdr) : +#endif /* INET6 */ + sizeof (struct tcpiphdr)); if (rt->rt_rmx.rmx_ssthresh) rt->rt_rmx.rmx_ssthresh = (rt->rt_rmx.rmx_ssthresh + i) / 2; @@ -1448,6 +1484,10 @@ tcp_close(tp) } tcp_free_sackholes(tp); + tcp_notify_ack_free(tp); + + inp_decr_sndbytes_allunsent(so, tp->snd_una); + if (tp->t_bwmeas != NULL) { tcp_bwmeas_free(tp); } @@ -1472,13 +1512,14 @@ tcp_close(tp) tp->t_state = TCPS_CLOSED; - /* Issue a wakeup before detach so that we don't miss + /* + * Issue a wakeup before detach so that we don't miss * a wakeup */ sodisconnectwakeup(so); - /* - * Clean up any LRO state + /* + * Clean up any LRO state */ if (tp->t_flagsext & TF_LRO_OFFLOADED) { tcp_lro_remove_state(inp->inp_laddr, inp->inp_faddr, @@ -1523,23 +1564,23 @@ tcp_close(tp) #endif /* INET6 */ in_pcbdetach(inp); - /* Call soisdisconnected after detach because it might unlock the socket */ + /* + * Call soisdisconnected after detach because it might unlock the socket + */ soisdisconnected(so); tcpstat.tcps_closed++; KERNEL_DEBUG(DBG_FNC_TCP_CLOSE | DBG_FUNC_END, 
tcpstat.tcps_closed, 0, 0, 0, 0); - return(NULL); + return (NULL); } int -tcp_freeq(tp) - struct tcpcb *tp; +tcp_freeq(struct tcpcb *tp) { - - register struct tseg_qent *q; + struct tseg_qent *q; int rv = 0; - while((q = LIST_FIRST(&tp->t_segq)) != NULL) { + while ((q = LIST_FIRST(&tp->t_segq)) != NULL) { LIST_REMOVE(q, tqe_q); m_freem(q->tqe_m); zfree(tcp_reass_zone, q); @@ -1557,12 +1598,12 @@ tcp_freeq(tp) * Do it next time if the pcbinfo lock is in use */ void -tcp_drain() +tcp_drain(void) { struct inpcb *inp; struct tcpcb *tp; - if (!lck_rw_try_lock_exclusive(tcbinfo.ipi_lock)) + if (!lck_rw_try_lock_exclusive(tcbinfo.ipi_lock)) return; LIST_FOREACH(inp, tcbinfo.ipi_listhead, inp_list) { @@ -1574,10 +1615,10 @@ tcp_drain() /* lost a race, try the next one */ tcp_unlock(inp->inp_socket, 1, 0); continue; - } + } tp = intotcpcb(inp); - if (do_tcpdrain) + if (do_tcpdrain) tcp_freeq(tp); so_drain_extended_bk_idle(inp->inp_socket); @@ -1598,13 +1639,11 @@ tcp_drain() * reporting soft errors (yet - a kqueue filter may be added). */ static void -tcp_notify(inp, error) - struct inpcb *inp; - int error; +tcp_notify(struct inpcb *inp, int error) { struct tcpcb *tp; - if (inp == NULL || (inp->inp_state == INPCB_STATE_DEAD)) + if (inp == NULL || (inp->inp_state == INPCB_STATE_DEAD)) return; /* pcb is gone already */ tp = (struct tcpcb *)inp->inp_ppcb; @@ -1617,8 +1656,8 @@ tcp_notify(inp, error) * can never complete. 
*/ if (tp->t_state == TCPS_ESTABLISHED && - (error == EHOSTUNREACH || error == ENETUNREACH || - error == EHOSTDOWN)) { + (error == EHOSTUNREACH || error == ENETUNREACH || + error == EHOSTDOWN)) { return; } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 && tp->t_softerror) @@ -1632,30 +1671,56 @@ tcp_notify(inp, error) #endif } -struct bwmeas* +struct bwmeas * tcp_bwmeas_alloc(struct tcpcb *tp) { struct bwmeas *elm; elm = zalloc(tcp_bwmeas_zone); if (elm == NULL) - return(elm); + return (elm); bzero(elm, bwmeas_elm_size); elm->bw_minsizepkts = TCP_BWMEAS_BURST_MINSIZE; elm->bw_maxsizepkts = TCP_BWMEAS_BURST_MAXSIZE; elm->bw_minsize = elm->bw_minsizepkts * tp->t_maxseg; elm->bw_maxsize = elm->bw_maxsizepkts * tp->t_maxseg; - return(elm); + return (elm); } void -tcp_bwmeas_free(struct tcpcb* tp) +tcp_bwmeas_free(struct tcpcb *tp) { zfree(tcp_bwmeas_zone, tp->t_bwmeas); tp->t_bwmeas = NULL; tp->t_flagsext &= ~(TF_MEASURESNDBW); } +int +get_tcp_inp_list(struct inpcb **inp_list, int n, inp_gen_t gencnt) +{ + struct tcpcb *tp; + struct inpcb *inp; + int i = 0; + + LIST_FOREACH(inp, tcbinfo.ipi_listhead, inp_list) { + if (inp->inp_gencnt <= gencnt && + inp->inp_state != INPCB_STATE_DEAD) + inp_list[i++] = inp; + if (i >= n) + break; + } + + TAILQ_FOREACH(tp, &tcp_tw_tailq, t_twentry) { + inp = tp->t_inpcb; + if (inp->inp_gencnt <= gencnt && + inp->inp_state != INPCB_STATE_DEAD) + inp_list[i++] = inp; + if (i >= n) + break; + } + return (i); +} + /* * tcpcb_to_otcpcb copies specific bits of a tcpcb to a otcpcb format. * The otcpcb data structure is passed to user space and must not change. 
@@ -1669,7 +1734,8 @@ tcpcb_to_otcpcb(struct tcpcb *tp, struct otcpcb *otp) otp->t_timer[TCPT_PERSIST_EXT] = tp->t_timer[TCPT_PERSIST]; otp->t_timer[TCPT_KEEP_EXT] = tp->t_timer[TCPT_KEEP]; otp->t_timer[TCPT_2MSL_EXT] = tp->t_timer[TCPT_2MSL]; - otp->t_inpcb = (_TCPCB_PTR(struct inpcb *))VM_KERNEL_ADDRPERM(tp->t_inpcb); + otp->t_inpcb = + (_TCPCB_PTR(struct inpcb *))VM_KERNEL_ADDRPERM(tp->t_inpcb); otp->t_state = tp->t_state; otp->t_flags = tp->t_flags; otp->t_force = (tp->t_flagsext & TF_FORCE) ? 1 : 0; @@ -1724,8 +1790,7 @@ tcp_pcblist SYSCTL_HANDLER_ARGS { #pragma unused(oidp, arg1, arg2) int error, i = 0, n; - struct inpcb *inp, **inp_list; - struct tcpcb *tp; + struct inpcb **inp_list; inp_gen_t gencnt; struct xinpgen xig; @@ -1736,15 +1801,15 @@ tcp_pcblist SYSCTL_HANDLER_ARGS lck_rw_lock_shared(tcbinfo.ipi_lock); if (req->oldptr == USER_ADDR_NULL) { n = tcbinfo.ipi_count; - req->oldidx = 2 * (sizeof xig) + req->oldidx = 2 * (sizeof(xig)) + (n + n/8) * sizeof(struct xtcpcb); lck_rw_done(tcbinfo.ipi_lock); - return 0; + return (0); } if (req->newptr != USER_ADDR_NULL) { lck_rw_done(tcbinfo.ipi_lock); - return EPERM; + return (EPERM); } /* @@ -1754,70 +1819,68 @@ tcp_pcblist SYSCTL_HANDLER_ARGS n = tcbinfo.ipi_count; bzero(&xig, sizeof(xig)); - xig.xig_len = sizeof xig; + xig.xig_len = sizeof(xig); xig.xig_count = n; xig.xig_gen = gencnt; xig.xig_sogen = so_gencnt; - error = SYSCTL_OUT(req, &xig, sizeof xig); + error = SYSCTL_OUT(req, &xig, sizeof(xig)); if (error) { lck_rw_done(tcbinfo.ipi_lock); - return error; + return (error); } /* * We are done if there is no pcb */ if (n == 0) { lck_rw_done(tcbinfo.ipi_lock); - return 0; + return (0); } - inp_list = _MALLOC(n * sizeof *inp_list, M_TEMP, M_WAITOK); + inp_list = _MALLOC(n * sizeof (*inp_list), M_TEMP, M_WAITOK); if (inp_list == 0) { lck_rw_done(tcbinfo.ipi_lock); - return ENOMEM; - } - - LIST_FOREACH(inp, tcbinfo.ipi_listhead, inp_list) { - if (inp->inp_gencnt <= gencnt && - inp->inp_state != 
INPCB_STATE_DEAD) - inp_list[i++] = inp; - if (i >= n) break; - } - - TAILQ_FOREACH(tp, &tcp_tw_tailq, t_twentry) { - inp = tp->t_inpcb; - if (inp->inp_gencnt <= gencnt && - inp->inp_state != INPCB_STATE_DEAD) - inp_list[i++] = inp; - if (i >= n) break; + return (ENOMEM); } - n = i; + n = get_tcp_inp_list(inp_list, n, gencnt); error = 0; for (i = 0; i < n; i++) { + struct xtcpcb xt; + caddr_t inp_ppcb; + struct inpcb *inp; + inp = inp_list[i]; - if (inp->inp_gencnt <= gencnt && - inp->inp_state != INPCB_STATE_DEAD) { - struct xtcpcb xt; - caddr_t inp_ppcb; - - bzero(&xt, sizeof(xt)); - xt.xt_len = sizeof xt; - /* XXX should avoid extra copy */ - inpcb_to_compat(inp, &xt.xt_inp); - inp_ppcb = inp->inp_ppcb; - if (inp_ppcb != NULL) { - tcpcb_to_otcpcb( - (struct tcpcb *)(void *)inp_ppcb, - &xt.xt_tp); - } else { - bzero((char *) &xt.xt_tp, sizeof xt.xt_tp); - } - if (inp->inp_socket) - sotoxsocket(inp->inp_socket, &xt.xt_socket); - error = SYSCTL_OUT(req, &xt, sizeof xt); + + if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) + continue; + tcp_lock(inp->inp_socket, 1, NULL); + if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) { + tcp_unlock(inp->inp_socket, 1, NULL); + continue; + } + if (inp->inp_gencnt > gencnt) { + tcp_unlock(inp->inp_socket, 1, NULL); + continue; + } + + bzero(&xt, sizeof(xt)); + xt.xt_len = sizeof(xt); + /* XXX should avoid extra copy */ + inpcb_to_compat(inp, &xt.xt_inp); + inp_ppcb = inp->inp_ppcb; + if (inp_ppcb != NULL) { + tcpcb_to_otcpcb((struct tcpcb *)(void *)inp_ppcb, + &xt.xt_tp); + } else { + bzero((char *) &xt.xt_tp, sizeof(xt.xt_tp)); } + if (inp->inp_socket) + sotoxsocket(inp->inp_socket, &xt.xt_socket); + + tcp_unlock(inp->inp_socket, 1, NULL); + + error = SYSCTL_OUT(req, &xt, sizeof(xt)); } if (!error) { /* @@ -1828,15 +1891,15 @@ tcp_pcblist SYSCTL_HANDLER_ARGS * might be necessary to retry. 
*/ bzero(&xig, sizeof(xig)); - xig.xig_len = sizeof xig; + xig.xig_len = sizeof(xig); xig.xig_gen = tcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = tcbinfo.ipi_count; - error = SYSCTL_OUT(req, &xig, sizeof xig); + error = SYSCTL_OUT(req, &xig, sizeof(xig)); } FREE(inp_list, M_TEMP); lck_rw_done(tcbinfo.ipi_lock); - return error; + return (error); } SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, @@ -1847,59 +1910,59 @@ SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, static void tcpcb_to_xtcpcb64(struct tcpcb *tp, struct xtcpcb64 *otp) { - otp->t_segq = (uint32_t)VM_KERNEL_ADDRPERM(tp->t_segq.lh_first); - otp->t_dupacks = tp->t_dupacks; + otp->t_segq = (uint32_t)VM_KERNEL_ADDRPERM(tp->t_segq.lh_first); + otp->t_dupacks = tp->t_dupacks; otp->t_timer[TCPT_REXMT_EXT] = tp->t_timer[TCPT_REXMT]; otp->t_timer[TCPT_PERSIST_EXT] = tp->t_timer[TCPT_PERSIST]; otp->t_timer[TCPT_KEEP_EXT] = tp->t_timer[TCPT_KEEP]; otp->t_timer[TCPT_2MSL_EXT] = tp->t_timer[TCPT_2MSL]; - otp->t_state = tp->t_state; - otp->t_flags = tp->t_flags; - otp->t_force = (tp->t_flagsext & TF_FORCE) ? 
1 : 0; - otp->snd_una = tp->snd_una; - otp->snd_max = tp->snd_max; - otp->snd_nxt = tp->snd_nxt; - otp->snd_up = tp->snd_up; - otp->snd_wl1 = tp->snd_wl1; - otp->snd_wl2 = tp->snd_wl2; - otp->iss = tp->iss; - otp->irs = tp->irs; - otp->rcv_nxt = tp->rcv_nxt; - otp->rcv_adv = tp->rcv_adv; - otp->rcv_wnd = tp->rcv_wnd; - otp->rcv_up = tp->rcv_up; - otp->snd_wnd = tp->snd_wnd; - otp->snd_cwnd = tp->snd_cwnd; - otp->snd_ssthresh = tp->snd_ssthresh; - otp->t_maxopd = tp->t_maxopd; - otp->t_rcvtime = tp->t_rcvtime; - otp->t_starttime = tp->t_starttime; - otp->t_rtttime = tp->t_rtttime; - otp->t_rtseq = tp->t_rtseq; - otp->t_rxtcur = tp->t_rxtcur; - otp->t_maxseg = tp->t_maxseg; - otp->t_srtt = tp->t_srtt; - otp->t_rttvar = tp->t_rttvar; - otp->t_rxtshift = tp->t_rxtshift; - otp->t_rttmin = tp->t_rttmin; - otp->t_rttupdated = tp->t_rttupdated; - otp->max_sndwnd = tp->max_sndwnd; - otp->t_softerror = tp->t_softerror; - otp->t_oobflags = tp->t_oobflags; - otp->t_iobc = tp->t_iobc; - otp->snd_scale = tp->snd_scale; - otp->rcv_scale = tp->rcv_scale; - otp->request_r_scale = tp->request_r_scale; - otp->requested_s_scale = tp->requested_s_scale; - otp->ts_recent = tp->ts_recent; - otp->ts_recent_age = tp->ts_recent_age; - otp->last_ack_sent = tp->last_ack_sent; - otp->cc_send = 0; - otp->cc_recv = 0; - otp->snd_recover = tp->snd_recover; - otp->snd_cwnd_prev = tp->snd_cwnd_prev; - otp->snd_ssthresh_prev = tp->snd_ssthresh_prev; - otp->t_badrxtwin = 0; + otp->t_state = tp->t_state; + otp->t_flags = tp->t_flags; + otp->t_force = (tp->t_flagsext & TF_FORCE) ? 
1 : 0; + otp->snd_una = tp->snd_una; + otp->snd_max = tp->snd_max; + otp->snd_nxt = tp->snd_nxt; + otp->snd_up = tp->snd_up; + otp->snd_wl1 = tp->snd_wl1; + otp->snd_wl2 = tp->snd_wl2; + otp->iss = tp->iss; + otp->irs = tp->irs; + otp->rcv_nxt = tp->rcv_nxt; + otp->rcv_adv = tp->rcv_adv; + otp->rcv_wnd = tp->rcv_wnd; + otp->rcv_up = tp->rcv_up; + otp->snd_wnd = tp->snd_wnd; + otp->snd_cwnd = tp->snd_cwnd; + otp->snd_ssthresh = tp->snd_ssthresh; + otp->t_maxopd = tp->t_maxopd; + otp->t_rcvtime = tp->t_rcvtime; + otp->t_starttime = tp->t_starttime; + otp->t_rtttime = tp->t_rtttime; + otp->t_rtseq = tp->t_rtseq; + otp->t_rxtcur = tp->t_rxtcur; + otp->t_maxseg = tp->t_maxseg; + otp->t_srtt = tp->t_srtt; + otp->t_rttvar = tp->t_rttvar; + otp->t_rxtshift = tp->t_rxtshift; + otp->t_rttmin = tp->t_rttmin; + otp->t_rttupdated = tp->t_rttupdated; + otp->max_sndwnd = tp->max_sndwnd; + otp->t_softerror = tp->t_softerror; + otp->t_oobflags = tp->t_oobflags; + otp->t_iobc = tp->t_iobc; + otp->snd_scale = tp->snd_scale; + otp->rcv_scale = tp->rcv_scale; + otp->request_r_scale = tp->request_r_scale; + otp->requested_s_scale = tp->requested_s_scale; + otp->ts_recent = tp->ts_recent; + otp->ts_recent_age = tp->ts_recent_age; + otp->last_ack_sent = tp->last_ack_sent; + otp->cc_send = 0; + otp->cc_recv = 0; + otp->snd_recover = tp->snd_recover; + otp->snd_cwnd_prev = tp->snd_cwnd_prev; + otp->snd_ssthresh_prev = tp->snd_ssthresh_prev; + otp->t_badrxtwin = 0; } @@ -1907,95 +1970,97 @@ static int tcp_pcblist64 SYSCTL_HANDLER_ARGS { #pragma unused(oidp, arg1, arg2) - int error, i = 0, n; - struct inpcb *inp, **inp_list; - struct tcpcb *tp; - inp_gen_t gencnt; - struct xinpgen xig; - - /* - * The process of preparing the TCB list is too time-consuming and - * resource-intensive to repeat twice on every request. 
- */ - lck_rw_lock_shared(tcbinfo.ipi_lock); - if (req->oldptr == USER_ADDR_NULL) { - n = tcbinfo.ipi_count; - req->oldidx = 2 * (sizeof xig) - + (n + n/8) * sizeof(struct xtcpcb64); - lck_rw_done(tcbinfo.ipi_lock); - return 0; - } - - if (req->newptr != USER_ADDR_NULL) { - lck_rw_done(tcbinfo.ipi_lock); - return EPERM; - } - - /* - * OK, now we're committed to doing something. - */ - gencnt = tcbinfo.ipi_gencnt; - n = tcbinfo.ipi_count; - - bzero(&xig, sizeof(xig)); - xig.xig_len = sizeof xig; - xig.xig_count = n; - xig.xig_gen = gencnt; - xig.xig_sogen = so_gencnt; - error = SYSCTL_OUT(req, &xig, sizeof xig); - if (error) { - lck_rw_done(tcbinfo.ipi_lock); - return error; - } - /* - * We are done if there is no pcb - */ - if (n == 0) { - lck_rw_done(tcbinfo.ipi_lock); - return 0; - } - - inp_list = _MALLOC(n * sizeof *inp_list, M_TEMP, M_WAITOK); - if (inp_list == 0) { - lck_rw_done(tcbinfo.ipi_lock); - return ENOMEM; - } + int error, i = 0, n; + struct inpcb **inp_list; + inp_gen_t gencnt; + struct xinpgen xig; - LIST_FOREACH(inp, tcbinfo.ipi_listhead, inp_list) { - if (inp->inp_gencnt <= gencnt && - inp->inp_state != INPCB_STATE_DEAD) - inp_list[i++] = inp; - if (i >= n) break; - } + /* + * The process of preparing the TCB list is too time-consuming and + * resource-intensive to repeat twice on every request. 
+ */ + lck_rw_lock_shared(tcbinfo.ipi_lock); + if (req->oldptr == USER_ADDR_NULL) { + n = tcbinfo.ipi_count; + req->oldidx = 2 * (sizeof(xig)) + + (n + n/8) * sizeof(struct xtcpcb64); + lck_rw_done(tcbinfo.ipi_lock); + return (0); + } - TAILQ_FOREACH(tp, &tcp_tw_tailq, t_twentry) { - inp = tp->t_inpcb; - if (inp->inp_gencnt <= gencnt && - inp->inp_state != INPCB_STATE_DEAD) - inp_list[i++] = inp; - if (i >= n) break; - } - - n = i; - - error = 0; - for (i = 0; i < n; i++) { - inp = inp_list[i]; - if (inp->inp_gencnt <= gencnt && inp->inp_state != INPCB_STATE_DEAD) { - struct xtcpcb64 xt; - - bzero(&xt, sizeof(xt)); - xt.xt_len = sizeof xt; - inpcb_to_xinpcb64(inp, &xt.xt_inpcb); - xt.xt_inpcb.inp_ppcb = (uint64_t)VM_KERNEL_ADDRPERM(inp->inp_ppcb); - if (inp->inp_ppcb != NULL) - tcpcb_to_xtcpcb64((struct tcpcb *)inp->inp_ppcb, &xt); - if (inp->inp_socket) - sotoxsocket64(inp->inp_socket, &xt.xt_inpcb.xi_socket); - error = SYSCTL_OUT(req, &xt, sizeof xt); - } - } - if (!error) { + if (req->newptr != USER_ADDR_NULL) { + lck_rw_done(tcbinfo.ipi_lock); + return (EPERM); + } + + /* + * OK, now we're committed to doing something. 
+ */ + gencnt = tcbinfo.ipi_gencnt; + n = tcbinfo.ipi_count; + + bzero(&xig, sizeof(xig)); + xig.xig_len = sizeof(xig); + xig.xig_count = n; + xig.xig_gen = gencnt; + xig.xig_sogen = so_gencnt; + error = SYSCTL_OUT(req, &xig, sizeof(xig)); + if (error) { + lck_rw_done(tcbinfo.ipi_lock); + return (error); + } + /* + * We are done if there is no pcb + */ + if (n == 0) { + lck_rw_done(tcbinfo.ipi_lock); + return (0); + } + + inp_list = _MALLOC(n * sizeof (*inp_list), M_TEMP, M_WAITOK); + if (inp_list == 0) { + lck_rw_done(tcbinfo.ipi_lock); + return (ENOMEM); + } + + n = get_tcp_inp_list(inp_list, n, gencnt); + + error = 0; + for (i = 0; i < n; i++) { + struct xtcpcb64 xt; + struct inpcb *inp; + + inp = inp_list[i]; + + if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) + continue; + tcp_lock(inp->inp_socket, 1, NULL); + if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) { + tcp_unlock(inp->inp_socket, 1, NULL); + continue; + } + if (inp->inp_gencnt > gencnt) { + tcp_unlock(inp->inp_socket, 1, NULL); + continue; + } + + bzero(&xt, sizeof(xt)); + xt.xt_len = sizeof(xt); + inpcb_to_xinpcb64(inp, &xt.xt_inpcb); + xt.xt_inpcb.inp_ppcb = + (uint64_t)VM_KERNEL_ADDRPERM(inp->inp_ppcb); + if (inp->inp_ppcb != NULL) + tcpcb_to_xtcpcb64((struct tcpcb *)inp->inp_ppcb, + &xt); + if (inp->inp_socket) + sotoxsocket64(inp->inp_socket, + &xt.xt_inpcb.xi_socket); + + tcp_unlock(inp->inp_socket, 1, NULL); + + error = SYSCTL_OUT(req, &xt, sizeof(xt)); + } + if (!error) { /* * Give the user an updated idea of our state. * If the generation differs from what we told @@ -2004,20 +2069,20 @@ tcp_pcblist64 SYSCTL_HANDLER_ARGS * might be necessary to retry. 
*/ bzero(&xig, sizeof(xig)); - xig.xig_len = sizeof xig; + xig.xig_len = sizeof(xig); xig.xig_gen = tcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = tcbinfo.ipi_count; - error = SYSCTL_OUT(req, &xig, sizeof xig); - } - FREE(inp_list, M_TEMP); - lck_rw_done(tcbinfo.ipi_lock); - return error; + error = SYSCTL_OUT(req, &xig, sizeof(xig)); + } + FREE(inp_list, M_TEMP); + lck_rw_done(tcbinfo.ipi_lock); + return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, pcblist64, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, - tcp_pcblist64, "S,xtcpcb64", "List of active TCP connections"); + tcp_pcblist64, "S,xtcpcb64", "List of active TCP connections"); static int @@ -2025,43 +2090,136 @@ tcp_pcblist_n SYSCTL_HANDLER_ARGS { #pragma unused(oidp, arg1, arg2) int error = 0; - + error = get_pcblist_n(IPPROTO_TCP, req, &tcbinfo); - - return error; + + return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, pcblist_n, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, - tcp_pcblist_n, "S,xtcpcb_n", "List of active TCP connections"); + tcp_pcblist_n, "S,xtcpcb_n", "List of active TCP connections"); __private_extern__ void tcp_get_ports_used(uint32_t ifindex, int protocol, uint32_t flags, bitstr_t *bitfield) { - inpcb_get_ports_used(ifindex, protocol, flags, + inpcb_get_ports_used(ifindex, protocol, flags, bitfield, &tcbinfo); } __private_extern__ uint32_t tcp_count_opportunistic(unsigned int ifindex, u_int32_t flags) { - return inpcb_count_opportunistic(ifindex, &tcbinfo, flags); + return (inpcb_count_opportunistic(ifindex, &tcbinfo, flags)); } __private_extern__ uint32_t tcp_find_anypcb_byaddr(struct ifaddr *ifa) { - return inpcb_find_anypcb_byaddr(ifa, &tcbinfo); + return (inpcb_find_anypcb_byaddr(ifa, &tcbinfo)); +} + +static void +tcp_handle_msgsize(struct ip *ip, struct inpcb *inp) +{ + struct rtentry *rt = NULL; + u_short ifscope = IFSCOPE_NONE; + int mtu; + struct sockaddr_in icmpsrc = { + sizeof (struct sockaddr_in), + AF_INET, 0, { 0 }, + { 0, 0, 0, 0, 0, 0, 
0, 0 } }; + struct icmp *icp = NULL; + + icp = (struct icmp *)(void *) + ((caddr_t)ip - offsetof(struct icmp, icmp_ip)); + + icmpsrc.sin_addr = icp->icmp_ip.ip_dst; + + /* + * MTU discovery: + * If we got a needfrag and there is a host route to the + * original destination, and the MTU is not locked, then + * set the MTU in the route to the suggested new value + * (if given) and then notify as usual. The ULPs will + * notice that the MTU has changed and adapt accordingly. + * If no new MTU was suggested, then we guess a new one + * less than the current value. If the new MTU is + * unreasonably small (defined by sysctl tcp_minmss), then + * we reset the MTU to the interface value and enable the + * lock bit, indicating that we are no longer doing MTU + * discovery. + */ + if (ROUTE_UNUSABLE(&(inp->inp_route)) == false) + rt = inp->inp_route.ro_rt; + + /* + * icmp6_mtudisc_update scopes the routing lookup + * to the incoming interface (delivered from mbuf + * packet header. + * That is mostly ok but for asymmetric networks + * that may be an issue. + * Frag needed OR Packet too big really communicates + * MTU for the out data path. + * Take the interface scope from cached route or + * the last outgoing interface from inp + */ + if (rt != NULL) + ifscope = (rt->rt_ifp != NULL) ? + rt->rt_ifp->if_index : IFSCOPE_NONE; + else + ifscope = (inp->inp_last_outifp != NULL) ? 
+ inp->inp_last_outifp->if_index : IFSCOPE_NONE; + + if ((rt == NULL) || + !(rt->rt_flags & RTF_HOST) || + (rt->rt_flags & (RTF_CLONING | RTF_PRCLONING))) { + rt = rtalloc1_scoped((struct sockaddr *)&icmpsrc, 0, + RTF_CLONING | RTF_PRCLONING, ifscope); + } else if (rt) { + RT_LOCK(rt); + rtref(rt); + RT_UNLOCK(rt); + } + + if (rt != NULL) { + RT_LOCK(rt); + if ((rt->rt_flags & RTF_HOST) && + !(rt->rt_rmx.rmx_locks & RTV_MTU)) { + mtu = ntohs(icp->icmp_nextmtu); + /* + * XXX Stock BSD has changed the following + * to compare with icp->icmp_ip.ip_len + * to converge faster when sent packet + * < route's MTU. We may want to adopt + * that change. + */ + if (mtu == 0) + mtu = ip_next_mtu(rt->rt_rmx. + rmx_mtu, 1); +#if DEBUG_MTUDISC + printf("MTU for %s reduced to %d\n", + inet_ntop(AF_INET, + &icmpsrc.sin_addr, ipv4str, + sizeof (ipv4str)), mtu); +#endif + if (mtu < max(296, (tcp_minmss + + sizeof (struct tcpiphdr)))) { + rt->rt_rmx.rmx_locks |= RTV_MTU; + } else if (rt->rt_rmx.rmx_mtu > mtu) { + rt->rt_rmx.rmx_mtu = mtu; + } + } + RT_UNLOCK(rt); + rtfree(rt); + } } void -tcp_ctlinput(cmd, sa, vip) - int cmd; - struct sockaddr *sa; - void *vip; +tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip) { tcp_seq icmp_tcp_seq; struct ip *ip = vip; @@ -2089,7 +2247,7 @@ tcp_ctlinput(cmd, sa, vip) } else if (cmd == PRC_HOSTDEAD) ip = 0; /* Source quench is deprecated */ - else if (cmd == PRC_QUENCH) + else if (cmd == PRC_QUENCH) return; else if (inetctlerrmap[cmd] == 0) return; @@ -2099,13 +2257,17 @@ tcp_ctlinput(cmd, sa, vip) icp = (struct icmp *)(void *) ((caddr_t)ip - offsetof(struct icmp, icmp_ip)); - bcopy(((caddr_t)ip + (IP_VHL_HL(ip->ip_vhl) << 2)), - &th, sizeof (th)); + /* + * Only the first 8 bytes of TCP header will be returned. 
+ */ + bzero(&th, sizeof(th)); + bcopy(((caddr_t)ip + (IP_VHL_HL(ip->ip_vhl) << 2)), &th, 8); inp = in_pcblookup_hash(&tcbinfo, faddr, th.th_dport, ip->ip_src, th.th_sport, 0, NULL); if (inp != NULL && inp->inp_socket != NULL) { tcp_lock(inp->inp_socket, 1, 0); - if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) { + if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == + WNT_STOPUSING) { tcp_unlock(inp->inp_socket, 1, 0); return; } @@ -2113,57 +2275,8 @@ tcp_ctlinput(cmd, sa, vip) tp = intotcpcb(inp); if (SEQ_GEQ(icmp_tcp_seq, tp->snd_una) && SEQ_LT(icmp_tcp_seq, tp->snd_max)) { - if (cmd == PRC_MSGSIZE) { - - /* - * MTU discovery: - * If we got a needfrag and there is a host route to the - * original destination, and the MTU is not locked, then - * set the MTU in the route to the suggested new value - * (if given) and then notify as usual. The ULPs will - * notice that the MTU has changed and adapt accordingly. - * If no new MTU was suggested, then we guess a new one - * less than the current value. If the new MTU is - * unreasonably small (defined by sysctl tcp_minmss), then - * we reset the MTU to the interface value and enable the - * lock bit, indicating that we are no longer doing MTU - * discovery. - */ - struct rtentry *rt; - int mtu; - struct sockaddr_in icmpsrc = { sizeof (struct sockaddr_in), AF_INET, - 0 , { 0 }, { 0,0,0,0,0,0,0,0 } }; - icmpsrc.sin_addr = icp->icmp_ip.ip_dst; - - rt = rtalloc1((struct sockaddr *)&icmpsrc, 0, - RTF_CLONING | RTF_PRCLONING); - if (rt != NULL) { - RT_LOCK(rt); - if ((rt->rt_flags & RTF_HOST) && - !(rt->rt_rmx.rmx_locks & RTV_MTU)) { - mtu = ntohs(icp->icmp_nextmtu); - if (!mtu) - mtu = ip_next_mtu(rt->rt_rmx. 
- rmx_mtu, 1); -#if DEBUG_MTUDISC - printf("MTU for %s reduced to %d\n", - inet_ntop(AF_INET, - &icmpsrc.sin_addr, ipv4str, - sizeof (ipv4str)), mtu); -#endif - if (mtu < max(296, (tcp_minmss + - sizeof (struct tcpiphdr)))) { - /* rt->rt_rmx.rmx_mtu = - rt->rt_ifp->if_mtu; */ - rt->rt_rmx.rmx_locks |= RTV_MTU; - } else if (rt->rt_rmx.rmx_mtu > mtu) { - rt->rt_rmx.rmx_mtu = mtu; - } - } - RT_UNLOCK(rt); - rtfree(rt); - } - } + if (cmd == PRC_MSGSIZE) + tcp_handle_msgsize(ip, inp); (*notify)(inp, inetctlerrmap[cmd]); } @@ -2175,10 +2288,7 @@ tcp_ctlinput(cmd, sa, vip) #if INET6 void -tcp6_ctlinput(cmd, sa, d) - int cmd; - struct sockaddr *sa; - void *d; +tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d) { struct tcphdr th; void (*notify)(struct inpcb *, int) = tcp_notify; @@ -2204,7 +2314,7 @@ tcp6_ctlinput(cmd, sa, d) else if (!PRC_IS_REDIRECT(cmd) && (inet6ctlerrmap[cmd] == 0)) return; /* Source quench is deprecated */ - else if (cmd == PRC_QUENCH) + else if (cmd == PRC_QUENCH) return; /* if the parameter is from icmp6, decode it. */ @@ -2252,7 +2362,7 @@ tcp6_ctlinput(cmd, sa, d) * 1. In SYN-ACK packets. * 2. In SYN packets. * - * The ISNs in SYN-ACK packets have no monotonicity requirement, + * The ISNs in SYN-ACK packets have no monotonicity requirement, * and should be as unpredictable as possible to avoid the possibility * of spoofing and/or connection hijacking. To satisfy this * requirement, SYN-ACK ISNs are generated via the arc4random() @@ -2288,11 +2398,10 @@ tcp6_ctlinput(cmd, sa, d) * */ -#define ISN_BYTES_PER_SECOND 1048576 +#define ISN_BYTES_PER_SECOND 1048576 tcp_seq -tcp_new_isn(tp) - struct tcpcb *tp; +tcp_new_isn(struct tcpcb *tp) { u_int32_t md5_buffer[4]; tcp_seq new_isn; @@ -2302,19 +2411,19 @@ tcp_new_isn(tp) MD5_CTX isn_ctx; /* Use arc4random for SYN-ACKs when not in exact RFC1948 mode. 
*/ - if (((tp->t_state == TCPS_LISTEN) || (tp->t_state == TCPS_TIME_WAIT)) - && tcp_strict_rfc1948 == 0) + if (((tp->t_state == TCPS_LISTEN) || (tp->t_state == TCPS_TIME_WAIT)) && + tcp_strict_rfc1948 == 0) #ifdef __APPLE__ - return RandomULong(); + return (RandomULong()); #else - return arc4random(); + return (arc4random()); #endif getmicrotime(&timenow); /* Seed if this is the first use, reseed if requested. */ if ((isn_last_reseed == 0) || ((tcp_strict_rfc1948 == 0) && (tcp_isn_reseed_interval > 0) && - (((u_int)isn_last_reseed + (u_int)tcp_isn_reseed_interval*hz) + (((u_int)isn_last_reseed + (u_int)tcp_isn_reseed_interval*hz) < (u_int)timenow.tv_sec))) { #ifdef __APPLE__ read_random(&isn_secret, sizeof(isn_secret)); @@ -2323,30 +2432,32 @@ tcp_new_isn(tp) #endif isn_last_reseed = timenow.tv_sec; } - + /* Compute the md5 hash and return the ISN. */ MD5Init(&isn_ctx); - MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_fport, sizeof(u_short)); - MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_lport, sizeof(u_short)); + MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_fport, + sizeof(u_short)); + MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_lport, + sizeof(u_short)); #if INET6 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) { MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr, - sizeof(struct in6_addr)); + sizeof(struct in6_addr)); MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_laddr, - sizeof(struct in6_addr)); + sizeof(struct in6_addr)); } else #endif { MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_faddr, - sizeof(struct in_addr)); + sizeof(struct in_addr)); MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_laddr, - sizeof(struct in_addr)); + sizeof(struct in_addr)); } MD5Update(&isn_ctx, (u_char *) &isn_secret, sizeof(isn_secret)); MD5Final((u_char *) &md5_buffer, &isn_ctx); new_isn = (tcp_seq) md5_buffer[0]; new_isn += timenow.tv_sec * (ISN_BYTES_PER_SECOND / hz); - return new_isn; + return (new_isn); } @@ -2356,9 +2467,7 @@ tcp_new_isn(tp) * is 
controlled by the icmp_may_rst sysctl. */ void -tcp_drop_syn_sent(inp, errno) - struct inpcb *inp; - int errno; +tcp_drop_syn_sent(struct inpcb *inp, int errno) { struct tcpcb *tp = intotcpcb(inp); @@ -2385,8 +2494,13 @@ tcp_mtudisc( int offered; int mss; u_int32_t mtu; + u_int32_t protoHdrOverhead = sizeof (struct tcpiphdr); #if INET6 int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; + + if (isipv6) + protoHdrOverhead = sizeof(struct ip6_hdr) + + sizeof(struct tcphdr); #endif /* INET6 */ if (tp) { @@ -2395,7 +2509,7 @@ tcp_mtudisc( rt = tcp_rtlookup6(inp, IFSCOPE_NONE); else #endif /* INET6 */ - rt = tcp_rtlookup(inp, IFSCOPE_NONE); + rt = tcp_rtlookup(inp, IFSCOPE_NONE); if (!rt || !rt->rt_rmx.rmx_mtu) { tp->t_maxopd = tp->t_maxseg = #if INET6 @@ -2419,17 +2533,7 @@ tcp_mtudisc( // Adjust MTU if necessary. mtu = necp_socket_get_effective_mtu(inp, mtu); #endif /* NECP */ - - mss = mtu - -#if INET6 - (isipv6 ? - sizeof(struct ip6_hdr) + sizeof(struct tcphdr) : -#endif /* INET6 */ - sizeof(struct tcpiphdr) -#if INET6 - ) -#endif /* INET6 */ - ; + mss = mtu - protoHdrOverhead; if (offered) mss = min(mss, offered); @@ -2467,7 +2571,8 @@ tcp_mtudisc( tp->t_maxseg = mss; /* - * Reset the slow-start flight size as it may depends on the new MSS + * Reset the slow-start flight size as it may depends on the + * new MSS */ if (CC_ALGO(tp)->cwnd_init != NULL) CC_ALGO(tp)->cwnd_init(tp); @@ -2486,9 +2591,7 @@ tcp_mtudisc( * hold the rtentry lock; the caller is responsible for unlocking. 
*/ struct rtentry * -tcp_rtlookup(inp, input_ifscope) - struct inpcb *inp; - unsigned int input_ifscope; +tcp_rtlookup(struct inpcb *inp, unsigned int input_ifscope) { struct route *ro; struct rtentry *rt; @@ -2541,23 +2644,14 @@ tcp_rtlookup(inp, input_ifscope) * disabled) */ - tp = intotcpcb(inp); + tp = intotcpcb(inp); - if (!path_mtu_discovery || ((rt != NULL) && - (!(rt->rt_flags & RTF_UP) || (rt->rt_rmx.rmx_locks & RTV_MTU)))) + if (!path_mtu_discovery || ((rt != NULL) && + (!(rt->rt_flags & RTF_UP) || (rt->rt_rmx.rmx_locks & RTV_MTU)))) tp->t_flags &= ~TF_PMTUD; else tp->t_flags |= TF_PMTUD; -#if CONFIG_IFEF_NOWINDOWSCALE - if (tcp_obey_ifef_nowindowscale && - tp->t_state == TCPS_SYN_SENT && rt != NULL && rt->rt_ifp != NULL && - (rt->rt_ifp->if_eflags & IFEF_NOWINDOWSCALE)) { - /* Window scaling is enabled on this interface */ - tp->t_flags &= ~TF_REQ_SCALE; - } -#endif - if (rt != NULL && rt->rt_ifp != NULL) { somultipages(inp->inp_socket, (rt->rt_ifp->if_hwassist & IFNET_MULTIPAGES)); @@ -2565,6 +2659,8 @@ tcp_rtlookup(inp, input_ifscope) soif2kcl(inp->inp_socket, (rt->rt_ifp->if_eflags & IFEF_2KCL)); tcp_set_ecn(tp, rt->rt_ifp); + if (inp->inp_last_outifp == NULL) + inp->inp_last_outifp = rt->rt_ifp; } /* Note if the peer is local */ @@ -2574,18 +2670,16 @@ tcp_rtlookup(inp, input_ifscope) in_localaddr(inp->inp_faddr))) { tp->t_flags |= TF_LOCAL; } - + /* * Caller needs to call RT_UNLOCK(rt). */ - return rt; + return (rt); } #if INET6 struct rtentry * -tcp_rtlookup6(inp, input_ifscope) - struct inpcb *inp; - unsigned int input_ifscope; +tcp_rtlookup6(struct inpcb *inp, unsigned int input_ifscope) { struct route_in6 *ro6; struct rtentry *rt; @@ -2640,7 +2734,7 @@ tcp_rtlookup6(inp, input_ifscope) */ - tp = intotcpcb(inp); + tp = intotcpcb(inp); /* * Update MTU discovery determination. 
Don't do it if: @@ -2650,21 +2744,12 @@ tcp_rtlookup6(inp, input_ifscope) * disabled) */ - if (!path_mtu_discovery || ((rt != NULL) && - (!(rt->rt_flags & RTF_UP) || (rt->rt_rmx.rmx_locks & RTV_MTU)))) + if (!path_mtu_discovery || ((rt != NULL) && + (!(rt->rt_flags & RTF_UP) || (rt->rt_rmx.rmx_locks & RTV_MTU)))) tp->t_flags &= ~TF_PMTUD; else tp->t_flags |= TF_PMTUD; -#if CONFIG_IFEF_NOWINDOWSCALE - if (tcp_obey_ifef_nowindowscale && - tp->t_state == TCPS_SYN_SENT && rt != NULL && rt->rt_ifp != NULL && - (rt->rt_ifp->if_eflags & IFEF_NOWINDOWSCALE)) { - /* Window scaling is not enabled on this interface */ - tp->t_flags &= ~TF_REQ_SCALE; - } -#endif - if (rt != NULL && rt->rt_ifp != NULL) { somultipages(inp->inp_socket, (rt->rt_ifp->if_hwassist & IFNET_MULTIPAGES)); @@ -2672,6 +2757,8 @@ tcp_rtlookup6(inp, input_ifscope) soif2kcl(inp->inp_socket, (rt->rt_ifp->if_eflags & IFEF_2KCL)); tcp_set_ecn(tp, rt->rt_ifp); + if (inp->inp_last_outifp == NULL) + inp->inp_last_outifp = rt->rt_ifp; } /* Note if the peer is local */ @@ -2686,15 +2773,14 @@ tcp_rtlookup6(inp, input_ifscope) /* * Caller needs to call RT_UNLOCK(rt). */ - return rt; + return (rt); } #endif /* INET6 */ #if IPSEC /* compute ESP/AH header size for TCP, including outer IP header. 
*/ size_t -ipsec_hdrsiz_tcp(tp) - struct tcpcb *tp; +ipsec_hdrsiz_tcp(struct tcpcb *tp) { struct inpcb *inp; struct mbuf *m; @@ -2706,10 +2792,10 @@ ipsec_hdrsiz_tcp(tp) struct tcphdr *th; if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL)) - return 0; + return (0); MGETHDR(m, M_DONTWAIT, MT_DATA); /* MAC-OK */ if (!m) - return 0; + return (0); #if INET6 if ((inp->inp_vflag & INP_IPV6) != 0) { @@ -2721,17 +2807,17 @@ ipsec_hdrsiz_tcp(tp) hdrsiz = ipsec6_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); } else #endif /* INET6 */ - { - ip = mtod(m, struct ip *); - th = (struct tcphdr *)(ip + 1); - m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr); - tcp_fillheaders(tp, ip, th); - hdrsiz = ipsec4_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); - } + { + ip = mtod(m, struct ip *); + th = (struct tcphdr *)(ip + 1); + m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr); + tcp_fillheaders(tp, ip, th); + hdrsiz = ipsec4_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); + } m_free(m); - return hdrsiz; + return (hdrsiz); } -#endif /*IPSEC*/ +#endif /* IPSEC */ /* * Return a pointer to the cached information about the remote host. @@ -2739,8 +2825,7 @@ ipsec_hdrsiz_tcp(tp) * the route metrics. */ struct rmxp_tao * -tcp_gettaocache(inp) - struct inpcb *inp; +tcp_gettaocache(struct inpcb *inp) { struct rtentry *rt; struct rmxp_tao *taop; @@ -2758,9 +2843,9 @@ tcp_gettaocache(inp) /* Route locked during lookup above */ if (rt != NULL) RT_UNLOCK(rt); - return NULL; + return (NULL); } - + taop = rmx_taop(rt->rt_rmx); /* Route locked during lookup above */ RT_UNLOCK(rt); @@ -2776,7 +2861,7 @@ tcp_gettaocache(inp) * nothing in the cache left over. */ static void -tcp_cleartaocache() +tcp_cleartaocache(void) { } @@ -2793,14 +2878,15 @@ tcp_lock(struct socket *so, int refcount, void *lr) if (so->so_pcb != NULL) { lck_mtx_lock(&((struct inpcb *)so->so_pcb)->inpcb_mtx); } else { - panic("tcp_lock: so=%p NO PCB! lr=%p lrh= %s\n", + panic("tcp_lock: so=%p NO PCB! 
lr=%p lrh= %s\n", so, lr_saved, solockhistory_nr(so)); /* NOTREACHED */ } if (so->so_usecount < 0) { panic("tcp_lock: so=%p so_pcb=%p lr=%p ref=%x lrh= %s\n", - so, so->so_pcb, lr_saved, so->so_usecount, solockhistory_nr(so)); + so, so->so_pcb, lr_saved, so->so_usecount, + solockhistory_nr(so)); /* NOTREACHED */ } if (refcount) @@ -2831,12 +2917,12 @@ tcp_unlock(struct socket *so, int refcount, void *lr) so->so_usecount--; if (so->so_usecount < 0) { - panic("tcp_unlock: so=%p usecount=%x lrh= %s\n", + panic("tcp_unlock: so=%p usecount=%x lrh= %s\n", so, so->so_usecount, solockhistory_nr(so)); /* NOTREACHED */ } if (so->so_pcb == NULL) { - panic("tcp_unlock: so=%p NO PCB usecount=%x lr=%p lrh= %s\n", + panic("tcp_unlock: so=%p NO PCB usecount=%x lr=%p lrh= %s\n", so, so->so_usecount, lr_saved, solockhistory_nr(so)); /* NOTREACHED */ } else { @@ -2850,28 +2936,25 @@ tcp_unlock(struct socket *so, int refcount, void *lr) } lck_mtx_t * -tcp_getlock( - struct socket *so, - __unused int locktype) +tcp_getlock(struct socket *so, __unused int locktype) { struct inpcb *inp = sotoinpcb(so); if (so->so_pcb) { if (so->so_usecount < 0) - panic("tcp_getlock: so=%p usecount=%x lrh= %s\n", - so, so->so_usecount, solockhistory_nr(so)); - return(&inp->inpcb_mtx); - } - else { - panic("tcp_getlock: so=%p NULL so_pcb %s\n", + panic("tcp_getlock: so=%p usecount=%x lrh= %s\n", + so, so->so_usecount, solockhistory_nr(so)); + return (&inp->inpcb_mtx); + } else { + panic("tcp_getlock: so=%p NULL so_pcb %s\n", so, solockhistory_nr(so)); return (so->so_proto->pr_domain->dom_mtx); } } -/* +/* * Determine if we can grow the recieve socket buffer to avoid sending - * a zero window update to the peer. We allow even socket buffers that + * a zero window update to the peer. We allow even socket buffers that * have fixed size (set by the application) to grow if the resource * constraints are met. They will also be trimmed after the application * reads data. 
@@ -2906,10 +2989,10 @@ tcp_sbrcv_grow_rwin(struct tcpcb *tp, struct sockbuf *sb) int32_t tcp_sbspace(struct tcpcb *tp) { - struct sockbuf *sb = &tp->t_inpcb->inp_socket->so_rcv; + struct socket *so = tp->t_inpcb->inp_socket; + struct sockbuf *sb = &so->so_rcv; u_int32_t rcvbuf = sb->sb_hiwat; int32_t space; - struct socket *so = tp->t_inpcb->inp_socket; int32_t pending = 0; /* @@ -2926,7 +3009,7 @@ tcp_sbspace(struct tcpcb *tp) space = ((int32_t) imin((rcvbuf - sb->sb_cc), (sb->sb_mbmax - sb->sb_mbcnt))); - if (space < 0) + if (space < 0) space = 0; #if CONTENT_FILTER @@ -2938,20 +3021,21 @@ tcp_sbspace(struct tcpcb *tp) else space -= pending; - /* Avoid increasing window size if the current window + /* + * Avoid increasing window size if the current window * is already very low, we could be in "persist" mode and * we could break some apps (see rdar://5409343) */ - if (space < tp->t_maxseg) - return space; + if (space < tp->t_maxseg) + return (space); - /* Clip window size for slower link */ + /* Clip window size for slower link */ - if (((tp->t_flags & TF_SLOWLINK) != 0) && slowlink_wsize > 0 ) - return imin(space, slowlink_wsize); + if (((tp->t_flags & TF_SLOWLINK) != 0) && slowlink_wsize > 0) + return (imin(space, slowlink_wsize)); - return space; + return (space); } /* * Checks TCP Segment Offloading capability for a given connection @@ -2980,20 +3064,20 @@ tcp_set_tso(struct tcpcb *tp, struct ifnet *ifp) if (isipv6) { if (ifp && (ifp->if_hwassist & IFNET_TSO_IPV6)) { tp->t_flags |= TF_TSO; - if (ifp->if_tso_v6_mtu != 0) + if (ifp->if_tso_v6_mtu != 0) tp->tso_max_segment_size = ifp->if_tso_v6_mtu; else tp->tso_max_segment_size = TCP_MAXWIN; } else tp->t_flags &= ~TF_TSO; - } else + } else #endif /* INET6 */ { if (ifp && (ifp->if_hwassist & IFNET_TSO_IPV4)) { tp->t_flags |= TF_TSO; - if (ifp->if_tso_v4_mtu != 0) + if (ifp->if_tso_v4_mtu != 0) tp->tso_max_segment_size = ifp->if_tso_v4_mtu; else tp->tso_max_segment_size = TCP_MAXWIN; @@ -3002,18 +3086,20 @@ 
tcp_set_tso(struct tcpcb *tp, struct ifnet *ifp) } } -#define TIMEVAL_TO_TCPHZ(_tv_) ((_tv_).tv_sec * TCP_RETRANSHZ + (_tv_).tv_usec / TCP_RETRANSHZ_TO_USEC) +#define TIMEVAL_TO_TCPHZ(_tv_) ((_tv_).tv_sec * TCP_RETRANSHZ + \ + (_tv_).tv_usec / TCP_RETRANSHZ_TO_USEC) -/* Function to calculate the tcp clock. The tcp clock will get updated +/* + * Function to calculate the tcp clock. The tcp clock will get updated * at the boundaries of the tcp layer. This is done at 3 places: - * 1. Right before processing an input tcp packet + * 1. Right before processing an input tcp packet * 2. Whenever a connection wants to access the network using tcp_usrreqs * 3. When a tcp timer fires or before tcp slow timeout * */ void -calculate_tcp_clock() +calculate_tcp_clock(void) { struct timeval tv = tcp_uptime; struct timeval interval = {0, TCP_RETRANSHZ_TO_USEC}; @@ -3050,18 +3136,20 @@ calculate_tcp_clock() tcp_now += incr; } - lck_spin_unlock(tcp_uptime_lock); - } - return; + lck_spin_unlock(tcp_uptime_lock); + } } -/* Compute receive window scaling that we are going to request - * for this connection based on sb_hiwat. Try to leave some - * room to potentially increase the window size upto a maximum +/* + * Compute receive window scaling that we are going to request + * for this connection based on sb_hiwat. Try to leave some + * room to potentially increase the window size upto a maximum * defined by the constant tcp_autorcvbuf_max. */ void -tcp_set_max_rwinscale(struct tcpcb *tp, struct socket *so) { +tcp_set_max_rwinscale(struct tcpcb *tp, struct socket *so, + u_int32_t rcvbuf_max) +{ u_int32_t maxsockbufsize; if (!tcp_do_rfc1323) { tp->request_r_scale = 0; @@ -3070,7 +3158,7 @@ tcp_set_max_rwinscale(struct tcpcb *tp, struct socket *so) { tp->request_r_scale = max(tcp_win_scale, tp->request_r_scale); maxsockbufsize = ((so->so_rcv.sb_flags & SB_USRSIZE) != 0) ? 
- so->so_rcv.sb_hiwat : tcp_autorcvbuf_max; + so->so_rcv.sb_hiwat : rcvbuf_max; while (tp->request_r_scale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << tp->request_r_scale) < maxsockbufsize) @@ -3088,31 +3176,34 @@ tcp_notsent_lowat_check(struct socket *so) { tp = intotcpcb(inp); } - notsent = so->so_snd.sb_cc - + notsent = so->so_snd.sb_cc - (tp->snd_nxt - tp->snd_una); - /* When we send a FIN or SYN, not_sent can be negative. - * In that case also we need to send a write event to the + /* + * When we send a FIN or SYN, not_sent can be negative. + * In that case also we need to send a write event to the * process if it is waiting. In the FIN case, it will * get an error from send because cantsendmore will be set. */ if (notsent <= tp->t_notsent_lowat) { - return(1); + return (1); } - /* When Nagle's algorithm is not disabled, it is better + /* + * When Nagle's algorithm is not disabled, it is better * to wakeup the client until there is atleast one * maxseg of data to write. */ - if ((tp->t_flags & TF_NODELAY) == 0 && + if ((tp->t_flags & TF_NODELAY) == 0 && notsent > 0 && notsent < tp->t_maxseg) { - return(1); + return (1); } - return(0); + return (0); } void -tcp_rxtseg_insert(struct tcpcb *tp, tcp_seq start, tcp_seq end) { +tcp_rxtseg_insert(struct tcpcb *tp, tcp_seq start, tcp_seq end) +{ struct tcp_rxt_seg *rxseg = NULL, *prev = NULL, *next = NULL; u_int32_t rxcount = 0; @@ -3180,7 +3271,6 @@ tcp_rxtseg_insert(struct tcpcb *tp, tcp_seq start, tcp_seq end) { } else { SLIST_INSERT_HEAD(&tp->t_rxt_segments, rxseg, rx_link); } - return; } struct tcp_rxt_seg * @@ -3251,7 +3341,7 @@ tcp_rxtseg_dsack_for_tlp(struct tcpcb *tp) SLIST_FOREACH(rxseg, &tp->t_rxt_segments, rx_link) { if (rxseg->rx_count == 1 && - SLIST_NEXT(rxseg,rx_link) == NULL && + SLIST_NEXT(rxseg, rx_link) == NULL && (rxseg->rx_flags & TCP_RXT_DSACK_FOR_TLP)) { dsack_for_tlp = TRUE; break; @@ -3261,7 +3351,8 @@ tcp_rxtseg_dsack_for_tlp(struct tcpcb *tp) } u_int32_t -tcp_rxtseg_total_size(struct tcpcb *tp) { 
+tcp_rxtseg_total_size(struct tcpcb *tp) +{ struct tcp_rxt_seg *rxseg; u_int32_t total_size = 0; @@ -3287,16 +3378,15 @@ tcp_get_connectivity_status(struct tcpcb *tp, } if (tp->t_rtimo_probes >= TCP_CONNECTIVITY_PROBES_MAX) connstatus->read_probe_failed = 1; - if (tp->t_inpcb != NULL && tp->t_inpcb->inp_last_outifp != NULL - && (tp->t_inpcb->inp_last_outifp->if_eflags & IFEF_PROBE_CONNECTIVITY)) + if (tp->t_inpcb != NULL && tp->t_inpcb->inp_last_outifp != NULL && + (tp->t_inpcb->inp_last_outifp->if_eflags & IFEF_PROBE_CONNECTIVITY)) connstatus->probe_activated = 1; - return; } boolean_t tfo_enabled(const struct tcpcb *tp) { - return !!(tp->t_flagsext & TF_FASTOPEN); + return ((tp->t_flagsext & TF_FASTOPEN)? TRUE : FALSE); } void @@ -3305,3 +3395,422 @@ tcp_disable_tfo(struct tcpcb *tp) tp->t_flagsext &= ~TF_FASTOPEN; } +static struct mbuf * +tcp_make_keepalive_frame(struct tcpcb *tp, struct ifnet *ifp, + boolean_t is_probe) +{ + struct inpcb *inp = tp->t_inpcb; + struct tcphdr *th; + u_int8_t *data; + int win = 0; + struct mbuf *m; + + /* + * The code assumes the IP + TCP headers fit in an mbuf packet header + */ + _CASSERT(sizeof(struct ip) + sizeof(struct tcphdr) <= _MHLEN); + _CASSERT(sizeof(struct ip6_hdr) + sizeof(struct tcphdr) <= _MHLEN); + + MGETHDR(m, M_WAIT, MT_HEADER); + if (m == NULL) { + return (NULL); + } + m->m_pkthdr.pkt_proto = IPPROTO_TCP; + + data = mbuf_datastart(m); + + if (inp->inp_vflag & INP_IPV4) { + bzero(data, sizeof(struct ip) + sizeof(struct tcphdr)); + th = (struct tcphdr *)(void *) (data + sizeof(struct ip)); + m->m_len = sizeof(struct ip) + sizeof(struct tcphdr); + m->m_pkthdr.len = m->m_len; + } else { + VERIFY(inp->inp_vflag & INP_IPV6); + + bzero(data, sizeof(struct ip6_hdr) + + sizeof(struct tcphdr)); + th = (struct tcphdr *)(void *)(data + sizeof(struct ip6_hdr)); + m->m_len = sizeof(struct ip6_hdr) + + sizeof(struct tcphdr); + m->m_pkthdr.len = m->m_len; + } + + tcp_fillheaders(tp, data, th); + + if (inp->inp_vflag & INP_IPV4) 
{ + struct ip *ip; + + ip = (__typeof__(ip))(void *)data; + + ip->ip_id = ip_randomid(); + ip->ip_len = htons(sizeof(struct ip) + sizeof(struct tcphdr)); + ip->ip_ttl = inp->inp_ip_ttl; + ip->ip_tos = inp->inp_ip_tos; + ip->ip_sum = in_cksum_hdr(ip); + } else { + struct ip6_hdr *ip6; + + ip6 = (__typeof__(ip6))(void *)data; + + ip6->ip6_plen = htons(sizeof(struct tcphdr)); + ip6->ip6_hlim = in6_selecthlim(inp, ifp); + + if (IN6_IS_SCOPE_EMBED(&ip6->ip6_src)) + ip6->ip6_src.s6_addr16[1] = 0; + if (IN6_IS_SCOPE_EMBED(&ip6->ip6_dst)) + ip6->ip6_dst.s6_addr16[1] = 0; + } + th->th_flags = TH_ACK; + + win = tcp_sbspace(tp); + if (win > ((int32_t)TCP_MAXWIN << tp->rcv_scale)) + win = (int32_t)TCP_MAXWIN << tp->rcv_scale; + th->th_win = htons((u_short) (win >> tp->rcv_scale)); + + if (is_probe) { + th->th_seq = htonl(tp->snd_una - 1); + } else { + th->th_seq = htonl(tp->snd_una); + } + th->th_ack = htonl(tp->rcv_nxt); + + /* Force recompute TCP checksum to be the final value */ + th->th_sum = 0; + if (inp->inp_vflag & INP_IPV4) { + th->th_sum = inet_cksum(m, IPPROTO_TCP, + sizeof(struct ip), sizeof(struct tcphdr)); + } else { + th->th_sum = inet6_cksum(m, IPPROTO_TCP, + sizeof(struct ip6_hdr), sizeof(struct tcphdr)); + } + + return (m); +} + +void +tcp_fill_keepalive_offload_frames(ifnet_t ifp, + struct ifnet_keepalive_offload_frame *frames_array, + u_int32_t frames_array_count, size_t frame_data_offset, + u_int32_t *used_frames_count) +{ + struct inpcb *inp; + inp_gen_t gencnt; + u_int32_t frame_index = *used_frames_count; + + if (ifp == NULL || frames_array == NULL || + frames_array_count == 0 || + frame_index >= frames_array_count || + frame_data_offset >= IFNET_KEEPALIVE_OFFLOAD_FRAME_DATA_SIZE) + return; + + /* + * This function is called outside the regular TCP processing + * so we need to update the TCP clock. 
+ */ + calculate_tcp_clock(); + + lck_rw_lock_shared(tcbinfo.ipi_lock); + gencnt = tcbinfo.ipi_gencnt; + LIST_FOREACH(inp, tcbinfo.ipi_listhead, inp_list) { + struct socket *so; + struct ifnet_keepalive_offload_frame *frame; + struct mbuf *m = NULL; + struct tcpcb *tp = intotcpcb(inp); + + if (frame_index >= frames_array_count) + break; + + if (inp->inp_gencnt > gencnt || + inp->inp_state == INPCB_STATE_DEAD) + continue; + + if ((so = inp->inp_socket) == NULL || + (so->so_state & SS_DEFUNCT)) + continue; + /* + * check for keepalive offload flag without socket + * lock to avoid a deadlock + */ + if (!(inp->inp_flags2 & INP2_KEEPALIVE_OFFLOAD)) { + continue; + } + + if (!(inp->inp_vflag & (INP_IPV4 | INP_IPV6))) { + continue; + } + if (inp->inp_ppcb == NULL || + in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) + continue; + tcp_lock(so, 1, 0); + /* Release the want count */ + if (inp->inp_ppcb == NULL || + (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING)) { + tcp_unlock(so, 1, 0); + continue; + } + if ((inp->inp_vflag & INP_IPV4) && + (inp->inp_laddr.s_addr == INADDR_ANY || + inp->inp_faddr.s_addr == INADDR_ANY)) { + tcp_unlock(so, 1, 0); + continue; + } + if ((inp->inp_vflag & INP_IPV6) && + (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) || + IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr))) { + tcp_unlock(so, 1, 0); + continue; + } + if (inp->inp_lport == 0 || inp->inp_fport == 0) { + tcp_unlock(so, 1, 0); + continue; + } + if (inp->inp_last_outifp == NULL || + inp->inp_last_outifp->if_index != ifp->if_index) { + tcp_unlock(so, 1, 0); + continue; + } + if ((inp->inp_vflag & INP_IPV4) && frame_data_offset + + sizeof(struct ip) + sizeof(struct tcphdr) > + IFNET_KEEPALIVE_OFFLOAD_FRAME_DATA_SIZE) { + tcp_unlock(so, 1, 0); + continue; + } else if (!(inp->inp_vflag & INP_IPV4) && frame_data_offset + + sizeof(struct ip6_hdr) + sizeof(struct tcphdr) > + IFNET_KEEPALIVE_OFFLOAD_FRAME_DATA_SIZE) { + tcp_unlock(so, 1, 0); + continue; + } + /* + * There is no point in 
waking up the device for connections + * that are not established. Long lived connection are meant + * for processes that will sent and receive data + */ + if (tp->t_state != TCPS_ESTABLISHED) { + tcp_unlock(so, 1, 0); + continue; + } + /* + * This inp has all the information that is needed to + * generate an offload frame. + */ + frame = &frames_array[frame_index]; + frame->type = IFNET_KEEPALIVE_OFFLOAD_FRAME_TCP; + frame->ether_type = (inp->inp_vflag & INP_IPV4) ? + IFNET_KEEPALIVE_OFFLOAD_FRAME_ETHERTYPE_IPV4 : + IFNET_KEEPALIVE_OFFLOAD_FRAME_ETHERTYPE_IPV6; + frame->interval = tp->t_keepidle > 0 ? tp->t_keepidle : + tcp_keepidle; + frame->keep_cnt = TCP_CONN_KEEPCNT(tp); + frame->keep_retry = TCP_CONN_KEEPINTVL(tp); + frame->local_port = ntohs(inp->inp_lport); + frame->remote_port = ntohs(inp->inp_fport); + frame->local_seq = tp->snd_nxt; + frame->remote_seq = tp->rcv_nxt; + if (inp->inp_vflag & INP_IPV4) { + frame->length = frame_data_offset + + sizeof(struct ip) + sizeof(struct tcphdr); + frame->reply_length = frame->length; + + frame->addr_length = sizeof(struct in_addr); + bcopy(&inp->inp_laddr, frame->local_addr, + sizeof(struct in_addr)); + bcopy(&inp->inp_faddr, frame->remote_addr, + sizeof(struct in_addr)); + } else { + struct in6_addr *ip6; + + frame->length = frame_data_offset + + sizeof(struct ip6_hdr) + sizeof(struct tcphdr); + frame->reply_length = frame->length; + + frame->addr_length = sizeof(struct in6_addr); + ip6 = (struct in6_addr *)(void *)frame->local_addr; + bcopy(&inp->in6p_laddr, ip6, sizeof(struct in6_addr)); + if (IN6_IS_SCOPE_EMBED(ip6)) + ip6->s6_addr16[1] = 0; + + ip6 = (struct in6_addr *)(void *)frame->remote_addr; + bcopy(&inp->in6p_faddr, ip6, sizeof(struct in6_addr)); + if (IN6_IS_SCOPE_EMBED(ip6)) + ip6->s6_addr16[1] = 0; + } + + /* + * First the probe + */ + m = tcp_make_keepalive_frame(tp, ifp, TRUE); + if (m == NULL) { + tcp_unlock(so, 1, 0); + continue; + } + bcopy(m->m_data, frame->data + frame_data_offset, + m->m_len); + 
m_freem(m); + + /* + * Now the response packet to incoming probes + */ + m = tcp_make_keepalive_frame(tp, ifp, FALSE); + if (m == NULL) { + tcp_unlock(so, 1, 0); + continue; + } + bcopy(m->m_data, frame->reply_data + frame_data_offset, + m->m_len); + m_freem(m); + + frame_index++; + tcp_unlock(so, 1, 0); + } + lck_rw_done(tcbinfo.ipi_lock); + *used_frames_count = frame_index; +} + +errno_t +tcp_notify_ack_id_valid(struct tcpcb *tp, struct socket *so, + u_int32_t notify_id) +{ + struct tcp_notify_ack_marker *elm; + + if (so->so_snd.sb_cc == 0) + return (ENOBUFS); + + SLIST_FOREACH(elm, &tp->t_notify_ack, notify_next) { + /* Duplicate id is not allowed */ + if (elm->notify_id == notify_id) + return (EINVAL); + /* Duplicate position is not allowed */ + if (elm->notify_snd_una == tp->snd_una + so->so_snd.sb_cc) + return (EINVAL); + } + return (0); +} + +errno_t +tcp_add_notify_ack_marker(struct tcpcb *tp, u_int32_t notify_id) +{ + struct tcp_notify_ack_marker *nm, *elm = NULL; + struct socket *so = tp->t_inpcb->inp_socket; + + MALLOC(nm, struct tcp_notify_ack_marker *, sizeof (*nm), + M_TEMP, M_WAIT | M_ZERO); + if (nm == NULL) + return (ENOMEM); + nm->notify_id = notify_id; + nm->notify_snd_una = tp->snd_una + so->so_snd.sb_cc; + + SLIST_FOREACH(elm, &tp->t_notify_ack, notify_next) { + if (SEQ_GT(nm->notify_snd_una, elm->notify_snd_una)) + break; + } + + if (elm == NULL) { + VERIFY(SLIST_EMPTY(&tp->t_notify_ack)); + SLIST_INSERT_HEAD(&tp->t_notify_ack, nm, notify_next); + } else { + SLIST_INSERT_AFTER(elm, nm, notify_next); + } + tp->t_notify_ack_count++; + return (0); +} + +void +tcp_notify_ack_free(struct tcpcb *tp) +{ + struct tcp_notify_ack_marker *elm, *next; + if (SLIST_EMPTY(&tp->t_notify_ack)) + return; + + SLIST_FOREACH_SAFE(elm, &tp->t_notify_ack, notify_next, next) { + SLIST_REMOVE(&tp->t_notify_ack, elm, tcp_notify_ack_marker, + notify_next); + FREE(elm, M_TEMP); + } + SLIST_INIT(&tp->t_notify_ack); + tp->t_notify_ack_count = 0; +} + +inline void 
+tcp_notify_acknowledgement(struct tcpcb *tp, struct socket *so) +{ + struct tcp_notify_ack_marker *elm; + + elm = SLIST_FIRST(&tp->t_notify_ack); + if (SEQ_GEQ(tp->snd_una, elm->notify_snd_una)) { + soevent(so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_NOTIFY_ACK); + } +} + +void +tcp_get_notify_ack_count(struct tcpcb *tp, + struct tcp_notify_ack_complete *retid) +{ + struct tcp_notify_ack_marker *elm; + size_t complete = 0; + + SLIST_FOREACH(elm, &tp->t_notify_ack, notify_next) { + if (SEQ_GEQ(tp->snd_una, elm->notify_snd_una)) + complete++; + else + break; + } + retid->notify_pending = tp->t_notify_ack_count - complete; + retid->notify_complete_count = min(TCP_MAX_NOTIFY_ACK, complete); +} + +void +tcp_get_notify_ack_ids(struct tcpcb *tp, + struct tcp_notify_ack_complete *retid) +{ + size_t i = 0; + struct tcp_notify_ack_marker *elm, *next; + + SLIST_FOREACH_SAFE(elm, &tp->t_notify_ack, notify_next, next) { + if (i >= retid->notify_complete_count) + break; + if (SEQ_GEQ(tp->snd_una, elm->notify_snd_una)) { + retid->notify_complete_id[i++] = elm->notify_id; + SLIST_REMOVE(&tp->t_notify_ack, elm, + tcp_notify_ack_marker, notify_next); + FREE(elm, M_TEMP); + tp->t_notify_ack_count--; + } else { + break; + } + } +} + +bool +tcp_notify_ack_active(struct socket *so) +{ + if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) && + SOCK_TYPE(so) == SOCK_STREAM) { + struct tcpcb *tp = intotcpcb(sotoinpcb(so)); + + if (!SLIST_EMPTY(&tp->t_notify_ack)) { + struct tcp_notify_ack_marker *elm; + elm = SLIST_FIRST(&tp->t_notify_ack); + if (SEQ_GEQ(tp->snd_una, elm->notify_snd_una)) + return (true); + } + } + return (false); +} + +inline int32_t +inp_get_sndbytes_allunsent(struct socket *so, u_int32_t th_ack) +{ + struct inpcb *inp = sotoinpcb(so); + struct tcpcb *tp = intotcpcb(inp); + + if ((so->so_snd.sb_flags & SB_SNDBYTE_CNT) && + so->so_snd.sb_cc > 0) { + int32_t unsent, sent; + sent = tp->snd_max - th_ack; + if (tp->t_flags & TF_SENTFIN) + sent--; + unsent = so->so_snd.sb_cc 
- sent; + return (unsent); + } + return (0); +} diff --git a/bsd/netinet/tcp_timer.c b/bsd/netinet/tcp_timer.c index bda29ca82..e50bab301 100644 --- a/bsd/netinet/tcp_timer.c +++ b/bsd/netinet/tcp_timer.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2000-2015 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. 
- * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* @@ -106,29 +106,6 @@ #include #include -#define TIMERENTRY_TO_TP(te) ((struct tcpcb *)((uintptr_t)te - offsetof(struct tcpcb, tentry.le.le_next))) - -#define VERIFY_NEXT_LINK(elm,field) do { \ - if (LIST_NEXT((elm),field) != NULL && \ - LIST_NEXT((elm),field)->field.le_prev != \ - &((elm)->field.le_next)) \ - panic("Bad link elm %p next->prev != elm", (elm)); \ -} while(0) - -#define VERIFY_PREV_LINK(elm,field) do { \ - if (*(elm)->field.le_prev != (elm)) \ - panic("Bad link elm %p prev->next != elm", (elm)); \ -} while(0) - -#define TCP_SET_TIMER_MODE(mode, i) do { \ - if (IS_TIMER_HZ_10MS(i)) \ - (mode) |= TCP_TIMERLIST_10MS_MODE; \ - else if (IS_TIMER_HZ_100MS(i)) \ - (mode) |= TCP_TIMERLIST_100MS_MODE; \ - else \ - (mode) |= TCP_TIMERLIST_500MS_MODE; \ -} while(0) - /* Max number of times a stretch ack can be delayed on a connection */ #define TCP_STRETCHACK_DELAY_THRESHOLD 5 @@ -190,7 +167,7 @@ SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime"); -/* +/* * Avoid DoS via TCP Robustness in Persist Condition * (see http://www.ietf.org/id/draft-ananth-tcpm-persist-02.txt) * by allowing a system wide maximum persistence timeout value when in @@ -202,7 +179,7 @@ SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, u_int32_t tcp_max_persist_timeout = 0; SYSCTL_PROC(_net_inet_tcp, OID_AUTO, max_persist_timeout, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, - &tcp_max_persist_timeout, 0, sysctl_msec_to_ticks, "I", + &tcp_max_persist_timeout, 0, sysctl_msec_to_ticks, "I", "Maximum persistence timeout for ZWP"); static int always_keepalive = 0; @@ -212,7 +189,7 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, /* * This parameter determines how long the timer list will stay in fast or - * quick mode even though all connections are idle. In this state, the + * quick mode even though all connections are idle. 
In this state, the * timer will run more frequently anticipating new data. */ int timer_fastmode_idlemax = TCP_FASTMODE_IDLERUN_MAX; @@ -240,7 +217,7 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_timer_advanced, static int tcp_resched_timerlist = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_resched_timerlist, - CTLFLAG_RD | CTLFLAG_LOCKED, &tcp_resched_timerlist, 0, + CTLFLAG_RD | CTLFLAG_LOCKED, &tcp_resched_timerlist, 0, "Number of times timer list was rescheduled as part of processing a packet"); int tcp_pmtud_black_hole_detect = 1 ; @@ -253,13 +230,11 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_pmtud_black_hole_mss, 0, "Path MTU Discovery Black Hole Detection lowered MSS"); +static u_int32_t tcp_mss_rec_medium = 1200; +static u_int32_t tcp_mss_rec_low = 512; + #define TCP_REPORT_STATS_INTERVAL 43200 /* 12 hours, in seconds */ int tcp_report_stats_interval = TCP_REPORT_STATS_INTERVAL; -#if (DEVELOPMENT || DEBUG) -SYSCTL_INT(_net_inet_tcp, OID_AUTO, report_stats_interval, - CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_report_stats_interval, 0, - "Report stats interval"); -#endif /* (DEVELOPMENT || DEBUG) */ /* performed garbage collection of "used" sockets */ static boolean_t tcp_gc_done = FALSE; @@ -290,19 +265,9 @@ static u_int32_t tcp_run_conn_timer(struct tcpcb *tp, u_int16_t *mode, static void tcp_sched_timers(struct tcpcb *tp); static inline void tcp_set_lotimer_index(struct tcpcb *); __private_extern__ void tcp_remove_from_time_wait(struct inpcb *inp); +static inline void tcp_update_mss_core(struct tcpcb *tp, struct ifnet *ifp); __private_extern__ void tcp_report_stats(void); -/* - * Macro to compare two timers. If there is a reset of the sign bit, - * it is safe to assume that the timer has wrapped around. By doing - * signed comparision, we take care of wrap around such that the value - * with the sign bit reset is actually ahead of the other. 
- */ -inline int32_t -timer_diff(uint32_t t1, uint32_t toff1, uint32_t t2, uint32_t toff2) { - return (int32_t)((t1 + toff1) - (t2 + toff2)); -}; - static u_int64_t tcp_last_report_time; /* @@ -342,6 +307,10 @@ struct tcp_last_report_stats { u_int32_t tcps_tfo_syn_data_acked; u_int32_t tcps_tfo_syn_loss; u_int32_t tcps_tfo_blackhole; + u_int32_t tcps_tfo_cookie_wrong; + u_int32_t tcps_tfo_no_cookie_rcv; + u_int32_t tcps_tfo_heuristics_disable; + u_int32_t tcps_tfo_sndblackhole; }; @@ -355,6 +324,93 @@ struct tcp_last_report_stats { static void add_to_time_wait_locked(struct tcpcb *tp, uint32_t delay); static boolean_t tcp_garbage_collect(struct inpcb *, int); +#define TIMERENTRY_TO_TP(te) ((struct tcpcb *)((uintptr_t)te - offsetof(struct tcpcb, tentry.le.le_next))) + +#define VERIFY_NEXT_LINK(elm,field) do { \ + if (LIST_NEXT((elm),field) != NULL && \ + LIST_NEXT((elm),field)->field.le_prev != \ + &((elm)->field.le_next)) \ + panic("Bad link elm %p next->prev != elm", (elm)); \ +} while(0) + +#define VERIFY_PREV_LINK(elm,field) do { \ + if (*(elm)->field.le_prev != (elm)) \ + panic("Bad link elm %p prev->next != elm", (elm)); \ +} while(0) + +#define TCP_SET_TIMER_MODE(mode, i) do { \ + if (IS_TIMER_HZ_10MS(i)) \ + (mode) |= TCP_TIMERLIST_10MS_MODE; \ + else if (IS_TIMER_HZ_100MS(i)) \ + (mode) |= TCP_TIMERLIST_100MS_MODE; \ + else \ + (mode) |= TCP_TIMERLIST_500MS_MODE; \ +} while(0) + +#if (DEVELOPMENT || DEBUG) +SYSCTL_UINT(_net_inet_tcp, OID_AUTO, mss_rec_medium, + CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_mss_rec_medium, 0, + "Medium MSS based on recommendation in link status report"); +SYSCTL_UINT(_net_inet_tcp, OID_AUTO, mss_rec_low, + CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_mss_rec_low, 0, + "Low MSS based on recommendation in link status report"); + +static int32_t tcp_change_mss_recommended = 0; +static int +sysctl_change_mss_recommended SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + int i, err = 0, changed = 0; + struct ifnet *ifp; + struct if_link_status 
ifsr; + struct if_cellular_status_v1 *new_cell_sr; + err = sysctl_io_number(req, tcp_change_mss_recommended, + sizeof (int32_t), &i, &changed); + if (changed) { + ifnet_head_lock_shared(); + TAILQ_FOREACH(ifp, &ifnet_head, if_link) { + if (IFNET_IS_CELLULAR(ifp)) { + bzero(&ifsr, sizeof (ifsr)); + new_cell_sr = &ifsr.ifsr_u.ifsr_cell.if_cell_u.if_status_v1; + ifsr.ifsr_version = IF_CELLULAR_STATUS_REPORT_CURRENT_VERSION; + ifsr.ifsr_len = sizeof(*new_cell_sr); + + /* Set MSS recommended */ + new_cell_sr->valid_bitmask |= IF_CELL_UL_MSS_RECOMMENDED_VALID; + new_cell_sr->mss_recommended = i; + err = ifnet_link_status_report(ifp, new_cell_sr, sizeof (new_cell_sr)); + if (err == 0) { + tcp_change_mss_recommended = i; + } else { + break; + } + } + } + ifnet_head_done(); + } + return (err); +} + +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, change_mss_recommended, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_change_mss_recommended, + 0, sysctl_change_mss_recommended, "IU", "Change MSS recommended"); + +SYSCTL_INT(_net_inet_tcp, OID_AUTO, report_stats_interval, + CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_report_stats_interval, 0, + "Report stats interval"); +#endif /* (DEVELOPMENT || DEBUG) */ + +/* + * Macro to compare two timers. If there is a reset of the sign bit, + * it is safe to assume that the timer has wrapped around. By doing + * signed comparision, we take care of wrap around such that the value + * with the sign bit reset is actually ahead of the other. + */ +inline int32_t +timer_diff(uint32_t t1, uint32_t toff1, uint32_t t2, uint32_t toff2) { + return (int32_t)((t1 + toff1) - (t2 + toff2)); +}; + /* * Add to tcp timewait list, delay is given in milliseconds. 
*/ @@ -372,7 +428,7 @@ add_to_time_wait_locked(struct tcpcb *tp, uint32_t delay) if (!(inp->inp_flags2 & INP2_TIMEWAIT)) { pcbinfo->ipi_twcount++; inp->inp_flags2 |= INP2_TIMEWAIT; - + /* Remove from global inp list */ LIST_REMOVE(inp, inp_list); } else { @@ -381,7 +437,7 @@ add_to_time_wait_locked(struct tcpcb *tp, uint32_t delay) /* Compute the time at which this socket can be closed */ timer = tcp_now + delay; - + /* We will use the TCPT_2MSL timer for tracking this delay */ if (TIMER_IS_ON_LIST(tp)) @@ -494,13 +550,13 @@ tcp_garbage_collect(struct inpcb *inp, int istimewait) } /* - * We get here because the PCB is no longer searchable - * (WNT_STOPUSING); detach (if needed) and dispose if it is dead - * (usecount is 0). This covers all cases, including overflow - * sockets and those that are considered as "embryonic", - * i.e. created by sonewconn() in TCP input path, and have + * We get here because the PCB is no longer searchable + * (WNT_STOPUSING); detach (if needed) and dispose if it is dead + * (usecount is 0). This covers all cases, including overflow + * sockets and those that are considered as "embryonic", + * i.e. created by sonewconn() in TCP input path, and have * not yet been committed. For the former, we reduce the usecount - * to 0 as done by the code above. For the latter, the usecount + * to 0 as done by the code above. For the latter, the usecount * would have reduced to 0 as part calling soabort() when the * socket is dropped at the end of tcp_input(). 
*/ @@ -511,7 +567,7 @@ tcp_garbage_collect(struct inpcb *inp, int istimewait) lck_mtx_convert_spin(&inp->inpcb_mtx); /* - * If this tp still happens to be on the timer list, + * If this tp still happens to be on the timer list, * take it out */ if (TIMER_IS_ON_LIST(tp)) { @@ -588,16 +644,16 @@ tcp_gc(struct inpcbinfo *ipi) /* Now cleanup the time wait ones */ TAILQ_FOREACH_SAFE(tw_tp, &tcp_tw_tailq, t_twentry, tw_ntp) { /* - * We check the timestamp here without holding the + * We check the timestamp here without holding the * socket lock for better performance. If there are * any pcbs in time-wait, the timer will get rescheduled. * Hence some error in this check can be tolerated. * * Sometimes a socket on time-wait queue can be closed if * 2MSL timer expired but the application still has a - * usecount on it. + * usecount on it. */ - if (tw_tp->t_state == TCPS_CLOSED || + if (tw_tp->t_state == TCPS_CLOSED || TSTMP_GEQ(tcp_now, tw_tp->t_timer[TCPT_2MSL])) { if (tcp_garbage_collect(tw_tp->t_inpcb, 1)) atomic_add_32(&ipi->ipi_gc_req.intimer_lazy, 1); @@ -623,10 +679,9 @@ tcp_gc(struct inpcbinfo *ipi) * Cancel all timers for TCP tp. */ void -tcp_canceltimers(tp) - struct tcpcb *tp; +tcp_canceltimers(struct tcpcb *tp) { - register int i; + int i; tcp_remove_timer(tp); for (i = 0; i < TCPT_NTIMERS; i++) @@ -643,12 +698,13 @@ int tcp_backoff[TCP_MAXRXTSHIFT + 1] = static int tcp_totbackoff = 511; /* sum of tcp_backoff[] */ -void tcp_rexmt_save_state(struct tcpcb *tp) +void +tcp_rexmt_save_state(struct tcpcb *tp) { u_int32_t fsize; if (TSTMP_SUPPORTED(tp)) { /* - * Since timestamps are supported on the connection, + * Since timestamps are supported on the connection, * we can do recovery as described in rfc 4015. */ fsize = tp->snd_max - tp->snd_una; @@ -659,9 +715,9 @@ void tcp_rexmt_save_state(struct tcpcb *tp) * Timestamp option is not supported on this connection. * Record ssthresh and cwnd so they can * be recovered if this turns out to be a "bad" retransmit. 
- * A retransmit is considered "bad" if an ACK for this + * A retransmit is considered "bad" if an ACK for this * segment is received within RTT/2 interval; the assumption - * here is that the ACK was already in flight. See + * here is that the ACK was already in flight. See * "On Estimating End-to-End Network Path Properties" by * Allman and Paxson for more details. */ @@ -682,18 +738,19 @@ void tcp_rexmt_save_state(struct tcpcb *tp) * Revert to the older segment size if there is an indication that PMTU * blackhole detection was not needed. */ -void tcp_pmtud_revert_segment_size(struct tcpcb *tp) +void +tcp_pmtud_revert_segment_size(struct tcpcb *tp) { int32_t optlen; VERIFY(tp->t_pmtud_saved_maxopd > 0); - tp->t_flags |= TF_PMTUD; - tp->t_flags &= ~TF_BLACKHOLE; + tp->t_flags |= TF_PMTUD; + tp->t_flags &= ~TF_BLACKHOLE; optlen = tp->t_maxopd - tp->t_maxseg; tp->t_maxopd = tp->t_pmtud_saved_maxopd; tp->t_maxseg = tp->t_maxopd - optlen; /* - * Reset the slow-start flight size as it + * Reset the slow-start flight size as it * may depend on the new MSS */ if (CC_ALGO(tp)->cwnd_init != NULL) @@ -706,9 +763,7 @@ void tcp_pmtud_revert_segment_size(struct tcpcb *tp) * TCP timer processing. */ struct tcpcb * -tcp_timers(tp, timer) - register struct tcpcb *tp; - int timer; +tcp_timers(struct tcpcb *tp, int timer) { int32_t rexmt, optlen = 0, idle_time = 0; struct socket *so; @@ -741,7 +796,7 @@ tcp_timers(tp, timer) if (tp->t_state != TCPS_TIME_WAIT && tp->t_state != TCPS_FIN_WAIT_2 && ((idle_time > 0) && (idle_time < TCP_CONN_MAXIDLE(tp)))) { - tp->t_timer[TCPT_2MSL] = OFFSET_FROM_START(tp, + tp->t_timer[TCPT_2MSL] = OFFSET_FROM_START(tp, (u_int32_t)TCP_CONN_KEEPINTVL(tp)); } else { tp = tcp_close(tp); @@ -755,7 +810,9 @@ tcp_timers(tp, timer) * to a longer retransmit interval and retransmit one segment. 
*/ case TCPT_REXMT: - accsleep_ms = mach_absolutetime_asleep / 1000000UL; + absolutetime_to_nanoseconds(mach_absolutetime_asleep, + &accsleep_ms); + accsleep_ms = accsleep_ms / 1000000UL; if (accsleep_ms > tp->t_accsleep_ms) last_sleep_ms = accsleep_ms - tp->t_accsleep_ms; /* @@ -792,9 +849,14 @@ tcp_timers(tp, timer) } } tp->t_rxtshift = TCP_MAXRXTSHIFT; - postevent(so, 0, EV_TIMEOUT); - soevent(so, + postevent(so, 0, EV_TIMEOUT); + soevent(so, (SO_FILT_HINT_LOCKED|SO_FILT_HINT_TIMEOUT)); + + if (TCP_ECN_ENABLED(tp) && + tp->t_state == TCPS_ESTABLISHED) + tcp_heuristic_ecn_droprxmt(tp); + tp = tcp_drop(tp, tp->t_softerror ? tp->t_softerror : ETIMEDOUT); @@ -804,16 +866,16 @@ tcp_timers(tp, timer) tcpstat.tcps_rexmttimeo++; tp->t_accsleep_ms = accsleep_ms; - if (tp->t_rxtshift == 1 && + if (tp->t_rxtshift == 1 && tp->t_state == TCPS_ESTABLISHED) { /* Set the time at which retransmission started. */ tp->t_rxtstart = tcp_now; - /* + /* * if this is the first retransmit timeout, save * the state so that we can recover if the timeout * is spurious. - */ + */ tcp_rexmt_save_state(tp); } #if MPTCP @@ -876,6 +938,9 @@ tcp_timers(tp, timer) so->so_error = ENODATA; sorwakeup(so); sowwakeup(so); + + tp->t_tfo_stats |= TFO_S_SEND_BLACKHOLE; + tcpstat.tcps_tfo_sndblackhole++; } if (tp->t_state == TCPS_SYN_SENT) { @@ -886,9 +951,6 @@ tcp_timers(tp, timer) if (tfo_enabled(tp)) { tp->t_flagsext &= ~TF_FASTOPEN; tp->t_tfo_flags |= TFO_F_SYN_LOSS; - - tp->t_tfo_stats |= TFO_S_SYN_LOSS; - tcpstat.tcps_tfo_syn_loss++; } } else { rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; @@ -912,7 +974,7 @@ tcp_timers(tp, timer) ((tp->t_flags & TF_MAXSEGSNT) || tp->t_pmtud_lastseg_size > tcp_pmtud_black_hole_mss) && tp->t_rxtshift == 2) { - /* + /* * Enter Path MTU Black-hole Detection mechanism: * - Disable Path MTU Discovery (IP "DF" bit). 
* - Reduce MTU to lower value than what we @@ -941,11 +1003,12 @@ tcp_timers(tp, timer) tp->t_maxseg = tp->t_maxopd - optlen; /* - * Reset the slow-start flight size + * Reset the slow-start flight size * as it may depend on the new MSS */ if (CC_ALGO(tp)->cwnd_init != NULL) CC_ALGO(tp)->cwnd_init(tp); + tp->snd_cwnd = tp->t_maxseg; } /* * If further retransmissions are still @@ -954,10 +1017,11 @@ tcp_timers(tp, timer) * MSS and blackhole detection flags. */ else { - + if ((tp->t_flags & TF_BLACKHOLE) && (tp->t_rxtshift > 4)) { tcp_pmtud_revert_segment_size(tp); + tp->snd_cwnd = tp->t_maxseg; } } } @@ -1018,7 +1082,7 @@ tcp_timers(tp, timer) * been retransmitted by way of the retransmission timer at * least once, the value of ssthresh is held constant */ - if (tp->t_rxtshift == 1 && + if (tp->t_rxtshift == 1 && CC_ALGO(tp)->after_timeout != NULL) { CC_ALGO(tp)->after_timeout(tp); /* @@ -1054,15 +1118,15 @@ tcp_timers(tp, timer) * backoff, drop the connection if the idle time * (no responses to probes) reaches the maximum * backoff that we would use if retransmitting. - * - * Drop the connection if we reached the maximum allowed time for - * Zero Window Probes without a non-zero update from the peer. + * + * Drop the connection if we reached the maximum allowed time for + * Zero Window Probes without a non-zero update from the peer. 
* See rdar://5805356 */ if ((tp->t_rxtshift == TCP_MAXRXTSHIFT && (idle_time >= tcp_maxpersistidle || - idle_time >= TCP_REXMTVAL(tp) * tcp_totbackoff)) || - ((tp->t_persist_stop != 0) && + idle_time >= TCP_REXMTVAL(tp) * tcp_totbackoff)) || + ((tp->t_persist_stop != 0) && TSTMP_LEQ(tp->t_persist_stop, tcp_now))) { tcpstat.tcps_persistdrop++; postevent(so, 0, EV_TIMEOUT); @@ -1128,6 +1192,7 @@ tcp_timers(tp, timer) tra.nocell = INP_NO_CELLULAR(inp); tra.noexpensive = INP_NO_EXPENSIVE(inp); tra.awdl_unrestricted = INP_AWDL_UNRESTRICTED(inp); + tra.intcoproc_allowed = INP_INTCOPROC_ALLOWED(inp); if (tp->t_inpcb->inp_flags & INP_BOUND_IF) tra.ifscope = tp->t_inpcb->inp_boundifp->if_index; else @@ -1148,9 +1213,9 @@ tcp_timers(tp, timer) if (tp->t_flagsext & TF_DETECT_READSTALL) { struct ifnet *outifp = tp->t_inpcb->inp_last_outifp; bool reenable_probe = false; - /* + /* * The keep alive packets sent to detect a read - * stall did not get a response from the + * stall did not get a response from the * peer. Generate more keep-alives to confirm this. * If the number of probes sent reaches the limit, * generate an event. @@ -1201,6 +1266,7 @@ tcp_timers(tp, timer) so->so_error = ENODATA; sorwakeup(so); + tp->t_tfo_stats |= TFO_S_RECV_BLACKHOLE; tcpstat.tcps_tfo_blackhole++; } break; @@ -1213,7 +1279,7 @@ tcp_timers(tp, timer) /* * If delayed ack timer fired while stretching * acks, count the number of times the streaming - * detection was not correct. If this exceeds a + * detection was not correct. 
If this exceeds a * threshold, disable strech ack on this * connection * @@ -1234,6 +1300,7 @@ tcp_timers(tp, timer) tp->rcv_nostrack_ts = tcp_now; tcpstat.tcps_nostretchack++; tp->t_stretchack_delayed = 0; + tp->rcv_nostrack_pkts = 0; } tcp_reset_stretch_ack(tp); } @@ -1259,7 +1326,7 @@ tcp_timers(tp, timer) if (++tp->t_mprxtshift > TCP_MAXRXTSHIFT) { tcpstat.tcps_timeoutdrop++; postevent(so, 0, EV_TIMEOUT); - soevent(so, + soevent(so, (SO_FILT_HINT_LOCKED| SO_FILT_HINT_TIMEOUT)); tp = tcp_drop(tp, tp->t_softerror ? @@ -1270,7 +1337,7 @@ tcp_timers(tp, timer) tp->t_flags |= TF_ACKNOW; /* - * No backoff is implemented for simplicity for this + * No backoff is implemented for simplicity for this * corner case. */ (void) tcp_output(tp); @@ -1385,13 +1452,13 @@ tcp_remove_timer(struct tcpcb *tp) return; } lck_mtx_lock(listp->mtx); - + /* Check if pcb is on timer list again after acquiring the lock */ if (!(TIMER_IS_ON_LIST(tp))) { lck_mtx_unlock(listp->mtx); return; } - + if (listp->next_te != NULL && listp->next_te == &tp->tentry) listp->next_te = LIST_NEXT(&tp->tentry, le); @@ -1448,7 +1515,7 @@ need_to_resched_timerlist(u_int32_t runtime, u_int16_t mode) } void -tcp_sched_timerlist(uint32_t offset) +tcp_sched_timerlist(uint32_t offset) { uint64_t deadline = 0; struct tcptimerlist *listp = &tcp_timer_list; @@ -1471,7 +1538,7 @@ tcp_sched_timerlist(uint32_t offset) /* * Function to run the timers for a connection. * - * Returns the offset of next timer to be run for this connection which + * Returns the offset of next timer to be run for this connection which * can be used to reschedule the timerlist. 
* * te_mode is an out parameter that indicates the modes of active @@ -1495,14 +1562,14 @@ tcp_run_conn_timer(struct tcpcb *tp, u_int16_t *te_mode, tcp_lock(tp->t_inpcb->inp_socket, 1, 0); so = tp->t_inpcb->inp_socket; - /* Release the want count on inp */ + /* Release the want count on inp */ if (in_pcb_checkstate(tp->t_inpcb, WNT_RELEASE, 1) == WNT_STOPUSING) { if (TIMER_IS_ON_LIST(tp)) { tcp_remove_timer(tp); } - /* Looks like the TCP connection got closed while we + /* Looks like the TCP connection got closed while we * were waiting for the lock.. Done */ goto done; @@ -1517,7 +1584,7 @@ tcp_run_conn_timer(struct tcpcb *tp, u_int16_t *te_mode, tp->t_flagsext |= TF_PROBING; tcp_timers(tp, TCPT_PTO); tp->t_timer[TCPT_PTO] = 0; - tp->t_flagsext &= TF_PROBING; + tp->t_flagsext &= ~TF_PROBING; } /* @@ -1527,7 +1594,7 @@ tcp_run_conn_timer(struct tcpcb *tp, u_int16_t *te_mode, */ if ((index = tp->tentry.index) == TCPT_NONE) goto done; - + timer_val = tp->t_timer[index]; diff = timer_diff(tp->tentry.runtime, 0, tcp_now, 0); @@ -1545,7 +1612,7 @@ tcp_run_conn_timer(struct tcpcb *tp, u_int16_t *te_mode, if (tp == NULL) goto done; } - + /* * Check if there are any other timers that need to be run. * While doing it, adjust the timer values wrt tcp_now. 
@@ -1569,7 +1636,7 @@ tcp_run_conn_timer(struct tcpcb *tp, u_int16_t *te_mode, } } } - + tp->tentry.timer_start = tcp_now; tp->tentry.index = lo_index; VERIFY(tp->tentry.index == TCPT_NONE || tp->tentry.mode > 0); @@ -1613,7 +1680,8 @@ tcp_run_conn_timer(struct tcpcb *tp, u_int16_t *te_mode, } void -tcp_run_timerlist(void * arg1, void * arg2) { +tcp_run_timerlist(void * arg1, void * arg2) +{ #pragma unused(arg1, arg2) struct tcptimerentry *te, *next_te; struct tcptimerlist *listp = &tcp_timer_list; @@ -1628,7 +1696,7 @@ tcp_run_timerlist(void * arg1, void * arg2) { lck_mtx_lock(listp->mtx); listp->running = TRUE; - + LIST_FOREACH_SAFE(te, &listp->lhead, le, next_te) { uint32_t offset = 0; uint32_t runtime = te->runtime; @@ -1653,7 +1721,7 @@ tcp_run_timerlist(void * arg1, void * arg2) { * Some how this pcb went into dead state while * on the timer list, just take it off the list. * Since the timer list entry pointers are - * protected by the timer list lock, we can + * protected by the timer list lock, we can * do it here without the socket lock. */ if (TIMER_IS_ON_LIST(tp)) { @@ -1674,7 +1742,7 @@ tcp_run_timerlist(void * arg1, void * arg2) { * release the lock, this pointer will be updated to the * element after that. */ - listp->next_te = next_te; + listp->next_te = next_te; VERIFY_NEXT_LINK(&tp->tentry, le); VERIFY_PREV_LINK(&tp->tentry, le); @@ -1683,7 +1751,7 @@ tcp_run_timerlist(void * arg1, void * arg2) { offset = tcp_run_conn_timer(tp, &te_mode, listp->probe_if_index); - + lck_mtx_lock(listp->mtx); next_te = listp->next_te; @@ -1757,8 +1825,8 @@ tcp_run_timerlist(void * arg1, void * arg2) { * Function to check if the timerlist needs to be rescheduled to run this * connection's timers correctly. 
*/ -void -tcp_sched_timers(struct tcpcb *tp) +void +tcp_sched_timers(struct tcpcb *tp) { struct tcptimerentry *te = &tp->tentry; u_int16_t index = te->index; @@ -1816,7 +1884,7 @@ tcp_sched_timers(struct tcpcb *tp) */ if (need_to_resched_timerlist(te->runtime, mode)) { tcp_resched_timerlist++; - + if (!list_locked) { lck_mtx_lock(listp->mtx); list_locked = TRUE; @@ -1875,7 +1943,7 @@ tcp_sched_timers(struct tcpcb *tp) return; } - + static inline void tcp_set_lotimer_index(struct tcpcb *tp) { @@ -1895,7 +1963,7 @@ tcp_set_lotimer_index(struct tcpcb *tp) VERIFY(tp->tentry.index == TCPT_NONE || tp->tentry.mode > 0); if (tp->tentry.index != TCPT_NONE) { - tp->tentry.runtime = tp->tentry.timer_start + tp->tentry.runtime = tp->tentry.timer_start + tp->t_timer[tp->tentry.index]; if (tp->tentry.runtime == 0) tp->tentry.runtime++; @@ -1937,7 +2005,7 @@ tcp_report_stats(void) struct sockaddr_in6 dst6; struct rtentry *rt = NULL; static struct tcp_last_report_stats prev; - u_int64_t var, uptime; + u_int64_t var, uptime; #define stat data.u.tcp_stats if (((uptime = net_uptime()) - tcp_last_report_time) < @@ -2002,13 +2070,13 @@ tcp_report_stats(void) } /* RTO after tail loss, shift by 10 for precision */ - if (tcpstat.tcps_sndrexmitpack > 0 + if (tcpstat.tcps_sndrexmitpack > 0 && tcpstat.tcps_tailloss_rto > 0) { var = tcpstat.tcps_tailloss_rto << 10; stat.send_tlrto_rate = (var * 100) / tcpstat.tcps_sndrexmitpack; } - + /* packet reordering */ if (tcpstat.tcps_sndpack > 0 && tcpstat.tcps_reordered_pkts > 0) { var = tcpstat.tcps_reordered_pkts << 10; @@ -2084,6 +2152,17 @@ tcp_report_stats(void) &prev.tcps_tfo_syn_loss, &stat.tfo_syn_loss); tcp_cumulative_stat(tcpstat.tcps_tfo_blackhole, &prev.tcps_tfo_blackhole, &stat.tfo_blackhole); + tcp_cumulative_stat(tcpstat.tcps_tfo_cookie_wrong, + &prev.tcps_tfo_cookie_wrong, &stat.tfo_cookie_wrong); + tcp_cumulative_stat(tcpstat.tcps_tfo_no_cookie_rcv, + &prev.tcps_tfo_no_cookie_rcv, &stat.tfo_no_cookie_rcv); + 
tcp_cumulative_stat(tcpstat.tcps_tfo_heuristics_disable, + &prev.tcps_tfo_heuristics_disable, &stat.tfo_heuristics_disable); + tcp_cumulative_stat(tcpstat.tcps_tfo_sndblackhole, + &prev.tcps_tfo_sndblackhole, &stat.tfo_sndblackhole); + + + nstat_sysinfo_send_data(&data); @@ -2259,6 +2338,76 @@ tcp_probe_connectivity(struct ifnet *ifp, u_int32_t enable) return; } +inline void +tcp_update_mss_core(struct tcpcb *tp, struct ifnet *ifp) +{ + struct if_cellular_status_v1 *ifsr; + u_int32_t optlen; + ifsr = &ifp->if_link_status->ifsr_u.ifsr_cell.if_cell_u.if_status_v1; + if (ifsr->valid_bitmask & IF_CELL_UL_MSS_RECOMMENDED_VALID) { + optlen = tp->t_maxopd - tp->t_maxseg; + + if (ifsr->mss_recommended == + IF_CELL_UL_MSS_RECOMMENDED_NONE && + tp->t_cached_maxopd > 0 && + tp->t_maxopd < tp->t_cached_maxopd) { + tp->t_maxopd = tp->t_cached_maxopd; + tcpstat.tcps_mss_to_default++; + } else if (ifsr->mss_recommended == + IF_CELL_UL_MSS_RECOMMENDED_MEDIUM && + tp->t_maxopd > tcp_mss_rec_medium) { + tp->t_cached_maxopd = tp->t_maxopd; + tp->t_maxopd = tcp_mss_rec_medium; + tcpstat.tcps_mss_to_medium++; + } else if (ifsr->mss_recommended == + IF_CELL_UL_MSS_RECOMMENDED_LOW && + tp->t_maxopd > tcp_mss_rec_low) { + tp->t_cached_maxopd = tp->t_maxopd; + tp->t_maxopd = tcp_mss_rec_low; + tcpstat.tcps_mss_to_low++; + } + tp->t_maxseg = tp->t_maxopd - optlen; + + /* + * clear the cached value if it is same as the current + */ + if (tp->t_maxopd == tp->t_cached_maxopd) + tp->t_cached_maxopd = 0; + } +} + +void +tcp_update_mss_locked(struct socket *so, struct ifnet *ifp) +{ + struct inpcb *inp = sotoinpcb(so); + struct tcpcb *tp = intotcpcb(inp); + + if (ifp == NULL && inp->inp_last_outifp == NULL) + return; + + if (ifp == NULL) + ifp = inp->inp_last_outifp; + + if (!IFNET_IS_CELLULAR(ifp)) { + /* + * This optimization is implemented for cellular + * networks only + */ + return; + } + if ( tp->t_state <= TCPS_CLOSE_WAIT) { + /* + * If the connection is currently doing or has done PMTU + 
* blackhole detection, do not change the MSS + */ + if (tp->t_flags & TF_BLACKHOLE) + return; + if (ifp->if_link_status == NULL) + return; + tcp_update_mss_core(tp, ifp); + } +} + void tcp_itimer(struct inpcbinfo *ipi) { @@ -2278,7 +2427,8 @@ tcp_itimer(struct inpcbinfo *ipi) LIST_FOREACH_SAFE(inp, &tcb, inp_list, nxt) { struct socket *so; - if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) + if (inp->inp_ppcb == NULL || + in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) continue; so = inp->inp_socket; tcp_lock(so, 1, 0); @@ -2287,9 +2437,12 @@ tcp_itimer(struct inpcbinfo *ipi) continue; } so_check_extended_bk_idle_time(so); + if (ipi->ipi_flags & INPCBINFO_UPDATE_MSS) { + tcp_update_mss_locked(so, NULL); + } tcp_unlock(so, 1, 0); } + ipi->ipi_flags &= ~INPCBINFO_UPDATE_MSS; lck_rw_done(ipi->ipi_lock); } - diff --git a/bsd/netinet/tcp_usrreq.c b/bsd/netinet/tcp_usrreq.c index c80d20e8a..67306d65d 100644 --- a/bsd/netinet/tcp_usrreq.c +++ b/bsd/netinet/tcp_usrreq.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2000-2014 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* @@ -136,13 +136,8 @@ static int tcp6_connect(struct tcpcb *, struct sockaddr *, struct proc *); static int tcp6_usr_connect(struct socket *, struct sockaddr *, struct proc *); #endif /* INET6 */ -static struct tcpcb * - tcp_disconnect(struct tcpcb *); -static struct tcpcb * - tcp_usrclosed(struct tcpcb *); - -extern uint32_t tcp_autorcvbuf_max; - +static struct tcpcb *tcp_disconnect(struct tcpcb *); +static struct tcpcb *tcp_usrclosed(struct tcpcb *); extern void tcp_sbrcv_trim(struct tcpcb *tp, struct sockbuf *sb); #if TCPDEBUG @@ -183,7 +178,7 @@ tcp_usr_attach(struct socket *so, __unused int proto, struct proc *p) error = EISCONN; goto out; } - + error = tcp_attach(so, p); if (error) goto out; @@ -1212,6 +1207,7 @@ tcp_usr_rcvoob(struct socket *so, struct mbuf *m, int flags) } m->m_len = 1; *mtod(m, caddr_t) = tp->t_iobc; + so->so_state &= ~SS_RCVATMARK; if ((flags & MSG_PEEK) == 0) tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA); COMMON_END(PRU_RCVOOB); @@ -1315,10 +1311,7 @@ struct pr_usrreqs tcp6_usrreqs = { * in_pcbladdr:EADDRNOTAVAIL Address not available */ static int -tcp_connect(tp, nam, p) - register struct tcpcb *tp; - struct sockaddr *nam; - struct proc *p; +tcp_connect(struct tcpcb *tp, struct sockaddr *nam, struct proc *p) { struct inpcb *inp = tp->t_inpcb, *oinp; struct socket *so = inp->inp_socket; @@ -1339,7 +1332,7 @@ tcp_connect(tp, nam, p) * earlier incarnation of this same connection still in * TIME_WAIT state, creating an ADDRINUSE error. 
*/ - error = in_pcbladdr(inp, nam, &laddr, IFSCOPE_NONE, &outif); + error = in_pcbladdr(inp, nam, &laddr, IFSCOPE_NONE, &outif, 0); if (error) goto done; @@ -1402,7 +1395,7 @@ tcp_connect(tp, nam, p) if (inp->inp_flowhash == 0) inp->inp_flowhash = inp_calc_flowhash(inp); - tcp_set_max_rwinscale(tp, so); + tcp_set_max_rwinscale(tp, so, TCP_AUTORCVBUF_MAX(outif)); soisconnecting(so); tcpstat.tcps_connattempt++; @@ -1422,10 +1415,7 @@ tcp_connect(tp, nam, p) #if INET6 static int -tcp6_connect(tp, nam, p) - register struct tcpcb *tp; - struct sockaddr *nam; - struct proc *p; +tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct proc *p) { struct inpcb *inp = tp->t_inpcb, *oinp; struct socket *so = inp->inp_socket; @@ -1499,7 +1489,7 @@ tcp6_connect(tp, nam, p) (htonl(inp->inp_flowhash) & IPV6_FLOWLABEL_MASK); } - tcp_set_max_rwinscale(tp, so); + tcp_set_max_rwinscale(tp, so, TCP_AUTORCVBUF_MAX(outif)); soisconnecting(so); tcpstat.tcps_connattempt++; @@ -1549,6 +1539,9 @@ tcp_fill_info(struct tcpcb *tp, struct tcp_info *ti) if (IN_FASTRECOVERY(tp) || tp->t_rxtshift > 0) ti->tcpi_flags |= TCPI_FLAG_LOSSRECOVERY; + if (tp->t_flags & TF_STREAMING_ON) + ti->tcpi_flags |= TCPI_FLAG_STREAMING_ON; + ti->tcpi_rto = tp->t_timer[TCPT_REXMT] ? 
tp->t_rxtcur : 0; ti->tcpi_snd_mss = tp->t_maxseg; ti->tcpi_rcv_mss = tp->t_maxseg; @@ -1618,6 +1611,11 @@ tcp_fill_info(struct tcpcb *tp, struct tcp_info *ti) ti->tcpi_tfo_syn_data_sent = !!(tp->t_tfo_stats & TFO_S_SYN_DATA_SENT); ti->tcpi_tfo_syn_data_acked = !!(tp->t_tfo_stats & TFO_S_SYN_DATA_ACKED); ti->tcpi_tfo_syn_loss = !!(tp->t_tfo_stats & TFO_S_SYN_LOSS); + ti->tcpi_tfo_cookie_wrong = !!(tp->t_tfo_stats & TFO_S_COOKIE_WRONG); + ti->tcpi_tfo_no_cookie_rcv = !!(tp->t_tfo_stats & TFO_S_NO_COOKIE_RCV); + ti->tcpi_tfo_heuristics_disable = !!(tp->t_tfo_stats & TFO_S_HEURISTICS_DISABLE); + ti->tcpi_tfo_send_blackhole = !!(tp->t_tfo_stats & TFO_S_SEND_BLACKHOLE); + ti->tcpi_tfo_recv_blackhole = !!(tp->t_tfo_stats & TFO_S_RECV_BLACKHOLE); ti->tcpi_ecn_client_setup = !!(tp->ecn_flags & TE_SETUPSENT); ti->tcpi_ecn_server_setup = !!(tp->ecn_flags & TE_SETUPRECEIVED); @@ -1757,6 +1755,11 @@ tcp_connection_fill_info(struct tcpcb *tp, struct tcp_connection_info *tci) tci->tcpi_tfo_syn_data_sent = !!(tp->t_tfo_stats & TFO_S_SYN_DATA_SENT); tci->tcpi_tfo_syn_data_acked = !!(tp->t_tfo_stats & TFO_S_SYN_DATA_ACKED); tci->tcpi_tfo_syn_loss = !!(tp->t_tfo_stats & TFO_S_SYN_LOSS); + tci->tcpi_tfo_cookie_wrong = !!(tp->t_tfo_stats & TFO_S_COOKIE_WRONG); + tci->tcpi_tfo_no_cookie_rcv = !!(tp->t_tfo_stats & TFO_S_NO_COOKIE_RCV); + tci->tcpi_tfo_heuristics_disable = !!(tp->t_tfo_stats & TFO_S_HEURISTICS_DISABLE); + tci->tcpi_tfo_send_blackhole = !!(tp->t_tfo_stats & TFO_S_SEND_BLACKHOLE); + tci->tcpi_tfo_recv_blackhole = !!(tp->t_tfo_stats & TFO_S_RECV_BLACKHOLE); } } @@ -1875,9 +1878,7 @@ tcp_getconninfo(struct socket *so, struct conninfo_tcp *tcp_ci) * splnet() any more. This needs more examination.) 
*/ int -tcp_ctloutput(so, sopt) - struct socket *so; - struct sockopt *sopt; +tcp_ctloutput(struct socket *so, struct sockopt *sopt) { int error, opt, optval; struct inpcb *inp; @@ -2095,6 +2096,21 @@ tcp_ctloutput(so, sopt) } break; + case TCP_KEEPALIVE_OFFLOAD: + error = sooptcopyin(sopt, &optval, sizeof(optval), + sizeof(optval)); + if (error) + break; + if (optval < 0 || optval > INT32_MAX) { + error = EINVAL; + break; + } + if (optval != 0) + inp->inp_flags2 |= INP2_KEEPALIVE_OFFLOAD; + else + inp->inp_flags2 &= ~INP2_KEEPALIVE_OFFLOAD; + break; + case PERSIST_TIMEOUT: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); @@ -2180,13 +2196,13 @@ tcp_ctloutput(so, sopt) } /* - * allocate memory for storing message + * allocate memory for storing message * related state */ VERIFY(so->so_msg_state == NULL); - MALLOC(so->so_msg_state, + MALLOC(so->so_msg_state, struct msg_state *, - sizeof(struct msg_state), + sizeof(struct msg_state), M_TEMP, M_WAITOK | M_ZERO); if (so->so_msg_state == NULL) { error = ENOMEM; @@ -2196,9 +2212,9 @@ tcp_ctloutput(so, sopt) /* Enable message delivery */ so->so_flags |= SOF_ENABLE_MSGS; } else { - /* - * Can't disable message delivery on socket - * because of restrictions imposed by + /* + * Can't disable message delivery on socket + * because of restrictions imposed by * encoding/decoding */ error = EINVAL; @@ -2286,6 +2302,30 @@ tcp_ctloutput(so, sopt) error = EINVAL; } break; + case TCP_NOTIFY_ACKNOWLEDGEMENT: + error = sooptcopyin(sopt, &optval, + sizeof(optval), sizeof(optval)); + if (error) + break; + if (optval <= 0) { + error = EINVAL; + break; + } + if (tp->t_notify_ack_count >= TCP_MAX_NOTIFY_ACK) { + error = ETOOMANYREFS; + break; + } + + /* + * validate that the given marker id is not + * a duplicate to avoid ambiguity + */ + if ((error = tcp_notify_ack_id_valid(tp, so, + optval)) != 0) { + break; + } + error = tcp_add_notify_ack_marker(tp, optval); + break; case SO_FLUSH: if ((error = sooptcopyin(sopt, 
&optval, sizeof (optval), sizeof (optval))) != 0) @@ -2323,13 +2363,25 @@ tcp_ctloutput(so, sopt) optval = tp->t_maxseg; break; case TCP_KEEPALIVE: - optval = tp->t_keepidle / TCP_RETRANSHZ; + if (tp->t_keepidle > 0) + optval = tp->t_keepidle / TCP_RETRANSHZ; + else + optval = tcp_keepidle / TCP_RETRANSHZ; break; case TCP_KEEPINTVL: - optval = tp->t_keepintvl / TCP_RETRANSHZ; + if (tp->t_keepintvl > 0) + optval = tp->t_keepintvl / TCP_RETRANSHZ; + else + optval = tcp_keepintvl / TCP_RETRANSHZ; break; case TCP_KEEPCNT: - optval = tp->t_keepcnt; + if (tp->t_keepcnt > 0) + optval = tp->t_keepcnt; + else + optval = tcp_keepcnt; + break; + case TCP_KEEPALIVE_OFFLOAD: + optval = !!(inp->inp_flags2 & INP2_KEEPALIVE_OFFLOAD); break; case TCP_NOOPT: optval = tp->t_flags & TF_NOOPT; @@ -2442,9 +2494,24 @@ tcp_ctloutput(so, sopt) optval = tp->t_adaptive_wtimo; break; case SO_TRAFFIC_MGT_BACKGROUND: - optval = (so->so_traffic_mgt_flags & - TRAFFIC_MGT_SO_BACKGROUND) ? 1 : 0; + optval = (so->so_flags1 & + SOF1_TRAFFIC_MGT_SO_BACKGROUND) ? 
1 : 0; break; + case TCP_NOTIFY_ACKNOWLEDGEMENT: { + struct tcp_notify_ack_complete retid; + + if (sopt->sopt_valsize != sizeof (retid)) { + error = EINVAL; + break; + } + bzero(&retid, sizeof (retid)); + tcp_get_notify_ack_count(tp, &retid); + if (retid.notify_complete_count > 0) + tcp_get_notify_ack_ids(tp, &retid); + + error = sooptcopyout(sopt, &retid, sizeof (retid)); + goto done; + } default: error = ENOPROTOOPT; break; @@ -2504,15 +2571,6 @@ SYSCTL_PROC(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLTYPE_INT | CTLFLAG_RW SYSCTL_PROC(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_recvspace , 0, &sysctl_tcp_sospace, "IU", "Maximum incoming TCP datagram size"); -/* Sysctl for testing and tuning the connectx with data api */ -#define TCP_PRECONNECT_SBSZ_MAX 1460 -#define TCP_PRECONNECT_SBSZ_MIN (TCP_MSS) -#define TCP_PRECONNECT_SBSZ_DEF (TCP6_MSS) -static int tcp_preconnect_sbspace = TCP_PRECONNECT_SBSZ_DEF; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, preconn_sbsz, CTLFLAG_RW | CTLFLAG_LOCKED, - &tcp_preconnect_sbspace, 0, "Maximum preconnect space"); - - /* * Attach TCP protocol to socket, allocating * internet protocol control block, tcp control block, @@ -2525,11 +2583,9 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, preconn_sbsz, CTLFLAG_RW | CTLFLAG_LOCKED, * soreserve:ENOBUFS */ static int -tcp_attach(so, p) - struct socket *so; - struct proc *p; +tcp_attach(struct socket *so, struct proc *p) { - register struct tcpcb *tp; + struct tcpcb *tp; struct inpcb *inp; int error; #if INET6 @@ -2549,8 +2605,7 @@ tcp_attach(so, p) } if (so->so_snd.sb_preconn_hiwat == 0) { - soreserve_preconnect(so, imin(TCP_PRECONNECT_SBSZ_MAX, - imax(tcp_preconnect_sbspace, TCP_PRECONNECT_SBSZ_MIN))); + soreserve_preconnect(so, 2048); } if ((so->so_rcv.sb_flags & SB_USRSIZE) == 0) @@ -2595,8 +2650,7 @@ tcp_attach(so, p) * send segment to peer (with FIN). 
*/ static struct tcpcb * -tcp_disconnect(tp) - register struct tcpcb *tp; +tcp_disconnect(struct tcpcb *tp) { struct socket *so = tp->t_inpcb->inp_socket; @@ -2631,10 +2685,8 @@ tcp_disconnect(tp) * We can let the user exit from the close as soon as the FIN is acked. */ static struct tcpcb * -tcp_usrclosed(tp) - register struct tcpcb *tp; +tcp_usrclosed(struct tcpcb *tp) { - switch (tp->t_state) { case TCPS_CLOSED: diff --git a/bsd/netinet/tcp_var.h b/bsd/netinet/tcp_var.h index 7cce3784f..afbcb41d5 100644 --- a/bsd/netinet/tcp_var.h +++ b/bsd/netinet/tcp_var.h @@ -1,8 +1,8 @@ /* - * Copyright (c) 2000-2015 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. 
- * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* @@ -75,7 +75,7 @@ #define _TCPCB_LIST_HEAD(name, type) \ struct name { \ u_int32_t lh_first; \ -}; +} #else #define _TCPCB_PTR(x) x #define _TCPCB_LIST_HEAD(name, type) LIST_HEAD(name, type) @@ -83,7 +83,7 @@ struct name { \ #ifdef KERNEL_PRIVATE -#define TCP_RETRANSHZ 1000 /* granularity of TCP timestamps, 1ms */ +#define TCP_RETRANSHZ 1000 /* granularity of TCP timestamps, 1ms */ /* Minimum time quantum within which the timers are coalesced */ #define TCP_TIMER_10MS_QUANTUM (TCP_RETRANSHZ/100) /* every 10ms */ #define TCP_TIMER_100MS_QUANTUM (TCP_RETRANSHZ/10) /* every 100ms */ @@ -93,12 +93,12 @@ struct name { \ #define N_TIME_WAIT_SLOTS 128 /* must be power of 2 */ -/* Always allow at least 4 packets worth of recv window when adjusting +/* Always allow at least 16 packets worth of recv window when adjusting * recv window using inter-packet arrival jitter. */ -#define MIN_IAJ_WIN 4 +#define MIN_IAJ_WIN 16 -/* A variation in delay of this many milliseconds is tolerable. This limit has to +/* A variation in delay of this many milliseconds is tolerable. This limit has to * be low but greater than zero. We also use standard deviation on jitter to adjust * this limit for different link and connection types. */ @@ -114,12 +114,12 @@ struct name { \ */ #define ACC_IAJ_HIGH_THRESH 100 -/* When accumulated IAJ reaches this value, the receiver starts to react by +/* When accumulated IAJ reaches this value, the receiver starts to react by * closing the window */ #define ACC_IAJ_REACT_LIMIT 200 -/* If the number of small packets (smaller than IAJ packet size) seen on a +/* If the number of small packets (smaller than IAJ packet size) seen on a * connection is more than this threshold, reset the size and learn it again. * This is needed because the sender might send smaller segments after PMTU * discovery and the receiver has to learn the new size. 
@@ -128,12 +128,12 @@ struct name { \ /* * Adaptive timeout is a read/write timeout specified by the application to - * get a socket event when the transport layer detects a stall in data - * transfer. The value specified is the number of probes that can be sent + * get a socket event when the transport layer detects a stall in data + * transfer. The value specified is the number of probes that can be sent * to the peer before generating an event. Since it is not specified as * a time value, the timeout will adjust based on the RTT seen on the link. - * The timeout will start only when there is an indication that the read/write - * operation is not making progress. + * The timeout will start only when there is an indication that the read/write + * operation is not making progress. * * If a write operation stalls, the probe will be retransmission of data. * If a read operation stalls, the probe will be a keep-alive packet. @@ -186,6 +186,12 @@ struct tcp_rxt_seg { SLIST_ENTRY(tcp_rxt_seg) rx_link; }; +struct tcp_notify_ack_marker { + tcp_seq notify_snd_una; /* Notify when snd_una crosses this seq */ + tcp_notify_ack_id_t notify_id; + SLIST_ENTRY(tcp_notify_ack_marker) notify_next; +}; + struct tcptemp { u_char tt_ipgen[40]; /* the size must be of max ip header, now IPv6 */ struct tcphdr tt_t; @@ -274,6 +280,7 @@ struct tcpcb { #define TF_WASFRECOVERY 0x400000 /* was in NewReno Fast Recovery */ #define TF_SIGNATURE 0x800000 /* require MD5 digests (RFC2385) */ #define TF_MAXSEGSNT 0x1000000 /* last segment sent was a full segment */ +#define TF_STREAMING_ON 0x2000000 /* Receiver detected streaming */ #define TF_PMTUD 0x4000000 /* Perform Path MTU Discovery for this connection */ #define TF_CLOSING 0x8000000 /* pending tcp close */ #define TF_TSO 0x10000000 /* TCP Segment Offloading is enable on this connection */ @@ -306,8 +313,7 @@ struct tcpcb { */ tcp_seq snd_recover; /* for use in NewReno Fast Recovery */ - u_int t_maxopd; /* mss plus options */ - + u_int32_t 
t_maxopd; /* mss plus options */ u_int32_t t_rcvtime; /* time at which a packet was received */ u_int32_t t_starttime; /* time connection was established */ int t_rtttime; /* tcp clock when rtt calculation was started */ @@ -350,7 +356,7 @@ struct tcpcb { /* State for limiting early retransmits when SACK is not enabled */ u_int16_t t_early_rexmt_count; /* count of early rexmts */ - u_int32_t t_early_rexmt_win; /* window for limiting early rexmts */ + u_int32_t t_early_rexmt_win; /* window for limiting early rexmts */ u_int32_t ts_recent; /* timestamp echo data */ @@ -371,6 +377,7 @@ struct tcpcb { u_int32_t rcv_unackwin; /* to measure win for stretching acks */ u_int32_t rcv_by_unackwin; /* bytes seen during the last ack-stretching win */ u_int32_t rcv_nostrack_ts; /* timestamp when stretch ack was disabled automatically */ + u_int32_t rcv_nostrack_pkts; /* pkts received since strech ack was disabled */ u_int16_t rcv_waitforss; /* wait for packets during slow-start */ /* ECN stats */ @@ -394,7 +401,6 @@ struct tcpcb { u_int32_t t_ecn_recv_ce; /* Received CE from the network */ u_int32_t t_ecn_recv_cwr; /* Packets received with CWR */ - u_int8_t t_ecn_recv_ce_pkt; /* Received packet with CE-bit set (independent from last_ack_sent) */ /* state for bad retransmit recovery */ u_int32_t snd_cwnd_prev; /* cwnd prior to retransmit */ @@ -403,7 +409,7 @@ struct tcpcb { int t_srtt_prev; /* srtt prior to retransmit */ int t_rttvar_prev; /* rttvar prior to retransmit */ u_int32_t t_badrexmt_time; /* bad rexmt detection time */ - + /* Packet reordering metric */ u_int16_t t_reorderwin; /* Reordering late time offset */ @@ -417,13 +423,13 @@ struct tcpcb { int rcv_numsacks; /* # distinct sack blks present */ struct sackblk sackblks[MAX_SACK_BLKS]; /* seq nos. 
of sack blocks */ struct sackhint sackhint; /* SACK scoreboard hint */ - + struct mbuf *t_pktlist_head; /* First packet in transmit chain */ struct mbuf *t_pktlist_tail; /* Last packet in transmit chain */ u_int32_t t_pktlist_sentlen; /* total bytes in transmit chain */ u_int32_t t_keepidle; /* keepalive idle timer (override global if > 0) */ - u_int32_t t_keepinit; /* connection timeout, i.e. idle time + u_int32_t t_keepinit; /* connection timeout, i.e. idle time in SYN_SENT or SYN_RECV state */ u_int32_t t_keepintvl; /* interval between keepalives */ u_int32_t t_keepcnt; /* number of keepalives before close */ @@ -432,7 +438,7 @@ struct tcpcb { u_int16_t t_pmtud_lastseg_size; /* size of the last sent segment */ u_int16_t t_pmtud_saved_maxopd; /* MSS saved before performing PMTU-D BlackHole detection */ u_int32_t t_pmtud_start_ts; /* Time of PMTUD blackhole detection */ - + struct { u_int32_t rxduplicatebytes; @@ -443,8 +449,10 @@ struct tcpcb { u_int16_t unused_pad_to_8; u_int32_t rxmitpkts; } t_stat; - - /* Background congestion related state */ + u_int8_t t_notify_ack_count; + u_int8_t t_ecn_recv_ce_pkt; /* Received packet with CE-bit set (independent from last_ack_sent) */ + u_int16_t t_cached_maxopd; /* default for MSS adjustment using link status report */ + uint32_t bg_ssthresh; /* Slow start threshold until delay increases */ uint32_t t_flagsext; /* Another field to accommodate more flags */ #define TF_RXTFINDROP 0x1 /* Drop conn after retransmitting FIN 3 times */ @@ -457,7 +465,6 @@ struct tcpcb { #define TF_DETECT_READSTALL 0x80 /* Used to detect a stall during read operation */ #define TF_RECV_THROTTLE 0x100 /* Input throttling active */ #define TF_NOSTRETCHACK 0x200 /* ack every other packet */ -#define TF_STREAMEOW 0x400 /* Last packet was small indicating end of write */ #define TF_NOTIMEWAIT 0x800 /* Avoid going into time-wait */ #define TF_SENT_TLPROBE 0x1000 /* Sent data in PTO */ #define TF_PKTS_REORDERED 0x2000 /* Detected reordering */ @@ 
-470,6 +477,7 @@ struct tcpcb { #define TF_CWND_NONVALIDATED 0x100000 /* cwnd non validated */ #define TF_PROBING 0x200000 /* Trigger probe timeout */ #define TF_FASTOPEN 0x400000 /* TCP Fastopen is enabled */ +#define TF_REASS_INPROG 0x800000 /* Reassembly is in progress */ #if TRAFFIC_MGT /* Inter-arrival jitter related state */ @@ -483,7 +491,7 @@ struct tcpcb { uint32_t avg_iaj; /* Mean */ uint32_t std_dev_iaj; /* Standard deviation */ #endif /* TRAFFIC_MGT */ - struct bwmeas *t_bwmeas; /* State for bandwidth measurement */ + struct bwmeas *t_bwmeas; /* State for bandwidth measurement */ uint32_t t_lropktlen; /* Bytes in a LRO frame */ tcp_seq t_idleat; /* rcv_nxt at idle time */ TAILQ_ENTRY(tcpcb) t_twentry; /* link for time wait queue */ @@ -510,10 +518,10 @@ struct tcpcb { #define TMPF_PREESTABLISHED 0x00000001 /* conn in pre-established state */ #define TMPF_SENT_KEYS 0x00000002 /* indicates that keys were sent */ #define TMPF_MPTCP_TRUE 0x00000004 /* negotiated MPTCP successfully */ -#define TMPF_MPTCP_RCVD_KEY 0x00000008 /* state for 3-way handshake */ +#define TMPF_MPTCP_RCVD_KEY 0x00000008 /* state for 3-way handshake */ #define TMPF_SND_MPPRIO 0x00000010 /* send priority of subflow */ #define TMPF_SND_REM_ADDR 0x00000020 /* initiate address removal */ -#define TMPF_UNUSED 0x00000040 /* address addition acked by peer */ +#define TMPF_RCVD_DACK 0x00000040 /* received a data-ack */ #define TMPF_JOINED_FLOW 0x00000080 /* Indicates additional flow */ #define TMPF_BACKUP_PATH 0x00000100 /* Indicates backup path */ #define TMPF_MPTCP_ACKNOW 0x00000200 /* Send Data ACK */ @@ -524,7 +532,7 @@ struct tcpcb { #define TMPF_RECVD_JOIN 0x00004000 /* Received Join */ #define TMPF_RESET 0x00008000 /* Send RST */ #define TMPF_TCP_FALLBACK 0x00010000 /* Fallback to TCP */ -#define TMPF_FASTCLOSE 0x00020000 /* Send Fastclose option */ +#define TMPF_FASTCLOSERCV 0x00020000 /* Received Fastclose option */ #define TMPF_EMBED_DSN 0x00040000 /* tp has DSN mapping */ #define 
TMPF_MPTCP_READY 0x00080000 /* Can send DSS options on data */ #define TMPF_INFIN_SENT 0x00100000 /* Sent infinite mapping */ @@ -559,6 +567,11 @@ struct tcpcb { #define TFO_S_SYN_DATA_SENT 0x40 /* SYN+data sent */ #define TFO_S_SYN_DATA_ACKED 0x80 /* SYN+data has been acknowledged in SYN/ACK */ #define TFO_S_SYN_LOSS 0x0100 /* SYN+TFO has been lost - fallback to regular TCP */ +#define TFO_S_COOKIE_WRONG 0x0200 /* Cookie we sent in the SYN was wrong */ +#define TFO_S_NO_COOKIE_RCV 0x0400 /* We asked for a cookie but didn't get one */ +#define TFO_S_HEURISTICS_DISABLE 0x0800 /* TFO-heuristics disabled it for this connection */ +#define TFO_S_SEND_BLACKHOLE 0x1000 /* TFO got blackholed in the send direction */ +#define TFO_S_RECV_BLACKHOLE 0x2000 /* TFO got blackholed in the recv direction */ u_int16_t t_tfo_stats; u_int8_t t_tfo_probes; /* TFO-probes we did send */ @@ -590,23 +603,24 @@ struct tcpcb { #define TFO_PROBE_PROBING 1 /* Sending out TCP-keepalives waiting for reply */ #define TFO_PROBE_WAIT_DATA 2 /* Received reply, waiting for data */ u_int8_t t_tfo_probe_state; - + u_int32_t t_rcvoopack; /* out-of-order packets received */ u_int32_t t_pawsdrop; /* segments dropped due to PAWS */ u_int32_t t_sack_recovery_episode; /* SACK recovery episodes */ u_int32_t t_reordered_pkts; /* packets reorderd */ u_int32_t t_dsack_sent; /* Sent DSACK notification */ u_int32_t t_dsack_recvd; /* Received a valid DSACK option */ - u_int32_t t_recv_throttle_ts; + SLIST_HEAD(,tcp_notify_ack_marker) t_notify_ack; /* state for notifying data acknowledgements */ + u_int32_t t_recv_throttle_ts; /* TS for start of recv throttle */ }; #define IN_FASTRECOVERY(tp) (tp->t_flags & TF_FASTRECOVERY) #define SACK_ENABLED(tp) (tp->t_flagsext & TF_SACK_ENABLE) /* - * If the connection is in a throttled state due to advisory feedback from + * If the connection is in a throttled state due to advisory feedback from * the interface output queue, reset that state. 
We do this in favor - * of entering recovery because the data transfer during recovery + * of entering recovery because the data transfer during recovery * should be just a trickle and it will help to improve performance. * We also do not want to back off twice in the same RTT. */ @@ -630,7 +644,7 @@ struct tcpcb { } while(0) /* - * When the number of duplicate acks received is less than + * When the number of duplicate acks received is less than * the retransmit threshold, use Limited Transmit algorithm */ extern int tcprexmtthresh; @@ -690,6 +704,14 @@ extern int tcprexmtthresh; (SEQ_LEQ((_seq_), (_tp_)->snd_max) && \ SEQ_GEQ((_seq_), ((_una_) - TCP_DSACK_MAX_SEND_WINDOW(_tp_)))) +#define TCP_RESET_REXMT_STATE(_tp_) do { \ + (_tp_)->t_rxtshift = 0; \ + (_tp_)->t_rxtstart = 0; \ +} while(0); + +#define TCP_AUTORCVBUF_MAX(_ifp_) (((_ifp_) != NULL && \ + ((_ifp_)->if_eflags & IFEF_3CA)) ? tcp_autorcvbuf_max_ca : \ + tcp_autorcvbuf_max) enum tcp_cc_event { TCP_CC_CWND_INIT, /* 0 */ @@ -763,7 +785,7 @@ struct rmxp_tao { #define sototcpcb(so) (intotcpcb(sotoinpcb(so))) /* - * The rtt measured is in milliseconds as the timestamp granularity is + * The rtt measured is in milliseconds as the timestamp granularity is * a millisecond. The smoothed round-trip time and estimated variance * are stored as fixed point numbers scaled by the values below. * For convenience, these scales are also used in smoothing the average @@ -1133,6 +1155,15 @@ struct tcpstat { u_int32_t tcps_tfo_syn_data_acked;/* SYN+data has been acknowledged */ u_int32_t tcps_tfo_syn_loss; /* SYN+TFO has been lost and we fallback */ u_int32_t tcps_tfo_blackhole; /* TFO got blackholed by a middlebox. 
*/ + u_int32_t tcps_tfo_cookie_wrong; /* TFO-cookie we sent was wrong */ + u_int32_t tcps_tfo_no_cookie_rcv; /* We asked for a cookie but didn't get one */ + u_int32_t tcps_tfo_heuristics_disable; /* TFO got disabled due to heuristics */ + u_int32_t tcps_tfo_sndblackhole; /* TFO got blackholed in the sending direction */ + u_int32_t tcps_mss_to_default; /* Change MSS to default using link status report */ + u_int32_t tcps_mss_to_medium; /* Change MSS to medium using link status report */ + u_int32_t tcps_mss_to_low; /* Change MSS to low using link status report */ + u_int32_t tcps_ecn_fallback_droprst; /* ECN fallback caused by connection drop due to RST */ + u_int32_t tcps_ecn_fallback_droprxmt; /* ECN fallback due to drop after multiple retransmits */ }; @@ -1266,54 +1297,54 @@ struct xtcpcb_n { u_int64_t t_segq; int t_dupacks; /* consecutive dup acks recd */ - + int t_timer[TCPT_NTIMERS_EXT]; /* tcp timers */ - + int t_state; /* state of this connection */ u_int t_flags; - + int t_force; /* 1 if forcing out a byte */ - + tcp_seq snd_una; /* send unacknowledged */ tcp_seq snd_max; /* highest sequence number sent; - * used to recognize retransmits - */ + * used to recognize retransmits + */ tcp_seq snd_nxt; /* send next */ tcp_seq snd_up; /* send urgent pointer */ - + tcp_seq snd_wl1; /* window update seg seq number */ tcp_seq snd_wl2; /* window update seg ack number */ tcp_seq iss; /* initial send sequence number */ tcp_seq irs; /* initial receive sequence number */ - + tcp_seq rcv_nxt; /* receive next */ tcp_seq rcv_adv; /* advertised window */ u_int32_t rcv_wnd; /* receive window */ tcp_seq rcv_up; /* receive urgent pointer */ - + u_int32_t snd_wnd; /* send window */ u_int32_t snd_cwnd; /* congestion-controlled window */ u_int32_t snd_ssthresh; /* snd_cwnd size threshold for - * for slow start exponential to - * linear switch - */ + * for slow start exponential to + * linear switch + */ u_int t_maxopd; /* mss plus options */ - + u_int32_t t_rcvtime; /* time at 
which a packet was received */ u_int32_t t_starttime; /* time connection was established */ int t_rtttime; /* round trip time */ tcp_seq t_rtseq; /* sequence number being timed */ - + int t_rxtcur; /* current retransmit value (ticks) */ u_int t_maxseg; /* maximum segment size */ int t_srtt; /* smoothed round-trip time */ int t_rttvar; /* variance in round-trip time */ - + int t_rxtshift; /* log(2) of rexmt exp. backoff */ u_int t_rttmin; /* minimum rtt allowed */ u_int32_t t_rttupdated; /* number of times rtt sampled */ u_int32_t max_sndwnd; /* largest window peer has offered */ - + int t_softerror; /* possible error not yet reported */ /* out-of-band data */ char t_oobflags; /* have some */ @@ -1324,7 +1355,7 @@ struct xtcpcb_n { u_char request_r_scale; /* pending window scaling */ u_char requested_s_scale; u_int32_t ts_recent; /* timestamp echo data */ - + u_int32_t ts_recent_age; /* when last updated */ tcp_seq last_ack_sent; /* RFC 1644 variables */ @@ -1404,7 +1435,7 @@ extern int ss_fltsz_local; extern int tcp_do_rfc3390; /* Calculate ss_fltsz according to RFC 3390 */ extern int tcp_do_rfc1323; extern int target_qdelay; -extern u_int32_t tcp_now; /* for RFC 1323 timestamps */ +extern u_int32_t tcp_now; /* for RFC 1323 timestamps */ extern struct timeval tcp_uptime; extern lck_spin_t *tcp_uptime_lock; extern int tcp_delack_enabled; @@ -1417,11 +1448,9 @@ extern struct zone *tcp_reass_zone; extern struct zone *tcp_rxt_seg_zone; extern int tcp_ecn_outbound; extern int tcp_ecn_inbound; - - -#if CONFIG_IFEF_NOWINDOWSCALE -extern int tcp_obey_ifef_nowindowscale; -#endif +extern u_int32_t tcp_autorcvbuf_max; +extern u_int32_t tcp_autorcvbuf_max_ca; +extern u_int32_t tcp_autorcvbuf_inc_shift; struct protosw; struct domain; @@ -1430,7 +1459,8 @@ struct tcp_respond_args { unsigned int ifscope; unsigned int nocell:1, noexpensive:1, - awdl_unrestricted:1; + awdl_unrestricted:1, + intcoproc_allowed:1; }; void tcp_canceltimers(struct tcpcb *); @@ -1487,7 +1517,8 @@ void 
tcp_reset_stretch_ack(struct tcpcb *tp); extern void tcp_get_ports_used(u_int32_t, int, u_int32_t, bitstr_t *); uint32_t tcp_count_opportunistic(unsigned int ifindex, u_int32_t flags); uint32_t tcp_find_anypcb_byaddr(struct ifaddr *ifa); -void tcp_set_max_rwinscale(struct tcpcb *tp, struct socket *so); +void tcp_set_max_rwinscale(struct tcpcb *tp, struct socket *so, + u_int32_t maxrcvbuf); struct bwmeas* tcp_bwmeas_alloc(struct tcpcb *tp); void tcp_bwmeas_free(struct tcpcb *tp); extern int32_t timer_diff(uint32_t t1, uint32_t toff1, uint32_t t2, uint32_t toff2); @@ -1498,7 +1529,7 @@ extern void tcp_set_recv_bg(struct socket *); extern void tcp_clear_recv_bg(struct socket *); extern boolean_t tcp_sack_byte_islost(struct tcpcb *tp); #define IS_TCP_RECV_BG(_so) \ - ((_so)->so_traffic_mgt_flags & TRAFFIC_MGT_TCP_RECVBG) + ((_so)->so_flags1 & SOF1_TRAFFIC_MGT_TCP_RECVBG) #if TRAFFIC_MGT #define CLEAR_IAJ_STATE(_tp_) (_tp_)->iaj_rcv_ts = 0 @@ -1518,7 +1549,6 @@ lck_mtx_t * tcp_getlock (struct socket *, int); void * tcp_getlock (struct socket *, int); #endif - extern struct pr_usrreqs tcp_usrreqs; extern u_int32_t tcp_sendspace; extern u_int32_t tcp_recvspace; @@ -1540,14 +1570,29 @@ extern void tcp_probe_connectivity(struct ifnet *ifp, u_int32_t enable); extern void tcp_get_connectivity_status(struct tcpcb *, struct tcp_conn_status *); +extern void tcp_fill_keepalive_offload_frames(struct ifnet *, + struct ifnet_keepalive_offload_frame *, u_int32_t, size_t, u_int32_t *); + extern boolean_t tfo_enabled(const struct tcpcb *tp); extern void tcp_disable_tfo(struct tcpcb *tp); extern void tcp_tfo_gen_cookie(struct inpcb *inp, u_char *out, size_t blk_size); #define TCP_FASTOPEN_KEYLEN 16 +extern errno_t tcp_notify_ack_id_valid(struct tcpcb *, struct socket *, u_int32_t); +extern errno_t tcp_add_notify_ack_marker(struct tcpcb *, u_int32_t); +extern void tcp_notify_ack_free(struct tcpcb *); +extern void tcp_notify_acknowledgement(struct tcpcb *, struct socket *); +extern void 
tcp_get_notify_ack_count(struct tcpcb *, + struct tcp_notify_ack_complete *); +extern void tcp_get_notify_ack_ids(struct tcpcb *tp, + struct tcp_notify_ack_complete *); +extern void tcp_update_mss_locked(struct socket *, struct ifnet *); + +extern int get_tcp_inp_list(struct inpcb **, int, inp_gen_t); +extern bool tcp_notify_ack_active(struct socket *so); #if MPTCP extern int mptcp_input_preproc(struct tcpcb *, struct mbuf *, int); -extern void mptcp_output_csum(struct tcpcb *, struct mbuf *, int32_t, unsigned, +extern void mptcp_output_csum(struct tcpcb *, struct mbuf *, int32_t, unsigned, u_int64_t, u_int32_t *); extern int mptcp_adj_mss(struct tcpcb *, boolean_t); extern void mptcp_insert_rmap(struct tcpcb *, struct mbuf *); diff --git a/bsd/netinet/udp_usrreq.c b/bsd/netinet/udp_usrreq.c index 09a5a3631..765cd6826 100644 --- a/bsd/netinet/udp_usrreq.c +++ b/bsd/netinet/udp_usrreq.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2000-2015 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
* Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* @@ -58,7 +58,6 @@ * SUCH DAMAGE. * * @(#)udp_usrreq.c 8.6 (Berkeley) 5/23/95 - * $FreeBSD: src/sys/netinet/udp_usrreq.c,v 1.64.2.13 2001/08/08 18:59:54 ghelmer Exp $ */ #include @@ -85,6 +84,7 @@ #include #include +#include #include #if INET6 #include @@ -139,18 +139,18 @@ SYSCTL_INT(_net_inet_udp, UDPCTL_CHECKSUM, checksum, int udp_log_in_vain = 0; SYSCTL_INT(_net_inet_udp, OID_AUTO, log_in_vain, CTLFLAG_RW | CTLFLAG_LOCKED, - &udp_log_in_vain, 0, "Log all incoming UDP packets"); + &udp_log_in_vain, 0, "Log all incoming UDP packets"); static int blackhole = 0; SYSCTL_INT(_net_inet_udp, OID_AUTO, blackhole, CTLFLAG_RW | CTLFLAG_LOCKED, - &blackhole, 0, "Do not send port unreachables for refused connects"); + &blackhole, 0, "Do not send port unreachables for refused connects"); struct inpcbhead udb; /* from udp_var.h */ #define udb6 udb /* for KAME src sync over BSD*'s */ struct inpcbinfo udbinfo; #ifndef UDBHASHSIZE -#define UDBHASHSIZE 16 +#define UDBHASHSIZE 16 #endif /* Garbage collection performed during most recent udp_gc() run */ @@ -158,38 +158,38 @@ static boolean_t udp_gc_done = FALSE; #if IPFIREWALL extern int fw_verbose; -extern void ipfwsyslog( int level, const char *format,...); +extern void ipfwsyslog(int level, const char *format, ...); extern void ipfw_stealth_stats_incr_udp(void); /* Apple logging, log to ipfw.log */ -#define log_in_vain_log(a) { \ +#define log_in_vain_log(a) { \ if ((udp_log_in_vain == 3) && (fw_verbose == 2)) { \ ipfwsyslog a; \ } else if ((udp_log_in_vain == 4) && (fw_verbose == 2)) { \ - ipfw_stealth_stats_incr_udp(); \ + ipfw_stealth_stats_incr_udp(); \ } else { \ log a; \ } \ } #else /* !IPFIREWALL */ -#define log_in_vain_log( a ) { log a; } +#define log_in_vain_log(a) { log a; } #endif /* !IPFIREWALL */ static int udp_getstat SYSCTL_HANDLER_ARGS; struct udpstat 
udpstat; /* from udp_var.h */ SYSCTL_PROC(_net_inet_udp, UDPCTL_STATS, stats, - CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, - 0, 0, udp_getstat, "S,udpstat", - "UDP statistics (struct udpstat, netinet/udp_var.h)"); + CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, + 0, 0, udp_getstat, "S,udpstat", + "UDP statistics (struct udpstat, netinet/udp_var.h)"); SYSCTL_INT(_net_inet_udp, OID_AUTO, pcbcount, - CTLFLAG_RD | CTLFLAG_LOCKED, &udbinfo.ipi_count, 0, - "Number of active PCBs"); + CTLFLAG_RD | CTLFLAG_LOCKED, &udbinfo.ipi_count, 0, + "Number of active PCBs"); __private_extern__ int udp_use_randomport = 1; SYSCTL_INT(_net_inet_udp, OID_AUTO, randomize_ports, - CTLFLAG_RW | CTLFLAG_LOCKED, &udp_use_randomport, 0, - "Randomize UDP port numbers"); + CTLFLAG_RW | CTLFLAG_LOCKED, &udp_use_randomport, 0, + "Randomize UDP port numbers"); #if INET6 struct udp_in6 { @@ -319,7 +319,7 @@ udp_input(struct mbuf *m, int iphlen) udpstat.udps_ipackets++; - KERNEL_DEBUG(DBG_FNC_UDP_INPUT | DBG_FUNC_START, 0,0,0,0,0); + KERNEL_DEBUG(DBG_FNC_UDP_INPUT | DBG_FUNC_START, 0, 0, 0, 0, 0); /* Expect 32-bit aligned data pointer on strict-align platforms */ MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m); @@ -344,7 +344,7 @@ udp_input(struct mbuf *m, int iphlen) if (m == NULL) { udpstat.udps_hdrops++; KERNEL_DEBUG(DBG_FNC_UDP_INPUT | DBG_FUNC_END, - 0,0,0,0,0); + 0, 0, 0, 0, 0); return; } ip = mtod(m, struct ip *); @@ -433,8 +433,8 @@ udp_input(struct mbuf *m, int iphlen) /* NOTREACHED */ } #if INET6 - if ((inp->inp_vflag & INP_IPV4) == 0) - continue; + if ((inp->inp_vflag & INP_IPV4) == 0) + continue; #endif /* INET6 */ if (inp_restricted_recv(inp, ifp)) continue; @@ -514,7 +514,7 @@ udp_input(struct mbuf *m, int iphlen) #if NECP skipit = 0; - if (!necp_socket_is_allowed_to_send_recv_v4(inp, + if (!necp_socket_is_allowed_to_send_recv_v4(inp, uh->uh_dport, uh->uh_sport, &ip->ip_dst, &ip->ip_src, ifp, NULL, NULL)) { /* do not inject data to pcb */ @@ -580,7 +580,7 @@ udp_input(struct mbuf *m, int 
iphlen) /* free the extra copy of mbuf or skipped by IPSec */ if (m != NULL) m_freem(m); - KERNEL_DEBUG(DBG_FNC_UDP_INPUT | DBG_FUNC_END, 0,0,0,0,0); + KERNEL_DEBUG(DBG_FNC_UDP_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0); return; } @@ -602,7 +602,7 @@ udp_input(struct mbuf *m, int iphlen) payload_len)) == NULL) { udpstat.udps_hdrops++; KERNEL_DEBUG(DBG_FNC_UDP_INPUT | DBG_FUNC_END, - 0,0,0,0,0); + 0, 0, 0, 0, 0); return; } /* @@ -615,17 +615,17 @@ udp_input(struct mbuf *m, int iphlen) uh = (struct udphdr *)(void *)((caddr_t)ip + iphlen); } /* Check for NAT keepalive packet */ - if (payload_len == 1 && *(u_int8_t*) + if (payload_len == 1 && *(u_int8_t *) ((caddr_t)uh + sizeof (struct udphdr)) == 0xFF) { m_freem(m); KERNEL_DEBUG(DBG_FNC_UDP_INPUT | DBG_FUNC_END, - 0,0,0,0,0); + 0, 0, 0, 0, 0); return; - } else if (payload_len == 4 && *(u_int32_t*)(void *) + } else if (payload_len == 4 && *(u_int32_t *)(void *) ((caddr_t)uh + sizeof (struct udphdr)) != 0) { /* UDP encapsulated IPSec packet to pass through NAT */ KERNEL_DEBUG(DBG_FNC_UDP_INPUT | DBG_FUNC_END, - 0,0,0,0,0); + 0, 0, 0, 0, 0); /* preserve the udp header */ esp4_input(m, iphlen + sizeof (struct udphdr)); return; @@ -649,7 +649,7 @@ udp_input(struct mbuf *m, int iphlen) if (udp_log_in_vain < 3) { log(LOG_INFO, "Connection attempt to " "UDP %s:%d from %s:%d\n", inet_ntop(AF_INET, - &ip->ip_dst, buf, sizeof (buf)), + &ip->ip_dst, buf, sizeof (buf)), ntohs(uh->uh_dport), inet_ntop(AF_INET, &ip->ip_src, buf2, sizeof (buf2)), ntohs(uh->uh_sport)); @@ -679,7 +679,7 @@ udp_input(struct mbuf *m, int iphlen) *ip = save_ip; ip->ip_len += iphlen; icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PORT, 0, 0); - KERNEL_DEBUG(DBG_FNC_UDP_INPUT | DBG_FUNC_END, 0,0,0,0,0); + KERNEL_DEBUG(DBG_FNC_UDP_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0); return; } udp_lock(inp->inp_socket, 1, 0); @@ -752,13 +752,13 @@ udp_input(struct mbuf *m, int iphlen) sorwakeup(inp->inp_socket); } udp_unlock(inp->inp_socket, 1, 0); - KERNEL_DEBUG(DBG_FNC_UDP_INPUT | 
DBG_FUNC_END, 0,0,0,0,0); + KERNEL_DEBUG(DBG_FNC_UDP_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0); return; bad: m_freem(m); if (opts) m_freem(opts); - KERNEL_DEBUG(DBG_FNC_UDP_INPUT | DBG_FUNC_END, 0,0,0,0,0); + KERNEL_DEBUG(DBG_FNC_UDP_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0); } #if INET6 @@ -848,7 +848,7 @@ udp_append(struct inpcb *last, struct ip *ip, struct mbuf *n, int off, append_sa = (struct sockaddr *)pudp_in; if (nstat_collect) { INP_ADD_STAT(last, cell, wifi, wired, rxpackets, 1); - INP_ADD_STAT(last, cell, wifi, wired, rxbytes, + INP_ADD_STAT(last, cell, wifi, wired, rxbytes, n->m_pkthdr.len); } so_recv_data_stat(last->inp_socket, n, 0); @@ -863,7 +863,6 @@ udp_append(struct inpcb *last, struct ip *ip, struct mbuf *n, int off, error: m_freem(n); m_freem(opts); - return; } /* @@ -883,7 +882,7 @@ udp_ctlinput(int cmd, struct sockaddr *sa, void *vip) { struct ip *ip = vip; void (*notify)(struct inpcb *, int) = udp_notify; - struct in_addr faddr; + struct in_addr faddr; struct inpcb *inp; faddr = ((struct sockaddr_in *)(void *)sa)->sin_addr; @@ -903,7 +902,7 @@ udp_ctlinput(int cmd, struct sockaddr *sa, void *vip) bcopy(((caddr_t)ip + (ip->ip_hl << 2)), &uh, sizeof (uh)); inp = in_pcblookup_hash(&udbinfo, faddr, uh.uh_dport, - ip->ip_src, uh.uh_sport, 0, NULL); + ip->ip_src, uh.uh_sport, 0, NULL); if (inp != NULL && inp->inp_socket != NULL) { udp_lock(inp->inp_socket, 1, 0); if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == @@ -1002,7 +1001,7 @@ udp_ctloutput(struct socket *so, struct sockopt *sopt) UDP_KEEPALIVE_OFFLOAD_DATA_SIZE); if (inp->inp_keepalive_datalen > 0) { MALLOC(inp->inp_keepalive_data, - u_int8_t *, + u_int8_t *, inp->inp_keepalive_datalen, M_TEMP, M_WAITOK); if (inp->inp_keepalive_data == NULL) { @@ -1113,7 +1112,7 @@ udp_pcblist SYSCTL_HANDLER_ARGS } for (inp = LIST_FIRST(udbinfo.ipi_listhead), i = 0; inp && i < n; - inp = LIST_NEXT(inp, inp_list)) { + inp = LIST_NEXT(inp, inp_list)) { if (inp->inp_gencnt <= gencnt && inp->inp_state != INPCB_STATE_DEAD) 
inp_list[i++] = inp; @@ -1122,19 +1121,32 @@ udp_pcblist SYSCTL_HANDLER_ARGS error = 0; for (i = 0; i < n; i++) { + struct xinpcb xi; + inp = inp_list[i]; - if (inp->inp_gencnt <= gencnt && - inp->inp_state != INPCB_STATE_DEAD) { - struct xinpcb xi; - - bzero(&xi, sizeof (xi)); - xi.xi_len = sizeof (xi); - /* XXX should avoid extra copy */ - inpcb_to_compat(inp, &xi.xi_inp); - if (inp->inp_socket) - sotoxsocket(inp->inp_socket, &xi.xi_socket); - error = SYSCTL_OUT(req, &xi, sizeof (xi)); + + if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) + continue; + udp_lock(inp->inp_socket, 1, 0); + if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) { + udp_unlock(inp->inp_socket, 1, 0); + continue; + } + if (inp->inp_gencnt > gencnt) { + udp_unlock(inp->inp_socket, 1, 0); + continue; } + + bzero(&xi, sizeof (xi)); + xi.xi_len = sizeof (xi); + /* XXX should avoid extra copy */ + inpcb_to_compat(inp, &xi.xi_inp); + if (inp->inp_socket) + sotoxsocket(inp->inp_socket, &xi.xi_socket); + + udp_unlock(inp->inp_socket, 1, 0); + + error = SYSCTL_OUT(req, &xi, sizeof (xi)); } if (!error) { /* @@ -1157,53 +1169,53 @@ udp_pcblist SYSCTL_HANDLER_ARGS } SYSCTL_PROC(_net_inet_udp, UDPCTL_PCBLIST, pcblist, - CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, udp_pcblist, - "S,xinpcb", "List of active UDP sockets"); + CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, udp_pcblist, + "S,xinpcb", "List of active UDP sockets"); static int udp_pcblist64 SYSCTL_HANDLER_ARGS { #pragma unused(oidp, arg1, arg2) - int error, i, n; - struct inpcb *inp, **inp_list; - inp_gen_t gencnt; - struct xinpgen xig; - - /* - * The process of preparing the TCB list is too time-consuming and - * resource-intensive to repeat twice on every request. 
- */ - lck_rw_lock_shared(udbinfo.ipi_lock); - if (req->oldptr == USER_ADDR_NULL) { - n = udbinfo.ipi_count; - req->oldidx = + int error, i, n; + struct inpcb *inp, **inp_list; + inp_gen_t gencnt; + struct xinpgen xig; + + /* + * The process of preparing the TCB list is too time-consuming and + * resource-intensive to repeat twice on every request. + */ + lck_rw_lock_shared(udbinfo.ipi_lock); + if (req->oldptr == USER_ADDR_NULL) { + n = udbinfo.ipi_count; + req->oldidx = 2 * (sizeof (xig)) + (n + n/8) * sizeof (struct xinpcb64); - lck_rw_done(udbinfo.ipi_lock); - return (0); - } - - if (req->newptr != USER_ADDR_NULL) { - lck_rw_done(udbinfo.ipi_lock); - return (EPERM); - } - - /* - * OK, now we're committed to doing something. - */ - gencnt = udbinfo.ipi_gencnt; - n = udbinfo.ipi_count; - - bzero(&xig, sizeof (xig)); - xig.xig_len = sizeof (xig); - xig.xig_count = n; - xig.xig_gen = gencnt; - xig.xig_sogen = so_gencnt; - error = SYSCTL_OUT(req, &xig, sizeof (xig)); - if (error) { - lck_rw_done(udbinfo.ipi_lock); - return (error); - } + lck_rw_done(udbinfo.ipi_lock); + return (0); + } + + if (req->newptr != USER_ADDR_NULL) { + lck_rw_done(udbinfo.ipi_lock); + return (EPERM); + } + + /* + * OK, now we're committed to doing something. 
+ */ + gencnt = udbinfo.ipi_gencnt; + n = udbinfo.ipi_count; + + bzero(&xig, sizeof (xig)); + xig.xig_len = sizeof (xig); + xig.xig_count = n; + xig.xig_gen = gencnt; + xig.xig_sogen = so_gencnt; + error = SYSCTL_OUT(req, &xig, sizeof (xig)); + if (error) { + lck_rw_done(udbinfo.ipi_lock); + return (error); + } /* * We are done if there is no pcb */ @@ -1212,58 +1224,71 @@ udp_pcblist64 SYSCTL_HANDLER_ARGS return (0); } - inp_list = _MALLOC(n * sizeof (*inp_list), M_TEMP, M_WAITOK); - if (inp_list == 0) { - lck_rw_done(udbinfo.ipi_lock); - return (ENOMEM); - } + inp_list = _MALLOC(n * sizeof (*inp_list), M_TEMP, M_WAITOK); + if (inp_list == 0) { + lck_rw_done(udbinfo.ipi_lock); + return (ENOMEM); + } - for (inp = LIST_FIRST(udbinfo.ipi_listhead), i = 0; inp && i < n; - inp = LIST_NEXT(inp, inp_list)) { - if (inp->inp_gencnt <= gencnt && + for (inp = LIST_FIRST(udbinfo.ipi_listhead), i = 0; inp && i < n; + inp = LIST_NEXT(inp, inp_list)) { + if (inp->inp_gencnt <= gencnt && inp->inp_state != INPCB_STATE_DEAD) - inp_list[i++] = inp; - } - n = i; - - error = 0; - for (i = 0; i < n; i++) { - inp = inp_list[i]; - if (inp->inp_gencnt <= gencnt && - inp->inp_state != INPCB_STATE_DEAD) { - struct xinpcb64 xi; - - bzero(&xi, sizeof (xi)); - xi.xi_len = sizeof (xi); - inpcb_to_xinpcb64(inp, &xi); - if (inp->inp_socket) - sotoxsocket64(inp->inp_socket, &xi.xi_socket); - error = SYSCTL_OUT(req, &xi, sizeof (xi)); - } - } - if (!error) { - /* - * Give the user an updated idea of our state. - * If the generation differs from what we told - * her before, she knows that something happened - * while we were processing this request, and it - * might be necessary to retry. 
- */ - bzero(&xig, sizeof (xig)); - xig.xig_len = sizeof (xig); - xig.xig_gen = udbinfo.ipi_gencnt; - xig.xig_sogen = so_gencnt; - xig.xig_count = udbinfo.ipi_count; - error = SYSCTL_OUT(req, &xig, sizeof (xig)); - } - FREE(inp_list, M_TEMP); - lck_rw_done(udbinfo.ipi_lock); - return (error); + inp_list[i++] = inp; + } + n = i; + + error = 0; + for (i = 0; i < n; i++) { + struct xinpcb64 xi; + + inp = inp_list[i]; + + if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) + continue; + udp_lock(inp->inp_socket, 1, 0); + if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) { + udp_unlock(inp->inp_socket, 1, 0); + continue; + } + if (inp->inp_gencnt > gencnt) { + udp_unlock(inp->inp_socket, 1, 0); + continue; + } + + bzero(&xi, sizeof (xi)); + xi.xi_len = sizeof (xi); + inpcb_to_xinpcb64(inp, &xi); + if (inp->inp_socket) + sotoxsocket64(inp->inp_socket, &xi.xi_socket); + + udp_unlock(inp->inp_socket, 1, 0); + + error = SYSCTL_OUT(req, &xi, sizeof (xi)); + } + if (!error) { + /* + * Give the user an updated idea of our state. + * If the generation differs from what we told + * her before, she knows that something happened + * while we were processing this request, and it + * might be necessary to retry. 
+ */ + bzero(&xig, sizeof (xig)); + xig.xig_len = sizeof (xig); + xig.xig_gen = udbinfo.ipi_gencnt; + xig.xig_sogen = so_gencnt; + xig.xig_count = udbinfo.ipi_count; + error = SYSCTL_OUT(req, &xig, sizeof (xig)); + } + FREE(inp_list, M_TEMP); + lck_rw_done(udbinfo.ipi_lock); + return (error); } SYSCTL_PROC(_net_inet_udp, OID_AUTO, pcblist64, - CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, udp_pcblist64, - "S,xinpcb64", "List of active UDP sockets"); + CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, udp_pcblist64, + "S,xinpcb64", "List of active UDP sockets"); static int @@ -1274,8 +1299,8 @@ udp_pcblist_n SYSCTL_HANDLER_ARGS } SYSCTL_PROC(_net_inet_udp, OID_AUTO, pcblist_n, - CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, udp_pcblist_n, - "S,xinpcb_n", "List of active UDP sockets"); + CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, udp_pcblist_n, + "S,xinpcb_n", "List of active UDP sockets"); __private_extern__ void udp_get_ports_used(uint32_t ifindex, int protocol, uint32_t flags, @@ -1386,10 +1411,11 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, struct ip_moptions *mopts; struct route ro; struct ip_out_args ipoa = - { IFSCOPE_NONE, { 0 }, IPOAF_SELECT_SRCIF, 0 }; + { IFSCOPE_NONE, { 0 }, IPOAF_SELECT_SRCIF, 0, 0, 0 }; struct ifnet *outif = NULL; struct flowadv *adv = &ipoa.ipoa_flowadv; - mbuf_svc_class_t msc = MBUF_SC_UNSPEC; + int sotc = SO_TC_UNSPEC; + int netsvctype = _NET_SERVICE_TYPE_UNSPEC; struct ifnet *origoutifp = NULL; int flowadv = 0; @@ -1397,11 +1423,11 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, flowadv = (so->so_state & SS_ISCONNECTED) ? 
1 : 0; pi_laddr.s_addr = INADDR_ANY; - KERNEL_DEBUG(DBG_FNC_UDP_OUTPUT | DBG_FUNC_START, 0,0,0,0,0); + KERNEL_DEBUG(DBG_FNC_UDP_OUTPUT | DBG_FUNC_START, 0, 0, 0, 0, 0); lck_mtx_assert(&inp->inpcb_mtx, LCK_MTX_ASSERT_OWNED); if (control != NULL) { - msc = mbuf_service_class_from_control(control); + sotc = so_tc_from_control(control, &netsvctype); VERIFY(outif == NULL); error = udp_check_pktinfo(control, &outif, &pi_laddr); m_freem(control); @@ -1412,6 +1438,10 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, if (outif != NULL) ipoa.ipoa_boundif = outif->if_index; } + if (sotc == SO_TC_UNSPEC) { + sotc = so->so_traffic_class; + netsvctype = so->so_netsvctype; + } KERNEL_DEBUG(DBG_LAYER_OUT_BEG, inp->inp_fport, inp->inp_lport, inp->inp_laddr.s_addr, inp->inp_faddr.s_addr, @@ -1450,6 +1480,8 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, ipoa.ipoa_flags |= IPOAF_NO_EXPENSIVE; if (INP_AWDL_UNRESTRICTED(inp)) ipoa.ipoa_flags |= IPOAF_AWDL_UNRESTRICTED; + ipoa.ipoa_sotc = sotc; + ipoa.ipoa_netsvctype = netsvctype; soopts |= IP_OUTARGS; /* @@ -1500,7 +1532,7 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, origladdr.s_addr = INADDR_ANY; /* we don't want to keep the laddr or route */ udp_dodisconnect = 1; - /* remember we don't care about src addr.*/ + /* remember we don't care about src addr */ inp->inp_flags |= INP_INADDR_ANY; } else { origladdr = laddr = inp->inp_laddr; @@ -1546,8 +1578,7 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, /* synch up in case in_pcbladdr() overrides */ if (outif != NULL && ipoa.ipoa_boundif != IFSCOPE_NONE) ipoa.ipoa_boundif = outif->if_index; - } - else { + } else { /* * Fast path case * @@ -1560,7 +1591,7 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, */ if (laddr.s_addr == INADDR_ANY) { if ((error = in_pcbladdr(inp, addr, &laddr, - ipoa.ipoa_boundif, &outif)) != 0) + ipoa.ipoa_boundif, &outif, 0)) != 0) goto release; /* 
* from pcbconnect: remember we don't @@ -1631,12 +1662,45 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, udpstat.udps_opackets++; KERNEL_DEBUG(DBG_LAYER_OUT_END, ui->ui_dport, ui->ui_sport, - ui->ui_src.s_addr, ui->ui_dst.s_addr, ui->ui_ulen); + ui->ui_src.s_addr, ui->ui_dst.s_addr, ui->ui_ulen); #if NECP { necp_kernel_policy_id policy_id; u_int32_t route_rule_id; + + /* + * We need a route to perform NECP route rule checks + */ + if (net_qos_policy_restricted != 0 && + ROUTE_UNUSABLE(&inp->inp_route)) { + struct sockaddr_in to; + struct sockaddr_in from; + + ROUTE_RELEASE(&inp->inp_route); + + bzero(&from, sizeof(struct sockaddr_in)); + from.sin_family = AF_INET; + from.sin_len = sizeof(struct sockaddr_in); + from.sin_addr = laddr; + + bzero(&to, sizeof(struct sockaddr_in)); + to.sin_family = AF_INET; + to.sin_len = sizeof(struct sockaddr_in); + to.sin_addr = faddr; + + inp->inp_route.ro_dst.sa_family = AF_INET; + inp->inp_route.ro_dst.sa_len = sizeof(struct sockaddr_in); + ((struct sockaddr_in *)(void *)&inp->inp_route.ro_dst)->sin_addr = + faddr; + + rtalloc_scoped(&inp->inp_route, ipoa.ipoa_boundif); + + inp_update_necp_policy(inp, (struct sockaddr *)&from, + (struct sockaddr *)&to, ipoa.ipoa_boundif); + inp->inp_policyresult.results.qos_marking_gencount = 0; + } + if (!necp_socket_is_allowed_to_send_recv_v4(inp, lport, fport, &laddr, &faddr, NULL, &policy_id, &route_rule_id)) { error = EHOSTUNREACH; @@ -1644,8 +1708,15 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, } necp_mark_packet_from_socket(m, inp, policy_id, route_rule_id); + + if (net_qos_policy_restricted != 0) { + necp_socket_update_qos_marking(inp, + inp->inp_route.ro_rt, NULL, route_rule_id); + } } #endif /* NECP */ + if ((so->so_flags1 & SOF1_QOSMARKING_ALLOWED)) + ipoa.ipoa_flags |= IPOAF_QOSMARKING_ALLOWED; #if IPSEC if (inp->inp_sp != NULL && ipsec_setsocket(m, inp->inp_socket) != 0) { @@ -1671,7 +1742,7 @@ udp_output(struct inpcb *inp, struct 
mbuf *m, struct sockaddr *addr, /* Copy the cached route and take an extra reference */ inp_route_copyout(inp, &ro); - set_packet_service_class(m, so, msc, 0); + set_packet_service_class(m, so, sotc, 0); m->m_pkthdr.pkt_flowsrc = FLOWSRC_INPCB; m->m_pkthdr.pkt_flowid = inp->inp_flowhash; m->m_pkthdr.pkt_proto = IPPROTO_UDP; @@ -1710,7 +1781,8 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, if (flowadv && (adv->code == FADV_FLOW_CONTROLLED || adv->code == FADV_SUSPENDED)) { - /* return a hint to the application that + /* + * return a hint to the application that * the packet has been dropped */ error = ENOBUFS; @@ -1811,26 +1883,26 @@ sysctl_udp_sospace(struct sysctl_oid *oidp, void *arg1, int arg2, space_p = &udp_sendspace; break; default: - return EINVAL; + return (EINVAL); } - error = sysctl_io_number(req, *space_p, sizeof (u_int32_t), + error = sysctl_io_number(req, *space_p, sizeof (u_int32_t), &new_value, &changed); - if (changed) { - if (new_value > 0 && new_value <= sb_effective_max) - *space_p = new_value; - else - error = ERANGE; - } - return (error); + if (changed) { + if (new_value > 0 && new_value <= sb_effective_max) + *space_p = new_value; + else + error = ERANGE; + } + return (error); } SYSCTL_PROC(_net_inet_udp, UDPCTL_RECVSPACE, recvspace, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &udp_recvspace, 0, - &sysctl_udp_sospace, "IU", "Maximum incoming UDP datagram size"); + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &udp_recvspace, 0, + &sysctl_udp_sospace, "IU", "Maximum incoming UDP datagram size"); SYSCTL_PROC(_net_inet_udp, UDPCTL_MAXDGRAM, maxdgram, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &udp_sendspace, 0, - &sysctl_udp_sospace, "IU", "Maximum outgoing UDP datagram size"); + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &udp_sendspace, 0, + &sysctl_udp_sospace, "IU", "Maximum outgoing UDP datagram size"); static int udp_abort(struct socket *so) @@ -1856,7 +1928,7 @@ udp_attach(struct socket *so, int proto, struct proc 
*p) inp = sotoinpcb(so); if (inp != NULL) { - panic ("%s so=%p inp=%p\n", __func__, so, inp); + panic("%s so=%p inp=%p\n", __func__, so, inp); /* NOTREACHED */ } error = in_pcballoc(so, &udbinfo, p); @@ -1961,7 +2033,7 @@ udp_connectx_common(struct socket *so, int af, inp_update_necp_policy(inp, src_se ? src_se->se_addr : NULL, dst_se ? dst_se->se_addr : NULL, ifscope); #endif /* NECP */ - + /* bind socket to the specified interface, if requested */ if (ifscope != IFSCOPE_NONE && (error = inp_bindif(inp, ifscope, NULL)) != 0) @@ -1994,7 +2066,7 @@ udp_connectx_common(struct socket *so, int af, /* * If there is data, copy it. DATA_IDEMPOTENT is ignored. - * CONNECT_RESUME_ON_READ_WRITE is ignored. + * CONNECT_RESUME_ON_READ_WRITE is ignored. */ if (uio != NULL) { socket_unlock(so, 0); @@ -2003,14 +2075,14 @@ udp_connectx_common(struct socket *so, int af, datalen = uio_resid(uio); error = so->so_proto->pr_usrreqs->pru_sosend(so, NULL, - (uio_t)uio, NULL, NULL, 0); + (uio_t)uio, NULL, NULL, 0); socket_lock(so, 0); /* If error returned is EMSGSIZE, for example, disconnect */ if (error == 0 || error == EWOULDBLOCK) *bytes_written = datalen - uio_resid(uio); else - (void)so->so_proto->pr_usrreqs->pru_disconnectx(so, + (void) so->so_proto->pr_usrreqs->pru_disconnectx(so, SAE_ASSOCID_ANY, SAE_CONNID_ANY); /* * mask the EWOULDBLOCK error so that the caller @@ -2049,8 +2121,8 @@ udp_detach(struct socket *so) /* * If this is a socket that does not want to wakeup the device - * for it's traffic, the application might be waiting for - * close to complete before going to sleep. Send a notification + * for it's traffic, the application might be waiting for + * close to complete before going to sleep. 
Send a notification * for this kind of sockets */ if (so->so_options & SO_NOWAKEFROMSLEEP) @@ -2119,7 +2191,8 @@ udp_send(struct socket *so, int flags, struct mbuf *m, #if FLOW_DIVERT if (necp_socket_should_use_flow_divert(inp)) { /* Implicit connect */ - return (flow_divert_implicit_data_out(so, flags, m, addr, control, p)); + return (flow_divert_implicit_data_out(so, flags, m, addr, + control, p)); } #endif /* FLOW_DIVERT */ #endif /* NECP */ @@ -2271,8 +2344,6 @@ udp_gc(struct inpcbinfo *ipi) } } lck_rw_done(ipi->ipi_lock); - - return; } static int @@ -2394,12 +2465,6 @@ udp_input_checksum(struct mbuf *m, struct udphdr *uh, int off, int ulen) return (0); } -extern void -udp_fill_keepalive_offload_frames(ifnet_t ifp, - struct ifnet_keepalive_offload_frame *frames_array, - u_int32_t frames_array_count, size_t frame_data_offset, - u_int32_t *used_frames_count); - void udp_fill_keepalive_offload_frames(ifnet_t ifp, struct ifnet_keepalive_offload_frame *frames_array, @@ -2469,7 +2534,7 @@ udp_fill_keepalive_offload_frames(ifnet_t ifp, continue; } if ((inp->inp_vflag & INP_IPV4)) { - if ((frame_data_offset + sizeof(struct udpiphdr) + + if ((frame_data_offset + sizeof(struct udpiphdr) + inp->inp_keepalive_datalen) > IFNET_KEEPALIVE_OFFLOAD_FRAME_DATA_SIZE) { udp_unlock(so, 1, 0); diff --git a/bsd/netinet/udp_var.h b/bsd/netinet/udp_var.h index c82931b50..e95dd8aeb 100644 --- a/bsd/netinet/udp_var.h +++ b/bsd/netinet/udp_var.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008-2013 Apple Inc. All rights reserved. + * Copyright (c) 2008-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -178,6 +178,10 @@ extern lck_mtx_t *udp_getlock(struct socket *, int); extern void udp_get_ports_used(u_int32_t, int, u_int32_t, bitstr_t *); extern uint32_t udp_count_opportunistic(unsigned int, u_int32_t); extern uint32_t udp_find_anypcb_byaddr(struct ifaddr *); + +extern void udp_fill_keepalive_offload_frames(struct ifnet *, + struct ifnet_keepalive_offload_frame *, u_int32_t, size_t, u_int32_t *); + __END_DECLS #endif /* BSD_KERNEL_PRIVATE */ #endif /* _NETINET_UDP_VAR_H_ */ diff --git a/bsd/netinet6/Makefile b/bsd/netinet6/Makefile index 2e6677f6c..039ad151d 100644 --- a/bsd/netinet6/Makefile +++ b/bsd/netinet6/Makefile @@ -3,7 +3,6 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - include $(MakeInc_cmd) include $(MakeInc_def) diff --git a/bsd/netinet6/ah_core.c b/bsd/netinet6/ah_core.c index 60a6e5b69..b072b692e 100644 --- a/bsd/netinet6/ah_core.c +++ b/bsd/netinet6/ah_core.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2008-2011 Apple Inc. All rights reserved. + * Copyright (c) 2008-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ @@ -161,8 +161,7 @@ static void ah_update_mbuf(struct mbuf *, int, int, const struct ah_algorithm *, struct ah_algorithm_state *); const struct ah_algorithm * -ah_algorithm_lookup(idx) - int idx; +ah_algorithm_lookup(int idx) { /* checksum algorithms */ static struct ah_algorithm hmac_md5 = @@ -228,8 +227,7 @@ ah_algorithm_lookup(idx) static int -ah_sumsiz_1216(sav) - struct secasvar *sav; +ah_sumsiz_1216(struct secasvar *sav) { if (!sav) return -1; @@ -240,8 +238,7 @@ ah_sumsiz_1216(sav) } static int -ah_sumsiz_zero(sav) - struct secasvar *sav; +ah_sumsiz_zero(struct secasvar *sav) { if (!sav) return -1; @@ -249,8 +246,7 @@ ah_sumsiz_zero(sav) } static int -ah_none_mature(sav) - struct secasvar *sav; +ah_none_mature(struct secasvar *sav) { if (sav->sah->saidx.proto == IPPROTO_AH) { ipseclog((LOG_ERR, @@ -294,9 +290,7 @@ ah_keyed_md5_mature( } static int -ah_keyed_md5_init(state, sav) - struct ah_algorithm_state *state; - struct secasvar *sav; +ah_keyed_md5_init(struct ah_algorithm_state *state, struct secasvar *sav) { size_t padlen; size_t keybitlen; @@ -352,10 +346,7 @@ ah_keyed_md5_init(state, sav) } static void -ah_keyed_md5_loop(state, addr, len) - struct ah_algorithm_state *state; - caddr_t addr; - size_t len; +ah_keyed_md5_loop(struct ah_algorithm_state *state, caddr_t addr, size_t len) { if (!state) panic("ah_keyed_md5_loop: what?"); @@ -364,10 +355,7 @@ ah_keyed_md5_loop(state, addr, len) } static void -ah_keyed_md5_result(state, addr, l) - struct ah_algorithm_state *state; - caddr_t addr; - 
size_t l; +ah_keyed_md5_result(struct ah_algorithm_state *state, caddr_t addr, size_t l) { u_char digest[16] __attribute__((aligned(4))); @@ -385,8 +373,7 @@ ah_keyed_md5_result(state, addr, l) } static int -ah_keyed_sha1_mature(sav) - struct secasvar *sav; +ah_keyed_sha1_mature(struct secasvar *sav) { const struct ah_algorithm *algo; @@ -413,9 +400,7 @@ ah_keyed_sha1_mature(sav) } static int -ah_keyed_sha1_init(state, sav) - struct ah_algorithm_state *state; - struct secasvar *sav; +ah_keyed_sha1_init(struct ah_algorithm_state *state, struct secasvar *sav) { SHA1_CTX *ctxt; size_t padlen; @@ -471,10 +456,7 @@ ah_keyed_sha1_init(state, sav) } static void -ah_keyed_sha1_loop(state, addr, len) - struct ah_algorithm_state *state; - caddr_t addr; - size_t len; +ah_keyed_sha1_loop(struct ah_algorithm_state *state, caddr_t addr, size_t len) { SHA1_CTX *ctxt; @@ -486,10 +468,7 @@ ah_keyed_sha1_loop(state, addr, len) } static void -ah_keyed_sha1_result(state, addr, l) - struct ah_algorithm_state *state; - caddr_t addr; - size_t l; +ah_keyed_sha1_result(struct ah_algorithm_state *state, caddr_t addr, size_t l) { u_char digest[SHA1_RESULTLEN] __attribute__((aligned(4))); /* SHA-1 generates 160 bits */ SHA1_CTX *ctxt; @@ -509,8 +488,7 @@ ah_keyed_sha1_result(state, addr, l) } static int -ah_hmac_md5_mature(sav) - struct secasvar *sav; +ah_hmac_md5_mature(struct secasvar *sav) { const struct ah_algorithm *algo; @@ -537,9 +515,7 @@ ah_hmac_md5_mature(sav) } static int -ah_hmac_md5_init(state, sav) - struct ah_algorithm_state *state; - struct secasvar *sav; +ah_hmac_md5_init(struct ah_algorithm_state *state, struct secasvar *sav) { u_char *ipad; u_char *opad; @@ -590,10 +566,7 @@ ah_hmac_md5_init(state, sav) } static void -ah_hmac_md5_loop(state, addr, len) - struct ah_algorithm_state *state; - caddr_t addr; - size_t len; +ah_hmac_md5_loop(struct ah_algorithm_state *state, caddr_t addr, size_t len) { MD5_CTX *ctxt; @@ -604,10 +577,7 @@ ah_hmac_md5_loop(state, addr, len) } static 
void -ah_hmac_md5_result(state, addr, l) - struct ah_algorithm_state *state; - caddr_t addr; - size_t l; +ah_hmac_md5_result(struct ah_algorithm_state *state, caddr_t addr, size_t l) { u_char digest[16] __attribute__((aligned(4))); u_char *ipad; @@ -634,8 +604,7 @@ ah_hmac_md5_result(state, addr, l) } static int -ah_hmac_sha1_mature(sav) - struct secasvar *sav; +ah_hmac_sha1_mature(struct secasvar *sav) { const struct ah_algorithm *algo; @@ -662,9 +631,7 @@ ah_hmac_sha1_mature(sav) } static int -ah_hmac_sha1_init(state, sav) - struct ah_algorithm_state *state; - struct secasvar *sav; +ah_hmac_sha1_init(struct ah_algorithm_state *state, struct secasvar *sav) { u_char *ipad; u_char *opad; @@ -716,10 +683,7 @@ ah_hmac_sha1_init(state, sav) } static void -ah_hmac_sha1_loop(state, addr, len) - struct ah_algorithm_state *state; - caddr_t addr; - size_t len; +ah_hmac_sha1_loop(struct ah_algorithm_state *state, caddr_t addr, size_t len) { SHA1_CTX *ctxt; @@ -731,10 +695,7 @@ ah_hmac_sha1_loop(state, addr, len) } static void -ah_hmac_sha1_result(state, addr, l) - struct ah_algorithm_state *state; - caddr_t addr; - size_t l; +ah_hmac_sha1_result(struct ah_algorithm_state *state, caddr_t addr, size_t l) { u_char digest[SHA1_RESULTLEN] __attribute__((aligned(4))); /* SHA-1 generates 160 bits */ u_char *ipad; @@ -762,8 +723,7 @@ ah_hmac_sha1_result(state, addr, l) #if AH_ALL_CRYPTO static int -ah_sumsiz_sha2_256(sav) - struct secasvar *sav; +ah_sumsiz_sha2_256(struct secasvar *sav) { if (!sav) return -1; @@ -772,8 +732,7 @@ ah_sumsiz_sha2_256(sav) } static int -ah_hmac_sha2_256_mature(sav) - struct secasvar *sav; +ah_hmac_sha2_256_mature(struct secasvar *sav) { const struct ah_algorithm *algo; @@ -802,9 +761,7 @@ ah_hmac_sha2_256_mature(sav) } static int -ah_hmac_sha2_256_init(state, sav) - struct ah_algorithm_state *state; - struct secasvar *sav; +ah_hmac_sha2_256_init(struct ah_algorithm_state *state, struct secasvar *sav) { u_char *ipad; u_char *opad; @@ -859,10 +816,9 @@ 
ah_hmac_sha2_256_init(state, sav) } static void -ah_hmac_sha2_256_loop(state, addr, len) - struct ah_algorithm_state *state; - caddr_t addr; - size_t len; +ah_hmac_sha2_256_loop(struct ah_algorithm_state *state, + caddr_t addr, + size_t len) { SHA256_CTX *ctxt; @@ -874,10 +830,9 @@ ah_hmac_sha2_256_loop(state, addr, len) } static void -ah_hmac_sha2_256_result(state, addr, l) - struct ah_algorithm_state *state; - caddr_t addr; - size_t l; +ah_hmac_sha2_256_result(struct ah_algorithm_state *state, + caddr_t addr, + size_t l) { u_char digest[SHA256_DIGEST_LENGTH] __attribute__((aligned(4))); u_char *ipad; @@ -904,8 +859,7 @@ ah_hmac_sha2_256_result(state, addr, l) } static int -ah_sumsiz_sha2_384(sav) - struct secasvar *sav; +ah_sumsiz_sha2_384(struct secasvar *sav) { if (!sav) return -1; @@ -914,8 +868,7 @@ ah_sumsiz_sha2_384(sav) } static int -ah_hmac_sha2_384_mature(sav) - struct secasvar *sav; +ah_hmac_sha2_384_mature(struct secasvar *sav) { const struct ah_algorithm *algo; @@ -944,9 +897,7 @@ ah_hmac_sha2_384_mature(sav) } static int -ah_hmac_sha2_384_init(state, sav) - struct ah_algorithm_state *state; - struct secasvar *sav; +ah_hmac_sha2_384_init(struct ah_algorithm_state *state, struct secasvar *sav) { u_char *ipad; u_char *opad; @@ -1001,10 +952,9 @@ ah_hmac_sha2_384_init(state, sav) } static void -ah_hmac_sha2_384_loop(state, addr, len) - struct ah_algorithm_state *state; - caddr_t addr; - size_t len; +ah_hmac_sha2_384_loop(struct ah_algorithm_state *state, + caddr_t addr, + size_t len) { SHA384_CTX *ctxt; @@ -1016,10 +966,9 @@ ah_hmac_sha2_384_loop(state, addr, len) } static void -ah_hmac_sha2_384_result(state, addr, l) - struct ah_algorithm_state *state; - caddr_t addr; - size_t l; +ah_hmac_sha2_384_result(struct ah_algorithm_state *state, + caddr_t addr, + size_t l) { u_char digest[SHA384_DIGEST_LENGTH]; u_char *ipad; @@ -1046,8 +995,7 @@ ah_hmac_sha2_384_result(state, addr, l) } static int -ah_sumsiz_sha2_512(sav) - struct secasvar *sav; 
+ah_sumsiz_sha2_512(struct secasvar *sav) { if (!sav) return -1; @@ -1056,8 +1004,7 @@ ah_sumsiz_sha2_512(sav) } static int -ah_hmac_sha2_512_mature(sav) - struct secasvar *sav; +ah_hmac_sha2_512_mature(struct secasvar *sav) { const struct ah_algorithm *algo; @@ -1086,9 +1033,7 @@ ah_hmac_sha2_512_mature(sav) } static int -ah_hmac_sha2_512_init(state, sav) - struct ah_algorithm_state *state; - struct secasvar *sav; +ah_hmac_sha2_512_init(struct ah_algorithm_state *state, struct secasvar *sav) { u_char *ipad; u_char *opad; @@ -1143,10 +1088,9 @@ ah_hmac_sha2_512_init(state, sav) } static void -ah_hmac_sha2_512_loop(state, addr, len) - struct ah_algorithm_state *state; - caddr_t addr; - size_t len; +ah_hmac_sha2_512_loop(struct ah_algorithm_state *state, + caddr_t addr, + size_t len) { SHA512_CTX *ctxt; @@ -1158,10 +1102,9 @@ ah_hmac_sha2_512_loop(state, addr, len) } static void -ah_hmac_sha2_512_result(state, addr, l) - struct ah_algorithm_state *state; - caddr_t addr; - size_t l; +ah_hmac_sha2_512_result(struct ah_algorithm_state *state, + caddr_t addr, + size_t l) { u_char digest[SHA512_DIGEST_LENGTH] __attribute__((aligned(4))); u_char *ipad; @@ -1194,12 +1137,9 @@ ah_hmac_sha2_512_result(state, addr, l) * go generate the checksum. */ static void -ah_update_mbuf(m, off, len, algo, algos) - struct mbuf *m; - int off; - int len; - const struct ah_algorithm *algo; - struct ah_algorithm_state *algos; +ah_update_mbuf(struct mbuf *m,int off, int len, + const struct ah_algorithm *algo, + struct ah_algorithm_state *algos) { struct mbuf *n; int tlen; @@ -1244,12 +1184,8 @@ ah_update_mbuf(m, off, len, algo, algos) * Don't use m_copy(), it will try to share cluster mbuf by using refcnt. 
*/ int -ah4_calccksum(m, ahdat, len, algo, sav) - struct mbuf *m; - caddr_t ahdat; - size_t len; - const struct ah_algorithm *algo; - struct secasvar *sav; +ah4_calccksum(struct mbuf *m, caddr_t ahdat, size_t len, + const struct ah_algorithm *algo, struct secasvar *sav) { int off; int hdrtype; @@ -1488,12 +1424,8 @@ ah4_calccksum(m, ahdat, len, algo, sav) * Don't use m_copy(), it will try to share cluster mbuf by using refcnt. */ int -ah6_calccksum(m, ahdat, len, algo, sav) - struct mbuf *m; - caddr_t ahdat; - size_t len; - const struct ah_algorithm *algo; - struct secasvar *sav; +ah6_calccksum(struct mbuf *m, caddr_t ahdat, size_t len, + const struct ah_algorithm *algo, struct secasvar *sav) { int newoff, off; int proto, nxt; diff --git a/bsd/netinet6/ah_input.c b/bsd/netinet6/ah_input.c index bfd976bb7..a6054b601 100644 --- a/bsd/netinet6/ah_input.c +++ b/bsd/netinet6/ah_input.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2008-2013 Apple Inc. All rights reserved. + * Copyright (c) 2008-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
* Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ @@ -404,6 +404,7 @@ ah4_input(struct mbuf *m, int off) if (ipsec4_tunnel_validate(m, off + stripsiz, nxt, sav, &ifamily)) { ifaddr_t ifa; struct sockaddr_storage addr; + struct sockaddr_in *ipaddr; /* * strip off all the headers that precedes AH. @@ -485,21 +486,17 @@ ah4_input(struct mbuf *m, int off) goto fail; } - if (ip_doscopedroute) { - struct sockaddr_in *ipaddr; - - bzero(&addr, sizeof(addr)); - ipaddr = (__typeof__(ipaddr))&addr; - ipaddr->sin_family = AF_INET; - ipaddr->sin_len = sizeof(*ipaddr); - ipaddr->sin_addr = ip->ip_dst; - - // update the receiving interface address based on the inner address - ifa = ifa_ifwithaddr((struct sockaddr *)&addr); - if (ifa) { - m->m_pkthdr.rcvif = ifa->ifa_ifp; - IFA_REMREF(ifa); - } + bzero(&addr, sizeof(addr)); + ipaddr = (__typeof__(ipaddr))&addr; + ipaddr->sin_family = AF_INET; + ipaddr->sin_len = sizeof(*ipaddr); + ipaddr->sin_addr = ip->ip_dst; + + // update the receiving interface address based on the inner address + ifa = ifa_ifwithaddr((struct sockaddr *)&addr); + if (ifa) { + m->m_pkthdr.rcvif = ifa->ifa_ifp; + IFA_REMREF(ifa); } // Input via IPSec interface @@ -833,7 +830,7 @@ ah6_input(struct mbuf **mp, int *offp, int proto) if (ipsec6_tunnel_validate(m, off + stripsiz, nxt, sav, &ifamily)) { ifaddr_t ifa; struct sockaddr_storage addr; - + struct sockaddr_in6 *ip6addr; /* * strip off all the headers that precedes AH. 
* IP6 xx AH IP6' payload -> IP6' payload @@ -894,21 +891,17 @@ ah6_input(struct mbuf **mp, int *offp, int proto) goto fail; } - if (ip6_doscopedroute) { - struct sockaddr_in6 *ip6addr; - - bzero(&addr, sizeof(addr)); - ip6addr = (__typeof__(ip6addr))&addr; - ip6addr->sin6_family = AF_INET6; - ip6addr->sin6_len = sizeof(*ip6addr); - ip6addr->sin6_addr = ip6->ip6_dst; - - // update the receiving interface address based on the inner address - ifa = ifa_ifwithaddr((struct sockaddr *)&addr); - if (ifa) { - m->m_pkthdr.rcvif = ifa->ifa_ifp; - IFA_REMREF(ifa); - } + bzero(&addr, sizeof(addr)); + ip6addr = (__typeof__(ip6addr))&addr; + ip6addr->sin6_family = AF_INET6; + ip6addr->sin6_len = sizeof(*ip6addr); + ip6addr->sin6_addr = ip6->ip6_dst; + + // update the receiving interface address based on the inner address + ifa = ifa_ifwithaddr((struct sockaddr *)&addr); + if (ifa) { + m->m_pkthdr.rcvif = ifa->ifa_ifp; + IFA_REMREF(ifa); } // Input via IPSec interface @@ -995,10 +988,7 @@ ah6_input(struct mbuf **mp, int *offp, int proto) } void -ah6_ctlinput(cmd, sa, d) - int cmd; - struct sockaddr *sa; - void *d; +ah6_ctlinput(int cmd, struct sockaddr *sa, void *d) { const struct newah *ahp; struct newah ah; diff --git a/bsd/netinet6/ah_output.c b/bsd/netinet6/ah_output.c index 18391b187..1e723fa3f 100644 --- a/bsd/netinet6/ah_output.c +++ b/bsd/netinet6/ah_output.c @@ -117,8 +117,7 @@ extern lck_mtx_t *sadb_mutex; * virtual interface, and control MTU/MSS by the interface MTU. */ size_t -ah_hdrsiz(isr) - struct ipsecrequest *isr; +ah_hdrsiz(struct ipsecrequest *isr) { /* sanity check */ @@ -184,9 +183,7 @@ ah_hdrsiz(isr) * the function does not modify m. 
*/ int -ah4_output(m, sav) - struct mbuf *m; - struct secasvar *sav; +ah4_output(struct mbuf *m, struct secasvar *sav) { const struct ah_algorithm *algo; u_int32_t spi; @@ -371,8 +368,7 @@ ah4_output(m, sav) /* Calculate AH length */ int -ah_hdrlen(sav) - struct secasvar *sav; +ah_hdrlen(struct secasvar *sav) { const struct ah_algorithm *algo; int plen, ahlen; @@ -398,11 +394,8 @@ ah_hdrlen(sav) * Fill in the Authentication Header and calculate checksum. */ int -ah6_output(m, nexthdrp, md, sav) - struct mbuf *m; - u_char *nexthdrp; - struct mbuf *md; - struct secasvar *sav; +ah6_output(struct mbuf *m, u_char *nexthdrp, struct mbuf *md, + struct secasvar *sav) { struct mbuf *mprev; struct mbuf *mah; @@ -553,8 +546,7 @@ ah6_output(m, nexthdrp, md, sav) * The mbuf must be pulled up toward, at least, ip option part. */ static struct in_addr * -ah4_finaldst(m) - struct mbuf *m; +ah4_finaldst(struct mbuf *m) { struct ip *ip; int optlen; diff --git a/bsd/netinet6/esp_core.c b/bsd/netinet6/esp_core.c index 90c01fe19..bc9a75ed6 100644 --- a/bsd/netinet6/esp_core.c +++ b/bsd/netinet6/esp_core.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2008 Apple Inc. All rights reserved. + * Copyright (c) 2008-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ @@ -193,8 +193,7 @@ static const struct esp_algorithm *esp_algorithms[] = { }; const struct esp_algorithm * -esp_algorithm_lookup(idx) - int idx; +esp_algorithm_lookup(int idx) { switch (idx) { case SADB_EALG_DESCBC: @@ -213,7 +212,7 @@ esp_algorithm_lookup(idx) } int -esp_max_ivlen() +esp_max_ivlen(void) { int idx; int ivlen; @@ -229,9 +228,7 @@ esp_max_ivlen() } int -esp_schedule(algo, sav) - const struct esp_algorithm *algo; - struct secasvar *sav; +esp_schedule(const struct esp_algorithm *algo, struct secasvar *sav) { int error; @@ -319,8 +316,7 @@ esp_null_encrypt( } static int -esp_descbc_mature(sav) - struct secasvar *sav; +esp_descbc_mature(struct secasvar *sav) { const struct esp_algorithm *algo; @@ -425,8 +421,7 @@ esp_des_blockencrypt( } static int -esp_cbc_mature(sav) - struct secasvar *sav; +esp_cbc_mature(struct secasvar *sav) { int keylen; const struct esp_algorithm *algo; @@ -488,8 +483,7 @@ esp_cbc_mature(sav) } static int -esp_gcm_mature(sav) - struct secasvar *sav; +esp_gcm_mature(struct secasvar *sav) { int keylen; const struct esp_algorithm *algo; @@ -605,12 +599,8 @@ esp_common_ivlen( } static int -esp_cbc_decrypt(m, off, sav, algo, ivlen) - struct mbuf *m; - size_t off; - struct secasvar *sav; - const struct esp_algorithm *algo; - int ivlen; +esp_cbc_decrypt(struct mbuf *m, size_t off, struct secasvar *sav, + const struct esp_algorithm *algo, int ivlen) { struct mbuf *s; struct mbuf *d, *d0, *dp; @@ -1065,12 +1055,12 @@ esp_cbc_encrypt( /* does not free m0 on error */ int 
-esp_auth(m0, skip, length, sav, sum) - struct mbuf *m0; - size_t skip; /* offset to ESP header */ - size_t length; /* payload length */ - struct secasvar *sav; - u_char *sum; +esp_auth( + struct mbuf *m0, + size_t skip, /* offset to ESP header */ + size_t length, /* payload length */ + struct secasvar *sav, + u_char *sum) { struct mbuf *m; size_t off; diff --git a/bsd/netinet6/esp_input.c b/bsd/netinet6/esp_input.c index e51ea9b0f..f2f8af9be 100644 --- a/bsd/netinet6/esp_input.c +++ b/bsd/netinet6/esp_input.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2008-2013 Apple Inc. All rights reserved. + * Copyright (c) 2008-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. 
- * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ @@ -174,9 +174,7 @@ esp6_input_strip_udp_encap (struct mbuf *m, int ip6hlen) } void -esp4_input(m, off) - struct mbuf *m; - int off; +esp4_input(struct mbuf *m, int off) { struct ip *ip; #if INET6 @@ -498,17 +496,6 @@ esp4_input(m, off) esp = (struct esp *)(void *)(((u_int8_t *)ip) + off); } - if (sav->utun_is_keepalive_fn) { - if (sav->utun_is_keepalive_fn(sav->utun_pcb, &m, nxt, sav->flags, (off + esplen + ivlen))) { - if (m) { - // not really bad, we just wanna exit - IPSEC_STAT_INCREMENT(ipsecstat.in_success); - m = NULL; - } - goto bad; - } - } - /* was it transmitted over the IPsec tunnel SA? */ if (ipsec4_tunnel_validate(m, off + esplen + ivlen, nxt, sav, &ifamily)) { ifaddr_t ifa; @@ -562,13 +549,11 @@ esp4_input(m, off) goto bad; } - if (ip_doscopedroute) { - bzero(&addr, sizeof(addr)); - ipaddr = (__typeof__(ipaddr))&addr; - ipaddr->sin_family = AF_INET; - ipaddr->sin_len = sizeof(*ipaddr); - ipaddr->sin_addr = ip->ip_dst; - } + bzero(&addr, sizeof(addr)); + ipaddr = (__typeof__(ipaddr))&addr; + ipaddr->sin_family = AF_INET; + ipaddr->sin_len = sizeof(*ipaddr); + ipaddr->sin_addr = ip->ip_dst; #if INET6 } else if (ifamily == AF_INET6) { struct sockaddr_in6 *ip6addr; @@ -608,13 +593,11 @@ esp4_input(m, off) goto bad; } - if (ip6_doscopedroute) { - bzero(&addr, sizeof(addr)); - ip6addr = (__typeof__(ip6addr))&addr; - ip6addr->sin6_family = AF_INET6; - ip6addr->sin6_len = sizeof(*ip6addr); - ip6addr->sin6_addr = ip6->ip6_dst; - } + bzero(&addr, sizeof(addr)); + ip6addr = (__typeof__(ip6addr))&addr; + ip6addr->sin6_family = AF_INET6; + ip6addr->sin6_len = sizeof(*ip6addr); + ip6addr->sin6_addr = ip6->ip6_dst; #endif /* INET6 */ } else { ipseclog((LOG_ERR, "ipsec tunnel unsupported address family " @@ -629,13 +612,11 @@ esp4_input(m, off) goto bad; } - if (ip_doscopedroute || ip6_doscopedroute) { - // update the receiving interface address based on the inner address - ifa = ifa_ifwithaddr((struct sockaddr 
*)&addr); - if (ifa) { - m->m_pkthdr.rcvif = ifa->ifa_ifp; - IFA_REMREF(ifa); - } + // update the receiving interface address based on the inner address + ifa = ifa_ifwithaddr((struct sockaddr *)&addr); + if (ifa) { + m->m_pkthdr.rcvif = ifa->ifa_ifp; + IFA_REMREF(ifa); } /* Clear the csum flags, they can't be valid for the inner headers */ @@ -651,14 +632,6 @@ esp4_input(m, off) } } - if (sav->utun_in_fn) { - if (!(sav->utun_in_fn(sav->utun_pcb, &m, ifamily == AF_INET ? PF_INET : PF_INET6))) { - m = NULL; - // we just wanna exit since packet has been completely processed - goto bad; - } - } - if (proto_input(ifamily == AF_INET ? PF_INET : PF_INET6, m) != 0) goto bad; @@ -765,14 +738,6 @@ esp4_input(m, off) } } - if (sav->utun_in_fn) { - if (!(sav->utun_in_fn(sav->utun_pcb, &m, PF_INET))) { - m = NULL; - // we just wanna exit since packet has been completely processed - goto bad; - } - } - ip_proto_dispatch_in(m, off, nxt, 0); } else m_freem(m); @@ -1092,17 +1057,6 @@ esp6_input(struct mbuf **mp, int *offp, int proto) ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_plen = htons(ntohs(ip6->ip6_plen) - taillen); - if (sav->utun_is_keepalive_fn) { - if (sav->utun_is_keepalive_fn(sav->utun_pcb, &m, nxt, sav->flags, (off + esplen + ivlen))) { - if (m) { - // not really bad, we just wanna exit - IPSEC_STAT_INCREMENT(ipsec6stat.in_success); - m = NULL; - } - goto bad; - } - } - if (*nproto == IPPROTO_UDP) { // offset includes the outer ip and udp header lengths. 
if (m->m_len < off) { @@ -1149,6 +1103,8 @@ esp6_input(struct mbuf **mp, int *offp, int proto) flowinfo = ip6->ip6_flow; m_adj(m, off + esplen + ivlen); if (ifamily == AF_INET6) { + struct sockaddr_in6 *ip6addr; + if (m->m_len < sizeof(*ip6)) { #ifndef PULLDOWN_TEST /* @@ -1180,15 +1136,11 @@ esp6_input(struct mbuf **mp, int *offp, int proto) goto bad; } - if (ip6_doscopedroute) { - struct sockaddr_in6 *ip6addr; - - bzero(&addr, sizeof(addr)); - ip6addr = (__typeof__(ip6addr))&addr; - ip6addr->sin6_family = AF_INET6; - ip6addr->sin6_len = sizeof(*ip6addr); - ip6addr->sin6_addr = ip6->ip6_dst; - } + bzero(&addr, sizeof(addr)); + ip6addr = (__typeof__(ip6addr))&addr; + ip6addr->sin6_family = AF_INET6; + ip6addr->sin6_len = sizeof(*ip6addr); + ip6addr->sin6_addr = ip6->ip6_dst; } else if (ifamily == AF_INET) { struct sockaddr_in *ipaddr; @@ -1228,13 +1180,11 @@ esp6_input(struct mbuf **mp, int *offp, int proto) goto bad; } - if (ip_doscopedroute) { - bzero(&addr, sizeof(addr)); - ipaddr = (__typeof__(ipaddr))&addr; - ipaddr->sin_family = AF_INET; - ipaddr->sin_len = sizeof(*ipaddr); - ipaddr->sin_addr = ip->ip_dst; - } + bzero(&addr, sizeof(addr)); + ipaddr = (__typeof__(ipaddr))&addr; + ipaddr->sin_family = AF_INET; + ipaddr->sin_len = sizeof(*ipaddr); + ipaddr->sin_addr = ip->ip_dst; } key_sa_recordxfer(sav, m); @@ -1244,13 +1194,11 @@ esp6_input(struct mbuf **mp, int *offp, int proto) goto bad; } - if (ip_doscopedroute || ip6_doscopedroute) { - // update the receiving interface address based on the inner address - ifa = ifa_ifwithaddr((struct sockaddr *)&addr); - if (ifa) { - m->m_pkthdr.rcvif = ifa->ifa_ifp; - IFA_REMREF(ifa); - } + // update the receiving interface address based on the inner address + ifa = ifa_ifwithaddr((struct sockaddr *)&addr); + if (ifa) { + m->m_pkthdr.rcvif = ifa->ifa_ifp; + IFA_REMREF(ifa); } // Input via IPSec interface @@ -1264,14 +1212,6 @@ esp6_input(struct mbuf **mp, int *offp, int proto) } } - if (sav->utun_in_fn) { - if 
(!(sav->utun_in_fn(sav->utun_pcb, &m, PF_INET6))) { - m = NULL; - // we just wanna exit since packet has been completely processed - goto bad; - } - } - if (proto_input(ifamily == AF_INET ? PF_INET : PF_INET6, m) != 0) goto bad; nxt = IPPROTO_DONE; @@ -1393,13 +1333,6 @@ esp6_input(struct mbuf **mp, int *offp, int proto) } } - if (sav->utun_in_fn) { - if (!(sav->utun_in_fn(sav->utun_pcb, &m, PF_INET6))) { - m = NULL; - // we just wanna exit since packet has been completely processed - goto bad; - } - } } done: @@ -1427,10 +1360,7 @@ esp6_input(struct mbuf **mp, int *offp, int proto) } void -esp6_ctlinput(cmd, sa, d) - int cmd; - struct sockaddr *sa; - void *d; +esp6_ctlinput(int cmd, struct sockaddr *sa, void *d) { const struct newesp *espp; struct newesp esp; diff --git a/bsd/netinet6/esp_output.c b/bsd/netinet6/esp_output.c index e17336346..3b8b817d4 100644 --- a/bsd/netinet6/esp_output.c +++ b/bsd/netinet6/esp_output.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2008-2011 Apple Inc. All rights reserved. + * Copyright (c) 2008-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ @@ -119,7 +119,7 @@ static int esp_output(struct mbuf *, u_char *, struct mbuf *, int, struct secasvar *sav); extern int esp_udp_encap_port; -extern u_int32_t natt_now; +extern u_int64_t natt_now; extern lck_mtx_t *sadb_mutex; @@ -230,12 +230,12 @@ esp_hdrsiz(__unused struct ipsecrequest *isr) * <-----------------> espoff */ static int -esp_output(m, nexthdrp, md, af, sav) - struct mbuf *m; - u_char *nexthdrp; - struct mbuf *md; - int af; - struct secasvar *sav; +esp_output( + struct mbuf *m, + u_char *nexthdrp, + struct mbuf *md, + int af, + struct secasvar *sav) { struct mbuf *n; struct mbuf *mprev; @@ -858,9 +858,9 @@ esp_output(m, nexthdrp, md, af, sav) #if INET int -esp4_output(m, sav) - struct mbuf *m; - struct secasvar *sav; +esp4_output( + struct mbuf *m, + struct secasvar *sav) { struct ip *ip; if (m->m_len < sizeof(struct ip)) { @@ -876,11 +876,11 @@ esp4_output(m, sav) #if INET6 int -esp6_output(m, nexthdrp, md, sav) - struct mbuf *m; - u_char *nexthdrp; - struct mbuf *md; - struct secasvar *sav; +esp6_output( + struct mbuf *m, + u_char *nexthdrp, + struct mbuf *md, + struct secasvar *sav) { if (m->m_len < sizeof(struct ip6_hdr)) { ipseclog((LOG_DEBUG, "esp6_output: first mbuf too short\n")); diff --git a/bsd/netinet6/esp_rijndael.c b/bsd/netinet6/esp_rijndael.c index d05f8bf7e..cdd86bff3 100644 --- a/bsd/netinet6/esp_rijndael.c +++ b/bsd/netinet6/esp_rijndael.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2008 Apple Inc. All rights reserved. + * Copyright (c) 2008-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. 
- * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ @@ -148,12 +148,12 @@ esp_aes_schedule( int -esp_cbc_decrypt_aes(m, off, sav, algo, ivlen) - struct mbuf *m; - size_t off; - struct secasvar *sav; - const struct esp_algorithm *algo; - int ivlen; +esp_cbc_decrypt_aes( + struct mbuf *m, + size_t off, + struct secasvar *sav, + const struct esp_algorithm *algo, + int ivlen) { struct mbuf *s; struct mbuf *d, *d0, *dp; @@ -791,12 +791,12 @@ esp_gcm_encrypt_aes( } int -esp_gcm_decrypt_aes(m, off, sav, algo, ivlen) - struct mbuf *m; - size_t off; - struct secasvar *sav; - const struct esp_algorithm *algo __unused; - int ivlen; +esp_gcm_decrypt_aes( + struct mbuf *m, + size_t off, + struct secasvar *sav, + const struct esp_algorithm *algo __unused, + int ivlen) { struct mbuf *s; struct mbuf *d, *d0, *dp; diff --git a/bsd/netinet6/frag6.c b/bsd/netinet6/frag6.c index 1a0718415..8e5f416ea 100644 --- a/bsd/netinet6/frag6.c +++ b/bsd/netinet6/frag6.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2013 Apple Inc. All rights reserved. + * Copyright (c) 2000-2015 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -356,6 +356,32 @@ frag6_input(struct mbuf **mp, int *offp, int proto) /* offset now points to data portion */ offset += sizeof(struct ip6_frag); + /* + * RFC 6946: Handle "atomic" fragments (offset and m bit set to 0) + * upfront, unrelated to any reassembly. Just skip the fragment header. + */ + if ((ip6f->ip6f_offlg & ~IP6F_RESERVED_MASK) == 0) { + /* + * In ICMPv6 processing, we drop certain + * NDP messages that are not expected to + * have fragment header based on recommendations + * against security vulnerability as described in + * RFC 6980. + * We set PKTF_REASSEMBLED flag to let ICMPv6 NDP + * drop such packets. + * However there are already devices running software + * that are creating interface with MTU < IPv6 Min + * MTU. We should not have allowed that but they are + * out, and sending atomic NDP fragments. 
+ * For that reason, we do not set the same flag here + * and relax the check. + */ + ip6stat.ip6s_atmfrag_rcvd++; + in6_ifstat_inc(dstifp, ifs6_atmfrag_rcvd); + *offp = offset; + return (ip6f->ip6f_nxt); + } + /* * Leverage partial checksum offload for simple UDP/IP fragments, * as that is the most common case. @@ -781,9 +807,18 @@ frag6_input(struct mbuf **mp, int *offp, int proto) frag6_nfrags -= q6->ip6q_nfrag; ip6q_free(q6); - if (m->m_flags & M_PKTHDR) /* Isn't it always true? */ + if (m->m_flags & M_PKTHDR) { /* Isn't it always true? */ m_fixhdr(m); - + /* + * Mark packet as reassembled + * In ICMPv6 processing, we drop certain + * NDP messages that are not expected to + * have fragment header based on recommendations + * against security vulnerability as described in + * RFC 6980. + */ + m->m_pkthdr.pkt_flags |= PKTF_REASSEMBLED; + } ip6stat.ip6s_reassembled++; /* diff --git a/bsd/netinet6/icmp6.c b/bsd/netinet6/icmp6.c index fb8d179bf..cc5f1af53 100644 --- a/bsd/netinet6/icmp6.c +++ b/bsd/netinet6/icmp6.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2000-2014 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ @@ -190,9 +190,7 @@ icmp6_init(struct ip6protosw *pp, struct domain *dp) } static void -icmp6_errcount(stat, type, code) - struct icmp6errstat *stat; - int type, code; +icmp6_errcount(struct icmp6errstat *stat, int type, int code) { switch (type) { case ICMP6_DST_UNREACH: @@ -486,6 +484,25 @@ icmp6_input(struct mbuf **mp, int *offp, int proto) #endif code = icmp6->icmp6_code; + /* + * Early check for RFC 6980 + * Drop certain NDP packets if they came in fragmented + */ + switch (icmp6->icmp6_type) { + case ND_ROUTER_SOLICIT: + case ND_ROUTER_ADVERT: + case ND_NEIGHBOR_SOLICIT: + case ND_NEIGHBOR_ADVERT: + case ND_REDIRECT: + if (m->m_pkthdr.pkt_flags & PKTF_REASSEMBLED) { + icmp6stat.icp6s_rfc6980_drop++; + goto freeit; + } + break; + default: + break; + } + /* Apply rate limit before checksum validation. */ if (icmp6_ratelimit(&ip6->ip6_dst, icmp6->icmp6_type, code)) { icmp6stat.icp6s_toofreq++; @@ -577,7 +594,6 @@ icmp6_input(struct mbuf **mp, int *offp, int proto) goto badcode; } goto deliver; - break; case ICMP6_PACKET_TOO_BIG: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_pkttoobig); @@ -589,7 +605,6 @@ icmp6_input(struct mbuf **mp, int *offp, int proto) * intermediate extension headers. 
*/ goto deliver; - break; case ICMP6_TIME_EXCEEDED: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_timeexceed); @@ -604,7 +619,6 @@ icmp6_input(struct mbuf **mp, int *offp, int proto) goto badcode; } goto deliver; - break; case ICMP6_PARAM_PROB: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_paramprob); @@ -620,7 +634,6 @@ icmp6_input(struct mbuf **mp, int *offp, int proto) goto badcode; } goto deliver; - break; case ICMP6_ECHO_REQUEST: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_echo); @@ -630,7 +643,6 @@ icmp6_input(struct mbuf **mp, int *offp, int proto) if ((n = m_copy(m, 0, M_COPYALL)) == NULL) { /* Give up remote */ goto rate_limit_checked; - break; } if ((n->m_flags & M_EXT) != 0 || n->m_len < off + sizeof(struct icmp6_hdr)) { @@ -645,7 +657,6 @@ icmp6_input(struct mbuf **mp, int *offp, int proto) /* Give up remote */ m_freem(n0); goto rate_limit_checked; - break; } MGETHDR(n, M_DONTWAIT, n0->m_type); /* MAC-OK */ if (n && maxlen >= MHLEN) { @@ -659,7 +670,6 @@ icmp6_input(struct mbuf **mp, int *offp, int proto) /* Give up remote */ m_freem(n0); goto rate_limit_checked; - break; } M_COPY_PKTHDR(n, n0); /* @@ -698,7 +708,6 @@ icmp6_input(struct mbuf **mp, int *offp, int proto) icmp6_reflect(n, noff); } goto rate_limit_checked; - break; case ICMP6_ECHO_REPLY: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_echoreply); @@ -726,7 +735,6 @@ icmp6_input(struct mbuf **mp, int *offp, int proto) m_freem(n); /* m stays. */ goto rate_limit_checked; - break; case MLD_LISTENER_DONE: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_mlddone); @@ -766,7 +774,6 @@ icmp6_input(struct mbuf **mp, int *offp, int proto) icmp6_reflect(n, noff); } goto rate_limit_checked; - break; case ICMP6_WRUREPLY: if (code != 0) @@ -789,7 +796,6 @@ icmp6_input(struct mbuf **mp, int *offp, int proto) nd6_rs_input(n, off, icmp6len); /* m stays. 
*/ goto rate_limit_checked; - break; case ND_ROUTER_ADVERT: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_routeradvert); @@ -807,7 +813,6 @@ icmp6_input(struct mbuf **mp, int *offp, int proto) nd6_ra_input(n, off, icmp6len); /* m stays. */ goto rate_limit_checked; - break; case ND_NEIGHBOR_SOLICIT: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_neighborsolicit); @@ -816,7 +821,8 @@ icmp6_input(struct mbuf **mp, int *offp, int proto) if (icmp6len < sizeof(struct nd_neighbor_solicit)) goto badlen; - if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) { + if (proxy || + ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL)) { /* give up local */ nd6_ns_input(m, off, icmp6len); m = NULL; @@ -825,7 +831,6 @@ icmp6_input(struct mbuf **mp, int *offp, int proto) nd6_ns_input(n, off, icmp6len); /* m stays. */ goto rate_limit_checked; - break; case ND_NEIGHBOR_ADVERT: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_neighboradvert); @@ -843,7 +848,6 @@ icmp6_input(struct mbuf **mp, int *offp, int proto) nd6_na_input(n, off, icmp6len); /* m stays. */ goto rate_limit_checked; - break; case ND_REDIRECT: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_redirect); @@ -861,7 +865,6 @@ icmp6_input(struct mbuf **mp, int *offp, int proto) icmp6_redirect_input(n, off); /* m stays. 
*/ goto rate_limit_checked; - break; case ICMP6_ROUTER_RENUMBERING: if (code != ICMP6_ROUTER_RENUMBERING_COMMAND && @@ -884,7 +887,6 @@ icmp6_input(struct mbuf **mp, int *offp, int proto) } else { /* ICMPv6 informational: MUST not deliver */ goto rate_limit_checked; - break; } deliver: if (icmp6_notify_error(m, off, icmp6len, code)) { @@ -903,11 +905,8 @@ icmp6_input(struct mbuf **mp, int *offp, int proto) } rate_limit_checked: - /* deliver the packet to appropriate sockets (unless proxying) */ - if (!proxy) { - icmp6_rip6_input(&m, *offp); - return IPPROTO_DONE; - } + icmp6_rip6_input(&m, *offp); + return IPPROTO_DONE; freeit: m_freem(m); @@ -915,9 +914,7 @@ icmp6_input(struct mbuf **mp, int *offp, int proto) } static int -icmp6_notify_error(m, off, icmp6len, code) - struct mbuf *m; - int off, icmp6len, code; +icmp6_notify_error(struct mbuf *m, int off, int icmp6len, int code) { struct icmp6_hdr *icmp6; struct ip6_hdr *eip6; @@ -1151,9 +1148,7 @@ icmp6_notify_error(m, off, icmp6len, code) } void -icmp6_mtudisc_update(ip6cp, validated) - struct ip6ctlparam *ip6cp; - int validated; +icmp6_mtudisc_update(struct ip6ctlparam *ip6cp, int validated) { struct in6_addr *dst = ip6cp->ip6c_finaldst; struct icmp6_hdr *icmp6 = ip6cp->ip6c_icmp6; @@ -1190,6 +1185,13 @@ icmp6_mtudisc_update(ip6cp, validated) htons(m->m_pkthdr.rcvif->if_index); } /* sin6.sin6_scope_id = XXX: should be set if DST is a scoped addr */ + /* + * XXX On a side note, for asymmetric data-path + * the lookup on receive interface is probably not + * what we want to do. + * That requires looking at the cached route for the + * protocol control block.
+ */ rt = rtalloc1_scoped((struct sockaddr *)&sin6, 0, RTF_CLONING | RTF_PRCLONING, m->m_pkthdr.rcvif->if_index); if (rt != NULL) { @@ -1219,9 +1221,7 @@ icmp6_mtudisc_update(ip6cp, validated) */ #define hostnamelen strlen(hostname) static struct mbuf * -ni6_input(m, off) - struct mbuf *m; - int off; +ni6_input(struct mbuf *m, int off) { struct icmp6_nodeinfo *ni6, *nni6; struct mbuf *n = NULL; @@ -1560,10 +1560,10 @@ ni6_input(m, off) * treated as truncated name (two \0 at the end). this is a wild guess. */ static struct mbuf * -ni6_nametodns(name, namelen, old) - const char *name; - int namelen; - int old; /* return pascal string if non-zero */ +ni6_nametodns( + const char *name, + int namelen, + int old) /* return pascal string if non-zero */ { struct mbuf *m; char *cp, *ep; @@ -1660,11 +1660,7 @@ ni6_nametodns(name, namelen, old) * XXX upper/lowercase match (see RFC2065) */ static int -ni6_dnsmatch(a, alen, b, blen) - const char *a; - int alen; - const char *b; - int blen; +ni6_dnsmatch(const char *a, int alen, const char *b, int blen) { const char *a0, *b0; int l; @@ -1724,10 +1720,7 @@ ni6_dnsmatch(a, alen, b, blen) * calculate the number of addresses to be returned in the node info reply. */ static int -ni6_addrs(ni6, ifpp, subj) - struct icmp6_nodeinfo *ni6; - struct ifnet **ifpp; - char *subj; +ni6_addrs(struct icmp6_nodeinfo *ni6, struct ifnet **ifpp, char *subj) { struct ifnet *ifp; struct in6_ifaddr *ifa6; @@ -1843,10 +1836,8 @@ ni6_addrs(ni6, ifpp, subj) } static int -ni6_store_addrs(ni6, nni6, ifp0, resid) - struct icmp6_nodeinfo *ni6, *nni6; - struct ifnet *ifp0; - int resid; +ni6_store_addrs(struct icmp6_nodeinfo *ni6, struct icmp6_nodeinfo *nni6, + struct ifnet *ifp0, int resid) { struct ifnet *ifp = ifp0; struct in6_ifaddr *ifa6; @@ -2014,9 +2005,7 @@ ni6_store_addrs(ni6, nni6, ifp0, resid) * XXX almost dup'ed code with rip6_input. 
*/ static int -icmp6_rip6_input(mp, off) - struct mbuf **mp; - int off; +icmp6_rip6_input(struct mbuf **mp, int off) { struct mbuf *m = *mp; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); @@ -2133,9 +2122,7 @@ icmp6_rip6_input(mp, off) * OFF points to the icmp6 header, counted from the top of the mbuf. */ void -icmp6_reflect(m, off) - struct mbuf *m; - size_t off; +icmp6_reflect(struct mbuf *m, size_t off) { struct mbuf *m_ip6hdr = m; struct ip6_hdr *ip6; @@ -2149,7 +2136,8 @@ icmp6_reflect(m, off) struct nd_ifinfo *ndi = NULL; u_int32_t oflow; struct ip6_out_args ip6oa = { IFSCOPE_NONE, { 0 }, - IP6OAF_SELECT_SRCIF | IP6OAF_BOUND_SRCADDR, 0 }; + IP6OAF_SELECT_SRCIF | IP6OAF_BOUND_SRCADDR, 0, + SO_TC_UNSPEC, _NET_SERVICE_TYPE_UNSPEC }; if (!(m->m_pkthdr.pkt_flags & PKTF_LOOP) && m->m_pkthdr.rcvif != NULL) { ip6oa.ip6oa_boundif = m->m_pkthdr.rcvif->if_index; @@ -2345,10 +2333,9 @@ icmp6_reflect(m, off) } static const char * -icmp6_redirect_diag(src6, dst6, tgt6) - struct in6_addr *src6; - struct in6_addr *dst6; - struct in6_addr *tgt6; +icmp6_redirect_diag(struct in6_addr *src6, + struct in6_addr *dst6, + struct in6_addr *tgt6) { static char buf[1024]; snprintf(buf, sizeof(buf), "(src=%s dst=%s tgt=%s)", @@ -2357,9 +2344,7 @@ icmp6_redirect_diag(src6, dst6, tgt6) } void -icmp6_redirect_input(m, off) - struct mbuf *m; - int off; +icmp6_redirect_input(struct mbuf *m, int off) { struct ifnet *ifp = m->m_pkthdr.rcvif; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); @@ -2567,9 +2552,7 @@ icmp6_redirect_input(m, off) } void -icmp6_redirect_output(m0, rt) - struct mbuf *m0; - struct rtentry *rt; +icmp6_redirect_output(struct mbuf *m0, struct rtentry *rt) { struct ifnet *ifp; /* my outgoing interface */ struct in6_addr ifp_ll6; @@ -2583,7 +2566,8 @@ icmp6_redirect_output(m0, rt) struct ifnet *outif = NULL; struct sockaddr_in6 src_sa; struct ip6_out_args ip6oa = { IFSCOPE_NONE, { 0 }, - IP6OAF_SELECT_SRCIF | IP6OAF_BOUND_SRCADDR, 0 }; + IP6OAF_SELECT_SRCIF | 
IP6OAF_BOUND_SRCADDR, 0, + SO_TC_UNSPEC, _NET_SERVICE_TYPE_UNSPEC }; icmp6_errcount(&icmp6stat.icp6s_outerrhist, ND_REDIRECT, 0); @@ -2877,9 +2861,7 @@ noredhdropt:; * ICMPv6 socket option processing. */ int -icmp6_ctloutput(so, sopt) - struct socket *so; - struct sockopt *sopt; +icmp6_ctloutput(struct socket *so, struct sockopt *sopt) { int error = 0; int optlen; diff --git a/bsd/netinet6/in6.c b/bsd/netinet6/in6.c index 40ece5dfc..f5e206f03 100644 --- a/bsd/netinet6/in6.c +++ b/bsd/netinet6/in6.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2015 Apple Inc. All rights reserved. + * Copyright (c) 2003-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -141,6 +141,8 @@ #include +#include + #if PF #include #endif /* PF */ @@ -195,20 +197,17 @@ static void in6_ifaddr_detached(struct ifaddr *); static void in6_ifaddr_free(struct ifaddr *); static void in6_ifaddr_trace(struct ifaddr *, int); #if defined(__LP64__) -static void in6_llstartreq_32_to_64(struct in6_llstartreq_32 *, - struct in6_llstartreq_64 *); +static void in6_cgareq_32_to_64(struct in6_cgareq_32 *, + struct in6_cgareq_64 *); #else -static void in6_llstartreq_64_to_32(struct in6_llstartreq_64 *, - struct in6_llstartreq_32 *); +static void in6_cgareq_64_to_32(struct in6_cgareq_64 *, + struct in6_cgareq_32 *); #endif static struct in6_aliasreq *in6_aliasreq_to_native(void *, int, struct in6_aliasreq *); -static struct in6_llstartreq *in6_llstartreq_to_native(void *, int, - struct in6_llstartreq *); +static struct in6_cgareq *in6_cgareq_to_native(void *, int, + struct in6_cgareq *); static int in6_to_kamescope(struct sockaddr_in6 *, struct ifnet *); - -static void in6_ifaddr_set_dadprogress(struct in6_ifaddr *); - static int in6_getassocids(struct socket *, uint32_t *, user_addr_t); static int in6_getconnids(struct socket *, sae_associd_t, uint32_t *, user_addr_t); @@ -430,9 +429,7 @@ in6_ifremloop(struct ifaddr *ifa) int -in6_mask2len(mask, lim0) - struct in6_addr *mask; - 
u_char *lim0; +in6_mask2len(struct in6_addr *mask, u_char *lim0) { int x = 0, y; u_char *lim = lim0, *p; @@ -468,9 +465,7 @@ in6_mask2len(mask, lim0) } void -in6_len2mask(mask, len) - struct in6_addr *mask; - int len; +in6_len2mask(struct in6_addr *mask, int len) { int i; @@ -513,41 +508,41 @@ in6_aliasreq_32_to_64(struct in6_aliasreq_32 *src, struct in6_aliasreq_64 *dst) #if defined(__LP64__) void -in6_llstartreq_32_to_64(struct in6_llstartreq_32 *src, - struct in6_llstartreq_64 *dst) +in6_cgareq_32_to_64(struct in6_cgareq_32 *src, + struct in6_cgareq_64 *dst) { bzero(dst, sizeof (*dst)); - bcopy(src->llsr_name, dst->llsr_name, sizeof (dst->llsr_name)); - dst->llsr_flags = src->llsr_flags; - bcopy(src->llsr_cgaprep.cga_modifier.octets, - dst->llsr_cgaprep.cga_modifier.octets, - sizeof (dst->llsr_cgaprep.cga_modifier.octets)); - dst->llsr_cgaprep.cga_security_level = - src->llsr_cgaprep.cga_security_level; - dst->llsr_lifetime.ia6t_expire = src->llsr_lifetime.ia6t_expire; - dst->llsr_lifetime.ia6t_preferred = src->llsr_lifetime.ia6t_preferred; - dst->llsr_lifetime.ia6t_vltime = src->llsr_lifetime.ia6t_vltime; - dst->llsr_lifetime.ia6t_pltime = src->llsr_lifetime.ia6t_pltime; + bcopy(src->cgar_name, dst->cgar_name, sizeof (dst->cgar_name)); + dst->cgar_flags = src->cgar_flags; + bcopy(src->cgar_cgaprep.cga_modifier.octets, + dst->cgar_cgaprep.cga_modifier.octets, + sizeof (dst->cgar_cgaprep.cga_modifier.octets)); + dst->cgar_cgaprep.cga_security_level = + src->cgar_cgaprep.cga_security_level; + dst->cgar_lifetime.ia6t_expire = src->cgar_lifetime.ia6t_expire; + dst->cgar_lifetime.ia6t_preferred = src->cgar_lifetime.ia6t_preferred; + dst->cgar_lifetime.ia6t_vltime = src->cgar_lifetime.ia6t_vltime; + dst->cgar_lifetime.ia6t_pltime = src->cgar_lifetime.ia6t_pltime; } #endif #if !defined(__LP64__) void -in6_llstartreq_64_to_32(struct in6_llstartreq_64 *src, - struct in6_llstartreq_32 *dst) +in6_cgareq_64_to_32(struct in6_cgareq_64 *src, + struct in6_cgareq_32 *dst) { 
bzero(dst, sizeof (*dst)); - bcopy(src->llsr_name, dst->llsr_name, sizeof (dst->llsr_name)); - dst->llsr_flags = src->llsr_flags; - bcopy(src->llsr_cgaprep.cga_modifier.octets, - dst->llsr_cgaprep.cga_modifier.octets, - sizeof (dst->llsr_cgaprep.cga_modifier.octets)); - dst->llsr_cgaprep.cga_security_level = - src->llsr_cgaprep.cga_security_level; - dst->llsr_lifetime.ia6t_expire = src->llsr_lifetime.ia6t_expire; - dst->llsr_lifetime.ia6t_preferred = src->llsr_lifetime.ia6t_preferred; - dst->llsr_lifetime.ia6t_vltime = src->llsr_lifetime.ia6t_vltime; - dst->llsr_lifetime.ia6t_pltime = src->llsr_lifetime.ia6t_pltime; + bcopy(src->cgar_name, dst->cgar_name, sizeof (dst->cgar_name)); + dst->cgar_flags = src->cgar_flags; + bcopy(src->cgar_cgaprep.cga_modifier.octets, + dst->cgar_cgaprep.cga_modifier.octets, + sizeof (dst->cgar_cgaprep.cga_modifier.octets)); + dst->cgar_cgaprep.cga_security_level = + src->cgar_cgaprep.cga_security_level; + dst->cgar_lifetime.ia6t_expire = src->cgar_lifetime.ia6t_expire; + dst->cgar_lifetime.ia6t_preferred = src->cgar_lifetime.ia6t_preferred; + dst->cgar_lifetime.ia6t_vltime = src->cgar_lifetime.ia6t_vltime; + dst->cgar_lifetime.ia6t_pltime = src->cgar_lifetime.ia6t_pltime; } #endif @@ -570,19 +565,19 @@ in6_aliasreq_to_native(void *data, int data_is_64, struct in6_aliasreq *dst) return (dst); } -static struct in6_llstartreq * -in6_llstartreq_to_native(void *data, int is64, struct in6_llstartreq *dst) +static struct in6_cgareq * +in6_cgareq_to_native(void *data, int is64, struct in6_cgareq *dst) { #if defined(__LP64__) if (is64) bcopy(data, dst, sizeof (*dst)); else - in6_llstartreq_32_to_64((struct in6_llstartreq_32 *)data, - (struct in6_llstartreq_64 *)dst); + in6_cgareq_32_to_64((struct in6_cgareq_32 *)data, + (struct in6_cgareq_64 *)dst); #else if (is64) - in6_llstartreq_64_to_32((struct in6_llstartreq_64 *)data, - (struct in6_llstartreq_32 *)dst); + in6_cgareq_64_to_32((struct in6_cgareq_64 *)data, + (struct in6_cgareq_32 *)dst); 
else bcopy(data, dst, sizeof (*dst)); #endif /* __LP64__ */ @@ -733,8 +728,7 @@ in6ctl_llstart(struct ifnet *ifp, u_long cmd, caddr_t data) * be done here. They are currently done in in6_ifattach_aux() * for the interfaces that need it. */ - if ((ifp->if_eflags & IFEF_NOAUTOIPV6LL) != 0 && - ifra->ifra_addr.sin6_family == AF_INET6 && + if (ifra->ifra_addr.sin6_family == AF_INET6 && /* Only check ifra_dstaddr if valid */ (ifra->ifra_dstaddr.sin6_len == 0 || ifra->ifra_dstaddr.sin6_family == AF_INET6)) { @@ -812,33 +806,36 @@ in6ctl_llstop(struct ifnet *ifp) return (0); } +/* + * This routine configures secure link local address + */ static __attribute__((noinline)) int in6ctl_cgastart(struct ifnet *ifp, u_long cmd, caddr_t data) { - struct in6_llstartreq llsr; + struct in6_cgareq llcgasr; int is64, error = 0; VERIFY(ifp != NULL); switch (cmd) { - case SIOCLL_CGASTART_32: /* struct in6_llstartreq_32 */ - case SIOCLL_CGASTART_64: /* struct in6_llstartreq_64 */ + case SIOCLL_CGASTART_32: /* struct in6_cgareq_32 */ + case SIOCLL_CGASTART_64: /* struct in6_cgareq_64 */ is64 = (cmd == SIOCLL_CGASTART_64); /* - * Convert user llstartreq to the kernel form, when appropriate. + * Convert user cgareq to the kernel form, when appropriate. * This allows the conversion between different data models * to be centralized, so that it can be passed around to other * routines that are expecting the kernel form. */ - in6_llstartreq_to_native(data, is64, &llsr); + in6_cgareq_to_native(data, is64, &llcgasr); /* * NOTE: All the interface specific DLIL attachements * should be done here. They are currently done in - * in6_ifattach_llstartreq() for the interfaces that + * in6_ifattach_cgareq() for the interfaces that * need it. 
*/ - error = in6_ifattach_llstartreq(ifp, &llsr); + error = in6_ifattach_llcgareq(ifp, &llcgasr); if (error == 0) in6_if_up_dad_start(ifp); break; @@ -1230,6 +1227,8 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, case SIOCSDEFIFACE_IN6_32: /* struct in6_ndifreq_32 */ case SIOCSDEFIFACE_IN6_64: /* struct in6_ndifreq_64 */ case SIOCSIFINFO_FLAGS: /* struct in6_ndireq */ + case SIOCGIFCGAPREP_IN6: /* struct in6_ifreq */ + case SIOCSIFCGAPREP_IN6: /* struct in6_ifreq */ if (!privileged) { error = EPERM; goto done; @@ -1266,8 +1265,8 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, error = EOPNOTSUPP; goto done; - case SIOCLL_CGASTART_32: /* struct in6_llstartreq_32 */ - case SIOCLL_CGASTART_64: /* struct in6_llstartreq_64 */ + case SIOCLL_CGASTART_32: /* struct in6_cgareq_32 */ + case SIOCLL_CGASTART_64: /* struct in6_cgareq_64 */ if (!privileged) error = EPERM; else @@ -1823,6 +1822,15 @@ in6_ifaupdate_aux(struct in6_ifaddr *ia, struct ifnet *ifp, int ifaupflags) if (in6if_do_dad(ifp)) in6_ifaddr_set_dadprogress(ia); + /* + * Do not delay sending neighbor solicitations when using optimistic + * duplicate address detection, c.f. RFC 4429. + */ + if (ia->ia6_flags & IN6_IFF_OPTIMISTIC) + ifaupflags &= ~IN6_IFAUPDATE_DADDELAY; + else + ifaupflags |= IN6_IFAUPDATE_DADDELAY; + /* Join necessary multicast groups */ if ((ifp->if_flags & IFF_MULTICAST) != 0) { @@ -2294,13 +2302,6 @@ in6_update_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra, int ifaupflags, ia->ia6_lifetime.ia6ti_preferred = timenow; } - /* - * Do not delay sending neighbor solicitations when using optimistic - * duplicate address detection, c.f. RFC 4429. 
- */ - if ((ia->ia6_flags & IN6_IFF_OPTIMISTIC) == 0) - ifaupflags |= IN6_IFAUPDATE_DADDELAY; - /* * Update flag or prefix length */ @@ -2441,19 +2442,33 @@ in6_unlink_ifa(struct in6_ifaddr *ia, struct ifnet *ifp) */ ifa = &oia->ia_ifa; IFA_LOCK(ifa); - if (oia->ia6_ndpr == NULL) { - log(LOG_NOTICE, "in6_unlink_ifa: IPv6 address " - "0x%llx has no prefix\n", - (uint64_t)VM_KERNEL_ADDRPERM(oia)); - } else { - struct nd_prefix *pr = oia->ia6_ndpr; - oia->ia6_flags &= ~IN6_IFF_AUTOCONF; - oia->ia6_ndpr = NULL; - NDPR_LOCK(pr); - VERIFY(pr->ndpr_addrcnt != 0); - pr->ndpr_addrcnt--; - NDPR_UNLOCK(pr); - NDPR_REMREF(pr); /* release addr reference */ + /* + * Only log the below message for addresses other than + * link local. + * Only one LLA (auto-configured or statically) is allowed + * on an interface. + * LLA prefix, while added to the prefix list, is not + * reference countedi (as it is the only one). + * The prefix also never expires on its own as LLAs + * have infinite lifetime. + * + * For now quiece down the log message for LLAs. + */ + if (!IN6_IS_ADDR_LINKLOCAL(&oia->ia_addr.sin6_addr)) { + if (oia->ia6_ndpr == NULL) + log(LOG_NOTICE, "in6_unlink_ifa: IPv6 address " + "0x%llx has no prefix\n", + (uint64_t)VM_KERNEL_ADDRPERM(oia)); + else { + struct nd_prefix *pr = oia->ia6_ndpr; + oia->ia6_flags &= ~IN6_IFF_AUTOCONF; + oia->ia6_ndpr = NULL; + NDPR_LOCK(pr); + VERIFY(pr->ndpr_addrcnt != 0); + pr->ndpr_addrcnt--; + NDPR_UNLOCK(pr); + NDPR_REMREF(pr); /* release addr reference */ + } } IFA_UNLOCK(ifa); lck_rw_done(&in6_ifaddr_rwlock); @@ -2587,9 +2602,7 @@ in6_purgeaddrs(struct ifnet *ifp) * Find an IPv6 interface link-local address specific to an interface. 
*/ struct in6_ifaddr * -in6ifa_ifpforlinklocal(ifp, ignoreflags) - struct ifnet *ifp; - int ignoreflags; +in6ifa_ifpforlinklocal(struct ifnet *ifp, int ignoreflags) { struct ifaddr *ifa; @@ -2622,9 +2635,7 @@ in6ifa_ifpforlinklocal(ifp, ignoreflags) * find the internet address corresponding to a given interface and address. */ struct in6_ifaddr * -in6ifa_ifpwithaddr(ifp, addr) - struct ifnet *ifp; - struct in6_addr *addr; +in6ifa_ifpwithaddr(struct ifnet *ifp, struct in6_addr *addr) { struct ifaddr *ifa; @@ -2850,36 +2861,12 @@ in6_localaddr(struct in6_addr *in6) return (0); } -int -in6_is_addr_deprecated(struct sockaddr_in6 *sa6) -{ - struct in6_ifaddr *ia; - - lck_rw_lock_shared(&in6_ifaddr_rwlock); - for (ia = in6_ifaddrs; ia; ia = ia->ia_next) { - IFA_LOCK_SPIN(&ia->ia_ifa); - if (IN6_ARE_ADDR_EQUAL(&ia->ia_addr.sin6_addr, - &sa6->sin6_addr) && - (ia->ia6_flags & IN6_IFF_DEPRECATED) != 0) { - IFA_UNLOCK(&ia->ia_ifa); - lck_rw_done(&in6_ifaddr_rwlock); - return (1); /* true */ - } - /* XXX: do we still have to go thru the rest of the list? */ - IFA_UNLOCK(&ia->ia_ifa); - } - - lck_rw_done(&in6_ifaddr_rwlock); - return (0); /* false */ -} - /* * return length of part which dst and src are equal * hard coding... 
*/ int -in6_matchlen(src, dst) -struct in6_addr *src, *dst; +in6_matchlen(struct in6_addr *src, struct in6_addr *dst) { int match = 0; u_char *s = (u_char *)src, *d = (u_char *)dst; @@ -2899,9 +2886,7 @@ struct in6_addr *src, *dst; /* XXX: to be scope conscious */ int -in6_are_prefix_equal(p1, p2, len) - struct in6_addr *p1, *p2; - int len; +in6_are_prefix_equal(struct in6_addr *p1, struct in6_addr *p2, int len) { int bytelen, bitlen; @@ -2925,9 +2910,7 @@ in6_are_prefix_equal(p1, p2, len) } void -in6_prefixlen2mask(maskp, len) - struct in6_addr *maskp; - int len; +in6_prefixlen2mask(struct in6_addr *maskp, int len) { u_char maskarray[8] = {0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff}; int bytelen, bitlen, i; @@ -3397,6 +3380,12 @@ static void in6_if_up_dad_start(struct ifnet *ifp) { struct ifaddr *ifa; + struct nd_ifinfo *ndi = NULL; + + ndi = ND_IFINFO(ifp); + VERIFY((NULL != ndi) && (TRUE == ndi->initialized)); + if (!(ndi->flags & ND6_IFF_DAD)) + return; /* start DAD on all the interface addresses */ ifnet_lock_exclusive(ifp); @@ -3424,9 +3413,16 @@ int in6if_do_dad( struct ifnet *ifp) { + struct nd_ifinfo *ndi = NULL; + if ((ifp->if_flags & IFF_LOOPBACK) != 0) return (0); + ndi = ND_IFINFO(ifp); + VERIFY((NULL != ndi) && (TRUE == ndi->initialized)); + if (!(ndi->flags & ND6_IFF_DAD)) + return (0); + /* * If we are using the alternative neighbor discovery * interface on this interface, then skip DAD. 
@@ -3684,7 +3680,7 @@ in6_post_msg(struct ifnet *ifp, u_int32_t event_code, struct in6_ifaddr *ifa, ev_msg.dv[0].data_length = sizeof (in6_event_data); ev_msg.dv[1].data_length = 0; - kev_post_msg(&ev_msg); + dlil_post_complete_msg(NULL, &ev_msg); } /* @@ -3841,87 +3837,6 @@ in6_ifaddr_trace(struct ifaddr *ifa, int refhold) ctrace_record(&tr[idx]); } -static void -in6_ifaddr_set_dadprogress(struct in6_ifaddr *ia) -{ - struct ifnet* ifp = ia->ia_ifp; - uint32_t flags = IN6_IFF_TENTATIVE; - uint32_t optdad = nd6_optimistic_dad; - - if (optdad) { - if ((ifp->if_eflags & IFEF_IPV6_ROUTER) != 0) { - optdad = 0; - } else { - struct nd_ifinfo *ndi = NULL; - - ndi = ND_IFINFO(ifp); - VERIFY (ndi != NULL && ndi->initialized); - lck_mtx_lock(&ndi->lock); - if ((ndi->flags & ND6_IFF_REPLICATED) != 0) { - optdad = 0; - } - lck_mtx_unlock(&ndi->lock); - } - } - - if (optdad) { - if ((optdad & ND6_OPTIMISTIC_DAD_LINKLOCAL) && - IN6_IS_ADDR_LINKLOCAL(&ia->ia_addr.sin6_addr)) - flags = IN6_IFF_OPTIMISTIC; - else if ((optdad & ND6_OPTIMISTIC_DAD_AUTOCONF) && - (ia->ia6_flags & IN6_IFF_AUTOCONF)) { - if (ia->ia6_flags & IN6_IFF_TEMPORARY) { - if (optdad & ND6_OPTIMISTIC_DAD_TEMPORARY) - flags = IN6_IFF_OPTIMISTIC; - } else if (ia->ia6_flags & IN6_IFF_SECURED) { - if (optdad & ND6_OPTIMISTIC_DAD_SECURED) - flags = IN6_IFF_OPTIMISTIC; - } else { - /* - * Keeping the behavior for temp and CGA - * SLAAC addresses to have a knob for optimistic - * DAD. - * Other than that if ND6_OPTIMISTIC_DAD_AUTOCONF - * is set, we should default to optimistic - * DAD. - * For now this means SLAAC addresses with interface - * identifier derived from modified EUI-64 bit - * identifiers. 
- */ - flags = IN6_IFF_OPTIMISTIC; - } - } else if ((optdad & ND6_OPTIMISTIC_DAD_DYNAMIC) && - (ia->ia6_flags & IN6_IFF_DYNAMIC)) { - if (ia->ia6_flags & IN6_IFF_TEMPORARY) { - if (optdad & ND6_OPTIMISTIC_DAD_TEMPORARY) - flags = IN6_IFF_OPTIMISTIC; - } else { - flags = IN6_IFF_OPTIMISTIC; - } - } else if ((optdad & ND6_OPTIMISTIC_DAD_MANUAL) && - (ia->ia6_flags & IN6_IFF_OPTIMISTIC)) { - /* - * rdar://17483438 - * Bypass tentative for address assignments - * not covered above (e.g. manual) upon request - */ - if (!IN6_IS_ADDR_LINKLOCAL(&ia->ia_addr.sin6_addr) && - !(ia->ia6_flags & IN6_IFF_AUTOCONF) && - !(ia->ia6_flags & IN6_IFF_DYNAMIC)) - flags = IN6_IFF_OPTIMISTIC; - } - } - - ia->ia6_flags &= ~(IN6_IFF_DUPLICATED | IN6_IFF_DADPROGRESS); - ia->ia6_flags |= flags; - - nd6log2((LOG_DEBUG, "%s - %s ifp %s ia6_flags 0x%x\n", - __func__, - ip6_sprintf(&ia->ia_addr.sin6_addr), - if_name(ia->ia_ifp), - ia->ia6_flags)); -} - /* * Handle SIOCGASSOCIDS ioctl for PF_INET6 domain. */ diff --git a/bsd/netinet6/in6.h b/bsd/netinet6/in6.h index bdda50103..b09c026b4 100644 --- a/bsd/netinet6/in6.h +++ b/bsd/netinet6/in6.h @@ -922,6 +922,7 @@ extern int inet6_rth_reverse(const void *, void *); extern int inet6_rth_segments(const void *); extern struct in6_addr *inet6_rth_getaddr(const void *, int); extern void addrsel_policy_init(void); + __END_DECLS #endif /* !KERNEL */ #endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ diff --git a/bsd/netinet6/in6_cga.c b/bsd/netinet6/in6_cga.c index 3e43e73da..5174dfdf0 100644 --- a/bsd/netinet6/in6_cga.c +++ b/bsd/netinet6/in6_cga.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013 Apple Inc. All rights reserved. + * Copyright (c) 2013-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -120,6 +120,30 @@ in6_cga_is_prepare_valid(const struct in6_cga_prepare *prepare, return (TRUE); } +/* + * @brief Generate interface identifier for CGA + * XXX You may notice that following does not really + * mirror what is decribed in: + * https://tools.ietf.org/html/rfc3972#section-4 + * By design kernel here will assume that that + * modifier has been converged on by userspace + * for first part of the algorithm for the given + * security level. + * We are not doing that yet but that's how the code + * below is written. So really we are starting + * from bullet 4 of the algorithm. + * + * @param prepare Pointer to object containing modifier, + * security level & externsion to be used. + * @param pubkey Public key used for IID generation + * @param collisions Collission count on DAD failure + * XXX We are not really re-generating IID on DAD + * failures for now. + * @param in6 Pointer to the address containing + * the prefix. + * + * @return void + */ static void in6_cga_generate_iid(const struct in6_cga_prepare *prepare, const struct iovec *pubkey, u_int8_t collisions, struct in6_addr *in6) @@ -297,7 +321,7 @@ in6_cga_parameters_prepare(void *output, size_t max, } int -in6_cga_generate(const struct in6_cga_prepare *prepare, u_int8_t collisions, +in6_cga_generate(struct in6_cga_prepare *prepare, u_int8_t collisions, struct in6_addr *in6) { int error; @@ -308,6 +332,9 @@ in6_cga_generate(const struct in6_cga_prepare *prepare, u_int8_t collisions, if (prepare == NULL) prepare = &in6_cga.cga_prepare; + else + prepare->cga_security_level = + in6_cga.cga_prepare.cga_security_level; pubkey = &in6_cga.cga_pubkey; diff --git a/bsd/netinet6/in6_gif.c b/bsd/netinet6/in6_gif.c index 54840b69e..89037d5b1 100644 --- a/bsd/netinet6/in6_gif.c +++ b/bsd/netinet6/in6_gif.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009-2013 Apple Inc. All rights reserved. + * Copyright (c) 2009-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * diff --git a/bsd/netinet6/in6_ifattach.c b/bsd/netinet6/in6_ifattach.c index 8b5379f53..804a77d8e 100644 --- a/bsd/netinet6/in6_ifattach.c +++ b/bsd/netinet6/in6_ifattach.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2015 Apple Inc. All rights reserved. + * Copyright (c) 2003-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -787,6 +787,10 @@ in6_ifattach_prelim(struct ifnet *ifp) return (0); } +/* + * This routine is only meant to configure IPv6 Link Local + * addresses. + */ int in6_ifattach_aliasreq(struct ifnet *ifp, struct ifnet *altifp, struct in6_aliasreq *ifra0) @@ -802,7 +806,13 @@ in6_ifattach_aliasreq(struct ifnet *ifp, struct ifnet *altifp, if (!ip6_auto_linklocal) return (0); - /* assign a link-local address, only if there isn't one here already. */ + /* + * Assign a link-local address, only if there isn't one here already. + * XXX If we ever allow more than one LLA on the interface + * make sure that the corresponding prefix on the prefixlist + * is reference counted and the address's prefix pointer + * points to the prefix. 
+ */ ia6 = in6ifa_ifpforlinklocal(ifp, 0); if (ia6 != NULL) { IFA_REMREF(&ia6->ia_ifa); @@ -818,7 +828,7 @@ in6_ifattach_aliasreq(struct ifnet *ifp, struct ifnet *altifp, strlcpy(ifra.ifra_name, if_name(ifp), sizeof (ifra.ifra_name)); /* Initialize the IPv6 interface address in our in6_aliasreq block */ - if ((ifp->if_eflags & IFEF_NOAUTOIPV6LL) != 0 && ifra0 != NULL) { + if (ifra0 != NULL) { /* interface provided both addresses for us */ struct sockaddr_in6 *sin6 = &ifra.ifra_addr; struct in6_addr *in6 = &sin6->sin6_addr; @@ -887,20 +897,20 @@ in6_ifattach_aliasreq(struct ifnet *ifp, struct ifnet *altifp, } int -in6_ifattach_llstartreq(struct ifnet *ifp, struct in6_llstartreq *llsr) +in6_ifattach_llcgareq(struct ifnet *ifp, struct in6_cgareq *llcgasr) { struct in6_aliasreq ifra; struct in6_ifaddr *ia6 = NULL; struct nd_ifinfo *ndi = NULL; int error; - VERIFY(llsr != NULL); + VERIFY(llcgasr != NULL); error = in6_ifattach_prelim(ifp); if (error != 0) return (error); - if (!ip6_auto_linklocal || (ifp->if_eflags & IFEF_NOAUTOIPV6LL) != 0) + if (!ip6_auto_linklocal) return (0); if (nd6_send_opstate == ND6_SEND_OPMODE_DISABLED) @@ -912,7 +922,13 @@ in6_ifattach_llstartreq(struct ifnet *ifp, struct in6_llstartreq *llsr) return (ENXIO); } - /* assign a link-local address, only if there isn't one here already. */ + /* + * Assign a link-local address, only if there isn't one here already. + * XXX If we ever allow more than one LLA on the interface + * make sure that the corresponding prefix on the prefixlist + * is reference counted and the address's prefix pointer + * points to the prefix. 
+ */ ia6 = in6ifa_ifpforlinklocal(ifp, 0); if (ia6 != NULL) { IFA_REMREF(&ia6->ia_ifa); @@ -930,7 +946,7 @@ in6_ifattach_llstartreq(struct ifnet *ifp, struct in6_llstartreq *llsr) ifra.ifra_flags = IN6_IFF_SECURED; in6_cga_node_lock(); - if (in6_cga_generate(&llsr->llsr_cgaprep, 0, + if (in6_cga_generate(&llcgasr->cgar_cgaprep, 0, &ifra.ifra_addr.sin6_addr)) { in6_cga_node_unlock(); return (EADDRNOTAVAIL); @@ -952,7 +968,7 @@ in6_ifattach_llstartreq(struct ifnet *ifp, struct in6_llstartreq *llsr) * identifiers]. */ ifra.ifra_lifetime.ia6t_vltime = ND6_INFINITE_LIFETIME; - ifra.ifra_lifetime.ia6t_pltime = llsr->llsr_lifetime.ia6t_pltime; + ifra.ifra_lifetime.ia6t_pltime = llcgasr->cgar_lifetime.ia6t_pltime; /* Attach the link-local address */ if (in6_ifattach_linklocal(ifp, &ifra) != 0) { diff --git a/bsd/netinet6/in6_ifattach.h b/bsd/netinet6/in6_ifattach.h index 394636c06..24fd6e4ce 100644 --- a/bsd/netinet6/in6_ifattach.h +++ b/bsd/netinet6/in6_ifattach.h @@ -64,7 +64,7 @@ extern int in6_domifattach(struct ifnet *); extern int in6_ifattach_prelim(struct ifnet *); extern int in6_ifattach_aliasreq(struct ifnet *, struct ifnet *, struct in6_aliasreq *); -extern int in6_ifattach_llstartreq(struct ifnet *, struct in6_llstartreq *); +extern int in6_ifattach_llcgareq(struct ifnet *, struct in6_cgareq *); extern void in6_ifdetach(struct ifnet *); extern int in6_iid_from_hw(struct ifnet *, struct in6_addr *); extern void in6_iid_mktmp(struct ifnet *, u_int8_t *, const u_int8_t *, int); diff --git a/bsd/netinet6/in6_mcast.c b/bsd/netinet6/in6_mcast.c index 74dd6496c..4454449b0 100644 --- a/bsd/netinet6/in6_mcast.c +++ b/bsd/netinet6/in6_mcast.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010-2013 Apple Inc. All rights reserved. + * Copyright (c) 2010-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -1461,7 +1461,6 @@ in6p_block_unblock_source(struct inpcb *inp, struct sockopt *sopt) MLD_PRINTF(("%s: unknown sopt_name %d\n", __func__, sopt->sopt_name)); return (EOPNOTSUPP); - break; } if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr)) @@ -2088,7 +2087,6 @@ in6p_join_group(struct inpcb *inp, struct sockopt *sopt) MLD_PRINTF(("%s: unknown sopt_name %d\n", __func__, sopt->sopt_name)); return (EOPNOTSUPP); - break; } if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr)) @@ -2243,9 +2241,21 @@ in6p_join_group(struct inpcb *inp, struct sockopt *sopt) */ if (is_new) { + /* + * See inp_join_group() for why we need to unlock + */ + IM6O_ADDREF_LOCKED(imo); + IM6O_UNLOCK(imo); + socket_unlock(inp->inp_socket, 0); + VERIFY(inm == NULL); error = in6_mc_join(ifp, &gsa->sin6.sin6_addr, imf, &inm, 0); VERIFY(inm != NULL || error != 0); + + socket_lock(inp->inp_socket, 0); + IM6O_REMREF(imo); + IM6O_LOCK(imo); + if (error) goto out_im6o_free; imo->im6o_membership[idx] = inm; /* from in6_mc_join() */ @@ -2416,7 +2426,6 @@ in6p_leave_group(struct inpcb *inp, struct sockopt *sopt) MLD_PRINTF(("%s: unknown sopt_name %d\n", __func__, sopt->sopt_name)); return (EOPNOTSUPP); - break; } if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr)) @@ -2566,7 +2575,20 @@ in6p_leave_group(struct inpcb *inp, struct sockopt *sopt) /* Remove the gap in the membership array. 
*/ VERIFY(inm == imo->im6o_membership[idx]); imo->im6o_membership[idx] = NULL; + + /* + * See inp_join_group() for why we need to unlock + */ + IM6O_ADDREF_LOCKED(imo); + IM6O_UNLOCK(imo); + socket_unlock(inp->inp_socket, 0); + IN6M_REMREF(inm); + + socket_lock(inp->inp_socket, 0); + IM6O_REMREF(imo); + IM6O_LOCK(imo); + for (++idx; idx < imo->im6o_num_memberships; ++idx) { imo->im6o_membership[idx-1] = imo->im6o_membership[idx]; imo->im6o_mfilters[idx-1] = imo->im6o_mfilters[idx]; diff --git a/bsd/netinet6/in6_pcb.c b/bsd/netinet6/in6_pcb.c index 35888ba18..54b9555f6 100644 --- a/bsd/netinet6/in6_pcb.c +++ b/bsd/netinet6/in6_pcb.c @@ -657,19 +657,28 @@ in6_pcbdetach(struct inpcb *inp) } im6o = inp->in6p_moptions; inp->in6p_moptions = NULL; - if (im6o != NULL) - IM6O_REMREF(im6o); imo = inp->inp_moptions; inp->inp_moptions = NULL; - if (imo != NULL) - IMO_REMREF(imo); + sofreelastref(so, 0); inp->inp_state = INPCB_STATE_DEAD; /* makes sure we're not called twice from so_close */ so->so_flags |= SOF_PCBCLEARING; inpcb_gc_sched(inp->inp_pcbinfo, INPCB_TIMER_FAST); + + /* + * See inp_join_group() for why we need to unlock + */ + if (im6o != NULL || imo != NULL) { + socket_unlock(so, 0); + if (im6o != NULL) + IM6O_REMREF(im6o); + if (imo != NULL) + IMO_REMREF(imo); + socket_lock(so, 0); + } } } diff --git a/bsd/netinet6/in6_rmx.c b/bsd/netinet6/in6_rmx.c index cdcbf6252..ff874090c 100644 --- a/bsd/netinet6/in6_rmx.c +++ b/bsd/netinet6/in6_rmx.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2013 Apple Inc. All rights reserved. + * Copyright (c) 2003-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * diff --git a/bsd/netinet6/in6_src.c b/bsd/netinet6/in6_src.c index fda321bb7..28e831f86 100644 --- a/bsd/netinet6/in6_src.c +++ b/bsd/netinet6/in6_src.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2015 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -232,7 +232,8 @@ in6_selectsrc(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, u_int32_t odstzone; int prefer_tempaddr; struct ip6_moptions *mopts; - struct ip6_out_args ip6oa = { ifscope, { 0 }, IP6OAF_SELECT_SRCIF, 0 }; + struct ip6_out_args ip6oa = { ifscope, { 0 }, IP6OAF_SELECT_SRCIF, 0, + SO_TC_UNSPEC, _NET_SERVICE_TYPE_UNSPEC }; boolean_t islocal = FALSE; uint64_t secs = net_uptime(); char s_src[MAX_IPv6_STR_LEN], s_dst[MAX_IPv6_STR_LEN]; @@ -252,9 +253,13 @@ in6_selectsrc(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, ip6oa.ip6oa_flags |= IP6OAF_NO_EXPENSIVE; if (INP_AWDL_UNRESTRICTED(inp)) ip6oa.ip6oa_flags |= IP6OAF_AWDL_UNRESTRICTED; - + if (INP_INTCOPROC_ALLOWED(inp)) + ip6oa.ip6oa_flags |= IP6OAF_INTCOPROC_ALLOWED; } else { mopts = NULL; + /* Allow the kernel to retransmit packets. */ + ip6oa.ip6oa_flags |= IP6OAF_INTCOPROC_ALLOWED | + IP6OAF_AWDL_UNRESTRICTED; } if (ip6oa.ip6oa_boundif != IFSCOPE_NONE) @@ -345,7 +350,7 @@ in6_selectsrc(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, tmp = &in6addr_any; (void) inet_ntop(AF_INET6, tmp, s_src, sizeof (s_src)); - printf("%s out src %s dst %s ifscope %d ifp %s\n", + printf("%s out src %s dst %s ifscope %d ifp %s\n", __func__, s_src, s_dst, ifscope, ifp ? ifp->if_xname : "NULL"); } @@ -637,7 +642,7 @@ in6_selectsrc(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, lck_rw_done(&in6_ifaddr_rwlock); - if (ia_best != NULL && inp && + if (ia_best != NULL && inp && inp_restricted_send(inp, ia_best->ia_ifa.ifa_ifp)) { IFA_REMREF(&ia_best->ia_ifa); ia_best = NULL; @@ -663,8 +668,8 @@ in6_selectsrc(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, tmp = (src_storage != NULL) ? 
src_storage : &in6addr_any; (void) inet_ntop(AF_INET6, tmp, s_src, sizeof (s_src)); - - printf("%s out src %s dst %s ifscope %d dst_scope %d best_scope %d\n", + + printf("%s out src %s dst %s ifscope %d dst_scope %d best_scope %d\n", __func__, s_src, s_dst, ifscope, dst_scope, best_scope); } if (ifpp != NULL) { @@ -751,20 +756,13 @@ selectroute(struct sockaddr_in6 *srcsock, struct sockaddr_in6 *dstsock, * Perform source interface selection only if Scoped Routing * is enabled and a source address that isn't unspecified. */ - select_srcif = (ip6_doscopedroute && srcsock != NULL && + select_srcif = (srcsock != NULL && !IN6_IS_ADDR_UNSPECIFIED(&srcsock->sin6_addr)); if (ip6_select_srcif_debug) { - printf("%s src %s dst %s ifscope %d select_srcif %d\n", + printf("%s src %s dst %s ifscope %d select_srcif %d\n", __func__, s_src, s_dst, ifscope, select_srcif); } - /* - * If Scoped Routing is disabled, ignore the given ifscope. - * Otherwise even if source selection won't be performed, - * we still obey IPV6_BOUND_IF. - */ - if (!ip6_doscopedroute && ifscope != IFSCOPE_NONE) - ifscope = IFSCOPE_NONE; /* If the caller specified the outgoing interface explicitly, use it */ if (opts != NULL && (pi = opts->ip6po_pktinfo) != NULL && @@ -811,7 +809,7 @@ selectroute(struct sockaddr_in6 *srcsock, struct sockaddr_in6 *dstsock, * If the outgoing interface was not set via IPV6_BOUND_IF or * IPV6_PKTINFO, use the scope ID in the destination address. 
*/ - if (ip6_doscopedroute && ifscope == IFSCOPE_NONE) + if (ifscope == IFSCOPE_NONE) ifscope = dstsock->sin6_scope_id; /* @@ -884,13 +882,13 @@ selectroute(struct sockaddr_in6 *srcsock, struct sockaddr_in6 *dstsock, if (ro->ro_rt != NULL) { printf("%s %s->%s ifscope %d->%d ifa_if %s " "ro_if %s\n", - __func__, + __func__, s_src, s_dst, ifscope, scope, if_name(ifa->ifa_ifp), if_name(rt_ifp)); } else { printf("%s %s->%s ifscope %d->%d ifa_if %s\n", - __func__, + __func__, s_src, s_dst, ifscope, scope, if_name(ifa->ifa_ifp)); } @@ -1160,12 +1158,14 @@ selectroute(struct sockaddr_in6 *srcsock, struct sockaddr_in6 *dstsock, IFNET_IS_CELLULAR(_ifp)) || \ (((_ip6oa)->ip6oa_flags & IP6OAF_NO_EXPENSIVE) && \ IFNET_IS_EXPENSIVE(_ifp)) || \ + (!((_ip6oa)->ip6oa_flags & IP6OAF_INTCOPROC_ALLOWED) && \ + IFNET_IS_INTCOPROC(_ifp)) || \ (!((_ip6oa)->ip6oa_flags & IP6OAF_AWDL_UNRESTRICTED) && \ - IFNET_IS_AWDL_RESTRICTED(_ifp))) + IFNET_IS_AWDL_RESTRICTED(_ifp))) if (error == 0 && ip6oa != NULL && ((ifp && CHECK_RESTRICTIONS(ip6oa, ifp)) || - (route && route->ro_rt && + (route && route->ro_rt && CHECK_RESTRICTIONS(ip6oa, route->ro_rt->rt_ifp)))) { if (route != NULL && route->ro_rt != NULL) { ROUTE_RELEASE(route); @@ -1216,7 +1216,7 @@ selectroute(struct sockaddr_in6 *srcsock, struct sockaddr_in6 *dstsock, if (error == 0) { if (retrt != NULL && route != NULL) *retrt = route->ro_rt; /* ro_rt may be NULL */ - } + } if (ip6_select_srcif_debug) { printf("%s %s->%s ifscope %d ifa_if %s ro_if %s (error=%d)\n", __func__, diff --git a/bsd/netinet6/in6_var.h b/bsd/netinet6/in6_var.h index 07cc9e16f..aee032c49 100644 --- a/bsd/netinet6/in6_var.h +++ b/bsd/netinet6/in6_var.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2015 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -93,6 +93,7 @@ #ifndef _NETINET6_IN6_VAR_H_ #define _NETINET6_IN6_VAR_H_ #include +#include #ifdef BSD_KERNEL_PRIVATE #include @@ -171,9 +172,7 @@ struct in6_ifaddr { u_int64_t ia6_createtime; u_int64_t ia6_updatetime; - struct ifprefix *ia6_ifpr; /* back pointer to ifprefix */ - - /* back pointer to the ND prefix (for autoconfigured addresses only) */ + /* back pointer to the ND prefix */ struct nd_prefix *ia6_ndpr; /* multicast addresses joined from the kernel */ @@ -222,6 +221,7 @@ struct in6_ifstat { u_quad_t ifs6_reass_ok; /* # of reassembled packets */ /* NOTE: this is # after reass */ /* NOTE: increment on final dst if */ + u_quad_t ifs6_atmfrag_rcvd; /* # of atomic fragments received */ u_quad_t ifs6_reass_fail; /* # of reass failures */ /* NOTE: may not be packet count */ /* NOTE: increment on final dst if */ @@ -358,6 +358,11 @@ struct in6_cga_nodecfg { struct in6_cga_prepare cga_prepare; }; +/* + * XXX in6_llstartreq will be removed once + * configd adopts the more generically named + * in6_cgareq structure. 
+ */ struct in6_llstartreq { char llsr_name[IFNAMSIZ]; int llsr_flags; @@ -365,7 +370,19 @@ struct in6_llstartreq { struct in6_addrlifetime llsr_lifetime; }; +struct in6_cgareq { + char cgar_name[IFNAMSIZ]; + int cgar_flags; + struct in6_cga_prepare cgar_cgaprep; + struct in6_addrlifetime cgar_lifetime; +}; + #ifdef BSD_KERNEL_PRIVATE +/* + * XXX Corresponding versions of in6_llstartreq + * will be removed after the new in6_cgareq is + * adopted by configd + */ struct in6_llstartreq_32 { char llsr_name[IFNAMSIZ]; int llsr_flags; @@ -379,6 +396,21 @@ struct in6_llstartreq_64 { struct in6_cga_prepare llsr_cgaprep; struct in6_addrlifetime_64 llsr_lifetime; }; + +struct in6_cgareq_32 { + char cgar_name[IFNAMSIZ]; + int cgar_flags; + struct in6_cga_prepare cgar_cgaprep; + struct in6_addrlifetime_32 cgar_lifetime; +}; + +struct in6_cgareq_64 { + char cgar_name[IFNAMSIZ]; + int cgar_flags; + struct in6_cga_prepare cgar_cgaprep; + struct in6_addrlifetime_64 cgar_lifetime; +}; + #endif /* !BSD_KERNEL_PRIVATE */ #endif /* PRIVATE */ @@ -506,18 +538,6 @@ struct kev_in6_data { uint8_t ia_mac[ETHER_ADDR_LEN]; }; -/* - * Define inet6 event subclass and specific inet6 events. 
- */ -#define KEV_INET6_SUBCLASS 6 /* inet6 subclass identifier */ - -#define KEV_INET6_NEW_USER_ADDR 1 /* Userland configured IPv6 address */ -#define KEV_INET6_CHANGED_ADDR 2 /* Address changed event (future) */ -#define KEV_INET6_ADDR_DELETED 3 /* IPv6 address was deleted */ -#define KEV_INET6_NEW_LL_ADDR 4 /* Autoconf LL address appeared */ -#define KEV_INET6_NEW_RTADV_ADDR 5 /* Autoconf address has appeared */ -#define KEV_INET6_DEFROUTER 6 /* Default router detected */ - #ifdef BSD_KERNEL_PRIVATE /* Utility function used inside netinet6 kernel code for generating events */ void in6_post_msg(struct ifnet *, u_int32_t, struct in6_ifaddr *, uint8_t *mac); @@ -693,11 +713,14 @@ void in6_post_msg(struct ifnet *, u_int32_t, struct in6_ifaddr *, uint8_t *mac); /* * start secure link-local interface addresses */ -#define SIOCLL_CGASTART _IOW('i', 160, struct in6_llstartreq) +#define SIOCLL_CGASTART _IOW('i', 160, struct in6_cgareq) #ifdef BSD_KERNEL_PRIVATE -#define SIOCLL_CGASTART_32 _IOW('i', 160, struct in6_llstartreq_32) -#define SIOCLL_CGASTART_64 _IOW('i', 160, struct in6_llstartreq_64) +#define SIOCLL_CGASTART_32 _IOW('i', 160, struct in6_cgareq_32) +#define SIOCLL_CGASTART_64 _IOW('i', 160, struct in6_cgareq_64) #endif +#define SIOCGIFCGAPREP_IN6 _IOWR('i', 187, struct in6_cgareq) +#define SIOCSIFCGAPREP_IN6 _IOWR('i', 188, struct in6_cgareq) + #endif /* PRIVATE */ #ifdef BSD_KERNEL_PRIVATE @@ -734,6 +757,9 @@ void in6_post_msg(struct ifnet *, u_int32_t, struct in6_ifaddr *, uint8_t *mac); /* do not input/output */ #define IN6_IFF_NOTREADY (IN6_IFF_TENTATIVE|IN6_IFF_DUPLICATED) +/* SLAAC/DHCPv6 address */ +#define IN6_IFF_NOTMANUAL (IN6_IFF_AUTOCONF|IN6_IFF_DYNAMIC) + #ifdef KERNEL #define IN6_ARE_SCOPE_CMP(a, b) ((a) - (b)) #define IN6_ARE_SCOPE_EQUAL(a, b) ((a) == (b)) @@ -1093,7 +1119,6 @@ extern int in6_prefix_add_ifid(int iilen, struct in6_ifaddr *ia); extern void in6_prefix_remove_ifid(int iilen, struct in6_ifaddr *ia); extern void 
in6_purgeprefix(struct ifnet *); extern void in6_purgeaddrs(struct ifnet *); -extern int in6_is_addr_deprecated(struct sockaddr_in6 *); extern uint8_t im6s_get_mode(const struct in6_multi *, const struct ip6_msource *, uint8_t); extern void im6f_leave(struct in6_mfilter *); @@ -1125,7 +1150,7 @@ extern int in6_cga_start(const struct in6_cga_nodecfg *); extern int in6_cga_stop(void); extern ssize_t in6_cga_parameters_prepare(void *, size_t, const struct in6_addr *, u_int8_t, const struct in6_cga_modifier *); -extern int in6_cga_generate(const struct in6_cga_prepare *, u_int8_t, +extern int in6_cga_generate(struct in6_cga_prepare *, u_int8_t, struct in6_addr *); #endif /* BSD_KERNEL_PRIVATE */ diff --git a/bsd/netinet6/ip6_forward.c b/bsd/netinet6/ip6_forward.c index ee5a70453..e6beab89a 100644 --- a/bsd/netinet6/ip6_forward.c +++ b/bsd/netinet6/ip6_forward.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2009-2013 Apple Inc. All rights reserved. + * Copyright (c) 2009-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
* Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ @@ -97,10 +97,13 @@ extern int ipsec_bypass; #endif /* IPSEC */ -#include - #include +#if DUMMYNET +#include +#include +#endif /* DUMMYNET */ + #if PF #include #endif /* PF */ @@ -617,29 +620,6 @@ ip6_forward(struct mbuf *m, struct route_in6 *ip6forward_rt, } type = ND_REDIRECT; } - -#if IPFW2 - /* - * Check with the firewall... - */ - if (ip6_fw_enable && ip6_fw_chk_ptr) { - u_short port = 0; - ifp = rt->rt_ifp; - /* Drop the lock but retain the extra ref */ - RT_UNLOCK(rt); - /* If ipfw says divert, we have to just drop packet */ - if (ip6_fw_chk_ptr(&ip6, ifp, &port, &m)) { - m_freem(m); - goto freecopy; - } - if (!m) { - goto freecopy; - } - /* We still have the extra ref on rt */ - RT_LOCK(rt); - } -#endif - /* * Fake scoped addresses. Note that even link-local source or * destinaion can appear, if the originating node just sends the @@ -709,25 +689,47 @@ ip6_forward(struct mbuf *m, struct route_in6 *ip6forward_rt, return (m); } -#if PF - /* Invoke outbound packet filter */ - error = pf_af_hook(ifp, NULL, &m, AF_INET6, FALSE, NULL); + /* Mark this packet as being forwarded from another interface */ + m->m_pkthdr.pkt_flags |= PKTF_FORWARDED; - if (error != 0 || m == NULL) { - if (m != NULL) { - panic("%s: unexpected packet %p\n", __func__, m); - /* NOTREACHED */ +#if PF + if (PF_IS_ENABLED) { +#if DUMMYNET + struct ip_fw_args args; + bzero(&args, sizeof(args)); + + args.fwa_m = m; + args.fwa_oif = ifp; + args.fwa_oflags = 0; + args.fwa_ro6 = ip6forward_rt; + args.fwa_ro6_pmtu = ip6forward_rt; + args.fwa_mtu = rt->rt_ifp->if_mtu; + args.fwa_dst6 = dst; + args.fwa_origifp = origifp; + /* Invoke outbound packet filter */ + error = pf_af_hook(ifp, NULL, &m, AF_INET6, FALSE, &args); +#else /* !DUMMYNET */ + error = pf_af_hook(ifp, NULL, &m, AF_INET6, FALSE, NULL); +#endif /* !DUMMYNET */ + if (error != 0 || m == 
NULL) { + if (m != NULL) { + panic("%s: unexpected packet %p\n", __func__, m); + /* NOTREACHED */ + } + /* Already freed by callee */ + goto senderr; } - /* Already freed by callee */ - goto senderr; + /* + * We do not use ip6 header again in the code below, + * however still adding the bit here so that any new + * code in future doesn't end up working with the + * wrong pointer + */ + ip6 = mtod(m, struct ip6_hdr *); } - ip6 = mtod(m, struct ip6_hdr *); #endif /* PF */ - /* Mark this packet as being forwarded from another interface */ - m->m_pkthdr.pkt_flags |= PKTF_FORWARDED; len = m_pktlen(m); - error = nd6_output(ifp, origifp, m, dst, rt, NULL); if (error) { in6_ifstat_inc(ifp, ifs6_out_discard); diff --git a/bsd/netinet6/ip6_input.c b/bsd/netinet6/ip6_input.c index 12692628e..c8fda8efd 100644 --- a/bsd/netinet6/ip6_input.c +++ b/bsd/netinet6/ip6_input.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2015 Apple Inc. All rights reserved. + * Copyright (c) 2003-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -146,10 +146,6 @@ extern int ipsec_bypass; #endif /* IPSEC */ -#if IPFW2 -#include -#endif /* IPFW2 */ - #if DUMMYNET #include #include @@ -178,13 +174,6 @@ struct in6_ifaddr *in6_ifaddrs = NULL; #define ICMP6_IFSTAT_REQUIRE_ALIGNED_64(f) \ _CASSERT(!(offsetof(struct icmp6_ifstat, f) % sizeof (uint64_t))) -#if IPFW2 -/* firewall hooks */ -ip6_fw_chk_t *ip6_fw_chk_ptr; -ip6_fw_ctl_t *ip6_fw_ctl_ptr; -int ip6_fw_enable = 1; -#endif /* IPFW2 */ - struct ip6stat ip6stat; decl_lck_mtx_data(, proxy6_lock); @@ -217,11 +206,6 @@ extern void stfattach(void); SYSCTL_DECL(_net_inet6_ip6); -int ip6_doscopedroute = 1; -SYSCTL_INT(_net_inet6_ip6, OID_AUTO, scopedroute, - CTLFLAG_RD | CTLFLAG_LOCKED, &ip6_doscopedroute, 0, - "Enable IPv6 scoped routing"); - static uint32_t ip6_adj_clear_hwcksum = 0; SYSCTL_UINT(_net_inet6_ip6, OID_AUTO, adj_clear_hwcksum, CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_adj_clear_hwcksum, 0, @@ -321,9 +305,6 @@ ip6_init(struct ip6protosw *pp, struct domain *dp) return; ip6_initialized = 1; - PE_parse_boot_argn("net.inet6.ip6.scopedroute", &ip6_doscopedroute, - sizeof (ip6_doscopedroute)); - pr = pffindproto_locked(PF_INET6, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) { panic("%s: Unable to find [PF_INET6,IPPROTO_RAW,SOCK_RAW]\n", @@ -702,22 +683,6 @@ ip6_input(struct mbuf *m) goto bad; } #endif -#if IPFW2 - /* - * Check with the firewall... - */ - if (ip6_fw_enable && ip6_fw_chk_ptr) { - u_short port = 0; - /* If ipfw says divert, we have to just drop packet */ - /* use port as a dummy argument */ - if ((*ip6_fw_chk_ptr)(&ip6, NULL, &port, &m)) { - m_freem(m); - m = NULL; - } - if (!m) - goto done; - } -#endif /* IPFW2 */ /* * Naively assume we can attribute inbound data to the route we would @@ -1351,12 +1316,8 @@ ip6_hopopts_input(uint32_t *plenp, uint32_t *rtalertp, struct mbuf **mp, * opthead + hbhlen is located in continuous memory region. 
*/ int -ip6_process_hopopts(m, opthead, hbhlen, rtalertp, plenp) - struct mbuf *m; - u_int8_t *opthead; - int hbhlen; - u_int32_t *rtalertp; - u_int32_t *plenp; +ip6_process_hopopts(struct mbuf *m, u_int8_t *opthead, int hbhlen, + u_int32_t *rtalertp, u_int32_t *plenp) { struct ip6_hdr *ip6; int optlen = 0; @@ -1821,9 +1782,7 @@ ip6_notify_pmtu(struct inpcb *in6p, struct sockaddr_in6 *dst, u_int32_t *mtu) * we develop `neater' mechanism to process extension headers. */ char * -ip6_get_prevhdr(m, off) - struct mbuf *m; - int off; +ip6_get_prevhdr(struct mbuf *m, int off) { struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); @@ -1928,8 +1887,6 @@ ip6_nexthdr(struct mbuf *m, int off, int proto, int *nxtp) default: return (-1); } - - return (-1); } /* diff --git a/bsd/netinet6/ip6_output.c b/bsd/netinet6/ip6_output.c index 0bdaaa7e2..973c9dcfa 100644 --- a/bsd/netinet6/ip6_output.c +++ b/bsd/netinet6/ip6_output.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2015 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -127,6 +127,7 @@ #include #include #include +#include #include #include @@ -150,7 +151,6 @@ extern int ipsec_bypass; #endif /* CONFIG_MACF_NET */ #if DUMMYNET -#include #include #include #endif /* DUMMYNET */ @@ -418,7 +418,7 @@ ip6_output_list(struct mbuf *m0, int packetchain, struct ip6_pktopts *opt, } /* If packet is bound to an interface, check bound policies */ if ((flags & IPV6_OUTARGS) && - (ip6oa->ip6oa_flags & IPOAF_BOUND_IF) && + (ip6oa->ip6oa_flags & IP6OAF_BOUND_IF) && ip6oa->ip6oa_boundif != IFSCOPE_NONE) { /* ip6obf.noipsec is a bitfield, use temp integer */ int noipsec = 0; @@ -431,10 +431,10 @@ ip6_output_list(struct mbuf *m0, int packetchain, struct ip6_pktopts *opt, } } #endif /* IPSEC */ - + ippo = &ipf_pktopts; - if (ip6_doscopedroute && (flags & IPV6_OUTARGS)) { + if (flags & IPV6_OUTARGS) { /* * In the forwarding case, only the ifscope value is used, * as source interface selection doesn't take place. @@ -615,14 +615,13 @@ ip6_output_list(struct mbuf *m0, int packetchain, struct ip6_pktopts *opt, goto freehdrs; } } - break; } default: break; } } #endif /* NECP */ - + #if IPSEC if (ipsec_bypass != 0 || ip6obf.noipsec) goto skip_ipsec; @@ -1319,7 +1318,7 @@ ip6_output_list(struct mbuf *m0, int packetchain, struct ip6_pktopts *opt, * forbid loopback, loop back a copy. */ ip6_mloopback(NULL, ifp, m, dst, optlen, nxt0); - } else if (im6o != NULL) + } else if (im6o != NULL) IM6O_UNLOCK(im6o); if (in6m != NULL) IN6M_REMREF(in6m); @@ -1385,28 +1384,6 @@ ip6_output_list(struct mbuf *m0, int packetchain, struct ip6_pktopts *opt, */ in6_clearscope(&ip6->ip6_src); in6_clearscope(&ip6->ip6_dst); - -#if IPFW2 - /* - * Check with the firewall... 
- */ - if (ip6_fw_enable && ip6_fw_chk_ptr) { - u_short port = 0; - m->m_pkthdr.rcvif = NULL; /* XXX */ - /* If ipfw says divert, we have to just drop packet */ - if (ip6_fw_chk_ptr(&ip6, ifp, &port, &m) || m == NULL) { - if (m != NULL) { - m_freem(m); - m = NULL; - goto evaluateloop; - } else { - error = EACCES; - goto bad; - } - } - } -#endif /* IPFW2 */ - /* * If the outgoing packet contains a hop-by-hop options header, * it must be examined and processed even by the source node. @@ -1491,6 +1468,22 @@ ip6_output_list(struct mbuf *m0, int packetchain, struct ip6_pktopts *opt, ipsec_delaux(m); #endif /* IPSEC */ + if (ip6oa != NULL) { + u_int8_t dscp; + + dscp = (ntohl(ip6->ip6_flow) & IP6FLOW_DSCP_MASK) >> IP6FLOW_DSCP_SHIFT; + + error = set_packet_qos(m, ifp, + ip6oa->ip6oa_flags & IP6OAF_QOSMARKING_ALLOWED ? TRUE : FALSE, + ip6oa->ip6oa_sotc, ip6oa->ip6oa_netsvctype, &dscp); + if (error == 0) { + ip6->ip6_flow &= ~htonl(IP6FLOW_DSCP_MASK); + ip6->ip6_flow |= htonl((u_int32_t)dscp << IP6FLOW_DSCP_SHIFT); + } else { + printf("%s if_dscp_for_mbuf() error %d\n", __func__, error); + error = 0; + } + } /* * Determine whether fragmentation is necessary. If so, m is passed * back as a chain of packets and original mbuf is freed. 
Otherwise, m @@ -1667,6 +1660,9 @@ ip6_fragment_packet(struct mbuf **mptr, struct ip6_pktopts *opt, size_t tlen = m->m_pkthdr.len; boolean_t dontfrag = (opt != NULL && (opt->ip6po_flags & IP6PO_DONTFRAG)); + if (m->m_pkthdr.pkt_flags & PKTF_FORWARDED) + dontfrag = TRUE; + if (dontfrag && alwaysfrag) { /* case 4 */ /* conflicting request - can't transmit */ return EMSGSIZE; @@ -2657,12 +2653,12 @@ ip6_ctloutput(struct socket *so, struct sockopt *sopt) caddr_t req = NULL; size_t len = 0; struct mbuf *m; - + if ((error = soopt_getm(sopt, &m)) != 0) break; if ((error = soopt_mcopyin(sopt, m)) != 0) break; - + req = mtod(m, caddr_t); len = m->m_len; error = ipsec6_set_policy(in6p, optname, req, @@ -2671,20 +2667,6 @@ ip6_ctloutput(struct socket *so, struct sockopt *sopt) break; } #endif /* IPSEC */ -#if IPFIREWALL - case IPV6_FW_ADD: - case IPV6_FW_DEL: - case IPV6_FW_FLUSH: - case IPV6_FW_ZERO: { - if (ip6_fw_ctl_ptr == NULL) - load_ip6fw(); - if (ip6_fw_ctl_ptr != NULL) - error = (*ip6_fw_ctl_ptr)(sopt); - else - error = ENOPROTOOPT; - break; - } -#endif /* IPFIREWALL */ /* * IPv6 variant of IP_BOUND_IF; for details see * comments on IP_BOUND_IF in ip_ctloutput(). @@ -2914,17 +2896,6 @@ ip6_ctloutput(struct socket *so, struct sockopt *sopt) break; } #endif /* IPSEC */ -#if IPFIREWALL - case IPV6_FW_GET: { - if (ip6_fw_ctl_ptr == NULL) - load_ip6fw(); - if (ip6_fw_ctl_ptr != NULL) - error = (*ip6_fw_ctl_ptr)(sopt); - else - error = ENOPROTOOPT; - break; - } -#endif /* IPFIREWALL */ case IPV6_BOUND_IF: if (in6p->inp_flags & INP_BOUND_IF) optval = in6p->inp_boundifp->if_index; @@ -4179,4 +4150,3 @@ sysctl_ip6_output_getperf SYSCTL_HANDLER_ARGS return (SYSCTL_OUT(req, &net_perf, MIN(sizeof (net_perf), req->oldlen))); } - diff --git a/bsd/netinet6/ip6_var.h b/bsd/netinet6/ip6_var.h index dc2b4399d..e04dee46a 100644 --- a/bsd/netinet6/ip6_var.h +++ b/bsd/netinet6/ip6_var.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2015 Apple Inc. All rights reserved. 
+ * Copyright (c) 2000-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -272,6 +272,7 @@ struct ip6stat { u_quad_t ip6s_localout; /* total ip packets generated here */ u_quad_t ip6s_odropped; /* lost packets due to nobufs, etc. */ u_quad_t ip6s_reassembled; /* total packets reassembled ok */ + u_quad_t ip6s_atmfrag_rcvd; /* atomic fragments received */ u_quad_t ip6s_fragmented; /* datagrams successfully fragmented */ u_quad_t ip6s_ofragments; /* output fragments created */ u_quad_t ip6s_cantfrag; /* don't fragment flag was set, etc. */ @@ -332,6 +333,9 @@ struct ip6stat { /* duplicate address detection collisions */ u_quad_t ip6s_dad_collide; + + /* DAD NS looped back */ + u_quad_t ip6s_dad_loopcount; }; enum ip6s_sources_rule_index { @@ -412,8 +416,12 @@ struct ip6_out_args { #define IP6OAF_NO_CELLULAR 0x00000010 /* skip IFT_CELLULAR */ #define IP6OAF_NO_EXPENSIVE 0x00000020 /* skip IFEF_EXPENSIVE */ #define IP6OAF_AWDL_UNRESTRICTED 0x00000040 /* privileged AWDL */ +#define IP6OAF_QOSMARKING_ALLOWED 0x00000080 /* policy allows Fastlane DSCP marking */ +#define IP6OAF_INTCOPROC_ALLOWED 0x00000100 /* access to internal coproc interfaces */ u_int32_t ip6oa_retflags; /* IP6OARF return flags (see below) */ #define IP6OARF_IFDENIED 0x00000001 /* denied access to interface */ + int ip6oa_sotc; /* traffic class for Fastlane DSCP mapping */ + int ip6oa_netsvctype; }; extern struct ip6stat ip6stat; /* statistics */ @@ -461,8 +469,6 @@ extern int ip6_use_defzone; extern struct pr_usrreqs rip6_usrreqs; extern struct pr_usrreqs icmp6_dgram_usrreqs; -extern int ip6_doscopedroute; - struct sockopt; struct inpcb; struct in6_ifaddr; diff --git a/bsd/netinet6/ipcomp_core.c b/bsd/netinet6/ipcomp_core.c index 533bfd198..01c9e928b 100644 --- a/bsd/netinet6/ipcomp_core.c +++ b/bsd/netinet6/ipcomp_core.c @@ -1,3 +1,31 @@ +/* + * Copyright (c) 2016 Apple Inc. All rights reserved. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + /* $FreeBSD: src/sys/netinet6/ipcomp_core.c,v 1.1.2.2 2001/07/03 11:01:54 ume Exp $ */ /* $KAME: ipcomp_core.c,v 1.24 2000/10/23 04:24:22 itojun Exp $ */ @@ -166,12 +194,9 @@ deflate_free( FREE(ptr, M_TEMP); } +/* @param mode 0: compress 1: decompress */ static int -deflate_common(m, md, lenp, mode) - struct mbuf *m; - struct mbuf *md; - size_t *lenp; - int mode; /* 0: compress 1: decompress */ +deflate_common(struct mbuf *m, struct mbuf *md, size_t *lenp, int mode) { struct mbuf *mprev; struct mbuf *p; @@ -382,10 +407,7 @@ do { \ } static int -deflate_compress(m, md, lenp) - struct mbuf *m; - struct mbuf *md; - size_t *lenp; +deflate_compress(struct mbuf *m, struct mbuf *md, size_t *lenp) { if (!m) panic("m == NULL in deflate_compress"); @@ -398,10 +420,7 @@ deflate_compress(m, md, lenp) } static int -deflate_decompress(m, md, lenp) - struct mbuf *m; - struct mbuf *md; - size_t *lenp; +deflate_decompress(struct mbuf *m, struct mbuf *md, size_t *lenp) { if (!m) panic("m == NULL in deflate_decompress"); diff --git a/bsd/netinet6/ipcomp_output.c b/bsd/netinet6/ipcomp_output.c index c9159972f..8ad23be73 100644 --- a/bsd/netinet6/ipcomp_output.c +++ b/bsd/netinet6/ipcomp_output.c @@ -1,3 +1,31 @@ +/* + * Copyright (c) 2016 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. 
+ * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + /* $FreeBSD: src/sys/netinet6/ipcomp_output.c,v 1.1.2.2 2001/07/03 11:01:54 ume Exp $ */ /* $KAME: ipcomp_output.c,v 1.23 2001/01/23 08:59:37 itojun Exp $ */ @@ -101,12 +129,7 @@ static int ipcomp_output(struct mbuf *, u_char *, struct mbuf *, * <-----------------> compoff */ static int -ipcomp_output(m, nexthdrp, md, af, sav) - struct mbuf *m; - u_char *nexthdrp; - struct mbuf *md; - int af; - struct secasvar *sav; +ipcomp_output(struct mbuf *m, u_char *nexthdrp, struct mbuf *md, int af, struct secasvar *sav) { struct mbuf *n; struct mbuf *md0; @@ -345,9 +368,7 @@ ipcomp_output(m, nexthdrp, md, af, sav) #if INET int -ipcomp4_output(m, sav) - struct mbuf *m; - struct secasvar *sav; +ipcomp4_output(struct mbuf *m, struct secasvar *sav) { struct ip *ip; if (m->m_len < sizeof(struct ip)) { @@ -364,11 +385,11 @@ ipcomp4_output(m, sav) #if INET6 int -ipcomp6_output(m, nexthdrp, md, sav) - struct mbuf *m; - u_char *nexthdrp; - struct mbuf *md; - struct secasvar *sav; +ipcomp6_output( + struct mbuf *m, + u_char *nexthdrp, + struct mbuf *md, + struct secasvar *sav) { if (m->m_len < sizeof(struct ip6_hdr)) { ipseclog((LOG_DEBUG, "ipcomp6_output: first mbuf too short\n")); diff --git a/bsd/netinet6/ipsec.c b/bsd/netinet6/ipsec.c index d3ef5ed0c..19f926ec4 100644 --- a/bsd/netinet6/ipsec.c +++ b/bsd/netinet6/ipsec.c @@ -1,8 +1,8 @@ 
/* - * Copyright (c) 2008-2015 Apple Inc. All rights reserved. + * Copyright (c) 2008-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. 
- * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ @@ -160,7 +160,7 @@ int ip4_esp_randpad = -1; int esp_udp_encap_port = 0; static int sysctl_def_policy SYSCTL_HANDLER_ARGS; extern int natt_keepalive_interval; -extern u_int32_t natt_now; +extern u_int64_t natt_now; struct ipsec_tag; @@ -1010,9 +1010,7 @@ ipsec_setspidx_interface( } static int -ipsec4_setspidx_inpcb(m, pcb) -struct mbuf *m; -struct inpcb *pcb; +ipsec4_setspidx_inpcb(struct mbuf *m, struct inpcb *pcb) { struct secpolicyindex *spidx; int error; @@ -1053,9 +1051,7 @@ struct inpcb *pcb; #if INET6 static int -ipsec6_setspidx_in6pcb(m, pcb) -struct mbuf *m; -struct in6pcb *pcb; +ipsec6_setspidx_in6pcb(struct mbuf *m, struct in6pcb *pcb) { struct secpolicyindex *spidx; int error; @@ -1185,10 +1181,7 @@ ipsec_setspidx(struct mbuf *m, } static void -ipsec4_get_ulp(m, spidx, needport) - struct mbuf *m; - struct secpolicyindex *spidx; - int needport; +ipsec4_get_ulp(struct mbuf *m, struct secpolicyindex *spidx, int needport) { struct ip ip; struct ip6_ext ip6e; @@ -1263,9 +1256,7 @@ ipsec4_get_ulp(m, spidx, needport) /* assumes that m is sane */ static int -ipsec4_setspidx_ipaddr(m, spidx) - struct mbuf *m; - struct secpolicyindex *spidx; +ipsec4_setspidx_ipaddr(struct mbuf *m, struct secpolicyindex *spidx) { struct ip *ip = NULL; struct ip ipbuf; @@ -1394,7 +1385,7 @@ ipsec6_setspidx_ipaddr(struct mbuf *m, #endif static struct inpcbpolicy * -ipsec_newpcbpolicy() +ipsec_newpcbpolicy(void) { struct inpcbpolicy *p; @@ -1761,8 +1752,7 @@ ipsec6_delete_pcbpolicy(struct in6pcb *in6p) * Either IPSEC_LEVEL_USE or IPSEC_LEVEL_REQUIRE are always returned. 
*/ u_int -ipsec_get_reqlevel(isr) - struct ipsecrequest *isr; +ipsec_get_reqlevel(struct ipsecrequest *isr) { u_int level = 0; u_int esp_trans_deflev = 0, esp_net_deflev = 0, ah_trans_deflev = 0, ah_net_deflev = 0; @@ -1864,9 +1854,7 @@ ipsec_get_reqlevel(isr) * 1: invalid */ static int -ipsec_in_reject(sp, m) - struct secpolicy *sp; - struct mbuf *m; +ipsec_in_reject(struct secpolicy *sp, struct mbuf *m) { struct ipsecrequest *isr; u_int level; @@ -1961,9 +1949,7 @@ ipsec_in_reject(sp, m) * and {ah,esp}4_input for tunnel mode */ int -ipsec4_in_reject_so(m, so) - struct mbuf *m; - struct socket *so; +ipsec4_in_reject_so(struct mbuf *m, struct socket *so) { struct secpolicy *sp = NULL; int error; @@ -1997,11 +1983,8 @@ ipsec4_in_reject_so(m, so) } int -ipsec4_in_reject(m, inp) - struct mbuf *m; - struct inpcb *inp; +ipsec4_in_reject(struct mbuf *m, struct inpcb *inp) { - lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_NOTOWNED); if (inp == NULL) return ipsec4_in_reject_so(m, NULL); @@ -2021,9 +2004,7 @@ ipsec4_in_reject(m, inp) * and {ah,esp}6_input for tunnel mode */ int -ipsec6_in_reject_so(m, so) - struct mbuf *m; - struct socket *so; +ipsec6_in_reject_so(struct mbuf *m, struct socket *so) { struct secpolicy *sp = NULL; int error; @@ -2056,9 +2037,7 @@ ipsec6_in_reject_so(m, so) } int -ipsec6_in_reject(m, in6p) - struct mbuf *m; - struct in6pcb *in6p; +ipsec6_in_reject(struct mbuf *m, struct in6pcb *in6p) { lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_NOTOWNED); @@ -2080,8 +2059,7 @@ ipsec6_in_reject(m, in6p) * NOTE: SP passed is free in this function. */ size_t -ipsec_hdrsiz(sp) - struct secpolicy *sp; +ipsec_hdrsiz(struct secpolicy *sp) { struct ipsecrequest *isr; size_t siz, clen; @@ -2154,10 +2132,7 @@ ipsec_hdrsiz(sp) /* This function is called from ip_forward() and ipsec4_hdrsize_tcp(). 
*/ size_t -ipsec4_hdrsiz(m, dir, inp) - struct mbuf *m; - u_int dir; - struct inpcb *inp; +ipsec4_hdrsiz(struct mbuf *m, u_int dir, struct inpcb *inp) { struct secpolicy *sp = NULL; int error; @@ -2198,10 +2173,7 @@ ipsec4_hdrsiz(m, dir, inp) * and maybe from ip6_forward.() */ size_t -ipsec6_hdrsiz(m, dir, in6p) - struct mbuf *m; - u_int dir; - struct in6pcb *in6p; +ipsec6_hdrsiz(struct mbuf *m, u_int dir, struct in6pcb *in6p) { struct secpolicy *sp = NULL; int error; @@ -2241,9 +2213,7 @@ ipsec6_hdrsiz(m, dir, in6p) * ip->ip_src must be fixed later on. */ int -ipsec4_encapsulate(m, sav) - struct mbuf *m; - struct secasvar *sav; +ipsec4_encapsulate(struct mbuf *m, struct secasvar *sav) { struct ip *oip; struct ip *ip; @@ -2354,101 +2324,11 @@ ipsec4_encapsulate(m, sav) return 0; } -/* - * encapsulate for ipsec tunnel. - * ip->ip_src must be fixed later on. - */ -int -ipsec4_encapsulate_utun_esp_keepalive(m_ptr, sav) - struct mbuf **m_ptr; - struct secasvar *sav; -{ - struct ip *ip; - size_t plen; - struct mbuf *m = *m_ptr; - - /* can't tunnel between different AFs */ - if (((struct sockaddr *)&sav->sah->saidx.src)->sa_family - != ((struct sockaddr *)&sav->sah->saidx.dst)->sa_family - || ((struct sockaddr *)&sav->sah->saidx.src)->sa_family != AF_INET) { - m_freem(m); - *m_ptr = NULL; - return EINVAL; - } - - plen = m->m_pkthdr.len; - - /* - * grow the mbuf to accomodate the new IPv4 header. - * NOTE: IPv4 options will never be copied. - */ - { - struct mbuf *n; - MGETHDR(n, M_DONTWAIT, MT_HEADER); /* MAC-OK */ - if (!n) { - m_freem(m); - *m_ptr = NULL; - return ENOBUFS; - } - if (m->m_flags & M_PKTHDR) { - M_COPY_PKTHDR(n, m); - m->m_flags &= ~M_PKTHDR; - } - MH_ALIGN(n, sizeof(*ip)); - n->m_len = sizeof(*ip); - n->m_next = m; - n->m_pkthdr.len = (plen + n->m_len); - m_fixhdr(m); - m = n; - *m_ptr = m; - plen = m->m_pkthdr.len; - } - ip = mtod(m, __typeof__(ip)); - - /* construct new IPv4 header. 
see RFC 2401 5.1.2.1 */ - // ip_ecn_ingress(ip4_ipsec_ecn, &ip->ip_tos, &oip->ip_tos); -#ifdef _IP_VHL - ip->ip_vhl = IP_MAKE_VHL(IPVERSION, sizeof(*ip) >> 2); -#else - ip->ip_hl = sizeof(*ip) >> 2; -#endif - ip->ip_off &= htons(~IP_OFFMASK); - ip->ip_off &= htons(~IP_MF); - switch (ip4_ipsec_dfbit) { - case 0: /* clear DF bit */ - ip->ip_off &= htons(~IP_DF); - break; - case 1: /* set DF bit */ - ip->ip_off |= htons(IP_DF); - break; - default: /* copy DF bit */ - break; - } - ip->ip_p = IPPROTO_IPIP; - if (plen < IP_MAXPACKET) - ip->ip_len = htons(plen); - else { - ipseclog((LOG_ERR, "IPv4 ipsec: size exceeds limit: " - "leave ip_len as is (invalid packet)\n")); - } - ip->ip_id = ip_randomid(); - bcopy(&((struct sockaddr_in *)&sav->sah->saidx.src)->sin_addr, - &ip->ip_src, sizeof(ip->ip_src)); - bcopy(&((struct sockaddr_in *)&sav->sah->saidx.dst)->sin_addr, - &ip->ip_dst, sizeof(ip->ip_dst)); - ip->ip_ttl = IPDEFTTL; - - /* XXX Should ip_src be updated later ? */ - - return 0; -} #endif /*INET*/ #if INET6 int -ipsec6_encapsulate(m, sav) - struct mbuf *m; - struct secasvar *sav; +ipsec6_encapsulate(struct mbuf *m, struct secasvar *sav) { struct ip6_hdr *oip6; struct ip6_hdr *ip6; @@ -2524,9 +2404,7 @@ ipsec6_encapsulate(m, sav) } static int -ipsec64_encapsulate(m, sav) - struct mbuf *m; - struct secasvar *sav; +ipsec64_encapsulate(struct mbuf *m, struct secasvar *sav) { struct ip6_hdr *ip6, *ip6i; struct ip *ip; @@ -2611,73 +2489,9 @@ ipsec64_encapsulate(m, sav) } int -ipsec6_encapsulate_utun_esp_keepalive(m_ptr, sav) - struct mbuf **m_ptr; - struct secasvar *sav; -{ - struct ip6_hdr *ip6; - size_t plen; - struct mbuf *m = *m_ptr; - - /* can't tunnel between different AFs */ - if (((struct sockaddr *)&sav->sah->saidx.src)->sa_family - != ((struct sockaddr *)&sav->sah->saidx.dst)->sa_family - || ((struct sockaddr *)&sav->sah->saidx.src)->sa_family != AF_INET6) { - m_freem(m); - *m_ptr = NULL; - return EINVAL; - } - - plen = m->m_pkthdr.len; - - /* - * grow the mbuf 
to accomodate the new IPv6 header. - */ - { - struct mbuf *n; - MGETHDR(n, M_DONTWAIT, MT_HEADER); /* MAC-OK */ - if (!n) { - m_freem(m); - *m_ptr = NULL; - return ENOBUFS; - } - if (m->m_flags & M_PKTHDR) { - M_COPY_PKTHDR(n, m); - m->m_flags &= ~M_PKTHDR; - } - MH_ALIGN(n, sizeof(*ip6)); - n->m_len = sizeof(*ip6); - n->m_next = m; - n->m_pkthdr.len = (plen + n->m_len); - m_fixhdr(m); - m = n; - *m_ptr = m; - plen = m->m_pkthdr.len; - } - ip6 = mtod(m, __typeof__(ip6)); - - /* construct new IPv6 header. see RFC 2401 5.1.2.2 */ - if (plen < IPV6_MAXPACKET) - ip6->ip6_plen = htons(plen); - else { - /* ip6->ip6_plen will be updated in ip6_output() */ - } - ip6->ip6_nxt = IPPROTO_IPV6; - bcopy(&((struct sockaddr_in6 *)&sav->sah->saidx.src)->sin6_addr, - &ip6->ip6_src, sizeof(ip6->ip6_src)); - bcopy(&((struct sockaddr_in6 *)&sav->sah->saidx.dst)->sin6_addr, - &ip6->ip6_dst, sizeof(ip6->ip6_dst)); - ip6->ip6_hlim = IPV6_DEFHLIM; - - /* XXX Should ip6_src be updated later ? */ - - return 0; -} - -int -ipsec6_update_routecache_and_output(state, sav) - struct ipsec_output_state *state; - struct secasvar *sav; +ipsec6_update_routecache_and_output( + struct ipsec_output_state *state, + struct secasvar *sav) { struct sockaddr_in6* dst6; struct route *ro6; @@ -2712,7 +2526,7 @@ ipsec6_update_routecache_and_output(state, sav) dst6->sin6_family = AF_INET6; dst6->sin6_len = sizeof(*dst6); dst6->sin6_addr = ip6->ip6_dst; - rtalloc(ro6); + rtalloc_scoped(ro6, sav->sah->outgoing_if); if (ro6->ro_rt) { RT_LOCK(ro6->ro_rt); } @@ -2792,6 +2606,7 @@ ipsec6_update_routecache_and_output(state, sav) ip6->ip6_plen = htons(plen); ipsec_set_pkthdr_for_interface(sav->sah->ipsec_if, state->m, AF_INET6); + ipsec_set_ip6oa_for_interface(sav->sah->ipsec_if, &ip6oa); /* Increment statistics */ ifnet_stat_increment_out(sav->sah->ipsec_if, 1, mbuf_pkthdr_len(state->m), 0); @@ -2800,10 +2615,10 @@ ipsec6_update_routecache_and_output(state, sav) bzero(&ro6_new, sizeof(ro6_new)); bzero(&ip6oa, 
sizeof(ip6oa)); ip6oa.ip6oa_flowadv.code = 0; - ip6oa.ip6oa_flags = IPOAF_SELECT_SRCIF | IPOAF_BOUND_SRCADDR; + ip6oa.ip6oa_flags = IP6OAF_SELECT_SRCIF | IP6OAF_BOUND_SRCADDR; if (state->outgoing_if) { ip6oa.ip6oa_boundif = state->outgoing_if; - ip6oa.ip6oa_flags |= IPOAF_BOUND_IF; + ip6oa.ip6oa_flags |= IP6OAF_BOUND_IF; } adv = &ip6oa.ip6oa_flowadv; @@ -2819,9 +2634,7 @@ ipsec6_update_routecache_and_output(state, sav) } int -ipsec46_encapsulate(state, sav) - struct secasvar *sav; - struct ipsec_output_state *state; +ipsec46_encapsulate(struct ipsec_output_state *state, struct secasvar *sav) { struct mbuf *m; struct ip6_hdr *ip6; @@ -2961,9 +2774,7 @@ ipsec46_encapsulate(state, sav) * based on RFC 2401. */ int -ipsec_chkreplay(seq, sav) - u_int32_t seq; - struct secasvar *sav; +ipsec_chkreplay(u_int32_t seq, struct secasvar *sav) { const struct secreplay *replay; u_int32_t diff; @@ -3034,9 +2845,7 @@ ipsec_chkreplay(seq, sav) * 1: NG */ int -ipsec_updatereplay(seq, sav) - u_int32_t seq; - struct secasvar *sav; +ipsec_updatereplay(u_int32_t seq, struct secasvar *sav) { struct secreplay *replay; u_int32_t diff; @@ -3141,9 +2950,7 @@ ipsec_updatereplay(seq, sav) * wsize: buffer size (bytes). 
*/ static void -vshiftl(bitmap, nbit, wsize) - unsigned char *bitmap; - int nbit, wsize; +vshiftl(unsigned char *bitmap, int nbit, int wsize) { int s, j, i; unsigned char over; @@ -3162,9 +2969,7 @@ vshiftl(bitmap, nbit, wsize) } const char * -ipsec4_logpacketstr(ip, spi) - struct ip *ip; - u_int32_t spi; +ipsec4_logpacketstr(struct ip *ip, u_int32_t spi) { static char buf[256] __attribute__((aligned(4))); char *p; @@ -3192,9 +2997,7 @@ ipsec4_logpacketstr(ip, spi) #if INET6 const char * -ipsec6_logpacketstr(ip6, spi) - struct ip6_hdr *ip6; - u_int32_t spi; +ipsec6_logpacketstr(struct ip6_hdr *ip6, u_int32_t spi) { static char buf[256] __attribute__((aligned(4))); char *p; @@ -3218,8 +3021,7 @@ ipsec6_logpacketstr(ip6, spi) #endif /*INET6*/ const char * -ipsec_logsastr(sav) - struct secasvar *sav; +ipsec_logsastr(struct secasvar *sav) { static char buf[256] __attribute__((aligned(4))); char *p; @@ -3262,8 +3064,7 @@ ipsec_logsastr(sav) } void -ipsec_dumpmbuf(m) - struct mbuf *m; +ipsec_dumpmbuf(struct mbuf *m) { int totlen; int i; @@ -3371,7 +3172,7 @@ ipsec4_output_internal(struct ipsec_output_state *state, struct secasvar *sav) dst4->sin_family = AF_INET; dst4->sin_len = sizeof(*dst4); dst4->sin_addr = ip->ip_dst; - rtalloc(ro4); + rtalloc_scoped(ro4, sav->sah->outgoing_if); if (ro4->ro_rt == 0) { OSAddAtomic(1, &ipstat.ips_noroute); error = EHOSTUNREACH; @@ -3883,8 +3684,8 @@ ipsec6_output_tunnel_internal(struct ipsec_output_state *state, struct secasvar struct sockaddr_in* dst4; struct route *ro4 = NULL; struct route ro4_copy; - struct ip_out_args ipoa = { IFSCOPE_NONE, { 0 }, - IPOAF_SELECT_SRCIF, 0 }; + struct ip_out_args ipoa = { IFSCOPE_NONE, { 0 }, IPOAF_SELECT_SRCIF, 0, + SO_TC_UNSPEC, _NET_SERVICE_TYPE_UNSPEC }; if (must_be_last) *must_be_last = 1; @@ -3975,6 +3776,8 @@ ipsec6_output_tunnel_internal(struct ipsec_output_state *state, struct secasvar goto bad; } ipsec_set_pkthdr_for_interface(sav->sah->ipsec_if, state->m, AF_INET); + 
ipsec_set_ipoa_for_interface(sav->sah->ipsec_if, &ipoa); + ip = mtod(state->m, struct ip *); ip->ip_len = ntohs(ip->ip_len); /* flip len field before calling ip_output */ error = ip_output(state->m, NULL, &ro4_copy, IP_OUTARGS, NULL, &ipoa); @@ -4013,7 +3816,7 @@ ipsec6_output_tunnel_internal(struct ipsec_output_state *state, struct secasvar dst6->sin6_family = AF_INET6; dst6->sin6_len = sizeof(*dst6); dst6->sin6_addr = ip6->ip6_dst; - rtalloc(ro6); + rtalloc_scoped(ro6, sav->sah->outgoing_if); if (ro6->ro_rt) { RT_LOCK(ro6->ro_rt); } @@ -4288,8 +4091,7 @@ ipsec6_interface_output(struct ipsec_output_state *state, ifnet_t interface, u_c * Chop IP header and option off from the payload. */ struct mbuf * -ipsec4_splithdr(m) - struct mbuf *m; +ipsec4_splithdr(struct mbuf *m) { struct mbuf *mh; struct ip *ip; @@ -4330,8 +4132,7 @@ ipsec4_splithdr(m) #if INET6 struct mbuf * -ipsec6_splithdr(m) - struct mbuf *m; +ipsec6_splithdr(struct mbuf *m) { struct mbuf *mh; struct ip6_hdr *ip6; @@ -4368,12 +4169,12 @@ ipsec6_splithdr(m) /* validate inbound IPsec tunnel packet. */ int -ipsec4_tunnel_validate(m, off, nxt0, sav, ifamily) - struct mbuf *m; /* no pullup permitted, m->m_len >= ip */ - int off; - u_int nxt0; - struct secasvar *sav; - sa_family_t *ifamily; +ipsec4_tunnel_validate( + struct mbuf *m, /* no pullup permitted, m->m_len >= ip */ + int off, + u_int nxt0, + struct secasvar *sav, + sa_family_t *ifamily) { u_int8_t nxt = nxt0 & 0xff; struct sockaddr_in *sin; @@ -4412,9 +4213,8 @@ ipsec4_tunnel_validate(m, off, nxt0, sav, ifamily) if (bcmp(&oip->ip_dst, &sin->sin_addr, sizeof(oip->ip_dst)) != 0) return 0; - if (sav->utun_in_fn || - sav->sah->ipsec_if != NULL) { - // the ipsec/utun interface SAs don't have a policies. + if (sav->sah->ipsec_if != NULL) { + // the ipsec interface SAs don't have a policies. 
if (nxt == IPPROTO_IPV4) { *ifamily = AF_INET; } else if (nxt == IPPROTO_IPV6) { @@ -4488,12 +4288,12 @@ ipsec4_tunnel_validate(m, off, nxt0, sav, ifamily) #if INET6 /* validate inbound IPsec tunnel packet. */ int -ipsec6_tunnel_validate(m, off, nxt0, sav, ifamily) - struct mbuf *m; /* no pullup permitted, m->m_len >= ip */ - int off; - u_int nxt0; - struct secasvar *sav; - sa_family_t *ifamily; +ipsec6_tunnel_validate( + struct mbuf *m, /* no pullup permitted, m->m_len >= ip */ + int off, + u_int nxt0, + struct secasvar *sav, + sa_family_t *ifamily) { u_int8_t nxt = nxt0 & 0xff; struct sockaddr_in6 *sin6; @@ -4525,9 +4325,8 @@ ipsec6_tunnel_validate(m, off, nxt0, sav, ifamily) if (!IN6_ARE_ADDR_EQUAL(&oip6->ip6_dst, &sin6->sin6_addr)) return 0; - if (sav->utun_in_fn || - sav->sah->ipsec_if != NULL) { - // the ipsec/utun interface SAs don't have a policies. + if (sav->sah->ipsec_if != NULL) { + // the ipsec interface SAs don't have a policies. if (nxt == IPPROTO_IPV4) { *ifamily = AF_INET; } else if (nxt == IPPROTO_IPV6) { @@ -4537,7 +4336,7 @@ ipsec6_tunnel_validate(m, off, nxt0, sav, ifamily) } return 1; } - + /* XXX slow */ bzero(&osrc, sizeof(osrc)); bzero(&odst, sizeof(odst)); @@ -4599,8 +4398,7 @@ ipsec6_tunnel_validate(m, off, nxt0, sav, ifamily) * of TCP retransmission... 
*/ struct mbuf * -ipsec_copypkt(m) - struct mbuf *m; +ipsec_copypkt(struct mbuf *m) { struct mbuf *n, **mpp, *mnew; @@ -4870,7 +4668,8 @@ ipsec_send_natt_keepalive( struct ip *ip; int error; struct ip_out_args ipoa = - { IFSCOPE_NONE, { 0 }, IPOAF_SELECT_SRCIF, 0 }; + { IFSCOPE_NONE, { 0 }, IPOAF_SELECT_SRCIF, 0, + SO_TC_UNSPEC, _NET_SERVICE_TYPE_UNSPEC }; struct route ro; int keepalive_interval = natt_keepalive_interval; diff --git a/bsd/netinet6/ipsec.h b/bsd/netinet6/ipsec.h index 7a422a690..823bab27e 100644 --- a/bsd/netinet6/ipsec.h +++ b/bsd/netinet6/ipsec.h @@ -362,12 +362,10 @@ extern int ipsec4_output(struct ipsec_output_state *, struct secpolicy *, int); #if INET extern struct mbuf * ipsec4_splithdr(struct mbuf *); extern int ipsec4_encapsulate(struct mbuf *, struct secasvar *); -extern int ipsec4_encapsulate_utun_esp_keepalive(struct mbuf **, struct secasvar *); #endif #if INET6 extern struct mbuf * ipsec6_splithdr(struct mbuf *); extern int ipsec6_encapsulate(struct mbuf *, struct secasvar *); -extern int ipsec6_encapsulate_utun_esp_keepalive(struct mbuf **, struct secasvar *); #endif extern int ipsec4_tunnel_validate(struct mbuf *, int, u_int, struct secasvar *, sa_family_t *); extern struct mbuf *ipsec_copypkt(struct mbuf *); diff --git a/bsd/netinet6/mld6.c b/bsd/netinet6/mld6.c index 4dda3d82e..6cbcaef13 100644 --- a/bsd/netinet6/mld6.c +++ b/bsd/netinet6/mld6.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2015 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -1219,7 +1219,6 @@ mld_v2_process_group_query(struct in6_multi *inm, int timer, struct mbuf *m0, case MLD_IDLE_MEMBER: case MLD_LEAVING_MEMBER: return (retval); - break; case MLD_REPORTING_MEMBER: case MLD_G_QUERY_PENDING_MEMBER: case MLD_SG_QUERY_PENDING_MEMBER: @@ -3616,22 +3615,16 @@ mld_rec_type_to_str(const int type) switch (type) { case MLD_CHANGE_TO_EXCLUDE_MODE: return "TO_EX"; - break; case MLD_CHANGE_TO_INCLUDE_MODE: return "TO_IN"; - break; case MLD_MODE_IS_EXCLUDE: return "MODE_EX"; - break; case MLD_MODE_IS_INCLUDE: return "MODE_IN"; - break; case MLD_ALLOW_NEW_SOURCES: return "ALLOW_NEW"; - break; case MLD_BLOCK_OLD_SOURCES: return "BLOCK_OLD"; - break; default: break; } diff --git a/bsd/netinet6/nd6.c b/bsd/netinet6/nd6.c index ce0dfdf21..a9d446315 100644 --- a/bsd/netinet6/nd6.c +++ b/bsd/netinet6/nd6.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2015 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -422,13 +422,46 @@ nd6_llinfo_refresh(struct rtentry *rt) if ((ln->ln_state > ND6_LLINFO_INCOMPLETE) && (ln->ln_state < ND6_LLINFO_PROBE)) { if (ln->ln_expire > timenow) { - ln->ln_expire = timenow; - ln->ln_state = ND6_LLINFO_PROBE; + ln_setexpire(ln, timenow); + ND6_CACHE_STATE_TRANSITION(ln, ND6_LLINFO_PROBE); } } return; } +const char * +ndcache_state2str(short ndp_state) +{ + const char *ndp_state_str = "UNKNOWN"; + switch (ndp_state) { + case ND6_LLINFO_PURGE: + ndp_state_str = "ND6_LLINFO_PURGE"; + break; + case ND6_LLINFO_NOSTATE: + ndp_state_str = "ND6_LLINFO_NOSTATE"; + break; + case ND6_LLINFO_INCOMPLETE: + ndp_state_str = "ND6_LLINFO_INCOMPLETE"; + break; + case ND6_LLINFO_REACHABLE: + ndp_state_str = "ND6_LLINFO_REACHABLE"; + break; + case ND6_LLINFO_STALE: + ndp_state_str = "ND6_LLINFO_STALE"; + break; + case ND6_LLINFO_DELAY: + ndp_state_str = "ND6_LLINFO_DELAY"; + break; + case ND6_LLINFO_PROBE: + ndp_state_str = "ND6_LLINFO_PROBE"; + break; + default: + /* Init'd to UNKNOWN */ + break; + } + return ndp_state_str; +} + void ln_setexpire(struct llinfo_nd6 *ln, uint64_t expiry) { @@ -484,6 +517,7 @@ nd6_ifattach(struct ifnet *ifp) if (!ndi->initialized) { lck_mtx_init(&ndi->lock, nd_if_lock_grp, nd_if_lock_attr); ndi->flags = ND6_IFF_PERFORMNUD; + ndi->flags |= ND6_IFF_DAD; ndi->initialized = TRUE; } @@ -659,6 +693,7 @@ nd6_options(union nd_opts *ndopts) case ND_OPT_TARGET_LINKADDR: case ND_OPT_MTU: case ND_OPT_REDIRECTED_HEADER: + case ND_OPT_NONCE: if (ndopts->nd_opt_array[nd_opt->nd_opt_type]) { nd6log((LOG_INFO, "duplicated ND6 option found (type=%d)\n", @@ -722,12 +757,14 @@ nd6_service(void *arg) { struct nd6svc_arg *ap = arg; struct llinfo_nd6 *ln; - struct nd_defrouter *dr; - struct nd_prefix *pr; + struct nd_defrouter *dr = NULL; + struct nd_prefix *pr = NULL; struct ifnet *ifp = NULL; struct in6_ifaddr *ia6, *nia6; uint64_t timenow; bool send_nc_failure_kev = false; + struct nd_drhead 
nd_defrouter_tmp; + struct nd_defrouter *ndr = NULL; lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_OWNED); /* @@ -789,7 +826,7 @@ nd6_service(void *arg) ev_msg.dv[0].data_ptr = &nd6_ndfailure; ev_msg.dv[0].data_length = sizeof(nd6_ndfailure); - kev_post_msg(&ev_msg); + dlil_post_complete_msg(NULL, &ev_msg); } send_nc_failure_kev = false; @@ -862,9 +899,9 @@ nd6_service(void *arg) * entries without oustanding route refcnt. */ if (ln->ln_state > ND6_LLINFO_INCOMPLETE) - ln->ln_state = ND6_LLINFO_STALE; + ND6_CACHE_STATE_TRANSITION(ln, ND6_LLINFO_STALE); else - ln->ln_state = ND6_LLINFO_PURGE; + ND6_CACHE_STATE_TRANSITION(ln, ND6_LLINFO_PURGE); ln_setexpire(ln, timenow); } @@ -909,7 +946,7 @@ nd6_service(void *arg) NULL, &dst->sin6_addr, ln); } else { nd6_ns_output(ifp, NULL, - &dst->sin6_addr, ln, 0); + &dst->sin6_addr, ln, NULL); } RT_REMREF(rt); ap->aging++; @@ -947,7 +984,7 @@ nd6_service(void *arg) case ND6_LLINFO_REACHABLE: if (ln->ln_expire != 0) { - ln->ln_state = ND6_LLINFO_STALE; + ND6_CACHE_STATE_TRANSITION(ln, ND6_LLINFO_STALE); ln_setexpire(ln, timenow + nd6_gctimer); ap->aging_lazy++; } @@ -975,19 +1012,19 @@ nd6_service(void *arg) if ((flags & ND6_IFF_PERFORMNUD) != 0) { /* We need NUD */ ln->ln_asked = 1; - ln->ln_state = ND6_LLINFO_PROBE; + ND6_CACHE_STATE_TRANSITION(ln, ND6_LLINFO_PROBE); ln_setexpire(ln, timenow + retrans / 1000); RT_ADDREF_LOCKED(rt); RT_UNLOCK(rt); lck_mtx_unlock(rnh_lock); nd6_ns_output(ifp, &dst->sin6_addr, - &dst->sin6_addr, ln, 0); + &dst->sin6_addr, ln, NULL); RT_REMREF(rt); ap->aging++; lck_mtx_lock(rnh_lock); goto again; } - ln->ln_state = ND6_LLINFO_STALE; /* XXX */ + ND6_CACHE_STATE_TRANSITION(ln, ND6_LLINFO_STALE); /* XXX */ ln_setexpire(ln, timenow + nd6_gctimer); RT_UNLOCK(rt); ap->aging_lazy++; @@ -1001,7 +1038,7 @@ nd6_service(void *arg) RT_UNLOCK(rt); lck_mtx_unlock(rnh_lock); nd6_ns_output(ifp, &dst->sin6_addr, - &dst->sin6_addr, ln, 0); + &dst->sin6_addr, ln, NULL); RT_REMREF(rt); ap->aging++; lck_mtx_lock(rnh_lock); @@ 
-1041,24 +1078,79 @@ nd6_service(void *arg) lck_mtx_unlock(rnh_lock); /* expire default router list */ + TAILQ_INIT(&nd_defrouter_tmp); + lck_mtx_lock(nd6_mutex); - dr = TAILQ_FIRST(&nd_defrouter); - while (dr) { + TAILQ_FOREACH_SAFE(dr, &nd_defrouter, dr_entry, ndr) { ap->found++; if (dr->expire != 0 && dr->expire < timenow) { - struct nd_defrouter *t; - t = TAILQ_NEXT(dr, dr_entry); - defrtrlist_del(dr); - dr = t; - ap->killed++; + if (dr->ifp != NULL && + dr->ifp->if_type == IFT_CELLULAR) { + /* + * Some buggy cellular gateways may not send + * periodic router advertisements. + * Or they may send it with router lifetime + * value that is less than the configured Max and Min + * Router Advertisement interval. + * To top that an idle device may not wake up + * when periodic RA is received on cellular + * interface. + * We could send RS on every wake but RFC + * 4861 precludes that. + * The addresses are of infinite lifetimes + * and are tied to the lifetime of the bearer, + * so keeping the addresses and just getting rid of + * the router does not help us anyways. + * If there's network renumbering, a lifetime with + * value 0 would remove the default router. + * Also it will get deleted as part of purge when + * the PDP context is torn down and configured again. + * For that reason, do not expire the default router + * learned on cellular interface. Ever. + */ + dr->expire += dr->rtlifetime; + nd6log2((LOG_DEBUG, + "%s: Refreshing expired default router entry " + "%s for interface %s\n", __func__, + ip6_sprintf(&dr->rtaddr), if_name(dr->ifp))); + } else { + ap->killed++; + /* + * Remove the entry from default router list + * and add it to the temp list. + * nd_defrouter_tmp will be a local temporary + * list as no one else can get the same + * removed entry once it is removed from default + * router list. 
+ * Remove the reference after calling defrtrlist_del + */ + TAILQ_REMOVE(&nd_defrouter, dr, dr_entry); + TAILQ_INSERT_TAIL(&nd_defrouter_tmp, dr, dr_entry); + } } else { if (dr->expire == 0 || (dr->stateflags & NDDRF_STATIC)) ap->sticky++; else ap->aging_lazy++; - dr = TAILQ_NEXT(dr, dr_entry); } } + + /* + * Keep the following separate from the above + * iteration of nd_defrouter because it's not safe + * to call defrtrlist_del while iterating global default + * router list. Global list has to be traversed + * while holding nd6_mutex throughout. + * + * The following call to defrtrlist_del should be + * safe as we are iterating a local list of + * default routers. + */ + TAILQ_FOREACH_SAFE(dr, &nd_defrouter_tmp, dr_entry, ndr) { + TAILQ_REMOVE(&nd_defrouter_tmp, dr, dr_entry); + defrtrlist_del(dr); + NDDR_REMREF(dr); /* remove list reference */ + } lck_mtx_unlock(nd6_mutex); /* @@ -1243,21 +1335,18 @@ nd6_service(void *arg) } } + +static int nd6_need_draining = 0; + void nd6_drain(void *arg) { #pragma unused(arg) - struct nd6svc_arg sarg; - nd6log2((LOG_DEBUG, "%s: draining ND6 entries\n", __func__)); lck_mtx_lock(rnh_lock); - bzero(&sarg, sizeof (sarg)); - sarg.draining = 1; - nd6_service(&sarg); - nd6log2((LOG_DEBUG, "%s: found %u, aging_lazy %u, aging %u, " - "sticky %u, killed %u\n", __func__, sarg.found, sarg.aging_lazy, - sarg.aging, sarg.sticky, sarg.killed)); + nd6_need_draining = 1; + nd6_sched_timeout(NULL, NULL); lck_mtx_unlock(rnh_lock); } @@ -1275,6 +1364,10 @@ nd6_timeout(void *arg) lck_mtx_lock(rnh_lock); bzero(&sarg, sizeof (sarg)); + if (nd6_need_draining != 0) { + nd6_need_draining = 0; + sarg.draining = 1; + } nd6_service(&sarg); nd6log2((LOG_DEBUG, "%s: found %u, aging_lazy %u, aging %u, " "sticky %u, killed %u\n", __func__, sarg.found, sarg.aging_lazy, @@ -1417,7 +1510,7 @@ nd6_post_msg(u_int32_t code, struct nd_prefix_list *prefix_list, ev_msg.dv[0].data_ptr = &nd6_ra_msg_data; ev_msg.dv[0].data_length = sizeof (nd6_ra_msg_data); 
ev_msg.dv[1].data_length = 0; - kev_post_msg(&ev_msg); + dlil_post_complete_msg(NULL, &ev_msg); /* clean up for the next prefix */ bzero(&nd6_ra_msg_data.prefix, sizeof (nd6_ra_msg_data.prefix)); @@ -1520,33 +1613,57 @@ nd6_purge(struct ifnet *ifp) struct nd_defrouter *dr, *ndr; struct nd_prefix *pr, *npr; boolean_t removed; + struct nd_drhead nd_defrouter_tmp; + + TAILQ_INIT(&nd_defrouter_tmp); /* Nuke default router list entries toward ifp */ lck_mtx_lock(nd6_mutex); - if ((dr = TAILQ_FIRST(&nd_defrouter)) != NULL) { + TAILQ_FOREACH_SAFE(dr, &nd_defrouter, dr_entry, ndr) { + if (dr->ifp != ifp) + continue; /* - * The first entry of the list may be stored in - * the routing table, so we'll delete it later. + * Remove the entry from default router list + * and add it to the temp list. + * nd_defrouter_tmp will be a local temporary + * list as no one else can get the same + * removed entry once it is removed from default + * router list. + * Remove the reference after calling defrtrlist_del. + * + * The uninstalled entries have to be iterated first + * when we call defrtrlist_del. + * This is to ensure that we don't end up calling + * default router selection when there are other + * uninstalled candidate default routers on + * the interface. + * If we don't respect that order, we may end + * up missing out on some entries. 
+ * + * For that reason, installed ones must be inserted + * at the tail and uninstalled ones at the head */ - for (dr = TAILQ_NEXT(dr, dr_entry); dr; dr = ndr) { - ndr = TAILQ_NEXT(dr, dr_entry); - if (dr->stateflags & NDDRF_INSTALLED) - continue; - if (dr->ifp == ifp) - defrtrlist_del(dr); - } - dr = TAILQ_FIRST(&nd_defrouter); - if (dr->ifp == ifp) - defrtrlist_del(dr); - } + TAILQ_REMOVE(&nd_defrouter, dr, dr_entry); - for (dr = TAILQ_FIRST(&nd_defrouter); dr; dr = ndr) { - ndr = TAILQ_NEXT(dr, dr_entry); - if (!(dr->stateflags & NDDRF_INSTALLED)) - continue; + if (dr->stateflags & NDDRF_INSTALLED) + TAILQ_INSERT_TAIL(&nd_defrouter_tmp, dr, dr_entry); + else + TAILQ_INSERT_HEAD(&nd_defrouter_tmp, dr, dr_entry); + } - if (dr->ifp == ifp) - defrtrlist_del(dr); + /* + * The following call to defrtrlist_del should be + * safe as we are iterating a local list of + * default routers. + * + * We don't really need nd6_mutex here but keeping + * it as it is to avoid changing assertios held in + * the functions in the call-path. + */ + TAILQ_FOREACH_SAFE(dr, &nd_defrouter_tmp, dr_entry, ndr) { + TAILQ_REMOVE(&nd_defrouter_tmp, dr, dr_entry); + defrtrlist_del(dr); + NDDR_REMREF(dr); /* remove list reference */ } /* Nuke prefix list entries toward ifp */ @@ -1595,12 +1712,10 @@ nd6_purge(struct ifnet *ifp) * Perform default router selection even when we are a router, * if Scoped Routing is enabled. */ - if (ip6_doscopedroute || !ip6_forwarding) { - lck_mtx_lock(nd6_mutex); - /* refresh default router list */ - defrouter_select(ifp); - lck_mtx_unlock(nd6_mutex); - } + lck_mtx_lock(nd6_mutex); + /* refresh default router list */ + defrouter_select(ifp); + lck_mtx_unlock(nd6_mutex); /* * Nuke neighbor cache entries for the ifp. 
@@ -1737,7 +1852,20 @@ nd6_lookup(struct in6_addr *addr6, int create, struct ifnet *ifp, int rt_locked) RT_LOCK(rt); if (rt->rt_llinfo) { struct llinfo_nd6 *ln = rt->rt_llinfo; - ln->ln_state = ND6_LLINFO_NOSTATE; + struct nd_ifinfo *ndi = ND_IFINFO(rt->rt_ifp); + + VERIFY((NULL != ndi) && (TRUE == ndi->initialized)); + /* + * For interface's that do not perform NUD + * neighbor cache entres must always be marked + * reachable with no expiry + */ + if (ndi->flags & ND6_IFF_PERFORMNUD) { + ND6_CACHE_STATE_TRANSITION(ln, ND6_LLINFO_NOSTATE); + } else { + ND6_CACHE_STATE_TRANSITION(ln, ND6_LLINFO_REACHABLE); + ln_setexpire(ln, 0); + } } } else { return (NULL); @@ -1859,21 +1987,6 @@ nd6_is_new_addr_neighbor(struct sockaddr_in6 *addr, struct ifnet *ifp) dstaddr = NULL; } - /* - * If the default router list is empty, all addresses are regarded - * as on-link, and thus, as a neighbor. - * XXX: we restrict the condition to hosts, because routers usually do - * not have the "default router list". - * XXX: this block should eventually be removed (it is disabled when - * Scoped Routing is in effect); treating all destinations as on-link - * in the absence of a router is rather harmful. - */ - if (!ip6_doscopedroute && !ip6_forwarding && - TAILQ_FIRST(&nd_defrouter) == NULL && - nd6_defifindex == ifp->if_index) { - return (1); - } - return (0); } @@ -1944,60 +2057,55 @@ nd6_free(struct rtentry *rt) * not harmful, it was not really necessary. Perform default router * selection even when we are a router, if Scoped Routing is enabled. */ - if (ip6_doscopedroute || !ip6_forwarding) { - dr = defrouter_lookup(&SIN6(rt_key(rt))->sin6_addr, rt->rt_ifp); - - if ((ln && ln->ln_router) || dr) { - /* - * rt6_flush must be called whether or not the neighbor - * is in the Default Router List. - * See a corresponding comment in nd6_na_input(). 
- */ - RT_UNLOCK(rt); - lck_mtx_unlock(nd6_mutex); - rt6_flush(&in6, rt->rt_ifp); - lck_mtx_lock(nd6_mutex); - } else { - RT_UNLOCK(rt); - } + dr = defrouter_lookup(&SIN6(rt_key(rt))->sin6_addr, rt->rt_ifp); - if (dr) { - NDDR_REMREF(dr); - /* - * Unreachablity of a router might affect the default - * router selection and on-link detection of advertised - * prefixes. - */ + if ((ln && ln->ln_router) || dr) { + /* + * rt6_flush must be called whether or not the neighbor + * is in the Default Router List. + * See a corresponding comment in nd6_na_input(). + */ + RT_UNLOCK(rt); + lck_mtx_unlock(nd6_mutex); + rt6_flush(&in6, rt->rt_ifp); + lck_mtx_lock(nd6_mutex); + } else { + RT_UNLOCK(rt); + } - /* - * Temporarily fake the state to choose a new default - * router and to perform on-link determination of - * prefixes correctly. - * Below the state will be set correctly, - * or the entry itself will be deleted. - */ - RT_LOCK_SPIN(rt); - ln->ln_state = ND6_LLINFO_INCOMPLETE; + if (dr) { + NDDR_REMREF(dr); + /* + * Unreachablity of a router might affect the default + * router selection and on-link detection of advertised + * prefixes. + */ - /* - * Since defrouter_select() does not affect the - * on-link determination and MIP6 needs the check - * before the default router selection, we perform - * the check now. - */ - RT_UNLOCK(rt); - pfxlist_onlink_check(); + /* + * Temporarily fake the state to choose a new default + * router and to perform on-link determination of + * prefixes correctly. + * Below the state will be set correctly, + * or the entry itself will be deleted. + */ + RT_LOCK_SPIN(rt); + ND6_CACHE_STATE_TRANSITION(ln, ND6_LLINFO_INCOMPLETE); - /* - * refresh default router list - */ - defrouter_select(rt->rt_ifp); - } - RT_LOCK_ASSERT_NOTHELD(rt); - } else { + /* + * Since defrouter_select() does not affect the + * on-link determination and MIP6 needs the check + * before the default router selection, we perform + * the check now. 
+ */ RT_UNLOCK(rt); - } + pfxlist_onlink_check(); + /* + * refresh default router list + */ + defrouter_select(rt->rt_ifp); + } + RT_LOCK_ASSERT_NOTHELD(rt); lck_mtx_unlock(nd6_mutex); /* * Detach the route from the routing tree and the list of neighbor @@ -2022,7 +2130,9 @@ nd6_rtrequest(int req, struct rtentry *rt, struct sockaddr *sa) struct ifaddr *ifa; uint64_t timenow; char buf[MAX_IPv6_STR_LEN]; + struct nd_ifinfo *ndi = ND_IFINFO(rt->rt_ifp); + VERIFY((NULL != ndi) && (TRUE == ndi->initialized)); VERIFY(nd6_init_done); lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_OWNED); RT_LOCK_ASSERT_HELD(rt); @@ -2187,20 +2297,26 @@ nd6_rtrequest(int req, struct rtentry *rt, struct sockaddr *sa) rt->rt_flags |= RTF_LLINFO; ln->ln_rt = rt; /* this is required for "ndp" command. - shin */ - if (req == RTM_ADD) { + /* + * For interface's that do not perform NUD + * neighbor cache entries must always be marked + * reachable with no expiry + */ + if ((req == RTM_ADD) || + !(ndi->flags & ND6_IFF_PERFORMNUD)) { /* * gate should have some valid AF_LINK entry, * and ln->ln_expire should have some lifetime * which is specified by ndp command. */ - ln->ln_state = ND6_LLINFO_REACHABLE; + ND6_CACHE_STATE_TRANSITION(ln, ND6_LLINFO_REACHABLE); + ln_setexpire(ln, 0); } else { /* * When req == RTM_RESOLVE, rt is created and * initialized in rtrequest(), so rt_expire is 0. */ - ln->ln_state = ND6_LLINFO_NOSTATE; - + ND6_CACHE_STATE_TRANSITION(ln, ND6_LLINFO_NOSTATE); /* In case we're called before 1.0 sec. has elapsed */ ln_setexpire(ln, (ifp->if_eflags & IFEF_IPV6_ND6ALT) ? 
0 : MAX(timenow, 1)); @@ -2236,9 +2352,9 @@ nd6_rtrequest(int req, struct rtentry *rt, struct sockaddr *sa) continue; } if (ln_end->ln_state > ND6_LLINFO_INCOMPLETE) - ln_end->ln_state = ND6_LLINFO_STALE; + ND6_CACHE_STATE_TRANSITION(ln_end, ND6_LLINFO_STALE); else - ln_end->ln_state = ND6_LLINFO_PURGE; + ND6_CACHE_STATE_TRANSITION(ln_end, ND6_LLINFO_PURGE); ln_setexpire(ln_end, timenow); RT_UNLOCK(rt_end); } @@ -2253,7 +2369,7 @@ nd6_rtrequest(int req, struct rtentry *rt, struct sockaddr *sa) if (ifa != NULL) { caddr_t macp = nd6_ifptomac(ifp); ln_setexpire(ln, 0); - ln->ln_state = ND6_LLINFO_REACHABLE; + ND6_CACHE_STATE_TRANSITION(ln, ND6_LLINFO_REACHABLE); if (macp != NULL) { Bcopy(macp, LLADDR(SDL(gate)), ifp->if_addrlen); SDL(gate)->sdl_alen = ifp->if_addrlen; @@ -2298,7 +2414,7 @@ nd6_rtrequest(int req, struct rtentry *rt, struct sockaddr *sa) IFA_REMREF(ifa); } else if (rt->rt_flags & RTF_ANNOUNCE) { ln_setexpire(ln, 0); - ln->ln_state = ND6_LLINFO_REACHABLE; + ND6_CACHE_STATE_TRANSITION(ln, ND6_LLINFO_REACHABLE); /* join solicited node multicast for proxy ND */ if (ifp->if_flags & IFF_MULTICAST) { @@ -2385,6 +2501,7 @@ nd6_siocgdrlst(void *data, int data_is_64) dr = TAILQ_FIRST(&nd_defrouter); + /* XXX Handle mapped defrouter entries */ /* For 64-bit process */ if (data_is_64) { struct in6_drlist_64 *drl_64; @@ -2467,6 +2584,7 @@ nd6_siocgprlst(void *data, int data_is_64) pr = nd_prefix.lh_first; + /* XXX Handle mapped defrouter entries */ /* For 64-bit process */ if (data_is_64) { struct in6_prlist_64 *prl_64; @@ -2765,7 +2883,9 @@ nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp) case SIOCSRTRFLUSH_IN6: { /* struct in6_ifreq */ /* flush all the default routers */ struct nd_defrouter *next; + struct nd_drhead nd_defrouter_tmp; + TAILQ_INIT(&nd_defrouter_tmp); lck_mtx_lock(nd6_mutex); if ((dr = TAILQ_FIRST(&nd_defrouter)) != NULL) { /* @@ -2774,12 +2894,44 @@ nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp) */ for (dr = TAILQ_NEXT(dr, dr_entry); 
dr; dr = next) { next = TAILQ_NEXT(dr, dr_entry); - if (ifp == lo_ifp || dr->ifp == ifp) - defrtrlist_del(dr); + if (ifp == lo_ifp || dr->ifp == ifp) { + /* + * Remove the entry from default router list + * and add it to the temp list. + * nd_defrouter_tmp will be a local temporary + * list as no one else can get the same + * removed entry once it is removed from default + * router list. + * Remove the reference after calling defrtrlist_de + */ + TAILQ_REMOVE(&nd_defrouter, dr, dr_entry); + TAILQ_INSERT_TAIL(&nd_defrouter_tmp, dr, dr_entry); + } } + + dr = TAILQ_FIRST(&nd_defrouter); if (ifp == lo_ifp || - TAILQ_FIRST(&nd_defrouter)->ifp == ifp) - defrtrlist_del(TAILQ_FIRST(&nd_defrouter)); + dr->ifp == ifp) { + TAILQ_REMOVE(&nd_defrouter, dr, dr_entry); + TAILQ_INSERT_TAIL(&nd_defrouter_tmp, dr, dr_entry); + } + } + + /* + * Keep the following separate from the above iteration of + * nd_defrouter because it's not safe to call + * defrtrlist_del while iterating global default + * router list. Global list has to be traversed + * while holding nd6_mutex throughout. + * + * The following call to defrtrlist_del should be + * safe as we are iterating a local list of + * default routers. 
+ */ + TAILQ_FOREACH_SAFE(dr, &nd_defrouter_tmp, dr_entry, next) { + TAILQ_REMOVE(&nd_defrouter_tmp, dr, dr_entry); + defrtrlist_del(dr); + NDDR_REMREF(dr); /* remove list reference */ } lck_mtx_unlock(nd6_mutex); break; @@ -2896,6 +3048,35 @@ nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp) return (error); /* NOTREACHED */ } + case SIOCGIFCGAPREP_IN6: + case SIOCSIFCGAPREP_IN6: + { + struct in6_cgareq *p_cgareq = + (struct in6_cgareq *)(void *)data; + struct nd_ifinfo *ndi = ND_IFINFO(ifp); + + struct in6_cga_modifier *req_cga_mod = + &(p_cgareq->cgar_cgaprep.cga_modifier); + struct in6_cga_modifier *ndi_cga_mod = NULL; + + if ((NULL == ndi) || !ndi->initialized) { + error = EINVAL; + break; + } + + lck_mtx_lock(&ndi->lock); + ndi_cga_mod = &(ndi->local_cga_modifier); + + if (cmd == SIOCSIFCGAPREP_IN6) { + bcopy(req_cga_mod, ndi_cga_mod, sizeof(*ndi_cga_mod)); + ndi->cga_initialized = TRUE; + } else + bcopy(ndi_cga_mod, req_cga_mod, sizeof(*req_cga_mod)); + + lck_mtx_unlock(&ndi->lock); + return (error); + /* NOTREACHED */ + } } return (error); } @@ -2919,6 +3100,7 @@ nd6_cache_lladdr(struct ifnet *ifp, struct in6_addr *from, char *lladdr, int newstate = 0; uint64_t timenow; boolean_t sched_timeout = FALSE; + struct nd_ifinfo *ndi = NULL; if (ifp == NULL) panic("ifp == NULL in nd6_cache_lladdr"); @@ -3017,13 +3199,27 @@ nd6_cache_lladdr(struct ifnet *ifp, struct in6_addr *from, char *lladdr, newstate = ND6_LLINFO_STALE; } + /* + * For interface's that do not perform NUD + * neighbor cache entres must always be marked + * reachable with no expiry + */ + ndi = ND_IFINFO(ifp); + VERIFY((NULL != ndi) && (TRUE == ndi->initialized)); + + if (ndi && !(ndi->flags & ND6_IFF_PERFORMNUD)) { + newstate = ND6_LLINFO_REACHABLE; + ln_setexpire(ln, 0); + } + if (do_update) { /* * Update the state of the neighbor cache. 
*/ - ln->ln_state = newstate; + ND6_CACHE_STATE_TRANSITION(ln, newstate); - if (ln->ln_state == ND6_LLINFO_STALE) { + if ((ln->ln_state == ND6_LLINFO_STALE) || + (ln->ln_state == ND6_LLINFO_REACHABLE)) { struct mbuf *m = ln->ln_hold; /* * XXX: since nd6_output() below will cause @@ -3031,9 +3227,10 @@ nd6_cache_lladdr(struct ifnet *ifp, struct in6_addr *from, char *lladdr, * we must set the timer now, although it is actually * meaningless. */ - ln_setexpire(ln, timenow + nd6_gctimer); - ln->ln_hold = NULL; + if (ln->ln_state == ND6_LLINFO_STALE) + ln_setexpire(ln, timenow + nd6_gctimer); + ln->ln_hold = NULL; if (m != NULL) { struct sockaddr_in6 sin6; @@ -3128,8 +3325,7 @@ nd6_cache_lladdr(struct ifnet *ifp, struct in6_addr *from, char *lladdr, * Note: Perform default router selection even when we are a router, * if Scoped Routing is enabled. */ - if (do_update && ln->ln_router && - (ip6_doscopedroute || !ip6_forwarding)) { + if (do_update && ln->ln_router) { RT_REMREF_LOCKED(rt); RT_UNLOCK(rt); lck_mtx_lock(nd6_mutex); @@ -3476,7 +3672,7 @@ nd6_output_list(struct ifnet *ifp, struct ifnet *origifp, struct mbuf *m0, /* We don't have to do link-layer address resolution on a p2p link. */ if ((ifp->if_flags & IFF_POINTOPOINT) != 0 && ln->ln_state < ND6_LLINFO_REACHABLE) { - ln->ln_state = ND6_LLINFO_STALE; + ND6_CACHE_STATE_TRANSITION(ln, ND6_LLINFO_STALE); ln_setexpire(ln, timenow + nd6_gctimer); } @@ -3489,7 +3685,7 @@ nd6_output_list(struct ifnet *ifp, struct ifnet *origifp, struct mbuf *m0, */ if (ln->ln_state == ND6_LLINFO_STALE) { ln->ln_asked = 0; - ln->ln_state = ND6_LLINFO_DELAY; + ND6_CACHE_STATE_TRANSITION(ln, ND6_LLINFO_DELAY); ln_setexpire(ln, timenow + nd6_delay); /* N.B.: we will re-arm the timer below. */ _CASSERT(ND6_LLINFO_DELAY > ND6_LLINFO_INCOMPLETE); @@ -3548,7 +3744,7 @@ nd6_output_list(struct ifnet *ifp, struct ifnet *origifp, struct mbuf *m0, * an NS below. 
*/ if (ln->ln_state == ND6_LLINFO_NOSTATE) - ln->ln_state = ND6_LLINFO_INCOMPLETE; + ND6_CACHE_STATE_TRANSITION(ln, ND6_LLINFO_INCOMPLETE); if (ln->ln_hold) m_freem_list(ln->ln_hold); ln->ln_hold = m0; @@ -3566,13 +3762,13 @@ nd6_output_list(struct ifnet *ifp, struct ifnet *origifp, struct mbuf *m0, nd6_prproxy_ns_output(ifp, origifp, NULL, &dst->sin6_addr, ln); else - nd6_ns_output(ifp, NULL, &dst->sin6_addr, ln, 0); + nd6_ns_output(ifp, NULL, &dst->sin6_addr, ln, NULL); lck_mtx_lock(rnh_lock); nd6_sched_timeout(NULL, NULL); lck_mtx_unlock(rnh_lock); } else { if(ln->ln_state == ND6_LLINFO_INCOMPLETE) { - ln->ln_expire = timenow; + ln_setexpire(ln, timenow); } RT_UNLOCK(rt); } @@ -3801,6 +3997,9 @@ nd6_lookup_ipv6(ifnet_t ifp, const struct sockaddr_in6 *ip6_dest, struct sockaddr_dl *sdl = NULL; size_t copy_len; + if (ifp == NULL || ip6_dest == NULL) + return (EINVAL); + if (ip6_dest->sin6_family != AF_INET6) return (EAFNOSUPPORT); @@ -3820,7 +4019,9 @@ nd6_lookup_ipv6(ifnet_t ifp, const struct sockaddr_in6 *ip6_dest, RT_LOCK_ASSERT_HELD(route); } - if ((packet->m_flags & M_MCAST) != 0) { + if ((packet != NULL && (packet->m_flags & M_MCAST) != 0) || + ((ifp->if_flags & IFF_MULTICAST) && + IN6_IS_ADDR_MULTICAST(&ip6_dest->sin6_addr))) { if (route != NULL) RT_UNLOCK(route); result = dlil_resolve_multi(ifp, @@ -3829,6 +4030,21 @@ nd6_lookup_ipv6(ifnet_t ifp, const struct sockaddr_in6 *ip6_dest, if (route != NULL) RT_LOCK(route); goto release; + } else if (route == NULL) { + /* + * rdar://24596652 + * For unicast, lookup existing ND6 entries but + * do not trigger a resolution + */ + lck_mtx_lock(rnh_lock); + route = rt_lookup(TRUE, + __DECONST(struct sockaddr *, ip6_dest), NULL, + rt_tables[AF_INET6], ifp->if_index); + lck_mtx_unlock(rnh_lock); + + if (route != NULL) { + RT_LOCK(route); + } } if (route == NULL) { @@ -3874,6 +4090,82 @@ nd6_lookup_ipv6(ifnet_t ifp, const struct sockaddr_in6 *ip6_dest, return (result); } +#if (DEVELOPMENT || DEBUG) + +static int 
sysctl_nd6_lookup_ipv6 SYSCTL_HANDLER_ARGS; +SYSCTL_PROC(_net_inet6_icmp6, OID_AUTO, nd6_lookup_ipv6, + CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, + sysctl_nd6_lookup_ipv6, "S", ""); + +int +sysctl_nd6_lookup_ipv6 SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + int error = 0; + struct nd6_lookup_ipv6_args nd6_lookup_ipv6_args; + ifnet_t ifp = NULL; + + /* + * Only root can lookup MAC addresses + */ + error = proc_suser(current_proc()); + if (error != 0) { + printf("%s: proc_suser() error %d\n", + __func__, error); + goto done; + } + if (req->oldptr == USER_ADDR_NULL) { + req->oldidx = sizeof(struct nd6_lookup_ipv6_args); + } + if (req->newptr == USER_ADDR_NULL) { + goto done; + } + if (req->oldlen != sizeof(struct nd6_lookup_ipv6_args) || + req->newlen != sizeof(struct nd6_lookup_ipv6_args)) { + error = EINVAL; + printf("%s: bad req, error %d\n", + __func__, error); + goto done; + } + error = SYSCTL_IN(req, &nd6_lookup_ipv6_args, + sizeof(struct nd6_lookup_ipv6_args)); + if (error != 0) { + printf("%s: SYSCTL_IN() error %d\n", + __func__, error); + goto done; + } + /* Make sure to terminate the string */ + nd6_lookup_ipv6_args.ifname[IFNAMSIZ - 1] = 0; + + error = ifnet_find_by_name(nd6_lookup_ipv6_args.ifname, &ifp); + if (error != 0) { + printf("%s: ifnet_find_by_name() error %d\n", + __func__, error); + goto done; + } + + error = nd6_lookup_ipv6(ifp, &nd6_lookup_ipv6_args.ip6_dest, + &nd6_lookup_ipv6_args.ll_dest_._sdl, + nd6_lookup_ipv6_args.ll_dest_len, NULL, NULL); + if (error != 0) { + printf("%s: nd6_lookup_ipv6() error %d\n", + __func__, error); + goto done; + } + + error = SYSCTL_OUT(req, &nd6_lookup_ipv6_args, + sizeof(struct nd6_lookup_ipv6_args)); + if (error != 0) { + printf("%s: SYSCTL_OUT() error %d\n", + __func__, error); + goto done; + } +done: + return (error); +} + +#endif /* (DEVELOPEMENT || DEBUG) */ + int nd6_setifinfo(struct ifnet *ifp, u_int32_t before, u_int32_t after) { @@ -3937,6 +4229,7 @@ nd6_sysctl_drlist 
SYSCTL_HANDLER_ARGS if (req->newptr != USER_ADDR_NULL) return (EPERM); + /* XXX Handle mapped defrouter entries */ lck_mtx_lock(nd6_mutex); if (proc_is64bit(req->p)) { struct in6_defrouter_64 d; @@ -3954,7 +4247,6 @@ nd6_sysctl_drlist SYSCTL_HANDLER_ARGS &dr->rtaddr, pbuf, sizeof (pbuf))); d.flags = dr->flags; d.stateflags = dr->stateflags; - d.stateflags &= ~NDDRF_PROCESSED; d.rtlifetime = dr->rtlifetime; d.expire = nddr_getexpire(dr); d.if_index = dr->ifp->if_index; @@ -3978,7 +4270,6 @@ nd6_sysctl_drlist SYSCTL_HANDLER_ARGS &dr->rtaddr, pbuf, sizeof (pbuf))); d.flags = dr->flags; d.stateflags = dr->stateflags; - d.stateflags &= ~NDDRF_PROCESSED; d.rtlifetime = dr->rtlifetime; d.expire = nddr_getexpire(dr); d.if_index = dr->ifp->if_index; @@ -4008,6 +4299,7 @@ nd6_sysctl_prlist SYSCTL_HANDLER_ARGS s6.sin6_family = AF_INET6; s6.sin6_len = sizeof (s6); + /* XXX Handle mapped defrouter entries */ lck_mtx_lock(nd6_mutex); if (proc_is64bit(req->p)) { struct in6_prefix_64 p; @@ -4107,3 +4399,87 @@ nd6_sysctl_prlist SYSCTL_HANDLER_ARGS return (error); } + +void +in6_ifaddr_set_dadprogress(struct in6_ifaddr *ia) +{ + struct ifnet* ifp = ia->ia_ifp; + uint32_t flags = IN6_IFF_TENTATIVE; + uint32_t optdad = nd6_optimistic_dad; + struct nd_ifinfo *ndi = NULL; + + ndi = ND_IFINFO(ifp); + VERIFY((NULL != ndi) && (TRUE == ndi->initialized)); + if (!(ndi->flags & ND6_IFF_DAD)) + return; + + if (optdad) { + if ((ifp->if_eflags & IFEF_IPV6_ROUTER) != 0) { + optdad = 0; + } else { + lck_mtx_lock(&ndi->lock); + if ((ndi->flags & ND6_IFF_REPLICATED) != 0) { + optdad = 0; + } + lck_mtx_unlock(&ndi->lock); + } + } + + if (optdad) { + if ((optdad & ND6_OPTIMISTIC_DAD_LINKLOCAL) && + IN6_IS_ADDR_LINKLOCAL(&ia->ia_addr.sin6_addr)) + flags = IN6_IFF_OPTIMISTIC; + else if ((optdad & ND6_OPTIMISTIC_DAD_AUTOCONF) && + (ia->ia6_flags & IN6_IFF_AUTOCONF)) { + if (ia->ia6_flags & IN6_IFF_TEMPORARY) { + if (optdad & ND6_OPTIMISTIC_DAD_TEMPORARY) + flags = IN6_IFF_OPTIMISTIC; + } else if 
(ia->ia6_flags & IN6_IFF_SECURED) { + if (optdad & ND6_OPTIMISTIC_DAD_SECURED) + flags = IN6_IFF_OPTIMISTIC; + } else { + /* + * Keeping the behavior for temp and CGA + * SLAAC addresses to have a knob for optimistic + * DAD. + * Other than that if ND6_OPTIMISTIC_DAD_AUTOCONF + * is set, we should default to optimistic + * DAD. + * For now this means SLAAC addresses with interface + * identifier derived from modified EUI-64 bit + * identifiers. + */ + flags = IN6_IFF_OPTIMISTIC; + } + } else if ((optdad & ND6_OPTIMISTIC_DAD_DYNAMIC) && + (ia->ia6_flags & IN6_IFF_DYNAMIC)) { + if (ia->ia6_flags & IN6_IFF_TEMPORARY) { + if (optdad & ND6_OPTIMISTIC_DAD_TEMPORARY) + flags = IN6_IFF_OPTIMISTIC; + } else { + flags = IN6_IFF_OPTIMISTIC; + } + } else if ((optdad & ND6_OPTIMISTIC_DAD_MANUAL) && + (ia->ia6_flags & IN6_IFF_OPTIMISTIC)) { + /* + * rdar://17483438 + * Bypass tentative for address assignments + * not covered above (e.g. manual) upon request + */ + if (!IN6_IS_ADDR_LINKLOCAL(&ia->ia_addr.sin6_addr) && + !(ia->ia6_flags & IN6_IFF_AUTOCONF) && + !(ia->ia6_flags & IN6_IFF_DYNAMIC)) + flags = IN6_IFF_OPTIMISTIC; + } + } + + ia->ia6_flags &= ~(IN6_IFF_DUPLICATED | IN6_IFF_DADPROGRESS); + ia->ia6_flags |= flags; + + nd6log2((LOG_DEBUG, "%s - %s ifp %s ia6_flags 0x%x\n", + __func__, + ip6_sprintf(&ia->ia_addr.sin6_addr), + if_name(ia->ia_ifp), + ia->ia6_flags)); +} + diff --git a/bsd/netinet6/nd6.h b/bsd/netinet6/nd6.h index 08b52d26a..bd71a5a9b 100644 --- a/bsd/netinet6/nd6.h +++ b/bsd/netinet6/nd6.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2015 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -57,6 +57,7 @@ #ifndef _NETINET6_ND6_H_ #define _NETINET6_ND6_H_ #include +#include /* see net/route.h, or net/if_inarp.h */ #ifndef RTF_ANNOUNCE @@ -114,6 +115,25 @@ struct llinfo_nd6 { #define ND6_LLINFO_PROBE 4 #ifdef BSD_KERNEL_PRIVATE + +#define ND6_CACHE_STATE_TRANSITION(ln, nstate) do {\ + if (nd6_debug >= 1) {\ + nd6log((LOG_INFO,\ + "[%s:%d]: NDP cache entry changed from %s -> %s",\ + __FILE__,\ + __LINE__,\ + ndcache_state2str((ln)->ln_state),\ + ndcache_state2str(nstate)));\ + if ((ln)->ln_rt)\ + nd6log((LOG_INFO,\ + " for address: %s.\n",\ + ip6_sprintf(&SIN6(rt_key((ln)->ln_rt))->sin6_addr)));\ + else\ + nd6log((LOG_INFO, "\n"));\ + }\ + (ln)->ln_state = nstate;\ +} while(0) + #define ND6_IS_LLINFO_PROBREACH(n) ((n)->ln_state > ND6_LLINFO_INCOMPLETE) #define ND6_LLINFO_PERMANENT(n) \ (((n)->ln_expire == 0) && ((n)->ln_state > ND6_LLINFO_INCOMPLETE)) @@ -177,6 +197,7 @@ struct nd_ifinfo_compat { #define ND6_IFF_INSECURE 0x80 #endif #define ND6_IFF_REPLICATED 0x100 /* sleep proxy registered */ +#define ND6_IFF_DAD 0x200 /* Perform DAD on the interface */ struct in6_nbrinfo { char ifname[IFNAMSIZ]; /* if name, e.g. 
"en0" */ @@ -249,9 +270,7 @@ struct in6_drlist_64 { #define NDDRF_INSTALLED 0x1 /* installed in the routing table */ #define NDDRF_IFSCOPE 0x2 /* installed as a scoped route */ #define NDDRF_STATIC 0x4 /* for internal use only */ -#ifdef BSD_KERNEL_PRIVATE -#define NDDRF_PROCESSED 0x10 -#endif +#define NDDRF_MAPPED 0x8 /* Default router addr is mapped to a different one for routing */ struct in6_defrouter { struct sockaddr_in6 rtaddr; @@ -506,9 +525,9 @@ struct nd_defrouter { u_char flags; /* flags on RA message */ u_char stateflags; u_short rtlifetime; - unsigned int genid; int err; struct ifnet *ifp; + struct in6_addr rtaddr_mapped; /* Mapped gateway address for routing */ void (*nddr_trace)(struct nd_defrouter *, int); /* trace callback fn */ }; @@ -649,9 +668,6 @@ struct inet6_ndpr_msghdr { #define prm_rrf_decrvalid prm_flags.prf_rr.decrvalid #define prm_rrf_decrprefd prm_flags.prf_rr.decrprefd -#define ifpr2ndpr(ifpr) ((struct nd_prefix *)(ifpr)) -#define ndpr2ifpr(ndpr) ((struct ifprefix *)(ndpr)) - struct nd_pfxrouter { LIST_ENTRY(nd_pfxrouter) pfr_entry; #define pfr_next pfr_entry.le_next @@ -675,14 +691,6 @@ struct kev_nd6_ndalive { struct net_event_data link_data; }; -/* ND6 kernel event subclass value */ -#define KEV_ND6_SUBCLASS 7 - -/* ND6 kernel event action type */ -#define KEV_ND6_RA 1 -#define KEV_ND6_NDFAILURE 2 /* IPv6 neighbor cache entry expiry */ -#define KEV_ND6_NDALIVE 3 /* IPv6 neighbor reachable */ - /* ND6 RA L2 source address length */ #define ND6_ROUTER_LL_SIZE 64 @@ -714,6 +722,18 @@ struct kev_nd6_ra_data { struct nd6_ra_prefix prefix; u_int32_t pad; }; + +struct nd6_lookup_ipv6_args { + char ifname[IFNAMSIZ]; + struct sockaddr_in6 ip6_dest; + u_int32_t ll_dest_len; + union { + char buffer[256]; + struct sockaddr_dl _sdl; + } ll_dest_; +}; +#define ll_dest_sdl ll_dest_._sdl + #endif /* PRIVATE */ #if defined(BSD_KERNEL_PRIVATE) @@ -753,7 +773,7 @@ extern u_int32_t ip6_temp_valid_lifetime; /* seconds */ extern int 
ip6_temp_regen_advance; /* seconds */ union nd_opts { - struct nd_opt_hdr *nd_opt_array[8]; /* max = target address list */ + struct nd_opt_hdr *nd_opt_array[16]; /* max = target address list */ struct { struct nd_opt_hdr *zero; struct nd_opt_hdr *src_lladdr; @@ -761,6 +781,16 @@ union nd_opts { struct nd_opt_prefix_info *pi_beg; /* multiple opts, start */ struct nd_opt_rd_hdr *rh; struct nd_opt_mtu *mtu; + struct nd_opt_hdr *__res6; + struct nd_opt_hdr *__res7; + struct nd_opt_hdr *__res8; + struct nd_opt_hdr *__res9; + struct nd_opt_hdr *__res10; + struct nd_opt_hdr *__res11; + struct nd_opt_hdr *__res12; + struct nd_opt_hdr *__res13; + struct nd_opt_nonce *nonce; + struct nd_opt_hdr *__res15; struct nd_opt_hdr *search; /* multiple opts */ struct nd_opt_hdr *last; /* multiple opts */ int done; @@ -773,6 +803,7 @@ union nd_opts { #define nd_opts_pi_end nd_opt_each.pi_end #define nd_opts_rh nd_opt_each.rh #define nd_opts_mtu nd_opt_each.mtu +#define nd_opts_nonce nd_opt_each.nonce #define nd_opts_search nd_opt_each.search #define nd_opts_last nd_opt_each.last #define nd_opts_done nd_opt_each.done @@ -810,6 +841,7 @@ extern void nd6_drain(void *); extern void nd6_post_msg(u_int32_t, struct nd_prefix_list *, u_int32_t, u_int32_t, char *, u_int32_t); extern int nd6_setifinfo(struct ifnet *, u_int32_t, u_int32_t); +extern const char *ndcache_state2str(short); extern void ln_setexpire(struct llinfo_nd6 *, uint64_t); /* nd6_nbr.c */ @@ -819,7 +851,7 @@ extern void nd6_na_output(struct ifnet *, const struct in6_addr *, const struct in6_addr *, u_int32_t, int, struct sockaddr *); extern void nd6_ns_input(struct mbuf *, int, int); extern void nd6_ns_output(struct ifnet *, const struct in6_addr *, - const struct in6_addr *, struct llinfo_nd6 *, int); + const struct in6_addr *, struct llinfo_nd6 *, uint8_t *); extern caddr_t nd6_ifptomac(struct ifnet *); extern void nd6_dad_start(struct ifaddr *, int *); extern void nd6_dad_stop(struct ifaddr *); @@ -880,12 +912,13 @@ extern 
boolean_t nd6_prproxy_isours(struct mbuf *, struct ip6_hdr *, extern void nd6_prproxy_ns_output(struct ifnet *, struct ifnet *, struct in6_addr *, struct in6_addr *, struct llinfo_nd6 *); extern void nd6_prproxy_ns_input(struct ifnet *, struct in6_addr *, - char *, int, struct in6_addr *, struct in6_addr *); + char *, int, struct in6_addr *, struct in6_addr *, uint8_t *nonce); extern void nd6_prproxy_na_input(struct ifnet *, struct in6_addr *, struct in6_addr *, struct in6_addr *, int); extern void nd6_prproxy_sols_reap(struct nd_prefix *); extern void nd6_prproxy_sols_prune(struct nd_prefix *, u_int32_t); extern int nd6_if_disable(struct ifnet *, boolean_t); +void in6_ifaddr_set_dadprogress(struct in6_ifaddr *ia); #endif /* BSD_KERNEL_PRIVATE */ #ifdef KERNEL diff --git a/bsd/netinet6/nd6_nbr.c b/bsd/netinet6/nd6_nbr.c index 64c4720e3..96eb73444 100644 --- a/bsd/netinet6/nd6_nbr.c +++ b/bsd/netinet6/nd6_nbr.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2000-2015 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
* Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ @@ -69,6 +69,7 @@ #include #include #include +#include #include #include @@ -79,6 +80,7 @@ #include #include #include +#include #include #include @@ -98,11 +100,11 @@ #endif struct dadq; -static struct dadq *nd6_dad_find(struct ifaddr *); +static struct dadq *nd6_dad_find(struct ifaddr *, struct nd_opt_nonce *); void nd6_dad_stoptimer(struct ifaddr *); static void nd6_dad_timer(struct ifaddr *); static void nd6_dad_ns_output(struct dadq *, struct ifaddr *); -static void nd6_dad_ns_input(struct mbuf *, struct ifaddr *, char *, int); +static void nd6_dad_ns_input(struct ifaddr *, char *, int, struct nd_opt_nonce *); static struct mbuf *nd6_dad_na_input(struct mbuf *, struct ifnet *, struct in6_addr *, caddr_t, int); static void dad_addref(struct dadq *, int); @@ -150,16 +152,21 @@ static struct zone *dad_zone; /* zone for dadq */ extern lck_mtx_t *dad6_mutex; extern lck_mtx_t *nd6_mutex; -static int nd6_llreach_base = (LL_BASE_REACHABLE / 1000); /* seconds */ +static int nd6_llreach_base = 30; /* seconds */ static struct sockaddr_in6 hostrtmask; SYSCTL_DECL(_net_inet6_icmp6); - SYSCTL_INT(_net_inet6_icmp6, OID_AUTO, nd6_llreach_base, - CTLFLAG_RW | CTLFLAG_LOCKED, &nd6_llreach_base, LL_BASE_REACHABLE, + CTLFLAG_RW | CTLFLAG_LOCKED, &nd6_llreach_base, 0, "default ND6 link-layer reachability max lifetime (in seconds)"); +int dad_enhanced = 1; +SYSCTL_DECL(_net_inet6_ip6); +SYSCTL_INT(_net_inet6_ip6, OID_AUTO, dad_enhanced, CTLFLAG_RW | CTLFLAG_LOCKED, + &dad_enhanced, 0, + "Enable Enhanced DAD, which adds a random nonce to NS messages for DAD."); + /* * Obtain a link-layer source cache entry for the sender. 
* @@ -421,7 +428,9 @@ nd6_ns_input( */ if (ip6_forwarding && nd6_prproxy) nd6_prproxy_ns_input(ifp, &saddr6, lladdr, - lladdrlen, &daddr6, &taddr6); + lladdrlen, &daddr6, &taddr6, + (ndopts.nd_opts_nonce == NULL) ? NULL : + ndopts.nd_opts_nonce->nd_opt_nonce); goto freeit; } IFA_LOCK(ifa); @@ -480,7 +489,7 @@ nd6_ns_input( oflgclr = 1; } else { if (is_dad_probe) - nd6_dad_ns_input(m, ifa, lladdr, lladdrlen); + nd6_dad_ns_input(ifa, lladdr, lladdrlen, ndopts.nd_opts_nonce); goto freeit; } @@ -553,7 +562,7 @@ nd6_ns_output( const struct in6_addr *daddr6, const struct in6_addr *taddr6, struct llinfo_nd6 *ln, /* for source address determination */ - int dad) /* duplicated address detection */ + uint8_t *nonce) /* duplicated address detection */ { struct mbuf *m; struct ip6_hdr *ip6; @@ -568,7 +577,9 @@ nd6_ns_output( caddr_t mac; struct route_in6 ro; struct ip6_out_args ip6oa = { IFSCOPE_NONE, { 0 }, - IP6OAF_SELECT_SRCIF | IP6OAF_BOUND_SRCADDR, 0 }; + IP6OAF_SELECT_SRCIF | IP6OAF_BOUND_SRCADDR | + IP6OAF_AWDL_UNRESTRICTED | IP6OAF_INTCOPROC_ALLOWED, 0, + SO_TC_UNSPEC, _NET_SERVICE_TYPE_UNSPEC }; u_int32_t rtflags = 0; if ((ifp->if_eflags & IFEF_IPV6_ND6ALT) || IN6_IS_ADDR_MULTICAST(taddr6)) @@ -640,7 +651,7 @@ nd6_ns_output( if (in6_setscope(&ip6->ip6_dst, ifp, NULL) != 0) goto bad; } - if (!dad) { + if (nonce == NULL) { /* * RFC2461 7.2.2: * "If the source address of the packet prompting the @@ -714,8 +725,8 @@ nd6_ns_output( /* * RFC 4429 section 3.2: * When a node has a unicast packet to send - * from an Optimistic Address to a neighbor, - * but does not know the neighbor's link-layer + * from an Optimistic Address to a neighbor, + * but does not know the neighbor's link-layer * address, it MUST NOT perform Address * Resolution. 
*/ @@ -760,7 +771,7 @@ nd6_ns_output( * Multicast NS MUST add one add the option * Unicast NS SHOULD add one add the option */ - if (!dad && (mac = nd6_ifptomac(ifp))) { + if (nonce == NULL && (mac = nd6_ifptomac(ifp))) { int optlen = sizeof(struct nd_opt_hdr) + ifp->if_addrlen; struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)(nd_ns + 1); /* 8 byte alignments... */ @@ -774,13 +785,32 @@ nd6_ns_output( nd_opt->nd_opt_len = optlen >> 3; bcopy(mac, (caddr_t)(nd_opt + 1), ifp->if_addrlen); } + /* + * Add a Nonce option (RFC 3971) to detect looped back NS messages. + * This behavior is documented as Enhanced Duplicate Address + * Detection in draft-ietf-6man-enhanced-dad-13. + * net.inet6.ip6.dad_enhanced=0 disables this. + */ + if (dad_enhanced != 0 && nonce != NULL && !(ifp->if_flags & IFF_POINTOPOINT)) { + int optlen = sizeof(struct nd_opt_hdr) + ND_OPT_NONCE_LEN; + struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)(nd_ns + 1); + /* 8-byte alignment is required. */ + optlen = (optlen + 7) & ~7; + m->m_pkthdr.len += optlen; + m->m_len += optlen; + icmp6len += optlen; + bzero((caddr_t)nd_opt, optlen); + nd_opt->nd_opt_type = ND_OPT_NONCE; + nd_opt->nd_opt_len = optlen >> 3; + bcopy(nonce, (caddr_t)(nd_opt + 1), ND_OPT_NONCE_LEN); + } ip6->ip6_plen = htons((u_short)icmp6len); nd_ns->nd_ns_cksum = 0; nd_ns->nd_ns_cksum = in6_cksum(m, IPPROTO_ICMPV6, sizeof(*ip6), icmp6len); - flags = dad ? IPV6_UNSPECSRC : 0; + flags = nonce ? IPV6_UNSPECSRC : 0; flags |= IPV6_OUTARGS; /* @@ -940,7 +970,7 @@ nd6_na_input(struct mbuf *m, int off, int icmp6len) * another interface (in case we are doing prefix proxying.) 
*/ if ((rt = nd6_lookup(&taddr6, 0, ifp, 0)) == NULL) { - if (!ip6_forwarding || !ip6_doscopedroute || !nd6_prproxy) + if (!ip6_forwarding || !nd6_prproxy) goto freeit; if ((rt = nd6_lookup(&taddr6, 0, NULL, 0)) == NULL) @@ -1000,7 +1030,7 @@ nd6_na_input(struct mbuf *m, int off, int icmp6len) bcopy(lladdr, LLADDR(sdl), ifp->if_addrlen); if (is_solicited) { send_nc_alive_kev = (rt->rt_flags & RTF_ROUTER) ? true : false; - ln->ln_state = ND6_LLINFO_REACHABLE; + ND6_CACHE_STATE_TRANSITION(ln, ND6_LLINFO_REACHABLE); if (ln->ln_expire != 0) { struct nd_ifinfo *ndi = NULL; @@ -1016,7 +1046,7 @@ nd6_na_input(struct mbuf *m, int off, int icmp6len) RT_LOCK(rt); } } else { - ln->ln_state = ND6_LLINFO_STALE; + ND6_CACHE_STATE_TRANSITION(ln, ND6_LLINFO_STALE); ln_setexpire(ln, timenow + nd6_gctimer); } if ((ln->ln_router = is_router) != 0) { @@ -1074,7 +1104,7 @@ nd6_na_input(struct mbuf *m, int off, int icmp6len) * no other updates should be done. */ if (ln->ln_state == ND6_LLINFO_REACHABLE) { - ln->ln_state = ND6_LLINFO_STALE; + ND6_CACHE_STATE_TRANSITION(ln, ND6_LLINFO_STALE); ln_setexpire(ln, timenow + nd6_gctimer); } RT_REMREF_LOCKED(rt); @@ -1097,7 +1127,7 @@ nd6_na_input(struct mbuf *m, int off, int icmp6len) * changed, make it STALE. 
*/ if (is_solicited) { - ln->ln_state = ND6_LLINFO_REACHABLE; + ND6_CACHE_STATE_TRANSITION(ln, ND6_LLINFO_REACHABLE); if (ln->ln_expire != 0) { struct nd_ifinfo *ndi = NULL; @@ -1115,7 +1145,7 @@ nd6_na_input(struct mbuf *m, int off, int icmp6len) } } else { if (lladdr && llchange) { - ln->ln_state = ND6_LLINFO_STALE; + ND6_CACHE_STATE_TRANSITION(ln, ND6_LLINFO_STALE); ln_setexpire(ln, timenow + nd6_gctimer); } } @@ -1138,21 +1168,21 @@ nd6_na_input(struct mbuf *m, int off, int icmp6len) lck_mtx_lock(nd6_mutex); dr = defrouter_lookup(in6, rt_ifp); if (dr) { + TAILQ_REMOVE(&nd_defrouter, dr, dr_entry); defrtrlist_del(dr); + NDDR_REMREF(dr); /* remove list reference */ NDDR_REMREF(dr); lck_mtx_unlock(nd6_mutex); } else { lck_mtx_unlock(nd6_mutex); - if (ip6_doscopedroute || !ip6_forwarding) { - /* - * Even if the neighbor is not in the - * default router list, the neighbor - * may be used as a next hop for some - * destinations (e.g. redirect case). - * So we must call rt6_flush explicitly. - */ - rt6_flush(&ip6->ip6_src, rt_ifp); - } + /* + * Even if the neighbor is not in the + * default router list, the neighbor + * may be used as a next hop for some + * destinations (e.g. redirect case). + * So we must call rt6_flush explicitly. 
+ */ + rt6_flush(&ip6->ip6_src, rt_ifp); } RT_LOCK(rt); } @@ -1177,7 +1207,7 @@ nd6_na_input(struct mbuf *m, int off, int icmp6len) ev_msg.dv[0].data_ptr = &nd6_ndalive; ev_msg.dv[0].data_length = sizeof(nd6_ndalive); - kev_post_msg(&ev_msg); + dlil_post_complete_msg(NULL, &ev_msg); } RT_LOCK_ASSERT_HELD(rt); @@ -1256,7 +1286,9 @@ nd6_na_output( int icmp6len, maxlen, error; struct ifnet *outif = NULL; struct ip6_out_args ip6oa = { IFSCOPE_NONE, { 0 }, - IP6OAF_SELECT_SRCIF | IP6OAF_BOUND_SRCADDR, 0 }; + IP6OAF_SELECT_SRCIF | IP6OAF_BOUND_SRCADDR | + IP6OAF_AWDL_UNRESTRICTED | IP6OAF_INTCOPROC_ALLOWED, 0, + SO_TC_UNSPEC, _NET_SERVICE_TYPE_UNSPEC }; bzero(&ro, sizeof(ro)); @@ -1475,8 +1507,13 @@ struct dadq { int dad_ns_ocount; /* NS sent so far */ int dad_ns_icount; int dad_na_icount; - int dad_nd_ixcount; /* Count of IFDISABLED eligible ND rx'd */ - uint8_t dad_ehsrc[ETHER_ADDR_LEN]; + int dad_ns_lcount; /* looped back NS */ + int dad_loopbackprobe; /* probing state for loopback detection */ + uint8_t dad_lladdr[ETHER_ADDR_LEN]; + uint8_t dad_lladdrlen; +#define ND_OPT_NONCE_LEN32 \ + ((ND_OPT_NONCE_LEN + sizeof(uint32_t) - 1)/sizeof(uint32_t)) + uint32_t dad_nonce[ND_OPT_NONCE_LEN32]; }; static struct dadq_head dadq; @@ -1485,7 +1522,7 @@ void nd6_nbr_init(void) { int i; - + TAILQ_INIT(&dadq); dad_size = sizeof (struct dadq); @@ -1505,23 +1542,43 @@ nd6_nbr_init(void) } static struct dadq * -nd6_dad_find(struct ifaddr *ifa) +nd6_dad_find(struct ifaddr *ifa, struct nd_opt_nonce *nonce) { struct dadq *dp; lck_mtx_lock(dad6_mutex); for (dp = dadq.tqh_first; dp; dp = dp->dad_list.tqe_next) { DAD_LOCK_SPIN(dp); - if (dp->dad_ifa == ifa) { - DAD_ADDREF_LOCKED(dp); + if (dp->dad_ifa != ifa) { DAD_UNLOCK(dp); - lck_mtx_unlock(dad6_mutex); - return (dp); + continue; } + + /* + * Skip if the nonce matches the received one. + * +2 in the length is required because of type and + * length fields are included in a header. 
+ */ + if (nonce != NULL && + nonce->nd_opt_nonce_len == (ND_OPT_NONCE_LEN + 2) / 8 && + memcmp(&nonce->nd_opt_nonce[0], &dp->dad_nonce[0], + ND_OPT_NONCE_LEN) == 0) { + nd6log((LOG_ERR, "%s: a looped back NS message is " + "detected during DAD for %s. Ignoring.\n", + if_name(ifa->ifa_ifp), + ip6_sprintf(IFA_IN6(ifa)))); + dp->dad_ns_lcount++; + ++ip6stat.ip6s_dad_loopcount; + DAD_UNLOCK(dp); + continue; + } + + DAD_ADDREF_LOCKED(dp); DAD_UNLOCK(dp); + break; } lck_mtx_unlock(dad6_mutex); - return (NULL); + return (dp); } void @@ -1577,7 +1634,7 @@ nd6_dad_start( (ifa->ifa_ifp->if_eflags & IFEF_IPV6_ND6ALT)) { return; } - if ((dp = nd6_dad_find(ifa)) != NULL) { + if ((dp = nd6_dad_find(ifa, NULL)) != NULL) { DAD_REMREF(dp); /* DAD already in progress */ return; @@ -1597,9 +1654,10 @@ nd6_dad_start( /* Callee adds one reference for us */ dp = nd6_dad_attach(dp, ifa); - nd6log((LOG_DEBUG, "%s: starting %sDAD for %s\n", + nd6log((LOG_DEBUG, "%s: starting %sDAD %sfor %s\n", if_name(ifa->ifa_ifp), (ia->ia6_flags & IN6_IFF_OPTIMISTIC) ? "optimistic " : "", + (tick_delay == NULL) ? 
"immediately " : "", ip6_sprintf(&ia->ia_addr.sin6_addr))); /* @@ -1644,9 +1702,10 @@ nd6_dad_attach(struct dadq *dp, struct ifaddr *ifa) dp->dad_count = ip6_dad_count; dp->dad_ns_icount = dp->dad_na_icount = 0; dp->dad_ns_ocount = dp->dad_ns_tcount = 0; - dp->dad_nd_ixcount = 0; + dp->dad_ns_lcount = dp->dad_loopbackprobe = 0; VERIFY(!dp->dad_attached); dp->dad_attached = 1; + dp->dad_lladdrlen = 0; DAD_ADDREF_LOCKED(dp); /* for caller */ DAD_ADDREF_LOCKED(dp); /* for dadq_head list */ TAILQ_INSERT_TAIL(&dadq, (struct dadq *)dp, dad_list); @@ -1685,7 +1744,7 @@ nd6_dad_stop(struct ifaddr *ifa) { struct dadq *dp; - dp = nd6_dad_find(ifa); + dp = nd6_dad_find(ifa, NULL); if (!dp) { /* DAD wasn't started yet */ return; @@ -1730,6 +1789,7 @@ nd6_dad_timer(struct ifaddr *ifa) struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa; struct dadq *dp = NULL; struct nd_ifinfo *ndi = NULL; + u_int32_t retrans; /* Sanity check */ if (ia == NULL) { @@ -1743,7 +1803,7 @@ nd6_dad_timer(struct ifaddr *ifa) if_name(ia->ia_ifp), ia->ia6_flags)); - dp = nd6_dad_find(ifa); + dp = nd6_dad_find(ifa, NULL); if (dp == NULL) { log(LOG_ERR, "nd6_dad_timer: DAD structure not found\n"); goto done; @@ -1780,8 +1840,6 @@ nd6_dad_timer(struct ifaddr *ifa) /* Need more checks? */ if (dp->dad_ns_ocount < dp->dad_count) { - u_int32_t retrans; - DAD_UNLOCK(dp); /* * We have more NS to go. Send NS packet for DAD. @@ -1798,36 +1856,49 @@ nd6_dad_timer(struct ifaddr *ifa) * We have transmitted sufficient number of DAD packets. * See what we've got. */ - int duplicate; - boolean_t candisable; - - duplicate = 0; - candisable = dp->dad_nd_ixcount > 0; - - if (dp->dad_na_icount) { - /* - * the check is in nd6_dad_na_input(), - * but just in case - */ - duplicate++; - } - - if (dp->dad_ns_icount) { - /* We've seen NS, means DAD has failed. */ - duplicate++; - } - DAD_UNLOCK(dp); - - if (duplicate) { + if (dp->dad_na_icount > 0 || dp->dad_ns_icount) { + /* We've seen NS or NA, means DAD has failed. 
*/ + DAD_UNLOCK(dp); nd6log((LOG_INFO, "%s: duplicate IPv6 address %s [timer]\n", __func__, ip6_sprintf(&ia->ia_addr.sin6_addr), if_name(ia->ia_ifp))); nd6_dad_duplicated(ifa); /* (*dp) will be freed in nd6_dad_duplicated() */ + } else if (dad_enhanced != 0 && + dp->dad_ns_lcount > 0 && + dp->dad_ns_lcount > dp->dad_loopbackprobe) { + dp->dad_loopbackprobe = dp->dad_ns_lcount; + dp->dad_count = + dp->dad_ns_ocount + dad_maxtry - 1; + DAD_UNLOCK(dp); + ndi = ND_IFINFO(ifa->ifa_ifp); + VERIFY(ndi != NULL && ndi->initialized); + lck_mtx_lock(&ndi->lock); + retrans = ndi->retrans * hz / 1000; + lck_mtx_unlock(&ndi->lock); + + /* + * Sec. 4.1 in RFC 7527 requires transmission of + * additional probes until the loopback condition + * becomes clear when a looped back probe is detected. + */ + nd6log((LOG_INFO, + "%s: a looped back NS message is " + "detected during DAD for %s. " + "Another DAD probe is being sent on interface %s.\n", + __func__, ip6_sprintf(&ia->ia_addr.sin6_addr), + if_name(ia->ia_ifp))); + /* + * Send an NS immediately and increase dad_count by + * dad_maxtry - 1. + */ + nd6_dad_ns_output(dp, ifa); + timeout((void (*)(void *))nd6_dad_timer, (void *)ifa, retrans); + goto done; } else { boolean_t txunsolna; - + DAD_UNLOCK(dp); /* * We are done with DAD. No NA came, no NS came. * No duplicate address found. @@ -1851,8 +1922,18 @@ nd6_dad_timer(struct ifaddr *ifa) if_name(ifa->ifa_ifp), ip6_sprintf(&ia->ia_addr.sin6_addr), txunsolna ?
", tx unsolicited NA with O=1" : ".")); - in6_post_msg(ia->ia_ifp, KEV_INET6_NEW_USER_ADDR, ia, - dp->dad_ehsrc); + + if (dp->dad_ns_lcount > 0) + nd6log((LOG_DEBUG, + "%s: DAD completed while " + "a looped back NS message is detected " + "during DAD for %s on interface %s\n", + __func__, + ip6_sprintf(&ia->ia_addr.sin6_addr), + if_name(ia->ia_ifp))); + + in6_post_msg(ia->ia_ifp, KEV_INET6_NEW_USER_ADDR, ia, + dp->dad_lladdr); nd6_dad_detach(dp, ifa); } } @@ -1868,20 +1949,84 @@ nd6_dad_duplicated(struct ifaddr *ifa) struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa; struct dadq *dp; struct ifnet *ifp = ifa->ifa_ifp; - boolean_t disable; + boolean_t candisable; - dp = nd6_dad_find(ifa); + dp = nd6_dad_find(ifa, NULL); if (dp == NULL) { log(LOG_ERR, "%s: DAD structure not found.\n", __func__); return; } IFA_LOCK(&ia->ia_ifa); DAD_LOCK(dp); - nd6log((LOG_ERR, "%s: NS in/out=%d/%d, NA in=%d, ND x=%d\n", - __func__, dp->dad_ns_icount, dp->dad_ns_ocount, dp->dad_na_icount, - dp->dad_nd_ixcount)); - disable = dp->dad_nd_ixcount > 0; + nd6log((LOG_ERR, "%s: NS in/out/loopback=%d/%d/%d, NA in=%d\n", + __func__, dp->dad_ns_icount, dp->dad_ns_ocount, dp->dad_ns_lcount, + dp->dad_na_icount)); + candisable = FALSE; + + if (IN6_IS_ADDR_LINKLOCAL(&ia->ia_addr.sin6_addr) && + !(ia->ia6_flags & IN6_IFF_SECURED)) { + struct in6_addr in6; + struct ifaddr *llifa = NULL; + struct sockaddr_dl *sdl = NULL; + uint8_t *lladdr = dp->dad_lladdr; + uint8_t lladdrlen = dp->dad_lladdrlen; + + /* + * To avoid over-reaction, we only apply this logic when we are + * very sure that hardware addresses are supposed to be unique.
+ */ + switch (ifp->if_type) { + case IFT_BRIDGE: + case IFT_ETHER: + case IFT_FDDI: + case IFT_ATM: + case IFT_IEEE1394: +#ifdef IFT_IEEE80211 + case IFT_IEEE80211: +#endif + /* + * Check if our hardware address matches the + * link layer information received in the + * NS/NA + */ + llifa = ifp->if_lladdr; + IFA_LOCK(llifa); + sdl = (struct sockaddr_dl *)(void *) + llifa->ifa_addr; + if (lladdrlen == sdl->sdl_alen && + bcmp(lladdr, LLADDR(sdl), lladdrlen) == 0) + candisable = TRUE; + IFA_UNLOCK(llifa); + + in6 = ia->ia_addr.sin6_addr; + if (in6_iid_from_hw(ifp, &in6) != 0) + break; + + /* Refine decision about whether IPv6 can be disabled */ + if (candisable && + !IN6_ARE_ADDR_EQUAL(&ia->ia_addr.sin6_addr, &in6)) { + /* + * Apply this logic only to the embedded MAC + * address form of link-local IPv6 address. + */ + candisable = FALSE; + } else if (lladdr == NULL && + IN6_ARE_ADDR_EQUAL(&ia->ia_addr.sin6_addr, &in6)) { + /* + * We received a NA with no target link-layer + * address option. This means that someone else + * has our address. Mark it as a hardware + * duplicate so we disable IPv6 later on. + */ + candisable = TRUE; + } + break; + default: + break; + } + } DAD_UNLOCK(dp); + ia->ia6_flags &= ~IN6_IFF_DADPROGRESS; ia->ia6_flags |= IN6_IFF_DUPLICATED; IFA_UNLOCK(&ia->ia_ifa); @@ -1897,7 +2042,7 @@ nd6_dad_duplicated(struct ifaddr *ifa) if_name(ifp), ip6_sprintf(&ia->ia_addr.sin6_addr)); IFA_UNLOCK(&ia->ia_ifa); - if (disable) { + if (candisable) { struct nd_ifinfo *ndi = ND_IFINFO(ifp); log(LOG_ERR, "%s: possible hardware address duplication " "detected, disabling IPv6 for interface.\n", if_name(ifp)); @@ -1914,7 +2059,7 @@ nd6_dad_duplicated(struct ifaddr *ifa) * duplicate address will be notified to the user and will * be removed. 
*/ - in6_post_msg(ifp, KEV_INET6_NEW_USER_ADDR, ia, dp->dad_ehsrc); + in6_post_msg(ifp, KEV_INET6_NEW_USER_ADDR, ia, dp->dad_lladdr); nd6_dad_detach(dp, ifa); DAD_REMREF(dp); /* drop our reference */ } @@ -1924,6 +2069,7 @@ nd6_dad_ns_output(struct dadq *dp, struct ifaddr *ifa) { struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa; struct ifnet *ifp = ifa->ifa_ifp; + int i = 0; struct in6_addr taddr6; DAD_LOCK(dp); @@ -1942,94 +2088,69 @@ nd6_dad_ns_output(struct dadq *dp, struct ifaddr *ifa) IFA_LOCK_SPIN(&ia->ia_ifa); taddr6 = ia->ia_addr.sin6_addr; IFA_UNLOCK(&ia->ia_ifa); - nd6_ns_output(ifp, NULL, &taddr6, NULL, 1); + if (dad_enhanced != 0 && !(ifp->if_flags & IFF_POINTOPOINT)) { + for (i = 0; i < ND_OPT_NONCE_LEN32; i++) + dp->dad_nonce[i] = RandomULong(); + /* + * XXXHRS: Note that in the case that + * DupAddrDetectTransmits > 1, multiple NS messages with + * different nonces can be looped back in an unexpected + * order. The current implementation recognizes only + * the latest nonce on the sender side. Practically it + * should work well in almost all cases. 
+ */ + } + nd6_ns_output(ifp, NULL, &taddr6, NULL, + (uint8_t *)&dp->dad_nonce[0]); } +/* + * @brief Called to process DAD NS + * + * @param ifa is the pointer to the interface's address + * @param lladdr is source link layer information + * @param lladdrlen is source's linklayer length + * + * @return void + */ static void -nd6_dad_ns_input(struct mbuf *m, struct ifaddr *ifa, char *lladdr, - int lladdrlen) +nd6_dad_ns_input(struct ifaddr *ifa, char *lladdr, + int lladdrlen, struct nd_opt_nonce *ndopt_nonce) { struct dadq *dp; - struct in6_ifaddr *ia; - boolean_t candisable, dadstarted; - struct ip6aux *ip6a; - VERIFY(ifa != NULL); - candisable = FALSE; - IFA_LOCK(ifa); - ia = (struct in6_ifaddr *) ifa; - if (IN6_IS_ADDR_LINKLOCAL(&ia->ia_addr.sin6_addr)) { - ip6a = ip6_findaux(m); - candisable = TRUE; - if (ip6a && (ip6a->ip6a_flags & IP6A_HASEEN) != 0) { - struct in6_addr in6 = ia->ia_addr.sin6_addr; - - nd6log((LOG_INFO, - "%s: eh_src=%02x:%02x:%02x:%02x:%02x:%02x -> %s\n", - __func__, - ip6a->ip6a_ehsrc[0], ip6a->ip6a_ehsrc[1], - ip6a->ip6a_ehsrc[2], ip6a->ip6a_ehsrc[3], - ip6a->ip6a_ehsrc[4], ip6a->ip6a_ehsrc[5], - if_name(ifa->ifa_ifp))); - - in6.s6_addr8[8] = ip6a->ip6a_ehsrc[0] ^ ND6_EUI64_UBIT; - in6.s6_addr8[9] = ip6a->ip6a_ehsrc[1]; - in6.s6_addr8[10] = ip6a->ip6a_ehsrc[2]; - in6.s6_addr8[11] = 0xff; - in6.s6_addr8[12] = 0xfe; - in6.s6_addr8[13] = ip6a->ip6a_ehsrc[3]; - in6.s6_addr8[14] = ip6a->ip6a_ehsrc[4]; - in6.s6_addr8[15] = ip6a->ip6a_ehsrc[5]; - - if (!IN6_ARE_ADDR_EQUAL(&in6, &ia->ia_addr.sin6_addr)) { - nd6log((LOG_ERR, "%s: DAD NS for %s on %s " - "is from another MAC address.\n", __func__, - ip6_sprintf(&ia->ia_addr.sin6_addr), - if_name(ifa->ifa_ifp))); - candisable = FALSE; - } - } else { - nd6log((LOG_INFO, - "%s: no eh_src for DAD NS %s at %s.\n", __func__, - ip6_sprintf(&ia->ia_addr.sin6_addr), - if_name(ifa->ifa_ifp))); - } - } - IFA_UNLOCK(ifa); - /* If DAD has not yet started, then this DAD NS probe is proof that - * another node has 
started first. Otherwise, it could be a multicast - * loopback, in which case it should be counted and handled later in - * the DAD timer callback. - */ - dadstarted = FALSE; - dp = nd6_dad_find(ifa); - if (dp != NULL) { - DAD_LOCK(dp); - ++dp->dad_ns_icount; - if (candisable) - ++dp->dad_nd_ixcount; - if (dp->dad_ns_ocount > 0) - dadstarted = TRUE; - if (lladdr && lladdrlen >= ETHER_ADDR_LEN) - memcpy(dp->dad_ehsrc, lladdr, ETHER_ADDR_LEN); - DAD_UNLOCK(dp); - DAD_REMREF(dp); - dp = NULL; - } + /* Ignore Nonce option when Enhanced DAD is disabled. */ + if (dad_enhanced == 0) + ndopt_nonce = NULL; - nd6log((LOG_INFO, "%s: dadstarted=%d candisable=%d\n", - __func__, dadstarted, candisable)); + dp = nd6_dad_find(ifa, ndopt_nonce); + if (dp == NULL) + return; - if (!dadstarted) { - nd6log((LOG_INFO, - "%s: duplicate IPv6 address %s [processing NS on %s]\n", - __func__, ip6_sprintf(&ia->ia_addr.sin6_addr), - if_name(ifa->ifa_ifp))); - nd6_dad_duplicated(ifa); + DAD_LOCK(dp); + ++dp->dad_ns_icount; + if (lladdr && lladdrlen >= ETHER_ADDR_LEN) { + memcpy(dp->dad_lladdr, lladdr, ETHER_ADDR_LEN); + dp->dad_lladdrlen = lladdrlen; } + DAD_UNLOCK(dp); + DAD_REMREF(dp); } +/* + * @brief Called to process received NA for DAD + * + * @param m is the pointer to the packet's mbuf + * @param ifp is the pointer to the interface on which packet + * was received. + * @param taddr is pointer to target's IPv6 address + * @param lladdr is target's link layer information + * @param lladdrlen is target's linklayer length + * + * @return NULL if the packet is consumed by DAD processing, else + * pointer to the mbuf.
+ */ static struct mbuf * nd6_dad_na_input(struct mbuf *m, struct ifnet *ifp, struct in6_addr *taddr, caddr_t lladdr, int lladdrlen) @@ -2038,13 +2159,12 @@ nd6_dad_na_input(struct mbuf *m, struct ifnet *ifp, struct in6_addr *taddr, struct in6_ifaddr *ia = NULL; struct dadq *dp = NULL; struct nd_ifinfo *ndi = NULL; - boolean_t candisable, replicated; + boolean_t replicated; ifa = (struct ifaddr *) in6ifa_ifpwithaddr(ifp, taddr); if (ifa == NULL) return m; - candisable = FALSE; replicated = FALSE; /* Get the ND6_IFF_REPLICATED flag. */ @@ -2065,15 +2185,6 @@ nd6_dad_na_input(struct mbuf *m, struct ifnet *ifp, struct in6_addr *taddr, IFA_LOCK(ifa); ia = (struct in6_ifaddr *) ifa; - /* - * If the address is a link-local address formed from an interface - * identifier based on the hardware address which is supposed to be - * uniquely assigned (e.g., EUI-64 for an Ethernet interface), IP - * operation on the interface SHOULD be disabled according to RFC 4862, - * section 5.4.5, but here we decide not to disable if the target - * hardware address is not also ours, which is a transitory possibility - * in the presence of network-resident sleep proxies on the local link. 
- */ if (!(ia->ia6_flags & IN6_IFF_DADPROGRESS)) { IFA_UNLOCK(ifa); nd6log((LOG_INFO, "%s: ignoring duplicate NA on " @@ -2090,7 +2201,7 @@ nd6_dad_na_input(struct mbuf *m, struct ifnet *ifp, struct in6_addr *taddr, if (lladdr != NULL && lladdrlen >= ETHER_ADDR_LEN) { struct ip6aux *ip6a = ip6_findaux(m); if (ip6a && (ip6a->ip6a_flags & IP6A_HASEEN) != 0 && - bcmp(ip6a->ip6a_ehsrc, lladdr, ETHER_ADDR_LEN) != 0) { + bcmp(ip6a->ip6a_ehsrc, lladdr, ETHER_ADDR_LEN) != 0) { IFA_UNLOCK(ifa); nd6log((LOG_ERR, "%s: ignoring duplicate NA on %s " "[eh_src != tgtlladdr]\n", __func__, if_name(ifp))); @@ -2100,68 +2211,7 @@ nd6_dad_na_input(struct mbuf *m, struct ifnet *ifp, struct in6_addr *taddr, IFA_UNLOCK(ifa); - if (IN6_IS_ADDR_LINKLOCAL(&ia->ia_addr.sin6_addr) && - !(ia->ia6_flags & IN6_IFF_SECURED)) { - struct in6_addr in6; - - /* - * To avoid over-reaction, we only apply this logic when we are - * very sure that hardware addresses are supposed to be unique. - */ - switch (ifp->if_type) { - case IFT_BRIDGE: - case IFT_ETHER: - case IFT_FDDI: - case IFT_ATM: - case IFT_IEEE1394: -#ifdef IFT_IEEE80211 - case IFT_IEEE80211: -#endif - /* Check if our hardware address matches the target */ - if (lladdr != NULL && lladdrlen > 0) { - struct ifaddr *llifa; - struct sockaddr_dl *sdl; - - llifa = ifp->if_lladdr; - IFA_LOCK(llifa); - sdl = (struct sockaddr_dl *)(void *) - llifa->ifa_addr; - if (lladdrlen == sdl->sdl_alen && - bcmp(lladdr, LLADDR(sdl), lladdrlen) == 0) - candisable = TRUE; - IFA_UNLOCK(llifa); - } - in6 = ia->ia_addr.sin6_addr; - if (in6_iid_from_hw(ifp, &in6) != 0) - break; - - /* Refine decision about whether IPv6 can be disabled */ - IFA_LOCK(ifa); - if (candisable && - !IN6_ARE_ADDR_EQUAL(&ia->ia_addr.sin6_addr, &in6)) { - /* - * Apply this logic only to the embedded MAC - * address form of link-local IPv6 address. 
- */ - candisable = FALSE; - } else if (lladdr == NULL && - IN6_ARE_ADDR_EQUAL(&ia->ia_addr.sin6_addr, &in6)) { - /* - * We received a NA with no target link-layer - * address option. This means that someone else - * has our address. Mark it as a hardware - * duplicate so we disable IPv6 later on. - */ - candisable = TRUE; - } - IFA_UNLOCK(ifa); - break; - default: - break; - } - } - - dp = nd6_dad_find(ifa); + dp = nd6_dad_find(ifa, NULL); if (dp == NULL) { nd6log((LOG_INFO, "%s: no DAD structure for %s on %s.\n", __func__, ip6_sprintf(taddr), if_name(ifp))); @@ -2169,11 +2219,11 @@ nd6_dad_na_input(struct mbuf *m, struct ifnet *ifp, struct in6_addr *taddr, } DAD_LOCK_SPIN(dp); - if (lladdr != NULL && lladdrlen >= ETHER_ADDR_LEN) - memcpy(dp->dad_ehsrc, lladdr, ETHER_ADDR_LEN); + if (lladdr != NULL && lladdrlen >= ETHER_ADDR_LEN) { + memcpy(dp->dad_lladdr, lladdr, ETHER_ADDR_LEN); + dp->dad_lladdrlen = lladdrlen; + } dp->dad_na_icount++; - if (candisable) - dp->dad_nd_ixcount++; DAD_UNLOCK(dp); DAD_REMREF(dp); @@ -2181,8 +2231,6 @@ nd6_dad_na_input(struct mbuf *m, struct ifnet *ifp, struct in6_addr *taddr, nd6log((LOG_INFO, "%s: duplicate IPv6 address %s [processing NA on %s]\n", __func__, ip6_sprintf(taddr), if_name(ifp))); - nd6_dad_duplicated(ifa); - done: IFA_LOCK_ASSERT_NOTHELD(ifa); IFA_REMREF(ifa); @@ -2335,6 +2383,11 @@ nd6_alt_node_present(struct ifnet *ifp, struct sockaddr_in6 *sin6, struct rtentry *rt; struct llinfo_nd6 *ln; struct if_llreach *lr; + const uint16_t temp_embedded_id = sin6->sin6_addr.s6_addr16[1]; + + if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr) && + (temp_embedded_id == 0)) + sin6->sin6_addr.s6_addr16[1] = htons(ifp->if_index); nd6_cache_lladdr(ifp, &sin6->sin6_addr, LLADDR(sdl), sdl->sdl_alen, ND_NEIGHBOR_ADVERT, 0); @@ -2344,13 +2397,18 @@ nd6_alt_node_present(struct ifnet *ifp, struct sockaddr_in6 *sin6, rt = rtalloc1_scoped_locked((struct sockaddr *)sin6, 1, 0, ifp->if_index); + + /* Restore the address that was passed to us */ + if 
(temp_embedded_id == 0) + sin6->sin6_addr.s6_addr16[1] = 0; + if (rt != NULL) { RT_LOCK(rt); VERIFY(rt->rt_flags & RTF_LLINFO); VERIFY(rt->rt_llinfo); ln = rt->rt_llinfo; - ln->ln_state = ND6_LLINFO_REACHABLE; + ND6_CACHE_STATE_TRANSITION(ln, ND6_LLINFO_REACHABLE); ln_setexpire(ln, 0); lr = ln->ln_llreach; @@ -2382,15 +2440,25 @@ void nd6_alt_node_absent(struct ifnet *ifp, struct sockaddr_in6 *sin6) { struct rtentry *rt; + const uint16_t temp_embedded_id = sin6->sin6_addr.s6_addr16[1]; nd6log((LOG_DEBUG, "%s: host route to %s\n", __func__, ip6_sprintf(&sin6->sin6_addr))); + if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr) && + (temp_embedded_id == 0)) + sin6->sin6_addr.s6_addr16[1] = htons(ifp->if_index); + lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_NOTOWNED); lck_mtx_lock(rnh_lock); rt = rtalloc1_scoped_locked((struct sockaddr *)sin6, 0, 0, ifp->if_index); + + /* Restore the address that was passed to us */ + if (temp_embedded_id == 0) + sin6->sin6_addr.s6_addr16[1] = 0; + if (rt != NULL) { RT_LOCK(rt); diff --git a/bsd/netinet6/nd6_prproxy.c b/bsd/netinet6/nd6_prproxy.c index b0898905c..c85830994 100644 --- a/bsd/netinet6/nd6_prproxy.c +++ b/bsd/netinet6/nd6_prproxy.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011-2013 Apple Inc. All rights reserved. + * Copyright (c) 2011-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -986,7 +986,7 @@ nd6_prproxy_ns_output(struct ifnet *ifp, struct ifnet *exclifp, "on %s\n", if_name(fwd_ifp), ip6_sprintf(taddr), if_name(ifp))); - nd6_ns_output(fwd_ifp, daddr, taddr, NULL, 0); + nd6_ns_output(fwd_ifp, daddr, taddr, NULL, NULL); } else { NDPR_UNLOCK(pr); } @@ -996,7 +996,7 @@ nd6_prproxy_ns_output(struct ifnet *ifp, struct ifnet *exclifp, } VERIFY(SLIST_EMPTY(&ndprl_head)); - nd6_ns_output(ifp, daddr, taddr, ln, 0); + nd6_ns_output(ifp, daddr, taddr, ln, NULL); } /* @@ -1009,7 +1009,8 @@ nd6_prproxy_ns_output(struct ifnet *ifp, struct ifnet *exclifp, */ void nd6_prproxy_ns_input(struct ifnet *ifp, struct in6_addr *saddr, - char *lladdr, int lladdrlen, struct in6_addr *daddr, struct in6_addr *taddr) + char *lladdr, int lladdrlen, struct in6_addr *daddr, + struct in6_addr *taddr, uint8_t *nonce) { SLIST_HEAD(, nd6_prproxy_prelist) ndprl_head; struct nd6_prproxy_prelist *ndprl, *ndprl_tmp; @@ -1119,7 +1120,7 @@ nd6_prproxy_ns_input(struct ifnet *ifp, struct in6_addr *saddr, ip6_sprintf(taddr), if_name(ifp))); nd6_ns_output(fwd_ifp, ndprl->ndprl_sol ? taddr : NULL, - taddr, NULL, !ndprl->ndprl_sol); + taddr, NULL, nonce); } else { NDPR_UNLOCK(pr); } diff --git a/bsd/netinet6/nd6_rtr.c b/bsd/netinet6/nd6_rtr.c index a363d3a92..4dc3b3b3c 100644 --- a/bsd/netinet6/nd6_rtr.c +++ b/bsd/netinet6/nd6_rtr.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2015 Apple Inc. All rights reserved. + * Copyright (c) 2003-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -77,6 +77,7 @@ #include #include +#include #include #include #include @@ -101,7 +102,6 @@ static struct nd_defrouter *defrtrlist_update(struct nd_defrouter *); static struct in6_ifaddr *in6_pfx_newpersistaddr(struct nd_prefix *, int, int *); -static void defrtrlist_sync(struct ifnet *); static struct nd_pfxrouter *pfxrtr_lookup(struct nd_prefix *, struct nd_defrouter *); @@ -130,8 +130,8 @@ static void ndpr_trace(struct nd_prefix *, int); extern int nd6_recalc_reachtm_interval; -static struct ifnet *nd6_defifp; -int nd6_defifindex; +static struct ifnet *nd6_defifp = NULL; +int nd6_defifindex = 0; static unsigned int nd6_defrouter_genid; int ip6_use_tempaddr = 1; /* use temp addr by default for testing now */ @@ -489,6 +489,9 @@ nd6_ra_input( pt <= (struct nd_opt_hdr *)ndopts.nd_opts_pi_end; pt = (struct nd_opt_hdr *)((caddr_t)pt + (pt->nd_opt_len << 3))) { + struct in6_addr pi_mask; + bzero(&pi_mask, sizeof(pi_mask)); + if (pt->nd_opt_type != ND_OPT_PREFIX_INFORMATION) continue; pi = (struct nd_opt_prefix_info *)pt; @@ -509,7 +512,18 @@ nd6_ra_input( continue; } - if (IN6_IS_ADDR_MULTICAST(&pi->nd_opt_pi_prefix) || + /* + * To ignore ::/64 make sure bits beyond prefixlen + * are set to zero + */ + in6_prefixlen2mask(&pi_mask, pi->nd_opt_pi_prefix_len); + pi->nd_opt_pi_prefix.s6_addr32[0] &= pi_mask.s6_addr32[0]; + pi->nd_opt_pi_prefix.s6_addr32[1] &= pi_mask.s6_addr32[1]; + pi->nd_opt_pi_prefix.s6_addr32[2] &= pi_mask.s6_addr32[2]; + pi->nd_opt_pi_prefix.s6_addr32[3] &= pi_mask.s6_addr32[3]; + + if (IN6_IS_ADDR_UNSPECIFIED(&pi->nd_opt_pi_prefix) || + IN6_IS_ADDR_MULTICAST(&pi->nd_opt_pi_prefix) || IN6_IS_ADDR_LINKLOCAL(&pi->nd_opt_pi_prefix)) { nd6log((LOG_INFO, "%s: invalid prefix %s, ignored\n", @@ -663,6 +677,9 @@ nd6_ra_input( goto bad; } + if (dr && dr->stateflags & NDDRF_MAPPED) + saddr6 = dr->rtaddr_mapped; + nd6_cache_lladdr(ifp, &saddr6, lladdr, (int)lladdrlen, ND_ROUTER_ADVERT, 0); @@ -703,9 +720,7 @@ 
nd6_ra_input( /* tell the change to user processes watching the routing socket. */ static void -nd6_rtmsg(cmd, rt) - int cmd; - struct rtentry *rt; +nd6_rtmsg(int cmd, struct rtentry *rt) { struct rt_addrinfo info; struct ifnet *ifp = rt->rt_ifp; @@ -734,6 +749,7 @@ defrouter_addreq(struct nd_defrouter *new, boolean_t scoped) struct rtentry *newrt = NULL; unsigned int ifscope; int err; + struct nd_ifinfo *ndi = ND_IFINFO(new->ifp); lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_NOTOWNED); NDDR_LOCK_ASSERT_NOTHELD(new); @@ -765,25 +781,59 @@ defrouter_addreq(struct nd_defrouter *new, boolean_t scoped) def.sin6_len = mask.sin6_len = gate.sin6_len = sizeof (struct sockaddr_in6); def.sin6_family = mask.sin6_family = gate.sin6_family = AF_INET6; - gate.sin6_addr = new->rtaddr; + + if (new->stateflags & NDDRF_MAPPED) + gate.sin6_addr = new->rtaddr_mapped; + else + gate.sin6_addr = new->rtaddr; ifscope = scoped ? new->ifp->if_index : IFSCOPE_NONE; NDDR_UNLOCK(new); + /* + * Cellular networks may have buggy deployments + * with gateway IPv6 link local address with same + * interface identifier as the one that has been + * assigned for the cellular context. 
+ * If gateway is same as locally configured link local + * interface on cellular interface, generate a different one + * and store it in the nd_defrouter entry and use it to work + * on routing table + */ + if (new->ifp->if_type == IFT_CELLULAR && + !(new->stateflags & NDDRF_STATIC) && + !(new->stateflags & NDDRF_MAPPED) && + IN6_IS_ADDR_LINKLOCAL(&gate.sin6_addr) && + ndi && !(ndi->flags & ND6_IFF_PERFORMNUD)) { + struct in6_ifaddr *tmp_ia6 = in6ifa_ifpforlinklocal(new->ifp, 0); + + if (tmp_ia6 != NULL && + !(tmp_ia6->ia6_flags & IN6_IFF_NOTMANUAL) && + IN6_ARE_ADDR_EQUAL(&tmp_ia6->ia_addr.sin6_addr, + &gate.sin6_addr)) { + gate.sin6_addr.s6_addr8[15] += 1; + new->rtaddr_mapped = gate.sin6_addr; + new->stateflags |= NDDRF_MAPPED; + + nd6log((LOG_INFO, "%s: Default router %s mapped " + "to ", if_name(new->ifp), ip6_sprintf(&new->rtaddr))); + nd6log((LOG_INFO, "%s\n", ip6_sprintf(&new->rtaddr_mapped))); + } + } + err = rtrequest_scoped(RTM_ADD, (struct sockaddr *)&def, (struct sockaddr *)&gate, (struct sockaddr *)&mask, RTF_GATEWAY, &newrt, ifscope); if (newrt) { RT_LOCK(newrt); - nd6_rtmsg(RTM_ADD, newrt); /* tell user process */ + nd6_rtmsg(RTM_ADD, newrt); /* tell user process */ RT_REMREF_LOCKED(newrt); RT_UNLOCK(newrt); NDDR_LOCK(new); new->stateflags |= NDDRF_INSTALLED; if (ifscope != IFSCOPE_NONE) new->stateflags |= NDDRF_IFSCOPE; - new->genid = nd6_defrouter_genid; } else { nd6log((LOG_ERR, "%s: failed to add default router " "%s on %s scoped %d (errno = %d)\n", __func__, @@ -856,7 +906,18 @@ defrouter_delreq(struct nd_defrouter *dr) def.sin6_len = mask.sin6_len = gate.sin6_len = sizeof (struct sockaddr_in6); def.sin6_family = mask.sin6_family = gate.sin6_family = AF_INET6; - gate.sin6_addr = dr->rtaddr; + + /* + * The router entry may be mapped to a different address. + * If that is the case, use the mapped address as gateway + * to do operation on the routing table. 
+ * To get more context, read the related comment in + * defrouter_addreq + */ + if (dr->stateflags & NDDRF_MAPPED) + gate.sin6_addr = dr->rtaddr_mapped; + else + gate.sin6_addr = dr->rtaddr; if (dr->ifp != NULL) { ifscope = (dr->stateflags & NDDRF_IFSCOPE) ? @@ -865,6 +926,7 @@ defrouter_delreq(struct nd_defrouter *dr) ifscope = IFSCOPE_NONE; } NDDR_UNLOCK(dr); + err = rtrequest_scoped(RTM_DELETE, (struct sockaddr *)&def, (struct sockaddr *)&gate, (struct sockaddr *)&mask, RTF_GATEWAY, &oldrt, ifscope); @@ -921,15 +983,12 @@ defrouter_reset(void) } /* Nuke primary (non-scoped) default router */ - if (ip6_doscopedroute) { - bzero(&drany, sizeof (drany)); - lck_mtx_init(&drany.nddr_lock, ifa_mtx_grp, ifa_mtx_attr); - lck_mtx_unlock(nd6_mutex); - defrouter_delreq(&drany); - lck_mtx_destroy(&drany.nddr_lock, ifa_mtx_grp); - lck_mtx_lock(nd6_mutex); - } - + bzero(&drany, sizeof (drany)); + lck_mtx_init(&drany.nddr_lock, ifa_mtx_grp, ifa_mtx_attr); + lck_mtx_unlock(nd6_mutex); + defrouter_delreq(&drany); + lck_mtx_destroy(&drany.nddr_lock, ifa_mtx_grp); + lck_mtx_lock(nd6_mutex); } int @@ -940,6 +999,7 @@ defrtrlist_ioctl(u_long cmd, caddr_t data) struct ifnet *dr_ifp; int error = 0, add = 0; + /* XXX Handle mapped default router entries */ switch (cmd) { case SIOCDRADD_IN6_32: /* struct in6_defrouter_32 */ case SIOCDRADD_IN6_64: /* struct in6_defrouter_64 */ @@ -1008,10 +1068,19 @@ defrtrlist_ioctl(u_long cmd, caddr_t data) return (error); } +/* + * XXX Please make sure to remove dr from the + * global default router tailq list before this + * function call. + * Also ensure that you release the list reference + * only after calling this routine. 
+ */ void defrtrlist_del(struct nd_defrouter *dr) { - struct nd_defrouter *deldr = NULL; +#if (DEVELOPMENT || DEBUG) + struct nd_defrouter *dr_itr = NULL; +#endif struct nd_prefix *pr; struct ifnet *ifp = dr->ifp; struct nd_ifinfo *ndi = NULL; @@ -1019,23 +1088,34 @@ defrtrlist_del(struct nd_defrouter *dr) lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); - if (!ip6_doscopedroute && dr == TAILQ_FIRST(&nd_defrouter)) - deldr = dr; /* The router is primary. */ - - TAILQ_REMOVE(&nd_defrouter, dr, dr_entry); +#if (DEVELOPMENT || DEBUG) + /* + * Verify that the router is not in the global default + * router list. + * Can't use defrouter_lookup here because that just works + * with address and ifp pointer. + * We have to compare the memory here. + * Also we can't use ASSERT here as that is not defined + * for development builds. + */ + TAILQ_FOREACH(dr_itr, &nd_defrouter, dr_entry) + VERIFY(dr != dr_itr); +#endif ++nd6_defrouter_genid; /* * Flush all the routing table entries that use the router * as a next hop. */ - if (ip6_doscopedroute || !ip6_forwarding) { - /* above is a good condition? */ - NDDR_ADDREF(dr); - lck_mtx_unlock(nd6_mutex); + /* above is a good condition? */ + NDDR_ADDREF(dr); + lck_mtx_unlock(nd6_mutex); + if (dr->stateflags & NDDRF_MAPPED) + rt6_flush(&dr->rtaddr_mapped, ifp); + else rt6_flush(&dr->rtaddr, ifp); - lck_mtx_lock(nd6_mutex); - NDDR_REMREF(dr); - } + + lck_mtx_lock(nd6_mutex); + NDDR_REMREF(dr); nd6log2((LOG_INFO, "%s: freeing defrouter %s\n", if_name(dr->ifp), ip6_sprintf(&dr->rtaddr))); /* @@ -1061,14 +1141,6 @@ defrtrlist_del(struct nd_defrouter *dr) pfxlist_onlink_check(); - /* - * If the router is the primary one, choose a new one. If Scoped - * Routing is enabled, always try to pick another eligible router - * on this interface. 
- */ - if (deldr || ip6_doscopedroute) - defrouter_select(ifp); - resetmtu = FALSE; ndi = ND_IFINFO(ifp); VERIFY((NULL != ndi) && (TRUE == ndi->initialized)); @@ -1080,10 +1152,15 @@ defrtrlist_del(struct nd_defrouter *dr) } lck_mtx_unlock(&ndi->lock); + /* + * If the router is the primary one, choose a new one. + * We always try to pick another eligible router + * on this interface as we do scoped routing + */ + defrouter_select(ifp); + if (resetmtu) nd6_setmtu(ifp); - - NDDR_REMREF(dr); /* remove list reference */ } int @@ -1130,7 +1207,9 @@ defrtrlist_del_static(struct nd_defrouter *new) NDDR_REMREF(dr); dr = NULL; } else { + TAILQ_REMOVE(&nd_defrouter, dr, dr_entry); defrtrlist_del(dr); + NDDR_REMREF(dr); /* remove list reference */ NDDR_REMREF(dr); } lck_mtx_unlock(nd6_mutex); @@ -1198,28 +1277,38 @@ rtpref(struct nd_defrouter *dr) void defrouter_select(struct ifnet *ifp) { -#pragma unused(ifp) - struct nd_defrouter *dr, *selected_dr = NULL, *installed_dr = NULL; - struct nd_defrouter *installed_dr0 = NULL; - struct rtentry *rt = NULL; + struct nd_defrouter *dr = NULL; + struct nd_defrouter *selected_dr = NULL; + struct nd_defrouter *installed_dr = NULL; struct llinfo_nd6 *ln = NULL; - int update = 0; - boolean_t found_installedrt = FALSE; + struct rtentry *rt = NULL; + struct nd_ifinfo *ndi = NULL; + unsigned int genid = 0; + boolean_t is_installed_reachable = FALSE; lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); - /* - * We no longer install (default) interface route; only prefix routes - * are installed as interface routes. Therefore, there is no harm in - * going through this routine even if a default interface is specified, - * which happens when Scoped Routing is enabled. But for efficiency, - * we fall back to the original KAME logic when Scoped Routing is - * not in effect. 
- */ - if (ip6_forwarding && !ip6_doscopedroute) { - nd6log((LOG_WARNING, - "defrouter_select: called unexpectedly (forwarding=%d)\n", - ip6_forwarding)); + if (ifp == NULL) { + nd6log2((LOG_INFO, + "%s:%d: Return early. NULL interface", + __func__, __LINE__)); + return; + } + + if (ifp == lo_ifp) { + nd6log2((LOG_INFO, + "%s:%d: Return early. " + "Default router select called for loopback.\n", + __func__, __LINE__)); + return; + } + + if (ifp->if_eflags & IFEF_IPV6_ROUTER) { + nd6log2((LOG_INFO, + "%s:%d: Return early. " + "Default router select called for interface" + " %s with IFEF_IPV6_ROUTER flag set\n", + __func__, __LINE__, if_name(ifp))); return; } @@ -1227,8 +1316,33 @@ defrouter_select(struct ifnet *ifp) * Let's handle easy case (3) first: * If default router list is empty, there's nothing to be done. */ - if (!TAILQ_FIRST(&nd_defrouter)) + if (!TAILQ_FIRST(&nd_defrouter)) { + nd6log2((LOG_INFO, + "%s:%d: Return early. " + "Default router is empty.\n", __func__, __LINE__)); + return; + } + + /* + * Take an early exit if number of routers in nd_ifinfo is + * 0 for the interface. + */ + ndi = ND_IFINFO(ifp); + if (!ndi || !ndi->initialized) { + nd6log2((LOG_INFO, + "%s:%d: Return early. " + "Interface %s's nd_ifinfo not initialized.\n", + __func__, __LINE__, if_name(ifp))); return; + } + + if (ndi->ndefrouters == 0) { + nd6log2((LOG_INFO, + "%s:%d: Return early. 
" + "%s does not have any default routers.\n", + __func__, __LINE__, if_name(ifp))); + return; + } /* * Due to the number of times we drop nd6_mutex, we need to @@ -1251,50 +1365,84 @@ defrouter_select(struct ifnet *ifp) * selected_dr = candidate for primary router * installed_dr = currently installed primary router */ - for (dr = TAILQ_FIRST(&nd_defrouter); dr; - dr = TAILQ_NEXT(dr, dr_entry)) { - boolean_t reachable, advrouter; + genid = nd6_defrouter_genid; + dr = TAILQ_FIRST(&nd_defrouter); + + while (dr != NULL) { struct in6_addr rtaddr; - struct ifnet *drifp; - struct nd_defrouter *drrele; + struct ifnet *drifp = NULL; + struct nd_defrouter *drrele = NULL; - drrele = NULL; - reachable = FALSE; NDDR_LOCK(dr); - rtaddr = *(&dr->rtaddr); drifp = dr->ifp; - advrouter = (drifp != NULL && - (drifp->if_eflags & IFEF_IPV6_ROUTER)); + if (drifp != ifp) { + NDDR_UNLOCK(dr); + dr = TAILQ_NEXT(dr, dr_entry); + continue; + } + + /* + * Optimize for the common case. + * When the interface has only one default router + * there's no point checking for reachability as + * there's nothing else to choose from. 
+ */ + if (ndi->ndefrouters == 1) { + nd6log2((LOG_INFO, + "%s:%d: Fast forward default router selection " + "as interface %s has learned only one default " + "router and there's nothing else to choose from.\n", + __func__, __LINE__, if_name(ifp))); + VERIFY(selected_dr == NULL && installed_dr == NULL); + selected_dr = dr; + if (dr->stateflags & NDDRF_INSTALLED) + installed_dr = dr; + NDDR_ADDREF_LOCKED(selected_dr); + NDDR_UNLOCK(dr); + goto install_route; + } + + if (dr->stateflags & NDDRF_MAPPED) + rtaddr = dr->rtaddr_mapped; + else + rtaddr = dr->rtaddr; + NDDR_ADDREF_LOCKED(dr); /* for this for loop */ NDDR_UNLOCK(dr); - lck_mtx_unlock(nd6_mutex); /* Callee returns a locked route upon success */ - if ((rt = nd6_lookup(&rtaddr, 0, drifp, 0)) != NULL) { - RT_LOCK_ASSERT_HELD(rt); - if ((ln = rt->rt_llinfo) != NULL && + if (selected_dr == NULL) { + lck_mtx_unlock(nd6_mutex); + if ((rt = nd6_lookup(&rtaddr, 0, drifp, 0)) != NULL && + (ln = rt->rt_llinfo) != NULL && ND6_IS_LLINFO_PROBREACH(ln)) { - reachable = TRUE; - if (selected_dr == NULL && - (!ip6_doscopedroute || - (drifp == nd6_defifp && !advrouter))) { - selected_dr = dr; - NDDR_ADDREF(selected_dr); - } + RT_LOCK_ASSERT_HELD(rt); + selected_dr = dr; + NDDR_ADDREF(selected_dr); } + lck_mtx_lock(nd6_mutex); + } + + if (rt) { RT_REMREF_LOCKED(rt); RT_UNLOCK(rt); rt = NULL; } - lck_mtx_lock(nd6_mutex); - /* Handle case (b) */ + /* + * Handle case (b) + * When there are more than one routers on the same link, the one with + * the highest router preference will be installed. + * Since the list is in decreasing order of preference: + * 1) If selected_dr is not NULL, only use dr if it is static and has + * equal preference and selected_dr is not static. 
+ * 2) Else if selected_dr is NULL, and dr is static make selected_dr = dr + */ NDDR_LOCK(dr); - if (ip6_doscopedroute && drifp == nd6_defifp && !advrouter && - (selected_dr == NULL || rtpref(dr) > rtpref(selected_dr) || - (rtpref(dr) == rtpref(selected_dr) && - (dr->stateflags & NDDRF_STATIC) && - !(selected_dr->stateflags & NDDRF_STATIC)))) { + if (((selected_dr && (rtpref(dr) >= rtpref(selected_dr)) && + !(selected_dr->stateflags & NDDRF_STATIC)) || + (selected_dr == NULL)) && + (dr->stateflags & NDDRF_STATIC)) { if (selected_dr) { /* Release it later on */ VERIFY(drrele == NULL); @@ -1304,382 +1452,188 @@ defrouter_select(struct ifnet *ifp) NDDR_ADDREF_LOCKED(selected_dr); } - if (!(dr->stateflags & NDDRF_INSTALLED)) { - /* - * If the router hasn't been installed and it is - * reachable, try to install it later on below. - * If it's static, try to install it anyway. - */ - if (!advrouter && (reachable || - (dr->stateflags & NDDRF_STATIC))) { - dr->genid = -1; - ++update; - nd6log2((LOG_INFO, "%s: possible router %s, " - "scoped=%d, static=%d\n", if_name(drifp), - ip6_sprintf(&rtaddr), - (dr->stateflags & NDDRF_IFSCOPE) ? 1 : 0, - (dr->stateflags & NDDRF_STATIC) ? 
1 : 0)); - } - NDDR_UNLOCK(dr); - NDDR_REMREF(dr); /* for this for loop */ - if (drrele != NULL) - NDDR_REMREF(drrele); - continue; - } - - /* Record the currently installed primary/non-scoped router */ - if (!ip6_doscopedroute || !(dr->stateflags & NDDRF_IFSCOPE)) { + /* Record the currently installed router */ + if (dr->stateflags & NDDRF_INSTALLED) { if (installed_dr == NULL) { installed_dr = dr; NDDR_ADDREF_LOCKED(installed_dr); + if (dr->stateflags & NDDRF_MAPPED) + rtaddr = installed_dr->rtaddr_mapped; + else + rtaddr = installed_dr->rtaddr; + lck_mtx_unlock(nd6_mutex); + /* Callee returns a locked route upon success */ + if ((rt = nd6_lookup(&rtaddr, 0, ifp, 0)) != NULL) { + RT_LOCK_ASSERT_HELD(rt); + if ((ln = rt->rt_llinfo) != NULL && + ND6_IS_LLINFO_PROBREACH(ln)) + is_installed_reachable = TRUE; + + RT_REMREF_LOCKED(rt); + RT_UNLOCK(rt); + rt = NULL; + } + lck_mtx_lock(nd6_mutex); } else { /* this should not happen; warn for diagnosis */ - log(LOG_ERR, "defrouter_select: more than one " - "%s default router is installed\n", - ip6_doscopedroute ? "non-scoped" : ""); + nd6log((LOG_ERR, "defrouter_select: more than one " + "default router is installed for interface :%s.\n", + if_name(ifp))); } } NDDR_UNLOCK(dr); NDDR_REMREF(dr); /* for this for loop */ if (drrele != NULL) NDDR_REMREF(drrele); - } - - /* If none was selected, use the currently installed one */ - if (ip6_doscopedroute && selected_dr == NULL && installed_dr != NULL) { - selected_dr = installed_dr; - NDDR_ADDREF(selected_dr); - } - - /* - * Install the unreachable one(s) if necesssary. 
- */ - for (dr = TAILQ_FIRST(&nd_defrouter); dr; - dr = TAILQ_NEXT(dr, dr_entry)) { - struct nd_defrouter *_dr; - - if (!ip6_doscopedroute) - break; - - NDDR_LOCK(dr); - - /* If already (or will be) installed, skip */ - if ((dr->stateflags & NDDRF_INSTALLED) || dr->genid == -1) { - NDDR_UNLOCK(dr); - continue; - } - /* See if there is already a default router for the link */ - for (_dr = TAILQ_FIRST(&nd_defrouter); _dr; - _dr = TAILQ_NEXT(_dr, dr_entry)) { - if (_dr != dr) - NDDR_LOCK(_dr); - if (_dr == dr || _dr->ifp != dr->ifp) { - if (_dr != dr) - NDDR_UNLOCK(_dr); - continue; + /* + * Check if the list changed when we gave up + * the nd6_mutex lock + */ + if(genid != nd6_defrouter_genid) { + if (selected_dr) { + NDDR_REMREF(selected_dr); + selected_dr = NULL; } - if ((_dr->stateflags & NDDRF_INSTALLED) || - _dr->genid == -1) { - if (_dr != dr) - NDDR_UNLOCK(_dr); - break; + if (installed_dr) { + NDDR_REMREF(selected_dr); + installed_dr = NULL; } - if (_dr != dr) - NDDR_UNLOCK(_dr); - } - /* If none so far, schedule it to be installed below */ - if (_dr == NULL && dr->ifp != NULL && - !(dr->ifp->if_eflags & IFEF_IPV6_ROUTER)) { - dr->genid = -1; - ++update; - nd6log2((LOG_INFO, "%s: possible router %s, " - "static=%d (unreachable)\n", if_name(dr->ifp), - ip6_sprintf(&dr->rtaddr), - (dr->stateflags & NDDRF_STATIC) ? 1 : 0)); + if (ndi->ndefrouters == 0) { + nd6log2((LOG_INFO, + "%s:%d: Interface %s no longer " + "has any default routers. 
Abort.\n", + __func__, __LINE__, if_name(ifp))); + goto out; + } + nd6log2((LOG_INFO, + "%s:%d: Iterate default router list again " + "for interface %s, as the list seems to have " + "changed during release-reaquire of global " + "nd6_mutex lock.\n", + __func__, __LINE__, if_name(ifp))); + + is_installed_reachable = FALSE; + genid = nd6_defrouter_genid; + dr = TAILQ_FIRST(&nd_defrouter); + } else { + dr = TAILQ_NEXT(dr, dr_entry); } - NDDR_UNLOCK(dr); - } - - dr = selected_dr; - if (dr != NULL) { - nd6log2((LOG_INFO, "%s: considering primary default router %s, " - "static=%d [round 1]\n", if_name(dr->ifp), - ip6_sprintf(&dr->rtaddr), - (dr->stateflags & NDDRF_STATIC) ? 1 : 0)); } /* * If none of the default routers was found to be reachable, - * round-robin the list regardless of preference, except when - * Scoped Routing is enabled per case (c). - * - * Otherwise, if we have an installed router, check if the selected - * (reachable) router should really be preferred to the installed one. - * We only prefer the new router when the old one is not reachable - * or when the new one has a really higher preference value. + * round-robin the list regardless of preference. 
+ * Please note selected_dr equal to NULL implies that even + * installed default router is not reachable */ - if (!ip6_doscopedroute && selected_dr == NULL) { - if (installed_dr == NULL || - !TAILQ_NEXT(installed_dr, dr_entry)) { - selected_dr = TAILQ_FIRST(&nd_defrouter); - if (selected_dr) - NDDR_ADDREF(selected_dr); - } else { - selected_dr = TAILQ_NEXT(installed_dr, dr_entry); - if (selected_dr) - NDDR_ADDREF(selected_dr); - } - } else if (selected_dr != NULL && installed_dr != NULL) { - lck_mtx_unlock(nd6_mutex); - rt = nd6_lookup(&installed_dr->rtaddr, 0, installed_dr->ifp, 0); - if (rt) { - RT_LOCK_ASSERT_HELD(rt); - if ((ln = (struct llinfo_nd6 *)rt->rt_llinfo) && - ND6_IS_LLINFO_PROBREACH(ln) && - (!ip6_doscopedroute || - installed_dr->ifp == nd6_defifp) && - rtpref(selected_dr) <= rtpref(installed_dr)) { - NDDR_REMREF(selected_dr); - selected_dr = installed_dr; - NDDR_ADDREF(selected_dr); + if (selected_dr == NULL) { + if (installed_dr) { + for (dr = TAILQ_NEXT(installed_dr, dr_entry); dr; + dr = TAILQ_NEXT(dr, dr_entry)) { + if (installed_dr->ifp != dr->ifp) + continue; + selected_dr = dr; + break; } - RT_REMREF_LOCKED(rt); - RT_UNLOCK(rt); - rt = NULL; - found_installedrt = TRUE; } - lck_mtx_lock(nd6_mutex); - } - if (ip6_doscopedroute) { /* - * If the installed primary router is not on the current - * IPv6 default interface, demote it to a scoped entry. 
+ * If none was installed or the installed one is the last + * one on the list, select the first one from the list */ - if (installed_dr != NULL && installed_dr->ifp != nd6_defifp && - !(installed_dr->stateflags & NDDRF_IFSCOPE)) { - if (selected_dr != NULL && - selected_dr->ifp != nd6_defifp) { - NDDR_REMREF(selected_dr); - selected_dr = NULL; + if ((installed_dr == NULL) || (selected_dr == NULL)) { + for (dr = TAILQ_FIRST(&nd_defrouter); dr; + dr = TAILQ_NEXT(dr, dr_entry)) { + if (dr->ifp == ifp) { + selected_dr = dr; + break; + } } - ++update; - } - - /* - * If the selected router is currently scoped, make sure - * we update (it needs to be promoted to primary.) - */ - if (selected_dr != NULL && - (selected_dr->stateflags & NDDRF_IFSCOPE)) - ++update; - - /* - * If the installed router is no longer reachable, remove - * it and install the selected router instead. - */ - if (installed_dr != NULL - && selected_dr != NULL - && installed_dr != selected_dr - && found_installedrt == FALSE - && installed_dr->ifp == selected_dr->ifp) { - /* skip it below */ - installed_dr0 = installed_dr; - /* NB: we previousled referenced installed_dr */ - installed_dr = NULL; - selected_dr->genid = -1; - ++update; } - } - /* - * If Scoped Routing is enabled and there's nothing to update, - * just return. Otherwise, if Scoped Routing is disabled and if - * the selected router is different than the installed one, - * remove the installed router and install the selected one. 
- */ - dr = selected_dr; - VERIFY(dr != NULL || ip6_doscopedroute); - if (!ip6_doscopedroute || !update) { - if (dr == NULL) + if ((selected_dr == NULL) && (installed_dr == NULL)) { + nd6log2((LOG_INFO, + "%s:%d: Between release and reaquire of global " + "nd6_mutex lock, the list seems to have changed " + "and it does not have any default routers for " + "interface %s.\n", + __func__, __LINE__, if_name(ifp))); goto out; - - if (dr != installed_dr) { - nd6log2((LOG_INFO, "%s: no update, selected router %s, " - "installed router %s\n", if_name(dr->ifp), - ip6_sprintf(&dr->rtaddr), installed_dr != NULL ? - ip6_sprintf(&installed_dr->rtaddr) : "NONE")); - } else { - nd6log2((LOG_INFO, "%s: no update, router is %s\n", - if_name(dr->ifp), ip6_sprintf(&dr->rtaddr))); } - if (!ip6_doscopedroute && installed_dr != dr) { + + if (selected_dr != installed_dr) + NDDR_ADDREF(selected_dr); + } else if (installed_dr != NULL) { + if (installed_dr != selected_dr) { /* - * No need to ADDREF dr because at this point - * dr points to selected_dr, which already holds - * a reference. + * This means that selected default router is reachable + * while installed one may or may not be. + * Static router should always be considered as reachable + * for router selection process. */ - lck_mtx_unlock(nd6_mutex); - if (installed_dr) { - defrouter_delreq(installed_dr); + if ((installed_dr->stateflags & NDDRF_STATIC) && + rtpref(installed_dr) >= rtpref(selected_dr)) { + NDDR_REMREF(selected_dr); + selected_dr = installed_dr; + } else if (is_installed_reachable) { + if (rtpref(selected_dr) <= rtpref(installed_dr)) { + NDDR_REMREF(selected_dr); + selected_dr = installed_dr; + } } - defrouter_addreq(dr, FALSE); - lck_mtx_lock(nd6_mutex); + } else { + NDDR_REMREF(selected_dr); } - goto out; } +install_route: /* - * Scoped Routing is enabled and we need to update. The selected - * router needs to be installed as primary/non-scoped entry. 
If - * there is any existing entry that is non-scoped, remove it from - * the routing table and reinstall it as scoped entry. - */ - if (dr != NULL) { - nd6log2((LOG_INFO, "%s: considering primary default router %s, " - "static=%d [round 2]\n", if_name(dr->ifp), - ip6_sprintf(&dr->rtaddr), - (dr->stateflags & NDDRF_STATIC) ? 1 : 0)); - } - - /* - * On the following while loops we use two flags: - * dr->genid - * NDDRF_PROCESSED - * - * genid is used to skip entries that are not to be added/removed on the - * second while loop. - * NDDRF_PROCESSED is used to skip entries that were already - * processed. - * This is necessary because we drop the nd6_mutex and start the while - * loop again. + * If the selected router is different than the installed one, + * remove the installed router and install the selected one. + * Note that the selected router is never NULL here. + * Else check if the route entry scope has to be changed. */ - TAILQ_FOREACH(dr, &nd_defrouter, dr_entry) { - NDDR_LOCK(dr); - VERIFY((dr->stateflags & NDDRF_PROCESSED) == 0); - NDDR_UNLOCK(dr); - } - /* Remove conflicting entries */ - dr = TAILQ_FIRST(&nd_defrouter); - while (dr) { - NDDR_LOCK(dr); - if (!(dr->stateflags & NDDRF_INSTALLED) || - dr->stateflags & NDDRF_PROCESSED) { - NDDR_UNLOCK(dr); - dr = TAILQ_NEXT(dr, dr_entry); - continue; - } - dr->stateflags |= NDDRF_PROCESSED; - - /* A NULL selected_dr will remove primary default route */ - if ((dr == selected_dr && (dr->stateflags & NDDRF_IFSCOPE)) || - (dr != selected_dr && !(dr->stateflags & NDDRF_IFSCOPE))) { - NDDR_ADDREF_LOCKED(dr); - NDDR_UNLOCK(dr); - lck_mtx_unlock(nd6_mutex); - defrouter_delreq(dr); - lck_mtx_lock(nd6_mutex); - NDDR_LOCK(dr); - if (dr && dr != installed_dr0) - dr->genid = -1; - NDDR_UNLOCK(dr); - NDDR_REMREF(dr); - /* - * Since we lost nd6_mutex, we have to start over. 
- */ - dr = TAILQ_FIRST(&nd_defrouter); - continue; - } - NDDR_UNLOCK(dr); - dr = TAILQ_NEXT(dr, dr_entry); - } - - /* -1 is a special number, make sure we don't use it for genid */ - if (++nd6_defrouter_genid == -1) - nd6_defrouter_genid = 1; - - TAILQ_FOREACH(dr, &nd_defrouter, dr_entry) { - NDDR_LOCK(dr); - dr->stateflags &= ~NDDRF_PROCESSED; - NDDR_UNLOCK(dr); - } - /* Add the entries back */ - dr = TAILQ_FIRST(&nd_defrouter); - while (dr) { - struct nd_defrouter *_dr; - - NDDR_LOCK(dr); - if (dr->stateflags & NDDRF_PROCESSED || - dr->genid != -1) { - NDDR_UNLOCK(dr); - dr = TAILQ_NEXT(dr, dr_entry); - continue; - } - dr->stateflags |= NDDRF_PROCESSED; - - /* Handle case (b) */ - for (_dr = TAILQ_FIRST(&nd_defrouter); _dr; - _dr = TAILQ_NEXT(_dr, dr_entry)) { - if (_dr == dr) - continue; - /* - * This is safe because we previously checked if - * _dr == dr. - */ - NDDR_LOCK(_dr); - if (_dr->ifp == dr->ifp && rtpref(_dr) >= rtpref(dr) && - (_dr->stateflags & NDDRF_INSTALLED)) { - NDDR_ADDREF_LOCKED(_dr); - NDDR_UNLOCK(_dr); - break; - } - NDDR_UNLOCK(_dr); - } - - /* If same preference and i/f, static entry takes precedence */ - if (_dr != NULL && rtpref(_dr) == rtpref(dr) && - !(_dr->stateflags & NDDRF_STATIC) && - (dr->stateflags & NDDRF_STATIC)) { - lck_mtx_unlock(nd6_mutex); - defrouter_delreq(_dr); - lck_mtx_lock(nd6_mutex); - NDDR_REMREF(_dr); - _dr = NULL; - } - - if (_dr == NULL && !(dr->stateflags & NDDRF_INSTALLED)) { - NDDR_ADDREF_LOCKED(dr); - NDDR_UNLOCK(dr); - lck_mtx_unlock(nd6_mutex); - defrouter_addreq(dr, (selected_dr == NULL || - dr->ifp != selected_dr->ifp)); - dr->genid = nd6_defrouter_genid; - lck_mtx_lock(nd6_mutex); - NDDR_REMREF(dr); - /* - * Since we lost nd6_mutex, we have to start over. - */ - dr = TAILQ_FIRST(&nd_defrouter); - continue; + lck_mtx_unlock(nd6_mutex); + if (installed_dr != selected_dr) { + nd6log((LOG_INFO, + "%s:%d: Found a better router for interface " + "%s. 
Installing new default route.\n", + __func__, __LINE__, if_name(ifp))); + if (installed_dr != NULL) { + defrouter_delreq(installed_dr); } - NDDR_UNLOCK(dr); - dr = TAILQ_NEXT(dr, dr_entry); + /* + * Install scoped route if the interface is not + * the default nd6 interface. + */ + defrouter_addreq(selected_dr, + (selected_dr->ifp != nd6_defifp)); + } else if (((installed_dr->stateflags & NDDRF_IFSCOPE) && + (installed_dr->ifp == nd6_defifp)) || + (!(installed_dr->stateflags & NDDRF_IFSCOPE) && + (installed_dr->ifp != nd6_defifp))) { + nd6log((LOG_INFO, + "%s:%d: Need to reinstall default route for interface " + "%s as its scope has changed.\n", + __func__, __LINE__, if_name(ifp))); + defrouter_delreq(installed_dr); + defrouter_addreq(installed_dr, + (installed_dr->ifp != nd6_defifp)); + } else { + nd6log2((LOG_INFO, + "%s:%d: No need to change the default " + "route for interface %s.\n", + __func__, __LINE__, if_name(ifp))); } + lck_mtx_lock(nd6_mutex); out: - TAILQ_FOREACH(dr, &nd_defrouter, dr_entry) { - NDDR_LOCK(dr); - dr->stateflags &= ~NDDRF_PROCESSED; - NDDR_UNLOCK(dr); - } - if (selected_dr) + if (selected_dr && (selected_dr != installed_dr)) NDDR_REMREF(selected_dr); if (installed_dr) NDDR_REMREF(installed_dr); - if (installed_dr0) - NDDR_REMREF(installed_dr0); lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); VERIFY(nd_defrouter_busy); nd_defrouter_busy = FALSE; @@ -1702,12 +1656,14 @@ defrtrlist_update_common(struct nd_defrouter *new, boolean_t scoped) if ((dr = defrouter_lookup(&new->rtaddr, ifp)) != NULL) { /* entry exists */ if (new->rtlifetime == 0) { + TAILQ_REMOVE(&nd_defrouter, dr, dr_entry); defrtrlist_del(dr); + NDDR_REMREF(dr); /* remove list reference */ NDDR_REMREF(dr); dr = NULL; } else { int oldpref = rtpref(dr); - + struct nd_defrouter *p = NULL; /* override */ dr->flags = new->flags; /* xxx flag check */ dr->rtlifetime = new->rtlifetime; @@ -1720,36 +1676,30 @@ defrtrlist_update_common(struct nd_defrouter *new, boolean_t scoped) * list of 
routers in the same preference band, unless * it's already at that position. */ - if (ip6_doscopedroute) { - struct nd_defrouter *p = NULL; - - /* same preference and scoped; just return */ - if (rtpref(new) == oldpref && scoped) - return (dr); - - n = TAILQ_FIRST(&nd_defrouter); - while (n != NULL) { - /* preference changed; sort it */ - if (rtpref(new) != oldpref) - break; - - /* not at the top of band; sort it */ - if (n != dr && rtpref(n) == oldpref && - (!p || rtpref(p) > rtpref(n))) - break; - - p = n; - n = TAILQ_NEXT(n, dr_entry); - } - - /* nothing has changed, just return */ - if (n == NULL && (scoped || - !(dr->stateflags & NDDRF_IFSCOPE))) - return (dr); - } else if (rtpref(new) == oldpref) { + /* same preference and scoped; just return */ + if (rtpref(new) == oldpref && scoped) return (dr); + + n = TAILQ_FIRST(&nd_defrouter); + while (n != NULL) { + /* preference changed; sort it */ + if (rtpref(new) != oldpref) + break; + + /* not at the top of band; sort it */ + if (n != dr && rtpref(n) == oldpref && + (!p || rtpref(p) > rtpref(n))) + break; + + p = n; + n = TAILQ_NEXT(n, dr_entry); } + /* nothing has changed, just return */ + if (n == NULL && (scoped || + !(dr->stateflags & NDDRF_IFSCOPE))) + return (dr); + /* * preferred router may be changed, so relocate * this router. 
@@ -1761,7 +1711,6 @@ defrtrlist_update_common(struct nd_defrouter *new, boolean_t scoped) */ TAILQ_REMOVE(&nd_defrouter, dr, dr_entry); new->stateflags = dr->stateflags; - new->stateflags &= ~NDDRF_PROCESSED; n = dr; goto insert; @@ -1807,13 +1756,11 @@ defrtrlist_update_common(struct nd_defrouter *new, boolean_t scoped) memcpy(&n->rtaddr, &new->rtaddr, sizeof (n->rtaddr)); n->flags = new->flags; n->stateflags = new->stateflags; - n->stateflags &= ~NDDRF_PROCESSED; n->rtlifetime = new->rtlifetime; n->expire = new->expire; n->base_calendartime = caltime.tv_sec; n->base_uptime = net_uptime(); n->ifp = new->ifp; - n->genid = new->genid; n->err = new->err; NDDR_UNLOCK(n); insert: @@ -1834,7 +1781,7 @@ defrtrlist_update_common(struct nd_defrouter *new, boolean_t scoped) for (dr = TAILQ_FIRST(&nd_defrouter); dr; dr = TAILQ_NEXT(dr, dr_entry)) { if (rtpref(n) > rtpref(dr) || - (ip6_doscopedroute && !scoped && rtpref(n) == rtpref(dr))) + (!scoped && rtpref(n) == rtpref(dr))) break; } if (dr) @@ -1859,45 +1806,6 @@ defrtrlist_update(struct nd_defrouter *new) return (dr); } -static void -defrtrlist_sync(struct ifnet *ifp) -{ - struct nd_defrouter *dr, new; - - lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); - - if (!ip6_doscopedroute) { - defrouter_select(ifp); - return; - } - - for (dr = TAILQ_FIRST(&nd_defrouter); dr; - dr = TAILQ_NEXT(dr, dr_entry)) { - NDDR_LOCK(dr); - if (dr->ifp == ifp && (dr->stateflags & NDDRF_INSTALLED)) - break; - NDDR_UNLOCK(dr); - } - - if (dr == NULL) { - defrouter_select(ifp); - } else { - memcpy(&new.rtaddr, &dr->rtaddr, sizeof (new.rtaddr)); - new.flags = dr->flags; - new.stateflags = dr->stateflags; - new.stateflags &= ~NDDRF_PROCESSED; - new.rtlifetime = dr->rtlifetime; - new.expire = dr->expire; - new.ifp = dr->ifp; - new.genid = dr->genid; - new.err = dr->err; - NDDR_UNLOCK(dr); - dr = defrtrlist_update_common(&new, FALSE); - if (dr) - NDDR_REMREF(dr); - } -} - static struct nd_pfxrouter * pfxrtr_lookup(struct nd_prefix *pr, struct 
nd_defrouter *dr) { @@ -1950,7 +1858,7 @@ pfxrtr_del(struct nd_pfxrouter *pfr, struct nd_prefix *pr) /* * The routine has been modified to atomically refresh expiry * time for nd6 prefix as the part of lookup. - * rdar://20339655 explains the corner case where a system going + * There's a corner case where a system going * in sleep gets rid of manual addresses configured in the system * and then schedules the prefix for deletion. * However before the prefix gets deleted, if system comes out @@ -2049,6 +1957,11 @@ nd6_prelist_add(struct nd_prefix *pr, struct nd_defrouter *dr, new->ndpr_debug |= IFD_ATTACHED; NDPR_ADDREF(new); /* for nd_prefix list */ + lck_mtx_lock(&ndi->lock); + ndi->nprefixes++; + VERIFY(ndi->nprefixes != 0); + lck_mtx_unlock(&ndi->lock); + /* ND_OPT_PI_FLAG_ONLINK processing */ if (new->ndpr_raf_onlink) { int e; @@ -2068,11 +1981,6 @@ nd6_prelist_add(struct nd_prefix *pr, struct nd_defrouter *dr, pfxrtr_add(new, dr); } - lck_mtx_lock(&ndi->lock); - ndi->nprefixes++; - VERIFY(ndi->nprefixes != 0); - lck_mtx_unlock(&ndi->lock); - lck_mtx_unlock(nd6_mutex); return (0); @@ -2828,7 +2736,11 @@ find_pfxlist_reachable_router(struct nd_prefix *pr) pfxrtr = LIST_FIRST(&pr->ndpr_advrtrs); while (pfxrtr) { ifp = pfxrtr->router->ifp; - rtaddr = pfxrtr->router->rtaddr; + if (pfxrtr->router->stateflags & NDDRF_MAPPED) + rtaddr = pfxrtr->router->rtaddr_mapped; + else + rtaddr = pfxrtr->router->rtaddr; + NDPR_UNLOCK(pr); lck_mtx_unlock(nd6_mutex); /* Callee returns a locked route upon success */ @@ -3160,7 +3072,7 @@ pfxlist_onlink_check(void) IFA_LOCK(&ifa->ia_ifa); if (ifa->ia6_flags & IN6_IFF_DETACHED) { ifa->ia6_flags &= ~IN6_IFF_DETACHED; - ifa->ia6_flags |= IN6_IFF_TENTATIVE; + in6_ifaddr_set_dadprogress((struct in6_ifaddr *)ifa); IFA_UNLOCK(&ifa->ia_ifa); nd6_dad_start((struct ifaddr *)ifa, 0); } else { @@ -3184,7 +3096,7 @@ pfxlist_onlink_check(void) } if (ifa->ia6_flags & IN6_IFF_DETACHED) { ifa->ia6_flags &= ~IN6_IFF_DETACHED; - ifa->ia6_flags |= 
IN6_IFF_TENTATIVE; + in6_ifaddr_set_dadprogress((struct in6_ifaddr *)ifa); IFA_UNLOCK(&ifa->ia_ifa); /* Do we need a delay in this case? */ nd6_dad_start((struct ifaddr *)ifa, 0); @@ -3239,7 +3151,7 @@ nd6_prefix_sync(struct ifnet *ifp) lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); - if (!ip6_doscopedroute || ifp == NULL) + if (ifp == NULL) return; for (pr = nd_prefix.lh_first; pr; pr = pr->ndpr_next) { @@ -3364,12 +3276,7 @@ nd6_prefix_onlink_common(struct nd_prefix *pr, boolean_t force_scoped, if (opr != NULL) NDPR_REMREF(opr); - if (!ip6_doscopedroute) { - /* if an interface route already exists, just return */ - if (opr != NULL) - return (0); - ifscope = IFSCOPE_NONE; - } else if (!force_scoped) { + if (!force_scoped) { /* * If a primary/non-scoped interface route already exists, * install the new one as a scoped entry. If the existing @@ -3554,7 +3461,6 @@ nd6_prefix_offlink(struct nd_prefix *pr) { int plen, error = 0, prproxy; struct ifnet *ifp = pr->ndpr_ifp; - struct nd_prefix *opr; struct sockaddr_in6 sa6, mask6, prefix; struct rtentry *rt = NULL, *ndpr_rt = NULL; unsigned int ifscope; @@ -3602,66 +3508,6 @@ nd6_prefix_offlink(struct nd_prefix *pr) RT_UNLOCK(rt); rtfree(rt); - /* - * The following check takes place only when Scoped Routing - * is not enabled. There might be the same prefix on another - * interface, the prefix which could not be on-link just - * because we have the interface route (see comments in - * nd6_prefix_onlink). If there's one, try to make the prefix - * on-link on the interface. - */ - lck_mtx_lock(nd6_mutex); - opr = nd_prefix.lh_first; - while (opr) { - /* does not apply in the Scoped Routing case */ - if (ip6_doscopedroute) - break; - - if (opr == pr) { - opr = opr->ndpr_next; - continue; - } - - NDPR_LOCK(opr); - if ((opr->ndpr_stateflags & NDPRF_ONLINK) != 0) { - NDPR_UNLOCK(opr); - opr = opr->ndpr_next; - continue; - } - /* - * KAME specific: detached prefixes should not be - * on-link. 
- */ - if ((opr->ndpr_stateflags & NDPRF_DETACHED) != 0) { - NDPR_UNLOCK(opr); - opr = opr->ndpr_next; - continue; - } - if (opr->ndpr_plen == plen && - in6_are_prefix_equal(&prefix.sin6_addr, - &opr->ndpr_prefix.sin6_addr, plen)) { - int e; - - NDPR_ADDREF_LOCKED(opr); - NDPR_UNLOCK(opr); - if ((e = nd6_prefix_onlink(opr)) != 0) { - nd6log((LOG_ERR, - "nd6_prefix_offlink: failed to " - "recover a prefix %s/%d from %s " - "to %s (errno = %d)\n", - ip6_sprintf( - &opr->ndpr_prefix.sin6_addr), - opr->ndpr_plen, if_name(ifp), - if_name(opr->ndpr_ifp), e)); - } - NDPR_REMREF(opr); - opr = nd_prefix.lh_first; - } else { - NDPR_UNLOCK(opr); - opr = opr->ndpr_next; - } - } - lck_mtx_unlock(nd6_mutex); } else { nd6log((LOG_ERR, "nd6_prefix_offlink: failed to delete route: " @@ -3800,7 +3646,18 @@ in6_pfx_newpersistaddr(struct nd_prefix *pr, int mcast, int *errorp) ia6 = NULL; } else { in6_cga_node_lock(); - error = in6_cga_generate(NULL, 0, &ifra.ifra_addr.sin6_addr); + struct in6_cga_prepare local_cga_prepare; + + if (ndi->cga_initialized) { + bcopy(&(ndi->local_cga_modifier), + &(local_cga_prepare.cga_modifier), + sizeof(local_cga_prepare.cga_modifier)); + error = in6_cga_generate(&local_cga_prepare, 0, + &ifra.ifra_addr.sin6_addr); + } else { + error = in6_cga_generate(NULL, 0, + &ifra.ifra_addr.sin6_addr); + } in6_cga_node_unlock(); if (error == 0) ifra.ifra_flags |= IN6_IFF_SECURED; @@ -4200,8 +4057,12 @@ nd6_setdefaultiface( * we do this here to avoid re-install the default route * if the list is NOT empty. 
*/ - if (ip6_doscopedroute || TAILQ_FIRST(&nd_defrouter) == NULL) { - defrtrlist_sync(nd6_defifp); + if (odef_ifp != NULL) { + defrouter_select(odef_ifp); + } + + if (nd6_defifp != NULL) { + defrouter_select(nd6_defifp); nd6_prefix_sync(nd6_defifp); } @@ -4213,6 +4074,5 @@ nd6_setdefaultiface( scope6_setdefault(nd6_defifp); } lck_mtx_unlock(nd6_mutex); - return (error); } diff --git a/bsd/netinet6/nd6_send.c b/bsd/netinet6/nd6_send.c index 04f5223d4..2b99a3bf4 100644 --- a/bsd/netinet6/nd6_send.c +++ b/bsd/netinet6/nd6_send.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013 Apple Inc. All rights reserved. + * Copyright (c) 2013-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -52,8 +52,7 @@ SYSCTL_DECL(_net_inet6); /* Note: Not in any common header. */ SYSCTL_NODE(_net_inet6, OID_AUTO, send, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "IPv6 Secure Neighbor Discovery"); -static int nd6_send_opmode = ND6_SEND_OPMODE_DISABLED; - +static int nd6_send_opmode = ND6_SEND_OPMODE_CGA_QUIET; SYSCTL_INT(_net_inet6_send, OID_AUTO, opstate, CTLFLAG_RD | CTLFLAG_LOCKED, &nd6_send_opstate, 0, "current SEND operating state"); @@ -74,8 +73,8 @@ SYSCTL_PROC(_net_inet6_send, OID_AUTO, cga_parameters, * userland and the kernel will be mismatched between ILP32 and LP64. */ #define SYSCTL_CGA_PARAMETERS_BUFFER_SIZE \ - 2 * (sizeof (u_int16_t) + IN6_CGA_KEY_MAXSIZE) + \ - sizeof (struct in6_cga_prepare) + (2 * (sizeof (u_int16_t) + IN6_CGA_KEY_MAXSIZE) + \ + sizeof (struct in6_cga_prepare)) static int sysctl_cga_parameters SYSCTL_HANDLER_ARGS diff --git a/bsd/netinet6/nd6_var.h b/bsd/netinet6/nd6_var.h index 0743c0a94..0a92f8bc8 100644 --- a/bsd/netinet6/nd6_var.h +++ b/bsd/netinet6/nd6_var.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015 Apple Inc. All rights reserved. + * Copyright (c) 2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -77,6 +77,7 @@ struct nd_ifinfo { /* keep track of routers and prefixes on this link */ int32_t nprefixes; int32_t ndefrouters; + boolean_t cga_initialized; struct in6_cga_modifier local_cga_modifier; }; #endif /* BSD_KERNEL_PRIVATE */ diff --git a/bsd/netinet6/raw_ip6.c b/bsd/netinet6/raw_ip6.c index 3cee2fb21..4fff23a3c 100644 --- a/bsd/netinet6/raw_ip6.c +++ b/bsd/netinet6/raw_ip6.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2000-2014 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. 
- * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* @@ -108,6 +108,7 @@ #include #include #include +#include #include #include #include @@ -117,7 +118,6 @@ #include #include #include -#include #if IPSEC #include @@ -211,7 +211,7 @@ rip6_input( m_freem(opts); last = in6p; continue; - } + } } /* strip intermediate headers */ m_adj(n, *offp); @@ -247,7 +247,7 @@ rip6_input( ip6stat.ip6s_delivered--; goto unlock; } - + } /* strip intermediate headers */ m_adj(m, *offp); @@ -298,9 +298,10 @@ rip6_ctlinput( if ((unsigned)cmd >= PRC_NCMDS) return; - if (PRC_IS_REDIRECT(cmd)) - notify = in6_rtchange, d = NULL; - else if (cmd == PRC_HOSTDEAD) + if (PRC_IS_REDIRECT(cmd)) { + notify = in6_rtchange; + d = NULL; + } else if (cmd == PRC_HOSTDEAD) d = NULL; else if (inet6ctlerrmap[cmd] == 0) return; @@ -344,9 +345,10 @@ rip6_output( struct ip6_moptions *im6o = NULL; struct ifnet *oifp = NULL; int type = 0, code = 0; /* for ICMPv6 output statistics only */ - mbuf_svc_class_t msc = MBUF_SC_UNSPEC; + int sotc = SO_TC_UNSPEC; + int netsvctype = _NET_SERVICE_TYPE_UNSPEC; struct ip6_out_args ip6oa = - { IFSCOPE_NONE, { 0 }, IP6OAF_SELECT_SRCIF, 0 }; + { IFSCOPE_NONE, { 0 }, IP6OAF_SELECT_SRCIF, 0, 0, 0 }; int flags = IPV6_OUTARGS; in6p = sotoin6pcb(so); @@ -377,10 +379,12 @@ rip6_output( ip6oa.ip6oa_flags |= IP6OAF_NO_EXPENSIVE; if (INP_AWDL_UNRESTRICTED(in6p)) ip6oa.ip6oa_flags |= IP6OAF_AWDL_UNRESTRICTED; + if (INP_INTCOPROC_ALLOWED(in6p)) + ip6oa.ip6oa_flags |= IP6OAF_INTCOPROC_ALLOWED; dst = &dstsock->sin6_addr; if (control) { - msc = mbuf_service_class_from_control(control); + sotc = so_tc_from_control(control, &netsvctype); if ((error = ip6_setpktopts(control, &opt, NULL, SOCK_PROTO(so))) != 0) @@ -388,6 +392,12 @@ rip6_output( optp = &opt; } else optp = in6p->in6p_outputopts; + if (sotc == SO_TC_UNSPEC) { + sotc = so->so_traffic_class; + netsvctype = so->so_netsvctype; + } + ip6oa.ip6oa_sotc = sotc; + ip6oa.ip6oa_netsvctype = netsvctype; /* * For an ICMPv6 packet, we should 
know its type and code @@ -459,8 +469,8 @@ rip6_output( ifnet_reference(oifp); ip6->ip6_dst.s6_addr16[1] = htons(oifp->if_index); } else if (dstsock->sin6_scope_id) { - /* - * boundary check + /* + * boundary check * * Sinced stsock->sin6_scope_id is unsigned, we don't * need to check if it's < 0 @@ -554,6 +564,39 @@ rip6_output( { necp_kernel_policy_id policy_id; u_int32_t route_rule_id; + + /* + * We need a route to perform NECP route rule checks + */ + if (net_qos_policy_restricted != 0 && + ROUTE_UNUSABLE(&in6p->in6p_route)) { + struct sockaddr_in6 to; + struct sockaddr_in6 from; + + ROUTE_RELEASE(&in6p->in6p_route); + + bzero(&from, sizeof(struct sockaddr_in6)); + from.sin6_family = AF_INET6; + from.sin6_len = sizeof(struct sockaddr_in6); + from.sin6_addr = ip6->ip6_src; + + bzero(&to, sizeof(struct sockaddr_in6)); + to.sin6_family = AF_INET6; + to.sin6_len = sizeof(struct sockaddr_in6); + to.sin6_addr = ip6->ip6_dst; + + in6p->in6p_route.ro_dst.sin6_family = AF_INET6; + in6p->in6p_route.ro_dst.sin6_len = sizeof(struct sockaddr_in6); + ((struct sockaddr_in6 *)(void *)&in6p->in6p_route.ro_dst)->sin6_addr = + ip6->ip6_dst; + + rtalloc_scoped((struct route *)&in6p->in6p_route, ip6oa.ip6oa_boundif); + + inp_update_necp_policy(in6p, (struct sockaddr *)&from, + (struct sockaddr *)&to, ip6oa.ip6oa_boundif); + in6p->inp_policyresult.results.qos_marking_gencount = 0; + } + if (!necp_socket_is_allowed_to_send_recv_v6(in6p, 0, 0, &ip6->ip6_src, &ip6->ip6_dst, NULL, &policy_id, &route_rule_id)) { error = EHOSTUNREACH; @@ -561,8 +604,15 @@ rip6_output( } necp_mark_packet_from_socket(m, in6p, policy_id, route_rule_id); + + if (net_qos_policy_restricted != 0) { + necp_socket_update_qos_marking(in6p, in6p->in6p_route.ro_rt, + NULL, route_rule_id); + } } #endif /* NECP */ + if ((so->so_flags1 & SOF1_QOSMARKING_ALLOWED)) + ip6oa.ip6oa_flags |= IP6OAF_QOSMARKING_ALLOWED; #if IPSEC if (in6p->in6p_sp != NULL && ipsec_setsocket(m, so) != 0) { @@ -579,7 +629,7 @@ rip6_output( oifp = 
NULL; } - set_packet_service_class(m, so, msc, PKT_SCF_IPV6); + set_packet_service_class(m, so, sotc, PKT_SCF_IPV6); m->m_pkthdr.pkt_flowsrc = FLOWSRC_INPCB; m->m_pkthdr.pkt_flowid = in6p->inp_flowhash; m->m_pkthdr.pkt_flags |= (PKTF_FLOW_ID | PKTF_FLOW_LOCALSRC | @@ -659,14 +709,6 @@ rip6_output( return(error); } -#if IPFW2 -__private_extern__ void -load_ip6fw(void) -{ - ip6_fw_init(); -} -#endif - /* * Raw IPv6 socket option processing. */ @@ -693,17 +735,6 @@ rip6_ctloutput( switch (sopt->sopt_dir) { case SOPT_GET: switch (sopt->sopt_name) { -#if IPFW2 - case IPV6_FW_ADD: - case IPV6_FW_GET: - if (ip6_fw_ctl_ptr == 0) - load_ip6fw(); - if (ip6_fw_ctl_ptr) - error = ip6_fw_ctl_ptr(sopt); - else - error = ENOPROTOOPT; - break; -#endif case IPV6_CHECKSUM: error = ip6_raw_ctloutput(so, sopt); break; @@ -715,20 +746,6 @@ rip6_ctloutput( case SOPT_SET: switch (sopt->sopt_name) { -#if IPFW2 - case IPV6_FW_ADD: - case IPV6_FW_DEL: - case IPV6_FW_FLUSH: - case IPV6_FW_ZERO: - if (ip6_fw_ctl_ptr == 0) - load_ip6fw(); - if (ip6_fw_ctl_ptr) - error = ip6_fw_ctl_ptr(sopt); - else - error = ENOPROTOOPT; - break; -#endif - case IPV6_CHECKSUM: error = ip6_raw_ctloutput(so, sopt); break; @@ -907,6 +924,10 @@ rip6_connect(struct socket *so, struct sockaddr *nam, __unused struct proc *p) } #endif + /* KAME hack: embed scopeid */ + if (in6_embedscope(&SIN6(nam)->sin6_addr, SIN6(nam), inp, NULL, NULL) != 0) + return (EINVAL); + ifscope = (inp->inp_flags & INP_BOUND_IF) ? inp->inp_boundifp->if_index : IFSCOPE_NONE; diff --git a/bsd/netinet6/udp6_output.c b/bsd/netinet6/udp6_output.c index a8d0e2fb3..c0cd894e4 100644 --- a/bsd/netinet6/udp6_output.c +++ b/bsd/netinet6/udp6_output.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2000-2014 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. 
- * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ @@ -117,6 +117,7 @@ #include #include #include +#include #include #include #include @@ -158,9 +159,10 @@ udp6_output(struct in6pcb *in6p, struct mbuf *m, struct sockaddr *addr6, int flags; struct sockaddr_in6 tmp; struct in6_addr storage; - mbuf_svc_class_t msc = MBUF_SC_UNSPEC; + int sotc = SO_TC_UNSPEC; + int netsvctype = _NET_SERVICE_TYPE_UNSPEC; struct ip6_out_args ip6oa = - { IFSCOPE_NONE, { 0 }, IP6OAF_SELECT_SRCIF, 0 }; + { IFSCOPE_NONE, { 0 }, IP6OAF_SELECT_SRCIF, 0, 0, 0 }; struct flowadv *adv = &ip6oa.ip6oa_flowadv; struct socket *so = in6p->in6p_socket; struct route_in6 ro; @@ -184,9 +186,11 @@ udp6_output(struct in6pcb *in6p, struct mbuf *m, struct sockaddr *addr6, ip6oa.ip6oa_flags |= IP6OAF_NO_EXPENSIVE; if (INP_AWDL_UNRESTRICTED(in6p)) ip6oa.ip6oa_flags |= IP6OAF_AWDL_UNRESTRICTED; + if (INP_INTCOPROC_ALLOWED(in6p)) + ip6oa.ip6oa_flags |= IP6OAF_INTCOPROC_ALLOWED; if (control) { - msc = mbuf_service_class_from_control(control); + sotc = so_tc_from_control(control, &netsvctype); if ((error = ip6_setpktopts(control, &opt, NULL, IPPROTO_UDP)) != 0) goto release; @@ -194,6 +198,13 @@ udp6_output(struct in6pcb *in6p, struct mbuf *m, struct sockaddr *addr6, } else optp = in6p->in6p_outputopts; + if (sotc == SO_TC_UNSPEC) { + sotc = so->so_traffic_class; + netsvctype = so->so_netsvctype; + } + ip6oa.ip6oa_sotc = sotc; + ip6oa.ip6oa_netsvctype = netsvctype; + if (addr6) { /* * IPv4 version of udp_output calls in_pcbconnect in this case, @@ -356,14 +367,54 @@ udp6_output(struct in6pcb *in6p, struct mbuf *m, struct sockaddr *addr6, { necp_kernel_policy_id policy_id; u_int32_t route_rule_id; + + /* + * We need a route to perform NECP route rule checks + */ + if (net_qos_policy_restricted != 0 && + ROUTE_UNUSABLE(&in6p->inp_route)) { + struct sockaddr_in6 to; + struct sockaddr_in6 from; + + ROUTE_RELEASE(&in6p->inp_route); + + bzero(&from, sizeof(struct sockaddr_in6)); + from.sin6_family = AF_INET6; + 
from.sin6_len = sizeof(struct sockaddr_in6); + from.sin6_addr = *laddr; + + bzero(&to, sizeof(struct sockaddr_in6)); + to.sin6_family = AF_INET6; + to.sin6_len = sizeof(struct sockaddr_in6); + to.sin6_addr = *faddr; + + in6p->inp_route.ro_dst.sa_family = AF_INET6; + in6p->inp_route.ro_dst.sa_len = sizeof(struct sockaddr_in6); + ((struct sockaddr_in6 *)(void *)&in6p->inp_route.ro_dst)->sin6_addr = + *faddr; + + rtalloc_scoped(&in6p->inp_route, ip6oa.ip6oa_boundif); + + inp_update_necp_policy(in6p, (struct sockaddr *)&from, + (struct sockaddr *)&to, ip6oa.ip6oa_boundif); + in6p->inp_policyresult.results.qos_marking_gencount = 0; + } + if (!necp_socket_is_allowed_to_send_recv_v6(in6p, in6p->in6p_lport, fport, laddr, faddr, NULL, &policy_id, &route_rule_id)) { error = EHOSTUNREACH; goto release; } necp_mark_packet_from_socket(m, in6p, policy_id, route_rule_id); + + if (net_qos_policy_restricted != 0) { + necp_socket_update_qos_marking(in6p, in6p->in6p_route.ro_rt, + NULL, route_rule_id); + } } #endif /* NECP */ + if ((so->so_flags1 & SOF1_QOSMARKING_ALLOWED)) + ip6oa.ip6oa_flags |= IP6OAF_QOSMARKING_ALLOWED; #if IPSEC if (in6p->in6p_sp != NULL && ipsec_setsocket(m, so) != 0) { @@ -380,7 +431,7 @@ udp6_output(struct in6pcb *in6p, struct mbuf *m, struct sockaddr *addr6, /* Copy the cached route and take an extra reference */ in6p_route_copyout(in6p, &ro); - set_packet_service_class(m, so, msc, PKT_SCF_IPV6); + set_packet_service_class(m, so, sotc, PKT_SCF_IPV6); m->m_pkthdr.pkt_flowsrc = FLOWSRC_INPCB; m->m_pkthdr.pkt_flowid = in6p->inp_flowhash; @@ -484,7 +535,7 @@ udp6_output(struct in6pcb *in6p, struct mbuf *m, struct sockaddr *addr6, ifnet_hdrlen(outif) + ifnet_packetpreamblelen(outif), sizeof(u_int32_t)); - } + } } else { ROUTE_RELEASE(&in6p->in6p_route); } diff --git a/bsd/netinet6/udp6_usrreq.c b/bsd/netinet6/udp6_usrreq.c index f4b1f11cc..aa1964dfc 100644 --- a/bsd/netinet6/udp6_usrreq.c +++ b/bsd/netinet6/udp6_usrreq.c @@ -2,7 +2,7 @@ * Copyright (c) 2000-2015 
Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. 
- * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ @@ -165,25 +165,6 @@ static void udp6_append(struct inpcb *, struct ip6_hdr *, struct sockaddr_in6 *, struct mbuf *, int, struct ifnet *); static int udp6_input_checksum(struct mbuf *, struct udphdr *, int, int); -#if IPFIREWALL -extern int fw_verbose; -extern void ipfwsyslog( int level, const char *format,...); -extern void ipfw_stealth_stats_incr_udpv6(void); - -/* Apple logging, log to ipfw.log */ -#define log_in_vain_log(a) { \ - if ((udp_log_in_vain == 3) && (fw_verbose == 2)) { \ - ipfwsyslog a; \ - } else if ((udp_log_in_vain == 4) && (fw_verbose == 2)) { \ - ipfw_stealth_stats_incr_udpv6(); \ - } else { \ - log a; \ - } \ -} -#else /* !IPFIREWALL */ -#define log_in_vain_log( a ) { log a; } -#endif /* !IPFIREWALL */ - struct pr_usrreqs udp6_usrreqs = { .pru_abort = udp6_abort, .pru_attach = udp6_attach, @@ -548,11 +529,11 @@ udp6_input(struct mbuf **mp, int *offp, int proto) ntohs(uh->uh_sport)); } else if (!(m->m_flags & (M_BCAST | M_MCAST)) && !IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) { - log_in_vain_log((LOG_INFO, "Connection attempt " + log(LOG_INFO, "Connection attempt " "to UDP %s:%d from %s:%d\n", buf, ntohs(uh->uh_dport), ip6_sprintf(&ip6->ip6_src), - ntohs(uh->uh_sport))); + ntohs(uh->uh_sport)); } } udpstat.udps_noport++; @@ -643,9 +624,10 @@ udp6_ctlinput(int cmd, struct sockaddr *sa, void *d) if ((unsigned)cmd >= PRC_NCMDS) return; - if (PRC_IS_REDIRECT(cmd)) - notify = in6_rtchange, d = NULL; - else if (cmd == PRC_HOSTDEAD) + if (PRC_IS_REDIRECT(cmd)) { + notify = in6_rtchange; + d = NULL; + } else if (cmd == PRC_HOSTDEAD) d = NULL; else if (inet6ctlerrmap[cmd] == 0) return; @@ -1016,9 +998,9 @@ udp6_input_checksum(struct mbuf *m, struct udphdr *uh, int off, int ulen) uh->uh_sum == 0) { /* UDP/IPv6 checksum is mandatory (RFC2460) */ - /* + /* * If checksum was already validated, ignore this check. 
- * This is necessary for transport-mode ESP, which may be + * This is necessary for transport-mode ESP, which may be * getting UDP payloads without checksums when the network * has a NAT64. */ diff --git a/bsd/netkey/Makefile b/bsd/netkey/Makefile index 6152b0760..272aedd3a 100644 --- a/bsd/netkey/Makefile +++ b/bsd/netkey/Makefile @@ -3,7 +3,6 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - include $(MakeInc_cmd) include $(MakeInc_def) @@ -30,5 +29,3 @@ INSTALL_KF_MI_LCL_LIST = ${DATAFILES} ${PRIVATE_DATAFILES} ${PRIVATE_KERNELFILES include $(MakeInc_rule) include $(MakeInc_dir) - - diff --git a/bsd/netkey/key.c b/bsd/netkey/key.c index 7a0c50b89..1b4e6420b 100644 --- a/bsd/netkey/key.c +++ b/bsd/netkey/key.c @@ -172,7 +172,8 @@ static int ipsec_sav_count = 0; static u_int32_t acq_seq = 0; static int key_tick_init_random = 0; -__private_extern__ u_int32_t natt_now = 0; +static u_int64_t up_time = 0; +__private_extern__ u_int64_t natt_now = 0; static LIST_HEAD(_sptree, secpolicy) sptree[IPSEC_DIR_MAX]; /* SPD */ static LIST_HEAD(_sahtree, secashead) sahtree; /* SAD */ @@ -594,7 +595,6 @@ extern int ipsec_bypass; extern int esp_udp_encap_port; int ipsec_send_natt_keepalive(struct secasvar *sav); bool ipsec_fill_offload_frame(ifnet_t ifp, struct secasvar *sav, struct ifnet_keepalive_offload_frame *frame, size_t frame_data_offset); -u_int32_t key_fill_offload_frames_for_savs (ifnet_t ifp, struct ifnet_keepalive_offload_frame *frames_array, u_int32_t frames_array_count, size_t frame_data_offset); void key_init(struct protosw *, struct domain *); @@ -6214,8 +6214,10 @@ key_timehandler(void) key_tick_init_random = 0; key_srandom(); } - - natt_now++; + + uint64_t acc_sleep_time = 0; + absolutetime_to_nanoseconds(mach_absolutetime_asleep, &acc_sleep_time); + natt_now = ++up_time + (acc_sleep_time / NSEC_PER_SEC); lck_mtx_unlock(sadb_mutex); @@ -6357,7 +6359,6 @@ 
key_satype2proto( return IPPROTO_ESP; case SADB_X_SATYPE_IPCOMP: return IPPROTO_IPCOMP; - break; default: return 0; } @@ -6380,7 +6381,6 @@ key_proto2satype( return SADB_SATYPE_ESP; case IPPROTO_IPCOMP: return SADB_X_SATYPE_IPCOMP; - break; default: return 0; } @@ -7075,7 +7075,7 @@ key_migrate(struct socket *so, sav->natt_interval = ((const struct sadb_sa_2*)(sa0))->sadb_sa_natt_interval; sav->natt_offload_interval = ((const struct sadb_sa_2*)(sa0))->sadb_sa_natt_offload_interval; sav->natt_last_activity = natt_now; - + /* * Verify if SADB_X_EXT_NATT_MULTIPLEUSERS flag is set that * SADB_X_EXT_NATT is set and SADB_X_EXT_NATT_KEEPALIVE is not diff --git a/bsd/netkey/key.h b/bsd/netkey/key.h index 82c97c639..aec0ae52d 100644 --- a/bsd/netkey/key.h +++ b/bsd/netkey/key.h @@ -108,6 +108,12 @@ extern void key_delsav(struct secasvar *sav); extern struct secpolicy *key_getspbyid(u_int32_t); extern void key_delsp_for_ipsec_if(ifnet_t ipsec_if); +struct ifnet; +struct ifnet_keepalive_offload_frame; +extern u_int32_t key_fill_offload_frames_for_savs(struct ifnet *, + struct ifnet_keepalive_offload_frame *frames_array, u_int32_t, size_t); + + #endif /* BSD_KERNEL_PRIVATE */ #endif /* _NETKEY_KEY_H_ */ diff --git a/bsd/netkey/keydb.c b/bsd/netkey/keydb.c index f1d5c830b..eba59f728 100644 --- a/bsd/netkey/keydb.c +++ b/bsd/netkey/keydb.c @@ -1,3 +1,31 @@ +/* + * Copyright (c) 2016 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + /* $KAME: keydb.c,v 1.61 2000/03/25 07:24:13 sumikawa Exp $ */ /* @@ -59,7 +87,7 @@ MALLOC_DEFINE(M_SECA, "key mgmt", "security associations, key management"); * secpolicy management */ struct secpolicy * -keydb_newsecpolicy() +keydb_newsecpolicy(void) { struct secpolicy *p; @@ -70,10 +98,8 @@ keydb_newsecpolicy() } void -keydb_delsecpolicy(p) - struct secpolicy *p; +keydb_delsecpolicy(struct secpolicy *p) { - _FREE(p, M_SECA); } @@ -81,7 +107,7 @@ keydb_delsecpolicy(p) * secashead management */ struct secashead * -keydb_newsecashead() +keydb_newsecashead(void) { struct secashead *p; int i; @@ -170,8 +196,7 @@ keydb_delsecasvar(p) * secreplay management */ struct secreplay * -keydb_newsecreplay(wsize) - size_t wsize; +keydb_newsecreplay(size_t wsize) { struct secreplay *p; @@ -205,10 +230,8 @@ keydb_newsecreplay(wsize) } void -keydb_delsecreplay(p) - struct secreplay *p; +keydb_delsecreplay(struct secreplay *p) { - if (p->bitmap) _FREE(p->bitmap, M_SECA); _FREE(p, M_SECA); diff --git 
a/bsd/netkey/keydb.h b/bsd/netkey/keydb.h index 715e5e6fb..c62dc964c 100644 --- a/bsd/netkey/keydb.h +++ b/bsd/netkey/keydb.h @@ -36,7 +36,6 @@ #ifdef BSD_KERNEL_PRIVATE #include -#include /* Security Association Index */ /* NOTE: Ensure to be same address family */ @@ -71,9 +70,6 @@ struct secashead { struct route sa_route; /* route cache */ }; -typedef int (*utun_is_keepalive_func) __P((void *, void *, u_int16_t, u_int32_t, size_t)); -typedef int (*utun_input_func) __P((void *, void *, protocol_family_t family)); - /* Security Association */ struct secasvar { LIST_ENTRY(secasvar) chain; @@ -109,17 +105,13 @@ struct secasvar { struct secashead *sah; /* back pointer to the secashead */ /* Nat Traversal related bits */ - u_int32_t natt_last_activity; + u_int64_t natt_last_activity; u_int16_t remote_ike_port; u_int16_t natt_encapsulated_src_port; /* network byte order */ u_int16_t natt_interval; /* Interval in seconds */ u_int16_t natt_offload_interval; /* Hardware Offload Interval in seconds */ u_int8_t always_expire; /* Send expire/delete messages even if unused */ - - void *utun_pcb; - utun_is_keepalive_func utun_is_keepalive_fn; - utun_input_func utun_in_fn; }; /* replay prevention */ diff --git a/bsd/netkey/keysock.c b/bsd/netkey/keysock.c index dd7e2da45..5502485d3 100644 --- a/bsd/netkey/keysock.c +++ b/bsd/netkey/keysock.c @@ -1,3 +1,31 @@ +/* + * Copyright (c) 2016 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + /* $KAME: keysock.c,v 1.13 2000/03/25 07:24:13 sumikawa Exp $ */ /* @@ -161,10 +189,7 @@ key_output(m, va_alist) * send message to the socket. 
*/ static int -key_sendup0(rp, m, promisc) - struct rawcb *rp; - struct mbuf *m; - int promisc; +key_sendup0(struct rawcb *rp, struct mbuf *m, int promisc) { int error; @@ -210,10 +235,7 @@ key_sendup0(rp, m, promisc) /* so can be NULL if target != KEY_SENDUP_ONE */ int -key_sendup_mbuf(so, m, target) - struct socket *so; - struct mbuf *m; - int target; +key_sendup_mbuf(struct socket *so, struct mbuf *m, int target) { struct mbuf *n; struct keycb *kp; diff --git a/bsd/nfs/Makefile b/bsd/nfs/Makefile index 605a48118..c010c8756 100644 --- a/bsd/nfs/Makefile +++ b/bsd/nfs/Makefile @@ -3,7 +3,6 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - include $(MakeInc_cmd) include $(MakeInc_def) @@ -11,16 +10,15 @@ DATAFILES = \ krpc.h nfs.h nfsdiskless.h nfsm_subs.h nfsmount.h nfsnode.h \ nfs_lock.h nfs_gss.h nfs_ioctl.h \ nfsproto.h nfsrvcache.h rpcv2.h xdr_subs.h +# gss/gss_krb5_mech.h INSTALL_MI_LIST = ${DATAFILES} INSTALL_MI_DIR = nfs -EXPORT_MI_LIST = +EXPORT_MI_LIST = EXPORT_MI_DIR = nfs include $(MakeInc_rule) include $(MakeInc_dir) - - diff --git a/bsd/nfs/gss/ccrypto.c b/bsd/nfs/gss/ccrypto.c new file mode 100644 index 000000000..b7aab7c55 --- /dev/null +++ b/bsd/nfs/gss/ccrypto.c @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2012 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include + +int corecrypto_available(void); + +int +corecrypto_available(void) +{ + return (g_crypto_funcs ? 
1 : 0); +} + +const struct ccmode_cbc * +ccaes_cbc_decrypt_mode(void) +{ + if (g_crypto_funcs) + return (g_crypto_funcs->ccaes_cbc_decrypt); + return (NULL); +} + +const struct ccmode_cbc * +ccaes_cbc_encrypt_mode(void) +{ + if (g_crypto_funcs) + return (g_crypto_funcs->ccaes_cbc_encrypt); + return (NULL); +} + +const struct ccmode_cbc * +ccdes3_cbc_decrypt_mode(void) +{ + if (g_crypto_funcs) + return (g_crypto_funcs->cctdes_cbc_decrypt); + return (NULL); +} + +const struct ccmode_cbc * +ccdes3_cbc_encrypt_mode(void) +{ + if (g_crypto_funcs) + return (g_crypto_funcs->cctdes_cbc_encrypt); + return (NULL); +} + +size_t +ccpad_cts3_decrypt(const struct ccmode_cbc *cbc, cccbc_ctx *cbc_key, + cccbc_iv *iv, size_t nbytes, const void *in, void *out) +{ + if (g_crypto_funcs) + return (*g_crypto_funcs->ccpad_cts3_decrypt_fn)(cbc, cbc_key, iv, nbytes, in, out); + return (0); +} + +size_t +ccpad_cts3_encrypt(const struct ccmode_cbc *cbc, cccbc_ctx *cbc_key, + cccbc_iv *iv, size_t nbytes, const void *in, void *out) +{ + if (g_crypto_funcs) + return (*g_crypto_funcs->ccpad_cts3_encrypt_fn)(cbc, cbc_key, iv, nbytes, in, out); + return (0); +} + +const struct ccdigest_info *ccsha1_ltc_di_ptr; + +const struct ccdigest_info * +ccsha1_di(void) +{ + if (g_crypto_funcs) + return (g_crypto_funcs->ccsha1_di); + return (NULL); +} + +void ccdes_key_set_odd_parity(void *key, unsigned long length) +{ + if (g_crypto_funcs) + (*g_crypto_funcs->ccdes_key_set_odd_parity_fn)(key, length); +} diff --git a/bsd/nfs/gss/gss_krb5_mech.c b/bsd/nfs/gss/gss_krb5_mech.c new file mode 100644 index 000000000..59df39f4a --- /dev/null +++ b/bsd/nfs/gss/gss_krb5_mech.c @@ -0,0 +1,2573 @@ +/* + * Copyright (c) 2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). 
You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * Copyright (c) 1999 Kungliga Tekniska Högskolan + * (Royal Institute of Technology, Stockholm, Sweden). + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of KTH nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY KTH AND ITS CONTRIBUTORS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL KTH OR ITS CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "gss_krb5_mech.h" + +lck_grp_t *gss_krb5_mech_grp; + +typedef struct crypt_walker_ctx { + size_t length; + const struct ccmode_cbc *ccmode; + cccbc_ctx *crypt_ctx; + cccbc_iv *iv; +} *crypt_walker_ctx_t; + +typedef struct hmac_walker_ctx { + const struct ccdigest_info *di; + cchmac_ctx_t hmac_ctx; +} *hmac_walker_ctx_t; + +typedef size_t (*ccpad_func)(const struct ccmode_cbc *, cccbc_ctx *, cccbc_iv *, + size_t nbytes, const void *, void *); + +static int krb5_n_fold(const void *instr, size_t len, void *foldstr, size_t size); + +size_t gss_mbuf_len(mbuf_t, size_t); +errno_t gss_prepend_mbuf(mbuf_t *, uint8_t *, size_t); +errno_t gss_append_mbuf(mbuf_t, uint8_t *, size_t); +errno_t gss_strip_mbuf(mbuf_t, ssize_t); +int mbuf_walk(mbuf_t, size_t, size_t, size_t, int (*)(void *, uint8_t *, uint32_t), void *); + +void do_crypt_init(crypt_walker_ctx_t, int, crypto_ctx_t, cccbc_ctx *); +int do_crypt(void *, uint8_t *, uint32_t); +void do_hmac_init(hmac_walker_ctx_t, crypto_ctx_t, void *); +int do_hmac(void *, uint8_t *, uint32_t); + +void krb5_make_usage(uint32_t, uint8_t, uint8_t 
[KRB5_USAGE_LEN]);
+void krb5_key_derivation(crypto_ctx_t, const void *, size_t, void **, size_t);
+void cc_key_schedule_create(crypto_ctx_t);
+void gss_crypto_ctx_free(crypto_ctx_t);
+int gss_crypto_ctx_init(struct crypto_ctx *, lucid_context_t);
+
+errno_t krb5_crypt_mbuf(crypto_ctx_t, mbuf_t *, uint32_t, int, cccbc_ctx *);
+int krb5_mic(crypto_ctx_t, gss_buffer_t, gss_buffer_t, gss_buffer_t, uint8_t *, int *, int, int);
+int krb5_mic_mbuf(crypto_ctx_t, gss_buffer_t, mbuf_t, uint32_t, uint32_t, gss_buffer_t, uint8_t *, int *, int, int);
+
+uint32_t gss_krb5_cfx_get_mic(uint32_t *, gss_ctx_id_t, gss_qop_t, gss_buffer_t, gss_buffer_t);
+uint32_t gss_krb5_cfx_verify_mic(uint32_t *, gss_ctx_id_t, gss_buffer_t, gss_buffer_t, gss_qop_t *);
+uint32_t gss_krb5_cfx_get_mic_mbuf(uint32_t *, gss_ctx_id_t, gss_qop_t, mbuf_t, size_t, size_t, gss_buffer_t);
+uint32_t gss_krb5_cfx_verify_mic_mbuf(uint32_t *, gss_ctx_id_t, mbuf_t, size_t, size_t, gss_buffer_t, gss_qop_t *);
+errno_t krb5_cfx_crypt_mbuf(crypto_ctx_t, mbuf_t *, size_t *, int, int);
+uint32_t gss_krb5_cfx_wrap_mbuf(uint32_t *, gss_ctx_id_t, int, gss_qop_t, mbuf_t *, size_t, int *);
+uint32_t gss_krb5_cfx_unwrap_mbuf(uint32_t *, gss_ctx_id_t, mbuf_t *, size_t, int *, gss_qop_t *);
+
+int gss_krb5_mech_is_initialized(void);
+void gss_krb5_mech_init(void);
+
+/* Debugging routines */
+void
+printmbuf(const char *str, mbuf_t mb, uint32_t offset, uint32_t len)
+{
+	size_t i;
+	int cout = 1;
+
+	len = len ? len : ~0;
+	printf("%s mbuf = %p offset = %d len = %d:\n", str ? str : "mbuf", mb, offset, len);
+	for (; mb && len; mb = mbuf_next(mb)) {
+		if (offset >= mbuf_len(mb)) {
+			offset -= mbuf_len(mb);
+			continue;
+		}
+		for(i = offset; len && i < mbuf_len(mb); i++) {
+			const char *s = (cout % 8) ? " " : (cout % 16) ? " " : "\n";
+			printf("%02x%s", ((uint8_t *)mbuf_data(mb))[i], s);
+			len--;
+			cout++;
+		}
+		offset = 0;
+	}
+	if ((cout-1) % 16)
+		printf("\n");
+	printf("Count chars %d\n", cout - 1);
+}
+
+void
+printgbuf(const char *str, gss_buffer_t buf)
+{
+	size_t i;
+	size_t len = buf->length > 128 ? 128 : buf->length;
+
+	printf("%s: len = %d value = %p\n", str ? str : "buffer", (int)buf->length, buf->value);
+	for (i = 0; i < len; i++) {
+		const char *s = ((i + 1) % 8) ? " " : ((i + 1) % 16) ? " " : "\n";
+		printf("%02x%s", ((uint8_t *)buf->value)[i], s);
+	}
+	if (i % 16)
+		printf("\n");
+}
+
+/*
+ * Initialize the data structures for the gss kerberos mech.
+ */
+#define	GSS_KRB5_NOT_INITIALIZED	0
+#define	GSS_KRB5_INITIALIZING		1
+#define	GSS_KRB5_INITIALIZED		2
+static volatile uint32_t gss_krb5_mech_initted = GSS_KRB5_NOT_INITIALIZED;
+
+/*
+ * Return non-zero only once the state has reached GSS_KRB5_INITIALIZED.
+ *
+ * gss_krb5_mech_init() polls this predicate while another thread holds the
+ * GSS_KRB5_INITIALIZING state, so it must be false for both
+ * GSS_KRB5_NOT_INITIALIZED and GSS_KRB5_INITIALIZING. Comparing against
+ * GSS_KRB5_NOT_INITIALIZED would leave that polling loop spinning forever.
+ */
+int
+gss_krb5_mech_is_initialized(void)
+{
+	return (gss_krb5_mech_initted == GSS_KRB5_INITIALIZED);
+}
+
+/*
+ * One-time setup of the mech's lock group.
+ *
+ * The caller that wins the NOT_INITIALIZED -> INITIALIZING compare-and-swap
+ * allocates gss_krb5_mech_grp and then publishes GSS_KRB5_INITIALIZED.
+ * Callers that lose the swap sleep via IOSleep(10) until
+ * gss_krb5_mech_is_initialized() reports completion.
+ */
+void
+gss_krb5_mech_init(void)
+{
+	extern void IOSleep(int);
+
+	/* Once initted always initted */
+	if (gss_krb5_mech_initted == GSS_KRB5_INITIALIZED)
+		return;
+
+	/* make sure we init only once */
+	if (!OSCompareAndSwap(GSS_KRB5_NOT_INITIALIZED, GSS_KRB5_INITIALIZING, &gss_krb5_mech_initted)) {
+		/* wait until initialization is complete */
+		while (!gss_krb5_mech_is_initialized())
+			IOSleep(10);
+		return;
+	}
+	gss_krb5_mech_grp = lck_grp_alloc_init("gss_krb5_mech", LCK_GRP_ATTR_NULL);
+	gss_krb5_mech_initted = GSS_KRB5_INITIALIZED;
+}
+
+uint32_t
+gss_release_buffer(uint32_t *minor, gss_buffer_t buf)
+{
+	if (minor)
+		*minor = 0;
+	if (buf->value)
+		FREE(buf->value, M_TEMP);
+	buf->value = NULL;
+	buf->length = 0;
+	return (GSS_S_COMPLETE);
+}
+
+/*
+ * GSS mbuf routines
+ */
+
+size_t
+gss_mbuf_len(mbuf_t mb, size_t offset)
+{
+	size_t len;
+
+	for (len = 0; mb; mb = mbuf_next(mb))
+		len += mbuf_len(mb);
+	return ((offset > len) ?
0 : len - offset); +} + +/* + * Split an mbuf in a chain into two mbufs such that the original mbuf + * points to the original mbuf and the new mbuf points to the rest of the + * chain. The first mbuf length is the first len bytes and the second + * mbuf contains the remaining bytes. if len is zero or equals + * mbuf_len(mb) the don't create a new mbuf. We are already at an mbuf + * boundary. Return the mbuf that starts at the offset. + */ +static errno_t +split_one_mbuf(mbuf_t mb, size_t offset, mbuf_t *nmb, int join) +{ + errno_t error; + + *nmb = mb; + /* We don't have an mbuf or we're alread on an mbuf boundary */ + if (mb == NULL || offset == 0) + return (0); + + /* If the mbuf length is offset then the next mbuf is the one we want */ + if (mbuf_len(mb) == offset) { + *nmb = mbuf_next(mb); + if (!join) + mbuf_setnext(mb, NULL); + return (0); + } + + if (offset > mbuf_len(mb)) + return (EINVAL); + + error = mbuf_split(mb, offset, MBUF_WAITOK, nmb); + if (error) + return (error); + + if (mbuf_flags(*nmb) & MBUF_PKTHDR) { + /* We don't want to copy the pkthdr. mbuf_split does that. */ + error = mbuf_setflags_mask(*nmb, ~MBUF_PKTHDR, MBUF_PKTHDR); + } + + if (join) + /* Join the chain again */ + mbuf_setnext(mb, *nmb); + + return (0); +} + +/* + * Given an mbuf with an offset and length return the chain such that + * offset and offset + *subchain_length are on mbuf boundaries. If + * *mbuf_length is less that the length of the chain after offset + * return that length in *mbuf_length. The mbuf sub chain starting at + * offset is returned in *subchain. If an error occurs return the + * corresponding errno. Note if there are less than offset bytes then + * subchain will be set to NULL and *subchain_length will be set to + * zero. If *subchain_length is 0; then set it to the length of the + * chain starting at offset. 
Join parameter is used to indicate whether + * the mbuf chain will be joined again as on chain, just rearranged so + * that offset and subchain_length are on mbuf boundaries. + */ + +errno_t +gss_normalize_mbuf(mbuf_t chain, size_t offset, size_t *subchain_length, mbuf_t *subchain, mbuf_t *tail, int join) +{ + size_t length = *subchain_length ? *subchain_length : ~0; + size_t len; + mbuf_t mb, nmb; + errno_t error; + + if (tail == NULL) + tail = &nmb; + *tail = NULL; + *subchain = NULL; + + for (len = offset, mb = chain; mb && len > mbuf_len(mb); mb = mbuf_next(mb)) + len -= mbuf_len(mb); + + /* if we don't have offset bytes just return */ + if (mb == NULL) + return (0); + + error = split_one_mbuf(mb, len, subchain, join); + if (error) + return (error); + + assert(subchain != NULL && *subchain != NULL); + assert(offset == 0 ? mb == *subchain : 1); + + len = gss_mbuf_len(*subchain, 0); + length = (length > len) ? len : length; + *subchain_length = length; + + for (len = length, mb = *subchain; mb && len > mbuf_len(mb); mb = mbuf_next(mb)) + len -= mbuf_len(mb); + + error = split_one_mbuf(mb, len, tail, join); + + return (error); +} + +mbuf_t +gss_join_mbuf(mbuf_t head, mbuf_t body, mbuf_t tail) +{ + mbuf_t mb; + + for (mb = head; mb && mbuf_next(mb); mb = mbuf_next(mb)) + ; + if (mb) + mbuf_setnext(mb, body); + for (mb = body; mb && mbuf_next(mb); mb = mbuf_next(mb)) + ; + if (mb) + mbuf_setnext(mb, tail); + mb = head ? head : (body ? body : tail); + return (mb); +} + +/* + * Prepend size bytes to the mbuf chain. 
+ */ +errno_t +gss_prepend_mbuf(mbuf_t *chain, uint8_t *bytes, size_t size) +{ + uint8_t *data = mbuf_data(*chain); + size_t leading = mbuf_leadingspace(*chain); + size_t trailing = mbuf_trailingspace(*chain); + size_t mlen = mbuf_len(*chain); + errno_t error; + + if (size > leading && size <= leading + trailing) { + data = memmove(data + size - leading, data, mlen); + mbuf_setdata(*chain, data, mlen); + } + + error = mbuf_prepend(chain, size, MBUF_WAITOK); + if (error) + return (error); + data = mbuf_data(*chain); + memcpy(data, bytes, size); + + return (0); +} + +errno_t +gss_append_mbuf(mbuf_t chain, uint8_t *bytes, size_t size) +{ + size_t len = 0; + mbuf_t mb; + + if (chain == NULL) + return (EINVAL); + + for (mb = chain; mb; mb = mbuf_next(mb)) + len += mbuf_len(mb); + + return (mbuf_copyback(chain, len, size, bytes, MBUF_WAITOK)); +} + +errno_t +gss_strip_mbuf(mbuf_t chain, ssize_t size) +{ + if (chain == NULL) + return (EINVAL); + + mbuf_adj(chain, size); + + return (0); +} + + +/* + * Kerberos mech generic crypto support for mbufs + */ + +/* + * Walk the mbuf after the given offset calling the passed in crypto function + * for len bytes. Note the length, len should be a multiple of the blocksize and + * there should be at least len bytes available after the offset in the mbuf chain. + * padding should be done before calling this routine. + */ +int +mbuf_walk(mbuf_t mbp, size_t offset, size_t len, size_t blocksize, int (*crypto_fn)(void *, uint8_t *data, uint32_t length), void *ctx) +{ + mbuf_t mb; + size_t mlen, residue; + uint8_t *ptr; + int error = 0; + + /* Move to the start of the chain */ + for (mb = mbp; mb && len > 0; mb = mbuf_next(mb)) { + ptr = mbuf_data(mb); + mlen = mbuf_len(mb); + if (offset >= mlen) { + /* Offset not yet reached */ + offset -= mlen; + continue; + } + /* Found starting point in chain */ + ptr += offset; + mlen -= offset; + offset = 0; + + /* + * Handle the data in this mbuf. 
If the length to + * walk is less than the data in the mbuf, set + * the mbuf length left to be the length left + */ + mlen = mlen < len ? mlen : len; + /* Figure out how much is a multple of blocksize */ + residue = mlen % blocksize; + /* And addjust the mleft length to be the largest multiple of blocksized */ + mlen -= residue; + /* run our hash/encrypt/decrpyt function */ + if (mlen > 0) { + error = crypto_fn(ctx, ptr, mlen); + if (error) + break; + ptr += mlen; + len -= mlen; + } + /* + * If we have a residue then to get a full block for our crypto + * function, we need to copy the residue into our block size + * block and use the next mbuf to get the rest of the data for + * the block. N.B. We generally assume that from the offset + * passed in, that the total length, len, is a multple of + * blocksize and that there are at least len bytes in the chain + * from the offset. We also assume there is at least (blocksize + * - residue) size data in any next mbuf for residue > 0. If not + * we attemp to pullup bytes from down the chain. + */ + if (residue) { + mbuf_t nmb = mbuf_next(mb); + uint8_t *nptr = NULL, block[blocksize]; + + assert(nmb); + len -= residue; + offset = blocksize - residue; + if (len < offset) { + offset = len; + /* + * We don't have enough bytes so zero the block + * so that any trailing bytes will be zero. + */ + cc_clear(sizeof(block), block); + } + memcpy(block, ptr, residue); + if (len && nmb) { + mlen = mbuf_len(nmb); + if (mlen < offset) { + error = mbuf_pullup(&nmb, offset - mlen); + if (error) { + mbuf_setnext(mb, NULL); + return (error); + } + } + nptr = mbuf_data(nmb); + memcpy(block + residue, nptr, offset); + } + len -= offset; + error = crypto_fn(ctx, block, sizeof(block)); + if (error) + break; + memcpy(ptr, block, residue); + if (nptr) + memcpy(nptr, block + residue, offset); + } + } + + return (error); +} + +void +do_crypt_init(crypt_walker_ctx_t wctx, int encrypt, crypto_ctx_t cctx, cccbc_ctx *ks) +{ + wctx->ccmode = encrypt ? 
cctx->enc_mode : cctx->dec_mode;
+
+	/*
+	 * do_crypt() accumulates the number of bytes processed into length.
+	 * Callers such as krb5_crypt_mbuf() hand in a walker context from
+	 * their stack without zeroing it, so start the counter at zero here.
+	 */
+	wctx->length = 0;
+	wctx->crypt_ctx = ks;
+	MALLOC(wctx->iv, cccbc_iv *, wctx->ccmode->block_size, M_TEMP, M_WAITOK|M_ZERO);
+	cccbc_set_iv(wctx->ccmode, wctx->iv, NULL);
+}
+
+/*
+ * mbuf_walk() callback: run the CBC mode chosen by do_crypt_init() in place
+ * over len bytes at data. len is asserted to be a whole number of blocks.
+ */
+int
+do_crypt(void *walker, uint8_t *data, uint32_t len)
+{
+	struct crypt_walker_ctx *wctx = (crypt_walker_ctx_t)walker;
+	uint32_t nblocks;
+
+	nblocks = len / wctx->ccmode->block_size;
+	assert(len % wctx->ccmode->block_size == 0);
+	cccbc_update(wctx->ccmode, wctx->crypt_ctx, wctx->iv, nblocks, data, data);
+	wctx->length += len;
+
+	return (0);
+}
+
+void
+do_hmac_init(hmac_walker_ctx_t wctx, crypto_ctx_t cctx, void *key)
+{
+	size_t alloc_size = cc_ctx_n(struct cchmac_ctx, cchmac_di_size(cctx->di)) * sizeof(struct cchmac_ctx);
+
+	wctx->di = cctx->di;
+	MALLOC(wctx->hmac_ctx.hdr, struct cchmac_ctx *, alloc_size, M_TEMP, M_WAITOK|M_ZERO);
+	cchmac_init(cctx->di, wctx->hmac_ctx, cctx->keylen, key);
+}
+
+int
+do_hmac(void *walker, uint8_t *data, uint32_t len)
+{
+	hmac_walker_ctx_t wctx = (hmac_walker_ctx_t)walker;
+
+	cchmac_update(wctx->di, wctx->hmac_ctx, len, data);
+
+	return (0);
+}
+
+
+int
+krb5_mic(crypto_ctx_t ctx, gss_buffer_t header, gss_buffer_t bp, gss_buffer_t trailer, uint8_t *mic, int *verify, int ikey, int reverse)
+{
+	uint8_t digest[ctx->di->output_size];
+	cchmac_di_decl(ctx->di, hmac_ctx);
+	int kdx = (verify == NULL) ? (reverse ? GSS_RCV : GSS_SND) : (reverse ?
GSS_SND : GSS_RCV); + void *key2use; + + if (ikey) { + if (!(ctx->flags & CRYPTO_KS_ALLOCED)) { + lck_mtx_lock(ctx->lock); + if (!(ctx->flags & CRYPTO_KS_ALLOCED)) { + cc_key_schedule_create(ctx); + } + ctx->flags |= CRYPTO_KS_ALLOCED; + lck_mtx_unlock(ctx->lock); + } + key2use = ctx->ks.ikey[kdx]; + } else { + key2use = ctx->ckey[kdx]; + } + + cchmac_init(ctx->di, hmac_ctx, ctx->keylen, key2use); + + if (header) { + cchmac_update(ctx->di, hmac_ctx, header->length, header->value); + } + + cchmac_update(ctx->di, hmac_ctx, bp->length, bp->value); + + if (trailer) { + cchmac_update(ctx->di, hmac_ctx, trailer->length, trailer->value); + } + + cchmac_final(ctx->di, hmac_ctx, digest); + + if (verify) { + *verify = (memcmp(mic, digest, ctx->digest_size) == 0); + } + else + memcpy(mic, digest, ctx->digest_size); + + return (0); +} + +int +krb5_mic_mbuf(crypto_ctx_t ctx, gss_buffer_t header, + mbuf_t mbp, uint32_t offset, uint32_t len, gss_buffer_t trailer, uint8_t *mic, int *verify, int ikey, int reverse) +{ + struct hmac_walker_ctx wctx; + uint8_t digest[ctx->di->output_size]; + int error; + int kdx = (verify == NULL) ? (reverse ? GSS_RCV : GSS_SND) : (reverse ? 
GSS_SND : GSS_RCV);
+	void *key2use;
+
+	if (ikey) {
+		/* Create the key schedule on first use; re-check the flag under the lock */
+		if (!(ctx->flags & CRYPTO_KS_ALLOCED)) {
+			lck_mtx_lock(ctx->lock);
+			if (!(ctx->flags & CRYPTO_KS_ALLOCED)) {
+				cc_key_schedule_create(ctx);
+			}
+			ctx->flags |= CRYPTO_KS_ALLOCED;
+			lck_mtx_unlock(ctx->lock);
+		}
+		key2use = ctx->ks.ikey[kdx];
+	} else {
+		key2use = ctx->ckey[kdx];
+	}
+
+	/* Allocates wctx.hmac_ctx.hdr; every exit below must release it */
+	do_hmac_init(&wctx, ctx, key2use);
+
+	if (header) {
+		cchmac_update(ctx->di, wctx.hmac_ctx, header->length, header->value);
+	}
+
+	error = mbuf_walk(mbp, offset, len, 1, do_hmac, &wctx);
+
+	if (error) {
+		/* Do not leak the HMAC context when the walk fails */
+		FREE(wctx.hmac_ctx.hdr, M_TEMP);
+		return (error);
+	}
+	if (trailer)
+		cchmac_update(ctx->di, wctx.hmac_ctx, trailer->length, trailer->value);
+
+	cchmac_final(ctx->di, wctx.hmac_ctx, digest);
+	FREE(wctx.hmac_ctx.hdr, M_TEMP);
+
+	if (verify) {
+		/* Verify mode: compare against the caller's MIC and report a mismatch as EBADRPC */
+		*verify = (memcmp(mic, digest, ctx->digest_size) == 0);
+		if (!*verify)
+			return (EBADRPC);
+	} else
+		memcpy(mic, digest, ctx->digest_size);
+
+	return (0);
+}
+
+errno_t /* __attribute__((optnone)) */
+krb5_crypt_mbuf(crypto_ctx_t ctx, mbuf_t *mbp, uint32_t len, int encrypt, cccbc_ctx *ks)
+{
+	struct crypt_walker_ctx wctx;
+	const struct ccmode_cbc *ccmode = encrypt ? ctx->enc_mode : ctx->dec_mode;
+	size_t plen = len;
+	size_t cts_len = 0;
+	mbuf_t mb, lmb;
+	int error;
+
+	if (!(ctx->flags & CRYPTO_KS_ALLOCED)) {
+		lck_mtx_lock(ctx->lock);
+		if (!(ctx->flags & CRYPTO_KS_ALLOCED)) {
+			cc_key_schedule_create(ctx);
+		}
+		ctx->flags |= CRYPTO_KS_ALLOCED;
+		lck_mtx_unlock(ctx->lock);
+	}
+	if (!ks)
+		ks = encrypt ? ctx->ks.enc : ctx->ks.dec;
+
+	if ((ctx->flags & CRYPTO_CTS_ENABLE) && ctx->mpad == 1) {
+		uint8_t block[ccmode->block_size];
+		/* if the length is less than or equal to a blocksize.
We just encrypt the block */
+		if (len <= ccmode->block_size) {
+			if (len < ccmode->block_size) {
+				memset(block, 0, sizeof(block));
+				/* Report append failures, as the padding branch below already does */
+				error = gss_append_mbuf(*mbp, block, ccmode->block_size);
+				if (error)
+					return (error);
+			}
+			plen = ccmode->block_size;
+		} else {
+			/* determine where the last two blocks are */
+			uint32_t r = len % ccmode->block_size;
+
+			cts_len = r ? r + ccmode->block_size : 2 * ccmode->block_size;
+			plen = len - cts_len;
+			/* If plen is 0 we only have two blocks to crypt with ccpad below */
+			if (plen == 0)
+				lmb = *mbp;
+			else {
+				/*
+				 * The walker IV has not been allocated yet, so a failed
+				 * split can simply be reported. Continuing would use
+				 * mb/lmb values that the split did not produce.
+				 */
+				error = gss_normalize_mbuf(*mbp, 0, &plen, &mb, &lmb, 0);
+				if (error)
+					return (error);
+				assert(*mbp == mb);
+				assert(plen == len - cts_len);
+				assert(gss_mbuf_len(mb, 0) == plen);
+				assert(gss_mbuf_len(lmb, 0) == cts_len);
+			}
+		}
+	} else if (len % ctx->mpad) {
+		uint8_t pad_block[ctx->mpad];
+		size_t padlen = ctx->mpad - (len % ctx->mpad);
+
+		memset(pad_block, 0, padlen);
+		error = gss_append_mbuf(*mbp, pad_block, padlen);
+		if (error)
+			return (error);
+		plen = len + padlen;
+	}
+	/* Allocates wctx.iv; every exit from here on must release it */
+	do_crypt_init(&wctx, encrypt, ctx, ks);
+	if (plen) {
+		error = mbuf_walk(*mbp, 0, plen, ccmode->block_size, do_crypt, &wctx);
+		if (error) {
+			/*
+			 * Do not leak the IV when the walk fails.
+			 * NOTE(review): when cts_len != 0 the detached tail (lmb)
+			 * is still not released on this path -- confirm what
+			 * callers expect to own after an error.
+			 */
+			FREE(wctx.iv, M_TEMP);
+			return (error);
+		}
+	}
+
+	if ((ctx->flags & CRYPTO_CTS_ENABLE) && cts_len) {
+		uint8_t cts_pad[2*ccmode->block_size];
+		ccpad_func do_ccpad = encrypt ?
ccpad_cts3_encrypt : ccpad_cts3_decrypt; + + assert(cts_len <= 2*ccmode->block_size && cts_len > ccmode->block_size); + memset(cts_pad, 0, sizeof(cts_pad)); + mbuf_copydata(lmb, 0, cts_len, cts_pad); + mbuf_freem(lmb); + do_ccpad(ccmode, wctx.crypt_ctx, wctx.iv, cts_len, cts_pad, cts_pad); + gss_append_mbuf(*mbp, cts_pad, cts_len); + } + FREE(wctx.iv, M_TEMP); + + return (0); +} + +/* + * Key derivation routines + */ + +static int +rr13(unsigned char *buf, size_t len) +{ + size_t bytes = (len + 7) / 8; + unsigned char tmp[bytes]; + size_t i; + + if(len == 0) + return 0; + + { + const int bits = 13 % len; + const int lbit = len % 8; + + memcpy(tmp, buf, bytes); + if(lbit) { + /* pad final byte with inital bits */ + tmp[bytes - 1] &= 0xff << (8 - lbit); + for(i = lbit; i < 8; i += len) + tmp[bytes - 1] |= buf[0] >> i; + } + for(i = 0; i < bytes; i++) { + ssize_t bb; + ssize_t b1, s1, b2, s2; + + /* calculate first bit position of this byte */ + bb = 8 * i - bits; + while(bb < 0) + bb += len; + /* byte offset and shift count */ + b1 = bb / 8; + s1 = bb % 8; + if((size_t)bb + 8 > bytes * 8) + /* watch for wraparound */ + s2 = (len + 8 - s1) % 8; + else + s2 = 8 - s1; + b2 = (b1 + 1) % bytes; + buf[i] = (tmp[b1] << s1) | (tmp[b2] >> s2); + } + } + return 0; +} + + +/* Add `b' to `a', both being one's complement numbers. 
*/ +static void +add1(unsigned char *a, unsigned char *b, size_t len) +{ + ssize_t i; + int carry = 0; + + for(i = len - 1; i >= 0; i--){ + int x = a[i] + b[i] + carry; + carry = x > 0xff; + a[i] = x & 0xff; + } + for(i = len - 1; carry && i >= 0; i--){ + int x = a[i] + carry; + carry = x > 0xff; + a[i] = x & 0xff; + } +} + + +static int +krb5_n_fold(const void *instr, size_t len, void *foldstr, size_t size) +{ + /* if len < size we need at most N * len bytes, ie < 2 * size; + if len > size we need at most 2 * len */ + int ret = 0; + size_t maxlen = 2 * max(size, len); + size_t l = 0; + unsigned char tmp[maxlen]; + unsigned char buf[len]; + + memcpy(buf, instr, len); + memset(foldstr, 0, size); + do { + memcpy(tmp + l, buf, len); + l += len; + ret = rr13(buf, len * 8); + if (ret) + goto out; + while(l >= size) { + add1(foldstr, tmp, size); + l -= size; + if(l == 0) + break; + memmove(tmp, tmp + size, l); + } + } while(l != 0); +out: + + return ret; +} + +void +krb5_make_usage(uint32_t usage_no, uint8_t suffix, uint8_t usage_string[KRB5_USAGE_LEN]) +{ + uint32_t i; + + for (i = 0; i < 4; i++) + usage_string[i] = ((usage_no >> 8*(3-i)) & 0xff); + usage_string[i] = suffix; +} + +void +krb5_key_derivation(crypto_ctx_t ctx, const void *cons, size_t conslen, void **dkey, size_t dklen) +{ + size_t blocksize = ctx->enc_mode->block_size; + cccbc_iv_decl(blocksize, iv); + cccbc_ctx_decl(ctx->enc_mode->size, enc_ctx); + size_t ksize = 8*dklen; + size_t nblocks = (ksize + 8*blocksize - 1) / (8*blocksize); + uint8_t *dkptr; + uint8_t block[blocksize]; + + MALLOC(*dkey, void *, nblocks * blocksize, M_TEMP, M_WAITOK | M_ZERO); + dkptr = *dkey; + + krb5_n_fold(cons, conslen, block, blocksize); + cccbc_init(ctx->enc_mode, enc_ctx, ctx->keylen, ctx->key); + for (size_t i = 0; i < nblocks; i++) { + cccbc_set_iv(ctx->enc_mode, iv, NULL); + cccbc_update(ctx->enc_mode, enc_ctx, iv, 1, block, block); + memcpy(dkptr, block, blocksize); + dkptr += blocksize; + } +} + +static void 
+des_make_key(const uint8_t rawkey[7], uint8_t deskey[8]) +{ + uint8_t val = 0; + + memcpy(deskey, rawkey, 7); + for (int i = 0; i < 7; i++) + val |= ((deskey[i] & 1) << (i+1)); + deskey[7] = val; + ccdes_key_set_odd_parity(deskey, 8); +} + +static void +krb5_3des_key_derivation(crypto_ctx_t ctx, const void *cons, size_t conslen, void **des3key) +{ + const struct ccmode_cbc *cbcmode = ctx->enc_mode; + void *rawkey; + uint8_t *kptr, *rptr; + + MALLOC(*des3key, void *, 3*cbcmode->block_size, M_TEMP, M_WAITOK | M_ZERO); + krb5_key_derivation(ctx, cons, conslen, &rawkey, 3*(cbcmode->block_size - 1)); + kptr = (uint8_t *)*des3key; + rptr = (uint8_t *)rawkey; + + for (int i = 0; i < 3; i++) { + des_make_key(rptr, kptr); + rptr += cbcmode->block_size - 1; + kptr += cbcmode->block_size; + } + + cc_clear(3*(cbcmode->block_size - 1), rawkey); + FREE(rawkey, M_TEMP); +} + +/* + * Create a key schecule + * + */ +void +cc_key_schedule_create(crypto_ctx_t ctx) +{ + uint8_t usage_string[KRB5_USAGE_LEN]; + lucid_context_t lctx = ctx->gss_ctx; + void *ekey; + + switch (lctx->key_data.proto) { + case 0: { + if (ctx->ks.enc == NULL) { + MALLOC(ctx->ks.enc, cccbc_ctx *, ctx->enc_mode->size, M_TEMP, M_WAITOK | M_ZERO); + cccbc_init(ctx->enc_mode, ctx->ks.enc, ctx->keylen, ctx->key); + } + if (ctx->ks.dec == NULL) { + MALLOC(ctx->ks.dec, cccbc_ctx *, ctx->dec_mode->size, M_TEMP, M_WAITOK | M_ZERO); + cccbc_init(ctx->dec_mode, ctx->ks.dec, ctx->keylen, ctx->key); + } + } + case 1: { + if (ctx->ks.enc == NULL) { + krb5_make_usage(lctx->initiate ? + KRB5_USAGE_INITIATOR_SEAL : KRB5_USAGE_ACCEPTOR_SEAL, + 0xAA, usage_string); + krb5_key_derivation(ctx, usage_string, KRB5_USAGE_LEN, &ekey, ctx->keylen); + MALLOC(ctx->ks.enc, cccbc_ctx *, ctx->enc_mode->size, M_TEMP, M_WAITOK | M_ZERO); + cccbc_init(ctx->enc_mode, ctx->ks.enc, ctx->keylen, ekey); + FREE(ekey, M_TEMP); + } + if (ctx->ks.dec == NULL) { + krb5_make_usage(lctx->initiate ? 
+ KRB5_USAGE_ACCEPTOR_SEAL : KRB5_USAGE_INITIATOR_SEAL, + 0xAA, usage_string); + krb5_key_derivation(ctx, usage_string, KRB5_USAGE_LEN, &ekey, ctx->keylen); + MALLOC(ctx->ks.dec, cccbc_ctx *, ctx->dec_mode->size, M_TEMP, M_WAITOK | M_ZERO); + cccbc_init(ctx->dec_mode, ctx->ks.dec, ctx->keylen, ekey); + FREE(ekey, M_TEMP); + } + if (ctx->ks.ikey[GSS_SND] == NULL) { + krb5_make_usage(lctx->initiate ? + KRB5_USAGE_INITIATOR_SEAL : KRB5_USAGE_ACCEPTOR_SEAL, + 0x55, usage_string); + krb5_key_derivation(ctx, usage_string, KRB5_USAGE_LEN, &ctx->ks.ikey[GSS_SND], ctx->keylen); + } + if (ctx->ks.ikey[GSS_RCV] == NULL) { + krb5_make_usage(lctx->initiate ? + KRB5_USAGE_ACCEPTOR_SEAL : KRB5_USAGE_INITIATOR_SEAL, + 0x55, usage_string); + krb5_key_derivation(ctx, usage_string, KRB5_USAGE_LEN, &ctx->ks.ikey[GSS_RCV], ctx->keylen); + } + } + } +} + +void +gss_crypto_ctx_free(crypto_ctx_t ctx) +{ + ctx->ks.ikey[GSS_SND] = NULL; + if (ctx->ks.ikey[GSS_RCV] && ctx->key != ctx->ks.ikey[GSS_RCV]) { + cc_clear(ctx->keylen, ctx->ks.ikey[GSS_RCV]); + FREE(ctx->ks.ikey[GSS_RCV], M_TEMP); + } + ctx->ks.ikey[GSS_RCV] = NULL; + if (ctx->ks.enc) { + cccbc_ctx_clear(ctx->enc_mode->size, ctx->ks.enc); + FREE(ctx->ks.enc, M_TEMP); + ctx->ks.enc = NULL; + } + if (ctx->ks.dec) { + cccbc_ctx_clear(ctx->dec_mode->size, ctx->ks.dec); + FREE(ctx->ks.dec, M_TEMP); + ctx->ks.dec = NULL; + } + if (ctx->ckey[GSS_SND] && ctx->ckey[GSS_SND] != ctx->key) { + cc_clear(ctx->keylen, ctx->ckey[GSS_SND]); + FREE(ctx->ckey[GSS_SND], M_TEMP); + } + ctx->ckey[GSS_SND] = NULL; + if (ctx->ckey[GSS_RCV] && ctx->ckey[GSS_RCV] != ctx->key) { + cc_clear(ctx->keylen, ctx->ckey[GSS_RCV]); + FREE(ctx->ckey[GSS_RCV], M_TEMP); + } + ctx->ckey[GSS_RCV] = NULL; + ctx->key = NULL; + ctx->keylen = 0; +} + +int +gss_crypto_ctx_init(struct crypto_ctx *ctx, lucid_context_t lucid) +{ + ctx->gss_ctx = lucid; + void *key; + uint8_t usage_string[KRB5_USAGE_LEN]; + + ctx->keylen = ctx->gss_ctx->ctx_key.key.key_len; + key = 
ctx->gss_ctx->ctx_key.key.key_val; + ctx->etype = ctx->gss_ctx->ctx_key.etype; + ctx->key = key; + + switch(ctx->etype) { + case AES128_CTS_HMAC_SHA1_96: + case AES256_CTS_HMAC_SHA1_96: + ctx->enc_mode = ccaes_cbc_encrypt_mode(); + assert(ctx->enc_mode); + ctx->dec_mode = ccaes_cbc_decrypt_mode(); + assert(ctx->dec_mode); + ctx->ks.enc = NULL; + ctx->ks.dec = NULL; + ctx->di = ccsha1_di(); + assert(ctx->di); + ctx->flags = CRYPTO_CTS_ENABLE; + ctx->mpad = 1; + ctx->digest_size = 12; /* 96 bits */ + krb5_make_usage(ctx->gss_ctx->initiate ? + KRB5_USAGE_INITIATOR_SIGN : KRB5_USAGE_ACCEPTOR_SIGN, + 0x99, usage_string); + krb5_key_derivation(ctx, usage_string, KRB5_USAGE_LEN, &ctx->ckey[GSS_SND], ctx->keylen); + krb5_make_usage(ctx->gss_ctx->initiate ? + KRB5_USAGE_ACCEPTOR_SIGN : KRB5_USAGE_INITIATOR_SIGN, + 0x99, usage_string); + krb5_key_derivation(ctx, usage_string, KRB5_USAGE_LEN, &ctx->ckey[GSS_RCV], ctx->keylen); + break; + case DES3_CBC_SHA1_KD: + ctx->enc_mode = ccdes3_cbc_encrypt_mode(); + assert(ctx->enc_mode); + ctx->dec_mode = ccdes3_cbc_decrypt_mode(); + assert(ctx->dec_mode); + ctx->ks.ikey[GSS_SND] = ctx->key; + ctx->ks.ikey[GSS_RCV] = ctx->key; + ctx->di = ccsha1_di(); + assert(ctx->di); + ctx->flags = 0; + ctx->mpad = ctx->enc_mode->block_size; + ctx->digest_size = 20; /* 160 bits */ + krb5_make_usage(KRB5_USAGE_ACCEPTOR_SIGN, 0x99, usage_string); + krb5_3des_key_derivation(ctx, usage_string, KRB5_USAGE_LEN, &ctx->ckey[GSS_SND]); + krb5_3des_key_derivation(ctx, usage_string, KRB5_USAGE_LEN, &ctx->ckey[GSS_RCV]); + break; + default: + return (ENOTSUP); + } + + ctx->lock = lck_mtx_alloc_init(gss_krb5_mech_grp, LCK_ATTR_NULL); + + return (0); +} + +/* + * CFX gss support routines + */ +/* From Heimdal cfx.h file RFC 4121 Cryptoo framework extensions */ +typedef struct gss_cfx_mic_token_desc_struct +{ + uint8_t TOK_ID[2]; /* 04 04 */ + uint8_t Flags; + uint8_t Filler[5]; + uint8_t SND_SEQ[8]; +} gss_cfx_mic_token_desc, *gss_cfx_mic_token; + +typedef 
struct gss_cfx_wrap_token_desc_struct +{ + uint8_t TOK_ID[2]; /* 05 04 */ + uint8_t Flags; + uint8_t Filler; + uint8_t EC[2]; + uint8_t RRC[2]; + uint8_t SND_SEQ[8]; +} gss_cfx_wrap_token_desc, *gss_cfx_wrap_token; + +/* End of cfx.h file */ + +#define CFXSentByAcceptor (1 << 0) +#define CFXSealed (1 << 1) +#define CFXAcceptorSubkey (1 << 2) + +const gss_cfx_mic_token_desc mic_cfx_token = { + .TOK_ID = "\x04\x04", + .Flags = 0, + .Filler = "\xff\xff\xff\xff\xff", + .SND_SEQ = "\x00\x00\x00\x00\x00\x00\x00\x00" +}; + +const gss_cfx_wrap_token_desc wrap_cfx_token = { + .TOK_ID = "\x05\04", + .Flags = 0, + .Filler = '\xff', + .EC = "\x00\x00", + .RRC = "\x00\x00", + .SND_SEQ = "\x00\x00\x00\x00\x00\x00\x00\x00" +}; + +static int +gss_krb5_cfx_verify_mic_token(gss_ctx_id_t ctx, gss_cfx_mic_token token) +{ + int i; + lucid_context_t lctx = &ctx->gss_lucid_ctx; + uint8_t flags = 0; + + if (token->TOK_ID[0] != mic_cfx_token.TOK_ID[0] || token->TOK_ID[1] != mic_cfx_token.TOK_ID[1]) { + printf("Bad mic TOK_ID %x %x\n", token->TOK_ID[0], token->TOK_ID[1]); + return (EBADRPC); + } + if (lctx->initiate) + flags |= CFXSentByAcceptor; + if (lctx->key_data.lucid_protocol_u.data_4121.acceptor_subkey) + flags |= CFXAcceptorSubkey; + if (token->Flags != flags) { + printf("Bad flags received %x exptect %x\n", token->Flags, flags); + return (EBADRPC); + } + for (i = 0; i < 5; i++) { + if (token->Filler[i] != mic_cfx_token.Filler[i]) + break; + } + + if (i != 5) { + printf("Bad mic filler %x @ %d\n", token->Filler[i], i); + return (EBADRPC); + } + + return (0); +} + +uint32_t +gss_krb5_cfx_get_mic(uint32_t *minor, /* minor_status */ + gss_ctx_id_t ctx, /* context_handle */ + gss_qop_t qop __unused, /* qop_req (ignored) */ + gss_buffer_t mbp, /* message mbuf */ + gss_buffer_t mic /* message_token */) +{ + gss_cfx_mic_token_desc token; + lucid_context_t lctx = &ctx->gss_lucid_ctx; + crypto_ctx_t cctx = &ctx->gss_cryptor; + gss_buffer_desc header; + uint32_t rv; + uint64_t seq = 
htonll(lctx->send_seq); + + if (minor == NULL) + minor = &rv; + *minor = 0; + token = mic_cfx_token; + mic->length = sizeof (token) + cctx->digest_size; + MALLOC(mic->value, void *, mic->length, M_TEMP, M_WAITOK | M_ZERO); + if (!lctx->initiate) + token.Flags |= CFXSentByAcceptor; + if (lctx->key_data.lucid_protocol_u.data_4121.acceptor_subkey) + token.Flags |= CFXAcceptorSubkey; + memcpy(&token.SND_SEQ, &seq, sizeof(lctx->send_seq)); + lctx->send_seq++; //XXX should only update this below on success? Heimdal seems to do it this way + header.value = &token; + header.length = sizeof (gss_cfx_mic_token_desc); + + *minor = krb5_mic(cctx, NULL, mbp, &header, (uint8_t *)mic->value + sizeof(token), NULL, 0, 0); + + if (*minor) { + mic->length = 0; + FREE(mic->value, M_TEMP); + mic->value = NULL; + } else { + memcpy(mic->value, &token, sizeof(token)); + } + + return (*minor ? GSS_S_FAILURE : GSS_S_COMPLETE); +} + +uint32_t +gss_krb5_cfx_verify_mic(uint32_t *minor, /* minor_status */ + gss_ctx_id_t ctx, /* context_handle */ + gss_buffer_t mbp, /* message_buffer */ + gss_buffer_t mic, /* message_token */ + gss_qop_t *qop /* qop_state */) +{ + gss_cfx_mic_token token = mic->value; + lucid_context_t lctx = &ctx->gss_lucid_ctx; + crypto_ctx_t cctx = &ctx->gss_cryptor; + uint8_t *digest = (uint8_t *)mic->value + sizeof (gss_cfx_mic_token_desc); + int verified = 0; + uint64_t seq; + uint32_t rv; + gss_buffer_desc header; + + if (qop) + *qop = GSS_C_QOP_DEFAULT; + if (minor == NULL) + minor = &rv; + + if (mic->length != sizeof(gss_cfx_mic_token_desc) + cctx->digest_size) { + printf("mic token wrong length\n"); + *minor = EBADRPC; + goto out; + } + *minor = gss_krb5_cfx_verify_mic_token(ctx, token); + if (*minor) + return (GSS_S_FAILURE); + header.value = token; + header.length = sizeof (gss_cfx_mic_token_desc); + *minor = krb5_mic(cctx, NULL, mbp, &header, digest, &verified, 0, 0); + + if (verified) { + //XXX errors and such? Sequencing and replay? 
Not supported in RPCSEC_GSS + memcpy(&seq, token->SND_SEQ, sizeof (uint64_t)); + seq = ntohll(seq); + lctx->recv_seq = seq; + } + +out: + return (verified ? GSS_S_COMPLETE : GSS_S_BAD_SIG); +} + +uint32_t +gss_krb5_cfx_get_mic_mbuf(uint32_t *minor, /* minor_status */ + gss_ctx_id_t ctx, /* context_handle */ + gss_qop_t qop __unused ,/* qop_req (ignored) */ + mbuf_t mbp, /* message mbuf */ + size_t offset, /* offest */ + size_t len, /* length */ + gss_buffer_t mic /* message_token */) +{ + gss_cfx_mic_token_desc token; + lucid_context_t lctx = &ctx->gss_lucid_ctx; + crypto_ctx_t cctx = &ctx->gss_cryptor; + uint32_t rv; + uint64_t seq = htonll(lctx->send_seq); + gss_buffer_desc header; + + if (minor == NULL) + minor = &rv; + *minor = 0; + + token = mic_cfx_token; + mic->length = sizeof (token) + cctx->digest_size; + MALLOC(mic->value, void *, mic->length, M_TEMP, M_WAITOK | M_ZERO); + if (!lctx->initiate) + token.Flags |= CFXSentByAcceptor; + if (lctx->key_data.lucid_protocol_u.data_4121.acceptor_subkey) + token.Flags |= CFXAcceptorSubkey; + + memcpy(&token.SND_SEQ, &seq, sizeof(lctx->send_seq)); + lctx->send_seq++; //XXX should only update this below on success? Heimdal seems to do it this way + + header.length = sizeof(token); + header.value = &token; + + len = len ? len : gss_mbuf_len(mbp, offset); + *minor = krb5_mic_mbuf(cctx, NULL, mbp, offset, len, &header, (uint8_t *)mic->value + sizeof(token), NULL, 0, 0); + + if (*minor) { + mic->length = 0; + FREE(mic->value, M_TEMP); + mic->value = NULL; + } else { + memcpy(mic->value, &token, sizeof(token)); + } + + return (*minor ? 
GSS_S_FAILURE : GSS_S_COMPLETE); +}
+
+
+/*
+ * Verify an RFC 4121 (CFX) MIC token against message data held in an
+ * mbuf chain.
+ *
+ * minor   optional; receives the errno-style minor status.
+ * ctx     GSS context whose crypto state is used for the checksum.
+ * mbp     mbuf chain holding the message.
+ * offset  byte offset into the chain where the message starts.
+ * len     number of message bytes to checksum.
+ * mic     the received MIC token (CFX header followed by the digest).
+ * qop     optional; always set to GSS_C_QOP_DEFAULT.
+ *
+ * Returns GSS_S_COMPLETE when the checksum verifies, GSS_S_FAILURE when
+ * the token header is malformed, and GSS_S_BAD_SIG otherwise.
+ */
+uint32_t
+gss_krb5_cfx_verify_mic_mbuf(uint32_t *minor,	/* minor_status */
+			     gss_ctx_id_t ctx,	/* context_handle */
+			     mbuf_t mbp,	/* message_buffer */
+			     size_t offset,	/* offset */
+			     size_t len,	/* length */
+			     gss_buffer_t mic,	/* message_token */
+			     gss_qop_t *qop	/* qop_state */)
+{
+	gss_cfx_mic_token token = mic->value;
+	lucid_context_t lctx = &ctx->gss_lucid_ctx;
+	crypto_ctx_t cctx = &ctx->gss_cryptor;
+	uint8_t *digest = (uint8_t *)mic->value + sizeof (gss_cfx_mic_token_desc);
+	/* Start out "not verified" so an early krb5_mic_mbuf failure cannot return garbage */
+	int verified = 0;
+	uint64_t seq;
+	uint32_t rv;
+	gss_buffer_desc header;
+
+	if (qop)
+		*qop = GSS_C_QOP_DEFAULT;
+
+	if (minor == NULL)
+		minor = &rv;
+
+	/* The header and digest are read straight out of mic->value; make sure they are there */
+	if (mic->length < sizeof(gss_cfx_mic_token_desc) + cctx->digest_size) {
+		printf("mic token wrong length\n");
+		*minor = EBADRPC;
+		return (GSS_S_BAD_SIG);
+	}
+
+	*minor = gss_krb5_cfx_verify_mic_token(ctx, token);
+	if (*minor)
+		return (GSS_S_FAILURE);
+
+	header.length = sizeof(gss_cfx_mic_token_desc);
+	header.value = mic->value;
+
+	*minor = krb5_mic_mbuf(cctx, NULL, mbp, offset, len, &header, digest, &verified, 0, 0);
+
+	if (verified) {
+		//XXX errors and such? Sequencing and replay? Not Supported RPCSEC_GSS
+		/* Only trust the sequence number of a token whose checksum verified */
+		memcpy(&seq, token->SND_SEQ, sizeof (uint64_t));
+		seq = ntohll(seq);
+		lctx->recv_seq = seq;
+	}
+
+	return (verified ? GSS_S_COMPLETE : GSS_S_BAD_SIG);
+}
+
+errno_t +krb5_cfx_crypt_mbuf(crypto_ctx_t ctx, mbuf_t *mbp, size_t *len, int encrypt, int reverse) +{ + const struct ccmode_cbc *ccmode = encrypt ? 
ctx->enc_mode : ctx->dec_mode; + uint8_t confounder[ccmode->block_size]; + uint8_t digest[ctx->digest_size]; + size_t tlen, r = 0; + errno_t error; + + if (encrypt) { + read_random(confounder, ccmode->block_size); + error = gss_prepend_mbuf(mbp, confounder, ccmode->block_size); + if (error) + return (error); + tlen = *len + ccmode->block_size; + if (ctx->mpad > 1) + r = ctx->mpad - (tlen % ctx->mpad); + /* We expect that r == 0 from krb5_cfx_wrap */ + if (r != 0) { + uint8_t mpad[r]; + memset(mpad, 0, r); + error = gss_append_mbuf(*mbp, mpad, r); + if (error) + return (error); + } + tlen += r; + error = krb5_mic_mbuf(ctx, NULL, *mbp, 0, tlen, NULL, digest, NULL, 1, 0); + if (error) + return (error); + error = krb5_crypt_mbuf(ctx, mbp, tlen, 1, NULL); + if (error) + return (error); + error = gss_append_mbuf(*mbp, digest, ctx->digest_size); + if (error) + return (error); + *len = tlen + ctx->digest_size; + return (0); + } else { + int verf; + cccbc_ctx *ks = NULL; + + if (*len < ctx->digest_size + sizeof(confounder)) + return (EBADRPC); + tlen = *len - ctx->digest_size; + /* get the digest */ + error = mbuf_copydata(*mbp, tlen, ctx->digest_size, digest); + /* Remove the digest from the mbuffer */ + error = gss_strip_mbuf(*mbp, -ctx->digest_size); + if (error) + return (error); + + if (reverse) { + /* + * Derive a key schedule that the sender can unwrap with. This + * is so that RPCSEC_GSS can restore encrypted arguments for + * resending. We do that because the RPCSEC_GSS sequence number in + * the rpc header is prepended to the body of the message before wrapping. + */ + void *ekey; + uint8_t usage_string[KRB5_USAGE_LEN]; + lucid_context_t lctx = ctx->gss_ctx; + + krb5_make_usage(lctx->initiate ? 
+ KRB5_USAGE_INITIATOR_SEAL : KRB5_USAGE_ACCEPTOR_SEAL, + 0xAA, usage_string); + krb5_key_derivation(ctx, usage_string, KRB5_USAGE_LEN, &ekey, ctx->keylen); + MALLOC(ks, cccbc_ctx *, ctx->dec_mode->size, M_TEMP, M_WAITOK | M_ZERO); + cccbc_init(ctx->dec_mode, ks, ctx->keylen, ekey); + FREE(ekey, M_TEMP); + } + error = krb5_crypt_mbuf(ctx, mbp, tlen, 0, ks); + FREE(ks, M_TEMP); + if (error) + return (error); + error = krb5_mic_mbuf(ctx, NULL, *mbp, 0, tlen, NULL, digest, &verf, 1, reverse); + if (error) + return (error); + if (!verf) + return (EBADRPC); + /* strip off the confounder */ + error = gss_strip_mbuf(*mbp, ccmode->block_size); + if (error) + return (error); + *len = tlen - ccmode->block_size; + } + return (0); +} + +uint32_t +gss_krb5_cfx_wrap_mbuf(uint32_t *minor, /* minor_status */ + gss_ctx_id_t ctx, /* context_handle */ + int conf_flag, /* conf_req_flag */ + gss_qop_t qop __unused, /* qop_req */ + mbuf_t *mbp, /* input/output message_buffer */ + size_t len, /* mbuf chain length */ + int *conf /* conf_state */) +{ + gss_cfx_wrap_token_desc token; + lucid_context_t lctx = &ctx->gss_lucid_ctx; + crypto_ctx_t cctx = &ctx->gss_cryptor; + int error = 0; + uint32_t mv; + uint64_t seq = htonll(lctx->send_seq); + + if (minor == NULL) + minor = &mv; + if (conf) + *conf = conf_flag; + + *minor = 0; + token = wrap_cfx_token; + if (!lctx->initiate) + token.Flags |= CFXSentByAcceptor; + if (lctx->key_data.lucid_protocol_u.data_4121.acceptor_subkey) + token.Flags |= CFXAcceptorSubkey; + memcpy(&token.SND_SEQ, &seq, sizeof(uint64_t)); + lctx->send_seq++; + if (conf_flag) { + uint8_t pad[cctx->mpad]; + uint16_t plen = 0; + + token.Flags |= CFXSealed; + memset(pad, 0, cctx->mpad); + if (cctx->mpad > 1) { + plen = htons(cctx->mpad - ((len + sizeof (gss_cfx_wrap_token_desc)) % cctx->mpad)); + token.EC[0] = ((plen >> 8) & 0xff); + token.EC[1] = (plen & 0xff); + } + if (plen) { + error = gss_append_mbuf(*mbp, pad, plen); + len += plen; + } + if (error == 0) { + error = 
gss_append_mbuf(*mbp, (uint8_t *)&token, sizeof(gss_cfx_wrap_token_desc)); + len += sizeof (gss_cfx_wrap_token_desc); + } + if (error == 0) + error = krb5_cfx_crypt_mbuf(cctx, mbp, &len, 1, 0); + if (error == 0) + error = gss_prepend_mbuf(mbp, (uint8_t *)&token, sizeof(gss_cfx_wrap_token_desc)); + } else { + uint8_t digest[cctx->digest_size]; + gss_buffer_desc header; + + header.length = sizeof(token); + header.value = &token; + + error = krb5_mic_mbuf(cctx, NULL, *mbp, 0, len, &header, digest, NULL, 1, 0); + if (error == 0) { + error = gss_append_mbuf(*mbp, digest, cctx->digest_size); + if (error == 0) { + uint16_t plen = htons(cctx->digest_size); + memcpy(token.EC, &plen, 2); + error = gss_prepend_mbuf(mbp, (uint8_t *)&token, sizeof (gss_cfx_wrap_token_desc)); + } + } + } + if (error) { + *minor = error; + return (GSS_S_FAILURE); + } + + return (GSS_S_COMPLETE); +} + +/* + * Given a wrap token the has a rrc, move the trailer back to the end. + */ +static void +gss_krb5_cfx_unwrap_rrc_mbuf(mbuf_t header, size_t rrc) +{ + mbuf_t body, trailer; + + gss_normalize_mbuf(header, sizeof(gss_cfx_wrap_token_desc), &rrc, &trailer, &body, 0); + gss_join_mbuf(header, body, trailer); +} + +uint32_t +gss_krb5_cfx_unwrap_mbuf(uint32_t * minor, /* minor_status */ + gss_ctx_id_t ctx, /* context_handle */ + mbuf_t *mbp, /* input/output message_buffer */ + size_t len, /* mbuf chain length */ + int *conf_flag, /* conf_state */ + gss_qop_t *qop /* qop state */) +{ + gss_cfx_wrap_token_desc token; + lucid_context_t lctx = &ctx->gss_lucid_ctx; + crypto_ctx_t cctx = &ctx->gss_cryptor; + int error, conf; + uint16_t ec = 0 , rrc = 0; + uint64_t seq; + int reverse = (*qop == GSS_C_QOP_REVERSE); + int initiate = lctx->initiate ? (reverse ? 0 : 1) : (reverse ? 
1 : 0); + + error = mbuf_copydata(*mbp, 0, sizeof (gss_cfx_wrap_token_desc), &token); + gss_strip_mbuf(*mbp, sizeof (gss_cfx_wrap_token_desc)); + len -= sizeof (gss_cfx_wrap_token_desc); + + /* Check for valid token */ + if (token.TOK_ID[0] != wrap_cfx_token.TOK_ID[0] || + token.TOK_ID[1] != wrap_cfx_token.TOK_ID[1] || + token.Filler != wrap_cfx_token.Filler) { + printf("Token id does not match\n"); + goto badrpc; + } + if ((initiate && !(token.Flags & CFXSentByAcceptor)) || + (lctx->key_data.lucid_protocol_u.data_4121.acceptor_subkey && !(token.Flags & CFXAcceptorSubkey))) { + printf("Bad flags %x\n", token.Flags); + goto badrpc; + } + + /* XXX Sequence replay detection */ + memcpy(&seq, token.SND_SEQ, sizeof (seq)); + seq = ntohll(seq); + lctx->recv_seq = seq; + + ec = (token.EC[0] << 8) | token.EC[1]; + rrc = (token.RRC[0] << 8) | token.RRC[1]; + *qop = GSS_C_QOP_DEFAULT; + conf = ((token.Flags & CFXSealed) == CFXSealed); + if (conf_flag) + *conf_flag = conf; + if (conf) { + gss_cfx_wrap_token_desc etoken; + + if (rrc) /* Handle Right rotation count */ + gss_krb5_cfx_unwrap_rrc_mbuf(*mbp, rrc); + error = krb5_cfx_crypt_mbuf(cctx, mbp, &len, 0, reverse); + if (error) { + printf("krb5_cfx_crypt_mbuf %d\n", error); + *minor = error; + return (GSS_S_FAILURE); + } + if (len >= sizeof(gss_cfx_wrap_token_desc)) + len -= sizeof(gss_cfx_wrap_token_desc); + else + goto badrpc; + mbuf_copydata(*mbp, len, sizeof(gss_cfx_wrap_token_desc), &etoken); + /* Verify etoken with the token wich should be the same, except the rc field is always zero */ + token.RRC[0] = token.RRC[1] = 0; + if (memcmp(&token, &etoken, sizeof (gss_cfx_wrap_token_desc)) != 0) { + printf("Encrypted token mismach\n"); + goto badrpc; + } + /* strip the encrypted token and any pad bytes */ + gss_strip_mbuf(*mbp, -(sizeof(gss_cfx_wrap_token_desc) + ec)); + len -= (sizeof(gss_cfx_wrap_token_desc) + ec); + } else { + uint8_t digest[cctx->digest_size]; + int verf; + gss_buffer_desc header; + + if (ec != 
cctx->digest_size || len >= cctx->digest_size) + goto badrpc; + len -= cctx->digest_size; + mbuf_copydata(*mbp, len, cctx->digest_size, digest); + gss_strip_mbuf(*mbp, -cctx->digest_size); + /* When calculating the mic header fields ec and rcc must be zero */ + token.EC[0] = token.EC[1] = token.RRC[0] = token.RRC[1] = 0; + header.value = &token; + header.length = sizeof(gss_cfx_wrap_token_desc); + error = krb5_mic_mbuf(cctx, NULL, *mbp, 0, len, &header, digest, &verf, 1, reverse); + if (error) + goto badrpc; + } + return (GSS_S_COMPLETE); + +badrpc: + *minor = EBADRPC; + return (GSS_S_FAILURE); +} + +/* + * RFC 1964 3DES support + */ + +typedef struct gss_1964_mic_token_desc_struct { + uint8_t TOK_ID[2]; /* 01 01 */ + uint8_t Sign_Alg[2]; + uint8_t Filler[4]; /* ff ff ff ff */ +} gss_1964_mic_token_desc, *gss_1964_mic_token; + +typedef struct gss_1964_wrap_token_desc_struct { + uint8_t TOK_ID[2]; /* 02 01 */ + uint8_t Sign_Alg[2]; + uint8_t Seal_Alg[2]; + uint8_t Filler[2]; /* ff ff */ +} gss_1964_wrap_token_desc, *gss_1964_wrap_token; + +typedef struct gss_1964_delete_token_desc_struct { + uint8_t TOK_ID[2]; /* 01 02 */ + uint8_t Sign_Alg[2]; + uint8_t Filler[4]; /* ff ff ff ff */ +} gss_1964_delete_token_desc, *gss_1964_delete_token; + +typedef struct gss_1964_header_desc_struct { + uint8_t App0; /* 0x60 Application 0 constructed */ + uint8_t AppLen[]; /* Variable Der length */ +} gss_1964_header_desc, *gss_1964_header; + +typedef union { + gss_1964_mic_token_desc mic_tok; + gss_1964_wrap_token_desc wrap_tok; + gss_1964_delete_token_desc del_tok; +} gss_1964_tok_type __attribute__((transparent_union)); + +typedef struct gss_1964_token_body_struct +{ + uint8_t OIDType; /* 0x06 */ + uint8_t OIDLen; /* 0x09 */ + uint8_t kerb_mech[9]; /* Der Encode kerberos mech 1.2.840.113554.1.2.2 + 0x2a, 0x86, 0x48, 0x86, 0xf7, 0x12, 0x01, 0x02, 0x02 */ + gss_1964_tok_type body; + uint8_t SND_SEQ[8]; + uint8_t Hash[]; /* Mic */ +} gss_1964_token_body_desc, *gss_1964_token_body; + 
+ +gss_1964_header_desc tok_1964_header = { + .App0 = 0x60 +}; + +gss_1964_mic_token_desc mic_1964_token = { + .TOK_ID = "\x01\x01", + .Filler = "\xff\xff\xff\xff" +}; + +gss_1964_wrap_token_desc wrap_1964_token = { + .TOK_ID = "\x02\x01", + .Filler = "\xff\xff" +}; + +gss_1964_delete_token_desc del_1964_token = { + .TOK_ID = "\x01\x01", + .Filler = "\xff\xff\xff\xff" +}; + +gss_1964_token_body_desc body_1964_token = { + .OIDType = 0x06, + .OIDLen = 0x09, + .kerb_mech = "\x2a\x86\x48\x86\xf7\x12\x01\x02\x02", +}; + +#define GSS_KRB5_3DES_MAXTOKSZ (sizeof(gss_1964_header_desc) + 5 /* max der length supported */ + sizeof(gss_1964_token_body_desc)) + +uint32_t gss_krb5_3des_get_mic(uint32_t *, gss_ctx_id_t, gss_qop_t, gss_buffer_t, gss_buffer_t); +uint32_t gss_krb5_3des_verify_mic(uint32_t *, gss_ctx_id_t, gss_buffer_t, gss_buffer_t, gss_qop_t *); +uint32_t gss_krb5_3des_get_mic_mbuf(uint32_t *, gss_ctx_id_t, gss_qop_t, mbuf_t, size_t, size_t, gss_buffer_t); +uint32_t gss_krb5_3des_verify_mic_mbuf(uint32_t *, gss_ctx_id_t, mbuf_t, size_t, size_t, gss_buffer_t, gss_qop_t *); +uint32_t gss_krb5_3des_wrap_mbuf(uint32_t *, gss_ctx_id_t, int, gss_qop_t, mbuf_t *, size_t, int *); +uint32_t gss_krb5_3des_unwrap_mbuf(uint32_t *, gss_ctx_id_t, mbuf_t *, size_t, int *, gss_qop_t *); + +/* + * Decode an ASN.1 DER length field + */ +static ssize_t +gss_krb5_der_length_get(uint8_t **pp) +{ + uint8_t *p = *pp; + uint32_t flen, len = 0; + + flen = *p & 0x7f; + + if (*p++ & 0x80) { + if (flen > sizeof(uint32_t)) + return (-1); + while (flen--) + len = (len << 8) + *p++; + } else { + len = flen; + } + *pp = p; + return (len); +} + +/* + * Determine size of ASN.1 DER length + */ +static int +gss_krb5_der_length_size(int len) +{ + return + len < (1 << 7) ? 1 : + len < (1 << 8) ? 2 : + len < (1 << 16) ? 3 : + len < (1 << 24) ? 
4 : 5; +} + +/* + * Encode an ASN.1 DER length field + */ +static void +gss_krb5_der_length_put(uint8_t **pp, int len) +{ + int sz = gss_krb5_der_length_size(len); + uint8_t *p = *pp; + + if (sz == 1) { + *p++ = (uint8_t) len; + } else { + *p++ = (uint8_t) ((sz-1) | 0x80); + sz -= 1; + while (sz--) + *p++ = (uint8_t) ((len >> (sz * 8)) & 0xff); + } + + *pp = p; +} + +static void +gss_krb5_3des_token_put(gss_ctx_id_t ctx, gss_1964_tok_type body, gss_buffer_t hash, size_t datalen, gss_buffer_t des3_token) +{ + gss_1964_header token; + gss_1964_token_body tokbody; + lucid_context_t lctx = &ctx->gss_lucid_ctx; + crypto_ctx_t cctx = &ctx->gss_cryptor; + uint32_t seq = (uint32_t) (lctx->send_seq++ & 0xffff); + size_t toklen = sizeof(gss_1964_token_body_desc) + cctx->digest_size; + size_t alloclen = toklen + sizeof (gss_1964_header_desc) + gss_krb5_der_length_size(toklen + datalen); + uint8_t *tokptr; + + MALLOC(token, gss_1964_header, alloclen, M_TEMP, M_WAITOK|M_ZERO); + *token = tok_1964_header; + tokptr = token->AppLen; + gss_krb5_der_length_put(&tokptr, toklen + datalen); + tokbody = (gss_1964_token_body)tokptr; + *tokbody = body_1964_token; /* Initalize the token body */ + tokbody->body = body; /* and now set the body to the token type passed in */ + seq = htonl(seq); + for (int i = 0; i < 4; i++) + tokbody->SND_SEQ[i] = (uint8_t)((seq >> (i * 8)) & 0xff); + for (int i = 4; i < 8; i++) + tokbody->SND_SEQ[i] = lctx->initiate ? 
0x00 : 0xff; + + size_t blocksize = cctx->enc_mode->block_size; + cccbc_iv_decl(blocksize, iv); + cccbc_ctx_decl(cctx->enc_mode->size, enc_ctx); + cccbc_set_iv(cctx->enc_mode, iv, hash->value); + cccbc_init(cctx->enc_mode, enc_ctx, cctx->keylen, cctx->key); + cccbc_update(cctx->enc_mode, enc_ctx, iv, 1, tokbody->SND_SEQ, tokbody->SND_SEQ); + + assert(hash->length == cctx->digest_size); + memcpy(tokbody->Hash, hash->value, hash->length); + des3_token->length = alloclen; + des3_token->value = token; +} + +static int +gss_krb5_3des_token_get(gss_ctx_id_t ctx, gss_buffer_t intok, + gss_1964_tok_type body, gss_buffer_t hash, size_t *offset, size_t *len, int reverse) +{ + gss_1964_header token = intok->value; + gss_1964_token_body tokbody; + lucid_context_t lctx = &ctx->gss_lucid_ctx; + crypto_ctx_t cctx = &ctx->gss_cryptor; + ssize_t length; + size_t toklen; + uint8_t *tokptr; + uint32_t seq; + int initiate; + + if (token->App0 != tok_1964_header.App0) { + printf("%s: bad framing\n", __func__); + printgbuf(__func__, intok); + return (EBADRPC); + } + tokptr = token->AppLen; + length = gss_krb5_der_length_get(&tokptr); + if (length < 0) { + printf("%s: invalid length\n", __func__); + printgbuf(__func__, intok); + return (EBADRPC); + } + toklen = sizeof (gss_1964_header_desc) + gss_krb5_der_length_size(length) + + sizeof (gss_1964_token_body_desc); + + if (intok->length < toklen + cctx->digest_size) { + printf("%s: token to short", __func__); + printf("toklen = %d, length = %d\n", (int)toklen, (int)length); + printgbuf(__func__, intok); + return (EBADRPC); + } + + if (offset) + *offset = toklen + cctx->digest_size; + + if (len) + *len = length - sizeof (gss_1964_token_body_desc) - cctx->digest_size; + + tokbody = (gss_1964_token_body)tokptr; + if (tokbody->OIDType != body_1964_token.OIDType || + tokbody->OIDLen != body_1964_token.OIDLen || + memcmp(tokbody->kerb_mech, body_1964_token.kerb_mech, tokbody->OIDLen) != 0) { + printf("%s: Invalid mechanism\n", __func__); + 
printgbuf(__func__, intok); + return (EBADRPC); + } + if (memcmp(&tokbody->body, &body, sizeof(gss_1964_tok_type)) != 0) { + printf("%s: Invalid body\n", __func__); + printgbuf(__func__, intok); + return (EBADRPC); + } + size_t blocksize = cctx->enc_mode->block_size; + uint8_t *block = tokbody->SND_SEQ; + + assert(blocksize == sizeof(tokbody->SND_SEQ)); + cccbc_iv_decl(blocksize, iv); + cccbc_ctx_decl(cctx->dec_mode->size, dec_ctx); + cccbc_set_iv(cctx->dec_mode, iv, tokbody->Hash); + cccbc_init(cctx->dec_mode, dec_ctx, cctx->keylen, cctx->key); + cccbc_update(cctx->dec_mode, dec_ctx, iv, 1, block, block); + + initiate = lctx->initiate ? (reverse ? 0 : 1) : (reverse ? 1 : 0); + for(int i = 4; i < 8; i++) { + if (tokbody->SND_SEQ[i] != (initiate ? 0xff : 0x00)) { + printf("%s: Invalid des mac\n", __func__); + printgbuf(__func__, intok); + return (EAUTH); + } + } + + memcpy(&seq, tokbody->SND_SEQ, sizeof (uint32_t)); + + lctx->recv_seq = ntohl(seq); + + assert(hash->length >= cctx->digest_size); + memcpy(hash->value, tokbody->Hash, cctx->digest_size); + + return (0); +} + +uint32_t +gss_krb5_3des_get_mic(uint32_t *minor, /* minor status */ + gss_ctx_id_t ctx, /* krb5 context id */ + gss_qop_t qop __unused, /* qop_req (ignored) */ + gss_buffer_t mbp, /* message buffer in */ + gss_buffer_t mic) /* mic token out */ +{ + gss_1964_mic_token_desc tokbody = mic_1964_token; + crypto_ctx_t cctx = &ctx->gss_cryptor; + gss_buffer_desc hash; + gss_buffer_desc header; + uint8_t hashval[cctx->digest_size]; + + hash.length = cctx->digest_size; + hash.value = hashval; + tokbody.Sign_Alg[0] = 0x04; /* lctx->keydata.lucid_protocol_u.data_1964.sign_alg */ + tokbody.Sign_Alg[1] = 0x00; + header.length = sizeof (gss_1964_mic_token_desc); + header.value = & tokbody; + + /* Hash the data */ + *minor = krb5_mic(cctx, &header, mbp, NULL, hashval, NULL, 0, 0); + if (*minor) + return (GSS_S_FAILURE); + + /* Make the token */ + gss_krb5_3des_token_put(ctx, tokbody, &hash, 0, mic); + + return 
(GSS_S_COMPLETE); +} + +uint32_t +gss_krb5_3des_verify_mic(uint32_t *minor, + gss_ctx_id_t ctx, + gss_buffer_t mbp, + gss_buffer_t mic, + gss_qop_t *qop) +{ + crypto_ctx_t cctx = &ctx->gss_cryptor; + uint8_t hashval[cctx->digest_size]; + gss_buffer_desc hash; + gss_1964_mic_token_desc mtok = mic_1964_token; + gss_buffer_desc header; + int verf; + + mtok.Sign_Alg[0] = 0x04; /* lctx->key_data.lucid_protocol_u.data_1964.sign_alg */ + mtok.Sign_Alg[1] = 0x00; + hash.length = cctx->digest_size; + hash.value = hashval; + header.length = sizeof(gss_1964_mic_token_desc); + header.value = &mtok; + + if (qop) + *qop = GSS_C_QOP_DEFAULT; + + *minor = gss_krb5_3des_token_get(ctx, mic, mtok, &hash, NULL, NULL, 0); + if (*minor) + return (GSS_S_FAILURE); + + *minor = krb5_mic(cctx, &header, mbp, NULL, hashval, &verf, 0, 0); + if (*minor) + return (GSS_S_FAILURE); + + return (verf ? GSS_S_COMPLETE : GSS_S_BAD_SIG); +} + +uint32_t +gss_krb5_3des_get_mic_mbuf(uint32_t *minor, + gss_ctx_id_t ctx, + gss_qop_t qop __unused, + mbuf_t mbp, + size_t offset, + size_t len, + gss_buffer_t mic) +{ + gss_1964_mic_token_desc tokbody = mic_1964_token; + crypto_ctx_t cctx = &ctx->gss_cryptor; + gss_buffer_desc header; + gss_buffer_desc hash; + uint8_t hashval[cctx->digest_size]; + + hash.length = cctx->digest_size; + hash.value = hashval; + tokbody.Sign_Alg[0] = 0x04; /* lctx->key_data.lucid_protocol_u.data_4121.sign_alg */ + tokbody.Sign_Alg[1] = 0x00; + header.length = sizeof (gss_1964_mic_token_desc); + header.value = &tokbody; + + /* Hash the data */ + *minor = krb5_mic_mbuf(cctx, &header, mbp, offset, len, NULL, hashval, NULL, 0, 0); + if (*minor) + return (GSS_S_FAILURE); + + /* Make the token */ + gss_krb5_3des_token_put(ctx, tokbody, &hash, 0, mic); + + return (GSS_S_COMPLETE); +} + +uint32_t +gss_krb5_3des_verify_mic_mbuf(uint32_t *minor, + gss_ctx_id_t ctx, + mbuf_t mbp, + size_t offset, + size_t len, + gss_buffer_t mic, + gss_qop_t *qop) +{ + crypto_ctx_t cctx = &ctx->gss_cryptor; + 
uint8_t hashval[cctx->digest_size]; + gss_buffer_desc header; + gss_buffer_desc hash; + gss_1964_mic_token_desc mtok = mic_1964_token; + int verf; + + mtok.Sign_Alg[0] = 0x04; /* lctx->key_data.lucic_protocol_u.data1964.sign_alg */ + mtok.Sign_Alg[1] = 0x00; + hash.length = cctx->digest_size; + hash.value = hashval; + header.length = sizeof(gss_1964_mic_token_desc); + header.value = &mtok; + + if (qop) + *qop = GSS_C_QOP_DEFAULT; + + *minor = gss_krb5_3des_token_get(ctx, mic, mtok, &hash, NULL, NULL, 0); + if (*minor) + return (GSS_S_FAILURE); + + *minor = krb5_mic_mbuf(cctx, &header, mbp, offset, len, NULL, hashval, &verf, 0, 0); + if (*minor) + return (GSS_S_FAILURE); + + return (verf ? GSS_S_COMPLETE : GSS_S_BAD_SIG); +} + +uint32_t +gss_krb5_3des_wrap_mbuf(uint32_t *minor, + gss_ctx_id_t ctx, + int conf_flag, + gss_qop_t qop __unused, + mbuf_t *mbp, + size_t len, + int *conf_state) +{ + crypto_ctx_t cctx = &ctx->gss_cryptor; + const struct ccmode_cbc *ccmode = cctx->enc_mode; + uint8_t padlen; + uint8_t pad[8]; + uint8_t confounder[ccmode->block_size]; + gss_1964_wrap_token_desc tokbody = wrap_1964_token; + gss_buffer_desc header; + gss_buffer_desc mic; + gss_buffer_desc hash; + uint8_t hashval[cctx->digest_size]; + + if (conf_state) + *conf_state = conf_flag; + + hash.length = cctx->digest_size; + hash.value = hashval; + tokbody.Sign_Alg[0] = 0x04; /* lctx->key_data.lucid_protocol_u.data_1964.sign_alg */ + tokbody.Sign_Alg[1] = 0x00; + /* conf_flag ? lctx->key_data.lucid_protocol_u.data_1964.seal_alg : 0xffff */ + tokbody.Seal_Alg[0] = conf_flag ? 0x02 : 0xff; + tokbody.Seal_Alg[1] = conf_flag ? 
0x00 : 0xff; + header.length = sizeof (gss_1964_wrap_token_desc); + header.value = &tokbody; + + /* Prepend confounder */ + read_random(confounder, ccmode->block_size); + *minor = gss_prepend_mbuf(mbp, confounder, ccmode->block_size); + if (*minor) + return (GSS_S_FAILURE); + + /* Append trailer of up to 8 bytes and set pad length in each trailer byte */ + padlen = 8 - len % 8; + for (int i = 0; i < padlen; i++) + pad[i] = padlen; + *minor = gss_append_mbuf(*mbp, pad, padlen); + if (*minor) + return (GSS_S_FAILURE); + + len += ccmode->block_size + padlen; + + /* Hash the data */ + *minor = krb5_mic_mbuf(cctx, &header, *mbp, 0, len, NULL, hashval, NULL, 0, 0); + if (*minor) + return (GSS_S_FAILURE); + + /* Make the token */ + gss_krb5_3des_token_put(ctx, tokbody, &hash, len, &mic); + + if (conf_flag) { + *minor = krb5_crypt_mbuf(cctx, mbp, len, 1, 0); + if (*minor) + return (GSS_S_FAILURE); + } + + *minor = gss_prepend_mbuf(mbp, mic.value, mic.length); + + return (*minor ? GSS_S_FAILURE : GSS_S_COMPLETE); +} + +uint32_t +gss_krb5_3des_unwrap_mbuf(uint32_t *minor, + gss_ctx_id_t ctx, + mbuf_t *mbp, + size_t len, + int *conf_state, + gss_qop_t *qop) +{ + crypto_ctx_t cctx = &ctx->gss_cryptor; + const struct ccmode_cbc *ccmode = cctx->dec_mode; + size_t length = 0, offset; + gss_buffer_desc hash; + uint8_t hashval[cctx->digest_size]; + gss_buffer_desc itoken; + uint8_t tbuffer[GSS_KRB5_3DES_MAXTOKSZ + cctx->digest_size]; + itoken.length = GSS_KRB5_3DES_MAXTOKSZ + cctx->digest_size; + itoken.value = tbuffer; + gss_1964_wrap_token_desc wrap = wrap_1964_token; + gss_buffer_desc header; + uint8_t padlen; + mbuf_t smb, tmb; + int cflag, verified, reverse = 0; + + if (len < GSS_KRB5_3DES_MAXTOKSZ) { + *minor = EBADRPC; + return (GSS_S_FAILURE); + } + + if (*qop == GSS_C_QOP_REVERSE) + reverse = 1; + *qop = GSS_C_QOP_DEFAULT; + + *minor = mbuf_copydata(*mbp, 0, itoken.length, itoken.value); + if (*minor) + return (GSS_S_FAILURE); + + hash.length = cctx->digest_size; + 
hash.value = hashval; + wrap.Sign_Alg[0] = 0x04; + wrap.Sign_Alg[1] = 0x00; + wrap.Seal_Alg[0] = 0x02; + wrap.Seal_Alg[1] = 0x00; + + for (cflag = 1; cflag >= 0; cflag--) { + *minor = gss_krb5_3des_token_get(ctx, &itoken, wrap, &hash, &offset, &length, reverse); + if (*minor == 0) + break; + wrap.Seal_Alg[0] = 0xff; /* retry as an integrity-only token: Seal_Alg is 0xffff */ + wrap.Seal_Alg[1] = 0xff; + } + if (*minor) + return (GSS_S_FAILURE); + + if (conf_state) + *conf_state = cflag; + + /* + * Separate off the header + */ + *minor = gss_normalize_mbuf(*mbp, offset, &length, &smb, &tmb, 0); + if (*minor) + return (GSS_S_FAILURE); + + assert(tmb == NULL); + + /* Decrypt the chain if needed */ + if (cflag) { + *minor = krb5_crypt_mbuf(cctx, &smb, length, 0, NULL); + if (*minor) + return (GSS_S_FAILURE); + } + + /* Verify the mic */ + header.length = sizeof(gss_1964_wrap_token_desc); + header.value = &wrap; + + *minor = krb5_mic_mbuf(cctx, &header, smb, 0, length, NULL, hashval, &verified, 0, 0); + if (*minor) /* test the error before reading verified, as the verify_mic paths do */ + return (GSS_S_FAILURE); + if (!verified) + return (GSS_S_BAD_SIG); + + /* Get the pad bytes */ + *minor = mbuf_copydata(smb, length - 1, 1, &padlen); + if (*minor) + return (GSS_S_FAILURE); + + /* Strip the confounder and trailing pad bytes */ + gss_strip_mbuf(smb, -padlen); + gss_strip_mbuf(smb, ccmode->block_size); + + if (*mbp != smb) { + mbuf_freem(*mbp); + *mbp = smb; + } + + return (GSS_S_COMPLETE); +} + +static const char * +etype_name(etypes etype) +{ + switch (etype) { + case DES3_CBC_SHA1_KD: + return ("des3-cbc-sha1"); + case AES128_CTS_HMAC_SHA1_96: + return ("aes128-cts-hmac-sha1-96"); + case AES256_CTS_HMAC_SHA1_96: + return ("aes256-cts-hmac-sha1-96"); + default: + return ("unknown enctype"); + } +} + +static int +supported_etype(uint32_t proto, etypes etype) +{ + const char *proto_name; + + switch(proto) { + case 0: + /* RFC 1964 */ + proto_name = "RFC 1964 krb5 gss mech"; + switch (etype) { + case DES3_CBC_SHA1_KD: + return (1); + default: + break; + } + break; + case 1: + /* RFC 4121 */ + proto_name =
"RFC 4121 krb5 gss mech"; + switch (etype) { + case AES256_CTS_HMAC_SHA1_96: + case AES128_CTS_HMAC_SHA1_96: + return (1); + default: + break; + } + break; + default: + proto_name = "Unknown krb5 gss mech"; + break; + } + printf("%s: Non supported encryption %s (%d) type for protocol %s (%d)\n", + __func__, etype_name(etype), etype, proto_name, proto); + return (0); +} + +/* + * Kerberos gss mech entry points + */ +uint32_t +gss_krb5_get_mic(uint32_t *minor, /* minor_status */ + gss_ctx_id_t ctx, /* context_handle */ + gss_qop_t qop, /* qop_req */ + gss_buffer_t mbp, /* message buffer */ + gss_buffer_t mic /* message_token */) +{ + uint32_t minor_stat = 0; + + if (minor == NULL) + minor = &minor_stat; + *minor = 0; + + /* Validate context */ + if (ctx == NULL || ((lucid_context_version_t)ctx)->version != 1) + return (GSS_S_NO_CONTEXT); + + if (!supported_etype(ctx->gss_lucid_ctx.key_data.proto, ctx->gss_cryptor.etype)) { + *minor = ENOTSUP; + return (GSS_S_FAILURE); + } + + switch(ctx->gss_lucid_ctx.key_data.proto) { + case 0: + /* RFC 1964 DES3 case */ + return (gss_krb5_3des_get_mic(minor, ctx, qop, mbp, mic)); + case 1: + /* RFC 4121 CFX case */ + return (gss_krb5_cfx_get_mic(minor, ctx, qop, mbp, mic)); + } + + return (GSS_S_COMPLETE); +} + +uint32_t +gss_krb5_verify_mic(uint32_t *minor, /* minor_status */ + gss_ctx_id_t ctx, /* context_handle */ + gss_buffer_t mbp, /* message_buffer */ + gss_buffer_t mic, /* message_token */ + gss_qop_t *qop /* qop_state */) +{ + uint32_t minor_stat = 0; + gss_qop_t qop_val = GSS_C_QOP_DEFAULT; + + if (minor == NULL) + minor = &minor_stat; + if (qop == NULL) + qop = &qop_val; + + *minor = 0; + + /* Validate context */ + if (ctx == NULL || ((lucid_context_version_t)ctx)->version != 1) + return (GSS_S_NO_CONTEXT); + + if (!supported_etype(ctx->gss_lucid_ctx.key_data.proto, ctx->gss_cryptor.etype)) { + *minor = ENOTSUP; + return (GSS_S_FAILURE); + } + + switch(ctx->gss_lucid_ctx.key_data.proto) { + case 0: + /* RFC 1964 DES3 case 
*/ + return (gss_krb5_3des_verify_mic(minor, ctx, mbp, mic, qop)); + case 1: + /* RFC 4121 CFX case */ + return (gss_krb5_cfx_verify_mic(minor, ctx, mbp, mic, qop)); + } + return (GSS_S_COMPLETE); +} + +uint32_t +gss_krb5_get_mic_mbuf(uint32_t *minor, /* minor_status */ + gss_ctx_id_t ctx, /* context_handle */ + gss_qop_t qop, /* qop_req */ + mbuf_t mbp, /* message mbuf */ + size_t offset, /* offest */ + size_t len, /* length */ + gss_buffer_t mic /* message_token */) +{ + uint32_t minor_stat = 0; + + if (minor == NULL) + minor = &minor_stat; + *minor = 0; + + if (len == 0) + len = ~(size_t)0; + + /* Validate context */ + if (ctx == NULL || ((lucid_context_version_t)ctx)->version != 1) + return (GSS_S_NO_CONTEXT); + + if (!supported_etype(ctx->gss_lucid_ctx.key_data.proto, ctx->gss_cryptor.etype)) { + *minor = ENOTSUP; + return (GSS_S_FAILURE); + } + + switch(ctx->gss_lucid_ctx.key_data.proto) { + case 0: + /* RFC 1964 DES3 case */ + return (gss_krb5_3des_get_mic_mbuf(minor, ctx, qop, mbp, offset, len, mic)); + case 1: + /* RFC 4121 CFX case */ + return (gss_krb5_cfx_get_mic_mbuf(minor, ctx, qop, mbp, offset, len, mic)); + } + + return (GSS_S_COMPLETE); +} + +uint32_t +gss_krb5_verify_mic_mbuf(uint32_t *minor, /* minor_status */ + gss_ctx_id_t ctx, /* context_handle */ + mbuf_t mbp, /* message_buffer */ + size_t offset, /* offset */ + size_t len, /* length */ + gss_buffer_t mic, /* message_token */ + gss_qop_t *qop /* qop_state */) +{ + uint32_t minor_stat = 0; + gss_qop_t qop_val = GSS_C_QOP_DEFAULT; + + if (minor == NULL) + minor = &minor_stat; + if (qop == NULL) + qop = &qop_val; + + *minor = 0; + + if (len == 0) + len = ~(size_t)0; + + /* Validate context */ + if (ctx == NULL || ((lucid_context_version_t)ctx)->version != 1) + return (GSS_S_NO_CONTEXT); + + if (!supported_etype(ctx->gss_lucid_ctx.key_data.proto, ctx->gss_cryptor.etype)) { + *minor = ENOTSUP; + return (GSS_S_FAILURE); + } + + switch(ctx->gss_lucid_ctx.key_data.proto) { + case 0: + /* RFC 1964 
DES3 case */ + return (gss_krb5_3des_verify_mic_mbuf(minor, ctx, mbp, offset, len, mic, qop)); + case 1: + /* RFC 4121 CFX case */ + return (gss_krb5_cfx_verify_mic_mbuf(minor, ctx, mbp, offset, len, mic, qop)); + } + + return (GSS_S_COMPLETE); +} + +uint32_t +gss_krb5_wrap_mbuf(uint32_t *minor, /* minor_status */ + gss_ctx_id_t ctx, /* context_handle */ + int conf_flag, /* conf_req_flag */ + gss_qop_t qop, /* qop_req */ + mbuf_t *mbp, /* input/output message_buffer */ + size_t offset, /* offset */ + size_t len, /* length */ + int *conf_state /* conf state */) +{ + uint32_t major, minor_stat = 0; + mbuf_t smb, tmb; + int conf_val = 0; + + if (minor == NULL) + minor = &minor_stat; + if (conf_state == NULL) + conf_state = &conf_val; + + *minor = 0; + + /* Validate context */ + if (ctx == NULL || ((lucid_context_version_t)ctx)->version != 1) + return (GSS_S_NO_CONTEXT); + + if (!supported_etype(ctx->gss_lucid_ctx.key_data.proto, ctx->gss_cryptor.etype)) { + *minor = ENOTSUP; + return (GSS_S_FAILURE); + } + + gss_normalize_mbuf(*mbp, offset, &len, &smb, &tmb, 0); + + switch(ctx->gss_lucid_ctx.key_data.proto) { + case 0: + /* RFC 1964 DES3 case */ + major = gss_krb5_3des_wrap_mbuf(minor, ctx, conf_flag, qop, &smb, len, conf_state); + break; + case 1: + /* RFC 4121 CFX case */ + major = gss_krb5_cfx_wrap_mbuf(minor, ctx, conf_flag, qop, &smb, len, conf_state); + break; + } + + if (offset) + gss_join_mbuf(*mbp, smb, tmb); + else { + *mbp = smb; + gss_join_mbuf(smb, tmb, NULL); + } + + return (major); +} + +uint32_t +gss_krb5_unwrap_mbuf(uint32_t * minor, /* minor_status */ + gss_ctx_id_t ctx, /* context_handle */ + mbuf_t *mbp, /* input/output message_buffer */ + size_t offset, /* offset */ + size_t len, /* length */ + int *conf_flag, /* conf_state */ + gss_qop_t *qop /* qop state */) +{ + uint32_t major, minor_stat = 0; + gss_qop_t qop_val = GSS_C_QOP_DEFAULT; + int conf_val = 0; + mbuf_t smb, tmb; + + if (minor == NULL) + minor = &minor_stat; + if (qop == NULL) + qop = 
&qop_val; + if (conf_flag == NULL) + conf_flag = &conf_val; + + /* Validate context */ + if (ctx == NULL || ((lucid_context_version_t)ctx)->version != 1) + return (GSS_S_NO_CONTEXT); + + if (!supported_etype(ctx->gss_lucid_ctx.key_data.proto, ctx->gss_cryptor.etype)) { + *minor = ENOTSUP; + return (GSS_S_FAILURE); + } + + gss_normalize_mbuf(*mbp, offset, &len, &smb, &tmb, 0); + + switch(ctx->gss_lucid_ctx.key_data.proto) { + case 0: + /* RFC 1964 DES3 case */ + major = gss_krb5_3des_unwrap_mbuf(minor, ctx, &smb, len, conf_flag, qop); + break; + case 1: + /* RFC 4121 CFX case */ + major = gss_krb5_cfx_unwrap_mbuf(minor, ctx, &smb, len, conf_flag, qop); + break; + } + + if (offset) + gss_join_mbuf(*mbp, smb, tmb); + else { + *mbp = smb; + gss_join_mbuf(smb, tmb, NULL); + } + + return (major); +} + +#include + +static int +xdr_lucid_context(void *data, size_t length, lucid_context_t lctx) +{ + struct xdrbuf xb; + int error = 0; + uint32_t keylen = 0; + + xb_init_buffer(&xb, data, length); + xb_get_32(error, &xb, lctx->vers); + if (!error && lctx->vers != 1) { + error = EINVAL; + printf("%s: invalid version %d\n", __func__, (int)lctx->vers); + goto out; + } + xb_get_32(error, &xb, lctx->initiate); + if (error) { + printf("%s: Could not decode initiate\n", __func__); + goto out; + } + xb_get_32(error, &xb, lctx->endtime); + if (error) { + printf("%s: Could not decode endtime\n", __func__); + goto out; + } + xb_get_64(error, &xb, lctx->send_seq); + if (error) { + printf("%s: Could not decode send_seq\n", __func__); + goto out; + } + xb_get_64(error, &xb, lctx->recv_seq); + if (error) { + printf("%s: Could not decode recv_seq\n", __func__); + goto out; + } + xb_get_32(error, &xb, lctx->key_data.proto); + if (error) { + printf("%s: Could not decode mech protocol\n", __func__); + goto out; + } + switch(lctx->key_data.proto) { + case 0: + xb_get_32(error, &xb, lctx->key_data.lucid_protocol_u.data_1964.sign_alg); + xb_get_32(error, &xb, 
lctx->key_data.lucid_protocol_u.data_1964.seal_alg); + if (error) + printf("%s: Could not decode rfc1964 sign and seal\n", __func__); + break; + case 1: + xb_get_32(error, &xb, lctx->key_data.lucid_protocol_u.data_4121.acceptor_subkey); + if (error) + printf("%s: Could not decode rfc4121 acceptor_subkey", __func__); + break; + default: + printf("%s: Invalid mech protocol %d\n", __func__, (int)lctx->key_data.proto); + error = EINVAL; + } + if (error) + goto out; + xb_get_32(error, &xb, lctx->ctx_key.etype); + if (error) { + printf("%s: Could not decode key enctype\n", __func__); + goto out; + } + switch(lctx->ctx_key.etype) { + case DES3_CBC_SHA1_KD: + keylen = 24; + break; + case AES128_CTS_HMAC_SHA1_96: + keylen = 16; + break; + case AES256_CTS_HMAC_SHA1_96: + keylen = 32; + break; + default: + error = ENOTSUP; + goto out; + } + xb_get_32(error, &xb, lctx->ctx_key.key.key_len); + if (error) { + printf("%s: could not decode key length\n", __func__); + goto out; + } + if (lctx->ctx_key.key.key_len != keylen) { + error = EINVAL; + printf("%s: etype = %d keylen = %d expected keylen = %d\n", __func__, + lctx->ctx_key.etype, lctx->ctx_key.key.key_len, keylen); + goto out; + } + + lctx->ctx_key.key.key_val = xb_malloc(keylen); + if (lctx->ctx_key.key.key_val == NULL) { + printf("%s: could not get memory for key\n", __func__); + error = ENOMEM; + goto out; + } + error = xb_get_bytes(&xb, (char *)lctx->ctx_key.key.key_val, keylen, 1); + if (error) { + printf("%s: could get key value\n", __func__); + xb_free(lctx->ctx_key.key.key_val); + } +out: + return (error); +} + +gss_ctx_id_t +gss_krb5_make_context(void *data, uint32_t datalen) +{ + gss_ctx_id_t ctx; + + if (!corecrypto_available()) + return (NULL); + + gss_krb5_mech_init(); + MALLOC(ctx, gss_ctx_id_t, sizeof (struct gss_ctx_id_desc), M_TEMP, M_WAITOK | M_ZERO); + if (xdr_lucid_context(data, datalen, &ctx->gss_lucid_ctx) || + !supported_etype(ctx->gss_lucid_ctx.key_data.proto, ctx->gss_lucid_ctx.ctx_key.etype)) { + 
FREE(ctx, M_TEMP); + FREE(data, M_TEMP); + return (NULL); + } + + /* Set up crypto context */ + gss_crypto_ctx_init(&ctx->gss_cryptor, &ctx->gss_lucid_ctx); + FREE(data, M_TEMP); + + return (ctx); +} + +void +gss_krb5_destroy_context(gss_ctx_id_t ctx) +{ + if (ctx == NULL) + return; + gss_crypto_ctx_free(&ctx->gss_cryptor); + FREE(ctx->gss_lucid_ctx.ctx_key.key.key_val, M_TEMP); + cc_clear(sizeof (ctx->gss_lucid_ctx), &ctx->gss_lucid_ctx); /* clear the whole struct; lucid_context_t is only a pointer type */ + FREE(ctx, M_TEMP); +} diff --git a/bsd/nfs/gss/gss_krb5_mech.h b/bsd/nfs/gss/gss_krb5_mech.h new file mode 100644 index 000000000..01386b6da --- /dev/null +++ b/bsd/nfs/gss/gss_krb5_mech.h @@ -0,0 +1,354 @@ +/* + * Copyright (c) 2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License.
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include + +/* + * GSS-API things from gssapi.h + */ +/* + * Copyright 1993 by OpenVision Technologies, Inc. + * + * Permission to use, copy, modify, distribute, and sell this software + * and its documentation for any purpose is hereby granted without fee, + * provided that the above copyright notice appears in all copies and + * that both that copyright notice and this permission notice appear in + * supporting documentation, and that the name of OpenVision not be used + * in advertising or publicity pertaining to distribution of the software + * without specific, written prior permission. OpenVision makes no + * representations about the suitability of this software for any + * purpose. It is provided "as is" without express or implied warranty. + * + * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO + * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR + * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF + * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +typedef uint32_t OM_uint32; + +#define GSS_S_COMPLETE 0 + +/* + * Some "helper" definitions to make the status code macros obvious. + * From gssapi.h: + */ +#define GSS_C_CALLING_ERROR_OFFSET 24 +#define GSS_C_ROUTINE_ERROR_OFFSET 16 +#define GSS_C_SUPPLEMENTARY_OFFSET 0 +#define GSS_C_CALLING_ERROR_MASK ((OM_uint32) 0377ul) +#define GSS_C_ROUTINE_ERROR_MASK ((OM_uint32) 0377ul) +#define GSS_C_SUPPLEMENTARY_MASK ((OM_uint32) 0177777ul) + +/* + * The macros that test status codes for error conditions. Note that the + * GSS_ERROR() macro has changed slightly from the V1 GSSAPI so that it now + * evaluates its argument only once. 
+ */ +#define GSS_CALLING_ERROR(x) \ + ((x) & (GSS_C_CALLING_ERROR_MASK << GSS_C_CALLING_ERROR_OFFSET)) +#define GSS_ROUTINE_ERROR(x) \ + ((x) & (GSS_C_ROUTINE_ERROR_MASK << GSS_C_ROUTINE_ERROR_OFFSET)) +#define GSS_SUPPLEMENTARY_INFO(x) \ + ((x) & (GSS_C_SUPPLEMENTARY_MASK << GSS_C_SUPPLEMENTARY_OFFSET)) +#define GSS_ERROR(x) \ + ((x) & ((GSS_C_CALLING_ERROR_MASK << GSS_C_CALLING_ERROR_OFFSET) | \ + (GSS_C_ROUTINE_ERROR_MASK << GSS_C_ROUTINE_ERROR_OFFSET))) + +/* + * Calling errors: + */ +#define GSS_S_CALL_INACCESSIBLE_READ \ + (((OM_uint32) 1ul) << GSS_C_CALLING_ERROR_OFFSET) +#define GSS_S_CALL_INACCESSIBLE_WRITE \ + (((OM_uint32) 2ul) << GSS_C_CALLING_ERROR_OFFSET) +#define GSS_S_CALL_BAD_STRUCTURE \ + (((OM_uint32) 3ul) << GSS_C_CALLING_ERROR_OFFSET) + +/* + * Routine errors: + */ +#define GSS_S_BAD_MECH (((OM_uint32) 1ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_BAD_NAME (((OM_uint32) 2ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_BAD_NAMETYPE (((OM_uint32) 3ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_BAD_BINDINGS (((OM_uint32) 4ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_BAD_STATUS (((OM_uint32) 5ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_BAD_SIG (((OM_uint32) 6ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_NO_CRED (((OM_uint32) 7ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_NO_CONTEXT (((OM_uint32) 8ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_DEFECTIVE_TOKEN (((OM_uint32) 9ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_DEFECTIVE_CREDENTIAL \ + (((OM_uint32) 10ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_CREDENTIALS_EXPIRED \ + (((OM_uint32) 11ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_CONTEXT_EXPIRED \ + (((OM_uint32) 12ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_FAILURE (((OM_uint32) 13ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_BAD_QOP (((OM_uint32) 14ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_UNAUTHORIZED (((OM_uint32) 15ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define 
GSS_S_UNAVAILABLE (((OM_uint32) 16ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_DUPLICATE_ELEMENT \ + (((OM_uint32) 17ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_NAME_NOT_MN \ + (((OM_uint32) 18ul) << GSS_C_ROUTINE_ERROR_OFFSET) + +/* + * Supplementary info bits: + */ +#define GSS_S_CONTINUE_NEEDED (1 << (GSS_C_SUPPLEMENTARY_OFFSET + 0)) +#define GSS_S_DUPLICATE_TOKEN (1 << (GSS_C_SUPPLEMENTARY_OFFSET + 1)) +#define GSS_S_OLD_TOKEN (1 << (GSS_C_SUPPLEMENTARY_OFFSET + 2)) +#define GSS_S_UNSEQ_TOKEN (1 << (GSS_C_SUPPLEMENTARY_OFFSET + 3)) +#define GSS_S_GAP_TOKEN (1 << (GSS_C_SUPPLEMENTARY_OFFSET + 4)) + +#define GSS_C_QOP_DEFAULT 0 + +/* end of gssapi.h */ + +/* + * The following data structures are genenrated from lucid.x in the gssd project + * and must be kept in sync with that project. This is a more memory efficient + * representation of the gss_kerb5_lucid_context_v1_t defined in gssapi_krb5.h + */ +struct lucid_key { + uint32_t etype; + struct { + uint32_t key_len; + uint8_t *key_val; + } key; +}; +typedef struct lucid_key lucid_key; + +struct key_data_1964 { + uint32_t sign_alg; + uint32_t seal_alg; +}; +typedef struct key_data_1964 key_data_1964; + +struct key_data_4121 { + uint32_t acceptor_subkey; +}; +typedef struct key_data_4121 key_data_4121; + +struct lucid_protocol { + uint32_t proto; + union { + key_data_1964 data_1964; + key_data_4121 data_4121; + } lucid_protocol_u; +}; +typedef struct lucid_protocol lucid_protocol; + +struct lucid_context { + uint32_t vers; + uint32_t initiate; + uint32_t endtime; + uint64_t send_seq; + uint64_t recv_seq; + lucid_protocol key_data; + lucid_key ctx_key; +}; +typedef struct lucid_context lucid_context; + +/* end of lucid.x generated data structures */ + +typedef struct lucid_context *lucid_context_t; +/* + * Mask for determining the returned structure version. + * See example below for usage. 
+ */ +typedef struct lucid_context_version { + uint32_t version; + /* Structure version number */ +} *lucid_context_version_t; + +typedef enum etypes { + DES3_CBC_SHA1_KD = 16, + AES128_CTS_HMAC_SHA1_96 = 17, + AES256_CTS_HMAC_SHA1_96 = 18, +} etypes; + +#define KRB5_USAGE_ACCEPTOR_SEAL 22 +#define KRB5_USAGE_ACCEPTOR_SIGN 23 +#define KRB5_USAGE_INITIATOR_SEAL 24 +#define KRB5_USAGE_INITIATOR_SIGN 25 +#define KRB5_USAGE_LEN 5 + +#define GSS_SND 0 +#define GSS_RCV 1 +#define GSS_C_QOP_REVERSE 0x80000000 /* Pseudo QOP value to use as input to gss_krb5_unwrap to allow Sender to unwrap */ + +/* + * Key schedule is the cbc state for encryption and decryption. + * For DES3 we always use the session key from the lucid context, + * and in that case Ekey and Ikey will point to the session key. + */ +struct key_schedule { + cccbc_ctx *enc; + cccbc_ctx *dec; + void *ikey[2]; /* Drived integrity key (same length context key); */ +}; + +/* + * Crypto context that supports AES and DES3 etypes + * All supported encryption types use hmac with SHA1 + * All are CBC encryption types + * des3-cbc-sha1 -- 7 + * des3-dbc-sha1-kd -- 16 ??? + * aes128-cts-hmac-sha1-96 -- 17 + * aes256-cts-hmac-sha1-96 -- 18 + */ + +typedef struct crypto_ctx { + uint32_t etype; + uint32_t mpad; /* Message padding */ + uint32_t flags; + lck_mtx_t *lock; + lucid_context_t gss_ctx; /* Back pointer to lucid context */ + uint32_t keylen; + void *key; /* Points to session key from lucid context */ + const struct ccdigest_info *di; + const struct ccmode_cbc *enc_mode; + const struct ccmode_cbc *dec_mode; + struct key_schedule ks; + uint32_t digest_size; + void *ckey[2]; /* Derived checksum key. 
Same as key for DES3 */ +} *crypto_ctx_t; + +#define CRYPTO_KS_ALLOCED 0x00001 +#define CRYPTO_CTS_ENABLE 0x00002 + +typedef struct gss_ctx_id_desc { + lucid_context gss_lucid_ctx; + struct crypto_ctx gss_cryptor; +} *gss_ctx_id_t; + +typedef struct gss_buffer_desc_struct { + size_t length; + void *value; +} gss_buffer_desc, *gss_buffer_t; + +uint32_t +gss_release_buffer(uint32_t *, /* minor_status */ + gss_buffer_t); + + +/* Per message interfaces for kerberos gss mech in the kernel */ + +typedef uint32_t gss_qop_t; + +uint32_t +gss_krb5_get_mic_mbuf(uint32_t *, /* minor_status */ + gss_ctx_id_t, /* context_handle */ + gss_qop_t, /* qop_req */ + mbuf_t, /* message mbuf */ + size_t, /* offest */ + size_t, /* length */ + gss_buffer_t /* message_token */ + ); + +uint32_t +gss_krb5_get_mic(uint32_t *, /* minor_status */ + gss_ctx_id_t, /* context_handle */ + gss_qop_t, /* qop_req */ + gss_buffer_t, /* message buffer */ + gss_buffer_t /* message_token */ + ); + +uint32_t +gss_krb5_verify_mic(uint32_t *, /* minor_status */ + gss_ctx_id_t, /* context_handle */ + gss_buffer_t, /* message_buffer */ + gss_buffer_t, /* message_token */ + gss_qop_t * /* qop_state */ + ); + +uint32_t +gss_krb5_verify_mic_mbuf(uint32_t *, /* minor_status */ + gss_ctx_id_t, /* context_handle */ + mbuf_t, /* message_buffer */ + size_t, /* offset */ + size_t, /* length */ + gss_buffer_t, /* message_token */ + gss_qop_t * /* qop_state */ + ); + +uint32_t +gss_krb5_wrap_mbuf(uint32_t *, /* minor_status */ + gss_ctx_id_t, /* context_handle */ + int, /* conf_req_flag */ + gss_qop_t, /* qop_req */ + mbuf_t *, /* input/output message_buffer */ + size_t, /* offset */ + size_t, /* length */ + int * /* conf_state */ + ); + +uint32_t +gss_krb5_unwrap_mbuf(uint32_t *, /* minor_status */ + gss_ctx_id_t, /* context_handle */ + mbuf_t *, /* input/output message_buffer */ + size_t, /* offset */ + size_t, /* length */ + int *, /* conf_state */ + gss_qop_t * /* qop state */ + ); + +void 
gss_krb5_destroy_context(gss_ctx_id_t); + +gss_ctx_id_t gss_krb5_make_context(void *, uint32_t); + +void gss_krb5_mech_init(void); + +int corecrypto_available(void); + +errno_t gss_normalize_mbuf(mbuf_t, size_t, size_t *, mbuf_t *, mbuf_t *, int); + +mbuf_t gss_join_mbuf(mbuf_t, mbuf_t, mbuf_t); + +typedef struct hmac_ctx_struct { + size_t keylen; + uint8_t *key; + ccdigest_ctx_t di_ctx; +} hmac_ctx, hmac_ctx_t[1]; + +void hmac_init(const struct ccdigest_info *, hmac_ctx_t, size_t, void *); +void hmac_update(const struct ccdigest_info *, hmac_ctx_t, size_t, void *); +void hmac_final(const struct ccdigest_info *, hmac_ctx_t, uint8_t *); + +void printmbuf(const char *, mbuf_t, uint32_t, uint32_t); + +void printgbuf(const char *, gss_buffer_t); diff --git a/bsd/nfs/krpc_subr.c b/bsd/nfs/krpc_subr.c index 8ded0f04b..53355a6e4 100644 --- a/bsd/nfs/krpc_subr.c +++ b/bsd/nfs/krpc_subr.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,12 +22,12 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
* Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ /* - * Copyright (c) 1994 Gordon Ross, Adam Glass + * Copyright (c) 1994 Gordon Ross, Adam Glass * Copyright (c) 1992 Regents of the University of California. * All rights reserved. * @@ -146,10 +146,10 @@ struct rpc_reply { * Returns non-zero error on failure. */ int -krpc_portmap(sin, prog, vers, proto, portp) - struct sockaddr_in *sin; /* server address */ - u_int prog, vers, proto; /* host order */ - u_int16_t *portp; /* network order */ +krpc_portmap( + struct sockaddr_in *sin, /* server address */ + u_int prog, u_int vers, u_int proto, /* host order */ + u_int16_t *portp) /* network order */ { struct sdata { u_int32_t prog; /* call program */ @@ -204,11 +204,11 @@ krpc_portmap(sin, prog, vers, proto, portp) * the address from whence the response came is saved there. 
*/ int -krpc_call(sa, sotype, prog, vers, func, data, from_p) - struct sockaddr_in *sa; - u_int sotype, prog, vers, func; - mbuf_t *data; /* input/output */ - struct sockaddr_in *from_p; /* output */ +krpc_call( + struct sockaddr_in *sa, + u_int sotype, u_int prog, u_int vers, u_int func, + mbuf_t *data, /* input/output */ + struct sockaddr_in *from_p) /* output */ { socket_t so; struct sockaddr_in *sin; diff --git a/bsd/nfs/nfs.h b/bsd/nfs/nfs.h index 3bc6a641c..bb8cf8433 100644 --- a/bsd/nfs/nfs.h +++ b/bsd/nfs/nfs.h @@ -186,6 +186,7 @@ extern int nfs_ticks; #define NFS_MATTR_PRINCIPAL 25 /* GSS principal to authenticate with */ #define NFS_MATTR_SVCPRINCIPAL 26 /* GSS principal to authenticate to, the server principal */ #define NFS_MATTR_NFS_VERSION_RANGE 27 /* Packed version range to try */ +#define NFS_MATTR_KERB_ETYPE 28 /* Enctype to use for kerberos mounts */ /* NFS mount flags */ #define NFS_MFLAG_SOFT 0 /* soft mount (requests fail if unresponsive) */ @@ -217,6 +218,22 @@ extern int nfs_ticks; #define NFS_LOCK_MODE_DISABLED 1 /* do not support advisory file locking */ #define NFS_LOCK_MODE_LOCAL 2 /* perform advisory file locking locally */ + +/* Supported encryption types for kerberos session keys */ +typedef enum nfs_supported_kerberos_etypes { + NFS_DES3_CBC_SHA1_KD = 16, + NFS_AES128_CTS_HMAC_SHA1_96 = 17, + NFS_AES256_CTS_HMAC_SHA1_96 = 18 +} nfs_supported_kerberos_etypes; + +/* Structure to hold an array of kerberos enctypes to allow on a mount */ +#define NFS_MAX_ETYPES 3 +struct nfs_etype { + uint32_t count; + uint32_t selected; /* index in etypes that is being used. 
Set to count if nothing has been selected */ + nfs_supported_kerberos_etypes etypes[NFS_MAX_ETYPES]; +}; + /* * Old-style arguments to mount NFS */ @@ -970,7 +987,7 @@ extern lck_grp_t *nfs_request_grp; extern u_int32_t nfs_xid, nfs_xidwrap; extern int nfs_iosize, nfs_allow_async, nfs_statfs_rate_limit; extern int nfs_access_cache_timeout, nfs_access_delete, nfs_access_dotzfs, nfs_access_for_getattr; -extern int nfs_lockd_mounts, nfs_lockd_request_sent, nfs_single_des; +extern int nfs_lockd_mounts, nfs_lockd_request_sent; extern int nfs_tprintf_initial_delay, nfs_tprintf_delay; extern int nfsiod_thread_count, nfsiod_thread_max, nfs_max_async_writes; extern int nfs_idmap_ctrl, nfs_callback_port; @@ -1132,6 +1149,9 @@ extern thread_call_t nfsrv_idlesock_timer_call; extern thread_call_t nfsrv_fmod_timer_call; #endif +/* nfs 4 default domain for user mapping */ +extern char nfs4_domain[MAXPATHLEN]; + __BEGIN_DECLS nfstype vtonfs_type(enum vtype, int); diff --git a/bsd/nfs/nfs4_subs.c b/bsd/nfs/nfs4_subs.c index 60a52d867..130b04f7a 100644 --- a/bsd/nfs/nfs4_subs.c +++ b/bsd/nfs/nfs4_subs.c @@ -596,14 +596,14 @@ nfsm_chain_get_secinfo(struct nfsm_chain *nmc, uint32_t *sec, int *seccountp) /* we only recognize KRB5, KRB5I, KRB5P */ nfsm_chain_get_32(error, nmc, val); /* OID length */ nfsmout_if(error); - if (val != sizeof(krb5_mech)) { + if (val != sizeof(krb5_mech_oid)) { nfsm_chain_adv(error, nmc, val); nfsm_chain_adv(error, nmc, 2*NFSX_UNSIGNED); break; } nfsm_chain_get_opaque(error, nmc, val, oid); /* OID bytes */ nfsmout_if(error); - if (bcmp(oid, krb5_mech, sizeof(krb5_mech))) { + if (bcmp(oid, krb5_mech_oid, sizeof(krb5_mech_oid))) { nfsm_chain_adv(error, nmc, 2*NFSX_UNSIGNED); break; } @@ -1024,7 +1024,7 @@ nfs4_id2guid(/*const*/ char *id, guid_t *guidp, int isgroup) guid_t guid1, guid2, *gp; ntsid_t sid; long num, unknown; - const char *p, *at; + char *p, *at, *new_id = NULL; *guidp = kauth_null_guid; compare = ((nfs_idmap_ctrl & 
NFS_IDMAP_CTRL_USE_IDMAP_SERVICE) && @@ -1052,9 +1052,55 @@ nfs4_id2guid(/*const*/ char *id, guid_t *guidp, int isgroup) /* must be numeric ID (or empty) */ num = *id ? strtol(id, NULL, 10) : unknown; gp = guidp; + /* Since we are not initilizing guid1 and guid2, skip compare */ + compare = 0; goto gotnumid; } + /* Handle nfs4 domain first */ + if (at && at[1]) { + /* Try mapping nfs4 domain */ + char *dsnode, *nfs4domain = at + 1; + size_t otw_domain_len = strnlen(nfs4domain, MAXPATHLEN); + int otw_id_2_at_len = at - id + 1; + + MALLOC(dsnode, char*, MAXPATHLEN, M_NAMEI, M_WAITOK); + if (dsnode) { + /* first try to map nfs4 domain to dsnode for scoped lookups */ + memset(dsnode, 0, MAXPATHLEN); + error = kauth_cred_nfs4domain2dsnode(nfs4domain, dsnode); + if (!error) { + /* Success! Make new id be id@dsnode */ + int dsnode_len = strnlen(dsnode, MAXPATHLEN); + int new_id_len = otw_id_2_at_len + dsnode_len + 1; + + MALLOC(new_id, char*, new_id_len, M_NAMEI, M_WAITOK); + if (new_id) { + (void)strlcpy(new_id, id, otw_id_2_at_len + 1); + (void)strlcpy(new_id + otw_id_2_at_len, dsnode, dsnode_len + 1); + id = new_id; + at = id; + while (*at++ != '@'); + at--; + } + } else { + /* Bummer:-( See if default nfs4 set for unscoped lookup */ + size_t default_domain_len = strnlen(nfs4_domain, MAXPATHLEN); + + if ((otw_domain_len == default_domain_len) && (strncmp(nfs4domain, nfs4_domain, otw_domain_len) == 0)) { + /* Woohoo! We have matching domains, do unscoped lookups */ + *at = '\0'; + } + } + FREE(dsnode, M_NAMEI); + } + + if (nfs_idmap_ctrl & NFS_IDMAP_CTRL_LOG_SUCCESSFUL_MAPPINGS) { + printf("nfs4_id2guid: after domain mapping id is %s\n", id); + } + } + + /* Now try to do actual id mapping */ if (nfs_idmap_ctrl & NFS_IDMAP_CTRL_USE_IDMAP_SERVICE) { /* * Ask the ID mapping service to map the ID string to a GUID. 
@@ -1223,6 +1269,14 @@ nfs4_id2guid(/*const*/ char *id, guid_t *guidp, int isgroup) } } + /* restore @ symbol in case we clobered for unscoped lookup */ + if (at && *at == '\0') + *at = '@'; + + /* free mapped domain id string */ + if (id == new_id) + FREE(id, M_NAMEI); + return (error); } @@ -1236,7 +1290,7 @@ nfs4_guid2id(guid_t *guidp, char *id, int *idlen, int isgroup) { int error1 = 0, error = 0, compare; int id1len, id2len, len; - char *id1buf, *id1; + char *id1buf, *id1, *at; char numbuf[32]; const char *id2 = NULL; @@ -1268,6 +1322,7 @@ nfs4_guid2id(guid_t *guidp, char *id, int *idlen, int isgroup) id1len = *idlen; } + memset(id1, 0, id1len); if (isgroup) error = kauth_cred_guid2grnam(guidp, id1); else @@ -1457,12 +1512,56 @@ nfs4_guid2id(guid_t *guidp, char *id, int *idlen, int isgroup) id, isgroup ? "G" : " ", error1, error); } } + + at = id; + while (at && at[0] != '@' && at[0] != '\0' && at++); + if (at && at[0] == '@' && at[1] != '\0') { + char *dsnode = at + 1; + int id_2_at_len = at - id + 1; + char *nfs4domain, *new_id; + MALLOC(nfs4domain, char*, MAXPATHLEN, M_NAMEI, M_WAITOK); + if (nfs4domain) { + int domain_len; + char *mapped_domain; + memset(nfs4domain, 0, MAXPATHLEN); + error = kauth_cred_dsnode2nfs4domain(dsnode, nfs4domain); + if (!error) { + domain_len = strnlen(nfs4domain, MAXPATHLEN); + mapped_domain = nfs4domain; + } else { + domain_len = strnlen(nfs4_domain, MAXPATHLEN); + mapped_domain = nfs4_domain; + } + if (domain_len) { + MALLOC(new_id, char*, MAXPATHLEN, M_NAMEI, M_WAITOK); + if (new_id) { + strlcpy(new_id, id, id_2_at_len + 1); + strlcpy(new_id + id_2_at_len, mapped_domain, domain_len + 1); + strlcpy(id, new_id, strnlen(new_id, MAXPATHLEN) + 1); + *idlen = strnlen(id, MAXPATHLEN); + FREE(new_id, M_NAMEI); + } + } + FREE(nfs4domain, M_NAMEI); + } + } else if (at && at[0] == '\0') { + int default_domain_len = strnlen(nfs4_domain, MAXPATHLEN); + + if (default_domain_len && MAXPATHLEN - *idlen > default_domain_len) { + at[0] = '@'; 
+ strlcpy(at + 1, nfs4_domain, default_domain_len + 1); + *idlen = strnlen(id, MAXPATHLEN); + } + } + + if (nfs_idmap_ctrl & NFS_IDMAP_CTRL_LOG_SUCCESSFUL_MAPPINGS) + printf("nfs4_guid2id: id after nfs4 domain map: %s[%d].\n", id, *idlen); + if (id1buf) FREE_ZONE(id1buf, MAXPATHLEN, M_NAMEI); return (error); } - /* * Set a vnode attr's supported bits according to the given bitmap */ diff --git a/bsd/nfs/nfs_bio.c b/bsd/nfs/nfs_bio.c index 926cc0ad7..acaf26c24 100644 --- a/bsd/nfs/nfs_bio.c +++ b/bsd/nfs/nfs_bio.c @@ -97,6 +97,8 @@ #include #include +#define NFS_BIO_DBG(...) NFS_DBG(NFS_FAC_BIO, 7, ## __VA_ARGS__) + kern_return_t thread_terminate(thread_t); /* XXX */ #define NFSBUFHASH(np, lbn) \ @@ -3797,6 +3799,28 @@ nfs_asyncio_finish(struct nfsreq *req) } } + /* + * If we got here while being on the resendq we need to get off. This + * happens when the timer fires and errors out requests from nfs_sigintr + * or we receive a reply (UDP case) while being on the resend queue so + * we're just finishing up and are not going to be resent. + */ + lck_mtx_lock(&req->r_mtx); + if (req->r_flags & R_RESENDQ) { + lck_mtx_lock(&nmp->nm_lock); + if (req->r_rchain.tqe_next != NFSREQNOLIST) { + NFS_BIO_DBG("Proccessing async request on resendq. Removing"); + TAILQ_REMOVE(&nmp->nm_resendq, req, r_rchain); + req->r_rchain.tqe_next = NFSREQNOLIST; + assert(req->r_refs > 1); + /* Remove resendq reference */ + req->r_refs--; + } + lck_mtx_unlock(&nmp->nm_lock); + req->r_flags &= ~R_RESENDQ; + } + lck_mtx_unlock(&req->r_mtx); + if (req->r_achain.tqe_next == NFSREQNOLIST) TAILQ_INSERT_TAIL(&nmp->nm_iodq, req, r_achain); diff --git a/bsd/nfs/nfs_boot.c b/bsd/nfs/nfs_boot.c index 7fcd73bee..c0c5877f6 100644 --- a/bsd/nfs/nfs_boot.c +++ b/bsd/nfs/nfs_boot.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* Copyright (c) 1995, 1997 NeXT Computer, Inc. All Rights Reserved */ @@ -347,8 +347,7 @@ nfs_boot_getfh(struct nfs_diskless *nd, int v3, int sotype) } static int -get_file_handle(ndmntp) - struct nfs_dlmount *ndmntp; +get_file_handle(struct nfs_dlmount *ndmntp) { char *sp, *dp, *endp; int error; @@ -440,10 +439,9 @@ struct bp_inaddr { * know about us (don't want to broadcast a getport call). 
*/ static int -bp_whoami(bpsin, my_ip, gw_ip) - struct sockaddr_in *bpsin; - struct in_addr *my_ip; - struct in_addr *gw_ip; +bp_whoami(struct sockaddr_in *bpsin, + struct in_addr *my_ip, + struct in_addr *gw_ip) { /* RPC structures for PMAPPROC_CALLIT */ struct whoami_call { @@ -576,12 +574,11 @@ bp_whoami(bpsin, my_ip, gw_ip) * server pathname */ static int -bp_getfile(bpsin, key, md_sin, serv_name, pathname) - struct sockaddr_in *bpsin; - const char *key; - struct sockaddr_in *md_sin; - char *serv_name; - char *pathname; +bp_getfile(struct sockaddr_in *bpsin, + const char *key, + struct sockaddr_in *md_sin, + char *serv_name, + char *pathname) { struct rpc_string *str; mbuf_t m; @@ -691,13 +688,12 @@ bp_getfile(bpsin, key, md_sin, serv_name, pathname) * Also, sets sin->sin_port to the NFS service port. */ static int -md_mount(mdsin, path, v3, sotype, fhp, fhlenp) - struct sockaddr_in *mdsin; /* mountd server address */ - char *path; - int v3; - int sotype; - u_char *fhp; - u_int32_t *fhlenp; +md_mount(struct sockaddr_in *mdsin, /* mountd server address */ + char *path, + int v3, + int sotype, + u_char *fhp, + u_int32_t *fhlenp) { /* The RPC structures */ struct rpc_string *str; diff --git a/bsd/nfs/nfs_gss.c b/bsd/nfs/nfs_gss.c index 0283fa918..53f4a08f2 100644 --- a/bsd/nfs/nfs_gss.c +++ b/bsd/nfs/nfs_gss.c @@ -109,7 +109,6 @@ #include #include #include -#include "nfs_gss_crypto.h" #include #include @@ -120,22 +119,6 @@ #define NFS_GSS_DBG(...) NFS_DBG(NFS_FAC_GSS, 7, ## __VA_ARGS__) #define NFS_GSS_ISDBG (NFS_DEBUG_FACILITY & NFS_FAC_GSS) -typedef struct { - int type; - union { - MD5_DESCBC_CTX m_ctx; - HMAC_SHA1_DES3KD_CTX h_ctx; - }; -} GSS_DIGEST_CTX; - -#define MAX_DIGEST SHA_DIGEST_LENGTH -#ifdef NFS_KERNEL_DEBUG -#define HASHLEN(ki) (((ki)->hash_len > MAX_DIGEST) ? \ - (panic("nfs_gss.c:%d ki->hash_len is invalid = %d\n", __LINE__, (ki)->hash_len), MAX_DIGEST) : (ki)->hash_len) -#else -#define HASHLEN(ki) (((ki)->hash_len > MAX_DIGEST) ? 
\ - (printf("nfs_gss.c:%d ki->hash_len is invalid = %d\n", __LINE__, (ki)->hash_len), MAX_DIGEST) : (ki)->hash_len) -#endif #if NFSSERVER u_long nfs_gss_svc_ctx_hash; @@ -148,45 +131,11 @@ uint32_t nfsrv_gss_context_ttl = GSS_CTX_EXPIRE; #if NFSCLIENT lck_grp_t *nfs_gss_clnt_grp; -int nfs_single_des; #endif /* NFSCLIENT */ -/* - * These octet strings are used to encode/decode ASN.1 tokens - * in the RPCSEC_GSS verifiers. - */ -static u_char krb5_tokhead[] __attribute__((unused)) = { 0x60, 0x23 }; - u_char krb5_mech[11] = { 0x06, 0x09, 0x2a, 0x86, 0x48, 0x86, 0xf7, 0x12, 0x01, 0x02, 0x02 }; -static u_char krb5_mic[] = { 0x01, 0x01, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff }; -static u_char krb5_mic3[] = { 0x01, 0x01, 0x04, 0x00, 0xff, 0xff, 0xff, 0xff }; -static u_char krb5_wrap[] = { 0x02, 0x01, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff }; -static u_char krb5_wrap3[] = { 0x02, 0x01, 0x04, 0x00, 0x02, 0x00, 0xff, 0xff }; -static u_char iv0[] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; // DES MAC Initialization Vector - -#define ALG_MIC(ki) (((ki)->type == NFS_GSS_1DES) ? krb5_mic : krb5_mic3) -#define ALG_WRAP(ki) (((ki)->type == NFS_GSS_1DES) ? krb5_wrap : krb5_wrap3) - -/* - * The size of the Kerberos v5 ASN.1 token - * in the verifier. - * - * Note that the second octet of the krb5_tokhead (0x23) is a - * DER-encoded size field that has variable length. If the size - * is 128 bytes or greater, then it uses two bytes, three bytes - * if 65536 or greater, and so on. Since the MIC tokens are - * separate from the data, the size is always the same: 35 bytes (0x23). - * However, the wrap token is different. Its size field includes the - * size of the token + the encrypted data that follows. So the size - * field may be two, three or four bytes. 
- */ -#define KRB5_SZ_TOKHEAD sizeof(krb5_tokhead) -#define KRB5_SZ_MECH sizeof(krb5_mech) -#define KRB5_SZ_ALG sizeof(krb5_mic) // 8 - same as krb5_wrap -#define KRB5_SZ_SEQ 8 -#define KRB5_SZ_EXTRA 3 // a wrap token may be longer by up to this many octets -#define KRB5_SZ_TOKEN_NOSUM (KRB5_SZ_TOKHEAD + KRB5_SZ_MECH + KRB5_SZ_ALG + KRB5_SZ_SEQ) -#define KRB5_SZ_TOKEN(cksumlen) ((cksumlen) + KRB5_SZ_TOKEN_NOSUM) -#define KRB5_SZ_TOKMAX(cksumlen) (KRB5_SZ_TOKEN(cksumlen) + KRB5_SZ_EXTRA) +#define KRB5_MAX_MIC_SIZE 128 +uint8_t krb5_mech_oid[11] = { 0x06, 0x09, 0x2a, 0x86, 0x48, 0x86, 0xf7, 0x12, 0x01, 0x02, 0x02 }; +static uint8_t xdrpad[] = { 0x00, 0x00, 0x00, 0x00}; #if NFSCLIENT static int nfs_gss_clnt_ctx_find(struct nfsreq *); @@ -194,10 +143,10 @@ static int nfs_gss_clnt_ctx_init(struct nfsreq *, struct nfs_gss_clnt_ctx *); static int nfs_gss_clnt_ctx_init_retry(struct nfsreq *, struct nfs_gss_clnt_ctx *); static int nfs_gss_clnt_ctx_callserver(struct nfsreq *, struct nfs_gss_clnt_ctx *); static uint8_t *nfs_gss_clnt_svcname(struct nfsmount *, gssd_nametype *, uint32_t *); -static int nfs_gss_clnt_gssd_upcall(struct nfsreq *, struct nfs_gss_clnt_ctx *); +static int nfs_gss_clnt_gssd_upcall(struct nfsreq *, struct nfs_gss_clnt_ctx *, uint32_t); void nfs_gss_clnt_ctx_neg_cache_reap(struct nfsmount *); static void nfs_gss_clnt_ctx_clean(struct nfs_gss_clnt_ctx *); -static int nfs_gss_clnt_ctx_copy(struct nfs_gss_clnt_ctx *, struct nfs_gss_clnt_ctx **, gss_key_info *); +static int nfs_gss_clnt_ctx_copy(struct nfs_gss_clnt_ctx *, struct nfs_gss_clnt_ctx **); static void nfs_gss_clnt_ctx_destroy(struct nfs_gss_clnt_ctx *); static void nfs_gss_clnt_log_error(struct nfsreq *, struct nfs_gss_clnt_ctx *, uint32_t, uint32_t); #endif /* NFSCLIENT */ @@ -214,26 +163,10 @@ static void host_release_special_port(mach_port_t); static mach_port_t host_copy_special_port(mach_port_t); static void nfs_gss_mach_alloc_buffer(u_char *, uint32_t, vm_map_copy_t *); static int 
nfs_gss_mach_vmcopyout(vm_map_copy_t, uint32_t, u_char *); -static int nfs_gss_token_get(gss_key_info *ki, u_char *, u_char *, int, uint32_t *, u_char *); -static int nfs_gss_token_put(gss_key_info *ki, u_char *, u_char *, int, int, u_char *); -static int nfs_gss_der_length_size(int); -static void nfs_gss_der_length_put(u_char **, int); -static int nfs_gss_der_length_get(u_char **); + static int nfs_gss_mchain_length(mbuf_t); static int nfs_gss_append_chain(struct nfsm_chain *, mbuf_t); static void nfs_gss_nfsm_chain(struct nfsm_chain *, mbuf_t); -static void nfs_gss_cksum_mchain(gss_key_info *, mbuf_t, u_char *, int, int, u_char *); -static void nfs_gss_cksum_chain(gss_key_info *, struct nfsm_chain *, u_char *, int, int, u_char *); -static void nfs_gss_cksum_rep(gss_key_info *, uint32_t, u_char *); -static void nfs_gss_encrypt_mchain(gss_key_info *, mbuf_t, int, int, int); -static void nfs_gss_encrypt_chain(gss_key_info *, struct nfsm_chain *, int, int, int); - -static void gss_digest_Init(GSS_DIGEST_CTX *, gss_key_info *); -static void gss_digest_Update(GSS_DIGEST_CTX *, void *, size_t); -static void gss_digest_Final(GSS_DIGEST_CTX *, void *); -static void gss_des_crypt(gss_key_info *, des_cblock *, des_cblock *, - int32_t, des_cblock *, des_cblock *, int, int); -static int gss_key_init(gss_key_info *, uint32_t); #if NFSSERVER thread_call_t nfs_gss_svc_ctx_timer_call; @@ -262,8 +195,291 @@ nfs_gss_init(void) #endif /* NFSSERVER */ } +/* + * Common RPCSEC_GSS support routines + */ + +static errno_t +rpc_gss_prepend_32(mbuf_t *mb, uint32_t value) +{ + int error; + uint32_t *data; + +#if 0 + data = mbuf_data(*mb); + /* + * If a wrap token comes back and is not aligned + * get a new buffer (which should be aligned) to put the + * length in.
+ */ + if ((uintptr_t)data & 0x3) { + mbuf_t nmb; + + error = mbuf_get(MBUF_WAITOK, MBUF_TYPE_DATA, &nmb); + if (error) + return (error); + mbuf_setnext(nmb, *mb); + *mb = nmb; + } +#endif + error = mbuf_prepend(mb, sizeof(uint32_t), MBUF_WAITOK); + if (error) + return (error); + + data = mbuf_data(*mb); + *data = txdr_unsigned(value); + + return (0); +} + +/* + * Prepend the sequence number to the xdr encoded argument or result + * Sequence number is prepended in its own mbuf. + * + * On successful return mbp_head will point to the old mbuf chain + * prepended with a new mbuf that has the sequence number. + */ + +static errno_t +rpc_gss_data_create(mbuf_t *mbp_head, uint32_t seqnum) +{ + int error; + mbuf_t mb; + struct nfsm_chain nmc; + struct nfsm_chain *nmcp = &nmc; + uint8_t *data; + + error = mbuf_get(MBUF_WAITOK, MBUF_TYPE_DATA, &mb); + if (error) + return (error); + data = mbuf_data(mb); +#if 0 + /* Reserve space for prepending */ + len = mbuf_maxlen(mb); + len = (len & ~0x3) - NFSX_UNSIGNED; + printf("%s: data = %p, len = %d\n", __func__, data, (int)len); + error = mbuf_setdata(mb, data + len, 0); + if (error || mbuf_trailingspace(mb)) + printf("%s: data = %p trailingspace = %d error = %d\n", __func__, mbuf_data(mb), (int)mbuf_trailingspace(mb), error); +#endif + /* Reserve 16 words for prepending. NOTE(review): mb does not appear to be freed on the EINVAL return below - confirm */ + error = mbuf_setdata(mb, data + 16*sizeof(uint32_t), 0); + nfsm_chain_init(nmcp, mb); + nfsm_chain_add_32(error, nmcp, seqnum); + nfsm_chain_build_done(error, nmcp); + if (error) + return (EINVAL); + mbuf_setnext(nmcp->nmc_mcur, *mbp_head); + *mbp_head = nmcp->nmc_mhead; + + return (0); +} + +/* + * Create an rpc_gss_integ_data_t given an argument or result in mb_head. + * On successful return mb_head will point to the rpc_gss_integ_data_t of length len. + * Note mb_head will now point to a 4 byte sequence number. len does not include + * any extra xdr padding.
+ * Returns 0 on success, else an errno_t + */ + +static errno_t +rpc_gss_integ_data_create(gss_ctx_id_t ctx, mbuf_t *mb_head, uint32_t seqnum, uint32_t *len) +{ + uint32_t error; + uint32_t major; + uint32_t length; + gss_buffer_desc mic; + struct nfsm_chain nmc; + + /* Length of the argument or result */ + length = nfs_gss_mchain_length(*mb_head); + if (len) + *len = length; + error = rpc_gss_data_create(mb_head, seqnum); + if (error) + return (error); + + /* + * length is the length of the rpc_gss_data + */ + length += NFSX_UNSIGNED; /* Add the sequence number to the length */ + major = gss_krb5_get_mic_mbuf(&error, ctx, 0, *mb_head, 0, length, &mic); + if (major != GSS_S_COMPLETE) { + printf("gss_krb5_get_mic_mbuf failed %d\n", error); + return (error); + } + + error = rpc_gss_prepend_32(mb_head, length); + if (error) + return (error); + + nfsm_chain_dissect_init(error, &nmc, *mb_head); + /* Append GSS mic token by advancing rpc_gss_data_t length + NFSX_UNSIGNED (size of the length field) */ + nfsm_chain_adv(error, &nmc, length + NFSX_UNSIGNED); + nfsm_chain_finish_mbuf(error, &nmc); // Force the mic into its own sub chain. + nfsm_chain_add_32(error, &nmc, mic.length); + nfsm_chain_add_opaque(error, &nmc, mic.value, mic.length); + nfsm_chain_build_done(error, &nmc); + gss_release_buffer(NULL, &mic); + +// printmbuf("rpc_gss_integ_data_create done", *mb_head, 0, 0); + assert(nmc.nmc_mhead == *mb_head); + + return (error); +} + +/* + * Create an rpc_gss_priv_data_t out of the supplied raw arguments or results in mb_head. + * On successful return mb_head will point to a wrap token of length len.
+ * Note len does not include any xdr padding + * Returns 0 on success, else an errno_t + */ +static errno_t +rpc_gss_priv_data_create(gss_ctx_id_t ctx, mbuf_t *mb_head, uint32_t seqnum, uint32_t *len) +{ + uint32_t error; + uint32_t major; + struct nfsm_chain nmc; + uint32_t pad; + uint32_t length; + + error = rpc_gss_data_create(mb_head, seqnum); + if (error) + return (error); + + length = nfs_gss_mchain_length(*mb_head); + major = gss_krb5_wrap_mbuf(&error, ctx, 1, 0, mb_head, 0, length, NULL); + if (major != GSS_S_COMPLETE) + return (error); + + length = nfs_gss_mchain_length(*mb_head); + if (len) + *len = length; + pad = nfsm_pad(length); + + /* Prepend the opaque length of rep rpc_gss_priv_data */ + error = rpc_gss_prepend_32(mb_head, length); + + if (error) + return (error); + if (pad) { + nfsm_chain_dissect_init(error, &nmc, *mb_head); + /* Advance the opaque size of length and length data */ + nfsm_chain_adv(error, &nmc, NFSX_UNSIGNED + length); + nfsm_chain_finish_mbuf(error, &nmc); + nfsm_chain_add_opaque_nopad(error, &nmc, xdrpad, pad); + nfsm_chain_build_done(error, &nmc); + } + + return (error); +} + #if NFSCLIENT +/* + * Restore the argument or result from an rpc_gss_integ_data mbuf chain + * We have a four byte sequence number, len arguments, and an opaque + * encoded mic, possibly followed by some pad bytes. The mic and possible + * pad bytes are on their own sub mbuf chains. + * + * On successful return mb_head is the chain of the xdr args or results sans + * the sequence number and mic and return 0. Otherwise return an errno. + * + */ +static errno_t +rpc_gss_integ_data_restore(gss_ctx_id_t ctx __unused, mbuf_t *mb_head, size_t len) +{ + mbuf_t mb = *mb_head; + mbuf_t tail = NULL, next; + + /* Chop off the opaque length and seq number */ + mbuf_adj(mb, 2 * NFSX_UNSIGNED); + + /* should only be one, ...
but */ + for (; mb; mb = next) { + next = mbuf_next(mb); + if (mbuf_len(mb) == 0) + mbuf_free(mb); + else + break; + } + *mb_head = mb; + + for (; mb && len; mb = mbuf_next(mb)) { + tail = mb; + if (mbuf_len(mb) <= len) + len -= mbuf_len(mb); + else + return (EBADRPC); + } + /* drop the mic */ + if (tail) { + mbuf_setnext(tail, NULL); + mbuf_freem(mb); + } + + return (0); +} + +/* + * Restore the argument or result from an rpc_gss_priv_data mbuf chain + * mb_head points to the wrap token of length len. + * + * On successful return mb_head is our original xdr arg or result and + * the return value is 0. Otherwise return an errno + */ +static errno_t +rpc_gss_priv_data_restore(gss_ctx_id_t ctx, mbuf_t *mb_head, size_t len) +{ + uint32_t major, error; + mbuf_t mb = *mb_head, next; + uint32_t plen; + size_t length; + gss_qop_t qop = GSS_C_QOP_REVERSE; + + /* Chop off the opaque length */ + mbuf_adj(mb, NFSX_UNSIGNED); + /* If we have padding, drop it */ + plen = nfsm_pad(len); + if (plen) { + mbuf_t tail = NULL; + + for(length = 0; length < len && mb; mb = mbuf_next(mb)) { + tail = mb; + length += mbuf_len(mb); + } + if ((length != len) || (mb == NULL) || (tail == NULL)) + return (EBADRPC); + + mbuf_freem(mb); + mbuf_setnext(tail, NULL); + } + + major = gss_krb5_unwrap_mbuf(&error, ctx, mb_head, 0, len, NULL, &qop); + if (major != GSS_S_COMPLETE) { + printf("gss_krb5_unwrap_mbuf failed. major = %d minor = %d\n", (int)major, error); + return (error); + } + mb = *mb_head; + + /* Drop the sequence number */ + mbuf_adj(mb, NFSX_UNSIGNED); + assert(mbuf_len(mb) == 0); + + /* Chop off any empty mbufs */ + for (mb = *mb_head; mb; mb = next) { + next = mbuf_next(mb); + if (mbuf_len(mb) == 0) + mbuf_free(mb); + else + break; + } + *mb_head = mb; + + return (0); +} + /* * Find the context for a particular user.
* @@ -423,7 +639,6 @@ nfs_gss_clnt_ctx_find_principal(struct nfsreq *req, uint8_t *principal, uint32_t struct nfsreq treq; int error = 0; struct timeval now; - gss_key_info *ki; char CTXBUF[NFS_CTXBUFSZ]; bzero(&treq, sizeof (struct nfsreq)); @@ -487,7 +702,7 @@ nfs_gss_clnt_ctx_find_principal(struct nfsreq *req, uint8_t *principal, uint32_t cp->gss_clnt_flags |= GSS_CTX_DESTROY; NFS_GSS_DBG("Context %s has expired but we still have %d references\n", NFS_GSS_CTX(req, cp), cp->gss_clnt_refcnt); - error = nfs_gss_clnt_ctx_copy(cp, &ncp, NULL); + error = nfs_gss_clnt_ctx_copy(cp, &ncp); lck_mtx_unlock(cp->gss_clnt_mtx); if (error) { lck_mtx_unlock(&nmp->nm_lock); @@ -496,11 +711,6 @@ nfs_gss_clnt_ctx_find_principal(struct nfsreq *req, uint8_t *principal, uint32_t cp = ncp; break; } else { - /* cp->gss_clnt_kinfo should be NULL here */ - if (cp->gss_clnt_kinfo) { - FREE(cp->gss_clnt_kinfo, M_TEMP); - cp->gss_clnt_kinfo = NULL; - } if (cp->gss_clnt_nctime) nmp->nm_ncentries--; lck_mtx_unlock(cp->gss_clnt_mtx); @@ -536,12 +746,6 @@ nfs_gss_clnt_ctx_find_principal(struct nfsreq *req, uint8_t *principal, uint32_t } } - MALLOC(ki, gss_key_info *, sizeof (gss_key_info), M_TEMP, M_WAITOK|M_ZERO); - if (ki == NULL) { - lck_mtx_unlock(&nmp->nm_lock); - return (ENOMEM); - } - NFS_GSS_DBG("Context %s%sfound in Neg Cache @ %ld\n", NFS_GSS_CTX(req, cp), cp == NULL ? 
" not " : "", @@ -557,7 +761,6 @@ nfs_gss_clnt_ctx_find_principal(struct nfsreq *req, uint8_t *principal, uint32_t lck_mtx_unlock(&nmp->nm_lock); return (ENOMEM); } - cp->gss_clnt_kinfo = ki; cp->gss_clnt_cred = req->r_cred; kauth_cred_ref(cp->gss_clnt_cred); cp->gss_clnt_mtx = lck_mtx_alloc_init(nfs_gss_clnt_grp, LCK_ATTR_NULL); @@ -571,7 +774,6 @@ nfs_gss_clnt_ctx_find_principal(struct nfsreq *req, uint8_t *principal, uint32_t nfs_gss_clnt_mnt_ref(nmp); } } else { - cp->gss_clnt_kinfo = ki; nfs_gss_clnt_ctx_clean(cp); if (principal) { /* @@ -619,20 +821,17 @@ nfs_gss_clnt_ctx_find(struct nfsreq *req) * to build the verifier which contains a signed checksum * of the RPC header. */ + int nfs_gss_clnt_cred_put(struct nfsreq *req, struct nfsm_chain *nmc, mbuf_t args) { struct nfs_gss_clnt_ctx *cp; uint32_t seqnum = 0; - int error = 0; - int slpflag, recordmark = 0; - int start, len, offset = 0; - int pad, toklen; - struct nfsm_chain nmc_tmp; + uint32_t major; + uint32_t error = 0; + int slpflag, recordmark = 0, offset; struct gss_seq *gsp; - u_char tokbuf[KRB5_SZ_TOKMAX(MAX_DIGEST)]; - u_char cksum[MAX_DIGEST]; - gss_key_info *ki; + gss_buffer_desc mic; slpflag = (PZERO-1); if (req->r_nmp) { @@ -671,7 +870,6 @@ nfs_gss_clnt_cred_put(struct nfsreq *req, struct nfsm_chain *nmc, mbuf_t args) } lck_mtx_unlock(cp->gss_clnt_mtx); - ki = cp->gss_clnt_kinfo; if (cp->gss_clnt_flags & GSS_CTX_COMPLETE) { /* * Get a sequence number for this request. @@ -742,12 +940,17 @@ nfs_gss_clnt_cred_put(struct nfsreq *req, struct nfsm_chain *nmc, mbuf_t args) offset = recordmark ? 
NFSX_UNSIGNED : 0; // record mark nfsm_chain_build_done(error, nmc); - nfs_gss_cksum_chain(ki, nmc, ALG_MIC(ki), offset, 0, cksum); - toklen = nfs_gss_token_put(ki, ALG_MIC(ki), tokbuf, 1, 0, cksum); + major = gss_krb5_get_mic_mbuf((uint32_t *)&error, cp->gss_clnt_ctx_id, 0, nmc->nmc_mhead, offset, 0, &mic); + if (major != GSS_S_COMPLETE) { + printf ("gss_krb5_get_mic_buf failed %d\n", error); + return (error); + } + nfsm_chain_add_32(error, nmc, RPCSEC_GSS); // flavor - nfsm_chain_add_32(error, nmc, toklen); // length - nfsm_chain_add_opaque(error, nmc, tokbuf, toklen); + nfsm_chain_add_32(error, nmc, mic.length); // length + nfsm_chain_add_opaque(error, nmc, mic.value, mic.length); + (void)gss_release_buffer(NULL, &mic); nfsm_chain_build_done(error, nmc); if (error) return (error); @@ -758,85 +961,45 @@ nfs_gss_clnt_cred_put(struct nfsreq *req, struct nfsm_chain *nmc, mbuf_t args) */ switch (cp->gss_clnt_service) { case RPCSEC_GSS_SVC_NONE: - nfs_gss_append_chain(nmc, args); + if (args) + nfs_gss_append_chain(nmc, args); break; case RPCSEC_GSS_SVC_INTEGRITY: - len = nfs_gss_mchain_length(args); // Find args length - req->r_gss_arglen = len; // Stash the args len - len += NFSX_UNSIGNED; // Add seqnum length - nfsm_chain_add_32(error, nmc, len); // and insert it - start = nfsm_chain_offset(nmc); - nfsm_chain_add_32(error, nmc, seqnum); // Insert seqnum - req->r_gss_argoff = nfsm_chain_offset(nmc); // Offset to args - nfsm_chain_build_done(error, nmc); + /* + * r_gss_arglen is the length of args mbuf going into the routine. + * Its used to find the mic if we need to restore the args. 
+ */ + /* Note the mbufs that were used in r_mrest are being encapsulated in the rpc_gss_integ_data_t */ + assert(req->r_mrest == args); + nfsm_chain_finish_mbuf(error, nmc); if (error) return (error); - nfs_gss_append_chain(nmc, args); // Append the args mbufs - - /* Now compute a checksum over the seqnum + args */ - nfs_gss_cksum_chain(ki, nmc, ALG_MIC(ki), start, len, cksum); - - /* Insert it into a token and append to the request */ - toklen = nfs_gss_token_put(ki, ALG_MIC(ki), tokbuf, 1, 0, cksum); - nfsm_chain_finish_mbuf(error, nmc); // force checksum into new mbuf - nfsm_chain_add_32(error, nmc, toklen); - nfsm_chain_add_opaque(error, nmc, tokbuf, toklen); - nfsm_chain_build_done(error, nmc); + error = rpc_gss_integ_data_create(cp->gss_clnt_ctx_id, &args, seqnum, &req->r_gss_arglen); + if (error) + break; + req->r_mrest = args; + req->r_gss_argoff = nfsm_chain_offset(nmc); + nfs_gss_append_chain(nmc, args); break; case RPCSEC_GSS_SVC_PRIVACY: - /* Prepend a new mbuf with the confounder & sequence number */ - nfsm_chain_build_alloc_init(error, &nmc_tmp, 3 * NFSX_UNSIGNED); - nfsm_chain_add_32(error, &nmc_tmp, random()); // confounder bytes 1-4 - nfsm_chain_add_32(error, &nmc_tmp, random()); // confounder bytes 4-8 - nfsm_chain_add_32(error, &nmc_tmp, seqnum); - nfsm_chain_build_done(error, &nmc_tmp); - if (error) - return (error); - nfs_gss_append_chain(&nmc_tmp, args); // Append the args mbufs - - len = nfs_gss_mchain_length(args); // Find args length - len += 3 * NFSX_UNSIGNED; // add confounder & seqnum - req->r_gss_arglen = len; // Stash length - /* - * Append a pad trailer - per RFC 1964 section 1.2.2.3 - * Since XDR data is always 32-bit aligned, it - * needs to be padded either by 4 bytes or 8 bytes. + * r_gss_arglen is the length of the wrap token sans any padding length. + * Its used to find any XDR padding of the wrap token. 
*/ - nfsm_chain_finish_mbuf(error, &nmc_tmp); // force padding into new mbuf - if (len % 8 > 0) { - nfsm_chain_add_32(error, &nmc_tmp, 0x04040404); - len += NFSX_UNSIGNED; - } else { - nfsm_chain_add_32(error, &nmc_tmp, 0x08080808); - nfsm_chain_add_32(error, &nmc_tmp, 0x08080808); - len += 2 * NFSX_UNSIGNED; - } - nfsm_chain_build_done(error, &nmc_tmp); - - /* Now compute a checksum over the confounder + seqnum + args */ - nfs_gss_cksum_chain(ki, &nmc_tmp, ALG_WRAP(ki), 0, len, cksum); - - /* Insert it into a token */ - toklen = nfs_gss_token_put(ki, ALG_WRAP(ki), tokbuf, 1, len, cksum); - nfsm_chain_add_32(error, nmc, toklen + len); // token + args length - nfsm_chain_add_opaque_nopad(error, nmc, tokbuf, toklen); - req->r_gss_argoff = nfsm_chain_offset(nmc); // Stash offset - nfsm_chain_build_done(error, nmc); + /* Note the mbufs that were used in r_mrest are being encapsulated in the rpc_gss_priv_data_t */ + assert(req->r_mrest == args); + nfsm_chain_finish_mbuf(error, nmc); if (error) return (error); - nfs_gss_append_chain(nmc, nmc_tmp.nmc_mhead); // Append the args mbufs - - /* Finally, encrypt the args */ - nfs_gss_encrypt_chain(ki, &nmc_tmp, 0, len, DES_ENCRYPT); - - /* Add null XDR pad if the ASN.1 token misaligned the data */ - pad = nfsm_pad(toklen + len); - if (pad > 0) { - nfsm_chain_add_opaque_nopad(error, nmc, iv0, pad); - nfsm_chain_build_done(error, nmc); - } + error = rpc_gss_priv_data_create(cp->gss_clnt_ctx_id, &args, seqnum, &req->r_gss_arglen); + if (error) + break; + req->r_mrest = args; + req->r_gss_argoff = nfsm_chain_offset(nmc); + nfs_gss_append_chain(nmc, args); break; + default: + return (EINVAL); } return (error); @@ -858,17 +1021,18 @@ nfs_gss_clnt_verf_get( uint32_t verflen, uint32_t *accepted_statusp) { - u_char tokbuf[KRB5_SZ_TOKMAX(MAX_DIGEST)]; - u_char cksum1[MAX_DIGEST], cksum2[MAX_DIGEST]; + gss_buffer_desc cksum; uint32_t seqnum = 0; + uint32_t major; struct nfs_gss_clnt_ctx *cp = req->r_gss_ctx; struct nfsm_chain nmc_tmp; 
struct gss_seq *gsp; - uint32_t reslen, start, cksumlen, toklen; + uint32_t reslen, offset; int error = 0; - gss_key_info *ki = cp->gss_clnt_kinfo; + mbuf_t results_mbuf, prev_mbuf, pad_mbuf; + size_t ressize; - reslen = cksumlen = 0; + reslen = 0; *accepted_statusp = 0; if (cp == NULL) @@ -905,24 +1069,25 @@ nfs_gss_clnt_verf_get( MALLOC(cp->gss_clnt_verf, u_char *, verflen, M_TEMP, M_WAITOK|M_ZERO); if (cp->gss_clnt_verf == NULL) return (ENOMEM); + cp->gss_clnt_verflen = verflen; nfsm_chain_get_opaque(error, nmc, verflen, cp->gss_clnt_verf); nfsm_chain_get_32(error, nmc, *accepted_statusp); return (error); } - if (verflen != KRB5_SZ_TOKEN(ki->hash_len)) - return (NFSERR_EAUTH); + if (verflen > KRB5_MAX_MIC_SIZE) + return (EBADRPC); + cksum.length = verflen; + MALLOC(cksum.value, void *, verflen, M_TEMP, M_WAITOK); /* - * Get the 8 octet sequence number - * checksum out of the verifier token. + * Get the gss mic */ - nfsm_chain_get_opaque(error, nmc, verflen, tokbuf); - if (error) - goto nfsmout; - error = nfs_gss_token_get(ki, ALG_MIC(ki), tokbuf, 0, NULL, cksum1); - if (error) + nfsm_chain_get_opaque(error, nmc, verflen, cksum.value); + if (error) { + FREE(cksum.value, M_TEMP); goto nfsmout; + } /* * Search the request sequence numbers for this reply, starting @@ -930,10 +1095,16 @@ nfs_gss_clnt_verf_get( * the one in the verifier returned by the server. 
*/ SLIST_FOREACH(gsp, &req->r_gss_seqlist, gss_seqnext) { - nfs_gss_cksum_rep(ki, gsp->gss_seqnum, cksum2); - if (bcmp(cksum1, cksum2, HASHLEN(ki)) == 0) + gss_buffer_desc seqnum_buf; + uint32_t network_seqnum = htonl(gsp->gss_seqnum); + + seqnum_buf.length = sizeof(network_seqnum); + seqnum_buf.value = &network_seqnum; + major = gss_krb5_verify_mic(NULL, cp->gss_clnt_ctx_id, &seqnum_buf, &cksum, NULL); + if (major == GSS_S_COMPLETE) break; } + FREE(cksum.value, M_TEMP); if (gsp == NULL) return (NFSERR_EAUTH); @@ -954,29 +1125,49 @@ nfs_gss_clnt_verf_get( break; case RPCSEC_GSS_SVC_INTEGRITY: /* - * Here's what we expect in the integrity results: + * Here's what we expect in the integrity results from RFC 2203: * * - length of seq num + results (4 bytes) * - sequence number (4 bytes) * - results (variable bytes) - * - length of checksum token (37) - * - checksum of seqnum + results (37 bytes) + * - length of checksum token + * - checksum of seqnum + results */ + nfsm_chain_get_32(error, nmc, reslen); // length of results if (reslen > NFS_MAXPACKET) { error = EBADRPC; goto nfsmout; } - /* Compute a checksum over the sequence number + results */ - start = nfsm_chain_offset(nmc); - nfs_gss_cksum_chain(ki, nmc, ALG_MIC(ki), start, reslen, cksum1); + /* Advance and fetch the mic */ + nmc_tmp = *nmc; + nfsm_chain_adv(error, &nmc_tmp, reslen); // skip over the results + nfsm_chain_get_32(error, &nmc_tmp, cksum.length); + MALLOC(cksum.value, void *, cksum.length, M_TEMP, M_WAITOK); + nfsm_chain_get_opaque(error, &nmc_tmp, cksum.length, cksum.value); + //XXX chop offf the cksum? 
+ + /* Call verify mic */ + offset = nfsm_chain_offset(nmc); + major = gss_krb5_verify_mic_mbuf((uint32_t *)&error, cp->gss_clnt_ctx_id, nmc->nmc_mhead, offset, reslen, &cksum, NULL); + FREE(cksum.value, M_TEMP); + if (major != GSS_S_COMPLETE) { + printf("client results: gss_krb5_verify_mic_mbuf failed %d\n", error); + error = EBADRPC; + goto nfsmout; + } /* * Get the sequence number prepended to the results - * and compare it against the list in the request. + * and compare it against the header. */ nfsm_chain_get_32(error, nmc, seqnum); + if (gsp->gss_seqnum != seqnum) { + error = EBADRPC; + goto nfsmout; + } +#if 0 SLIST_FOREACH(gsp, &req->r_gss_seqlist, gss_seqnext) { if (seqnum == gsp->gss_seqnum) break; @@ -985,79 +1176,60 @@ nfs_gss_clnt_verf_get( error = EBADRPC; goto nfsmout; } - - /* - * Advance to the end of the results and - * fetch the checksum computed by the server. - */ - nmc_tmp = *nmc; - reslen -= NFSX_UNSIGNED; // already skipped seqnum - nfsm_chain_adv(error, &nmc_tmp, reslen); // skip over the results - nfsm_chain_get_32(error, &nmc_tmp, cksumlen); // length of checksum - if (cksumlen != KRB5_SZ_TOKEN(ki->hash_len)) { - error = EBADRPC; - goto nfsmout; - } - nfsm_chain_get_opaque(error, &nmc_tmp, cksumlen, tokbuf); - if (error) - goto nfsmout; - error = nfs_gss_token_get(ki, ALG_MIC(ki), tokbuf, 0, NULL, cksum2); - if (error) - goto nfsmout; - - /* Verify that the checksums are the same */ - if (bcmp(cksum1, cksum2, HASHLEN(ki)) != 0) { - error = EBADRPC; - goto nfsmout; - } +#endif break; case RPCSEC_GSS_SVC_PRIVACY: /* * Here's what we expect in the privacy results: * - * - length of confounder + seq num + token + results - * - wrap token (37-40 bytes) - * - confounder (8 bytes) - * - sequence number (4 bytes) - * - results (encrypted) + * opaque encodeing of the wrap token + * - length of wrap token + * - wrap token */ + prev_mbuf = nmc->nmc_mcur; nfsm_chain_get_32(error, nmc, reslen); // length of results - if (reslen > NFS_MAXPACKET) { + 
if (reslen == 0 || reslen > NFS_MAXPACKET) { error = EBADRPC; goto nfsmout; } - /* Get the token that prepends the encrypted results */ - nfsm_chain_get_opaque(error, nmc, KRB5_SZ_TOKMAX(ki->hash_len), tokbuf); - if (error) - goto nfsmout; - error = nfs_gss_token_get(ki, ALG_WRAP(ki), tokbuf, 0, - &toklen, cksum1); + /* Get the wrap token (current mbuf in the chain starting at the current offset) */ + offset = nmc->nmc_ptr - (caddr_t)mbuf_data(nmc->nmc_mcur); + + /* split out the wrap token */ + ressize = reslen; + error = gss_normalize_mbuf(nmc->nmc_mcur, offset, &ressize, &results_mbuf, &pad_mbuf, 0); if (error) goto nfsmout; - nfsm_chain_reverse(nmc, nfsm_pad(toklen)); - reslen -= toklen; // size of confounder + seqnum + results - /* decrypt the confounder + sequence number + results */ - start = nfsm_chain_offset(nmc); - nfs_gss_encrypt_chain(ki, nmc, start, reslen, DES_DECRYPT); - - /* Compute a checksum over the confounder + sequence number + results */ - nfs_gss_cksum_chain(ki, nmc, ALG_WRAP(ki), start, reslen, cksum2); + if (pad_mbuf) { + assert(nfsm_pad(reslen) == mbuf_len(pad_mbuf)); + mbuf_free(pad_mbuf); + } - /* Verify that the checksums are the same */ - if (bcmp(cksum1, cksum2, HASHLEN(ki)) != 0) { - error = EBADRPC; + major = gss_krb5_unwrap_mbuf((uint32_t *)&error, cp->gss_clnt_ctx_id, &results_mbuf, 0, ressize, NULL, NULL); + if (major) { + printf("%s unwraped failed %d\n", __func__, error); goto nfsmout; } - nfsm_chain_adv(error, nmc, 8); // skip over the confounder + /* Now replace the wrapped arguments with the unwrapped ones */ + mbuf_setnext(prev_mbuf, results_mbuf); + nmc->nmc_mcur = results_mbuf; + nmc->nmc_ptr = mbuf_data(results_mbuf); + nmc->nmc_left = mbuf_len(results_mbuf); /* * Get the sequence number prepended to the results - * and compare it against the list in the request. 
+ * and compare it against the header */ nfsm_chain_get_32(error, nmc, seqnum); + if (gsp->gss_seqnum != seqnum) { + printf("%s bad seqnum\n", __func__); + error = EBADRPC; + goto nfsmout; + } +#if 0 SLIST_FOREACH(gsp, &req->r_gss_seqlist, gss_seqnext) { if (seqnum == gsp->gss_seqnum) break; @@ -1066,7 +1238,7 @@ nfs_gss_clnt_verf_get( error = EBADRPC; goto nfsmout; } - +#endif break; } nfsmout: @@ -1077,9 +1249,10 @@ nfs_gss_clnt_verf_get( * An RPCSEC_GSS request with no integrity or privacy consists * of just the header mbufs followed by the arg mbufs. * - * However, integrity or privacy both trailer mbufs to the args, - * which means we have to do some work to restore the arg mbuf - * chain to its previous state in case we need to retransmit. + * However, integrity or privacy the original mbufs have mbufs + * prepended and appended to, which means we have to do some work to + * restore the arg mbuf chain to its previous state in case we need to + * retransmit. * * The location and length of the args is marked by two fields * in the request structure: r_gss_argoff and r_gss_arglen, @@ -1090,7 +1263,7 @@ nfs_gss_clnt_args_restore(struct nfsreq *req) { struct nfs_gss_clnt_ctx *cp = req->r_gss_ctx; struct nfsm_chain mchain, *nmc = &mchain; - int len, error = 0; + int error = 0, merr; if (cp == NULL) return (NFSERR_EAUTH); @@ -1098,50 +1271,24 @@ nfs_gss_clnt_args_restore(struct nfsreq *req) if ((cp->gss_clnt_flags & GSS_CTX_COMPLETE) == 0) return (ENEEDAUTH); + /* Nothing to restore for SVC_NONE */ + if (cp->gss_clnt_service == RPCSEC_GSS_SVC_NONE) + return (0); + nfsm_chain_dissect_init(error, nmc, req->r_mhead); // start at RPC header nfsm_chain_adv(error, nmc, req->r_gss_argoff); // advance to args if (error) return (error); - switch (cp->gss_clnt_service) { - case RPCSEC_GSS_SVC_NONE: - /* nothing to do */ - break; - case RPCSEC_GSS_SVC_INTEGRITY: - /* - * All we have to do here is remove the appended checksum mbufs. 
- * We know that the checksum starts in a new mbuf beyond the end - * of the args. - */ - nfsm_chain_adv(error, nmc, req->r_gss_arglen); // adv to last args mbuf - if (error) - return (error); + if (cp->gss_clnt_service == RPCSEC_GSS_SVC_INTEGRITY) + error = rpc_gss_integ_data_restore(cp->gss_clnt_ctx_id, &req->r_mrest, req->r_gss_arglen); + else + error = rpc_gss_priv_data_restore(cp->gss_clnt_ctx_id, &req->r_mrest, req->r_gss_arglen); - mbuf_freem(mbuf_next(nmc->nmc_mcur)); // free the cksum mbuf - error = mbuf_setnext(nmc->nmc_mcur, NULL); - break; - case RPCSEC_GSS_SVC_PRIVACY: - /* - * The args are encrypted along with prepended confounders and seqnum. - * First we decrypt, the confounder, seqnum and args then skip to the - * final mbuf of the args. - * The arglen includes 8 bytes of confounder and 4 bytes of seqnum. - * Finally, we remove between 4 and 8 bytes of encryption padding - * as well as any alignment padding in the trailing mbuf. - */ - len = req->r_gss_arglen; - len += len % 8 > 0 ? 4 : 8; // add DES padding length - nfs_gss_encrypt_chain(cp->gss_clnt_kinfo, nmc, - req->r_gss_argoff, len, DES_DECRYPT); - nfsm_chain_adv(error, nmc, req->r_gss_arglen); - if (error) - return (error); - mbuf_freem(mbuf_next(nmc->nmc_mcur)); // free the pad mbuf - error = mbuf_setnext(nmc->nmc_mcur, NULL); - break; - } + merr = mbuf_setnext(nmc->nmc_mcur, req->r_mrest); /* Should always succeed */ + assert (merr == 0); - return (error); + return (error ? 
error : merr); } /* @@ -1158,11 +1305,13 @@ static int nfs_gss_clnt_ctx_init(struct nfsreq *req, struct nfs_gss_clnt_ctx *cp) { struct nfsmount *nmp = req->r_nmp; + gss_buffer_desc cksum, window; + uint32_t network_seqnum; int client_complete = 0; int server_complete = 0; - u_char cksum1[MAX_DIGEST], cksum2[MAX_DIGEST]; int error = 0; - gss_key_info *ki = cp->gss_clnt_kinfo; + int retrycnt = 0; + uint32_t major; /* Initialize a new client context */ @@ -1181,27 +1330,36 @@ nfs_gss_clnt_ctx_init(struct nfsreq *req, struct nfs_gss_clnt_ctx *cp) req->r_auth == RPCAUTH_KRB5I ? RPCSEC_GSS_SVC_INTEGRITY : req->r_auth == RPCAUTH_KRB5P ? RPCSEC_GSS_SVC_PRIVACY : 0; - cp->gss_clnt_gssd_flags = (nfs_single_des ? GSSD_NFS_1DES : 0); /* * Now loop around alternating gss_init_sec_context and * gss_accept_sec_context upcalls to the gssd on the client * and server side until the context is complete - or fails. */ for (;;) { - retry: /* Upcall to the gss_init_sec_context in the gssd */ - error = nfs_gss_clnt_gssd_upcall(req, cp); + error = nfs_gss_clnt_gssd_upcall(req, cp, retrycnt); if (error) goto nfsmout; if (cp->gss_clnt_major == GSS_S_COMPLETE) { client_complete = 1; + NFS_GSS_DBG("Client complete\n"); if (server_complete) break; } else if (cp->gss_clnt_major != GSS_S_CONTINUE_NEEDED) { - error = NFSERR_EAUTH; - goto nfsmout; + /* + * We may have gotten here because the accept sec context + * from the server failed and sent back a GSS token that + * encapsulates a kerberos error token per RFC 1964/4121 + * with a status of GSS_S_CONTINUE_NEEDED. That caused us + * to loop to the above up call and received the now + * decoded errors. 
+ */ + retrycnt++; + cp->gss_clnt_gssd_flags |= GSSD_RESTART; + NFS_GSS_DBG("Retrying major = %x minor = %d\n", cp->gss_clnt_major, (int)cp->gss_clnt_minor); + goto retry; } /* @@ -1209,26 +1367,38 @@ nfs_gss_clnt_ctx_init(struct nfsreq *req, struct nfs_gss_clnt_ctx *cp) */ error = nfs_gss_clnt_ctx_callserver(req, cp); if (error) { - if (error == ENEEDAUTH && cp->gss_clnt_proc == RPCSEC_GSS_INIT && - (cp->gss_clnt_gssd_flags & (GSSD_RESTART | GSSD_NFS_1DES)) == 0) { - NFS_GSS_DBG("Retrying with single DES for req %p\n", req); - cp->gss_clnt_gssd_flags = (GSSD_RESTART | GSSD_NFS_1DES); - if (cp->gss_clnt_token) - FREE(cp->gss_clnt_token, M_TEMP); - cp->gss_clnt_token = NULL; - cp->gss_clnt_tokenlen = 0; + if (error == ENEEDAUTH && + (cp->gss_clnt_proc == RPCSEC_GSS_INIT || + cp->gss_clnt_proc == RPCSEC_GSS_CONTINUE_INIT)) { + /* + * We got here because the server had a problem + * trying to establish a context and sent that there + * was a context problem at the rpc sec layer. Perhaps + * gss_accept_sec_context succeeded in user space, + * but the kernel could not handle the etype + * to generate the mic for the verifier of the rpc_sec + * window size. + */ + retrycnt++; + cp->gss_clnt_gssd_flags |= GSSD_RESTART; + NFS_GSS_DBG("Retrying major = %x minor = %d\n", cp->gss_clnt_major, (int)cp->gss_clnt_minor); goto retry; } - // Reset flags, if error = ENEEDAUTH we will try 3des again - cp->gss_clnt_gssd_flags = 0; goto nfsmout; } if (cp->gss_clnt_major == GSS_S_COMPLETE) { + NFS_GSS_DBG("Server complete\n"); server_complete = 1; if (client_complete) break; + } else if (cp->gss_clnt_major == GSS_S_CONTINUE_NEEDED) { + cp->gss_clnt_proc = RPCSEC_GSS_CONTINUE_INIT; + } else { + /* Server didn't like us. 
Try something else */ + retrycnt++; + cp->gss_clnt_gssd_flags |= GSSD_RESTART; + NFS_GSS_DBG("Retrying major = %x minor = %d\n", cp->gss_clnt_major, (int)cp->gss_clnt_minor); } - cp->gss_clnt_proc = RPCSEC_GSS_CONTINUE_INIT; } /* @@ -1239,21 +1409,17 @@ nfs_gss_clnt_ctx_init(struct nfsreq *req, struct nfs_gss_clnt_ctx *cp) lck_mtx_unlock(cp->gss_clnt_mtx); cp->gss_clnt_proc = RPCSEC_GSS_DATA; - /* - * Compute checksum of the server's window - */ - nfs_gss_cksum_rep(ki, cp->gss_clnt_seqwin, cksum1); - - /* - * and see if it matches the one in the - * verifier the server returned. - */ - error = nfs_gss_token_get(ki, ALG_MIC(ki), cp->gss_clnt_verf, 0, - NULL, cksum2); + network_seqnum = htonl(cp->gss_clnt_seqwin); + window.length = sizeof (cp->gss_clnt_seqwin); + window.value = &network_seqnum; + cksum.value = cp->gss_clnt_verf; + cksum.length = cp->gss_clnt_verflen; + major = gss_krb5_verify_mic((uint32_t *)&error, cp->gss_clnt_ctx_id, &window, &cksum, NULL); + cp->gss_clnt_verflen = 0; FREE(cp->gss_clnt_verf, M_TEMP); cp->gss_clnt_verf = NULL; - - if (error || bcmp(cksum1, cksum2, HASHLEN(ki)) != 0) { + if (major != GSS_S_COMPLETE) { + printf("%s: could not verify window\n", __func__); error = NFSERR_EAUTH; goto nfsmout; } @@ -1274,14 +1440,17 @@ nfs_gss_clnt_ctx_init(struct nfsreq *req, struct nfs_gss_clnt_ctx *cp) nfsm_rndup((cp->gss_clnt_seqwin + 7) / 8), M_TEMP, M_WAITOK|M_ZERO); if (cp->gss_clnt_seqbits == NULL) error = NFSERR_EAUTH; + nfsmout: /* * If the error is ENEEDAUTH we're not done, so no need * to wake up other threads again. This thread will retry in * the find or renew routines. */ - if (error == ENEEDAUTH) + if (error == ENEEDAUTH) { + NFS_GSS_DBG("Returning ENEEDAUTH\n"); return (error); + } /* * If there's an error, just mark it as invalid. 
@@ -1302,6 +1471,7 @@ nfs_gss_clnt_ctx_init(struct nfsreq *req, struct nfs_gss_clnt_ctx *cp) } lck_mtx_unlock(cp->gss_clnt_mtx); + NFS_GSS_DBG("Returning error = %d\n", error); return (error); } @@ -1632,11 +1802,12 @@ nfs_gss_clnt_log_error(struct nfsreq *req, struct nfs_gss_clnt_ctx *cp, uint32_t * must have access to the user's credential cache. */ static int -nfs_gss_clnt_gssd_upcall(struct nfsreq *req, struct nfs_gss_clnt_ctx *cp) +nfs_gss_clnt_gssd_upcall(struct nfsreq *req, struct nfs_gss_clnt_ctx *cp, uint32_t retrycnt) { kern_return_t kr; - gssd_byte_buffer okey = NULL; - uint32_t skeylen = 0; + gssd_byte_buffer octx = NULL; + uint32_t lucidlen = 0; + void *lucid_ctx_buffer; int retry_cnt = 0; vm_map_copy_t itoken = NULL; gssd_byte_buffer otoken = NULL; @@ -1649,24 +1820,59 @@ nfs_gss_clnt_gssd_upcall(struct nfsreq *req, struct nfs_gss_clnt_ctx *cp) vm_map_copy_t svcname = NULL; char display_name[MAX_DISPLAY_STR] = ""; uint32_t ret_flags; - uint32_t nfs_1des = (cp->gss_clnt_gssd_flags & GSSD_NFS_1DES); - struct nfsmount *nmp; + struct nfsmount *nmp = req->r_nmp; uint32_t major = cp->gss_clnt_major, minor = cp->gss_clnt_minor; - - /* - * NFS currently only supports default principals or - * principals based on the uid of the caller, unless - * the principal to use for the mounting cred was specified - * in the mount argmuments. 
If the realm to use was specified - * then will send that up as the principal since the realm is + uint32_t selected = (uint32_t)-1; + struct nfs_etype etype; + + if (nmp == NULL || vfs_isforce(nmp->nm_mountp) || (nmp->nm_state & (NFSSTA_FORCE | NFSSTA_DEAD))) + return (ENXIO); + + if (cp->gss_clnt_gssd_flags & GSSD_RESTART) { + if (cp->gss_clnt_token) + FREE(cp->gss_clnt_token, M_TEMP); + cp->gss_clnt_token = NULL; + cp->gss_clnt_tokenlen = 0; + cp->gss_clnt_proc = RPCSEC_GSS_INIT; + } + + NFS_GSS_DBG("Retrycnt = %d nm_etype.count = %d\n", retrycnt, nmp->nm_etype.count); + if (retrycnt >= nmp->nm_etype.count) + return (EACCES); + + /* Copy the mount etypes to an order set of etypes to try */ + etype = nmp->nm_etype; + + /* + * If we've already selected an etype, lets put that first in our + * array of etypes to try, since overwhelmingly, that is likely + * to be the etype we want. + */ + if (etype.selected < etype.count) { + etype.etypes[0] = nmp->nm_etype.etypes[etype.selected]; + for (uint32_t i = 0; i < etype.selected; i++) + etype.etypes[i+1] = nmp->nm_etype.etypes[i]; + for (uint32_t i = etype.selected + 1; i < etype.count; i++) + etype.etypes[i] = nmp->nm_etype.etypes[i]; + } + + /* Remove the ones we've already have tried */ + for (uint32_t i = retrycnt; i < etype.count; i++) + etype.etypes[i - retrycnt] = etype.etypes[i]; + etype.count = etype.count - retrycnt; + + NFS_GSS_DBG("etype count = %d preferred etype = %d\n", etype.count, etype.etypes[0]); + + /* + * NFS currently only supports default principals or + * principals based on the uid of the caller, unless + * the principal to use for the mounting cred was specified + * in the mount argmuments. If the realm to use was specified + * then will send that up as the principal since the realm is * preceed by an "@" gssd that will try and select the default * principal for that realm. 
*/ - nmp = req->r_nmp; - if (nmp == NULL || vfs_isforce(nmp->nm_mountp) || (nmp->nm_state & (NFSSTA_FORCE | NFSSTA_DEAD))) - return (ENXIO); - if (cp->gss_clnt_principal && cp->gss_clnt_prinlen) { principal = cp->gss_clnt_principal; plen = cp->gss_clnt_prinlen; @@ -1698,8 +1904,11 @@ nfs_gss_clnt_gssd_upcall(struct nfsreq *req, struct nfs_gss_clnt_ctx *cp) if (cp->gss_clnt_tokenlen) nfs_gss_mach_alloc_buffer(cp->gss_clnt_token, cp->gss_clnt_tokenlen, &itoken); + /* Always want to export the lucid context */ + cp->gss_clnt_gssd_flags |= GSSD_LUCID_CONTEXT; + retry: - kr = mach_gss_init_sec_context_v2( + kr = mach_gss_init_sec_context_v3( cp->gss_clnt_mport, GSSD_KRB5_MECH, (gssd_byte_buffer) itoken, (mach_msg_type_number_t) cp->gss_clnt_tokenlen, @@ -1709,19 +1918,24 @@ nfs_gss_clnt_gssd_upcall(struct nfsreq *req, struct nfs_gss_clnt_ctx *cp) cp->gss_clnt_svcnt, (gssd_byte_buffer)svcname, (mach_msg_type_number_t) cp->gss_clnt_svcnamlen, GSSD_MUTUAL_FLAG, + (gssd_etype_list)etype.etypes, (mach_msg_type_number_t)etype.count, &cp->gss_clnt_gssd_flags, &cp->gss_clnt_context, &cp->gss_clnt_cred_handle, &ret_flags, - &okey, (mach_msg_type_number_t *) &skeylen, + &octx, (mach_msg_type_number_t *) &lucidlen, &otoken, &otokenlen, cp->gss_clnt_display ? NULL : display_name, &cp->gss_clnt_major, &cp->gss_clnt_minor); - /* Should be cleared and set in gssd ? 
*/ + /* Clear the RESTART flag */ cp->gss_clnt_gssd_flags &= ~GSSD_RESTART; - cp->gss_clnt_gssd_flags |= nfs_1des; + if (cp->gss_clnt_major != GSS_S_CONTINUE_NEEDED) { + /* We're done with the gssd handles */ + cp->gss_clnt_context = 0; + cp->gss_clnt_cred_handle = 0; + } if (kr != KERN_SUCCESS) { printf("nfs_gss_clnt_gssd_upcall: mach_gss_init_sec_context failed: %x (%d)\n", kr, kr); @@ -1744,7 +1958,7 @@ nfs_gss_clnt_gssd_upcall(struct nfsreq *req, struct nfs_gss_clnt_ctx *cp) if (cp->gss_clnt_display == NULL && *display_name != '\0') { int dlen = strnlen(display_name, MAX_DISPLAY_STR) + 1; /* Add extra byte to include '\0' */ - + if (dlen < MAX_DISPLAY_STR) { MALLOC(cp->gss_clnt_display, char *, dlen, M_TEMP, M_WAITOK); if (cp->gss_clnt_display == NULL) @@ -1762,26 +1976,37 @@ nfs_gss_clnt_gssd_upcall(struct nfsreq *req, struct nfs_gss_clnt_ctx *cp) */ if (cp->gss_clnt_major != GSS_S_COMPLETE && cp->gss_clnt_major != GSS_S_CONTINUE_NEEDED) { + NFS_GSS_DBG("Up call returned error\n"); nfs_gss_clnt_log_error(req, cp, major, minor); } - if (skeylen > 0) { - if (skeylen != SKEYLEN && skeylen != SKEYLEN3) { - printf("nfs_gss_clnt_gssd_upcall: bad key length (%d)\n", skeylen); - vm_map_copy_discard((vm_map_copy_t) okey); + if (lucidlen > 0) { + if (lucidlen > MAX_LUCIDLEN) { + printf("nfs_gss_clnt_gssd_upcall: bad context length (%d)\n", lucidlen); + vm_map_copy_discard((vm_map_copy_t) octx); vm_map_copy_discard((vm_map_copy_t) otoken); goto out; } - error = nfs_gss_mach_vmcopyout((vm_map_copy_t) okey, skeylen, - cp->gss_clnt_kinfo->skey); + MALLOC(lucid_ctx_buffer, void *, lucidlen, M_TEMP, M_WAITOK | M_ZERO); + error = nfs_gss_mach_vmcopyout((vm_map_copy_t) octx, lucidlen, lucid_ctx_buffer); if (error) { vm_map_copy_discard((vm_map_copy_t) otoken); goto out; } - - error = gss_key_init(cp->gss_clnt_kinfo, skeylen); - if (error) + + if (cp->gss_clnt_ctx_id) + gss_krb5_destroy_context(cp->gss_clnt_ctx_id); + cp->gss_clnt_ctx_id = gss_krb5_make_context(lucid_ctx_buffer, 
lucidlen); + if (cp->gss_clnt_ctx_id == NULL) { + printf("Failed to make context from lucid_ctx_buffer\n"); goto out; + } + for (uint32_t i = 0; i < nmp->nm_etype.count; i++) { + if (nmp->nm_etype.etypes[i] == cp->gss_clnt_ctx_id->gss_cryptor.etype) { + selected = i; + break; + } + } } /* Free context token used as input */ @@ -1800,6 +2025,7 @@ nfs_gss_clnt_gssd_upcall(struct nfsreq *req, struct nfs_gss_clnt_ctx *cp) } error = nfs_gss_mach_vmcopyout((vm_map_copy_t) otoken, otokenlen, cp->gss_clnt_token); if (error) { + printf("Could not copyout gss token\n"); FREE(cp->gss_clnt_token, M_TEMP); cp->gss_clnt_token = NULL; return (NFSERR_EAUTH); @@ -1807,6 +2033,11 @@ nfs_gss_clnt_gssd_upcall(struct nfsreq *req, struct nfs_gss_clnt_ctx *cp) cp->gss_clnt_tokenlen = otokenlen; } + if (selected != (uint32_t)-1) { + nmp->nm_etype.selected = selected; + NFS_GSS_DBG("etype selected = %d\n", nmp->nm_etype.etypes[selected]); + } + NFS_GSS_DBG("Up call succeeded major = %d\n", cp->gss_clnt_major); return (0); out: @@ -1815,6 +2046,7 @@ nfs_gss_clnt_gssd_upcall(struct nfsreq *req, struct nfs_gss_clnt_ctx *cp) cp->gss_clnt_token = NULL; cp->gss_clnt_tokenlen = 0; + NFS_GSS_DBG("Up call returned NFSERR_EAUTH"); return (NFSERR_EAUTH); } @@ -1914,9 +2146,9 @@ nfs_gss_clnt_ctx_unref(struct nfsreq *req) if (cp->gss_clnt_refcnt == 0) { if ((cp->gss_clnt_flags & GSS_CTX_INVAL) && - cp->gss_clnt_kinfo) { - FREE(cp->gss_clnt_kinfo, M_TEMP); - cp->gss_clnt_kinfo = NULL; + cp->gss_clnt_ctx_id) { + gss_krb5_destroy_context(cp->gss_clnt_ctx_id); + cp->gss_clnt_ctx_id = NULL; } if (cp->gss_clnt_flags & GSS_CTX_DESTROY) { destroy = 1; @@ -1969,8 +2201,10 @@ nfs_gss_clnt_ctx_neg_cache_reap(struct nfsmount *nmp) struct timeval now; int reaped = 0; - NFS_GSS_DBG("Reaping contexts ncentries = %d\n", nmp->nm_ncentries); /* Try and reap old, unreferenced, expired contexts */ + microuptime(&now); + + NFS_GSS_DBG("Reaping contexts ncentries = %d\n", nmp->nm_ncentries); TAILQ_FOREACH_SAFE(cp, 
&nmp->nm_gsscl, gss_clnt_entries, tcp) { int destroy = 0; @@ -2055,8 +2289,7 @@ nfs_gss_clnt_ctx_clean(struct nfs_gss_clnt_ctx *cp) cp->gss_clnt_token = NULL; } cp->gss_clnt_tokenlen = 0; - if (cp->gss_clnt_kinfo) - bzero(cp->gss_clnt_kinfo, sizeof(gss_key_info)); + /* XXX gss_clnt_ctx_id ??? */ /* * Preserve: * gss_clnt_gssd_flags @@ -2073,7 +2306,7 @@ nfs_gss_clnt_ctx_clean(struct nfs_gss_clnt_ctx *cp) * context. */ static int -nfs_gss_clnt_ctx_copy(struct nfs_gss_clnt_ctx *scp, struct nfs_gss_clnt_ctx **dcpp, gss_key_info *ki) +nfs_gss_clnt_ctx_copy(struct nfs_gss_clnt_ctx *scp, struct nfs_gss_clnt_ctx **dcpp) { struct nfs_gss_clnt_ctx *dcp; @@ -2082,16 +2315,6 @@ nfs_gss_clnt_ctx_copy(struct nfs_gss_clnt_ctx *scp, struct nfs_gss_clnt_ctx **dc if (dcp == NULL) return (ENOMEM); bzero(dcp, sizeof (struct nfs_gss_clnt_ctx)); - if (ki == NULL) { - MALLOC(dcp->gss_clnt_kinfo, gss_key_info *, sizeof (gss_key_info), M_TEMP, M_WAITOK); - if (dcp->gss_clnt_kinfo == NULL) { - FREE(dcp, M_TEMP); - return (ENOMEM); - } - } else { - dcp->gss_clnt_kinfo = ki; - } - bzero(dcp->gss_clnt_kinfo, sizeof (gss_key_info)); dcp->gss_clnt_mtx = lck_mtx_alloc_init(nfs_gss_clnt_grp, LCK_ATTR_NULL); dcp->gss_clnt_cred = scp->gss_clnt_cred; kauth_cred_ref(dcp->gss_clnt_cred); @@ -2100,7 +2323,6 @@ nfs_gss_clnt_ctx_copy(struct nfs_gss_clnt_ctx *scp, struct nfs_gss_clnt_ctx **dc if (scp->gss_clnt_principal) { MALLOC(dcp->gss_clnt_principal, uint8_t *, dcp->gss_clnt_prinlen, M_TEMP, M_WAITOK | M_ZERO); if (dcp->gss_clnt_principal == NULL) { - FREE(dcp->gss_clnt_kinfo, M_TEMP); FREE(dcp, M_TEMP); return (ENOMEM); } @@ -2109,7 +2331,7 @@ nfs_gss_clnt_ctx_copy(struct nfs_gss_clnt_ctx *scp, struct nfs_gss_clnt_ctx **dc /* Note we don't preserve the display name, that will be set by a successful up call */ dcp->gss_clnt_service = scp->gss_clnt_service; dcp->gss_clnt_mport = host_copy_special_port(scp->gss_clnt_mport); - /* gss_clnt_kinfo allocated above */ + dcp->gss_clnt_ctx_id = NULL; /* Will be 
set from successful upcall */ dcp->gss_clnt_gssd_flags = scp->gss_clnt_gssd_flags; dcp->gss_clnt_major = scp->gss_clnt_major; dcp->gss_clnt_minor = scp->gss_clnt_minor; @@ -2149,9 +2371,9 @@ nfs_gss_clnt_ctx_destroy(struct nfs_gss_clnt_ctx *cp) FREE(cp->gss_clnt_display, M_TEMP); cp->gss_clnt_display = NULL; } - if (cp->gss_clnt_kinfo) { - FREE(cp->gss_clnt_kinfo, M_TEMP); - cp->gss_clnt_kinfo = NULL; + if (cp->gss_clnt_ctx_id) { + gss_krb5_destroy_context(cp->gss_clnt_ctx_id); + cp->gss_clnt_ctx_id = NULL; } nfs_gss_clnt_ctx_clean(cp); @@ -2195,7 +2417,15 @@ nfs_gss_clnt_ctx_renew(struct nfsreq *req) } lck_mtx_unlock(cp->gss_clnt_mtx); - error = nfs_gss_clnt_ctx_copy(cp, &ncp, NULL); + if (cp->gss_clnt_proc == RPCSEC_GSS_DESTROY) + return (EACCES); /* Destroying a context is best effort. Don't renew. */ + /* + * If we're setting up a context let nfs_gss_clnt_ctx_init know this is not working + * and to try some other etype. + */ + if (cp->gss_clnt_proc != RPCSEC_GSS_DATA) + return (ENEEDAUTH); + error = nfs_gss_clnt_ctx_copy(cp, &ncp); NFS_GSS_DBG("Renewing context %s\n", NFS_GSS_CTX(req, ncp)); nfs_gss_clnt_ctx_unref(req); if (error) @@ -2383,9 +2613,15 @@ nfs_gss_clnt_ctx_get_principal(struct nfsmount *nmp, vfs_context_t ctx, int error = 0; struct nfs_gss_clnt_ctx *cp; kauth_cred_t cred = vfs_context_ucred(ctx); - const char *princ; + const char *princ = NULL; char CTXBUF[NFS_CTXBUFSZ]; + /* Make sure the the members of the struct user_nfs_gss_principal are initialized */ + p->nametype = GSSD_STRING_NAME; + p->principal = USER_ADDR_NULL; + p->princlen = 0; + p->flags = 0; + req.r_nmp = nmp; lck_mtx_lock(&nmp->nm_lock); TAILQ_FOREACH(cp, &nmp->nm_gsscl, gss_clnt_entries) { @@ -2408,30 +2644,40 @@ nfs_gss_clnt_ctx_get_principal(struct nfsmount *nmp, vfs_context_t ctx, out: if (cp == NULL) { lck_mtx_unlock(&nmp->nm_lock); - p->princlen = 0; - p->principal = USER_ADDR_NULL; - p->nametype = GSSD_STRING_NAME; - p->flags |= NFS_IOC_NO_CRED_FLAG; + p->flags |= 
NFS_IOC_NO_CRED_FLAG; /* No credentials, valid or invalid on this mount */ NFS_GSS_DBG("No context found for session %d by uid %d\n", kauth_cred_getasid(cred), kauth_cred_getuid(cred)); return (0); } - princ = cp->gss_clnt_principal ? (char *)cp->gss_clnt_principal : cp->gss_clnt_display; - p->princlen = cp->gss_clnt_principal ? cp->gss_clnt_prinlen : - (cp->gss_clnt_display ? strlen(cp->gss_clnt_display) : 0); - p->nametype = cp->gss_clnt_prinnt; + /* Indicate if the cred is INVALID */ + if (cp->gss_clnt_flags & GSS_CTX_INVAL) + p->flags |= NFS_IOC_INVALID_CRED_FLAG; + + /* We have set a principal on the mount */ + if (cp->gss_clnt_principal) { + princ = (char *)cp->gss_clnt_principal; + p->princlen = cp->gss_clnt_prinlen; + p->nametype = cp->gss_clnt_prinnt; + } else if (cp->gss_clnt_display) { + /* We have a successful use the the default credential */ + princ = cp->gss_clnt_display; + p->princlen = strlen(cp->gss_clnt_display); + } + + /* + * If neither of the above is true we have an invalid default credential + * So from above p->principal is USER_ADDR_NULL and princ is NULL + */ + if (princ) { char *pp; MALLOC(pp, char *, p->princlen, M_TEMP, M_WAITOK); - if (pp) { - bcopy(princ, pp, p->princlen); - p->principal = CAST_USER_ADDR_T(pp); - } - else - error = ENOMEM; + bcopy(princ, pp, p->princlen); + p->principal = CAST_USER_ADDR_T(pp); } + lck_mtx_unlock(&nmp->nm_lock); req.r_gss_ctx = cp; @@ -2615,17 +2861,18 @@ nfs_gss_svc_cred_get(struct nfsrv_descript *nd, struct nfsm_chain *nmc) { uint32_t vers, proc, seqnum, service; uint32_t handle, handle_len; + uint32_t major; struct nfs_gss_svc_ctx *cp = NULL; - uint32_t flavor = 0, verflen = 0; + uint32_t flavor = 0, header_len; int error = 0; - uint32_t arglen, start, toklen, cksumlen; - u_char tokbuf[KRB5_SZ_TOKMAX(MAX_DIGEST)]; - u_char cksum1[MAX_DIGEST], cksum2[MAX_DIGEST]; + uint32_t arglen, start; + size_t argsize; + gss_buffer_desc cksum; struct nfsm_chain nmc_tmp; - gss_key_info *ki; - + mbuf_t reply_mbuf, 
prev_mbuf, pad_mbuf; + vers = proc = seqnum = service = handle_len = 0; - arglen = cksumlen = 0; + arglen = 0; nfsm_chain_get_32(error, nmc, vers); if (vers != RPCSEC_GSS_VERS_1) { @@ -2683,7 +2930,6 @@ nfs_gss_svc_cred_get(struct nfsrv_descript *nd, struct nfsm_chain *nmc) cp->gss_svc_mtx = lck_mtx_alloc_init(nfs_gss_svc_grp, LCK_ATTR_NULL); cp->gss_svc_refcnt = 1; } else { - /* * Use the handle to find the context */ @@ -2702,7 +2948,6 @@ nfs_gss_svc_cred_get(struct nfsrv_descript *nd, struct nfsm_chain *nmc) } cp->gss_svc_proc = proc; - ki = &cp->gss_svc_kinfo; if (proc == RPCSEC_GSS_DATA || proc == RPCSEC_GSS_DESTROY) { struct posix_cred temp_pcred; @@ -2723,9 +2968,6 @@ nfs_gss_svc_cred_get(struct nfsrv_descript *nd, struct nfsm_chain *nmc) goto nfsmout; } - /* Now compute the client's call header checksum */ - nfs_gss_cksum_chain(ki, nmc, ALG_MIC(ki), 0, 0, cksum1); - /* * Validate the verifier. * The verifier contains an encrypted checksum @@ -2734,23 +2976,23 @@ nfs_gss_svc_cred_get(struct nfsrv_descript *nd, struct nfsm_chain *nmc) * checksum and compare it with what came in * the verifier. 
*/ + header_len = nfsm_chain_offset(nmc); nfsm_chain_get_32(error, nmc, flavor); - nfsm_chain_get_32(error, nmc, verflen); + nfsm_chain_get_32(error, nmc, cksum.length); if (error) goto nfsmout; - if (flavor != RPCSEC_GSS || verflen != KRB5_SZ_TOKEN(ki->hash_len)) + if (flavor != RPCSEC_GSS || cksum.length > KRB5_MAX_MIC_SIZE) error = NFSERR_AUTHERR | AUTH_BADVERF; - nfsm_chain_get_opaque(error, nmc, verflen, tokbuf); - if (error) - goto nfsmout; - - /* Get the checksum from the token inside the verifier */ - error = nfs_gss_token_get(ki, ALG_MIC(ki), tokbuf, 1, - NULL, cksum2); + MALLOC(cksum.value, void *, cksum.length, M_TEMP, M_WAITOK); + nfsm_chain_get_opaque(error, nmc, cksum.length, cksum.value); if (error) goto nfsmout; - if (bcmp(cksum1, cksum2, HASHLEN(ki)) != 0) { + /* Now verify the client's call header checksum */ + major = gss_krb5_verify_mic_mbuf((uint32_t *)&error, cp->gss_svc_ctx_id, nmc->nmc_mhead, 0, header_len, &cksum, NULL); + (void)gss_release_buffer(NULL, &cksum); + if (major != GSS_S_COMPLETE) { + printf("Server header: gss_krb5_verify_mic_mbuf failed %d\n", error); error = NFSERR_AUTHERR | RPCSEC_GSS_CTXPROBLEM; goto nfsmout; } @@ -2788,8 +3030,8 @@ nfs_gss_svc_cred_get(struct nfsrv_descript *nd, struct nfsm_chain *nmc) * - length of seq num + call args (4 bytes) * - sequence number (4 bytes) * - call args (variable bytes) - * - length of checksum token (37) - * - checksum of seqnum + call args (37 bytes) + * - length of checksum token + * - checksum of seqnum + call args */ nfsm_chain_get_32(error, nmc, arglen); // length of args if (arglen > NFS_MAXPACKET) { @@ -2797,10 +3039,29 @@ nfs_gss_svc_cred_get(struct nfsrv_descript *nd, struct nfsm_chain *nmc) goto nfsmout; } - /* Compute the checksum over the call args */ + nmc_tmp = *nmc; + nfsm_chain_adv(error, &nmc_tmp, arglen); + nfsm_chain_get_32(error, &nmc_tmp, cksum.length); + MALLOC(cksum.value, void *, cksum.length, M_TEMP, M_WAITOK); + + if (cksum.value == NULL) { + error = EBADRPC; + 
goto nfsmout; + } + nfsm_chain_get_opaque(error, &nmc_tmp, cksum.length, cksum.value); + + /* Verify the checksum over the call args */ start = nfsm_chain_offset(nmc); - nfs_gss_cksum_chain(ki, nmc, ALG_MIC(ki), start, arglen, cksum1); - + + major = gss_krb5_verify_mic_mbuf((uint32_t *)&error, cp->gss_svc_ctx_id, + nmc->nmc_mhead, start, arglen, &cksum, NULL); + FREE(cksum.value, M_TEMP); + if (major != GSS_S_COMPLETE) { + printf("Server args: gss_krb5_verify_mic_mbuf failed %d\n", error); + error = EBADRPC; + goto nfsmout; + } + /* * Get the sequence number prepended to the args * and compare it against the one sent in the @@ -2811,85 +3072,75 @@ nfs_gss_svc_cred_get(struct nfsrv_descript *nd, struct nfsm_chain *nmc) error = EBADRPC; // returns as GARBAGEARGS goto nfsmout; } - - /* - * Advance to the end of the args and - * fetch the checksum computed by the client. - */ - nmc_tmp = *nmc; - arglen -= NFSX_UNSIGNED; // skipped seqnum - nfsm_chain_adv(error, &nmc_tmp, arglen); // skip args - nfsm_chain_get_32(error, &nmc_tmp, cksumlen); // length of checksum - if (cksumlen != KRB5_SZ_TOKEN(ki->hash_len)) { - error = EBADRPC; - goto nfsmout; - } - nfsm_chain_get_opaque(error, &nmc_tmp, cksumlen, tokbuf); - if (error) - goto nfsmout; - error = nfs_gss_token_get(ki, ALG_MIC(ki), tokbuf, 1, - NULL, cksum2); - - /* Verify that the checksums are the same */ - if (error || bcmp(cksum1, cksum2, HASHLEN(ki)) != 0) { - error = EBADRPC; - goto nfsmout; - } break; case RPCSEC_GSS_SVC_PRIVACY: /* * Here's what we expect in the privacy call args: * - * - length of confounder + seq num + token + call args + * - length of wrap token * - wrap token (37-40 bytes) - * - confounder (8 bytes) - * - sequence number (4 bytes) - * - call args (encrypted) */ + prev_mbuf = nmc->nmc_mcur; nfsm_chain_get_32(error, nmc, arglen); // length of args if (arglen > NFS_MAXPACKET) { error = EBADRPC; goto nfsmout; } - - /* Get the token that prepends the encrypted args */ - nfsm_chain_get_opaque(error, 
nmc, KRB5_SZ_TOKMAX(ki->hash_len), tokbuf); - if (error) - goto nfsmout; - error = nfs_gss_token_get(ki, ALG_WRAP(ki), tokbuf, 1, - &toklen, cksum1); + + /* Get the wrap token (current mbuf in the chain starting at the current offset) */ + start = nmc->nmc_ptr - (caddr_t)mbuf_data(nmc->nmc_mcur); + + /* split out the wrap token */ + argsize = arglen; + error = gss_normalize_mbuf(nmc->nmc_mcur, start, &argsize, &reply_mbuf, &pad_mbuf, 0); if (error) goto nfsmout; - nfsm_chain_reverse(nmc, nfsm_pad(toklen)); - - /* decrypt the 8 byte confounder + seqnum + args */ - start = nfsm_chain_offset(nmc); - arglen -= toklen; - nfs_gss_encrypt_chain(ki, nmc, start, arglen, DES_DECRYPT); - - /* Compute a checksum over the sequence number + results */ - nfs_gss_cksum_chain(ki, nmc, ALG_WRAP(ki), start, arglen, cksum2); - - /* Verify that the checksums are the same */ - if (bcmp(cksum1, cksum2, HASHLEN(ki)) != 0) { - error = EBADRPC; + + assert(argsize == arglen); + if (pad_mbuf) { + assert(nfsm_pad(arglen) == mbuf_len(pad_mbuf)); + mbuf_free(pad_mbuf); + } else { + assert(nfsm_pad(arglen) == 0); + } + + major = gss_krb5_unwrap_mbuf((uint32_t *)&error, cp->gss_svc_ctx_id, &reply_mbuf, 0, arglen, NULL, NULL); + if (major != GSS_S_COMPLETE) { + printf("%s: gss_krb5_unwrap_mbuf failes %d\n", __func__, error); goto nfsmout; } + /* Now replace the wrapped arguments with the unwrapped ones */ + mbuf_setnext(prev_mbuf, reply_mbuf); + nmc->nmc_mcur = reply_mbuf; + nmc->nmc_ptr = mbuf_data(reply_mbuf); + nmc->nmc_left = mbuf_len(reply_mbuf); + + /* + * - sequence number (4 bytes) + * - call args + */ + + // nfsm_chain_reverse(nmc, nfsm_pad(toklen)); + /* * Get the sequence number prepended to the args * and compare it against the one sent in the * call credential. 
*/ - nfsm_chain_adv(error, nmc, 8); // skip over the confounder nfsm_chain_get_32(error, nmc, seqnum); if (seqnum != nd->nd_gss_seqnum) { + printf("%s: Sequence number mismatch seqnum = %d nd->nd_gss_seqnum = %d\n", + __func__, seqnum, nd->nd_gss_seqnum); + printmbuf("reply_mbuf", nmc->nmc_mhead, 0, 0); + printf("reply_mbuf %p nmc_head %p\n", reply_mbuf, nmc->nmc_mhead); error = EBADRPC; // returns as GARBAGEARGS goto nfsmout; } break; } } else { + uint32_t verflen; /* * If the proc is RPCSEC_GSS_INIT or RPCSEC_GSS_CONTINUE_INIT * then we expect a null verifier. @@ -2927,14 +3178,11 @@ nfs_gss_svc_verf_put(struct nfsrv_descript *nd, struct nfsm_chain *nmc) { struct nfs_gss_svc_ctx *cp; int error = 0; - u_char tokbuf[KRB5_SZ_TOKEN(MAX_DIGEST)]; - int toklen; - u_char cksum[MAX_DIGEST]; - gss_key_info *ki; - + gss_buffer_desc cksum, seqbuf; + uint32_t network_seqnum; cp = nd->nd_gss_context; - ki = &cp->gss_svc_kinfo; - + uint32_t major; + if (cp->gss_svc_major != GSS_S_COMPLETE) { /* * If the context isn't yet complete @@ -2951,19 +3199,26 @@ nfs_gss_svc_verf_put(struct nfsrv_descript *nd, struct nfsm_chain *nmc) * then return the checksum of the context * window size. */ + seqbuf.length = NFSX_UNSIGNED; if (cp->gss_svc_proc == RPCSEC_GSS_INIT || cp->gss_svc_proc == RPCSEC_GSS_CONTINUE_INIT) - nfs_gss_cksum_rep(ki, cp->gss_svc_seqwin, cksum); + network_seqnum = htonl(cp->gss_svc_seqwin); else - nfs_gss_cksum_rep(ki, nd->nd_gss_seqnum, cksum); + network_seqnum = htonl(nd->nd_gss_seqnum); + seqbuf.value = &network_seqnum; + + major = gss_krb5_get_mic((uint32_t *)&error, cp->gss_svc_ctx_id, 0, &seqbuf, &cksum); + if (major != GSS_S_COMPLETE) + return (error); + /* * Now wrap it in a token and add * the verifier to the reply. 
*/ - toklen = nfs_gss_token_put(ki, ALG_MIC(ki), tokbuf, 0, 0, cksum); nfsm_chain_add_32(error, nmc, RPCSEC_GSS); - nfsm_chain_add_32(error, nmc, toklen); - nfsm_chain_add_opaque(error, nmc, tokbuf, toklen); + nfsm_chain_add_32(error, nmc, cksum.length); + nfsm_chain_add_opaque(error, nmc, cksum.value, cksum.length); + gss_release_buffer(NULL, &cksum); return (error); } @@ -2991,16 +3246,9 @@ nfs_gss_svc_prepare_reply(struct nfsrv_descript *nd, struct nfsm_chain *nmc) /* Nothing to do */ break; case RPCAUTH_KRB5I: - nd->nd_gss_mb = nmc->nmc_mcur; // record current mbuf - nfsm_chain_finish_mbuf(error, nmc); // split the chain here - nfsm_chain_add_32(error, nmc, nd->nd_gss_seqnum); // req sequence number - break; case RPCAUTH_KRB5P: nd->nd_gss_mb = nmc->nmc_mcur; // record current mbuf nfsm_chain_finish_mbuf(error, nmc); // split the chain here - nfsm_chain_add_32(error, nmc, random()); // confounder bytes 1-4 - nfsm_chain_add_32(error, nmc, random()); // confounder bytes 5-8 - nfsm_chain_add_32(error, nmc, nd->nd_gss_seqnum); // req sequence number break; } @@ -3011,20 +3259,15 @@ nfs_gss_svc_prepare_reply(struct nfsrv_descript *nd, struct nfsm_chain *nmc) * The results are checksummed or encrypted for return to the client */ int -nfs_gss_svc_protect_reply(struct nfsrv_descript *nd, mbuf_t mrep) +nfs_gss_svc_protect_reply(struct nfsrv_descript *nd, mbuf_t mrep __unused) { struct nfs_gss_svc_ctx *cp = nd->nd_gss_context; struct nfsm_chain nmrep_res, *nmc_res = &nmrep_res; - struct nfsm_chain nmrep_pre, *nmc_pre = &nmrep_pre; mbuf_t mb, results; uint32_t reslen; - u_char tokbuf[KRB5_SZ_TOKMAX(MAX_DIGEST)]; - int pad, toklen; - u_char cksum[MAX_DIGEST]; int error = 0; - gss_key_info *ki = &cp->gss_svc_kinfo; - /* + /* XXX * Using a reference to the mbuf where we previously split the reply * mbuf chain, we split the mbuf chain argument into two mbuf chains, * one that allows us to prepend a length field or token, (nmc_pre) @@ -3032,69 +3275,25 @@ 
nfs_gss_svc_protect_reply(struct nfsrv_descript *nd, mbuf_t mrep) * checksum and/or encrypt. When we're done, we join the chains back * together. */ - nfs_gss_nfsm_chain(nmc_res, mrep); // set up the results chain + mb = nd->nd_gss_mb; // the mbuf where we split results = mbuf_next(mb); // first mbuf in the results - reslen = nfs_gss_mchain_length(results); // length of results error = mbuf_setnext(mb, NULL); // disconnect the chains if (error) return (error); - nfs_gss_nfsm_chain(nmc_pre, mb); // set up the prepend chain + nfs_gss_nfsm_chain(nmc_res, mb); // set up the prepend chain + nfsm_chain_build_done(error, nmc_res); + if (error) + return (error); if (nd->nd_sec == RPCAUTH_KRB5I) { - nfsm_chain_add_32(error, nmc_pre, reslen); - nfsm_chain_build_done(error, nmc_pre); - if (error) - return (error); - nfs_gss_append_chain(nmc_pre, results); // Append the results mbufs - - /* Now compute the checksum over the results data */ - nfs_gss_cksum_mchain(ki, results, ALG_MIC(ki), 0, reslen, cksum); - - /* Put it into a token and append to the request */ - toklen = nfs_gss_token_put(ki, ALG_MIC(ki), tokbuf, 0, 0, cksum); - nfsm_chain_add_32(error, nmc_res, toklen); - nfsm_chain_add_opaque(error, nmc_res, tokbuf, toklen); - nfsm_chain_build_done(error, nmc_res); + error = rpc_gss_integ_data_create(cp->gss_svc_ctx_id, &results, nd->nd_gss_seqnum, &reslen); } else { /* RPCAUTH_KRB5P */ - /* - * Append a pad trailer - per RFC 1964 section 1.2.2.3 - * Since XDR data is always 32-bit aligned, it - * needs to be padded either by 4 bytes or 8 bytes. 
- */ - if (reslen % 8 > 0) { - nfsm_chain_add_32(error, nmc_res, 0x04040404); - reslen += NFSX_UNSIGNED; - } else { - nfsm_chain_add_32(error, nmc_res, 0x08080808); - nfsm_chain_add_32(error, nmc_res, 0x08080808); - reslen += 2 * NFSX_UNSIGNED; - } - nfsm_chain_build_done(error, nmc_res); - - /* Now compute the checksum over the results data */ - nfs_gss_cksum_mchain(ki, results, ALG_WRAP(ki), 0, reslen, cksum); - - /* Put it into a token and insert in the reply */ - toklen = nfs_gss_token_put(ki, ALG_WRAP(ki), tokbuf, 0, reslen, cksum); - nfsm_chain_add_32(error, nmc_pre, toklen + reslen); - nfsm_chain_add_opaque_nopad(error, nmc_pre, tokbuf, toklen); - nfsm_chain_build_done(error, nmc_pre); - if (error) - return (error); - nfs_gss_append_chain(nmc_pre, results); // Append the results mbufs - - /* Encrypt the confounder + seqnum + results */ - nfs_gss_encrypt_mchain(ki, results, 0, reslen, DES_ENCRYPT); - - /* Add null XDR pad if the ASN.1 token misaligned the data */ - pad = nfsm_pad(toklen + reslen); - if (pad > 0) { - nfsm_chain_add_opaque_nopad(error, nmc_pre, iv0, pad); - nfsm_chain_build_done(error, nmc_pre); - } + error = rpc_gss_priv_data_create(cp->gss_svc_ctx_id, &results, nd->nd_gss_seqnum, &reslen); } + nfs_gss_append_chain(nmc_res, results); // Append the results mbufs + nfsm_chain_build_done(error, nmc_res); return (error); } @@ -3258,8 +3457,9 @@ nfs_gss_svc_gssd_upcall(struct nfs_gss_svc_ctx *cp) kern_return_t kr; mach_port_t mp; int retry_cnt = 0; - gssd_byte_buffer okey = NULL; - uint32_t skeylen = 0; + gssd_byte_buffer octx = NULL; + uint32_t lucidlen = 0; + void *lucid_ctx_buffer; uint32_t ret_flags; vm_map_copy_t itoken = NULL; gssd_byte_buffer otoken = NULL; @@ -3281,6 +3481,7 @@ nfs_gss_svc_gssd_upcall(struct nfs_gss_svc_ctx *cp) nfs_gss_mach_alloc_buffer(cp->gss_svc_token, cp->gss_svc_tokenlen, &itoken); retry: + printf("Calling mach_gss_accept_sec_context\n"); kr = mach_gss_accept_sec_context( mp, (gssd_byte_buffer) itoken, 
(mach_msg_type_number_t) cp->gss_svc_tokenlen, @@ -3292,11 +3493,12 @@ nfs_gss_svc_gssd_upcall(struct nfs_gss_svc_ctx *cp) &cp->gss_svc_uid, cp->gss_svc_gids, &cp->gss_svc_ngroups, - &okey, (mach_msg_type_number_t *) &skeylen, + &octx, (mach_msg_type_number_t *) &lucidlen, &otoken, &otokenlen, &cp->gss_svc_major, &cp->gss_svc_minor); + printf("mach_gss_accept_sec_context returned %d\n", kr); if (kr != KERN_SUCCESS) { printf("nfs_gss_svc_gssd_upcall failed: %x (%d)\n", kr, kr); if (kr == MIG_SERVER_DIED && cp->gss_svc_context == 0 && @@ -3311,22 +3513,27 @@ nfs_gss_svc_gssd_upcall(struct nfs_gss_svc_ctx *cp) host_release_special_port(mp); - if (skeylen > 0) { - if (skeylen != SKEYLEN && skeylen != SKEYLEN3) { - printf("nfs_gss_svc_gssd_upcall: bad key length (%d)\n", skeylen); - vm_map_copy_discard((vm_map_copy_t) okey); + if (lucidlen > 0) { + if (lucidlen > MAX_LUCIDLEN) { + printf("nfs_gss_svc_gssd_upcall: bad context length (%d)\n", lucidlen); + vm_map_copy_discard((vm_map_copy_t) octx); vm_map_copy_discard((vm_map_copy_t) otoken); goto out; } - error = nfs_gss_mach_vmcopyout((vm_map_copy_t) okey, skeylen, cp->gss_svc_kinfo.skey); + MALLOC(lucid_ctx_buffer, void *, lucidlen, M_TEMP, M_WAITOK | M_ZERO); + error = nfs_gss_mach_vmcopyout((vm_map_copy_t) octx, lucidlen, lucid_ctx_buffer); if (error) { vm_map_copy_discard((vm_map_copy_t) otoken); + FREE(lucid_ctx_buffer, M_TEMP); goto out; } - error = gss_key_init(&cp->gss_svc_kinfo, skeylen); - if (error) + if (cp->gss_svc_ctx_id) + gss_krb5_destroy_context(cp->gss_svc_ctx_id); + cp->gss_svc_ctx_id = gss_krb5_make_context(lucid_ctx_buffer, lucidlen); + if (cp->gss_svc_ctx_id == NULL) { + printf("Failed to make context from lucid_ctx_buffer\n"); goto out; - + } } /* Free context token used as input */ @@ -3585,221 +3792,6 @@ nfs_gss_mach_vmcopyout(vm_map_copy_t in, uint32_t len, u_char *out) return (0); } -/* - * Encode an ASN.1 token to be wrapped in an RPCSEC_GSS verifier. 
- * Returns the size of the token, since it contains a variable - * length DER encoded size field. - */ -static int -nfs_gss_token_put( - gss_key_info *ki, - u_char *alg, - u_char *p, - int initiator, - int datalen, - u_char *cksum) -{ - static uint32_t seqnum = 0; - u_char *psave = p; - u_char plain[8]; - int toklen, i; - - /* - * Fill in the token header: 2 octets. - * This is 0x06 - an ASN.1 tag for APPLICATION, 0, SEQUENCE - * followed by the length of the token: 35 + 0 octets for a - * MIC token, or 35 + encrypted octets for a wrap token; - */ - *p++ = 0x060; - toklen = KRB5_SZ_MECH + KRB5_SZ_ALG + KRB5_SZ_SEQ + HASHLEN(ki); - nfs_gss_der_length_put(&p, toklen + datalen); - - /* - * Fill in the DER encoded mech OID for Kerberos v5. - * This represents the Kerberos OID 1.2.840.113554.1.2.2 - * described in RFC 2623, section 4.2 - */ - bcopy(krb5_mech, p, sizeof(krb5_mech)); - p += sizeof(krb5_mech); - - /* - * Now at the token described in RFC 1964, section 1.2.1 - * Fill in the token ID, integrity algorithm indicator, - * for DES MAC MD5, and four filler octets. - * The alg string encodes the bytes to represent either - * a MIC token or a WRAP token for Kerberos. - */ - bcopy(alg, p, KRB5_SZ_ALG); - p += KRB5_SZ_ALG; - - /* - * Now encode the sequence number according to - * RFC 1964, section 1.2.1.2 which dictates 4 octets - * of sequence number followed by 4 bytes of direction - * indicator: 0x00 for initiator or 0xff for acceptor. - * We DES CBC encrypt the sequence number using the first - * 8 octets of the checksum field as an initialization - * vector. - * Note that this sequence number is not at all related - * to the RPCSEC_GSS protocol sequence number. This - * number is private to the ASN.1 token. The only - * requirement is that it not be repeated in case the - * server has replay detection on, which normally should - * not be the case, since RFC 2203 section 5.2.3 says that - * replay detection and sequence checking must be turned off. 
- */ - seqnum++; - for (i = 0; i < 4; i++) - plain[i] = (u_char) ((seqnum >> (i * 8)) & 0xff); - for (i = 4; i < 8; i++) - plain[i] = initiator ? 0x00 : 0xff; - gss_des_crypt(ki, (des_cblock *) plain, (des_cblock *) p, 8, - (des_cblock *) cksum, NULL, DES_ENCRYPT, KG_USAGE_SEQ); - p += 8; - - /* - * Finally, append the octets of the - * checksum of the alg + plaintext data. - * The plaintext could be an RPC call header, - * the window value, or a sequence number. - */ - bcopy(cksum, p, HASHLEN(ki)); - p += HASHLEN(ki); - - return (p - psave); -} - -/* - * Determine size of ASN.1 DER length - */ -static int -nfs_gss_der_length_size(int len) -{ - return - len < (1 << 7) ? 1 : - len < (1 << 8) ? 2 : - len < (1 << 16) ? 3 : - len < (1 << 24) ? 4 : 5; -} - -/* - * Encode an ASN.1 DER length field - */ -static void -nfs_gss_der_length_put(u_char **pp, int len) -{ - int sz = nfs_gss_der_length_size(len); - u_char *p = *pp; - - if (sz == 1) { - *p++ = (u_char) len; - } else { - *p++ = (u_char) ((sz-1) | 0x80); - sz -= 1; - while (sz--) - *p++ = (u_char) ((len >> (sz * 8)) & 0xff); - } - - *pp = p; -} - -/* - * Decode an ASN.1 DER length field - */ -static int -nfs_gss_der_length_get(u_char **pp) -{ - u_char *p = *pp; - uint32_t flen, len = 0; - - flen = *p & 0x7f; - - if ((*p++ & 0x80) == 0) - len = flen; - else { - if (flen > sizeof(uint32_t)) - return (-1); - while (flen--) - len = (len << 8) + *p++; - } - *pp = p; - return (len); -} - -/* - * Decode an ASN.1 token from an RPCSEC_GSS verifier. 
- */ -static int -nfs_gss_token_get( - gss_key_info *ki, - u_char *alg, - u_char *p, - int initiator, - uint32_t *len, - u_char *cksum) -{ - u_char d, plain[8]; - u_char *psave = p; - int seqnum, i; - - /* - * Check that we have a valid token header - */ - if (*p++ != 0x60) - return (AUTH_BADCRED); - (void) nfs_gss_der_length_get(&p); // ignore the size - - /* - * Check that we have the DER encoded Kerberos v5 mech OID - */ - if (bcmp(p, krb5_mech, sizeof(krb5_mech) != 0)) - return (AUTH_BADCRED); - p += sizeof(krb5_mech); - - /* - * Now check the token ID, DES MAC MD5 algorithm - * indicator, and filler octets. - */ - if (bcmp(p, alg, KRB5_SZ_ALG) != 0) - return (AUTH_BADCRED); - p += KRB5_SZ_ALG; - - /* - * Now decrypt the sequence number. - * Note that the gss decryption uses the first 8 octets - * of the checksum field as an initialization vector (p + 8). - * Per RFC 2203 section 5.2.2 we don't check the sequence number - * in the ASN.1 token because the RPCSEC_GSS protocol has its - * own sequence number described in section 5.3.3.1 - */ - seqnum = 0; - gss_des_crypt(ki, (des_cblock *)p, (des_cblock *) plain, 8, - (des_cblock *) (p + 8), NULL, DES_DECRYPT, KG_USAGE_SEQ); - p += 8; - for (i = 0; i < 4; i++) - seqnum |= plain[i] << (i * 8); - - /* - * Make sure the direction - * indicator octets are correct. - */ - d = initiator ? 0x00 : 0xff; - for (i = 4; i < 8; i++) - if (plain[i] != d) - return (AUTH_BADCRED); - - /* - * Finally, get the checksum - */ - bcopy(p, cksum, HASHLEN(ki)); - p += HASHLEN(ki); - - if (len != NULL) - *len = p - psave; - - return (0); -} - /* * Return the number of bytes in an mbuf chain. */ @@ -3862,306 +3854,6 @@ nfs_gss_nfsm_chain(struct nfsm_chain *nmc, mbuf_t mc) } -/* - * Compute a checksum over an mbuf chain. - * Start building an MD5 digest at the given offset and keep - * going until the end of data in the current mbuf is reached. - * Then convert the 16 byte MD5 digest to an 8 byte DES CBC - * checksum. 
- */ -static void -nfs_gss_cksum_mchain( - gss_key_info *ki, - mbuf_t mhead, - u_char *alg, - int offset, - int len, - u_char *digest) -{ - mbuf_t mb; - u_char *ptr; - int left, bytes; - GSS_DIGEST_CTX context; - - gss_digest_Init(&context, ki); - - /* - * Logically prepend the first 8 bytes of the algorithm - * field as required by RFC 1964, section 1.2.1.1 - */ - gss_digest_Update(&context, alg, KRB5_SZ_ALG); - - /* - * Move down the mbuf chain until we reach the given - * byte offset, then start MD5 on the mbuf data until - * we've done len bytes. - */ - - for (mb = mhead; mb && len > 0; mb = mbuf_next(mb)) { - ptr = mbuf_data(mb); - left = mbuf_len(mb); - if (offset >= left) { - /* Offset not yet reached */ - offset -= left; - continue; - } - /* At or beyond offset - checksum data */ - ptr += offset; - left -= offset; - offset = 0; - - bytes = left < len ? left : len; - if (bytes > 0) - gss_digest_Update(&context, ptr, bytes); - len -= bytes; - } - - gss_digest_Final(&context, digest); -} - -/* - * Compute a checksum over an NFS mbuf chain. - * Start building an MD5 digest at the given offset and keep - * going until the end of data in the current mbuf is reached. - * Then convert the 16 byte MD5 digest to an 8 byte DES CBC - * checksum. - */ -static void -nfs_gss_cksum_chain( - gss_key_info *ki, - struct nfsm_chain *nmc, - u_char *alg, - int offset, - int len, - u_char *cksum) -{ - /* - * If the length parameter is zero, then we need - * to use the length from the offset to the current - * encode/decode offset. - */ - if (len == 0) - len = nfsm_chain_offset(nmc) - offset; - - return (nfs_gss_cksum_mchain(ki, nmc->nmc_mhead, alg, offset, len, cksum)); -} - -/* - * Compute a checksum of the sequence number (or sequence window) - * of an RPCSEC_GSS reply. 
- */ -static void -nfs_gss_cksum_rep(gss_key_info *ki, uint32_t seqnum, u_char *cksum) -{ - GSS_DIGEST_CTX context; - uint32_t val = htonl(seqnum); - - gss_digest_Init(&context, ki); - - /* - * Logically prepend the first 8 bytes of the MIC - * token as required by RFC 1964, section 1.2.1.1 - */ - gss_digest_Update(&context, ALG_MIC(ki), KRB5_SZ_ALG); - - /* - * Compute the digest of the seqnum in network order - */ - gss_digest_Update(&context, &val, 4); - gss_digest_Final(&context, cksum); -} - -/* - * Encrypt or decrypt data in an mbuf chain with des-cbc. - */ -static void -nfs_gss_encrypt_mchain( - gss_key_info *ki, - mbuf_t mhead, - int offset, - int len, - int encrypt) -{ - mbuf_t mb, mbn; - u_char *ptr, *nptr; - u_char tmp[8], ivec[8]; - int left, left8, remain; - - - bzero(ivec, 8); - - /* - * Move down the mbuf chain until we reach the given - * byte offset, then start encrypting the mbuf data until - * we've done len bytes. - */ - - for (mb = mhead; mb && len > 0; mb = mbn) { - mbn = mbuf_next(mb); - ptr = mbuf_data(mb); - left = mbuf_len(mb); - if (offset >= left) { - /* Offset not yet reached */ - offset -= left; - continue; - } - /* At or beyond offset - encrypt data */ - ptr += offset; - left -= offset; - offset = 0; - - /* - * DES or DES3 CBC has to encrypt 8 bytes at a time. - * If the number of bytes to be encrypted in this - * mbuf isn't some multiple of 8 bytes, encrypt all - * the 8 byte blocks, then combine the remaining - * bytes with enough from the next mbuf to make up - * an 8 byte block and encrypt that block separately, - * i.e. that block is split across two mbufs. - */ - remain = left % 8; - left8 = left - remain; - left = left8 < len ? 
left8 : len; - if (left > 0) { - gss_des_crypt(ki, (des_cblock *) ptr, (des_cblock *) ptr, - left, &ivec, &ivec, encrypt, KG_USAGE_SEAL); - len -= left; - } - - if (mbn && remain > 0) { - nptr = mbuf_data(mbn); - offset = 8 - remain; - bcopy(ptr + left, tmp, remain); // grab from this mbuf - bcopy(nptr, tmp + remain, offset); // grab from next mbuf - gss_des_crypt(ki, (des_cblock *) tmp, (des_cblock *) tmp, 8, - &ivec, &ivec, encrypt, KG_USAGE_SEAL); - bcopy(tmp, ptr + left, remain); // return to this mbuf - bcopy(tmp + remain, nptr, offset); // return to next mbuf - len -= 8; - } - } -} - -/* - * Encrypt or decrypt data in an NFS mbuf chain with des-cbc. - */ -static void -nfs_gss_encrypt_chain( - gss_key_info *ki, - struct nfsm_chain *nmc, - int offset, - int len, - int encrypt) -{ - /* - * If the length parameter is zero, then we need - * to use the length from the offset to the current - * encode/decode offset. - */ - if (len == 0) - len = nfsm_chain_offset(nmc) - offset; - - return (nfs_gss_encrypt_mchain(ki, nmc->nmc_mhead, offset, len, encrypt)); -} - -/* - * The routines that follow provide abstractions for doing digests and crypto. 
- */ - -static void -gss_digest_Init(GSS_DIGEST_CTX *ctx, gss_key_info *ki) -{ - ctx->type = ki->type; - switch (ki->type) { - case NFS_GSS_1DES: MD5_DESCBC_Init(&ctx->m_ctx, &ki->ks_u.des.gss_sched); - break; - case NFS_GSS_3DES: HMAC_SHA1_DES3KD_Init(&ctx->h_ctx, ki->ks_u.des3.ckey, 0); - break; - default: - printf("gss_digest_Init: Unknown key info type %d\n", ki->type); - } -} - -static void -gss_digest_Update(GSS_DIGEST_CTX *ctx, void *data, size_t len) -{ - switch (ctx->type) { - case NFS_GSS_1DES: MD5_DESCBC_Update(&ctx->m_ctx, data, len); - break; - case NFS_GSS_3DES: HMAC_SHA1_DES3KD_Update(&ctx->h_ctx, data, len); - break; - } -} - -static void -gss_digest_Final(GSS_DIGEST_CTX *ctx, void *digest) -{ - switch (ctx->type) { - case NFS_GSS_1DES: MD5_DESCBC_Final(digest, &ctx->m_ctx); - break; - case NFS_GSS_3DES: HMAC_SHA1_DES3KD_Final(digest, &ctx->h_ctx); - break; - } -} - -static void -gss_des_crypt(gss_key_info *ki, des_cblock *in, des_cblock *out, - int32_t len, des_cblock *iv, des_cblock *retiv, int encrypt, int usage) -{ - switch (ki->type) { - case NFS_GSS_1DES: - { - des_cbc_key_schedule *sched = ((usage == KG_USAGE_SEAL) ? 
- &ki->ks_u.des.gss_sched_Ke : - &ki->ks_u.des.gss_sched); - des_cbc_encrypt(in, out, len, sched, iv, retiv, encrypt); - } - break; - case NFS_GSS_3DES: - - des3_cbc_encrypt(in, out, len, &ki->ks_u.des3.gss_sched, iv, retiv, encrypt); - break; - } -} - -static int -gss_key_init(gss_key_info *ki, uint32_t skeylen) -{ - size_t i; - int rc; - des_cblock k[3]; - - ki->keybytes = skeylen; - switch (skeylen) { - case sizeof(des_cblock): - ki->type = NFS_GSS_1DES; - ki->hash_len = MD5_DESCBC_DIGEST_LENGTH; - ki->ks_u.des.key = (des_cblock *)ki->skey; - rc = des_cbc_key_sched(ki->ks_u.des.key, &ki->ks_u.des.gss_sched); - if (rc) - return (rc); - for (i = 0; i < ki->keybytes; i++) - k[0][i] = 0xf0 ^ (*ki->ks_u.des.key)[i]; - rc = des_cbc_key_sched(&k[0], &ki->ks_u.des.gss_sched_Ke); - break; - case 3*sizeof(des_cblock): - ki->type = NFS_GSS_3DES; - ki->hash_len = SHA_DIGEST_LENGTH; - ki->ks_u.des3.key = (des_cblock (*)[3])ki->skey; - des3_derive_key(*ki->ks_u.des3.key, ki->ks_u.des3.ckey, - KEY_USAGE_DES3_SIGN, KEY_USAGE_LEN); - rc = des3_cbc_key_sched(*ki->ks_u.des3.key, &ki->ks_u.des3.gss_sched); - if (rc) - return (rc); - break; - default: - printf("gss_key_init: Invalid key length %d\n", skeylen); - rc = EINVAL; - break; - } - - return (rc); -} #if 0 #define DISPLAYLEN 16 diff --git a/bsd/nfs/nfs_gss.h b/bsd/nfs/nfs_gss.h index e590eb1bf..fe3db1893 100644 --- a/bsd/nfs/nfs_gss.h +++ b/bsd/nfs/nfs_gss.h @@ -29,10 +29,10 @@ #ifndef _NFS_NFS_GSS_H_ #define _NFS_NFS_GSS_H_ +#include "gss/gss_krb5_mech.h" #include #include #include -#include #define RPCSEC_GSS 6 #define RPCSEC_GSS_VERS_1 1 @@ -51,78 +51,20 @@ enum rpcsec_gss_service { }; /* encoded krb5 OID */ -extern u_char krb5_mech[11]; +extern u_char krb5_mech_oid[11]; -/* - * GSS-API things - */ -typedef uint32_t OM_uint32; - -#define GSS_S_COMPLETE 0 -#define GSS_S_CONTINUE_NEEDED 1 - -/* - * Some "helper" definitions to make the status code macros obvious. 
- * From gssapi.h: - */ -#define GSS_C_CALLING_ERROR_OFFSET 24 -#define GSS_C_ROUTINE_ERROR_OFFSET 16 -#define GSS_C_SUPPLEMENTARY_OFFSET 0 -#define GSS_C_CALLING_ERROR_MASK ((OM_uint32) 0377ul) -#define GSS_C_ROUTINE_ERROR_MASK ((OM_uint32) 0377ul) -#define GSS_C_SUPPLEMENTARY_MASK ((OM_uint32) 0177777ul) - -/* - * The macros that test status codes for error conditions. Note that the - * GSS_ERROR() macro has changed slightly from the V1 GSSAPI so that it now - * evaluates its argument only once. - */ -#define GSS_CALLING_ERROR(x) \ - ((x) & (GSS_C_CALLING_ERROR_MASK << GSS_C_CALLING_ERROR_OFFSET)) -#define GSS_ROUTINE_ERROR(x) \ - ((x) & (GSS_C_ROUTINE_ERROR_MASK << GSS_C_ROUTINE_ERROR_OFFSET)) -#define GSS_SUPPLEMENTARY_INFO(x) \ - ((x) & (GSS_C_SUPPLEMENTARY_MASK << GSS_C_SUPPLEMENTARY_OFFSET)) -#define GSS_ERROR(x) \ - ((x) & ((GSS_C_CALLING_ERROR_MASK << GSS_C_CALLING_ERROR_OFFSET) | \ - (GSS_C_ROUTINE_ERROR_MASK << GSS_C_ROUTINE_ERROR_OFFSET))) #define GSS_MAXSEQ 0x80000000 // The biggest sequence number #define GSS_SVC_MAXCONTEXTS 500000 // Max contexts supported #define GSS_SVC_SEQWINDOW 256 // Server's sequence window #define GSS_CLNT_SEQLISTMAX 32 // Max length of req seq num list -#define SKEYLEN 8 // length of DES key -#define SKEYLEN3 24 // length of DES3 keyboard -#define MAX_SKEYLEN SKEYLEN3 - +#define MAX_SKEYLEN 32 +#define MAX_LUCIDLEN (sizeof (lucid_context) + MAX_SKEYLEN) #define GSS_MAX_NEG_CACHE_ENTRIES 16 #define GSS_NEG_CACHE_TO 3 #define GSS_PRINT_DELAY (8 * 3600) // Wait day before printing the same error message -typedef struct { - uint32_t type; // See defines below - uint32_t keybytes; // Session key length bytes; - uint32_t hash_len; - u_char skey[MAX_SKEYLEN]; // Session key; - union { - struct { - des_cblock *key; - des_cbc_key_schedule gss_sched; - des_cbc_key_schedule gss_sched_Ke; - } des; - struct { - des_cblock (*key)[3]; - des_cblock ckey[3]; - des3_cbc_key_schedule gss_sched; - } des3; - } ks_u; -} gss_key_info; - -#define 
NFS_GSS_0DES 0 // Not DES or uninitialized -#define NFS_GSS_1DES 1 // Single DES with DES_MAC_MD5 -#define NFS_GSS_3DES 2 // Triple EDE DES KD with SHA1 - /* * The client's RPCSEC_GSS context information */ @@ -146,15 +88,16 @@ struct nfs_gss_clnt_ctx { uint32_t gss_clnt_seqwin; // Server's seq num window uint32_t *gss_clnt_seqbits; // Bitmap to track seq numbers in use mach_port_t gss_clnt_mport; // Mach port for gssd upcall + uint32_t gss_clnt_verflen; // RPC verifier length from server uint8_t *gss_clnt_verf; // RPC verifier from server uint8_t *gss_clnt_svcname; // Service name e.g. "nfs/big.apple.com" uint32_t gss_clnt_svcnamlen; // Service name length gssd_nametype gss_clnt_svcnt; // Service name type gssd_cred gss_clnt_cred_handle; // Opaque cred handle from gssd gssd_ctx gss_clnt_context; // Opaque context handle from gssd + gss_ctx_id_t gss_clnt_ctx_id; // Underlying gss context uint8_t *gss_clnt_token; // GSS token exchanged via gssd & server uint32_t gss_clnt_tokenlen; // Length of token - gss_key_info *gss_clnt_kinfo; // GSS key info uint32_t gss_clnt_gssd_flags; // Special flag bits to gssd uint32_t gss_clnt_major; // GSS major result from gssd or server uint32_t gss_clnt_minor; // GSS minor result from gssd or server @@ -189,9 +132,9 @@ struct nfs_gss_svc_ctx { uint32_t *gss_svc_seqbits; // Bitmap to track seq numbers gssd_cred gss_svc_cred_handle; // Opaque cred handle from gssd gssd_ctx gss_svc_context; // Opaque context handle from gssd + gss_ctx_id_t gss_svc_ctx_id; // Underlying gss context u_char *gss_svc_token; // GSS token exchanged via gssd & client uint32_t gss_svc_tokenlen; // Length of token - gss_key_info gss_svc_kinfo; // Session key info uint32_t gss_svc_major; // GSS major result from gssd uint32_t gss_svc_minor; // GSS minor result from gssd }; diff --git a/bsd/nfs/nfs_gss_crypto.c b/bsd/nfs/nfs_gss_crypto.c deleted file mode 100644 index 370560b67..000000000 --- a/bsd/nfs/nfs_gss_crypto.c +++ /dev/null @@ -1,319 +0,0 @@ -/* - * 
Copyright (c) 2008 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - - -/* - * Copyright (C) 1998 by the FundsXpress, INC. - * - * All rights reserved. - * - * Export of this software from the United States of America may require - * a specific license from the United States Government. It is the - * responsibility of any person or organization contemplating export to - * obtain such a license before exporting. 
- * - * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and - * distribute this software and its documentation for any purpose and - * without fee is hereby granted, provided that the above copyright - * notice appear in all copies and that both that copyright notice and - * this permission notice appear in supporting documentation, and that - * the name of FundsXpress. not be used in advertising or publicity pertaining - * to distribution of the software without specific, written prior - * permission. FundsXpress makes no representations about the suitability of - * this software for any purpose. It is provided "as is" without express - * or implied warranty. - * - * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED - * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. - */ - -#include "nfs_gss_crypto.h" - - -/* -n-fold(k-bits): - l = lcm(n,k) - r = l/k - s = k-bits | k-bits rot 13 | k-bits rot 13*2 | ... | k-bits rot 13*(r-1) - compute the 1's complement sum: - n-fold = s[0..n-1]+s[n..2n-1]+s[2n..3n-1]+..+s[(k-1)*n..k*n-1] -*/ - -/* representation: msb first, assume n and k are multiples of 8, and - that k>=16. this is the case of all the cryptosystems which are - likely to be used. this function can be replaced if that - assumption ever fails. 
*/ - -/* input length is in bits */ - -void -krb5_nfold(unsigned int inbits, const unsigned char *in, unsigned int outbits, - unsigned char *out) -{ - int a,b,c,lcm; - int byte, i, msbit; - - /* the code below is more readable if I make these bytes - instead of bits */ - - inbits >>= 3; - outbits >>= 3; - - /* first compute lcm(n,k) */ - - a = outbits; - b = inbits; - - while(b != 0) { - c = b; - b = a%b; - a = c; - } - - lcm = outbits*inbits/a; - - /* now do the real work */ - - memset(out, 0, outbits); - byte = 0; - - /* this will end up cycling through k lcm(k,n)/k times, which - is correct */ - for (i=lcm-1; i>=0; i--) { - /* compute the msbit in k which gets added into this byte */ - msbit = (/* first, start with the msbit in the first, unrotated - byte */ - ((inbits<<3)-1) - /* then, for each byte, shift to the right for each - repetition */ - +(((inbits<<3)+13)*(i/inbits)) - /* last, pick out the correct byte within that - shifted repetition */ - +((inbits-(i%inbits))<<3) - )%(inbits<<3); - - /* pull out the byte value itself */ - byte += (((in[((inbits-1)-(msbit>>3))%inbits]<<8)| - (in[((inbits)-(msbit>>3))%inbits])) - >>((msbit&7)+1))&0xff; - - /* do the addition */ - byte += out[i%outbits]; - out[i%outbits] = byte&0xff; - -#if 0 - printf("msbit[%d] = %d\tbyte = %02x\tsum = %03x\n", i, msbit, - (((in[((inbits-1)-(msbit>>3))%inbits]<<8)| - (in[((inbits)-(msbit>>3))%inbits])) - >>((msbit&7)+1))&0xff, byte); -#endif - - /* keep around the carry bit, if any */ - byte >>= 8; - -#if 0 - printf("carry=%d\n", byte); -#endif - } - - /* if there's a carry bit left over, add it back in */ - if (byte) { - for (i=outbits-1; i>=0; i--) { - /* do the addition */ - byte += out[i]; - out[i] = byte&0xff; - - /* keep around the carry bit, if any */ - byte >>= 8; - } - } -} - -/* - * Given 21 bytes of random bits, make a triple DES key. 
- */ - -void -des3_make_key(const unsigned char randombits[21], des_cblock key[3]) -{ - int i; - - for (i = 0; i < 3; i++) { - memcpy(&key[i], &randombits[i*7], 7); - key[i][7] = (((key[i][0] & 1) << 1) | - ((key[i][1] & 1) << 2) | - ((key[i][2] & 1) << 3) | - ((key[i][3] & 1) << 4) | - ((key[i][4] & 1) << 5) | - ((key[i][5] & 1) << 6) | - ((key[i][6] & 1) << 7)); - des_fixup_key_parity(&key[i]); - } -} - -/* - * Key derivation for triple DES. - * Given the session key in in key, produce a new key in out key using - * the supplied constant. - */ - -int -des3_derive_key(des_cblock inkey[3], des_cblock outkey[3], - const unsigned char *constant, int clen) -{ - des_cblock inblock, outblock, ivec; - des3_cbc_key_schedule sched; - unsigned char rawkey[21]; - size_t n, keybytes = sizeof(rawkey); - - /* initialize the input block */ - - if (clen == sizeof(des_cblock)) { - memcpy(inblock, constant, clen); - } else { - krb5_nfold(clen*8, constant, sizeof(des_cblock)*8, inblock); - } - - /* loop encrypting the blocks until enough key bytes are generated */ - - bzero(ivec, sizeof(ivec)); - des3_cbc_key_sched(inkey, &sched); - for (n = 0; n < sizeof(rawkey); n += sizeof(des_cblock)) { - des3_cbc_encrypt(&inblock, &outblock, sizeof(outblock), &sched, &ivec, NULL, 1); - if ((keybytes - n) <= sizeof (des_cblock)) { - memcpy(rawkey+n, outblock, (keybytes - n)); - break; - } - memcpy(rawkey+n, outblock, sizeof(des_cblock)); - memcpy(inblock, outblock, sizeof(des_cblock)); - } - - /* postprocess the key */ - des3_make_key(rawkey, outkey); - - /* clean memory, free resources and exit */ - - bzero(inblock, sizeof (des_cblock)); - bzero(outblock, sizeof (des_cblock)); - bzero(rawkey, keybytes); - bzero(&sched, sizeof (sched)); - - return(0); -} - -/* - * Initialize a context for HMAC SHA1 - * if drived is true we derive a new key - * based on KG_USAGE_SIGN - */ - -void -HMAC_SHA1_DES3KD_Init(HMAC_SHA1_DES3KD_CTX *ctx, des_cblock key[3], int derive) -{ - unsigned char ipad[64]; - size_t 
i, j; - - SHA1Init(&ctx->sha1_ctx); - if (derive) - des3_derive_key(key, ctx->dk, KEY_USAGE_DES3_SIGN, KEY_USAGE_LEN); - else - memcpy(ctx->dk, key, 3*sizeof(des_cblock)); - memset(ipad, 0x36, sizeof(ipad)); - for (i = 0; i < 3; i++) - for (j = 0; j < sizeof(des_cblock); j++) - ipad[j + i * sizeof(des_cblock)] ^= ctx->dk[i][j]; - SHA1Update(&ctx->sha1_ctx, ipad, sizeof(ipad)); -} - -/* - * Update the HMAC SHA1 context with the supplied data. - */ -void -HMAC_SHA1_DES3KD_Update(HMAC_SHA1_DES3KD_CTX *ctx, void *data, size_t len) -{ - SHA1Update(&ctx->sha1_ctx, data, len); -} - -/* - * Finish the context and produce the HMAC SHA1 digest. - */ - -void -HMAC_SHA1_DES3KD_Final(void *digest, HMAC_SHA1_DES3KD_CTX *ctx) -{ - unsigned char opad[64]; - size_t i, j; - - SHA1Final(digest, &ctx->sha1_ctx); - memset(opad, 0x5c, sizeof(opad)); - for (i = 0; i < 3; i++) - for (j = 0; j < sizeof(des_cblock); j++) - opad[j + i * sizeof(des_cblock)] ^= ctx->dk[i][j]; - SHA1Init(&ctx->sha1_ctx); - SHA1Update(&ctx->sha1_ctx, opad, sizeof(opad)); - SHA1Update(&ctx->sha1_ctx, digest, SHA1_RESULTLEN); - SHA1Final(digest, &ctx->sha1_ctx); -} - -/* - * Initialize an MD5 DES CBC context with a schedule. - */ - -void MD5_DESCBC_Init(MD5_DESCBC_CTX *ctx, des_cbc_key_schedule *sched) -{ - MD5Init(&ctx->md5_ctx); - ctx->sched = sched; -} - -/* - * Update MD5 DES CBC context with the supplied data. - */ - -void MD5_DESCBC_Update(MD5_DESCBC_CTX *ctx, void *data, size_t len) -{ - MD5Update(&ctx->md5_ctx, data, len); -} - -/* - * Finalize the context and extract the digest. - */ - -void MD5_DESCBC_Final(void *digest, MD5_DESCBC_CTX *ctx) -{ - unsigned char md5_digest[MD5_DIGEST_LENGTH]; - - MD5Final(md5_digest, &ctx->md5_ctx); - - /* - * Now get the DES CBC checksum for the digest. 
- */ - des_cbc_cksum((des_cblock *) md5_digest, (des_cblock *)digest, - sizeof (md5_digest), ctx->sched); -} - diff --git a/bsd/nfs/nfs_gss_crypto.h b/bsd/nfs/nfs_gss_crypto.h deleted file mode 100644 index 4819dcd9d..000000000 --- a/bsd/nfs/nfs_gss_crypto.h +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright (c) 2008 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - - -#ifndef _NFS_NFS_GSS_CRYPTO_H_ -#define _NFS_NFS_GSS_CRYPTO_H_ -#include -#include -#include -#include - -#define KG_USAGE_SEAL 22 -#define KG_USAGE_SIGN 23 -#define KG_USAGE_SEQ 24 - -#define KEY_USAGE_DES3_SEAL (const unsigned char *)"\x00\x00\x00\x16\xaa" -#define KEY_USAGE_DES3_SIGN (const unsigned char *)"\x00\x00\x00\x17\x99" -#define KEY_USAGE_DES3_SEQ (const unsigned char *)"\x00\x00\x00\x18\x55" -#define KEY_USAGE_LEN 5 - -typedef struct { - SHA1_CTX sha1_ctx; - des_cblock dk[3]; -} HMAC_SHA1_DES3KD_CTX; - -typedef struct { - MD5_CTX md5_ctx; - des_cbc_key_schedule *sched; -} MD5_DESCBC_CTX; - -#define MD5_DESCBC_DIGEST_LENGTH 8 - -__BEGIN_DECLS - -void krb5_nfold(unsigned int, const unsigned char *, unsigned int, unsigned char *); -void des3_make_key(const unsigned char[21], des_cblock[3]); -int des3_derive_key(des_cblock[3], des_cblock[3], const unsigned char *, int); - -void HMAC_SHA1_DES3KD_Init(HMAC_SHA1_DES3KD_CTX *, des_cblock[3], int); -void HMAC_SHA1_DES3KD_Update(HMAC_SHA1_DES3KD_CTX *, void *, size_t); -void HMAC_SHA1_DES3KD_Final(void *, HMAC_SHA1_DES3KD_CTX *); - -void MD5_DESCBC_Init(MD5_DESCBC_CTX *, des_cbc_key_schedule *); -void MD5_DESCBC_Update(MD5_DESCBC_CTX *, void *, size_t); -void MD5_DESCBC_Final(void *, MD5_DESCBC_CTX *); - -__END_DECLS - -#endif /* _NFS_NFS_GSS_CRYPTO_H_ */ diff --git a/bsd/nfs/nfs_ioctl.h b/bsd/nfs/nfs_ioctl.h index 9b2cbb5ff..ff4f3eaae 100644 --- a/bsd/nfs/nfs_ioctl.h +++ b/bsd/nfs/nfs_ioctl.h @@ -69,6 +69,7 @@ struct user_nfs_gss_principal /* If no credential was found returned NFS_IOC_NO_CRED_FLAG in the flags field. 
*/ #define NFS_IOC_NO_CRED_FLAG 1 /* No credential was found */ +#define NFS_IOC_INVALID_CRED_FLAG 2 /* Found a credential, but its not valid */ #define NFS_IOC_SET_CRED _IOW('n', 2, struct nfs_gss_principal) #define NFS_FSCTL_SET_CRED IOCBASECMD(NFS_IOC_SET_CRED) diff --git a/bsd/nfs/nfs_lock.c b/bsd/nfs/nfs_lock.c index 981779afa..aaf567271 100644 --- a/bsd/nfs/nfs_lock.c +++ b/bsd/nfs/nfs_lock.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2002-2014 Apple Inc. All rights reserved. + * Copyright (c) 2002-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -399,9 +399,7 @@ nfs_lockd_send_request(LOCKD_MSG *msg, int interruptable) */ return (EAGAIN); } - return (kr); } - /* * NFS advisory byte-level locks (client) diff --git a/bsd/nfs/nfs_node.c b/bsd/nfs/nfs_node.c index c2a8867f4..437242919 100644 --- a/bsd/nfs/nfs_node.c +++ b/bsd/nfs/nfs_node.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2000-2013 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
* Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ @@ -168,7 +168,7 @@ nfs_case_insensitive(mount_t mp) skip = 1; } - if (!skip && NFS_BITMAP_ISSET(nmp->nm_fsattr.nfsa_bitmap, NFS_FATTR_CASE_INSENSITIVE)) + if (!skip && (nmp->nm_fsattr.nfsa_flags & NFS_FSFLAG_CASE_INSENSITIVE)) answer = 1; lck_mtx_unlock(&nmp->nm_lock); @@ -353,8 +353,8 @@ nfs_nget( if (vnode_parent(vp) != NFSTOV(dnp)) update_flags |= VNODE_UPDATE_PARENT; if (update_flags) { - NFS_NODE_DBG("vnode_update_identity old name %s new name %*s\n", - vp->v_name, cnp->cn_namelen, cnp->cn_nameptr ? cnp->cn_nameptr : ""); + NFS_NODE_DBG("vnode_update_identity old name %s new name %.*s update flags = %x\n", + vp->v_name, cnp->cn_namelen, cnp->cn_nameptr ? cnp->cn_nameptr : "", update_flags); vnode_update_identity(vp, NFSTOV(dnp), cnp->cn_nameptr, cnp->cn_namelen, 0, update_flags); } } @@ -582,12 +582,12 @@ nfs_nget( int -nfs_vnop_inactive(ap) +nfs_vnop_inactive( struct vnop_inactive_args /* { struct vnodeop_desc *a_desc; vnode_t a_vp; vfs_context_t a_context; - } */ *ap; + } */ *ap) { vnode_t vp = ap->a_vp; vfs_context_t ctx = ap->a_context; @@ -851,12 +851,12 @@ nfs_vnop_inactive(ap) * Reclaim an nfsnode so that it can be used for other purposes. 
*/ int -nfs_vnop_reclaim(ap) +nfs_vnop_reclaim( struct vnop_reclaim_args /* { struct vnodeop_desc *a_desc; vnode_t a_vp; vfs_context_t a_context; - } */ *ap; + } */ *ap) { vnode_t vp = ap->a_vp; nfsnode_t np = VTONFS(vp); diff --git a/bsd/nfs/nfs_socket.c b/bsd/nfs/nfs_socket.c index 191a8fa29..023163518 100644 --- a/bsd/nfs/nfs_socket.c +++ b/bsd/nfs/nfs_socket.c @@ -4901,6 +4901,13 @@ nfs_request_timer(__unused void *param0, __unused void *param1) !(nmp->nm_sockflags & (NMSOCK_POKE|NMSOCK_UNMOUNT)) && (nmp->nm_sockflags & NMSOCK_READY)) { nmp->nm_sockflags |= NMSOCK_POKE; + /* + * We take a ref on the mount so that we know the mount will still be there + * when we process the nfs_mount_poke_queue. An unmount request will block + * in nfs_mount_drain_and_cleanup until after the poke is finished. We release + * the reference after calling nfs_sock_poke below; + */ + nmp->nm_ref++; TAILQ_INSERT_TAIL(&nfs_mount_poke_queue, nmp, nm_pokeq); } lck_mtx_unlock(&nmp->nm_lock); @@ -4973,6 +4980,7 @@ nfs_request_timer(__unused void *param0, __unused void *param1) while ((nmp = TAILQ_FIRST(&nfs_mount_poke_queue))) { TAILQ_REMOVE(&nfs_mount_poke_queue, nmp, nm_pokeq); nfs_sock_poke(nmp); + nfs_mount_rele(nmp); } nfs_interval_timer_start(nfs_request_timer_call, NFS_REQUESTDELAY); diff --git a/bsd/nfs/nfs_syscalls.c b/bsd/nfs/nfs_syscalls.c index 1b082e748..d4dead825 100644 --- a/bsd/nfs/nfs_syscalls.c +++ b/bsd/nfs/nfs_syscalls.c @@ -163,7 +163,6 @@ SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, nfsiod_thread_max, CTLFLAG_RW | CT SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, nfsiod_thread_count, CTLFLAG_RD | CTLFLAG_LOCKED, &nfsiod_thread_count, 0, ""); SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, lockd_mounts, CTLFLAG_RD | CTLFLAG_LOCKED, &nfs_lockd_mounts, 0, ""); SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, max_async_writes, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_max_async_writes, 0, ""); -SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, single_des, CTLFLAG_RW | CTLFLAG_LOCKED, 
&nfs_single_des, 0, ""); SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, access_delete, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_access_delete, 0, ""); SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, access_dotzfs, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_access_dotzfs, 0, ""); SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, access_for_getattr, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_access_for_getattr, 0, ""); @@ -174,6 +173,7 @@ SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, squishy_flags, CTLFLAG_RW | CTLFLA SYSCTL_UINT(_vfs_generic_nfs_client, OID_AUTO, debug_ctl, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_debug_ctl, 0, ""); SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, readlink_nocache, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_readlink_nocache, 0, ""); SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, root_steals_gss_context, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_root_steals_ctx, 0, ""); +SYSCTL_STRING(_vfs_generic_nfs_client, OID_AUTO, default_nfs4domain, CTLFLAG_RW | CTLFLAG_LOCKED, nfs4_domain, sizeof(nfs4_domain), ""); #endif /* NFSCLIENT */ #if NFSSERVER diff --git a/bsd/nfs/nfs_vfsops.c b/bsd/nfs/nfs_vfsops.c index bb1b425f3..dcff07169 100644 --- a/bsd/nfs/nfs_vfsops.c +++ b/bsd/nfs/nfs_vfsops.c @@ -150,6 +150,7 @@ uint32_t nfs_open_owner_seqnum = 0; uint32_t nfs_lock_owner_seqnum = 0; thread_call_t nfs4_callback_timer_call; int nfs4_callback_timer_on = 0; +char nfs4_domain[MAXPATHLEN]; /* nfsiod */ lck_grp_t *nfsiod_lck_grp; @@ -204,26 +205,19 @@ int nfs_vfs_init(struct vfsconf *); int nfs_vfs_sysctl(int *, u_int, user_addr_t, size_t *, user_addr_t, size_t, vfs_context_t); struct vfsops nfs_vfsops = { - nfs_vfs_mount, - nfs_vfs_start, - nfs_vfs_unmount, - nfs_vfs_root, - nfs_vfs_quotactl, - nfs_vfs_getattr, - nfs_vfs_sync, - nfs_vfs_vget, - nfs_vfs_fhtovp, - nfs_vfs_vptofh, - nfs_vfs_init, - nfs_vfs_sysctl, - NULL, /* setattr */ - { NULL, /* reserved */ - NULL, /* reserved */ - NULL, /* reserved */ - NULL, /* reserved */ - NULL, /* reserved */ - NULL, /* reserved */ - NULL } /* reserved */ + .vfs_mount 
= nfs_vfs_mount, + .vfs_start = nfs_vfs_start, + .vfs_unmount = nfs_vfs_unmount, + .vfs_root = nfs_vfs_root, + .vfs_quotactl = nfs_vfs_quotactl, + .vfs_getattr = nfs_vfs_getattr, + .vfs_sync = nfs_vfs_sync, + .vfs_vget = nfs_vfs_vget, + .vfs_fhtovp = nfs_vfs_fhtovp, + .vfs_vptofh = nfs_vfs_vptofh, + .vfs_init = nfs_vfs_init, + .vfs_sysctl = nfs_vfs_sysctl, + // We do not support the remaining VFS ops }; @@ -2667,7 +2661,14 @@ mountnfs( uint32_t *mflags; uint32_t argslength, attrslength; struct nfs_location_index firstloc = { NLI_VALID, 0, 0, 0 }; - + static const struct nfs_etype nfs_default_etypes = { + .count = NFS_MAX_ETYPES, + .selected = NFS_MAX_ETYPES, + .etypes = { NFS_AES256_CTS_HMAC_SHA1_96, + NFS_AES128_CTS_HMAC_SHA1_96, + NFS_DES3_CBC_SHA1_KD + } + }; /* make sure mbuf constants are set up */ if (!nfs_mbuf_mhlen) nfs_mbuf_init(); @@ -2725,6 +2726,7 @@ mountnfs( nmp->nm_acregmax = NFS_MAXATTRTIMO; nmp->nm_acdirmin = NFS_MINDIRATTRTIMO; nmp->nm_acdirmax = NFS_MAXDIRATTRTIMO; + nmp->nm_etype = nfs_default_etypes; nmp->nm_auth = RPCAUTH_SYS; nmp->nm_iodlink.tqe_next = NFSNOLIST; nmp->nm_deadtimeout = 0; @@ -2867,6 +2869,31 @@ mountnfs( /* start with the first flavor */ nmp->nm_auth = nmp->nm_sec.flavors[0]; } + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_KERB_ETYPE)) { + uint32_t etypecnt; + xb_get_32(error, &xb, etypecnt); + if (!error && ((etypecnt < 1) || (etypecnt > NFS_MAX_ETYPES))) + error = EINVAL; + nfsmerr_if(error); + nmp->nm_etype.count = etypecnt; + xb_get_32(error, &xb, nmp->nm_etype.selected); + nfsmerr_if(error); + if (etypecnt) { + nmp->nm_etype.selected = etypecnt; /* Nothing is selected yet, so set selected to count */ + for (i=0; i < etypecnt; i++) { + xb_get_32(error, &xb, nmp->nm_etype.etypes[i]); + /* Check for valid encryption type */ + switch (nmp->nm_etype.etypes[i]) { + case NFS_DES3_CBC_SHA1_KD: + case NFS_AES128_CTS_HMAC_SHA1_96: + case NFS_AES256_CTS_HMAC_SHA1_96: + break; + default: + error = EINVAL; + } + } + } + } if 
(NFS_BITMAP_ISSET(mattrs, NFS_MATTR_MAX_GROUP_LIST)) xb_get_32(error, &xb, nmp->nm_numgrps); if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_SOCKET_TYPE)) { @@ -3309,7 +3336,7 @@ mountnfs( lck_mtx_unlock(&nmp->nm_lock); return (0); nfsmerr: - nfs_mount_cleanup(nmp); + nfs_mount_drain_and_cleanup(nmp); return (error); } @@ -3515,6 +3542,12 @@ nfs_mirror_mount_domount(vnode_t dvp, vnode_t vp, vfs_context_t ctx) while (!error && (count-- > 0)) xb_copy_32(error, &xb, &xbnew, val); } + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_KERB_ETYPE)) { + xb_copy_32(error, &xb, &xbnew, count); + xb_add_32(error, &xbnew, -1); + while (!error && (count-- > 0)) + xb_copy_32(error, &xb, &xbnew, val); + } if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_MAX_GROUP_LIST)) xb_copy_32(error, &xb, &xbnew, val); if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_SOCKET_TYPE)) @@ -4514,8 +4547,8 @@ nfs_mount_cleanup(struct nfsmount *nmp) NFS_VFS_DBG("Unmounting %s from %s\n", vfs_statfs(nmp->nm_mountp)->f_mntfromname, vfs_statfs(nmp->nm_mountp)->f_mntonname); - NFS_VFS_DBG("nfs state = %x\n", nmp->nm_state); - NFS_VFS_DBG("nfs socket flags = %x\n", nmp->nm_sockflags); + NFS_VFS_DBG("nfs state = 0x%8.8x\n", nmp->nm_state); + NFS_VFS_DBG("nfs socket flags = 0x%8.8x\n", nmp->nm_sockflags); NFS_VFS_DBG("nfs mount ref count is %d\n", nmp->nm_ref); NFS_VFS_DBG("mount ref count is %d\n", nmp->nm_mountp->mnt_count); @@ -4524,7 +4557,7 @@ nfs_mount_cleanup(struct nfsmount *nmp) lck_mtx_lock(&nmp->nm_lock); if (nmp->nm_ref) - panic("Some one has grabbed a ref %d\n", nmp->nm_ref); + panic("Some one has grabbed a ref %d state flags = 0x%8.8x\n", nmp->nm_ref, nmp->nm_state); if (nmp->nm_saddr) FREE(nmp->nm_saddr, M_SONAME); @@ -5015,6 +5048,8 @@ nfs_mountinfo_assemble(struct nfsmount *nmp, struct xdrbuf *xb) NFS_BITMAP_SET(mattrs, NFS_MATTR_ATTRCACHE_DIR_MAX); NFS_BITMAP_SET(mattrs, NFS_MATTR_LOCK_MODE); NFS_BITMAP_SET(mattrs, NFS_MATTR_SECURITY); + if (nmp->nm_etype.selected < nmp->nm_etype.count) + NFS_BITMAP_SET(mattrs, 
NFS_MATTR_KERB_ETYPE); NFS_BITMAP_SET(mattrs, NFS_MATTR_MAX_GROUP_LIST); NFS_BITMAP_SET(mattrs, NFS_MATTR_SOCKET_TYPE); NFS_BITMAP_SET(mattrs, NFS_MATTR_NFS_PORT); @@ -5164,6 +5199,13 @@ nfs_mountinfo_assemble(struct nfsmount *nmp, struct xdrbuf *xb) xb_add_32(error, &xbinfo, 1); /* SECURITY */ xb_add_32(error, &xbinfo, nmp->nm_auth); } + if (nmp->nm_etype.selected < nmp->nm_etype.count) { + xb_add_32(error, &xbinfo, nmp->nm_etype.count); + xb_add_32(error, &xbinfo, nmp->nm_etype.selected); + for (uint32_t j=0; j < nmp->nm_etype.count; j++) + xb_add_32(error, &xbinfo, nmp->nm_etype.etypes[j]); + nfsmerr_if(error); + } xb_add_32(error, &xbinfo, nmp->nm_numgrps); /* MAX_GROUP_LIST */ nfsmerr_if(error); snprintf(sotype, sizeof(sotype), "%s%s", (nmp->nm_sotype == SOCK_DGRAM) ? "udp" : "tcp", diff --git a/bsd/nfs/nfs_vnops.c b/bsd/nfs/nfs_vnops.c index ae1906aed..860338b3c 100644 --- a/bsd/nfs/nfs_vnops.c +++ b/bsd/nfs/nfs_vnops.c @@ -6933,7 +6933,6 @@ nfs_vnop_ioctl( case NFS_FSCTL_SET_CRED: if (!auth_is_kerberized(mp->nm_auth)) return (ENOTSUP); - NFS_DBG(NFS_FAC_GSS, 7, "Enter NFS_FSCTL_SET_CRED (proc %d) data = %p\n", vfs_context_is64bit(ctx), (void *)ap->a_data); if (vfs_context_is64bit(ctx)) { gprinc = *(struct user_nfs_gss_principal *)ap->a_data; } else { @@ -6943,9 +6942,9 @@ nfs_vnop_ioctl( gprinc.nametype = tp->nametype; gprinc.principal = CAST_USER_ADDR_T(tp->principal); } + NFS_DBG(NFS_FAC_GSS, 7, "Enter NFS_FSCTL_SET_CRED (64-bit=%d): principal length %d name type %d usr pointer 0x%llx\n", vfs_context_is64bit(ctx), gprinc.princlen, gprinc.nametype, (unsigned long long)gprinc.principal); if (gprinc.princlen > MAXPATHLEN) return (EINVAL); - NFS_DBG(NFS_FAC_GSS, 7, "Received principal length %d name type = %d\n", gprinc.princlen, gprinc.nametype); uint8_t *p; MALLOC(p, uint8_t *, gprinc.princlen+1, M_TEMP, M_WAITOK|M_ZERO); if (p == NULL) @@ -6997,7 +6996,8 @@ nfs_vnop_ioctl( NFS_DBG(NFS_FAC_GSS, 7, "NFS_FSCTL_GET_CRED could not copy out princiapl data of len 
%d: %d\n", gprinc.princlen, error); } - FREE(gprinc.principal, M_TEMP); + if (gprinc.principal) + FREE(gprinc.principal, M_TEMP); } return (error); diff --git a/bsd/nfs/nfsmount.h b/bsd/nfs/nfsmount.h index 7721e6336..feb205951 100644 --- a/bsd/nfs/nfsmount.h +++ b/bsd/nfs/nfsmount.h @@ -291,6 +291,7 @@ struct nfsmount { uint32_t nm_mappers; /* Number of nodes that have mmapped */ struct nfs_sec nm_sec; /* acceptable security mechanism flavors */ struct nfs_sec nm_servsec; /* server's acceptable security mechanism flavors */ + struct nfs_etype nm_etype; /* If using kerberos, the support session key encryption types */ fhandle_t *nm_fh; /* initial file handle */ uint8_t nm_lockmode; /* advisory file locking mode */ /* mount info */ diff --git a/bsd/nfs/xdr_subs.h b/bsd/nfs/xdr_subs.h index 59356190a..77590a33b 100644 --- a/bsd/nfs/xdr_subs.h +++ b/bsd/nfs/xdr_subs.h @@ -304,6 +304,8 @@ xb_grow(struct xdrbuf *xbp) oldsize = xbp->xb_u.xb_buffer.xbb_size; oldbuf = xbp->xb_u.xb_buffer.xbb_base; newsize = oldsize + xbp->xb_growsize; + if (newsize < oldsize) + return (ENOMEM); newbuf = xb_malloc(newsize); if (newbuf == NULL) return (ENOMEM); diff --git a/bsd/pgo/profile_runtime.c b/bsd/pgo/profile_runtime.c index ac308b681..c90fd7761 100644 --- a/bsd/pgo/profile_runtime.c +++ b/bsd/pgo/profile_runtime.c @@ -100,6 +100,33 @@ static int write_buffer(int flags, char *buffer) #endif +/* this variable is used to signal to the debugger that we'd like it to reset + * the counters */ +int kdp_pgo_reset_counters = 0; + +/* called in debugger context */ +static kern_return_t do_pgo_reset_counters(void *context) +{ +#pragma unused(context) +#ifdef PROFILE + memset(&__pgo_hib_CountersStart, 0, + ((uintptr_t)(&__pgo_hib_CountersEnd)) - ((uintptr_t)(&__pgo_hib_CountersStart))); +#endif + OSKextResetPgoCounters(); + kdp_pgo_reset_counters = 0; + return KERN_SUCCESS; +} + +static kern_return_t +pgo_reset_counters() +{ + kern_return_t r; + OSKextResetPgoCountersLock(); + 
kdp_pgo_reset_counters = 1; + r = DebuggerWithCallback(do_pgo_reset_counters, NULL, FALSE); + OSKextResetPgoCountersUnlock(); + return r; +} /* @@ -142,6 +169,26 @@ int grab_pgo_data(struct proc *p, goto out; } + if ( uap->flags & PGO_RESET_ALL ) { + if (uap->flags != PGO_RESET_ALL || uap->uuid || uap->buffer || uap->size ) { + err = EINVAL; + } else { + kern_return_t r = pgo_reset_counters(); + switch (r) { + case KERN_SUCCESS: + err = 0; + break; + case KERN_OPERATION_TIMED_OUT: + err = ETIMEDOUT; + break; + default: + err = EIO; + break; + } + } + goto out; + } + *retval = 0; if (uap->uuid) { diff --git a/bsd/security/audit/Makefile b/bsd/security/audit/Makefile index e2cf77f5a..48da02056 100644 --- a/bsd/security/audit/Makefile +++ b/bsd/security/audit/Makefile @@ -15,11 +15,9 @@ INSTALL_MI_DIR = security/audit INSTALL_KF_MI_LIST = ${DATAFILES} -EXPORT_MI_LIST = ${DATAFILES} audit.h audit_bsd.h audit_private.h +EXPORT_MI_LIST = ${DATAFILES} audit.h audit_bsd.h audit_private.h EXPORT_MI_DIR = security/audit include $(MakeInc_rule) include $(MakeInc_dir) - - diff --git a/bsd/security/audit/audit.c b/bsd/security/audit/audit.c index c5f18ebe4..830c1fc33 100644 --- a/bsd/security/audit/audit.c +++ b/bsd/security/audit/audit.c @@ -610,7 +610,7 @@ audit_syscall_enter(unsigned int code, proc_t proc, struct uthread *uthread) * the syscall table(s). This table is generated by makesyscalls.sh * from syscalls.master and stored in audit_kevents.c. 
*/ - if (code > NUM_SYSENT) + if (code > nsysent) return; event = sys_au_event[code]; if (event == AUE_NULL) @@ -792,6 +792,7 @@ kau_will_audit(void) return (audit_enabled && currecord() != NULL); } +#if CONFIG_COREDUMP void audit_proc_coredump(proc_t proc, char *path, int errcode) { @@ -850,4 +851,5 @@ audit_proc_coredump(proc_t proc, char *path, int errcode) ret = 1; audit_commit(ar, errcode, ret); } +#endif /* CONFIG_COREDUMP */ #endif /* CONFIG_AUDIT */ diff --git a/bsd/security/audit/audit.h b/bsd/security/audit/audit.h index d85139b2b..67803a185 100644 --- a/bsd/security/audit/audit.h +++ b/bsd/security/audit/audit.h @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2004-2009 Apple Inc. + * Copyright (c) 2004-2016 Apple Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -88,6 +88,7 @@ extern int audit_syscalls; #define ARG_AUID 0x0000000000000100ULL #define ARG_GID 0x0000000000000200ULL #define ARG_FD 0x0000000000000400ULL +#define ARG_FD1 ARG_FD #define ARG_POSIX_IPC_PERM 0x0000000000000800ULL #define ARG_FFLAGS 0x0000000000001000ULL #define ARG_MODE 0x0000000000002000ULL @@ -133,6 +134,7 @@ extern int audit_syscalls; #define ARG_OPAQUE 0x0008000000000000ULL /* darwin-only */ #define ARG_DATA 0x0010000000000000ULL /* darwin-only */ #define ARG_ADDR64 0x0020000000000000ULL /* darwin-only */ +#define ARG_FD2 0x0040000000000000ULL /* darwin-only */ #define ARG_NONE 0x0000000000000000ULL #define ARG_ALL 0xFFFFFFFFFFFFFFFFULL @@ -189,6 +191,7 @@ void audit_arg_addr(struct kaudit_record *ar, user_addr_t addr); void audit_arg_exit(struct kaudit_record *ar, int status, int retval); void audit_arg_len(struct kaudit_record *ar, user_size_t len); void audit_arg_fd(struct kaudit_record *ar, int fd); +void audit_arg_fd2(struct kaudit_record *ar, int fd); void audit_arg_fflags(struct kaudit_record *ar, int fflags); void audit_arg_gid(struct kaudit_record *ar, gid_t gid); void audit_arg_uid(struct kaudit_record *ar, uid_t uid); diff 
--git a/bsd/security/audit/audit_arg.c b/bsd/security/audit/audit_arg.c index 207337909..00f65b3e0 100644 --- a/bsd/security/audit/audit_arg.c +++ b/bsd/security/audit/audit_arg.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 1999-2012 Apple Inc. + * Copyright (c) 1999-2016 Apple Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -134,6 +134,14 @@ audit_arg_len(struct kaudit_record *ar, user_size_t len) ARG_SET_VALID(ar, ARG_LEN); } +void +audit_arg_fd2(struct kaudit_record *ar, int fd) +{ + + ar->k_ar.ar_arg_fd2 = fd; + ARG_SET_VALID(ar, ARG_FD2); +} + void audit_arg_fd(struct kaudit_record *ar, int fd) { diff --git a/bsd/security/audit/audit_bsd.c b/bsd/security/audit/audit_bsd.c index 6f4d416c9..a08ab453d 100644 --- a/bsd/security/audit/audit_bsd.c +++ b/bsd/security/audit/audit_bsd.c @@ -51,6 +51,8 @@ #include #include +extern void ipc_port_release_send(ipc_port_t port); + #if CONFIG_AUDIT struct mhdr { size_t mh_size; @@ -650,7 +652,8 @@ audit_send_trigger(unsigned int trigger) error = host_get_audit_control_port(host_priv_self(), &audit_port); if (error == KERN_SUCCESS && audit_port != MACH_PORT_NULL) { - audit_triggers(audit_port, trigger); + (void)audit_triggers(audit_port, trigger); + ipc_port_release_send(audit_port); return (0); } else { printf("Cannot get audit control port\n"); diff --git a/bsd/security/audit/audit_bsm.c b/bsd/security/audit/audit_bsm.c index 7ca2771d4..da938d8a1 100644 --- a/bsd/security/audit/audit_bsm.c +++ b/bsd/security/audit/audit_bsm.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1999-2009 Apple Inc. + * Copyright (c) 1999-2016 Apple Inc. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -1419,6 +1419,36 @@ kaudit_to_bsm(struct kaudit_record *kar, struct au_record **pau) UPATH1_VNODE1_TOKENS; break; + case AUE_CLONEFILEAT: + if (ARG_IS_VALID(kar, ARG_FD)) { + tok = au_to_arg32(1, "src dir fd", ar->ar_arg_fd); + kau_write(rec, tok); + } + UPATH1_VNODE1_TOKENS; + if (ARG_IS_VALID(kar, ARG_FD2)) { + tok = au_to_arg32(1, "dst dir fd", ar->ar_arg_fd2); + kau_write(rec, tok); + } + UPATH2_TOKENS; + if (ARG_IS_VALID(kar, ARG_VALUE32)) { + tok = au_to_arg32(1, "flags", ar->ar_arg_value32); + kau_write(rec, tok); + } + break; + + case AUE_FCLONEFILEAT: + FD_VNODE1_TOKENS; + if (ARG_IS_VALID(kar, ARG_FD2)) { + tok = au_to_arg32(1, "dst dir fd", ar->ar_arg_fd2); + kau_write(rec, tok); + } + UPATH2_TOKENS; + if (ARG_IS_VALID(kar, ARG_VALUE32)) { + tok = au_to_arg32(1, "flags", ar->ar_arg_value32); + kau_write(rec, tok); + } + break; + case AUE_PTRACE: if (ARG_IS_VALID(kar, ARG_CMD)) { tok = au_to_arg32(1, "request", ar->ar_arg_cmd); diff --git a/bsd/security/audit/audit_bsm_klib.c b/bsd/security/audit/audit_bsm_klib.c index c5588d98e..4a8187f44 100644 --- a/bsd/security/audit/audit_bsm_klib.c +++ b/bsd/security/audit/audit_bsm_klib.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include @@ -150,7 +151,7 @@ au_evclassmap_insert(au_event_t event, au_class_t class) void au_evclassmap_init(void) { - int i; + unsigned int i; EVCLASS_LOCK_INIT(); for (i = 0; i < EVCLASSMAP_HASH_TABLE_SIZE; i++) @@ -159,7 +160,7 @@ au_evclassmap_init(void) /* * Set up the initial event to class mapping for system calls. 
*/ - for (i = 0; i < NUM_SYSENT; i++) { + for (i = 0; i < nsysent; i++) { if (sys_au_event[i] != AUE_NULL) au_evclassmap_insert(sys_au_event[i], 0); diff --git a/bsd/security/audit/audit_private.h b/bsd/security/audit/audit_private.h index 40f0ee658..8a5a556d8 100644 --- a/bsd/security/audit/audit_private.h +++ b/bsd/security/audit/audit_private.h @@ -1,5 +1,5 @@ /*- - * Copyright (c) 1999-2009 Apple Inc. + * Copyright (c) 1999-2016 Apple Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -274,6 +274,7 @@ struct audit_record { int ar_arg_exitstatus; int ar_arg_exitretval; struct sockaddr_storage ar_arg_sockaddr; + int ar_arg_fd2; #if CONFIG_MACF /* diff --git a/bsd/security/audit/audit_worker.c b/bsd/security/audit/audit_worker.c index aa44fa446..85b5c8241 100644 --- a/bsd/security/audit/audit_worker.c +++ b/bsd/security/audit/audit_worker.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 1999-2011 Apple Inc. + * Copyright (c) 1999-2016 Apple Inc. * Copyright (c) 2006-2008 Robert N. M. Watson * All rights reserved. * @@ -414,6 +414,7 @@ audit_worker_process_record(struct kaudit_record *ar) * Note: this means that the effect bound on the size of the pending record * queue is 2x the length of the global queue. 
*/ +__attribute__((noreturn)) static void audit_worker(void) { diff --git a/bsd/sys/Makefile b/bsd/sys/Makefile index f07d5649f..366a86188 100644 --- a/bsd/sys/Makefile +++ b/bsd/sys/Makefile @@ -3,7 +3,6 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - include $(MakeInc_cmd) include $(MakeInc_def) @@ -13,20 +12,20 @@ INSTINC_SUBDIRS = \ EXPINC_SUBDIRS = \ _types -# Installs header file for user level - +# Installs header file for user level - # $(DSTROOT)/System/Library/Frameworks/System.framework/PrivateHeaders # $(DSTROOT)/usr/include/ DATAFILES = \ - appleapiopts.h acct.h aio.h attr.h \ - buf.h cdefs.h conf.h \ + appleapiopts.h acct.h aio.h attr.h \ + buf.h cdefs.h clonefile.h conf.h \ dir.h dirent.h disk.h dkstat.h dtrace.h dtrace_glue.h dtrace_impl.h \ errno.h ev.h event.h fasttrap.h fasttrap_isa.h fcntl.h file.h filedesc.h filio.h gmon.h \ ioccom.h ioctl.h \ ioctl_compat.h ipc.h kernel.h kernel_types.h kern_event.h lctx.h loadable_fs.h lock.h lockf.h \ - kauth.h kdebug.h kern_control.h lockstat.h malloc.h \ + kauth.h kdebug.h kdebug_signpost.h kern_control.h lockstat.h malloc.h \ mbuf.h mman.h mount.h msg.h msgbuf.h netport.h param.h paths.h pipe.h poll.h \ - proc.h proc_info.h ptrace.h queue.h quota.h random.h reboot.h resource.h resourcevar.h \ - sbuf.h posix_sem.h posix_shm.h sdt.h \ + proc.h proc_info.h ptrace.h queue.h quota.h reboot.h resource.h resourcevar.h \ + sbuf.h posix_sem.h posix_shm.h random.h sdt.h\ select.h sem.h semaphore.h shm.h signal.h signalvar.h socket.h socketvar.h sockio.h stat.h stdio.h \ sysctl.h syslimits.h syslog.h sys_domain.h termios.h time.h \ timeb.h times.h tprintf.h trace.h tty.h ttychars.h ttycom.h \ @@ -36,11 +35,12 @@ DATAFILES = \ _select.h _structs.h _types.h _endian.h domain.h protosw.h \ spawn.h -# Installs header file for Apple internal use in user level - +# Installs header file for Apple internal use in 
user level - # $(DSTROOT)/System/Library/Frameworks/System.framework/PrivateHeaders PRIVATE_DATAFILES = \ attr.h \ cdefs.h \ + clonefile.h \ coalition.h \ codesign.h \ content_protection.h \ @@ -73,9 +73,11 @@ PRIVATE_DATAFILES = \ proc_info.h \ proc_uuid_policy.h \ process_policy.h \ + reason.h \ resource.h \ sfi.h \ shm_internal.h \ + snapshot.h \ socket.h \ socketvar.h \ sockio.h \ @@ -84,6 +86,7 @@ PRIVATE_DATAFILES = \ stackshot.h \ sys_domain.h \ tree.h \ + ulock.h \ unpcb.h \ ux_exception.h \ work_interval.h \ @@ -93,11 +96,11 @@ PRIVATE_DATAFILES = \ pgo.h \ memory_maintenance.h -# Installs header file for kernel extensions - +# Installs header file for kernel extensions - # $(DSTROOT)/System/Library/Frameworks/Kernel.framework/Headers # $(DSTROOT)/System/Library/Frameworks/Kernel.framework/PrivateHeaders KERNELFILES = \ - appleapiopts.h attr.h \ + appleapiopts.h attr.h \ buf.h cdefs.h conf.h \ dir.h dirent.h disk.h disklabel.h dkstat.h \ errno.h ev.h event.h fcntl.h file.h filio.h \ @@ -119,8 +122,7 @@ KERNELFILES = \ # The last line was added to export needed headers for the MAC calls # whose source is outside of the xnu/bsd tree. 
- -# Installs header file for Apple internal use for kernel extensions - +# Installs header file for Apple internal use for kernel extensions - # $(DSTROOT)/System/Library/Frameworks/Kernel.framework/PrivateHeaders PRIVATE_KERNELFILES = \ codesign.h \ @@ -136,6 +138,7 @@ PRIVATE_KERNELFILES = \ kasl.h \ kern_memorystatus.h \ kpi_private.h \ + ktrace.h \ mach_swapon.h \ msgbuf.h \ eventvar.h \ @@ -152,11 +155,13 @@ PRIVATE_KERNELFILES = \ user.h \ vfs_context.h \ vmmeter.h \ + reason.h \ spawn_internal.h \ priv.h \ pgo.h \ - memory_maintenance.h - + memory_maintenance.h \ + doc_tombstone.h \ + fsevents.h # /usr/include INSTALL_MI_LIST = ${DATAFILES} @@ -167,7 +172,7 @@ INSTALL_MI_DIR = sys EXPORT_MI_LIST = ${KERNELFILES} ${PRIVATE_KERNELFILES} linker_set.h bsdtask_info.h pthread_internal.h filedesc.h pipe.h resourcevar.h semaphore.h \ vnode_internal.h proc_internal.h file_internal.h mount_internal.h \ - uio_internal.h tree.h munge.h + uio_internal.h tree.h munge.h guarded.h ulock.h EXPORT_MI_GEN_LIST = syscall.h sysproto.h kdebugevents.h @@ -192,32 +197,47 @@ MAKEKDEBUGEVENTS = $(SRCROOT)/bsd/kern/makekdebugevents.py $(OBJROOT)/cscope.genhdrs: $(_v)mkdir -p $(OBJROOT)/cscope.genhdrs +$(OBJROOT)/syscall.codes: $(SRCROOT)/bsd/kern/syscalls.master $(MAKESYSCALLS) + @echo "[$(CMD_MC)] $(ColorH)GENERATING$(Color0) $(ColorLF)$@$(Color0) from $(ColorF)$<$(Color0)"; + $(_v)$(MAKESYSCALLS) $< trace > $@ + +$(OBJROOT)/trace.codes: $(SRCROOT)/bsd/kern/trace_codes $(OBJROOT)/syscall.codes + $(_v)sort -g $(SRCROOT)/bsd/kern/trace_codes $(OBJROOT)/syscall.codes >$@ + syscall.h: $(SRCROOT)/bsd/kern/syscalls.master $(MAKESYSCALLS) $(OBJROOT)/cscope.genhdrs - @echo "Generating bsd/sys/$@ from $<"; + @echo "[$(CMD_MC)] $(ColorH)GENERATING$(Color0) $(ColorLF)bsd/sys/$@$(Color0) from $(ColorF)$<$(Color0)"; @echo "$(OBJPATH)/bsd/sys/$@" > $(OBJROOT)/cscope.genhdrs/$@.path $(_v)$(MAKESYSCALLS) $< header > /dev/null sysproto.h: $(SRCROOT)/bsd/kern/syscalls.master $(MAKESYSCALLS) 
$(OBJROOT)/cscope.genhdrs - @echo "Generating bsd/sys/$@ from $<"; + @echo "[$(CMD_MC)] $(ColorH)GENERATING$(Color0) $(ColorLF)bsd/sys/$@$(Color0) from $(ColorF)$<$(Color0)"; @echo "$(OBJPATH)/bsd/sys/$@" > $(OBJROOT)/cscope.genhdrs/$@.path $(_v)$(MAKESYSCALLS) $< proto > /dev/null -kdebugevents.h: $(SRCROOT)/bsd/kern/trace.codes $(MAKEKDEBUGEVENTS) $(OBJROOT)/cscope.genhdrs - @echo "Generating bsd/sys/$@ from $<"; +kdebugevents.h: $(OBJROOT)/trace.codes $(MAKEKDEBUGEVENTS) $(OBJROOT)/cscope.genhdrs + @echo "[$(CMD_MC)] $(ColorH)GENERATING$(Color0) $(ColorLF)bsd/sys/$@$(Color0) from $(ColorF)$<$(Color0)"; @echo "$(OBJPATH)/bsd/sys/$@" > $(OBJROOT)/cscope.genhdrs/$@.path $(_v)$(MAKEKDEBUGEVENTS) $< > "$(OBJPATH)/bsd/sys/$@" MAKE_POSIX_AVAILABILITY = $(SRCROOT)/bsd/sys/make_posix_availability.sh _posix_availability.h: $(MAKE_POSIX_AVAILABILITY) - @echo "Generating bsd/sys/$@" + @echo "[$(CMD_MC)] $(ColorH)GENERATING$(Color0) $(ColorLF)bsd/sys/$@$(Color0)"; $(_v)$(MAKE_POSIX_AVAILABILITY) "$@" MAKE_SYMBOL_ALIASING = $(SRCROOT)/bsd/sys/make_symbol_aliasing.sh _symbol_aliasing.h: $(MAKE_SYMBOL_ALIASING) - @echo "Generating bsd/sys/$@" + @echo "[$(CMD_MC)] $(ColorH)GENERATING$(Color0) $(ColorLF)bsd/sys/$@$(Color0)"; $(_v)$(MAKE_SYMBOL_ALIASING) "$(SDKROOT)" "$@" -include $(MakeInc_rule) -include $(MakeInc_dir) +TRACE_CODES_DEST = \ + $(DSTROOT)/$(INSTALL_SHARE_MISC_DIR)/trace.codes +$(TRACE_CODES_DEST): $(OBJROOT)/trace.codes + $(_v)$(MKDIR) $(DSTROOT)/$(INSTALL_SHARE_MISC_DIR) + @echo INSTALL $(@F) + $(_v)$(INSTALL) $(INSTALL_FLAGS) $(OBJROOT)/trace.codes $@ +do_textfiles_install:: $(TRACE_CODES_DEST) + +include $(MakeInc_rule) +include $(MakeInc_dir) diff --git a/bsd/sys/_types/Makefile b/bsd/sys/_types/Makefile index 1cc149aa8..0cf91f657 100644 --- a/bsd/sys/_types/Makefile +++ b/bsd/sys/_types/Makefile @@ -3,7 +3,6 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export 
MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - include $(MakeInc_cmd) include $(MakeInc_def) @@ -11,7 +10,7 @@ INSTINC_SUBDIRS = EXPINC_SUBDIRS = -# Installs header file for user level - +# Installs header file for user level - # $(DSTROOT)/System/Library/Frameworks/System.framework/Headers # $(DSTROOT)/System/Library/Frameworks/System.framework/PrivateHeaders # $(DSTROOT)/usr/include/ @@ -33,6 +32,7 @@ DATAFILES = \ _fsblkcnt_t.h \ _fsfilcnt_t.h \ _fsid_t.h \ + _fsobj_id_t.h \ _gid_t.h \ _guid_t.h \ _id_t.h \ @@ -73,12 +73,16 @@ DATAFILES = \ _suseconds_t.h \ _time_t.h \ _timespec.h \ - _timeval.h \ - _timeval32.h \ - _timeval64.h \ - _ucontext.h \ - _ucontext64.h \ + _timeval.h \ + _timeval32.h \ + _timeval64.h \ + _ucontext.h \ + _ucontext64.h \ _uid_t.h \ + _u_int16_t.h \ + _u_int32_t.h \ + _u_int64_t.h \ + _u_int8_t.h \ _uintptr_t.h \ _useconds_t.h \ _uuid_t.h \ @@ -86,14 +90,11 @@ DATAFILES = \ _wchar_t.h \ _wint_t.h \ - - -# Installs header file for Apple internal use in user level - +# Installs header file for Apple internal use in user level - # $(DSTROOT)/System/Library/Frameworks/System.framework/PrivateHeaders PRIVATE_DATAFILES = \ - -# Installs header file for kernel extensions - +# Installs header file for kernel extensions - # $(DSTROOT)/System/Library/Frameworks/Kernel.framework/Headers # $(DSTROOT)/System/Library/Frameworks/Kernel.framework/PrivateHeaders KERNELFILES = ${DATAFILES} \ @@ -106,23 +107,20 @@ KERNELFILES = ${DATAFILES} \ _user64_itimerval.h \ _user32_itimerval.h \ - -# Installs header file for Apple internal use for kernel extensions - +# Installs header file for Apple internal use for kernel extensions - # $(DSTROOT)/System/Library/Frameworks/Kernel.framework/PrivateHeaders PRIVATE_KERNELFILES = \ - - # /System/Library/Frameworks/System.framework/Headers and /usr/include INSTALL_MI_LIST = ${DATAFILES} -INSTALL_MI_GEN_LIST = +INSTALL_MI_GEN_LIST = INSTALL_MI_DIR = sys/_types -EXPORT_MI_LIST = ${KERNELFILES} ${PRIVATE_KERNELFILES} - 
-EXPORT_MI_GEN_LIST = +EXPORT_MI_LIST = ${KERNELFILES} ${PRIVATE_KERNELFILES} + +EXPORT_MI_GEN_LIST = EXPORT_MI_DIR = sys/_types @@ -132,12 +130,12 @@ INSTALL_MI_LCL_LIST = ${PRIVATE_DATAFILES} # /System/Library/Frameworks/Kernel.framework/PrivateHeaders INSTALL_KF_MI_LCL_LIST = ${KERNELFILES} ${PRIVATE_KERNELFILES} -INSTALL_KF_MI_LCL_GEN_LIST = +INSTALL_KF_MI_LCL_GEN_LIST = # /System/Library/Frameworks/Kernel.framework/Headers INSTALL_KF_MI_LIST = ${KERNELFILES} -INSTALL_KF_MI_GEN_LIST = +INSTALL_KF_MI_GEN_LIST = include $(MakeInc_rule) include $(MakeInc_dir) diff --git a/bsd/sys/_types/_fd_def.h b/bsd/sys/_types/_fd_def.h index 0a36997c8..158fb8f16 100644 --- a/bsd/sys/_types/_fd_def.h +++ b/bsd/sys/_types/_fd_def.h @@ -52,11 +52,11 @@ __END_DECLS static __inline int __darwin_fd_isset(int _n, const struct fd_set *_p) { - return (_p->fds_bits[(unsigned long)_n/__DARWIN_NFDBITS] & ((__int32_t)(1<<((unsigned long)_n % __DARWIN_NFDBITS)))); + return (_p->fds_bits[(unsigned long)_n/__DARWIN_NFDBITS] & ((__int32_t)(((unsigned long)1)<<((unsigned long)_n % __DARWIN_NFDBITS)))); } -#define __DARWIN_FD_SET(n, p) do { int __fd = (n); ((p)->fds_bits[(unsigned long)__fd/__DARWIN_NFDBITS] |= ((__int32_t)(1<<((unsigned long)__fd % __DARWIN_NFDBITS)))); } while(0) -#define __DARWIN_FD_CLR(n, p) do { int __fd = (n); ((p)->fds_bits[(unsigned long)__fd/__DARWIN_NFDBITS] &= ~((__int32_t)(1<<((unsigned long)__fd % __DARWIN_NFDBITS)))); } while(0) +#define __DARWIN_FD_SET(n, p) do { int __fd = (n); ((p)->fds_bits[(unsigned long)__fd/__DARWIN_NFDBITS] |= ((__int32_t)(((unsigned long)1)<<((unsigned long)__fd % __DARWIN_NFDBITS)))); } while(0) +#define __DARWIN_FD_CLR(n, p) do { int __fd = (n); ((p)->fds_bits[(unsigned long)__fd/__DARWIN_NFDBITS] &= ~((__int32_t)(((unsigned long)1)<<((unsigned long)__fd % __DARWIN_NFDBITS)))); } while(0) #define __DARWIN_FD_ISSET(n, p) __darwin_fd_isset((n), (p)) #if __GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ >= 3 diff --git 
a/bsd/sys/_types/_fsobj_id_t.h b/bsd/sys/_types/_fsobj_id_t.h new file mode 100644 index 000000000..20e1bcff9 --- /dev/null +++ b/bsd/sys/_types/_fsobj_id_t.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2016 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#ifndef _FSOBJ_ID_T +#define _FSOBJ_ID_T + +typedef struct fsobj_id { + u_int32_t fid_objno; + u_int32_t fid_generation; +} fsobj_id_t; + +#endif /* _FSOBJ_ID_T */ diff --git a/bsd/sys/_types/_u_int16_t.h b/bsd/sys/_types/_u_int16_t.h new file mode 100644 index 000000000..a29896811 --- /dev/null +++ b/bsd/sys/_types/_u_int16_t.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2012 Apple Inc. All rights reserved. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#ifndef _U_INT16_T +#define _U_INT16_T +typedef unsigned short u_int16_t; +#endif /* _U_INT16_T */ diff --git a/osfmk/kperf/x86_64/kperf_arch.h b/bsd/sys/_types/_u_int32_t.h similarity index 89% rename from osfmk/kperf/x86_64/kperf_arch.h rename to bsd/sys/_types/_u_int32_t.h index 7d361c768..7ebf744dc 100644 --- a/osfmk/kperf/x86_64/kperf_arch.h +++ b/bsd/sys/_types/_u_int32_t.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011 Apple Inc. All rights reserved. + * Copyright (c) 2012 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -25,6 +25,7 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ - -/* arch-dependent wrapper for kperf */ - +#ifndef _U_INT32_T +#define _U_INT32_T +typedef unsigned int u_int32_t; +#endif /* _U_INT32_T */ diff --git a/bsd/sys/_types/_u_int64_t.h b/bsd/sys/_types/_u_int64_t.h new file mode 100644 index 000000000..ff097cbdc --- /dev/null +++ b/bsd/sys/_types/_u_int64_t.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2012 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#ifndef _U_INT64_T +#define _U_INT64_T +typedef unsigned long long u_int64_t; +#endif /* _U_INT64_T */ diff --git a/libsyscall/mach/dylib_link.c b/bsd/sys/_types/_u_int8_t.h similarity index 89% rename from libsyscall/mach/dylib_link.c rename to bsd/sys/_types/_u_int8_t.h index 5aa27f230..569b529a0 100644 --- a/libsyscall/mach/dylib_link.c +++ b/bsd/sys/_types/_u_int8_t.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010 Apple Inc. All rights reserved. + * Copyright (c) 2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -25,5 +25,7 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ - -/* This empty file is here to force the dylib target to actually link */ +#ifndef _U_INT8_T +#define _U_INT8_T +typedef unsigned char u_int8_t; +#endif /* _U_INT8_T */ diff --git a/bsd/sys/attr.h b/bsd/sys/attr.h index ebfeb6091..d3bf9580a 100644 --- a/bsd/sys/attr.h +++ b/bsd/sys/attr.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -54,6 +54,9 @@ #endif #define FSOPT_ATTR_CMN_EXTENDED 0x00000020 +#ifdef PRIVATE +#define FSOPT_LIST_SNAPSHOT 0x00000040 +#endif /* PRIVATE */ /* we currently aren't anywhere near this amount for a valid * fssearchblock.sizeofsearchparams1 or fssearchblock.sizeofsearchparams2 @@ -72,10 +75,7 @@ typedef u_int32_t fsfile_type_t; typedef u_int32_t fsvolid_t; -typedef struct fsobj_id { - u_int32_t fid_objno; - u_int32_t fid_generation; -} fsobj_id_t; +#include /* file object id type */ typedef u_int32_t attrgroup_t; @@ -220,6 +220,15 @@ typedef struct vol_capabilities_attr { * only legitimate attributes for obtaining object IDs from this volume and the * 32-bit fid_objno fields of the fsobj_id_t returned by ATTR_CMN_OBJID, * ATTR_CMN_OBJPERMID, and ATTR_CMN_PAROBJID are undefined. 
+ * + * VOL_CAP_FMT_DIR_HARDLINKS: When set, the volume supports directory + * hard links. + * + * VOL_CAP_FMT_DOCUMENT_ID: When set, the volume supports document IDs + * (an ID which persists across object ID changes) for document revisions. + * + * VOL_CAP_FMT_WRITE_GENERATION_COUNT: When set, the volume supports write + * generation counts (a count of how many times an object has been modified) */ #define VOL_CAP_FMT_PERSISTENTOBJECTIDS 0x00000001 #define VOL_CAP_FMT_SYMBOLICLINKS 0x00000002 @@ -239,6 +248,9 @@ typedef struct vol_capabilities_attr { #define VOL_CAP_FMT_NO_VOLUME_SIZES 0x00008000 #define VOL_CAP_FMT_DECMPFS_COMPRESSION 0x00010000 #define VOL_CAP_FMT_64BIT_OBJECT_IDS 0x00020000 +#define VOL_CAP_FMT_DIR_HARDLINKS 0x00040000 +#define VOL_CAP_FMT_DOCUMENT_ID 0x00080000 +#define VOL_CAP_FMT_WRITE_GENERATION_COUNT 0x00100000 /* @@ -297,6 +309,15 @@ typedef struct vol_capabilities_attr { * * VOL_CAP_INT_NAMEDSTREAMS: When set, the volume supports * native named streams. + * + * VOL_CAP_INT_CLONE: When set, the volume supports clones. + * + * VOL_CAP_INT_RENAME_SWAP: When set, the volume supports swapping + * file system objects. + * + * VOL_CAP_INT_RENAME_EXCL: When set, the volume supports an + * exclusive rename operation. 
+ * */ #define VOL_CAP_INT_SEARCHFS 0x00000001 #define VOL_CAP_INT_ATTRLIST 0x00000002 @@ -317,6 +338,13 @@ typedef struct vol_capabilities_attr { /* Volume supports kqueue notifications for remote events */ #define VOL_CAP_INT_REMOTE_EVENT 0x00008000 #endif /* PRIVATE */ +#define VOL_CAP_INT_CLONE 0x00010000 +#ifdef PRIVATE +/* Volume supports snapshots */ +#define VOL_CAP_INT_SNAPSHOT 0x00020000 +#endif /* PRIVATE */ +#define VOL_CAP_INT_RENAME_SWAP 0x00040000 +#define VOL_CAP_INT_RENAME_EXCL 0x00080000 typedef struct vol_attributes_attr { attribute_set_t validattr; diff --git a/bsd/sys/bsdtask_info.h b/bsd/sys/bsdtask_info.h index 1f5fb1cc7..c6df6786c 100644 --- a/bsd/sys/bsdtask_info.h +++ b/bsd/sys/bsdtask_info.h @@ -117,8 +117,11 @@ void fill_taskprocinfo(task_t task, struct proc_taskinfo_internal * ptinfo); int fill_taskthreadinfo(task_t task, uint64_t thaddr, int thuniqueid, struct proc_threadinfo_internal * ptinfo, void *, int *); int fill_taskthreadlist(task_t task, void * buffer, int thcount); int get_numthreads(task_t); +boolean_t bsd_hasthreadname(void *uth); void bsd_getthreadname(void *uth, char* buffer); +void bsd_setthreadname(void *uth, const char* buffer); void bsd_threadcdir(void * uth, void *vptr, int *vidp); +extern void bsd_copythreadname(void *dst_uth, void *src_uth); #endif /*_SYS_BSDTASK_INFO_H */ diff --git a/bsd/sys/buf.h b/bsd/sys/buf.h index 3763a223a..8233ac527 100644 --- a/bsd/sys/buf.h +++ b/bsd/sys/buf.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2014 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -106,14 +106,14 @@ __BEGIN_DECLS @abstract Mark a buffer as "aged," i.e. as a good candidate to be discarded and reused after buf_brelse(). @param bp Buffer to mark. */ -void buf_markaged(buf_t); +void buf_markaged(buf_t bp); /*! 
@function buf_markinvalid @abstract Mark a buffer as not having valid data and being ready for immediate reuse after buf_brelse(). @param bp Buffer to mark. */ -void buf_markinvalid(buf_t); +void buf_markinvalid(buf_t bp); /*! @function buf_markdelayed @@ -122,7 +122,7 @@ void buf_markinvalid(buf_t); or pressure necessitating reuse of the buffer will cause it to be written back to disk. @param bp Buffer to mark. */ -void buf_markdelayed(buf_t); +void buf_markdelayed(buf_t bp); void buf_markclean(buf_t); @@ -133,14 +133,14 @@ void buf_markclean(buf_t); buf_markeintr does not itself do a wakeup. @param bp Buffer to mark. */ -void buf_markeintr(buf_t); +void buf_markeintr(buf_t bp); /*! @function buf_markfua @abstract Mark a buffer for write through disk cache, if disk supports it. @param bp Buffer to mark. */ -void buf_markfua(buf_t); +void buf_markfua(buf_t bp); /*! @function buf_fua @@ -148,7 +148,7 @@ void buf_markfua(buf_t); @param bp Buffer to test. @return Nonzero if buffer is marked for write-through, 0 if not. */ -int buf_fua(buf_t); +int buf_fua(buf_t bp); /*! @function buf_valid @@ -156,7 +156,7 @@ int buf_fua(buf_t); @param bp Buffer to test. @return Nonzero if buffer has valid data, 0 if not. */ -int buf_valid(buf_t); +int buf_valid(buf_t bp); /*! @function buf_fromcache @@ -166,7 +166,7 @@ int buf_valid(buf_t); @param bp Buffer to test. @return Nonzero if we got this buffer's data without doing I/O, 0 if not. */ -int buf_fromcache(buf_t); +int buf_fromcache(buf_t bp); /*! @function buf_upl @@ -176,7 +176,7 @@ int buf_fromcache(buf_t); @param bp Buffer whose upl to grab. @return Buffer's upl if it has one, else NULL. */ -void * buf_upl(buf_t); +void * buf_upl(buf_t bp); /*! @function buf_uploffset @@ -185,7 +185,7 @@ void * buf_upl(buf_t); @param bp Buffer whose uploffset to grab. @return Buffer's uploffset--does not check whether that value makes sense for this buffer. */ -uint32_t buf_uploffset(buf_t); +uint32_t buf_uploffset(buf_t bp); /*! 
@function buf_rcred @@ -195,7 +195,7 @@ uint32_t buf_uploffset(buf_t); @param bp Buffer whose credential to grab. @return Credential if it exists, else NULL. */ -kauth_cred_t buf_rcred(buf_t); +kauth_cred_t buf_rcred(buf_t bp); /*! @function buf_wcred @@ -205,7 +205,7 @@ kauth_cred_t buf_rcred(buf_t); @param bp Buffer whose credential to grab. @return Credential if it exists, else NULL. */ -kauth_cred_t buf_wcred(buf_t); +kauth_cred_t buf_wcred(buf_t bp); /*! @function buf_proc @@ -215,7 +215,7 @@ kauth_cred_t buf_wcred(buf_t); @param bp Buffer whose associated process to find. @return Associated process, possibly NULL. */ -proc_t buf_proc(buf_t); +proc_t buf_proc(buf_t bp); /*! @function buf_dirtyoff @@ -224,7 +224,7 @@ proc_t buf_proc(buf_t); @param bp Buffer whose dirty offset to get. @return Dirty offset (0 if not explicitly changed). */ -uint32_t buf_dirtyoff(buf_t); +uint32_t buf_dirtyoff(buf_t bp); /*! @function buf_dirtyend @@ -234,16 +234,15 @@ uint32_t buf_dirtyoff(buf_t); @param bp Buffer whose dirty end to get. @return 0 if buffer is found clean; size of buffer if found dirty. Can be set to any value by callers of buf_setdirtyend(). */ -uint32_t buf_dirtyend(buf_t); +uint32_t buf_dirtyend(buf_t bp); /*! @function buf_setdirtyoff @abstract Set the starting offset of the dirty region associated with a buffer. @discussion This value is zero unless someone set it explicitly. @param bp Buffer whose dirty end to set. - @return void. */ -void buf_setdirtyoff(buf_t, uint32_t); +void buf_setdirtyoff(buf_t bp, uint32_t); /*! @function buf_setdirtyend @@ -251,9 +250,8 @@ void buf_setdirtyoff(buf_t, uint32_t); @discussion If the buffer's data was found incore and dirty, the dirty end is the size of the block; otherwise, unless someone outside of xnu explicitly changes it by calling buf_setdirtyend(), it will be zero. @param bp Buffer whose dirty end to set. - @return void. */ -void buf_setdirtyend(buf_t, uint32_t); +void buf_setdirtyend(buf_t bp, uint32_t); /*! 
@function buf_error @@ -262,15 +260,14 @@ void buf_setdirtyend(buf_t, uint32_t); @param bp Buffer whose error value to retrieve. @return Error value, directly. */ -errno_t buf_error(buf_t); +errno_t buf_error(buf_t bp); /*! @function buf_seterror @abstract Set an error value on a buffer. @param bp Buffer whose error value to set. - @return void. */ -void buf_seterror(buf_t, errno_t); +void buf_seterror(buf_t bp, errno_t); /*! @function buf_setflags @@ -278,9 +275,8 @@ void buf_seterror(buf_t, errno_t); @discussion buffer_flags |= flags @param bp Buffer whose flags to set. @param flags Flags to add to buffer's mask. B_LOCKED/B_NOCACHE/B_ASYNC/B_READ/B_WRITE/B_PAGEIO/B_FUA - @return void. */ -void buf_setflags(buf_t, int32_t); +void buf_setflags(buf_t bp, int32_t flags); /*! @function buf_clearflags @@ -288,9 +284,8 @@ void buf_setflags(buf_t, int32_t); @discussion buffer_flags &= ~flags @param bp Buffer whose flags to clear. @param flags Flags to remove from buffer's mask. B_LOCKED/B_NOCACHE/B_ASYNC/B_READ/B_WRITE/B_PAGEIO/B_FUA - @return void. */ -void buf_clearflags(buf_t, int32_t); +void buf_clearflags(buf_t bp, int32_t flags); /*! @function buf_flags @@ -299,7 +294,7 @@ void buf_clearflags(buf_t, int32_t); @param bp Buffer whose flags to grab. @return flags. */ -int32_t buf_flags(buf_t); +int32_t buf_flags(buf_t bp); /*! @function buf_reset @@ -308,9 +303,8 @@ int32_t buf_flags(buf_t); Used perhaps to prepare an iobuf for reuse. @param bp Buffer whose flags to grab. @param flags Flags to set on buffer: B_READ, B_WRITE, B_ASYNC, B_NOCACHE. - @return void. */ -void buf_reset(buf_t, int32_t); +void buf_reset(buf_t bp, int32_t flags); /*! @function buf_map @@ -327,7 +321,7 @@ void buf_reset(buf_t, int32_t); @param io_addr Destination for mapping address. @return 0 for success, ENOMEM if unable to map the buffer. */ -errno_t buf_map(buf_t, caddr_t *); +errno_t buf_map(buf_t bp, caddr_t *io_addr); /*! 
@function buf_unmap @@ -340,19 +334,17 @@ errno_t buf_map(buf_t, caddr_t *); buf_setupl() was subsequently called; buf_map() created the mapping. In this case, buf_unmap() will unmap the buffer. @param bp Buffer whose mapping to find or create. - @param io_addr Destination for mapping address. @return 0 for success, EINVAL if unable to unmap buffer. */ -errno_t buf_unmap(buf_t); +errno_t buf_unmap(buf_t bp); /*! @function buf_setdrvdata @abstract Set driver-specific data on a buffer. @param bp Buffer whose driver-data to set. @param drvdata Opaque driver data. - @return void. */ -void buf_setdrvdata(buf_t, void *); +void buf_setdrvdata(buf_t bp, void *drvdata); /*! @function buf_setdrvdata @@ -360,16 +352,15 @@ void buf_setdrvdata(buf_t, void *); @param bp Buffer whose driver data to get. @return Opaque driver data. */ -void * buf_drvdata(buf_t); +void * buf_drvdata(buf_t bp); /*! @function buf_setfsprivate @abstract Set filesystem-specific data on a buffer. @param bp Buffer whose filesystem data to set. @param fsprivate Opaque filesystem data. - @return void. */ -void buf_setfsprivate(buf_t, void *); +void buf_setfsprivate(buf_t bp, void *fsprivate); /*! @function buf_fsprivate @@ -377,7 +368,7 @@ void buf_setfsprivate(buf_t, void *); @param bp Buffer whose filesystem data to get. @return Opaque filesystem data. */ -void * buf_fsprivate(buf_t); +void * buf_fsprivate(buf_t bp); /*! @function buf_blkno @@ -389,7 +380,7 @@ void * buf_fsprivate(buf_t); @param bp Buffer whose physical block number to get. @return Block number. */ -daddr64_t buf_blkno(buf_t); +daddr64_t buf_blkno(buf_t bp); /*! @function buf_lblkno @@ -399,7 +390,7 @@ daddr64_t buf_blkno(buf_t); @param bp Buffer whose logical block number to get. @return Block number. */ -daddr64_t buf_lblkno(buf_t); +daddr64_t buf_lblkno(buf_t bp); /*! @function buf_setblkno @@ -407,9 +398,8 @@ daddr64_t buf_lblkno(buf_t); @discussion Physical block number is generally set by the cluster layer or by buf_getblk(). 
@param bp Buffer whose physical block number to set. @param blkno Block number to set. - @return void. */ -void buf_setblkno(buf_t, daddr64_t); +void buf_setblkno(buf_t bp, daddr64_t blkno); /*! @function buf_setlblkno @@ -418,9 +408,8 @@ void buf_setblkno(buf_t, daddr64_t); for example by buf_bread(). @param bp Buffer whose logical block number to set. @param lblkno Block number to set. - @return void. */ -void buf_setlblkno(buf_t, daddr64_t); +void buf_setlblkno(buf_t bp, daddr64_t lblkno); /*! @function buf_count @@ -428,7 +417,7 @@ void buf_setlblkno(buf_t, daddr64_t); @param bp Buffer whose byte count to get. @return Byte count. */ -uint32_t buf_count(buf_t); +uint32_t buf_count(buf_t bp); /*! @function buf_size @@ -437,7 +426,7 @@ uint32_t buf_count(buf_t); @param bp Buffer whose size to get. @return Size. */ -uint32_t buf_size(buf_t); +uint32_t buf_size(buf_t bp); /*! @function buf_resid @@ -446,16 +435,15 @@ uint32_t buf_size(buf_t); @param bp Buffer whose outstanding count to get. @return Count of unwritten/unread bytes. */ -uint32_t buf_resid(buf_t); +uint32_t buf_resid(buf_t bp); /*! @function buf_setcount @abstract Set count of valid bytes in a buffer. This may be less than the space allocated to the buffer. @param bp Buffer whose byte count to set. @param bcount Count to set. - @return void. */ -void buf_setcount(buf_t, uint32_t); +void buf_setcount(buf_t bp, uint32_t bcount); /*! @function buf_setsize @@ -463,9 +451,8 @@ void buf_setcount(buf_t, uint32_t); @discussion May be larger than amount of valid data in buffer. Should be used by code which is manually providing storage for an iobuf, one allocated with buf_alloc(). @param bp Buffer whose size to set. - @return void. */ -void buf_setsize(buf_t, uint32_t); +void buf_setsize(buf_t bp, uint32_t); /*! @function buf_setresid @@ -474,9 +461,8 @@ void buf_setsize(buf_t, uint32_t); completes, often called on an "original" buffer when using a manipulated buffer to perform I/O on behalf of the first. 
@param bp Buffer whose outstanding count to set. - @return Count of unwritten/unread bytes. */ -void buf_setresid(buf_t, uint32_t); +void buf_setresid(buf_t bp, uint32_t resid); /*! @function buf_setdataptr @@ -485,9 +471,8 @@ void buf_setresid(buf_t, uint32_t); useful with iobufs (allocated with buf_alloc()). @param bp Buffer whose data pointer to set. @param data Pointer to data region. - @return void. */ -void buf_setdataptr(buf_t, uintptr_t); +void buf_setdataptr(buf_t bp, uintptr_t data); /*! @function buf_dataptr @@ -496,7 +481,7 @@ void buf_setdataptr(buf_t, uintptr_t); @param bp Buffer whose data pointer to retrieve. @return Data pointer; NULL if unset. */ -uintptr_t buf_dataptr(buf_t); +uintptr_t buf_dataptr(buf_t bp); /*! @function buf_vnode @@ -507,7 +492,7 @@ uintptr_t buf_dataptr(buf_t); @param bp Buffer whose vnode to retrieve. @return Buffer's vnode. */ -vnode_t buf_vnode(buf_t); +vnode_t buf_vnode(buf_t bp); /*! @function buf_setvnode @@ -515,9 +500,8 @@ vnode_t buf_vnode(buf_t); @discussion This call need not be used on traditional buffers; it is for use with iobufs. @param bp Buffer whose vnode to set. @param vp The vnode to attach to the buffer. - @return void. */ -void buf_setvnode(buf_t, vnode_t); +void buf_setvnode(buf_t bp, vnode_t vp); /*! @function buf_device @@ -528,7 +512,7 @@ void buf_setvnode(buf_t, vnode_t); @param bp Buffer whose device ID to retrieve. @return Device id. */ -dev_t buf_device(buf_t); +dev_t buf_device(buf_t bp); /*! @function buf_setdevice @@ -539,7 +523,7 @@ dev_t buf_device(buf_t); @param vp Device to set on the buffer. @return 0 for success, EINVAL if vp is not a device file. */ -errno_t buf_setdevice(buf_t, vnode_t); +errno_t buf_setdevice(buf_t bp, vnode_t vp); /*! @function buf_strategy @@ -554,7 +538,7 @@ errno_t buf_setdevice(buf_t, vnode_t); @param ap vnop_strategy_args structure (most importantly, a buffer). @return 0 for success, or errors from filesystem or device layers. 
*/ -errno_t buf_strategy(vnode_t, void *); +errno_t buf_strategy(vnode_t devvp, void *ap); /* * Flags for buf_invalblkno() @@ -574,13 +558,13 @@ errno_t buf_strategy(vnode_t, void *); obtained with a buf_getblk(). If data has been read into core without using traditional buffer cache routines, buf_invalblkno() will not be able to invalidate it--this includes the use of iobufs. - @param bp Buffer whose block to invalidate. + @param vp vnode whose block to invalidate. @param lblkno Logical block number. @param flags BUF_WAIT: wait for busy buffers to become unbusy and invalidate them then. Otherwise, just return EBUSY for busy blocks. @return 0 for success, EINVAL if vp is not a device file. */ -errno_t buf_invalblkno(vnode_t, daddr64_t, int); +errno_t buf_invalblkno(vnode_t vp, daddr64_t lblkno, int flags); /*! @function buf_callback @@ -589,7 +573,7 @@ errno_t buf_invalblkno(vnode_t, daddr64_t, int); @param bp Buffer whose callback to get. @return 0 for success, or errors from filesystem or device layers. */ -void * buf_callback(buf_t); +void * buf_callback(buf_t bp); /*! @function buf_setcallback @@ -602,7 +586,7 @@ void * buf_callback(buf_t); @param transaction Additional argument to callback function. @return 0; always succeeds. */ -errno_t buf_setcallback(buf_t, void (*)(buf_t, void *), void *); +errno_t buf_setcallback(buf_t bp, void (*callback)(buf_t, void *), void *transaction); /*! @function buf_setupl @@ -615,10 +599,10 @@ errno_t buf_setcallback(buf_t, void (*)(buf_t, void *), void *); buffer. @param bp Buffer whose upl to set. @param upl UPL to set in the buffer. - @parma offset Offset within upl at which relevant data begin. + @param offset Offset within upl at which relevant data begin. @return 0 for success, EINVAL if the buffer was not allocated with buf_alloc(). */ -errno_t buf_setupl(buf_t, upl_t, uint32_t); +errno_t buf_setupl(buf_t bp, upl_t upl, uint32_t offset); /*! 
@function buf_clone @@ -637,7 +621,7 @@ errno_t buf_setupl(buf_t, upl_t, uint32_t); @param arg Argument to pass to iodone() callback. @return NULL if io_offset/io_size combination is invalid for the buffer to be cloned; otherwise, the new buffer. */ -buf_t buf_clone(buf_t, int, int, void (*)(buf_t, void *), void *); +buf_t buf_clone(buf_t bp, int io_offset, int io_size, void (*iodone)(buf_t, void *), void *arg); /*! @@ -674,16 +658,15 @@ int buf_shadow(buf_t bp); the buffer's associated device will be set. If vp is NULL, it can be set later with buf_setvnode(). @return New buffer. */ -buf_t buf_alloc(vnode_t); +buf_t buf_alloc(vnode_t vp); /*! @function buf_free @abstract Free a buffer that was allocated with buf_alloc(). @discussion The storage (UPL, data pointer) associated with an iobuf must be freed manually. @param bp The buffer to free. - @return void. */ -void buf_free(buf_t); +void buf_free(buf_t bp); /* * flags for buf_invalidateblks @@ -699,7 +682,7 @@ void buf_free(buf_t); Again, it will only be able to invalidate data which were populated with traditional buffer cache routines, i.e. by buf_getblk() and callers thereof. Unlike buf_invalblkno(), it can be made to write dirty data to disk rather than casting it aside. - @param bp The buffer whose data to invalidate. + @param vp The vnode whose data to invalidate. @param flags BUF_WRITE_DATA: write dirty data to disk with VNOP_BWRITE() before kicking buffer cache entries out. BUF_SKIP_META: do not invalidate metadata blocks. @param slpflag Flags to pass to "msleep" while waiting to acquire busy buffers. @@ -707,7 +690,7 @@ void buf_free(buf_t); and re-starting the scan. @return 0 for success, error values from msleep(). 
*/ -int buf_invalidateblks(vnode_t, int, int, int); +int buf_invalidateblks(vnode_t vp, int flags, int slpflag, int slptimeo); /* * flags for buf_flushdirtyblks and buf_iterate @@ -732,9 +715,8 @@ int buf_invalidateblks(vnode_t, int, int, int); BUF_SKIP_NONLOCKED: Skip buffers which are not busy when we encounter them. BUF_SKIP_LOCKED: Skip buffers which are busy when we encounter them. @param msg String to pass to msleep(). - @return void. */ -void buf_flushdirtyblks(vnode_t, int, int, const char *); +void buf_flushdirtyblks(vnode_t vp, int wait, int flags, const char *msg); /*! @function buf_iterate @@ -745,16 +727,15 @@ void buf_flushdirtyblks(vnode_t, int, int, const char *); BUF_RETURNED_DONE: buf_iterate() should call buf_brelse() on the buffer and then stop iterating. BUF_CLAIMED: buf_iterate() should continue iterating (and not call buf_brelse()). BUF_CLAIMED_DONE: buf_iterate() should stop iterating (and not call buf_brelse()). - @param flag + @param flags BUF_SKIP_NONLOCKED: Skip buffers which are not busy when we encounter them. BUF_SKIP_LOCKED: Skip buffers which are busy when we encounter them. BUF_SCAN_CLEAN: Call out on clean buffers. BUF_SCAN_DIRTY: Call out on dirty buffers. BUF_NOTIFY_BUSY: If a buffer cannot be acquired, pass a NULL buffer to callout; otherwise, that buffer will be silently skipped. @param arg Argument to pass to callout in addition to buffer. - @return void. */ -void buf_iterate(vnode_t, int (*)(buf_t, void *), int, void *); +void buf_iterate(vnode_t vp, int (*callout)(buf_t, void *), int flags, void *arg); /*! @function buf_clear @@ -764,9 +745,8 @@ void buf_iterate(vnode_t, int (*)(buf_t, void *), int, void *); UPL into memory; should only be called once during the life cycle of an iobuf (one allocated with buf_alloc()). @param bp The buffer to zero out. - @return void. */ -void buf_clear(buf_t); +void buf_clear(buf_t bp); /*! 
@function buf_bawrite @@ -775,13 +755,9 @@ void buf_clear(buf_t); Callers can wait for writes to complete at their discretion using buf_biowait(). When this function is called, data should already have been written to the buffer's data region. @param bp The buffer on which to initiate I/O. - @param throttle If "throttle" is nonzero and more than VNODE_ASYNC_THROTTLE writes are in progress on this file, - buf_bawrite() will block until the write count drops below VNODE_ASYNC_THROTTLE. If "throttle" is zero and the write - count is high, it will fail with EWOULDBLOCK; the caller can decide whether to make a blocking call or pursue - other opportunities. @return EWOULDBLOCK if write count is high and "throttle" is zero; otherwise, errors from VNOP_BWRITE. */ -errno_t buf_bawrite(buf_t); +errno_t buf_bawrite(buf_t bp); /*! @function buf_bdwrite @@ -793,11 +769,9 @@ errno_t buf_bawrite(buf_t); requested otherwise [see return_error] , buf_bdwrite() will unilaterally launch an asynchronous I/O with buf_bawrite() to keep the pile of delayed writes from getting too large. @param bp The buffer to mark for delayed write. - @param return_error If the number of pending delayed writes systemwide is larger than an internal limit, - return EAGAIN rather than doing an asynchronous write. @return EAGAIN for return_error != 0 case, 0 for succeess, errors from buf_bawrite. */ -errno_t buf_bdwrite(buf_t); +errno_t buf_bdwrite(buf_t bp); /*! @function buf_bwrite @@ -808,7 +782,7 @@ errno_t buf_bdwrite(buf_t); @param bp The buffer to write to disk. @return 0 for success; errors from buf_biowait(). */ -errno_t buf_bwrite(buf_t); +errno_t buf_bwrite(buf_t bp); /*! @function buf_biodone @@ -821,9 +795,8 @@ errno_t buf_bwrite(buf_t); considers itself justified in calling buf_brelse() to return it to free lists--no one is waiting for it. Finally, waiters on the bp (e.g. in buf_biowait()) are woken up. @param bp The buffer to mark as done with I/O. - @return void. 
*/ -void buf_biodone(buf_t); +void buf_biodone(buf_t bp); /*! @function buf_biowait @@ -832,7 +805,7 @@ void buf_biodone(buf_t); @param bp The buffer to wait on. @return 0 for a successful wait; nonzero the buffer has been marked as EINTR or had an error set on it. */ -errno_t buf_biowait(buf_t); +errno_t buf_biowait(buf_t bp); /*! @function buf_brelse @@ -846,9 +819,8 @@ errno_t buf_biowait(buf_t); B_LOCKED buffer will not be available for reuse by other files, though its data may be paged out. Note that buf_brelse() is intended for use with traditionally allocated buffers. @param bp The buffer to release. - @retrn void. */ -void buf_brelse(buf_t); +void buf_brelse(buf_t bp); /*! @function buf_bread @@ -865,7 +837,7 @@ void buf_brelse(buf_t); @param bpp Destination pointer for buffer. @return 0 for success, or an error from buf_biowait(). */ -errno_t buf_bread(vnode_t, daddr64_t, int, kauth_cred_t, buf_t *); +errno_t buf_bread(vnode_t vp, daddr64_t blkno, int size, kauth_cred_t cred, buf_t *bpp); /*! @function buf_breadn @@ -885,7 +857,7 @@ errno_t buf_bread(vnode_t, daddr64_t, int, kauth_cred_t, buf_t *); @param bpp Destination pointer for buffer. @return 0 for success, or an error from buf_biowait(). */ -errno_t buf_breadn(vnode_t, daddr64_t, int, daddr64_t *, int *, int, kauth_cred_t, buf_t *); +errno_t buf_breadn(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes, int nrablks, kauth_cred_t cred, buf_t *bpp); /*! @function buf_meta_bread @@ -902,7 +874,7 @@ errno_t buf_breadn(vnode_t, daddr64_t, int, daddr64_t *, int *, int, kauth_cred_ @param bpp Destination pointer for buffer. @return 0 for success, or an error from buf_biowait(). */ -errno_t buf_meta_bread(vnode_t, daddr64_t, int, kauth_cred_t, buf_t *); +errno_t buf_meta_bread(vnode_t vp, daddr64_t blkno, int size, kauth_cred_t cred, buf_t *bpp); /*! 
@function buf_meta_breadn @@ -921,7 +893,7 @@ errno_t buf_meta_bread(vnode_t, daddr64_t, int, kauth_cred_t, buf_t *); @param bpp Destination pointer for buffer. @return 0 for success, or an error from buf_biowait(). */ -errno_t buf_meta_breadn(vnode_t, daddr64_t, int, daddr64_t *, int *, int, kauth_cred_t, buf_t *); +errno_t buf_meta_breadn(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes, int nrablks, kauth_cred_t cred, buf_t *bpp); /*! @function minphys @@ -949,7 +921,7 @@ u_int minphys(buf_t bp); @param blocksize Logical block size for this vnode. @return 0 for success; EFAULT for an invalid uio; errors from buf_biowait(). */ -int physio(void (*)(buf_t), buf_t, dev_t, int , u_int (*)(buf_t), struct uio *, int ); +int physio(void (*f_strategy)(buf_t), buf_t bp, dev_t dev, int flags, u_int (*f_minphys)(buf_t), struct uio *uio, int blocksize); /* @@ -987,7 +959,7 @@ int physio(void (*)(buf_t), buf_t, dev_t, int , u_int (*)(buf_t), struct uio *, that if a given logical block is found in core with a different size than what is requested, the buffer size will not be modified. @return Buffer found in core or newly allocated, either containing valid data or ready for I/O. */ -buf_t buf_getblk(vnode_t, daddr64_t, int, int, int, int); +buf_t buf_getblk(vnode_t vp, daddr64_t blkno, int size, int slpflag, int slptimeo, int operation); /*! @function buf_geteblk @@ -997,7 +969,7 @@ buf_t buf_getblk(vnode_t, daddr64_t, int, int, int, int); @param size Size of buffer. @return Always returns a new buffer. */ -buf_t buf_geteblk(int); +buf_t buf_geteblk(int size); /*! @function buf_clear_redundancy_flags @@ -1005,9 +977,8 @@ buf_t buf_geteblk(int); @discussion buffer_redundancy_flags &= ~flags @param bp Buffer whose flags to clear. @param flags Flags to remove from buffer's mask - @return void. */ -void buf_clear_redundancy_flags(buf_t, uint32_t); +void buf_clear_redundancy_flags(buf_t bp, uint32_t flags); /*! 
@function buf_redundancyflags @@ -1015,7 +986,7 @@ void buf_clear_redundancy_flags(buf_t, uint32_t); @param bp Buffer whose redundancy flags to grab. @return flags. */ -uint32_t buf_redundancy_flags(buf_t); +uint32_t buf_redundancy_flags(buf_t bp); /*! @function buf_setredundancyflags @@ -1023,9 +994,8 @@ uint32_t buf_redundancy_flags(buf_t); @discussion buffer_redundancy_flags |= flags @param bp Buffer whose flags to set. @param flags Flags to add to buffer's redundancy flags - @return void. */ -void buf_set_redundancy_flags(buf_t, uint32_t); +void buf_set_redundancy_flags(buf_t bp, uint32_t flags); /*! @function buf_attr @@ -1033,15 +1003,14 @@ void buf_set_redundancy_flags(buf_t, uint32_t); @param bp Buffer whose attributes to get. @return bufattr_t. */ -bufattr_t buf_attr(buf_t); +bufattr_t buf_attr(buf_t bp); /*! @function buf_markstatic @abstract Mark a buffer as being likely to contain static data. @param bp Buffer to mark. - @return void. */ - void buf_markstatic(buf_t); + void buf_markstatic(buf_t bp); /*! @function buf_static @@ -1049,7 +1018,7 @@ bufattr_t buf_attr(buf_t); @param bp Buffer to test. @return Nonzero if buffer has static data, 0 otherwise. */ -int buf_static(buf_t); +int buf_static(buf_t bp); #ifdef KERNEL_PRIVATE void buf_setfilter(buf_t, void (*)(buf_t, void *), void *, void (**)(buf_t, void *), void **); @@ -1065,32 +1034,29 @@ void bufattr_free(bufattr_t bap); @param bap Buffer Attribute whose cpx_t structure you wish to get. @return Returns a cpx_t structure, or NULL if not valid */ -struct cpx *bufattr_cpx(bufattr_t); +struct cpx *bufattr_cpx(bufattr_t bap); /*! @function bufattr_setcpx @abstract Set the cp_ctx on a buffer attribute. @param bap Buffer Attribute that you wish to change - @return void */ -void bufattr_setcpx(bufattr_t, struct cpx *cpx); +void bufattr_setcpx(bufattr_t bap, struct cpx *cpx); /*! @function bufattr_cpoff @abstract Gets the file offset on the buffer. 
@param bap Buffer Attribute whose file offset value is used - @return void. */ -uint64_t bufattr_cpoff(bufattr_t); +uint64_t bufattr_cpoff(bufattr_t bap); /*! @function bufattr_setcpoff @abstract Set the file offset for a content protected I/O on a buffer attribute. @param bap Buffer Attribute whose cp file offset has to be set - @return void. */ -void bufattr_setcpoff(bufattr_t, uint64_t); +void bufattr_setcpoff(bufattr_t bap, uint64_t); /*! @function bufattr_rawencrypted @@ -1105,7 +1071,6 @@ int bufattr_rawencrypted(bufattr_t bap); @abstract Mark a buffer to use the greedy mode for writing. @param bap Buffer attributes to mark. @discussion Greedy Mode: request improved write performance from the underlying device at the expense of storage efficiency - @return void. */ void bufattr_markgreedymode(bufattr_t bap); @@ -1123,7 +1088,6 @@ int bufattr_greedymode(bufattr_t bap); @abstract Mark a buffer to use the isochronous throughput mode for writing. @param bap Buffer attributes to mark. @discussion isochronous mode: request improved write performance from the underlying device at the expense of storage efficiency - @return void. */ void bufattr_markisochronous(bufattr_t bap); @@ -1174,7 +1138,6 @@ int bufattr_meta(bufattr_t bap); @function bufattr_markmeta @abstract Set the bufattr meta attribute. @param bap Buffer attribute to manipulate. - @return void */ void bufattr_markmeta(bufattr_t bap); @@ -1201,7 +1164,6 @@ vm_offset_t buf_kernel_addrperm_addr(void * addr); @discussion This flag hints the storage driver that some thread is waiting for this I/O to complete. It should therefore attempt to complete it as soon as possible at the cost of device efficiency. @param bap Buffer attributes to mark. - @return void. 
*/ void bufattr_markquickcomplete(bufattr_t bap); @@ -1215,6 +1177,22 @@ void bufattr_markquickcomplete(bufattr_t bap); */ int bufattr_quickcomplete(bufattr_t bap); +int count_lock_queue(void); + +/* + * Flags for buf_acquire + */ +#define BAC_NOWAIT 0x01 /* Don't wait if buffer is busy */ +#define BAC_REMOVE 0x02 /* Remove from free list once buffer is acquired */ +#define BAC_SKIP_NONLOCKED 0x04 /* Don't return LOCKED buffers */ +#define BAC_SKIP_LOCKED 0x08 /* Only return LOCKED buffers */ + +errno_t buf_acquire(buf_t, int, int, int); + +buf_t buf_create_shadow_priv(buf_t bp, boolean_t force_copy, uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg); + +void buf_drop(buf_t); + #endif /* KERNEL_PRIVATE */ __END_DECLS diff --git a/bsd/sys/buf_internal.h b/bsd/sys/buf_internal.h index 6ff3284bc..18d0e1119 100644 --- a/bsd/sys/buf_internal.h +++ b/bsd/sys/buf_internal.h @@ -177,6 +177,7 @@ extern vm_offset_t buf_kernel_addrperm; #define BL_WAITSHADOW 0x00000080 #define BL_IOBUF_ALLOC 0x00000100 #define BL_WANTED_REF 0x00000200 +#define BL_IOBUF_VDEV 0x00000400 /* iobuf was for a diskimage */ /* * Parameters for buffer cache garbage collection @@ -235,14 +236,9 @@ extern vm_offset_t buf_kernel_addrperm; #define B_WASDIRTY 0x02000000 /* page was found dirty in the VM cache */ #define B_HDRALLOC 0x04000000 /* zone allocated buffer header */ #define B_ZALLOC 0x08000000 /* b_datap is zalloc()ed */ -/* - * private flags used by the journal layer - */ -#define B_NORELSE 0x10000000 /* don't brelse() in bwrite() */ /* * private flags used by by the cluster layer */ -#define B_TWANTED 0x20000000 /* but_t that is part of a cluster level transaction is wanted */ #define B_COMMIT_UPL 0x40000000 /* commit/abort the UPL on I/O success/failure */ #define B_TDONE 0x80000000 /* buf_t that is part of a cluster level transaction has completed */ @@ -273,6 +269,8 @@ extern vm_offset_t buf_kernel_addrperm; #define BA_ISOCHRONOUS 0x00001000 /* device specific isochronous 
throughput to media */ +#define BA_STRATEGY_TRACKED_IO 0x00002000 /* tracked by spec_strategy */ + #define GET_BUFATTR_IO_TIER(bap) ((bap->ba_flags & BA_IO_TIER_MASK) >> BA_IO_TIER_SHIFT) #define SET_BUFATTR_IO_TIER(bap, tier) \ @@ -303,31 +301,18 @@ extern struct buf *buf_headers; /* The buffer headers. */ __BEGIN_DECLS -buf_t buf_create_shadow_priv(buf_t bp, boolean_t force_copy, uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg); - buf_t alloc_io_buf(vnode_t, int); void free_io_buf(buf_t); int allocbuf(struct buf *, int); void bufinit(void); -/* - * Flags for buf_acquire - */ -#define BAC_NOWAIT 0x01 /* Don't wait if buffer is busy */ -#define BAC_REMOVE 0x02 /* Remove from free list once buffer is acquired */ -#define BAC_SKIP_NONLOCKED 0x04 /* Don't return LOCKED buffers */ -#define BAC_SKIP_LOCKED 0x08 /* Only return LOCKED buffers */ - void buf_list_lock(void); void buf_list_unlock(void); void cluster_init(void); -void buf_drop(buf_t); -errno_t buf_acquire(buf_t, int, int, int); int count_busy_buffers(void); -int count_lock_queue(void); int buf_flushdirtyblks_skipinfo (vnode_t, int, int, const char *); void buf_wait_for_shadow_io (vnode_t, daddr64_t); @@ -356,6 +341,8 @@ struct bufstats { long bufs_iobufmax; /* Max. number of IO buffers used */ long bufs_iobufinuse; /* number of IO buffers in use */ long bufs_iobufsleeps; /* IO buffer starvation */ + long bufs_iobufinuse_vdev; /* number of IO buffers in use by + diskimages */ }; #endif /* KERNEL */ diff --git a/bsd/sys/cdefs.h b/bsd/sys/cdefs.h index a8d95eb3a..8137cb3d9 100644 --- a/bsd/sys/cdefs.h +++ b/bsd/sys/cdefs.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2015 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -223,6 +223,58 @@ #ifndef __null_unspecified #define __null_unspecified #endif +#ifndef _Nullable +#define _Nullable +#endif +#ifndef _Nonnull +#define _Nonnull +#endif +#ifndef _Null_unspecified +#define _Null_unspecified +#endif +#endif + +/* + * __disable_tail_calls causes the compiler to not perform tail call + * optimization inside the marked function. + */ +#if __has_attribute(disable_tail_calls) +#define __disable_tail_calls __attribute__((__disable_tail_calls__)) +#else +#define __disable_tail_calls +#endif + +/* + * __not_tail_called causes the compiler to prevent tail call optimization + * on statically bound calls to the function. It has no effect on indirect + * calls. Virtual functions, objective-c methods, and functions marked as + * "always_inline" cannot be marked as __not_tail_called. + */ +#if __has_attribute(not_tail_called) +#define __not_tail_called __attribute__((__not_tail_called__)) +#else +#define __not_tail_called +#endif + +/* + * __result_use_check warns callers of a function that not using the function + * return value is a bug, i.e. dismissing malloc() return value results in a + * memory leak. + */ +#if __has_attribute(warn_unused_result) +#define __result_use_check __attribute__((__warn_unused_result__)) +#else +#define __result_use_check +#endif + +/* + * __swift_unavailable causes the compiler to mark a symbol as specifically + * unavailable in Swift, regardless of any other availability in C. 
+ */ +#if __has_feature(attribute_availability_swift) +#define __swift_unavailable(_msg) __attribute__((__availability__(swift, unavailable, message=_msg))) +#else +#define __swift_unavailable(_msg) #endif /* Declaring inline functions within headers is error-prone due to differences @@ -297,6 +349,8 @@ */ #define __printflike(fmtarg, firstvararg) \ __attribute__((__format__ (__printf__, fmtarg, firstvararg))) +#define __printf0like(fmtarg, firstvararg) \ + __attribute__((__format__ (__printf0__, fmtarg, firstvararg))) #define __scanflike(fmtarg, firstvararg) \ __attribute__((__format__ (__scanf__, fmtarg, firstvararg))) diff --git a/bsd/hfs/hfs_unistr.h b/bsd/sys/clonefile.h similarity index 59% rename from bsd/hfs/hfs_unistr.h rename to bsd/sys/clonefile.h index 5b300a28d..17773fd3a 100644 --- a/bsd/hfs/hfs_unistr.h +++ b/bsd/sys/clonefile.h @@ -1,8 +1,8 @@ /* - * Copyright (c) 2013 Apple Inc. All rights reserved. + * Copyright (c) 2015-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,43 +22,33 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
* Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -#ifndef __HFS_UNISTR__ -#define __HFS_UNISTR__ +#ifndef _SYS_CLONEFILE_H_ +#define _SYS_CLONEFILE_H_ -#include +/* Options for clonefile calls */ +#define CLONE_NOFOLLOW 0x0001 /* Don't follow symbolic links */ -/* - * hfs_unitstr.h - * - * This file contains definition of the unicode string used for HFS Plus - * files and folder names, as described by the on-disk format. - * - */ +#ifndef KERNEL + +#include +#include +#include <_types/_uint32_t.h> +#include -#ifdef __cplusplus -extern "C" { -#endif +__BEGIN_DECLS +int clonefileat(int, const char *, int, const char *, uint32_t) __OSX_AVAILABLE(10.12) __IOS_AVAILABLE(10.0) __TVOS_AVAILABLE(10.0) __WATCHOS_AVAILABLE(3.0); -#ifndef _HFSUNISTR255_DEFINED_ -#define _HFSUNISTR255_DEFINED_ -/* Unicode strings are used for HFS Plus file and folder names */ -struct HFSUniStr255 { - u_int16_t length; /* number of unicode characters */ - u_int16_t unicode[255]; /* unicode characters */ -} __attribute__((aligned(2), packed)); -typedef struct HFSUniStr255 HFSUniStr255; -typedef const HFSUniStr255 *ConstHFSUniStr255Param; -#endif /* _HFSUNISTR255_DEFINED_ */ +int fclonefileat(int, int, const char *, uint32_t) __OSX_AVAILABLE(10.12) __IOS_AVAILABLE(10.0) __TVOS_AVAILABLE(10.0) __WATCHOS_AVAILABLE(3.0); +int clonefile(const char *, const char *, uint32_t) __OSX_AVAILABLE(10.12) __IOS_AVAILABLE(10.0) __TVOS_AVAILABLE(10.0) __WATCHOS_AVAILABLE(3.0); -#ifdef __cplusplus -} -#endif +__END_DECLS +#endif /* KERNEL */ -#endif /* __HFS_UNISTR__ */ +#endif /* _SYS_CLONEFILE_H_ */ diff --git a/bsd/sys/codesign.h b/bsd/sys/codesign.h index 5f78699a4..ccd1c3aa7 100644 --- a/bsd/sys/codesign.h +++ b/bsd/sys/codesign.h @@ -41,9 +41,9 @@ #define CS_RESTRICT 0x0000800 /* tell dyld to treat restricted */ #define CS_ENFORCEMENT 0x0001000 /* require enforcement */ #define CS_REQUIRE_LV 
0x0002000 /* require library validation */ -#define CS_ENTITLEMENTS_VALIDATED 0x0004000 +#define CS_ENTITLEMENTS_VALIDATED 0x0004000 /* code signature permits restricted entitlements */ -#define CS_ALLOWED_MACHO 0x00ffffe +#define CS_ALLOWED_MACHO (CS_ADHOC | CS_HARD | CS_KILL | CS_CHECK_EXPIRATION | CS_RESTRICT | CS_ENFORCEMENT | CS_REQUIRE_LV) #define CS_EXEC_SET_HARD 0x0100000 /* set CS_HARD on any exec'ed process */ #define CS_EXEC_SET_KILL 0x0200000 /* set CS_KILL on any exec'ed process */ @@ -55,6 +55,7 @@ #define CS_PLATFORM_BINARY 0x4000000 /* this is a platform binary */ #define CS_PLATFORM_PATH 0x8000000 /* platform binary by the fact of path (osx only) */ #define CS_DEBUGGED 0x10000000 /* process is currently or has previously been debugged and allowed to run with invalid pages */ +#define CS_SIGNED 0x20000000 /* process has a signature (may have gone invalid) */ #define CS_ENTITLEMENT_FLAGS (CS_GET_TASK_ALLOW | CS_INSTALLER) @@ -76,6 +77,7 @@ #define CS_OPS_SET_STATUS 9 /* set codesign flags */ #define CS_OPS_BLOB 10 /* get codesign blob */ #define CS_OPS_IDENTITY 11 /* get codesign identity */ +#define CS_OPS_CLEARINSTALLER 12 /* clear INSTALLER flag */ /* * Magic numbers used by Code Signing @@ -195,6 +197,8 @@ __END_DECLS #else /* !KERNEL */ +#include + #include #include @@ -203,8 +207,10 @@ struct cs_blob; struct fileglob; __BEGIN_DECLS +int cs_valid(struct proc *); int cs_enforcement(struct proc *); int cs_require_lv(struct proc *); +int cs_system_require_lv(void); uint32_t cs_entitlement_flags(struct proc *p); int cs_entitlements_blob_get(struct proc *, void **, size_t *); int cs_restricted(struct proc *); @@ -214,16 +220,23 @@ struct cs_blob * csproc_get_blob(struct proc *); struct cs_blob * csvnode_get_blob(struct vnode *, off_t); void csvnode_print_debug(struct vnode *); +off_t csblob_get_base_offset(struct cs_blob *); +vm_size_t csblob_get_size(struct cs_blob *); +vm_address_t csblob_get_addr(struct cs_blob *); const char * 
csblob_get_teamid(struct cs_blob *); const char * csblob_get_identity(struct cs_blob *); const uint8_t * csblob_get_cdhash(struct cs_blob *); -int csblob_get_platform_binary(struct cs_blob *); -unsigned int csblob_get_flags(struct cs_blob *blob); +int csblob_get_platform_binary(struct cs_blob *); +unsigned int csblob_get_flags(struct cs_blob *blob); + int csblob_get_entitlements(struct cs_blob *, void **, size_t *); + const CS_GenericBlob * csblob_find_blob(struct cs_blob *, uint32_t, uint32_t); const CS_GenericBlob * csblob_find_blob_bytes(const uint8_t *, size_t, uint32_t, uint32_t); +void * csblob_entitlements_dictionary_copy(struct cs_blob *csblob); +void csblob_entitlements_dictionary_set(struct cs_blob *csblob, void * entitlements); /* * Mostly convenience functions below @@ -243,7 +256,7 @@ extern int cs_debug; void cs_init(void); int cs_allow_invalid(struct proc *); -int cs_invalid_page(addr64_t); +int cs_invalid_page(addr64_t vaddr, boolean_t *cs_killed); int csproc_get_platform_path(struct proc *); #if !SECURE_KERNEL diff --git a/bsd/sys/conf.h b/bsd/sys/conf.h index d03a3df43..3b3f0e234 100644 --- a/bsd/sys/conf.h +++ b/bsd/sys/conf.h @@ -198,7 +198,6 @@ struct cdevsw { }; #ifdef BSD_KERNEL_PRIVATE -void devsw_init(void); extern uint64_t cdevsw_flags[]; #define CDEVSW_SELECT_KQUEUE 0x01 @@ -294,6 +293,7 @@ extern struct swdevt swdevt[]; */ __BEGIN_DECLS #ifdef KERNEL_PRIVATE +void devsw_init(void); extern struct cdevsw cdevsw[]; extern int cdevsw_setkqueueok(int, struct cdevsw*, int); #endif /* KERNEL_PRIVATE */ diff --git a/bsd/sys/cprotect.h b/bsd/sys/cprotect.h index 642858921..97cb5a7c1 100644 --- a/bsd/sys/cprotect.h +++ b/bsd/sys/cprotect.h @@ -68,16 +68,43 @@ enum { #define CP_MAX_WRAPPEDKEYSIZE 128 /* The size of the largest allowed key */ /* lock events from AppleKeyStore */ -#define CP_LOCKED_STATE 0 /* Device is locked */ -#define CP_UNLOCKED_STATE 1 /* Device is unlocked */ +enum { + CP_ACTION_LOCKED = 0, + CP_ACTION_UNLOCKED = 1, +}; +/* + * 
Ideally, cp_key_store_action_t would be an enum, but we cannot fix + * that until AppleKeyStore is updated. + */ +typedef int cp_key_store_action_t; -#define CP_MAX_STATE 1 /* uint8_t ; maximum # of states is 255 */ +/* + * It was once the case (and it may still be the case) where the lock + * state got conflated with the possible actions/events that + * AppleKeyStore can send. For that reason, the locked states below + * should numerically match their corresponding actions above. + */ +typedef unsigned char cp_lock_state_t; +enum { + CP_LOCKED_STATE = 0, + CP_UNLOCKED_STATE = 1, +}; + +typedef uint32_t cp_key_class_t; +typedef uint32_t cp_key_os_version_t; +typedef uint16_t cp_key_revision_t; +typedef uint64_t cp_crypto_id_t; typedef struct cprotect *cprotect_t; typedef struct cp_wrap_func *cp_wrap_func_t; typedef struct cpx *cpx_t; -/* Structures passed between HFS and AKS kext */ +typedef struct cp_key { + uint8_t len; + void *key; +} cp_key_t; + +/* Interface to AKS kext */ typedef struct { void *key; unsigned key_len; @@ -91,15 +118,16 @@ typedef cp_raw_key_s* cp_raw_key_t; typedef struct { void *key; unsigned key_len; - uint32_t dp_class; + cp_key_class_t dp_class; } cp_wrapped_key_s; typedef cp_wrapped_key_s* cp_wrapped_key_t; -typedef uint16_t cp_key_revision_t; - typedef struct { - ino64_t inode; + union { + ino64_t inode; + cp_crypto_id_t crypto_id; + }; uint32_t volume; pid_t pid; uid_t uid; @@ -110,12 +138,11 @@ typedef cp_cred_s* cp_cred_t; /* The wrappers are invoked on the AKS kext */ typedef int unwrapper_t(cp_cred_t access, const cp_wrapped_key_t wrapped_key_in, cp_raw_key_t key_out); -typedef int rewrapper_t(cp_cred_t access, uint32_t dp_class, const cp_wrapped_key_t wrapped_key_in, cp_wrapped_key_t wrapped_key_out); -typedef int new_key_t(cp_cred_t access, uint32_t dp_class, cp_raw_key_t key_out, cp_wrapped_key_t wrapped_key_out); +typedef int rewrapper_t(cp_cred_t access, cp_key_class_t dp_class, const cp_wrapped_key_t wrapped_key_in, 
cp_wrapped_key_t wrapped_key_out); +typedef int new_key_t(cp_cred_t access, cp_key_class_t dp_class, cp_raw_key_t key_out, cp_wrapped_key_t wrapped_key_out); typedef int invalidater_t(cp_cred_t access); /* invalidates keys */ typedef int backup_key_t(cp_cred_t access, const cp_wrapped_key_t wrapped_key_in, cp_wrapped_key_t wrapped_key_out); - /* * Flags for Interaction between AKS / Kernel * These are twiddled via the input/output structs in the above @@ -129,16 +156,26 @@ typedef int backup_key_t(cp_cred_t access, const cp_wrapped_key_t wrapped_key_in * without requiring kext changes. */ cpx_t cpx_alloc(size_t key_size); +void cpx_init(cpx_t, size_t key_len); void cpx_free(cpx_t); __attribute__((const)) size_t cpx_size(size_t key_size); __attribute__((pure)) bool cpx_is_sep_wrapped_key(const struct cpx *); void cpx_set_is_sep_wrapped_key(struct cpx *, bool); __attribute__((pure)) bool cpx_use_offset_for_iv(const struct cpx *); void cpx_set_use_offset_for_iv(struct cpx *, bool); +__attribute__((pure)) bool cpx_synthetic_offset_for_iv(const struct cpx *); +void cpx_set_synthetic_offset_for_iv(struct cpx *, bool); __attribute__((pure)) uint16_t cpx_key_len(const struct cpx *); void cpx_set_key_len(struct cpx *, uint16_t key_len); __attribute__((pure)) void *cpx_key(const struct cpx *); aes_encrypt_ctx *cpx_iv_aes_ctx(struct cpx *); +void cpx_flush(cpx_t cpx); +bool cpx_can_copy(const struct cpx *src, const struct cpx *dst); +void cpx_copy(const struct cpx *src, cpx_t dst); +uint16_t cpx_max_key_len(const struct cpx *cpx); +bool cpx_has_key(const struct cpx *cpx); +size_t cpx_sizex(const struct cpx *cpx); +void cpx_set_aes_iv_key(struct cpx *cpx, void *iv_key); /* Structure to store pointers for AKS functions */ struct cp_wrap_func { @@ -149,41 +186,20 @@ struct cp_wrap_func { backup_key_t *backup_key; }; -int cp_key_store_action(int); +int cp_key_store_action(cp_key_store_action_t); int cp_register_wraps(cp_wrap_func_t); - -#ifdef BSD_KERNEL_PRIVATE - -/* - * 
Declarations that are not exported from the kernel but are used by - * VFS to call into the implementation (i.e. HFS) should be here. - */ - -/* Content Protection VNOP Operation flags */ -#define CP_READ_ACCESS 0x1 -#define CP_WRITE_ACCESS 0x2 - -/* - * Functions to check the status of a CP and to query - * the containing filesystem to see if it is supported. - */ -struct vnode; -struct hfsmount; - -int cp_vnode_getclass(struct vnode *, int *); -int cp_vnode_setclass(struct vnode *, uint32_t); -int cp_vnode_transcode(struct vnode * vp, void *key, unsigned *len); - -int cp_handle_vnop(struct vnode *, int, int); -int cp_handle_open(struct vnode *vp, int mode); -int cp_get_root_major_vers (struct vnode *vp, uint32_t *level); -int cp_get_default_level (struct vnode *vp, uint32_t *level); +int cp_rewrap_key(cp_cred_t access, cp_key_class_t dp_class, + const cp_wrapped_key_t wrapped_key_in, + cp_wrapped_key_t wrapped_key_out); +int cp_new_key(cp_cred_t access, cp_key_class_t dp_class, cp_raw_key_t key_out, + cp_wrapped_key_t wrapped_key_out); +int cp_unwrap_key(cp_cred_t access, const cp_wrapped_key_t wrapped_key_in, + cp_raw_key_t key_out); +int cp_get_backup_key(cp_cred_t access, const cp_wrapped_key_t wrapped_key_in, + cp_wrapped_key_t wrapped_key_out); +cp_key_os_version_t cp_os_version(void); +// Should be cp_key_class_t but HFS has a conflicting definition int cp_is_valid_class (int isdir, int32_t protectionclass); -int cp_set_trimmed(struct hfsmount *hfsmp); -int cp_set_rewrapped(struct hfsmount *hfsmp); -int cp_flop_generation (struct hfsmount *hfsmp); - -#endif /* BSD_KERNEL_PRIVATE */ __END_DECLS diff --git a/bsd/sys/csr.h b/bsd/sys/csr.h index cbff7a08b..1349ec948 100644 --- a/bsd/sys/csr.h +++ b/bsd/sys/csr.h @@ -48,6 +48,7 @@ typedef uint32_t csr_op_t; #define CSR_ALLOW_UNRESTRICTED_DTRACE (1 << 5) #define CSR_ALLOW_UNRESTRICTED_NVRAM (1 << 6) #define CSR_ALLOW_DEVICE_CONFIGURATION (1 << 7) +#define CSR_ALLOW_ANY_RECOVERY_OS (1 << 8) #define CSR_VALID_FLAGS 
(CSR_ALLOW_UNTRUSTED_KEXTS | \ CSR_ALLOW_UNRESTRICTED_FS | \ @@ -56,7 +57,8 @@ typedef uint32_t csr_op_t; CSR_ALLOW_APPLE_INTERNAL | \ CSR_ALLOW_UNRESTRICTED_DTRACE | \ CSR_ALLOW_UNRESTRICTED_NVRAM | \ - CSR_ALLOW_DEVICE_CONFIGURATION) + CSR_ALLOW_DEVICE_CONFIGURATION | \ + CSR_ALLOW_ANY_RECOVERY_OS) /* CSR capabilities that a booter can give to the system */ diff --git a/bsd/sys/decmpfs.h b/bsd/sys/decmpfs.h index 8cef87b69..58ad8fed5 100644 --- a/bsd/sys/decmpfs.h +++ b/bsd/sys/decmpfs.h @@ -29,6 +29,8 @@ #define _SYS_DECMPFS_H_ 1 #include +#include +#include #define MAX_DECMPFS_XATTR_SIZE 3802 @@ -77,7 +79,7 @@ typedef struct { #include -typedef struct decmpfs_cnode { +struct decmpfs_cnode { uint8_t cmp_state; uint8_t cmp_minimal_xattr; /* if non-zero, this file's com.apple.decmpfs xattr contained only the minimal decmpfs_disk_header */ uint32_t cmp_type; @@ -86,7 +88,11 @@ typedef struct decmpfs_cnode { uint64_t uncompressed_size __attribute__((aligned(8))); uint64_t decompression_flags; lck_rw_t compressed_data_lock; -} decmpfs_cnode; +}; + +#endif // XNU_KERNEL_PRIVATE + +typedef struct decmpfs_cnode decmpfs_cnode; /* return values from decmpfs_file_is_compressed */ enum { @@ -101,19 +107,22 @@ extern vfs_context_t decmpfs_ctx; /* client filesystem entrypoints */ void decmpfs_init(void); +decmpfs_cnode *decmpfs_cnode_alloc(void); +void decmpfs_cnode_free(decmpfs_cnode *dp); void decmpfs_cnode_init(decmpfs_cnode *cp); void decmpfs_cnode_destroy(decmpfs_cnode *cp); int decmpfs_hides_rsrc(vfs_context_t ctx, decmpfs_cnode *cp); int decmpfs_hides_xattr(vfs_context_t ctx, decmpfs_cnode *cp, const char *xattr); -boolean_t decmpfs_trylock_compressed_data(decmpfs_cnode *cp, int exclusive); +bool decmpfs_trylock_compressed_data(decmpfs_cnode *cp, int exclusive); void decmpfs_lock_compressed_data(decmpfs_cnode *cp, int exclusive); void decmpfs_unlock_compressed_data(decmpfs_cnode *cp, int exclusive); uint32_t decmpfs_cnode_get_vnode_state(decmpfs_cnode *cp); void 
decmpfs_cnode_set_vnode_state(decmpfs_cnode *cp, uint32_t state, int skiplock); uint64_t decmpfs_cnode_get_vnode_cached_size(decmpfs_cnode *cp); +uint32_t decmpfs_cnode_cmp_type(decmpfs_cnode *cp); int decmpfs_file_is_compressed(vnode_t vp, decmpfs_cnode *cp); errno_t decmpfs_validate_compressed_file(vnode_t vp, decmpfs_cnode *cp); @@ -124,8 +133,6 @@ int decmpfs_update_attributes(vnode_t vp, struct vnode_attr *vap); errno_t decmpfs_pagein_compressed(struct vnop_pagein_args *ap, int *is_compressed, decmpfs_cnode *cp); errno_t decmpfs_read_compressed(struct vnop_read_args *ap, int *is_compressed, decmpfs_cnode *cp); -#endif /* XNU_KERNEL_PRIVATE */ - /* types shared between the kernel and kexts */ typedef int (*decmpfs_validate_compressed_file_func)(vnode_t vp, vfs_context_t ctx, decmpfs_header *hdr); typedef void (*decmpfs_adjust_fetch_region_func)(vnode_t vp, vfs_context_t ctx, decmpfs_header *hdr, off_t *offset, user_ssize_t *size); diff --git a/bsd/sys/disk.h b/bsd/sys/disk.h index 6a4718016..c75d1dae1 100644 --- a/bsd/sys/disk.h +++ b/bsd/sys/disk.h @@ -73,6 +73,9 @@ * DKIOCGETFEATURES get device's feature set * DKIOCGETPHYSICALBLOCKSIZE get device's block size * DKIOCGETCOMMANDPOOLSIZE get device's queue depth + * + * DKIOCGETPROVISIONSTATUS get device's block provision status + * DKIOCGETIOMINSATURATIONBYTECOUNT get minimum byte count to saturate storage bandwidth */ #define DK_FEATURE_BARRIER 0x00000002 @@ -149,6 +152,27 @@ typedef struct #define DK_CORESTORAGE_ENABLE_HOTFILES 0x00000002 #define DK_CORESTORAGE_PIN_YOUR_SWAPFILE 0x00000004 +#define DK_PROVISION_TYPE_MAPPED 0x00 +#define DK_PROVISION_TYPE_DEALLOCATED 0x01 +#define DK_PROVISION_TYPE_ANCHORED 0x02 + +typedef struct +{ + uint64_t offset; + uint64_t length; + uint8_t provisionType; + uint8_t reserved[7]; +} dk_provision_extent_t; + +typedef struct +{ + uint64_t offset; /* input: logical byte offset */ + uint64_t length; /* input: byte length, 0 for whole length */ + uint64_t options; /* reserved, 
clear to zero */ + uint32_t reserved; /* not used */ + uint32_t extentsCount; /* input/output: count for extents */ + dk_provision_extent_t * extents; /* output: provision extents */ +} dk_provision_status_t; #ifdef KERNEL #ifdef PRIVATE @@ -193,6 +217,8 @@ typedef struct #define DKIOCGETPHYSICALBLOCKSIZE _IOR('d', 77, uint32_t) #define DKIOCGETCOMMANDPOOLSIZE _IOR('d', 78, uint32_t) +#define DKIOCGETPROVISIONSTATUS _IOWR('d', 79, dk_provision_status_t) + #define DKIOCSYNCHRONIZECACHE _IO('d', 22) #ifdef KERNEL @@ -244,6 +270,7 @@ typedef struct #define DKIOCSETTIER _IOW('d', 85, dk_set_tier_t) #define DKIOCGETENCRYPTIONTYPE _IOR('d', 86, uint32_t) #define DKIOCISLOWPOWERMODE _IOR('d', 87, uint32_t) +#define DKIOCGETIOMINSATURATIONBYTECOUNT _IOR('d', 88, uint32_t) #ifdef XNU_KERNEL_PRIVATE typedef struct diff --git a/bsd/sys/doc_tombstone.h b/bsd/sys/doc_tombstone.h new file mode 100644 index 000000000..8dbe1be32 --- /dev/null +++ b/bsd/sys/doc_tombstone.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef SYS_DOC_TOMBSTONE_H_ +#define SYS_DOC_TOMBSTONE_H_ + +#include +#include + +#ifdef KERNEL_PRIVATE + +/* + * struct representing a document "tombstone" that's recorded + * when a thread manipulates files marked with a document-id. + * if the thread recreates the same item, this tombstone is + * used to preserve the document_id on the new file. + * + * It is a separate structure because of its size - we want to + * allocate it on demand instead of just stuffing it into the + * uthread structure. 
+ */ +struct doc_tombstone { + struct vnode *t_lastop_parent; + struct vnode *t_lastop_item; + uint32_t t_lastop_parent_vid; + uint32_t t_lastop_item_vid; + uint64_t t_lastop_fileid; + uint64_t t_lastop_document_id; + unsigned char t_lastop_filename[NAME_MAX+1]; +}; + +struct componentname; + +struct doc_tombstone *doc_tombstone_get(void); +void doc_tombstone_clear(struct doc_tombstone *ut, struct vnode **old_vpp); +void doc_tombstone_save(struct vnode *dvp, struct vnode *vp, + struct componentname *cnp, uint64_t doc_id, + ino64_t file_id); +bool doc_tombstone_should_ignore_name(const char *nameptr, int len); +bool doc_tombstone_should_save(struct doc_tombstone *ut, struct vnode *vp, + struct componentname *cnp); + +#endif // defined(KERNEL_PRIVATE) + +#endif // SYS_DOC_TOMBSTONE_H_ diff --git a/bsd/sys/dtrace.h b/bsd/sys/dtrace.h index debf2f767..ede1f7ac3 100644 --- a/bsd/sys/dtrace.h +++ b/bsd/sys/dtrace.h @@ -390,14 +390,15 @@ typedef enum dtrace_probespec { #define DIF_SUBR_INET_NTOA6 43 #define DIF_SUBR_TOUPPER 44 #define DIF_SUBR_TOLOWER 45 -#define DIF_SUBR_VM_KERNEL_ADDRPERM 46 -#if !defined(__APPLE__) - #define DIF_SUBR_MAX 46 /* max subroutine value */ -#else -#define DIF_SUBR_COREPROFILE 47 -#define DIF_SUBR_MAX 47 /* max subroutine value */ +/* Apple-specific subroutines */ +#if defined(__APPLE__) +#define DIF_SUBR_APPLE_MIN 200 /* min apple-specific subroutine value */ +#define DIF_SUBR_VM_KERNEL_ADDRPERM 200 +#define DIF_SUBR_KDEBUG_TRACE 201 +#define DIF_SUBR_KDEBUG_TRACE_STRING 202 +#define DIF_SUBR_APPLE_MAX 202 /* max apple-specific subroutine value */ #endif /* __APPLE__ */ typedef uint32_t dif_instr_t; @@ -1159,7 +1160,8 @@ typedef struct dtrace_fmtdesc { #define DTRACEOPT_MAX 31 /* number of options */ #else #define DTRACEOPT_STACKSYMBOLS 31 /* clear to prevent stack symbolication */ -#define DTRACEOPT_MAX 32 /* number of options */ +#define DTRACEOPT_BUFLIMIT 32 /* buffer signaling limit in % of the size */ +#define DTRACEOPT_MAX 33 /* number 
of options */ #endif /* __APPLE__ */ #define DTRACEOPT_UNSET (dtrace_optval_t)-2 /* unset option */ @@ -1424,6 +1426,8 @@ typedef struct dtrace_providerdesc { #define DTRACEIOC_MODUUIDSLIST (DTRACEIOC | 30) /* APPLE ONLY, query for modules with missing symbols */ #define DTRACEIOC_PROVMODSYMS (DTRACEIOC | 31) /* APPLE ONLY, provide missing symbols for a given module */ #define DTRACEIOC_PROCWAITFOR (DTRACEIOC | 32) /* APPLE ONLY, wait for process exec */ +#define DTRACEIOC_SLEEP (DTRACEIOC | 33) /* APPLE ONLY, sleep */ +#define DTRACEIOC_SIGNAL (DTRACEIOC | 34) /* APPLE ONLY, signal sleeping process */ /* * The following structs are used to provide symbol information to the kernel from userspace. @@ -1458,6 +1462,15 @@ typedef struct dtrace_procdesc { pid_t p_pid; } dtrace_procdesc_t; +/** + * DTrace wake reasons. + * This is used in userspace to determine what's the reason why it woke up, + * to start aggregating / switching buffer right away if it is because a buffer + * got over its limit + */ +#define DTRACE_WAKE_TIMEOUT 0 /* dtrace client woke up because of a timeout */ +#define DTRACE_WAKE_BUF_LIMIT 1 /* dtrace client woke up because of a over limit buffer */ + #endif /* __APPLE__ */ /* @@ -2439,6 +2452,7 @@ typedef struct dtrace_mops { void (*dtms_create_probe)(void *, void *, dtrace_helper_probedesc_t *); void *(*dtms_provide_pid)(void *, dtrace_helper_provdesc_t *, pid_t); void (*dtms_remove_pid)(void *, dtrace_helper_provdesc_t *, pid_t); + char* (*dtms_provider_name)(void *); } dtrace_mops_t; typedef uintptr_t dtrace_meta_provider_id_t; diff --git a/bsd/sys/dtrace_glue.h b/bsd/sys/dtrace_glue.h index 494cbfcd2..c5a1ebf6d 100644 --- a/bsd/sys/dtrace_glue.h +++ b/bsd/sys/dtrace_glue.h @@ -366,11 +366,6 @@ typedef uint_t minor_t; typedef struct __dev_info *dev_info_t; extern void ddi_report_dev(dev_info_t *); -extern int ddi_soft_state_init(void **, size_t, size_t); -extern void *ddi_get_soft_state(void *, int); -extern int ddi_soft_state_free(void *, int); 
-extern int ddi_soft_state_zalloc(void *, int); -extern void ddi_soft_state_fini(void **); int ddi_getprop(dev_t dev, dev_info_t *dip, int flags, const char *name, int defvalue); @@ -510,9 +505,9 @@ extern void vmem_free(vmem_t *vmp, void *vaddr, size_t size); * Atomic */ -static inline void atomic_add_32( uint32_t *theAddress, int32_t theAmount ) +static inline uint32_t atomic_add_32( uint32_t *theAddress, int32_t theAmount ) { - (void)OSAddAtomic( theAmount, theAddress ); + return OSAddAtomic( theAmount, theAddress ); } #if defined(__i386__) || defined(__x86_64__) @@ -522,6 +517,17 @@ static inline void atomic_add_64( uint64_t *theAddress, int64_t theAmount ) } #endif +static inline uint32_t atomic_and_32(uint32_t *addr, uint32_t mask) +{ + return OSBitAndAtomic(mask, addr); +} + +static inline uint32_t atomic_or_32(uint32_t *addr, uint32_t mask) +{ + return OSBitOrAtomic(mask, addr); +} + + /* * Miscellaneous */ @@ -537,14 +543,14 @@ extern volatile int panicwait; /* kern/debug.c */ #define IS_P2ALIGNED(v, a) ((((uintptr_t)(v)) & ((uintptr_t)(a) - 1)) == 0) -extern void delay( int ); /* kern/clock.h */ - extern int vuprintf(const char *, va_list); extern hrtime_t dtrace_abs_to_nano(uint64_t); __private_extern__ const char * strstr(const char *, const char *); +#define DTRACE_NCLIENTS 32 + #undef proc_t /* diff --git a/bsd/sys/dtrace_impl.h b/bsd/sys/dtrace_impl.h index e74def4ec..9229998a3 100644 --- a/bsd/sys/dtrace_impl.h +++ b/bsd/sys/dtrace_impl.h @@ -24,6 +24,7 @@ * Use is subject to license terms. * * Portions Copyright (c) 2012 by Delphix. All rights reserved. + * Portions Copyright (c) 2016 by Joyent, Inc. 
*/ #ifndef _SYS_DTRACE_IMPL_H @@ -421,6 +422,8 @@ typedef struct dtrace_aggregation { typedef struct dtrace_buffer { uint64_t dtb_offset; /* current offset in buffer */ + uint64_t dtb_cur_limit; /* current limit before signaling/dropping */ + uint64_t dtb_limit; /* limit before signaling */ uint64_t dtb_size; /* size of buffer */ uint32_t dtb_flags; /* flags */ uint32_t dtb_drops; /* number of drops */ @@ -436,6 +439,7 @@ typedef struct dtrace_buffer { #endif uint64_t dtb_switched; /* time of last switch */ uint64_t dtb_interval; /* observed switch interval */ + uint64_t dtb_pad2[4]; /* pad to avoid false sharing */ } dtrace_buffer_t; /* @@ -927,6 +931,7 @@ typedef struct dtrace_mstate { int dtms_ipl; /* cached interrupt pri lev */ int dtms_fltoffs; /* faulting DIFO offset */ uintptr_t dtms_strtok; /* saved strtok() pointer */ + uintptr_t dtms_strtok_limit; /* upper bound of strtok ptr */ uint32_t dtms_access; /* memory access rights */ dtrace_difo_t *dtms_difo; /* current dif object */ } dtrace_mstate_t; @@ -954,6 +959,7 @@ typedef struct dtrace_mstate { * directed acyclic graph. 
The activity transition diagram is as follows: * * + * * +----------+ +--------+ +--------+ * | INACTIVE |------------------>| WARMUP |------------------>| ACTIVE | * +----------+ dtrace_go(), +--------+ dtrace_go(), +--------+ @@ -1125,6 +1131,7 @@ typedef struct dtrace_helpers { #define DTRACE_HELPTRACE_DONE (-2) #define DTRACE_HELPTRACE_ERR (-3) + typedef struct dtrace_helptrace { dtrace_helper_action_t *dtht_helper; /* helper action */ int dtht_where; /* where in helper action */ @@ -1219,6 +1226,7 @@ struct dtrace_state { dtrace_cred_t dts_cred; /* credentials */ size_t dts_nretained; /* number of retained enabs */ uint64_t dts_arg_error_illval; + uint32_t dts_buf_over_limit; /* number of bufs over dtb_limit */ }; struct dtrace_provider { @@ -1302,6 +1310,18 @@ typedef struct dtrace_errhash { #endif /* DTRACE_ERRDEBUG */ +/** + * DTrace Matching pre-conditions + * + * Used when matching new probes to discard matching of enablings that + * doesn't match the condition tested by dmc_func + */ +typedef struct dtrace_match_cond { + int (*dmc_func)(dtrace_probedesc_t*, void*); + void *dmc_data; +} dtrace_match_cond_t; + + /* * DTrace Toxic Ranges * @@ -1354,13 +1374,22 @@ extern void dtrace_isa_init(void); extern void dtrace_copy(uintptr_t, uintptr_t, size_t); extern void dtrace_copystr(uintptr_t, uintptr_t, size_t, volatile uint16_t *); +/* + * DTrace state handling + */ +extern minor_t dtrace_state_reserve(void); +extern dtrace_state_t* dtrace_state_allocate(minor_t minor); +extern dtrace_state_t* dtrace_state_get(minor_t minor); +extern void dtrace_state_free(minor_t minor); + /* * DTrace restriction checks */ extern void dtrace_restriction_policy_load(void); extern boolean_t dtrace_is_restricted(void); -extern boolean_t dtrace_is_running_apple_internal(void); +extern boolean_t dtrace_are_restrictions_relaxed(void); extern boolean_t dtrace_fbt_probes_restricted(void); +extern boolean_t dtrace_sdt_probes_restricted(void); extern boolean_t 
dtrace_can_attach_to_proc(proc_t); /* diff --git a/bsd/sys/dtrace_ptss.h b/bsd/sys/dtrace_ptss.h index 26381cfa9..e7f1825dd 100644 --- a/bsd/sys/dtrace_ptss.h +++ b/bsd/sys/dtrace_ptss.h @@ -72,7 +72,6 @@ extern "C" { */ - #define DTRACE_PTSS_SCRATCH_SPACE_PER_THREAD (64) #define DTRACE_PTSS_ENTRIES_PER_PAGE (PAGE_SIZE / DTRACE_PTSS_SCRATCH_SPACE_PER_THREAD) diff --git a/bsd/sys/errno.h b/bsd/sys/errno.h index 522fec1f8..246deb772 100644 --- a/bsd/sys/errno.h +++ b/bsd/sys/errno.h @@ -270,11 +270,12 @@ __END_DECLS #define ERESTART (-1) /* restart syscall */ #define EJUSTRETURN (-2) /* don't modify regs, just return */ -#ifdef BSD_KERNEL_PRIVATE +#ifdef KERNEL_PRIVATE #define ERECYCLE (-5) /* restart lookup under heavy vnode pressure/recycling */ +#endif +#ifdef BSD_KERNEL_PRIVATE #define EREDRIVEOPEN (-6) #define EKEEPLOOKING (-7) -#define ERESERVEDNAME (-8) /* path is known but not usable */ /* used for cvwait error returns to Libc */ #define ECVCERORR 256 #define ECVPERORR 512 diff --git a/bsd/sys/event.h b/bsd/sys/event.h index 00635c1b4..8e43692bd 100644 --- a/bsd/sys/event.h +++ b/bsd/sys/event.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2015 Apple Inc. All rights reserved. + * Copyright (c) 2003-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -77,8 +77,9 @@ #define EVFILT_SOCK (-13) /* Socket events */ #define EVFILT_MEMORYSTATUS (-14) /* Memorystatus events */ #endif /* PRIVATE */ +#define EVFILT_EXCEPT (-15) /* Exception events */ -#define EVFILT_SYSCOUNT 14 +#define EVFILT_SYSCOUNT 15 #define EVFILT_THREADMARKER EVFILT_SYSCOUNT /* Internal use only */ #pragma pack(4) @@ -109,17 +110,19 @@ struct user32_kevent { uint16_t flags; /* general flags */ uint32_t fflags; /* filter-specific flags */ int32_t data; /* filter-specific data */ - user32_addr_t udata; /* opaque user data identifier */ + user32_addr_t udata; /* opaque user data identifier */ }; struct kevent_internal_s { - uint64_t ident; /* identifier for this event */ - int16_t filter; /* filter for event */ - uint16_t flags; /* general flags */ - uint32_t fflags; /* filter-specific flags */ - int64_t data; /* filter-specific data */ - uint64_t udata; /* opaque user data identifier */ - uint64_t ext[2]; /* filter-specific extensions */ + uint64_t ident; /* identifier for this event */ + int16_t filter; /* filter for event */ + uint16_t flags; /* general flags */ + int32_t qos; /* quality of service */ + uint32_t fflags; /* filter-specific flags */ +// uint32_t xflags; /* extra filter-specific flags */ + int64_t data; /* filter-specific data */ + uint64_t udata; /* opaque user data identifier */ + uint64_t ext[4]; /* filter-specific extensions */ }; #endif @@ -174,61 +177,92 @@ struct kevent_qos_s { /* kevent system call flags */ -#define KEVENT_FLAG_NONE 0x00 /* no flag value */ -#define KEVENT_FLAG_IMMEDIATE 0x01 /* immediate timeout */ -#define KEVENT_FLAG_ERROR_EVENTS 0x02 /* output events only include change errors */ +#define KEVENT_FLAG_NONE 0x000 /* no flag value */ +#define KEVENT_FLAG_IMMEDIATE 0x001 /* immediate timeout */ +#define KEVENT_FLAG_ERROR_EVENTS 0x002 /* output events only include change errors */ #ifdef PRIVATE -#define EV_SET_QOS 0 /* * Rather than provide an EV_SET_QOS macro 
for kevent_qos_t structure * initialization, we encourage use of named field initialization support * instead. */ -#define KEVENT_FLAG_STACK_EVENTS 0x04 /* output events treated as stack (grows down) */ -#define KEVENT_FLAG_STACK_DATA 0x08 /* output data allocated as stack (grows down) */ -#define KEVENT_FLAG_WORKQ 0x20 /* interact with the default workq kq */ +#define KEVENT_FLAG_STACK_EVENTS 0x004 /* output events treated as stack (grows down) */ +#define KEVENT_FLAG_STACK_DATA 0x008 /* output data allocated as stack (grows down) */ +#define KEVENT_FLAG_WORKQ 0x020 /* interact with the default workq kq */ +#define KEVENT_FLAG_WORKQ_MANAGER 0x200 /* current thread is the workq manager */ #ifdef XNU_KERNEL_PRIVATE -#define KEVENT_FLAG_LEGACY32 0x40 /* event data in legacy 32-bit format */ -#define KEVENT_FLAG_LEGACY64 0x80 /* event data in legacy 64-bit format */ - +#define KEVENT_FLAG_LEGACY32 0x040 /* event data in legacy 32-bit format */ +#define KEVENT_FLAG_LEGACY64 0x080 /* event data in legacy 64-bit format */ +#define KEVENT_FLAG_KERNEL 0x100 /* caller is in-kernel */ #define KEVENT_FLAG_USER (KEVENT_FLAG_IMMEDIATE | KEVENT_FLAG_ERROR_EVENTS | \ - KEVENT_FLAG_STACK_EVENTS | KEVENT_FLAG_STACK_DATA | \ - KEVENT_FLAG_WORKQ) + KEVENT_FLAG_STACK_EVENTS | KEVENT_FLAG_STACK_DATA | \ + KEVENT_FLAG_WORKQ) + +/* + * Since some filter ops are not part of the standard sysfilt_ops, we + * use kn_filtid starting from EVFILT_SYSCOUNT to identify these cases. + * This is to let kn_fops() get the correct fops for all cases. 
+*/ +#define EVFILTID_KQREAD (EVFILT_SYSCOUNT) +#define EVFILTID_PIPE_R (EVFILT_SYSCOUNT + 1) +#define EVFILTID_PIPE_W (EVFILT_SYSCOUNT + 2) +#define EVFILTID_PTSD (EVFILT_SYSCOUNT + 3) +#define EVFILTID_SOREAD (EVFILT_SYSCOUNT + 4) +#define EVFILTID_SOWRITE (EVFILT_SYSCOUNT + 5) +#define EVFILTID_SCK (EVFILT_SYSCOUNT + 6) +#define EVFILTID_SOEXCEPT (EVFILT_SYSCOUNT + 7) +#define EVFILTID_SPEC (EVFILT_SYSCOUNT + 8) +#define EVFILTID_BPFREAD (EVFILT_SYSCOUNT + 9) +#define EVFILTID_NECP_FD (EVFILT_SYSCOUNT + 10) +#define EVFILTID_FSEVENT (EVFILT_SYSCOUNT + 13) +#define EVFILTID_VN (EVFILT_SYSCOUNT + 14) + +#define EVFILTID_MAX (EVFILT_SYSCOUNT + 15) + #endif /* XNU_KERNEL_PRIVATE */ + +#define EV_SET_QOS 0 + #endif /* PRIVATE */ /* actions */ -#define EV_ADD 0x0001 /* add event to kq (implies enable) */ -#define EV_DELETE 0x0002 /* delete event from kq */ -#define EV_ENABLE 0x0004 /* enable event */ -#define EV_DISABLE 0x0008 /* disable event (not reported) */ +#define EV_ADD 0x0001 /* add event to kq (implies enable) */ +#define EV_DELETE 0x0002 /* delete event from kq */ +#define EV_ENABLE 0x0004 /* enable event */ +#define EV_DISABLE 0x0008 /* disable event (not reported) */ /* flags */ -#define EV_ONESHOT 0x0010 /* only report one occurrence */ -#define EV_CLEAR 0x0020 /* clear event state after reporting */ -#define EV_RECEIPT 0x0040 /* force EV_ERROR on success, data == 0 */ -#define EV_DISPATCH 0x0080 /* disable event after reporting */ +#define EV_ONESHOT 0x0010 /* only report one occurrence */ +#define EV_CLEAR 0x0020 /* clear event state after reporting */ +#define EV_RECEIPT 0x0040 /* force immediate event output */ + /* ... with or without EV_ERROR */ + /* ... use KEVENT_FLAG_ERROR_EVENTS */ + /* on syscalls supporting flags */ + +#define EV_DISPATCH 0x0080 /* disable event after reporting */ +#define EV_UDATA_SPECIFIC 0x0100 /* unique kevent per udata value */ -#define EV_UDATA_SPECIFIC 0x0100 /* unique kevent per udata value */ - /* ... 
in combination with EV_DELETE */ - /* will defer delete until udata-specific */ - /* event enabled. EINPROGRESS will be */ - /* returned to indicate the deferral */ +#define EV_DISPATCH2 (EV_DISPATCH | EV_UDATA_SPECIFIC) + /* ... in combination with EV_DELETE */ + /* will defer delete until udata-specific */ + /* event enabled. EINPROGRESS will be */ + /* returned to indicate the deferral */ -#define EV_DISPATCH2 (EV_DISPATCH | EV_UDATA_SPECIFIC) +#define EV_VANISHED 0x0200 /* report that source has vanished */ + /* ... only valid with EV_DISPATCH2 */ -#define EV_SYSFLAGS 0xF000 /* reserved by system */ -#define EV_FLAG0 0x1000 /* filter-specific flag */ -#define EV_FLAG1 0x2000 /* filter-specific flag */ +#define EV_SYSFLAGS 0xF000 /* reserved by system */ +#define EV_FLAG0 0x1000 /* filter-specific flag */ +#define EV_FLAG1 0x2000 /* filter-specific flag */ /* returned values */ -#define EV_EOF 0x8000 /* EOF detected */ -#define EV_ERROR 0x4000 /* error, data contains errno */ +#define EV_EOF 0x8000 /* EOF detected */ +#define EV_ERROR 0x4000 /* error, data contains errno */ /* * Filter specific flags for EVFILT_READ @@ -289,6 +323,9 @@ struct kevent_qos_s { */ #define NOTE_LOWAT 0x00000001 /* low water mark */ +/* data/hint flags for EVFILT_EXCEPT, shared with userspace */ +#define NOTE_OOB 0x00000002 /* OOB data */ + /* * data/hint fflags for EVFILT_VNODE, shared with userspace */ @@ -300,6 +337,7 @@ struct kevent_qos_s { #define NOTE_RENAME 0x00000020 /* vnode was renamed */ #define NOTE_REVOKE 0x00000040 /* vnode access was revoked */ #define NOTE_NONE 0x00000080 /* No specific vnode event: to test for EVFILT_READ activation*/ +#define NOTE_FUNLOCK 0x00000100 /* vnode was unlocked by flock(2) */ /* * data/hint fflags for EVFILT_PROC, shared with userspace @@ -375,6 +413,8 @@ enum { #define NOTE_MEMORYSTATUS_PRESSURE_WARN 0x00000002 /* system memory pressure has changed to the warning state */ #define NOTE_MEMORYSTATUS_PRESSURE_CRITICAL 0x00000004 /* system 
memory pressure has changed to the critical state */ #define NOTE_MEMORYSTATUS_LOW_SWAP 0x00000008 /* system is in a low-swap state */ +#define NOTE_MEMORYSTATUS_PROC_LIMIT_WARN 0x00000010 /* process memory limit has hit a warning state */ +#define NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL 0x00000020 /* process memory limit has hit a critical state - soft limit */ typedef enum vm_pressure_level { kVMPressureNormal = 0, @@ -400,6 +440,7 @@ typedef enum vm_pressure_level { #define NOTE_LEEWAY 0x00000010 /* ext[1] holds leeway for power aware timers */ #define NOTE_CRITICAL 0x00000020 /* system does minimal timer coalescing */ #define NOTE_BACKGROUND 0x00000040 /* system does maximum timer coalescing */ +#define NOTE_MACH_CONTINUOUS_TIME 0x00000080 /* use continuous time base */ #ifdef PRIVATE /* * data/hint fflags for EVFILT_SOCK, shared with userspace. @@ -419,12 +460,13 @@ typedef enum vm_pressure_level { #define NOTE_CONNECTED 0x00000800 /* socket is connected */ #define NOTE_DISCONNECTED 0x00001000 /* socket is disconnected */ #define NOTE_CONNINFO_UPDATED 0x00002000 /* connection info was updated */ +#define NOTE_NOTIFY_ACK 0x00004000 /* notify acknowledgement */ #define EVFILT_SOCK_LEVEL_TRIGGER_MASK \ (NOTE_READCLOSED | NOTE_WRITECLOSED | NOTE_SUSPEND | NOTE_RESUME | NOTE_CONNECTED | NOTE_DISCONNECTED) #define EVFILT_SOCK_ALL_MASK \ - (NOTE_CONNRESET | NOTE_READCLOSED | NOTE_WRITECLOSED | NOTE_TIMEOUT | NOTE_NOSRCADDR | NOTE_IFDENIED | NOTE_SUSPEND | NOTE_RESUME | NOTE_KEEPALIVE | NOTE_ADAPTIVE_WTIMO | NOTE_ADAPTIVE_RTIMO | NOTE_CONNECTED | NOTE_DISCONNECTED | NOTE_CONNINFO_UPDATED) + (NOTE_CONNRESET | NOTE_READCLOSED | NOTE_WRITECLOSED | NOTE_TIMEOUT | NOTE_NOSRCADDR | NOTE_IFDENIED | NOTE_SUSPEND | NOTE_RESUME | NOTE_KEEPALIVE | NOTE_ADAPTIVE_WTIMO | NOTE_ADAPTIVE_RTIMO | NOTE_CONNECTED | NOTE_DISCONNECTED | NOTE_CONNINFO_UPDATED | NOTE_NOTIFY_ACK) #endif /* PRIVATE */ @@ -481,7 +523,7 @@ SLIST_HEAD(klist, knote); #ifdef KERNEL -#ifdef KERNEL_PRIVATE +#ifdef 
XNU_KERNEL_PRIVATE #include #include @@ -491,38 +533,59 @@ MALLOC_DECLARE(M_KQUEUE); TAILQ_HEAD(kqtailq, knote); /* a list of "queued" events */ -struct knote { - int kn_inuse; /* inuse count */ - int kn_hookid; - TAILQ_ENTRY(knote) kn_tqe; /* linkage for tail queue */ - struct kqtailq *kn_tq; /* pointer to tail queue */ - struct kqueue *kn_kq; /* which kqueue we are on */ - SLIST_ENTRY(knote) kn_link; /* linkage for search list */ - SLIST_ENTRY(knote) kn_selnext; /* klist element chain */ +/* Bit size for packed field within knote */ +#define KNOTE_KQ_BITSIZE 40 + + +/* index into various kq queues */ +typedef uint8_t kq_index_t; +typedef uint16_t kn_status_t; + +#define KN_ACTIVE 0x0001 /* event has been triggered */ +#define KN_QUEUED 0x0002 /* event is on queue */ +#define KN_DISABLED 0x0004 /* event is disabled */ +#define KN_DROPPING 0x0008 /* knote is being dropped */ +#define KN_USEWAIT 0x0010 /* wait for knote use */ +#define KN_ATTACHING 0x0020 /* event is pending attach */ +#define KN_STAYACTIVE 0x0040 /* force event to stay active */ +#define KN_DEFERDELETE 0x0080 /* defer delete until re-enabled */ +#define KN_ATTACHED 0x0100 /* currently attached to source */ +#define KN_DISPATCH 0x0200 /* disables as part of deliver */ +#define KN_UDATA_SPECIFIC 0x0400 /* udata is part of matching */ +#define KN_SUPPRESSED 0x0800 /* event is suppressed during delivery */ +#define KN_STOLENDROP 0x1000 /* someone stole the drop privilege */ +#define KN_REQVANISH 0x2000 /* requested EV_VANISH */ +#define KN_VANISHED 0x4000 /* has vanished */ + +#define KN_DISPATCH2 (KN_DISPATCH | KN_UDATA_SPECIFIC) + /* combination defines deferred-delete mode enabled */ + +struct __attribute__((__packed__)) knote { + uint16_t kn_inuse; /* inuse count */ + kn_status_t kn_status; /* status bits */ + int kn_hookid; + TAILQ_ENTRY(knote) kn_tqe; /* linkage for tail queue */ + SLIST_ENTRY(knote) kn_link; /* linkage for search list */ + SLIST_ENTRY(knote) kn_selnext; /* klist element chain */ 
union { - struct fileproc *p_fp; /* file data pointer */ - struct proc *p_proc; /* proc pointer */ - struct ipc_pset *p_pset; /* pset pointer */ + struct fileproc *p_fp; /* file data pointer */ + struct proc *p_proc; /* proc pointer */ + struct ipc_mqueue *p_mqueue; /* pset pointer */ } kn_ptr; - struct filterops *kn_fop; - int kn_status; /* status bits */ - int kn_sfflags; /* saved filter flags */ + uint64_t kn_req_index:4, /* requested qos index */ + kn_qos_index:4, /* in-use qos index */ + kn_qos_override:4, /* qos override index */ + kn_reserved:4, /* reserved bits */ + kn_filtid:8, /* filter id to index filter ops */ + kn_kq_packed:KNOTE_KQ_BITSIZE; /* packed pointer for kq */ + + int kn_sfflags; /* saved filter flags */ union { - void *kn_hook; - uint64_t kn_hook_data; + void *kn_hook; + uint64_t kn_hook_data; }; - int64_t kn_sdata; /* saved data field */ - struct kevent_internal_s kn_kevent; - -#define KN_ACTIVE 0x01 /* event has been triggered */ -#define KN_QUEUED 0x02 /* event is on queue */ -#define KN_DISABLED 0x04 /* event is disabled */ -#define KN_DROPPING 0x08 /* knote is being dropped */ -#define KN_USEWAIT 0x10 /* wait for knote use */ -#define KN_ATTACHING 0x20 /* event is pending attach */ -#define KN_STAYQUEUED 0x40 /* force event to stay on queue */ -#define KN_DEFERDROP 0x80 /* defer drop until re-enabled */ -#define KN_TOUCH 0x100 /* Always call f_touch callback */ + int64_t kn_sdata; /* saved data field */ + struct kevent_internal_s kn_kevent; #define kn_id kn_kevent.ident #define kn_filter kn_kevent.filter @@ -536,18 +599,135 @@ struct knote { #define kn_fp kn_ptr.p_fp }; -/* Hint values for f_touch filter operation */ -#define EVENT_REGISTER 1 -#define EVENT_PROCESS 2 +static inline struct kqueue *knote_get_kq(struct knote *kn) +{ + if (!(kn->kn_kq_packed)) + return 0; + else + return (struct kqueue *)((uintptr_t)(kn->kn_kq_packed) + (uintptr_t)VM_MIN_KERNEL_AND_KEXT_ADDRESS); +} + +static inline void knote_set_kq(struct knote *kn, void 
*kq) +{ + if (!kq) + kn->kn_kq_packed = 0; + else { + uint64_t offset = ((uintptr_t)kq - (uintptr_t)VM_MIN_KERNEL_AND_KEXT_ADDRESS); + kn->kn_kq_packed = offset; + } +} + +struct filt_process_s { + int fp_fd; + unsigned int fp_flags; + user_addr_t fp_data_out; + user_size_t fp_data_size; + user_size_t fp_data_resid; +}; +typedef struct filt_process_s *filt_process_data_t; + +/* + * Filter operators + * + * These routines, provided by each filter, are called to attach, detach, deliver events, + * change/update filter registration and process/deliver events. They are called + * with a use-count referenced knote, with the kq unlocked. Here are more details: + * + * f_isfd - + * identifies if the "ident" field in the kevent structure is a file-descriptor. + * + * If so, the knote is associated with the file descriptor prior to attach and + * auto-removed when the file descriptor is closed (this latter behavior may change + * for EV_DISPATCH2 kevent types to allow delivery of events identifying unintended + * closes). + * + * Otherwise the knote is hashed by the ident and has no auto-close behavior. + * + * f_attach - + * called to attach the knote to the underlying object that will be delivering events + * through it when EV_ADD is supplied and no existing matching event is found + * + * provided a knote that is pre-attached to the fd or hashed (see above) but is + * specially marked to avoid concurrent access until the attach is complete. The + * kevent structure embedded in this knote has been filled in with a sanitized + * version of the user-supplied kevent data. However, the user-supplied filter-specific + * flags (fflags) and data fields have been moved into the knote's kn_sfflags and kn_sdata + * fields respectively. These are usually interpreted as a set of "interest" flags and + * data by each filter - to be matched against delivered events.
+ * + * The attach operator indicates errors by setting the EV_ERROR flag in the flags field + * embedded in the knote's kevent structure - with the specific error indicated in the + * corresponding data field. + * + * The return value indicates if the knote should already be considered "activated" at + * the time of attach (one or more of the interest events has already occurred). + * + * f_detach - + * called to disassociate the knote from the underlying object delivering events + * the filter should not attempt to deliver events through this knote after this + * operation returns control to the kq system. + * + * f_event - + * if the knote() function (or KNOTE() macro) is called against a list of knotes, + * this operator will be called on each knote in the list. + * + * The "hint" parameter is completely filter-specific, but usually indicates an + * event or set of events that have occurred against the source object associated + * with the list. + * + * The return value indicates if the knote should already be considered "activated" at + * the time of attach (one or more of the interest events has already occurred). + * + * f_process - + * called when attempting to deliver triggered events to user-space. + * + * If the knote was previously activated, this operator will be called when a + * thread is trying to deliver events to user-space. The filter gets one last + * chance to determine if the event/events are still interesting for this knote + * (are the conditions still right to deliver an event). If so, the filter + * fills in the output kevent structure with the information to be delivered. + * + * The input context/data parameter is used during event delivery. Some + * filters allow additional data delivery as part of event delivery. This + * context field indicates if space was made available for these additional + * items and how that space is to be allocated/carved-out.
+ * + * The filter may set EV_CLEAR or EV_ONESHOT in the output flags field to indicate + * special post-delivery dispositions for the knote. + * + * EV_CLEAR - indicates that all matching events have been delivered. Even + * though there were events to deliver now, there will not be any + * more until some additional events are delivered to the knote + * via the f_event operator, or the interest set is changed via + * the f_touch operator. The knote can remain deactivated after + * processing this event delivery. + * + * EV_ONESHOT - indicates that this is the last event to be delivered via + * this knote. It will automatically be deleted upon delivery + * (or if in dispatch-mode, upon re-enablement after this delivery). + * + * The return value indicates if the knote has delivered an output event. + * Unless one of the special output flags was set in the output kevent, a non- + * zero return value ALSO indicates that the knote should be re-activated + * for future event processing (in case it delivers level-based or a multi-edge + * type events like message queues that already exist). + * + * NOTE: In the future, the boolean may change to an enum that allows more + * explicit indication of just delivering a current event vs delivering + * an event with more events still pending. + * + * f_peek - + * For knotes marked KN_STAYACTIVE, indicate if the knote is truly active at + * the moment (not used for event delivery, but for status checks). 
+ */ struct filterops { int f_isfd; /* true if ident == filedescriptor */ int (*f_attach)(struct knote *kn); void (*f_detach)(struct knote *kn); int (*f_event)(struct knote *kn, long hint); - /* Optional f_touch operation, called only if !f_isfd && non-NULL */ - void (*f_touch)(struct knote *kn, struct kevent_internal_s *kev, long type); - /* Optional f_peek operation, called only if KN_STAYQUEUED is set */ + int (*f_touch)(struct knote *kn, struct kevent_internal_s *kev); + int (*f_process)(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev); unsigned (*f_peek)(struct knote *kn); }; @@ -566,18 +746,41 @@ extern void klist_init(struct klist *list); extern void knote(struct klist *list, long hint); extern int knote_attach(struct klist *list, struct knote *kn); extern int knote_detach(struct klist *list, struct knote *kn); +extern void knote_vanish(struct klist *list); extern int knote_link_waitq(struct knote *kn, struct waitq *wq, uint64_t *reserved_link); extern int knote_unlink_waitq(struct knote *kn, struct waitq *wq); -extern void knote_fdclose(struct proc *p, int fd); -extern void knote_markstayqueued(struct knote *kn); -extern void knote_clearstayqueued(struct knote *kn); +extern void knote_fdclose(struct proc *p, int fd, int force); +extern void knote_markstayactive(struct knote *kn); +extern void knote_clearstayactive(struct knote *kn); +extern void knote_adjust_qos(struct knote *kn, int qos, int override); +extern struct filterops *knote_fops(struct knote *kn); +#elif defined(KERNEL_PRIVATE) /* !XNU_KERNEL_PRIVATE: kexts still need a klist structure definition */ + +#include +struct proc; +struct knote; +SLIST_HEAD(klist, knote); + +#endif /* !XNU_KERNEL_PRIVATE && KERNEL_PRIVATE */ + +#ifdef KERNEL_PRIVATE +#ifdef PRIVATE + +/* make these private functions available to the pthread kext */ extern int kevent_qos_internal(struct proc *p, int fd, user_addr_t changelist, int nchanges, user_addr_t eventlist, int nevents, user_addr_t 
data_out, user_size_t *data_available, unsigned int flags, int32_t *retval); -#endif /* !KERNEL_PRIVATE */ + +extern int kevent_qos_internal_bind(struct proc *p, + int fd, thread_t thread, unsigned int flags); +extern int kevent_qos_internal_unbind(struct proc *p, + int fd, thread_t thread, unsigned int flags); + +#endif /* PRIVATE */ +#endif /* KERNEL_PRIVATE */ #else /* KERNEL */ diff --git a/bsd/sys/eventvar.h b/bsd/sys/eventvar.h index 6ce00103a..8d47aad64 100644 --- a/bsd/sys/eventvar.h +++ b/bsd/sys/eventvar.h @@ -61,38 +61,172 @@ #include #include -#define KQ_NEVENTS 16 /* minimize copy{in,out} calls */ +#if defined(XNU_KERNEL_PRIVATE) + +#include +#include + #define KQEXTENT 256 /* linear growth by this amount */ +/* + * kqueue - common core definition of a kqueue + * + * No real structures are allocated of this type. They are + * either kqfile objects or kqworkq objects - each of which is + * derived from this definition. + */ struct kqueue { - struct waitq_set *kq_wqs; /* private waitq set */ - decl_lck_spin_data( ,kq_lock) /* kqueue lock */ - int kq_state; - int kq_count; /* number of queued events */ - uint32_t kq_nprocess; /* atomic counter for kqueue_process */ - struct kqtailq kq_head; /* list of queued events */ - struct selinfo kq_sel; /* parent select/kqueue info */ - struct proc *kq_p; /* process containing kqueue */ - int kq_level; /* nesting level */ - -#define KQ_SEL 0x01 -#define KQ_SLEEP 0x02 -#define KQ_PROCWAIT 0x04 -#define KQ_KEV32 0x08 -#define KQ_KEV64 0x10 -#define KQ_KEV_QOS 0x20 -#define KQ_WORKQ 0x40 + struct waitq_set kq_wqs; /* private waitq set */ + lck_spin_t kq_lock; /* kqueue lock */ + uint16_t kq_state; /* state of the kq */ + uint16_t kq_level; /* nesting level of the kq */ + uint32_t kq_count; /* number of queued events */ + struct proc *kq_p; /* process containing kqueue */ + struct kqtailq kq_queue[1]; /* variable array of kqtailq structs */ +}; + +#define KQ_SEL 0x001 /* select was recorded for kq */ +#define KQ_SLEEP 
0x002 /* thread is waiting for events */ +#define KQ_PROCWAIT 0x004 /* thread waiting for processing */ +#define KQ_KEV32 0x008 /* kq is used with 32-bit events */ +#define KQ_KEV64 0x010 /* kq is used with 64-bit events */ +#define KQ_KEV_QOS 0x020 /* kq events carry QoS info */ +#define KQ_WORKQ 0x040 /* KQ is bound to process workq */ +#define KQ_PROCESSING 0x080 /* KQ is being processed */ +#define KQ_DRAIN 0x100 /* kq is draining */ +#define KQ_WAKEUP 0x200 /* kq awakened while processing */ + +/* + * kqfile - definition of a typical kqueue opened as a file descriptor + * via the kqueue() system call. + * + * Adds selinfo support to the base kqueue definition, as these + * fds can be fed into select(). + */ +struct kqfile { + struct kqueue kqf_kqueue; /* common kqueue core */ + struct kqtailq kqf_suppressed; /* suppression queue */ + struct selinfo kqf_sel; /* parent select/kqueue info */ +}; + +#define kqf_wqs kqf_kqueue.kq_wqs +#define kqf_lock kqf_kqueue.kq_lock +#define kqf_state kqf_kqueue.kq_state +#define kqf_level kqf_kqueue.kq_level +#define kqf_count kqf_kqueue.kq_count +#define kqf_p kqf_kqueue.kq_p +#define kqf_queue kqf_kqueue.kq_queue + +#define QOS_INDEX_KQFILE 0 /* number of qos levels in a file kq */ + +/* + * WorkQ kqueues need to request threads to service the triggered + * knotes in the queue. These threads are brought up on an + * effective-requested-QoS basis. Knotes are segregated based on + * that value - calculated by computing max(event-QoS, kevent-QoS). + * Only one servicing thread is requested at a time for all the + * knotes at a given effective-requested-QoS.
+ */ + +#if !defined(KQWQ_QOS_MANAGER) +#define KQWQ_QOS_MANAGER (THREAD_QOS_LAST) +#endif + +#if !defined(KQWQ_NQOS) +#define KQWQ_NQOS (KQWQ_QOS_MANAGER + 1) +#endif + + +/* + * kqrequest - per-QoS thread request status + */ +struct kqrequest { + struct kqtailq kqr_suppressed; /* Per-QoS suppression queues */ + thread_t kqr_thread; /* thread to satisfy request */ + uint8_t kqr_state; /* KQ/workq interaction state */ + uint8_t kqr_override_delta; /* current override delta */ }; -extern struct kqueue *kqueue_alloc(struct proc *); +/* + * Workq thread start out a particular effective-requested-QoS, but + * additional events processed by the filters may represent + * backlogged events that may themselves have a higher requested-QoS. + * To represent this, the filter may apply an override to a knote's + * requested QoS. + * + * We further segregate these overridden knotes into different buckets + * by grouping. This allows easy matching of + * knotes to process vs. the highest workq thread override applied. + * + * Only certain override patterns need to be supported. A knote + * cannot have an effective-requested-QoS of UNSPECIFIED - because + * the kevent->qos (when canonicalized) will always be above that + * or indicate manager. And we don't allow an override to specify + * manager. This results in the following buckets being needed: + * + * Effective-Requested QoS + * MAINT BG UTIL DEFAULT UINIT UINTER MANAGER + * override: + * MAINT 0 + * BG 1 6 + * UTILITY 2 7 11 + * DEFAULT 3 8 12 15 + * UINIT 4 9 13 16 18 + * UINTER 5 10 14 17 19 20 + * 21 + */ +#if !defined(KQWQ_NBUCKETS) +#define KQWQ_NBUCKETS 22 +#endif + +/* + * kqworkq - definition of a private kqueue used to coordinate event + * handling for pthread work queues. + * + * These have per-qos processing queues and state to coordinate with + * the pthread kext to ask for threads at corresponding pthread priority + * values. 
+ */ +struct kqworkq { + struct kqueue kqwq_kqueue; + struct kqtailq kqwq_queuecont[KQWQ_NBUCKETS-1]; /* continue array of queues */ + struct kqrequest kqwq_request[KQWQ_NQOS]; /* per-QoS request states */ + lck_spin_t kqwq_reqlock; /* kqueue request lock */ +}; + +#define kqwq_wqs kqwq_kqueue.kq_wqs +#define kqwq_lock kqwq_kqueue.kq_lock +#define kqwq_state kqwq_kqueue.kq_state +#define kqwq_level kqwq_kqueue.kq_level +#define kqwq_count kqwq_kqueue.kq_count +#define kqwq_p kqwq_kqueue.kq_p +#define kqwq_queue kqwq_kqueue.kq_queue + +#define kqwq_req_lock(kqwq) (lck_spin_lock(&kqwq->kqwq_reqlock)) +#define kqwq_req_unlock(kqwq) (lck_spin_unlock(&kqwq->kqwq_reqlock)) +#define kqwq_req_held(kqwq) (lck_spin_held(&kqwq->kqwq_reqlock)) + +#define KQWQ_PROCESSING 0x01 /* running the kq in workq mode */ +#define KQWQ_THREQUESTED 0x02 /* thread requested from workq */ +#define KQWQ_THMANAGER 0x04 /* expect manager thread to run the queue */ +#define KQWQ_HOOKCALLED 0x10 /* hook called during processing */ +#define KQWQ_WAKEUP 0x20 /* wakeup called during processing */ + +extern struct kqueue *kqueue_alloc(struct proc *, unsigned int); extern void kqueue_dealloc(struct kqueue *); typedef int (*kevent_callback_t)(struct kqueue *, struct kevent_internal_s *, void *); typedef void (*kqueue_continue_t)(struct kqueue *, void *, int); -extern int kevent_register(struct kqueue *, struct kevent_internal_s *, struct proc *); +extern void kevent_register(struct kqueue *, struct kevent_internal_s *, struct proc *); extern int kqueue_scan(struct kqueue *, kevent_callback_t, kqueue_continue_t, - void *, struct timeval *, struct proc *); + void *, struct filt_process_s *, struct timeval *, struct proc *); extern int kqueue_stat(struct kqueue *, void *, int, proc_t); +#endif /* XNU_KERNEL_PRIVATE */ + #endif /* !_SYS_EVENTVAR_H_ */ + + + + diff --git a/bsd/sys/fcntl.h b/bsd/sys/fcntl.h index 08ad4e546..2de689085 100644 --- a/bsd/sys/fcntl.h +++ b/bsd/sys/fcntl.h @@ -362,6 +362,7 @@ 
#endif #define F_ADDFILESIGS_RETURN 97 /* Add signature from same file, return end offset in structure on sucess */ +#define F_CHECK_LV 98 /* Check if Library Validation allows this Mach-O file to be mapped into the calling process */ // FS-specific fcntl()'s numbers begin at 0x00010000 and go up @@ -527,7 +528,7 @@ typedef struct fsignatures { size_t fs_blob_size; } fsignatures_t; #ifdef KERNEL -/* LP64 version of fsignatures. all pointers +/* LP64 version of fsignatures. all pointers * grow when we're dealing with a 64-bit process. * WARNING - keep in sync with fsignatures */ @@ -547,6 +548,43 @@ typedef struct user_fsignatures { } user_fsignatures_t; #endif /* KERNEL */ +/* + * DYLD needs to check if the object is allowed to be combined + * into the main binary. This is done between the code signature + * is loaded and dyld is doing all the work to process the LOAD commands. + * + * While this could be done in F_ADDFILESIGS.* family the hook into + * the MAC module doesn't say no when LV isn't enabled and then that + * is cached on the vnode, and the MAC module never gets change once + * a process that library validation enabled. + */ +typedef struct fchecklv { + off_t lv_file_start; + size_t lv_error_message_size; + void *lv_error_message; +} fchecklv_t; + +#ifdef KERNEL +/* LP64 version of fchecklv. all pointers + * grow when we're dealing with a 64-bit process. 
+ * WARNING - keep in sync with fchecklv + */ + +typedef struct user32_fchecklv { + user32_off_t lv_file_start; + user32_size_t lv_error_message_size; + user32_addr_t lv_error_message; +} user32_fchecklv_t; + +typedef struct user_fchecklv { + off_t lv_file_start; + user_size_t lv_error_message_size; + user_addr_t lv_error_message; +} user_fchecklv_t; + +#endif /* KERNEL */ + + /* lock operations for flock(2) */ #define LOCK_SH 0x01 /* shared file lock */ #define LOCK_EX 0x02 /* exclusive file lock */ @@ -572,7 +610,7 @@ typedef struct fbootstraptransfer { } fbootstraptransfer_t; #ifdef KERNEL -/* LP64 version of fbootstraptransfer. all pointers +/* LP64 version of fbootstraptransfer. all pointers * grow when we're dealing with a 64-bit process. * WARNING - keep in sync with fbootstraptransfer */ diff --git a/bsd/sys/file.h b/bsd/sys/file.h index dcf08f448..e79451706 100644 --- a/bsd/sys/file.h +++ b/bsd/sys/file.h @@ -97,6 +97,9 @@ int file_drop(int); #ifdef KERNEL_PRIVATE int fd_rdwr(int fd, enum uio_rw, uint64_t base, int64_t len, enum uio_seg, off_t offset, int io_flg, int64_t *aresid); +struct fileproc; +struct vnode; +int fp_getfvp(struct proc *p, int fd, struct fileproc **resultfp, struct vnode **resultvp); #endif /* KERNEL_PRIVATE */ __END_DECLS #endif /* !_SYS_FILE_H_ */ diff --git a/bsd/sys/file_internal.h b/bsd/sys/file_internal.h index 172aa3d04..60a47afa5 100644 --- a/bsd/sys/file_internal.h +++ b/bsd/sys/file_internal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2014 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -75,6 +75,7 @@ #include #include #include +#include struct proc; struct uio; @@ -144,7 +145,8 @@ typedef enum { DTYPE_KQUEUE, /* kqueue */ DTYPE_PIPE, /* pipe */ DTYPE_FSEVENTS, /* fsevents */ - DTYPE_ATALK /* (obsolete) */ + DTYPE_ATALK, /* (obsolete) */ + DTYPE_NETPOLICY, /* networking policy */ } file_type_t; /* defines for fg_lflags */ @@ -185,7 +187,7 @@ struct fileglob { int (*fo_drain) (struct fileproc *fp, vfs_context_t ctx); } *fg_ops; off_t fg_offset; - void *fg_data; /* vnode or socket or SHM or semaphore */ + void *fg_data; /* vnode or socket or SHM or semaphore */ void *fg_vn_data; /* Per fd vnode data, used for directories */ lck_mtx_t fg_lock; #if CONFIG_MACF @@ -229,7 +231,6 @@ int fp_getfpipe(struct proc *p, int fd, struct fileproc **resultfp, struct pipe struct atalk; int fp_getfatalk(struct proc *p, int fd, struct fileproc **resultfp, struct atalk **resultatalk); struct vnode; -int fp_getfvp(struct proc *p, int fd, struct fileproc **resultfp, struct vnode **resultvp); int fp_getfvpandvid(struct proc *p, int fd, struct fileproc **resultfp, struct vnode **resultvp, uint32_t * vidp); struct socket; int fp_getfsock(struct proc *p, int fd, struct fileproc **resultfp, struct socket **results); @@ -261,6 +262,8 @@ extern void fileproc_free(struct fileproc *fp); extern void guarded_fileproc_free(struct fileproc *fp); extern void fg_vn_data_free(void *fgvndata); extern int nameiat(struct nameidata *ndp, int dirfd); +extern int falloc_guarded(struct proc *p, struct fileproc **fp, int *fd, + vfs_context_t ctx, const guardid_t *guard, u_int attrs); __END_DECLS #endif /* __APPLE_API_UNSTABLE */ diff --git a/bsd/sys/filio.h b/bsd/sys/filio.h index de132c60d..80e03305b 100644 --- a/bsd/sys/filio.h +++ b/bsd/sys/filio.h @@ -81,4 +81,9 @@ #define FIOGETOWN _IOR('f', 123, int) /* get owner */ #define FIODTYPE _IOR('f', 122, int) /* get d_type */ +#ifdef KERNEL_PRIVATE +#define FIODEVICELOCKED _IO('f', 121) /* 
device locked/unlocked */ +#define FIOPINSWAP _IO('f', 120) /* pin swap file to fast device */ +#endif + #endif /* !_SYS_FILIO_H_ */ diff --git a/bsd/sys/fsctl.h b/bsd/sys/fsctl.h index 623d16f55..8f15b24ae 100644 --- a/bsd/sys/fsctl.h +++ b/bsd/sys/fsctl.h @@ -158,8 +158,6 @@ typedef struct namespace_handler_data { } namespace_handler_data; -#define NSPACE_REARM_NO_ARG ((void *)1) -extern int resolve_nspace_item(struct vnode *vp, uint64_t op); extern int resolve_nspace_item_ext(struct vnode *vp, uint64_t op, void *arg); extern int get_nspace_item_status(struct vnode *vp, int32_t *status); @@ -191,6 +189,14 @@ typedef struct namespace_handler_data { #endif /* XNU_KERNEL_PRIVATE */ +#ifdef KERNEL_PRIVATE + +#define NSPACE_REARM_NO_ARG ((void *)1) +int resolve_nspace_item(struct vnode *vp, uint64_t op); +int nspace_snapshot_event(vnode_t vp, time_t ctime, uint64_t op_type, void *arg); + +#endif // defined(KERNEL_PRIVATE) + #define NAMESPACE_HANDLER_READ_OP 0x0001 #define NAMESPACE_HANDLER_WRITE_OP 0x0002 #define NAMESPACE_HANDLER_DELETE_OP 0x0004 diff --git a/bsd/sys/fsevents.h b/bsd/sys/fsevents.h index 16fb22425..13ec75cfd 100644 --- a/bsd/sys/fsevents.h +++ b/bsd/sys/fsevents.h @@ -124,11 +124,9 @@ typedef struct fsevent_dev_filter_args { #define FSEVENTS_GET_CURRENT_ID _IOR('s', 103, uint64_t) -#ifdef KERNEL +#ifdef BSD_KERNEL_PRIVATE void fsevents_init(void); -int need_fsevent(int type, vnode_t vp); -int add_fsevent(int type, vfs_context_t, ...); void fsevent_unmount(struct mount *mp); struct vnode_attr; void create_fsevent_from_kevent(vnode_t vp, uint32_t kevents, struct vnode_attr *vap); @@ -149,6 +147,13 @@ int vnode_get_fse_info_from_vap(vnode_t vp, fse_info *fse, struct vnode_attr * char *get_pathbuff(void); void release_pathbuff(char *path); -#endif /* KERNEL */ +#endif /* BSD_KERNEL_PRIVATE */ + +#ifdef KERNEL_PRIVATE + +int need_fsevent(int type, vnode_t vp); +int add_fsevent(int type, vfs_context_t, ...); + +#endif /* KERNEL_PRIVATE */ #endif /* 
FSEVENT_H */ diff --git a/bsd/sys/fsgetpath.h b/bsd/sys/fsgetpath.h index 941f31c02..bad8b4e1b 100644 --- a/bsd/sys/fsgetpath.h +++ b/bsd/sys/fsgetpath.h @@ -44,11 +44,8 @@ __BEGIN_DECLS * Obtain the full pathname of a file system object by id. * * This is a private SPI used by the File Manager. - * - * ssize_t fsgetpath_np(char *restrict buf, size_t bufsize, fsid_t fsid, uint64_t objid); */ -#define fsgetpath(buf, bufsize, fsid, objid) \ - (ssize_t)syscall(SYS_fsgetpath, buf, (size_t)bufsize, fsid, (uint64_t)objid) +ssize_t fsgetpath(char * __restrict buf, size_t bufsize, fsid_t* fsid, uint64_t objid); /* * openbyid_np: open a file given a file system id and a file system object id diff --git a/bsd/sys/imgact.h b/bsd/sys/imgact.h index 57df28890..f20dfb2d4 100644 --- a/bsd/sys/imgact.h +++ b/bsd/sys/imgact.h @@ -118,6 +118,7 @@ struct image_params { void *ip_px_spa; void *ip_px_smpx; /* MAC-specific spawn attrs. */ void *ip_px_persona; /* persona args */ + void *ip_cs_error; /* codesigning error reason */ }; /* diff --git a/bsd/sys/kauth.h b/bsd/sys/kauth.h index 8a533524e..dd496f8da 100644 --- a/bsd/sys/kauth.h +++ b/bsd/sys/kauth.h @@ -261,6 +261,8 @@ extern int kauth_cred_gid2ntsid(gid_t _gid, ntsid_t *_sidp); extern int kauth_cred_guid2ntsid(guid_t *_guid, ntsid_t *_sidp); extern int kauth_cred_ismember_gid(kauth_cred_t _cred, gid_t _gid, int *_resultp); extern int kauth_cred_ismember_guid(kauth_cred_t _cred, guid_t *_guidp, int *_resultp); +extern int kauth_cred_nfs4domain2dsnode(char *nfs4domain, char *dsnode); +extern int kauth_cred_dsnode2nfs4domain(char *dsnode, char *nfs4domain); extern int groupmember(gid_t gid, kauth_cred_t cred); diff --git a/bsd/sys/kdebug.h b/bsd/sys/kdebug.h index 956d7234c..e44013e0a 100644 --- a/bsd/sys/kdebug.h +++ b/bsd/sys/kdebug.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2014 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -26,10 +26,8 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -/* Copyright (c) 1997 Apple Computer, Inc. All rights reserved. - * - * kdebug.h - kernel_debug definitions - * +/* + * kdebug.h - kernel_debug definitions */ #ifndef BSD_SYS_KDEBUG_H @@ -48,29 +46,30 @@ __BEGIN_DECLS #include #endif -#ifdef XNU_KERNEL_PRIVATE -#include -#include +#ifdef XNU_KERNEL_PRIVATE +#include /* __improbable */ #endif /* * Kdebug is a facility for tracing events occurring on a system. * - * All events are tagged with a debugid, consisting of the following: + * All events are tagged with a 32-bit debugid: * * +----------------+----------------+----------------------------+----+ * | Class (8) | Subclass (8) | Code (14) |Func| * | | | |(2) | * +----------------+----------------+----------------------------+----+ - * \______________________________________________________________/ - * Eventid + * \_________________________________/ + * ClassSubclass (CSC) + * \________________________________________________________________00_/ + * Eventid * \___________________________________________________________________/ * Debugid * * The eventid is a hierarchical ID, indicating which components an event is * referring to. The debugid includes an eventid and two function qualifier * bits, to determine the structural significance of an event (whether it - * starts or ends a series of grouped events). + * starts or ends an interval). */ #define KDBG_CLASS_MASK (0xff000000) @@ -84,12 +83,14 @@ __BEGIN_DECLS /* class and subclass mask */ #define KDBG_CSC_MASK (0xffff0000) #define KDBG_CSC_OFFSET (KDBG_SUBCLASS_OFFSET) +#define KDBG_CSC_MAX (0xffff) #define KDBG_CODE_MASK (0x0000fffc) #define KDBG_CODE_OFFSET (2) #define KDBG_CODE_MAX (0x3fff) #define KDBG_EVENTID_MASK (0xfffffffc) +#define KDBG_FUNC_MASK (0x00000003) /* Generate an eventid corresponding to Class, SubClass, and Code. 
*/ #define KDBG_EVENTID(Class, SubClass, Code) \ @@ -110,21 +111,40 @@ __BEGIN_DECLS #define KDBG_EXTRACT_CODE(Debugid) \ ((uint16_t)(((Debugid) & KDBG_CODE_MASK) >> KDBG_CODE_OFFSET)) +/* function qualifiers */ +#define DBG_FUNC_START 1 +#define DBG_FUNC_END 2 +#define DBG_FUNC_NONE 0 + +/* + * Definitions to support IOP tracing. + */ + #ifdef KERNEL_PRIVATE -typedef enum -{ - KD_CALLBACK_KDEBUG_ENABLED, // Trace is now enabled. No arguments - KD_CALLBACK_KDEBUG_DISABLED, // Trace is now disabled. No arguments - KD_CALLBACK_SYNC_FLUSH, // Request the latest entries from the IOP, and block until complete. No arguments - KD_CALLBACK_TYPEFILTER_CHANGED, // Typefilter is enabled. A read-only pointer to the typefilter is provided, but is only valid while in the callback. +typedef enum { + /* Trace is now enabled; no arguments. */ + KD_CALLBACK_KDEBUG_ENABLED, + /* Trace is now disabled; no arguments. */ + KD_CALLBACK_KDEBUG_DISABLED, + /* + * Request the latest entries from the IOP and block until complete; no + * arguments. + */ + KD_CALLBACK_SYNC_FLUSH, + /* + * The typefilter is enabled; a read-only pointer to the typefilter is + * provided, valid only while in the callback. + */ + KD_CALLBACK_TYPEFILTER_CHANGED, } kd_callback_type; typedef void (*kd_callback_fn) (void* context, kd_callback_type reason, void* arg); struct kd_callback { - kd_callback_fn func; - void* context; - char iop_name[8]; // null-terminated string with name of core. + kd_callback_fn func; + void *context; + /* name of IOP, NUL-terminated */ + char iop_name[8]; }; typedef struct kd_callback kd_callback_t; @@ -140,65 +160,67 @@ typedef struct kd_callback kd_callback_t; * kernel_debug_enter() to refer to your IOP. If the allocation * failed, then 0 will be returned. * - * * Caveats: * Note that not all callback calls will indicate a change in * state (e.g. disabling trace twice would send two disable * notifications). 
- * */ extern int kernel_debug_register_callback(kd_callback_t callback); extern void kernel_debug_enter( - uint32_t coreid, - uint32_t debugid, - uint64_t timestamp, - uintptr_t arg1, - uintptr_t arg2, - uintptr_t arg3, - uintptr_t arg4, - uintptr_t threadid + uint32_t coreid, + uint32_t debugid, + uint64_t timestamp, + uintptr_t arg1, + uintptr_t arg2, + uintptr_t arg3, + uintptr_t arg4, + uintptr_t threadid ); #endif /* KERNEL_PRIVATE */ -/* The Function qualifiers */ -#define DBG_FUNC_START 1 -#define DBG_FUNC_END 2 -#define DBG_FUNC_NONE 0 - /* The Kernel Debug Classes */ -#define DBG_MACH 1 -#define DBG_NETWORK 2 -#define DBG_FSYSTEM 3 -#define DBG_BSD 4 -#define DBG_IOKIT 5 -#define DBG_DRIVERS 6 -#define DBG_TRACE 7 -#define DBG_DLIL 8 -#define DBG_WORKQUEUE 9 -#define DBG_CORESTORAGE 10 -#define DBG_CG 11 -#define DBG_MISC 20 -#define DBG_SECURITY 30 -#define DBG_DYLD 31 -#define DBG_QT 32 -#define DBG_APPS 33 -#define DBG_LAUNCHD 34 -#define DBG_PERF 37 -#define DBG_IMPORTANCE 38 -#define DBG_BANK 40 -#define DBG_XPC 41 -#define DBG_ATM 42 -#define DBG_ARIADNE 43 -#define DBG_DAEMON 44 -#define DBG_ENERGYTRACE 45 -#define DBG_IMG 49 - - -#define DBG_MIG 255 +#define DBG_MACH 1 +#define DBG_NETWORK 2 +#define DBG_FSYSTEM 3 +#define DBG_BSD 4 +#define DBG_IOKIT 5 +#define DBG_DRIVERS 6 +#define DBG_TRACE 7 +#define DBG_DLIL 8 +#define DBG_WORKQUEUE 9 +#define DBG_CORESTORAGE 10 +#define DBG_CG 11 +#define DBG_MISC 20 +#define DBG_SECURITY 30 +#define DBG_DYLD 31 +#define DBG_QT 32 +#define DBG_APPS 33 +#define DBG_LAUNCHD 34 +#define DBG_PERF 37 +#define DBG_IMPORTANCE 38 +#define DBG_BANK 40 +#define DBG_XPC 41 +#define DBG_ATM 42 +#define DBG_ARIADNE 43 +#define DBG_DAEMON 44 +#define DBG_ENERGYTRACE 45 +#define DBG_DISPATCH 46 +#define DBG_IMG 49 +#define DBG_UMALLOC 51 + + +#define DBG_MIG 255 #ifdef PRIVATE + +/* + * Private kdebug userspace API + */ +#ifndef KERNEL +#include + /* * OS components can use the full precision of the "code" field * (Class, 
SubClass, Code) to inject events using kdebug_trace() by @@ -216,9 +238,13 @@ extern void kernel_debug_enter( * * On error, -1 will be returned and errno will indicate the error. */ -#ifndef KERNEL -extern int kdebug_trace(uint32_t code, uint64_t arg1, uint64_t arg2, uint64_t arg3, uint64_t arg4) __OSX_AVAILABLE_STARTING(__MAC_10_10_2, __IPHONE_8_2); -#endif +extern int kdebug_trace( + uint32_t code, + uint64_t arg1, + uint64_t arg2, + uint64_t arg3, + uint64_t arg4) + __OSX_AVAILABLE(10.10.2) __IOS_AVAILABLE(8.2); /*! * @function kdebug_trace_string @@ -284,16 +310,47 @@ extern int kdebug_trace(uint32_t code, uint64_t arg1, uint64_t arg2, uint64_t ar * EFAULT * `str` is an invalid address or NULL when `str_id` is 0. */ -#ifndef KERNEL extern uint64_t kdebug_trace_string(uint32_t debugid, uint64_t str_id, const char *str) -__OSX_AVAILABLE_STARTING(__MAC_10_11, __IPHONE_9_0); -#endif + __OSX_AVAILABLE(10.11) __IOS_AVAILABLE(9.0); + +/* + * Although the performance impact of kdebug_trace() when kernel + * tracing is not enabled is minimal, it may require the caller to + * perform an expensive calculation/summarization. This cost can be + * skipped by checking the kdebug_is_enabled() predicate: + * + * if (kdebug_is_enabled(KDBG_CODE(DBG_XPC, 15, 1))) { + * uint64_t arg1 = ...; + * uint64_t arg2 = ...; + * kdebug_trace(KDBG_CODE(DBG_XPC, 15, 1) | DBG_FUNC_NONE, arg1, arg2, 0, 0); + * } + * + * If tracing is enabled for the code at the time of the check, 1 + * will be returned. Otherwise, 0 will be returned. + */ +extern bool kdebug_is_enabled(uint32_t code) + __OSX_AVAILABLE(10.12) __IOS_AVAILABLE(10.0) + __WATCHOS_AVAILABLE(3.0) __TVOS_AVAILABLE(10.0); + +/* + * Returns a pointer to the userspace typefilter, if one is available. + * May return NULL. 
+ */ +extern void *kdebug_typefilter(void) + __OSX_AVAILABLE(10.12) __IOS_AVAILABLE(10.0) + __WATCHOS_AVAILABLE(3.0) __TVOS_AVAILABLE(10.0); + +#endif /* !KERNEL (Private kdebug userspace API) */ #endif /* PRIVATE */ #ifdef XNU_KERNEL_PRIVATE /* Used in early boot to log strings spanning only a single tracepoint. */ -extern void kernel_debug_string_simple(const char *message); +extern void kernel_debug_string_early(const char *message); +/* Used to trace strings within kdebug tracepoints on arbitrary eventids. */ +extern void kernel_debug_string_simple(uint32_t eventid, const char *str); +/* Only used by ktrace to reset kdebug. ktrace_lock must be held. */ +extern void kdebug_reset(void); #endif /* XNU_KERNEL_PRIVATE */ /* **** The Kernel Debug Sub Classes for Mach (DBG_MACH) **** */ @@ -305,23 +362,26 @@ extern void kernel_debug_string_simple(const char *message); #define DBG_MACH_EXCP_UTRAP_x86 0x07 /* User Traps on x86 */ #define DBG_MACH_EXCP_FP 0x08 /* FP Unavail */ #define DBG_MACH_EXCP_DECI 0x09 /* Decrementer Interrupt */ -#define DBG_MACH_CHUD 0x0A /* CHUD */ +#define DBG_MACH_CHUD 0x0A /* deprecated name */ +#define DBG_MACH_SIGNPOST 0x0A /* kernel signposts */ #define DBG_MACH_EXCP_SC 0x0C /* System Calls */ #define DBG_MACH_EXCP_TRACE 0x0D /* Trace exception */ #define DBG_MACH_EXCP_EMUL 0x0E /* Instruction emulated */ #define DBG_MACH_IHDLR 0x10 /* Interrupt Handlers */ #define DBG_MACH_IPC 0x20 /* Inter Process Comm */ -#define DBG_MACH_VM 0x30 /* Virtual Memory */ -#define DBG_MACH_LEAKS 0x31 /* alloc/free */ -#define DBG_MACH_SCHED 0x40 /* Scheduler */ -#define DBG_MACH_MSGID_INVALID 0x50 /* Messages - invalid */ +#define DBG_MACH_RESOURCE 0x25 /* tracing limits, etc */ +#define DBG_MACH_VM 0x30 /* Virtual Memory */ +#define DBG_MACH_LEAKS 0x31 /* alloc/free */ +#define DBG_MACH_WORKINGSET 0x32 /* private subclass for working set related debugging */ +#define DBG_MACH_SCHED 0x40 /* Scheduler */ +#define DBG_MACH_MSGID_INVALID 0x50 /* Messages - 
invalid */ #define DBG_MACH_LOCKS 0x60 /* new lock APIs */ #define DBG_MACH_PMAP 0x70 /* pmap */ #define DBG_MACH_CLOCK 0x80 /* clock */ #define DBG_MACH_MP 0x90 /* MP related */ #define DBG_MACH_VM_PRESSURE 0xA0 /* Memory Pressure Events */ -#define DBG_MACH_STACKSHOT 0xA1 /* Stackshot/Microstackshot subsystem */ -#define DBG_MACH_SFI 0xA2 /* Selective Forced Idle (SFI) */ +#define DBG_MACH_STACKSHOT 0xA1 /* Stackshot/Microstackshot subsystem */ +#define DBG_MACH_SFI 0xA2 /* Selective Forced Idle (SFI) */ #define DBG_MACH_ENERGY_PERF 0xA3 /* Energy/performance resource stats */ #define DBG_MACH_SYSDIAGNOSE 0xA4 /* sysdiagnose keychord */ #define DBG_MACH_ZALLOC 0xA5 /* Zone allocator */ @@ -402,6 +462,8 @@ extern void kernel_debug_string_simple(const char *message); #define MACH_IPC_VOUCHER_CREATE 0x7 /* Voucher added to global voucher hashtable */ #define MACH_IPC_VOUCHER_CREATE_ATTR_DATA 0x8 /* Attr data for newly created voucher */ #define MACH_IPC_VOUCHER_DESTROY 0x9 /* Voucher removed from global voucher hashtable */ +#define MACH_IPC_KMSG_INFO 0xa /* Send/Receive info for a kmsg */ +#define MACH_IPC_KMSG_LINK 0xb /* link a kernel kmsg pointer to user mach_msg_header_t */ /* Codes for pmap (DBG_MACH_PMAP) */ #define PMAP__CREATE 0x0 @@ -449,6 +511,29 @@ extern void kernel_debug_string_simple(const char *message); /* Codes for Zone Allocator (DBG_MACH_ZALLOC) */ #define ZALLOC_ZCRAM 0x0 +/* Codes for Mach resource management (DBG_MACH_RESOURCE) */ +/* _K32A/B codes start at double the low nibble */ +#define RMON_ENABLE_CPUUSAGE_MONITOR 0x001 +#define RMON_CPUUSAGE_VIOLATED 0x002 +#define RMON_CPUUSAGE_SUSPENDED 0x003 +#define RMON_CPUUSAGE_VIOLATED_K32A 0x004 +#define RMON_CPUUSAGE_VIOLATED_K32B 0x005 +#define RMON_CPUUSAGE_RESUMED 0x006 +#define RMON_DISABLE_CPUUSAGE_MONITOR 0x00f + +#define RMON_ENABLE_CPUWAKES_MONITOR 0x011 +#define RMON_CPUWAKES_VIOLATED 0x012 +#define RMON_CPUWAKES_VIOLATED_K32A 0x014 +#define RMON_CPUWAKES_VIOLATED_K32B 0x015 +#define 
RMON_DISABLE_CPUWAKES_MONITOR 0x01f + +#define RMON_ENABLE_IO_MONITOR 0x021 +#define RMON_LOGWRITES_VIOLATED 0x022 +#define RMON_PHYSWRITES_VIOLATED 0x023 +#define RMON_LOGWRITES_VIOLATED_K32A 0x024 +#define RMON_LOGWRITES_VIOLATED_K32B 0x025 +#define RMON_DISABLE_IO_MONITOR 0x02f + /* **** The Kernel Debug Sub Classes for Network (DBG_NETWORK) **** */ #define DBG_NETIP 1 /* Internet Protocol */ #define DBG_NETARP 2 /* Address Resolution Protocol */ @@ -539,6 +624,7 @@ extern void kernel_debug_string_simple(const char *message); #define DBG_DRVNAND 20 /* NAND drivers and layers */ #define DBG_SSD 21 /* SSD */ #define DBG_DRVSPI 22 /* SPI */ +#define DBG_DRVWLAN_802_11 23 /* WLAN 802.11 */ /* Backwards compatibility */ #define DBG_DRVPOINTING DBG_DRVHID /* OBSOLETE: Use DBG_DRVHID instead */ @@ -552,14 +638,16 @@ extern void kernel_debug_string_simple(const char *message); #define DBG_DLIL_IF_FLT 5 /* DLIL Interface FIlter */ /* The Kernel Debug Sub Classes for File System (DBG_FSYSTEM) */ -#define DBG_FSRW 1 /* reads and writes to the filesystem */ -#define DBG_DKRW 2 /* reads and writes to the disk */ -#define DBG_FSVN 3 /* vnode operations (inc. locking/unlocking) */ -#define DBG_FSLOOOKUP 4 /* namei and other lookup-related operations */ -#define DBG_JOURNAL 5 /* journaling operations */ -#define DBG_IOCTL 6 /* ioctl to the disk */ -#define DBG_BOOTCACHE 7 /* bootcache operations */ -#define DBG_HFS 8 /* HFS-specific events; see bsd/hfs/hfs_kdebug.h */ +#define DBG_FSRW 0x1 /* reads and writes to the filesystem */ +#define DBG_DKRW 0x2 /* reads and writes to the disk */ +#define DBG_FSVN 0x3 /* vnode operations (inc. 
locking/unlocking) */ +#define DBG_FSLOOOKUP 0x4 /* namei and other lookup-related operations */ +#define DBG_JOURNAL 0x5 /* journaling operations */ +#define DBG_IOCTL 0x6 /* ioctl to the disk */ +#define DBG_BOOTCACHE 0x7 /* bootcache operations */ +#define DBG_HFS 0x8 /* HFS-specific events; see the hfs project */ +#define DBG_APFS 0x9 /* APFS-specific events; see the apfs project */ +#define DBG_SMB 0xA /* SMB-specific events; see the smb project */ #define DBG_EXFAT 0xE /* ExFAT-specific events; see the exfat project */ #define DBG_MSDOS 0xF /* FAT-specific events; see the msdosfs project */ #define DBG_ACFS 0x10 /* Xsan-specific events; see the XsanFS project */ @@ -579,17 +667,21 @@ extern void kernel_debug_string_simple(const char *message); #define DBG_HFS_UPDATE_SKIPPED 0x80 /* The Kernel Debug Sub Classes for BSD */ -#define DBG_BSD_PROC 0x01 /* process/signals related */ -#define DBG_BSD_MEMSTAT 0x02 /* memorystatus / jetsam operations */ -#define DBG_BSD_EXCP_SC 0x0C /* System Calls */ -#define DBG_BSD_AIO 0x0D /* aio (POSIX async IO) */ -#define DBG_BSD_SC_EXTENDED_INFO 0x0E /* System Calls, extended info */ -#define DBG_BSD_SC_EXTENDED_INFO2 0x0F /* System Calls, extended info */ +#define DBG_BSD_PROC 0x01 /* process/signals related */ +#define DBG_BSD_MEMSTAT 0x02 /* memorystatus / jetsam operations */ +#define DBG_BSD_EXCP_SC 0x0C /* System Calls */ +#define DBG_BSD_AIO 0x0D /* aio (POSIX async IO) */ +#define DBG_BSD_SC_EXTENDED_INFO 0x0E /* System Calls, extended info */ +#define DBG_BSD_SC_EXTENDED_INFO2 0x0F /* System Calls, extended info */ +#define DBG_BSD_KDEBUG_TEST 0xFF /* for testing kdebug */ /* The Codes for BSD subcode class DBG_BSD_PROC */ -#define BSD_PROC_EXIT 1 /* process exit */ -#define BSD_PROC_FRCEXIT 2 /* Kernel force termination */ +#define BSD_PROC_EXIT 1 /* process exit */ +#define BSD_PROC_FRCEXIT 2 /* Kernel force termination */ +#define BSD_PROC_EXEC 3 /* process spawn / exec */ +#define BSD_PROC_EXITREASON_CREATE 4 /* 
exit reason creation */ +#define BSD_PROC_EXITREASON_COMMIT 5 /* exit reason committed to a proc */ /* Codes for BSD subcode class DBG_BSD_MEMSTAT */ #define BSD_MEMSTAT_SCAN 1 /* memorystatus thread awake */ @@ -617,9 +709,13 @@ extern void kernel_debug_string_simple(const char *message); #define TRACE_DATA_NEWTHREAD (TRACEDBG_CODE(DBG_TRACE_DATA, 1)) #define TRACE_DATA_EXEC (TRACEDBG_CODE(DBG_TRACE_DATA, 2)) #define TRACE_DATA_THREAD_TERMINATE (TRACEDBG_CODE(DBG_TRACE_DATA, 3)) +#define TRACE_DATA_THREAD_TERMINATE_PID (TRACEDBG_CODE(DBG_TRACE_DATA, 4)) #define TRACE_STRING_GLOBAL (TRACEDBG_CODE(DBG_TRACE_STRING, 0)) #define TRACE_STRING_NEWTHREAD (TRACEDBG_CODE(DBG_TRACE_STRING, 1)) #define TRACE_STRING_EXEC (TRACEDBG_CODE(DBG_TRACE_STRING, 2)) +#define TRACE_STRING_PROC_EXIT (TRACEDBG_CODE(DBG_TRACE_STRING, 3)) +#define TRACE_STRING_THREADNAME (TRACEDBG_CODE(DBG_TRACE_STRING, 4)) +#define TRACE_STRING_THREADNAME_PREV (TRACEDBG_CODE(DBG_TRACE_STRING, 5)) #define TRACE_PANIC (TRACEDBG_CODE(DBG_TRACE_INFO, 0)) #define TRACE_TIMESTAMPS (TRACEDBG_CODE(DBG_TRACE_INFO, 1)) #define TRACE_LOST_EVENTS (TRACEDBG_CODE(DBG_TRACE_INFO, 2)) @@ -639,7 +735,24 @@ extern void kernel_debug_string_simple(const char *message); #define DBG_BUFFER 0x20 /* The Kernel Debug Sub Classes for DBG_DYLD */ -#define DBG_DYLD_STRING 5 +#define DBG_DYLD_UUID (5) + +/* Kernel Debug codes for the DBG_DYLD_UUID subclass */ +#define DBG_DYLD_UUID_MAP_A (0) +#define DBG_DYLD_UUID_MAP_B (1) +#define DBG_DYLD_UUID_MAP_32_A (2) +#define DBG_DYLD_UUID_MAP_32_B (3) +#define DBG_DYLD_UUID_MAP_32_C (4) +#define DBG_DYLD_UUID_UNMAP_A (5) +#define DBG_DYLD_UUID_UNMAP_B (6) +#define DBG_DYLD_UUID_UNMAP_32_A (7) +#define DBG_DYLD_UUID_UNMAP_32_B (8) +#define DBG_DYLD_UUID_UNMAP_32_C (9) +#define DBG_DYLD_UUID_SHARED_CACHE_A (10) +#define DBG_DYLD_UUID_SHARED_CACHE_B (11) +#define DBG_DYLD_UUID_SHARED_CACHE_32_A (12) +#define DBG_DYLD_UUID_SHARED_CACHE_32_B (13) +#define DBG_DYLD_UUID_SHARED_CACHE_32_C (14) /* 
The Kernel Debug modifiers for the DBG_DKRW sub class */ #define DKIO_DONE 0x01 @@ -656,8 +769,10 @@ extern void kernel_debug_string_simple(const char *message); /* Kernel Debug Sub Classes for Applications (DBG_APPS) */ #define DBG_APP_LOGINWINDOW 0x03 #define DBG_APP_AUDIO 0x04 -#define DBG_APP_SIGPOST 0x0A +#define DBG_APP_SYSTEMUI 0x05 +#define DBG_APP_SIGNPOST 0x0A #define DBG_APP_APPKIT 0x0C +#define DBG_APP_DFR 0x0E #define DBG_APP_SAMBA 0x80 /* Kernel Debug codes for Throttling (DBG_THROTTLE) */ @@ -732,6 +847,9 @@ extern void kernel_debug_string_simple(const char *message); /* Kernel Debug Sub Classes for daemons (DBG_DAEMON) */ #define DBG_DAEMON_COREDUET 0x1 +/* Subclasses for the user space allocator */ +#define DBG_UMALLOC_EXTERNAL 0x1 +#define DBG_UMALLOC_INTERNAL 0x2 /**********************************************************************/ #define KDBG_MIGCODE(msgid) ((DBG_MIG << KDBG_CLASS_OFFSET) | \ @@ -773,44 +891,95 @@ extern void kernel_debug_string_simple(const char *message); /* Kernel Debug Macros for specific daemons */ #define COREDUETDBG_CODE(code) DAEMONDBG_CODE(DBG_DAEMON_COREDUET, code) -/* Usage: -* kernel_debug((KDBG_CODE(DBG_NETWORK, DNET_PROTOCOL, 51) | DBG_FUNC_START), -* offset, 0, 0, 0,0) -* -* For ex, -* -* #include -* -* #define DBG_NETIPINIT NETDBG_CODE(DBG_NETIP,1) -* -* -* void -* ip_init() -* { -* register struct protosw *pr; -* register int i; -* -* KERNEL_DEBUG(DBG_NETIPINIT | DBG_FUNC_START, 0,0,0,0,0) -* -------- -* KERNEL_DEBUG(DBG_NETIPINIT, 0,0,0,0,0) -* -------- -* KERNEL_DEBUG(DBG_NETIPINIT | DBG_FUNC_END, 0,0,0,0,0) -* } -* +/* + * To use kdebug in the kernel: + * + * #include + * + * #define DBG_NETIPINIT NETDBG_CODE(DBG_NETIP, 1) + * + * void + * ip_init(void) + * { + * KDBG(DBG_NETIPINIT | DBG_FUNC_START, 1, 2, 3, 4); + * ... + * KDBG(DBG_NETIPINIT); + * ... 
+ * KDBG(DBG_NETIPINIT | DBG_FUNC_END); + * } + */ -*/ +#ifdef KERNEL_PRIVATE + +/* + * The KDBG{,_DEBUG,_RELEASE,_FILTERED} macros are the preferred method of + * making tracepoints. + * + * Kernel pointers must be unslid or permuted using VM_KERNEL_UNSLIDE_OR_PERM. + * Do not trace any sensitive data. + */ + +/* + * Traced on debug and development (and release OS X) kernels. + */ +#define KDBG(x, ...) KDBG_(, x, ## __VA_ARGS__, 4, 3, 2, 1, 0) + +/* + * Traced on debug and development (and release OS X) kernels if explicitly + * requested. Omitted from tracing without a typefilter. + */ +#define KDBG_FILTERED(x, ...) KDBG_(_FILTERED, x, ## __VA_ARGS__, 4, 3, 2, 1, 0) + +/* + * Traced on debug, development, and release kernels. + * + * Only use this tracepoint if the events are required for a shipping trace + * tool. + */ +#define KDBG_RELEASE(x, ...) KDBG_(_RELEASE, x, ## __VA_ARGS__, 4, 3, 2, 1, 0) + +/* + * Traced only on debug kernels. + */ +#define KDBG_DEBUG(x, ...) KDBG_(_DEBUG, x, ## __VA_ARGS__, 4, 3, 2, 1, 0) + +#define KDBG_(f, x, a, b, c, d, n, ...) KDBG##n(f, x, a, b, c, d) +#define KDBG0(f, x, a, b, c, d) KERNEL_DEBUG_CONSTANT##f(x, 0, 0, 0, 0, 0) +#define KDBG1(f, x, a, b, c, d) KERNEL_DEBUG_CONSTANT##f(x, a, 0, 0, 0, 0) +#define KDBG2(f, x, a, b, c, d) KERNEL_DEBUG_CONSTANT##f(x, a, b, 0, 0, 0) +#define KDBG3(f, x, a, b, c, d) KERNEL_DEBUG_CONSTANT##f(x, a, b, c, 0, 0) +#define KDBG4(f, x, a, b, c, d) KERNEL_DEBUG_CONSTANT##f(x, a, b, c, d, 0) + +#endif /* defined(KERNEL_PRIVATE) */ extern unsigned int kdebug_enable; -#define KDEBUG_ENABLE_TRACE 0x1 -#define KDEBUG_ENABLE_ENTROPY 0x2 /* Obsolescent */ -#define KDEBUG_ENABLE_CHUD 0x4 -#define KDEBUG_ENABLE_PPT 0x8 -#define KDEBUG_ENABLE_SERIAL 0x10 /* - * Infer the supported kernel debug event level from config option. - * Use (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) as a guard to protect - * unaudited debug code. + * Bits used by kdebug_enable. These control which events are traced at + * runtime. 
+ */ +#define KDEBUG_ENABLE_TRACE (1U << 0) +#define KDEBUG_ENABLE_ENTROPY (1U << 1) /* obsolete */ +#define KDEBUG_ENABLE_CHUD (1U << 2) /* obsolete */ +#define KDEBUG_ENABLE_PPT (1U << 3) +#define KDEBUG_ENABLE_SERIAL (1U << 4) + +#define KDEBUG_TRACE (KDEBUG_ENABLE_TRACE) + +/* + * Specify KDEBUG_PPT to indicate that the event belongs to the limited PPT set. + * PPT is deprecated -- use a typefilter and the PPTDBG class instead. + */ +#define KDEBUG_PPT (KDEBUG_ENABLE_PPT) +#define KDEBUG_COMMON (KDEBUG_ENABLE_TRACE | KDEBUG_ENABLE_PPT) + +/* + * The kernel debug configuration level. These values control which events are + * compiled in under different build configurations. + * + * Infer the supported kernel debug event level from config option. Use + * (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) as a guard to protect unaudited debug + * code. */ #define KDEBUG_LEVEL_NONE 0 #define KDEBUG_LEVEL_IST 1 @@ -826,109 +995,169 @@ extern unsigned int kdebug_enable; #define KDEBUG_LEVEL KDEBUG_LEVEL_FULL #else #define KDEBUG_LEVEL KDEBUG_LEVEL_STANDARD -/* Currently, all other kernel configurations (development, etc) - build with KDEBUG_LEVEL_STANDARD. As a result, KERNEL_DEBUG_CONSTANT*() - are on by default but KERNEL_DEBUG*() are not. -*/ +/* + * Currently, all other kernel configurations (development, etc) build with + * KDEBUG_LEVEL_STANDARD. As a result, KERNEL_DEBUG_CONSTANT*() are on by + * default but KERNEL_DEBUG*() are not. 
+ */ #endif -#if (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) #ifdef XNU_KERNEL_PRIVATE -#define KERNEL_DEBUG_CONSTANT(x,a,b,c,d,e) \ -do { \ - if (__improbable(kdebug_enable & ~KDEBUG_ENABLE_PPT)) \ - kernel_debug(x,(uintptr_t)a,(uintptr_t)b,(uintptr_t)c, \ - (uintptr_t)d,(uintptr_t)e); \ -} while(0) - -#define KERNEL_DEBUG_CONSTANT1(x,a,b,c,d,e) \ -do { \ - if (__improbable(kdebug_enable & ~KDEBUG_ENABLE_PPT)) \ - kernel_debug1(x,(uintptr_t)a,(uintptr_t)b,(uintptr_t)c, \ - (uintptr_t)d,(uintptr_t)e); \ -} while(0) +#define KDBG_IMPROBABLE __improbable +#else +#define KDBG_IMPROBABLE +#endif -#define KERNEL_DEBUG_EARLY(x,a,b,c,d) \ -do { \ - kernel_debug_early((uint32_t)x, (uintptr_t)a, (uintptr_t)b, \ - (uintptr_t)c, (uintptr_t)d); \ -} while(0) -#else /* XNU_KERNEL_PRIVATE */ -#define KERNEL_DEBUG_CONSTANT(x,a,b,c,d,e) \ -do { \ - if (kdebug_enable & ~KDEBUG_ENABLE_PPT) \ - kernel_debug(x,(uintptr_t)a,(uintptr_t)b,(uintptr_t)c, \ - (uintptr_t)d,(uintptr_t)e); \ -} while(0) +/* + * KERNEL_DEBUG_CONSTANT_FILTERED events are omitted from tracing unless they + * are explicitly requested in the typefilter. They are not emitted when + * tracing without a typefilter. + */ +#if (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) +#define KERNEL_DEBUG_CONSTANT_FILTERED(x, a, b, c, d, ...) \ + do { \ + if (KDBG_IMPROBABLE(kdebug_enable & ~KDEBUG_ENABLE_PPT)) { \ + kernel_debug_filtered((x), (uintptr_t)(a), (uintptr_t)(b), \ + (uintptr_t)(c), (uintptr_t)(d)); \ + } \ + } while (0) +#else /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) */ +#define KERNEL_DEBUG_CONSTANT_FILTERED(x, a, b, c, d, ...) 
do {} while (0) +#endif /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) */ -#define KERNEL_DEBUG_CONSTANT1(x,a,b,c,d,e) \ -do { \ - if (kdebug_enable & ~KDEBUG_ENABLE_PPT) \ - kernel_debug1(x,(uintptr_t)a,(uintptr_t)b,(uintptr_t)c, \ - (uintptr_t)d,(uintptr_t)e); \ -} while(0) +#if (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) +#define KERNEL_DEBUG_CONSTANT(x, a, b, c, d, e) \ + do { \ + if (KDBG_IMPROBABLE(kdebug_enable & ~KDEBUG_ENABLE_PPT)) { \ + kernel_debug((x), (uintptr_t)(a), (uintptr_t)(b), (uintptr_t)(c), \ + (uintptr_t)(d),(uintptr_t)(e)); \ + } \ + } while (0) -#define KERNEL_DEBUG_EARLY(x,a,b,c,d) \ -do { \ - kernel_debug_early((uint32_t)x, (uintptr_t)a, (uintptr_t)b, \ - (uintptr_t)c, (uintptr_t)d); \ -} while(0) -#endif /* XNU_KERNEL_PRIVATE */ +/* + * DO NOT USE THIS MACRO -- it breaks fundamental assumptions about ktrace and + * is only meant to be used by the pthread kext and other points in the kernel + * where the thread ID must be provided explicitly. + */ +#define KERNEL_DEBUG_CONSTANT1(x, a, b, c, d, e) \ + do { \ + if (KDBG_IMPROBABLE(kdebug_enable & ~KDEBUG_ENABLE_PPT)) { \ + kernel_debug1((x), (uintptr_t)(a), (uintptr_t)(b), (uintptr_t)(c), \ + (uintptr_t)(d), (uintptr_t)(e)); \ + } \ + } while (0) + +#define KERNEL_DEBUG_EARLY(x, a, b, c, d) \ + do { \ + kernel_debug_early((uint32_t)(x), (uintptr_t)(a), (uintptr_t)(b), \ + (uintptr_t)(c), (uintptr_t)(d)); \ + } while (0) #else /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) */ -#define KERNEL_DEBUG_CONSTANT(x,a,b,c,d,e) do { } while(0) -#define KERNEL_DEBUG_CONSTANT1(x,a,b,c,d,e) do { } while(0) -#define KERNEL_DEBUG_EARLY(x,a,b,c,d) do { } while(0) +#define KERNEL_DEBUG_CONSTANT(x, a, b, c, d, e) do {} while (0) +#define KERNEL_DEBUG_CONSTANT1(x, a, b, c, d, e) do {} while (0) +#define KERNEL_DEBUG_EARLY(x, a, b, c, d) do {} while (0) #endif /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) */ -#ifdef KERNEL_PRIVATE +/* + * KERNEL_DEBUG_CONSTANT_IST (in-system trace) events provide an audited subset + * of 
tracepoints for userland system tracing tools. This tracing level was + * created by 8857227 to protect fairplayd and other PT_DENY_ATTACH processes. + * It has two effects: only KERNEL_DEBUG_CONSTANT_IST() traces are emitted and + * any PT_DENY_ATTACH processes will only emit basic traces as defined by the + * kernel_debug_filter() routine. + */ +#define KERNEL_DEBUG_CONSTANT_RELEASE(x, a, b, c, d, e) \ + KERNEL_DEBUG_CONSTANT_IST(~KDEBUG_ENABLE_PPT, x, a, b, c, d, 0) -// Abbreviated version of above -#define KDBG(x, ...) KDBG_(x, ## __VA_ARGS__, 5, 4, 3, 2, 1, 0) -#define KDBG_(x, a, b, c, d, e, n, ...) KDBG##n(x, a, b, c, d, e) -#define KDBG0(x, a, b, c, d, e) KERNEL_DEBUG_CONSTANT(x, 0, 0, 0, 0, 0) -#define KDBG1(x, a, b, c, d, e) KERNEL_DEBUG_CONSTANT(x, a, 0, 0, 0, 0) -#define KDBG2(x, a, b, c, d, e) KERNEL_DEBUG_CONSTANT(x, a, b, 0, 0, 0) -#define KDBG3(x, a, b, c, d, e) KERNEL_DEBUG_CONSTANT(x, a, b, c, 0, 0) -#define KDBG4(x, a, b, c, d, e) KERNEL_DEBUG_CONSTANT(x, a, b, c, d, 0) -#define KDBG5(x, a, b, c, d, e) KERNEL_DEBUG_CONSTANT(x, a, b, c, d, e) +#if (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST) +#define KERNEL_DEBUG_CONSTANT_IST(type, x, a, b, c, d, e) \ + do { \ + if (KDBG_IMPROBABLE(kdebug_enable & (type))) { \ + kernel_debug((x), (uintptr_t)(a), (uintptr_t)(b), (uintptr_t)(c), \ + (uintptr_t)(d), (uintptr_t)(e)); \ + } \ + } while (0) +#else /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST) */ +#define KERNEL_DEBUG_CONSTANT_IST(type, x, a, b, c, d, e) do {} while (0) +#endif /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST) */ -#endif // KERNEL_PRIVATE +#if NO_KDEBUG +#define __kdebug_constant_only __unused +#endif /* - * Specify KDEBUG_PPT to indicate that the event belongs to the - * limited PPT set. + * KERNEL_DEBUG events are only traced for DEBUG kernels. 
*/ -#define KDEBUG_COMMON (KDEBUG_ENABLE_TRACE|KDEBUG_ENABLE_CHUD|KDEBUG_ENABLE_PPT) -#define KDEBUG_TRACE (KDEBUG_ENABLE_TRACE|KDEBUG_ENABLE_CHUD) -#define KDEBUG_PPT (KDEBUG_ENABLE_PPT) +#define KERNEL_DEBUG_CONSTANT_DEBUG(x, a, b, c, d, e) \ + KERNEL_DEBUG(x, a, b, c, d, e) + +#if (KDEBUG_LEVEL >= KDEBUG_LEVEL_FULL) +#define __kdebug_only + +#define KERNEL_DEBUG(x, a, b, c, d, e) \ + do { \ + if (KDBG_IMPROBABLE(kdebug_enable & ~KDEBUG_ENABLE_PPT)) { \ + kernel_debug((uint32_t)(x), (uintptr_t)(a), (uintptr_t)(b), \ + (uintptr_t)(c), (uintptr_t)(d), (uintptr_t)(e)); \ + } \ + } while (0) /* - KERNEL_DEBUG_CONSTANT_IST events provide an audited subset of - tracepoints for userland system tracing tools. This tracing level was - created by 8857227 to protect fairplayd and other PT_DENY_ATTACH - processes. It has two effects: only KERNEL_DEBUG_CONSTANT_IST() traces - are emitted and any PT_DENY_ATTACH processes will only emit basic - traces as defined by the kernel_debug_filter() routine. + * DO NOT USE THIS MACRO -- see warning above for KERNEL_DEBUG_CONSTANT1. 
*/ -#if (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST) -#ifdef XNU_KERNEL_PRIVATE -#define KERNEL_DEBUG_CONSTANT_IST(type,x,a,b,c,d,e) \ -do { \ - if (__improbable(kdebug_enable & type)) \ - kernel_debug(x,(uintptr_t)a,(uintptr_t)b,(uintptr_t)c, \ - (uintptr_t)d,(uintptr_t)e); \ -} while(0) -#else /* XNU_KERNEL_PRIVATE */ -#define KERNEL_DEBUG_CONSTANT_IST(type,x,a,b,c,d,e) \ -do { \ - if (kdebug_enable & type) \ - kernel_debug(x,(uintptr_t)a,(uintptr_t)b,(uintptr_t)c, \ - (uintptr_t)d,(uintptr_t)e); \ -} while(0) -#endif /* XNU_KERNEL_PRIVATE */ +#define KERNEL_DEBUG1(x, a, b, c, d, e) \ + do { \ + if (KDBG_IMPROBABLE(kdebug_enable & ~KDEBUG_ENABLE_PPT)) { \ + kernel_debug1((uint32_t)(x), (uintptr_t)(a), (uintptr_t)(b), \ + (uintptr_t)(c), (uintptr_t)(d), (uintptr_t)(e)); \ + } \ + } while (0) +#else /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_FULL) */ +#define __kdebug_only __unused + +#define KERNEL_DEBUG(x,a,b,c,d,e) do {} while (0) +#define KERNEL_DEBUG1(x,a,b,c,d,e) do {} while (0) +#endif /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_FULL) */ + + +extern void kernel_debug( + uint32_t debugid, + uintptr_t arg1, + uintptr_t arg2, + uintptr_t arg3, + uintptr_t arg4, + uintptr_t arg5); + +extern void kernel_debug1( + uint32_t debugid, + uintptr_t arg1, + uintptr_t arg2, + uintptr_t arg3, + uintptr_t arg4, + uintptr_t arg5); + +extern void kernel_debug_filtered( + uint32_t debugid, + uintptr_t arg1, + uintptr_t arg2, + uintptr_t arg3, + uintptr_t arg4); + +extern void kernel_debug_early( + uint32_t debugid, + uintptr_t arg1, + uintptr_t arg2, + uintptr_t arg3, + uintptr_t arg4); + +/* + * EnergyTracing macros. 
+ */ + +#if (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST) // whether to bother calculating EnergyTracing inputs -// could chnage in future to see if DBG_ENERGYTRACE is active +// could change in future to see if DBG_ENERGYTRACE is active #define ENTR_SHOULDTRACE kdebug_enable // encode logical EnergyTracing into 32/64 KDebug trace #define ENTR_KDTRACE(component, opcode, lifespan, id, quality, value) \ @@ -972,7 +1201,6 @@ do { \ #else /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST) */ -#define KERNEL_DEBUG_CONSTANT_IST(type,x,a,b,c,d,e) do { } while(0) #define ENTR_SHOULDTRACE FALSE #define ENTR_KDTRACE(component, opcode, lifespan, id, quality, value) \ do {} while (0) @@ -982,33 +1210,6 @@ do { \ #endif /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST) */ -#if NO_KDEBUG -#define __kdebug_constant_only __unused -#endif - -extern void kernel_debug( - uint32_t debugid, - uintptr_t arg1, - uintptr_t arg2, - uintptr_t arg3, - uintptr_t arg4, - uintptr_t arg5); - -extern void kernel_debug1( - uint32_t debugid, - uintptr_t arg1, - uintptr_t arg2, - uintptr_t arg3, - uintptr_t arg4, - uintptr_t arg5); - -extern void kernel_debug_early( - uint32_t debugid, - uintptr_t arg1, - uintptr_t arg2, - uintptr_t arg3, - uintptr_t arg4); - #ifdef KERNEL_PRIVATE /* * kernel_debug_string provides the same functionality as the @@ -1021,49 +1222,19 @@ extern void kernel_debug_early( */ extern int kernel_debug_string(uint32_t debugid, uint64_t *str_id, const char *str); -#endif - -#if (KDEBUG_LEVEL >= KDEBUG_LEVEL_FULL) -#ifdef XNU_KERNEL_PRIVATE -#define KERNEL_DEBUG(x,a,b,c,d,e) \ -do { \ - if (__improbable(kdebug_enable & ~KDEBUG_ENABLE_PPT)) \ - kernel_debug((uint32_t)x, (uintptr_t)a, (uintptr_t)b, \ - (uintptr_t)c, (uintptr_t)d, (uintptr_t)e); \ -} while(0) - -#define KERNEL_DEBUG1(x,a,b,c,d,e) \ -do { \ - if (__improbable(kdebug_enable & ~KDEBUG_ENABLE_PPT)) \ - kernel_debug1((uint32_t)x, (uintptr_t)a, (uintptr_t)b, \ - (uintptr_t)c, (uintptr_t)d, (uintptr_t)e); \ -} while(0) - -#define __kdebug_only -#else /* 
!XNU_KERNEL_PRIVATE */ -#define KERNEL_DEBUG(x,a,b,c,d,e) \ -do { \ - if (kdebug_enable & ~KDEBUG_ENABLE_PPT) \ - kernel_debug((uint32_t)x, (uintptr_t)a, (uintptr_t)b, \ - (uintptr_t)c, (uintptr_t)d, (uintptr_t)e); \ -} while(0) - -#define KERNEL_DEBUG1(x,a,b,c,d,e) \ -do { \ - if (kdebug_enable & ~KDEBUG_ENABLE_PPT) \ - kernel_debug1((uint32_t)x, (uintptr_t)a, (uintptr_t)b, \ - (uintptr_t)c, (uintptr_t)d, (uintptr_t)e); \ -} while(0) -#endif /* XNU_KERNEL_PRIVATE */ -#else /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_FULL) */ - -#define KERNEL_DEBUG(x,a,b,c,d,e) do {} while (0) -#define KERNEL_DEBUG1(x,a,b,c,d,e) do {} while (0) - -#define __kdebug_only __unused -#endif /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_FULL) */ +/* + * kernel_debug_disable disables event logging, but leaves any buffers + * intact. + */ +extern void kernel_debug_disable(void); +#endif +/* + * Bits set in the comm page for kdebug. + */ +#define KDEBUG_COMMPAGE_ENABLE_TRACE 0x1 +#define KDEBUG_COMMPAGE_ENABLE_TYPEFILTER 0x2 /* Forced to false if ENABLE_TRACE is 0 */ // for EnergyTracing user space & clients #define kEnTrCompKernel 2 @@ -1121,17 +1292,19 @@ do { \ struct proc; +extern boolean_t kdebug_debugid_enabled(uint32_t debugid); +extern uint32_t kdebug_commpage_state(void); extern void kdebug_lookup_gen_events(long *dbg_parms, int dbg_namelen, void *dp, boolean_t lookup); extern void kdbg_trace_data(struct proc *proc, long *arg_pid); extern void kdbg_trace_string(struct proc *proc, long *arg1, long *arg2, long *arg3, long *arg4); extern void kdbg_dump_trace_to_file(const char *); -void start_kern_tracing(unsigned int, boolean_t); -void start_kern_tracing_with_typefilter(unsigned int, boolean_t, unsigned int); +void kdebug_boot_trace(unsigned int n_events, char *filterdesc); +void kdebug_trace_start(unsigned int n_events, const char *filterdesc, boolean_t need_map); struct task; extern void kdbg_get_task_name(char*, int, struct task *task); -void disable_wrap(uint32_t *old_slowcheck, uint32_t *old_flags); 
+boolean_t disable_wrap(uint32_t *old_slowcheck, uint32_t *old_flags); void enable_wrap(uint32_t old_slowcheck, boolean_t lostevents); void release_storage_unit(int cpu, uint32_t storage_unit); int allocate_storage_unit(int cpu); @@ -1139,38 +1312,34 @@ int allocate_storage_unit(int cpu); #define KDBG_CLASS_ENCODE(Class, SubClass) KDBG_EVENTID(Class, SubClass, 0) #define KDBG_CLASS_DECODE(Debugid) (Debugid & KDBG_CSC_MASK) - -#endif /* KERNEL_PRIVATE */ - - +#endif /* KERNEL_PRIVATE */ #endif /* __APPLE_API_UNSTABLE */ __END_DECLS - -#ifdef PRIVATE +#ifdef PRIVATE #ifdef __APPLE_API_PRIVATE /* * private kernel_debug definitions */ typedef struct { - uint64_t timestamp; - uintptr_t arg1; - uintptr_t arg2; - uintptr_t arg3; - uintptr_t arg4; - uintptr_t arg5; /* will hold current thread */ - uint32_t debugid; + uint64_t timestamp; + uintptr_t arg1; + uintptr_t arg2; + uintptr_t arg3; + uintptr_t arg4; + uintptr_t arg5; /* the thread ID */ + uint32_t debugid; #if defined(__LP64__) - uint32_t cpuid; - uintptr_t unused; + uint32_t cpuid; + uintptr_t unused; #endif } kd_buf; #if !defined(__LP64__) -#define KDBG_TIMESTAMP_MASK 0x00ffffffffffffffULL -#define KDBG_CPU_MASK 0xff00000000000000ULL -#define KDBG_CPU_SHIFT 56 +#define KDBG_TIMESTAMP_MASK 0x00ffffffffffffffULL +#define KDBG_CPU_MASK 0xff00000000000000ULL +#define KDBG_CPU_SHIFT 56 static inline void kdbg_set_cpu(kd_buf *kp, int cpu) { @@ -1228,57 +1397,93 @@ kdbg_set_timestamp_and_cpu(kd_buf *kp, uint64_t thetime, int cpu) } #endif -/* 2^16 bits (8 kilobytes), one for each possible class/subclass combination */ -#define KDBG_TYPEFILTER_BITMAP_SIZE ( (256 * 256) / 8 ) - -/* Debug Flags */ -#define KDBG_INIT 0x001 -#define KDBG_NOWRAP 0x002 -#define KDBG_FREERUN 0x004 -#define KDBG_WRAPPED 0x008 -#define KDBG_USERFLAGS (KDBG_FREERUN|KDBG_NOWRAP|KDBG_INIT) -#define KDBG_PIDCHECK 0x010 -#define KDBG_MAPINIT 0x020 -#define KDBG_PIDEXCLUDE 0x040 -#define KDBG_LOCKINIT 0x080 -#define KDBG_LP64 0x100 +/* + * 2^16 bits (8 
kilobytes), one for each possible class/subclass combination + */ +#define KDBG_TYPEFILTER_BITMAP_SIZE ((256 * 256) / 8) -typedef struct { - unsigned int type; - unsigned int value1; - unsigned int value2; - unsigned int value3; - unsigned int value4; +/* + * Bits for kd_ctrl_page.flags, KERN_KD{D,E}FLAGS. + */ +#define KDBG_INIT (1U << 0) /* obsolete */ +/* disable tracing when buffers are full */ +#define KDBG_NOWRAP (1U << 1) +#define KDBG_FREERUN (1U << 2) /* obsolete */ +/* buffer has wrapped */ +#define KDBG_WRAPPED (1U << 3) +/* flags that are allowed to be set by user space */ +#define KDBG_USERFLAGS (KDBG_FREERUN | KDBG_NOWRAP | KDBG_INIT) +/* only include processes with kdebug bit set in proc */ +#define KDBG_PIDCHECK (1U << 4) +/* thread map is initialized */ +#define KDBG_MAPINIT (1U << 5) +/* exclude processes based on kdebug bit in proc */ +#define KDBG_PIDEXCLUDE (1U << 6) +/* whether the kdebug locks are initialized */ +#define KDBG_LOCKINIT (1U << 7) +/* word size of the kernel */ +#define KDBG_LP64 (1U << 8) + +/* bits for kd_ctrl_page.flags and kbufinfo_t.flags */ + +/* only trace events within a range */ +#define KDBG_RANGECHECK 0x00100000U +/* only trace at most 4 types of events, at the code granularity */ +#define KDBG_VALCHECK 0x00200000U +/* check class and subclass against the typefilter */ +#define KDBG_TYPEFILTER_CHECK 0x00400000U +/* kdebug trace buffers are initialized */ +#define KDBG_BUFINIT 0x80000000U + +/* bits for the type field of kd_regtype */ +#define KDBG_CLASSTYPE 0x10000 +#define KDBG_SUBCLSTYPE 0x20000 +#define KDBG_RANGETYPE 0x40000 +#define KDBG_TYPENONE 0x80000 +#define KDBG_CKTYPES 0xF0000 +typedef struct { + unsigned int type; + unsigned int value1; + unsigned int value2; + unsigned int value3; + unsigned int value4; } kd_regtype; -typedef struct -{ - int nkdbufs; - int nolog; - int flags; - int nkdthreads; - int bufid; +typedef struct { + /* number of events that can fit in the buffers */ + int nkdbufs; + /* set if 
trace is disabled */ + int nolog; + /* kd_ctrl_page.flags */ + unsigned int flags; + /* number of threads in thread map */ + int nkdthreads; + /* the owning pid */ + int bufid; } kbufinfo_t; typedef struct { - uintptr_t thread; - int valid; - char command[20]; + /* the thread ID */ + uintptr_t thread; + /* 0 for invalid, otherwise the PID (or 1 for kernel_task) */ + int valid; + /* the name of the process owning the thread */ + char command[20]; } kd_threadmap; typedef struct { - uint32_t version_no; - uint32_t cpu_count; + uint32_t version_no; + uint32_t cpu_count; } kd_cpumap_header; /* cpumap flags */ #define KDBG_CPUMAP_IS_IOP 0x1 typedef struct { - uint32_t cpu_id; - uint32_t flags; - char name[8]; + uint32_t cpu_id; + uint32_t flags; + char name[8]; } kd_cpumap; /* @@ -1370,29 +1575,29 @@ typedef struct { // The CPU map is an optional sub-chunk of the header chunk. It provides // information about the CPUs that are referenced from the trace events. typedef struct { - uint32_t tag; - uint32_t sub_tag; - uint64_t length; - uint32_t timebase_numer; - uint32_t timebase_denom; - uint64_t timestamp; - uint64_t walltime_secs; - uint32_t walltime_usecs; - uint32_t timezone_minuteswest; - uint32_t timezone_dst; - uint32_t flags; -} kd_header_v3; + uint32_t tag; + uint32_t sub_tag; + uint64_t length; + uint32_t timebase_numer; + uint32_t timebase_denom; + uint64_t timestamp; + uint64_t walltime_secs; + uint32_t walltime_usecs; + uint32_t timezone_minuteswest; + uint32_t timezone_dst; + uint32_t flags; +} __attribute__((packed)) kd_header_v3; typedef struct { uint32_t tag; uint32_t sub_tag; uint64_t length; -} kd_chunk_header_v3; +} __attribute__((packed)) kd_chunk_header_v3; #define RAW_VERSION0 0x55aa0000 #define RAW_VERSION1 0x55aa0101 #define RAW_VERSION2 0x55aa0200 /* Only used by kperf and Instruments */ -#define RAW_VERSION3 0x00001000 +#define RAW_VERSION3 0x00001000 #define V3_CONFIG 0x00001b00 #define V3_CPU_MAP 0x00001c00 @@ -1414,33 +1619,15 @@ typedef 
struct { int kdbg_write_v3_chunk_header_to_buffer(void *buffer, uint32_t tag, uint32_t sub_tag, uint64_t length); int kdbg_write_v3_chunk_to_fd(uint32_t tag, uint32_t sub_tag, uint64_t length, void *payload, uint64_t payload_size, int fd); -#define KDBG_CLASSTYPE 0x10000 -#define KDBG_SUBCLSTYPE 0x20000 -#define KDBG_RANGETYPE 0x40000 -#define KDBG_TYPENONE 0x80000 -#define KDBG_CKTYPES 0xF0000 - -#define KDBG_RANGECHECK 0x100000 -#define KDBG_VALCHECK 0x200000 /* Check up to 4 individual values */ - -#define KDBG_TYPEFILTER_CHECK ((uint32_t) 0x400000) /* Check class and subclass against a bitmap */ - -#define KDBG_BUFINIT 0x80000000 - -/* Minimum value allowed when setting decrementer ticks */ -#define KDBG_MINRTCDEC 2500 - /* VFS lookup events for serial traces */ #define VFS_LOOKUP (FSDBG_CODE(DBG_FSRW,36)) #define VFS_LOOKUP_DONE (FSDBG_CODE(DBG_FSRW,39)) -#ifdef XNU_KERNEL_PRIVATE -#if (DEVELOPMENT || DEBUG) +#if defined(XNU_KERNEL_PRIVATE) && (DEVELOPMENT || DEBUG) #define KDEBUG_MOJO_TRACE 1 #endif -#endif #endif /* __APPLE_API_PRIVATE */ -#endif /* PRIVATE */ +#endif /* PRIVATE */ #endif /* !BSD_SYS_KDEBUG_H */ diff --git a/bsd/sys/kdebug_signpost.h b/bsd/sys/kdebug_signpost.h new file mode 100644 index 000000000..e4332be3b --- /dev/null +++ b/bsd/sys/kdebug_signpost.h @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2016 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef BSD_SYS_KDEBUG_SIGNPOST_H +#define BSD_SYS_KDEBUG_SIGNPOST_H + +#include +#include +#include + +__BEGIN_DECLS + +#ifndef KERNEL + +/* + * In previous versions of the operating system, applications could use: + * + * syscall(SYS_kdebug_trace, APPSDBG_CODE(DBG_MACH_CHUD, ) | DBG_FUNC_, arg1, arg2, arg3, arg4); + * + * to record events that would be displayed by Instruments. + * + * syscall(2) is now deprecated and this interface replaces the above call as follows: + * + * The code argument is . Only the low 14-bits of the code are + * preserved. + */ + +/* + * When is NONE, use kdebug_signpost. + */ +int kdebug_signpost(uint32_t code, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3, uintptr_t arg4) + __OSX_AVAILABLE(10.12) __IOS_AVAILABLE(10.0) __WATCHOS_AVAILABLE(3.0) __TVOS_AVAILABLE(10.0); + +/* + * When is START, use kdebug_signpost_start. 
+ */ +int kdebug_signpost_start(uint32_t code, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3, uintptr_t arg4) + __OSX_AVAILABLE(10.12) __IOS_AVAILABLE(10.0) __WATCHOS_AVAILABLE(3.0) __TVOS_AVAILABLE(10.0); + +/* + * When is END, use kdebug_signpost_end. + */ +int kdebug_signpost_end(uint32_t code, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3, uintptr_t arg4) + __OSX_AVAILABLE(10.12) __IOS_AVAILABLE(10.0) __WATCHOS_AVAILABLE(3.0) __TVOS_AVAILABLE(10.0); + +#endif /* !KERNEL */ + +__END_DECLS + +#endif /* !BSD_SYS_KDEBUG_SIGNPOST_H */ diff --git a/bsd/sys/kern_control.h b/bsd/sys/kern_control.h index 3d87bce89..ba5f6be37 100644 --- a/bsd/sys/kern_control.h +++ b/bsd/sys/kern_control.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2004, 2012-2015 Apple Inc. All rights reserved. + * Copyright (c) 2000-2004, 2012-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -188,6 +188,9 @@ struct kctlstat { u_int64_t kcs_enqueue_fail __attribute__((aligned(8))); u_int64_t kcs_enqueue_fullsock __attribute__((aligned(8))); u_int64_t kcs_bad_kctlref __attribute__((aligned(8))); + u_int64_t kcs_tbl_size_too_big __attribute__((aligned(8))); + u_int64_t kcs_enqdata_mb_alloc_fail __attribute__((aligned(8))); + u_int64_t kcs_enqdata_sbappend_fail __attribute__((aligned(8))); }; #endif /* PRIVATE */ @@ -547,7 +550,7 @@ ctl_enqueuembuf(kern_ctl_ref kctlref, u_int32_t unit, mbuf_t m, u_int32_t flags) Not valid if ctl_flags contains CTL_FLAG_REG_SOCK_STREAM. @param kctlref The control reference of the kernel control. @param unit The unit number of the kernel control instance. - @param m An mbuf chain containing the data to send to the client. + @param m_list An mbuf chain containing the data to send to the client. @param flags Send flags. CTL_DATA_NOWAKEUP is the only supported flags. 
@param m_remain A pointer to the list of mbuf packets in the chain that @@ -594,7 +597,7 @@ ctl_getenqueuespace(kern_ctl_ref kctlref, u_int32_t unit, size_t *space); low-water mark for the socket receive buffer. @param kctlref The control reference of the kernel control. @param unit The unit number of the kernel control instance. - @param u_int32_t The address at which to return the current difference + @param difference The address at which to return the current difference between the low-water mark for the socket and the number of bytes enqueued. 0 indicates that the socket is readable by the client (the number of bytes in the buffer is above the low-water mark). diff --git a/bsd/sys/kern_memorystatus.h b/bsd/sys/kern_memorystatus.h index 0cb7e52b7..de639b6f6 100644 --- a/bsd/sys/kern_memorystatus.h +++ b/bsd/sys/kern_memorystatus.h @@ -34,14 +34,19 @@ #include #include +#define MEMORYSTATUS_ENTITLEMENT "com.apple.private.memorystatus" + #define JETSAM_PRIORITY_REVISION 2 #define JETSAM_PRIORITY_IDLE_HEAD -2 /* The value -1 is an alias to JETSAM_PRIORITY_DEFAULT */ #define JETSAM_PRIORITY_IDLE 0 -#define JETSAM_PRIORITY_IDLE_DEFERRED 1 +#define JETSAM_PRIORITY_IDLE_DEFERRED 1 /* Keeping this around till all xnu_quick_tests can be moved away from it.*/ +#define JETSAM_PRIORITY_AGING_BAND1 JETSAM_PRIORITY_IDLE_DEFERRED #define JETSAM_PRIORITY_BACKGROUND_OPPORTUNISTIC 2 +#define JETSAM_PRIORITY_AGING_BAND2 JETSAM_PRIORITY_BACKGROUND_OPPORTUNISTIC #define JETSAM_PRIORITY_BACKGROUND 3 +#define JETSAM_PRIORITY_ELEVATED_INACTIVE JETSAM_PRIORITY_BACKGROUND #define JETSAM_PRIORITY_MAIL 4 #define JETSAM_PRIORITY_PHONE 5 #define JETSAM_PRIORITY_UI_SUPPORT 8 @@ -86,7 +91,7 @@ typedef struct memorystatus_priority_entry { pid_t pid; int32_t priority; uint64_t user_data; - int32_t limit; + int32_t limit; /* MB */ uint32_t state; } memorystatus_priority_entry_t; @@ -113,24 +118,39 @@ typedef struct memorystatus_kernel_stats { typedef struct jetsam_snapshot_entry { pid_t pid; - 
char name[MAXCOMLEN+1]; + char name[(2*MAXCOMLEN)+1]; int32_t priority; - uint32_t pages; - uint32_t max_pages; uint32_t state; - uint32_t killed; - uint64_t user_data; - uint8_t uuid[16]; uint32_t fds; - uint32_t max_pages_lifetime; - uint32_t purgeable_pages; + uint8_t uuid[16]; + uint64_t user_data; + uint64_t killed; + uint64_t pages; + uint64_t max_pages; + uint64_t max_pages_lifetime; + uint64_t purgeable_pages; + uint64_t jse_internal_pages; + uint64_t jse_internal_compressed_pages; + uint64_t jse_purgeable_nonvolatile_pages; + uint64_t jse_purgeable_nonvolatile_compressed_pages; + uint64_t jse_alternate_accounting_pages; + uint64_t jse_alternate_accounting_compressed_pages; + uint64_t jse_iokit_mapped_pages; + uint64_t jse_page_table_pages; + uint64_t jse_memory_region_count; + uint64_t jse_gencount; /* memorystatus_thread generation counter */ + uint64_t jse_starttime; /* absolute time when process starts */ + uint64_t jse_killtime; /* absolute time when jetsam chooses to kill a process */ + uint64_t jse_idle_delta; /* time spent in idle band */ + uint64_t jse_coalition_jetsam_id; /* we only expose coalition id for COALITION_TYPE_JETSAM */ struct timeval cpu_time; } memorystatus_jetsam_snapshot_entry_t; typedef struct jetsam_snapshot { - uint64_t snapshot_time; - uint64_t notification_time; - memorystatus_kernel_stats_t stats; + uint64_t snapshot_time; /* absolute time snapshot was initialized */ + uint64_t notification_time; /* absolute time snapshot was consumed */ + uint64_t js_gencount; /* memorystatus_thread generation counter */ + memorystatus_kernel_stats_t stats; /* system stat when snapshot is initialized */ size_t entry_count; memorystatus_jetsam_snapshot_entry_t entries[]; } memorystatus_jetsam_snapshot_t; @@ -165,6 +185,19 @@ enum { kMemorystatusKilledIdleExit }; +/* Jetsam exit reason definitions */ +#define JETSAM_REASON_INVALID 0 +#define JETSAM_REASON_GENERIC 1 +#define JETSAM_REASON_MEMORY_HIGHWATER 2 +#define JETSAM_REASON_VNODE 3 
+#define JETSAM_REASON_MEMORY_VMPAGESHORTAGE 4 +#define JETSAM_REASON_MEMORY_VMTHRASHING 5 +#define JETSAM_REASON_MEMORY_FCTHRASHING 6 +#define JETSAM_REASON_MEMORY_PERPROCESSLIMIT 7 +#define JETSAM_REASON_MEMORY_DIAGNOSTIC 8 +#define JETSAM_REASON_MEMORY_IDLE_EXIT 9 +#define JETSAM_REASON_CPULIMIT 10 + /* Temporary, to prevent the need for a linked submission of ReportCrash */ /* Remove when has been integrated */ enum { @@ -192,6 +225,10 @@ int memorystatus_control(uint32_t command, int32_t pid, uint32_t flags, void *bu #define MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_DISABLE 10 /* Reset the task's status as a privileged listener w.r.t memory notifications */ #define MEMORYSTATUS_CMD_AGGRESSIVE_JETSAM_LENIENT_MODE_ENABLE 11 /* Enable the 'lenient' mode for aggressive jetsam. See comments in kern_memorystatus.c near the top. */ #define MEMORYSTATUS_CMD_AGGRESSIVE_JETSAM_LENIENT_MODE_DISABLE 12 /* Disable the 'lenient' mode for aggressive jetsam. */ +#define MEMORYSTATUS_CMD_GET_MEMLIMIT_EXCESS 13 /* Compute how much a process's phys_footprint exceeds inactive memory limit */ +#define MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_ENABLE 14 +#define MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_DISABLE 15 + /* Commands that act on a group of processes */ #define MEMORYSTATUS_CMD_GRP_SET_PROPERTIES 100 @@ -267,7 +304,6 @@ typedef struct memorystatus_memlimit_properties { #define MEMORYSTATUS_MEMLIMIT_ATTR_FATAL 0x1 /* if set, exceeding the memlimit is fatal */ - #ifdef XNU_KERNEL_PRIVATE /* @@ -321,6 +357,8 @@ typedef struct memorystatus_memlimit_properties { #define P_MEMSTAT_MEMLIMIT_ACTIVE_EXC_TRIGGERED 0x00008000 /* if set, supresses high-water-mark EXC_RESOURCE, allows one hit per active limit */ #define P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL 0x00010000 /* if set, exceeding limit is fatal when the process is inactive */ #define P_MEMSTAT_MEMLIMIT_INACTIVE_EXC_TRIGGERED 0x00020000 /* if set, supresses high-water-mark EXC_RESOURCE, allows one hit per inactive limit 
*/ +#define P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND 0x00040000 /* if set, the process will go into this band & stay there when in the background instead + of the aging bands and/or the IDLE band. */ extern void memorystatus_init(void) __attribute__((section("__TEXT, initcode"))); @@ -333,6 +371,9 @@ extern int memorystatus_update(proc_t p, int priority, uint64_t user_data, boole extern int memorystatus_remove(proc_t p, boolean_t locked); +int memorystatus_update_inactive_jetsam_priority_band(pid_t pid, uint32_t opflags, boolean_t effective_now); + + extern int memorystatus_dirty_track(proc_t p, uint32_t pcontrol); extern int memorystatus_dirty_set(proc_t p, boolean_t self, uint32_t pcontrol); extern int memorystatus_dirty_get(proc_t p); @@ -353,6 +394,13 @@ void memorystatus_kevent_init(lck_grp_t *grp, lck_attr_t *attr); int memorystatus_knote_register(struct knote *kn); void memorystatus_knote_unregister(struct knote *kn); +#if CONFIG_MEMORYSTATUS +boolean_t memorystatus_turnoff_exception_and_get_fatalness(boolean_t warning, const int max_footprint_mb); +void memorystatus_on_ledger_footprint_exceeded(int warning, boolean_t is_fatal); +void proc_memstat_terminated(proc_t p, boolean_t set); +boolean_t memorystatus_proc_is_dirty_unsafe(void *v); +#endif /* CONFIG_MEMORYSTATUS */ + #if CONFIG_JETSAM int memorystatus_get_pressure_status_kdp(void); @@ -373,8 +421,6 @@ boolean_t memorystatus_kill_on_VM_thrashing(boolean_t async); boolean_t memorystatus_kill_on_FC_thrashing(boolean_t async); boolean_t memorystatus_kill_on_vnode_limit(void); -void memorystatus_on_ledger_footprint_exceeded(int warning, const int max_footprint_mb); -void proc_memstat_terminated(proc_t p, boolean_t set); void jetsam_on_ledger_cpulimit_exceeded(void); void memorystatus_pages_update(unsigned int pages_avail); diff --git a/bsd/sys/kernel.h b/bsd/sys/kernel.h index 2674b234c..35555f842 100644 --- a/bsd/sys/kernel.h +++ b/bsd/sys/kernel.h @@ -75,6 +75,8 @@ #include #ifdef BSD_KERNEL_PRIVATE +#include 
+ /* Global variables for the kernel. */ /* 1.1 */ @@ -87,6 +89,8 @@ extern int domainnamelen; /* 1.2 */ extern int stathz; /* statistics clock's frequency */ extern int profhz; /* profiling clock's frequency */ + +extern bool send_sigsys; #endif /* BSD_KERNEL_PRIVATE */ diff --git a/bsd/sys/kpi_mbuf.h b/bsd/sys/kpi_mbuf.h index 50cda45db..6b5693c00 100644 --- a/bsd/sys/kpi_mbuf.h +++ b/bsd/sys/kpi_mbuf.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008-2015 Apple Inc. All rights reserved. + * Copyright (c) 2008-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -47,6 +47,9 @@ #define __KPI_MBUF__ #include #include +#ifdef KERNEL_PRIVATE +#include +#endif /* KERNEL_PRIVATE */ /*! @enum mbuf_flags_t @@ -444,6 +447,20 @@ extern errno_t mbuf_alloccluster(mbuf_how_t how, size_t *size, caddr_t *addr); */ extern void mbuf_freecluster(caddr_t addr, size_t size); +#ifdef BSD_KERNEL_PRIVATE +/* + * For now, restrict these to BSD kernel privates, since they are + * used only by the Nexus netif compatibility code. + */ +extern errno_t mbuf_ring_cluster_alloc(mbuf_how_t how, mbuf_type_t type, + mbuf_t *mbuf, void (*extfree)(caddr_t, u_int, caddr_t), size_t *size); +extern int mbuf_ring_cluster_is_active(mbuf_t mbuf); +extern errno_t mbuf_ring_cluster_activate(mbuf_t mbuf); +extern errno_t mbuf_cluster_set_prop(mbuf_t mbuf, u_int32_t oldprop, + u_int32_t newprop); +extern errno_t mbuf_cluster_get_prop(mbuf_t mbuf, u_int32_t *prop); +#endif /* BSD_KERNEL_PRIVATE */ + /*! @function mbuf_getcluster @discussion Allocate a cluster of the requested size and attach it to @@ -519,7 +536,7 @@ extern errno_t mbuf_mclget(mbuf_how_t how, mbuf_type_t type, mbuf_t *mbuf); pointed to by maxchunks. E.g. a request for 9018 bytes may result in 1 chunk when jumbo clusters are available, or 3 chunks otherwise. - @param Upon success, *mbuf will be a reference to the new mbuf. + @param mbuf Upon success, *mbuf will be a reference to the new mbuf. 
@result Returns 0 upon success or the following error code: EINVAL - Invalid parameter ENOMEM - Not enough memory available @@ -558,7 +575,7 @@ extern errno_t mbuf_allocpacket(mbuf_how_t how, size_t packetlen, pointed to by maxchunks. E.g. a request for 9018 bytes may result in 1 chunk when jumbo clusters are available, or 3 chunks otherwise. - @param Upon success, *mbuf will be a reference to the new mbuf. + @param mbuf Upon success, *mbuf will be a reference to the new mbuf. @result Returns 0 upon success or the following error code: EINVAL - Invalid parameter ENOMEM - Not enough memory available @@ -862,7 +879,6 @@ extern size_t mbuf_len(const mbuf_t mbuf); not set the length over the space available in the mbuf. @param mbuf The mbuf. @param len The new length. - @result 0 upon success otherwise the errno error. */ extern void mbuf_setlen(mbuf_t mbuf, size_t len); @@ -929,7 +945,7 @@ extern errno_t mbuf_setflags_mask(mbuf_t mbuf, mbuf_flags_t flags, @function mbuf_copy_pkthdr @discussion Copies the packet header from src to dest. @param src The mbuf from which the packet header will be copied. - @param mbuf The mbuf to which the packet header will be copied. + @param dest The mbuf to which the packet header will be copied. @result 0 upon success otherwise the errno error. */ extern errno_t mbuf_copy_pkthdr(mbuf_t dest, const mbuf_t src); @@ -991,7 +1007,7 @@ extern ifnet_t mbuf_pkthdr_rcvif(const mbuf_t mbuf); @function mbuf_pkthdr_setrcvif @discussion Sets the interface the packet was received on. @param mbuf The mbuf containing the packet header. - @param ifnet A reference to an interface. + @param ifp A reference to an interface. @result 0 upon success otherwise the errno error. */ extern errno_t mbuf_pkthdr_setrcvif(mbuf_t mbuf, ifnet_t ifp); @@ -1008,8 +1024,7 @@ extern void *mbuf_pkthdr_header(const mbuf_t mbuf); @function mbuf_pkthdr_setheader @discussion Sets the pointer to the packet header. @param mbuf The mbuf containing the packet header. 
- @param ifnet A pointer to the header. - @result 0 upon success otherwise the errno error. + @param header A pointer to the header. */ extern void mbuf_pkthdr_setheader(mbuf_t mbuf, void *header); @@ -1103,7 +1118,7 @@ extern errno_t mbuf_get_vlan_tag(mbuf_t mbuf, u_int16_t *vlan); extern errno_t mbuf_clear_vlan_tag(mbuf_t mbuf); #ifdef KERNEL_PRIVATE -/* +/*! @function mbuf_set_csum_requested @discussion This function is used by the stack to indicate which checksums should be calculated in hardware. The stack normally @@ -1224,7 +1239,7 @@ extern errno_t mbuf_clear_csum_performed(mbuf_t mbuf); /*! @function mbuf_inet_cksum - @discussions Calculates 16-bit 1's complement Internet checksum of the + @discussion Calculates 16-bit 1's complement Internet checksum of the transport segment with or without the pseudo header checksum of a given IPv4 packet. If the caller specifies a non-zero transport protocol, the checksum returned will also include @@ -1253,7 +1268,7 @@ extern errno_t mbuf_inet_cksum(mbuf_t mbuf, int protocol, u_int32_t offset, /*! @function mbuf_inet6_cksum - @discussions Calculates 16-bit 1's complement Internet checksum of the + @discussion Calculates 16-bit 1's complement Internet checksum of the transport segment with or without the pseudo header checksum of a given IPv6 packet. If the caller specifies a non-zero transport protocol, the checksum returned will also include @@ -1353,7 +1368,7 @@ extern void mbuf_tag_free(mbuf_t mbuf, mbuf_tag_id_t module_id, mbuf_tag_type_t type); #ifdef KERNEL_PRIVATE -/* +/*! @function mbuf_add_drvaux @discussion Allocate space for driver auxiliary data and attach it to the packet (MBUF_PKTHDR is required.) This space is freed @@ -1380,7 +1395,7 @@ extern void mbuf_tag_free(mbuf_t mbuf, mbuf_tag_id_t module_id, extern errno_t mbuf_add_drvaux(mbuf_t mbuf, mbuf_how_t how, u_int32_t family, u_int32_t subfamily, size_t length, void **data_p); -/* +/*! 
@function mbuf_find_drvaux @discussion Find the driver auxiliary data associated with a packet. @param mbuf The mbuf the auxiliary data is attached to. @@ -1400,7 +1415,7 @@ extern errno_t mbuf_add_drvaux(mbuf_t mbuf, mbuf_how_t how, extern errno_t mbuf_find_drvaux(mbuf_t mbuf, u_int32_t *family_p, u_int32_t *subfamily_p, u_int32_t *length_p, void **data_p); -/* +/*! @function mbuf_del_drvaux @discussion Remove and free any driver auxility data associated with the packet. @@ -1455,7 +1470,7 @@ extern mbuf_traffic_class_t mbuf_get_traffic_class(mbuf_t mbuf); @function mbuf_set_traffic_class @discussion Set the traffic class of an mbuf packet. @param mbuf The mbuf to set the traffic class on. - @tc The traffic class + @param tc The traffic class @result 0 on success, EINVAL if bad parameter is passed */ extern errno_t mbuf_set_traffic_class(mbuf_t mbuf, mbuf_traffic_class_t tc); @@ -1575,7 +1590,7 @@ extern mbuf_svc_class_t mbuf_get_service_class(mbuf_t mbuf); @function mbuf_set_servicec_class @discussion Set the service class of an mbuf packet. @param mbuf The mbuf to set the service class on. - @sc The service class + @param sc The service class @result 0 on success, EINVAL if bad parameter is passed */ extern errno_t mbuf_set_service_class(mbuf_t mbuf, mbuf_svc_class_t sc); @@ -1589,7 +1604,7 @@ extern errno_t mbuf_set_service_class(mbuf_t mbuf, mbuf_svc_class_t sc); */ extern int mbuf_is_service_class_privileged(mbuf_t mbuf); -/* +/*! @enum mbuf_pkthdr_aux_flags_t @abstract Constants defining mbuf auxiliary flags. Only the flags listed below can be retrieved. @@ -1606,7 +1621,7 @@ enum { }; typedef u_int32_t mbuf_pkthdr_aux_flags_t; -/* +/*! @function mbuf_pkthdr_aux_flags @discussion Returns the auxiliary flags of a packet. @param mbuf The mbuf containing the packet header. @@ -1616,7 +1631,7 @@ typedef u_int32_t mbuf_pkthdr_aux_flags_t; extern errno_t mbuf_pkthdr_aux_flags(mbuf_t mbuf, mbuf_pkthdr_aux_flags_t *paux_flags); -/* +/*! 
@function mbuf_get_driver_scratch @discussion Returns a pointer to a driver specific area in the mbuf @param m The mbuf whose driver scratch space is to be returned @@ -1629,13 +1644,13 @@ extern errno_t mbuf_pkthdr_aux_flags(mbuf_t mbuf, extern errno_t mbuf_get_driver_scratch(mbuf_t m, u_int8_t **area, size_t *area_ln); -/* +/*! @function mbuf_get_unsent_data_bytes @discussion Returns the amount of data that is waiting to be sent on this interface. This is a private SPI used by cellular interface as an indication of future activity on that interface. - @param mbuf The mbuf containingthe packet header + @param m The mbuf containing the packet header @param unsent_data A pointer to an integer where the value of unsent data will be set. @result 0 upon success otherwise the errno error. If the mbuf @@ -1644,6 +1659,58 @@ extern errno_t mbuf_get_driver_scratch(mbuf_t m, u_int8_t **area, */ extern errno_t mbuf_get_unsent_data_bytes(const mbuf_t m, u_int32_t *unsent_data); + +typedef struct { + int32_t buf_interface; /* data to send at interface */ + int32_t buf_sndbuf; /* data to send at socket buffer */ +} mbuf_buffer_status_t; + +/*! + @function mbuf_get_buffer_status + @discussion Returns the amount of data that is waiting to be sent + on this interface. This is a private SPI used by cellular + interface as an indication of future activity on that + interface. + @param m The mbuf containing the packet header + @param buf_status A pointer to the structure where the value of + unsent data will be set. + @result 0 upon success. If any of the arguments is NULL or if the + mbuf packet header does not have valid data bytes, + EINVAL will be returned + */ +extern errno_t mbuf_get_buffer_status(const mbuf_t m, + mbuf_buffer_status_t *buf_status); + +/*! + @function mbuf_pkt_new_flow + @discussion This function is used to check if the packet is from a + new flow that can be treated with higher priority. This is + a private SPI. 
+ @param m The mbuf containing the packet header + @param retval A pointer to an integer used as an out argument. The + value is set to 1 if the packet is from a new flow, + otherwise it is set to 0. + @result 0 upon success otherwise the errno error. If any of the + arguments is NULL or if the mbuf does not have valid packet + header, the error code will be EINVAL + */ +extern errno_t mbuf_pkt_new_flow(const mbuf_t m, u_int32_t *retval); + +/*! + @function mbuf_last_pkt + @discussion This function is used to check if the packet is the + last one sent on a TCP socket. This is an advisory + for the underlying layers. + @param m The mbuf containing the packet header + @param retval A pointer to an integer whose value will be set to + 1 if the packet is the last packet, otherwise it will + be set to 0. + @result 0 upon success otherwise the errno error. If any of the + arguments is NULL or if the mbuf does not have valid + packet header, the error code will be EINVAL + */ +extern errno_t mbuf_last_pkt(const mbuf_t m, u_int32_t *retval); + #endif /* KERNEL_PRIVATE */ #ifdef XNU_KERNEL_PRIVATE @@ -1667,6 +1734,181 @@ extern size_t mbuf_pkt_list_len(const mbuf_t mbuf); extern size_t mbuf_pkt_list_maxlen(const mbuf_t mbuf); #endif /* XNU_KERNEL_PRIVATE */ +#ifdef KERNEL_PRIVATE +/*! + @function mbuf_get_timestamp + @discussion Retrieves the timestamp of the packet. + @param mbuf The mbuf representing the packet. + @param ts A pointer where the value of the timestamp will be copied + to. + @param valid A pointer to a boolean value that indicate if the + timestamp is valid (i.e. the packet timestamp has been set). + If "false" the value of "ts" is undetermined. + @result 0 upon success otherwise the errno error. If the mbuf + packet header does not have valid data bytes, the error + code will be EINVAL + */ +extern errno_t mbuf_get_timestamp(mbuf_t mbuf, u_int64_t *ts, boolean_t *valid); + +/*! + @function mbuf_set_timestamp + @discussion Set the timestamp of the packet. 
+ @param mbuf The mbuf representing the packet. + @param ts The value of the timestamp to be stored in the mbuf packet + header + @param valid A boolean value that indicates if the timestamp is valid. + Passing false clears any previous timestamp value. + @result 0 upon success otherwise the errno error. If the mbuf + packet header does not have valid data bytes, the error + code will be EINVAL + */ +extern errno_t mbuf_set_timestamp(mbuf_t mbuf, u_int64_t ts, boolean_t valid); + +/*! + @typedef mbuf_tx_compl_func + @discussion This callback is used to indicate when a driver has + transmitted a packet. + @param pktid The packet identifier that was returned by + mbuf_set_timestamp_requested() + @param ifp The outgoing interface or NULL if the packet was dropped + before reaching the driver + @param ts The timestamp in nanoseconds when the packet was transmitted + @param tx_compl_arg An argument set by the driver + @param tx_compl_data Additional data set by the driver + @param tx_compl_val The transmission status is expected to be an + IOReturn value -- see +*/ + +typedef void (*mbuf_tx_compl_func)(uintptr_t pktid, ifnet_t ifp, u_int64_t ts, + uintptr_t tx_compl_arg, uintptr_t tx_compl_data, kern_return_t tx_compl_val); + +/*! + @function mbuf_register_tx_compl_callback + @discussion Register a transmit completion callback function. The + callback function must be unregistered before the calling + module unloads. + @param callback The completion callback function to register + @result 0 upon success otherwise the errno error. ENOSPC is returned + if too many callbacks are registered. EINVAL is returned when + the function pointer is invalid. EEXIST is returned when + the function pointer is already registered. + */ +extern errno_t mbuf_register_tx_compl_callback( + mbuf_tx_compl_func callback); + +/*! + @function mbuf_unregister_tx_compl_callback + @discussion Unregister a transmit completion callback function.
The + callback function must be unregistered before the calling + module unloads. + @param callback The completion callback function to unregister + @result 0 upon success otherwise the errno error. EINVAL is returned + when the function pointer is invalid. ENOENT is returned when + the function pointer is not registered. + */ +extern errno_t mbuf_unregister_tx_compl_callback( + mbuf_tx_compl_func callback); + +/*! + @function mbuf_get_timestamp_requested + @discussion Tell if the packet timestamp needs to be set. This is meant + to be used by a driver on egress packets. + @param mbuf The mbuf representing the packet. + @param requested A pointer to a boolean value that indicates if the + timestamp was requested to be set. + @result 0 upon success otherwise the errno error. If the mbuf + packet header does not have valid data bytes, the error + code will be EINVAL + */ +extern errno_t mbuf_get_timestamp_requested(mbuf_t mbuf, boolean_t *requested); + +/*! + @function mbuf_set_timestamp_requested + @discussion Indicate the callback is expected to be called with the + transmission complete timestamp. This is meant to be used + on egress packets by the driver. + @param mbuf The mbuf representing the packet. + @param callback A previously registered completion callback function. + @param pktid An output parameter with an opaque value that can be used + to identify the packet. + @result 0 upon success otherwise the errno error. EINVAL is returned + if the mbuf is not a valid packet or if one of the parameters + is NULL. ENOENT is returned if the callback is not registered. + */ +extern errno_t mbuf_set_timestamp_requested(mbuf_t mbuf, + uintptr_t *pktid, mbuf_tx_compl_func callback); + +/*! + @function mbuf_get_status + @discussion Retrieves the packet completion status. + @param mbuf The mbuf representing the packet. + @param status A pointer where the value of the completion status will + be copied to. + @result 0 upon success otherwise the errno error.
If the mbuf + packet header does not have valid data bytes, the error + code will be EINVAL + */ +extern errno_t mbuf_get_status(mbuf_t mbuf, kern_return_t *status); + +/*! + @function mbuf_set_status + @discussion Store the packet completion status in the mbuf packet + header. + @param mbuf The mbuf representing the packet. + @param status The value of the completion status. + @result 0 upon success otherwise the errno error. If the mbuf + packet header does not have valid data bytes, the error + code will be EINVAL + */ +extern errno_t mbuf_set_status(mbuf_t mbuf, kern_return_t status); + +/*! + @function mbuf_get_tx_compl_data + @discussion Retrieves the transmit completion argument and data of the packet. + @param m The mbuf representing the packet. + @result 0 upon success otherwise the errno error. If the mbuf + packet header does not have valid data bytes, the error + code will be EINVAL + */ +extern errno_t mbuf_get_tx_compl_data(mbuf_t m, uintptr_t *arg, + uintptr_t *data); + +/*! + @function mbuf_set_tx_compl_data + @discussion Stores the transmit completion argument and data in the packet. + @param m The mbuf representing the packet. + @result 0 upon success otherwise the errno error. If the mbuf + packet header does not have valid data bytes, the error + code will be EINVAL + */ +extern errno_t mbuf_set_tx_compl_data(mbuf_t m, uintptr_t arg, + uintptr_t data); + +/*! + @function mbuf_get_flowid + @discussion Retrieve the flow ID of the packet. + @param mbuf The mbuf representing the packet. + @param flowid The flow ID of the packet. + @result 0 upon success otherwise the errno error. If the mbuf + packet header does not have valid data bytes, the error + code will be EINVAL + */ +extern errno_t mbuf_get_flowid(mbuf_t mbuf, u_int16_t *flowid); + +/*! + @function mbuf_set_flowid + @discussion Set the flow ID of the packet. + @param mbuf The mbuf representing the packet. + @param flowid The flow ID to be set. + @result 0 upon success otherwise the errno error.
If the mbuf + packet header does not have valid data bytes, the error + code will be EINVAL + */ +extern errno_t mbuf_set_flowid(mbuf_t mbuf, u_int16_t flowid); + + +#endif /* KERNEL_PRIVATE */ + /* IF_QUEUE interaction */ #define IF_ENQUEUE_MBUF(ifq, m) { \ diff --git a/bsd/sys/kpi_socket.h b/bsd/sys/kpi_socket.h index ff2475e98..6045af624 100644 --- a/bsd/sys/kpi_socket.h +++ b/bsd/sys/kpi_socket.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008-2012 Apple Inc. All rights reserved. + * Copyright (c) 2008-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -79,7 +79,7 @@ typedef void (*sock_upcall)(socket_t so, void *cookie, int waitf); when an event status is available. @param so A reference to the socket that's ready. @param cookie The cookie passed in when the socket was created. - @param int Indicates the event as defined by SO_FILT_HINT_* + @param event Indicates the event as defined by SO_FILT_HINT_* */ typedef void (*sock_evupcall)(socket_t so, void *cookie, u_int32_t event); #endif /* KERNEL_PRIVATE */ diff --git a/bsd/sys/kpi_socketfilter.h b/bsd/sys/kpi_socketfilter.h index 14fef1c2a..bb82c5439 100644 --- a/bsd/sys/kpi_socketfilter.h +++ b/bsd/sys/kpi_socketfilter.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2008-2016 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -182,7 +182,7 @@ typedef errno_t (*sf_attach_func)(void **cookie, socket_t so); @param cookie Cookie value specified when the filter attach was called. @param so The socket the filter is attached to. - @result If you return a non-zero value, your filter will not be + @discussion If you return a non-zero value, your filter will not be attached to this socket. 
*/ typedef void (*sf_detach_func)(void *cookie, socket_t so); @@ -285,7 +285,7 @@ typedef errno_t (*sf_data_in_func)(void *cookie, socket_t so, @param cookie Cookie value specified when the filter attach was called. @param so The socket the filter is attached to. - @param from The address the data is from, may be NULL if the socket + @param to The address the data is to, may be NULL if the socket is connected. @param data The data being received. Control data may appear in the mbuf chain, be sure to check the mbuf types to find control @@ -591,7 +591,7 @@ extern errno_t sflt_unregister(sflt_handle handle); @param handle The handle of the registered filter to be attached. @result 0 on success otherwise the errno error. */ -extern errno_t sflt_attach(socket_t so, sflt_handle); +extern errno_t sflt_attach(socket_t socket, sflt_handle handle); /*! @function sflt_detach @@ -600,7 +600,7 @@ extern errno_t sflt_attach(socket_t so, sflt_handle); @param handle The handle of the registered filter to be detached. @result 0 on success otherwise the errno error. */ -extern errno_t sflt_detach(socket_t so, sflt_handle); +extern errno_t sflt_detach(socket_t socket, sflt_handle handle); /* Functions for manipulating sockets */ /* diff --git a/bsd/sys/ktrace.h b/bsd/sys/ktrace.h new file mode 100644 index 000000000..c67c9f6d1 --- /dev/null +++ b/bsd/sys/ktrace.h @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef SYS_KTRACE_H +#define SYS_KTRACE_H + +#include + +#include + +/* The states that ktrace can be in. */ +enum ktrace_state { + /* No tool has configured ktrace. */ + KTRACE_STATE_OFF = 0, + /* A foreground tool has configured ktrace. */ + KTRACE_STATE_FG, + /* A background tool has configured ktrace. */ + KTRACE_STATE_BG +}; + +extern lck_mtx_t *ktrace_lock; + +/* + * Subsystems that use ktrace to manage ownership. These values are passed as + * part of the `*_mask` arguments in `ktrace_configure` and `ktrace_reset`. + */ +#define KTRACE_KDEBUG (1 << 0) +#define KTRACE_KPERF (1 << 1) + +/* + * Used by subsystems to inform ktrace that a configuration is occurring. + * Validates whether the current process has privileges to configure + * ktrace. Pass the subsystem(s) being configured in config_mask. + * + * `ktrace_lock` must be held. + * + * Returns 0 if configuration is allowed, EPERM if process is not privileged, + * and EBUSY if ktrace is owned by another process. 
+ */ +int ktrace_configure(uint32_t config_mask); + +/* + * Tell ktrace to reset a configuration. Pass the subsystem(s) that are to + * be reset in the reset_mask. + * + * `ktrace_lock` must be held. + */ +void ktrace_reset(uint32_t reset_mask); + +/* + * Determine if the current process can read the configuration of ktrace. + * Only the owning process or a root privileged process is allowed. + * + * `ktrace_lock` must be held. + * + * Returns 0 if allowed, EPERM otherwise. + */ +int ktrace_read_check(void); + +/* + * With certain boot-args, the kernel can start tracing without user space + * intervention. With `trace=`, the kernel will start tracing at + * boot. With `trace_wake=`, the kernel will start tracing on the + * wake path out of hibernation (on Intel only). + * + * In these cases, ktrace must be aware of the state changes. This function + * should be called whenever the kernel initiates configuring ktrace. + * + * `ktrace_lock` must be held. + */ +void ktrace_kernel_configure(uint32_t config_mask); + +/* + * This KPI allows kernel systems to disable ktrace. ktrace will only be + * disabled if the state matches the provided state_to_match. + * + * This does not reset the configuration of any subsystems -- it just makes + * them stop logging events or sampling data. + * + * `ktrace_lock` must be held. + */ +void ktrace_disable(enum ktrace_state state_to_match); + +/* + * Returns the pid of the process that owns ktrace. If ktrace is unowned, + * returns 0. + * + * `ktrace_lock` must be held. + */ +int ktrace_get_owning_pid(void); + +/* + * Returns true if background tracing is active, false otherwise. + * + * `ktrace_lock` must be held. + */ +bool ktrace_background_active(void); + +/* + * These functions exist for the transition for kperf to allow blessing other + * processes. They should not be used by other clients. 
+ */ +extern boolean_t ktrace_keep_ownership_on_reset; +extern int ktrace_root_set_owner_allowed; +int ktrace_set_owning_pid(int pid); + +/* Initialize ktrace. Must only be called by the bootstrap thread. */ +void ktrace_init(void); + +#endif /* SYS_KTRACE_H */ diff --git a/bsd/sys/malloc.h b/bsd/sys/malloc.h index 713210e1d..6dac79920 100644 --- a/bsd/sys/malloc.h +++ b/bsd/sys/malloc.h @@ -165,9 +165,9 @@ #define M_BUFHDR 72 /* File buffer cache headers */ #define M_OFILETABL 73 /* Open file descriptor table */ #define M_MCLUST 74 /* mbuf cluster buffers */ -#define M_HFSMNT 75 /* HFS mount structure */ -#define M_HFSNODE 76 /* HFS catalog node */ -#define M_HFSFORK 77 /* HFS file fork */ +/* unused 75 */ +/* unused 76 */ +/* unused 77 */ /* unused 78 */ /* unused 79 */ #define M_TEMP 80 /* misc temporary data buffers */ @@ -181,11 +181,11 @@ #define M_IP6MISC 88 /* IPv6 misc. memory */ /* unused 89 */ #define M_IGMP 90 -#define M_JNL_JNL 91 /* Journaling: "struct journal" */ -#define M_JNL_TR 92 /* Journaling: "struct transaction" */ +/* unused 91 */ +/* unused 92 */ #define M_SPECINFO 93 /* special file node */ -#define M_KQUEUE 94 /* kqueue */ -#define M_HFSDIRHINT 95 /* HFS directory hint */ +#define M_KQUEUE 94 /* kqueue system */ +/* unused 95 */ #define M_CLRDAHEAD 96 /* storage for cluster read-ahead state */ #define M_CLWRBEHIND 97 /* storage for cluster write-behind state */ #define M_IOV64 98 /* large iov's for 64 bit process */ @@ -199,9 +199,9 @@ #define M_EXTATTR 106 /* extended attribute */ #define M_SELECT 107 /* per-thread select memory */ /* M_TRAFFIC_MGT 108 */ -#if HFS_COMPRESSION +#if FS_COMPRESSION #define M_DECMPFS_CNODE 109 /* decmpfs cnode structures */ -#endif /* HFS_COMPRESSION */ +#endif /* FS_COMPRESSION */ #define M_INMFILTER 110 /* IPv4 multicast PCB-layer source filter */ #define M_IPMSOURCE 111 /* IPv4 multicast IGMP-layer source filter */ #define M_IN6MFILTER 112 /* IPv6 multicast PCB-layer source filter */ @@ -226,7 +226,6 @@ 
#define M_IFADDR 9 /* interface address (IOFireWireIP)*/ #define M_LOCKF 40 /* Byte-range locking structures (msdos) */ #define M_TEMP 80 /* misc temporary data buffers */ -#define M_HFSMNT 75 /* HFS mount structure (afpfs) */ #define M_KAUTH 100 /* kauth subsystem (smb) */ #define M_SONAME 11 /* socket name (smb) */ #define M_PCB 4 /* protocol control block (smb) */ diff --git a/bsd/sys/mbuf.h b/bsd/sys/mbuf.h index b5b7ee802..2c703373b 100644 --- a/bsd/sys/mbuf.h +++ b/bsd/sys/mbuf.h @@ -214,6 +214,7 @@ struct tcp_pktinfo { union { struct { u_int32_t segsz; /* segment size (actual MSS) */ + u_int32_t start_seq; /* start seq of this packet */ } __tx; struct { u_int16_t lro_pktlen; /* max seg size encountered */ @@ -226,6 +227,7 @@ struct tcp_pktinfo { u_int32_t seq; /* recv msg sequence # */ } __msgattr; #define tso_segsz proto_mtag.__pr_u.tcp.tm_tcp.__offload.__tx.segsz +#define tx_start_seq proto_mtag.__pr_u.tcp.tm_tcp.__offload.__tx.start_seq #define lro_pktlen proto_mtag.__pr_u.tcp.tm_tcp.__offload.__rx.lro_pktlen #define lro_npkts proto_mtag.__pr_u.tcp.tm_tcp.__offload.__rx.lro_npkts #define lro_elapsed proto_mtag.__pr_u.tcp.tm_tcp.__offload.__rx.lro_timediff @@ -238,17 +240,11 @@ struct tcp_pktinfo { */ struct mptcp_pktinfo { u_int64_t mtpi_dsn; /* MPTCP Data Sequence Number */ - union { - u_int64_t mtpi_dan; /* MPTCP Data Ack Number */ - struct { - u_int32_t mtpi_rel_seq; /* Relative Seq Number */ - u_int32_t mtpi_length; /* Length of mapping */ - } mtpi_subf; - }; + u_int32_t mtpi_rel_seq; /* Relative Seq Number */ + u_int32_t mtpi_length; /* Length of mapping */ #define mp_dsn proto_mtag.__pr_u.tcp.tm_mptcp.mtpi_dsn -#define mp_rseq proto_mtag.__pr_u.tcp.tm_mptcp.mtpi_subf.mtpi_rel_seq -#define mp_rlen proto_mtag.__pr_u.tcp.tm_mptcp.mtpi_subf.mtpi_length -#define mp_dack proto_mtag.__pr_u.tcp.tm_mptcp.mtpi_subf.mtpi_dan +#define mp_rseq proto_mtag.__pr_u.tcp.tm_mptcp.mtpi_rel_seq +#define mp_rlen proto_mtag.__pr_u.tcp.tm_mptcp.mtpi_length }; /* @@ -264,6 
+260,17 @@ struct tcp_mtag { }; }; +struct driver_mtag_ { + uintptr_t _drv_tx_compl_arg; + uintptr_t _drv_tx_compl_data; + kern_return_t _drv_tx_status; + uint16_t _drv_flowid; +#define drv_tx_compl_arg builtin_mtag._drv_mtag._drv_tx_compl_arg +#define drv_tx_compl_data builtin_mtag._drv_mtag._drv_tx_compl_data +#define drv_tx_status builtin_mtag._drv_mtag._drv_tx_status +#define drv_flowid builtin_mtag._drv_mtag._drv_flowid +}; + /* * Protocol specific mbuf tag (at most one protocol metadata per mbuf). * @@ -272,7 +279,7 @@ struct tcp_mtag { * that the former is used on the virtual ipsec interface that does * not advertise the TSO capability.) */ -struct proto_mtag { +struct proto_mtag_ { union { struct tcp_mtag tcp; /* TCP specific */ } __pr_u; @@ -281,17 +288,30 @@ struct proto_mtag { /* * NECP specific mbuf tag. */ -struct necp_mtag { +struct necp_mtag_ { u_int32_t necp_policy_id; u_int32_t necp_last_interface_index; u_int32_t necp_route_rule_id; + u_int32_t necp_app_id; +}; + +union builtin_mtag { + struct { + struct proto_mtag_ _proto_mtag; /* built-in protocol-specific tag */ + struct pf_mtag _pf_mtag; /* built-in PF tag */ + struct necp_mtag_ _necp_mtag; /* built-in NECP tag */ + } _net_mtag; + struct driver_mtag_ _drv_mtag; +#define necp_mtag builtin_mtag._net_mtag._necp_mtag +#define proto_mtag builtin_mtag._net_mtag._proto_mtag +#define driver_mtag builtin_mtag._drv_mtag }; /* * Record/packet header in first mbuf of chain; valid only if M_PKTHDR set. 
*/ -struct pkthdr { - struct ifnet *rcvif; /* rcv interface */ +struct pkthdr { + struct ifnet *rcvif; /* rcv interface */ /* variables for ip and tcp reassembly */ void *pkt_hdr; /* pointer to packet header */ int32_t len; /* total packet length */ @@ -348,6 +368,9 @@ struct pkthdr { u_int32_t pkt_flowid; /* flow ID */ u_int32_t pkt_flags; /* PKTF flags (see below) */ u_int32_t pkt_svc; /* MBUF_SVC value */ + + u_int32_t pkt_compl_context; /* Packet completion context */ + union { struct { u_int16_t src; /* ifindex of src addr i/f */ @@ -360,25 +383,27 @@ struct pkthdr { #define dst_ifindex _pkt_iaif.dst #define dst_iff _pkt_iaif.dst_flags u_int64_t pkt_ifainfo; /* data field used by ifainfo */ - u_int32_t pkt_unsent_databytes; /* unsent data */ + struct { + u_int32_t if_data; /* bytes in interface queue */ + u_int32_t sndbuf_data; /* bytes in socket buffer */ + } _pkt_bsr; /* Buffer status report used by cellular interface */ +#define bufstatus_if _pkt_bsr.if_data +#define bufstatus_sndbuf _pkt_bsr.sndbuf_data }; #if MEASURE_BW u_int64_t pkt_bwseq; /* sequence # */ #endif /* MEASURE_BW */ - u_int64_t pkt_enqueue_ts; /* enqueue time */ + u_int64_t pkt_timestamp; /* enqueue time */ /* * Tags (external and built-in) */ SLIST_HEAD(packet_tags, m_tag) tags; /* list of external tags */ - struct proto_mtag proto_mtag; /* built-in protocol-specific tag */ - struct pf_mtag pf_mtag; /* built-in PF tag */ - struct necp_mtag necp_mtag; /* built-in NECP tag */ + union builtin_mtag builtin_mtag; /* * Module private scratch space (32-bit aligned), currently 16-bytes - * large. Anything stored here is not guaranteed to survive across - * modules. This should be the penultimate structure right before - * the red zone. Add new fields above this. + * large. Anything stored here is not guaranteed to survive across + * modules. 
*/ struct { union { @@ -395,6 +420,7 @@ struct pkthdr { } __mpriv_u; } pkt_mpriv __attribute__((aligned(4))); u_int32_t redzone; /* red zone */ + u_int32_t pkt_compl_callbacks; /* Packet completion callbacks */ }; /* @@ -407,6 +433,7 @@ struct pkthdr { #define FLOWSRC_INPCB 1 /* flow ID generated by INPCB */ #define FLOWSRC_IFNET 2 /* flow ID generated by interface */ #define FLOWSRC_PF 3 /* flow ID generated by PF */ +#define FLOWSRC_CHANNEL 4 /* flow ID generated by channel */ /* * Packet flags. Unlike m_flags, all packet flags are copied along when @@ -451,6 +478,13 @@ struct pkthdr { #define PKTF_SO_REALTIME 0x80000 /* data is realtime traffic */ #define PKTF_VALID_UNSENT_DATA 0x100000 /* unsent data is valid */ #define PKTF_TCP_REXMT 0x200000 /* packet is TCP retransmission */ +#define PKTF_REASSEMBLED 0x400000 /* Packet was reassembled */ +#define PKTF_TX_COMPL_TS_REQ 0x800000 /* tx completion timestamp requested */ +#define PKTF_DRV_TS_VALID 0x1000000 /* driver timestamp is valid */ +#define PKTF_DRIVER_MTAG 0x2000000 /* driver mbuf tags fields inited */ +#define PKTF_NEW_FLOW 0x4000000 /* Data from a new flow */ +#define PKTF_START_SEQ 0x8000000 /* valid start sequence */ +#define PKTF_LAST_PKT 0x10000000 /* last packet in the flow */ /* flags related to flow control/advisory and identification */ #define PKTF_FLOW_MASK \ @@ -465,12 +499,13 @@ struct m_ext { (caddr_t, u_int, caddr_t); u_int ext_size; /* size of buffer, for ext_free */ caddr_t ext_arg; /* additional ext_free argument */ - struct ext_refsq { /* references held */ - struct ext_refsq *forward, *backward; - } ext_refs; struct ext_ref { - u_int32_t refcnt; - u_int32_t flags; + struct mbuf *paired; + u_int16_t minref; + u_int16_t refcnt; + u_int16_t prefcnt; + u_int16_t flags; + u_int32_t priv; } *ext_refflags; }; @@ -481,12 +516,12 @@ typedef struct m_ext _m_ext_t; * The mbuf object */ struct mbuf { - struct m_hdr m_hdr; + struct m_hdr m_hdr; union { struct { - struct pkthdr MH_pkthdr; /* 
M_PKTHDR set */ + struct pkthdr MH_pkthdr; /* M_PKTHDR set */ union { - struct m_ext MH_ext; /* M_EXT set */ + struct m_ext MH_ext; /* M_EXT set */ char MH_databuf[_MHLEN]; } MH_dat; } MH; @@ -506,7 +541,7 @@ struct mbuf { #define m_pktdat M_dat.MH.MH_dat.MH_databuf #define m_dat M_dat.M_databuf #define m_pktlen(_m) ((_m)->m_pkthdr.len) -#define m_pftag(_m) (&(_m)->m_pkthdr.pf_mtag) +#define m_pftag(_m) (&(_m)->m_pkthdr.builtin_mtag._net_mtag._pf_mtag) /* mbuf flags (private) */ #define M_EXT 0x0001 /* has associated external storage */ @@ -897,7 +932,16 @@ struct name { \ #define MBUFQ_EMPTY(q) ((q)->mq_first == NULL) #define MBUFQ_FIRST(q) ((q)->mq_first) #define MBUFQ_NEXT(m) ((m)->m_nextpkt) -#define MBUFQ_LAST(q) (*(q)->mq_last) +/* + * mq_last is initialized to point to mq_first, so check if they're + * equal and return NULL when the list is empty. Otherwise, we need + * to subtract the offset of MBUFQ_NEXT (i.e. m_nextpkt field) to get + * to the base mbuf address to return to caller. + */ +#define MBUFQ_LAST(head) \ + (((head)->mq_last == &MBUFQ_FIRST(head)) ? 
NULL : \ + ((struct mbuf *)(void *)((char *)(head)->mq_last - \ + (size_t)(&MBUFQ_NEXT((struct mbuf *)0))))) #define max_linkhdr P2ROUNDUP(_max_linkhdr, sizeof (u_int32_t)) #define max_protohdr P2ROUNDUP(_max_protohdr, sizeof (u_int32_t)) @@ -1258,7 +1302,7 @@ extern int _max_protohdr; /* largest protocol header */ __private_extern__ unsigned int mbuf_default_ncl(int, u_int64_t); __private_extern__ void mbinit(void); __private_extern__ struct mbuf *m_clattach(struct mbuf *, int, caddr_t, - void (*)(caddr_t, u_int, caddr_t), u_int, caddr_t, int); + void (*)(caddr_t, u_int, caddr_t), u_int, caddr_t, int, int); __private_extern__ caddr_t m_bigalloc(int); __private_extern__ void m_bigfree(caddr_t, u_int, caddr_t); __private_extern__ struct mbuf *m_mbigget(struct mbuf *, int); @@ -1317,6 +1361,11 @@ __private_extern__ struct mbuf *m_getpackets_internal(unsigned int *, int, __private_extern__ struct mbuf *m_allocpacket_internal(unsigned int *, size_t, unsigned int *, int, int, size_t); +__private_extern__ int m_ext_set_prop(struct mbuf *, uint32_t, uint32_t); +__private_extern__ uint32_t m_ext_get_prop(struct mbuf *); +__private_extern__ int m_ext_paired_is_active(struct mbuf *); +__private_extern__ void m_ext_paired_activate(struct mbuf *); + __private_extern__ void m_drain(void); /* @@ -1407,6 +1456,8 @@ __private_extern__ u_int16_t m_adj_sum16(struct mbuf *, u_int32_t, u_int32_t, u_int32_t); __private_extern__ u_int16_t m_sum16(struct mbuf *, u_int32_t, u_int32_t); +extern void m_do_tx_compl_callback(struct mbuf *, struct ifnet *); + __END_DECLS #endif /* XNU_KERNEL_PRIVATE */ #endif /* KERNEL */ diff --git a/bsd/sys/mcache.h b/bsd/sys/mcache.h index 9bd70a21e..3993b3fd3 100644 --- a/bsd/sys/mcache.h +++ b/bsd/sys/mcache.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006-2014 Apple Inc. All rights reserved. + * Copyright (c) 2006-2015 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -37,6 +37,7 @@ extern "C" { #include #include #include +#include #include #include @@ -51,7 +52,8 @@ extern "C" { /* * Unlike VERIFY(), ASSERT() is evaluated only in DEBUG build. */ -#define VERIFY(EX) ((void)((EX) || assfail(#EX, __FILE__, __LINE__))) +#define VERIFY(EX) \ + ((void)(__probable((EX)) || assfail(#EX, __FILE__, __LINE__))) #if DEBUG #define ASSERT(EX) VERIFY(EX) #else @@ -61,8 +63,7 @@ extern "C" { /* * Compile time assert; this should be on its own someday. */ -#define _CASSERT(x) \ - switch (0) { case 0: case (x): ; } +#define _CASSERT(x) _Static_assert(x, "compile-time assertion failed") /* * Atomic macros; these should be on their own someday. @@ -85,8 +86,19 @@ extern "C" { #define atomic_add_64(a, n) \ ((void) atomic_add_64_ov(a, n)) +#define atomic_test_set_32(a, o, n) \ + OSCompareAndSwap(o, n, (volatile UInt32 *)a) + +#define atomic_set_32(a, n) do { \ + while (!atomic_test_set_32(a, *a, n)) \ + ; \ +} while (0) + +#define atomic_test_set_64(a, o, n) \ + OSCompareAndSwap64(o, n, (volatile UInt64 *)a) + #define atomic_set_64(a, n) do { \ - while (!OSCompareAndSwap64(*a, n, (volatile UInt64 *)a)) \ + while (!atomic_test_set_64(a, *a, n)) \ ; \ } while (0) @@ -100,6 +112,14 @@ extern "C" { } while (0) #endif /* __LP64__ */ +#define atomic_test_set_ptr(a, o, n) \ + OSCompareAndSwapPtr(o, n, (void * volatile *)a) + +#define atomic_set_ptr(a, n) do { \ + while (!atomic_test_set_ptr(a, *a, n)) \ + ; \ +} while (0) + #define atomic_or_8_ov(a, n) \ ((u_int8_t) OSBitOrAtomic8(n, (volatile UInt8 *)a)) @@ -154,11 +174,13 @@ extern "C" { #define atomic_bitclear_32(a, n) \ atomic_and_32(a, ~(n)) +#define membar_sync OSMemoryBarrier + /* * Use CPU_CACHE_LINE_SIZE instead of MAX_CPU_CACHE_LINE_SIZE, unless * wasting space is of no concern. 
*/ -#define MAX_CPU_CACHE_LINE_SIZE 64 +#define MAX_CPU_CACHE_LINE_SIZE 128 #define CPU_CACHE_LINE_SIZE mcache_cache_line_size() #ifndef IS_P2ALIGNED @@ -303,7 +325,8 @@ typedef struct mcache { /* * Per-CPU layer, aligned at cache line boundary */ - mcache_cpu_t mc_cpu[1]; + mcache_cpu_t mc_cpu[1] + __attribute__((aligned(MAX_CPU_CACHE_LINE_SIZE))); } mcache_t; #define MCACHE_ALIGN 8 /* default guaranteed alignment */ @@ -379,6 +402,7 @@ __private_extern__ void mcache_audit_panic(mcache_audit_t *, void *, size_t, int64_t, int64_t); extern int32_t total_sbmb_cnt; +extern int32_t total_sbmb_cnt_floor; extern int32_t total_sbmb_cnt_peak; extern int64_t sbmb_limreached; extern mcache_t *mcache_audit_cache; diff --git a/bsd/sys/mount.h b/bsd/sys/mount.h index 2db9e5779..242f50480 100644 --- a/bsd/sys/mount.h +++ b/bsd/sys/mount.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2014 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -498,11 +498,16 @@ struct vfsioattr { u_int32_t io_maxsegwritesize; /* Max. 
segment write size */ u_int32_t io_devblocksize; /* the underlying device block size */ u_int32_t io_flags; /* flags for underlying device */ - void * io_reserved[2]; /* extended attribute information */ + union { + int64_t io_max_swappin_available; + // On 32 bit architectures, we don't have any spare + void *io_reserved[2]; + }; }; -#define VFS_IOATTR_FLAGS_FUA 0x01 /* Write-through cache supported */ -#define VFS_IOATTR_FLAGS_UNMAP 0x02 /* Unmap (trim) supported */ +#define VFS_IOATTR_FLAGS_FUA 0x00000001 /* Write-through cache supported */ +#define VFS_IOATTR_FLAGS_UNMAP 0x00000002 /* Unmap (trim) supported */ +#define VFS_IOATTR_FLAGS_SWAPPIN_SUPPORTED 0x00000010 /* Pinning swap file supported */ /* * Filesystem Registration information @@ -521,9 +526,8 @@ struct vfsioattr { #define VFS_TBLVNOP_PAGEINV2 0x2000 #define VFS_TBLVNOP_PAGEOUTV2 0x4000 #define VFS_TBLVNOP_NOUPDATEID_RENAME 0x8000 /* vfs should not call vnode_update_ident on rename */ -#if CONFIG_SECLUDED_RENAME #define VFS_TBLVNOP_SECLUDE_RENAME 0x10000 -#endif +#define VFS_TBLCANMOUNTROOT 0x20000 struct vfs_fsentry { @@ -702,9 +706,64 @@ struct vfsops { @return 0 for success, else an error code. */ int (*vfs_setattr)(struct mount *mp, struct vfs_attr *, vfs_context_t context); - void *vfs_reserved[7]; + + /*! + @function vfs_ioctl + @abstract File system control operations. + @discussion Unlike vfs_sysctl, this is specific to a particular volume. + @param mp The mount to execute the command on. + @param command Identifier for action to take. The command used here + should be in the same namespace as VNOP ioctl commands. + @param data Pointer to data; this can be an integer constant (of 32 bits + only) or an address to be read from or written to, depending on "command." + If it is an address, it is valid and resides in the kernel; callers of + VFS_IOCTL() are responsible for copying to and from userland. 
+ @param flags Reserved for future use, set to zero + @param context Context against which to authenticate ioctl request. + @return 0 for success, else an error code. + */ + int (*vfs_ioctl)(struct mount *mp, u_long command, caddr_t data, + int flags, vfs_context_t context); + + /*! + @function vfs_vget_snapdir + @abstract Get the vnode for the snapshot directory of a filesystem. + @discussion Upon success, should return with an iocount held on the snapshot directory vnode which the caller will + drop with vnode_put(). + @param mp Mount for which to get the snapshot directory. + @param vpp Destination for snapshot directory vnode. + @param context Context to authenticate for getting the snapshot directory. + @return 0 for success, else an error code. + */ + int (*vfs_vget_snapdir)(struct mount *mp, struct vnode **vpp, vfs_context_t context); + void *vfs_reserved5; + void *vfs_reserved4; + void *vfs_reserved3; + void *vfs_reserved2; + void *vfs_reserved1; }; +#ifdef KERNEL + +/* + * Commands for vfs_ioctl. While they are encoded the same way as for ioctl(2), + * there is no generic interface for them from userspace like ioctl(2). 
+ */ +struct fs_snapshot_mount_args { + mount_t sm_mp; + struct componentname *sm_cnp; +}; + +#define VFSIOC_MOUNT_SNAPSHOT _IOW('V', 1, struct fs_snapshot_mount_args) +#define VFSCTL_MOUNT_SNAPSHOT IOCBASECMD(VFSIOC_MOUNT_SNAPSHOT) + +struct fs_snapshot_revert_args { + struct componentname *sr_cnp; +}; +#define VFSIOC_REVERT_SNAPSHOT _IOW('V', 2, struct fs_snapshot_revert_args) +#define VFSCTL_REVERT_SNAPSHOT IOCBASECMD(VFSIOC_REVERT_SNAPSHOT) + +#endif /* KERNEL */ /* * flags passed into vfs_iterate @@ -736,6 +795,9 @@ extern int VFS_SYNC(mount_t, int, vfs_context_t); extern int VFS_VGET(mount_t, ino64_t, vnode_t *, vfs_context_t); extern int VFS_FHTOVP(mount_t, int, unsigned char *, vnode_t *, vfs_context_t); extern int VFS_VPTOFH(vnode_t, int *, unsigned char *, vfs_context_t); +extern int VFS_IOCTL(mount_t mp, u_long command, caddr_t data, + int flags, vfs_context_t context); +extern int VFS_VGET_SNAPDIR(mount_t, vnode_t *, vfs_context_t); #endif /* BSD_KERNEL_PRIVATE */ /* * prototypes for exported VFS operations @@ -750,7 +812,7 @@ extern int VFS_VPTOFH(vnode_t, int *, unsigned char *, vfs_context_t); @param handle Opaque handle which will be passed to vfs_fsremove. @return 0 for success, else an error code. */ -int vfs_fsadd(struct vfs_fsentry *, vfstable_t *); +int vfs_fsadd(struct vfs_fsentry *vfe, vfstable_t *handle); /*! @function vfs_fsremove @@ -759,18 +821,18 @@ int vfs_fsadd(struct vfs_fsentry *, vfstable_t *); @param handle Handle which was returned by vfs_fsadd. @return 0 for success, else an error code. */ -int vfs_fsremove(vfstable_t); +int vfs_fsremove(vfstable_t handle); /*! @function vfs_iterate @abstract Iterate over all mountpoints with a callback. Used, for example, by sync(). @param flags Unused. 
- @param callback Function which takes a mount and arbitrary passed-in "arg," and returns one of VFS_RETURNED_DONE or VFS_CLAIMED_DONE: end + @param callout Function which takes a mount and arbitrary passed-in "arg," and returns one of VFS_RETURNED_DONE or VFS_CLAIMED_DONE: end iteration and return success. VFS_RETURNED or VFS_CLAIMED: continue iterating. Anything else: continue iterating. @param arg Arbitrary data to pass to callback. @return 0 for success, else an error code. */ -int vfs_iterate(int, int (*)(struct mount *, void *), void *); +int vfs_iterate(int flags, int (*callout)(struct mount *, void *), void *arg); /*! @function vfs_init_io_attributes @@ -779,7 +841,7 @@ int vfs_iterate(int, int (*)(struct mount *, void *), void *); @param mp Mountpoint whose I/O parameters to initialize. @return 0 for success, else an error code. */ -int vfs_init_io_attributes(vnode_t, mount_t); +int vfs_init_io_attributes(vnode_t devvp, mount_t mp); /*! @function vfs_flags @@ -788,7 +850,7 @@ int vfs_init_io_attributes(vnode_t, mount_t); @param mp Mount whose flags to grab. @return Flags. */ -uint64_t vfs_flags(mount_t); +uint64_t vfs_flags(mount_t mp); /*! @function vfs_setflags @@ -797,9 +859,8 @@ uint64_t vfs_flags(mount_t); used by a filesystem as part of the mount process. @param mp Mount whose flags to set. @param flags Flags to activate. Must be in the bitwise "OR" of MNT_VISFLAGMASK and MNT_CMDFLAGS. - @return Flags. */ -void vfs_setflags(mount_t, uint64_t); +void vfs_setflags(mount_t mp, uint64_t flags); /*! @function vfs_clearflags @@ -807,9 +868,8 @@ void vfs_setflags(mount_t, uint64_t); @discussion Sets mount flags to the bitwise "AND" of their current value and the complement of the specified bits. @param mp Mount whose flags to set. @param flags Flags to deactivate. Must be in the bitwise "OR" of MNT_VISFLAGMASK and MNT_CMDFLAGS. - @return void. */ -void vfs_clearflags(mount_t, uint64_t); +void vfs_clearflags(mount_t mp, uint64_t flags); /*! 
@function vfs_issynchronous @@ -817,7 +877,7 @@ void vfs_clearflags(mount_t, uint64_t); @param mp Mount to test. @return Nonzero if writes occur synchronously, else 0. */ -int vfs_issynchronous(mount_t); +int vfs_issynchronous(mount_t mp); /*! @function vfs_iswriteupgrade @@ -826,7 +886,7 @@ int vfs_issynchronous(mount_t); @param mp Mount to test. @return Nonzero if a request has been made to update from read-only to read-write, else 0. */ -int vfs_iswriteupgrade(mount_t); +int vfs_iswriteupgrade(mount_t mp); /*! @function vfs_isupdate @@ -834,7 +894,7 @@ int vfs_iswriteupgrade(mount_t); @param mp Mount to test. @return Nonzero if a mount update is in progress, 0 otherwise. */ -int vfs_isupdate(mount_t); +int vfs_isupdate(mount_t mp); /*! @function vfs_isreload @@ -843,7 +903,7 @@ int vfs_isupdate(mount_t); @param mp Mount to test. @return Nonzero if a request has been made to reload data, else 0. */ -int vfs_isreload(mount_t); +int vfs_isreload(mount_t mp); /*! @function vfs_isforce @@ -852,7 +912,7 @@ int vfs_isreload(mount_t); @param mp Mount to test. @return Nonzero if a request has been made to forcibly unmount, else 0. */ -int vfs_isforce(mount_t); +int vfs_isforce(mount_t mp); /*! @function vfs_isunmount @@ -870,7 +930,7 @@ int vfs_isunmount(mount_t mp); @param mp Mount to test. @return Nonzero if filesystem is mounted read-only, else 0. */ -int vfs_isrdonly(mount_t); +int vfs_isrdonly(mount_t mp); /*! @function vfs_isrdwr @@ -878,7 +938,7 @@ int vfs_isrdonly(mount_t); @param mp Mount to test. @return Nonzero if filesystem is mounted read-write, else 0. */ -int vfs_isrdwr(mount_t); +int vfs_isrdwr(mount_t mp); /*! @function vfs_authopaque @@ -886,7 +946,7 @@ int vfs_isrdwr(mount_t); @param mp Mount to test. @return Nonzero if filesystem authorization is controlled remotely, else 0. */ -int vfs_authopaque(mount_t); +int vfs_authopaque(mount_t mp); /*! @function vfs_authopaqueaccess @@ -894,66 +954,59 @@ int vfs_authopaque(mount_t); @param mp Mount to test. 
@return Nonzero if VNOP_ACCESS is supported remotely, else 0. */ -int vfs_authopaqueaccess(mount_t); +int vfs_authopaqueaccess(mount_t mp); /*! @function vfs_setauthopaque @abstract Mark a filesystem as having authorization decisions controlled remotely. @param mp Mount to mark. - @return void. */ -void vfs_setauthopaque(mount_t); +void vfs_setauthopaque(mount_t mp); /*! @function vfs_setauthopaqueaccess @abstract Mark a filesystem as having remote VNOP_ACCESS support. @param mp Mount to mark. - @return void. */ -void vfs_setauthopaqueaccess(mount_t); +void vfs_setauthopaqueaccess(mount_t mp); /*! @function vfs_clearauthopaque @abstract Mark a filesystem as not having remote authorization decisions. @param mp Mount to mark. - @return void. */ -void vfs_clearauthopaque(mount_t); +void vfs_clearauthopaque(mount_t mp); /*! @function vfs_clearauthopaque @abstract Mark a filesystem as not having remote VNOP_ACCESS support. @param mp Mount to mark. - @return void. */ -void vfs_clearauthopaqueaccess(mount_t); +void vfs_clearauthopaqueaccess(mount_t mp); /*! @function vfs_setextendedsecurity @abstract Mark a filesystem as supporting security controls beyond POSIX permissions. @discussion Specific controls include ACLs, file owner UUIDs, and group UUIDs. @param mp Mount to test. - @return void. */ -void vfs_setextendedsecurity(mount_t); +void vfs_setextendedsecurity(mount_t mp); /*! @function vfs_clearextendedsecurity @abstract Mark a filesystem as NOT supporting security controls beyond POSIX permissions. @discussion Specific controls include ACLs, file owner UUIDs, and group UUIDs. @param mp Mount to test. - @return void. */ -void vfs_clearextendedsecurity(mount_t); +void vfs_clearextendedsecurity(mount_t mp); /*! @function vfs_setlocklocal @abstract Mark a filesystem as using VFS-level advisory locking support. @discussion Advisory locking operations will not call down to the filesystem if this flag is set. @param mp Mount to mark. - @return void. 
*/ -void vfs_setlocklocal(mount_t); +void vfs_setlocklocal(mount_t mp); /*! @function vfs_authcache_ttl @@ -964,7 +1017,7 @@ void vfs_setlocklocal(mount_t); @param mp Mount for which to check cache lifetime. @return Cache lifetime in seconds. CACHED_RIGHT_INFINITE_TTL indicates that credentials never expire. */ -int vfs_authcache_ttl(mount_t); +int vfs_authcache_ttl(mount_t mp); /*! @function vfs_setauthcache_ttl @@ -973,18 +1026,16 @@ int vfs_authcache_ttl(mount_t); previously-authorized actions from the same vfs_context_t without calling down to the filesystem (though it will not deny based on the cache). @param mp Mount for which to set cache lifetime. - @return void. */ -void vfs_setauthcache_ttl(mount_t, int); +void vfs_setauthcache_ttl(mount_t mp, int ttl); /*! @function vfs_clearauthcache_ttl @abstract Remove time-to-live controls for cached credentials on a filesytem. Filesystems with remote authorization decisions (opaque) will still have KAUTH_VNODE_SEARCH rights cached for a default of CACHED_LOOKUP_RIGHT_TTL seconds. @param mp Mount for which to clear cache lifetime. - @return void. */ -void vfs_clearauthcache_ttl(mount_t); +void vfs_clearauthcache_ttl(mount_t mp); /* * return value from vfs_cachedrights_ttl if @@ -1000,16 +1051,15 @@ void vfs_clearauthcache_ttl(mount_t); @param mp Mount from which to get symlink length cap. @return Max symlink length. */ -uint32_t vfs_maxsymlen(mount_t); +uint32_t vfs_maxsymlen(mount_t mp); /*! @function vfs_setmaxsymlen @abstract Set the maximum length of a symbolic link on a filesystem. @param mp Mount on which to set symlink length cap. @param symlen Length to set. - @return Max symlink length. */ -void vfs_setmaxsymlen(mount_t, uint32_t); +void vfs_setmaxsymlen(mount_t mp, uint32_t symlen); /*! @function vfs_fsprivate @@ -1019,7 +1069,7 @@ void vfs_setmaxsymlen(mount_t, uint32_t); @param mp Mount for which to get private data. @return Private data. 
*/ -void * vfs_fsprivate(mount_t); +void * vfs_fsprivate(mount_t mp); /*! @function vfs_setfsprivate @@ -1027,9 +1077,8 @@ void * vfs_fsprivate(mount_t); @discussion A filesystem generally has an internal mount structure which it attaches to the VFS-level mount structure as part of the mounting process. @param mp Mount for which to set private data. - @return Void. */ -void vfs_setfsprivate(mount_t, void *mntdata); +void vfs_setfsprivate(mount_t mp, void *mntdata); /*! @function vfs_statfs @@ -1040,7 +1089,7 @@ void vfs_setfsprivate(mount_t, void *mntdata); @param mp Mount for which to get vfsstatfs pointer. @return Pointer to vfsstatfs. */ -struct vfsstatfs * vfs_statfs(mount_t); +struct vfsstatfs * vfs_statfs(mount_t mp); #define VFS_USER_EVENT 0 #define VFS_KERNEL_EVENT 1 @@ -1056,7 +1105,7 @@ struct vfsstatfs * vfs_statfs(mount_t); @return 0 for success, or an error code for authentication failure or problem with call to filesystem to request information. */ -int vfs_update_vfsstat(mount_t, vfs_context_t, int eventtype); +int vfs_update_vfsstat(mount_t mp, vfs_context_t ctx, int eventtype); /*! @function vfs_typenum @@ -1066,7 +1115,7 @@ int vfs_update_vfsstat(mount_t, vfs_context_t, int eventtype); @param mp Mount for which to get type number. @return Type number. */ -int vfs_typenum(mount_t); +int vfs_typenum(mount_t mp); /*! @function vfs_name @@ -1075,9 +1124,8 @@ int vfs_typenum(mount_t); rather than a name specific to the mountpoint. @param mp Mount for which to get name. @param buffer Destination for name; length should be at least MFSNAMELEN. - @return void. */ -void vfs_name(mount_t, char *); +void vfs_name(mount_t mp, char *buffer); /*! @function vfs_devblocksize @@ -1085,25 +1133,23 @@ void vfs_name(mount_t, char *); @param mp Mount for which to get block size. @return Block size. */ -int vfs_devblocksize(mount_t); +int vfs_devblocksize(mount_t mp); /*! @function vfs_ioattr @abstract Get I/O attributes associated with a mounpoint. 
@param mp Mount for which to get attributes. If NULL, system defaults are filled into ioattrp. @param ioattrp Destination for results. - @return void. */ -void vfs_ioattr(mount_t, struct vfsioattr *); +void vfs_ioattr(mount_t mp, struct vfsioattr *ioattrp); /*! @function vfs_setioattr @abstract Set I/O attributes associated with a mounpoint. @param mp Mount for which to set attributes. @param ioattrp Structure containing I/O parameters; all fields must be filled in. - @return void. */ -void vfs_setioattr(mount_t, struct vfsioattr *); +void vfs_setioattr(mount_t mp, struct vfsioattr *ioattrp); /*! @function vfs_64bitready @@ -1111,7 +1157,7 @@ void vfs_setioattr(mount_t, struct vfsioattr *); @param mp Mount to test. @return Nonzero if filesystem is ready for 64-bit; 0 otherwise. */ -int vfs_64bitready(mount_t); +int vfs_64bitready(mount_t mp); #define LK_NOWAIT 1 @@ -1127,16 +1173,15 @@ int vfs_64bitready(mount_t); @param flags LK_NOWAIT: fail with ENOENT if an unmount is in progress. @return 0 for success, with a lock held; an error code otherwise, with no lock held. */ -int vfs_busy(mount_t, int); +int vfs_busy(mount_t mp, int flags); /*! @function vfs_unbusy @abstract "Unbusy" a mountpoint by releasing its read-write lock. @discussion A successful vfs_busy() must be followed by a vfs_unbusy() to release the lock on the mount. @param mp Mount to unbusy. - @return void. */ -void vfs_unbusy(mount_t); +void vfs_unbusy(mount_t mp); /*! @function vfs_getnewfsid @@ -1144,9 +1189,8 @@ void vfs_unbusy(mount_t); @discussion Filesystem IDs are returned as part of "struct statfs." This function is typically called as part of file-system specific mount code (i.e. through VFS_MOUNT). @param mp Mount to set an ID for. - @return void. */ -void vfs_getnewfsid(struct mount *); +void vfs_getnewfsid(struct mount *mp); /*! @function vfs_getvfs @@ -1154,7 +1198,7 @@ void vfs_getnewfsid(struct mount *); @param fsid Filesystem ID to look up. @return Mountpoint if found, else NULL. 
Note unmounting mountpoints can be returned. */ -mount_t vfs_getvfs(fsid_t *); +mount_t vfs_getvfs(fsid_t *fsid); /*! @function vfs_mountedon @@ -1165,7 +1209,7 @@ mount_t vfs_getvfs(fsid_t *); @param vp The vnode to test. @return EBUSY if vnode is indeed the source of a filesystem; 0 if it is not. */ -int vfs_mountedon(struct vnode *); +int vfs_mountedon(struct vnode *vp); /*! @function vfs_unmountbyfsid @@ -1176,7 +1220,7 @@ int vfs_mountedon(struct vnode *); @param ctx Context against which to authenticate unmount operation. @return 0 for succcess, nonero for failure. */ -int vfs_unmountbyfsid(fsid_t *, int, vfs_context_t); +int vfs_unmountbyfsid(fsid_t *fsid, int flags, vfs_context_t ctx); /*! @function vfs_event_signal @@ -1184,14 +1228,15 @@ int vfs_unmountbyfsid(fsid_t *, int, vfs_context_t); @param fsid Unused. @param event Events to post. @param data Unused. - @return void. */ -void vfs_event_signal(fsid_t *, u_int32_t, intptr_t); +void vfs_event_signal(fsid_t *fsid, u_int32_t event, intptr_t data); + /*! @function vfs_event_init @abstract This function should not be called by kexts. */ void vfs_event_init(void); /* XXX We should not export this */ + #ifdef KERNEL_PRIVATE int vfs_getbyid(fsid_t *fsid, ino64_t ino, vnode_t *vpp, vfs_context_t ctx); int vfs_getattr(mount_t mp, struct vfs_attr *vfa, vfs_context_t ctx); @@ -1204,6 +1249,7 @@ int vfs_nativexattrs (mount_t mp); /* whether or not the FS supports EAs nativel void * vfs_mntlabel(mount_t mp); /* Safe to cast to "struct label*"; returns "void*" to limit dependence of mount.h on security headers. 
*/ void vfs_setcompoundopen(mount_t mp); uint64_t vfs_throttle_mask(mount_t mp); +int vfs_isswapmount(mount_t mp); struct vnode_trigger_info; @@ -1274,6 +1320,9 @@ typedef void vfs_trigger_callback_t(mount_t mp, vfs_trigger_callback_op_t op, vo */ int vfs_settriggercallback(fsid_t *fsid, vfs_trigger_callback_t vtc, void *data, uint32_t flags, vfs_context_t ctx); +/* tags a volume as not supporting extended readdir for NFS exports */ +void mount_set_noreaddirext (mount_t); + #endif /* KERNEL_PRIVATE */ __END_DECLS diff --git a/bsd/sys/mount_internal.h b/bsd/sys/mount_internal.h index 2f966ca5d..107467f08 100644 --- a/bsd/sys/mount_internal.h +++ b/bsd/sys/mount_internal.h @@ -133,6 +133,7 @@ struct mount { uint32_t mnt_ioqueue_depth; /* the maxiumum number of commands a device can accept */ uint32_t mnt_ioscale; /* scale the various throttles/limits imposed on the amount of I/O in flight */ uint32_t mnt_ioflags; /* flags for underlying device */ + uint32_t mnt_minsaturationbytecount; /* if non-zero, mininum amount of writes (in bytes) needed to max out throughput */ pending_io_t mnt_pending_write_size __attribute__((aligned(sizeof(pending_io_t)))); /* byte count of pending writes */ pending_io_t mnt_pending_read_size __attribute__((aligned(sizeof(pending_io_t)))); /* byte count of pending reads */ struct timeval mnt_last_write_issued_timestamp; @@ -192,6 +193,8 @@ struct mount { */ int mnt_authcache_ttl; char fstypename_override[MFSTYPENAMELEN]; + + uint32_t mnt_iobufinuse; }; /* @@ -215,11 +218,6 @@ struct mount { */ #define MNT_DEFAULT_IOQUEUE_DEPTH 32 - -/* XXX 3762912 hack to support HFS filesystem 'owner' */ -#define vfs_setowner(_mp, _uid, _gid) do {(_mp)->mnt_fsowner = (_uid); (_mp)->mnt_fsgroup = (_gid); } while (0) - - /* mount point to which dead vps point to */ extern struct mount * dead_mountp; @@ -258,10 +256,11 @@ extern struct mount * dead_mountp; #define MNTK_MWAIT 0x02000000 /* waiting for unmount to finish */ #define MNTK_WANTRDWR 0x04000000 /* 
upgrade to read/write requested */ #if REV_ENDIAN_FS -#define MNT_REVEND 0x08000000 /* Reverse endian FS */ +#define MNT_REVEND 0x08000000 /* Reverse endian FS */ #endif /* REV_ENDIAN_FS */ -#define MNTK_AUTH_OPAQUE 0x20000000 /* authorisation decisions are not made locally */ -#define MNTK_AUTH_OPAQUE_ACCESS 0x40000000 /* VNOP_ACCESS is reliable for remote auth */ +#define MNTK_DIR_HARDLINKS 0x10000000 /* mounted file system supports directory hard links */ +#define MNTK_AUTH_OPAQUE 0x20000000 /* authorisation decisions are not made locally */ +#define MNTK_AUTH_OPAQUE_ACCESS 0x40000000 /* VNOP_ACCESS is reliable for remote auth */ #define MNTK_EXTENDED_SECURITY 0x80000000 /* extended security supported */ #define MNT_LNOTRESP 0x00000001 /* mount not responding */ @@ -316,7 +315,7 @@ struct vfstable { #define VFC_VFSLOCALARGS 0x002 #define VFC_VFSGENERICARGS 0x004 #define VFC_VFSNATIVEXATTR 0x010 -#define VFC_VFSDIRLINKS 0x020 +#define VFC_VFSCANMOUNTROOT 0x020 #define VFC_VFSPREFLIGHT 0x040 #define VFC_VFSREADDIR_EXTENDED 0x080 #define VFC_VFS64BITREADY 0x100 @@ -324,10 +323,7 @@ struct vfstable { #define VFC_VFSVNOP_PAGEINV2 0x2000 #define VFC_VFSVNOP_PAGEOUTV2 0x4000 #define VFC_VFSVNOP_NOUPDATEID_RENAME 0x8000 -#if CONFIG_SECLUDED_RENAME #define VFC_VFSVNOP_SECLUDE_RENAME 0x10000 -#endif - extern int maxvfstypenum; /* highest defined filesystem type */ extern struct vfstable *vfsconf; /* head of list of filesystem types */ @@ -416,7 +412,6 @@ struct user32_statfs { __BEGIN_DECLS -extern boolean_t root_is_CF_drive; extern uint32_t mount_generation; extern TAILQ_HEAD(mntlist, mount) mountlist; void mount_list_lock(void); @@ -453,14 +448,11 @@ void mount_iterdrop(mount_t); void mount_iterdrain(mount_t); void mount_iterreset(mount_t); -/* tags a volume as not supporting extended readdir for NFS exports */ -#ifdef BSD_KERNEL_PRIVATE -void mount_set_noreaddirext (mount_t); -#endif - /* Private NFS spi */ #define KERNEL_MOUNT_NOAUTH 0x01 /* Don't check the UID of the 
directory we are mounting on */ #define KERNEL_MOUNT_PERMIT_UNMOUNT 0x02 /* Allow (non-forced) unmounts by users other the one who mounted the volume */ +/* used by snapshot mounting SPI */ +#define KERNEL_MOUNT_SNAPSHOT 0x04 /* Mounting a snapshot */ #if NFSCLIENT || DEVFS || ROUTEFS /* * NOTE: kernel_mount() does not force MNT_NOSUID, MNT_NOEXEC, or MNT_NODEC for non-privileged @@ -470,19 +462,12 @@ int kernel_mount(char *, vnode_t, vnode_t, const char *, void *, size_t, int, ui boolean_t vfs_iskernelmount(mount_t); #endif -/* throttled I/O api */ - -/* returned by throttle_io_will_be_throttled */ -#define THROTTLE_DISENGAGED 0 -#define THROTTLE_ENGAGED 1 -#define THROTTLE_NOW 2 +/* Throttled I/O API. KPI/SPI is in systm.h. */ int throttle_get_io_policy(struct uthread **ut); int throttle_get_passive_io_policy(struct uthread **ut); -int throttle_io_will_be_throttled(int lowpri_window_msecs, mount_t mp); void *throttle_info_update_by_mount(mount_t mp); void rethrottle_thread(uthread_t ut); -void throttle_info_reset_window(uthread_t ut); /* throttled I/O helper function */ diff --git a/bsd/sys/munge.h b/bsd/sys/munge.h index d1ab96096..47f07923c 100644 --- a/bsd/sys/munge.h +++ b/bsd/sys/munge.h @@ -86,6 +86,7 @@ void munge_wwlll(void *args); void munge_wwllww(void *args); void munge_wlw(void *args); void munge_wlww(void *args); +void munge_wlwwwl(void *args); void munge_wlwwwll(void *args); void munge_wlwwwllw(void *args); void munge_wlwwlwlw(void *args); @@ -115,5 +116,7 @@ void munge_l(void *args); void munge_ll(void *args); void munge_lw(void *args); void munge_lwww(void *args); +void munge_wwlww(void *args); void munge_wwlwww(void *args); +void munge_wwlwwwl(void *args); #endif /* __MUNGE_H__ */ diff --git a/bsd/sys/namei.h b/bsd/sys/namei.h index 57b577d90..2f5b90bfb 100644 --- a/bsd/sys/namei.h +++ b/bsd/sys/namei.h @@ -70,8 +70,13 @@ #define LOCKLEAF 0x0004 /* lock inode on return */ #define LOCKPARENT 0x0008 /* want parent vnode returned */ #define 
WANTPARENT 0x0010 /* want parent vnode returned */ + +#ifdef KERNEL_PRIVATE +#define CN_SECLUDE_RENAME 0x10000000 /*rename iff ¬(hard-linked ∨ opened ∨ mmaped)*/ +#define CN_RAW_ENCRYPTED 0x80000000 /* Look-up is for RO raw encrypted access. */ #endif +#endif // KERNEL #ifdef BSD_KERNEL_PRIVATE @@ -178,16 +183,13 @@ struct nameidata { #if NAMEDRSRCFORK #define CN_WANTSRSRCFORK 0x04000000 #define CN_ALLOWRSRCFORK 0x08000000 -#endif -#if CONFIG_SECLUDED_RENAME -#ifdef BSD_KERNEL_PRIVATE -#define CN_SECLUDE_RENAME 0x10000000 /*rename iff ¬(hard-linked ∨ opened ∨ mmaped)*/ -#endif -#endif +#endif // NAMEDRSRCFORK +// CN_SECLUDE_RENAME is defined above as 0x10000000 (SPI) #define CN_NBMOUNTLOOK 0x20000000 /* do not block for cross mount lookups */ #ifdef BSD_KERNEL_PRIVATE #define CN_SKIPNAMECACHE 0x40000000 /* skip cache during lookup(), allow FS to handle all components */ #endif +// CN_RAW_ENCRYPTED is defined above as 0x80000000 (SPI) /* * Initialization of an nameidata structure. @@ -236,12 +238,12 @@ struct nameidata { */ struct namecache { TAILQ_ENTRY(namecache) nc_entry; /* chain of all entries */ - LIST_ENTRY(namecache) nc_hash; /* hash chain */ - LIST_ENTRY(namecache) nc_child; /* chain of ncp's that are children of a vp */ + TAILQ_ENTRY(namecache) nc_child; /* chain of ncp's that are children of a vp */ union { LIST_ENTRY(namecache) nc_link; /* chain of ncp's that 'name' a vp */ TAILQ_ENTRY(namecache) nc_negentry; /* chain of ncp's that 'name' a vp */ } nc_un; + LIST_ENTRY(namecache) nc_hash; /* hash chain */ vnode_t nc_dvp; /* vnode of parent of name */ vnode_t nc_vp; /* vnode the name refers to */ unsigned int nc_hashval; /* hashval of stringname */ diff --git a/bsd/sys/pgo.h b/bsd/sys/pgo.h index 8f7909b82..167b212fa 100644 --- a/bsd/sys/pgo.h +++ b/bsd/sys/pgo.h @@ -37,8 +37,9 @@ #define PGO_HIB (1) #define PGO_WAIT_FOR_UNLOAD (2) #define PGO_METADATA (4) +#define PGO_RESET_ALL (8) -#define PGO_ALL_FLAGS (PGO_HIB | PGO_WAIT_FOR_UNLOAD | PGO_METADATA) 
+#define PGO_ALL_FLAGS (PGO_HIB | PGO_WAIT_FOR_UNLOAD | PGO_METADATA | PGO_RESET_ALL) /** diff --git a/bsd/sys/priv.h b/bsd/sys/priv.h index fe42c655d..1e17a1fdf 100644 --- a/bsd/sys/priv.h +++ b/bsd/sys/priv.h @@ -2,7 +2,7 @@ * Copyright (c) 2010-2014 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /*- @@ -77,15 +77,20 @@ * privileges, such as the ability to reboot, and then loosely by * subsystem, indicated by a subsystem name. */ -#define PRIV_ADJTIME 1000 /* Set time adjustment. */ -#define PRIV_PROC_UUID_POLICY 1001 /* Change process uuid policy table. */ -#define PRIV_GLOBAL_PROC_INFO 1002 /* Query information for processes owned by other users */ -#define PRIV_SYSTEM_OVERRIDE 1003 /* Override global system settings for various subsystems for a limited duration/system-mode */ -#define PRIV_HW_DEBUG_DATA 1004 /* Extract hw-specific debug data (e.g. ECC data) */ +#define PRIV_ADJTIME 1000 /* Set time adjustment. 
*/ +#define PRIV_PROC_UUID_POLICY 1001 /* Change process uuid policy table. */ +#define PRIV_GLOBAL_PROC_INFO 1002 /* Query information for processes owned by other users */ +#define PRIV_SYSTEM_OVERRIDE 1003 /* Override global system settings for various subsystems for a limited duration/system-mode */ +#define PRIV_HW_DEBUG_DATA 1004 /* Extract hw-specific debug data (e.g. ECC data) */ #define PRIV_SELECTIVE_FORCED_IDLE 1005 /* Configure and control Selective Forced Idle (SFI) subsystem */ -#define PRIV_PROC_TRACE_INSPECT 1006 /* Request trace memory of arbitrary process to be inspected */ -#define PRIV_DARKBOOT 1007 /* Manipulate the darkboot flag */ -#define PRIV_WORK_INTERVAL 1008 /* Express details about a work interval */ +#define PRIV_PROC_TRACE_INSPECT 1006 /* Request trace memory of arbitrary process to be inspected */ +#define PRIV_DARKBOOT 1007 /* Manipulate the darkboot flag */ +#define PRIV_WORK_INTERVAL 1008 /* Express details about a work interval */ +#define PRIV_SMB_TIMEMACHINE_CONTROL 1009 /* Control Time Machine properties of an SMB share */ +#define PRIV_AUDIO_LATENCY 1010 /* set audio latency requirements for background tracing */ +#define PRIV_KTRACE_BACKGROUND 1011 /* Operate ktrace in the background */ +#define PRIV_SETPRIORITY_DARWIN_ROLE 1012 /* Allow setpriority(PRIO_DARWIN_ROLE) */ +#define PRIV_PACKAGE_EXTENSIONS 1013 /* Push package extension list used by vn_path_package_check() */ /* * Virtual memory privileges. @@ -97,24 +102,27 @@ /* * Network stack privileges. */ -#define PRIV_NET_PRIVILEGED_TRAFFIC_CLASS 10000 /* Set SO_PRIVILEGED_TRAFFIC_CLASS. */ +#define PRIV_NET_PRIVILEGED_TRAFFIC_CLASS 10000 /* Set SO_PRIVILEGED_TRAFFIC_CLASS. */ #define PRIV_NET_PRIVILEGED_SOCKET_DELEGATE 10001 /* Set delegate on a socket */ #define PRIV_NET_INTERFACE_CONTROL 10002 /* Enable interface debug logging. 
*/ #define PRIV_NET_PRIVILEGED_NETWORK_STATISTICS 10003 /* Access to all sockets */ #define PRIV_NET_PRIVILEGED_NECP_POLICIES 10004 /* Access to privileged Network Extension policies */ #define PRIV_NET_RESTRICTED_AWDL 10005 /* Access to restricted AWDL mode */ #define PRIV_NET_PRIVILEGED_NECP_MATCH 10006 /* Privilege verified by Network Extension policies */ - +#define PRIV_NET_QOSMARKING_POLICY_OVERRIDE 10007 /* Privilege verified by Network Extension policies */ +#define PRIV_NET_RESTRICTED_INTCOPROC 10008 /* Access to internal co-processor network interfaces */ /* * IPv4 and IPv6 privileges. */ #define PRIV_NETINET_RESERVEDPORT 11000 /* Bind low port number. */ + /* * VFS privileges */ #define PRIV_VFS_OPEN_BY_ID 14000 /* Allow calling openbyid_np() */ #define PRIV_VFS_MOVE_DATA_EXTENTS 14001 /* Allow F_MOVEDATAEXTENTS fcntl */ +#define PRIV_VFS_SNAPSHOT 14002 /* Allow calling fs_snapshot_*() */ #ifdef KERNEL /* diff --git a/bsd/sys/proc.h b/bsd/sys/proc.h index e8b0cc6c3..279e9670c 100644 --- a/bsd/sys/proc.h +++ b/bsd/sys/proc.h @@ -1,8 +1,8 @@ /* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* Copyright (c) 1995, 1997 Apple Computer, Inc. All Rights Reserved */ @@ -86,7 +86,7 @@ #include /* COALITION_NUM_TYPES */ #endif -#if defined(XNU_KERNEL_PRIVATE) || !defined(KERNEL) +#if defined(XNU_KERNEL_PRIVATE) || !defined(KERNEL) struct session; struct pgrp; @@ -216,7 +216,7 @@ struct extern_proc { #define P_DIRTY_TERMINATED 0x00000020 /* process has been marked for termination */ #define P_DIRTY_BUSY 0x00000040 /* serialization flag */ #define P_DIRTY_MARKED 0x00000080 /* marked dirty previously */ -#define P_DIRTY_DEFER_IN_PROGRESS 0x00000100 /* deferral to idle-band in process */ +#define P_DIRTY_AGING_IN_PROGRESS 0x00000100 /* aging in one of the 'aging bands' */ #define P_DIRTY_LAUNCH_IN_PROGRESS 0x00000200 /* launch is in progress */ #define P_DIRTY_IS_DIRTY (P_DIRTY | P_DIRTY_SHUTDOWN) @@ -249,9 +249,9 @@ extern void proc_signal(int pid, int signum); extern int proc_issignal(int pid, sigset_t mask); /* this routine returns 1 if the pid1 is inferior of pid2 */ extern int proc_isinferior(int pid1, int pid2); -/* this routine copies the process's name of the executable to the passed in buffer. It - * is always null terminated. The size of the buffer is to be passed in as well. This - * routine is to be used typically for debugging +/* this routine copies the process's name of the executable to the passed in buffer. It + * is always null terminated. The size of the buffer is to be passed in as well. 
This + * routine is to be used typically for debugging */ void proc_name(int pid, char * buf, int size); /* This routine is simillar to proc_name except it returns for current process */ @@ -298,7 +298,7 @@ pid_t proc_selfpgrpid(void); @param p Process whose pgrpid to grab. @return pgrpid for "p". */ -pid_t proc_pgrpid(proc_t); +pid_t proc_pgrpid(proc_t p); #ifdef KERNEL_PRIVATE // mark a process as being allowed to call vfs_markdependency() @@ -315,29 +315,32 @@ extern uint32_t proc_getuid(proc_t); extern uint32_t proc_getgid(proc_t); extern int proc_getcdhash(proc_t, unsigned char *); -/*! +/*! @function proc_pidbackgrounded @abstract KPI to determine if a process is currently backgrounded. - @discussion The process may move into or out of background state at any time, - so be prepared for this value to be outdated immediately. + @discussion The process may move into or out of background state at any time, + so be prepared for this value to be outdated immediately. @param pid PID of the process to be queried. @param state Pointer to a value which will be set to 1 if the process - is currently backgrounded, 0 otherwise. + is currently backgrounded, 0 otherwise. @return ESRCH if pid cannot be found or has started exiting. EINVAL if state is NULL. */ extern int proc_pidbackgrounded(pid_t pid, uint32_t* state); -/* - * This returns an unique 64bit id of a given process. - * Caller needs to hold proper reference on the +/* + * This returns an unique 64bit id of a given process. + * Caller needs to hold proper reference on the * passed in process strucutre. 
*/ extern uint64_t proc_uniqueid(proc_t); extern void proc_set_responsible_pid(proc_t target_proc, pid_t responsible_pid); +/* return 1 if process is forcing case-sensitive HFS+ access, 0 for default */ +extern int proc_is_forcing_hfs_case_sensitivity(proc_t); + #endif /* KERNEL_PRIVATE */ #ifdef XNU_KERNEL_PRIVATE @@ -369,11 +372,14 @@ __END_DECLS #ifdef PRIVATE /* Values for pid_shutdown_sockets */ +#define SHUTDOWN_SOCKET_LEVEL_DISCONNECT_SVC 0x00000001 +#define SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL 0x00000002 + #ifdef KERNEL -#define SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL 0x0 -#endif /* KERNEL */ -#define SHUTDOWN_SOCKET_LEVEL_DISCONNECT_SVC 0x1 -#define SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL 0x2 +#define SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL 0x10000000 +#define SHUTDOWN_SOCKET_LEVEL_NECP 0x20000000 +#define SHUTDOWN_SOCKET_LEVEL_CONTENT_FILTER 0x40000000 +#endif #ifndef KERNEL diff --git a/bsd/sys/proc_info.h b/bsd/sys/proc_info.h index afd022407..8f22d8007 100644 --- a/bsd/sys/proc_info.h +++ b/bsd/sys/proc_info.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2005 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2005-2016 Apple Computer, Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -43,6 +43,7 @@ #include #include #include +#include #ifdef PRIVATE #include /* COALITION_NUM_TYPES */ @@ -283,9 +284,9 @@ struct proc_workqueueinfo { /* * workqueue state (pwq_state field) */ -#define WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT 0x1 -#define WQ_EXCEEDED_TOTAL_THREAD_LIMIT 0x2 - +#define WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT 0x1 +#define WQ_EXCEEDED_TOTAL_THREAD_LIMIT 0x2 +#define WQ_FLAGS_AVAILABLE 0x4 struct proc_fileinfo { uint32_t fi_openflags; @@ -306,6 +307,21 @@ struct proc_fileinfo { #define PROC_FI_GUARD_SOCKET_IPC (1u << 2) #define PROC_FI_GUARD_FILEPORT (1u << 3) +struct proc_exitreasonbasicinfo { + uint32_t beri_namespace; + uint64_t beri_code; + uint64_t beri_flags; + uint32_t beri_reason_buf_size; +} __attribute__((packed)); + +struct proc_exitreasoninfo { + uint32_t eri_namespace; + uint64_t eri_code; + uint64_t eri_flags; + uint32_t eri_reason_buf_size; + uint64_t eri_kcd_buf; +} __attribute__((packed)); + /* * A copy of stat64 with static sized fields. 
*/ @@ -662,6 +678,7 @@ struct appletalk_fdinfo { #define PROX_FDTYPE_KQUEUE 5 #define PROX_FDTYPE_PIPE 6 #define PROX_FDTYPE_FSEVENTS 7 +#define PROX_FDTYPE_NETPOLICY 9 struct proc_fdinfo { int32_t proc_fd; @@ -673,6 +690,7 @@ struct proc_fileportinfo { uint32_t proc_fdtype; }; + /* Flavors for proc_pidinfo() */ #define PROC_PIDLISTFDS 1 #define PROC_PIDLISTFD_SIZE (sizeof(struct proc_fdinfo)) @@ -749,6 +767,12 @@ struct proc_fileportinfo { #define PROC_PIDREGIONPATHINFO3 23 #define PROC_PIDREGIONPATHINFO3_SIZE (sizeof(struct proc_regionwithpathinfo)) +#define PROC_PIDEXITREASONINFO 24 +#define PROC_PIDEXITREASONINFO_SIZE (sizeof(struct proc_exitreasoninfo)) + +#define PROC_PIDEXITREASONBASICINFO 25 +#define PROC_PIDEXITREASONBASICINFOSIZE (sizeof(struct proc_exitreasonbasicinfo)) + #endif /* Flavors for proc_pidfdinfo */ @@ -780,8 +804,10 @@ struct proc_fileportinfo { #ifdef PRIVATE #define PROC_PIDFDKQUEUE_EXTINFO 9 #define PROC_PIDFDKQUEUE_EXTINFO_SIZE (sizeof(struct kevent_extinfo)) +#define PROC_PIDFDKQUEUE_KNOTES_MAX (1024 * 128) #endif /* PRIVATE */ + /* Flavors for proc_pidfileportinfo */ #define PROC_PIDFILEPORTVNODEPATHINFO 2 /* out: vnode_fdinfowithpath */ @@ -891,8 +917,15 @@ extern int fill_pshminfo(struct pshmnode * pshm, struct pshm_info * pinfo); extern int fill_pseminfo(struct psemnode * psem, struct psem_info * pinfo); extern int fill_pipeinfo(struct pipe * cpipe, struct pipe_info * pinfo); extern int fill_kqueueinfo(struct kqueue * kq, struct kqueue_info * kinfo); -extern int pid_kqueue_extinfo(proc_t, struct kqueue * kq, user_addr_t buffer, uint32_t buffersize, int32_t * retval); +extern int pid_kqueue_extinfo(proc_t, struct kqueue * kq, user_addr_t buffer, + uint32_t buffersize, int32_t * retval); +extern int pid_kqueue_udatainfo(proc_t p, struct kqueue *kq, uint64_t *buf, + uint32_t bufsize); extern int fill_procworkqueue(proc_t, struct proc_workqueueinfo *); +extern boolean_t workqueue_get_pwq_exceeded(void *v, boolean_t *exceeded_total, + 
boolean_t *exceeded_constrained); +extern uint32_t workqueue_get_pwq_state_kdp(void *proc); + #endif /* XNU_KERNEL_PRIVATE */ __END_DECLS diff --git a/bsd/sys/proc_internal.h b/bsd/sys/proc_internal.h index 5e7a3750a..6c00e20e0 100644 --- a/bsd/sys/proc_internal.h +++ b/bsd/sys/proc_internal.h @@ -77,6 +77,9 @@ #include #include +#include // command/proc_name_t + + __BEGIN_DECLS #include #if PSYNCH @@ -319,10 +322,12 @@ struct proc { int p_mac_enforce; /* MAC policy enforcement control */ #endif - char p_comm[MAXCOMLEN+1]; - char p_name[(2*MAXCOMLEN)+1]; /* PL */ + // types currently in sys/param.h + command_t p_comm; + proc_name_t p_name; /* can be changed by the process */ - struct pgrp *p_pgrp; /* Pointer to process group. (LL) */ + + struct pgrp *p_pgrp; /* Pointer to process group. (LL) */ uint32_t p_csflags; /* flags for codesign (PL) */ uint32_t p_pcaction; /* action for process control on starvation */ uint8_t p_uuid[16]; /* from LC_UUID load command */ @@ -357,12 +362,8 @@ struct proc { user_addr_t p_wqthread; /* pthread workqueue fn */ int p_pthsize; /* pthread size */ uint32_t p_pth_tsd_offset; /* offset from pthread_t to TSD for new threads */ - user_addr_t p_targconc; /* target concurrency ptr */ user_addr_t p_stack_addr_hint; /* stack allocation hint for wq threads */ void * p_wqptr; /* workq ptr */ - int p_wqsize; /* allocated size */ - boolean_t p_wqiniting; /* semaphore to serialze wq_open */ - lck_spin_t p_wqlock; /* lock to protect work queue */ struct kqueue * p_wqkqueue; /* private workq kqueue */ struct timeval p_start; /* starting time */ @@ -396,11 +397,11 @@ struct proc { uint32_t p_memstat_dirty; /* dirty state */ uint64_t p_memstat_userdata; /* user state */ uint64_t p_memstat_idledeadline; /* time at which process became clean */ -#if CONFIG_JETSAM + uint64_t p_memstat_idle_start; /* abstime process transitions into the idle band */ + uint64_t p_memstat_idle_delta; /* abstime delta spent in idle band */ int32_t p_memstat_memlimit; /* 
cached memory limit, toggles between active and inactive limits */ int32_t p_memstat_memlimit_active; /* memory limit enforced when process is in active jetsam state */ int32_t p_memstat_memlimit_inactive; /* memory limit enforced when process is in inactive jetsam state */ -#endif #if CONFIG_FREEZE uint32_t p_memstat_suspendedfootprint; /* footprint at time of suspensions */ #endif /* CONFIG_FREEZE */ @@ -408,6 +409,8 @@ struct proc { /* cached proc-specific data required for corpse inspection */ pid_t p_responsible_pid; /* pid resonsible for this process */ + + struct os_reason *p_exit_reason; }; #define PGRPID_DEAD 0xdeaddead @@ -492,27 +495,11 @@ struct proc { /* p_vfs_iopolicy flags */ #define P_VFS_IOPOLICY_FORCE_HFS_CASE_SENSITIVITY 0x0001 -/* defns for proc_iterate */ -#define PROC_ALLPROCLIST 1 /* walk the allproc list (procs not exited yet) */ -#define PROC_ZOMBPROCLIST 2 /* walk the zombie list */ -#define PROC_NOWAITTRANS 4 /* do not wait for transitions (checkdirs only) */ - -/* defns for pgrp_iterate */ -#define PGRP_DROPREF 1 -#define PGRP_BLOCKITERATE 2 - -/* return values of the proc iteration callback routine */ -#define PROC_RETURNED 0 -#define PROC_RETURNED_DONE 1 -#define PROC_CLAIMED 2 -#define PROC_CLAIMED_DONE 3 - /* process creation arguments */ #define PROC_CREATE_FORK 0 /* independent child (running) */ #define PROC_CREATE_SPAWN 1 /* independent child (suspended) */ #define PROC_CREATE_VFORK 2 /* child borrows context */ - /* LP64 version of extern_proc. all pointers * grow when we're dealing with a 64-bit process. * WARNING - keep in sync with extern_proc @@ -677,6 +664,7 @@ extern lck_attr_t * proc_lck_attr; LIST_HEAD(proclist, proc); extern struct proclist allproc; /* List of all processes. */ extern struct proclist zombproc; /* List of zombie processes. 
*/ + extern struct proc *initproc; extern void procinit(void); extern void proc_lock(struct proc *); @@ -717,13 +705,11 @@ extern int msleep0(void *chan, lck_mtx_t *mtx, int pri, const char *wmesg, int t extern void vfork_return(struct proc *child, int32_t *retval, int rval); extern int exit1(struct proc *, int, int *); extern int exit1_internal(struct proc *, int, int *, boolean_t, boolean_t, int); +extern int exit_with_reason(struct proc *, int, int *, boolean_t, boolean_t, int, struct os_reason *); extern int fork1(proc_t, thread_t *, int, coalition_t *); extern void vfork_exit_internal(struct proc *p, int rv, int forced); extern void proc_reparentlocked(struct proc *child, struct proc * newparent, int cansignal, int locked); -extern int pgrp_iterate(struct pgrp * pgrp, int flags, int (*callout)(proc_t , void *), void *arg, int (*filterfn)(proc_t , void *), void *filterarg); -extern int proc_iterate(int flags, int (*callout)(proc_t , void *), void *arg, int (*filterfn)(proc_t , void *), void *filterarg); -extern int proc_rebootscan(int (*callout)(proc_t , void *), void *arg, int (*filterfn)(proc_t , void *), void *filterarg); -extern int proc_childrenwalk(proc_t p, int (*callout)(proc_t , void *), void *arg); + extern proc_t proc_findinternal(int pid, int locked); extern proc_t proc_findthread(thread_t thread); extern void proc_refdrain(proc_t); @@ -751,6 +737,7 @@ extern proc_t proc_parentholdref(proc_t); extern int proc_parentdropref(proc_t, int); int itimerfix(struct timeval *tv); int itimerdecr(struct proc * p, struct itimerval *itp, int usec); +void proc_free_realitimer(proc_t proc); int timespec_is_valid(const struct timespec *); void proc_signalstart(struct proc *, int locked); void proc_signalend(struct proc *, int locked); @@ -762,8 +749,6 @@ void proc_rele_locked(struct proc * p); struct proc *proc_ref_locked(struct proc * p); void proc_knote(struct proc * p, long hint); void proc_knote_drain(struct proc *p); -void workqueue_init_lock(proc_t p); -void 
workqueue_destroy_lock(proc_t p); void proc_setregister(proc_t p); void proc_resetregister(proc_t p); /* returns the first thread_t in the process, or NULL XXX for NFS, DO NOT USE */ @@ -786,10 +771,79 @@ void proc_set_return_wait(struct proc *); void proc_clear_return_wait(proc_t p, thread_t child_thread); void proc_wait_to_return(void); -/* return 1 if process is forcing case-sensitive HFS+ access, 0 for default */ -extern int proc_is_forcing_hfs_case_sensitivity(proc_t); +/* process iteration */ + +#define ALLPROC_FOREACH(var) \ + LIST_FOREACH((var), &allproc, p_list) + +#define ZOMBPROC_FOREACH(var) \ + LIST_FOREACH((var), &zombproc, p_list) + +#define PGMEMBERS_FOREACH(group, var) \ + LIST_FOREACH((var), &((struct pgrp *)(group))->pg_members, p_pglist) + +#define PCHILDREN_FOREACH(parent, var) \ + LIST_FOREACH((var), &(((struct proc *)(parent))->p_children), p_sibling) + +typedef int (*proc_iterate_fn_t)(proc_t, void *); + +/* + * These are the only valid return values of `callout` functions provided to + * process iterators. + * + * CLAIMED returns expect the caller to call proc_rele on the proc. DONE + * returns stop iterating processes early. + */ +#define PROC_RETURNED (0) +#define PROC_RETURNED_DONE (1) +#define PROC_CLAIMED (2) +#define PROC_CLAIMED_DONE (3) + +/* + * pgrp_iterate walks the provided process group, calling `filterfn` with + * `filterarg` for each process. For processes where `filterfn` returned + * non-zero, `callout` is called with `arg`. If `PGRP_DROPREF` is supplied in + * `flags`, a reference will be dropped from the process group after obtaining + * the list of processes to call `callout` on. + * + * `PGMEMBERS_FOREACH` might also be used under the pgrp_lock to achieve a + * similar effect. 
+ */ +#define PGRP_DROPREF (1) + +extern int pgrp_iterate(struct pgrp *pgrp, unsigned int flags, proc_iterate_fn_t callout, void *arg, proc_iterate_fn_t filterfn, void *filterarg); + +/* + * proc_iterate walks the `allproc` and/or `zombproc` lists, calling `filterfn` + * with `filterarg` for each process. For processes where `filterfn` returned + * non-zero, `callout` is called with `arg`. If the `PROC_NOWAITTRANS` flag is + * set, this function does not wait for transitions. + * + * `ALLPROC_FOREACH` or `ZOMBPROC_FOREACH` might also be used under the + * `proc_list_lock` to achieve a similar effect. + */ +#define PROC_ALLPROCLIST (1U << 0) /* walk the allproc list (processes not yet exited) */ +#define PROC_ZOMBPROCLIST (1U << 1) /* walk the zombie list */ +#define PROC_NOWAITTRANS (1U << 2) /* do not wait for transitions (checkdirs only) */ + +extern int proc_iterate(unsigned int flags, proc_iterate_fn_t callout, void *arg, proc_iterate_fn_t filterfn, void *filterarg); + +/* + * proc_childrenwalk walks the children of process `p`, calling `callout` for + * each one. + * + * `PCHILDREN_FOREACH` might also be used under the `proc_list_lock` to achieve + * a similar effect. 
+ */ +extern int proc_childrenwalk(proc_t p, proc_iterate_fn_t callout, void *arg); + +/* + * proc_rebootscan should only be used by kern_shutdown.c + */ +extern void proc_rebootscan(proc_iterate_fn_t callout, void *arg, proc_iterate_fn_t filterfn, void *filterarg); pid_t dtrace_proc_selfpid(void); pid_t dtrace_proc_selfppid(void); uid_t dtrace_proc_selfruid(void); + #endif /* !_SYS_PROC_INTERNAL_H_ */ diff --git a/bsd/sys/proc_uuid_policy.h b/bsd/sys/proc_uuid_policy.h index 9838993b3..fa41fc60c 100644 --- a/bsd/sys/proc_uuid_policy.h +++ b/bsd/sys/proc_uuid_policy.h @@ -55,6 +55,7 @@ __BEGIN_DECLS #define PROC_UUID_POLICY_FLAGS_NONE 0x00000000 #define PROC_UUID_NO_CELLULAR 0x00000001 #define PROC_UUID_NECP_APP_POLICY 0x00000002 +#define PROC_UUID_ALT_DYLD_POLICY 0x00000004 /* To be removed, replaced by PROC_UUID_NECP_APP_POLICY */ #define PROC_UUID_FLOW_DIVERT 0x00000002 diff --git a/bsd/sys/pthread_internal.h b/bsd/sys/pthread_internal.h index 4b0c21282..634470f89 100644 --- a/bsd/sys/pthread_internal.h +++ b/bsd/sys/pthread_internal.h @@ -43,6 +43,8 @@ struct ksyn_waitq_element { void workqueue_mark_exiting(struct proc *); void workqueue_exit(struct proc *); void pthread_init(void); +int thread_qos_from_pthread_priority(unsigned long, unsigned long *); +unsigned long pthread_priority_canonicalize(unsigned long priority, boolean_t propagation); #endif /* _SYS_PTHREAD_INTERNAL_H_ */ diff --git a/bsd/sys/pthread_shims.h b/bsd/sys/pthread_shims.h index 872173b09..c1f1eb19c 100644 --- a/bsd/sys/pthread_shims.h +++ b/bsd/sys/pthread_shims.h @@ -59,14 +59,17 @@ typedef struct workq_reqthreads_req_s {unsigned long priority; int count;} *work */ #define PTHREAD_FUNCTIONS_TABLE_VERSION 1 -typedef struct pthread_functions_s { +typedef const struct pthread_functions_s { int version; /* internal calls, kernel core -> kext */ void (*pthread_init)(void); int (*fill_procworkqueue)(proc_t p, struct proc_workqueueinfo * pwqinfo); + + // UNUSED - TO BE DELETED void 
(*workqueue_init_lock)(proc_t p); void (*workqueue_destroy_lock)(proc_t p); + void (*workqueue_exit)(struct proc *p); void (*workqueue_mark_exiting)(struct proc *p); void (*workqueue_thread_yielded)(void); @@ -108,11 +111,17 @@ typedef struct pthread_functions_s { /* Resolve a pthread_priority_t to a QoS/relative pri */ integer_t (*thread_qos_from_pthread_priority)(unsigned long pthread_priority, unsigned long *flags); + /* try to get wq flags in debugger context */ + uint32_t (*get_pwq_state_kdp)(proc_t p); + + unsigned long (*pthread_priority_canonicalize)(unsigned long pthread_priority); + unsigned long (*pthread_priority_canonicalize2)(unsigned long pthread_priority, boolean_t propagation); + /* padding for future */ - void* _pad[95]; -} *pthread_functions_t; + void * _pad[92]; +} * pthread_functions_t; -typedef struct pthread_callbacks_s { +typedef const struct pthread_callbacks_s { int version; /* config information */ @@ -129,16 +138,16 @@ typedef struct pthread_callbacks_s { void (*proc_set_wqthread)(struct proc *t, user_addr_t addr); int (*proc_get_pthsize)(struct proc *t); void (*proc_set_pthsize)(struct proc *t, int size); - user_addr_t (*proc_get_targconc)(struct proc *t); - void (*proc_set_targconc)(struct proc *t, user_addr_t addr); + void *unused_was_proc_get_targconc; + void *unused_was_proc_set_targconc; uint64_t (*proc_get_dispatchqueue_offset)(struct proc *t); void (*proc_set_dispatchqueue_offset)(struct proc *t, uint64_t offset); - lck_spin_t* (*proc_get_wqlockptr)(struct proc *t); - boolean_t* (*proc_get_wqinitingptr)(struct proc *t); + void *unused_was_proc_get_wqlockptr; + void *unused_was_proc_get_wqinitingptr; void* (*proc_get_wqptr)(struct proc *t); void (*proc_set_wqptr)(struct proc *t, void* ptr); - int (*proc_get_wqsize)(struct proc *t); - void (*proc_set_wqsize)(struct proc *t, int sz); + void *unused_was_proc_get_wqsize; + void *unused_was_proc_set_wqsize; void (*proc_lock)(struct proc *t); void (*proc_unlock)(struct proc *t); task_t 
(*proc_get_task)(struct proc *t); @@ -174,9 +183,8 @@ typedef struct pthread_callbacks_s { /* kern/clock.h */ void (*absolutetime_to_microtime)(uint64_t abstime, clock_sec_t *secs, clock_usec_t *microsecs); - /* osfmk/kern/task.h */ - int (*proc_restore_workq_bgthreadpolicy)(thread_t t); - int (*proc_apply_workq_bgthreadpolicy)(thread_t t); + kern_return_t (*thread_set_workq_pri)(thread_t thread, integer_t priority, integer_t policy); + kern_return_t (*thread_set_workq_qos)(thread_t thread, int qos_tier, int relprio); /* osfmk/kern/thread.h */ struct uthread* (*get_bsdthread_info)(thread_t th); @@ -221,8 +229,9 @@ typedef struct pthread_callbacks_s { uint64_t (*proc_get_dispatchqueue_serialno_offset)(struct proc *p); void (*proc_set_dispatchqueue_serialno_offset)(struct proc *p, uint64_t offset); - user_addr_t (*proc_get_stack_addr_hint)(struct proc *p); - void (*proc_set_stack_addr_hint)(struct proc *p, user_addr_t stack_addr_hint); + int (*proc_usynch_thread_qos_add_override_for_resource_check_owner)(thread_t thread, int override_qos, boolean_t first_override_for_resource, + user_addr_t resource, int resource_type, user_addr_t user_lock_addr, mach_port_name_t user_lock_owner); + void *unused_was_proc_set_stack_addr_hint; uint32_t (*proc_get_pthread_tsd_offset)(struct proc *p); void (*proc_set_pthread_tsd_offset)(struct proc *p, uint32_t pthread_tsd_offset); @@ -230,8 +239,8 @@ typedef struct pthread_callbacks_s { kern_return_t (*thread_set_tsd_base)(thread_t thread, mach_vm_offset_t tsd_base); int (*proc_usynch_get_requested_thread_qos)(struct uthread *); - boolean_t (*proc_usynch_thread_qos_add_override)(struct uthread *, uint64_t tid, int override_qos, boolean_t first_override_for_resource); - boolean_t (*proc_usynch_thread_qos_remove_override)(struct uthread *, uint64_t tid); + void *unused_was_proc_usynch_thread_qos_add_override; + void *unused_was_proc_usynch_thread_qos_remove_override; kern_return_t (*thread_policy_get)(thread_t t, thread_policy_flavor_t 
flavor, thread_policy_t info, mach_msg_type_number_t *count, boolean_t *get_default); boolean_t (*qos_main_thread_active)(void); @@ -242,8 +251,21 @@ typedef struct pthread_callbacks_s { boolean_t (*proc_usynch_thread_qos_remove_override_for_resource)(task_t task, struct uthread *, uint64_t tid, user_addr_t resource, int resource_type); boolean_t (*proc_usynch_thread_qos_reset_override_for_resource)(task_t task, struct uthread *, uint64_t tid, user_addr_t resource, int resource_type); + boolean_t (*proc_init_wqptr_or_wait)(proc_t proc); + + uint16_t (*thread_set_tag)(thread_t thread, uint16_t tag); + uint16_t (*thread_get_tag)(thread_t thread); + + int (*proc_usynch_thread_qos_squash_override_for_resource)(thread_t thread, user_addr_t resource, int resource_type); + int (*task_get_default_manager_qos)(task_t task); + + int (*thread_create_workq_waiting)(task_t task, thread_continue_t thread_return, event_t event, thread_t *new_thread); + + user_addr_t (*proc_get_stack_addr_hint)(struct proc *p); + void (*proc_set_stack_addr_hint)(struct proc *p, user_addr_t stack_addr_hint); + /* padding for future */ - void* _pad[84]; + void* _pad[76]; } *pthread_callbacks_t; diff --git a/bsd/sys/quota.h b/bsd/sys/quota.h index 3693ff592..35cb6b70a 100644 --- a/bsd/sys/quota.h +++ b/bsd/sys/quota.h @@ -340,7 +340,6 @@ struct dquot { #define CHOWN 0x02 /* (advisory) change initiated by chown */ -#ifdef XNU_KERNEL_PRIVATE /* * Functions that manage the in-core dquot and the * on-disk dqblk data structures. 
@@ -367,7 +366,6 @@ void qf_put(struct quotafile *, int type); __private_extern__ void munge_dqblk(struct dqblk *dqblkp, struct user_dqblk *user_dqblkp, boolean_t to64); __END_DECLS -#endif /* XNU_KERNEL_PRIVATE */ #endif /* KERNEL_PRIVATE */ diff --git a/bsd/sys/random.h b/bsd/sys/random.h index 9b77c2585..858641893 100644 --- a/bsd/sys/random.h +++ b/bsd/sys/random.h @@ -32,6 +32,13 @@ #include #include +#ifndef KERNEL +__BEGIN_DECLS + +int getentropy(void* buffer, size_t size); +__END_DECLS + +#else /* KERNEL */ #ifdef __APPLE_API_UNSTABLE __BEGIN_DECLS void read_random(void* buffer, u_int numBytes); @@ -40,5 +47,6 @@ int write_random(void* buffer, u_int numBytes); __END_DECLS #endif /* __APPLE_API_UNSTABLE */ +#endif /* KERNEL */ #endif /* __SYS_RANDOM_H__ */ diff --git a/bsd/sys/reason.h b/bsd/sys/reason.h new file mode 100644 index 000000000..13a49e3e6 --- /dev/null +++ b/bsd/sys/reason.h @@ -0,0 +1,226 @@ +/* + * Copyright (c) 2016 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _REASON_H_ +#define _REASON_H_ + +#include + +__BEGIN_DECLS + +#ifdef KERNEL + +#include + +#ifdef XNU_KERNEL_PRIVATE +#include + +typedef struct os_reason { + decl_lck_mtx_data(, osr_lock) + int osr_refcount; + uint32_t osr_namespace; + uint64_t osr_code; + uint64_t osr_flags; + uint32_t osr_bufsize; + struct kcdata_descriptor osr_kcd_descriptor; + char *osr_kcd_buf; +} *os_reason_t; + +#define OS_REASON_NULL ((os_reason_t) 0) + +/* We only include 800 bytes of the exit reason description to not blow through the panic buffer */ +#define LAUNCHD_PANIC_REASON_STRING_MAXLEN "800" + +void os_reason_init(void); + +os_reason_t build_userspace_exit_reason(uint32_t reason_namespace, uint64_t reason_code, user_addr_t payload, uint32_t payload_size, + user_addr_t reason_string, uint64_t reason_flags); +char *launchd_exit_reason_get_string_desc(os_reason_t exit_reason); + +#else /* XNU_KERNEL_PRIVATE */ + +typedef void * os_reason_t; + +#endif /* XNU_KERNEL_PRIVATE */ + +os_reason_t os_reason_create(uint32_t osr_namespace, uint64_t osr_code); +int os_reason_alloc_buffer(os_reason_t cur_reason, uint32_t osr_bufsize); +struct kcdata_descriptor * os_reason_get_kcdata_descriptor(os_reason_t cur_reason); +void os_reason_ref(os_reason_t cur_reason); +void os_reason_free(os_reason_t cur_reason); + +#endif /* KERNEL */ + +/* + * Reason namespaces. 
+ */ +#define OS_REASON_INVALID 0 +#define OS_REASON_JETSAM 1 +#define OS_REASON_SIGNAL 2 +#define OS_REASON_CODESIGNING 3 +#define OS_REASON_HANGTRACER 4 +#define OS_REASON_TEST 5 +#define OS_REASON_DYLD 6 +#define OS_REASON_LIBXPC 7 +#define OS_REASON_OBJC 8 +#define OS_REASON_EXEC 9 +#define OS_REASON_SPRINGBOARD 10 +#define OS_REASON_TCC 11 +#define OS_REASON_REPORTCRASH 12 +#define OS_REASON_COREANIMATION 13 +#define OS_REASON_AGGREGATED 14 + +/* + * Update whenever new OS_REASON namespaces are added. + */ +#define OS_REASON_MAX_VALID_NAMESPACE OS_REASON_AGGREGATED + +#define OS_REASON_BUFFER_MAX_SIZE 5120 + +#define OS_REASON_FLAG_NO_CRASH_REPORT 0x1 /* Don't create a crash report */ +#define OS_REASON_FLAG_GENERATE_CRASH_REPORT 0x2 /* Create a crash report - the default for userspace requests */ +#define OS_REASON_FLAG_FROM_USERSPACE 0x4 /* Reason created from a userspace syscall */ +#define OS_REASON_FLAG_FAILED_DATA_COPYIN 0x8 /* We failed to copyin data from userspace */ +#define OS_REASON_FLAG_PAYLOAD_TRUNCATED 0x10 /* The payload was truncated because it was longer than allowed */ +#define OS_REASON_FLAG_BAD_PARAMS 0x20 /* Invalid parameters were passed when creating this reason */ +#define OS_REASON_FLAG_CONSISTENT_FAILURE 0x40 /* Whatever caused this reason to be created will happen again */ +#define OS_REASON_FLAG_ONE_TIME_FAILURE 0x80 /* Whatever caused this reason to be created was a one time issue */ + +/* + * Set of flags that are allowed to be passed from userspace + */ +#define OS_REASON_FLAG_MASK_ALLOWED_FROM_USER (OS_REASON_FLAG_CONSISTENT_FAILURE | OS_REASON_FLAG_ONE_TIME_FAILURE | OS_REASON_FLAG_NO_CRASH_REPORT) + +/* + * Macros to encode the exit reason namespace and low 32 bits of the code in the exception code + * which is used by Report Crash as a hint. It should be only used as a hint since it + * loses the upper 32 bits of the exit reason code. 
+ */ +#define ENCODE_OSR_NAMESPACE_TO_MACH_EXCEPTION_CODE(code, osr_namespace) \ + (code) = (code) | (((osr_namespace) & ((uint64_t)UINT32_MAX)) << 32) +#define ENCODE_OSR_CODE_TO_MACH_EXCEPTION_CODE(code, osr_code) \ + (code) = (code) | ((osr_code) & ((uint64_t)UINT32_MAX)) + +#ifndef KERNEL +/* + * abort_with_reason: Used to exit the current process and pass along + * specific information about why it is being terminated. + * + * Inputs: args->reason_namespace - OS_REASON namespace specified for the reason + * args->reason_code - code in the specified namespace for the reason + * args->reason_string - additional string formatted information about the request + * args->reason_flags - options requested for how the process should be terminated (see OS_REASON_FLAG_* above). + * + * Outputs: Does not return. + */ +void abort_with_reason(uint32_t reason_namespace, uint64_t reason_code, const char *reason_string, uint64_t reason_flags) __attribute__((noreturn)); + +/* + * abort_with_payload: Used to exit the current process and pass along + * specific information about why it is being terminated. The payload pointer + * should point to structured data that can be interpreted by the consumer of + * exit reason information. + * + * Inputs: args->reason_namespace - OS_REASON namespace specified for the reason + * args->reason_code - code in the specified namespace for the reason + * args->payload - pointer to payload structure in user space + * args->payload_size - length of payload buffer (this will be truncated to EXIT_REASON_PAYLOAD_MAX_LEN) + * args->reason_string - additional string formatted information about the request + * args->reason_flags - options requested for how the process should be terminated (see OS_REASON_FLAG_* above). + * + * Outputs: Does not return. 
+ */ +void abort_with_payload(uint32_t reason_namespace, uint64_t reason_code, void *payload, uint32_t payload_size, const char *reason_string, + uint64_t reason_flags) __attribute__((noreturn)); + +/* + * terminate_with_reason: Used to terminate a specific process and pass along + * specific information about why it is being terminated. + * + * Inputs: args->pid - the PID of the process to be terminated + * args->reason_namespace - OS_REASON namespace specified for the reason + * args->reason_code - code in the specified namespace for the reason + * args->reason_string - additional string formatted information about the request + * args->reason_flags - options requested for how the process should be terminated (see OS_REASON_FLAG_* above) + * + * Outputs: EINVAL if the PID requested is the same as that of the calling process, invalid or the namespace provided is invalid. + * ESRCH if we couldn't find a live process with the requested PID + * EPERM if the caller is not privileged enough to kill the process with the requested PID + * returns 0 otherwise + */ +int terminate_with_reason(int pid, uint32_t reason_namespace, uint64_t reason_code, const char *reason_string, uint64_t reason_flags); + +/* + * terminate_with_payload: Used to terminate a specific process and pass along + * specific information about why it is being terminated. The payload pointer + * should point to structured data that can be interpreted by the consumer of + * exit reason information. + * + * Inputs: args->pid - the PID of the process to be terminated. 
+ * args->reason_namespace - OS_REASON namespace specified for the reason + * args->reason_code - code in the specified namespace for the reason + * args->payload - pointer to payload structure in user space + * args->payload_size - length of payload buffer (this will be truncated to EXIT_REASON_PAYLOAD_MAX_LEN) + * args->reason_string - additional string formatted information about the request + * args->reason_flags - options requested for how the process should be terminated (see OS_REASON_FLAG_* above) + * + * Outputs: EINVAL if the PID requested is the same as that of the calling process, is invalid or the namespace provided is invalid. + * ESRCH if we couldn't find a live process with the requested PID + * EPERM if the caller is not privileged enough to kill the process with the requested PID + * returns 0 otherwise + */ +int terminate_with_payload(int pid, uint32_t reason_namespace, uint64_t reason_code, void *payload, uint32_t payload_size, + const char *reason_string, uint64_t reason_flags); +#endif /* KERNEL */ + +/* + * codesigning exit reasons + */ +#define CODESIGNING_EXIT_REASON_TASKGATED_INVALID_SIG 1 +#define CODESIGNING_EXIT_REASON_INVALID_PAGE 2 +#define CODESIGNING_EXIT_REASON_TASK_ACCESS_PORT 3 + +/* + * exec path specific exit reasons + */ +#define EXEC_EXIT_REASON_BAD_MACHO 1 +#define EXEC_EXIT_REASON_SUGID_FAILURE 2 +#define EXEC_EXIT_REASON_ACTV_THREADSTATE 3 +#define EXEC_EXIT_REASON_STACK_ALLOC 4 +#define EXEC_EXIT_REASON_APPLE_STRING_INIT 5 +#define EXEC_EXIT_REASON_COPYOUT_STRINGS 6 +#define EXEC_EXIT_REASON_COPYOUT_DYNLINKER 7 +#define EXEC_EXIT_REASON_SECURITY_POLICY 8 +#define EXEC_EXIT_REASON_TASKGATED_OTHER 9 +#define EXEC_EXIT_REASON_FAIRPLAY_DECRYPT 10 +#define EXEC_EXIT_REASON_DECRYPT 11 +#define EXEC_EXIT_REASON_UPX 12 + +__END_DECLS + +#endif /* _REASON_H_ */ diff --git a/bsd/sys/resource.h b/bsd/sys/resource.h index 993907e54..1dfb214a5 100644 --- a/bsd/sys/resource.h +++ b/bsd/sys/resource.h @@ -433,6 +433,16 @@ struct 
proc_rlimit_control_wakeupmon { int32_t wm_rate; }; +#if PRIVATE +/* + * Flags for I/O monitor control. + */ +#define IOMON_ENABLE 0x01 +#define IOMON_DISABLE 0x02 + +#endif /* PRIVATE */ + + /* I/O type */ #define IOPOL_TYPE_DISK 0 #if PRIVATE diff --git a/bsd/sys/signal.h b/bsd/sys/signal.h index 2d0fc43e8..2483e8db3 100644 --- a/bsd/sys/signal.h +++ b/bsd/sys/signal.h @@ -573,7 +573,7 @@ struct sigstack { sigmask(SIGSEGV)|sigmask(SIGSYS)|\ sigmask(SIGPIPE)|sigmask(SIGKILL)) -#define workq_threadmask (threadmask | sigcantmask) +#define workq_threadmask (threadmask | sigcantmask | sigmask(SIGPROF)) /* * Signals carried across exec. diff --git a/bsd/sys/signalvar.h b/bsd/sys/signalvar.h index 6d8488807..f427f7215 100644 --- a/bsd/sys/signalvar.h +++ b/bsd/sys/signalvar.h @@ -145,7 +145,7 @@ struct sigacts { #define SA_CANTMASK 0x40 /* non-maskable, catchable */ #ifdef SIGPROP -int sigprop[NSIG + 1] = { +int sigprop[NSIG] = { 0, /* unused */ SA_KILL, /* SIGHUP */ SA_KILL, /* SIGINT */ @@ -211,6 +211,7 @@ void pt_setrunnable(struct proc *p); int hassigprop(int sig, int prop); int setsigvec(proc_t, thread_t, int signum, struct __kern_sigaction *, boolean_t in_sigstart); +struct os_reason; /* * Machine-dependent functions: */ @@ -218,14 +219,20 @@ void sendsig(struct proc *, /*sig_t*/ user_addr_t action, int sig, int returnmask, uint32_t code); void psignal(struct proc *p, int sig); +void psignal_with_reason(struct proc *p, int sig, struct os_reason *signal_reason); void psignal_locked(struct proc *, int); +void psignal_try_thread(proc_t, thread_t, int signum); +void psignal_try_thread_with_reason(proc_t, thread_t, int, struct os_reason*); +void psignal_uthread(thread_t, int); void pgsignal(struct pgrp *pgrp, int sig, int checkctty); void tty_pgsignal(struct tty * tp, int sig, int checkctty); void threadsignal(thread_t sig_actthread, int signum, - mach_exception_code_t code); + mach_exception_code_t code, boolean_t set_exitreason); int thread_issignal(proc_t p, 
thread_t th, sigset_t mask); void psignal_vfork(struct proc *p, task_t new_task, thread_t thread, int signum); +void psignal_vfork_with_reason(proc_t p, task_t new_task, thread_t thread, + int signum, struct os_reason *signal_reason); void signal_setast(thread_t sig_actthread); void pgsigio(pid_t pgid, int signalnum); @@ -243,6 +250,7 @@ int sig_try_locked(struct proc *p); #define COREDUMP_FULLFSYNC 0x0002 /* Run F_FULLFSYNC on the core file's vnode */ int coredump(struct proc *p, uint32_t reserve_mb, int coredump_flags); +void set_thread_exit_reason(void *th, void *reason, boolean_t proc_locked); #endif /* XNU_KERNEL_PRIVATE */ diff --git a/bsd/sys/snapshot.h b/bsd/sys/snapshot.h new file mode 100644 index 000000000..c18cd8510 --- /dev/null +++ b/bsd/sys/snapshot.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2016 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
+ * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _SYS_SNAPSHOT_H_ +#define _SYS_SNAPSHOT_H_ + +#ifndef KERNEL + +#include +#include +#include +#include <_types/_uint32_t.h> +#include +#include + +__BEGIN_DECLS + +int fs_snapshot_create(int, const char *, uint32_t) __OSX_AVAILABLE(10.12) __IOS_AVAILABLE(10.0) __TVOS_AVAILABLE(10.0) __WATCHOS_AVAILABLE(3.0); + +int fs_snapshot_list(int, struct attrlist *, void *, size_t, uint32_t) __OSX_AVAILABLE(10.12) __IOS_AVAILABLE(10.0) __TVOS_AVAILABLE(10.0) __WATCHOS_AVAILABLE(3.0); + +int fs_snapshot_delete(int, const char *, uint32_t) __OSX_AVAILABLE(10.12) __IOS_AVAILABLE(10.0) __TVOS_AVAILABLE(10.0) __WATCHOS_AVAILABLE(3.0); + +int fs_snapshot_rename(int, const char *, const char *, uint32_t) __OSX_AVAILABLE(10.12) __IOS_AVAILABLE(10.0) __TVOS_AVAILABLE(10.0) __WATCHOS_AVAILABLE(3.0); + +int fs_snapshot_mount(int, const char *, const char *, uint32_t) __OSX_AVAILABLE(10.12) __IOS_AVAILABLE(10.0) __TVOS_AVAILABLE(10.0) __WATCHOS_AVAILABLE(3.0); + +int fs_snapshot_revert(int, const char *, uint32_t) __OSX_AVAILABLE(10.12) __IOS_AVAILABLE(10.0) __TVOS_AVAILABLE(10.0) __WATCHOS_AVAILABLE(3.0); + +__END_DECLS + +#endif /* !KERNEL */ + +#ifdef PRIVATE + +#define SNAPSHOT_OP_CREATE 0x01 +#define SNAPSHOT_OP_DELETE 0x02 +#define SNAPSHOT_OP_RENAME 0x03 +#define SNAPSHOT_OP_MOUNT 0x04 +#define SNAPSHOT_OP_REVERT 0x05 + +#endif + +#endif /* !_SYS_SNAPSHOT_H_ */ diff --git a/bsd/sys/socket.h b/bsd/sys/socket.h index 8c577ce7c..e290f0a55 100644 --- a/bsd/sys/socket.h +++ b/bsd/sys/socket.h @@ -1,8 +1,8 @@ /* - * Copyright (c) 2000-2015 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* Copyright (c) 1998, 1999 Apple Computer, Inc. 
All Rights Reserved */ @@ -75,6 +75,7 @@ #include #include #include +#include #ifdef PRIVATE #include @@ -101,7 +102,7 @@ /* XXX Not explicitly defined by POSIX, but function return types are */ #include - + /* XXX Not explicitly defined by POSIX, but function return types are */ #include @@ -110,26 +111,6 @@ */ #include -#ifdef PRIVATE -#define SO_TCDBG_PID 0x01 /* Set/get traffic class for PID */ -#define SO_TCDBG_PNAME 0x02 /* Set/get traffic class for processes of that name */ -#define SO_TCDBG_PURGE 0x04 /* Purge entries for unused PIDs */ -#define SO_TCDBG_FLUSH 0x08 /* Flush all entries */ -#define SO_TCDBG_COUNT 0x10 /* Get count of entries */ -#define SO_TCDBG_LIST 0x20 /* List entries */ -#define SO_TCDBG_DELETE 0x40 /* Delete a process entry */ -#define SO_TCDBG_TCFLUSH_PID 0x80 /* Flush traffic class for PID */ - -struct so_tcdbg { - u_int32_t so_tcdbg_cmd; - int32_t so_tcdbg_tclass; - u_int32_t so_tcdbg_count; - pid_t so_tcdbg_pid; - char so_tcdbg_pname[MAXCOMLEN + 1]; - int32_t so_tcdbg_opportunistic; /* -1: unspecified, 0: off, 1: on, other: errors */ -}; -#endif /* PRIVATE */ - /* * Types */ @@ -152,25 +133,26 @@ struct so_tcdbg { #define SO_BROADCAST 0x0020 /* permit sending of broadcast msgs */ #if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) #define SO_USELOOPBACK 0x0040 /* bypass hardware when possible */ -#define SO_LINGER 0x0080 /* linger on close if data present (in ticks) */ +#define SO_LINGER 0x0080 /* linger on close if data present (in ticks) */ #else -#define SO_LINGER 0x1080 /* linger on close if data present (in seconds) */ +#define SO_LINGER 0x1080 /* linger on close if data present (in seconds) */ #endif /* (!_POSIX_C_SOURCE || _DARWIN_C_SOURCE) */ #define SO_OOBINLINE 0x0100 /* leave received OOB data in line */ #if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) #define SO_REUSEPORT 0x0200 /* allow local address & port reuse */ #define SO_TIMESTAMP 0x0400 /* timestamp received dgram traffic */ -#define 
SO_TIMESTAMP_MONOTONIC 0x0800 /* Monotonically increasing timestamp on rcvd dgram */ +#define SO_TIMESTAMP_MONOTONIC 0x0800 /* Monotonically increasing timestamp on rcvd dgram */ #ifndef __APPLE__ #define SO_ACCEPTFILTER 0x1000 /* there is an accept filter */ #else -#define SO_DONTTRUNC 0x2000 /* APPLE: Retain unread data */ +#define SO_DONTTRUNC 0x2000 /* APPLE: Retain unread data */ /* (ATOMIC proto) */ -#define SO_WANTMORE 0x4000 /* APPLE: Give hint when more data ready */ -#define SO_WANTOOBFLAG 0x8000 /* APPLE: Want OOB in MSG_FLAG on receive */ +#define SO_WANTMORE 0x4000 /* APPLE: Give hint when more data ready */ +#define SO_WANTOOBFLAG 0x8000 /* APPLE: Want OOB in MSG_FLAG on receive */ #ifdef PRIVATE #define SO_NOWAKEFROMSLEEP 0x10000 /* Don't wake for traffic to this socket */ +#define SO_NOAPNFALLBK 0x20000 /* Don't attempt APN fallback for the socket */ #endif #endif /* (!__APPLE__) */ @@ -179,43 +161,43 @@ struct so_tcdbg { /* * Additional options, not kept in so_options. */ -#define SO_SNDBUF 0x1001 /* send buffer size */ -#define SO_RCVBUF 0x1002 /* receive buffer size */ -#define SO_SNDLOWAT 0x1003 /* send low-water mark */ -#define SO_RCVLOWAT 0x1004 /* receive low-water mark */ -#define SO_SNDTIMEO 0x1005 /* send timeout */ -#define SO_RCVTIMEO 0x1006 /* receive timeout */ +#define SO_SNDBUF 0x1001 /* send buffer size */ +#define SO_RCVBUF 0x1002 /* receive buffer size */ +#define SO_SNDLOWAT 0x1003 /* send low-water mark */ +#define SO_RCVLOWAT 0x1004 /* receive low-water mark */ +#define SO_SNDTIMEO 0x1005 /* send timeout */ +#define SO_RCVTIMEO 0x1006 /* receive timeout */ #define SO_ERROR 0x1007 /* get error status and clear */ #define SO_TYPE 0x1008 /* get socket type */ #if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) -/*efine SO_PRIVSTATE 0x1009 get/deny privileged state */ -#define SO_LABEL 0x1010 /* socket's MAC label */ -#define SO_PEERLABEL 0x1011 /* socket's peer MAC label */ +#define SO_LABEL 0x1010 /* socket's MAC label */ 
+#define SO_PEERLABEL 0x1011 /* socket's peer MAC label */ #ifdef __APPLE__ -#define SO_NREAD 0x1020 /* APPLE: get 1st-packet byte count */ -#define SO_NKE 0x1021 /* APPLE: Install socket-level NKE */ -#define SO_NOSIGPIPE 0x1022 /* APPLE: No SIGPIPE on EPIPE */ -#define SO_NOADDRERR 0x1023 /* APPLE: Returns EADDRNOTAVAIL when src is not available anymore */ -#define SO_NWRITE 0x1024 /* APPLE: Get number of bytes currently in send socket buffer */ -#define SO_REUSESHAREUID 0x1025 /* APPLE: Allow reuse of port/socket by different userids */ +#define SO_NREAD 0x1020 /* APPLE: get 1st-packet byte count */ +#define SO_NKE 0x1021 /* APPLE: Install socket-level NKE */ +#define SO_NOSIGPIPE 0x1022 /* APPLE: No SIGPIPE on EPIPE */ +#define SO_NOADDRERR 0x1023 /* APPLE: Returns EADDRNOTAVAIL when src is not available anymore */ +#define SO_NWRITE 0x1024 /* APPLE: Get number of bytes currently in send socket buffer */ +#define SO_REUSESHAREUID 0x1025 /* APPLE: Allow reuse of port/socket by different userids */ #ifdef __APPLE_API_PRIVATE -#define SO_NOTIFYCONFLICT 0x1026 /* APPLE: send notification if there is a bind on a port which is already in use */ +#define SO_NOTIFYCONFLICT 0x1026 /* APPLE: send notification if there is a bind on a port which is already in use */ #define SO_UPCALLCLOSEWAIT 0x1027 /* APPLE: block on close until an upcall returns */ #endif -#define SO_LINGER_SEC 0x1080 /* linger on close if data present (in seconds) */ +#define SO_LINGER_SEC 0x1080 /* linger on close if data present (in seconds) */ #ifdef PRIVATE #define SO_RESTRICTIONS 0x1081 /* APPLE: deny flag set */ #define SO_RESTRICT_DENY_IN 0x1 /* deny inbound (trapdoor) */ #define SO_RESTRICT_DENY_OUT 0x2 /* deny outbound (trapdoor) */ #define SO_RESTRICT_DENY_CELLULAR 0x4 /* deny use of cellular (trapdoor) */ -#define SO_RESTRICT_DENY_EXPENSIVE 0x8 /* deny use of expensive if (trapdoor)*/ +#define SO_RESTRICT_DENY_EXPENSIVE 0x8 /* deny use of expensive if (trapdoor) */ #endif /* PRIVATE */ 
-#define SO_RANDOMPORT 0x1082 /* APPLE: request local port randomization */ -#define SO_NP_EXTENSIONS 0x1083 /* To turn off some POSIX behavior */ +#define SO_RANDOMPORT 0x1082 /* APPLE: request local port randomization */ +#define SO_NP_EXTENSIONS 0x1083 /* To turn off some POSIX behavior */ #endif #ifdef PRIVATE #define SO_EXECPATH 0x1085 /* Application Firewall Socket option */ + /* * Traffic service class definitions (lowest to highest): * @@ -272,38 +254,54 @@ struct so_tcdbg { * certain types of locally-originated ICMP, ICMPv6; IGMP/MLD join/leave, * ARP. */ -#define SO_TRAFFIC_CLASS 0x1086 /* Traffic service class (int) */ -#define SO_TC_BK_SYS 100 /* lowest class */ -#define SO_TC_BK 200 -#define SO_TC_BE 0 -#define SO_TC_RD 300 -#define SO_TC_OAM 400 -#define SO_TC_AV 500 -#define SO_TC_RV 600 -#define SO_TC_VI 700 -#define SO_TC_VO 800 -#define SO_TC_CTL 900 /* highest class */ -#define SO_TC_MAX 10 /* Total # of traffic classes */ +#define SO_TRAFFIC_CLASS 0x1086 /* Traffic service class (int) */ +#define SO_TC_BK_SYS 100 /* lowest class */ +#define SO_TC_BK 200 +#define SO_TC_BE 0 +#define SO_TC_RD 300 +#define SO_TC_OAM 400 +#define SO_TC_AV 500 +#define SO_TC_RV 600 +#define SO_TC_VI 700 +#define SO_TC_VO 800 +#define SO_TC_CTL 900 /* highest class */ +#define SO_TC_MAX 10 /* Total # of traffic classes */ #ifdef XNU_KERNEL_PRIVATE -#define _SO_TC_BK 1 /* deprecated */ -#define _SO_TC_VI 2 /* deprecated */ -#define _SO_TC_VO 3 /* deprecated */ -#define _SO_TC_MAX 4 /* deprecated */ +#define _SO_TC_BK 1 /* deprecated */ +#define _SO_TC_VI 2 /* deprecated */ +#define _SO_TC_VO 3 /* deprecated */ +#define _SO_TC_MAX 4 /* deprecated */ #define SO_VALID_TC(c) \ (c == SO_TC_BK_SYS || c == SO_TC_BK || c == SO_TC_BE || \ c == SO_TC_RD || c == SO_TC_OAM || c == SO_TC_AV || \ - c == SO_TC_RV || c == SO_TC_VI || c == SO_TC_VO || c == SO_TC_CTL) + c == SO_TC_RV || c == SO_TC_VI || c == SO_TC_VO || \ + c == SO_TC_CTL) + +#define SO_TC_UNSPEC ((int)-1) /* Traffic 
class not specified */ + +#define SO_TC_SIG SO_TC_VI /* to be removed XXX */ + +#define SOTCIX_BK_SYS 0 +#define SOTCIX_BK 1 +#define SOTCIX_BE 2 +#define SOTCIX_RD 3 +#define SOTCIX_OAM 4 +#define SOTCIX_AV 5 +#define SOTCIX_RV 6 +#define SOTCIX_VI 7 +#define SOTCIX_VO 8 +#define SOTCIX_CTL 9 #endif /* XNU_KERNEL_PRIVATE */ -/* Background socket configuration flags */ -#define TRAFFIC_MGT_SO_BACKGROUND 0x0001 /* background socket */ -#define TRAFFIC_MGT_TCP_RECVBG 0x0002 /* Only TCP sockets, receiver throttling */ +/* Background socket configuration flags */ +#define TRAFFIC_MGT_SO_BACKGROUND 0x0001 /* background socket */ +#define TRAFFIC_MGT_TCP_RECVBG 0x0002 /* Only TCP sockets, receiver throttling */ -#define SO_RECV_TRAFFIC_CLASS 0x1087 /* Receive traffic class (bool)*/ -#define SO_TRAFFIC_CLASS_DBG 0x1088 /* Debug traffic class (struct so_tcdbg) */ -#define SO_TRAFFIC_CLASS_STATS 0x1089 /* Traffic class statistics */ -#define SO_PRIVILEGED_TRAFFIC_CLASS 0x1090 /* Privileged traffic class (bool) */ +#define SO_RECV_TRAFFIC_CLASS 0x1087 /* Receive traffic class (bool) */ +#define SO_TRAFFIC_CLASS_DBG 0x1088 /* Debug traffic class (struct so_tcdbg) */ +#define SO_TRAFFIC_CLASS_STATS 0x1089 /* Traffic class statistics */ +#define SO_PRIVILEGED_TRAFFIC_CLASS 0x1090 /* Privileged traffic class (bool) */ #define SO_DEFUNCTOK 0x1100 /* can be defunct'd */ #define SO_ISDEFUNCT 0x1101 /* get defunct status */ @@ -319,7 +317,7 @@ struct so_tcdbg { #define SO_RECV_ANYIF 0x1104 /* unrestricted inbound processing */ #define SO_TRAFFIC_MGT_BACKGROUND 0x1105 /* Background traffic management */ - + #define SO_FLOW_DIVERT_TOKEN 0x1106 /* flow divert token */ #define SO_DELEGATED 0x1107 /* set socket as delegate (pid_t) */ @@ -327,16 +325,139 @@ struct so_tcdbg { #define SO_NECP_ATTRIBUTES 0x1109 /* NECP socket attributes (domain, account, etc.) 
*/ #define SO_CFIL_SOCK_ID 0x1110 /* get content filter socket ID (cfil_sock_id_t) */ #if MPTCP -#define SO_MPTCP_FASTJOIN 0x1111 /* fast join MPTCP */ +#define SO_MPTCP_FASTJOIN 0x1111 /* fast join MPTCP */ #endif /* MPTCP */ #endif /* PRIVATE */ -#define SO_NUMRCVPKT 0x1112 /* number of datagrams in receive socket buffer */ +#define SO_NUMRCVPKT 0x1112 /* number of datagrams in receive socket buffer */ #ifdef PRIVATE #define SO_AWDL_UNRESTRICTED 0x1113 /* try to use AWDL in restricted mode */ -#define SO_EXTENDED_BK_IDLE 0x1114 /* extended time to keep socket idle after app is suspended (int) */ +#define SO_EXTENDED_BK_IDLE 0x1114 /* extended time to keep socket idle after app is suspended (int) */ #define SO_MARK_CELLFALLBACK 0x1115 /* Mark as initiated by cell fallback */ #endif /* PRIVATE */ +/* + * Network Service Type for option SO_NET_SERVICE_TYPE + * + * The vast majority of sockets should use Best Effort that is the default + * Network Service Type. Other Network Service Types have to be used only if + * the traffic actually matches the description of the Network Service Type. + * + * Network Service Types do not represent priorities but rather describe + * different categories of delay, jitter and loss parameters. + * Those parameters may influence protocols from layer 4 protocols like TCP + * to layer 2 protocols like Wi-Fi. The Network Service Type can determine + * how the traffic is queued and scheduled by the host networking stack and + * by other entities on the network like switches and routers. For example + * for Wi-Fi, the Network Service Type can select the marking of the + * layer 2 packet with the appropriate WMM Access Category. + * + * There is no point in attempting to game the system and use + * a Network Service Type that does not correspond to the actual + * traffic characteristic but one that seems to have a higher precedence. 
+ * The reason is that for service classes that have lower tolerance + * for delay and jitter, the queues size is lower than for service + * classes that are more tolerant to delay and jitter. + * + * For example using a voice service type for bulk data transfer will lead + * to disastrous results as soon as congestion happens because the voice + * queue overflows and packets get dropped. This is not only bad for the bulk + * data transfer but it is also bad for VoIP apps that legitimately are using + * the voice service type. + * + * The characteristics of the Network Service Types are based on the service + * classes defined in RFC 4594 "Configuration Guidelines for DiffServ Service + * Classes" + * + * When system detects the outgoing interface belongs to a DiffServ domain + * that follows the recommendation of the IETF draft "Guidelines for DiffServ to + * IEEE 802.11 Mapping", the packet will marked at layer 3 with a DSCP value + * that corresponds to Network Service Type. + * + * NET_SERVICE_TYPE_BE + * "Best Effort", unclassified/standard. This is the default service + * class and cover the majority of the traffic. + * + * NET_SERVICE_TYPE_BK + * "Background", high delay tolerant, loss tolerant. elastic flow, + * variable size & long-lived. E.g: non-interactive network bulk transfer + * like synching or backup. + * + * NET_SERVICE_TYPE_RD + * "Responsive Data", a notch higher than "Best Effort", medium delay + * tolerant, elastic & inelastic flow, bursty, long-lived. E.g. email, + * instant messaging, for which there is a sense of interactivity and + * urgency (user waiting for output). + * + * NET_SERVICE_TYPE_OAM + * "Operations, Administration, and Management", medium delay tolerant, + * low-medium loss tolerant, elastic & inelastic flows, variable size. + * E.g. VPN tunnels. 
+ * + * NET_SERVICE_TYPE_AV + * "Multimedia Audio/Video Streaming", medium delay tolerant, low-medium + * loss tolerant, elastic flow, constant packet interval, variable rate + * and size. E.g. video and audio playback with buffering. + * + * NET_SERVICE_TYPE_RV + * "Responsive Multimedia Audio/Video", low delay tolerant, low-medium + * loss tolerant, elastic flow, variable packet interval, rate and size. + * E.g. screen sharing. + * + * NET_SERVICE_TYPE_VI + * "Interactive Video", low delay tolerant, low-medium loss tolerant, + * elastic flow, constant packet interval, variable rate & size. E.g. + * video telephony. + * + * NET_SERVICE_TYPE_SIG + * "Signaling", low delay tolerant, low loss tolerant, inelastic flow, + * jitter tolerant, rate is bursty but short, variable size. E.g. SIP. + * + * NET_SERVICE_TYPE_VO + * "Interactive Voice", very low delay tolerant, very low loss tolerant, + * inelastic flow, constant packet rate, somewhat fixed size. + * E.g. VoIP. + */ +#define SO_NET_SERVICE_TYPE 0x1116 /* Network service type */ + +#define NET_SERVICE_TYPE_BE 0 /* Best effort */ +#define NET_SERVICE_TYPE_BK 1 /* Background system initiated */ +#define NET_SERVICE_TYPE_SIG 2 /* Signaling */ +#define NET_SERVICE_TYPE_VI 3 /* Interactive Video */ +#define NET_SERVICE_TYPE_VO 4 /* Interactive Voice */ +#define NET_SERVICE_TYPE_RV 5 /* Responsive Multimedia Audio/Video */ +#define NET_SERVICE_TYPE_AV 6 /* Multimedia Audio/Video Streaming */ +#define NET_SERVICE_TYPE_OAM 7 /* Operations, Administration, and Management */ +#define NET_SERVICE_TYPE_RD 8 /* Responsive Data */ + +#if PRIVATE +#define SO_QOSMARKING_POLICY_OVERRIDE 0x1117 /* int */ +#define SO_INTCOPROC_ALLOW 0x1118 /* Try to use internal co-processor interfaces. 
*/ + +#define _NET_SERVICE_TYPE_COUNT 9 +#define _NET_SERVICE_TYPE_UNSPEC ((int)-1) + +#define IS_VALID_NET_SERVICE_TYPE(c) \ + (c >= NET_SERVICE_TYPE_BE && c <= NET_SERVICE_TYPE_RD) + +extern const int sotc_by_netservicetype[_NET_SERVICE_TYPE_COUNT]; + +/* + * Facility to pass Network Service Type values using SO_TRAFFIC_CLASS + * Mostly useful to simplify implementation of frameworks to adopt the new + * Network Service Type values for Signaling. + */ +#define SO_TC_NET_SERVICE_OFFSET 10000 +#define SO_TC_NETSVC_SIG (SO_TC_NET_SERVICE_OFFSET + NET_SERVICE_TYPE_SIG) +#endif /* PRIVATE */ + +#define SO_NETSVC_MARKING_LEVEL 0x1119 /* Get QoS marking in effect for socket */ + +#define NETSVC_MRKNG_UNKNOWN 0 /* The outgoing network interface is not known */ +#define NETSVC_MRKNG_LVL_L2 1 /* Default marking at layer 2 (for example Wi-Fi WMM) */ +#define NETSVC_MRKNG_LVL_L3L2_ALL 2 /* Layer 3 DSCP marking and layer 2 marking for all Network Service Types */ +#define NETSVC_MRKNG_LVL_L3L2_BK 3 /* The system policy limits layer 3 DSCP marking and layer 2 marking + * to background Network Service Types */ + typedef __uint32_t sae_associd_t; #define SAE_ASSOCID_ANY 0 #define SAE_ASSOCID_ALL ((sae_associd_t)(-1ULL)) @@ -346,16 +467,16 @@ typedef __uint32_t sae_connid_t; #define SAE_CONNID_ALL ((sae_connid_t)(-1ULL)) /* connectx() flag parameters */ -#define CONNECT_RESUME_ON_READ_WRITE 0x1 /* resume connect() on read/write */ -#define CONNECT_DATA_IDEMPOTENT 0x2 /* data is idempotent */ +#define CONNECT_RESUME_ON_READ_WRITE 0x1 /* resume connect() on read/write */ +#define CONNECT_DATA_IDEMPOTENT 0x2 /* data is idempotent */ /* sockaddr endpoints */ typedef struct sa_endpoints { - unsigned int sae_srcif; /* optional source interface */ - struct sockaddr *sae_srcaddr; /* optional source address */ - socklen_t sae_srcaddrlen; /* size of source address */ - struct sockaddr *sae_dstaddr; /* destination address */ - socklen_t sae_dstaddrlen; /* size of destination address */ + 
unsigned int sae_srcif; /* optional source interface */ + const struct sockaddr *sae_srcaddr; /* optional source address */ + socklen_t sae_srcaddrlen; /* size of source address */ + const struct sockaddr *sae_dstaddr; /* destination address */ + socklen_t sae_dstaddrlen; /* size of destination address */ } sa_endpoints_t; #endif /* (!_POSIX_C_SOURCE || _DARWIN_C_SOURCE) */ @@ -378,20 +499,20 @@ struct accept_filter_arg { #ifdef __APPLE__ /* - * Structure to control non-portable Sockets extension to POSIX + * Structure to control non-portable Sockets extension to POSIX */ struct so_np_extensions { u_int32_t npx_flags; u_int32_t npx_mask; }; -#define SONPX_SETOPTSHUT 0x000000001 /* flag for allowing setsockopt after shutdown */ +#define SONPX_SETOPTSHUT 0x000000001 /* flag for allowing setsockopt after shutdown */ #ifdef KERNEL_PRIVATE -#define SONPX_MASK_VALID (SONPX_SETOPTSHUT) -#define IS_SO_TC_BACKGROUND(_tc_) ((_tc_) == SO_TC_BK || (_tc_) == SO_TC_BK_SYS) -#define IS_SO_TC_BACKGROUNDSYSTEM(_tc_) ((_tc_) == SO_TC_BK_SYS) +#define SONPX_MASK_VALID (SONPX_SETOPTSHUT) +#define IS_SO_TC_BACKGROUND(_tc_) ((_tc_) == SO_TC_BK || (_tc_) == SO_TC_BK_SYS) +#define IS_SO_TC_BACKGROUNDSYSTEM(_tc_) ((_tc_) == SO_TC_BK_SYS) #endif /* KERNEL_PRIVATE */ #endif @@ -423,9 +544,9 @@ struct so_np_extensions { #define AF_DATAKIT 9 /* datakit protocols */ #define AF_CCITT 10 /* CCITT protocols, X.25 etc */ #define AF_SNA 11 /* IBM SNA */ -#define AF_DECnet 12 /* DECnet */ -#define AF_DLI 13 /* DEC Direct data link interface */ -#define AF_LAT 14 /* LAT */ +#define AF_DECnet 12 /* DECnet */ +#define AF_DLI 13 /* DEC Direct data link interface */ +#define AF_LAT 14 /* LAT */ #define AF_HYLINK 15 /* NSC Hyperchannel */ #define AF_APPLETALK 16 /* Apple Talk */ #define AF_ROUTE 17 /* Internal Routing Protocol */ @@ -433,31 +554,30 @@ struct so_np_extensions { #define pseudo_AF_XTP 19 /* eXpress Transfer Protocol (no AF) */ #define AF_COIP 20 /* connection-oriented IP, aka ST II */ #define 
AF_CNT 21 /* Computer Network Technology */ -#define pseudo_AF_RTIP 22 /* Help Identify RTIP packets */ +#define pseudo_AF_RTIP 22 /* Help Identify RTIP packets */ #define AF_IPX 23 /* Novell Internet Protocol */ #define AF_SIP 24 /* Simple Internet Protocol */ -#define pseudo_AF_PIP 25 /* Help Identify PIP packets */ -/*define pseudo_AF_BLUE 26 Identify packets for Blue Box - Not used */ -#define AF_NDRV 27 /* Network Driver 'raw' access */ -#define AF_ISDN 28 /* Integrated Services Digital Network*/ +#define pseudo_AF_PIP 25 /* Help Identify PIP packets */ +#define AF_NDRV 27 /* Network Driver 'raw' access */ +#define AF_ISDN 28 /* Integrated Services Digital Network */ #define AF_E164 AF_ISDN /* CCITT E.164 recommendation */ #define pseudo_AF_KEY 29 /* Internal key-management function */ #endif /* (!_POSIX_C_SOURCE || _DARWIN_C_SOURCE) */ #define AF_INET6 30 /* IPv6 */ #if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) #define AF_NATM 31 /* native ATM access */ -#define AF_SYSTEM 32 /* Kernel event messages */ -#define AF_NETBIOS 33 /* NetBIOS */ -#define AF_PPP 34 /* PPP communication protocol */ -#define pseudo_AF_HDRCMPLT 35 /* Used by BPF to not rewrite headers - * in interface output routine */ +#define AF_SYSTEM 32 /* Kernel event messages */ +#define AF_NETBIOS 33 /* NetBIOS */ +#define AF_PPP 34 /* PPP communication protocol */ +#define pseudo_AF_HDRCMPLT 35 /* Used by BPF to not rewrite headers + in interface output routine */ #ifdef PRIVATE -#define AF_AFP 36 /* Used by AFP */ +#define AF_AFP 36 /* Used by AFP */ #else -#define AF_RESERVED_36 36 /* Reserved for internal usage */ +#define AF_RESERVED_36 36 /* Reserved for internal usage */ #endif -#define AF_IEEE80211 37 /* IEEE 802.11 protocol */ -#define AF_UTUN 38 +#define AF_IEEE80211 37 /* IEEE 802.11 protocol */ +#define AF_UTUN 38 #ifdef PRIVATE #define AF_MULTIPATH 39 #endif /* PRIVATE */ @@ -484,7 +604,7 @@ struct sockproto { __uint16_t sp_family; /* address family */ __uint16_t 
sp_protocol; /* protocol */ }; -#endif /* (!_POSIX_C_SOURCE || _DARWIN_C_SOURCE)*/ +#endif /* (!_POSIX_C_SOURCE || _DARWIN_C_SOURCE) */ /* * RFC 2553: protocol-independent placeholder for socket addresses @@ -543,9 +663,9 @@ struct sockaddr_list { #define PF_DATAKIT AF_DATAKIT #define PF_CCITT AF_CCITT #define PF_SNA AF_SNA -#define PF_DECnet AF_DECnet -#define PF_DLI AF_DLI -#define PF_LAT AF_LAT +#define PF_DECnet AF_DECnet +#define PF_DLI AF_DLI +#define PF_LAT AF_LAT #define PF_HYLINK AF_HYLINK #define PF_APPLETALK AF_APPLETALK #define PF_ROUTE AF_ROUTE @@ -555,22 +675,22 @@ struct sockaddr_list { #define PF_CNT AF_CNT #define PF_SIP AF_SIP #define PF_IPX AF_IPX /* same format as AF_NS */ -#define PF_RTIP pseudo_AF_RTIP /* same format as AF_INET */ -#define PF_PIP pseudo_AF_PIP -#define PF_NDRV AF_NDRV +#define PF_RTIP pseudo_AF_RTIP /* same format as AF_INET */ +#define PF_PIP pseudo_AF_PIP +#define PF_NDRV AF_NDRV #define PF_ISDN AF_ISDN #define PF_KEY pseudo_AF_KEY #define PF_INET6 AF_INET6 #define PF_NATM AF_NATM -#define PF_SYSTEM AF_SYSTEM -#define PF_NETBIOS AF_NETBIOS -#define PF_PPP AF_PPP +#define PF_SYSTEM AF_SYSTEM +#define PF_NETBIOS AF_NETBIOS +#define PF_PPP AF_PPP #ifdef PRIVATE -#define PF_AFP AF_AFP +#define PF_AFP AF_AFP #else -#define PF_RESERVED_36 AF_RESERVED_36 +#define PF_RESERVED_36 AF_RESERVED_36 #endif -#define PF_UTUN AF_UTUN +#define PF_UTUN AF_UTUN #ifdef PRIVATE #define PF_MULTIPATH AF_MULTIPATH #endif /* PRIVATE */ @@ -580,9 +700,9 @@ struct sockaddr_list { * These do not have socket-layer support: */ #define PF_VLAN ((uint32_t)0x766c616e) /* 'vlan' */ -#define PF_BOND ((uint32_t)0x626f6e64) /* 'bond' */ +#define PF_BOND ((uint32_t)0x626f6e64) /* 'bond' */ #ifdef KERNEL_PRIVATE -#define PF_BRIDGE ((uint32_t)0x62726467) /* 'brdg' */ +#define PF_BRIDGE ((uint32_t)0x62726467) /* 'brdg' */ #endif /* KERNEL_PRIVATE */ /* @@ -594,11 +714,11 @@ struct sockaddr_list { * Further levels are defined by the individual families below. 
*/ #if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) -#define NET_MAXID AF_MAX +#define NET_MAXID AF_MAX #endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ #ifdef KERNEL_PRIVATE -#define CTL_NET_NAMES { \ +#define CTL_NET_NAMES { \ { 0, 0 }, \ { "local", CTLTYPE_NODE }, \ { "inet", CTLTYPE_NODE }, \ @@ -647,13 +767,13 @@ struct sockaddr_list { * Fifth: type of info, defined below * Sixth: flag(s) to mask with for NET_RT_FLAGS */ -#define NET_RT_DUMP 1 /* dump; may limit to a.f. */ -#define NET_RT_FLAGS 2 /* by flags, e.g. RESOLVING */ -#define NET_RT_IFLIST 3 /* survey interface list */ -#define NET_RT_STAT 4 /* routing statistics */ -#define NET_RT_TRASH 5 /* routes not in table but not freed */ -#define NET_RT_IFLIST2 6 /* interface list with addresses */ -#define NET_RT_DUMP2 7 /* dump; may limit to a.f. */ +#define NET_RT_DUMP 1 /* dump; may limit to a.f. */ +#define NET_RT_FLAGS 2 /* by flags, e.g. RESOLVING */ +#define NET_RT_IFLIST 3 /* survey interface list */ +#define NET_RT_STAT 4 /* routing statistics */ +#define NET_RT_TRASH 5 /* routes not in table but not freed */ +#define NET_RT_IFLIST2 6 /* interface list with addresses */ +#define NET_RT_DUMP2 7 /* dump; may limit to a.f. 
*/ #ifdef PRIVATE #define NET_RT_DUMPX 8 /* private */ #define NET_RT_DUMPX_FLAGS 9 /* private */ @@ -662,7 +782,7 @@ struct sockaddr_list { #endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ #ifdef KERNEL_PRIVATE -#define CTL_NET_RT_NAMES { \ +#define CTL_NET_RT_NAMES { \ { 0, 0 }, \ { "dump", CTLTYPE_STRUCT }, \ { "flags", CTLTYPE_STRUCT }, \ @@ -670,9 +790,9 @@ struct sockaddr_list { { "stat", CTLTYPE_STRUCT }, \ { "trash", CTLTYPE_INT }, \ { "iflist2", CTLTYPE_STRUCT }, \ - { "dump2", CTLTYPE_STRUCT }, \ - { "dumpx", CTLTYPE_STRUCT }, \ - { "dumpx_flags", CTLTYPE_STRUCT }, \ + { "dump2", CTLTYPE_STRUCT }, \ + { "dumpx", CTLTYPE_STRUCT }, \ + { "dumpx_flags", CTLTYPE_STRUCT }, \ } #endif /* KERNEL_PRIVATE */ @@ -703,7 +823,7 @@ struct msghdr { * For recvmsg_x(), the size of the data received is given by the field * msg_datalen. * - * For sendmsg_x(), the size of the data to send is given by the length of + * For sendmsg_x(), the size of the data to send is given by the length of * the iovec array -- like sendmsg(). The field msg_datalen is ignored. */ struct msghdr_x { @@ -869,27 +989,27 @@ struct user32_sa_endpoints { #ifdef __APPLE__ #ifndef PRIVATE #ifdef __APPLE_API_OBSOLETE -#define MSG_WAITSTREAM 0x200 /* wait up to full request.. may return partial */ +#define MSG_WAITSTREAM 0x200 /* wait up to full request.. may return partial */ #endif #else -#define MSG_WAITSTREAM 0x200 /* wait up to full request.. may return partial */ +#define MSG_WAITSTREAM 0x200 /* wait up to full request.. 
may return partial */ #endif -#define MSG_FLUSH 0x400 /* Start of 'hold' seq; dump so_temp */ -#define MSG_HOLD 0x800 /* Hold frag in so_temp */ -#define MSG_SEND 0x1000 /* Send the packet in so_temp */ -#define MSG_HAVEMORE 0x2000 /* Data ready to be read */ -#define MSG_RCVMORE 0x4000 /* Data remains in current pkt */ +#define MSG_FLUSH 0x400 /* Start of 'hold' seq; dump so_temp */ +#define MSG_HOLD 0x800 /* Hold frag in so_temp */ +#define MSG_SEND 0x1000 /* Send the packet in so_temp */ +#define MSG_HAVEMORE 0x2000 /* Data ready to be read */ +#define MSG_RCVMORE 0x4000 /* Data remains in current pkt */ #endif #ifdef KERNEL_PRIVATE -#define MSG_COMPAT 0x8000 /* deprecated */ +#define MSG_COMPAT 0x8000 /* deprecated */ #endif /* KERNEL_PRIVATE */ -#define MSG_NEEDSA 0x10000 /* Fail receive if socket address cannot be allocated */ +#define MSG_NEEDSA 0x10000 /* Fail receive if socket address cannot be allocated */ #ifdef KERNEL_PRIVATE -#define MSG_NBIO 0x20000 /* FIONBIO mode, used by fifofs */ -#define MSG_SKIPCFIL 0x40000 /* skip pass content filter */ +#define MSG_NBIO 0x20000 /* FIONBIO mode, used by fifofs */ +#define MSG_SKIPCFIL 0x40000 /* skip pass content filter */ #endif #ifdef KERNEL -#define MSG_USEUPCALL 0x80000000 /* Inherit upcall in sock_accept */ +#define MSG_USEUPCALL 0x80000000 /* Inherit upcall in sock_accept */ #endif #endif /* (!_POSIX_C_SOURCE || _DARWIN_C_SOURCE) */ @@ -912,8 +1032,8 @@ struct cmsghdr { * While we may have more groups than this, the cmsgcred struct must * be able to fit in an mbuf, and NGROUPS_MAX is too large to allow * this. 
-*/ -#define CMGROUP_MAX 16 + */ +#define CMGROUP_MAX 16 /* * Credentials structure, used to verify the identity of a peer @@ -935,32 +1055,32 @@ struct cmsgcred { /* given pointer to struct cmsghdr, return pointer to data */ #define CMSG_DATA(cmsg) ((unsigned char *)(cmsg) + \ - __DARWIN_ALIGN32(sizeof(struct cmsghdr))) + __DARWIN_ALIGN32(sizeof(struct cmsghdr))) /* * RFC 2292 requires to check msg_controllen, in case that the kernel returns * an empty list for some reasons. */ -#define CMSG_FIRSTHDR(mhdr) \ - ((mhdr)->msg_controllen >= sizeof(struct cmsghdr) ? \ - (struct cmsghdr *)(mhdr)->msg_control : \ - (struct cmsghdr *)0L) +#define CMSG_FIRSTHDR(mhdr) \ + ((mhdr)->msg_controllen >= sizeof(struct cmsghdr) ? \ + (struct cmsghdr *)(mhdr)->msg_control : \ + (struct cmsghdr *)0L) -/* +/* * Given pointer to struct cmsghdr, return pointer to next cmsghdr * RFC 2292 says that CMSG_NXTHDR(mhdr, NULL) is equivalent to CMSG_FIRSTHDR(mhdr) */ #define CMSG_NXTHDR(mhdr, cmsg) \ ((char *)(cmsg) == (char *)0L ? CMSG_FIRSTHDR(mhdr) : \ - ((((unsigned char *)(cmsg) + \ + ((((unsigned char *)(cmsg) + \ __DARWIN_ALIGN32((__uint32_t)(cmsg)->cmsg_len) + \ __DARWIN_ALIGN32(sizeof(struct cmsghdr))) > \ ((unsigned char *)(mhdr)->msg_control + \ - (mhdr)->msg_controllen)) ? \ - (struct cmsghdr *)0L /* NULL */ : \ - (struct cmsghdr *)(void *)((unsigned char *)(cmsg) + \ - __DARWIN_ALIGN32((__uint32_t)(cmsg)->cmsg_len)))) + (mhdr)->msg_controllen)) ? 
\ + (struct cmsghdr *)0L /* NULL */ : \ + (struct cmsghdr *)(void *)((unsigned char *)(cmsg) + \ + __DARWIN_ALIGN32((__uint32_t)(cmsg)->cmsg_len)))) #if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) /* RFC 2292 additions */ @@ -977,11 +1097,11 @@ struct cmsgcred { #if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) #define SCM_TIMESTAMP 0x02 /* timestamp (struct timeval) */ #define SCM_CREDS 0x03 /* process creds (struct cmsgcred) */ -#define SCM_TIMESTAMP_MONOTONIC 0x04 /* timestamp (uint64_t) */ +#define SCM_TIMESTAMP_MONOTONIC 0x04 /* timestamp (uint64_t) */ #ifdef PRIVATE -#define SCM_SEQNUM 0x05 /* TCP unordered recv seq no */ -#define SCM_MSG_PRIORITY 0x06 /* TCP unordered snd priority */ +#define SCM_SEQNUM 0x05 /* TCP unordered recv seq no */ +#define SCM_MSG_PRIORITY 0x06 /* TCP unordered snd priority */ #endif /* PRIVATE */ #ifdef KERNEL_PRIVATE @@ -1162,7 +1282,7 @@ struct so_cinforeq64 { #define CIF_MP_CAPABLE 0x100 /* supports multipath protocol */ #define CIF_MP_READY 0x200 /* multipath protocol confirmed */ #define CIF_MP_DEGRADED 0x400 /* has lost its multipath capability */ -#define CIF_MP_ACTIVE 0x800 /* this is the active subflow */ +#define CIF_MP_ACTIVE 0x800 /* this is the active subflow */ /* valid connection info auxiliary data types */ #define CIAUX_TCP 0x1 /* TCP auxiliary data (conninfo_tcp_t) */ @@ -1175,13 +1295,6 @@ struct so_cordreq { __uint32_t sco_rank; /* rank (0 means unspecified) */ }; -/* - * Network policy subclass (of KEV_NETWORK_CLASS) - */ -#define KEV_NETPOLICY_SUBCLASS 3 - -#define KEV_NETPOLICY_IFDENIED 1 /* denied access to interface */ - /* * Common structure for KEV_NETPOLICY_SUBCLASS */ @@ -1199,18 +1312,9 @@ struct netpolicy_event_data { */ struct kev_netpolicy_ifdenied { struct netpolicy_event_data ev_data; + __uint32_t ev_if_functional_type; }; -/* - * Socket subclass (of KEV_NETWORK_CLASS) - */ -#define KEV_SOCKET_SUBCLASS 4 - -/* - * Events for KEV_SOCKET_SUBCLASS of KEV_NETWORK_CLASS - */ 
-#define KEV_SOCKET_CLOSED 1 /* completely closed by protocol */ - /* * Common structure for KEV_SOCKET_SUBCLASS */ @@ -1223,6 +1327,14 @@ struct kev_socket_closed { struct kev_socket_event_data ev_data; }; +/* + * Network Service Type to DiffServ Code Point mapping + */ +struct netsvctype_dscp_map { + int netsvctype; + u_int8_t dscp; /* 6 bits diffserv code point */ +}; + #ifndef KERNEL __BEGIN_DECLS @@ -1235,11 +1347,11 @@ extern int socket_delegate(int, int, int, pid_t); * * recvmsg_x() can be used only with protocols handlers that have been specially * modified to support sending and receiving several datagrams at once. - * + * * The size of the array "msgp" is given by the argument "cnt". * * The "flags" arguments supports only the value MSG_DONTWAIT. - * + * * Each member of "msgp" array is of type "struct msghdr_x". * * The "msg_iov" and "msg_iovlen" are input parameters that describe where to @@ -1255,7 +1367,7 @@ extern int socket_delegate(int, int, int, pid_t); * the low water mark and the amount of data pending in the socket buffer. * * recvmsg_x() returns the number of datagrams that have been received, - * or -1 if an error occurred. + * or -1 if an error occurred. * * NOTE: This a private system call, the API is subject to change. */ @@ -1267,11 +1379,11 @@ ssize_t recvmsg_x(int s, const struct msghdr_x *msgp, u_int cnt, int flags); * * sendmsg_x() can be used only with protocols handlers that have been specially * modified to support sending and receiving several datagrams at once. - * + * * The size of the array "msgp" is given by the argument "cnt". * * The "flags" arguments supports only the value MSG_DONTWAIT. - * + * * Each member of "msgp" array is of type "struct msghdr_x". * * The "msg_iov" and "msg_iovlen" are input parameters that specify the @@ -1284,10 +1396,10 @@ ssize_t recvmsg_x(int s, const struct msghdr_x *msgp, u_int cnt, int flags); * must be set to zero on input: * "msg_name", "msg_namelen", "msg_control" and "msg_controllen". 
* - * The field "msg_flags" and "msg_datalen" must be set to zero on input. + * The field "msg_flags" and "msg_datalen" must be set to zero on input. * * sendmsg_x() returns the number of datagrams that have been sent, - * or -1 if an error occurred. + * or -1 if an error occurred. * * NOTE: This a private system call, the API is subject to change. */ @@ -1303,7 +1415,7 @@ __BEGIN_DECLS int accept(int, struct sockaddr * __restrict, socklen_t * __restrict) __DARWIN_ALIAS_C(accept); int bind(int, const struct sockaddr *, socklen_t) __DARWIN_ALIAS(bind); -int connect(int, const struct sockaddr *, socklen_t) __DARWIN_ALIAS_C( connect); +int connect(int, const struct sockaddr *, socklen_t) __DARWIN_ALIAS_C(connect); int getpeername(int, struct sockaddr * __restrict, socklen_t * __restrict) __DARWIN_ALIAS(getpeername); int getsockname(int, struct sockaddr * __restrict, socklen_t * __restrict) @@ -1330,9 +1442,9 @@ int sendfile(int, int, off_t, off_t *, struct sf_hdtr *, int); #if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) void pfctlinput(int, struct sockaddr *); -int connectx(int , const sa_endpoints_t *, sae_associd_t, unsigned int, +int connectx(int, const sa_endpoints_t *, sae_associd_t, unsigned int, const struct iovec *, unsigned int, size_t *, sae_connid_t *); -int disconnectx(int , sae_associd_t, sae_connid_t); +int disconnectx(int, sae_associd_t, sae_connid_t); #endif /* (!_POSIX_C_SOURCE || _DARWIN_C_SOURCE) */ __END_DECLS #endif /* !KERNEL */ diff --git a/bsd/sys/socketvar.h b/bsd/sys/socketvar.h index f83ffc529..e3d1dfdf3 100644 --- a/bsd/sys/socketvar.h +++ b/bsd/sys/socketvar.h @@ -2,7 +2,7 @@ * Copyright (c) 2000-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). 
You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,12 +22,12 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* Copyright (c) 1998, 1999 Apple Computer, Inc. All Rights Reserved */ /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ -/*- +/* * Copyright (c) 1982, 1986, 1990, 1993 * The Regents of the University of California. All rights reserved. 
* @@ -102,23 +102,23 @@ extern char netio[], netcon[], netcls[]; #endif /* KERNEL_PRIVATE */ #ifdef PRIVATE -#define SO_TC_STATS_MAX 4 +#define SO_TC_STATS_MAX 4 struct data_stats { - u_int64_t rxpackets; - u_int64_t rxbytes; - u_int64_t txpackets; - u_int64_t txbytes; + u_int64_t rxpackets; + u_int64_t rxbytes; + u_int64_t txpackets; + u_int64_t txbytes; }; -#define MSG_PRI_0 0 /* TCP message priority, lowest */ -#define MSG_PRI_1 1 -#define MSG_PRI_2 2 -#define MSG_PRI_3 3 /* TCP message priority, highest */ -#define MSG_PRI_MAX MSG_PRI_3 -#define MSG_PRI_MIN MSG_PRI_0 -#define MSG_PRI_COUNT 4 -#define MSG_PRI_DEFAULT MSG_PRI_1 +#define MSG_PRI_0 0 /* TCP message priority, lowest */ +#define MSG_PRI_1 1 +#define MSG_PRI_2 2 +#define MSG_PRI_3 3 /* TCP message priority, highest */ +#define MSG_PRI_MAX MSG_PRI_3 +#define MSG_PRI_MIN MSG_PRI_0 +#define MSG_PRI_COUNT 4 +#define MSG_PRI_DEFAULT MSG_PRI_1 #endif /* PRIVATE */ #ifdef KERNEL_PRIVATE @@ -128,7 +128,7 @@ struct msg_priq { struct mbuf *msgq_tail; /* last mbuf in the queue */ struct mbuf *msgq_lastmsg; /* last message in the queue */ u_int32_t msgq_flags; /* flags per priority queue */ -#define MSGQ_MSG_NOTDONE 0x1 /* set when EOR of a msg is not seen */ +#define MSGQ_MSG_NOTDONE 0x1 /* set when EOR of a msg is not seen */ u_int32_t msgq_bytes; /* data bytes in this queue */ }; @@ -139,7 +139,7 @@ struct msg_state { }; /* mbuf flag used to indicate out of order data received */ -#define M_UNORDERED_DATA M_PROTO1 +#define M_UNORDERED_DATA M_PROTO1 /* * Kernel structure per socket. 
@@ -173,7 +173,7 @@ struct socket { TAILQ_ENTRY(socket) so_list; /* list of unaccepted connections */ short so_qlen; /* number of unaccepted connections */ short so_incqlen; /* number of unaccepted incomplete - connections */ + connections */ short so_qlimit; /* max number queued connections */ short so_timeo; /* connection timeout */ pid_t so_pgid; /* pgid for signals */ @@ -196,18 +196,18 @@ struct socket { struct timeval sb_timeo; /* timeout for read/write */ u_int32_t sb_flags; /* flags, see below */ u_int32_t sb_idealsize; /* Ideal size for the sb based - on bandwidth and delay */ + on bandwidth and delay */ void (*sb_upcall)(struct socket *, void *arg, int waitf); void *sb_upcallarg; /* Arg for above */ u_int32_t sb_wantlock; /* # of SB_LOCK waiters */ u_int32_t sb_waiters; /* # of data/space waiters */ thread_t sb_cfil_thread; /* content filter thread */ u_int32_t sb_cfil_refs; /* # of nested calls */ - u_int32_t sb_preconn_hiwat;/* preconnect hiwat mark */ + u_int32_t sb_preconn_hiwat; /* preconnect hiwat mark */ } so_rcv, so_snd; #define SB_MAX (8192*1024) /* default for max chars in sockbuf */ -#define LOW_SB_MAX (2*9*1024) /* lower limit on max socket buffer - size, 2 max datagrams */ +#define LOW_SB_MAX (2*9*1024) /* lower limit on max socket buffer + size, 2 max datagrams */ #define SB_LOCK 0x1 /* lock on data queue */ #define SB_NOINTR 0x2 /* operations not interruptible */ #define SB_RECV 0x4 /* this is rcv sb */ @@ -217,12 +217,13 @@ struct socket { #define SB_KNOTE 0x40 /* kernel note attached */ #define SB_DROP 0x80 /* does not accept any more data */ #define SB_UNIX 0x100 /* UNIX domain socket buffer */ -#define SB_USRSIZE 0x200 /* user specified sbreserve */ +#define SB_USRSIZE 0x200 /* user specified sbreserve */ #define SB_AUTOSIZE 0x400 /* automatically size socket buffer */ #define SB_TRIM 0x800 /* Trim the socket buffer */ #define SB_NOCOMPRESS 0x1000 /* do not compress socket buffer */ +#define SB_SNDBYTE_CNT 0x2000 /* keep track of snd 
bytes per interface */ caddr_t so_tpcb; /* Misc. protocol control block, used - by some kexts */ + by some kexts */ void (*so_event)(struct socket *, void *, u_int32_t); void *so_eventarg; /* Arg for above */ @@ -231,7 +232,7 @@ struct socket { so_gen_t so_gencnt; /* generation count */ TAILQ_HEAD(, eventqelt) so_evlist; STAILQ_ENTRY(socket) so_cache_ent; /* socache entry */ - caddr_t so_saved_pcb; /* Saved pcb when cacheing */ + caddr_t so_saved_pcb; /* Saved pcb when cacheing */ u_int32_t cache_timestamp; /* time socket was cached */ pid_t last_pid; /* pid of most recent accessor */ @@ -241,43 +242,43 @@ struct socket { /* Plug-in support - make the socket interface overridable */ struct mbuf *so_tail; struct socket_filter_entry *so_filt; /* NKE hook */ - u_int32_t so_flags; /* Flags */ + u_int32_t so_flags; /* Flags */ #define SOF_NOSIGPIPE 0x00000001 #define SOF_NOADDRAVAIL 0x00000002 /* EADDRNOTAVAIL if src addr is gone */ #define SOF_PCBCLEARING 0x00000004 /* pru_disconnect done; don't - call pru_detach */ + call pru_detach */ #define SOF_DEFUNCT 0x00000008 /* socket marked as inactive */ #define SOF_CLOSEWAIT 0x00000010 /* blocked in close awaiting some events */ -#define SOF_REUSESHAREUID 0x00000040 /* Allows SO_REUSEADDR/SO_REUSEPORT - for multiple so_uid */ +#define SOF_REUSESHAREUID 0x00000040 /* Allows SO_REUSEADDR/SO_REUSEPORT + for multiple so_uid */ #define SOF_MULTIPAGES 0x00000080 /* jumbo clusters may be used for sosend */ -#define SOF_ABORTED 0x00000100 /* soabort was already called once */ -#define SOF_OVERFLOW 0x00000200 /* socket was dropped as overflow of - listen q */ -#define SOF_NOTIFYCONFLICT 0x00000400 /* notify that a bind was done on a - port already in use */ +#define SOF_ABORTED 0x00000100 /* soabort was already called once */ +#define SOF_OVERFLOW 0x00000200 /* socket was dropped as overflow of + listen q */ +#define SOF_NOTIFYCONFLICT 0x00000400 /* notify that a bind was done on a + port already in use */ #define 
SOF_UPCALLCLOSEWAIT 0x00000800 /* block close until upcall returns */ -#define SOF_BINDRANDOMPORT 0x00001000 /* Randomized port number for bind */ -#define SOF_NPX_SETOPTSHUT 0x00002000 /* Non POSIX extension to allow - setsockopt(2) after shut down */ -#define SOF_RECV_TRAFFIC_CLASS 0x00004000 /* Receive TC as ancillary data */ +#define SOF_BINDRANDOMPORT 0x00001000 /* Randomized port number for bind */ +#define SOF_NPX_SETOPTSHUT 0x00002000 /* Non POSIX extension to allow + setsockopt(2) after shut down */ +#define SOF_RECV_TRAFFIC_CLASS 0x00004000 /* Receive TC as ancillary data */ #define SOF_NODEFUNCT 0x00008000 /* socket cannot be defunct'd */ #define SOF_PRIVILEGED_TRAFFIC_CLASS 0x00010000 /* traffic class is privileged */ -#define SOF_SUSPENDED 0x00020000 /* i/f output queue is suspended */ -#define SOF_INCOMP_INPROGRESS 0x00040000 /* incomp socket is being processed */ +#define SOF_SUSPENDED 0x00020000 /* i/f output queue is suspended */ +#define SOF_INCOMP_INPROGRESS 0x00040000 /* incomp socket is being processed */ #define SOF_NOTSENT_LOWAT 0x00080000 /* A different lowat on not sent - data has been set */ -#define SOF_KNOTE 0x00100000 /* socket is on the EV_SOCK klist */ -#define SOF_USELRO 0x00200000 /* TCP must use LRO on these sockets */ -#define SOF_ENABLE_MSGS 0x00400000 /* TCP must enable message delivery */ -#define SOF_FLOW_DIVERT 0x00800000 /* Flow Divert is enabled */ + data has been set */ +#define SOF_KNOTE 0x00100000 /* socket is on the EV_SOCK klist */ +#define SOF_USELRO 0x00200000 /* TCP must use LRO on these sockets */ +#define SOF_ENABLE_MSGS 0x00400000 /* TCP must enable message delivery */ +#define SOF_FLOW_DIVERT 0x00800000 /* Flow Divert is enabled */ #define SOF_MP_SUBFLOW 0x01000000 /* is a multipath subflow socket */ -#define SOF_MPTCP_TRUE 0x02000000 /* Established e2e MPTCP connection */ -#define SOF_MPTCP_CLIENT 0x04000000 /* Only client starts addtnal flows */ -#define SOF_MP_SEC_SUBFLOW 0x08000000 /* Set up secondary flow 
*/ -#define SOF_MP_TRYFAILOVER 0x10000000 /* Failing subflow */ +#define SOF_MPTCP_TRUE 0x02000000 /* Established e2e MPTCP connection */ +#define SOF_MPTCP_CLIENT 0x04000000 /* Only client starts addtnal flows */ +#define SOF_MP_SEC_SUBFLOW 0x08000000 /* Set up secondary flow */ +#define SOF_MP_TRYFAILOVER 0x10000000 /* Failing subflow */ #define SOF_DELEGATED 0x20000000 /* on behalf of another process */ -#define SOF_MPTCP_FASTJOIN 0x40000000 /* fast join support */ +#define SOF_MPTCP_FASTJOIN 0x40000000 /* fast join support */ #define SOF_CONTENT_FILTER 0x80000000 /* Content filter enabled */ uint32_t so_upcallusecount; /* number of upcalls in progress */ @@ -285,7 +286,7 @@ struct socket { int so_retaincnt; u_int32_t so_filteruse; /* usecount for the socket filters */ u_int16_t so_traffic_class; - u_int8_t so_traffic_mgt_flags; /* traffic_mgt socket config */ + int8_t so_netsvctype; u_int8_t so_restrictions; thread_t so_send_filt_thread; @@ -303,7 +304,7 @@ struct socket { struct label *so_label; /* MAC label for socket */ struct label *so_peerlabel; /* cached MAC label for socket peer */ thread_t so_background_thread; /* thread that marked - this socket background */ + this socket background */ struct data_stats so_tc_stats[SO_TC_STATS_MAX]; struct klist so_klist; /* klist for EV_SOCK events */ @@ -324,37 +325,42 @@ struct socket { int32_t so_policy_gencnt; /* UUID policy gencnt */ u_int32_t so_flags1; -#define SOF1_POST_FALLBACK_SYNC 0x00000001 /* fallback to TCP */ -#define SOF1_AWDL_PRIVILEGED 0x00000002 -#define SOF1_IF_2KCL 0x00000004 /* interface prefers 2 KB clusters */ -#define SOF1_DEFUNCTINPROG 0x00000008 -#define SOF1_DATA_IDEMPOTENT 0x00000010 /* idempotent data for TFO */ -#define SOF1_PRECONNECT_DATA 0x00000020 /* request for preconnect data */ +#define SOF1_POST_FALLBACK_SYNC 0x00000001 /* fallback to TCP */ +#define SOF1_AWDL_PRIVILEGED 0x00000002 /* unused */ +#define SOF1_IF_2KCL 0x00000004 /* interface prefers 2 KB clusters */ +#define 
SOF1_DEFUNCTINPROG 0x00000008 +#define SOF1_DATA_IDEMPOTENT 0x00000010 /* idempotent data for TFO */ +#define SOF1_PRECONNECT_DATA 0x00000020 /* request for preconnect data */ #define SOF1_EXTEND_BK_IDLE_WANTED 0x00000040 /* option set */ #define SOF1_EXTEND_BK_IDLE_INPROG 0x00000080 /* socket */ #define SOF1_CACHED_IN_SOCK_LAYER 0x00000100 /* bundled with inpcb and tcpcb */ -#define SOF1_TFO_REWIND 0x00000200 /* rewind mptcp meta data */ -#define SOF1_CELLFALLBACK 0x00000400 /* Initiated by cell fallback */ +#define SOF1_TFO_REWIND 0x00000200 /* rewind mptcp meta data */ +#define SOF1_CELLFALLBACK 0x00000400 /* Initiated by cell fallback */ +#define SOF1_QOSMARKING_ALLOWED 0x00000800 /* policy allows DSCP map */ +#define SOF1_TC_NET_SERV_TYPE 0x00001000 /* traffic class set by SO_NETWORK_SERVICE_TYPE */ +#define SOF1_TRAFFIC_MGT_SO_BACKGROUND 0x00002000 /* background socket */ +#define SOF1_TRAFFIC_MGT_TCP_RECVBG 0x00004000 /* Only TCP sockets, receiver throttling */ +#define SOF1_QOSMARKING_POLICY_OVERRIDE 0x00008000 /* Opt-out of QoS marking NECP policy */ u_int64_t so_extended_bk_start; }; /* Control message accessor in mbufs */ -#define _MIN_NXT_CMSGHDR_PTR(cmsg) \ +#define _MIN_NXT_CMSGHDR_PTR(cmsg) \ ((char *)(cmsg) + \ __DARWIN_ALIGN32((__uint32_t)(cmsg)->cmsg_len) + \ __DARWIN_ALIGN32(sizeof(struct cmsghdr))) -#define M_FIRST_CMSGHDR(m) \ - ((char *)(m) != (char *)0L && \ +#define M_FIRST_CMSGHDR(m) \ + ((char *)(m) != (char *)0L && \ (size_t)(m)->m_len >= sizeof (struct cmsghdr) && \ (socklen_t)(m)->m_len >= \ __DARWIN_ALIGN32(((struct cmsghdr *)(void *)(m)->m_data)->cmsg_len) ? \ (struct cmsghdr *)(void *)(m)->m_data : (struct cmsghdr *)0L) -#define M_NXT_CMSGHDR(m, cmsg) \ - ((char *)(cmsg) == (char *)0L ? M_FIRST_CMSGHDR(m) : \ +#define M_NXT_CMSGHDR(m, cmsg) \ + ((char *)(cmsg) == (char *)0L ? M_FIRST_CMSGHDR(m) : \ _MIN_NXT_CMSGHDR_PTR(cmsg) > ((char *)(m)->m_data) + (m)->m_len || \ _MIN_NXT_CMSGHDR_PTR(cmsg) < (char *)(m)->m_data ? 
\ (struct cmsghdr *)0L /* NULL */ : \ @@ -380,7 +386,7 @@ struct socket { #define SS_COMP 0x1000 /* unaccepted, complete connection */ #define SS_ISDISCONNECTED 0x2000 /* socket disconnected from peer */ #define SS_DRAINING 0x4000 /* close waiting for blocked system - calls to drain */ + calls to drain */ #define SS_DEFUNCT 0x8000 /* has been fully defunct'd */ #endif /* KERNEL_PRIVATE */ @@ -392,8 +398,8 @@ struct socket { #ifdef PRIVATE /* Flags returned in data field for EVFILT_SOCK events. */ -#define SOCKEV_CONNECTED 0x00000001 /* connected */ -#define SOCKEV_DISCONNECTED 0x00000002 /* disconnected */ +#define SOCKEV_CONNECTED 0x00000001 /* connected */ +#define SOCKEV_DISCONNECTED 0x00000002 /* disconnected */ #endif /* PRIVATE */ #pragma pack(4) @@ -456,15 +462,15 @@ struct xsocket64 { }; #ifdef PRIVATE -#define XSO_SOCKET 0x001 -#define XSO_RCVBUF 0x002 -#define XSO_SNDBUF 0x004 -#define XSO_STATS 0x008 -#define XSO_INPCB 0x010 -#define XSO_TCPCB 0x020 -#define XSO_KCREG 0x040 -#define XSO_KCB 0x080 -#define XSO_EVT 0x100 +#define XSO_SOCKET 0x001 +#define XSO_RCVBUF 0x002 +#define XSO_SNDBUF 0x004 +#define XSO_STATS 0x008 +#define XSO_INPCB 0x010 +#define XSO_TCPCB 0x020 +#define XSO_KCREG 0x040 +#define XSO_KCB 0x080 +#define XSO_EVT 0x100 struct xsocket_n { u_int32_t xso_len; /* length of this structure */ @@ -572,44 +578,46 @@ struct kextcb { #define EXT_NULL 0x0 /* STATE: Not in use */ /* Hints for socket event processing */ -#define SO_FILT_HINT_LOCKED 0x00000001 /* socket is already locked */ -#define SO_FILT_HINT_CONNRESET 0x00000002 /* Reset is received */ -#define SO_FILT_HINT_CANTRCVMORE 0x00000004 /* No more data to read */ -#define SO_FILT_HINT_CANTSENDMORE 0x00000008 /* Can't write more data */ -#define SO_FILT_HINT_TIMEOUT 0x00000010 /* timeout */ -#define SO_FILT_HINT_NOSRCADDR 0x00000020 /* No src address available */ -#define SO_FILT_HINT_IFDENIED 0x00000040 /* interface denied access */ -#define SO_FILT_HINT_SUSPEND 0x00000080 /* output 
queue suspended */ -#define SO_FILT_HINT_RESUME 0x00000100 /* output queue resumed */ -#define SO_FILT_HINT_KEEPALIVE 0x00000200 /* TCP Keepalive received */ -#define SO_FILT_HINT_ADAPTIVE_WTIMO 0x00000400 /* TCP adaptive write timeout */ -#define SO_FILT_HINT_ADAPTIVE_RTIMO 0x00000800 /* TCP adaptive read timeout */ -#define SO_FILT_HINT_CONNECTED 0x00001000 /* socket is connected */ -#define SO_FILT_HINT_DISCONNECTED 0x00002000 /* socket is disconnected */ -#define SO_FILT_HINT_CONNINFO_UPDATED 0x00004000 /* updated conninfo avail. */ +#define SO_FILT_HINT_LOCKED 0x00000001 /* socket is already locked */ +#define SO_FILT_HINT_CONNRESET 0x00000002 /* Reset is received */ +#define SO_FILT_HINT_CANTRCVMORE 0x00000004 /* No more data to read */ +#define SO_FILT_HINT_CANTSENDMORE 0x00000008 /* Can't write more data */ +#define SO_FILT_HINT_TIMEOUT 0x00000010 /* timeout */ +#define SO_FILT_HINT_NOSRCADDR 0x00000020 /* No src address available */ +#define SO_FILT_HINT_IFDENIED 0x00000040 /* interface denied access */ +#define SO_FILT_HINT_SUSPEND 0x00000080 /* output queue suspended */ +#define SO_FILT_HINT_RESUME 0x00000100 /* output queue resumed */ +#define SO_FILT_HINT_KEEPALIVE 0x00000200 /* TCP Keepalive received */ +#define SO_FILT_HINT_ADAPTIVE_WTIMO 0x00000400 /* TCP adaptive write timeout */ +#define SO_FILT_HINT_ADAPTIVE_RTIMO 0x00000800 /* TCP adaptive read timeout */ +#define SO_FILT_HINT_CONNECTED 0x00001000 /* socket is connected */ +#define SO_FILT_HINT_DISCONNECTED 0x00002000 /* socket is disconnected */ +#define SO_FILT_HINT_CONNINFO_UPDATED 0x00004000 /* updated conninfo avail. 
*/ #define SO_FILT_HINT_MPFAILOVER 0x00008000 /* multipath failover */ #define SO_FILT_HINT_MPSTATUS 0x00010000 /* multipath status */ -#define SO_FILT_HINT_MUSTRST 0x00020000 /* must send RST and close */ -#define SO_FILT_HINT_MPFASTJ 0x00040000 /* can do MPTCP fast join */ -#define SO_FILT_HINT_DELETEOK 0x00100000 /* Ok to delete socket */ -#define SO_FILT_HINT_MPCANTRCVMORE 0x00200000 /* MPTCP DFIN Received */ +#define SO_FILT_HINT_MUSTRST 0x00020000 /* must send RST and close */ +#define SO_FILT_HINT_MPFASTJ 0x00040000 /* can do MPTCP fast join */ +#define SO_FILT_HINT_DELETEOK 0x00100000 /* Ok to delete socket */ +#define SO_FILT_HINT_MPCANTRCVMORE 0x00200000 /* MPTCP DFIN Received */ +#define SO_FILT_HINT_NOTIFY_ACK 0x00400000 /* Notify Acknowledgement */ #define SO_FILT_HINT_BITS \ "\020\1LOCKED\2CONNRESET\3CANTRCVMORE\4CANTSENDMORE\5TIMEOUT" \ "\6NOSRCADDR\7IFDENIED\10SUSPEND\11RESUME\12KEEPALIVE\13AWTIMO" \ "\14ARTIMO\15CONNECTED\16DISCONNECTED\17CONNINFO_UPDATED" \ "\20MPFAILOVER\21MPSTATUS\22MUSTRST\23MPFASTJ\25DELETEOK" \ - "\26MPCANTRCVMORE" + "\26MPCANTRCVMORE\27NOTIFYACK" /* Mask for hints that have corresponding kqueue events */ -#define SO_FILT_HINT_EV \ +#define SO_FILT_HINT_EV \ (SO_FILT_HINT_CONNRESET | SO_FILT_HINT_CANTRCVMORE | \ SO_FILT_HINT_CANTSENDMORE | SO_FILT_HINT_TIMEOUT | \ SO_FILT_HINT_NOSRCADDR | SO_FILT_HINT_IFDENIED | \ SO_FILT_HINT_SUSPEND | SO_FILT_HINT_RESUME | \ SO_FILT_HINT_KEEPALIVE | SO_FILT_HINT_ADAPTIVE_WTIMO | \ SO_FILT_HINT_ADAPTIVE_RTIMO | SO_FILT_HINT_CONNECTED | \ - SO_FILT_HINT_DISCONNECTED | SO_FILT_HINT_CONNINFO_UPDATED) + SO_FILT_HINT_DISCONNECTED | SO_FILT_HINT_CONNINFO_UPDATED | \ + SO_FILT_HINT_NOTIFY_ACK) #if SENDFILE struct sf_buf { @@ -633,26 +641,33 @@ struct sf_buf { } \ } -#define SB_MB_CHECK(sb) do { \ +#define SB_MB_CHECK(sb) do { \ if (((sb)->sb_mb != NULL && \ (sb)->sb_cc == 0) || \ ((sb)->sb_mb == NULL && (sb)->sb_cc > 0)) \ panic("corrupt so_rcv: sb_mb %p sb_cc %d\n", \ (sb)->sb_mb, (sb)->sb_cc); \ 
-} while(0) +} while (0) -#define SODEFUNCTLOG(x) do { if (sodefunctlog) printf x; } while (0) -#define SOTHROTTLELOG(x) do { if (sothrottlelog) printf x; } while (0) +#define SODEFUNCTLOG(fmt, ...) do { \ + if (sodefunctlog) \ + printf(fmt, __VA_ARGS__); \ +} while (0) + +#define SOTHROTTLELOG(fmt, ...) do { \ + if (sothrottlelog) \ + printf(fmt, __VA_ARGS__); \ +} while (0) /* * For debugging traffic class behaviors */ -#define SOTCDB_NO_DSCP 0x01 /* Do not set DSCP code in IP header */ -#define SOTCDB_NO_MTC 0x02 /* Do not set the mbuf traffic class */ -#define SOTCDB_NO_SENDTCPBG 0x04 /* Do not use background TCP CC algorithm for sender */ -#define SOTCDB_NO_LCLTST 0x08 /* Do not test for local destination for setting DSCP */ -#define SOTCDB_NO_DSCPTST 0x10 /* Overwritte any existing DSCP code */ -#define SOTCDB_NO_RECVTCPBG 0x20 /* Do not use throttling on receiver-side of TCP */ +#define SOTCDB_RESERVED 0x01 +#define SOTCDB_NO_MTC 0x02 /* Do not set the mbuf traffic class */ +#define SOTCDB_NO_SENDTCPBG 0x04 /* Do not use background TCP CC algorithm for sender */ +#define SOTCDB_NO_LCLTST 0x08 /* Do not test for local destination for setting DSCP */ +#define SOTCDB_NO_DSCPTST 0x10 /* Overwritte any existing DSCP code */ +#define SOTCDB_NO_RECVTCPBG 0x20 /* Do not use throttling on receiver-side of TCP */ #define SOTCDB_NO_PRIVILEGED 0x40 /* Do not set privileged traffic flag */ #define SOCK_DOM(so) ((so)->so_proto->pr_domain->dom_family) @@ -695,6 +710,11 @@ extern u_int32_t net_io_policy_uuid; extern struct soextbkidlestat soextbkidlestat; +struct net_qos_dscp_map { + u_int8_t sotc_to_dscp[SO_TC_MAX]; + u_int8_t netsvctype_to_dscp[_NET_SERVICE_TYPE_COUNT]; +}; + #endif /* BSD_KERNEL_PRIVATE */ struct mbuf; @@ -748,9 +768,10 @@ extern void soreserve_preconnect(struct socket *so, unsigned int pre_cc); extern void sorwakeup(struct socket *so); extern int sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, 
int flags); -extern int sosend_list(struct socket *so, struct uio **uio, u_int uiocnt, int flags); -extern int soreceive_list(struct socket *so, struct recv_msg_elem *msgarray, u_int msgcnt, - int *flags); +extern int sosend_list(struct socket *so, struct uio **uio, u_int uiocnt, + int flags); +extern int soreceive_list(struct socket *so, struct recv_msg_elem *msgarray, + u_int msgcnt, int *flags); extern void sonullevent(struct socket *so, void *arg, uint32_t hint); __END_DECLS @@ -787,7 +808,7 @@ extern void sblastmbufchk(struct sockbuf *, const char *); extern void sblastrecordchk(struct sockbuf *, const char *); extern struct mbuf *sbcreatecontrol(caddr_t p, int size, int type, int level); extern struct mbuf **sbcreatecontrol_mbuf(caddr_t p, int size, int type, - int level, struct mbuf** m); + int level, struct mbuf **m); extern void sbdrop(struct sockbuf *sb, int len); extern void sbdroprecord(struct sockbuf *sb); extern int sbinsertoob(struct sockbuf *sb, struct mbuf *m0); @@ -825,7 +846,7 @@ extern int soconnectlock(struct socket *so, struct sockaddr *nam, int dolock); extern int soconnect2(struct socket *so1, struct socket *so2); extern int soconnectxlocked(struct socket *so, struct sockaddr_list **src_sl, struct sockaddr_list **dst_sl, struct proc *, uint32_t, sae_associd_t, - sae_connid_t *, uint32_t, void *, u_int32_t, uio_t, user_ssize_t*); + sae_connid_t *, uint32_t, void *, u_int32_t, uio_t, user_ssize_t *); extern int sodisconnectx(struct socket *so, sae_associd_t, sae_connid_t); extern int sodisconnectxlocked(struct socket *so, sae_associd_t, sae_connid_t); extern int sopeelofflocked(struct socket *so, sae_associd_t, struct socket **); @@ -836,7 +857,8 @@ extern void soevupcall(struct socket *, u_int32_t); extern int socreate_internal(int dom, struct socket **aso, int type, int proto, struct proc *, uint32_t, struct proc *); extern int socreate(int dom, struct socket **aso, int type, int proto); -extern int socreate_delegate(int dom, struct socket 
**aso, int type, int proto, pid_t epid); +extern int socreate_delegate(int dom, struct socket **aso, int type, int proto, + pid_t epid); extern void sodealloc(struct socket *so); extern int sodisconnectlocked(struct socket *so); extern void soreference(struct socket *so); @@ -900,22 +922,23 @@ extern struct sockaddr_list *sockaddrlist_dup(const struct sockaddr_list *, int); /* Service class flags used for setting service class on a packet */ -#define PKT_SCF_IPV6 0x00000001 /* IPv6 packet */ -#define PKT_SCF_TCP_ACK 0x00000002 /* Pure TCP ACK */ +#define PKT_SCF_IPV6 0x00000001 /* IPv6 packet */ +#define PKT_SCF_TCP_ACK 0x00000002 /* Pure TCP ACK */ +#define PKT_SCF_TCP_SYN 0x00000004 /* TCP SYN */ /* * Flags for connectx(2) user-protocol request routine. */ #define CONNREQF_MPTCP 0x1 /* called internally by MPTCP */ -#define CONNREQF_UIO 0x2 /* there's data */ -#define CONNREQF_IDEM 0x4 /* data is idempotent */ +#define CONNREQF_UIO 0x2 /* there's data */ +#define CONNREQF_IDEM 0x4 /* data is idempotent */ extern void set_packet_service_class(struct mbuf *, struct socket *, mbuf_svc_class_t, u_int32_t); extern void so_tc_update_stats(struct mbuf *, struct socket *, mbuf_svc_class_t); -extern mbuf_svc_class_t mbuf_service_class_from_control(struct mbuf *); +extern int so_tc_from_control(struct mbuf *, int *); extern mbuf_svc_class_t so_tc2msc(int); extern int so_svc2tc(mbuf_svc_class_t); @@ -923,6 +946,7 @@ extern u_int8_t tcp_cansbgrow(struct sockbuf *sb); extern int tcp_get_msg_priority(struct mbuf *, uint32_t *); extern void set_tcp_stream_priority(struct socket *so); +extern int so_set_net_service_type(struct socket *, int); extern int so_set_traffic_class(struct socket *, int); extern void so_set_default_traffic_class(struct socket *); extern int so_set_opportunistic(struct socket *, int); @@ -934,9 +958,11 @@ extern int so_set_effective_uuid(struct socket *, uuid_t, struct proc *); extern int so_set_restrictions(struct socket *, uint32_t); extern uint32_t 
so_get_restrictions(struct socket *); extern void socket_tclass_init(void); +#if (DEVELOPMENT || DEBUG) extern int so_set_tcdbg(struct socket *, struct so_tcdbg *); extern int sogetopt_tcdbg(struct socket *, struct sockopt *); -extern void so_set_lro(struct socket*, int); +#endif /* (DEVELOPMENT || DEBUG) */ +extern void so_set_lro(struct socket *, int); extern int so_isdstlocal(struct socket *); extern void so_recv_data_stat(struct socket *, struct mbuf *, size_t); @@ -958,7 +984,7 @@ extern void netpolicy_post_msg(uint32_t, struct netpolicy_event_data *, extern void socket_post_kev_msg(uint32_t, struct kev_socket_event_data *, uint32_t); extern void socket_post_kev_msg_closed(struct socket *); -/* +/* * Socket operation routines. * These routines are called by the routines in * sys_socket.c or from a system process, and diff --git a/bsd/sys/sockio.h b/bsd/sys/sockio.h index d5f8ac636..f098e7a13 100644 --- a/bsd/sys/sockio.h +++ b/bsd/sys/sockio.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2015 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -121,7 +121,6 @@ #define SIOCAUTONETMASK _IOW('i', 39, struct ifreq) /* autoconf netmask */ #define SIOCARPIPLL _IOWR('i', 40, struct ifreq) /* arp for IPv4LL address */ - #define SIOCADDMULTI _IOW('i', 49, struct ifreq) /* add m'cast addr */ #define SIOCDELMULTI _IOW('i', 50, struct ifreq) /* del m'cast addr */ #define SIOCGIFMTU _IOWR('i', 51, struct ifreq) /* get IF mtu */ @@ -256,6 +255,7 @@ #define SIOCDIFAGENTID _IOWR('i', 166, struct if_agentidreq) /* Delete netagent id */ #define SIOCGIFAGENTIDS _IOWR('i', 167, struct if_agentidsreq) /* Get netagent ids */ #define SIOCGIFAGENTDATA _IOWR('i', 168, struct netagent_req) /* Get netagent data */ + #ifdef BSD_KERNEL_PRIVATE #define SIOCGIFAGENTIDS32 _IOWR('i', 167, struct if_agentidsreq32) #define SIOCGIFAGENTIDS64 _IOWR('i', 167, struct if_agentidsreq64) @@ -274,6 +274,30 @@ #define SIOCGECNMODE _IOWR('i', 176, struct ifreq) #define SIOCSECNMODE _IOW('i', 177, struct ifreq) + +#define SIOCSIFORDER _IOWR('i', 178, struct if_order) +#define SIOCGIFORDER _IOWR('i', 179, struct if_order) + +#define SIOCSQOSMARKINGMODE _IOWR('i', 180, struct ifreq) +#define SIOCSFASTLANECAPABLE SIOCSQOSMARKINGMODE +#define SIOCSQOSMARKINGENABLED _IOWR('i', 181, struct ifreq) +#define SIOCSFASTLEENABLED SIOCSQOSMARKINGENABLED +#define SIOCGQOSMARKINGMODE _IOWR('i', 182, struct ifreq) +#define SIOCGQOSMARKINGENABLED _IOWR('i', 183, struct ifreq) + + +#define SIOCSIFTIMESTAMPENABLE _IOWR('i', 184, struct ifreq) +#define SIOCSIFTIMESTAMPDISABLE _IOWR('i', 185, struct ifreq) +#define SIOCGIFTIMESTAMPENABLED _IOWR('i', 186, struct ifreq) + +#define SIOCSIFDISABLEOUTPUT _IOWR('i', 187, struct ifreq) + +#define SIOCGIFAGENTLIST _IOWR('i', 190, struct netagentlist_req) /* Get netagent dump */ +#ifdef BSD_KERNEL_PRIVATE +#define SIOCGIFAGENTLIST32 _IOWR('i', 190, struct netagentlist_req32) +#define SIOCGIFAGENTLIST64 _IOWR('i', 190, struct netagentlist_req64) +#endif /* BSD_KERNEL_PRIVATE */ + 
#endif /* PRIVATE */ #endif /* !_SYS_SOCKIO_H_ */ diff --git a/bsd/sys/spawn_internal.h b/bsd/sys/spawn_internal.h index e86e6c2fd..9985b3cfc 100644 --- a/bsd/sys/spawn_internal.h +++ b/bsd/sys/spawn_internal.h @@ -204,7 +204,7 @@ typedef struct _posix_spawnattr { #define POSIX_SPAWN_JETSAM_SET 0x8000 #define POSIX_SPAWN_JETSAM_USE_EFFECTIVE_PRIORITY 0x01 -#define POSIX_SPAWN_JETSAM_HIWATER_BACKGROUND 0x02 +#define POSIX_SPAWN_JETSAM_HIWATER_BACKGROUND 0x02 /* to be deprecated */ #define POSIX_SPAWN_JETSAM_MEMLIMIT_FATAL 0x04 /* to be deprecated */ /* diff --git a/bsd/sys/stackshot.h b/bsd/sys/stackshot.h index 50a1eb500..dcd24eada 100644 --- a/bsd/sys/stackshot.h +++ b/bsd/sys/stackshot.h @@ -37,7 +37,7 @@ typedef struct stackshot_config { /* Input options */ int sc_pid; /* PID to trace, or -1 for the entire system */ uint32_t sc_flags; /* Stackshot flags */ - uint64_t sc_since_timestamp; /* Get traces of threads that have run since this time (NOT YET SUPPORTED) */ + uint64_t sc_delta_timestamp; /* Retrieve a delta stackshot of system state that has changed since this time */ /* Stackshot results */ uint64_t sc_buffer; /* Pointer to stackshot buffer */ @@ -50,10 +50,6 @@ typedef struct stackshot_config { #ifndef KERNEL -#if !LIBSYSCALL_INTERFACE -typedef struct stackshot_config stackshot_config_t; -#endif - stackshot_config_t * stackshot_config_create(void); int stackshot_config_set_pid(stackshot_config_t * stackshot_config, int pid); int stackshot_config_set_flags(stackshot_config_t * stackshot_config, uint32_t flags); @@ -61,6 +57,7 @@ int stackshot_capture_with_config(stackshot_config_t * stackshot_config); void * stackshot_config_get_stackshot_buffer(stackshot_config_t * stackshot_config); uint32_t stackshot_config_get_stackshot_size(stackshot_config_t * stackshot_config); int stackshot_config_set_size_hint(stackshot_config_t * stackshot_config, uint32_t suggested_size); +int stackshot_config_set_delta_timestamp(stackshot_config_t * stackshot_config, uint64_t 
delta_timestamp); int stackshot_config_dealloc_buffer(stackshot_config_t * stackshot_config); int stackshot_config_dealloc(stackshot_config_t * stackshot_config); diff --git a/bsd/sys/stat.h b/bsd/sys/stat.h index c9fac7e25..75b8d9322 100644 --- a/bsd/sys/stat.h +++ b/bsd/sys/stat.h @@ -468,7 +468,7 @@ extern void munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp); * in Mac OS X. */ /* #define UF_NOUNLINK 0x00000010 */ /* file may not be removed or renamed */ -#define UF_COMPRESSED 0x00000020 /* file is hfs-compressed */ +#define UF_COMPRESSED 0x00000020 /* file is compressed (some file-systems) */ /* UF_TRACKED is used for dealing with document IDs. We no longer issue notifications for deletes or renames for files which have UF_TRACKED set. */ diff --git a/bsd/sys/stdio.h b/bsd/sys/stdio.h index f90426f43..b6957c83b 100644 --- a/bsd/sys/stdio.h +++ b/bsd/sys/stdio.h @@ -39,6 +39,16 @@ __BEGIN_DECLS int renameat(int, const char *, int, const char *) __OSX_AVAILABLE_STARTING(__MAC_10_10, __IPHONE_8_0); +#if __DARWIN_C_LEVEL >= __DARWIN_C_FULL + +#define RENAME_SECLUDE 0x00000001 +#define RENAME_SWAP 0x00000002 +#define RENAME_EXCL 0x00000004 +int renamex_np(const char *, const char *, unsigned int) __OSX_AVAILABLE(10.12) __IOS_AVAILABLE(10.0) __TVOS_AVAILABLE(10.0) __WATCHOS_AVAILABLE(3.0); +int renameatx_np(int, const char *, int, const char *, unsigned int) __OSX_AVAILABLE(10.12) __IOS_AVAILABLE(10.0) __TVOS_AVAILABLE(10.0) __WATCHOS_AVAILABLE(3.0); + +#endif /* __DARWIN_C_LEVEL >= __DARWIN_C_FULL */ + __END_DECLS #endif /* __DARWIN_C_LEVEL >= 200809L */ diff --git a/bsd/sys/sysctl.h b/bsd/sys/sysctl.h index d597067de..4ccc92c72 100644 --- a/bsd/sys/sysctl.h +++ b/bsd/sys/sysctl.h @@ -281,8 +281,8 @@ int sysctl_io_opaque(struct sysctl_req *req, void *pValue, size_t valueSize, int void sysctl_register_oid(struct sysctl_oid *oidp); void sysctl_unregister_oid(struct sysctl_oid *oidp); -/* Not exported */ -void sysctl_register_fixed(void); +/* 
Deprecated */ +void sysctl_register_fixed(void) __deprecated; __END_DECLS @@ -410,6 +410,10 @@ SYSCTL_DECL(_hw); SYSCTL_DECL(_machdep); SYSCTL_DECL(_user); +#ifdef PRIVATE +SYSCTL_DECL(_hw_features); +#endif + #endif /* KERNEL */ #ifdef XNU_KERNEL_PRIVATE @@ -562,34 +566,33 @@ SYSCTL_DECL(_user); #define KERN_TFP_POLICY_DEFAULT 2 /* Default Mode: related ones allowed and upcall authentication */ /* KERN_KDEBUG types */ -#define KERN_KDEFLAGS 1 -#define KERN_KDDFLAGS 2 -#define KERN_KDENABLE 3 -#define KERN_KDSETBUF 4 -#define KERN_KDGETBUF 5 -#define KERN_KDSETUP 6 -#define KERN_KDREMOVE 7 -#define KERN_KDSETREG 8 -#define KERN_KDGETREG 9 -#define KERN_KDREADTR 10 -#define KERN_KDPIDTR 11 -#define KERN_KDTHRMAP 12 +#define KERN_KDEFLAGS 1 +#define KERN_KDDFLAGS 2 +#define KERN_KDENABLE 3 +#define KERN_KDSETBUF 4 +#define KERN_KDGETBUF 5 +#define KERN_KDSETUP 6 +#define KERN_KDREMOVE 7 +#define KERN_KDSETREG 8 +#define KERN_KDGETREG 9 +#define KERN_KDREADTR 10 +#define KERN_KDPIDTR 11 +#define KERN_KDTHRMAP 12 /* Don't use 13 as it is overloaded with KERN_VNODE */ -#define KERN_KDPIDEX 14 -#define KERN_KDSETRTCDEC 15 -#define KERN_KDGETENTROPY 16 /* Obsolescent */ -#define KERN_KDWRITETR 17 -#define KERN_KDWRITEMAP 18 -#define KERN_KDENABLE_BG_TRACE 19 -#define KERN_KDDISABLE_BG_TRACE 20 -#define KERN_KDREADCURTHRMAP 21 -#define KERN_KDSET_TYPEFILTER 22 -#define KERN_KDBUFWAIT 23 -#define KERN_KDCPUMAP 24 -#define KERN_KDWAIT_BG_TRACE_RESET 25 -#define KERN_KDSET_BG_TYPEFILTER 26 -#define KERN_KDWRITEMAP_V3 27 -#define KERN_KDWRITETR_V3 28 +#define KERN_KDPIDEX 14 +#define KERN_KDSETRTCDEC 15 /* obsolete */ +#define KERN_KDGETENTROPY 16 /* obsolete */ +#define KERN_KDWRITETR 17 +#define KERN_KDWRITEMAP 18 +#define KERN_KDTEST 19 +/* 20 unused */ +#define KERN_KDREADCURTHRMAP 21 +#define KERN_KDSET_TYPEFILTER 22 +#define KERN_KDBUFWAIT 23 +#define KERN_KDCPUMAP 24 +/* 25 - 26 unused */ +#define KERN_KDWRITEMAP_V3 27 +#define KERN_KDWRITETR_V3 28 #define 
CTL_KERN_NAMES { \ { 0, 0 }, \ @@ -1117,7 +1120,7 @@ void sysctl_mib_init(void); int sysctl_int(user_addr_t, size_t *, user_addr_t, size_t, int *); int sysctl_quad(user_addr_t, size_t *, user_addr_t, size_t, quad_t *); -void sysctl_register_all(void); +void sysctl_early_init(void); #endif /* BSD_KERNEL_PRIVATE */ #else /* !KERNEL */ diff --git a/bsd/sys/sysent.h b/bsd/sys/sysent.h index 297de2cc2..8182994d3 100644 --- a/bsd/sys/sysent.h +++ b/bsd/sys/sysent.h @@ -56,8 +56,7 @@ struct sysent { /* system call table */ extern struct sysent sysent[]; #endif /* __INIT_SYSENT_C__ */ -extern int nsysent; -#define NUM_SYSENT 500 /* Current number of defined syscalls */ +extern unsigned int nsysent; /* * Valid values for sy_cancel diff --git a/bsd/sys/syslog.h b/bsd/sys/syslog.h index 71f546ae5..2449ad379 100644 --- a/bsd/sys/syslog.h +++ b/bsd/sys/syslog.h @@ -229,9 +229,9 @@ __BEGIN_DECLS void closelog(void); void openlog(const char *, int, int); int setlogmask(int); -void syslog(int, const char *, ...) __printflike(2, 3); +void syslog(int, const char *, ...) 
__printflike(2, 3) __not_tail_called; #if __DARWIN_C_LEVEL >= __DARWIN_C_FULL -void vsyslog(int, const char *, __darwin_va_list) __printflike(2, 0); +void vsyslog(int, const char *, __darwin_va_list) __printflike(2, 0) __not_tail_called; #endif __END_DECLS diff --git a/bsd/sys/systm.h b/bsd/sys/systm.h index 7d5a43c31..e346413c8 100644 --- a/bsd/sys/systm.h +++ b/bsd/sys/systm.h @@ -125,8 +125,8 @@ extern int boothowto; /* reboot flags, from console subsystem */ extern int show_space; extern int minimalboot; -extern int nblkdev; /* number of entries in bdevsw */ -extern int nchrdev; /* number of entries in cdevsw */ +extern const int nblkdev; /* number of entries in bdevsw */ +extern const int nchrdev; /* number of entries in cdevsw */ #endif /* BSD_KERNEL_PRIVATE */ #ifdef KERNEL_PRIVATE @@ -164,8 +164,6 @@ int64_t fulong(user_addr_t addr); int sulong(user_addr_t addr, int64_t longword); uint64_t fuulong(user_addr_t addr); int suulong(user_addr_t addr, uint64_t ulongword); -int vslock(user_addr_t addr, user_size_t len); -int vsunlock(user_addr_t addr, user_size_t len, int dirtied); int clone_system_shared_regions(int shared_regions_active, int chain_regions, int base_vnode); @@ -182,6 +180,7 @@ void load_init_program(struct proc *p); void __pthread_testcancel(int presyscall); void throttle_info_get_last_io_time(mount_t mp, struct timeval *tv); void update_last_io_time(mount_t mp); +void throttle_info_end_io(buf_t bp); #endif /* BSD_KERNEL_PRIVATE */ #ifdef KERNEL_PRIVATE @@ -189,6 +188,8 @@ void timeout(void (*)(void *), void *arg, int ticks); void timeout_with_leeway(void (*)(void *), void *arg, int ticks, int leeway_ticks); void untimeout(void (*)(void *), void *arg); int bsd_hostname(char *, int, int*); +int vslock(user_addr_t addr, user_size_t len); +int vsunlock(user_addr_t addr, user_size_t len, int dirtied); #endif /* KERNEL_PRIVATE */ int nullop(void); @@ -238,6 +239,20 @@ void throttle_info_disable_throttle(int devno, boolean_t isfusion); */ int 
throttle_info_io_will_be_throttled(void *throttle_info_handle, int policy); +#ifdef KERNEL_PRIVATE + +/* returned by throttle_io_will_be_throttled */ +#define THROTTLE_DISENGAGED 0 +#define THROTTLE_ENGAGED 1 +#define THROTTLE_NOW 2 + +int throttle_io_will_be_throttled(int lowpri_window_msecs, mount_t mp); +int throttle_lowpri_window(void) __attribute__((pure)); +struct uthread; +void throttle_info_reset_window(struct uthread *ut); + +#endif + #ifdef XNU_KERNEL_PRIVATE void *exec_spawnattr_getmacpolicyinfo(const void *macextensions, const char *policyname, size_t *lenp); #endif diff --git a/bsd/sys/time.h b/bsd/sys/time.h index da5e4d784..85e7fe3df 100644 --- a/bsd/sys/time.h +++ b/bsd/sys/time.h @@ -211,6 +211,7 @@ void timevalsub(struct timeval *t1, struct timeval *t2); void timevalfix(struct timeval *t1); #ifdef BSD_KERNEL_PRIVATE time_t boottime_sec(void); +void boottime_timeval(struct timeval *tv); void inittodr(time_t base); int ratecheck(struct timeval *lasttime, const struct timeval *mininterval); int ppsratecheck(struct timeval *lasttime, int *curpps, int maxpps); diff --git a/bsd/sys/ttycom.h b/bsd/sys/ttycom.h index a9c137862..6f874a197 100644 --- a/bsd/sys/ttycom.h +++ b/bsd/sys/ttycom.h @@ -70,7 +70,6 @@ #define _SYS_TTYCOM_H_ #include - /* * Tty ioctl's except for those supported only for backwards compatibility * with the old tty driver. 
diff --git a/bsd/sys/ubc.h b/bsd/sys/ubc.h index 720ae9818..fad05f101 100644 --- a/bsd/sys/ubc.h +++ b/bsd/sys/ubc.h @@ -40,6 +40,10 @@ #include #include +#ifdef KERNEL_PRIVATE +#include +#endif // KERNEL_PRIVATE + /* defns for ubc_msync() and ubc_msync */ #define UBC_PUSHDIRTY 0x01 /* clean any dirty pages in the specified range to the backing store */ @@ -83,7 +87,7 @@ struct cs_blob *ubc_cs_blob_get(vnode_t, cpu_type_t, off_t); /* apis to handle generation count for cs blob */ void cs_blob_reset_cache(void); -int ubc_cs_blob_revalidate(vnode_t, struct cs_blob *, int); +int ubc_cs_blob_revalidate(vnode_t, struct cs_blob *, struct image_params *, int); int ubc_cs_generation_check(vnode_t); int cs_entitlements_blob_get(proc_t, void **, size_t *); @@ -140,6 +144,15 @@ int is_file_clean(vnode_t, off_t); errno_t mach_to_bsd_errno(kern_return_t mach_err); +#ifdef KERNEL_PRIVATE + +__attribute__((pure)) boolean_t ubc_is_mapped(const struct vnode *, boolean_t *writable); +__attribute__((pure)) boolean_t ubc_is_mapped_writable(const struct vnode *); + +uint32_t cluster_max_io_size(mount_t, int); + +#endif + __END_DECLS #endif /* _SYS_UBC_H_ */ diff --git a/bsd/sys/ubc_internal.h b/bsd/sys/ubc_internal.h index d3a87d049..6ea151d09 100644 --- a/bsd/sys/ubc_internal.h +++ b/bsd/sys/ubc_internal.h @@ -103,14 +103,19 @@ struct cs_blob { off_t csb_base_offset; /* Offset of Mach-O binary in fat binary */ off_t csb_start_offset; /* Blob coverage area start, from csb_base_offset */ off_t csb_end_offset; /* Blob coverage area end, from csb_base_offset */ - ipc_port_t csb_mem_handle; vm_size_t csb_mem_size; vm_offset_t csb_mem_offset; vm_address_t csb_mem_kaddr; unsigned char csb_cdhash[CS_CDHASH_LEN]; struct cs_hash *csb_hashtype; + vm_size_t csb_hash_pagesize; /* each hash entry represent this many bytes in the file */ + vm_size_t csb_hash_pagemask; + vm_size_t csb_hash_pageshift; + vm_size_t csb_hash_firstlevel_pagesize; /* First hash this many bytes, then hash the hashes 
together */ const CS_CodeDirectory *csb_cd; const char *csb_teamid; + const CS_GenericBlob *csb_entitlements_blob; /* raw blob, subrange of csb_mem_kaddr */ + void * csb_entitlements; /* The entitlements as an OSDictionary */ unsigned int csb_platform_binary:1; unsigned int csb_platform_path:1; }; @@ -167,7 +172,6 @@ __private_extern__ void ubc_destroy_named(vnode_t); /* internal only */ __private_extern__ void cluster_release(struct ubc_info *); -__private_extern__ uint32_t cluster_max_io_size(mount_t, int); __private_extern__ uint32_t cluster_throttle_io_limit(vnode_t, uint32_t *); @@ -188,22 +192,20 @@ int ubc_isinuse_locked(vnode_t, int, int); int ubc_getcdhash(vnode_t, off_t, unsigned char *); -__attribute__((pure)) boolean_t ubc_is_mapped(const struct vnode *, boolean_t *writable); -__attribute__((pure)) boolean_t ubc_is_mapped_writable(const struct vnode *); - #ifdef XNU_KERNEL_PRIVATE int UBCINFOEXISTS(const struct vnode *); #endif /* XNU_KERNEL_PRIVATE */ /* code signing */ struct cs_blob; -int ubc_cs_blob_add(vnode_t, cpu_type_t, off_t, vm_address_t, vm_size_t, int, struct cs_blob **); +int ubc_cs_blob_add(vnode_t, cpu_type_t, off_t, vm_address_t *, vm_size_t, struct image_params *, int, struct cs_blob **); int ubc_cs_sigpup_add(vnode_t, vm_address_t, vm_size_t); struct cs_blob *ubc_get_cs_blobs(vnode_t); void ubc_get_cs_mtime(vnode_t, struct timespec *); int ubc_cs_getcdhash(vnode_t, off_t, unsigned char *); kern_return_t ubc_cs_blob_allocate(vm_offset_t *, vm_size_t *); void ubc_cs_blob_deallocate(vm_offset_t, vm_size_t); +boolean_t ubc_cs_is_range_codesigned(vnode_t, mach_vm_offset_t, mach_vm_size_t); kern_return_t ubc_cs_validation_bitmap_allocate( vnode_t ); void ubc_cs_validation_bitmap_deallocate( vnode_t ); diff --git a/bsd/sys/ulock.h b/bsd/sys/ulock.h new file mode 100644 index 000000000..de799d8f1 --- /dev/null +++ b/bsd/sys/ulock.h @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2015 Apple Inc. All rights reserved. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _SYS_ULOCK_H +#define _SYS_ULOCK_H + +__BEGIN_DECLS + +#if PRIVATE + +#ifdef XNU_KERNEL_PRIVATE +extern mach_port_name_t ipc_entry_name_mask(mach_port_name_t name); + +static __inline mach_port_name_t +ulock_owner_value_to_port_name(uint32_t uval) +{ + /* + * userland uses the least significant bits for flags as these are + * never used in the mach port name, and are generally always set by + * the ipc_entry code in the kernel. Here we reconstruct a mach port + * name that we can use in the kernel. 
+ */ + return ipc_entry_name_mask((mach_port_name_t)uval); +} +#else +static __inline mach_port_name_t +ulock_owner_value_to_port_name(uint32_t uval) +{ + return uval | 0x3; +} +#endif + +#ifndef KERNEL + +extern int __ulock_wait(uint32_t operation, void *addr, uint64_t value, + uint32_t timeout); /* timeout is specified in microseconds */ +extern int __ulock_wake(uint32_t operation, void *addr, uint64_t wake_value); + +#endif /* !KERNEL */ + +/* + * operation bits [7, 0] contain the operation code + */ +#define UL_COMPARE_AND_WAIT 1 +#define UL_UNFAIR_LOCK 2 +/* obsolete names */ +#define UL_OSSPINLOCK UL_COMPARE_AND_WAIT +#define UL_HANDOFFLOCK UL_UNFAIR_LOCK +/* These operation code are only implemented in (DEVELOPMENT || DEBUG) kernels */ +#define UL_DEBUG_SIMULATE_COPYIN_FAULT 253 +#define UL_DEBUG_HASH_DUMP_ALL 254 +#define UL_DEBUG_HASH_DUMP_PID 255 + +/* + * operation bits [15, 8] contain the flags for __ulock_wake + */ +#define ULF_WAKE_ALL 0x00000100 +#define ULF_WAKE_THREAD 0x00000200 + +/* + * operation bits [23, 16] contain the flags for __ulock_wait + */ +/* The waiter is contending on this lock for synchronization around global data. + * This causes the workqueue subsystem to not create new threads to offset for + * waiters on this lock. + */ +#define ULF_WAIT_WORKQ_DATA_CONTENTION 0x00010000 + +/* + * operation bits [31, 24] contain the generic flags + */ +#define ULF_NO_ERRNO 0x01000000 + +/* + * masks + */ +#define UL_OPCODE_MASK 0x000000FF +#define UL_FLAGS_MASK 0xFFFFFF00 +#define ULF_GENERIC_MASK 0xFFFF0000 + +#define ULF_WAIT_MASK (ULF_NO_ERRNO | \ + ULF_WAIT_WORKQ_DATA_CONTENTION) + +#define ULF_WAKE_MASK (ULF_WAKE_ALL | \ + ULF_WAKE_THREAD | \ + ULF_NO_ERRNO) + +#endif /* PRIVATE */ + +__END_DECLS + +#endif diff --git a/bsd/sys/un.h b/bsd/sys/un.h index 2f06218cb..f6c6d592a 100644 --- a/bsd/sys/un.h +++ b/bsd/sys/un.h @@ -85,7 +85,7 @@ struct sockaddr_un { #define SOL_LOCAL 0 /* Socket options. 
*/ -#define LOCAL_PEERCRED 0x001 /* retrieve peer credentails */ +#define LOCAL_PEERCRED 0x001 /* retrieve peer credentials */ #define LOCAL_PEERPID 0x002 /* retrieve peer pid */ #define LOCAL_PEEREPID 0x003 /* retrieve eff. peer pid */ #define LOCAL_PEERUUID 0x004 /* retrieve peer UUID */ diff --git a/bsd/sys/user.h b/bsd/sys/user.h index 4e5235754..9f612c780 100644 --- a/bsd/sys/user.h +++ b/bsd/sys/user.h @@ -97,26 +97,6 @@ struct vfs_context { kauth_cred_t vc_ucred; /* per thread credential */ }; -/* - * struct representing a document "tombstone" that's recorded - * when a thread manipulates files marked with a document-id. - * if the thread recreates the same item, this tombstone is - * used to preserve the document_id on the new file. - * - * It is a separate structure because of its size - we want to - * allocate it on demand instead of just stuffing it into the - * uthread structure. - */ -struct doc_tombstone { - struct vnode *t_lastop_parent; - struct vnode *t_lastop_item; - uint32_t t_lastop_parent_vid; - uint32_t t_lastop_item_vid; - uint64_t t_lastop_fileid; - uint64_t t_lastop_document_id; - unsigned char t_lastop_filename[NAME_MAX+1]; -}; - #endif /* !__LP64 || XNU_KERNEL_PRIVATE */ #ifdef BSD_KERNEL_PRIVATE @@ -147,24 +127,27 @@ struct uthread { u_int64_t abstime; uint64_t *wqp; int count; - struct select_nocancel_args *args; /* original syscall arguments */ - int32_t *retval; /* place to store return val */ + struct select_nocancel_args *args; /* original syscall arguments */ + int32_t *retval; /* place to store return val */ } ss_select_data; struct _kqueue_scan { - kevent_callback_t call; /* per-event callback */ - kqueue_continue_t cont; /* whole call continuation */ - uint64_t deadline; /* computed deadline for operation */ - void *data; /* caller's private data */ - } ss_kqueue_scan; /* saved state for kevent_scan() */ + kevent_callback_t call; /* per-event callback */ + kqueue_continue_t cont; /* whole call continuation */ + 
filt_process_data_t process_data; /* needed for filter processing */ + uint8_t servicer_qos_index; /* requested qos index of servicer */ + uint64_t deadline; /* computed deadline for operation */ + void *data; /* caller's private data */ + } ss_kqueue_scan; /* saved state for kevent_scan() */ struct _kevent { - struct _kqueue_scan scan;/* space for the generic data */ - struct fileproc *fp; /* fileproc we hold iocount on */ - int fd; /* filedescriptor for kq */ - unsigned int eventflags; /* flags to determine kevent size/direction */ - int eventcount; /* user-level event count */ - int eventout; /* number of events output */ - int32_t *retval; /* place to store return val */ - user_addr_t eventlist; /* user-level event list address */ + struct _kqueue_scan scan; /* space for the generic data */ + struct fileproc *fp; /* fileproc we hold iocount on */ + int fd; /* filedescriptor for kq */ + int eventcount; /* user-level event count */ + int eventout; /* number of events output */ + struct filt_process_s process_data; /* space for process data fed thru */ + int32_t *retval; /* place to store return val */ + user_addr_t eventlist; /* user-level event list address */ + uint64_t data_available; /* [user/kernel] addr of in/out size */ } ss_kevent; /* saved state for kevent() */ struct _kauth { @@ -216,10 +199,13 @@ struct uthread { lck_mtx_t *uu_mtx; + lck_spin_t uu_rethrottle_lock; /* locks was_rethrottled and is_throttled */ TAILQ_ENTRY(uthread) uu_throttlelist; /* List of uthreads currently throttled */ void * uu_throttle_info; /* pointer to throttled I/Os info */ int uu_on_throttlelist; int uu_lowpri_window; + boolean_t uu_was_rethrottled; + boolean_t uu_is_throttled; boolean_t uu_throttle_bc; u_int32_t uu_network_marks; /* network control flow marks */ @@ -230,6 +216,9 @@ struct uthread { int uu_dupfd; /* fd in fdesc_open/dupfdopen */ int uu_defer_reclaims; + unsigned int uu_kqueue_bound; /* qos index we are bound to service */ + unsigned int uu_kqueue_flags; /* if 
so, the flags being using */ + #ifdef JOE_DEBUG int uu_iocount; int uu_vpindex; @@ -291,6 +280,8 @@ struct uthread { /* Document Tracking struct used to track a "tombstone" for a document */ struct doc_tombstone *t_tombstone; + + struct os_reason *uu_exit_reason; }; typedef struct uthread * uthread_t; diff --git a/bsd/sys/vnode.h b/bsd/sys/vnode.h index bc8b67337..72da264c4 100644 --- a/bsd/sys/vnode.h +++ b/bsd/sys/vnode.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2014 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -108,11 +108,10 @@ enum vtagtype { VT_KERNFS, VT_PROCFS, VT_AFS, VT_ISOFS, VT_MOCKFS, /* 16 - 20 */ VT_HFS, VT_ZFS, VT_DEVFS, VT_WEBDAV, VT_UDF, - /* 21 - 24 */ - VT_AFP, VT_CDDA, VT_CIFS, VT_OTHER + /* 21 - 25 */ + VT_AFP, VT_CDDA, VT_CIFS, VT_OTHER, VT_APFS }; - /* * flags for VNOP_BLOCKMAP */ @@ -358,7 +357,7 @@ extern int vfs_resolver_auxiliary(resolver_result_t); @param vp The trigger vnode which needs resolving @param cnp Various data about lookup, e.g. filename and state flags @param pop The pathname operation that initiated the lookup (see enum path_operation). - @param flags + @param flags resolve flags @param data Arbitrary data supplied by vnode trigger creator @param ctx Context for authentication. @return RESOLVER_RESOLVED, RESOLVER_NOCHANGE, RESOLVER_UNRESOLVED or RESOLVER_ERROR @@ -394,7 +393,7 @@ typedef resolver_result_t (* trigger_vnode_unresolve_callback_t)( @discussion This function is associated with a trigger vnode during a vnode create. It is called to verify a rearm from VFS (i.e. should VFS rearm the trigger?). @param vp The trigger vnode which needs rearming - @param flags + @param flags rearm flags @param data Arbitrary data supplied by vnode trigger creator @param ctx Context for authentication. 
@return RESOLVER_NOCHANGE or RESOLVER_ERROR @@ -700,7 +699,8 @@ struct vnode_attr { #define VA_UTIMES_NULL 0x010000 /* utimes argument was NULL */ #define VA_EXCLUSIVE 0x020000 /* exclusive create request */ #define VA_NOINHERIT 0x040000 /* Don't inherit ACLs from parent */ -#define VA_NOAUTH 0x080000 +#define VA_NOAUTH 0x080000 +#define VA_64BITOBJIDS 0x100000 /* fileid/linkid/parentid are 64 bit */ /* * Modes. Some values same as Ixxx entries from inode.h for now. @@ -806,7 +806,7 @@ __BEGIN_DECLS @param vpp Pointer to a vnode pointer, to be filled in with newly created vnode. @return 0 for success, error code otherwise. */ -errno_t vnode_create(uint32_t, uint32_t, void *, vnode_t *); +errno_t vnode_create(uint32_t flavor, uint32_t size, void *data, vnode_t *vpp); #if KERNEL_PRIVATE /*! @@ -822,7 +822,7 @@ errno_t vnode_create(uint32_t, uint32_t, void *, vnode_t *); @param vpp Pointer to a vnode pointer, to be filled in with newly created vnode. @return 0 for success, error code otherwise. */ -errno_t vnode_create_empty(vnode_t *); +errno_t vnode_create_empty(vnode_t *vpp); /*! @function vnode_initialize @@ -836,7 +836,7 @@ errno_t vnode_create_empty(vnode_t *); @param vpp Pointer to a vnode pointer, to be filled in with newly created vnode. @return 0 for success, error code otherwise. */ -errno_t vnode_initialize(uint32_t, uint32_t, void *, vnode_t *); +errno_t vnode_initialize(uint32_t flavor, uint32_t size, void *data, vnode_t *vpp); #endif /* KERNEL_PRIVATE */ /*! @@ -848,7 +848,7 @@ errno_t vnode_initialize(uint32_t, uint32_t, void *, vnode_t *); @param vp The vnode to mark. @return Always 0. */ -int vnode_addfsref(vnode_t); +int vnode_addfsref(vnode_t vp); /*! @function vnode_removefsref @@ -857,7 +857,7 @@ int vnode_addfsref(vnode_t); @param vp The vnode to unmark. @return Always 0. */ -int vnode_removefsref(vnode_t); +int vnode_removefsref(vnode_t vp); /*! 
@function vnode_hasdirtyblks @@ -866,7 +866,7 @@ int vnode_removefsref(vnode_t); @param vp The vnode to test. @return Nonzero if there are dirty blocks, 0 otherwise */ -int vnode_hasdirtyblks(vnode_t); +int vnode_hasdirtyblks(vnode_t vp); /*! @function vnode_hascleanblks @@ -875,7 +875,7 @@ int vnode_hasdirtyblks(vnode_t); @param vp The vnode to test. @return Nonzero if there are clean blocks, 0 otherwise. */ -int vnode_hascleanblks(vnode_t); +int vnode_hascleanblks(vnode_t vp); #define VNODE_ASYNC_THROTTLE 15 /*! @@ -888,24 +888,22 @@ int vnode_hascleanblks(vnode_t); @param msg String to pass msleep() . @return 0 for success, or an error value from msleep(). */ -int vnode_waitforwrites(vnode_t, int, int, int, const char *); +int vnode_waitforwrites(vnode_t vp, int output_target, int slpflag, int slptimeout, const char *msg); /*! @function vnode_startwrite @abstract Increment the count of pending writes on a vnode. @param vp The vnode whose count to increment. - @return void. */ -void vnode_startwrite(vnode_t); +void vnode_startwrite(vnode_t vp); /*! @function vnode_startwrite @abstract Decrement the count of pending writes on a vnode . @discussion Also wakes up threads waiting for the write count to drop, as in vnode_waitforwrites. @param vp The vnode whose count to decrement. - @return void. */ -void vnode_writedone(vnode_t); +void vnode_writedone(vnode_t vp); /*! @function vnode_vtype @@ -913,7 +911,7 @@ void vnode_writedone(vnode_t); @param vp The vnode whose type to grab. @return The vnode's type. */ -enum vtype vnode_vtype(vnode_t); +enum vtype vnode_vtype(vnode_t vp); /*! @function vnode_vid @@ -921,7 +919,7 @@ enum vtype vnode_vtype(vnode_t); @param vp The vnode whose vid to grab. @return The vnode's vid. */ -uint32_t vnode_vid(vnode_t); +uint32_t vnode_vid(vnode_t vp); /*! @function vnode_mountedhere @@ -937,7 +935,7 @@ mount_t vnode_mountedhere(vnode_t vp); @param vp The vnode whose mount to grab. @return The mount, directly. 
*/ -mount_t vnode_mount(vnode_t); +mount_t vnode_mount(vnode_t vp); /*! @function vnode_specrdev @@ -945,7 +943,7 @@ mount_t vnode_mount(vnode_t); @param vp The vnode whose device id to extract--vnode must be a special file. @return The device id. */ -dev_t vnode_specrdev(vnode_t); +dev_t vnode_specrdev(vnode_t vp); /*! @function vnode_fsnode @@ -953,16 +951,15 @@ dev_t vnode_specrdev(vnode_t); @param vp The vnode whose data to grab. @return The filesystem-specific data, directly. */ -void * vnode_fsnode(vnode_t); +void * vnode_fsnode(vnode_t vp); /*! @function vnode_clearfsnode @abstract Sets a vnode's filesystem-specific data to be NULL. @discussion This routine should only be called when a vnode is no longer in use, i.e. during a VNOP_RECLAIM. @param vp The vnode whose data to clear out. - @return void. */ -void vnode_clearfsnode(vnode_t); +void vnode_clearfsnode(vnode_t vp); /*! @function vnode_isvroot @@ -970,7 +967,7 @@ void vnode_clearfsnode(vnode_t); @param vp The vnode to test. @return Nonzero if the vnode is the root, 0 if it is not. */ -int vnode_isvroot(vnode_t); +int vnode_isvroot(vnode_t vp); /*! @function vnode_issystem @@ -978,7 +975,7 @@ int vnode_isvroot(vnode_t); @param vp The vnode to test. @return Nonzero if the vnode is a system vnode, 0 if it is not. */ -int vnode_issystem(vnode_t); +int vnode_issystem(vnode_t vp); /*! @function vnode_ismount @@ -987,7 +984,7 @@ int vnode_issystem(vnode_t); @param vp The vnode to test. @return Nonzero if there is a mount in progress, 0 otherwise. */ -int vnode_ismount(vnode_t); +int vnode_ismount(vnode_t vp); /*! @function vnode_isreg @@ -995,7 +992,7 @@ int vnode_ismount(vnode_t); @param vp The vnode to test. @return Nonzero if the vnode is of type VREG, 0 otherwise. */ -int vnode_isreg(vnode_t); +int vnode_isreg(vnode_t vp); /*! @function vnode_isdir @@ -1003,7 +1000,7 @@ int vnode_isreg(vnode_t); @param vp The vnode to test. @return Nonzero if the vnode is of type VDIR, 0 otherwise. 
*/ -int vnode_isdir(vnode_t); +int vnode_isdir(vnode_t vp); /*! @function vnode_islnk @@ -1011,7 +1008,7 @@ int vnode_isdir(vnode_t); @param vp The vnode to test. @return Nonzero if the vnode is of type VLNK, 0 otherwise. */ -int vnode_islnk(vnode_t); +int vnode_islnk(vnode_t vp); /*! @function vnode_isfifo @@ -1019,7 +1016,7 @@ int vnode_islnk(vnode_t); @param vp The vnode to test. @return Nonzero if the vnode is of type VFIFO, 0 otherwise. */ -int vnode_isfifo(vnode_t); +int vnode_isfifo(vnode_t vp); /*! @function vnode_isblk @@ -1027,7 +1024,7 @@ int vnode_isfifo(vnode_t); @param vp The vnode to test. @return Nonzero if the vnode is of type VBLK, 0 otherwise. */ -int vnode_isblk(vnode_t); +int vnode_isblk(vnode_t vp); /*! @function vnode_ischr @@ -1035,7 +1032,7 @@ int vnode_isblk(vnode_t); @param vp The vnode to test. @return Nonzero if the vnode is of type VCHR, 0 otherwise. */ -int vnode_ischr(vnode_t); +int vnode_ischr(vnode_t vp); /*! @function vnode_isswap @@ -1052,7 +1049,7 @@ int vnode_isswap(vnode_t vp); @param vp The vnode to test. @return Nonzero if the vnode is a named stream, 0 otherwise. */ -int vnode_isnamedstream(vnode_t); +int vnode_isnamedstream(vnode_t vp); #endif /*! @@ -1062,24 +1059,22 @@ int vnode_isnamedstream(vnode_t); @param vp The vnode to test. @return Nonzero if the vnode is a block device on which an filesystem is mounted, 0 otherwise. */ -int vnode_ismountedon(vnode_t); +int vnode_ismountedon(vnode_t vp); /*! @function vnode_setmountedon @abstract Set flags indicating that a block device vnode has been mounted as a filesystem. @discussion A block device marked as being mounted on cannot be opened. @param vp The vnode to set flags on, a block device. - @return void. */ -void vnode_setmountedon(vnode_t); +void vnode_setmountedon(vnode_t vp); /*! @function vnode_clearmountedon @abstract Clear flags indicating that a block device vnode has been mounted as a filesystem. @param vp The vnode to clear flags on, a block device. 
- @return void. */ -void vnode_clearmountedon(vnode_t); +void vnode_clearmountedon(vnode_t vp); /*! @function vnode_isrecycled @@ -1088,7 +1083,7 @@ void vnode_clearmountedon(vnode_t); @param vp The vnode to test. @return Nonzero if vnode is dead or being recycled, 0 otherwise. */ -int vnode_isrecycled(vnode_t); +int vnode_isrecycled(vnode_t vp); /*! @function vnode_isnocache @@ -1096,7 +1091,7 @@ int vnode_isrecycled(vnode_t); @param vp The vnode to test. @return Nonzero if vnode is set to not have data chached, 0 otherwise. */ -int vnode_isnocache(vnode_t); +int vnode_isnocache(vnode_t vp); /*! @function vnode_israge @@ -1104,7 +1099,7 @@ int vnode_isnocache(vnode_t); @param vp The vnode to test. @return Nonzero if vnode is marked for rapid aging, 0 otherwise */ -int vnode_israge(vnode_t); +int vnode_israge(vnode_t vp); /*! @function vnode_needssnapshots @@ -1112,23 +1107,21 @@ int vnode_israge(vnode_t); @param vp The vnode to test. @return Nonzero if vnode needs snapshot events, 0 otherwise */ -int vnode_needssnapshots(vnode_t); +int vnode_needssnapshots(vnode_t vp); /*! @function vnode_setnocache @abstract Set a vnode to not have its data cached in memory (i.e. we write-through to disk and always read from disk). @param vp The vnode whose flags to set. - @return void. */ -void vnode_setnocache(vnode_t); +void vnode_setnocache(vnode_t vp); /*! @function vnode_clearnocache @abstract Clear the flag on a vnode indicating that data should not be cached in memory (i.e. we write-through to disk and always read from disk). @param vp The vnode whose flags to clear. - @return void. */ -void vnode_clearnocache(vnode_t); +void vnode_clearnocache(vnode_t vp); /*! @function vnode_isnoreadahead @@ -1136,75 +1129,67 @@ void vnode_clearnocache(vnode_t); @param vp The vnode to test. @return Nonzero if readahead is disabled, 0 otherwise. */ -int vnode_isnoreadahead(vnode_t); +int vnode_isnoreadahead(vnode_t vp); /*! 
@function vnode_setnoreadahead @abstract Set a vnode to not have data speculatively read in in hopes of hitting in cache. @param vp The vnode on which to prevent readahead. - @return void. */ -void vnode_setnoreadahead(vnode_t); +void vnode_setnoreadahead(vnode_t vp); /*! @function vnode_clearnoreadahead @abstract Clear the flag indicating that a vnode should not have data speculatively read in. @param vp The vnode whose flag to clear. - @return void. */ -void vnode_clearnoreadahead(vnode_t); +void vnode_clearnoreadahead(vnode_t vp); /*! @function vnode_isfastdevicecandidate @abstract Check if a vnode is a candidate to store on the fast device of a composite disk system @param vp The vnode which you want to test. @return Nonzero if the vnode is marked as a fast-device candidate - @return void. */ -int vnode_isfastdevicecandidate(vnode_t); +int vnode_isfastdevicecandidate(vnode_t vp); /*! @function vnode_setfastdevicecandidate @abstract Mark a vnode as a candidate to store on the fast device of a composite disk system - @abstract If the vnode is a directory, all its children will inherit this bit. + @discussion If the vnode is a directory, all its children will inherit this bit. @param vp The vnode which you want marked. - @return void. */ -void vnode_setfastdevicecandidate(vnode_t); +void vnode_setfastdevicecandidate(vnode_t vp); /*! @function vnode_clearfastdevicecandidate @abstract Clear the status of a vnode being a candidate to store on the fast device of a composite disk system. @param vp The vnode whose flag to clear. - @return void. */ -void vnode_clearfastdevicecandidate(vnode_t); +void vnode_clearfastdevicecandidate(vnode_t vp); /*! @function vnode_isautocandidate @abstract Check if a vnode was automatically selected to be fast-dev candidate (see vnode_setfastdevicecandidate) @param vp The vnode which you want to test. @return Nonzero if the vnode was automatically marked as a fast-device candidate - @return void. 
*/ -int vnode_isautocandidate(vnode_t); +int vnode_isautocandidate(vnode_t vp); /*! @function vnode_setfastdevicecandidate @abstract Mark a vnode as an automatically selected candidate for storing on the fast device of a composite disk system - @abstract If the vnode is a directory, all its children will inherit this bit. + @discussion If the vnode is a directory, all its children will inherit this bit. @param vp The vnode which you want marked. - @return void. */ -void vnode_setautocandidate(vnode_t); +void vnode_setautocandidate(vnode_t vp); /*! @function vnode_clearautocandidate @abstract Clear the status of a vnode being an automatic candidate (see above) @param vp The vnode whose flag to clear. - @return void. */ -void vnode_clearautocandidate(vnode_t); +void vnode_clearautocandidate(vnode_t vp); /* left only for compat reasons as User code depends on this from getattrlist, for ex */ @@ -1213,9 +1198,8 @@ void vnode_clearautocandidate(vnode_t); @abstract Set a vnode filesystem-specific "tag." @discussion Sets a tag indicating which filesystem a vnode belongs to, e.g. VT_HFS, VT_UDF, VT_ZFS. The kernel never inspects this data, though the filesystem tags are defined in vnode.h; it is for the benefit of user programs via getattrlist. @param vp The vnode whose tag to set. - @return void. */ -void vnode_settag(vnode_t, int); +void vnode_settag(vnode_t vp, int tag); /*! @function vnode_tag @@ -1224,7 +1208,7 @@ void vnode_settag(vnode_t, int); @param vp The vnode whose tag to grab. @return The tag. */ -int vnode_tag(vnode_t); +int vnode_tag(vnode_t vp); /*! @function vnode_getattr @@ -1261,7 +1245,6 @@ vnode_t vfs_rootvnode(void); @abstract Clear out cached credentials on a vnode. @discussion When we authorize an action on a vnode, we cache the credential that was authorized and the actions it was authorized for in case a similar request follows. This function destroys that caching. @param vp The vnode whose cache to clear. - @return void. 
*/ void vnode_uncache_credentials(vnode_t vp); @@ -1270,7 +1253,6 @@ void vnode_uncache_credentials(vnode_t vp); @abstract Mark a vnode as being reachable by multiple paths, i.e. as a hard link. @discussion "Multipath" vnodes can be reached through more than one entry in the filesystem, and so must be handled differently for caching and event notification purposes. A filesystem should mark a vnode with multiple hardlinks this way. @param vp The vnode to mark. - @return void. */ void vnode_setmultipath(vnode_t vp); @@ -1280,7 +1262,7 @@ void vnode_setmultipath(vnode_t vp); @param vp The vnode for which to get filesystem symlink size cap. @return Max symlink length. */ -uint32_t vnode_vfsmaxsymlen(vnode_t); +uint32_t vnode_vfsmaxsymlen(vnode_t vp); /*! @function vnode_vfsisrdonly @@ -1288,7 +1270,7 @@ uint32_t vnode_vfsmaxsymlen(vnode_t); @param vp The vnode for which to get filesystem writeability. @return Nonzero if the filesystem is read-only, 0 otherwise. */ -int vnode_vfsisrdonly(vnode_t); +int vnode_vfsisrdonly(vnode_t vp); /*! @function vnode_vfstypenum @@ -1297,16 +1279,15 @@ int vnode_vfsisrdonly(vnode_t); @param vp The vnode whose filesystem to examine. @return The type number of the fileystem to which the vnode belongs. */ -int vnode_vfstypenum(vnode_t); +int vnode_vfstypenum(vnode_t vp); /*! @function vnode_vfsname @abstract Get the name of the filesystem to which a vnode belongs. @param vp The vnode whose filesystem to examine. @param buf Destination for vfs name: should have size MFSNAMELEN or greater. - @return The name of the fileystem to which the vnode belongs. */ -void vnode_vfsname(vnode_t, char *); +void vnode_vfsname(vnode_t vp, char *buf); /*! @function vnode_vfs64bitready @@ -1314,7 +1295,7 @@ void vnode_vfsname(vnode_t, char *); @param vp The vnode whose filesystem to examine. @return Nonzero if filesystem is marked ready for 64-bit interactions; 0 otherwise. 
*/ -int vnode_vfs64bitready(vnode_t); +int vnode_vfs64bitready(vnode_t vp); /* These should move to private ... not documenting for now */ int vfs_context_get_special_port(vfs_context_t, int, ipc_port_t *); @@ -1326,7 +1307,7 @@ int vfs_context_set_special_port(vfs_context_t, int, ipc_port_t); @param ctx Context whose associated process to find. @return Process if available, NULL otherwise. */ -proc_t vfs_context_proc(vfs_context_t); +proc_t vfs_context_proc(vfs_context_t ctx); /*! @function vfs_context_ucred @@ -1335,7 +1316,7 @@ proc_t vfs_context_proc(vfs_context_t); @param ctx Context whose associated process to find. @returns credential if process available; NULL otherwise */ -kauth_cred_t vfs_context_ucred(vfs_context_t); +kauth_cred_t vfs_context_ucred(vfs_context_t ctx); /*! @function vfs_context_pid @@ -1343,7 +1324,7 @@ kauth_cred_t vfs_context_ucred(vfs_context_t); @param ctx Context whose associated process to find. @return Process id. */ -int vfs_context_pid(vfs_context_t); +int vfs_context_pid(vfs_context_t ctx); /*! @function vfs_context_issignal @@ -1352,7 +1333,7 @@ int vfs_context_pid(vfs_context_t); @param ctx Context whose associated process to find. @return Bitfield of pending signals. */ -int vfs_context_issignal(vfs_context_t, sigset_t); +int vfs_context_issignal(vfs_context_t ctx, sigset_t mask); /*! @function vfs_context_suser @@ -1360,7 +1341,7 @@ int vfs_context_issignal(vfs_context_t, sigset_t); @param ctx Context to examine. @return Nonzero if context belongs to superuser, 0 otherwise. */ -int vfs_context_suser(vfs_context_t); +int vfs_context_suser(vfs_context_t ctx); /*! @function vfs_context_is64bit @@ -1368,7 +1349,7 @@ int vfs_context_suser(vfs_context_t); @param ctx Context to examine. @return Nonzero if context is of 64-bit process, 0 otherwise. */ -int vfs_context_is64bit(vfs_context_t); +int vfs_context_is64bit(vfs_context_t ctx); /*! 
@function vfs_context_create @@ -1377,7 +1358,7 @@ int vfs_context_is64bit(vfs_context_t); @param ctx Context to copy, or NULL to use information from running thread. @return The new context, or NULL in the event of failure. */ -vfs_context_t vfs_context_create(vfs_context_t); +vfs_context_t vfs_context_create(vfs_context_t ctx); /*! @function vfs_context_rele @@ -1386,7 +1367,7 @@ vfs_context_t vfs_context_create(vfs_context_t); @param ctx Context to release. @return Always 0. */ -int vfs_context_rele(vfs_context_t); +int vfs_context_rele(vfs_context_t ctx); /*! @function vfs_context_current @@ -1483,7 +1464,7 @@ int vnode_getwithvid_drainok(vnode_t, uint32_t); recycled. An iocount is required for any operation on a vnode. @return 0 for success, ENOENT if the vnode is dead, in the process of being reclaimed, or has been recycled and reused. */ -int vnode_getwithref(vnode_t); +int vnode_getwithref(vnode_t vp); /*! @function vnode_put @@ -1493,7 +1474,7 @@ int vnode_getwithref(vnode_t); @param vp The vnode whose iocount to drop. @return Always 0. */ -int vnode_put(vnode_t); +int vnode_put(vnode_t vp); /*! @function vnode_ref @@ -1506,7 +1487,7 @@ int vnode_put(vnode_t); @param vp The vnode on which to obtain a persistent reference. @return 0 for success; ENOENT if the vnode is dead or in the process of being recycled AND the calling thread is not the vnode owner. */ -int vnode_ref(vnode_t); +int vnode_ref(vnode_t vp); /*! @function vnode_rele @@ -1515,9 +1496,8 @@ int vnode_ref(vnode_t); opens the door for a vnode to be reused as a new file; it also triggers a VNOP_INACTIVE call to the filesystem, though that will not happen immediately if there are outstanding iocount references. @param vp The vnode whose usecount to drop. - @return void. */ -void vnode_rele(vnode_t); +void vnode_rele(vnode_t vp); /*! @function vnode_isinuse @@ -1530,9 +1510,8 @@ void vnode_rele(vnode_t); may no longer be correct the very moment that the caller receives it. 
@param vp The vnode whose use-status to check. @param refcnt The threshold for saying that a vnode is in use. - @return void. */ -int vnode_isinuse(vnode_t, int); +int vnode_isinuse(vnode_t vp, int refcnt); /*! @function vnode_recycle @@ -1542,7 +1521,7 @@ int vnode_isinuse(vnode_t, int); @param vp The vnode to recycle. @return 1 if the vnode was reclaimed (i.e. there were no existing references), 0 if it was only marked for future reclaim. */ -int vnode_recycle(vnode_t); +int vnode_recycle(vnode_t vp); #ifdef KERNEL_PRIVATE @@ -1579,7 +1558,7 @@ int vnode_recycle(vnode_t); @discussion Will not reenter the filesystem. @return Zero if not monitored, nonzero if monitored. */ -int vnode_ismonitored(vnode_t); +int vnode_ismonitored(vnode_t vp); /*! @@ -1589,7 +1568,7 @@ int vnode_ismonitored(vnode_t); @discussion Will not reenter the filesystem. @return nonzero if a dyld shared cache file, zero otherwise. */ -int vnode_isdyldsharedcache(vnode_t); +int vnode_isdyldsharedcache(vnode_t vp); /*! @@ -1626,7 +1605,6 @@ int vn_getpath_fsenter(struct vnode *vp, char *pathbuf, int *len); @param name_hashval Hash value of name, if known. Passing 0 causes the cache to hash the name itself. @param flags VNODE_UPDATE_PARENT: set parent. VNODE_UPDATE_NAME: set name. VNODE_UPDATE_CACHE: flush cache entries for hard links associated with this file. VNODE_UPDATE_PURGE: flush cache entries for hard links and children of this file. - @return void. */ void vnode_update_identity(vnode_t vp, vnode_t dvp, const char *name, int name_len, uint32_t name_hashval, int flags); @@ -1650,7 +1628,7 @@ int vn_bwrite(struct vnop_bwrite_args *ap); @param ctx Context for which to authorize actions. @return EACCESS if permission is denied. 0 if operation allowed. Various errors from lower layers. */ -int vnode_authorize(vnode_t /*vp*/, vnode_t /*dvp*/, kauth_action_t, vfs_context_t); +int vnode_authorize(vnode_t vp, vnode_t dvp, kauth_action_t action, vfs_context_t ctx); /*! 
@function vnode_authattr @@ -1664,7 +1642,7 @@ int vnode_authorize(vnode_t /*vp*/, vnode_t /*dvp*/, kauth_action_t, vfs_context @param ctx Context for which to authorize actions. @return 0 (and a result in "actionp" for success. Otherwise, an error code. */ -int vnode_authattr(vnode_t, struct vnode_attr *, kauth_action_t *, vfs_context_t); +int vnode_authattr(vnode_t vp, struct vnode_attr *vap, kauth_action_t *actionp, vfs_context_t ctx); /*! @function vnode_authattr_new @@ -1677,7 +1655,7 @@ int vnode_authattr(vnode_t, struct vnode_attr *, kauth_action_t *, vfs_context_t @param ctx Context for which to authorize actions. @return KAUTH_RESULT_ALLOW for success, an error to indicate invalid or disallowed attributes. */ -int vnode_authattr_new(vnode_t /*dvp*/, struct vnode_attr *, int /*noauth*/, vfs_context_t); +int vnode_authattr_new(vnode_t dvp, struct vnode_attr *vap, int noauth, vfs_context_t ctx); /*! @function vnode_close @@ -1689,7 +1667,7 @@ int vnode_authattr_new(vnode_t /*dvp*/, struct vnode_attr *, int /*noauth*/, vfs @param ctx Context against which to validate operation. @return 0 for success or an error from the filesystem. */ -errno_t vnode_close(vnode_t, int, vfs_context_t); +errno_t vnode_close(vnode_t vp, int flags, vfs_context_t ctx); /*! @function vn_getpath @@ -1716,7 +1694,7 @@ int vn_getpath(struct vnode *vp, char *pathbuf, int *len); Will not reenter the filesystem. @return 0 for success, else an error code. */ -int vnode_notify(vnode_t, uint32_t, struct vnode_attr*); +int vnode_notify(vnode_t vp, uint32_t events, struct vnode_attr *vap); /*! @function vfs_get_notify_attributes @@ -1742,7 +1720,7 @@ int vfs_get_notify_attributes(struct vnode_attr *vap); @param flags VNODE_LOOKUP_NOFOLLOW: do not follow symbolic links. VNODE_LOOKUP_NOCROSSMOUNT: do not cross mount points. @return Results 0 for success or an error code. 
*/ -errno_t vnode_lookup(const char *, int, vnode_t *, vfs_context_t); +errno_t vnode_lookup(const char *path, int flags, vnode_t *vpp, vfs_context_t ctx); /*! @function vnode_open @@ -1758,7 +1736,7 @@ errno_t vnode_lookup(const char *, int, vnode_t *, vfs_context_t); @param ctx Context with which to authorize open/creation. @return 0 for success or an error code. */ -errno_t vnode_open(const char *, int, int, int, vnode_t *, vfs_context_t); +errno_t vnode_open(const char *path, int fmode, int cmode, int flags, vnode_t *vpp, vfs_context_t ctx); /* * exported vnode operations @@ -1785,7 +1763,7 @@ errno_t vnode_open(const char *, int, int, int, vnode_t *, vfs_context_t); @return Zero for success, else an error code. Will return 0 immediately if there are no vnodes hooked into the mount. @discussion Skips vnodes which are dead, in the process of reclaim, suspended, or of type VNON. */ -int vnode_iterate(struct mount *, int, int (*)(struct vnode *, void *), void *); +int vnode_iterate(struct mount *mp, int flags, int (*callout)(struct vnode *, void *), void *arg); /* * flags passed into vnode_iterate @@ -1824,7 +1802,7 @@ int vnode_iterate(struct mount *, int, int (*)(struct vnode *, void *), void *); @param ctx Context against which to validate operation. @return 0 always. */ -int vn_revoke(vnode_t vp, int flags, vfs_context_t); +int vn_revoke(vnode_t vp, int flags, vfs_context_t ctx); /* namecache function prototypes */ /*! @@ -1850,7 +1828,6 @@ int cache_lookup(vnode_t dvp, vnode_t *vpp, struct componentname *cnp); @param vp File to add to cache. A non-NULL vp is stored for rapid access; a NULL vp indicates that there is no such file in the directory and speeds future failed lookups. @param cnp Various data about lookup, e.g. filename and intended operation. - @return void. 
*/ void cache_enter(vnode_t dvp, vnode_t vp, struct componentname *cnp); @@ -1860,7 +1837,6 @@ void cache_enter(vnode_t dvp, vnode_t vp, struct componentname *cnp); @discussion Will flush all hardlinks to the vnode as well as all children (should any exist). Logical to use when cached data about a vnode becomes invalid, for instance in an unlink. @param vp The vnode to purge. - @return void. */ void cache_purge(vnode_t vp); @@ -1870,7 +1846,6 @@ void cache_purge(vnode_t vp); @discussion Appropriate to use when negative cache information for a directory could have become invalid, e.g. after file creation. @param vp The vnode whose negative children to purge. - @return void. */ void cache_purge_negatives(vnode_t vp); @@ -1948,7 +1923,7 @@ int vn_searchfs_inappropriate_name(const char *name, int len); @param p Process requesting I/O. @return 0 for success; errors from filesystem, and EIO if did not perform all requested I/O and the "aresid" parameter is NULL. */ -int vn_rdwr(enum uio_rw, vnode_t, caddr_t, int, off_t, enum uio_seg, int, kauth_cred_t, int *, proc_t); +int vn_rdwr(enum uio_rw rw, struct vnode *vp, caddr_t base, int len, off_t offset, enum uio_seg segflg, int ioflg, kauth_cred_t cred, int *aresid, proc_t p); /*! @function vnode_getname @@ -1965,7 +1940,6 @@ const char *vnode_getname(vnode_t vp); @abstract Release a reference on a name from the VFS cache. @discussion Should be called on a string obtained with vnode_getname(). @param name String to release. - @return void. */ void vnode_putname(const char *name); @@ -1984,19 +1958,19 @@ vnode_t vnode_getparent(vnode_t vp); @function vnode_setdirty @abstract Mark the vnode as having data or metadata that needs to be written out during reclaim @discussion The vnode should be marked as dirty anytime a file system defers flushing of data or meta-data associated with it. - @param the vnode to mark as dirty + @param vp the vnode to mark as dirty @return 0 if successful else an error code. 
*/ -int vnode_setdirty(vnode_t); +int vnode_setdirty(vnode_t vp); /*! @function vnode_cleardirty @abstract Mark the vnode as clean i.e. all its data or metadata has been flushed @discussion The vnode should be marked as clean whenever the file system is done flushing data or meta-data associated with it. - @param the vnode to clear as being dirty + @param vp the vnode to clear as being dirty @return 0 if successful else an error code. */ -int vnode_cleardirty(vnode_t); +int vnode_cleardirty(vnode_t vp); /*! @function vnode_isdirty @@ -2005,9 +1979,7 @@ int vnode_cleardirty(vnode_t); @param vp the vnode to test. @return Non-zero if the vnode is dirty, 0 otherwise. */ -int vnode_isdirty(vnode_t); - - +int vnode_isdirty(vnode_t vp); #ifdef KERNEL_PRIVATE /*! @@ -2030,10 +2002,20 @@ int vnode_lookup_continue_needed(vnode_t vp, struct componentname *cnp); */ int vnode_istty(vnode_t vp); +/*! + @function bdevvp + @abstract create a vnode for a given dev_t + @result non-zero to indicate failure, vnode provided in *vpp arg + */ +int bdevvp (dev_t dev, struct vnode **vpp); + /* - * Get the context for the first kernel thread (private SPI) + @function vnode_getfromfd + @abstract get a vnode from a file descriptor + @result non-zero to indicate failure, vnode provided in *vpp arg */ -vfs_context_t vfs_context_kernel(void); /* get from 1st kernel thread */ +int vnode_getfromfd (vfs_context_t ctx, int fd, vnode_t *vpp); + #endif /* KERNEL_PRIVATE */ #ifdef BSD_KERNEL_PRIVATE @@ -2046,8 +2028,6 @@ int vaccess(mode_t file_mode, uid_t uid, gid_t gid, int check_mountedon(dev_t dev, enum vtype type, int *errorp); int vn_getcdhash(struct vnode *vp, off_t offset, unsigned char *cdhash); void vnode_reclaim(vnode_t); -int vfs_context_issuser(vfs_context_t); -vnode_t vfs_context_cwd(vfs_context_t); vnode_t current_rootdir(void); vnode_t current_workingdir(void); void *vnode_vfsfsprivate(vnode_t); @@ -2070,32 +2050,30 @@ boolean_t vnode_on_reliable_media(vnode_t); */ vnode_t 
vnode_parent(vnode_t); void vnode_setparent(vnode_t, vnode_t); +void vnode_setname(vnode_t, char *); +/* XXX temporary until we can arrive at a KPI for NFS, Seatbelt */ +thread_t vfs_context_thread(vfs_context_t); +#if CONFIG_IOSCHED +vnode_t vnode_mountdevvp(vnode_t); +#endif +#endif /* BSD_KERNEL_PRIVATE */ + +#ifdef KERNEL_PRIVATE /*! @function vnode_getname_printable @abstract Get a non-null printable name of a vnode. - @Used to make sure a printable name is returned for all vnodes. If a name exists or can be artificially created, the routine creates a new entry in the VFS namecache. Otherwise, the function returns an artificially created vnode name which is safer and easier to use. vnode_putname_printable() should be used to release names obtained by this routine. + @Used to make sure a printable name is returned for all vnodes. If a name exists or can be artificially created, the routine creates a new entry in the VFS namecache. Otherwise, the function returns an artificially created vnode name which is safer and easier to use. vnode_putname_printable() should be used to release names obtained by this routine. @param vp The vnode whose name to grab. @return The printable name. */ const char *vnode_getname_printable(vnode_t vp); - /*! @function vnode_putname_printable @abstract Release a reference on a name from the VFS cache if it was added by the matching vnode_getname_printable() call. @param name String to release. - @return void. 
*/ void vnode_putname_printable(const char *name); -void vnode_setname(vnode_t, char *); -int vnode_isnoflush(vnode_t); -void vnode_setnoflush(vnode_t); -void vnode_clearnoflush(vnode_t); -/* XXX temporary until we can arrive at a KPI for NFS, Seatbelt */ -thread_t vfs_context_thread(vfs_context_t); -#if CONFIG_IOSCHED -vnode_t vnode_mountdevvp(vnode_t); -#endif -#endif /* BSD_KERNEL_PRIVATE */ +#endif // KERNEL_PRIVATE /* * Helper functions for implementing VNOP_GETATTRLISTBULK for a filesystem @@ -2108,18 +2086,18 @@ vnode_t vnode_mountdevvp(vnode_t); @param alp Pointer to attribute list structure. @param vap Pointer to vnode_attr structure. @param obj_vtype Type of object - If VNON is passed, then the type is ignored and common, file and dir attrs are used to initialise the vattrs. If set to VDIR, only common and directory attributes are used. For all other types, only common and file attrbutes are used. - @param attr_fixed_sizep. Returns the fixed length required in the attrbute buffer for the object. NULL should be passed if it is not required. + @param attr_fixed_sizep Returns the fixed length required in the attrbute buffer for the object. NULL should be passed if it is not required. @param ctx vfs context of caller. @return error. */ -errno_t vfs_setup_vattr_from_attrlist(struct attrlist * /* alp */, struct vnode_attr * /* vap */, enum vtype /* obj_vtype */, ssize_t * /* attr_fixed_sizep */, vfs_context_t /* ctx */); +errno_t vfs_setup_vattr_from_attrlist(struct attrlist *alp, struct vnode_attr *vap, enum vtype obj_vtype, ssize_t *attr_fixed_sizep, vfs_context_t ctx); /*! @function vfs_attr_pack @abstract Pack a vnode_attr structure into a buffer in the same format as getattrlist(2). @Used by a VNOP_GETATTRLISTBULK implementation to pack data provided into a vnode_attr structure into a buffer the way getattrlist(2) does. 
@param vp If available, the vnode for which the attributes are being given, NULL if vnode is not available (which will usually be the case for a VNOP_GETATTRLISTBULK implementation. - @param auio - a uio_t initialised with one iovec.. + @param uio - a uio_t initialised with one iovec.. @param alp - Pointer to an attrlist structure. @param options - options for call (same as options for getattrlistbulk(2)). @param vap Pointer to a filled in vnode_attr structure. Data from the vnode_attr structure will be used to copy and lay out the data in the required format for getatrlistbulk(2) by this function. @@ -2127,7 +2105,41 @@ errno_t vfs_setup_vattr_from_attrlist(struct attrlist * /* alp */, struct vnode_ @param ctx vfs context of caller. @return error. */ -errno_t vfs_attr_pack(vnode_t /* vp */, uio_t /* uio */, struct attrlist * /* alp */, uint64_t /* options */, struct vnode_attr * /* vap */, void * /* fndesc */, vfs_context_t /* ctx */); +errno_t vfs_attr_pack(vnode_t vp, uio_t uio, struct attrlist *alp, uint64_t options, struct vnode_attr *vap, void *fndesc, vfs_context_t ctx); + +#ifdef KERNEL_PRIVATE + +// Returns a value suitable, safe and consistent for tracing and logging +vm_offset_t kdebug_vnode(vnode_t vp); +int vn_pathconf(vnode_t, int, int32_t *, vfs_context_t); +int vnode_should_flush_after_write(vnode_t vp, int ioflag); +void vfs_setowner(mount_t mp, uid_t uid, gid_t gid); +uint64_t vfs_idle_time(mount_t mp); +// Required until XsanFS is fixed... 
+#ifndef vnode_usecount +int vnode_usecount(vnode_t vp); +#endif +int vnode_iocount(vnode_t vp); +void vnode_rele_ext(vnode_t, int, int); +int is_package_name(const char *name, int len); +int vfs_context_issuser(vfs_context_t); +int vfs_context_iskernel(vfs_context_t); +vfs_context_t vfs_context_kernel(void); /* get from 1st kernel thread */ +vnode_t vfs_context_cwd(vfs_context_t); +int vnode_isnoflush(vnode_t); +void vnode_setnoflush(vnode_t); +void vnode_clearnoflush(vnode_t); + +#define BUILDPATH_NO_FS_ENTER 0x1 /* Use cache values, do not enter file system */ +#define BUILDPATH_CHECKACCESS 0x2 /* Check if parents have search rights */ +#define BUILDPATH_CHECK_MOVED 0x4 /* Return EAGAIN if the parent hierarchy is modified */ +#define BUILDPATH_VOLUME_RELATIVE 0x8 /* Return path relative to the nearest mount point */ + +int build_path(vnode_t first_vp, char *buff, int buflen, int *outlen, int flags, vfs_context_t ctx); + +int vnode_issubdir(vnode_t vp, vnode_t dvp, int *is_subdir, vfs_context_t ctx); + +#endif // KERNEL_PRIVATE __END_DECLS diff --git a/bsd/sys/vnode_if.h b/bsd/sys/vnode_if.h index f2a0be407..2123aa04e 100644 --- a/bsd/sys/vnode_if.h +++ b/bsd/sys/vnode_if.h @@ -1,6 +1,5 @@ - /* - * Copyright (c) 2000-2014 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Computer, Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -119,6 +118,7 @@ extern struct vnodeop_desc vnop_fsync_desc; extern struct vnodeop_desc vnop_remove_desc; extern struct vnodeop_desc vnop_link_desc; extern struct vnodeop_desc vnop_rename_desc; +extern struct vnodeop_desc vnop_renamex_desc; extern struct vnodeop_desc vnop_mkdir_desc; extern struct vnodeop_desc vnop_rmdir_desc; extern struct vnodeop_desc vnop_symlink_desc; @@ -137,6 +137,7 @@ extern struct vnodeop_desc vnop_pagein_desc; extern struct vnodeop_desc vnop_pageout_desc; extern struct vnodeop_desc vnop_searchfs_desc; extern struct vnodeop_desc vnop_copyfile_desc; +extern struct vnodeop_desc vnop_clonefile_desc; extern struct vnodeop_desc vnop_blktooff_desc; extern struct vnodeop_desc vnop_offtoblk_desc; extern struct vnodeop_desc vnop_blockmap_desc; @@ -437,7 +438,7 @@ struct vnop_read_args { @return 0 for success or a filesystem-specific error. VNOP_READ() can return success even if less data was read than originally requested; returning an error value should indicate that something actually went wrong. */ -extern errno_t VNOP_READ(vnode_t, struct uio *, int, vfs_context_t); +extern errno_t VNOP_READ(vnode_t vp, struct uio *uio, int, vfs_context_t ctx); struct vnop_write_args { struct vnodeop_desc *a_desc; @@ -461,7 +462,7 @@ struct vnop_write_args { @return 0 for success or a filesystem-specific error. VNOP_WRITE() can return success even if less data was written than originally requested; returning an error value should indicate that something actually went wrong. */ -extern errno_t VNOP_WRITE(vnode_t, struct uio *, int, vfs_context_t); +extern errno_t VNOP_WRITE(vnode_t vp, struct uio *uio, int ioflag, vfs_context_t ctx); struct vnop_ioctl_args { struct vnodeop_desc *a_desc; @@ -489,7 +490,7 @@ struct vnop_ioctl_args { @param ctx Context against which to authenticate ioctl request. @return 0 for success or a filesystem-specific error. 
*/ -extern errno_t VNOP_IOCTL(vnode_t, u_long, caddr_t, int, vfs_context_t); +extern errno_t VNOP_IOCTL(vnode_t vp, u_long command, caddr_t data, int fflag, vfs_context_t ctx); struct vnop_select_args { struct vnodeop_desc *a_desc; @@ -620,7 +621,7 @@ struct vnop_fsync_args { @param ctx Context to authenticate for fsync request. @return 0 for success, else an error code. */ -extern errno_t VNOP_FSYNC(vnode_t, int, vfs_context_t); +extern errno_t VNOP_FSYNC(vnode_t vp, int waitfor, vfs_context_t ctx); struct vnop_remove_args { struct vnodeop_desc *a_desc; @@ -718,6 +719,49 @@ struct vnop_rename_args { extern errno_t VNOP_RENAME(vnode_t, vnode_t, struct componentname *, vnode_t, vnode_t, struct componentname *, vfs_context_t); #endif /* XNU_KERNEL_PRIVATE */ +typedef unsigned int vfs_rename_flags_t; + +// Must match sys/stdio.h +enum { + VFS_RENAME_SECLUDE = 0x00000001, + VFS_RENAME_SWAP = 0x00000002, + VFS_RENAME_EXCL = 0x00000004, + + VFS_RENAME_FLAGS_MASK = (VFS_RENAME_SECLUDE | VFS_RENAME_SWAP + | VFS_RENAME_EXCL), +}; + +struct vnop_renamex_args { + struct vnodeop_desc *a_desc; + vnode_t a_fdvp; + vnode_t a_fvp; + struct componentname *a_fcnp; + vnode_t a_tdvp; + vnode_t a_tvp; + struct componentname *a_tcnp; + struct vnode_attr *a_vap; // Reserved for future use + vfs_rename_flags_t a_flags; + vfs_context_t a_context; +}; + +/*! + @function VNOP_RENAMEX + @abstract Call down to a filesystem to rename a file. + @discussion VNOP_RENAMEX() will only be called with a source and target on the same volume. + @param fdvp Directory in which source file resides. + @param fvp File being renamed. + @param fcnp Name information for source file. + @param tdvp Directory file is being moved to. + @param tvp Existing file with same name as target, should one exist. + @param tcnp Name information for target path. + @param flags Control certain rename semantics. + @param ctx Context to authenticate for rename request. + @return 0 for success, else an error code. 
+ */ +#ifdef XNU_KERNEL_PRIVATE +extern errno_t VNOP_RENAMEX(vnode_t, vnode_t, struct componentname *, vnode_t, vnode_t, struct componentname *, vfs_rename_flags_t, vfs_context_t); +#endif /* XNU_KERNEL_PRIVATE */ + #ifdef KERNEL_PRIVATE struct vnop_compound_rename_args { struct vnodeop_desc *a_desc; @@ -1282,6 +1326,36 @@ struct vnop_copyfile_args { extern errno_t VNOP_COPYFILE(vnode_t, vnode_t, vnode_t, struct componentname *, int, int, vfs_context_t); #endif /* XNU_KERNEL_PRIVATE */ +struct vnop_clonefile_args { + struct vnodeop_desc *a_desc; + vnode_t a_fvp; + vnode_t a_dvp; + vnode_t *a_vpp; + struct componentname *a_cnp; + struct vnode_attr *a_vap; + uint32_t a_flags; + vfs_context_t a_context; + /* XXX Add recursive directory cloning authorizer */ +}; + +/*! + @function VNOP_CLONEFILE + @abstract Call down to a filesystem to clone a filesystem object (regular file, directory or symbolic link.) + @discussion If file creation succeeds, "vpp" should be returned with an iocount to be dropped by the caller. + @param dvp Directory in which to clone object. + @param vpp Destination for vnode for newly cloned object. + @param cnp Description of name of object to clone. + @param vap File creation properties, as seen in vnode_getattr(). Manipulated with VATTR_ISACTIVE, VATTR_RETURN, + VATTR_SET_SUPPORTED, and so forth. All attributes not set here should either be copied + from the source object + or set to values which are used for creating new filesystem objects + @param ctx Context against which to authenticate file creation. + @return 0 for success or a filesystem-specific error. 
+ */ +#ifdef XNU_KERNEL_PRIVATE +extern errno_t VNOP_CLONEFILE(vnode_t, vnode_t, vnode_t *, struct componentname *, struct vnode_attr *, uint32_t, vfs_context_t); +#endif /* XNU_KERNEL_PRIVATE */ + struct vnop_getxattr_args { struct vnodeop_desc *a_desc; vnode_t a_vp; @@ -1304,7 +1378,7 @@ extern struct vnodeop_desc vnop_getxattr_desc; @param ctx Context to authenticate for getxattr request. @return 0 for success, or an error code. */ -extern errno_t VNOP_GETXATTR(vnode_t, const char *, uio_t, size_t *, int, vfs_context_t); +extern errno_t VNOP_GETXATTR(vnode_t vp, const char *name, uio_t uio, size_t *size, int options, vfs_context_t ctx); struct vnop_setxattr_args { struct vnodeop_desc *a_desc; @@ -1327,7 +1401,7 @@ extern struct vnodeop_desc vnop_setxattr_desc; @param ctx Context to authenticate for setxattr request. @return 0 for success, or an error code. */ -extern errno_t VNOP_SETXATTR(vnode_t, const char *, uio_t, int, vfs_context_t); +extern errno_t VNOP_SETXATTR(vnode_t vp, const char *name, uio_t uio, int options, vfs_context_t ctx); struct vnop_removexattr_args { struct vnodeop_desc *a_desc; @@ -1480,7 +1554,7 @@ struct vnop_bwrite_args { @param bp The buffer to write. @return 0 for success, else an error code. */ -extern errno_t VNOP_BWRITE(buf_t); +extern errno_t VNOP_BWRITE(buf_t bp); struct vnop_kqfilt_add_args { struct vnodeop_desc *a_desc; @@ -1545,7 +1619,7 @@ extern struct vnodeop_desc vnop_monitor_desc; Each BEGIN will be matched with an END with the same handle. Note that vnode_ismonitored() can be used to see if there are currently watchers for a file. 
*/ -errno_t VNOP_MONITOR(vnode_t , uint32_t, uint32_t, void*, vfs_context_t); +errno_t VNOP_MONITOR(vnode_t vp, uint32_t events, uint32_t flags, void *handle, vfs_context_t ctx); #endif /* XNU_KERNEL_PRIVATE */ struct label; @@ -1655,9 +1729,10 @@ struct vnop_removenamedstream_args { #ifdef XNU_KERNEL_PRIVATE extern errno_t VNOP_REMOVENAMEDSTREAM(vnode_t, vnode_t, const char *, int flags, vfs_context_t); #endif /* XNU_KERNEL_PRIVATE */ -#endif -#endif +#endif // NAMEDSTREAMS + +#endif // defined(__APPLE_API_UNSTABLE) __END_DECLS diff --git a/bsd/sys/vnode_internal.h b/bsd/sys/vnode_internal.h index 73722ba63..22750b6b9 100644 --- a/bsd/sys/vnode_internal.h +++ b/bsd/sys/vnode_internal.h @@ -128,8 +128,8 @@ struct vnode { lck_mtx_t v_lock; /* vnode mutex */ TAILQ_ENTRY(vnode) v_freelist; /* vnode freelist */ TAILQ_ENTRY(vnode) v_mntvnodes; /* vnodes for mount point */ + TAILQ_HEAD(, namecache) v_ncchildren; /* name cache entries that regard us as their parent */ LIST_HEAD(, namecache) v_nclinks; /* name cache entries that name this vnode */ - LIST_HEAD(, namecache) v_ncchildren; /* name cache entries that regard us as their parent */ vnode_t v_defer_reclaimlist; /* in case we have to defer the reclaim to avoid recursion */ uint32_t v_listflag; /* flags protected by the vnode_list_lock (see below) */ uint32_t v_flag; /* vnode flags (see below) */ @@ -397,19 +397,11 @@ extern struct vnodeop_desc *vnodeop_descs[]; struct ostat; -#define BUILDPATH_NO_FS_ENTER 0x1 /* Use cache values, do not enter file system */ -#define BUILDPATH_CHECKACCESS 0x2 /* Check if parents have search rights */ -#define BUILDPATH_CHECK_MOVED 0x4 /* Return EAGAIN if the parent hierarchy is modified */ -#define BUILDPATH_VOLUME_RELATIVE 0x8 /* Return path relative to the nearest mount point */ - -int build_path(vnode_t first_vp, char *buff, int buflen, int *outlen, int flags, vfs_context_t ctx); - -int bdevvp(dev_t dev, struct vnode **vpp); +/* bdevvp moved to vnode.h as private KPI */ void 
cvtstat(struct stat *st, struct ostat *ost); void vprint(const char *label, struct vnode *vp); -__private_extern__ int is_package_name(const char *name, int len); __private_extern__ int set_package_extensions_table(user_addr_t data, int nentries, int maxwidth); int vn_rdwr_64(enum uio_rw rw, struct vnode *vp, uint64_t base, int64_t len, off_t offset, enum uio_seg segflg, @@ -447,6 +439,9 @@ int vn_authorize_unlink(vnode_t dvp, vnode_t vp, struct componentname *cnp, vfs_ int vn_authorize_rename(struct vnode *fdvp, struct vnode *fvp, struct componentname *fcnp, struct vnode *tdvp, struct vnode *tvp, struct componentname *tcnp, vfs_context_t ctx, void *reserved); +int vn_authorize_renamex(struct vnode *fdvp, struct vnode *fvp, struct componentname *fcnp, + struct vnode *tdvp, struct vnode *tvp, struct componentname *tcnp, + vfs_context_t ctx, vfs_rename_flags_t flags, void *reserved); int vn_authorize_rmdir(vnode_t dvp, vnode_t vp, struct componentname *cnp, vfs_context_t ctx, void *reserved); typedef int (*vn_create_authorizer_t)(vnode_t, struct componentname *, struct vnode_attr *, vfs_context_t, void*); @@ -486,7 +481,6 @@ void name_cache_unlock(void); void cache_enter_with_gen(vnode_t dvp, vnode_t vp, struct componentname *cnp, int gen); const char *cache_enter_create(vnode_t dvp, vnode_t vp, struct componentname *cnp); -int vn_pathconf(vnode_t, int, int32_t *, vfs_context_t); extern int nc_disabled; #define vnode_lock_convert(v) lck_mtx_convert_spin(&(v)->v_lock) @@ -500,7 +494,6 @@ void vnode_list_unlock(void); #define VNODE_REF_FORCE 0x1 int vnode_ref_ext(vnode_t, int, int); -void vnode_rele_ext(vnode_t, int, int); void vnode_rele_internal(vnode_t, int, int, int); #ifdef BSD_KERNEL_PRIVATE int vnode_getalways(vnode_t); @@ -595,6 +588,8 @@ vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag, void vnode_setswapmount(vnode_t); int64_t vnode_getswappin_avail(vnode_t); +int vnode_get_snapdir(vnode_t , vnode_t *, vfs_context_t); + #if 
CONFIG_TRIGGERS /* VFS Internal Vnode Trigger Interfaces (Private) */ int vnode_trigger_resolve(vnode_t, struct nameidata *, vfs_context_t); diff --git a/bsd/uuid/Makefile b/bsd/uuid/Makefile index 1e5f59ecc..6f07c30e9 100644 --- a/bsd/uuid/Makefile +++ b/bsd/uuid/Makefile @@ -3,7 +3,6 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - include $(MakeInc_cmd) include $(MakeInc_def) @@ -15,7 +14,6 @@ DATAFILES = \ KERNELFILES = \ uuid.h - INSTALL_MI_LIST = ${DATAFILES} INSTALL_MI_DIR = uuid @@ -34,5 +32,3 @@ INSTALL_KF_MI_LIST = ${KERNELFILES} include $(MakeInc_rule) include $(MakeInc_dir) - - diff --git a/bsd/uuid/uuid.h b/bsd/uuid/uuid.h index 52602867e..65524909a 100644 --- a/bsd/uuid/uuid.h +++ b/bsd/uuid/uuid.h @@ -46,6 +46,8 @@ typedef __darwin_uuid_string_t uuid_string_t; #define UUID_DEFINE(name,u0,u1,u2,u3,u4,u5,u6,u7,u8,u9,u10,u11,u12,u13,u14,u15) \ static const uuid_t name __attribute__ ((unused)) = {u0,u1,u2,u3,u4,u5,u6,u7,u8,u9,u10,u11,u12,u13,u14,u15} +UUID_DEFINE(UUID_NULL, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + #ifdef __cplusplus extern "C" { #endif diff --git a/bsd/uxkern/ux_exception.c b/bsd/uxkern/ux_exception.c index 795a40808..21bd3eec9 100644 --- a/bsd/uxkern/ux_exception.c +++ b/bsd/uxkern/ux_exception.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -97,8 +97,8 @@ mach_port_name_t ux_exception_port; static task_t ux_handler_self; -static -void +__attribute__((noreturn)) +static void ux_handler(void) { task_t self = current_task(); @@ -333,7 +333,7 @@ catch_mach_exception_raise( ut->uu_exception = exception; //ut->uu_code = code[0]; // filled in by threadsignal ut->uu_subcode = code[1]; - threadsignal(th_act, ux_signal, code[0]); + threadsignal(th_act, ux_signal, code[0], TRUE); } if (p != NULL) proc_rele(p); diff --git a/bsd/vfs/Makefile b/bsd/vfs/Makefile index 68e740b9a..ee71a736c 100644 --- a/bsd/vfs/Makefile +++ b/bsd/vfs/Makefile @@ -3,7 +3,6 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - include $(MakeInc_cmd) include $(MakeInc_def) @@ -20,5 +19,3 @@ EXPORT_MI_DIR = vfs include $(MakeInc_rule) include $(MakeInc_dir) - - diff --git a/bsd/vfs/doc_tombstone.c b/bsd/vfs/doc_tombstone.c new file mode 100644 index 000000000..05120a56d --- /dev/null +++ b/bsd/vfs/doc_tombstone.c @@ -0,0 +1,179 @@ +/* + * Copyright (c) 2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +// -- Document ID Tombstone Support -- + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// +// This function gets the doc_tombstone structure for the +// current thread. If the thread doesn't have one, the +// structure is allocated. +// +struct doc_tombstone * +doc_tombstone_get(void) +{ + struct uthread *ut; + ut = get_bsdthread_info(current_thread()); + + if (ut->t_tombstone == NULL) { + ut->t_tombstone = kalloc(sizeof(struct doc_tombstone)); + if (ut->t_tombstone) { + memset(ut->t_tombstone, 0, sizeof(struct doc_tombstone)); + } + } + + return ut->t_tombstone; +} + +// +// This routine clears out the current tombstone for the +// current thread and if necessary passes the doc-id of +// the tombstone on to the dst_cnode. +// +// The caller is responsible for generating the appropriate +// fsevents. +// +void +doc_tombstone_clear(struct doc_tombstone *ut, vnode_t *old_vpp) +{ + uint32_t old_id = ut->t_lastop_document_id; + + ut->t_lastop_document_id = 0; + ut->t_lastop_parent = NULL; + ut->t_lastop_parent_vid = 0; + ut->t_lastop_filename[0] = '\0'; + + // + // If the lastop item is still the same and needs to be cleared, + // clear it. The following isn't ideal because the vnode might + // have been recycled. 
+ // + if (old_vpp) { + *old_vpp = NULL; + if (old_id && ut->t_lastop_item + && vnode_vid(ut->t_lastop_item) == ut->t_lastop_item_vid) { + int res = vnode_get(ut->t_lastop_item); + if (!res) { + // Need to check vid again + if (vnode_vid(ut->t_lastop_item) == ut->t_lastop_item_vid + && !ISSET(ut->t_lastop_item->v_lflag, VL_TERMINATE)) + *old_vpp = ut->t_lastop_item; + else + vnode_put(ut->t_lastop_item); + } + } + } + + // last, clear these now that we're all done + ut->t_lastop_item = NULL; + ut->t_lastop_fileid = 0; + ut->t_lastop_item_vid = 0; +} + + +// +// This function is used to filter out operations on temp +// filenames. We have to filter out operations on certain +// temp filenames to work-around questionable application +// behavior from apps like Autocad that perform unusual +// sequences of file system operations for a "safe save". +bool doc_tombstone_should_ignore_name(const char *nameptr, int len) +{ + if (len == 0) { + len = strlen(nameptr); + } + + if ( strncmp(nameptr, "atmp", 4) == 0 + || (len > 4 && strncmp(nameptr+len-4, ".bak", 4) == 0) + || (len > 4 && strncmp(nameptr+len-4, ".tmp", 4) == 0)) { + return true; + } + + return false; +} + +// +// Decide if we need to save a tombstone or not. Normally we always +// save a tombstone - but if there already is one and the name we're +// given is an ignorable name, then we will not save a tombstone. +// +bool doc_tombstone_should_save(struct doc_tombstone *ut, struct vnode *vp, + struct componentname *cnp) +{ + if (cnp->cn_nameptr == NULL) { + return false; + } + + if (ut->t_lastop_document_id && ut->t_lastop_item == vp + && doc_tombstone_should_ignore_name(cnp->cn_nameptr, cnp->cn_namelen)) { + return false; + } + + return true; +} + +// +// This function saves a tombstone for the given vnode and name. The +// tombstone represents the parent directory and name where the document +// used to live and the document-id of that file. 
This info is recorded +// in the doc_tombstone structure hanging off the uthread (which assumes +// that all safe-save operations happen on the same thread). +// +// If later on the same parent/name combo comes back into existence then +// we'll preserve the doc-id from this vnode onto the new vnode. +// +// The caller is responsible for generating the appropriate +// fsevents. +// +void +doc_tombstone_save(struct vnode *dvp, struct vnode *vp, + struct componentname *cnp, uint64_t doc_id, + ino64_t file_id) +{ + struct doc_tombstone *ut; + ut = doc_tombstone_get(); + + ut->t_lastop_parent = dvp; + ut->t_lastop_parent_vid = vnode_vid(dvp); + ut->t_lastop_fileid = file_id; + ut->t_lastop_item = vp; + ut->t_lastop_item_vid = vp ? vnode_vid(vp) : 0; + ut->t_lastop_document_id = doc_id; + + strlcpy((char *)&ut->t_lastop_filename[0], cnp->cn_nameptr, sizeof(ut->t_lastop_filename)); +} diff --git a/bsd/vfs/kpi_vfs.c b/bsd/vfs/kpi_vfs.c index 19a4be3d1..abe9556f7 100644 --- a/bsd/vfs/kpi_vfs.c +++ b/bsd/vfs/kpi_vfs.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2014 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -138,6 +138,8 @@ static void xattrfile_setattr(vnode_t dvp, const char * basename, struct vnode_attr * vap, vfs_context_t ctx); #endif /* CONFIG_APPLEDOUBLE */ +static errno_t post_rename(vnode_t fdvp, vnode_t fvp, vnode_t tdvp, vnode_t tvp); + /* * vnode_setneedinactive * @@ -233,7 +235,7 @@ VFS_UNMOUNT(mount_t mp, int flags, vfs_context_t ctx) * * The return codes documented above are those which may currently * be returned by HFS from hfs_vfs_root, which is a simple wrapper - * for a call to hfs_vget on the volume mount poit, not including + * for a call to hfs_vget on the volume mount point, not including * additional error codes which may be propagated from underlying * routines called by hfs_vget. 
*/ @@ -336,7 +338,7 @@ VFS_VGET(mount_t mp, ino64_t ino, struct vnode **vpp, vfs_context_t ctx) } int -VFS_FHTOVP(mount_t mp, int fhlen, unsigned char * fhp, vnode_t * vpp, vfs_context_t ctx) +VFS_FHTOVP(mount_t mp, int fhlen, unsigned char *fhp, vnode_t *vpp, vfs_context_t ctx) { int error; @@ -353,7 +355,7 @@ VFS_FHTOVP(mount_t mp, int fhlen, unsigned char * fhp, vnode_t * vpp, vfs_contex } int -VFS_VPTOFH(struct vnode * vp, int *fhlenp, unsigned char * fhp, vfs_context_t ctx) +VFS_VPTOFH(struct vnode *vp, int *fhlenp, unsigned char *fhp, vfs_context_t ctx) { int error; @@ -369,6 +371,31 @@ VFS_VPTOFH(struct vnode * vp, int *fhlenp, unsigned char * fhp, vfs_context_t ct return(error); } +int VFS_IOCTL(struct mount *mp, u_long command, caddr_t data, + int flags, vfs_context_t context) +{ + if (mp == dead_mountp || !mp->mnt_op->vfs_ioctl) + return ENOTSUP; + + return mp->mnt_op->vfs_ioctl(mp, command, data, flags, + context ?: vfs_context_current()); +} + +int +VFS_VGET_SNAPDIR(mount_t mp, vnode_t *vpp, vfs_context_t ctx) +{ + int error; + + if ((mp == dead_mountp) || (mp->mnt_op->vfs_vget_snapdir == 0)) + return(ENOTSUP); + + if (ctx == NULL) + ctx = vfs_context_current(); + + error = (*mp->mnt_op->vfs_vget_snapdir)(mp, vpp, ctx); + + return (error); +} /* returns the cached throttle mask for the mount_t */ uint64_t @@ -379,7 +406,7 @@ vfs_throttle_mask(mount_t mp) /* returns a copy of vfs type name for the mount_t */ void -vfs_name(mount_t mp, char * buffer) +vfs_name(mount_t mp, char *buffer) { strncpy(buffer, mp->mnt_vtable->vfc_name, MFSNAMELEN); } @@ -736,8 +763,10 @@ vfs_devvp(mount_t mp) void vfs_ioattr(mount_t mp, struct vfsioattr *ioattrp) { - if (mp == NULL) { - ioattrp->io_maxreadcnt = MAXPHYS; + ioattrp->io_reserved[0] = NULL; + ioattrp->io_reserved[1] = NULL; + if (mp == NULL) { + ioattrp->io_maxreadcnt = MAXPHYS; ioattrp->io_maxwritecnt = MAXPHYS; ioattrp->io_segreadcnt = 32; ioattrp->io_segwritecnt = 32; @@ -745,8 +774,9 @@ vfs_ioattr(mount_t mp, 
struct vfsioattr *ioattrp) ioattrp->io_maxsegwritesize = MAXPHYS; ioattrp->io_devblocksize = DEV_BSIZE; ioattrp->io_flags = 0; + ioattrp->io_max_swappin_available = 0; } else { - ioattrp->io_maxreadcnt = mp->mnt_maxreadcnt; + ioattrp->io_maxreadcnt = mp->mnt_maxreadcnt; ioattrp->io_maxwritecnt = mp->mnt_maxwritecnt; ioattrp->io_segreadcnt = mp->mnt_segreadcnt; ioattrp->io_segwritecnt = mp->mnt_segwritecnt; @@ -754,9 +784,8 @@ vfs_ioattr(mount_t mp, struct vfsioattr *ioattrp) ioattrp->io_maxsegwritesize = mp->mnt_maxsegwritesize; ioattrp->io_devblocksize = mp->mnt_devblocksize; ioattrp->io_flags = mp->mnt_ioflags; + ioattrp->io_max_swappin_available = mp->mnt_max_swappin_available; } - ioattrp->io_reserved[0] = NULL; - ioattrp->io_reserved[1] = NULL; } @@ -776,6 +805,7 @@ vfs_setioattr(mount_t mp, struct vfsioattr * ioattrp) mp->mnt_maxsegwritesize = ioattrp->io_maxsegwritesize; mp->mnt_devblocksize = ioattrp->io_devblocksize; mp->mnt_ioflags = ioattrp->io_flags; + mp->mnt_max_swappin_available = ioattrp->io_max_swappin_available; } /* @@ -787,7 +817,7 @@ vfs_setioattr(mount_t mp, struct vfsioattr * ioattrp) typedef int (*PFI)(void *); extern int vfs_opv_numops; errno_t -vfs_fsadd(struct vfs_fsentry *vfe, vfstable_t * handle) +vfs_fsadd(struct vfs_fsentry *vfe, vfstable_t *handle) { struct vfstable *newvfstbl = NULL; int i,j; @@ -854,6 +884,10 @@ vfs_fsadd(struct vfs_fsentry *vfe, vfstable_t * handle) newvfstbl->vfc_vfsflags |= VFC_VFSNOMACLABEL; if (vfe->vfe_flags & VFS_TBLVNOP_NOUPDATEID_RENAME) newvfstbl->vfc_vfsflags |= VFC_VFSVNOP_NOUPDATEID_RENAME; + if (vfe->vfe_flags & VFS_TBLVNOP_SECLUDE_RENAME) + newvfstbl->vfc_vfsflags |= VFC_VFSVNOP_SECLUDE_RENAME; + if (vfe->vfe_flags & VFS_TBLCANMOUNTROOT) + newvfstbl->vfc_vfsflags |= VFC_VFSCANMOUNTROOT; /* * Allocate and init the vectors. 
@@ -970,7 +1004,7 @@ vfs_fsadd(struct vfs_fsentry *vfe, vfstable_t * handle) * file system was added */ errno_t -vfs_fsremove(vfstable_t handle) +vfs_fsremove(vfstable_t handle) { struct vfstable * vfstbl = (struct vfstable *)handle; void *old_desc = NULL; @@ -1002,6 +1036,32 @@ vfs_fsremove(vfstable_t handle) return(err); } +void vfs_setowner(mount_t mp, uid_t uid, gid_t gid) +{ + mp->mnt_fsowner = uid; + mp->mnt_fsgroup = gid; +} + +/* + * Callers should be careful how they use this; accessing + * mnt_last_write_completed_timestamp is not thread-safe. Writing to + * it isn't either. Point is: be prepared to deal with strange values + * being returned. + */ +uint64_t vfs_idle_time(mount_t mp) +{ + if (mp->mnt_pending_write_size) + return 0; + + struct timeval now; + + microuptime(&now); + + return ((now.tv_sec + - mp->mnt_last_write_completed_timestamp.tv_sec) * 1000000 + + now.tv_usec - mp->mnt_last_write_completed_timestamp.tv_usec); +} + int vfs_context_pid(vfs_context_t ctx) { @@ -1295,6 +1355,11 @@ vfs_context_issuser(vfs_context_t ctx) return(kauth_cred_issuser(vfs_context_ucred(ctx))); } +int vfs_context_iskernel(vfs_context_t ctx) +{ + return ctx == &kerncontext; +} + /* * Given a context, for all fields of vfs_context_t which * are not held with a reference, set those fields to the @@ -1316,6 +1381,11 @@ vfs_context_bind(vfs_context_t ctx) return 0; } +int vfs_isswapmount(mount_t mnt) +{ + return mnt && ISSET(mnt->mnt_kern_flag, MNTK_SWAP_MOUNT) ? 
1 : 0; +} + /* XXXXXXXXXXXXXX VNODE KAPIS XXXXXXXXXXXXXXXXXXXXXXXXX */ @@ -2050,10 +2120,10 @@ vnode_get_filesec(vnode_t vp, kauth_filesec_t *fsecp, vfs_context_t ctx) fsec = NULL; fsec_uio = NULL; - error = 0; - + /* find out how big the EA is */ - if (vn_getxattr(vp, KAUTH_FILESEC_XATTR, NULL, &xsize, XATTR_NOSECURITY, ctx) != 0) { + error = vn_getxattr(vp, KAUTH_FILESEC_XATTR, NULL, &xsize, XATTR_NOSECURITY, ctx); + if (error != 0) { /* no EA, no filesec */ if ((error == ENOATTR) || (error == ENOENT) || (error == EJUSTRETURN)) error = 0; @@ -3762,7 +3832,7 @@ VNOP_LINK(vnode_t vp, vnode_t tdvp, struct componentname * cnp, vfs_context_t ct errno_t vn_rename(struct vnode *fdvp, struct vnode **fvpp, struct componentname *fcnp, struct vnode_attr *fvap, struct vnode *tdvp, struct vnode **tvpp, struct componentname *tcnp, struct vnode_attr *tvap, - uint32_t flags, vfs_context_t ctx) + vfs_rename_flags_t flags, vfs_context_t ctx) { int _err; struct nameidata *fromnd = NULL; @@ -3785,13 +3855,6 @@ vn_rename(struct vnode *fdvp, struct vnode **fvpp, struct componentname *fcnp, s panic("Not batched, and no fvp?"); } -#if CONFIG_SECLUDED_RENAME - if ((fcnp->cn_flags & CN_SECLUDE_RENAME) && - (((*fvpp)->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSVNOP_SECLUDE_RENAME) == 0)) { - return ENOTSUP; - } -#endif - #if CONFIG_APPLEDOUBLE /* * We need to preflight any potential AppleDouble file for the source file @@ -3875,7 +3938,17 @@ vn_rename(struct vnode *fdvp, struct vnode **fvpp, struct componentname *fcnp, s printf("VNOP_COMPOUND_RENAME() returned %d\n", _err); } } else { - _err = VNOP_RENAME(fdvp, *fvpp, fcnp, tdvp, *tvpp, tcnp, ctx); + if (flags) { + _err = VNOP_RENAMEX(fdvp, *fvpp, fcnp, tdvp, *tvpp, tcnp, flags, ctx); + if (_err == ENOTSUP && flags == VFS_RENAME_SECLUDE) { + // Legacy... 
+ if ((*fvpp)->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSVNOP_SECLUDE_RENAME) { + fcnp->cn_flags |= CN_SECLUDE_RENAME; + _err = VNOP_RENAME(fdvp, *fvpp, fcnp, tdvp, *tvpp, tcnp, ctx); + } + } + } else + _err = VNOP_RENAME(fdvp, *fvpp, fcnp, tdvp, *tvpp, tcnp, ctx); } /* @@ -4048,7 +4121,6 @@ VNOP_RENAME(struct vnode *fdvp, struct vnode *fvp, struct componentname *fcnp, vfs_context_t ctx) { int _err = 0; - int events; struct vnop_rename_args a; a.a_desc = &vnop_rename_desc; @@ -4064,41 +4136,96 @@ VNOP_RENAME(struct vnode *fdvp, struct vnode *fvp, struct componentname *fcnp, _err = (*fdvp->v_op[vnop_rename_desc.vdesc_offset])(&a); DTRACE_FSINFO(rename, vnode_t, fdvp); - if (_err == 0) { - if (tvp && tvp != fvp) - vnode_setneedinactive(tvp); - } + if (_err) + return _err; - /* Wrote at least one directory. If transplanted a dir, also changed link counts */ - if (_err == 0) { - events = NOTE_WRITE; - if (vnode_isdir(fvp)) { - /* Link count on dir changed only if we are moving a dir and... - * --Moved to new dir, not overwriting there - * --Kept in same dir and DID overwrite - */ - if (((fdvp != tdvp) && (!tvp)) || ((fdvp == tdvp) && (tvp))) { - events |= NOTE_LINK; - } - } + return post_rename(fdvp, fvp, tdvp, tvp); +} - lock_vnode_and_post(fdvp, events); - if (fdvp != tdvp) { - lock_vnode_and_post(tdvp, events); - } +static errno_t +post_rename(vnode_t fdvp, vnode_t fvp, vnode_t tdvp, vnode_t tvp) +{ + if (tvp && tvp != fvp) + vnode_setneedinactive(tvp); - /* If you're replacing the target, post a deletion for it */ - if (tvp) - { - lock_vnode_and_post(tvp, NOTE_DELETE); + /* Wrote at least one directory. If transplanted a dir, also changed link counts */ + int events = NOTE_WRITE; + if (vnode_isdir(fvp)) { + /* Link count on dir changed only if we are moving a dir and... 
+ * --Moved to new dir, not overwriting there + * --Kept in same dir and DID overwrite + */ + if (((fdvp != tdvp) && (!tvp)) || ((fdvp == tdvp) && (tvp))) { + events |= NOTE_LINK; } + } - lock_vnode_and_post(fvp, NOTE_RENAME); + lock_vnode_and_post(fdvp, events); + if (fdvp != tdvp) { + lock_vnode_and_post(tdvp, events); } - return (_err); + /* If you're replacing the target, post a deletion for it */ + if (tvp) + { + lock_vnode_and_post(tvp, NOTE_DELETE); + } + + lock_vnode_and_post(fvp, NOTE_RENAME); + + return 0; } +#if 0 +/* + *# + *#% renamex fdvp U U U + *#% renamex fvp U U U + *#% renamex tdvp L U U + *#% renamex tvp X U U + *# + */ +struct vnop_renamex_args { + struct vnodeop_desc *a_desc; + vnode_t a_fdvp; + vnode_t a_fvp; + struct componentname *a_fcnp; + vnode_t a_tdvp; + vnode_t a_tvp; + struct componentname *a_tcnp; + vfs_rename_flags_t a_flags; + vfs_context_t a_context; +}; +#endif /* 0*/ +errno_t +VNOP_RENAMEX(struct vnode *fdvp, struct vnode *fvp, struct componentname *fcnp, + struct vnode *tdvp, struct vnode *tvp, struct componentname *tcnp, + vfs_rename_flags_t flags, vfs_context_t ctx) +{ + int _err = 0; + struct vnop_renamex_args a; + + a.a_desc = &vnop_renamex_desc; + a.a_fdvp = fdvp; + a.a_fvp = fvp; + a.a_fcnp = fcnp; + a.a_tdvp = tdvp; + a.a_tvp = tvp; + a.a_tcnp = tcnp; + a.a_flags = flags; + a.a_context = ctx; + + /* do the rename of the main file. 
*/ + _err = (*fdvp->v_op[vnop_renamex_desc.vdesc_offset])(&a); + DTRACE_FSINFO(renamex, vnode_t, fdvp); + + if (_err) + return _err; + + return post_rename(fdvp, fvp, tdvp, tvp); +} + + int VNOP_COMPOUND_RENAME( struct vnode *fdvp, struct vnode **fvpp, struct componentname *fcnp, struct vnode_attr *fvap, @@ -4951,6 +5078,8 @@ VNOP_ADVLOCK(struct vnode *vp, caddr_t id, int op, struct flock *fl, int flags, _err = (*vp->v_op[vnop_advlock_desc.vdesc_offset])(&a); } DTRACE_FSINFO(advlock, vnode_t, vp); + if (op == F_UNLCK && flags == F_FLOCK) + post_event_if_success(vp, _err, NOTE_FUNLOCK); } return (_err); @@ -5181,6 +5310,45 @@ VNOP_COPYFILE(struct vnode *fvp, struct vnode *tdvp, struct vnode *tvp, struct c return (_err); } +#if 0 +struct vnop_clonefile_args { + struct vnodeop_desc *a_desc; + vnode_t a_fvp; + vnode_t a_dvp; + vnode_t a_vpp; + struct componentname *a_cnp; + struct vnode_attr *a_vap; + uint32_t a_flags; + vfs_context_t a_context; +}; +#endif /* 0 */ + +errno_t +VNOP_CLONEFILE(vnode_t fvp, vnode_t dvp, vnode_t *vpp, + struct componentname *cnp, struct vnode_attr *vap, uint32_t flags, + vfs_context_t ctx) +{ + int _err; + struct vnop_clonefile_args a; + a.a_desc = &vnop_clonefile_desc; + a.a_fvp = fvp; + a.a_dvp = dvp; + a.a_vpp = vpp; + a.a_cnp = cnp; + a.a_vap = vap; + a.a_flags = flags; + a.a_context = ctx; + + _err = (*dvp->v_op[vnop_clonefile_desc.vdesc_offset])(&a); + + if (_err == 0 && *vpp) + DTRACE_FSINFO(clonefile, vnode_t, *vpp); + + post_event_if_success(dvp, _err, NOTE_WRITE); + + return (_err); +} + errno_t VNOP_GETXATTR(vnode_t vp, const char *name, uio_t uio, size_t *size, int options, vfs_context_t ctx) { diff --git a/bsd/vfs/vfs_attrlist.c b/bsd/vfs/vfs_attrlist.c index 38d7a3f24..86eabd93d 100644 --- a/bsd/vfs/vfs_attrlist.c +++ b/bsd/vfs/vfs_attrlist.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1995-2014 Apple Inc. All rights reserved. + * Copyright (c) 1995-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -51,7 +51,6 @@ #include #include #include -#include #if CONFIG_MACF #include @@ -1167,6 +1166,7 @@ getvolattrlist(vfs_context_t ctx, vnode_t vp, struct attrlist *alp, * This attribute isn't really Finder Info, at least for HFS. */ if (vp->v_tag == VT_HFS) { +#define HFS_GET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00004) error = VNOP_IOCTL(vp, HFS_GET_BOOT_INFO, (caddr_t)&f, 0, ctx); if (error == 0) { attrlist_pack_fixed(&ab, f, sizeof(f)); @@ -1475,45 +1475,64 @@ attr_pack_common(vfs_context_t ctx, struct vnode *vp, struct attrlist *alp, } } if (alp->commonattr & ATTR_CMN_OBJID) { - fsobj_id_t f; /* * Carbon can't deal with us reporting the target ID * for links. So we ask the filesystem to give us the * source ID as well, and if it gives us one, we use * it instead. */ - if (VATTR_IS_SUPPORTED(vap, va_linkid)) { - f.fid_objno = vap->va_linkid; + if (vap->va_vaflags & VA_64BITOBJIDS) { + if (VATTR_IS_SUPPORTED(vap, va_linkid)) { + ATTR_PACK8((*abp), vap->va_linkid); + } else { + ATTR_PACK8((*abp), vap->va_fileid); + } } else { - f.fid_objno = vap->va_fileid; + fsobj_id_t f; + if (VATTR_IS_SUPPORTED(vap, va_linkid)) { + f.fid_objno = (uint32_t)vap->va_linkid; + } else { + f.fid_objno = (uint32_t)vap->va_fileid; + } + f.fid_generation = 0; + ATTR_PACK8((*abp), f); } - f.fid_generation = 0; - ATTR_PACK8((*abp), f); abp->actual.commonattr |= ATTR_CMN_OBJID; } if (alp->commonattr & ATTR_CMN_OBJPERMANENTID) { - fsobj_id_t f; /* * Carbon can't deal with us reporting the target ID * for links. So we ask the filesystem to give us the * source ID as well, and if it gives us one, we use * it instead. 
*/ - if (VATTR_IS_SUPPORTED(vap, va_linkid)) { - f.fid_objno = vap->va_linkid; + if (vap->va_vaflags & VA_64BITOBJIDS) { + if (VATTR_IS_SUPPORTED(vap, va_linkid)) { + ATTR_PACK8((*abp), vap->va_linkid); + } else { + ATTR_PACK8((*abp), vap->va_fileid); + } } else { - f.fid_objno = vap->va_fileid; + fsobj_id_t f; + if (VATTR_IS_SUPPORTED(vap, va_linkid)) { + f.fid_objno = (uint32_t)vap->va_linkid; + } else { + f.fid_objno = (uint32_t)vap->va_fileid; + } + f.fid_generation = 0; + ATTR_PACK8((*abp), f); } - f.fid_generation = 0; - ATTR_PACK8((*abp), f); abp->actual.commonattr |= ATTR_CMN_OBJPERMANENTID; } if (alp->commonattr & ATTR_CMN_PAROBJID) { - fsobj_id_t f; - - f.fid_objno = vap->va_parentid; /* could be lossy here! */ - f.fid_generation = 0; - ATTR_PACK8((*abp), f); + if (vap->va_vaflags & VA_64BITOBJIDS) { + ATTR_PACK8((*abp), vap->va_parentid); + } else { + fsobj_id_t f; + f.fid_objno = (uint32_t)vap->va_parentid; + f.fid_generation = 0; + ATTR_PACK8((*abp), f); + } abp->actual.commonattr |= ATTR_CMN_PAROBJID; } if (alp->commonattr & ATTR_CMN_SCRIPT) { @@ -1741,8 +1760,10 @@ attr_pack_common(vfs_context_t ctx, struct vnode *vp, struct attrlist *alp, } if (alp->commonattr & ATTR_CMN_FULLPATH) { - attrlist_pack_string (abp, fullpathptr, fullpathlen); - abp->actual.commonattr |= ATTR_CMN_FULLPATH; + if (vp) { + attrlist_pack_string (abp, fullpathptr, fullpathlen); + abp->actual.commonattr |= ATTR_CMN_FULLPATH; + } } if (alp->commonattr & ATTR_CMN_ADDEDTIME) { @@ -2255,13 +2276,14 @@ vfs_attr_pack_internal(vnode_t vp, uio_t auio, struct attrlist *alp, goto out; } - if (alp->commonattr & (ATTR_CMN_FULLPATH)) { + if (vp && (alp->commonattr & (ATTR_CMN_FULLPATH))) { fullpathptr = (char*) kalloc(MAXPATHLEN); if (fullpathptr == NULL) { error = ENOMEM; VFS_DEBUG(ctx,vp, "ATTRLIST - ERROR: cannot allocate fullpath buffer"); goto out; } + bzero(fullpathptr, MAXPATHLEN); } /* @@ -3409,6 +3431,27 @@ getattrlistbulk(proc_t p, struct getattrlistbulk_args *uap, int32_t 
*retval) goto out; } + if (uap->options & FSOPT_LIST_SNAPSHOT) { + vnode_t snapdvp; + + if (!vfs_context_issuser(ctx)) { + error = EPERM; + goto out; + } + + if (!vnode_isvroot(dvp)) { + error = EINVAL; + goto out; + } + + /* switch directory to snapshot directory */ + error = vnode_get_snapdir(dvp, &snapdvp, ctx); + if (error) + goto out; + vnode_put(dvp); + dvp = snapdvp; + } + if (dvp->v_type != VDIR) { error = ENOTDIR; goto out; @@ -3910,12 +3953,19 @@ setattrlist_internal(vnode_t vp, struct setattrlist_args *uap, proc_t p, vfs_con goto out; } +#if CONFIG_MACF + mac_vnode_notify_setattrlist(ctx, vp, &al); + if (VATTR_IS_ACTIVE(&va, va_flags)) + mac_vnode_notify_setflags(ctx, vp, va.va_flags); +#endif + /* * Write the Finder Info if we have any. */ if (fndrinfo != NULL) { if (al.volattr & ATTR_VOL_INFO) { if (vp->v_tag == VT_HFS) { +#define HFS_SET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00005) error = VNOP_IOCTL(vp, HFS_SET_BOOT_INFO, (caddr_t)fndrinfo, 0, ctx); if (error != 0) goto out; diff --git a/bsd/vfs/vfs_bio.c b/bsd/vfs/vfs_bio.c index 9c4b20a0f..495a5f3df 100644 --- a/bsd/vfs/vfs_bio.c +++ b/bsd/vfs/vfs_bio.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2015 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -129,7 +129,7 @@ static buf_t buf_create_shadow_internal(buf_t bp, boolean_t force_copy, uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg, int priv); -__private_extern__ int bdwrite_internal(buf_t, int); +int bdwrite_internal(buf_t, int); /* zone allocated buffer headers */ static void bufzoneinit(void); @@ -1320,7 +1320,7 @@ buf_strategy(vnode_t devvp, void *ap) cpx_t cpx = bufattr_cpx(buf_attr(bp)); if (cpx) { /* No need to go here for older EAs */ - if(cpx_use_offset_for_iv(cpx)) { + if(cpx_use_offset_for_iv(cpx) && !cpx_synthetic_offset_for_iv(cpx)) { off_t f_offset; if ((error = VNOP_BLKTOOFF(bp->b_vp, bp->b_lblkno, &f_offset))) return error; @@ -1362,7 +1362,7 @@ buf_strategy(vnode_t devvp, void *ap) buf_t buf_alloc(vnode_t vp) { - return(alloc_io_buf(vp, 0)); + return(alloc_io_buf(vp, is_vm_privileged())); } void @@ -2276,12 +2276,7 @@ buf_bwrite(buf_t bp) } /* Release the buffer. */ - // XXXdbg - only if the unused bit is set - if (!ISSET(bp->b_flags, B_NORELSE)) { - buf_brelse(bp); - } else { - CLR(bp->b_flags, B_NORELSE); - } + buf_brelse(bp); return (rv); } else { @@ -2314,7 +2309,7 @@ vn_bwrite(struct vnop_bwrite_args *ap) * buffers faster than the disks can service. Doing a buf_bawrite() in * cases where we have "too many" outstanding buf_bdwrite()s avoids that. 
*/ -__private_extern__ int +int bdwrite_internal(buf_t bp, int return_error) { proc_t p = current_proc(); @@ -2940,7 +2935,6 @@ buf_getblk(vnode_t vp, daddr64_t blkno, int size, int slpflag, int slptimeo, int return (NULL); goto start; /*NOTREACHED*/ - break; default: /* @@ -3937,6 +3931,8 @@ buf_biodone(buf_t bp) INCR_PENDING_IO(-(pending_io_t)buf_count(bp), mp->mnt_pending_read_size); } + throttle_info_end_io(bp); + if (kdebug_enable) { int code = DKIO_DONE; int io_tier = GET_BUFATTR_IO_TIER(bap); @@ -4133,20 +4129,48 @@ vfs_bufstats() #define NRESERVEDIOBUFS 128 +#define MNT_VIRTUALDEV_MAX_IOBUFS 16 +#define VIRTUALDEV_MAX_IOBUFS ((40*niobuf_headers)/100) buf_t alloc_io_buf(vnode_t vp, int priv) { buf_t bp; + mount_t mp = NULL; + int alloc_for_virtualdev = FALSE; lck_mtx_lock_spin(iobuffer_mtxp); + /* + * We subject iobuf requests for diskimages to additional restrictions. + * + * a) A single diskimage mount cannot use up more than + * MNT_VIRTUALDEV_MAX_IOBUFS. However, vm privileged (pageout) requests + * are not subject to this restriction. + * b) iobuf headers used for diskimages across all mount + * points cannot exceed VIRTUALDEV_MAX_IOBUFS.
+ */ + if (vp && ((mp = vp->v_mount)) && mp != dead_mountp && + mp->mnt_kern_flag & MNTK_VIRTUALDEV) { + alloc_for_virtualdev = TRUE; + while ((!priv && mp->mnt_iobufinuse > MNT_VIRTUALDEV_MAX_IOBUFS) || + bufstats.bufs_iobufinuse_vdev > VIRTUALDEV_MAX_IOBUFS) { + bufstats.bufs_iobufsleeps++; + + need_iobuffer = 1; + (void)msleep(&need_iobuffer, iobuffer_mtxp, + PSPIN | (PRIBIO+1), (const char *)"alloc_io_buf (1)", + NULL); + } + } + while (((niobuf_headers - NRESERVEDIOBUFS < bufstats.bufs_iobufinuse) && !priv) || (bp = iobufqueue.tqh_first) == NULL) { bufstats.bufs_iobufsleeps++; need_iobuffer = 1; - (void) msleep(&need_iobuffer, iobuffer_mtxp, PSPIN | (PRIBIO+1), (const char *)"alloc_io_buf", NULL); + (void)msleep(&need_iobuffer, iobuffer_mtxp, PSPIN | (PRIBIO+1), + (const char *)"alloc_io_buf (2)", NULL); } TAILQ_REMOVE(&iobufqueue, bp, b_freelist); @@ -4154,6 +4178,11 @@ alloc_io_buf(vnode_t vp, int priv) if (bufstats.bufs_iobufinuse > bufstats.bufs_iobufmax) bufstats.bufs_iobufmax = bufstats.bufs_iobufinuse; + if (alloc_for_virtualdev) { + mp->mnt_iobufinuse++; + bufstats.bufs_iobufinuse_vdev++; + } + lck_mtx_unlock(iobuffer_mtxp); /* @@ -4168,6 +4197,8 @@ alloc_io_buf(vnode_t vp, int priv) bp->b_datap = 0; bp->b_flags = 0; bp->b_lflags = BL_BUSY | BL_IOBUF; + if (alloc_for_virtualdev) + bp->b_lflags |= BL_IOBUF_VDEV; bp->b_redundancy_flags = 0; bp->b_blkno = bp->b_lblkno = 0; #ifdef JOE_DEBUG @@ -4196,7 +4227,16 @@ alloc_io_buf(vnode_t vp, int priv) void free_io_buf(buf_t bp) { - int need_wakeup = 0; + int need_wakeup = 0; + int free_for_virtualdev = FALSE; + mount_t mp = NULL; + + /* Was this iobuf for a diskimage ? 
*/ + if (bp->b_lflags & BL_IOBUF_VDEV) { + free_for_virtualdev = TRUE; + if (bp->b_vp) + mp = bp->b_vp->v_mount; + } /* * put buffer back on the head of the iobufqueue @@ -4229,6 +4269,12 @@ free_io_buf(buf_t bp) bufstats.bufs_iobufinuse--; + if (free_for_virtualdev) { + bufstats.bufs_iobufinuse_vdev--; + if (mp && mp != dead_mountp) + mp->mnt_iobufinuse--; + } + lck_mtx_unlock(iobuffer_mtxp); if (need_wakeup) @@ -4267,6 +4313,7 @@ bcleanbuf_thread_init(void) typedef int (*bcleanbufcontinuation)(int); +__attribute__((noreturn)) static void bcleanbuf_thread(void) { diff --git a/bsd/vfs/vfs_cache.c b/bsd/vfs/vfs_cache.c index b91ca2e44..b7876475d 100644 --- a/bsd/vfs/vfs_cache.c +++ b/bsd/vfs/vfs_cache.c @@ -178,7 +178,214 @@ static unsigned int crc32tab[256]; #define NCHHASH(dvp, hash_val) \ (&nchashtbl[(dvp->v_id ^ (hash_val)) & nchashmask]) +/* + * This function tries to check if a directory vp is a subdirectory of dvp + * only from valid v_parent pointers. It is called with the name cache lock + * held and does not drop the lock anytime inside the function. + * + * It returns a boolean that indicates whether or not it was able to + * successfully infer the parent/descendent relationship via the v_parent + * pointers, or if it could not infer such relationship and that the decision + * must be delegated to the owning filesystem. + * + * If it does not defer the decision, i.e. it was successfully able to determine + * the parent/descendent relationship, *is_subdir tells the caller if vp is a + * subdirectory of dvp. + * + * If the decision is deferred, *next_vp is where it stopped i.e. *next_vp + * is the vnode whose parent is to be determined from the filesystem. + * *is_subdir, in this case, is not indicative of anything and should be + * ignored.
+ * + * The return value and output args should be used as follows : + * + * defer = cache_check_vnode_issubdir(vp, dvp, is_subdir, next_vp); + * if (!defer) { + * if (*is_subdir) + * vp is subdirectory; + * else + * vp is not a subdirectory; + * } else { + * if (*next_vp) + * check this vnode's parent from the filesystem + * else + * error (likely because of forced unmount). + * } + * + */ +static boolean_t +cache_check_vnode_issubdir(vnode_t vp, vnode_t dvp, boolean_t *is_subdir, + vnode_t *next_vp) +{ + vnode_t tvp = vp; + int defer = FALSE; + + *is_subdir = FALSE; + *next_vp = NULLVP; + while (1) { + mount_t tmp; + + if (tvp == dvp) { + *is_subdir = TRUE; + break; + } else if (tvp == rootvnode) { + /* *is_subdir = FALSE */ + break; + } + + tmp = tvp->v_mount; + while ((tvp->v_flag & VROOT) && tmp && tmp->mnt_vnodecovered && + tvp != dvp && tvp != rootvnode) { + tvp = tmp->mnt_vnodecovered; + tmp = tvp->v_mount; + } + + /* + * If dvp is not at the top of a mount "stack" then + * vp is not a subdirectory of dvp either. + */ + if (tvp == dvp || tvp == rootvnode) { + /* *is_subdir = FALSE */ + break; + } + + if (!tmp) { + defer = TRUE; + *next_vp = NULLVP; + break; + } + + if ((tvp->v_flag & VISHARDLINK) || !(tvp->v_parent)) { + defer = TRUE; + *next_vp = tvp; + break; + } + tvp = tvp->v_parent; + } + + return (defer); +} + +/* maximum number of retries after potentially transient errors in vnode_issubdir */ +#define MAX_ERROR_RETRY 3 + +/* + * This function checks if a given directory (vp) is a subdirectory of dvp. + * It walks backwards from vp and if it hits dvp in its parent chain, + * it is a subdirectory. If it encounters the root directory, it is not + * a subdirectory. + * + * This function returns an error if it is unsuccessful and 0 on success. + * + * On entry (and exit) vp has an iocount and if this function has to take + * any iocounts on other vnodes in the parent chain traversal, it releases them.
+ */ +int +vnode_issubdir(vnode_t vp, vnode_t dvp, int *is_subdir, vfs_context_t ctx) +{ + vnode_t start_vp, tvp; + vnode_t vp_with_iocount; + int error = 0; + char dotdotbuf[] = ".."; + int error_retry_count = 0; /* retry count for potentially transient + errors */ + + *is_subdir = FALSE; + tvp = start_vp = vp; + /* + * Anytime we acquire an iocount in this function, we save the vnode + * in this variable and release it before exiting. + */ + vp_with_iocount = NULLVP; + + while (1) { + boolean_t defer; + vnode_t pvp; + uint32_t vid; + struct componentname cn; + boolean_t is_subdir_locked = FALSE; + + if (tvp == dvp) { + *is_subdir = TRUE; + break; + } else if (tvp == rootvnode) { + /* *is_subdir = FALSE */ + break; + } + + NAME_CACHE_LOCK_SHARED(); + + defer = cache_check_vnode_issubdir(tvp, dvp, &is_subdir_locked, + &tvp); + + if (defer && tvp) + vid = vnode_vid(tvp); + + NAME_CACHE_UNLOCK(); + + if (!defer) { + *is_subdir = is_subdir_locked; + break; + } + + if (!tvp) { + if (error_retry_count++ < MAX_ERROR_RETRY) { + tvp = vp; + continue; + } + error = ENOENT; + break; + } + + if (tvp != start_vp) { + if (vp_with_iocount) { + vnode_put(vp_with_iocount); + vp_with_iocount = NULLVP; + } + + error = vnode_getwithvid(tvp, vid); + if (error) { + if (error_retry_count++ < MAX_ERROR_RETRY) { + tvp = vp; + error = 0; + continue; + } + break; + } + + vp_with_iocount = tvp; + } + + bzero(&cn, sizeof(cn)); + cn.cn_nameiop = LOOKUP; + cn.cn_flags = ISLASTCN | ISDOTDOT; + cn.cn_context = ctx; + cn.cn_pnbuf = &dotdotbuf[0]; + cn.cn_pnlen = sizeof(dotdotbuf); + cn.cn_nameptr = cn.cn_pnbuf; + cn.cn_namelen = 2; + + pvp = NULLVP; + if ((error = VNOP_LOOKUP(tvp, &pvp, &cn, ctx))) + break; + + if (!(tvp->v_flag & VISHARDLINK) && tvp->v_parent != pvp) { + (void)vnode_update_identity(tvp, pvp, NULL, 0, 0, + VNODE_UPDATE_PARENT); + } + + if (vp_with_iocount) + vnode_put(vp_with_iocount); + + vp_with_iocount = tvp = pvp; + } + + if (vp_with_iocount) + vnode_put(vp_with_iocount); + + 
return (error); +} /* * This function builds the path to a filename in "buff". The @@ -466,6 +673,26 @@ build_path(vnode_t first_vp, char *buff, int buflen, int *outlen, int flags, vfs vp = vp->v_parent; } + if (vp && (flags & BUILDPATH_CHECKACCESS)) { + vid = vp->v_id; + + NAME_CACHE_UNLOCK(); + + if (vp != first_vp && vp != vp_with_iocount) { + if (vp_with_iocount) { + vnode_put(vp_with_iocount); + vp_with_iocount = NULLVP; + } + if (vnode_getwithvid(vp, vid)) + goto again; + vp_with_iocount = vp; + } + if ((ret = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx))) + goto out; /* no peeking */ + + NAME_CACHE_LOCK_SHARED(); + } + /* * When a mount point is crossed switch the vp. * Continue until we find the root or we find @@ -491,26 +718,6 @@ build_path(vnode_t first_vp, char *buff, int buflen, int *outlen, int flags, vfs if (tvp == NULLVP) goto out_unlock; vp = tvp; - - if (vp && (flags & BUILDPATH_CHECKACCESS)) { - vid = vp->v_id; - - NAME_CACHE_UNLOCK(); - - if (vp != first_vp && vp != vp_with_iocount) { - if (vp_with_iocount) { - vnode_put(vp_with_iocount); - vp_with_iocount = NULLVP; - } - if (vnode_getwithvid(vp, vid)) - goto again; - vp_with_iocount = vp; - } - if ((ret = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx))) - goto out; /* no peeking */ - - NAME_CACHE_LOCK_SHARED(); - } } out_unlock: NAME_CACHE_UNLOCK(); @@ -700,7 +907,7 @@ vnode_update_identity(vnode_t vp, vnode_t dvp, const char *name, int name_len, u while ( (ncp = LIST_FIRST(&vp->v_nclinks)) ) cache_delete(ncp, 1); - while ( (ncp = LIST_FIRST(&vp->v_ncchildren)) ) + while ( (ncp = TAILQ_FIRST(&vp->v_ncchildren)) ) cache_delete(ncp, 1); /* @@ -1060,6 +1267,7 @@ cache_lookup_path(struct nameidata *ndp, struct componentname *cnp, vnode_t dp, mount_t mp; unsigned int hash; int error = 0; + boolean_t dotdotchecked = FALSE; #if CONFIG_TRIGGERS vnode_t trigger_vp; @@ -1175,7 +1383,7 @@ cache_lookup_path(struct nameidata *ndp, struct componentname *cnp, vnode_t dp, * NAME_CACHE_LOCK holds 
these fields stable * * We can't cache KAUTH_VNODE_SEARCHBYANYONE for root correctly - * so we make an ugly check for root here. root is always + * so we make an ugly check for root here. root is always * allowed and breaking out of here only to find out that is * authorized by virtue of being root is very very expensive. */ @@ -1234,9 +1442,53 @@ cache_lookup_path(struct nameidata *ndp, struct componentname *cnp, vnode_t dp, */ if (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') vp = dp; - else if ( (cnp->cn_flags & ISDOTDOT) ) - vp = dp->v_parent; - else { + else if ( (cnp->cn_flags & ISDOTDOT) ) { + /* + * If this is a chrooted process, we need to check if + * the process is trying to break out of its chrooted + * jail. We do that by trying to determine if dp is + * a subdirectory of ndp->ni_rootdir. If we aren't + * able to determine that by the v_parent pointers, we + * will leave the fast path. + * + * Since this function may see dotdot components + * many times and it has the name cache lock held for + * the entire duration, we optimise this by doing this + * check only once per cache_lookup_path call. + * If dotdotchecked is set, it means we've done this + * check once already and don't need to do it again. + */ + if (!dotdotchecked && (ndp->ni_rootdir != rootvnode)) { + vnode_t tvp = dp; + boolean_t defer = FALSE; + boolean_t is_subdir = FALSE; + + defer = cache_check_vnode_issubdir(tvp, + ndp->ni_rootdir, &is_subdir, &tvp); + + if (defer) { + /* defer to Filesystem */ + break; + } else if (!is_subdir) { + /* + * This process is trying to break out + * of its chrooted jail, so all its + * dotdot accesses will be translated to + * its root directory. 
+ */ + vp = ndp->ni_rootdir; + } else { + /* + * All good, let this dotdot access + * proceed normally + */ + vp = dp->v_parent; + } + dotdotchecked = TRUE; + } else { + vp = dp->v_parent; + } + } else { if ( (vp = cache_lookup_locked(dp, cnp)) == NULLVP) break; @@ -1784,7 +2036,10 @@ cache_enter_locked(struct vnode *dvp, struct vnode *vp, struct componentname *cn * add us to the list of name cache entries that * are children of dvp */ - LIST_INSERT_HEAD(&dvp->v_ncchildren, ncp, nc_child); + if (vp) + TAILQ_INSERT_TAIL(&dvp->v_ncchildren, ncp, nc_child); + else + TAILQ_INSERT_HEAD(&dvp->v_ncchildren, ncp, nc_child); } @@ -1956,7 +2211,7 @@ cache_delete(struct namecache *ncp, int age_entry) TAILQ_REMOVE(&neghead, ncp, nc_un.nc_negentry); ncs_negtotal--; } - LIST_REMOVE(ncp, nc_child); + TAILQ_REMOVE(&(ncp->nc_dvp->v_ncchildren), ncp, nc_child); LIST_REMOVE(ncp, nc_hash); /* @@ -1991,7 +2246,7 @@ cache_purge(vnode_t vp) kauth_cred_t tcred = NULL; if ((LIST_FIRST(&vp->v_nclinks) == NULL) && - (LIST_FIRST(&vp->v_ncchildren) == NULL) && + (TAILQ_FIRST(&vp->v_ncchildren) == NULL) && (vp->v_cred == NOCRED) && (vp->v_parent == NULLVP)) return; @@ -2004,7 +2259,7 @@ cache_purge(vnode_t vp) while ( (ncp = LIST_FIRST(&vp->v_nclinks)) ) cache_delete(ncp, 1); - while ( (ncp = LIST_FIRST(&vp->v_ncchildren)) ) + while ( (ncp = TAILQ_FIRST(&vp->v_ncchildren)) ) cache_delete(ncp, 1); /* @@ -2034,9 +2289,12 @@ cache_purge_negatives(vnode_t vp) NAME_CACHE_LOCK(); - LIST_FOREACH_SAFE(ncp, &vp->v_ncchildren, nc_child, next_ncp) - if (ncp->nc_vp == NULL) - cache_delete(ncp , 1); + TAILQ_FOREACH_SAFE(ncp, &vp->v_ncchildren, nc_child, next_ncp) { + if (ncp->nc_vp) + break; + + cache_delete(ncp, 1); + } NAME_CACHE_UNLOCK(); } @@ -2167,6 +2425,9 @@ add_name_internal(const char *name, uint32_t len, u_int hashval, boolean_t need_ uint32_t lock_index; char *ptr; + if (len > MAXPATHLEN) + len = MAXPATHLEN; + /* * if the length already accounts for the null-byte, then * subtract one so later on 
we don't index past the end diff --git a/bsd/vfs/vfs_cluster.c b/bsd/vfs/vfs_cluster.c index c565a3df4..ee241e7ae 100644 --- a/bsd/vfs/vfs_cluster.c +++ b/bsd/vfs/vfs_cluster.c @@ -84,6 +84,7 @@ #include #include #include +#include #include #include @@ -126,6 +127,8 @@ #define MAX_VECTOR_UPL_ELEMENTS 8 #define MAX_VECTOR_UPL_SIZE (2 * MAX_UPL_SIZE_BYTES) +#define CLUSTER_IO_WAITING ((buf_t)1) + extern upl_t vector_upl_create(vm_offset_t); extern boolean_t vector_upl_is_valid(upl_t); extern boolean_t vector_upl_set_subupl(upl_t,upl_t, u_int32_t); @@ -737,16 +740,10 @@ cluster_iodone(buf_t bp, void *callback_arg) cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0); if (cbp_head->b_trans_next || !(cbp_head->b_flags & B_EOT)) { - boolean_t need_wakeup = FALSE; - lck_mtx_lock_spin(cl_transaction_mtxp); bp->b_flags |= B_TDONE; - if (bp->b_flags & B_TWANTED) { - CLR(bp->b_flags, B_TWANTED); - need_wakeup = TRUE; - } for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) { /* * all I/O requests that are part of this transaction @@ -759,19 +756,24 @@ cluster_iodone(buf_t bp, void *callback_arg) lck_mtx_unlock(cl_transaction_mtxp); - if (need_wakeup == TRUE) - wakeup(bp); + return 0; + } + + if (cbp->b_trans_next == CLUSTER_IO_WAITING) { + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END, + cbp_head, cbp, cbp->b_bcount, cbp->b_flags, 0); + + lck_mtx_unlock(cl_transaction_mtxp); + wakeup(cbp); return 0; } + if (cbp->b_flags & B_EOT) transaction_complete = TRUE; } lck_mtx_unlock(cl_transaction_mtxp); - if (need_wakeup == TRUE) - wakeup(bp); - if (transaction_complete == FALSE) { KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END, cbp_head, 0, 0, 0, 0); @@ -971,40 +973,53 @@ cluster_wait_IO(buf_t cbp_head, int async) buf_t cbp; if (async) { - /* - * async callback completion will not normally - * generate a wakeup upon I/O completion... - * by setting B_TWANTED, we will force a wakeup - * to occur as any outstanding I/Os complete... 
- * I/Os already completed will have B_TDONE already - * set and we won't cause us to block - * note that we're actually waiting for the bp to have - * completed the callback function... only then - * can we safely take back ownership of the bp + /* + * Async callback completion will not normally generate a + * wakeup upon I/O completion. To get woken up, we set + * b_trans_next (which is safe for us to modify) on the last + * buffer to CLUSTER_IO_WAITING so that cluster_iodone knows + * to wake us up when all buffers as part of this transaction + * are completed. This is done under the umbrella of + * cl_transaction_mtxp which is also taken in cluster_iodone. */ + bool done = true; + buf_t last = NULL; + lck_mtx_lock_spin(cl_transaction_mtxp); - for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) - cbp->b_flags |= B_TWANTED; + for (cbp = cbp_head; cbp; last = cbp, cbp = cbp->b_trans_next) { + if (!ISSET(cbp->b_flags, B_TDONE)) + done = false; + } - lck_mtx_unlock(cl_transaction_mtxp); - } - for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) { + if (!done) { + last->b_trans_next = CLUSTER_IO_WAITING; + + DTRACE_IO1(wait__start, buf_t, last); + do { + msleep(last, cl_transaction_mtxp, PSPIN | (PRIBIO+1), "cluster_wait_IO", NULL); - if (async) { - while (!ISSET(cbp->b_flags, B_TDONE)) { + /* + * We should only have been woken up if all the + * buffers are completed, but just in case... 
+ */ + done = true; + for (cbp = cbp_head; cbp != CLUSTER_IO_WAITING; cbp = cbp->b_trans_next) { + if (!ISSET(cbp->b_flags, B_TDONE)) { + done = false; + break; + } + } + } while (!done); + DTRACE_IO1(wait__done, buf_t, last); - lck_mtx_lock_spin(cl_transaction_mtxp); + last->b_trans_next = NULL; + } - if (!ISSET(cbp->b_flags, B_TDONE)) { - DTRACE_IO1(wait__start, buf_t, cbp); - (void) msleep(cbp, cl_transaction_mtxp, PDROP | (PRIBIO+1), "cluster_wait_IO", NULL); - DTRACE_IO1(wait__done, buf_t, cbp); - } else - lck_mtx_unlock(cl_transaction_mtxp); - } - } else - buf_biowait(cbp); + lck_mtx_unlock(cl_transaction_mtxp); + } else { // !async + for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) + buf_biowait(cbp); } } @@ -1167,8 +1182,18 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no u_int max_cluster_size; u_int scale; - max_cluster_size = MAX_CLUSTER_SIZE(vp); + if (vp->v_mount->mnt_minsaturationbytecount) { + max_cluster_size = vp->v_mount->mnt_minsaturationbytecount; + scale = 1; + } else { + max_cluster_size = MAX_CLUSTER_SIZE(vp); + + if ((vp->v_mount->mnt_kern_flag & MNTK_SSD) && !ignore_is_ssd) + scale = WRITE_THROTTLE_SSD; + else + scale = WRITE_THROTTLE; + } if (max_iosize > max_cluster_size) max_cluster = max_cluster_size; else @@ -1177,14 +1202,9 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no if (size < max_cluster) max_cluster = size; - if ((vp->v_mount->mnt_kern_flag & MNTK_SSD) && !ignore_is_ssd) - scale = WRITE_THROTTLE_SSD; - else - scale = WRITE_THROTTLE; - if (flags & CL_CLOSE) scale += MAX_CLUSTERS; - + async_throttle = min(IO_SCALE(vp, VNODE_ASYNC_THROTTLE), ((scale * max_cluster_size) / max_cluster) - 1); } } @@ -2329,6 +2349,7 @@ cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, in u_int32_t max_io_size; u_int32_t max_upl_size; u_int32_t max_vector_size; + u_int32_t bytes_outstanding_limit; boolean_t io_throttled = FALSE; u_int32_t 
vector_upl_iosize = 0; @@ -2405,7 +2426,7 @@ cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, in goto wait_for_dwrites; } - task_update_logical_writes(current_task(), (io_req_size & ~PAGE_MASK), TASK_WRITE_IMMEDIATE); + task_update_logical_writes(current_task(), (io_req_size & ~PAGE_MASK), TASK_WRITE_IMMEDIATE, vp); while (io_req_size >= PAGE_SIZE && uio->uio_offset < newEOF && retval == 0) { int throttle_type; @@ -2567,7 +2588,12 @@ cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, in * if there are already too many outstanding writes * wait until some complete before issuing the next */ - cluster_iostate_wait(&iostate, max_upl_size * IO_SCALE(vp, 2), "cluster_write_direct"); + if (vp->v_mount->mnt_minsaturationbytecount) + bytes_outstanding_limit = vp->v_mount->mnt_minsaturationbytecount; + else + bytes_outstanding_limit = max_upl_size * IO_SCALE(vp, 2); + + cluster_iostate_wait(&iostate, bytes_outstanding_limit, "cluster_write_direct"); if (iostate.io_error) { /* @@ -3478,11 +3504,20 @@ cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t old wbp->cl_seq_written >= (MAX_CLUSTERS * (max_cluster_pgcount * PAGE_SIZE))) { uint32_t n; - if (vp->v_mount->mnt_kern_flag & MNTK_SSD) - n = WRITE_BEHIND_SSD; - else - n = WRITE_BEHIND; + if (vp->v_mount->mnt_minsaturationbytecount) { + n = vp->v_mount->mnt_minsaturationbytecount / MAX_CLUSTER_SIZE(vp); + + if (n > MAX_CLUSTERS) + n = MAX_CLUSTERS; + } else + n = 0; + if (n == 0) { + if (vp->v_mount->mnt_kern_flag & MNTK_SSD) + n = WRITE_BEHIND_SSD; + else + n = WRITE_BEHIND; + } while (n--) cluster_try_push(wbp, vp, newEOF, 0, 0, callback, callback_arg); } @@ -3593,17 +3628,6 @@ cluster_read_ext(vnode_t vp, struct uio *uio, off_t filesize, int xflags, int (* if (flags & IO_SKIP_ENCRYPTION) flags |= IO_ENCRYPTED; - /* - * If we're doing an encrypted IO, then first check to see - * if the IO requested was page aligned. 
If not, then bail - * out immediately. - */ - if (flags & IO_ENCRYPTED) { - if (read_length & PAGE_MASK) { - retval = EINVAL; - return retval; - } - } /* * do a read through the cache if one of the following is true.... @@ -3620,7 +3644,7 @@ cluster_read_ext(vnode_t vp, struct uio *uio, off_t filesize, int xflags, int (* retval = cluster_io_type(uio, &read_type, &read_length, 0); } - + while ((cur_resid = uio_resid(uio)) && uio->uio_offset < filesize && retval == 0) { switch (read_type) { @@ -4377,11 +4401,6 @@ cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, io_req_size = *read_length; iov_base = uio_curriovbase(uio); - max_io_size = filesize - uio->uio_offset; - - if ((off_t)io_req_size > max_io_size) - io_req_size = max_io_size; - offset_in_file = (u_int32_t)uio->uio_offset & (devblocksize - 1); offset_in_iovbase = (u_int32_t)iov_base & mem_alignment_mask; @@ -4401,15 +4420,23 @@ cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, misaligned = 1; } + max_io_size = filesize - uio->uio_offset; + /* * The user must request IO in aligned chunks. If the * offset into the file is bad, or the userland pointer * is non-aligned, then we cannot service the encrypted IO request. */ - if ((flags & IO_ENCRYPTED) && (misaligned)) { - retval = EINVAL; + if (flags & IO_ENCRYPTED) { + if (misaligned || (io_req_size & (devblocksize - 1))) + retval = EINVAL; + + max_io_size = roundup(max_io_size, devblocksize); } + if ((off_t)io_req_size > max_io_size) + io_req_size = max_io_size; + /* * When we get to this point, we know... 
* -- the offset into the file is on a devblocksize boundary @@ -4510,31 +4537,14 @@ cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, * (which overlaps the end of the direct read) in order to * get at the overhang bytes */ - if (io_size & (devblocksize - 1)) { - if (flags & IO_ENCRYPTED) { - /* - * Normally, we'd round down to the previous page boundary to - * let the UBC manage the zero-filling of the file past the EOF. - * But if we're doing encrypted IO, we can't let any of - * the data hit the UBC. This means we have to do the full - * IO to the upper block boundary of the device block that - * contains the EOF. The user will be responsible for not - * interpreting data PAST the EOF in its buffer. - * - * So just bump the IO back up to a multiple of devblocksize - */ - io_size = ((io_size + devblocksize) & ~(devblocksize - 1)); - io_min = io_size; - } - else { - /* - * Clip the request to the previous page size boundary - * since request does NOT end on a device block boundary - */ - io_size &= ~PAGE_MASK; - io_min = PAGE_SIZE; - } - + if (io_size & (devblocksize - 1)) { + assert(!(flags & IO_ENCRYPTED)); + /* + * Clip the request to the previous page size boundary + * since request does NOT end on a device block boundary + */ + io_size &= ~PAGE_MASK; + io_min = PAGE_SIZE; } if (retval || io_size < io_min) { /* @@ -4755,18 +4765,8 @@ cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, else { uio_update(uio, (user_size_t)io_size); } - /* - * Under normal circumstances, the io_size should not be - * bigger than the io_req_size, but we may have had to round up - * to the end of the page in the encrypted IO case. In that case only, - * ensure that we only decrement io_req_size to 0. 
- */ - if ((flags & IO_ENCRYPTED) && (io_size > io_req_size)) { - io_req_size = 0; - } - else { - io_req_size -= io_size; - } + + io_req_size -= io_size; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END, upl, (int)uio->uio_offset, io_req_size, retval, 0); @@ -5321,7 +5321,7 @@ cluster_push_ext(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *ca struct cl_writebehind *wbp; if ( !UBCINFOEXISTS(vp)) { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, vp, flags, 0, -1, 0); + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, kdebug_vnode(vp), flags, 0, -1, 0); return (0); } /* return if deferred write is set */ @@ -5329,13 +5329,13 @@ cluster_push_ext(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *ca return (0); } if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) == NULL) { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, vp, flags, 0, -2, 0); + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, kdebug_vnode(vp), flags, 0, -2, 0); return (0); } if (!ISSET(flags, IO_SYNC) && wbp->cl_number == 0 && wbp->cl_scmap == NULL) { lck_mtx_unlock(&wbp->cl_lockw); - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, vp, flags, 0, -3, 0); + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, kdebug_vnode(vp), flags, 0, -3, 0); return(0); } KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START, @@ -5349,11 +5349,11 @@ cluster_push_ext(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *ca * in the sparse map case */ while (wbp->cl_sparse_wait) { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START, vp, 0, 0, 0, 0); + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START, kdebug_vnode(vp), 0, 0, 0, 0); msleep((caddr_t)&wbp->cl_sparse_wait, &wbp->cl_lockw, PRIBIO + 1, "cluster_push_ext", NULL); - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END, vp, 0, 0, 0, 0); + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END, kdebug_vnode(vp), 0, 0, 0, 0); } if (flags & IO_SYNC) { 
my_sparse_wait = 1; @@ -5366,11 +5366,11 @@ cluster_push_ext(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *ca * fsync actually get cleaned to the disk before this fsync returns */ while (wbp->cl_sparse_pushes) { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 98)) | DBG_FUNC_START, vp, 0, 0, 0, 0); + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 98)) | DBG_FUNC_START, kdebug_vnode(vp), 0, 0, 0, 0); msleep((caddr_t)&wbp->cl_sparse_pushes, &wbp->cl_lockw, PRIBIO + 1, "cluster_push_ext", NULL); - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 98)) | DBG_FUNC_END, vp, 0, 0, 0, 0); + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 98)) | DBG_FUNC_END, kdebug_vnode(vp), 0, 0, 0, 0); } } if (wbp->cl_scmap) { @@ -5509,7 +5509,9 @@ cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_fla cl_len = cl_index; - if ( (push_flag & PUSH_DELAY) && cl_len == MAX_CLUSTERS ) { + /* skip switching to the sparse cluster mechanism if on diskimage */ + if ( ((push_flag & PUSH_DELAY) && cl_len == MAX_CLUSTERS ) && + !(vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) ) { int i; /* @@ -5815,7 +5817,7 @@ sparse_cluster_switch(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int (*c { int cl_index; - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_START, vp, wbp->cl_scmap, 0, 0, 0); + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_START, kdebug_vnode(vp), wbp->cl_scmap, 0, 0, 0); for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) { int flags; @@ -5834,7 +5836,7 @@ sparse_cluster_switch(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int (*c } wbp->cl_number = 0; - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_END, vp, wbp->cl_scmap, 0, 0, 0); + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_END, kdebug_vnode(vp), wbp->cl_scmap, 0, 0, 0); } @@ -5850,7 +5852,7 @@ sparse_cluster_push(void **scmap, vnode_t vp, off_t EOF, int push_flag, int io_f off_t offset; u_int length; - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_START, vp, (*scmap), 0, push_flag, 0); + 
KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_START, kdebug_vnode(vp), (*scmap), 0, push_flag, 0); if (push_flag & PUSH_ALL) vfs_drt_control(scmap, 1); @@ -5867,7 +5869,7 @@ sparse_cluster_push(void **scmap, vnode_t vp, off_t EOF, int push_flag, int io_f if ( !(push_flag & PUSH_ALL) ) break; } - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_END, vp, (*scmap), 0, 0, 0); + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_END, kdebug_vnode(vp), (*scmap), 0, 0, 0); } @@ -5897,7 +5899,7 @@ sparse_cluster_add(void **scmap, vnode_t vp, struct cl_extent *cl, off_t EOF, in offset += (new_dirty * PAGE_SIZE_64); length -= (new_dirty * PAGE_SIZE); } - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_END, vp, (*scmap), 0, 0, 0); + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_END, kdebug_vnode(vp), (*scmap), 0, 0, 0); } @@ -5997,8 +5999,6 @@ cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t return (error); } - - int cluster_copy_upl_data(struct uio *uio, upl_t upl, int upl_offset, int *io_resid) { @@ -6065,10 +6065,10 @@ cluster_copy_upl_data(struct uio *uio, upl_t upl, int upl_offset, int *io_resid) uio->uio_segflg = segflg; - task_update_logical_writes(current_task(), (dirty_count * PAGE_SIZE), TASK_WRITE_DEFERRED); + task_update_logical_writes(current_task(), (dirty_count * PAGE_SIZE), TASK_WRITE_DEFERRED, upl_lookup_vnode(upl)); KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END, (int)uio->uio_offset, xsize, retval, segflg, 0); - + return (retval); } diff --git a/bsd/vfs/vfs_conf.c b/bsd/vfs/vfs_conf.c index 71acea8d1..af7ba7c5b 100644 --- a/bsd/vfs/vfs_conf.c +++ b/bsd/vfs/vfs_conf.c @@ -89,14 +89,13 @@ int (*mountroot)(void) = NULL; */ extern struct vfsops mfs_vfsops; extern int mfs_mountroot(mount_t, vnode_t, vfs_context_t); /* dead */ -extern struct vfsops hfs_vfsops; -extern int hfs_mountroot(mount_t, vnode_t, vfs_context_t); extern struct vfsops nfs_vfsops; extern int nfs_mountroot(void); extern struct vfsops 
afs_vfsops; extern struct vfsops null_vfsops; extern struct vfsops devfs_vfsops; extern struct vfsops routefs_vfsops; +extern struct vfsops nullfs_vfsops; #if MOCKFS extern struct vfsops mockfs_vfsops; @@ -111,10 +110,10 @@ typedef int (*mountroot_t)(mount_t, vnode_t, vfs_context_t); enum fs_type_num { FT_NFS = 2, - FT_HFS = 17, FT_DEVFS = 19, FT_SYNTHFS = 20, FT_ROUTEFS = 21, + FT_NULLFS = 22, FT_MOCKFS = 0x6D6F636B }; @@ -122,15 +121,6 @@ enum fs_type_num { * Set up the filesystem operations for vnodes. */ static struct vfstable vfstbllist[] = { - /* HFS/HFS+ Filesystem */ -#if HFS - { &hfs_vfsops, "hfs", FT_HFS, 0, (MNT_LOCAL | MNT_DOVOLFS), hfs_mountroot, NULL, 0, 0, VFC_VFSLOCALARGS | VFC_VFSREADDIR_EXTENDED | VFC_VFS64BITREADY | VFC_VFSVNOP_PAGEOUTV2 | VFC_VFSVNOP_PAGEINV2 -#if CONFIG_SECLUDED_RENAME - | VFC_VFSVNOP_SECLUDE_RENAME -#endif - , NULL, 0, NULL}, -#endif - /* Sun-compatible Network Filesystem */ #if NFSCLIENT { &nfs_vfsops, "nfs", FT_NFS, 0, 0, NULL, NULL, 0, 0, VFC_VFSGENERICARGS | VFC_VFSPREFLIGHT | VFC_VFS64BITREADY | VFC_VFSREADDIR_EXTENDED, NULL, 0, NULL}, @@ -148,6 +138,10 @@ static struct vfstable vfstbllist[] = { #ifndef __LP64__ #endif /* __LP64__ */ +#if NULLFS + { &nullfs_vfsops, "nullfs", FT_NULLFS, 0, (MNT_DONTBROWSE | MNT_RDONLY), NULL, NULL, 0, 0, VFC_VFS64BITREADY, NULL, 0, NULL}, +#endif /* NULLFS */ + #if MOCKFS /* If we are configured for it, mockfs should always be the last standard entry (and thus the last FS we attempt mountroot with) */ { &mockfs_vfsops, "mockfs", FT_MOCKFS, 0, MNT_LOCAL, mockfs_mountroot, NULL, 0, 0, VFC_VFSGENERICARGS, NULL, 0, NULL}, @@ -190,12 +184,6 @@ extern struct vnodeopv_desc nfsv4_vnodeop_opv_desc; extern struct vnodeopv_desc spec_nfsv4nodeop_opv_desc; extern struct vnodeopv_desc fifo_nfsv4nodeop_opv_desc; extern struct vnodeopv_desc null_vnodeop_opv_desc; -extern struct vnodeopv_desc hfs_vnodeop_opv_desc; -#if CONFIG_HFS_STD -extern struct vnodeopv_desc hfs_std_vnodeop_opv_desc; -#endif -extern 
struct vnodeopv_desc hfs_specop_opv_desc; -extern struct vnodeopv_desc hfs_fifoop_opv_desc; extern struct vnodeopv_desc devfs_vnodeop_opv_desc; extern struct vnodeopv_desc devfs_spec_vnodeop_opv_desc; #if FDESC @@ -207,6 +195,8 @@ extern struct vnodeopv_desc devfs_fdesc_vnodeop_opv_desc; extern struct vnodeopv_desc mockfs_vnodeop_opv_desc; #endif /* MOCKFS */ +extern struct vnodeopv_desc nullfs_vnodeop_opv_desc; + struct vnodeopv_desc *vfs_opv_descs[] = { &dead_vnodeop_opv_desc, #if FIFO && SOCKETS @@ -226,16 +216,6 @@ struct vnodeopv_desc *vfs_opv_descs[] = { &fifo_nfsv4nodeop_opv_desc, #endif #endif -#if HFS - &hfs_vnodeop_opv_desc, -#if CONFIG_HFS_STD - &hfs_std_vnodeop_opv_desc, -#endif - &hfs_specop_opv_desc, -#if FIFO - &hfs_fifoop_opv_desc, -#endif -#endif #if DEVFS &devfs_vnodeop_opv_desc, &devfs_spec_vnodeop_opv_desc, @@ -244,6 +224,9 @@ struct vnodeopv_desc *vfs_opv_descs[] = { &devfs_fdesc_vnodeop_opv_desc, #endif /* FDESC */ #endif /* DEVFS */ +#if NULLFS + &nullfs_vnodeop_opv_desc, +#endif /* NULLFS */ #if MOCKFS &mockfs_vnodeop_opv_desc, #endif /* MOCKFS */ diff --git a/bsd/vfs/vfs_cprotect.c b/bsd/vfs/vfs_cprotect.c new file mode 100644 index 000000000..5cb03ba43 --- /dev/null +++ b/bsd/vfs/vfs_cprotect.c @@ -0,0 +1,399 @@ +/* + * Copyright (c) 2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. 
+ * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include +#include + +#define PTR_ADD(type, base, offset) (type)((uintptr_t)(base) + (offset)) + +// -- struct cpx -- + +/* + * This structure contains the unwrapped key and is passed to the lower layers. + * It is private so users must use the accessors declared in sys/cprotect.h + * to read/write it. 
+ */ + +// cpx_flags +typedef uint32_t cpx_flags_t; +enum { + CPX_SEP_WRAPPEDKEY = 0x01, + CPX_IV_AES_CTX_INITIALIZED = 0x02, + CPX_USE_OFFSET_FOR_IV = 0x04, + + // Using AES IV context generated from key + CPX_IV_AES_CTX_VFS = 0x08, + CPX_SYNTHETIC_OFFSET_FOR_IV = 0x10, +}; + +struct cpx { +#if DEBUG + uint32_t cpx_magic1; +#endif + cpx_flags_t cpx_flags; + uint16_t cpx_max_key_len; + uint16_t cpx_key_len; + aes_encrypt_ctx cpx_iv_aes_ctx; // Context used for generating the IV + uint8_t cpx_cached_key[]; +} __attribute__((packed)); + +// -- cpx_t accessors -- + +size_t cpx_size(size_t key_size) +{ + size_t size = sizeof(struct cpx) + key_size; + +#if DEBUG + size += 4; // Extra for magic +#endif + + return size; +} + +size_t cpx_sizex(const struct cpx *cpx) +{ + return cpx_size(cpx->cpx_max_key_len); +} + +cpx_t cpx_alloc(size_t key_len) +{ + cpx_t cpx; + + MALLOC(cpx, cpx_t, cpx_size(key_len), M_TEMP, M_WAITOK); + + cpx_init(cpx, key_len); + + return cpx; +} + +#if DEBUG +static const uint32_t cpx_magic1 = 0x7b787063; // cpx{ +static const uint32_t cpx_magic2 = 0x7870637d; // }cpx +#endif + +void cpx_free(cpx_t cpx) +{ +#if DEBUG + assert(cpx->cpx_magic1 == cpx_magic1); + assert(*PTR_ADD(uint32_t *, cpx, cpx_sizex(cpx) - 4) == cpx_magic2); +#endif + bzero(cpx->cpx_cached_key, cpx->cpx_max_key_len); + FREE(cpx, M_TEMP); +} + +void cpx_init(cpx_t cpx, size_t key_len) +{ +#if DEBUG + cpx->cpx_magic1 = cpx_magic1; + *PTR_ADD(uint32_t *, cpx, cpx_size(key_len) - 4) = cpx_magic2; +#endif + cpx->cpx_flags = 0; + cpx->cpx_key_len = 0; + cpx->cpx_max_key_len = key_len; +} + +bool cpx_is_sep_wrapped_key(const struct cpx *cpx) +{ + return ISSET(cpx->cpx_flags, CPX_SEP_WRAPPEDKEY); +} + +void cpx_set_is_sep_wrapped_key(struct cpx *cpx, bool v) +{ + if (v) + SET(cpx->cpx_flags, CPX_SEP_WRAPPEDKEY); + else + CLR(cpx->cpx_flags, CPX_SEP_WRAPPEDKEY); +} + +bool cpx_use_offset_for_iv(const struct cpx *cpx) +{ + return ISSET(cpx->cpx_flags, CPX_USE_OFFSET_FOR_IV); +} + +void 
cpx_set_use_offset_for_iv(struct cpx *cpx, bool v) +{ + if (v) + SET(cpx->cpx_flags, CPX_USE_OFFSET_FOR_IV); + else + CLR(cpx->cpx_flags, CPX_USE_OFFSET_FOR_IV); +} + +bool cpx_synthetic_offset_for_iv(const struct cpx *cpx) +{ + return ISSET(cpx->cpx_flags, CPX_SYNTHETIC_OFFSET_FOR_IV); +} + +void cpx_set_synthetic_offset_for_iv(struct cpx *cpx, bool v) +{ + if (v) + SET(cpx->cpx_flags, CPX_SYNTHETIC_OFFSET_FOR_IV); + else + CLR(cpx->cpx_flags, CPX_SYNTHETIC_OFFSET_FOR_IV); +} + +uint16_t cpx_max_key_len(const struct cpx *cpx) +{ + return cpx->cpx_max_key_len; +} + +uint16_t cpx_key_len(const struct cpx *cpx) +{ + return cpx->cpx_key_len; +} + +void cpx_set_key_len(struct cpx *cpx, uint16_t key_len) +{ + cpx->cpx_key_len = key_len; + + if (ISSET(cpx->cpx_flags, CPX_IV_AES_CTX_VFS)) { + /* + * We assume that if the key length is being modified, the key + * has changed. As a result, un-set any bits related to the + * AES context, if needed. They should be re-generated + * on-demand. + */ + CLR(cpx->cpx_flags, CPX_IV_AES_CTX_INITIALIZED | CPX_IV_AES_CTX_VFS); + } +} + +bool cpx_has_key(const struct cpx *cpx) +{ + return cpx->cpx_key_len > 0; +} + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wcast-qual" +void *cpx_key(const struct cpx *cpx) +{ + return (void *)cpx->cpx_cached_key; +} +#pragma clang diagnostic pop + +void cpx_set_aes_iv_key(struct cpx *cpx, void *iv_key) +{ + aes_encrypt_key128(iv_key, &cpx->cpx_iv_aes_ctx); + SET(cpx->cpx_flags, CPX_IV_AES_CTX_INITIALIZED | CPX_USE_OFFSET_FOR_IV); + CLR(cpx->cpx_flags, CPX_IV_AES_CTX_VFS); +} + +aes_encrypt_ctx *cpx_iv_aes_ctx(struct cpx *cpx) +{ + if (ISSET(cpx->cpx_flags, CPX_IV_AES_CTX_INITIALIZED)) + return &cpx->cpx_iv_aes_ctx; + + SHA1_CTX sha1ctxt; + uint8_t digest[SHA_DIGEST_LENGTH]; /* Kiv */ + + /* First init the cp_cache_iv_key[] */ + SHA1Init(&sha1ctxt); + + /* + * We can only use this when the keys are generated in the AP; As a result + * we only use the first 32 bytes of key length 
in the cache key + */ + SHA1Update(&sha1ctxt, cpx->cpx_cached_key, cpx->cpx_key_len); + SHA1Final(digest, &sha1ctxt); + + cpx_set_aes_iv_key(cpx, digest); + SET(cpx->cpx_flags, CPX_IV_AES_CTX_VFS); + + return &cpx->cpx_iv_aes_ctx; +} + +void cpx_flush(cpx_t cpx) +{ + bzero(cpx->cpx_cached_key, cpx->cpx_max_key_len); + bzero(&cpx->cpx_iv_aes_ctx, sizeof(cpx->cpx_iv_aes_ctx)); + cpx->cpx_flags = 0; + cpx->cpx_key_len = 0; +} + +bool cpx_can_copy(const struct cpx *src, const struct cpx *dst) +{ + return src->cpx_key_len <= dst->cpx_max_key_len; +} + +void cpx_copy(const struct cpx *src, cpx_t dst) +{ + uint16_t key_len = cpx_key_len(src); + cpx_set_key_len(dst, key_len); + memcpy(cpx_key(dst), cpx_key(src), key_len); + dst->cpx_flags = src->cpx_flags; + if (ISSET(dst->cpx_flags, CPX_IV_AES_CTX_INITIALIZED)) + dst->cpx_iv_aes_ctx = src->cpx_iv_aes_ctx; +} + +static struct cp_wrap_func g_cp_wrap_func = {}; + +static int +cp_lock_vfs_callback(mount_t mp, void *arg) +{ + VFS_IOCTL(mp, FIODEVICELOCKED, arg, 0, vfs_context_kernel()); + + return 0; +} + +int +cp_key_store_action(cp_key_store_action_t action) +{ + switch (action) { + case CP_ACTION_LOCKED: + case CP_ACTION_UNLOCKED:; + cp_lock_state_t state = (action == CP_ACTION_LOCKED + ? CP_LOCKED_STATE : CP_UNLOCKED_STATE); + return vfs_iterate(0, cp_lock_vfs_callback, (void *)(uintptr_t)state); + default: + return -1; + } +} + +int +cp_register_wraps(cp_wrap_func_t key_store_func) +{ + g_cp_wrap_func.new_key = key_store_func->new_key; + g_cp_wrap_func.unwrapper = key_store_func->unwrapper; + g_cp_wrap_func.rewrapper = key_store_func->rewrapper; + /* do not use invalidater until rdar://12170050 goes in ! 
*/ + g_cp_wrap_func.invalidater = key_store_func->invalidater; + g_cp_wrap_func.backup_key = key_store_func->backup_key; + + return 0; +} + +int cp_rewrap_key(cp_cred_t access, uint32_t dp_class, + const cp_wrapped_key_t wrapped_key_in, + cp_wrapped_key_t wrapped_key_out) +{ + if (!g_cp_wrap_func.rewrapper) + return ENXIO; + return g_cp_wrap_func.rewrapper(access, dp_class, wrapped_key_in, + wrapped_key_out); +} + +int cp_new_key(cp_cred_t access, uint32_t dp_class, cp_raw_key_t key_out, + cp_wrapped_key_t wrapped_key_out) +{ + if (!g_cp_wrap_func.new_key) + return ENXIO; + return g_cp_wrap_func.new_key(access, dp_class, key_out, wrapped_key_out); +} + +int cp_unwrap_key(cp_cred_t access, const cp_wrapped_key_t wrapped_key_in, + cp_raw_key_t key_out) +{ + if (!g_cp_wrap_func.unwrapper) + return ENXIO; + return g_cp_wrap_func.unwrapper(access, wrapped_key_in, key_out); +} + +int cp_get_backup_key(cp_cred_t access, const cp_wrapped_key_t wrapped_key_in, + cp_wrapped_key_t wrapped_key_out) +{ + if (!g_cp_wrap_func.backup_key) + return ENXIO; + return g_cp_wrap_func.backup_key(access, wrapped_key_in, wrapped_key_out); +} + +int +cp_is_valid_class(int isdir, int32_t protectionclass) +{ + /* + * The valid protection classes are from 0 -> N + * We use a signed argument to detect unassigned values from + * directory entry creation time in HFS. + */ + if (isdir) { + /* Directories are not allowed to have F, but they can have "NONE" */ + return ((protectionclass >= PROTECTION_CLASS_DIR_NONE) && + (protectionclass <= PROTECTION_CLASS_D)); + } + else { + return ((protectionclass >= PROTECTION_CLASS_A) && + (protectionclass <= PROTECTION_CLASS_F)); + } +} + +/* + * Parses versions of the form 12A316, i.e. and + * returns a uint32_t in the form 0xaabbcccc where aa = , + * bb = , cccc = . 
+ */ +static cp_key_os_version_t +parse_os_version(const char *vers) +{ + const char *p = vers; + + int a = 0; + while (*p >= '0' && *p <= '9') { + a = a * 10 + *p - '0'; + ++p; + } + + if (!a) + return 0; + + int b = *p++; + if (!b) + return 0; + + int c = 0; + while (*p >= '0' && *p <= '9') { + c = c * 10 + *p - '0'; + ++p; + } + + if (!c) + return 0; + + return (a & 0xff) << 24 | b << 16 | (c & 0xffff); +} + +cp_key_os_version_t +cp_os_version(void) +{ + static cp_key_os_version_t cp_os_version; + + if (cp_os_version) + return cp_os_version; + + if (!osversion[0]) + return 0; + + cp_os_version = parse_os_version(osversion); + if (!cp_os_version) { + printf("cp_os_version: unable to parse osversion `%s'\n", osversion); + cp_os_version = 1; + } + + return cp_os_version; +} diff --git a/bsd/vfs/vfs_fsevents.c b/bsd/vfs/vfs_fsevents.c index d2f32bd74..c348df21b 100644 --- a/bsd/vfs/vfs_fsevents.c +++ b/bsd/vfs/vfs_fsevents.c @@ -1895,28 +1895,84 @@ filt_fsevent(struct knote *kn, long hint) } +static int +filt_fsevent_touch(struct knote *kn, struct kevent_internal_s *kev) +{ + int res; + + lock_watch_table(); + + /* accept new fflags/data as saved */ + kn->kn_sfflags = kev->fflags; + kn->kn_sdata = kev->data; + if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0) + kn->kn_udata = kev->udata; + + /* restrict the current results to the (smaller?) set of new interest */ + /* + * For compatibility with previous implementations, we leave kn_fflags + * as they were before. 
+ */ + //kn->kn_fflags &= kev->fflags; + + /* determine if the filter is now fired */ + res = filt_fsevent(kn, 0); + + unlock_watch_table(); + + return res; +} + +static int +filt_fsevent_process(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev) +{ +#pragma unused(data) + int res; + + lock_watch_table(); + + res = filt_fsevent(kn, 0); + if (res) { + *kev = kn->kn_kevent; + if (kev->flags & EV_CLEAR) { + kn->kn_data = 0; + kn->kn_fflags = 0; + } + } + + unlock_watch_table(); + return res; +} + struct filterops fsevent_filtops = { .f_isfd = 1, .f_attach = NULL, .f_detach = filt_fsevent_detach, - .f_event = filt_fsevent + .f_event = filt_fsevent, + .f_touch = filt_fsevent_touch, + .f_process = filt_fsevent_process, }; static int fseventsf_kqfilter(__unused struct fileproc *fp, __unused struct knote *kn, __unused vfs_context_t ctx) { fsevent_handle *fseh = (struct fsevent_handle *)fp->f_fglob->fg_data; + int res; kn->kn_hook = (void*)fseh; kn->kn_hookid = 1; - kn->kn_fop = &fsevent_filtops; - + kn->kn_filtid = EVFILTID_FSEVENT; + lock_watch_table(); KNOTE_ATTACH(&fseh->knotes, kn); + /* check to see if it is fired already */ + res = filt_fsevent(kn, 0); + unlock_watch_table(); - return 0; + + return res; } @@ -2147,14 +2203,14 @@ fseventswrite(__unused dev_t dev, struct uio *uio, __unused int ioflag) static const struct fileops fsevents_fops = { - DTYPE_FSEVENTS, - fseventsf_read, - fseventsf_write, - fseventsf_ioctl, - fseventsf_select, - fseventsf_close, - fseventsf_kqfilter, - fseventsf_drain + .fo_type = DTYPE_FSEVENTS, + .fo_read = fseventsf_read, + .fo_write = fseventsf_write, + .fo_ioctl = fseventsf_ioctl, + .fo_select = fseventsf_select, + .fo_close = fseventsf_close, + .fo_kqfilter = fseventsf_kqfilter, + .fo_drain = fseventsf_drain, }; typedef struct fsevent_clone_args32 { @@ -2461,6 +2517,9 @@ create_fsevent_from_kevent(vnode_t vp, uint32_t kevents, struct vnode_attr *vap) } #else /* CONFIG_FSE */ + +#include + /* * The 
get_pathbuff and release_pathbuff routines are used in places not * related to fsevents, and it's a handy abstraction, so define trivial @@ -2481,4 +2540,16 @@ release_pathbuff(char *path) { FREE_ZONE(path, MAXPATHLEN, M_NAMEI); } + +int +add_fsevent(__unused int type, __unused vfs_context_t ctx, ...) +{ + return 0; +} + +int need_fsevent(__unused int type, __unused vnode_t vp) +{ + return 0; +} + #endif /* CONFIG_FSE */ diff --git a/bsd/vfs/vfs_init.c b/bsd/vfs/vfs_init.c index 54b6880fc..7a5c95a61 100644 --- a/bsd/vfs/vfs_init.c +++ b/bsd/vfs/vfs_init.c @@ -83,8 +83,8 @@ #include #include #include +#include -#include /* journal_init() */ #if CONFIG_MACF #include #include @@ -402,12 +402,6 @@ vfsinit(void) */ nchinit(); -#if JOURNALING - /* - * Initialize the journaling locks - */ - journal_init(); -#endif nspace_handler_init(); /* @@ -501,6 +495,10 @@ vfsinit(void) mac_mount_label_associate(vfs_context_kernel(), mp); #endif dead_mountp = mp; + +#if FS_COMPRESSION + decmpfs_init(); +#endif } void diff --git a/bsd/vfs/vfs_journal.c b/bsd/vfs/vfs_journal.c deleted file mode 100644 index 714ba335a..000000000 --- a/bsd/vfs/vfs_journal.c +++ /dev/null @@ -1,5110 +0,0 @@ -/* - * Copyright (c) 2002-2014 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. 
- * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -// -// This file implements a simple write-ahead journaling layer. -// In theory any file system can make use of it by calling these -// functions when the fs wants to modify meta-data blocks. See -// vfs_journal.h for a more detailed description of the api and -// data structures. -// -// Dominic Giampaolo (dbg@apple.com) -// - -#ifdef KERNEL - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include /* OSAddAtomic */ - -kern_return_t thread_terminate(thread_t); - -/* - * Set sysctl vfs.generic.jnl.kdebug.trim=1 to enable KERNEL_DEBUG_CONSTANT - * logging of trim-related calls within the journal. (They're - * disabled by default because there can be a lot of these events, - * and we don't want to overwhelm the kernel debug buffer. If you - * want to watch these events in particular, just set the sysctl.) 
- */ -static int jnl_kdebug = 0; -SYSCTL_DECL(_vfs_generic); -SYSCTL_NODE(_vfs_generic, OID_AUTO, jnl, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "Journal"); -SYSCTL_NODE(_vfs_generic_jnl, OID_AUTO, kdebug, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "Journal kdebug"); -SYSCTL_INT(_vfs_generic_jnl_kdebug, OID_AUTO, trim, CTLFLAG_RW|CTLFLAG_LOCKED, &jnl_kdebug, 0, "Enable kdebug logging for journal TRIM"); - -#define DBG_JOURNAL_FLUSH FSDBG_CODE(DBG_JOURNAL, 1) -#define DBG_JOURNAL_TRIM_ADD FSDBG_CODE(DBG_JOURNAL, 2) -#define DBG_JOURNAL_TRIM_REMOVE FSDBG_CODE(DBG_JOURNAL, 3) -#define DBG_JOURNAL_TRIM_REMOVE_PENDING FSDBG_CODE(DBG_JOURNAL, 4) -#define DBG_JOURNAL_TRIM_REALLOC FSDBG_CODE(DBG_JOURNAL, 5) -#define DBG_JOURNAL_TRIM_FLUSH FSDBG_CODE(DBG_JOURNAL, 6) -#define DBG_JOURNAL_TRIM_UNMAP FSDBG_CODE(DBG_JOURNAL, 7) - -/* - * Cap the journal max size to 2GB. On HFS, it will attempt to occupy - * a full allocation block if the current size is smaller than the allocation - * block on which it resides. Once we hit the exabyte filesystem range, then - * it will use 2GB allocation blocks. As a result, make the cap 2GB. - */ -#define MAX_JOURNAL_SIZE 0x80000000U - -#include /* DTRACE_IO1 */ -#else - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "compat.h" - -#endif /* KERNEL */ - -#include "vfs_journal.h" - -#include - -#if 0 -#undef KERNEL_DEBUG -#define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT -#endif - - -#ifndef CONFIG_HFS_TRIM -#define CONFIG_HFS_TRIM 0 -#endif - - -#if JOURNALING - -// -// By default, we grow the list of extents to trim by 4K at a time. -// We'll opt to flush a transaction if it contains at least -// JOURNAL_FLUSH_TRIM_EXTENTS extents to be trimmed (even if the number -// of modified blocks is small). 
-// -enum { - JOURNAL_DEFAULT_TRIM_BYTES = 4096, - JOURNAL_DEFAULT_TRIM_EXTENTS = JOURNAL_DEFAULT_TRIM_BYTES / sizeof(dk_extent_t), - JOURNAL_FLUSH_TRIM_EXTENTS = JOURNAL_DEFAULT_TRIM_EXTENTS * 15 / 16 -}; - -unsigned int jnl_trim_flush_limit = JOURNAL_FLUSH_TRIM_EXTENTS; -SYSCTL_UINT (_kern, OID_AUTO, jnl_trim_flush, CTLFLAG_RW, &jnl_trim_flush_limit, 0, "number of trimmed extents to cause a journal flush"); - -/* XXX next prototype should be from libsa/stdlib.h> but conflicts libkern */ -__private_extern__ void qsort( - void * array, - size_t nmembers, - size_t member_size, - int (*)(const void *, const void *)); - - - -// number of bytes to checksum in a block_list_header -// NOTE: this should be enough to clear out the header -// fields as well as the first entry of binfo[] -#define BLHDR_CHECKSUM_SIZE 32 - -static void lock_condition(journal *jnl, boolean_t *condition, const char *condition_name); -static void wait_condition(journal *jnl, boolean_t *condition, const char *condition_name); -static void unlock_condition(journal *jnl, boolean_t *condition); -static void finish_end_thread(transaction *tr); -static void write_header_thread(journal *jnl); -static int finish_end_transaction(transaction *tr, errno_t (*callback)(void*), void *callback_arg); -static int end_transaction(transaction *tr, int force_it, errno_t (*callback)(void*), void *callback_arg, boolean_t drop_lock, boolean_t must_wait); -static void abort_transaction(journal *jnl, transaction *tr); -static void dump_journal(journal *jnl); - -static __inline__ void lock_oldstart(journal *jnl); -static __inline__ void unlock_oldstart(journal *jnl); -static __inline__ void lock_flush(journal *jnl); -static __inline__ void unlock_flush(journal *jnl); - - -// -// 3105942 - Coalesce writes to the same block on journal replay -// - -typedef struct bucket { - off_t block_num; - uint32_t jnl_offset; - uint32_t block_size; - int32_t cksum; -} bucket; - -#define STARTING_BUCKETS 256 - -static int 
add_block(journal *jnl, struct bucket **buf_ptr, off_t block_num, size_t size, size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr); -static int grow_table(struct bucket **buf_ptr, int num_buckets, int new_size); -static int lookup_bucket(struct bucket **buf_ptr, off_t block_num, int num_full); -static int do_overlap(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t block_num, size_t size, size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr); -static int insert_block(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t num, size_t size, size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr, int overwriting); - -#define CHECK_JOURNAL(jnl) \ - do { \ - if (jnl == NULL) { \ - panic("%s:%d: null journal ptr?\n", __FILE__, __LINE__); \ - } \ - if (jnl->jdev == NULL) { \ - panic("%s:%d: jdev is null!\n", __FILE__, __LINE__); \ - } \ - if (jnl->fsdev == NULL) { \ - panic("%s:%d: fsdev is null!\n", __FILE__, __LINE__); \ - } \ - if (jnl->jhdr->magic != JOURNAL_HEADER_MAGIC) { \ - panic("%s:%d: jhdr magic corrupted (0x%x != 0x%x)\n", \ - __FILE__, __LINE__, jnl->jhdr->magic, JOURNAL_HEADER_MAGIC); \ - } \ - if ( jnl->jhdr->start <= 0 \ - || jnl->jhdr->start > jnl->jhdr->size) { \ - panic("%s:%d: jhdr start looks bad (0x%llx max size 0x%llx)\n", \ - __FILE__, __LINE__, jnl->jhdr->start, jnl->jhdr->size); \ - } \ - if ( jnl->jhdr->end <= 0 \ - || jnl->jhdr->end > jnl->jhdr->size) { \ - panic("%s:%d: jhdr end looks bad (0x%llx max size 0x%llx)\n", \ - __FILE__, __LINE__, jnl->jhdr->end, jnl->jhdr->size); \ - } \ - } while(0) - -#define CHECK_TRANSACTION(tr) \ - do { \ - if (tr == NULL) { \ - panic("%s:%d: null transaction ptr?\n", __FILE__, __LINE__); \ - } \ - if (tr->jnl == NULL) { \ - panic("%s:%d: null tr->jnl ptr?\n", __FILE__, __LINE__); \ - } \ - if (tr->blhdr != (block_list_header *)tr->tbuffer) { \ - panic("%s:%d: blhdr (%p) != tbuffer (%p)\n", __FILE__, __LINE__, tr->blhdr, tr->tbuffer); \ - } \ - 
if (tr->total_bytes < 0) { \ - panic("%s:%d: tr total_bytes looks bad: %d\n", __FILE__, __LINE__, tr->total_bytes); \ - } \ - if (tr->journal_start < 0) { \ - panic("%s:%d: tr journal start looks bad: 0x%llx\n", __FILE__, __LINE__, tr->journal_start); \ - } \ - if (tr->journal_end < 0) { \ - panic("%s:%d: tr journal end looks bad: 0x%llx\n", __FILE__, __LINE__, tr->journal_end); \ - } \ - if (tr->blhdr && (tr->blhdr->max_blocks <= 0 || tr->blhdr->max_blocks > (tr->jnl->jhdr->size/tr->jnl->jhdr->jhdr_size))) { \ - panic("%s:%d: tr blhdr max_blocks looks bad: %d\n", __FILE__, __LINE__, tr->blhdr->max_blocks); \ - } \ - } while(0) - - - -// -// this isn't a great checksum routine but it will do for now. -// we use it to checksum the journal header and the block list -// headers that are at the start of each transaction. -// -static unsigned int -calc_checksum(char *ptr, int len) -{ - int i; - unsigned int cksum=0; - - // this is a lame checksum but for now it'll do - for(i = 0; i < len; i++, ptr++) { - cksum = (cksum << 8) ^ (cksum + *(unsigned char *)ptr); - } - - return (~cksum); -} - -// -// Journal Locking -// -lck_grp_attr_t * jnl_group_attr; -lck_attr_t * jnl_lock_attr; -lck_grp_t * jnl_mutex_group; - -void -journal_init(void) -{ - jnl_lock_attr = lck_attr_alloc_init(); - jnl_group_attr = lck_grp_attr_alloc_init(); - jnl_mutex_group = lck_grp_alloc_init("jnl-mutex", jnl_group_attr); -} - -__inline__ void -journal_lock(journal *jnl) -{ - lck_mtx_lock(&jnl->jlock); - if (jnl->owner) { - panic ("jnl: owner is %p, expected NULL\n", jnl->owner); - } - jnl->owner = current_thread(); -} - -__inline__ void -journal_unlock(journal *jnl) -{ - jnl->owner = NULL; - lck_mtx_unlock(&jnl->jlock); -} - -static __inline__ void -lock_flush(journal *jnl) -{ - lck_mtx_lock(&jnl->flock); -} - -static __inline__ void -unlock_flush(journal *jnl) -{ - lck_mtx_unlock(&jnl->flock); -} - -static __inline__ void -lock_oldstart(journal *jnl) -{ - lck_mtx_lock(&jnl->old_start_lock); -} - 
-static __inline__ void -unlock_oldstart(journal *jnl) -{ - lck_mtx_unlock(&jnl->old_start_lock); -} - - - -#define JNL_WRITE 0x0001 -#define JNL_READ 0x0002 -#define JNL_HEADER 0x8000 - -// -// This function sets up a fake buf and passes it directly to the -// journal device strategy routine (so that it won't get cached in -// the block cache. -// -// It also handles range checking the i/o so that we don't write -// outside the journal boundaries and it will wrap the i/o back -// to the beginning if necessary (skipping over the journal header) -// -static size_t -do_journal_io(journal *jnl, off_t *offset, void *data, size_t len, int direction) -{ - int err, curlen=len; - size_t io_sz = 0; - buf_t bp; - off_t max_iosize; - struct bufattr *bap; - boolean_t was_vm_privileged = FALSE; - boolean_t need_vm_privilege = FALSE; - - if (jnl->fsmount) { - if (jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) - need_vm_privilege = TRUE; - } - - if (*offset < 0 || *offset > jnl->jhdr->size) { - panic("jnl: do_jnl_io: bad offset 0x%llx (max 0x%llx)\n", *offset, jnl->jhdr->size); - } - - if (direction & JNL_WRITE) - max_iosize = jnl->max_write_size; - else if (direction & JNL_READ) - max_iosize = jnl->max_read_size; - else - max_iosize = 128 * 1024; - -again: - bp = alloc_io_buf(jnl->jdev, 1); - - if (*offset + (off_t)curlen > jnl->jhdr->size && *offset != 0 && jnl->jhdr->size != 0) { - if (*offset == jnl->jhdr->size) { - *offset = jnl->jhdr->jhdr_size; - } else { - curlen = (off_t)jnl->jhdr->size - *offset; - } - } - - if (curlen > max_iosize) { - curlen = max_iosize; - } - - if (curlen <= 0) { - panic("jnl: do_jnl_io: curlen == %d, offset 0x%llx len %zd\n", curlen, *offset, len); - } - - if (*offset == 0 && (direction & JNL_HEADER) == 0) { - panic("jnl: request for i/o to jnl-header without JNL_HEADER flag set! 
(len %d, data %p)\n", curlen, data); - } - - /* - * As alluded to in the block comment at the top of the function, we use a "fake" iobuf - * here and issue directly to the disk device that the journal protects since we don't - * want this to enter the block cache. As a result, we lose the ability to mark it - * as a metadata buf_t for the layers below us that may care. If we were to - * simply attach the B_META flag into the b_flags this may confuse things further - * since this is an iobuf, not a metadata buffer. - * - * To address this, we use the extended bufattr struct embedded in the bp. - * Explicitly mark the buf here as a metadata buffer in its bufattr flags. - */ - bap = &bp->b_attr; - bap->ba_flags |= BA_META; - - if (direction & JNL_READ) - buf_setflags(bp, B_READ); - else { - /* - * don't have to set any flags - */ - vnode_startwrite(jnl->jdev); - } - buf_setsize(bp, curlen); - buf_setcount(bp, curlen); - buf_setdataptr(bp, (uintptr_t)data); - buf_setblkno(bp, (daddr64_t) ((jnl->jdev_offset + *offset) / (off_t)jnl->jhdr->jhdr_size)); - buf_setlblkno(bp, (daddr64_t) ((jnl->jdev_offset + *offset) / (off_t)jnl->jhdr->jhdr_size)); - - if ((direction & JNL_WRITE) && (jnl->flags & JOURNAL_DO_FUA_WRITES)) { - buf_markfua(bp); - } - - if (need_vm_privilege == TRUE) { - /* - * if we block waiting for memory, and there is enough pressure to - * cause us to try and create a new swap file, we may end up deadlocking - * due to waiting for the journal on the swap file creation path... 
- * by making ourselves vm_privileged, we give ourselves the best chance - * of not blocking - */ - was_vm_privileged = set_vm_privilege(TRUE); - } - DTRACE_IO1(journal__start, buf_t, bp); - err = VNOP_STRATEGY(bp); - if (!err) { - err = (int)buf_biowait(bp); - } - DTRACE_IO1(journal__done, buf_t, bp); - - if (need_vm_privilege == TRUE && was_vm_privileged == FALSE) - set_vm_privilege(FALSE); - - free_io_buf(bp); - - if (err) { - printf("jnl: %s: do_jnl_io: strategy err 0x%x\n", jnl->jdev_name, err); - return 0; - } - - *offset += curlen; - io_sz += curlen; - - if (io_sz != len) { - // handle wrap-around - data = (char *)data + curlen; - curlen = len - io_sz; - if (*offset >= jnl->jhdr->size) { - *offset = jnl->jhdr->jhdr_size; - } - goto again; - } - - return io_sz; -} - -static size_t -read_journal_data(journal *jnl, off_t *offset, void *data, size_t len) -{ - return do_journal_io(jnl, offset, data, len, JNL_READ); -} - -static size_t -write_journal_data(journal *jnl, off_t *offset, void *data, size_t len) -{ - return do_journal_io(jnl, offset, data, len, JNL_WRITE); -} - - -static size_t -read_journal_header(journal *jnl, void *data, size_t len) -{ - off_t hdr_offset = 0; - - return do_journal_io(jnl, &hdr_offset, data, len, JNL_READ|JNL_HEADER); -} - -static int -write_journal_header(journal *jnl, int updating_start, uint32_t sequence_num) -{ - static int num_err_prints = 0; - int ret=0; - off_t jhdr_offset = 0; - struct vfs_context context; - - context.vc_thread = current_thread(); - context.vc_ucred = NOCRED; - // - // Flush the track cache if we're not doing force-unit-access - // writes. - // - if (!updating_start && (jnl->flags & JOURNAL_DO_FUA_WRITES) == 0) { - - dk_synchronize_t sync_request = { - .options = DK_SYNCHRONIZE_OPTION_BARRIER, - }; - - /* - * If device doesn't support barrier-only flush, or - * the journal is on a different device, use full flush. 
- */ - if (!(jnl->flags & JOURNAL_FEATURE_BARRIER) || (jnl->jdev != jnl->fsdev)) { - sync_request.options = 0; - jnl->flush_counter++; - } - - ret = VNOP_IOCTL(jnl->jdev, DKIOCSYNCHRONIZE, (caddr_t)&sync_request, FWRITE, &context); - } - if (ret != 0) { - // - // Only print this error if it's a different error than the - // previous one, or if it's the first time for this device - // or if the total number of printfs is less than 25. We - // allow for up to 25 printfs to insure that some make it - // into the on-disk syslog. Otherwise if we only printed - // one, it's possible it would never make it to the syslog - // for the root volume and that makes debugging hard. - // - if ( ret != jnl->last_flush_err - || (jnl->flags & JOURNAL_FLUSHCACHE_ERR) == 0 - || num_err_prints++ < 25) { - - printf("jnl: %s: flushing fs disk buffer returned 0x%x\n", jnl->jdev_name, ret); - - jnl->flags |= JOURNAL_FLUSHCACHE_ERR; - jnl->last_flush_err = ret; - } - } - - jnl->jhdr->sequence_num = sequence_num; - jnl->jhdr->checksum = 0; - jnl->jhdr->checksum = calc_checksum((char *)jnl->jhdr, JOURNAL_HEADER_CKSUM_SIZE); - - if (do_journal_io(jnl, &jhdr_offset, jnl->header_buf, jnl->jhdr->jhdr_size, JNL_WRITE|JNL_HEADER) != (size_t)jnl->jhdr->jhdr_size) { - printf("jnl: %s: write_journal_header: error writing the journal header!\n", jnl->jdev_name); - jnl->flags |= JOURNAL_INVALID; - return -1; - } - - // If we're not doing force-unit-access writes, then we - // have to flush after writing the journal header so that - // a future transaction doesn't sneak out to disk before - // the header does and thus overwrite data that the old - // journal header refers to. Saw this exact case happen - // on an IDE bus analyzer with Larry Barras so while it - // may seem obscure, it's not. 
- // - if (updating_start && (jnl->flags & JOURNAL_DO_FUA_WRITES) == 0) { - - dk_synchronize_t sync_request = { - .options = DK_SYNCHRONIZE_OPTION_BARRIER, - }; - - /* - * If device doesn't support barrier-only flush, or - * the journal is on a different device, use full flush. - */ - if (!(jnl->flags & JOURNAL_FEATURE_BARRIER) || (jnl->jdev != jnl->fsdev)) { - sync_request.options = 0; - jnl->flush_counter++; - } - - VNOP_IOCTL(jnl->jdev, DKIOCSYNCHRONIZE, (caddr_t)&sync_request, FWRITE, &context); - } - - return 0; -} - - - -// -// this is a work function used to free up transactions that -// completed. they can't be free'd from buffer_flushed_callback -// because it is called from deep with the disk driver stack -// and thus can't do something that would potentially cause -// paging. it gets called by each of the journal api entry -// points so stuff shouldn't hang around for too long. -// -static void -free_old_stuff(journal *jnl) -{ - transaction *tr, *next; - block_list_header *blhdr=NULL, *next_blhdr=NULL; - - if (jnl->tr_freeme == NULL) - return; - - lock_oldstart(jnl); - tr = jnl->tr_freeme; - jnl->tr_freeme = NULL; - unlock_oldstart(jnl); - - for(; tr; tr=next) { - for (blhdr = tr->blhdr; blhdr; blhdr = next_blhdr) { - next_blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum); - blhdr->binfo[0].bnum = 0xdeadc0de; - - kmem_free(kernel_map, (vm_offset_t)blhdr, tr->tbuffer_size); - - KERNEL_DEBUG(0xbbbbc01c, jnl, tr, tr->tbuffer_size, 0, 0); - } - next = tr->next; - FREE_ZONE(tr, sizeof(transaction), M_JNL_TR); - } -} - - - -// -// This is our callback that lets us know when a buffer has been -// flushed to disk. It's called from deep within the driver stack -// and thus is quite limited in what it can do. Notably, it can -// not initiate any new i/o's or allocate/free memory. 
-// -static void -buffer_flushed_callback(struct buf *bp, void *arg) -{ - transaction *tr; - journal *jnl; - transaction *ctr, *prev=NULL, *next; - size_t i; - int bufsize, amt_flushed, total_bytes; - - - //printf("jnl: buf flush: bp @ 0x%x l/blkno %qd/%qd vp 0x%x tr @ 0x%x\n", - // bp, buf_lblkno(bp), buf_blkno(bp), buf_vnode(bp), arg); - - // snarf out the bits we want - bufsize = buf_size(bp); - tr = (transaction *)arg; - - // then we've already seen it - if (tr == NULL) { - return; - } - - CHECK_TRANSACTION(tr); - - jnl = tr->jnl; - - CHECK_JOURNAL(jnl); - - amt_flushed = tr->num_killed; - total_bytes = tr->total_bytes; - - // update the number of blocks that have been flushed. - // this buf may represent more than one block so take - // that into account. - // - // OSAddAtomic() returns the value of tr->num_flushed before the add - // - amt_flushed += OSAddAtomic(bufsize, &tr->num_flushed); - - - // if this transaction isn't done yet, just return as - // there is nothing to do. - // - // NOTE: we are careful to not reference anything through - // the tr pointer after doing the OSAddAtomic(). if - // this if statement fails then we are the last one - // and then it's ok to dereference "tr". - // - if ((amt_flushed + bufsize) < total_bytes) { - return; - } - - // this will single thread checking the transaction - lock_oldstart(jnl); - - if (tr->total_bytes == (int)0xfbadc0de) { - // then someone beat us to it... 
- unlock_oldstart(jnl); - return; - } - - // mark this so that we're the owner of dealing with the - // cleanup for this transaction - tr->total_bytes = 0xfbadc0de; - - if (jnl->flags & JOURNAL_INVALID) - goto transaction_done; - - //printf("jnl: tr 0x%x (0x%llx 0x%llx) in jnl 0x%x completed.\n", - // tr, tr->journal_start, tr->journal_end, jnl); - - // find this entry in the old_start[] index and mark it completed - for(i = 0; i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0]); i++) { - - if ((off_t)(jnl->old_start[i] & ~(0x8000000000000000ULL)) == tr->journal_start) { - jnl->old_start[i] &= ~(0x8000000000000000ULL); - break; - } - } - - if (i >= sizeof(jnl->old_start)/sizeof(jnl->old_start[0])) { - panic("jnl: buffer_flushed: did not find tr w/start @ %lld (tr %p, jnl %p)\n", - tr->journal_start, tr, jnl); - } - - - // if we are here then we need to update the journal header - // to reflect that this transaction is complete - if (tr->journal_start == jnl->active_start) { - jnl->active_start = tr->journal_end; - tr->journal_start = tr->journal_end = (off_t)0; - } - - // go through the completed_trs list and try to coalesce - // entries, restarting back at the beginning if we have to. 
- for (ctr = jnl->completed_trs; ctr; prev=ctr, ctr=next) { - if (ctr->journal_start == jnl->active_start) { - jnl->active_start = ctr->journal_end; - if (prev) { - prev->next = ctr->next; - } - if (ctr == jnl->completed_trs) { - jnl->completed_trs = ctr->next; - } - - next = jnl->completed_trs; // this starts us over again - ctr->next = jnl->tr_freeme; - jnl->tr_freeme = ctr; - ctr = NULL; - } else if (tr->journal_end == ctr->journal_start) { - ctr->journal_start = tr->journal_start; - next = jnl->completed_trs; // this starts us over again - ctr = NULL; - tr->journal_start = tr->journal_end = (off_t)0; - } else if (tr->journal_start == ctr->journal_end) { - ctr->journal_end = tr->journal_end; - next = ctr->next; - tr->journal_start = tr->journal_end = (off_t)0; - } else if (ctr->next && ctr->journal_end == ctr->next->journal_start) { - // coalesce the next entry with this one and link the next - // entry in at the head of the tr_freeme list - next = ctr->next; // temporarily use the "next" variable - ctr->journal_end = next->journal_end; - ctr->next = next->next; - next->next = jnl->tr_freeme; // link in the next guy at the head of the tr_freeme list - jnl->tr_freeme = next; - - next = jnl->completed_trs; // this starts us over again - ctr = NULL; - } else { - next = ctr->next; - } - } - - // if this is true then we didn't merge with anyone - // so link ourselves in at the head of the completed - // transaction list. - if (tr->journal_start != 0) { - // put this entry into the correct sorted place - // in the list instead of just at the head. 
- // - - prev = NULL; - for (ctr = jnl->completed_trs; ctr && tr->journal_start > ctr->journal_start; prev=ctr, ctr=ctr->next) { - // just keep looping - } - - if (ctr == NULL && prev == NULL) { - jnl->completed_trs = tr; - tr->next = NULL; - } else if (ctr == jnl->completed_trs) { - tr->next = jnl->completed_trs; - jnl->completed_trs = tr; - } else { - tr->next = prev->next; - prev->next = tr; - } - } else { - // if we're here this tr got merged with someone else so - // put it on the list to be free'd - tr->next = jnl->tr_freeme; - jnl->tr_freeme = tr; - } -transaction_done: - unlock_oldstart(jnl); - - unlock_condition(jnl, &jnl->asyncIO); -} - - -#include - -#define SWAP16(x) OSSwapInt16(x) -#define SWAP32(x) OSSwapInt32(x) -#define SWAP64(x) OSSwapInt64(x) - - -static void -swap_journal_header(journal *jnl) -{ - jnl->jhdr->magic = SWAP32(jnl->jhdr->magic); - jnl->jhdr->endian = SWAP32(jnl->jhdr->endian); - jnl->jhdr->start = SWAP64(jnl->jhdr->start); - jnl->jhdr->end = SWAP64(jnl->jhdr->end); - jnl->jhdr->size = SWAP64(jnl->jhdr->size); - jnl->jhdr->blhdr_size = SWAP32(jnl->jhdr->blhdr_size); - jnl->jhdr->checksum = SWAP32(jnl->jhdr->checksum); - jnl->jhdr->jhdr_size = SWAP32(jnl->jhdr->jhdr_size); - jnl->jhdr->sequence_num = SWAP32(jnl->jhdr->sequence_num); -} - -static void -swap_block_list_header(journal *jnl, block_list_header *blhdr) -{ - int i; - - blhdr->max_blocks = SWAP16(blhdr->max_blocks); - blhdr->num_blocks = SWAP16(blhdr->num_blocks); - blhdr->bytes_used = SWAP32(blhdr->bytes_used); - blhdr->checksum = SWAP32(blhdr->checksum); - blhdr->flags = SWAP32(blhdr->flags); - - if (blhdr->num_blocks >= ((jnl->jhdr->blhdr_size / sizeof(block_info)) - 1)) { - printf("jnl: %s: blhdr num blocks looks suspicious (%d / blhdr size %d). 
not swapping.\n", jnl->jdev_name, blhdr->num_blocks, jnl->jhdr->blhdr_size); - return; - } - - for(i = 0; i < blhdr->num_blocks; i++) { - blhdr->binfo[i].bnum = SWAP64(blhdr->binfo[i].bnum); - blhdr->binfo[i].u.bi.bsize = SWAP32(blhdr->binfo[i].u.bi.bsize); - blhdr->binfo[i].u.bi.b.cksum = SWAP32(blhdr->binfo[i].u.bi.b.cksum); - } -} - - -static int -update_fs_block(journal *jnl, void *block_ptr, off_t fs_block, size_t bsize) -{ - int ret; - struct buf *oblock_bp=NULL; - boolean_t was_vm_privileged = FALSE; - - - // first read the block we want. - ret = buf_meta_bread(jnl->fsdev, (daddr64_t)fs_block, bsize, NOCRED, &oblock_bp); - if (ret != 0) { - printf("jnl: %s: update_fs_block: error reading fs block # %lld! (ret %d)\n", jnl->jdev_name, fs_block, ret); - - if (oblock_bp) { - buf_brelse(oblock_bp); - oblock_bp = NULL; - } - - // let's try to be aggressive here and just re-write the block - oblock_bp = buf_getblk(jnl->fsdev, (daddr64_t)fs_block, bsize, 0, 0, BLK_META); - if (oblock_bp == NULL) { - printf("jnl: %s: update_fs_block: buf_getblk() for %lld failed! failing update.\n", jnl->jdev_name, fs_block); - return -1; - } - } - - // make sure it's the correct size. - if (buf_size(oblock_bp) != bsize) { - buf_brelse(oblock_bp); - return -1; - } - - // copy the journal data over top of it - memcpy((char *)buf_dataptr(oblock_bp), block_ptr, bsize); - - if (jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) { - /* - * if we block waiting for memory, and there is enough pressure to - * cause us to try and create a new swap file, we may end up deadlocking - * due to waiting for the journal on the swap file creation path... 
- * by making ourselves vm_privileged, we give ourselves the best chance - * of not blocking - */ - was_vm_privileged = set_vm_privilege(TRUE); - } - ret = VNOP_BWRITE(oblock_bp); - - if ((jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) && (was_vm_privileged == FALSE)) - set_vm_privilege(FALSE); - - if (ret != 0) { - printf("jnl: %s: update_fs_block: failed to update block %lld (ret %d)\n", jnl->jdev_name, fs_block,ret); - return ret; - } - // and now invalidate it so that if someone else wants to read - // it in a different size they'll be able to do it. - ret = buf_meta_bread(jnl->fsdev, (daddr64_t)fs_block, bsize, NOCRED, &oblock_bp); - if (oblock_bp) { - buf_markinvalid(oblock_bp); - buf_brelse(oblock_bp); - } - - return 0; -} - -static int -grow_table(struct bucket **buf_ptr, int num_buckets, int new_size) -{ - struct bucket *newBuf; - int current_size = num_buckets, i; - - // return if newsize is less than the current size - if (new_size < num_buckets) { - return current_size; - } - - if ((MALLOC(newBuf, struct bucket *, new_size*sizeof(struct bucket), M_TEMP, M_WAITOK)) == NULL) { - printf("jnl: grow_table: no memory to expand coalesce buffer!\n"); - return -1; - } - - // printf("jnl: lookup_bucket: expanded co_buf to %d elems\n", new_size); - - // copy existing elements - bcopy(*buf_ptr, newBuf, num_buckets*sizeof(struct bucket)); - - // initialize the new ones - for(i = num_buckets; i < new_size; i++) { - newBuf[i].block_num = (off_t)-1; - } - - // free the old container - FREE(*buf_ptr, M_TEMP); - - // reset the buf_ptr - *buf_ptr = newBuf; - - return new_size; -} - -static int -lookup_bucket(struct bucket **buf_ptr, off_t block_num, int num_full) -{ - int lo, hi, index, matches, i; - - if (num_full == 0) { - return 0; // table is empty, so insert at index=0 - } - - lo = 0; - hi = num_full - 1; - index = -1; - - // perform binary search for block_num - do { - int mid = (hi - lo)/2 + lo; - off_t this_num = (*buf_ptr)[mid].block_num; - - if (block_num == 
this_num) { - index = mid; - break; - } - - if (block_num < this_num) { - hi = mid; - continue; - } - - if (block_num > this_num) { - lo = mid + 1; - continue; - } - } while (lo < hi); - - // check if lo and hi converged on the match - if (block_num == (*buf_ptr)[hi].block_num) { - index = hi; - } - - // if no existing entry found, find index for new one - if (index == -1) { - index = (block_num < (*buf_ptr)[hi].block_num) ? hi : hi + 1; - } else { - // make sure that we return the right-most index in the case of multiple matches - matches = 0; - i = index + 1; - while (i < num_full && block_num == (*buf_ptr)[i].block_num) { - matches++; - i++; - } - - index += matches; - } - - return index; -} - -static int -insert_block(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t num, size_t size, size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr, int overwriting) -{ - if (!overwriting) { - // grow the table if we're out of space - if (*num_full_ptr >= *num_buckets_ptr) { - int new_size = *num_buckets_ptr * 2; - int grow_size = grow_table(buf_ptr, *num_buckets_ptr, new_size); - - if (grow_size < new_size) { - printf("jnl: %s: add_block: grow_table returned an error!\n", jnl->jdev_name); - return -1; - } - - *num_buckets_ptr = grow_size; //update num_buckets to reflect the new size - } - - // if we're not inserting at the end, we need to bcopy - if (blk_index != *num_full_ptr) { - bcopy( (*buf_ptr)+(blk_index), (*buf_ptr)+(blk_index+1), (*num_full_ptr-blk_index)*sizeof(struct bucket) ); - } - - (*num_full_ptr)++; // increment only if we're not overwriting - } - - // sanity check the values we're about to add - if ((off_t)offset >= jnl->jhdr->size) { - offset = jnl->jhdr->jhdr_size + (offset - jnl->jhdr->size); - } - if (size <= 0) { - panic("jnl: insert_block: bad size in insert_block (%zd)\n", size); - } - - (*buf_ptr)[blk_index].block_num = num; - (*buf_ptr)[blk_index].block_size = size; - (*buf_ptr)[blk_index].jnl_offset = offset; - 
(*buf_ptr)[blk_index].cksum = cksum; - - return blk_index; -} - -static int -do_overlap(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t block_num, size_t size, __unused size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr) -{ - int num_to_remove, index, i, overwrite, err; - size_t jhdr_size = jnl->jhdr->jhdr_size, new_offset; - off_t overlap, block_start, block_end; - - block_start = block_num*jhdr_size; - block_end = block_start + size; - overwrite = (block_num == (*buf_ptr)[blk_index].block_num && size >= (*buf_ptr)[blk_index].block_size); - - // first, eliminate any overlap with the previous entry - if (blk_index != 0 && !overwrite) { - off_t prev_block_start = (*buf_ptr)[blk_index-1].block_num*jhdr_size; - off_t prev_block_end = prev_block_start + (*buf_ptr)[blk_index-1].block_size; - overlap = prev_block_end - block_start; - if (overlap > 0) { - if (overlap % jhdr_size != 0) { - panic("jnl: do_overlap: overlap with previous entry not a multiple of %zd\n", jhdr_size); - } - - // if the previous entry completely overlaps this one, we need to break it into two pieces. 
- if (prev_block_end > block_end) { - off_t new_num = block_end / jhdr_size; - size_t new_size = prev_block_end - block_end; - - new_offset = (*buf_ptr)[blk_index-1].jnl_offset + (block_end - prev_block_start); - - err = insert_block(jnl, buf_ptr, blk_index, new_num, new_size, new_offset, cksum, num_buckets_ptr, num_full_ptr, 0); - if (err < 0) { - panic("jnl: do_overlap: error inserting during pre-overlap\n"); - } - } - - // Regardless, we need to truncate the previous entry to the beginning of the overlap - (*buf_ptr)[blk_index-1].block_size = block_start - prev_block_start; - (*buf_ptr)[blk_index-1].cksum = 0; // have to blow it away because there's no way to check it - } - } - - // then, bail out fast if there's no overlap with the entries that follow - if (!overwrite && block_end <= (off_t)((*buf_ptr)[blk_index].block_num*jhdr_size)) { - return 0; // no overlap, no overwrite - } else if (overwrite && (blk_index + 1 >= *num_full_ptr || block_end <= (off_t)((*buf_ptr)[blk_index+1].block_num*jhdr_size))) { - - (*buf_ptr)[blk_index].cksum = cksum; // update this - return 1; // simple overwrite - } - - // Otherwise, find all cases of total and partial overlap. We use the special - // block_num of -2 to designate entries that are completely overlapped and must - // be eliminated. The block_num, size, and jnl_offset of partially overlapped - // entries must be adjusted to keep the array consistent. 
- index = blk_index; - num_to_remove = 0; - while (index < *num_full_ptr && block_end > (off_t)((*buf_ptr)[index].block_num*jhdr_size)) { - if (block_end >= (off_t)(((*buf_ptr)[index].block_num*jhdr_size + (*buf_ptr)[index].block_size))) { - (*buf_ptr)[index].block_num = -2; // mark this for deletion - num_to_remove++; - } else { - overlap = block_end - (*buf_ptr)[index].block_num*jhdr_size; - if (overlap > 0) { - if (overlap % jhdr_size != 0) { - panic("jnl: do_overlap: overlap of %lld is not multiple of %zd\n", overlap, jhdr_size); - } - - // if we partially overlap this entry, adjust its block number, jnl offset, and size - (*buf_ptr)[index].block_num += (overlap / jhdr_size); // make sure overlap is multiple of jhdr_size, or round up - (*buf_ptr)[index].cksum = 0; - - new_offset = (*buf_ptr)[index].jnl_offset + overlap; // check for wrap-around - if ((off_t)new_offset >= jnl->jhdr->size) { - new_offset = jhdr_size + (new_offset - jnl->jhdr->size); - } - (*buf_ptr)[index].jnl_offset = new_offset; - - (*buf_ptr)[index].block_size -= overlap; // sanity check for negative value - if ((*buf_ptr)[index].block_size <= 0) { - panic("jnl: do_overlap: after overlap, new block size is invalid (%u)\n", (*buf_ptr)[index].block_size); - // return -1; // if above panic is removed, return -1 for error - } - } - - } - - index++; - } - - // bcopy over any completely overlapped entries, starting at the right (where the above loop broke out) - index--; // start with the last index used within the above loop - while (index >= blk_index) { - if ((*buf_ptr)[index].block_num == -2) { - if (index == *num_full_ptr-1) { - (*buf_ptr)[index].block_num = -1; // it's the last item in the table... 
just mark as free - } else { - bcopy( (*buf_ptr)+(index+1), (*buf_ptr)+(index), (*num_full_ptr - (index + 1)) * sizeof(struct bucket) ); - } - (*num_full_ptr)--; - } - index--; - } - - // eliminate any stale entries at the end of the table - for(i = *num_full_ptr; i < (*num_full_ptr + num_to_remove); i++) { - (*buf_ptr)[i].block_num = -1; - } - - return 0; // if we got this far, we need to insert the entry into the table (rather than overwrite) -} - -// PR-3105942: Coalesce writes to the same block in journal replay -// We coalesce writes by maintaining a dynamic sorted array of physical disk blocks -// to be replayed and the corresponding location in the journal which contains -// the most recent data for those blocks. The array is "played" once the all the -// blocks in the journal have been coalesced. The code for the case of conflicting/ -// overlapping writes to a single block is the most dense. Because coalescing can -// disrupt the existing time-ordering of blocks in the journal playback, care -// is taken to catch any overlaps and keep the array consistent. -static int -add_block(journal *jnl, struct bucket **buf_ptr, off_t block_num, size_t size, __unused size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr) -{ - int blk_index, overwriting; - - // on return from lookup_bucket(), blk_index is the index into the table where block_num should be - // inserted (or the index of the elem to overwrite). 
- blk_index = lookup_bucket( buf_ptr, block_num, *num_full_ptr); - - // check if the index is within bounds (if we're adding this block to the end of - // the table, blk_index will be equal to num_full) - if (blk_index < 0 || blk_index > *num_full_ptr) { - //printf("jnl: add_block: trouble adding block to co_buf\n"); - return -1; - } // else printf("jnl: add_block: adding block 0x%llx at i=%d\n", block_num, blk_index); - - // Determine whether we're overwriting an existing entry by checking for overlap - overwriting = do_overlap(jnl, buf_ptr, blk_index, block_num, size, offset, cksum, num_buckets_ptr, num_full_ptr); - if (overwriting < 0) { - return -1; // if we got an error, pass it along - } - - // returns the index, or -1 on error - blk_index = insert_block(jnl, buf_ptr, blk_index, block_num, size, offset, cksum, num_buckets_ptr, num_full_ptr, overwriting); - - return blk_index; -} - -static int -replay_journal(journal *jnl) -{ - int i, bad_blocks=0; - unsigned int orig_checksum, checksum, check_block_checksums = 0; - size_t ret; - size_t max_bsize = 0; /* protected by block_ptr */ - block_list_header *blhdr; - off_t offset, txn_start_offset=0, blhdr_offset, orig_jnl_start; - char *buff, *block_ptr=NULL; - struct bucket *co_buf; - int num_buckets = STARTING_BUCKETS, num_full, check_past_jnl_end = 1, in_uncharted_territory=0; - uint32_t last_sequence_num = 0; - int replay_retry_count = 0; - - // wrap the start ptr if it points to the very end of the journal - if (jnl->jhdr->start == jnl->jhdr->size) { - jnl->jhdr->start = jnl->jhdr->jhdr_size; - } - if (jnl->jhdr->end == jnl->jhdr->size) { - jnl->jhdr->end = jnl->jhdr->jhdr_size; - } - - if (jnl->jhdr->start == jnl->jhdr->end) { - return 0; - } - - orig_jnl_start = jnl->jhdr->start; - - // allocate memory for the header_block. 
we'll read each blhdr into this - if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&buff, jnl->jhdr->blhdr_size, VM_KERN_MEMORY_FILE)) { - printf("jnl: %s: replay_journal: no memory for block buffer! (%d bytes)\n", - jnl->jdev_name, jnl->jhdr->blhdr_size); - return -1; - } - - // allocate memory for the coalesce buffer - if ((MALLOC(co_buf, struct bucket *, num_buckets*sizeof(struct bucket), M_TEMP, M_WAITOK)) == NULL) { - printf("jnl: %s: replay_journal: no memory for coalesce buffer!\n", jnl->jdev_name); - return -1; - } - -restart_replay: - - // initialize entries - for(i = 0; i < num_buckets; i++) { - co_buf[i].block_num = -1; - } - num_full = 0; // empty at first - - - printf("jnl: %s: replay_journal: from: %lld to: %lld (joffset 0x%llx)\n", - jnl->jdev_name, jnl->jhdr->start, jnl->jhdr->end, jnl->jdev_offset); - - while (check_past_jnl_end || jnl->jhdr->start != jnl->jhdr->end) { - offset = blhdr_offset = jnl->jhdr->start; - ret = read_journal_data(jnl, &offset, buff, jnl->jhdr->blhdr_size); - if (ret != (size_t)jnl->jhdr->blhdr_size) { - printf("jnl: %s: replay_journal: Could not read block list header block @ 0x%llx!\n", jnl->jdev_name, offset); - bad_blocks = 1; - goto bad_txn_handling; - } - - blhdr = (block_list_header *)buff; - - orig_checksum = blhdr->checksum; - blhdr->checksum = 0; - if (jnl->flags & JOURNAL_NEED_SWAP) { - // calculate the checksum based on the unswapped data - // because it is done byte-at-a-time. 
- orig_checksum = (unsigned int)SWAP32(orig_checksum); - checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE); - swap_block_list_header(jnl, blhdr); - } else { - checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE); - } - - - // - // XXXdbg - if these checks fail, we should replay as much - // we can in the hopes that it will still leave the - // drive in a better state than if we didn't replay - // anything - // - if (checksum != orig_checksum) { - if (check_past_jnl_end && in_uncharted_territory) { - - if (blhdr_offset != jnl->jhdr->end) { - printf("jnl: %s: Extra txn replay stopped @ %lld / 0x%llx\n", jnl->jdev_name, blhdr_offset, blhdr_offset); - } - - check_past_jnl_end = 0; - jnl->jhdr->end = blhdr_offset; - continue; - } - - printf("jnl: %s: replay_journal: bad block list header @ 0x%llx (checksum 0x%x != 0x%x)\n", - jnl->jdev_name, blhdr_offset, orig_checksum, checksum); - - if (blhdr_offset == orig_jnl_start) { - // if there's nothing in the journal at all, just bail out altogether. - goto bad_replay; - } - - bad_blocks = 1; - goto bad_txn_handling; - } - - if ( (last_sequence_num != 0) - && (blhdr->binfo[0].u.bi.b.sequence_num != 0) - && (blhdr->binfo[0].u.bi.b.sequence_num != last_sequence_num) - && (blhdr->binfo[0].u.bi.b.sequence_num != last_sequence_num+1)) { - - txn_start_offset = jnl->jhdr->end = blhdr_offset; - - if (check_past_jnl_end) { - check_past_jnl_end = 0; - printf("jnl: %s: 2: extra replay stopped @ %lld / 0x%llx (seq %d < %d)\n", - jnl->jdev_name, blhdr_offset, blhdr_offset, blhdr->binfo[0].u.bi.b.sequence_num, last_sequence_num); - continue; - } - - printf("jnl: %s: txn sequence numbers out of order in txn @ %lld / %llx! 
(%d < %d)\n", - jnl->jdev_name, blhdr_offset, blhdr_offset, blhdr->binfo[0].u.bi.b.sequence_num, last_sequence_num); - bad_blocks = 1; - goto bad_txn_handling; - } - last_sequence_num = blhdr->binfo[0].u.bi.b.sequence_num; - - if (blhdr_offset >= jnl->jhdr->end && jnl->jhdr->start <= jnl->jhdr->end) { - if (last_sequence_num == 0) { - check_past_jnl_end = 0; - printf("jnl: %s: pre-sequence-num-enabled txn's - can not go further than end (%lld %lld).\n", - jnl->jdev_name, jnl->jhdr->start, jnl->jhdr->end); - if (jnl->jhdr->start != jnl->jhdr->end) { - jnl->jhdr->start = jnl->jhdr->end; - } - continue; - } - printf("jnl: %s: examining extra transactions starting @ %lld / 0x%llx\n", jnl->jdev_name, blhdr_offset, blhdr_offset); - } - - if ( blhdr->max_blocks <= 0 || blhdr->max_blocks > (jnl->jhdr->size/jnl->jhdr->jhdr_size) - || blhdr->num_blocks <= 0 || blhdr->num_blocks > blhdr->max_blocks) { - printf("jnl: %s: replay_journal: bad looking journal entry: max: %d num: %d\n", - jnl->jdev_name, blhdr->max_blocks, blhdr->num_blocks); - bad_blocks = 1; - goto bad_txn_handling; - } - - max_bsize = 0; - for (i = 1; i < blhdr->num_blocks; i++) { - if (blhdr->binfo[i].bnum < 0 && blhdr->binfo[i].bnum != (off_t)-1) { - printf("jnl: %s: replay_journal: bogus block number 0x%llx\n", jnl->jdev_name, blhdr->binfo[i].bnum); - bad_blocks = 1; - goto bad_txn_handling; - } - - if ((size_t)blhdr->binfo[i].u.bi.bsize > max_bsize) { - max_bsize = blhdr->binfo[i].u.bi.bsize; - } - } - - if (blhdr->flags & BLHDR_CHECK_CHECKSUMS) { - check_block_checksums = 1; - if (kmem_alloc(kernel_map, (vm_offset_t *)&block_ptr, max_bsize, VM_KERN_MEMORY_FILE)) { - goto bad_replay; - } - } else { - block_ptr = NULL; - } - - if (blhdr->flags & BLHDR_FIRST_HEADER) { - txn_start_offset = blhdr_offset; - } - - //printf("jnl: replay_journal: adding %d blocks in journal entry @ 0x%llx to co_buf\n", - // blhdr->num_blocks-1, jnl->jhdr->start); - bad_blocks = 0; - for (i = 1; i < blhdr->num_blocks; i++) { - int 
size, ret_val; - off_t number; - - size = blhdr->binfo[i].u.bi.bsize; - number = blhdr->binfo[i].bnum; - - // don't add "killed" blocks - if (number == (off_t)-1) { - //printf("jnl: replay_journal: skipping killed fs block (index %d)\n", i); - } else { - - if (check_block_checksums) { - int32_t disk_cksum; - off_t block_offset; - - block_offset = offset; - - // read the block so we can check the checksum - ret = read_journal_data(jnl, &block_offset, block_ptr, size); - if (ret != (size_t)size) { - printf("jnl: %s: replay_journal: Could not read journal entry data @ offset 0x%llx!\n", jnl->jdev_name, offset); - bad_blocks = 1; - goto bad_txn_handling; - } - - disk_cksum = calc_checksum(block_ptr, size); - - // there is no need to swap the checksum from disk because - // it got swapped when the blhdr was read in. - if (blhdr->binfo[i].u.bi.b.cksum != 0 && disk_cksum != blhdr->binfo[i].u.bi.b.cksum) { - printf("jnl: %s: txn starting at %lld (%lld) @ index %3d bnum %lld (%d) with disk cksum != blhdr cksum (0x%.8x 0x%.8x)\n", - jnl->jdev_name, txn_start_offset, blhdr_offset, i, number, size, disk_cksum, blhdr->binfo[i].u.bi.b.cksum); - printf("jnl: 0x%.8x 0x%.8x 0x%.8x 0x%.8x 0x%.8x 0x%.8x 0x%.8x 0x%.8x\n", - *(int *)&block_ptr[0*sizeof(int)], *(int *)&block_ptr[1*sizeof(int)], *(int *)&block_ptr[2*sizeof(int)], *(int *)&block_ptr[3*sizeof(int)], - *(int *)&block_ptr[4*sizeof(int)], *(int *)&block_ptr[5*sizeof(int)], *(int *)&block_ptr[6*sizeof(int)], *(int *)&block_ptr[7*sizeof(int)]); - - bad_blocks = 1; - goto bad_txn_handling; - } - } - - - // add this bucket to co_buf, coalescing where possible - // printf("jnl: replay_journal: adding block 0x%llx\n", number); - ret_val = add_block(jnl, &co_buf, number, size, (size_t) offset, blhdr->binfo[i].u.bi.b.cksum, &num_buckets, &num_full); - - if (ret_val == -1) { - printf("jnl: %s: replay_journal: trouble adding block to co_buf\n", jnl->jdev_name); - goto bad_replay; - } // else printf("jnl: replay_journal: added block 
0x%llx at i=%d\n", number); - } - - // increment offset - offset += size; - - // check if the last block added puts us off the end of the jnl. - // if so, we need to wrap to the beginning and take any remainder - // into account - // - if (offset >= jnl->jhdr->size) { - offset = jnl->jhdr->jhdr_size + (offset - jnl->jhdr->size); - } - } - - if (block_ptr) { - kmem_free(kernel_map, (vm_offset_t)block_ptr, max_bsize); - block_ptr = NULL; - } - -bad_txn_handling: - if (bad_blocks) { - /* Journal replay got error before it found any valid - * transations, abort replay */ - if (txn_start_offset == 0) { - printf("jnl: %s: no known good txn start offset! aborting journal replay.\n", jnl->jdev_name); - goto bad_replay; - } - - /* Repeated error during journal replay, abort replay */ - if (replay_retry_count == 3) { - printf("jnl: %s: repeated errors replaying journal! aborting journal replay.\n", jnl->jdev_name); - goto bad_replay; - } - replay_retry_count++; - - /* There was an error replaying the journal (possibly - * EIO/ENXIO from the device). So retry replaying all - * the good transactions that we found before getting - * the error. - */ - jnl->jhdr->start = orig_jnl_start; - jnl->jhdr->end = txn_start_offset; - check_past_jnl_end = 0; - last_sequence_num = 0; - printf("jnl: %s: restarting journal replay (%lld - %lld)!\n", jnl->jdev_name, jnl->jhdr->start, jnl->jhdr->end); - goto restart_replay; - } - - jnl->jhdr->start += blhdr->bytes_used; - if (jnl->jhdr->start >= jnl->jhdr->size) { - // wrap around and skip the journal header block - jnl->jhdr->start = (jnl->jhdr->start % jnl->jhdr->size) + jnl->jhdr->jhdr_size; - } - - if (jnl->jhdr->start == jnl->jhdr->end) { - in_uncharted_territory = 1; - } - } - - if (jnl->jhdr->start != jnl->jhdr->end) { - printf("jnl: %s: start %lld != end %lld. 
resetting end.\n", jnl->jdev_name, jnl->jhdr->start, jnl->jhdr->end); - jnl->jhdr->end = jnl->jhdr->start; - } - - //printf("jnl: replay_journal: replaying %d blocks\n", num_full); - - /* - * make sure it's at least one page in size, so - * start max_bsize at PAGE_SIZE - */ - for (i = 0, max_bsize = PAGE_SIZE; i < num_full; i++) { - - if (co_buf[i].block_num == (off_t)-1) - continue; - - if (co_buf[i].block_size > max_bsize) - max_bsize = co_buf[i].block_size; - } - /* - * round max_bsize up to the nearest PAGE_SIZE multiple - */ - if (max_bsize & (PAGE_SIZE - 1)) { - max_bsize = (max_bsize + PAGE_SIZE) & ~(PAGE_SIZE - 1); - } - - if (kmem_alloc(kernel_map, (vm_offset_t *)&block_ptr, max_bsize, VM_KERN_MEMORY_FILE)) { - goto bad_replay; - } - - // Replay the coalesced entries in the co-buf - for(i = 0; i < num_full; i++) { - size_t size = co_buf[i].block_size; - off_t jnl_offset = (off_t) co_buf[i].jnl_offset; - off_t number = co_buf[i].block_num; - - - // printf("replaying co_buf[%d]: block 0x%llx, size 0x%x, jnl_offset 0x%llx\n", i, co_buf[i].block_num, - // co_buf[i].block_size, co_buf[i].jnl_offset); - - if (number == (off_t)-1) { - // printf("jnl: replay_journal: skipping killed fs block\n"); - } else { - - // do journal read, and set the phys. 
block - ret = read_journal_data(jnl, &jnl_offset, block_ptr, size); - if (ret != size) { - printf("jnl: %s: replay_journal: Could not read journal entry data @ offset 0x%llx!\n", jnl->jdev_name, offset); - goto bad_replay; - } - - if (update_fs_block(jnl, block_ptr, number, size) != 0) { - goto bad_replay; - } - } - } - - - // done replaying; update jnl header - if (write_journal_header(jnl, 1, jnl->jhdr->sequence_num) != 0) { - goto bad_replay; - } - - printf("jnl: %s: journal replay done.\n", jnl->jdev_name); - - // free block_ptr - if (block_ptr) { - kmem_free(kernel_map, (vm_offset_t)block_ptr, max_bsize); - block_ptr = NULL; - } - - // free the coalesce buffer - FREE(co_buf, M_TEMP); - co_buf = NULL; - - kmem_free(kernel_map, (vm_offset_t)buff, jnl->jhdr->blhdr_size); - return 0; - -bad_replay: - if (block_ptr) { - kmem_free(kernel_map, (vm_offset_t)block_ptr, max_bsize); - } - if (co_buf) { - FREE(co_buf, M_TEMP); - } - kmem_free(kernel_map, (vm_offset_t)buff, jnl->jhdr->blhdr_size); - - return -1; -} - - -#define DEFAULT_TRANSACTION_BUFFER_SIZE (128*1024) -#define MAX_TRANSACTION_BUFFER_SIZE (3072*1024) - -// XXXdbg - so I can change it in the debugger -int def_tbuffer_size = 0; - - -// -// This function sets the size of the tbuffer and the -// size of the blhdr. It assumes that jnl->jhdr->size -// and jnl->jhdr->jhdr_size are already valid. -// -static void -size_up_tbuffer(journal *jnl, int tbuffer_size, int phys_blksz) -{ - // - // one-time initialization based on how much memory - // there is in the machine. 
- // - if (def_tbuffer_size == 0) { - if (max_mem < (256*1024*1024)) { - def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE; - } else if (max_mem < (512*1024*1024)) { - def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * 2; - } else if (max_mem < (1024*1024*1024)) { - def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * 3; - } else { - def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * (max_mem / (256*1024*1024)); - } - } - - // size up the transaction buffer... can't be larger than the number - // of blocks that can fit in a block_list_header block. - if (tbuffer_size == 0) { - jnl->tbuffer_size = def_tbuffer_size; - } else { - // make sure that the specified tbuffer_size isn't too small - if (tbuffer_size < jnl->jhdr->blhdr_size * 2) { - tbuffer_size = jnl->jhdr->blhdr_size * 2; - } - // and make sure it's an even multiple of the block size - if ((tbuffer_size % jnl->jhdr->jhdr_size) != 0) { - tbuffer_size -= (tbuffer_size % jnl->jhdr->jhdr_size); - } - - jnl->tbuffer_size = tbuffer_size; - } - - if (jnl->tbuffer_size > (jnl->jhdr->size / 2)) { - jnl->tbuffer_size = (jnl->jhdr->size / 2); - } - - if (jnl->tbuffer_size > MAX_TRANSACTION_BUFFER_SIZE) { - jnl->tbuffer_size = MAX_TRANSACTION_BUFFER_SIZE; - } - - jnl->jhdr->blhdr_size = (jnl->tbuffer_size / jnl->jhdr->jhdr_size) * sizeof(block_info); - if (jnl->jhdr->blhdr_size < phys_blksz) { - jnl->jhdr->blhdr_size = phys_blksz; - } else if ((jnl->jhdr->blhdr_size % phys_blksz) != 0) { - // have to round up so we're an even multiple of the physical block size - jnl->jhdr->blhdr_size = (jnl->jhdr->blhdr_size + (phys_blksz - 1)) & ~(phys_blksz - 1); - } -} - -static void -get_io_info(struct vnode *devvp, size_t phys_blksz, journal *jnl, struct vfs_context *context) -{ - off_t readblockcnt; - off_t writeblockcnt; - off_t readmaxcnt=0, tmp_readmaxcnt; - off_t writemaxcnt=0, tmp_writemaxcnt; - off_t readsegcnt, writesegcnt; - int32_t features; - - if (VNOP_IOCTL(devvp, DKIOCGETFEATURES, (caddr_t)&features, 0, context) 
== 0) { - if (features & DK_FEATURE_FORCE_UNIT_ACCESS) { - const char *name = vnode_getname_printable(devvp); - jnl->flags |= JOURNAL_DO_FUA_WRITES; - printf("jnl: %s: enabling FUA writes (features 0x%x)\n", name, features); - vnode_putname_printable(name); - } - if (features & DK_FEATURE_UNMAP) { - jnl->flags |= JOURNAL_USE_UNMAP; - } - - if (features & DK_FEATURE_BARRIER) { - jnl->flags |= JOURNAL_FEATURE_BARRIER; - } - } - - // - // First check the max read size via several different mechanisms... - // - VNOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTREAD, (caddr_t)&readmaxcnt, 0, context); - - if (VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTREAD, (caddr_t)&readblockcnt, 0, context) == 0) { - tmp_readmaxcnt = readblockcnt * phys_blksz; - if (readmaxcnt == 0 || (readblockcnt > 0 && tmp_readmaxcnt < readmaxcnt)) { - readmaxcnt = tmp_readmaxcnt; - } - } - - if (VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTREAD, (caddr_t)&readsegcnt, 0, context)) { - readsegcnt = 0; - } - - if (readsegcnt > 0 && (readsegcnt * PAGE_SIZE) < readmaxcnt) { - readmaxcnt = readsegcnt * PAGE_SIZE; - } - - if (readmaxcnt == 0) { - readmaxcnt = 128 * 1024; - } else if (readmaxcnt > UINT32_MAX) { - readmaxcnt = UINT32_MAX; - } - - - // - // Now check the max writes size via several different mechanisms... 
- // - VNOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTWRITE, (caddr_t)&writemaxcnt, 0, context); - - if (VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTWRITE, (caddr_t)&writeblockcnt, 0, context) == 0) { - tmp_writemaxcnt = writeblockcnt * phys_blksz; - if (writemaxcnt == 0 || (writeblockcnt > 0 && tmp_writemaxcnt < writemaxcnt)) { - writemaxcnt = tmp_writemaxcnt; - } - } - - if (VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTWRITE, (caddr_t)&writesegcnt, 0, context)) { - writesegcnt = 0; - } - - if (writesegcnt > 0 && (writesegcnt * PAGE_SIZE) < writemaxcnt) { - writemaxcnt = writesegcnt * PAGE_SIZE; - } - - if (writemaxcnt == 0) { - writemaxcnt = 128 * 1024; - } else if (writemaxcnt > UINT32_MAX) { - writemaxcnt = UINT32_MAX; - } - - jnl->max_read_size = readmaxcnt; - jnl->max_write_size = writemaxcnt; - // printf("jnl: %s: max read/write: %lld k / %lld k\n", - // jnl->jdev_name ? jnl->jdev_name : "unknown", - // jnl->max_read_size/1024, jnl->max_write_size/1024); -} - - -journal * -journal_create(struct vnode *jvp, - off_t offset, - off_t journal_size, - struct vnode *fsvp, - size_t min_fs_blksz, - int32_t flags, - int32_t tbuffer_size, - void (*flush)(void *arg), - void *arg, - struct mount *fsmount) -{ - journal *jnl; - uint32_t phys_blksz, new_txn_base; - u_int32_t min_size; - struct vfs_context context; - const char *jdev_name; - /* - * Cap the journal max size to 2GB. On HFS, it will attempt to occupy - * a full allocation block if the current size is smaller than the allocation - * block on which it resides. Once we hit the exabyte filesystem range, then - * it will use 2GB allocation blocks. As a result, make the cap 2GB. - */ - context.vc_thread = current_thread(); - context.vc_ucred = FSCRED; - - jdev_name = vnode_getname_printable(jvp); - - /* Get the real physical block size. 
*/ - if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, &context)) { - goto cleanup_jdev_name; - } - - if (journal_size < (256*1024) || journal_size > (MAX_JOURNAL_SIZE)) { - printf("jnl: %s: create: journal size %lld looks bogus.\n", jdev_name, journal_size); - goto cleanup_jdev_name; - } - - min_size = phys_blksz * (phys_blksz / sizeof(block_info)); - /* Reject journals that are too small given the sector size of the device */ - if (journal_size < min_size) { - printf("jnl: %s: create: journal size (%lld) too small given sector size of (%u)\n", - jdev_name, journal_size, phys_blksz); - goto cleanup_jdev_name; - } - - if (phys_blksz > min_fs_blksz) { - printf("jnl: %s: create: error: phys blksize %u bigger than min fs blksize %zd\n", - jdev_name, phys_blksz, min_fs_blksz); - goto cleanup_jdev_name; - } - - if ((journal_size % phys_blksz) != 0) { - printf("jnl: %s: create: journal size 0x%llx is not an even multiple of block size 0x%ux\n", - jdev_name, journal_size, phys_blksz); - goto cleanup_jdev_name; - } - - - MALLOC_ZONE(jnl, struct journal *, sizeof(struct journal), M_JNL_JNL, M_WAITOK); - memset(jnl, 0, sizeof(*jnl)); - - jnl->jdev = jvp; - jnl->jdev_offset = offset; - jnl->fsdev = fsvp; - jnl->flush = flush; - jnl->flush_arg = arg; - jnl->flags = (flags & JOURNAL_OPTION_FLAGS_MASK); - jnl->jdev_name = jdev_name; - lck_mtx_init(&jnl->old_start_lock, jnl_mutex_group, jnl_lock_attr); - - // Keep a point to the mount around for use in IO throttling. - jnl->fsmount = fsmount; - // XXX: This lock discipline looks correct based on dounmount(), but it - // doesn't seem to be documented anywhere. 
- mount_ref(fsmount, 0); - - get_io_info(jvp, phys_blksz, jnl, &context); - - if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&jnl->header_buf, phys_blksz, VM_KERN_MEMORY_FILE)) { - printf("jnl: %s: create: could not allocate space for header buffer (%u bytes)\n", jdev_name, phys_blksz); - goto bad_kmem_alloc; - } - jnl->header_buf_size = phys_blksz; - - jnl->jhdr = (journal_header *)jnl->header_buf; - memset(jnl->jhdr, 0, sizeof(journal_header)); - - // we have to set this up here so that do_journal_io() will work - jnl->jhdr->jhdr_size = phys_blksz; - - // - // We try and read the journal header to see if there is already one - // out there. If there is, it's possible that it has transactions - // in it that we might replay if we happen to pick a sequence number - // that is a little less than the old one, there is a crash and the - // last txn written ends right at the start of a txn from the previous - // incarnation of this file system. If all that happens we would - // replay the transactions from the old file system and that would - // destroy your disk. Although it is extremely unlikely for all those - // conditions to happen, the probability is non-zero and the result is - // severe - you lose your file system. Therefore if we find a valid - // journal header and the sequence number is non-zero we write junk - // over the entire journal so that there is no way we will encounter - // any old transactions. This is slow but should be a rare event - // since most tools erase the journal. 
- // - if ( read_journal_header(jnl, jnl->jhdr, phys_blksz) == phys_blksz - && jnl->jhdr->magic == JOURNAL_HEADER_MAGIC - && jnl->jhdr->sequence_num != 0) { - - new_txn_base = (jnl->jhdr->sequence_num + (journal_size / phys_blksz) + (random() % 16384)) & 0x00ffffff; - printf("jnl: %s: create: avoiding old sequence number 0x%x (0x%x)\n", jdev_name, jnl->jhdr->sequence_num, new_txn_base); - -#if 0 - int i; - off_t pos=0; - - for(i = 1; i < journal_size / phys_blksz; i++) { - pos = i*phys_blksz; - - // we don't really care what data we write just so long - // as it's not a valid transaction header. since we have - // the header_buf sitting around we'll use that. - write_journal_data(jnl, &pos, jnl->header_buf, phys_blksz); - } - printf("jnl: create: done clearing journal (i=%d)\n", i); -#endif - } else { - new_txn_base = random() & 0x00ffffff; - } - - memset(jnl->header_buf, 0, phys_blksz); - - jnl->jhdr->magic = JOURNAL_HEADER_MAGIC; - jnl->jhdr->endian = ENDIAN_MAGIC; - jnl->jhdr->start = phys_blksz; // start at block #1, block #0 is for the jhdr itself - jnl->jhdr->end = phys_blksz; - jnl->jhdr->size = journal_size; - jnl->jhdr->jhdr_size = phys_blksz; - size_up_tbuffer(jnl, tbuffer_size, phys_blksz); - - jnl->active_start = jnl->jhdr->start; - - // XXXdbg - for testing you can force the journal to wrap around - // jnl->jhdr->start = jnl->jhdr->size - (phys_blksz*3); - // jnl->jhdr->end = jnl->jhdr->size - (phys_blksz*3); - - jnl->jhdr->sequence_num = new_txn_base; - - lck_mtx_init(&jnl->jlock, jnl_mutex_group, jnl_lock_attr); - lck_mtx_init(&jnl->flock, jnl_mutex_group, jnl_lock_attr); - lck_rw_init(&jnl->trim_lock, jnl_mutex_group, jnl_lock_attr); - - - jnl->flushing = FALSE; - jnl->asyncIO = FALSE; - jnl->flush_aborted = FALSE; - jnl->writing_header = FALSE; - jnl->async_trim = NULL; - jnl->sequence_num = jnl->jhdr->sequence_num; - - if (write_journal_header(jnl, 1, jnl->jhdr->sequence_num) != 0) { - printf("jnl: %s: journal_create: failed to write journal 
header.\n", jdev_name); - goto bad_write; - } - - goto journal_create_complete; - - -bad_write: - kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, phys_blksz); -bad_kmem_alloc: - jnl->jhdr = NULL; - FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL); - mount_drop(fsmount, 0); -cleanup_jdev_name: - vnode_putname_printable(jdev_name); - jnl = NULL; -journal_create_complete: - return jnl; -} - - -journal * -journal_open(struct vnode *jvp, - off_t offset, - off_t journal_size, - struct vnode *fsvp, - size_t min_fs_blksz, - int32_t flags, - int32_t tbuffer_size, - void (*flush)(void *arg), - void *arg, - struct mount *fsmount) -{ - journal *jnl; - uint32_t orig_blksz=0; - uint32_t phys_blksz; - u_int32_t min_size = 0; - int orig_checksum, checksum; - struct vfs_context context; - const char *jdev_name = vnode_getname_printable(jvp); - - context.vc_thread = current_thread(); - context.vc_ucred = FSCRED; - - /* Get the real physical block size. */ - if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, &context)) { - goto cleanup_jdev_name; - } - - if (phys_blksz > min_fs_blksz) { - printf("jnl: %s: open: error: phys blksize %u bigger than min fs blksize %zd\n", - jdev_name, phys_blksz, min_fs_blksz); - goto cleanup_jdev_name; - } - - if (journal_size < (256*1024) || journal_size > (1024*1024*1024)) { - printf("jnl: %s: open: journal size %lld looks bogus.\n", jdev_name, journal_size); - goto cleanup_jdev_name; - } - - min_size = phys_blksz * (phys_blksz / sizeof(block_info)); - /* Reject journals that are too small given the sector size of the device */ - if (journal_size < min_size) { - printf("jnl: %s: open: journal size (%lld) too small given sector size of (%u)\n", - jdev_name, journal_size, phys_blksz); - goto cleanup_jdev_name; - } - - if ((journal_size % phys_blksz) != 0) { - printf("jnl: %s: open: journal size 0x%llx is not an even multiple of block size 0x%x\n", - jdev_name, journal_size, phys_blksz); - goto cleanup_jdev_name; - } - - 
MALLOC_ZONE(jnl, struct journal *, sizeof(struct journal), M_JNL_JNL, M_WAITOK); - memset(jnl, 0, sizeof(*jnl)); - - jnl->jdev = jvp; - jnl->jdev_offset = offset; - jnl->fsdev = fsvp; - jnl->flush = flush; - jnl->flush_arg = arg; - jnl->flags = (flags & JOURNAL_OPTION_FLAGS_MASK); - jnl->jdev_name = jdev_name; - lck_mtx_init(&jnl->old_start_lock, jnl_mutex_group, jnl_lock_attr); - - /* We need a reference to the mount to later pass to the throttling code for - * IO accounting. - */ - jnl->fsmount = fsmount; - mount_ref(fsmount, 0); - - get_io_info(jvp, phys_blksz, jnl, &context); - - if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&jnl->header_buf, phys_blksz, VM_KERN_MEMORY_FILE)) { - printf("jnl: %s: create: could not allocate space for header buffer (%u bytes)\n", jdev_name, phys_blksz); - goto bad_kmem_alloc; - } - jnl->header_buf_size = phys_blksz; - - jnl->jhdr = (journal_header *)jnl->header_buf; - memset(jnl->jhdr, 0, sizeof(journal_header)); - - // we have to set this up here so that do_journal_io() will work - jnl->jhdr->jhdr_size = phys_blksz; - - if (read_journal_header(jnl, jnl->jhdr, phys_blksz) != phys_blksz) { - printf("jnl: %s: open: could not read %u bytes for the journal header.\n", - jdev_name, phys_blksz); - goto bad_journal; - } - - orig_checksum = jnl->jhdr->checksum; - jnl->jhdr->checksum = 0; - - if (jnl->jhdr->magic == SWAP32(JOURNAL_HEADER_MAGIC)) { - // do this before the swap since it's done byte-at-a-time - orig_checksum = SWAP32(orig_checksum); - checksum = calc_checksum((char *)jnl->jhdr, JOURNAL_HEADER_CKSUM_SIZE); - swap_journal_header(jnl); - jnl->flags |= JOURNAL_NEED_SWAP; - } else { - checksum = calc_checksum((char *)jnl->jhdr, JOURNAL_HEADER_CKSUM_SIZE); - } - - if (jnl->jhdr->magic != JOURNAL_HEADER_MAGIC && jnl->jhdr->magic != OLD_JOURNAL_HEADER_MAGIC) { - printf("jnl: %s: open: journal magic is bad (0x%x != 0x%x)\n", - jnl->jdev_name, jnl->jhdr->magic, JOURNAL_HEADER_MAGIC); - goto bad_journal; - } - - // only check if 
we're the current journal header magic value - if (jnl->jhdr->magic == JOURNAL_HEADER_MAGIC) { - - if (orig_checksum != checksum) { - printf("jnl: %s: open: journal checksum is bad (0x%x != 0x%x)\n", - jdev_name, orig_checksum, checksum); - - //goto bad_journal; - } - } - - // XXXdbg - convert old style magic numbers to the new one - if (jnl->jhdr->magic == OLD_JOURNAL_HEADER_MAGIC) { - jnl->jhdr->magic = JOURNAL_HEADER_MAGIC; - } - - if (phys_blksz != (size_t)jnl->jhdr->jhdr_size && jnl->jhdr->jhdr_size != 0) { - /* - * The volume has probably been resized (such that we had to adjust the - * logical sector size), or copied to media with a different logical - * sector size. - * - * Temporarily change the device's logical block size to match the - * journal's header size. This will allow us to replay the journal - * safely. If the replay succeeds, we will update the journal's header - * size (later in this function). - */ - orig_blksz = phys_blksz; - phys_blksz = jnl->jhdr->jhdr_size; - VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&phys_blksz, FWRITE, &context); - printf("jnl: %s: open: temporarily switched block size from %u to %u\n", - jdev_name, orig_blksz, phys_blksz); - } - - if ( jnl->jhdr->start <= 0 - || jnl->jhdr->start > jnl->jhdr->size - || jnl->jhdr->start > 1024*1024*1024) { - printf("jnl: %s: open: jhdr start looks bad (0x%llx max size 0x%llx)\n", - jdev_name, jnl->jhdr->start, jnl->jhdr->size); - goto bad_journal; - } - - if ( jnl->jhdr->end <= 0 - || jnl->jhdr->end > jnl->jhdr->size - || jnl->jhdr->end > 1024*1024*1024) { - printf("jnl: %s: open: jhdr end looks bad (0x%llx max size 0x%llx)\n", - jdev_name, jnl->jhdr->end, jnl->jhdr->size); - goto bad_journal; - } - - if (jnl->jhdr->size < (256*1024) || jnl->jhdr->size > 1024*1024*1024) { - printf("jnl: %s: open: jhdr size looks bad (0x%llx)\n", jdev_name, jnl->jhdr->size); - goto bad_journal; - } - -// XXXdbg - can't do these checks because hfs writes all kinds of -// non-uniform sized blocks even on 
devices that have a block size -// that is larger than 512 bytes (i.e. optical media w/2k blocks). -// therefore these checks will fail and so we just have to punt and -// do more relaxed checking... -// XXXdbg if ((jnl->jhdr->start % jnl->jhdr->jhdr_size) != 0) { - if ((jnl->jhdr->start % 512) != 0) { - printf("jnl: %s: open: journal start (0x%llx) not a multiple of 512?\n", - jdev_name, jnl->jhdr->start); - goto bad_journal; - } - -//XXXdbg if ((jnl->jhdr->end % jnl->jhdr->jhdr_size) != 0) { - if ((jnl->jhdr->end % 512) != 0) { - printf("jnl: %s: open: journal end (0x%llx) not a multiple of block size (0x%x)?\n", - jdev_name, jnl->jhdr->end, jnl->jhdr->jhdr_size); - goto bad_journal; - } - - // take care of replaying the journal if necessary - if (flags & JOURNAL_RESET) { - printf("jnl: %s: journal start/end pointers reset! (s 0x%llx e 0x%llx)\n", - jdev_name, jnl->jhdr->start, jnl->jhdr->end); - jnl->jhdr->start = jnl->jhdr->end; - } else if (replay_journal(jnl) != 0) { - printf("jnl: %s: journal_open: Error replaying the journal!\n", jdev_name); - goto bad_journal; - } - - /* - * When we get here, we know that the journal is empty (jnl->jhdr->start == - * jnl->jhdr->end). If the device's logical block size was different from - * the journal's header size, then we can now restore the device's logical - * block size and update the journal's header size to match. - * - * Note that we also adjust the journal's start and end so that they will - * be aligned on the new block size. We pick a new sequence number to - * avoid any problems if a replay found previous transactions using the old - * journal header size. (See the comments in journal_create(), above.) 
- */ - - if (orig_blksz != 0) { - VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&orig_blksz, FWRITE, &context); - phys_blksz = orig_blksz; - - orig_blksz = 0; - - jnl->jhdr->jhdr_size = phys_blksz; - jnl->jhdr->start = phys_blksz; - jnl->jhdr->end = phys_blksz; - jnl->jhdr->sequence_num = (jnl->jhdr->sequence_num + - (journal_size / phys_blksz) + - (random() % 16384)) & 0x00ffffff; - - if (write_journal_header(jnl, 1, jnl->jhdr->sequence_num)) { - printf("jnl: %s: open: failed to update journal header size\n", jdev_name); - goto bad_journal; - } - } - - // make sure this is in sync! - jnl->active_start = jnl->jhdr->start; - jnl->sequence_num = jnl->jhdr->sequence_num; - - // set this now, after we've replayed the journal - size_up_tbuffer(jnl, tbuffer_size, phys_blksz); - - // TODO: Does this need to change if the device's logical block size changed? - if ((off_t)(jnl->jhdr->blhdr_size/sizeof(block_info)-1) > (jnl->jhdr->size/jnl->jhdr->jhdr_size)) { - printf("jnl: %s: open: jhdr size and blhdr size are not compatible (0x%llx, %d, %d)\n", jdev_name, jnl->jhdr->size, - jnl->jhdr->blhdr_size, jnl->jhdr->jhdr_size); - goto bad_journal; - } - - lck_mtx_init(&jnl->jlock, jnl_mutex_group, jnl_lock_attr); - lck_mtx_init(&jnl->flock, jnl_mutex_group, jnl_lock_attr); - lck_rw_init(&jnl->trim_lock, jnl_mutex_group, jnl_lock_attr); - - goto journal_open_complete; - -bad_journal: - if (orig_blksz != 0) { - phys_blksz = orig_blksz; - VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&orig_blksz, FWRITE, &context); - printf("jnl: %s: open: restored block size after error\n", jdev_name); - } - kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, phys_blksz); -bad_kmem_alloc: - FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL); - mount_drop(fsmount, 0); -cleanup_jdev_name: - vnode_putname_printable(jdev_name); - jnl = NULL; -journal_open_complete: - return jnl; -} - - -int -journal_is_clean(struct vnode *jvp, - off_t offset, - off_t journal_size, - struct vnode *fsvp, - size_t 
min_fs_block_size) -{ - journal jnl; - uint32_t phys_blksz; - int ret; - int orig_checksum, checksum; - struct vfs_context context; - const char *jdev_name = vnode_getname_printable(jvp); - - context.vc_thread = current_thread(); - context.vc_ucred = FSCRED; - - /* Get the real physical block size. */ - if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, &context)) { - printf("jnl: %s: is_clean: failed to get device block size.\n", jdev_name); - ret = EINVAL; - goto cleanup_jdev_name; - } - - if (phys_blksz > (uint32_t)min_fs_block_size) { - printf("jnl: %s: is_clean: error: phys blksize %d bigger than min fs blksize %zd\n", - jdev_name, phys_blksz, min_fs_block_size); - ret = EINVAL; - goto cleanup_jdev_name; - } - - if (journal_size < (256*1024) || journal_size > (MAX_JOURNAL_SIZE)) { - printf("jnl: %s: is_clean: journal size %lld looks bogus.\n", jdev_name, journal_size); - ret = EINVAL; - goto cleanup_jdev_name; - } - - if ((journal_size % phys_blksz) != 0) { - printf("jnl: %s: is_clean: journal size 0x%llx is not an even multiple of block size 0x%x\n", - jdev_name, journal_size, phys_blksz); - ret = EINVAL; - goto cleanup_jdev_name; - } - - memset(&jnl, 0, sizeof(jnl)); - - if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&jnl.header_buf, phys_blksz, VM_KERN_MEMORY_FILE)) { - printf("jnl: %s: is_clean: could not allocate space for header buffer (%d bytes)\n", jdev_name, phys_blksz); - ret = ENOMEM; - goto cleanup_jdev_name; - } - jnl.header_buf_size = phys_blksz; - - get_io_info(jvp, phys_blksz, &jnl, &context); - - jnl.jhdr = (journal_header *)jnl.header_buf; - memset(jnl.jhdr, 0, sizeof(journal_header)); - - jnl.jdev = jvp; - jnl.jdev_offset = offset; - jnl.fsdev = fsvp; - - // we have to set this up here so that do_journal_io() will work - jnl.jhdr->jhdr_size = phys_blksz; - - if (read_journal_header(&jnl, jnl.jhdr, phys_blksz) != (unsigned)phys_blksz) { - printf("jnl: %s: is_clean: could not read %d bytes for the journal header.\n", - 
jdev_name, phys_blksz); - ret = EINVAL; - goto get_out; - } - - orig_checksum = jnl.jhdr->checksum; - jnl.jhdr->checksum = 0; - - if (jnl.jhdr->magic == SWAP32(JOURNAL_HEADER_MAGIC)) { - // do this before the swap since it's done byte-at-a-time - orig_checksum = SWAP32(orig_checksum); - checksum = calc_checksum((char *)jnl.jhdr, JOURNAL_HEADER_CKSUM_SIZE); - swap_journal_header(&jnl); - jnl.flags |= JOURNAL_NEED_SWAP; - } else { - checksum = calc_checksum((char *)jnl.jhdr, JOURNAL_HEADER_CKSUM_SIZE); - } - - if (jnl.jhdr->magic != JOURNAL_HEADER_MAGIC && jnl.jhdr->magic != OLD_JOURNAL_HEADER_MAGIC) { - printf("jnl: %s: is_clean: journal magic is bad (0x%x != 0x%x)\n", - jdev_name, jnl.jhdr->magic, JOURNAL_HEADER_MAGIC); - ret = EINVAL; - goto get_out; - } - - if (orig_checksum != checksum) { - printf("jnl: %s: is_clean: journal checksum is bad (0x%x != 0x%x)\n", jdev_name, orig_checksum, checksum); - ret = EINVAL; - goto get_out; - } - - // - // if the start and end are equal then the journal is clean. - // otherwise it's not clean and therefore an error. - // - if (jnl.jhdr->start == jnl.jhdr->end) { - ret = 0; - } else { - ret = EBUSY; // so the caller can differentiate an invalid journal from a "busy" one - } - -get_out: - kmem_free(kernel_map, (vm_offset_t)jnl.header_buf, phys_blksz); -cleanup_jdev_name: - vnode_putname_printable(jdev_name); - return ret; -} - - -void -journal_close(journal *jnl) -{ - volatile off_t *start, *end; - int counter=0; - - CHECK_JOURNAL(jnl); - - // set this before doing anything that would block so that - // we start tearing things down properly. 
- // - jnl->flags |= JOURNAL_CLOSE_PENDING; - - if (jnl->owner != current_thread()) { - journal_lock(jnl); - } - - wait_condition(jnl, &jnl->flushing, "journal_close"); - - // - // only write stuff to disk if the journal is still valid - // - if ((jnl->flags & JOURNAL_INVALID) == 0) { - - if (jnl->active_tr) { - /* - * "journal_end_transaction" will fire the flush asynchronously - */ - journal_end_transaction(jnl); - } - - // flush any buffered transactions - if (jnl->cur_tr) { - transaction *tr = jnl->cur_tr; - - jnl->cur_tr = NULL; - /* - * "end_transaction" will wait for any in-progress flush to complete - * before flushing "cur_tr" synchronously("must_wait" == TRUE) - */ - end_transaction(tr, 1, NULL, NULL, FALSE, TRUE); - } - /* - * if there was an "active_tr", make sure we wait for - * it to flush if there was no "cur_tr" to process - */ - wait_condition(jnl, &jnl->flushing, "journal_close"); - - //start = &jnl->jhdr->start; - start = &jnl->active_start; - end = &jnl->jhdr->end; - - while (*start != *end && counter++ < 5000) { - //printf("jnl: close: flushing the buffer cache (start 0x%llx end 0x%llx)\n", *start, *end); - if (jnl->flush) { - jnl->flush(jnl->flush_arg); - } - tsleep((caddr_t)jnl, PRIBIO, "jnl_close", 2); - } - - if (*start != *end) { - printf("jnl: %s: close: buffer flushing didn't seem to flush out all the transactions! (0x%llx - 0x%llx)\n", - jnl->jdev_name, *start, *end); - } - - // make sure this is in sync when we close the journal - jnl->jhdr->start = jnl->active_start; - - // if this fails there's not much we can do at this point... - write_journal_header(jnl, 1, jnl->sequence_num); - } else { - // if we're here the journal isn't valid any more. - // so make sure we don't leave any locked blocks lying around - printf("jnl: %s: close: journal is invalid. 
aborting outstanding transactions\n", jnl->jdev_name); - if (jnl->active_tr || jnl->cur_tr) { - transaction *tr; - - if (jnl->active_tr) { - tr = jnl->active_tr; - jnl->active_tr = NULL; - } else { - tr = jnl->cur_tr; - jnl->cur_tr = NULL; - } - abort_transaction(jnl, tr); - - if (jnl->active_tr || jnl->cur_tr) { - panic("jnl: %s: close: jnl @ %p had both an active and cur tr\n", jnl->jdev_name, jnl); - } - } - } - wait_condition(jnl, &jnl->asyncIO, "journal_close"); - - free_old_stuff(jnl); - - kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, jnl->header_buf_size); - jnl->jhdr = (void *)0xbeefbabe; - - // Release reference on the mount - if (jnl->fsmount) - mount_drop(jnl->fsmount, 0); - - vnode_putname_printable(jnl->jdev_name); - - journal_unlock(jnl); - lck_mtx_destroy(&jnl->old_start_lock, jnl_mutex_group); - lck_mtx_destroy(&jnl->jlock, jnl_mutex_group); - lck_mtx_destroy(&jnl->flock, jnl_mutex_group); - FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL); -} - -static void -dump_journal(journal *jnl) -{ - transaction *ctr; - - printf("journal for dev %s:", jnl->jdev_name); - printf(" jdev_offset %.8llx\n", jnl->jdev_offset); - printf(" magic: 0x%.8x\n", jnl->jhdr->magic); - printf(" start: 0x%.8llx\n", jnl->jhdr->start); - printf(" end: 0x%.8llx\n", jnl->jhdr->end); - printf(" size: 0x%.8llx\n", jnl->jhdr->size); - printf(" blhdr size: %d\n", jnl->jhdr->blhdr_size); - printf(" jhdr size: %d\n", jnl->jhdr->jhdr_size); - printf(" chksum: 0x%.8x\n", jnl->jhdr->checksum); - - printf(" completed transactions:\n"); - for (ctr = jnl->completed_trs; ctr; ctr = ctr->next) { - printf(" 0x%.8llx - 0x%.8llx\n", ctr->journal_start, ctr->journal_end); - } -} - - - -static off_t -free_space(journal *jnl) -{ - off_t free_space_offset; - - if (jnl->jhdr->start < jnl->jhdr->end) { - free_space_offset = jnl->jhdr->size - (jnl->jhdr->end - jnl->jhdr->start) - jnl->jhdr->jhdr_size; - } else if (jnl->jhdr->start > jnl->jhdr->end) { - free_space_offset = jnl->jhdr->start - 
jnl->jhdr->end; - } else { - // journal is completely empty - free_space_offset = jnl->jhdr->size - jnl->jhdr->jhdr_size; - } - - return free_space_offset; -} - - -// -// The journal must be locked on entry to this function. -// The "desired_size" is in bytes. -// -static int -check_free_space(journal *jnl, int desired_size, boolean_t *delayed_header_write, uint32_t sequence_num) -{ - size_t i; - int counter=0; - - //printf("jnl: check free space (desired 0x%x, avail 0x%Lx)\n", - // desired_size, free_space(jnl)); - - if (delayed_header_write) - *delayed_header_write = FALSE; - - while (1) { - int old_start_empty; - - // make sure there's space in the journal to hold this transaction - if (free_space(jnl) > desired_size && jnl->old_start[0] == 0) { - break; - } - if (counter++ == 5000) { - dump_journal(jnl); - panic("jnl: check_free_space: buffer flushing isn't working " - "(jnl @ %p s %lld e %lld f %lld [active start %lld]).\n", jnl, - jnl->jhdr->start, jnl->jhdr->end, free_space(jnl), jnl->active_start); - } - if (counter > 7500) { - printf("jnl: %s: check_free_space: giving up waiting for free space.\n", jnl->jdev_name); - return ENOSPC; - } - - // - // here's where we lazily bump up jnl->jhdr->start. we'll consume - // entries until there is enough space for the next transaction. 
- // - old_start_empty = 1; - lock_oldstart(jnl); - - for (i = 0; i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0]); i++) { - int lcl_counter; - - lcl_counter = 0; - while (jnl->old_start[i] & 0x8000000000000000LL) { - if (lcl_counter++ > 10000) { - panic("jnl: check_free_space: tr starting @ 0x%llx not flushing (jnl %p).\n", - jnl->old_start[i], jnl); - } - - unlock_oldstart(jnl); - if (jnl->flush) { - jnl->flush(jnl->flush_arg); - } - tsleep((caddr_t)jnl, PRIBIO, "check_free_space1", 1); - lock_oldstart(jnl); - } - - if (jnl->old_start[i] == 0) { - continue; - } - - old_start_empty = 0; - jnl->jhdr->start = jnl->old_start[i]; - jnl->old_start[i] = 0; - - if (free_space(jnl) > desired_size) { - - if (delayed_header_write) - *delayed_header_write = TRUE; - else { - unlock_oldstart(jnl); - write_journal_header(jnl, 1, sequence_num); - lock_oldstart(jnl); - } - break; - } - } - unlock_oldstart(jnl); - - // if we bumped the start, loop and try again - if (i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0])) { - continue; - } else if (old_start_empty) { - // - // if there is nothing in old_start anymore then we can - // bump the jhdr->start to be the same as active_start - // since it is possible there was only one very large - // transaction in the old_start array. if we didn't do - // this then jhdr->start would never get updated and we - // would wind up looping until we hit the panic at the - // start of the loop. - // - jnl->jhdr->start = jnl->active_start; - - if (delayed_header_write) - *delayed_header_write = TRUE; - else - write_journal_header(jnl, 1, sequence_num); - continue; - } - - - // if the file system gave us a flush function, call it to so that - // it can flush some blocks which hopefully will cause some transactions - // to complete and thus free up space in the journal. 
- if (jnl->flush) { - jnl->flush(jnl->flush_arg); - } - - // wait for a while to avoid being cpu-bound (this will - // put us to sleep for 10 milliseconds) - tsleep((caddr_t)jnl, PRIBIO, "check_free_space2", 1); - } - - return 0; -} - -/* - * Allocate a new active transaction. - */ -static errno_t -journal_allocate_transaction(journal *jnl) -{ - transaction *tr; - boolean_t was_vm_privileged = FALSE; - kern_return_t retval; - - if (jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) { - /* - * the disk driver can allocate memory on this path... - * if we block waiting for memory, and there is enough pressure to - * cause us to try and create a new swap file, we may end up deadlocking - * due to waiting for the journal on the swap file creation path... - * by making ourselves vm_privileged, we give ourselves the best chance - * of not blocking - */ - was_vm_privileged = set_vm_privilege(TRUE); - } - MALLOC_ZONE(tr, transaction *, sizeof(transaction), M_JNL_TR, M_WAITOK); - memset(tr, 0, sizeof(transaction)); - - tr->tbuffer_size = jnl->tbuffer_size; - - retval = kmem_alloc_kobject(kernel_map, (vm_offset_t *)&tr->tbuffer, tr->tbuffer_size, VM_KERN_MEMORY_FILE); - - if ((jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) && (was_vm_privileged == FALSE)) - set_vm_privilege(FALSE); - - if (retval) { - FREE_ZONE(tr, sizeof(transaction), M_JNL_TR); - jnl->active_tr = NULL; - return ENOMEM; - } - - // journal replay code checksum check depends on this. 
- memset(tr->tbuffer, 0, BLHDR_CHECKSUM_SIZE); - // Fill up the rest of the block with unimportant bytes (0x5a 'Z' chosen for visibility) - memset(tr->tbuffer + BLHDR_CHECKSUM_SIZE, 0x5a, jnl->jhdr->blhdr_size - BLHDR_CHECKSUM_SIZE); - - tr->blhdr = (block_list_header *)tr->tbuffer; - tr->blhdr->max_blocks = (jnl->jhdr->blhdr_size / sizeof(block_info)) - 1; - tr->blhdr->num_blocks = 1; // accounts for this header block - tr->blhdr->bytes_used = jnl->jhdr->blhdr_size; - tr->blhdr->flags = BLHDR_CHECK_CHECKSUMS | BLHDR_FIRST_HEADER; - - tr->sequence_num = ++jnl->sequence_num; - tr->num_blhdrs = 1; - tr->total_bytes = jnl->jhdr->blhdr_size; - tr->jnl = jnl; - - jnl->active_tr = tr; - - return 0; -} - -int -journal_start_transaction(journal *jnl) -{ - int ret; - - CHECK_JOURNAL(jnl); - - free_old_stuff(jnl); - - if (jnl->flags & JOURNAL_INVALID) { - return EINVAL; - } - if (jnl->owner == current_thread()) { - if (jnl->active_tr == NULL) { - panic("jnl: start_tr: active_tr is NULL (jnl @ %p, owner %p, current_thread %p\n", - jnl, jnl->owner, current_thread()); - } - jnl->nested_count++; - return 0; - } - - journal_lock(jnl); - - if (jnl->nested_count != 0 || jnl->active_tr != NULL) { - panic("jnl: start_tr: owner %p, nested count %d, active_tr %p jnl @ %p\n", - jnl->owner, jnl->nested_count, jnl->active_tr, jnl); - } - - jnl->nested_count = 1; - -#if JOE - // make sure there's room in the journal - if (free_space(jnl) < jnl->tbuffer_size) { - - KERNEL_DEBUG(0xbbbbc030 | DBG_FUNC_START, jnl, 0, 0, 0, 0); - - // this is the call that really waits for space to free up - // as well as updating jnl->jhdr->start - if (check_free_space(jnl, jnl->tbuffer_size, NULL, jnl->sequence_num) != 0) { - printf("jnl: %s: start transaction failed: no space\n", jnl->jdev_name); - ret = ENOSPC; - goto bad_start; - } - KERNEL_DEBUG(0xbbbbc030 | DBG_FUNC_END, jnl, 0, 0, 0, 0); - } -#endif - - // if there's a buffered transaction, use it. 
- if (jnl->cur_tr) { - jnl->active_tr = jnl->cur_tr; - jnl->cur_tr = NULL; - - return 0; - } - - ret = journal_allocate_transaction(jnl); - if (ret) { - goto bad_start; - } - - // printf("jnl: start_tr: owner 0x%x new tr @ 0x%x\n", jnl->owner, jnl->active_tr); - - return 0; - -bad_start: - jnl->nested_count = 0; - journal_unlock(jnl); - - return ret; -} - - -int -journal_modify_block_start(journal *jnl, struct buf *bp) -{ - transaction *tr; - boolean_t was_vm_privileged = FALSE; - - CHECK_JOURNAL(jnl); - - - free_old_stuff(jnl); - - if (jnl->flags & JOURNAL_INVALID) { - return EINVAL; - } - - if (jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) { - /* - * if we block waiting for memory, and there is enough pressure to - * cause us to try and create a new swap file, we may end up deadlocking - * due to waiting for the journal on the swap file creation path... - * by making ourselves vm_privileged, we give ourselves the best chance - * of not blocking - */ - was_vm_privileged = set_vm_privilege(TRUE); - } - - // XXXdbg - for debugging I want this to be true. later it may - // not be necessary. - if ((buf_flags(bp) & B_META) == 0) { - panic("jnl: modify_block_start: bp @ %p is not a meta-data block! (jnl %p)\n", bp, jnl); - } - - tr = jnl->active_tr; - CHECK_TRANSACTION(tr); - - if (jnl->owner != current_thread()) { - panic("jnl: modify_block_start: called w/out a transaction! jnl %p, owner %p, curact %p\n", - jnl, jnl->owner, current_thread()); - } - - //printf("jnl: mod block start (bp 0x%x vp 0x%x l/blkno %qd/%qd bsz %d; total bytes %d)\n", - // bp, buf_vnode(bp), buf_lblkno(bp), buf_blkno(bp), buf_size(bp), tr->total_bytes); - - // can't allow blocks that aren't an even multiple of the - // underlying block size. 
- if ((buf_size(bp) % jnl->jhdr->jhdr_size) != 0) { - uint32_t phys_blksz, bad=0; - - if (VNOP_IOCTL(jnl->jdev, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, vfs_context_kernel())) { - bad = 1; - } else if (phys_blksz != (uint32_t)jnl->jhdr->jhdr_size) { - if (phys_blksz < 512) { - panic("jnl: mod block start: phys blksz %d is too small (%d, %d)\n", - phys_blksz, buf_size(bp), jnl->jhdr->jhdr_size); - } - - if ((buf_size(bp) % phys_blksz) != 0) { - bad = 1; - } else if (phys_blksz < (uint32_t)jnl->jhdr->jhdr_size) { - jnl->jhdr->jhdr_size = phys_blksz; - } else { - // the phys_blksz is now larger... need to realloc the jhdr - char *new_header_buf; - - printf("jnl: %s: phys blksz got bigger (was: %d/%d now %d)\n", - jnl->jdev_name, jnl->header_buf_size, jnl->jhdr->jhdr_size, phys_blksz); - if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&new_header_buf, phys_blksz, VM_KERN_MEMORY_FILE)) { - printf("jnl: modify_block_start: %s: create: phys blksz change (was %d, now %d) but could not allocate space for new header\n", - jnl->jdev_name, jnl->jhdr->jhdr_size, phys_blksz); - bad = 1; - } else { - memcpy(new_header_buf, jnl->header_buf, jnl->header_buf_size); - memset(&new_header_buf[jnl->header_buf_size], 0x18, (phys_blksz - jnl->header_buf_size)); - kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, jnl->header_buf_size); - jnl->header_buf = new_header_buf; - jnl->header_buf_size = phys_blksz; - - jnl->jhdr = (journal_header *)jnl->header_buf; - jnl->jhdr->jhdr_size = phys_blksz; - } - } - } else { - bad = 1; - } - - if (bad) { - panic("jnl: mod block start: bufsize %d not a multiple of block size %d\n", - buf_size(bp), jnl->jhdr->jhdr_size); - - if ((jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) && (was_vm_privileged == FALSE)) - set_vm_privilege(FALSE); - return -1; - } - } - - // make sure that this transaction isn't bigger than the whole journal - if (tr->total_bytes+buf_size(bp) >= (jnl->jhdr->size - jnl->jhdr->jhdr_size)) { - panic("jnl: transaction too big (%d 
>= %lld bytes, bufsize %d, tr %p bp %p)\n", - tr->total_bytes, (tr->jnl->jhdr->size - jnl->jhdr->jhdr_size), buf_size(bp), tr, bp); - - if ((jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) && (was_vm_privileged == FALSE)) - set_vm_privilege(FALSE); - return -1; - } - - // if the block is dirty and not already locked we have to write - // it out before we muck with it because it has data that belongs - // (presumably) to another transaction. - // - if ((buf_flags(bp) & (B_DELWRI | B_LOCKED)) == B_DELWRI) { - - if (buf_flags(bp) & B_ASYNC) { - panic("modify_block_start: bp @ %p has async flag set!\n", bp); - } - if (bp->b_shadow_ref) - panic("modify_block_start: dirty bp @ %p has shadows!\n", bp); - - // this will cause it to not be buf_brelse()'d - buf_setflags(bp, B_NORELSE); - VNOP_BWRITE(bp); - } - buf_setflags(bp, B_LOCKED); - - if ((jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) && (was_vm_privileged == FALSE)) - set_vm_privilege(FALSE); - - return 0; -} - -int -journal_modify_block_abort(journal *jnl, struct buf *bp) -{ - transaction *tr; - block_list_header *blhdr; - int i; - - CHECK_JOURNAL(jnl); - - free_old_stuff(jnl); - - tr = jnl->active_tr; - - // - // if there's no active transaction then we just want to - // call buf_brelse() and return since this is just a block - // that happened to be modified as part of another tr. - // - if (tr == NULL) { - buf_brelse(bp); - return 0; - } - - if (jnl->flags & JOURNAL_INVALID) { - /* Still need to buf_brelse(). Callers assume we consume the bp. */ - buf_brelse(bp); - return EINVAL; - } - - CHECK_TRANSACTION(tr); - - if (jnl->owner != current_thread()) { - panic("jnl: modify_block_abort: called w/out a transaction! 
jnl %p, owner %p, curact %p\n", - jnl, jnl->owner, current_thread()); - } - - // printf("jnl: modify_block_abort: tr 0x%x bp 0x%x\n", jnl->active_tr, bp); - - // first check if it's already part of this transaction - for (blhdr = tr->blhdr; blhdr; blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum)) { - for (i = 1; i < blhdr->num_blocks; i++) { - if (bp == blhdr->binfo[i].u.bp) { - break; - } - } - - if (i < blhdr->num_blocks) { - break; - } - } - - // - // if blhdr is null, then this block has only had modify_block_start - // called on it as part of the current transaction. that means that - // it is ok to clear the LOCKED bit since it hasn't actually been - // modified. if blhdr is non-null then modify_block_end was called - // on it and so we need to keep it locked in memory. - // - if (blhdr == NULL) { - buf_clearflags(bp, B_LOCKED); - } - - buf_brelse(bp); - return 0; -} - - -int -journal_modify_block_end(journal *jnl, struct buf *bp, void (*func)(buf_t bp, void *arg), void *arg) -{ - int i = 1; - int tbuffer_offset=0; - block_list_header *blhdr, *prev=NULL; - transaction *tr; - - CHECK_JOURNAL(jnl); - - free_old_stuff(jnl); - - if (jnl->flags & JOURNAL_INVALID) { - /* Still need to buf_brelse(). Callers assume we consume the bp. */ - buf_brelse(bp); - return EINVAL; - } - - tr = jnl->active_tr; - CHECK_TRANSACTION(tr); - - if (jnl->owner != current_thread()) { - panic("jnl: modify_block_end: called w/out a transaction! jnl %p, owner %p, curact %p\n", - jnl, jnl->owner, current_thread()); - } - - //printf("jnl: mod block end: (bp 0x%x vp 0x%x l/blkno %qd/%qd bsz %d, total bytes %d)\n", - // bp, buf_vnode(bp), buf_lblkno(bp), buf_blkno(bp), buf_size(bp), tr->total_bytes); - - if ((buf_flags(bp) & B_LOCKED) == 0) { - panic("jnl: modify_block_end: bp %p not locked! 
jnl @ %p\n", bp, jnl); - } - - // first check if it's already part of this transaction - for (blhdr = tr->blhdr; blhdr; prev = blhdr, blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum)) { - tbuffer_offset = jnl->jhdr->blhdr_size; - - for (i = 1; i < blhdr->num_blocks; i++) { - if (bp == blhdr->binfo[i].u.bp) { - break; - } - if (blhdr->binfo[i].bnum != (off_t)-1) { - tbuffer_offset += buf_size(blhdr->binfo[i].u.bp); - } else { - tbuffer_offset += blhdr->binfo[i].u.bi.bsize; - } - } - - if (i < blhdr->num_blocks) { - break; - } - } - - if (blhdr == NULL - && prev - && (prev->num_blocks+1) <= prev->max_blocks - && (prev->bytes_used+buf_size(bp)) <= (uint32_t)tr->tbuffer_size) { - blhdr = prev; - - } else if (blhdr == NULL) { - block_list_header *nblhdr; - if (prev == NULL) { - panic("jnl: modify block end: no way man, prev == NULL?!?, jnl %p, bp %p\n", jnl, bp); - } - - // we got to the end of the list, didn't find the block and there's - // no room in the block_list_header pointed to by prev - - // we allocate another tbuffer and link it in at the end of the list - // through prev->binfo[0].bnum. that's a skanky way to do things but - // avoids having yet another linked list of small data structures to manage. - - if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&nblhdr, tr->tbuffer_size, VM_KERN_MEMORY_FILE)) { - panic("jnl: end_tr: no space for new block tr @ %p (total bytes: %d)!\n", - tr, tr->total_bytes); - } - - // journal replay code checksum check depends on this. 
- memset(nblhdr, 0, BLHDR_CHECKSUM_SIZE); - // Fill up the rest of the block with unimportant bytes - memset(nblhdr + BLHDR_CHECKSUM_SIZE, 0x5a, jnl->jhdr->blhdr_size - BLHDR_CHECKSUM_SIZE); - - // initialize the new guy - nblhdr->max_blocks = (jnl->jhdr->blhdr_size / sizeof(block_info)) - 1; - nblhdr->num_blocks = 1; // accounts for this header block - nblhdr->bytes_used = jnl->jhdr->blhdr_size; - nblhdr->flags = BLHDR_CHECK_CHECKSUMS; - - tr->num_blhdrs++; - tr->total_bytes += jnl->jhdr->blhdr_size; - - // then link him in at the end - prev->binfo[0].bnum = (off_t)((long)nblhdr); - - // and finally switch to using the new guy - blhdr = nblhdr; - tbuffer_offset = jnl->jhdr->blhdr_size; - i = 1; - } - - - if ((i+1) > blhdr->max_blocks) { - panic("jnl: modify_block_end: i = %d, max_blocks %d\n", i, blhdr->max_blocks); - } - - // if this is true then this is a new block we haven't seen - if (i >= blhdr->num_blocks) { - int bsize; - vnode_t vp; - - vp = buf_vnode(bp); - if (vnode_ref(vp)) { - // Nobody checks the return values, so... 
- jnl->flags |= JOURNAL_INVALID; - - buf_brelse(bp); - - // We're probably here due to a force unmount, so EIO is appropriate - return EIO; - } - - bsize = buf_size(bp); - - blhdr->binfo[i].bnum = (off_t)(buf_blkno(bp)); - blhdr->binfo[i].u.bp = bp; - - task_update_logical_writes(current_task(), (2 * bsize), TASK_WRITE_METADATA); - KERNEL_DEBUG_CONSTANT(0x3018004, VM_KERNEL_ADDRPERM(vp), blhdr->binfo[i].bnum, bsize, 0, 0); - - if (func) { - void (*old_func)(buf_t, void *)=NULL, *old_arg=NULL; - - buf_setfilter(bp, func, arg, &old_func, &old_arg); - if (old_func != NULL && old_func != func) { - panic("jnl: modify_block_end: old func %p / arg %p (func %p)", old_func, old_arg, func); - } - } - - blhdr->bytes_used += bsize; - tr->total_bytes += bsize; - - blhdr->num_blocks++; - } - buf_bdwrite(bp); - - return 0; -} - -int -journal_kill_block(journal *jnl, struct buf *bp) -{ - int i; - int bflags; - block_list_header *blhdr; - transaction *tr; - - CHECK_JOURNAL(jnl); - - free_old_stuff(jnl); - - if (jnl->flags & JOURNAL_INVALID) { - buf_brelse(bp); - return 0; - } - - tr = jnl->active_tr; - CHECK_TRANSACTION(tr); - - if (jnl->owner != current_thread()) { - panic("jnl: modify_block_end: called w/out a transaction! jnl %p, owner %p, curact %p\n", - jnl, jnl->owner, current_thread()); - } - - bflags = buf_flags(bp); - - if ( !(bflags & B_LOCKED)) - panic("jnl: modify_block_end: called with bp not B_LOCKED"); - - /* - * bp must be BL_BUSY and B_LOCKED - * first check if it's already part of this transaction - */ - for (blhdr = tr->blhdr; blhdr; blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum)) { - - for (i = 1; i < blhdr->num_blocks; i++) { - if (bp == blhdr->binfo[i].u.bp) { - vnode_t vp; - - buf_clearflags(bp, B_LOCKED); - - // this undoes the vnode_ref() in journal_modify_block_end() - vp = buf_vnode(bp); - vnode_rele_ext(vp, 0, 1); - - // if the block has the DELWRI and FILTER bits sets, then - // things are seriously weird. 
if it was part of another - // transaction then journal_modify_block_start() should - // have force it to be written. - // - //if ((bflags & B_DELWRI) && (bflags & B_FILTER)) { - // panic("jnl: kill block: this defies all logic! bp 0x%x\n", bp); - //} else { - tr->num_killed += buf_size(bp); - //} - blhdr->binfo[i].bnum = (off_t)-1; - blhdr->binfo[i].u.bp = NULL; - blhdr->binfo[i].u.bi.bsize = buf_size(bp); - - buf_markinvalid(bp); - buf_brelse(bp); - - return 0; - } - } - } - - /* - * We did not find the block in any transaction buffer but we still - * need to release it or else it will be left locked forever. - */ - buf_brelse(bp); - - return 0; -} - -/* -;________________________________________________________________________________ -; -; Routine: journal_trim_set_callback -; -; Function: Provide the journal with a routine to be called back when a -; TRIM has (or would have) been issued to the device. That -; is, the transaction has been flushed to the device, and the -; blocks freed by the transaction are now safe for reuse. -; -; CAUTION: If the journal becomes invalid (eg., due to an I/O -; error when trying to write to the journal), this callback -; will stop getting called, even if extents got freed before -; the journal became invalid! -; -; Input Arguments: -; jnl - The journal structure for the filesystem. -; callback - The function to call when the TRIM is complete. -; arg - An argument to be passed to callback. -;________________________________________________________________________________ -*/ -__private_extern__ void -journal_trim_set_callback(journal *jnl, jnl_trim_callback_t callback, void *arg) -{ - jnl->trim_callback = callback; - jnl->trim_callback_arg = arg; -} - - -/* -;________________________________________________________________________________ -; -; Routine: journal_trim_realloc -; -; Function: Increase the amount of memory allocated for the list of extents -; to be unmapped (trimmed). 
This routine will be called when -; adding an extent to the list, and the list already occupies -; all of the space allocated to it. This routine returns ENOMEM -; if unable to allocate more space, or 0 if the extent list was -; grown successfully. -; -; Input Arguments: -; trim - The trim list to be resized. -; -; Output: -; (result) - ENOMEM or 0. -; -; Side effects: -; The allocated_count and extents fields of tr->trim are updated -; if the function returned 0. -;________________________________________________________________________________ -*/ -static int -trim_realloc(journal *jnl, struct jnl_trim_list *trim) -{ - void *new_extents; - uint32_t new_allocated_count; - boolean_t was_vm_privileged = FALSE; - - if (jnl_kdebug) - KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REALLOC | DBG_FUNC_START, VM_KERNEL_ADDRPERM(trim), 0, trim->allocated_count, trim->extent_count, 0); - - new_allocated_count = trim->allocated_count + JOURNAL_DEFAULT_TRIM_EXTENTS; - - if (jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) { - /* - * if we block waiting for memory, and there is enough pressure to - * cause us to try and create a new swap file, we may end up deadlocking - * due to waiting for the journal on the swap file creation path... - * by making ourselves vm_privileged, we give ourselves the best chance - * of not blocking - */ - was_vm_privileged = set_vm_privilege(TRUE); - } - new_extents = kalloc(new_allocated_count * sizeof(dk_extent_t)); - if ((jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) && (was_vm_privileged == FALSE)) - set_vm_privilege(FALSE); - - if (new_extents == NULL) { - printf("jnl: trim_realloc: unable to grow extent list!\n"); - /* - * Since we could be called when allocating space previously marked - * to be trimmed, we need to empty out the list to be safe. 
- */ - trim->extent_count = 0; - if (jnl_kdebug) - KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REALLOC | DBG_FUNC_END, ENOMEM, 0, trim->allocated_count, 0, 0); - return ENOMEM; - } - - /* Copy the old extent list to the newly allocated list. */ - if (trim->extents != NULL) { - memmove(new_extents, - trim->extents, - trim->allocated_count * sizeof(dk_extent_t)); - kfree(trim->extents, - trim->allocated_count * sizeof(dk_extent_t)); - } - - trim->allocated_count = new_allocated_count; - trim->extents = new_extents; - - if (jnl_kdebug) - KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REALLOC | DBG_FUNC_END, 0, 0, new_allocated_count, trim->extent_count, 0); - - return 0; -} - -/* - ;________________________________________________________________________________ - ; - ; Routine: trim_search_extent - ; - ; Function: Search the given extent list to see if any of its extents - ; overlap the given extent. - ; - ; Input Arguments: - ; trim - The trim list to be searched. - ; offset - The first byte of the range to be searched for. - ; length - The number of bytes of the extent being searched for. - ; overlap_start - start of the overlapping extent - ; overlap_len - length of the overlapping extent - ; - ; Output: - ; (result) - TRUE if one or more extents overlap, FALSE otherwise. - ;________________________________________________________________________________ - */ -static int -trim_search_extent(struct jnl_trim_list *trim, uint64_t offset, - uint64_t length, uint64_t *overlap_start, uint64_t *overlap_len) -{ - uint64_t end = offset + length; - uint32_t lower = 0; /* Lowest index to search */ - uint32_t upper = trim->extent_count; /* Highest index to search + 1 */ - uint32_t middle; - - /* A binary search over the extent list. 
*/ - while (lower < upper) { - middle = (lower + upper) / 2; - - if (trim->extents[middle].offset >= end) - upper = middle; - else if (trim->extents[middle].offset + trim->extents[middle].length <= offset) - lower = middle + 1; - else { - if (overlap_start) { - *overlap_start = trim->extents[middle].offset; - } - if (overlap_len) { - *overlap_len = trim->extents[middle].length; - } - return TRUE; - } - } - - return FALSE; -} - - -/* -;________________________________________________________________________________ -; -; Routine: journal_trim_add_extent -; -; Function: Keep track of extents that have been freed as part of this -; transaction. If the underlying device supports TRIM (UNMAP), -; then those extents will be trimmed/unmapped once the -; transaction has been written to the journal. (For example, -; SSDs can support trim/unmap and avoid having to recopy those -; blocks when doing wear leveling, and may reuse the same -; phsyical blocks for different logical blocks.) -; -; HFS also uses this, in combination with journal_trim_set_callback, -; to add recently freed extents to its free extent cache, but -; only after the transaction that freed them is committed to -; disk. (This reduces the chance of overwriting live data in -; a way that causes data loss if a transaction never gets -; written to the journal.) -; -; Input Arguments: -; jnl - The journal for the volume containing the byte range. -; offset - The first byte of the range to be trimmed. -; length - The number of bytes of the extent being trimmed. -;________________________________________________________________________________ -*/ -__private_extern__ int -journal_trim_add_extent(journal *jnl, uint64_t offset, uint64_t length) -{ - uint64_t end; - transaction *tr; - dk_extent_t *extent; - uint32_t insert_index; - uint32_t replace_count; - - CHECK_JOURNAL(jnl); - - /* TODO: Is it OK to manipulate the trim list even if JOURNAL_INVALID is set? I think so... 
*/ - if (jnl->flags & JOURNAL_INVALID) { - return EINVAL; - } - - tr = jnl->active_tr; - CHECK_TRANSACTION(tr); - - if (jnl_kdebug) - KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_ADD | DBG_FUNC_START, VM_KERNEL_ADDRPERM(jnl), offset, length, tr->trim.extent_count, 0); - - if (jnl->owner != current_thread()) { - panic("jnl: trim_add_extent: called w/out a transaction! jnl %p, owner %p, curact %p\n", - jnl, jnl->owner, current_thread()); - } - - free_old_stuff(jnl); - - end = offset + length; - - /* - * Find the range of existing extents that can be combined with the - * input extent. We start by counting the number of extents that end - * strictly before the input extent, then count the number of extents - * that overlap or are contiguous with the input extent. - */ - extent = tr->trim.extents; - insert_index = 0; - while (insert_index < tr->trim.extent_count && extent->offset + extent->length < offset) { - ++insert_index; - ++extent; - } - replace_count = 0; - while (insert_index + replace_count < tr->trim.extent_count && extent->offset <= end) { - ++replace_count; - ++extent; - } - - /* - * If none of the existing extents can be combined with the input extent, - * then just insert it in the list (before item number insert_index). - */ - if (replace_count == 0) { - /* If the list was already full, we need to grow it. */ - if (tr->trim.extent_count == tr->trim.allocated_count) { - if (trim_realloc(jnl, &tr->trim) != 0) { - printf("jnl: trim_add_extent: out of memory!"); - if (jnl_kdebug) - KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_ADD | DBG_FUNC_END, ENOMEM, 0, 0, tr->trim.extent_count, 0); - return ENOMEM; - } - } - - /* Shift any existing extents with larger offsets. */ - if (insert_index < tr->trim.extent_count) { - memmove(&tr->trim.extents[insert_index+1], - &tr->trim.extents[insert_index], - (tr->trim.extent_count - insert_index) * sizeof(dk_extent_t)); - } - tr->trim.extent_count++; - - /* Store the new extent in the list. 
*/ - tr->trim.extents[insert_index].offset = offset; - tr->trim.extents[insert_index].length = length; - - /* We're done. */ - if (jnl_kdebug) - KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_ADD | DBG_FUNC_END, 0, 0, 0, tr->trim.extent_count, 0); - return 0; - } - - /* - * Update extent number insert_index to be the union of the input extent - * and all of the replaced extents. - */ - if (tr->trim.extents[insert_index].offset < offset) - offset = tr->trim.extents[insert_index].offset; - extent = &tr->trim.extents[insert_index + replace_count - 1]; - if (extent->offset + extent->length > end) - end = extent->offset + extent->length; - tr->trim.extents[insert_index].offset = offset; - tr->trim.extents[insert_index].length = end - offset; - - /* - * If we were replacing more than one existing extent, then shift any - * extents with larger offsets, and update the count of extents. - * - * We're going to leave extent #insert_index alone since it was just updated, above. - * We need to move extents from index (insert_index + replace_count) through the end of - * the list by (replace_count - 1) positions so that they overwrite extent #(insert_index + 1). - */ - if (replace_count > 1 && (insert_index + replace_count) < tr->trim.extent_count) { - memmove(&tr->trim.extents[insert_index + 1], - &tr->trim.extents[insert_index + replace_count], - (tr->trim.extent_count - insert_index - replace_count) * sizeof(dk_extent_t)); - } - tr->trim.extent_count -= replace_count - 1; - - if (jnl_kdebug) - KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_ADD | DBG_FUNC_END, 0, 0, 0, tr->trim.extent_count, 0); - return 0; -} - -/* - * journal_trim_extent_overlap - * - * Return 1 if there are any pending TRIMs that overlap with the given offset and length - * Return 0 otherwise. 
- */ - -int journal_trim_extent_overlap (journal *jnl, uint64_t offset, uint64_t length, uint64_t *end) { - transaction *tr = NULL; - int overlap = 0; - - uint64_t overlap_start; - uint64_t overlap_len; - tr = jnl->active_tr; - CHECK_TRANSACTION(tr); - - /* - * There are two lists that need to be examined for potential overlaps: - * - * The first is the current transaction. Since this function requires that - * a transaction be active when this is called, this is the "active_tr" - * pointer in the journal struct. This has a trimlist pointer which needs - * to be searched. - */ - overlap = trim_search_extent (&tr->trim, offset, length, &overlap_start, &overlap_len); - if (overlap == 0) { - /* - * The second is the async trim list, which is only done if the current - * transaction group (active transaction) did not overlap with our target - * extent. This async trim list is the set of all previously - * committed transaction groups whose I/Os are now in-flight. We need to hold the - * trim lock in order to search this list. If we grab the list before the - * TRIM has completed, then we will compare it. If it is grabbed AFTER the - * TRIM has completed, then the pointer will be zeroed out and we won't have - * to check anything. - */ - lck_rw_lock_shared (&jnl->trim_lock); - if (jnl->async_trim != NULL) { - overlap = trim_search_extent(jnl->async_trim, offset, length, &overlap_start, &overlap_len); - } - lck_rw_unlock_shared (&jnl->trim_lock); - } - - if (overlap) { - /* compute the end (min) of the overlapping range */ - if ( (overlap_start + overlap_len) < (offset + length)) { - *end = (overlap_start + overlap_len); - } - else { - *end = (offset + length); - } - } - - - return overlap; -} - -/* - * journal_request_immediate_flush - * - * FS requests that the journal flush immediately upon the - * active transaction's completion. 
- * - * Returns 0 if operation succeeds - * Returns EPERM if we failed to leave hint - */ -int -journal_request_immediate_flush (journal *jnl) { - - transaction *tr = NULL; - /* - * Is a transaction still in process? You must do - * this while there are txns open - */ - tr = jnl->active_tr; - if (tr != NULL) { - CHECK_TRANSACTION(tr); - tr->flush_on_completion = TRUE; - } - else { - return EPERM; - } - return 0; -} - - - -/* -;________________________________________________________________________________ -; -; Routine: trim_remove_extent -; -; Function: Indicate that a range of bytes, some of which may have previously -; been passed to journal_trim_add_extent, is now allocated. -; Any overlapping ranges currently in the journal's trim list will -; be removed. If the underlying device supports TRIM (UNMAP), then -; these extents will not be trimmed/unmapped when the transaction -; is written to the journal. -; -; HFS also uses this to prevent newly allocated space from being -; added to its free extent cache (if some portion of the newly -; allocated space was recently freed). -; -; Input Arguments: -; trim - The trim list to update. -; offset - The first byte of the range to be trimmed. -; length - The number of bytes of the extent being trimmed. -;________________________________________________________________________________ -*/ -static int -trim_remove_extent(journal *jnl, struct jnl_trim_list *trim, uint64_t offset, uint64_t length) -{ - u_int64_t end; - dk_extent_t *extent; - u_int32_t keep_before; - u_int32_t keep_after; - - end = offset + length; - - /* - * Find any existing extents that start before or end after the input - * extent. These extents will be modified if they overlap the input - * extent. Other extents between them will be deleted. 
- */ - extent = trim->extents; - keep_before = 0; - while (keep_before < trim->extent_count && extent->offset < offset) { - ++keep_before; - ++extent; - } - keep_after = keep_before; - if (keep_after > 0) { - /* See if previous extent extends beyond both ends of input extent. */ - --keep_after; - --extent; - } - while (keep_after < trim->extent_count && (extent->offset + extent->length) <= end) { - ++keep_after; - ++extent; - } - - /* - * When we get here, the first keep_before extents (0 .. keep_before-1) - * start before the input extent, and extents (keep_after .. extent_count-1) - * end after the input extent. We'll need to keep, all of those extents, - * but possibly modify #(keep_before-1) and #keep_after to remove the portion - * that overlaps with the input extent. - */ - - /* - * Does the input extent start after and end before the same existing - * extent? If so, we have to "punch a hole" in that extent and convert - * it to two separate extents. - */ - if (keep_before > keep_after) { - /* If the list was already full, we need to grow it. */ - if (trim->extent_count == trim->allocated_count) { - if (trim_realloc(jnl, trim) != 0) { - printf("jnl: trim_remove_extent: out of memory!"); - return ENOMEM; - } - } - - /* - * Make room for a new extent by shifting extents #keep_after and later - * down by one extent. When we're done, extents #keep_before and - * #keep_after will be identical, and we can fall through to removing - * the portion that overlaps the input extent. - */ - memmove(&trim->extents[keep_before], - &trim->extents[keep_after], - (trim->extent_count - keep_after) * sizeof(dk_extent_t)); - ++trim->extent_count; - ++keep_after; - - /* - * Fall through. We now have the case where the length of extent - * #(keep_before - 1) needs to be updated, and the start of extent - * #(keep_after) needs to be updated. - */ - } - - /* - * May need to truncate the end of extent #(keep_before - 1) if it overlaps - * the input extent. 
- */ - if (keep_before > 0) { - extent = &trim->extents[keep_before - 1]; - if (extent->offset + extent->length > offset) { - extent->length = offset - extent->offset; - } - } - - /* - * May need to update the start of extent #(keep_after) if it overlaps the - * input extent. - */ - if (keep_after < trim->extent_count) { - extent = &trim->extents[keep_after]; - if (extent->offset < end) { - extent->length = extent->offset + extent->length - end; - extent->offset = end; - } - } - - /* - * If there were whole extents that overlapped the input extent, get rid - * of them by shifting any following extents, and updating the count. - */ - if (keep_after > keep_before && keep_after < trim->extent_count) { - memmove(&trim->extents[keep_before], - &trim->extents[keep_after], - (trim->extent_count - keep_after) * sizeof(dk_extent_t)); - } - trim->extent_count -= keep_after - keep_before; - - return 0; -} - -/* - ;________________________________________________________________________________ - ; - ; Routine: journal_trim_remove_extent - ; - ; Function: Make note of a range of bytes, some of which may have previously - ; been passed to journal_trim_add_extent, is now in use on the - ; volume. The given bytes will be not be trimmed as part of - ; this transaction, or a pending trim of a transaction being - ; asynchronously flushed. - ; - ; Input Arguments: - ; jnl - The journal for the volume containing the byte range. - ; offset - The first byte of the range to be trimmed. - ; length - The number of bytes of the extent being trimmed. - ;________________________________________________________________________________ - */ -__private_extern__ int -journal_trim_remove_extent(journal *jnl, uint64_t offset, uint64_t length) -{ - int error = 0; - transaction *tr; - - CHECK_JOURNAL(jnl); - - /* TODO: Is it OK to manipulate the trim list even if JOURNAL_INVALID is set? I think so... 
*/ - if (jnl->flags & JOURNAL_INVALID) { - return EINVAL; - } - - tr = jnl->active_tr; - CHECK_TRANSACTION(tr); - - if (jnl_kdebug) - KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REMOVE | DBG_FUNC_START, VM_KERNEL_ADDRPERM(jnl), offset, length, tr->trim.extent_count, 0); - - if (jnl->owner != current_thread()) { - panic("jnl: trim_remove_extent: called w/out a transaction! jnl %p, owner %p, curact %p\n", - jnl, jnl->owner, current_thread()); - } - - free_old_stuff(jnl); - - error = trim_remove_extent(jnl, &tr->trim, offset, length); - if (error == 0) { - int found = FALSE; - - /* - * See if a pending trim has any extents that overlap with the - * one we were given. - */ - lck_rw_lock_shared(&jnl->trim_lock); - if (jnl->async_trim != NULL) - found = trim_search_extent(jnl->async_trim, offset, length, NULL, NULL); - lck_rw_unlock_shared(&jnl->trim_lock); - - if (found) { - /* - * There was an overlap, so avoid trimming the extent we - * just allocated. (Otherwise, it might get trimmed after - * we've written to it, which will cause that data to be - * corrupted.) 
- */ - uint32_t async_extent_count = 0; - - if (jnl_kdebug) - KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REMOVE_PENDING | DBG_FUNC_START, VM_KERNEL_ADDRPERM(jnl), offset, length, 0, 0); - lck_rw_lock_exclusive(&jnl->trim_lock); - if (jnl->async_trim != NULL) { - error = trim_remove_extent(jnl, jnl->async_trim, offset, length); - async_extent_count = jnl->async_trim->extent_count; - } - lck_rw_unlock_exclusive(&jnl->trim_lock); - if (jnl_kdebug) - KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REMOVE_PENDING | DBG_FUNC_END, error, 0, 0, async_extent_count, 0); - } - } - - if (jnl_kdebug) - KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REMOVE | DBG_FUNC_END, error, 0, 0, tr->trim.extent_count, 0); - return error; -} - - -static int -journal_trim_flush(journal *jnl, transaction *tr) -{ - int errno = 0; - boolean_t was_vm_privileged = FALSE; - - if (jnl_kdebug) - KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_FLUSH | DBG_FUNC_START, VM_KERNEL_ADDRPERM(jnl), tr, 0, tr->trim.extent_count, 0); - - if (jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) { - /* - * the disk driver can allocate memory on this path... - * if we block waiting for memory, and there is enough pressure to - * cause us to try and create a new swap file, we may end up deadlocking - * due to waiting for the journal on the swap file creation path... 
- * by making ourselves vm_privileged, we give ourselves the best chance - * of not blocking - */ - was_vm_privileged = set_vm_privilege(TRUE); - } - lck_rw_lock_shared(&jnl->trim_lock); - if (tr->trim.extent_count > 0) { - dk_unmap_t unmap; - - bzero(&unmap, sizeof(unmap)); - if (CONFIG_HFS_TRIM && (jnl->flags & JOURNAL_USE_UNMAP)) { - unmap.extents = tr->trim.extents; - unmap.extentsCount = tr->trim.extent_count; - if (jnl_kdebug) - KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_UNMAP | DBG_FUNC_START, VM_KERNEL_ADDRPERM(jnl), tr, 0, tr->trim.extent_count, 0); - errno = VNOP_IOCTL(jnl->fsdev, DKIOCUNMAP, (caddr_t)&unmap, FWRITE, vfs_context_kernel()); - if (jnl_kdebug) - KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_UNMAP | DBG_FUNC_END, errno, 0, 0, 0, 0); - } - - /* - * Call back into the file system to tell them that we have - * trimmed some extents and that they can now be reused. - * - * CAUTION: If the journal becomes invalid (eg., due to an I/O - * error when trying to write to the journal), this callback - * will stop getting called, even if extents got freed before - * the journal became invalid! - */ - if (jnl->trim_callback) - jnl->trim_callback(jnl->trim_callback_arg, tr->trim.extent_count, tr->trim.extents); - } - lck_rw_unlock_shared(&jnl->trim_lock); - - if ((jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) && (was_vm_privileged == FALSE)) - set_vm_privilege(FALSE); - /* - * If the transaction we're flushing was the async transaction, then - * tell the current transaction that there is no pending trim - * any more. - * - * NOTE: Since we released the lock, another thread could have - * removed one or more extents from our list. That's not a - * problem since any writes to the re-allocated blocks - * would get sent to the device after the DKIOCUNMAP. 
- */ - lck_rw_lock_exclusive(&jnl->trim_lock); - if (jnl->async_trim == &tr->trim) - jnl->async_trim = NULL; - lck_rw_unlock_exclusive(&jnl->trim_lock); - - /* - * By the time we get here, no other thread can discover the address - * of "tr", so it is safe for us to manipulate tr->trim without - * holding any locks. - */ - if (tr->trim.extents) { - kfree(tr->trim.extents, tr->trim.allocated_count * sizeof(dk_extent_t)); - tr->trim.allocated_count = 0; - tr->trim.extent_count = 0; - tr->trim.extents = NULL; - } - - if (jnl_kdebug) - KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_FLUSH | DBG_FUNC_END, errno, 0, 0, 0, 0); - - return errno; -} - -static int -journal_binfo_cmp(const void *a, const void *b) -{ - const block_info *bi_a = (const struct block_info *)a; - const block_info *bi_b = (const struct block_info *)b; - daddr64_t res; - - if (bi_a->bnum == (off_t)-1) { - return 1; - } - if (bi_b->bnum == (off_t)-1) { - return -1; - } - - // don't have to worry about negative block - // numbers so this is ok to do. - // - res = (buf_blkno(bi_a->u.bp) - buf_blkno(bi_b->u.bp)); - - return (int)res; -} - - -/* - * End a transaction. If the transaction is small enough, and we're not forcing - * a write to disk, the "active" transaction becomes the "current" transaction, - * and will be reused for the next transaction that is started (group commit). - * - * If the transaction gets written to disk (because force_it is true, or no - * group commit, or the transaction is sufficiently full), the blocks get - * written into the journal first, then the are written asynchronously. When - * those async writes complete, the transaction can be freed and removed from - * the journal. - * - * An optional callback can be supplied. If given, it is called after the - * the blocks have been written to the journal, but before the async writes - * of those blocks to their normal on-disk locations. 
This is used by - * journal_relocate so that the location of the journal can be changed and - * flushed to disk before the blocks get written to their normal locations. - * Note that the callback is only called if the transaction gets written to - * the journal during this end_transaction call; you probably want to set the - * force_it flag. - * - * Inputs: - * tr Transaction to add to the journal - * force_it If true, force this transaction to the on-disk journal immediately. - * callback See description above. Pass NULL for no callback. - * callback_arg Argument passed to callback routine. - * - * Result - * 0 No errors - * -1 An error occurred. The journal is marked invalid. - */ -static int -end_transaction(transaction *tr, int force_it, errno_t (*callback)(void*), void *callback_arg, boolean_t drop_lock, boolean_t must_wait) -{ - block_list_header *blhdr=NULL, *next=NULL; - int i, ret_val = 0; - errno_t errno; - journal *jnl = tr->jnl; - struct buf *bp; - size_t tbuffer_offset; - boolean_t drop_lock_early; - - if (jnl->cur_tr) { - panic("jnl: jnl @ %p already has cur_tr %p, new tr: %p\n", - jnl, jnl->cur_tr, tr); - } - - // if there weren't any modified blocks in the transaction - // just save off the transaction pointer and return. - if (tr->total_bytes == jnl->jhdr->blhdr_size) { - jnl->cur_tr = tr; - goto done; - } - - // if our transaction buffer isn't very full, just hang - // on to it and don't actually flush anything. this is - // what is known as "group commit". we will flush the - // transaction buffer if it's full or if we have more than - // one of them so we don't start hogging too much memory. - // - // We also check the device supports UNMAP/TRIM, and if so, - // the number of extents waiting to be trimmed. If it is - // small enough, then keep accumulating more (so we can - // reduce the overhead of trimming). If there was a prior - // trim error, then we stop issuing trims for this - // volume, so we can also coalesce transactions. 
- // - if ( force_it == 0 - && (jnl->flags & JOURNAL_NO_GROUP_COMMIT) == 0 - && tr->num_blhdrs < 3 - && (tr->total_bytes <= ((tr->tbuffer_size*tr->num_blhdrs) - tr->tbuffer_size/8)) - && (!(jnl->flags & JOURNAL_USE_UNMAP) || (tr->trim.extent_count < jnl_trim_flush_limit))) { - - jnl->cur_tr = tr; - goto done; - } - - KERNEL_DEBUG(0xbbbbc018|DBG_FUNC_START, jnl, tr, drop_lock, must_wait, 0); - - lock_condition(jnl, &jnl->flushing, "end_transaction"); - - /* - * if the previous 'finish_end_transaction' was being run - * asynchronously, it could have encountered a condition - * that caused it to mark the journal invalid... if that - * occurred while we were waiting for it to finish, we - * need to notice and abort the current transaction - */ - if ((jnl->flags & JOURNAL_INVALID) || jnl->flush_aborted == TRUE) { - unlock_condition(jnl, &jnl->flushing); - - abort_transaction(jnl, tr); - ret_val = -1; - KERNEL_DEBUG(0xbbbbc018|DBG_FUNC_END, jnl, tr, ret_val, 0, 0); - goto done; - } - - /* - * Store a pointer to this transaction's trim list so that - * future transactions can find it. - * - * Note: if there are no extents in the trim list, then don't - * bother saving the pointer since nothing can add new extents - * to the list (and other threads/transactions only care if - * there is a trim pending). - */ - lck_rw_lock_exclusive(&jnl->trim_lock); - if (jnl->async_trim != NULL) - panic("jnl: end_transaction: async_trim already non-NULL!"); - if (tr->trim.extent_count > 0) - jnl->async_trim = &tr->trim; - lck_rw_unlock_exclusive(&jnl->trim_lock); - - /* - * snapshot the transaction sequence number while we are still behind - * the journal lock since it will be bumped upon the start of the - * next transaction group which may overlap the current journal flush... - * we pass the snapshot into write_journal_header during the journal - * flush so that it can write the correct version in the header... 
- * because we hold the 'flushing' condition variable for the duration - * of the journal flush, 'saved_sequence_num' remains stable - */ - jnl->saved_sequence_num = jnl->sequence_num; - - /* - * if we're here we're going to flush the transaction buffer to disk. - * 'check_free_space' will not return untl there is enough free - * space for this transaction in the journal and jnl->old_start[0] - * is avaiable for use - */ - KERNEL_DEBUG(0xbbbbc030 | DBG_FUNC_START, jnl, 0, 0, 0, 0); - - check_free_space(jnl, tr->total_bytes, &tr->delayed_header_write, jnl->saved_sequence_num); - - KERNEL_DEBUG(0xbbbbc030 | DBG_FUNC_END, jnl, tr->delayed_header_write, 0, 0, 0); - - // range check the end index - if (jnl->jhdr->end <= 0 || jnl->jhdr->end > jnl->jhdr->size) { - panic("jnl: end_transaction: end is bogus 0x%llx (sz 0x%llx)\n", - jnl->jhdr->end, jnl->jhdr->size); - } - if (tr->delayed_header_write == TRUE) { - thread_t thread = THREAD_NULL; - - lock_condition(jnl, &jnl->writing_header, "end_transaction"); - /* - * fire up a thread to write the journal header - * asynchronously... when it finishes, it will call - * unlock_condition... we can overlap the preparation of - * the log and buffers during this time - */ - kernel_thread_start((thread_continue_t)write_header_thread, jnl, &thread); - } else - jnl->write_header_failed = FALSE; - - - // this transaction starts where the current journal ends - tr->journal_start = jnl->jhdr->end; - - lock_oldstart(jnl); - /* - * Because old_start is locked above, we can cast away the volatile qualifier before passing it to memcpy. 
- * slide everyone else down and put our latest guy in the last - * entry in the old_start array - */ - memcpy(__CAST_AWAY_QUALIFIER(&jnl->old_start[0], volatile, void *), __CAST_AWAY_QUALIFIER(&jnl->old_start[1], volatile, void *), sizeof(jnl->old_start)-sizeof(jnl->old_start[0])); - jnl->old_start[sizeof(jnl->old_start)/sizeof(jnl->old_start[0]) - 1] = tr->journal_start | 0x8000000000000000LL; - - unlock_oldstart(jnl); - - - for (blhdr = tr->blhdr; blhdr; blhdr = next) { - char *blkptr; - buf_t sbp; - int32_t bsize; - - tbuffer_offset = jnl->jhdr->blhdr_size; - - for (i = 1; i < blhdr->num_blocks; i++) { - - if (blhdr->binfo[i].bnum != (off_t)-1) { - void (*func)(buf_t, void *); - void *arg; - - bp = blhdr->binfo[i].u.bp; - - if (bp == NULL) { - panic("jnl: inconsistent binfo (NULL bp w/bnum %lld; jnl @ %p, tr %p)\n", - blhdr->binfo[i].bnum, jnl, tr); - } - /* - * acquire the bp here so that we can safely - * mess around with its data. buf_acquire() - * will return EAGAIN if the buffer was busy, - * so loop trying again. - */ - do { - errno = buf_acquire(bp, BAC_REMOVE, 0, 0); - } while (errno == EAGAIN); - - if (errno) - panic("could not acquire bp %p (err %d)\n", bp, errno); - - if ((buf_flags(bp) & (B_LOCKED|B_DELWRI)) != (B_LOCKED|B_DELWRI)) { - if (jnl->flags & JOURNAL_CLOSE_PENDING) { - buf_clearflags(bp, B_LOCKED); - buf_brelse(bp); - - /* - * this is an odd case that appears to happen occasionally - * make sure we mark this block as no longer valid - * so that we don't process it in "finish_end_transaction" since - * the bp that is recorded in our array no longer belongs - * to us (normally we substitute a shadow bp to be processed - * issuing a 'buf_bawrite' on a stale buf_t pointer leads - * to all kinds of problems. - */ - blhdr->binfo[i].bnum = (off_t)-1; - continue; - } else { - panic("jnl: end_tr: !!!DANGER!!! 
bp %p flags (0x%x) not LOCKED & DELWRI\n", bp, buf_flags(bp)); - } - } - bsize = buf_size(bp); - - buf_setfilter(bp, NULL, NULL, &func, &arg); - - blkptr = (char *)&((char *)blhdr)[tbuffer_offset]; - - sbp = buf_create_shadow_priv(bp, FALSE, (uintptr_t)blkptr, 0, 0); - - if (sbp == NULL) - panic("jnl: buf_create_shadow returned NULL"); - - /* - * copy the data into the transaction buffer... - */ - memcpy(blkptr, (char *)buf_dataptr(bp), bsize); - - buf_clearflags(bp, B_LOCKED); - buf_markclean(bp); - buf_drop(bp); - - /* - * adopt the shadow buffer for this block - */ - if (func) { - /* - * transfer FS hook function to the - * shadow buffer... it will get called - * in finish_end_transaction - */ - buf_setfilter(sbp, func, arg, NULL, NULL); - } - blhdr->binfo[i].u.bp = sbp; - - } else { - // bnum == -1, only true if a block was "killed" - bsize = blhdr->binfo[i].u.bi.bsize; - } - tbuffer_offset += bsize; - } - next = (block_list_header *)((long)blhdr->binfo[0].bnum); - } - /* - * if callback != NULL, we don't want to drop the journal - * lock, or complete end_transaction asynchronously, since - * the caller is expecting the callback to run in the calling - * context - * - * if drop_lock == FALSE, we can't complete end_transaction - * asynchronously - */ - if (callback) - drop_lock_early = FALSE; - else - drop_lock_early = drop_lock; - - if (drop_lock_early == FALSE) - must_wait = TRUE; - - if (drop_lock_early == TRUE) { - journal_unlock(jnl); - drop_lock = FALSE; - } - if (must_wait == TRUE) - ret_val = finish_end_transaction(tr, callback, callback_arg); - else { - thread_t thread = THREAD_NULL; - - /* - * fire up a thread to complete processing this transaction - * asynchronously... 
when it finishes, it will call - * unlock_condition - */ - kernel_thread_start((thread_continue_t)finish_end_thread, tr, &thread); - } - KERNEL_DEBUG(0xbbbbc018|DBG_FUNC_END, jnl, tr, ret_val, 0, 0); -done: - if (drop_lock == TRUE) { - journal_unlock(jnl); - } - return (ret_val); -} - - -static void -finish_end_thread(transaction *tr) -{ - proc_set_task_policy(current_task(), current_thread(), - TASK_POLICY_INTERNAL, TASK_POLICY_IOPOL, IOPOL_PASSIVE); - - finish_end_transaction(tr, NULL, NULL); - - thread_deallocate(current_thread()); - thread_terminate(current_thread()); -} - -static void -write_header_thread(journal *jnl) -{ - proc_set_task_policy(current_task(), current_thread(), - TASK_POLICY_INTERNAL, TASK_POLICY_IOPOL, IOPOL_PASSIVE); - - if (write_journal_header(jnl, 1, jnl->saved_sequence_num)) - jnl->write_header_failed = TRUE; - else - jnl->write_header_failed = FALSE; - unlock_condition(jnl, &jnl->writing_header); - - thread_deallocate(current_thread()); - thread_terminate(current_thread()); -} - -static int -finish_end_transaction(transaction *tr, errno_t (*callback)(void*), void *callback_arg) -{ - int i, amt; - int ret = 0; - off_t end; - journal *jnl = tr->jnl; - buf_t bp, *bparray; - vnode_t vp; - block_list_header *blhdr=NULL, *next=NULL; - size_t tbuffer_offset; - int bufs_written = 0; - int ret_val = 0; - boolean_t was_vm_privileged = FALSE; - - KERNEL_DEBUG(0xbbbbc028|DBG_FUNC_START, jnl, tr, 0, 0, 0); - - if (jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) { - /* - * if we block waiting for memory, and there is enough pressure to - * cause us to try and create a new swap file, we may end up deadlocking - * due to waiting for the journal on the swap file creation path... 
- * by making ourselves vm_privileged, we give ourselves the best chance - * of not blocking - */ - was_vm_privileged = set_vm_privilege(TRUE); - } - end = jnl->jhdr->end; - - for (blhdr = tr->blhdr; blhdr; blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum)) { - - amt = blhdr->bytes_used; - - blhdr->binfo[0].u.bi.b.sequence_num = tr->sequence_num; - - blhdr->checksum = 0; - blhdr->checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE); - - if (kmem_alloc(kernel_map, (vm_offset_t *)&bparray, blhdr->num_blocks * sizeof(struct buf *), VM_KERN_MEMORY_FILE)) { - panic("can't allocate %zd bytes for bparray\n", blhdr->num_blocks * sizeof(struct buf *)); - } - tbuffer_offset = jnl->jhdr->blhdr_size; - - for (i = 1; i < blhdr->num_blocks; i++) { - void (*func)(buf_t, void *); - void *arg; - int32_t bsize; - - /* - * finish preparing the shadow buf_t before - * calculating the individual block checksums - */ - if (blhdr->binfo[i].bnum != (off_t)-1) { - daddr64_t blkno; - daddr64_t lblkno; - - bp = blhdr->binfo[i].u.bp; - - vp = buf_vnode(bp); - blkno = buf_blkno(bp); - lblkno = buf_lblkno(bp); - - if (vp == NULL && lblkno == blkno) { - printf("jnl: %s: end_tr: bad news! buffer w/null vp and l/blkno = %qd/%qd. aborting the transaction.\n", - jnl->jdev_name, lblkno, blkno); - ret_val = -1; - goto bad_journal; - } - - // if the lblkno is the same as blkno and this bp isn't - // associated with the underlying file system device then - // we need to call bmap() to get the actual physical block. 
- // - if ((lblkno == blkno) && (vp != jnl->fsdev)) { - off_t f_offset; - size_t contig_bytes; - - if (VNOP_BLKTOOFF(vp, lblkno, &f_offset)) { - printf("jnl: %s: end_tr: vnop_blktooff failed\n", jnl->jdev_name); - ret_val = -1; - goto bad_journal; - } - if (VNOP_BLOCKMAP(vp, f_offset, buf_count(bp), &blkno, &contig_bytes, NULL, 0, NULL)) { - printf("jnl: %s: end_tr: can't blockmap the buffer", jnl->jdev_name); - ret_val = -1; - goto bad_journal; - } - if ((uint32_t)contig_bytes < buf_count(bp)) { - printf("jnl: %s: end_tr: blk not physically contiguous on disk\n", jnl->jdev_name); - ret_val = -1; - goto bad_journal; - } - buf_setblkno(bp, blkno); - } - // update this so we write out the correct physical block number! - blhdr->binfo[i].bnum = (off_t)(blkno); - - /* - * pick up the FS hook function (if any) and prepare - * to fire this buffer off in the next pass - */ - buf_setfilter(bp, buffer_flushed_callback, tr, &func, &arg); - - if (func) { - /* - * call the hook function supplied by the filesystem... 
- * this needs to happen BEFORE cacl_checksum in case - * the FS morphs the data in the buffer - */ - func(bp, arg); - } - bparray[i] = bp; - bsize = buf_size(bp); - blhdr->binfo[i].u.bi.bsize = bsize; - blhdr->binfo[i].u.bi.b.cksum = calc_checksum(&((char *)blhdr)[tbuffer_offset], bsize); - } else { - bparray[i] = NULL; - bsize = blhdr->binfo[i].u.bi.bsize; - blhdr->binfo[i].u.bi.b.cksum = 0; - } - tbuffer_offset += bsize; - } - /* - * if we fired off the journal_write_header asynchronously in - * 'end_transaction', we need to wait for its completion - * before writing the actual journal data - */ - wait_condition(jnl, &jnl->writing_header, "finish_end_transaction"); - - if (jnl->write_header_failed == FALSE) - ret = write_journal_data(jnl, &end, blhdr, amt); - else - ret_val = -1; - /* - * put the bp pointers back so that we can - * make the final pass on them - */ - for (i = 1; i < blhdr->num_blocks; i++) - blhdr->binfo[i].u.bp = bparray[i]; - - kmem_free(kernel_map, (vm_offset_t)bparray, blhdr->num_blocks * sizeof(struct buf *)); - - if (ret_val == -1) - goto bad_journal; - - if (ret != amt) { - printf("jnl: %s: end_transaction: only wrote %d of %d bytes to the journal!\n", - jnl->jdev_name, ret, amt); - - ret_val = -1; - goto bad_journal; - } - } - jnl->jhdr->end = end; // update where the journal now ends - tr->journal_end = end; // the transaction ends here too - - if (tr->journal_start == 0 || tr->journal_end == 0) { - panic("jnl: end_transaction: bad tr journal start/end: 0x%llx 0x%llx\n", - tr->journal_start, tr->journal_end); - } - - if (write_journal_header(jnl, 0, jnl->saved_sequence_num) != 0) { - ret_val = -1; - goto bad_journal; - } - /* - * If the caller supplied a callback, call it now that the blocks have been - * written to the journal. This is used by journal_relocate so, for example, - * the file system can change its pointer to the new journal. 
- */ - if (callback != NULL && callback(callback_arg) != 0) { - ret_val = -1; - goto bad_journal; - } - - // - // Send a DKIOCUNMAP for the extents trimmed by this transaction, and - // free up the extent list. - // - journal_trim_flush(jnl, tr); - - // the buffer_flushed_callback will only be called for the - // real blocks that get flushed so we have to account for - // the block_list_headers here. - // - tr->num_flushed = tr->num_blhdrs * jnl->jhdr->blhdr_size; - - lock_condition(jnl, &jnl->asyncIO, "finish_end_transaction"); - - // - // setup for looping through all the blhdr's. - // - for (blhdr = tr->blhdr; blhdr; blhdr = next) { - uint16_t num_blocks; - - /* - * grab this info ahead of issuing the buf_bawrites... - * once the last one goes out, its possible for blhdr - * to be freed (especially if we get preempted) before - * we do the last check of num_blocks or - * grab the next blhdr pointer... - */ - next = (block_list_header *)((long)blhdr->binfo[0].bnum); - num_blocks = blhdr->num_blocks; - - /* - * we can re-order the buf ptrs because everything is written out already - */ - qsort(&blhdr->binfo[1], num_blocks-1, sizeof(block_info), journal_binfo_cmp); - - /* - * need to make sure that the loop issuing the buf_bawrite's - * does not touch blhdr once the last buf_bawrite has been - * issued... 
at that point, we no longer have a legitmate - * reference on the associated storage since it will be - * released upon the completion of that last buf_bawrite - */ - for (i = num_blocks-1; i >= 1; i--) { - if (blhdr->binfo[i].bnum != (off_t)-1) - break; - num_blocks--; - } - for (i = 1; i < num_blocks; i++) { - - if ((bp = blhdr->binfo[i].u.bp)) { - vp = buf_vnode(bp); - - buf_bawrite(bp); - - // this undoes the vnode_ref() in journal_modify_block_end() - vnode_rele_ext(vp, 0, 1); - - bufs_written++; - } - } - } - if (bufs_written == 0) { - /* - * since we didn't issue any buf_bawrite's, there is no - * async trigger to cause the memory associated with this - * transaction to be freed... so, move it to the garbage - * list now - */ - lock_oldstart(jnl); - - tr->next = jnl->tr_freeme; - jnl->tr_freeme = tr; - - unlock_oldstart(jnl); - - unlock_condition(jnl, &jnl->asyncIO); - } - - //printf("jnl: end_tr: tr @ 0x%x, jnl-blocks: 0x%llx - 0x%llx. exit!\n", - // tr, tr->journal_start, tr->journal_end); - -bad_journal: - if (ret_val == -1) { - abort_transaction(jnl, tr); // cleans up list of extents to be trimmed - - /* - * 'flush_aborted' is protected by the flushing condition... we need to - * set it before dropping the condition so that it will be - * noticed in 'end_transaction'... we add this additional - * aborted condition so that we can drop the 'flushing' condition - * before grabbing the journal lock... this avoids a deadlock - * in 'end_transaction' which is holding the journal lock while - * waiting for the 'flushing' condition to clear... 
- * everyone else will notice the JOURNAL_INVALID flag - */ - jnl->flush_aborted = TRUE; - - unlock_condition(jnl, &jnl->flushing); - journal_lock(jnl); - - jnl->flags |= JOURNAL_INVALID; - jnl->old_start[sizeof(jnl->old_start)/sizeof(jnl->old_start[0]) - 1] &= ~0x8000000000000000LL; - - journal_unlock(jnl); - } else - unlock_condition(jnl, &jnl->flushing); - - if ((jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) && (was_vm_privileged == FALSE)) - set_vm_privilege(FALSE); - - KERNEL_DEBUG(0xbbbbc028|DBG_FUNC_END, jnl, tr, bufs_written, ret_val, 0); - - return (ret_val); -} - - -static void -lock_condition(journal *jnl, boolean_t *condition, const char *condition_name) -{ - - KERNEL_DEBUG(0xbbbbc020|DBG_FUNC_START, jnl, condition, 0, 0, 0); - - lock_flush(jnl); - - while (*condition == TRUE) - msleep(condition, &jnl->flock, PRIBIO, condition_name, NULL); - - *condition = TRUE; - unlock_flush(jnl); - - KERNEL_DEBUG(0xbbbbc020|DBG_FUNC_END, jnl, condition, 0, 0, 0); -} - -static void -wait_condition(journal *jnl, boolean_t *condition, const char *condition_name) -{ - - if (*condition == FALSE) - return; - - KERNEL_DEBUG(0xbbbbc02c|DBG_FUNC_START, jnl, condition, 0, 0, 0); - - lock_flush(jnl); - - while (*condition == TRUE) - msleep(condition, &jnl->flock, PRIBIO, condition_name, NULL); - - unlock_flush(jnl); - - KERNEL_DEBUG(0xbbbbc02c|DBG_FUNC_END, jnl, condition, 0, 0, 0); -} - -static void -unlock_condition(journal *jnl, boolean_t *condition) -{ - lock_flush(jnl); - - *condition = FALSE; - wakeup(condition); - - unlock_flush(jnl); -} - -static void -abort_transaction(journal *jnl, transaction *tr) -{ - block_list_header *blhdr, *next; - - // for each block list header, iterate over the blocks then - // free up the memory associated with the block list. - // - // find each of the primary blocks (i.e. 
the list could - // contain a mix of shadowed and real buf_t's depending - // on when the abort condition was detected) and mark them - // clean and locked in the cache... this at least allows - // the FS a consistent view between it's incore data structures - // and the meta-data held in the cache - // - KERNEL_DEBUG(0xbbbbc034|DBG_FUNC_START, jnl, tr, 0, 0, 0); - - for (blhdr = tr->blhdr; blhdr; blhdr = next) { - int i; - - for (i = 1; i < blhdr->num_blocks; i++) { - buf_t bp, tbp, sbp; - vnode_t bp_vp; - errno_t errno; - - if (blhdr->binfo[i].bnum == (off_t)-1) - continue; - - tbp = blhdr->binfo[i].u.bp; - - bp_vp = buf_vnode(tbp); - - if (buf_shadow(tbp)) { - sbp = tbp; - buf_setfilter(tbp, NULL, NULL, NULL, NULL); - } else { - assert(ISSET(buf_flags(tbp), B_LOCKED)); - - sbp = NULL; - - do { - errno = buf_acquire(tbp, BAC_REMOVE, 0, 0); - } while (errno == EAGAIN); - - if (!errno) { - buf_setfilter(tbp, NULL, NULL, NULL, NULL); - buf_brelse(tbp); - } - } - - if (bp_vp) { - errno = buf_meta_bread(bp_vp, - buf_lblkno(tbp), - buf_size(tbp), - NOCRED, - &bp); - if (errno == 0) { - if (sbp == NULL && bp != tbp && (buf_flags(tbp) & B_LOCKED)) { - panic("jnl: abort_tr: got back a different bp! 
(bp %p should be %p, jnl %p\n", - bp, tbp, jnl); - } - /* - * once the journal has been marked INVALID and aborted, - * NO meta data can be written back to the disk, so - * mark the buf_t clean and make sure it's locked in the cache - * note: if we found a shadow, the real buf_t needs to be relocked - */ - buf_setflags(bp, B_LOCKED); - buf_markclean(bp); - buf_brelse(bp); - - KERNEL_DEBUG(0xbbbbc034|DBG_FUNC_NONE, jnl, tr, bp, 0, 0); - - /* - * this undoes the vnode_ref() in journal_modify_block_end() - */ - vnode_rele_ext(bp_vp, 0, 1); - } else { - printf("jnl: %s: abort_tr: could not find block %lld for vnode!\n", - jnl->jdev_name, blhdr->binfo[i].bnum); - if (bp) { - buf_brelse(bp); - } - } - } - if (sbp) - buf_brelse(sbp); - } - next = (block_list_header *)((long)blhdr->binfo[0].bnum); - - // we can free blhdr here since we won't need it any more - blhdr->binfo[0].bnum = 0xdeadc0de; - kmem_free(kernel_map, (vm_offset_t)blhdr, tr->tbuffer_size); - } - - /* - * If the transaction we're aborting was the async transaction, then - * tell the current transaction that there is no pending trim - * any more. - */ - lck_rw_lock_exclusive(&jnl->trim_lock); - if (jnl->async_trim == &tr->trim) - jnl->async_trim = NULL; - lck_rw_unlock_exclusive(&jnl->trim_lock); - - - if (tr->trim.extents) { - kfree(tr->trim.extents, tr->trim.allocated_count * sizeof(dk_extent_t)); - } - tr->trim.allocated_count = 0; - tr->trim.extent_count = 0; - tr->trim.extents = NULL; - tr->tbuffer = NULL; - tr->blhdr = NULL; - tr->total_bytes = 0xdbadc0de; - FREE_ZONE(tr, sizeof(transaction), M_JNL_TR); - - KERNEL_DEBUG(0xbbbbc034|DBG_FUNC_END, jnl, tr, 0, 0, 0); -} - - -int -journal_end_transaction(journal *jnl) -{ - int ret; - transaction *tr; - - CHECK_JOURNAL(jnl); - - free_old_stuff(jnl); - - if ((jnl->flags & JOURNAL_INVALID) && jnl->owner == NULL) { - return 0; - } - - if (jnl->owner != current_thread()) { - panic("jnl: end_tr: I'm not the owner! 
jnl %p, owner %p, curact %p\n", - jnl, jnl->owner, current_thread()); - } - jnl->nested_count--; - - if (jnl->nested_count > 0) { - return 0; - } else if (jnl->nested_count < 0) { - panic("jnl: jnl @ %p has negative nested count (%d). bad boy.\n", jnl, jnl->nested_count); - } - - if (jnl->flags & JOURNAL_INVALID) { - if (jnl->active_tr) { - if (jnl->cur_tr != NULL) { - panic("jnl: journal @ %p has active tr (%p) and cur tr (%p)\n", - jnl, jnl->active_tr, jnl->cur_tr); - } - tr = jnl->active_tr; - jnl->active_tr = NULL; - - abort_transaction(jnl, tr); - } - journal_unlock(jnl); - - return EINVAL; - } - - tr = jnl->active_tr; - CHECK_TRANSACTION(tr); - - // clear this out here so that when check_free_space() calls - // the FS flush function, we don't panic in journal_flush() - // if the FS were to call that. note: check_free_space() is - // called from end_transaction(). - // - jnl->active_tr = NULL; - - /* Examine the force-journal-flush state in the active txn */ - if (tr->flush_on_completion == TRUE) { - /* - * If the FS requested it, disallow group commit and force the - * transaction out to disk immediately. - */ - ret = end_transaction(tr, 1, NULL, NULL, TRUE, TRUE); - } - else { - /* in the common path we can simply use the double-buffered journal */ - ret = end_transaction(tr, 0, NULL, NULL, TRUE, FALSE); - } - - return ret; -} - - -/* - * Flush the contents of the journal to the disk. - * - * Input: - * wait_for_IO - - * If TRUE, wait to write in-memory journal to the disk - * consistently, and also wait to write all asynchronous - * metadata blocks to its corresponding locations - * consistently on the disk. This means that the journal - * is empty at this point and does not contain any - * transactions. This is overkill in normal scenarios - * but is useful whenever the metadata blocks are required - * to be consistent on-disk instead of just the journal - * being consistent; like before live verification - * and live volume resizing. 
- * - * If FALSE, only wait to write in-memory journal to the - * disk consistently. This means that the journal still - * contains uncommitted transactions and the file system - * metadata blocks in the journal transactions might be - * written asynchronously to the disk. But there is no - * guarantee that they are written to the disk before - * returning to the caller. Note that this option is - * sufficient for file system data integrity as it - * guarantees consistent journal content on the disk. - */ -int -journal_flush(journal *jnl, journal_flush_options_t options) -{ - boolean_t drop_lock = FALSE; - errno_t error = 0; - uint32_t flush_count; - - CHECK_JOURNAL(jnl); - - free_old_stuff(jnl); - - if (jnl->flags & JOURNAL_INVALID) { - return -1; - } - - KERNEL_DEBUG(DBG_JOURNAL_FLUSH | DBG_FUNC_START, jnl, 0, 0, 0, 0); - - if (jnl->owner != current_thread()) { - journal_lock(jnl); - drop_lock = TRUE; - } - - if (ISSET(options, JOURNAL_FLUSH_FULL)) - flush_count = jnl->flush_counter; - - // if we're not active, flush any buffered transactions - if (jnl->active_tr == NULL && jnl->cur_tr) { - transaction *tr = jnl->cur_tr; - - jnl->cur_tr = NULL; - - if (ISSET(options, JOURNAL_WAIT_FOR_IO)) { - wait_condition(jnl, &jnl->flushing, "journal_flush"); - wait_condition(jnl, &jnl->asyncIO, "journal_flush"); - } - /* - * "end_transction" will wait for any current async flush - * to complete, before flushing "cur_tr"... because we've - * specified the 'must_wait' arg as TRUE, it will then - * synchronously flush the "cur_tr" - */ - end_transaction(tr, 1, NULL, NULL, drop_lock, TRUE); // force it to get flushed - - } else { - if (drop_lock == TRUE) { - journal_unlock(jnl); - } - - /* Because of pipelined journal, the journal transactions - * might be in process of being flushed on another thread. 
- * If there is nothing to flush currently, we should - * synchronize ourselves with the pipelined journal thread - * to ensure that all inflight transactions, if any, are - * flushed before we return success to caller. - */ - wait_condition(jnl, &jnl->flushing, "journal_flush"); - } - if (ISSET(options, JOURNAL_WAIT_FOR_IO)) { - wait_condition(jnl, &jnl->asyncIO, "journal_flush"); - } - - if (ISSET(options, JOURNAL_FLUSH_FULL)) { - - dk_synchronize_t sync_request = { - .options = 0, - }; - - // We need a full cache flush. If it has not been done, do it here. - if (flush_count == jnl->flush_counter) - error = VNOP_IOCTL(jnl->jdev, DKIOCSYNCHRONIZE, (caddr_t)&sync_request, FWRITE, vfs_context_kernel()); - - // If external journal partition is enabled, flush filesystem data partition. - if (jnl->jdev != jnl->fsdev) - error = VNOP_IOCTL(jnl->fsdev, DKIOCSYNCHRONIZE, (caddr_t)&sync_request, FWRITE, vfs_context_kernel()); - - } - - KERNEL_DEBUG(DBG_JOURNAL_FLUSH | DBG_FUNC_END, jnl, 0, 0, 0, 0); - - return 0; -} - -int -journal_active(journal *jnl) -{ - if (jnl->flags & JOURNAL_INVALID) { - return -1; - } - - return (jnl->active_tr == NULL) ? 0 : 1; -} - -void * -journal_owner(journal *jnl) -{ - return jnl->owner; -} - -int journal_uses_fua(journal *jnl) -{ - if (jnl->flags & JOURNAL_DO_FUA_WRITES) - return 1; - return 0; -} - -/* - * Relocate the journal. - * - * You provide the new starting offset and size for the journal. You may - * optionally provide a new tbuffer_size; passing zero defaults to not - * changing the tbuffer size except as needed to fit within the new journal - * size. - * - * You must have already started a transaction. The transaction may contain - * modified blocks (such as those needed to deallocate the old journal, - * allocate the new journal, and update the location and size of the journal - * in filesystem-private structures). Any transactions prior to the active - * transaction will be flushed to the old journal. 
The new journal will be - * initialized, and the blocks from the active transaction will be written to - * the new journal. - * - * The caller will need to update the structures that identify the location - * and size of the journal. These updates should be made in the supplied - * callback routine. These updates must NOT go into a transaction. You should - * force these updates to the media before returning from the callback. In the - * even of a crash, either the old journal will be found, with an empty journal, - * or the new journal will be found with the contents of the active transaction. - * - * Upon return from the callback, the blocks from the active transaction are - * written to their normal locations on disk. - * - * (Remember that we have to ensure that blocks get committed to the journal - * before being committed to their normal locations. But the blocks don't count - * as committed until the new journal is pointed at.) - * - * Upon return, there is still an active transaction: newly allocated, and - * with no modified blocks. Call journal_end_transaction as normal. You may - * modifiy additional blocks before calling journal_end_transaction, and those - * blocks will (eventually) go to the relocated journal. - * - * Inputs: - * jnl The (opened) journal to relocate. - * offset The new journal byte offset (from start of the journal device). - * journal_size The size, in bytes, of the new journal. - * tbuffer_size The new desired transaction buffer size. Pass zero to keep - * the same size as the current journal. The size will be - * modified as needed to fit the new journal. - * callback Routine called after the new journal has been initialized, - * and the active transaction written to the new journal, but - * before the blocks are written to their normal locations. - * Pass NULL for no callback. - * callback_arg An argument passed to the callback routine. 
- * - * Result: - * 0 No errors - * EINVAL The offset is not block aligned - * EINVAL The journal_size is not a multiple of the block size - * EINVAL The journal is invalid - * (any) An error returned by journal_flush. - * - */ -int journal_relocate(journal *jnl, off_t offset, off_t journal_size, int32_t tbuffer_size, - errno_t (*callback)(void *), void *callback_arg) -{ - int ret; - transaction *tr; - size_t i = 0; - - /* - * Sanity check inputs, and adjust the size of the transaction buffer. - */ - if ((offset % jnl->jhdr->jhdr_size) != 0) { - printf("jnl: %s: relocate: offset 0x%llx is not an even multiple of block size 0x%x\n", - jnl->jdev_name, offset, jnl->jhdr->jhdr_size); - return EINVAL; - } - if ((journal_size % jnl->jhdr->jhdr_size) != 0) { - printf("jnl: %s: relocate: journal size 0x%llx is not an even multiple of block size 0x%x\n", - jnl->jdev_name, journal_size, jnl->jhdr->jhdr_size); - return EINVAL; - } - - CHECK_JOURNAL(jnl); - - /* Guarantee we own the active transaction. */ - if (jnl->flags & JOURNAL_INVALID) { - return EINVAL; - } - if (jnl->owner != current_thread()) { - panic("jnl: relocate: Not the owner! jnl %p, owner %p, curact %p\n", - jnl, jnl->owner, current_thread()); - } - - if (tbuffer_size == 0) - tbuffer_size = jnl->tbuffer_size; - size_up_tbuffer(jnl, tbuffer_size, jnl->jhdr->jhdr_size); - - /* - * Flush any non-active transactions. We have to temporarily hide the - * active transaction to make journal_flush flush out non-active but - * current (unwritten) transactions. - */ - tr = jnl->active_tr; - CHECK_TRANSACTION(tr); - jnl->active_tr = NULL; - ret = journal_flush(jnl, JOURNAL_WAIT_FOR_IO); - jnl->active_tr = tr; - - if (ret) { - return ret; - } - wait_condition(jnl, &jnl->flushing, "end_transaction"); - - /* - * At this point, we have completely flushed the contents of the current - * journal to disk (and have asynchronously written all of the txns to - * their actual desired locations). 
As a result, we can (and must) clear - * out the old_start array. If we do not, then if the last written transaction - * started at the beginning of the journal (starting 1 block into the - * journal file) it could confuse the buffer_flushed callback. This is - * because we're about to reset the start/end pointers of the journal header - * below. - */ - lock_oldstart(jnl); - for (i = 0; i < sizeof (jnl->old_start) / sizeof(jnl->old_start[0]); i++) { - jnl->old_start[i] = 0; - } - unlock_oldstart(jnl); - - /* Update the journal's offset and size in memory. */ - jnl->jdev_offset = offset; - jnl->jhdr->start = jnl->jhdr->end = jnl->jhdr->jhdr_size; - jnl->jhdr->size = journal_size; - jnl->active_start = jnl->jhdr->start; - - /* - * Force the active transaction to be written to the new journal. Call the - * supplied callback after the blocks have been written to the journal, but - * before they get written to their normal on-disk locations. - */ - jnl->active_tr = NULL; - ret = end_transaction(tr, 1, callback, callback_arg, FALSE, TRUE); - if (ret) { - printf("jnl: %s: relocate: end_transaction failed (%d)\n", jnl->jdev_name, ret); - goto bad_journal; - } - - /* - * Create a new, empty transaction to be the active transaction. This way - * our caller can use journal_end_transaction as usual. - */ - ret = journal_allocate_transaction(jnl); - if (ret) { - printf("jnl: %s: relocate: could not allocate new transaction (%d)\n", jnl->jdev_name, ret); - goto bad_journal; - } - - return 0; - -bad_journal: - jnl->flags |= JOURNAL_INVALID; - abort_transaction(jnl, tr); - return ret; -} - -uint32_t journal_current_txn(journal *jnl) -{ - return jnl->sequence_num + (jnl->active_tr || jnl->cur_tr ? 
0 : 1); -} - -#else // !JOURNALING - so provide stub functions - -int journal_uses_fua(__unused journal *jnl) -{ - return 0; -} - -journal * -journal_create(__unused struct vnode *jvp, - __unused off_t offset, - __unused off_t journal_size, - __unused struct vnode *fsvp, - __unused size_t min_fs_blksz, - __unused int32_t flags, - __unused int32_t tbuffer_size, - __unused void (*flush)(void *arg), - __unused void *arg, - __unused struct mount *fsmount) -{ - return NULL; -} - -journal * -journal_open(__unused struct vnode *jvp, - __unused off_t offset, - __unused off_t journal_size, - __unused struct vnode *fsvp, - __unused size_t min_fs_blksz, - __unused int32_t flags, - __unused int32_t tbuffer_size, - __unused void (*flush)(void *arg), - __unused void *arg, - __unused struct mount *fsmount) -{ - return NULL; -} - - -int -journal_modify_block_start(__unused journal *jnl, __unused struct buf *bp) -{ - return EINVAL; -} - -int -journal_modify_block_end(__unused journal *jnl, - __unused struct buf *bp, - __unused void (*func)(struct buf *bp, void *arg), - __unused void *arg) -{ - return EINVAL; -} - -int -journal_kill_block(__unused journal *jnl, __unused struct buf *bp) -{ - return EINVAL; -} - -int journal_relocate(__unused journal *jnl, - __unused off_t offset, - __unused off_t journal_size, - __unused int32_t tbuffer_size, - __unused errno_t (*callback)(void *), - __unused void *callback_arg) -{ - return EINVAL; -} - -void -journal_close(__unused journal *jnl) -{ -} - -int -journal_start_transaction(__unused journal *jnl) -{ - return EINVAL; -} - -int -journal_end_transaction(__unused journal *jnl) -{ - return EINVAL; -} - -int -journal_flush(__unused journal *jnl, __unused journal_flush_options_t options) -{ - return EINVAL; -} - -int -journal_is_clean(__unused struct vnode *jvp, - __unused off_t offset, - __unused off_t journal_size, - __unused struct vnode *fsvp, - __unused size_t min_fs_block_size) -{ - return 0; -} - - -void * -journal_owner(__unused journal 
*jnl) -{ - return NULL; -} - -void -journal_lock(__unused journal *jnl) -{ - return; -} - -void -journal_unlock(__unused journal *jnl) -{ - return; -} - -__private_extern__ int -journal_trim_add_extent(__unused journal *jnl, - __unused uint64_t offset, - __unused uint64_t length) -{ - return 0; -} - -int -journal_request_immediate_flush(__unused journal *jnl) -{ - return 0; -} - -__private_extern__ int -journal_trim_remove_extent(__unused journal *jnl, - __unused uint64_t offset, - __unused uint64_t length) -{ - return 0; -} - -int journal_trim_extent_overlap(__unused journal *jnl, - __unused uint64_t offset, - __unused uint64_t length, - __unused uint64_t *end) -{ - return 0; -} - -#endif // !JOURNALING diff --git a/bsd/vfs/vfs_journal.h b/bsd/vfs/vfs_journal.h deleted file mode 100644 index 42fd81e5c..000000000 --- a/bsd/vfs/vfs_journal.h +++ /dev/null @@ -1,381 +0,0 @@ - -/* - * Copyright (c) 2000-2014 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * This header contains the structures and function prototypes - * for the vfs journaling code. The data types are not meant - * to be modified by user code. Just use the functions and do - * not mess around with the structs. - */ -#ifndef _SYS_VFS_JOURNAL_H_ -#define _SYS_VFS_JOURNAL_H_ - -#include -#include - -#ifdef __APPLE_API_UNSTABLE - -#include -#include -#include - - -typedef struct _blk_info { - int32_t bsize; - union { - int32_t cksum; - uint32_t sequence_num; - } b; -} _blk_info; - -typedef struct block_info { - off_t bnum; // block # on the file system device - union { - _blk_info bi; - struct buf *bp; - } u; -} __attribute__((__packed__)) block_info; - -typedef struct block_list_header { - u_int16_t max_blocks; // max number of blocks in this chunk - u_int16_t num_blocks; // number of valid block numbers in block_nums - int32_t bytes_used; // how many bytes of this tbuffer are used - uint32_t checksum; // on-disk: checksum of this header and binfo[0] - int32_t flags; // check-checksums, initial blhdr, etc - block_info binfo[1]; // so we can reference them by name -} block_list_header; - -#define BLHDR_CHECK_CHECKSUMS 0x0001 -#define BLHDR_FIRST_HEADER 0x0002 - - -struct journal; - -struct jnl_trim_list { - uint32_t allocated_count; - uint32_t extent_count; - dk_extent_t *extents; -}; - -typedef void (*jnl_trim_callback_t)(void *arg, uint32_t extent_count, const dk_extent_t *extents); - -typedef struct transaction { - int 
tbuffer_size; // in bytes - char *tbuffer; // memory copy of the transaction - block_list_header *blhdr; // points to the first byte of tbuffer - int num_blhdrs; // how many buffers we've allocated - int total_bytes; // total # of bytes in transaction - int num_flushed; // how many bytes have been flushed - int num_killed; // how many bytes were "killed" - off_t journal_start; // where in the journal this transaction starts - off_t journal_end; // where in the journal this transaction ends - struct journal *jnl; // ptr back to the journal structure - struct transaction *next; // list of tr's (either completed or to be free'd) - uint32_t sequence_num; - struct jnl_trim_list trim; - boolean_t delayed_header_write; - boolean_t flush_on_completion; //flush transaction immediately upon txn end. -} transaction; - - -/* - * This is written to block zero of the journal and it - * maintains overall state about the journal. - */ -typedef struct journal_header { - int32_t magic; - int32_t endian; - volatile off_t start; // zero-based byte offset of the start of the first transaction - volatile off_t end; // zero-based byte offset of where free space begins - off_t size; // size in bytes of the entire journal - int32_t blhdr_size; // size in bytes of each block_list_header in the journal - uint32_t checksum; - int32_t jhdr_size; // block size (in bytes) of the journal header - uint32_t sequence_num; // NEW FIELD: a monotonically increasing value assigned to all txn's -} journal_header; - -#define JOURNAL_HEADER_MAGIC 0x4a4e4c78 // 'JNLx' -#define ENDIAN_MAGIC 0x12345678 - -// -// we only checksum the original size of the journal_header to remain -// backwards compatible. the size of the original journal_heade is -// everything up to the the sequence_num field, hence we use the -// offsetof macro to calculate the size. 
-// -#define JOURNAL_HEADER_CKSUM_SIZE (offsetof(struct journal_header, sequence_num)) - -#define OLD_JOURNAL_HEADER_MAGIC 0x4a484452 // 'JHDR' - - -/* - * In memory structure about the journal. - */ -typedef struct journal { - lck_mtx_t jlock; // protects the struct journal data - lck_mtx_t flock; // serializes flushing of journal - lck_rw_t trim_lock; // protects the async_trim field, below - - - struct vnode *jdev; // vnode of the device where the journal lives - off_t jdev_offset; // byte offset to the start of the journal - const char *jdev_name; - - struct vnode *fsdev; // vnode of the file system device - struct mount *fsmount; // mount of the file system - - void (*flush)(void *arg); // fs callback to flush meta data blocks - void *flush_arg; // arg that's passed to flush() - - int32_t flags; - int32_t tbuffer_size; // default transaction buffer size - boolean_t flush_aborted; - boolean_t flushing; - boolean_t asyncIO; - boolean_t writing_header; - boolean_t write_header_failed; - - struct jnl_trim_list *async_trim; // extents to be trimmed by transaction being asynchronously flushed - jnl_trim_callback_t trim_callback; - void *trim_callback_arg; - - char *header_buf; // in-memory copy of the journal header - int32_t header_buf_size; - journal_header *jhdr; // points to the first byte of header_buf - - uint32_t saved_sequence_num; - uint32_t sequence_num; - - off_t max_read_size; - off_t max_write_size; - - transaction *cur_tr; // for group-commit - transaction *completed_trs; // out-of-order transactions that completed - transaction *active_tr; // for nested transactions - int32_t nested_count; // for nested transactions - void *owner; // a ptr that's unique to the calling process - - transaction *tr_freeme; // transaction structs that need to be free'd - - volatile off_t active_start; // the active start that we only keep in memory - lck_mtx_t old_start_lock; // protects the old_start - volatile off_t old_start[16]; // this is how we do lazy start update 
- - int last_flush_err; // last error from flushing the cache - uint32_t flush_counter; // a monotonically increasing value assigned on track cache flush -} journal; - -/* internal-only journal flags (top 16 bits) */ -#define JOURNAL_CLOSE_PENDING 0x00010000 -#define JOURNAL_INVALID 0x00020000 -#define JOURNAL_FLUSHCACHE_ERR 0x00040000 // means we already printed this err -#define JOURNAL_NEED_SWAP 0x00080000 // swap any data read from disk -#define JOURNAL_DO_FUA_WRITES 0x00100000 // do force-unit-access writes -#define JOURNAL_USE_UNMAP 0x00200000 // device supports UNMAP (TRIM) -#define JOURNAL_FEATURE_BARRIER 0x00400000 // device supports barrier-only flush - - -/* journal_open/create options are always in the low-16 bits */ -#define JOURNAL_OPTION_FLAGS_MASK 0x0000ffff - -__BEGIN_DECLS -/* - * Prototypes. - */ - -/* - * Call journal_init() to initialize the journaling code (sets up lock attributes) - */ -void journal_init(void); - -/* - * Call journal_create() to create a new journal. You only - * call this once, typically at file system creation time. - * - * The "jvp" argument is the vnode where the journal is written. - * The journal starts at "offset" and is "journal_size" bytes long. - * - * The "fsvp" argument is the vnode of your file system. It may be - * the same as "jvp". - * - * The "min_fs_block_size" argument is the minimum block size - * (in bytes) that the file system will ever write. Typically - * this is the block size of the file system (1k, 4k, etc) but - * on HFS+ it is the minimum block size of the underlying device. - * - * The flags argument lets you disable group commit if you - * want tighter guarantees on transactions (in exchange for - * lower performance). - * - * The tbuffer_size is the size of the transaction buffer - * used by the journal. If you specify zero, the journal code - * will use a reasonable defaults. The tbuffer_size should - * be an integer multiple of the min_fs_block_size. 
- * - * Returns a valid journal pointer or NULL if one could not - * be created. - */ -journal *journal_create(struct vnode *jvp, - off_t offset, - off_t journal_size, - struct vnode *fsvp, - size_t min_fs_block_size, - int32_t flags, - int32_t tbuffer_size, - void (*flush)(void *arg), - void *arg, - struct mount *fsmount); - -/* - * Call journal_open() when mounting an existing file system - * that has a previously created journal. It will take care - * of validating the journal and replaying it if necessary. - * - * See journal_create() for a description of the arguments. - * - * Returns a valid journal pointer of NULL if it runs into - * trouble reading/playing back the journal. - */ -journal *journal_open(struct vnode *jvp, - off_t offset, - off_t journal_size, - struct vnode *fsvp, - size_t min_fs_block_size, - int32_t flags, - int32_t tbuffer_size, - void (*flush)(void *arg), - void *arg, - struct mount *fsmount); - -/* - * Test whether the journal is clean or not. This is intended - * to be used when you're mounting read-only. If the journal - * is not clean for some reason then you should not mount the - * volume as your data structures may be in an unknown state. - */ -int journal_is_clean(struct vnode *jvp, - off_t offset, - off_t journal_size, - struct vnode *fsvp, - size_t min_fs_block_size); - - -/* - * Call journal_close() just before your file system is unmounted. - * It flushes any outstanding transactions and makes sure the - * journal is in a consistent state. - */ -void journal_close(journal *journalp); - -/* - * flags for journal_create/open. only can use - * the low 16 bits for flags because internal - * bits go in the high 16. - */ -#define JOURNAL_NO_GROUP_COMMIT 0x00000001 -#define JOURNAL_RESET 0x00000002 - -/* - * Transaction related functions. - * - * Before you start modifying file system meta data, you - * should call journal_start_transaction(). 
Then before - * you modify each block, call journal_modify_block_start() - * and when you're done, journal_modify_block_end(). When - * you've modified the last block as part of a transaction, - * call journal_end_transaction() to commit the changes. - * - * If you decide to abort the modifications to a block you - * should call journal_modify_block_abort(). - * - * If as part of a transaction you need want to throw out - * any previous copies of a block (because it got deleted) - * then call journal_kill_block(). This will mark it so - * that the journal does not play it back (effectively - * dropping it). - * - * journal_trim_add_extent() marks a range of bytes on the device which should - * be trimmed (invalidated, unmapped). journal_trim_remove_extent() marks a - * range of bytes which should no longer be trimmed. Accumulated extents - * will be trimmed when the transaction is flushed to the on-disk journal. - */ -int journal_start_transaction(journal *jnl); -int journal_modify_block_start(journal *jnl, struct buf *bp); -int journal_modify_block_abort(journal *jnl, struct buf *bp); -int journal_modify_block_end(journal *jnl, struct buf *bp, void (*func)(struct buf *bp, void *arg), void *arg); -int journal_kill_block(journal *jnl, struct buf *bp); -#ifdef BSD_KERNEL_PRIVATE -int journal_trim_add_extent(journal *jnl, uint64_t offset, uint64_t length); -int journal_trim_remove_extent(journal *jnl, uint64_t offset, uint64_t length); -void journal_trim_set_callback(journal *jnl, jnl_trim_callback_t callback, void *arg); -int journal_trim_extent_overlap (journal *jnl, uint64_t offset, uint64_t length, uint64_t *end); -/* Mark state in the journal that requests an immediate journal flush upon txn completion */ -int journal_request_immediate_flush (journal *jnl); -#endif -int journal_end_transaction(journal *jnl); - -int journal_active(journal *jnl); - -typedef enum journal_flush_options { - JOURNAL_WAIT_FOR_IO = 0x01, // Flush journal and metadata blocks, wait for 
async IO to complete. - JOURNAL_FLUSH_FULL = 0x02, // Flush track cache to media -} journal_flush_options_t; - -int journal_flush(journal *jnl, journal_flush_options_t options); -void *journal_owner(journal *jnl); // compare against current_thread() -int journal_uses_fua(journal *jnl); -void journal_lock(journal *jnl); -void journal_unlock(journal *jnl); - - -/* - * Relocate the journal. - * - * You provide the new starting offset and size for the journal. You may - * optionally provide a new tbuffer_size; passing zero defaults to not - * changing the tbuffer size except as needed to fit within the new journal - * size. - * - * You must have already started a transaction. The transaction may contain - * modified blocks (such as those needed to deallocate the old journal, - * allocate the new journal, and update the location and size of the journal - * in filesystem-private structures). Any transactions prior to the active - * transaction will be flushed to the old journal. The new journal will be - * initialized, and the blocks from the active transaction will be written to - * the new journal. The caller will need to update the structures that - * identify the location and size of the journal from the callback routine. - */ -int journal_relocate(journal *jnl, off_t offset, off_t journal_size, int32_t tbuffer_size, - errno_t (*callback)(void *), void *callback_arg); - -uint32_t journal_current_txn(journal *jnl); - -__END_DECLS - -#endif /* __APPLE_API_UNSTABLE */ -#endif /* !_SYS_VFS_JOURNAL_H_ */ diff --git a/bsd/vfs/vfs_lookup.c b/bsd/vfs/vfs_lookup.c index 09bd470a9..128a8ce04 100644 --- a/bsd/vfs/vfs_lookup.c +++ b/bsd/vfs/vfs_lookup.c @@ -990,6 +990,40 @@ lookup(struct nameidata *ndp) * .. in the other file system. */ if ( (cnp->cn_flags & ISDOTDOT) ) { + /* + * if this is a chroot'ed process, check if the current + * directory is still a subdirectory of the process's + * root directory. 
+ */ + if (ndp->ni_rootdir && (ndp->ni_rootdir != rootvnode) && + dp != ndp->ni_rootdir) { + int sdir_error; + int is_subdir = FALSE; + + sdir_error = vnode_issubdir(dp, ndp->ni_rootdir, + &is_subdir, vfs_context_kernel()); + + /* + * If we couldn't determine if dp is a subdirectory of + * ndp->ni_rootdir (sdir_error != 0), we let the request + * proceed. + */ + if (!sdir_error && !is_subdir) { + vnode_put(dp); + dp = ndp->ni_rootdir; + /* + * There's a ref on the process's root directory + * but we can't use vnode_getwithref here as + * there is nothing preventing that ref being + * released by another thread. + */ + if (vnode_get(dp)) { + error = ENOENT; + goto bad; + } + } + } + for (;;) { if (dp == ndp->ni_rootdir || dp == rootvnode) { ndp->ni_dvp = dp; @@ -1419,7 +1453,14 @@ lookup_handle_symlink(struct nameidata *ndp, vnode_t *new_dp, vfs_context_t ctx) int error; char *cp; /* pointer into pathname argument */ uio_t auio; - char uio_buf[ UIO_SIZEOF(1) ]; + union { + union { + struct user_iovec s_uiovec; + struct kern_iovec s_kiovec; + } u_iovec; + struct uio s_uio; + char uio_buf[ UIO_SIZEOF(1) ]; + } u_uio_buf; /* union only for aligning uio_buf correctly */ int need_newpathbuf; u_int linklen; struct componentname *cnp = &ndp->ni_cnd; @@ -1446,7 +1487,8 @@ lookup_handle_symlink(struct nameidata *ndp, vnode_t *new_dp, vfs_context_t ctx) } else { cp = cnp->cn_pnbuf; } - auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_READ, &uio_buf[0], sizeof(uio_buf)); + auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_READ, + &u_uio_buf.uio_buf[0], sizeof(u_uio_buf.uio_buf)); uio_addiov(auio, CAST_USER_ADDR_T(cp), MAXPATHLEN); @@ -1695,7 +1737,7 @@ kdebug_lookup_gen_events(long *dbg_parms, int dbg_namelen, void *dp, boolean_t l if (dbg_namelen <= (int)(3 * sizeof(long))) code |= DBG_FUNC_END; - KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, code, VM_KERNEL_ADDRPERM(dp), dbg_parms[0], dbg_parms[1], dbg_parms[2], 0); + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, code, 
kdebug_vnode(dp), dbg_parms[0], dbg_parms[1], dbg_parms[2], 0); code &= ~DBG_FUNC_START; diff --git a/bsd/vfs/vfs_subr.c b/bsd/vfs/vfs_subr.c index 4f31d45e7..2b79b659c 100644 --- a/bsd/vfs/vfs_subr.c +++ b/bsd/vfs/vfs_subr.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2015 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -76,7 +76,6 @@ * External virtual filesystem routines */ - #include #include #include @@ -112,7 +111,7 @@ #include #include - +#include #include #include @@ -1091,13 +1090,20 @@ vfs_mountroot(void) bdevvp_rootvp = rootvp; for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) { - if (vfsp->vfc_mountroot == NULL) + if (vfsp->vfc_mountroot == NULL + && !ISSET(vfsp->vfc_vfsflags, VFC_VFSCANMOUNTROOT)) { continue; + } mp = vfs_rootmountalloc_internal(vfsp, "root_device"); mp->mnt_devvp = rootvp; - if ((error = (*vfsp->vfc_mountroot)(mp, rootvp, ctx)) == 0) { + if (vfsp->vfc_mountroot) + error = (*vfsp->vfc_mountroot)(mp, rootvp, ctx); + else + error = VFS_MOUNT(mp, rootvp, 0, ctx); + + if (!error) { if ( bdevvp_rootvp != rootvp ) { /* * rootvp changed... @@ -1126,13 +1132,10 @@ vfs_mountroot(void) */ vfs_init_io_attributes(rootvp, mp); - if ((mp->mnt_ioflags & MNT_IOFLAGS_FUSION_DRIVE) && - (mp->mnt_ioflags & MNT_IOFLAGS_IOSCHED_SUPPORTED)) { - /* - * only for CF - */ + if (mp->mnt_ioflags & MNT_IOFLAGS_FUSION_DRIVE) { root_is_CF_drive = TRUE; } + /* * Shadow the VFC_VFSNATIVEXATTR flag to MNTK_EXTENDED_ATTRS. 
*/ @@ -1172,6 +1175,11 @@ vfs_mountroot(void) (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) { mp->mnt_kern_flag |= MNTK_PATH_FROM_ID; } + + if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) && + (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) { + mp->mnt_kern_flag |= MNTK_DIR_HARDLINKS; + } } /* @@ -1267,7 +1275,6 @@ vfs_getnewfsid(struct mount *mp) fsid_t tfsid; int mtype; - mount_t nmp; mount_list_lock(); @@ -1278,13 +1285,12 @@ vfs_getnewfsid(struct mount *mp) tfsid.val[0] = makedev(nblkdev + mtype, mntid_gen); tfsid.val[1] = mtype; - TAILQ_FOREACH(nmp, &mountlist, mnt_list) { - while (vfs_getvfs_locked(&tfsid)) { - if (++mntid_gen == 0) - mntid_gen++; - tfsid.val[0] = makedev(nblkdev + mtype, mntid_gen); - } + while (vfs_getvfs_locked(&tfsid)) { + if (++mntid_gen == 0) + mntid_gen++; + tfsid.val[0] = makedev(nblkdev + mtype, mntid_gen); } + mp->mnt_vfsstat.f_fsid.val[0] = tfsid.val[0]; mp->mnt_vfsstat.f_fsid.val[1] = tfsid.val[1]; mount_list_unlock(); @@ -2535,6 +2541,10 @@ vcount(vnode_t vp) int count; int vid; + if (!vnode_isspec(vp)) { + return (vp->v_usecount - vp->v_kusecount); + } + loop: if (!vnode_isaliased(vp)) return (vp->v_specinfo->si_opencount); @@ -2714,8 +2724,7 @@ set_package_extensions_table(user_addr_t data, int nentries, int maxwidth) } -__private_extern__ int -is_package_name(const char *name, int len) +int is_package_name(const char *name, int len) { int i, extlen; const char *ptr, *name_ext; @@ -2888,7 +2897,6 @@ is_bad_sysctl_name(struct vfstable *vfsp, int selector_name) case VFS_CTL_DISC: case VFS_CTL_SERVERINFO: return 1; - break; default: break; @@ -2908,7 +2916,6 @@ is_bad_sysctl_name(struct vfstable *vfsp, int selector_name) case AFPFS_VFS_CTL_NETCHANGE: case AFPFS_VFS_CTL_VOLCHANGE: return 1; - break; } } @@ -3114,6 +3121,7 @@ vfs_init_io_attributes(vnode_t devvp, mount_t mp) off_t readsegsize = 0; off_t 
writesegsize = 0; off_t alignment = 0; + u_int32_t minsaturationbytecount = 0; u_int32_t ioqueue_depth = 0; u_int32_t blksize; u_int64_t temp; @@ -3293,6 +3301,12 @@ vfs_init_io_attributes(vnode_t devvp, mount_t mp) if (features & DK_FEATURE_FORCE_UNIT_ACCESS) mp->mnt_ioflags |= MNT_IOFLAGS_FUA_SUPPORTED; + + if (VNOP_IOCTL(devvp, DKIOCGETIOMINSATURATIONBYTECOUNT, (caddr_t)&minsaturationbytecount, 0, ctx) == 0) { + mp->mnt_minsaturationbytecount = minsaturationbytecount; + } else { + mp->mnt_minsaturationbytecount = 0; + } if (VNOP_IOCTL(devvp, DKIOCCORESTORAGE, (caddr_t)&cs_info, 0, ctx) == 0) cs_present = TRUE; @@ -3586,10 +3600,12 @@ sysctl_vfs_ctlbyfsid(__unused struct sysctl_oid *oidp, void *arg1, int arg2, sfs.f_ffree = (user64_long_t)sp->f_ffree; sfs.f_fsid = sp->f_fsid; sfs.f_owner = sp->f_owner; - +#ifdef NFSCLIENT if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) { strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN); - } else { + } else +#endif + { strlcpy(sfs.f_fstypename, sp->f_fstypename, MFSNAMELEN); } strlcpy(sfs.f_mntonname, sp->f_mntonname, MNAMELEN); @@ -3644,10 +3660,13 @@ sysctl_vfs_ctlbyfsid(__unused struct sysctl_oid *oidp, void *arg1, int arg2, sfs.f_ffree = (user32_long_t)sp->f_ffree; sfs.f_fsid = sp->f_fsid; sfs.f_owner = sp->f_owner; - + +#ifdef NFS_CLIENT if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) { strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN); - } else { + } else +#endif + { strlcpy(sfs.f_fstypename, sp->f_fstypename, MFSNAMELEN); } strlcpy(sfs.f_mntonname, sp->f_mntonname, MNAMELEN); @@ -3669,20 +3688,27 @@ sysctl_vfs_ctlbyfsid(__unused struct sysctl_oid *oidp, void *arg1, int arg2, static int filt_fsattach(struct knote *kn); static void filt_fsdetach(struct knote *kn); static int filt_fsevent(struct knote *kn, long hint); +static int filt_fstouch(struct knote *kn, struct kevent_internal_s *kev); +static int filt_fsprocess(struct knote *kn, struct filt_process_s *data, struct 
kevent_internal_s *kev); struct filterops fs_filtops = { .f_attach = filt_fsattach, .f_detach = filt_fsdetach, .f_event = filt_fsevent, + .f_touch = filt_fstouch, + .f_process = filt_fsprocess, }; static int filt_fsattach(struct knote *kn) { - lck_mtx_lock(fs_klist_lock); - kn->kn_flags |= EV_CLEAR; KNOTE_ATTACH(&fs_klist, kn); lck_mtx_unlock(fs_klist_lock); + + /* + * filter only sees future events, + * so it can't be fired already. + */ return (0); } @@ -3709,6 +3735,52 @@ filt_fsevent(struct knote *kn, long hint) return (kn->kn_fflags != 0); } +static int +filt_fstouch(struct knote *kn, struct kevent_internal_s *kev) +{ + int res; + + lck_mtx_lock(fs_klist_lock); + + kn->kn_sfflags = kev->fflags; + if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0) + kn->kn_udata = kev->udata; + + /* + * the above filter function sets bits even if nobody is looking for them. + * Just preserve those bits even in the new mask is more selective + * than before. + * + * For compatibility with previous implementations, we leave kn_fflags + * as they were before. 
+ */ + //if (kn->kn_sfflags) + // kn->kn_fflags &= kn->kn_sfflags; + res = (kn->kn_fflags != 0); + + lck_mtx_unlock(fs_klist_lock); + + return res; +} + +static int +filt_fsprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev) +{ +#pragma unused(data) + int res; + + lck_mtx_lock(fs_klist_lock); + res = (kn->kn_fflags != 0); + if (res) { + *kev = kn->kn_kevent; + kn->kn_flags |= EV_CLEAR; /* automatic */ + kn->kn_fflags = 0; + kn->kn_data = 0; + } + lck_mtx_unlock(fs_klist_lock); + return res; +} + static int sysctl_vfs_noremotehang(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) @@ -3936,7 +4008,7 @@ process_vp(vnode_t vp, int want_vp, int *deferred) panic("new_vnode(%p): free vnode still referenced", vp); if ((vp->v_mntvnodes.tqe_prev != 0) && (vp->v_mntvnodes.tqe_next != 0)) panic("new_vnode(%p): vnode seems to be on mount list", vp); - if ( !LIST_EMPTY(&vp->v_nclinks) || !LIST_EMPTY(&vp->v_ncchildren)) + if ( !LIST_EMPTY(&vp->v_nclinks) || !TAILQ_EMPTY(&vp->v_ncchildren)) panic("new_vnode(%p): vnode still hooked into the name cache", vp); } else { vnode_unlock(vp); @@ -3946,8 +4018,7 @@ process_vp(vnode_t vp, int want_vp, int *deferred) return (vp); } - - +__attribute__((noreturn)) static void async_work_continue(void) { @@ -4025,6 +4096,8 @@ new_vnode(vnode_t *vpp) VLISTNONE(vp); /* avoid double queue removal */ lck_mtx_init(&vp->v_lock, vnode_lck_grp, vnode_lck_attr); + TAILQ_INIT(&vp->v_ncchildren); + klist_init(&vp->v_knotes); nanouptime(&ts); vp->v_id = ts.tv_nsec; @@ -4436,6 +4509,15 @@ vnode_isinuse(vnode_t vp, int refcnt) return(vnode_isinuse_locked(vp, refcnt, 0)); } +int vnode_usecount(vnode_t vp) +{ + return vp->v_usecount; +} + +int vnode_iocount(vnode_t vp) +{ + return vp->v_iocount; +} static int vnode_isinuse_locked(vnode_t vp, int refcnt, int locked) @@ -5011,6 +5093,51 @@ vnode_create_internal(uint32_t flavor, uint32_t size, void *data, vnode_t *vpp, */ vp->v_flag 
|= VRAGE; } + +#if CONFIG_SECLUDED_MEMORY + switch (secluded_for_filecache) { + case 0: + /* + * secluded_for_filecache == 0: + * + no file contents in secluded pool + */ + break; + case 1: + /* + * secluded_for_filecache == 1: + * + no files from / + * + files from /Applications/ are OK + * + files from /Applications/Camera are not OK + * + no files that are open for write + */ + if (vnode_vtype(vp) == VREG && + vnode_mount(vp) != NULL && + (! (vfs_flags(vnode_mount(vp)) & MNT_ROOTFS))) { + /* not from root filesystem: eligible for secluded pages */ + memory_object_mark_eligible_for_secluded( + ubc_getobject(vp, UBC_FLAGS_NONE), + TRUE); + } + break; + case 2: + /* + * secluded_for_filecache == 2: + * + all read-only files OK, except: + * + dyld_shared_cache_arm64* + * + Camera + * + mediaserverd + */ + if (vnode_vtype(vp) == VREG) { + memory_object_mark_eligible_for_secluded( + ubc_getobject(vp, UBC_FLAGS_NONE), + TRUE); + } + break; + default: + break; + } +#endif /* CONFIG_SECLUDED_MEMORY */ + return (0); error_out: @@ -5584,7 +5711,6 @@ vn_create(vnode_t dvp, vnode_t *vpp, struct nameidata *ndp, struct vnode_attr *v boolean_t batched; struct componentname *cnp; uint32_t defaulted; - uint32_t dfflags; // Directory file flags cnp = &ndp->ni_cnd; error = 0; @@ -5611,14 +5737,6 @@ vn_create(vnode_t dvp, vnode_t *vpp, struct nameidata *ndp, struct vnode_attr *v panic("Mode for open, but not trying to open..."); } - /* - * Handle inheritance of restricted flag - */ - error = vnode_flags(dvp, &dfflags, ctx); - if (error) - return error; - if (dfflags & SF_RESTRICTED) - VATTR_SET(vap, va_flags, SF_RESTRICTED); /* * Create the requested node. @@ -5813,7 +5931,7 @@ vn_authorize_unlink(vnode_t dvp, vnode_t vp, struct componentname *cnp, vfs_cont * However, some file systems may have limited support. 
*/ if ((vp->v_type == VDIR) && - !(vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSDIRLINKS)) { + !(vp->v_mount->mnt_kern_flag & MNTK_DIR_HARDLINKS)) { return (EPERM); /* POSIX */ } @@ -5955,13 +6073,22 @@ vn_authorize_create(vnode_t dvp, struct componentname *cnp, struct vnode_attr *v return (vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)); } -int +int vn_authorize_rename(struct vnode *fdvp, struct vnode *fvp, struct componentname *fcnp, - struct vnode *tdvp, struct vnode *tvp, struct componentname *tcnp, - vfs_context_t ctx, void *reserved) + struct vnode *tdvp, struct vnode *tvp, struct componentname *tcnp, + vfs_context_t ctx, void *reserved) +{ + return vn_authorize_renamex(fdvp, fvp, fcnp, tdvp, tvp, tcnp, ctx, 0, reserved); +} + +int +vn_authorize_renamex(struct vnode *fdvp, struct vnode *fvp, struct componentname *fcnp, + struct vnode *tdvp, struct vnode *tvp, struct componentname *tcnp, + vfs_context_t ctx, vfs_rename_flags_t flags, void *reserved) { int error = 0; int moving = 0; + bool swap = flags & VFS_RENAME_SWAP; if (reserved != NULL) { panic("Passed something other than NULL as reserved field!"); @@ -5992,18 +6119,34 @@ vn_authorize_rename(struct vnode *fdvp, struct vnode *fvp, struct componentnam error = mac_vnode_check_rename(ctx, fdvp, fvp, fcnp, tdvp, tvp, tcnp); if (error) goto out; + if (swap) { + error = mac_vnode_check_rename(ctx, tdvp, tvp, tcnp, fdvp, fvp, fcnp); + if (error) + goto out; + } #endif /***** *****/ /***** *****/ if (tvp != NULL) { - if (fvp->v_type == VDIR && tvp->v_type != VDIR) { - error = ENOTDIR; - goto out; - } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) { - error = EISDIR; - goto out; + if (!swap) { + if (fvp->v_type == VDIR && tvp->v_type != VDIR) { + error = ENOTDIR; + goto out; + } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) { + error = EISDIR; + goto out; + } } + } else if (swap) { + /* + * Caller should have already checked this and returned + * ENOENT. 
If we send back ENOENT here, caller will retry + * which isn't what we want so we send back EINVAL here + * instead. + */ + error = EINVAL; + goto out; } if (fvp == tdvp) { @@ -6030,51 +6173,88 @@ vn_authorize_rename(struct vnode *fdvp, struct vnode *fvp, struct componentnam error = EINVAL; goto out; } + + if (swap && fdvp->v_parent == tvp) { + error = EINVAL; + goto out; + } /***** *****/ /***** *****/ - error = 0; - if ((tvp != NULL) && vnode_isdir(tvp)) { - if (tvp != fdvp) - moving = 1; - } else if (tdvp != fdvp) { - moving = 1; - } - + if (swap) { + kauth_action_t f = 0, t = 0; - /* - * must have delete rights to remove the old name even in - * the simple case of fdvp == tdvp. - * - * If fvp is a directory, and we are changing it's parent, - * then we also need rights to rewrite its ".." entry as well. - */ - if (vnode_isdir(fvp)) { - if ((error = vnode_authorize(fvp, fdvp, KAUTH_VNODE_DELETE | KAUTH_VNODE_ADD_SUBDIRECTORY, ctx)) != 0) - goto out; - } else { - if ((error = vnode_authorize(fvp, fdvp, KAUTH_VNODE_DELETE, ctx)) != 0) + /* + * Directories changing parents need ...ADD_SUBDIR... to + * permit changing ".." + */ + if (fdvp != tdvp) { + if (vnode_isdir(fvp)) + f = KAUTH_VNODE_ADD_SUBDIRECTORY; + if (vnode_isdir(tvp)) + t = KAUTH_VNODE_ADD_SUBDIRECTORY; + } + error = vnode_authorize(fvp, fdvp, KAUTH_VNODE_DELETE | f, ctx); + if (error) goto out; - } - if (moving) { - /* moving into tdvp or tvp, must have rights to add */ - if ((error = vnode_authorize(((tvp != NULL) && vnode_isdir(tvp)) ? tvp : tdvp, - NULL, - vnode_isdir(fvp) ? KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE, - ctx)) != 0) { + error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE | t, ctx); + if (error) goto out; + f = vnode_isdir(fvp) ? KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE; + t = vnode_isdir(tvp) ? 
KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE; + if (fdvp == tdvp) + error = vnode_authorize(fdvp, NULL, f | t, ctx); + else { + error = vnode_authorize(fdvp, NULL, t, ctx); + if (error) + goto out; + error = vnode_authorize(tdvp, NULL, f, ctx); } + if (error) + goto out; } else { - /* node staying in same directory, must be allowed to add new name */ - if ((error = vnode_authorize(fdvp, NULL, - vnode_isdir(fvp) ? KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE, ctx)) != 0) + error = 0; + if ((tvp != NULL) && vnode_isdir(tvp)) { + if (tvp != fdvp) + moving = 1; + } else if (tdvp != fdvp) { + moving = 1; + } + + /* + * must have delete rights to remove the old name even in + * the simple case of fdvp == tdvp. + * + * If fvp is a directory, and we are changing it's parent, + * then we also need rights to rewrite its ".." entry as well. + */ + if (vnode_isdir(fvp)) { + if ((error = vnode_authorize(fvp, fdvp, KAUTH_VNODE_DELETE | KAUTH_VNODE_ADD_SUBDIRECTORY, ctx)) != 0) + goto out; + } else { + if ((error = vnode_authorize(fvp, fdvp, KAUTH_VNODE_DELETE, ctx)) != 0) + goto out; + } + if (moving) { + /* moving into tdvp or tvp, must have rights to add */ + if ((error = vnode_authorize(((tvp != NULL) && vnode_isdir(tvp)) ? tvp : tdvp, + NULL, + vnode_isdir(fvp) ? KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE, + ctx)) != 0) { + goto out; + } + } else { + /* node staying in same directory, must be allowed to add new name */ + if ((error = vnode_authorize(fdvp, NULL, + vnode_isdir(fvp) ? 
KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE, ctx)) != 0) + goto out; + } + /* overwriting tvp */ + if ((tvp != NULL) && !vnode_isdir(tvp) && + ((error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE, ctx)) != 0)) { goto out; - } - /* overwriting tvp */ - if ((tvp != NULL) && !vnode_isdir(tvp) && - ((error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE, ctx)) != 0)) { - goto out; + } } /***** *****/ @@ -6615,36 +6795,47 @@ vnode_authorize_posix(vauth_ctx vcp, int action, int on_dir) * - Neither the node nor the directory are immutable. * - The user is not the superuser. * - * Deletion is not permitted if the directory is sticky and the caller is - * not owner of the node or directory. + * The precedence of factors for authorizing or denying delete for a credential + * + * 1) Explicit ACE on the node. (allow or deny DELETE) + * 2) Explicit ACE on the directory (allow or deny DELETE_CHILD). * - * If either the node grants DELETE, or the directory grants DELETE_CHILD, - * the node may be deleted. If neither denies the permission, and the - * caller has Posix write access to the directory, then the node may be - * deleted. + * If there are conflicting ACEs on the node and the directory, the node + * ACE wins. + * + * 3) Sticky bit on the directory. + * Deletion is not permitted if the directory is sticky and the caller is + * not owner of the node or directory. The sticky bit rules are like a deny + * delete ACE except lower in priority than ACL's either allowing or denying + * delete. + * + * 4) POSIX permisions on the directory. * * As an optimization, we cache whether or not delete child is permitted - * on directories without the sticky bit set. + * on directories. This enables us to skip directory ACL and POSIX checks + * as we already have the result from those checks. However, we always check the + * node ACL and, if the directory has the sticky bit set, we always check its + * ACL (even for a directory with an authorized delete child). 
Furthermore, + * caching the delete child authorization is independent of the sticky bit + * being set as it is only applicable in determining whether the node can be + * deleted or not. */ -int -vnode_authorize_delete(vauth_ctx vcp, boolean_t cached_delete_child); -/*static*/ int +static int vnode_authorize_delete(vauth_ctx vcp, boolean_t cached_delete_child) { struct vnode_attr *vap = vcp->vap; struct vnode_attr *dvap = vcp->dvap; kauth_cred_t cred = vcp->ctx->vc_ucred; struct kauth_acl_eval eval; - int error, delete_denied, delete_child_denied, ismember; + int error, ismember; - /* check the ACL on the directory */ - delete_child_denied = 0; - if (!cached_delete_child && VATTR_IS_NOT(dvap, va_acl, NULL)) { - eval.ae_requested = KAUTH_VNODE_DELETE_CHILD; - eval.ae_acl = &dvap->va_acl->acl_ace[0]; - eval.ae_count = dvap->va_acl->acl_entrycount; + /* Check the ACL on the node first */ + if (VATTR_IS_NOT(vap, va_acl, NULL)) { + eval.ae_requested = KAUTH_VNODE_DELETE; + eval.ae_acl = &vap->va_acl->acl_ace[0]; + eval.ae_count = vap->va_acl->acl_entrycount; eval.ae_options = 0; - if (vauth_dir_owner(vcp)) + if (vauth_file_owner(vcp)) eval.ae_options |= KAUTH_AEVAL_IS_OWNER; /* * We use ENOENT as a marker to indicate we could not get @@ -6652,8 +6843,8 @@ vnode_authorize_delete(vauth_ctx vcp, boolean_t cached_delete_child) * have the ACL evaluation answer. Previously, we would * always deny the operation at this point. 
*/ - if ((error = vauth_dir_ingroup(vcp, &ismember, ENOENT)) != 0 && error != ENOENT) - return(error); + if ((error = vauth_file_ingroup(vcp, &ismember, ENOENT)) != 0 && error != ENOENT) + return (error); if (error == ENOENT) eval.ae_options |= KAUTH_AEVAL_IN_GROUP_UNKNOWN; else if (ismember) @@ -6663,40 +6854,48 @@ vnode_authorize_delete(vauth_ctx vcp, boolean_t cached_delete_child) eval.ae_exp_gwrite = KAUTH_VNODE_GENERIC_WRITE_BITS; eval.ae_exp_gexec = KAUTH_VNODE_GENERIC_EXECUTE_BITS; - /* - * If there is no entry, we are going to defer to other - * authorization mechanisms. - */ - error = kauth_acl_evaluate(cred, &eval); - - if (error != 0) { + if ((error = kauth_acl_evaluate(cred, &eval)) != 0) { KAUTH_DEBUG("%p ERROR during ACL processing - %d", vcp->vp, error); - return(error); + return (error); } + switch(eval.ae_result) { case KAUTH_RESULT_DENY: - delete_child_denied = 1; - break; - /* FALLSTHROUGH */ - case KAUTH_RESULT_ALLOW: - KAUTH_DEBUG("%p ALLOWED - granted by directory ACL", vcp->vp); - return(0); + KAUTH_DEBUG("%p DENIED - denied by ACL", vcp->vp); + return (EACCES); + case KAUTH_RESULT_ALLOW: + KAUTH_DEBUG("%p ALLOWED - granted by ACL", vcp->vp); + return (0); case KAUTH_RESULT_DEFER: default: - /* Effectively the same as !delete_child_denied */ - KAUTH_DEBUG("%p DEFERRED - directory ACL", vcp->vp); + /* Defer to directory */ + KAUTH_DEBUG("%p DEFERRED - by file ACL", vcp->vp); break; } } - /* check the ACL on the node */ - delete_denied = 0; - if (VATTR_IS_NOT(vap, va_acl, NULL)) { - eval.ae_requested = KAUTH_VNODE_DELETE; - eval.ae_acl = &vap->va_acl->acl_ace[0]; - eval.ae_count = vap->va_acl->acl_entrycount; + /* + * Without a sticky bit, a previously authorized delete child is + * sufficient to authorize this delete. + * + * If the sticky bit is set, a directory ACL which allows delete child + * overrides a (potential) sticky bit deny. 
The authorized delete child + * cannot tell us if it was authorized because of an explicit delete + * child allow ACE or because of POSIX permisions so we have to check + * the directory ACL everytime if the directory has a sticky bit. + */ + if (!(dvap->va_mode & S_ISTXT) && cached_delete_child) { + KAUTH_DEBUG("%p ALLOWED - granted by directory ACL or POSIX permissions and no sticky bit on directory", vcp->vp); + return (0); + } + + /* check the ACL on the directory */ + if (VATTR_IS_NOT(dvap, va_acl, NULL)) { + eval.ae_requested = KAUTH_VNODE_DELETE_CHILD; + eval.ae_acl = &dvap->va_acl->acl_ace[0]; + eval.ae_count = dvap->va_acl->acl_entrycount; eval.ae_options = 0; - if (vauth_file_owner(vcp)) + if (vauth_dir_owner(vcp)) eval.ae_options |= KAUTH_AEVAL_IS_OWNER; /* * We use ENOENT as a marker to indicate we could not get @@ -6704,7 +6903,7 @@ vnode_authorize_delete(vauth_ctx vcp, boolean_t cached_delete_child) * have the ACL evaluation answer. Previously, we would * always deny the operation at this point. */ - if ((error = vauth_file_ingroup(vcp, &ismember, ENOENT)) != 0 && error != ENOENT) + if ((error = vauth_dir_ingroup(vcp, &ismember, ENOENT)) != 0 && error != ENOENT) return(error); if (error == ENOENT) eval.ae_options |= KAUTH_AEVAL_IN_GROUP_UNKNOWN; @@ -6715,47 +6914,64 @@ vnode_authorize_delete(vauth_ctx vcp, boolean_t cached_delete_child) eval.ae_exp_gwrite = KAUTH_VNODE_GENERIC_WRITE_BITS; eval.ae_exp_gexec = KAUTH_VNODE_GENERIC_EXECUTE_BITS; - if ((error = kauth_acl_evaluate(cred, &eval)) != 0) { + /* + * If there is no entry, we are going to defer to other + * authorization mechanisms. 
+ */ + error = kauth_acl_evaluate(cred, &eval); + + if (error != 0) { KAUTH_DEBUG("%p ERROR during ACL processing - %d", vcp->vp, error); - return(error); + return (error); } - switch(eval.ae_result) { case KAUTH_RESULT_DENY: - delete_denied = 1; - break; + KAUTH_DEBUG("%p DENIED - denied by directory ACL", vcp->vp); + return (EACCES); case KAUTH_RESULT_ALLOW: - KAUTH_DEBUG("%p ALLOWED - granted by file ACL", vcp->vp); - return(0); + KAUTH_DEBUG("%p ALLOWED - granted by directory ACL", vcp->vp); + if (!cached_delete_child && vcp->dvp) { + vnode_cache_authorized_action(vcp->dvp, + vcp->ctx, KAUTH_VNODE_DELETE_CHILD); + } + return (0); case KAUTH_RESULT_DEFER: default: - /* Effectively the same as !delete_child_denied */ - KAUTH_DEBUG("%p DEFERRED%s - by file ACL", vcp->vp, delete_denied ? "(DENY)" : ""); + /* Deferred by directory ACL */ + KAUTH_DEBUG("%p DEFERRED - directory ACL", vcp->vp); break; } } - /* if denied by ACL on directory or node, return denial */ - if (delete_denied || delete_child_denied) { - KAUTH_DEBUG("%p DENIED - denied by ACL", vcp->vp); - return(EACCES); + /* + * From this point, we can't explicitly allow and if we reach the end + * of the function without a denial, then the delete is authorized. + */ + if (!cached_delete_child) { + if (vnode_authorize_posix(vcp, VWRITE, 1 /* on_dir */) != 0) { + KAUTH_DEBUG("%p DENIED - denied by posix permisssions", vcp->vp); + return (EACCES); + } + /* + * Cache the authorized action on the vnode if allowed by the + * directory ACL or POSIX permissions. It is correct to cache + * this action even if sticky bit would deny deleting the node. 
+ */ + if (vcp->dvp) { + vnode_cache_authorized_action(vcp->dvp, vcp->ctx, + KAUTH_VNODE_DELETE_CHILD); + } } /* enforce sticky bit behaviour */ if ((dvap->va_mode & S_ISTXT) && !vauth_file_owner(vcp) && !vauth_dir_owner(vcp)) { KAUTH_DEBUG("%p DENIED - sticky bit rules (user %d file %d dir %d)", vcp->vp, cred->cr_posix.cr_uid, vap->va_uid, dvap->va_uid); - return(EACCES); - } - - /* check the directory */ - if (!cached_delete_child && (error = vnode_authorize_posix(vcp, VWRITE, 1 /* on_dir */)) != 0) { - KAUTH_DEBUG("%p DENIED - denied by posix permisssions", vcp->vp); - return(error); + return (EACCES); } /* not denied, must be OK */ - return(0); + return (0); } @@ -7217,6 +7433,7 @@ vnode_authorize_callback_int(__unused kauth_cred_t unused_cred, __unused void *i boolean_t parent_authorized_for_delete_child = FALSE; boolean_t found_deny = FALSE; boolean_t parent_ref= FALSE; + boolean_t is_suser = FALSE; vcp = &auth_context; ctx = vcp->ctx = (vfs_context_t)arg0; @@ -7315,33 +7532,8 @@ vnode_authorize_callback_int(__unused kauth_cred_t unused_cred, __unused void *i goto out; /* - * Get vnode attributes and extended security information for the vnode - * and directory if required. - */ - VATTR_WANTED(&va, va_mode); - VATTR_WANTED(&va, va_uid); - VATTR_WANTED(&va, va_gid); - VATTR_WANTED(&va, va_flags); - VATTR_WANTED(&va, va_acl); - if ((result = vnode_getattr(vp, &va, ctx)) != 0) { - KAUTH_DEBUG("%p ERROR - failed to get vnode attributes - %d", vp, result); - goto out; - } - if (dvp) { - VATTR_WANTED(&dva, va_mode); - VATTR_WANTED(&dva, va_uid); - VATTR_WANTED(&dva, va_gid); - VATTR_WANTED(&dva, va_flags); - VATTR_WANTED(&dva, va_acl); - if ((result = vnode_getattr(dvp, &dva, ctx)) != 0) { - KAUTH_DEBUG("%p ERROR - failed to get directory vnode attributes - %d", vp, result); - goto out; - } - } - - /* - * If the vnode is an extended attribute data vnode (eg. a resource fork), *_DATA becomes - * *_EXTATTRIBUTES. 
+ * If the vnode is a namedstream (extended attribute) data vnode (eg. + * a resource fork), *_DATA becomes *_EXTATTRIBUTES. */ if (vnode_isnamedstream(vp)) { if (rights & KAUTH_VNODE_READ_DATA) { @@ -7352,26 +7544,58 @@ vnode_authorize_callback_int(__unused kauth_cred_t unused_cred, __unused void *i rights &= ~KAUTH_VNODE_WRITE_DATA; rights |= KAUTH_VNODE_WRITE_EXTATTRIBUTES; } + + /* + * Point 'vp' to the namedstream's parent for ACL checking + */ + if ((vp->v_parent != NULL) && + (vget_internal(vp->v_parent, 0, VNODE_NODEAD | VNODE_DRAINO) == 0)) { + parent_ref = TRUE; + vcp->vp = vp = vp->v_parent; + } + } + + if (vfs_context_issuser(ctx)) { + /* + * if we're not asking for execute permissions or modifications, + * then we're done, this action is authorized. + */ + if (!(rights & (KAUTH_VNODE_EXECUTE | KAUTH_VNODE_WRITE_RIGHTS))) + goto success; + + is_suser = TRUE; } /* - * Point 'vp' to the resource fork's parent for ACL checking + * Get vnode attributes and extended security information for the vnode + * and directory if required. + * + * If we're root we only want mode bits and flags for checking + * execute and immutability. 
*/ - if (vnode_isnamedstream(vp) && - (vp->v_parent != NULL) && - (vget_internal(vp->v_parent, 0, VNODE_NODEAD | VNODE_DRAINO) == 0)) { - parent_ref = TRUE; - vcp->vp = vp = vp->v_parent; - if (VATTR_IS_SUPPORTED(&va, va_acl) && (va.va_acl != NULL)) - kauth_acl_free(va.va_acl); - VATTR_INIT(&va); - VATTR_WANTED(&va, va_mode); + VATTR_WANTED(&va, va_mode); + VATTR_WANTED(&va, va_flags); + if (!is_suser) { VATTR_WANTED(&va, va_uid); VATTR_WANTED(&va, va_gid); - VATTR_WANTED(&va, va_flags); VATTR_WANTED(&va, va_acl); - if ((result = vnode_getattr(vp, &va, ctx)) != 0) + } + if ((result = vnode_getattr(vp, &va, ctx)) != 0) { + KAUTH_DEBUG("%p ERROR - failed to get vnode attributes - %d", vp, result); + goto out; + } + if (dvp) { + VATTR_WANTED(&dva, va_mode); + VATTR_WANTED(&dva, va_flags); + if (!is_suser) { + VATTR_WANTED(&dva, va_uid); + VATTR_WANTED(&dva, va_gid); + VATTR_WANTED(&dva, va_acl); + } + if ((result = vnode_getattr(dvp, &dva, ctx)) != 0) { + KAUTH_DEBUG("%p ERROR - failed to get directory vnode attributes - %d", vp, result); goto out; + } } /* @@ -7400,7 +7624,7 @@ vnode_authorize_callback_int(__unused kauth_cred_t unused_cred, __unused void *i * note that even if parent_authorized_for_delete_child is TRUE, we * need to check on the node itself. */ - if (!vfs_context_issuser(ctx)) { + if (!is_suser) { /* process delete rights */ if ((rights & KAUTH_VNODE_DELETE) && ((result = vnode_authorize_delete(vcp, parent_authorized_for_delete_child)) != 0)) @@ -7461,24 +7685,10 @@ vnode_authorize_callback_int(__unused kauth_cred_t unused_cred, __unused void *i vnode_cache_authorized_action(vp, ctx, KAUTH_VNODE_SEARCHBYANYONE); } } - if ((rights & KAUTH_VNODE_DELETE) && parent_authorized_for_delete_child == FALSE) { - /* - * parent was successfully and newly authorized for content deletions - * add it to the cache, but only if it doesn't have the sticky - * bit set on it. 
This same check is done earlier guarding - * fetching of dva, and if we jumped to out without having done - * this, we will have returned already because of a non-zero - * 'result' value. - */ - if (VATTR_IS_SUPPORTED(&dva, va_mode) && - !(dva.va_mode & (S_ISVTX))) { - /* OK to cache delete rights */ - KAUTH_DEBUG("%p - caching DELETE_CHILD rights", dvp); - vnode_cache_authorized_action(dvp, ctx, KAUTH_VNODE_DELETE_CHILD); - } - } +success: if (parent_ref) vnode_put(vp); + /* * Note that this implies that we will allow requests for no rights, as well as * for rights that we do not recognise. There should be none of these. @@ -7501,10 +7711,11 @@ static int vnode_authattr_new_internal(vnode_t dvp, struct vnode_attr *vap, int noauth, uint32_t *defaulted_fieldsp, vfs_context_t ctx) { int error; - int has_priv_suser, ismember, defaulted_owner, defaulted_group, defaulted_mode; + int has_priv_suser, ismember, defaulted_owner, defaulted_group, defaulted_mode, inherit_restricted; kauth_cred_t cred; guid_t changer; mount_t dmp; + struct vnode_attr dva; error = 0; @@ -7514,6 +7725,8 @@ vnode_authattr_new_internal(vnode_t dvp, struct vnode_attr *vap, int noauth, uin defaulted_owner = defaulted_group = defaulted_mode = 0; + inherit_restricted = 0; + /* * Require that the filesystem support extended security to apply any. */ @@ -7543,6 +7756,16 @@ vnode_authattr_new_internal(vnode_t dvp, struct vnode_attr *vap, int noauth, uin } } + /* + * We need the dvp's va_flags and *may* need the gid of the directory, + * we ask for both here. + */ + VATTR_INIT(&dva); + VATTR_WANTED(&dva, va_gid); + VATTR_WANTED(&dva, va_flags); + if ((error = vnode_getattr(dvp, &dva, ctx)) != 0) + goto out; + /* * If the filesystem is mounted IGNORE_OWNERSHIP and an explicit grouo is set, that * group takes ownership of all new files. 
@@ -7553,11 +7776,6 @@ vnode_authattr_new_internal(vnode_t dvp, struct vnode_attr *vap, int noauth, uin } else { if (!VATTR_IS_ACTIVE(vap, va_gid)) { /* default group comes from parent object, fallback to current user */ - struct vnode_attr dva; - VATTR_INIT(&dva); - VATTR_WANTED(&dva, va_gid); - if ((error = vnode_getattr(dvp, &dva, ctx)) != 0) - goto out; if (VATTR_IS_SUPPORTED(&dva, va_gid)) { VATTR_SET(vap, va_gid, dva.va_gid); } else { @@ -7569,7 +7787,14 @@ vnode_authattr_new_internal(vnode_t dvp, struct vnode_attr *vap, int noauth, uin if (!VATTR_IS_ACTIVE(vap, va_flags)) VATTR_SET(vap, va_flags, 0); - + + /* Determine if SF_RESTRICTED should be inherited from the parent + * directory. */ + if (VATTR_IS_SUPPORTED(&dva, va_flags) && + (dva.va_flags & SF_RESTRICTED)) { + inherit_restricted = 1; + } + /* default mode is everything, masked with current umask */ if (!VATTR_IS_ACTIVE(vap, va_mode)) { VATTR_SET(vap, va_mode, ACCESSPERMS & ~vfs_context_proc(ctx)->p_fd->fd_cmask); @@ -7694,6 +7919,12 @@ vnode_authattr_new_internal(vnode_t dvp, struct vnode_attr *vap, int noauth, uin } } out: + if (inherit_restricted) { + /* Apply SF_RESTRICTED to the file if its parent directory was + * restricted. This is done at the end so that root is not + * required if this flag is only set due to inheritance. 
*/ + VATTR_SET(vap, va_flags, (vap->va_flags | SF_RESTRICTED)); + } if (defaulted_fieldsp) { if (defaulted_mode) { *defaulted_fieldsp |= VATTR_PREPARE_DEFAULTED_MODE; @@ -8172,19 +8403,33 @@ vnode_authattr(vnode_t vp, struct vnode_attr *vap, kauth_action_t *actionp, vfs_ required_action |= KAUTH_VNODE_WRITE_SECURITY; } - /* clear set-uid and set-gid bits as required by Posix */ - if (VATTR_IS_ACTIVE(vap, va_mode)) { - newmode = vap->va_mode; - } else if (VATTR_IS_SUPPORTED(&ova, va_mode)) { - newmode = ova.va_mode; - } else { - KAUTH_DEBUG("CHOWN - trying to change owner but cannot get mode from filesystem to mask setugid bits"); - newmode = 0; - } - if (newmode & (S_ISUID | S_ISGID)) { - VATTR_SET(vap, va_mode, newmode & ~(S_ISUID | S_ISGID)); - KAUTH_DEBUG("CHOWN - masking setugid bits from mode %o to %o", newmode, vap->va_mode); + } + + /* + * clear set-uid and set-gid bits. POSIX only requires this for + * non-privileged processes but we do it even for root. + */ + if (VATTR_IS_ACTIVE(vap, va_mode)) { + newmode = vap->va_mode; + } else if (VATTR_IS_SUPPORTED(&ova, va_mode)) { + newmode = ova.va_mode; + } else { + KAUTH_DEBUG("CHOWN - trying to change owner but cannot get mode from filesystem to mask setugid bits"); + newmode = 0; + } + + /* chown always clears setuid/gid bits. An exception is made for + * setattrlist executed by a root process to set on a file: + * setattrlist is allowed to set the new mode on the file and change (chown) + * uid/gid. + */ + if (newmode & (S_ISUID | S_ISGID)) { + if (!VATTR_IS_ACTIVE(vap, va_mode) || !has_priv_suser) { + KAUTH_DEBUG("CHOWN - masking setugid bits from mode %o to %o", + newmode, newmode & ~(S_ISUID | S_ISGID)); + newmode &= ~(S_ISUID | S_ISGID); } + VATTR_SET(vap, va_mode, newmode); } } @@ -8551,6 +8796,7 @@ lock_vnode_and_post(vnode_t vp, int kevent_num) } void panic_print_vnodes(void); + /* define PANIC_PRINTS_VNODES only if investigation is required. 
*/ #ifdef PANIC_PRINTS_VNODES @@ -8658,7 +8904,21 @@ void panic_print_vnodes(void) * iterate all vnodelist items in all mounts (mntlist) -> mnt_vnodelist */ TAILQ_FOREACH(mnt, &mountlist, mnt_list) { + + if (!ml_validate_nofault((vm_offset_t)mnt, sizeof(mount_t))) { + kdb_printf("Unable to iterate the mount list %p - encountered an invalid mount pointer %p \n", + &mountlist, mnt); + break; + } + TAILQ_FOREACH(vp, &mnt->mnt_vnodelist, v_mntvnodes) { + + if (!ml_validate_nofault((vm_offset_t)vp, sizeof(vnode_t))) { + kdb_printf("Unable to iterate the vnode list %p - encountered an invalid vnode pointer %p \n", + &mnt->mnt_vnodelist, vp); + break; + } + if (++nvnodes > SANE_VNODE_PRINT_LIMIT) return; type = __vtype(vp->v_type); @@ -9302,3 +9562,81 @@ vfs_addtrigger(mount_t mp, const char *relpath, struct vnode_trigger_info *vtip, } #endif /* CONFIG_TRIGGERS */ + +vm_offset_t kdebug_vnode(vnode_t vp) +{ + return VM_KERNEL_ADDRPERM(vp); +} + +static int flush_cache_on_write = 0; +SYSCTL_INT (_kern, OID_AUTO, flush_cache_on_write, + CTLFLAG_RW | CTLFLAG_LOCKED, &flush_cache_on_write, 0, + "always flush the drive cache on writes to uncached files"); + +int vnode_should_flush_after_write(vnode_t vp, int ioflag) +{ + return (flush_cache_on_write + && (ISSET(ioflag, IO_NOCACHE) || vnode_isnocache(vp))); +} + +/* + * sysctl for use by disk I/O tracing tools to get the list of existing + * vnodes' paths + */ + +struct vnode_trace_paths_context { + uint64_t count; + long path[MAXPATHLEN / sizeof (long) + 1]; /* + 1 in case sizeof (long) does not divide MAXPATHLEN */ +}; + +static int vnode_trace_path_callback(struct vnode *vp, void *arg) { + int len, rv; + struct vnode_trace_paths_context *ctx; + + ctx = arg; + + len = sizeof (ctx->path); + rv = vn_getpath(vp, (char *)ctx->path, &len); + /* vn_getpath() NUL-terminates, and len includes the NUL */ + + if (!rv) { + kdebug_lookup_gen_events(ctx->path, len, vp, TRUE); + + if (++(ctx->count) == 1000) { + 
thread_yield_to_preemption(); + ctx->count = 0; + } + } + + return VNODE_RETURNED; +} + +static int vfs_trace_paths_callback(mount_t mp, void *arg) { + if (mp->mnt_flag & MNT_LOCAL) + vnode_iterate(mp, VNODE_ITERATE_ALL, vnode_trace_path_callback, arg); + + return VFS_RETURNED; +} + +static int sysctl_vfs_trace_paths SYSCTL_HANDLER_ARGS { + struct vnode_trace_paths_context ctx; + + (void)oidp; + (void)arg1; + (void)arg2; + (void)req; + + if (!kauth_cred_issuser(kauth_cred_get())) + return EPERM; + + if (!kdebug_enable || !kdebug_debugid_enabled(VFS_LOOKUP)) + return EINVAL; + + bzero(&ctx, sizeof (struct vnode_trace_paths_context)); + + vfs_iterate(0, vfs_trace_paths_callback, &ctx); + + return 0; +} + +SYSCTL_PROC(_vfs_generic, OID_AUTO, trace_paths, CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED, NULL, 0, &sysctl_vfs_trace_paths, "-", "trace_paths"); diff --git a/bsd/vfs/vfs_syscalls.c b/bsd/vfs/vfs_syscalls.c index 0ed08b06f..04b382fb9 100644 --- a/bsd/vfs/vfs_syscalls.c +++ b/bsd/vfs/vfs_syscalls.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 1995-2015 Apple Inc. All rights reserved. + * Copyright (c) 1995-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* @@ -100,6 +100,8 @@ #include #include #include +#include +#include #include #include #include @@ -114,6 +116,7 @@ #include #include +#include #include #include @@ -128,14 +131,14 @@ #include #endif -#if CONFIG_FSE +#if CONFIG_FSE #define GET_PATH(x) \ - (x) = get_pathbuff(); + (x) = get_pathbuff(); #define RELEASE_PATH(x) \ release_pathbuff(x); -#else +#else #define GET_PATH(x) \ - MALLOC_ZONE((x), char *, MAXPATHLEN, M_NAMEI, M_WAITOK); + MALLOC_ZONE((x), char *, MAXPATHLEN, M_NAMEI, M_WAITOK); #define RELEASE_PATH(x) \ FREE_ZONE((x), MAXPATHLEN, M_NAMEI); #endif /* CONFIG_FSE */ @@ -157,8 +160,8 @@ static int setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts, i static int sync_callback(mount_t, void *); static void sync_thread(void *, __unused wait_result_t); static int sync_async(int); -static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp, - user_addr_t bufp, int *sizep, boolean_t is_64_bit, +static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp, + user_addr_t bufp, int *sizep, boolean_t is_64_bit, boolean_t partial_copy); static int statfs64_common(struct mount *mp, struct vfsstatfs *sfsp, user_addr_t bufp); @@ -222,16 +225,9 @@ unsigned int vfs_nummntops=0; extern const struct fileops vnops; #if CONFIG_APPLEDOUBLE -extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *); +extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *); #endif /* CONFIG_APPLEDOUBLE */ -typedef uint32_t vfs_rename_flags_t; -#if 
CONFIG_SECLUDED_RENAME -enum { - VFS_SECLUDE_RENAME = 0x00000001 -}; -#endif - /* * Virtual File System System Calls */ @@ -256,7 +252,7 @@ kernel_mount(char *fstype, vnode_t pvp, vnode_t vp, const char *path, boolean_t did_namei; int error; - NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT, + NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT, UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx); /* @@ -308,7 +304,7 @@ mount(proc_t p, struct mount_args *uap, __unused int32_t *retval) } void -vfs_notify_mount(vnode_t pdvp) +vfs_notify_mount(vnode_t pdvp) { vfs_event_signal(NULL, VQ_MOUNT, (intptr_t)NULL); lock_vnode_and_post(pdvp, NOTE_WRITE); @@ -321,14 +317,14 @@ vfs_notify_mount(vnode_t pdvp) * * Parameters: p Process requesting the mount * uap User argument descriptor (see below) - * retval (ignored) + * retval (ignored) * * Indirect: uap->type Filesystem type * uap->path Path to mount - * uap->data Mount arguments - * uap->mac_p MAC info + * uap->data Mount arguments + * uap->mac_p MAC info * uap->flags Mount flags - * + * * * Returns: 0 Success * !0 Not success @@ -348,7 +344,7 @@ __mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int3 char *labelstr = NULL; int flags = uap->flags; int error; -#if CONFIG_IMGSRC_ACCESS || CONFIG_MACF +#if CONFIG_IMGSRC_ACCESS || CONFIG_MACF boolean_t is_64bit = IS_64BIT_PROCESS(p); #else #pragma unused(p) @@ -363,7 +359,7 @@ __mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int3 /* * Get the vnode to be covered */ - NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT, + NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT, UIO_USERSPACE, uap->path, ctx); error = namei(&nd); if (error) { @@ -372,7 +368,7 @@ __mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int3 need_nameidone = 1; vp = nd.ni_vp; pvp = nd.ni_dvp; - + #ifdef CONFIG_IMGSRC_ACCESS /* Mounting image source cannot be batched with other operations 
*/ if (flags == MNT_IMGSRC_BY_INDEX) { @@ -433,12 +429,12 @@ __mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int3 flags |= MNT_UPDATE; } else { - /* + /* * For a union mount on '/', treat it as fresh - * mount instead of update. - * Otherwise, union mouting on '/' used to panic the - * system before, since mnt_vnodecovered was found to - * be NULL for '/' which is required for unionlookup + * mount instead of update. + * Otherwise, union mouting on '/' used to panic the + * system before, since mnt_vnodecovered was found to + * be NULL for '/' which is required for unionlookup * after it gets ENOENT on union mount. */ flags = (flags & ~(MNT_UPDATE)); @@ -448,15 +444,15 @@ __mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int3 if ((flags & MNT_RDONLY) == 0) { /* Release kernels are not allowed to mount "/" as rw */ error = EPERM; - goto out; + goto out; } #endif /* * See 7392553 for more details on why this check exists. * Suffice to say: If this check is ON and something tries * to mount the rootFS RW, we'll turn off the codesign - * bitmap optimization. - */ + * bitmap optimization. + */ #if CHECK_CS_VALIDATION_BITMAP if ((flags & MNT_RDONLY) == 0 ) { root_fs_upgrade_try = TRUE; @@ -489,7 +485,7 @@ __mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int3 /* * common mount implementation (final stage of mounting) - + * Arguments: * fstypename file system type (ie it's vfs name) * pvp parent of covered vnode @@ -560,13 +556,13 @@ mount_common(char *fstypename, vnode_t pvp, vnode_t vp, * If content protection is enabled, update mounts are not * allowed to turn it off. 
*/ - if ((mp->mnt_flag & MNT_CPROTECT) && + if ((mp->mnt_flag & MNT_CPROTECT) && ((flags & MNT_CPROTECT) == 0)) { error = EINVAL; goto out1; } -#ifdef CONFIG_IMGSRC_ACCESS +#ifdef CONFIG_IMGSRC_ACCESS /* Can't downgrade the backer of the root FS */ if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) && (!vfs_isrdonly(mp)) && (flags & MNT_RDONLY)) { @@ -731,15 +727,16 @@ mount_common(char *fstypename, vnode_t pvp, vnode_t vp, /* * Process device path for local file systems if requested */ - if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) { + if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS && + !(internal_flags & KERNEL_MOUNT_SNAPSHOT)) { if (vfs_context_is64bit(ctx)) { if ( (error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath))) ) - goto out1; + goto out1; fsmountargs += sizeof(devpath); } else { user32_addr_t tmp; if ( (error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp))) ) - goto out1; + goto out1; /* munge into LP64 addr */ devpath = CAST_USER_ADDR_T(tmp); fsmountargs += sizeof(tmp); @@ -791,7 +788,7 @@ mount_common(char *fstypename, vnode_t pvp, vnode_t vp, */ if ( (error = vfs_mountedon(devvp)) ) goto out3; - + if (vcount(devvp) > 1 && !(vfs_flags(mp) & MNT_ROOTFS)) { error = EBUSY; goto out3; @@ -829,7 +826,7 @@ mount_common(char *fstypename, vnode_t pvp, vnode_t vp, vnode_getalways(device_vnode); if (suser(vfs_context_ucred(ctx), NULL) && - (error = vnode_authorize(device_vnode, NULL, + (error = vnode_authorize(device_vnode, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0) { vnode_put(device_vnode); @@ -867,7 +864,12 @@ mount_common(char *fstypename, vnode_t pvp, vnode_t vp, /* * Mount the filesystem. 
*/ - error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx); + if (internal_flags & KERNEL_MOUNT_SNAPSHOT) { + error = VFS_IOCTL(mp, VFSIOC_MOUNT_SNAPSHOT, + (caddr_t)fsmountargs, 0, ctx); + } else { + error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx); + } if (flags & MNT_UPDATE) { if (mp->mnt_kern_flag & MNTK_WANTRDWR) @@ -937,8 +939,8 @@ mount_common(char *fstypename, vnode_t pvp, vnode_t vp, /* Unmount the filesystem as cdir/rdirs cannot be updated */ goto out4; } - /* - * there is no cleanup code here so I have made it void + /* + * there is no cleanup code here so I have made it void * we need to revisit this */ (void)VFS_START(mp, 0, ctx); @@ -959,7 +961,7 @@ mount_common(char *fstypename, vnode_t pvp, vnode_t vp, VFSATTR_INIT(&vfsattr); VFSATTR_WANTED(&vfsattr, f_capabilities); if (strncmp(mp->mnt_vfsstat.f_fstypename, "webdav", sizeof("webdav")) != 0 && - vfs_getattr(mp, &vfsattr, ctx) == 0 && + vfs_getattr(mp, &vfsattr, ctx) == 0 && VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) { if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) && (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) { @@ -979,6 +981,11 @@ mount_common(char *fstypename, vnode_t pvp, vnode_t vp, /* Legacy MNT_DOVOLFS flag also implies path from id lookups. 
*/ mp->mnt_kern_flag |= MNTK_PATH_FROM_ID; } + + if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) && + (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) { + mp->mnt_kern_flag |= MNTK_DIR_HARDLINKS; + } } if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) { mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS; @@ -1000,7 +1007,7 @@ mount_common(char *fstypename, vnode_t pvp, vnode_t vp, * defaults will have been set, so no reason to bail or care */ vfs_init_io_attributes(device_vnode, mp); - } + } /* Now that mount is setup, notify the listeners */ vfs_notify_mount(pvp); @@ -1009,7 +1016,7 @@ mount_common(char *fstypename, vnode_t pvp, vnode_t vp, } else { /* If we fail a fresh mount, there should be no vnodes left hooked into the mountpoint. */ if (mp->mnt_vnodelist.tqh_first != NULL) { - panic("mount_common(): mount of %s filesystem failed with %d, but vnode list is not empty.", + panic("mount_common(): mount of %s filesystem failed with %d, but vnode list is not empty.", mp->mnt_vtable->vfc_name, error); } @@ -1026,7 +1033,7 @@ mount_common(char *fstypename, vnode_t pvp, vnode_t vp, } lck_rw_done(&mp->mnt_rwlock); is_rwlock_locked = FALSE; - + /* * if we get here, we have a mount structure that needs to be freed, * but since the coveredvp hasn't yet been updated to point at it, @@ -1051,8 +1058,8 @@ mount_common(char *fstypename, vnode_t pvp, vnode_t vp, /* Error condition exits */ out4: (void)VFS_UNMOUNT(mp, MNT_FORCE, ctx); - - /* + + /* * If the mount has been placed on the covered vp, * it may have been discovered by now, so we have * to treat this just like an unmount @@ -1089,7 +1096,7 @@ mount_common(char *fstypename, vnode_t pvp, vnode_t vp, if (is_rwlock_locked == TRUE) { lck_rw_done(&mp->mnt_rwlock); } - + if (mntalloc) { if (mp->mnt_crossref) mount_dropcrossref(mp, vp, 0); @@ -1110,7 +1117,7 @@ mount_common(char *fstypename, vnode_t pvp, vnode_t vp, return(error); } -/* +/* * 
Flush in-core data, check for competing mount attempts, * and set VMOUNT */ @@ -1132,7 +1139,7 @@ prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, cons VATTR_WANTED(&va, va_uid); if ((error = vnode_getattr(vp, &va, ctx)) || (va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) && - (!vfs_context_issuser(ctx)))) { + (!vfs_context_issuser(ctx)))) { error = EPERM; goto out; } @@ -1175,7 +1182,7 @@ prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, cons #define IMGSRC_DEBUG(args...) printf(args) #else #define IMGSRC_DEBUG(args...) do { } while(0) -#endif +#endif static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx) @@ -1334,7 +1341,7 @@ mount_begin_update(mount_t mp, vfs_context_t ctx, int flags) * permitted to update it. */ if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) && - (!vfs_context_issuser(ctx))) { + (!vfs_context_issuser(ctx))) { error = EPERM; goto out; } @@ -1353,7 +1360,7 @@ mount_begin_update(mount_t mp, vfs_context_t ctx, int flags) return error; } -static void +static void mount_end_update(mount_t mp) { lck_rw_done(&mp->mnt_rwlock); @@ -1378,8 +1385,8 @@ get_imgsrc_rootvnode(uint32_t height, vnode_t *rvpp) } static int -relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, - const char *fsname, vfs_context_t ctx, +relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, + const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index) { int error; @@ -1484,7 +1491,7 @@ relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, goto out0; } - /* + /* * It can only be moved once. Flag is set under the rwlock, * so we're now safe to proceed. 
*/ @@ -1492,8 +1499,8 @@ relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, IMGSRC_DEBUG("Already moved [2]\n"); goto out1; } - - + + IMGSRC_DEBUG("Preparing coveredvp.\n"); /* Mark covered vnode as mount in progress, authorize placing mount on top */ @@ -1502,7 +1509,7 @@ relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, IMGSRC_DEBUG("Preparing coveredvp failed with %d.\n", error); goto out1; } - + IMGSRC_DEBUG("Covered vp OK.\n"); /* Sanity check the name caller has provided */ @@ -1528,9 +1535,9 @@ relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, } } - /* + /* * Place mp on top of vnode, ref the vnode, call checkdirs(), - * and increment the name cache's mount generation + * and increment the name cache's mount generation */ IMGSRC_DEBUG("About to call place_mount_and_checkdirs().\n"); @@ -1574,9 +1581,9 @@ relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, mount_unlock(mp); out2: - /* + /* * Placing the mp on the vnode clears VMOUNT, - * so cleanup is different after that point + * so cleanup is different after that point */ if (placed) { /* Rele the vp, clear VMOUNT and v_mountedhere */ @@ -1611,7 +1618,7 @@ enablequotas(struct mount *mp, vfs_context_t ctx) if (strncmp(mp->mnt_vfsstat.f_fstypename, "hfs", sizeof("hfs")) != 0 ) { return; } - /* + /* * Enable filesystem disk quotas if necessary. 
* We ignore errors as this should not interfere with final mount */ @@ -1632,7 +1639,7 @@ enablequotas(struct mount *mp, vfs_context_t ctx) static int -checkdirs_callback(proc_t p, void * arg) +checkdirs_callback(proc_t p, void * arg) { struct cdirargs * cdrp = (struct cdirargs * )arg; vnode_t olddp = cdrp->olddp; @@ -1741,7 +1748,7 @@ unmount(__unused proc_t p, struct unmount_args *uap, __unused int32_t *retval) struct nameidata nd; vfs_context_t ctx = vfs_context_current(); - NDINIT(&nd, LOOKUP, OP_UNMOUNT, FOLLOW | AUDITVNPATH1, + NDINIT(&nd, LOOKUP, OP_UNMOUNT, FOLLOW | AUDITVNPATH1, UIO_USERSPACE, uap->path, ctx); error = namei(&nd); if (error) @@ -1771,7 +1778,7 @@ unmount(__unused proc_t p, struct unmount_args *uap, __unused int32_t *retval) } int -vfs_unmountbyfsid(fsid_t * fsid, int flags, vfs_context_t ctx) +vfs_unmountbyfsid(fsid_t *fsid, int flags, vfs_context_t ctx) { mount_t mp; @@ -1807,7 +1814,7 @@ safedounmount(struct mount *mp, int flags, vfs_context_t ctx) } /* - * Skip authorization if the mount is tagged as permissive and + * Skip authorization if the mount is tagged as permissive and * this is not a forced-unmount attempt. */ if (!(((mp->mnt_kern_flag & MNTK_PERMIT_UNMOUNT) != 0) && ((flags & MNT_FORCE) == 0))) { @@ -1893,7 +1900,7 @@ dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx) */ mp->mnt_realrootvp = NULLVP; mount_unlock(mp); - + if (forcedunmount && (flags & MNT_LNOSUB) == 0) { /* * Force unmount any mounts in this filesystem. 
@@ -1942,7 +1949,7 @@ dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx) #if CONFIG_TRIGGERS vfs_nested_trigger_unmounts(mp, flags, ctx); did_vflush = 1; -#endif +#endif if (forcedunmount) lflags |= FORCECLOSE; error = vflush(mp, NULLVP, SKIPSWAP | SKIPSYSTEM | SKIPROOT | lflags); @@ -2029,7 +2036,7 @@ dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx) out: if (mp->mnt_lflag & MNT_LWAIT) { mp->mnt_lflag &= ~MNT_LWAIT; - needwakeup = 1; + needwakeup = 1; } #if CONFIG_TRIGGERS @@ -2039,9 +2046,9 @@ dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx) OSBitAndAtomic(~((uint32_t) P_NOREMOTEHANG), &p->p_flag); } - /* + /* * Callback and context are set together under the mount lock, and - * never cleared, so we're safe to examine them here, drop the lock, + * never cleared, so we're safe to examine them here, drop the lock, * and call out. */ if (mp->mnt_triggercallback != NULL) { @@ -2054,7 +2061,7 @@ dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx) } else { mount_unlock(mp); } -#else +#else mount_unlock(mp); #endif /* CONFIG_TRIGGERS */ @@ -2142,7 +2149,7 @@ dounmount_submounts(struct mount *mp, int flags, vfs_context_t ctx) /* * Fill the array with submount fsids. * Since mounts are always added to the tail of the mount list, the - * list is always in mount order. + * list is always in mount order. * For each mount check if the mounted-on vnode belongs to a * mount that's already added to our array of mounts to be unmounted. 
*/ @@ -2185,7 +2192,7 @@ mount_dropcrossref(mount_t mp, vnode_t dp, int need_put) panic("mount cross refs -ve"); if ((mp != dp->v_mountedhere) && (mp->mnt_crossref == 0)) { - + if (need_put) vnode_put_locked(dp); vnode_unlock(dp); @@ -2213,7 +2220,7 @@ int syncprt = 0; int print_vmpage_stat=0; int sync_timeout = 60; // Sync time limit (sec) -static int +static int sync_callback(mount_t mp, __unused void *arg) { if ((mp->mnt_flag & MNT_RDONLY) == 0) { @@ -2426,18 +2433,24 @@ statfs(__unused proc_t p, struct statfs_args *uap, __unused int32_t *retval) vfs_context_t ctx = vfs_context_current(); vnode_t vp; - NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1, + NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1, UIO_USERSPACE, uap->path, ctx); error = namei(&nd); - if (error) + if (error != 0) return (error); vp = nd.ni_vp; mp = vp->v_mount; sp = &mp->mnt_vfsstat; nameidone(&nd); +#if CONFIG_MACF + error = mac_mount_check_stat(ctx, mp); + if (error != 0) + return (error); +#endif + error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT); - if (error != 0) { + if (error != 0) { vnode_put(vp); return (error); } @@ -2477,8 +2490,15 @@ fstatfs(__unused proc_t p, struct fstatfs_args *uap, __unused int32_t *retval) error = EBADF; goto out; } + +#if CONFIG_MACF + error = mac_mount_check_stat(vfs_context_current(), mp); + if (error != 0) + goto out; +#endif + sp = &mp->mnt_vfsstat; - if ((error = vfs_update_vfsstat(mp,vfs_context_current(),VFS_USER_EVENT)) != 0) { + if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) { goto out; } @@ -2491,15 +2511,15 @@ fstatfs(__unused proc_t p, struct fstatfs_args *uap, __unused int32_t *retval) return (error); } -/* - * Common routine to handle copying of statfs64 data to user space +/* + * Common routine to handle copying of statfs64 data to user space */ -static int +static int statfs64_common(struct mount *mp, struct vfsstatfs *sfsp, user_addr_t bufp) { int error; struct statfs64 sfs; - + bzero(&sfs, 
sizeof(sfs)); sfs.f_bsize = sfsp->f_bsize; @@ -2527,8 +2547,8 @@ statfs64_common(struct mount *mp, struct vfsstatfs *sfsp, user_addr_t bufp) return(error); } -/* - * Get file system statistics in 64-bit mode +/* + * Get file system statistics in 64-bit mode */ int statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *retval) @@ -2540,18 +2560,24 @@ statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *r vfs_context_t ctxp = vfs_context_current(); vnode_t vp; - NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1, + NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1, UIO_USERSPACE, uap->path, ctxp); error = namei(&nd); - if (error) + if (error != 0) return (error); vp = nd.ni_vp; mp = vp->v_mount; sp = &mp->mnt_vfsstat; nameidone(&nd); +#if CONFIG_MACF + error = mac_mount_check_stat(ctxp, mp); + if (error != 0) + return (error); +#endif + error = vfs_update_vfsstat(mp, ctxp, VFS_USER_EVENT); - if (error != 0) { + if (error != 0) { vnode_put(vp); return (error); } @@ -2562,8 +2588,8 @@ statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *r return (error); } -/* - * Get file system statistics in 64-bit mode +/* + * Get file system statistics in 64-bit mode */ int fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t *retval) @@ -2591,6 +2617,13 @@ fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t error = EBADF; goto out; } + +#if CONFIG_MACF + error = mac_mount_check_stat(vfs_context_current(), mp); + if (error != 0) + goto out; +#endif + sp = &mp->mnt_vfsstat; if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) { goto out; @@ -2618,13 +2651,20 @@ struct getfsstat_struct { static int getfsstat_callback(mount_t mp, void * arg) { - + struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg; struct vfsstatfs *sp; int error, my_size; vfs_context_t ctx = vfs_context_current(); if (fstp->sfsp && fstp->count 
< fstp->maxcount) { +#if CONFIG_MACF + error = mac_mount_check_stat(ctx, mp); + if (error != 0) { + fstp->error = error; + return(VFS_RETURNED_DONE); + } +#endif sp = &mp->mnt_vfsstat; /* * If MNT_NOWAIT is specified, do not refresh the @@ -2684,14 +2724,14 @@ getfsstat(__unused proc_t p, struct getfsstat_args *uap, int *retval) * * Parameters: p (ignored) * uap User argument descriptor (see below) - * retval Count of file system statistics (N stats) + * retval Count of file system statistics (N stats) * * Indirect: uap->bufsize Buffer size * uap->macsize MAC info size * uap->buf Buffer where information will be returned * uap->mac MAC info * uap->flags File system flags - * + * * * Returns: 0 Success * !0 Not success @@ -2766,7 +2806,7 @@ __mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval fst.error = 0; fst.maxcount = maxcount; - + vfs_iterate(0, getfsstat_callback, &fst); if (mp) @@ -2792,6 +2832,13 @@ getfsstat64_callback(mount_t mp, void * arg) int error; if (fstp->sfsp && fstp->count < fstp->maxcount) { +#if CONFIG_MACF + error = mac_mount_check_stat(vfs_context_current(), mp); + if (error != 0) { + fstp->error = error; + return(VFS_RETURNED_DONE); + } +#endif sp = &mp->mnt_vfsstat; /* * If MNT_NOWAIT is specified, do not refresh the fsstat @@ -2868,7 +2915,7 @@ getfsstat64(__unused proc_t p, struct getfsstat64_args *uap, int *retval) * by this call needs a vnode_put * */ -static int +int vnode_getfromfd(vfs_context_t ctx, int fd, vnode_t *vpp) { int error; @@ -3080,7 +3127,7 @@ common_chdir(proc_t p, struct chdir_args *uap, int per_thread) vnode_t tvp; vfs_context_t ctx = vfs_context_current(); - NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1, + NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1, UIO_USERSPACE, uap->path, ctx); error = change_dir(&nd, ctx); if (error) @@ -3182,7 +3229,7 @@ chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval) if ((error = suser(kauth_cred_get(), &p->p_acflag))) return (error); 
- NDINIT(&nd, LOOKUP, OP_CHROOT, FOLLOW | AUDITVNPATH1, + NDINIT(&nd, LOOKUP, OP_CHROOT, FOLLOW | AUDITVNPATH1, UIO_USERSPACE, uap->path, ctx); error = change_dir(&nd, ctx); if (error) @@ -3409,6 +3456,81 @@ open1(vfs_context_t ctx, struct nameidata *ndp, int uflags, if (flags & O_CLOFORK) *fdflags(p, indx) |= UF_FORKCLOSE; procfdtbl_releasefd(p, indx, NULL); + +#if CONFIG_SECLUDED_MEMORY + if (secluded_for_filecache && + FILEGLOB_DTYPE(fp->f_fglob) == DTYPE_VNODE && + vnode_vtype(vp) == VREG) { + memory_object_control_t moc; + + moc = ubc_getobject(vp, UBC_FLAGS_NONE); + + if (moc == MEMORY_OBJECT_CONTROL_NULL) { + /* nothing to do... */ + } else if (fp->f_fglob->fg_flag & FWRITE) { + /* writable -> no longer eligible for secluded pages */ + memory_object_mark_eligible_for_secluded(moc, + FALSE); + } else if (secluded_for_filecache == 1) { + char pathname[32] = { 0, }; + size_t copied; + /* XXX FBDP: better way to detect /Applications/ ? */ + if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) { + copyinstr(ndp->ni_dirp, + pathname, + sizeof (pathname), + &copied); + } else { + copystr(CAST_DOWN(void *, ndp->ni_dirp), + pathname, + sizeof (pathname), + &copied); + } + pathname[sizeof (pathname) - 1] = '\0'; + if (strncmp(pathname, + "/Applications/", + strlen("/Applications/")) == 0 && + strncmp(pathname, + "/Applications/Camera.app/", + strlen("/Applications/Camera.app/")) != 0) { + /* + * not writable + * AND from "/Applications/" + * AND not from "/Applications/Camera.app/" + * ==> eligible for secluded + */ + memory_object_mark_eligible_for_secluded(moc, + TRUE); + } + } else if (secluded_for_filecache == 2) { +/* not implemented... 
*/ + if (!strncmp(vp->v_name, + DYLD_SHARED_CACHE_NAME, + strlen(DYLD_SHARED_CACHE_NAME)) || + !strncmp(vp->v_name, + "dyld", + strlen(vp->v_name)) || + !strncmp(vp->v_name, + "launchd", + strlen(vp->v_name)) || + !strncmp(vp->v_name, + "Camera", + strlen(vp->v_name)) || + !strncmp(vp->v_name, + "mediaserverd", + strlen(vp->v_name))) { + /* + * This file matters when launching Camera: + * do not store its contents in the secluded + * pool that will be drained on Camera launch. + */ + memory_object_mark_eligible_for_secluded(moc, + FALSE); + } + } + } +#endif /* CONFIG_SECLUDED_MEMORY */ + fp_drop(p, indx, fp, 1); proc_fdunlock(p); @@ -3418,14 +3540,14 @@ open1(vfs_context_t ctx, struct nameidata *ndp, int uflags, bad: context = *vfs_context_current(); context.vc_ucred = fp->f_fglob->fg_cred; - + if ((fp->f_fglob->fg_flag & FHASLOCK) && (FILEGLOB_DTYPE(fp->f_fglob) == DTYPE_VNODE)) { lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; - + (void)VNOP_ADVLOCK( vp, (caddr_t)fp->f_fglob, F_UNLCK, &lf, F_FLOCK, ctx, NULL); } @@ -3547,9 +3669,9 @@ open_extended(proc_t p, struct open_extended_args *uap, int32_t *retval) return ciferror; } -/* +/* * Go through the data-protected atomically controlled open (2) - * + * * int open_dprotected_np(user_addr_t path, int flags, int class, int dpflags, int mode) */ int open_dprotected_np (__unused proc_t p, struct open_dprotected_np_args *uap, int32_t *retval) { @@ -3557,7 +3679,7 @@ int open_dprotected_np (__unused proc_t p, struct open_dprotected_np_args *uap, int class = uap->class; int dpflags = uap->dpflags; - /* + /* * Follow the same path as normal open(2) * Look up the item if it exists, and acquire the vnode. 
*/ @@ -3566,7 +3688,7 @@ int open_dprotected_np (__unused proc_t p, struct open_dprotected_np_args *uap, struct nameidata nd; int cmode; int error; - + VATTR_INIT(&va); /* Mask off all but regular access permissions */ cmode = ((uap->mode &~ fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT; @@ -3575,13 +3697,13 @@ int open_dprotected_np (__unused proc_t p, struct open_dprotected_np_args *uap, NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE, uap->path, vfs_context_current()); - /* - * Initialize the extra fields in vnode_attr to pass down our + /* + * Initialize the extra fields in vnode_attr to pass down our * extra fields. * 1. target cprotect class. - * 2. set a flag to mark it as requiring open-raw-encrypted semantics. - */ - if (flags & O_CREAT) { + * 2. set a flag to mark it as requiring open-raw-encrypted semantics. + */ + if (flags & O_CREAT) { /* lower level kernel code validates that the class is valid before applying it. */ if (class != PROTECTION_CLASS_DEFAULT) { /* @@ -3591,12 +3713,12 @@ int open_dprotected_np (__unused proc_t p, struct open_dprotected_np_args *uap, VATTR_SET(&va, va_dataprotect_class, class); } } - + if (dpflags & (O_DP_GETRAWENCRYPTED|O_DP_GETRAWUNENCRYPTED)) { if ( flags & (O_RDWR | O_WRONLY)) { /* Not allowed to write raw encrypted bytes */ - return EINVAL; - } + return EINVAL; + } if (uap->dpflags & O_DP_GETRAWENCRYPTED) { VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWENCRYPTED); } @@ -3771,7 +3893,7 @@ mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval) if ((error = suser(vfs_context_ucred(ctx), &p->p_acflag))) return (error); - NDINIT(&nd, CREATE, OP_MKNOD, LOCKPARENT | AUDITVNPATH1, + NDINIT(&nd, CREATE, OP_MKNOD, LOCKPARENT | AUDITVNPATH1, UIO_USERSPACE, uap->path, ctx); error = namei(&nd); if (error) @@ -3858,7 +3980,7 @@ mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap) int error; struct nameidata nd; - NDINIT(&nd, CREATE, OP_MKFIFO, LOCKPARENT | AUDITVNPATH1, + NDINIT(&nd, CREATE, 
OP_MKFIFO, LOCKPARENT | AUDITVNPATH1, UIO_USERSPACE, upath, ctx); error = namei(&nd); if (error) @@ -3986,7 +4108,7 @@ safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *trunc len += strlcpy(&path[len], leafname, MAXPATHLEN-len) + 1; if (len > MAXPATHLEN) { char *ptr; - + // the string got truncated! *truncated_path = 1; ptr = my_strrchr(path, '/'); @@ -4004,9 +4126,9 @@ safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *trunc if (ret != ENOSPC) { printf("safe_getpath: failed to get the path for vp %p (%s) : err %d\n", dvp, dvp->v_name ? dvp->v_name : "no-name", ret); - } + } *truncated_path = 1; - + do { if (mydvp->v_parent != NULL) { mydvp = mydvp->v_parent; @@ -4019,7 +4141,7 @@ safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *trunc len = 2; mydvp = NULL; } - + if (mydvp == NULL) { break; } @@ -4079,10 +4201,11 @@ linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2, * However, some file systems may have limited support. */ if (vp->v_type == VDIR) { - if (!(vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSDIRLINKS)) { + if (!ISSET(vp->v_mount->mnt_kern_flag, MNTK_DIR_HARDLINKS)) { error = EPERM; /* POSIX */ goto out; } + /* Linking to a directory requires ownership. */ if (!kauth_cred_issuser(vfs_context_ucred(ctx))) { struct vnode_attr dva; @@ -4130,7 +4253,7 @@ linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2, error = EXDEV; goto out2; } - + /* authorize creation of the target note */ if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) goto out2; @@ -4175,11 +4298,11 @@ linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2, link_name_len = MAXPATHLEN; if (vn_getpath(vp, link_to_path, &link_name_len) == 0) { /* - * Call out to allow 3rd party notification of rename. + * Call out to allow 3rd party notification of rename. * Ignore result of kauth_authorize_fileop call. 
*/ - kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_LINK, - (uintptr_t)link_to_path, + kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_LINK, + (uintptr_t)link_to_path, (uintptr_t)target_path); } if (link_to_path != NULL) { @@ -4258,7 +4381,6 @@ symlinkat_internal(vfs_context_t ctx, user_addr_t path_data, int fd, int error; struct nameidata nd; vnode_t vp, dvp; - uint32_t dfflags; // Directory file flags size_t dummy=0; proc_t p; @@ -4287,15 +4409,6 @@ symlinkat_internal(vfs_context_t ctx, user_addr_t path_data, int fd, VATTR_SET(&va, va_type, VLNK); VATTR_SET(&va, va_mode, ACCESSPERMS & ~p->p_fd->fd_cmask); - /* - * Handle inheritance of restricted flag - */ - error = vnode_flags(dvp, &dfflags, ctx); - if (error) - goto skipit; - if (dfflags & SF_RESTRICTED) - VATTR_SET(&va, va_flags, SF_RESTRICTED); - #if CONFIG_MACF error = mac_vnode_check_create(ctx, dvp, &nd.ni_cnd, &va); @@ -4493,7 +4606,7 @@ unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp, if (unlink_flags & VNODE_REMOVE_NODELETEBUSY) { flags |= VNODE_REMOVE_NODELETEBUSY; } - + /* Skip any potential upcalls if told to. */ if (unlink_flags & VNODE_REMOVE_SKIP_NAMESPACE_EVENT) { flags |= VNODE_REMOVE_SKIP_NAMESPACE_EVENT; @@ -4597,13 +4710,13 @@ unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp, } /* - * Call out to allow 3rd party notification of delete. + * Call out to allow 3rd party notification of delete. * Ignore result of kauth_authorize_fileop call. 
*/ if (!error) { if (has_listeners) { - kauth_authorize_fileop(vfs_context_ucred(ctx), - KAUTH_FILEOP_DELETE, + kauth_authorize_fileop(vfs_context_ucred(ctx), + KAUTH_FILEOP_DELETE, (uintptr_t)vp, (uintptr_t)path); } @@ -4642,14 +4755,14 @@ unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp, RELEASE_PATH(path); #if NAMEDRSRCFORK - /* recycle the deleted rsrc fork vnode to force a reclaim, which + /* recycle the deleted rsrc fork vnode to force a reclaim, which * will cause its shadow file to go away if necessary. */ if (vp && (vnode_isnamedstream(vp)) && (vp->v_parent != NULLVP) && vnode_isshadow(vp)) { vnode_recycle(vp); - } + } #endif /* * nameidone has to happen before we vnode_put(dvp) @@ -4786,7 +4899,7 @@ lseek(proc_t p, struct lseek_args *uap, off_t *retval) } } - /* + /* * An lseek can affect whether data is "available to read." Use * hint of NOTE_NONE so no EVFILT_VNODE events fire */ @@ -4837,7 +4950,7 @@ access1(vnode_t vp, vnode_t dvp, int uflags, vfs_context_t ctx) /* take advantage of definition of uflags */ action = uflags >> 8; } - + #if CONFIG_MACF error = mac_vnode_check_access(ctx, vp, uflags); if (error) @@ -4860,8 +4973,8 @@ access1(vnode_t vp, vnode_t dvp, int uflags, vfs_context_t ctx) * access_extended: Check access permissions in bulk. * * Description: uap->entries Pointer to an array of accessx - * descriptor structs, plus one or - * more NULL terminated strings (see + * descriptor structs, plus one or + * more NULL terminated strings (see * "Notes" section below). * uap->size Size of the area pointed to by * uap->entries. @@ -4902,7 +5015,7 @@ access1(vnode_t vp, vnode_t dvp, int uflags, vfs_context_t ctx) * * since we must have at least one string, and the string must * be at least one character plus the NULL terminator in length. 
- * + * * XXX: Need to support the check-as uid argument */ int @@ -4994,6 +5107,12 @@ access_extended(__unused proc_t p, struct access_extended_args *uap, __unused in goto out; } + /* Also do not let ad_name_offset point to something beyond the size of the input */ + if (input[i].ad_name_offset >= uap->size) { + error = EINVAL; + goto out; + } + /* * An offset of 0 means use the previous descriptor's offset; * this is used to chain multiple requests for the same file @@ -5055,7 +5174,7 @@ access_extended(__unused proc_t p, struct access_extended_args *uap, __unused in vnode_put(dvp); dvp = NULL; } - + /* * Scan forward in the descriptor list to see if we * need the parent vnode. We will need it if we are @@ -5067,7 +5186,7 @@ access_extended(__unused proc_t p, struct access_extended_args *uap, __unused in for (j = i + 1; (j < desc_actual) && (input[j].ad_name_offset == 0); j++) if (input[j].ad_flags & _DELETE_OK) wantdelete = 1; - + niopts = FOLLOW | AUDITVNPATH1; /* need parent for vnode_authorize for deletion test */ @@ -5112,7 +5231,7 @@ access_extended(__unused proc_t p, struct access_extended_args *uap, __unused in /* copy out results */ error = copyout(result, uap->results, desc_actual * sizeof(errno_t)); - + out: if (input && input != stack_input) FREE(input, M_TEMP); @@ -5181,7 +5300,7 @@ faccessat_internal(vfs_context_t ctx, int fd, user_addr_t path, int amode, goto out; #if NAMEDRSRCFORK - /* Grab reference on the shadow stream file vnode to + /* Grab reference on the shadow stream file vnode to * force an inactive on release which will mark it * for recycle. 
*/ @@ -5205,7 +5324,7 @@ faccessat_internal(vfs_context_t ctx, int fd, user_addr_t path, int amode, if (amode & _DELETE_OK) vnode_put(nd.ni_dvp); nameidone(&nd); - + out: if (!(flag & AT_EACCESS)) kauth_cred_unref(&context.vc_ucred); @@ -5277,8 +5396,8 @@ fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub, statptr = (void *)&source; #if NAMEDRSRCFORK - /* Grab reference on the shadow stream file vnode to - * force an inactive on release which will mark it + /* Grab reference on the shadow stream file vnode to + * force an inactive on release which will mark it * for recycle. */ if (vnode_isnamedstream(nd.ni_vp) && @@ -5307,11 +5426,11 @@ fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub, source.sb64.st_qspare[0] = 0LL; source.sb64.st_qspare[1] = 0LL; if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) { - munge_user64_stat64(&source.sb64, &dest.user64_sb64); + munge_user64_stat64(&source.sb64, &dest.user64_sb64); my_size = sizeof(dest.user64_sb64); sbp = (caddr_t)&dest.user64_sb64; } else { - munge_user32_stat64(&source.sb64, &dest.user32_sb64); + munge_user32_stat64(&source.sb64, &dest.user32_sb64); my_size = sizeof(dest.user32_sb64); sbp = (caddr_t)&dest.user32_sb64; } @@ -5326,11 +5445,11 @@ fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub, source.sb.st_qspare[0] = 0LL; source.sb.st_qspare[1] = 0LL; if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) { - munge_user64_stat(&source.sb, &dest.user64_sb); + munge_user64_stat(&source.sb, &dest.user64_sb); my_size = sizeof(dest.user64_sb); sbp = (caddr_t)&dest.user64_sb; } else { - munge_user32_stat(&source.sb, &dest.user32_sb); + munge_user32_stat(&source.sb, &dest.user32_sb); my_size = sizeof(dest.user32_sb); sbp = (caddr_t)&dest.user32_sb; } @@ -5380,13 +5499,13 @@ fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub, * * Parameters: p (ignored) * uap User argument descriptor (see below) - * retval (ignored) + * retval (ignored) * * Indirect: uap->path Path 
of file to get status from * uap->ub User buffer (holds file status info) * uap->xsecurity ACL to get (extended security) * uap->xsecurity_size Size of ACL - * + * * Returns: 0 Success * !0 errno value * @@ -5423,13 +5542,13 @@ stat64(__unused proc_t p, struct stat64_args *uap, __unused int32_t *retval) * * Parameters: p (ignored) * uap User argument descriptor (see below) - * retval (ignored) + * retval (ignored) * * Indirect: uap->path Path of file to get status from * uap->ub User buffer (holds file status info) * uap->xsecurity ACL to get (extended security) * uap->xsecurity_size Size of ACL - * + * * Returns: 0 Success * !0 errno value * @@ -5447,13 +5566,13 @@ stat64_extended(__unused proc_t p, struct stat64_extended_args *uap, __unused in * * Parameters: p (ignored) * uap User argument descriptor (see below) - * retval (ignored) + * retval (ignored) * * Indirect: uap->path Path of file to get status from * uap->ub User buffer (holds file status info) * uap->xsecurity ACL to get (extended security) * uap->xsecurity_size Size of ACL - * + * * Returns: 0 Success * !0 errno value * @@ -5489,13 +5608,13 @@ lstat64(__unused proc_t p, struct lstat64_args *uap, __unused int32_t *retval) * * Parameters: p (ignored) * uap User argument descriptor (see below) - * retval (ignored) + * retval (ignored) * * Indirect: uap->path Path of file to get status from * uap->ub User buffer (holds file status info) * uap->xsecurity ACL to get (extended security) * uap->xsecurity_size Size of ACL - * + * * Returns: 0 Success * !0 errno value * @@ -5552,7 +5671,7 @@ pathconf(__unused proc_t p, struct pathconf_args *uap, int32_t *retval) struct nameidata nd; vfs_context_t ctx = vfs_context_current(); - NDINIT(&nd, LOOKUP, OP_PATHCONF, FOLLOW | AUDITVNPATH1, + NDINIT(&nd, LOOKUP, OP_PATHCONF, FOLLOW | AUDITVNPATH1, UIO_USERSPACE, uap->path, ctx); error = namei(&nd); if (error) @@ -5663,6 +5782,11 @@ chflags1(vnode_t vp, int flags, vfs_context_t ctx) goto out; error = vnode_setattr(vp, 
&va, ctx); +#if CONFIG_MACF + if (error == 0) + mac_vnode_notify_setflags(ctx, vp, flags); +#endif + if ((error == 0) && !VATTR_IS_SUPPORTED(&va, va_flags)) { error = ENOTSUP; } @@ -5684,7 +5808,7 @@ chflags(__unused proc_t p, struct chflags_args *uap, __unused int32_t *retval) struct nameidata nd; AUDIT_ARG(fflags, uap->flags); - NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1, + NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1, UIO_USERSPACE, uap->path, ctx); error = namei(&nd); if (error) @@ -5742,7 +5866,7 @@ chmod_vnode(vfs_context_t ctx, vnode_t vp, struct vnode_attr *vap) { kauth_action_t action; int error; - + AUDIT_ARG(mode, vap->va_mode); /* XXX audit new args */ @@ -5757,6 +5881,17 @@ chmod_vnode(vfs_context_t ctx, vnode_t vp, struct vnode_attr *vap) if (VATTR_IS_ACTIVE(vap, va_mode) && (error = mac_vnode_check_setmode(ctx, vp, (mode_t)vap->va_mode)) != 0) return (error); + + if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) { + if ((error = mac_vnode_check_setowner(ctx, vp, + VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1, + VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1))) + return (error); + } + + if (VATTR_IS_ACTIVE(vap, va_acl) && + (error = mac_vnode_check_setacl(ctx, vp, vap->va_acl))) + return (error); #endif /* make sure that the caller is allowed to set this security information */ @@ -5766,8 +5901,22 @@ chmod_vnode(vfs_context_t ctx, vnode_t vp, struct vnode_attr *vap) error = EPERM; return(error); } - - error = vnode_setattr(vp, vap, ctx); + + if ((error = vnode_setattr(vp, vap, ctx)) != 0) + return (error); + +#if CONFIG_MACF + if (VATTR_IS_ACTIVE(vap, va_mode)) + mac_vnode_notify_setmode(ctx, vp, (mode_t)vap->va_mode); + + if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) + mac_vnode_notify_setowner(ctx, vp, + VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1, + VATTR_IS_ACTIVE(vap, va_gid) ? 
vap->va_gid : -1); + + if (VATTR_IS_ACTIVE(vap, va_acl)) + mac_vnode_notify_setacl(ctx, vp, vap->va_acl); +#endif return (error); } @@ -5799,7 +5948,7 @@ chmodat(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap, } /* - * chmod_extended: Change the mode of a file given a path name; with extended + * chmod_extended: Change the mode of a file given a path name; with extended * argument list (including extended security (ACL)). * * Parameters: p Process requesting the open @@ -5926,14 +6075,14 @@ fchmod1(__unused proc_t p, int fd, struct vnode_attr *vap) * * Parameters: p Process requesting to change file mode * uap User argument descriptor (see below) - * retval (ignored) + * retval (ignored) * * Indirect: uap->mode File mode to set (same as 'chmod') * uap->uid UID to set * uap->gid GID to set * uap->xsecurity ACL to set (or delete) * uap->fd File descriptor of file to change mode - * + * * Returns: 0 Success * !0 errno value * @@ -5974,7 +6123,7 @@ fchmod_extended(proc_t p, struct fchmod_extended_args *uap, __unused int32_t *re error = fchmod1(p, uap->fd, &va); - + switch(uap->xsecurity) { case USER_ADDR_NULL: case CAST_USER_ADDR_T(-1): @@ -6043,7 +6192,12 @@ fchownat_internal(vfs_context_t ctx, int fd, user_addr_t path, uid_t uid, if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) goto out; error = vnode_setattr(vp, &va, ctx); - + +#if CONFIG_MACF + if (error == 0) + mac_vnode_notify_setowner(ctx, vp, uid, gid); +#endif + out: /* * EACCES is only allowed from namei(); permissions failure should @@ -6135,6 +6289,11 @@ fchown(__unused proc_t p, struct fchown_args *uap, __unused int32_t *retval) } error = vnode_setattr(vp, &va, ctx); +#if CONFIG_MACF + if (error == 0) + mac_vnode_notify_setowner(ctx, vp, uap->uid, uap->gid); +#endif + out: (void)vnode_put(vp); file_drop(uap->fd); @@ -6215,6 +6374,11 @@ setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts, } error = vnode_setattr(vp, &va, ctx); +#if CONFIG_MACF + if (error == 
0) + mac_vnode_notify_setutimes(ctx, vp, ts[0], ts[1]); +#endif + out: return error; } @@ -6233,10 +6397,10 @@ utimes(__unused proc_t p, struct utimes_args *uap, __unused int32_t *retval) vfs_context_t ctx = vfs_context_current(); /* - * AUDIT: Needed to change the order of operations to do the + * AUDIT: Needed to change the order of operations to do the * name lookup first because auditing wants the path. */ - NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1, + NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1, UIO_USERSPACE, uap->path, ctx); error = namei(&nd); if (error) @@ -6303,7 +6467,7 @@ truncate(__unused proc_t p, struct truncate_args *uap, __unused int32_t *retval) if (uap->length < 0) return(EINVAL); - NDINIT(&nd, LOOKUP, OP_TRUNCATE, FOLLOW | AUDITVNPATH1, + NDINIT(&nd, LOOKUP, OP_TRUNCATE, FOLLOW | AUDITVNPATH1, UIO_USERSPACE, uap->path, ctx); if ((error = namei(&nd))) return (error); @@ -6325,6 +6489,12 @@ truncate(__unused proc_t p, struct truncate_args *uap, __unused int32_t *retval) if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) goto out; error = vnode_setattr(vp, &va, ctx); + +#if CONFIG_MACF + if (error == 0) + mac_vnode_notify_truncate(ctx, NOCRED, vp); +#endif + out: vnode_put(vp); return (error); @@ -6347,7 +6517,7 @@ ftruncate(proc_t p, struct ftruncate_args *uap, int32_t *retval) AUDIT_ARG(fd, uap->fd); if (uap->length < 0) return(EINVAL); - + if ( (error = fp_lookup(p,fd,&fp,0)) ) { return(error); } @@ -6388,6 +6558,12 @@ ftruncate(proc_t p, struct ftruncate_args *uap, int32_t *retval) VATTR_INIT(&va); VATTR_SET(&va, va_data_size, uap->length); error = vnode_setattr(vp, &va, ctx); + +#if CONFIG_MACF + if (error == 0) + mac_vnode_notify_truncate(ctx, fp->f_fglob->fg_cred, vp); +#endif + (void)vnode_put(vp); out: file_drop(fd); @@ -6414,7 +6590,7 @@ fsync(proc_t p, struct fsync_args *uap, __unused int32_t *retval) * thread cancellation points. 
*/ /* ARGSUSED */ -int +int fsync_nocancel(proc_t p, struct fsync_nocancel_args *uap, __unused int32_t *retval) { return(fsync_common(p, (struct fsync_args *)uap, MNT_WAIT)); @@ -6482,7 +6658,7 @@ fsync_common(proc_t p, struct fsync_args *uap, int flags) #if NAMEDRSRCFORK /* Sync resource fork shadow file if necessary. */ if ((error == 0) && - (vp->v_flag & VISNAMEDSTREAM) && + (vp->v_flag & VISNAMEDSTREAM) && (vp->v_parent != NULLVP) && vnode_isshadow(vp) && (fp->f_flags & FP_WRITTEN)) { @@ -6496,7 +6672,7 @@ fsync_common(proc_t p, struct fsync_args *uap, int flags) } /* - * Duplicate files. Source must be a file, target must be a file or + * Duplicate files. Source must be a file, target must be a file or * must not exist. * * XXX Copyfile authorisation checking is woefully inadequate, and will not @@ -6510,6 +6686,10 @@ copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval) struct nameidata fromnd, tond; int error; vfs_context_t ctx = vfs_context_current(); +#if CONFIG_MACF + struct filedesc *fdp = (vfs_context_proc(ctx))->p_fd; + struct vnode_attr va; +#endif /* Check that the flags are valid. */ @@ -6538,11 +6718,42 @@ copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval) goto out; } } + if (fvp->v_type == VDIR || (tvp && tvp->v_type == VDIR)) { error = EISDIR; goto out; } + /* This calls existing MAC hooks for open */ + if ((error = vn_authorize_open_existing(fvp, &fromnd.ni_cnd, FREAD, ctx, + NULL))) { + goto out; + } + + if (tvp) { + /* + * See unlinkat_internal for an explanation of the potential + * ENOENT from the MAC hook but the gist is that the MAC hook + * can fail because vn_getpath isn't able to return the full + * path. We choose to ignore this failure. 
+ */ + error = vn_authorize_unlink(tdvp, tvp, &tond.ni_cnd, ctx, NULL); + if (error && error != ENOENT) + goto out; + error = 0; + } + +#if CONFIG_MACF + VATTR_INIT(&va); + VATTR_SET(&va, va_type, fvp->v_type); + /* Mask off all but regular access permissions */ + VATTR_SET(&va, va_mode, + ((((uap->mode &~ fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT) & ACCESSPERMS)); + error = mac_vnode_check_create(ctx, tdvp, &tond.ni_cnd, &va); + if (error) + goto out; +#endif /* CONFIG_MACF */ + if ((error = vnode_authorize(tdvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) goto out; @@ -6579,49 +6790,335 @@ copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval) return (error); } +#define CLONE_SNAPSHOT_FALLBACKS_ENABLED 1 /* - * Rename files. Source and destination must either both be directories, - * or both not be directories. If target is a directory, it must be empty. + * Helper function for doing clones. The caller is expected to provide an + * iocounted source vnode and release it. */ -/* ARGSUSED */ static int -renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from, - int tofd, user_addr_t to, int segflg, vfs_rename_flags_t flags) +clonefile_internal(vnode_t fvp, boolean_t data_read_authorised, int dst_dirfd, + user_addr_t dst, uint32_t flags, vfs_context_t ctx) { vnode_t tvp, tdvp; - vnode_t fvp, fdvp; - struct nameidata *fromnd, *tond; + struct nameidata tond; int error; - int do_retry; - int retry_count; - int mntrename; - int need_event; - const char *oname = NULL; - char *from_name = NULL, *to_name = NULL; - int from_len=0, to_len=0; - int holding_mntlock; - mount_t locked_mp = NULL; - vnode_t oparent = NULLVP; -#if CONFIG_FSE - fse_info from_finfo, to_finfo; -#endif - int from_truncated=0, to_truncated; - int batched = 0; - struct vnode_attr *fvap, *tvap; - int continuing = 0; - /* carving out a chunk for structs that are too big to be on stack. 
*/ - struct { - struct nameidata from_node, to_node; - struct vnode_attr fv_attr, tv_attr; - } * __rename_data; - MALLOC(__rename_data, void *, sizeof(*__rename_data), M_TEMP, M_WAITOK); - fromnd = &__rename_data->from_node; - tond = &__rename_data->to_node; + int follow; + boolean_t free_acl; + boolean_t attr_cleanup; + enum vtype v_type; + kauth_action_t action; + struct componentname *cnp; + uint32_t defaulted; + struct vnode_attr va; - holding_mntlock = 0; - do_retry = 0; - retry_count = 0; + v_type = vnode_vtype(fvp); + switch (v_type) { + case VLNK: + /* FALLTHRU */ + case VREG: + action = KAUTH_VNODE_ADD_FILE; + break; + case VDIR: + if (vnode_isvroot(fvp) || vnode_ismount(fvp) || + fvp->v_mountedhere) { + return (EINVAL); + } + action = KAUTH_VNODE_ADD_SUBDIRECTORY; + break; + default: + return (EINVAL); + } + + AUDIT_ARG(fd2, dst_dirfd); + AUDIT_ARG(value32, flags); + + follow = (flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW; + NDINIT(&tond, CREATE, OP_LINK, follow | WANTPARENT | AUDITVNPATH2, + UIO_USERSPACE, dst, ctx); + if ((error = nameiat(&tond, dst_dirfd))) + return (error); + cnp = &tond.ni_cnd; + tdvp = tond.ni_dvp; + tvp = tond.ni_vp; + + free_acl = FALSE; + attr_cleanup = FALSE; + + if (tvp != NULL) { + error = EEXIST; + goto out; + } + + if (vnode_mount(tdvp) != vnode_mount(fvp)) { + error = EXDEV; + goto out; + } + +#if CONFIG_MACF + if ((error = mac_vnode_check_clone(ctx, tdvp, fvp, cnp))) + goto out; +#endif + if ((error = vnode_authorize(tdvp, NULL, action, ctx))) + goto out; + + action = KAUTH_VNODE_GENERIC_READ_BITS; + if (data_read_authorised) + action &= ~KAUTH_VNODE_READ_DATA; + if ((error = vnode_authorize(fvp, NULL, action, ctx))) + goto out; + + /* + * certain attributes may need to be changed from the source, we ask for + * those here. 
+ */ + VATTR_INIT(&va); + VATTR_WANTED(&va, va_type); + VATTR_WANTED(&va, va_mode); + VATTR_WANTED(&va, va_flags); + VATTR_WANTED(&va, va_acl); + + if ((error = vnode_getattr(fvp, &va, ctx)) != 0) + goto out; + + if (!VATTR_IS_SUPPORTED(&va, va_acl)) + VATTR_CLEAR_ACTIVE(&va, va_acl); + else if (va.va_acl != NULL) + free_acl = TRUE; + + if (!VATTR_IS_SUPPORTED(&va, va_mode)) { + VATTR_CLEAR_ACTIVE(&va, va_mode); + } else { + proc_t p = vfs_context_proc(ctx); + + VATTR_SET(&va, va_mode, + (va.va_mode & ACCESSPERMS) & ~p->p_fd->fd_cmask); + } + + if (!VATTR_IS_SUPPORTED(&va, va_flags)) { + VATTR_CLEAR_ACTIVE(&va, va_flags); + } else if (va.va_flags & SF_RESTRICTED) { + /* + * Turn off SF_RESTRICTED from source, if the destination needs + * it, it will be handled in vnode_authattr_new. + */ + VATTR_SET(&va, va_flags, (va.va_flags & ~SF_RESTRICTED)); + } + + /* Handle ACL inheritance, initialize vap. */ + if (v_type == VLNK) { + error = vnode_authattr_new(tdvp, &va, 0, ctx); + } else { + error = vn_attribute_prepare(tdvp, &va, &defaulted, ctx); + attr_cleanup = TRUE; + } + + if (error) { + attr_cleanup = FALSE; + goto out; + } + + error = VNOP_CLONEFILE(fvp, tdvp, &tvp, cnp, &va, flags, ctx); + + if (!error && tvp) { + int update_flags = 0; +#if CONFIG_FSE + int fsevent; +#endif /* CONFIG_FSE */ + +#if CONFIG_MACF + (void)vnode_label(vnode_mount(tvp), tdvp, tvp, cnp, + VNODE_LABEL_CREATE, ctx); +#endif + /* + * If some of the requested attributes weren't handled by the + * VNOP, use our fallback code. 
+ */ + if (!VATTR_ALL_SUPPORTED(&va)) + (void)vnode_setattr_fallback(tvp, &va, ctx); + + // Make sure the name & parent pointers are hooked up + if (tvp->v_name == NULL) + update_flags |= VNODE_UPDATE_NAME; + if (tvp->v_parent == NULLVP) + update_flags |= VNODE_UPDATE_PARENT; + + if (update_flags) { + (void)vnode_update_identity(tvp, tdvp, cnp->cn_nameptr, + cnp->cn_namelen, cnp->cn_hash, update_flags); + } + +#if CONFIG_FSE + switch (vnode_vtype(tvp)) { + case VLNK: + /* FALLTHRU */ + case VREG: + fsevent = FSE_CREATE_FILE; + break; + case VDIR: + fsevent = FSE_CREATE_DIR; + break; + default: + goto out; + } + + if (need_fsevent(fsevent, tvp)) { + add_fsevent(fsevent, ctx, FSE_ARG_VNODE, tvp, + FSE_ARG_DONE); + } +#endif /* CONFIG_FSE */ + } +#if CLONE_SNAPSHOT_FALLBACKS_ENABLED + else if (error == ENOTSUP) { + struct vfs_attr vfa; + + /* + * Fallback to VNOP_COPYFILE but check first that the + * filesystem supports cloning. + */ + VFSATTR_INIT(&vfa); + VFSATTR_WANTED(&vfa, f_capabilities); + if ((vfs_getattr(vnode_mount(tdvp), &vfa, ctx) == 0) && + VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) && + (vfa.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_CLONE) && + (vfa.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_CLONE)) { + + error = VNOP_COPYFILE(fvp, tdvp, tvp, cnp, 0, + 0, ctx); + } + } +#endif /* CLONE_SNAPSHOT_FALLBACKS_ENABLED */ + +out: + if (attr_cleanup) + vn_attribute_cleanup(&va, defaulted); + if (free_acl && va.va_acl) + kauth_acl_free(va.va_acl); + nameidone(&tond); + if (tvp) + vnode_put(tvp); + vnode_put(tdvp); + return (error); +} + +/* + * clone files or directories, target must not exist. + */ +/* ARGSUSED */ +int +clonefileat(__unused proc_t p, struct clonefileat_args *uap, + __unused int32_t *retval) +{ + vnode_t fvp; + struct nameidata fromnd; + int follow; + int error; + vfs_context_t ctx = vfs_context_current(); + + /* Check that the flags are valid. 
*/ + if (uap->flags & ~CLONE_NOFOLLOW) + return (EINVAL); + + AUDIT_ARG(fd, uap->src_dirfd); + + follow = (uap->flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW; + NDINIT(&fromnd, LOOKUP, OP_COPYFILE, follow | AUDITVNPATH1, + UIO_USERSPACE, uap->src, ctx); + if ((error = nameiat(&fromnd, uap->src_dirfd))) + return (error); + + fvp = fromnd.ni_vp; + nameidone(&fromnd); + + error = clonefile_internal(fvp, FALSE, uap->dst_dirfd, uap->dst, + uap->flags, ctx); + + vnode_put(fvp); + return (error); +} + +int +fclonefileat(__unused proc_t p, struct fclonefileat_args *uap, + __unused int32_t *retval) +{ + vnode_t fvp; + struct fileproc *fp; + int error; + vfs_context_t ctx = vfs_context_current(); + + AUDIT_ARG(fd, uap->src_fd); + error = fp_getfvp(p, uap->src_fd, &fp, &fvp); + if (error) + return (error); + + if ((fp->f_fglob->fg_flag & FREAD) == 0) { + AUDIT_ARG(vnpath_withref, fvp, ARG_VNODE1); + error = EBADF; + goto out; + } + + if ((error = vnode_getwithref(fvp))) + goto out; + + AUDIT_ARG(vnpath, fvp, ARG_VNODE1); + + error = clonefile_internal(fvp, TRUE, uap->dst_dirfd, uap->dst, + uap->flags, ctx); + + vnode_put(fvp); +out: + file_drop(uap->src_fd); + return (error); +} + +/* + * Rename files. Source and destination must either both be directories, + * or both not be directories. If target is a directory, it must be empty. 
+ */ +/* ARGSUSED */ +static int +renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from, + int tofd, user_addr_t to, int segflg, vfs_rename_flags_t flags) +{ + if (flags & ~VFS_RENAME_FLAGS_MASK) + return EINVAL; + + if (ISSET(flags, VFS_RENAME_SWAP) && ISSET(flags, VFS_RENAME_EXCL)) + return EINVAL; + + vnode_t tvp, tdvp; + vnode_t fvp, fdvp; + struct nameidata *fromnd, *tond; + int error; + int do_retry; + int retry_count; + int mntrename; + int need_event; + const char *oname = NULL; + char *from_name = NULL, *to_name = NULL; + int from_len=0, to_len=0; + int holding_mntlock; + mount_t locked_mp = NULL; + vnode_t oparent = NULLVP; +#if CONFIG_FSE + fse_info from_finfo, to_finfo; +#endif + int from_truncated=0, to_truncated; + int batched = 0; + struct vnode_attr *fvap, *tvap; + int continuing = 0; + /* carving out a chunk for structs that are too big to be on stack. */ + struct { + struct nameidata from_node, to_node; + struct vnode_attr fv_attr, tv_attr; + } * __rename_data; + MALLOC(__rename_data, void *, sizeof(*__rename_data), M_TEMP, M_WAITOK); + fromnd = &__rename_data->from_node; + tond = &__rename_data->to_node; + + holding_mntlock = 0; + do_retry = 0; + retry_count = 0; retry: fvp = tvp = NULL; fdvp = tdvp = NULL; @@ -6660,6 +7157,16 @@ renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from, tvp = tond->ni_vp; } + if (!tvp && ISSET(flags, VFS_RENAME_SWAP)) { + error = ENOENT; + goto out1; + } + + if (tvp && ISSET(flags, VFS_RENAME_EXCL)) { + error = EEXIST; + goto out1; + } + batched = vnode_compound_rename_available(fdvp); if (!fvp) { /* @@ -6680,7 +7187,7 @@ renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from, } if (!batched) { - error = vn_authorize_rename(fdvp, fvp, &fromnd->ni_cnd, tdvp, tvp, &tond->ni_cnd, ctx, NULL); + error = vn_authorize_renamex(fdvp, fvp, &fromnd->ni_cnd, tdvp, tvp, &tond->ni_cnd, ctx, flags, NULL); if (error) { if (error == ENOENT) { assert(retry_count < MAX_AUTHORIZE_ENOENT_RETRIES); @@ 
-6727,6 +7234,12 @@ renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from, * * XXX Handle this in VFS after a continued lookup (if we missed * in the cache to start off) + * + * N.B. If RENAME_SWAP is being used, then @tvp != NULL and so + * we'll skip past here. The file system is responsible for + * checking that @tvp is not a descendent of @fvp and vice versa + * so it should always return EINVAL if either @tvp or @fvp is the + * root of a volume. */ if ((fvp->v_flag & VROOT) && (fvp->v_type == VDIR) && @@ -6915,16 +7428,9 @@ renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from, to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated); } -#if CONFIG_SECLUDED_RENAME - if (flags & VFS_SECLUDE_RENAME) { - fromnd->ni_cnd.cn_flags |= CN_SECLUDE_RENAME; - } -#else - #pragma unused(flags) -#endif error = vn_rename(fdvp, &fvp, &fromnd->ni_cnd, fvap, tdvp, &tvp, &tond->ni_cnd, tvap, - 0, ctx); + flags, ctx); if (holding_mntlock) { /* @@ -6982,6 +7488,11 @@ renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from, kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_RENAME, (uintptr_t)from_name, (uintptr_t)to_name); + if (flags & VFS_RENAME_SWAP) { + kauth_authorize_fileop(vfs_context_ucred(ctx), + KAUTH_FILEOP_RENAME, + (uintptr_t)to_name, (uintptr_t)from_name); + } #if CONFIG_FSE if (from_name != NULL && to_name != NULL) { @@ -6997,13 +7508,27 @@ renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from, vnode_get_fse_info_from_vap(fvp, &from_finfo, fvap); } - if (tvp) { - add_fsevent(FSE_RENAME, ctx, - FSE_ARG_STRING, from_len, from_name, - FSE_ARG_FINFO, &from_finfo, - FSE_ARG_STRING, to_len, to_name, - FSE_ARG_FINFO, &to_finfo, - FSE_ARG_DONE); + if (tvp) { + add_fsevent(FSE_RENAME, ctx, + FSE_ARG_STRING, from_len, from_name, + FSE_ARG_FINFO, &from_finfo, + FSE_ARG_STRING, to_len, to_name, + FSE_ARG_FINFO, &to_finfo, + FSE_ARG_DONE); + if (flags & VFS_RENAME_SWAP) { + /* + * Strictly 
speaking, swap is the equivalent of + * *three* renames. FSEvents clients should only take + * the events as a hint, so we only bother reporting + * two. + */ + add_fsevent(FSE_RENAME, ctx, + FSE_ARG_STRING, to_len, to_name, + FSE_ARG_FINFO, &to_finfo, + FSE_ARG_STRING, from_len, from_name, + FSE_ARG_FINFO, &from_finfo, + FSE_ARG_DONE); + } } else { add_fsevent(FSE_RENAME, ctx, FSE_ARG_STRING, from_len, from_name, @@ -7132,17 +7657,15 @@ rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval) AT_FDCWD, uap->to, UIO_USERSPACE, 0)); } -#if CONFIG_SECLUDED_RENAME -int rename_ext(__unused proc_t p, struct rename_ext_args *uap, __unused int32_t *retval) +int renameatx_np(__unused proc_t p, struct renameatx_np_args *uap, __unused int32_t *retval) { return renameat_internal( - vfs_context_current(), - AT_FDCWD, uap->from, - AT_FDCWD, uap->to, + vfs_context_current(), + uap->fromfd, uap->from, + uap->tofd, uap->to, UIO_USERSPACE, uap->flags); } -#endif - + int renameat(__unused proc_t p, struct renameat_args *uap, __unused int32_t *retval) { @@ -7571,7 +8094,7 @@ vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag, int *numdirent, vfs_context_t ctxp) { /* Check if fs natively supports VNODE_READDIR_EXTENDED */ - if ((vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSREADDIR_EXTENDED) && + if ((vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSREADDIR_EXTENDED) && ((vp->v_mount->mnt_kern_flag & MNTK_DENY_READDIREXT) == 0)) { return VNOP_READDIR(vp, uio, flags, eofflag, numdirent, ctxp); } else { @@ -7592,9 +8115,9 @@ vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag, * will prevent us from reading more than we can pack. * * Since this buffer is wired memory, we will limit the - * buffer size to a maximum of 32K. We would really like to + * buffer size to a maximum of 32K. We would really like to * use 32K in the MIN(), but we use magic number 87371 to - * prevent uio_resid() * 3 / 8 from overflowing. 
+ * prevent uio_resid() * 3 / 8 from overflowing. */ bufsize = 3 * MIN((user_size_t)uio_resid(uio), 87371u) / 8; MALLOC(bufptr, void *, bufsize, M_TEMP, M_WAITOK); @@ -7745,7 +8268,7 @@ getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *byt if (offset) { *offset = loff; } - + *bytesread = bufsize - uio_resid(auio); out: file_drop(fd); @@ -7822,7 +8345,7 @@ umask1(proc_t p, int newmask, __unused kauth_filesec_t fsec, int32_t *retval) * * Indirect: uap->newmask umask to set * uap->xsecurity ACL to set - * + * * Returns: 0 Success * !0 Not success * @@ -7931,14 +8454,14 @@ getdirentriesattr (proc_t p, struct getdirentriesattr_args *uap, int32_t *retval uint32_t newstate; int error, eofflag; uint32_t loff; - struct attrlist attributelist; + struct attrlist attributelist; vfs_context_t ctx = vfs_context_current(); int fd = uap->fd; char uio_buf[ UIO_SIZEOF(1) ]; kauth_action_t action; AUDIT_ARG(fd, fd); - + /* Get the attributes into kernel space */ if ((error = copyin(uap->alist, (caddr_t)&attributelist, sizeof(attributelist)))) { return(error); @@ -7989,7 +8512,7 @@ getdirentriesattr (proc_t p, struct getdirentriesattr_args *uap, int32_t *retval loff = fp->f_fglob->fg_offset; auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf)); uio_addiov(auio, uap->buffer, uap->buffersize); - + /* * If the only item requested is file names, we can let that past with * just LIST_DIRECTORY. 
If they want any other attributes, that means @@ -7999,7 +8522,7 @@ getdirentriesattr (proc_t p, struct getdirentriesattr_args *uap, int32_t *retval if ((attributelist.commonattr & ~ATTR_CMN_NAME) || attributelist.fileattr || attributelist.dirattr) action |= KAUTH_VNODE_SEARCH; - + if ((error = vnode_authorize(vp, NULL, action, ctx)) == 0) { /* Believe it or not, uap->options only has 32-bits of valid @@ -8041,7 +8564,7 @@ getdirentriesattr (proc_t p, struct getdirentriesattr_args *uap, int32_t *retval (void)vnode_put(vp); - if (error) + if (error) goto out; fp->f_fglob->fg_offset = uio_offset(auio); /* should be multiple of dirent, not variable */ @@ -8082,7 +8605,7 @@ exchangedata (__unused proc_t p, struct exchangedata_args *uap, __unused int32_t #if CONFIG_FSE fse_info f_finfo, s_finfo; #endif - + nameiflags = 0; if ((uap->options & FSOPT_NOFOLLOW) == 0) nameiflags |= FOLLOW; @@ -8096,7 +8619,7 @@ exchangedata (__unused proc_t p, struct exchangedata_args *uap, __unused int32_t nameidone(&fnd); fvp = fnd.ni_vp; - NDINIT(&snd, LOOKUP, OP_EXCHANGEDATA, CN_NBMOUNTLOOK | nameiflags | AUDITVNPATH2, + NDINIT(&snd, LOOKUP, OP_EXCHANGEDATA, CN_NBMOUNTLOOK | nameiflags | AUDITVNPATH2, UIO_USERSPACE, uap->path2, ctx); error = namei(&snd); @@ -8113,7 +8636,7 @@ exchangedata (__unused proc_t p, struct exchangedata_args *uap, __unused int32_t if (svp == fvp) { error = EINVAL; goto out; - } + } /* * if the files are on different volumes, return an error @@ -8141,7 +8664,7 @@ exchangedata (__unused proc_t p, struct exchangedata_args *uap, __unused int32_t if ( #if CONFIG_FSE - need_fsevent(FSE_EXCHANGE, fvp) || + need_fsevent(FSE_EXCHANGE, fvp) || #endif kauth_authorize_fileop_has_listeners()) { GET_PATH(fpath); @@ -8153,7 +8676,7 @@ exchangedata (__unused proc_t p, struct exchangedata_args *uap, __unused int32_t flen = safe_getpath(fvp, NULL, fpath, MAXPATHLEN, &from_truncated); slen = safe_getpath(svp, NULL, spath, MAXPATHLEN, &to_truncated); - + #if CONFIG_FSE 
get_fse_info(fvp, &f_finfo, ctx); get_fse_info(svp, &s_finfo, ctx); @@ -8170,10 +8693,10 @@ exchangedata (__unused proc_t p, struct exchangedata_args *uap, __unused int32_t const char *tmpname; if (fpath != NULL && spath != NULL) { - /* call out to allow 3rd party notification of exchangedata. + /* call out to allow 3rd party notification of exchangedata. * Ignore result of kauth_authorize_fileop call. */ - kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_EXCHANGE, + kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_EXCHANGE, (uintptr_t)fpath, (uintptr_t)spath); } name_cache_lock(); @@ -8181,7 +8704,7 @@ exchangedata (__unused proc_t p, struct exchangedata_args *uap, __unused int32_t tmpname = fvp->v_name; fvp->v_name = svp->v_name; svp->v_name = tmpname; - + if (fvp->v_parent != svp->v_parent) { vnode_t tmp; @@ -8222,7 +8745,7 @@ uint32_t freespace_mb(vnode_t vp); uint32_t freespace_mb(vnode_t vp) { - vfs_update_vfsstat(vp->v_mount, vfs_context_current(), VFS_USER_EVENT); + vfs_update_vfsstat(vp->v_mount, vfs_context_current(), VFS_USER_EVENT); return (((uint64_t)vp->v_mount->mnt_vfsstat.f_bavail * vp->v_mount->mnt_vfsstat.f_bsize) >> 20); } @@ -8266,7 +8789,7 @@ searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval) searchblock.returnbuffer = CAST_USER_ADDR_T(tmp_searchblock.returnbuffer); searchblock.returnbuffersize = tmp_searchblock.returnbuffersize; searchblock.maxmatches = tmp_searchblock.maxmatches; - /* + /* * These casts are safe. We will promote the tv_sec into a 64 bit long if necessary * from a 32 bit long, and tv_usec is already a signed 32 bit int. */ @@ -8281,12 +8804,12 @@ searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval) if (error) return(error); - /* Do a sanity check on sizeofsearchparams1 and sizeofsearchparams2. + /* Do a sanity check on sizeofsearchparams1 and sizeofsearchparams2. 
*/ - if (searchblock.sizeofsearchparams1 > SEARCHFS_MAX_SEARCHPARMS || + if (searchblock.sizeofsearchparams1 > SEARCHFS_MAX_SEARCHPARMS || searchblock.sizeofsearchparams2 > SEARCHFS_MAX_SEARCHPARMS) return(EINVAL); - + /* Now malloc a big bunch of space to hold the search parameters, the attrlists and the search state. */ /* It all has to do into local memory and it's not that big so we might as well put it all together. */ /* Searchparams1 shall be first so we might as well use that to hold the base address of the allocated*/ @@ -8295,7 +8818,7 @@ searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval) /* NOTE: we allocate an extra 8 bytes to account for the difference in size of the searchstate */ /* due to the changes in rdar://problem/12438273. That way if a 3rd party file system */ /* assumes the size is still 556 bytes it will continue to work */ - + mallocsize = searchblock.sizeofsearchparams1 + searchblock.sizeofsearchparams2 + sizeof(struct attrlist) + sizeof(struct searchstate) + (2*sizeof(uint32_t)); @@ -8317,7 +8840,7 @@ searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval) if ((error = copyin(searchblock.returnattrs, (caddr_t) returnattrs, sizeof(struct attrlist)))) goto freeandexit; - + if ((error = copyin(uap->state, (caddr_t) state, sizeof(struct searchstate)))) goto freeandexit; @@ -8328,25 +8851,25 @@ searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval) */ if (uap->options & SRCHFS_START) state->ss_union_layer = 0; - else + else uap->options |= state->ss_union_flags; state->ss_union_flags = 0; /* * Because searchparams1 and searchparams2 may contain an ATTR_CMN_NAME search parameter, * which is passed in with an attrreference_t, we need to inspect the buffer manually here. - * The KPI does not provide us the ability to pass in the length of the buffers searchparams1 - * and searchparams2. 
To obviate the need for all searchfs-supporting filesystems to + * The KPI does not provide us the ability to pass in the length of the buffers searchparams1 + * and searchparams2. To obviate the need for all searchfs-supporting filesystems to * validate the user-supplied data offset of the attrreference_t, we'll do it here. */ if (searchblock.searchattrs.commonattr & ATTR_CMN_NAME) { attrreference_t* string_ref; u_int32_t* start_length; - user64_size_t param_length; + user64_size_t param_length; /* validate searchparams1 */ - param_length = searchblock.sizeofsearchparams1; + param_length = searchblock.sizeofsearchparams1; /* skip the word that specifies length of the buffer */ start_length= (u_int32_t*) searchparams1; start_length= start_length+1; @@ -8355,13 +8878,13 @@ searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval) /* ensure no negative offsets or too big offsets */ if (string_ref->attr_dataoffset < 0 ) { error = EINVAL; - goto freeandexit; + goto freeandexit; } if (string_ref->attr_length > MAXPATHLEN) { error = EINVAL; goto freeandexit; } - + /* Check for pointer overflow in the string ref */ if (((char*) string_ref + string_ref->attr_dataoffset) < (char*) string_ref) { error = EINVAL; @@ -8430,9 +8953,9 @@ searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval) } #endif - + /* - * If searchblock.maxmatches == 0, then skip the search. This has happened + * If searchblock.maxmatches == 0, then skip the search. This has happened * before and sometimes the underlying code doesnt deal with it well. */ if (searchblock.maxmatches == 0) { @@ -8442,7 +8965,7 @@ searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval) /* * Allright, we have everything we need, so lets make that call. - * + * * We keep special track of the return value from the file system: * EAGAIN is an acceptable error condition that shouldn't keep us * from copying out any results... 
@@ -8461,7 +8984,7 @@ searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval) auio, (struct searchstate *) &state->ss_fsstate, ctx); - + /* * If it's a union mount we need to be called again * to search the mounted-on filesystem. @@ -8484,7 +9007,7 @@ searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval) if ((error = suulong(uap->nummatches, (uint64_t)nummatches)) != 0) goto freeandexit; - + error = fserror; freeandexit: @@ -8653,7 +9176,7 @@ void nspace_proc_exit(struct proc *p) { int i, event_mask = 0; - + for (i = 0; i < NSPACE_HANDLER_COUNT; i++) { if (p == nspace_handlers[i].handler_proc) { event_mask |= nspace_item_flags_for_type(i); @@ -8665,16 +9188,16 @@ nspace_proc_exit(struct proc *p) if (event_mask == 0) { return; } - + + lck_mtx_lock(&nspace_handler_lock); if (event_mask & NSPACE_ITEM_SNAPSHOT_EVENT) { // if this process was the snapshot handler, zero snapshot_timeout snapshot_timestamp = 0; } - + // // unblock anyone that's waiting for the handler that died // - lck_mtx_lock(&nspace_handler_lock); for(i=0; i < MAX_NSPACE_ITEMS; i++) { if (nspace_items[i].flags & (NSPACE_ITEM_NEW | NSPACE_ITEM_PROCESSING)) { @@ -8689,24 +9212,24 @@ nspace_proc_exit(struct proc *p) nspace_items[i].vid = 0; nspace_items[i].flags = NSPACE_ITEM_DONE; nspace_items[i].token = 0; - + wakeup((caddr_t)&(nspace_items[i].vp)); } } } - + wakeup((caddr_t)&nspace_item_idx); lck_mtx_unlock(&nspace_handler_lock); } -int +int resolve_nspace_item(struct vnode *vp, uint64_t op) { return resolve_nspace_item_ext(vp, op, NULL); } -int +int resolve_nspace_item_ext(struct vnode *vp, uint64_t op, void *arg) { int i, error, keep_waiting; @@ -8764,7 +9287,7 @@ resolve_nspace_item_ext(struct vnode *vp, uint64_t op, void *arg) } else { nspace_items[i].refcount++; } - + if (i >= MAX_NSPACE_ITEMS) { ts.tv_sec = nspace_handler_timeout; ts.tv_nsec = 0; @@ -8801,7 +9324,7 @@ resolve_nspace_item_ext(struct vnode *vp, uint64_t op, void *arg) nspace_items[i].token = 0; 
nspace_items[i].refcount = 1; - + wakeup((caddr_t)&nspace_item_idx); } @@ -8830,7 +9353,7 @@ resolve_nspace_item_ext(struct vnode *vp, uint64_t op, void *arg) // hmmm, why did we get woken up? printf("woken up for token %d but it's not done, cancelled or timedout and error == 0.\n", nspace_items[i].token); - } + } if (--nspace_items[i].refcount == 0) { nspace_items[i].vp = NULL; // clear this so that no one will match on it again @@ -8847,16 +9370,48 @@ resolve_nspace_item_ext(struct vnode *vp, uint64_t op, void *arg) return error; } - -int -get_nspace_item_status(struct vnode *vp, int32_t *status) +int nspace_snapshot_event(vnode_t vp, time_t ctime, uint64_t op_type, void *arg) { - int i; + int snapshot_error = 0; - lck_mtx_lock(&nspace_handler_lock); - for(i=0; i < MAX_NSPACE_ITEMS; i++) { - if (nspace_items[i].vp == vp) { - break; + if (vp == NULL) { + return 0; + } + + /* Swap files are special; skip them */ + if (vnode_isswap(vp)) { + return 0; + } + + if (ctime != 0 && snapshot_timestamp != 0 && (ctime <= snapshot_timestamp || vnode_needssnapshots(vp))) { + // the change time is within this epoch + int error; + + error = resolve_nspace_item_ext(vp, op_type | NAMESPACE_HANDLER_SNAPSHOT_EVENT, arg); + if (error == EDEADLK) { + snapshot_error = 0; + } else if (error) { + if (error == EAGAIN) { + printf("nspace_snapshot_event: timed out waiting for namespace handler...\n"); + } else if (error == EINTR) { + // printf("nspace_snapshot_event: got a signal while waiting for namespace handler...\n"); + snapshot_error = EINTR; + } + } + } + + return snapshot_error; +} + +int +get_nspace_item_status(struct vnode *vp, int32_t *status) +{ + int i; + + lck_mtx_lock(&nspace_handler_lock); + for(i=0; i < MAX_NSPACE_ITEMS; i++) { + if (nspace_items[i].vp == vp) { + break; } } @@ -8869,7 +9424,7 @@ get_nspace_item_status(struct vnode *vp, int32_t *status) lck_mtx_unlock(&nspace_handler_lock); return 0; } - + #if 0 static int @@ -8937,7 +9492,7 @@ vn_open_with_vp(vnode_t vp, 
int fmode, vfs_context_t ctx) if ((error = vnode_authorize(vp, NULL, action, ctx)) != 0) return error; - + // // if the vnode is tagged VOPENEVT and the current process @@ -8960,13 +9515,13 @@ vn_open_with_vp(vnode_t vp, int fmode, vfs_context_t ctx) return error; } - /* Call out to allow 3rd party notification of open. + /* Call out to allow 3rd party notification of open. * Ignore result of kauth_authorize_fileop call. */ #if CONFIG_MACF mac_vnode_notify_open(ctx, vp, fmode); #endif - kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_OPEN, + kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_OPEN, (uintptr_t)vp, 0); @@ -8976,157 +9531,163 @@ vn_open_with_vp(vnode_t vp, int fmode, vfs_context_t ctx) static int wait_for_namespace_event(namespace_handler_data *nhd, nspace_type_t nspace_type) { - int i, error=0, unblock=0; + int i; + int error = 0; + int unblock = 0; task_t curtask; - + lck_mtx_lock(&nspace_handler_exclusion_lock); if (nspace_handlers[nspace_type].handler_busy) { lck_mtx_unlock(&nspace_handler_exclusion_lock); return EBUSY; } + nspace_handlers[nspace_type].handler_busy = 1; lck_mtx_unlock(&nspace_handler_exclusion_lock); - - /* + + /* * Any process that gets here will be one of the namespace handlers. * As such, they should be prevented from acquiring DMG vnodes during vnode reclamation * as we can cause deadlocks to occur, because the namespace handler may prevent - * VNOP_INACTIVE from proceeding. Mark the current task as a P_DEPENDENCY_CAPABLE + * VNOP_INACTIVE from proceeding. Mark the current task as a P_DEPENDENCY_CAPABLE * process. 
*/ curtask = current_task(); - bsd_set_dependency_capable (curtask); - + bsd_set_dependency_capable (curtask); + lck_mtx_lock(&nspace_handler_lock); if (nspace_handlers[nspace_type].handler_proc == NULL) { nspace_handlers[nspace_type].handler_tid = thread_tid(current_thread()); nspace_handlers[nspace_type].handler_proc = current_proc(); } - + + if (nspace_type == NSPACE_HANDLER_SNAPSHOT && + (snapshot_timestamp == 0 || snapshot_timestamp == ~0)) { + error = EINVAL; + } + while (error == 0) { - - for(i=0; i < MAX_NSPACE_ITEMS; i++) { + + /* Try to find matching namespace item */ + for (i = 0; i < MAX_NSPACE_ITEMS; i++) { if (nspace_items[i].flags & NSPACE_ITEM_NEW) { - if (!nspace_flags_matches_handler(nspace_items[i].flags, nspace_type)) { - continue; + if (nspace_flags_matches_handler(nspace_items[i].flags, nspace_type)) { + break; } - break; } } - - if (i < MAX_NSPACE_ITEMS) { - nspace_items[i].flags &= ~NSPACE_ITEM_NEW; - nspace_items[i].flags |= NSPACE_ITEM_PROCESSING; - nspace_items[i].token = ++nspace_token_id; - - if (nspace_items[i].vp) { - struct fileproc *fp; - int32_t indx, fmode; - struct proc *p = current_proc(); - vfs_context_t ctx = vfs_context_current(); - struct vnode_attr va; - - - /* - * Use vnode pointer to acquire a file descriptor for - * hand-off to userland - */ - fmode = nspace_open_flags_for_type(nspace_type); - error = vnode_getwithvid(nspace_items[i].vp, nspace_items[i].vid); - if (error) { - unblock = 1; - break; - } - error = vn_open_with_vp(nspace_items[i].vp, fmode, ctx); - if (error) { - unblock = 1; - vnode_put(nspace_items[i].vp); - break; - } - - if ((error = falloc(p, &fp, &indx, ctx))) { - vn_close(nspace_items[i].vp, fmode, ctx); - vnode_put(nspace_items[i].vp); - unblock = 1; - break; - } - - fp->f_fglob->fg_flag = fmode; - fp->f_fglob->fg_ops = &vnops; - fp->f_fglob->fg_data = (caddr_t)nspace_items[i].vp; - - proc_fdlock(p); - procfdtbl_releasefd(p, indx, NULL); - fp_drop(p, indx, fp, 1); - proc_fdunlock(p); - - /* - * All 
variants of the namespace handler struct support these three fields: - * token, flags, and the FD pointer - */ - error = copyout(&nspace_items[i].token, nhd->token, sizeof(uint32_t)); - error = copyout(&nspace_items[i].op, nhd->flags, sizeof(uint64_t)); - error = copyout(&indx, nhd->fdptr, sizeof(uint32_t)); - - /* - * Handle optional fields: - * extended version support an info ptr (offset, length), and the - * - * namedata version supports a unique per-link object ID - * - */ - if (nhd->infoptr) { - uio_t uio = (uio_t)nspace_items[i].arg; - uint64_t u_offset, u_length; - - if (uio) { - u_offset = uio_offset(uio); - u_length = uio_resid(uio); - } else { - u_offset = 0; - u_length = 0; - } - error = copyout(&u_offset, nhd->infoptr, sizeof(uint64_t)); - error = copyout(&u_length, nhd->infoptr+sizeof(uint64_t), sizeof(uint64_t)); - } - if (nhd->objid) { - VATTR_INIT(&va); - VATTR_WANTED(&va, va_linkid); - error = vnode_getattr(nspace_items[i].vp, &va, ctx); - if (error == 0 ) { - uint64_t linkid = 0; - if (VATTR_IS_SUPPORTED (&va, va_linkid)) { - linkid = (uint64_t)va.va_linkid; - } - error = copyout (&linkid, nhd->objid, sizeof(uint64_t)); - } - } - - if (error) { - vn_close(nspace_items[i].vp, fmode, ctx); - fp_free(p, indx, fp); - unblock = 1; - } - - vnode_put(nspace_items[i].vp); - - break; - } else { - printf("wait_for_nspace_event: failed (nspace_items[%d] == %p error %d, name %s)\n", - i, nspace_items[i].vp, error, nspace_items[i].vp->v_name); - } - - } else { + if (i >= MAX_NSPACE_ITEMS) { + /* Nothing is there yet. 
Wait for wake up and retry */ error = msleep((caddr_t)&nspace_item_idx, &nspace_handler_lock, PVFS|PCATCH, "namespace-items", 0); if ((nspace_type == NSPACE_HANDLER_SNAPSHOT) && (snapshot_timestamp == 0 || snapshot_timestamp == ~0)) { + /* Prevent infinite loop if snapshot handler exited */ error = EINVAL; break; } - + continue; } + + nspace_items[i].flags &= ~NSPACE_ITEM_NEW; + nspace_items[i].flags |= NSPACE_ITEM_PROCESSING; + nspace_items[i].token = ++nspace_token_id; + + assert(nspace_items[i].vp); + struct fileproc *fp; + int32_t indx; + int32_t fmode; + struct proc *p = current_proc(); + vfs_context_t ctx = vfs_context_current(); + struct vnode_attr va; + bool vn_get_succsessful = false; + bool vn_open_successful = false; + bool fp_alloc_successful = false; + + /* + * Use vnode pointer to acquire a file descriptor for + * hand-off to userland + */ + fmode = nspace_open_flags_for_type(nspace_type); + error = vnode_getwithvid(nspace_items[i].vp, nspace_items[i].vid); + if (error) goto cleanup; + vn_get_succsessful = true; + + error = vn_open_with_vp(nspace_items[i].vp, fmode, ctx); + if (error) goto cleanup; + vn_open_successful = true; + + error = falloc(p, &fp, &indx, ctx); + if (error) goto cleanup; + fp_alloc_successful = true; + + fp->f_fglob->fg_flag = fmode; + fp->f_fglob->fg_ops = &vnops; + fp->f_fglob->fg_data = (caddr_t)nspace_items[i].vp; + + proc_fdlock(p); + procfdtbl_releasefd(p, indx, NULL); + fp_drop(p, indx, fp, 1); + proc_fdunlock(p); + + /* + * All variants of the namespace handler struct support these three fields: + * token, flags, and the FD pointer + */ + error = copyout(&nspace_items[i].token, nhd->token, sizeof(uint32_t)); + if (error) goto cleanup; + error = copyout(&nspace_items[i].op, nhd->flags, sizeof(uint64_t)); + if (error) goto cleanup; + error = copyout(&indx, nhd->fdptr, sizeof(uint32_t)); + if (error) goto cleanup; + + /* + * Handle optional fields: + * extended version support an info ptr (offset, length), and the + * + * 
namedata version supports a unique per-link object ID + * + */ + if (nhd->infoptr) { + uio_t uio = (uio_t)nspace_items[i].arg; + uint64_t u_offset, u_length; + + if (uio) { + u_offset = uio_offset(uio); + u_length = uio_resid(uio); + } else { + u_offset = 0; + u_length = 0; + } + error = copyout(&u_offset, nhd->infoptr, sizeof(uint64_t)); + if (error) goto cleanup; + error = copyout(&u_length, nhd->infoptr + sizeof(uint64_t), sizeof(uint64_t)); + if (error) goto cleanup; + } + + if (nhd->objid) { + VATTR_INIT(&va); + VATTR_WANTED(&va, va_linkid); + error = vnode_getattr(nspace_items[i].vp, &va, ctx); + if (error) goto cleanup; + + uint64_t linkid = 0; + if (VATTR_IS_SUPPORTED (&va, va_linkid)) { + linkid = (uint64_t)va.va_linkid; + } + error = copyout(&linkid, nhd->objid, sizeof(uint64_t)); + } +cleanup: + if (error) { + if (fp_alloc_successful) fp_free(p, indx, fp); + if (vn_open_successful) vn_close(nspace_items[i].vp, fmode, ctx); + unblock = 1; + } + + if (vn_get_succsessful) vnode_put(nspace_items[i].vp); + + break; } - + if (unblock) { if (nspace_items[i].vp && (nspace_items[i].vp->v_flag & VNEEDSSNAPSHOT)) { vnode_lock_spin(nspace_items[i].vp); @@ -9137,34 +9698,34 @@ wait_for_namespace_event(namespace_handler_data *nhd, nspace_type_t nspace_type) nspace_items[i].vid = 0; nspace_items[i].flags = NSPACE_ITEM_DONE; nspace_items[i].token = 0; - + wakeup((caddr_t)&(nspace_items[i].vp)); } - + if (nspace_type == NSPACE_HANDLER_SNAPSHOT) { // just go through every snapshot event and unblock it immediately. 
if (error && (snapshot_timestamp == 0 || snapshot_timestamp == ~0)) { - for(i=0; i < MAX_NSPACE_ITEMS; i++) { + for(i = 0; i < MAX_NSPACE_ITEMS; i++) { if (nspace_items[i].flags & NSPACE_ITEM_NEW) { if (nspace_flags_matches_handler(nspace_items[i].flags, nspace_type)) { nspace_items[i].vp = NULL; nspace_items[i].vid = 0; nspace_items[i].flags = NSPACE_ITEM_DONE; nspace_items[i].token = 0; - - wakeup((caddr_t)&(nspace_items[i].vp)); + + wakeup((caddr_t)&(nspace_items[i].vp)); } } } } } - + lck_mtx_unlock(&nspace_handler_lock); - + lck_mtx_lock(&nspace_handler_exclusion_lock); nspace_handlers[nspace_type].handler_busy = 0; lck_mtx_unlock(&nspace_handler_exclusion_lock); - + return error; } @@ -9207,23 +9768,18 @@ static int process_namespace_fsctl(nspace_type_t nspace_type, int is64bit, u_int { int error = 0; namespace_handler_data nhd; - + bzero (&nhd, sizeof(namespace_handler_data)); - if (nspace_type == NSPACE_HANDLER_SNAPSHOT && - (snapshot_timestamp == 0 || snapshot_timestamp == ~0)) { - return EINVAL; - } - if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) { return error; } - + error = validate_namespace_args (is64bit, size); if (error) { return error; } - + /* Copy in the userland pointers into our kernel-only struct */ if (is64bit) { @@ -9242,13 +9798,13 @@ static int process_namespace_fsctl(nspace_type_t nspace_type, int is64bit, u_int } /* Otherwise the fields were pre-zeroed when we did the bzero above. 
*/ } - } + } else { /* 32 bit userland structures */ nhd.token = CAST_USER_ADDR_T(((user32_namespace_handler_info *)data)->token); nhd.flags = CAST_USER_ADDR_T(((user32_namespace_handler_info *)data)->flags); nhd.fdptr = CAST_USER_ADDR_T(((user32_namespace_handler_info *)data)->fdptr); - + if (size > (sizeof(user32_namespace_handler_info))) { if (size >= (sizeof(user32_namespace_handler_info_ext))) { nhd.infoptr = CAST_USER_ADDR_T(((user32_namespace_handler_info_ext *)data)->infoptr); @@ -9259,7 +9815,7 @@ static int process_namespace_fsctl(nspace_type_t nspace_type, int is64bit, u_int /* Otherwise the fields were pre-zeroed when we did the bzero above. */ } } - + return wait_for_namespace_event(&nhd, nspace_type); } @@ -9274,7 +9830,7 @@ fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long boolean_t is64bit; u_int size; #define STK_PARAMS 128 - char stkbuf[STK_PARAMS]; + char stkbuf[STK_PARAMS] = {0}; caddr_t data, memp; vnode_t vp = *arg_vp; @@ -9308,13 +9864,13 @@ fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long } else { data = &stkbuf[0]; }; - + if (cmd & IOC_IN) { if (size) { error = copyin(udata, data, size); - if (error) { + if (error) { if (memp) { - kfree (memp, size); + kfree (memp, size); } return error; } @@ -9365,7 +9921,7 @@ fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long /* issue the sync for this volume */ (void)sync_callback(mp, (arg & FSCTL_SYNC_WAIT) ? 
&arg : NULL); - /* + /* * Then release the mount_iterref once we're done syncing; it's not * needed for the VNOP_IOCTL below */ @@ -9388,7 +9944,7 @@ fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long #if ROUTEFS char routepath[MAXPATHLEN]; size_t len = 0; - + if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) { break; } @@ -9410,6 +9966,9 @@ fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long uint32_t num_entries; uint32_t max_width; + if ((error = priv_check_cred(kauth_cred_get(), PRIV_PACKAGE_EXTENSIONS, 0))) + break; + if ( (is64bit && size != sizeof(user64_package_ext_info)) || (is64bit == 0 && size != sizeof(user32_package_ext_info))) { @@ -9433,7 +9992,7 @@ fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long } break; - /* namespace handlers */ + /* namespace handlers */ case FSCTL_NAMESPACE_HANDLER_GET: { error = process_namespace_fsctl(NSPACE_HANDLER_NSPACE, is64bit, size, data); } @@ -9442,13 +10001,13 @@ fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long /* Snapshot handlers */ case FSCTL_OLD_SNAPSHOT_HANDLER_GET: { error = process_namespace_fsctl(NSPACE_HANDLER_SNAPSHOT, is64bit, size, data); - } + } break; case FSCTL_SNAPSHOT_HANDLER_GET_EXT: { error = process_namespace_fsctl(NSPACE_HANDLER_SNAPSHOT, is64bit, size, data); } - break; + break; case FSCTL_NAMESPACE_HANDLER_UPDATE: { uint32_t token, val; @@ -9489,10 +10048,10 @@ fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long if (error) { printf("nspace-handler-update: did not find token %u\n", token); } - } + } break; - - case FSCTL_NAMESPACE_HANDLER_UNBLOCK: { + + case FSCTL_NAMESPACE_HANDLER_UNBLOCK: { uint32_t token, val; int i; @@ -9537,7 +10096,7 @@ fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long } lck_mtx_unlock(&nspace_handler_lock); - } + } break; case FSCTL_NAMESPACE_HANDLER_CANCEL: { @@ -9574,18 +10133,18 
@@ fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long vnode_unlock(nspace_items[i].vp); } - nspace_items[i].vp = NULL; - nspace_items[i].arg = NULL; + nspace_items[i].vp = NULL; + nspace_items[i].arg = NULL; nspace_items[i].vid = 0; nspace_items[i].token = val; nspace_items[i].flags &= ~NSPACE_ITEM_PROCESSING; - nspace_items[i].flags |= NSPACE_ITEM_CANCELLED; + nspace_items[i].flags |= NSPACE_ITEM_CANCELLED; wakeup((caddr_t)&(nspace_items[i].vp)); } lck_mtx_unlock(&nspace_handler_lock); - } + } break; case FSCTL_NAMESPACE_HANDLER_SET_SNAPSHOT_TIME: { @@ -9601,7 +10160,7 @@ fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long lck_mtx_unlock(&nspace_handler_lock); printf("nspace-handler-set-snapshot-time: %d\n", (int)snapshot_timestamp); - } + } break; case FSCTL_NAMESPACE_ALLOW_DMG_SNAPSHOT_EVENTS: @@ -9620,8 +10179,8 @@ fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long } break; - case FSCTL_SET_FSTYPENAME_OVERRIDE: - { + case FSCTL_SET_FSTYPENAME_OVERRIDE: + { if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) { break; } @@ -9645,7 +10204,7 @@ fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long } } break; - + default: { /* Invoke the filesystem-specific code */ error = VNOP_IOCTL(vp, IOCBASECMD(cmd), data, options, ctx); @@ -9657,13 +10216,13 @@ fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long * if no errors, copy any data to user. Size was * already set and checked above. 
*/ - if (error == 0 && (cmd & IOC_OUT) && size) + if (error == 0 && (cmd & IOC_OUT) && size) error = copyout(data, udata, size); - + if (memp) { kfree(memp, size); } - + return error; } @@ -9672,7 +10231,7 @@ int fsctl (proc_t p, struct fsctl_args *uap, __unused int32_t *retval) { int error; - struct nameidata nd; + struct nameidata nd; u_long nameiflags; vnode_t vp = NULL; vfs_context_t ctx = vfs_context_current(); @@ -9714,7 +10273,7 @@ ffsctl (proc_t p, struct ffsctl_args *uap, __unused int32_t *retval) AUDIT_ARG(fd, uap->fd); AUDIT_ARG(cmd, uap->cmd); AUDIT_ARG(value32, uap->options); - + /* Get the vnode for the file we are getting info on: */ if ((error = file_vnode(uap->fd, &vp))) return error; @@ -9786,14 +10345,14 @@ getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval) /* * the specific check for 0xffffffff is a hack to preserve * binaray compatibilty in K64 with applications that discovered - * that passing in a buf pointer and a size of -1 resulted in + * that passing in a buf pointer and a size of -1 resulted in * just the size of the indicated extended attribute being returned. * this isn't part of the documented behavior, but because of the * original implemtation's check for "uap->size > 0", this behavior * was allowed. In K32 that check turned into a signed comparison * even though uap->size is unsigned... in K64, we blow by that * check because uap->size is unsigned and doesn't get sign smeared - * in the munger for a 32 bit user app. we also need to add a + * in the munger for a 32 bit user app. we also need to add a * check to limit the maximum size of the buffer being passed in... * unfortunately, the underlying fileystems seem to just malloc * the requested size even if the actual extended attribute is tiny. 
@@ -9810,7 +10369,7 @@ getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval) if (uap->value) { if (uap->size > (size_t)XATTR_MAXSIZE) uap->size = XATTR_MAXSIZE; - + auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf)); uio_addiov(auio, uap->value, uap->size); @@ -10158,7 +10717,7 @@ flistxattr(proc_t p, struct flistxattr_args *uap, user_ssize_t *retval) return(error); } if (uap->namebuf != 0 && uap->bufsize > 0) { - auio = uio_createwithbuffer(1, 0, spacetype, + auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf)); uio_addiov(auio, uap->namebuf, uap->bufsize); } @@ -10289,23 +10848,23 @@ fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval) AUDIT_ARG(value32, fsid.val[0]); AUDIT_ARG(value64, uap->objid); /* Restrict output buffer size for now. */ - + if (uap->bufsize > PAGE_SIZE) { return (EINVAL); - } + } MALLOC(realpath, char *, uap->bufsize, M_TEMP, M_WAITOK); if (realpath == NULL) { return (ENOMEM); } error = fsgetpath_internal( - ctx, fsid.val[0], uap->objid, + ctx, fsid.val[0], uap->objid, uap->bufsize, realpath, &length); if (error) { goto out; } - + error = copyout((caddr_t)realpath, uap->buf, length); *retval = (user_ssize_t)length; /* may be superseded by error */ @@ -10324,8 +10883,8 @@ fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval) * EFAULT */ static int -munge_statfs(struct mount *mp, struct vfsstatfs *sfsp, - user_addr_t bufp, int *sizep, boolean_t is_64_bit, +munge_statfs(struct mount *mp, struct vfsstatfs *sfsp, + user_addr_t bufp, int *sizep, boolean_t is_64_bit, boolean_t partial_copy) { int error; @@ -10365,23 +10924,23 @@ munge_statfs(struct mount *mp, struct vfsstatfs *sfsp, my_size = copy_size = sizeof(sfs); bzero(&sfs, my_size); - + sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK; sfs.f_type = mp->mnt_vtable->vfc_typenum; sfs.f_reserved1 = (short)sfsp->f_fssubtype; - + /* * It's possible for there to 
be more than 2^^31 blocks in the filesystem, so we * have to fudge the numbers here in that case. We inflate the blocksize in order * to reflect the filesystem size as best we can. */ - if ((sfsp->f_blocks > INT_MAX) - /* Hack for 4061702 . I think the real fix is for Carbon to + if ((sfsp->f_blocks > INT_MAX) + /* Hack for 4061702 . I think the real fix is for Carbon to * look for some volume capability and not depend on hidden - * semantics agreed between a FS and carbon. + * semantics agreed between a FS and carbon. * f_blocks, f_bfree, and f_bavail set to -1 is the trigger * for Carbon to set bNoVolumeSizes volume attribute. - * Without this the webdavfs files cannot be copied onto + * Without this the webdavfs files cannot be copied onto * disk as they look huge. This change should not affect * XSAN as they should not setting these to -1.. */ @@ -10437,7 +10996,7 @@ munge_statfs(struct mount *mp, struct vfsstatfs *sfsp, } error = copyout((caddr_t)&sfs, bufp, copy_size); } - + if (sizep != NULL) { *sizep = my_size; } @@ -10630,3 +11189,534 @@ vfs_purge(__unused struct proc *p, __unused struct vfs_purge_args *uap, __unused return 0; } +/* + * gets the vnode associated with the (unnamed) snapshot directory + * for a Filesystem. The snapshot directory vnode is returned with + * an iocount on it. + */ +int +vnode_get_snapdir(vnode_t rvp, vnode_t *sdvpp, vfs_context_t ctx) +{ + int error; + + error = VFS_VGET_SNAPDIR(vnode_mount(rvp), sdvpp, ctx); + +#if CLONE_SNAPSHOT_FALLBACKS_ENABLED + if (error == ENOTSUP) { + struct nameidata snapnd; + + /* + * Temporary fallback to /.snaps lookup + * XXX: To be removed. 
+ */ + NDINIT(&snapnd, LOOKUP, OP_LOOKUP, USEDVP, + UIO_SYSSPACE, CAST_USER_ADDR_T(".snaps"), ctx); + snapnd.ni_dvp = rvp; + + if ((error = namei(&snapnd))) { + error = ENOTSUP; + *sdvpp = NULLVP; + } else { + *sdvpp = snapnd.ni_vp; + nameidone(&snapnd); + } + } +#endif /* CLONE_SNAPSHOT_FALLBACKS_ENABLED */ + return (error); +} + +/* + * Get the snapshot vnode. + * + * If successful, the call returns with an iocount on *rvpp ,*sdvpp and + * needs nameidone() on ndp. + * + * If the snapshot vnode exists it is returned in ndp->ni_vp. + * + * If it returns with an error, *rvpp, *sdvpp are NULL and nameidone() is + * not needed. + */ +static int +vnode_get_snapshot(int dirfd, vnode_t *rvpp, vnode_t *sdvpp, + user_addr_t name, struct nameidata *ndp, int32_t op, +#if !CONFIG_TRIGGERS + __unused +#endif + enum path_operation pathop, + vfs_context_t ctx) +{ + int error, i; + caddr_t name_buf; + size_t name_len; + struct vfs_attr vfa; + + *sdvpp = NULLVP; + *rvpp = NULLVP; + + error = vnode_getfromfd(ctx, dirfd, rvpp); + if (error) + return (error); + + if (!vnode_isvroot(*rvpp)) { + error = EINVAL; + goto out; + } + + /* Make sure the filesystem supports snapshots */ + VFSATTR_INIT(&vfa); + VFSATTR_WANTED(&vfa, f_capabilities); + if ((vfs_getattr(vnode_mount(*rvpp), &vfa, ctx) != 0) || + !VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) || + !((vfa.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & + VOL_CAP_INT_SNAPSHOT)) || + !((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & + VOL_CAP_INT_SNAPSHOT))) { + error = ENOTSUP; + goto out; + } + + error = vnode_get_snapdir(*rvpp, sdvpp, ctx); + if (error) + goto out; + + MALLOC(name_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK); + error = copyinstr(name, name_buf, MAXPATHLEN, &name_len); + if (error) + goto out1; + + /* + * Some sanity checks- name can't be empty, "." or ".." or have slashes. 
+ * (the length returned by copyinstr includes the terminating NUL) + */ + if ((name_len == 1) || (name_len == 2 && name_buf[0] == '.') || + (name_len == 3 && name_buf[0] == '.' && name_buf[1] == '.')) { + error = EINVAL; + goto out1; + } + for (i = 0; i < (int)name_len && name_buf[i] != '/'; i++); + if (i < (int)name_len) { + error = EINVAL; + goto out1; + } + +#if CONFIG_MACF + if (op == CREATE) { + error = mac_mount_check_snapshot_create(ctx, vnode_mount(*rvpp), + name_buf); + } else if (op == DELETE) { + error = mac_mount_check_snapshot_delete(ctx, vnode_mount(*rvpp), + name_buf); + } + if (error) + goto out1; +#endif + + /* Check if the snapshot already exists ... */ + NDINIT(ndp, op, pathop, USEDVP | NOCACHE | AUDITVNPATH1, + UIO_SYSSPACE, CAST_USER_ADDR_T(name_buf), ctx); + ndp->ni_dvp = *sdvpp; + + error = namei(ndp); +out1: + FREE(name_buf, M_TEMP); +out: + if (error) { + if (*sdvpp) { + vnode_put(*sdvpp); + *sdvpp = NULLVP; + } + if (*rvpp) { + vnode_put(*rvpp); + *rvpp = NULLVP; + } + } + return (error); +} + +/* + * create a filesystem snapshot (for supporting filesystems) + * + * A much simplified version of openat(dirfd, name, O_CREAT | O_EXCL) + * We get to the (unnamed) snapshot directory vnode and create the vnode + * for the snapshot in it. + * + * Restrictions: + * + * a) Passed in name for snapshot cannot have slashes. + * b) name can't be "." or ".." + * + * Since this requires superuser privileges, vnode_authorize calls are not + * made. 
+ */ +static int +snapshot_create(int dirfd, user_addr_t name, __unused uint32_t flags, + vfs_context_t ctx) +{ + vnode_t rvp, snapdvp; + int error; + struct nameidata namend; + + error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, CREATE, + OP_LINK, ctx); + if (error) + return (error); + + if (namend.ni_vp) { + vnode_put(namend.ni_vp); + error = EEXIST; + } else { + struct vnode_attr va; + vnode_t vp = NULLVP; + + VATTR_INIT(&va); + VATTR_SET(&va, va_type, VREG); + VATTR_SET(&va, va_mode, 0); + + error = vn_create(snapdvp, &vp, &namend, &va, + VN_CREATE_NOAUTH | VN_CREATE_NOINHERIT, 0, NULL, ctx); + if (!error && vp) + vnode_put(vp); +#if CLONE_SNAPSHOT_FALLBACKS_ENABLED + else if (error) { + error = VNOP_COPYFILE(rvp, rvp, NULLVP, &namend.ni_cnd, + 0, 0, ctx); + } +#endif /* CLONE_SNAPSHOT_FALLBACKS_ENABLED */ + } + + nameidone(&namend); + vnode_put(snapdvp); + vnode_put(rvp); + return (error); +} + +/* + * Delete a Filesystem snapshot + * + * get the vnode for the unnamed snapshot directory and the snapshot and + * delete the snapshot. + */ +static int +snapshot_delete(int dirfd, user_addr_t name, __unused uint32_t flags, + vfs_context_t ctx) +{ + vnode_t rvp, snapdvp; + int error; + struct nameidata namend; + + error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, DELETE, + OP_UNLINK, ctx); + if (error) + goto out; + + error = VNOP_REMOVE(snapdvp, namend.ni_vp, &namend.ni_cnd, + VNODE_REMOVE_SKIP_NAMESPACE_EVENT, ctx); + + vnode_put(namend.ni_vp); + nameidone(&namend); + vnode_put(snapdvp); + vnode_put(rvp); +out: + return (error); +} + +/* + * Revert a filesystem to a snapshot + * + * Marks the filesystem to revert to the given snapshot on next mount. 
+ */ +static int +snapshot_revert(int dirfd, user_addr_t name, __unused uint32_t flags, + vfs_context_t ctx) +{ + int error; + vnode_t rvp; + mount_t mp; + struct fs_snapshot_revert_args revert_data; + struct componentname cnp; + caddr_t name_buf; + size_t name_len; + + error = vnode_getfromfd(ctx, dirfd, &rvp); + if (error) { + return (error); + } + mp = vnode_mount(rvp); + + /* + * Grab mount_iterref so that we can release the vnode, + * since VFSIOC_REVERT_SNAPSHOT could conceivably cause a sync. + */ + error = mount_iterref (mp, 0); + vnode_put(rvp); + if (error) { + return (error); + } + + MALLOC(name_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK); + error = copyinstr(name, name_buf, MAXPATHLEN, &name_len); + if (error) { + mount_iterdrop(mp); + FREE(name_buf, M_TEMP); + return (error); + } + + memset(&cnp, 0, sizeof(cnp)); + cnp.cn_pnbuf = (char *)name_buf; + cnp.cn_nameiop = LOOKUP; + cnp.cn_flags = ISLASTCN | HASBUF; + cnp.cn_pnlen = MAXPATHLEN; + cnp.cn_nameptr = cnp.cn_pnbuf; + cnp.cn_namelen = (int)name_len; + revert_data.sr_cnp = &cnp; + + error = VFS_IOCTL(mp, VFSIOC_REVERT_SNAPSHOT, (caddr_t)&revert_data, 0, ctx); + mount_iterdrop(mp); + FREE(name_buf, M_TEMP); + + if (error) { + /* If there was any error, try again using VNOP_IOCTL */ + + vnode_t snapdvp; + struct nameidata namend; + + error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, LOOKUP, + OP_LOOKUP, ctx); + if (error) { + return (error); + } + + +#ifndef APFSIOC_REVERT_TO_SNAPSHOT +#define APFSIOC_REVERT_TO_SNAPSHOT _IOW('J', 1, u_int64_t) +#endif + +#ifndef APFS_REVERT_TO_SNAPSHOT +#define APFS_REVERT_TO_SNAPSHOT IOCBASECMD(APFSIOC_REVERT_TO_SNAPSHOT) +#endif + + error = VNOP_IOCTL(namend.ni_vp, APFS_REVERT_TO_SNAPSHOT, (caddr_t) NULL, + 0, ctx); + + vnode_put(namend.ni_vp); + nameidone(&namend); + vnode_put(snapdvp); + vnode_put(rvp); + } + + return (error); +} + +/* + * rename a Filesystem snapshot + * + * get the vnode for the unnamed snapshot directory and the snapshot and + * 
rename the snapshot. This is a very specialised (and simple) case of + * rename(2) (which has to deal with a lot more complications). It differs + * slightly from rename(2) in that EEXIST is returned if the new name exists. + */ +static int +snapshot_rename(int dirfd, user_addr_t old, user_addr_t new, + __unused uint32_t flags, vfs_context_t ctx) +{ + vnode_t rvp, snapdvp; + int error, i; + caddr_t newname_buf; + size_t name_len; + vnode_t fvp; + struct nameidata *fromnd, *tond; + /* carving out a chunk for structs that are too big to be on stack. */ + struct { + struct nameidata from_node; + struct nameidata to_node; + } * __rename_data; + + MALLOC(__rename_data, void *, sizeof(*__rename_data), M_TEMP, M_WAITOK); + fromnd = &__rename_data->from_node; + tond = &__rename_data->to_node; + + error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, old, fromnd, DELETE, + OP_UNLINK, ctx); + if (error) + goto out; + fvp = fromnd->ni_vp; + + MALLOC(newname_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK); + error = copyinstr(new, newname_buf, MAXPATHLEN, &name_len); + if (error) + goto out1; + + /* + * Some sanity checks- new name can't be empty, "." or ".." or have + * slashes. + * (the length returned by copyinstr includes the terminating NUL) + * + * The FS rename VNOP is suppossed to handle this but we'll pick it + * off here itself. + */ + if ((name_len == 1) || (name_len == 2 && newname_buf[0] == '.') || + (name_len == 3 && newname_buf[0] == '.' 
&& newname_buf[1] == '.')) { + error = EINVAL; + goto out1; + } + for (i = 0; i < (int)name_len && newname_buf[i] != '/'; i++); + if (i < (int)name_len) { + error = EINVAL; + goto out1; + } + +#if CONFIG_MACF + error = mac_mount_check_snapshot_create(ctx, vnode_mount(rvp), + newname_buf); + if (error) + goto out1; +#endif + + NDINIT(tond, RENAME, OP_RENAME, USEDVP | NOCACHE | AUDITVNPATH2, + UIO_SYSSPACE, CAST_USER_ADDR_T(newname_buf), ctx); + tond->ni_dvp = snapdvp; + + error = namei(tond); + if (error) { + goto out2; + } else if (tond->ni_vp) { + /* + * snapshot rename behaves differently than rename(2) - if the + * new name exists, EEXIST is returned. + */ + vnode_put(tond->ni_vp); + error = EEXIST; + goto out2; + } + + error = VNOP_RENAME(snapdvp, fvp, &fromnd->ni_cnd, snapdvp, NULLVP, + &tond->ni_cnd, ctx); + +out2: + nameidone(tond); +out1: + FREE(newname_buf, M_TEMP); + vnode_put(fvp); + vnode_put(snapdvp); + vnode_put(rvp); + nameidone(fromnd); +out: + FREE(__rename_data, M_TEMP); + return (error); +} + +/* + * Mount a Filesystem snapshot + * + * get the vnode for the unnamed snapshot directory and the snapshot and + * mount the snapshot. + */ +static int +snapshot_mount(int dirfd, user_addr_t name, user_addr_t directory, + user_addr_t mnt_data, __unused uint32_t flags, vfs_context_t ctx) +{ + vnode_t rvp, snapdvp, snapvp, vp, pvp; + int error; + struct nameidata *snapndp, *dirndp; + /* carving out a chunk for structs that are too big to be on stack. 
*/ + struct { + struct nameidata snapnd; + struct nameidata dirnd; + } * __snapshot_mount_data; + + MALLOC(__snapshot_mount_data, void *, sizeof(*__snapshot_mount_data), + M_TEMP, M_WAITOK); + snapndp = &__snapshot_mount_data->snapnd; + dirndp = &__snapshot_mount_data->dirnd; + + error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, snapndp, LOOKUP, + OP_LOOKUP, ctx); + if (error) + goto out; + + snapvp = snapndp->ni_vp; + if (!vnode_mount(rvp) || (vnode_mount(rvp) == dead_mountp)) { + error = EIO; + goto out1; + } + + /* Get the vnode to be covered */ + NDINIT(dirndp, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT, + UIO_USERSPACE, directory, ctx); + error = namei(dirndp); + if (error) + goto out1; + + vp = dirndp->ni_vp; + pvp = dirndp->ni_dvp; + + if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_ROOTFS)) { + error = EINVAL; + } else { + mount_t mp = vnode_mount(rvp); + struct fs_snapshot_mount_args smnt_data; + + smnt_data.sm_mp = mp; + smnt_data.sm_cnp = &snapndp->ni_cnd; + error = mount_common(mp->mnt_vfsstat.f_fstypename, pvp, vp, + &dirndp->ni_cnd, CAST_USER_ADDR_T(&smnt_data), 0, + KERNEL_MOUNT_SNAPSHOT, NULL, FALSE, ctx); + if (error) { + /* Retry with user passed args */ + error = mount_common(mp->mnt_vfsstat.f_fstypename, pvp, + vp, &dirndp->ni_cnd, CAST_USER_ADDR_T(mnt_data), 0, + 0, NULL, FALSE, ctx); + } + } + + vnode_put(vp); + vnode_put(pvp); + nameidone(dirndp); +out1: + vnode_put(snapvp); + vnode_put(snapdvp); + vnode_put(rvp); + nameidone(snapndp); +out: + FREE(__snapshot_mount_data, M_TEMP); + return (error); +} + +/* + * FS snapshot operations dispatcher + */ +int +fs_snapshot(__unused proc_t p, struct fs_snapshot_args *uap, + __unused int32_t *retval) +{ + int error; + vfs_context_t ctx = vfs_context_current(); + + error = priv_check_cred(vfs_context_ucred(ctx), PRIV_VFS_SNAPSHOT, 0); + if (error) + return (error); + + switch (uap->op) { + case SNAPSHOT_OP_CREATE: + error = snapshot_create(uap->dirfd, uap->name1, uap->flags, 
ctx); + break; + case SNAPSHOT_OP_DELETE: + error = snapshot_delete(uap->dirfd, uap->name1, uap->flags, ctx); + break; + case SNAPSHOT_OP_RENAME: + error = snapshot_rename(uap->dirfd, uap->name1, uap->name2, + uap->flags, ctx); + break; + case SNAPSHOT_OP_MOUNT: + error = snapshot_mount(uap->dirfd, uap->name1, uap->name2, + uap->data, uap->flags, ctx); + break; + case SNAPSHOT_OP_REVERT: + error = snapshot_revert(uap->dirfd, uap->name1, uap->flags, ctx); + break; + default: + error = ENOSYS; + } + + return (error); +} diff --git a/bsd/vfs/vfs_utfconv.c b/bsd/vfs/vfs_utfconv.c index 8639edc99..1f014aacf 100644 --- a/bsd/vfs/vfs_utfconv.c +++ b/bsd/vfs/vfs_utfconv.c @@ -36,6 +36,12 @@ #include #include +#if defined(KERNEL) && !defined(VFS_UTF8_UNIT_TEST) +#include +#else +#include +#endif + /* * UTF-8 (Unicode Transformation Format) * @@ -1099,7 +1105,7 @@ prioritysort(u_int16_t* characters, int count) * colon in our tables and everything will just work. */ static u_int8_t -sfm2mac[42] = { +sfm2mac[] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 00 - 07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 08 - 0F */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 10 - 17 */ @@ -1107,9 +1113,10 @@ sfm2mac[42] = { 0x22, 0x2a, 0x3a, 0x3c, 0x3e, 0x3f, 0x5c, 0x7c, /* 20 - 27 */ 0x20, 0x2e /* 28 - 29 */ }; +#define SFM2MAC_LEN ((sizeof(sfm2mac))/sizeof(sfm2mac[0])) static u_int8_t -mac2sfm[112] = { +mac2sfm[] = { 0x20, 0x21, 0x20, 0x23, 0x24, 0x25, 0x26, 0x27, /* 20 - 27 */ 0x28, 0x29, 0x21, 0x2b, 0x2c, 0x2d, 0x2e, 0x22, /* 28 - 2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 30 - 37 */ @@ -1123,6 +1130,7 @@ mac2sfm[112] = { 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 70 - 77 */ 0x78, 0x79, 0x7a, 0x7b, 0x27, 0x7d, 0x7e, 0x7f /* 78 - 7f */ }; +#define MAC2SFM_LEN ((sizeof(mac2sfm))/sizeof(mac2sfm[0])) /* @@ -1146,6 +1154,7 @@ ucs_to_sfm(u_int16_t ucs_ch, int lastchar) } else /* 0x20 - 0x7f */ { u_int16_t lsb; + assert((ucs_ch - 0x0020) < 
MAC2SFM_LEN); lsb = mac2sfm[ucs_ch - 0x0020]; if (lsb != ucs_ch) return(0xf000 | lsb); @@ -1161,6 +1170,7 @@ sfm_to_ucs(u_int16_t ucs_ch) { if (((ucs_ch & 0xffC0) == SFMCODE_PREFIX_MASK) && ((ucs_ch & 0x003f) <= MAX_SFM2MAC)) { + assert((ucs_ch & 0x003f) < SFM2MAC_LEN); ucs_ch = sfm2mac[ucs_ch & 0x003f]; } return (ucs_ch); diff --git a/bsd/vfs/vfs_vnops.c b/bsd/vfs/vfs_vnops.c index ca14ddec6..1912d1179 100644 --- a/bsd/vfs/vfs_vnops.c +++ b/bsd/vfs/vfs_vnops.c @@ -111,10 +111,6 @@ int ubc_setcred(struct vnode *, struct proc *); #include #endif -#if CONFIG_PROTECT -#include -#endif - #include static int vn_closefile(struct fileglob *fp, vfs_context_t ctx); @@ -130,6 +126,7 @@ static int vn_kqfilt_add(struct fileproc *fp, struct knote *kn, vfs_context_t ctx); static void filt_vndetach(struct knote *kn); static int filt_vnode(struct knote *kn, long hint); +static int filt_vnode_common(struct knote *kn, vnode_t vp, long hint); static int vn_open_auth_finish(vnode_t vp, int fmode, vfs_context_t ctx); #if 0 static int vn_kqfilt_remove(struct vnode *vp, uintptr_t ident, @@ -137,21 +134,26 @@ static int vn_kqfilt_remove(struct vnode *vp, uintptr_t ident, #endif const struct fileops vnops = { - DTYPE_VNODE, - vn_read, - vn_write, - vn_ioctl, - vn_select, - vn_closefile, - vn_kqfilt_add, - NULL + .fo_type = DTYPE_VNODE, + .fo_read = vn_read, + .fo_write = vn_write, + .fo_ioctl = vn_ioctl, + .fo_select = vn_select, + .fo_close = vn_closefile, + .fo_kqfilter = vn_kqfilt_add, + .fo_drain = NULL, }; +static int filt_vntouch(struct knote *kn, struct kevent_internal_s *kev); +static int filt_vnprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev); + struct filterops vnode_filtops = { .f_isfd = 1, .f_attach = NULL, .f_detach = filt_vndetach, - .f_event = filt_vnode + .f_event = filt_vnode, + .f_touch = filt_vntouch, + .f_process = filt_vnprocess, }; /* @@ -383,6 +385,12 @@ vn_open_auth(struct nameidata *ndp, int *fmodep, struct vnode_attr *vap) 
fmode = *fmodep; origcnflags = ndp->ni_cnd.cn_flags; + // If raw encrypted mode is requested, handle that here + if (VATTR_IS_ACTIVE (vap, va_dataprotect_flags) + && ISSET(vap->va_dataprotect_flags, VA_DP_RAWENCRYPTED)) { + fmode |= FENCRYPTED; + } + /* * O_CREAT */ @@ -511,6 +519,8 @@ vn_open_auth(struct nameidata *ndp, int *fmodep, struct vnode_attr *vap) /* open calls are allowed for resource forks. */ ndp->ni_cnd.cn_flags |= CN_ALLOWRSRCFORK; #endif + if (fmode & FENCRYPTED) + ndp->ni_cnd.cn_flags |= CN_RAW_ENCRYPTED | CN_SKIPNAMECACHE; ndp->ni_flag = NAMEI_COMPOUNDOPEN; /* preserve NOFOLLOW from vnode_open() */ @@ -574,12 +584,6 @@ vn_open_auth(struct nameidata *ndp, int *fmodep, struct vnode_attr *vap) } } -#if CONFIG_PROTECT - // If raw encrypted mode is requested, handle that here - if (VATTR_IS_ACTIVE (vap, va_dataprotect_flags) - && ISSET(vap->va_dataprotect_flags, VA_DP_RAWENCRYPTED)) { - fmode |= FENCRYPTED; - } if (VATTR_IS_ACTIVE (vap, va_dataprotect_flags) && ISSET(vap->va_dataprotect_flags, VA_DP_RAWUNENCRYPTED)) { /* Don't allow unencrypted io request from user space unless entitled */ @@ -594,16 +598,6 @@ vn_open_auth(struct nameidata *ndp, int *fmodep, struct vnode_attr *vap) fmode |= FUNENCRYPTED; } - /* - * Perform any content protection access checks prior to calling - * into the filesystem. 
- */ - error = cp_handle_open (vp, fmode); - if (error) { - goto bad; - } -#endif - error = VNOP_OPEN(vp, fmode, ctx); if (error) { goto bad; @@ -1636,8 +1630,9 @@ vn_pathconf(vnode_t vp, int name, int32_t *retval, vfs_context_t ctx) static int vn_kqfilt_add(struct fileproc *fp, struct knote *kn, vfs_context_t ctx) { - int error; struct vnode *vp; + int error = 0; + int result = 0; vp = (struct vnode *)fp->f_fglob->fg_data; @@ -1655,13 +1650,14 @@ vn_kqfilt_add(struct fileproc *fp, struct knote *kn, vfs_context_t ctx) } } else if (!vnode_isreg(vp)) { - if (vnode_ischr(vp) && - (error = spec_kqfilter(vp, kn)) == 0) { - /* claimed by a special device */ - vnode_put(vp); - return 0; + if (vnode_ischr(vp)) { + result = spec_kqfilter(vp, kn); + if ((kn->kn_flags & EV_ERROR) == 0) { + /* claimed by a special device */ + vnode_put(vp); + return result; + } } - error = EINVAL; } break; @@ -1671,34 +1667,42 @@ vn_kqfilt_add(struct fileproc *fp, struct knote *kn, vfs_context_t ctx) error = EINVAL; } - if (error) { - vnode_put(vp); - return error; - } + if (error == 0) { #if CONFIG_MACF - error = mac_vnode_check_kqfilter(ctx, fp->f_fglob->fg_cred, kn, vp); - if (error) { - vnode_put(vp); - return error; - } + error = mac_vnode_check_kqfilter(ctx, fp->f_fglob->fg_cred, kn, vp); + if (error) { + vnode_put(vp); + goto out; + } #endif - kn->kn_hook = (void*)vp; - kn->kn_hookid = vnode_vid(vp); - kn->kn_fop = &vnode_filtops; + kn->kn_hook = (void*)vp; + kn->kn_hookid = vnode_vid(vp); + kn->kn_filtid = EVFILTID_VN; - vnode_lock(vp); - KNOTE_ATTACH(&vp->v_knotes, kn); - vnode_unlock(vp); + vnode_lock(vp); + KNOTE_ATTACH(&vp->v_knotes, kn); + result = filt_vnode_common(kn, vp, 0); + vnode_unlock(vp); - /* Ask the filesystem to provide remove notifications, but ignore failure */ - VNOP_MONITOR(vp, 0, VNODE_MONITOR_BEGIN, (void*) kn, ctx); + /* + * Ask the filesystem to provide remove notifications, + * but ignore failure + */ + VNOP_MONITOR(vp, 0, VNODE_MONITOR_BEGIN, (void*) kn, 
ctx); + } vnode_put(vp); } - return (error); + out: + if (error) { + kn->kn_flags = EV_ERROR; + kn->kn_data = error; + } + + return result; } static void @@ -1803,22 +1807,11 @@ vnode_writable_space_count(vnode_t vp) * --If hint is revoke, set special flags and activate */ static int -filt_vnode(struct knote *kn, long hint) +filt_vnode_common(struct knote *kn, vnode_t vp, long hint) { - vnode_t vp = (struct vnode *)kn->kn_hook; int activate = 0; - long orig_hint = hint; - if (0 == hint) { - vnode_lock(vp); - - if (vnode_getiocount(vp, kn->kn_hookid, VNODE_NODEAD | VNODE_WITHID) != 0) { - /* Is recycled */ - hint = NOTE_REVOKE; - } - } else { - lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED); - } + lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED); /* Special handling for vnodes that are in recycle or already gone */ if (NOTE_REVOKE == hint) { @@ -1857,16 +1850,71 @@ filt_vnode(struct knote *kn, long hint) panic("Invalid knote filter on a vnode!\n"); } } + return (activate); +} - if (orig_hint == 0) { - /* - * Definitely need to unlock, may need to put - */ - if (hint == 0) { - vnode_put_locked(vp); +static int +filt_vnode(struct knote *kn, long hint) +{ + vnode_t vp = (struct vnode *)kn->kn_hook; + + return filt_vnode_common(kn, vp, hint); +} + +static int +filt_vntouch(struct knote *kn, struct kevent_internal_s *kev) +{ + vnode_t vp = (struct vnode *)kn->kn_hook; + int activate; + int hint = 0; + + vnode_lock(vp); + if (vnode_getiocount(vp, kn->kn_hookid, VNODE_NODEAD | VNODE_WITHID) != 0) { + /* is recycled */ + hint = NOTE_REVOKE; + } + + /* accept new input fflags mask */ + kn->kn_sfflags = kev->fflags; + if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0) + kn->kn_udata = kev->udata; + + activate = filt_vnode_common(kn, vp, hint); + + if (hint == 0) + vnode_put_locked(vp); + vnode_unlock(vp); + + return activate; +} + +static int +filt_vnprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev) +{ +#pragma unused(data) + vnode_t vp = 
(struct vnode *)kn->kn_hook; + int activate; + int hint = 0; + + vnode_lock(vp); + if (vnode_getiocount(vp, kn->kn_hookid, VNODE_NODEAD | VNODE_WITHID) != 0) { + /* Is recycled */ + hint = NOTE_REVOKE; + } + activate = filt_vnode_common(kn, vp, hint); + if (activate) { + *kev = kn->kn_kevent; + if (kn->kn_flags & EV_CLEAR) { + kn->kn_data = 0; + kn->kn_fflags = 0; } - vnode_unlock(vp); } - return (activate); + /* Definitely need to unlock, may need to put */ + if (hint == 0) + vnode_put_locked(vp); + vnode_unlock(vp); + + return activate; } + diff --git a/bsd/vfs/vfs_xattr.c b/bsd/vfs/vfs_xattr.c index be1898b45..bd38c5f51 100644 --- a/bsd/vfs/vfs_xattr.c +++ b/bsd/vfs/vfs_xattr.c @@ -254,9 +254,11 @@ vn_setxattr(vnode_t vp, const char *name, uio_t uio, int options, vfs_context_t error = default_setxattr(vp, name, uio, options, context); } #if CONFIG_MACF - if ((error == 0) && !(options & XATTR_NOSECURITY) && - (vfs_flags(vnode_mount(vp)) & MNT_MULTILABEL)) - mac_vnode_label_update_extattr(vnode_mount(vp), vp, name); + if ((error == 0) && !(options & XATTR_NOSECURITY)) { + mac_vnode_notify_setextattr(context, vp, name, uio); + if (vfs_flags(vnode_mount(vp)) & MNT_MULTILABEL) + mac_vnode_label_update_extattr(vnode_mount(vp), vp, name); + } #endif out: return (error); @@ -313,9 +315,11 @@ vn_removexattr(vnode_t vp, const char * name, int options, vfs_context_t context #endif /* DUAL_EAS */ } #if CONFIG_MACF - if ((error == 0) && !(options & XATTR_NOSECURITY) && - (vfs_flags(vnode_mount(vp)) & MNT_MULTILABEL)) - mac_vnode_label_update_extattr(vnode_mount(vp), vp, name); + if ((error == 0) && !(options & XATTR_NOSECURITY)) { + mac_vnode_notify_deleteextattr(context, vp, name); + if (vfs_flags(vnode_mount(vp)) & MNT_MULTILABEL) + mac_vnode_label_update_extattr(vnode_mount(vp), vp, name); + } #endif out: return (error); diff --git a/bsd/vfs/vnode_if.c b/bsd/vfs/vnode_if.c index 1fc9bd7df..cfa0c70e3 100644 --- a/bsd/vfs/vnode_if.c +++ b/bsd/vfs/vnode_if.c @@ -1,6 +1,6 @@ 
/* - * Copyright (c) 2000-2014 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -558,6 +558,26 @@ struct vnodeop_desc vnop_rename_desc = { NULL }; +int vnop_renamex_vp_offsets[] = { + VOPARG_OFFSETOF(struct vnop_renamex_args,a_fdvp), + VOPARG_OFFSETOF(struct vnop_renamex_args,a_fvp), + VOPARG_OFFSETOF(struct vnop_renamex_args,a_tdvp), + VOPARG_OFFSETOF(struct vnop_renamex_args,a_tvp), + VDESC_NO_OFFSET +}; +struct vnodeop_desc vnop_renamex_desc = { + 0, + "vnop_renamex", + 0 | VDESC_VP0_WILLRELE | VDESC_VP1_WILLRELE | VDESC_VP2_WILLRELE | VDESC_VP3_WILLRELE, + vnop_renamex_vp_offsets, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VOPARG_OFFSETOF(struct vnop_renamex_args, a_fcnp), + VOPARG_OFFSETOF(struct vnop_renamex_args, a_context), + NULL +}; + int vnop_compound_rename_vp_offsets[] = { VOPARG_OFFSETOF(struct vnop_compound_rename_args,a_fdvp), VOPARG_OFFSETOF(struct vnop_compound_rename_args,a_fvpp), @@ -888,6 +908,24 @@ struct vnodeop_desc vnop_copyfile_desc = { NULL }; +int vnop_clonefile_vp_offsets[] = { + VOPARG_OFFSETOF(struct vnop_clonefile_args,a_fvp), + VOPARG_OFFSETOF(struct vnop_clonefile_args,a_dvp), + VDESC_NO_OFFSET +}; +struct vnodeop_desc vnop_clonefile_desc = { + 0, + "vnop_clonefile", + 0 | VDESC_VP0_WILLRELE | VDESC_VP1_WILLRELE | VDESC_VPP_WILLRELE, + vnop_clonefile_vp_offsets, + VOPARG_OFFSETOF(struct vnop_clonefile_args, a_vpp), + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VOPARG_OFFSETOF(struct vnop_clonefile_args, a_cnp), + VOPARG_OFFSETOF(struct vnop_clonefile_args, a_context), + NULL +}; + int vop_getxattr_vp_offsets[] = { VOPARG_OFFSETOF(struct vnop_getxattr_args,a_vp), VDESC_NO_OFFSET @@ -1132,6 +1170,7 @@ struct vnodeop_desc *vfs_op_descs[] = { &vnop_compound_remove_desc, &vnop_link_desc, &vnop_rename_desc, + &vnop_renamex_desc, &vnop_compound_rename_desc, &vnop_mkdir_desc, &vnop_compound_mkdir_desc, @@ -1151,6 
+1190,7 @@ struct vnodeop_desc *vfs_op_descs[] = { &vnop_pageout_desc, &vnop_searchfs_desc, &vnop_copyfile_desc, + &vnop_clonefile_desc, &vnop_getxattr_desc, &vnop_setxattr_desc, &vnop_removexattr_desc, diff --git a/bsd/vm/Makefile b/bsd/vm/Makefile index a116bf789..ba54ca196 100644 --- a/bsd/vm/Makefile +++ b/bsd/vm/Makefile @@ -3,7 +3,6 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - include $(MakeInc_cmd) include $(MakeInc_def) @@ -20,5 +19,3 @@ EXPORT_MI_DIR = vm include $(MakeInc_rule) include $(MakeInc_dir) - - diff --git a/bsd/vm/dp_backing_file.c b/bsd/vm/dp_backing_file.c index e17287ff7..2684c29dd 100644 --- a/bsd/vm/dp_backing_file.c +++ b/bsd/vm/dp_backing_file.c @@ -44,12 +44,8 @@ #include #include #include -#if CONFIG_PROTECT -#include -#endif #include -#include #include #include @@ -66,6 +62,7 @@ #include #include #include +#include #include @@ -80,43 +77,6 @@ #include -void macx_init(void); - -static lck_grp_t *macx_lock_group; -static lck_mtx_t *macx_lock; - -/* - * temporary support for delayed instantiation - * of default_pager - */ -int default_pager_init_flag = 0; - -struct bs_map bs_port_table[MAX_BACKING_STORE] = { - {0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, - {0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, - {0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, - {0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, - {0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, - {0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, - {0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, - {0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, - {0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, - {0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}}; - -/* ###################################################### */ - -/* - * Routine: macx_init - * 
Function: - * Initialize locks so that only one caller can change - * state at a time. - */ -void -macx_init(void) -{ - macx_lock_group = lck_grp_alloc_init("macx", NULL); - macx_lock = lck_mtx_alloc_init(macx_lock_group, NULL); -} /* * Routine: macx_backing_store_recovery @@ -127,26 +87,11 @@ macx_init(void) */ int macx_backing_store_recovery( - struct macx_backing_store_recovery_args *args) + __unused struct macx_backing_store_recovery_args *args) { - int pid = args->pid; - int error; - struct proc *p = current_proc(); + assert(FALSE); - if ((error = suser(kauth_cred_get(), 0))) - goto backing_store_recovery_return; - - /* for now restrict backing_store_recovery */ - /* usage to only present task */ - if(pid != proc_selfpid()) { - error = EINVAL; - goto backing_store_recovery_return; - } - - task_backing_store_privileged(p->task); - -backing_store_recovery_return: - return(error); + return ENOTSUP; } /* @@ -158,24 +103,14 @@ macx_backing_store_recovery( int macx_backing_store_suspend( - struct macx_backing_store_suspend_args *args) + __unused struct macx_backing_store_suspend_args *args) { - boolean_t suspend = args->suspend; - int error; + assert(FALSE); - lck_mtx_lock(macx_lock); - if ((error = suser(kauth_cred_get(), 0))) - goto backing_store_suspend_return; - - /* Multiple writers protected by macx_lock */ - vm_backing_store_disable(suspend); - -backing_store_suspend_return: - lck_mtx_unlock(macx_lock); - return(error); + return ENOTSUP; } -extern boolean_t backing_store_stop_compaction; + extern boolean_t compressor_store_stop_compaction; /* @@ -198,21 +133,18 @@ macx_backing_store_compaction(int flags) { int error; - lck_mtx_assert(macx_lock, LCK_MTX_ASSERT_OWNED); if ((error = suser(kauth_cred_get(), 0))) return error; if (flags & SWAP_COMPACT_DISABLE) { - backing_store_stop_compaction = TRUE; compressor_store_stop_compaction = TRUE; - kprintf("backing_store_stop_compaction = TRUE\n"); + kprintf("compressor_store_stop_compaction = TRUE\n"); } else if 
(flags & SWAP_COMPACT_ENABLE) { - backing_store_stop_compaction = FALSE; compressor_store_stop_compaction = FALSE; - kprintf("backing_store_stop_compaction = FALSE\n"); + kprintf("compressor_store_stop_compaction = FALSE\n"); } return 0; @@ -228,244 +160,27 @@ int macx_triggers( struct macx_triggers_args *args) { - int error; - - lck_mtx_lock(macx_lock); - error = suser(kauth_cred_get(), 0); - if (error) - return error; + int flags = args->flags; - error = mach_macx_triggers(args); - - lck_mtx_unlock(macx_lock); - return error; -} + if (flags & (SWAP_COMPACT_DISABLE | SWAP_COMPACT_ENABLE)) + return (macx_backing_store_compaction(flags)); + assert(FALSE); -extern boolean_t dp_isssd; + return ENOTSUP; +} -/* - * In the compressed pager world, the swapfiles are created by the kernel. - * Well, all except the first one. That swapfile is absorbed by the kernel at - * the end of the macx_swapon function (if swap is enabled). That's why - * we allow the first invocation of macx_swapon to succeed. - * - * If the compressor pool is running low, the kernel messages the dynamic pager - * on the port it has registered with the kernel. That port can transport 1 of 2 - * pieces of information to dynamic pager: create a swapfile or delete a swapfile. - * - * We choose to transmit the former. So, that message tells dynamic pager - * to create a swapfile and activate it by calling macx_swapon. - * - * We deny this new macx_swapon request. That leads dynamic pager to interpret the - * failure as a serious error and notify all it's clients that swap is running low. - * That's how we get the loginwindow "Resume / Force Quit Applications" dialog to appear. - * - * NOTE: - * If the kernel has already created multiple swapfiles by the time the compressor - * pool is running low (and it has to play this trick), dynamic pager won't be able to - * create a file in user-space and, that too will lead to a similar notification blast - * to all of it's clients. So, that behaves as desired too. 
- */ -boolean_t macx_swapon_allowed = TRUE; -/* - * Routine: macx_swapon - * Function: - * Syscall interface to add a file to backing store - */ int macx_swapon( - struct macx_swapon_args *args) + __unused struct macx_swapon_args *args) { - int size = args->size; - vnode_t vp = (vnode_t)NULL; - struct nameidata nd, *ndp; - register int error; - kern_return_t kr; - mach_port_t backing_store; - memory_object_default_t default_pager; - int i; - off_t file_size; - vfs_context_t ctx = vfs_context_current(); - struct proc *p = current_proc(); - int dp_cluster_size; - - AUDIT_MACH_SYSCALL_ENTER(AUE_SWAPON); - AUDIT_ARG(value32, args->priority); - - lck_mtx_lock(macx_lock); - - if (COMPRESSED_PAGER_IS_ACTIVE) { - if (macx_swapon_allowed == FALSE) { - error = EINVAL; - goto swapon_bailout; - } else { - macx_swapon_allowed = FALSE; - error = 0; - goto swapon_bailout; - } - } - - ndp = &nd; - - if ((error = suser(kauth_cred_get(), 0))) - goto swapon_bailout; - - /* - * Get a vnode for the paging area. - */ - NDINIT(ndp, LOOKUP, OP_LOOKUP, FOLLOW | LOCKLEAF | AUDITVNPATH1, - ((IS_64BIT_PROCESS(p)) ? 
UIO_USERSPACE64 : UIO_USERSPACE32), - (user_addr_t) args->filename, ctx); - - if ((error = namei(ndp))) - goto swapon_bailout; - nameidone(ndp); - vp = ndp->ni_vp; - - if (vp->v_type != VREG) { - error = EINVAL; - goto swapon_bailout; - } - - /* get file size */ - if ((error = vnode_size(vp, &file_size, ctx)) != 0) - goto swapon_bailout; -#if CONFIG_MACF - vnode_lock(vp); - error = mac_system_check_swapon(vfs_context_ucred(ctx), vp); - vnode_unlock(vp); - if (error) - goto swapon_bailout; -#endif - - /* resize to desired size if it's too small */ - if ((file_size < (off_t)size) && ((error = vnode_setsize(vp, (off_t)size, 0, ctx)) != 0)) - goto swapon_bailout; + assert(FALSE); -#if CONFIG_PROTECT - { - /* initialize content protection keys manually */ - if ((error = cp_handle_vnop(vp, CP_WRITE_ACCESS, 0)) != 0) { - goto swapon_bailout; - } - } -#endif - - - if (default_pager_init_flag == 0) { - start_def_pager(NULL); - default_pager_init_flag = 1; - } - - /* add new backing store to list */ - i = 0; - while(bs_port_table[i].vp != 0) { - if(i == MAX_BACKING_STORE) - break; - i++; - } - if(i == MAX_BACKING_STORE) { - error = ENOMEM; - goto swapon_bailout; - } - - /* remember the vnode. This vnode has namei() reference */ - bs_port_table[i].vp = vp; - - /* - * Look to see if we are already paging to this file. 
- */ - /* make certain the copy send of kernel call will work */ - default_pager = MEMORY_OBJECT_DEFAULT_NULL; - kr = host_default_memory_manager(host_priv_self(), &default_pager, 0); - if(kr != KERN_SUCCESS) { - error = EAGAIN; - bs_port_table[i].vp = 0; - goto swapon_bailout; - } - - if ((dp_isssd = vnode_pager_isSSD(vp)) == TRUE) { - /* - * keep the cluster size small since the - * seek cost is effectively 0 which means - * we don't care much about fragmentation - */ - dp_cluster_size = 2 * PAGE_SIZE; - } else { - /* - * use the default cluster size - */ - dp_cluster_size = 0; - } - kr = default_pager_backing_store_create(default_pager, - -1, /* default priority */ - dp_cluster_size, - &backing_store); - memory_object_default_deallocate(default_pager); - - if(kr != KERN_SUCCESS) { - error = ENOMEM; - bs_port_table[i].vp = 0; - goto swapon_bailout; - } - - /* Mark this vnode as being used for swapfile */ - vnode_lock_spin(vp); - SET(vp->v_flag, VSWAP); - vnode_unlock(vp); - - /* - * NOTE: we are able to supply PAGE_SIZE here instead of - * an actual record size or block number because: - * a: we do not support offsets from the beginning of the - * file (allowing for non page size/record modulo offsets. - * b: because allow paging will be done modulo page size - */ - - kr = default_pager_add_file(backing_store, (vnode_ptr_t) vp, - PAGE_SIZE, (int)(file_size/PAGE_SIZE)); - if(kr != KERN_SUCCESS) { - bs_port_table[i].vp = 0; - if(kr == KERN_INVALID_ARGUMENT) - error = EINVAL; - else - error = ENOMEM; - - /* This vnode is not to be used for swapfile */ - vnode_lock_spin(vp); - CLR(vp->v_flag, VSWAP); - vnode_unlock(vp); - - goto swapon_bailout; - } - bs_port_table[i].bs = (void *)backing_store; - error = 0; - - ubc_setthreadcred(vp, p, current_thread()); - - /* - * take a long term reference on the vnode to keep - * vnreclaim() away from this vnode. 
- */ - vnode_ref(vp); - -swapon_bailout: - if (vp) { - vnode_put(vp); - } - lck_mtx_unlock(macx_lock); - AUDIT_MACH_SYSCALL_EXIT(error); - - if (error) - printf("macx_swapon FAILED - %d\n", error); - else - printf("macx_swapon SUCCESS\n"); - - return(error); + return ENOTSUP; } + /* * Routine: macx_swapoff * Function: @@ -473,110 +188,11 @@ macx_swapon( */ int macx_swapoff( - struct macx_swapoff_args *args) + __unused struct macx_swapoff_args *args) { - __unused int flags = args->flags; - kern_return_t kr; - mach_port_t backing_store; - - struct vnode *vp = 0; - struct nameidata nd, *ndp; - struct proc *p = current_proc(); - int i; - int error; - vfs_context_t ctx = vfs_context_current(); - int orig_iopol_disk; + assert(FALSE); - AUDIT_MACH_SYSCALL_ENTER(AUE_SWAPOFF); - - lck_mtx_lock(macx_lock); - - backing_store = NULL; - ndp = &nd; - - if ((error = suser(kauth_cred_get(), 0))) - goto swapoff_bailout; - - /* - * Get the vnode for the paging area. - */ - NDINIT(ndp, LOOKUP, OP_LOOKUP, FOLLOW | LOCKLEAF | AUDITVNPATH1, - ((IS_64BIT_PROCESS(p)) ? 
UIO_USERSPACE64 : UIO_USERSPACE32), - (user_addr_t) args->filename, ctx); - - if ((error = namei(ndp))) - goto swapoff_bailout; - nameidone(ndp); - vp = ndp->ni_vp; - - if (vp->v_type != VREG) { - error = EINVAL; - goto swapoff_bailout; - } -#if CONFIG_MACF - vnode_lock(vp); - error = mac_system_check_swapoff(vfs_context_ucred(ctx), vp); - vnode_unlock(vp); - if (error) - goto swapoff_bailout; -#endif - - for(i = 0; i < MAX_BACKING_STORE; i++) { - if(bs_port_table[i].vp == vp) { - break; - } - } - if (i == MAX_BACKING_STORE) { - error = EINVAL; - goto swapoff_bailout; - } - backing_store = (mach_port_t)bs_port_table[i].bs; - - orig_iopol_disk = proc_get_task_policy(current_task(), current_thread(), - TASK_POLICY_INTERNAL, TASK_POLICY_IOPOL); - - proc_set_task_policy(current_task(), current_thread(), TASK_POLICY_INTERNAL, - TASK_POLICY_IOPOL, IOPOL_THROTTLE); - - kr = default_pager_backing_store_delete(backing_store); - - proc_set_task_policy(current_task(), current_thread(), TASK_POLICY_INTERNAL, - TASK_POLICY_IOPOL, orig_iopol_disk); - - switch (kr) { - case KERN_SUCCESS: - error = 0; - bs_port_table[i].vp = 0; - /* This vnode is no longer used for swapfile */ - vnode_lock_spin(vp); - CLR(vp->v_flag, VSWAP); - vnode_unlock(vp); - - /* get rid of macx_swapon() "long term" reference */ - vnode_rele(vp); - - break; - case KERN_FAILURE: - error = EAGAIN; - break; - default: - error = EAGAIN; - break; - } - -swapoff_bailout: - /* get rid of macx_swapoff() namei() reference */ - if (vp) - vnode_put(vp); - lck_mtx_unlock(macx_lock); - AUDIT_MACH_SYSCALL_EXIT(error); - - if (error) - printf("macx_swapoff FAILED - %d\n", error); - else - printf("macx_swapoff SUCCESS\n"); - - return(error); + return ENOTSUP; } /* @@ -585,7 +201,6 @@ macx_swapoff( * Syscall interface to get general swap statistics */ extern uint64_t vm_swap_get_total_space(void); -extern uint64_t vm_swap_get_used_space(void); extern uint64_t vm_swap_get_free_space(void); extern boolean_t vm_swap_up; @@ 
-596,77 +211,19 @@ macx_swapinfo( vm_size_t *pagesize_p, boolean_t *encrypted_p) { - int error; - memory_object_default_t default_pager; - default_pager_info_64_t dpi64; - kern_return_t kr; + if (VM_CONFIG_SWAP_IS_PRESENT) { - error = 0; - if (COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) { + *total_p = vm_swap_get_total_space(); + *avail_p = vm_swap_get_free_space(); + *pagesize_p = (vm_size_t)PAGE_SIZE_64; + *encrypted_p = TRUE; - if (vm_swap_up == TRUE) { - - *total_p = vm_swap_get_total_space(); - *avail_p = vm_swap_get_free_space(); - *pagesize_p = (vm_size_t)PAGE_SIZE_64; - *encrypted_p = TRUE; - - } else { - - *total_p = 0; - *avail_p = 0; - *pagesize_p = 0; - *encrypted_p = FALSE; - } } else { - - /* - * Get a handle on the default pager. - */ - default_pager = MEMORY_OBJECT_DEFAULT_NULL; - kr = host_default_memory_manager(host_priv_self(), &default_pager, 0); - if (kr != KERN_SUCCESS) { - error = EAGAIN; /* XXX why EAGAIN ? */ - goto done; - } - if (default_pager == MEMORY_OBJECT_DEFAULT_NULL) { - /* - * The default pager has not initialized yet, - * so it can't be using any swap space at all. - */ - *total_p = 0; - *avail_p = 0; - *pagesize_p = 0; - *encrypted_p = FALSE; - goto done; - } - /* - * Get swap usage data from default pager. - */ - kr = default_pager_info_64(default_pager, &dpi64); - if (kr != KERN_SUCCESS) { - error = ENOTSUP; - goto done; - } - - /* - * Provide default pager info to caller. 
- */ - *total_p = dpi64.dpi_total_space; - *avail_p = dpi64.dpi_free_space; - *pagesize_p = dpi64.dpi_page_size; - if (dpi64.dpi_flags & DPI_ENCRYPTED) { - *encrypted_p = TRUE; - } else { - *encrypted_p = FALSE; - } - -done: - if (default_pager != MEMORY_OBJECT_DEFAULT_NULL) { - /* release our handle on default pager */ - memory_object_default_deallocate(default_pager); - } + *total_p = 0; + *avail_p = 0; + *pagesize_p = 0; + *encrypted_p = FALSE; } - return error; + return 0; } diff --git a/bsd/vm/vm_compressor_backing_file.c b/bsd/vm/vm_compressor_backing_file.c index 7ec5873db..5ac063e60 100644 --- a/bsd/vm/vm_compressor_backing_file.c +++ b/bsd/vm/vm_compressor_backing_file.c @@ -39,7 +39,6 @@ #include #include #include -#include void vm_swapfile_open(const char *path, vnode_t *vp); void vm_swapfile_close(uint64_t path, vnode_t vp); @@ -108,27 +107,6 @@ vm_swapfile_preallocate(vnode_t vp, uint64_t *size, boolean_t *pin) ctx = vfs_context_current(); -#if CONFIG_PROTECT - { -#if 0 // - - if ((error = cp_vnode_setclass(vp, PROTECTION_CLASS_F))) { - if(config_protect_bug) { - printf("swap protection class set failed with %d\n", error); - } else { - panic("swap protection class set failed with %d\n", error); - } - } -#endif - /* initialize content protection keys manually */ - if ((error = cp_handle_vnop(vp, CP_WRITE_ACCESS, 0)) != 0) { - printf("Content Protection key failure on swap: %d\n", error); - vnode_put(vp); - vp = NULL; - goto done; - } - } -#endif error = vnode_setsize(vp, *size, IO_NOZEROFILL, ctx); if (error) { @@ -145,13 +123,10 @@ vm_swapfile_preallocate(vnode_t vp, uint64_t *size, boolean_t *pin) assert(file_size == *size); if (pin != NULL && *pin != FALSE) { - - assert(vnode_tag(vp) == VT_HFS); - - error = hfs_pin_vnode(VTOHFS(vp), vp, HFS_PIN_IT | HFS_DATALESS_PIN, NULL, ctx); + error = VNOP_IOCTL(vp, FIOPINSWAP, NULL, 0, ctx); if (error) { - printf("hfs_pin_vnode for swap files failed: %d\n", error); + printf("pin for swap files failed: %d, 
file_size = %lld\n", error, file_size); /* this is not fatal, carry on with files wherever they landed */ *pin = FALSE; error = 0; diff --git a/bsd/vm/vm_unix.c b/bsd/vm/vm_unix.c index 73abe01da..612b81b20 100644 --- a/bsd/vm/vm_unix.c +++ b/bsd/vm/vm_unix.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2016 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -48,6 +48,7 @@ #include #include #include +#include #include #include #include @@ -104,6 +105,46 @@ int _shared_region_map_and_slide(struct proc*, int, unsigned int, struct shared_file_mapping_np*, uint32_t, user_addr_t, user_addr_t); int shared_region_copyin_mappings(struct proc*, user_addr_t, unsigned int, struct shared_file_mapping_np *); +#if VM_MAP_DEBUG_APPLE_PROTECT +SYSCTL_INT(_vm, OID_AUTO, map_debug_apple_protect, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_map_debug_apple_protect, 0, ""); +#endif /* VM_MAP_DEBUG_APPLE_PROTECT */ + +#if VM_MAP_DEBUG_FOURK +SYSCTL_INT(_vm, OID_AUTO, map_debug_fourk, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_map_debug_fourk, 0, ""); +#endif /* VM_MAP_DEBUG_FOURK */ + +#if DEVELOPMENT || DEBUG + +static int +sysctl_kmem_alloc_contig SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2) + vm_offset_t kaddr; + kern_return_t kr; + int error = 0; + int size = 0; + + error = sysctl_handle_int(oidp, &size, 0, req); + if (error || !req->newptr) + return (error); + + kr = kmem_alloc_contig(kernel_map, &kaddr, (vm_size_t)size, 0, 0, 0, 0, VM_KERN_MEMORY_IOKIT); + + if (kr == KERN_SUCCESS) + kmem_free(kernel_map, kaddr, size); + + return error; +} + +SYSCTL_PROC(_vm, OID_AUTO, kmem_alloc_contig, CTLTYPE_INT|CTLFLAG_WR|CTLFLAG_LOCKED|CTLFLAG_MASKED, + 0, 0, &sysctl_kmem_alloc_contig, "I", ""); + +extern int vm_region_footprint; +SYSCTL_INT(_vm, OID_AUTO, region_footprint, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_region_footprint, 0, ""); + +#endif /* DEVELOPMENT || DEBUG */ + + #if DEVELOPMENT || DEBUG extern int 
radar_20146450; @@ -115,6 +156,7 @@ SYSCTL_INT(_vm, OID_AUTO, macho_printf, CTLFLAG_RW | CTLFLAG_LOCKED, &macho_prin extern int apple_protect_pager_data_request_debug; SYSCTL_INT(_vm, OID_AUTO, apple_protect_pager_data_request_debug, CTLFLAG_RW | CTLFLAG_LOCKED, &apple_protect_pager_data_request_debug, 0, ""); + #endif /* DEVELOPMENT || DEBUG */ SYSCTL_INT(_vm, OID_AUTO, vm_do_collapse_compressor, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.do_collapse_compressor, 0, ""); @@ -155,13 +197,13 @@ __attribute__((noinline)) int __KERNEL_WAITING_ON_TASKGATED_CHECK_ACCESS_UPCALL_ * Sysctl's related to data/stack execution. See osfmk/vm/vm_map.c */ -#ifndef SECURE_KERNEL +#if DEVELOPMENT || DEBUG extern int allow_stack_exec, allow_data_exec; SYSCTL_INT(_vm, OID_AUTO, allow_stack_exec, CTLFLAG_RW | CTLFLAG_LOCKED, &allow_stack_exec, 0, ""); SYSCTL_INT(_vm, OID_AUTO, allow_data_exec, CTLFLAG_RW | CTLFLAG_LOCKED, &allow_data_exec, 0, ""); -#endif /* !SECURE_KERNEL */ +#endif /* DEVELOPMENT || DEBUG */ static const char *prot_values[] = { "none", @@ -181,6 +223,13 @@ log_stack_execution_failure(addr64_t vaddr, vm_prot_t prot) current_proc()->p_comm, current_proc()->p_pid, vaddr, prot_values[prot & VM_PROT_ALL]); } +/* + * shared_region_unnest_logging: level of logging of unnesting events + * 0 - no logging + * 1 - throttled logging of unexpected unnesting events (default) + * 2 - unthrottled logging of unexpected unnesting events + * 3+ - unthrottled logging of all unnesting events + */ int shared_region_unnest_logging = 1; SYSCTL_INT(_vm, OID_AUTO, shared_region_unnest_logging, CTLFLAG_RW | CTLFLAG_LOCKED, @@ -206,27 +255,46 @@ SYSCTL_INT(_vm, OID_AUTO, enforce_shared_cache_dir, CTLFLAG_RW | CTLFLAG_LOCKED, static int64_t last_unnest_log_time = 0; static int shared_region_unnest_log_count = 0; -void log_unnest_badness( +void +log_unnest_badness( vm_map_t m, vm_map_offset_t s, - vm_map_offset_t e) { + vm_map_offset_t e, + boolean_t is_nested_map, + vm_map_offset_t 
lowest_unnestable_addr) +{ struct timeval tv; if (shared_region_unnest_logging == 0) return; - if (shared_region_unnest_logging == 1) { + if (shared_region_unnest_logging <= 2 && + is_nested_map && + s >= lowest_unnestable_addr) { + /* + * Unnesting of writable map entries is fine. + */ + return; + } + + if (shared_region_unnest_logging <= 1) { microtime(&tv); - if ((tv.tv_sec - last_unnest_log_time) < vm_shared_region_unnest_log_interval) { - if (shared_region_unnest_log_count++ > shared_region_unnest_log_count_threshold) + if ((tv.tv_sec - last_unnest_log_time) < + vm_shared_region_unnest_log_interval) { + if (shared_region_unnest_log_count++ > + shared_region_unnest_log_count_threshold) return; - } - else { + } else { last_unnest_log_time = tv.tv_sec; shared_region_unnest_log_count = 0; } } + DTRACE_VM4(log_unnest_badness, + vm_map_t, m, + vm_map_offset_t, s, + vm_map_offset_t, e, + vm_map_offset_t, lowest_unnestable_addr); printf("%s[%d] triggered unnest of range 0x%qx->0x%qx of DYLD shared region in VM map %p. 
While not abnormal for debuggers, this increases system memory footprint until the target exits.\n", current_proc()->p_comm, current_proc()->p_pid, (uint64_t)s, (uint64_t)e, (void *) VM_KERNEL_ADDRPERM(m)); } @@ -506,7 +574,10 @@ pid_for_task( if (p) { pid = proc_pid(p); err = KERN_SUCCESS; - } else { + } else if (is_corpsetask(t1)) { + pid = task_pid(t1); + err = KERN_SUCCESS; + }else { err = KERN_FAILURE; } } @@ -1178,6 +1249,7 @@ _shared_region_map_and_slide( #endif memory_object_control_t file_control; struct vm_shared_region *shared_region; + uint32_t i; SHARED_REGION_TRACE_DEBUG( ("shared_region: %p [%d(%s)] -> map\n", @@ -1258,16 +1330,6 @@ _shared_region_map_and_slide( } #endif /* MAC */ -#if CONFIG_PROTECT - /* check for content protection access */ - { - error = cp_handle_vnop(vp, CP_READ_ACCESS | CP_WRITE_ACCESS, 0); - if (error) { - goto done; - } - } -#endif /* CONFIG_PROTECT */ - /* make sure vnode is on the process's root volume */ root_vp = p->p_fd->fd_rdir; if (root_vp == NULL) { @@ -1372,6 +1434,36 @@ _shared_region_map_and_slide( goto done; } + /* check that the mappings are properly covered by code signatures */ + if (!cs_enforcement(NULL)) { + /* code signing is not enforced: no need to check */ + } else for (i = 0; i < mappings_count; i++) { + if (mappings[i].sfm_init_prot & VM_PROT_ZF) { + /* zero-filled mapping: not backed by the file */ + continue; + } + if (ubc_cs_is_range_codesigned(vp, + mappings[i].sfm_file_offset, + mappings[i].sfm_size)) { + /* this mapping is fully covered by code signatures */ + continue; + } + SHARED_REGION_TRACE_ERROR( + ("shared_region: %p [%d(%s)] map(%p:'%s'): " + "mapping #%d/%d [0x%llx:0x%llx:0x%llx:0x%x:0x%x] " + "is not code-signed\n", + (void *)VM_KERNEL_ADDRPERM(current_thread()), + p->p_pid, p->p_comm, + (void *)VM_KERNEL_ADDRPERM(vp), vp->v_name, + i, mappings_count, + mappings[i].sfm_address, + mappings[i].sfm_size, + mappings[i].sfm_file_offset, + mappings[i].sfm_max_prot, + 
mappings[i].sfm_init_prot)); + error = EINVAL; + goto done; + } /* get the process's shared region (setup in vm_map_exec()) */ shared_region = vm_shared_region_get(current_task()); @@ -1593,7 +1685,7 @@ SYSCTL_QUAD(_vm, OID_AUTO, reusable_success, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_stats_reusable.reusable_pages_success, ""); SYSCTL_QUAD(_vm, OID_AUTO, reusable_failure, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_stats_reusable.reusable_pages_failure, ""); -SYSCTL_QUAD(_vm, OID_AUTO, reusable_shared, CTLFLAG_RD | CTLFLAG_LOCKED, +SYSCTL_QUAD(_vm, OID_AUTO, reusable_pages_shared, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_stats_reusable.reusable_pages_shared, ""); SYSCTL_QUAD(_vm, OID_AUTO, all_reusable_calls, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_stats_reusable.all_reusable_calls, ""); @@ -1613,6 +1705,12 @@ SYSCTL_QUAD(_vm, OID_AUTO, can_reuse_failure, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_stats_reusable.can_reuse_failure, ""); SYSCTL_QUAD(_vm, OID_AUTO, reusable_reclaimed, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_stats_reusable.reusable_reclaimed, ""); +SYSCTL_QUAD(_vm, OID_AUTO, reusable_nonwritable, CTLFLAG_RD | CTLFLAG_LOCKED, + &vm_page_stats_reusable.reusable_nonwritable, ""); +SYSCTL_QUAD(_vm, OID_AUTO, reusable_shared, CTLFLAG_RD | CTLFLAG_LOCKED, + &vm_page_stats_reusable.reusable_shared, ""); +SYSCTL_QUAD(_vm, OID_AUTO, free_shared, CTLFLAG_RD | CTLFLAG_LOCKED, + &vm_page_stats_reusable.free_shared, ""); extern unsigned int vm_page_free_count, vm_page_speculative_count; @@ -1622,6 +1720,10 @@ SYSCTL_UINT(_vm, OID_AUTO, page_speculative_count, CTLFLAG_RD | CTLFLAG_LOCKED, extern unsigned int vm_page_cleaned_count; SYSCTL_UINT(_vm, OID_AUTO, page_cleaned_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_cleaned_count, 0, "Cleaned queue size"); +extern unsigned int vm_page_pageable_internal_count, vm_page_pageable_external_count; +SYSCTL_UINT(_vm, OID_AUTO, page_pageable_internal_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_pageable_internal_count, 0, ""); +SYSCTL_UINT(_vm, 
OID_AUTO, page_pageable_external_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_pageable_external_count, 0, ""); + /* pageout counts */ extern unsigned int vm_pageout_inactive_dirty_internal, vm_pageout_inactive_dirty_external, vm_pageout_inactive_clean, vm_pageout_speculative_clean, vm_pageout_inactive_used; extern unsigned int vm_pageout_freed_from_inactive_clean, vm_pageout_freed_from_speculative; @@ -1658,6 +1760,37 @@ extern int64_t vm_prefault_nb_pages, vm_prefault_nb_bailout; SYSCTL_QUAD(_vm, OID_AUTO, prefault_nb_pages, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_prefault_nb_pages, ""); SYSCTL_QUAD(_vm, OID_AUTO, prefault_nb_bailout, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_prefault_nb_bailout, ""); +#if CONFIG_SECLUDED_MEMORY + +SYSCTL_UINT(_vm, OID_AUTO, num_tasks_can_use_secluded_mem, CTLFLAG_RD | CTLFLAG_LOCKED, &num_tasks_can_use_secluded_mem, 0, ""); +extern unsigned int vm_page_secluded_target; +extern unsigned int vm_page_secluded_count; +extern unsigned int vm_page_secluded_count_free; +extern unsigned int vm_page_secluded_count_inuse; +SYSCTL_UINT(_vm, OID_AUTO, page_secluded_target, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_target, 0, ""); +SYSCTL_UINT(_vm, OID_AUTO, page_secluded_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_count, 0, ""); +SYSCTL_UINT(_vm, OID_AUTO, page_secluded_count_free, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_count_free, 0, ""); +SYSCTL_UINT(_vm, OID_AUTO, page_secluded_count_inuse, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_count_inuse, 0, ""); + +extern struct vm_page_secluded_data vm_page_secluded; +SYSCTL_UINT(_vm, OID_AUTO, page_secluded_eligible, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.eligible_for_secluded, 0, ""); +SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_success_free, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_success_free, 0, ""); +SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_success_other, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_success_other, 0, ""); +SYSCTL_UINT(_vm, 
OID_AUTO, page_secluded_grab_failure_locked, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_failure_locked, 0, ""); +SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_failure_state, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_failure_state, 0, ""); +SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_failure_dirty, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_failure_dirty, 0, ""); +SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_for_iokit, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_for_iokit, 0, ""); +SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_for_iokit_success, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_for_iokit_success, 0, ""); + +extern uint64_t vm_pageout_freed_from_secluded; +extern uint64_t vm_pageout_secluded_reactivated; +extern uint64_t vm_pageout_secluded_burst_count; +SYSCTL_QUAD(_vm, OID_AUTO, pageout_freed_from_secluded, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_freed_from_secluded, ""); +SYSCTL_QUAD(_vm, OID_AUTO, pageout_secluded_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_secluded_reactivated, "Secluded pages reactivated"); /* sum of all reactivated AND busy and nolock (even though those actually get reDEactivated */ +SYSCTL_QUAD(_vm, OID_AUTO, pageout_secluded_burst_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_secluded_burst_count, ""); + +#endif /* CONFIG_SECLUDED_MEMORY */ + #include #include diff --git a/bsd/vm/vnode_pager.c b/bsd/vm/vnode_pager.c index 45e338a5e..b70717b92 100644 --- a/bsd/vm/vnode_pager.c +++ b/bsd/vm/vnode_pager.c @@ -728,11 +728,9 @@ vnode_pagein( if(error == EAGAIN) { ubc_upl_abort_range(upl, (upl_offset_t) xoff, xsize, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_RESTART); } -#if CONFIG_PROTECT if(error == EPERM) { ubc_upl_abort_range(upl, (upl_offset_t) xoff, xsize, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR); } -#endif } result = PAGER_ERROR; error = PAGER_ERROR; @@ -747,24 +745,6 @@ vnode_pagein( return (error); } -void -vnode_pager_shutdown(void) -{ - int i; - vnode_t vp; - - for(i = 0; 
i < MAX_BACKING_STORE; i++) { - vp = (vnode_t)(bs_port_table[i]).vp; - if (vp) { - (bs_port_table[i]).vp = 0; - - /* get rid of macx_swapon() reference */ - vnode_rele(vp); - } - } -} - - void * upl_get_internal_page_list(upl_t upl) { diff --git a/config/BSDKernel.exports b/config/BSDKernel.exports index f858726ab..82e56171e 100644 --- a/config/BSDKernel.exports +++ b/config/BSDKernel.exports @@ -199,8 +199,6 @@ _futimes _fuword _groupmember _hashinit -_hfs_addconverter -_hfs_remconverter _ifaddr_address _ifaddr_address_family _ifaddr_dstaddress @@ -346,6 +344,7 @@ _kauth_unlisten_scope _kdebug_enable _kernel_debug _kernel_debug1 +_kernel_debug_filtered _kernel_debug_enter _kernel_debug_register_callback _kernproc @@ -425,6 +424,7 @@ _minphys _msleep _nanotime _nanouptime +_nd6_lookup_ipv6 _net_init_add _nop_access _nop_advlock @@ -565,7 +565,6 @@ _sysctl_handle_long _sysctl_handle_opaque _sysctl_handle_quad _sysctl_handle_string -_sysctl_register_fixed _sysctl_register_oid _sysctl_unregister_oid _thread_issignal @@ -765,6 +764,7 @@ _vnop_blockmap_desc _vnop_bwrite_desc _vnop_close_desc _vnop_copyfile_desc +_vnop_clonefile_desc _vnop_create_desc _vnop_default_desc _vnop_exchange_desc @@ -794,6 +794,7 @@ _vnop_reclaim_desc _vnop_remove_desc _vnop_removexattr_desc _vnop_rename_desc +_vnop_renamex_desc _vnop_revoke_desc _vnop_rmdir_desc _vnop_searchfs_desc diff --git a/config/BSDKernel.x86_64.exports b/config/BSDKernel.x86_64.exports index 4fb38fe48..0d534ae6d 100644 --- a/config/BSDKernel.x86_64.exports +++ b/config/BSDKernel.x86_64.exports @@ -11,4 +11,4 @@ _mbuf_pkthdr_setheader _mbuf_setlen _mbuf_setnextpkt _mbuf_type -_nd6_lookup_ipv6 +_sysctl_register_fixed diff --git a/config/IOKit.exports b/config/IOKit.exports index 958678722..ec0b12790 100644 --- a/config/IOKit.exports +++ b/config/IOKit.exports @@ -142,6 +142,7 @@ __ZN10IOWorkLoop13_maintRequestEPvS0_S0_S0_ __ZN10IOWorkLoop14addEventSourceEP13IOEventSource __ZN10IOWorkLoop15runEventSourcesEv 
__ZN10IOWorkLoop17removeEventSourceEP13IOEventSource +__ZN10IOWorkLoop18setMaximumLockTimeEyj __ZN10IOWorkLoop19signalWorkAvailableEv __ZN10IOWorkLoop4freeEv __ZN10IOWorkLoop4initEv @@ -1612,6 +1613,7 @@ __ZN19IOHistogramReporter10gMetaClassE __ZN19IOHistogramReporter10superClassE __ZN19IOHistogramReporter10tallyValueEx __ZN19IOHistogramReporter18handleCreateLegendEv +__ZN19IOHistogramReporter20overrideBucketValuesEjyxxx __ZN19IOHistogramReporter4freeEv __ZN19IOHistogramReporter4withEP9IOServicetyPKcyiP24IOHistogramSegmentConfig __ZN19IOHistogramReporter8initWithEP9IOServicetyPK8OSSymbolyiP24IOHistogramSegmentConfig diff --git a/config/Libkern.exports b/config/Libkern.exports index 6f67de350..93dfdd556 100644 --- a/config/Libkern.exports +++ b/config/Libkern.exports @@ -44,6 +44,9 @@ _OSlibkernInit _SHA1Final _SHA1Init _SHA1Update +_SHA384_Final +_SHA384_Init +_SHA384_Update _STRDUP __Z13OSUnserializePKcPP8OSString __Z16OSUnserializeXMLPKcPP8OSString @@ -391,7 +394,6 @@ __ZN9OSBooleanD2Ev __ZNK10OSIterator12getMetaClassEv __ZNK10OSIterator9MetaClass5allocEv __ZNK11OSMetaClass12getClassNameEv -__ZNK11OSMetaClass18getClassNameSymbolEv __ZNK11OSMetaClass12getClassSizeEv __ZNK11OSMetaClass12getMetaClassEv __ZNK11OSMetaClass12taggedRetainEPKv @@ -402,6 +404,7 @@ __ZNK11OSMetaClass13taggedReleaseEPKvi __ZNK11OSMetaClass14getRetainCountEv __ZNK11OSMetaClass14reservedCalledEi __ZNK11OSMetaClass16getInstanceCountEv +__ZNK11OSMetaClass18getClassNameSymbolEv __ZNK11OSMetaClass18instanceDestructedEv __ZNK11OSMetaClass19instanceConstructedEv __ZNK11OSMetaClass6retainEv @@ -587,8 +590,11 @@ __Znam __Znwm ___bzero ___cxa_pure_virtual +___llvm_profile_runtime ___stack_chk_fail ___stack_chk_guard +__os_log_default +__os_log_internal _adler32 _atoi _bcmp @@ -628,10 +634,11 @@ _invalidate_icache64 _itoa _kOSBooleanFalse _kOSBooleanTrue +_kdp_lck_spin_is_acquired _kern_os_free _kern_os_malloc -_kern_os_malloc_size _kern_os_realloc +_kext_assertions_enable _kprintf _lck_attr_alloc_init 
_lck_attr_free @@ -644,11 +651,14 @@ _lck_grp_attr_setdefault _lck_grp_attr_setstat _lck_grp_free _lck_mtx_alloc_init +_lck_mtx_convert_spin _lck_mtx_destroy _lck_mtx_free _lck_mtx_init _lck_mtx_lock +_lck_mtx_lock_spin _lck_mtx_try_lock +_lck_mtx_try_lock_spin _lck_mtx_unlock _lck_rw_alloc_init _lck_rw_destroy @@ -677,6 +687,11 @@ _memset _ml_at_interrupt_context _ml_get_interrupts_enabled _ml_set_interrupts_enabled +_os_log_create +_os_log_debug_enabled +_os_log_info_enabled +_os_release +_os_retain _osrelease _ostype _page_mask @@ -729,4 +744,3 @@ _vsnprintf _vsscanf _zError _zlibVersion -___llvm_profile_runtime diff --git a/config/Libkern.x86_64.exports b/config/Libkern.x86_64.exports index f67db63a8..9ea8e005a 100644 --- a/config/Libkern.x86_64.exports +++ b/config/Libkern.x86_64.exports @@ -128,9 +128,6 @@ __ZN9OSBoolean19_RESERVEDOSBoolean4Ev __ZN9OSBoolean19_RESERVEDOSBoolean5Ev __ZN9OSBoolean19_RESERVEDOSBoolean6Ev __ZN9OSBoolean19_RESERVEDOSBoolean7Ev -_lck_mtx_convert_spin -_lck_mtx_lock_spin -_lck_mtx_try_lock_spin _sprintf _strcat _strcpy diff --git a/config/MACFramework.exports b/config/MACFramework.exports index d6f7ad04d..6c190be01 100644 --- a/config/MACFramework.exports +++ b/config/MACFramework.exports @@ -3,6 +3,9 @@ _mac_policy_unregister _mac_vnop_getxattr _mac_vnop_setxattr _mac_vnop_removexattr +_mac_file_getxattr +_mac_file_setxattr +_mac_file_removexattr _mac_label_get _mac_label_set diff --git a/config/MASTER b/config/MASTER index 4e5023e9b..16a1f7853 100644 --- a/config/MASTER +++ b/config/MASTER @@ -63,6 +63,7 @@ options MACH_FASTLINK # Fast symbolic links options MACH_HOST # Mach host (resource alloc.) 
# options MACH_IPC_COMPAT # Enable old IPC interface # options MACH_IPC_TEST # Testing code/printfs # +options MACH_FLIPC # Fast-Local IPC # options MACH_NP # Mach IPC support # options MACH_NBC # No buffer cache # options MACH_NET # Fast network access # @@ -85,13 +86,8 @@ options MEASURE_BW # interface bandwidth measurement # options CLASSQ_BLUE # BLUE queueing algorithm # options CLASSQ_RED # RED queueing algorithm # options CLASSQ_RIO # RIO queueing algorithm # -options IPDIVERT # Divert sockets (for NAT) # -options IPFIREWALL # IP Firewalling (used by NAT) # -options IPFIREWALL_FORWARD #Transparent proxy # -options IPFIREWALL_DEFAULT_TO_ACCEPT # allow everything by default # options DUMMYNET # dummynet support # options TRAFFIC_MGT # traffic management support # -options IPFW2 # IP firewall (new version) # options MULTICAST # Internet Protocol Class-D $ options TCPDEBUG # TCP debug # options TCP_DROP_SYNFIN # Drop TCP packets with SYN+FIN set # @@ -107,6 +103,7 @@ options FLOW_DIVERT # options NECP # options CONTENT_FILTER # # options PACKET_MANGLER # # + # secure_kernel - secure kernel from user programs options SECURE_KERNEL # @@ -124,26 +121,19 @@ options NETWORKING # networking layer # options CONFIG_FSE # file system events # options CONFIG_IMAGEBOOT # local image boot # options CONFIG_MBUF_JUMBO # jumbo cluster pool # -options CONFIG_FORCE_OUT_IFP # Enable IP_FORCE_OUT_IFP # -options CONFIG_IFEF_NOWINDOWSCALE # Scale TCP window per driver # options CONFIG_WORKQUEUE # # # 4.4 filesystems # -options HFS # HFS/HFS+ support # options MOCKFS # Boot from an executable # options FIFO # fifo support # options FDESC # fdesc_fs support # options DEVFS # devfs support # options ROUTEFS # routefs support # -options JOURNALING # journaling support # -options HFS_COMPRESSION # hfs compression # -options CONFIG_HFS_STD # hfs standard support # -options CONFIG_HFS_TRIM # hfs trims unused blocks # -options CONFIG_HFS_MOUNT_UNMAP # hfs trims blocks at mount # -options 
CONFIG_HFS_DIRLINK # allow directory hardlink creation # +options NULLFS # nullfs support # +options FS_COMPRESSION # fs compression # options CONFIG_DEV_KMEM # /dev/kmem device for reading KVA # # @@ -157,7 +147,6 @@ options CONFIG_IMGSRC_ACCESS # source of imageboot dmg # options CONFIG_TRIGGERS # trigger vnodes # options CONFIG_EXT_RESOLVER # e.g. memberd # options CONFIG_SEARCHFS # searchfs syscall support # -options CONFIG_SECLUDED_RENAME # secluded rename syscall # # # NFS support @@ -314,6 +303,7 @@ options CONFIG_MSG_BSIZE=16384 # options CONFIG_IPC_TABLE_ENTRIES_STEPS=64 # 137898 entries # options CONFIG_IPC_TABLE_ENTRIES_STEPS=256 # 300714 entries # + # # configurable kernel - use these options to strip strings from panic # and printf calls. @@ -358,6 +348,13 @@ options CONFIG_MEMORYSTATUS # # options CONFIG_JETSAM # +# +# enable new link table implementation stats/debugging +# (adds mesaureable overhead) +# +options CONFIG_LTABLE_STATS # +options CONFIG_LTABLE_DEBUG # + # # enable new wait queue implementation stats / debugging # @@ -382,11 +379,20 @@ options CONFIG_PHANTOM_CACHE # # options VM_PRESSURE_EVENTS # +options CONFIG_SECLUDED_MEMORY # + +options CONFIG_BACKGROUND_QUEUE # + # # I/O Scheduling # options CONFIG_IOSCHED # +# +# Accounting for I/O usage +# +options CONFIG_IO_ACCOUNTING # + # # Enable inheritance of importance through specially marked mach ports and for file locks # For now debug is enabled wherever inheritance is @@ -403,6 +409,11 @@ options CONFIG_PROC_UUID_POLICY # # options CONFIG_ECC_LOGGING # +# +# Application core dumps +# +options CONFIG_COREDUMP # + # # Ethernet (ARP) # @@ -513,7 +524,6 @@ options CONFIG_AUDIT # Kernel auditing # # forcibly suspending tasks when the demand exceeds supply. This # option should be on. 
# -options MACH_PAGEMAP # options MACH_RT options TASK_SWAPPER # @@ -553,10 +563,6 @@ options ZONE_DEBUG # # options CONFIG_ZLEAKS # Live zone leak debugging # -# -options ZONE_ALIAS_ADDR # # - - # # CONFIG_TASK_ZONE_INFO allows per-task zone information to be extracted # Primarily useful for xnu debug and development. @@ -697,6 +703,11 @@ options CONFIG_XNUPOST # # options PROC_REF_DEBUG # +# +# Kernel OS reason debug instrumentation +# +options OS_REASON_DEBUG # + # # Kernel Voucher Attr Manager for Activity Trace # diff --git a/config/MASTER.x86_64 b/config/MASTER.x86_64 index 3baf698bc..94cbf4233 100644 --- a/config/MASTER.x86_64 +++ b/config/MASTER.x86_64 @@ -18,15 +18,15 @@ # # KERNEL_BASE = [ intel medium config_requires_u32_munging ] # KERNEL_RELEASE = [ KERNEL_BASE ] -# KERNEL_DEV = [ KERNEL_BASE development mach_assert config_xnupost proc_ref_debug] -# KERNEL_DEBUG = [ KERNEL_BASE debug mach_assert config_waitq_stats config_waitq_debug ] -# BSD = [ mach_bsd sysv_sem sysv_msg sysv_shm config_imageboot config_workqueue psynch config_proc_uuid_policy ] -# FILESYS_BASE = [ devfs hfs journaling fdesc config_dev_kmem config_fse quota namedstreams fifo config_volfs hfs_compression config_hfs_std config_hfs_alloc_rbtree config_hfs_trim config_imgsrc_access config_triggers config_ext_resolver config_searchfs config_hfs_dirlink config_appledouble ] +# KERNEL_DEV = [ KERNEL_BASE development mach_assert config_xnupost proc_ref_debug os_reason_debug ] +# KERNEL_DEBUG = [ KERNEL_BASE debug mach_assert config_ltable_stats config_ltable_debug config_waitq_stats config_waitq_debug ] +# BSD = [ mach_bsd sysv_sem sysv_msg sysv_shm config_imageboot config_workqueue psynch config_proc_uuid_policy config_coredump ] +# FILESYS_BASE = [ devfs fdesc config_dev_kmem config_fse quota namedstreams fifo config_volfs fs_compression config_imgsrc_access config_triggers config_ext_resolver config_searchfs config_appledouble nullfs ] # FILESYS_RELEASE= [ FILESYS_BASE ] # FILESYS_DEV = [ 
FILESYS_BASE ] # FILESYS_DEBUG = [ FILESYS_BASE ] # NFS = [ nfsclient nfsserver ] -# NETWORKING = [ inet inet6 ipv6send tcpdrop_synfin bpfilter ipdivert ipfirewall ipv6firewall ipfw2 dummynet traffic_mgt sendfile ah_all_crypto bond vlan gif stf ifnet_input_chk config_mbuf_jumbo if_bridge ipcomp_zlib MULTIPATH packet_mangler ] +# NETWORKING = [ inet inet6 ipv6send tcpdrop_synfin bpfilter dummynet traffic_mgt sendfile ah_all_crypto bond vlan gif stf ifnet_input_chk config_mbuf_jumbo if_bridge ipcomp_zlib MULTIPATH packet_mangler ] # VPN = [ ipsec flow_divert necp content_filter ] # PF = [ pf pflog ] # PKTSCHED = [ pktsched_cbq pktsched_fairq pktsched_hfsc pktsched_priq ] @@ -41,7 +41,7 @@ # LIBKERN_DEV = [ LIBKERN_BASE iotracking ] # LIBKERN_DEBUG = [ LIBKERN_BASE iotracking ] # PERF_DBG = [ config_dtrace mach_kdp config_serial_kdp kdp_interactive_debugging kperf kpc zleaks config_gzalloc ] -# MACH_BASE = [ mach config_kext_basement mdebug ipc_debug config_mca config_vmx config_mtrr config_lapic config_telemetry importance_inheritance config_atm config_bank config_coalitions hypervisor config_iosched config_sysdiagnose ] +# MACH_BASE = [ mach config_kext_basement mdebug ipc_debug config_mca config_vmx config_mtrr config_lapic config_telemetry importance_inheritance config_atm config_bank config_coalitions hypervisor config_iosched config_sysdiagnose ] # MACH_RELEASE = [ MACH_BASE ] # MACH_DEV = [ MACH_BASE task_zone_info ] # MACH_DEBUG = [ MACH_BASE task_zone_info ] @@ -49,11 +49,11 @@ # SCHED_RELEASE = [ SCHED_BASE ] # SCHED_DEV = [ SCHED_BASE ] # SCHED_DEBUG = [ SCHED_BASE config_sched_grrr config_sched_proto ] -# VM = [ vm_pressure_events memorystatus dynamic_codesigning config_code_decryption encrypted_swap phantom_cache] +# VM = [ vm_pressure_events memorystatus dynamic_codesigning config_code_decryption encrypted_swap phantom_cache config_background_queue] # SECURITY = [ config_macf config_audit config_csr ] -# RELEASE = [ KERNEL_RELEASE BSD FILESYS_RELEASE NFS 
NETWORKING PF VPN IOKIT_RELEASE LIBKERN_RELEASE PERF_DBG MACH_RELEASE SCHED_RELEASE VM SECURITY ] -# DEVELOPMENT = [ KERNEL_DEV BSD FILESYS_DEV NFS NETWORKING PF VPN IOKIT_DEV LIBKERN_DEV PERF_DBG MACH_DEV SCHED_DEV VM SECURITY ] -# DEBUG = [ KERNEL_DEBUG BSD FILESYS_DEBUG NFS NETWORKING PF VPN IOKIT_DEBUG LIBKERN_DEBUG PERF_DBG MACH_DEBUG SCHED_DEBUG VM SECURITY ] +# RELEASE = [ KERNEL_RELEASE BSD FILESYS_RELEASE NFS SKYWALK_RELEASE NETWORKING PF VPN IOKIT_RELEASE LIBKERN_RELEASE PERF_DBG MACH_RELEASE SCHED_RELEASE VM SECURITY ] +# DEVELOPMENT = [ KERNEL_DEV BSD FILESYS_DEV NFS SKYWALK_DEV NETWORKING PF VPN IOKIT_DEV LIBKERN_DEV PERF_DBG MACH_DEV SCHED_DEV VM SECURITY ] +# DEBUG = [ KERNEL_DEBUG BSD FILESYS_DEBUG NFS SKYWALK_DEBUG NETWORKING PF VPN IOKIT_DEBUG LIBKERN_DEBUG PERF_DBG MACH_DEBUG SCHED_DEBUG VM SECURITY ] # ###################################################################### # diff --git a/config/Mach.exports b/config/Mach.exports index 1ea2e2030..09ca16fb4 100644 --- a/config/Mach.exports +++ b/config/Mach.exports @@ -23,6 +23,7 @@ _lck_rw_sleep_deadline _lck_spin_sleep _lck_spin_sleep_deadline _mach_absolute_time +_mach_continuous_time _mach_msg_send_from_kernel_proper _mach_vm_pressure_level_monitor _mach_vm_pressure_monitor @@ -54,6 +55,7 @@ _thread_deallocate _thread_policy_set _thread_reference _thread_terminate +_thread_tid _thread_wakeup_prim _vm_kernel_addrperm_external _vm_kernel_unslide_or_perm_external diff --git a/config/Makefile b/config/Makefile index 73a907635..d88a78568 100644 --- a/config/Makefile +++ b/config/Makefile @@ -53,7 +53,7 @@ $(OBJPATH)/allsymbols: $(OBJPATH)/$(KERNEL_FILE_NAME) $(_v)$(NM) -gj $< > $@ $(SYMBOL_SET_BUILD): $(OBJPATH)/%.symbolset : %.exports %.$(EXPORT_SOURCE_ARCH_CONFIG_LC).exports $(OBJPATH)/allsymbols $(KEXT_CREATE_SYMBOL_SET) - @echo SYMBOLSET $* "($(CURRENT_ARCH_CONFIG_LC))" + @echo "$(ColorH)SYMBOLSET$(Color0) $(ColorF)$*$(Color0) \"($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0))\"" 
$(_v)$(KEXT_CREATE_SYMBOL_SET) \ $(ARCH_FLAGS_$(CURRENT_ARCH_CONFIG)) \ -import $(OBJPATH)/allsymbols \ @@ -80,11 +80,11 @@ check_all_exports: $(OBJPATH)/allsymbols $(KEXT_CREATE_SYMBOL_SET) -output /dev/null $(_vstdout) $(OBJPATH)/$(MD_SUPPORTED_KPI_FILENAME): $(EXPORTS_FILES) - @echo SUPPORTED_KPI "($(CURRENT_ARCH_CONFIG_LC))" + @echo "$(ColorH)SUPPORTED_KPI$(Color0) \"($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0))\"" $(_v)$(SRCROOT)/config/list_supported.sh $(SOURCE) $(EXPORT_SOURCE_ARCH_CONFIG_LC) $@ $(OBJPATH)/$(MI_SUPPORTED_KPI_FILENAME): $(EXPORTS_FILES) - @echo SUPPORTED_KPI "(all)" + @echo "$(ColorH)SUPPORTED_KPI$(Color0) \"($(ColorLF)all$(Color0))\"" $(_v)$(SRCROOT)/config/list_supported.sh $(SOURCE) all $@ build_symbol_sets: check_all_exports $(SYMBOL_SET_BUILD) $(OBJPATH)/allsymbols \ @@ -109,23 +109,23 @@ $(SYMROOT_INSTALL_KEXT_MACHO_FILES): ALWAYS $(SYMROOT_INSTALL_KEXT_PLISTS): $(SYMROOT)/% : $(SOURCE)/% $(_v)$(MKDIR) $(dir $@) - @echo INSTALLSYM kextplist $* + @echo "$(ColorH)INSTALLSYM$(ColorH) $(ColorLF)kextplist$(Color0) $(ColorF)$*$(Color0)" $(_v)$(INSTALL) $(DATA_INSTALL_FLAGS) $< $@ $(_v)$(NEWVERS) $@ $(_vstdout) $(DSTROOT_INSTALL_KEXT_PLISTS): $(INSTALL_KEXT_DIR)/% : $(SYMROOT)/% $(_v)$(MKDIR) $(dir $@) - @echo INSTALL kextplist $* + @echo "$(ColorH)INSTALL$(ColorH) $(ColorLF)kextplist$(Color0) $(ColorF)$*$(Color0)" $(_v)$(INSTALL) $(DATA_INSTALL_FLAGS) $< $@ $(DSTROOT_INSTALL_KEXT_MACHO_FILES): $(INSTALL_KEXT_DIR)/% : $(SYMROOT)/% ALWAYS $(_v)$(MKDIR) $(dir $@) - @echo INSTALL $(notdir $@) "($(CURRENT_ARCH_CONFIG_LC))" + @echo "$(ColorF)INSTALL$(Color0) $(ColorF)$(notdir $@)$(Color0) \"($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0))\"" $(_v)$(INSTALL) $(EXEC_INSTALL_FLAGS) $< $@ $(DSTROOT)/$(KRESDIR)/$(MD_SUPPORTED_KPI_FILENAME) $(DSTROOT)/$(KRESDIR)/$(MI_SUPPORTED_KPI_FILENAME): $(DSTROOT)/$(KRESDIR)/% : $(OBJPATH)/% $(_v)$(MKDIR) $(dir $@) - @echo INSTALL $* + @echo "$(ColorH)INSTALL$(Color0) $(ColorF)$*$(Color0)" $(_v)$(INSTALL) 
$(INSTALL_FLAGS) $< $@ do_config_install:: $(SYMROOT_INSTALL_KEXT_MACHO_FILES) \ diff --git a/config/MasterVersion b/config/MasterVersion index 016f89ea8..e239b6df5 100644 --- a/config/MasterVersion +++ b/config/MasterVersion @@ -1,4 +1,4 @@ -15.6.0 +16.0.0 # The first line of this file contains the master version number for the kernel. # All other instances of the kernel version in xnu are derived from this file. diff --git a/config/Private.exports b/config/Private.exports index ea91129b6..119402c9e 100644 --- a/config/Private.exports +++ b/config/Private.exports @@ -12,11 +12,14 @@ __ZN24IOCPUInterruptController* __ZNK24IOCPUInterruptController* __ZTV24IOCPUInterruptController _PE_i_can_has_kernel_configuration +_add_fsevent +_need_fsevent _assert_wait_deadline_with_leeway _assert_wait_timeout_with_leeway _audio_active _b_to_q _bdevsw +_bdevvp _bootcache_contains_block _bsd_hostname _bsd_set_dependency_capable @@ -26,6 +29,8 @@ _buf_kernel_addrperm_addr _buf_setfilter _buf_shadow _bufattr_alloc +_bufattr_cpoff +_bufattr_cpx _bufattr_dup _bufattr_free _bufattr_greedymode @@ -38,21 +43,49 @@ _bufattr_meta _bufattr_nocache _bufattr_passive _bufattr_quickcomplete +_bufattr_rawencrypted +_bufattr_setcpoff +_bufattr_setcpx _bufattr_throttled _cdevsw _cdevsw_setkqueueok _chudxnu_platform_ptr -_chudxnu_thread_get_dirty -_chudxnu_thread_set_dirty _clalloc _clfree _cons_cinput _convert_port_to_task_suspension_token _convert_task_suspension_token_to_port _convert_task_to_port +_cp_get_backup_key +_cp_is_valid_class _cp_key_store_action +_cp_new_key +_cp_os_version _cp_register_wraps +_cp_rewrap_key +_cp_unwrap_key _cpu_to_processor +_cpx_alloc +_cpx_can_copy +_cpx_copy +_cpx_flush +_cpx_free +_cpx_has_key +_cpx_init +_cpx_is_sep_wrapped_key +_cpx_iv_aes_ctx +_cpx_key +_cpx_key_len +_cpx_max_key_len +_cpx_set_aes_iv_key +_cpx_set_is_sep_wrapped_key +_cpx_set_key_len +_cpx_set_use_offset_for_iv +_cpx_set_synthetic_offset_for_iv +_cpx_size +_cpx_sizex +_cpx_use_offset_for_iv 
+_cpx_synthetic_offset_for_iv _cs_blob_reset_cache _cs_debug _cs_enforcement @@ -61,14 +94,21 @@ _cs_entitlements_blob_get _cs_get_cdhash _cs_identity_get _cs_require_lv +_cs_system_require_lv _cs_restricted +_cs_valid +_csblob_entitlements_dictionary_copy +_csblob_entitlements_dictionary_set _csblob_find_blob_bytes +_csblob_get_addr +_csblob_get_base_offset _csblob_get_cdhash _csblob_get_entitlements _csblob_get_identity _csblob_get_platform_binary _csblob_get_flags _csblob_get_teamid +_csblob_get_size _csfg_get_cdhash _csfg_get_path _csfg_get_platform_binary @@ -96,6 +136,7 @@ _ifnet_clone_attach _ifnet_clone_detach _ifnet_dequeue _ifnet_dequeue_multi +_ifnet_dequeue_multi_bytes _ifnet_dequeue_service_class _ifnet_dequeue_service_class_multi _ifnet_disable_output @@ -137,7 +178,16 @@ _ifnet_start _ifnet_subfamily _ifnet_transmit_burst_end _ifnet_transmit_burst_start +_ifnet_tx_compl _ifnet_tx_compl_status +_ifnet_set_packetpreamblelen +_ifnet_packetpreamblelen +_ifnet_maxpacketpreamblelen +_ifnet_set_fastlane_capable +_ifnet_get_fastlane_capable +_ifnet_get_unsent_bytes +_ifnet_get_buffer_status +_ifnet_normalise_unsent_data _in6_localaddr _in6addr_local _in_localaddr @@ -153,19 +203,30 @@ _kauth_cred_grnam2guid _kauth_cred_guid2grnam _kauth_cred_guid2pwnam _kauth_cred_pwnam2guid +_kauth_cred_nfs4domain2dsnode +_kauth_cred_dsnode2nfs4domain +_kcdata_estimate_required_buffer_size +_kcdata_memory_get_used_bytes +_kcdata_memcpy +_kcdata_get_memory_addr +_kcdata_get_memory_addr_for_array _kdp_register_link _kdp_set_interface _kdp_unregister_link _kdp_unregister_send_receive _kern_asl_msg _kern_asl_msg_va +_kern_config_is_development _kern_stack_snapshot_with_reason _kernel_debug_string _kevent_qos_internal +_kevent_qos_internal_bind +_kevent_qos_internal_unbind _kmem_alloc_kobject:_kmem_alloc_kobject_external _kmem_alloc_pageable:_kmem_alloc_pageable_external _kx_qsort _linesw +_localnode_id _log _logwakeup _m_cat @@ -180,6 +241,7 @@ _m_split 
_m_trailingspace:_mbuf_trailingspace _mac_proc_set_enforce _mach_vm_allocate +_mach_vm_behavior_set _mach_vm_deallocate _mach_vm_map _mach_vm_protect @@ -197,13 +259,43 @@ _mbuf_get_traffic_class_max_count _mbuf_is_service_class_privileged:_mbuf_is_traffic_class_privileged _mbuf_pkthdr_aux_flags _mbuf_get_unsent_data_bytes +_mbuf_get_buffer_status +_mbuf_get_timestamp +_mbuf_set_timestamp +_mbuf_register_tx_compl_callback +_mbuf_unregister_tx_compl_callback +_mbuf_get_timestamp_requested +_mbuf_set_timestamp_requested +_mbuf_get_status +_mbuf_set_status +_mbuf_get_tx_compl_data +_mbuf_set_tx_compl_data +_mbuf_get_flowid +_mbuf_set_flowid +_mbuf_pkt_new_flow +_mbuf_last_pkt _mcl_to_paddr +_mnl_instantiate +_mnl_register +_mnl_msg_alloc +_mnl_msg_complete +_mnl_msg_free +_mnl_msg_to_node +_mnl_msg_from_node +_mnl_set_link_state +_mnl_terminate _mountroot_post_hook _net_add_domain:_net_add_domain_old _net_add_proto:_net_add_proto_old _net_del_domain:_net_del_domain_old _net_del_proto:_net_del_proto_old _netboot_root +_os_reason_create +_os_reason_alloc_buffer +_os_reason_get_kcdata_descriptor +_os_reason_ref +_os_reason_free +_panic_with_options _persona_find _persona_get _persona_get_id @@ -224,6 +316,7 @@ _proc_pidversion _proc_set_responsible_pid _proc_task _proc_uniqueid +_priv_check_cred _pru_abort_notsupp _pru_accept_notsupp _pru_bind_notsupp @@ -284,7 +377,7 @@ _thread_call_cancel_wait _thread_clear_eager_preempt _thread_dispatchqaddr _thread_set_eager_preempt -_thread_tid +_thread_set_mach_voucher _throttle_info_create _throttle_info_io_will_be_throttled _throttle_info_mount_ref @@ -296,6 +389,7 @@ _throttle_info_reset_window _throttle_info_update _throttle_info_update_by_mask _throttle_lowpri_io +_throttle_lowpri_window _throttle_set_thread_io_policy _timeout _timeout_with_leeway @@ -327,7 +421,6 @@ _utun_pkt_dtls_input _vfs_context_bind _vfs_context_get_special_port _vfs_context_set_special_port -_vfs_context_kernel _vfs_devvp _vfs_getattr _vfs_getbyid @@ 
-358,9 +451,110 @@ _vnode_isdyldsharedcache _vnode_ismonitored _vnode_istty _vnode_lookup_continue_needed +_vnode_clearnoflush +_vnode_isnoflush _vnop_compound_mkdir_desc _vnop_compound_open_desc _vnop_compound_remove_desc _vnop_compound_rename_desc _vnop_compound_rmdir_desc _vnop_monitor_desc + +# HFS Kext Requirements +_IOBSDMountChange +_OSKextUnloadKextWithLoadTag +_bdwrite_internal +_buf_markstatic +_count_lock_queue +_decmpfs_cnode_destroy +_decmpfs_cnode_get_vnode_cached_size +_decmpfs_cnode_get_vnode_state +_decmpfs_cnode_init +_decmpfs_cnode_alloc +_decmpfs_cnode_free +_decmpfs_cnode_set_vnode_state +_decmpfs_cnode_cmp_type +_decmpfs_ctx +_decmpfs_decompress_file +_decmpfs_file_is_compressed +_decmpfs_hides_rsrc +_decmpfs_hides_xattr +_decmpfs_init +_decmpfs_lock_compressed_data +_decmpfs_pagein_compressed +_decmpfs_read_compressed +_decmpfs_unlock_compressed_data +_decmpfs_update_attributes +_decmpfs_validate_compressed_file +_fp_getfvp +_kauth_cred_issuser +_kdebug_lookup_gen_events +_kdebug_vnode +_set_vm_privilege +_throttle_io_will_be_throttled +_ubc_is_mapped_writable +_ubc_setsize_ex +_ubc_upl_range_needed +_vfs_context_current +_vfs_context_issuser +_vfs_context_kernel +_vfs_ctx_skipatime +_vfs_extendedsecurity +_vfs_update_vfsstat +_vn_pathconf +_vnode_cleardirty +_vnode_clearfastdevicecandidate +_vnode_getname_printable +_vnode_getfromfd +_vnode_isautocandidate +_vnode_isfastdevicecandidate +_vnode_isnamedstream +_vnode_putname_printable +_vnode_setautocandidate +_vnode_setdirty +_vnode_setfastdevicecandidate +_vnode_setnoflush +_vslock +_vsunlock +_vfs_isswapmount +_buf_acquire +_buf_create_shadow_priv +_buf_drop +_build_path +_doc_tombstone_get +_doc_tombstone_should_save +_doc_tombstone_save +_doc_tombstone_clear +_doc_tombstone_should_ignore_name +_nspace_snapshot_event +_vnode_should_flush_after_write +_vfs_setowner +_vfs_idle_time +_mount_set_noreaddirext +_cluster_max_io_size +_vfs_context_cwd +_resolve_nspace_item +_vnode_usecount 
+_vnode_iocount +_vfs_context_iskernel +_mach_to_bsd_errno +_vnode_rele_ext +_proc_is_forcing_hfs_case_sensitivity +_is_package_name +_sysctl__hw_features_children +_task_update_logical_writes +_dqfileclose +_dqfileopen +_dqflush +_dqget +_dqhashinit +_dqisinitialized +_dqlock +_dqrele +_dqsync +_dqsync_orphans +_dqunlock +_qf_get +_qf_put +_dqfileinit +_dqreclaim diff --git a/config/Private.x86_64.exports b/config/Private.x86_64.exports index ec3e60406..bfe836f99 100644 --- a/config/Private.x86_64.exports +++ b/config/Private.x86_64.exports @@ -8,7 +8,6 @@ __ZN22IOInterruptEventSource7warmCPUEy _acpi_install_wake_handler _acpi_sleep_kernel _acpi_idle_kernel -_add_fsevent _apic_table _apply_func_phys _bufattr_delayidlesleep @@ -29,7 +28,6 @@ _mp_broadcast _mp_cpus_call _mp_cpus_call1 _mp_cpus_kick -_need_fsevent _pal_efi_call_in_32bit_mode _pal_efi_call_in_64bit_mode _semaphore_timedwait @@ -50,3 +48,7 @@ _xts_start _aes_decrypt _PE_reboot_on_panic +# HFS Kext Requirements +_file_vnode +_proc_ucred +_suser diff --git a/config/Unsupported.exports b/config/Unsupported.exports index 6cea97213..70375325a 100644 --- a/config/Unsupported.exports +++ b/config/Unsupported.exports @@ -1,6 +1,5 @@ _PE_i_can_has_debugger _Debugger -_FastUnicodeCompare _KUNCExecute _KUNCGetNotificationID _KUNCUserNotificationDisplayAlert @@ -78,9 +77,6 @@ _get_bsdtask_info _get_task_map _get_task_pmap _getsectdatafromheader -_hfs_getconverter -_hfs_pickencoding -_hfs_relconverter _host_get_special_port _host_get_exception_ports _host_priv_self @@ -125,6 +121,7 @@ _mig_dealloc_reply_port _mig_get_reply_port _mig_put_reply_port _mig_strncpy +_mig_strncpy_zerofill _mig_user_allocate _mig_user_deallocate _ml_io_map @@ -162,7 +159,7 @@ _task_resume _task_resume2 _task_suspend _task_suspend2 -_thread_tid +_thread_wakeup_thread _tsleep _ubc_cs_blob_get _vfs_context_current diff --git a/config/Unused.exports b/config/Unused.exports index 4acf84e35..976fb68de 100644 --- a/config/Unused.exports +++ 
b/config/Unused.exports @@ -1,9 +1,11 @@ # Symbols that are unused as KPI, but must be globally exported -_dtrace_zero* -_gLoadedKextSummaries -_ipc_mqueue_full +_arm64_root_pgtable_level +_arm64_root_pgtable_num_ttes +_arm_hardware_page_size _atm_mana* _bank_mana* +_dtrace_zero* +_gLoadedKextSummaries _ipc_importance_mana* +_ipc_mqueue_full _user_data_mana* -_arm_hardware_page_size diff --git a/config/newvers.pl b/config/newvers.pl index 7b41feac4..f093b3378 100755 --- a/config/newvers.pl +++ b/config/newvers.pl @@ -96,16 +96,35 @@ sub WriteFile { # need to synthesize the directory name to be more interesting. # +sub describe { + my ($basename) = @_; + + # get a git tag if we can + my $tag = `git describe --dirty 2>/dev/null`; + chomp $tag; + if ($? != 0 or $tag !~ /^xnu-([^\s\n]+)$/) { + return $basename; + } + + # If basename is just 'xnu' then replace it with the tag. Otherwise add + # the tag in brackets. + if ($basename eq 'xnu') { + return $tag + } else { + return "${basename}[$tag]" + } +} + if ($BUILD_OBJPATH =~ m,^$BUILD_SRCROOT/(.*)$,) { - $BUILD_OBJROOT = basename($BUILD_SRCROOT) . "/" . $1; + $BUILD_OBJROOT = describe(basename($BUILD_SRCROOT)) . "/" . $1; } elsif ($BUILD_OBJPATH =~ m,^$BUILD_OBJROOT/(.*)$,) { - if (defined($RC_STRING)) { + if (defined($RC_STRING)) { $BUILD_OBJROOT = $RC_STRING . "/" . $1; - } else { - $BUILD_OBJROOT = basename($BUILD_OBJROOT) . "/" . $1; - } + } else { + $BUILD_OBJROOT = describe(basename($BUILD_OBJROOT)) . "/" . 
$1; + } } else { - # Use original OBJROOT + # Use original OBJROOT } my $rawvers = &ReadFile($versfile); diff --git a/iokit/IOKit/IOBSD.h b/iokit/IOKit/IOBSD.h index f2aadfed8..505e23efd 100644 --- a/iokit/IOKit/IOBSD.h +++ b/iokit/IOKit/IOBSD.h @@ -32,6 +32,7 @@ * bsd-related registry properties */ +#define kIOBSDKey "IOBSD" // (BSD subsystem resource) #define kIOBSDNameKey "BSD Name" // (an OSString) #define kIOBSDNamesKey "BSD Names" // (an OSDictionary of OSString's, for links) #define kIOBSDMajorKey "BSD Major" // (an OSNumber) @@ -39,7 +40,7 @@ #define kIOBSDUnitKey "BSD Unit" // (an OSNumber) -#ifdef XNU_KERNEL_PRIVATE +#ifdef KERNEL_PRIVATE #include #include diff --git a/iokit/IOKit/IOBufferMemoryDescriptor.h b/iokit/IOKit/IOBufferMemoryDescriptor.h index 486ce4e13..6e6f051b9 100644 --- a/iokit/IOKit/IOBufferMemoryDescriptor.h +++ b/iokit/IOKit/IOBufferMemoryDescriptor.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1998-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -46,6 +46,7 @@ enum { | kIOMemoryThreadSafe | kIOMemoryClearEncrypt | kIOMemoryMapperNone + | kIOMemoryUseReserve }; #define _IOBUFFERMEMORYDESCRIPTOR_INTASKWITHOPTIONS_ 1 @@ -178,7 +179,7 @@ class IOBufferMemoryDescriptor : public IOGeneralMemoryDescriptor kIOMapCopybackCache - allocate memory with copyback cache setting.
kIOMapWriteCombineCache - allocate memory with writecombined cache setting. @param capacity The number of bytes to allocate. - @param mask The buffer will be allocated with pages such that physical addresses will only have bits set present in physicalMask. For example, pass 0x00000000FFFFFFFFULL for a buffer to be accessed by hardware that has 32 address bits. + @param physicalMask The buffer will be allocated with pages such that physical addresses will only have bits set present in physicalMask. For example, pass 0x00000000FFFFFFFFULL for a buffer to be accessed by hardware that has 32 address bits. @result Returns an instance of class IOBufferMemoryDescriptor to be released by the caller, which will free the memory desriptor and associated buffer. */ static IOBufferMemoryDescriptor * inTaskWithPhysicalMask( diff --git a/iokit/IOKit/IOCatalogue.h b/iokit/IOKit/IOCatalogue.h index 693e0ef7e..0f1a8bdb9 100644 --- a/iokit/IOKit/IOCatalogue.h +++ b/iokit/IOKit/IOCatalogue.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2012 Apple Inc. All rights reserved. + * Copyright (c) 1998-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -86,7 +86,7 @@ class IOCatalogue : public OSObject /*! @function findDrivers @abstract This is the primary entry point for IOService. - @param service + @param service The service @param generationCount Returns a reference to the generation count of the database. The generation count increases only when personalities are added to the database *and* IOService matching has been initiated. @result Returns an ordered set of driver personalities ranked on probe-scores. The ordered set must be released by the receiver. */ @@ -105,7 +105,7 @@ class IOCatalogue : public OSObject @function addDrivers @abstract Adds an array of driver personalities to the database. @param array Array of driver personalities to be added to the database. - @param doNubMatchng Start matching process after personalities have been added. 
+ @param doNubMatching Start matching process after personalities have been added. @result Returns true if driver personality was added to the database successfully. Failure is due to a memory allocation failure. */ bool addDrivers( OSArray * array, bool doNubMatching = true ); @@ -114,7 +114,7 @@ class IOCatalogue : public OSObject @function removeDrivers @abstract Remove driver personalities from the database based on matching information provided. @param matching A dictionary whose keys and values are used for matching personalities in the database. For example, a matching dictionary containing a 'IOProviderClass' key with the value 'IOPCIDevice' will remove all personalities which have the key 'IOProviderClass' equal to 'IOPCIDevice'. - @param doNubMatchng Start matching process after personalities have been removed. Matching criteria is based on IOProviderClass of those personalities which were removed. This is to allow drivers which haven't been matched to match against NUB's which were blocked by the previous personalities. + @param doNubMatching Start matching process after personalities have been removed. Matching criteria is based on IOProviderClass of those personalities which were removed. This is to allow drivers which haven't been matched to match against NUB's which were blocked by the previous personalities. @result Returns true if personality was removed successfully. Failure is due to a memory allocation failure. */ bool removeDrivers( OSDictionary * matching, bool doNubMatching = true ); diff --git a/iokit/IOKit/IOCommandGate.h b/iokit/IOKit/IOCommandGate.h index 26624116f..ce4851284 100644 --- a/iokit/IOKit/IOCommandGate.h +++ b/iokit/IOKit/IOCommandGate.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2009 Apple Inc. All rights reserved. + * Copyright (c) 1998-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -195,7 +195,7 @@ client's thread attemptCommand will fail if the work loop's gate is closed. /*! 
@function commandWakeup @abstract Wakeup one or more threads that are asleep on an event. @param event Pointer to an address. - @param onlyOneThread true to only wake up at most one thread, false otherwise. */ + @param oneThread true to only wake up at most one thread, false otherwise. */ virtual void commandWakeup(void *event, bool oneThread = false); /*! @function disable diff --git a/iokit/IOKit/IOCommandPool.h b/iokit/IOKit/IOCommandPool.h index 442815761..c21455c5c 100644 --- a/iokit/IOKit/IOCommandPool.h +++ b/iokit/IOKit/IOCommandPool.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -118,7 +118,7 @@ class IOCommandPool : public OSObject * @abstract Primary initializer for an IOCommandPool object. * @discussion Primary initializer for an IOCommandPool. * Should probably use IOCommandPool::withWorkLoop() as it is easier to use. - * @param inWorkLoop + * @param workLoop * The workloop that this command pool should synchronize with. * @result Returns true if command pool was successfully initialized. */ @@ -176,7 +176,7 @@ class IOCommandPool : public OSObject * @discussion * The returnCommand method is used to place an object of type IOCommand * into the pool, whether it be the first time, or the 1000th time. - * @param commmand + * @param command * The command to place in the pool. */ @@ -189,10 +189,10 @@ class IOCommandPool : public OSObject * @discussion * The gatedGetCommand method is used to serialize the extraction of a * command from the pool behind a command gate, runAction-ed by getCommand. - * @param vCommand + * @param command * A pointer to a pointer to an IOCommand object where the returned * command will be stored. - * @param vBlock + * @param blockForCommand * A bool that indicates whether to block the request until a command * becomes available. 
* @result @@ -207,7 +207,7 @@ class IOCommandPool : public OSObject * @discussion * The gatedReturnCommand method is used to serialize the return of a * command to the pool behind a command gate, runAction-ed by returnCommand. - * @param vCommand + * @param command * A pointer to the IOCommand object to be returned to the pool. * @result * Always returns kIOReturnSuccess if the vCommand argument is valid. diff --git a/iokit/IOKit/IODMACommand.h b/iokit/IOKit/IODMACommand.h index 54e0815bb..89e301e47 100644 --- a/iokit/IOKit/IODMACommand.h +++ b/iokit/IOKit/IODMACommand.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2005 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2005-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -33,6 +33,19 @@ class IOMapper; class IOBufferMemoryDescriptor; +enum +{ + kIODMAMapOptionMapped = 0x00000000, + kIODMAMapOptionBypassed = 0x00000001, + kIODMAMapOptionNonCoherent = 0x00000002, + kIODMAMapOptionUnmapped = 0x00000003, + kIODMAMapOptionTypeMask = 0x0000000f, + + kIODMAMapOptionNoCacheStore = 0x00000010, // Memory in descriptor + kIODMAMapOptionOnChip = 0x00000020, // Indicates DMA is on South Bridge + kIODMAMapOptionIterateOnly = 0x00000040 // DMACommand will be used as a cursor only +}; + /**************************** class IODMACommand ***************************/ /*! @@ -47,20 +60,6 @@ class IOBufferMemoryDescriptor; The IODMACommand can be used in a 'weak-linked' manner. To do this you must avoid using any static member functions. Use the, much slower but safe, weakWithSpecification function. On success a dma command instance will be returned. This instance can then be used to clone as many commands as is needed. Remember deriving from this class can not be done weakly, that is no weak subclassing! 
*/ - -enum -{ - kIODMAMapOptionMapped = 0x00000000, - kIODMAMapOptionBypassed = 0x00000001, - kIODMAMapOptionNonCoherent = 0x00000002, - kIODMAMapOptionUnmapped = 0x00000003, - kIODMAMapOptionTypeMask = 0x0000000f, - - kIODMAMapOptionNoCacheStore = 0x00000010, // Memory in descriptor - kIODMAMapOptionOnChip = 0x00000020, // Indicates DMA is on South Bridge - kIODMAMapOptionIterateOnly = 0x00000040 // DMACommand will be used as a cursor only -}; - class IODMACommand : public IOCommand { OSDeclareDefaultStructors(IODMACommand); @@ -342,7 +341,7 @@ friend class IODMAEventSource; /*! @function complete @abstract Complete processing of DMA mappings after an I/O transfer is finished. @discussion This method should not be called unless a prepare was previously issued; the prepare() and complete() must occur in pairs, before and after an I/O transfer - @param invalidCache Invalidate the caches for the memory descriptor. Defaults to true for kNonCoherent and is ignored by the other types. + @param invalidateCache Invalidate the caches for the memory descriptor. Defaults to true for kNonCoherent and is ignored by the other types. @param synchronize Copy any buffered data back to the target IOMemoryDescriptor. Defaults to true, if synchronize() is being used to explicitly copy data, passing false may avoid an unneeded copy. @result kIOReturnNotReady if not prepared, kIOReturnSuccess otherwise. */ @@ -402,7 +401,7 @@ friend class IODMAEventSource; inline IOReturn gen32IOVMSegments(UInt64 *offset, Segment32 *segments, UInt32 *numSegments) - { return genIOVMSegments(offset, segments, numSegments); }; + { return genIOVMSegments(offset, segments, numSegments); } /*! @function gen64IOVMSegments @abstract Helper function for a type checked call to genIOVMSegments(qv), for use with an IODMACommand set up with the output function kIODMACommandOutputHost64, kIODMACommandOutputBig64, or kIODMACommandOutputLittle64. 
If the output function of the IODMACommand is not a 64 bit function, results will be incorrect. @@ -410,7 +409,7 @@ friend class IODMAEventSource; inline IOReturn gen64IOVMSegments(UInt64 *offset, Segment64 *segments, UInt32 *numSegments) - { return genIOVMSegments(offset, segments, numSegments); }; + { return genIOVMSegments(offset, segments, numSegments); } IOReturn genIOVMSegments(SegmentFunction segmentFunction, diff --git a/iokit/IOKit/IODeviceMemory.h b/iokit/IOKit/IODeviceMemory.h index 0665efc3c..bcc31ab3d 100644 --- a/iokit/IOKit/IODeviceMemory.h +++ b/iokit/IOKit/IODeviceMemory.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1998-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -73,8 +73,8 @@ class IODeviceMemory : public IOMemoryDescriptor /*! @function withRange @abstract Constructs an IODeviceMemory instance, describing one physical range. @discussion This method creates an IODeviceMemory instance for one physical range passed as a physical address and length. It just calls IOMemoryDescriptor::withPhysicalAddress. - @param address The physical address of the first byte in the memory. - @param withLength The length of memory. + @param start The physical address of the first byte in the memory. + @param length The length of memory. @result Returns the created IODeviceMemory on success, to be released by the caller, or zero on failure. */ static IODeviceMemory * withRange( diff --git a/iokit/IOKit/IOFilterInterruptEventSource.h b/iokit/IOKit/IOFilterInterruptEventSource.h index 60154944b..13ef854d8 100644 --- a/iokit/IOKit/IOFilterInterruptEventSource.h +++ b/iokit/IOKit/IOFilterInterruptEventSource.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1998-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -60,7 +60,7 @@ class IOFilterInterruptEventSource : public IOInterruptEventSource @param owner Pointer to the owning/client instance. @param sender Where is the interrupt comming from. @result false if this interrupt can be ignored. */ - typedef bool (*Filter)(OSObject *, IOFilterInterruptEventSource *); + typedef bool (*Filter)(OSObject *owner, IOFilterInterruptEventSource *sender); /*! @defined IOFilterInterruptAction @discussion Backward compatibilty define for the old non-class scoped type definition. See $link IOFilterInterruptSource::Filter */ diff --git a/iokit/IOKit/IOHibernatePrivate.h b/iokit/IOKit/IOHibernatePrivate.h index a9d25fa98..ee9139154 100644 --- a/iokit/IOKit/IOHibernatePrivate.h +++ b/iokit/IOKit/IOHibernatePrivate.h @@ -218,7 +218,7 @@ enum struct hibernate_graphics_t { - uint32_t physicalAddress; // Base address of video memory + uint64_t physicalAddress; // Base address of video memory int32_t gfxStatus; // EFI config restore status uint32_t rowBytes; // Number of bytes per pixel row uint32_t width; // Width @@ -304,6 +304,7 @@ void IOHibernateSystemInit(IOPMrootDomain * rootDomain); IOReturn IOHibernateSystemSleep(void); void IOOpenDebugDataFile(const char *fname, uint64_t size); +void IOCloseDebugDataFile(); IOReturn IOHibernateIOKitSleep(void); IOReturn IOHibernateSystemHasSlept(void); IOReturn IOHibernateSystemWake(void); @@ -340,6 +341,9 @@ hibernate_teardown(hibernate_page_list_t * page_list, hibernate_page_list_t * page_list_wired, hibernate_page_list_t * page_list_pal); +kern_return_t +hibernate_pin_swap(boolean_t begin); + kern_return_t hibernate_processor_setup(IOHibernateImageHeader * header); @@ -425,6 +429,7 @@ extern uint32_t gIOHibernateState; extern uint32_t gIOHibernateMode; extern uint32_t gIOHibernateDebugFlags; extern uint32_t gIOHibernateFreeTime; // max time to spend freeing pages (ms) +extern boolean_t gIOHibernateStandbyDisabled; extern uint8_t 
gIOHibernateRestoreStack[]; extern uint8_t gIOHibernateRestoreStackEnd[]; extern IOHibernateImageHeader * gIOHibernateCurrentHeader; diff --git a/iokit/IOKit/IOInterruptEventSource.h b/iokit/IOKit/IOInterruptEventSource.h index 074af7930..693fb800e 100644 --- a/iokit/IOKit/IOInterruptEventSource.h +++ b/iokit/IOKit/IOInterruptEventSource.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2014 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1998-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -68,7 +68,7 @@ class IOInterruptEventSource : public IOEventSource @param owner Pointer to client instance. @param sender Pointer to generation interrupt event source. @param count Number of interrupts seen before delivery. */ - typedef void (*Action)(OSObject *, IOInterruptEventSource *, int count); + typedef void (*Action)(OSObject *owner, IOInterruptEventSource *sender, int count); /*! @defined IOInterruptEventAction @discussion Backward compatibilty define for the old non-class scoped type definition. 
See $link IOInterruptEventSource::Action */ diff --git a/iokit/IOKit/IOKernelReportStructs.h b/iokit/IOKit/IOKernelReportStructs.h index e02aa4ab0..4018f7995 100644 --- a/iokit/IOKit/IOKernelReportStructs.h +++ b/iokit/IOKit/IOKernelReportStructs.h @@ -240,6 +240,12 @@ enum { kIOReportScaleBytes) #define kIOReportUnit_KiB __IOR_MAKEUNIT(kIOReportQuantityData, \ kIOReportScaleKiBytes) +#define kIOReportUnit_MiB __IOR_MAKEUNIT(kIOReportQuantityData, \ + kIOReportScaleMiBytes) +#define kIOReportUnit_GiB __IOR_MAKEUNIT(kIOReportQuantityData, \ + kIOReportScaleGiBytes) +#define kIOReportUnit_TiB __IOR_MAKEUNIT(kIOReportQuantityData, \ + kIOReportScaleTiBytes) #define kIOReportUnitEvents __IOR_MAKEUNIT(kIOReportQuantityEventCount, \ kIOReportScaleUnity) diff --git a/iokit/IOKit/IOKernelReporters.h b/iokit/IOKit/IOKernelReporters.h index 58475afd9..5257ca081 100644 --- a/iokit/IOKit/IOKernelReporters.h +++ b/iokit/IOKit/IOKernelReporters.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012-2014 Apple Computer, Inc. All Rights Reserved. + * Copyright (c) 2012-2016 Apple Inc. All Rights Reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -321,7 +321,7 @@ class IOReporter : public OSObject @abstract call updateReport() on multiple IOReporter objects @param reporters - OSSet of IOReporter objects - @param channels - full list of channels to update + @param channelList - full list of channels to update @param action - type/style of update @param result - returned details about what was updated @param destination - destination for this update (action-specific) @@ -454,8 +454,8 @@ class IOReporter : public OSObject /*! 
@function IOReporter::handleAddChannelSwap @abstract update primary instance variables with new buffers - @param channelID ID of channel being added - @param channelName optional channel name, in an allocated object + @param channel_id ID of channel being added + @param symChannelName optional channel name, in an allocated object @result IOReturn code @discussion @@ -625,9 +625,7 @@ class IOReporter : public OSObject @param element_index - index of the _element in internal array @result A pointer to the element values requested or NULL on failure - @discussion - - Locking: Caller must ensure that the reporter (data) lock is held. + @discussion Locking: Caller must ensure that the reporter (data) lock is held. The returned pointer is only valid until unlockReporter() is called. */ virtual const IOReportElementValues* getElementValues(int element_index); @@ -651,7 +649,7 @@ class IOReporter : public OSObject @abstract Returns the index of a channel from internal data structures @param channel_id - ID of the channel - @param element_index - pointer to the returned element_index + @param channel_index - pointer to the returned element_index @result appropriate IOReturn code @discussion @@ -705,7 +703,6 @@ class IOReporter : public OSObject @abstract return an an OSArray of the reporter's channel IDs - @param none @result An OSArray of the repoter's channel ID's as OSNumbers @discussion @@ -962,20 +959,48 @@ class IOStateReporter : public IOReporter Locking: same-instance concurrency SAFE, WILL NOT BLOCK */ - IOReturn setChannelState(uint64_t channel_id, - uint64_t new_state_id); - IOReturn setChannelState(uint64_t channel_id, uint64_t new_state_id, uint64_t last_intransition, uint64_t prev_state_residency) __deprecated; +/*! 
@function IOStateReporter::setChannelState + @abstract Updates the current state of a channel to a new state + + @param channel_id - ID of the channel which is updated to a new state + @param new_state_id - ID of the target state for this channel + @result Appropriate IOReturn code + + @discussion + setChannelState() updates the amount of time spent in the previous + state (if any) and increments the number of transitions into the + new state. It also sets the target state's last transition time to + the current time and enables internal time-keeping for the channel. + In this mode, calls like getStateResidencyTime() and updateReport() + automatically update a channel's time in state. + + new_state_id identifies the target state as initialized + (0..) or as configured by setStateID(). + + Drivers wishing to compute and report their own time in state + should use incrementChannelState() or overrideChannelState(). It + is not currently possible for a driver to synchronize with the + automatic time-keeping enabled by setChannelState(). The + 4-argument version of setChannelState() is thus impossible to + use correctly. In the future, there may be a setChannelState() + which accepts a last_intransition parameter and uses it to + automatically calculate time in state (ERs -> IOReporting / X). + + Locking: same-instance concurrency SAFE, WILL NOT BLOCK +*/ + IOReturn setChannelState(uint64_t channel_id, + uint64_t new_state_id); + + /*! @function IOStateReporter::setState @abstract Updates state for single channel reporters @param new_state_id - New state for the channel - @param last_intransition - deprecated: time of most recent entry - @param prev_state_residency - deprecated: spent in previous state @result Appropriate IOReturn code. @discussion @@ -989,6 +1014,23 @@ class IOStateReporter : public IOReporter */ IOReturn setState(uint64_t new_state_id); +/*! 
@function IOStateReporter::setState + @abstract Updates state for single channel reporters + + @param new_state_id - New state for the channel + @param last_intransition - deprecated: time of most recent entry + @param prev_state_residency - deprecated: spent in previous state + @result Appropriate IOReturn code. + + @discussion + setState() is a convenience method for single-channel state + reporter instances. An error will be returned if the reporter + in question has more than one channel. + + See further discussion at setChannelState(). + + Locking: same-instance concurrency SAFE, WILL NOT BLOCK +*/ IOReturn setState(uint64_t new_state_id, uint64_t last_intransition, uint64_t prev_state_residency) __deprecated; @@ -1096,9 +1138,7 @@ class IOStateReporter : public IOReporter @abstract update a channel state without validating channel_id @param channel_index - 0.., available from getChannelIndex() - @param new_state - New state (by index) for the channel - @param last_intransition - deprecated: time of most recent entry - @param prev_state_residency - deprecated: time spent in previous state + @param new_state_index - New state (by index) for the channel @result Appropriate IOReturn code @discussion @@ -1129,6 +1169,40 @@ class IOStateReporter : public IOReporter IOReturn setStateByIndices(int channel_index, int new_state_index); +/*! @function IOStateReporter::setStateByIndices + @abstract update a channel state without validating channel_id + + @param channel_index - 0.., available from getChannelIndex() + @param new_state_index - New state (by index) for the channel + @param last_intransition - deprecated: time of most recent entry + @param prev_state_residency - deprecated: time spent in previous state + @result Appropriate IOReturn code + + @discussion + Similar to setState(), setStateByIndices() sets a channel's state + without searching for the channel or state IDs. 
It will perform + bounds checking, but relies on the caller to properly indicate + the indices of the channel and state. Clients can rely on channels + being added to IOStateReporter in order: the first channel will + have index 0, the second index 1, etc. Like ::setState(), + "time in state" calculations are handled automatically. + + setStateByIndices() is faster than setChannelState(), but + it should only be used where the latter's performance overhead + might be a problem. For example, many channels in a single + reporter and high-frequency state changes. + + Drivers wishing to compute and report their own time in state + should use incrementChannelState() or overrideChannelState(). It + is not currently possible for a driver to synchronize with the + automatic time-keeping enabled by setStateByIndices(). The + 4-argument version of setChannelState() is thus impossible to + use correctly. In the future, there may be a setChannelState() + which accepts a last_intransition parameter and uses it to + automatically calculate time in state (ERs -> IOReporting / X). + + Locking: same-instance concurrency SAFE, WILL NOT BLOCK +*/ IOReturn setStateByIndices(int channel_index, int new_state_index, uint64_t last_intransition, @@ -1138,7 +1212,7 @@ class IOStateReporter : public IOReporter @abstract Accessor method for count of transitions into state @param channel_id - ID of the channel - @param channel_state - State of the channel + @param state_id - State of the channel @result Count of transitions into the requested state. 
@discussion @@ -1155,7 +1229,7 @@ class IOStateReporter : public IOReporter @abstract Accessor method for time spent in a given state @param channel_id - ID of the channel - @param channel_state - State of the channel + @param state_id - State of the channel @result Absolute time spent in specified state @discussion @@ -1173,7 +1247,7 @@ class IOStateReporter : public IOReporter @abstract Accessor method for last time a transition occured @param channel_id - ID of the channel - @param channel_state - State of the channel + @param state_id - State of the channel @result Absolute time for when the last transition occured @discussion @@ -1231,18 +1305,21 @@ class IOStateReporter : public IOReporter /*! @function IOStateReporter::handleSwapPrepare @abstract _swap* = + [see IOReporter::handle*Swap* for more info] +*/ + virtual IOReturn handleSwapPrepare(int newNChannels) APPLE_KEXT_OVERRIDE; +/*! @function IOStateReporter::handleAddChannelSwap @abstract swap in IOStateReporter's variables +*/ + virtual IOReturn handleAddChannelSwap(uint64_t channel_id, + const OSSymbol *symChannelName) APPLE_KEXT_OVERRIDE; +/*! @function IOStateReporter::handleSwapCleanup @abstract clean up unused buffers in _swap* - - [see IOReporter::handle*Swap* for more info] */ - virtual IOReturn handleSwapPrepare(int newNChannels) APPLE_KEXT_OVERRIDE; - virtual IOReturn handleAddChannelSwap(uint64_t channel_id, - const OSSymbol *symChannelName) APPLE_KEXT_OVERRIDE; virtual void handleSwapCleanup(int swapNChannels) APPLE_KEXT_OVERRIDE; /*! 
@function IOStateReporter::updateChannelValues @@ -1264,7 +1341,7 @@ class IOStateReporter : public IOReporter @abstract update a channel state without validating channel_id @param channel_index - 0.., available from getChannelIndex() - @param new_state - New state for the channel + @param new_state_index - New state for the channel @param last_intransition - to remove: time of most recent entry @param prev_state_residency - to remove: time spent in previous state @result Appropriate IOReturn code @@ -1414,9 +1491,33 @@ FIXME: need more explanation of the config @result kIOReturnUnsupported - doesn't support adding channels */ - IOReturn addChannel(uint64_t channelID, const char *channelName = NULL) { + IOReturn addChannel(__unused uint64_t channelID, __unused const char *channelName = NULL) { return kIOReturnUnsupported; } + +/*! @function IOHistogramReporter::overrideBucketValues + @abstract Override values of a bucket at specified index + + @param index - index of bucket to override + @param bucket_hits - new bucket hits count + @param bucket_min - new bucket minimum value + @param bucket_max - new bucket maximum value + @param bucket_sum - new bucket sum + @result Appropriate IOReturn code + + @discussion + Replaces data in the bucket at the specified index with the data pointed + to by bucket. No sanity check is performed on the data. If the index + is out of bounds, kIOReturnBadArgument is returned. + + Locking: same-instance concurrency SAFE, WILL NOT BLOCK +*/ + + IOReturn overrideBucketValues(unsigned int index, + uint64_t bucket_hits, + int64_t bucket_min, + int64_t bucket_max, + int64_t bucket_sum); /*! @function IOHistogramReporter::tallyValue @abstract Add a new value to the histogram @@ -1555,6 +1656,34 @@ class IOReportLegend : public OSObject const char *groupName, const char *subGroupName); +/*! 
@function IOReportLegend::addReporterLegend + @abstract Add a legend entry from a reporter object + + @param reporter - IOReporter to use to extract and append the legend + @param groupName - primary group name for this entry + @param subGroupName - secondary group name for this entry + @result appropriate IOReturn code + + @discussion + An IOReportLegendEntry will be created internally to this method from + the IOReporter object passed in argument. The entry will be released + internally after being appended to the IOReportLegend object. + Legend entries are available from reporter objects. Entries + represent some number of channels with similar properties (such + as group and sub-group). Multiple legend entries with the same + group names will be aggregated in user space. + + Drivers that instantiate their reporter objects in response to + IOService::configureReport(kIOReportDisable) will need to create + temporary reporter objects for the purpose of creating their + legend entries. User-space legends are tracked by 12836893. + + Locking: same-reportingService and same-IORLegend concurrency UNSAFE +*/ + IOReturn addReporterLegend(IOReporter *reporter, + const char *groupName, + const char *subGroupName); + /*! 
@function IOReportLegend::addReporterLegend @abstract Add a legend entry from a reporter object @@ -1584,10 +1713,6 @@ class IOReportLegend : public OSObject Locking: same-reportingService and same-IORLegend concurrency UNSAFE */ - IOReturn addReporterLegend(IOReporter *reporter, - const char *groupName, - const char *subGroupName); - static IOReturn addReporterLegend(IOService *reportingService, IOReporter *reporter, const char *groupName, @@ -1632,8 +1757,6 @@ class IOReportLegend : public OSObject @param groupName - Primary group name @param subGroupName - Secondary group name @result IOReturn code - - @discussion */ IOReturn organizeLegend(IOReportLegendEntry *legendEntry, const OSSymbol *groupName, diff --git a/iokit/IOKit/IOKitDebug.h b/iokit/IOKit/IOKitDebug.h index 86f34c995..87467f3c9 100644 --- a/iokit/IOKit/IOKitDebug.h +++ b/iokit/IOKit/IOKitDebug.h @@ -154,9 +154,12 @@ enum struct IOTrackingCallSiteInfo { - uint32_t count; - size_t size[2]; - uintptr_t bt[kIOTrackingCallSiteBTs]; + uint32_t count; + pid_t addressPID; + mach_vm_address_t address; + mach_vm_size_t size[2]; + pid_t btPID; + mach_vm_address_t bt[2][kIOTrackingCallSiteBTs]; }; #define kIOMallocTrackingName "IOMalloc" @@ -187,11 +190,35 @@ struct IOTrackingAddress #endif }; +struct IOTrackingUser +{ + queue_chain_t link; + pid_t btPID; + uint8_t user32; + uint8_t userCount; + uintptr_t bt[kIOTrackingCallSiteBTs]; + uintptr_t btUser[kIOTrackingCallSiteBTs]; +}; + +enum +{ + kIOTrackingQueueTypeDefaultOn = 0x00000001, + kIOTrackingQueueTypeAlloc = 0x00000002, + kIOTrackingQueueTypeMap = 0x00000004, + kIOTrackingQueueTypeUser = 0x00000008, +}; + + void IOTrackingInit(void); -IOTrackingQueue * IOTrackingQueueAlloc(const char * name, size_t allocSize, size_t minCaptureSize, bool isAlloc); +IOTrackingQueue * IOTrackingQueueAlloc(const char * name, uintptr_t btEntry, + size_t allocSize, size_t minCaptureSize, + uint32_t type, uint32_t numSiteQs); void IOTrackingQueueFree(IOTrackingQueue * head); 
void IOTrackingAdd(IOTrackingQueue * head, IOTracking * mem, size_t size, bool address); void IOTrackingRemove(IOTrackingQueue * head, IOTracking * mem, size_t size); +void IOTrackingAddUser(IOTrackingQueue * queue, IOTrackingUser * mem, vm_size_t size); +void IOTrackingRemoveUser(IOTrackingQueue * head, IOTrackingUser * tracking); + void IOTrackingAlloc(IOTrackingQueue * head, uintptr_t address, size_t size); void IOTrackingFree(IOTrackingQueue * head, uintptr_t address, size_t size); void IOTrackingReset(IOTrackingQueue * head); @@ -214,7 +241,7 @@ enum enum { kIOTrackingGetTracking = 0x00000001, - kIOTrackingPrintTracking = 0x00000002, + kIOTrackingGetMappings = 0x00000002, kIOTrackingResetTracking = 0x00000003, kIOTrackingStartCapture = 0x00000004, kIOTrackingStopCapture = 0x00000005, diff --git a/iokit/IOKit/IOKitKeys.h b/iokit/IOKit/IOKitKeys.h index 698cf86c9..240ec58e9 100644 --- a/iokit/IOKit/IOKitKeys.h +++ b/iokit/IOKit/IOKitKeys.h @@ -55,6 +55,8 @@ // registry ID number #define kIORegistryEntryIDKey "IORegistryEntryID" +// property name to get array of property names +#define kIORegistryEntryPropertyKeysKey "IORegistryEntryPropertyKeys" // IOService class name #define kIOServiceClass "IOService" @@ -71,10 +73,12 @@ #define kIOProviderClassKey "IOProviderClass" #define kIONameMatchKey "IONameMatch" #define kIOPropertyMatchKey "IOPropertyMatch" +#define kIOPropertyExistsMatchKey "IOPropertyExistsMatch" #define kIOPathMatchKey "IOPathMatch" #define kIOLocationMatchKey "IOLocationMatch" #define kIOParentMatchKey "IOParentMatch" #define kIOResourceMatchKey "IOResourceMatch" +#define kIOResourceMatchedKey "IOResourceMatched" #define kIOMatchedServiceCountKey "IOMatchedServiceCountMatch" #define kIONameMatchedKey "IONameMatched" @@ -129,6 +133,7 @@ #define kIOMaximumSegmentByteCountWriteKey "IOMaximumSegmentByteCountWrite" // (OSNumber) #define kIOMinimumSegmentAlignmentByteCountKey "IOMinimumSegmentAlignmentByteCount" // (OSNumber) #define 
kIOMaximumSegmentAddressableBitCountKey "IOMaximumSegmentAddressableBitCount" // (OSNumber) +#define kIOMinimumSaturationByteCountKey "IOMinimumSaturationByteCount" // (OSNumber) // properties found in services that wish to describe an icon // diff --git a/iokit/IOKit/IOLib.h b/iokit/IOKit/IOLib.h index 7bf9ad8ab..4a8ae78d6 100644 --- a/iokit/IOKit/IOLib.h +++ b/iokit/IOKit/IOLib.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2011 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1998-2016 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -290,7 +290,7 @@ void IODelay(unsigned microseconds); /*! @function IOPause @abstract Spin delay for a number of nanoseconds. @discussion This function spins to delay for at least the number of specified nanoseconds. Since the CPU is busy spinning no time is made available to other processes; this method of delay should be used only for short periods. - @param microseconds The integer number of nanoseconds to spin wait. */ + @param nanoseconds The integer number of nanoseconds to spin wait. */ void IOPause(unsigned nanoseconds); @@ -298,7 +298,7 @@ void IOPause(unsigned nanoseconds); @abstract Log a message to console in text mode, and /var/log/system.log. @discussion This function allows a driver to log diagnostic information to the screen during verbose boots, and to a log file found at /var/log/system.log. IOLog should not be called from interrupt context. @param format A printf() style format string (see printf(3) documentation). - @param other arguments described by the format string. */ + */ void IOLog(const char *format, ...) __attribute__((format(printf, 1, 2))); diff --git a/iokit/IOKit/IOMapper.h b/iokit/IOKit/IOMapper.h index fc4f07dbe..f63f5463a 100644 --- a/iokit/IOKit/IOMapper.h +++ b/iokit/IOKit/IOMapper.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2003 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1998-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -90,7 +90,7 @@ class IOMapper : public IOService static IOMapper *gSystem; static void checkForSystemMapper() - { if ((uintptr_t) gSystem & kWaitMask) waitForSystemMapper(); }; + { if ((uintptr_t) gSystem & kWaitMask) waitForSystemMapper(); } static IOMapper * copyMapperForDevice(IOService * device); static IOMapper * copyMapperForDeviceWithIndex(IOService * device, unsigned int index); diff --git a/iokit/IOKit/IOMemoryDescriptor.h b/iokit/IOKit/IOMemoryDescriptor.h index 7b193afc0..fb5f5ce26 100644 --- a/iokit/IOKit/IOMemoryDescriptor.h +++ b/iokit/IOKit/IOMemoryDescriptor.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1998-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -72,6 +72,19 @@ enum IODirection #define IODIRECTIONCOMPLETEWITHDATAVALIDDEFINED 1 kIODirectionCompleteWithDataValid = 0x00000080, }; + + +#if XNU_KERNEL_PRIVATE +enum +{ + // prepare/complete() notify DMA command active + kIODirectionDMACommand = 0x00000100, + kIODirectionDMACommandMask = 0x0001FE00, + kIODirectionDMACommandShift = 9, +}; +#endif + + #ifdef __LP64__ typedef IOOptionBits IODirection; #endif /* __LP64__ */ @@ -113,6 +126,8 @@ enum { #endif kIOMemoryThreadSafe = 0x00100000, // Shared with Buffer MD kIOMemoryClearEncrypt = 0x00200000, // Shared with Buffer MD + kIOMemoryUseReserve = 0x00800000, // Shared with Buffer MD +#define IOMEMORYUSERESERVEDEFINED 1 #ifdef XNU_KERNEL_PRIVATE kIOMemoryBufferPurgeable = 0x00400000, @@ -144,6 +159,7 @@ enum kIOMemoryPurgeableVolatileBehaviorLifo = VM_PURGABLE_BEHAVIOR_LIFO, kIOMemoryPurgeableVolatileOrderingObsolete = VM_PURGABLE_ORDERING_OBSOLETE, kIOMemoryPurgeableVolatileOrderingNormal = VM_PURGABLE_ORDERING_NORMAL, + kIOMemoryPurgeableFaultOnAccess = VM_PURGABLE_DEBUG_FAULT, }; enum { @@ -383,6 +399,9 @@ typedef IOOptionBits DMACommandOps; uint64_t length, uint64_t * mapAddress, uint64_t * 
mapLength); + + void setVMTags(vm_tag_t kernelTag, vm_tag_t userTag); + vm_tag_t getVMTag(vm_map_t map); #endif private: @@ -463,7 +482,7 @@ typedef IOOptionBits DMACommandOps; @abstract Create an IOMemoryDescriptor to describe one virtual range of the specified map. @discussion This method creates and initializes an IOMemoryDescriptor for memory consisting of a single virtual memory range mapped into the specified map. This memory descriptor needs to be prepared before it can be used to extract data from the memory described. @param address The virtual address of the first byte in the memory. - @param withLength The length of memory. + @param length The length of memory. @param options kIOMemoryDirectionMask (options:direction) This nibble indicates the I/O direction to be associated with the descriptor, which may affect the operation of the prepare and complete methods on some architectures. @param task The task the virtual ranges are mapped into. Note that unlike IOMemoryDescriptor::withAddress(), kernel_task memory must be explicitly prepared when passed to this api. The task argument may be NULL to specify memory by physical address. @@ -784,7 +803,7 @@ class IOMemoryMap : public OSObject IOMemoryDescriptor * fOwner; uint8_t fUserClientUnmap; #if IOTRACKING - IOTracking fTracking; + IOTrackingUser fTracking; #endif #endif /* XNU_KERNEL_PRIVATE */ @@ -884,22 +903,22 @@ class IOMemoryMap : public OSObject @abstract Accessor to the virtual address of the first byte in the mapping. @discussion This method returns the virtual address of the first byte in the mapping. @result A virtual address. */ + inline mach_vm_address_t getAddress() __attribute__((always_inline)); /*! @function getSize @abstract Accessor to the length of the mapping. @discussion This method returns the length of the mapping. @result A byte count. 
*/ - inline mach_vm_address_t getAddress() __attribute__((always_inline)); inline mach_vm_size_t getSize() __attribute__((always_inline)); #else /* !__LP64__ */ /*! @function getAddress @abstract Accessor to the virtual address of the first byte in the mapping. @discussion This method returns the virtual address of the first byte in the mapping. @result A virtual address. */ + virtual mach_vm_address_t getAddress(); /*! @function getSize @abstract Accessor to the length of the mapping. @discussion This method returns the length of the mapping. @result A byte count. */ - virtual mach_vm_address_t getAddress(); virtual mach_vm_size_t getSize(); #endif /* !__LP64__ */ diff --git a/iokit/IOKit/IORegistryEntry.h b/iokit/IOKit/IORegistryEntry.h index 906baaa9f..97f66e612 100644 --- a/iokit/IOKit/IORegistryEntry.h +++ b/iokit/IOKit/IORegistryEntry.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1998-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -43,6 +43,7 @@ extern const OSSymbol * gIONameKey; extern const OSSymbol * gIOLocationKey; extern const OSSymbol * gIORegistryEntryIDKey; +extern const OSSymbol * gIORegistryEntryPropertyKeysKey; class IORegistryEntry; class IORegistryPlane; @@ -52,8 +53,8 @@ typedef void (*IORegistryEntryApplierFunction)(IORegistryEntry * entry, void * context); enum { - kIORegistryIterateRecursively = 0x00000001, - kIORegistryIterateParents = 0x00000002 + kIORegistryIterateRecursively = 0x00000001, + kIORegistryIterateParents = 0x00000002, }; /*! @class IORegistryEntry : public OSObject @@ -71,10 +72,7 @@ class IORegistryEntry : public OSObject /*! @struct ExpansionData @discussion This structure will be used to expand the capablilties of this class in the future. */ - struct ExpansionData - { - uint64_t fRegistryEntryID; - }; + struct ExpansionData; /*! @var reserved Reserved for future use. 
(Internal use only) */ @@ -252,7 +250,7 @@ member function's parameter list. /*! @function init @abstract Standard init method for all IORegistryEntry subclasses. @discussion A registry entry must be initialized with this method before it can be used. A property dictionary may passed and will be retained by this method for use as the registry entry's property table, or an empty one will be created. - @param A dictionary that will become the registry entry's property table (retaining it), or zero which will cause an empty property table to be created. + @param dictionary A dictionary that will become the registry entry's property table (retaining it), or zero which will cause an empty property table to be created. @result true on success, or false on a resource failure. */ virtual bool init( OSDictionary * dictionary = 0 ); @@ -518,6 +516,11 @@ member function's parameter list. virtual OSIterator * getChildIterator( const IORegistryPlane * plane ) const; +#if XNU_KERNEL_PRIVATE + uint32_t getChildCount( const IORegistryPlane * plane ) const; + OSArray * copyPropertyKeys(void) const; +#endif + virtual void applyToChildren( IORegistryEntryApplierFunction applier, void * context, const IORegistryPlane * plane ) const; @@ -604,7 +607,7 @@ member function's parameter list. /*! @function detachFromChild @abstract Detaches a child entry from its parent in a plane. @discussion This method is called in the parent entry when a child detaches, to make overrides possible. It is a no-op if the entry is not a child of the parent. Detaching the entry will release both the child and parent. This method will call detachFromParent in the child entry if it is not being called from detachFromParent. - @param parent The registry entry to detach. + @param child The registry entry to detach. @param plane The plane object. */ virtual void detachFromChild( IORegistryEntry * child, @@ -797,6 +800,10 @@ member function's parameter list. 
#endif static IORegistryEntry * initialize( void ); +#ifdef XNU_KERNEL_PRIVATE + SInt32 getRegistryEntryGenerationCount( void ) const; +#endif + private: inline bool arrayMember( OSArray * set, const IORegistryEntry * member, diff --git a/iokit/IOKit/IOReportMacros.h b/iokit/IOKit/IOReportMacros.h index b8c6a4239..f3b5a015a 100644 --- a/iokit/IOKit/IOReportMacros.h +++ b/iokit/IOKit/IOReportMacros.h @@ -30,6 +30,7 @@ #define _IOREPORT_MACROS_H_ #include "IOReportTypes.h" +#include #ifdef __cplusplus extern "C" { @@ -74,16 +75,17 @@ extern "C" { * IOReportCategories categories - categories of this channel * * If the buffer is not of sufficient size, the macro calls IOREPORT_ABORT(). - * If that returns, the buffer is filled with 0xbadcafe. + * If that returns, the buffer is left full of '&'. */ -#define SIMPLEREPORT_INIT(buffer, bufSize, providerID, channelID, cats) \ +#define SIMPLEREPORT_INIT(buf, bufSize, providerID, channelID, cats) \ do { \ - IOReportElement *__elem = (IOReportElement *)(buffer); \ + memset((buf), '&', (bufSize)); \ + IOReportElement *__elem = (IOReportElement *)(buf); \ IOSimpleReportValues *__vals; \ if ((bufSize) >= SIMPLEREPORT_BUFSIZE) { \ - __elem->channel_id = (channelID); \ __elem->provider_id = (providerID); \ + __elem->channel_id = (channelID); \ __elem->channel_type.report_format = kIOReportFormatSimple; \ __elem->channel_type.reserved = 0; \ __elem->channel_type.categories = (cats); \ @@ -95,7 +97,6 @@ do { \ } \ else { \ IOREPORT_ABORT("bufSize is smaller than the required size\n"); \ - __POLLUTE_BUF((buffer), (bufSize)); \ } \ } while(0) @@ -225,10 +226,11 @@ typedef struct { * IOReportCategories categories - categories of this channel * * If the buffer is not of sufficient size, the macro invokes IOREPORT_ABORT. - * If that returns, the buffer is filled with 0xbadcafe. + * If that returns, the buffer is left full of '&'. 
*/ #define STATEREPORT_INIT(nstates, buf, bufSize, providerID, channelID, cats) \ do { \ + memset((buf), '&', (bufSize)); \ IOStateReportInfo *__info = (IOStateReportInfo *)(buf); \ IOStateReportValues *__rep; \ IOReportElement *__elem; \ @@ -236,8 +238,8 @@ do { \ for (unsigned __no = 0; __no < (nstates); __no++) { \ __elem = &(__info->elem[__no]); \ __rep = (IOStateReportValues *) &(__elem->values); \ - __elem->channel_id = (channelID); \ __elem->provider_id = (providerID); \ + __elem->channel_id = (channelID); \ __elem->channel_type.report_format = kIOReportFormatState; \ __elem->channel_type.reserved = 0; \ __elem->channel_type.categories = (cats); \ @@ -247,13 +249,13 @@ do { \ __rep->state_id = __no; \ __rep->intransitions = 0; \ __rep->upticks = 0; \ + __rep->last_intransition = 0; \ } \ __info->curr_state = 0; \ __info->update_ts = 0; \ } \ else { \ IOREPORT_ABORT("bufSize is smaller than the required size\n"); \ - __POLLUTE_BUF((buf), (bufSize)); \ } \ } while(0) @@ -408,12 +410,13 @@ do { \ * uint64_t channelID - ID of this channel, see IOREPORT_MAKEID() * IOReportCategories categories - categories of this channel * - * If the buffer is not of sufficient size, the macro invokes IOREPORT_ABORT() - * and, if that returns, fills the buffer with 0xbadcafe. + * If the buffer is not of sufficient size, the macro invokes IOREPORT_ABORT(). + * If that returns, the buffer is left full of '&'. 
*/ #define SIMPLEARRAY_INIT(nValues, buf, bufSize, providerID, channelID, cats) \ do { \ + memset((buf), '&', (bufSize)); \ IOSimpleArrayReportValues *__rep; \ IOReportElement *__elem; \ uint32_t __nElems = (((nValues) / IOR_VALUES_PER_ELEMENT) + \ @@ -422,8 +425,8 @@ do { \ for (unsigned __no = 0; __no < __nElems; __no++) { \ __elem = &(((IOReportElement *)(buf))[__no]); \ __rep = (IOSimpleArrayReportValues *) &(__elem->values); \ - __elem->channel_id = (channelID); \ __elem->provider_id = (providerID); \ + __elem->channel_id = (channelID); \ __elem->channel_type.report_format = kIOReportFormatSimpleArray; \ __elem->channel_type.reserved = 0; \ __elem->channel_type.categories = (cats); \ @@ -438,7 +441,6 @@ do { \ } \ else { \ IOREPORT_ABORT("bufSize is smaller than the required size\n"); \ - __POLLUTE_BUF((buf), (bufSize)); \ } \ } while(0) @@ -584,10 +586,11 @@ typedef struct { * IOReportCategories categories - categories of this channel * * If the buffer is not of sufficient size, the macro invokes IOREPORT_ABORT. - * If that returns, the buffer is filled with 0xbadcafe. + * If that returns, the buffer is left full of '&'. 
*/ #define HISTREPORT_INIT(nbuckets, bktSize, buf, bufSize, providerID, channelID, cats) \ do { \ + memset((buf), '&', (bufSize)); \ IOHistReportInfo *__info = (IOHistReportInfo *)(buf); \ IOReportElement *__elem; \ IOHistogramReportValues *__rep; \ @@ -596,20 +599,19 @@ do { \ for (unsigned __no = 0; __no < (nbuckets); __no++) { \ __elem = &(__info->elem[__no]); \ __rep = (IOHistogramReportValues *) &(__elem->values); \ - __elem->channel_id = (channelID); \ __elem->provider_id = (providerID); \ + __elem->channel_id = (channelID); \ __elem->channel_type.report_format = kIOReportFormatHistogram; \ __elem->channel_type.reserved = 0; \ __elem->channel_type.categories = (cats); \ __elem->channel_type.nelements = (nbuckets); \ __elem->channel_type.element_idx = __no; \ __elem->timestamp = 0; \ - bzero(__rep, sizeof(IOHistogramReportValues)); \ + memset(__rep, '\0', sizeof(IOHistogramReportValues)); \ } \ } \ else { \ IOREPORT_ABORT("bufSize is smaller than the required size\n"); \ - __POLLUTE_BUF((buf), (bufSize)); \ } \ } while (0) @@ -696,17 +698,6 @@ do { \ #define HISTREPORT_GETCHTYPE(hist_buf) \ (*(uint64_t*)&(((IOHistReportInfo *)(hist_buf))->elem[0].channel_type)) - - -/* generic utilities */ - - #define __POLLUTE_BUF(buf, bufSize) \ - do { \ - int __cnt = (bufSize)/sizeof(uint32_t); \ - while (--__cnt >= 0) \ - ((uint32_t*)(buf))[__cnt] = 0xbadcafe; \ - } while (0) - #ifdef __cplusplus } #endif diff --git a/iokit/IOKit/IOReturn.h b/iokit/IOKit/IOReturn.h index 83d05ce0a..464b84a08 100644 --- a/iokit/IOKit/IOReturn.h +++ b/iokit/IOKit/IOReturn.h @@ -75,6 +75,8 @@ typedef kern_return_t IOReturn; #define sub_iokit_sdio err_sub(0x174) #define sub_iokit_wlan err_sub(0x208) +#define sub_iokit_appleembeddedsleepwakehandler err_sub(0x209) + #define sub_iokit_vendor_specific err_sub(-2) #define sub_iokit_reserved err_sub(-1) diff --git a/iokit/IOKit/IOService.h b/iokit/IOKit/IOService.h index d99ceba5b..35e7ec20e 100644 --- a/iokit/IOKit/IOService.h +++ 
b/iokit/IOKit/IOService.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2014 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1998-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -108,6 +108,7 @@ extern const IORegistryPlane * gIOPowerPlane; extern const OSSymbol * gIOResourcesKey; extern const OSSymbol * gIOResourceMatchKey; +extern const OSSymbol * gIOResourceMatchedKey; extern const OSSymbol * gIOProviderClassKey; extern const OSSymbol * gIONameMatchKey; extern const OSSymbol * gIONameMatchedKey; @@ -142,6 +143,12 @@ extern const OSSymbol * gIODeviceMemoryKey; extern const OSSymbol * gIOInterruptControllersKey; extern const OSSymbol * gIOInterruptSpecifiersKey; +extern const OSSymbol * gIOBSDKey; +extern const OSSymbol * gIOBSDNameKey; +extern const OSSymbol * gIOBSDMajorKey; +extern const OSSymbol * gIOBSDMinorKey; +extern const OSSymbol * gIOBSDUnitKey; + extern SInt32 IOServiceOrdering( const OSMetaClassBase * inObj1, const OSMetaClassBase * inObj2, void * ref ); typedef void (*IOInterruptAction)( OSObject * target, void * refCon, @@ -536,6 +543,7 @@ virtual IOReturn updateReport(IOReportChannelList *channels, @discussion IOService provides generic open and close semantics to track clients of a provider that have established an active datapath. The use of open and @link close close@/link, and rules regarding ownership are family defined, and defined by the @link handleOpen handleOpen@/link and @link handleClose handleClose@/link methods in the provider. Some families will limit access to a provider based on its open state. @param forClient Designates the client of the provider requesting the open. @param options Options for the open. The provider family may implement options for open; IOService defines only kIOServiceSeize to request the device be withdrawn from its current owner. + @param arg Family specific arguments which are ignored by IOService. @result true if the open was successful; false otherwise. 
*/ virtual bool open( IOService * forClient, @@ -546,8 +554,7 @@ virtual IOReturn updateReport(IOReportChannelList *channels, @abstract Releases active access to a provider. @discussion IOService provides generic open and close semantics to track clients of a provider that have established an active datapath. The use of @link open open@/link and close, and rules regarding ownership are family defined, and defined by the @link handleOpen handleOpen@/link and @link handleClose handleClose@/link methods in the provider. @param forClient Designates the client of the provider requesting the close. - @param options Options available for the close. The provider family may implement options for close; IOService defines none. - @param arg Family specific arguments which are ignored by IOService. */ + @param options Options available for the close. The provider family may implement options for close; IOService defines none. */ virtual void close( IOService * forClient, IOOptionBits options = 0 ); @@ -555,8 +562,8 @@ virtual IOReturn updateReport(IOReportChannelList *channels, /*! @function isOpen @abstract Determines whether a specific, or any, client has an IOService object open. @discussion Returns the open state of an IOService object with respect to the specified client, or when it is open by any client. - @param forClient If non-zero, isOpen returns the open state for that client. If zero is passed, isOpen returns the open state for all clients. + @result true if the specific, or any, client has the IOService object open. */ virtual bool isOpen( const IOService * forClient = 0 ) const; @@ -697,7 +704,7 @@ virtual IOReturn updateReport(IOReportChannelList *channels, @abstract Uses the resource service to publish a property. @discussion The resource service uses IOService's matching and notification to allow objects to be published and found by any I/O Kit client by a global name. 
publishResource makes an object available to anyone waiting for it or looking for it in the future. @param key An OSSymbol key that globally identifies the object. - @param The object to be published. */ + @param value The object to be published. */ static void publishResource( const OSSymbol * key, OSObject * value = 0 ); @@ -705,7 +712,7 @@ virtual IOReturn updateReport(IOReportChannelList *channels, @abstract Uses the resource service to publish a property. @discussion The resource service uses IOService object's matching and notification to allow objects to be published and found by any I/O Kit client by a global name. publishResource makes an object available to anyone waiting for it or looking for it in the future. @param key A C string key that globally identifies the object. - @param The object to be published. */ + @param value The object to be published. */ static void publishResource( const char * key, OSObject * value = 0 ); virtual bool addNeededResource( const char * key ); @@ -872,7 +879,7 @@ virtual IOReturn updateReport(IOReportChannelList *channels, /*! @function registryEntryIDMatching @abstract Creates a matching dictionary, or adds matching properties to an existing dictionary, that specify a IORegistryEntryID match. @discussion registryEntryIDMatching creates a matching dictionary that specifies the IOService object with the assigned registry entry ID (returned by IORegistryEntry::getRegistryEntryID()). An existing dictionary may be passed in, in which case the matching properties will be added to that dictionary rather than creating a new one. - @param name The service's ID. Matching is successful on the IOService object that return that ID from the IORegistryEntry::getRegistryEntryID() method. + @param entryID The service's ID. Matching is successful on the IOService object that return that ID from the IORegistryEntry::getRegistryEntryID() method. 
@param table If zero, registryEntryIDMatching creates a matching dictionary and returns a reference to it, otherwise the matching properties are added to the specified dictionary. @result The matching dictionary created, or passed in, is returned on success, or zero on failure. */ @@ -1438,7 +1445,7 @@ virtual IOReturn updateReport(IOReportChannelList *channels, @param controllingDriver A pointer to the calling driver, usually this. @param powerStates A driver-defined array of power states that the driver and device support. Power states are defined in pwr_mgt/IOPMpowerState.h. @param numberOfStates The number of power states in the array. - @result IOPMNoErr. All errors are logged via kprintf. */ + @result IOPMNoErr. All errors are logged via kprintf. */ virtual IOReturn registerPowerDriver( IOService * controllingDriver, @@ -1655,7 +1662,7 @@ virtual IOReturn updateReport(IOReportChannelList *channels, @param period The desired idle timer period in seconds. @result kIOReturnSuccess upon success; an I/O Kit error code otherwise. */ - virtual IOReturn setIdleTimerPeriod( unsigned long ); + virtual IOReturn setIdleTimerPeriod( unsigned long period ); #ifndef __LP64__ /*! 
@function getPMworkloop @@ -1824,9 +1831,7 @@ virtual IOReturn updateReport(IOReportChannelList *channels, IOReturn registerInterestForNotifer( IONotifier *notify, const OSSymbol * typeOfInterest, IOServiceInterestHandler handler, void * target, void * ref ); -#ifdef __LP64__ - static IOWorkLoop * getPMworkloop( void ); -#endif + static IOWorkLoop * getIOPMWorkloop( void ); protected: bool tellClientsWithResponse( int messageType ); @@ -1894,7 +1899,6 @@ virtual IOReturn updateReport(IOReportChannelList *channels, static void watchdog_timer_expired ( thread_call_param_t arg0, thread_call_param_t arg1 ); static void spindump_timer_expired( thread_call_param_t arg0, thread_call_param_t arg1 ); static IOReturn actionAckTimerExpired(OSObject *, void *, void *, void *, void * ); - static IOReturn watchdog_timer_expired ( OSObject *, void *, void *, void *, void * ); static IOReturn actionSpinDumpTimerExpired(OSObject *, void *, void *, void *, void * ); static IOReturn actionDriverCalloutDone(OSObject *, void *, void *, void *, void * ); diff --git a/iokit/IOKit/IOTimerEventSource.h b/iokit/IOKit/IOTimerEventSource.h index f5accffa3..538c4991b 100644 --- a/iokit/IOKit/IOTimerEventSource.h +++ b/iokit/IOKit/IOTimerEventSource.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2000, 2009 Apple Inc. All rights reserved. + * Copyright (c) 1998-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -107,15 +107,13 @@ class IOTimerEventSource : public IOEventSource /*! @function timerEventSource @abstract Allocates and returns an initialized timer instance. - @param owner - @param action */ + */ static IOTimerEventSource * timerEventSource(OSObject *owner, Action action = 0); /*! @function init @abstract Initializes the timer with an owner, and a handler to call when the timeout expires. - @param owner - @param action */ + */ virtual bool init(OSObject *owner, Action action = 0); /*! 
@function enable @@ -131,19 +129,19 @@ class IOTimerEventSource : public IOEventSource /*! @function setTimeoutTicks @abstract Setup a callback at after the delay in scheduler ticks. See wakeAtTime(AbsoluteTime). - @param interval Delay from now to wake up, in scheduler ticks, whatever that may be. + @param ticks Delay from now to wake up, in scheduler ticks, whatever that may be. @result kIOReturnSuccess if everything is fine, kIOReturnNoResources if action hasn't been declared. */ virtual IOReturn setTimeoutTicks(UInt32 ticks); /*! @function setTimeoutMS @abstract Setup a callback at after the delay in milliseconds. See wakeAtTime(AbsoluteTime). - @param interval Delay from now to wake up, time in milliseconds. + @param ms Delay from now to wake up, time in milliseconds. @result kIOReturnSuccess if everything is fine, kIOReturnNoResources if action hasn't been declared. */ virtual IOReturn setTimeoutMS(UInt32 ms); /*! @function setTimeoutUS @abstract Setup a callback at after the delay in microseconds. See wakeAtTime(AbsoluteTime). - @param interval Delay from now to wake up, time in microseconds. + @param us Delay from now to wake up, time in microseconds. @result kIOReturnSuccess if everything is fine, kIOReturnNoResources if action hasn't been declared. */ virtual IOReturn setTimeoutUS(UInt32 us); @@ -168,19 +166,19 @@ class IOTimerEventSource : public IOEventSource /*! @function wakeAtTimeTicks @abstract Setup a callback at this absolute time. See wakeAtTime(AbsoluteTime). - @param abstime Time to wake up in scheduler quantums, whatever that is? + @param ticks Time to wake up in scheduler quantums, whatever that is? @result kIOReturnSuccess if everything is fine, kIOReturnNoResources if action hasn't been declared. */ virtual IOReturn wakeAtTimeTicks(UInt32 ticks); /*! @function wakeAtTimeMS @abstract Setup a callback at this absolute time. See wakeAtTime(AbsoluteTime). - @param abstime Time to wake up in milliseconds. 
+ @param ms Time to wake up in milliseconds. @result kIOReturnSuccess if everything is fine, kIOReturnNoResources if action hasn't been declared. */ virtual IOReturn wakeAtTimeMS(UInt32 ms); /*! @function wakeAtTimeUS @abstract Setup a callback at this absolute time. See wakeAtTime(AbsoluteTime). - @param abstime Time to wake up in microseconds. + @param us Time to wake up in microseconds. @result kIOReturnSuccess if everything is fine, kIOReturnNoResources if action hasn't been declared. */ virtual IOReturn wakeAtTimeUS(UInt32 us); diff --git a/iokit/IOKit/IOUserClient.h b/iokit/IOKit/IOUserClient.h index 81adfefb9..cce1695b9 100644 --- a/iokit/IOKit/IOUserClient.h +++ b/iokit/IOKit/IOUserClient.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1998-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -319,7 +319,7 @@ class IOUserClient : public IOService @function releaseNotificationPort @abstract Release the mach_port_t passed to registerNotificationPort(). @discussion The mach_port_t passed to the registerNotificationPort() methods should be released to balance each call to registerNotificationPort(). Behavior is undefined if these calls are not correctly balanced. - @param reference The mach_port_t argument previously passed to the subclass implementation of registerNotificationPort(). + @param port The mach_port_t argument previously passed to the subclass implementation of registerNotificationPort(). @result A return code. */ static IOReturn releaseNotificationPort(mach_port_t port); diff --git a/iokit/IOKit/IOWorkLoop.h b/iokit/IOKit/IOWorkLoop.h index 2db7b17ef..73c868876 100644 --- a/iokit/IOKit/IOWorkLoop.h +++ b/iokit/IOKit/IOWorkLoop.h @@ -75,7 +75,8 @@ member function's parameter list. 
void *arg0, void *arg1, void *arg2, void *arg3); enum { - kPreciousStack = 0x00000001 + kPreciousStack = 0x00000001, + kTimeLockPanics = 0x00000002, }; private: @@ -150,6 +151,8 @@ IOWorkLoop uses this to determine if the event source should be polled in runEve #else void *iokitstatsReserved; #endif + uint64_t lockInterval; + uint64_t lockTime; }; /*! @var reserved @@ -313,10 +316,21 @@ IOWorkLoop uses this to determine if the event source should be polled in runEve */ virtual bool runEventSources(); +/*! @function setMaximumLockTime + @discussion For diagnostics use in DEVELOPMENT kernels, set a time interval which if the work loop lock is held for this time or greater, IOWorkLoop will panic or log a backtrace. + @param interval An absolute time interval, eg. created with clock_interval_to_absolutetime_interval(). + @param options Pass IOWorkLoop::kTimeLockPanics to panic when the time is exceeded, otherwise a log will be generated with OSReportWithBacktrace(). +*/ + void setMaximumLockTime(uint64_t interval, uint32_t options); + protected: // Internal APIs used by event sources to control the thread virtual int sleepGate(void *event, AbsoluteTime deadline, UInt32 interuptibleType); +#if XNU_KERNEL_PRIVATE + void lockTime(void); +#endif /* XNU_KERNEL_PRIVATE */ + protected: #if __LP64__ OSMetaClassDeclareReservedUnused(IOWorkLoop, 0); diff --git a/iokit/IOKit/Makefile b/iokit/IOKit/Makefile index 5ea0528d5..008d33e7e 100644 --- a/iokit/IOKit/Makefile +++ b/iokit/IOKit/Makefile @@ -22,9 +22,10 @@ INSTINC_SUBDIRS = \ rtc \ system_management + EXPINC_SUBDIRS = ${INSTINC_SUBDIRS} -# By default, everything in xnu/iokit/IOKit gets installed into +# By default, everything in xnu/iokit/IOKit gets installed into # Kernel.framework/Headers/IOKit AND Kernel.framework/PrivateHeaders/IOKit. # This is so the files with #ifdef ...PRIVATE portions can be processed. # xnu/README documents the INSTALL* and EXPORT_MI_DIR lists. 
@@ -37,7 +38,7 @@ NOT_EXPORT_HEADERS = IOInterruptAccountingPrivate.h NOT_KF_MI_HEADERS = $(NOT_EXPORT_HEADERS) \ IOKitKeysPrivate.h IOCPU.h \ IOHibernatePrivate.h IOPolledInterface.h \ - IOCommandQueue.h IOLocksPrivate.h \ + IOCommandQueue.h IOLocksPrivate.h \ IOSyncer.h AppleKeyStoreInterface.h \ IOStatistics.h IOStatisticsPrivate.h \ IOKernelReporters.h diff --git a/iokit/IOKit/machine/Makefile b/iokit/IOKit/machine/Makefile index d68173f0c..d7a224db7 100644 --- a/iokit/IOKit/machine/Makefile +++ b/iokit/IOKit/machine/Makefile @@ -11,13 +11,13 @@ include $(MakeInc_cmd) include $(MakeInc_def) MI_DIR = machine -EXCLUDE_HEADERS = +EXCLUDE_HEADERS = ALL_HEADERS = $(shell (cd $(SOURCE); echo *.h)) HEADER_LIST = $(filter-out $(EXCLUDE_HEADERS), $(ALL_HEADERS)) INSTALL_MI_LIST = ${HEADER_LIST} -INSTALL_MI_LCL_LIST = +INSTALL_MI_LCL_LIST = INSTALL_MI_DIR = $(MI_DIR) EXPORT_MI_LIST = ${HEADER_LIST} diff --git a/iokit/IOKit/nvram/Makefile b/iokit/IOKit/nvram/Makefile index 1f1db4527..393486e2a 100644 --- a/iokit/IOKit/nvram/Makefile +++ b/iokit/IOKit/nvram/Makefile @@ -11,12 +11,12 @@ include $(MakeInc_cmd) include $(MakeInc_def) MI_DIR = nvram -NOT_EXPORT_HEADERS = +NOT_EXPORT_HEADERS = ALL_HEADERS = $(shell (cd $(SOURCE); echo *.h)) -INSTALL_MI_LIST = -INSTALL_MI_LCL_LIST = +INSTALL_MI_LIST = +INSTALL_MI_LCL_LIST = INSTALL_MI_DIR = $(MI_DIR) EXPORT_MI_LIST = $(filter-out $(NOT_EXPORT_HEADERS), $(ALL_HEADERS)) diff --git a/iokit/IOKit/platform/Makefile b/iokit/IOKit/platform/Makefile index fd6716035..ebb1f416e 100644 --- a/iokit/IOKit/platform/Makefile +++ b/iokit/IOKit/platform/Makefile @@ -11,13 +11,13 @@ include $(MakeInc_cmd) include $(MakeInc_def) MI_DIR = platform -NOT_EXPORT_HEADERS = -NOT_KF_MI_HEADERS = +NOT_EXPORT_HEADERS = +NOT_KF_MI_HEADERS = ALL_HEADERS = $(shell (cd $(SOURCE); echo *.h)) -INSTALL_MI_LIST = -INSTALL_MI_LCL_LIST = +INSTALL_MI_LIST = +INSTALL_MI_LCL_LIST = INSTALL_MI_DIR = $(MI_DIR) EXPORT_MI_LIST = $(filter-out $(NOT_EXPORT_HEADERS), 
$(ALL_HEADERS)) diff --git a/iokit/IOKit/power/Makefile b/iokit/IOKit/power/Makefile index acb41d022..01cc4bd09 100644 --- a/iokit/IOKit/power/Makefile +++ b/iokit/IOKit/power/Makefile @@ -11,12 +11,12 @@ include $(MakeInc_cmd) include $(MakeInc_def) MI_DIR = power -NOT_EXPORT_HEADERS = +NOT_EXPORT_HEADERS = ALL_HEADERS = $(shell (cd $(SOURCE); echo *.h)) -INSTALL_MI_LIST = -INSTALL_MI_LCL_LIST = +INSTALL_MI_LIST = +INSTALL_MI_LCL_LIST = INSTALL_MI_DIR = $(MI_DIR) EXPORT_MI_LIST = $(filter-out $(NOT_EXPORT_HEADERS), $(ALL_HEADERS)) diff --git a/iokit/IOKit/pwr_mgt/IOPM.h b/iokit/IOKit/pwr_mgt/IOPM.h index 9623838aa..03daffd06 100644 --- a/iokit/IOKit/pwr_mgt/IOPM.h +++ b/iokit/IOKit/pwr_mgt/IOPM.h @@ -612,6 +612,7 @@ enum { #define kIOPMPSAdapterDetailsAmperageKey "Amperage" #define kIOPMPSAdapterDetailsDescriptionKey "Description" #define kIOPMPSAdapterDetailsPMUConfigurationKey "PMUConfiguration" +#define kIOPMPSAdapterDetailsVoltage "AdapterVoltage" // Battery's time remaining estimate is invalid this long (seconds) after a wake #define kIOPMPSInvalidWakeSecondsKey "BatteryInvalidWakeSeconds" diff --git a/iokit/IOKit/pwr_mgt/IOPMPrivate.h b/iokit/IOKit/pwr_mgt/IOPMPrivate.h index 25226da74..4e498388c 100644 --- a/iokit/IOKit/pwr_mgt/IOPMPrivate.h +++ b/iokit/IOKit/pwr_mgt/IOPMPrivate.h @@ -531,10 +531,49 @@ enum { /*****************************************************************************/ /* -�* kIOPMLoginWindowSecurityDebugKey - identifies PM debug data specific to LoginWindow - * for use with IOPMrootDomain. -�*/ -#define kIOPMLoginWindowSecurityDebugKey "LoginWindowSecurity" + * Component wake progress keys + * + * Certain components have the ability to log their wake progress with + * root domain using the keys provided below. + * + * LoginWindow - 4 bits + * CoreDisplay - 8 bits + * CoreGraphics - 8 bits + * + * These bits are stored with the trace phase that gets logged to + * the RTC register. 
+ */ + +// Values that should be passed in to IOPMLogWakeProgress +enum { + kIOPMLoginWindowProgress = 1, + kIOPMCoreDisplayProgress = 2, + kIOPMCoreGraphicsProgress = 3 +}; + +enum { + kIOPMLoginWindowProgressMask = 0x0f, + kIOPMCoreDisplayProgressMask = 0xff, + kIOPMCoreGraphicsProgressMask = 0xff +}; + +/* + * kIOPMLoginWindowProgressKey - identifies PM debug data specific to LoginWindow + * for use with IOPMrootDomain. Only 4 bits of data are allotted. + */ +#define kIOPMLoginWindowProgressKey "LoginWindowProgress" + +/* + * kIOPMCoreDisplayProgressKey - identifies PM debug data specific to CoreDisplay + * for use with IOPMrootDomain. Only 8 bits of data are allotted. + */ +#define kIOPMCoreDisplayProgressKey "CoreDisplayProgress" + +/* + * kIOPMCoreGraphicsProgressKey - identifies PM debug data specific to CoreGraphics + * for use with IOPMrootDomain. Only 8 bits of data are allotted. + */ +#define kIOPMCoreGraphicsProgressKey "CoreGraphicsProgress" // For PM internal use only - key to locate sleep failure results within SCDynamicStore. 
#define kIOPMDynamicStoreSleepFailureKey "SleepFailure" @@ -788,22 +827,13 @@ typedef struct { /* All members from UUID onwards are saved into log file */ char UUID[44]; - char cps[9]; /* Current power state */ + char spindump_status[24]; /* stackshot status*/ char PMStatusCode[32]; char reason[32]; } swd_hdr; -/* - * Structure between stackshot samples, expected by spindump - */ -typedef struct { - uint32_t magic; // 0xbad51ee4 - uint32_t size; // Size of the stackshot buffer following this struct -} swd_stackshot_hdr; - #define SWD_HDR_SIGNATURE 0xdeb8da2a -#define SWD_STACKSHOTHDR_MAGIC 0xbad51ee4 // expected by spindump #define SWD_BUF_SIZE (40*PAGE_SIZE) #define SWD_INITIAL_STACK_SIZE ((SWD_BUF_SIZE/2)-sizeof(swd_hdr)) @@ -819,6 +849,14 @@ typedef struct { #define SWD_LOGS_IN_FILE 0x10 #define SWD_LOGS_IN_MEM 0x20 +#define SWD_DATA_CRC_ERROR 0x010000 +#define SWD_BUF_SIZE_ERROR 0x020000 +#define SWD_HDR_SIZE_ERROR 0x040000 +#define SWD_FILEOP_ERROR 0x080000 +#define SWD_HDR_SIGNATURE_ERROR 0x100000 +#define SWD_INTERNAL_FAILURE 0x200000 + + /* Filenames associated with the stackshots/logs generated by the SWD */ #define kSleepWakeStackBinFilename "/var/log/SleepWakeStacks.bin" #define kSleepWakeStackFilename "/var/log/SleepWakeStacks.dump" diff --git a/iokit/IOKit/pwr_mgt/IOPMlog.h b/iokit/IOKit/pwr_mgt/IOPMlog.h index b950816e0..351ddad2d 100644 --- a/iokit/IOKit/pwr_mgt/IOPMlog.h +++ b/iokit/IOKit/pwr_mgt/IOPMlog.h @@ -81,5 +81,11 @@ enum PMLogEnum { kPMLogIdleCancel, // 53 0x050700d4 - device unidle during change kPMLogSleepWakeTracePoint, // 54 0x050700d8 - kIOPMTracePoint markers kPMLogQuiescePowerTree, // 55 0x050700dc + kPMLogComponentWakeProgress, // 56 0x050700e0 + kPMLogUserActiveState, // 57 0x050700e4 + kPMLogAppResponseDelay, // 58 0x050700e8 + kPMLogDrvResponseDelay, // 59 0x050700ec + kPMLogPCIDevChangeStart, // 60 0x050700f0 + kPMLogPCIDevChangeDone, // 61 0x050700f4 kIOPMlogLastEvent }; diff --git a/iokit/IOKit/pwr_mgt/RootDomain.h 
b/iokit/IOKit/pwr_mgt/RootDomain.h index 2ecc15e63..ff6bcb6fd 100644 --- a/iokit/IOKit/pwr_mgt/RootDomain.h +++ b/iokit/IOKit/pwr_mgt/RootDomain.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1998-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -168,10 +168,7 @@ class IOPMrootDomain: public IOService @discussion systemPowerEventOccurred is a richer alternative to receivePowerNotification() Only Apple-owned kexts should have reason to call systemPowerEventOccurred. @param event An OSSymbol describing the type of power event. - @param value A 32-bit integer value associated with the event. - @param shouldUpdate indicates whether the root domain should send a notification - to interested parties. Pass false if you're calling systemPowerEventOccurred - several times in succession; and pass true only on the last invocatino. + @param intValue A 32-bit integer value associated with the event. @result kIOReturnSuccess on success */ IOReturn systemPowerEventOccurred( @@ -339,7 +336,7 @@ class IOPMrootDomain: public IOService /*! @function createPMAssertion @abstract Creates an assertion to influence system power behavior. - @param whichAssertionBits A bitfield specify the assertion that the caller requests. + @param whichAssertionsBits A bitfield specify the assertion that the caller requests. @param assertionLevel An integer detailing the initial assertion level, kIOPMDriverAssertionLevelOn or kIOPMDriverAssertionLevelOff. @param ownerService A pointer to the caller's IOService class, for tracking. 
@@ -512,9 +509,10 @@ class IOPMrootDomain: public IOService IOReturn joinAggressiveness( IOService * service ); void handleAggressivesRequests( void ); + void kdebugTrace(uint32_t event, uint64_t regId, + uintptr_t param1, uintptr_t param2, uintptr_t param3 = 0); void tracePoint( uint8_t point ); - void tracePoint( uint8_t point, uint8_t data ); - void traceDetail( uint32_t data32 ); + void traceDetail(uint32_t msgType, uint32_t msgIndex, uintptr_t handler); bool systemMessageFilter( void * object, void * arg1, void * arg2, void * arg3 ); @@ -536,7 +534,7 @@ class IOPMrootDomain: public IOService const char *name, int messageType, uint32_t delay_ms, - int app_pid, + uint64_t id, OSObject *object, IOPMPowerStateIndex ps=0); @@ -554,6 +552,8 @@ class IOPMrootDomain: public IOService bool sleepWakeDebugIsWdogEnabled(); static void saveTimeoutAppStackShot(void *p0, void *p1); void sleepWakeDebugSaveSpinDumpFile(); + void swdDebugSetup(); + void swdDebugTeardown(); private: friend class PMSettingObject; @@ -648,7 +648,8 @@ class IOPMrootDomain: public IOService thread_call_t extraSleepTimer; thread_call_t diskSyncCalloutEntry; thread_call_t fullWakeThreadCall; - thread_call_t hibDebugSetupEntry; + thread_call_t swdDebugSetupEntry; + thread_call_t swdDebugTearDownEntry; thread_call_t updateConsoleUsersEntry; // Track system capabilities. 
@@ -728,6 +729,8 @@ class IOPMrootDomain: public IOService uint32_t lastSleepReason; uint32_t fullToDarkReason; uint32_t hibernateAborted; + uint8_t standbyNixed; + uint8_t resetTimers; enum FullWakeReason { kFullWakeReasonNone = 0, @@ -750,7 +753,6 @@ class IOPMrootDomain: public IOService OSData * aggressivesData; AbsoluteTime userBecameInactiveTime; - AbsoluteTime systemWakeTime; // PCI top-level PM trace IOService * pciHostBridgeDevice; @@ -778,9 +780,12 @@ class IOPMrootDomain: public IOService #endif volatile uint32_t swd_lock; /* Lock to access swd_buffer & and its header */ void * swd_buffer; /* Memory allocated for dumping sleep/wake logs */ - uint8_t swd_flags; /* Flags defined in IOPMPrivate.h */ + uint32_t swd_flags; /* Flags defined in IOPMPrivate.h */ + uint8_t swd_DebugImageSetup; void * swd_spindump_buffer; + IOBufferMemoryDescriptor *swd_memDesc; + IOMemoryMap * swd_logBufMap; /* Memory with sleep/wake logs from previous boot */ // Wake Event Reporting @@ -845,7 +850,7 @@ class IOPMrootDomain: public IOService void deregisterPMSettingObject( PMSettingObject * pmso ); - void checkForValidDebugData(const char *fname, vfs_context_t *ctx, + uint32_t checkForValidDebugData(const char *fname, vfs_context_t *ctx, void *tmpBuf, struct vnode **vp); void sleepWakeDebugMemAlloc( ); void sleepWakeDebugSpinDumpMemAlloc( ); diff --git a/iokit/IOKit/rtc/Makefile b/iokit/IOKit/rtc/Makefile index 6d08bc486..587476354 100644 --- a/iokit/IOKit/rtc/Makefile +++ b/iokit/IOKit/rtc/Makefile @@ -11,12 +11,12 @@ include $(MakeInc_cmd) include $(MakeInc_def) MI_DIR = rtc -NOT_EXPORT_HEADERS = +NOT_EXPORT_HEADERS = ALL_HEADERS = $(shell (cd $(SOURCE); echo *.h)) -INSTALL_MI_LIST = -INSTALL_MI_LCL_LIST = +INSTALL_MI_LIST = +INSTALL_MI_LCL_LIST = INSTALL_MI_DIR = $(MI_DIR) EXPORT_MI_LIST = $(filter-out $(NOT_EXPORT_HEADERS), $(ALL_HEADERS)) diff --git a/iokit/IOKit/system_management/Makefile b/iokit/IOKit/system_management/Makefile index 7453b1620..715cf2d98 100644 --- 
a/iokit/IOKit/system_management/Makefile +++ b/iokit/IOKit/system_management/Makefile @@ -11,12 +11,12 @@ include $(MakeInc_cmd) include $(MakeInc_def) MI_DIR = system_management -NOT_EXPORT_HEADERS = +NOT_EXPORT_HEADERS = ALL_HEADERS = $(shell (cd $(SOURCE); echo *.h)) -INSTALL_MI_LIST = -INSTALL_MI_LCL_LIST = +INSTALL_MI_LIST = +INSTALL_MI_LCL_LIST = INSTALL_MI_DIR = $(MI_DIR) EXPORT_MI_LIST = $(filter-out $(NOT_EXPORT_HEADERS), $(ALL_HEADERS)) diff --git a/iokit/Kernel/IOBufferMemoryDescriptor.cpp b/iokit/Kernel/IOBufferMemoryDescriptor.cpp index 046274c59..5641d3b9e 100644 --- a/iokit/Kernel/IOBufferMemoryDescriptor.cpp +++ b/iokit/Kernel/IOBufferMemoryDescriptor.cpp @@ -626,6 +626,9 @@ void * IOBufferMemoryDescriptor::getBytesNoCopy(vm_size_t start, vm_size_t withLength) { IOVirtualAddress address; + + if ((start + withLength) < start) return 0; + if (kIOMemoryTypePhysical64 == (_flags & kIOMemoryTypeMask)) address = (IOVirtualAddress) _buffer; else diff --git a/iokit/Kernel/IOCPU.cpp b/iokit/Kernel/IOCPU.cpp index a6e41b648..01354d7cf 100644 --- a/iokit/Kernel/IOCPU.cpp +++ b/iokit/Kernel/IOCPU.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1999-2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1999-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -25,12 +25,6 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -/* - * Copyright (c) 1999-2000 Apple Computer, Inc. All rights reserved. 
- * - * DRI: Josh de Cesare - * - */ extern "C" { #include @@ -47,10 +41,14 @@ extern "C" { #include #include #include +#include "IOKitKernelInternal.h" /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ #include +extern "C" void console_suspend(); +extern "C" void console_resume(); + typedef kern_return_t (*iocpu_platform_action_t)(void * refcon0, void * refcon1, uint32_t priority, void * param1, void * param2, void * param3, const char * name); @@ -157,6 +155,7 @@ IOCPURunPlatformActiveActions(void) extern "C" kern_return_t IOCPURunPlatformHaltRestartActions(uint32_t message) { + if (!gActionQueues[kQueueHaltRestart].next) return (kIOReturnNotReady); return (iocpu_run_platform_actions(&gActionQueues[kQueueHaltRestart], 0, 0U-1, (void *)(uintptr_t) message, NULL, NULL)); } @@ -164,6 +163,7 @@ IOCPURunPlatformHaltRestartActions(uint32_t message) extern "C" kern_return_t IOCPURunPlatformPanicActions(uint32_t message) { + if (!gActionQueues[kQueueHaltRestart].next) return (kIOReturnNotReady); return (iocpu_run_platform_actions(&gActionQueues[kQueuePanic], 0, 0U-1, (void *)(uintptr_t) message, NULL, NULL)); } @@ -441,6 +441,8 @@ void IOCPUSleepKernel(void) assert(bootCPU != NULL); assert(cpu_number() == 0); + console_suspend(); + rootDomain->tracePoint( kIOPMTracePointSleepPlatformDriver ); // Now sleep the boot CPU. 
@@ -448,6 +450,8 @@ void IOCPUSleepKernel(void) rootDomain->tracePoint( kIOPMTracePointWakePlatformActions ); + console_resume(); + iocpu_run_platform_actions(&gActionQueues[kQueueWake], 0, 0U-1, NULL, NULL, NULL); diff --git a/iokit/Kernel/IODMACommand.cpp b/iokit/Kernel/IODMACommand.cpp index c8477aaca..936ae0879 100644 --- a/iokit/Kernel/IODMACommand.cpp +++ b/iokit/Kernel/IODMACommand.cpp @@ -166,7 +166,7 @@ IODMACommand::cloneCommand(void *refCon) SegmentOptions segmentOptions = { .fStructSize = sizeof(segmentOptions), - .fNumAddressBits = fNumAddressBits, + .fNumAddressBits = (uint8_t)fNumAddressBits, .fMaxSegmentSize = fMaxSegmentSize, .fMaxTransferSize = fMaxTransferSize, .fAlignment = fAlignMask + 1, @@ -379,8 +379,15 @@ IODMACommand::setMemoryDescriptor(const IOMemoryDescriptor *mem, bool autoPrepar fInternalState->fNewMD = true; mem->retain(); fMemory = mem; - - mem->dmaCommandOperation(kIOMDSetDMAActive, this, 0); + if (fMapper) + { +#if IOTRACKING + fInternalState->fTag = IOMemoryTag(kernel_map); + __IODEQUALIFY(IOMemoryDescriptor *, mem)->prepare((IODirection) + (kIODirectionDMACommand | (fInternalState->fTag << kIODirectionDMACommandShift))); + IOTrackingAdd(gIOWireTracking, &fInternalState->fWireTracking, fMemory->getLength(), false); +#endif /* IOTRACKING */ + } if (autoPrepare) { err = prepare(); if (err) { @@ -395,13 +402,19 @@ IODMACommand::setMemoryDescriptor(const IOMemoryDescriptor *mem, bool autoPrepar IOReturn IODMACommand::clearMemoryDescriptor(bool autoComplete) { - if (fActive && !autoComplete) - return (kIOReturnNotReady); + if (fActive && !autoComplete) return (kIOReturnNotReady); - if (fMemory) { - while (fActive) - complete(); - fMemory->dmaCommandOperation(kIOMDSetDMAInactive, this, 0); + if (fMemory) + { + while (fActive) complete(); + if (fMapper) + { +#if IOTRACKING + __IODEQUALIFY(IOMemoryDescriptor *, fMemory)->complete((IODirection) + (kIODirectionDMACommand | (fInternalState->fTag << kIODirectionDMACommandShift))); + 
IOTrackingRemove(gIOWireTracking, &fInternalState->fWireTracking, fMemory->getLength()); +#endif /* IOTRACKING */ + } fMemory->release(); fMemory = 0; } @@ -1150,7 +1163,7 @@ IODMACommand::genIOVMSegments(uint32_t op, if (internalState->fMapContig && internalState->fLocalMapperAlloc) { - state->fIOVMAddr = internalState->fLocalMapperAlloc + offset; + state->fIOVMAddr = internalState->fLocalMapperAlloc + offset - internalState->fPreparedOffset; rtn = kIOReturnSuccess; #if 0 { diff --git a/iokit/Kernel/IODataQueue.cpp b/iokit/Kernel/IODataQueue.cpp index e3afbdcf3..3f718a754 100644 --- a/iokit/Kernel/IODataQueue.cpp +++ b/iokit/Kernel/IODataQueue.cpp @@ -156,8 +156,9 @@ void IODataQueue::free() Boolean IODataQueue::enqueue(void * data, UInt32 dataSize) { - const UInt32 head = dataQueue->head; // volatile - const UInt32 tail = dataQueue->tail; + UInt32 head; + UInt32 tail; + UInt32 newTail; const UInt32 entrySize = dataSize + DATA_QUEUE_ENTRY_HEADER_SIZE; UInt32 queueSize; IODataQueueEntry * entry; @@ -167,6 +168,10 @@ Boolean IODataQueue::enqueue(void * data, UInt32 dataSize) return false; } + // Force a single read of head and tail + head = __c11_atomic_load((_Atomic UInt32 *)&dataQueue->head, __ATOMIC_RELAXED); + tail = __c11_atomic_load((_Atomic UInt32 *)&dataQueue->tail, __ATOMIC_RELAXED); + // Check for underflow of (dataQueue->queueSize - tail) queueSize = ((IODataQueueInternal *) notifyMsg)->queueSize; if ((queueSize < tail) || (queueSize < head)) { @@ -187,8 +192,8 @@ Boolean IODataQueue::enqueue(void * data, UInt32 dataSize) // The tail can be out of bound when the size of the new entry // exactly matches the available space at the end of the queue. // The tail can range from 0 to dataQueue->queueSize inclusive. - - OSAddAtomic(entrySize, (SInt32 *)&dataQueue->tail); + + newTail = tail + entrySize; } else if ( head > entrySize ) // Is there enough room at the beginning? 
{ @@ -207,7 +212,7 @@ Boolean IODataQueue::enqueue(void * data, UInt32 dataSize) } memcpy(&dataQueue->queue->data, data, dataSize); - OSCompareAndSwap(dataQueue->tail, entrySize, &dataQueue->tail); + newTail = entrySize; } else { @@ -225,7 +230,7 @@ Boolean IODataQueue::enqueue(void * data, UInt32 dataSize) entry->size = dataSize; memcpy(&entry->data, data, dataSize); - OSAddAtomic(entrySize, (SInt32 *)&dataQueue->tail); + newTail = tail + entrySize; } else { @@ -233,10 +238,13 @@ Boolean IODataQueue::enqueue(void * data, UInt32 dataSize) } } + // Store tail with a release memory barrier + __c11_atomic_store((_Atomic UInt32 *)&dataQueue->tail, newTail, __ATOMIC_RELEASE); + // Send notification (via mach message) that data is available. - if ( ( head == tail ) /* queue was empty prior to enqueue() */ - || ( dataQueue->head == tail ) ) /* queue was emptied during enqueue() */ + if ( ( head == tail ) /* queue was empty prior to enqueue() */ + || ( tail == __c11_atomic_load((_Atomic UInt32 *)&dataQueue->head, __ATOMIC_RELAXED) ) ) /* queue was emptied during enqueue() */ { sendDataAvailableNotification(); } diff --git a/iokit/Kernel/IODeviceTreeSupport.cpp b/iokit/Kernel/IODeviceTreeSupport.cpp index ce4a65ef7..c66265be1 100644 --- a/iokit/Kernel/IODeviceTreeSupport.cpp +++ b/iokit/Kernel/IODeviceTreeSupport.cpp @@ -430,14 +430,17 @@ static IORegistryEntry * FindPHandle( UInt32 phandle ) static bool GetUInt32( IORegistryEntry * regEntry, const OSSymbol * name, UInt32 * value ) { - OSData *data; + OSObject * obj; + OSData * data; + bool result; - if( (data = OSDynamicCast( OSData, regEntry->getProperty( name ))) - && (4 == data->getLength())) { - *value = *((UInt32 *) data->getBytesNoCopy()); - return( true ); - } else - return( false ); + if (!(obj = regEntry->copyProperty(name))) return (false); + + result = ((data = OSDynamicCast(OSData, obj)) && (sizeof(UInt32) == data->getLength())); + if (result) *value = *((UInt32 *) data->getBytesNoCopy()); + + obj->release(); + 
return(result); } static IORegistryEntry * IODTFindInterruptParent( IORegistryEntry * regEntry, IOItemCount index ) @@ -771,9 +774,10 @@ bool IODTMapInterrupts( IORegistryEntry * regEntry ) /* */ -static const char * +static bool CompareKey( OSString * key, - const IORegistryEntry * table, const OSSymbol * propName ) + const IORegistryEntry * table, const OSSymbol * propName, + OSString ** matchingName ) { OSObject *prop; OSData *data; @@ -787,8 +791,7 @@ CompareKey( OSString * key, bool matched; const char *result = 0; - if( 0 == (prop = table->getProperty( propName ))) - return( 0 ); + if( 0 == (prop = table->copyProperty( propName ))) return( 0 ); if( (data = OSDynamicCast( OSData, prop ))) { names = (const char *) data->getBytesNoCopy(); @@ -796,47 +799,48 @@ CompareKey( OSString * key, } else if( (string = OSDynamicCast( OSString, prop ))) { names = string->getCStringNoCopy(); lastName = names + string->getLength() + 1; - } else - return( 0 ); + } else names = 0; - ckey = key->getCStringNoCopy(); - keyLen = key->getLength(); - wild = ('*' == key->getChar( keyLen - 1 )); + if (names) { + ckey = key->getCStringNoCopy(); + keyLen = key->getLength(); + wild = ('*' == key->getChar( keyLen - 1 )); - do { - // for each name in the property - nlen = strnlen(names, lastName - names); - if( wild) - matched = ((nlen >= (keyLen - 1)) && (0 == strncmp(ckey, names, keyLen - 1))); - else - matched = (keyLen == nlen) && (0 == strncmp(ckey, names, keyLen)); + do { + // for each name in the property + nlen = strnlen(names, lastName - names); + if( wild) + matched = ((nlen >= (keyLen - 1)) && (0 == strncmp(ckey, names, keyLen - 1))); + else + matched = (keyLen == nlen) && (0 == strncmp(ckey, names, keyLen)); - if( matched) - result = names; + if( matched) + result = names; - names = names + nlen + 1; + names = names + nlen + 1; - } while( (names < lastName) && (false == matched)); + } while( (names < lastName) && (false == matched)); + } + + if (result && matchingName) 
*matchingName = OSString::withCString( result ); + + if (prop) prop->release(); - return( result); + return (result != 0); } bool IODTCompareNubName( const IORegistryEntry * regEntry, OSString * name, OSString ** matchingName ) { - const char *result; - bool matched; - - matched = (0 != (result = CompareKey( name, regEntry, gIODTNameKey))) - || (0 != (result = CompareKey( name, regEntry, gIODTCompatibleKey))) - || (0 != (result = CompareKey( name, regEntry, gIODTTypeKey))) - || (0 != (result = CompareKey( name, regEntry, gIODTModelKey))); + bool matched; - if( result && matchingName) - *matchingName = OSString::withCString( result ); + matched = CompareKey( name, regEntry, gIODTNameKey, matchingName) + || CompareKey( name, regEntry, gIODTCompatibleKey, matchingName) + || CompareKey( name, regEntry, gIODTTypeKey, matchingName) + || CompareKey( name, regEntry, gIODTModelKey, matchingName); - return( result != 0 ); + return (matched); } bool IODTMatchNubWithKeys( IORegistryEntry * regEntry, diff --git a/iokit/Kernel/IOHibernateIO.cpp b/iokit/Kernel/IOHibernateIO.cpp index 168a6a039..56a77f89d 100644 --- a/iokit/Kernel/IOHibernateIO.cpp +++ b/iokit/Kernel/IOHibernateIO.cpp @@ -1,8 +1,8 @@ /* - * Copyright (c) 2004-2008 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2004-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,11 +22,10 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ - /* Sleep: @@ -167,12 +166,14 @@ to restrict I/O ops. #include #include "IOHibernateInternal.h" #include +#include #include "IOKitKernelInternal.h" #include #include #include #include +#include extern "C" addr64_t kvtophys(vm_offset_t va); extern "C" ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va); @@ -180,9 +181,8 @@ extern "C" ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va); /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ #define DISABLE_TRIM 0 -#define TRIM_DELAY 5000 +#define TRIM_DELAY 25000 -extern boolean_t root_is_CF_drive; extern unsigned int save_kdebug_enable; extern uint32_t gIOHibernateState; uint32_t gIOHibernateMode; @@ -191,6 +191,7 @@ static char gIOHibernateFilename[MAXPATHLEN+1]; static uint32_t gIOHibernateFreeRatio = 0; // free page target (percent) uint32_t gIOHibernateFreeTime = 0*1000; // max time to spend freeing pages (ms) static uint64_t gIOHibernateCompression = 0x80; // default compression 50% +boolean_t gIOHibernateStandbyDisabled; static IODTNVRAM * gIOOptionsEntry; static IORegistryEntry * gIOChosenEntry; @@ -390,7 +391,7 @@ IOHibernateSystemSleep(void) OSObject * obj; OSString * str; OSNumber * num; - bool dsSSD, vmflush; + bool dsSSD, vmflush, swapPinned; IOHibernateVars * vars; uint64_t setFileSize = 0; @@ -437,10 +438,11 @@ IOHibernateSystemSleep(void) gFSState = kFSOpening; IOLockUnlock(gFSLock); + swapPinned = false; do { vars->srcBuffer = IOBufferMemoryDescriptor::withOptions(kIODirectionOutIn, - 2 * page_size + WKdm_SCRATCH_BUF_SIZE, 
page_size); + 2 * page_size + WKdm_SCRATCH_BUF_SIZE_INTERNAL, page_size); vars->handoffBuffer = IOBufferMemoryDescriptor::withOptions(kIODirectionOutIn, ptoa_64(gIOHibernateHandoffPageCount), page_size); @@ -471,12 +473,15 @@ IOHibernateSystemSleep(void) gIOHibernateCurrentHeader->debugFlags = gIOHibernateDebugFlags; gIOHibernateCurrentHeader->signature = kIOHibernateHeaderInvalidSignature; - vmflush = ((kOSBooleanTrue == IOService::getPMRootDomain()->getProperty(kIOPMDeepSleepEnabledKey)) && root_is_CF_drive == FALSE); + vmflush = ((kOSBooleanTrue == IOService::getPMRootDomain()->getProperty(kIOPMDeepSleepEnabledKey))); err = hibernate_alloc_page_lists(&vars->page_list, &vars->page_list_wired, &vars->page_list_pal); - if (KERN_SUCCESS != err) - break; + if (KERN_SUCCESS != err) break; + + err = hibernate_pin_swap(TRUE); + if (KERN_SUCCESS != err) break; + swapPinned = true; if (vars->fileMinSize || (kIOHibernateModeFileResize & gIOHibernateMode)) { @@ -551,9 +556,19 @@ IOHibernateSystemSleep(void) clock_get_uptime(&endTime); SUB_ABSOLUTETIME(&endTime, &startTime); absolutetime_to_nanoseconds(endTime, &nsec); - HIBLOG("hibernate_setup(%d) took %qd ms\n", err, nsec / 1000000ULL); + + boolean_t haveSwapPin, hibFileSSD; + haveSwapPin = vm_swap_files_pinned(); + + hibFileSSD = (kIOPolledFileSSD & vars->fileVars->flags); + + HIBLOG("hibernate_setup(%d) took %qd ms, swapPin(%d) ssd(%d)\n", + err, nsec / 1000000ULL, + haveSwapPin, hibFileSSD); if (KERN_SUCCESS != err) break; + gIOHibernateStandbyDisabled = ((!haveSwapPin || !hibFileSSD)); + dsSSD = ((0 != (kIOPolledFileSSD & vars->fileVars->flags)) && (kOSBooleanTrue == IOService::getPMRootDomain()->getProperty(kIOPMDeepSleepEnabledKey))); @@ -682,6 +697,8 @@ IOHibernateSystemSleep(void) } while (false); + if (swapPinned) hibernate_pin_swap(FALSE); + IOLockLock(gFSLock); if ((kIOReturnSuccess == err) && (kFSOpening != gFSState)) { @@ -757,7 +774,7 @@ IOWriteExtentsToFile(IOPolledFileIOVars * vars, uint32_t signature) int 
rc; IOPolledFileExtent * fileExtents; - fileExtents = (typeof(fileExtents)) vars->fileExtents->getBytesNoCopy(), + fileExtents = (typeof(fileExtents)) vars->fileExtents->getBytesNoCopy(); memset(&hdr, 0, sizeof(IOHibernateImageHeader)); count = vars->fileExtents->getLength(); @@ -798,6 +815,8 @@ IOWriteExtentsToFile(IOPolledFileIOVars * vars, uint32_t signature) return err; } +extern "C" boolean_t root_is_CF_drive; + void IOOpenDebugDataFile(const char *fname, uint64_t size) { @@ -809,6 +828,8 @@ IOOpenDebugDataFile(const char *fname, uint64_t size) gDebugImageLock = IOLockAlloc(); } + if (root_is_CF_drive) return; + // Try to get a lock, but don't block for getting lock if (!IOLockTryLock(gDebugImageLock)) { HIBLOG("IOOpenDebugDataFile: Failed to get lock\n"); @@ -848,6 +869,23 @@ IOOpenDebugDataFile(const char *fname, uint64_t size) return; } +void +IOCloseDebugDataFile() +{ + IOSetBootImageNVRAM(0); + + if (gDebugImageLock) { + IOLockLock(gDebugImageLock); + if (gDebugImageFileVars != 0) { + kprintf("IOHibernateSystemPostWake: Closing debugdata file\n"); + IOPolledFileClose(&gDebugImageFileVars, 0, 0, 0, 0, 0); + } + IOLockUnlock(gDebugImageLock); + } + + +} + /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ DECLARE_IOHIBERNATEPROGRESSALPHA @@ -1234,6 +1272,7 @@ IOReturn IOHibernateSystemPostWake(void) { gIOHibernateCurrentHeader->signature = kIOHibernateHeaderInvalidSignature; + IOLockLock(gFSLock); if (kFSOpened == gFSState) { // invalidate & close the image file @@ -1249,17 +1288,10 @@ IOHibernateSystemPostWake(void) } gFSState = kFSIdle; - IOSetBootImageNVRAM(0); - - if (gDebugImageLock) { - IOLockLock(gDebugImageLock); - if (gDebugImageFileVars != 0) { - kprintf("IOHibernateSystemPostWake: Closing debugdata file\n"); - IOPolledFileClose(&gDebugImageFileVars, 0, 0, 0, 0, 0); - } - IOLockUnlock(gDebugImageLock); - } + IOLockUnlock(gFSLock); + // IOCloseDebugDataFile() calls IOSetBootImageNVRAM() unconditionally + 
IOCloseDebugDataFile( ); return (kIOReturnSuccess); } @@ -1299,20 +1331,20 @@ SYSCTL_UINT(_kern, OID_AUTO, hibernatemode, &gIOHibernateMode, 0, ""); SYSCTL_STRUCT(_kern, OID_AUTO, hibernatestatistics, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED, - gIOHibernateStats, hibernate_statistics_t, ""); + &_hibernateStats, hibernate_statistics_t, ""); SYSCTL_UINT(_kern, OID_AUTO, hibernategraphicsready, CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_ANYBODY, - &gIOHibernateStats->graphicsReadyTime, 0, ""); + &_hibernateStats.graphicsReadyTime, 0, ""); SYSCTL_UINT(_kern, OID_AUTO, hibernatewakenotification, CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_ANYBODY, - &gIOHibernateStats->wakeNotificationTime, 0, ""); + &_hibernateStats.wakeNotificationTime, 0, ""); SYSCTL_UINT(_kern, OID_AUTO, hibernatelockscreenready, CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_ANYBODY, - &gIOHibernateStats->lockScreenReadyTime, 0, ""); + &_hibernateStats.lockScreenReadyTime, 0, ""); SYSCTL_UINT(_kern, OID_AUTO, hibernatehidready, CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_ANYBODY, - &gIOHibernateStats->hidReadyTime, 0, ""); + &_hibernateStats.hidReadyTime, 0, ""); void @@ -1353,14 +1385,6 @@ IOHibernateSystemInit(IOPMrootDomain * rootDomain) gFSLock = IOLockAlloc(); } - -/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ - -static void -hibernate_setup_for_wake(void) -{ -} - /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ static IOReturn @@ -1385,7 +1409,7 @@ hibernate_write_image(void) IOHibernateVars * vars = &gIOHibernateVars; IOPolledFileExtent * fileExtents; - assert_static(sizeof(IOHibernateImageHeader) == 512); + _static_assert_1_arg(sizeof(IOHibernateImageHeader) == 512); uint32_t pageCount, pagesDone; IOReturn err; @@ -1433,8 +1457,9 @@ hibernate_write_image(void) svPageCount = 0; zvPageCount = 0; - if (!vars->fileVars || !vars->fileVars->pollers) - return (false /* 
sleep */ ); + if (!vars->fileVars + || !vars->fileVars->pollers + || !(kIOHibernateModeOn & gIOHibernateMode)) return (kIOHibernatePostWriteSleep); if (kIOHibernateModeSleep & gIOHibernateMode) kdebug_enable = save_kdebug_enable; @@ -1476,8 +1501,6 @@ hibernate_write_image(void) } #endif /* CRYPTO */ - hibernate_setup_for_wake(); - hibernate_page_list_setall(vars->page_list, vars->page_list_wired, vars->page_list_pal, @@ -1731,6 +1754,9 @@ hibernate_write_image(void) kUnwiredEncrypt = kEncrypt }; + bool cpuAES = (0 != (CPUID_FEATURE_AES & cpuid_features())); +#define _pmap_is_noencrypt(x) (cpuAES ? false : pmap_is_noencrypt((x))) + for (pageType = kWiredEncrypt; pageType >= kUnwiredEncrypt; pageType--) { if (kUnwiredEncrypt == pageType) @@ -1754,13 +1780,13 @@ hibernate_write_image(void) &ppnum); // kprintf("[%d](%x : %x)\n", pageType, ppnum, count); iterDone = !count; - + if (count && (kWired & pageType) && needEncrypt) { uint32_t checkIndex; for (checkIndex = 0; (checkIndex < count) - && (((kEncrypt & pageType) == 0) == pmap_is_noencrypt(ppnum + checkIndex)); + && (((kEncrypt & pageType) == 0) == _pmap_is_noencrypt(ppnum + checkIndex)); checkIndex++) {} if (!checkIndex) @@ -2068,7 +2094,7 @@ hibernate_machine_init(void) #define t40ms(x) (tmrCvt((((uint64_t)(x)) << 8), tscFCvtt2n) / 1000000) #define tStat(x, y) gIOHibernateStats->x = t40ms(gIOHibernateCurrentHeader->y); tStat(booterStart, booterStart); - gIOHibernateStats->smcStart = gIOHibernateCurrentHeader->smcStart, + gIOHibernateStats->smcStart = gIOHibernateCurrentHeader->smcStart; tStat(booterDuration0, booterTime0); tStat(booterDuration1, booterTime1); tStat(booterDuration2, booterTime2); @@ -2124,7 +2150,10 @@ hibernate_machine_init(void) break; case kIOHibernateHandoffTypeGraphicsInfo: - bcopy(data, gIOHibernateGraphicsInfo, sizeof(*gIOHibernateGraphicsInfo)); + if (handoff->bytecount == sizeof(*gIOHibernateGraphicsInfo)) + { + bcopy(data, gIOHibernateGraphicsInfo, sizeof(*gIOHibernateGraphicsInfo)); + } 
break; case kIOHibernateHandoffTypeCryptVars: @@ -2169,7 +2198,7 @@ hibernate_machine_init(void) if (cryptvars && !foundCryptData) panic("hibernate handoff"); - HIBPRINT("video %x %d %d %d status %x\n", + HIBPRINT("video 0x%llx %d %d %d status %x\n", gIOHibernateGraphicsInfo->physicalAddress, gIOHibernateGraphicsInfo->depth, gIOHibernateGraphicsInfo->width, gIOHibernateGraphicsInfo->height, gIOHibernateGraphicsInfo->gfxStatus); diff --git a/iokit/Kernel/IOHibernateRestoreKernel.c b/iokit/Kernel/IOHibernateRestoreKernel.c index 141a280a5..7c2d3931f 100644 --- a/iokit/Kernel/IOHibernateRestoreKernel.c +++ b/iokit/Kernel/IOHibernateRestoreKernel.c @@ -396,7 +396,7 @@ store_one_page(uint32_t procFlags, uint32_t * src, uint32_t compressedSize, uint32_t * buffer, uint32_t ppnum) { uint64_t dst = ptoa_64(ppnum); - uint8_t scratch[WKdm_SCRATCH_BUF_SIZE] __attribute__ ((aligned (16))); + uint8_t scratch[WKdm_SCRATCH_BUF_SIZE_INTERNAL] __attribute__ ((aligned (16))); if (compressedSize != PAGE_SIZE) { @@ -456,7 +456,7 @@ hibernate_kernel_entrypoint(uint32_t p1, uint64_t timeStart; timeStart = rdtsc64(); - assert_static(sizeof(IOHibernateImageHeader) == 512); + static_assert(sizeof(IOHibernateImageHeader) == 512); headerPhys = ptoa_64(p1); diff --git a/iokit/Kernel/IOHistogramReporter.cpp b/iokit/Kernel/IOHistogramReporter.cpp index 21e92e935..929a830f2 100644 --- a/iokit/Kernel/IOHistogramReporter.cpp +++ b/iokit/Kernel/IOHistogramReporter.cpp @@ -62,6 +62,8 @@ IOHistogramReporter::with(IOService *reportingService, return reporter; } } + OSSafeReleaseNULL(reporter); + OSSafeReleaseNULL(tmpChannelName); return 0; } @@ -208,7 +210,8 @@ IOHistogramReporter::initWith(IOService *reportingService, if (cnt3 >= _nElements) { IORLOG("ERROR: _bucketBounds init"); - return false; + result = false; + goto finish; } if (_histogramSegmentsConfig[cnt].scale_flag) { @@ -245,21 +248,6 @@ IOHistogramReporter::initWith(IOService *reportingService, result = true; finish: - if (result != true) { 
- - if (_histogramSegmentsConfig) - IOFree(_histogramSegmentsConfig, configSize); - - if (_elements) - IOFree(_elements, elementsSize); - - if (_enableCounts) - IOFree(_enableCounts, eCountsSize); - - if (_bucketBounds) - IOFree(_bucketBounds, boundsSize); - } - return result; } @@ -314,6 +302,33 @@ IOHistogramReporter::handleCreateLegend(void) return legendEntry; } +IOReturn +IOHistogramReporter::overrideBucketValues(unsigned int index, + uint64_t bucket_hits, + int64_t bucket_min, + int64_t bucket_max, + int64_t bucket_sum) +{ + IOReturn result; + IOHistogramReportValues bucket; + lockReporter(); + + if (index >= (unsigned int)_bucketCount) { + result = kIOReturnBadArgument; + goto finish; + } + + bucket.bucket_hits = bucket_hits; + bucket.bucket_min = bucket_min; + bucket.bucket_max = bucket_max; + bucket.bucket_sum = bucket_sum; + + result = setElementValues(index, (IOReportElementValues *)&bucket); +finish: + unlockReporter(); + return result; +} + int IOHistogramReporter::tallyValue(int64_t value) { diff --git a/iokit/Kernel/IOInterruptController.cpp b/iokit/Kernel/IOInterruptController.cpp index 95287f584..fb0aa23b0 100644 --- a/iokit/Kernel/IOInterruptController.cpp +++ b/iokit/Kernel/IOInterruptController.cpp @@ -519,6 +519,7 @@ IOReturn IOSharedInterruptController::registerInterrupt(IOService *nub, // Create the vectorData for the IOInterruptSource. vectorData = OSData::withBytes(&vectorNumber, sizeof(vectorNumber)); if (vectorData == 0) { + IOLockUnlock(vector->interruptLock); return kIOReturnNoMemory; } diff --git a/iokit/Kernel/IOKitDebug.cpp b/iokit/Kernel/IOKitDebug.cpp index d99e9399e..22b315da4 100644 --- a/iokit/Kernel/IOKitDebug.cpp +++ b/iokit/Kernel/IOKitDebug.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2010 Apple Inc. All rights reserved. + * Copyright (c) 1998-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -30,11 +30,14 @@ #include extern "C" { #include +#include +#include } #include #include #include +#include #include #include @@ -42,6 +45,8 @@ extern "C" { #include #include +#include "IOKitKernelInternal.h" + #ifdef IOKITDEBUG #define DEBUG_INIT_VALUE IOKITDEBUG #else @@ -223,13 +228,15 @@ struct IOTrackingQueue { queue_chain_t link; IOTRecursiveLock lock; - queue_head_t sites; const char * name; + uintptr_t btEntry; size_t allocSize; size_t minCaptureSize; uint32_t siteCount; + uint32_t type; + uint32_t numSiteQs; uint8_t captureOn; - uint8_t isAlloc; + queue_head_t sites[]; }; struct IOTrackingCallSite @@ -237,26 +244,24 @@ struct IOTrackingCallSite queue_chain_t link; IOTrackingQueue * queue; uint32_t crc; - IOTrackingCallSiteInfo info; - queue_chain_t instances; + + uint32_t count; + size_t size[2]; + uintptr_t bt[kIOTrackingCallSiteBTs]; + + queue_head_t instances; IOTracking * addresses; }; struct IOTrackingLeaksRef { uintptr_t * instances; + uint32_t zoneSize; uint32_t count; uint32_t found; size_t bytes; }; -enum -{ - kInstanceFlagAddress = 0x01UL, - kInstanceFlagReferenced = 0x02UL, - kInstanceFlags = 0x03UL -}; - lck_mtx_t * gIOTrackingLock; queue_head_t gIOTrackingQ; @@ -310,20 +315,29 @@ IOTrackingInit(void) /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ IOTrackingQueue * -IOTrackingQueueAlloc(const char * name, size_t allocSize, size_t minCaptureSize, bool isAlloc) +IOTrackingQueueAlloc(const char * name, uintptr_t btEntry, + size_t allocSize, size_t minCaptureSize, + uint32_t type, uint32_t numSiteQs) { IOTrackingQueue * queue; - queue = (typeof(queue)) kalloc(sizeof(IOTrackingQueue)); + uint32_t idx; + + if (!numSiteQs) numSiteQs = 1; + queue = (typeof(queue)) kalloc(sizeof(IOTrackingQueue) + numSiteQs * sizeof(queue->sites[0])); bzero(queue, sizeof(IOTrackingQueue)); queue->name = name; + queue->btEntry = btEntry; queue->allocSize = allocSize; 
queue->minCaptureSize = minCaptureSize; queue->lock.mutex = lck_mtx_alloc_init(IOLockGroup, LCK_ATTR_NULL); - queue_init(&queue->sites); + queue->numSiteQs = numSiteQs; + queue->type = type; + enum { kFlags = (kIOTracking | kIOTrackingBoot) }; + queue->captureOn = (kFlags == (kFlags & gIOKitDebug)) + || (kIOTrackingQueueTypeDefaultOn & type); - queue->captureOn = (0 != (kIOTrackingBoot & gIOKitDebug)); - queue->isAlloc = isAlloc; + for (idx = 0; idx < numSiteQs; idx++) queue_init(&queue->sites[idx]); lck_mtx_lock(gIOTrackingLock); queue_enter(&gIOTrackingQ, queue, IOTrackingQueue *, link); @@ -344,7 +358,7 @@ IOTrackingQueueFree(IOTrackingQueue * queue) lck_mtx_free(queue->lock.mutex, IOLockGroup); - kfree(queue, sizeof(IOTrackingQueue)); + kfree(queue, sizeof(IOTrackingQueue) + queue->numSiteQs * sizeof(queue->sites[0])); }; /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ @@ -404,11 +418,17 @@ fasthash64(const void *buf, size_t len, uint64_t seed) switch (len & 7) { case 7: v ^= (uint64_t)pos2[6] << 48; + [[clang::fallthrough]]; case 6: v ^= (uint64_t)pos2[5] << 40; + [[clang::fallthrough]]; case 5: v ^= (uint64_t)pos2[4] << 32; + [[clang::fallthrough]]; case 4: v ^= (uint64_t)pos2[3] << 24; + [[clang::fallthrough]]; case 3: v ^= (uint64_t)pos2[2] << 16; + [[clang::fallthrough]]; case 2: v ^= (uint64_t)pos2[1] << 8; + [[clang::fallthrough]]; case 1: v ^= (uint64_t)pos2[0]; h ^= mix(v); h *= m; @@ -431,12 +451,61 @@ fasthash32(const void *buf, size_t len, uint32_t seed) /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ +void +IOTrackingAddUser(IOTrackingQueue * queue, IOTrackingUser * mem, vm_size_t size) +{ + uint32_t num; + proc_t self; + + if (!queue->captureOn) return; + if (size < queue->minCaptureSize) return; + + assert(!mem->link.next); + + num = backtrace(&mem->bt[0], kIOTrackingCallSiteBTs); + num = 0; + if ((kernel_task != current_task()) && (self = proc_self())) + { + bool 
user_64; + mem->btPID = proc_pid(self); + (void)backtrace_user(&mem->btUser[0], kIOTrackingCallSiteBTs - 1, &num, + &user_64); + mem->user32 = !user_64; + proc_rele(self); + } + assert(num <= kIOTrackingCallSiteBTs); + mem->userCount = num; + + IOTRecursiveLockLock(&queue->lock); + queue_enter/*last*/(&queue->sites[0], mem, IOTrackingUser *, link); + queue->siteCount++; + IOTRecursiveLockUnlock(&queue->lock); +} + +void +IOTrackingRemoveUser(IOTrackingQueue * queue, IOTrackingUser * mem) +{ + if (!mem->link.next) return; + + IOTRecursiveLockLock(&queue->lock); + if (mem->link.next) + { + remque(&mem->link); + assert(queue->siteCount); + queue->siteCount--; + } + IOTRecursiveLockUnlock(&queue->lock); +} + +uint64_t gIOTrackingAddTime; + void IOTrackingAdd(IOTrackingQueue * queue, IOTracking * mem, size_t size, bool address) { IOTrackingCallSite * site; uint32_t crc, num; uintptr_t bt[kIOTrackingCallSiteBTs + 1]; + queue_head_t * que; if (mem->site) return; if (!queue->captureOn) return; @@ -444,17 +513,19 @@ IOTrackingAdd(IOTrackingQueue * queue, IOTracking * mem, size_t size, bool addre assert(!mem->link.next); - num = fastbacktrace(&bt[0], kIOTrackingCallSiteBTs + 1); + num = backtrace(&bt[0], kIOTrackingCallSiteBTs + 1); + if (!num) return; num--; crc = fasthash32(&bt[1], num * sizeof(bt[0]), 0x04C11DB7); IOTRecursiveLockLock(&queue->lock); - queue_iterate(&queue->sites, site, IOTrackingCallSite *, link) + que = &queue->sites[crc % queue->numSiteQs]; + queue_iterate(que, site, IOTrackingCallSite *, link) { if (crc == site->crc) break; } - if (queue_end(&queue->sites, (queue_entry_t) site)) + if (queue_end(que, (queue_entry_t) site)) { site = (typeof(site)) kalloc(sizeof(IOTrackingCallSite)); @@ -462,26 +533,26 @@ IOTrackingAdd(IOTrackingQueue * queue, IOTracking * mem, size_t size, bool addre site->addresses = (IOTracking *) &site->instances; site->queue = queue; site->crc = crc; - site->info.count = 0; - memset(&site->info.size[0], 0, sizeof(site->info.size)); - 
bcopy(&bt[1], &site->info.bt[0], num * sizeof(site->info.bt[0])); + site->count = 0; + memset(&site->size[0], 0, sizeof(site->size)); + bcopy(&bt[1], &site->bt[0], num * sizeof(site->bt[0])); assert(num <= kIOTrackingCallSiteBTs); - bzero(&site->info.bt[num], (kIOTrackingCallSiteBTs - num) * sizeof(site->info.bt[0])); + bzero(&site->bt[num], (kIOTrackingCallSiteBTs - num) * sizeof(site->bt[0])); - queue_enter_first(&queue->sites, site, IOTrackingCallSite *, link); + queue_enter_first(que, site, IOTrackingCallSite *, link); queue->siteCount++; } if (address) { - queue_enter/*last*/(&site->instances, mem, IOTrackingCallSite *, link); + queue_enter/*last*/(&site->instances, mem, IOTracking *, link); if (queue_end(&site->instances, (queue_entry_t)site->addresses)) site->addresses = mem; } - else queue_enter_first(&site->instances, mem, IOTrackingCallSite *, link); + else queue_enter_first(&site->instances, mem, IOTracking *, link); - mem->site = site; - site->info.size[0] += size; - site->info.count++; + mem->site = site; + site->size[0] += size; + site->count++; IOTRecursiveLockUnlock(&queue->lock); } @@ -494,26 +565,28 @@ IOTrackingRemove(IOTrackingQueue * queue, IOTracking * mem, size_t size) if (!mem->link.next) return; IOTRecursiveLockLock(&queue->lock); - - assert(mem->site); - - if (mem == mem->site->addresses) mem->site->addresses = (IOTracking *) queue_next(&mem->link); - remque(&mem->link); - - assert(mem->site->info.count); - mem->site->info.count--; - assert(mem->site->info.size[0] >= size); - mem->site->info.size[0] -= size; - if (!mem->site->info.count) + if (mem->link.next) { - assert(queue_empty(&mem->site->instances)); - assert(!mem->site->info.size[0]); - assert(!mem->site->info.size[1]); + assert(mem->site); - remque(&mem->site->link); - assert(queue->siteCount); - queue->siteCount--; - kfree(mem->site, sizeof(IOTrackingCallSite)); + if (mem == mem->site->addresses) mem->site->addresses = (IOTracking *) queue_next(&mem->link); + remque(&mem->link); + 
+ assert(mem->site->count); + mem->site->count--; + assert(mem->site->size[0] >= size); + mem->site->size[0] -= size; + if (!mem->site->count) + { + assert(queue_empty(&mem->site->instances)); + assert(!mem->site->size[0]); + assert(!mem->site->size[1]); + + remque(&mem->site->link); + assert(queue->siteCount); + queue->siteCount--; + kfree(mem->site, sizeof(IOTrackingCallSite)); + } } IOTRecursiveLockUnlock(&queue->lock); } @@ -545,26 +618,30 @@ IOTrackingFree(IOTrackingQueue * queue, uintptr_t address, size_t size) { IOTrackingCallSite * site; IOTrackingAddress * tracking; + uint32_t idx; bool done; address = ~address; IOTRecursiveLockLock(&queue->lock); done = false; - queue_iterate(&queue->sites, site, IOTrackingCallSite *, link) + for (idx = 0; idx < queue->numSiteQs; idx++) { - for (tracking = (IOTrackingAddress *) site->addresses; - !done && !queue_end(&site->instances, (queue_entry_t) tracking); - tracking = (IOTrackingAddress *) queue_next(&tracking->tracking.link)) + queue_iterate(&queue->sites[idx], site, IOTrackingCallSite *, link) { - if ((done = (address == tracking->address))) + for (tracking = (IOTrackingAddress *) site->addresses; + !done && !queue_end(&site->instances, &tracking->tracking.link); + tracking = (IOTrackingAddress *) queue_next(&tracking->tracking.link)) { - IOTrackingRemove(queue, &tracking->tracking, size); - kfree(tracking, sizeof(IOTrackingAddress)); + if ((done = (address == tracking->address))) + { + IOTrackingRemove(queue, &tracking->tracking, size); + kfree(tracking, sizeof(IOTrackingAddress)); + } } + if (done) break; } if (done) break; } - IOTRecursiveLockUnlock(&queue->lock); } @@ -577,8 +654,8 @@ IOTrackingAccumSize(IOTrackingQueue * queue, IOTracking * mem, size_t size) if (mem->link.next) { assert(mem->site); - assert((size > 0) || (mem->site->info.size[1] >= -size)); - mem->site->info.size[1] += size; + assert((size > 0) || (mem->site->size[1] >= -size)); + mem->site->size[1] += size; }; 
IOTRecursiveLockUnlock(&queue->lock); } @@ -589,30 +666,42 @@ void IOTrackingReset(IOTrackingQueue * queue) { IOTrackingCallSite * site; + IOTrackingUser * user; IOTracking * tracking; IOTrackingAddress * trackingAddress; + uint32_t idx; bool addresses; IOTRecursiveLockLock(&queue->lock); - while (!queue_empty(&queue->sites)) + for (idx = 0; idx < queue->numSiteQs; idx++) { - queue_remove_first(&queue->sites, site, IOTrackingCallSite *, link); - addresses = false; - while (!queue_empty(&site->instances)) + while (!queue_empty(&queue->sites[idx])) { - queue_remove_first(&site->instances, tracking, IOTracking *, link); - tracking->link.next = 0; - if (tracking == site->addresses) addresses = true; - if (addresses) + if (kIOTrackingQueueTypeMap & queue->type) + { + queue_remove_first(&queue->sites[idx], user, IOTrackingUser *, link); + user->link.next = user->link.prev = NULL; + } + else { - trackingAddress = (typeof(trackingAddress)) tracking; - if (kTrackingAddressFlagAllocated & IOTrackingAddressFlags(trackingAddress)) + queue_remove_first(&queue->sites[idx], site, IOTrackingCallSite *, link); + addresses = false; + while (!queue_empty(&site->instances)) { - kfree(tracking, sizeof(IOTrackingAddress)); - } - } + queue_remove_first(&site->instances, tracking, IOTracking *, link); + if (tracking == site->addresses) addresses = true; + if (addresses) + { + trackingAddress = (typeof(trackingAddress)) tracking; + if (kTrackingAddressFlagAllocated & IOTrackingAddressFlags(trackingAddress)) + { + kfree(tracking, sizeof(IOTrackingAddress)); + } + } + } + kfree(site, sizeof(IOTrackingCallSite)); + } } - kfree(site, sizeof(IOTrackingCallSite)); } queue->siteCount = 0; IOTRecursiveLockUnlock(&queue->lock); @@ -642,7 +731,7 @@ IOTrackingAddressCompare(const void * left, const void * right) uintptr_t inst, laddr, raddr; inst = ((typeof(inst) *) left)[0]; - instance = (typeof(instance)) (inst & ~kInstanceFlags); + instance = (typeof(instance)) INSTANCE_GET(inst); if 
(kInstanceFlagAddress & inst) laddr = ~((IOTrackingAddress *)instance)->address; else laddr = (uintptr_t) (instance + 1); @@ -654,6 +743,42 @@ IOTrackingAddressCompare(const void * left, const void * right) return ((laddr > raddr) ? 1 : ((laddr == raddr) ? 0 : -1)); } + +static int +IOTrackingZoneElementCompare(const void * left, const void * right) +{ + uintptr_t inst, laddr, raddr; + + inst = ((typeof(inst) *) left)[0]; + laddr = INSTANCE_PUT(inst); + inst = ((typeof(inst) *) right)[0]; + raddr = INSTANCE_PUT(inst); + + return ((laddr > raddr) ? 1 : ((laddr == raddr) ? 0 : -1)); +} + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +static void +CopyOutKernelBacktrace(IOTrackingCallSite * site, IOTrackingCallSiteInfo * siteInfo) +{ + uint32_t j; + mach_vm_address_t bt, btEntry; + + btEntry = site->queue->btEntry; + for (j = 0; j < kIOTrackingCallSiteBTs; j++) + { + bt = site->bt[j]; + if (btEntry + && (!bt || (j == (kIOTrackingCallSiteBTs - 1)))) + { + bt = btEntry; + btEntry = 0; + } + siteInfo->bt[0][j] = VM_KERNEL_UNSLIDE(bt); + } +} + /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ static void @@ -664,39 +789,51 @@ IOTrackingLeakScan(void * refcon) IOTracking * instance; uint64_t vaddr, vincr; ppnum_t ppn; - uintptr_t ptr, addr, inst; + uintptr_t ptr, addr, vphysaddr, inst; size_t size; uint32_t baseIdx, lim, ptrIdx, count; boolean_t is; - -// if (cpu_number()) return; + AbsoluteTime deadline; instances = ref->instances; count = ref->count; + size = ref->zoneSize; - for (vaddr = VM_MIN_KERNEL_AND_KEXT_ADDRESS; - vaddr < VM_MAX_KERNEL_ADDRESS; - ml_set_interrupts_enabled(is), vaddr += vincr) + for (deadline = 0, vaddr = VM_MIN_KERNEL_AND_KEXT_ADDRESS; + ; + vaddr += vincr) { -#if !defined(__LP64__) - thread_block(NULL); -#endif - is = ml_set_interrupts_enabled(false); + if ((mach_absolute_time() > deadline) || (vaddr >= VM_MAX_KERNEL_ADDRESS)) + { + if (deadline) + { + 
ml_set_interrupts_enabled(is); + IODelay(10); + } + if (vaddr >= VM_MAX_KERNEL_ADDRESS) break; + is = ml_set_interrupts_enabled(false); + clock_interval_to_deadline(10, kMillisecondScale, &deadline); + } - ppn = kernel_pmap_present_mapping(vaddr, &vincr); + ppn = kernel_pmap_present_mapping(vaddr, &vincr, &vphysaddr); // check noencrypt to avoid VM structs (map entries) with pointers - if (ppn && (!pmap_valid_page(ppn) || pmap_is_noencrypt(ppn))) ppn = 0; + if (ppn && (!pmap_valid_page(ppn) || (!ref->zoneSize && pmap_is_noencrypt(ppn)))) ppn = 0; if (!ppn) continue; for (ptrIdx = 0; ptrIdx < (page_size / sizeof(uintptr_t)); ptrIdx++) { - ptr = ((uintptr_t *)vaddr)[ptrIdx]; + ptr = ((uintptr_t *)vphysaddr)[ptrIdx]; for (lim = count, baseIdx = 0; lim; lim >>= 1) { inst = instances[baseIdx + (lim >> 1)]; - instance = (typeof(instance)) (inst & ~kInstanceFlags); - if (kInstanceFlagAddress & inst) + instance = (typeof(instance)) INSTANCE_GET(inst); + + if (ref->zoneSize) + { + addr = INSTANCE_PUT(inst) & ~kInstanceFlags; + } + else if (kInstanceFlagAddress & inst) { addr = ~((IOTrackingAddress *)instance)->address; size = ((IOTrackingAddress *)instance)->size; @@ -706,7 +843,10 @@ IOTrackingLeakScan(void * refcon) addr = (uintptr_t) (instance + 1); size = instance->site->queue->allocSize; } - if ((ptr >= addr) && (ptr < (addr + size))) + if ((ptr >= addr) && (ptr < (addr + size)) + + && (((vaddr + ptrIdx * sizeof(uintptr_t)) < addr) + || ((vaddr + ptrIdx * sizeof(uintptr_t)) >= (addr + size)))) { if (!(kInstanceFlagReferenced & inst)) { @@ -725,17 +865,70 @@ IOTrackingLeakScan(void * refcon) // else move left } } - ref->bytes += page_size; + ref->bytes += page_size; } } +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +extern "C" void +zone_leaks_scan(uintptr_t * instances, uint32_t count, uint32_t zoneSize, uint32_t * found) +{ + IOTrackingLeaksRef ref; + IOTrackingCallSiteInfo siteInfo; + uint32_t idx; + + qsort(instances, 
count, sizeof(*instances), &IOTrackingZoneElementCompare); + + bzero(&siteInfo, sizeof(siteInfo)); + bzero(&ref, sizeof(ref)); + ref.instances = instances; + ref.count = count; + ref.zoneSize = zoneSize; + + for (idx = 0; idx < 2; idx++) + { + ref.bytes = 0; + IOTrackingLeakScan(&ref); + IOLog("leaks(%d) scanned %ld MB, instance count %d, found %d\n", idx, ref.bytes / 1024 / 1024, count, ref.found); + if (count <= ref.found) break; + } + + *found = ref.found; +} + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +static void +ZoneSiteProc(void * refCon, uint32_t siteCount, uint32_t zoneSize, + uintptr_t * backtrace, uint32_t btCount) +{ + IOTrackingCallSiteInfo siteInfo; + OSData * leakData; + uint32_t idx; + + leakData = (typeof(leakData)) refCon; + + bzero(&siteInfo, sizeof(siteInfo)); + siteInfo.count = siteCount; + siteInfo.size[0] = zoneSize * siteCount; + + for (idx = 0; (idx < btCount) && (idx < kIOTrackingCallSiteBTs); idx++) + { + siteInfo.bt[0][idx] = VM_KERNEL_UNSLIDE(backtrace[idx]); + } + + leakData->appendBytes(&siteInfo, sizeof(siteInfo)); +} + + /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ static OSData * IOTrackingLeaks(OSData * data) { IOTrackingLeaksRef ref; - IOTrackingCallSiteInfo unslideInfo; + IOTrackingCallSiteInfo siteInfo; IOTrackingCallSite * site; OSData * leakData; uintptr_t * instances; @@ -747,13 +940,17 @@ IOTrackingLeaks(OSData * data) count = (data->getLength() / sizeof(*instances)); qsort(instances, count, sizeof(*instances), &IOTrackingAddressCompare); + bzero(&siteInfo, sizeof(siteInfo)); bzero(&ref, sizeof(ref)); ref.instances = instances; ref.count = count; - - IOTrackingLeakScan(&ref); - - IOLog("leaks scanned %ld MB, instance count %d, found %d\n", ref.bytes / 1024 / 1024, count, ref.found); + for (idx = 0; idx < 2; idx++) + { + ref.bytes = 0; + IOTrackingLeakScan(&ref); + IOLog("leaks(%d) scanned %ld MB, instance count %d, found %d\n", 
idx, ref.bytes / 1024 / 1024, count, ref.found); + if (count <= ref.found) break; + } leakData = OSData::withCapacity(128 * sizeof(IOTrackingCallSiteInfo)); @@ -761,7 +958,7 @@ IOTrackingLeaks(OSData * data) { inst = instances[idx]; if (kInstanceFlagReferenced & inst) continue; - instance = (typeof(instance)) (inst & ~kInstanceFlags); + instance = (typeof(instance)) INSTANCE_GET(inst); site = instance->site; instances[numSites] = (uintptr_t) site; numSites++; @@ -780,14 +977,11 @@ IOTrackingLeaks(OSData * data) instances[dups] = 0; } } - unslideInfo.count = siteCount; - unslideInfo.size[0] = (site->info.size[0] * site->info.count) / siteCount; - unslideInfo.size[1] = (site->info.size[1] * site->info.count) / siteCount;; - for (uint32_t j = 0; j < kIOTrackingCallSiteBTs; j++) - { - unslideInfo.bt[j] = VM_KERNEL_UNSLIDE(site->info.bt[j]); - } - leakData->appendBytes(&unslideInfo, sizeof(unslideInfo)); + siteInfo.count = siteCount; + siteInfo.size[0] = (site->size[0] * site->count) / siteCount; + siteInfo.size[1] = (site->size[1] * site->count) / siteCount;; + CopyOutKernelBacktrace(site, &siteInfo); + leakData->appendBytes(&siteInfo, sizeof(siteInfo)); } data->release(); @@ -829,7 +1023,7 @@ SkipName(uint32_t options, const char * name, size_t namesLen, const char * name /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ kern_return_t -IOTrackingDebug(uint32_t selector, uint32_t options, +IOTrackingDebug(uint32_t selector, uint32_t options, uint64_t value, const char * names, size_t namesLen, size_t size, OSObject ** result) { @@ -842,23 +1036,39 @@ IOTrackingDebug(uint32_t selector, uint32_t options, #if IOTRACKING + kern_return_t kr; IOTrackingQueue * queue; IOTracking * instance; IOTrackingCallSite * site; - IOTrackingCallSiteInfo * siteInfos; - IOTrackingCallSiteInfo * siteInfo; - bool addresses; - uint32_t num, idx; + IOTrackingCallSiteInfo siteInfo; + IOTrackingUser * user; + task_t mapTask; + mach_vm_address_t mapAddress; 
+ mach_vm_size_t mapSize; + uint32_t num, idx, qIdx; uintptr_t instFlags; + proc_t proc; + bool addresses; - if (!(kIOTracking & gIOKitDebug)) return (kIOReturnNotReady); ret = kIOReturnNotFound; + proc = NULL; + if (kIOTrackingGetMappings == selector) + { + if (value != -1ULL) + { + proc = proc_find(value); + if (!proc) return (kIOReturnNotFound); + } + } + bzero(&siteInfo, sizeof(siteInfo)); lck_mtx_lock(gIOTrackingLock); queue_iterate(&gIOTrackingQ, queue, IOTrackingQueue *, link) { if (SkipName(options, queue->name, namesLen, names)) continue; + if (!(kIOTracking & gIOKitDebug) && (kIOTrackingQueueTypeAlloc & queue->type)) continue; + switch (selector) { case kIOTrackingResetTracking: @@ -885,20 +1095,23 @@ IOTrackingDebug(uint32_t selector, uint32_t options, case kIOTrackingLeaks: { - if (!queue->isAlloc) break; + if (!(kIOTrackingQueueTypeAlloc & queue->type)) break; if (!data) data = OSData::withCapacity(1024 * sizeof(uintptr_t)); IOTRecursiveLockLock(&queue->lock); - queue_iterate(&queue->sites, site, IOTrackingCallSite *, link) + for (idx = 0; idx < queue->numSiteQs; idx++) { - addresses = false; - queue_iterate(&site->instances, instance, IOTracking *, link) + queue_iterate(&queue->sites[idx], site, IOTrackingCallSite *, link) { - if (instance == site->addresses) addresses = true; - instFlags = (typeof(instFlags)) instance; - if (addresses) instFlags |= kInstanceFlagAddress; - data->appendBytes(&instFlags, sizeof(instFlags)); + addresses = false; + queue_iterate(&site->instances, instance, IOTracking *, link) + { + if (instance == site->addresses) addresses = true; + instFlags = (typeof(instFlags)) instance; + if (addresses) instFlags |= kInstanceFlagAddress; + data->appendBytes(&instFlags, sizeof(instFlags)); + } } } // queue is locked @@ -907,35 +1120,84 @@ IOTrackingDebug(uint32_t selector, uint32_t options, } case kIOTrackingGetTracking: - case kIOTrackingPrintTracking: { + if (kIOTrackingQueueTypeMap & queue->type) break; + if (!data) data = 
OSData::withCapacity(128 * sizeof(IOTrackingCallSiteInfo)); IOTRecursiveLockLock(&queue->lock); num = queue->siteCount; idx = 0; - queue_iterate(&queue->sites, site, IOTrackingCallSite *, link) + for (qIdx = 0; qIdx < queue->numSiteQs; qIdx++) { - assert(idx < num); - idx++; + queue_iterate(&queue->sites[qIdx], site, IOTrackingCallSite *, link) + { + assert(idx < num); + idx++; - if (size && ((site->info.size[0] + site->info.size[1]) < size)) continue; + if (size && ((site->size[0] + site->size[1]) < size)) continue; - IOTrackingCallSiteInfo unslideInfo; - unslideInfo.count = site->info.count; - memcpy(&unslideInfo.size[0], &site->info.size[0], sizeof(unslideInfo.size)); + siteInfo.count = site->count; + siteInfo.size[0] = site->size[0]; + siteInfo.size[1] = site->size[1]; - for (uint32_t j = 0; j < kIOTrackingCallSiteBTs; j++) + CopyOutKernelBacktrace(site, &siteInfo); + data->appendBytes(&siteInfo, sizeof(siteInfo)); + } + } + assert(idx == num); + IOTRecursiveLockUnlock(&queue->lock); + ret = kIOReturnSuccess; + break; + } + + case kIOTrackingGetMappings: + { + if (!(kIOTrackingQueueTypeMap & queue->type)) break; + if (!data) data = OSData::withCapacity(page_size); + + IOTRecursiveLockLock(&queue->lock); + num = queue->siteCount; + idx = 0; + for (qIdx = 0; qIdx < queue->numSiteQs; qIdx++) + { + queue_iterate(&queue->sites[qIdx], user, IOTrackingUser *, link) { - unslideInfo.bt[j] = VM_KERNEL_UNSLIDE(site->info.bt[j]); + assert(idx < num); + idx++; + + kr = IOMemoryMapTracking(user, &mapTask, &mapAddress, &mapSize); + if (kIOReturnSuccess != kr) continue; + if (proc && (mapTask != proc_task(proc))) continue; + if (size && (mapSize < size)) continue; + + siteInfo.count = 1; + siteInfo.size[0] = mapSize; + siteInfo.address = mapAddress; + siteInfo.addressPID = task_pid(mapTask); + siteInfo.btPID = user->btPID; + + for (uint32_t j = 0; j < kIOTrackingCallSiteBTs; j++) + { + siteInfo.bt[0][j] = VM_KERNEL_UNSLIDE(user->bt[j]); + } + uint32_t * bt32 = (typeof(bt32)) 
&user->btUser[0]; + uint64_t * bt64 = (typeof(bt64)) ((void *) &user->btUser[0]); + for (uint32_t j = 0; j < kIOTrackingCallSiteBTs; j++) + { + if (j >= user->userCount) siteInfo.bt[1][j] = 0; + else if (user->user32) siteInfo.bt[1][j] = bt32[j]; + else siteInfo.bt[1][j] = bt64[j]; + } + data->appendBytes(&siteInfo, sizeof(siteInfo)); } - data->appendBytes(&unslideInfo, sizeof(unslideInfo)); } assert(idx == num); IOTRecursiveLockUnlock(&queue->lock); ret = kIOReturnSuccess; break; } + default: ret = kIOReturnUnsupported; break; @@ -948,40 +1210,54 @@ IOTrackingDebug(uint32_t selector, uint32_t options, queue_iterate(&gIOTrackingQ, queue, IOTrackingQueue *, link) { if (SkipName(options, queue->name, namesLen, names)) continue; - if (!queue->isAlloc) continue; + if (!(kIOTrackingQueueTypeAlloc & queue->type)) continue; IOTRecursiveLockUnlock(&queue->lock); } } lck_mtx_unlock(gIOTrackingLock); - if (data) + if ((kIOTrackingLeaks == selector) && namesLen && names) { - siteInfos = (typeof(siteInfos)) data->getBytesNoCopy(); - num = (data->getLength() / sizeof(IOTrackingCallSiteInfo)); - qsort(siteInfos, num, sizeof(*siteInfos), &IOTrackingCallSiteInfoCompare); + const char * scan; + const char * next; + size_t sLen; + + if (!data) data = OSData::withCapacity(4096 * sizeof(uintptr_t)); + + // ...<0> + scan = names; + do + { + sLen = scan[0]; + scan++; + next = scan + sLen; + if (next >= (names + namesLen)) break; + kr = zone_leaks(scan, sLen, &ZoneSiteProc, data); + if (KERN_SUCCESS == kr) ret = kIOReturnSuccess; + else if (KERN_INVALID_NAME != kr) ret = kIOReturnVMError; + scan = next; + } + while (scan < (names + namesLen)); + } - if (kIOTrackingPrintTracking == selector) - { - for (idx = 0; idx < num; idx++) - { - siteInfo = &siteInfos[idx]; - printf("\n0x%lx bytes (0x%lx + 0x%lx), %d call%s, [%d]\n", - siteInfo->size[0] + siteInfo->size[1], - siteInfo->size[0], siteInfo->size[1], - siteInfo->count, (siteInfo->count != 1) ? 
"s" : "", idx); - uintptr_t * bt = &siteInfo->bt[0]; - printf(" Backtrace 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx\n", - bt[0], bt[1], bt[2], bt[3], bt[4], bt[5], bt[6], bt[7], - bt[8], bt[9], bt[10], bt[11], bt[12], bt[13], bt[14], bt[15]); - kmod_dump_log((vm_offset_t *) &bt[0], kIOTrackingCallSiteBTs, FALSE); - } - data->release(); - data = 0; - } + if (data) switch (selector) + { + case kIOTrackingLeaks: + case kIOTrackingGetTracking: + case kIOTrackingGetMappings: + { + IOTrackingCallSiteInfo * siteInfos; + siteInfos = (typeof(siteInfos)) data->getBytesNoCopy(); + num = (data->getLength() / sizeof(*siteInfos)); + qsort(siteInfos, num, sizeof(*siteInfos), &IOTrackingCallSiteInfoCompare); + break; + } + default: assert(false); break; } *result = data; + if (proc) proc_rele(proc); #endif /* IOTRACKING */ @@ -1044,7 +1320,7 @@ IOReturn IOKitDiagnosticsClient::externalMethod(uint32_t selector, IOExternalMet namesLen = args->structureInputSize - sizeof(IOKitDiagnosticsParameters); if (namesLen) names = (typeof(names))(params + 1); - ret = IOTrackingDebug(selector, params->options, names, namesLen, params->size, &result); + ret = IOTrackingDebug(selector, params->options, params->value, names, namesLen, params->size, &result); if ((kIOReturnSuccess == ret) && args->structureVariableOutputData) *args->structureVariableOutputData = result; else if (result) result->release(); diff --git a/iokit/Kernel/IOKitKernelInternal.h b/iokit/Kernel/IOKitKernelInternal.h index 37e6f9416..8aed5f278 100644 --- a/iokit/Kernel/IOKitKernelInternal.h +++ b/iokit/Kernel/IOKitKernelInternal.h @@ -89,6 +89,12 @@ IOKernelAllocateWithPhysicalRestrict(mach_vm_size_t size, mach_vm_address_t maxP void IOKernelFreePhysical(mach_vm_address_t address, mach_vm_size_t size); +#if IOTRACKING +IOReturn +IOMemoryMapTracking(IOTrackingUser * tracking, task_t * task, + mach_vm_address_t * address, mach_vm_size_t * size); +#endif /* IOTRACKING */ + 
extern vm_size_t debug_iomallocpageable_size; // osfmk/device/iokit_rpc.c @@ -135,7 +141,12 @@ struct IODMACommandInternal UInt8 fDoubleBuffer; UInt8 fNewMD; UInt8 fLocalMapper; - + + vm_tag_t fTag; +#if IOTRACKING + IOTracking fWireTracking; +#endif /* IOTRACKING */ + vm_page_t fCopyPageAlloc; vm_page_t fCopyNext; vm_page_t fNextRemapPage; @@ -167,6 +178,8 @@ struct IOMemoryDescriptorReserved { uint64_t preparationID; // for kernel IOMD subclasses... they have no expansion uint64_t kernReserved[4]; + vm_tag_t kernelTag; + vm_tag_t userTag; }; struct iopa_t diff --git a/iokit/Kernel/IOLib.cpp b/iokit/Kernel/IOLib.cpp index 44a436346..01f751a86 100644 --- a/iokit/Kernel/IOLib.cpp +++ b/iokit/Kernel/IOLib.cpp @@ -58,6 +58,7 @@ #include "libkern/OSAtomic.h" #include #include +#include #include #if IOKITSTATS @@ -172,9 +173,17 @@ void IOLibInit(void) #if IOTRACKING IOTrackingInit(); - gIOMallocTracking = IOTrackingQueueAlloc(kIOMallocTrackingName, 0, 0, true); - gIOWireTracking = IOTrackingQueueAlloc(kIOWireTrackingName, 0, page_size, false); - gIOMapTracking = IOTrackingQueueAlloc(kIOMapTrackingName, 0, page_size, false); + gIOMallocTracking = IOTrackingQueueAlloc(kIOMallocTrackingName, 0, 0, 0, + kIOTrackingQueueTypeAlloc, + 37); + gIOWireTracking = IOTrackingQueueAlloc(kIOWireTrackingName, 0, 0, page_size, 0, 0); + + size_t mapCaptureSize = (kIOTracking & gIOKitDebug) ? page_size : (1024*1024); + gIOMapTracking = IOTrackingQueueAlloc(kIOMapTrackingName, 0, 0, mapCaptureSize, + kIOTrackingQueueTypeDefaultOn + | kIOTrackingQueueTypeMap + | kIOTrackingQueueTypeUser, + 0); #endif gIOKitPageableSpace.maps[0].address = 0; @@ -1117,30 +1126,36 @@ static void _iolog_consputc(int ch, void *arg __unused) cons_putc_locked(ch); } -static void _iolog_logputc(int ch, void *arg __unused) -{ - log_putc_locked(ch); -} +static void _IOLogv(const char *format, va_list ap, void *caller); +__attribute__((noinline,not_tail_called)) void IOLog(const char *format, ...) 
{ + void *caller = __builtin_return_address(0); va_list ap; va_start(ap, format); - IOLogv(format, ap); + _IOLogv(format, ap, caller); va_end(ap); } +__attribute__((noinline,not_tail_called)) void IOLogv(const char *format, va_list ap) +{ + void *caller = __builtin_return_address(0); + _IOLogv(format, ap, caller); +} + +void _IOLogv(const char *format, va_list ap, void *caller) { va_list ap2; + /* Ideally not called at interrupt context or with interrupts disabled. Needs further validate */ + /* assert(TRUE == ml_get_interrupts_enabled()); */ + va_copy(ap2, ap); - bsd_log_lock(); - __doprnt(format, ap, _iolog_logputc, NULL, 16, TRUE); - bsd_log_unlock(); - logwakeup(); + os_log_with_args(OS_LOG_DEFAULT, OS_LOG_TYPE_DEFAULT, format, ap, caller); __doprnt(format, ap2, _iolog_consputc, NULL, 16, TRUE); va_end(ap2); diff --git a/iokit/Kernel/IOLocks.cpp b/iokit/Kernel/IOLocks.cpp index 2febff6c4..c2ece6b9c 100644 --- a/iokit/Kernel/IOLocks.cpp +++ b/iokit/Kernel/IOLocks.cpp @@ -118,7 +118,7 @@ void IOLockWakeup_legacy_x86_64(IOLock * lock, void *event, bool oneThread) struct _IORecursiveLock { - lck_mtx_t *mutex; + lck_mtx_t mutex; lck_grp_t *group; thread_t thread; UInt32 count; @@ -135,15 +135,10 @@ IORecursiveLock * IORecursiveLockAllocWithLockGroup( lck_grp_t * lockGroup ) if( !lock ) return( 0 ); - lock->mutex = lck_mtx_alloc_init( lockGroup, LCK_ATTR_NULL ); - if( lock->mutex ) { - lock->group = lockGroup; - lock->thread = 0; - lock->count = 0; - } else { - IODelete( lock, _IORecursiveLock, 1 ); - lock = 0; - } + lck_mtx_init( &lock->mutex, lockGroup, LCK_ATTR_NULL ); + lock->group = lockGroup; + lock->thread = 0; + lock->count = 0; return( (IORecursiveLock *) lock ); } @@ -158,13 +153,13 @@ void IORecursiveLockFree( IORecursiveLock * _lock ) { _IORecursiveLock * lock = (_IORecursiveLock *)_lock; - lck_mtx_free( lock->mutex, lock->group ); + lck_mtx_destroy(&lock->mutex, lock->group); IODelete( lock, _IORecursiveLock, 1 ); } lck_mtx_t * 
IORecursiveLockGetMachLock( IORecursiveLock * lock ) { - return( lock->mutex ); + return( &lock->mutex ); } void IORecursiveLockLock( IORecursiveLock * _lock) @@ -174,7 +169,7 @@ void IORecursiveLockLock( IORecursiveLock * _lock) if( lock->thread == IOThreadSelf()) lock->count++; else { - lck_mtx_lock( lock->mutex ); + lck_mtx_lock( &lock->mutex ); assert( lock->thread == 0 ); assert( lock->count == 0 ); lock->thread = IOThreadSelf(); @@ -190,7 +185,7 @@ boolean_t IORecursiveLockTryLock( IORecursiveLock * _lock) lock->count++; return( true ); } else { - if( lck_mtx_try_lock( lock->mutex )) { + if( lck_mtx_try_lock( &lock->mutex )) { assert( lock->thread == 0 ); assert( lock->count == 0 ); lock->thread = IOThreadSelf(); @@ -209,7 +204,7 @@ void IORecursiveLockUnlock( IORecursiveLock * _lock) if( 0 == (--lock->count)) { lock->thread = 0; - lck_mtx_unlock( lock->mutex ); + lck_mtx_unlock( &lock->mutex ); } } @@ -230,7 +225,7 @@ int IORecursiveLockSleep(IORecursiveLock *_lock, void *event, UInt32 interType) lock->count = 0; lock->thread = 0; - res = lck_mtx_sleep(lock->mutex, LCK_SLEEP_PROMOTED_PRI, (event_t) event, (wait_interrupt_t) interType); + res = lck_mtx_sleep(&lock->mutex, LCK_SLEEP_PROMOTED_PRI, (event_t) event, (wait_interrupt_t) interType); // Must re-establish the recursive lock no matter why we woke up // otherwise we would potentially leave the return path corrupted. 
@@ -252,7 +247,7 @@ int IORecursiveLockSleepDeadline( IORecursiveLock * _lock, void *event, lock->count = 0; lock->thread = 0; - res = lck_mtx_sleep_deadline(lock->mutex, LCK_SLEEP_PROMOTED_PRI, (event_t) event, + res = lck_mtx_sleep_deadline(&lock->mutex, LCK_SLEEP_PROMOTED_PRI, (event_t) event, (wait_interrupt_t) interType, __OSAbsoluteTime(deadline)); // Must re-establish the recursive lock no matter why we woke up diff --git a/iokit/Kernel/IOMapper.cpp b/iokit/Kernel/IOMapper.cpp index 8f2b35992..89a00c921 100644 --- a/iokit/Kernel/IOMapper.cpp +++ b/iokit/Kernel/IOMapper.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2004 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1998-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -61,13 +61,13 @@ IOMapper * IOMapper::gSystem = (IOMapper *) IOMapper::kUnknown; class IOMapperLock { IOLock *fWaitLock; public: - IOMapperLock() { fWaitLock = IOLockAlloc(); }; - ~IOMapperLock() { IOLockFree(fWaitLock); }; + IOMapperLock() { fWaitLock = IOLockAlloc(); } + ~IOMapperLock() { IOLockFree(fWaitLock); } - void lock() { IOLockLock(fWaitLock); }; - void unlock() { IOLockUnlock(fWaitLock); }; - void sleep(void *event) { IOLockSleep(fWaitLock, event, THREAD_UNINT); }; - void wakeup(void *event) { IOLockWakeup(fWaitLock, event, false); }; + void lock() { IOLockLock(fWaitLock); } + void unlock() { IOLockUnlock(fWaitLock); } + void sleep(void *event) { IOLockSleep(fWaitLock, event, THREAD_UNINT); } + void wakeup(void *event) { IOLockWakeup(fWaitLock, event, false); } }; static IOMapperLock sMapperLock; diff --git a/iokit/Kernel/IOMemoryDescriptor.cpp b/iokit/Kernel/IOMemoryDescriptor.cpp index 947461eaf..b2bde45a8 100644 --- a/iokit/Kernel/IOMemoryDescriptor.cpp +++ b/iokit/Kernel/IOMemoryDescriptor.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2007 Apple Inc. All rights reserved. + * Copyright (c) 1998-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -50,6 +50,7 @@ #include #include #include +#include #include @@ -130,26 +131,35 @@ struct ioPLBlock { unsigned int fFlags; // Flags }; -struct ioGMDData { +enum { kMaxWireTags = 6 }; + +struct ioGMDData +{ IOMapper * fMapper; - uint8_t fDMAMapNumAddressBits; uint64_t fDMAMapAlignment; uint64_t fMappedBase; uint64_t fMappedLength; uint64_t fPreparationID; #if IOTRACKING IOTracking fWireTracking; -#endif - unsigned int fPageCnt; - unsigned char fDiscontig:1; - unsigned char fCompletionError:1; - unsigned char _resv:6; + struct vm_tag_set fWireTags; + struct vm_tag_set_entry fWireTagsEntries[kMaxWireTags]; +#endif /* IOTRACKING */ + unsigned int fPageCnt; + uint8_t fDMAMapNumAddressBits; + vm_tag_t fAllocTag; + unsigned char fDiscontig:1; + unsigned char fCompletionError:1; + unsigned char _resv:6; + + /* variable length arrays */ + upl_page_info_t fPageList[1] #if __LP64__ - // align arrays to 8 bytes so following macros work - unsigned char fPad[3]; + // align fPageList as for ioPLBlock + __attribute__((aligned(sizeof(upl_t)))) #endif - upl_page_info_t fPageList[1]; /* variable length */ - ioPLBlock fBlocks[1]; /* variable length */ + ; + ioPLBlock fBlocks[1]; }; #define getDataP(osd) ((ioGMDData *) (osd)->getBytesNoCopy()) @@ -258,7 +268,7 @@ purgeableControlBits(IOOptionBits newState, vm_purgable_t * control, int * state *state = VM_PURGABLE_VOLATILE | (newState & ~kIOMemoryPurgeableControlMask); break; case kIOMemoryPurgeableEmpty: - *state = VM_PURGABLE_EMPTY; + *state = VM_PURGABLE_EMPTY | (newState & ~kIOMemoryPurgeableControlMask); break; default: err = kIOReturnBadArgument; @@ -476,7 +486,7 @@ IOGeneralMemoryDescriptor::memoryReferenceCreate( ref = memoryReferenceAlloc(kCapacity, NULL); if (!ref) return (kIOReturnNoMemory); - tag = IOMemoryTag(kernel_map); + tag = getVMTag(kernel_map); entries = &ref->entries[0]; count = 0; @@ -532,6 +542,8 @@ IOGeneralMemoryDescriptor::memoryReferenceCreate( // 
IOBufferMemoryDescriptor alloc - set flags for entry + object create prot |= MAP_MEM_NAMED_CREATE; if (kIOMemoryBufferPurgeable & _flags) prot |= MAP_MEM_PURGABLE; + if (kIOMemoryUseReserve & _flags) prot |= MAP_MEM_GRAB_SECLUDED; + prot |= VM_PROT_WRITE; map = NULL; } @@ -730,11 +742,12 @@ IOGeneralMemoryDescriptor::memoryReferenceMap( memEntryCacheMode = (MAP_MEM_ONLY | VM_PROT_WRITE | prot | vmProtForCacheMode(cacheMode)); } - tag = IOMemoryTag(map); + tag = getVMTag(map); if (_task) { // Find first range for offset + if (!_rangesCount) return (kIOReturnBadArgument); for (remain = offset, rangeIdx = 0; rangeIdx < _rangesCount; rangeIdx++) { getAddrLenForInd(nextAddr, nextLen, type, _ranges, rangeIdx); @@ -965,6 +978,7 @@ IOGeneralMemoryDescriptor::memoryReferenceGetPageCounts( unsigned int totalResident, totalDirty; totalResident = totalDirty = 0; + err = kIOReturnSuccess; entries = ref->entries + ref->count; while (entries > &ref->entries[0]) { @@ -991,8 +1005,9 @@ IOGeneralMemoryDescriptor::memoryReferenceSetPurgeable( vm_purgable_t control; int totalState, state; - entries = ref->entries + ref->count; totalState = kIOMemoryPurgeableNonVolatile; + err = kIOReturnSuccess; + entries = ref->entries + ref->count; while (entries > &ref->entries[0]) { entries--; @@ -1574,14 +1589,14 @@ IOGeneralMemoryDescriptor::initWithOptions(void * buffers, unsigned int ind, pages = 0; for (ind = 0; ind < count; ind++) { mach_vm_address_t addr; - mach_vm_size_t len; + mach_vm_address_t endAddr; + mach_vm_size_t len; // addr & len are returned by this function getAddrLenForInd(addr, len, type, vec, ind); - if ((addr + len + PAGE_MASK) < addr) break; /* overflow */ - pages += (atop_64(addr + len + PAGE_MASK) - atop_64(addr)); - totalLength += len; - if (totalLength < len) break; /* overflow */ + if (os_add3_overflow(addr, len, PAGE_MASK, &endAddr)) break; + if (os_add_overflow(pages, (atop_64(endAddr) - atop_64(addr)), &pages)) break; + if (os_add_overflow(totalLength, len, 
&totalLength)) break; if ((kIOMemoryTypePhysical == type) || (kIOMemoryTypePhysical64 == type)) { ppnum_t highPage = atop_64(addr + len - 1); @@ -1738,6 +1753,9 @@ IOOptionBits IOMemoryDescriptor::getTag( void ) } #ifndef __LP64__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wdeprecated-declarations" + // @@@ gvdl: who is using this API? Seems like a wierd thing to implement. IOPhysicalAddress IOMemoryDescriptor::getSourceSegment( IOByteCount offset, IOByteCount * length ) @@ -1751,6 +1769,9 @@ IOMemoryDescriptor::getSourceSegment( IOByteCount offset, IOByteCount * length return( (IOPhysicalAddress) physAddr ); // truncated but only page offset is used } + +#pragma clang diagnostic pop + #endif /* !__LP64__ */ IOByteCount IOMemoryDescriptor::readBytes @@ -1904,11 +1925,7 @@ void IOMemoryDescriptor::setPreparationID( void ) { if (getKernelReserved() && (kIOPreparationIDUnprepared == reserved->preparationID)) { -#if defined(__ppc__ ) - reserved->preparationID = gIOMDPreparationID++; -#else reserved->preparationID = OSIncrementAtomic64(&gIOMDPreparationID); -#endif } } @@ -1920,6 +1937,26 @@ uint64_t IOMemoryDescriptor::getPreparationID( void ) return (kIOPreparationIDUnsupported); } +void IOMemoryDescriptor::setVMTags(vm_tag_t kernelTag, vm_tag_t userTag) +{ + if (!getKernelReserved()) return; + reserved->kernelTag = kernelTag; + reserved->userTag = userTag; +} + +vm_tag_t IOMemoryDescriptor::getVMTag(vm_map_t map) +{ + if (!reserved + || (VM_KERN_MEMORY_NONE == reserved->kernelTag) + || (VM_KERN_MEMORY_NONE == reserved->userTag)) + { + return (IOMemoryTag(map)); + } + + if (vm_kernel_map_is_kernel(map)) return (reserved->kernelTag); + return (reserved->userTag); +} + IOReturn IOGeneralMemoryDescriptor::dmaCommandOperation(DMACommandOps op, void *vData, UInt dataSize) const { IOReturn err = kIOReturnSuccess; @@ -2026,17 +2063,6 @@ IOReturn IOGeneralMemoryDescriptor::dmaCommandOperation(DMACommandOps op, void * return kIOReturnSuccess; -#if 
IOMD_DEBUG_DMAACTIVE - } else if (kIOMDDMAActive == op) { - if (params) OSIncrementAtomic(&md->__iomd_reservedA); - else { - if (md->__iomd_reservedA) - OSDecrementAtomic(&md->__iomd_reservedA); - else - panic("kIOMDSetDMAInactive"); - } -#endif /* IOMD_DEBUG_DMAACTIVE */ - } else if (kIOMDWalkSegments != op) return kIOReturnBadArgument; @@ -2365,6 +2391,9 @@ IOGeneralMemoryDescriptor::getPhysicalSegment(IOByteCount offset, IOByteCount *l } #ifndef __LP64__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wdeprecated-declarations" + addr64_t IOMemoryDescriptor::getPhysicalSegment(IOByteCount offset, IOByteCount *lengthOfSegment, IOOptionBits options) { @@ -2385,6 +2414,7 @@ IOMemoryDescriptor::getPhysicalSegment(IOByteCount offset, IOByteCount *lengthOf return (address); } +#pragma clang diagnostic pop addr64_t IOGeneralMemoryDescriptor::getPhysicalSegment64(IOByteCount offset, IOByteCount *lengthOfSegment) @@ -2460,6 +2490,9 @@ IOGeneralMemoryDescriptor::getSourceSegment(IOByteCount offset, IOByteCount *len return ((IOPhysicalAddress) getPhysicalSegment(offset, lengthOfSegment, _kIOMemorySourceSegment)); } +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wdeprecated-declarations" + void * IOGeneralMemoryDescriptor::getVirtualSegment(IOByteCount offset, IOByteCount * lengthOfSegment) { @@ -2470,6 +2503,7 @@ void * IOGeneralMemoryDescriptor::getVirtualSegment(IOByteCount offset, return 0; } +#pragma clang diagnostic pop #endif /* !__LP64__ */ IOReturn @@ -2709,6 +2743,10 @@ IOReturn IOMemoryDescriptor::performOperation( IOOptionBits options, return (remaining ? 
kIOReturnUnderrun : kIOReturnSuccess); } +/* + * + */ + #if defined(__i386__) || defined(__x86_64__) #define io_kernel_static_start vm_kernel_stext @@ -2744,7 +2782,7 @@ io_get_kernel_static_upl( if (!phys) break; page_list[page].phys_addr = phys; - page_list[page].pageout = 0; + page_list[page].free_when_done = 0; page_list[page].absent = 0; page_list[page].dirty = 0; page_list[page].precious = 0; @@ -2758,10 +2796,47 @@ io_get_kernel_static_upl( return ((page >= pageCount) ? kIOReturnSuccess : kIOReturnVMError); } +/* + * + */ +#if IOTRACKING +static void +IOMemoryDescriptorUpdateWireOwner(ioGMDData * dataP, OSData * memoryEntries, vm_tag_t tag) +{ + ioPLBlock *ioplList; + UInt ind, count; + vm_tag_t prior; + + count = getNumIOPL(memoryEntries, dataP); + if (!count) return; + ioplList = getIOPLList(dataP); + + if (VM_KERN_MEMORY_NONE == tag) tag = dataP->fAllocTag; + assert(VM_KERN_MEMORY_NONE != tag); + + for (ind = 0; ind < count; ind++) + { + if (!ioplList[ind].fIOPL) continue; + prior = iopl_set_tag(ioplList[ind].fIOPL, tag); + if (VM_KERN_MEMORY_NONE == dataP->fAllocTag) dataP->fAllocTag = prior; +#if 0 + if (tag != prior) + { + char name[2][48]; + vm_tag_get_kext(prior, &name[0][0], sizeof(name[0])); + vm_tag_get_kext(tag, &name[1][0], sizeof(name[1])); + IOLog("switched %48s to %48s\n", name[0], name[1]); + } +#endif + } +} +#endif /* IOTRACKING */ + + IOReturn IOGeneralMemoryDescriptor::wireVirtual(IODirection forDirection) { IOOptionBits type = _flags & kIOMemoryTypeMask; - IOReturn error = kIOReturnCannotWire; + IOReturn error = kIOReturnSuccess; ioGMDData *dataP; upl_page_info_array_t pageInfo; ppnum_t mapBase; @@ -2784,6 +2859,9 @@ IOReturn IOGeneralMemoryDescriptor::wireVirtual(IODirection forDirection) uplFlags = 0; // i.e. 
~UPL_COPYOUT_FROM break; } + dataP = getDataP(_memoryEntries); + + if (kIODirectionDMACommand & forDirection) assert(_wireCount); if (_wireCount) { @@ -2792,210 +2870,206 @@ IOReturn IOGeneralMemoryDescriptor::wireVirtual(IODirection forDirection) OSReportWithBacktrace("IOMemoryDescriptor 0x%lx prepared read only", VM_KERNEL_ADDRPERM(this)); error = kIOReturnNotWritable; } - else error = kIOReturnSuccess; - return (error); } - - dataP = getDataP(_memoryEntries); - IOMapper *mapper; - mapper = dataP->fMapper; - dataP->fMappedBase = 0; - - uplFlags |= UPL_SET_IO_WIRE | UPL_SET_LITE; - uplFlags |= UPL_MEMORY_TAG_MAKE(IOMemoryTag(kernel_map)); - - if (kIODirectionPrepareToPhys32 & forDirection) + else { - if (!mapper) uplFlags |= UPL_NEED_32BIT_ADDR; - if (dataP->fDMAMapNumAddressBits > 32) dataP->fDMAMapNumAddressBits = 32; - } - if (kIODirectionPrepareNoFault & forDirection) uplFlags |= UPL_REQUEST_NO_FAULT; - if (kIODirectionPrepareNoZeroFill & forDirection) uplFlags |= UPL_NOZEROFILLIO; - if (kIODirectionPrepareNonCoherent & forDirection) uplFlags |= UPL_REQUEST_FORCE_COHERENCY; - - mapBase = 0; - - // Note that appendBytes(NULL) zeros the data up to the desired length - // and the length parameter is an unsigned int - size_t uplPageSize = dataP->fPageCnt * sizeof(upl_page_info_t); - if (uplPageSize > ((unsigned int)uplPageSize)) return (kIOReturnNoMemory); - if (!_memoryEntries->appendBytes(0, uplPageSize)) return (kIOReturnNoMemory); - dataP = 0; - - // Find the appropriate vm_map for the given task - vm_map_t curMap; - if (_task == kernel_task && (kIOMemoryBufferPageable & _flags)) curMap = 0; - else curMap = get_task_map(_task); - - // Iterate over the vector of virtual ranges - Ranges vec = _ranges; - unsigned int pageIndex = 0; - IOByteCount mdOffset = 0; - ppnum_t highestPage = 0; + IOMapper *mapper; + mapper = dataP->fMapper; + dataP->fMappedBase = 0; - IOMemoryEntry * memRefEntry = 0; - if (_memRef) memRefEntry = &_memRef->entries[0]; + uplFlags |= 
UPL_SET_IO_WIRE | UPL_SET_LITE; + uplFlags |= UPL_MEMORY_TAG_MAKE(getVMTag(kernel_map)); - for (UInt range = 0; range < _rangesCount; range++) { - ioPLBlock iopl; - mach_vm_address_t startPage; - mach_vm_size_t numBytes; - ppnum_t highPage = 0; - - // Get the startPage address and length of vec[range] - getAddrLenForInd(startPage, numBytes, type, vec, range); - iopl.fPageOffset = startPage & PAGE_MASK; - numBytes += iopl.fPageOffset; - startPage = trunc_page_64(startPage); - - if (mapper) - iopl.fMappedPage = mapBase + pageIndex; - else - iopl.fMappedPage = 0; - - // Iterate over the current range, creating UPLs - while (numBytes) { - vm_address_t kernelStart = (vm_address_t) startPage; - vm_map_t theMap; - if (curMap) theMap = curMap; - else if (_memRef) - { - theMap = NULL; - } - else - { - assert(_task == kernel_task); - theMap = IOPageableMapForAddress(kernelStart); - } - - // ioplFlags is an in/out parameter - upl_control_flags_t ioplFlags = uplFlags; - dataP = getDataP(_memoryEntries); - pageInfo = getPageList(dataP); - upl_page_list_ptr_t baseInfo = &pageInfo[pageIndex]; - - mach_vm_size_t _ioplSize = round_page(numBytes); - upl_size_t ioplSize = (_ioplSize <= MAX_UPL_SIZE_BYTES) ? 
_ioplSize : MAX_UPL_SIZE_BYTES; - unsigned int numPageInfo = atop_32(ioplSize); - - if ((theMap == kernel_map) - && (kernelStart >= io_kernel_static_start) - && (kernelStart < io_kernel_static_end)) { - error = io_get_kernel_static_upl(theMap, - kernelStart, - &ioplSize, - &iopl.fIOPL, - baseInfo, - &numPageInfo, - &highPage); - } - else if (_memRef) { - memory_object_offset_t entryOffset; - - entryOffset = mdOffset; - entryOffset = (entryOffset - iopl.fPageOffset - memRefEntry->offset); - if (entryOffset >= memRefEntry->size) { - memRefEntry++; - if (memRefEntry >= &_memRef->entries[_memRef->count]) panic("memRefEntry"); - entryOffset = 0; - } - if (ioplSize > (memRefEntry->size - entryOffset)) ioplSize = (memRefEntry->size - entryOffset); - error = memory_object_iopl_request(memRefEntry->entry, - entryOffset, - &ioplSize, - &iopl.fIOPL, - baseInfo, - &numPageInfo, - &ioplFlags); - } - else { - assert(theMap); - error = vm_map_create_upl(theMap, - startPage, - (upl_size_t*)&ioplSize, - &iopl.fIOPL, - baseInfo, - &numPageInfo, - &ioplFlags); - } - - if (error != KERN_SUCCESS) - goto abortExit; - - assert(ioplSize); + if (kIODirectionPrepareToPhys32 & forDirection) + { + if (!mapper) uplFlags |= UPL_NEED_32BIT_ADDR; + if (dataP->fDMAMapNumAddressBits > 32) dataP->fDMAMapNumAddressBits = 32; + } + if (kIODirectionPrepareNoFault & forDirection) uplFlags |= UPL_REQUEST_NO_FAULT; + if (kIODirectionPrepareNoZeroFill & forDirection) uplFlags |= UPL_NOZEROFILLIO; + if (kIODirectionPrepareNonCoherent & forDirection) uplFlags |= UPL_REQUEST_FORCE_COHERENCY; + + mapBase = 0; + + // Note that appendBytes(NULL) zeros the data up to the desired length + // and the length parameter is an unsigned int + size_t uplPageSize = dataP->fPageCnt * sizeof(upl_page_info_t); + if (uplPageSize > ((unsigned int)uplPageSize)) return (kIOReturnNoMemory); + if (!_memoryEntries->appendBytes(0, uplPageSize)) return (kIOReturnNoMemory); + dataP = 0; + + // Find the appropriate vm_map for the given 
task + vm_map_t curMap; + if (_task == kernel_task && (kIOMemoryBufferPageable & _flags)) curMap = 0; + else curMap = get_task_map(_task); + + // Iterate over the vector of virtual ranges + Ranges vec = _ranges; + unsigned int pageIndex = 0; + IOByteCount mdOffset = 0; + ppnum_t highestPage = 0; + + IOMemoryEntry * memRefEntry = 0; + if (_memRef) memRefEntry = &_memRef->entries[0]; + + for (UInt range = 0; range < _rangesCount; range++) { + ioPLBlock iopl; + mach_vm_address_t startPage; + mach_vm_size_t numBytes; + ppnum_t highPage = 0; + + // Get the startPage address and length of vec[range] + getAddrLenForInd(startPage, numBytes, type, vec, range); + iopl.fPageOffset = startPage & PAGE_MASK; + numBytes += iopl.fPageOffset; + startPage = trunc_page_64(startPage); + + if (mapper) + iopl.fMappedPage = mapBase + pageIndex; + else + iopl.fMappedPage = 0; + + // Iterate over the current range, creating UPLs + while (numBytes) { + vm_address_t kernelStart = (vm_address_t) startPage; + vm_map_t theMap; + if (curMap) theMap = curMap; + else if (_memRef) + { + theMap = NULL; + } + else + { + assert(_task == kernel_task); + theMap = IOPageableMapForAddress(kernelStart); + } - if (iopl.fIOPL) - highPage = upl_get_highest_page(iopl.fIOPL); - if (highPage > highestPage) - highestPage = highPage; + // ioplFlags is an in/out parameter + upl_control_flags_t ioplFlags = uplFlags; + dataP = getDataP(_memoryEntries); + pageInfo = getPageList(dataP); + upl_page_list_ptr_t baseInfo = &pageInfo[pageIndex]; + + mach_vm_size_t _ioplSize = round_page(numBytes); + upl_size_t ioplSize = (_ioplSize <= MAX_UPL_SIZE_BYTES) ? 
_ioplSize : MAX_UPL_SIZE_BYTES; + unsigned int numPageInfo = atop_32(ioplSize); + + if ((theMap == kernel_map) + && (kernelStart >= io_kernel_static_start) + && (kernelStart < io_kernel_static_end)) { + error = io_get_kernel_static_upl(theMap, + kernelStart, + &ioplSize, + &iopl.fIOPL, + baseInfo, + &numPageInfo, + &highPage); + } + else if (_memRef) { + memory_object_offset_t entryOffset; + + entryOffset = mdOffset; + entryOffset = (entryOffset - iopl.fPageOffset - memRefEntry->offset); + if (entryOffset >= memRefEntry->size) { + memRefEntry++; + if (memRefEntry >= &_memRef->entries[_memRef->count]) panic("memRefEntry"); + entryOffset = 0; + } + if (ioplSize > (memRefEntry->size - entryOffset)) ioplSize = (memRefEntry->size - entryOffset); + error = memory_object_iopl_request(memRefEntry->entry, + entryOffset, + &ioplSize, + &iopl.fIOPL, + baseInfo, + &numPageInfo, + &ioplFlags); + } + else { + assert(theMap); + error = vm_map_create_upl(theMap, + startPage, + (upl_size_t*)&ioplSize, + &iopl.fIOPL, + baseInfo, + &numPageInfo, + &ioplFlags); + } - error = kIOReturnCannotWire; + if (error != KERN_SUCCESS) goto abortExit; - if (baseInfo->device) { - numPageInfo = 1; - iopl.fFlags = kIOPLOnDevice; - } - else { - iopl.fFlags = 0; - } + assert(ioplSize); - iopl.fIOMDOffset = mdOffset; - iopl.fPageInfo = pageIndex; - if (mapper && pageIndex && (page_mask & (mdOffset + iopl.fPageOffset))) dataP->fDiscontig = true; + if (iopl.fIOPL) + highPage = upl_get_highest_page(iopl.fIOPL); + if (highPage > highestPage) + highestPage = highPage; -#if 0 - // used to remove the upl for auto prepares here, for some errant code - // that freed memory before the descriptor pointing at it - if ((_flags & kIOMemoryAutoPrepare) && iopl.fIOPL) - { - upl_commit(iopl.fIOPL, 0, 0); - upl_deallocate(iopl.fIOPL); - iopl.fIOPL = 0; - } -#endif + if (baseInfo->device) { + numPageInfo = 1; + iopl.fFlags = kIOPLOnDevice; + } + else { + iopl.fFlags = 0; + } - if (!_memoryEntries->appendBytes(&iopl, 
sizeof(iopl))) { - // Clean up partial created and unsaved iopl - if (iopl.fIOPL) { - upl_abort(iopl.fIOPL, 0); - upl_deallocate(iopl.fIOPL); + iopl.fIOMDOffset = mdOffset; + iopl.fPageInfo = pageIndex; + if (mapper && pageIndex && (page_mask & (mdOffset + iopl.fPageOffset))) dataP->fDiscontig = true; + + if (!_memoryEntries->appendBytes(&iopl, sizeof(iopl))) { + // Clean up partial created and unsaved iopl + if (iopl.fIOPL) { + upl_abort(iopl.fIOPL, 0); + upl_deallocate(iopl.fIOPL); + } + goto abortExit; + } + dataP = 0; + + // Check for a multiple iopl's in one virtual range + pageIndex += numPageInfo; + mdOffset -= iopl.fPageOffset; + if (ioplSize < numBytes) { + numBytes -= ioplSize; + startPage += ioplSize; + mdOffset += ioplSize; + iopl.fPageOffset = 0; + if (mapper) iopl.fMappedPage = mapBase + pageIndex; + } + else { + mdOffset += numBytes; + break; } - goto abortExit; - } - dataP = 0; - - // Check for a multiple iopl's in one virtual range - pageIndex += numPageInfo; - mdOffset -= iopl.fPageOffset; - if (ioplSize < numBytes) { - numBytes -= ioplSize; - startPage += ioplSize; - mdOffset += ioplSize; - iopl.fPageOffset = 0; - if (mapper) iopl.fMappedPage = mapBase + pageIndex; - } - else { - mdOffset += numBytes; - break; } } - } - _highestPage = highestPage; + _highestPage = highestPage; - if (UPL_COPYOUT_FROM & uplFlags) _flags |= kIOMemoryPreparedReadOnly; + if (UPL_COPYOUT_FROM & uplFlags) _flags |= kIOMemoryPreparedReadOnly; + } - if ((kIOTracking & gIOKitDebug) - //&& !(_flags & kIOMemoryAutoPrepare) - ) +#if IOTRACKING + if (kIOReturnSuccess == error) { + vm_tag_t tag; + dataP = getDataP(_memoryEntries); -#if IOTRACKING - IOTrackingAdd(gIOWireTracking, &dataP->fWireTracking, ptoa(_pages), false); -#endif + if (forDirection & kIODirectionDMACommand) tag = (forDirection & kIODirectionDMACommandMask) >> kIODirectionDMACommandShift; + else tag = IOMemoryTag(kernel_map); + + if (!_wireCount) vm_tag_set_init(&dataP->fWireTags, kMaxWireTags); + 
vm_tag_set_enter(&dataP->fWireTags, kMaxWireTags, tag); + + IOMemoryDescriptorUpdateWireOwner(dataP, _memoryEntries, tag); + if (!_wireCount) + { + //if (!(_flags & kIOMemoryAutoPrepare)) + IOTrackingAdd(gIOWireTracking, &dataP->fWireTracking, ptoa(_pages), false); + } } +#endif /* IOTRACKING */ - return kIOReturnSuccess; + return (error); abortExit: { @@ -3129,7 +3203,7 @@ IOReturn IOGeneralMemoryDescriptor::dmaMap( IODMAMapPageList dmaPageList = { - .pageOffset = ioplList->fPageOffset & page_mask, + .pageOffset = (uint32_t)(ioplList->fPageOffset & page_mask), .pageListCount = _pages, .pageList = &pageList[0] }; @@ -3152,33 +3226,37 @@ IOReturn IOGeneralMemoryDescriptor::dmaMap( IOReturn IOGeneralMemoryDescriptor::prepare(IODirection forDirection) { - IOReturn error = kIOReturnSuccess; + IOReturn error = kIOReturnSuccess; IOOptionBits type = _flags & kIOMemoryTypeMask; if ((kIOMemoryTypePhysical == type) || (kIOMemoryTypePhysical64 == type)) return kIOReturnSuccess; - if (_prepareLock) - IOLockLock(_prepareLock); + if (_prepareLock) IOLockLock(_prepareLock); + if (kIODirectionDMACommand & forDirection) + { +#if IOMD_DEBUG_DMAACTIVE + OSIncrementAtomic(&__iomd_reservedA); +#endif /* IOMD_DEBUG_DMAACTIVE */ + } if (kIOMemoryTypeVirtual == type || kIOMemoryTypeVirtual64 == type || kIOMemoryTypeUIO == type) { - error = wireVirtual(forDirection); + error = wireVirtual(forDirection); } - if (kIOReturnSuccess == error) + if ((kIOReturnSuccess == error) && !(kIODirectionDMACommand & forDirection)) { - if (1 == ++_wireCount) - { - if (kIOMemoryClearEncrypt & _flags) - { - performOperation(kIOMemoryClearEncrypted, 0, _length); - } - } + if (1 == ++_wireCount) + { + if (kIOMemoryClearEncrypt & _flags) + { + performOperation(kIOMemoryClearEncrypted, 0, _length); + } + } } - if (_prepareLock) - IOLockUnlock(_prepareLock); + if (_prepareLock) IOLockUnlock(_prepareLock); return error; } @@ -3195,87 +3273,103 @@ IOReturn IOGeneralMemoryDescriptor::prepare(IODirection 
forDirection) IOReturn IOGeneralMemoryDescriptor::complete(IODirection forDirection) { IOOptionBits type = _flags & kIOMemoryTypeMask; - ioGMDData * dataP; + ioGMDData * dataP; if ((kIOMemoryTypePhysical == type) || (kIOMemoryTypePhysical64 == type)) return kIOReturnSuccess; - if (_prepareLock) - IOLockLock(_prepareLock); + if (_prepareLock) IOLockLock(_prepareLock); + do + { + assert(_wireCount); + if (!_wireCount) break; + dataP = getDataP(_memoryEntries); + if (!dataP) break; - assert(_wireCount); +#if IOMD_DEBUG_DMAACTIVE + if (kIODirectionDMACommand & forDirection) + { + if (__iomd_reservedA) OSDecrementAtomic(&__iomd_reservedA); + else panic("kIOMDSetDMAInactive"); + } +#endif /* IOMD_DEBUG_DMAACTIVE */ +#if IOTRACKING + if (kIOMemoryTypeVirtual == type || kIOMemoryTypeVirtual64 == type || kIOMemoryTypeUIO == type) + { + vm_tag_t tag; - if ((kIODirectionCompleteWithError & forDirection) - && (dataP = getDataP(_memoryEntries))) - dataP->fCompletionError = true; + if (forDirection & kIODirectionDMACommand) tag = (forDirection & kIODirectionDMACommandMask) >> kIODirectionDMACommandShift; + else tag = IOMemoryTag(kernel_map); + vm_tag_set_remove(&dataP->fWireTags, kMaxWireTags, tag, &tag); + IOMemoryDescriptorUpdateWireOwner(dataP, _memoryEntries, tag); + } + if (kIODirectionDMACommand & forDirection) break; +#endif /* IOTRACKING */ + + if (kIODirectionCompleteWithError & forDirection) dataP->fCompletionError = true; - if (_wireCount) - { if ((kIOMemoryClearEncrypt & _flags) && (1 == _wireCount)) { performOperation(kIOMemorySetEncrypted, 0, _length); } - _wireCount--; - if (!_wireCount || (kIODirectionCompleteWithDataValid & forDirection)) - { - IOOptionBits type = _flags & kIOMemoryTypeMask; - dataP = getDataP(_memoryEntries); - ioPLBlock *ioplList = getIOPLList(dataP); - UInt ind, count = getNumIOPL(_memoryEntries, dataP); + _wireCount--; + if (!_wireCount || (kIODirectionCompleteWithDataValid & forDirection)) + { + ioPLBlock *ioplList = getIOPLList(dataP); + 
UInt ind, count = getNumIOPL(_memoryEntries, dataP); - if (_wireCount) - { - // kIODirectionCompleteWithDataValid & forDirection - if (kIOMemoryTypeVirtual == type || kIOMemoryTypeVirtual64 == type || kIOMemoryTypeUIO == type) - { - for (ind = 0; ind < count; ind++) - { - if (ioplList[ind].fIOPL) iopl_valid_data(ioplList[ind].fIOPL); - } - } - } - else - { + if (_wireCount) + { + // kIODirectionCompleteWithDataValid & forDirection + if (kIOMemoryTypeVirtual == type || kIOMemoryTypeVirtual64 == type || kIOMemoryTypeUIO == type) + { + for (ind = 0; ind < count; ind++) + { + if (ioplList[ind].fIOPL) iopl_valid_data(ioplList[ind].fIOPL); + } + } + } + else + { #if IOMD_DEBUG_DMAACTIVE - if (__iomd_reservedA) panic("complete() while dma active"); + if (__iomd_reservedA) panic("complete() while dma active"); #endif /* IOMD_DEBUG_DMAACTIVE */ - if (dataP->fMappedBase) { - dataP->fMapper->iovmUnmapMemory(this, NULL, dataP->fMappedBase, dataP->fMappedLength); - dataP->fMappedBase = 0; - } - // Only complete iopls that we created which are for TypeVirtual - if (kIOMemoryTypeVirtual == type || kIOMemoryTypeVirtual64 == type || kIOMemoryTypeUIO == type) { + if (dataP->fMappedBase) { + dataP->fMapper->iovmUnmapMemory(this, NULL, dataP->fMappedBase, dataP->fMappedLength); + dataP->fMappedBase = 0; + } + // Only complete iopls that we created which are for TypeVirtual + if (kIOMemoryTypeVirtual == type || kIOMemoryTypeVirtual64 == type || kIOMemoryTypeUIO == type) { #if IOTRACKING - if ((kIOTracking & gIOKitDebug) - //&& !(_flags & kIOMemoryAutoPrepare) - ) - { - IOTrackingRemove(gIOWireTracking, &dataP->fWireTracking, ptoa(_pages)); - } -#endif - for (ind = 0; ind < count; ind++) - if (ioplList[ind].fIOPL) { - if (dataP->fCompletionError) - upl_abort(ioplList[ind].fIOPL, 0 /*!UPL_ABORT_DUMP_PAGES*/); - else - upl_commit(ioplList[ind].fIOPL, 0, 0); - upl_deallocate(ioplList[ind].fIOPL); - } - } else if (kIOMemoryTypeUPL == type) { - upl_set_referenced(ioplList[0].fIOPL, false); - 
} + //if (!(_flags & kIOMemoryAutoPrepare)) + { + IOTrackingRemove(gIOWireTracking, &dataP->fWireTracking, ptoa(_pages)); + } +#endif /* IOTRACKING */ + for (ind = 0; ind < count; ind++) + if (ioplList[ind].fIOPL) { + if (dataP->fCompletionError) + upl_abort(ioplList[ind].fIOPL, 0 /*!UPL_ABORT_DUMP_PAGES*/); + else + upl_commit(ioplList[ind].fIOPL, 0, 0); + upl_deallocate(ioplList[ind].fIOPL); + } + } else if (kIOMemoryTypeUPL == type) { + upl_set_referenced(ioplList[0].fIOPL, false); + } - (void) _memoryEntries->initWithBytes(dataP, computeDataSize(0, 0)); // == setLength() + (void) _memoryEntries->initWithBytes(dataP, computeDataSize(0, 0)); // == setLength() - dataP->fPreparationID = kIOPreparationIDUnprepared; - } - } + dataP->fPreparationID = kIOPreparationIDUnprepared; + dataP->fAllocTag = VM_KERN_MEMORY_NONE; + } + } } + while (false); - if (_prepareLock) - IOLockUnlock(_prepareLock); + if (_prepareLock) IOLockUnlock(_prepareLock); return kIOReturnSuccess; } @@ -3364,7 +3458,7 @@ IOReturn IOGeneralMemoryDescriptor::doMap( size = round_page(mapping->fLength); flags = UPL_COPYOUT_FROM | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE | UPL_BLOCK_ACCESS - | UPL_MEMORY_TAG_MAKE(IOMemoryTag(kernel_map)); + | UPL_MEMORY_TAG_MAKE(getVMTag(kernel_map)); if (KERN_SUCCESS != memory_object_iopl_request(_memRef->entries[0].entry, 0, &size, &redirUPL2, NULL, NULL, @@ -3411,8 +3505,12 @@ IOReturn IOGeneralMemoryDescriptor::doMap( { err = memoryReferenceMap(_memRef, mapping->fAddressMap, offset, length, options, &mapping->fAddress); #if IOTRACKING - if (err == KERN_SUCCESS) IOTrackingAdd(gIOMapTracking, &mapping->fTracking, length, false); -#endif + if ((err == KERN_SUCCESS) && ((kIOTracking & gIOKitDebug) || _task)) + { + // only dram maps in the default on developement case + IOTrackingAddUser(gIOMapTracking, &mapping->fTracking, mapping->fLength); + } +#endif /* IOTRACKING */ if ((err == KERN_SUCCESS) && pager) { err = populateDevicePager(pager, mapping->fAddressMap, 
mapping->fAddress, offset, length, options); @@ -3428,6 +3526,25 @@ IOReturn IOGeneralMemoryDescriptor::doMap( return (err); } +#if IOTRACKING +IOReturn +IOMemoryMapTracking(IOTrackingUser * tracking, task_t * task, + mach_vm_address_t * address, mach_vm_size_t * size) +{ +#define iomap_offsetof(type, field) ((size_t)(&((type *)0)->field)) + + IOMemoryMap * map = (typeof(map)) (((uintptr_t) tracking) - iomap_offsetof(IOMemoryMap, fTracking)); + + if (!map->fAddressMap || (map->fAddressMap != get_task_map(map->fAddressTask))) return (kIOReturnNotReady); + + *task = map->fAddressTask; + *address = map->fAddress; + *size = map->fLength; + + return (kIOReturnSuccess); +} +#endif /* IOTRACKING */ + IOReturn IOGeneralMemoryDescriptor::doUnmap( vm_map_t addressMap, IOVirtualAddress __address, @@ -3658,8 +3775,8 @@ IOReturn IOMemoryDescriptor::doUnmap( } #if IOTRACKING - IOTrackingRemove(gIOMapTracking, &mapping->fTracking, length); -#endif + IOTrackingRemoveUser(gIOMapTracking, &mapping->fTracking); +#endif /* IOTRACKING */ return (err); } @@ -3797,8 +3914,8 @@ void IOMemoryMap::taskDied( void ) LOCK; if (fUserClientUnmap) unmap(); #if IOTRACKING - else IOTrackingRemove(gIOMapTracking, &fTracking, fLength); -#endif + else IOTrackingRemoveUser(gIOMapTracking, &fTracking); +#endif /* IOTRACKING */ if( fAddressMap) { vm_map_deallocate(fAddressMap); @@ -3964,7 +4081,7 @@ IOReturn IOMemoryMap::wireRange( prot = (kIODirectionOutIn & options); if (prot) { - prot |= VM_PROT_MEMORY_TAG_MAKE(IOMemoryTag(kernel_map)); + prot |= VM_PROT_MEMORY_TAG_MAKE(fMemory->getVMTag(kernel_map)); kr = vm_map_wire(fAddressMap, start, end, prot, FALSE); } else @@ -4128,7 +4245,7 @@ IOReturn IOMemoryMap::redirect(IOMemoryDescriptor * newBackingMemory, upl_size_t size = round_page(fLength); upl_control_flags_t flags = UPL_COPYOUT_FROM | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE | UPL_BLOCK_ACCESS - | UPL_MEMORY_TAG_MAKE(IOMemoryTag(kernel_map)); + | 
UPL_MEMORY_TAG_MAKE(fMemory->getVMTag(kernel_map)); if (KERN_SUCCESS != memory_object_iopl_request(fMemory->_memRef->entries[0].entry, 0, &size, &fRedirUPL, NULL, NULL, &flags)) diff --git a/iokit/Kernel/IOMultiMemoryDescriptor.cpp b/iokit/Kernel/IOMultiMemoryDescriptor.cpp index fd17233c1..13a5a39a6 100644 --- a/iokit/Kernel/IOMultiMemoryDescriptor.cpp +++ b/iokit/Kernel/IOMultiMemoryDescriptor.cpp @@ -276,26 +276,38 @@ IOReturn IOMultiMemoryDescriptor::doMap(vm_map_t __addressMap, { prot = VM_PROT_READ; if (!(kIOMapReadOnly & options)) prot |= VM_PROT_WRITE; - ref.map = map; - ref.tag = IOMemoryTag(map); - ref.options = options; - ref.size = length; - ref.prot = prot; - if (options & kIOMapAnywhere) - // vm_map looks for addresses above here, even when VM_FLAGS_ANYWHERE - ref.mapped = 0; - else - ref.mapped = mapping->fAddress; - - if ((ref.map == kernel_map) && (kIOMemoryBufferPageable & _flags)) - err = IOIteratePageableMaps(ref.size, &IOMemoryDescriptorMapAlloc, &ref); - else - err = IOMemoryDescriptorMapAlloc(ref.map, &ref); - - if (KERN_SUCCESS != err) break; - - address = ref.mapped; - mapping->fAddress = address; + + if (kIOMapOverwrite & options) + { + if ((map == kernel_map) && (kIOMemoryBufferPageable & _flags)) + { + map = IOPageableMapForAddress(address); + } + err = KERN_SUCCESS; + } + else + { + ref.map = map; + ref.tag = IOMemoryTag(map); + ref.options = options; + ref.size = length; + ref.prot = prot; + if (options & kIOMapAnywhere) + // vm_map looks for addresses above here, even when VM_FLAGS_ANYWHERE + ref.mapped = 0; + else + ref.mapped = mapping->fAddress; + + if ((ref.map == kernel_map) && (kIOMemoryBufferPageable & _flags)) + err = IOIteratePageableMaps(ref.size, &IOMemoryDescriptorMapAlloc, &ref); + else + err = IOMemoryDescriptorMapAlloc(ref.map, &ref); + + if (KERN_SUCCESS != err) break; + + address = ref.mapped; + mapping->fAddress = address; + } mapOffset = offset; bytesRemaining = length; @@ -329,14 +341,9 @@ IOReturn 
IOMultiMemoryDescriptor::doMap(vm_map_t __addressMap, if (kIOReturnSuccess == err) { #if IOTRACKING - IOTrackingAdd(gIOMapTracking, &mapping->fTracking, length, false); + IOTrackingAddUser(gIOMapTracking, &mapping->fTracking, mapping->fLength); #endif } - else - { - mapping->release(); - mapping = 0; - } return (err); } @@ -348,6 +355,7 @@ IOReturn IOMultiMemoryDescriptor::setPurgeable( IOOptionBits newState, IOOptionBits totalState, state; totalState = kIOMemoryPurgeableNonVolatile; + err = kIOReturnSuccess; for (unsigned index = 0; index < _descriptorsCount; index++) { err = _descriptors[index]->setPurgeable(newState, &state); diff --git a/iokit/Kernel/IOPMrootDomain.cpp b/iokit/Kernel/IOPMrootDomain.cpp index 9a5ccd20b..6e9701b18 100644 --- a/iokit/Kernel/IOPMrootDomain.cpp +++ b/iokit/Kernel/IOPMrootDomain.cpp @@ -171,7 +171,6 @@ IOReturn OSKextSystemSleepOrWake( UInt32 ); } extern "C" ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va); extern "C" addr64_t kvtophys(vm_offset_t va); -extern "C" int stack_snapshot_from_kernel(pid_t pid, void *buf, uint32_t size, uint32_t flags, unsigned *bytesTraced); static void idleSleepTimerExpired( thread_call_param_t, thread_call_param_t ); static void notifySystemShutdown( IOService * root, uint32_t messageType ); @@ -189,6 +188,7 @@ static const OSSymbol *sleepMessagePEFunction = NULL; #define kDefaultWranglerIdlePeriod 25 // in milliseconds #define kIOSleepWakeDebugKey "Persistent-memory-note" +#define kIOEFIBootRomFailureKey "wake-failure" #define kRD_AllPowerSources (kIOPMSupportedOnAC \ | kIOPMSupportedOnBatt \ @@ -325,6 +325,8 @@ struct timeval gIOLastWakeTime; static char gWakeReasonString[128]; static bool gWakeReasonSysctlRegistered = false; +static AbsoluteTime gIOLastWakeAbsTime; +static AbsoluteTime gIOLastSleepAbsTime; #if defined(__i386__) || defined(__x86_64__) static bool gSpinDumpBufferFull = false; @@ -435,9 +437,8 @@ class PMTraceWorker : public OSObject static PMTraceWorker *tracer( IOPMrootDomain * ); 
void tracePCIPowerChange(change_t, IOService *, uint32_t, uint32_t); void tracePoint(uint8_t phase); - void tracePoint(uint8_t phase, uint8_t data8); void traceDetail(uint32_t detail); - void traceLoginWindowPhase(uint8_t phase); + void traceComponentWakeProgress(uint32_t component, uint32_t data); int recordTopLevelPCIDevice(IOService *); void RTC_TRACE(void); virtual bool serialize(OSSerialize *s) const APPLE_KEXT_OVERRIDE; @@ -445,16 +446,19 @@ class PMTraceWorker : public OSObject IOPMTracePointHandler tracePointHandler; void * tracePointTarget; uint64_t getPMStatusCode(); + uint8_t getTracePhase(); + uint32_t getTraceData(); private: IOPMrootDomain *owner; - IOLock *pciMappingLock; + IOLock *pmTraceWorkerLock; OSArray *pciDeviceBitMappings; uint8_t addedToRegistry; uint8_t tracePhase; - uint8_t loginWindowPhase; - uint8_t traceData8; uint32_t traceData32; + uint8_t loginWindowData; + uint8_t coreDisplayData; + uint8_t coreGraphicsData; }; /* @@ -681,6 +685,69 @@ void IOPMrootDomain::updateConsoleUsers(void) //****************************************************************************** +static void swdDebugSetupCallout( thread_call_param_t p0, thread_call_param_t p1 ) +{ + IOPMrootDomain * rootDomain = (IOPMrootDomain *) p0; + uint32_t notifyRef = (uint32_t)(uintptr_t) p1; + + rootDomain->swdDebugSetup(); + + if (p1) { + rootDomain->allowPowerChange(notifyRef); + } + DLOG("swdDebugSetupCallout finish\n"); +} + +void IOPMrootDomain::swdDebugSetup( ) +{ +#if HIBERNATION + static int32_t mem_only = -1; + if ((mem_only == -1) && + (PE_parse_boot_argn("swd_mem_only", &mem_only, sizeof(mem_only)) == false)) { + mem_only = 0; + } + + if ((mem_only == 1) || (gRootDomain->sleepWakeDebugIsWdogEnabled() == false)) { + return; + } + DLOG("swdDebugSetup state:%d\n", swd_DebugImageSetup); + if (swd_DebugImageSetup == FALSE) { + swd_DebugImageSetup = TRUE; + IOOpenDebugDataFile(kSleepWakeStackBinFilename, SWD_BUF_SIZE); + } +#endif + + +} + +static void 
swdDebugTeardownCallout( thread_call_param_t p0, thread_call_param_t p1 ) +{ + IOPMrootDomain * rootDomain = (IOPMrootDomain *) p0; + uint32_t notifyRef = (uint32_t)(uintptr_t) p1; + + rootDomain->swdDebugTeardown(); + if (p1) { + rootDomain->allowPowerChange(notifyRef); + } + DLOG("swdDebugTeardownCallout finish\n"); +} + +void IOPMrootDomain::swdDebugTeardown( ) +{ + +#if HIBERNATION + DLOG("swdDebugTeardown state:%d\n", swd_DebugImageSetup); + if (swd_DebugImageSetup == TRUE) { + swd_DebugImageSetup = FALSE; + IOCloseDebugDataFile(); + } +#endif + + +} +//****************************************************************************** + + static void disk_sync_callout( thread_call_param_t p0, thread_call_param_t p1 ) { IOService * rootDomain = (IOService *) p0; @@ -692,10 +759,12 @@ static void disk_sync_callout( thread_call_param_t p0, thread_call_param_t p1 ) if (ON_STATE == powerState) { sync_internal(); + swdDebugSetupCallout(p0, NULL); } #if HIBERNATION else { + swdDebugTeardownCallout(p0, NULL); IOHibernateSystemPostWake(); if (gRootDomain) @@ -708,21 +777,6 @@ static void disk_sync_callout( thread_call_param_t p0, thread_call_param_t p1 ) } //****************************************************************************** - -static void hib_debugSetup_callout( thread_call_param_t p0, thread_call_param_t p1 ) -{ - IOService * rootDomain = (IOService *) p0; - uint32_t notifyRef = (uint32_t)(uintptr_t) p1; - -#if HIBERNATION - IOOpenDebugDataFile(kSleepWakeStackBinFilename, SWD_BUF_SIZE); -#endif - - rootDomain->allowPowerChange(notifyRef); - DLOG("hib_debugSetup_callout finish\n"); -} -//****************************************************************************** - static UInt32 computeDeltaTimeMS( const AbsoluteTime * startTime ) { AbsoluteTime endTime; @@ -769,6 +823,8 @@ static SYSCTL_PROC(_kern, OID_AUTO, waketime, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED, &gIOLastWakeTime, 0, sysctl_sleepwaketime, "S,timeval", ""); 
+SYSCTL_QUAD(_kern, OID_AUTO, wake_abs_time, CTLFLAG_RD|CTLFLAG_LOCKED, &gIOLastWakeAbsTime, ""); +SYSCTL_QUAD(_kern, OID_AUTO, sleep_abs_time, CTLFLAG_RD|CTLFLAG_LOCKED, &gIOLastSleepAbsTime, ""); static int sysctl_willshutdown @@ -992,10 +1048,12 @@ bool IOPMrootDomain::start( IOService * nub ) diskSyncCalloutEntry = thread_call_allocate( &disk_sync_callout, (thread_call_param_t) this); - hibDebugSetupEntry = thread_call_allocate( - &hib_debugSetup_callout, + swdDebugSetupEntry = thread_call_allocate( + &swdDebugSetupCallout, + (thread_call_param_t) this); + swdDebugTearDownEntry = thread_call_allocate( + &swdDebugTeardownCallout, (thread_call_param_t) this); - updateConsoleUsersEntry = thread_call_allocate( &updateConsoleUsersCallout, (thread_call_param_t) this); @@ -1082,16 +1140,14 @@ bool IOPMrootDomain::start( IOService * nub ) preventSystemSleepList = OSSet::withCapacity(2); PMinit(); // creates gIOPMWorkLoop + gIOPMWorkLoop = getIOPMWorkloop(); // Create IOPMPowerStateQueue used to queue external power // events, and to handle those events on the PM work loop. 
pmPowerStateQueue = IOPMPowerStateQueue::PMPowerStateQueue( this, OSMemberFunctionCast(IOEventSource::Action, this, &IOPMrootDomain::dispatchPowerEvent)); - getPMworkloop()->addEventSource(pmPowerStateQueue); -#ifdef CHECK_THREAD_CONTEXT - gIOPMWorkLoop = getPMworkloop(); -#endif + gIOPMWorkLoop->addEventSource(pmPowerStateQueue); // create our power parent patriarch = new IORootParent; @@ -1209,7 +1265,9 @@ IOReturn IOPMrootDomain::setProperties( OSObject * props_obj ) const OSSymbol *idle_seconds_string = OSSymbol::withCString("System Idle Seconds"); const OSSymbol *sleepdisabled_string = OSSymbol::withCString("SleepDisabled"); const OSSymbol *ondeck_sleepwake_uuid_string = OSSymbol::withCString(kIOPMSleepWakeUUIDKey); - const OSSymbol *loginwindow_tracepoint_string = OSSymbol::withCString(kIOPMLoginWindowSecurityDebugKey); + const OSSymbol *loginwindow_progress_string = OSSymbol::withCString(kIOPMLoginWindowProgressKey); + const OSSymbol *coredisplay_progress_string = OSSymbol::withCString(kIOPMCoreDisplayProgressKey); + const OSSymbol *coregraphics_progress_string = OSSymbol::withCString(kIOPMCoreGraphicsProgressKey); #if HIBERNATION const OSSymbol *hibernatemode_string = OSSymbol::withCString(kIOHibernateModeKey); const OSSymbol *hibernatefile_string = OSSymbol::withCString(kIOHibernateFileKey); @@ -1290,10 +1348,29 @@ IOReturn IOPMrootDomain::setProperties( OSObject * props_obj ) obj->retain(); pmPowerStateQueue->submitPowerEvent(kPowerEventQueueSleepWakeUUID, (void *)obj); } - else if (key->isEqualTo(loginwindow_tracepoint_string)) + else if (key->isEqualTo(loginwindow_progress_string)) { - if (pmTracer && (n = OSDynamicCast(OSNumber, obj))) - pmTracer->traceLoginWindowPhase(n->unsigned8BitValue()); + if (pmTracer && (n = OSDynamicCast(OSNumber, obj))) { + uint32_t data = n->unsigned32BitValue(); + pmTracer->traceComponentWakeProgress(kIOPMLoginWindowProgress, data); + kdebugTrace(kPMLogComponentWakeProgress, 0, kIOPMLoginWindowProgress, data); + } + } + 
else if (key->isEqualTo(coredisplay_progress_string)) + { + if (pmTracer && (n = OSDynamicCast(OSNumber, obj))) { + uint32_t data = n->unsigned32BitValue(); + pmTracer->traceComponentWakeProgress(kIOPMCoreDisplayProgress, data); + kdebugTrace(kPMLogComponentWakeProgress, 0, kIOPMCoreDisplayProgress, data); + } + } + else if (key->isEqualTo(coregraphics_progress_string)) + { + if (pmTracer && (n = OSDynamicCast(OSNumber, obj))) { + uint32_t data = n->unsigned32BitValue(); + pmTracer->traceComponentWakeProgress(kIOPMCoreGraphicsProgress, data); + kdebugTrace(kPMLogComponentWakeProgress, 0, kIOPMCoreGraphicsProgress, data); + } } else if (key->isEqualTo(kIOPMDeepSleepEnabledKey) || key->isEqualTo(kIOPMDestroyFVKeyOnStandbyKey) || @@ -1372,7 +1449,9 @@ IOReturn IOPMrootDomain::setProperties( OSObject * props_obj ) if(idle_seconds_string) idle_seconds_string->release(); if(sleepdisabled_string) sleepdisabled_string->release(); if(ondeck_sleepwake_uuid_string) ondeck_sleepwake_uuid_string->release(); - if(loginwindow_tracepoint_string) loginwindow_tracepoint_string->release(); + if(loginwindow_progress_string) loginwindow_progress_string->release(); + if(coredisplay_progress_string) coredisplay_progress_string->release(); + if(coregraphics_progress_string) coregraphics_progress_string->release(); #if HIBERNATION if(hibernatemode_string) hibernatemode_string->release(); if(hibernatefile_string) hibernatefile_string->release(); @@ -1844,7 +1923,7 @@ void IOPMrootDomain::broadcastAggressives( if (!connect || !connect->getReadyFlag()) continue; - if ((service = (IOService *) connect->copyChildEntry(gIOPowerPlane))) + if ((service = OSDynamicCast(IOService, connect->copyChildEntry(gIOPowerPlane)))) { if (service->assertPMDriverCall(&callEntry)) { @@ -1915,11 +1994,11 @@ void IOPMrootDomain::cancelIdleSleepTimer( void ) thread_call_cancel(extraSleepTimer); idleSleepTimerPending = false; - if (!assertOnWakeSecs && systemWakeTime) { + if (!assertOnWakeSecs && gIOLastWakeAbsTime) 
{ AbsoluteTime now; clock_usec_t microsecs; clock_get_uptime(&now); - SUB_ABSOLUTETIME(&now, &systemWakeTime); + SUB_ABSOLUTETIME(&now, &gIOLastWakeAbsTime); absolutetime_to_microtime(now, &assertOnWakeSecs, &microsecs); if (assertOnWakeReport) { HISTREPORT_TALLYVALUE(assertOnWakeReport, (int64_t)assertOnWakeSecs); @@ -1949,9 +2028,9 @@ static void idleSleepTimerExpired( void IOPMrootDomain::handleSleepTimerExpiration( void ) { - if (!getPMworkloop()->inGate()) + if (!gIOPMWorkLoop->inGate()) { - getPMworkloop()->runAction( + gIOPMWorkLoop->runAction( OSMemberFunctionCast(IOWorkLoop::Action, this, &IOPMrootDomain::handleSleepTimerExpiration), this); @@ -2131,6 +2210,7 @@ void IOPMrootDomain::powerChangeDone( unsigned long previousPowerState ) gIOLastSleepTime.tv_usec = microsecs; gIOLastWakeTime.tv_sec = 0; gIOLastWakeTime.tv_usec = 0; + gIOLastSleepAbsTime = now; if (wake2DarkwakeDelay && sleepDelaysReport) { clock_usec_t microsecs; @@ -2164,12 +2244,15 @@ void IOPMrootDomain::powerChangeDone( unsigned long previousPowerState ) } assertOnWakeSecs = 0; ((IOService *)this)->stop_watchdog_timer(); //14456299 + lowBatteryCondition = false; + getPlatform()->sleepKernel(); // The CPU(s) are off at this point, // Code will resume execution here upon wake. - clock_get_uptime(&systemWakeTime); + clock_get_uptime(&gIOLastWakeAbsTime); + IOLog("gIOLastWakeAbsTime: %lld\n", gIOLastWakeAbsTime); _highestCapability = 0; ((IOService *)this)->start_watchdog_timer(); //14456299 @@ -2196,20 +2279,14 @@ void IOPMrootDomain::powerChangeDone( unsigned long previousPowerState ) LOG("System %sWake\n", gIOHibernateState ?
"SafeSleep " : ""); #endif - // log system wake - PMDebug(kPMLogSystemWake, 0, 0); - lowBatteryCondition = false; lastSleepReason = 0; _lastDebugWakeSeconds = _debugWakeSeconds; _debugWakeSeconds = 0; _scheduledAlarms = 0; -#ifndef __LP64__ - systemWake(); -#endif - #if defined(__i386__) || defined(__x86_64__) + kdebugTrace(kPMLogSystemWake, 0, 0, 0); wranglerTickled = false; graphicsSuppressed = false; darkWakePostTickle = false; @@ -2356,6 +2433,7 @@ void IOPMrootDomain::powerChangeDone( unsigned long previousPowerState ) } } #else /* !__i386__ && !__x86_64__ */ + kdebugTrace(kPMLogSystemWake, 0, ml_get_wake_timebase() >> 32, ml_get_wake_timebase()); // stay awake for at least 30 seconds wranglerTickled = true; fullWakeReason = kFullWakeReasonLocalUser; @@ -2468,6 +2546,9 @@ bool IOPMrootDomain::updatePreventIdleSleepList( } #endif + MSG("prevent idle sleep list: %s%c (%u)\n", + service->getName(), + (addNotRemove) ? '+' : '-', newCount); return true; } @@ -2492,11 +2573,11 @@ void IOPMrootDomain::updatePreventSystemSleepList( preventSystemSleepList->setObject(service); DLOG("prevent system sleep list: %s+ (%u)\n", service->getName(), preventSystemSleepList->getCount()); - if (!assertOnWakeSecs && systemWakeTime) { + if (!assertOnWakeSecs && gIOLastWakeAbsTime) { AbsoluteTime now; clock_usec_t microsecs; clock_get_uptime(&now); - SUB_ABSOLUTETIME(&now, &systemWakeTime); + SUB_ABSOLUTETIME(&now, &gIOLastWakeAbsTime); absolutetime_to_microtime(now, &assertOnWakeSecs, &microsecs); if (assertOnWakeReport) { HISTREPORT_TALLYVALUE(assertOnWakeReport, (int64_t)assertOnWakeSecs); @@ -2529,9 +2610,9 @@ void IOPMrootDomain::copySleepPreventersList(OSArray **idleSleepList, OSArray ** OSObject *object = NULL; OSArray *array = NULL; - if (!getPMworkloop()->inGate()) + if (!gIOPMWorkLoop->inGate()) { - getPMworkloop()->runAction( + gIOPMWorkLoop->runAction( OSMemberFunctionCast(IOWorkLoop::Action, this, &IOPMrootDomain::IOPMrootDomain::copySleepPreventersList), this, (void
*)idleSleepList, (void *)systemSleepList); @@ -2804,6 +2885,26 @@ void IOPMrootDomain::tellChangeUp( unsigned long stateNum ) } } +#define CAP_WILL_CHANGE_TO_OFF(params, flag) \ + (((params)->changeFlags & kIOPMSystemCapabilityWillChange) && \ + ((params)->fromCapabilities & (flag)) && \ + (((params)->toCapabilities & (flag)) == 0)) + +#define CAP_DID_CHANGE_TO_ON(params, flag) \ + (((params)->changeFlags & kIOPMSystemCapabilityDidChange) && \ + ((params)->toCapabilities & (flag)) && \ + (((params)->fromCapabilities & (flag)) == 0)) + +#define CAP_DID_CHANGE_TO_OFF(params, flag) \ + (((params)->changeFlags & kIOPMSystemCapabilityDidChange) && \ + ((params)->fromCapabilities & (flag)) && \ + (((params)->toCapabilities & (flag)) == 0)) + +#define CAP_WILL_CHANGE_TO_ON(params, flag) \ + (((params)->changeFlags & kIOPMSystemCapabilityWillChange) && \ + ((params)->toCapabilities & (flag)) && \ + (((params)->fromCapabilities & (flag)) == 0)) + //****************************************************************************** // sysPowerDownHandler // @@ -2825,21 +2926,13 @@ IOReturn IOPMrootDomain::sysPowerDownHandler( if (messageType == kIOMessageSystemWillSleep) { #if HIBERNATION - static int32_t mem_only = -1; IOPowerStateChangeNotification *notify = - (IOPowerStateChangeNotification *)messageArgs; + (IOPowerStateChangeNotification *)messageArgs; - if ((mem_only == -1) && - (PE_parse_boot_argn("swd_mem_only", &mem_only, sizeof(mem_only)) == false)) { - mem_only = 0; - } - if ((mem_only != 1) && (gRootDomain->sleepWakeDebugIsWdogEnabled())) - { - notify->returnValue = 30 * 1000 * 1000; - thread_call_enter1( - gRootDomain->hibDebugSetupEntry, - (thread_call_param_t)(uintptr_t) notify->powerRef); - } + notify->returnValue = 30 * 1000 * 1000; + thread_call_enter1( + gRootDomain->swdDebugSetupEntry, + (thread_call_param_t)(uintptr_t) notify->powerRef); #endif } else if (messageType == kIOMessageSystemCapabilityChange) @@ -2858,12 +2951,14 @@ IOReturn 
IOPMrootDomain::sysPowerDownHandler( params->fromCapabilities, params->toCapabilities, params->changeFlags); - if ((params->changeFlags & kIOPMSystemCapabilityWillChange) && - (params->fromCapabilities & kIOPMSystemCapabilityCPU) && - (params->toCapabilities & kIOPMSystemCapabilityCPU) == 0) + if (CAP_WILL_CHANGE_TO_OFF(params, kIOPMSystemCapabilityCPU)) { // We will ack within 20 seconds params->maxWaitForReply = 20 * 1000 * 1000; + + // Remove EFI/BootRom's previous wake's failure data + PERemoveNVRAMProperty(kIOEFIBootRomFailureKey); + #if HIBERNATION gRootDomain->evaluateSystemSleepPolicyEarly(); @@ -2900,20 +2995,36 @@ IOReturn IOPMrootDomain::sysPowerDownHandler( gRootDomain->diskSyncCalloutEntry, (thread_call_param_t)(uintptr_t) params->notifyRef); } - else - if ((params->changeFlags & kIOPMSystemCapabilityDidChange) && - (params->toCapabilities & kIOPMSystemCapabilityCPU) && - (params->fromCapabilities & kIOPMSystemCapabilityCPU) == 0) - { #if HIBERNATION + else if (CAP_DID_CHANGE_TO_ON(params, kIOPMSystemCapabilityCPU)) + { // We will ack within 110 seconds params->maxWaitForReply = 110 * 1000 * 1000; thread_call_enter1( gRootDomain->diskSyncCalloutEntry, (thread_call_param_t)(uintptr_t) params->notifyRef); -#endif } + else if (CAP_WILL_CHANGE_TO_OFF(params, kIOPMSystemCapabilityGraphics) || + CAP_WILL_CHANGE_TO_ON(params, kIOPMSystemCapabilityGraphics)) + { + // WillChange for Full wake -> Darkwake + params->maxWaitForReply = 30 * 1000 * 1000; + thread_call_enter1( + gRootDomain->swdDebugSetupEntry, + (thread_call_param_t)(uintptr_t) params->notifyRef); + } + else if (CAP_DID_CHANGE_TO_OFF(params, kIOPMSystemCapabilityGraphics) || + CAP_DID_CHANGE_TO_ON(params, kIOPMSystemCapabilityGraphics)) + { + // DidChange for Full wake -> Darkwake + params->maxWaitForReply = 30 * 1000 * 1000; + thread_call_enter1( + gRootDomain->swdDebugTearDownEntry, + (thread_call_param_t)(uintptr_t) params->notifyRef); + + } +#endif ret = kIOReturnSuccess; } @@ -3530,7 +3641,7 @@ 
IOReturn IOPMrootDomain::setPMSetting( fPMSettingsDict->setObject(type, object); // Prep all PMSetting objects with the given 'type' for callout. - array = (const OSArray *) settingsCallbacks->getObject(type); + array = OSDynamicCast(OSArray, settingsCallbacks->getObject(type)); if (!array || ((capacity = array->getCount()) == 0)) goto unlock_exit; @@ -3683,7 +3794,7 @@ IOReturn IOPMrootDomain::registerPMSettingController( PMSETTING_LOCK(); for (i=0; settings[i]; i++) { - list = (OSArray *) settingsCallbacks->getObject(settings[i]); + list = OSDynamicCast(OSArray, settingsCallbacks->getObject(settings[i])); if (!list) { // New array of callbacks for this setting list = OSArray::withCapacity(1); @@ -3748,7 +3859,7 @@ void IOPMrootDomain::deregisterPMSettingObject( PMSettingObject * pmso ) { while ((sym = OSDynamicCast(OSSymbol, iter->getNextObject()))) { - array = (OSArray *) settingsCallbacks->getObject(sym); + array = OSDynamicCast(OSArray, settingsCallbacks->getObject(sym)); index = array->getNextIndexOfObject(pmso, 0); if (-1 != index) { array->removeObject(index); @@ -3800,9 +3911,9 @@ void IOPMrootDomain::informCPUStateChange( varInfoStruct.varType = vBool; varInfoStruct.varInitValue = value; varInfoStruct.varCurValue = value; - strncpy( (char *)varInfoStruct.varName, + strlcpy( (char *)varInfoStruct.varName, (const char *)varNameStr, - strlen(varNameStr) + 1 ); + sizeof(varInfoStruct.varName)); // Set! 
pmCPUret = pmCPUControl( PMIOCSETVARINFO, (void *)&varInfoStruct ); @@ -3955,8 +4066,13 @@ bool IOPMrootDomain::evaluateSystemSleepPolicy( currentFactors |= kIOPMSleepFactorBatteryLow; if (!standbyDelay) currentFactors |= kIOPMSleepFactorStandbyNoDelay; - if (!standbyEnabled) + if (standbyNixed || !standbyEnabled) currentFactors |= kIOPMSleepFactorStandbyDisabled; + if (resetTimers) + { + currentFactors |= kIOPMSleepFactorLocalUserActivity; + currentFactors &= ~kIOPMSleepFactorSleepTimerWake; + } if (getPMAssertionLevel(kIOPMDriverAssertionUSBExternalDeviceBit) != kIOPMDriverAssertionLevelOff) currentFactors |= kIOPMSleepFactorUSBExternalDevice; @@ -4201,36 +4317,55 @@ void IOPMrootDomain::evaluateSystemSleepPolicyFinal( void ) { IOPMSystemSleepParameters params; OSData * paramsData; - + bool wakeNow; // Evaluate sleep policy after sleeping drivers but before platform sleep. DLOG("%s\n", __FUNCTION__); bzero(&params, sizeof(params)); + wakeNow = false; if (evaluateSystemSleepPolicy(&params, kIOPMSleepPhase2, &hibernateMode)) { - if ((hibernateDisabled || hibernateAborted) && + if ((kIOPMSleepTypeStandby == params.sleepType) && gIOHibernateStandbyDisabled) + { + standbyNixed = true; + wakeNow = true; + } + if (wakeNow + || ((hibernateDisabled || hibernateAborted) && (getSleepTypeAttributes(params.sleepType) & - kIOPMSleepAttributeHibernateSetup)) + kIOPMSleepAttributeHibernateSetup))) { // Final evaluation picked a state requiring hibernation, - // but hibernate setup was skipped. Arm a short sleep using + // but hibernate isn't going to proceed. Arm a short sleep using // the early non-hibernate sleep parameters. - // Set hibernateRetry flag to force hibernate setup on the - // next sleep.
- bcopy(&gEarlySystemSleepParams, &params, sizeof(params)); params.sleepType = kIOPMSleepTypeAbortedSleep; params.ecWakeTimer = 1; - hibernateRetry = true; - DLOG("wake in %u secs for hibernateDisabled %d, hibernateAborted %d\n", - params.ecWakeTimer, hibernateDisabled, hibernateAborted); + gIOHibernateMode = 0; + if (standbyNixed) + { + resetTimers = true; + } + else + { + // Set hibernateRetry flag to force hibernate setup on the + // next sleep. + hibernateRetry = true; + } + DLOG("wake in %u secs for hibernateDisabled %d, hibernateAborted %d, standbyNixed %d\n", + params.ecWakeTimer, hibernateDisabled, hibernateAborted, standbyNixed); } else { hibernateRetry = false; } + if (kIOPMSleepTypeAbortedSleep != params.sleepType) + { + resetTimers = false; + } + paramsData = OSData::withBytes(&params, sizeof(params)); if (paramsData) { @@ -4800,15 +4935,6 @@ void IOPMrootDomain::handleOurPowerChangeStart( _pendingCapability = 0; capabilityLoss = true; - // Clear previous stats - IOLockLock(pmStatsLock); - if (pmStatsAppResponses) - { - pmStatsAppResponses->release(); - pmStatsAppResponses = OSArray::withCapacity(5); - } - IOLockUnlock(pmStatsLock); - } else if (kSystemTransitionNewCapClient != _systemTransitionType) { @@ -4846,6 +4972,16 @@ void IOPMrootDomain::handleOurPowerChangeStart( // Full to Dark transition. if (CAP_LOSS(kIOPMSystemCapabilityGraphics)) { + // Clear previous stats + IOLockLock(pmStatsLock); + if (pmStatsAppResponses) + { + pmStatsAppResponses->release(); + pmStatsAppResponses = OSArray::withCapacity(5); + } + IOLockUnlock(pmStatsLock); + + tracePoint( kIOPMTracePointDarkWakeEntry ); *inOutChangeFlags |= kIOPMSyncTellPowerDown; _systemMessageClientMask = kSystemMessageClientPowerd | @@ -4878,7 +5014,7 @@ void IOPMrootDomain::handleOurPowerChangeStart( { // Beginning of a system sleep transition. // Cancellation is still possible.
- tracePoint( kIOPMTracePointSleepStarted, sleepReason ); + tracePoint( kIOPMTracePointSleepStarted ); _systemMessageClientMask = kSystemMessageClientAll; if ((_currentCapability & kIOPMSystemCapabilityGraphics) == 0) @@ -5112,7 +5248,7 @@ void IOPMrootDomain::handleOurPowerChangeDone( (changeFlags & kIOPMNotDone))) { setProperty(kIOPMSystemCapabilitiesKey, _currentCapability, 64); - tracePoint( kIOPMTracePointSystemUp, 0 ); + tracePoint( kIOPMTracePointSystemUp ); } _systemTransitionType = kSystemTransitionNone; @@ -5120,6 +5256,10 @@ void IOPMrootDomain::handleOurPowerChangeDone( toldPowerdCapWillChange = false; logGraphicsClamp = false; + + if (lowBatteryCondition) { + privateSleepSystem (kIOPMSleepReasonLowPower); + } } } @@ -5256,7 +5396,7 @@ void IOPMrootDomain::overridePowerChangeForUIService( uint64_t nsec; clock_get_uptime(&now); - SUB_ABSOLUTETIME(&now, &systemWakeTime); + SUB_ABSOLUTETIME(&now, &gIOLastWakeAbsTime); absolutetime_to_nanoseconds(now, &nsec); if (kIOLogPMRootDomain & gIOKitDebug) MSG("Graphics suppressed %u ms\n", @@ -5436,6 +5576,7 @@ class IOPMServiceInterestNotifier: public _IOServiceInterestNotifier protected: uint32_t ackTimeoutCnt; + uint32_t msgType; // Message pending ack }; @@ -5509,7 +5650,9 @@ bool IOPMrootDomain::systemMessageFilter( bool isCapMsg = (context->messageType == kIOMessageSystemCapabilityChange); bool isCapClient = false; bool allow = false; + IOPMServiceInterestNotifier *notifier; + notifier = OSDynamicCast(IOPMServiceInterestNotifier, (OSObject *)object); do { if ((kSystemTransitionNewCapClient == _systemTransitionType) && (!isCapMsg || !_joinedCapabilityClients || @@ -5567,6 +5710,10 @@ bool IOPMrootDomain::systemMessageFilter( { // app has not replied yet, wait for it *((OSObject **) arg3) = kOSBooleanFalse; + + if (notifier) { + notifier->msgType = context->messageType; + } } allow = true; @@ -5584,6 +5731,9 @@ bool IOPMrootDomain::systemMessageFilter( if (object == (OSObject *) systemCapabilityNotifier) { allow 
= true; + if (notifier) { + notifier->msgType = context->messageType; + } break; } @@ -5598,8 +5748,12 @@ bool IOPMrootDomain::systemMessageFilter( { if ((object == (OSObject *) systemCapabilityNotifier) && CAP_HIGHEST(kIOPMSystemCapabilityGraphics) && - (fullToDarkReason == kIOPMSleepReasonIdle)) + (fullToDarkReason == kIOPMSleepReasonIdle)) { + if (notifier) { + notifier->msgType = context->messageType; + } allow = true; + } break; } @@ -5616,16 +5770,17 @@ bool IOPMrootDomain::systemMessageFilter( if ((context->notifyType == kNotifyApps) && (_systemMessageClientMask & kSystemMessageClientLegacyApp)) { - IOPMServiceInterestNotifier *notify; allow = true; - if ((notify = OSDynamicCast(IOPMServiceInterestNotifier, (OSObject *)object)) - && arg3) { + if (notifier) { + if (arg3) { + if (notifier->ackTimeoutCnt >= 3) + *((OSObject **) arg3) = kOSBooleanFalse; + else + *((OSObject **) arg3) = kOSBooleanTrue; + } - if (notify->ackTimeoutCnt >= 3) - *((OSObject **) arg3) = kOSBooleanFalse; - else - *((OSObject **) arg3) = kOSBooleanTrue; + notifier->msgType = context->messageType; } } else if ((context->notifyType == kNotifyPriority) && @@ -5839,7 +5994,7 @@ void IOPMrootDomain::reportUserInput( void ) if (matching) matching->release(); if(iter) { - wrangler = (IOService *) iter->getNextObject(); + wrangler = OSDynamicCast(IOService, iter->getNextObject()); iter->release(); } } @@ -6225,6 +6380,7 @@ void IOPMrootDomain::dispatchPowerEvent( systemCapabilityNotifier->retain(); } /* intentional fall-through */ + [[clang::fallthrough]]; case kPowerEventRegisterKernelCapabilityClient: if (!_joinedCapabilityClients) @@ -6676,6 +6832,7 @@ void IOPMrootDomain::evaluatePolicy( int stimulus, uint32_t arg ) if (kFullWakeReasonDisplayOn == fullWakeReason) fullWakeReason = fFullWakeReasonDisplayOnAndLocalUser; + kdebugTrace(kPMLogUserActiveState, 0, 1, 0); setProperty(gIOPMUserIsActiveKey, kOSBooleanTrue); messageClients(kIOPMMessageUserIsActiveChanged); } @@ -6689,6 +6846,7 @@ void 
IOPMrootDomain::evaluatePolicy( int stimulus, uint32_t arg ) clock_get_uptime(&userBecameInactiveTime); flags.bit.userBecameInactive = true; + kdebugTrace(kPMLogUserActiveState, 0, 0, 0); setProperty(gIOPMUserIsActiveKey, kOSBooleanFalse); messageClients(kIOPMMessageUserIsActiveChanged); } @@ -7048,7 +7206,7 @@ void IOPMrootDomain::requestFullWake( FullWakeReason reason ) uint64_t nsec; clock_get_uptime(&now); - SUB_ABSOLUTETIME(&now, &systemWakeTime); + SUB_ABSOLUTETIME(&now, &gIOLastWakeAbsTime); absolutetime_to_nanoseconds(now, &nsec); MSG("full wake %s (reason %u) %u ms\n", promotion ? "promotion" : "request", @@ -7070,6 +7228,8 @@ void IOPMrootDomain::willEnterFullWake( void ) { hibernateRetry = false; sleepToStandby = false; + standbyNixed = false; + resetTimers = false; sleepTimerMaintenance = false; _systemMessageClientMask = kSystemMessageClientPowerd | @@ -7134,11 +7294,11 @@ void IOPMrootDomain::evaluateAssertions(IOPMDriverAssertionType newAssertions, I if (changedBits & kIOPMDriverAssertionCPUBit) { evaluatePolicy(kStimulusDarkWakeEvaluate); - if (!assertOnWakeSecs && systemWakeTime) { + if (!assertOnWakeSecs && gIOLastWakeAbsTime) { AbsoluteTime now; clock_usec_t microsecs; clock_get_uptime(&now); - SUB_ABSOLUTETIME(&now, &systemWakeTime); + SUB_ABSOLUTETIME(&now, &gIOLastWakeAbsTime); absolutetime_to_microtime(now, &assertOnWakeSecs, µsecs); if (assertOnWakeReport) { HISTREPORT_TALLYVALUE(assertOnWakeReport, (int64_t)assertOnWakeSecs); @@ -7222,7 +7382,7 @@ void IOPMrootDomain::pmStatsRecordApplicationResponse( const char *name, int messageType, uint32_t delay_ms, - int app_pid, + uint64_t id, OSObject *object, IOPMPowerStateIndex powerState) { @@ -7249,6 +7409,14 @@ void IOPMrootDomain::pmStatsRecordApplicationResponse( return; + if (response->isEqualTo(gIOPMStatsDriverPSChangeSlow)) { + kdebugTrace(kPMLogDrvResponseDelay, id, messageType, delay_ms); + } + else if (notify) { + kdebugTrace(kPMLogAppResponseDelay, id, notify->msgType, delay_ms); + 
notify->msgType = 0; + } + responseDescription = OSDictionary::withCapacity(5); if (responseDescription) { @@ -7271,8 +7439,8 @@ void IOPMrootDomain::pmStatsRecordApplicationResponse( } } - if (app_pid != -1) { - pidNum = OSNumber::withNumber(app_pid, 32); + if (id != 0) { + pidNum = OSNumber::withNumber(id, 32); if (pidNum) { responseDescription->setObject(_statsPIDKey, pidNum); pidNum->release(); @@ -7351,6 +7519,8 @@ IOReturn IOPMrootDomain::callPlatformFunction( void * param1, void * param2, void * param3, void * param4 ) { + uint32_t bootFailureCode = 0xffffffff; + unsigned int len = sizeof(bootFailureCode); if (pmTracer && functionName && functionName->isEqualTo(kIOPMRegisterNVRAMTracePointHandlerKey) && !pmTracer->tracePointHandler && !pmTracer->tracePointTarget) @@ -7362,9 +7532,15 @@ IOReturn IOPMrootDomain::callPlatformFunction( pmTracer->tracePointTarget = (void *) param2; tracePointPCI = (uint32_t)(uintptr_t) param3; tracePointPhases = (uint32_t)(uintptr_t) param4; + if ((tracePointPhases & 0xff) == kIOPMTracePointSystemSleep) { + if (!PEReadNVRAMProperty(kIOEFIBootRomFailureKey, &bootFailureCode, &len)) { + MSG("Failed to read failure code from NVRam\n"); + } + // Failure code from EFI/BootRom is a four byte structure + tracePointPCI = OSSwapBigToHostInt32(bootFailureCode); + } statusCode = (((uint64_t)tracePointPCI) << 32) | tracePointPhases; - if ((tracePointPhases >> 24) != kIOPMTracePointSystemUp) - { + if ((tracePointPhases & 0xff) != kIOPMTracePointSystemUp) { MSG("Sleep failure code 0x%08x 0x%08x\n", tracePointPCI, tracePointPhases); } @@ -7392,6 +7568,18 @@ IOReturn IOPMrootDomain::callPlatformFunction( functionName, waitForFunction, param1, param2, param3, param4); } +void IOPMrootDomain::kdebugTrace(uint32_t event, uint64_t id, + uintptr_t param1, uintptr_t param2, uintptr_t param3) +{ + uint32_t code = IODBG_POWER(event); + uint64_t regId = id; + if (regId == 0) { + regId = getRegistryEntryID(); + } + IOTimeStampConstant(code, (uintptr_t) 
regId, param1, param2, param3); +} + + void IOPMrootDomain::tracePoint( uint8_t point ) { if (systemBooting) return; @@ -7399,22 +7587,19 @@ void IOPMrootDomain::tracePoint( uint8_t point ) if (kIOPMTracePointWakeCapabilityClients == point) acceptSystemWakeEvents(false); - PMDebug(kPMLogSleepWakeTracePoint, point, 0); + kdebugTrace(kPMLogSleepWakeTracePoint, 0, point, 0); pmTracer->tracePoint(point); } -void IOPMrootDomain::tracePoint( uint8_t point, uint8_t data ) -{ - if (systemBooting) return; - - PMDebug(kPMLogSleepWakeTracePoint, point, data); - pmTracer->tracePoint(point, data); -} - -void IOPMrootDomain::traceDetail( uint32_t detail ) +void IOPMrootDomain::traceDetail(uint32_t msgType, uint32_t msgIndex, uintptr_t handler) { - if (!systemBooting) + if (!systemBooting) { + uint32_t detail = ((msgIndex & 0xff) << 24) | + ((msgType & 0xfff) << 12) | + (handler & 0xfff); pmTracer->traceDetail( detail ); + kdebugTrace(kPMLogSleepWakeTracePoint, 0, pmTracer->getTracePhase(), msgType, handler & 0xfff); + } } @@ -7627,10 +7812,12 @@ PMTraceWorker *PMTraceWorker::tracer(IOPMrootDomain *owner) // this dictionary lazily. 
me->owner = owner; me->pciDeviceBitMappings = NULL; - me->pciMappingLock = IOLockAlloc(); + me->pmTraceWorkerLock = IOLockAlloc(); me->tracePhase = kIOPMTracePointSystemUp; - me->loginWindowPhase = 0; me->traceData32 = 0; + me->loginWindowData = 0; + me->coreDisplayData = 0; + me->coreGraphicsData = 0; return me; } @@ -7640,8 +7827,10 @@ void PMTraceWorker::RTC_TRACE(void) { uint32_t wordA; - wordA = (tracePhase << 24) | (loginWindowPhase << 16) | - (traceData8 << 8); + IOLockLock(pmTraceWorkerLock); + wordA = (loginWindowData << 24) | (coreDisplayData << 16) | + (coreGraphicsData << 8) | tracePhase; + IOLockUnlock(pmTraceWorkerLock); tracePointHandler( tracePointTarget, traceData32, wordA ); _LOG("RTC_TRACE wrote 0x%08x 0x%08x\n", traceData32, wordA); @@ -7653,7 +7842,7 @@ int PMTraceWorker::recordTopLevelPCIDevice(IOService * pciDevice) const OSSymbol * deviceName; int index = -1; - IOLockLock(pciMappingLock); + IOLockLock(pmTraceWorkerLock); if (!pciDeviceBitMappings) { @@ -7680,7 +7869,7 @@ int PMTraceWorker::recordTopLevelPCIDevice(IOService * pciDevice) addedToRegistry = owner->setProperty("PCITopLevel", this); exit: - IOLockUnlock(pciMappingLock); + IOLockUnlock(pmTraceWorkerLock); return index; } @@ -7689,9 +7878,9 @@ bool PMTraceWorker::serialize(OSSerialize *s) const bool ok = false; if (pciDeviceBitMappings) { - IOLockLock(pciMappingLock); + IOLockLock(pmTraceWorkerLock); ok = pciDeviceBitMappings->serialize(s); - IOLockUnlock(pciMappingLock); + IOLockUnlock(pmTraceWorkerLock); } return ok; } @@ -7708,23 +7897,8 @@ void PMTraceWorker::tracePoint(uint8_t phase) RTC_TRACE(); } -void PMTraceWorker::tracePoint(uint8_t phase, uint8_t data8) -{ - // clear trace detail when phase begins - if (tracePhase != phase) - traceData32 = 0; - - tracePhase = phase; - traceData8 = data8; - - DLOG("trace point 0x%02x 0x%02x\n", tracePhase, traceData8); - RTC_TRACE(); -} - void PMTraceWorker::traceDetail(uint32_t detail) { - if (kIOPMTracePointSleepPriorityClients != 
tracePhase) - return; traceData32 = detail; DLOG("trace point 0x%02x detail 0x%08x\n", tracePhase, traceData32); @@ -7732,11 +7906,23 @@ void PMTraceWorker::traceDetail(uint32_t detail) RTC_TRACE(); } -void PMTraceWorker::traceLoginWindowPhase(uint8_t phase) +void PMTraceWorker::traceComponentWakeProgress(uint32_t component, uint32_t data) { - loginWindowPhase = phase; - - DLOG("loginwindow tracepoint 0x%02x\n", loginWindowPhase); + switch (component) { + case kIOPMLoginWindowProgress: + loginWindowData = data & kIOPMLoginWindowProgressMask; + break; + case kIOPMCoreDisplayProgress: + coreDisplayData = data & kIOPMCoreDisplayProgressMask; + break; + case kIOPMCoreGraphicsProgress: + coreGraphicsData = data & kIOPMCoreGraphicsProgressMask; + break; + default: + return; + } + + DLOG("component trace point 0x%02x data 0x%08x\n", component, data); RTC_TRACE(); } @@ -7769,12 +7955,14 @@ void PMTraceWorker::tracePCIPowerChange( traceData32 |= bitMask; _LOG("PMTrace: Device %s started - bit %2d mask 0x%08x => 0x%08x\n", service->getName(), bitNum, bitMask, traceData32); + owner->kdebugTrace(kPMLogPCIDevChangeStart, service->getRegistryEntryID(), traceData32, 0); } else { traceData32 &= ~bitMask; _LOG("PMTrace: Device %s finished - bit %2d mask 0x%08x => 0x%08x\n", service->getName(), bitNum, bitMask, traceData32); + owner->kdebugTrace(kPMLogPCIDevChangeDone, service->getRegistryEntryID(), traceData32, 0); } DLOG("trace point 0x%02x detail 0x%08x\n", tracePhase, traceData32); @@ -7784,11 +7972,20 @@ void PMTraceWorker::tracePCIPowerChange( uint64_t PMTraceWorker::getPMStatusCode( ) { - return (((uint64_t)traceData32 << 32) | ((uint64_t)tracePhase << 24) | - (loginWindowPhase << 16) | (traceData8 << 8)); + return (((uint64_t)traceData32 << 32) | ((uint64_t)tracePhase)); } +uint8_t PMTraceWorker::getTracePhase() +{ + return tracePhase; +} + +uint32_t PMTraceWorker::getTraceData() +{ + return traceData32; +} + // MARK: - // MARK: PMHaltWorker @@ -7897,7 +8094,7 @@ void 
PMHaltWorker::work( PMHaltWorker * me ) inner = (OSSet *)gPMHaltArray->getObject(me->depth); if (inner) { - service = (IOService *)inner->getAnyObject(); + service = OSDynamicCast(IOService, inner->getAnyObject()); if (service) { service->retain(); @@ -9116,20 +9313,22 @@ void IOPMrootDomain::takeStackshot(bool wdogTrigger, bool isOSXWatchdog, bool is { swd_hdr * hdr = NULL; addr64_t data[3]; - uint32_t wdog_panic = 0; + int wdog_panic = -1; int cnt = 0; pid_t pid = 0; + kern_return_t kr = KERN_SUCCESS; uint32_t flags; char * dstAddr; uint32_t size; uint32_t bytesRemaining; + unsigned bytesWritten = 0; + unsigned totalBytes = 0; unsigned int len; OSString * UUIDstring = NULL; uint64_t code; IOMemoryMap * logBufMap = NULL; - swd_stackshot_hdr *stackshotHdr = NULL; uint32_t bufSize; uint32_t initialStackSize; @@ -9144,9 +9343,9 @@ void IOPMrootDomain::takeStackshot(bool wdogTrigger, bool isOSXWatchdog, bool is } if (wdogTrigger) { - if (PE_parse_boot_argn("swd_panic", &wdog_panic, sizeof(wdog_panic)) && - (wdog_panic == 1)) { - // If boot-arg is set to panic on sleep/wake hang, call panic + PE_parse_boot_argn("swd_panic", &wdog_panic, sizeof(wdog_panic)); + if (wdog_panic == 1) { + // If boot-arg specifies to panic then panic. panic("Sleep/Wake hang detected\n"); return; } @@ -9188,10 +9387,13 @@ void IOPMrootDomain::takeStackshot(bool wdogTrigger, bool isOSXWatchdog, bool is if (!OSCompareAndSwap(0, 1, &gRootDomain->swd_lock)) return; - if (isSpinDump) + if (isSpinDump) { hdr = (swd_hdr *)swd_spindump_buffer; - else + } + else { hdr = (swd_hdr *)swd_buffer; + } + memset(hdr->UUID, 0x20, sizeof(hdr->UUID)); if ((UUIDstring = OSDynamicCast(OSString, getProperty(kIOPMSleepWakeUUIDKey))) != NULL ) { @@ -9213,51 +9415,54 @@ void IOPMrootDomain::takeStackshot(bool wdogTrigger, bool isOSXWatchdog, bool is DLOG("Taking snapshot. 
bytesRemaining: %d\n", bytesRemaining); - while (bytesRemaining > sizeof(swd_stackshot_hdr)) { - - stackshotHdr = (swd_stackshot_hdr *)dstAddr; - stackshotHdr->magic = SWD_STACKSHOTHDR_MAGIC; - stackshotHdr->size = 0; - bytesRemaining -= sizeof(swd_stackshot_hdr); - dstAddr += sizeof(swd_stackshot_hdr); + flags = STACKSHOT_KCDATA_FORMAT|STACKSHOT_NO_IO_STATS|STACKSHOT_SAVE_KEXT_LOADINFO; + while (kr == KERN_SUCCESS) { - if (isOSXWatchdog) { - pid = -1; - size = bytesRemaining; - flags = STACKSHOT_SAVE_LOADINFO | STACKSHOT_SAVE_KEXT_LOADINFO; - } - else if (cnt == 0) { - /* + if (cnt == 0) { + /* * Take stackshot of all process on first sample. Size is restricted * to SWD_INITIAL_STACK_SIZE */ pid = -1; size = (bytesRemaining > initialStackSize) ? initialStackSize : bytesRemaining; - flags = STACKSHOT_SAVE_LOADINFO | STACKSHOT_SAVE_KEXT_LOADINFO|STACKSHOT_SAVE_KERNEL_FRAMES_ONLY; + flags |= STACKSHOT_ACTIVE_KERNEL_THREADS_ONLY; } else { /* Take sample of kernel threads only */ pid = 0; size = bytesRemaining; - flags = 0; } - stack_snapshot_from_kernel(pid, dstAddr, size, flags, &stackshotHdr->size); + kr = stack_snapshot_from_kernel(pid, dstAddr, size, flags, 0, &bytesWritten); + DLOG("stack_snapshot_from_kernel returned 0x%x. pid: %d bufsize:0x%x flags:0x%x bytesWritten: %d\n", + kr, pid, size, flags, bytesWritten); + if (kr == KERN_INSUFFICIENT_BUFFER_SIZE) { + if (pid == -1) { + // Insufficient buffer when trying to take stackshot of user & kernel space threads. 
+ // Continue to take stackshot of just kernel threads + ++cnt; + kr = KERN_SUCCESS; + continue; + } + else if (totalBytes == 0) { + MSG("Failed to get stackshot(0x%x) bufsize:0x%x flags:0x%x\n", kr, size, flags); + } + } - dstAddr += stackshotHdr->size; - bytesRemaining -= stackshotHdr->size; + dstAddr += bytesWritten; + totalBytes += bytesWritten; + bytesRemaining -= bytesWritten; - DLOG("Sample: %d size: %d bytesRemaining: %d\n", cnt, stackshotHdr->size, bytesRemaining); - if ((stackshotHdr->size == 0) || (++cnt == 10)) + if (++cnt == 10) { break; + } IOSleep(10); // 10 ms } hdr->spindump_size = (bufSize - bytesRemaining - hdr->spindump_offset); - memset(hdr->cps, 0x20, sizeof(hdr->cps)); - snprintf(hdr->cps, sizeof(hdr->cps), "\ncps: %d", ((IOService*)this)->getPowerState()); + memset(hdr->spindump_status, 0x20, sizeof(hdr->spindump_status)); code = pmTracer->getPMStatusCode(); memset(hdr->PMStatusCode, 0x20, sizeof(hdr->PMStatusCode)); snprintf(hdr->PMStatusCode, sizeof(hdr->PMStatusCode), "\nCode: %08x %08x", @@ -9344,6 +9549,7 @@ void IOPMrootDomain::sleepWakeDebugMemAlloc( ) hdr->spindump_offset = sizeof(swd_hdr); swd_buffer = (void *)hdr; + swd_memDesc = memDesc; DLOG("SleepWake debug buffer size:0x%x spindump offset:0x%x\n", hdr->alloc_size, hdr->spindump_offset); exit: @@ -9431,6 +9637,7 @@ errno_t IOPMrootDomain::sleepWakeDebugSaveFile(const char *name, char *buf, int S_IRUSR|S_IRGRP|S_IROTH, VNODE_LOOKUP_NOFOLLOW, &vp, ctx) != 0) { IOLog("Failed to open the file %s\n", name); + swd_flags |= SWD_FILEOP_ERROR; goto exit; } VATTR_INIT(&va); @@ -9439,6 +9646,7 @@ errno_t IOPMrootDomain::sleepWakeDebugSaveFile(const char *name, char *buf, int if (vp->v_type != VREG || vnode_getattr(vp, &va, ctx) || va.va_nlink != 1) { IOLog("Bailing as this is not a regular file\n"); + swd_flags |= SWD_FILEOP_ERROR; goto exit; } VATTR_INIT(&va); @@ -9446,12 +9654,17 @@ errno_t IOPMrootDomain::sleepWakeDebugSaveFile(const char *name, char *buf, int vnode_setattr(vp, &va, 
ctx); - error = vn_rdwr(UIO_WRITE, vp, buf, len, 0, - UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, cred, (int *) 0, vfs_context_proc(ctx)); - if (error != 0) - IOLog("Failed to save sleep wake log. err 0x%x\n", error); - else - DLOG("Saved %d bytes to file %s\n",len, name); + if (buf != NULL) { + error = vn_rdwr(UIO_WRITE, vp, buf, len, 0, + UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, cred, (int *) 0, vfs_context_proc(ctx)); + if (error != 0) { + IOLog("Failed to save sleep wake log. err 0x%x\n", error); + swd_flags |= SWD_FILEOP_ERROR; + } + else { + DLOG("Saved %d bytes to file %s\n",len, name); + } + } exit: if (vp) vnode_close(vp, FWRITE, ctx); @@ -9481,7 +9694,8 @@ errno_t IOPMrootDomain::sleepWakeDebugCopyFile( if (vnode_open(dstFname, (O_CREAT | FWRITE | O_NOFOLLOW), S_IRUSR|S_IRGRP|S_IROTH, VNODE_LOOKUP_NOFOLLOW, &vp, ctx) != 0) { - DLOG("Failed to open the file %s\n", dstFname); + IOLog("Failed to open the file %s\n", dstFname); + swd_flags |= SWD_FILEOP_ERROR; goto exit; } VATTR_INIT(&va); @@ -9489,7 +9703,8 @@ errno_t IOPMrootDomain::sleepWakeDebugCopyFile( /* Don't dump to non-regular files or files with links. 
*/ if (vp->v_type != VREG || vnode_getattr(vp, &va, ctx) || va.va_nlink != 1) { - DLOG("Bailing as this is not a regular file\n"); + IOLog("Bailing as this is not a regular file\n"); + swd_flags |= SWD_FILEOP_ERROR; goto exit; } VATTR_INIT(&va); @@ -9507,7 +9722,8 @@ errno_t IOPMrootDomain::sleepWakeDebugCopyFile( vfs_context_ucred(srcCtx), (int *) 0, vfs_context_proc(srcCtx)); if (error) { - DLOG("Failed to read file(numBytes:0x%llx)\n", bytesToRead); + IOLog("Failed to read file(numBytes:0x%llx)\n", bytesToRead); + swd_flags |= SWD_FILEOP_ERROR; break; } @@ -9524,7 +9740,8 @@ errno_t IOPMrootDomain::sleepWakeDebugCopyFile( vfs_context_ucred(ctx), (int *) 0, vfs_context_proc(ctx)); if (error) { - DLOG("Failed to write file(numBytes:0x%llx)\n", bytesToWrite); + IOLog("Failed to write file(numBytes:0x%llx)\n", bytesToWrite); + swd_flags |= SWD_FILEOP_ERROR; break; } @@ -9534,26 +9751,19 @@ errno_t IOPMrootDomain::sleepWakeDebugCopyFile( } if (crc != newcrc) { - swd_stackshot_hdr *shdr = (swd_stackshot_hdr *)tmpBuf;; - - /* Set statckshot size to 0 if crc doesn't match */ - shdr->magic = SWD_STACKSHOTHDR_MAGIC; - shdr->size = 0; + /* Set stackshot size to 0 if crc doesn't match */ + VATTR_INIT(&va); + VATTR_SET(&va, va_data_size, 0); + vnode_setattr(vp, &va, ctx); - assert(tmpBufSize > sizeof(swd_stackshot_hdr)); - bytesToWrite = round_page(sizeof(swd_stackshot_hdr)); - vn_rdwr(UIO_WRITE, vp, (char *)tmpBuf, bytesToWrite, 0, - UIO_SYSSPACE, IO_SYNC|IO_NODELOCKED|IO_UNIT, - vfs_context_ucred(ctx), (int *) 0, - vfs_context_proc(ctx)); - - DLOG("CRC check failed. expected:0x%x actual:0x%x\n", crc, newcrc); + IOLog("CRC check failed. 
expected:0x%x actual:0x%x\n", crc, newcrc); + swd_flags |= SWD_DATA_CRC_ERROR; error = EFAULT; } exit: if (vp) { error = vnode_close(vp, FWRITE, ctx); - DLOG("vnode_close returned 0x%x\n", error); + DLOG("vnode_close on file %s returned 0x%x\n",dstFname, error); } if (ctx) vfs_context_rele(ctx); @@ -9562,11 +9772,12 @@ errno_t IOPMrootDomain::sleepWakeDebugCopyFile( } -void IOPMrootDomain::checkForValidDebugData(const char *fname, vfs_context_t *ctx, +uint32_t IOPMrootDomain::checkForValidDebugData(const char *fname, vfs_context_t *ctx, void *tmpBuf, struct vnode **vp) { int rc; uint64_t hdrOffset; + uint32_t error = 0; struct vnode_attr va; IOHibernateImageHeader *imageHdr; @@ -9583,7 +9794,8 @@ void IOPMrootDomain::checkForValidDebugData(const char *fname, vfs_context_t *ct VATTR_WANTED(&va, va_data_alloc); if ((*vp)->v_type != VREG || vnode_getattr((*vp), &va, *ctx) || va.va_nlink != 1) { - DMSG("sleepWakeDebugDumpFromFile: Bailing as %s is not a regular file\n", fname); + IOLog("sleepWakeDebugDumpFromFile: Bailing as %s is not a regular file\n", fname); + error = SWD_FILEOP_ERROR; goto err; } @@ -9593,33 +9805,36 @@ void IOPMrootDomain::checkForValidDebugData(const char *fname, vfs_context_t *ct vfs_context_ucred(*ctx), (int *) 0, vfs_context_proc(*ctx)); if (rc != 0) { - DMSG("sleepWakeDebugDumpFromFile: Failed to read header size %lu(rc=%d) from %s\n", + IOLog("sleepWakeDebugDumpFromFile: Failed to read header size %lu(rc=%d) from %s\n", round_page(sizeof(IOHibernateImageHeader)), rc, fname); + error = SWD_FILEOP_ERROR; goto err; } imageHdr = ((IOHibernateImageHeader *)tmpBuf); if (imageHdr->signature != kIOHibernateHeaderDebugDataSignature) { - DMSG("sleepWakeDebugDumpFromFile: File %s header has unexpected value 0x%x\n", + IOLog("sleepWakeDebugDumpFromFile: File %s header has unexpected value 0x%x\n", fname, imageHdr->signature); + error = SWD_HDR_SIGNATURE_ERROR; goto err; } /* Sleep/Wake debug header(swd_hdr) is at the beggining of the second block */ 
hdrOffset = imageHdr->deviceBlockSize; if (hdrOffset + sizeof(swd_hdr) >= va.va_data_alloc) { - DMSG("sleepWakeDebugDumpFromFile: header is crossing file size(0x%llx) in file %s\n", + IOLog("sleepWakeDebugDumpFromFile: header is crossing file size(0x%llx) in file %s\n", va.va_data_alloc, fname); + error = SWD_HDR_SIZE_ERROR; goto err; } - return; + return 0; err: if (*vp) vnode_close(*vp, FREAD, *ctx); *vp = NULL; - return; + return error; } void IOPMrootDomain::sleepWakeDebugDumpFromFile( ) @@ -9627,7 +9842,6 @@ void IOPMrootDomain::sleepWakeDebugDumpFromFile( ) #if HIBERNATION int rc; char hibernateFilename[MAXPATHLEN+1]; - char PMStatusCode[100]; void *tmpBuf; swd_hdr *hdr = NULL; uint32_t stacksSize, logSize; @@ -9639,6 +9853,7 @@ void IOPMrootDomain::sleepWakeDebugDumpFromFile( ) OSNumber *failStat = NULL; struct vnode *vp = NULL; vfs_context_t ctx = NULL; + const char *stacksFname, *logFname; IOBufferMemoryDescriptor *tmpBufDesc = NULL; @@ -9666,7 +9881,7 @@ void IOPMrootDomain::sleepWakeDebugDumpFromFile( ) ctx = vfs_context_create(vfs_context_current()); /* First check if 'kSleepWakeStackBinFilename' has valid data */ - checkForValidDebugData(kSleepWakeStackBinFilename, &ctx, tmpBuf, &vp); + swd_flags |= checkForValidDebugData(kSleepWakeStackBinFilename, &ctx, tmpBuf, &vp); if (vp == NULL) { /* Check if the debug data is saved to hibernation file */ hibernateFilename[0] = 0; @@ -9682,7 +9897,7 @@ void IOPMrootDomain::sleepWakeDebugDumpFromFile( ) goto exit; } - checkForValidDebugData(hibernateFilename, &ctx, tmpBuf, &vp); + swd_flags |= checkForValidDebugData(hibernateFilename, &ctx, tmpBuf, &vp); if (vp == NULL) { DMSG("sleepWakeDebugDumpFromFile: No valid debug data is found\n"); goto exit; @@ -9704,6 +9919,7 @@ void IOPMrootDomain::sleepWakeDebugDumpFromFile( ) if (rc != 0) { DMSG("sleepWakeDebugDumpFromFile: Failed to debug read header size %lu. 
rc=%d\n", round_page(sizeof(swd_hdr)), rc); + swd_flags |= SWD_FILEOP_ERROR; goto exit; } @@ -9712,6 +9928,7 @@ void IOPMrootDomain::sleepWakeDebugDumpFromFile( ) (hdr->spindump_offset > SWD_BUF_SIZE) || (hdr->spindump_size > SWD_BUF_SIZE)) { DMSG("sleepWakeDebugDumpFromFile: Invalid data in debug header. sign:0x%x size:0x%x spindump_offset:0x%x spindump_size:0x%x\n", hdr->signature, hdr->alloc_size, hdr->spindump_offset, hdr->spindump_size); + swd_flags |= SWD_BUF_SIZE_ERROR; goto exit; } stacksSize = hdr->spindump_size; @@ -9720,15 +9937,17 @@ void IOPMrootDomain::sleepWakeDebugDumpFromFile( ) stacksOffset = hdrOffset + hdr->spindump_offset; logOffset = hdrOffset + offsetof(swd_hdr, UUID); logSize = sizeof(swd_hdr)-offsetof(swd_hdr, UUID); + stacksFname = getDumpStackFilename(hdr); + logFname = getDumpLogFilename(hdr); error = sleepWakeDebugCopyFile(vp, ctx, (char *)tmpBuf, tmpBufSize, stacksOffset, - getDumpStackFilename(hdr), stacksSize, hdr->crc); + stacksFname, stacksSize, hdr->crc); if (error == EFAULT) { DMSG("sleepWakeDebugDumpFromFile: Stackshot CRC doesn't match\n"); goto exit; } error = sleepWakeDebugCopyFile(vp, ctx, (char *)tmpBuf, tmpBufSize, logOffset, - getDumpLogFilename(hdr), logSize, 0); + logFname, logSize, 0); if (error) { DMSG("sleepWakeDebugDumpFromFile: Failed to write the log file(0x%x)\n", error); goto exit; @@ -9738,6 +9957,11 @@ void IOPMrootDomain::sleepWakeDebugDumpFromFile( ) // Write just the SleepWakeLog.dump with failure code uint64_t fcode = 0; const char *fname; + swd_hdr hdrCopy; + char *offset = NULL; + int size; + + hdr = &hdrCopy; if (swd_flags & SWD_BOOT_BY_SW_WDOG) { failStat = OSDynamicCast(OSNumber, getProperty(kIOPMSleepWakeFailureCodeKey)); fcode = failStat->unsigned64BitValue(); @@ -9746,10 +9970,17 @@ void IOPMrootDomain::sleepWakeDebugDumpFromFile( ) else { fname = kAppleOSXWatchdogLogFilename; } - memset(PMStatusCode, 0x20, sizeof(PMStatusCode)); // Fill with spaces - PMStatusCode[sizeof(PMStatusCode)-1] = 0xa; // 
And an end-of-line at the end - snprintf(PMStatusCode, sizeof(PMStatusCode)-1, "Code: 0x%llx", fcode); - sleepWakeDebugSaveFile(fname, PMStatusCode, sizeof(PMStatusCode)); + + offset = (char*)hdr+offsetof(swd_hdr, UUID); + size = sizeof(swd_hdr)-offsetof(swd_hdr, UUID); + memset(offset, 0x20, size); // Fill with spaces + + + snprintf(hdr->spindump_status, sizeof(hdr->spindump_status), "\nstatus: 0x%x", swd_flags); + snprintf(hdr->PMStatusCode, sizeof(hdr->PMStatusCode), "\nCode: 0x%llx", fcode); + snprintf(hdr->reason, sizeof(hdr->reason), "\nStackshot reason: Watchdog\n\n"); + sleepWakeDebugSaveFile(fname, offset, size); + } gRootDomain->swd_lock = 0; @@ -9768,7 +9999,6 @@ void IOPMrootDomain::sleepWakeDebugDumpFromMem(IOMemoryMap *logBufMap) errno_t error = EIO; uint64_t bufSize = 0; swd_hdr *hdr = NULL; - char PMStatusCode[100]; OSNumber *failStat = NULL; if (!OSCompareAndSwap(0, 1, &gRootDomain->swd_lock)) @@ -9784,7 +10014,8 @@ void IOPMrootDomain::sleepWakeDebugDumpFromMem(IOMemoryMap *logBufMap) bufSize = logBufMap->getLength(); if (bufSize <= sizeof(swd_hdr)) { - IOLog("SleepWake log buffer contents are invalid\n"); + IOLog("SleepWake log buffer size is invalid\n"); + swd_flags |= SWD_BUF_SIZE_ERROR; goto exit; } @@ -9807,13 +10038,10 @@ void IOPMrootDomain::sleepWakeDebugDumpFromMem(IOMemoryMap *logBufMap) // Write just the SleepWakeLog.dump with failure code uint64_t fcode = 0; const char *sname, *lname; - swd_stackshot_hdr shdr; - - /* Try writing an empty stacks file */ - shdr.magic = SWD_STACKSHOTHDR_MAGIC; - shdr.size = 0; - + swd_hdr hdrCopy; + /* Try writing an empty stacks file */ + hdr = &hdrCopy; if (swd_flags & SWD_BOOT_BY_SW_WDOG) { failStat = OSDynamicCast(OSNumber, getProperty(kIOPMSleepWakeFailureCodeKey)); fcode = failStat->unsigned64BitValue(); @@ -9825,12 +10053,19 @@ void IOPMrootDomain::sleepWakeDebugDumpFromMem(IOMemoryMap *logBufMap) sname= kAppleOSXWatchdogStackFilename; } - sleepWakeDebugSaveFile(sname, (char*)(&shdr), 
sizeof(shdr)); - memset(PMStatusCode, 0x20, sizeof(PMStatusCode)); // Fill with spaces - PMStatusCode[sizeof(PMStatusCode)-1] = 0xa; // And an end-of-line at the end - snprintf(PMStatusCode, sizeof(PMStatusCode)-1, "Code: 0x%llx", fcode); - sleepWakeDebugSaveFile(lname, PMStatusCode, sizeof(PMStatusCode)); + sleepWakeDebugSaveFile(sname, NULL, 0); + + logOffset = (char*)hdr+offsetof(swd_hdr, UUID); + logSize = sizeof(swd_hdr)-offsetof(swd_hdr, UUID); + memset(logOffset, 0x20, logSize); // Fill with spaces + + + snprintf(hdr->spindump_status, sizeof(hdr->spindump_status), "\nstatus: 0x%x", swd_flags); + snprintf(hdr->PMStatusCode, sizeof(hdr->PMStatusCode), "\nCode: 0x%llx", fcode); + snprintf(hdr->reason, sizeof(hdr->reason), "\nStackshot reason: Watchdog\n\n"); + sleepWakeDebugSaveFile(lname, logOffset, logSize); } + gRootDomain->swd_lock = 0; } @@ -9869,8 +10104,9 @@ IOMemoryMap *IOPMrootDomain::sleepWakeDebugRetrieve( ) goto exit; } } - else if (len == sizeof(addr64_t)*3) + else if (len == sizeof(addr64_t)*3) { PEReadNVRAMProperty(kIOSleepWakeDebugKey, data, &len); + } else { DLOG("Invalid sleepWakeDebug note length(%d)\n", len); goto exit; @@ -9886,7 +10122,8 @@ IOMemoryMap *IOPMrootDomain::sleepWakeDebugRetrieve( ) paddr = data[2]; if ( (bufSize <= sizeof(swd_hdr)) ||(bufSize > SWD_BUF_SIZE) || (crc == 0) ) { - IOLog("SleepWake log buffer contents are invalid\n"); + IOLog("SleepWake log buffer size is invalid\n"); + swd_flags |= SWD_BUF_SIZE_ERROR; return NULL; } @@ -9899,6 +10136,7 @@ IOMemoryMap *IOPMrootDomain::sleepWakeDebugRetrieve( ) if (desc == NULL) { IOLog("Fail to map SleepWake log buffer\n"); + swd_flags |= SWD_INTERNAL_FAILURE; goto exit; } @@ -9909,13 +10147,15 @@ IOMemoryMap *IOPMrootDomain::sleepWakeDebugRetrieve( ) if ( (logBufMap->getLength() <= sizeof(swd_hdr)) || (vaddr == NULL) ) { IOLog("Fail to map SleepWake log buffer\n"); + swd_flags |= SWD_INTERNAL_FAILURE; goto exit; } hdr = (swd_hdr *)vaddr; if 
(hdr->spindump_offset+hdr->spindump_size > bufSize) { - IOLog("SleepWake log buffer contents are invalid\n"); + IOLog("SleepWake log header size is invalid\n"); + swd_flags |= SWD_HDR_SIZE_ERROR; goto exit; } @@ -9924,6 +10164,7 @@ IOMemoryMap *IOPMrootDomain::sleepWakeDebugRetrieve( ) hdr->spindump_size); if (newcrc != crc) { IOLog("SleepWake log buffer contents are invalid\n"); + swd_flags |= SWD_DATA_CRC_ERROR; goto exit; } @@ -9947,6 +10188,16 @@ IOMemoryMap *IOPMrootDomain::sleepWakeDebugRetrieve( ) void IOPMrootDomain::sleepWakeDebugTrig(bool restart) { + uint32_t wdog_panic = 1; + + if (restart) { + if (PE_parse_boot_argn("swd_panic", &wdog_panic, sizeof(wdog_panic)) && + (wdog_panic == 0)) { + return; + } + panic("Sleep/Wake hang detected\n"); + return; + } } void IOPMrootDomain::takeStackshot(bool restart, bool isOSXWatchdog, bool isSpinDump) diff --git a/iokit/Kernel/IOPlatformExpert.cpp b/iokit/Kernel/IOPlatformExpert.cpp index 31ab8b700..f3cfd8b0e 100644 --- a/iokit/Kernel/IOPlatformExpert.cpp +++ b/iokit/Kernel/IOPlatformExpert.cpp @@ -1542,6 +1542,7 @@ IOReturn IOPlatformExpertDevice::newUserClient( task_t owningTask, void * securi { newConnect->detach( this ); newConnect->release(); + err = kIOReturnNotPermitted; } else theConnect = newConnect; diff --git a/iokit/Kernel/IORegistryEntry.cpp b/iokit/Kernel/IORegistryEntry.cpp index 1d9cf8f9d..7af40d293 100644 --- a/iokit/Kernel/IORegistryEntry.cpp +++ b/iokit/Kernel/IORegistryEntry.cpp @@ -25,13 +25,6 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -/* - * Copyright (c) 1998 Apple Computer, Inc. All rights reserved. - * - * HISTORY - * 12 Nov 98 sdouglas created. 
- * - */ #include #include @@ -62,12 +55,21 @@ OSDefineMetaClassAndStructors(IORegistryEntry, OSObject) #define KASLR_IOREG_DEBUG 0 +struct IORegistryEntry::ExpansionData +{ + IORecursiveLock * fLock; + uint64_t fRegistryEntryID; + SInt32 fRegistryEntryGenerationCount; +}; + + static IORegistryEntry * gRegistryRoot; static OSDictionary * gIORegistryPlanes; const OSSymbol * gIONameKey; const OSSymbol * gIOLocationKey; const OSSymbol * gIORegistryEntryIDKey; +const OSSymbol * gIORegistryEntryPropertyKeysKey; enum { kParentSetIndex = 0, @@ -110,8 +112,8 @@ static SInt32 gIORegistryGenerationCount; gIORegistryGenerationCount++ // make atomic -#define PUNLOCK IORecursiveLockUnlock( gPropertiesLock ) -#define PLOCK IORecursiveLockLock( gPropertiesLock ) +#define PUNLOCK IORecursiveLockUnlock( reserved->fLock ) +#define PLOCK IORecursiveLockLock( reserved->fLock ) #define IOREGSPLITTABLES @@ -162,6 +164,7 @@ IORegistryEntry * IORegistryEntry::initialize( void ) gIONameKey = OSSymbol::withCStringNoCopy( "IOName" ); gIOLocationKey = OSSymbol::withCStringNoCopy( "IOLocation" ); gIORegistryEntryIDKey = OSSymbol::withCStringNoCopy( kIORegistryEntryIDKey ); + gIORegistryEntryPropertyKeysKey = OSSymbol::withCStringNoCopy( kIORegistryEntryPropertyKeysKey ); assert( ok && gIONameKey && gIOLocationKey ); @@ -182,6 +185,10 @@ SInt32 IORegistryEntry::getGenerationCount( void ) return( gIORegistryGenerationCount ); } +SInt32 IORegistryEntry::getRegistryEntryGenerationCount(void) const +{ + return (reserved->fRegistryEntryGenerationCount); +} const IORegistryPlane * IORegistryEntry::makePlane( const char * name ) { @@ -278,6 +285,8 @@ bool IORegistryEntry::init( OSDictionary * dict ) if (!reserved) return (false); bzero(reserved, sizeof(ExpansionData)); + reserved->fLock = IORecursiveLockAlloc(); + if (!reserved->fLock) return (false); } if( dict) { if (OSCollection::kImmutable & dict->setOptions(0, 0)) { @@ -328,13 +337,20 @@ bool IORegistryEntry::init( IORegistryEntry * old, if( 
!super::init()) return( false); + if (!reserved) + { + reserved = IONew(ExpansionData, 1); + if (!reserved) return (false); + bzero(reserved, sizeof(ExpansionData)); + reserved->fLock = IORecursiveLockAlloc(); + if (!reserved->fLock) return (false); + } + WLOCK; - reserved = old->reserved; - old->reserved = NULL; + reserved->fRegistryEntryID = old->reserved->fRegistryEntryID; - fPropertyTable = old->getPropertyTable(); - fPropertyTable->retain(); + fPropertyTable = old->dictionaryWithProperties(); #ifdef IOREGSPLITTABLES fRegistryTable = old->fRegistryTable; old->fRegistryTable = (OSDictionary *) fRegistryTable->copyCollection(); @@ -384,17 +400,21 @@ void IORegistryEntry::free( void ) #endif /* IOREGSPLITTABLES */ if (reserved) + { + if (reserved->fLock) IORecursiveLockFree(reserved->fLock); IODelete(reserved, ExpansionData, 1); + } super::free(); } void IORegistryEntry::setPropertyTable( OSDictionary * dict ) { - if( fPropertyTable) - fPropertyTable->release(); if( dict) dict->retain(); + if( fPropertyTable) + fPropertyTable->release(); + fPropertyTable = dict; } @@ -473,11 +493,22 @@ bool IORegistryEntry::serializeProperties( OSSerialize * s ) const OSCollection *snapshotProperties = getPropertyTable()->copyCollection(); PUNLOCK; + if (!snapshotProperties) return (false); + bool ok = snapshotProperties->serialize( s ); snapshotProperties->release(); return( ok ); } +OSArray * IORegistryEntry::copyPropertyKeys(void) const +{ + PLOCK; + OSArray * keys = getPropertyTable()->copyKeys(); + PUNLOCK; + + return (keys); +} + OSDictionary * IORegistryEntry::dictionaryWithProperties( void ) const { OSDictionary * dict; @@ -1334,6 +1365,7 @@ bool IORegistryEntry::makeLink( IORegistryEntry * to, links->release(); } } + reserved->fRegistryEntryGenerationCount++; return( result); } @@ -1354,6 +1386,7 @@ void IORegistryEntry::breakLink( IORegistryEntry * to, registryTable()->removeObject( plane->keys[ relation ]); } } + reserved->fRegistryEntryGenerationCount++; } @@ -1453,6 
+1486,18 @@ OSIterator * IORegistryEntry::getChildIterator( const IORegistryPlane * plane ) return( iter ); } +uint32_t IORegistryEntry::getChildCount( const IORegistryPlane * plane ) const +{ + OSArray * links; + uint32_t count = 0; + + RLOCK; + links = getChildSetReference( plane ); + if (links) count = links->getCount(); + UNLOCK; + + return (count); +} IORegistryEntry * IORegistryEntry::copyChildEntry( const IORegistryPlane * plane ) const diff --git a/iokit/Kernel/IOReportLegend.cpp b/iokit/Kernel/IOReportLegend.cpp index 20ae27af8..47d5e206a 100644 --- a/iokit/Kernel/IOReportLegend.cpp +++ b/iokit/Kernel/IOReportLegend.cpp @@ -56,7 +56,7 @@ IOReportLegend::with(OSArray *legend) if (legend != NULL) { if (iorLegend->initWith(legend) != kIOReturnSuccess) { - delete iorLegend; + OSSafeReleaseNULL(iorLegend); return NULL; } } diff --git a/iokit/Kernel/IOReporter.cpp b/iokit/Kernel/IOReporter.cpp index e132bc17e..81b6bfb94 100644 --- a/iokit/Kernel/IOReporter.cpp +++ b/iokit/Kernel/IOReporter.cpp @@ -216,12 +216,6 @@ IOReporter::init(IOService *reportingService, success = true; finish: - if (!success) { - if (_configLock) IOLockFree(_configLock); - if (_reporterLock) IOSimpleLockFree(_reporterLock); - if (_channelNames) _channelNames->release(); - } - return success; } @@ -235,6 +229,8 @@ IOReporter::init(IOService *reportingService, void IOReporter::free(void) { + OSSafeReleaseNULL(_channelNames); + if (_configLock) IOLockFree(_configLock); if (_reporterLock) IOSimpleLockFree(_reporterLock); diff --git a/iokit/Kernel/IOService.cpp b/iokit/Kernel/IOService.cpp index 356ca01ba..bbb8781d1 100644 --- a/iokit/Kernel/IOService.cpp +++ b/iokit/Kernel/IOService.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2014 Apple Inc. All rights reserved. + * Copyright (c) 1998-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -77,6 +77,7 @@ OSDefineMetaClassAndStructors(IOService, IORegistryEntry) OSDefineMetaClassAndStructors(_IOServiceNotifier, IONotifier) +OSDefineMetaClassAndStructors(_IOServiceNullNotifier, IONotifier) OSDefineMetaClassAndStructors(_IOServiceInterestNotifier, IONotifier) @@ -102,10 +103,12 @@ const OSSymbol * gIOInterruptSpecifiersKey; const OSSymbol * gIOResourcesKey; const OSSymbol * gIOResourceMatchKey; +const OSSymbol * gIOResourceMatchedKey; const OSSymbol * gIOProviderClassKey; const OSSymbol * gIONameMatchKey; const OSSymbol * gIONameMatchedKey; const OSSymbol * gIOPropertyMatchKey; +const OSSymbol * gIOPropertyExistsMatchKey; const OSSymbol * gIOLocationMatchKey; const OSSymbol * gIOParentMatchKey; const OSSymbol * gIOPathMatchKey; @@ -151,6 +154,12 @@ const OSSymbol * gIOAppPowerStateInterest; const OSSymbol * gIOPriorityPowerStateInterest; const OSSymbol * gIOConsoleSecurityInterest; +const OSSymbol * gIOBSDKey; +const OSSymbol * gIOBSDNameKey; +const OSSymbol * gIOBSDMajorKey; +const OSSymbol * gIOBSDMinorKey; +const OSSymbol * gIOBSDUnitKey; + const OSSymbol * gAKSGetKey; #if defined(__i386__) || defined(__x86_64__) const OSSymbol * gIOCreateEFIDevicePathSymbol; @@ -187,6 +196,7 @@ const OSSymbol * gIOPlatformFunctionHandlerSet; static IOLock * gIOConsoleUsersLock; static thread_call_t gIOConsoleLockCallout; +static IONotifier * gIOServiceNullNotifier; /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ @@ -293,6 +303,7 @@ void IOService::initialize( void ) gIONameMatchKey = OSSymbol::withCStringNoCopy( kIONameMatchKey ); gIONameMatchedKey = OSSymbol::withCStringNoCopy( kIONameMatchedKey ); gIOPropertyMatchKey = OSSymbol::withCStringNoCopy( kIOPropertyMatchKey ); + gIOPropertyExistsMatchKey = OSSymbol::withCStringNoCopy( kIOPropertyExistsMatchKey ); gIOPathMatchKey = OSSymbol::withCStringNoCopy( kIOPathMatchKey ); gIOLocationMatchKey = OSSymbol::withCStringNoCopy( 
kIOLocationMatchKey ); gIOParentMatchKey = OSSymbol::withCStringNoCopy( kIOParentMatchKey ); @@ -305,8 +316,9 @@ void IOService::initialize( void ) gIOUserClientClassKey = OSSymbol::withCStringNoCopy( kIOUserClientClassKey ); - gIOResourcesKey = OSSymbol::withCStringNoCopy( kIOResourcesClass ); - gIOResourceMatchKey = OSSymbol::withCStringNoCopy( kIOResourceMatchKey ); + gIOResourcesKey = OSSymbol::withCStringNoCopy( kIOResourcesClass ); + gIOResourceMatchKey = OSSymbol::withCStringNoCopy( kIOResourceMatchKey ); + gIOResourceMatchedKey = OSSymbol::withCStringNoCopy( kIOResourceMatchedKey ); gIODeviceMemoryKey = OSSymbol::withCStringNoCopy( "IODeviceMemory" ); gIOInterruptControllersKey @@ -326,6 +338,12 @@ void IOService::initialize( void ) gIOPriorityPowerStateInterest = OSSymbol::withCStringNoCopy( kIOPriorityPowerStateInterest ); gIOConsoleSecurityInterest = OSSymbol::withCStringNoCopy( kIOConsoleSecurityInterest ); + gIOBSDKey = OSSymbol::withCStringNoCopy(kIOBSDKey); + gIOBSDNameKey = OSSymbol::withCStringNoCopy(kIOBSDNameKey); + gIOBSDMajorKey = OSSymbol::withCStringNoCopy(kIOBSDMajorKey); + gIOBSDMinorKey = OSSymbol::withCStringNoCopy(kIOBSDMinorKey); + gIOBSDUnitKey = OSSymbol::withCStringNoCopy(kIOBSDUnitKey); + gNotifications = OSDictionary::withCapacity( 1 ); gIOPublishNotification = OSSymbol::withCStringNoCopy( kIOPublishNotification ); @@ -399,6 +417,9 @@ void IOService::initialize( void ) gIOResources = IOResources::resources(); assert( gIOResources ); + gIOServiceNullNotifier = OSTypeAlloc(_IOServiceNullNotifier); + assert(gIOServiceNullNotifier); + gArbitrationLockQueueLock = IOLockAlloc(); queue_init(&gArbitrationLockQueueActive); queue_init(&gArbitrationLockQueueWaiting); @@ -411,7 +432,6 @@ void IOService::initialize( void ) gIOStopProviderList = OSArray::withCapacity( 16 ); gIOFinalizeList = OSArray::withCapacity( 16 ); assert( gIOTerminatePhase2List && gIOStopList && gIOStopProviderList && gIOFinalizeList ); - } /* * * * * * * * * * * * * * * * 
* * * * * * * * * * * * * * * * * * * * */ @@ -419,6 +439,7 @@ void IOService::initialize( void ) #if defined(__i386__) || defined(__x86_64__) extern "C" { +const char *getCpuDelayBusStallHolderName(void); const char *getCpuDelayBusStallHolderName(void) { return sCPULatencyHolderName[kCpuDelayBusStall]; } @@ -559,7 +580,11 @@ void IOService::free( void ) */ bool IOService::attach( IOService * provider ) { - bool ok; + bool ok; + uint32_t count; + AbsoluteTime deadline; + int waitResult = THREAD_AWAKENED; + bool wait, computeDeadline = true; if( provider) { @@ -567,12 +592,39 @@ bool IOService::attach( IOService * provider ) LOG( "%s::attach(%s)\n", getName(), provider->getName()); - provider->lockForArbitration(); - if( provider->__state[0] & kIOServiceInactiveState) - ok = false; - else - ok = attachToParent( provider, gIOServicePlane); - provider->unlockForArbitration(); + ok = false; + do + { + wait = false; + provider->lockForArbitration(); + if (provider->__state[0] & kIOServiceInactiveState) ok = false; + else + { + count = provider->getChildCount(gIOServicePlane); + wait = (count > (kIOServiceBusyMax - 4)); + if (!wait) ok = attachToParent(provider, gIOServicePlane); + else + { + IOLog("stalling for detach from %s\n", provider->getName()); + IOLockLock( gIOServiceBusyLock ); + provider->__state[1] |= kIOServiceWaitDetachState; + } + } + provider->unlockForArbitration(); + if (wait) + { + if (computeDeadline) + { + clock_interval_to_deadline(15, kSecondScale, &deadline); + computeDeadline = false; + } + assert_wait_deadline((event_t)&provider->__provider, THREAD_UNINT, deadline); + IOLockUnlock( gIOServiceBusyLock ); + waitResult = thread_block(THREAD_CONTINUE_NULL); + wait = (waitResult != THREAD_TIMED_OUT); + } + } + while (wait); } else { gIOServiceRoot = this; @@ -645,6 +697,15 @@ void IOService::detach( IOService * provider ) && (0 == provider->getClient())) { provider->scheduleFinalize(false); } + + IOLockLock( gIOServiceBusyLock ); + if 
(kIOServiceWaitDetachState & provider->__state[1]) + { + provider->__state[1] &= ~kIOServiceWaitDetachState; + thread_wakeup(&provider->__provider); + } + IOLockUnlock( gIOServiceBusyLock ); + provider->unlockForArbitration(); } } @@ -899,7 +960,7 @@ IOService * IOService::getProvider( void ) const IOService * parent; SInt32 generation; - generation = getGenerationCount(); + generation = getRegistryEntryGenerationCount(); if( __providerGeneration == generation) return( __provider ); @@ -3116,7 +3177,7 @@ void IOService::probeCandidates( OSOrderedSet * matches ) // alloc the driver instance inst = (IOService *) OSMetaClass::allocClassWithName( symbol); - if( !inst) { + if( !inst || !OSDynamicCast(IOService, inst)) { IOLog("Couldn't alloc class \"%s\"\n", symbol->getCStringNoCopy()); continue; @@ -3475,6 +3536,7 @@ void IOService::doServiceMatch( IOOptionBits options ) _IOServiceNotifier * notify; OSIterator * iter; OSOrderedSet * matches; + OSArray * resourceKeys = 0; SInt32 catalogGeneration; bool keepGuessing = true; bool reRegistered = true; @@ -3521,7 +3583,14 @@ void IOService::doServiceMatch( IOOptionBits options ) unlockForArbitration(); if (keepGuessing && matches->getCount() && (kIOReturnSuccess == getResources())) + { + if (this == gIOResources) + { + if (resourceKeys) resourceKeys->release(); + resourceKeys = copyPropertyKeys(); + } probeCandidates( matches ); + } else matches->release(); } @@ -3539,6 +3608,9 @@ void IOService::doServiceMatch( IOOptionBits options ) if( (0 == (__state[0] & kIOServiceInactiveState)) && (0 == (__state[1] & kIOServiceModuleStallState)) ) { + + if (resourceKeys) setProperty(gIOResourceMatchedKey, resourceKeys); + deliverNotification( gIOMatchedNotification, kIOServiceMatchedState, 0xffffffff ); if( 0 == (__state[0] & kIOServiceFirstMatchState)) @@ -3546,6 +3618,8 @@ void IOService::doServiceMatch( IOOptionBits options ) kIOServiceFirstMatchState, 0xffffffff ); } + if (resourceKeys) resourceKeys->release(); + __state[1] &= 
~kIOServiceConfigState; scheduleTerminatePhase2(); @@ -3709,55 +3783,101 @@ IOReturn IOService::waitForState( UInt32 mask, UInt32 value, return( kIOReturnSuccess ); } +#if NO_KEXTD +#define WAITING_KEXTD false +#else +extern bool gIOKextdClearedBusy; +#define WAITING_KEXTD (false == gIOKextdClearedBusy) +#endif + IOReturn IOService::waitQuiet( uint64_t timeout ) { IOReturn ret; - ret = waitForState( kIOServiceBusyStateMask, 0, timeout ); - if ((kIOReturnTimeout == ret) && (timeout >= 41000000000) && (kIOWaitQuietPanics & gIOKitDebug)) + uint32_t loops; + char * string = NULL; + size_t len; + uint64_t time; + uint64_t nano; + + time = mach_absolute_time(); + for (loops = 0; loops < 2; loops++) { - IORegistryIterator * iter; - OSOrderedSet * set; - OSOrderedSet * leaves; - IOService * next; - IOService * nextParent; - char * string; - char * s; - size_t len, l; - - len = 256; - string = IONew(char, len); - set = NULL; - iter = IORegistryIterator::iterateOver(this, gIOServicePlane, kIORegistryIterateRecursively); - leaves = OSOrderedSet::withCapacity(4); - if (iter) set = iter->iterateAll(); - if (string && leaves && set) - { - while ((next = (IOService *) set->getLastObject())) - { - if (next->getBusyState()) - { - leaves->setObject(next); - nextParent = next; - while ((nextParent = nextParent->getProvider())) - { - set->removeObject(nextParent); - leaves->removeObject(nextParent); - } - } - set->removeObject(next); - } - s = string; - while ((next = (IOService *) leaves->getLastObject())) - { - l = snprintf(s, len, "%s'%s'", ((s == string) ? "" : ", "), next->getName()); - if (l >= len) break; - s += l; - len -= l; - leaves->removeObject(next); - } - } - panic("busy timeout(%llds): %s", timeout / 1000000000ULL, string ? 
string : ""); + ret = waitForState( kIOServiceBusyStateMask, 0, timeout ); + + if (loops && (kIOReturnSuccess == ret)) + { + time = mach_absolute_time() - time; + absolutetime_to_nanoseconds(*(AbsoluteTime *)&time, &nano); + IOLog("busy extended ok[%d], (%llds, %llds), kextd wait(%d): %s\n", + loops, timeout / 1000000000ULL, nano / 1000000000ULL, WAITING_KEXTD, + string ? string : ""); + break; + } + else if (kIOReturnTimeout != ret) break; + else if (timeout < 41000000000) break; + + if (!loops) + { + IORegistryIterator * iter; + OSOrderedSet * set; + OSOrderedSet * leaves; + IOService * next; + IOService * nextParent; + char * s; + size_t l; + + len = 256; + string = IONew(char, len); + set = NULL; + iter = IORegistryIterator::iterateOver(this, gIOServicePlane, kIORegistryIterateRecursively); + leaves = OSOrderedSet::withCapacity(4); + if (iter) set = iter->iterateAll(); + if (string && leaves && set) + { + while ((next = (IOService *) set->getLastObject())) + { + if (next->getBusyState()) + { + leaves->setObject(next); + nextParent = next; + while ((nextParent = nextParent->getProvider())) + { + set->removeObject(nextParent); + leaves->removeObject(nextParent); + } + } + set->removeObject(next); + } + s = string; + while ((next = (IOService *) leaves->getLastObject())) + { + l = snprintf(s, len, "%s'%s'", ((s == string) ? "" : ", "), next->getName()); + if (l >= len) break; + s += l; + len -= l; + leaves->removeObject(next); + } + } + OSSafeReleaseNULL(leaves); + OSSafeReleaseNULL(set); + OSSafeReleaseNULL(iter); + } + if (loops && (kIOWaitQuietPanics & gIOKitDebug)) + { + panic("busy timeout[%d], (%llds), kextd wait(%d): %s", + loops, timeout / 1000000000ULL, WAITING_KEXTD, + string ? string : ""); + } + else + { + IOLog("busy timeout[%d], (%llds), kextd wait(%d): %s\n", + loops, timeout / 1000000000ULL, WAITING_KEXTD, + string ? 
string : ""); + } } + + if (string) IODelete(string, char, 256); + return (ret); } @@ -4258,9 +4378,12 @@ IONotifier * IOService::doInstallNotification( else if( type == gIOFirstPublishNotification) inState = kIOServiceFirstPublishState; - else if( (type == gIOMatchedNotification) - || (type == gIOFirstMatchNotification)) + else if (type == gIOMatchedNotification) inState = kIOServiceMatchedState; + + else if (type == gIOFirstMatchNotification) + inState = kIOServiceFirstMatchState; + else if( type == gIOTerminatedNotification) inState = 0; else @@ -4314,6 +4437,9 @@ IONotifier * IOService::installNotification( notify = doInstallNotification( type, matching, handler, target, ref, priority, existing ); + // in case handler remove()s + if (notify) notify->retain(); + UNLOCKNOTIFY(); return( notify ); @@ -4347,16 +4473,17 @@ IONotifier * IOService::addMatchingNotification( SInt32 priority ) { OSIterator * existing = NULL; + IONotifier * ret; _IOServiceNotifier * notify; IOService * next; - notify = (_IOServiceNotifier *) installNotification( type, matching, + ret = notify = (_IOServiceNotifier *) installNotification( type, matching, handler, target, ref, priority, &existing ); + if (!ret) return (0); // send notifications for existing set - if( existing) { + if (existing) { - notify->retain(); // in case handler remove()s while( (next = (IOService *) existing->getNextObject())) { next->lockForArbitration(); @@ -4364,11 +4491,16 @@ IONotifier * IOService::addMatchingNotification( next->invokeNotifer( notify ); next->unlockForArbitration(); } - notify->release(); existing->release(); } - return( notify ); + LOCKWRITENOTIFY(); + bool removed = (0 == notify->whence); + notify->release(); + if (removed) ret = gIOServiceNullNotifier; + UNLOCKNOTIFY(); + + return( ret ); } bool IOService::syncNotificationHandler( @@ -4723,6 +4855,19 @@ void _IOServiceNotifier::enable( bool was ) UNLOCKNOTIFY(); } + +/* + * _IOServiceNullNotifier + */ + +void 
_IOServiceNullNotifier::taggedRetain(const void *tag) const {} +void _IOServiceNullNotifier::taggedRelease(const void *tag, const int when) const {} +void _IOServiceNullNotifier::free() {} +void _IOServiceNullNotifier::wait() {} +void _IOServiceNullNotifier::remove() {} +void _IOServiceNullNotifier::enable(bool was) {} +bool _IOServiceNullNotifier::disable() { return(false); } + /* * IOResources */ @@ -4743,7 +4888,7 @@ IOService * IOResources::resources( void ) bool IOResources::init( OSDictionary * dictionary ) { // Do super init first - if ( !super::init() ) + if ( !IOService::init() ) return false; // Allow PAL layer to publish a value @@ -4791,6 +4936,7 @@ bool IOResources::matchPropertyTable( OSDictionary * table ) OSString * str; OSSet * set; OSIterator * iter; + OSArray * keys; bool ok = true; prop = table->getObject( gIOResourceMatchKey ); @@ -4808,6 +4954,17 @@ bool IOResources::matchPropertyTable( OSDictionary * table ) if( iter) iter->release(); } + else if ((prop = table->getObject(gIOResourceMatchedKey))) + { + keys = (OSArray *) copyProperty(gIOResourceMatchedKey); + ok = false; + if (keys) + { + // assuming OSSymbol + ok = ((-1U) != keys->getNextIndexOfObject(prop, 0)); + keys->release(); + } + } return( ok ); } @@ -4974,11 +5131,16 @@ bool IOService::compareProperty( OSDictionary * matching, const char * key ) { OSObject * value; + OSObject * prop; bool ok; value = matching->getObject( key ); if( value) - ok = value->isEqualTo( getProperty( key )); + { + prop = copyProperty(key); + ok = value->isEqualTo(prop); + if (prop) prop->release(); + } else ok = true; @@ -4990,11 +5152,16 @@ bool IOService::compareProperty( OSDictionary * matching, const OSString * key ) { OSObject * value; + OSObject * prop; bool ok; value = matching->getObject( key ); if( value) - ok = value->isEqualTo( getProperty( key )); + { + prop = copyProperty(key); + ok = value->isEqualTo(prop); + if (prop) prop->release(); + } else ok = true; @@ -5074,8 +5241,8 @@ bool 
IOService::matchInternal(OSDictionary * table, uint32_t options, uint32_t * { count = table->getCount(); done = 0; - str = OSDynamicCast(OSString, table->getObject(gIOProviderClassKey)); + str = OSDynamicCast(OSString, table->getObject(gIOProviderClassKey)); if (str) { done++; match = ((kIOServiceClassDone & options) || (0 != metaCast(str))); @@ -5145,6 +5312,38 @@ bool IOService::matchInternal(OSDictionary * table, uint32_t options, uint32_t * if ((!match) || (done == count)) break; } + obj = table->getObject( gIOPropertyExistsMatchKey ); + if( obj) + { + OSDictionary * dict; + OSString * nextKey; + OSIterator * iter; + done++; + match = false; + dict = dictionaryWithProperties(); + if( dict) { + nextKey = OSDynamicCast( OSString, obj); + if( nextKey) + iter = 0; + else + iter = OSCollectionIterator::withCollection( + OSDynamicCast(OSCollection, obj)); + + while( nextKey + || (iter && (0 != (nextKey = OSDynamicCast(OSString, + iter->getNextObject()))))) { + match = (0 != dict->getObject(nextKey)); + if( match) + break; + nextKey = 0; + } + dict->release(); + if( iter) + iter->release(); + } + if ((!match) || (done == count)) break; + } + str = OSDynamicCast( OSString, table->getObject( gIOPathMatchKey )); if( str) { done++; @@ -5196,10 +5395,10 @@ bool IOService::matchInternal(OSDictionary * table, uint32_t options, uint32_t * if (prop) prop->release(); \ if ((!match) || (done == count)) break; \ } - propMatch(kIOBSDNameKey) - propMatch(kIOBSDMajorKey) - propMatch(kIOBSDMinorKey) - propMatch(kIOBSDUnitKey) + propMatch(gIOBSDNameKey) + propMatch(gIOBSDMajorKey) + propMatch(gIOBSDMinorKey) + propMatch(gIOBSDUnitKey) #undef propMatch } while (false); @@ -5229,7 +5428,7 @@ bool IOService::matchPassive(OSDictionary * table, uint32_t options) OSArray* aliasServiceRegIds = NULL; IOService* foundAlternateService = NULL; -#if MATCH_DEBUG +#if MATCH_DEBUG OSDictionary * root = table; #endif @@ -5239,7 +5438,6 @@ bool IOService::matchPassive(OSDictionary * table, uint32_t 
options) do { count = table->getCount(); - if (!(kIOServiceInternalDone & options)) { match = where->matchInternal(table, options, &done); @@ -5275,7 +5473,7 @@ bool IOService::matchPassive(OSDictionary * table, uint32_t options) nextTable = OSDynamicCast(OSDictionary, table->getObject( gIOParentMatchKey )); - if(nextTable) { + if( nextTable) { // look for a matching entry anywhere up to root match = false; matchParent = true; @@ -5339,11 +5537,11 @@ bool IOService::matchPassive(OSDictionary * table, uint32_t options) } while( where != NULL ); - OSSafeRelease(foundAlternateService); - OSSafeRelease(aliasServiceRegIds); + OSSafeReleaseNULL(foundAlternateService); + OSSafeReleaseNULL(aliasServiceRegIds); #if MATCH_DEBUG - if (where != this) + if (where != this) { OSSerialize * s = OSSerialize::withCapacity(128); root->serialize(s); @@ -6229,31 +6427,23 @@ IOReturn IOService::configureReport(IOReportChannelList *channelList, } } - /* 24241819: SU fix for NULL 'reserved' field */ - if (reserved) { - IOLockLock(reserved->interruptStatisticsLock); - - /* The array count is signed (because the interrupt indices are signed), hence the cast */ - for (cnt = 0; cnt < (unsigned) reserved->interruptStatisticsArrayCount; cnt++) { - if (reserved->interruptStatisticsArray[cnt].reporter) { - /* - * If the reporter is currently associated with the statistics - * for an event source, we may need to update the reporter. 
- */ - if (reserved->interruptStatisticsArray[cnt].statistics) - interruptAccountingDataUpdateChannels(reserved->interruptStatisticsArray[cnt].statistics, reserved->interruptStatisticsArray[cnt].reporter); - - reserved->interruptStatisticsArray[cnt].reporter->configureReport(channelList, action, result, destination); - } - } + IOLockLock(reserved->interruptStatisticsLock); - IOLockUnlock(reserved->interruptStatisticsLock); - } - #if DEVELOPMENT || DEBUG - else { - IOLog("ALERT: why is %s's 'reserved' field NULL?!\n", getName()); - } - #endif + /* The array count is signed (because the interrupt indices are signed), hence the cast */ + for (cnt = 0; cnt < (unsigned) reserved->interruptStatisticsArrayCount; cnt++) { + if (reserved->interruptStatisticsArray[cnt].reporter) { + /* + * If the reporter is currently associated with the statistics + * for an event source, we may need to update the reporter. + */ + if (reserved->interruptStatisticsArray[cnt].statistics) + interruptAccountingDataUpdateChannels(reserved->interruptStatisticsArray[cnt].statistics, reserved->interruptStatisticsArray[cnt].reporter); + + reserved->interruptStatisticsArray[cnt].reporter->configureReport(channelList, action, result, destination); + } + } + + IOLockUnlock(reserved->interruptStatisticsLock); return kIOReturnSuccess; } diff --git a/iokit/Kernel/IOServicePM.cpp b/iokit/Kernel/IOServicePM.cpp index 30612ce42..25021b3d0 100644 --- a/iokit/Kernel/IOServicePM.cpp +++ b/iokit/Kernel/IOServicePM.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1998-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -1522,17 +1522,18 @@ bool IOService::handleAcknowledgePowerChange( IOPMRequest * request ) // make sure we're expecting this ack if ( informee->timer != 0 ) { -#if LOG_SETPOWER_TIMES + if (informee->timer > 0) { uint64_t nsec = computeTimeDeltaNS(&informee->startTime); if (nsec > LOG_SETPOWER_TIMES) { getPMRootDomain()->pmStatsRecordApplicationResponse( gIOPMStatsDriverPSChangeSlow, informee->whatObject->getName(), - fDriverCallReason, NS_TO_MS(nsec), 0, NULL, fHeadNotePowerState); + fDriverCallReason, NS_TO_MS(nsec), informee->whatObject->getRegistryEntryID(), + NULL, fHeadNotePowerState); } } -#endif + // mark it acked informee->timer = 0; // that's one fewer to worry about @@ -2750,10 +2751,12 @@ unsigned long IOService::currentPowerConsumption( void ) // [deprecated] getPMworkloop //********************************************************************************* +#ifndef __LP64__ IOWorkLoop * IOService::getPMworkloop( void ) { return gIOPMWorkLoop; } +#endif #if NOT_YET @@ -3494,6 +3497,7 @@ bool IOService::notifyInterestedDrivers( void ) IOPMinformeeList * list = fInterestedDrivers; DriverCallParam * param; IOItemCount count; + IOItemCount skipCnt = 0; PM_ASSERT_IN_GATE(); assert( fDriverCallParamCount == 0 ); @@ -3533,12 +3537,25 @@ bool IOService::notifyInterestedDrivers( void ) assert(informee); for (IOItemCount i = 0; i < count; i++) { + if (fInitialSetPowerState || (fHeadNoteChangeFlags & kIOPMInitialPowerChange)) { + // Skip notifying self, if 'kIOPMInitialDeviceState' is set and + // this is the initial power state change + if ((this == informee->whatObject) && + (fHeadNotePowerArrayEntry->capabilityFlags & kIOPMInitialDeviceState)) { + skipCnt++; + continue; + } + } informee->timer = -1; param[i].Target = informee; informee->retain(); informee = list->nextInList( informee ); } + count -= skipCnt; + if (!count) { + goto done; + } fDriverCallParamCount = count; fHeadNotePendingAcks = count; @@ -3967,7 
+3984,7 @@ void IOService::driverSetPowerState( void ) fName, OBFUSCATE(this), fCurrentPowerState, powerState, result); } -#if LOG_SETPOWER_TIMES + if ((result == IOPMAckImplied) || (result < 0)) { uint64_t nsec; @@ -3977,10 +3994,11 @@ void IOService::driverSetPowerState( void ) if (nsec > LOG_SETPOWER_TIMES) { getPMRootDomain()->pmStatsRecordApplicationResponse( gIOPMStatsDriverPSChangeSlow, - fName, kDriverCallSetPowerState, NS_TO_MS(nsec), 0, NULL, powerState); + fName, kDriverCallSetPowerState, NS_TO_MS(nsec), getRegistryEntryID(), + NULL, powerState); } } -#endif + } else result = kIOPMAckImplied; @@ -4046,7 +4064,7 @@ void IOService::driverInformPowerChange( void ) deassertPMDriverCall(&callEntry); -#if LOG_SETPOWER_TIMES + if ((result == IOPMAckImplied) || (result < 0)) { uint64_t nsec; @@ -4056,10 +4074,11 @@ void IOService::driverInformPowerChange( void ) if (nsec > LOG_SETPOWER_TIMES) { getPMRootDomain()->pmStatsRecordApplicationResponse( gIOPMStatsDriverPSChangeSlow, driver->getName(), - fDriverCallReason, NS_TO_MS(nsec), 0, NULL, powerState); + fDriverCallReason, NS_TO_MS(nsec), driver->getRegistryEntryID(), + NULL, powerState); } } -#endif + } else result = kIOPMAckImplied; @@ -5415,6 +5434,13 @@ IOService::watchdog_timer_expired( thread_call_param_t arg0, thread_call_param_t } +IOWorkLoop * IOService::getIOPMWorkloop( void ) +{ + return gIOPMWorkLoop; +} + + + //********************************************************************************* // [private] start_ack_timer //********************************************************************************* @@ -5698,7 +5724,7 @@ static void logAppTimeouts( OSObject * object, void * arg ) IOPMInterestContext * context = (IOPMInterestContext *) arg; OSObject * flag; unsigned int clientIndex; - int pid = -1; + int pid = 0; char name[128]; if (OSDynamicCast(_IOServiceInterestNotifier, object)) @@ -5903,6 +5929,7 @@ bool IOService::tellClientsWithResponse( int messageType ) break; case 
kNotifyCapabilityChangePriority: + context.enableTracing = isRootDomain; applyToInterested( gIOPriorityPowerStateInterest, pmTellCapabilityClientWithResponse, (void *) &context ); break; @@ -5912,8 +5939,9 @@ bool IOService::tellClientsWithResponse( int messageType ) if ( !checkForDone() ) { OUR_PMLog(kPMLogStartAckTimer, context.maxTimeRequested, 0); - if (context.enableTracing) - getPMRootDomain()->traceDetail( context.maxTimeRequested / 1000 ); + if (context.enableTracing) { + getPMRootDomain()->traceDetail(context.messageType, 0, context.maxTimeRequested / 1000); + } start_ack_timer( context.maxTimeRequested / 1000, kMillisecondScale ); return false; } @@ -6006,7 +6034,7 @@ void IOService::pmTellAppWithResponse( OSObject * object, void * arg ) if (waitForReply == kOSBooleanTrue) { -#if LOG_APP_RESPONSE_TIMES + OSNumber * num; clock_get_uptime(&now); num = OSNumber::withNumber(AbsoluteTime_to_scalar(&now), sizeof(uint64_t) * 8); @@ -6015,9 +6043,9 @@ void IOService::pmTellAppWithResponse( OSObject * object, void * arg ) context->responseArray->setObject(msgIndex, num); num->release(); } - else -#endif - context->responseArray->setObject(msgIndex, kOSBooleanFalse); + else { + context->responseArray->setObject(msgIndex, kOSBooleanFalse); + } } else { @@ -6096,10 +6124,7 @@ void IOService::pmTellClientWithResponse( OSObject * object, void * arg ) if (context->enableTracing && (notifier != 0)) { - uint32_t detail = ((msgIndex & 0xff) << 24) | - ((msgType & 0xfff) << 12) | - (((uintptr_t) notifier->handler) & 0xfff); - getPMRootDomain()->traceDetail( detail ); + getPMRootDomain()->traceDetail(msgType, msgIndex, (uintptr_t) notifier->handler); } retCode = context->us->messageClient(msgType, object, (void *) ¬ify, sizeof(notify)); @@ -6200,7 +6225,7 @@ void IOService::pmTellCapabilityAppWithResponse( OSObject * object, void * arg ) } else { -#if LOG_APP_RESPONSE_TIMES + OSNumber * num; clock_get_uptime(&now); num = OSNumber::withNumber(AbsoluteTime_to_scalar(&now), 
sizeof(uint64_t) * 8); @@ -6209,9 +6234,9 @@ void IOService::pmTellCapabilityAppWithResponse( OSObject * object, void * arg ) context->responseArray->setObject(msgIndex, num); num->release(); } - else -#endif - context->responseArray->setObject(msgIndex, kOSBooleanFalse); + else { + context->responseArray->setObject(msgIndex, kOSBooleanFalse); + } if (context->notifyClients) context->notifyClients->setObject(msgIndex, object); @@ -6279,10 +6304,7 @@ void IOService::pmTellCapabilityClientWithResponse( if (context->enableTracing && (notifier != 0)) { - uint32_t detail = ((msgIndex & 0xff) << 24) | - ((msgType & 0xfff) << 12) | - (((uintptr_t) notifier->handler) & 0xfff); - getPMRootDomain()->traceDetail( detail ); + getPMRootDomain()->traceDetail(msgType, msgIndex, (uintptr_t) notifier->handler); } retCode = context->us->messageClient( @@ -6582,7 +6604,7 @@ bool IOService::responseValid( uint32_t refcon, int pid ) OSNumber * num; if ((num = OSDynamicCast(OSNumber, theFlag))) { -#if LOG_APP_RESPONSE_TIMES + AbsoluteTime now; AbsoluteTime start; uint64_t nsec; @@ -6623,7 +6645,7 @@ bool IOService::responseValid( uint32_t refcon, int pid ) name, 0, NS_TO_MS(nsec), pid, object); } -#endif + theFlag = kOSBooleanFalse; } else if (object) { @@ -6863,6 +6885,7 @@ IOReturn IOService::updatePowerStatesReport( IOReportConfigureAction action, voi STATEREPORT_UPDATERES(fReportBuf, kIOReportCopyChannelData, result); dest->appendBytes(data2cpy, size2cpy); + break; default: break; @@ -6953,6 +6976,7 @@ IOReturn IOService::updateSimplePowerReport( IOReportConfigureAction action, voi SIMPLEREPORT_UPDATERES(kIOReportCopyChannelData, result); dest->appendBytes(data2cpy, size2cpy); + break; default: break; @@ -7420,8 +7444,13 @@ bool IOService::actionPMWorkQueueInvoke( IOPMRequest * request, IOPMWorkQueue * { OUR_PMLog(kPMLogIdleCancel, (uintptr_t) this, fMachineState); PM_ERROR("%s: idle cancel, state %u\n", fName, fMachineState); - // yes, rescind the warning - 
tellNoChangeDown(fHeadNotePowerState); + if (IS_ROOT_DOMAIN) { + // RootDomain already sent "WillSleep" to its clients + tellChangeUp(fCurrentPowerState); + } + else { + tellNoChangeDown(fHeadNotePowerState); + } // mark the change note un-actioned fHeadNoteChangeFlags |= kIOPMNotDone; // and we're done @@ -7435,8 +7464,13 @@ bool IOService::actionPMWorkQueueInvoke( IOPMRequest * request, IOPMWorkQueue * { OUR_PMLog(kPMLogIdleCancel, (uintptr_t) this, fMachineState); PM_ERROR("%s: idle cancel, state %u\n", fName, fMachineState); - // yes, rescind the warning - tellNoChangeDown(fHeadNotePowerState); + if (IS_ROOT_DOMAIN) { + // RootDomain already sent "WillSleep" to its clients + tellChangeUp(fCurrentPowerState); + } + else { + tellNoChangeDown(fHeadNotePowerState); + } // mark the change note un-actioned fHeadNoteChangeFlags |= kIOPMNotDone; // and we're done @@ -7840,14 +7874,15 @@ bool IOService::actionPMReplyQueue( IOPMRequest * request, IOPMRequestQueue * qu // expected ack, stop the timer stop_ack_timer(); -#if LOG_SETPOWER_TIMES + uint64_t nsec = computeTimeDeltaNS(&fDriverCallStartTime); if (nsec > LOG_SETPOWER_TIMES) { getPMRootDomain()->pmStatsRecordApplicationResponse( gIOPMStatsDriverPSChangeSlow, - fName, kDriverCallSetPowerState, NS_TO_MS(nsec), 0, NULL, fHeadNotePowerState); + fName, kDriverCallSetPowerState, NS_TO_MS(nsec), getRegistryEntryID(), + NULL, fHeadNotePowerState); } -#endif + OUR_PMLog(kPMLogDriverAcknowledgeSet, (uintptr_t) this, fDriverTimer); fDriverTimer = 0; more = true; @@ -8817,6 +8852,6 @@ void IOServicePM::pmTrace( ((char *) &name)[sizeof(uintptr_t) - i - 1] = who[i]; } - IOTimeStampConstant(code, name, (uintptr_t) regId, param1, param2); + IOTimeStampConstant(code, name, (uintptr_t) regId, (uintptr_t)(OBFUSCATE(param1)), (uintptr_t)(OBFUSCATE(param2))); } diff --git a/iokit/Kernel/IOServicePMPrivate.h b/iokit/Kernel/IOServicePMPrivate.h index ca91e9d46..9fa90b563 100644 --- a/iokit/Kernel/IOServicePMPrivate.h +++ 
b/iokit/Kernel/IOServicePMPrivate.h @@ -458,7 +458,11 @@ the ack timer is ticking every tenth of a second. */ #define ACK_TIMER_PERIOD 100000000 +#if defined(__i386__) || defined(__x86_64__) #define WATCHDOG_TIMER_PERIOD (300) // 300 secs +#else +#define WATCHDOG_TIMER_PERIOD (180) // 180 secs +#endif // Max wait time in microseconds for kernel priority and capability clients // with async message handlers to acknowledge. diff --git a/iokit/Kernel/IOServicePrivate.h b/iokit/Kernel/IOServicePrivate.h index af6ca5636..4ad23fa1a 100644 --- a/iokit/Kernel/IOServicePrivate.h +++ b/iokit/Kernel/IOServicePrivate.h @@ -47,8 +47,8 @@ enum { // masks for __state[1] enum { - kIOServiceBusyStateMask = 0x000000ff, - kIOServiceBusyMax = 255, + kIOServiceBusyStateMask = 0x000003ff, + kIOServiceBusyMax = 1023, kIOServiceNeedConfigState = 0x80000000, kIOServiceSynchronousState = 0x40000000, kIOServiceModuleStallState = 0x20000000, @@ -63,6 +63,7 @@ enum { kIOServiceTerm1WaiterState = 0x00200000, kIOServiceRecursing = 0x00100000, kIOServiceNeedWillTerminate = 0x00080000, + kIOServiceWaitDetachState = 0x00040000, }; // notify state @@ -125,6 +126,20 @@ class _IOServiceInterestNotifier : public IONotifier virtual bool init() APPLE_KEXT_OVERRIDE; }; +class _IOServiceNullNotifier : public IONotifier +{ + OSDeclareDefaultStructors(_IOServiceNullNotifier) + +public: + virtual void taggedRetain(const void *tag) const APPLE_KEXT_OVERRIDE; + virtual void taggedRelease(const void *tag, const int when) const APPLE_KEXT_OVERRIDE; + virtual void free() APPLE_KEXT_OVERRIDE; + virtual void remove() APPLE_KEXT_OVERRIDE; + virtual bool disable() APPLE_KEXT_OVERRIDE; + virtual void enable( bool was ) APPLE_KEXT_OVERRIDE; + virtual void wait(); +}; + class _IOConfigThread : public OSObject { friend class IOService; diff --git a/iokit/Kernel/IOSharedDataQueue.cpp b/iokit/Kernel/IOSharedDataQueue.cpp index 71daaa6b4..abffb156e 100644 --- a/iokit/Kernel/IOSharedDataQueue.cpp +++ 
b/iokit/Kernel/IOSharedDataQueue.cpp @@ -157,25 +157,35 @@ IOMemoryDescriptor *IOSharedDataQueue::getMemoryDescriptor() IODataQueueEntry * IOSharedDataQueue::peek() { - IODataQueueEntry *entry = 0; + IODataQueueEntry *entry = 0; + UInt32 headOffset; + UInt32 tailOffset; - if (dataQueue && (dataQueue->head != dataQueue->tail)) { + if (!dataQueue) { + return NULL; + } + + // Read head and tail with acquire barrier + headOffset = __c11_atomic_load((_Atomic UInt32 *)&dataQueue->head, __ATOMIC_RELAXED); + tailOffset = __c11_atomic_load((_Atomic UInt32 *)&dataQueue->tail, __ATOMIC_ACQUIRE); + + if (headOffset != tailOffset) { IODataQueueEntry * head = 0; UInt32 headSize = 0; UInt32 headOffset = dataQueue->head; UInt32 queueSize = getQueueSize(); - + if (headOffset >= queueSize) { return NULL; } - + head = (IODataQueueEntry *)((char *)dataQueue->queue + headOffset); headSize = head->size; - + // Check if there's enough room before the end of the queue for a header. // If there is room, check if there's enough room to hold the header and // the data. 
- + if ((headOffset > UINT32_MAX - DATA_QUEUE_ENTRY_HEADER_SIZE) || (headOffset + DATA_QUEUE_ENTRY_HEADER_SIZE > queueSize) || (headOffset + DATA_QUEUE_ENTRY_HEADER_SIZE > UINT32_MAX - headSize) || @@ -194,11 +204,16 @@ IODataQueueEntry * IOSharedDataQueue::peek() Boolean IOSharedDataQueue::enqueue(void * data, UInt32 dataSize) { - const UInt32 head = dataQueue->head; // volatile - const UInt32 tail = dataQueue->tail; + UInt32 head; + UInt32 tail; + UInt32 newTail; const UInt32 entrySize = dataSize + DATA_QUEUE_ENTRY_HEADER_SIZE; IODataQueueEntry * entry; + // Force a single read of head and tail + head = __c11_atomic_load((_Atomic UInt32 *)&dataQueue->head, __ATOMIC_RELAXED); + tail = __c11_atomic_load((_Atomic UInt32 *)&dataQueue->tail, __ATOMIC_RELAXED); + // Check for overflow of entrySize if (dataSize > UINT32_MAX - DATA_QUEUE_ENTRY_HEADER_SIZE) { return false; @@ -223,7 +238,7 @@ Boolean IOSharedDataQueue::enqueue(void * data, UInt32 dataSize) // exactly matches the available space at the end of the queue. // The tail can range from 0 to dataQueue->queueSize inclusive. - OSAddAtomic(entrySize, (SInt32 *)&dataQueue->tail); + newTail = tail + entrySize; } else if ( head > entrySize ) // Is there enough room at the beginning? { @@ -242,7 +257,7 @@ Boolean IOSharedDataQueue::enqueue(void * data, UInt32 dataSize) } memcpy(&dataQueue->queue->data, data, dataSize); - OSCompareAndSwap(dataQueue->tail, entrySize, &dataQueue->tail); + newTail = entrySize; } else { @@ -260,18 +275,21 @@ Boolean IOSharedDataQueue::enqueue(void * data, UInt32 dataSize) entry->size = dataSize; memcpy(&entry->data, data, dataSize); - OSAddAtomic(entrySize, (SInt32 *)&dataQueue->tail); + newTail = tail + entrySize; } else { return false; // queue is full } } + + // Update tail with release barrier + __c11_atomic_store((_Atomic UInt32 *)&dataQueue->tail, newTail, __ATOMIC_RELEASE); // Send notification (via mach message) that data is available. 
- if ( ( head == tail ) /* queue was empty prior to enqueue() */ - || ( dataQueue->head == tail ) ) /* queue was emptied during enqueue() */ + if ( ( tail == head ) /* queue was empty prior to enqueue() */ + || ( tail == __c11_atomic_load((_Atomic UInt32 *)&dataQueue->head, __ATOMIC_RELAXED) ) ) /* queue was emptied during enqueue() */ { sendDataAvailableNotification(); } @@ -284,72 +302,77 @@ Boolean IOSharedDataQueue::dequeue(void *data, UInt32 *dataSize) Boolean retVal = TRUE; IODataQueueEntry * entry = 0; UInt32 entrySize = 0; + UInt32 headOffset = 0; + UInt32 tailOffset = 0; UInt32 newHeadOffset = 0; - if (dataQueue) { - if (dataQueue->head != dataQueue->tail) { - IODataQueueEntry * head = 0; - UInt32 headSize = 0; - UInt32 headOffset = dataQueue->head; - UInt32 queueSize = getQueueSize(); - - if (headOffset > queueSize) { + if (!dataQueue) { + return false; + } + + // Read head and tail with acquire barrier + tailOffset = __c11_atomic_load((_Atomic UInt32 *)&dataQueue->tail, __ATOMIC_RELAXED); + headOffset = __c11_atomic_load((_Atomic UInt32 *)&dataQueue->head, __ATOMIC_ACQUIRE); + + if (headOffset != tailOffset) { + IODataQueueEntry * head = 0; + UInt32 headSize = 0; + UInt32 queueSize = getQueueSize(); + + if (headOffset > queueSize) { + return false; + } + + head = (IODataQueueEntry *)((char *)dataQueue->queue + headOffset); + headSize = head->size; + + // we wrapped around to beginning, so read from there + // either there was not even room for the header + if ((headOffset > UINT32_MAX - DATA_QUEUE_ENTRY_HEADER_SIZE) || + (headOffset + DATA_QUEUE_ENTRY_HEADER_SIZE > queueSize) || + // or there was room for the header, but not for the data + (headOffset + DATA_QUEUE_ENTRY_HEADER_SIZE > UINT32_MAX - headSize) || + (headOffset + headSize + DATA_QUEUE_ENTRY_HEADER_SIZE > queueSize)) { + // Note: we have to wrap to the beginning even with the UINT32_MAX checks + // because we have to support a queueSize of UINT32_MAX. 
+ entry = dataQueue->queue; + entrySize = entry->size; + if ((entrySize > UINT32_MAX - DATA_QUEUE_ENTRY_HEADER_SIZE) || + (entrySize + DATA_QUEUE_ENTRY_HEADER_SIZE > queueSize)) { return false; } - - head = (IODataQueueEntry *)((char *)dataQueue->queue + headOffset); - headSize = head->size; - - // we wrapped around to beginning, so read from there - // either there was not even room for the header - if ((headOffset > UINT32_MAX - DATA_QUEUE_ENTRY_HEADER_SIZE) || - (headOffset + DATA_QUEUE_ENTRY_HEADER_SIZE > queueSize) || - // or there was room for the header, but not for the data - (headOffset + DATA_QUEUE_ENTRY_HEADER_SIZE > UINT32_MAX - headSize) || - (headOffset + headSize + DATA_QUEUE_ENTRY_HEADER_SIZE > queueSize)) { - // Note: we have to wrap to the beginning even with the UINT32_MAX checks - // because we have to support a queueSize of UINT32_MAX. - entry = dataQueue->queue; - entrySize = entry->size; - if ((entrySize > UINT32_MAX - DATA_QUEUE_ENTRY_HEADER_SIZE) || - (entrySize + DATA_QUEUE_ENTRY_HEADER_SIZE > queueSize)) { - return false; - } - newHeadOffset = entrySize + DATA_QUEUE_ENTRY_HEADER_SIZE; - // else it is at the end - } else { - entry = head; - entrySize = entry->size; - if ((entrySize > UINT32_MAX - DATA_QUEUE_ENTRY_HEADER_SIZE) || - (entrySize + DATA_QUEUE_ENTRY_HEADER_SIZE > UINT32_MAX - headOffset) || - (entrySize + DATA_QUEUE_ENTRY_HEADER_SIZE + headOffset > queueSize)) { - return false; - } - newHeadOffset = headOffset + entrySize + DATA_QUEUE_ENTRY_HEADER_SIZE; + newHeadOffset = entrySize + DATA_QUEUE_ENTRY_HEADER_SIZE; + // else it is at the end + } else { + entry = head; + entrySize = entry->size; + if ((entrySize > UINT32_MAX - DATA_QUEUE_ENTRY_HEADER_SIZE) || + (entrySize + DATA_QUEUE_ENTRY_HEADER_SIZE > UINT32_MAX - headOffset) || + (entrySize + DATA_QUEUE_ENTRY_HEADER_SIZE + headOffset > queueSize)) { + return false; } + newHeadOffset = headOffset + entrySize + DATA_QUEUE_ENTRY_HEADER_SIZE; } - - if (entry) { - if (data) { - if 
(dataSize) { - if (entrySize <= *dataSize) { - memcpy(data, &(entry->data), entrySize); - OSCompareAndSwap( dataQueue->head, newHeadOffset, (SInt32 *)&dataQueue->head); - } else { - retVal = FALSE; - } + } + + if (entry) { + if (data) { + if (dataSize) { + if (entrySize <= *dataSize) { + memcpy(data, &(entry->data), entrySize); + __c11_atomic_store((_Atomic UInt32 *)&dataQueue->head, newHeadOffset, __ATOMIC_RELEASE); } else { retVal = FALSE; } } else { - OSCompareAndSwap( dataQueue->head, newHeadOffset, (SInt32 *)&dataQueue->head); - } - - if (dataSize) { - *dataSize = entrySize; + retVal = FALSE; } } else { - retVal = FALSE; + __c11_atomic_store((_Atomic UInt32 *)&dataQueue->head, newHeadOffset, __ATOMIC_RELEASE); + } + + if (dataSize) { + *dataSize = entrySize; } } else { retVal = FALSE; diff --git a/iokit/Kernel/IOSimpleReporter.cpp b/iokit/Kernel/IOSimpleReporter.cpp index 81c8232eb..6707bda89 100644 --- a/iokit/Kernel/IOSimpleReporter.cpp +++ b/iokit/Kernel/IOSimpleReporter.cpp @@ -56,7 +56,7 @@ IOSimpleReporter::with(IOService *reportingService, finish: if (!rval) { - if (reporter) delete reporter; + OSSafeReleaseNULL(reporter); } return rval; diff --git a/iokit/Kernel/IOStartIOKit.cpp b/iokit/Kernel/IOStartIOKit.cpp index 8177603cc..f9d9fc018 100644 --- a/iokit/Kernel/IOStartIOKit.cpp +++ b/iokit/Kernel/IOStartIOKit.cpp @@ -43,8 +43,8 @@ #include #include #include - #include +#include #include "IOKitKernelInternal.h" @@ -53,6 +53,10 @@ OSSet * gIORemoveOnReadProperties; extern "C" { +void StartIOKit( void * p1, void * p2, void * p3, void * p4 ); +void IORegistrySetOSBuildVersion(char * build_version); +void IORecordProgressBackbuffer(void * buffer, size_t size, uint32_t theme); + extern void OSlibkernInit (void); void iokit_post_constructor_init(void); @@ -148,6 +152,7 @@ void StartIOKit( void * p1, void * p2, void * p3, void * p4 ) // IOLibInit(); OSlibkernInit(); + devsw_init(); gIOProgressBackbufferKey = 
OSSymbol::withCStringNoCopy(kIOProgressBackbufferKey); gIORemoveOnReadProperties = OSSet::withObjects((const OSObject **) &gIOProgressBackbufferKey, 1); diff --git a/iokit/Kernel/IOStateReporter.cpp b/iokit/Kernel/IOStateReporter.cpp index 0eaeb6f1d..f6e0b7340 100644 --- a/iokit/Kernel/IOStateReporter.cpp +++ b/iokit/Kernel/IOStateReporter.cpp @@ -58,7 +58,7 @@ IOStateReporter::with(IOService *reportingService, finish: if (!rval) { - if (reporter) delete reporter; + OSSafeReleaseNULL(reporter); } return rval; @@ -778,6 +778,7 @@ IOStateReporter::_getStateValue(uint64_t channel_id, break; case kLastTransitionTime: result = values->last_intransition; + break; default: break; } diff --git a/iokit/Kernel/IOStatistics.cpp b/iokit/Kernel/IOStatistics.cpp index 6e72eb495..b67de2511 100644 --- a/iokit/Kernel/IOStatistics.cpp +++ b/iokit/Kernel/IOStatistics.cpp @@ -27,6 +27,7 @@ */ #include +#include #include #include @@ -1221,7 +1222,7 @@ KextNode *IOStatistics::getKextNodeFromBacktrace(boolean_t write) { * overhead. OSBacktrace does many safety checks that * are not needed in this situation. */ - btCount = fastbacktrace((uintptr_t*)bt, btCount); + btCount = backtrace((uintptr_t*)bt, btCount); if (write) { IORWLockWrite(lock); diff --git a/iokit/Kernel/IOStringFuncs.c b/iokit/Kernel/IOStringFuncs.c index 548e980a1..d536b7243 100644 --- a/iokit/Kernel/IOStringFuncs.c +++ b/iokit/Kernel/IOStringFuncs.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2008 Apple Inc. All rights reserved. + * Copyright (c) 1998-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -126,11 +126,11 @@ isdigit(char c) long strtol(const char *nptr, char **endptr, int base) { - register const char *s = nptr; - register unsigned long acc; - register int c; - register unsigned long cutoff; - register int neg = 0, any, cutlim; + const char *s = nptr; + unsigned long acc; + char c; + unsigned long cutoff; + int neg = 0, any, cutlim; /* * Skip white space and pick up leading +/- sign if any. @@ -218,11 +218,11 @@ strtol(const char *nptr, char **endptr, int base) unsigned long strtoul(const char *nptr, char **endptr, int base) { - register const char *s = nptr; - register unsigned long acc; - register int c; - register unsigned long cutoff; - register int neg = 0, any, cutlim; + const char *s = nptr; + unsigned long acc; + char c; + unsigned long cutoff; + int neg = 0, any, cutlim; /* * See strtol for comments as to the logic used. @@ -296,11 +296,11 @@ strtoul(const char *nptr, char **endptr, int base) quad_t strtoq(const char *nptr, char **endptr, int base) { - register const char *s; - register u_quad_t acc; - register int c; - register u_quad_t qbase, cutoff; - register int neg, any, cutlim; + const char *s; + u_quad_t acc; + char c; + u_quad_t qbase, cutoff; + int neg, any, cutlim; /* * Skip white space and pick up leading +/- sign if any. @@ -397,13 +397,13 @@ strtoq(const char *nptr, char **endptr, int base) u_quad_t strtouq(const char *nptr, char **endptr, - register int base) + int base) { - register const char *s = nptr; - register u_quad_t acc; - register int c; - register u_quad_t qbase, cutoff; - register int neg, any, cutlim; + const char *s = nptr; + u_quad_t acc; + char c; + u_quad_t qbase, cutoff; + int neg, any, cutlim; /* * See strtoq for comments as to the logic used. 
diff --git a/iokit/Kernel/IOUserClient.cpp b/iokit/Kernel/IOUserClient.cpp index 5cfa03377..6cd4737c5 100644 --- a/iokit/Kernel/IOUserClient.cpp +++ b/iokit/Kernel/IOUserClient.cpp @@ -45,6 +45,8 @@ #include #include +#include + #if CONFIG_MACF extern "C" { @@ -69,8 +71,9 @@ extern "C" { enum { - kIOUCAsync0Flags = 3ULL, - kIOUCAsync64Flag = 1ULL + kIOUCAsync0Flags = 3ULL, + kIOUCAsync64Flag = 1ULL, + kIOUCAsyncErrorLoggedFlag = 2ULL }; #if IOKITSTATS @@ -99,6 +102,25 @@ do { \ #endif /* IOKITSTATS */ +#if DEVELOPMENT || DEBUG + +#define FAKE_STACK_FRAME(a) \ + const void ** __frameptr; \ + const void * __retaddr; \ + __frameptr = (typeof(__frameptr)) __builtin_frame_address(0); \ + __retaddr = __frameptr[1]; \ + __frameptr[1] = (a); + +#define FAKE_STACK_FRAME_END() \ + __frameptr[1] = __retaddr; + +#else /* DEVELOPMENT || DEBUG */ + +#define FAKE_STACK_FRAME(a) +#define FAKE_STACK_FRAME_END() + +#endif /* DEVELOPMENT || DEBUG */ + /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ // definitions we should get from osfmk @@ -616,6 +638,7 @@ class IOServiceUserNotification : public IOUserNotification OSArray * newSet; OSObject * lastEntry; bool armed; + bool ipcLogged; public: @@ -646,6 +669,7 @@ class IOServiceMessageUserNotification : public IOUserNotification vm_size_t msgSize; uint8_t clientIs64; int owningPID; + bool ipcLogged; public: @@ -830,8 +854,11 @@ bool IOServiceUserNotification::handler( void * ref, if( port) iokit_release_port( port ); - if( KERN_SUCCESS != kr) - IOLog("%s: mach_msg_send_from_kernel_proper {%x}\n", __FILE__, kr ); + if( (KERN_SUCCESS != kr) && !ipcLogged) + { + ipcLogged = true; + IOLog("%s: mach_msg_send_from_kernel_proper(0x%x)\n", __PRETTY_FUNCTION__, kr ); + } } return( true ); @@ -841,12 +868,11 @@ OSObject * IOServiceUserNotification::getNextObject() { unsigned int count; OSObject * result; + OSObject * releaseEntry; - IOTakeLock( lock ); - - if( lastEntry) - lastEntry->release(); + IOLockLock(lock); + 
releaseEntry = lastEntry; count = newSet->getCount(); if( count ) { result = newSet->getObject( count - 1 ); @@ -858,7 +884,9 @@ OSObject * IOServiceUserNotification::getNextObject() } lastEntry = result; - IOUnlock( lock ); + IOLockUnlock(lock); + + if (releaseEntry) releaseEntry->release(); return( result ); } @@ -884,7 +912,7 @@ bool IOServiceMessageUserNotification::init( mach_port_t port, natural_t type, owningPID = proc_selfpid(); extraSize += sizeof(IOServiceInterestContent64); - msgSize = sizeof(PingMsg) - sizeof(OSAsyncReference64) + referenceSize + extraSize; + msgSize = sizeof(PingMsg) - sizeof(OSAsyncReference64) + referenceSize; pingMsg = (PingMsg *) IOMalloc( msgSize); if( !pingMsg) return( false ); @@ -940,13 +968,17 @@ IOReturn IOServiceMessageUserNotification::_handler( void * target, void * ref, IOReturn IOServiceMessageUserNotification::handler( void * ref, UInt32 messageType, IOService * provider, - void * messageArgument, vm_size_t argSize ) + void * messageArgument, vm_size_t callerArgSize ) { + enum { kLocalMsgSize = 0x100 }; + uint64_t stackMsg[kLocalMsgSize / sizeof(uint64_t)]; + void * allocMsg; kern_return_t kr; + vm_size_t argSize; + vm_size_t thisMsgSize; ipc_port_t thisPort, providerPort; - IOServiceInterestContent64 * data = (IOServiceInterestContent64 *) - ((((uint8_t *) pingMsg) + msgSize) - pingMsg->notifyHeader.size); - // == pingMsg->notifyHeader.content; + struct PingMsg * thisMsg; + IOServiceInterestContent64 * data; if (kIOMessageCopyClientID == messageType) { @@ -954,24 +986,16 @@ IOReturn IOServiceMessageUserNotification::handler( void * ref, return (kIOReturnSuccess); } - data->messageType = messageType; - - if( argSize == 0) + if (callerArgSize == 0) { - data->messageArgument[0] = (io_user_reference_t) messageArgument; - if (clientIs64) - argSize = sizeof(data->messageArgument[0]); - else - { - data->messageArgument[0] |= (data->messageArgument[0] << 32); - argSize = sizeof(uint32_t); - } + if (clientIs64) argSize = 
sizeof(data->messageArgument[0]); + else argSize = sizeof(uint32_t); } else { + argSize = callerArgSize; if( argSize > kIOUserNotifyMaxMessageSize) argSize = kIOUserNotifyMaxMessageSize; - bcopy( messageArgument, data->messageArgument, argSize ); } // adjust message size for ipc restrictions @@ -979,20 +1003,55 @@ IOReturn IOServiceMessageUserNotification::handler( void * ref, type = pingMsg->notifyHeader.type; type &= ~(kIOKitNoticationMsgSizeMask << kIOKitNoticationTypeSizeAdjShift); type |= ((argSize & kIOKitNoticationMsgSizeMask) << kIOKitNoticationTypeSizeAdjShift); - pingMsg->notifyHeader.type = type; argSize = (argSize + kIOKitNoticationMsgSizeMask) & ~kIOKitNoticationMsgSizeMask; - pingMsg->msgHdr.msgh_size = msgSize - pingMsg->notifyHeader.size - + sizeof( IOServiceInterestContent64 ) - - sizeof( data->messageArgument) - + argSize; + thisMsgSize = msgSize + + sizeof( IOServiceInterestContent64 ) + - sizeof( data->messageArgument) + + argSize; + + if (thisMsgSize > sizeof(stackMsg)) + { + allocMsg = IOMalloc(thisMsgSize); + if (!allocMsg) return (kIOReturnNoMemory); + thisMsg = (typeof(thisMsg)) allocMsg; + } + else + { + allocMsg = 0; + thisMsg = (typeof(thisMsg)) stackMsg; + } + + bcopy(pingMsg, thisMsg, msgSize); + thisMsg->notifyHeader.type = type; + data = (IOServiceInterestContent64 *) (((uint8_t *) thisMsg) + msgSize); + // == pingMsg->notifyHeader.content; + data->messageType = messageType; + + if (callerArgSize == 0) + { + data->messageArgument[0] = (io_user_reference_t) messageArgument; + if (!clientIs64) + { + data->messageArgument[0] |= (data->messageArgument[0] << 32); + } + } + else + { + bcopy( messageArgument, data->messageArgument, callerArgSize ); + bzero((void *)(((uintptr_t) &data->messageArgument[0]) + callerArgSize), argSize - callerArgSize); + } + + thisMsg->notifyHeader.type = type; + thisMsg->msgHdr.msgh_size = thisMsgSize; providerPort = iokit_port_for_object( provider, IKOT_IOKIT_OBJECT ); - pingMsg->ports[0].name = providerPort; 
+ thisMsg->ports[0].name = providerPort; thisPort = iokit_port_for_object( this, IKOT_IOKIT_OBJECT ); - pingMsg->msgHdr.msgh_local_port = thisPort; - kr = mach_msg_send_from_kernel_with_options( &pingMsg->msgHdr, - pingMsg->msgHdr.msgh_size, + thisMsg->msgHdr.msgh_local_port = thisPort; + + kr = mach_msg_send_from_kernel_with_options( &thisMsg->msgHdr, + thisMsg->msgHdr.msgh_size, (MACH_SEND_MSG | MACH_SEND_ALWAYS | MACH_SEND_IMPORTANCE), 0); if( thisPort) @@ -1000,8 +1059,14 @@ IOReturn IOServiceMessageUserNotification::handler( void * ref, if( providerPort) iokit_release_port( providerPort ); - if( KERN_SUCCESS != kr) - IOLog("%s: mach_msg_send_from_kernel_proper {%x}\n", __FILE__, kr ); + if (allocMsg) + IOFree(allocMsg, thisMsgSize); + + if((KERN_SUCCESS != kr) && !ipcLogged) + { + ipcLogged = true; + IOLog("%s: mach_msg_send_from_kernel_proper (0x%x)\n", __PRETTY_FUNCTION__, kr ); + } return( kIOReturnSuccess ); } @@ -1559,10 +1624,14 @@ IOMemoryMap * IOUserClient::mapClientMemory64( if( memory && (kIOReturnSuccess == err)) { + FAKE_STACK_FRAME(getMetaClass()); + options = (options & ~kIOMapUserOptionsMask) | (mapFlags & kIOMapUserOptionsMask); map = memory->createMappingInTask( task, atAddress, options ); memory->release(); + + FAKE_STACK_FRAME_END(); } return( map ); @@ -1589,6 +1658,17 @@ IOExternalAsyncMethod * IOUserClient::getExternalAsyncMethodForIndex( UInt32 /* return( 0 ); } +IOExternalTrap * IOUserClient:: +getExternalTrapForIndex(UInt32 index) +{ + return NULL; +} + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wdeprecated-declarations" + +// Suppressing the deprecated-declarations warning. 
Avoiding the use of deprecated +// functions can break clients of kexts implementing getExternalMethodForIndex() IOExternalMethod * IOUserClient:: getTargetAndMethodForIndex(IOService **targetP, UInt32 index) { @@ -1611,12 +1691,6 @@ getAsyncTargetAndMethodForIndex(IOService ** targetP, UInt32 index) return method; } -IOExternalTrap * IOUserClient:: -getExternalTrapForIndex(UInt32 index) -{ - return NULL; -} - IOExternalTrap * IOUserClient:: getTargetAndTrapForIndex(IOService ** targetP, UInt32 index) { @@ -1628,6 +1702,7 @@ getTargetAndTrapForIndex(IOService ** targetP, UInt32 index) return trap; } +#pragma clang diagnostic pop IOReturn IOUserClient::releaseAsyncReference64(OSAsyncReference64 reference) { @@ -1713,6 +1788,7 @@ IOReturn IOUserClient::_sendAsyncResult64(OSAsyncReference64 reference, if (numArgs > kMaxAsyncArgs) return kIOReturnMessageTooLarge; + bzero(&replyMsg, sizeof(replyMsg)); replyMsg.msgHdr.msgh_bits = MACH_MSGH_BITS(MACH_MSG_TYPE_COPY_SEND /*remote*/, 0 /*local*/); replyMsg.msgHdr.msgh_remote_port = replyPort; @@ -1761,8 +1837,11 @@ IOReturn IOUserClient::_sendAsyncResult64(OSAsyncReference64 reference, kr = mach_msg_send_from_kernel_proper( &replyMsg.msgHdr, replyMsg.msgHdr.msgh_size); } - if ((KERN_SUCCESS != kr) && (MACH_SEND_TIMED_OUT != kr)) - IOLog("%s: mach_msg_send_from_kernel_proper {%x}\n", __FILE__, kr ); + if ((KERN_SUCCESS != kr) && (MACH_SEND_TIMED_OUT != kr) && !(kIOUCAsyncErrorLoggedFlag & reference[0])) + { + reference[0] |= kIOUCAsyncErrorLoggedFlag; + IOLog("%s: mach_msg_send_from_kernel_proper(0x%x)\n", __PRETTY_FUNCTION__, kr ); + } return kr; } @@ -1999,8 +2078,9 @@ static kern_return_t internal_io_service_match_property_table( OSObject * obj; OSDictionary * dict; - obj = matching_size ? 
OSUnserializeXML(matching, matching_size) - : OSUnserializeXML(matching); + assert(matching_size); + obj = OSUnserializeXML(matching, matching_size); + if( (dict = OSDynamicCast( OSDictionary, obj))) { *matches = service->passiveMatch( dict ); kr = kIOReturnSuccess; @@ -2019,7 +2099,7 @@ kern_return_t is_io_service_match_property_table( io_string_t matching, boolean_t *matches ) { - return (internal_io_service_match_property_table(service, matching, 0, matches)); + return (kIOReturnUnsupported); } @@ -2071,8 +2151,9 @@ static kern_return_t internal_io_service_get_matching_services( if( master_port != master_device_port) return( kIOReturnNotPrivileged); - obj = matching_size ? OSUnserializeXML(matching, matching_size) - : OSUnserializeXML(matching); + assert(matching_size); + obj = OSUnserializeXML(matching, matching_size); + if( (dict = OSDynamicCast( OSDictionary, obj))) { *existing = IOUserIterator::withIterator(IOService::getMatchingServices( dict )); kr = kIOReturnSuccess; @@ -2091,7 +2172,7 @@ kern_return_t is_io_service_get_matching_services( io_string_t matching, io_iterator_t *existing ) { - return (internal_io_service_get_matching_services(master_port, matching, 0, existing)); + return (kIOReturnUnsupported); } /* Routine io_service_get_matching_services_ool */ @@ -2145,8 +2226,9 @@ static kern_return_t internal_io_service_get_matching_service( if( master_port != master_device_port) return( kIOReturnNotPrivileged); - obj = matching_size ? OSUnserializeXML(matching, matching_size) - : OSUnserializeXML(matching); + assert(matching_size); + obj = OSUnserializeXML(matching, matching_size); + if( (dict = OSDynamicCast( OSDictionary, obj))) { *service = IOService::copyMatchingService( dict ); kr = *service ? 
kIOReturnSuccess : kIOReturnNotFound; @@ -2165,7 +2247,7 @@ kern_return_t is_io_service_get_matching_service( io_string_t matching, io_service_t *service ) { - return (internal_io_service_get_matching_service(master_port, matching, 0, service)); + return (kIOReturnUnsupported); } /* Routine io_service_get_matching_services_ool */ @@ -2232,15 +2314,8 @@ static kern_return_t internal_io_service_add_notification( if( !(sym = OSSymbol::withCString( notification_type ))) err = kIOReturnNoResources; - if (matching_size) - { - dict = OSDynamicCast(OSDictionary, OSUnserializeXML(matching, matching_size)); - } - else - { - dict = OSDynamicCast(OSDictionary, OSUnserializeXML(matching)); - } - + assert(matching_size); + dict = OSDynamicCast(OSDictionary, OSUnserializeXML(matching, matching_size)); if (!dict) { err = kIOReturnBadArgument; continue; @@ -2298,9 +2373,7 @@ kern_return_t is_io_service_add_notification( mach_msg_type_number_t referenceCnt, io_object_t * notification ) { - return (internal_io_service_add_notification(master_port, notification_type, - matching, 0, port, &reference[0], sizeof(io_async_ref_t), - false, notification)); + return (kIOReturnUnsupported); } /* Routine io_service_add_notification_64 */ @@ -2313,9 +2386,7 @@ kern_return_t is_io_service_add_notification_64( mach_msg_type_number_t referenceCnt, io_object_t *notification ) { - return (internal_io_service_add_notification(master_port, notification_type, - matching, 0, wake_port, &reference[0], sizeof(io_async_ref64_t), - true, notification)); + return (kIOReturnUnsupported); } /* Routine io_service_add_notification_bin */ @@ -2947,7 +3018,7 @@ kern_return_t is_io_registry_entry_get_property_recursively( #endif obj = entry->copyProperty( property_name, - IORegistryEntry::getPlane( plane ), options); + IORegistryEntry::getPlane( plane ), options ); if( !obj) return( kIOReturnNotFound ); @@ -2971,86 +3042,13 @@ kern_return_t is_io_registry_entry_get_property_recursively( return( err ); } -#if 
CONFIG_MACF - -static kern_return_t -filteredProperties(IORegistryEntry *entry, OSDictionary *properties, OSDictionary **filteredp) -{ - kern_return_t err = 0; - OSDictionary *filtered = NULL; - OSCollectionIterator *iter = NULL; - OSSymbol *key; - OSObject *p; - kauth_cred_t cred = kauth_cred_get(); - - if (properties == NULL) - return kIOReturnUnsupported; - - if ((iter = OSCollectionIterator::withCollection(properties)) == NULL || - (filtered = OSDictionary::withCapacity(properties->getCapacity())) == NULL) { - err = kIOReturnNoMemory; - goto out; - } - - while ((p = iter->getNextObject()) != NULL) { - if ((key = OSDynamicCast(OSSymbol, p)) == NULL || - mac_iokit_check_get_property(cred, entry, key->getCStringNoCopy()) != 0) - continue; - filtered->setObject(key, properties->getObject(key)); - } - -out: - if (iter != NULL) - iter->release(); - *filteredp = filtered; - return err; -} - -#endif - /* Routine io_registry_entry_get_properties */ kern_return_t is_io_registry_entry_get_properties( io_object_t registry_entry, io_buf_ptr_t *properties, mach_msg_type_number_t *propertiesCnt ) { - kern_return_t err = 0; - vm_size_t len; - - CHECK( IORegistryEntry, registry_entry, entry ); - - OSSerialize * s = OSSerialize::withCapacity(4096); - if( !s) - return( kIOReturnNoMemory ); - - if (!entry->serializeProperties(s)) - err = kIOReturnUnsupported; - -#if CONFIG_MACF - if (!err && mac_iokit_check_filter_properties(kauth_cred_get(), entry)) { - OSObject *propobj = OSUnserializeXML(s->text(), s->getLength()); - OSDictionary *filteredprops = NULL; - err = filteredProperties(entry, OSDynamicCast(OSDictionary, propobj), &filteredprops); - if (propobj) propobj->release(); - - if (!err) { - s->clearText(); - if (!filteredprops->serialize(s)) - err = kIOReturnUnsupported; - } - if (filteredprops != NULL) - filteredprops->release(); - } -#endif /* CONFIG_MACF */ - - if (!err) { - len = s->getLength(); - *propertiesCnt = len; - err = copyoutkdata( s->text(), len, properties ); - 
} - - s->release(); - return( err ); + return (kIOReturnUnsupported); } #if CONFIG_MACF @@ -3073,13 +3071,13 @@ GetPropertiesEditor(void * reference, if (!ref->root) ref->root = container; if (ref->root == container) - { - if (0 != mac_iokit_check_get_property(ref->cred, ref->entry, name->getCStringNoCopy())) - { - value = 0; - } - } - if (value) value->retain(); + { + if (0 != mac_iokit_check_get_property(ref->cred, ref->entry, name->getCStringNoCopy())) + { + value = 0; + } + } + if (value) value->retain(); return (value); } @@ -3118,9 +3116,9 @@ kern_return_t is_io_registry_entry_get_properties_bin( if (kIOReturnSuccess == err) { - len = s->getLength(); - *propertiesCnt = len; - err = copyoutkdata(s->text(), len, properties); + len = s->getLength(); + *propertiesCnt = len; + err = copyoutkdata(s->text(), len, properties); } s->release(); @@ -3148,25 +3146,29 @@ kern_return_t is_io_registry_entry_get_property_bin( return kIOReturnNotPermitted; #endif - if ((kIORegistryIterateRecursively & options) && plane[0]) + sym = OSSymbol::withCString(property_name); + if (!sym) return (kIOReturnNoMemory); + + if (gIORegistryEntryPropertyKeysKey == sym) { - obj = entry->copyProperty(property_name, - IORegistryEntry::getPlane(plane), options); + obj = entry->copyPropertyKeys(); } else { - obj = entry->copyProperty(property_name); + if ((kIORegistryIterateRecursively & options) && plane[0]) + { + obj = entry->copyProperty(property_name, + IORegistryEntry::getPlane(plane), options ); + } + else + { + obj = entry->copyProperty(property_name); + } + if (obj && gIORemoveOnReadProperties->containsObject(sym)) entry->removeProperty(sym); } - if( !obj) - return( kIOReturnNotFound ); - - sym = OSSymbol::withCString(property_name); - if (sym) - { - if (gIORemoveOnReadProperties->containsObject(sym)) entry->removeProperty(sym); - sym->release(); - } + sym->release(); + if (!obj) return (kIOReturnNotFound); OSSerialize * s = OSSerialize::binaryWithCapacity(4096); if( !s) { @@ -3212,6 
+3214,8 @@ kern_return_t is_io_registry_entry_set_properties if( KERN_SUCCESS == err) { + FAKE_STACK_FRAME(entry->getMetaClass()); + // must return success after vm_map_copyout() succeeds obj = OSUnserializeXML( (const char *) data, propertiesCnt ); vm_deallocate( kernel_map, data, propertiesCnt ); @@ -3232,6 +3236,9 @@ kern_return_t is_io_registry_entry_set_properties if (obj) obj->release(); + + FAKE_STACK_FRAME_END(); + } else res = err; @@ -3248,7 +3255,7 @@ kern_return_t is_io_registry_entry_get_child_iterator( CHECK( IORegistryEntry, registry_entry, entry ); *iterator = entry->getChildIterator( - IORegistryEntry::getPlane( plane )); + IORegistryEntry::getPlane( plane )); return( kIOReturnSuccess ); } @@ -3768,6 +3775,7 @@ kern_return_t is_io_connect_method_var_output OSObject * structureVariableOutputData = 0; bzero(&args.__reserved[0], sizeof(args.__reserved)); + args.__reservedA = 0; args.version = kIOExternalMethodArgumentsCurrentVersion; args.selector = selector; @@ -3861,6 +3869,7 @@ kern_return_t is_io_connect_method IOMemoryDescriptor * outputMD = 0; bzero(&args.__reserved[0], sizeof(args.__reserved)); + args.__reservedA = 0; args.version = kIOExternalMethodArgumentsCurrentVersion; args.selector = selector; @@ -3941,6 +3950,7 @@ kern_return_t is_io_connect_async_method IOMemoryDescriptor * outputMD = 0; bzero(&args.__reserved[0], sizeof(args.__reserved)); + args.__reservedA = 0; args.version = kIOExternalMethodArgumentsCurrentVersion; reference[0] = (io_user_reference_t) wake_port; @@ -4046,12 +4056,14 @@ kern_return_t shim_io_connect_method_scalarI_scalarO( if( inputCount != method->count0) { - IOLog("%s: IOUserClient inputCount count mismatch\n", object->getName()); + IOLog("%s:%d %s: IOUserClient inputCount count mismatch 0x%llx 0x%llx\n", __FUNCTION__, __LINE__, object->getName(), (uint64_t)inputCount, (uint64_t)method->count0); + DTRACE_IO2(iokit_count_mismatch, uint64_t, (uint64_t)inputCount, uint64_t, (uint64_t)method->count0); continue; } if( 
*outputCount != method->count1) { - IOLog("%s: IOUserClient outputCount count mismatch\n", object->getName()); + IOLog("%s:%d %s: IOUserClient outputCount count mismatch 0x%llx 0x%llx\n", __FUNCTION__, __LINE__, object->getName(), (uint64_t)*outputCount, (uint64_t)method->count1); + DTRACE_IO2(iokit_count_mismatch, uint64_t, (uint64_t)*outputCount, uint64_t, (uint64_t)method->count1); continue; } @@ -4280,12 +4292,14 @@ kern_return_t shim_io_async_method_scalarI_scalarO( if( inputCount != method->count0) { - IOLog("%s: IOUserClient inputCount count mismatch\n", object->getName()); + IOLog("%s:%d %s: IOUserClient inputCount count mismatch 0x%llx 0x%llx\n", __FUNCTION__, __LINE__, object->getName(), (uint64_t)inputCount, (uint64_t)method->count0); + DTRACE_IO2(iokit_count_mismatch, uint64_t, (uint64_t)inputCount, uint64_t, (uint64_t)method->count0); continue; } if( *outputCount != method->count1) { - IOLog("%s: IOUserClient outputCount count mismatch\n", object->getName()); + IOLog("%s:%d %s: IOUserClient outputCount count mismatch 0x%llx 0x%llx\n", __FUNCTION__, __LINE__, object->getName(), (uint64_t)*outputCount, (uint64_t)method->count1); + DTRACE_IO2(iokit_count_mismatch, uint64_t, (uint64_t)*outputCount, uint64_t, (uint64_t)method->count1); continue; } @@ -4390,13 +4404,15 @@ kern_return_t shim_io_connect_method_scalarI_structureO( do { if( inputCount != method->count0) { - IOLog("%s: IOUserClient inputCount count mismatch\n", object->getName()); + IOLog("%s:%d %s: IOUserClient inputCount count mismatch 0x%llx 0x%llx\n", __FUNCTION__, __LINE__, object->getName(), (uint64_t)inputCount, (uint64_t)method->count0); + DTRACE_IO2(iokit_count_mismatch, uint64_t, (uint64_t)inputCount, uint64_t, (uint64_t)method->count0); continue; } if( (kIOUCVariableStructureSize != method->count1) && (*outputCount != method->count1)) { - IOLog("%s: IOUserClient outputCount count mismatch\n", object->getName()); + IOLog("%s:%d %s: IOUserClient outputCount count mismatch 0x%llx 0x%llx 
0x%llx\n", __FUNCTION__, __LINE__, object->getName(), (uint64_t)*outputCount, (uint64_t)method->count1, (uint64_t)kIOUCVariableStructureSize); + DTRACE_IO2(iokit_count_mismatch, uint64_t, (uint64_t)*outputCount, uint64_t, (uint64_t)method->count1); continue; } @@ -4463,13 +4479,15 @@ kern_return_t shim_io_async_method_scalarI_structureO( do { if( inputCount != method->count0) { - IOLog("%s: IOUserClient inputCount count mismatch\n", object->getName()); + IOLog("%s:%d %s: IOUserClient inputCount count mismatch 0x%llx 0x%llx\n", __FUNCTION__, __LINE__, object->getName(), (uint64_t)inputCount, (uint64_t)method->count0); + DTRACE_IO2(iokit_count_mismatch, uint64_t, (uint64_t)inputCount, uint64_t, (uint64_t)method->count0); continue; } if( (kIOUCVariableStructureSize != method->count1) && (*outputCount != method->count1)) { - IOLog("%s: IOUserClient outputCount count mismatch\n", object->getName()); + IOLog("%s:%d %s: IOUserClient outputCount count mismatch 0x%llx 0x%llx 0x%llx\n", __FUNCTION__, __LINE__, object->getName(), (uint64_t)*outputCount, (uint64_t)method->count1, (uint64_t)kIOUCVariableStructureSize); + DTRACE_IO2(iokit_count_mismatch, uint64_t, (uint64_t)*outputCount, uint64_t, (uint64_t)method->count1); continue; } @@ -4561,13 +4579,15 @@ kern_return_t shim_io_connect_method_scalarI_structureI( { if (inputCount != method->count0) { - IOLog("%s: IOUserClient inputCount count mismatch\n", object->getName()); + IOLog("%s:%d %s: IOUserClient inputCount count mismatch 0x%llx 0x%llx\n", __FUNCTION__, __LINE__, object->getName(), (uint64_t)inputCount, (uint64_t)method->count0); + DTRACE_IO2(iokit_count_mismatch, uint64_t, (uint64_t)inputCount, uint64_t, (uint64_t)method->count0); continue; } if( (kIOUCVariableStructureSize != method->count1) && (inputStructCount != method->count1)) { - IOLog("%s: IOUserClient outputCount count mismatch\n", object->getName()); + IOLog("%s:%d %s: IOUserClient outputCount count mismatch 0x%llx 0x%llx 0x%llx\n", __FUNCTION__, __LINE__, 
object->getName(), (uint64_t)inputStructCount, (uint64_t)method->count1, (uint64_t)kIOUCVariableStructureSize); + DTRACE_IO2(iokit_count_mismatch, uint64_t, (uint64_t)inputStructCount, uint64_t, (uint64_t)method->count1); continue; } @@ -4637,13 +4657,15 @@ kern_return_t shim_io_async_method_scalarI_structureI( { if (inputCount != method->count0) { - IOLog("%s: IOUserClient inputCount count mismatch\n", object->getName()); + IOLog("%s:%d %s: IOUserClient inputCount count mismatch 0x%llx 0x%llx\n", __FUNCTION__, __LINE__, object->getName(), (uint64_t)inputCount, (uint64_t)method->count0); + DTRACE_IO2(iokit_count_mismatch, uint64_t, (uint64_t)inputCount, uint64_t, (uint64_t)method->count0); continue; } if( (kIOUCVariableStructureSize != method->count1) && (inputStructCount != method->count1)) { - IOLog("%s: IOUserClient outputCount count mismatch\n", object->getName()); + IOLog("%s:%d %s: IOUserClient outputCount count mismatch 0x%llx 0x%llx 0x%llx\n", __FUNCTION__, __LINE__, object->getName(), (uint64_t)inputStructCount, (uint64_t)method->count1, (uint64_t)kIOUCVariableStructureSize); + DTRACE_IO2(iokit_count_mismatch, uint64_t, (uint64_t)inputStructCount, uint64_t, (uint64_t)method->count1); continue; } @@ -4733,13 +4755,15 @@ kern_return_t shim_io_connect_method_structureI_structureO( if( (kIOUCVariableStructureSize != method->count0) && (inputCount != method->count0)) { - IOLog("%s: IOUserClient inputCount count mismatch\n", object->getName()); + IOLog("%s:%d %s: IOUserClient inputCount count mismatch 0x%llx 0x%llx 0x%llx\n", __FUNCTION__, __LINE__, object->getName(), (uint64_t)inputCount, (uint64_t)method->count0, (uint64_t)kIOUCVariableStructureSize); + DTRACE_IO2(iokit_count_mismatch, uint64_t, (uint64_t)inputCount, uint64_t, (uint64_t)method->count0); continue; } if( (kIOUCVariableStructureSize != method->count1) && (*outputCount != method->count1)) { - IOLog("%s: IOUserClient outputCount count mismatch\n", object->getName()); + IOLog("%s:%d %s: IOUserClient 
outputCount count mismatch 0x%llx 0x%llx 0x%llx\n", __FUNCTION__, __LINE__, object->getName(), (uint64_t)*outputCount, (uint64_t)method->count1, (uint64_t)kIOUCVariableStructureSize); + DTRACE_IO2(iokit_count_mismatch, uint64_t, (uint64_t)*outputCount, uint64_t, (uint64_t)method->count1); continue; } @@ -4787,13 +4811,15 @@ kern_return_t shim_io_async_method_structureI_structureO( if( (kIOUCVariableStructureSize != method->count0) && (inputCount != method->count0)) { - IOLog("%s: IOUserClient inputCount count mismatch\n", object->getName()); + IOLog("%s:%d %s: IOUserClient inputCount count mismatch 0x%llx 0x%llx 0x%llx\n", __FUNCTION__, __LINE__, object->getName(), (uint64_t)inputCount, (uint64_t)method->count0, (uint64_t)kIOUCVariableStructureSize); + DTRACE_IO2(iokit_count_mismatch, uint64_t, (uint64_t)inputCount, uint64_t, (uint64_t)method->count0); continue; } if( (kIOUCVariableStructureSize != method->count1) && (*outputCount != method->count1)) { - IOLog("%s: IOUserClient outputCount count mismatch\n", object->getName()); + IOLog("%s:%d %s: IOUserClient outputCount count mismatch 0x%llx 0x%llx 0x%llx\n", __FUNCTION__, __LINE__, object->getName(), (uint64_t)*outputCount, (uint64_t)method->count1, (uint64_t)kIOUCVariableStructureSize); + DTRACE_IO2(iokit_count_mismatch, uint64_t, (uint64_t)*outputCount, uint64_t, (uint64_t)method->count1); continue; } @@ -4818,6 +4844,10 @@ kern_return_t shim_io_async_method_structureI_structureO( return( err); } +#if !NO_KEXTD +bool gIOKextdClearedBusy = false; +#endif + /* Routine io_catalog_send_data */ kern_return_t is_io_catalog_send_data( mach_port_t master_port, @@ -4953,14 +4983,12 @@ kern_return_t is_io_catalog_send_data( case kIOCatalogKextdFinishedLaunching: { #if !NO_KEXTD - static bool clearedBusy = false; - - if (!clearedBusy) { + if (!gIOKextdClearedBusy) { IOService * serviceRoot = IOService::getServiceRoot(); if (serviceRoot) { IOServiceTrace(IOSERVICE_KEXTD_READY, 0, 0, 0, 0); serviceRoot->adjustBusy(-1); - 
clearedBusy = true; + gIOKextdClearedBusy = true; } } #endif diff --git a/iokit/Kernel/IOWorkLoop.cpp b/iokit/Kernel/IOWorkLoop.cpp index 6207b1ea1..157bdd976 100644 --- a/iokit/Kernel/IOWorkLoop.cpp +++ b/iokit/Kernel/IOWorkLoop.cpp @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -84,11 +85,12 @@ do { \ #define IOStatisticsOpenGate() \ do { \ IOStatistics::countWorkLoopOpenGate(reserved->counter); \ + if (reserved->lockInterval) lockTime(); \ } while(0) - #define IOStatisticsCloseGate() \ do { \ - IOStatistics::countWorkLoopCloseGate(reserved->counter); \ + IOStatistics::countWorkLoopCloseGate(reserved->counter); \ + if (reserved->lockInterval) reserved->lockTime = mach_absolute_time(); \ } while(0) #define IOStatisticsAttachEventSource() \ @@ -283,6 +285,13 @@ void IOWorkLoop::free() IOReturn IOWorkLoop::addEventSource(IOEventSource *newEvent) { + if ((workThread) + && !thread_has_thread_name(workThread) + && (newEvent->owner) + && !OSDynamicCast(IOCommandPool, newEvent->owner)) { + thread_set_thread_name(workThread, newEvent->owner->getMetaClass()->getClassName()); + } + return controlG->runCommand((void *) mAddEvent, (void *) newEvent); } @@ -641,3 +650,25 @@ IOWorkLoop::eventSourcePerformsWork(IOEventSource *inEventSource) return result; } + +void +IOWorkLoop::lockTime(void) +{ + uint64_t time; + time = mach_absolute_time() - reserved->lockTime; + if (time > reserved->lockInterval) + { + absolutetime_to_nanoseconds(time, &time); + if (kTimeLockPanics & reserved->options) panic("IOWorkLoop %p lock time %qd us", this, time / 1000ULL); + else OSReportWithBacktrace("IOWorkLoop %p lock time %qd us", this, time / 1000ULL); + } +} + +void +IOWorkLoop::setMaximumLockTime(uint64_t interval, uint32_t options) +{ + IORecursiveLockLock(gateLock); + reserved->lockInterval = interval; + reserved->options = (reserved->options & ~kTimeLockPanics) | (options & kTimeLockPanics); + IORecursiveLockUnlock(gateLock); +} diff --git 
a/iokit/Kernel/RootDomainUserClient.cpp b/iokit/Kernel/RootDomainUserClient.cpp index ae122f4c1..a7836f4c3 100644 --- a/iokit/Kernel/RootDomainUserClient.cpp +++ b/iokit/Kernel/RootDomainUserClient.cpp @@ -299,10 +299,13 @@ IOReturn RootDomainUserClient::externalMethod( (uint32_t *)&arguments->scalarOutput[0]); break; case kPMSetMaintenanceWakeCalendar: - ret = this->secureSetMaintenanceWakeCalendar( - (IOPMCalendarStruct *)arguments->structureInput, - (uint32_t *)&arguments->structureOutput); - arguments->structureOutputSize = sizeof(uint32_t); + if ((arguments->structureInputSize >= sizeof(IOPMCalendarStruct)) && + (arguments->structureOutputSize >= sizeof(uint32_t) )) { + ret = this->secureSetMaintenanceWakeCalendar( + (IOPMCalendarStruct *)arguments->structureInput, + (uint32_t *)&arguments->structureOutput); + arguments->structureOutputSize = sizeof(uint32_t); + } break; case kPMSetUserAssertionLevels: diff --git a/iokit/Makefile b/iokit/Makefile index 2c53d494f..c9c3d03b6 100644 --- a/iokit/Makefile +++ b/iokit/Makefile @@ -3,7 +3,6 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - include $(MakeInc_cmd) include $(MakeInc_def) @@ -15,5 +14,3 @@ COMP_SUBDIRS = conf include $(MakeInc_rule) include $(MakeInc_dir) - - diff --git a/iokit/Tests/TestIOMemoryDescriptor.cpp b/iokit/Tests/TestIOMemoryDescriptor.cpp index 926681a7e..4cad3c34c 100644 --- a/iokit/Tests/TestIOMemoryDescriptor.cpp +++ b/iokit/Tests/TestIOMemoryDescriptor.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014 Apple Inc. All rights reserved. + * Copyright (c) 2014-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -35,6 +35,7 @@ #include #include #include +#include "Tests.h" #ifndef __LP64__ #include @@ -107,13 +108,72 @@ static int IOMultMemoryDescriptorTest(int newValue) return (0); } +// +static IOReturn +ZeroLengthTest(int newValue) +{ + IOMemoryDescriptor * md; + + md = IOMemoryDescriptor::withAddressRange( + 0, 0, kIODirectionNone, current_task()); + assert(md); + md->prepare(); + md->complete(); + md->release(); + return (0); +} + +// +static IOReturn +IODirectionPrepareNoZeroFillTest(int newValue) +{ + IOBufferMemoryDescriptor * bmd; + + bmd = IOBufferMemoryDescriptor::inTaskWithOptions(NULL, + kIODirectionIn | kIOMemoryPageable, ptoa(24)); + assert(bmd); + bmd->prepare((IODirection)(kIODirectionIn | kIODirectionPrepareNoZeroFill)); + bmd->prepare(kIODirectionIn); + bmd->complete((IODirection)(kIODirectionIn | kIODirectionCompleteWithDataValid)); + bmd->complete(kIODirectionIn); + bmd->release(); + return (0); +} int IOMemoryDescriptorTest(int newValue) { int result; #if 0 - if (5 == newValue) + if (6 == newValue) + { + IOMemoryDescriptor * sbmds[3]; + IOMultiMemoryDescriptor * smmd; + IOMemoryDescriptor * mds[2]; + IOMultiMemoryDescriptor * mmd; + IOMemoryMap * map; + + sbmds[0] = IOBufferMemoryDescriptor::inTaskWithOptions(kernel_task, kIODirectionOutIn | kIOMemoryKernelUserShared, ptoa(1)); + sbmds[1] = IOBufferMemoryDescriptor::inTaskWithOptions(kernel_task, kIODirectionOutIn | kIOMemoryKernelUserShared, ptoa(2)); + sbmds[2] = IOBufferMemoryDescriptor::inTaskWithOptions(kernel_task, kIODirectionOutIn | kIOMemoryKernelUserShared, ptoa(3)); + smmd = IOMultiMemoryDescriptor::withDescriptors(&sbmds[0], sizeof(sbmds)/sizeof(sbmds[0]), kIODirectionOutIn, false); + + mds[0] = IOBufferMemoryDescriptor::inTaskWithOptions(kernel_task, kIODirectionOutIn | kIOMemoryKernelUserShared, ptoa(1)); + mds[1] = smmd; + mmd = IOMultiMemoryDescriptor::withDescriptors(&mds[0], sizeof(mds)/sizeof(mds[0]), kIODirectionOutIn, false); + map 
= mmd->createMappingInTask(kernel_task, 0, kIOMapAnywhere); + assert(map); + map->release(); + mmd->release(); + mds[0]->release(); + mds[1]->release(); + sbmds[0]->release(); + sbmds[1]->release(); + sbmds[2]->release(); + + return (0); + } + else if (5 == newValue) { IOReturn ret; IOMemoryDescriptor * md; @@ -311,6 +371,12 @@ int IOMemoryDescriptorTest(int newValue) result = IOMultMemoryDescriptorTest(newValue); if (result) return (result); + result = ZeroLengthTest(newValue); + if (result) return (result); + + result = IODirectionPrepareNoZeroFillTest(newValue); + if (result) return (result); + IOGeneralMemoryDescriptor * md; vm_offset_t data[2]; vm_size_t bsize = 16*1024*1024; diff --git a/iokit/bsddev/DINetBootHook.cpp b/iokit/bsddev/DINetBootHook.cpp index 8e83da5c1..62865b0a1 100644 --- a/iokit/bsddev/DINetBootHook.cpp +++ b/iokit/bsddev/DINetBootHook.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2002-2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2002-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -89,6 +89,7 @@ #include #include #include +#include "DINetBootHook.h" #define kIOHDIXControllerClassName "IOHDIXController" #define kDIRootImageKey "di-root-image" diff --git a/iokit/bsddev/DINetBootHook.h b/iokit/bsddev/DINetBootHook.h index 4742cc88f..2f44361b0 100644 --- a/iokit/bsddev/DINetBootHook.h +++ b/iokit/bsddev/DINetBootHook.h @@ -1,10 +1,35 @@ +/* + * Copyright (c) 2002-2016 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + /* * DINetBootHook.h * DiskImages * - * Created by Byron Han on Sat Apr 13 2002. - * Copyright (c) 2002 Apple Computer, Inc. All rights reserved. 
- * * Revision History * * $Log: DINetBootHook.h,v $ @@ -70,6 +95,7 @@ extern "C" { Comments: */ int di_root_image(const char *path, char devname[], dev_t *dev_p); +void di_root_ramfile( IORegistryEntry * entry ); #ifdef __cplusplus }; diff --git a/iokit/bsddev/IOKitBSDInit.cpp b/iokit/bsddev/IOKitBSDInit.cpp index 6e67a3d89..a35aa5495 100644 --- a/iokit/bsddev/IOKitBSDInit.cpp +++ b/iokit/bsddev/IOKitBSDInit.cpp @@ -54,6 +54,7 @@ int panic_on_exception_triage = 0; extern dev_t mdevadd(int devid, uint64_t base, unsigned int size, int phys); extern dev_t mdevlookup(int devid); extern void mdevremoveall(void); +extern int mdevgetrange(int devid, uint64_t *base, uint64_t *size); extern void di_root_ramfile(IORegistryEntry * entry); @@ -74,6 +75,13 @@ NewKernelCoreMedia(void * target, void * refCon, IONotifier * notifier); #endif /* IOPOLLED_COREFILE */ +#if CONFIG_KDP_INTERACTIVE_DEBUGGING +/* + * Touched by IOFindBSDRoot() if a RAMDisk is used for the root device. + */ +extern uint64_t kdp_core_ramdisk_addr; +extern uint64_t kdp_core_ramdisk_size; +#endif kern_return_t IOKitBSDInit( void ) @@ -356,10 +364,25 @@ kern_return_t IOFindBSDRoot( char * rootName, unsigned int rootNameSize, static int mountAttempts = 0; int xchar, dchar; - + + // stall here for anyone matching on the IOBSD resource to finish (filesystems) + matching = IOService::serviceMatching(gIOResourcesKey); + assert(matching); + matching->setObject(gIOResourceMatchedKey, gIOBSDKey); + + if ((service = IOService::waitForMatchingService(matching, 30ULL * kSecondScale))) { + service->release(); + } else { + IOLog("!BSD\n"); + } + matching->release(); + matching = NULL; if( mountAttempts++) + { + IOLog("mount(%d) failed\n", mountAttempts); IOSleep( 5 * 1000 ); + } str = (char *) IOMalloc( kMaxPathBuf + kMaxBootVar ); if( !str) @@ -439,13 +462,22 @@ kern_return_t IOFindBSDRoot( char * rootName, unsigned int rootNameSize, if(xchar >= 0) { /* Do we have a valid memory device name? 
*/ *root = mdevlookup(xchar); /* Find the device number */ if(*root >= 0) { /* Did we find one? */ - rootName[0] = 'm'; /* Build root name */ rootName[1] = 'd'; /* Build root name */ rootName[2] = dchar; /* Build root name */ rootName[3] = 0; /* Build root name */ IOLog("BSD root: %s, major %d, minor %d\n", rootName, major(*root), minor(*root)); *oflags = 0; /* Show that this is not network */ + +#if CONFIG_KDP_INTERACTIVE_DEBUGGING + /* retrieve final ramdisk range and initialize KDP variables */ + if (mdevgetrange(xchar, &kdp_core_ramdisk_addr, &kdp_core_ramdisk_size) != 0) { + IOLog("Unable to retrieve range for root memory device %d\n", xchar); + kdp_core_ramdisk_addr = 0; + kdp_core_ramdisk_size = 0; + } +#endif + goto iofrootx; /* Join common exit... */ } panic("IOFindBSDRoot: specified root memory device, %s, has not been configured\n", rdBootVar); /* Not there */ @@ -674,117 +706,6 @@ kern_return_t IOBSDGetPlatformUUID( uuid_t uuid, mach_timespec_t timeout ) return KERN_SUCCESS; } -kern_return_t IOBSDGetPlatformSerialNumber( char *serial_number_str, u_int32_t len ) -{ - OSDictionary * platform_dict; - IOService *platform; - OSString * string; - - if (len < 1) { - return 0; - } - serial_number_str[0] = '\0'; - - platform_dict = IOService::serviceMatching( "IOPlatformExpertDevice" ); - if (platform_dict == NULL) { - return KERN_NOT_SUPPORTED; - } - - platform = IOService::waitForService( platform_dict ); - if (platform) { - string = ( OSString * ) platform->getProperty( kIOPlatformSerialNumberKey ); - if ( string == 0 ) { - return KERN_NOT_SUPPORTED; - } else { - strlcpy( serial_number_str, string->getCStringNoCopy( ), len ); - } - } - - return KERN_SUCCESS; -} - -void IOBSDIterateMediaWithContent(const char *content_uuid_cstring, int (*func)(const char *bsd_dev_name, const char *uuid_str, void *arg), void *arg) -{ - OSDictionary *dictionary; - OSString *content_uuid_string; - - dictionary = IOService::serviceMatching( "IOMedia" ); - if( dictionary ) { - 
content_uuid_string = OSString::withCString( content_uuid_cstring ); - if( content_uuid_string ) { - IOService *service; - OSIterator *iter; - - dictionary->setObject( "Content", content_uuid_string ); - dictionary->retain(); - - iter = IOService::getMatchingServices(dictionary); - while (iter && (service = (IOService *)iter->getNextObject())) { - if( service ) { - OSString *iostr = (OSString *) service->getProperty( kIOBSDNameKey ); - OSString *uuidstr = (OSString *) service->getProperty( "UUID" ); - const char *uuid; - - if( iostr) { - if (uuidstr) { - uuid = uuidstr->getCStringNoCopy(); - } else { - uuid = "00000000-0000-0000-0000-000000000000"; - } - - // call the callback - if (func && func(iostr->getCStringNoCopy(), uuid, arg) == 0) { - break; - } - } - } - } - if (iter) - iter->release(); - - content_uuid_string->release(); - } - dictionary->release(); - } -} - - -int IOBSDIsMediaEjectable( const char *cdev_name ) -{ - int ret = 0; - OSDictionary *dictionary; - OSString *dev_name; - - if (strncmp(cdev_name, "/dev/", 5) == 0) { - cdev_name += 5; - } - - dictionary = IOService::serviceMatching( "IOMedia" ); - if( dictionary ) { - dev_name = OSString::withCString( cdev_name ); - if( dev_name ) { - IOService *service; - mach_timespec_t tv = { 5, 0 }; // wait up to "timeout" seconds for the device - - dictionary->setObject( kIOBSDNameKey, dev_name ); - dictionary->retain(); - service = IOService::waitForService( dictionary, &tv ); - if( service ) { - OSBoolean *ejectable = (OSBoolean *) service->getProperty( "Ejectable" ); - - if( ejectable ) { - ret = (int)ejectable->getValue(); - } - - } - dev_name->release(); - } - dictionary->release(); - } - - return ret; -} - } /* extern "C" */ /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ @@ -881,8 +802,10 @@ NewKernelCoreMedia(void * target, void * refCon, IOService * newService, IONotifier * notifier) { + static volatile UInt32 onlyOneCorePartition = 0; do { + if 
(!OSCompareAndSwap(0, 1, &onlyOneCorePartition)) break; if (gIOPolledCoreFileVars) break; if (!gIOOpenPolledCoreFileTC) break; newService = newService->getProvider(); diff --git a/iokit/conf/Makefile b/iokit/conf/Makefile index 76db9a7d8..7bd79d9ae 100644 --- a/iokit/conf/Makefile +++ b/iokit/conf/Makefile @@ -37,7 +37,7 @@ do_all: $(TARGET)/$(CURRENT_KERNEL_CONFIG)/Makefile OBJPATH=${OBJPATH} \ build_all; -do_build_all:: do_all +do_build_all:: do_all include $(MakeInc_rule) include $(MakeInc_dir) diff --git a/iokit/conf/Makefile.template b/iokit/conf/Makefile.template index 4fe56b115..4777a86b0 100644 --- a/iokit/conf/Makefile.template +++ b/iokit/conf/Makefile.template @@ -41,10 +41,15 @@ CFLAGS_RELEASE += -DIOASSERT=0 CFLAGS_DEVELOPMENT += -DIOASSERT=1 CFLAGS_DEBUG += -DIOASSERT=1 +IOUserClient.cpo_CXXWARNFLAGS_ADD += -Wno-missing-prototypes +IOKitDebug.cpo_CXXWARNFLAGS_ADD += -Wno-missing-prototypes +IOKitBSDInit.cpo_CXXWARNFLAGS_ADD += -Wno-missing-prototypes -Wno-documentation +IOPMrootDomain.cpo_CXXWARNFLAGS_ADD += -Wno-missing-prototypes + # # Directories for mig generated files # -COMP_SUBDIRS = +COMP_SUBDIRS = # # Make sure we don't remove this by accident if interrupted at the wrong @@ -90,7 +95,7 @@ $(COMPONENT).filelist: $(OBJS) $(SEG_HACK) -n __HIB -o $${hib_file}__ $${hib_file} || exit 1; \ mv $${hib_file}__ $${hib_file} || exit 1; \ done - @echo LDFILELIST $(COMPONENT) + @echo "$(ColorL)LDFILELIST$(Color0) $(ColorLF)$(COMPONENT)$(Color0)" $(_v)for obj in ${OBJS}; do \ echo $(TARGET)/$(CURRENT_KERNEL_CONFIG)/$${obj}; \ done > $(COMPONENT).filelist diff --git a/iokit/conf/Makefile.x86_64 b/iokit/conf/Makefile.x86_64 index 89d432cc6..61de9d584 100644 --- a/iokit/conf/Makefile.x86_64 +++ b/iokit/conf/Makefile.x86_64 @@ -14,4 +14,3 @@ IOHibernateRestoreKernel.o_CFLAGS_ADD += -fno-stack-protector $(CFLAGS_NOLTO_FLA ###################################################################### #END Machine dependent Makefile fragment for x86_64 
###################################################################### - diff --git a/iokit/conf/files b/iokit/conf/files index 0e883af2b..8dd87dd65 100644 --- a/iokit/conf/files +++ b/iokit/conf/files @@ -105,3 +105,4 @@ iokit/Kernel/IOPowerConnection.cpp optional iokitcpp # System Management iokit/Families/IOSystemManagement/IOWatchDogTimer.cpp optional iokitcpp + diff --git a/libkdd/kcdata/KCDBasicTypeDescription.h b/libkdd/KCDBasicTypeDescription.h similarity index 97% rename from libkdd/kcdata/KCDBasicTypeDescription.h rename to libkdd/KCDBasicTypeDescription.h index ebab25863..165b61164 100644 --- a/libkdd/kcdata/KCDBasicTypeDescription.h +++ b/libkdd/KCDBasicTypeDescription.h @@ -27,7 +27,7 @@ */ #include "kdd.h" -#include +#include #import @interface KCDBasicTypeDescription : KCDataType @@ -40,4 +40,6 @@ */ - (id)createDefaultForType:(uint32_t)typeID; +- (BOOL) shouldMergeData; + @end diff --git a/libkdd/kcdata/KCDBasicTypeDescription.m b/libkdd/KCDBasicTypeDescription.m similarity index 65% rename from libkdd/kcdata/KCDBasicTypeDescription.m rename to libkdd/KCDBasicTypeDescription.m index 151093b0e..5b8a54e87 100644 --- a/libkdd/kcdata/KCDBasicTypeDescription.m +++ b/libkdd/KCDBasicTypeDescription.m @@ -28,9 +28,33 @@ #import "KCDBasicTypeDescription.h" +const char * name_for_subtype(uint8_t elem_type); + +const char * name_for_subtype(uint8_t elem_type) +{ + char * retval = "unknown"; + + switch (elem_type) { + case KC_ST_CHAR: retval = "char"; break; + case KC_ST_INT8: retval = "int8_t"; break; + case KC_ST_UINT8: retval = "uint8_t"; break; + case KC_ST_INT16: retval = "int16_t"; break; + case KC_ST_UINT16: retval = "uint16_t"; break; + case KC_ST_INT32: retval = "int32_t"; break; + case KC_ST_UINT32: retval = "uint32_t"; break; + case KC_ST_INT64: retval = "int64_t"; break; + case KC_ST_UINT64: retval = "uint64_t"; break; + + default: retval = "Unknown"; break; + } + + return retval; +} + + @interface KCDBasicTypeDescription () { - int _typeID; + 
unsigned int _typeID; uint32_t _size; uint32_t _count; NSString * _name; @@ -87,16 +111,30 @@ - (NSObject *)objectForType:(kctype_subtype_t)elem_type withData:(uint8_t *)data return obj; } -- (NSMutableDictionary *)parseData:(void *)dataBuffer ofLength:(uint32_t)length +- (NSDictionary *)parseData:(void *)dataBuffer ofLength:(uint32_t)length { NSMutableDictionary * retval = [[NSMutableDictionary alloc] init]; + if (length <= _subtype_desc.kcs_elem_offset) + return retval; uint8_t * data = (uint8_t *)dataBuffer; - uint32_t elem_count = MIN(_count, length / (_size / _count)); + /* + * Calculate the maximum number of data elements we can parse, Taking into + * account the maximum size specified by the type description, and also the + * actual length of the data buffer and the offset into the buffer where we + * begin parsing. + */ + uint32_t elem_count = MIN(_count, (length - _subtype_desc.kcs_elem_offset) / (_size / _count)); uint32_t elem_size = _size / _count; - if (_count == 1) { + if (elem_count == 0) { + return retval; + } else if (elem_count == 1) { retval[_name] = [self objectForType:_subtype_desc.kcs_elem_type withData:&data[_subtype_desc.kcs_elem_offset]]; } else if (_subtype_desc.kcs_elem_type == KC_ST_CHAR) { - retval[_name] = [NSString stringWithFormat:@"%s", (char *)&data[_subtype_desc.kcs_elem_offset]]; + char *s = (char *)&data[_subtype_desc.kcs_elem_offset]; + if (!(strnlen(s, length) < length)) { + return nil; + } + retval[_name] = [NSString stringWithFormat:@"%s", s]; } else { NSMutableArray * objArray = [NSMutableArray arrayWithCapacity:elem_count]; for (unsigned int i = 0; i < elem_count; i++) { @@ -110,7 +148,12 @@ - (NSMutableDictionary *)parseData:(void *)dataBuffer ofLength:(uint32_t)length - (NSString *)description { - return [NSString stringWithFormat:@"type: %d => \"%@\" ", [self typeID], [self name]]; + if (_subtype_desc.kcs_flags & KCS_SUBTYPE_FLAGS_ARRAY) { + return [NSString stringWithFormat:@"[%d,%d] %s %s[%d];", 
_subtype_desc.kcs_elem_offset, kcs_get_elem_size(&_subtype_desc), name_for_subtype(_subtype_desc.kcs_elem_type), _subtype_desc.kcs_name, kcs_get_elem_count(&_subtype_desc) ]; + }else { + return [NSString stringWithFormat:@"[%d,%d] %s %s;", _subtype_desc.kcs_elem_offset, kcs_get_elem_size(&_subtype_desc), name_for_subtype(_subtype_desc.kcs_elem_type), _subtype_desc.kcs_name ]; + } + //return [NSString stringWithFormat:@"type: %d => \"%@\" ", [self typeID], [self name]]; } - (NSString *)name @@ -123,9 +166,14 @@ - (uint32_t)count return _count; } -- (int)typeID +- (unsigned int)typeID { return _typeID; } +- (BOOL) shouldMergeData +{ + return TRUE; +} + @end diff --git a/libkdd/KCDEmbeddedBufferDescription.h b/libkdd/KCDEmbeddedBufferDescription.h new file mode 100644 index 000000000..35dc05e6d --- /dev/null +++ b/libkdd/KCDEmbeddedBufferDescription.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include "kdd.h" +#include +#import + +@interface KCDEmbeddedBufferDescription : KCDataType +@end diff --git a/libkdd/KCDEmbeddedBufferDescription.m b/libkdd/KCDEmbeddedBufferDescription.m new file mode 100644 index 000000000..32efae932 --- /dev/null +++ b/libkdd/KCDEmbeddedBufferDescription.m @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#import "KCDEmbeddedBufferDescription.h" + +@implementation KCDEmbeddedBufferDescription + +- (NSDictionary *)parseData:(void *)dataBuffer ofLength:(uint32_t)length +{ + return parseKCDataBuffer(dataBuffer, length, NULL); +} + +- (NSString *)name +{ + return @"EmbeddedKcdataBuffer"; +} + +- (BOOL) shouldMergeData +{ + return TRUE; +} + +@end diff --git a/libkdd/kcdata/KCDStructTypeDescription.h b/libkdd/KCDStructTypeDescription.h similarity index 93% rename from libkdd/kcdata/KCDStructTypeDescription.h rename to libkdd/KCDStructTypeDescription.h index 68a200e60..412402ad7 100644 --- a/libkdd/kcdata/KCDStructTypeDescription.h +++ b/libkdd/KCDStructTypeDescription.h @@ -31,8 +31,10 @@ @interface KCDStructTypeDescription : KCDataType -- (id)initWithType:(int)typeID withName:(NSString *)name; +- (id)initWithType:(unsigned int)typeID withName:(NSString *)name; - (void)addFieldBasicType:(KCDBasicTypeDescription *)fieldType; +- (void)setFlagsRequestedMerge; + @end diff --git a/libkdd/kcdata/KCDStructTypeDescription.m b/libkdd/KCDStructTypeDescription.m similarity index 59% rename from libkdd/kcdata/KCDStructTypeDescription.m rename to libkdd/KCDStructTypeDescription.m index 60f70b163..7cc3b208c 100644 --- a/libkdd/kcdata/KCDStructTypeDescription.m +++ b/libkdd/KCDStructTypeDescription.m @@ -34,26 +34,28 @@ @interface KCDStructTypeDescription () { - int _typeID; + unsigned int _typeID; NSString * _name; NSMutableArray * _fields; BOOL 
_needDescriptionAsKey; + BOOL _flagsRequestedMerge; } @end @implementation KCDStructTypeDescription -- (id)initWithType:(int)typeID withName:(NSString *)name +- (id)initWithType:(unsigned int)typeID withName:(NSString *)name { if ((self = [super init])) { - _typeID = typeID; + _typeID = typeID; _name = name; _needDescriptionAsKey = NO; if (typeID >= 0x1 && typeID <= KCDATA_TYPE_MAX_WITH_DESC) _needDescriptionAsKey = YES; - _fields = [[NSMutableArray alloc] init]; + _fields = [[NSMutableArray alloc] init]; + _flagsRequestedMerge = NO; return self; } return NULL; @@ -64,15 +66,37 @@ - (void)addFieldBasicType:(KCDBasicTypeDescription *)fieldType [_fields addObject:fieldType]; } -- (NSMutableDictionary *)parseData:(void *)dataBuffer ofLength:(uint32_t)length +- (void)setFlagsRequestedMerge +{ + _flagsRequestedMerge = YES; +} + +- (NSDictionary *)parseData:(void *)dataBuffer ofLength:(uint32_t)length { NSMutableDictionary * retval = [[NSMutableDictionary alloc] init]; for (KCDataType * fi in _fields) { - NSMutableDictionary * _d = [fi parseData:dataBuffer ofLength:length]; + NSDictionary * _d = [fi parseData:dataBuffer ofLength:length]; + if (!_d) { + return nil; + } for (NSString * k in [_d keyEnumerator]) { retval[k] = _d[k]; } } + if (_typeID == KCDATA_TYPE_TYPEDEFINTION){ + uint32_t elem_size = sizeof(struct kcdata_subtype_descriptor); + uint32_t elem_count = (length - offsetof(struct kcdata_type_definition, kct_elements))/elem_size; + NSMutableArray * fields_array = [NSMutableArray arrayWithCapacity:elem_count]; + struct kcdata_subtype_descriptor *fields_dsc = (struct kcdata_subtype_descriptor *) ((uintptr_t)dataBuffer + offsetof(struct kcdata_type_definition, kct_elements)); + int i = 0; + for (i = 0; i < elem_count; i++) { + KCDBasicTypeDescription * tmpdsc = [[KCDBasicTypeDescription alloc] initWithKCTypeDesc:&fields_dsc[i]]; + NSString *field_desc_str = [tmpdsc description]; + + [fields_array addObject:field_desc_str]; + } + retval[@"fields"] = fields_array; + 
} if (_needDescriptionAsKey) { NSString * desc = retval[@"desc"]; NSObject * obj = retval[@"data"]; @@ -83,6 +107,17 @@ - (NSMutableDictionary *)parseData:(void *)dataBuffer ofLength:(uint32_t)length return retval; } +- (BOOL)shouldMergeData +{ + /* + * If this is a type where the kcdata item itself carries the key name, or + * KCS_SUBTYPE_FLAGS_MERGE was used to define the type, then a member of + * this type should have it's dict merged into the parent container, + * instead of being represented as typename => dict. + */ + return _needDescriptionAsKey || _flagsRequestedMerge; +} + - (NSString *)description { return [NSString stringWithFormat:@"type: %d => \"%@\" ", _typeID, _name]; @@ -98,7 +133,7 @@ - (uint32_t)count return (uint32_t)[_fields count]; } -- (int)typeID +- (unsigned int)typeID { return _typeID; } diff --git a/libkdd/README.md b/libkdd/README.md new file mode 100644 index 000000000..a01193a61 --- /dev/null +++ b/libkdd/README.md @@ -0,0 +1,109 @@ +Kernel Data Descriptors +======================= + +This project allows for dynamic data to be passed from the kernel to userspace tools without binding them to particular version of +struct definition. The `libkdd` library provides convenient API for parsing and interpreting `kernel chunked data`. 
+ +The libkdd APIs are defined in [kdd.h](./kdd.h) + +The `KCDATA` format +=================== + +The format for data is set up in a generic layout as follows + +Layout of data structure +------------------------ + + | 8 - bytes | + |---------------------------| ------ offset = 00 + | type = MAGIC | LENGTH | # BEGIN Header + | 0 | + |---------------------------| ------ offset = 16 + | type | size | # chunk header + | flags | + |---------------------------| ------ offset = 32 + | data | # arbitrary data (len=16) + |___________data____________| + |---------------------------| ------ offset = 48 + | type | size | # chunk header + | flags | + |---------------------------| ------ offset = 64 + | data | # arbitrary data (len=32) + | data | + | data | + |___________data____________| + |---------------------------| ------ offset = 96 + | type = END | size=0 | # chunk header + | 0 | + + +The type field describes what kind of data is passed. For example, type = `TASK_CRASHINFO_UUID` means the following data is a uuid. +These types need to be defined in task_corpses.h for easy consumption by userspace inspection tools. + +Some range of types is reserved for special types like ints, longs etc. A cool new functionality made possible with this +extensible data format is that the kernel can decide to put more information as required without requiring user space tools to +re-compile to be compatible. The case of `rusage` struct versions could be introduced without breaking existing tools. + +Feature description: Generic data with description +------------------- +Furthermore, generic data with a description is now possible. For example: + + - kcdata_add_uint64_with_description(cdatainfo, 0x700, "NUM MACH PORTS"); + - and more functions that allow adding a description. + +The userspace tools can then look at the description and print the data even if they are not compiled with knowledge of the field a priori.
+ + Example data: + 0000 57 f1 ad de 00 00 00 00 00 00 00 00 00 00 00 00 W............... + 0010 01 00 00 00 00 00 00 00 30 00 00 00 00 00 00 00 ........0....... + 0020 50 49 44 00 00 00 00 00 00 00 00 00 00 00 00 00 PID............. + 0030 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 0040 9c 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 0050 01 00 00 00 00 00 00 00 30 00 00 00 00 00 00 00 ........0....... + 0060 50 41 52 45 4e 54 20 50 49 44 00 00 00 00 00 00 PARENT PID...... + 0070 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 0080 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 0090 ed 58 91 f1 + + +Feature description: Container markers for compound data +------------------ + +If a given kernel data type is complex and requires adding multiple optional fields inside a container +object for a consumer to understand arbitrary data, we package it using container markers. + +For example, the stackshot code gathers information and describes the state of a given task with respect +to many subsystems. It includes data such as io stats, vm counters, process names/flags and syscall counts. + + kcdata_add_container_marker(kcdata_p, KCDATA_TYPE_CONTAINER_BEGIN, STACKSHOT_KCCONTAINER_TASK, task_uniqueid); + // add multiple data, or add_<type>_with_description()s here + + kcdata_add_container_marker(kcdata_p, KCDATA_TYPE_CONTAINER_END, STACKSHOT_KCCONTAINER_TASK, task_uniqueid); + + +Feature description: Custom Data formats on demand +-------------------- + +With the self-describing nature of the format, the kernel provider can describe a data type (uniquely identified by a number) and use +it in the buffer for sending data. The consumer can parse the type information and thereby know how to interpret incoming data. +The following is an example of how we can describe a kernel-specific struct sample_disk_io_stats in the buffer.
+ + struct sample_disk_io_stats { + uint64_t disk_reads_count; + uint64_t disk_reads_size; + uint64_t io_priority_count[4]; + uint64_t io_priority_size; + } __attribute__ ((packed)); + + + struct kcdata_subtype_descriptor disk_io_stats_def[] = { + {KCS_SUBTYPE_FLAGS_NONE, KC_ST_UINT64, 0 * sizeof(uint64_t), sizeof(uint64_t), "disk_reads_count"}, + {KCS_SUBTYPE_FLAGS_NONE, KC_ST_UINT64, 1 * sizeof(uint64_t), sizeof(uint64_t), "disk_reads_size"}, + {KCS_SUBTYPE_FLAGS_ARRAY, KC_ST_UINT64, 2 * sizeof(uint64_t), KCS_SUBTYPE_PACK_SIZE(4, sizeof(uint64_t)), "io_priority_count"}, + {KCS_SUBTYPE_FLAGS_NONE, KC_ST_UINT64, (2 + 4) * sizeof(uint64_t), sizeof(uint64_t), "io_priority_size"}, + }; + +Now you can add this custom type definition into the buffer as + kcdata_add_type_definition(kcdata_p, KCTYPE_SAMPLE_DISK_IO_STATS, "sample_disk_io_stats", + &disk_io_stats_def[0], sizeof(disk_io_stats_def)/sizeof(struct kcdata_subtype_descriptor)); + diff --git a/libkdd/kcdata.h b/libkdd/kcdata.h new file mode 100644 index 000000000..3e1c76d31 --- /dev/null +++ b/libkdd/kcdata.h @@ -0,0 +1,1061 @@ +/* + * Copyright (c) 2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + + +/* + * + * THE KCDATA MANIFESTO + * + * Kcdata is a self-describing data serialization format. It is meant to get + * nested data structures out of xnu with minimum fuss, but also for that data + * to be easy to parse. It is also meant to allow us to add new fields and + * evolve the data format without breaking old parsers. + * + * Kcdata is a permanent data format suitable for long-term storage including + * in files. It is very important that we continue to be able to parse old + * versions of kcdata-based formats. To this end, there are several + * invariants you MUST MAINTAIN if you alter this file. + * + * * None of the magic numbers should ever be a byteswap of themselves or + * of any of the other magic numbers. + * + * * Never remove any type. + * + * * All kcdata structs must be packed, and must exclusively use fixed-size + * types. + * + * * Never change the definition of any type, except to add new fields to + * the end. + * + * * If you do add new fields to the end of a type, do not actually change + * the definition of the old structure. Instead, define a new structure + * with the new fields. See thread_snapshot_v3 as an example. This + * provides source compatibility for old readers, and also documents where + * the potential size cutoffs are. + * + * * If you change libkdd, or kcdata.py run the unit tests under libkdd. 
+ * + * * If you add a type or extend an existing one, add a sample test to + * libkdd/tests so future changes to libkdd will always parse your struct + * correctly. + * + * For example to add a field to this: + * + * struct foobar { + * uint32_t baz; + * uint32_t quux; + * } __attribute__ ((packed)); + * + * Make it look like this: + * + * struct foobar { + * uint32_t baz; + * uint32_t quux; + * ///////// end version 1 of foobar. sizeof(struct foobar) was 8 //////// + * uint32_t frozzle; + * } __attribute__ ((packed)); + * + * If you are parsing kcdata formats, you MUST + * + * * Check the length field of each struct, including array elements. If the + * struct is longer than you expect, you must ignore the extra data. + * + * * Ignore any data types you do not understand. + * + * Additionally, we want to be as forward compatible as we can. Meaning old + * tools should still be able to use new data whenever possible. To this end, + * you should: + * + * * Try not to add new versions of types that supplant old ones. Instead + * extend the length of existing types or add supplemental types. + * + * * Try not to remove information from existing kcdata formats, unless + * removal was explicitly asked for. For example it is fine to add a + * stackshot flag to remove unwanted information, but you should not + * remove it from the default stackshot if the new flag is absent. + * + * * (TBD) If you do break old readers by removing information or + * supplanting old structs, then increase the major version number. + * + * + * + * The following is a description of the kcdata format. 
+ * + * + * The format for data is set up in a generic layout as follows + * + * Layout of data structure: + * + * | 8 - bytes | + * | type = MAGIC | LENGTH | + * | 0 | + * | type | size | + * | flags | + * | data | + * |___________data____________| + * | type | size | + * | flags | + * |___________data____________| + * | type = END | size=0 | + * | 0 | + * + * + * The type field describes what kind of data is passed. For example, type = TASK_CRASHINFO_UUID means the following data is a uuid. + * These types need to be defined in task_corpses.h for easy consumption by userspace inspection tools. + * + * Some range of types is reserved for special types like ints, longs etc. A cool new functionality made possible with this + * extensible data format is that the kernel can decide to put more information as required without requiring user space tools to + * re-compile to be compatible. The case of rusage struct versions could be introduced without breaking existing tools. + * + * Feature description: Generic data with description + * ------------------- + * Furthermore, generic data with a description is now possible. For example: + * + * - kcdata_add_uint64_with_description(cdatainfo, 0x700, "NUM MACH PORTS"); + * - and more functions that allow adding a description. + * The userspace tools can then look at the description and print the data even if they are not compiled with knowledge of the field a priori. + * + * Example data: + * 0000 57 f1 ad de 00 00 00 00 00 00 00 00 00 00 00 00 W............... + * 0010 01 00 00 00 00 00 00 00 30 00 00 00 00 00 00 00 ........0....... + * 0020 50 49 44 00 00 00 00 00 00 00 00 00 00 00 00 00 PID............. + * 0030 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + * 0040 9c 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + * 0050 01 00 00 00 00 00 00 00 30 00 00 00 00 00 00 00 ........0....... + * 0060 50 41 52 45 4e 54 20 50 49 44 00 00 00 00 00 00 PARENT PID......
+ * 0070 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + * 0080 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + * 0090 ed 58 91 f1 + * + * Feature description: Container markers for compound data + * ------------------ + * If a given kernel data type is complex and requires adding multiple optional fields inside a container + * object for a consumer to understand arbitrary data, we package it using container markers. + * + * For example, the stackshot code gathers information and describes the state of a given task with respect + * to many subsystems. It includes data such as io stats, vm counters, process names/flags and syscall counts. + * + * kcdata_add_container_marker(kcdata_p, KCDATA_TYPE_CONTAINER_BEGIN, STACKSHOT_KCCONTAINER_TASK, task_uniqueid); + * // add multiple data, or add__with_description()s here + * + * kcdata_add_container_marker(kcdata_p, KCDATA_TYPE_CONTAINER_END, STACKSHOT_KCCONTAINER_TASK, task_uniqueid); + * + * Feature description: Custom Data formats on demand + * -------------------- + * With the self describing nature of format, the kernel provider can describe a data type (uniquely identified by a number) and use + * it in the buffer for sending data. The consumer can parse the type information and have knowledge of describing incoming data. + * Following is an example of how we can describe a kernel specific struct sample_disk_io_stats in buffer. 
+ * + * struct sample_disk_io_stats { + * uint64_t disk_reads_count; + * uint64_t disk_reads_size; + * uint64_t io_priority_count[4]; + * uint64_t io_priority_size; + * } __attribute__ ((packed)); + * + * + * struct kcdata_subtype_descriptor disk_io_stats_def[] = { + * {KCS_SUBTYPE_FLAGS_NONE, KC_ST_UINT64, 0 * sizeof(uint64_t), sizeof(uint64_t), "disk_reads_count"}, + * {KCS_SUBTYPE_FLAGS_NONE, KC_ST_UINT64, 1 * sizeof(uint64_t), sizeof(uint64_t), "disk_reads_size"}, + * {KCS_SUBTYPE_FLAGS_ARRAY, KC_ST_UINT64, 2 * sizeof(uint64_t), KCS_SUBTYPE_PACK_SIZE(4, sizeof(uint64_t)), "io_priority_count"}, + * {KCS_SUBTYPE_FLAGS_ARRAY, KC_ST_UINT64, (2 + 4) * sizeof(uint64_t), sizeof(uint64_t), "io_priority_size"}, + * }; + * + * Now you can add this custom type definition into the buffer as + * kcdata_add_type_definition(kcdata_p, KCTYPE_SAMPLE_DISK_IO_STATS, "sample_disk_io_stats", + * &disk_io_stats_def[0], sizeof(disk_io_stats_def)/sizeof(struct kcdata_subtype_descriptor)); + * + */ + + +#ifndef _KCDATA_H_ +#define _KCDATA_H_ + +#include +#include +#include + +#define KCDATA_DESC_MAXLEN 32 /* including NULL byte at end */ + +#define KCDATA_FLAGS_STRUCT_PADDING_MASK 0xf +#define KCDATA_FLAGS_STRUCT_HAS_PADDING 0x80 + +/* + * kcdata aligns elements to 16 byte boundaries. + */ +#define KCDATA_ALIGNMENT_SIZE 0x10 + +struct kcdata_item { + uint32_t type; + uint32_t size; /* len(data) */ + /* flags. + * + * For structures: + * padding = flags & 0xf + * has_padding = (flags & 0x80) >> 7 + * + * has_padding is needed to disambiguate cases such as + * thread_snapshot_v2 and thread_snapshot_v3. Their + * respective sizes are 0x68 and 0x70, and thread_snapshot_v2 + * was emmitted by old kernels *before* we started recording + * padding. Since legacy thread_snapsht_v2 and modern + * thread_snapshot_v3 will both record 0 for the padding + * flags, we need some other bit which will be nonzero in the + * flags to disambiguate. 
+ * + * This is why we hardcode a special case for + * STACKSHOT_KCTYPE_THREAD_SNAPSHOT into the iterator + * functions below. There is only a finite number of such + * hardcodings which will ever be needed. They can occur + * when: + * + * * We have a legacy structure that predates padding flags + * + * * which we want to extend without changing the kcdata type + * + * * by only so many bytes as would fit in the space that + * was previously unused padding. + * + * For containers: + * container_id = flags + * + * For arrays: + * element_count = flags & UINT32_MAX + * element_type = (flags >> 32) & UINT32_MAX + */ + uint64_t flags; + char data[]; /* must be at the end */ +}; + +typedef struct kcdata_item * kcdata_item_t; + +enum KCDATA_SUBTYPE_TYPES { KC_ST_CHAR = 1, KC_ST_INT8, KC_ST_UINT8, KC_ST_INT16, KC_ST_UINT16, KC_ST_INT32, KC_ST_UINT32, KC_ST_INT64, KC_ST_UINT64 }; +typedef enum KCDATA_SUBTYPE_TYPES kctype_subtype_t; + +/* + * A subtype description structure that defines + * how a compound data is laid out in memory. This + * provides on the fly definition of types and consumption + * by the parser. + */ +struct kcdata_subtype_descriptor { + uint8_t kcs_flags; +#define KCS_SUBTYPE_FLAGS_NONE 0x0 +#define KCS_SUBTYPE_FLAGS_ARRAY 0x1 +/* Force struct type even if only one element. + * + * Normally a kcdata_type_definition is treated as a structure if it has + * more than one subtype descriptor. Otherwise it is treated as a simple + * type. For example libkdd will represent a simple integer 42 as simply + * 42, but it will represent a structure containing an integer 42 as + * {"field_name": 42}.. + * + * If a kcdata_type_definition has only single subtype, then it will be + * treated as a structure iff KCS_SUBTYPE_FLAGS_STRUCT is set. If it has + * multiple subtypes, it will always be treated as a structure. + * + * KCS_SUBTYPE_FLAGS_MERGE has the opposite effect. 
If this flag is used then + * even if there are multiple elements, they will all be treated as individual + * properties of the parent dictionary. + */ +#define KCS_SUBTYPE_FLAGS_STRUCT 0x2 /* force struct type even if only one element */ +#define KCS_SUBTYPE_FLAGS_MERGE 0x4 /* treat as multiple elements of parents instead of struct */ + uint8_t kcs_elem_type; /* restricted to kctype_subtype_t */ + uint16_t kcs_elem_offset; /* offset in struct where data is found */ + uint32_t kcs_elem_size; /* size of element (or) packed state for array type */ + char kcs_name[KCDATA_DESC_MAXLEN]; /* max 31 bytes for name of field */ +}; + +typedef struct kcdata_subtype_descriptor * kcdata_subtype_descriptor_t; + +/* + * In case of array of basic c types in kctype_subtype_t, + * size is packed in lower 16 bits and + * count is packed in upper 16 bits of kcs_elem_size field. + */ +#define KCS_SUBTYPE_PACK_SIZE(e_count, e_size) (((e_count)&0xffffu) << 16 | ((e_size)&0xffffu)) + +static inline uint32_t +kcs_get_elem_size(kcdata_subtype_descriptor_t d) +{ + if (d->kcs_flags & KCS_SUBTYPE_FLAGS_ARRAY) { + /* size is composed as ((count &0xffff)<<16 | (elem_size & 0xffff)) */ + return (uint32_t)((d->kcs_elem_size & 0xffff) * ((d->kcs_elem_size & 0xffff0000)>>16)); + } + return d->kcs_elem_size; +} + +static inline uint32_t +kcs_get_elem_count(kcdata_subtype_descriptor_t d) +{ + if (d->kcs_flags & KCS_SUBTYPE_FLAGS_ARRAY) + return (d->kcs_elem_size >> 16) & 0xffff; + return 1; +} + +static inline int +kcs_set_elem_size(kcdata_subtype_descriptor_t d, uint32_t size, uint32_t count) +{ + if (count > 1) { + /* means we are setting up an array */ + if (size > 0xffff || count > 0xffff) + return -1; //invalid argument + d->kcs_elem_size = ((count & 0xffff) << 16 | (size & 0xffff)); + } + else + { + d->kcs_elem_size = size; + } + return 0; +} + +struct kcdata_type_definition { + uint32_t kct_type_identifier; + uint32_t kct_num_elements; + char kct_name[KCDATA_DESC_MAXLEN]; + struct 
kcdata_subtype_descriptor kct_elements[]; +}; + + +/* chunk type definitions. 0 - 0x7ff are reserved and defined here + * NOTE: Please update kcdata/libkdd/kcdtypes.c if you make any changes + * in STACKSHOT_KCTYPE_* types. + */ + +/* + * Types with description value. + * these will have KCDATA_DESC_MAXLEN-1 length string description + * and rest of kcdata_iter_size() - KCDATA_DESC_MAXLEN bytes as data + */ +#define KCDATA_TYPE_INVALID 0x0u +#define KCDATA_TYPE_STRING_DESC 0x1u +#define KCDATA_TYPE_UINT32_DESC 0x2u +#define KCDATA_TYPE_UINT64_DESC 0x3u +#define KCDATA_TYPE_INT32_DESC 0x4u +#define KCDATA_TYPE_INT64_DESC 0x5u +#define KCDATA_TYPE_BINDATA_DESC 0x6u + +/* + * Compound type definitions + */ +#define KCDATA_TYPE_ARRAY 0x11u /* Array of data OBSOLETE DONT USE THIS*/ +#define KCDATA_TYPE_TYPEDEFINTION 0x12u /* Meta type that describes a type on the fly. */ +#define KCDATA_TYPE_CONTAINER_BEGIN \ + 0x13u /* Container type which has corresponding CONTAINER_END header. \ + * KCDATA_TYPE_CONTAINER_BEGIN has type in the data segment. \ + * Both headers have (uint64_t) ID for matching up nested data. 
\ + */ +#define KCDATA_TYPE_CONTAINER_END 0x14u + +#define KCDATA_TYPE_ARRAY_PAD0 0x20u /* Array of data with 0 byte of padding*/ +#define KCDATA_TYPE_ARRAY_PAD1 0x21u /* Array of data with 1 byte of padding*/ +#define KCDATA_TYPE_ARRAY_PAD2 0x22u /* Array of data with 2 byte of padding*/ +#define KCDATA_TYPE_ARRAY_PAD3 0x23u /* Array of data with 3 byte of padding*/ +#define KCDATA_TYPE_ARRAY_PAD4 0x24u /* Array of data with 4 byte of padding*/ +#define KCDATA_TYPE_ARRAY_PAD5 0x25u /* Array of data with 5 byte of padding*/ +#define KCDATA_TYPE_ARRAY_PAD6 0x26u /* Array of data with 6 byte of padding*/ +#define KCDATA_TYPE_ARRAY_PAD7 0x27u /* Array of data with 7 byte of padding*/ +#define KCDATA_TYPE_ARRAY_PAD8 0x28u /* Array of data with 8 byte of padding*/ +#define KCDATA_TYPE_ARRAY_PAD9 0x29u /* Array of data with 9 byte of padding*/ +#define KCDATA_TYPE_ARRAY_PADa 0x2au /* Array of data with a byte of padding*/ +#define KCDATA_TYPE_ARRAY_PADb 0x2bu /* Array of data with b byte of padding*/ +#define KCDATA_TYPE_ARRAY_PADc 0x2cu /* Array of data with c byte of padding*/ +#define KCDATA_TYPE_ARRAY_PADd 0x2du /* Array of data with d byte of padding*/ +#define KCDATA_TYPE_ARRAY_PADe 0x2eu /* Array of data with e byte of padding*/ +#define KCDATA_TYPE_ARRAY_PADf 0x2fu /* Array of data with f byte of padding*/ + +/* + * Generic data types that are most commonly used + */ +#define KCDATA_TYPE_LIBRARY_LOADINFO 0x30u /* struct dyld_uuid_info_32 */ +#define KCDATA_TYPE_LIBRARY_LOADINFO64 0x31u /* struct dyld_uuid_info_64 */ +#define KCDATA_TYPE_TIMEBASE 0x32u /* struct mach_timebase_info */ +#define KCDATA_TYPE_MACH_ABSOLUTE_TIME 0x33u /* uint64_t */ +#define KCDATA_TYPE_TIMEVAL 0x34u /* struct timeval64 */ +#define KCDATA_TYPE_USECS_SINCE_EPOCH 0x35u /* time in usecs uint64_t */ +#define KCDATA_TYPE_PID 0x36u /* int32_t */ +#define KCDATA_TYPE_PROCNAME 0x37u /* char * */ +#define KCDATA_TYPE_NESTED_KCDATA 0x38u /* nested kcdata buffer */ + +#define 
KCDATA_TYPE_BUFFER_END 0xF19158EDu + +/* MAGIC numbers defined for each class of chunked data + * + * To future-proof against big-endian arches, make sure none of these magic + * numbers are byteswaps of each other + */ + +#define KCDATA_BUFFER_BEGIN_CRASHINFO 0xDEADF157u /* owner: corpses/task_corpse.h */ + /* type-range: 0x800 - 0x8ff */ +#define KCDATA_BUFFER_BEGIN_STACKSHOT 0x59a25807u /* owner: sys/stackshot.h */ + /* type-range: 0x900 - 0x93f */ +#define KCDATA_BUFFER_BEGIN_DELTA_STACKSHOT 0xDE17A59Au /* owner: sys/stackshot.h */ + /* type-range: 0x940 - 0x9ff */ +#define KCDATA_BUFFER_BEGIN_OS_REASON 0x53A20900u /* owner: sys/reason.h */ + /* type-range: 0x1000-0x103f */ +#define KCDATA_BUFFER_BEGIN_XNUPOST_CONFIG 0x1e21c09fu /* owner: osfmk/tests/kernel_tests.c */ + /* type-range: 0x1040-0x105f */ + +/* next type range number available 0x1060 */ +/**************** definitions for XNUPOST *********************/ +#define XNUPOST_KCTYPE_TESTCONFIG 0x1040 + +/**************** definitions for stackshot *********************/ + +/* This value must always match IO_NUM_PRIORITIES defined in thread_info.h */ +#define STACKSHOT_IO_NUM_PRIORITIES 4 +/* This value must always match MAXTHREADNAMESIZE used in bsd */ +#define STACKSHOT_MAX_THREAD_NAME_SIZE 64 + +/* + * NOTE: Please update kcdata/libkdd/kcdtypes.c if you make any changes + * in STACKSHOT_KCTYPE_* types. 
+ */ +#define STACKSHOT_KCTYPE_IOSTATS 0x901u /* io_stats_snapshot */ +#define STACKSHOT_KCTYPE_GLOBAL_MEM_STATS 0x902u /* struct mem_and_io_snapshot */ +#define STACKSHOT_KCCONTAINER_TASK 0x903u +#define STACKSHOT_KCCONTAINER_THREAD 0x904u +#define STACKSHOT_KCTYPE_TASK_SNAPSHOT 0x905u /* task_snapshot_v2 */ +#define STACKSHOT_KCTYPE_THREAD_SNAPSHOT 0x906u /* thread_snapshot_v2, thread_snapshot_v3 */ +#define STACKSHOT_KCTYPE_DONATING_PIDS 0x907u /* int[] */ +#define STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO 0x908u /* same as KCDATA_TYPE_LIBRARY_LOADINFO64 */ +#define STACKSHOT_KCTYPE_THREAD_NAME 0x909u /* char[] */ +#define STACKSHOT_KCTYPE_KERN_STACKFRAME 0x90Au /* struct stack_snapshot_frame32 */ +#define STACKSHOT_KCTYPE_KERN_STACKFRAME64 0x90Bu /* struct stack_snapshot_frame64 */ +#define STACKSHOT_KCTYPE_USER_STACKFRAME 0x90Cu /* struct stack_snapshot_frame32 */ +#define STACKSHOT_KCTYPE_USER_STACKFRAME64 0x90Du /* struct stack_snapshot_frame64 */ +#define STACKSHOT_KCTYPE_BOOTARGS 0x90Eu /* boot args string */ +#define STACKSHOT_KCTYPE_OSVERSION 0x90Fu /* os version string */ +#define STACKSHOT_KCTYPE_KERN_PAGE_SIZE 0x910u /* kernel page size in uint32_t */ +#define STACKSHOT_KCTYPE_JETSAM_LEVEL 0x911u /* jetsam level in uint32_t */ +#define STACKSHOT_KCTYPE_DELTA_SINCE_TIMESTAMP 0x912u /* timestamp used for the delta stackshot */ + +#define STACKSHOT_KCTYPE_TASK_DELTA_SNAPSHOT 0x940u /* task_delta_snapshot_v2 */ +#define STACKSHOT_KCTYPE_THREAD_DELTA_SNAPSHOT 0x941u /* thread_delta_snapshot_v2 */ + +#define STACKSHOT_KCTYPE_KERN_STACKLR 0x913u /* uint32_t */ +#define STACKSHOT_KCTYPE_KERN_STACKLR64 0x914u /* uint64_t */ +#define STACKSHOT_KCTYPE_USER_STACKLR 0x915u /* uint32_t */ +#define STACKSHOT_KCTYPE_USER_STACKLR64 0x916u /* uint64_t */ +#define STACKSHOT_KCTYPE_NONRUNNABLE_TIDS 0x917u /* uint64_t */ +#define STACKSHOT_KCTYPE_NONRUNNABLE_TASKS 0x918u /* uint64_t */ +#define STACKSHOT_KCTYPE_CPU_TIMES 0x919u /* struct stackshot_cpu_times */ +#define 
STACKSHOT_KCTYPE_STACKSHOT_DURATION 0x91au /* struct stackshot_duration */ +#define STACKSHOT_KCTYPE_STACKSHOT_FAULT_STATS 0x91bu /* struct stackshot_fault_stats */ +#define STACKSHOT_KCTYPE_KERNELCACHE_LOADINFO 0x91cu /* kernelcache UUID -- same as KCDATA_TYPE_LIBRARY_LOADINFO64 */ + +struct stack_snapshot_frame32 { + uint32_t lr; + uint32_t sp; +}; + +struct stack_snapshot_frame64 { + uint64_t lr; + uint64_t sp; +}; + +struct dyld_uuid_info_32 { + uint32_t imageLoadAddress; /* base address image is mapped at */ + uuid_t imageUUID; +}; + +struct dyld_uuid_info_64 { + uint64_t imageLoadAddress; /* XXX image slide */ + uuid_t imageUUID; +}; + +struct dyld_uuid_info_64_v2 { + uint64_t imageLoadAddress; /* XXX image slide */ + uuid_t imageUUID; + /* end of version 1 of dyld_uuid_info_64. sizeof v1 was 24 */ + uint64_t imageSlidBaseAddress; /* slid base address of image */ +}; + +struct user32_dyld_uuid_info { + uint32_t imageLoadAddress; /* base address image is mapped into */ + uuid_t imageUUID; /* UUID of image */ +}; + +struct user64_dyld_uuid_info { + uint64_t imageLoadAddress; /* base address image is mapped into */ + uuid_t imageUUID; /* UUID of image */ +}; + +enum task_snapshot_flags { + kTaskRsrcFlagged = 0x4, // In the EXC_RESOURCE danger zone? 
+ kTerminatedSnapshot = 0x8, + kPidSuspended = 0x10, // true for suspended task + kFrozen = 0x20, // true for hibernated task (along with pidsuspended) + kTaskDarwinBG = 0x40, + kTaskExtDarwinBG = 0x80, + kTaskVisVisible = 0x100, + kTaskVisNonvisible = 0x200, + kTaskIsForeground = 0x400, + kTaskIsBoosted = 0x800, + kTaskIsSuppressed = 0x1000, + kTaskIsTimerThrottled = 0x2000, /* deprecated */ + kTaskIsImpDonor = 0x4000, + kTaskIsLiveImpDonor = 0x8000, + kTaskIsDirty = 0x10000, + kTaskWqExceededConstrainedThreadLimit = 0x20000, + kTaskWqExceededTotalThreadLimit = 0x40000, + kTaskWqFlagsAvailable = 0x80000, + kTaskUUIDInfoFaultedIn = 0x100000, /* successfully faulted in some UUID info */ + kTaskUUIDInfoMissing = 0x200000, /* some UUID info was paged out */ + kTaskUUIDInfoTriedFault = 0x400000, /* tried to fault in UUID info */ + kTaskSharedRegionInfoUnavailable = 0x800000, /* shared region info unavailable */ +}; + +enum thread_snapshot_flags { + kHasDispatchSerial = 0x4, + kStacksPCOnly = 0x8, /* Stack traces have no frame pointers. 
*/ + kThreadDarwinBG = 0x10, /* Thread is darwinbg */ + kThreadIOPassive = 0x20, /* Thread uses passive IO */ + kThreadSuspended = 0x40, /* Thread is suspended */ + kThreadTruncatedBT = 0x80, /* Unmapped pages caused truncated backtrace */ + kGlobalForcedIdle = 0x100, /* Thread performs global forced idle */ + kThreadFaultedBT = 0x200, /* Some thread stack pages were faulted in as part of BT */ + kThreadTriedFaultBT = 0x400, /* We tried to fault in thread stack pages as part of BT */ + kThreadOnCore = 0x800, /* Thread was on-core when we entered debugger context */ + kThreadIdleWorker = 0x1000, /* Thread is an idle libpthread worker thread */ +}; + +struct mem_and_io_snapshot { + uint32_t snapshot_magic; + uint32_t free_pages; + uint32_t active_pages; + uint32_t inactive_pages; + uint32_t purgeable_pages; + uint32_t wired_pages; + uint32_t speculative_pages; + uint32_t throttled_pages; + uint32_t filebacked_pages; + uint32_t compressions; + uint32_t decompressions; + uint32_t compressor_size; + int32_t busy_buffer_count; + uint32_t pages_wanted; + uint32_t pages_reclaimed; + uint8_t pages_wanted_reclaimed_valid; // did mach_vm_pressure_monitor succeed? 
+} __attribute__((packed)); + +/* SS_TH_* macros are for ths_state */ +#define SS_TH_WAIT 0x01 /* queued for waiting */ +#define SS_TH_SUSP 0x02 /* stopped or requested to stop */ +#define SS_TH_RUN 0x04 /* running or on runq */ +#define SS_TH_UNINT 0x08 /* waiting uninteruptibly */ +#define SS_TH_TERMINATE 0x10 /* halted at termination */ +#define SS_TH_TERMINATE2 0x20 /* added to termination queue */ +#define SS_TH_IDLE 0x80 /* idling processor */ + +struct thread_snapshot_v2 { + uint64_t ths_thread_id; + uint64_t ths_wait_event; + uint64_t ths_continuation; + uint64_t ths_total_syscalls; + uint64_t ths_voucher_identifier; + uint64_t ths_dqserialnum; + uint64_t ths_user_time; + uint64_t ths_sys_time; + uint64_t ths_ss_flags; + uint64_t ths_last_run_time; + uint64_t ths_last_made_runnable_time; + uint32_t ths_state; + uint32_t ths_sched_flags; + int16_t ths_base_priority; + int16_t ths_sched_priority; + uint8_t ths_eqos; + uint8_t ths_rqos; + uint8_t ths_rqos_override; + uint8_t ths_io_tier; +} __attribute__((packed)); + +struct thread_snapshot_v3 { + uint64_t ths_thread_id; + uint64_t ths_wait_event; + uint64_t ths_continuation; + uint64_t ths_total_syscalls; + uint64_t ths_voucher_identifier; + uint64_t ths_dqserialnum; + uint64_t ths_user_time; + uint64_t ths_sys_time; + uint64_t ths_ss_flags; + uint64_t ths_last_run_time; + uint64_t ths_last_made_runnable_time; + uint32_t ths_state; + uint32_t ths_sched_flags; + int16_t ths_base_priority; + int16_t ths_sched_priority; + uint8_t ths_eqos; + uint8_t ths_rqos; + uint8_t ths_rqos_override; + uint8_t ths_io_tier; + uint64_t ths_thread_t; +} __attribute__((packed)); + +struct thread_delta_snapshot_v2 { + uint64_t tds_thread_id; + uint64_t tds_voucher_identifier; + uint64_t tds_ss_flags; + uint64_t tds_last_made_runnable_time; + uint32_t tds_state; + uint32_t tds_sched_flags; + int16_t tds_base_priority; + int16_t tds_sched_priority; + uint8_t tds_eqos; + uint8_t tds_rqos; + uint8_t tds_rqos_override; + uint8_t 
tds_io_tier; +} __attribute__ ((packed)); + +struct io_stats_snapshot +{ + /* + * I/O Statistics + * XXX: These fields must be together. + */ + uint64_t ss_disk_reads_count; + uint64_t ss_disk_reads_size; + uint64_t ss_disk_writes_count; + uint64_t ss_disk_writes_size; + uint64_t ss_io_priority_count[STACKSHOT_IO_NUM_PRIORITIES]; + uint64_t ss_io_priority_size[STACKSHOT_IO_NUM_PRIORITIES]; + uint64_t ss_paging_count; + uint64_t ss_paging_size; + uint64_t ss_non_paging_count; + uint64_t ss_non_paging_size; + uint64_t ss_data_count; + uint64_t ss_data_size; + uint64_t ss_metadata_count; + uint64_t ss_metadata_size; + /* XXX: I/O Statistics end */ + +} __attribute__ ((packed)); + +struct task_snapshot_v2 { + uint64_t ts_unique_pid; + uint64_t ts_ss_flags; + uint64_t ts_user_time_in_terminated_threads; + uint64_t ts_system_time_in_terminated_threads; + uint64_t ts_p_start_sec; + uint64_t ts_task_size; + uint64_t ts_max_resident_size; + uint32_t ts_suspend_count; + uint32_t ts_faults; + uint32_t ts_pageins; + uint32_t ts_cow_faults; + uint32_t ts_was_throttled; + uint32_t ts_did_throttle; + uint32_t ts_latency_qos; + int32_t ts_pid; + char ts_p_comm[32]; +} __attribute__ ((packed)); + +struct task_delta_snapshot_v2 { + uint64_t tds_unique_pid; + uint64_t tds_ss_flags; + uint64_t tds_user_time_in_terminated_threads; + uint64_t tds_system_time_in_terminated_threads; + uint64_t tds_task_size; + uint64_t tds_max_resident_size; + uint32_t tds_suspend_count; + uint32_t tds_faults; + uint32_t tds_pageins; + uint32_t tds_cow_faults; + uint32_t tds_was_throttled; + uint32_t tds_did_throttle; + uint32_t tds_latency_qos; +} __attribute__ ((packed)); + +struct stackshot_cpu_times { + uint64_t user_usec; + uint64_t system_usec; +} __attribute__((packed)); + +struct stackshot_duration { + uint64_t stackshot_duration; + uint64_t stackshot_duration_outer; +} __attribute__((packed)); + +struct stackshot_fault_stats { + uint32_t sfs_pages_faulted_in; /* number of pages faulted in using 
KDP fault path */ + uint64_t sfs_time_spent_faulting; /* MATUs spent faulting */ + uint64_t sfs_system_max_fault_time; /* MATUs fault time limit per stackshot */ + uint8_t sfs_stopped_faulting; /* we stopped decompressing because we hit the limit */ +} __attribute__((packed)); + +/**************** definitions for crashinfo *********************/ + +/* + * NOTE: Please update kcdata/libkdd/kcdtypes.c if you make any changes + * in TASK_CRASHINFO_* types. + */ + +/* FIXME some of these types aren't clean (fixed width, packed, and defined *here*) */ + +#define TASK_CRASHINFO_BEGIN KCDATA_BUFFER_BEGIN_CRASHINFO +#define TASK_CRASHINFO_STRING_DESC KCDATA_TYPE_STRING_DESC +#define TASK_CRASHINFO_UINT32_DESC KCDATA_TYPE_UINT32_DESC +#define TASK_CRASHINFO_UINT64_DESC KCDATA_TYPE_UINT64_DESC + +#define TASK_CRASHINFO_EXTMODINFO 0x801 +#define TASK_CRASHINFO_BSDINFOWITHUNIQID 0x802 /* struct proc_uniqidentifierinfo */ +#define TASK_CRASHINFO_TASKDYLD_INFO 0x803 +#define TASK_CRASHINFO_UUID 0x804 +#define TASK_CRASHINFO_PID 0x805 +#define TASK_CRASHINFO_PPID 0x806 +#define TASK_CRASHINFO_RUSAGE 0x807 /* struct rusage DEPRECATED do not use. 
+ This struct has longs in it */ +#define TASK_CRASHINFO_RUSAGE_INFO 0x808 /* struct rusage_info_v3 from resource.h */ +#define TASK_CRASHINFO_PROC_NAME 0x809 /* char * */ +#define TASK_CRASHINFO_PROC_STARTTIME 0x80B /* struct timeval64 */ +#define TASK_CRASHINFO_USERSTACK 0x80C /* uint64_t */ +#define TASK_CRASHINFO_ARGSLEN 0x80D +#define TASK_CRASHINFO_EXCEPTION_CODES 0x80E /* mach_exception_data_t */ +#define TASK_CRASHINFO_PROC_PATH 0x80F /* string of len MAXPATHLEN */ +#define TASK_CRASHINFO_PROC_CSFLAGS 0x810 /* uint32_t */ +#define TASK_CRASHINFO_PROC_STATUS 0x811 /* char */ +#define TASK_CRASHINFO_UID 0x812 /* uid_t */ +#define TASK_CRASHINFO_GID 0x813 /* gid_t */ +#define TASK_CRASHINFO_PROC_ARGC 0x814 /* int */ +#define TASK_CRASHINFO_PROC_FLAGS 0x815 /* unsigned int */ +#define TASK_CRASHINFO_CPUTYPE 0x816 /* cpu_type_t */ +#define TASK_CRASHINFO_WORKQUEUEINFO 0x817 /* struct proc_workqueueinfo */ +#define TASK_CRASHINFO_RESPONSIBLE_PID 0x818 /* pid_t */ +#define TASK_CRASHINFO_DIRTY_FLAGS 0x819 /* int */ +#define TASK_CRASHINFO_CRASHED_THREADID 0x81A /* uint64_t */ +#define TASK_CRASHINFO_COALITION_ID 0x81B /* uint64_t */ +#define TASK_CRASHINFO_UDATA_PTRS 0x81C /* uint64_t */ +#define TASK_CRASHINFO_MEMORY_LIMIT 0x81D /* uint64_t */ + +#define TASK_CRASHINFO_END KCDATA_TYPE_BUFFER_END + +/**************** definitions for os reasons *********************/ + +#define EXIT_REASON_SNAPSHOT 0x1001 +#define EXIT_REASON_USER_DESC 0x1002 /* string description of reason */ +#define EXIT_REASON_USER_PAYLOAD 0x1003 /* user payload data */ +#define EXIT_REASON_CODESIGNING_INFO 0x1004 + +struct exit_reason_snapshot { + uint32_t ers_namespace; + uint64_t ers_code; + /* end of version 1 of exit_reason_snapshot. 
sizeof v1 was 12 */ + uint64_t ers_flags; +} __attribute__((packed)); + +#define EXIT_REASON_CODESIG_PATH_MAX 1024 + +struct codesigning_exit_reason_info { + uint64_t ceri_virt_addr; + uint64_t ceri_file_offset; + char ceri_pathname[EXIT_REASON_CODESIG_PATH_MAX]; + char ceri_filename[EXIT_REASON_CODESIG_PATH_MAX]; + uint64_t ceri_codesig_modtime_secs; + uint64_t ceri_codesig_modtime_nsecs; + uint64_t ceri_page_modtime_secs; + uint64_t ceri_page_modtime_nsecs; + uint8_t ceri_path_truncated; + uint8_t ceri_object_codesigned; + uint8_t ceri_page_codesig_validated; + uint8_t ceri_page_codesig_tainted; + uint8_t ceri_page_codesig_nx; + uint8_t ceri_page_wpmapped; + uint8_t ceri_page_slid; + uint8_t ceri_page_dirty; + uint32_t ceri_page_shadow_depth; +} __attribute__((packed)); + +#define EXIT_REASON_USER_DESC_MAX_LEN 1024 +#define EXIT_REASON_PAYLOAD_MAX_LEN 2048 +/**************** safe iterators *********************/ + +typedef struct kcdata_iter { + kcdata_item_t item; + void *end; +} kcdata_iter_t; + + +static inline +kcdata_iter_t kcdata_iter(void *buffer, unsigned long size) { + kcdata_iter_t iter; + iter.item = (kcdata_item_t) buffer; + iter.end = (void*) (((uintptr_t)buffer) + size); + return iter; +} + +static inline +kcdata_iter_t kcdata_iter_unsafe(void *buffer) __attribute__((deprecated)); + +static inline +kcdata_iter_t kcdata_iter_unsafe(void *buffer) { + kcdata_iter_t iter; + iter.item = (kcdata_item_t) buffer; + iter.end = (void*) (uintptr_t) ~0; + return iter; +} + +static const kcdata_iter_t kcdata_invalid_iter = { .item = 0, .end = 0 }; + +static inline +int kcdata_iter_valid(kcdata_iter_t iter) { + return + ( (uintptr_t)iter.item + sizeof(struct kcdata_item) <= (uintptr_t)iter.end ) && + ( (uintptr_t)iter.item + sizeof(struct kcdata_item) + iter.item->size <= (uintptr_t)iter.end); +} + + +static inline +kcdata_iter_t kcdata_iter_next(kcdata_iter_t iter) { + iter.item = (kcdata_item_t) (((uintptr_t)iter.item) + sizeof(struct kcdata_item) + 
(iter.item->size)); + return iter; +} + +static inline uint32_t +kcdata_iter_type(kcdata_iter_t iter) +{ + if ((iter.item->type & ~0xfu) == KCDATA_TYPE_ARRAY_PAD0) + return KCDATA_TYPE_ARRAY; + else + return iter.item->type; +} + +static inline uint32_t +kcdata_calc_padding(uint32_t size) +{ + /* calculate number of bits to add to size to get something divisible by 16 */ + return (-size) & 0xf; +} + +static inline uint32_t +kcdata_flags_get_padding(uint64_t flags) +{ + return flags & KCDATA_FLAGS_STRUCT_PADDING_MASK; +} + +/* see comment above about has_padding */ +static inline int +kcdata_iter_is_legacy_item(kcdata_iter_t iter, uint32_t legacy_size) +{ + uint32_t legacy_size_padded = legacy_size + kcdata_calc_padding(legacy_size); + return (iter.item->size == legacy_size_padded && + (iter.item->flags & (KCDATA_FLAGS_STRUCT_PADDING_MASK | KCDATA_FLAGS_STRUCT_HAS_PADDING)) == 0); + +} + +static inline uint32_t +kcdata_iter_size(kcdata_iter_t iter) +{ + uint32_t legacy_size = 0; + + switch (kcdata_iter_type(iter)) { + case KCDATA_TYPE_ARRAY: + case KCDATA_TYPE_CONTAINER_BEGIN: + return iter.item->size; + case STACKSHOT_KCTYPE_THREAD_SNAPSHOT: { + legacy_size = sizeof(struct thread_snapshot_v2); + if (kcdata_iter_is_legacy_item(iter, legacy_size)) { + return legacy_size; + } + + goto not_legacy; + } + case STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO: { + legacy_size = sizeof(struct dyld_uuid_info_64); + if (kcdata_iter_is_legacy_item(iter, legacy_size)) { + return legacy_size; + } + + goto not_legacy; + } +not_legacy: + default: + if (iter.item->size < kcdata_flags_get_padding(iter.item->flags)) + return 0; + else + return iter.item->size - kcdata_flags_get_padding(iter.item->flags); + } +} + +static inline uint64_t +kcdata_iter_flags(kcdata_iter_t iter) +{ + return iter.item->flags; +} + +static inline +void * kcdata_iter_payload(kcdata_iter_t iter) { + return &iter.item->data; +} + + +static inline +uint32_t kcdata_iter_array_elem_type(kcdata_iter_t iter) { + return 
(iter.item->flags >> 32) & UINT32_MAX; +} + +static inline +uint32_t kcdata_iter_array_elem_count(kcdata_iter_t iter) { + return (iter.item->flags) & UINT32_MAX; +} + +/* KCDATA_TYPE_ARRAY is ambiguous about the size of the array elements. Size is + * calculated as total_size / elements_count, but total size got padded out to a + * 16 byte alignment. New kernels will generate KCDATA_TYPE_ARRAY_PAD* instead + * to explicitly tell us how much padding was used. Here we have a fixed, never + * to be altered list of the sizes of array elements that were used before I + * discovered this issue. If you find a KCDATA_TYPE_ARRAY that is not one of + * these types, treat it as invalid data. */ + +static inline +uint32_t +kcdata_iter_array_size_switch(kcdata_iter_t iter) { + switch(kcdata_iter_array_elem_type(iter)) { + case KCDATA_TYPE_LIBRARY_LOADINFO: + return sizeof(struct dyld_uuid_info_32); + case KCDATA_TYPE_LIBRARY_LOADINFO64: + return sizeof(struct dyld_uuid_info_64); + case STACKSHOT_KCTYPE_KERN_STACKFRAME: + case STACKSHOT_KCTYPE_USER_STACKFRAME: + return sizeof(struct stack_snapshot_frame32); + case STACKSHOT_KCTYPE_KERN_STACKFRAME64: + case STACKSHOT_KCTYPE_USER_STACKFRAME64: + return sizeof(struct stack_snapshot_frame64); + case STACKSHOT_KCTYPE_DONATING_PIDS: + return sizeof(int32_t); + case STACKSHOT_KCTYPE_THREAD_DELTA_SNAPSHOT: + return sizeof(struct thread_delta_snapshot_v2); + // This one is only here to make some unit tests work. It should be OK to + // remove. 
+ case TASK_CRASHINFO_CRASHED_THREADID: + return sizeof(uint64_t); + default: + return 0; + } +} + +static inline +int kcdata_iter_array_valid(kcdata_iter_t iter) { + if (!kcdata_iter_valid(iter)) + return 0; + if (kcdata_iter_type(iter) != KCDATA_TYPE_ARRAY) + return 0; + if (kcdata_iter_array_elem_count(iter) == 0) + return iter.item->size == 0; + if (iter.item->type == KCDATA_TYPE_ARRAY) { + uint32_t elem_size = kcdata_iter_array_size_switch(iter); + if (elem_size == 0) + return 0; + /* sizes get aligned to the nearest 16. */ + return + kcdata_iter_array_elem_count(iter) <= iter.item->size / elem_size && + iter.item->size % kcdata_iter_array_elem_count(iter) < 16; + } else { + return + (iter.item->type & 0xf) <= iter.item->size && + kcdata_iter_array_elem_count(iter) <= iter.item->size - (iter.item->type & 0xf) && + (iter.item->size - (iter.item->type & 0xf)) % kcdata_iter_array_elem_count(iter) == 0; + } +} + + +static inline +uint32_t kcdata_iter_array_elem_size(kcdata_iter_t iter) { + if (iter.item->type == KCDATA_TYPE_ARRAY) + return kcdata_iter_array_size_switch(iter); + if (kcdata_iter_array_elem_count(iter) == 0) + return 0; + return (iter.item->size - (iter.item->type & 0xf)) / kcdata_iter_array_elem_count(iter); +} + +static inline +int kcdata_iter_container_valid(kcdata_iter_t iter) { + return + kcdata_iter_valid(iter) && + kcdata_iter_type(iter) == KCDATA_TYPE_CONTAINER_BEGIN && + iter.item->size >= sizeof(uint32_t); +} + +static inline +uint32_t kcdata_iter_container_type(kcdata_iter_t iter) { + return * (uint32_t *) kcdata_iter_payload(iter); +} + +static inline +uint64_t kcdata_iter_container_id(kcdata_iter_t iter) { + return iter.item->flags; +} + + +#define KCDATA_ITER_FOREACH(iter) for(; kcdata_iter_valid(iter) && iter.item->type != KCDATA_TYPE_BUFFER_END; iter = kcdata_iter_next(iter)) +#define KCDATA_ITER_FOREACH_FAILED(iter) (!kcdata_iter_valid(iter) || (iter).item->type != KCDATA_TYPE_BUFFER_END) + +static inline +kcdata_iter_t 
+kcdata_iter_find_type(kcdata_iter_t iter, uint32_t type) +{ + KCDATA_ITER_FOREACH(iter) + { + if (kcdata_iter_type(iter) == type) + return iter; + } + return kcdata_invalid_iter; +} + +static inline +int kcdata_iter_data_with_desc_valid(kcdata_iter_t iter, uint32_t minsize) { + return + kcdata_iter_valid(iter) && + kcdata_iter_size(iter) >= KCDATA_DESC_MAXLEN + minsize && + ((char*)kcdata_iter_payload(iter))[KCDATA_DESC_MAXLEN-1] == 0; +} + +static inline +char *kcdata_iter_string(kcdata_iter_t iter, uint32_t offset) { + if (offset > kcdata_iter_size(iter)) { + return NULL; + } + uint32_t maxlen = kcdata_iter_size(iter) - offset; + char *s = ((char*)kcdata_iter_payload(iter)) + offset; + if (strnlen(s, maxlen) < maxlen) { + return s; + } else { + return NULL; + } +} + +static inline void kcdata_iter_get_data_with_desc(kcdata_iter_t iter, char **desc_ptr, void **data_ptr, uint32_t *size_ptr) { + if (desc_ptr) + *desc_ptr = (char *)kcdata_iter_payload(iter); + if (data_ptr) + *data_ptr = (void *)((uintptr_t)kcdata_iter_payload(iter) + KCDATA_DESC_MAXLEN); + if (size_ptr) + *size_ptr = kcdata_iter_size(iter) - KCDATA_DESC_MAXLEN; +} + +#endif diff --git a/libkdd/kcdata/kcdata_core.m b/libkdd/kcdata/kcdata_core.m deleted file mode 100644 index 90e61942c..000000000 --- a/libkdd/kcdata/kcdata_core.m +++ /dev/null @@ -1,207 +0,0 @@ -/* - * Copyright (c) 2015 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include -#import -#import "kdd.h" -#import "KCDBasicTypeDescription.h" -#import "KCDStructTypeDescription.h" - -#define MAX_KCDATATYPE_BUFFER_SIZE 2048 -extern struct kcdata_type_definition *kcdata_get_typedescription(unsigned type_id, uint8_t *buffer, uint32_t buffer_size); - - -/*! - * @function getTypeFromTypeDef - * - * @abstract - * Build a KCDataType from a type definition. - * - * @param typeDef - * A pointer to kcdata_type_definition_t that specifies the type fields and has subtype definitions - * in the memory immediately following the type_definition. - * - * @return KCDataType * type object which can be used to parse data into dictionaries. - * This may return nil if it finds the data to be invalid. - * - * @discussion - * This routine tries to decode the typeDef structure and create either a basic type (KCDBasicTypeDescription) - * or a struct type. 
- */ -static KCDataType * getTypeFromTypeDef(struct kcdata_type_definition * typeDef); - -static KCDataType * -getTypeFromTypeDef(struct kcdata_type_definition * typeDef) -{ - if (typeDef == NULL) { - return nil; - } - NSString * kct_name = [NSString stringWithFormat:@"%s", typeDef->kct_name]; - if (typeDef->kct_num_elements == 1) { - KCDBasicTypeDescription * retval = [[KCDBasicTypeDescription alloc] initWithKCTypeDesc:&typeDef->kct_elements[0]]; - return retval; - } else { - KCDStructTypeDescription * retval = - [[KCDStructTypeDescription alloc] initWithType:typeDef->kct_type_identifier withName:kct_name]; - /* need to do work here to get the array of elements setup here */ - KCDBasicTypeDescription * curField = nil; - for (unsigned int i = 0; i < typeDef->kct_num_elements; i++) { - curField = [[KCDBasicTypeDescription alloc] initWithKCTypeDesc:&typeDef->kct_elements[i]]; - [retval addFieldBasicType:curField]; - } - return retval; - } - return nil; -} - -KCDataType * -getKCDataTypeForID(uint32_t typeID) -{ - static dispatch_once_t onceToken; - static NSMutableDictionary * knownTypes = nil; - dispatch_once(&onceToken, ^{ - if (!knownTypes) { - knownTypes = [[NSMutableDictionary alloc] init]; - } - }); - NSNumber * type = [NSNumber numberWithUnsignedInt:typeID]; - if (!knownTypes[type]) { - /* code to query system for type information */ - uint8_t buffer[MAX_KCDATATYPE_BUFFER_SIZE]; - struct kcdata_type_definition * sys_def = kcdata_get_typedescription(typeID, buffer, MAX_KCDATATYPE_BUFFER_SIZE); - if (sys_def == NULL) { - knownTypes[type] = [[KCDBasicTypeDescription alloc] createDefaultForType:typeID]; - } else { - knownTypes[type] = getTypeFromTypeDef(sys_def); - } - } - assert(knownTypes[type] != nil); - return knownTypes[type]; -} - -NSString * -KCDataTypeNameForID(uint32_t typeID) -{ - NSString * retval = [NSString stringWithFormat:@"%u", typeID]; - KCDataType * t = getKCDataTypeForID(typeID); - - if (![[t name] containsString:@"Type_"]) { - retval = [t name]; 
- } - return retval; -} - -NSMutableDictionary * -parseKCDataArray(void * dataBuffer) -{ - uint32_t typeID = KCDATA_ITEM_ARRAY_GET_EL_TYPE(dataBuffer); - uint32_t count = KCDATA_ITEM_ARRAY_GET_EL_COUNT(dataBuffer); - uint32_t size = KCDATA_ITEM_ARRAY_GET_EL_SIZE(dataBuffer); - uint8_t * buffer = (uint8_t *)KCDATA_ITEM_DATA_PTR(dataBuffer); - KCDataType * datatype = getKCDataTypeForID(typeID); - NSMutableDictionary * retval = [[NSMutableDictionary alloc] initWithCapacity:1]; - NSMutableArray * arr = [[NSMutableArray alloc] initWithCapacity:count]; - retval[[datatype name]] = arr; - NSMutableDictionary * tmpdict = NULL; - for (uint32_t i = 0; i < count; i++) { - tmpdict = [datatype parseData:(void *)&buffer[i * size] ofLength:size]; - [arr addObject:tmpdict]; - } - return retval; -} - -NSMutableDictionary * -parseKCDataContainer(void * dataBuffer, uint32_t * bytesParsed) -{ - if (bytesParsed == NULL) - return nil; - assert(KCDATA_ITEM_TYPE(dataBuffer) == KCDATA_TYPE_CONTAINER_BEGIN); - uint64_t containerID = KCDATA_CONTAINER_ID(dataBuffer); - - /* setup collection object for sub containers */ - NSMutableDictionary * sub_containers = [[NSMutableDictionary alloc] init]; - NSMutableDictionary * retval = [[NSMutableDictionary alloc] init]; - NSMutableDictionary * container = [[NSMutableDictionary alloc] init]; - struct kcdata_item * buffer = (struct kcdata_item *)KCDATA_ITEM_NEXT_HEADER(dataBuffer); - KCDataType * tmptype; - uint32_t _t; - void * _d; - NSMutableDictionary * tmpdict; - retval[KCDataTypeNameForID(kcdata_get_container_type(dataBuffer))] = container; - - KCDATA_ITEM_FOREACH(buffer) - { - _t = KCDATA_ITEM_TYPE(buffer); - _d = KCDATA_ITEM_DATA_PTR(buffer); - if (_t == KCDATA_TYPE_CONTAINER_END) { - if (KCDATA_CONTAINER_ID(buffer) == containerID) { - break; - } - continue; - } - - if (_t == KCDATA_TYPE_ARRAY) { - tmpdict = parseKCDataArray(buffer); - [container addEntriesFromDictionary:tmpdict]; - continue; - } - - if (_t == KCDATA_TYPE_CONTAINER_BEGIN) { - 
uint32_t container_size = 0; - tmpdict = parseKCDataContainer(buffer, &container_size); - NSString * subcontainerID = [NSString stringWithFormat:@"%llu", KCDATA_CONTAINER_ID(buffer)]; - NSString * k_desc = nil; - assert([tmpdict count] == 1); - for (NSString * k in [tmpdict keyEnumerator]) { - k_desc = k; - if ([k intValue] != 0) - k_desc = KCDataTypeNameForID([k intValue]); - - if ([sub_containers objectForKey:k_desc] == nil) { - sub_containers[k_desc] = [[NSMutableDictionary alloc] init]; - } - sub_containers[k_desc][subcontainerID] = tmpdict[k]; - } - buffer = (struct kcdata_item *)((uintptr_t)buffer + container_size); - if (KCDATA_ITEM_TYPE(buffer) == KCDATA_TYPE_BUFFER_END) { - break; - } - continue; - } - - tmptype = getKCDataTypeForID(_t); - tmpdict = [tmptype parseData:_d ofLength:KCDATA_ITEM_SIZE(buffer)]; - if ([tmpdict count] == 1) - [container addEntriesFromDictionary:tmpdict]; - else - container[[tmptype name]] = tmpdict; - } - [container addEntriesFromDictionary:sub_containers]; - *bytesParsed = (uint32_t)((uintptr_t)buffer - (uintptr_t)dataBuffer); - return retval; -} diff --git a/libkdd/kcdata/kcdtypes.c b/libkdd/kcdata/kcdtypes.c deleted file mode 100644 index 82c97f74b..000000000 --- a/libkdd/kcdata/kcdtypes.c +++ /dev/null @@ -1,552 +0,0 @@ -/* - * Copyright (c) 2015 Apple Inc. All rights reserved. - * - * @APPLE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this - * file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_LICENSE_HEADER_END@ - */ - - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/*! - * @function kcdata_get_typedescription - * - * @abstract - * Search the known type definitions for type with id type_id. - * - * @param type_id - * A unsinged int type specified by the KCDATA. - * - * @param buffer - * pointer to data area where type definition will be saved. - * - * @param buffer_size - * size of the buffer provided. - * - * @return struct kcdata_type_definition * - * pointer to a malloc'ed buffer holding the type definition and each subtype defintion for its fields. - * It may return NULL if no type with id == type_id is found. - * Note: The caller is responsible to free() the memory when its no longer used. - * - * @discussion - * This function queries the known type definitions table. If found the defintion data is returned - * else NULL is returned. It is advised to cache the return value from this function since the data - * is always going to be the same for same type_id. The definition setup requires memory on heap. - * The caller should make sure to free() the data once its done with using it. 
- * - */ -struct kcdata_type_definition *kcdata_get_typedescription(unsigned type_id, uint8_t *buffer, uint32_t buffer_size); - - - -/* forward declarations for helper routines */ -static uint32_t get_kctype_subtype_size(kctype_subtype_t type); -static void setup_subtype_description(kcdata_subtype_descriptor_t desc, kctype_subtype_t type, uint32_t offset, char *name); -static void setup_subtype_array_description(kcdata_subtype_descriptor_t desc, kctype_subtype_t type, uint32_t offset, uint32_t count, char *name); -static void setup_type_definition(struct kcdata_type_definition *d, uint32_t type, uint32_t num_elems, char *name); - -struct kcdata_type_definition *kcdata_get_typedescription(unsigned type_id, uint8_t *buffer, uint32_t buffer_size) -{ - int i = 0; -#define _STR_VALUE(x) #x -#define _SUBTYPE(t, s, f) setup_subtype_description(&subtypes[i++], (t), offsetof(s,f), _STR_VALUE(f)) -#define _SUBTYPE_ARRAY(t, s, f, c) setup_subtype_array_description(&subtypes[i++], (t), offsetof(s,f), (c), _STR_VALUE(f)) -#define _STRINGTYPE(f) setup_subtype_array_description(&subtypes[i++], KC_ST_CHAR, 0, UINT16_MAX, f) - - - - if (buffer_size < sizeof(struct kcdata_type_definition) || buffer == NULL) - return NULL; - - struct kcdata_type_definition *retval = (struct kcdata_type_definition *)&buffer[0]; - kcdata_subtype_descriptor_t subtypes = (kcdata_subtype_descriptor_t)&buffer[sizeof(struct kcdata_type_definition)]; - switch (type_id) { - - case KCDATA_TYPE_STRING_DESC: { - i = 0; - setup_subtype_array_description(&subtypes[i++], KC_ST_CHAR, 0, KCDATA_DESC_MAXLEN, "desc"); - setup_subtype_array_description(&subtypes[i++], KC_ST_CHAR, KCDATA_DESC_MAXLEN, UINT16_MAX, "data"); - setup_type_definition(retval, type_id, i, "string_desc"); - break; - } - - case KCDATA_TYPE_UINT32_DESC: { - i = 0; - setup_subtype_array_description(&subtypes[i++], KC_ST_CHAR, 0, KCDATA_DESC_MAXLEN, "desc"); - setup_subtype_description(&subtypes[i++], KC_ST_UINT32, KCDATA_DESC_MAXLEN, "data"); - 
setup_type_definition(retval, type_id, i, "uint32_desc"); - break; - } - - case KCDATA_TYPE_UINT64_DESC: { - i = 0; - setup_subtype_array_description(&subtypes[i++], KC_ST_CHAR, 0, KCDATA_DESC_MAXLEN, "desc"); - setup_subtype_description(&subtypes[i++], KC_ST_UINT64, KCDATA_DESC_MAXLEN, "data"); - setup_type_definition(retval, type_id, i, "uint64_desc"); - break; - } - - case KCDATA_TYPE_INT32_DESC: { - i = 0; - setup_subtype_array_description(&subtypes[i++], KC_ST_CHAR, 0, KCDATA_DESC_MAXLEN, "desc"); - setup_subtype_description(&subtypes[i++], KC_ST_INT32, KCDATA_DESC_MAXLEN, "data"); - setup_type_definition(retval, type_id, i, "int32_desc"); - break; - } - - case KCDATA_TYPE_INT64_DESC: { - i = 0; - setup_subtype_array_description(&subtypes[i++], KC_ST_CHAR, 0, KCDATA_DESC_MAXLEN, "desc"); - setup_subtype_description(&subtypes[i++], KC_ST_INT64, KCDATA_DESC_MAXLEN, "data"); - setup_type_definition(retval, type_id, i, "int64_desc"); - break; - } - - case KCDATA_TYPE_CONTAINER_BEGIN :{ - i = 0; - setup_subtype_description(&subtypes[i++], KC_ST_UINT32, 0, "kcContainerType"); - setup_type_definition(retval, type_id, i, "container_begin"); - break; - } - - case KCDATA_TYPE_LIBRARY_LOADINFO: { - i = 0; - _SUBTYPE(KC_ST_UINT32, struct dyld_uuid_info_32, imageLoadAddress); - _SUBTYPE_ARRAY(KC_ST_UINT8, struct dyld_uuid_info_32, imageUUID, 16); - setup_type_definition(retval, type_id, i, "dyld_load_info"); - break; - - } - - case KCDATA_TYPE_LIBRARY_LOADINFO64: /* fall through */ - case STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO: { - i = 0; - _SUBTYPE(KC_ST_UINT64, struct dyld_uuid_info_64, imageLoadAddress); - _SUBTYPE_ARRAY(KC_ST_UINT8, struct dyld_uuid_info_64, imageUUID, 16); - setup_type_definition(retval, type_id, i, "dyld_load_info"); - break; - } - - case KCDATA_TYPE_TIMEBASE: { - i = 0; - _SUBTYPE(KC_ST_UINT32, struct mach_timebase_info, numer); - _SUBTYPE(KC_ST_UINT32, struct mach_timebase_info, denom); - setup_type_definition(retval, type_id, i, 
"mach_timebase_info"); - } - - case KCDATA_TYPE_MACH_ABSOLUTE_TIME: - setup_type_definition(retval, type_id, 1, "mach_absolute_time"); - setup_subtype_description(&subtypes[0], KC_ST_UINT64, 0, "mach_absolute_time"); - break; - - case KCDATA_TYPE_TIMEVAL: { - i = 0; - _SUBTYPE(KC_ST_INT64, struct timeval64, tv_sec); - _SUBTYPE(KC_ST_INT64, struct timeval64, tv_usec); - setup_type_definition(retval, type_id, i, "timeval"); - } - - case KCDATA_TYPE_USECS_SINCE_EPOCH: - setup_type_definition(retval, type_id, 1, "usecs_since_epoch"); - setup_subtype_description(&subtypes[0], KC_ST_UINT64, 0, "usecs_since_epoch"); - break; - - - /* stackshot specific types */ - case STACKSHOT_KCTYPE_IOSTATS: { - i = 0; - _SUBTYPE(KC_ST_UINT64, struct io_stats_snapshot, ss_disk_reads_count); - _SUBTYPE(KC_ST_UINT64, struct io_stats_snapshot, ss_disk_reads_size); - _SUBTYPE(KC_ST_UINT64, struct io_stats_snapshot, ss_disk_writes_count); - _SUBTYPE(KC_ST_UINT64, struct io_stats_snapshot, ss_disk_writes_size); - _SUBTYPE_ARRAY(KC_ST_UINT64, struct io_stats_snapshot, ss_io_priority_count, STACKSHOT_IO_NUM_PRIORITIES); - _SUBTYPE_ARRAY(KC_ST_UINT64, struct io_stats_snapshot, ss_io_priority_size, STACKSHOT_IO_NUM_PRIORITIES); - _SUBTYPE(KC_ST_UINT64, struct io_stats_snapshot, ss_paging_count); - _SUBTYPE(KC_ST_UINT64, struct io_stats_snapshot, ss_paging_size); - _SUBTYPE(KC_ST_UINT64, struct io_stats_snapshot, ss_non_paging_count); - _SUBTYPE(KC_ST_UINT64, struct io_stats_snapshot, ss_non_paging_size); - _SUBTYPE(KC_ST_UINT64, struct io_stats_snapshot, ss_data_count); - _SUBTYPE(KC_ST_UINT64, struct io_stats_snapshot, ss_data_size); - _SUBTYPE(KC_ST_UINT64, struct io_stats_snapshot, ss_metadata_count); - _SUBTYPE(KC_ST_UINT64, struct io_stats_snapshot, ss_metadata_size); - - setup_type_definition(retval, type_id, i, "io_statistics"); - break; - } - - case STACKSHOT_KCTYPE_GLOBAL_MEM_STATS : - { i = 0; - _SUBTYPE(KC_ST_UINT32, struct mem_and_io_snapshot, snapshot_magic); - _SUBTYPE(KC_ST_UINT32, 
struct mem_and_io_snapshot, free_pages); - _SUBTYPE(KC_ST_UINT32, struct mem_and_io_snapshot, active_pages); - _SUBTYPE(KC_ST_UINT32, struct mem_and_io_snapshot, inactive_pages); - _SUBTYPE(KC_ST_UINT32, struct mem_and_io_snapshot, purgeable_pages); - _SUBTYPE(KC_ST_UINT32, struct mem_and_io_snapshot, wired_pages); - _SUBTYPE(KC_ST_UINT32, struct mem_and_io_snapshot, speculative_pages); - _SUBTYPE(KC_ST_UINT32, struct mem_and_io_snapshot, throttled_pages); - _SUBTYPE(KC_ST_UINT32, struct mem_and_io_snapshot, filebacked_pages); - _SUBTYPE(KC_ST_UINT32, struct mem_and_io_snapshot, compressions); - _SUBTYPE(KC_ST_UINT32, struct mem_and_io_snapshot, decompressions); - _SUBTYPE(KC_ST_UINT32, struct mem_and_io_snapshot, compressor_size); - _SUBTYPE(KC_ST_UINT32, struct mem_and_io_snapshot, busy_buffer_count); - _SUBTYPE(KC_ST_UINT32, struct mem_and_io_snapshot, pages_wanted); - _SUBTYPE(KC_ST_UINT32, struct mem_and_io_snapshot, pages_reclaimed); - _SUBTYPE(KC_ST_UINT8, struct mem_and_io_snapshot, pages_wanted_reclaimed_valid); - setup_type_definition(retval, type_id, i, "mem_and_io_snapshot"); - break; - } - - case STACKSHOT_KCCONTAINER_TASK: - setup_type_definition(retval, type_id, 0, "task_snapshots"); - break; - - case STACKSHOT_KCCONTAINER_THREAD: - setup_type_definition(retval, type_id, 0, "thread_snapshots"); - break; - - - case STACKSHOT_KCTYPE_TASK_SNAPSHOT: { - i = 0; - _SUBTYPE(KC_ST_UINT64, struct task_snapshot_v2, ts_unique_pid); - _SUBTYPE(KC_ST_UINT64, struct task_snapshot_v2, ts_ss_flags); - _SUBTYPE(KC_ST_UINT64, struct task_snapshot_v2, ts_user_time_in_terminated_threads); - _SUBTYPE(KC_ST_UINT64, struct task_snapshot_v2, ts_system_time_in_terminated_threads); - _SUBTYPE(KC_ST_UINT64, struct task_snapshot_v2, ts_p_start_sec); - _SUBTYPE(KC_ST_UINT64, struct task_snapshot_v2, ts_task_size); - _SUBTYPE(KC_ST_UINT64, struct task_snapshot_v2, ts_max_resident_size); - _SUBTYPE(KC_ST_UINT32, struct task_snapshot_v2, ts_suspend_count); - _SUBTYPE(KC_ST_UINT32, 
struct task_snapshot_v2, ts_faults); - _SUBTYPE(KC_ST_UINT32, struct task_snapshot_v2, ts_pageins); - _SUBTYPE(KC_ST_UINT32, struct task_snapshot_v2, ts_cow_faults); - _SUBTYPE(KC_ST_UINT32, struct task_snapshot_v2, ts_was_throttled); - _SUBTYPE(KC_ST_UINT32, struct task_snapshot_v2, ts_did_throttle); - _SUBTYPE(KC_ST_UINT32, struct task_snapshot_v2, ts_latency_qos); - _SUBTYPE(KC_ST_INT32, struct task_snapshot_v2, ts_pid); - _SUBTYPE_ARRAY(KC_ST_CHAR, struct task_snapshot_v2, ts_p_comm, 32); - setup_type_definition(retval, type_id, i, "task_snapshot"); - break; - } - - case STACKSHOT_KCTYPE_THREAD_SNAPSHOT: { - i = 0; - - _SUBTYPE(KC_ST_UINT64, struct thread_snapshot_v2, ths_thread_id); - _SUBTYPE(KC_ST_UINT64, struct thread_snapshot_v2, ths_wait_event); - _SUBTYPE(KC_ST_UINT64, struct thread_snapshot_v2, ths_continuation); - _SUBTYPE(KC_ST_UINT64, struct thread_snapshot_v2, ths_total_syscalls); - _SUBTYPE(KC_ST_UINT64, struct thread_snapshot_v2, ths_voucher_identifier); - _SUBTYPE(KC_ST_UINT64, struct thread_snapshot_v2, ths_dqserialnum); - _SUBTYPE(KC_ST_UINT64, struct thread_snapshot_v2, ths_user_time); - _SUBTYPE(KC_ST_UINT64, struct thread_snapshot_v2, ths_sys_time); - _SUBTYPE(KC_ST_UINT64, struct thread_snapshot_v2, ths_ss_flags); - _SUBTYPE(KC_ST_UINT64, struct thread_snapshot_v2, ths_last_run_time); - _SUBTYPE(KC_ST_UINT32, struct thread_snapshot_v2, ths_state); - _SUBTYPE(KC_ST_UINT32, struct thread_snapshot_v2, ths_sched_flags); - _SUBTYPE(KC_ST_INT16, struct thread_snapshot_v2, ths_base_priority); - _SUBTYPE(KC_ST_INT16, struct thread_snapshot_v2, ths_sched_priority); - _SUBTYPE(KC_ST_UINT8, struct thread_snapshot_v2, ths_eqos); - _SUBTYPE(KC_ST_UINT8, struct thread_snapshot_v2, ths_rqos); - _SUBTYPE(KC_ST_UINT8, struct thread_snapshot_v2, ths_rqos_override); - _SUBTYPE(KC_ST_UINT8, struct thread_snapshot_v2, ths_io_tier); - - setup_type_definition(retval, type_id, i, "thread_snapshot"); - break; - } - - - case STASKSHOT_KCTYPE_DONATING_PIDS: - 
setup_type_definition(retval, type_id, 1, "donating_pids"); - setup_subtype_description(&subtypes[0], KC_ST_INT32, 0, "pid"); - break; - - case STACKSHOT_KCTYPE_THREAD_NAME:{ - i = 0; - setup_subtype_array_description(&subtypes[i++], KC_ST_CHAR, 0, 64, "pth_name"); - setup_type_definition(retval, type_id, i, "pth_name"); - break; - } - - case STACKSHOT_KCTYPE_KERN_STACKFRAME : - setup_type_definition(retval, type_id, 2, "kernel_stack_frames"); - setup_subtype_description(&subtypes[0], KC_ST_UINT32, 0, "lr"); - setup_subtype_description(&subtypes[1], KC_ST_UINT32, sizeof(uint32_t), "sp"); - break; - - case STACKSHOT_KCTYPE_KERN_STACKFRAME64 : - setup_type_definition(retval, type_id, 2, "kernel_stack_frames"); - setup_subtype_description(&subtypes[0], KC_ST_UINT64, 0, "lr"); - setup_subtype_description(&subtypes[1], KC_ST_UINT64, sizeof(uint64_t), "sp"); - break; - - case STACKSHOT_KCTYPE_USER_STACKFRAME : - setup_type_definition(retval, type_id, 2, "user_stack_frames"); - setup_subtype_description(&subtypes[0], KC_ST_UINT32, 0, "lr"); - setup_subtype_description(&subtypes[1], KC_ST_UINT32, sizeof(uint32_t), "sp"); - break; - - case STACKSHOT_KCTYPE_USER_STACKFRAME64 : - setup_type_definition(retval, type_id, 2, "user_stack_frames"); - setup_subtype_description(&subtypes[0], KC_ST_UINT64, 0, "lr"); - setup_subtype_description(&subtypes[1], KC_ST_UINT64, sizeof(uint64_t), "sp"); - break; - - case STACKSHOT_KCTYPE_BOOTARGS: { - i = 0; - _STRINGTYPE("boot_args"); - setup_type_definition(retval, type_id, i, "boot_args"); - break; - } - - case STACKSHOT_KCTYPE_OSVERSION: { - i = 0; - _STRINGTYPE("osversion"); - setup_type_definition(retval, type_id, i, "osversion"); - break; - } - - case STACKSHOT_KCTYPE_KERN_PAGE_SIZE: { - i = 0; - setup_subtype_description(&subtypes[i++], KC_ST_UINT32, 0, "kernel_page_size"); - setup_type_definition(retval, type_id, i, "kernel_page_size"); - break; - } - - case STACKSHOT_KCTYPE_JETSAM_LEVEL: { - i = 0; - 
setup_subtype_description(&subtypes[i++], KC_ST_UINT32, 0, "jetsam_level"); - setup_type_definition(retval, type_id, i, "jetsam_level"); - break; - } - - /* crashinfo types */ - case TASK_CRASHINFO_BSDINFOWITHUNIQID: - { i = 0; - _SUBTYPE_ARRAY(KC_ST_UINT8, struct proc_uniqidentifierinfo, p_uuid, 16); - _SUBTYPE(KC_ST_UINT64, struct proc_uniqidentifierinfo, p_uniqueid); - _SUBTYPE(KC_ST_UINT64, struct proc_uniqidentifierinfo, p_puniqueid); - /* Ignore the p_reserve fields */ - setup_type_definition(retval, type_id, i, "proc_uniqidentifierinfo"); - break; - } - - case TASK_CRASHINFO_PID:{ - setup_subtype_description(&subtypes[0], KC_ST_INT32, 0, "pid"); - setup_type_definition(retval, type_id, 1, "pid"); - break; - } - - case TASK_CRASHINFO_PPID:{ - setup_subtype_description(&subtypes[0], KC_ST_INT32, 0, "ppid"); - setup_type_definition(retval, type_id, 1, "ppid"); - break; - } - - case TASK_CRASHINFO_RUSAGE_INFO: { - i = 0; - _SUBTYPE_ARRAY(KC_ST_UINT8, struct rusage_info_v3, ri_uuid, 16); - _SUBTYPE(KC_ST_UINT64, struct rusage_info_v3, ri_user_time); - _SUBTYPE(KC_ST_UINT64, struct rusage_info_v3, ri_system_time); - _SUBTYPE(KC_ST_UINT64, struct rusage_info_v3, ri_pkg_idle_wkups); - _SUBTYPE(KC_ST_UINT64, struct rusage_info_v3, ri_interrupt_wkups); - _SUBTYPE(KC_ST_UINT64, struct rusage_info_v3, ri_pageins); - _SUBTYPE(KC_ST_UINT64, struct rusage_info_v3, ri_wired_size); - _SUBTYPE(KC_ST_UINT64, struct rusage_info_v3, ri_resident_size); - _SUBTYPE(KC_ST_UINT64, struct rusage_info_v3, ri_phys_footprint); - _SUBTYPE(KC_ST_UINT64, struct rusage_info_v3, ri_proc_start_abstime); - _SUBTYPE(KC_ST_UINT64, struct rusage_info_v3, ri_proc_exit_abstime); - _SUBTYPE(KC_ST_UINT64, struct rusage_info_v3, ri_child_user_time); - _SUBTYPE(KC_ST_UINT64, struct rusage_info_v3, ri_child_system_time); - _SUBTYPE(KC_ST_UINT64, struct rusage_info_v3, ri_child_pkg_idle_wkups); - _SUBTYPE(KC_ST_UINT64, struct rusage_info_v3, ri_child_interrupt_wkups); - _SUBTYPE(KC_ST_UINT64, struct 
rusage_info_v3, ri_child_pageins); - _SUBTYPE(KC_ST_UINT64, struct rusage_info_v3, ri_child_elapsed_abstime); - _SUBTYPE(KC_ST_UINT64, struct rusage_info_v3, ri_diskio_bytesread); - _SUBTYPE(KC_ST_UINT64, struct rusage_info_v3, ri_diskio_byteswritten); - _SUBTYPE(KC_ST_UINT64, struct rusage_info_v3, ri_cpu_time_qos_default); - _SUBTYPE(KC_ST_UINT64, struct rusage_info_v3, ri_cpu_time_qos_maintenance); - _SUBTYPE(KC_ST_UINT64, struct rusage_info_v3, ri_cpu_time_qos_background); - _SUBTYPE(KC_ST_UINT64, struct rusage_info_v3, ri_cpu_time_qos_utility); - _SUBTYPE(KC_ST_UINT64, struct rusage_info_v3, ri_cpu_time_qos_legacy); - _SUBTYPE(KC_ST_UINT64, struct rusage_info_v3, ri_cpu_time_qos_user_initiated); - _SUBTYPE(KC_ST_UINT64, struct rusage_info_v3, ri_cpu_time_qos_user_interactive); - _SUBTYPE(KC_ST_UINT64, struct rusage_info_v3, ri_billed_system_time); - _SUBTYPE(KC_ST_UINT64, struct rusage_info_v3, ri_serviced_system_time); - setup_type_definition(retval, type_id, i, "rusage_info"); - } - - case TASK_CRASHINFO_PROC_NAME: { - i = 0; - _STRINGTYPE("p_comm"); - setup_type_definition(retval, type_id, i, "p_comm"); - } - - case TASK_CRASHINFO_USERSTACK: { - i = 0; - setup_subtype_description(&subtypes[0], KC_ST_UINT64, 0, "userstack_ptr"); - setup_type_definition(retval, type_id, 1, "userstack_ptr"); - break; - } - - case TASK_CRASHINFO_ARGSLEN: { - i = 0; - setup_subtype_description(&subtypes[0], KC_ST_INT32, 0, "p_argslen"); - setup_type_definition(retval, type_id, 1, "p_argslen"); - break; - } - - case TASK_CRASHINFO_PROC_PATH: { - i = 0; - _STRINGTYPE("p_path"); - setup_type_definition(retval, type_id, i, "p_path"); - } - - case TASK_CRASHINFO_PROC_CSFLAGS:{ - setup_subtype_description(&subtypes[0], KC_ST_UINT32, 0, "p_csflags"); - setup_type_definition(retval, type_id, 1, "p_csflags"); - break; - } - - case TASK_CRASHINFO_PROC_STATUS: { - setup_subtype_description(&subtypes[0], KC_ST_UINT8, 0, "p_status"); - setup_type_definition(retval, type_id, 1, "p_status"); - 
break; - } - - case TASK_CRASHINFO_UID:{ - setup_subtype_description(&subtypes[0], KC_ST_INT32, 0, "uid"); - setup_type_definition(retval, type_id, 1, "uid"); - break; - } - - case TASK_CRASHINFO_GID:{ - setup_subtype_description(&subtypes[0], KC_ST_INT32, 0, "gid"); - setup_type_definition(retval, type_id, 1, "gid"); - break; - } - - case TASK_CRASHINFO_PROC_ARGC:{ - setup_subtype_description(&subtypes[0], KC_ST_INT32, 0, "argc"); - setup_type_definition(retval, type_id, 1, "argc"); - break; - } - - case TASK_CRASHINFO_PROC_FLAGS:{ - setup_subtype_description(&subtypes[0], KC_ST_UINT32, 0, "p_flags"); - setup_type_definition(retval, type_id, 1, "p_flags"); - break; - } - - case TASK_CRASHINFO_CPUTYPE:{ - setup_subtype_description(&subtypes[0], KC_ST_INT32, 0, "cputype"); - setup_type_definition(retval, type_id, 1, "cputype"); - break; - } - - case TASK_CRASHINFO_RESPONSIBLE_PID:{ - setup_subtype_description(&subtypes[0], KC_ST_INT32, 0, "responsible_pid"); - setup_type_definition(retval, type_id, 1, "responsible_pid"); - break; - } - - case TASK_CRASHINFO_DIRTY_FLAGS:{ - setup_subtype_description(&subtypes[0], KC_ST_UINT32, 0, "dirty_flags"); - setup_type_definition(retval, type_id, 1, "dirty_flags"); - break; - } - - case TASK_CRASHINFO_CRASHED_THREADID: { - setup_subtype_description(&subtypes[0], KC_ST_UINT64, 0, "crashed_threadid"); - setup_type_definition(retval, type_id, 1, "crashed_threadid"); - break; - } - - default: - retval = NULL; - break; - } - - assert(retval == NULL || (buffer_size > sizeof(struct kcdata_type_definition) + (retval->kct_num_elements * sizeof(struct kcdata_subtype_descriptor)))); - return retval; -} - - -static void setup_type_definition(struct kcdata_type_definition *d, uint32_t type, uint32_t num_elems, char *name) -{ - d->kct_type_identifier = type; - d->kct_num_elements = num_elems; - memcpy(d->kct_name, name, sizeof(d->kct_name)); - d->kct_name[sizeof(d->kct_name) - 1] = '\0'; -} - -static uint32_t 
get_kctype_subtype_size(kctype_subtype_t type){ - switch (type) { - case KC_ST_CHAR: - case KC_ST_INT8: - case KC_ST_UINT8: - return sizeof(uint8_t); - break; - case KC_ST_INT16: - case KC_ST_UINT16: - return sizeof(uint16_t); - break; - case KC_ST_INT32: - case KC_ST_UINT32: - return sizeof(uint32_t); - break; - case KC_ST_INT64: - case KC_ST_UINT64: - return sizeof(uint64_t); - break; - - default: - assert(0); - break; - } - return 0; -} - -static void setup_subtype_array_description(kcdata_subtype_descriptor_t desc, kctype_subtype_t type, uint32_t offset, uint32_t count, char *name) -{ - desc->kcs_flags = KCS_SUBTYPE_FLAGS_ARRAY; - desc->kcs_elem_type = type; - desc->kcs_elem_offset = offset; - desc->kcs_elem_size = KCS_SUBTYPE_PACK_SIZE(count, get_kctype_subtype_size(type)); - memcpy(desc->kcs_name, name, sizeof(desc->kcs_name)); - desc->kcs_name[sizeof(desc->kcs_name) - 1] = '\0'; -} - -static void setup_subtype_description(kcdata_subtype_descriptor_t desc, kctype_subtype_t type, uint32_t offset, char *name) -{ - desc->kcs_flags = KCS_SUBTYPE_FLAGS_NONE; - desc->kcs_elem_type = type; - desc->kcs_elem_offset = offset; - desc->kcs_elem_size = get_kctype_subtype_size(type); - memcpy(desc->kcs_name, name, sizeof(desc->kcs_name)); - desc->kcs_name[sizeof(desc->kcs_name) - 1] = '\0'; -} - diff --git a/libkdd/kcdata_core.m b/libkdd/kcdata_core.m new file mode 100644 index 000000000..a293f0d94 --- /dev/null +++ b/libkdd/kcdata_core.m @@ -0,0 +1,468 @@ +/* + * Copyright (c) 2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#import +#import "kdd.h" +#import "KCDBasicTypeDescription.h" +#import "KCDStructTypeDescription.h" +#import "KCDEmbeddedBufferDescription.h" + +#define LIB_KCD_ERR_DOMAIN @"KCDataError" + +#define GEN_ERROR(code, msg) gen_error(__LINE__, code, @msg) +#define GEN_ERRORF(code, msg, ...) 
gen_error(__LINE__, code, [NSString stringWithFormat:@msg, __VA_ARGS__]) + +#define MAX_KCDATATYPE_BUFFER_SIZE 2048 +extern struct kcdata_type_definition * kcdata_get_typedescription(unsigned type_id, uint8_t * buffer, uint32_t buffer_size); + +BOOL setKCDataTypeForID(uint32_t newTypeID, KCDataType *newTypeObj); + +static NSError * +gen_error(int line, NSInteger code, NSString *message) +{ + return [NSError errorWithDomain:LIB_KCD_ERR_DOMAIN + code:code + userInfo:@{ @"line": @(line), @"message": message }]; +} + +static BOOL +mergedict(NSMutableDictionary * container, NSDictionary * object, NSError ** error) +{ + for (id key in object) { + id existing = container[key]; + id new = object[key]; + if (existing) { + if ([existing isKindOfClass:[NSMutableArray class]] && [new isKindOfClass:[ NSArray class ]]) { + [existing addObjectsFromArray:new]; + } else { + if (error) { + *error = GEN_ERRORF(KERN_INVALID_OBJECT, "repeated key: %@", key); + } + return FALSE; + } + } else { + [container setValue:new forKey:key]; + } + } + return TRUE; +} + +/*! + * @function getTypeFromTypeDef + * + * @abstract + * Build a KCDataType from a type definition. + * + * @param typeDef + * A pointer to kcdata_type_definition_t that specifies the type fields and has subtype definitions + * in the memory immediately following the type_definition. + * + * @return KCDataType * type object which can be used to parse data into dictionaries. + * This may return nil if it finds the data to be invalid. + * + * @discussion + * This routine tries to decode the typeDef structure and create either a basic type (KCDBasicTypeDescription) + * or a struct type. 
+ */ +static KCDataType * getTypeFromTypeDef(struct kcdata_type_definition * typeDef); + +static KCDataType * +getTypeFromTypeDef(struct kcdata_type_definition * typeDef) +{ + if (typeDef == NULL) { + return nil; + } + NSString * kct_name = [NSString stringWithFormat:@"%s", typeDef->kct_name]; + if (typeDef->kct_num_elements == 1 && !(typeDef->kct_elements[0].kcs_flags & KCS_SUBTYPE_FLAGS_STRUCT)) { + KCDBasicTypeDescription * retval = [[KCDBasicTypeDescription alloc] initWithKCTypeDesc:&typeDef->kct_elements[0]]; + return retval; + } else { + KCDStructTypeDescription * retval = + [[KCDStructTypeDescription alloc] initWithType:typeDef->kct_type_identifier withName:kct_name]; + /* need to do work here to get the array of elements setup here */ + KCDBasicTypeDescription * curField = nil; + for (unsigned int i = 0; i < typeDef->kct_num_elements; i++) { + curField = [[KCDBasicTypeDescription alloc] initWithKCTypeDesc:&typeDef->kct_elements[i]]; + [retval addFieldBasicType:curField]; + if (typeDef->kct_elements[i].kcs_flags & KCS_SUBTYPE_FLAGS_MERGE) { + [retval setFlagsRequestedMerge]; + } + } + return retval; + } + return nil; +} + +static dispatch_once_t onceToken; +static NSMutableDictionary * knownTypes = nil; + +KCDataType * +getKCDataTypeForID(uint32_t typeID) +{ + dispatch_once(&onceToken, ^{ + if (!knownTypes) { + knownTypes = [[NSMutableDictionary alloc] init]; + } + }); + + NSNumber * type = [NSNumber numberWithUnsignedInt:typeID]; + if (!knownTypes[type]) { + if (typeID == KCDATA_TYPE_NESTED_KCDATA) { + knownTypes[type] = [[KCDEmbeddedBufferDescription alloc] init]; + return knownTypes[type]; + } + /* code to query system for type information */ + uint8_t buffer[MAX_KCDATATYPE_BUFFER_SIZE]; + struct kcdata_type_definition * sys_def = kcdata_get_typedescription(typeID, buffer, MAX_KCDATATYPE_BUFFER_SIZE); + if (sys_def == NULL) { + knownTypes[type] = [[KCDBasicTypeDescription alloc] createDefaultForType:typeID]; + } else { + knownTypes[type] = 
getTypeFromTypeDef(sys_def); + } + } + assert(knownTypes[type] != nil); + return knownTypes[type]; +} + +BOOL +setKCDataTypeForID(uint32_t newTypeID, KCDataType *newTypeObj) { + if (newTypeObj == NULL || newTypeID == 0) { + return FALSE; + } + + dispatch_once(&onceToken, ^{ + if (!knownTypes) { + knownTypes = [[NSMutableDictionary alloc] init]; + } + }); + + NSNumber * type = [NSNumber numberWithUnsignedInt:newTypeID]; + + if (!knownTypes[type]) { + knownTypes[type] = newTypeObj; + return TRUE; + } + + return FALSE; +} + + +NSString * +KCDataTypeNameForID(uint32_t typeID) +{ + NSString * retval = [NSString stringWithFormat:@"%u", typeID]; + KCDataType * t = getKCDataTypeForID(typeID); + + if (![[t name] containsString:@"Type_"]) { + retval = [t name]; + } + return retval; +} + +NSMutableDictionary * +parseKCDataArray(kcdata_iter_t iter, NSError **error) +{ + if (!kcdata_iter_array_valid(iter)) { + if (error) + *error = GEN_ERROR(KERN_INVALID_OBJECT, "invalid array"); + return NULL; + } + + uint32_t typeID = kcdata_iter_array_elem_type(iter); + uint32_t count = kcdata_iter_array_elem_count(iter); + uint32_t size = kcdata_iter_array_elem_size(iter); + uint8_t * buffer = (uint8_t *)kcdata_iter_payload(iter); + KCDataType * datatype = getKCDataTypeForID(typeID); + NSMutableDictionary * retval = [[NSMutableDictionary alloc] initWithCapacity:1]; + NSMutableArray * arr = [[NSMutableArray alloc] initWithCapacity:count]; + retval[[datatype name]] = arr; + NSDictionary * tmpdict = NULL; + for (uint32_t i = 0; i < count; i++) { + tmpdict = [datatype parseData:(void *)&buffer[i * size] ofLength:size]; + if (!tmpdict) { + if (error) + *error = GEN_ERRORF(KERN_INVALID_OBJECT, "failed to parse array element. 
type=0x%x", (int)typeID); + return NULL; + } + if ([datatype shouldMergeData]) { + assert([tmpdict count] == 1); + [arr addObject: [tmpdict allValues][0]]; + } else { + [arr addObject:tmpdict]; + } + } + return retval; +} + +NSMutableDictionary * +parseKCDataContainer(kcdata_iter_t *iter_p, NSError **error) +{ + kcdata_iter_t iter = *iter_p; + + if (!kcdata_iter_container_valid(iter)) { + if (error) + *error = GEN_ERROR(KERN_INVALID_OBJECT, "invalid container"); + return NULL; + } + uint64_t containerID = kcdata_iter_container_id(iter); + + /* setup collection object for sub containers */ + NSMutableDictionary * sub_containers = [[NSMutableDictionary alloc] init]; + NSMutableDictionary * retval = [[NSMutableDictionary alloc] init]; + NSMutableDictionary * container = [[NSMutableDictionary alloc] init]; + + KCDataType * tmptype; + uint32_t _t; + void * _d; + BOOL ok; + NSDictionary * tmpdict; + BOOL found_end = FALSE; + retval[KCDataTypeNameForID(kcdata_iter_container_type(iter))] = container; + + iter = kcdata_iter_next(iter); + + KCDATA_ITER_FOREACH(iter) + { + _t = kcdata_iter_type(iter); + _d = kcdata_iter_payload(iter); + if (_t == KCDATA_TYPE_CONTAINER_END) { + if (kcdata_iter_container_id(iter) != containerID) { + if (error) + *error = GEN_ERROR(KERN_INVALID_ARGUMENT, "container marker mismatch"); + return NULL; + } + found_end = TRUE; + break; + } + + if (_t == KCDATA_TYPE_ARRAY) { + tmpdict = parseKCDataArray(iter, error); + if (!tmpdict) + return NULL; + + ok = mergedict(container, tmpdict, error); + if (!ok) + return NULL; + + continue; + } + + if (_t == KCDATA_TYPE_CONTAINER_BEGIN) { + NSString * subcontainerID = [NSString stringWithFormat:@"%llu", kcdata_iter_container_id(iter)]; + tmpdict = parseKCDataContainer(&iter, error); + if (!tmpdict) + return NULL; + assert([tmpdict count] == 1); + for (NSString * k in [tmpdict keyEnumerator]) { + if (sub_containers[k] == nil) { + sub_containers[k] = [[NSMutableDictionary alloc] init]; + } + if 
(sub_containers[k][subcontainerID] != nil) { + if (error) + *error = GEN_ERRORF(KERN_INVALID_OBJECT, "repeated container id: %@", subcontainerID); + return NULL; + } + sub_containers[k][subcontainerID] = tmpdict[k]; + } + continue; + } + + tmptype = getKCDataTypeForID(_t); + tmpdict = [tmptype parseData:_d ofLength:kcdata_iter_size(iter)]; + if (!tmpdict) { + if (error) + *error = GEN_ERRORF(KERN_INVALID_OBJECT, "failed to parse. type=0x%x", (int)_t); + return NULL; + } + if (![tmptype shouldMergeData]) { + tmpdict = @{[tmptype name] : tmpdict}; + } + ok = mergedict(container, tmpdict, error); + if (!ok) + return NULL; + } + + if (!found_end) { + if (error) + *error = GEN_ERROR(KERN_INVALID_ARGUMENT, "missing container end"); + return NULL; + } + + ok = mergedict(container, sub_containers, error); + if (!ok) + return NULL; + + *iter_p = iter; + return retval; +} + +NSDictionary * +parseKCDataBuffer(void * dataBuffer, uint32_t size, NSError ** error) +{ + if (dataBuffer == NULL) { + if (error) + *error = GEN_ERROR(KERN_INVALID_ARGUMENT, "buffer is null"); + return NULL; + } + + uint32_t _type = (size >= sizeof(uint32_t)) ? 
*(uint32_t*)dataBuffer : 0; + uint32_t _size = 0; + uint64_t _flags = 0; + void * _datap = NULL; + KCDataType * kcd_type = NULL; + NSString * rootKey = NULL; + uint32_t rootType = _type; + BOOL ok; + + /* validate begin tag and get root key */ + switch (_type) { + case KCDATA_BUFFER_BEGIN_CRASHINFO: + rootKey = @"kcdata_crashinfo"; + break; + case KCDATA_BUFFER_BEGIN_STACKSHOT: + rootKey = @"kcdata_stackshot"; + break; + case KCDATA_BUFFER_BEGIN_DELTA_STACKSHOT: + rootKey = @"kcdata_delta_stackshot"; + break; + case KCDATA_BUFFER_BEGIN_OS_REASON: + rootKey = @"kcdata_reason"; + break; + case KCDATA_BUFFER_BEGIN_XNUPOST_CONFIG: + rootKey = @"xnupost_testconfig"; + break; + default: { + if (error) + *error = GEN_ERROR(KERN_INVALID_VALUE, "invalid magic number"); + return NULL; + break; + } + } + assert(rootKey != NULL); + + kcdata_iter_t iter = kcdata_iter(dataBuffer, size); + + if (!kcdata_iter_valid(iter)) { + if (error) { + *error = GEN_ERROR(KERN_INVALID_OBJECT, "initial item is invalid"); + } + return NULL; + } + + NSMutableDictionary * rootObject = [NSMutableDictionary dictionary]; + NSDictionary * retval = [NSMutableDictionary dictionaryWithObject:rootObject forKey:rootKey]; + + /* iterate over each kcdata item */ + KCDATA_ITER_FOREACH(iter) + { + _type = kcdata_iter_type(iter); + _size = kcdata_iter_size(iter); + _flags = kcdata_iter_flags(iter); + _datap = kcdata_iter_payload(iter); + + if (_type == rootType) + continue; + + if (_type == KCDATA_TYPE_ARRAY) { + NSDictionary * dict = parseKCDataArray(iter, error); + if (!dict) + return nil; + + ok = mergedict(rootObject, dict, error); + if (!ok) + return NULL; + + continue; + } + + if (_type == KCDATA_TYPE_CONTAINER_BEGIN) { + NSString * containerID = [NSString stringWithFormat:@"%llu", kcdata_iter_container_id(iter)]; + NSMutableDictionary *container = parseKCDataContainer(&iter, error); + if (!container) + return nil; + assert([container count] == 1); + for (NSString * k in [container keyEnumerator]) { + if 
(rootObject[k] == nil) { + rootObject[k] = [[NSMutableDictionary alloc] init]; + } + if (rootObject[k][containerID] != nil) { + if (error) + *error = GEN_ERRORF(KERN_INVALID_OBJECT, "repeated container id: %@", containerID); + return NULL; + } + rootObject[k][containerID] = container[k]; + } + continue; + } + + if (_type == KCDATA_TYPE_TYPEDEFINTION) { + KCDataType *new_type = getTypeFromTypeDef((struct kcdata_type_definition *)_datap); + if (new_type != NULL) { + setKCDataTypeForID([new_type typeID], new_type); + kcd_type = getKCDataTypeForID(_type); + NSDictionary * tmpdict = [kcd_type parseData:_datap ofLength:_size]; + if (!tmpdict) { + if (error) + *error = GEN_ERRORF(KERN_INVALID_OBJECT, "failed to parse. type=0x%x", (int)_type); + return NULL; + } + NSString *k = [NSString stringWithFormat:@"typedef[%@]", [new_type name]]; + rootObject[k] = tmpdict; + }else { + if (error) + *error = GEN_ERRORF(KERN_INVALID_OBJECT, "Failed to parse type definition for type %u", _type); + return NULL; + } + continue; + } + + kcd_type = getKCDataTypeForID(_type); + NSDictionary * tmpdict = [kcd_type parseData:_datap ofLength:_size]; + if (!tmpdict) { + if (error) + *error = GEN_ERRORF(KERN_INVALID_OBJECT, "failed to parse. type=0x%x", (int)_type); + return NULL; + } + if (![kcd_type shouldMergeData]) { + tmpdict = @{[kcd_type name] : tmpdict}; + } + ok = mergedict(rootObject, tmpdict, error); + if (!ok) + return NULL; + } + + if (KCDATA_ITER_FOREACH_FAILED(iter)) { + retval = nil; + if (error) { + *error = GEN_ERROR(KERN_INVALID_OBJECT, "invalid item or missing buffer end marker"); + } + } + + return retval; +} diff --git a/libkdd/kcdtypes.c b/libkdd/kcdtypes.c new file mode 100644 index 000000000..fed1f52e4 --- /dev/null +++ b/libkdd/kcdtypes.c @@ -0,0 +1,801 @@ +/* + * Copyright (c) 2015 Apple Inc. All rights reserved. 
+ * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/*! + * @function kcdata_get_typedescription + * + * @abstract + * Search the known type definitions for type with id type_id. + * + * @param type_id + * A unsinged int type specified by the KCDATA. + * + * @param buffer + * pointer to data area where type definition will be saved. + * + * @param buffer_size + * size of the buffer provided. + * + * @return struct kcdata_type_definition * + * pointer to a malloc'ed buffer holding the type definition and each subtype defintion for its fields. + * It may return NULL if no type with id == type_id is found. + * Note: The caller is responsible to free() the memory when its no longer used. + * + * @discussion + * This function queries the known type definitions table. If found the defintion data is returned + * else NULL is returned. It is advised to cache the return value from this function since the data + * is always going to be the same for same type_id. The definition setup requires memory on heap. 
+ * The caller should make sure to free() the data once its done with using it. + * + */ +struct kcdata_type_definition * kcdata_get_typedescription(unsigned type_id, uint8_t * buffer, uint32_t buffer_size); + +/* forward declarations for helper routines */ +static uint32_t get_kctype_subtype_size(kctype_subtype_t type); +static void setup_subtype_description(kcdata_subtype_descriptor_t desc, kctype_subtype_t type, uint32_t offset, char * name); +static void setup_subtype_array_description( + kcdata_subtype_descriptor_t desc, kctype_subtype_t type, uint32_t offset, uint32_t count, char * name); +static void setup_type_definition(struct kcdata_type_definition * d, uint32_t type, uint32_t num_elems, char * name); + +struct kcdata_type_definition * +kcdata_get_typedescription(unsigned type_id, uint8_t * buffer, uint32_t buffer_size) +{ + unsigned int i = 0; +#define _STR_VALUE(x) #x +#define _SUBTYPE(t, s, f) setup_subtype_description(&subtypes[i++], (t), offsetof(s, f), _STR_VALUE(f)) +#define _SUBTYPE_ARRAY(t, s, f, c) setup_subtype_array_description(&subtypes[i++], (t), offsetof(s, f), (c), _STR_VALUE(f)) +#define _STRINGTYPE(f) setup_subtype_array_description(&subtypes[i++], KC_ST_CHAR, 0, UINT16_MAX, f) + + if (buffer_size < sizeof(struct kcdata_type_definition) || buffer == NULL) + return NULL; + + struct kcdata_type_definition * retval = (struct kcdata_type_definition *)&buffer[0]; + kcdata_subtype_descriptor_t subtypes = (kcdata_subtype_descriptor_t)&buffer[sizeof(struct kcdata_type_definition)]; + switch (type_id) { + case KCDATA_TYPE_STRING_DESC: { + i = 0; + setup_subtype_array_description(&subtypes[i++], KC_ST_CHAR, 0, KCDATA_DESC_MAXLEN, "desc"); + setup_subtype_array_description(&subtypes[i++], KC_ST_CHAR, KCDATA_DESC_MAXLEN, UINT16_MAX, "data"); + setup_type_definition(retval, type_id, i, "string_desc"); + break; + } + + case KCDATA_TYPE_UINT32_DESC: { + i = 0; + setup_subtype_array_description(&subtypes[i++], KC_ST_CHAR, 0, KCDATA_DESC_MAXLEN, "desc"); 
+ setup_subtype_description(&subtypes[i++], KC_ST_UINT32, KCDATA_DESC_MAXLEN, "data"); + setup_type_definition(retval, type_id, i, "uint32_desc"); + break; + } + + case KCDATA_TYPE_UINT64_DESC: { + i = 0; + setup_subtype_array_description(&subtypes[i++], KC_ST_CHAR, 0, KCDATA_DESC_MAXLEN, "desc"); + setup_subtype_description(&subtypes[i++], KC_ST_UINT64, KCDATA_DESC_MAXLEN, "data"); + setup_type_definition(retval, type_id, i, "uint64_desc"); + break; + } + + case KCDATA_TYPE_INT32_DESC: { + i = 0; + setup_subtype_array_description(&subtypes[i++], KC_ST_CHAR, 0, KCDATA_DESC_MAXLEN, "desc"); + setup_subtype_description(&subtypes[i++], KC_ST_INT32, KCDATA_DESC_MAXLEN, "data"); + setup_type_definition(retval, type_id, i, "int32_desc"); + break; + } + + case KCDATA_TYPE_INT64_DESC: { + i = 0; + setup_subtype_array_description(&subtypes[i++], KC_ST_CHAR, 0, KCDATA_DESC_MAXLEN, "desc"); + setup_subtype_description(&subtypes[i++], KC_ST_INT64, KCDATA_DESC_MAXLEN, "data"); + setup_type_definition(retval, type_id, i, "int64_desc"); + break; + } + + case KCDATA_TYPE_TYPEDEFINTION: { + i = 0; + setup_subtype_description(&subtypes[i++], KC_ST_UINT32, offsetof(struct kcdata_type_definition, kct_type_identifier), "typeID"); + setup_subtype_description(&subtypes[i++], KC_ST_UINT32, offsetof(struct kcdata_type_definition, kct_num_elements), "numOfFields"); + setup_subtype_array_description(&subtypes[i++], KC_ST_CHAR, offsetof(struct kcdata_type_definition, kct_name), KCDATA_DESC_MAXLEN, "name"); + // Note "fields" is an array of run time defined length. So we populate fields at parsing time. 
+ setup_type_definition(retval, type_id, i, "typedef"); + break; + } + + case KCDATA_TYPE_CONTAINER_BEGIN: { + i = 0; + setup_subtype_description(&subtypes[i++], KC_ST_UINT32, 0, "kcContainerType"); + setup_type_definition(retval, type_id, i, "container_begin"); + break; + } + + case KCDATA_TYPE_LIBRARY_LOADINFO: { + i = 0; + _SUBTYPE(KC_ST_UINT32, struct user32_dyld_uuid_info, imageLoadAddress); + _SUBTYPE_ARRAY(KC_ST_UINT8, struct user32_dyld_uuid_info, imageUUID, 16); + setup_type_definition(retval, type_id, i, "dyld_load_info"); + break; + } + + case KCDATA_TYPE_LIBRARY_LOADINFO64: { + i = 0; + _SUBTYPE(KC_ST_UINT64, struct user64_dyld_uuid_info, imageLoadAddress); + _SUBTYPE_ARRAY(KC_ST_UINT8, struct user64_dyld_uuid_info, imageUUID, 16); + setup_type_definition(retval, type_id, i, "dyld_load_info"); + break; + } + + case STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO: { + i = 0; + _SUBTYPE(KC_ST_UINT64, struct dyld_uuid_info_64_v2, imageLoadAddress); + _SUBTYPE_ARRAY(KC_ST_UINT8, struct dyld_uuid_info_64_v2, imageUUID, 16); + _SUBTYPE(KC_ST_UINT64, struct dyld_uuid_info_64_v2, imageSlidBaseAddress); + setup_type_definition(retval, type_id, i, "shared_cache_dyld_load_info"); + break; + } + + case STACKSHOT_KCTYPE_KERNELCACHE_LOADINFO: { + i = 0; + _SUBTYPE(KC_ST_UINT64, struct dyld_uuid_info_64, imageLoadAddress); + _SUBTYPE_ARRAY(KC_ST_UINT8, struct dyld_uuid_info_64, imageUUID, 16); + setup_type_definition(retval, type_id, i, "kernelcache_load_info"); + break; + } + + case KCDATA_TYPE_TIMEBASE: { + i = 0; + _SUBTYPE(KC_ST_UINT32, struct mach_timebase_info, numer); + _SUBTYPE(KC_ST_UINT32, struct mach_timebase_info, denom); + setup_type_definition(retval, type_id, i, "mach_timebase_info"); + break; + } + + case KCDATA_TYPE_MACH_ABSOLUTE_TIME: + setup_type_definition(retval, type_id, 1, "mach_absolute_time"); + setup_subtype_description(&subtypes[0], KC_ST_UINT64, 0, "mach_absolute_time"); + break; + + case KCDATA_TYPE_TIMEVAL: { + i = 0; + _SUBTYPE(KC_ST_INT64, struct 
timeval64, tv_sec); + _SUBTYPE(KC_ST_INT64, struct timeval64, tv_usec); + setup_type_definition(retval, type_id, i, "timeval"); + break; + } + + case KCDATA_TYPE_USECS_SINCE_EPOCH: + setup_type_definition(retval, type_id, 1, "usecs_since_epoch"); + setup_subtype_description(&subtypes[0], KC_ST_UINT64, 0, "usecs_since_epoch"); + break; + + case KCDATA_TYPE_PID: + setup_subtype_description(&subtypes[0], KC_ST_INT32, 0, "pid"); + setup_type_definition(retval, type_id, 1, "pid"); + break; + + case KCDATA_TYPE_PROCNAME: + i = 0; + setup_subtype_array_description(&subtypes[i++], KC_ST_CHAR, 0, 64, "proc_name"); + setup_type_definition(retval, type_id, i, "proc_name"); + break; + + /* stackshot specific types */ + case STACKSHOT_KCTYPE_IOSTATS: { + i = 0; + _SUBTYPE(KC_ST_UINT64, struct io_stats_snapshot, ss_disk_reads_count); + _SUBTYPE(KC_ST_UINT64, struct io_stats_snapshot, ss_disk_reads_size); + _SUBTYPE(KC_ST_UINT64, struct io_stats_snapshot, ss_disk_writes_count); + _SUBTYPE(KC_ST_UINT64, struct io_stats_snapshot, ss_disk_writes_size); + _SUBTYPE_ARRAY(KC_ST_UINT64, struct io_stats_snapshot, ss_io_priority_count, STACKSHOT_IO_NUM_PRIORITIES); + _SUBTYPE_ARRAY(KC_ST_UINT64, struct io_stats_snapshot, ss_io_priority_size, STACKSHOT_IO_NUM_PRIORITIES); + _SUBTYPE(KC_ST_UINT64, struct io_stats_snapshot, ss_paging_count); + _SUBTYPE(KC_ST_UINT64, struct io_stats_snapshot, ss_paging_size); + _SUBTYPE(KC_ST_UINT64, struct io_stats_snapshot, ss_non_paging_count); + _SUBTYPE(KC_ST_UINT64, struct io_stats_snapshot, ss_non_paging_size); + _SUBTYPE(KC_ST_UINT64, struct io_stats_snapshot, ss_data_count); + _SUBTYPE(KC_ST_UINT64, struct io_stats_snapshot, ss_data_size); + _SUBTYPE(KC_ST_UINT64, struct io_stats_snapshot, ss_metadata_count); + _SUBTYPE(KC_ST_UINT64, struct io_stats_snapshot, ss_metadata_size); + + setup_type_definition(retval, type_id, i, "io_statistics"); + break; + } + + case STACKSHOT_KCTYPE_GLOBAL_MEM_STATS: { + i = 0; + _SUBTYPE(KC_ST_UINT32, struct 
mem_and_io_snapshot, snapshot_magic); + _SUBTYPE(KC_ST_UINT32, struct mem_and_io_snapshot, free_pages); + _SUBTYPE(KC_ST_UINT32, struct mem_and_io_snapshot, active_pages); + _SUBTYPE(KC_ST_UINT32, struct mem_and_io_snapshot, inactive_pages); + _SUBTYPE(KC_ST_UINT32, struct mem_and_io_snapshot, purgeable_pages); + _SUBTYPE(KC_ST_UINT32, struct mem_and_io_snapshot, wired_pages); + _SUBTYPE(KC_ST_UINT32, struct mem_and_io_snapshot, speculative_pages); + _SUBTYPE(KC_ST_UINT32, struct mem_and_io_snapshot, throttled_pages); + _SUBTYPE(KC_ST_UINT32, struct mem_and_io_snapshot, filebacked_pages); + _SUBTYPE(KC_ST_UINT32, struct mem_and_io_snapshot, compressions); + _SUBTYPE(KC_ST_UINT32, struct mem_and_io_snapshot, decompressions); + _SUBTYPE(KC_ST_UINT32, struct mem_and_io_snapshot, compressor_size); + _SUBTYPE(KC_ST_UINT32, struct mem_and_io_snapshot, busy_buffer_count); + _SUBTYPE(KC_ST_UINT32, struct mem_and_io_snapshot, pages_wanted); + _SUBTYPE(KC_ST_UINT32, struct mem_and_io_snapshot, pages_reclaimed); + _SUBTYPE(KC_ST_UINT8, struct mem_and_io_snapshot, pages_wanted_reclaimed_valid); + setup_type_definition(retval, type_id, i, "mem_and_io_snapshot"); + break; + } + + case STACKSHOT_KCCONTAINER_TASK: + setup_type_definition(retval, type_id, 0, "task_snapshots"); + break; + + case STACKSHOT_KCCONTAINER_THREAD: + setup_type_definition(retval, type_id, 0, "thread_snapshots"); + break; + + case STACKSHOT_KCTYPE_TASK_SNAPSHOT: { + i = 0; + _SUBTYPE(KC_ST_UINT64, struct task_snapshot_v2, ts_unique_pid); + _SUBTYPE(KC_ST_UINT64, struct task_snapshot_v2, ts_ss_flags); + _SUBTYPE(KC_ST_UINT64, struct task_snapshot_v2, ts_user_time_in_terminated_threads); + _SUBTYPE(KC_ST_UINT64, struct task_snapshot_v2, ts_system_time_in_terminated_threads); + _SUBTYPE(KC_ST_UINT64, struct task_snapshot_v2, ts_p_start_sec); + _SUBTYPE(KC_ST_UINT64, struct task_snapshot_v2, ts_task_size); + _SUBTYPE(KC_ST_UINT64, struct task_snapshot_v2, ts_max_resident_size); + _SUBTYPE(KC_ST_UINT32, struct 
task_snapshot_v2, ts_suspend_count); + _SUBTYPE(KC_ST_UINT32, struct task_snapshot_v2, ts_faults); + _SUBTYPE(KC_ST_UINT32, struct task_snapshot_v2, ts_pageins); + _SUBTYPE(KC_ST_UINT32, struct task_snapshot_v2, ts_cow_faults); + _SUBTYPE(KC_ST_UINT32, struct task_snapshot_v2, ts_was_throttled); + _SUBTYPE(KC_ST_UINT32, struct task_snapshot_v2, ts_did_throttle); + _SUBTYPE(KC_ST_UINT32, struct task_snapshot_v2, ts_latency_qos); + _SUBTYPE(KC_ST_INT32, struct task_snapshot_v2, ts_pid); + _SUBTYPE_ARRAY(KC_ST_CHAR, struct task_snapshot_v2, ts_p_comm, 32); + setup_type_definition(retval, type_id, i, "task_snapshot"); + break; + } + + case STACKSHOT_KCTYPE_TASK_DELTA_SNAPSHOT: { + i = 0; + _SUBTYPE(KC_ST_UINT64, struct task_delta_snapshot_v2, tds_unique_pid); + _SUBTYPE(KC_ST_UINT64, struct task_delta_snapshot_v2, tds_ss_flags); + _SUBTYPE(KC_ST_UINT64, struct task_delta_snapshot_v2, tds_user_time_in_terminated_threads); + _SUBTYPE(KC_ST_UINT64, struct task_delta_snapshot_v2, tds_system_time_in_terminated_threads); + _SUBTYPE(KC_ST_UINT64, struct task_delta_snapshot_v2, tds_task_size); + _SUBTYPE(KC_ST_UINT64, struct task_delta_snapshot_v2, tds_max_resident_size); + _SUBTYPE(KC_ST_UINT32, struct task_delta_snapshot_v2, tds_suspend_count); + _SUBTYPE(KC_ST_UINT32, struct task_delta_snapshot_v2, tds_faults); + _SUBTYPE(KC_ST_UINT32, struct task_delta_snapshot_v2, tds_pageins); + _SUBTYPE(KC_ST_UINT32, struct task_delta_snapshot_v2, tds_cow_faults); + _SUBTYPE(KC_ST_UINT32, struct task_delta_snapshot_v2, tds_was_throttled); + _SUBTYPE(KC_ST_UINT32, struct task_delta_snapshot_v2, tds_did_throttle); + _SUBTYPE(KC_ST_UINT32, struct task_delta_snapshot_v2, tds_latency_qos); + setup_type_definition(retval, type_id, i, "task_delta_snapshot"); + break; + } + + case STACKSHOT_KCTYPE_THREAD_SNAPSHOT: { + i = 0; + + _SUBTYPE(KC_ST_UINT64, struct thread_snapshot_v3, ths_thread_id); + _SUBTYPE(KC_ST_UINT64, struct thread_snapshot_v3, ths_wait_event); + _SUBTYPE(KC_ST_UINT64, struct 
thread_snapshot_v3, ths_continuation); + _SUBTYPE(KC_ST_UINT64, struct thread_snapshot_v3, ths_total_syscalls); + _SUBTYPE(KC_ST_UINT64, struct thread_snapshot_v3, ths_voucher_identifier); + _SUBTYPE(KC_ST_UINT64, struct thread_snapshot_v3, ths_dqserialnum); + _SUBTYPE(KC_ST_UINT64, struct thread_snapshot_v3, ths_user_time); + _SUBTYPE(KC_ST_UINT64, struct thread_snapshot_v3, ths_sys_time); + _SUBTYPE(KC_ST_UINT64, struct thread_snapshot_v3, ths_ss_flags); + _SUBTYPE(KC_ST_UINT64, struct thread_snapshot_v3, ths_last_run_time); + _SUBTYPE(KC_ST_UINT64, struct thread_snapshot_v3, ths_last_made_runnable_time); + _SUBTYPE(KC_ST_UINT32, struct thread_snapshot_v3, ths_state); + _SUBTYPE(KC_ST_UINT32, struct thread_snapshot_v3, ths_sched_flags); + _SUBTYPE(KC_ST_INT16, struct thread_snapshot_v3, ths_base_priority); + _SUBTYPE(KC_ST_INT16, struct thread_snapshot_v3, ths_sched_priority); + _SUBTYPE(KC_ST_UINT8, struct thread_snapshot_v3, ths_eqos); + _SUBTYPE(KC_ST_UINT8, struct thread_snapshot_v3, ths_rqos); + _SUBTYPE(KC_ST_UINT8, struct thread_snapshot_v3, ths_rqos_override); + _SUBTYPE(KC_ST_UINT8, struct thread_snapshot_v3, ths_io_tier); + _SUBTYPE(KC_ST_UINT64, struct thread_snapshot_v3, ths_thread_t); + + setup_type_definition(retval, type_id, i, "thread_snapshot"); + break; + } + + case STACKSHOT_KCTYPE_THREAD_DELTA_SNAPSHOT: { + i = 0; + + _SUBTYPE(KC_ST_UINT64, struct thread_delta_snapshot_v2, tds_thread_id); + _SUBTYPE(KC_ST_UINT64, struct thread_delta_snapshot_v2, tds_voucher_identifier); + _SUBTYPE(KC_ST_UINT64, struct thread_delta_snapshot_v2, tds_ss_flags); + _SUBTYPE(KC_ST_UINT64, struct thread_delta_snapshot_v2, tds_last_made_runnable_time); + _SUBTYPE(KC_ST_UINT32, struct thread_delta_snapshot_v2, tds_state); + _SUBTYPE(KC_ST_UINT32, struct thread_delta_snapshot_v2, tds_sched_flags); + _SUBTYPE(KC_ST_INT16, struct thread_delta_snapshot_v2, tds_base_priority); + _SUBTYPE(KC_ST_INT16, struct thread_delta_snapshot_v2, tds_sched_priority); + 
_SUBTYPE(KC_ST_UINT8, struct thread_delta_snapshot_v2, tds_eqos); + _SUBTYPE(KC_ST_UINT8, struct thread_delta_snapshot_v2, tds_rqos); + _SUBTYPE(KC_ST_UINT8, struct thread_delta_snapshot_v2, tds_rqos_override); + _SUBTYPE(KC_ST_UINT8, struct thread_delta_snapshot_v2, tds_io_tier); + + setup_type_definition(retval, type_id, i, "thread_delta_snapshot"); + + break; + } + + case STACKSHOT_KCTYPE_DONATING_PIDS: + setup_type_definition(retval, type_id, 1, "donating_pids"); + setup_subtype_description(&subtypes[0], KC_ST_INT32, 0, "donating_pids"); + break; + + case STACKSHOT_KCTYPE_THREAD_NAME: { + i = 0; + setup_subtype_array_description(&subtypes[i++], KC_ST_CHAR, 0, 64, "pth_name"); + setup_type_definition(retval, type_id, i, "pth_name"); + break; + } + + case STACKSHOT_KCTYPE_KERN_STACKFRAME: + setup_type_definition(retval, type_id, 2, "kernel_stack_frames"); + setup_subtype_description(&subtypes[0], KC_ST_UINT32, 0, "lr"); + setup_subtype_description(&subtypes[1], KC_ST_UINT32, sizeof(uint32_t), "sp"); + break; + + case STACKSHOT_KCTYPE_KERN_STACKFRAME64: + setup_type_definition(retval, type_id, 2, "kernel_stack_frames"); + setup_subtype_description(&subtypes[0], KC_ST_UINT64, 0, "lr"); + setup_subtype_description(&subtypes[1], KC_ST_UINT64, sizeof(uint64_t), "sp"); + break; + + case STACKSHOT_KCTYPE_USER_STACKFRAME: + setup_type_definition(retval, type_id, 2, "user_stack_frames"); + setup_subtype_description(&subtypes[0], KC_ST_UINT32, 0, "lr"); + setup_subtype_description(&subtypes[1], KC_ST_UINT32, sizeof(uint32_t), "sp"); + break; + + case STACKSHOT_KCTYPE_USER_STACKFRAME64: + setup_type_definition(retval, type_id, 2, "user_stack_frames"); + setup_subtype_description(&subtypes[0], KC_ST_UINT64, 0, "lr"); + setup_subtype_description(&subtypes[1], KC_ST_UINT64, sizeof(uint64_t), "sp"); + break; + + case STACKSHOT_KCTYPE_KERN_STACKLR: + setup_type_definition(retval, type_id, 1, "kernel_stack_frames"); + setup_subtype_description(&subtypes[0], KC_ST_UINT32, 0, 
"lr"); + subtypes[0].kcs_flags |= KCS_SUBTYPE_FLAGS_STRUCT; + break; + + case STACKSHOT_KCTYPE_KERN_STACKLR64: + setup_type_definition(retval, type_id, 1, "kernel_stack_frames"); + setup_subtype_description(&subtypes[0], KC_ST_UINT64, 0, "lr"); + subtypes[0].kcs_flags |= KCS_SUBTYPE_FLAGS_STRUCT; + break; + + case STACKSHOT_KCTYPE_USER_STACKLR: + setup_type_definition(retval, type_id, 1, "user_stack_frames"); + setup_subtype_description(&subtypes[0], KC_ST_UINT32, 0, "lr"); + subtypes[0].kcs_flags |= KCS_SUBTYPE_FLAGS_STRUCT; + break; + + case STACKSHOT_KCTYPE_USER_STACKLR64: + setup_type_definition(retval, type_id, 1, "user_stack_frames"); + setup_subtype_description(&subtypes[0], KC_ST_UINT64, 0, "lr"); + subtypes[0].kcs_flags |= KCS_SUBTYPE_FLAGS_STRUCT; + break; + + case STACKSHOT_KCTYPE_NONRUNNABLE_TIDS: + setup_type_definition(retval, type_id, 1, "nonrunnable_threads"); + setup_subtype_description(&subtypes[0], KC_ST_INT64, 0, "nonrunnable_threads"); + break; + + case STACKSHOT_KCTYPE_NONRUNNABLE_TASKS: + setup_type_definition(retval, type_id, 1, "nonrunnable_tasks"); + setup_subtype_description(&subtypes[0], KC_ST_INT64, 0, "nonrunnable_tasks"); + break; + + case STACKSHOT_KCTYPE_BOOTARGS: { + i = 0; + _STRINGTYPE("boot_args"); + setup_type_definition(retval, type_id, i, "boot_args"); + break; + } + + case STACKSHOT_KCTYPE_OSVERSION: { + i = 0; + _STRINGTYPE("osversion"); + setup_type_definition(retval, type_id, i, "osversion"); + break; + } + + case STACKSHOT_KCTYPE_KERN_PAGE_SIZE: { + i = 0; + setup_subtype_description(&subtypes[i++], KC_ST_UINT32, 0, "kernel_page_size"); + setup_type_definition(retval, type_id, i, "kernel_page_size"); + break; + } + + case STACKSHOT_KCTYPE_JETSAM_LEVEL: { + i = 0; + setup_subtype_description(&subtypes[i++], KC_ST_UINT32, 0, "jetsam_level"); + setup_type_definition(retval, type_id, i, "jetsam_level"); + break; + } + + case STACKSHOT_KCTYPE_DELTA_SINCE_TIMESTAMP: { + i = 0; + setup_subtype_description(&subtypes[i++], 
KC_ST_UINT64, 0, "stackshot_delta_since_timestamp"); + setup_type_definition(retval, type_id, i, "stackshot_delta_since_timestamp"); + break; + } + + /* crashinfo types */ + case TASK_CRASHINFO_BSDINFOWITHUNIQID: { + i = 0; + _SUBTYPE_ARRAY(KC_ST_UINT8, struct proc_uniqidentifierinfo, p_uuid, 16); + _SUBTYPE(KC_ST_UINT64, struct proc_uniqidentifierinfo, p_uniqueid); + _SUBTYPE(KC_ST_UINT64, struct proc_uniqidentifierinfo, p_puniqueid); + /* Ignore the p_reserve fields */ + setup_type_definition(retval, type_id, i, "proc_uniqidentifierinfo"); + break; + } + + case TASK_CRASHINFO_PID: { + setup_subtype_description(&subtypes[0], KC_ST_INT32, 0, "pid"); + setup_type_definition(retval, type_id, 1, "pid"); + break; + } + + case TASK_CRASHINFO_PPID: { + setup_subtype_description(&subtypes[0], KC_ST_INT32, 0, "ppid"); + setup_type_definition(retval, type_id, 1, "ppid"); + break; + } + + /* case TASK_CRASHINFO_RUSAGE: { */ + /* /\* */ + /* * rusage is a complex structure and is only for legacy use for crashed processes rusage info. */ + /* * So we just consider it as opaque data. 
*/ + /* *\/ */ + /* i = 0; */ + /* setup_subtype_array_description(&subtypes[i++], KC_ST_UINT8, 0, sizeof(struct rusage), "rusage"); */ + /* setup_type_definition(retval, type_id, i, "rusage"); */ + /* break; */ + /* } */ + + case TASK_CRASHINFO_RUSAGE_INFO: { + i = 0; + _SUBTYPE_ARRAY(KC_ST_UINT8, struct rusage_info_v3, ri_uuid, 16); + _SUBTYPE(KC_ST_UINT64, struct rusage_info_v3, ri_user_time); + _SUBTYPE(KC_ST_UINT64, struct rusage_info_v3, ri_system_time); + _SUBTYPE(KC_ST_UINT64, struct rusage_info_v3, ri_pkg_idle_wkups); + _SUBTYPE(KC_ST_UINT64, struct rusage_info_v3, ri_interrupt_wkups); + _SUBTYPE(KC_ST_UINT64, struct rusage_info_v3, ri_pageins); + _SUBTYPE(KC_ST_UINT64, struct rusage_info_v3, ri_wired_size); + _SUBTYPE(KC_ST_UINT64, struct rusage_info_v3, ri_resident_size); + _SUBTYPE(KC_ST_UINT64, struct rusage_info_v3, ri_phys_footprint); + _SUBTYPE(KC_ST_UINT64, struct rusage_info_v3, ri_proc_start_abstime); + _SUBTYPE(KC_ST_UINT64, struct rusage_info_v3, ri_proc_exit_abstime); + _SUBTYPE(KC_ST_UINT64, struct rusage_info_v3, ri_child_user_time); + _SUBTYPE(KC_ST_UINT64, struct rusage_info_v3, ri_child_system_time); + _SUBTYPE(KC_ST_UINT64, struct rusage_info_v3, ri_child_pkg_idle_wkups); + _SUBTYPE(KC_ST_UINT64, struct rusage_info_v3, ri_child_interrupt_wkups); + _SUBTYPE(KC_ST_UINT64, struct rusage_info_v3, ri_child_pageins); + _SUBTYPE(KC_ST_UINT64, struct rusage_info_v3, ri_child_elapsed_abstime); + _SUBTYPE(KC_ST_UINT64, struct rusage_info_v3, ri_diskio_bytesread); + _SUBTYPE(KC_ST_UINT64, struct rusage_info_v3, ri_diskio_byteswritten); + _SUBTYPE(KC_ST_UINT64, struct rusage_info_v3, ri_cpu_time_qos_default); + _SUBTYPE(KC_ST_UINT64, struct rusage_info_v3, ri_cpu_time_qos_maintenance); + _SUBTYPE(KC_ST_UINT64, struct rusage_info_v3, ri_cpu_time_qos_background); + _SUBTYPE(KC_ST_UINT64, struct rusage_info_v3, ri_cpu_time_qos_utility); + _SUBTYPE(KC_ST_UINT64, struct rusage_info_v3, ri_cpu_time_qos_legacy); + _SUBTYPE(KC_ST_UINT64, struct 
rusage_info_v3, ri_cpu_time_qos_user_initiated); + _SUBTYPE(KC_ST_UINT64, struct rusage_info_v3, ri_cpu_time_qos_user_interactive); + _SUBTYPE(KC_ST_UINT64, struct rusage_info_v3, ri_billed_system_time); + _SUBTYPE(KC_ST_UINT64, struct rusage_info_v3, ri_serviced_system_time); + setup_type_definition(retval, type_id, i, "rusage_info"); + break; + } + + case STACKSHOT_KCTYPE_CPU_TIMES: { + i = 0; + _SUBTYPE(KC_ST_UINT64, struct stackshot_cpu_times, user_usec); + _SUBTYPE(KC_ST_UINT64, struct stackshot_cpu_times, system_usec); + setup_type_definition(retval, type_id, i, "cpu_times"); + break; + } + + case STACKSHOT_KCTYPE_STACKSHOT_DURATION: { + i = 0; + _SUBTYPE(KC_ST_UINT64, struct stackshot_duration, stackshot_duration); + _SUBTYPE(KC_ST_UINT64, struct stackshot_duration, stackshot_duration_outer); + subtypes[0].kcs_flags |= KCS_SUBTYPE_FLAGS_MERGE; + subtypes[1].kcs_flags |= KCS_SUBTYPE_FLAGS_MERGE; + setup_type_definition(retval, type_id, i, "stackshot_duration"); + break; + } + + case STACKSHOT_KCTYPE_STACKSHOT_FAULT_STATS: { + i = 0; + _SUBTYPE(KC_ST_UINT32, struct stackshot_fault_stats, sfs_pages_faulted_in); + _SUBTYPE(KC_ST_UINT64, struct stackshot_fault_stats, sfs_time_spent_faulting); + _SUBTYPE(KC_ST_UINT64, struct stackshot_fault_stats, sfs_system_max_fault_time); + _SUBTYPE(KC_ST_UINT8, struct stackshot_fault_stats, sfs_stopped_faulting); + + setup_type_definition(retval, type_id, i, "stackshot_fault_stats"); + break; + } + + case TASK_CRASHINFO_PROC_STARTTIME: { + i = 0; + _SUBTYPE(KC_ST_INT64, struct timeval64, tv_sec); + _SUBTYPE(KC_ST_INT64, struct timeval64, tv_usec); + setup_type_definition(retval, type_id, i, "proc_starttime"); + break; + } + + case TASK_CRASHINFO_EXCEPTION_CODES: { + i = 0; + char codenum[100]; + for (i = 0; i < EXCEPTION_CODE_MAX; i++) { + snprintf(codenum, sizeof(codenum), "code_%d", i); + setup_subtype_description(&subtypes[i], KC_ST_UINT64, i * (sizeof(uint64_t)), codenum); + } + setup_type_definition(retval, type_id, i, 
"mach_exception_data_t"); + break; + } + + case TASK_CRASHINFO_PROC_NAME: { + i = 0; + _STRINGTYPE("p_comm"); + setup_type_definition(retval, type_id, i, "p_comm"); + break; + } + + case TASK_CRASHINFO_USERSTACK: { + i = 0; + setup_subtype_description(&subtypes[0], KC_ST_UINT64, 0, "userstack_ptr"); + setup_type_definition(retval, type_id, 1, "userstack_ptr"); + break; + } + + case TASK_CRASHINFO_ARGSLEN: { + i = 0; + setup_subtype_description(&subtypes[0], KC_ST_INT32, 0, "p_argslen"); + setup_type_definition(retval, type_id, 1, "p_argslen"); + break; + } + + case TASK_CRASHINFO_PROC_PATH: { + i = 0; + _STRINGTYPE("p_path"); + setup_type_definition(retval, type_id, i, "p_path"); + break; + } + + case TASK_CRASHINFO_PROC_CSFLAGS: { + setup_subtype_description(&subtypes[0], KC_ST_UINT32, 0, "p_csflags"); + setup_type_definition(retval, type_id, 1, "p_csflags"); + break; + } + + case TASK_CRASHINFO_PROC_STATUS: { + setup_subtype_description(&subtypes[0], KC_ST_UINT8, 0, "p_status"); + setup_type_definition(retval, type_id, 1, "p_status"); + break; + } + + case TASK_CRASHINFO_UID: { + setup_subtype_description(&subtypes[0], KC_ST_INT32, 0, "uid"); + setup_type_definition(retval, type_id, 1, "uid"); + break; + } + + case TASK_CRASHINFO_GID: { + setup_subtype_description(&subtypes[0], KC_ST_INT32, 0, "gid"); + setup_type_definition(retval, type_id, 1, "gid"); + break; + } + + case TASK_CRASHINFO_PROC_ARGC: { + setup_subtype_description(&subtypes[0], KC_ST_INT32, 0, "argc"); + setup_type_definition(retval, type_id, 1, "argc"); + break; + } + + case TASK_CRASHINFO_PROC_FLAGS: { + setup_subtype_description(&subtypes[0], KC_ST_UINT32, 0, "p_flags"); + setup_type_definition(retval, type_id, 1, "p_flags"); + break; + } + + case TASK_CRASHINFO_CPUTYPE: { + setup_subtype_description(&subtypes[0], KC_ST_INT32, 0, "cputype"); + setup_type_definition(retval, type_id, 1, "cputype"); + break; + } + + case TASK_CRASHINFO_RESPONSIBLE_PID: { + setup_subtype_description(&subtypes[0], 
KC_ST_INT32, 0, "responsible_pid"); + setup_type_definition(retval, type_id, 1, "responsible_pid"); + break; + } + + case TASK_CRASHINFO_DIRTY_FLAGS: { + setup_subtype_description(&subtypes[0], KC_ST_UINT32, 0, "dirty_flags"); + setup_type_definition(retval, type_id, 1, "dirty_flags"); + break; + } + + case TASK_CRASHINFO_CRASHED_THREADID: { + setup_subtype_description(&subtypes[0], KC_ST_UINT64, 0, "crashed_threadid"); + setup_type_definition(retval, type_id, 1, "crashed_threadid"); + break; + } + + case TASK_CRASHINFO_COALITION_ID: { + setup_subtype_description(&subtypes[0], KC_ST_UINT64, 0, "coalition_id"); + setup_type_definition(retval, type_id, 1, "coalition_id"); + break; + } + + case TASK_CRASHINFO_UDATA_PTRS: { + setup_subtype_description(&subtypes[0], KC_ST_UINT64, 0, "udata_ptrs"); + setup_type_definition(retval, type_id, 1, "udata_ptrs"); + break; + } + + case TASK_CRASHINFO_MEMORY_LIMIT: { + setup_subtype_description(&subtypes[0], KC_ST_UINT64, 0, "task_phys_mem_limit"); + setup_type_definition(retval, type_id, 1, "task_phys_mem_limit"); + break; + } + + case EXIT_REASON_SNAPSHOT: { + _SUBTYPE(KC_ST_UINT32, struct exit_reason_snapshot, ers_namespace); + _SUBTYPE(KC_ST_UINT64, struct exit_reason_snapshot, ers_code); + _SUBTYPE(KC_ST_UINT64, struct exit_reason_snapshot, ers_flags); + setup_type_definition(retval, type_id, i, "exit_reason_basic_info"); + + break; + + } + + case EXIT_REASON_USER_DESC: { + i = 0; + + _STRINGTYPE("exit_reason_user_description"); + setup_type_definition(retval, type_id, i, "exit_reason_user_description"); + break; + } + + case EXIT_REASON_USER_PAYLOAD: { + i = 0; + + setup_subtype_array_description(&subtypes[i++], KC_ST_UINT8, 0, EXIT_REASON_PAYLOAD_MAX_LEN, "exit_reason_user_payload"); + setup_type_definition(retval, type_id, i, "exit_reason_user_payload"); + break; + } + + case EXIT_REASON_CODESIGNING_INFO: { + _SUBTYPE(KC_ST_UINT64, struct codesigning_exit_reason_info, ceri_virt_addr); + _SUBTYPE(KC_ST_UINT64, struct 
codesigning_exit_reason_info, ceri_file_offset); + _SUBTYPE_ARRAY(KC_ST_CHAR, struct codesigning_exit_reason_info, ceri_pathname, EXIT_REASON_CODESIG_PATH_MAX); + _SUBTYPE_ARRAY(KC_ST_CHAR, struct codesigning_exit_reason_info, ceri_filename, EXIT_REASON_CODESIG_PATH_MAX); + _SUBTYPE(KC_ST_UINT64, struct codesigning_exit_reason_info, ceri_codesig_modtime_secs); + _SUBTYPE(KC_ST_UINT64, struct codesigning_exit_reason_info, ceri_codesig_modtime_nsecs); + _SUBTYPE(KC_ST_UINT64, struct codesigning_exit_reason_info, ceri_page_modtime_secs); + _SUBTYPE(KC_ST_UINT64, struct codesigning_exit_reason_info, ceri_page_modtime_nsecs); + _SUBTYPE(KC_ST_UINT8, struct codesigning_exit_reason_info, ceri_path_truncated); + _SUBTYPE(KC_ST_UINT8, struct codesigning_exit_reason_info, ceri_object_codesigned); + _SUBTYPE(KC_ST_UINT8, struct codesigning_exit_reason_info, ceri_page_codesig_validated); + _SUBTYPE(KC_ST_UINT8, struct codesigning_exit_reason_info, ceri_page_codesig_tainted); + _SUBTYPE(KC_ST_UINT8, struct codesigning_exit_reason_info, ceri_page_codesig_nx); + _SUBTYPE(KC_ST_UINT8, struct codesigning_exit_reason_info, ceri_page_wpmapped); + _SUBTYPE(KC_ST_UINT8, struct codesigning_exit_reason_info, ceri_page_slid); + _SUBTYPE(KC_ST_UINT8, struct codesigning_exit_reason_info, ceri_page_dirty); + _SUBTYPE(KC_ST_UINT32, struct codesigning_exit_reason_info, ceri_page_shadow_depth); + setup_type_definition(retval, type_id, i, "exit_reason_codesigning_info"); + + break; + + } + + default: + retval = NULL; + break; + } + + assert(retval == NULL || (buffer_size > sizeof(struct kcdata_type_definition) + + (retval->kct_num_elements * sizeof(struct kcdata_subtype_descriptor)))); + return retval; +} + +/* Fill in a type definition's identifier, element count and NUL-terminated name. */ +static void +setup_type_definition(struct kcdata_type_definition * d, uint32_t type, uint32_t num_elems, char * name) +{ + d->kct_type_identifier = type; + d->kct_num_elements = num_elems; + /* name is often a short string literal: strncpy stops at its NUL (and zero-pads) instead of reading a full field width past the end of the source */ + strncpy(d->kct_name, name, sizeof(d->kct_name) - 1); + d->kct_name[sizeof(d->kct_name) - 1] = '\0'; +} +
+static uint32_t +get_kctype_subtype_size(kctype_subtype_t type) +{ + switch (type) { + case KC_ST_CHAR: + case KC_ST_INT8: + case KC_ST_UINT8: + return sizeof(uint8_t); + break; + case KC_ST_INT16: + case KC_ST_UINT16: + return sizeof(uint16_t); + break; + case KC_ST_INT32: + case KC_ST_UINT32: + return sizeof(uint32_t); + break; + case KC_ST_INT64: + case KC_ST_UINT64: + return sizeof(uint64_t); + break; + + default: + assert(0); + break; + } + return 0; +} + +/* Describe an array-valued subtype: count elements of the given type at the given offset. */ +static void +setup_subtype_array_description( + kcdata_subtype_descriptor_t desc, kctype_subtype_t type, uint32_t offset, uint32_t count, char * name) +{ + desc->kcs_flags = KCS_SUBTYPE_FLAGS_ARRAY; + desc->kcs_elem_type = type; + desc->kcs_elem_offset = offset; + desc->kcs_elem_size = KCS_SUBTYPE_PACK_SIZE(count, get_kctype_subtype_size(type)); + /* name is often a short string literal: strncpy stops at its NUL (and zero-pads) instead of reading a full field width past the end of the source */ + strncpy(desc->kcs_name, name, sizeof(desc->kcs_name) - 1); + desc->kcs_name[sizeof(desc->kcs_name) - 1] = '\0'; +} + +/* Describe a scalar subtype: one element of the given type at the given offset. */ +static void +setup_subtype_description(kcdata_subtype_descriptor_t desc, kctype_subtype_t type, uint32_t offset, char * name) +{ + desc->kcs_flags = KCS_SUBTYPE_FLAGS_NONE; + desc->kcs_elem_type = type; + desc->kcs_elem_offset = offset; + desc->kcs_elem_size = get_kctype_subtype_size(type); + /* bounded, NUL-aware copy for the same reason as in the array variant: never read past the end of a short name */ + strncpy(desc->kcs_name, name, sizeof(desc->kcs_name) - 1); + desc->kcs_name[sizeof(desc->kcs_name) - 1] = '\0'; +} + diff --git a/libkdd/kcdata/kdd.h b/libkdd/kdd.h similarity index 63% rename from libkdd/kcdata/kdd.h rename to libkdd/kdd.h index ba9106d73..c7f528035 100644 --- a/libkdd/kcdata/kdd.h +++ b/libkdd/kdd.h @@ -29,6 +29,8 @@ #ifndef _KDD_H_ #define _KDD_H_ +#include + #import /*! @@ -42,8 +44,10 @@ * */ @interface KCDataType : NSObject -- (NSMutableDictionary *)parseData:(void *)dataBuffer ofLength:(uint32_t)length; -- (NSString *)name; +- (NSDictionary * _Nullable)parseData:(void * _Nonnull)dataBuffer ofLength:(uint32_t)length NS_RETURNS_RETAINED; +- (NSString * _Nonnull)name; +- (unsigned int)typeID; +- (BOOL) shouldMergeData; @end /*!
@@ -59,7 +63,7 @@ * This routine queries the system for a give type. If a known type description is found it will be used to * initialize a KCDataType object. If no known type is found it assumes the data is uint8_t[]. */ -KCDataType * getKCDataTypeForID(uint32_t typeID); +KCDataType * _Nullable getKCDataTypeForID(uint32_t typeID); /*! * @function KCDataTypeNameForID @@ -74,7 +78,7 @@ KCDataType * getKCDataTypeForID(uint32_t typeID); * Returns name of the type. If a type is not found the return * value will be string object of the passed value. */ -NSString * KCDataTypeNameForID(uint32_t typeID); +NSString * _Nonnull KCDataTypeNameForID(uint32_t typeID) NS_RETURNS_NOT_RETAINED; /*! * @function parseKCDataArray @@ -83,15 +87,18 @@ NSString * KCDataTypeNameForID(uint32_t typeID); * Parse the given KCDATA buffer as an Array of element. The buffer should begin with header * of type KCDATA_TYPE_ARRAY. * - * @param dataBuffer - * A pointer in memory where KCDATA is allocated. + * @param iter + * An iterator into the input buffer + * + * @param error + * Error return. * * @return * A dictionary with key specifying name of the type of each elements and value is an Array of data. * */ -NSMutableDictionary * parseKCDataArray(void * dataBuffer); +NSMutableDictionary * _Nullable parseKCDataArray(kcdata_iter_t iter, NSError * _Nullable * _Nullable error) NS_RETURNS_RETAINED; /*! * @function parseKCDataContainer @@ -99,12 +106,12 @@ NSMutableDictionary * parseKCDataArray(void * dataBuffer); * @abstract * Parse the given KCDATA buffer as a container and convert each sub structures as fields in a dictionary. * - * @param dataBuffer - * A pointer in memory where KCDATA is allocated. The data should be pointing to - * kcdata_item_t of type KCDATA_TYPE_CONTAINER_BEGIN + * @param iter + * A pointer to an iterator into the input buffer. The iterator will be updated + * to point at the container end marker. 
* - * @param bytesParsed - * A pointer to uint32_t field where the routine will save the number of bytes parsed for this container. + * @param error + * Error return. * * @return NSDictionary * * containing each field and potentially sub containers within the provided container. @@ -112,25 +119,40 @@ NSMutableDictionary * parseKCDataArray(void * dataBuffer); * @discussion * This function tries to parse one container. If it encounters sub containers * they will be parsed and collected within the same dictionary. - * Other data type fields will also be parsed based on their type. The bytesParsed - * param is populated with the number of bytes processed. With this return value the caller can - * advance its buffer_read position as - * buffer = (kcdata_item_t)((uintptr_t)buffer + bytesParsed); //advance to next KCDATA_HEADER. - * Note: Keep in mind that the next header may be KCDATA_TYPE_BUFFER_END. - * - * A sample usage call can be: - * KCDATA_ITEM_FOREACH(buffer) { - * if(KCDATA_ITEM_TYPE(buffer) == KCDATA_TYPE_CONTAINER_BEGIN) { - * uint32_t container_size = 0; - * NSMutableDictionary *parsedContainer = parseKCDataContainer(buffer, &container_size); - * NSLog(@"Parsed container has : %@", parsedContainer); - * buffer = (kcdata_item_t) ((uintptr_t)buffer + container_size); - * if(KCDATA_ITEM_TYPE(buffer) == KCDATA_TYPE_BUFFER_END) - * break; - * } - * } + * Other data type fields will also be parsed based on their type. + * + */ + +NSMutableDictionary * _Nullable parseKCDataContainer(kcdata_iter_t * _Nonnull iter_p, NSError * _Nullable * _Nullable error) NS_RETURNS_RETAINED; + +/*! + * @function parseKCDataBuffer + * + * @abstract + * Parse complete KCDATA buffer into NSMutableDictionary. Depending on the size of buffer and elements + * this routine makes allocations for objects and strings. + * + * @param dataBuffer + * A pointer in memory where KCDATA is allocated. 
The data should be of type + * kcdata_item_t and have KCDATA_BUFFER_BEGIN_* tags (see kern_cdata.h) + * + * @param size + * Size of the buffer as provided by kernel api. + * + * @return NSDictionary * + * Dictionary with key:value pairs for each data item. KCDATA_TYPE_ARRAY and KCDATA_TYPE_CONTAINERS will + * be grouped and recursed as much as possible. For unknown types NSData object is returned with "Type_0x123" + * as keys. + * + * @discussion + * This function tries to parse KCDATA buffer with known type description. If an error occurs, + * NULL is returned, and error (if not NULL) will have the error string. + * + * Iff the buffer does not begin with a known kcdata magic number, the error code + * will be KERN_INVALID_VALUE. * */ -NSMutableDictionary * parseKCDataContainer(void * dataBuffer, uint32_t * bytesParsed); +NSDictionary * _Nullable parseKCDataBuffer(void * _Nonnull dataBuffer, uint32_t size, NSError * _Nullable * _Nullable error) NS_RETURNS_RETAINED; + #endif /* _KDD_H_ */ diff --git a/libkdd/kcdata/kdd.m b/libkdd/kdd.m similarity index 89% rename from libkdd/kcdata/kdd.m rename to libkdd/kdd.m index 599cea923..ab65e953b 100644 --- a/libkdd/kcdata/kdd.m +++ b/libkdd/kdd.m @@ -30,7 +30,7 @@ @implementation KCDataType -- (NSMutableDictionary *)parseData:(void *)dataBuffer ofLength:(uint32_t)length +- (NSDictionary *)parseData:(void *)dataBuffer ofLength:(uint32_t)length { assert(0); } @@ -40,4 +40,14 @@ - (NSString *)name assert(0); } +- (BOOL) shouldMergeData +{ + assert(0); +} + +- (unsigned int)typeID +{ + assert(0); +} + @end diff --git a/libkdd/kdd.xcodeproj/project.pbxproj b/libkdd/kdd.xcodeproj/project.pbxproj index fb6753025..cb0b189d6 100644 --- a/libkdd/kdd.xcodeproj/project.pbxproj +++ b/libkdd/kdd.xcodeproj/project.pbxproj @@ -7,6 +7,55 @@ objects = { /* Begin PBXBuildFile section */ + 081725D51C3F476500371A54 /* stackshot-sample-duration in Resources */ = {isa = PBXBuildFile; fileRef = 081725D31C3F475200371A54 /* stackshot-sample-duration */; }; +
081725D61C3F476500371A54 /* stackshot-sample-duration.plist.gz in Resources */ = {isa = PBXBuildFile; fileRef = 081725D41C3F475200371A54 /* stackshot-sample-duration.plist.gz */; }; + 081EDD381C23855700A1C138 /* stackshot-sample-cputime in Resources */ = {isa = PBXBuildFile; fileRef = 081EDD361C23854500A1C138 /* stackshot-sample-cputime */; }; + 081EDD391C23855700A1C138 /* stackshot-sample-cputime.plist.gz in Resources */ = {isa = PBXBuildFile; fileRef = 081EDD371C23854500A1C138 /* stackshot-sample-cputime.plist.gz */; }; + 08238A3B1BFEB5450053190C /* libz.dylib in Frameworks */ = {isa = PBXBuildFile; fileRef = 08F1501D1BFEA7AC00F2C89C /* libz.dylib */; }; + 0834719E1BF7D05400D67253 /* kcdata.h in Headers */ = {isa = PBXBuildFile; fileRef = 0834719D1BF7D05400D67253 /* kcdata.h */; settings = {ATTRIBUTES = (Private, ); }; }; + 0843EE921BF6AFC600CD4150 /* stackshot-sample in Resources */ = {isa = PBXBuildFile; fileRef = 0843EE911BF6AFB700CD4150 /* stackshot-sample */; }; + 0843EE941BF6BAC100CD4150 /* stackshot-sample.plist.gz in Resources */ = {isa = PBXBuildFile; fileRef = 0843EE931BF6BAB400CD4150 /* stackshot-sample.plist.gz */; }; + 08603F371BF69EDE007D3784 /* Tests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 08603F361BF69EDE007D3784 /* Tests.swift */; }; + 08603F391BF69EDE007D3784 /* libkdd.a in Frameworks */ = {isa = PBXBuildFile; fileRef = C91C93C71ACB58B700119B60 /* libkdd.a */; }; + 0860F87A1BFC3857007E1301 /* stackshot-sample-tailspin-2 in Resources */ = {isa = PBXBuildFile; fileRef = 0860F8781BFC3845007E1301 /* stackshot-sample-tailspin-2 */; }; + 0860F87B1BFC3857007E1301 /* stackshot-sample-tailspin-2.plist.gz in Resources */ = {isa = PBXBuildFile; fileRef = 0860F8791BFC3845007E1301 /* stackshot-sample-tailspin-2.plist.gz */; }; + 086395B51BF5655D005ED913 /* kdd_main.m in Sources */ = {isa = PBXBuildFile; fileRef = 086395B41BF5655D005ED913 /* kdd_main.m */; }; + 086395B91BF565A2005ED913 /* libkdd.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 
C91C93C71ACB58B700119B60 /* libkdd.a */; }; + 08A4C94C1C4701B800D5F010 /* KCDEmbeddedBufferDescription.m in Sources */ = {isa = PBXBuildFile; fileRef = 08A4C94B1C4701B800D5F010 /* KCDEmbeddedBufferDescription.m */; }; + 08A4C94F1C470F1C00D5F010 /* nested-sample in Resources */ = {isa = PBXBuildFile; fileRef = 08A4C94D1C470F0900D5F010 /* nested-sample */; }; + 08A4C9501C470F1C00D5F010 /* nested-sample.plist in Resources */ = {isa = PBXBuildFile; fileRef = 08A4C94E1C470F0900D5F010 /* nested-sample.plist */; }; + 08B480781BF8297500B4AAE0 /* stackshot-sample-new-arrays in Resources */ = {isa = PBXBuildFile; fileRef = 08B480741BF8294E00B4AAE0 /* stackshot-sample-new-arrays */; }; + 08B480791BF8297500B4AAE0 /* stackshot-sample-new-arrays.plist.gz in Resources */ = {isa = PBXBuildFile; fileRef = 08B480751BF8294E00B4AAE0 /* stackshot-sample-new-arrays.plist.gz */; }; + 08B4807A1BF8297500B4AAE0 /* stackshot-sample-old-arrays in Resources */ = {isa = PBXBuildFile; fileRef = 08B480761BF8294E00B4AAE0 /* stackshot-sample-old-arrays */; }; + 08B4807B1BF8297500B4AAE0 /* stackshot-sample-old-arrays.plist.gz in Resources */ = {isa = PBXBuildFile; fileRef = 08B480771BF8294E00B4AAE0 /* stackshot-sample-old-arrays.plist.gz */; }; + 08B480831BF864D300B4AAE0 /* delta-stackshot-sample-new-arrays in Resources */ = {isa = PBXBuildFile; fileRef = 08B4807F1BF864C800B4AAE0 /* delta-stackshot-sample-new-arrays */; }; + 08B480841BF864D300B4AAE0 /* delta-stackshot-sample-new-arrays.plist.gz in Resources */ = {isa = PBXBuildFile; fileRef = 08B480801BF864C800B4AAE0 /* delta-stackshot-sample-new-arrays.plist.gz */; }; + 08B480851BF864D300B4AAE0 /* delta-stackshot-sample-old-arrays in Resources */ = {isa = PBXBuildFile; fileRef = 08B480811BF864C800B4AAE0 /* delta-stackshot-sample-old-arrays */; }; + 08B480861BF864D300B4AAE0 /* delta-stackshot-sample-old-arrays.plist.gz in Resources */ = {isa = PBXBuildFile; fileRef = 08B480821BF864C800B4AAE0 /* delta-stackshot-sample-old-arrays.plist.gz */; }; + 
08B480881BF92E0500B4AAE0 /* kcdata.py in Resources */ = {isa = PBXBuildFile; fileRef = 08B480871BF92DFB00B4AAE0 /* kcdata.py */; }; + 08B4808B1BF9474A00B4AAE0 /* corpse-sample in Resources */ = {isa = PBXBuildFile; fileRef = 08B480891BF9473800B4AAE0 /* corpse-sample */; }; + 08B4808C1BF9474A00B4AAE0 /* corpse-sample.plist.gz in Resources */ = {isa = PBXBuildFile; fileRef = 08B4808A1BF9473800B4AAE0 /* corpse-sample.plist.gz */; }; + 08B9297E1C1CCE8D003B1703 /* stackshot-sample-ths-thread-t in Resources */ = {isa = PBXBuildFile; fileRef = 08B9297C1C1CCE7E003B1703 /* stackshot-sample-ths-thread-t */; }; + 08B9297F1C1CCE8D003B1703 /* stackshot-sample-ths-thread-t.plist.gz in Resources */ = {isa = PBXBuildFile; fileRef = 08B9297D1C1CCE7E003B1703 /* stackshot-sample-ths-thread-t.plist.gz */; }; + 08C9D83D1BFFF8E100DF6C05 /* exitreason-sample in Resources */ = {isa = PBXBuildFile; fileRef = 08C9D83B1BFFF8D500DF6C05 /* exitreason-sample */; }; + 08C9D83E1BFFF8E100DF6C05 /* exitreason-sample.plist.gz in Resources */ = {isa = PBXBuildFile; fileRef = 08C9D83C1BFFF8D500DF6C05 /* exitreason-sample.plist.gz */; }; + 08CF18FF1BF9B7B100D05813 /* stackshot-sample-tailspin in Resources */ = {isa = PBXBuildFile; fileRef = 08CF18FD1BF9B79E00D05813 /* stackshot-sample-tailspin */; }; + 08CF19001BF9B7B100D05813 /* stackshot-sample-tailspin.plist.gz in Resources */ = {isa = PBXBuildFile; fileRef = 08CF18FE1BF9B79E00D05813 /* stackshot-sample-tailspin.plist.gz */; }; + 08F1501E1BFEA7AC00F2C89C /* libz.dylib in Frameworks */ = {isa = PBXBuildFile; fileRef = 08F1501D1BFEA7AC00F2C89C /* libz.dylib */; }; + 1368F0851C87E06A00940FC6 /* exitreason-codesigning.plist.gz in Resources */ = {isa = PBXBuildFile; fileRef = 1368F0841C87E06300940FC6 /* exitreason-codesigning.plist.gz */; }; + 1368F0861C87E06C00940FC6 /* exitreason-codesigning in Resources */ = {isa = PBXBuildFile; fileRef = 1368F0831C87E06300940FC6 /* exitreason-codesigning */; }; + 13A79CAA1CF8C5D600FFC181 /* stackshot-with-kcid in 
Resources */ = {isa = PBXBuildFile; fileRef = 13A79CA81CF8C5D200FFC181 /* stackshot-with-kcid */; }; + 13A79CAB1CF8C5D600FFC181 /* stackshot-with-kcid.plist.gz in Resources */ = {isa = PBXBuildFile; fileRef = 13A79CA91CF8C5D200FFC181 /* stackshot-with-kcid.plist.gz */; }; + 13CC08441CB97F8D00EA6069 /* stackshot-fault-stats in Resources */ = {isa = PBXBuildFile; fileRef = 13CC08421CB97F8A00EA6069 /* stackshot-fault-stats */; }; + 13CC08451CB97F9000EA6069 /* stackshot-fault-stats.plist.gz in Resources */ = {isa = PBXBuildFile; fileRef = 13CC08431CB97F8A00EA6069 /* stackshot-fault-stats.plist.gz */; }; + 13D6C5D01C4DDDB6005E617C /* corpse-twr-sample in Resources */ = {isa = PBXBuildFile; fileRef = 13AF287B1C4A0D6A000795E2 /* corpse-twr-sample */; }; + 13D6C5D11C4DDDB8005E617C /* corpse-twr-sample.plist.gz in Resources */ = {isa = PBXBuildFile; fileRef = 13AF287C1C4A0D6A000795E2 /* corpse-twr-sample.plist.gz */; }; + 13D6C5D21C4DDDBE005E617C /* test-twr-sample in Resources */ = {isa = PBXBuildFile; fileRef = 13AF287E1C4A0D6A000795E2 /* test-twr-sample */; }; + 13D6C5D31C4DDE0D005E617C /* test-twr-sample.plist.gz in Resources */ = {isa = PBXBuildFile; fileRef = 13EADC171C4DCDA100468D97 /* test-twr-sample.plist.gz */; }; + 13DBA2681CAB1AD600227EB2 /* stackshot-sample-sharedcachev2.plist.gz in Resources */ = {isa = PBXBuildFile; fileRef = 13DBA2661CAB1ACB00227EB2 /* stackshot-sample-sharedcachev2.plist.gz */; }; + 13DBA26A1CAB1BA000227EB2 /* stackshot-sample-sharedcachev2 in Resources */ = {isa = PBXBuildFile; fileRef = 13DBA2691CAB1B9C00227EB2 /* stackshot-sample-sharedcachev2 */; }; + 13F3DA9C1C7C1BEE00ACFFCC /* corpse-twr-sample-v2 in Resources */ = {isa = PBXBuildFile; fileRef = 13F3DA9B1C7C1BE700ACFFCC /* corpse-twr-sample-v2 */; }; + 13F3DA9E1C7C1C6600ACFFCC /* corpse-twr-sample-v2.plist.gz in Resources */ = {isa = PBXBuildFile; fileRef = 13F3DA9D1C7C1C6000ACFFCC /* corpse-twr-sample-v2.plist.gz */; }; C91C93CB1ACB58B700119B60 /* kdd.h in Headers */ = {isa = 
PBXBuildFile; fileRef = C91C93CA1ACB58B700119B60 /* kdd.h */; settings = {ATTRIBUTES = (Private, ); }; }; C91C93CD1ACB58B700119B60 /* kdd.m in Sources */ = {isa = PBXBuildFile; fileRef = C91C93CC1ACB58B700119B60 /* kdd.m */; }; C91C93E41ACB598700119B60 /* KCDBasicTypeDescription.h in Headers */ = {isa = PBXBuildFile; fileRef = C91C93E01ACB598700119B60 /* KCDBasicTypeDescription.h */; }; @@ -14,10 +63,92 @@ C91C93E61ACB598700119B60 /* KCDStructTypeDescription.h in Headers */ = {isa = PBXBuildFile; fileRef = C91C93E21ACB598700119B60 /* KCDStructTypeDescription.h */; }; C91C93E71ACB598700119B60 /* KCDStructTypeDescription.m in Sources */ = {isa = PBXBuildFile; fileRef = C91C93E31ACB598700119B60 /* KCDStructTypeDescription.m */; }; C9C5C68C1ACDAFDB00BE0E5E /* kcdtypes.c in Sources */ = {isa = PBXBuildFile; fileRef = C9C5C68B1ACDAFDB00BE0E5E /* kcdtypes.c */; }; + C9D7B53F1D1B41D700F1019D /* xnupost_testconfig-sample.plist.gz in Resources */ = {isa = PBXBuildFile; fileRef = C9D7B53D1D1B41D700F1019D /* xnupost_testconfig-sample.plist.gz */; }; + C9D7B5401D1B41D700F1019D /* xnupost_testconfig-sample in Resources */ = {isa = PBXBuildFile; fileRef = C9D7B53E1D1B41D700F1019D /* xnupost_testconfig-sample */; }; C9DE39141ACB5A540020F4A3 /* kcdata_core.m in Sources */ = {isa = PBXBuildFile; fileRef = C9DE39131ACB5A540020F4A3 /* kcdata_core.m */; }; /* End PBXBuildFile section */ +/* Begin PBXContainerItemProxy section */ + 08603F3A1BF69EDE007D3784 /* PBXContainerItemProxy */ = { + isa = PBXContainerItemProxy; + containerPortal = C91C93BF1ACB58B700119B60 /* Project object */; + proxyType = 1; + remoteGlobalIDString = C91C93C61ACB58B700119B60; + remoteInfo = libkdd; + }; + 086395BA1BF565AB005ED913 /* PBXContainerItemProxy */ = { + isa = PBXContainerItemProxy; + containerPortal = C91C93BF1ACB58B700119B60 /* Project object */; + proxyType = 1; + remoteGlobalIDString = C91C93C61ACB58B700119B60; + remoteInfo = libkdd; + }; +/* End PBXContainerItemProxy section */ + +/* Begin 
PBXCopyFilesBuildPhase section */ + 086395B01BF5655D005ED913 /* CopyFiles */ = { + isa = PBXCopyFilesBuildPhase; + buildActionMask = 2147483647; + dstPath = /usr/share/man/man1/; + dstSubfolderSpec = 0; + files = ( + ); + runOnlyForDeploymentPostprocessing = 1; + }; +/* End PBXCopyFilesBuildPhase section */ + /* Begin PBXFileReference section */ + 081725D31C3F475200371A54 /* stackshot-sample-duration */ = {isa = PBXFileReference; lastKnownFileType = file; name = "stackshot-sample-duration"; path = "tests/stackshot-sample-duration"; sourceTree = SOURCE_ROOT; }; + 081725D41C3F475200371A54 /* stackshot-sample-duration.plist.gz */ = {isa = PBXFileReference; lastKnownFileType = archive.gzip; name = "stackshot-sample-duration.plist.gz"; path = "tests/stackshot-sample-duration.plist.gz"; sourceTree = SOURCE_ROOT; }; + 081EDD361C23854500A1C138 /* stackshot-sample-cputime */ = {isa = PBXFileReference; lastKnownFileType = file; name = "stackshot-sample-cputime"; path = "tests/stackshot-sample-cputime"; sourceTree = SOURCE_ROOT; }; + 081EDD371C23854500A1C138 /* stackshot-sample-cputime.plist.gz */ = {isa = PBXFileReference; lastKnownFileType = archive.gzip; name = "stackshot-sample-cputime.plist.gz"; path = "tests/stackshot-sample-cputime.plist.gz"; sourceTree = SOURCE_ROOT; }; + 0834719D1BF7D05400D67253 /* kcdata.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = kcdata.h; path = ../osfmk/kern/kcdata.h; sourceTree = ""; }; + 0843EE911BF6AFB700CD4150 /* stackshot-sample */ = {isa = PBXFileReference; lastKnownFileType = text; name = "stackshot-sample"; path = "tests/stackshot-sample"; sourceTree = SOURCE_ROOT; }; + 0843EE931BF6BAB400CD4150 /* stackshot-sample.plist.gz */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = archive.gzip; name = "stackshot-sample.plist.gz"; path = "tests/stackshot-sample.plist.gz"; sourceTree = SOURCE_ROOT; }; + 08603F341BF69EDE007D3784 /* tests.xctest */ = {isa = PBXFileReference; 
explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = tests.xctest; sourceTree = BUILT_PRODUCTS_DIR; }; + 08603F361BF69EDE007D3784 /* Tests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = Tests.swift; sourceTree = ""; }; + 08603F381BF69EDE007D3784 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = ""; }; + 08603F3F1BF69F44007D3784 /* kdd_bridge.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = kdd_bridge.h; path = ../tests/kdd_bridge.h; sourceTree = ""; }; + 0860F8781BFC3845007E1301 /* stackshot-sample-tailspin-2 */ = {isa = PBXFileReference; lastKnownFileType = file; name = "stackshot-sample-tailspin-2"; path = "tests/stackshot-sample-tailspin-2"; sourceTree = SOURCE_ROOT; }; + 0860F8791BFC3845007E1301 /* stackshot-sample-tailspin-2.plist.gz */ = {isa = PBXFileReference; lastKnownFileType = archive.gzip; name = "stackshot-sample-tailspin-2.plist.gz"; path = "tests/stackshot-sample-tailspin-2.plist.gz"; sourceTree = SOURCE_ROOT; }; + 086395B21BF5655D005ED913 /* kdd */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = kdd; sourceTree = BUILT_PRODUCTS_DIR; }; + 086395B41BF5655D005ED913 /* kdd_main.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = kdd_main.m; sourceTree = ""; }; + 08A4C94A1C47019E00D5F010 /* KCDEmbeddedBufferDescription.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = KCDEmbeddedBufferDescription.h; sourceTree = ""; }; + 08A4C94B1C4701B800D5F010 /* KCDEmbeddedBufferDescription.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = KCDEmbeddedBufferDescription.m; sourceTree = ""; }; + 08A4C94D1C470F0900D5F010 /* nested-sample */ = {isa = PBXFileReference; lastKnownFileType = file; name = "nested-sample"; path = "tests/nested-sample"; sourceTree = SOURCE_ROOT; }; 
+ 08A4C94E1C470F0900D5F010 /* nested-sample.plist */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.plist.xml; name = "nested-sample.plist"; path = "tests/nested-sample.plist"; sourceTree = SOURCE_ROOT; }; + 08B480741BF8294E00B4AAE0 /* stackshot-sample-new-arrays */ = {isa = PBXFileReference; lastKnownFileType = text; name = "stackshot-sample-new-arrays"; path = "tests/stackshot-sample-new-arrays"; sourceTree = SOURCE_ROOT; }; + 08B480751BF8294E00B4AAE0 /* stackshot-sample-new-arrays.plist.gz */ = {isa = PBXFileReference; lastKnownFileType = archive.gzip; name = "stackshot-sample-new-arrays.plist.gz"; path = "tests/stackshot-sample-new-arrays.plist.gz"; sourceTree = SOURCE_ROOT; }; + 08B480761BF8294E00B4AAE0 /* stackshot-sample-old-arrays */ = {isa = PBXFileReference; lastKnownFileType = text; name = "stackshot-sample-old-arrays"; path = "tests/stackshot-sample-old-arrays"; sourceTree = SOURCE_ROOT; }; + 08B480771BF8294E00B4AAE0 /* stackshot-sample-old-arrays.plist.gz */ = {isa = PBXFileReference; lastKnownFileType = archive.gzip; name = "stackshot-sample-old-arrays.plist.gz"; path = "tests/stackshot-sample-old-arrays.plist.gz"; sourceTree = SOURCE_ROOT; }; + 08B4807F1BF864C800B4AAE0 /* delta-stackshot-sample-new-arrays */ = {isa = PBXFileReference; lastKnownFileType = text; name = "delta-stackshot-sample-new-arrays"; path = "tests/delta-stackshot-sample-new-arrays"; sourceTree = SOURCE_ROOT; }; + 08B480801BF864C800B4AAE0 /* delta-stackshot-sample-new-arrays.plist.gz */ = {isa = PBXFileReference; lastKnownFileType = archive.gzip; name = "delta-stackshot-sample-new-arrays.plist.gz"; path = "tests/delta-stackshot-sample-new-arrays.plist.gz"; sourceTree = SOURCE_ROOT; }; + 08B480811BF864C800B4AAE0 /* delta-stackshot-sample-old-arrays */ = {isa = PBXFileReference; lastKnownFileType = text; name = "delta-stackshot-sample-old-arrays"; path = "tests/delta-stackshot-sample-old-arrays"; sourceTree = SOURCE_ROOT; }; + 08B480821BF864C800B4AAE0 /* 
delta-stackshot-sample-old-arrays.plist.gz */ = {isa = PBXFileReference; lastKnownFileType = archive.gzip; name = "delta-stackshot-sample-old-arrays.plist.gz"; path = "tests/delta-stackshot-sample-old-arrays.plist.gz"; sourceTree = SOURCE_ROOT; }; + 08B480871BF92DFB00B4AAE0 /* kcdata.py */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.script.python; name = kcdata.py; path = ../tools/lldbmacros/kcdata.py; sourceTree = ""; }; + 08B480891BF9473800B4AAE0 /* corpse-sample */ = {isa = PBXFileReference; lastKnownFileType = text; name = "corpse-sample"; path = "tests/corpse-sample"; sourceTree = SOURCE_ROOT; }; + 08B4808A1BF9473800B4AAE0 /* corpse-sample.plist.gz */ = {isa = PBXFileReference; lastKnownFileType = archive.gzip; name = "corpse-sample.plist.gz"; path = "tests/corpse-sample.plist.gz"; sourceTree = SOURCE_ROOT; }; + 08B9297C1C1CCE7E003B1703 /* stackshot-sample-ths-thread-t */ = {isa = PBXFileReference; lastKnownFileType = file; name = "stackshot-sample-ths-thread-t"; path = "tests/stackshot-sample-ths-thread-t"; sourceTree = SOURCE_ROOT; }; + 08B9297D1C1CCE7E003B1703 /* stackshot-sample-ths-thread-t.plist.gz */ = {isa = PBXFileReference; lastKnownFileType = archive.gzip; name = "stackshot-sample-ths-thread-t.plist.gz"; path = "tests/stackshot-sample-ths-thread-t.plist.gz"; sourceTree = SOURCE_ROOT; }; + 08C9D83B1BFFF8D500DF6C05 /* exitreason-sample */ = {isa = PBXFileReference; lastKnownFileType = file; name = "exitreason-sample"; path = "tests/exitreason-sample"; sourceTree = SOURCE_ROOT; }; + 08C9D83C1BFFF8D500DF6C05 /* exitreason-sample.plist.gz */ = {isa = PBXFileReference; lastKnownFileType = archive.gzip; name = "exitreason-sample.plist.gz"; path = "tests/exitreason-sample.plist.gz"; sourceTree = SOURCE_ROOT; }; + 08CF18FD1BF9B79E00D05813 /* stackshot-sample-tailspin */ = {isa = PBXFileReference; lastKnownFileType = file; name = "stackshot-sample-tailspin"; path = "tests/stackshot-sample-tailspin"; sourceTree = SOURCE_ROOT; }; + 
08CF18FE1BF9B79E00D05813 /* stackshot-sample-tailspin.plist.gz */ = {isa = PBXFileReference; lastKnownFileType = archive.gzip; name = "stackshot-sample-tailspin.plist.gz"; path = "tests/stackshot-sample-tailspin.plist.gz"; sourceTree = SOURCE_ROOT; }; + 08F1501D1BFEA7AC00F2C89C /* libz.dylib */ = {isa = PBXFileReference; lastKnownFileType = "compiled.mach-o.dylib"; name = libz.dylib; path = Platforms/MacOSX.platform/Developer/SDKs/MacOSX10.12.sdk/usr/lib/libz.dylib; sourceTree = DEVELOPER_DIR; }; + 1368F0831C87E06300940FC6 /* exitreason-codesigning */ = {isa = PBXFileReference; lastKnownFileType = file; name = "exitreason-codesigning"; path = "tests/exitreason-codesigning"; sourceTree = SOURCE_ROOT; }; + 1368F0841C87E06300940FC6 /* exitreason-codesigning.plist.gz */ = {isa = PBXFileReference; lastKnownFileType = archive.gzip; name = "exitreason-codesigning.plist.gz"; path = "tests/exitreason-codesigning.plist.gz"; sourceTree = SOURCE_ROOT; }; + 13A79CA81CF8C5D200FFC181 /* stackshot-with-kcid */ = {isa = PBXFileReference; lastKnownFileType = file; name = "stackshot-with-kcid"; path = "tests/stackshot-with-kcid"; sourceTree = SOURCE_ROOT; }; + 13A79CA91CF8C5D200FFC181 /* stackshot-with-kcid.plist.gz */ = {isa = PBXFileReference; lastKnownFileType = archive.gzip; name = "stackshot-with-kcid.plist.gz"; path = "tests/stackshot-with-kcid.plist.gz"; sourceTree = SOURCE_ROOT; }; + 13AF287B1C4A0D6A000795E2 /* corpse-twr-sample */ = {isa = PBXFileReference; lastKnownFileType = file; name = "corpse-twr-sample"; path = "tests/corpse-twr-sample"; sourceTree = SOURCE_ROOT; }; + 13AF287C1C4A0D6A000795E2 /* corpse-twr-sample.plist.gz */ = {isa = PBXFileReference; lastKnownFileType = archive.gzip; name = "corpse-twr-sample.plist.gz"; path = "tests/corpse-twr-sample.plist.gz"; sourceTree = SOURCE_ROOT; }; + 13AF287E1C4A0D6A000795E2 /* test-twr-sample */ = {isa = PBXFileReference; lastKnownFileType = file; name = "test-twr-sample"; path = "tests/test-twr-sample"; sourceTree = 
SOURCE_ROOT; }; + 13CC08421CB97F8A00EA6069 /* stackshot-fault-stats */ = {isa = PBXFileReference; lastKnownFileType = file; name = "stackshot-fault-stats"; path = "tests/stackshot-fault-stats"; sourceTree = SOURCE_ROOT; }; + 13CC08431CB97F8A00EA6069 /* stackshot-fault-stats.plist.gz */ = {isa = PBXFileReference; lastKnownFileType = archive.gzip; name = "stackshot-fault-stats.plist.gz"; path = "tests/stackshot-fault-stats.plist.gz"; sourceTree = SOURCE_ROOT; }; + 13DBA2661CAB1ACB00227EB2 /* stackshot-sample-sharedcachev2.plist.gz */ = {isa = PBXFileReference; lastKnownFileType = archive.gzip; name = "stackshot-sample-sharedcachev2.plist.gz"; path = "tests/stackshot-sample-sharedcachev2.plist.gz"; sourceTree = SOURCE_ROOT; }; + 13DBA2691CAB1B9C00227EB2 /* stackshot-sample-sharedcachev2 */ = {isa = PBXFileReference; lastKnownFileType = file; name = "stackshot-sample-sharedcachev2"; path = "tests/stackshot-sample-sharedcachev2"; sourceTree = SOURCE_ROOT; }; + 13EADC171C4DCDA100468D97 /* test-twr-sample.plist.gz */ = {isa = PBXFileReference; lastKnownFileType = archive.gzip; name = "test-twr-sample.plist.gz"; path = "tests/test-twr-sample.plist.gz"; sourceTree = SOURCE_ROOT; }; + 13F3DA9B1C7C1BE700ACFFCC /* corpse-twr-sample-v2 */ = {isa = PBXFileReference; lastKnownFileType = file; name = "corpse-twr-sample-v2"; path = "tests/corpse-twr-sample-v2"; sourceTree = SOURCE_ROOT; }; + 13F3DA9D1C7C1C6000ACFFCC /* corpse-twr-sample-v2.plist.gz */ = {isa = PBXFileReference; lastKnownFileType = archive.gzip; name = "corpse-twr-sample-v2.plist.gz"; path = "tests/corpse-twr-sample-v2.plist.gz"; sourceTree = SOURCE_ROOT; }; C91C93C71ACB58B700119B60 /* libkdd.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libkdd.a; sourceTree = BUILT_PRODUCTS_DIR; }; C91C93CA1ACB58B700119B60 /* kdd.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = kdd.h; sourceTree = ""; }; C91C93CC1ACB58B700119B60 /* kdd.m */ = {isa = 
PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = kdd.m; sourceTree = ""; }; @@ -26,10 +157,30 @@ C91C93E21ACB598700119B60 /* KCDStructTypeDescription.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = KCDStructTypeDescription.h; sourceTree = ""; }; C91C93E31ACB598700119B60 /* KCDStructTypeDescription.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = KCDStructTypeDescription.m; sourceTree = ""; }; C9C5C68B1ACDAFDB00BE0E5E /* kcdtypes.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = kcdtypes.c; sourceTree = ""; }; + C9D7B53D1D1B41D700F1019D /* xnupost_testconfig-sample.plist.gz */ = {isa = PBXFileReference; lastKnownFileType = archive.gzip; path = "xnupost_testconfig-sample.plist.gz"; sourceTree = ""; }; + C9D7B53E1D1B41D700F1019D /* xnupost_testconfig-sample */ = {isa = PBXFileReference; lastKnownFileType = file; path = "xnupost_testconfig-sample"; sourceTree = ""; }; C9DE39131ACB5A540020F4A3 /* kcdata_core.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = kcdata_core.m; sourceTree = ""; }; /* End PBXFileReference section */ /* Begin PBXFrameworksBuildPhase section */ + 08603F311BF69EDE007D3784 /* Frameworks */ = { + isa = PBXFrameworksBuildPhase; + buildActionMask = 2147483647; + files = ( + 08238A3B1BFEB5450053190C /* libz.dylib in Frameworks */, + 08603F391BF69EDE007D3784 /* libkdd.a in Frameworks */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; + 086395AF1BF5655D005ED913 /* Frameworks */ = { + isa = PBXFrameworksBuildPhase; + buildActionMask = 2147483647; + files = ( + 08F1501E1BFEA7AC00F2C89C /* libz.dylib in Frameworks */, + 086395B91BF565A2005ED913 /* libkdd.a in Frameworks */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; C91C93C41ACB58B700119B60 /* Frameworks */ = { isa = PBXFrameworksBuildPhase; buildActionMask = 2147483647; @@ -40,23 +191,59 @@ /* End 
PBXFrameworksBuildPhase section */ /* Begin PBXGroup section */ - C91C93BE1ACB58B700119B60 = { - isa = PBXGroup; - children = ( - C91C93C91ACB58B700119B60 /* kcdata */, - C91C93C81ACB58B700119B60 /* Products */, - ); - sourceTree = ""; - }; - C91C93C81ACB58B700119B60 /* Products */ = { + 08603F351BF69EDE007D3784 /* Tests */ = { isa = PBXGroup; children = ( - C91C93C71ACB58B700119B60 /* libkdd.a */, + C9D7B53D1D1B41D700F1019D /* xnupost_testconfig-sample.plist.gz */, + C9D7B53E1D1B41D700F1019D /* xnupost_testconfig-sample */, + 13A79CA81CF8C5D200FFC181 /* stackshot-with-kcid */, + 13A79CA91CF8C5D200FFC181 /* stackshot-with-kcid.plist.gz */, + 13CC08421CB97F8A00EA6069 /* stackshot-fault-stats */, + 13CC08431CB97F8A00EA6069 /* stackshot-fault-stats.plist.gz */, + 13DBA2691CAB1B9C00227EB2 /* stackshot-sample-sharedcachev2 */, + 13DBA2661CAB1ACB00227EB2 /* stackshot-sample-sharedcachev2.plist.gz */, + 1368F0831C87E06300940FC6 /* exitreason-codesigning */, + 1368F0841C87E06300940FC6 /* exitreason-codesigning.plist.gz */, + 13F3DA9D1C7C1C6000ACFFCC /* corpse-twr-sample-v2.plist.gz */, + 13F3DA9B1C7C1BE700ACFFCC /* corpse-twr-sample-v2 */, + 13EADC171C4DCDA100468D97 /* test-twr-sample.plist.gz */, + 13AF287B1C4A0D6A000795E2 /* corpse-twr-sample */, + 13AF287C1C4A0D6A000795E2 /* corpse-twr-sample.plist.gz */, + 13AF287E1C4A0D6A000795E2 /* test-twr-sample */, + 08603F3F1BF69F44007D3784 /* kdd_bridge.h */, + 0843EE911BF6AFB700CD4150 /* stackshot-sample */, + 08603F361BF69EDE007D3784 /* Tests.swift */, + 0843EE931BF6BAB400CD4150 /* stackshot-sample.plist.gz */, + 08B480741BF8294E00B4AAE0 /* stackshot-sample-new-arrays */, + 08B480751BF8294E00B4AAE0 /* stackshot-sample-new-arrays.plist.gz */, + 08B480761BF8294E00B4AAE0 /* stackshot-sample-old-arrays */, + 08B480771BF8294E00B4AAE0 /* stackshot-sample-old-arrays.plist.gz */, + 08B4807F1BF864C800B4AAE0 /* delta-stackshot-sample-new-arrays */, + 08B480801BF864C800B4AAE0 /* delta-stackshot-sample-new-arrays.plist.gz */, + 
08B480811BF864C800B4AAE0 /* delta-stackshot-sample-old-arrays */, + 08B480821BF864C800B4AAE0 /* delta-stackshot-sample-old-arrays.plist.gz */, + 08CF18FD1BF9B79E00D05813 /* stackshot-sample-tailspin */, + 08CF18FE1BF9B79E00D05813 /* stackshot-sample-tailspin.plist.gz */, + 0860F8781BFC3845007E1301 /* stackshot-sample-tailspin-2 */, + 0860F8791BFC3845007E1301 /* stackshot-sample-tailspin-2.plist.gz */, + 08B9297C1C1CCE7E003B1703 /* stackshot-sample-ths-thread-t */, + 08B9297D1C1CCE7E003B1703 /* stackshot-sample-ths-thread-t.plist.gz */, + 081EDD361C23854500A1C138 /* stackshot-sample-cputime */, + 081EDD371C23854500A1C138 /* stackshot-sample-cputime.plist.gz */, + 08B480891BF9473800B4AAE0 /* corpse-sample */, + 08B4808A1BF9473800B4AAE0 /* corpse-sample.plist.gz */, + 08C9D83B1BFFF8D500DF6C05 /* exitreason-sample */, + 08C9D83C1BFFF8D500DF6C05 /* exitreason-sample.plist.gz */, + 081725D31C3F475200371A54 /* stackshot-sample-duration */, + 081725D41C3F475200371A54 /* stackshot-sample-duration.plist.gz */, + 08A4C94D1C470F0900D5F010 /* nested-sample */, + 08A4C94E1C470F0900D5F010 /* nested-sample.plist */, + 08603F381BF69EDE007D3784 /* Info.plist */, ); - name = Products; + path = tests; sourceTree = ""; }; - C91C93C91ACB58B700119B60 /* kcdata */ = { + 08DE68351BFFB70900BC682F /* libkdd */ = { isa = PBXGroup; children = ( C9C5C68B1ACDAFDB00BE0E5E /* kcdtypes.c */, @@ -65,10 +252,43 @@ C91C93E11ACB598700119B60 /* KCDBasicTypeDescription.m */, C91C93E21ACB598700119B60 /* KCDStructTypeDescription.h */, C91C93E31ACB598700119B60 /* KCDStructTypeDescription.m */, + 08A4C94A1C47019E00D5F010 /* KCDEmbeddedBufferDescription.h */, + 08A4C94B1C4701B800D5F010 /* KCDEmbeddedBufferDescription.m */, C91C93CA1ACB58B700119B60 /* kdd.h */, C91C93CC1ACB58B700119B60 /* kdd.m */, ); - path = kcdata; + name = libkdd; + sourceTree = ""; + }; + 08DE68361BFFB71D00BC682F /* kdd */ = { + isa = PBXGroup; + children = ( + 086395B41BF5655D005ED913 /* kdd_main.m */, + ); + name = kdd; + sourceTree = 
""; + }; + C91C93BE1ACB58B700119B60 = { + isa = PBXGroup; + children = ( + 08DE68361BFFB71D00BC682F /* kdd */, + 08DE68351BFFB70900BC682F /* libkdd */, + 08F1501D1BFEA7AC00F2C89C /* libz.dylib */, + 0834719D1BF7D05400D67253 /* kcdata.h */, + 08B480871BF92DFB00B4AAE0 /* kcdata.py */, + 08603F351BF69EDE007D3784 /* Tests */, + C91C93C81ACB58B700119B60 /* Products */, + ); + sourceTree = ""; + }; + C91C93C81ACB58B700119B60 /* Products */ = { + isa = PBXGroup; + children = ( + C91C93C71ACB58B700119B60 /* libkdd.a */, + 086395B21BF5655D005ED913 /* kdd */, + 08603F341BF69EDE007D3784 /* tests.xctest */, + ); + name = Products; sourceTree = ""; }; /* End PBXGroup section */ @@ -79,6 +299,7 @@ buildActionMask = 2147483647; files = ( C91C93CB1ACB58B700119B60 /* kdd.h in Headers */, + 0834719E1BF7D05400D67253 /* kcdata.h in Headers */, C91C93E41ACB598700119B60 /* KCDBasicTypeDescription.h in Headers */, C91C93E61ACB598700119B60 /* KCDStructTypeDescription.h in Headers */, ); @@ -87,9 +308,45 @@ /* End PBXHeadersBuildPhase section */ /* Begin PBXNativeTarget section */ - C91C93C61ACB58B700119B60 /* kdd */ = { + 08603F331BF69EDE007D3784 /* tests */ = { isa = PBXNativeTarget; - buildConfigurationList = C91C93DA1ACB58B700119B60 /* Build configuration list for PBXNativeTarget "kdd" */; + buildConfigurationList = 08603F3E1BF69EDE007D3784 /* Build configuration list for PBXNativeTarget "tests" */; + buildPhases = ( + 08603F301BF69EDE007D3784 /* Sources */, + 08603F311BF69EDE007D3784 /* Frameworks */, + 08603F321BF69EDE007D3784 /* Resources */, + ); + buildRules = ( + ); + dependencies = ( + 08603F3B1BF69EDE007D3784 /* PBXTargetDependency */, + ); + name = tests; + productName = Tests; + productReference = 08603F341BF69EDE007D3784 /* tests.xctest */; + productType = "com.apple.product-type.bundle.unit-test"; + }; + 086395B11BF5655D005ED913 /* kdd */ = { + isa = PBXNativeTarget; + buildConfigurationList = 086395B61BF5655D005ED913 /* Build configuration list for PBXNativeTarget "kdd" 
*/; + buildPhases = ( + 086395AE1BF5655D005ED913 /* Sources */, + 086395AF1BF5655D005ED913 /* Frameworks */, + 086395B01BF5655D005ED913 /* CopyFiles */, + ); + buildRules = ( + ); + dependencies = ( + 086395BB1BF565AB005ED913 /* PBXTargetDependency */, + ); + name = kdd; + productName = kdd; + productReference = 086395B21BF5655D005ED913 /* kdd */; + productType = "com.apple.product-type.tool"; + }; + C91C93C61ACB58B700119B60 /* libkdd */ = { + isa = PBXNativeTarget; + buildConfigurationList = C91C93DA1ACB58B700119B60 /* Build configuration list for PBXNativeTarget "libkdd" */; buildPhases = ( C91C93C31ACB58B700119B60 /* Sources */, C91C93C41ACB58B700119B60 /* Frameworks */, @@ -99,7 +356,7 @@ ); dependencies = ( ); - name = kdd; + name = libkdd; productName = kdd; productReference = C91C93C71ACB58B700119B60 /* libkdd.a */; productType = "com.apple.product-type.library.static"; @@ -110,9 +367,17 @@ C91C93BF1ACB58B700119B60 /* Project object */ = { isa = PBXProject; attributes = { - LastUpgradeCheck = 0700; + LastSwiftUpdateCheck = 0730; + LastUpgradeCheck = 0730; ORGANIZATIONNAME = "Vishal Patel"; TargetAttributes = { + 08603F331BF69EDE007D3784 = { + CreatedOnToolsVersion = 7.3; + LastSwiftMigration = 0800; + }; + 086395B11BF5655D005ED913 = { + CreatedOnToolsVersion = 7.3; + }; C91C93C61ACB58B700119B60 = { CreatedOnToolsVersion = 7.0; }; @@ -130,12 +395,83 @@ projectDirPath = ""; projectRoot = ""; targets = ( - C91C93C61ACB58B700119B60 /* kdd */, + C91C93C61ACB58B700119B60 /* libkdd */, + 086395B11BF5655D005ED913 /* kdd */, + 08603F331BF69EDE007D3784 /* tests */, ); }; /* End PBXProject section */ +/* Begin PBXResourcesBuildPhase section */ + 08603F321BF69EDE007D3784 /* Resources */ = { + isa = PBXResourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + 08A4C94F1C470F1C00D5F010 /* nested-sample in Resources */, + 08A4C9501C470F1C00D5F010 /* nested-sample.plist in Resources */, + 13D6C5D21C4DDDBE005E617C /* test-twr-sample in Resources */, + 
13D6C5D01C4DDDB6005E617C /* corpse-twr-sample in Resources */, + C9D7B5401D1B41D700F1019D /* xnupost_testconfig-sample in Resources */, + 081725D51C3F476500371A54 /* stackshot-sample-duration in Resources */, + 081725D61C3F476500371A54 /* stackshot-sample-duration.plist.gz in Resources */, + 081EDD381C23855700A1C138 /* stackshot-sample-cputime in Resources */, + 081EDD391C23855700A1C138 /* stackshot-sample-cputime.plist.gz in Resources */, + 13A79CAA1CF8C5D600FFC181 /* stackshot-with-kcid in Resources */, + 08B9297E1C1CCE8D003B1703 /* stackshot-sample-ths-thread-t in Resources */, + 08B9297F1C1CCE8D003B1703 /* stackshot-sample-ths-thread-t.plist.gz in Resources */, + 0860F87A1BFC3857007E1301 /* stackshot-sample-tailspin-2 in Resources */, + 13CC08451CB97F9000EA6069 /* stackshot-fault-stats.plist.gz in Resources */, + 13F3DA9E1C7C1C6600ACFFCC /* corpse-twr-sample-v2.plist.gz in Resources */, + 0860F87B1BFC3857007E1301 /* stackshot-sample-tailspin-2.plist.gz in Resources */, + 08CF18FF1BF9B7B100D05813 /* stackshot-sample-tailspin in Resources */, + 1368F0861C87E06C00940FC6 /* exitreason-codesigning in Resources */, + 13DBA26A1CAB1BA000227EB2 /* stackshot-sample-sharedcachev2 in Resources */, + C9D7B53F1D1B41D700F1019D /* xnupost_testconfig-sample.plist.gz in Resources */, + 13DBA2681CAB1AD600227EB2 /* stackshot-sample-sharedcachev2.plist.gz in Resources */, + 08CF19001BF9B7B100D05813 /* stackshot-sample-tailspin.plist.gz in Resources */, + 13CC08441CB97F8D00EA6069 /* stackshot-fault-stats in Resources */, + 13F3DA9C1C7C1BEE00ACFFCC /* corpse-twr-sample-v2 in Resources */, + 13D6C5D31C4DDE0D005E617C /* test-twr-sample.plist.gz in Resources */, + 1368F0851C87E06A00940FC6 /* exitreason-codesigning.plist.gz in Resources */, + 08C9D83D1BFFF8E100DF6C05 /* exitreason-sample in Resources */, + 08C9D83E1BFFF8E100DF6C05 /* exitreason-sample.plist.gz in Resources */, + 08B4808B1BF9474A00B4AAE0 /* corpse-sample in Resources */, + 13D6C5D11C4DDDB8005E617C /* 
corpse-twr-sample.plist.gz in Resources */, + 08B4808C1BF9474A00B4AAE0 /* corpse-sample.plist.gz in Resources */, + 08B480881BF92E0500B4AAE0 /* kcdata.py in Resources */, + 08B480831BF864D300B4AAE0 /* delta-stackshot-sample-new-arrays in Resources */, + 08B480841BF864D300B4AAE0 /* delta-stackshot-sample-new-arrays.plist.gz in Resources */, + 08B480851BF864D300B4AAE0 /* delta-stackshot-sample-old-arrays in Resources */, + 13A79CAB1CF8C5D600FFC181 /* stackshot-with-kcid.plist.gz in Resources */, + 08B480861BF864D300B4AAE0 /* delta-stackshot-sample-old-arrays.plist.gz in Resources */, + 08B480781BF8297500B4AAE0 /* stackshot-sample-new-arrays in Resources */, + 08B480791BF8297500B4AAE0 /* stackshot-sample-new-arrays.plist.gz in Resources */, + 08B4807A1BF8297500B4AAE0 /* stackshot-sample-old-arrays in Resources */, + 08B4807B1BF8297500B4AAE0 /* stackshot-sample-old-arrays.plist.gz in Resources */, + 0843EE941BF6BAC100CD4150 /* stackshot-sample.plist.gz in Resources */, + 0843EE921BF6AFC600CD4150 /* stackshot-sample in Resources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXResourcesBuildPhase section */ + /* Begin PBXSourcesBuildPhase section */ + 08603F301BF69EDE007D3784 /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + 08603F371BF69EDE007D3784 /* Tests.swift in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; + 086395AE1BF5655D005ED913 /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + 086395B51BF5655D005ED913 /* kdd_main.m in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; C91C93C31ACB58B700119B60 /* Sources */ = { isa = PBXSourcesBuildPhase; buildActionMask = 2147483647; @@ -145,12 +481,85 @@ C91C93E51ACB598700119B60 /* KCDBasicTypeDescription.m in Sources */, C91C93CD1ACB58B700119B60 /* kdd.m in Sources */, C9C5C68C1ACDAFDB00BE0E5E /* kcdtypes.c in Sources */, + 08A4C94C1C4701B800D5F010 /* KCDEmbeddedBufferDescription.m in 
Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; /* End PBXSourcesBuildPhase section */ +/* Begin PBXTargetDependency section */ + 08603F3B1BF69EDE007D3784 /* PBXTargetDependency */ = { + isa = PBXTargetDependency; + target = C91C93C61ACB58B700119B60 /* libkdd */; + targetProxy = 08603F3A1BF69EDE007D3784 /* PBXContainerItemProxy */; + }; + 086395BB1BF565AB005ED913 /* PBXTargetDependency */ = { + isa = PBXTargetDependency; + target = C91C93C61ACB58B700119B60 /* libkdd */; + targetProxy = 086395BA1BF565AB005ED913 /* PBXContainerItemProxy */; + }; +/* End PBXTargetDependency section */ + /* Begin XCBuildConfiguration section */ + 08603F3C1BF69EDE007D3784 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + CLANG_ENABLE_MODULES = YES; + CODE_SIGN_IDENTITY = "-"; + COMBINE_HIDPI_IMAGES = YES; + ENABLE_TESTABILITY = YES; + INFOPLIST_FILE = tests/Info.plist; + LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/../Frameworks @loader_path/../Frameworks"; + MACOSX_DEPLOYMENT_TARGET = 10.11; + PRODUCT_BUNDLE_IDENTIFIER = apple.com.Tests; + PRODUCT_NAME = "$(TARGET_NAME)"; + SDKROOT = macosx; + SWIFT_OBJC_BRIDGING_HEADER = tests/kdd_bridge.h; + SWIFT_OPTIMIZATION_LEVEL = "-Onone"; + SWIFT_VERSION = 2.3; + }; + name = Debug; + }; + 08603F3D1BF69EDE007D3784 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + CLANG_ENABLE_MODULES = YES; + CODE_SIGN_IDENTITY = "-"; + COMBINE_HIDPI_IMAGES = YES; + INFOPLIST_FILE = tests/Info.plist; + LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/../Frameworks @loader_path/../Frameworks"; + MACOSX_DEPLOYMENT_TARGET = 10.11; + PRODUCT_BUNDLE_IDENTIFIER = apple.com.Tests; + PRODUCT_NAME = "$(TARGET_NAME)"; + SDKROOT = macosx; + SWIFT_OBJC_BRIDGING_HEADER = tests/kdd_bridge.h; + SWIFT_VERSION = 2.3; + }; + name = Release; + }; + 086395B71BF5655D005ED913 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + CLANG_ENABLE_MODULES = YES; + CODE_SIGN_IDENTITY = "-"; + 
ENABLE_TESTABILITY = YES; + MACOSX_DEPLOYMENT_TARGET = 10.11; + PRODUCT_NAME = "$(TARGET_NAME)"; + SDKROOT = macosx; + }; + name = Debug; + }; + 086395B81BF5655D005ED913 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + CLANG_ENABLE_MODULES = YES; + CODE_SIGN_IDENTITY = "-"; + MACOSX_DEPLOYMENT_TARGET = 10.11; + PRODUCT_NAME = "$(TARGET_NAME)"; + SDKROOT = macosx; + }; + name = Release; + }; C91C93D81ACB58B700119B60 /* Debug */ = { isa = XCBuildConfiguration; buildSettings = { @@ -163,6 +572,7 @@ CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; CLANG_WARN_EMPTY_BODY = YES; CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_IMPLICIT_SIGN_CONVERSION = YES; CLANG_WARN_INT_CONVERSION = YES; CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; CLANG_WARN_UNREACHABLE_CODE = YES; @@ -170,6 +580,7 @@ COPY_PHASE_STRIP = NO; DEBUG_INFORMATION_FORMAT = dwarf; ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_TESTABILITY = YES; GCC_C_LANGUAGE_STANDARD = gnu99; GCC_DYNAMIC_NO_PIC = NO; GCC_NO_COMMON_BLOCKS = YES; @@ -185,9 +596,11 @@ GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; GCC_WARN_UNUSED_FUNCTION = YES; GCC_WARN_UNUSED_VARIABLE = YES; + HEADER_SEARCH_PATHS = "$(SRCROOT)"; MTL_ENABLE_DEBUG_INFO = YES; ONLY_ACTIVE_ARCH = YES; OTHER_CFLAGS = ""; + SDKROOT = macosx.internal; }; name = Debug; }; @@ -203,6 +616,7 @@ CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; CLANG_WARN_EMPTY_BODY = YES; CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_IMPLICIT_SIGN_CONVERSION = YES; CLANG_WARN_INT_CONVERSION = YES; CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; CLANG_WARN_UNREACHABLE_CODE = YES; @@ -219,14 +633,17 @@ GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; GCC_WARN_UNUSED_FUNCTION = YES; GCC_WARN_UNUSED_VARIABLE = YES; + HEADER_SEARCH_PATHS = "$(SRCROOT)"; MTL_ENABLE_DEBUG_INFO = NO; OTHER_CFLAGS = ""; + SDKROOT = macosx.internal; }; name = Release; }; C91C93DB1ACB58B700119B60 /* Debug */ = { isa = XCBuildConfiguration; buildSettings = { + COMBINE_HIDPI_IMAGES = YES; EXECUTABLE_PREFIX = lib; 
OTHER_CFLAGS = "-I$(SDKROOT)/System/Library/Frameworks/System.framework/PrivateHeaders"; PRODUCT_NAME = kdd; @@ -236,6 +653,7 @@ C91C93DC1ACB58B700119B60 /* Release */ = { isa = XCBuildConfiguration; buildSettings = { + COMBINE_HIDPI_IMAGES = YES; EXECUTABLE_PREFIX = lib; OTHER_CFLAGS = "-I$(SDKROOT)/System/Library/Frameworks/System.framework/PrivateHeaders"; PRODUCT_NAME = kdd; @@ -245,6 +663,24 @@ /* End XCBuildConfiguration section */ /* Begin XCConfigurationList section */ + 08603F3E1BF69EDE007D3784 /* Build configuration list for PBXNativeTarget "tests" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + 08603F3C1BF69EDE007D3784 /* Debug */, + 08603F3D1BF69EDE007D3784 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + 086395B61BF5655D005ED913 /* Build configuration list for PBXNativeTarget "kdd" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + 086395B71BF5655D005ED913 /* Debug */, + 086395B81BF5655D005ED913 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; C91C93C21ACB58B700119B60 /* Build configuration list for PBXProject "kdd" */ = { isa = XCConfigurationList; buildConfigurations = ( @@ -254,7 +690,7 @@ defaultConfigurationIsVisible = 0; defaultConfigurationName = Release; }; - C91C93DA1ACB58B700119B60 /* Build configuration list for PBXNativeTarget "kdd" */ = { + C91C93DA1ACB58B700119B60 /* Build configuration list for PBXNativeTarget "libkdd" */ = { isa = XCConfigurationList; buildConfigurations = ( C91C93DB1ACB58B700119B60 /* Debug */, diff --git a/libkdd/kdd_main.m b/libkdd/kdd_main.m new file mode 100644 index 000000000..c0984f790 --- /dev/null +++ b/libkdd/kdd_main.m @@ -0,0 +1,126 @@ +// +// main.m +// kdd command +// +// Created by Lawrence D'Anna on 11/9/15. +// Copyright © 2015 Apple Inc. All rights reserved. 
+// + +#import +#include +#include +#include +#import "kdd.h" + +void usage(char *const* argv) { + fprintf(stderr, "usage: %s [-p] FILE\n", argv[0]); + exit(1); +} + +int main(int argc, char *const*argv) { + + int c ; + int plist = 0; + + while ((c = getopt(argc, argv, "p")) != EOF) { + switch(c) { + case 'p': + plist = TRUE; + break; + case '?': + case 'h': + default: + usage(argv); + break; + } + } + + if (optind != argc -1) { + usage(argv); + } + + NSError *error = nil; + NSData *data; + + if (0 == strcmp(argv[optind], "-")) { + data = [[NSFileHandle fileHandleWithStandardInput] readDataToEndOfFile]; + } else { + data = [NSData dataWithContentsOfFile:[NSString stringWithUTF8String:argv[optind]] + options:NSDataReadingMappedIfSafe + error:&error]; + } + + if (!data || error) { + NSLog(@"couldn't read data: %@", error); + return 1; + } + + if (data.length > UINT32_MAX) { + NSLog(@"data too big"); + return 1; + } + + NSDictionary *dict = parseKCDataBuffer((void*)data.bytes, (uint32_t)data.length, &error); + + if (error && error.code == KERN_INVALID_VALUE) { + uint8_t buffer[100]; + z_stream stream; + bzero(&stream, sizeof(stream)); + stream.next_in = (void*) data.bytes; + stream.avail_in = data.length; + stream.next_out = buffer; + stream.avail_out = sizeof(buffer); + inflateInit2(&stream, 16+MAX_WBITS); + NSMutableData *inflated = [[NSMutableData alloc] init]; + while (1) { + int z = inflate(&stream, Z_NO_FLUSH); + if (z == Z_OK || z == Z_STREAM_END) { + [inflated appendBytes:buffer length:sizeof(buffer) - stream.avail_out]; + stream.avail_out = sizeof(buffer); + stream.next_out = buffer; + if (z == Z_STREAM_END) { + break; + } + } else { + inflated = nil; + break; + } + } + if (inflated) { + error = nil; + dict = parseKCDataBuffer((void*)inflated.bytes, (uint32_t)inflated.length, &error); + } + } + + if (error && error.code == KERN_INVALID_VALUE) { + NSData *decoded = [[NSData alloc] initWithBase64EncodedData:data 
options:NSDataBase64DecodingIgnoreUnknownCharacters]; + if (decoded) { + error = nil; + dict = parseKCDataBuffer((void*)decoded.bytes, (uint32_t)decoded.length, &error); + } + } + + if (!dict || error) { + NSLog(@"error parsing kcdata: %@", error); + return 1; + } + + if (plist) { + NSData *plist = [NSPropertyListSerialization dataWithPropertyList:dict + format:NSPropertyListXMLFormat_v1_0 + options:0 + error:&error]; + if (!plist || error) { + NSLog(@"couldn't write plist: %@", error); + return 1; + } + + fwrite(plist.bytes, plist.length, 1, stdout); + + } else { + puts([[NSString stringWithFormat: @"%@", dict] UTF8String]); + } + + + return 0; +} diff --git a/libkdd/tests/Info.plist b/libkdd/tests/Info.plist new file mode 100644 index 000000000..ba72822e8 --- /dev/null +++ b/libkdd/tests/Info.plist @@ -0,0 +1,24 @@ + + + + + CFBundleDevelopmentRegion + en + CFBundleExecutable + $(EXECUTABLE_NAME) + CFBundleIdentifier + $(PRODUCT_BUNDLE_IDENTIFIER) + CFBundleInfoDictionaryVersion + 6.0 + CFBundleName + $(PRODUCT_NAME) + CFBundlePackageType + BNDL + CFBundleShortVersionString + 1.0 + CFBundleSignature + ???? + CFBundleVersion + 1 + + diff --git a/libkdd/tests/Tests.swift b/libkdd/tests/Tests.swift new file mode 100644 index 000000000..ec88ed21f --- /dev/null +++ b/libkdd/tests/Tests.swift @@ -0,0 +1,1336 @@ +// +// Tests.swift +// +// Some of these tests here verify that kdd is able to parse old +// kcdata files and generate the correct output. To do so, we include +// compressed versions of the raw kcdata and as well as the expected +// plist output. +// +// NOTE: If you're adding sample data/plist files, you'll need to first +// add them to the project and then make sure each is part of the +// tests target. +// +// Other tests verify the expected behavior of libkdd for certain +// situations. +// +// + +import XCTest +import Foundation + +// Swift's bridging to uuid_t is awkward. 
+ +func nsuuid2uuid_t(nsuuid : NSUUID) -> uuid_t { + let dat = nsuuid2array(nsuuid) + return nsarray2uuid(dat) +} + +func nsarray2uuid(x : AnyObject) -> uuid_t { + let a = x as! NSArray + return uuid_t(UInt8(a[0] as! Int), + UInt8(a[1] as! Int), + UInt8(a[2] as! Int), + UInt8(a[3] as! Int), + UInt8(a[4] as! Int), + UInt8(a[5] as! Int), + UInt8(a[6] as! Int), + UInt8(a[7] as! Int), + UInt8(a[8] as! Int), + UInt8(a[9] as! Int), + UInt8(a[10] as! Int), + UInt8(a[11] as! Int), + UInt8(a[12] as! Int), + UInt8(a[13] as! Int), + UInt8(a[14] as! Int), + UInt8(a[15] as! Int)) +} + +func nsuuid2array(uuid : NSUUID) -> [Int] { + var ret = [Int]() + let ptr = UnsafeMutablePointer.alloc(16) + + defer { ptr.dealloc(16) } + + uuid.getUUIDBytes(ptr) + for i in 0..<16 { + ret.append(Int(ptr[i])) + } + return ret +} + +func decompress(data:NSData) throws -> NSData { + var stream = z_stream(next_in: nil, avail_in: 0, total_in: 0, next_out: nil, avail_out: 0, total_out: 0, msg: nil, state: nil, zalloc: nil, zfree: nil, opaque: nil, data_type: 0, adler: 0, reserved: 0) + + let bufsize : Int = 1000 + let buffer = UnsafeMutablePointer.alloc(bufsize) + defer { buffer.dealloc(bufsize) } + let output = NSMutableData() + stream.next_out = buffer + stream.avail_out = UInt32(bufsize) + stream.next_in = UnsafeMutablePointer(data.bytes) + stream.avail_in = UInt32(data.length) + inflateInit2_(&stream, 16+MAX_WBITS, ZLIB_VERSION, Int32(sizeof(z_stream))) + + while (true) { + let z = inflate(&stream, Z_NO_FLUSH); + if (z == Z_OK || z == Z_STREAM_END) { + output.appendBytes(buffer, length: bufsize - Int(stream.avail_out)) + stream.avail_out = UInt32(bufsize) + stream.next_out = buffer + if (z == Z_STREAM_END) { + return output; + } + } else { + throw NSError(domain: "zlib", code: Int(z), userInfo: nil) + } + } +} + + + +class Tests: XCTestCase { + + override func setUp() { + super.setUp() + continueAfterFailure = false + } + + override func tearDown() { + // Put teardown code here. 
This method is called after the invocation of each test method in the class. + super.tearDown() + } + + func parseBuffer(buffer:NSData) throws -> NSDictionary { + var error : NSError? + guard let dict = parseKCDataBuffer(UnsafeMutablePointer(buffer.bytes), UInt32(buffer.length), &error) + else { + XCTAssert(error != nil) + throw error! + } + return dict + } + + func testPaddingFlags(pad : Int) { + let buffer = NSMutableData(capacity:1000)! + + var item = kcdata_item() + + item.type = KCDATA_BUFFER_BEGIN_CRASHINFO + item.flags = 0 + item.size = 0 + buffer.appendBytes(&item, length: sizeof(kcdata_item)) + + item.type = UInt32(KCDATA_TYPE_LIBRARY_LOADINFO) + item.flags = UInt64(pad) + item.size = UInt32(sizeof(dyld_uuid_info_32)) + buffer.appendBytes(&item, length: sizeof(kcdata_item)) + + let uuid = NSUUID(UUIDString: "de305d54-75b4-431b-adb2-eb6b9e546014")! + + var payload = dyld_uuid_info_32(imageLoadAddress: 42, imageUUID: nsuuid2uuid_t(uuid)) + buffer.appendBytes(&payload, length:sizeof(dyld_uuid_info_32)) + + item.type = KCDATA_TYPE_BUFFER_END + item.flags = 0 + item.size = 0 + buffer.appendBytes(&item, length: sizeof(kcdata_item)) + + guard let dict = try? self.parseBuffer(buffer) + else { XCTFail(); return; } + + var uuidarray = nsuuid2array(uuid) + for _ in 0.. NSData? { + guard let filename = NSBundle(forClass: self.classForCoder).pathForResource(name, ofType: nil) + else { return nil } + return NSData(contentsOfFile:filename)! + } + + func testSampleStackshot(name : String) { + // check that we agree with sample file + + guard let sampledata = self.dataWithResource(name) + else { XCTFail(); return } + var dict : NSDictionary? + + dict = try? self.parseBuffer(sampledata) + + if (dict == nil) { + if let decoded = NSData(base64EncodedData: sampledata, options:.IgnoreUnknownCharacters) { + dict = try? self.parseBuffer(decoded) + } + } + + if (dict == nil) { + if let decompressed = try? decompress(sampledata) { + dict = try? 
self.parseBuffer(decompressed) + } + } + + if (dict == nil) { + XCTFail(); return; + } + + guard let plistdata = self.dataWithResource(name + ".plist.gz") ?? + self.dataWithResource(name + ".plist") + else {XCTFail(); return} + + var dict2 = try? NSPropertyListSerialization.propertyListWithData(plistdata, options: NSPropertyListReadOptions.Immutable, format: nil) + if dict2 == nil { + dict2 = try? NSPropertyListSerialization.propertyListWithData(decompress(plistdata), options: .Immutable, format: nil) + } + + XCTAssert(dict2 != nil) + + XCTAssert(dict == dict2 as? NSDictionary) + + // check that we agree with python + + #if os(OSX) + + let kcdatapy = NSBundle(forClass: self.classForCoder).pathForResource("kcdata.py", ofType: nil) + + let task = NSTask() + task.launchPath = kcdatapy + task.arguments = ["-p", + NSBundle(forClass:self.classForCoder).pathForResource(name, ofType: nil)!] + let pipe = NSPipe() + task.standardOutput = pipe + task.launch() + + let data = pipe.fileHandleForReading.readDataToEndOfFile() + + guard let dict3 = try? NSPropertyListSerialization.propertyListWithData(data, options: .Immutable, format: nil) as? 
NSDictionary + else { XCTFail(); return } + + XCTAssert(dict == dict3) + + #endif + } + + func testSampleStackshot() { + self.testSampleStackshot("stackshot-sample") + } + + func testSampleStackshotOldArrays() { + self.testSampleStackshot("stackshot-sample-old-arrays") + } + + func testSampleStackshotNewArrays() { + self.testSampleStackshot("stackshot-sample-new-arrays") + } + + func testSampleDeltaStackshotOldArrays() { + self.testSampleStackshot("delta-stackshot-sample-old-arrays") + } + + func testSampleDeltaStackshotNewArrays() { + self.testSampleStackshot("delta-stackshot-sample-new-arrays") + } + + func testSampleCorpse() { + self.testSampleStackshot("corpse-sample") + } + + func testSampleStackshotTailspin() { + self.testSampleStackshot("stackshot-sample-tailspin") + } + + func testSampleStackshotTailspin2() { + self.testSampleStackshot("stackshot-sample-tailspin-2") + } + + func testSampleExitReason() { + self.testSampleStackshot("exitreason-sample") + } + + func testSampleThreadT() { + self.testSampleStackshot("stackshot-sample-ths-thread-t") + } + + func testSampleCpuTimes() { + self.testSampleStackshot("stackshot-sample-cputime") + } + + func testSampleDuration() { + self.testSampleStackshot("stackshot-sample-duration") + } + + func testSampleNested() { + self.testSampleStackshot("nested-sample") + } + + func testSampleTermWithReason() { + self.testSampleStackshot("test-twr-sample") + } + + func testSampleCorpseTermWithReason() { + self.testSampleStackshot("corpse-twr-sample") + } + + func testSampleCorpseTermWithReasonV2() { + self.testSampleStackshot("corpse-twr-sample-v2") + } + + func testSampleCodesigningExitReason() { + self.testSampleStackshot("exitreason-codesigning") + } + + func testStackshotSharedcacheV2() { + self.testSampleStackshot("stackshot-sample-sharedcachev2") + } + + func testStackshotFaultStats() { + self.testSampleStackshot("stackshot-fault-stats") + } + + func testStackshotwithKCID() { + 
self.testSampleStackshot("stackshot-with-kcid") + } + + func testXNUPostTestConfig() { + self.testSampleStackshot("xnupost_testconfig-sample") + } + + func testTrivial() { + } +} diff --git a/libkdd/tests/corpse-sample b/libkdd/tests/corpse-sample new file mode 100644 index 000000000..42bca67f6 Binary files /dev/null and b/libkdd/tests/corpse-sample differ diff --git a/libkdd/tests/corpse-sample.plist.gz b/libkdd/tests/corpse-sample.plist.gz new file mode 100644 index 000000000..b83834b4a Binary files /dev/null and b/libkdd/tests/corpse-sample.plist.gz differ diff --git a/libkdd/tests/corpse-twr-sample b/libkdd/tests/corpse-twr-sample new file mode 100644 index 000000000..74c34bc65 Binary files /dev/null and b/libkdd/tests/corpse-twr-sample differ diff --git a/libkdd/tests/corpse-twr-sample-v2 b/libkdd/tests/corpse-twr-sample-v2 new file mode 100644 index 000000000..5f3590841 Binary files /dev/null and b/libkdd/tests/corpse-twr-sample-v2 differ diff --git a/libkdd/tests/corpse-twr-sample-v2.plist.gz b/libkdd/tests/corpse-twr-sample-v2.plist.gz new file mode 100644 index 000000000..9cdc13038 Binary files /dev/null and b/libkdd/tests/corpse-twr-sample-v2.plist.gz differ diff --git a/libkdd/tests/corpse-twr-sample.plist.gz b/libkdd/tests/corpse-twr-sample.plist.gz new file mode 100644 index 000000000..05ce71449 Binary files /dev/null and b/libkdd/tests/corpse-twr-sample.plist.gz differ diff --git a/libkdd/tests/delta-stackshot-sample-new-arrays b/libkdd/tests/delta-stackshot-sample-new-arrays new file mode 100644 index 000000000..7ce2df6d8 Binary files /dev/null and b/libkdd/tests/delta-stackshot-sample-new-arrays differ diff --git a/libkdd/tests/delta-stackshot-sample-new-arrays.plist.gz b/libkdd/tests/delta-stackshot-sample-new-arrays.plist.gz new file mode 100644 index 000000000..03ef0ffb1 Binary files /dev/null and b/libkdd/tests/delta-stackshot-sample-new-arrays.plist.gz differ diff --git a/libkdd/tests/delta-stackshot-sample-old-arrays 
b/libkdd/tests/delta-stackshot-sample-old-arrays new file mode 100644 index 000000000..17ab111b8 Binary files /dev/null and b/libkdd/tests/delta-stackshot-sample-old-arrays differ diff --git a/libkdd/tests/delta-stackshot-sample-old-arrays.plist.gz b/libkdd/tests/delta-stackshot-sample-old-arrays.plist.gz new file mode 100644 index 000000000..75fff06b1 Binary files /dev/null and b/libkdd/tests/delta-stackshot-sample-old-arrays.plist.gz differ diff --git a/libkdd/tests/exitreason-codesigning b/libkdd/tests/exitreason-codesigning new file mode 100644 index 000000000..0cd5cffa2 Binary files /dev/null and b/libkdd/tests/exitreason-codesigning differ diff --git a/libkdd/tests/exitreason-codesigning.plist.gz b/libkdd/tests/exitreason-codesigning.plist.gz new file mode 100644 index 000000000..4aae118b4 Binary files /dev/null and b/libkdd/tests/exitreason-codesigning.plist.gz differ diff --git a/libkdd/tests/exitreason-sample b/libkdd/tests/exitreason-sample new file mode 100644 index 000000000..9567a55df Binary files /dev/null and b/libkdd/tests/exitreason-sample differ diff --git a/libkdd/tests/exitreason-sample.plist.gz b/libkdd/tests/exitreason-sample.plist.gz new file mode 100644 index 000000000..d62b884f5 Binary files /dev/null and b/libkdd/tests/exitreason-sample.plist.gz differ diff --git a/libkdd/tests/kdd_bridge.h b/libkdd/tests/kdd_bridge.h new file mode 100644 index 000000000..d6691bafb --- /dev/null +++ b/libkdd/tests/kdd_bridge.h @@ -0,0 +1,16 @@ +// +// kdd_bridge.h +// kdd +// +// Created by Lawrence D'Anna on 11/13/15. +// Copyright © 2015 Vishal Patel. All rights reserved. 
+// + +#ifndef kdd_bridge_h +#define kdd_bridge_h + +#import "kdd.h" +#include "kcdata.h" +#include + +#endif /* kdd_bridge_h */ diff --git a/libkdd/tests/nested-sample b/libkdd/tests/nested-sample new file mode 100644 index 000000000..87924fe6e Binary files /dev/null and b/libkdd/tests/nested-sample differ diff --git a/libkdd/tests/nested-sample.plist b/libkdd/tests/nested-sample.plist new file mode 100644 index 000000000..c3756e458 --- /dev/null +++ b/libkdd/tests/nested-sample.plist @@ -0,0 +1,14 @@ + + + + + kcdata_crashinfo + + kcdata_crashinfo + + crashed_threadid + 42 + + + + diff --git a/libkdd/tests/stackshot-fault-stats b/libkdd/tests/stackshot-fault-stats new file mode 100644 index 000000000..b99478bac Binary files /dev/null and b/libkdd/tests/stackshot-fault-stats differ diff --git a/libkdd/tests/stackshot-fault-stats.plist.gz b/libkdd/tests/stackshot-fault-stats.plist.gz new file mode 100644 index 000000000..7ddff8b7b Binary files /dev/null and b/libkdd/tests/stackshot-fault-stats.plist.gz differ diff --git a/libkdd/tests/stackshot-sample b/libkdd/tests/stackshot-sample new file mode 100644 index 000000000..c0403ae03 Binary files /dev/null and b/libkdd/tests/stackshot-sample differ diff --git a/libkdd/tests/stackshot-sample-cputime b/libkdd/tests/stackshot-sample-cputime new file mode 100644 index 000000000..e5aece6c5 Binary files /dev/null and b/libkdd/tests/stackshot-sample-cputime differ diff --git a/libkdd/tests/stackshot-sample-cputime.plist.gz b/libkdd/tests/stackshot-sample-cputime.plist.gz new file mode 100644 index 000000000..69d4b2f1c Binary files /dev/null and b/libkdd/tests/stackshot-sample-cputime.plist.gz differ diff --git a/libkdd/tests/stackshot-sample-duration b/libkdd/tests/stackshot-sample-duration new file mode 100644 index 000000000..35042d870 Binary files /dev/null and b/libkdd/tests/stackshot-sample-duration differ diff --git a/libkdd/tests/stackshot-sample-duration.plist.gz b/libkdd/tests/stackshot-sample-duration.plist.gz new 
file mode 100644 index 000000000..e71c39f8b Binary files /dev/null and b/libkdd/tests/stackshot-sample-duration.plist.gz differ diff --git a/libkdd/tests/stackshot-sample-new-arrays b/libkdd/tests/stackshot-sample-new-arrays new file mode 100644 index 000000000..9f9fb1a5c Binary files /dev/null and b/libkdd/tests/stackshot-sample-new-arrays differ diff --git a/libkdd/tests/stackshot-sample-new-arrays.plist.gz b/libkdd/tests/stackshot-sample-new-arrays.plist.gz new file mode 100644 index 000000000..88a4360e4 Binary files /dev/null and b/libkdd/tests/stackshot-sample-new-arrays.plist.gz differ diff --git a/libkdd/tests/stackshot-sample-old-arrays b/libkdd/tests/stackshot-sample-old-arrays new file mode 100644 index 000000000..6577b9510 Binary files /dev/null and b/libkdd/tests/stackshot-sample-old-arrays differ diff --git a/libkdd/tests/stackshot-sample-old-arrays.plist.gz b/libkdd/tests/stackshot-sample-old-arrays.plist.gz new file mode 100644 index 000000000..7f2afd0b0 Binary files /dev/null and b/libkdd/tests/stackshot-sample-old-arrays.plist.gz differ diff --git a/libkdd/tests/stackshot-sample-sharedcachev2 b/libkdd/tests/stackshot-sample-sharedcachev2 new file mode 100644 index 000000000..b9fe0e973 Binary files /dev/null and b/libkdd/tests/stackshot-sample-sharedcachev2 differ diff --git a/libkdd/tests/stackshot-sample-sharedcachev2.plist.gz b/libkdd/tests/stackshot-sample-sharedcachev2.plist.gz new file mode 100644 index 000000000..58b9b0c2c Binary files /dev/null and b/libkdd/tests/stackshot-sample-sharedcachev2.plist.gz differ diff --git a/libkdd/tests/stackshot-sample-tailspin b/libkdd/tests/stackshot-sample-tailspin new file mode 100644 index 000000000..e087b3491 Binary files /dev/null and b/libkdd/tests/stackshot-sample-tailspin differ diff --git a/libkdd/tests/stackshot-sample-tailspin-2 b/libkdd/tests/stackshot-sample-tailspin-2 new file mode 100644 index 000000000..cd9968215 Binary files /dev/null and b/libkdd/tests/stackshot-sample-tailspin-2 differ 
diff --git a/libkdd/tests/stackshot-sample-tailspin-2.plist.gz b/libkdd/tests/stackshot-sample-tailspin-2.plist.gz new file mode 100644 index 000000000..f32d9e203 Binary files /dev/null and b/libkdd/tests/stackshot-sample-tailspin-2.plist.gz differ diff --git a/libkdd/tests/stackshot-sample-tailspin.plist.gz b/libkdd/tests/stackshot-sample-tailspin.plist.gz new file mode 100644 index 000000000..20cec9fc9 Binary files /dev/null and b/libkdd/tests/stackshot-sample-tailspin.plist.gz differ diff --git a/libkdd/tests/stackshot-sample-ths-thread-t b/libkdd/tests/stackshot-sample-ths-thread-t new file mode 100644 index 000000000..a53cc1660 Binary files /dev/null and b/libkdd/tests/stackshot-sample-ths-thread-t differ diff --git a/libkdd/tests/stackshot-sample-ths-thread-t.plist.gz b/libkdd/tests/stackshot-sample-ths-thread-t.plist.gz new file mode 100644 index 000000000..d8e291fe8 Binary files /dev/null and b/libkdd/tests/stackshot-sample-ths-thread-t.plist.gz differ diff --git a/libkdd/tests/stackshot-sample.plist.gz b/libkdd/tests/stackshot-sample.plist.gz new file mode 100644 index 000000000..1e8f4ee5d Binary files /dev/null and b/libkdd/tests/stackshot-sample.plist.gz differ diff --git a/libkdd/tests/stackshot-with-kcid b/libkdd/tests/stackshot-with-kcid new file mode 100644 index 000000000..50aa4a1da Binary files /dev/null and b/libkdd/tests/stackshot-with-kcid differ diff --git a/libkdd/tests/stackshot-with-kcid.plist.gz b/libkdd/tests/stackshot-with-kcid.plist.gz new file mode 100644 index 000000000..7df54071c Binary files /dev/null and b/libkdd/tests/stackshot-with-kcid.plist.gz differ diff --git a/libkdd/tests/test-twr-sample b/libkdd/tests/test-twr-sample new file mode 100644 index 000000000..77255fda9 Binary files /dev/null and b/libkdd/tests/test-twr-sample differ diff --git a/libkdd/tests/test-twr-sample.plist.gz b/libkdd/tests/test-twr-sample.plist.gz new file mode 100644 index 000000000..b82b6969a Binary files /dev/null and 
b/libkdd/tests/test-twr-sample.plist.gz differ diff --git a/libkdd/tests/xnupost_testconfig-sample b/libkdd/tests/xnupost_testconfig-sample new file mode 100644 index 000000000..b2d1f0dcc Binary files /dev/null and b/libkdd/tests/xnupost_testconfig-sample differ diff --git a/libkdd/tests/xnupost_testconfig-sample.plist.gz b/libkdd/tests/xnupost_testconfig-sample.plist.gz new file mode 100644 index 000000000..f67182172 Binary files /dev/null and b/libkdd/tests/xnupost_testconfig-sample.plist.gz differ diff --git a/libkern/.clang-format b/libkern/.clang-format deleted file mode 120000 index 298ac9555..000000000 --- a/libkern/.clang-format +++ /dev/null @@ -1 +0,0 @@ -../iokit/.clang-format \ No newline at end of file diff --git a/libkern/.clang-format b/libkern/.clang-format new file mode 100644 index 000000000..cd99c24e5 --- /dev/null +++ b/libkern/.clang-format @@ -0,0 +1,30 @@ +# See top level .clang-format for explanation of options +AlignEscapedNewlinesLeft: true +AlignTrailingComments: true +AllowAllParametersOfDeclarationOnNextLine: true +AllowShortBlocksOnASingleLine: true +AllowShortCaseLabelsOnASingleLine: true +AllowShortFunctionsOnASingleLine: None +AllowShortIfStatementsOnASingleLine: false +AllowShortLoopsOnASingleLine: false +AlwaysBreakAfterDefinitionReturnType: false +AlwaysBreakBeforeMultilineStrings: true +BinPackArguments: true +BinPackParameters: false +BreakBeforeBinaryOperators: None +BreakBeforeBraces: Allman +ColumnLimit: 132 +IndentCaseLabels: false +IndentWidth: 4 +IndentWrappedFunctionNames: false +KeepEmptyLinesAtTheStartOfBlocks: false +PointerAlignment: Middle +SpaceAfterCStyleCast: false +SpaceBeforeAssignmentOperators: true +SpaceBeforeParens: ControlStatements +SpaceInEmptyParentheses: false +SpacesInCStyleCastParentheses: false +SpacesInParentheses: false +SpacesInSquareBrackets: false +TabWidth: 4 +UseTab: Never diff --git a/libkern/Makefile b/libkern/Makefile index affdcb336..d15957542 100644 --- a/libkern/Makefile +++ 
b/libkern/Makefile @@ -7,18 +7,18 @@ include $(MakeInc_cmd) include $(MakeInc_def) INSTINC_SUBDIRS = \ - libkern os -INSTINC_SUBDIRS_X86_64 = libkern os -INSTINC_SUBDIRS_X86_64H = libkern os -INSTINC_SUBDIRS_ARM = libkern os -INSTINC_SUBDIRS_ARM64 = libkern os + libkern os firehose +INSTINC_SUBDIRS_X86_64 = libkern +INSTINC_SUBDIRS_X86_64H = libkern +INSTINC_SUBDIRS_ARM = libkern +INSTINC_SUBDIRS_ARM64 = libkern EXPINC_SUBDIRS = \ - libkern os -EXPINC_SUBDIRS_X86_64 = libkern os -EXPINC_SUBDIRS_X86_64H = libkern os -EXPINC_SUBDIRS_ARM = libkern os -EXPINC_SUBDIRS_ARM64 = libkern os + libkern os firehose +EXPINC_SUBDIRS_X86_64 = libkern +EXPINC_SUBDIRS_X86_64H = libkern +EXPINC_SUBDIRS_ARM = libkern +EXPINC_SUBDIRS_ARM64 = libkern COMP_SUBDIRS = conf diff --git a/libkern/OSKextLib.cpp b/libkern/OSKextLib.cpp index 175704eec..00264ecf3 100644 --- a/libkern/OSKextLib.cpp +++ b/libkern/OSKextLib.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008-2012 Apple Inc. All rights reserved. + * Copyright (c) 2008-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -48,6 +48,7 @@ kern_return_t OSKextLoadKextWithIdentifier(const char * bundle_id) return OSKext::loadKextWithIdentifier(bundle_id); } +uint32_t OSKextGetLoadTagForIdentifier(const char * kextIdentifier); /********************************************************************* *********************************************************************/ uint32_t @@ -353,8 +354,8 @@ kern_return_t kext_request( /********************************************************************* * Gets the vm_map for the current kext *********************************************************************/ -extern vm_offset_t segPRELINKB; -extern unsigned long segSizePRELINK; +extern vm_offset_t segPRELINKTEXTB; +extern unsigned long segSizePRELINKTEXT; extern int kth_started; extern vm_map_t g_kext_map; @@ -364,8 +365,8 @@ kext_get_vm_map(kmod_info_t *info) vm_map_t kext_map = NULL; /* Set the vm map */ - if ((info->address >= segPRELINKB) && - (info->address < (segPRELINKB + segSizePRELINK))) + if ((info->address >= segPRELINKTEXTB) && + (info->address < (segPRELINKTEXTB + segSizePRELINKTEXT))) { kext_map = kernel_map; } else { @@ -457,6 +458,13 @@ kmod_dump_log( OSKext::printKextsInBacktrace(addr, cnt, &printf, /* lock? */ true, doUnslide); } +void * +OSKextKextForAddress(const void *addr) +{ + return OSKext::kextForAddress(addr); +} + + /********************************************************************* * Compatibility implementation for kmod_get_info() host_priv routine. * Only supported on old 32-bit architectures. diff --git a/libkern/OSKextVersion.c b/libkern/OSKextVersion.c index 3f94e0d02..ea6a8e53c 100644 --- a/libkern/OSKextVersion.c +++ b/libkern/OSKextVersion.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -82,22 +82,21 @@ static int __vers_isspace(char c) { /********************************************************************* *********************************************************************/ -static int __vers_digit_for_char(char c) { - switch (c) { - case '0': return 0; break; - case '1': return 1; break; - case '2': return 2; break; - case '3': return 3; break; - case '4': return 4; break; - case '5': return 5; break; - case '6': return 6; break; - case '7': return 7; break; - case '8': return 8; break; - case '9': return 9; break; - default: return -1; break; - } - - return -1; +static int +__vers_digit_for_char(char c) { + switch (c) { + case '0': return 0; + case '1': return 1; + case '2': return 2; + case '3': return 3; + case '4': return 4; + case '5': return 5; + case '6': return 6; + case '7': return 7; + case '8': return 8; + case '9': return 9; + default: return -1; + } } /********************************************************************* @@ -150,10 +149,8 @@ static OSKextVersionStage __OSKextVersionStageForString(const char ** string_p) } else { return kOSKextVersionStageInvalid; } - break; default: return kOSKextVersionStageInvalid; - break; } } @@ -162,18 +159,17 @@ static OSKextVersionStage __OSKextVersionStageForString(const char ** string_p) /********************************************************************* *********************************************************************/ -static const char * __OSKextVersionStringForStage(OSKextVersionStage stage) +static const char * +__OSKextVersionStringForStage(OSKextVersionStage stage) { - switch (stage) { - case kOSKextVersionStageInvalid: return NULL; break; - case kOSKextVersionStageDevelopment: return "d"; break; - case kOSKextVersionStageAlpha: return "a"; break; - case kOSKextVersionStageBeta: return "b"; break; - case kOSKextVersionStageCandidate: return "f"; break; - case kOSKextVersionStageRelease: return ""; break; - } - - return NULL; 
+ switch (stage) { + case kOSKextVersionStageInvalid: return NULL; + case kOSKextVersionStageDevelopment: return "d"; + case kOSKextVersionStageAlpha: return "a"; + case kOSKextVersionStageBeta: return "b"; + case kOSKextVersionStageCandidate: return "f"; + case kOSKextVersionStageRelease: return ""; + } } /********************************************************************* diff --git a/libkern/c++/OSArray.cpp b/libkern/c++/OSArray.cpp index fcdca78d1..5089da70f 100644 --- a/libkern/c++/OSArray.cpp +++ b/libkern/c++/OSArray.cpp @@ -185,7 +185,7 @@ unsigned int OSArray::ensureCapacity(unsigned int newCapacity) { const OSMetaClassBase **newArray; unsigned int finalCapacity; - unsigned int oldSize, newSize; + vm_size_t oldSize, newSize; if (newCapacity <= capacity) return capacity; @@ -200,8 +200,11 @@ unsigned int OSArray::ensureCapacity(unsigned int newCapacity) newSize = sizeof(const OSMetaClassBase *) * finalCapacity; - newArray = (const OSMetaClassBase **) kalloc_container(newSize); + newArray = (const OSMetaClassBase **) kallocp_container(&newSize); if (newArray) { + // use all of the actual allocation size + finalCapacity = newSize / sizeof(const OSMetaClassBase *); + oldSize = sizeof(const OSMetaClassBase *) * capacity; OSCONTAINER_ACCUMSIZE(((size_t)newSize) - ((size_t)oldSize)); @@ -264,6 +267,9 @@ bool OSArray::merge(const OSArray * otherArray) if (!otherCount) return true; + if (newCount < count) + return false; + // do we need more space? 
if (newCount > capacity && newCount > ensureCapacity(newCount)) return false; diff --git a/libkern/c++/OSCollection.cpp b/libkern/c++/OSCollection.cpp index 4da177f16..53b3b7b96 100644 --- a/libkern/c++/OSCollection.cpp +++ b/libkern/c++/OSCollection.cpp @@ -62,11 +62,7 @@ void OSCollection::haveUpdated() { if (fOptions & kImmutable) { -#if __LP64__ if (!(gIOKitDebug & kOSRegistryModsMode)) -#else - if (gIOKitDebug & kOSRegistryModsMode) -#endif { panic("Trying to change a collection in the registry"); } diff --git a/libkern/c++/OSData.cpp b/libkern/c++/OSData.cpp index a48142d2c..6e17f6c41 100644 --- a/libkern/c++/OSData.cpp +++ b/libkern/c++/OSData.cpp @@ -29,11 +29,15 @@ #include +__BEGIN_DECLS +#include +__END_DECLS + #include #include #include #include -#include +#include #define super OSObject @@ -57,7 +61,8 @@ bool OSData::initWithCapacity(unsigned int inCapacity) if (!inCapacity || (capacity < inCapacity)) { // clean out old data's storage if it isn't big enough - kfree(data, capacity); + if (capacity < page_size) kfree(data, capacity); + else kmem_free(kernel_map, (vm_offset_t)data, capacity); data = 0; capacity = 0; } @@ -67,7 +72,14 @@ bool OSData::initWithCapacity(unsigned int inCapacity) return false; if (inCapacity && !data) { - data = (void *) kalloc_container(inCapacity); + + if (inCapacity < page_size) data = (void *) kalloc_container(inCapacity); + else { + kern_return_t kr; + inCapacity = round_page_32(inCapacity); + kr = kmem_alloc(kernel_map, (vm_offset_t *)&data, inCapacity, IOMemoryTag(kernel_map)); + if (KERN_SUCCESS != kr) data = NULL; + } if (!data) return false; capacity = inCapacity; @@ -185,8 +197,9 @@ OSData *OSData::withData(const OSData *inData, void OSData::free() { - if (capacity != EXTERNAL && data && capacity) { - kfree(data, capacity); + if ((capacity != EXTERNAL) && data && capacity) { + if (capacity < page_size) kfree(data, capacity); + else kmem_free(kernel_map, (vm_offset_t)data, capacity); OSCONTAINER_ACCUMSIZE( 
-((size_t)capacity) ); } else if (capacity == EXTERNAL) { DeallocFunction freemem = reserved ? reserved->deallocFunction : NULL; @@ -217,6 +230,8 @@ unsigned int OSData::ensureCapacity(unsigned int newCapacity) { unsigned char * newData; unsigned int finalCapacity; + void * copydata; + kern_return_t kr; if (newCapacity <= capacity) return capacity; @@ -225,16 +240,36 @@ unsigned int OSData::ensureCapacity(unsigned int newCapacity) * capacityIncrement; // integer overflow check - if (finalCapacity < newCapacity) - return capacity; - - newData = (unsigned char *) kalloc_container(finalCapacity); + if (finalCapacity < newCapacity) return capacity; + + copydata = data; + + if (finalCapacity >= page_size) { + // round up + finalCapacity = round_page_32(finalCapacity); + // integer overflow check + if (finalCapacity < newCapacity) return capacity; + if (capacity >= page_size) { + copydata = NULL; + kr = kmem_realloc(kernel_map, + (vm_offset_t)data, + capacity, + (vm_offset_t *)&newData, + finalCapacity, + IOMemoryTag(kernel_map)); + } else { + kr = kmem_alloc(kernel_map, (vm_offset_t *)&newData, finalCapacity, IOMemoryTag(kernel_map)); + } + if (KERN_SUCCESS != kr) newData = NULL; + } + else newData = (unsigned char *) kalloc_container(finalCapacity); if ( newData ) { bzero(newData + capacity, finalCapacity - capacity); + if (copydata) bcopy(copydata, newData, capacity); if (data) { - bcopy(data, newData, capacity); - kfree(data, capacity); + if (capacity < page_size) kfree(data, capacity); + else kmem_free(kernel_map, (vm_offset_t)data, capacity); } OSCONTAINER_ACCUMSIZE( ((size_t)finalCapacity) - ((size_t)capacity) ); data = (void *) newData; @@ -308,6 +343,7 @@ const void *OSData::getBytesNoCopy(unsigned int start, if (length && start < length + && (start + inLength) >= inLength // overflow check && (start + inLength) <= length) outData = (const void *) ((char *) data + start); diff --git a/libkern/c++/OSDictionary.cpp b/libkern/c++/OSDictionary.cpp index 
a53f23ad7..27224c707 100644 --- a/libkern/c++/OSDictionary.cpp +++ b/libkern/c++/OSDictionary.cpp @@ -269,7 +269,8 @@ unsigned int OSDictionary::setCapacityIncrement(unsigned int increment) unsigned int OSDictionary::ensureCapacity(unsigned int newCapacity) { dictEntry *newDict; - unsigned int finalCapacity, oldSize, newSize; + unsigned int finalCapacity; + vm_size_t oldSize, newSize; if (newCapacity <= capacity) return capacity; @@ -284,8 +285,11 @@ unsigned int OSDictionary::ensureCapacity(unsigned int newCapacity) newSize = sizeof(dictEntry) * finalCapacity; - newDict = (dictEntry *) kalloc_container(newSize); + newDict = (dictEntry *) kallocp_container(&newSize); if (newDict) { + // use all of the actual allocation size + finalCapacity = newSize / sizeof(dictEntry); + oldSize = sizeof(dictEntry) * capacity; bcopy(dictionary, newDict, oldSize); @@ -701,3 +705,21 @@ OSCollection * OSDictionary::copyCollection(OSDictionary *cycleDict) return ret; } +OSArray * OSDictionary::copyKeys(void) +{ + OSArray * array; + + array = OSArray::withCapacity(count); + if (!array) return (0); + + for (unsigned int i = 0; i < count; i++) + { + if (!array->setObject(i, dictionary[i].key)) + { + array->release(); + array = 0; + break; + } + } + return (array); +} diff --git a/libkern/c++/OSKext.cpp b/libkern/c++/OSKext.cpp index 53d15d29a..5fc12ffc8 100644 --- a/libkern/c++/OSKext.cpp +++ b/libkern/c++/OSKext.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008-2012 Apple Inc. All rights reserved. + * Copyright (c) 2008-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -27,9 +27,12 @@ */ extern "C" { +#include #include #include #include +#include +#include #include #include #include @@ -180,6 +183,12 @@ static void * GetAppleTEXTHashForKext(OSKext * theKext, OSDictionary *theInfoDic */ #define _kOSKextExecutableExternalDataKey "_OSKextExecutableExternalData" +#define OS_LOG_HDR_VERSION 1 +#define NUM_OS_LOG_SECTIONS 2 + +#define OS_LOG_SECT_IDX 0 +#define CSTRING_SECT_IDX 1 + #if PRAGMA_MARK #pragma mark Typedefs #endif @@ -187,6 +196,22 @@ static void * GetAppleTEXTHashForKext(OSKext * theKext, OSDictionary *theInfoDic * Typedefs *********************************************************************/ +/********************************************************************* +* osLogDataHeaderRef describes the header information of an OSData +* object that is returned when querying for kOSBundleLogStringsKey. +* We currently return information regarding 2 sections - os_log and +* cstring. In the case that the os_log section doesn't exist, we just +* return an offset and length of 0 for that section. 
+*********************************************************************/ +typedef struct osLogDataHeader { + uint32_t version; + uint32_t sect_count; + struct { + uint32_t sect_offset; + uint32_t sect_size; + } sections[0]; +} osLogDataHeaderRef; + /********************************************************************* * MkextEntryRef describes the contents of an OSData object * referencing a file entry from an mkext so that we can uncompress @@ -363,9 +388,10 @@ static IOSimpleLock * sKextAccountsLock = &vm_allocation_sites_lock; void (*sLoadedKextSummariesUpdated)(void) = OSKextLoadedKextSummariesUpdated; OSKextLoadedKextSummaryHeader * gLoadedKextSummaries __attribute__((used)) = NULL; +uint64_t gLoadedKextSummariesTimestamp __attribute__((used)) = 0; static size_t sLoadedKextSummariesAllocSize = 0; -static OSKextActiveAccount * sKextAccounts; +static OSKextActiveAccount * sKextAccounts; static uint32_t sKextAccountsCount; }; @@ -379,7 +405,7 @@ static const OSKextLogSpec kDefaultKernelLogFilter = kOSKextLogBasicLevel | static OSKextLogSpec sKernelLogFilter = kDefaultKernelLogFilter; static bool sBootArgLogFilterFound = false; SYSCTL_UINT(_debug, OID_AUTO, kextlog, CTLFLAG_RW | CTLFLAG_LOCKED, &sKernelLogFilter, - sKernelLogFilter, "kernel kext logging"); + 0, "kernel kext logging"); static OSKextLogSpec sUserSpaceKextLogFilter = kOSKextLogSilentFilter; static OSArray * sUserSpaceLogSpecArray = NULL; @@ -494,7 +520,7 @@ kern_allocate( result = 0; } - OSSafeRelease(linkBuffer); + OSSafeReleaseNULL(linkBuffer); return (kxld_addr_t)result; } @@ -667,10 +693,11 @@ OSKext::initialize(void) assert(kernelExecutable); #if KASLR_KEXT_DEBUG - IOLog("kaslr: kernel start 0x%lx end 0x%lx length %lu \n", + IOLog("kaslr: kernel start 0x%lx end 0x%lx length %lu vm_kernel_slide %llu (0x%016lx) \n", (unsigned long)kernelStart, (unsigned long)getlastaddr(), - kernelLength); + kernelLength, + vm_kernel_slide, vm_kernel_slide); #endif sKernelKext->loadTag = sNextLoadTag++; // the 
kernel is load tag 0 @@ -742,8 +769,8 @@ OSKext::initialize(void) registryRoot->setProperty(kOSKernelCPUTypeKey, kernelCPUType); registryRoot->setProperty(kOSKernelCPUSubtypeKey, kernelCPUSubtype); - OSSafeRelease(kernelCPUType); - OSSafeRelease(kernelCPUSubtype); + OSSafeReleaseNULL(kernelCPUType); + OSSafeReleaseNULL(kernelCPUSubtype); timestamp = __OSAbsoluteTimePtr(&last_loaded_timestamp); *timestamp = 0; @@ -934,7 +961,7 @@ OSKext::removeKextBootstrap(void) * Dump the LINKEDIT segment, unless keepsyms is set. */ if (!sKeepSymbols) { - const char *dt_segment_name = "Kernel-__LINKEDIT"; + dt_segment_name = "Kernel-__LINKEDIT"; if (0 == IODTGetLoaderInfo(dt_segment_name, &segment_paddress, &segment_size)) { #ifdef SECURE_KERNEL @@ -1051,9 +1078,9 @@ OSKext::flushNonloadedKexts( finish: IORecursiveLockUnlock(sKextLock); - OSSafeRelease(prelinkedKexts); - OSSafeRelease(kextIterator); - OSSafeRelease(prelinkIterator); + OSSafeReleaseNULL(prelinkedKexts); + OSSafeReleaseNULL(kextIterator); + OSSafeReleaseNULL(prelinkIterator); return; } @@ -1176,7 +1203,7 @@ OSKext::willShutdown(void) IORecursiveLockUnlock(sKextLock); - OSSafeRelease(exitRequest); + OSSafeReleaseNULL(exitRequest); return; } @@ -1364,11 +1391,12 @@ OSKext::getKernelRequestsEnabled(void) *********************************************************************/ OSKext * OSKext::withPrelinkedInfoDict( - OSDictionary * anInfoDict) + OSDictionary * anInfoDict, + bool doCoalesedSlides) { OSKext * newKext = new OSKext; - if (newKext && !newKext->initWithPrelinkedInfoDict(anInfoDict)) { + if (newKext && !newKext->initWithPrelinkedInfoDict(anInfoDict, doCoalesedSlides)) { newKext->release(); return NULL; } @@ -1380,7 +1408,8 @@ OSKext::withPrelinkedInfoDict( *********************************************************************/ bool OSKext::initWithPrelinkedInfoDict( - OSDictionary * anInfoDict) + OSDictionary * anInfoDict, + bool doCoalesedSlides) { bool result = false; OSString * kextPath = NULL; // do not 
release @@ -1404,7 +1433,7 @@ OSKext::initWithPrelinkedInfoDict( goto finish; } #if KASLR_KEXT_DEBUG - IOLog("kaslr: kext %s \n", getIdentifierCString()); + IOLog("kaslr: doCoalesedSlides %d kext %s \n", doCoalesedSlides, getIdentifierCString()); #endif /* Also get the executable's bundle-relative path if present. @@ -1441,7 +1470,7 @@ OSKext::initWithPrelinkedInfoDict( length = (uint32_t) (lengthNum->unsigned32BitValue()); #if KASLR_KEXT_DEBUG - IOLog("kaslr: unslid 0x%lx slid 0x%lx length %u - prelink executable \n", + IOLog("kaslr: unslid 0x%lx slid 0x%lx length %u - prelink executable \n", (unsigned long)VM_KERNEL_UNSLIDE(data), (unsigned long)data, length); @@ -1450,42 +1479,42 @@ OSKext::initWithPrelinkedInfoDict( anInfoDict->removeObject(kPrelinkExecutableLoadKey); anInfoDict->removeObject(kPrelinkExecutableSizeKey); - /* If the kext's load address differs from its source address, allocate - * space in the kext map at the load address and copy the kext over. - */ + /* If the kext's load address differs from its source address, allocate + * space in the kext map at the load address and copy the kext over. 
+ */ addressNum = OSDynamicCast(OSNumber, anInfoDict->getObject(kPrelinkExecutableSourceKey)); if (addressNum) { srcData = (void *) ((intptr_t) (addressNum->unsigned64BitValue()) + vm_kernel_slide); - + #if KASLR_KEXT_DEBUG - IOLog("kaslr: unslid 0x%lx slid 0x%lx - prelink executable source \n", - (unsigned long)VM_KERNEL_UNSLIDE(srcData), + IOLog("kaslr: unslid 0x%lx slid 0x%lx - prelink executable source \n", + (unsigned long)VM_KERNEL_UNSLIDE(srcData), (unsigned long)srcData); #endif - + if (data != srcData) { #if __LP64__ kern_return_t alloc_result; - + alloc_result = kext_alloc((vm_offset_t *)&data, length, /* fixed */ TRUE); if (alloc_result != KERN_SUCCESS) { OSKextLog(this, - kOSKextLogErrorLevel | kOSKextLogGeneralFlag, - "Failed to allocate space for prelinked kext %s.", - getIdentifierCString()); + kOSKextLogErrorLevel | kOSKextLogGeneralFlag, + "Failed to allocate space for prelinked kext %s.", + getIdentifierCString()); goto finish; } memcpy(data, srcData, length); #else OSKextLog(this, - kOSKextLogErrorLevel | kOSKextLogGeneralFlag, - "Error: prelinked kext %s - source and load addresses " - "differ on ILP32 architecture.", - getIdentifierCString()); + kOSKextLogErrorLevel | kOSKextLogGeneralFlag, + "Error: prelinked kext %s - source and load addresses " + "differ on ILP32 architecture.", + getIdentifierCString()); goto finish; #endif /* __LP64__ */ } - + anInfoDict->removeObject(kPrelinkExecutableSourceKey); } @@ -1520,7 +1549,7 @@ OSKext::initWithPrelinkedInfoDict( kmod_info = (kmod_info_t *) (intptr_t) (addressNum->unsigned64BitValue() + vm_kernel_slide); kmod_info->address += vm_kernel_slide; #if KASLR_KEXT_DEBUG - IOLog("kaslr: unslid 0x%lx slid 0x%lx - kmod_info \n", + IOLog("kaslr: unslid 0x%lx slid 0x%lx - kmod_info \n", (unsigned long)VM_KERNEL_UNSLIDE(kmod_info), (unsigned long)kmod_info); IOLog("kaslr: unslid 0x%lx slid 0x%lx - kmod_info->address \n", @@ -1543,17 +1572,19 @@ OSKext::initWithPrelinkedInfoDict( } } - result = 
slidePrelinkedExecutable(); + result = slidePrelinkedExecutable(doCoalesedSlides); if (result != kOSReturnSuccess) { goto finish; } - /* set VM protections now, wire later at kext load */ - result = setVMAttributes(true, false); - if (result != KERN_SUCCESS) { - goto finish; + if (doCoalesedSlides == false) { + /* set VM protections now, wire later at kext load */ + result = setVMAttributes(true, false); + if (result != KERN_SUCCESS) { + goto finish; + } } - + flags.prelinked = true; /* If we created a kext from prelink info, @@ -1564,10 +1595,45 @@ OSKext::initWithPrelinkedInfoDict( result = registerIdentifier(); finish: - OSSafeRelease(prelinkedExecutable); + OSSafeReleaseNULL(prelinkedExecutable); return result; } + +/********************************************************************* + *********************************************************************/ +/* static */ +void OSKext::setAllVMAttributes(void) +{ + OSCollectionIterator * kextIterator = NULL; // must release + const OSSymbol * thisID = NULL; // do not release + + IORecursiveLockLock(sKextLock); + + kextIterator = OSCollectionIterator::withCollection(sKextsByID); + if (!kextIterator) { + goto finish; + } + + while ((thisID = OSDynamicCast(OSSymbol, kextIterator->getNextObject()))) { + OSKext * thisKext; // do not release + + thisKext = OSDynamicCast(OSKext, sKextsByID->getObject(thisID)); + if (!thisKext || thisKext->isInterface() || !thisKext->declaresExecutable()) { + continue; + } + + /* set VM protections now, wire later at kext load */ + thisKext->setVMAttributes(true, false); + } + +finish: + IORecursiveLockUnlock(sKextLock); + OSSafeReleaseNULL(kextIterator); + + return; +} + /********************************************************************* *********************************************************************/ OSKext * @@ -1743,10 +1809,10 @@ OSKext::initWithBooterData( result = registerIdentifier(); finish: - OSSafeRelease(parsedXML); - OSSafeRelease(kextPath); - 
OSSafeRelease(errorString); - OSSafeRelease(executable); + OSSafeReleaseNULL(parsedXML); + OSSafeReleaseNULL(kextPath); + OSSafeReleaseNULL(errorString); + OSSafeReleaseNULL(executable); return result; } @@ -1943,8 +2009,8 @@ OSKext::registerIdentifier(void) getIdentifierCString(), newVersionCString); } - OSSafeRelease(newUUID); - OSSafeRelease(existingUUID); + OSSafeReleaseNULL(newUUID); + OSSafeReleaseNULL(existingUUID); return result; } @@ -2275,14 +2341,14 @@ OSKext::free(void) panic("Attempt to free loaded kext %s.", getIdentifierCString()); } - OSSafeRelease(infoDict); - OSSafeRelease(bundleID); - OSSafeRelease(path); - OSSafeRelease(executableRelPath); - OSSafeRelease(dependencies); - OSSafeRelease(linkedExecutable); - OSSafeRelease(metaClasses); - OSSafeRelease(interfaceUUID); + OSSafeReleaseNULL(infoDict); + OSSafeReleaseNULL(bundleID); + OSSafeReleaseNULL(path); + OSSafeReleaseNULL(executableRelPath); + OSSafeReleaseNULL(dependencies); + OSSafeReleaseNULL(linkedExecutable); + OSSafeReleaseNULL(metaClasses); + OSSafeReleaseNULL(interfaceUUID); if (isInterface() && kmod_info) { kfree(kmod_info, sizeof(kmod_info_t)); @@ -2511,7 +2577,7 @@ OSKext::readMkext2Archive( */ if (infoDict) { OSKext * newKext = OSKext::withMkext2Info(infoDict, mkextData); - OSSafeRelease(newKext); + OSSafeReleaseNULL(newKext); } } @@ -2522,9 +2588,9 @@ OSKext::readMkext2Archive( finish: - OSSafeRelease(parsedXML); - OSSafeRelease(mkextPlistUncompressedData); - OSSafeRelease(errorString); + OSSafeReleaseNULL(parsedXML); + OSSafeReleaseNULL(mkextPlistUncompressedData); + OSSafeReleaseNULL(errorString); return result; } @@ -2604,8 +2670,8 @@ OSKext::initWithMkext2Info( finish: - OSSafeRelease(executable); - OSSafeRelease(iterator); + OSSafeReleaseNULL(executable); + OSSafeReleaseNULL(iterator); return result; } @@ -2863,7 +2929,7 @@ OSKext::extractMkext2FileData( if (zstream_inited) inflateEnd(&zstream); if (!result) { - OSSafeRelease(uncompressedData); + 
OSSafeReleaseNULL(uncompressedData); } return result; @@ -3074,10 +3140,10 @@ OSKext::loadFromMkext( IORecursiveLockUnlock(sKextLock); - OSSafeRelease(mkextData); - OSSafeRelease(mkextPlist); - OSSafeRelease(serializer); - OSSafeRelease(logInfoArray); + OSSafeReleaseNULL(mkextData); + OSSafeReleaseNULL(mkextPlist); + OSSafeReleaseNULL(serializer); + OSSafeReleaseNULL(logInfoArray); return result; } @@ -3149,7 +3215,7 @@ OSKext::serializeLogInfo( result = kOSReturnSuccess; finish: - OSSafeRelease(serializer); + OSSafeReleaseNULL(serializer); return result; } @@ -3225,7 +3291,6 @@ OSKext::lookupKextWithAddress(vm_address_t address) (vm_address_t)thisKext->linkedExecutable->getBytesNoCopy(); vm_address_t kext_end = kext_start + thisKext->linkedExecutable->getLength(); - if ((kext_start <= address) && (address < kext_end)) { foundKext = thisKext; foundKext->retain(); @@ -3317,6 +3382,7 @@ OSKext::removeKext( OSKext * aKext, bool terminateServicesAndRemovePersonalitiesFlag) { + OSReturn result = kOSKextReturnInUse; OSKext * checkKext = NULL; // do not release #if CONFIG_MACF @@ -3602,17 +3668,20 @@ OSKext::createExcludeListFromBooterData( myTempDict = OSDynamicCast( OSDictionary, theInfoDict->getObject("OSKextExcludeList")); - if ( myTempDict ) { - IORecursiveLockLock(sKextLock); - - /* get rid of old exclusion list */ - if (sExcludeListByID) { - sExcludeListByID->flushCollection(); - OSSafeRelease(sExcludeListByID); - } - sExcludeListByID = OSDictionary::withDictionary(myTempDict, 0); - IORecursiveLockUnlock(sKextLock); + if ( NULL == myTempDict ) { + /* 25322874 */ + panic("Missing OSKextExcludeList dictionary\n"); + } + + IORecursiveLockLock(sKextLock); + + /* get rid of old exclusion list */ + if (sExcludeListByID) { + OSSafeReleaseNULL(sExcludeListByID); } + sExcludeListByID = OSDictionary::withDictionary(myTempDict, 0); + IORecursiveLockUnlock(sKextLock); + break; } @@ -3649,17 +3718,19 @@ OSKext::createExcludeListFromPrelinkInfo( OSArray * theInfoArray ) 
OSDictionary * myTempDict; // do not free myTempDict = OSDynamicCast(OSDictionary, myInfoDict->getObject("OSKextExcludeList")); - if ( myTempDict ) { - IORecursiveLockLock(sKextLock); - // get rid of old exclude list - if (sExcludeListByID) { - sExcludeListByID->flushCollection(); - OSSafeRelease(sExcludeListByID); - } - - sExcludeListByID = OSDictionary::withDictionary(myTempDict, 0); - IORecursiveLockUnlock(sKextLock); + if ( NULL == myTempDict ) { + /* 25322874 */ + panic("Missing OSKextExcludeList dictionary\n"); + } + + IORecursiveLockLock(sKextLock); + // get rid of old exclude list + if (sExcludeListByID) { + OSSafeReleaseNULL(sExcludeListByID); } + + sExcludeListByID = OSDictionary::withDictionary(myTempDict, 0); + IORecursiveLockUnlock(sKextLock); break; } } // for (i = 0; i < theInfoArray->getCount()... @@ -3788,7 +3859,7 @@ OSKext::getExecutable(void) finish: - OSSafeRelease(extractedExecutable); + OSSafeReleaseNULL(extractedExecutable); return result; } @@ -4200,7 +4271,7 @@ OSKext::loadKextWithIdentifier( startOpt, startMatchingOpt, personalityNames); finish: - OSSafeRelease(kextIdentifier); + OSSafeReleaseNULL(kextIdentifier); return result; } @@ -4321,8 +4392,8 @@ OSKext::loadKextWithIdentifier( } finish: - OSSafeRelease(loadRequest); - OSSafeRelease(kextIdentifierSymbol); + OSSafeReleaseNULL(loadRequest); + OSSafeReleaseNULL(kextIdentifierSymbol); IORecursiveLockUnlock(sKextLock); @@ -4372,7 +4443,7 @@ OSKext::recordIdentifierRequest( "Failed to record kext %s as a candidate for inclusion in prelinked kernel.", kextIdentifier->getCStringNoCopy()); } - OSSafeRelease(kextIdentifierSymbol); + OSSafeReleaseNULL(kextIdentifierSymbol); return; } @@ -4580,6 +4651,7 @@ OSKext::load( bzero(account, sizeof(*account)); account->loadTag = kmod_info->id; account->site.flags = VM_TAG_KMOD; + account->kext = this; flags.loaded = true; @@ -4760,24 +4832,25 @@ OSKext::lookupSection(const char *segname, const char *secname) 
*********************************************************************/ OSReturn -OSKext::slidePrelinkedExecutable() -{ - OSReturn result = kOSKextReturnBadData; - kernel_mach_header_t * mh = NULL; - kernel_segment_command_t * seg = NULL; - kernel_segment_command_t * linkeditSeg = NULL; - kernel_section_t * sec = NULL; - char * linkeditBase = NULL; - bool haveLinkeditBase = false; - char * relocBase = NULL; - bool haveRelocBase = false; - struct dysymtab_command * dysymtab = NULL; - struct symtab_command * symtab = NULL; - kernel_nlist_t * sym = NULL; - struct relocation_info * reloc = NULL; - uint32_t i = 0; - int reloc_size; - vm_offset_t new_kextsize; +OSKext::slidePrelinkedExecutable(bool doCoalesedSlides) +{ + OSReturn result = kOSKextReturnBadData; + kernel_mach_header_t * mh = NULL; + kernel_segment_command_t * seg = NULL; + kernel_segment_command_t * linkeditSeg = NULL; + kernel_section_t * sec = NULL; + char * linkeditBase = NULL; + bool haveLinkeditBase = false; + char * relocBase = NULL; + bool haveRelocBase = false; + struct dysymtab_command * dysymtab = NULL; + struct linkedit_data_command * segmentSplitInfo = NULL; + struct symtab_command * symtab = NULL; + kernel_nlist_t * sym = NULL; + struct relocation_info * reloc = NULL; + uint32_t i = 0; + int reloc_size; + vm_offset_t new_kextsize; if (linkedExecutable == NULL || vm_kernel_slide == 0) { result = kOSReturnSuccess; @@ -4785,6 +4858,7 @@ OSKext::slidePrelinkedExecutable() } mh = (kernel_mach_header_t *)linkedExecutable->getBytesNoCopy(); + segmentSplitInfo = (struct linkedit_data_command *) getcommandfromheader(mh, LC_SEGMENT_SPLIT_INFO); for (seg = firstsegfromheader(mh); seg != NULL; seg = nextsegfromheader(mh, seg)) { if (!seg->vmaddr) { @@ -4793,7 +4867,7 @@ OSKext::slidePrelinkedExecutable() seg->vmaddr += vm_kernel_slide; #if KASLR_KEXT_DEBUG - IOLog("kaslr: segname %s unslid 0x%lx slid 0x%lx \n", + IOLog("kaslr: segname %s unslid 0x%lx slid 0x%lx \n", seg->segname, (unsigned 
long)VM_KERNEL_UNSLIDE(seg->vmaddr), (unsigned long)seg->vmaddr); @@ -4812,7 +4886,7 @@ OSKext::slidePrelinkedExecutable() sec->addr += vm_kernel_slide; #if KASLR_KEXT_DEBUG - IOLog("kaslr: sectname %s unslid 0x%lx slid 0x%lx \n", + IOLog("kaslr: sectname %s unslid 0x%lx slid 0x%lx \n", sec->sectname, (unsigned long)VM_KERNEL_UNSLIDE(sec->addr), (unsigned long)sec->addr); @@ -4824,7 +4898,7 @@ OSKext::slidePrelinkedExecutable() symtab = (struct symtab_command *) getcommandfromheader(mh, LC_SYMTAB); - if (symtab != NULL) { + if (symtab != NULL && doCoalesedSlides == false) { /* Some pseudo-kexts have symbol tables without segments. * Ignore them. */ if (symtab->nsyms > 0 && haveLinkeditBase) { @@ -4846,8 +4920,8 @@ OSKext::slidePrelinkedExecutable() } } } - - if (dysymtab != NULL) { + + if (dysymtab != NULL && doCoalesedSlides == false) { if (dysymtab->nextrel > 0) { OSKextLog(this, kOSKextLogErrorLevel | kOSKextLogLoadFlag | @@ -4912,10 +4986,11 @@ OSKext::slidePrelinkedExecutable() /* We should free these relocations, not just delete the reference to them. * Free relocations from PIE kexts. + * + * For now, we do not free LINKEDIT for kexts with split segments. */ new_kextsize = round_page(kmod_info->size - reloc_size); - - if ((kmod_info->size - new_kextsize) > PAGE_SIZE) { + if (((kmod_info->size - new_kextsize) > PAGE_SIZE) && (!segmentSplitInfo)) { vm_offset_t endofkext = kmod_info->address + kmod_info->size; vm_offset_t new_endofkext = kmod_info->address + new_kextsize; vm_offset_t endofrelocInfo = (vm_offset_t) (((uint8_t *)reloc) + reloc_size); @@ -5282,7 +5357,7 @@ OSKext::loadExecutable() result = kOSReturnSuccess; finish: - OSSafeRelease(linkDependencies); + OSSafeReleaseNULL(linkDependencies); /* Clear up locally allocated dependency info. 
*/ @@ -5603,6 +5678,20 @@ OSKext::setVMAttributes(bool protect, bool wire) goto finish; } +#if !VM_MAPPED_KEXTS + if (getcommandfromheader((kernel_mach_header_t *)kmod_info->address, LC_SEGMENT_SPLIT_INFO)) { + /* This is a split kext in a prelinked kernelcache; we'll let the + * platform code take care of protecting it. It is already wired. + */ + /* TODO: Should this still allow protections for the first segment + * to go through, in the event that we have a mix of split and + * unsplit kexts? + */ + result = KERN_SUCCESS; + goto finish; + } +#endif + /* Protect the headers as read-only; they do not need to be wired */ result = (protect) ? OSKext_protect(kext_map, kmod_info->address, kmod_info->address + kmod_info->hdr_size, VM_PROT_READ, TRUE) @@ -5720,22 +5809,52 @@ OSKext::validateKextMapping(bool startFlag) /* Verify that the start/stop function lies within the kext's address range. */ - if (address < kmod_info->address + kmod_info->hdr_size || - kmod_info->address + kmod_info->size <= address) - { - OSKextLog(this, - kOSKextLogErrorLevel | - kOSKextLogLoadFlag, - "Kext %s module %s pointer is outside of kext range " - "(%s %p - kext at %p-%p)..", - getIdentifierCString(), - whichOp, - whichOp, - (void *)VM_KERNEL_UNSLIDE(address), - (void *)VM_KERNEL_UNSLIDE(kmod_info->address), - (void *)(VM_KERNEL_UNSLIDE(kmod_info->address) + kmod_info->size)); - result = kOSKextReturnBadData; - goto finish; + if (getcommandfromheader((kernel_mach_header_t *)kmod_info->address, LC_SEGMENT_SPLIT_INFO)) { + /* This will likely be how we deal with split kexts; walk the segments to + * check that the function lies inside one of the segments of this kext. 
+ */ + for (seg = firstsegfromheader((kernel_mach_header_t *)kmod_info->address); + seg != NULL; + seg = nextsegfromheader((kernel_mach_header_t *)kmod_info->address, seg)) { + if ((address >= seg->vmaddr) && address < (seg->vmaddr + seg->vmsize)) { + break; + } + } + + if (!seg) { + OSKextLog(this, + kOSKextLogErrorLevel | + kOSKextLogLoadFlag, + "Kext %s module %s pointer is outside of kext range " + "(%s %p - kext starts at %p).", + getIdentifierCString(), + whichOp, + whichOp, + (void *)VM_KERNEL_UNSLIDE(address), + (void *)VM_KERNEL_UNSLIDE(kmod_info->address)); + result = kOSKextReturnBadData; + goto finish; + } + + seg = NULL; + } else { + if (address < kmod_info->address + kmod_info->hdr_size || + kmod_info->address + kmod_info->size <= address) + { + OSKextLog(this, + kOSKextLogErrorLevel | + kOSKextLogLoadFlag, + "Kext %s module %s pointer is outside of kext range " + "(%s %p - kext at %p-%p).", + getIdentifierCString(), + whichOp, + whichOp, + (void *)VM_KERNEL_UNSLIDE(address), + (void *)VM_KERNEL_UNSLIDE(kmod_info->address), + (void *)(VM_KERNEL_UNSLIDE(kmod_info->address) + kmod_info->size)); + result = kOSKextReturnBadData; + goto finish; + } } /* Only do these checks before calling the start function; @@ -5818,6 +5937,35 @@ OSKext::verifySegmentMapping(kernel_segment_command_t *seg) return true; } +/********************************************************************* +*********************************************************************/ +static void +OSKextLogKextInfo(OSKext *aKext, uint64_t address, uint64_t size, firehose_tracepoint_code_t code) +{ + + uint64_t stamp = 0; + firehose_tracepoint_id_u trace_id; + struct firehose_trace_uuid_info_s uuid_info_s; + firehose_trace_uuid_info_t uuid_info = &uuid_info_s; + size_t uuid_info_len = sizeof(struct firehose_trace_uuid_info_s); + OSData *uuid_data; + + stamp = firehose_tracepoint_time(firehose_activity_flags_default); + trace_id.ftid_value = 
FIREHOSE_TRACE_ID_MAKE(firehose_tracepoint_namespace_metadata, _firehose_tracepoint_type_metadata_kext, (firehose_tracepoint_flags_t)0, code); + + uuid_data = aKext->copyUUID(); + if (uuid_data) { + memcpy(uuid_info->ftui_uuid, uuid_data->getBytesNoCopy(), sizeof(uuid_info->ftui_uuid)); + OSSafeReleaseNULL(uuid_data); + } + + uuid_info->ftui_size = size; + uuid_info->ftui_address = VM_KERNEL_UNSLIDE(address); + + firehose_trace_metadata(firehose_stream_metadata, trace_id, stamp, uuid_info, uuid_info_len); + return; +} + /********************************************************************* *********************************************************************/ OSReturn @@ -5906,6 +6054,9 @@ OSKext::start(bool startDependenciesFlag) flags.starting = 1; + // Drop a log message so logd can grab the needed information to decode this kext + OSKextLogKextInfo(this, kmod_info->address, kmod_info->size, firehose_tracepoint_code_load); + #if !CONFIG_STATIC_CPPINIT result = OSRuntimeInitializeCPP(kmod_info, NULL); if (result == KERN_SUCCESS) { @@ -5924,7 +6075,6 @@ OSKext::start(bool startDependenciesFlag) } #endif #endif // CONFIG_KEC_FIPS - result = startfunc(kmod_info, kmodStartData); #if !CONFIG_STATIC_CPPINIT @@ -6004,7 +6154,7 @@ OSKext::stop(void) { OSReturn result = kOSReturnError; kern_return_t (*stopfunc)(kmod_info_t *, void *); - + if (!isStarted() || isInterface()) { result = kOSReturnSuccess; goto finish; @@ -6092,6 +6242,8 @@ OSKext::stop(void) } finish: + // Drop a log message so logd can update this kext's metadata + OSKextLogKextInfo(this, kmod_info->address, kmod_info->size, firehose_tracepoint_code_unload); return result; } @@ -6263,6 +6415,7 @@ OSKext::unload(void) freeAccount = NULL; IOSimpleLockLock(sKextAccountsLock); + account->kext = NULL; if (account->site.tag) account->site.flags |= VM_TAG_UNLOAD; else freeAccount = account; IOSimpleLockUnlock(sKextAccountsLock); @@ -6402,7 +6555,7 @@ OSKext::queueKextNotification( result = kOSReturnSuccess; finish: 
- OSSafeRelease(loadRequest); + OSSafeReleaseNULL(loadRequest); return result; } @@ -6678,6 +6831,7 @@ void OSKext::considerUnloads(Boolean rescheduleOnlyFlag) *********************************************************************/ extern "C" { +IOReturn OSKextSystemSleepOrWake(UInt32 messageType); IOReturn OSKextSystemSleepOrWake(UInt32 messageType) { IORecursiveLockLock(sKextInnerLock); @@ -6826,8 +6980,8 @@ OSKext::considerRebuildOfPrelinkedKernel(void) finish: IORecursiveLockUnlock(sKextLock); - OSSafeRelease(prelinkRequest); - OSSafeRelease(kextIterator); + OSSafeReleaseNULL(prelinkRequest); + OSSafeReleaseNULL(kextIterator); return; } @@ -7227,8 +7381,8 @@ OSKext::resolveDependencies( getIdentifierCString()); } - OSSafeRelease(localLoopStack); - OSSafeRelease(libraryIterator); + OSSafeReleaseNULL(localLoopStack); + OSSafeReleaseNULL(libraryIterator); return result; } @@ -7478,7 +7632,7 @@ OSKext::hasOSMetaClassInstances(void) finish: - OSSafeRelease(classIterator); + OSSafeReleaseNULL(classIterator); return result; } @@ -7499,7 +7653,7 @@ OSKext::reportOSMetaClassInstances( theKext->reportOSMetaClassInstances(msgLogSpec); finish: - OSSafeRelease(theKext); + OSSafeReleaseNULL(theKext); return; } @@ -7532,7 +7686,7 @@ OSKext::reportOSMetaClassInstances(OSKextLogSpec msgLogSpec) } finish: - OSSafeRelease(classIterator); + OSSafeReleaseNULL(classIterator); return; } @@ -7723,7 +7877,26 @@ OSKext::handleRequest( } else if (predicate->isEqualTo(kKextRequestPredicateSendResource)) { result = OSKext::dispatchResource(requestDict); - } else if (predicate->isEqualTo(kKextRequestPredicateGetLoaded)) { + } else if (predicate->isEqualTo(kKextRequestPredicateGetUUIDByAddress)) { + + OSNumber *lookupNum = NULL; + lookupNum = OSDynamicCast(OSNumber, + _OSKextGetRequestArgument(requestDict, + kKextRequestArgumentLookupAddressKey)); + + responseObject = OSKext::copyKextUUIDForAddress(lookupNum); + if (responseObject) { + result = kOSReturnSuccess; + } else { + OSKextLog(/* kext 
*/ NULL, + kOSKextLogErrorLevel | + kOSKextLogIPCFlag, + "Get UUID by Address failed."); + goto finish; + } + + } else if (predicate->isEqualTo(kKextRequestPredicateGetLoaded) || + predicate->isEqualTo(kKextRequestPredicateGetLoadedByUUID)) { OSBoolean * delayAutounloadBool = NULL; OSObject * infoKeysRaw = NULL; OSArray * infoKeys = NULL; @@ -7764,7 +7937,12 @@ OSKext::handleRequest( } } - responseObject = OSKext::copyLoadedKextInfo(kextIdentifiers, infoKeys); + if (predicate->isEqualTo(kKextRequestPredicateGetLoaded)) { + responseObject = OSKext::copyLoadedKextInfo(kextIdentifiers, infoKeys); + } + else if (predicate->isEqualTo(kKextRequestPredicateGetLoadedByUUID)) { + responseObject = OSKext::copyLoadedKextInfoByUUID(kextIdentifiers, infoKeys); + } if (!responseObject) { result = kOSKextReturnInternalError; } else { @@ -7884,11 +8062,11 @@ OSKext::handleRequest( IORecursiveLockUnlock(sKextLock); - OSSafeRelease(parsedXML); - OSSafeRelease(errorString); - OSSafeRelease(responseObject); - OSSafeRelease(serializer); - OSSafeRelease(logInfoArray); + OSSafeReleaseNULL(parsedXML); + OSSafeReleaseNULL(errorString); + OSSafeReleaseNULL(responseObject); + OSSafeReleaseNULL(serializer); + OSSafeReleaseNULL(logInfoArray); return result; } @@ -7953,7 +8131,7 @@ void OSKextPgoMetadataPutAll(OSKext *kext, size_t bufferSize, uint32_t *num_pairs) { - assert_static(sizeof(clock_sec_t) % 2 == 0); + _static_assert_1_arg(sizeof(clock_sec_t) % 2 == 0); //log_10 2^16 ≈ 4.82 const size_t max_secs_string_size = 5 * sizeof(clock_sec_t)/2; const size_t max_timestamp_string_size = max_secs_string_size + 1 + 6; @@ -7974,7 +8152,7 @@ void OSKextPgoMetadataPutAll(OSKext *kext, uuid_data = kext->copyUUID(); if (uuid_data) { memcpy(uuid, uuid_data->getBytesNoCopy(), sizeof(uuid)); - OSSafeRelease(uuid_data); + OSSafeReleaseNULL(uuid_data); uuid_unparse(uuid, uuid_string); OSKextPgoMetadataPut(pBuffer, position, bufferSize, num_pairs, "UUID", uuid_string); @@ -7985,7 +8163,7 @@ void 
OSKextPgoMetadataPutAll(OSKext *kext, clock_get_calendar_microtime(&secs, &usecs); assert(usecs < 1000000); char timestamp[max_timestamp_string_size + 1]; - assert_static(sizeof(long) >= sizeof(clock_sec_t)); + _static_assert_1_arg(sizeof(long) >= sizeof(clock_sec_t)); snprintf(timestamp, sizeof(timestamp), "%lu.%06d", (unsigned long)secs, (int)usecs); OSKextPgoMetadataPut(pBuffer, position, bufferSize, num_pairs, "TIMESTAMP", timestamp); @@ -8160,6 +8338,202 @@ OSKextGrabPgoData(uuid_t uuid, return err; } +void +OSKextResetPgoCountersLock() +{ + IORecursiveLockLock(sKextLock); +} + +void +OSKextResetPgoCountersUnlock() +{ + IORecursiveLockUnlock(sKextLock); +} + + +extern unsigned int not_in_kdp; + +void +OSKextResetPgoCounters() +{ + assert(!not_in_kdp); + uint32_t count = sLoadedKexts->getCount(); + for (uint32_t i = 0; i < count; i++) { + OSKext *kext = OSDynamicCast(OSKext, sLoadedKexts->getObject(i)); + kernel_section_t *sect_prf_cnts = kext->lookupSection("__DATA", "__llvm_prf_cnts"); + if (!sect_prf_cnts) { + continue; + } + memset((void*)sect_prf_cnts->addr, 0, sect_prf_cnts->size); + } +} + +OSDictionary * +OSKext::copyLoadedKextInfoByUUID( + OSArray * kextIdentifiers, + OSArray * infoKeys) +{ + OSDictionary * result = NULL; + OSDictionary * kextInfo = NULL; // must release + uint32_t count, i; + uint32_t idCount = 0; + uint32_t idIndex = 0; + + IORecursiveLockLock(sKextLock); + +#if CONFIG_MACF + /* Is the calling process allowed to query kext info? */ + if (current_task() != kernel_task) { + int macCheckResult = 0; + kauth_cred_t cred = NULL; + + cred = kauth_cred_get_with_ref(); + macCheckResult = mac_kext_check_query(cred); + kauth_cred_unref(&cred); + + if (macCheckResult != 0) { + OSKextLog(/* kext */ NULL, + kOSKextLogErrorLevel | kOSKextLogLoadFlag, + "Failed to query kext info (MAC policy error 0x%x).", + macCheckResult); + goto finish; + } + } +#endif + + /* Empty list of UUIDs is equivalent to no list (get all). 
+ */ + if (kextIdentifiers && !kextIdentifiers->getCount()) { + kextIdentifiers = NULL; + } else if (kextIdentifiers) { + idCount = kextIdentifiers->getCount(); + } + + /* Same for keys. + */ + if (infoKeys && !infoKeys->getCount()) { + infoKeys = NULL; + } + + count = sLoadedKexts->getCount(); + result = OSDictionary::withCapacity(count); + if (!result) { + goto finish; + } + + for (i = 0; i < count; i++) { + OSKext *thisKext = NULL; // do not release + Boolean includeThis = true; + uuid_t thisKextUUID; + OSData *uuid_data; + uuid_string_t uuid_key; + + if (kextInfo) { + kextInfo->release(); + kextInfo = NULL; + } + + thisKext = OSDynamicCast(OSKext, sLoadedKexts->getObject(i)); + if (!thisKext) { + continue; + } + + uuid_data = thisKext->copyUUID(); + if (!uuid_data) { + continue; + } + + memcpy(&thisKextUUID, uuid_data->getBytesNoCopy(), sizeof(thisKextUUID)); + OSSafeReleaseNULL(uuid_data); + + uuid_unparse(thisKextUUID, uuid_key); + + /* Skip current kext if we have a list of UUIDs and + * it isn't in the list. 
+ */ + if (kextIdentifiers) { + includeThis = false; + + for (idIndex = 0; idIndex < idCount; idIndex++) { + const OSString* wantedUUID = OSDynamicCast(OSString, + kextIdentifiers->getObject(idIndex)); + + uuid_t uuid; + uuid_parse(wantedUUID->getCStringNoCopy(), uuid); + + if (0 == uuid_compare(uuid, thisKextUUID)) { + includeThis = true; + break; + } + + } + } + + if (!includeThis) { + continue; + } + + kextInfo = thisKext->copyInfo(infoKeys); + if (kextInfo) { + result->setObject(uuid_key, kextInfo); + } + } + +finish: + IORecursiveLockUnlock(sKextLock); + + if (kextInfo) kextInfo->release(); + + return result; +} + +/********************************************************************* +*********************************************************************/ +/* static */ +OSData * +OSKext::copyKextUUIDForAddress(OSNumber *address) +{ + OSKext *kext = NULL; + OSData *uuid = NULL; + vm_address_t vm_addr = 0; + + if (!address) + goto finish; + +#if CONFIG_MACF + /* Is the calling process allowed to query kext info? 
*/ + if (current_task() != kernel_task) { + int macCheckResult = 0; + kauth_cred_t cred = NULL; + + cred = kauth_cred_get_with_ref(); + macCheckResult = mac_kext_check_query(cred); + kauth_cred_unref(&cred); + + if (macCheckResult != 0) { + OSKextLog(/* kext */ NULL, + kOSKextLogErrorLevel | kOSKextLogLoadFlag, + "Failed to query kext UUID (MAC policy error 0x%x).", + macCheckResult); + goto finish; + } + } +#endif + + vm_addr = (vm_address_t)(address->unsigned64BitValue() + vm_kernel_slide); + + kext = OSKext::lookupKextWithAddress(vm_addr); + if (kext) { + uuid = kext->copyUUID(); + } + +finish: + if (kext) { + kext->release(); + } + return uuid; +} + /********************************************************************* *********************************************************************/ @@ -8317,10 +8691,11 @@ OSKext::copyInfo(OSArray * infoKeys) OSDictionary * result = NULL; bool success = false; OSData * headerData = NULL; // must release + OSData * logData = NULL; // must release OSNumber * cpuTypeNumber = NULL; // must release OSNumber * cpuSubtypeNumber = NULL; // must release OSString * versionString = NULL; // do not release - uint32_t executablePathCStringSize = 0; + uint32_t executablePathCStringSize = 0; char * executablePathCString = NULL; // must release OSString * executablePathString = NULL; // must release OSData * uuid = NULL; // must release @@ -8350,6 +8725,7 @@ OSKext::copyInfo(OSArray * infoKeys) */ if (!infoKeys || _OSArrayContainsCString(infoKeys, kOSBundleMachOHeadersKey) || + _OSArrayContainsCString(infoKeys, kOSBundleLogStringsKey) || _OSArrayContainsCString(infoKeys, kOSBundleCPUTypeKey) || _OSArrayContainsCString(infoKeys, kOSBundleCPUSubtypeKey)) { @@ -8425,6 +8801,55 @@ OSKext::copyInfo(OSArray * infoKeys) } #endif // SECURE_KERNEL + if (_OSArrayContainsCString(infoKeys, kOSBundleLogStringsKey)) { + osLogDataHeaderRef *header; + char headerBytes[offsetof(osLogDataHeaderRef, sections) + NUM_OS_LOG_SECTIONS * 
sizeof(header->sections[0])]; + + void *os_log_data = NULL; + void *cstring_data = NULL; + unsigned long os_log_size = 0; + unsigned long cstring_size = 0; + uint32_t os_log_offset = 0; + uint32_t cstring_offset = 0; + bool res; + + os_log_data = getsectdatafromheader(kext_mach_hdr, "__TEXT", "__os_log", &os_log_size); + os_log_offset = getsectoffsetfromheader(kext_mach_hdr, "__TEXT", "__os_log"); + cstring_data = getsectdatafromheader(kext_mach_hdr, "__TEXT", "__cstring", &cstring_size); + cstring_offset = getsectoffsetfromheader(kext_mach_hdr, "__TEXT", "__cstring"); + + header = (osLogDataHeaderRef *) headerBytes; + header->version = OS_LOG_HDR_VERSION; + header->sect_count = NUM_OS_LOG_SECTIONS; + header->sections[OS_LOG_SECT_IDX].sect_offset = os_log_offset; + header->sections[OS_LOG_SECT_IDX].sect_size = (uint32_t) os_log_size; + header->sections[CSTRING_SECT_IDX].sect_offset = cstring_offset; + header->sections[CSTRING_SECT_IDX].sect_size = (uint32_t) cstring_size; + + + logData = OSData::withBytes(header, (u_int) (sizeof(osLogDataHeaderRef))); + if (!logData) { + goto finish; + } + res = logData->appendBytes(&(header->sections[0]), (u_int)(header->sect_count * sizeof(header->sections[0]))); + if (!res) { + goto finish; + } + if (os_log_data) { + res = logData->appendBytes(os_log_data, (u_int)header->sections[OS_LOG_SECT_IDX].sect_size); + if (!res) { + goto finish; + } + } + if (cstring_data) { + res = logData->appendBytes(cstring_data, (u_int)header->sections[CSTRING_SECT_IDX].sect_size); + if (!res) { + goto finish; + } + } + result->setObject(kOSBundleLogStringsKey, logData); + } + if (!infoKeys || _OSArrayContainsCString(infoKeys, kOSBundleCPUTypeKey)) { cpuTypeNumber = OSNumber::withNumber( (uint64_t) kext_mach_hdr->cputype, @@ -8504,7 +8929,7 @@ OSKext::copyInfo(OSArray * infoKeys) executablePathString = OSString::withCString(executablePathCString); - if (!executablePathCString) { + if (!executablePathString) { goto finish; } @@ -8561,15 +8986,19 @@ 
OSKext::copyInfo(OSArray * infoKeys) if (!infoKeys || _OSArrayContainsCString(infoKeys, kOSBundleLoadAddressKey) || _OSArrayContainsCString(infoKeys, kOSBundleLoadSizeKey) || + _OSArrayContainsCString(infoKeys, kOSBundleExecLoadAddressKey) || + _OSArrayContainsCString(infoKeys, kOSBundleExecLoadSizeKey) || _OSArrayContainsCString(infoKeys, kOSBundleWiredSizeKey)) { if (isInterface() || linkedExecutable) { /* These go to userspace via serialization, so we don't want any doubts * about their size. */ - uint64_t loadAddress = 0; - uint32_t loadSize = 0; - uint32_t wiredSize = 0; + uint64_t loadAddress = 0; + uint32_t loadSize = 0; + uint32_t wiredSize = 0; + uint64_t execLoadAddress = 0; + uint32_t execLoadSize = 0; /* Interfaces always report 0 load address & size. * Just the way they roll. @@ -8578,10 +9007,25 @@ OSKext::copyInfo(OSArray * infoKeys) * xxx - shouldn't have one! */ if (linkedExecutable /* && !isInterface() */) { + kernel_mach_header_t *mh = NULL; + kernel_segment_command_t *seg = NULL; + loadAddress = (uint64_t)linkedExecutable->getBytesNoCopy(); + mh = (kernel_mach_header_t *)loadAddress; loadAddress = VM_KERNEL_UNSLIDE(loadAddress); loadSize = linkedExecutable->getLength(); - + + /* Walk through the kext, looking for the first executable + * segment in case we were asked for its size/address. + */ + for (seg = firstsegfromheader(mh); seg != NULL; seg = nextsegfromheader(mh, seg)) { + if (seg->initprot & VM_PROT_EXECUTE) { + execLoadAddress = VM_KERNEL_UNSLIDE(seg->vmaddr); + execLoadSize = seg->vmsize; + break; + } + } + /* If we have a kmod_info struct, calculated the wired size * from that. Otherwise it's the full load size. 
*/ @@ -8602,6 +9046,16 @@ OSKext::copyInfo(OSArray * infoKeys) result->setObject(kOSBundleLoadAddressKey, scratchNumber); OSSafeReleaseNULL(scratchNumber); } + if (!infoKeys || _OSArrayContainsCString(infoKeys, kOSBundleExecLoadAddressKey)) { + scratchNumber = OSNumber::withNumber( + (unsigned long long)(execLoadAddress), + /* numBits */ 8 * sizeof(execLoadAddress)); + if (!scratchNumber) { + goto finish; + } + result->setObject(kOSBundleExecLoadAddressKey, scratchNumber); + OSSafeReleaseNULL(scratchNumber); + } if (!infoKeys || _OSArrayContainsCString(infoKeys, kOSBundleLoadSizeKey)) { scratchNumber = OSNumber::withNumber( (unsigned long long)(loadSize), @@ -8612,6 +9066,16 @@ OSKext::copyInfo(OSArray * infoKeys) result->setObject(kOSBundleLoadSizeKey, scratchNumber); OSSafeReleaseNULL(scratchNumber); } + if (!infoKeys || _OSArrayContainsCString(infoKeys, kOSBundleExecLoadSizeKey)) { + scratchNumber = OSNumber::withNumber( + (unsigned long long)(execLoadSize), + /* numBits */ 8 * sizeof(execLoadSize)); + if (!scratchNumber) { + goto finish; + } + result->setObject(kOSBundleExecLoadSizeKey, scratchNumber); + OSSafeReleaseNULL(scratchNumber); + } if (!infoKeys || _OSArrayContainsCString(infoKeys, kOSBundleWiredSizeKey)) { scratchNumber = OSNumber::withNumber( (unsigned long long)(wiredSize), @@ -8726,19 +9190,20 @@ OSKext::copyInfo(OSArray * infoKeys) success = true; finish: - OSSafeRelease(headerData); - OSSafeRelease(cpuTypeNumber); - OSSafeRelease(cpuSubtypeNumber); - OSSafeRelease(executablePathString); - if (executablePathString) kfree(executablePathCString, executablePathCStringSize); - OSSafeRelease(uuid); - OSSafeRelease(scratchNumber); - OSSafeRelease(dependencyLoadTags); - OSSafeRelease(metaClassIterator); - OSSafeRelease(metaClassInfo); - OSSafeRelease(metaClassDict); - OSSafeRelease(metaClassName); - OSSafeRelease(superclassName); + OSSafeReleaseNULL(headerData); + OSSafeReleaseNULL(logData); + OSSafeReleaseNULL(cpuTypeNumber); + 
OSSafeReleaseNULL(cpuSubtypeNumber); + OSSafeReleaseNULL(executablePathString); + if (executablePathCString) kfree(executablePathCString, executablePathCStringSize); + OSSafeReleaseNULL(uuid); + OSSafeReleaseNULL(scratchNumber); + OSSafeReleaseNULL(dependencyLoadTags); + OSSafeReleaseNULL(metaClassIterator); + OSSafeReleaseNULL(metaClassInfo); + OSSafeReleaseNULL(metaClassDict); + OSSafeReleaseNULL(metaClassName); + OSSafeReleaseNULL(superclassName); if (!success) { OSSafeReleaseNULL(result); } @@ -8953,7 +9418,7 @@ OSKext::dequeueCallbackForRequestTag( callbackRecordOut); finish: - OSSafeRelease(requestTagNum); + OSSafeReleaseNULL(requestTagNum); return result; } @@ -9903,30 +10368,21 @@ inline const char * colorForFlags(OSKextLogSpec flags) switch (logLevel) { case kOSKextLogErrorLevel: return VTRED VTBOLD; - break; case kOSKextLogWarningLevel: return VTRED; - break; case kOSKextLogBasicLevel: return VTYELLOW VTUNDER; - break; case kOSKextLogProgressLevel: return VTYELLOW; - break; case kOSKextLogStepLevel: return VTGREEN; - break; case kOSKextLogDetailLevel: return VTCYAN; - break; case kOSKextLogDebugLevel: return VTMAGENTA; - break; default: return ""; // white - break; } - return ""; } inline bool logSpecMatch( @@ -10079,8 +10535,8 @@ OSKextVLog( if (allocBuffer) { kfree(allocBuffer, (length + 1) * sizeof(char)); } - OSSafeRelease(logString); - OSSafeRelease(logSpecNum); + OSSafeReleaseNULL(logString); + OSSafeReleaseNULL(logSpecNum); return; } @@ -10367,6 +10823,74 @@ OSKext::summaryIsInBacktrace( return FALSE; } +/* static */ +void * +OSKext::kextForAddress( + const void * addr) +{ + void *image = NULL; + u_int i; + +#if !VM_MAPPED_KEXTS + kernel_mach_header_t *mh = NULL; + kernel_segment_command_t *seg = NULL; +#endif + + if (((vm_offset_t)(uintptr_t)addr >= vm_kernel_stext) && + ((vm_offset_t)(uintptr_t)addr < vm_kernel_etext)) { + return (void *)&_mh_execute_header; + } + + if (!sKextSummariesLock) return image; + IOLockLock(sKextSummariesLock); + + if 
(!gLoadedKextSummaries) { + goto finish; + } + + for (i = 0; i < gLoadedKextSummaries->numSummaries; ++i) { + OSKextLoadedKextSummary * summary; + + summary = gLoadedKextSummaries->summaries + i; + if (!summary->address) { + continue; + } + +#if !VM_MAPPED_KEXTS + mh = (kernel_mach_header_t *)summary->address; + + for (seg = firstsegfromheader(mh); seg != NULL; seg = nextsegfromheader(mh, seg)) { + if (((uint64_t)addr >= seg->vmaddr) && + ((uint64_t)addr < (seg->vmaddr + seg->vmsize))) { + image = (void *)summary->address; + break; + } + } + + if (image) { + break; + } +#else + /* On our platforms that use VM_MAPPED_KEXTS, we currently do not + * support split kexts, but we also may unmap the kexts, which can + * race with the above codepath (see OSKext::unload). As such, + * use a simple range lookup if we are using VM_MAPPED_KEXTS. + */ + if (((uint64_t)(uintptr_t)addr >= summary->address) && + ((uint64_t)(uintptr_t)addr < (summary->address + summary->size))) + { + image = (void *)(uintptr_t)summary->address; + break; + } +#endif + } + +finish: + IOLockUnlock(sKextSummariesLock); + + return image; +} + /********************************************************************* * scan list of loaded kext summaries looking for a load address match and if * found return the UUID C string. If not found then set empty string. @@ -10432,14 +10956,16 @@ void OSKext::printSummary( if (pmap_find_phys(kernel_pmap, (addr64_t)((uintptr_t)kmod_ref)) == 0) { (*printf_func)(" kmod dependency scan stopped " - "due to missing dependency page: %p\n", kmod_ref); + "due to missing dependency page: %p\n", + doUnslide ? (void *)VM_KERNEL_UNSLIDE(kmod_ref) : kmod_ref); break; } rinfo = kmod_ref->info; if (pmap_find_phys(kernel_pmap, (addr64_t)((uintptr_t)rinfo)) == 0) { (*printf_func)(" kmod dependency scan stopped " - "due to missing kmod page: %p\n", rinfo); + "due to missing kmod page: %p\n", + doUnslide ? 
(void *)VM_KERNEL_UNSLIDE(rinfo) : rinfo); break; } @@ -10894,15 +11420,12 @@ OSKext::updateLoadedKextSummaries(void) if (gLoadedKextSummaries == NULL || sLoadedKextSummariesAllocSize < size) { if (gLoadedKextSummaries) { - kmem_free(kernel_map, - (vm_offset_t)gLoadedKextSummaries, - sLoadedKextSummariesAllocSize); + kmem_free(kernel_map, (vm_offset_t)gLoadedKextSummaries, sLoadedKextSummariesAllocSize); gLoadedKextSummaries = NULL; + gLoadedKextSummariesTimestamp = mach_absolute_time(); sLoadedKextSummariesAllocSize = 0; } - result = kmem_alloc(kernel_map, - (vm_offset_t*)&summaryHeaderAlloc, - size, VM_KERN_MEMORY_OSKEXT); + result = kmem_alloc(kernel_map, (vm_offset_t *)&summaryHeaderAlloc, size, VM_KERN_MEMORY_OSKEXT); if (result != KERN_SUCCESS) goto finish; summaryHeader = summaryHeaderAlloc; summarySize = size; @@ -10969,14 +11492,16 @@ OSKext::updateLoadedKextSummaries(void) start = (vm_map_offset_t) summaryHeader; end = start + summarySize; - + result = vm_map_protect(kernel_map, start, end, VM_PROT_READ, FALSE); - if (result != KERN_SUCCESS) goto finish; - + if (result != KERN_SUCCESS) + goto finish; + gLoadedKextSummaries = summaryHeader; + gLoadedKextSummariesTimestamp = mach_absolute_time(); sLoadedKextSummariesAllocSize = summarySize; summaryHeaderAlloc = NULL; - + /* Call the magic breakpoint function through a static function pointer so * the compiler can't optimize the function away. 
*/ @@ -11018,7 +11543,7 @@ OSKext::updateLoadedKextSummary(OSKextLoadedKextSummary *summary) uuid = copyUUID(); if (uuid) { memcpy(summary->uuid, uuid->getBytesNoCopy(), sizeof(summary->uuid)); - OSSafeRelease(uuid); + OSSafeReleaseNULL(uuid); } summary->address = kmod_info->address; @@ -11035,14 +11560,43 @@ OSKext::updateLoadedKextSummary(OSKextLoadedKextSummary *summary) *********************************************************************/ void -OSKext::updateActiveAccount(OSKextActiveAccount *account) +OSKext::updateActiveAccount(OSKextActiveAccount *accountp) { - bzero(account, sizeof(*account)); - account->address = kmod_info->address; - if (account->address) { - account->address_end = kmod_info->address + kmod_info->size; + kernel_mach_header_t *hdr = NULL; + kernel_segment_command_t *seg = NULL; + + hdr = (kernel_mach_header_t *)kmod_info->address; + + if (getcommandfromheader(hdr, LC_SEGMENT_SPLIT_INFO)) { + /* If this kext supports split segments, use the first + * executable segment as the range for instructions + * (and thus for backtracing. + */ + for (seg = firstsegfromheader(hdr); seg != NULL; seg = nextsegfromheader(hdr, seg)) { + if (seg->initprot & VM_PROT_EXECUTE) { + break; + } + } + } + + bzero(accountp, sizeof(*accountp)); + if (seg) { + accountp->address = seg->vmaddr; + if (accountp->address) { + accountp->address_end = seg->vmaddr + seg->vmsize; + } + } else { + /* For non-split kexts and for kexts without executable + * segments, just use the kmod_info range (as the kext + * is either all in one range or should not show up in + * instruction backtraces). 
+ */ + accountp->address = kmod_info->address; + if (accountp->address) { + accountp->address_end = kmod_info->address + kmod_info->size; + } } - account->account = this->account; + accountp->account = this->account; } extern "C" const vm_allocation_site_t * @@ -11079,9 +11633,18 @@ OSKextGetAllocationSiteForCaller(uintptr_t address) } extern "C" uint32_t -OSKextGetKmodIDForSite(vm_allocation_site_t * site) +OSKextGetKmodIDForSite(vm_allocation_site_t * site, char * name, vm_size_t namelen) { OSKextAccount * account = (typeof(account)) site; + const char * kname; + + if (name) + { + if (account->kext) kname = account->kext->getIdentifierCString(); + else kname = "<>"; + strlcpy(name, kname, namelen); + } + return (account->loadTag); } @@ -11107,22 +11670,30 @@ OSKextFreeSite(vm_allocation_site_t * site) static void * GetAppleTEXTHashForKext(OSKext * theKext, OSDictionary *theInfoDict) { - AppleTEXTHash_t my_ath = {1, 0, NULL}; + AppleTEXTHash_t my_ath = {2, 0, NULL}; AppleTEXTHash_t * my_athp = NULL; // do not release - OSDictionary * textHashDict = NULL; // do not release OSData * segmentHash = NULL; // do not release if (theKext == NULL || theInfoDict == NULL) { return(NULL); } - textHashDict = OSDynamicCast(OSDictionary, theInfoDict->getObject(kAppleTextHashesKey)); - if (textHashDict == NULL) { - return(NULL); - } - - segmentHash = OSDynamicCast(OSData, - textHashDict->getObject(ARCHNAME)); + // Get the part of the plist associate with kAppleTextHashesKey and let + // the crypto library do further parsing (slice/architecture) + segmentHash = OSDynamicCast(OSData, theInfoDict->getObject(kAppleTextHashesKey)); + // Support for ATH v1 while rolling out ATH v2 without revision locking submissions + // Remove this when v2 PLIST are supported + if (segmentHash == NULL) { + // If this fails, we may be dealing with a v1 PLIST + OSDictionary * textHashDict = NULL; // do not release + textHashDict = OSDynamicCast(OSDictionary, 
theInfoDict->getObject(kAppleTextHashesKey)); + if (textHashDict == NULL) { + return(NULL); + } + my_ath.ath_version=1; + segmentHash = OSDynamicCast(OSData,textHashDict->getObject(ARCHNAME)); + } // end of v2 rollout + if (segmentHash == NULL) { return(NULL); } diff --git a/libkern/c++/OSMetaClass.cpp b/libkern/c++/OSMetaClass.cpp index b32ab8c9b..5e25aa5fd 100644 --- a/libkern/c++/OSMetaClass.cpp +++ b/libkern/c++/OSMetaClass.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2006 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -392,7 +392,19 @@ OSMetaClass::OSMetaClass( reserved = IONew(ExpansionData, 1); bzero(reserved, sizeof(ExpansionData)); #if IOTRACKING - reserved->tracking = IOTrackingQueueAlloc(inClassName, inClassSize, 0, true); + uint32_t numSiteQs = 0; + if ((this == &OSSymbol ::gMetaClass) + || (this == &OSString ::gMetaClass) + || (this == &OSNumber ::gMetaClass) + || (this == &OSString ::gMetaClass) + || (this == &OSData ::gMetaClass) + || (this == &OSDictionary::gMetaClass) + || (this == &OSArray ::gMetaClass) + || (this == &OSSet ::gMetaClass)) numSiteQs = 27; + + reserved->tracking = IOTrackingQueueAlloc(inClassName, (uintptr_t) this, + inClassSize, 0, kIOTrackingQueueTypeAlloc, + numSiteQs); #endif /* Hack alert: We are just casting inClassName and storing it in @@ -591,6 +603,7 @@ OSMetaClass::postModLoad(void * loadHandle) case kNoDictionaries: sBootstrapState = kMakingDictionaries; // No break; fall through + [[clang::fallthrough]]; case kMakingDictionaries: sAllClassesDict = OSDictionary::withCapacity(kClassCapacityIncrement); @@ -600,7 +613,8 @@ OSMetaClass::postModLoad(void * loadHandle) } sAllClassesDict->setOptions(OSCollection::kSort, OSCollection::kSort); - // No break; fall through + // No break; fall through + [[clang::fallthrough]]; case kCompletedBootstrap: { @@ -711,8 +725,8 @@ OSMetaClass::postModLoad(void * loadHandle) 
OSMetaClassLogErrorForKext(result, myKext); } - OSSafeRelease(myKextName); - OSSafeRelease(myKext); + OSSafeReleaseNULL(myKextName); + OSSafeReleaseNULL(myKext); if (sStalled) { OSMETA_ACCUMSIZE(-(sStalled->capacity * sizeof(OSMetaClass *) + @@ -774,7 +788,7 @@ OSMetaClass::modHasInstance(const char * kextIdentifier) result = theKext->hasOSMetaClassInstances(); finish: - OSSafeRelease(theKext); + OSSafeReleaseNULL(theKext); return result; } @@ -1191,7 +1205,7 @@ OSMetaClass::serializeClassDictionary(OSDictionary * serializeDictionary) } while (0); finish: - OSSafeRelease(classDict); + OSSafeReleaseNULL(classDict); IOLockUnlock(sAllClassesLock); @@ -1254,4 +1268,4 @@ IOTrackingQueue * OSMetaClass::getTracking() const return (reserved->tracking); } -#endif /* IOTRACKING */ \ No newline at end of file +#endif /* IOTRACKING */ diff --git a/libkern/c++/OSObject.cpp b/libkern/c++/OSObject.cpp index 45652a1ca..2928456f5 100644 --- a/libkern/c++/OSObject.cpp +++ b/libkern/c++/OSObject.cpp @@ -216,6 +216,18 @@ void OSObject::retain() const taggedRetain(0); } +extern "C" void +osobject_retain(void * object) +{ + ((OSObject *)object)->retain(); +} + +extern "C" void +osobject_release(void * object) +{ + ((OSObject *)object)->release(); +} + void OSObject::release(int when) const { taggedRelease(0, when); diff --git a/libkern/c++/OSOrderedSet.cpp b/libkern/c++/OSOrderedSet.cpp index cd9e4477a..ccf2c542b 100644 --- a/libkern/c++/OSOrderedSet.cpp +++ b/libkern/c++/OSOrderedSet.cpp @@ -120,7 +120,8 @@ unsigned int OSOrderedSet::setCapacityIncrement(unsigned int increment) unsigned int OSOrderedSet::ensureCapacity(unsigned int newCapacity) { _Element *newArray; - unsigned int finalCapacity, oldSize, newSize; + unsigned int finalCapacity; + vm_size_t oldSize, newSize; if (newCapacity <= capacity) return capacity; @@ -134,8 +135,11 @@ unsigned int OSOrderedSet::ensureCapacity(unsigned int newCapacity) } newSize = sizeof(_Element) * finalCapacity; - newArray = (_Element *) 
kalloc_container(newSize); + newArray = (_Element *) kallocp_container(&newSize); if (newArray) { + // use all of the actual allocation size + finalCapacity = newSize / sizeof(_Element); + oldSize = sizeof(_Element) * capacity; OSCONTAINER_ACCUMSIZE(((size_t)newSize) - ((size_t)oldSize)); diff --git a/libkern/c++/OSRuntime.cpp b/libkern/c++/OSRuntime.cpp index d8841a9ee..e6cf48ba7 100644 --- a/libkern/c++/OSRuntime.cpp +++ b/libkern/c++/OSRuntime.cpp @@ -87,36 +87,28 @@ static bool gKernelCPPInitialized = false; extern int debug_iomalloc_size; #endif -struct _mhead { - size_t mlen; - char dat[0]; -}; - /********************************************************************* *********************************************************************/ void * kern_os_malloc(size_t size) { - struct _mhead * mem; - size_t memsize = sizeof (*mem) + size ; - + void *mem; if (size == 0) { return (0); } - mem = (struct _mhead *)kalloc_tag_bt(memsize, VM_KERN_MEMORY_LIBKERN); + mem = kallocp_tag_bt((vm_size_t *)&size, VM_KERN_MEMORY_LIBKERN); if (!mem) { return (0); } #if OSALLOCDEBUG - debug_iomalloc_size += memsize; + OSAddAtomic(size, &debug_iomalloc_size); #endif - mem->mlen = memsize; - bzero(mem->dat, size); + bzero(mem, size); - return mem->dat; + return mem; } /********************************************************************* @@ -124,24 +116,13 @@ kern_os_malloc(size_t size) void kern_os_free(void * addr) { - struct _mhead * hdr; - - if (!addr) { - return; - } - - hdr = (struct _mhead *)addr; - hdr--; - + size_t size; + size = kalloc_size(addr); #if OSALLOCDEBUG - debug_iomalloc_size -= hdr->mlen; + OSAddAtomic(-size, &debug_iomalloc_size); #endif -#if 0 - memset((vm_offset_t)hdr, 0xbb, hdr->mlen); -#else - kfree(hdr, hdr->mlen); -#endif + kfree_addr(addr); } /********************************************************************* @@ -151,60 +132,40 @@ kern_os_realloc( void * addr, size_t nsize) { - struct _mhead * ohdr; - struct _mhead * nmem; - size_t nmemsize, osize; + 
void *nmem; + size_t osize; if (!addr) { return (kern_os_malloc(nsize)); } - ohdr = (struct _mhead *)addr; - ohdr--; - osize = ohdr->mlen - sizeof(*ohdr); + osize = kalloc_size(addr); if (nsize == osize) { return (addr); } if (nsize == 0) { - kern_os_free(addr); + kfree_addr(addr); return (0); } - nmemsize = sizeof (*nmem) + nsize ; - nmem = (struct _mhead *) kalloc_tag_bt(nmemsize, VM_KERN_MEMORY_LIBKERN); + nmem = kallocp_tag_bt((vm_size_t *)&nsize, VM_KERN_MEMORY_LIBKERN); if (!nmem){ - kern_os_free(addr); + kfree_addr(addr); return (0); } #if OSALLOCDEBUG - debug_iomalloc_size += (nmemsize - ohdr->mlen); + OSAddAtomic((nsize - osize), &debug_iomalloc_size); #endif - nmem->mlen = nmemsize; if (nsize > osize) { - (void) memset(&nmem->dat[osize], 0, nsize - osize); - } - (void)memcpy(nmem->dat, ohdr->dat, (nsize > osize) ? osize : nsize); - kfree(ohdr, ohdr->mlen); - - return (nmem->dat); -} - -/********************************************************************* -*********************************************************************/ -size_t -kern_os_malloc_size(void * addr) -{ - struct _mhead * hdr; - - if (!addr) { - return(0); + (void)memset((char *)nmem + osize, 0, nsize - osize); } + (void)memcpy(nmem, addr, (nsize > osize) ? osize : nsize); + kfree_addr(addr); - hdr = (struct _mhead *) addr; hdr--; - return hdr->mlen - sizeof (struct _mhead); + return (nmem); } #if PRAGMA_MARK @@ -306,7 +267,7 @@ OSRuntimeUnloadCPPForSegmentInKmod( } /* if (strncmp...) */ } /* for (section...) 
*/ - OSSafeRelease(theKext); + OSSafeReleaseNULL(theKext); return; } @@ -391,7 +352,7 @@ OSRuntimeFinalizeCPP( } result = KMOD_RETURN_SUCCESS; finish: - OSSafeRelease(theKext); + OSSafeReleaseNULL(theKext); return result; } @@ -524,7 +485,7 @@ OSRuntimeInitializeCPP( theKext->setCPPInitialized(true); } finish: - OSSafeRelease(theKext); + OSSafeReleaseNULL(theKext); return result; } diff --git a/libkern/c++/OSSerialize.cpp b/libkern/c++/OSSerialize.cpp index 38696bc24..e2d93058e 100644 --- a/libkern/c++/OSSerialize.cpp +++ b/libkern/c++/OSSerialize.cpp @@ -70,42 +70,40 @@ void OSSerialize::clearText() bzero((void *)data, capacity); length = 1; } - tag = 0; tags->flushCollection(); } bool OSSerialize::previouslySerialized(const OSMetaClassBase *o) { char temp[16]; - OSString *tagString; + unsigned int tagIdx; if (binary) return (binarySerialize(o)); // look it up - tagString = (OSString *)tags->getObject((const OSSymbol *) o); + tagIdx = tags->getNextIndexOfObject(o, 0); // xx-review: no error checking here for addString calls! // does it exist? 
- if (tagString) { + if (tagIdx != -1U) { addString("getCStringNoCopy()); + snprintf(temp, sizeof(temp), "%u", tagIdx); + addString(temp); addString("\"/>"); return true; } - // build a tag - snprintf(temp, sizeof(temp), "%u", tag++); - tagString = OSString::withCString(temp); - - // add to tag dictionary - tags->setObject((const OSSymbol *) o, tagString);// XXX check return - tagString->release(); + // add to tag array + tags->setObject(o);// XXX check return return false; } bool OSSerialize::addXMLStartTag(const OSMetaClassBase *o, const char *tagString) { + char temp[16]; + unsigned int tagIdx; + if (binary) { printf("class %s: xml serialize\n", o->getMetaClass()->getClassName()); @@ -115,7 +113,10 @@ bool OSSerialize::addXMLStartTag(const OSMetaClassBase *o, const char *tagString if (!addChar('<')) return false; if (!addString(tagString)) return false; if (!addString(" ID=\"")) return false; - if (!addString(((OSString *)tags->getObject((const OSSymbol *)o))->getCStringNoCopy())) + tagIdx = tags->getNextIndexOfObject(o, 0); + assert(tagIdx != -1U); + snprintf(temp, sizeof(temp), "%u", tagIdx); + if (!addString(temp)) return false; if (!addChar('\"')) return false; if (!addChar('>')) return false; @@ -164,14 +165,22 @@ bool OSSerialize::initWithCapacity(unsigned int inCapacity) if (!super::init()) return false; - tags = OSDictionary::withCapacity(32); + tags = OSArray::withCapacity(256); if (!tags) { return false; } - tag = 0; length = 1; - capacity = (inCapacity) ? 
round_page_32(inCapacity) : round_page_32(1); + + if (!inCapacity) { + inCapacity = 1; + } + if (round_page_overflow(inCapacity, &capacity)) { + tags->release(); + tags = 0; + return false; + } + capacityIncrement = capacity; // allocate from the kernel map so that we can safely map this data @@ -219,8 +228,9 @@ unsigned int OSSerialize::ensureCapacity(unsigned int newCapacity) if (newCapacity <= capacity) return capacity; - // round up - newCapacity = round_page_32(newCapacity); + if (round_page_overflow(newCapacity, &newCapacity)) { + return capacity; + } kern_return_t rc = kmem_realloc(kernel_map, (vm_offset_t)data, diff --git a/libkern/c++/OSSerializeBinary.cpp b/libkern/c++/OSSerializeBinary.cpp index 66436dfe3..51bd067dc 100644 --- a/libkern/c++/OSSerializeBinary.cpp +++ b/libkern/c++/OSSerializeBinary.cpp @@ -74,7 +74,7 @@ bool OSSerialize::addBinary(const void * bits, size_t size) if (newCapacity >= capacity) { newCapacity = (((newCapacity - 1) / capacityIncrement) + 1) * capacityIncrement; - if (newCapacity < ensureCapacity(newCapacity)) return (false); + if (newCapacity > ensureCapacity(newCapacity)) return (false); } bcopy(bits, &data[length], size); @@ -88,21 +88,16 @@ bool OSSerialize::addBinaryObject(const OSMetaClassBase * o, uint32_t key, { unsigned int newCapacity; size_t alignSize; - OSNumber * tagNum; - // build a tag - tagNum = OSNumber::withNumber(tag, 32); - tag++; - // add to tag dictionary - tags->setObject((const OSSymbol *) o, tagNum); - tagNum->release(); + // add to tag array + tags->setObject(o); alignSize = ((size + sizeof(key) + 3) & ~3L); newCapacity = length + alignSize; if (newCapacity >= capacity) { newCapacity = (((newCapacity - 1) / capacityIncrement) + 1) * capacityIncrement; - if (newCapacity < ensureCapacity(newCapacity)) return (false); + if (newCapacity > ensureCapacity(newCapacity)) return (false); } if (endCollection) @@ -126,19 +121,19 @@ bool OSSerialize::binarySerialize(const OSMetaClassBase *o) OSNumber * num; 
OSSymbol * sym; OSString * str; - OSData * data; + OSData * ldata; OSBoolean * boo; - OSNumber * tagNum; + unsigned int tagIdx; uint32_t i, key; size_t len; bool ok; - tagNum = (OSNumber *)tags->getObject((const OSSymbol *) o); + tagIdx = tags->getNextIndexOfObject(o, 0); // does it exist? - if (tagNum) + if (-1U != tagIdx) { - key = (kOSSerializeObject | tagNum->unsigned32BitValue()); + key = (kOSSerializeObject | tagIdx); if (endCollection) { endCollection = false; @@ -158,9 +153,9 @@ bool OSSerialize::binarySerialize(const OSMetaClassBase *o) const OSMetaClassBase * dictValue; const OSMetaClassBase * nvalue = 0; + dictKey = dict->dictionary[i].key; + dictValue = dict->dictionary[i].value; i++; - dictKey = dict->dictionary[i-1].key; - dictValue = dict->dictionary[i-1].value; if (editor) { dictValue = nvalue = (*editor)(editRef, this, dict, dictKey, dictValue); @@ -223,12 +218,12 @@ bool OSSerialize::binarySerialize(const OSMetaClassBase *o) key = (kOSSerializeString | len); ok = addBinaryObject(o, key, str->getCStringNoCopy(), len); } - else if ((data = OSDynamicCast(OSData, o))) + else if ((ldata = OSDynamicCast(OSData, o))) { - len = data->getLength(); - if (data->reserved && data->reserved->disableSerialization) len = 0; + len = ldata->getLength(); + if (ldata->reserved && ldata->reserved->disableSerialization) len = 0; key = (kOSSerializeData | len); - ok = addBinaryObject(o, key, data->getBytesNoCopy(), len); + ok = addBinaryObject(o, key, ldata->getBytesNoCopy(), len); } else return (false); @@ -237,20 +232,27 @@ bool OSSerialize::binarySerialize(const OSMetaClassBase *o) /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ -#define setAtIndex(v, idx, o) \ - if (idx >= v##Capacity) \ - { \ - uint32_t ncap = v##Capacity + 64; \ - typeof(v##Array) nbuf = (typeof(v##Array)) kalloc_container(ncap * sizeof(o)); \ - if (!nbuf) ok = false; \ - if (v##Array) \ - { \ - bcopy(v##Array, nbuf, v##Capacity * sizeof(o)); \ - 
kfree(v##Array, v##Capacity * sizeof(o)); \ - } \ - v##Array = nbuf; \ - v##Capacity = ncap; \ - } \ +#define setAtIndex(v, idx, o) \ + if (idx >= v##Capacity) \ + { \ + if (v##Capacity >= v##CapacityMax) ok = false; \ + else \ + { \ + uint32_t ncap = v##Capacity + 64; \ + typeof(v##Array) nbuf = (typeof(v##Array)) kalloc_container(ncap * sizeof(o)); \ + if (!nbuf) ok = false; \ + else \ + { \ + if (v##Array) \ + { \ + bcopy(v##Array, nbuf, v##Capacity * sizeof(o)); \ + kfree(v##Array, v##Capacity * sizeof(o)); \ + } \ + v##Array = nbuf; \ + v##Capacity = ncap; \ + } \ + } \ + } \ if (ok) v##Array[idx] = o; /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ @@ -260,10 +262,12 @@ OSUnserializeBinary(const char *buffer, size_t bufferSize, OSString **errorStrin { OSObject ** objsArray; uint32_t objsCapacity; + enum { objsCapacityMax = 16*1024*1024 }; uint32_t objsIdx; OSObject ** stackArray; uint32_t stackCapacity; + enum { stackCapacityMax = 64*1024 }; uint32_t stackIdx; OSObject * result; @@ -286,9 +290,9 @@ OSUnserializeBinary(const char *buffer, size_t bufferSize, OSString **errorStrin bool ok; if (errorString) *errorString = 0; + if (bufferSize < sizeof(kOSSerializeBinarySignature)) return (NULL); if (0 != strcmp(kOSSerializeBinarySignature, buffer)) return (NULL); if (3 & ((uintptr_t) buffer)) return (NULL); - if (bufferSize < sizeof(kOSSerializeBinarySignature)) return (NULL); bufferPos = sizeof(kOSSerializeBinarySignature); next = (typeof(next)) (((uintptr_t) buffer) + bufferPos); @@ -338,13 +342,13 @@ OSUnserializeBinary(const char *buffer, size_t bufferSize, OSString **errorStrin case kOSSerializeObject: if (len >= objsIdx) break; o = objsArray[len]; - o->retain(); isRef = true; break; case kOSSerializeNumber: bufferPos += sizeof(long long); if (bufferPos > bufferSize) break; + if ((len != 32) && (len != 64) && (len != 16) && (len != 8)) break; value = next[1]; value <<= 32; value |= next[0]; @@ -355,6 +359,7 @@ 
OSUnserializeBinary(const char *buffer, size_t bufferSize, OSString **errorStrin case kOSSerializeSymbol: bufferPos += (wordLen * sizeof(uint32_t)); if (bufferPos > bufferSize) break; + if (len < 2) break; if (0 != ((const char *)next)[len-1]) break; o = (OSObject *) OSSymbol::withCString((const char *) next); next += wordLen; @@ -387,42 +392,36 @@ OSUnserializeBinary(const char *buffer, size_t bufferSize, OSString **errorStrin if (!isRef) { setAtIndex(objs, objsIdx, o); - if (!ok) break; + if (!ok) + { + o->release(); + break; + } objsIdx++; } if (dict) { - if (sym) - { - DEBG("%s = %s\n", sym->getCStringNoCopy(), o->getMetaClass()->getClassName()); - if (o != dict) ok = dict->setObject(sym, o, true); - o->release(); - sym->release(); - sym = 0; - } - else + if (!sym) sym = (OSSymbol *) o; + else { - sym = OSDynamicCast(OSSymbol, o); - if (!sym && (str = OSDynamicCast(OSString, o))) + str = sym; + sym = OSDynamicCast(OSSymbol, sym); + if (!sym && (str = OSDynamicCast(OSString, str))) { sym = (OSSymbol *) OSSymbol::withString(str); - o->release(); - o = 0; + ok = (sym != 0); + if (!ok) break; } - ok = (sym != 0); + DEBG("%s = %s\n", sym->getCStringNoCopy(), o->getMetaClass()->getClassName()); + if (o != dict) ok = dict->setObject(sym, o); + if (sym && (sym != str)) sym->release(); + sym = 0; } } - else if (array) - { - ok = array->setObject(o); - o->release(); - } - else if (set) - { - ok = set->setObject(o); - o->release(); - } + else if (array) ok = array->setObject(o); + else if (set) ok = set->setObject(o); + else if (result) ok = false; else { assert(!parent); @@ -464,13 +463,14 @@ OSUnserializeBinary(const char *buffer, size_t bufferSize, OSString **errorStrin } DEBG("ret %p\n", result); - if (objsCapacity) kfree(objsArray, objsCapacity * sizeof(*objsArray)); - if (stackCapacity) kfree(stackArray, stackCapacity * sizeof(*stackArray)); + if (!ok) result = 0; - if (!ok && result) + if (objsCapacity) { - result->release(); - result = 0; - } + for (len = (result 
!= 0); len < objsIdx; len++) objsArray[len]->release(); + kfree(objsArray, objsCapacity * sizeof(*objsArray)); + } + if (stackCapacity) kfree(stackArray, stackCapacity * sizeof(*stackArray)); + return (result); -} \ No newline at end of file +} diff --git a/libkern/c++/OSSet.cpp b/libkern/c++/OSSet.cpp index 0cb188567..644b9dabf 100644 --- a/libkern/c++/OSSet.cpp +++ b/libkern/c++/OSSet.cpp @@ -156,9 +156,10 @@ OSSet *OSSet::withSet(const OSSet *set, void OSSet::free() { - (void) members->super::setOptions(0, kImmutable); - if (members) + if (members) { + (void) members->super::setOptions(0, kImmutable); members->release(); + } super::free(); } diff --git a/libkern/c++/OSString.cpp b/libkern/c++/OSString.cpp index 2bd875ee6..835e5f25e 100644 --- a/libkern/c++/OSString.cpp +++ b/libkern/c++/OSString.cpp @@ -68,7 +68,10 @@ bool OSString::initWithCString(const char *cString) if (!cString || !super::init()) return false; - newLength = strlen(cString) + 1; + newLength = strnlen(cString, kMaxStringLength); + if (newLength >= kMaxStringLength) return false; + + newLength++; newString = (char *) kalloc_container(newLength); if (!newString) return false; @@ -94,6 +97,8 @@ bool OSString::initWithStringOfLength(const char *cString, size_t inlength) if (!cString || !super::init()) return false; + if (inlength >= kMaxStringLength) return false; + newLength = inlength + 1; newString = (char *) kalloc_container(newLength); if (!newString) return false; @@ -120,7 +125,10 @@ bool OSString::initWithCStringNoCopy(const char *cString) if (!cString || !super::init()) return false; - length = strlen(cString) + 1; + length = strnlen(cString, kMaxStringLength); + if (length >= kMaxStringLength) return false; + + length++; flags |= kOSStringNoCopy; string = const_cast(cString); diff --git a/libkern/c++/OSSymbol.cpp b/libkern/c++/OSSymbol.cpp index 7b3f21408..a521f5cea 100644 --- a/libkern/c++/OSSymbol.cpp +++ b/libkern/c++/OSSymbol.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple 
Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -98,14 +98,14 @@ class OSSymbolPool static void *operator new(size_t size); static void operator delete(void *mem, size_t size); - OSSymbolPool() { }; + OSSymbolPool() { } OSSymbolPool(const OSSymbolPool *old); virtual ~OSSymbolPool(); bool init(); - inline void closeGate() { lck_mtx_lock(poolGate); }; - inline void openGate() { lck_mtx_unlock(poolGate); }; + inline void closeGate() { lck_mtx_lock(poolGate); } + inline void openGate() { lck_mtx_unlock(poolGate); } OSSymbol *findSymbol(const char *cString) const; OSSymbol *insertSymbol(OSSymbol *sym); diff --git a/libkern/c++/OSUnserializeXML.cpp b/libkern/c++/OSUnserializeXML.cpp index b694f44ec..37c40da4d 100644 --- a/libkern/c++/OSUnserializeXML.cpp +++ b/libkern/c++/OSUnserializeXML.cpp @@ -2014,6 +2014,7 @@ getTag(parser_state_t *state, if (c == '\n') state->lineNumber++; if (c != '?') continue; c = nextChar(); + if (!c) return TAG_IGNORE; if (c == '>') { (void)nextChar(); return TAG_IGNORE; @@ -2068,6 +2069,7 @@ getTag(parser_state_t *state, values[*attributeCount][length++] = c; if (length >= (TAG_MAX_LENGTH - 1)) return TAG_BAD; c = nextChar(); + if (!c) return TAG_BAD; } values[*attributeCount][length] = 0; diff --git a/libkern/conf/Makefile b/libkern/conf/Makefile index 76db9a7d8..7bd79d9ae 100644 --- a/libkern/conf/Makefile +++ b/libkern/conf/Makefile @@ -37,7 +37,7 @@ do_all: $(TARGET)/$(CURRENT_KERNEL_CONFIG)/Makefile OBJPATH=${OBJPATH} \ build_all; -do_build_all:: do_all +do_build_all:: do_all include $(MakeInc_rule) include $(MakeInc_dir) diff --git a/libkern/conf/Makefile.template b/libkern/conf/Makefile.template index 23bbd4f6a..f434fc023 100644 --- a/libkern/conf/Makefile.template +++ b/libkern/conf/Makefile.template @@ -22,22 +22,22 @@ CFLAGS+= -include meta_features.h -DLIBKERN_KERNEL_PRIVATE -DOSALLOCDEBUG=1 OSKextLib.cpo_CXXWARNFLAGS_ADD = -Wno-cast-align 
OSKext.cpo_CXXWARNFLAGS_ADD = -Wno-cast-align OSMetaClass.cpo_CXXWARNFLAGS_ADD = -Wno-cast-align -OSUnserialize.cpo_CXXWARNFLAGS_ADD = -Wno-cast-align +OSRuntime.cpo_CXXWARNFLAGS_ADD += -Wno-missing-prototypes +OSUnserialize.cpo_CXXWARNFLAGS_ADD = -Wno-cast-align -Wno-unreachable-code-break corecrypto_md5.o_CWARNFLAGS_ADD = -Wno-cast-align corecrypto_sha1.o_CWARNFLAGS_ADD = -Wno-cast-align # zlib is 3rd party source -compress.o_CWARNFLAGS_ADD = -Wno-cast-qual -deflate.o_CWARNFLAGS_ADD = -Wno-cast-qual -infback.o_CWARNFLAGS_ADD = -Wno-cast-qual -inffast.o_CWARNFLAGS_ADD = -Wno-cast-qual -inflate.o_CWARNFLAGS_ADD = -Wno-cast-qual -trees.o_CWARNFLAGS_ADD = -Wno-cast-qual -uncompr.o_CWARNFLAGS_ADD = -Wno-cast-qual +compress.o_CWARNFLAGS_ADD = -Wno-cast-qual +deflate.o_CWARNFLAGS_ADD = -Wno-cast-qual +infback.o_CWARNFLAGS_ADD = -Wno-cast-qual +inffast.o_CWARNFLAGS_ADD = -Wno-cast-qual +inflate.o_CWARNFLAGS_ADD = -Wno-cast-qual +trees.o_CWARNFLAGS_ADD = -Wno-cast-qual +uncompr.o_CWARNFLAGS_ADD = -Wno-cast-qual # warnings in bison-generated code -OSUnserializeXML.cpo_CXXWARNFLAGS_ADD = -Wno-uninitialized -OSUnserializeXML.cpo_CXXWARNFLAGS_ADD += -Wno-unreachable-code +OSUnserializeXML.cpo_CXXWARNFLAGS_ADD += -Wno-uninitialized -Wno-unreachable-code -Wno-unreachable-code-break OSUnserialize.cpo_CXXWARNFLAGS_ADD += -Wno-unreachable-code # Runtime support functions don't interact well with LTO (9294679) @@ -46,7 +46,7 @@ stack_protector.o_CFLAGS_ADD += $(CFLAGS_NOLTO_FLAG) # # Directories for mig generated files # -COMP_SUBDIRS = +COMP_SUBDIRS = # # Make sure we don't remove this by accident if interrupted at the wrong @@ -72,24 +72,6 @@ COMP_SUBDIRS = %MACHDEP -# -# Machine-independent per-file flags -# - -# zlib is 3rd party source -compress.o_CWARNFLAGS_ADD = -Wno-cast-qual -deflate.o_CWARNFLAGS_ADD = -Wno-cast-qual -infback.o_CWARNFLAGS_ADD = -Wno-cast-qual -inffast.o_CWARNFLAGS_ADD = -Wno-cast-qual -inflate.o_CWARNFLAGS_ADD = -Wno-cast-qual -trees.o_CWARNFLAGS_ADD = 
-Wno-cast-qual -uncompr.o_CWARNFLAGS_ADD = -Wno-cast-qual - -# warnings in bison-generated code -OSUnserializeXML.cpo_CXXWARNFLAGS_ADD = -Wno-uninitialized -OSUnserializeXML.cpo_CXXWARNFLAGS_ADD += -Wno-unreachable-code -OSUnserialize.cpo_CXXWARNFLAGS_ADD += -Wno-unreachable-code - # Rebuild if per-file overrides change ${OBJS}: $(firstword $(MAKEFILE_LIST)) @@ -110,7 +92,7 @@ $(COMPONENT).filelist: $(OBJS) $(SEG_HACK) -n __HIB -o $${hib_file}__ $${hib_file} || exit 1; \ mv $${hib_file}__ $${hib_file} || exit 1; \ done - @echo LDFILELIST $(COMPONENT) + @echo "$(ColorL)LDFILELIST$(Color0) $(ColorLF)$(COMPONENT)$(Color0)" $(_v)for obj in ${OBJS}; do \ echo $(TARGET)/$(CURRENT_KERNEL_CONFIG)/$${obj}; \ done > $(COMPONENT).filelist diff --git a/libkern/conf/files b/libkern/conf/files index c91df14cb..6f8e2f998 100644 --- a/libkern/conf/files +++ b/libkern/conf/files @@ -51,6 +51,10 @@ libkern/stdio/scanf.c standard libkern/uuid/uuid.c standard +libkern/os/log.c standard +libkern/os/object.c standard +libkern/os/internal.c standard + libkern/kernel_mach_header.c standard libkern/zlib/adler32.c optional zlib @@ -67,7 +71,7 @@ libkern/zlib/uncompr.c optional zlib libkern/zlib/zutil.c optional zlib libkern/crypto/register_crypto.c optional crypto -libkern/crypto/corecrypto_sha2.c optional crypto_sha2 +libkern/crypto/corecrypto_sha2.c standard libkern/crypto/corecrypto_sha1.c optional crypto libkern/crypto/corecrypto_md5.c optional crypto libkern/crypto/corecrypto_des.c optional crypto @@ -87,6 +91,7 @@ libkern/kxld/kxld_object.c optional config_kxld libkern/kxld/kxld_sect.c optional config_kxld libkern/kxld/kxld_seg.c optional config_kxld libkern/kxld/kxld_srcversion.c optional config_kxld +libkern/kxld/kxld_splitinfolc.c optional config_kxld libkern/kxld/kxld_sym.c optional config_kxld libkern/kxld/kxld_symtab.c optional config_kxld libkern/kxld/kxld_util.c optional config_kxld diff --git a/libkern/crypto/corecrypto_des.c b/libkern/crypto/corecrypto_des.c index 
26f5ab50e..888ed87a4 100644 --- a/libkern/crypto/corecrypto_des.c +++ b/libkern/crypto/corecrypto_des.c @@ -88,121 +88,7 @@ void des3_ecb_encrypt(des_cblock *in, des_cblock *out, des3_ecb_key_schedule *ks ecb->ecb(ctx, 1, in, out); } -/* Single DES CBC - used by nfs_gss */ -int des_cbc_key_sched(des_cblock *key, des_cbc_key_schedule *ks) -{ - const struct ccmode_cbc *enc = g_crypto_funcs->ccdes_cbc_encrypt; - const struct ccmode_cbc *dec = g_crypto_funcs->ccdes_cbc_decrypt; - - /* Make sure the context size for the mode fits in the one we have */ - if((enc->size>sizeof(ks->enc)) || (dec->size>sizeof(ks->dec))) - panic("%s: inconsistent size for DES-CBC context", __FUNCTION__); - - - cccbc_init(enc, ks->enc, CCDES_KEY_SIZE, key); - cccbc_init(dec, ks->dec, CCDES_KEY_SIZE, key); - - /* The old DES interface could return -1 or -2 for weak keys and wrong parity, - but this was disabled all the time, so we never fail here */ - return 0; -} - -/* this is normally only called with length an 8 bytes multiple */ -void -des_cbc_encrypt(des_cblock *in, des_cblock *out, int32_t length, - des_cbc_key_schedule *ks, des_cblock *iv, des_cblock *retiv, int encrypt) -{ - const struct ccmode_cbc *cbc = encrypt?g_crypto_funcs->ccdes_cbc_encrypt:g_crypto_funcs->ccdes_cbc_decrypt; - cccbc_ctx *ctx = encrypt ? 
ks->enc : ks->dec; - int nblocks; - cccbc_iv_decl(cbc->block_size, ctx_iv); - - assert(length%8==0); - nblocks=length/8; - - /* set the iv */ - cccbc_set_iv(cbc, ctx_iv, iv); - - cccbc_update(cbc, ctx, ctx_iv, nblocks, in, out); - - /* copy back iv */ - if(retiv) - memcpy(retiv, ctx_iv, 8); -} - -/* Triple DES CBC - used by nfs_gss */ -int des3_cbc_key_sched(des_cblock *key, des3_cbc_key_schedule *ks) -{ - const struct ccmode_cbc *enc = g_crypto_funcs->cctdes_cbc_encrypt; - const struct ccmode_cbc *dec = g_crypto_funcs->cctdes_cbc_decrypt; - - /* Make sure the context size for the mode fits in the one we have */ - if((enc->size>sizeof(ks->enc)) || (dec->size>sizeof(ks->dec))) - panic("%s: inconsistent size for 3DES-CBC context", __FUNCTION__); - - cccbc_init(enc, ks->enc, CCDES_KEY_SIZE*3, key); - cccbc_init(dec, ks->dec, CCDES_KEY_SIZE*3, key); - - /* The old DES interface could return -1 or -2 for weak keys and wrong parity, - but this was disabled all the time, so we never fail here */ - return 0; -} - -/* this is normally only called with length an 8 bytes multiple */ -void -des3_cbc_encrypt(des_cblock *in, des_cblock *out, int32_t length, - des3_cbc_key_schedule *ks, des_cblock *iv, des_cblock *retiv, int encrypt) -{ - const struct ccmode_cbc *cbc = encrypt?g_crypto_funcs->cctdes_cbc_encrypt:g_crypto_funcs->cctdes_cbc_decrypt; - cccbc_ctx *ctx = encrypt ? ks->enc : ks->dec; - int nblocks; - cccbc_iv_decl(cbc->block_size, ctx_iv); - - assert(length%8==0); - nblocks=length/8; - - /* set the iv */ - cccbc_set_iv(cbc, ctx_iv, iv); - - cccbc_update(cbc, ctx, ctx_iv, nblocks, in, out); - - /* copy back iv */ - if(retiv) - memcpy(retiv, ctx_iv, 8); -} - - -/* - * DES MAC implemented according to FIPS 113 - * http://www.itl.nist.gov/fipspubs/fip113.htm - * Only full blocks. 
- * Used by nfs-gss - */ -void -des_cbc_cksum(des_cblock *in, des_cblock *out, - int len, des_cbc_key_schedule *ks) -{ - const struct ccmode_cbc *cbc = g_crypto_funcs->ccdes_cbc_encrypt; - int nblocks; - des_cblock cksum; - cccbc_iv_decl(cbc->block_size, ctx_iv); - - assert(len%8==0); - nblocks=len/8; - - cccbc_set_iv(cbc, ctx_iv, NULL); - while(nblocks--) { - cccbc_update(cbc, ks->enc, ctx_iv, 1, in++, cksum); - } - memcpy(out, cksum, sizeof(des_cblock)); -} - - /* Raw key helper functions */ -void des_fixup_key_parity(des_cblock *key) -{ - g_crypto_funcs->ccdes_key_set_odd_parity_fn(key, CCDES_KEY_SIZE); -} int des_is_weak_key(des_cblock *key) { diff --git a/libkern/crypto/corecrypto_sha2.c b/libkern/crypto/corecrypto_sha2.c index 3143c0161..786a895b2 100644 --- a/libkern/crypto/corecrypto_sha2.c +++ b/libkern/crypto/corecrypto_sha2.c @@ -31,6 +31,8 @@ #include #include +#if defined(CRYPTO_SHA2) + void SHA256_Init(SHA256_CTX *ctx) { const struct ccdigest_info *di; @@ -115,3 +117,25 @@ void SHA512_Final(void *digest, SHA512_CTX *ctx) ccdigest_final(di, ctx->ctx, digest); } + +#else + +/* As these are part of the KPI, we need to stub them out for any kernel configuration that does not support SHA2. 
*/ + +void SHA384_Init(__unused SHA384_CTX *ctx) +{ + panic("SHA384_Init"); +} + +void SHA384_Update(__unused SHA384_CTX *ctx, __unused const void *data, __unused size_t len) +{ + panic("SHA384_Update"); +} + +void SHA384_Final(__unused void *digest, __unused SHA384_CTX *ctx) +{ + panic("SHA384_Final"); +} + +#endif + diff --git a/libkern/firehose/Makefile b/libkern/firehose/Makefile new file mode 100644 index 000000000..36a760d56 --- /dev/null +++ b/libkern/firehose/Makefile @@ -0,0 +1,40 @@ +export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd +export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def +export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule +export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir + +include $(MakeInc_cmd) +include $(MakeInc_def) + +LCLDIR = /usr/local/include + +KERNELFILES = + +DATAFILES = + +PRIVATE_KERNELFILES = + +EXPORTFILES = \ + firehose_types_private.h \ + ioctl_private.h \ + tracepoint_private.h + +PRIVATE_DATAFILES = ${EXPORTFILES} \ + private.h + +INSTALL_MI_DIR = firehose + +INSTALL_MI_LIST = ${DATAFILES} + +INSTALL_MI_LCL_LIST = ${PRIVATE_DATAFILES} + +INSTALL_KF_MI_LIST = ${KERNELFILES} + +INSTALL_KF_MI_LCL_LIST = ${KERNELFILES} ${PRIVATE_KERNELFILES} + +EXPORT_MI_DIR = firehose + +EXPORT_MI_LIST = ${KERNELFILES} ${PRIVATE_KERNELFILES} ${EXPORTFILES} + +include $(MakeInc_rule) +include $(MakeInc_dir) diff --git a/libkern/firehose/firehose_types_private.h b/libkern/firehose/firehose_types_private.h new file mode 100644 index 000000000..209046e27 --- /dev/null +++ b/libkern/firehose/firehose_types_private.h @@ -0,0 +1,232 @@ +/* + * Copyright (c) 2015-2016 Apple Inc. All rights reserved. + * + * @APPLE_APACHE_LICENSE_HEADER_START@ + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * @APPLE_APACHE_LICENSE_HEADER_END@ + */ + +#ifndef __FIREHOSE_TYPES_PRIVATE__ +#define __FIREHOSE_TYPES_PRIVATE__ + +#include +#include +#include + +OS_ASSUME_NONNULL_BEGIN + +__BEGIN_DECLS + +/*! + * @enum firehose_activity_flags_t + * + * @discussion + * The lower 8 bits are or-ed in the upper 8 bits of Activity ID and propagated + * to children activities + */ +OS_ENUM(firehose_activity_flags, unsigned long, + firehose_activity_flags_default = 0x0000, + + firehose_activity_flags_info_mode = 0x0001, + firehose_activity_flags_debug_mode = 0x0002, + firehose_activity_flags_stream_live_mode = 0x0004, + + firehose_activity_flags_precise_timestamp = 0x0080, +); + +/*! + * @typedef firehose_activity_id_t + * + * @abstract + * Opaque activity identifier. + * + * @discussion + * Scalar value type, not reference counted. + */ +typedef uint64_t firehose_activity_id_t; +#define FIREHOSE_ACTIVITY_ID_NULL ((firehose_activity_id_t)0) +#define FIREHOSE_ACTIVITY_ID_INVALID ((firehose_activity_id_t)~0ULL) +#define FIREHOSE_ACTIVITY_ID_FLAGS_SHIFT 56 +#define FIREHOSE_ACTIVITY_ID_FLAGS(aid) \ + ((firehose_activity_flags_t)((aid) >> FIREHOSE_ACTIVITY_ID_FLAGS_SHIFT)) +#define FIREHOSE_ACTIVITY_ID_MERGE_FLAGS(aid, flags) (\ + ((firehose_activity_id_t)(aid)) | \ + ((firehose_activity_id_t)(flags) << FIREHOSE_ACTIVITY_ID_FLAGS_SHIFT)) + +/*! 
+ * @enum firehose_stream_t + */ +OS_ENUM(firehose_stream, uint8_t, + firehose_stream_persist = 0, + firehose_stream_special = 1, + firehose_stream_memory = 2, + firehose_stream_metadata = 3, + firehose_stream_memory_high_traffic = 4, + firehose_stream_memory_wifi = 5, + firehose_stream_memory_baseband = 6, + + _firehose_stream_max, +); + +/*! + * @enum firehose_tracepoint_namespace_t + * + * @abstract + * Namespaces of tracepoints. + */ +OS_ENUM(firehose_tracepoint_namespace, uint8_t, + firehose_tracepoint_namespace_activity = 0x02, + firehose_tracepoint_namespace_trace = 0x03, + firehose_tracepoint_namespace_log = 0x04, + firehose_tracepoint_namespace_metadata = 0x05, +); + +/*! + * @enum firehose_tracepoint_code_t + * + * @abstract + * Codes of tracepoints. + */ +OS_ENUM(firehose_tracepoint_code, uint32_t, + firehose_tracepoint_code_load = 0x01, + firehose_tracepoint_code_unload = 0x02, +); + +/*! + * @typedef firehose_tracepoint_type_t + * + * @abstract + * Type of tracepoints. + */ +typedef uint8_t firehose_tracepoint_type_t; + +/*! + * @typedef firehose_tracepoint_flags_t + * + * @abstract + * Flags for tracepoints. + */ +OS_ENUM(firehose_tracepoint_flags, uint16_t, + _firehose_tracepoint_flags_base_has_current_aid = 0x0001, + _firehose_tracepoint_flags_base_main_executable = 0x0002, + _firehose_tracepoint_flags_base_shared_cache = 0x0004, + _firehose_tracepoint_flags_base_caller_pc = 0x0008, + _firehose_tracepoint_flags_base_has_unique_pid = 0x0010, +); + +/*! + * @typedef firehose_tracepoint_id_t + * + * @abstract + * Opaque tracepoint identifier. + */ +typedef uint64_t firehose_tracepoint_id_t; + +/*! + * @enum _firehose_tracepoint_type_activity_t + * + * @abstract + * Types of Activity tracepoints (namespace activity). 
+ */ +OS_ENUM(_firehose_tracepoint_type_activity, firehose_tracepoint_type_t, + _firehose_tracepoint_type_activity_create = 0x01, + _firehose_tracepoint_type_activity_swap = 0x02, + _firehose_tracepoint_type_activity_useraction = 0x03, +); + +/*! + * @enum firehose_tracepoint_flags_activity_t + * + * @abstract + * Flags for Activity tracepoints (namespace activity). + */ +OS_ENUM(_firehose_tracepoint_flags_activity, uint16_t, + _firehose_tracepoint_flags_activity_user_interface = 0x0100, + _firehose_tracepoint_flags_activity_has_other_aid = 0x0200, +); + +/*! + * @enum firehose_tracepoint_type_trace_t + * + * @abstract + * Types of trace tracepoints (namespace trace). + */ +OS_ENUM(_firehose_tracepoint_type_trace, firehose_tracepoint_type_t, + _firehose_tracepoint_type_trace_default = 0x00, + _firehose_tracepoint_type_trace_info = 0x01, + _firehose_tracepoint_type_trace_debug = 0x02, + _firehose_tracepoint_type_trace_error = 0x10, + _firehose_tracepoint_type_trace_fault = 0x11, +); + +/*! + * @enum firehose_tracepoint_type_log_t + * + * @abstract + * Types of Log tracepoints (namespace log). + */ +OS_ENUM(_firehose_tracepoint_type_log, firehose_tracepoint_type_t, + _firehose_tracepoint_type_log_default = 0x00, + _firehose_tracepoint_type_log_info = 0x01, + _firehose_tracepoint_type_log_debug = 0x02, + _firehose_tracepoint_type_log_error = 0x10, + _firehose_tracepoint_type_log_fault = 0x11, +); + +/*! + * @enum firehose_tracepoint_flags_log_t + * + * @abstract + * Flags for Log tracepoints (namespace log). + */ +OS_ENUM(_firehose_tracepoint_flags_log, uint16_t, + _firehose_tracepoint_flags_log_has_private_data = 0x0100, + _firehose_tracepoint_flags_log_has_subsystem = 0x0200, + _firehose_tracepoint_flags_log_has_rules = 0x0400, + _firehose_tracepoint_flags_log_has_oversize = 0x0800, +); + +/*! + * @enum _firehose_tracepoint_type_metadata_t + * + * @abstract + * Types for metadata tracepoints (namespace metadata). 
+ */ +OS_ENUM(_firehose_tracepoint_type_metadata, firehose_tracepoint_type_t, + _firehose_tracepoint_type_metadata_dyld = 0x01, + _firehose_tracepoint_type_metadata_subsystem = 0x02, + _firehose_tracepoint_type_metadata_kext = 0x03, +); + +/* MIG firehose push reply structure */ +typedef struct firehose_push_reply_s { + uint64_t fpr_mem_flushed_pos; + uint64_t fpr_io_flushed_pos; +} firehose_push_reply_t; + +typedef struct firehose_buffer_map_info_s { + mach_vm_address_t fbmi_addr; + mach_vm_size_t fbmi_size; +} firehose_buffer_map_info_t; + +#define FIREHOSE_PUSH_REPLY_CORRUPTED ((firehose_push_reply_t){ ~0ULL, ~0ULL }) + +typedef union firehose_buffer_u *firehose_buffer_t; + +__END_DECLS + +OS_ASSUME_NONNULL_END + +#endif // __FIREHOSE_TYPES_PRIVATE__ diff --git a/libkern/firehose/ioctl_private.h b/libkern/firehose/ioctl_private.h new file mode 100644 index 000000000..efb828bd8 --- /dev/null +++ b/libkern/firehose/ioctl_private.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2015-2016 Apple Inc. All rights reserved. + * + * @APPLE_APACHE_LICENSE_HEADER_START@ + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * @APPLE_APACHE_LICENSE_HEADER_END@ + */ + +#ifndef __FIREHOSE_IOCTL_PRIVATE__ +#define __FIREHOSE_IOCTL_PRIVATE__ + +#include +#include "firehose_types_private.h" + +// Ioctls implemented by the oslog dev node + +/* Flushed the log data. 
Return the updated pointers */ +#ifndef LOGFLUSHED +#define LOGFLUSHED _IOW('t', 81, firehose_push_reply_t) +#endif + +/* Map the kernel log buffers to logd's address space */ +#ifndef LOGREGISTER +#define LOGREGISTER _IOR('t', 80, int) +#endif + +/* Map the kernel log buffers to logd's address space */ +#ifndef LOGBUFFERMAP +#define LOGBUFFERMAP _IOR('t', 79, firehose_buffer_map_info_t) +#endif + +#endif // __FIREHOSE_IOCTL_PRIVATE__ diff --git a/libkern/firehose/private.h b/libkern/firehose/private.h new file mode 100644 index 000000000..943fceb35 --- /dev/null +++ b/libkern/firehose/private.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2013-2016 Apple Inc. All rights reserved. + * + * @APPLE_APACHE_LICENSE_HEADER_START@ + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * @APPLE_APACHE_LICENSE_HEADER_END@ + */ + +#ifndef __FIREHOSE_FIREHOSE_PRIVATE__ +#define __FIREHOSE_FIREHOSE_PRIVATE__ + +#define FIREHOSE_SPI_VERSION 20160602 + +#include "firehose_types_private.h" +#include "tracepoint_private.h" +#include "ioctl_private.h" + +#endif // __FIREHOSE_FIREHOSE_PRIVATE__ diff --git a/libkern/firehose/tracepoint_private.h b/libkern/firehose/tracepoint_private.h new file mode 100644 index 000000000..e0ccc4f7e --- /dev/null +++ b/libkern/firehose/tracepoint_private.h @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2013-2016 Apple Inc. All rights reserved. 
+ * + * @APPLE_APACHE_LICENSE_HEADER_START@ + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * @APPLE_APACHE_LICENSE_HEADER_END@ + */ + +#ifndef __FIREHOSE_ACTIVITY__ +#define __FIREHOSE_ACTIVITY__ + +#include +#include +#include +#if KERNEL +#include +#endif +#include "firehose_types_private.h" + +OS_ASSUME_NONNULL_BEGIN + +/*! + * @typedef firehose_tracepoint_id_u + * + * @abstract + * Broken down tracepoint identifier. + */ +typedef union { + struct { + firehose_tracepoint_namespace_t _namespace; + firehose_tracepoint_type_t _type; + firehose_tracepoint_flags_t _flags; + uint32_t _code; + } ftid; + firehose_tracepoint_id_t ftid_value; +} firehose_tracepoint_id_u; + +#define FIREHOSE_STAMP_SLOP (1ULL << 36) // ~1minute + +/*! + * @typedef firehose_trace_uuid_info_t + * + * @abstract + * Info needed by logd when kexts are loaded or unloaded + * + */ +typedef struct firehose_trace_uuid_info_s { + uuid_t ftui_uuid; /* uuid of binary */ + uint64_t ftui_address; /* load address */ + uint64_t ftui_size; /* load size */ + char ftui_path[]; /* full path of binary - Unused in the kernel*/ +} *firehose_trace_uuid_info_t; + +/*! 
+ * @typedef firehose_tracepoint_t + */ +typedef struct firehose_tracepoint_s { + firehose_tracepoint_id_u ft_id; + uint64_t ft_thread; + union { + struct { + uint64_t ft_timestamp_delta : 48; + uint64_t ft_length : 16; + }; + uint64_t ft_stamp_and_length; + }; + uint8_t ft_data[]; +} *firehose_tracepoint_t; + +#define FIREHOSE_TRACE_ID_MAKE(ns, type, flags, code) \ + (((firehose_tracepoint_id_u){ .ftid = { \ + ._namespace = ns, \ + ._type = type, \ + ._flags = flags, \ + ._code = code, \ + } }).ftid_value) + +#define FIREHOSE_TRACE_ID_SET_NS(tid, ns) \ + ((tid).ftid._namespace = firehose_tracepoint_namespace_##ns) + +#define FIREHOSE_TRACE_ID_SET_TYPE(tid, ns, type) \ + ((tid).ftid._type = _firehose_tracepoint_type_##ns##_##type) + +#define FIREHOSE_TRACE_ID_HAS_FLAG(tid, ns, flag) \ + ((tid).ftid._flags & _firehose_tracepoint_flags_##ns##_##flag) +#define FIREHOSE_TRACE_ID_SET_FLAG(tid, ns, flag) \ + ((void)((tid).ftid._flags |= _firehose_tracepoint_flags_##ns##_##flag)) +#define FIREHOSE_TRACE_ID_CLEAR_FLAG(tid, ns, flag) \ + ((void)((tid).ftid._flags &= ~_firehose_tracepoint_flags_##ns##_##flag)) + +#define FIREHOSE_TRACE_ID_SET_CODE(tid, code) \ + ((tid).ftid._code = code) + +__BEGIN_DECLS + +#if __has_feature(address_sanitizer) +__attribute__((no_sanitize("address"))) +#endif +__OSX_AVAILABLE(10.12) __IOS_AVAILABLE(10.0) +__TVOS_AVAILABLE(10.0) __WATCHOS_AVAILABLE(3.0) +OS_ALWAYS_INLINE +static inline bool +firehose_precise_timestamps_enabled(void) +{ +#if KERNEL + return (atm_get_diagnostic_config() & 0x80) == 0; +#else + return (*((volatile uint32_t *)_COMM_PAGE_ATM_DIAGNOSTIC_CONFIG) & 0x80) == 0; +#endif +} + +#if __has_feature(address_sanitizer) +__attribute__((no_sanitize("address"))) +#endif +__OSX_AVAILABLE(10.12) __IOS_AVAILABLE(10.0) +__TVOS_AVAILABLE(10.0) __WATCHOS_AVAILABLE(3.0) +OS_ALWAYS_INLINE +static inline uint64_t +firehose_tracepoint_time(firehose_activity_flags_t flags) +{ + if (firehose_precise_timestamps_enabled() || + (flags & 
firehose_activity_flags_precise_timestamp)) { + return mach_continuous_time(); + } else { + return mach_continuous_approximate_time(); + } +} + +#ifdef KERNEL +__OSX_AVAILABLE(10.12) __IOS_AVAILABLE(10.0) +__TVOS_AVAILABLE(10.0) __WATCHOS_AVAILABLE(3.0) +void +firehose_trace_metadata(firehose_stream_t stream, firehose_tracepoint_id_u ftid, + uint64_t stamp, const void* pubdata, size_t publen); +#endif +__END_DECLS + +OS_ASSUME_NONNULL_END + +#endif // __FIREHOSE_ACTIVITY__ diff --git a/libkern/kernel_mach_header.c b/libkern/kernel_mach_header.c index ebef5b5ab..579402474 100644 --- a/libkern/kernel_mach_header.c +++ b/libkern/kernel_mach_header.c @@ -138,6 +138,29 @@ getsectdatafromheader( return result; } +/* + * This routine returns the offset for the named section in the + * named segment if it exists in the mach header passed to it. Otherwise + * it returns zero. + * + * This routine can operate against any kernel mach header. + */ +uint32_t +getsectoffsetfromheader( + kernel_mach_header_t *mhp, + const char *segname, + const char *sectname) +{ + const kernel_section_t *sp; + + sp = getsectbynamefromheader(mhp, segname, sectname); + if(sp == (kernel_section_t *)0){ + return(0); + } + + return sp->offset; +} + /* * This routine returns the a pointer to the data for the named segment * if it exist in the mach header passed to it.
Also it returns diff --git a/libkern/kxld/Makefile b/libkern/kxld/Makefile index cc20497a1..81160694c 100644 --- a/libkern/kxld/Makefile +++ b/libkern/kxld/Makefile @@ -90,10 +90,10 @@ STRIP = xcrun -sdk $(SDK_DIR) strip DSYMUTIL = xcrun -sdk $(SDK_DIR) dsymutil # Files -HDR_NAMES=kxld.h kxld_types.h +HDR_NAMES=kxld.h kxld_types.h prelink.h OBJ_NAMES=kxld.o kxld_array.o kxld_copyright.o kxld_demangle.o kxld_dict.o \ kxld_kext.o kxld_object.o kxld_reloc.o kxld_sect.o kxld_seg.o \ - kxld_srcversion.o kxld_sym.o kxld_symtab.o kxld_util.o kxld_uuid.o \ + kxld_srcversion.o kxld_splitinfolc.o kxld_sym.o kxld_symtab.o kxld_util.o kxld_uuid.o \ kxld_vtable.o kxld_versionmin.o HDRS=$(addprefix $(HDRSRC)/, $(HDR_NAMES)) @@ -161,8 +161,7 @@ $(LIBKXLDDST_ARCHIVE): $(LIBKXLDSYM_ARCHIVE) @mkdir -p $(ARCHIVEDST) install -o 0 -g 0 -c -m 555 $< $@ - -KEXTCOPYOBJS=$(OBJROOT)/kextcopyright.o $(OBJROOT)/kxld_copyright.o $(OBJROOT)/kxld_util.o +KEXTCOPYOBJS=$(OBJROOT)/kextcopyright.o $(OBJROOT)/kxld_copyright.o $(OBJROOT)/kxld_util.o kextcopyright: $(TESTDST)/kextcopyright $(TESTDST)/kextcopyright: $(KEXTCOPYOBJS) @mkdir -p $(TESTDST) @@ -198,7 +197,7 @@ analyze: @$(CLANG_ANALYZER) $(CFLAGS) $(INCLUDES) -I$(SRCROOT) tests/*.c @rm -f *.plist -clean: +clean: @rm -rf $(OBJROOT)/* fullclean: @@ -211,4 +210,3 @@ endif # Automatically build dependency information when .c or .h files change based # on implicit rule for .d:.c -include $(OBJS:.o=.d) - diff --git a/libkern/kxld/WKdmCompress.c b/libkern/kxld/WKdmCompress.c index 5109015c9..64a02a58a 100644 --- a/libkern/kxld/WKdmCompress.c +++ b/libkern/kxld/WKdmCompress.c @@ -15,11 +15,11 @@ WK_pack_2bits(WK_word* source_buf, WK_word* source_end, WK_word* dest_buf) { - register WK_word* src_next = source_buf; + WK_word* src_next = source_buf; WK_word* dest_next = dest_buf; while (src_next < source_end) { - register WK_word temp = src_next[0]; + WK_word temp = src_next[0]; temp |= (src_next[1] << 2); temp |= (src_next[2] << 4); temp |= 
(src_next[3] << 6); @@ -43,12 +43,12 @@ static WK_word* WK_pack_4bits(WK_word* source_buf, WK_word* source_end, WK_word* dest_buf) { - register WK_word* src_next = source_buf; + WK_word* src_next = source_buf; WK_word* dest_next = dest_buf; /* this loop should probably be unrolled */ while (src_next < source_end) { - register WK_word temp = src_next[0]; + WK_word temp = src_next[0]; temp |= (src_next[1] << 4); dest_next[0] = temp; @@ -69,12 +69,12 @@ WK_pack_3_tenbits(WK_word* source_buf, WK_word* source_end, WK_word* dest_buf) { - register WK_word* src_next = source_buf; + WK_word* src_next = source_buf; WK_word* dest_next = dest_buf; /* this loop should probably be unrolled */ while (src_next < source_end) { - register WK_word temp = src_next[0]; + WK_word temp = src_next[0]; temp |= (src_next[1] << 10); temp |= (src_next[2] << 20); diff --git a/libkern/kxld/WKdmDecompress.c b/libkern/kxld/WKdmDecompress.c index 8eaf78bd8..6e27b62f3 100644 --- a/libkern/kxld/WKdmDecompress.c +++ b/libkern/kxld/WKdmDecompress.c @@ -24,16 +24,16 @@ WK_unpack_2bits(WK_word *input_buf, WK_word *input_end, WK_word *output_buf) { - register WK_word *input_next = input_buf; - register WK_word *output_next = output_buf; - register WK_word packing_mask = TWO_BITS_PACKING_MASK; + WK_word *input_next = input_buf; + WK_word *output_next = output_buf; + WK_word packing_mask = TWO_BITS_PACKING_MASK; /* loop to repeatedly grab one input word and unpack it into * 4 output words. This loop could be unrolled a little---it's * designed to be easy to do that. 
*/ while (input_next < input_end) { - register WK_word temp = input_next[0]; + WK_word temp = input_next[0]; DEBUG_PRINT_2("Unpacked tags word: %.8x\n", temp); output_next[0] = temp & packing_mask; output_next[1] = (temp >> 2) & packing_mask; @@ -59,9 +59,9 @@ WK_unpack_4bits(WK_word *input_buf, WK_word *input_end, WK_word *output_buf) { - register WK_word *input_next = input_buf; - register WK_word *output_next = output_buf; - register WK_word packing_mask = FOUR_BITS_PACKING_MASK; + WK_word *input_next = input_buf; + WK_word *output_next = output_buf; + WK_word packing_mask = FOUR_BITS_PACKING_MASK; /* loop to repeatedly grab one input word and unpack it into @@ -69,7 +69,7 @@ WK_unpack_4bits(WK_word *input_buf, * a little---it's designed to be easy to do that. */ while (input_next < input_end) { - register WK_word temp = input_next[0]; + WK_word temp = input_next[0]; DEBUG_PRINT_2("Unpacked dictionary indices word: %.8x\n", temp); output_next[0] = temp & packing_mask; output_next[1] = (temp >> 4) & packing_mask; @@ -90,16 +90,16 @@ WK_unpack_3_tenbits(WK_word *input_buf, WK_word *input_end, WK_word *output_buf) { - register WK_word *input_next = input_buf; - register WK_word *output_next = output_buf; - register WK_word packing_mask = LOW_BITS_MASK; + WK_word *input_next = input_buf; + WK_word *output_next = output_buf; + WK_word packing_mask = LOW_BITS_MASK; /* loop to fetch words of input, splitting each into three * words of output with 10 meaningful low bits. 
This loop * probably ought to be unrolled and maybe coiled */ while (input_next < input_end) { - register WK_word temp = input_next[0]; + WK_word temp = input_next[0]; output_next[0] = temp & packing_mask; output_next[1] = (temp >> 10) & packing_mask; @@ -203,7 +203,7 @@ WKdm_decompress (WK_word* src_buf, #endif { - register char *next_tag = (char *) tempTagsArray; + char *next_tag = (char *) tempTagsArray; char *tags_area_end = ((char *) tempTagsArray) + PAGE_SIZE_IN_WORDS; char *next_q_pos = (char *) tempQPosArray; diff --git a/libkern/kxld/kxld.c b/libkern/kxld/kxld.c index 728774c37..a98a897f2 100644 --- a/libkern/kxld/kxld.c +++ b/libkern/kxld/kxld.c @@ -41,7 +41,7 @@ #if !KERNEL #include "kxld.h" #include "kxld_types.h" -#else +#else #include #include #endif /* KERNEL */ @@ -72,6 +72,15 @@ struct kxld_context { cpu_subtype_t cpusubtype; }; +// set to TRUE if the kext has a vmaddr_TEXT_EXEC != 0 +boolean_t isSplitKext = FALSE; + +// set to TRUE if we come in via kxld_link_file +boolean_t isOldInterface = FALSE; +uint32_t kaslr_offsets_count = 0; +uint32_t *kaslr_offsets = NULL; +uint32_t kaslr_offsets_index = 0; + /******************************************************************************* * Globals *******************************************************************************/ @@ -96,13 +105,14 @@ static KXLDDict *s_order_dict; *******************************************************************************/ static kern_return_t init_context(KXLDContext *context, u_int ndependencies); -static kern_return_t init_kext_objects(KXLDContext *context, u_char *file, - u_long size, const char *name, KXLDDependency *dependencies, - u_int ndependencies); -static KXLDObject * get_object_for_file(KXLDContext *context, +static KXLDObject * get_object_for_file(KXLDContext *context, u_char *file, u_long size, const char *name); +static kern_return_t allocate_split_kext(KXLDContext *context, splitKextLinkInfo * link_info); static u_char * allocate_kext(KXLDContext
*context, void *callback_data, - kxld_addr_t *vmaddr, u_long *vmsize, u_char **linked_object_alloc_out); + kxld_addr_t *vmaddr, u_long *vmsize, u_char **linked_object_alloc_out); +static kern_return_t init_kext_objects(KXLDContext *context, u_char *file, + u_long size, const char *name, KXLDDependency *dependencies, + u_int ndependencies); static void clear_context(KXLDContext *context); /******************************************************************************* @@ -121,7 +131,9 @@ kxld_create_context(KXLDContext **_context, #endif check(_context); - check(allocate_callback); + if (isOldInterface) { + check(allocate_callback); + } check(logging_callback); *_context = NULL; @@ -245,74 +257,185 @@ kxld_destroy_context(KXLDContext *context) } /******************************************************************************* -*******************************************************************************/ + *******************************************************************************/ kern_return_t -kxld_link_file( +kxld_link_split_file( KXLDContext * context, - u_char * file, - u_long size, + splitKextLinkInfo *link_info, const char * name, void * callback_data, KXLDDependency * dependencies, u_int ndependencies, - u_char ** linked_object_out, kxld_addr_t * kmod_info_kern) +{ + kern_return_t rval = KERN_FAILURE; + KXLDObject * kext_object = NULL; + splitKextLinkInfo * my_link_info = NULL; + + isSplitKext = (link_info->vmaddr_TEXT_EXEC != 0); + isOldInterface = FALSE; + + kxld_set_logging_callback_data(name, callback_data); + + kxld_log(kKxldLogLinking, kKxldLogBasic, "Linking kext %s", name); + + kaslr_offsets_count = 0; + kaslr_offsets_index = 0; + kaslr_offsets = NULL; + + require_action(context, finish, rval=KERN_INVALID_ARGUMENT); + require_action(link_info, finish, rval=KERN_INVALID_ARGUMENT); + require_action(dependencies, finish, rval=KERN_INVALID_ARGUMENT); + require_action(ndependencies, finish, rval=KERN_INVALID_ARGUMENT); + 
require_action(kmod_info_kern, finish, rval=KERN_INVALID_ARGUMENT); + + rval = init_context(context, ndependencies); + require_noerr(rval, finish); + + rval = init_kext_objects(context, + link_info->kextExecutable, + link_info->kextSize, + name, + dependencies, ndependencies); + require_noerr(rval, finish); + + kext_object = get_object_for_file(context, + link_info->kextExecutable, + link_info->kextSize, + name); + require_action(kext_object, finish, rval=KERN_FAILURE); + + // copy vmaddrs and fileoffsets for split segments into kext_object + kxld_object_set_link_info(kext_object, link_info); + + my_link_info = kxld_object_get_link_info(kext_object); + + rval = allocate_split_kext(context, my_link_info); + require_noerr(rval, finish); + +#if SPLIT_KEXTS_DEBUG + kxld_log(kKxldLogLinking, kKxldLogErr, "Linking kext %s", name); + kxld_show_split_info(link_info); +#endif // SPLIT_KEXTS_DEBUG + + rval = kxld_kext_relocate(context->kext, + (kxld_addr_t)my_link_info, + &context->vtables_by_name, + &context->defined_symbols_by_name, + &context->obsolete_symbols_by_name, + &context->defined_cxx_symbols_by_value); + require_noerr(rval, finish); + + rval = kxld_kext_export_linked_object(context->kext, + (void *) my_link_info, + kmod_info_kern); + require_noerr(rval, finish); + + // pass back info about linked kext + link_info->kaslr_offsets_count = kaslr_offsets_count; + link_info->kaslr_offsets = kaslr_offsets; + link_info->linkedKext = my_link_info->linkedKext; + link_info->linkedKextSize = my_link_info->linkedKextSize; + + if (kaslr_offsets_count != kaslr_offsets_index) { + kxld_log(kKxldLogLinking, kKxldLogErr, "[ERROR] %s: KASLR pointers: count=%d, but only populated %d!", name, kaslr_offsets_count, kaslr_offsets_index); + rval = KERN_FAILURE; + goto finish; + } + + // the values are now the responsibility of the caller + kaslr_offsets_count = 0; + kaslr_offsets_index = 0; + kaslr_offsets = NULL; + + rval = KERN_SUCCESS; +finish: + clear_context(context); + 
kxld_set_logging_callback_data(NULL, NULL); + + return rval; +} + +/******************************************************************************* + *******************************************************************************/ +kern_return_t +kxld_link_file( + KXLDContext * context, + u_char * file, + u_long size, + const char * name, + void * callback_data, + KXLDDependency * dependencies, + u_int ndependencies, + u_char ** linked_object_out, + kxld_addr_t * kmod_info_kern) { kern_return_t rval = KERN_FAILURE; kxld_addr_t vmaddr = 0; u_long vmsize = 0; u_char * linked_object = NULL; u_char * linked_object_alloc = NULL; + + kaslr_offsets_count = 0; + kaslr_offsets_index = 0; + kaslr_offsets = NULL; kxld_set_logging_callback_data(name, callback_data); - + kxld_log(kKxldLogLinking, kKxldLogBasic, "Linking kext %s", name); - + require_action(context, finish, rval=KERN_INVALID_ARGUMENT); - require_action(file, finish, rval=KERN_INVALID_ARGUMENT); - require_action(size, finish, rval=KERN_INVALID_ARGUMENT); require_action(dependencies, finish, rval=KERN_INVALID_ARGUMENT); require_action(ndependencies, finish, rval=KERN_INVALID_ARGUMENT); + require_action(file, finish, rval=KERN_INVALID_ARGUMENT); + require_action(size, finish, rval=KERN_INVALID_ARGUMENT); require_action(linked_object_out, finish, rval=KERN_INVALID_ARGUMENT); require_action(kmod_info_kern, finish, rval=KERN_INVALID_ARGUMENT); + + isSplitKext = FALSE; + isOldInterface = TRUE; rval = init_context(context, ndependencies); require_noerr(rval, finish); - - rval = init_kext_objects(context, file, size, name, - dependencies, ndependencies); + + rval = init_kext_objects(context, file, size, name, + dependencies, ndependencies); require_noerr(rval, finish); - - linked_object = allocate_kext(context, callback_data, - &vmaddr, &vmsize, &linked_object_alloc); + + linked_object = allocate_kext(context, callback_data, + &vmaddr, &vmsize, &linked_object_alloc); require_action(linked_object, finish, 
rval=KERN_RESOURCE_SHORTAGE); - - rval = kxld_kext_relocate(context->kext, vmaddr, - &context->vtables_by_name, - &context->defined_symbols_by_name, - &context->obsolete_symbols_by_name, - &context->defined_cxx_symbols_by_value); + + + rval = kxld_kext_relocate(context->kext, + vmaddr, + &context->vtables_by_name, + &context->defined_symbols_by_name, + &context->obsolete_symbols_by_name, + &context->defined_cxx_symbols_by_value); require_noerr(rval, finish); - - rval = kxld_kext_export_linked_object(context->kext, - linked_object, kmod_info_kern); + + rval = kxld_kext_export_linked_object(context->kext, + (void *) linked_object, + kmod_info_kern); require_noerr(rval, finish); - *linked_object_out = linked_object; + linked_object_alloc = NULL; - + rval = KERN_SUCCESS; finish: if (linked_object_alloc) { kxld_page_free_untracked(linked_object_alloc, vmsize); } - + clear_context(context); kxld_set_logging_callback_data(NULL, NULL); - + return rval; } + /******************************************************************************* *******************************************************************************/ static kern_return_t @@ -352,17 +475,21 @@ init_context(KXLDContext *context, u_int ndependencies) } /******************************************************************************* -*******************************************************************************/ -static kern_return_t -init_kext_objects(KXLDContext *context, u_char *file, u_long size, - const char *name, KXLDDependency *dependencies, u_int ndependencies) + *******************************************************************************/ +static kern_return_t +init_kext_objects(KXLDContext *context, + u_char *file, + u_long size, + const char *name, + KXLDDependency *dependencies, + u_int ndependencies) { kern_return_t rval = KERN_FAILURE; KXLDKext *kext = NULL; KXLDObject *kext_object = NULL; KXLDObject *interface_object = NULL; u_int i = 0; - + /* Create a kext object for each dependency. 
If it's a direct dependency, * export its symbols by name by value. If it's indirect, just export the * C++ symbols by value. @@ -371,60 +498,60 @@ init_kext_objects(KXLDContext *context, u_char *file, u_long size, kext = kxld_array_get_item(&context->dependencies, i); kext_object = NULL; interface_object = NULL; - + kext_object = get_object_for_file(context, dependencies[i].kext, - dependencies[i].kext_size, dependencies[i].kext_name); + dependencies[i].kext_size, dependencies[i].kext_name); require_action(kext_object, finish, rval=KERN_FAILURE); - + if (dependencies[i].interface) { - interface_object = get_object_for_file(context, - dependencies[i].interface, dependencies[i].interface_size, - dependencies[i].interface_name); + interface_object = get_object_for_file(context, + dependencies[i].interface, dependencies[i].interface_size, + dependencies[i].interface_name); require_action(interface_object, finish, rval=KERN_FAILURE); } - + rval = kxld_kext_init(kext, kext_object, interface_object); require_noerr(rval, finish); - + if (dependencies[i].is_direct_dependency) { rval = kxld_kext_export_symbols(kext, - &context->defined_symbols_by_name, - &context->obsolete_symbols_by_name, - &context->defined_cxx_symbols_by_value); + &context->defined_symbols_by_name, + &context->obsolete_symbols_by_name, + &context->defined_cxx_symbols_by_value); require_noerr(rval, finish); } else { - rval = kxld_kext_export_symbols(kext, - /* defined_symbols */ NULL, /* obsolete_symbols */ NULL, - &context->defined_cxx_symbols_by_value); + rval = kxld_kext_export_symbols(kext, + /* defined_symbols */ NULL, /* obsolete_symbols */ NULL, + &context->defined_cxx_symbols_by_value); require_noerr(rval, finish); } } - + /* Export the vtables for all of the dependencies. 
*/ for (i = 0; i < context->dependencies.nitems; ++i) { kext = kxld_array_get_item(&context->dependencies, i); - + rval = kxld_kext_export_vtables(kext, - &context->defined_cxx_symbols_by_value, - &context->defined_symbols_by_name, - &context->vtables_by_name); + &context->defined_cxx_symbols_by_value, + &context->defined_symbols_by_name, + &context->vtables_by_name); require_noerr(rval, finish); } - + /* Create a kext object for the kext we're linking and export its locally - * defined C++ symbols. + * defined C++ symbols. */ kext_object = get_object_for_file(context, file, size, name); require_action(kext_object, finish, rval=KERN_FAILURE); - + rval = kxld_kext_init(context->kext, kext_object, /* interface */ NULL); require_noerr(rval, finish); - + rval = kxld_kext_export_symbols(context->kext, - /* defined_symbols */ NULL, /* obsolete_symbols */ NULL, - &context->defined_cxx_symbols_by_value); + /* defined_symbols */ NULL, /* obsolete_symbols */ NULL, + &context->defined_cxx_symbols_by_value); require_noerr(rval, finish); - + rval = KERN_SUCCESS; finish: return rval; @@ -462,38 +589,74 @@ get_object_for_file(KXLDContext *context, u_char *file, u_long size, finish: return rval; } - + +#include + /******************************************************************************* -*******************************************************************************/ + *******************************************************************************/ +static kern_return_t +allocate_split_kext(KXLDContext *context, splitKextLinkInfo * link_info) +{ + kern_return_t rval = KERN_FAILURE; + u_long vmsize = 0; + u_long header_size = 0; + u_char * linked_object = NULL; + + kxld_kext_get_vmsize(context->kext, &header_size, &vmsize); + + if (isSplitKext) { + /* get __LINKEDIT vmsize */ + kxld_kext_get_vmsize_for_seg_by_name(context->kext, SEG_LINKEDIT, &vmsize); + // add in the gaps + vmsize += (link_info->vmaddr_LINKEDIT - link_info->vmaddr_TEXT); + } + link_info->linkedKextSize = 
vmsize; + + linked_object = kxld_page_alloc_untracked(link_info->linkedKextSize); + require(linked_object, finish); + link_info->linkedKext = linked_object; + + bzero(linked_object, vmsize); + rval = KERN_SUCCESS; + +finish: + return rval; +} + +/******************************************************************************* + *******************************************************************************/ static u_char * -allocate_kext(KXLDContext *context, void *callback_data, - kxld_addr_t *vmaddr_out, u_long *vmsize_out, - u_char **linked_object_alloc_out) +allocate_kext(KXLDContext *context, + void *callback_data, + kxld_addr_t *vmaddr_out, + u_long *vmsize_out, + u_char **linked_object_alloc_out) { KXLDAllocateFlags flags = 0; kxld_addr_t vmaddr = 0; u_long vmsize = 0; u_long header_size = 0; u_char * linked_object = NULL; - + *linked_object_alloc_out = NULL; - + kxld_kext_get_vmsize(context->kext, &header_size, &vmsize); + vmaddr = context->allocate_callback(vmsize, &flags, callback_data); require_action(!(vmaddr & (kxld_get_effective_page_size()-1)), finish, - kxld_log(kKxldLogLinking, kKxldLogErr, - "Load address %p is not page-aligned.", - (void *) (uintptr_t) vmaddr)); - + kxld_log(kKxldLogLinking, kKxldLogErr, + "Load address %p is not page-aligned.", + (void *) (uintptr_t) vmaddr)); + if (flags & kKxldAllocateWritable) { linked_object = (u_char *) (u_long) vmaddr; } else { linked_object = kxld_page_alloc_untracked(vmsize); require(linked_object, finish); - + *linked_object_alloc_out = linked_object; } - + kxld_kext_set_linked_object_size(context->kext, vmsize); /* Zero out the memory before we fill it. 
We fill this buffer in a @@ -504,7 +667,7 @@ allocate_kext(KXLDContext *context, void *callback_data, bzero(linked_object, vmsize); *vmaddr_out = vmaddr; *vmsize_out = vmsize; - + finish: return linked_object; } @@ -539,4 +702,3 @@ clear_context(KXLDContext *context) kxld_dict_clear(&context->obsolete_symbols_by_name); kxld_dict_clear(&context->vtables_by_name); } - diff --git a/libkern/kxld/kxld_kext.c b/libkern/kxld/kxld_kext.c index 6b09346d2..06b57fe66 100644 --- a/libkern/kxld/kxld_kext.c +++ b/libkern/kxld/kxld_kext.c @@ -45,6 +45,7 @@ #include #include #include + #endif /* KERNEL */ #define DEBUG_ASSERT_COMPONENT_NAME_STRING "kxld" @@ -61,6 +62,8 @@ #include "kxld_util.h" #include "kxld_vtable.h" +extern boolean_t isSplitKext; + struct symtab_command; struct kxld_kext { @@ -96,9 +99,9 @@ static kern_return_t resolve_symbols(KXLDKext *kext, static kern_return_t patch_vtables(KXLDKext *kext, KXLDDict *patched_vtables, const KXLDDict *defined_symbols); -static const KXLDSym *get_metaclass_symbol_from_super_meta_class_pointer_symbol( - KXLDKext *kext, KXLDSym *super_metaclass_pointer_sym); static kern_return_t create_vtable_index(KXLDKext *kext); +static const KXLDSym *get_metaclass_symbol_from_super_meta_class_pointer_symbol( + KXLDKext *kext, KXLDSym *super_metaclass_pointer_sym); static kern_return_t validate_symbols(KXLDKext *kext); @@ -354,6 +357,16 @@ kxld_kext_export_vtables(KXLDKext *kext, const KXLDDict *defined_cxx_symbols, return rval; } +/******************************************************************************* + *******************************************************************************/ +void +kxld_kext_get_vmsize_for_seg_by_name(const KXLDKext *kext, + const char *segname, + u_long *vmsize) +{ + (void) kxld_object_get_vmsize_for_seg_by_name(kext->kext, segname, vmsize); +} + /******************************************************************************* *******************************************************************************/ 
void @@ -365,27 +378,28 @@ kxld_kext_get_vmsize(const KXLDKext *kext, /******************************************************************************* *******************************************************************************/ -void +void kxld_kext_set_linked_object_size(KXLDKext *kext, u_long vmsize) { (void) kxld_object_set_linked_object_size(kext->kext, vmsize); } - /******************************************************************************* *******************************************************************************/ kern_return_t kxld_kext_export_linked_object(const KXLDKext *kext, - u_char *linked_object, kxld_addr_t *kmod_info) + void *linked_object, + kxld_addr_t *kmod_info) { kern_return_t rval = KERN_FAILURE; const KXLDSym *kmodsym = NULL; kmodsym = kxld_symtab_get_locally_defined_symbol_by_name( kxld_object_get_symtab(kext->kext), KXLD_KMOD_INFO_SYMBOL); + require_action(kmodsym, finish, rval=KERN_FAILURE; kxld_log(kKxldLogLinking, kKxldLogErr, kKxldLogNoKmodInfo)); - + *kmod_info = kmodsym->link_addr; rval = kxld_object_export_linked_object(kext->kext, linked_object); @@ -396,9 +410,12 @@ kxld_kext_export_linked_object(const KXLDKext *kext, /******************************************************************************* *******************************************************************************/ kern_return_t -kxld_kext_relocate(KXLDKext *kext, kxld_addr_t link_address, - KXLDDict *patched_vtables, const KXLDDict *defined_symbols, - const KXLDDict *obsolete_symbols, const KXLDDict *defined_cxx_symbols) +kxld_kext_relocate(KXLDKext *kext, + kxld_addr_t link_address, + KXLDDict *patched_vtables, + const KXLDDict *defined_symbols, + const KXLDDict *obsolete_symbols, + const KXLDDict *defined_cxx_symbols) { kern_return_t rval = KERN_FAILURE; @@ -429,9 +446,11 @@ kxld_kext_relocate(KXLDKext *kext, kxld_addr_t link_address, rval = create_vtables(kext, defined_cxx_symbols, /* defined_symbols */ NULL); require_noerr(rval, finish); - rval = 
patch_vtables(kext, patched_vtables, defined_symbols); - require_noerr(rval, finish); - + if (isSplitKext == FALSE) { + rval = patch_vtables(kext, patched_vtables, defined_symbols); + require_noerr(rval, finish); + } + rval = validate_symbols(kext); require_noerr(rval, finish); @@ -440,7 +459,7 @@ kxld_kext_relocate(KXLDKext *kext, kxld_addr_t link_address, rval = KERN_SUCCESS; finish: - return rval; + return rval; } /******************************************************************************* @@ -840,8 +859,8 @@ patch_vtables(KXLDKext *kext, KXLDDict *patched_vtables, { /* Get the class name from the smc pointer */ rval = kxld_sym_get_class_name_from_super_metaclass_pointer( - super_metaclass_pointer, class_name, sizeof(class_name)); - require_noerr(rval, finish); + super_metaclass_pointer, class_name, sizeof(class_name)); + require_noerr(rval, finish); /* Get the vtable name from the class name */ rval = kxld_sym_get_vtable_name_from_class_name(class_name, @@ -859,7 +878,7 @@ patch_vtables(KXLDKext *kext, KXLDDict *patched_vtables, /* Find the SMCP's meta class symbol */ metaclass = get_metaclass_symbol_from_super_meta_class_pointer_symbol( kext, super_metaclass_pointer); - require_action(metaclass, finish, rval=KERN_FAILURE); + require_action(metaclass, finish, rval=KERN_FAILURE); /* Get the super class name from the super metaclass */ rval = kxld_sym_get_class_name_from_metaclass(metaclass, @@ -931,7 +950,7 @@ patch_vtables(KXLDKext *kext, KXLDDict *patched_vtables, /* Get the meta vtable name from the class name */ rval = kxld_sym_get_meta_vtable_name_from_class_name(class_name, vtable_name, sizeof(vtable_name)); - require_noerr(rval, finish); + require_noerr(rval, finish); /* Get the meta vtable. 
Whether or not it should exist has already * been tested in create_vtables(), so if it doesn't exist and we're @@ -956,7 +975,7 @@ patch_vtables(KXLDKext *kext, KXLDDict *patched_vtables, /* Get the super meta vtable */ super_vtable = kxld_dict_find(patched_vtables, super_vtable_name); - require_action(super_vtable && super_vtable->is_patched, + require_action(super_vtable && super_vtable->is_patched, finish, rval=KERN_FAILURE); /* Patch the meta class's vtable */ @@ -1017,10 +1036,10 @@ create_vtable_index(KXLDKext *kext) } /******************************************************************************* -*******************************************************************************/ + *******************************************************************************/ static const KXLDSym * get_metaclass_symbol_from_super_meta_class_pointer_symbol(KXLDKext *kext, - KXLDSym *super_metaclass_pointer_sym) + KXLDSym *super_metaclass_pointer_sym) { kern_return_t rval = KERN_FAILURE; const KXLDReloc *reloc = NULL; @@ -1029,23 +1048,31 @@ get_metaclass_symbol_from_super_meta_class_pointer_symbol(KXLDKext *kext, check(kext); check(super_metaclass_pointer_sym); - + /* Get the relocation entry that fills in the super metaclass pointer. */ reloc = kxld_object_get_reloc_at_symbol(kext->kext, - super_metaclass_pointer_sym); + super_metaclass_pointer_sym); require_action(reloc, finish, rval=KERN_FAILURE); - + /* Get the section of the super metaclass pointer. */ sect = kxld_object_get_section_by_index(kext->kext, - super_metaclass_pointer_sym->sectnum); + super_metaclass_pointer_sym->sectnum); require_action(sect, finish, rval=KERN_FAILURE); - + /* Get the symbol that will be filled into the super metaclass pointer. 
*/ metaclass = kxld_object_get_symbol_of_reloc(kext->kext, reloc, sect); + + finish: + if (metaclass == NULL) { + kxld_log(kKxldLogLinking, kKxldLogErr, + "metaclass == NULL kxld_sym %s <%s>", + super_metaclass_pointer_sym->name, __func__); + } return metaclass; } + /******************************************************************************* *******************************************************************************/ static kern_return_t diff --git a/libkern/kxld/kxld_kext.h b/libkern/kxld/kxld_kext.h index 58e932684..58b68bce3 100644 --- a/libkern/kxld/kxld_kext.h +++ b/libkern/kxld/kxld_kext.h @@ -72,6 +72,11 @@ kern_return_t kxld_kext_export_symbols(const KXLDKext *kext, struct kxld_dict *defined_cxx_symbols_by_value) __attribute__((nonnull(1), visibility("hidden"))); +void kxld_kext_get_vmsize_for_seg_by_name(const KXLDKext *kext, + const char *segname, + u_long *vmsize) +__attribute__((nonnull, visibility("hidden"))); + void kxld_kext_get_vmsize(const KXLDKext *kext, u_long *header_size, u_long *vmsize) __attribute__((nonnull, visibility("hidden"))); @@ -80,7 +85,8 @@ void kxld_kext_set_linked_object_size(KXLDKext *kext, u_long vmsize) __attribute__((nonnull, visibility("hidden"))); kern_return_t kxld_kext_export_linked_object(const KXLDKext *kext, - u_char *linked_object, kxld_addr_t *kmod_info) + void *linked_object, + kxld_addr_t *kmod_info) __attribute__((nonnull, visibility("hidden"))); /******************************************************************************* @@ -92,10 +98,13 @@ kern_return_t kxld_kext_export_vtables(KXLDKext *kext, struct kxld_dict *vtables) __attribute__((nonnull, visibility("hidden"))); -kern_return_t kxld_kext_relocate(KXLDKext *kext, kxld_addr_t link_address, - struct kxld_dict *patched_vtables, const struct kxld_dict *defined_symbols, - const struct kxld_dict *obsolete_symbols, - const struct kxld_dict *defined_cxx_symbols) - __attribute__((nonnull(1,3,4), visibility("hidden"))); +kern_return_t 
kxld_kext_relocate(KXLDKext *kext, + kxld_addr_t link_address, + struct kxld_dict *patched_vtables, + const struct kxld_dict *defined_symbols, + const struct kxld_dict *obsolete_symbols, + const struct kxld_dict *defined_cxx_symbols) +__attribute__((nonnull(1,3,4), visibility("hidden"))); + #endif /* _KXLD_KEXT_H_ */ diff --git a/libkern/kxld/kxld_object.c b/libkern/kxld/kxld_object.c index 36383e41e..896cf09fc 100644 --- a/libkern/kxld/kxld_object.c +++ b/libkern/kxld/kxld_object.c @@ -63,16 +63,20 @@ #include "kxld_uuid.h" #include "kxld_versionmin.h" #include "kxld_vtable.h" +#include "kxld_splitinfolc.h" #include "kxld_object.h" +extern boolean_t isSplitKext; +extern boolean_t isOldInterface; + /******************************************************************************* * Data structures *******************************************************************************/ struct kxld_object { - u_char *file; - u_long size; + u_char *file; // used by old interface + u_long size; // used by old interface const char *name; uint32_t filetype; cpu_type_t cputype; @@ -87,6 +91,8 @@ struct kxld_object { KXLDsrcversion srcversion; KXLDSymtab *symtab; struct dysymtab_command *dysymtab_hdr; + KXLDsplitinfolc splitinfolc; + splitKextLinkInfo split_info; kxld_addr_t link_addr; u_long output_buffer_size; boolean_t is_kernel; @@ -186,7 +192,7 @@ static kern_return_t populate_kmod_info(KXLDObject *object); *******************************************************************************/ static boolean_t kxld_object_target_needs_swap(const KXLDObject *object __unused); static KXLDSeg * kxld_object_get_seg_by_name(const KXLDObject *object, const char *segname); -static KXLDSect * kxld_object_get_sect_by_name(const KXLDObject *object, const char *segname, +static KXLDSect * kxld_object_get_sect_by_name(const KXLDObject *object, const char *segname, const char *sectname); /******************************************************************************* @@ -207,6 +213,7 @@ 
kxld_object_init_from_macho(KXLDObject *object, u_char *file, u_long size, kern_return_t rval = KERN_FAILURE; KXLDSeg * seg = NULL; u_int i = 0; + u_char * my_file; check(object); check(file); @@ -231,6 +238,13 @@ kxld_object_init_from_macho(KXLDObject *object, u_char *file, u_long size, rval = get_macho_slice_for_arch(object, file, size); require_noerr(rval, finish); + if (isOldInterface) { + my_file = object->file; + } + else { + my_file = object->split_info.kextExecutable; + } + /* Allocate the symbol table */ if (!object->symtab) { @@ -241,9 +255,12 @@ kxld_object_init_from_macho(KXLDObject *object, u_char *file, u_long size, /* Build the relocator */ - rval = kxld_relocator_init(&object->relocator, object->file, - object->symtab, &object->sects, object->cputype, - object->cpusubtype, kxld_object_target_needs_swap(object)); + rval = kxld_relocator_init(&object->relocator, + my_file, + object->symtab, &object->sects, + object->cputype, + object->cpusubtype, + kxld_object_target_needs_swap(object)); require_noerr(rval, finish); /* There are four types of Mach-O files that we can support: @@ -254,10 +271,10 @@ kxld_object_init_from_macho(KXLDObject *object, u_char *file, u_long size, */ if (kxld_object_is_32_bit(object)) { - struct mach_header *mach_hdr = (struct mach_header *) ((void *) object->file); + struct mach_header *mach_hdr = (struct mach_header *) ((void *) my_file); object->filetype = mach_hdr->filetype; } else { - struct mach_header_64 *mach_hdr = (struct mach_header_64 *) ((void *) object->file); + struct mach_header_64 *mach_hdr = (struct mach_header_64 *) ((void *) my_file); object->filetype = mach_hdr->filetype; } @@ -301,10 +318,11 @@ kxld_object_init_from_macho(KXLDObject *object, u_char *file, u_long size, , &object->locrelocs, &object->extrelocs, target_supports_slideable_kexts(object) #endif + , isOldInterface ? 
0 : object->splitinfolc.datasize ); } } - + (void) set_is_object_linked(object); rval = KERN_SUCCESS; @@ -312,6 +330,34 @@ kxld_object_init_from_macho(KXLDObject *object, u_char *file, u_long size, return rval; } +/******************************************************************************* + *******************************************************************************/ +splitKextLinkInfo * +kxld_object_get_link_info(KXLDObject *object) +{ + check(object); + + return &object->split_info; +} + + +/******************************************************************************* + *******************************************************************************/ +void +kxld_object_set_link_info(KXLDObject *object, splitKextLinkInfo *link_info) +{ + check(object); + check(link_info); + + object->split_info.vmaddr_TEXT = link_info->vmaddr_TEXT; + object->split_info.vmaddr_TEXT_EXEC = link_info->vmaddr_TEXT_EXEC; + object->split_info.vmaddr_DATA = link_info->vmaddr_DATA; + object->split_info.vmaddr_DATA_CONST = link_info->vmaddr_DATA_CONST; + object->split_info.vmaddr_LINKEDIT = link_info->vmaddr_LINKEDIT; + + return; +} + /******************************************************************************* *******************************************************************************/ kern_return_t @@ -426,14 +472,13 @@ get_macho_slice_for_arch(KXLDObject *object, u_char *file, u_long size) struct fat_arch *archs = (struct fat_arch *) &fat[1]; boolean_t swap = FALSE; #endif /* KERNEL */ - + u_char *my_file = file; + u_long my_file_size = size; + check(object); check(file); check(size); - object->file = file; - object->size = size; - /* We are assuming that we will never receive a fat file in the kernel */ #if !KERNEL @@ -469,20 +514,20 @@ get_macho_slice_for_arch(KXLDObject *object, u_char *file, u_long size) rval=KERN_FAILURE; kxld_log(kKxldLogLinking, kKxldLogErr, kKxldLogTruncatedMachO)); - object->file = file + arch->offset; - object->size = arch->size; + my_file = 
my_file + arch->offset; + my_file_size = arch->size; } #endif /* !KERNEL */ /* Swap the Mach-O's headers to this architecture if necessary */ if (kxld_object_is_32_bit(object)) { - rval = validate_and_swap_macho_32(object->file, object->size + rval = validate_and_swap_macho_32(my_file, my_file_size #if !KERNEL , object->host_order #endif /* !KERNEL */ ); } else { - rval = validate_and_swap_macho_64(object->file, object->size + rval = validate_and_swap_macho_64(my_file, my_file_size #if !KERNEL , object->host_order #endif /* !KERNEL */ @@ -490,12 +535,21 @@ get_macho_slice_for_arch(KXLDObject *object, u_char *file, u_long size) } require_noerr(rval, finish); - mach_hdr = (struct mach_header *) ((void *) object->file); + mach_hdr = (struct mach_header *) ((void *) my_file); require_action(object->cputype == mach_hdr->cputype, finish, rval=KERN_FAILURE; kxld_log(kKxldLogLinking, kKxldLogErr, kKxldLogTruncatedMachO)); object->cpusubtype = mach_hdr->cpusubtype; /* */ + if (isOldInterface) { + object->file = my_file; + object->size = my_file_size; + } + else { + object->split_info.kextExecutable = my_file; + object->split_info.kextSize = my_file_size; + } + rval = KERN_SUCCESS; finish: return rval; @@ -526,16 +580,24 @@ init_from_final_linked_image(KXLDObject *object, u_int *filetype_out, u_int nsegs = 0; u_int nsects = 0; u_int ncmds = 0; + u_char *my_file; + + if (isOldInterface) { + my_file = object->file; + } + else { + my_file = object->split_info.kextExecutable; + } KXLD_3264_FUNC(kxld_object_is_32_bit(object), base_offset, get_macho_cmd_data_32, get_macho_cmd_data_64, - object->file, offset, &filetype, &ncmds); + my_file, offset, &filetype, &ncmds); /* First pass to count segments and sections */ offset = base_offset; for (i = 0; i < ncmds; ++i, offset += cmd_hdr->cmdsize) { - cmd_hdr = (struct load_command *) ((void *) (object->file + offset)); + cmd_hdr = (struct load_command *) ((void *) (my_file + offset)); switch(cmd_hdr->cmd) { #if KXLD_USER_OR_ILP32 @@ 
-585,7 +647,7 @@ init_from_final_linked_image(KXLDObject *object, u_int *filetype_out, offset = base_offset; for (i = 0; i < ncmds; ++i, offset += cmd_hdr->cmdsize) { - cmd_hdr = (struct load_command *) ((void *) (object->file + offset)); + cmd_hdr = (struct load_command *) ((void *) (my_file + offset)); seg = NULL; switch(cmd_hdr->cmd) { @@ -634,6 +696,7 @@ init_from_final_linked_image(KXLDObject *object, u_int *filetype_out, break; case LC_VERSION_MIN_MACOSX: case LC_VERSION_MIN_IPHONEOS: + case LC_VERSION_MIN_TVOS: case LC_VERSION_MIN_WATCHOS: versionmin_hdr = (struct version_min_command *) cmd_hdr; kxld_versionmin_init_from_macho(&object->versionmin, versionmin_hdr); @@ -644,14 +707,13 @@ init_from_final_linked_image(KXLDObject *object, u_int *filetype_out, break; case LC_DYSYMTAB: object->dysymtab_hdr = (struct dysymtab_command *) cmd_hdr; - rval = kxld_reloc_create_macho(&object->extrelocs, &object->relocator, - (struct relocation_info *) ((void *) (object->file + object->dysymtab_hdr->extreloff)), + (struct relocation_info *) ((void *) (my_file + object->dysymtab_hdr->extreloff)), object->dysymtab_hdr->nextrel); require_noerr(rval, finish); rval = kxld_reloc_create_macho(&object->locrelocs, &object->relocator, - (struct relocation_info *) ((void *) (object->file + object->dysymtab_hdr->locreloff)), + (struct relocation_info *) ((void *) (my_file + object->dysymtab_hdr->locreloff)), object->dysymtab_hdr->nlocrel); require_noerr(rval, finish); @@ -665,7 +727,12 @@ init_from_final_linked_image(KXLDObject *object, u_int *filetype_out, "LC_UNIXTHREAD/LC_MAIN segment is not valid in a kext.")); break; case LC_SEGMENT_SPLIT_INFO: - /* To be implemented later; treat as uninteresting for now */ + if (isSplitKext) { + struct linkedit_data_command *split_info_hdr = NULL; + split_info_hdr = (struct linkedit_data_command *) (void *) cmd_hdr; + kxld_splitinfolc_init_from_macho(&object->splitinfolc, split_info_hdr); + } + break; case LC_CODE_SIGNATURE: case LC_DYLD_INFO: 
case LC_DYLD_INFO_ONLY: @@ -686,9 +753,10 @@ init_from_final_linked_image(KXLDObject *object, u_int *filetype_out, /* Initialize the sections */ for (j = 0; j < seg->sects.nitems; ++j, ++secti) { sect = kxld_array_get_item(&object->sects, secti); + KXLD_3264_FUNC(kxld_object_is_32_bit(object), rval, - kxld_sect_init_from_macho_32, kxld_sect_init_from_macho_64, - sect, object->file, §_offset, secti, &object->relocator); + kxld_sect_init_from_macho_32, kxld_sect_init_from_macho_64, + sect, my_file, §_offset, secti, &object->relocator); require_noerr(rval, finish); /* Add the section to the segment. This will also make sure @@ -725,11 +793,19 @@ init_from_execute(KXLDObject *object) KXLDSectionName *sname = NULL; u_int i = 0, j = 0, k = 0; #endif /* KXLD_USER_OR_OBJECT */ - + u_char *my_file; + check(object); + if (isOldInterface) { + my_file = object->file; + } + else { + my_file = object->split_info.kextExecutable; + } + require_action(kxld_object_is_kernel(object), finish, rval=KERN_FAILURE); - + rval = init_from_final_linked_image(object, &filetype, &symtab_hdr); require_noerr(rval, finish); @@ -749,7 +825,7 @@ init_from_execute(KXLDObject *object) KXLD_3264_FUNC(kxld_object_is_32_bit(object), rval, kxld_symtab_init_from_macho_32, kxld_symtab_init_from_macho_64, - object->symtab, symtab_hdr, object->file, kernel_linkedit_seg); + object->symtab, symtab_hdr, my_file, kernel_linkedit_seg); require_noerr(rval, finish); #if KXLD_USER_OR_OBJECT @@ -801,8 +877,16 @@ init_from_bundle(KXLDObject *object) kern_return_t rval = KERN_FAILURE; struct symtab_command *symtab_hdr = NULL; u_int filetype = 0; - + u_char *my_file; + check(object); + + if (isOldInterface) { + my_file = object->file; + } + else { + my_file = object->split_info.kextExecutable; + } require_action(target_supports_bundle(object), finish, rval=KERN_FAILURE; @@ -812,18 +896,18 @@ init_from_bundle(KXLDObject *object) rval = init_from_final_linked_image(object, &filetype, &symtab_hdr); require_noerr(rval, 
finish); - require_action(filetype == MH_KEXT_BUNDLE, finish, + require_action(filetype == MH_KEXT_BUNDLE, finish, rval=KERN_FAILURE); KXLD_3264_FUNC(kxld_object_is_32_bit(object), rval, kxld_symtab_init_from_macho_32, kxld_symtab_init_from_macho_64, - object->symtab, symtab_hdr, object->file, + object->symtab, symtab_hdr, my_file, /* kernel_linkedit_seg */ NULL); require_noerr(rval, finish); rval = KERN_SUCCESS; finish: - return rval; + return rval; } #endif /* KXLD_USER_OR_BUNDLE */ @@ -852,8 +936,16 @@ init_from_object(KXLDObject *object) u_int nsects = 0; u_int i = 0; boolean_t has_segment = FALSE; - + u_char *my_file; + check(object); + + if (isOldInterface) { + my_file = object->file; + } + else { + my_file = object->split_info.kextExecutable; + } require_action(target_supports_object(object), finish, rval=KERN_FAILURE; @@ -862,7 +954,7 @@ init_from_object(KXLDObject *object) KXLD_3264_FUNC(kxld_object_is_32_bit(object), offset, get_macho_cmd_data_32, get_macho_cmd_data_64, - object->file, offset, &filetype, &ncmds); + my_file, offset, &filetype, &ncmds); require_action(filetype == MH_OBJECT, finish, rval=KERN_FAILURE); @@ -873,7 +965,7 @@ init_from_object(KXLDObject *object) */ for (; i < ncmds; ++i, offset += cmd_hdr->cmdsize) { - cmd_hdr = (struct load_command *) ((void *) (object->file + offset)); + cmd_hdr = (struct load_command *) ((void *) (my_file + offset)); switch(cmd_hdr->cmd) { #if KXLD_USER_OR_ILP32 @@ -939,7 +1031,7 @@ init_from_object(KXLDObject *object) KXLD_3264_FUNC(kxld_object_is_32_bit(object), rval, kxld_symtab_init_from_macho_32, kxld_symtab_init_from_macho_64, - object->symtab, symtab_hdr, object->file, + object->symtab, symtab_hdr, my_file, /* kernel_linkedit_seg */ NULL); require_noerr(rval, finish); break; @@ -961,6 +1053,7 @@ init_from_object(KXLDObject *object) break; case LC_VERSION_MIN_MACOSX: case LC_VERSION_MIN_IPHONEOS: + case LC_VERSION_MIN_TVOS: case LC_VERSION_MIN_WATCHOS: case LC_SOURCE_VERSION: /* Not supported for object 
files, fall through */ @@ -983,9 +1076,10 @@ init_from_object(KXLDObject *object) for (i = 0; i < nsects; ++i) { sect = kxld_array_get_item(&object->sects, i); + KXLD_3264_FUNC(kxld_object_is_32_bit(object), rval, - kxld_sect_init_from_macho_32, kxld_sect_init_from_macho_64, - sect, object->file, §_offset, i, &object->relocator); + kxld_sect_init_from_macho_32, kxld_sect_init_from_macho_64, + sect, my_file, §_offset, i, &object->relocator); require_noerr(rval, finish); } @@ -1092,6 +1186,10 @@ get_macho_header_size(const KXLDObject *object) header_size += kxld_srcversion_get_macho_header_size(); } + if (isSplitKext && object->splitinfolc.has_splitinfolc) { + header_size += kxld_splitinfolc_get_macho_header_size(); + } + return header_size; } @@ -1125,8 +1223,9 @@ get_macho_data_size(const KXLDObject *object) /* get current __LINKEDIT sizes */ seg = kxld_object_get_seg_by_name(object, SEG_LINKEDIT); - seg_vmsize = (u_long) kxld_seg_get_vmsize(seg); + seg_vmsize = (u_long) kxld_seg_get_vmsize(seg); + /* get size of symbol table data that will eventually be dumped * into the __LINKEDIT segment */ @@ -1233,10 +1332,10 @@ kxld_object_get_reloc_at_symbol(const KXLDObject *object, const KXLDSym *sym) if (kxld_object_is_final_image(object)) { reloc = kxld_reloc_get_reloc_by_offset(&object->extrelocs, - sym->base_addr); + sym->base_addr); if (!reloc) { reloc = kxld_reloc_get_reloc_by_offset(&object->locrelocs, - sym->base_addr); + sym->base_addr); } } else { offset = kxld_sym_get_section_offset(sym, sect); @@ -1254,13 +1353,20 @@ kxld_object_get_symbol_of_reloc(const KXLDObject *object, const KXLDReloc *reloc, const KXLDSect *sect) { const KXLDSym *sym = NULL; - - if (kxld_object_is_final_image(object)) { - sym = kxld_reloc_get_symbol(&object->relocator, reloc, object->file); - } else { - sym = kxld_reloc_get_symbol(&object->relocator, reloc, sect->data); + u_char *my_file; + + if (isOldInterface) { + my_file = object->file; } - + else { + my_file = 
object->split_info.kextExecutable; + } + + if (kxld_object_is_final_image(object)) { + sym = kxld_reloc_get_symbol(&object->relocator, reloc, my_file); + } else { + sym = kxld_reloc_get_symbol(&object->relocator, reloc, sect->data); + } return sym; } @@ -1524,17 +1630,25 @@ set_is_object_linked(KXLDObject *object) /******************************************************************************* *******************************************************************************/ -void kxld_object_clear(KXLDObject *object __unused) +void kxld_object_clear(KXLDObject *object) { KXLDSeg *seg = NULL; KXLDSect *sect = NULL; u_int i; - + u_char *my_file; + check(object); + if (isOldInterface) { + my_file = object->file; + } + else { + my_file = object->split_info.kextExecutable; + } + #if !KERNEL if (kxld_object_is_kernel(object)) { - unswap_macho(object->file, object->host_order, object->target_order); + unswap_macho(my_file, object->host_order, object->target_order); } #endif /* !KERNEL */ @@ -1559,8 +1673,15 @@ void kxld_object_clear(KXLDObject *object __unused) if (object->symtab) kxld_symtab_clear(object->symtab); - object->file = NULL; - object->size = 0; + if (isOldInterface) { + object->file = NULL; + object->size = 0; + } + else { + kxld_splitinfolc_clear(&object->splitinfolc); + object->split_info.kextExecutable = NULL; + object->split_info.kextSize = 0; + } object->filetype = 0; object->cputype = 0; object->cpusubtype = 0; @@ -1585,12 +1706,20 @@ void kxld_object_deinit(KXLDObject *object __unused) KXLDSeg *seg = NULL; KXLDSect *sect = NULL; u_int i; - + u_char *my_file; + check(object); + + if (isOldInterface) { + my_file = object->file; + } + else { + my_file = object->split_info.kextExecutable; + } #if !KERNEL - if (object->file && kxld_object_is_kernel(object)) { - unswap_macho(object->file, object->host_order, object->target_order); + if (my_file && kxld_object_is_kernel(object)) { + unswap_macho(my_file, object->host_order, object->target_order); } #endif /* 
!KERNEL */ @@ -1622,9 +1751,18 @@ void kxld_object_deinit(KXLDObject *object __unused) const u_char * kxld_object_get_file(const KXLDObject *object) { + const u_char *my_file; + check(object); - return object->file; + if (isOldInterface) { + my_file = object->file; + } + else { + my_file = object->split_info.kextExecutable; + } + + return my_file; } /******************************************************************************* @@ -1697,6 +1835,42 @@ kxld_object_target_supports_common_symbols(const KXLDObject *object) return (object->cputype == CPU_TYPE_I386); } + +/******************************************************************************* + *******************************************************************************/ +void +kxld_object_get_vmsize_for_seg_by_name(const KXLDObject *object, + const char *segname, + u_long *vmsize) +{ + check(object); + check(segname); + check(vmsize); + + KXLDSeg *seg = NULL; + u_long my_size = 0; + + /* segment vmsize */ + seg = kxld_object_get_seg_by_name(object, segname); + + my_size = (u_long) kxld_seg_get_vmsize(seg); + +#if KXLD_PIC_KEXTS + if (kxld_seg_is_linkedit_seg(seg)) + { + u_long reloc_size = 0; + + if (target_supports_slideable_kexts(object)) { + /* get size of __DYSYMTAB relocation entries */ + reloc_size = kxld_reloc_get_macho_data_size(&object->locrelocs, &object->extrelocs); + my_size += reloc_size; + } + } +#endif + + *vmsize = my_size; +} + /******************************************************************************* *******************************************************************************/ void @@ -1713,6 +1887,7 @@ kxld_object_get_vmsize(const KXLDObject *object, u_long *header_size, *header_size = (object->is_final_image) ? 
0 : (u_long)kxld_round_page_cross_safe(get_macho_header_size(object)); + *vmsize = *header_size + get_macho_data_size(object); } @@ -1722,7 +1897,14 @@ kxld_object_get_vmsize(const KXLDObject *object, u_long *header_size, void kxld_object_set_linked_object_size(KXLDObject *object, u_long vmsize) { - object->output_buffer_size = vmsize; /* cache this for use later */ + check(object); + + if (isOldInterface) { + object->output_buffer_size = vmsize; /* cache this for use later */ + } + else { + object->split_info.linkedKextSize = vmsize; + } return; } @@ -1730,7 +1912,8 @@ kxld_object_set_linked_object_size(KXLDObject *object, u_long vmsize) *******************************************************************************/ kern_return_t kxld_object_export_linked_object(const KXLDObject *object, - u_char *linked_object) + void *linked_object + ) { kern_return_t rval = KERN_FAILURE; KXLDSeg *seg = NULL; @@ -1741,15 +1924,27 @@ kxld_object_export_linked_object(const KXLDObject *object, u_int ncmds = 0; u_int i = 0; boolean_t is_32bit_object = kxld_object_is_32_bit(object); + kxld_addr_t link_addr; + u_char *my_linked_object; check(object); check(linked_object); + + if (isOldInterface) { + size = object->output_buffer_size; + link_addr = object->link_addr; + my_linked_object = (u_char *) linked_object; + } + else { + size = ((splitKextLinkInfo *)linked_object)->linkedKextSize; + link_addr = ((splitKextLinkInfo *)linked_object)->vmaddr_TEXT; + my_linked_object = ((splitKextLinkInfo *)linked_object)->linkedKext; + } /* Calculate the size of the headers and data */ header_size = get_macho_header_size(object); - size = object->output_buffer_size; - + /* Copy data to the file */ ncmds = object->segs.nitems + 1 /* LC_SYMTAB */; @@ -1774,50 +1969,75 @@ kxld_object_export_linked_object(const KXLDObject *object, ncmds++; } - rval = export_macho_header(object, linked_object, ncmds, &header_offset, header_size); - require_noerr(rval, finish); + if (isSplitKext && 
object->splitinfolc.has_splitinfolc) { + ncmds++; + } + rval = export_macho_header(object, my_linked_object, ncmds, &header_offset, header_size); + require_noerr(rval, finish); + for (i = 0; i < object->segs.nitems; ++i) { seg = kxld_array_get_item(&object->segs, i); - rval = kxld_seg_export_macho_to_vm(seg, linked_object, &header_offset, - header_size, size, object->link_addr, is_32bit_object); - require_noerr(rval, finish); + rval = kxld_seg_export_macho_to_vm(seg, my_linked_object, &header_offset, + header_size, size, link_addr, is_32bit_object); + require_noerr(rval, finish); } seg = kxld_object_get_seg_by_name(object, SEG_LINKEDIT); - data_offset = (u_long) (seg->link_addr - object->link_addr); + data_offset = (u_long) (seg->link_addr - link_addr); - rval = kxld_symtab_export_macho(object->symtab, linked_object, &header_offset, - header_size, &data_offset, size, is_32bit_object); + // data_offset is used to set the fileoff in the macho header load commands + rval = kxld_symtab_export_macho(object->symtab, + my_linked_object, + &header_offset, + header_size, + &data_offset, size, is_32bit_object); require_noerr(rval, finish); + // data_offset now points past the symbol tab and strings data in the linkedit + // segment - (it was used to set new values for symoff and stroff) + #if KXLD_PIC_KEXTS if (target_supports_slideable_kexts(object)) { - rval = kxld_reloc_export_macho(&object->relocator, &object->locrelocs, - &object->extrelocs, linked_object, &header_offset, header_size, - &data_offset, size); + rval = kxld_reloc_export_macho(&object->relocator, + &object->locrelocs, + &object->extrelocs, + my_linked_object, + &header_offset, + header_size, + &data_offset, size); require_noerr(rval, finish); } #endif /* KXLD_PIC_KEXTS */ if (object->uuid.has_uuid) { - rval = kxld_uuid_export_macho(&object->uuid, linked_object, &header_offset, header_size); + rval = kxld_uuid_export_macho(&object->uuid, my_linked_object, &header_offset, header_size); require_noerr(rval, 
finish); } if (object->versionmin.has_versionmin) { - rval = kxld_versionmin_export_macho(&object->versionmin, linked_object, &header_offset, header_size); + rval = kxld_versionmin_export_macho(&object->versionmin, my_linked_object, &header_offset, header_size); require_noerr(rval, finish); } if (object->srcversion.has_srcversion) { - rval = kxld_srcversion_export_macho(&object->srcversion, linked_object, &header_offset, header_size); + rval = kxld_srcversion_export_macho(&object->srcversion, my_linked_object, &header_offset, header_size); require_noerr(rval, finish); } - + + if (isSplitKext && object->splitinfolc.has_splitinfolc) { + rval = kxld_splitinfolc_export_macho(&object->splitinfolc, + linked_object, + &header_offset, + header_size, + &data_offset, + size); + require_noerr(rval, finish); + } + #if !KERNEL - unswap_macho(linked_object, object->host_order, object->target_order); + unswap_macho(my_linked_object, object->host_order, object->target_order); #endif /* KERNEL */ rval = KERN_SUCCESS; @@ -1846,7 +2066,7 @@ export_macho_header(const KXLDObject *object, u_char *buf, u_int ncmds, rval = KERN_SUCCESS; finish: - return rval; + return rval; } #if KXLD_USER_OR_ILP32 @@ -1862,7 +2082,7 @@ export_macho_header_32(const KXLDObject *object, u_char *buf, u_int ncmds, check(object); check(buf); check(header_offset); - + require_action(sizeof(*mach) <= header_size - *header_offset, finish, rval=KERN_FAILURE); mach = (struct mach_header *) ((void *) (buf + *header_offset)); @@ -1876,7 +2096,7 @@ export_macho_header_32(const KXLDObject *object, u_char *buf, u_int ncmds, mach->flags = MH_NOUNDEFS; *header_offset += sizeof(*mach); - + rval = KERN_SUCCESS; finish: @@ -1911,7 +2131,21 @@ export_macho_header_64(const KXLDObject *object, u_char *buf, u_int ncmds, mach->flags = MH_NOUNDEFS; *header_offset += sizeof(*mach); - + +#if SPLIT_KEXTS_DEBUG + { + kxld_log(kKxldLogLinking, kKxldLogErr, + " %p >>> Start of macho header (size %lu) <%s>", + (void *) mach, + 
sizeof(*mach), + __func__); + kxld_log(kKxldLogLinking, kKxldLogErr, + " %p <<< End of macho header <%s>", + (void *) ((u_char *)mach + sizeof(*mach)), + __func__); + } +#endif + rval = KERN_SUCCESS; finish: @@ -1952,7 +2186,7 @@ kxld_object_relocate(KXLDObject *object, kxld_addr_t link_address) for (i = 0; i < object->segs.nitems; ++i) { seg = kxld_array_get_item(&object->segs, i); kxld_seg_relocate(seg, link_address); - } + } // for... /* Relocate symbols */ rval = kxld_symtab_relocate(object->symtab, &object->sects); @@ -1960,7 +2194,7 @@ kxld_object_relocate(KXLDObject *object, kxld_addr_t link_address) rval = KERN_SUCCESS; finish: - return rval; + return rval; } /******************************************************************************* @@ -2087,11 +2321,15 @@ kxld_object_process_relocations(KXLDObject *object, rval = KERN_SUCCESS; finish: - return rval; + return rval; } #if KXLD_USER_OR_BUNDLE +#if SPLIT_KEXTS_DEBUG +static boolean_t kxld_show_ptr_value; +#endif + #define SECT_SYM_PTRS "__nl_symbol_ptr" /******************************************************************************* @@ -2122,7 +2360,7 @@ process_symbol_pointers(KXLDObject *object) /* Get the __DATA,__nl_symbol_ptr section. If it doesn't exist, we have * nothing to do. */ - + sect = kxld_object_get_sect_by_name(object, SEG_DATA, SECT_SYM_PTRS); if (!sect || !(sect->flags & S_NON_LAZY_SYMBOL_POINTERS)) { rval = KERN_SUCCESS; @@ -2155,19 +2393,35 @@ process_symbol_pointers(KXLDObject *object) * action is required. 
*/ - symidx = (int32_t *) ((void *) (object->file + object->dysymtab_hdr->indirectsymoff)); + if (isOldInterface) { + symidx = (int32_t *) ((void *) (object->file + object->dysymtab_hdr->indirectsymoff)); + } + else { + symidx = (int32_t *) ((void *) (object->split_info.kextExecutable + object->dysymtab_hdr->indirectsymoff)); + } + symidx += firstsym; symptr = sect->data; for (i = 0; i < nsyms; ++i, ++symidx, symptr+=symptrsize) { if (*symidx & INDIRECT_SYMBOL_LOCAL) { if (*symidx & INDIRECT_SYMBOL_ABS) continue; - add_to_ptr(symptr, object->link_addr, kxld_object_is_32_bit(object)); + if (isOldInterface) { + add_to_ptr(symptr, object->link_addr, kxld_object_is_32_bit(object)); + } + else { + add_to_ptr(symptr, object->split_info.vmaddr_TEXT, kxld_object_is_32_bit(object)); + } } else { sym = kxld_symtab_get_symbol_by_index(object->symtab, *symidx); require_action(sym, finish, rval=KERN_FAILURE); - - add_to_ptr(symptr, sym->link_addr, kxld_object_is_32_bit(object)); + + if (isOldInterface) { + add_to_ptr(symptr, sym->link_addr, kxld_object_is_32_bit(object)); + } + else { + add_to_ptr(symptr, object->split_info.vmaddr_TEXT, kxld_object_is_32_bit(object)); + } } } @@ -2214,8 +2468,31 @@ process_relocs_from_tables(KXLDObject *object) seg = get_seg_by_base_addr(object, reloc->address); require_action(seg, finish, rval=KERN_FAILURE); - rval = kxld_relocator_process_table_reloc(&object->relocator, reloc, - seg, object->link_addr); + if (isOldInterface) { + rval = kxld_relocator_process_table_reloc(&object->relocator, reloc, + seg, object->link_addr); + } + else { + kxld_addr_t my_link_addr = object->split_info.vmaddr_TEXT; + if (isSplitKext) { + if (kxld_seg_is_text_exec_seg(seg)) { + my_link_addr = object->split_info.vmaddr_TEXT_EXEC; + } + else if (kxld_seg_is_data_seg(seg)) { + my_link_addr = object->split_info.vmaddr_DATA; + } + else if (kxld_seg_is_data_const_seg(seg)) { + my_link_addr = object->split_info.vmaddr_DATA_CONST; + } + else if 
(kxld_seg_is_linkedit_seg(seg)) { + my_link_addr = object->split_info.vmaddr_LINKEDIT; + } + } + rval = kxld_relocator_process_table_reloc(&object->relocator, + reloc, + seg, + my_link_addr); + } require_noerr(rval, finish); } @@ -2225,9 +2502,32 @@ process_relocs_from_tables(KXLDObject *object) seg = get_seg_by_base_addr(object, reloc->address); require_action(seg, finish, rval=KERN_FAILURE); - - rval = kxld_relocator_process_table_reloc(&object->relocator, reloc, - seg, object->link_addr); + + if (isOldInterface) { + rval = kxld_relocator_process_table_reloc(&object->relocator, reloc, + seg, object->link_addr); + } + else { + kxld_addr_t my_link_addr = object->split_info.vmaddr_TEXT; + if (isSplitKext) { + if (kxld_seg_is_text_exec_seg(seg)) { + my_link_addr = object->split_info.vmaddr_TEXT_EXEC; + } + else if (kxld_seg_is_data_seg(seg)) { + my_link_addr = object->split_info.vmaddr_DATA; + } + else if (kxld_seg_is_data_const_seg(seg)) { + my_link_addr = object->split_info.vmaddr_DATA_CONST; + } + else if (kxld_seg_is_linkedit_seg(seg)) { + my_link_addr = object->split_info.vmaddr_LINKEDIT; + } + } + rval = kxld_relocator_process_table_reloc(&object->relocator, + reloc, + seg, + my_link_addr); + } require_noerr(rval, finish); } @@ -2243,11 +2543,18 @@ add_to_ptr(u_char *symptr, kxld_addr_t val, boolean_t is_32_bit) { if (is_32_bit) { uint32_t *ptr = (uint32_t *) ((void *) symptr); + *ptr += (uint32_t) val; } else { uint64_t *ptr = (uint64_t *) ((void *) symptr); + *ptr += (uint64_t) val; } + +#if SPLIT_KEXTS_DEBUG + kxld_show_ptr_value = FALSE; +#endif + } #endif /* KXLD_USER_OR_BUNDLE */ @@ -2299,12 +2606,20 @@ populate_kmod_info(KXLDObject *object) kxld_log(kKxldLogLinking, kKxldLogErr, kKxldLogNoKmodInfo)); kmodsect = kxld_array_get_item(&object->sects, kmodsym->sectnum); + kmod_offset = (u_long) (kmodsym->base_addr - kmodsect->base_addr); kmod_info = (kmod_info_t *) ((void *) (kmodsect->data + kmod_offset)); if (kxld_object_is_32_bit(object)) { 
kmod_info_32_v1_t *kmod = (kmod_info_32_v1_t *) (kmod_info); - kmod->address = (uint32_t) object->link_addr; + + if (isOldInterface) { + kmod->address = (uint32_t) object->link_addr; + } + else { + kmod->address = (uint32_t) object->split_info.vmaddr_TEXT; + } + kmod->size = (uint32_t) size; kmod->hdr_size = (uint32_t) header_size; @@ -2317,7 +2632,14 @@ populate_kmod_info(KXLDObject *object) #endif /* !KERNEL */ } else { kmod_info_64_v1_t *kmod = (kmod_info_64_v1_t *) (kmod_info); - kmod->address = object->link_addr; + + if (isOldInterface) { + kmod->address = object->link_addr; + } + else { + kmod->address = object->split_info.vmaddr_TEXT; + } + kmod->size = size; kmod->hdr_size = header_size; @@ -2328,8 +2650,29 @@ populate_kmod_info(KXLDObject *object) kmod->hdr_size = OSSwapInt64(kmod->hdr_size); } #endif /* !KERNEL */ - } + +#if SPLIT_KEXTS_DEBUG + { + kxld_log(kKxldLogLinking, kKxldLogErr, + " kmodsect %p kmod_info %p = kmodsect->data %p + kmod_offset %lu <%s>", + (void *) kmodsect, + (void *) kmod_info, + (void *) kmodsect->data, + kmod_offset, + __func__); + + kxld_log(kKxldLogLinking, kKxldLogErr, + " kmod_info data: address %p size %llu hdr_size %llu start_addr %p stop_addr %p <%s>", + (void *) kmod->address, + kmod->size, + kmod->hdr_size, + (void *) kmod->start_addr, + (void *) kmod->stop_addr, + __func__); + } +#endif + } rval = KERN_SUCCESS; diff --git a/libkern/kxld/kxld_object.h b/libkern/kxld/kxld_object.h index ab78f200e..45d00530a 100644 --- a/libkern/kxld/kxld_object.h +++ b/libkern/kxld/kxld_object.h @@ -56,7 +56,7 @@ kern_return_t kxld_object_init_from_macho(KXLDObject *object, u_char *file, u_long size, const char *name, struct kxld_array *section_order, cpu_type_t cputype, cpu_subtype_t cpusubtype, KXLDFlags flags) - __attribute__((nonnull(1,2,4) visibility("hidden"))); + __attribute__((nonnull(1,2,4), visibility("hidden"))); void kxld_object_clear(KXLDObject *object) __attribute__((nonnull, visibility("hidden"))); @@ -123,11 +123,26 @@ 
void kxld_object_get_vmsize(const KXLDObject *object, u_long *header_size, void kxld_object_set_linked_object_size(KXLDObject *object, u_long vmsize) __attribute__((nonnull, visibility("hidden"))); +void kxld_object_get_vmsize_for_seg_by_name(const KXLDObject *object, + const char *segname, + u_long *vmsize) +__attribute__((nonnull, visibility("hidden"))); + +splitKextLinkInfo * kxld_object_get_link_info(KXLDObject *object) +__attribute__((nonnull, visibility("hidden"))); + +void kxld_object_set_link_info(KXLDObject *object, + splitKextLinkInfo *link_info) +__attribute__((nonnull, visibility("hidden"))); + + /* This will be the same size as kxld_kext_get_vmsize */ kern_return_t kxld_object_export_linked_object(const KXLDObject *object, - u_char *linked_object) + void *linked_object + ) __attribute__((nonnull, visibility("hidden"))); + /******************************************************************************* * Modifiers *******************************************************************************/ diff --git a/libkern/kxld/kxld_reloc.c b/libkern/kxld/kxld_reloc.c index bb93003a8..a2186be8f 100644 --- a/libkern/kxld/kxld_reloc.c +++ b/libkern/kxld/kxld_reloc.c @@ -75,6 +75,10 @@ #include #endif +extern uint32_t kaslr_offsets_index; +extern uint32_t kaslr_offsets_count; +extern uint32_t *kaslr_offsets; + #define KXLD_TARGET_NONE (u_int) 0x0 #define KXLD_TARGET_VALUE (u_int) 0x1 #define KXLD_TARGET_SECTNUM (u_int) 0x2 @@ -389,9 +393,8 @@ kxld_reloc_create_macho(KXLDArray *relocarray, const KXLDRelocator *relocator, reloc->pair_target_type = KXLD_TARGET_NONE; } } - } + } // for... 
} - rval = KERN_SUCCESS; finish: @@ -405,7 +408,7 @@ kxld_reloc_create_macho(KXLDArray *relocarray, const KXLDRelocator *relocator, * 2) Don't reference N_ABS symbols *******************************************************************************/ static u_int -count_relocatable_relocs(const KXLDRelocator *relocator, +count_relocatable_relocs(const KXLDRelocator *relocator, const struct relocation_info *relocs, u_int nrelocs) { u_int num_nonpair_relocs = 0; @@ -499,11 +502,11 @@ kxld_reloc_get_symbol(const KXLDRelocator *relocator, const KXLDReloc *reloc, sym = kxld_symtab_get_symbol_by_index(relocator->symtab, reloc->target); break; case KXLD_TARGET_SECTNUM: - if (data) { - value = kxld_relocator_get_pointer_at_addr(relocator, data, - reloc->address); - sym = kxld_symtab_get_cxx_symbol_by_value(relocator->symtab, value); - } + if (data) { + value = kxld_relocator_get_pointer_at_addr(relocator, data, + reloc->address); + sym = kxld_symtab_get_cxx_symbol_by_value(relocator->symtab, value); + } break; default: sym = NULL; @@ -570,7 +573,8 @@ kxld_reloc_get_macho_header_size() /******************************************************************************* *******************************************************************************/ u_long -kxld_reloc_get_macho_data_size(const KXLDArray *locrelocs, +kxld_reloc_get_macho_data_size( + const KXLDArray *locrelocs, const KXLDArray *extrelocs) { u_long rval = 0; @@ -608,9 +612,18 @@ kxld_reloc_export_macho(const KXLDRelocator *relocator, data_size = kxld_reloc_get_macho_data_size(locrelocs, extrelocs); require_action((*data_offset + data_size) <= size, finish, rval=KERN_FAILURE); - + start = dst = (struct relocation_info *) ((void *) (buf + *data_offset)); + if (kaslr_offsets == NULL) { + kaslr_offsets_index = 0; + kaslr_offsets_count = locrelocs->nitems + extrelocs->nitems; + kaslr_offsets = (uint32_t *)malloc(kaslr_offsets_count * sizeof(*kaslr_offsets)); + bzero(kaslr_offsets, kaslr_offsets_count * 
sizeof(*kaslr_offsets)); + } + + // copies the reloc data into the __LINKEDIT segment + // data_offset is the new value for locreloff rval = export_macho_for_array(relocator, locrelocs, &dst); require_noerr(rval, finish); @@ -618,7 +631,7 @@ kxld_reloc_export_macho(const KXLDRelocator *relocator, require_noerr(rval, finish); count = dst - start; - + memset(dysymtabhdr, 0, sizeof(*dysymtabhdr)); dysymtabhdr->cmd = LC_DYSYMTAB; dysymtabhdr->cmdsize = (uint32_t) sizeof(*dysymtabhdr); @@ -626,10 +639,34 @@ kxld_reloc_export_macho(const KXLDRelocator *relocator, dysymtabhdr->nlocrel = (uint32_t) count; *data_offset += count * sizeof(struct relocation_info); - + +#if SPLIT_KEXTS_DEBUG + kxld_log(kKxldLogLinking, kKxldLogErr, + "%p >>> Start of dysymtabhdr (size %lu) <%s> ", + (void *) dysymtabhdr, + sizeof(*dysymtabhdr), + __func__); + kxld_log(kKxldLogLinking, kKxldLogErr, + "%p <<< End of dysymtabhdr <%s> ", + (void *) ((u_char *)dysymtabhdr + sizeof(*dysymtabhdr)), + __func__); + + kxld_log(kKxldLogLinking, kKxldLogErr, + "dysymtabhdr at %p: cmdsize %u indirectsymoff %u nindirectsyms %u extreloff %u nextrel %u locreloff %u nlocrel %u <%s>", + (void *) dysymtabhdr, + dysymtabhdr->cmdsize, + dysymtabhdr->indirectsymoff, + dysymtabhdr->nindirectsyms, + dysymtabhdr->extreloff, + dysymtabhdr->nextrel, + dysymtabhdr->locreloff, + dysymtabhdr->nlocrel, + __func__); +#endif + rval = KERN_SUCCESS; finish: - return rval; + return rval; } #endif /* KXLD_PIC_KEXTS */ @@ -682,6 +719,7 @@ get_pointer_at_addr_64(const KXLDRelocator *relocator, check(relocator); addr = *(const uint64_t *) ((const void *) (data + offset)); + #if !KERNEL if (relocator->swap) { addr = OSSwapInt64(addr); @@ -786,7 +824,9 @@ kxld_reloc_update_symindex(KXLDReloc *reloc, u_int symindex) *******************************************************************************/ kern_return_t kxld_relocator_process_table_reloc(KXLDRelocator *relocator, - const KXLDReloc *reloc, const KXLDSeg *seg, kxld_addr_t link_addr) 
+ const KXLDReloc *reloc, + const KXLDSeg *seg, + kxld_addr_t link_addr) { kern_return_t rval = KERN_FAILURE; u_char *instruction = NULL; @@ -799,7 +839,7 @@ kxld_relocator_process_table_reloc(KXLDRelocator *relocator, check(relocator); check(reloc); - /* Find the instruction */ + /* Find the instruction in original kext file we are trying to link */ offset = (u_long)(seg->fileoff + (reloc->address - seg->base_addr)); instruction = relocator->file + offset; @@ -811,10 +851,14 @@ kxld_relocator_process_table_reloc(KXLDRelocator *relocator, base_pc = reloc->address; link_pc = base_pc + link_addr; + if (kxld_seg_is_split_seg(seg)) { + // link_pc for split segment special case, do not add in the base_pc + link_pc = link_addr; + } /* Relocate */ - rval = relocator->process_reloc(relocator, instruction, reloc->length, + rval = relocator->process_reloc(relocator, instruction, reloc->length, reloc->pcrel, base_pc, link_pc, link_addr, reloc->reloc_type, target, pair_target, relocator->swap); require_noerr(rval, finish); @@ -1040,7 +1084,7 @@ export_macho_for_array(const KXLDRelocator *relocator, struct relocation_info *dst = NULL; struct scattered_relocation_info *scatdst = NULL; u_int i = 0; - + dst = *dstp; for (i = 0; i < relocs->nitems; ++i) { @@ -1053,6 +1097,17 @@ export_macho_for_array(const KXLDRelocator *relocator, switch (reloc->target_type) { case KXLD_TARGET_LOOKUP: + if (kaslr_offsets) { + if (kaslr_offsets_index >= kaslr_offsets_count) { + kxld_log(kKxldLogLinking, kKxldLogErr, + "kaslr_offsets overflow %d > %d <%s> ", + kaslr_offsets_index, kaslr_offsets_count, + __func__); + abort(); + } + // reloc->address is really an offset from the start of the kext + *(kaslr_offsets + kaslr_offsets_index++) = reloc->address; + } scatdst->r_address = reloc->address; scatdst->r_pcrel = reloc->pcrel; scatdst->r_length = reloc->length; @@ -1061,6 +1116,15 @@ export_macho_for_array(const KXLDRelocator *relocator, scatdst->r_scattered = 1; break; case KXLD_TARGET_SECTNUM: + 
if (kaslr_offsets) { + if (kaslr_offsets_index >= kaslr_offsets_count) { + kxld_log(kKxldLogLinking, kKxldLogErr, + "kaslr_offsets overflow <%s> ", __func__); + abort(); + } + // reloc->address is really an offset from the start of the kext + *(kaslr_offsets + kaslr_offsets_index++) = reloc->address; + } dst->r_address = reloc->address; dst->r_pcrel = reloc->pcrel; dst->r_length = reloc->length; @@ -1072,6 +1136,15 @@ export_macho_for_array(const KXLDRelocator *relocator, /* Assume that everything will be slid together; otherwise, * there is no sensible value for the section number. */ + if (kaslr_offsets) { + if (kaslr_offsets_index >= kaslr_offsets_count) { + kxld_log(kKxldLogLinking, kKxldLogErr, + "kaslr_offsets overflow <%s> ", __func__); + abort(); + } + // reloc->address is really an offset from the start of the kext + *(kaslr_offsets + kaslr_offsets_index++) = reloc->address; + } dst->r_address = reloc->address; dst->r_pcrel = reloc->pcrel; dst->r_length = reloc->length; @@ -1372,7 +1445,7 @@ x86_64_process_reloc(const KXLDRelocator *relocator __unused, u_char *instructio #if !KERNEL if (swap) instr32 = OSSwapInt32(instr32); #endif - + *instr32p = instr32; } else { instr64p = (uint64_t *) ((void *) instruction); @@ -1411,7 +1484,6 @@ x86_64_process_reloc(const KXLDRelocator *relocator __unused, u_char *instructio #if !KERNEL if (swap) instr64 = OSSwapInt64(instr64); #endif - *instr64p = instr64; } @@ -1640,7 +1712,7 @@ arm64_process_reloc(const KXLDRelocator *relocator __unused, u_char *instruction #if !KERNEL if (swap) instr32 = OSSwapInt32(instr32); #endif - + *instr32p = instr32; } else { /* length == 3 */ uint64_t *instr64p = (uint64_t *) (void *) instruction; @@ -1663,7 +1735,7 @@ arm64_process_reloc(const KXLDRelocator *relocator __unused, u_char *instruction #if !KERNEL if (swap) instr64 = OSSwapInt64(instr64); #endif - + *instr64p = instr64; } @@ -1672,4 +1744,5 @@ arm64_process_reloc(const KXLDRelocator *relocator __unused, u_char *instruction 
return rval; } + #endif /* KXLD_USER_OR_ARM64 */ diff --git a/libkern/kxld/kxld_reloc.h b/libkern/kxld/kxld_reloc.h index 695e708fd..c95d679fb 100644 --- a/libkern/kxld/kxld_reloc.h +++ b/libkern/kxld/kxld_reloc.h @@ -160,7 +160,9 @@ kern_return_t kxld_relocator_process_sect_reloc(KXLDRelocator *relocator, __attribute__((nonnull,visibility("hidden"))); kern_return_t kxld_relocator_process_table_reloc(KXLDRelocator *relocator, - const KXLDReloc *reloc, const struct kxld_seg *seg, kxld_addr_t link_addr) + const KXLDReloc *reloc, + const struct kxld_seg *seg, + kxld_addr_t link_addr) __attribute__((nonnull,visibility("hidden"))); #endif /* _KXLD_RELOC_H */ diff --git a/libkern/kxld/kxld_sect.c b/libkern/kxld/kxld_sect.c index a89e3f693..4a41a6844 100644 --- a/libkern/kxld/kxld_sect.c +++ b/libkern/kxld/kxld_sect.c @@ -49,6 +49,7 @@ static kern_return_t sect_export_macho_header_32(const KXLDSect *sect, u_char *b static kern_return_t sect_export_macho_header_64(const KXLDSect *sect, u_char *buf, u_long *header_offset, u_long header_size, u_long data_offset); #endif +extern boolean_t isSplitKext; #if KXLD_USER_OR_ILP32 /******************************************************************************* @@ -75,7 +76,7 @@ kxld_sect_init_from_macho_32(KXLDSect *sect, u_char *macho, u_long *sect_offset, sect->align = src->align; sect->reserved1 = src->reserved1; sect->reserved2 = src->reserved2; - + if (src->offset) { sect->data = macho + src->offset; } else { @@ -84,7 +85,7 @@ kxld_sect_init_from_macho_32(KXLDSect *sect, u_char *macho, u_long *sect_offset, relocs = (struct relocation_info *) ((void *) (macho + src->reloff)); - rval = kxld_reloc_create_macho(§->relocs, relocator, + rval = kxld_reloc_create_macho(§->relocs, relocator, relocs, src->nreloc); require_noerr(rval, finish); @@ -132,8 +133,8 @@ kxld_sect_init_from_macho_64(KXLDSect *sect, u_char *macho, u_long *sect_offset, relocs = (struct relocation_info *) ((void *) (macho + src->reloff)); - rval = 
kxld_reloc_create_macho(§->relocs, relocator, - relocs, src->nreloc); + rval = kxld_reloc_create_macho(§->relocs, relocator, + relocs, src->nreloc); require_noerr(rval, finish); *sect_offset += sizeof(*src); @@ -356,7 +357,6 @@ kxld_sect_export_macho_to_file_buffer(const KXLDSect *sect, u_char *buf, *data_offset += (u_long) sect->size; } - rval = KERN_SUCCESS; finish: @@ -364,32 +364,37 @@ kxld_sect_export_macho_to_file_buffer(const KXLDSect *sect, u_char *buf, } /******************************************************************************* -*******************************************************************************/ + *******************************************************************************/ kern_return_t -kxld_sect_export_macho_to_vm(const KXLDSect *sect, u_char *buf, - u_long *header_offset, u_long header_size, - kxld_addr_t link_addr, u_long data_size, - boolean_t is_32_bit __unused) +kxld_sect_export_macho_to_vm(const KXLDSect *sect, + u_char *buf, + u_long *header_offset, + u_long header_size, + kxld_addr_t link_addr, + u_long data_size, + boolean_t is_32_bit __unused) { kern_return_t rval = KERN_FAILURE; - u_long data_offset = (u_long) (sect->link_addr - link_addr); - + u_long data_offset; + check(sect); check(buf); check(header_offset); - + + data_offset = (u_long) (sect->link_addr - link_addr); + KXLD_3264_FUNC(is_32_bit, rval, - sect_export_macho_header_32, sect_export_macho_header_64, - sect, buf, header_offset, header_size, data_offset); + sect_export_macho_header_32, sect_export_macho_header_64, + sect, buf, header_offset, header_size, data_offset); require_noerr(rval, finish); rval = export_macho(sect, buf, data_offset, data_size); require_noerr(rval, finish); - + rval = KERN_SUCCESS; - + finish: - return rval; + return rval; } /******************************************************************************* @@ -401,24 +406,33 @@ export_macho(const KXLDSect *sect, u_char *buf, u_long offset, u_long bufsize) check(sect); check(buf); - + if 
(!sect->data) { rval = KERN_SUCCESS; goto finish; } - /* Verify that the section is properly aligned */ - - require_action(kxld_sect_align_address(sect, offset) == offset, finish, - rval = KERN_FAILURE); + if (!isSplitKext) { + /* Verify that the section is properly aligned */ + if (kxld_sect_align_address(sect, offset) != offset) { + kxld_log(kKxldLogLinking, kKxldLogErr, kKxldLogMalformedMachO + "Alignment error: %llu != %lu for %s %s <%s>", + kxld_sect_align_address(sect, offset), offset, + sect->segname, sect->sectname, __func__); + goto finish; + } + } /* Verify that we have enough space to copy */ - - require_action(sect->size <= bufsize - offset, finish, - rval=KERN_FAILURE); + if (buf + offset + sect->size > buf + bufsize) { + kxld_log(kKxldLogLinking, kKxldLogErr, kKxldLogMalformedMachO + "Overflow: offset %lu + sect->size %llu > bufsize %lu for %s %s", + offset, sect->size, bufsize, + sect->segname, sect->sectname); + goto finish; + } /* Copy section data */ - switch (sect->flags & SECTION_TYPE) { case S_NON_LAZY_SYMBOL_POINTERS: case S_MOD_INIT_FUNC_POINTERS: @@ -431,6 +445,29 @@ export_macho(const KXLDSect *sect, u_char *buf, u_long offset, u_long bufsize) case S_COALESCED: case S_16BYTE_LITERALS: case S_SYMBOL_STUBS: +#if SPLIT_KEXTS_DEBUG + kxld_log(kKxldLogLinking, kKxldLogErr, + " sectname %s copy from %p (sect->data) for %llu bytes (sect->size) to %p (buf %p + offset %lu <%s>", + sect->sectname[0] ? sect->sectname : "none", + (void *) sect->data, + sect->size, + (void *) (buf + offset), + (void *) buf, + offset, + __func__); + + kxld_log(kKxldLogLinking, kKxldLogErr, + " %p >>> Start of %s section data (sect->size %llu) <%s>", + (void *) (buf + offset), + sect->sectname[0] ? sect->sectname : "none", + sect->size, + __func__); + kxld_log(kKxldLogLinking, kKxldLogErr, + " %p <<< End of %s section data <%s>", + (void *) (buf + offset + sect->size), + sect->sectname[0] ? 
sect->sectname : "none", + __func__); +#endif memcpy(buf + offset, sect->data, (size_t)sect->size); break; case S_ZEROFILL: /* sect->data should be NULL, so we'll never get here */ @@ -448,7 +485,7 @@ export_macho(const KXLDSect *sect, u_char *buf, u_long offset, u_long bufsize) rval = KERN_SUCCESS; finish: - return rval; + return rval; } #if KXLD_USER_OR_ILP32 @@ -484,6 +521,21 @@ sect_export_macho_header_32(const KXLDSect *sect, u_char *buf, secthdr->reserved1 = sect->reserved1; secthdr->reserved2 = sect->reserved2; +#if SPLIT_KEXTS_DEBUG + { + kxld_log(kKxldLogLinking, kKxldLogErr, + "sectname %s secthdr: %p addr %p size %02X %u offset %02X %u <%s>", + sect->sectname[0] ? sect->sectname : "none", + (void *) secthdr, + (void *) ((uint64_t)secthdr->addr), + secthdr->size, + secthdr->size, + secthdr->offset, + secthdr->offset, + __func__); + } +#endif + rval = KERN_SUCCESS; finish: @@ -505,6 +557,7 @@ sect_export_macho_header_64(const KXLDSect *sect, u_char *buf, check(buf); check(header_offset); + require_action(sizeof(*secthdr) <= header_size - *header_offset, finish, rval=KERN_FAILURE); secthdr = (struct section_64 *) ((void *) (buf + *header_offset)); @@ -524,6 +577,27 @@ sect_export_macho_header_64(const KXLDSect *sect, u_char *buf, secthdr->reserved1 = sect->reserved1; secthdr->reserved2 = sect->reserved2; +#if SPLIT_KEXTS_DEBUG + kxld_log(kKxldLogLinking, kKxldLogErr, + " %p >>> Start of %s secthdr (size %lu) <%s>", + (void *) secthdr, + sect->sectname[0] ? sect->sectname : "none", + sizeof(*secthdr), + __func__); + kxld_log(kKxldLogLinking, kKxldLogErr, + " %p <<< End of %s secthdr <%s>", + (void *) ((u_char *)secthdr + sizeof(*secthdr)), + sect->sectname[0] ? sect->sectname : "none", + __func__); + kxld_log(kKxldLogLinking, kKxldLogErr, + " secthdr: addr %p size %llu offset %u sectname %s <%s>", + (void *) secthdr->addr, + secthdr->size, + secthdr->offset, + sect->sectname[0] ? 
sect->sectname : "none", + __func__); +#endif + rval = KERN_SUCCESS; finish: @@ -551,8 +625,24 @@ kxld_sect_grow(KXLDSect *sect, kxld_size_t nbytes, u_int align) void kxld_sect_relocate(KXLDSect *sect, kxld_addr_t link_addr) { - sect->link_addr = kxld_sect_align_address(sect, - sect->link_addr + link_addr); +#if SPLIT_KEXTS_DEBUG + { + kxld_log(kKxldLogLinking, kKxldLogErr, + "%p >>> Start of %s section (sect->size %llu) <%s>", + (void *) (kxld_sect_align_address(sect, sect->link_addr + link_addr)), + sect->sectname[0] ? sect->sectname : "none", + sect->size, + __func__); + kxld_log(kKxldLogLinking, kKxldLogErr, + "%p <<< End of %s section <%s>", + (void *) (kxld_sect_align_address(sect, sect->link_addr + link_addr) + sect->size), + sect->sectname[0] ? sect->sectname : "none", + __func__); + } +#endif + + sect->link_addr = kxld_sect_align_address(sect, + sect->link_addr + link_addr); } #if KXLD_USER_OR_GOT diff --git a/libkern/kxld/kxld_sect.h b/libkern/kxld/kxld_sect.h index 96d0b1b35..f3bbdae93 100644 --- a/libkern/kxld/kxld_sect.h +++ b/libkern/kxld/kxld_sect.h @@ -144,11 +144,13 @@ kern_return_t kxld_sect_export_macho_to_file_buffer(const KXLDSect *sect, u_char u_long data_size, boolean_t is_32_bit) __attribute__((nonnull, visibility("hidden"))); -kern_return_t kxld_sect_export_macho_to_vm(const KXLDSect *sect, u_char *buf, - u_long *header_offset, u_long header_size, - kxld_addr_t link_addr, u_long data_size, - boolean_t is_32_bit) - __attribute__((nonnull, visibility("hidden"))); +kern_return_t kxld_sect_export_macho_to_vm(const KXLDSect *sect, u_char *buf, + u_long *header_offset, + u_long header_size, + kxld_addr_t link_addr, + u_long data_size, + boolean_t is_32_bit) +__attribute__((nonnull, visibility("hidden"))); /******************************************************************************* * Mutators diff --git a/libkern/kxld/kxld_seg.c b/libkern/kxld/kxld_seg.c index 00ef81333..f512c2b90 100644 --- a/libkern/kxld/kxld_seg.c +++ 
b/libkern/kxld/kxld_seg.c @@ -50,6 +50,9 @@ #define TEXT_SEG_PROT (VM_PROT_READ | VM_PROT_EXECUTE) #define DATA_SEG_PROT (VM_PROT_READ | VM_PROT_WRITE) +extern boolean_t isSplitKext; +extern boolean_t isOldInterface; + #if KXLD_USER_OR_OBJECT static kern_return_t reorder_sections(KXLDSeg *seg, KXLDArray *section_order); static void reorder_section(KXLDArray *sects, u_int *sect_reorder_index, @@ -114,11 +117,12 @@ kxld_seg_init_from_macho_64(KXLDSeg *seg, struct segment_command_64 *src) seg->base_addr = src->vmaddr; seg->link_addr = src->vmaddr; seg->vmsize = src->vmsize; + seg->fileoff = src->fileoff; seg->maxprot = src->maxprot; seg->initprot = src->initprot; seg->flags = src->flags; - + rval = kxld_array_init(&seg->sects, sizeof(KXLDSect *), src->nsects); require_noerr(rval, finish); @@ -469,7 +473,7 @@ kxld_size_t kxld_seg_get_vmsize(const KXLDSeg *seg) { check(seg); - + return seg->vmsize; } @@ -581,28 +585,38 @@ kxld_seg_export_macho_to_file_buffer(const KXLDSeg *seg, u_char *buf, } + /******************************************************************************* *******************************************************************************/ kern_return_t -kxld_seg_export_macho_to_vm(const KXLDSeg *seg, u_char *buf, - u_long *header_offset, u_long header_size, - u_long data_size, kxld_addr_t file_link_addr, - boolean_t is_32_bit) +kxld_seg_export_macho_to_vm(const KXLDSeg *seg, + u_char *buf, + u_long *header_offset, + u_long header_size, + u_long data_size, + kxld_addr_t file_link_addr, + boolean_t is_32_bit) { - kern_return_t rval = KERN_FAILURE; - KXLDSect *sect = NULL; - u_long data_offset = (u_long) (seg->link_addr - file_link_addr); - u_int i = 0; + kern_return_t rval = KERN_FAILURE; + KXLDSect * sect = NULL; + + // data_offset is used to set fileoff field in segment header + u_long data_offset; + u_int i = 0; check(seg); check(buf); check(header_offset); + + data_offset = (u_long) (seg->link_addr - file_link_addr); /* Write out the header */ - 
KXLD_3264_FUNC(is_32_bit, rval, - seg_export_macho_header_32, seg_export_macho_header_64, - seg, buf, header_offset, header_size, data_offset); + KXLD_3264_FUNC(is_32_bit, rval, + seg_export_macho_header_32, seg_export_macho_header_64, + seg, + buf, + header_offset, header_size, data_offset); require_noerr(rval, finish); /* Write out each section */ @@ -610,9 +624,9 @@ kxld_seg_export_macho_to_vm(const KXLDSeg *seg, u_char *buf, for (i = 0; i < seg->sects.nitems; ++i) { sect = get_sect_by_index(seg, i); - rval = kxld_sect_export_macho_to_vm(sect, buf, header_offset, - header_size, file_link_addr, data_size, is_32_bit); - require_noerr(rval, finish); + rval = kxld_sect_export_macho_to_vm(sect, buf, header_offset, + header_size, file_link_addr, data_size, is_32_bit); + require_noerr(rval, finish); } rval = KERN_SUCCESS; @@ -654,6 +668,21 @@ seg_export_macho_header_32(const KXLDSeg *seg, u_char *buf, seghdr->nsects = seg->sects.nitems; seghdr->flags = 0; +#if SPLIT_KEXTS_DEBUG + { + kxld_log(kKxldLogLinking, kKxldLogErr, + "segname %s seghdr %p vmaddr %p vmsize 0x%02X %u fileoff 0x%02X %u <%s>", + seg->segname[0] ? 
seg->segname : "none", + (void *) seghdr, + (void *) ((uint64_t)seghdr->vmaddr), + seghdr->vmsize, + seghdr->vmsize, + seghdr->fileoff, + seghdr->fileoff, + __func__); + } +#endif + rval = KERN_SUCCESS; finish: @@ -677,6 +706,22 @@ seg_export_macho_header_64(const KXLDSeg *seg, u_char *buf, require_action(sizeof(*seghdr) <= header_size - *header_offset, finish, rval=KERN_FAILURE); + +#if SPLIT_KEXTS_DEBUG + { + struct mach_header_64 *mach; + + mach = (struct mach_header_64 *) ((void *) buf); + + if (mach->magic != MH_MAGIC_64) { + kxld_log(kKxldLogLinking, kKxldLogErr, + "bad macho header at %p <%s>", + (void *) mach, __func__); + goto finish; + } + } +#endif + seghdr = (struct segment_command_64 *) ((void *) (buf + *header_offset)); *header_offset += sizeof(*seghdr); @@ -694,6 +739,33 @@ seg_export_macho_header_64(const KXLDSeg *seg, u_char *buf, seghdr->nsects = seg->sects.nitems; seghdr->flags = 0; +#if SPLIT_KEXTS_DEBUG + { + kxld_log(kKxldLogLinking, kKxldLogErr, + "%p >>> Start of %s seghdr (size %lu) <%s>", + (void *) seghdr, + seg->segname[0] ? seg->segname : "none", + sizeof(*seghdr), + __func__); + kxld_log(kKxldLogLinking, kKxldLogErr, + "%p <<< End of %s seghdr <%s>", + (void *) ((u_char *)seghdr + sizeof(*seghdr)), + seg->segname[0] ? seg->segname : "none", + __func__); + + kxld_log(kKxldLogLinking, kKxldLogErr, + "%s seghdr, cmdsize %d vmaddr %p vmsize %p %llu fileoff %p %llu <%s>", + seg->segname[0] ? 
seg->segname : "none", + seghdr->cmdsize, + (void *) seghdr->vmaddr, + (void *) seghdr->vmsize, + seghdr->vmsize, + (void *) seghdr->fileoff, + seghdr->fileoff, + __func__); + } +#endif + rval = KERN_SUCCESS; finish: @@ -760,9 +832,9 @@ kxld_seg_finish_init(KXLDSeg *seg) maxsize = sect->size; } } - - seg->vmsize = kxld_round_page_cross_safe(maxaddr + + seg->vmsize = kxld_round_page_cross_safe(maxaddr + maxsize - seg->base_addr); + } rval = KERN_SUCCESS; @@ -777,7 +849,7 @@ void kxld_seg_set_vm_protections(KXLDSeg *seg, boolean_t strict_protections) { if (strict_protections) { - if (!strncmp(seg->segname, SEG_TEXT, const_strlen(SEG_TEXT))) { + if (!strncmp(seg->segname, SEG_TEXT, sizeof(seg->segname))) { seg->initprot = TEXT_SEG_PROT; seg->maxprot = TEXT_SEG_PROT; } else { @@ -797,11 +869,91 @@ kxld_seg_relocate(KXLDSeg *seg, kxld_addr_t link_addr) { KXLDSect *sect = NULL; u_int i = 0; - - seg->link_addr += link_addr; + splitKextLinkInfo * link_info = (splitKextLinkInfo *) link_addr; + kxld_addr_t my_link_addr; + + if (isOldInterface) { + seg->link_addr += link_addr; + } + else { + if (isSplitKext) { + // we have a split kext + if (kxld_seg_is_text_seg(seg)) { + // assumes this is the beginning of the kext + my_link_addr = link_info->vmaddr_TEXT; + seg->link_addr = my_link_addr; + } + else if (kxld_seg_is_text_exec_seg(seg)) { + my_link_addr = link_info->vmaddr_TEXT_EXEC; + seg->link_addr = my_link_addr; + // vmaddr_TEXT_EXEC is the actual vmaddr for this segment so we need + // to adjust for kxld_sect_relocate assuming the link addr is + // the address of the kext (macho header in __TEXT) + my_link_addr -= seg->base_addr; + } + else if (kxld_seg_is_data_seg(seg)) { + my_link_addr = link_info->vmaddr_DATA; + seg->link_addr = my_link_addr; + // vmaddr_DATA is the actual vmaddr for this segment so we need + // to adjust for kxld_sect_relocate assuming the link addr is + // the address of the kext (macho header in __TEXT) + my_link_addr -= seg->base_addr; + } + else if 
(kxld_seg_is_data_const_seg(seg)) { + my_link_addr = link_info->vmaddr_DATA_CONST; + seg->link_addr = my_link_addr; + // vmaddr_DATA_CONST is the actual vmaddr for this segment so we need + // to adjust for kxld_sect_relocate assuming the link addr is + // the address of the kext (macho header in __TEXT) + my_link_addr -= seg->base_addr; + } + else if (kxld_seg_is_linkedit_seg(seg)) { + my_link_addr = link_info->vmaddr_LINKEDIT; + seg->link_addr = my_link_addr; + // vmaddr_DATA is the actual vmaddr for this segment so we need + // to adjust for kxld_sect_relocate assuming the link addr is + // the address of the kext (macho header in __TEXT) + my_link_addr -= seg->base_addr; + } + else { + kxld_log(kKxldLogLinking, kKxldLogErr, + " not expecting this segment %s!!! <%s>", + seg->segname[0] ? seg->segname : "none", + __func__); + my_link_addr = link_info->vmaddr_TEXT; + seg->link_addr += my_link_addr; + } + } + else { + my_link_addr = link_info->vmaddr_TEXT; + seg->link_addr += my_link_addr; + } + } + +#if SPLIT_KEXTS_DEBUG + { + kxld_log(kKxldLogLinking, kKxldLogErr, + "%p >>> Start of %s segment (vmsize %llu) <%s>)", + (void *) seg->link_addr, + seg->segname[0] ? seg->segname : "none", + seg->vmsize, + __func__); + kxld_log(kKxldLogLinking, kKxldLogErr, + "%p <<< End of %s segment <%s>", + (void *) (seg->link_addr + seg->vmsize), + seg->segname[0] ? 
seg->segname : "none", + __func__); + } +#endif + for (i = 0; i < seg->sects.nitems; ++i) { sect = get_sect_by_index(seg, i); - kxld_sect_relocate(sect, link_addr); + if (isOldInterface) { + kxld_sect_relocate(sect, link_addr); + } + else { + kxld_sect_relocate(sect, my_link_addr); + } } } @@ -814,7 +966,8 @@ kxld_seg_populate_linkedit(KXLDSeg *seg, const KXLDSymtab *symtab, boolean_t is_ , const KXLDArray *extrelocs , boolean_t target_supports_slideable_kexts #endif /* KXLD_PIC_KEXTS */ - ) + , uint32_t splitinfolc_size + ) { u_long size = 0; @@ -826,6 +979,82 @@ kxld_seg_populate_linkedit(KXLDSeg *seg, const KXLDSymtab *symtab, boolean_t is_ } #endif /* KXLD_PIC_KEXTS */ + // 0 unless this is a split kext + size += splitinfolc_size; + seg->vmsize = kxld_round_page_cross_safe(size); } +/******************************************************************************* + *******************************************************************************/ +boolean_t +kxld_seg_is_split_seg(const KXLDSeg *seg) +{ + boolean_t result = FALSE; + + check(seg); + if (isSplitKext) { + if (kxld_seg_is_data_seg(seg) || kxld_seg_is_linkedit_seg(seg) || + kxld_seg_is_text_exec_seg(seg) || kxld_seg_is_data_const_seg(seg)) { + result = TRUE; + } + } + + return result; +} + +boolean_t +kxld_seg_is_text_seg(const KXLDSeg *seg) +{ + boolean_t result = FALSE; + + check(seg); + result = !strncmp(seg->segname, SEG_TEXT, sizeof(seg->segname)); + + return result; +} + +boolean_t +kxld_seg_is_text_exec_seg(const KXLDSeg *seg) +{ + boolean_t result = FALSE; + + check(seg); + result = !strncmp(seg->segname, "__TEXT_EXEC", sizeof(seg->segname)); + + return result; +} + +boolean_t +kxld_seg_is_data_seg(const KXLDSeg *seg) +{ + boolean_t result = FALSE; + + check(seg); + result = !strncmp(seg->segname, SEG_DATA, sizeof(seg->segname)); + + return result; +} + +boolean_t +kxld_seg_is_data_const_seg(const KXLDSeg *seg) +{ + boolean_t result = FALSE; + + check(seg); + result = !strncmp(seg->segname, 
"__DATA_CONST", sizeof(seg->segname)); + + return result; +} + +boolean_t +kxld_seg_is_linkedit_seg(const KXLDSeg *seg) +{ + boolean_t result = FALSE; + + check(seg); + result = !strncmp(seg->segname, SEG_LINKEDIT, sizeof(seg->segname)); + + return result; +} + diff --git a/libkern/kxld/kxld_seg.h b/libkern/kxld/kxld_seg.h index 1d863bf02..5eb9f98fc 100644 --- a/libkern/kxld/kxld_seg.h +++ b/libkern/kxld/kxld_seg.h @@ -114,11 +114,14 @@ kxld_seg_export_macho_to_file_buffer(const KXLDSeg *seg, u_char *buf, __attribute__((nonnull, visibility("hidden"))); kern_return_t -kxld_seg_export_macho_to_vm(const KXLDSeg *seg, u_char *buf, - u_long *header_offset, u_long header_size, - u_long data_size, kxld_addr_t file_link_addr, - boolean_t is_32_bit) - __attribute__((nonnull, visibility("hidden"))); +kxld_seg_export_macho_to_vm(const KXLDSeg *seg, + u_char *buf, + u_long *header_offset, + u_long header_size, + u_long data_size, + kxld_addr_t file_link_addr, + boolean_t is_32_bit) +__attribute__((nonnull, visibility("hidden"))); /******************************************************************************* * Modifiers @@ -135,7 +138,7 @@ void kxld_seg_set_vm_protections(KXLDSeg *seg, boolean_t strict_protections) __attribute__((nonnull, visibility("hidden"))); void kxld_seg_relocate(KXLDSeg *seg, kxld_addr_t link_addr) - __attribute__((nonnull, visibility("hidden"))); +__attribute__((nonnull, visibility("hidden"))); void kxld_seg_populate_linkedit(KXLDSeg *seg, const struct kxld_symtab *symtab, boolean_t is_32_bit @@ -144,8 +147,28 @@ void kxld_seg_populate_linkedit(KXLDSeg *seg, const struct kxld_symtab *symtab, , const struct kxld_array *extrelocs , boolean_t target_supports_slideable_kexts #endif /* KXLD_PIC_KEXTS */ + , uint32_t splitinfolc_size ) __attribute__((nonnull, visibility("hidden"))); +boolean_t kxld_seg_is_split_seg(const KXLDSeg *seg) +__attribute__((pure, nonnull, visibility("hidden"))); + +boolean_t kxld_seg_is_text_seg(const KXLDSeg *seg) 
+__attribute__((pure, nonnull, visibility("hidden"))); + +boolean_t kxld_seg_is_text_exec_seg(const KXLDSeg *seg) +__attribute__((pure, nonnull, visibility("hidden"))); + +boolean_t kxld_seg_is_data_seg(const KXLDSeg *seg) +__attribute__((pure, nonnull, visibility("hidden"))); + +boolean_t kxld_seg_is_data_const_seg(const KXLDSeg *seg) +__attribute__((pure, nonnull, visibility("hidden"))); + +boolean_t kxld_seg_is_linkedit_seg(const KXLDSeg *seg) +__attribute__((pure, nonnull, visibility("hidden"))); + + #endif /* _KXLD_SEG_H_ */ diff --git a/libkern/kxld/kxld_splitinfolc.c b/libkern/kxld/kxld_splitinfolc.c new file mode 100644 index 000000000..dd3fde261 --- /dev/null +++ b/libkern/kxld/kxld_splitinfolc.c @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2016 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
+ * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#include +#include +#include + +#define DEBUG_ASSERT_COMPONENT_NAME_STRING "kxld" +#include + +#include "kxld_util.h" +#include "kxld_splitinfolc.h" + +/******************************************************************************* + *******************************************************************************/ +void +kxld_splitinfolc_init_from_macho(KXLDsplitinfolc *splitinfolc, struct linkedit_data_command *src) +{ + check(splitinfolc); + check(src); + + splitinfolc->cmdsize = src->cmdsize; + splitinfolc->dataoff = src->dataoff; + splitinfolc->datasize = src->datasize; + splitinfolc->has_splitinfolc = TRUE; +} + +/******************************************************************************* + *******************************************************************************/ +void +kxld_splitinfolc_clear(KXLDsplitinfolc *splitinfolc) +{ + bzero(splitinfolc, sizeof(*splitinfolc)); +} + +/******************************************************************************* + *******************************************************************************/ +u_long +kxld_splitinfolc_get_macho_header_size(void) +{ + return sizeof(struct linkedit_data_command); +} + +/******************************************************************************* + *******************************************************************************/ +kern_return_t +kxld_splitinfolc_export_macho(const KXLDsplitinfolc *splitinfolc, + splitKextLinkInfo *linked_object, + u_long *header_offset, + u_long header_size, + u_long *data_offset, + u_long size) +{ + kern_return_t rval = KERN_FAILURE; + struct linkedit_data_command *splitinfolc_hdr = NULL; + u_char * buf; + + check(splitinfolc); + check(linked_object); + check(header_offset); + check(data_offset); + + buf = (u_char *)(linked_object->linkedKext); + 
require_action(sizeof(*splitinfolc_hdr) <= header_size - *header_offset, + finish, + rval=KERN_FAILURE); + splitinfolc_hdr = (struct linkedit_data_command *)((void *)(buf + *header_offset)); + *header_offset += sizeof(*splitinfolc_hdr); + + if (buf + *data_offset > buf + size) { + kxld_log(kKxldLogLinking, kKxldLogErr, + "\n OVERFLOW! linkedKext %p to %p (%lu) copy %p to %p (%u) <%s>", + (void *) buf, + (void *) (buf + size), + size, + (void *) (buf + *data_offset), + (void *) (buf + *data_offset + splitinfolc->datasize), + splitinfolc->datasize, + __func__); + goto finish; + } + + // copy in the split info reloc data from kextExecutable. For example dataoff + // in LC_SEGMENT_SPLIT_INFO load command points to the reloc data in the + // __LINKEDIT segment. In this case 65768 into the kextExecutable file is + // the split seg reloc info (for 920 bytes) +// Load command 9 +// cmd LC_SEGMENT_SPLIT_INFO +// cmdsize 16 +// dataoff 65768 +// datasize 920 + + + memcpy(buf + *data_offset, linked_object->kextExecutable + splitinfolc->dataoff, splitinfolc->datasize); + +#if SPLIT_KEXTS_DEBUG + u_char *dataPtr = buf + *data_offset; + + kxld_log(kKxldLogLinking, kKxldLogErr, + "\n\n linkedKext %p to %p (%lu) copy %p to %p (%u) <%s>", + (void *) buf, + (void *) (buf + size), + size, + (void *) (dataPtr), + (void *) (dataPtr + splitinfolc->datasize), + splitinfolc->datasize, + __func__); + + if (*(dataPtr + 0) != 0x7F) { + kxld_log(kKxldLogLinking, kKxldLogErr, + "\n\n bad LC_SEGMENT_SPLIT_INFO: 0x%02X %02X %02X %02X %02X %02X %02X %02X at %p (buf %p + %lu) <%s>", + *(dataPtr +0), + *(dataPtr +1), + *(dataPtr +2), + *(dataPtr +3), + *(dataPtr +4), + *(dataPtr +5), + *(dataPtr +6), + *(dataPtr +7), + (void *) dataPtr, + (void *) buf, + *data_offset, __func__); + } +#endif + + // update the load command header + splitinfolc_hdr->cmd = LC_SEGMENT_SPLIT_INFO; + splitinfolc_hdr->cmdsize = (uint32_t) sizeof(*splitinfolc_hdr); + splitinfolc_hdr->dataoff = (uint32_t)(*data_offset); + 
splitinfolc_hdr->datasize = splitinfolc->datasize; + + *data_offset += splitinfolc->datasize; + + rval = KERN_SUCCESS; + +finish: + return rval; +} + diff --git a/libkern/kxld/kxld_splitinfolc.h b/libkern/kxld/kxld_splitinfolc.h new file mode 100644 index 000000000..61bfde198 --- /dev/null +++ b/libkern/kxld/kxld_splitinfolc.h @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2016 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#ifndef _KXLD_SPLITINFOLC_H_ +#define _KXLD_SPLITINFOLC_H_ + +#include +#if KERNEL +#include +#else +#include "kxld_types.h" +#endif + +struct linkedit_data_command; +typedef struct kxld_splitinfolc KXLDsplitinfolc; + +struct kxld_splitinfolc { + uint32_t cmdsize; + uint32_t dataoff; + uint32_t datasize; + boolean_t has_splitinfolc; +}; + +/******************************************************************************* + * Constructors and destructors + *******************************************************************************/ + +void kxld_splitinfolc_init_from_macho(KXLDsplitinfolc *splitinfolc, struct linkedit_data_command *src) +__attribute__((nonnull, visibility("hidden"))); + +void kxld_splitinfolc_clear(KXLDsplitinfolc *splitinfolc) +__attribute__((nonnull, visibility("hidden"))); + +/******************************************************************************* + * Accessors + *******************************************************************************/ + +u_long kxld_splitinfolc_get_macho_header_size(void) +__attribute__((pure, visibility("hidden"))); + +kern_return_t +kxld_splitinfolc_export_macho(const KXLDsplitinfolc *splitinfolc, + splitKextLinkInfo *linked_object, + u_long *header_offset, + u_long header_size, + u_long *data_offset, + u_long size) +__attribute__((pure, nonnull, visibility("hidden"))); + +#endif /* _KXLD_SPLITINFOLC_H_ */ diff --git a/libkern/kxld/kxld_srcversion.c b/libkern/kxld/kxld_srcversion.c index c6d4462d8..cd8adb871 100644 --- a/libkern/kxld/kxld_srcversion.c +++ b/libkern/kxld/kxld_srcversion.c @@ -88,6 +88,6 @@ kxld_srcversion_export_macho(const KXLDsrcversion *srcversion, u_char *buf, rval = KERN_SUCCESS; finish: - return rval; + return rval; } diff --git a/libkern/kxld/kxld_sym.c b/libkern/kxld/kxld_sym.c index 252d39e3b..2da6477ac 100644 --- a/libkern/kxld/kxld_sym.c +++ b/libkern/kxld/kxld_sym.c @@ -122,7 +122,7 @@ kxld_sym_init_from_macho64(KXLDSym *sym, char 
*strtab, const struct nlist_64 *sr if (kxld_sym_is_indirect(sym)) { sym->alias = strtab + src->n_value; } - + rval = KERN_SUCCESS; finish: @@ -147,6 +147,7 @@ kxld_sym_init_absolute(KXLDSym *sym, char *name, kxld_addr_t link_addr) init_predicates(sym, N_ABS | N_EXT, 0); sym->is_resolved = TRUE; + } /******************************************************************************* @@ -920,9 +921,11 @@ kxld_sym_export_macho_64(const KXLDSym *sym, u_char *_nl, char *strtab, } str = (char *) (strtab + *stroff); + strlcpy(str, sym->name, strsize - *stroff); *stroff += bytes; + rval = KERN_SUCCESS; finish: @@ -1032,4 +1035,3 @@ kxld_sym_mark_private(KXLDSym *sym) sym->type |= N_PEXT; sym->is_external = FALSE; } - diff --git a/libkern/kxld/kxld_symtab.c b/libkern/kxld/kxld_symtab.c index c5ce51740..53bb6762b 100644 --- a/libkern/kxld/kxld_symtab.c +++ b/libkern/kxld/kxld_symtab.c @@ -249,7 +249,7 @@ restrict_private_symbols(KXLDSymtab *symtab) KXLDSym *sym = NULL; const char *name = NULL; u_int i = 0; - + kxld_symtab_iterator_init(&iter, symtab, kxld_sym_is_exported, FALSE); while ((sym = kxld_symtab_iterator_get_next(&iter))) { for (i = 0; i < const_array_len(private_symbols); ++i) { @@ -448,18 +448,20 @@ kxld_symtab_export_macho(const KXLDSymtab *symtab, u_char *buf, check(header_offset); check(data_offset); - require_action(sizeof(*symtabhdr) <= header_size - *header_offset, + require_action(sizeof(*symtabhdr) <= header_size - *header_offset, finish, rval=KERN_FAILURE); symtabhdr = (struct symtab_command *) ((void *) (buf + *header_offset)); *header_offset += sizeof(*symtabhdr); /* Initialize the symbol table header */ + // note - this assumes LC_SYMTAB is always before the LC_DYSYMTAB in the + // macho header we are processing. 
symtabhdr->cmd = LC_SYMTAB; symtabhdr->cmdsize = (uint32_t) sizeof(*symtabhdr); symtabhdr->symoff = (uint32_t) *data_offset; symtabhdr->strsize = 1; /* strtab start padding */ - + /* Find the size of the symbol and string tables */ kxld_symtab_iterator_init(&iter, symtab, @@ -482,7 +484,6 @@ kxld_symtab_export_macho(const KXLDSymtab *symtab, u_char *buf, rval=KERN_FAILURE); /* Get pointers to the symbol and string tables */ - nl = buf + symtabhdr->symoff; strtab = (char *) (buf + symtabhdr->stroff); @@ -490,7 +491,6 @@ kxld_symtab_export_macho(const KXLDSymtab *symtab, u_char *buf, kxld_symtab_iterator_reset(&iter); while ((sym = kxld_symtab_iterator_get_next(&iter))) { - KXLD_3264_FUNC(is_32_bit, rval, kxld_sym_export_macho_32, kxld_sym_export_macho_64, sym, nl, strtab, &stroff, symtabhdr->strsize); @@ -504,11 +504,35 @@ kxld_symtab_export_macho(const KXLDSymtab *symtab, u_char *buf, *data_offset += (symtabhdr->nsyms * nlistsize) + stroff; *data_offset = (*data_offset + 7) & ~7; + // at this point data_offset will be the offset just past the + // symbols and strings in the __LINKEDIT data + + +#if SPLIT_KEXTS_DEBUG + { + kxld_log(kKxldLogLinking, kKxldLogErr, + " %p to %p (size %lu) symtabhdr <%s>", + (void *) symtabhdr, + (void *) ((u_char *)symtabhdr + sizeof(*symtabhdr)), + sizeof(*symtabhdr), + __func__); + + kxld_log(kKxldLogLinking, kKxldLogErr, + " symtabhdr %p cmdsize %u symoff %u nsyms %u stroff %u strsize %u <%s>", + (void *) symtabhdr, + symtabhdr->cmdsize, + symtabhdr->symoff, + symtabhdr->nsyms, + symtabhdr->stroff, + symtabhdr->strsize, + __func__); + } +#endif rval = KERN_SUCCESS; finish: - return rval; + return rval; } /******************************************************************************* @@ -559,7 +583,8 @@ kxld_symtab_index_cxx_symbols_by_value(KXLDSymtab *symtab) rval = kxld_dict_insert(&symtab->cxx_index, &sym->base_addr, sym); require_noerr(rval, finish); } - + + symtab->cxx_index_initialized = TRUE; rval = KERN_SUCCESS; finish: @@ 
-713,4 +738,3 @@ kxld_symtab_iterator_reset(KXLDSymtabIterator *iter) check(iter); iter->idx = 0; } - diff --git a/libkern/kxld/kxld_util.c b/libkern/kxld/kxld_util.c index af9f16e4e..e6d56c2ff 100644 --- a/libkern/kxld/kxld_util.c +++ b/libkern/kxld/kxld_util.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2007-2008 Apple Inc. All rights reserved. + * Copyright (c) 2007-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ #include @@ -769,8 +769,7 @@ kxld_is_32_bit(cpu_type_t cputype) * Find the first occurrence of find in s. 
*******************************************************************************/ const char * -kxld_strstr(s, find) - const char *s, *find; +kxld_strstr(const char *s, const char *find) { #if KERNEL char c, sc; @@ -860,3 +859,36 @@ kxld_addr_t kxld_round_page_cross_safe(kxld_addr_t offset) } #endif /* KERNEL */ } + +#if SPLIT_KEXTS_DEBUG + +void kxld_show_split_info(splitKextLinkInfo *info) +{ + kxld_log(kKxldLogLinking, kKxldLogErr, + "splitKextLinkInfo: \n" + "kextExecutable %p to %p kextSize %lu \n" + "linkedKext %p to %p linkedKextSize %lu \n" + "vmaddr_TEXT %p vmaddr_TEXT_EXEC %p " + "vmaddr_DATA %p vmaddr_DATA_CONST %p vmaddr_LINKEDIT %p", + (void *) info->kextExecutable, + (void *) (info->kextExecutable + info->kextSize), + info->kextSize, + (void*) info->linkedKext, + (void*) (info->linkedKext + info->linkedKextSize), + info->linkedKextSize, + (void *) info->vmaddr_TEXT, + (void *) info->vmaddr_TEXT_EXEC, + (void *) info->vmaddr_DATA, + (void *) info->vmaddr_DATA_CONST, + (void *) info->vmaddr_LINKEDIT); +} + +boolean_t isTargetKextName(const char * the_name) +{ + if (the_name && 0 == strcmp(the_name, KXLD_TARGET_KEXT)) { + return(TRUE); + } + return(FALSE); +} +#endif + diff --git a/libkern/kxld/kxld_util.h b/libkern/kxld/kxld_util.h index f20bc18e2..5b55b4d8b 100644 --- a/libkern/kxld/kxld_util.h +++ b/libkern/kxld/kxld_util.h @@ -214,5 +214,8 @@ boolean_t kxld_set_cross_link_page_size(kxld_size_t target_page_size); kxld_size_t kxld_get_effective_page_size(void); kxld_addr_t kxld_round_page_cross_safe(kxld_addr_t addr); +#if SPLIT_KEXTS_DEBUG +void kxld_show_split_info(splitKextLinkInfo *info); +#endif #endif /* _KXLD_UTIL_H_ */ diff --git a/libkern/kxld/kxld_uuid.c b/libkern/kxld/kxld_uuid.c index 66f32a0fa..0cbfcf24b 100644 --- a/libkern/kxld/kxld_uuid.c +++ b/libkern/kxld/kxld_uuid.c @@ -88,6 +88,6 @@ kxld_uuid_export_macho(const KXLDuuid *uuid, u_char *buf, rval = KERN_SUCCESS; finish: - return rval; + return rval; } diff --git 
a/libkern/kxld/kxld_versionmin.c b/libkern/kxld/kxld_versionmin.c index e422495e5..abbfaed6a 100644 --- a/libkern/kxld/kxld_versionmin.c +++ b/libkern/kxld/kxld_versionmin.c @@ -42,7 +42,7 @@ kxld_versionmin_init_from_macho(KXLDversionmin *versionmin, struct version_min_c { check(versionmin); check(src); - check((src->cmd == LC_VERSION_MIN_MACOSX) || (src->cmd == LC_VERSION_MIN_IPHONEOS) || (src->cmd == LC_VERSION_MIN_WATCHOS)); + check((src->cmd == LC_VERSION_MIN_MACOSX) || (src->cmd == LC_VERSION_MIN_IPHONEOS) || (src->cmd == LC_VERSION_MIN_TVOS) || (src->cmd == LC_VERSION_MIN_WATCHOS)); switch (src->cmd) { case LC_VERSION_MIN_MACOSX: @@ -51,6 +51,9 @@ kxld_versionmin_init_from_macho(KXLDversionmin *versionmin, struct version_min_c case LC_VERSION_MIN_IPHONEOS: versionmin->platform = kKxldVersionMiniPhoneOS; break; + case LC_VERSION_MIN_TVOS: + versionmin->platform = kKxldVersionMinAppleTVOS; + break; case LC_VERSION_MIN_WATCHOS: versionmin->platform = kKxldVersionMinWatchOS; break; @@ -102,6 +105,9 @@ kxld_versionmin_export_macho(const KXLDversionmin *versionmin, u_char *buf, case kKxldVersionMiniPhoneOS: versionminhdr->cmd = LC_VERSION_MIN_IPHONEOS; break; + case kKxldVersionMinAppleTVOS: + versionminhdr->cmd = LC_VERSION_MIN_TVOS; + break; case kKxldVersionMinWatchOS: versionminhdr->cmd = LC_VERSION_MIN_WATCHOS; break; @@ -113,6 +119,6 @@ kxld_versionmin_export_macho(const KXLDversionmin *versionmin, u_char *buf, rval = KERN_SUCCESS; finish: - return rval; + return rval; } diff --git a/libkern/kxld/kxld_versionmin.h b/libkern/kxld/kxld_versionmin.h index d4ce76b21..ff9c02124 100644 --- a/libkern/kxld/kxld_versionmin.h +++ b/libkern/kxld/kxld_versionmin.h @@ -41,6 +41,7 @@ typedef struct kxld_versionmin KXLDversionmin; enum kxld_versionmin_platforms { kKxldVersionMinMacOSX, kKxldVersionMiniPhoneOS, + kKxldVersionMinAppleTVOS, kKxldVersionMinWatchOS }; diff --git a/libkern/kxld/kxld_vtable.c b/libkern/kxld/kxld_vtable.c index 24408145b..940814ffd 100644 --- 
a/libkern/kxld/kxld_vtable.c +++ b/libkern/kxld/kxld_vtable.c @@ -108,7 +108,8 @@ kxld_vtable_init(KXLDVTable *vtable, const KXLDSym *vtable_sym, } else { if (kxld_object_is_final_image(object)) { extrelocs = kxld_object_get_extrelocs(object); - require_action(extrelocs, finish, + + require_action(extrelocs, finish, rval=KERN_FAILURE; kxld_log(kKxldLogPatching, kKxldLogErr, kKxldLogMalformedVTable, @@ -119,6 +120,7 @@ kxld_vtable_init(KXLDVTable *vtable, const KXLDSym *vtable_sym, relocator, extrelocs, defined_cxx_symbols); require_noerr(rval, finish); } else { + require_action(kxld_sect_get_num_relocs(vtable_sect) > 0, finish, rval=KERN_FAILURE; kxld_log(kKxldLogPatching, kKxldLogErr, @@ -135,6 +137,7 @@ kxld_vtable_init(KXLDVTable *vtable, const KXLDSym *vtable_sym, rval = KERN_SUCCESS; finish: + if (demangled_name) kxld_free(demangled_name, demangled_length); return rval; @@ -399,6 +402,7 @@ init_by_entries_and_relocs(KXLDVTable *vtable, const KXLDSym *vtable_sym, } else { reloc = kxld_reloc_get_reloc_by_offset(relocs, vtable_sym->base_addr + entry_offset); + require_action(reloc, finish, rval=KERN_FAILURE; kxld_log(kKxldLogPatching, kKxldLogErr, @@ -621,6 +625,7 @@ kxld_vtable_patch(KXLDVTable *vtable, const KXLDVTable *super_vtable, rval = kxld_reloc_update_symindex(child_entry->unpatched.reloc, symindex); require_noerr(rval, finish); + kxld_log(kKxldLogPatching, kKxldLogDetail, "In vtable '%s', patching '%s' with '%s'.", kxld_demangle(vtable->name, &demangled_name1, &demangled_length1), @@ -671,7 +676,7 @@ kxld_vtable_patch(KXLDVTable *vtable, const KXLDVTable *super_vtable, rval = KERN_SUCCESS; finish: - if (demangled_name1) kxld_free(demangled_name1, demangled_length1); + if (demangled_name1) kxld_free(demangled_name1, demangled_length1); if (demangled_name2) kxld_free(demangled_name2, demangled_length2); if (demangled_name3) kxld_free(demangled_name3, demangled_length3); diff --git a/libkern/libkern/Makefile b/libkern/libkern/Makefile index 
db475229b..e1c7b3a4a 100644 --- a/libkern/libkern/Makefile +++ b/libkern/libkern/Makefile @@ -49,7 +49,8 @@ PRIVATE_KERNELFILES = \ OSSerializeBinary.h \ kext_request_keys.h \ mkext.h \ - prelink.h + prelink.h \ + section_keywords.h PRIVATE_DATAFILES = \ ${PRIVATE_KERNELFILES} \ @@ -75,14 +76,12 @@ EXPORT_MI_LIST = \ kxld_types.h \ stack_protector.h - - EXPORT_MI_GEN_LIST = version.h EXPORT_MI_DIR = libkern version.h: version.h.template $(SRCROOT)/config/MasterVersion - @echo "Generating libkern/$@ from $<"; + @echo "[$(CMD_MC)] $(ColorH)GENERATING$(Color0) $(ColorLF)libkern/$@$(Color0) from $(ColorF)$<$(Color0)"; $(_v)install $(DATA_INSTALL_FLAGS) $< $@ $(_v)$(NEWVERS) $@ > /dev/null; diff --git a/libkern/libkern/OSAtomic.h b/libkern/libkern/OSAtomic.h index 656a2cd2a..f4a2a6736 100644 --- a/libkern/libkern/OSAtomic.h +++ b/libkern/libkern/OSAtomic.h @@ -443,7 +443,7 @@ extern UInt32 OSBitAndAtomic( * @discussion * The OSBitAndAtomic16 function logically ands the bits of the specified mask into the value at the specified address and returns the original value. * - * This function guarantees atomicity only with main system memory. It is specifically unsuitable for use on noncacheable memory such as that in devices; this function cannot guarantee atomicity, for example, on memory mapped from a PCI device. Additionally, this function incorporates a memory barrier on systems with weakly-ordered memory architectures. + * This function guarantees atomicity only with main system memory. It is specifically unsuitable for use on noncacheable memory such as that in devices; this function cannot guarantee atomicity, for example, on memory mapped from a PCI device. * @param mask The mask to logically and with the value. * @param address The 2-byte aligned address of the value to update atomically. * @result The value before the bitwise operation. 
@@ -461,7 +461,7 @@ extern UInt16 OSBitAndAtomic16( * @discussion * The OSBitAndAtomic8 function logically ands the bits of the specified mask into the value at the specified address and returns the original value. * - * This function guarantees atomicity only with main system memory. It is specifically unsuitable for use on noncacheable memory such as that in devices; this function cannot guarantee atomicity, for example, on memory mapped from a PCI device. Additionally, this function incorporates a memory barrier on systems with weakly-ordered memory architectures. + * This function guarantees atomicity only with main system memory. It is specifically unsuitable for use on noncacheable memory such as that in devices; this function cannot guarantee atomicity, for example, on memory mapped from a PCI device. * @param mask The mask to logically and with the value. * @param address The address of the value to update atomically. * @result The value before the bitwise operation. @@ -479,7 +479,7 @@ extern UInt8 OSBitAndAtomic8( * @discussion * The OSBitOrAtomic function logically ors the bits of the specified mask into the value at the specified address and returns the original value. * - * This function guarantees atomicity only with main system memory. It is specifically unsuitable for use on noncacheable memory such as that in devices; this function cannot guarantee atomicity, for example, on memory mapped from a PCI device. Additionally, this function incorporates a memory barrier on systems with weakly-ordered memory architectures. + * This function guarantees atomicity only with main system memory. It is specifically unsuitable for use on noncacheable memory such as that in devices; this function cannot guarantee atomicity, for example, on memory mapped from a PCI device. * @param mask The mask to logically or with the value. * @param address The 4-byte aligned address of the value to update atomically. * @result The value before the bitwise operation. 
@@ -499,7 +499,7 @@ extern UInt32 OSBitOrAtomic( * @discussion * The OSBitOrAtomic16 function logically ors the bits of the specified mask into the value at the specified address and returns the original value. * - * This function guarantees atomicity only with main system memory. It is specifically unsuitable for use on noncacheable memory such as that in devices; this function cannot guarantee atomicity, for example, on memory mapped from a PCI device. Additionally, this function incorporates a memory barrier on systems with weakly-ordered memory architectures. + * This function guarantees atomicity only with main system memory. It is specifically unsuitable for use on noncacheable memory such as that in devices; this function cannot guarantee atomicity, for example, on memory mapped from a PCI device. * @param mask The mask to logically or with the value. * @param address The 2-byte aligned address of the value to update atomically. * @result The value before the bitwise operation. @@ -514,7 +514,7 @@ extern UInt16 OSBitOrAtomic16( * @abstract * 8-bit logical or operation, performed atomically with respect to all devices that participate in the coherency architecture of the platform. * - * This function guarantees atomicity only with main system memory. It is specifically unsuitable for use on noncacheable memory such as that in devices; this function cannot guarantee atomicity, for example, on memory mapped from a PCI device. Additionally, this function incorporates a memory barrier on systems with weakly-ordered memory architectures. + * This function guarantees atomicity only with main system memory. It is specifically unsuitable for use on noncacheable memory such as that in devices; this function cannot guarantee atomicity, for example, on memory mapped from a PCI device. * * @discussion * The OSBitOrAtomic8 function logically ors the bits of the specified mask into the value at the specified address and returns the original value. 
@@ -532,7 +532,7 @@ extern UInt8 OSBitOrAtomic8( * @abstract * 32-bit logical xor operation, performed atomically with respect to all devices that participate in the coherency architecture of the platform. * - * This function guarantees atomicity only with main system memory. It is specifically unsuitable for use on noncacheable memory such as that in devices; this function cannot guarantee atomicity, for example, on memory mapped from a PCI device. Additionally, this function incorporates a memory barrier on systems with weakly-ordered memory architectures. + * This function guarantees atomicity only with main system memory. It is specifically unsuitable for use on noncacheable memory such as that in devices; this function cannot guarantee atomicity, for example, on memory mapped from a PCI device. * * @discussion * The OSBitXorAtomic function logically xors the bits of the specified mask into the value at the specified address and returns the original value. @@ -555,7 +555,7 @@ extern UInt32 OSBitXorAtomic( * @discussion * The OSBitXorAtomic16 function logically xors the bits of the specified mask into the value at the specified address and returns the original value. * - * This function guarantees atomicity only with main system memory. It is specifically unsuitable for use on noncacheable memory such as that in devices; this function cannot guarantee atomicity, for example, on memory mapped from a PCI device. Additionally, this function incorporates a memory barrier on systems with weakly-ordered memory architectures. + * This function guarantees atomicity only with main system memory. It is specifically unsuitable for use on noncacheable memory such as that in devices; this function cannot guarantee atomicity, for example, on memory mapped from a PCI device. * @param mask The mask to logically or with the value. * @param address The 2-byte aligned address of the value to update atomically. * @result The value before the bitwise operation. 
@@ -570,7 +570,7 @@ extern UInt16 OSBitXorAtomic16( * @abstract * 8-bit logical xor operation, performed atomically with respect to all devices that participate in the coherency architecture of the platform. * - * This function guarantees atomicity only with main system memory. It is specifically unsuitable for use on noncacheable memory such as that in devices; this function cannot guarantee atomicity, for example, on memory mapped from a PCI device. Additionally, this function incorporates a memory barrier on systems with weakly-ordered memory architectures. + * This function guarantees atomicity only with main system memory. It is specifically unsuitable for use on noncacheable memory such as that in devices; this function cannot guarantee atomicity, for example, on memory mapped from a PCI device. * * @discussion * The OSBitXorAtomic8 function logically xors the bits of the specified mask into the value at the specified address and returns the original value. @@ -588,7 +588,7 @@ extern UInt8 OSBitXorAtomic8( * @abstract * Bit test and set operation, performed atomically with respect to all devices that participate in the coherency architecture of the platform. * - * This function guarantees atomicity only with main system memory. It is specifically unsuitable for use on noncacheable memory such as that in devices; this function cannot guarantee atomicity, for example, on memory mapped from a PCI device. Additionally, this function incorporates a memory barrier on systems with weakly-ordered memory architectures. + * This function guarantees atomicity only with main system memory. It is specifically unsuitable for use on noncacheable memory such as that in devices; this function cannot guarantee atomicity, for example, on memory mapped from a PCI device. * * @discussion * The OSTestAndSet function sets a single bit in a byte at a specified address. It returns true if the bit was already set, false otherwise. 
@@ -609,7 +609,7 @@ extern Boolean OSTestAndSet( * @discussion * The OSTestAndClear function clears a single bit in a byte at a specified address. It returns true if the bit was already clear, false otherwise. * - * This function guarantees atomicity only with main system memory. It is specifically unsuitable for use on noncacheable memory such as that in devices; this function cannot guarantee atomicity, for example, on memory mapped from a PCI device. Additionally, this function incorporates a memory barrier on systems with weakly-ordered memory architectures. + * This function guarantees atomicity only with main system memory. It is specifically unsuitable for use on noncacheable memory such as that in devices; this function cannot guarantee atomicity, for example, on memory mapped from a PCI device. * @param bit The bit number in the range 0 through 7. Bit 0 is the most significant. * @param startAddress The address of the byte to update atomically. * @result true if the bit was already clear, false otherwise. diff --git a/libkern/libkern/OSKextLib.h b/libkern/libkern/OSKextLib.h index 8435d0c49..4c863af70 100644 --- a/libkern/libkern/OSKextLib.h +++ b/libkern/libkern/OSKextLib.h @@ -907,6 +907,32 @@ OSKextGrabPgoData(uuid_t uuid, int wait_for_unload, int metadata); +/*! + * @function OSKextResetPgoCountersLock + * + * @abstract + * Call this function before trapping into the debugger to call OSKextResetPgoCounters. + */ +void +OSKextResetPgoCountersLock(); + +/*! + * @function OSKextResetPgoCountersUnlock + * + * @abstract + * Call this function after trapping into the debugger to call OSKextResetPgoCounters. + */ +void +OSKextResetPgoCountersUnlock(); + +/*! + * @function OSKextResetPgoCounters + * + * @abstract Reset the PGO counters for all kexts. Call only from debugger + * context, while holding OSKextResetPgoCountersLock(). 
+ */ +void +OSKextResetPgoCounters(); #if PRAGMA_MARK @@ -985,7 +1011,7 @@ extern const void * gOSKextUnresolved; // Kernel External Components for FIPS compliance (KEC_FIPS) // WARNING - ath_hash is owned by the kernel, do not free typedef struct AppleTEXTHash { - const int ath_version; // version of this structure (value is 1) + int ath_version; // version of this structure (value is 1 or 2) int ath_length; // length of hash data void * ath_hash; // hash extracted from AppleTextHashes dict } AppleTEXTHash_t; diff --git a/libkern/libkern/OSKextLibPrivate.h b/libkern/libkern/OSKextLibPrivate.h index 4ae4b9806..929e0b298 100644 --- a/libkern/libkern/OSKextLibPrivate.h +++ b/libkern/libkern/OSKextLibPrivate.h @@ -29,6 +29,7 @@ #ifndef _LIBKERN_OSKEXTLIBPRIVATE_H #define _LIBKERN_OSKEXTLIBPRIVATE_H + #include #include @@ -123,6 +124,7 @@ typedef uint8_t OSKextExcludeLevel; * kOSKernelResourceKey *********************************************************************/ #define kOSBundleMachOHeadersKey "OSBundleMachOHeaders" +#define kOSBundleLogStringsKey "OSBundleLogStrings" #define kOSBundleCPUTypeKey "OSBundleCPUType" #define kOSBundleCPUSubtypeKey "OSBundleCPUSubtype" #define kOSBundlePathKey "OSBundlePath" @@ -133,6 +135,8 @@ typedef uint8_t OSKextExcludeLevel; #define kOSBundleLoadTagKey "OSBundleLoadTag" #define kOSBundleLoadAddressKey "OSBundleLoadAddress" #define kOSBundleLoadSizeKey "OSBundleLoadSize" +#define kOSBundleExecLoadAddressKey "OSBundleExecLoadAddress" +#define kOSBundleExecLoadSizeKey "OSBundleExecLoadSize" #define kOSBundleWiredSizeKey "OSBundleWiredSize" #define kOSBundleDependenciesKey "OSBundleDependencies" #define kOSBundleRetainCountKey "OSBundleRetainCount" @@ -899,6 +903,15 @@ typedef struct _loaded_kext_summary_header { */ extern OSKextLoadedKextSummaryHeader * gLoadedKextSummaries; +/*! + * @var gLoadedKextSummariesTimestamp + * + * @abstract This will be set to mach_absolute_time() around updates to + * gLoadedKextSummaries. Ie. 
immediately before gLoadedKextSummaries is set to + * zero, and immediately after it is set to a new value. + */ +extern uint64_t gLoadedKextSummariesTimestamp; + /*! * @function OSKextLoadedKextSummariesUpdated * @abstract Called when gLoadedKextSummaries has been updated. @@ -913,7 +926,8 @@ void OSKextLoadedKextSummariesUpdated(void); #ifdef XNU_KERNEL_PRIVATE extern const vm_allocation_site_t * OSKextGetAllocationSiteForCaller(uintptr_t address); -extern uint32_t OSKextGetKmodIDForSite(vm_allocation_site_t * site); +extern uint32_t OSKextGetKmodIDForSite(vm_allocation_site_t * site, + char * name, vm_size_t namelen); extern void OSKextFreeSite(vm_allocation_site_t * site); #endif /* XNU_KERNEL_PRIVATE */ diff --git a/libkern/libkern/OSMalloc.h b/libkern/libkern/OSMalloc.h index a638b97ae..04a5ba6f2 100644 --- a/libkern/libkern/OSMalloc.h +++ b/libkern/libkern/OSMalloc.h @@ -260,6 +260,19 @@ extern void OSFree( uint32_t size, OSMallocTag tag); +#ifdef XNU_KERNEL_PRIVATE +/*! + * @function OSMalloc_size + * + * @abstract + * Returns the size of a block of memory allocated by @link OSMalloc OSMalloc@/link. + * + * @param addr A pointer to the memory block allocated via OSMalloc. 
+ */ +extern uint32_t OSMalloc_size( + void * addr); +#endif /* XNU_KERNEL_PRIVATE */ + __END_DECLS #endif /* LIBKERN_OSMALLOC_h */ diff --git a/libkern/libkern/c++/Makefile b/libkern/libkern/c++/Makefile index e3d245ce1..69b376774 100644 --- a/libkern/libkern/c++/Makefile +++ b/libkern/libkern/c++/Makefile @@ -3,7 +3,6 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - include $(MakeInc_cmd) include $(MakeInc_def) @@ -40,5 +39,3 @@ EXPORT_MI_DIR = libkern/c++ include $(MakeInc_rule) include $(MakeInc_dir) - - diff --git a/libkern/libkern/c++/OSArray.h b/libkern/libkern/c++/OSArray.h index 67da96ff4..39d70ec07 100644 --- a/libkern/libkern/c++/OSArray.h +++ b/libkern/libkern/c++/OSArray.h @@ -92,6 +92,16 @@ class OSArray : public OSCollection OSDeclareDefaultStructors(OSArray) +#if APPLE_KEXT_ALIGN_CONTAINERS + +protected: + unsigned int count; + unsigned int capacity; + unsigned int capacityIncrement; + const OSMetaClassBase ** array; + +#else /* APPLE_KEXT_ALIGN_CONTAINERS */ + protected: const OSMetaClassBase ** array; unsigned int count; @@ -99,10 +109,12 @@ class OSArray : public OSCollection unsigned int capacityIncrement; struct ExpansionData { }; - + /* Reserved for future use. (Internal use only) */ ExpansionData * reserved; +#endif /* APPLE_KEXT_ALIGN_CONTAINERS */ + /* OSCollectionIterator interfaces. */ virtual unsigned int iteratorSize() const APPLE_KEXT_OVERRIDE; virtual bool initIterator(void * iterator) const APPLE_KEXT_OVERRIDE; diff --git a/libkern/libkern/c++/OSData.h b/libkern/libkern/c++/OSData.h index b3fcd5732..488087dc6 100644 --- a/libkern/libkern/c++/OSData.h +++ b/libkern/libkern/c++/OSData.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -73,15 +73,28 @@ class OSString; */ class OSData : public OSObject { - OSDeclareDefaultStructors(OSData) friend class OSSerialize; + OSDeclareDefaultStructors(OSData) + +#if APPLE_KEXT_ALIGN_CONTAINERS + +protected: + unsigned int length; + unsigned int capacity; + unsigned int capacityIncrement; + void * data; + +#else /* APPLE_KEXT_ALIGN_CONTAINERS */ + protected: void * data; unsigned int length; unsigned int capacity; unsigned int capacityIncrement; +#endif /* APPLE_KEXT_ALIGN_CONTAINERS */ + #ifdef XNU_KERNEL_PRIVATE /* Available within xnu source only */ public: @@ -92,12 +105,12 @@ class OSData : public OSObject DeallocFunction deallocFunction; bool disableSerialization; }; -#else +#else /* XNU_KERNEL_PRIVATE */ private: typedef void (*DeallocFunction)(void * ptr, unsigned int length); protected: struct ExpansionData; -#endif +#endif /* XNU_KERNEL_PRIVATE */ /* Reserved for future use. (Internal use only) */ ExpansionData * reserved; @@ -164,7 +177,7 @@ class OSData : public OSObject * @result * A instance of OSData that shares the provided byte array, * with a reference count of 1; - * NULL on failure. + * NULL on failure. 
* * @discussion * An OSData object created with this function diff --git a/libkern/libkern/c++/OSDictionary.h b/libkern/libkern/c++/OSDictionary.h index 7a515f416..c5438e9d3 100644 --- a/libkern/libkern/c++/OSDictionary.h +++ b/libkern/libkern/c++/OSDictionary.h @@ -112,9 +112,24 @@ class OSString; */ class OSDictionary : public OSCollection { - OSDeclareDefaultStructors(OSDictionary) friend class OSSerialize; + OSDeclareDefaultStructors(OSDictionary) + +#if APPLE_KEXT_ALIGN_CONTAINERS + +protected: + unsigned int count; + unsigned int capacity; + unsigned int capacityIncrement; + struct dictEntry { + const OSSymbol * key; + const OSMetaClassBase * value; + }; + dictEntry * dictionary; + +#else /* APPLE_KEXT_ALIGN_CONTAINERS */ + protected: struct dictEntry { const OSSymbol * key; @@ -130,6 +145,8 @@ class OSDictionary : public OSCollection /* Reserved for future use. (Internal use only) */ ExpansionData * reserved; +#endif /* APPLE_KEXT_ALIGN_CONTAINERS */ + // Member functions used by the OSCollectionIterator class. virtual unsigned int iteratorSize() const APPLE_KEXT_OVERRIDE; virtual bool initIterator(void * iterator) const APPLE_KEXT_OVERRIDE; @@ -905,6 +922,7 @@ class OSDictionary : public OSCollection #if XNU_KERNEL_PRIVATE bool setObject(const OSSymbol *aKey, const OSMetaClassBase *anObject, bool onlyAdd); + OSArray * copyKeys(void); #endif /* XNU_KERNEL_PRIVATE */ OSMetaClassDeclareReservedUnused(OSDictionary, 0); diff --git a/libkern/libkern/c++/OSKext.h b/libkern/libkern/c++/OSKext.h index abc3db0e4..2a265d2bd 100644 --- a/libkern/libkern/c++/OSKext.h +++ b/libkern/libkern/c++/OSKext.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008-2012 Apple Inc. All rights reserved. + * Copyright (c) 2008-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -55,6 +55,13 @@ extern "C" { void osdata_kmem_free(void * ptr, unsigned int length); void osdata_phys_free(void * ptr, unsigned int length); void osdata_vm_deallocate(void * ptr, unsigned int length); +void osdata_kext_free(void * ptr, unsigned int length); +void kxld_log_callback( + KXLDLogSubsystem subsystem, + KXLDLogLevel level, + const char * format, + va_list argList, + void * user_data); }; #endif /* XNU_KERNEL_PRIVATE */ @@ -96,7 +103,7 @@ kern_return_t is_io_catalog_send_data( kern_return_t * result); void kmod_dump_log(vm_offset_t*, unsigned int, boolean_t); - +void *OSKextKextForAddress(const void *addr); #endif /* XNU_KERNEL_PRIVATE */ }; @@ -131,6 +138,7 @@ struct OSKextAccount { vm_allocation_site_t site; uint32_t loadTag; + OSKext * kext; }; struct OSKextActiveAccount @@ -222,7 +230,7 @@ class OSKext : public OSObject friend void kmod_panic_dump(vm_offset_t*, unsigned int); friend void kmod_dump_log(vm_offset_t*, unsigned int, boolean_t); friend void kext_dump_panic_lists(int (*printf_func)(const char * fmt, ...)); - + friend void *OSKextKextForAddress(const void *addr); #endif /* XNU_KERNEL_PRIVATE */ @@ -327,9 +335,13 @@ class OSKext : public OSObject OSData * booterData); static OSKext * withPrelinkedInfoDict( - OSDictionary * infoDict); + OSDictionary * infoDict, + bool doCoalesedSlides); virtual bool initWithPrelinkedInfoDict( - OSDictionary * infoDict); + OSDictionary * infoDict, + bool doCoalesedSlides); + + static void setAllVMAttributes(void); static OSKext * withMkext2Info( OSDictionary * anInfoDict, @@ -419,7 +431,7 @@ class OSKext : public OSObject static void recordIdentifierRequest( OSString * kextIdentifier); - virtual OSReturn slidePrelinkedExecutable(void); + virtual OSReturn slidePrelinkedExecutable(bool doCoalesedSlides); virtual OSReturn loadExecutable(void); virtual void jettisonLinkeditSegment(void); virtual void jettisonDATASegmentPadding(void); @@ -467,6 +479,10 @@ class OSKext 
: public OSObject static OSDictionary * copyLoadedKextInfo( OSArray * kextIdentifiers = NULL, OSArray * keys = NULL); + static OSDictionary * copyLoadedKextInfoByUUID( + OSArray * kextIdentifiers = NULL, + OSArray * keys = NULL); + static OSData * copyKextUUIDForAddress(OSNumber *address = NULL); virtual OSDictionary * copyInfo(OSArray * keys = NULL); /* Logging to user space. @@ -518,6 +534,8 @@ class OSKext : public OSObject int (* printf_func)(const char *fmt, ...), bool lockFlag, bool doUnslide); + static void * kextForAddress( + const void * addr); static boolean_t summaryIsInBacktrace( OSKextLoadedKextSummary * summary, vm_offset_t * addr, @@ -541,7 +559,7 @@ class OSKext : public OSObject */ static void updateLoadedKextSummaries(void); void updateLoadedKextSummary(OSKextLoadedKextSummary *summary); - void updateActiveAccount(OSKextActiveAccount *account); + void updateActiveAccount(OSKextActiveAccount *accountp); /* C++ Initialization. */ diff --git a/libkern/libkern/c++/OSLib.h b/libkern/libkern/c++/OSLib.h index 80bc292da..578733374 100644 --- a/libkern/libkern/c++/OSLib.h +++ b/libkern/libkern/c++/OSLib.h @@ -51,7 +51,10 @@ __END_DECLS #include #define kalloc_container(size) \ - kalloc_tag_bt(size, VM_KERN_MEMORY_LIBKERN) + ({ kalloc_tag_bt(size, VM_KERN_MEMORY_LIBKERN); }) + +#define kallocp_container(size) \ + ({ kallocp_tag_bt(size, VM_KERN_MEMORY_LIBKERN); }) #if OSALLOCDEBUG extern "C" int debug_container_malloc_size; diff --git a/libkern/libkern/c++/OSMetaClass.h b/libkern/libkern/c++/OSMetaClass.h index 2d2267ab1..4660b240e 100644 --- a/libkern/libkern/c++/OSMetaClass.h +++ b/libkern/libkern/c++/OSMetaClass.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -70,6 +70,8 @@ class OSOrderedSet; #endif /* XNU_KERNEL_PRIVATE */ +#define APPLE_KEXT_ALIGN_CONTAINERS (0 == APPLE_KEXT_VTABLE_PADDING) + #if defined(__LP64__) /*! @parseOnly */ #define APPLE_KEXT_LEGACY_ABI 0 @@ -291,11 +293,9 @@ class OSMetaClassBase #define OSCheckTypeInst(typeinst, inst) \ OSMetaClassBase::checkTypeInst(inst, typeinst) -/*! @function OSSafeRelease - * @abstract Release an object if not NULL. - * @param inst Instance of an OSObject, may be NULL. - */ -#define OSSafeRelease(inst) do { if (inst) (inst)->release(); } while (0) +#define OSSafeRelease(inst) \ + do { int OSSafeRelease __attribute__ ((deprecated("Use OSSafeReleaseNULL"))); (OSSafeRelease); \ + if (inst) (inst)->release(); } while (0) /*! @function OSSafeReleaseNULL * @abstract Release an object if not NULL, then set it to NULL. @@ -789,9 +789,7 @@ typedef bool (*OSMetaClassInstanceApplierFunction)(const OSObject * instance, * OSMetaClass manages run-time type information * for Libkern and I/O Kit C++ classes. * - * @discussion - * - * OSMetaClass manages run-time type information + * @discussion OSMetaClass manages run-time type information * for Libkern and I/O Kit C++ classes. * An instance of OSMetaClass exists for (nearly) every such C++ class, * keeping track of inheritance relationships, class lookup by name, @@ -1107,7 +1105,7 @@ class OSMetaClass : private OSMetaClassBase // Needs to be overriden as NULL as all OSMetaClass objects are allocated // statically at compile time, don't accidently try to free them. 
- void operator delete(void *, size_t) { }; + void operator delete(void *, size_t) { } public: static const OSMetaClass * const metaClass; diff --git a/libkern/libkern/c++/OSNumber.h b/libkern/libkern/c++/OSNumber.h index c54c3a3c6..e157d9e5c 100644 --- a/libkern/libkern/c++/OSNumber.h +++ b/libkern/libkern/c++/OSNumber.h @@ -70,9 +70,18 @@ */ class OSNumber : public OSObject { - OSDeclareDefaultStructors(OSNumber) friend class OSSerialize; + OSDeclareDefaultStructors(OSNumber) + +#if APPLE_KEXT_ALIGN_CONTAINERS + +protected: + unsigned int size; + unsigned long long value; + +#else /* APPLE_KEXT_ALIGN_CONTAINERS */ + protected: unsigned long long value; unsigned int size; @@ -82,8 +91,9 @@ class OSNumber : public OSObject /* Reserved for future use. (Internal use only) */ ExpansionData * reserved; -public: +#endif /* APPLE_KEXT_ALIGN_CONTAINERS */ +public: /*! * @function withNumber diff --git a/libkern/libkern/c++/OSObject.h b/libkern/libkern/c++/OSObject.h index 01a480f19..14c7defeb 100644 --- a/libkern/libkern/c++/OSObject.h +++ b/libkern/libkern/c++/OSObject.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -265,7 +265,7 @@ class OSObject : public OSMetaClassBase * to deallocate or release all dynamic resources held by the instance, * then call the superclass's implementation. * - * Caution: + * Caution: *
    *
  1. You can not assume that you have completed initialization * before free is called, diff --git a/libkern/libkern/c++/OSOrderedSet.h b/libkern/libkern/c++/OSOrderedSet.h index 4f94b889a..390b8c190 100644 --- a/libkern/libkern/c++/OSOrderedSet.h +++ b/libkern/libkern/c++/OSOrderedSet.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -62,7 +62,7 @@ class OSOffset; *
  2. Add and remove objects in the set
  3. *
  4. Test whether the set contains a particular object
  5. *
  6. Get the object stored at a particular index.
  7. - *
+ * * * Note that automated ordering is performed only upon addition of objects * and depends on the existing objects being properly sorted. @@ -539,9 +539,6 @@ class OSOrderedSet : public OSCollection * @function getFirstObject * * @abstract - * Returns the object at index 0 in the ordered set if there is one. - * - * @abstract * The object at index 0 in the ordered set if there is one, * otherwise NULL. * @@ -560,9 +557,6 @@ class OSOrderedSet : public OSCollection * @function getLastObject * * @abstract - * Returns the last object in the ordered set if there is one. - * - * @abstract * The last object in the ordered set if there is one, * otherwise NULL. * diff --git a/libkern/libkern/c++/OSSerialize.h b/libkern/libkern/c++/OSSerialize.h index 0ffb861d9..59e12d108 100644 --- a/libkern/libkern/c++/OSSerialize.h +++ b/libkern/libkern/c++/OSSerialize.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -35,6 +35,7 @@ class OSCollection; class OSSet; class OSDictionary; +class OSArray; /*! * @header @@ -43,6 +44,8 @@ class OSDictionary; * This header declares the OSSerialize class. */ +OSObject * +OSUnserializeBinary(const void *buffer, size_t bufferSize); /*! * @class OSSerialize @@ -78,27 +81,18 @@ class OSDictionary; * for serializing properties. */ -OSObject * -OSUnserializeBinary(const void *buffer, size_t bufferSize); - class OSSerialize : public OSObject { OSDeclareDefaultStructors(OSSerialize) friend class OSBoolean; -protected: +private: char * data; // container for serialized data unsigned int length; // of serialized data (counting NULL) unsigned int capacity; // of container unsigned int capacityIncrement; // of container - unsigned int tag; - OSDictionary * tags; // tags for all objects seen - - struct ExpansionData { }; - - /* Reserved for future use. 
(Internal use only) */ - ExpansionData *reserved; + OSArray * tags; // tags for all objects seen #ifdef XNU_KERNEL_PRIVATE public: @@ -111,7 +105,6 @@ class OSSerialize : public OSObject typedef void * Editor; #endif -private: bool binary; bool endCollection; Editor editor; diff --git a/libkern/libkern/c++/OSSet.h b/libkern/libkern/c++/OSSet.h index 6637fa2a8..b64ddb575 100644 --- a/libkern/libkern/c++/OSSet.h +++ b/libkern/libkern/c++/OSSet.h @@ -84,13 +84,28 @@ class OSArray; */ class OSSet : public OSCollection { - OSDeclareDefaultStructors(OSSet) friend class OSSerialize; + OSDeclareDefaultStructors(OSSet) + +#if APPLE_KEXT_ALIGN_CONTAINERS + +private: + OSArray * members; + +#else /* APPLE_KEXT_ALIGN_CONTAINERS */ + private: OSArray * members; protected: + struct ExpansionData { }; + + /* Reserved for future use. (Internal use only) */ + ExpansionData * reserved; + +#endif /* APPLE_KEXT_ALIGN_CONTAINERS */ + /* * OSCollectionIterator interfaces. */ @@ -98,11 +113,6 @@ class OSSet : public OSCollection virtual bool initIterator(void * iterator) const APPLE_KEXT_OVERRIDE; virtual bool getNextObjectForIterator(void * iterator, OSObject ** ret) const APPLE_KEXT_OVERRIDE; - struct ExpansionData { }; - - /* Reserved for future use. (Internal use only) */ - ExpansionData * reserved; - public: diff --git a/libkern/libkern/c++/OSString.h b/libkern/libkern/c++/OSString.h index 5ce0e5f6e..16fc61ddc 100644 --- a/libkern/libkern/c++/OSString.h +++ b/libkern/libkern/c++/OSString.h @@ -102,15 +102,29 @@ class OSData; */ class OSString : public OSObject { + OSDeclareDefaultStructors(OSString) + enum { kMaxStringLength = 262142 }; + +#if APPLE_KEXT_ALIGN_CONTAINERS + +protected: + + unsigned int flags:14, + length:18; + char * string; + +#else /* APPLE_KEXT_ALIGN_CONTAINERS */ + protected: + char * string; unsigned int flags; unsigned int length; - char * string; -public: +#endif /* APPLE_KEXT_ALIGN_CONTAINERS */ +public: /*! 
* @function withString @@ -249,7 +263,9 @@ class OSString : public OSObject */ virtual bool initWithCStringNoCopy(const char * cString); +#if XNU_KERNEL_PRIVATE bool initWithStringOfLength(const char *cString, size_t inlength); +#endif /* XNU_KERNEL_PRIVATE */ /*! * @function free diff --git a/libkern/libkern/c++/OSSymbol.h b/libkern/libkern/c++/OSSymbol.h index 5fe2f46f7..1bb08101c 100644 --- a/libkern/libkern/c++/OSSymbol.h +++ b/libkern/libkern/c++/OSSymbol.h @@ -85,18 +85,9 @@ class OSSymbol : public OSString OSDeclareAbstractStructors(OSSymbol) private: - struct ExpansionData { }; - - /* Reserved for future use. (Internal use only) */ - ExpansionData * reserved; static void initialize(); - // xx-review: not in xnu, delete? - friend void checkModuleForSymbols(void); /* in catalogue? */ - - // xx-review: these should be removed from the symbol set. - /*! * @function initWithString * diff --git a/libkern/libkern/c++/OSUnserialize.h b/libkern/libkern/c++/OSUnserialize.h index 0dbd2f45e..2e6e7f3ba 100644 --- a/libkern/libkern/c++/OSUnserialize.h +++ b/libkern/libkern/c++/OSUnserialize.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2012 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -53,7 +53,7 @@ class OSString; * * @param buffer A buffer containing nul-terminated XML data * representing the object to be recreated. - * @param errorString If non-
NULL, and the XML parser + * @param errorString If non-NULL, and the XML parser * finds an error in buffer, * *errorString indicates the line number * and type of error encountered. @@ -79,7 +79,7 @@ extern "C++" OSObject * OSUnserializeXML( * representing the object to be recreated. * @param bufferSize The size of the block of memory. The function * never scans beyond the first bufferSize bytes. - * @param errorString If non-NULL, and the XML parser + * @param errorString If non-NULL, and the XML parser * finds an error in buffer, * *errorString indicates the line number * and type of error encountered. diff --git a/libkern/libkern/crypto/Makefile b/libkern/libkern/crypto/Makefile index 0c703fd95..0274f4bf1 100644 --- a/libkern/libkern/crypto/Makefile +++ b/libkern/libkern/crypto/Makefile @@ -3,7 +3,6 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - include $(MakeInc_cmd) include $(MakeInc_def) @@ -23,5 +22,3 @@ EXPORT_MI_DIR = libkern/crypto include $(MakeInc_rule) include $(MakeInc_dir) - - diff --git a/libkern/libkern/crypto/aes.h b/libkern/libkern/crypto/aes.h index 61d794b9a..827150c6d 100644 --- a/libkern/libkern/crypto/aes.h +++ b/libkern/libkern/crypto/aes.h @@ -41,7 +41,11 @@ extern "C" //Unholy HACK: this works because we know the size of the context for every //possible corecrypto implementation is less than this. 
+#if defined(__ARM_NEON__) && !defined(__arm64__) // for expanded keys in bit slice format +#define AES_CBC_CTX_MAX_SIZE (ccn_sizeof_size(sizeof(void *)) + ccn_sizeof_size(AES_BLOCK_SIZE) + ccn_sizeof_size(64*4) + (14-1)*128+32 ) +#else #define AES_CBC_CTX_MAX_SIZE (ccn_sizeof_size(sizeof(void *)) + ccn_sizeof_size(AES_BLOCK_SIZE) + ccn_sizeof_size(64*4)) +#endif typedef struct{ cccbc_ctx_decl(AES_CBC_CTX_MAX_SIZE, ctx); diff --git a/libkern/libkern/crypto/des.h b/libkern/libkern/crypto/des.h index 960e60e04..62627e6cd 100644 --- a/libkern/libkern/crypto/des.h +++ b/libkern/libkern/crypto/des.h @@ -44,9 +44,7 @@ typedef unsigned char des_cblock[8]; /* Unholy hack: this is currently the size for the only implementation of DES in corecrypto */ #define DES_ECB_CTX_MAX_SIZE (64*4) -#define DES_CBC_CTX_MAX_SIZE (ccn_sizeof_size(sizeof(struct ccmode_ecb)) + ccn_sizeof_size(CCDES_BLOCK_SIZE) + ccn_sizeof_size(DES_ECB_CTX_MAX_SIZE)) #define DES3_ECB_CTX_MAX_SIZE (64*4*3) -#define DES3_CBC_CTX_MAX_SIZE (ccn_sizeof_size(sizeof(struct ccmode_ecb)) + ccn_sizeof_size(CCDES_BLOCK_SIZE) + ccn_sizeof_size(DES3_ECB_CTX_MAX_SIZE)) typedef struct{ @@ -54,21 +52,11 @@ typedef struct{ ccecb_ctx_decl(DES_ECB_CTX_MAX_SIZE, dec); } des_ecb_key_schedule; -typedef struct{ - cccbc_ctx_decl(DES_CBC_CTX_MAX_SIZE, enc); - cccbc_ctx_decl(DES_CBC_CTX_MAX_SIZE, dec); -} des_cbc_key_schedule; - typedef struct{ ccecb_ctx_decl(DES3_ECB_CTX_MAX_SIZE, enc); ccecb_ctx_decl(DES3_ECB_CTX_MAX_SIZE, dec); } des3_ecb_key_schedule; -typedef struct{ - cccbc_ctx_decl(DES3_CBC_CTX_MAX_SIZE, enc); - cccbc_ctx_decl(DES3_CBC_CTX_MAX_SIZE, dec); -} des3_cbc_key_schedule; - /* Only here for backward compatibility with smb kext */ typedef des_ecb_key_schedule des_key_schedule[1]; #define des_set_key des_ecb_key_sched @@ -85,22 +73,7 @@ void des_ecb_encrypt(des_cblock *in, des_cblock *out, des_ecb_key_schedule *ks, int des3_ecb_key_sched(des_cblock *key, des3_ecb_key_schedule *ks); void des3_ecb_encrypt(des_cblock 
*block, des_cblock *, des3_ecb_key_schedule *ks, int encrypt); -/* Single DES CBC */ -int des_cbc_key_sched(des_cblock *key, des_cbc_key_schedule *ks); -void des_cbc_encrypt(des_cblock *in, des_cblock *out, int32_t len, - des_cbc_key_schedule *ks, des_cblock *iv, des_cblock *retiv, int encrypt); - -/* Triple DES CBC */ -int des3_cbc_key_sched(des_cblock *key, des3_cbc_key_schedule *ks); -void des3_cbc_encrypt(des_cblock *in, des_cblock *out, int32_t len, - des3_cbc_key_schedule *ks, des_cblock *iv, des_cblock *retiv, int encrypt); - -/* Single DES CBC-MAC */ -void des_cbc_cksum(des_cblock *in, des_cblock *out, int len, des_cbc_key_schedule *ks); - -void des_fixup_key_parity(des_cblock *key); int des_is_weak_key(des_cblock *key); -// int des_set_key(des_cblock *, des_key_schedule); // Unsupported KPI. #ifdef __cplusplus } diff --git a/libkern/libkern/crypto/register_crypto.h b/libkern/libkern/crypto/register_crypto.h index 70ec5f64f..428d2faa5 100644 --- a/libkern/libkern/crypto/register_crypto.h +++ b/libkern/libkern/crypto/register_crypto.h @@ -72,13 +72,16 @@ typedef void (*ccpbkdf2_hmac_fn_t)(const struct ccdigest_info *di, typedef int (*ccdes_key_is_weak_fn_t)(void *key, unsigned long length); typedef void (*ccdes_key_set_odd_parity_fn_t)(void *key, unsigned long length); - +/* XTS padding */ typedef void (*ccpad_xts_decrypt_fn_t)(const struct ccmode_xts *xts, ccxts_ctx *ctx, unsigned long nbytes, const void *in, void *out); typedef void (*ccpad_xts_encrypt_fn_t)(const struct ccmode_xts *xts, ccxts_ctx *ctx, unsigned long nbytes, const void *in, void *out); +/* CBC padding (such as PKCS7 or CTSx per NIST standard) */ +typedef size_t (*ccpad_cts3_crypt_fn_t)(const struct ccmode_cbc *cbc, cccbc_ctx *cbc_key, + cccbc_iv *iv, size_t nbytes, const void *in, void *out); typedef struct crypto_functions { /* digests common functions */ @@ -130,9 +133,12 @@ typedef struct crypto_functions { /* DES key helper functions */ ccdes_key_is_weak_fn_t ccdes_key_is_weak_fn; 
ccdes_key_set_odd_parity_fn_t ccdes_key_set_odd_parity_fn; - /* XTS padding functions */ + /* XTS padding+encrypt functions */ ccpad_xts_encrypt_fn_t ccpad_xts_encrypt_fn; ccpad_xts_decrypt_fn_t ccpad_xts_decrypt_fn; + /* CTS3 padding+encrypt functions */ + ccpad_cts3_crypt_fn_t ccpad_cts3_encrypt_fn; + ccpad_cts3_crypt_fn_t ccpad_cts3_decrypt_fn; } *crypto_functions_t; int register_crypto_functions(const crypto_functions_t funcs); diff --git a/libkern/libkern/i386/Makefile b/libkern/libkern/i386/Makefile index 40f5b0fc0..78bbfa507 100644 --- a/libkern/libkern/i386/Makefile +++ b/libkern/libkern/i386/Makefile @@ -3,7 +3,6 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - include $(MakeInc_cmd) include $(MakeInc_def) @@ -21,5 +20,3 @@ EXPORT_MD_DIR = libkern/i386 include $(MakeInc_rule) include $(MakeInc_dir) - - diff --git a/libkern/libkern/kernel_mach_header.h b/libkern/libkern/kernel_mach_header.h index 59218993e..bf5b0f911 100644 --- a/libkern/libkern/kernel_mach_header.h +++ b/libkern/libkern/kernel_mach_header.h @@ -98,6 +98,10 @@ kernel_section_t *getsectbynamefromheader( kernel_mach_header_t *header, const char *seg_name, const char *sect_name); +uint32_t getsectoffsetfromheader( + kernel_mach_header_t *mhp, + const char *segname, + const char *sectname); void *getsectdatafromheader(kernel_mach_header_t *, const char *, const char *, unsigned long *); kernel_section_t *firstsect(kernel_segment_command_t *sgp); kernel_section_t *nextsect(kernel_segment_command_t *sgp, kernel_section_t *sp); diff --git a/libkern/libkern/kext_request_keys.h b/libkern/libkern/kext_request_keys.h index 36dd2ab3f..a04611bc5 100644 --- a/libkern/libkern/kext_request_keys.h +++ b/libkern/libkern/kext_request_keys.h @@ -88,6 +88,26 @@ extern "C" { */ #define kKextRequestPredicateGetLoaded "Get Loaded Kext Info" +/* Predicate: Get Loaded Kext Info By UUID + * Argument: (None) 
+ * Response: An array of information about loaded kexts (see OSKextLib.h). + * Op result: OSReturn indicating any errors in processing (see OSKextLib.h) + * + * Retrieves an array of dictionaries whose properties describe every kext + * loaded at the time of the call. + */ +#define kKextRequestPredicateGetLoadedByUUID "Get Loaded Kext Info By UUID" + +/* Predicate: Get Loaded Kext UUID By Address + * Argument: An address to lookup + * Response: A UUID of the kext + * Op result: OSReturn indicating any errors in processing (see OSKextLib.h) + * + * Retrieves the uuid of a loaded kext in whose address range the given + * lookup address falls into. + */ +#define kKextRequestPredicateGetUUIDByAddress "Get Kext UUID by Address" + /* Predicate: Get All Load Requests * Argument: None * Response: A set of bundle identifiers of all requested kext loads.. @@ -293,6 +313,14 @@ extern "C" { */ #define kKextRequestArgumentResultKey "Kext Request Result Code" +/* Argument: Address + * Type: Number (OSReturn) + * Used by: OSKextGetUUIDByAddress + * + * Contains the address that needs to be looked up + */ +#define kKextRequestArgumentLookupAddressKey "Kext Request Lookup Address" + /* Argument: Value * Type: Varies with the predicate * Used by: several diff --git a/libkern/libkern/kxld.h b/libkern/libkern/kxld.h index 6b3ef392b..5b7d74bfb 100644 --- a/libkern/libkern/kxld.h +++ b/libkern/libkern/kxld.h @@ -60,7 +60,7 @@ kern_return_t kxld_create_context( cpu_type_t cputype, cpu_subtype_t cpusubtype, vm_size_t pagesize) - __attribute__((nonnull(1,2),visibility("default"))); +__attribute__((nonnull(1),visibility("default"))); /******************************************************************************* * Destroys a link context and frees all associated memory. Should be called at @@ -92,16 +92,28 @@ void kxld_destroy_context( * kmod_info_kern Kernel address of the kmod_info_t structure. 
******************************************************************************/ kern_return_t kxld_link_file( - KXLDContext *context, - u_char *file, - u_long size, - const char *name, - void *callback_data, - KXLDDependency *dependencies, - u_int num_dependencies, - u_char **linked_object, - kxld_addr_t *kmod_info_kern) - __attribute__((nonnull(1,2,4,6,8,9), visibility("default"))); + KXLDContext *context, + u_char *file, + u_long size, + const char *name, + void *callback_data, + KXLDDependency *dependencies, + u_int num_dependencies, + u_char **linked_object, + kxld_addr_t *kmod_info_kern) +__attribute__((nonnull(1,2,4,6,8,9), visibility("default"))); + + +kern_return_t kxld_link_split_file( + KXLDContext *context, + splitKextLinkInfo *link_info, + const char *name, + void *callback_data, + KXLDDependency *dependencies, + u_int num_dependencies, + kxld_addr_t *kmod_info_kern) +__attribute__((nonnull(1,2,3,5,7), visibility("default"))); + /******************************************************************************* *******************************************************************************/ diff --git a/libkern/libkern/kxld_types.h b/libkern/libkern/kxld_types.h index 1a9b7f3cc..cad04f811 100644 --- a/libkern/libkern/kxld_types.h +++ b/libkern/libkern/kxld_types.h @@ -91,13 +91,15 @@ /* for building the dysymtab command generation into the dylib */ #if (!KERNEL) #define KXLD_PIC_KEXTS 1 +// #define SPLIT_KEXTS 1 + #define SPLIT_KEXTS_DEBUG 0 #endif /******************************************************************************* * Types *******************************************************************************/ -/* Maintains linker state across links. One context should be allocate for +/* Maintains linker state across links. One context should be allocated for * each link thread. 
*/ typedef struct kxld_context KXLDContext; @@ -115,6 +117,20 @@ typedef uint64_t kxld_addr_t; typedef uint64_t kxld_size_t; #endif /* KERNEL && !__LP64__ */ +typedef struct splitKextLinkInfo { + u_char * kextExecutable; // kext we will link + size_t kextSize; // size of kextExecutable + u_char * linkedKext; // linked kext + size_t linkedKextSize; // size of linkedKext + uint64_t vmaddr_TEXT; // vmaddr of kext __TEXT segment + uint64_t vmaddr_TEXT_EXEC; // vmaddr of kext __TEXT_EXEC segment + uint64_t vmaddr_DATA; // vmaddr of kext __DATA segment + uint64_t vmaddr_DATA_CONST; // vmaddr of kext __DATA_CONST segment + uint64_t vmaddr_LINKEDIT; // vmaddr of kext __LINKEDIT segment + uint32_t kaslr_offsets_count; // offsets into the kext to slide + uint32_t * kaslr_offsets; // offsets into the kext to slide +} splitKextLinkInfo; + /* Flags for general linker behavior */ enum kxld_flags { kKxldFlagDefault = 0x0, diff --git a/libkern/libkern/machine/Makefile b/libkern/libkern/machine/Makefile index f89b1afaf..3e9849371 100644 --- a/libkern/libkern/machine/Makefile +++ b/libkern/libkern/machine/Makefile @@ -3,7 +3,6 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - include $(MakeInc_cmd) include $(MakeInc_def) @@ -20,5 +19,3 @@ EXPORT_MI_DIR = libkern/machine include $(MakeInc_rule) include $(MakeInc_dir) - - diff --git a/libkern/libkern/prelink.h b/libkern/libkern/prelink.h index 59aefd3a2..929ab17bc 100644 --- a/libkern/libkern/prelink.h +++ b/libkern/libkern/prelink.h @@ -32,9 +32,8 @@ #define kPrelinkTextSegment "__PRELINK_TEXT" #define kPrelinkTextSection "__text" -#define kPrelinkLinkStateSegment "__PRELINK_STATE" -#define kPrelinkKernelLinkStateSection "__kernel" -#define kPrelinkKextsLinkStateSection "__kexts" +#define kPrelinkDataSegment "__PRELINK_DATA" +#define kPrelinkDataSection "__data" #define kPrelinkInfoSegment "__PRELINK_INFO" #define 
kPrelinkInfoSection "__info" @@ -49,6 +48,8 @@ #define kPrelinkKmodInfoKey "_PrelinkKmodInfo" #define kPrelinkLinkStateKey "_PrelinkLinkState" #define kPrelinkLinkStateSizeKey "_PrelinkLinkStateSize" +#define kPrelinkLinkKASLROffsetsKey "_PrelinkLinkKASLROffsets" +#define kPrelinkInfoKCIDKey "_PrelinkKCID" #endif /* _PRELINK_H_ */ diff --git a/libkern/libkern/sysctl.h b/libkern/libkern/sysctl.h index 9af0fa0cc..38a6c2ac7 100644 --- a/libkern/libkern/sysctl.h +++ b/libkern/libkern/sysctl.h @@ -107,7 +107,11 @@ __BEGIN_DECLS /* * Sysctl handling */ +#ifdef XNU_KERNEL_PRIVATE +int kernel_sysctlbyname(const char *, void *, size_t *, void *, size_t); +#else int sysctlbyname(const char *, void *, size_t *, void *, size_t); +#endif __END_DECLS diff --git a/libkern/mkext.c b/libkern/mkext.c index 6dba0e0aa..59634832a 100644 --- a/libkern/mkext.c +++ b/libkern/mkext.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Computer, Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -82,7 +82,9 @@ mkext_adler32(uint8_t *buf, int32_t len) #define F 18 /* upper limit for match_length */ #define THRESHOLD 2 /* encode string into position and length if match_length is greater than this */ +#if !KERNEL #define NIL N /* index for root of binary search trees */ +#endif struct encode_state { /* diff --git a/libkern/net/inet_ntop.c b/libkern/net/inet_ntop.c index 309d35bb9..d15bc57ff 100644 --- a/libkern/net/inet_ntop.c +++ b/libkern/net/inet_ntop.c @@ -118,10 +118,12 @@ inet_ntop6(const u_char *src, char *dst, socklen_t size) cur.len = 0; for (i = 0; i < (NS_IN6ADDRSZ / NS_INT16SZ); i++) { if (words[i] == 0) { - if (cur.base == -1) - cur.base = i, cur.len = 1; - else + if (cur.base == -1) { + cur.base = i; + cur.len = 1; + } else { cur.len++; + } } else { if (cur.base != -1) { if (best.base == -1 || cur.len > best.len) diff --git a/libkern/os/Makefile b/libkern/os/Makefile index 88789bb56..80a933fb9 100644 --- a/libkern/os/Makefile +++ b/libkern/os/Makefile @@ -8,33 +8,31 @@ include $(MakeInc_def) LCLDIR = /usr/local/include -DATAFILES = - KERNELFILES = \ - ${DATAFILES} \ + base.h \ + object.h \ + log.h \ + trace.h \ overflow.h -PRIVATE_KERNELFILES = +PRIVATE_KERNELFILES = \ + object_private.h -PRIVATE_DATAFILES = \ - ${PRIVATE_KERNELFILES} \ - overflow.h +PRIVATE_DATAFILES = -INSTALL_MI_LIST = ${DATAFILES} +INSTALL_MI_LIST = \ + overflow.h INSTALL_MI_DIR = os -INSTALL_MI_LCL_LIST = \ - ${PRIVATE_DATAFILES} +INSTALL_MI_LCL_LIST = ${PRIVATE_DATAFILES} INSTALL_KF_MI_LIST = ${KERNELFILES} INSTALL_KF_MI_LCL_LIST = ${KERNELFILES} ${PRIVATE_KERNELFILES} EXPORT_MI_LIST = \ - $(sort ${KERNELFILES} ${PRIVATE_DATAFILES}) - -EXPORT_MI_GEN_LIST = + ${KERNELFILES} ${PRIVATE_KERNELFILES} log_private.h EXPORT_MI_DIR = os diff --git a/libkern/os/base.h b/libkern/os/base.h new file mode 100644 index 000000000..e37800e81 --- /dev/null +++ b/libkern/os/base.h @@ -0,0 +1,240 @@ +/* + * Copyright (c) 2008-2016 Apple Inc. 
All rights reserved. + * + * @APPLE_APACHE_LICENSE_HEADER_START@ + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * @APPLE_APACHE_LICENSE_HEADER_END@ + */ + +#ifndef __OS_BASE__ +#define __OS_BASE__ + +#include + +#ifndef __has_builtin +#define __has_builtin(x) 0 +#endif +#ifndef __has_include +#define __has_include(x) 0 +#endif +#ifndef __has_feature +#define __has_feature(x) 0 +#endif +#ifndef __has_attribute +#define __has_attribute(x) 0 +#endif +#ifndef __has_extension +#define __has_extension(x) 0 +#endif + +#undef OS_INLINE // +#if __GNUC__ +#define OS_NORETURN __attribute__((__noreturn__)) +#define OS_NOTHROW __attribute__((__nothrow__)) +#define OS_NONNULL1 __attribute__((__nonnull__(1))) +#define OS_NONNULL2 __attribute__((__nonnull__(2))) +#define OS_NONNULL3 __attribute__((__nonnull__(3))) +#define OS_NONNULL4 __attribute__((__nonnull__(4))) +#define OS_NONNULL5 __attribute__((__nonnull__(5))) +#define OS_NONNULL6 __attribute__((__nonnull__(6))) +#define OS_NONNULL7 __attribute__((__nonnull__(7))) +#define OS_NONNULL8 __attribute__((__nonnull__(8))) +#define OS_NONNULL9 __attribute__((__nonnull__(9))) +#define OS_NONNULL10 __attribute__((__nonnull__(10))) +#define OS_NONNULL11 __attribute__((__nonnull__(11))) +#define OS_NONNULL12 __attribute__((__nonnull__(12))) +#define OS_NONNULL13 __attribute__((__nonnull__(13))) +#define OS_NONNULL14 __attribute__((__nonnull__(14))) +#define OS_NONNULL15 __attribute__((__nonnull__(15))) +#define 
OS_NONNULL_ALL __attribute__((__nonnull__)) +#define OS_SENTINEL __attribute__((__sentinel__)) +#define OS_PURE __attribute__((__pure__)) +#define OS_CONST __attribute__((__const__)) +#define OS_WARN_RESULT __attribute__((__warn_unused_result__)) +#define OS_MALLOC __attribute__((__malloc__)) +#define OS_USED __attribute__((__used__)) +#define OS_UNUSED __attribute__((__unused__)) +#define OS_COLD __attribute__((__cold__)) +#define OS_WEAK __attribute__((__weak__)) +#define OS_WEAK_IMPORT __attribute__((__weak_import__)) +#define OS_NOINLINE __attribute__((__noinline__)) +#define OS_ALWAYS_INLINE __attribute__((__always_inline__)) +#define OS_TRANSPARENT_UNION __attribute__((__transparent_union__)) +#define OS_ALIGNED(n) __attribute__((__aligned__((n)))) +#define OS_FORMAT_PRINTF(x,y) __attribute__((__format__(printf,x,y))) +#define OS_EXPORT extern __attribute__((__visibility__("default"))) +#define OS_INLINE static __inline__ +#define OS_EXPECT(x, v) __builtin_expect((x), (v)) +#else +#define OS_NORETURN +#define OS_NOTHROW +#define OS_NONNULL1 +#define OS_NONNULL2 +#define OS_NONNULL3 +#define OS_NONNULL4 +#define OS_NONNULL5 +#define OS_NONNULL6 +#define OS_NONNULL7 +#define OS_NONNULL8 +#define OS_NONNULL9 +#define OS_NONNULL10 +#define OS_NONNULL11 +#define OS_NONNULL12 +#define OS_NONNULL13 +#define OS_NONNULL14 +#define OS_NONNULL15 +#define OS_NONNULL_ALL +#define OS_SENTINEL +#define OS_PURE +#define OS_CONST +#define OS_WARN_RESULT +#define OS_MALLOC +#define OS_USED +#define OS_UNUSED +#define OS_COLD +#define OS_WEAK +#define OS_WEAK_IMPORT +#define OS_NOINLINE +#define OS_ALWAYS_INLINE +#define OS_TRANSPARENT_UNION +#define OS_ALIGNED(n) +#define OS_FORMAT_PRINTF(x,y) +#define OS_EXPORT extern +#define OS_INLINE static inline +#define OS_EXPECT(x, v) (x) +#endif + +#if __has_attribute(noescape) +#define OS_NOESCAPE __attribute__((__noescape__)) +#else +#define OS_NOESCAPE +#endif + +#if __has_feature(assume_nonnull) +#define OS_ASSUME_NONNULL_BEGIN 
_Pragma("clang assume_nonnull begin") +#define OS_ASSUME_NONNULL_END _Pragma("clang assume_nonnull end") +#else +#define OS_ASSUME_NONNULL_BEGIN +#define OS_ASSUME_NONNULL_END +#endif + +#if __has_builtin(__builtin_assume) +#define OS_COMPILER_CAN_ASSUME(expr) __builtin_assume(expr) +#else +#define OS_COMPILER_CAN_ASSUME(expr) ((void)(expr)) +#endif + +#if __has_extension(attribute_overloadable) +#define OS_OVERLOADABLE __attribute__((__overloadable__)) +#else +#define OS_OVERLOADABLE +#endif + +#if __has_feature(objc_fixed_enum) || __has_extension(cxx_strong_enums) +#define OS_ENUM(_name, _type, ...) \ + typedef enum : _type { __VA_ARGS__ } _name##_t +#else +#define OS_ENUM(_name, _type, ...) \ + enum { __VA_ARGS__ }; typedef _type _name##_t +#endif + +#if __has_feature(attribute_availability_swift) +// equivalent to __SWIFT_UNAVAILABLE from Availability.h +#define OS_SWIFT_UNAVAILABLE(_msg) \ + __attribute__((__availability__(swift, unavailable, message=_msg))) +#else +#define OS_SWIFT_UNAVAILABLE(_msg) +#endif + +#if __has_attribute(swift_private) +# define OS_REFINED_FOR_SWIFT __attribute__((__swift_private__)) +#else +# define OS_REFINED_FOR_SWIFT +#endif + +#if __has_attribute(swift_name) +# define OS_SWIFT_NAME(_name) __attribute__((__swift_name__(#_name))) +#else +# define OS_SWIFT_NAME(_name) +#endif + +#define __OS_STRINGIFY(s) #s +#define OS_STRINGIFY(s) __OS_STRINGIFY(s) +#define __OS_CONCAT(x, y) x ## y +#define OS_CONCAT(x, y) __OS_CONCAT(x, y) + +#ifdef __GNUC__ +#define os_prevent_tail_call_optimization() __asm__("") +#define os_is_compile_time_constant(expr) __builtin_constant_p(expr) +#define os_compiler_barrier() __asm__ __volatile__("" ::: "memory") +#else +#define os_prevent_tail_call_optimization() do { } while (0) +#define os_is_compile_time_constant(expr) 0 +#define os_compiler_barrier() do { } while (0) +#endif + +#if __has_attribute(not_tail_called) +#define OS_NOT_TAIL_CALLED __attribute__((__not_tail_called__)) +#else +#define 
OS_NOT_TAIL_CALLED +#endif + +typedef void (*os_function_t)(void *_Nullable); + +#ifdef __BLOCKS__ +/*! + * @typedef os_block_t + * + * @abstract + * Generic type for a block taking no arguments and returning no value. + * + * @discussion + * When not building with Objective-C ARC, a block object allocated on or + * copied to the heap must be released with a -[release] message or the + * Block_release() function. + * + * The declaration of a block literal allocates storage on the stack. + * Therefore, this is an invalid construct: + * + * os_block_t block; + * if (x) { + * block = ^{ printf("true\n"); }; + * } else { + * block = ^{ printf("false\n"); }; + * } + * block(); // unsafe!!! + * + * + * What is happening behind the scenes: + * + * if (x) { + * struct Block __tmp_1 = ...; // setup details + * block = &__tmp_1; + * } else { + * struct Block __tmp_2 = ...; // setup details + * block = &__tmp_2; + * } + * + * + * As the example demonstrates, the address of a stack variable is escaping the + * scope in which it is allocated. That is a classic C bug. + * + * Instead, the block literal must be copied to the heap with the Block_copy() + * function or by sending it a -[copy] message. + */ +typedef void (^os_block_t)(void); +#endif + +#endif // __OS_BASE__ diff --git a/libkern/os/firehose.h b/libkern/os/firehose.h new file mode 100644 index 000000000..fb0a3ce3a --- /dev/null +++ b/libkern/os/firehose.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2015 Apple Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#ifndef __firehose_h +#define __firehose_h + +__BEGIN_DECLS + +/*! + * @function __firehose_buffer_push_to_logd + * + * @abstract + * Called by the dispatch firehose apis to notify logd that a chunk is available + */ +__WATCHOS_AVAILABLE(3.0) __OSX_AVAILABLE(10.12) __IOS_AVAILABLE(10.0) __TVOS_AVAILABLE(10.0) +void __firehose_buffer_push_to_logd(firehose_buffer_t fb, bool for_io); + +/*! + * @function __firehose_allocate + * + * @abstract + * Wrapper to allocate kernel memory + */ +__WATCHOS_AVAILABLE(3.0) __OSX_AVAILABLE(10.12) __IOS_AVAILABLE(10.0) __TVOS_AVAILABLE(10.0) +void __firehose_allocate(vm_offset_t *addr, vm_size_t size); + +/*! + * @function __firehose_critical_region_enter + * + * @abstract + * Function that disables preemption + */ +__WATCHOS_AVAILABLE(3.0) __OSX_AVAILABLE(10.12) __IOS_AVAILABLE(10.0) __TVOS_AVAILABLE(10.0) +extern void __firehose_critical_region_enter(void); + +/*! + * @function __firehose_critical_region_leave + * + * @abstract + * Function that enables preemption + */ +__WATCHOS_AVAILABLE(3.0) __OSX_AVAILABLE(10.12) __IOS_AVAILABLE(10.0) __TVOS_AVAILABLE(10.0) +extern void __firehose_critical_region_leave(void); + +extern void oslogwakeup(void); + +__END_DECLS + +#endif /* __firehose_h */ diff --git a/libkern/os/internal.c b/libkern/os/internal.c new file mode 100644 index 000000000..fca4dc083 --- /dev/null +++ b/libkern/os/internal.c @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2013-2015 Apple Inc. All rights reserved. 
+ * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#include "trace_internal.h" +#include +#include + +static bool +_os_trace_addr_in_text_segment_32(const void *dso, const void *addr) +{ + const struct mach_header *mhp = (const struct mach_header *) dso; + const struct segment_command *sgp = (const struct segment_command *)(const void *)((const char *)mhp + sizeof(struct mach_header)); + + for (uint32_t i = 0; i < mhp->ncmds; i++) { + if (sgp->cmd == LC_SEGMENT) { + if (strncmp(sgp->segname, SEG_TEXT, sizeof(sgp->segname)) == 0) { + return ((uintptr_t)addr >= (sgp->vmaddr) && (uintptr_t)addr < (sgp->vmaddr + sgp->vmsize)); + } + } + sgp = (const struct segment_command *)(const void *)((const char *)sgp + sgp->cmdsize); + } + + return false; +} + +static bool +_os_trace_addr_in_text_segment_64(const void *dso, const void *addr) +{ + const struct mach_header_64 *mhp = (const struct mach_header_64 *) dso; + const struct segment_command_64 *sgp = (const struct segment_command_64 *)(const void *)((const char *)mhp + sizeof(struct mach_header_64)); + + for (uint32_t i = 0; i < mhp->ncmds; i++) { + if 
(sgp->cmd == LC_SEGMENT_64) { + if (strncmp(sgp->segname, SEG_TEXT, sizeof(sgp->segname)) == 0) { + return ((uintptr_t)addr >= (sgp->vmaddr) && (uintptr_t)addr < (sgp->vmaddr + sgp->vmsize)); + } + } + sgp = (const struct segment_command_64 *)(const void *)((const char *)sgp + sgp->cmdsize); + } + + return false; +} + +bool +_os_trace_addr_in_text_segment(const void *dso, const void *addr) +{ + const struct mach_header *mhp = (const struct mach_header *) dso; + bool retval = false; + + switch (mhp->magic) { + case MH_MAGIC: + retval = _os_trace_addr_in_text_segment_32(dso, addr); + break; + + case MH_MAGIC_64: + retval = _os_trace_addr_in_text_segment_64(dso, addr); + break; + + default: + retval = false; + break; + } + + return retval; +} diff --git a/libkern/os/log.c b/libkern/os/log.c new file mode 100644 index 000000000..da6d6637d --- /dev/null +++ b/libkern/os/log.c @@ -0,0 +1,605 @@ +#include +#undef offset + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include "trace_internal.h" + +#include "log_encode.h" + +struct os_log_s { + int a; +}; + +struct os_log_s _os_log_default; +struct os_log_s _os_log_replay; +extern vm_offset_t kernel_firehose_addr; +extern firehose_buffer_chunk_t firehose_boot_chunk; + +extern void bsd_log_lock(void); +extern void bsd_log_unlock(void); +extern void logwakeup(void); + +decl_lck_spin_data(extern, oslog_stream_lock) +extern void oslog_streamwakeup(void); +void oslog_streamwrite_locked(firehose_tracepoint_id_u ftid, + uint64_t stamp, const void *pubdata, size_t publen); +extern void oslog_streamwrite_metadata_locked(oslog_stream_buf_entry_t m_entry); + +extern int oslog_stream_open; + +extern void *OSKextKextForAddress(const void *); + +/* Counters for persistence mode */ +uint32_t oslog_p_total_msgcount = 0; +uint32_t 
oslog_p_metadata_saved_msgcount = 0; +uint32_t oslog_p_metadata_dropped_msgcount = 0; +uint32_t oslog_p_error_count = 0; +uint32_t oslog_p_saved_msgcount = 0; +uint32_t oslog_p_dropped_msgcount = 0; +uint32_t oslog_p_boot_dropped_msgcount = 0; + +/* Counters for streaming mode */ +uint32_t oslog_s_total_msgcount = 0; +uint32_t oslog_s_error_count = 0; +uint32_t oslog_s_metadata_msgcount = 0; + +static bool oslog_boot_done = false; +extern boolean_t oslog_early_boot_complete; + +// XXX +firehose_tracepoint_id_t +firehose_debug_trace(firehose_stream_t stream, firehose_tracepoint_id_t trace_id, + uint64_t timestamp, const char *format, const void *pubdata, size_t publen); + +static inline firehose_tracepoint_id_t +_firehose_trace(firehose_stream_t stream, firehose_tracepoint_id_u ftid, + uint64_t stamp, const void *pubdata, size_t publen); + +static oslog_stream_buf_entry_t +oslog_stream_create_buf_entry(oslog_stream_link_type_t type, firehose_tracepoint_id_u ftid, + uint64_t stamp, const void* pubdata, size_t publen); + +static void +_os_log_with_args_internal(os_log_t oslog __unused, os_log_type_t type __unused, + const char *format, va_list args, void *addr, void *dso); + +static void +_os_log_to_msgbuf_internal(const char *format, va_list args, bool safe, bool logging); + +static void +_os_log_to_log_internal(os_log_t oslog, os_log_type_t type, + const char *format, va_list args, void *addr, void *dso); + + +static void +_os_log_actual(os_log_t oslog, os_log_type_t type, const char *format, void + *dso, void *addr, os_log_buffer_context_t context); + +bool +os_log_info_enabled(os_log_t log __unused) +{ + return true; +} + +bool +os_log_debug_enabled(os_log_t log __unused) +{ + return true; +} + +os_log_t +os_log_create(const char *subsystem __unused, const char *category __unused) +{ + return &_os_log_default; +} + +bool +_os_log_string_is_public(const char *str __unused) +{ + return true; +} + +__attribute__((noinline,not_tail_called)) void +_os_log_internal(void 
*dso, os_log_t log, uint8_t type, const char *message, ...) +{ + va_list args; + void *addr = __builtin_return_address(0); + + va_start(args, message); + + _os_log_with_args_internal(log, type, message, args, addr, dso); + + va_end(args); + + return; +} + +#pragma mark - shim functions + +__attribute__((noinline,not_tail_called)) void +os_log_with_args(os_log_t oslog, os_log_type_t type, const char *format, va_list args, void *addr) +{ + // if no address passed, look it up + if (addr == NULL) { + addr = __builtin_return_address(0); + } + + _os_log_with_args_internal(oslog, type, format, args, addr, NULL); +} + +static void +_os_log_with_args_internal(os_log_t oslog, os_log_type_t type, + const char *format, va_list args, void *addr, void *dso) +{ + uint32_t logging_config = atm_get_diagnostic_config(); + boolean_t safe; + boolean_t logging; + + if (format[0] == '\0') { + return; + } + /* cf. r24974766 & r25201228*/ + safe = (!oslog_early_boot_complete || oslog_is_safe()); + logging = (!(logging_config & ATM_TRACE_DISABLE) || !(logging_config & ATM_TRACE_OFF)); + + if (oslog != &_os_log_replay) { + _os_log_to_msgbuf_internal(format, args, safe, logging); + } + + if (safe && logging) { + _os_log_to_log_internal(oslog, type, format, args, addr, dso); + } +} + +static void +_os_log_to_msgbuf_internal(const char *format, va_list args, bool safe, bool logging) +{ + static int msgbufreplay = -1; + va_list args_copy; + + bsd_log_lock(); + + if (!safe) { + if (-1 == msgbufreplay) msgbufreplay = msgbufp->msg_bufx; + } else if (logging && (-1 != msgbufreplay)) { + uint32_t i; + uint32_t localbuff_size; + int newl, position; + char *localbuff, *p, *s, *next, ch; + + position = msgbufreplay; + msgbufreplay = -1; + localbuff_size = (msgbufp->msg_size + 2); /* + '\n' + '\0' */ + /* Size for non-blocking */ + if (localbuff_size > 4096) localbuff_size = 4096; + bsd_log_unlock(); + /* Allocate a temporary non-circular buffer */ + if ((localbuff = (char 
*)kalloc_noblock(localbuff_size))) { + /* in between here, the log could become bigger, but that's fine */ + bsd_log_lock(); + /* + * The message buffer is circular; start at the replay pointer, and + * make one loop up to write pointer - 1. + */ + p = msgbufp->msg_bufc + position; + for (i = newl = 0; p != msgbufp->msg_bufc + msgbufp->msg_bufx - 1; ++p) { + if (p >= msgbufp->msg_bufc + msgbufp->msg_size) + p = msgbufp->msg_bufc; + ch = *p; + if (ch == '\0') continue; + newl = (ch == '\n'); + localbuff[i++] = ch; + if (i >= (localbuff_size - 2)) break; + } + bsd_log_unlock(); + + if (!newl) localbuff[i++] = '\n'; + localbuff[i++] = 0; + + s = localbuff; + while ((next = strchr(s, '\n'))) { + next++; + ch = next[0]; + next[0] = 0; + os_log(&_os_log_replay, "%s", s); + next[0] = ch; + s = next; + } + kfree(localbuff, localbuff_size); + } + bsd_log_lock(); + } + + va_copy(args_copy, args); + vprintf_log_locked(format, args_copy); + va_end(args_copy); + + bsd_log_unlock(); + + if (safe) logwakeup(); +} +

/*
 * Encode one log message into an on-stack os_log buffer and forward it to
 * _os_log_actual().
 *
 * The message is silently dropped when:
 *   - no dso is supplied and OSKextKextForAddress(format) finds none,
 *   - _os_trace_addr_in_text_segment() rejects the format string,
 *   - addr is NULL, or addr resolves to a different image than dso.
 *
 * oslog_p_total_msgcount is bumped for every message that reaches the
 * encoder; oslog_p_error_count is bumped when encoding fails.
 */
static void
_os_log_to_log_internal(os_log_t oslog, os_log_type_t type,
        const char *format, va_list args, void *addr, void *dso)
{
    struct os_log_buffer_context_s context;
    unsigned char buffer_data[OS_LOG_BUFFER_MAX_SIZE] __attribute__((aligned(8)));
    os_log_buffer_t buffer = (os_log_buffer_t)buffer_data;
    uint8_t pubdata[OS_LOG_BUFFER_MAX_SIZE];
    va_list args_copy;

    if (dso == NULL) {
        dso = (void *) OSKextKextForAddress(format);
        if (dso == NULL) {
            return;
        }
    }

    if (!_os_trace_addr_in_text_segment(dso, format)) {
        return;
    }

    if (addr == NULL) {
        return;
    }

    // the caller's address and the format string must live in the same image
    void *dso_addr = (void *) OSKextKextForAddress(addr);
    if (dso != dso_addr) {
        return;
    }

    memset(&context, 0, sizeof(context));
    memset(buffer, 0, OS_LOG_BUFFER_MAX_SIZE);

    context.shimmed = true;
    context.buffer = buffer;
    context.content_sz = OS_LOG_BUFFER_MAX_SIZE - sizeof(*buffer);
    context.pubdata = pubdata;
    context.pubdata_sz = sizeof(pubdata);

    // the caller still owns args; consume a private copy
    va_copy(args_copy, args);

    (void)hw_atomic_add(&oslog_p_total_msgcount, 1);
    if (_os_log_encode(format, args_copy, 0, &context)) {
        _os_log_actual(oslog, type, format, dso, addr, &context);
    }
    else {
        (void)hw_atomic_add(&oslog_p_error_count, 1);
    }

    va_end(args_copy);
}

size_t
_os_trace_location_for_address(void *dso, const void *address,
        os_trace_location_t location, firehose_tracepoint_flags_t *flags);

/*
 * Fill in `location` for `address` inside the image that starts at `dso`,
 * OR the chosen base flag into *flags, and return how many bytes of
 * `location` the caller should emit.
 *
 * MH_EXECUTE images are recorded as a 32-bit offset from dso; every other
 * file type is recorded as the unslid caller pc.
 */
size_t
_os_trace_location_for_address(void *dso, const void *address,
        os_trace_location_t location, firehose_tracepoint_flags_t *flags)
{
    kernel_mach_header_t *mh = dso;

    if (mh->filetype == MH_EXECUTE) {
        location->flags = _firehose_tracepoint_flags_base_main_executable;
        location->offset = (uint32_t) ((uintptr_t)address - (uintptr_t)dso);
        (*flags) |= location->flags;
        return sizeof(location->offset); // offset based
    } else {
        location->flags = _firehose_tracepoint_flags_base_caller_pc;
        (*flags) |= location->flags;
        location->pc = (uintptr_t)VM_KERNEL_UNSLIDE(address);
        return sizeof(location->encode_value);
    }
}


/*
 * Serialise the encoded message as [buffer header + content][pubdata] into
 * buffdata.
 *
 * On entry *buffdata_sz is the capacity of buffdata; on success it is
 * replaced by the number of bytes written.  Returns false, leaving both
 * buffdata and *buffdata_sz untouched, when the message does not fit.
 */
OS_ALWAYS_INLINE
inline bool
_os_log_buffer_pack(uint8_t *buffdata, unsigned int *buffdata_sz, os_log_buffer_context_t ctx)
{
    os_log_buffer_t buffer = ctx->buffer;
    // Sum in size_t: a 16-bit sum could wrap and slip past the bounds check
    // below while memcpy still copies the full, unwrapped lengths.
    size_t buffer_sz = sizeof(*ctx->buffer) + ctx->content_sz;
    size_t total_sz = buffer_sz + ctx->pubdata_sz;

    // [buffer] [pubdata]
    if (total_sz >= (*buffdata_sz)) {
        return false;
    }

    memcpy(buffdata, buffer, buffer_sz);
    memcpy(&buffdata[buffer_sz], ctx->pubdata, ctx->pubdata_sz);

    (*buffdata_sz) = (unsigned int) total_sz;

    return true;
}

/*
 * Build the firehose payload ([location][packed os_log buffer]) for an
 * already-encoded message and submit it with _firehose_trace().
 *
 * INFO and DEBUG messages go to firehose_stream_memory, every other type
 * to firehose_stream_persist.  A message that cannot be packed into the
 * on-stack payload is counted in oslog_p_error_count and dropped.
 */
static void
_os_log_actual(os_log_t oslog __unused, os_log_type_t type, const char *format,
        void *dso, void *addr, os_log_buffer_context_t context)
{
    firehose_stream_t stream;
    firehose_tracepoint_flags_t flags = 0;
    firehose_tracepoint_id_u trace_id;
    os_trace_location_u addr_loc;
    uint8_t buffdata[OS_LOG_BUFFER_MAX_SIZE];
    unsigned int buffdata_sz = (unsigned int) sizeof(buffdata);
    size_t buffdata_idx = 0;
    size_t addr_loc_sz;
    uint64_t timestamp;
    uint64_t thread_id;

    memset(&addr_loc, 0, sizeof(addr_loc));

    // dso == the start of the binary that was loaded
    // codes are the offset into the binary from start
    addr_loc_sz = _os_trace_location_for_address(dso, addr, &addr_loc, &flags);

    timestamp = firehose_tracepoint_time(firehose_activity_flags_default);
    thread_id = thread_tid(current_thread());
    (void)thread_id; // captured but not consumed here

    // insert the location
    memcpy(&buffdata[buffdata_idx], &addr_loc, addr_loc_sz);
    buffdata_idx += addr_loc_sz;

    // create trace_id after we've set additional flags
    trace_id.ftid_value = FIREHOSE_TRACE_ID_MAKE(firehose_tracepoint_namespace_log,
            type, flags, _os_trace_offset(dso, format, flags));

    // pack the buffer data after the header data
    buffdata_sz -= buffdata_idx; // subtract the existing content from the size
    if (!_os_log_buffer_pack(&buffdata[buffdata_idx], &buffdata_sz, context)) {
        // Nothing was written and buffdata_sz still holds the remaining
        // capacity; tracing now would publish uninitialised stack bytes.
        (void)hw_atomic_add(&oslog_p_error_count, 1);
        return;
    }
    buffdata_sz += buffdata_idx; // add the header amount too

    // pick the stream before anything (including the debug hook) uses it
    if (type == OS_LOG_TYPE_INFO || type == OS_LOG_TYPE_DEBUG) {
        stream = firehose_stream_memory;
    }
    else {
        stream = firehose_stream_persist;
    }

    if (FALSE) {
        firehose_debug_trace(stream, trace_id.ftid_value, timestamp,
                format, buffdata, buffdata_sz);
    }

    _firehose_trace(stream, trace_id, timestamp, buffdata, buffdata_sz);
}

/*
 * Write one tracepoint carrying `publen` bytes of `pubdata`.
 *
 * While a live stream is open, non-metadata tracepoints are first mirrored
 * to it under oslog_stream_lock.  The tracepoint is then reserved in the
 * firehose buffer; if that fails before the first successful reservation
 * (oslog_boot_done still false) it falls back to firehose_boot_chunk on the
 * persist stream.
 *
 * Returns ftid.ftid_value on success and 0 when the payload is too large
 * for a chunk or no space could be reserved; the matching oslog_p_*
 * counter is bumped in every case.
 */
static inline firehose_tracepoint_id_t
_firehose_trace(firehose_stream_t stream, firehose_tracepoint_id_u ftid,
        uint64_t stamp, const void *pubdata, size_t publen)
{
    const uint16_t ft_size = offsetof(struct firehose_tracepoint_s, ft_data);
    const size_t _firehose_chunk_payload_size =
            sizeof(((struct firehose_buffer_chunk_s *)0)->fbc_data);

    firehose_tracepoint_t ft;

    if (slowpath(ft_size + publen > _firehose_chunk_payload_size)) {
        // We'll need to have some handling here. For now - return 0
        (void)hw_atomic_add(&oslog_p_error_count, 1);
        return 0;
    }

    if (oslog_stream_open && (stream != firehose_stream_metadata)) {

        lck_spin_lock(&oslog_stream_lock);
        // re-check under the lock: the stream may have closed meanwhile
        if (!oslog_stream_open) {
            lck_spin_unlock(&oslog_stream_lock);
            goto out;
        }

        oslog_s_total_msgcount++;
        oslog_streamwrite_locked(ftid, stamp, pubdata, publen);
        lck_spin_unlock(&oslog_stream_lock);
        oslog_streamwakeup();
    }

out:
    ft = __firehose_buffer_tracepoint_reserve(stamp, stream, (uint16_t)publen, 0, NULL);
    if (!fastpath(ft)) {
        if (oslog_boot_done) {
            if (stream == firehose_stream_metadata) {
                (void)hw_atomic_add(&oslog_p_metadata_dropped_msgcount, 1);
            }
            else {
                // If we run out of space in the persistence buffer we're
                // dropping the message.
                (void)hw_atomic_add(&oslog_p_dropped_msgcount, 1);
            }
            return 0;
        }
        firehose_buffer_chunk_t fbc = firehose_boot_chunk;

        //only stream available during boot is persist
        ft = __firehose_buffer_tracepoint_reserve_with_chunk(fbc, stamp, firehose_stream_persist, (uint16_t)publen, 0, NULL);
        if (!fastpath(ft)) {
            (void)hw_atomic_add(&oslog_p_boot_dropped_msgcount, 1);
            return 0;
        }
        else {
            memcpy(ft->ft_data, pubdata, publen);
            __firehose_buffer_tracepoint_flush_chunk(fbc, ft, ftid);
            (void)hw_atomic_add(&oslog_p_saved_msgcount, 1);
            return ftid.ftid_value;
        }
    }
    // first successful regular reservation: stop using the boot chunk
    if (!oslog_boot_done) {
        oslog_boot_done = true;
    }
    memcpy(ft->ft_data, pubdata, publen);

    __firehose_buffer_tracepoint_flush(ft, ftid);
    if (stream == firehose_stream_metadata) {
        (void)hw_atomic_add(&oslog_p_metadata_saved_msgcount, 1);
    }
    else {
        (void)hw_atomic_add(&oslog_p_saved_msgcount, 1);
    }
    return ftid.ftid_value;
}

/*
 * Allocate a stream buffer entry holding a firehose tracepoint that wraps a
 * copy of `pubdata`.
 *
 * Returns NULL when pubdata is NULL or kalloc() fails.  The allocation is
 * sizeof(struct oslog_stream_buf_entry_s) + sizeof(struct
 * firehose_tracepoint_s) + publen bytes, which is the size a caller must
 * pass to kfree().
 */
static oslog_stream_buf_entry_t
oslog_stream_create_buf_entry(oslog_stream_link_type_t type, firehose_tracepoint_id_u ftid,
        uint64_t stamp, const void* pubdata, size_t publen)
{
    oslog_stream_buf_entry_t m_entry = NULL;
    firehose_tracepoint_t ft = NULL;
    size_t m_entry_len = 0;

    if (!pubdata) {
        return NULL;
    }

    m_entry_len = sizeof(struct oslog_stream_buf_entry_s) +
            sizeof(struct firehose_tracepoint_s) + publen;
    m_entry = (oslog_stream_buf_entry_t) kalloc(m_entry_len);
    if (!m_entry) {
        return NULL;
    }

    m_entry->type = type;
    m_entry->timestamp = stamp;
    m_entry->size = sizeof(struct firehose_tracepoint_s) + publen;

    ft = m_entry->metadata;
    ft->ft_thread = thread_tid(current_thread());
    ft->ft_id.ftid_value = ftid.ftid_value;
    ft->ft_length = publen;
    memcpy(ft->ft_data, pubdata, publen);

    return m_entry;
}

#ifdef KERNEL
/*
 * Log a metadata tracepoint.
 *
 * When a live stream is open the metadata is additionally queued on it as
 * its own entry.  The entry is allocated with the spin lock dropped, so
 * the open state is checked again before queueing and the entry is freed
 * if the stream closed in between.  The tracepoint is always handed to
 * _firehose_trace() afterwards.
 */
void
firehose_trace_metadata(firehose_stream_t stream, firehose_tracepoint_id_u ftid,
        uint64_t stamp, const void *pubdata, size_t publen)
{
    oslog_stream_buf_entry_t m_entry = NULL;

    // If streaming mode is not on, only log the metadata
    // in the persistence buffer

    lck_spin_lock(&oslog_stream_lock);
    if (!oslog_stream_open) {
        lck_spin_unlock(&oslog_stream_lock);
        goto finish;
    }
    lck_spin_unlock(&oslog_stream_lock);

    // Setup and write the stream metadata entry
    m_entry = oslog_stream_create_buf_entry(oslog_stream_link_type_metadata, ftid,
            stamp, pubdata, publen);
    if (!m_entry) {
        (void)hw_atomic_add(&oslog_s_error_count, 1);
        goto finish;
    }

    lck_spin_lock(&oslog_stream_lock);
    if (!oslog_stream_open) {
        lck_spin_unlock(&oslog_stream_lock);
        // must match the size computed in oslog_stream_create_buf_entry()
        kfree(m_entry, sizeof(struct oslog_stream_buf_entry_s) +
                sizeof(struct firehose_tracepoint_s) + publen);
        goto finish;
    }
    oslog_s_metadata_msgcount++;
    oslog_streamwrite_metadata_locked(m_entry);
    lck_spin_unlock(&oslog_stream_lock);

finish:
    _firehose_trace(stream, ftid, stamp, pubdata, publen);
}
#endif

/*
 * Debug aid: kprintf() a one-line summary of a tracepoint followed by a hex
 * dump of its payload, eight bytes per row (bytes past publen print as 0).
 * Returns trace_id unchanged.
 */
firehose_tracepoint_id_t
firehose_debug_trace(firehose_stream_t stream, firehose_tracepoint_id_t trace_id,
        uint64_t timestamp, const char *format, const void *pubdata, size_t publen)
{
    // cast to the exact types the conversion specifiers expect
    kprintf("[os_log stream 0x%x trace_id 0x%llx timestamp %llu format '%s' data %p len %lu]\n",
            (unsigned int)stream, (unsigned long long)trace_id,
            (unsigned long long)timestamp, format, pubdata, (unsigned long)publen);
    size_t i;
    const unsigned char *cdata = (const unsigned char *)pubdata;
    for (i=0; i < publen; i += 8) {
        kprintf(">oslog 0x%08x: 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x\n",
                (unsigned int)i,
                (i+0) < publen ? cdata[i+0] : 0,
                (i+1) < publen ? cdata[i+1] : 0,
                (i+2) < publen ? cdata[i+2] : 0,
                (i+3) < publen ? cdata[i+3] : 0,
                (i+4) < publen ? cdata[i+4] : 0,
                (i+5) < publen ? cdata[i+5] : 0,
                (i+6) < publen ? cdata[i+6] : 0,
                (i+7) < publen ? cdata[i+7] : 0
                );
    }
    return trace_id;
}

/* Firehose hook: new data is available, wake the log reader. */
void
__firehose_buffer_push_to_logd(firehose_buffer_t fb __unused, bool for_io __unused) {
    oslogwakeup();
    return;
}

/*
 * Firehose hook: report the kernel firehose buffer through *addr (0 when
 * kernel_firehose_addr is unset) and copy the boot chunk into the last
 * chunk slot of that buffer.
 */
void
__firehose_allocate(vm_offset_t *addr, vm_size_t size __unused) {
    firehose_buffer_chunk_t kernel_buffer = (firehose_buffer_chunk_t)kernel_firehose_addr;

    if (kernel_firehose_addr) {
        *addr = kernel_firehose_addr;
    }
    else {
        *addr = 0;
        return;
    }
    // Now that we are done adding logs to this chunk, set the number of writers to 0
    // Without this, logd won't flush when the page is full
    firehose_boot_chunk->fbc_pos.fbc_refcnt = 0;
    memcpy(&kernel_buffer[FIREHOSE_BUFFER_KERNEL_CHUNK_COUNT - 1], (const void *)firehose_boot_chunk, FIREHOSE_BUFFER_CHUNK_SIZE);
    return;
}

// There isn't a lock held in this case; the critical region is delimited by
// disabling and re-enabling preemption.
void
__firehose_critical_region_enter(void) {
    disable_preemption();
    return;
}

void
__firehose_critical_region_leave(void) {
    enable_preemption();
    return;
}

diff --git a/libkern/os/log.h b/libkern/os/log.h new file mode 100644 index 000000000..a26a129a1 --- /dev/null +++ b/libkern/os/log.h @@ -0,0 +1,501 @@ +/* + * Copyright (c) 2015 Apple Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License').
You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#ifndef __os_log_h +#define __os_log_h + +#include +#include +#include + +#ifndef __has_attribute +#define __has_attribute(x) 0 +#endif + +#ifndef __has_builtin +#define __has_builtin(x) 0 +#endif + +#if __has_attribute(not_tail_called) +#define OS_LOG_NOTAILCALL __attribute__((not_tail_called)) +#define OS_LOG_NOTAILCALL_MARKER +#else +#define OS_LOG_NOTAILCALL +#define OS_LOG_NOTAILCALL_MARKER __asm__("") +#endif + +__BEGIN_DECLS + +extern void *__dso_handle; + +OS_ALWAYS_INLINE static inline void _os_log_verify_format_str(__unused const char *msg, ...) __attribute__((format(os_trace, 1, 2))); +OS_ALWAYS_INLINE static inline void _os_log_verify_format_str(__unused const char *msg, ...) { /* placeholder */ } + +#if OS_OBJECT_USE_OBJC +OS_OBJECT_DECL(os_log); +#else +typedef struct os_log_s *os_log_t; +#endif /* OS_OBJECT_USE_OBJC */ + +/*! + * @const OS_LOG_DISABLED + * + * @discussion + * Use this to disable a specific log message. + */ +#define OS_LOG_DISABLED NULL + +/*! + * @const OS_LOG_DEFAULT + * + * @discussion + * Use this to log a message in accordance with current system settings. 
+ */ +#define OS_LOG_DEFAULT OS_OBJECT_GLOBAL_OBJECT(os_log_t, _os_log_default) +__OSX_AVAILABLE_STARTING(__MAC_10_12,__IPHONE_10_0) +OS_EXPORT +struct os_log_s _os_log_default; + +/*! + * @enum os_log_type_t + * + * @discussion + * Supported log message types. + * + * @constant OS_LOG_TYPE_DEFAULT + * Equivalent type for "os_log()" messages, i.e., default messages that are always + * captured to memory or disk. + * + * @constant OS_LOG_TYPE_INFO + * Equivalent type for "os_log_info()" messages, i.e., Additional informational messages. + * + * @constant OS_LOG_TYPE_DEBUG + * Equivalent type for "os_log_debug()" messages, i.e., Debug messages. + * + * @constant OS_LOG_TYPE_ERROR + * Equivalent type for "os_log_error()" messages, i.e., local process error messages. + * + * @constant OS_LOG_TYPE_FAULT + * Equivalent type for "os_log_fault()" messages, i.e., a system error that involves + * potentially more than one process, usually used by daemons and services. + */ +OS_ENUM(os_log_type, uint8_t, + OS_LOG_TYPE_DEFAULT = 0x00, + OS_LOG_TYPE_INFO = 0x01, + OS_LOG_TYPE_DEBUG = 0x02, + OS_LOG_TYPE_ERROR = 0x10, + OS_LOG_TYPE_FAULT = 0x11); + +/*! + * @function os_log_create + * + * @abstract + * Creates a log object to be used with other log related functions. + * + * @discussion + * Creates a log object to be used with other log related functions. The + * log object serves two purposes: (1) tag related messages by subsystem + * and category name for easy filtering, and (2) control logging system + * behavior for messages. + * + * A log object may customize logging system behavior for its messages by + * adding a configuration file in /Library/LogPreferences. Most options + * accept 3 values: "Default", "Yes" or "No" as strings, where "Default" + * signifies follow system behavior for the level of messages. 
+ * + * For log: + * + * os_log_create("com.company.mysubsystem", "connections"); + * + * System-provided preferences are located in /System/Library/LogPreferences/<subsystem>.plist + * + * <dict> + * + * + * <key>DEFAULT-OPTIONS</key> + * <dict> + * <key>Enabled</key> + * <string>Default</string> + * <key>Persist</key> + * <string>No</string> + * <key>TTL</key> + * <string>Default</string> + * </dict> + * + * + * <key>connections</key> + * <dict> + * + * + * <key>Default</key> + * <dict> + * <key>Persist</key> + * <string>Yes</string> + * <key>TTL</key> + * <string>4d</string> + * </dict> + * + * + * <key>Info</key> + * <dict> + * <key>Persist</key> + * <string>Yes</string> + * <key>TTL</key> + * <string>2d</string> + * </dict> + * + * + * <key>Debug</key> + * <dict> + * <key>Enabled</key> + * <string>No</string> + * </dict> + * </dict> + * </dict> + * + * All other preferences and system-overrides are stored in /Library/LogPreferences/. + * + * @param subsystem + * The identifier of the given subsystem should be in reverse DNS form + * (i.e., com.company.mysubsystem). This string must be a constant string, + * not dynamically generated. + * + * @param category + * The category within the given subsystem that specifies the settings for + * the log object. This string must be a constant string, not dynamically + * generated. + * + * @result + * Returns an os_log_t value to be passed to other os_log API calls. This + * should be called once at log initialization and rely on system to detect + * changes to settings. This object should be released when no longer used + * via os_release or -[release] method. + * + * A value will always be returned to allow for dynamic enablement. + */ +__OSX_AVAILABLE_STARTING(__MAC_10_12,__IPHONE_10_0) +OS_EXPORT OS_NOTHROW OS_WARN_RESULT OS_OBJECT_RETURNS_RETAINED +os_log_t +os_log_create(const char *subsystem, const char *category); + +/*! + * @function os_log_info_enabled + * + * @abstract + * Returns if development log messages are enabled for a particular log object. + * + * @discussion + * Returns if development log messages are enabled for a particular log object. + * + * @param log + * Pass OS_LOG_DEFAULT or a log object previously created with os_log_create. + * + * @result + * Returns ‘true’ if info log messages are enabled. 
+ */ +__WATCHOS_AVAILABLE(3.0) __OSX_AVAILABLE(10.12) __IOS_AVAILABLE(10.0) __TVOS_AVAILABLE(10.0) +OS_EXPORT OS_NOTHROW OS_WARN_RESULT +bool +os_log_info_enabled(os_log_t log); + +/*! + * @function os_log_debug_enabled + * + * @abstract + * Returns if debug log messages are enabled for a particular log object. + * + * @discussion + * Returns if debug log messages are enabled for a particular log object. + * + * @param log + * Pass OS_LOG_DEFAULT or a log object previously created with os_log_create. + * + * @result + * Returns ‘true’ if debug log messages are enabled. + */ +__WATCHOS_AVAILABLE(3.0) __OSX_AVAILABLE(10.12) __IOS_AVAILABLE(10.0) __TVOS_AVAILABLE(10.0) +OS_EXPORT OS_NOTHROW OS_WARN_RESULT +bool +os_log_debug_enabled(os_log_t log); + +/*! + * @function os_log + * + * @abstract + * Insert a log message into the Unified Logging and Tracing system. + * + * @discussion + * Insert a log message into the Unified Logging and Tracing system in + * accordance with the preferences specified by the provided log object. + * These messages cannot be disabled and therefore always captured either + * to memory or disk. + * + * When an os_activity_id_t is present, the log message will also be scoped by + * that identifier. Activities provide granular filtering of log messages + * across threads and processes. + * + * There is a physical cap of 256 bytes per entry for dynamic content, + * i.e., %s and %@, that can be written to the persistence store. As such, + * all content exceeding the limit will be truncated before written to disk. + * Live streams will continue to show the full content. + * + * @param log + * Pass OS_LOG_DEFAULT or a log object previously created with os_log_create. + * + * @param format + * A format string to generate a human-readable log message when the log + * line is decoded. This string must be a constant string, not dynamically + * generated. Supports all standard printf types and %@ (objects). + */ +#define os_log(log, format, ...) 
__extension__({ \ + _Static_assert(__builtin_constant_p(format), "format string must be constant"); \ + __attribute__((section("__TEXT,__os_log"))) static const char _os_log_fmt[] = format; \ + _os_log_verify_format_str(format, ##__VA_ARGS__); \ + _os_log_internal(&__dso_handle, log, OS_LOG_TYPE_DEFAULT, _os_log_fmt, ##__VA_ARGS__); \ + __asm__(""); /* avoid tailcall */ \ +}) + +/*! + * @function os_log_info + * + * @abstract + * Insert a development log message into the Unified Logging and Tracing system. + * + * @discussion + * Insert a log message into the Unified Logging and Tracing system in + * accordance with the preferences specified by the provided log object. + * + * When an os_activity_id_t is present, the log message will also be scoped by + * that identifier. Activities provide granular filtering of log messages + * across threads and processes. + * + * There is a physical cap of 256 bytes per entry for dynamic content, + * i.e., %s and %@, that can be written to the persistence store. As such, + * all content exceeding the limit will be truncated before written to disk. + * Live streams will continue to show the full content. + * + * @param log + * Pass OS_LOG_DEFAULT or a log object previously created with os_log_create. + * + * @param format + * A format string to generate a human-readable log message when the log + * line is decoded. This string must be a constant string, not dynamically + * generated. Supports all standard printf types and %@ (objects). + */ +#define os_log_info(log, format, ...) __extension__({ \ + _Static_assert(__builtin_constant_p(format), "format string must be constant"); \ + __attribute__((section("__TEXT,__os_log"))) static const char _os_log_fmt[] = format; \ + _os_log_verify_format_str(format, ##__VA_ARGS__); \ + _os_log_internal(&__dso_handle, log, OS_LOG_TYPE_INFO, _os_log_fmt, ##__VA_ARGS__); \ + __asm__(""); /* avoid tailcall */ \ +}) + +/*! 
+ * @function os_log_debug + * + * @abstract + * Insert a debug log message into the Unified Logging and Tracing system. + * + * @discussion + * Insert a debug log message into the Unified Logging and Tracing system in + * accordance with the preferences specified by the provided log object. + * + * When an os_activity_id_t is present, the log message will also be scoped by + * that identifier. Activities provide granular filtering of log messages + * across threads and processes. + * + * There is a physical cap of 256 bytes per entry for dynamic content, + * i.e., %s and %@, that can be written to the persistence store. As such, + * all content exceeding the limit will be truncated before written to disk. + * Live streams will continue to show the full content. + * + * @param log + * Pass OS_LOG_DEFAULT or a log object previously created with os_log_create. + * + * @param format + * A format string to generate a human-readable log message when the log + * line is decoded. This string must be a constant string, not dynamically + * generated. Supports all standard printf types and %@ (objects). + */ +#define os_log_debug(log, format, ...) __extension__({ \ + _Static_assert(__builtin_constant_p(format), "format string must be constant"); \ + __attribute__((section("__TEXT,__os_log"))) static const char _os_log_fmt[] = format; \ + _os_log_verify_format_str(format, ##__VA_ARGS__); \ + _os_log_internal(&__dso_handle, log, OS_LOG_TYPE_DEBUG, _os_log_fmt, ##__VA_ARGS__); \ + __asm__(""); /* avoid tailcall */ \ +}) + +/*! + * @function os_log_error + * + * @abstract + * Insert an error log message into the Unified Logging and Tracing system. + * + * @discussion + * Insert an error log message into the Unified Logging and Tracing system. + * + * When an os_activity_id_t is present, the log message will also be scoped by + * that identifier. Activities provide granular filtering of log messages + * across threads and processes. 
+ * + * There is a physical cap of 256 bytes per entry for dynamic content, + * i.e., %s and %@, that can be written to the persistence store. As such, + * all content exceeding the limit will be truncated before being written to disk. + * Live streams will continue to show the full content. + * + * @param log + * Pass OS_LOG_DEFAULT or a log object previously created with os_log_create. + * + * @param format + * A format string to generate a human-readable log message when the log + * line is decoded. This string must be a constant string, not dynamically + * generated. Supports all standard printf types and %@ (objects). + */ +#define os_log_error(log, format, ...) __extension__({ \ + _Static_assert(__builtin_constant_p(format), "format string must be constant"); \ + __attribute__((section("__TEXT,__os_log"))) static const char _os_log_fmt[] = format; \ + _os_log_verify_format_str(format, ##__VA_ARGS__); \ + _os_log_internal(&__dso_handle, log, OS_LOG_TYPE_ERROR, _os_log_fmt, ##__VA_ARGS__); \ + __asm__(""); /* avoid tailcall */ \ +}) + +/*! + * @function os_log_fault + * + * @abstract + * Insert a fault log message into the Unified Logging and Tracing system. + * + * @discussion + * Log a fault message into the Unified Logging and Tracing system + * signifying a multi-process (i.e., system error) related issue, either + * due to interaction via IPC or some other cause. Faults will gather information + * from the entire process chain and record it for later inspection. + * + * When an os_activity_id_t is present, the log message will also be scoped by + * that identifier. Activities provide granular filtering of log messages + * across threads and processes. + * + * There is a physical cap of 256 bytes per entry for dynamic content, + * i.e., %s and %@, that can be written to the persistence store. As such, + * all content exceeding the limit will be truncated before being written to disk. + * Live streams will continue to show the full content. 
+ * + * @param log + * Pass OS_LOG_DEFAULT or a log object previously created with os_log_create. + * + * @param format + * A format string to generate a human-readable log message when the log + * line is decoded. This string must be a constant string, not dynamically + * generated. Supports all standard printf types and %@ (objects). + */ +#define os_log_fault(log, format, ...) __extension__({ \ + _Static_assert(__builtin_constant_p(format), "format string must be constant"); \ + __attribute__((section("__TEXT,__os_log"))) static const char _os_log_fmt[] = format; \ + _os_log_verify_format_str(format, ##__VA_ARGS__); \ + _os_log_internal(&__dso_handle, log, OS_LOG_TYPE_FAULT, _os_log_fmt, ##__VA_ARGS__); \ + __asm__(""); /* avoid tailcall */ \ +}) + +/*! + * @function os_log_with_type + * + * @abstract + * Log a message using a specific type. + * + * @discussion + * Will log a message with the provided os_log_type_t. + * + * @param log + * Pass OS_LOG_DEFAULT or a log object previously created with os_log_create. + * + * @param type + * Pass a valid type from os_log_type_t. + * + * @param format + * A format string to generate a human-readable log message when the log + * line is decoded. This string must be a constant string, not dynamically + * generated. Supports all standard printf types and %@ (objects). + */ +#define os_log_with_type(log, type, format, ...) __extension__({ \ + _Static_assert(__builtin_constant_p(format), "format string must be constant"); \ + __attribute__((section("__TEXT,__os_log"))) static const char _os_log_fmt[] = format; \ + _os_log_verify_format_str(format, ##__VA_ARGS__); \ + _os_log_internal(&__dso_handle, log, type, _os_log_fmt, ##__VA_ARGS__); \ + __asm__(""); /* avoid tailcall */ \ +}) + +/*! + * @function os_log_sensitive_debug + * + * @abstract + * Insert a debug log message containing sensitive content (i.e., personal + * identifying information). 
+ * + * @discussion + * Insert a debug log message containing sensitive content (i.e., personal + * identifying information) in accordance with the preferences specified by + * the provided log object. + * + * All strings are considered potentially sensitive, though this call + * specifically signifies the message as containing sensitive content. + * The message will be stored separately from other messages. + * + * When an os_activity_id_t is present, the log message will also be scoped by + * that identifier. Activities provide granular filtering of log messages + * across threads and processes. + * + * There is a physical cap of 256 bytes per entry for dynamic content, + * i.e., %s and %@, that can be written to the persistence store. As such, + * all content exceeding the limit will be truncated before written to disk. + * Live streams will continue to show the full content. + * + * @param log + * Pass OS_LOG_DEFAULT or a log object previously created with os_log_create. + * + * @param format + * A format string to generate a human-readable log message when the log + * line is decoded. This string must be a constant string, not dynamically + * generated. Supports all standard printf types and %@ (objects). + */ +#define os_log_sensitive_debug(log, format, ...) __extension__({ \ + _Static_assert(__builtin_constant_p(format), "format string must be constant"); \ + __attribute__((section("__TEXT,__os_log_sens"))) static const char _os_log_fmt[] = format; \ + _os_log_verify_format_str(format, ##__VA_ARGS__); \ + _os_log_sensitive(&__dso_handle, log, OS_LOG_TYPE_DEBUG, _os_log_fmt, ##__VA_ARGS__); \ + __asm__(""); /* avoid tailcall */ \ +}) + +/*! + * @function _os_log_internal + * + * @abstract + * Internal function used by macros. 
+ */ +__WATCHOS_AVAILABLE(3.0) __OSX_AVAILABLE(10.12) __IOS_AVAILABLE(10.0) __TVOS_AVAILABLE(10.0) +OS_EXPORT OS_NOTHROW +void +_os_log_internal(void *dso, os_log_t log, os_log_type_t type, const char *message, ...); + +__END_DECLS + +#endif /* __os_log_h */ diff --git a/libkern/os/log_encode.h b/libkern/os/log_encode.h new file mode 100644 index 000000000..88839fbd7 --- /dev/null +++ b/libkern/os/log_encode.h @@ -0,0 +1,500 @@ +/* + * Copyright (c) 2015-2016 Apple Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_LICENSE_HEADER_END@ + */ + +#ifndef log_encode_h +#define log_encode_h + +#include "log_encode_types.h" +#include + +#if KERNEL +#define isdigit(ch) (((ch) >= '0') && ((ch) <= '9')) +#endif + +static bool +_encode_data(os_log_buffer_value_t content, const void *arg, uint16_t arg_len, os_log_buffer_context_t context) +{ + struct os_log_arginfo_s arginfo; + void *databuf; + + if (content->flags & OS_LOG_CONTENT_FLAG_PRIVATE) { + databuf = context->privdata + context->privdata_off; + arginfo.length = MIN(arg_len, (context->privdata_sz - context->privdata_off)); + arginfo.offset = context->privdata_off; + } else { + databuf = context->pubdata + context->pubdata_off; + arginfo.length = MIN(arg_len, (context->pubdata_sz - context->pubdata_off)); + arginfo.offset = context->pubdata_off; + } + + if (context->arg_content_sz > 0) { + arginfo.length = MIN(context->arg_content_sz, arginfo.length); + } + + memcpy(content->value, &arginfo, sizeof(arginfo)); + content->size = sizeof(arginfo); + + if (arginfo.length) { + if (content->type == OS_LOG_BUFFER_VALUE_TYPE_STRING +#ifndef KERNEL + || content->type == OS_LOG_BUFFER_VALUE_TYPE_OBJECT +#endif + ) { + strlcpy(databuf, arg, arginfo.length); + } else { + memcpy(databuf, arg, arginfo.length); + } + } + + if (content->flags & OS_LOG_CONTENT_FLAG_PRIVATE) { + context->privdata_off += arginfo.length; + } else { + context->pubdata_off += arginfo.length; + } + + context->content_off += sizeof(*content) + content->size; + context->arg_content_sz = 0; + + return true; +} + +#ifndef KERNEL +static void +_os_log_parse_annotated(char *annotated, const char **visibility, const char **library, const char **type) +{ + char *values[3] = { NULL }; + int cnt = 0; + int idx = 0; + + for (; cnt < 3;) { + char *token = strsep(&annotated, ", {}"); + if (token == NULL) { + break; + } + + if (*token == '\0') { + continue; + } + + values[cnt++] = token; + } + + if ((cnt > 0) && (!strcmp(values[0], "public") || !strcmp(values[0], 
"private"))) { + if (visibility != NULL) { + (*visibility) = values[0]; + } + + idx++; + } + + if (idx < cnt && (library != NULL) && (type != NULL)) { + char *decoder = values[idx]; + + for (cnt = 0; cnt < 3; ) { + char *token = strsep(&decoder, ": {}"); + if (token == NULL) { + break; + } + + if (*token == '\0') { + continue; + } + + values[cnt++] = token; + } + + if (cnt == 2) { + (*library) = values[0]; + (*type) = values[1]; + } + + if (cnt == 1) { + (*library) = "builtin"; + (*type) = values[0]; + } + } +} +#endif /* !KERNEL */ + +OS_ALWAYS_INLINE +static inline bool +_os_log_encode_arg(const void *arg, uint16_t arg_len, os_log_value_type_t ctype, bool is_private, os_log_buffer_context_t context) +{ + os_log_buffer_value_t content = (os_log_buffer_value_t) &context->buffer->content[context->content_off]; + size_t content_sz = sizeof(*content) + arg_len; + char tempString[OS_LOG_BUFFER_MAX_SIZE] = {}; +#ifndef KERNEL + bool obj_private = true; +#endif + + content->type = ctype; + content->flags = (is_private ? 
OS_LOG_CONTENT_FLAG_PRIVATE : 0); + +#ifndef KERNEL + if (context->annotated != NULL) { + const char *visibility = NULL; + + _os_log_parse_annotated(context->annotated, &visibility, NULL, NULL); + if (visibility) { + if (!strcasecmp(visibility, "private")) { + content->flags |= OS_LOG_CONTENT_FLAG_PRIVATE; + } else if (!strcasecmp(visibility, "public")) { + content->flags &= ~OS_LOG_CONTENT_FLAG_PRIVATE; + } + } + + context->annotated = NULL; + } +#endif /* !KERNEL */ + + switch (ctype) { + case OS_LOG_BUFFER_VALUE_TYPE_COUNT: + case OS_LOG_BUFFER_VALUE_TYPE_SCALAR: + if (is_private) { + _encode_data(content, tempString, strlen(tempString) + 1, context); + } else { + if ((context->content_off + content_sz) > context->content_sz) { + return false; + } + + memcpy(content->value, arg, arg_len); + content->size = arg_len; + context->content_off += content_sz; + } + break; + + case OS_LOG_BUFFER_VALUE_TYPE_STRING: + context->buffer->flags |= OS_LOG_BUFFER_HAS_NON_SCALAR; + if (_os_log_string_is_public(arg)) { + content->flags &= ~OS_LOG_CONTENT_FLAG_PRIVATE; + } + + _encode_data(content, arg, arg_len, context); + break; + +#ifndef KERNEL + case OS_LOG_BUFFER_VALUE_TYPE_POINTER: + context->buffer->flags |= OS_LOG_BUFFER_HAS_NON_SCALAR; + _encode_data(content, arg, arg_len, context); + break; + + case OS_LOG_BUFFER_VALUE_TYPE_OBJECT: + context->buffer->flags |= OS_LOG_BUFFER_HAS_NON_SCALAR; + if (!_NSCF2data(arg, tempString, sizeof(tempString), &obj_private)) { + tempString[0] = '\0'; + } + + if (!obj_private) { + content->flags &= ~OS_LOG_CONTENT_FLAG_PRIVATE; + } + + _encode_data(content, tempString, strlen(tempString) + 1, context); + break; +#endif /* !KERNEL */ + } + + if (content->flags & OS_LOG_CONTENT_FLAG_PRIVATE) { + context->buffer->flags |= OS_LOG_BUFFER_HAS_PRIVATE; + } + + context->arg_idx++; + + return true; +} + +static bool +_os_log_encode(const char *format, va_list args, int saved_errno, os_log_buffer_context_t context) +{ + const char *percent = 
strchr(format, '%'); +#ifndef KERNEL + char annotated[256]; +#endif + + while (percent != NULL) { + ++percent; + if (percent[0] != '%') { + struct os_log_format_value_s value; + int type = OST_INT; +#ifndef KERNEL + bool long_double = false; +#endif + int prec = 0; + char ch; + + for (bool done = false; !done; percent++) { + switch (ch = percent[0]) { + /* type of types or other */ + case 'l': // longer + type++; + break; + + case 'h': // shorter + type--; + break; + + case 'z': + type = OST_SIZE; + break; + + case 'j': + type = OST_INTMAX; + break; + + case 't': + type = OST_PTRDIFF; + break; + + case '.': // precision + if ((percent[1]) == '*') { + prec = va_arg(args, int); + _os_log_encode_arg(&prec, sizeof(prec), OS_LOG_BUFFER_VALUE_TYPE_COUNT, false, context); + percent++; + continue; + } else { + // we have to read the precision and do the right thing + const char *fmt = percent + 1; + prec = 0; + while (isdigit(ch = *fmt++)) { + prec = 10 * prec + (ch - '0'); + } + + if (prec > 1024) { + prec = 1024; + } + + _os_log_encode_arg(&prec, sizeof(prec), OS_LOG_BUFFER_VALUE_TYPE_COUNT, false, context); + } + break; + + case '-': // left-align + case '+': // force sign + case ' ': // prefix non-negative with space + case '#': // alternate + case '\'': // group by thousands + break; + + /* fixed types */ + case 'd': // integer + case 'i': // integer + case 'o': // octal + case 'u': // unsigned + case 'x': // hex + case 'X': // upper-hex + switch (type) { + case OST_CHAR: + value.type.ch = va_arg(args, int); + _os_log_encode_arg(&value.type.ch, sizeof(value.type.ch), OS_LOG_BUFFER_VALUE_TYPE_SCALAR, false, context); + break; + + case OST_SHORT: + value.type.s = va_arg(args, int); + _os_log_encode_arg(&value.type.s, sizeof(value.type.s), OS_LOG_BUFFER_VALUE_TYPE_SCALAR, false, context); + break; + + case OST_INT: + value.type.i = va_arg(args, int); + _os_log_encode_arg(&value.type.i, sizeof(value.type.i), OS_LOG_BUFFER_VALUE_TYPE_SCALAR, false, context); + break; + + 
case OST_LONG: + value.type.l = va_arg(args, long); + _os_log_encode_arg(&value.type.l, sizeof(value.type.l), OS_LOG_BUFFER_VALUE_TYPE_SCALAR, false, context); + break; + + case OST_LONGLONG: + value.type.ll = va_arg(args, long long); + _os_log_encode_arg(&value.type.ll, sizeof(value.type.ll), OS_LOG_BUFFER_VALUE_TYPE_SCALAR, false, context); + break; + + case OST_SIZE: + value.type.z = va_arg(args, size_t); + _os_log_encode_arg(&value.type.z, sizeof(value.type.z), OS_LOG_BUFFER_VALUE_TYPE_SCALAR, false, context); + break; + + case OST_INTMAX: + value.type.im = va_arg(args, intmax_t); + _os_log_encode_arg(&value.type.im, sizeof(value.type.im), OS_LOG_BUFFER_VALUE_TYPE_SCALAR, false, context); + break; + + case OST_PTRDIFF: + value.type.pd = va_arg(args, ptrdiff_t); + _os_log_encode_arg(&value.type.pd, sizeof(value.type.pd), OS_LOG_BUFFER_VALUE_TYPE_SCALAR, false, context); + break; + + default: + return false; + } + done = true; + break; + +#ifndef KERNEL + case '{': + // we do not support this for shimmed code + if (context->shimmed) { + return false; + } + + for (const char *curr2 = percent + 1; (ch = (*curr2)) != NUL; curr2++) { + if (ch == '}') { + strlcpy(annotated, percent, MIN(curr2 - (percent + 1), sizeof(annotated))); + context->annotated = annotated; + percent = curr2; + break; + } + } + break; +#endif /* !KERNEL */ + + case 'p': // pointer + value.type.p = va_arg(args, void *); + _os_log_encode_arg(&value.type.p, sizeof(value.type.p), OS_LOG_BUFFER_VALUE_TYPE_SCALAR, false, context); + done = true; + break; + +#ifndef KERNEL + case 'P': // pointer data + if (context->shimmed) { // we do not support this for shimmed code + return false; + } + + context->buffer->flags |= OS_LOG_BUFFER_HAS_NON_SCALAR; + value.type.p = va_arg(args, void *); + + // capture the string pointer to generate a symptom + if (context->log && context->log->generate_symptoms && context->arg_idx == 1 && value.type.pch && prec) { + context->symptom_ptr = value.type.p; + 
context->symptom_ptr_len = prec; + } + + _os_log_encode_arg(value.type.p, prec, OS_LOG_BUFFER_VALUE_TYPE_POINTER, false, context); + prec = 0; + done = true; + break; +#endif /* !KERNEL */ + +#ifndef KERNEL + case 'L': // long double + long_double = true; + break; + + case 'a': case 'A': case 'e': case 'E': // floating types + case 'f': case 'F': case 'g': case 'G': + if (long_double) { + value.type.ld = va_arg(args, long double); + _os_log_encode_arg(&value.type.ld, sizeof(value.type.ld), OS_LOG_BUFFER_VALUE_TYPE_SCALAR, false, context); + } else { + value.type.d = va_arg(args, double); + _os_log_encode_arg(&value.type.d, sizeof(value.type.d), OS_LOG_BUFFER_VALUE_TYPE_SCALAR, false, context); + } + done = true; + break; +#endif /* !KERNEL */ + + case 'c': // char + value.type.ch = va_arg(args, int); + _os_log_encode_arg(&value.type.ch, sizeof(value.type.ch), OS_LOG_BUFFER_VALUE_TYPE_SCALAR, false, context); + done = true; + break; + +#ifndef KERNEL + case 'C': // wide-char + value.type.wch = va_arg(args, wint_t); + _os_log_encode_arg(&value.type.wch, sizeof(value.type.wch), OS_LOG_BUFFER_VALUE_TYPE_SCALAR, false, context); + done = true; + break; +#endif /* !KERNEL */ + + case 's': // string + value.type.pch = va_arg(args, char *); + if (!prec && value.type.pch != NULL) { + prec = (int) strlen(value.type.pch) + 1; + } + +#ifndef KERNEL + // capture the string pointer to generate a symptom + if (context->log && context->log->generate_symptoms && context->arg_idx == 0 && value.type.pch) { + context->symptom_str = value.type.pch; + } +#endif + + context->buffer->flags |= OS_LOG_BUFFER_HAS_NON_SCALAR; + _os_log_encode_arg(value.type.pch, prec, OS_LOG_BUFFER_VALUE_TYPE_STRING, false, context); + prec = 0; + done = true; + break; + +#ifndef KERNEL + case 'S': // wide-string + value.type.pwch = va_arg(args, wchar_t *); + if (!prec && value.type.pwch != NULL) { + prec = (int) wcslen(value.type.pwch) + 1; + } + + context->buffer->flags |= OS_LOG_BUFFER_HAS_NON_SCALAR; + 
_os_log_encode_arg(value.type.pwch, prec, OS_LOG_BUFFER_VALUE_TYPE_STRING, false, context); + prec = 0; + done = true; + break; +#endif /* !KERNEL */ + +#ifndef KERNEL + case '@': // CFTypeRef aka NSObject * + context->buffer->flags |= OS_LOG_BUFFER_HAS_NON_SCALAR; + _os_log_encode_arg(va_arg(args, void *), 0, OS_LOG_BUFFER_VALUE_TYPE_OBJECT, false, context); + done = true; + break; +#endif /* !KERNEL */ + + case 'm': + value.type.i = saved_errno; + _os_log_encode_arg(&value.type.i, sizeof(value.type.i), OS_LOG_BUFFER_VALUE_TYPE_SCALAR, false, context); + done = true; + break; + + default: + if (isdigit(ch)) { // [0-9] + continue; + } + return false; + } + + if (done) { + percent = strchr(percent, '%'); // Find next format + break; + } + } + } else { + percent = strchr(percent+1, '%'); // Find next format after %% + } + } + + context->buffer->arg_cnt = context->arg_idx; + context->content_sz = context->content_off; + context->pubdata_sz = context->pubdata_off; + context->privdata_sz = context->privdata_off; + context->arg_idx = context->content_off = context->pubdata_off = context->privdata_off = 0; + + return true; +} + +#endif /* log_encode_h */ diff --git a/libkern/os/log_encode_types.h b/libkern/os/log_encode_types.h new file mode 100644 index 000000000..ae14192c3 --- /dev/null +++ b/libkern/os/log_encode_types.h @@ -0,0 +1,180 @@ +/* + * Copyright (c) 2015-2016 Apple Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#ifndef log_encode_types_h +#define log_encode_types_h + +/* + * These are IPIs between xnu and libtrace, used to have common encoding + * and implementation for kernel logging and user logging. They are subject + * to change at any point. + */ + +#include +#include +#include +#include +#include + +#pragma mark - buffer support structures, enums + +OS_ENUM(os_log_value_type, uint8_t, + OS_LOG_BUFFER_VALUE_TYPE_SCALAR = 0, + OS_LOG_BUFFER_VALUE_TYPE_COUNT = 1, + OS_LOG_BUFFER_VALUE_TYPE_STRING = 2, +#ifndef KERNEL + OS_LOG_BUFFER_VALUE_TYPE_POINTER = 3, + OS_LOG_BUFFER_VALUE_TYPE_OBJECT = 4, +#endif + ); + +OS_ENUM(os_log_value_subtype, uint8_t, + OS_LOG_BUFFER_VALUE_SUBTYPE_NONE = 0, + OS_LOG_BUFFER_VALUE_SUBTYPE_INTEGER = 1, +#ifndef KERNEL + OS_LOG_BUFFER_VALUE_SUBTYPE_FLOAT = 2, +#endif + ); + +enum os_log_int_types_t { + OST_CHAR = -2, + OST_SHORT = -1, + OST_INT = 0, + OST_LONG = 1, + OST_LONGLONG = 2, + OST_SIZE = 3, + OST_INTMAX = 4, + OST_PTRDIFF = 5, +}; + +union os_log_format_types_u { + uint16_t u16; + uint32_t u32; + uint64_t u64; + char ch; + short s; + int i; + void *p; + char *pch; +#ifndef KERNEL + wchar_t wch; + wchar_t *pwch; +#endif + size_t z; + intmax_t im; + ptrdiff_t pd; + long l; + long long ll; +#ifndef KERNEL + double d; + float f; + long double ld; +#endif +}; + +typedef struct os_log_format_value_s { + union os_log_format_types_u type; + os_log_value_type_t ctype; + uint16_t size; +} *os_log_format_value_t; + +#define 
OST_FORMAT_MAX_ARGS 48 +#ifdef KERNEL +#define OST_FORMAT_MAX_STRING_SIZE 512 +#else +#define OST_FORMAT_MAX_STRING_SIZE 1024 +#endif + +#define OST_FORMAT_NON_STATIC ~0 + +typedef struct os_log_buffer_value_s { +#define OS_LOG_CONTENT_FLAG_PRIVATE 0x1 + uint8_t flags : 4; + os_log_value_type_t type : 4; + uint8_t size; + uint8_t value[]; +} *os_log_buffer_value_t; + +typedef struct os_log_buffer_s { +#define OS_LOG_BUFFER_HAS_PRIVATE 0x1 +#define OS_LOG_BUFFER_HAS_NON_SCALAR 0x2 +#ifdef KERNEL +#define OS_LOG_BUFFER_MAX_SIZE 256 +#else +#define OS_LOG_BUFFER_MAX_SIZE 1024 +#endif + uint8_t flags; + uint8_t arg_cnt; + uint8_t content[]; +} *os_log_buffer_t; + +typedef struct os_log_buffer_context_s { + os_log_t log; + os_log_buffer_t buffer; + uint8_t *pubdata; + uint8_t *privdata; + + // composed string + char *comp; + size_t comp_off; + size_t comp_sz; + + // sizes and offsets + uint16_t content_off; // offset into buffer->content + uint16_t content_sz; // size not including the header + uint16_t pubdata_off; + uint16_t pubdata_sz; + uint16_t privdata_off; + uint16_t privdata_sz; + + uint8_t arg_idx; + + // if argument content was limited with %.* or %.# + +#ifndef KERNEL + const char *symptom_str; + const void *symptom_ptr; + uint16_t symptom_ptr_len; + char *annotated; +#endif + int arg_content_sz; + bool need_size; + bool shimmed; +} *os_log_buffer_context_t; + +typedef struct os_log_arginfo_s { + uint16_t offset; + uint16_t length; +} *os_log_arginfo_t; + +/* Clients of these interfaces/structures may be expected to provide implementations of the following functions */ + +#ifndef KERNEL +extern bool +_NSCF2data(const void *obj, char *string_value, size_t string_sz, bool *is_private); +#endif + +extern bool +_os_log_string_is_public(const char *str); + +#endif /* log_encode_types_h */ diff --git a/libkern/os/log_private.h b/libkern/os/log_private.h new file mode 100644 index 000000000..47660ede8 --- /dev/null +++ b/libkern/os/log_private.h @@ -0,0 +1,94 @@ +/* 
+ * Copyright (c) 2015-2016 Apple Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#ifndef __os_log_private_h +#define __os_log_private_h + +#include +#include +#include + +__BEGIN_DECLS + +/*! + * @function os_log_with_args + * + * @abstract + * os_log variant that supports va_list args. + * + * @discussion + * os_log variant that supports va_list args. This SPI should only be used + * to shim legacy logging systems through os_log. + * + * @param oslog + * Pass OS_LOG_DEFAULT or a log object previously created with os_log_create. + * + * @param type + * Pass one of the following message types. + * OS_LOG_TYPE_DEFAULT + * OS_LOG_TYPE_DEBUG + * OS_LOG_TYPE_INFO + * OS_LOG_TYPE_ERROR + * OS_LOG_TYPE_FAULT + * + * @param format + * A format string to generate a human-readable log message when the log + * line is decoded. Supports all standard printf types in addition to %@ + * and %m (objects and errno respectively). + * + * @param args + * A va_list containing the values for the format string. 
+ * + * @param ret_addr + * Pass the __builtin_return_address(0) of the function that created the + * va_list from variadic arguments. The caller must be the same binary + * that generated the message and provided the format string. + */ +__OSX_AVAILABLE_STARTING(__MAC_10_12, __IPHONE_10_0) +OS_EXPORT OS_NOTHROW OS_LOG_NOTAILCALL +void +os_log_with_args(os_log_t oslog, os_log_type_t type, const char *format, va_list args, void *ret_addr); + +/*! + * @enum oslog_stream_link_type_t + */ +OS_ENUM(oslog_stream_link_type, uint8_t, + oslog_stream_link_type_log = 0x0, + oslog_stream_link_type_metadata = 0x1, +); + +/*! + * @typedef oslog_stream_buf_entry_t + */ +typedef struct oslog_stream_buf_entry_s { + STAILQ_ENTRY(oslog_stream_buf_entry_s) buf_entries; + uint64_t timestamp; + int offset; + uint16_t size; + oslog_stream_link_type_t type; + struct firehose_tracepoint_s metadata[]; +} *oslog_stream_buf_entry_t; + +__END_DECLS + +#endif // __os_log_private_h diff --git a/libkern/os/object.c b/libkern/os/object.c new file mode 100644 index 000000000..24e70f45a --- /dev/null +++ b/libkern/os/object.c @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2008-2013 Apple Inc. All rights reserved. + * + * @APPLE_APACHE_LICENSE_HEADER_START@ + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * @APPLE_APACHE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include + + +/* XXX temporary until full vtable and refcount support */ +extern struct os_log_s _os_log_default; + +void* +os_retain(void *obj) +{ + /* XXX temporary nop */ + assert(obj == &_os_log_default); + return obj; +} + +void +os_release(void *obj __unused) +{ + /* XXX temporary nop */ + assert(obj == &_os_log_default); +} diff --git a/libkern/os/object.h b/libkern/os/object.h new file mode 100644 index 000000000..a42ae8bd5 --- /dev/null +++ b/libkern/os/object.h @@ -0,0 +1,202 @@ +/* + * Copyright (c) 2011-2014 Apple Inc. All rights reserved. + * + * @APPLE_APACHE_LICENSE_HEADER_START@ + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * @APPLE_APACHE_LICENSE_HEADER_END@ + */ + +#ifndef __OS_OBJECT__ +#define __OS_OBJECT__ + +#ifdef __APPLE__ +#include +#endif +#include + +/*! + * @header + * + * @preprocinfo + * By default, libSystem objects such as GCD and XPC objects are declared as + * Objective-C types when building with an Objective-C compiler. This allows + * them to participate in ARC, in RR management by the Blocks runtime and in + * leaks checking by the static analyzer, and enables them to be added to Cocoa + * collections. + * + * NOTE: this requires explicit cancellation of dispatch sources and xpc + * connections whose handler blocks capture the source/connection object, + * resp. ensuring that such captures do not form retain cycles (e.g. 
by + * declaring the source as __weak). + * + * To opt-out of this default behavior, add -DOS_OBJECT_USE_OBJC=0 to your + * compiler flags. + * + * This mode requires a platform with the modern Objective-C runtime, the + * Objective-C GC compiler option to be disabled, and at least a Mac OS X 10.8 + * or iOS 6.0 deployment target. + */ + +#ifndef OS_OBJECT_HAVE_OBJC_SUPPORT +#if defined(__OBJC__) && defined(__OBJC2__) && !defined(__OBJC_GC__) && ( \ + __MAC_OS_X_VERSION_MIN_REQUIRED >= __MAC_10_8 || \ + __IPHONE_OS_VERSION_MIN_REQUIRED >= __IPHONE_6_0) +#define OS_OBJECT_HAVE_OBJC_SUPPORT 1 +#else +#define OS_OBJECT_HAVE_OBJC_SUPPORT 0 +#endif +#endif + +#if OS_OBJECT_HAVE_OBJC_SUPPORT +#ifndef OS_OBJECT_USE_OBJC +#define OS_OBJECT_USE_OBJC 1 +#endif +#elif defined(OS_OBJECT_USE_OBJC) && OS_OBJECT_USE_OBJC +/* Unsupported platform for OS_OBJECT_USE_OBJC=1 */ +#undef OS_OBJECT_USE_OBJC +#define OS_OBJECT_USE_OBJC 0 +#else +#define OS_OBJECT_USE_OBJC 0 +#endif + +#if OS_OBJECT_USE_OBJC +#import +#if defined(__has_attribute) +#if __has_attribute(objc_independent_class) +#define OS_OBJC_INDEPENDENT_CLASS __attribute__((objc_independent_class)) +#endif +#endif // __has_attribute(objc_independent_class) +#ifndef OS_OBJC_INDEPENDENT_CLASS +#define OS_OBJC_INDEPENDENT_CLASS +#endif +#define OS_OBJECT_CLASS(name) OS_##name +#define OS_OBJECT_DECL_IMPL(name, ...) \ + @protocol OS_OBJECT_CLASS(name) __VA_ARGS__ \ + @end \ + typedef NSObject \ + * OS_OBJC_INDEPENDENT_CLASS name##_t +#define OS_OBJECT_DECL(name, ...) 
\ + OS_OBJECT_DECL_IMPL(name, ) +#define OS_OBJECT_DECL_SUBCLASS(name, super) \ + OS_OBJECT_DECL_IMPL(name, ) +#if defined(__has_attribute) +#if __has_attribute(ns_returns_retained) +#define OS_OBJECT_RETURNS_RETAINED __attribute__((__ns_returns_retained__)) +#else +#define OS_OBJECT_RETURNS_RETAINED +#endif +#if __has_attribute(ns_consumed) +#define OS_OBJECT_CONSUMED __attribute__((__ns_consumed__)) +#else +#define OS_OBJECT_CONSUMED +#endif +#else +#define OS_OBJECT_RETURNS_RETAINED +#define OS_OBJECT_CONSUMED +#endif +#if defined(__has_feature) +#if __has_feature(objc_arc) +#define OS_OBJECT_BRIDGE __bridge +#define OS_WARN_RESULT_NEEDS_RELEASE +#else +#define OS_OBJECT_BRIDGE +#define OS_WARN_RESULT_NEEDS_RELEASE OS_WARN_RESULT +#endif +#else +#define OS_OBJECT_BRIDGE +#define OS_WARN_RESULT_NEEDS_RELEASE OS_WARN_RESULT +#endif +#ifndef OS_OBJECT_USE_OBJC_RETAIN_RELEASE +#if defined(__clang_analyzer__) +#define OS_OBJECT_USE_OBJC_RETAIN_RELEASE 1 +#elif defined(__has_feature) +#if __has_feature(objc_arc) +#define OS_OBJECT_USE_OBJC_RETAIN_RELEASE 1 +#else +#define OS_OBJECT_USE_OBJC_RETAIN_RELEASE 0 +#endif +#else +#define OS_OBJECT_USE_OBJC_RETAIN_RELEASE 0 +#endif +#endif +#else +/*! @parseOnly */ +#define OS_OBJECT_RETURNS_RETAINED +/*! @parseOnly */ +#define OS_OBJECT_CONSUMED +/*! @parseOnly */ +#define OS_OBJECT_BRIDGE +/*! @parseOnly */ +#define OS_WARN_RESULT_NEEDS_RELEASE OS_WARN_RESULT +#define OS_OBJECT_USE_OBJC_RETAIN_RELEASE 0 +#endif + +#define OS_OBJECT_GLOBAL_OBJECT(type, object) ((OS_OBJECT_BRIDGE type)&(object)) + +__BEGIN_DECLS + +/*! + * @function os_retain + * + * @abstract + * Increment the reference count of an os_object. + * + * @discussion + * On a platform with the modern Objective-C runtime this is exactly equivalent + * to sending the object the -[retain] message. + * + * @param object + * The object to retain. + * + * @result + * The retained object. 
+ */ +__OSX_AVAILABLE_STARTING(__MAC_10_12,__IPHONE_10_0) +OS_EXPORT +void* +os_retain(void *object); +#if OS_OBJECT_USE_OBJC +#undef os_retain +#define os_retain(object) [object retain] +#endif + +/*! + * @function os_release + * + * @abstract + * Decrement the reference count of a os_object. + * + * @discussion + * On a platform with the modern Objective-C runtime this is exactly equivalent + * to sending the object the -[release] message. + * + * @param object + * The object to release. + */ +__OSX_AVAILABLE_STARTING(__MAC_10_12,__IPHONE_10_0) +OS_EXPORT +void +os_release(void *object); +#if OS_OBJECT_USE_OBJC +#undef os_release +#define os_release(object) [object release] +#endif + +#define fastpath(x) ((typeof(x))__builtin_expect((long)(x), ~0l)) +#define slowpath(x) ((typeof(x))__builtin_expect((long)(x), 0l)) + +__END_DECLS + +#endif diff --git a/libkern/os/object_private.h b/libkern/os/object_private.h new file mode 100644 index 000000000..0f2f01dff --- /dev/null +++ b/libkern/os/object_private.h @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2011-2012 Apple Inc. All rights reserved. + * + * @APPLE_APACHE_LICENSE_HEADER_START@ + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * @APPLE_APACHE_LICENSE_HEADER_END@ + */ + +/* + * IMPORTANT: This header file describes INTERNAL interfaces to libdispatch + * which are subject to change in future releases of Mac OS X. Any applications + * relying on these interfaces WILL break. 
+ */ + +#ifndef __OS_OBJECT_PRIVATE__ +#define __OS_OBJECT_PRIVATE__ + +#include +#include +#include + +#ifndef __OSX_AVAILABLE_STARTING +#define __OSX_AVAILABLE_STARTING(x, y) +#endif + +#if __GNUC__ +#define OS_OBJECT_NOTHROW __attribute__((__nothrow__)) +#define OS_OBJECT_NONNULL __attribute__((__nonnull__)) +#define OS_OBJECT_WARN_RESULT __attribute__((__warn_unused_result__)) +#define OS_OBJECT_MALLOC __attribute__((__malloc__)) +#define OS_OBJECT_EXPORT extern __attribute__((visibility("default"))) +#else +/*! @parseOnly */ +#define OS_OBJECT_NOTHROW +/*! @parseOnly */ +#define OS_OBJECT_NONNULL +/*! @parseOnly */ +#define OS_OBJECT_WARN_RESULT +/*! @parseOnly */ +#define OS_OBJECT_MALLOC +#define OS_OBJECT_EXPORT extern +#endif + +#if OS_OBJECT_USE_OBJC && defined(__has_feature) +#if __has_feature(objc_arc) +#define _OS_OBJECT_OBJC_ARC 1 +#else +#define _OS_OBJECT_OBJC_ARC 0 +#endif +#else +#define _OS_OBJECT_OBJC_ARC 0 +#endif + +#define _OS_OBJECT_GLOBAL_REFCNT INT_MAX + +#define _OS_OBJECT_HEADER(isa, ref_cnt, xref_cnt) \ + isa; /* must be pointer-sized */ \ + int volatile ref_cnt; \ + int volatile xref_cnt + +#if OS_OBJECT_HAVE_OBJC_SUPPORT +// Must match size of compiler-generated OBJC_CLASS structure rdar://10640168 +#define _OS_OBJECT_CLASS_HEADER() \ + void *_os_obj_objc_class_t[5] +#else +#define _OS_OBJECT_CLASS_HEADER() \ + void (*_os_obj_xref_dispose)(_os_object_t); \ + void (*_os_obj_dispose)(_os_object_t) +#endif + +#define OS_OBJECT_CLASS(name) OS_##name + +#if OS_OBJECT_USE_OBJC +__OSX_AVAILABLE_STARTING(__MAC_10_8,__IPHONE_6_0) +OS_OBJECT_EXPORT +@interface OS_OBJECT_CLASS(object) : NSObject +- (void)_xref_dispose; +- (void)_dispose; +@end +typedef OS_OBJECT_CLASS(object) *_os_object_t; +#define _OS_OBJECT_DECL_SUBCLASS_INTERFACE(name, super) \ + @interface OS_OBJECT_CLASS(name) : OS_OBJECT_CLASS(super) \ + \ + @end +#else +typedef struct _os_object_s *_os_object_t; +#endif + +__BEGIN_DECLS + +#if !_OS_OBJECT_OBJC_ARC + 
+__OSX_AVAILABLE_STARTING(__MAC_10_8,__IPHONE_6_0) +OS_OBJECT_EXPORT OS_OBJECT_MALLOC OS_OBJECT_WARN_RESULT OS_OBJECT_NOTHROW +_os_object_t +_os_object_alloc(const void *cls, size_t size); + +__OSX_AVAILABLE_STARTING(__MAC_10_9,__IPHONE_7_0) +OS_OBJECT_EXPORT OS_OBJECT_MALLOC OS_OBJECT_WARN_RESULT OS_OBJECT_NOTHROW +_os_object_t +_os_object_alloc_realized(const void *cls, size_t size); + +__OSX_AVAILABLE_STARTING(__MAC_10_8,__IPHONE_6_0) +OS_OBJECT_EXPORT OS_OBJECT_NONNULL OS_OBJECT_NOTHROW +void _os_object_dealloc(_os_object_t object); + +__OSX_AVAILABLE_STARTING(__MAC_10_8,__IPHONE_6_0) +OS_OBJECT_EXPORT OS_OBJECT_NONNULL OS_OBJECT_NOTHROW +_os_object_t +_os_object_retain(_os_object_t object); + +__OSX_AVAILABLE_STARTING(__MAC_10_11,__IPHONE_9_0) +OS_OBJECT_EXPORT OS_OBJECT_NONNULL OS_OBJECT_NOTHROW +_os_object_t +_os_object_retain_with_resurrect(_os_object_t obj); + +__OSX_AVAILABLE_STARTING(__MAC_10_8,__IPHONE_6_0) +OS_OBJECT_EXPORT OS_OBJECT_NONNULL OS_OBJECT_NOTHROW +void +_os_object_release(_os_object_t object); + +__OSX_AVAILABLE_STARTING(__MAC_10_8,__IPHONE_6_0) +OS_OBJECT_EXPORT OS_OBJECT_NONNULL OS_OBJECT_NOTHROW +_os_object_t +_os_object_retain_internal(_os_object_t object); + +__OSX_AVAILABLE_STARTING(__MAC_10_8,__IPHONE_6_0) +OS_OBJECT_EXPORT OS_OBJECT_NONNULL OS_OBJECT_NOTHROW +void +_os_object_release_internal(_os_object_t object); + +#endif // !_OS_OBJECT_OBJC_ARC + +__END_DECLS + +#endif diff --git a/libkern/os/overflow.h b/libkern/os/overflow.h index 2b6034ca6..8d0fd9949 100644 --- a/libkern/os/overflow.h +++ b/libkern/os/overflow.h @@ -42,12 +42,38 @@ #define _OS_OVERFLOW_H #include +#include +#include + +bool __header_always_inline OS_WARN_RESULT +__os_warn_unused(__const bool x) +{ + return x; +} + +#if __has_builtin(__builtin_add_overflow) && \ + __has_builtin(__builtin_sub_overflow) && \ + __has_builtin(__builtin_mul_overflow) + +#define os_add_overflow(a, b, res) __os_warn_unused(__builtin_add_overflow((a), (b), (res))) +#define 
os_sub_overflow(a, b, res) __os_warn_unused(__builtin_sub_overflow((a), (b), (res))) +#define os_mul_overflow(a, b, res) __os_warn_unused(__builtin_mul_overflow((a), (b), (res))) + +#else /* compile-time assertion that 'x' and 'y' are equivalent types */ +#ifdef __cplusplus +#define __OS_TYPE_CHECK(x, y) do { \ + __typeof__(x) _x; \ + __typeof__(y) _y; \ + (void)(&_x == &_y, "overflow arithmetic: incompatible types"); \ +} while (0) +#else #define __OS_TYPE_CHECK(x, y) do { \ - _Static_assert(__builtin_types_compatible_p(typeof(x),typeof(y)), \ + _Static_assert(__builtin_types_compatible_p(__typeof(x),__typeof(y)), \ "overflow arithmetic: incompatible types"); \ } while (0) +#endif #define __os_add_overflow_func(T,U,V) _Generic((T), \ unsigned: __builtin_uadd_overflow, \ @@ -76,36 +102,51 @@ long long: __builtin_smulll_overflow \ )(T,U,V) -int __header_always_inline __attribute__((__warn_unused_result__)) -__os_warn_unused(const int x) -{ - return x; -} - -#define os_add_overflow(a, b, res) __os_warn_unused(({ \ +#define os_add_overflow(a, b, res) __os_warn_unused(__extension__({ \ __OS_TYPE_CHECK((a), (b)); \ __OS_TYPE_CHECK((b), *(res)); \ __os_add_overflow_func((a), (b), (res)); \ })) -#define os_add3_overflow(a, b, c, res) __os_warn_unused(({ \ - typeof(a) _tmp; \ - int _s, _t; \ - _s = os_add_overflow((a), (b), &_tmp); \ - _t = os_add_overflow((c), _tmp, (res)); \ - _s | _t; \ -})) - -#define os_sub_overflow(a, b, res) __os_warn_unused(({ \ +#define os_sub_overflow(a, b, res) __os_warn_unused(__extension__({ \ __OS_TYPE_CHECK((a), (b)); \ __OS_TYPE_CHECK((b), *(res)); \ __os_sub_overflow_func((a), (b), (res)); \ })) -#define os_mul_overflow(a, b, res) __os_warn_unused(({ \ +#define os_mul_overflow(a, b, res) __os_warn_unused(__extension__({ \ __OS_TYPE_CHECK((a), (b)); \ __OS_TYPE_CHECK((b), *(res)); \ __os_mul_overflow_func((a), (b), (res)); \ })) +#endif /* __has_builtin(...) 
*/ + +/* os_add3_overflow(a, b, c) -> (a + b + c) */ +#define os_add3_overflow(a, b, c, res) __os_warn_unused(__extension__({ \ + __typeof(*(res)) _tmp; \ + bool _s, _t; \ + _s = os_add_overflow((a), (b), &_tmp); \ + _t = os_add_overflow((c), _tmp, (res)); \ + _s | _t; \ +})) + +/* os_add_and_mul_overflow(a, b, x) -> (a + b)*x */ +#define os_add_and_mul_overflow(a, b, x, res) __os_warn_unused(__extension__({ \ + __typeof(*(res)) _tmp; \ + bool _s, _t; \ + _s = os_add_overflow((a), (b), &_tmp); \ + _t = os_mul_overflow((x), _tmp, (res)); \ + _s | _t; \ +})) + +/* os_mul_and_add_overflow(a, x, b) -> a*x + b */ +#define os_mul_and_add_overflow(a, x, b, res) __os_warn_unused(__extension__({ \ + __typeof(*(res)) _tmp; \ + bool _s, _t; \ + _s = os_mul_overflow((a), (x), &_tmp); \ + _t = os_add_overflow((b), _tmp, (res)); \ + _s | _t; \ +})) + #endif /* _OS_OVERFLOW_H */ diff --git a/libkern/os/trace.h b/libkern/os/trace.h new file mode 100644 index 000000000..f658f035c --- /dev/null +++ b/libkern/os/trace.h @@ -0,0 +1,893 @@ +/* + * Copyright (c) 2013-2015 Apple Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_LICENSE_HEADER_END@ + */ + +#ifndef __OS_TRACE_H__ +#define __OS_TRACE_H__ + +#include +#include +#include +#include +#include +#include +#if __has_include() +#include +#else +typedef void *xpc_object_t; +#endif + +#if !__GNUC__ +#error "must be GNU C compatible" +#endif + +__BEGIN_DECLS + +extern void *__dso_handle; + +OS_ALWAYS_INLINE +static inline void +_os_trace_verify_printf(const char *msg, ...) __attribute__((format(printf, 1, 2))) +{ +#pragma unused(msg) +} + +#if !defined OS_COUNT_ARGS +#define OS_COUNT_ARGS(...) OS_COUNT_ARGS1(, ##__VA_ARGS__, _8, _7, _6, _5, _4, _3, _2, _1, _0) +#define OS_COUNT_ARGS1(z, a, b, c, d, e, f, g, h, cnt, ...) cnt +#endif + +/* use old macros for anything less than iOS 10 and MacOS 10.12 */ +#if (defined(__IPHONE_OS_VERSION_MIN_REQUIRED) && __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_10_0) \ + || (defined(__WATCH_OS_VERSION_MIN_REQUIRED) && __WATCH_OS_VERSION_MIN_REQUIRED < __WATCHOS_3_0) \ + || (defined(__TV_OS_VERSION_MIN_REQUIRED) && __TV_OS_VERSION_MIN_REQUIRED < __TVOS_10_0) \ + || (defined(__MAC_OS_X_VERSION_MIN_REQUIRED) && __MAC_OS_X_VERSION_MIN_REQUIRED < __MAC_10_12) + +#define _os_trace_0(_l, _m, _t) __extension__({ \ + _os_trace_verify_printf(_l); \ + _os_trace_with_buffer(&__dso_handle, _m, _t, NULL, 0, NULL); \ + __asm__(""); /* avoid tailcall */ \ +}) + +#define _os_trace_1(_l, _m, _t, _1) __extension__({ \ + _Pragma("clang diagnostic push") \ + _Pragma("clang diagnostic ignored \"-Wpacked\"") \ + const __typeof__(_1) _c1 = _1; \ + _os_trace_verify_printf(_l, _c1); \ + const struct __attribute__((packed)) { \ + __typeof__(_c1) _f1; \ + uint8_t _s[1]; \ + uint8_t _cnt; \ + } _buf = { \ + ._f1 = _c1, ._s[0] = sizeof(_c1), \ + ._cnt = 1, \ + }; \ + _os_trace_with_buffer(&__dso_handle, _m, _t, &_buf, sizeof(_buf), NULL); \ + __asm__(""); /* avoid tailcall */ \ + _Pragma("clang diagnostic pop") \ +}) + +#define _os_trace_2(_l, _m, _t, _1, _2) __extension__({ \ + _Pragma("clang diagnostic push") \ + 
_Pragma("clang diagnostic ignored \"-Wpacked\"") \ + const __typeof__(_1) _c1 = _1; \ + const __typeof__(_2) _c2 = _2; \ + _os_trace_verify_printf(_l, _c1, _c2); \ + const struct __attribute__((packed)) { \ + __typeof__(_c1) _f1; \ + __typeof__(_c2) _f2; \ + uint8_t _s[2]; \ + uint8_t _cnt; \ + } _buf = { \ + ._f1 = _c1, ._s[0] = sizeof(_c1), \ + ._f2 = _c2, ._s[1] = sizeof(_c2), \ + ._cnt = 2, \ + }; \ + _os_trace_with_buffer(&__dso_handle, _m, _t, &_buf, sizeof(_buf), NULL); \ + __asm__(""); /* avoid tailcall */ \ + _Pragma("clang diagnostic pop") \ +}) + +#define _os_trace_3(_l, _m, _t, _1, _2, _3) __extension__({ \ + _Pragma("clang diagnostic push") \ + _Pragma("clang diagnostic ignored \"-Wpacked\"") \ + const __typeof__(_1) _c1 = _1; \ + const __typeof__(_2) _c2 = _2; \ + const __typeof__(_3) _c3 = _3; \ + _os_trace_verify_printf(_l, _c1, _c2, _c3); \ + const struct __attribute__((packed)) { \ + __typeof__(_c1) _f1; \ + __typeof__(_c2) _f2; \ + __typeof__(_c3) _f3; \ + uint8_t _s[3]; \ + uint8_t _cnt; \ + } _buf = { \ + ._f1 = _c1, ._s[0] = sizeof(_c1), \ + ._f2 = _c2, ._s[1] = sizeof(_c2), \ + ._f3 = _c3, ._s[2] = sizeof(_c3), \ + ._cnt = 3, \ + }; \ + _os_trace_with_buffer(&__dso_handle, _m, _t, &_buf, sizeof(_buf), NULL); \ + __asm__(""); /* avoid tailcall */ \ + _Pragma("clang diagnostic pop") \ +}) + +#define _os_trace_4(_l, _m, _t, _1, _2, _3, _4) __extension__({ \ + _Pragma("clang diagnostic push") \ + _Pragma("clang diagnostic ignored \"-Wpacked\"") \ + const __typeof__(_1) _c1 = _1; \ + const __typeof__(_2) _c2 = _2; \ + const __typeof__(_3) _c3 = _3; \ + const __typeof__(_4) _c4 = _4; \ + _os_trace_verify_printf(_l, _c1, _c2, _c3, _c4); \ + const struct __attribute__((packed)) { \ + __typeof__(_c1) _f1; \ + __typeof__(_c2) _f2; \ + __typeof__(_c3) _f3; \ + __typeof__(_c4) _f4; \ + uint8_t _s[4]; \ + uint8_t _cnt; \ + } _buf = { \ + ._f1 = _c1, ._s[0] = sizeof(_c1), \ + ._f2 = _c2, ._s[1] = sizeof(_c2), \ + ._f3 = _c3, ._s[2] = sizeof(_c3), \ + ._f4 
= _c4, ._s[3] = sizeof(_c4), \ + ._cnt = 4, \ + }; \ + _os_trace_with_buffer(&__dso_handle, _m, _t, &_buf, sizeof(_buf), NULL); \ + __asm__(""); /* avoid tailcall */ \ + _Pragma("clang diagnostic pop") \ +}) + +#define _os_trace_5(_l, _m, _t, _1, _2, _3, _4, _5) __extension__({ \ + _Pragma("clang diagnostic push") \ + _Pragma("clang diagnostic ignored \"-Wpacked\"") \ + const __typeof__(_1) _c1 = _1; \ + const __typeof__(_2) _c2 = _2; \ + const __typeof__(_3) _c3 = _3; \ + const __typeof__(_4) _c4 = _4; \ + const __typeof__(_5) _c5 = _5; \ + _os_trace_verify_printf(_l, _c1, _c2, _c3, _c4, _c5); \ + const struct __attribute__((packed)) { \ + __typeof__(_c1) _f1; \ + __typeof__(_c2) _f2; \ + __typeof__(_c3) _f3; \ + __typeof__(_c4) _f4; \ + __typeof__(_c5) _f5; \ + uint8_t _s[5]; \ + uint8_t _cnt; \ + } _buf = { \ + ._f1 = _c1, ._s[0] = sizeof(_c1), \ + ._f2 = _c2, ._s[1] = sizeof(_c2), \ + ._f3 = _c3, ._s[2] = sizeof(_c3), \ + ._f4 = _c4, ._s[3] = sizeof(_c4), \ + ._f5 = _c5, ._s[4] = sizeof(_c5), \ + ._cnt = 5, \ + }; \ + _os_trace_with_buffer(&__dso_handle, _m, _t, &_buf, sizeof(_buf), NULL); \ + __asm__(""); /* avoid tailcall */ \ + _Pragma("clang diagnostic pop") \ +}) + +#define _os_trace_6(_l, _m, _t, _1, _2, _3, _4, _5, _6) __extension__({ \ + _Pragma("clang diagnostic push") \ + _Pragma("clang diagnostic ignored \"-Wpacked\"") \ + const __typeof__(_1) _c1 = _1; \ + const __typeof__(_2) _c2 = _2; \ + const __typeof__(_3) _c3 = _3; \ + const __typeof__(_4) _c4 = _4; \ + const __typeof__(_5) _c5 = _5; \ + const __typeof__(_6) _c6 = _6; \ + _os_trace_verify_printf(_l, _c1, _c2, _c3, _c4, _c5, _c6); \ + const struct __attribute__((packed)) { \ + __typeof__(_c1) _f1; \ + __typeof__(_c2) _f2; \ + __typeof__(_c3) _f3; \ + __typeof__(_c4) _f4; \ + __typeof__(_c5) _f5; \ + __typeof__(_c6) _f6; \ + uint8_t _s[6]; \ + uint8_t _cnt; \ + } _buf = { \ + ._f1 = _c1, ._s[0] = sizeof(_c1), \ + ._f2 = _c2, ._s[1] = sizeof(_c2), \ + ._f3 = _c3, ._s[2] = sizeof(_c3), \ + ._f4 = 
_c4, ._s[3] = sizeof(_c4), \ + ._f5 = _c5, ._s[4] = sizeof(_c5), \ + ._f6 = _c6, ._s[5] = sizeof(_c6), \ + ._cnt = 6, \ + }; \ + _os_trace_with_buffer(&__dso_handle, _m, _t, &_buf, sizeof(_buf), NULL); \ + __asm__(""); /* avoid tailcall */ \ + _Pragma("clang diagnostic pop") \ +}) + +#define _os_trace_7(_l, _m, _t, _1, _2, _3, _4, _5, _6, _7) __extension__({ \ + _Pragma("clang diagnostic push") \ + _Pragma("clang diagnostic ignored \"-Wpacked\"") \ + const __typeof__(_1) _c1 = _1; \ + const __typeof__(_2) _c2 = _2; \ + const __typeof__(_3) _c3 = _3; \ + const __typeof__(_4) _c4 = _4; \ + const __typeof__(_5) _c5 = _5; \ + const __typeof__(_6) _c6 = _6; \ + const __typeof__(_7) _c7 = _7; \ + _os_trace_verify_printf(_l, _c1, _c2, _c3, _c4, _c5, _c6, _c7); \ + const struct __attribute__((packed)) { \ + __typeof__(_c1) _f1; \ + __typeof__(_c2) _f2; \ + __typeof__(_c3) _f3; \ + __typeof__(_c4) _f4; \ + __typeof__(_c5) _f5; \ + __typeof__(_c6) _f6; \ + __typeof__(_c7) _f7; \ + uint8_t _s[7]; \ + uint8_t _cnt; \ + } _buf = { \ + ._f1 = _c1, ._s[0] = sizeof(_c1), \ + ._f2 = _c2, ._s[1] = sizeof(_c2), \ + ._f3 = _c3, ._s[2] = sizeof(_c3), \ + ._f4 = _c4, ._s[3] = sizeof(_c4), \ + ._f5 = _c5, ._s[4] = sizeof(_c5), \ + ._f6 = _c6, ._s[5] = sizeof(_c6), \ + ._f7 = _c7, ._s[6] = sizeof(_c7), \ + ._cnt = 7, \ + }; \ + _os_trace_with_buffer(&__dso_handle, _m, _t, &_buf, sizeof(_buf), NULL); \ + __asm__(""); /* avoid tailcall */ \ + _Pragma("clang diagnostic pop") \ +}) + +#define _os_trace_with_payload_1(_l, _m, _t, _payload) __extension__({ \ + _os_trace_verify_printf(_l); \ + _os_trace_with_buffer(&__dso_handle, _m, _t, NULL, 0, _payload); \ + __asm__(""); /* avoid tailcall */ \ +}) + +#define _os_trace_with_payload_2(_l, _m, _t, _1, _payload) __extension__({ \ + _Pragma("clang diagnostic push") \ + _Pragma("clang diagnostic ignored \"-Wpacked\"") \ + const __typeof__(_1) _c1 = _1; \ + _os_trace_verify_printf(_l, _c1); \ + const struct __attribute__((packed)) { \ + 
__typeof__(_c1) _f1; \ + uint8_t _s[1]; \ + uint8_t _cnt; \ + } _buf = { \ + ._f1 = _c1, ._s[0] = sizeof(_c1), \ + ._cnt = 1, \ + }; \ + _os_trace_with_buffer(&__dso_handle, _m, _t, &_buf, sizeof(_buf), _payload); \ + __asm__(""); /* avoid tailcall */ \ + _Pragma("clang diagnostic pop") \ +}) + +#define _os_trace_with_payload_3(_l, _m, _t, _1, _2, _payload) __extension__({ \ + _Pragma("clang diagnostic push") \ + _Pragma("clang diagnostic ignored \"-Wpacked\"") \ + const __typeof__(_1) _c1 = _1; \ + const __typeof__(_2) _c2 = _2; \ + _os_trace_verify_printf(_l, _c1, _c2); \ + const struct __attribute__((packed)) { \ + __typeof__(_c1) _f1; \ + __typeof__(_c2) _f2; \ + uint8_t _s[2]; \ + uint8_t _cnt; \ + } _buf = { \ + ._f1 = _c1, ._s[0] = sizeof(_c1), \ + ._f2 = _c2, ._s[1] = sizeof(_c2), \ + ._cnt = 2, \ + }; \ + _os_trace_with_buffer(&__dso_handle, _m, _t, &_buf, sizeof(_buf), _payload); \ + __asm__(""); /* avoid tailcall */ \ + _Pragma("clang diagnostic pop") \ +}) + +#define _os_trace_with_payload_4(_l, _m, _t, _1, _2, _3, _payload) __extension__({ \ + _Pragma("clang diagnostic push") \ + _Pragma("clang diagnostic ignored \"-Wpacked\"") \ + const __typeof__(_1) _c1 = _1; \ + const __typeof__(_2) _c2 = _2; \ + const __typeof__(_3) _c3 = _3; \ + _os_trace_verify_printf(_l, _c1, _c2, _c3); \ + const struct __attribute__((packed)) { \ + __typeof__(_c1) _f1; \ + __typeof__(_c2) _f2; \ + __typeof__(_c3) _f3; \ + uint8_t _s[3]; \ + uint8_t _cnt; \ + } _buf = { \ + ._f1 = _c1, ._s[0] = sizeof(_c1), \ + ._f2 = _c2, ._s[1] = sizeof(_c2), \ + ._f3 = _c3, ._s[2] = sizeof(_c3), \ + ._cnt = 3, \ + }; \ + _os_trace_with_buffer(&__dso_handle, _m, _t, &_buf, sizeof(_buf), _payload); \ + __asm__(""); /* avoid tailcall */ \ + _Pragma("clang diagnostic pop") \ +}) + +#define _os_trace_with_payload_5(_l, _m, _t, _1, _2, _3, _4, _payload) __extension__({ \ + _Pragma("clang diagnostic push") \ + _Pragma("clang diagnostic ignored \"-Wpacked\"") \ + const __typeof__(_1) _c1 = _1; \ + 
const __typeof__(_2) _c2 = _2; \ + const __typeof__(_3) _c3 = _3; \ + const __typeof__(_4) _c4 = _4; \ + _os_trace_verify_printf(_l, _c1, _c2, _c3, _c4); \ + const struct __attribute__((packed)) { \ + __typeof__(_c1) _f1; \ + __typeof__(_c2) _f2; \ + __typeof__(_c3) _f3; \ + __typeof__(_c4) _f4; \ + uint8_t _s[4]; \ + uint8_t _cnt; \ + } _buf = { \ + ._f1 = _c1, ._s[0] = sizeof(_c1), \ + ._f2 = _c2, ._s[1] = sizeof(_c2), \ + ._f3 = _c3, ._s[2] = sizeof(_c3), \ + ._f4 = _c4, ._s[3] = sizeof(_c4), \ + ._cnt = 4, \ + }; \ + _os_trace_with_buffer(&__dso_handle, _m, _t, &_buf, sizeof(_buf), _payload); \ + __asm__(""); /* avoid tailcall */ \ + _Pragma("clang diagnostic pop") \ +}) + +#define _os_trace_with_payload_6(_l, _m, _t, _1, _2, _3, _4, _5, _payload) __extension__({ \ + _Pragma("clang diagnostic push") \ + _Pragma("clang diagnostic ignored \"-Wpacked\"") \ + const __typeof__(_1) _c1 = _1; \ + const __typeof__(_2) _c2 = _2; \ + const __typeof__(_3) _c3 = _3; \ + const __typeof__(_4) _c4 = _4; \ + const __typeof__(_5) _c5 = _5; \ + _os_trace_verify_printf(_l, _c1, _c2, _c3, _c4, _c5); \ + const struct __attribute__((packed)) { \ + __typeof__(_c1) _f1; \ + __typeof__(_c2) _f2; \ + __typeof__(_c3) _f3; \ + __typeof__(_c4) _f4; \ + __typeof__(_c5) _f5; \ + uint8_t _s[5]; \ + uint8_t _cnt; \ + } _buf = { \ + ._f1 = _c1, ._s[0] = sizeof(_c1), \ + ._f2 = _c2, ._s[1] = sizeof(_c2), \ + ._f3 = _c3, ._s[2] = sizeof(_c3), \ + ._f4 = _c4, ._s[3] = sizeof(_c4), \ + ._f5 = _c5, ._s[4] = sizeof(_c5), \ + ._cnt = 5, \ + }; \ + _os_trace_with_buffer(&__dso_handle, _m, _t, &_buf, sizeof(_buf), _payload); \ + __asm__(""); /* avoid tailcall */ \ + _Pragma("clang diagnostic pop") \ +}) + +#define _os_trace_with_payload_7(_l, _m, _t, _1, _2, _3, _4, _5, _6, _payload) __extension__({ \ + _Pragma("clang diagnostic push") \ + _Pragma("clang diagnostic ignored \"-Wpacked\"") \ + const __typeof__(_1) _c1 = _1; \ + const __typeof__(_2) _c2 = _2; \ + const __typeof__(_3) _c3 = _3; \ + const
__typeof__(_4) _c4 = _4; \ + const __typeof__(_5) _c5 = _5; \ + const __typeof__(_6) _c6 = _6; \ + _os_trace_verify_printf(_l, _c1, _c2, _c3, _c4, _c5, _c6); \ + const struct __attribute__((packed)) { \ + __typeof__(_c1) _f1; \ + __typeof__(_c2) _f2; \ + __typeof__(_c3) _f3; \ + __typeof__(_c4) _f4; \ + __typeof__(_c5) _f5; \ + __typeof__(_c6) _f6; \ + uint8_t _s[6]; \ + uint8_t _cnt; \ + } _buf = { \ + ._f1 = _c1, ._s[0] = sizeof(_c1), \ + ._f2 = _c2, ._s[1] = sizeof(_c2), \ + ._f3 = _c3, ._s[2] = sizeof(_c3), \ + ._f4 = _c4, ._s[3] = sizeof(_c4), \ + ._f5 = _c5, ._s[4] = sizeof(_c5), \ + ._f6 = _c6, ._s[5] = sizeof(_c6), \ + ._cnt = 6, \ + }; \ + _os_trace_with_buffer(&__dso_handle, _m, _t, &_buf, sizeof(_buf), _payload); \ + __asm__(""); /* avoid tailcall */ \ + _Pragma("clang diagnostic pop") \ +}) + +#define _os_trace_with_payload_8(_l, _m, _t, _1, _2, _3, _4, _5, _6, _7, _payload) __extension__({ \ + _Pragma("clang diagnostic push") \ + _Pragma("clang diagnostic ignored \"-Wpacked\"") \ + const __typeof__(_1) _c1 = _1; \ + const __typeof__(_2) _c2 = _2; \ + const __typeof__(_3) _c3 = _3; \ + const __typeof__(_4) _c4 = _4; \ + const __typeof__(_5) _c5 = _5; \ + const __typeof__(_6) _c6 = _6; \ + const __typeof__(_7) _c7 = _7; \ + _os_trace_verify_printf(_l, _c1, _c2, _c3, _c4, _c5, _c6, _c7); \ + const struct __attribute__((packed)) { \ + __typeof__(_c1) _f1; \ + __typeof__(_c2) _f2; \ + __typeof__(_c3) _f3; \ + __typeof__(_c4) _f4; \ + __typeof__(_c5) _f5; \ + __typeof__(_c6) _f6; \ + __typeof__(_c7) _f7; \ + uint8_t _s[7]; \ + uint8_t _cnt; \ + } _buf = { \ + ._f1 = _c1, ._s[0] = sizeof(_c1), \ + ._f2 = _c2, ._s[1] = sizeof(_c2), \ + ._f3 = _c3, ._s[2] = sizeof(_c3), \ + ._f4 = _c4, ._s[3] = sizeof(_c4), \ + ._f5 = _c5, ._s[4] = sizeof(_c5), \ + ._f6 = _c6, ._s[5] = sizeof(_c6), \ + ._f7 = _c7, ._s[6] = sizeof(_c7), \ + ._cnt = 7, \ + }; \ + _os_trace_with_buffer(&__dso_handle, _m, _t, &_buf, sizeof(_buf), _payload); \ + __asm__(""); /* avoid tailcall */ \ + 
_Pragma("clang diagnostic pop") \ +}) + +#define OS_TRACE_CALL(format, _m, _t, ...) __extension__({ \ + _Pragma("clang diagnostic push") \ + _Pragma("clang diagnostic ignored \"-Wc++98-compat-pedantic\"") \ + OS_CONCAT(_os_trace, OS_COUNT_ARGS(__VA_ARGS__))(format, _m, _t, ##__VA_ARGS__); \ + _Pragma("clang diagnostic pop") \ +}) + +#else + +// Use a new layout in Mac OS 10.12+ and iOS 10.0+ +#define OS_TRACE_CALL(_l, _m, _t, ...) __extension__({ \ + uint8_t buf[1024]; \ + _os_trace_verify_printf(_l, ##__VA_ARGS__); \ + size_t buf_size = _os_trace_encode(buf, sizeof(buf), _m, ##__VA_ARGS__); \ + _os_trace_internal(&__dso_handle, _t, _m, buf, buf_size, NULL); \ + __asm__(""); /* avoid tailcall */ \ +}) + +#define _os_trace_with_payload_1(_l, _m, _t, _payload) __extension__({ \ + _os_trace_verify_printf(_l); \ + _os_trace_internal(&__dso_handle, _t, _m, NULL, 0, _payload); \ + __asm__(""); /* avoid tailcall */ \ +}) + +#define _os_trace_with_payload_2(_l, _m, _t, _1, _payload) __extension__({ \ + _os_trace_verify_printf(_l, _1); \ + uint8_t buf[1024]; \ + size_t buf_size = _os_trace_encode(buf, sizeof(buf), _m, _1); \ + _os_trace_internal(&__dso_handle, _t, _m, buf, buf_size, _payload); \ + __asm__(""); /* avoid tailcall */ \ +}) + +#define _os_trace_with_payload_3(_l, _m, _t, _1, _2, _payload) __extension__({ \ + _os_trace_verify_printf(_l, _1, _2); \ + uint8_t buf[1024]; \ + size_t buf_size = _os_trace_encode(buf, sizeof(buf), _m, _1, _2); \ + _os_trace_internal(&__dso_handle, _t, _m, buf, buf_size, _payload); \ + __asm__(""); /* avoid tailcall */ \ +}) + +#define _os_trace_with_payload_4(_l, _m, _t, _1, _2, _3, _payload) __extension__({ \ + _os_trace_verify_printf(_l, _1, _2, _3); \ + uint8_t buf[1024]; \ + size_t buf_size = _os_trace_encode(buf, sizeof(buf), _m, _1, _2, _3); \ + _os_trace_internal(&__dso_handle, _t, _m, buf, buf_size, _payload); \ + __asm__(""); /* avoid tailcall */ \ +}) + +#define _os_trace_with_payload_5(_l, _m, _t, _1, _2, _3, _4, _payload) 
__extension__({ \ + _os_trace_verify_printf(_l, _1, _2, _3, _4); \ + uint8_t buf[1024]; \ + size_t buf_size = _os_trace_encode(buf, sizeof(buf), _m, _1, _2, _3, _4); \ + _os_trace_internal(&__dso_handle, _t, _m, buf, buf_size, _payload); \ + __asm__(""); /* avoid tailcall */ \ +}) + +#define _os_trace_with_payload_6(_l, _m, _t, _1, _2, _3, _4, _5, _payload) __extension__({ \ + _os_trace_verify_printf(_l, _1, _2, _3, _4, _5); \ + uint8_t buf[1024]; \ + size_t buf_size = _os_trace_encode(buf, sizeof(buf), _m, _1, _2, _3, _4, _5); \ + _os_trace_internal(&__dso_handle, _t, _m, buf, buf_size, _payload); \ + __asm__(""); /* avoid tailcall */ \ +}) + +#define _os_trace_with_payload_7(_l, _m, _t, _1, _2, _3, _4, _5, _6, _payload) __extension__({ \ + _os_trace_verify_printf(_l, _1, _2, _3, _4, _5, _6); \ + uint8_t buf[1024]; \ + size_t buf_size = _os_trace_encode(buf, sizeof(buf), _m, _1, _2, _3, _4, _5, _6); \ + _os_trace_internal(&__dso_handle, _t, _m, buf, buf_size, _payload); \ + __asm__(""); /* avoid tailcall */ \ +}) + +#define _os_trace_with_payload_8(_l, _m, _t, _1, _2, _3, _4, _5, _6, _7, _payload) __extension__({ \ + _os_trace_verify_printf(_l, _1, _2, _3, _4, _5, _6, _7); \ + uint8_t buf[1024]; \ + size_t buf_size = _os_trace_encode(buf, sizeof(buf), _m, _1, _2, _3, _4, _5, _6, _7); \ + _os_trace_internal(&__dso_handle, _t, _m, buf, buf_size, _payload); \ + __asm__(""); /* avoid tailcall */ \ +}) + +#endif /* if Mac OS >= 10.12 or iPhone OS >= 10.0 */ + +/*! + * + * @abstract + * Hashtags in trace messages + * + * @discussion + * Developers are encouraged to include hashtags in log messages, regardless of what API you use. + * A hashtag is composed of a hash (#) symbol, followed by at least three non-whitespace characters, + * terminated by whitespace or the end of the message. Hashtags may not begin with a number. + * + * Below is the list of predefined tags: + * #System - Message in the context of a system process. 
+ * #User - Message in the context of a user process. + * #Developer - Message in the context of software development. For example, deprecated APIs and debugging messages. + * #Attention - Message that should be investigated by a system administrator, because it may be a sign of a larger issue. + * For example, errors from a hard drive controller that typically occur when the drive is about to fail. + * #Critical - Message in the context of a critical event or critical failure. + * #Error - Message that is a noncritical error. + * #Comment - Message that is a comment. + * #Marker - Message that marks a change to divide the messages around it into those before and those after the change. + * #Clue - Message containing extra key/value pairs with additional information to help reconstruct the context. + * #Security - Message related to security concerns. + * #Filesystem - Message describing a file system related event. + * #Network - Message describing a network-related event. + * #Hardware - Message describing a hardware-related event. + * #CPU - Message describing CPU related event, e.g., initiating heavy work load + * #State - Message describing state changed, e.g., global state, preference, etc. + * #Graphics - Message describing significant graphics event + * #Disk - Message describing disk activity + * + */ + +#pragma mark - Other defines + +/*! + * @define OS_TRACE_TYPE_RELEASE + * Trace messages to be captured on a typical user install. These should be + * limited to things which improve diagnosis of a failure/crash/hang. Trace + * buffers are generally smaller on a production system. + */ +#define OS_TRACE_TYPE_RELEASE (1u << 0) + +/*! + * @define OS_TRACE_TYPE_DEBUG + * Trace messages to be captured while debugger or other development tool is + * attached to the originator. + */ +#define OS_TRACE_TYPE_DEBUG (1u << 1) + +/*! 
+ * @define OS_TRACE_TYPE_INFO + * Trace messages that are captured when a debugger is attached, system or + * Application mode has been increased to include additional information. + */ +#define OS_TRACE_TYPE_INFO (1u << 2) + +/*! + * @define OS_TRACE_TYPE_ERROR + * Trace the message as an error and force a collection as a failure may be + * imminent. + */ +#define OS_TRACE_TYPE_ERROR ((1u << 6) | (1u << 0)) + +/*! + * @define OS_TRACE_TYPE_FAULT + * Trace the message as a fatal error which forces a collection and a diagnostic + * to be initiated. + */ +#define OS_TRACE_TYPE_FAULT ((1u << 7) | (1u << 6) | (1u << 0)) + +/*! + * @typedef os_trace_payload_t + * A block that populates an xpc_object_t of type XPC_TYPE_DICTIONARY to represent + * complex data. This block will only be invoked under conditions where tools + * have attached to the process. The payload can be used to send arbitrary data + * via the trace call. Tools may use the data to validate state for integration + * tests or provide other introspection services. No assumptions are made about + * the format or structure of the data. + */ +typedef void (^os_trace_payload_t)(xpc_object_t xdict); + +#pragma mark - function declarations + +/*! + * @function os_trace + * + * @abstract + * Always inserts a trace message into a buffer pool for later decoding. + * + * @discussion + * Trace message that will be recorded on a typical user install. These should + * be limited to things which help diagnose a failure during postmortem + * analysis. Trace buffers are generally smaller on a production system. + * + * @param format + * A printf-style format string to generate a human-readable log message when + * the trace line is decoded. Only scalar types are supported, attempts + * to pass arbitrary strings will store a pointer that is unresolvable and + * will generate an error during decode. 
+ * + * os_trace("network event: %ld, last seen: %ld, avg: %g", event_id, last_seen, avg); + */ +#define os_trace(format, ...) __extension__({ \ + _Static_assert(__builtin_constant_p(format), "format must be a constant string"); \ + __attribute__((section("__TEXT,__os_trace"))) static const char _m[] = format; \ + OS_TRACE_CALL(format, _m, OS_TRACE_TYPE_RELEASE, ##__VA_ARGS__); \ +}) + + +#if (defined(__IPHONE_OS_VERSION_MIN_REQUIRED) && __IPHONE_OS_VERSION_MIN_REQUIRED >= __IPHONE_10_0) \ + || (defined(__WATCH_OS_VERSION_MIN_REQUIRED) && __WATCH_OS_VERSION_MIN_REQUIRED >= __WATCHOS_3_0) \ + || (defined(__TV_OS_VERSION_MIN_REQUIRED) && __TV_OS_VERSION_MIN_REQUIRED >= __TVOS_10_0) \ + || (defined(__MAC_OS_X_VERSION_MIN_REQUIRED) && __MAC_OS_X_VERSION_MIN_REQUIRED >= __MAC_10_12) + +/*! + * @function os_trace_info + * + * @abstract + * Optionally inserts a trace message containing additional information into a + * buffer pool for later decoding. + * + * @discussion + * Trace messages that will be captured when additional information is needed + * and are not captured by default. They will only be captured if the + * system/process/activity mode has been increased or if a Development tool has + * been attached to the process. + * + * @param format + * A printf-style format string that represents a human-readable message when + * the trace line is decoded. Only scalar types are supported, attempts + * to pass arbitrary strings will store a pointer that is unresolvable and + * will generate an error during decode. + * + * os_trace_info("network interface status %ld", status); + */ +#define os_trace_info(format, ...) __extension__({ \ + _Static_assert(__builtin_constant_p(format), "format must be a constant string"); \ + __attribute__((section("__TEXT,__os_trace"))) static const char _m[] = format; \ + OS_TRACE_CALL(format, _m, OS_TRACE_TYPE_INFO, ##__VA_ARGS__); \ +}) + +#endif + +/*! 
+ * @function os_trace_debug + * + * @abstract + * Insert debug trace message into a buffer pool for later decoding. + * + * @discussion + * Debug trace message to be recorded while debugger or other development tool is + * attached to the originator. This is transported interprocess to help + * diagnose the entire call chain including external helpers. + * + * @param format + * A printf-style format string that represents a human-readable message when + * the trace line is decoded. Only scalar types are supported, attempts + * to pass arbitrary strings will store a pointer that is unresolvable and + * will generate an error during decode. + * + * os_trace_debug("network interface status %ld", status); + */ +#define os_trace_debug(format, ...) __extension__({ \ + _Static_assert(__builtin_constant_p(format), "format must be a constant string"); \ + __attribute__((section("__TEXT,__os_trace"))) static const char _m[] = format; \ + OS_TRACE_CALL(format, _m, OS_TRACE_TYPE_DEBUG, ##__VA_ARGS__); \ +}) + +/*! + * @function os_trace_info_enabled + * + * @abstract + * Avoid unnecessary work for a trace point by checking if additional information + * is enabled. + * + * @discussion + * Avoid unnecessary work for a trace point by checking if additional information + * is enabled. Generally trace points should not involve expensive operations, but some + * circumstances warrant it. Use this function to avoid doing the work unless + * debug level trace messages are requested. + * + * if (os_trace_info_enabled()) { + * os_trace_info("value = %d, average = %d", + * [[dict objectForKey: @"myKey"] intValue], + * (int) [self getAverage: dict]); + * } + * + * @result + * Returns true if development mode is enabled. + */ +__OSX_AVAILABLE_STARTING(__MAC_10_12, __IPHONE_10_0) +OS_EXPORT OS_NOTHROW OS_WARN_RESULT +bool +os_trace_info_enabled(void); + +/*! 
+ * @function os_trace_debug_enabled + * + * @abstract + * Avoid unnecessary work for a trace point by checking if debug level is enabled. + * + * @discussion + * Avoid unnecessary work for a trace point by checking if debug level is enabled. + * Generally trace points should not involve expensive operations, but some + * circumstances warrant it. Use this function to avoid doing the work unless + * debug level trace messages are requested. + * + * if (os_trace_debug_enabled()) { + * os_trace_debug("value = %d, average = %d", + * [[dict objectForKey: @"myKey"] intValue], + * (int) [self getAverage: dict]); + * } + * + * @result + * Returns true if debug mode is enabled. + */ +__OSX_AVAILABLE(10.10) __IOS_AVAILABLE(8.0) __WATCHOS_AVAILABLE(1.0) __TVOS_AVAILABLE(9.0) +OS_EXPORT OS_NOTHROW OS_WARN_RESULT +bool +os_trace_debug_enabled(void); + +/*! + * @function os_trace_error + * + * @abstract + * Trace the message as an error and force a collection of the trace buffer as a + * failure may be imminent. + * + * @discussion + * Trace the message as an error and force a collection of the trace buffer as a + * failure may be imminent. + * + * @param format + * A printf-style format string to generate a human-readable log message when + * the trace line is decoded. Only scalar types are supported, attempts + * to pass arbitrary strings will store a pointer that is unresolvable and + * will generate an error during decode. + * + * os_trace_error("socket %d connection timeout %ld", fd, secs); + */ +#define os_trace_error(format, ...) __extension__({ \ + _Static_assert(__builtin_constant_p(format), "format must be a constant string"); \ + __attribute__((section("__TEXT,__os_trace"))) static const char _m[] = format; \ + OS_TRACE_CALL(format, _m, OS_TRACE_TYPE_ERROR, ##__VA_ARGS__); \ +}) + +/*! + * @function os_trace_fault + * + * @abstract + * Trace the message as a fault which forces a collection of the trace buffer + * and diagnostic of the activity. 
+ * + * @discussion + * Trace the message as a fault which forces a collection of the trace buffer + * and diagnostic of the activity. + * + * @param format + * A printf-style format string to generate a human-readable log message when + * the trace line is decoded. Only scalar types are supported, attempts + * to pass arbitrary strings will store a pointer that is unresolvable and + * will generate an error during decode. + * + * os_trace_fault("failed to lookup uid %d - aborting", uid); + */ +#define os_trace_fault(format, ...) __extension__({ \ + _Static_assert(__builtin_constant_p(format), "format must be a constant string"); \ + __attribute__((section("__TEXT,__os_trace"))) static const char _m[] = format; \ + OS_TRACE_CALL(format, _m, OS_TRACE_TYPE_FAULT, ##__VA_ARGS__); \ +}) + +#if __has_include() +/*! + * @function os_trace_with_payload + * + * @abstract + * Add a trace entry containing the provided values and call the block if + * appropriate. + * + * @discussion + * Will insert a trace entry into a limited ring buffer for an activity or + * process. Trace points are for recording interesting data that would improve + * diagnosis of unexpected crashes, failures and hangs. The block will only be + * called under the required conditions. + * + * @param format + * A printf-style format string to generate a human-readable log message when + * the trace line is decoded. Only scalar types are supported. Attempts + * to pass arbitrary strings will store a pointer that is unresolvable and + * will generate an error during decode. + * + * The final parameter must be a block of type os_trace_payload_t. + * + * os_trace_with_payload("network event %ld", event, ^(xpc_object_t xdict) { + * + * // validate the network interface and address were what was expected + * xpc_dictionary_set_string(xdict, "network", ifp->ifa_name); + * xpc_dictionary_set_string(xdict, "ip_address", _get_address(ifp)); + * }); + */ +#define os_trace_with_payload(format, ...)
__extension__({ \ + _Static_assert(__builtin_constant_p(format), "format must be a constant string"); \ + __attribute__((section("__TEXT,__os_trace"))) static const char _m[] = format; \ + OS_CONCAT(_os_trace_with_payload, OS_COUNT_ARGS(__VA_ARGS__))(format, _m, OS_TRACE_TYPE_RELEASE, ##__VA_ARGS__); \ +}) + +#define os_trace_info_with_payload(format, ...) __extension__({ \ + _Static_assert(__builtin_constant_p(format), "format must be a constant string"); \ + __attribute__((section("__TEXT,__os_trace"))) static const char _m[] = format; \ + OS_CONCAT(_os_trace_with_payload, OS_COUNT_ARGS(__VA_ARGS__))(format, _m, OS_TRACE_TYPE_INFO, ##__VA_ARGS__); \ +}) + +#define os_trace_debug_with_payload(format, ...) __extension__({ \ + _Static_assert(__builtin_constant_p(format), "format must be a constant string"); \ + __attribute__((section("__TEXT,__os_trace"))) static const char _m[] = format; \ + OS_CONCAT(_os_trace_with_payload, OS_COUNT_ARGS(__VA_ARGS__))(format, _m, OS_TRACE_TYPE_DEBUG, ##__VA_ARGS__); \ +}) + +#define os_trace_error_with_payload(format, ...) __extension__({ \ + _Static_assert(__builtin_constant_p(format), "format must be a constant string"); \ + __attribute__((section("__TEXT,__os_trace"))) static const char _m[] = format; \ + OS_CONCAT(_os_trace_with_payload, OS_COUNT_ARGS(__VA_ARGS__))(format, _m, OS_TRACE_TYPE_ERROR, ##__VA_ARGS__); \ +}) + +#define os_trace_fault_with_payload(format, ...) 
__extension__({ \ + _Static_assert(__builtin_constant_p(format), "format must be a constant string"); \ + __attribute__((section("__TEXT,__os_trace"))) static const char _m[] = format; \ + OS_CONCAT(_os_trace_with_payload, OS_COUNT_ARGS(__VA_ARGS__))(format, _m, OS_TRACE_TYPE_FAULT, ##__VA_ARGS__); \ +}) + +#endif // __has_include() + +// TODO: change this once we have compiler support +__OSX_AVAILABLE_STARTING(__MAC_10_12, __IPHONE_10_0) +OS_EXPORT OS_NOTHROW +size_t +_os_trace_encode(uint8_t *buf, size_t buf_size, const char *format, ...); + +__OSX_AVAILABLE_STARTING(__MAC_10_12, __IPHONE_10_0) +OS_EXPORT OS_NOTHROW +void +_os_trace_internal(void *dso, uint8_t type, const char *format, const uint8_t *buf, size_t buf_size, os_trace_payload_t payload); + +/*! + * @function _os_trace_with_buffer + * + * @abstract + * Internal function to support pre-encoded buffer. + */ +__OSX_AVAILABLE(10.10) __IOS_AVAILABLE(8.0) __WATCHOS_AVAILABLE(1.0) __TVOS_AVAILABLE(9.0) +OS_EXPORT OS_NOTHROW +void +_os_trace_with_buffer(void *dso, const char *message, uint8_t type, const void *buffer, size_t buffer_size, os_trace_payload_t payload); + +__END_DECLS + +#endif // __OS_TRACE_H__ diff --git a/libkern/os/trace_internal.h b/libkern/os/trace_internal.h new file mode 100644 index 000000000..3c5b31116 --- /dev/null +++ b/libkern/os/trace_internal.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2013-2015 Apple Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#ifndef libtrace_trace_internal_h +#define libtrace_trace_internal_h + +#include +#include +#include + +__BEGIN_DECLS + +typedef union { + struct { +#if __LP64__ + uintptr_t pc : 48; +#else + uintptr_t pc; +#endif + // not encoded + firehose_tracepoint_flags_t flags; + uintptr_t dso; + uuid_t uuid; + }; + + uint32_t offset; + uint64_t vlocation : 48; // we never use the full 64-bits + +#if defined(__LP64__) + uint8_t encode_value[6]; // 48-bits +#else + uint8_t encode_value[sizeof(uintptr_t)]; +#endif +} os_trace_location_u; + +typedef os_trace_location_u *os_trace_location_t; + +OS_ALWAYS_INLINE +inline uint32_t +_os_trace_offset(const void *dso, const void *addr, _firehose_tracepoint_flags_activity_t flags __unused) +{ + return (uint32_t) ((uintptr_t)addr - (uintptr_t)dso); +} + +bool +_os_trace_addr_in_text_segment(const void *dso, const void *addr); + +__END_DECLS + +#endif diff --git a/libkern/stdio/scanf.c b/libkern/stdio/scanf.c index 0bc3a4363..82791cdde 100644 --- a/libkern/stdio/scanf.c +++ b/libkern/stdio/scanf.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2004-2016 Apple Computer, Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -160,8 +160,11 @@ vsscanf(const char *inp, char const *fmt0, va_list ap) if (c == 0) return (nassigned); if (isspace(c)) { - while (inr > 0 && isspace(*inp)) - nread++, inr--, inp++; + while (inr > 0 && isspace(*inp)) { + nread++; + inr--; + inp++; + } continue; } if (c != '%') @@ -180,7 +183,8 @@ again: c = *fmt++; goto input_failure; if (*inp != c) goto match_failure; - inr--, inp++; + inr--; + inp++; nread++; continue; @@ -352,7 +356,9 @@ again: c = *fmt++; if (flags & SUPPRESS) { n = 0; while (ccltab[(unsigned char)*inp]) { - n++, inr--, inp++; + n++; + inr--; + inp++; if (--width == 0) break; if (inr <= 0) { @@ -393,7 +399,9 @@ again: c = *fmt++; if (flags & SUPPRESS) { n = 0; while (!isspace(*inp)) { - n++, inr--, inp++; + n++; + inr--; + inp++; if (--width == 0) break; if (inr <= 0) @@ -651,7 +659,6 @@ __sccl(char *tab, const u_char *fmt) * This too is permitted by the standard.... */ goto doswitch; - break; case ']': /* end of scanset */ return (fmt); diff --git a/libkern/uuid/uuid.c b/libkern/uuid/uuid.c index 217b6b667..ce69ad766 100644 --- a/libkern/uuid/uuid.c +++ b/libkern/uuid/uuid.c @@ -42,8 +42,6 @@ extern int uuid_get_ethernet(u_int8_t *); -UUID_DEFINE(UUID_NULL, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); - static void read_node(uint8_t *node) { diff --git a/libkern/zlib/adler32.c b/libkern/zlib/adler32.c index e4b6756e9..c15ae8e24 100644 --- a/libkern/zlib/adler32.c +++ b/libkern/zlib/adler32.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2008 Apple Inc. All rights reserved. + * Copyright (c) 2008-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). 
You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* adler32.c -- compute the Adler-32 checksum of a data stream @@ -87,10 +87,8 @@ #endif /* ========================================================================= */ -uLong ZEXPORT adler32(adler, buf, len) - uLong adler; - const Bytef *buf; - uInt len; +uLong ZEXPORT +adler32(uLong adler, const Bytef *buf, uInt len) { unsigned long sum2; unsigned n; @@ -159,10 +157,8 @@ uLong ZEXPORT adler32(adler, buf, len) } /* ========================================================================= */ -uLong ZEXPORT adler32_combine(adler1, adler2, len2) - uLong adler1; - uLong adler2; - z_off_t len2; +uLong ZEXPORT +adler32_combine(uLong adler1, uLong adler2, z_off_t len2) { unsigned long sum1; unsigned long sum2; diff --git a/libkern/zlib/compress.c b/libkern/zlib/compress.c index 274008101..c3d69b2d0 100644 --- a/libkern/zlib/compress.c +++ b/libkern/zlib/compress.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2008 Apple Inc. All rights reserved. + * Copyright (c) 2008-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* compress.c -- compress a memory buffer @@ -50,12 +50,9 @@ memory, Z_BUF_ERROR if there was not enough room in the output buffer, Z_STREAM_ERROR if the level parameter is invalid. 
*/ -int ZEXPORT compress2 (dest, destLen, source, sourceLen, level) - Bytef *dest; - uLongf *destLen; - const Bytef *source; - uLong sourceLen; - int level; +int ZEXPORT +compress2(Bytef *dest, uLongf *destLen, const Bytef *source, uLong sourceLen, + int level) { z_stream stream; int err; @@ -90,11 +87,8 @@ int ZEXPORT compress2 (dest, destLen, source, sourceLen, level) /* =========================================================================== */ -int ZEXPORT compress (dest, destLen, source, sourceLen) - Bytef *dest; - uLongf *destLen; - const Bytef *source; - uLong sourceLen; +int ZEXPORT +compress(Bytef *dest, uLongf *destLen, const Bytef *source, uLong sourceLen) { return compress2(dest, destLen, source, sourceLen, Z_DEFAULT_COMPRESSION); } @@ -103,8 +97,8 @@ int ZEXPORT compress (dest, destLen, source, sourceLen) If the default memLevel or windowBits for deflateInit() is changed, then this function needs to be updated. */ -uLong ZEXPORT compressBound (sourceLen) - uLong sourceLen; +uLong ZEXPORT +compressBound(uLong sourceLen) { return sourceLen + (sourceLen >> 12) + (sourceLen >> 14) + 11; } diff --git a/libkern/zlib/crc32.c b/libkern/zlib/crc32.c index 4cafa3157..ac0acac44 100644 --- a/libkern/zlib/crc32.c +++ b/libkern/zlib/crc32.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2008 Apple Inc. All rights reserved. + * Copyright (c) 2008-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. 
- * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* crc32.c -- compute the CRC-32 of a data stream @@ -131,7 +131,8 @@ local void make_crc_table OF((void)); allow for word-at-a-time CRC calculation for both big-endian and little- endian machines, where a word is four bytes. */ -local void make_crc_table() +local void +make_crc_table(void) { unsigned long c; int n, k; @@ -208,9 +209,8 @@ local void make_crc_table() } #ifdef MAKECRCH -local void write_table(out, table) - FILE *out; - const unsigned long FAR *table; +local void +write_table(FILE *out, const unsigned long FAR *table) { int n; @@ -230,7 +230,8 @@ local void write_table(out, table) /* ========================================================================= * This function can be used by asm versions of crc32() */ -const unsigned long FAR * ZEXPORT get_crc_table() +const unsigned long FAR * ZEXPORT +get_crc_table(void) { #ifdef DYNAMIC_CRC_TABLE if (crc_table_empty) @@ -244,10 +245,8 @@ const unsigned long FAR * ZEXPORT get_crc_table() #define DO8 DO1; DO1; DO1; DO1; DO1; DO1; DO1; DO1 /* ========================================================================= */ -unsigned long ZEXPORT z_crc32(crc, buf, len) - unsigned long crc; - const unsigned char FAR *buf; - unsigned len; +unsigned long ZEXPORT +z_crc32(unsigned long crc, const unsigned char FAR *buf, unsigned len) { if (buf == Z_NULL) return 0UL; @@ -287,13 +286,11 @@ unsigned long ZEXPORT z_crc32(crc, buf, len) 
#define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4 /* ========================================================================= */ -local unsigned long crc32_little(crc, buf, len) - unsigned long crc; - const unsigned char FAR *buf; - unsigned len; +local unsigned long +crc32_little(unsigned long crc, const unsigned char FAR *buf, unsigned len) { - register u4 c; - register const u4 FAR *buf4; + u4 c; + const u4 FAR *buf4; c = (u4)crc; c = ~c; @@ -327,13 +324,11 @@ local unsigned long crc32_little(crc, buf, len) #define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4 /* ========================================================================= */ -local unsigned long crc32_big(crc, buf, len) - unsigned long crc; - const unsigned char FAR *buf; - unsigned len; +local unsigned long +crc32_big(unsigned long crc, const unsigned char FAR *buf, unsigned len) { - register u4 c; - register const u4 FAR *buf4; + u4 c; + const u4 FAR *buf4; c = REV((u4)crc); c = ~c; @@ -367,9 +362,8 @@ local unsigned long crc32_big(crc, buf, len) #define GF2_DIM 32 /* dimension of GF(2) vectors (length of CRC) */ /* ========================================================================= */ -local unsigned long gf2_matrix_times(mat, vec) - unsigned long *mat; - unsigned long vec; +local unsigned long +gf2_matrix_times(unsigned long *mat, unsigned long vec) { unsigned long sum; @@ -384,9 +378,8 @@ local unsigned long gf2_matrix_times(mat, vec) } /* ========================================================================= */ -local void gf2_matrix_square(square, mat) - unsigned long *square; - unsigned long *mat; +local void +gf2_matrix_square(unsigned long *square, unsigned long *mat) { int n; @@ -395,10 +388,8 @@ local void gf2_matrix_square(square, mat) } /* ========================================================================= */ -uLong ZEXPORT z_crc32_combine(crc1, crc2, len2) - uLong crc1; - uLong crc2; - z_off_t len2; +uLong ZEXPORT 
+z_crc32_combine(uLong crc1, uLong crc2, z_off_t len2) { int n; unsigned long row; diff --git a/libkern/zlib/deflate.c b/libkern/zlib/deflate.c index 6323a0e18..f902d2c9e 100644 --- a/libkern/zlib/deflate.c +++ b/libkern/zlib/deflate.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2008 Apple Inc. All rights reserved. + * Copyright (c) 2008-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. 
- * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* deflate.c -- compress data using the deflation algorithm @@ -228,11 +228,8 @@ struct static_tree_desc_s {int dummy;}; /* for buggy compilers */ zmemzero((Bytef *)s->head, (unsigned)(s->hash_size-1)*sizeof(*s->head)); /* ========================================================================= */ -int ZEXPORT deflateInit_(strm, level, version, stream_size) - z_streamp strm; - int level; - const char *version; - int stream_size; +int ZEXPORT +deflateInit_(z_streamp strm, int level, const char *version, int stream_size) { return deflateInit2_(strm, level, Z_DEFLATED, MAX_WBITS, DEF_MEM_LEVEL, Z_DEFAULT_STRATEGY, version, stream_size); @@ -240,16 +237,10 @@ int ZEXPORT deflateInit_(strm, level, version, stream_size) } /* ========================================================================= */ -int ZEXPORT deflateInit2_(strm, level, method, windowBits, memLevel, strategy, - version, stream_size) - z_streamp strm; - int level; - int method; - int windowBits; - int memLevel; - int strategy; - const char *version; - int stream_size; +int ZEXPORT +deflateInit2_(z_streamp strm, int level, int method, int windowBits, + int memLevel, int strategy, const char *version, + int stream_size) { deflate_state *s; int wrap = 1; @@ -341,10 +332,8 @@ int ZEXPORT deflateInit2_(strm, level, method, windowBits, memLevel, strategy, } /* ========================================================================= */ -int ZEXPORT deflateSetDictionary (strm, dictionary, dictLength) - z_streamp strm; - const Bytef *dictionary; - uInt dictLength; +int ZEXPORT +deflateSetDictionary(z_streamp strm, const Bytef *dictionary, uInt dictLength) { deflate_state *s; uInt length = dictLength; @@ -384,7 +373,8 @@ int ZEXPORT deflateSetDictionary (strm, dictionary, dictLength) /* ========================================================================= */ -ZEXTERN int ZEXPORT deflateResetWithIO(z_streamp strm, z_input_func zinput, z_output_func 
zoutput) +ZEXTERN int ZEXPORT +deflateResetWithIO(z_streamp strm, z_input_func zinput, z_output_func zoutput) { int zerr; @@ -397,8 +387,8 @@ ZEXTERN int ZEXPORT deflateResetWithIO(z_streamp strm, z_input_func zinput, z_ou /* ========================================================================= */ -int ZEXPORT deflateReset (strm) - z_streamp strm; +int ZEXPORT +deflateReset(z_streamp strm) { deflate_state *s; @@ -435,9 +425,8 @@ int ZEXPORT deflateReset (strm) } /* ========================================================================= */ -int ZEXPORT deflateSetHeader (strm, head) - z_streamp strm; - gz_headerp head; +int ZEXPORT +deflateSetHeader(z_streamp strm, gz_headerp head) { if (strm == Z_NULL || strm->state == Z_NULL) return Z_STREAM_ERROR; if (strm->state->wrap != 2) return Z_STREAM_ERROR; @@ -446,10 +435,8 @@ int ZEXPORT deflateSetHeader (strm, head) } /* ========================================================================= */ -int ZEXPORT deflatePrime (strm, bits, value) - z_streamp strm; - int bits; - int value; +int ZEXPORT +deflatePrime(z_streamp strm, int bits, int value) { if (strm == Z_NULL || strm->state == Z_NULL) return Z_STREAM_ERROR; strm->state->bi_valid = bits; @@ -458,10 +445,8 @@ int ZEXPORT deflatePrime (strm, bits, value) } /* ========================================================================= */ -int ZEXPORT deflateParams(strm, level, strategy) - z_streamp strm; - int level; - int strategy; +int ZEXPORT +deflateParams(z_streamp strm, int level, int strategy) { deflate_state *s; compress_func func; @@ -496,12 +481,9 @@ int ZEXPORT deflateParams(strm, level, strategy) } /* ========================================================================= */ -int ZEXPORT deflateTune(strm, good_length, max_lazy, nice_length, max_chain) - z_streamp strm; - int good_length; - int max_lazy; - int nice_length; - int max_chain; +int ZEXPORT +deflateTune(z_streamp strm, int good_length, int max_lazy, int nice_length, + int max_chain) { 
deflate_state *s; @@ -531,9 +513,8 @@ int ZEXPORT deflateTune(strm, good_length, max_lazy, nice_length, max_chain) * But even the conservative upper bound of about 14% expansion does not * seem onerous for output buffer allocation. */ -uLong ZEXPORT deflateBound(strm, sourceLen) - z_streamp strm; - uLong sourceLen; +uLong ZEXPORT +deflateBound(z_streamp strm, uLong sourceLen) { deflate_state *s; uLong destLen; @@ -560,9 +541,8 @@ uLong ZEXPORT deflateBound(strm, sourceLen) * IN assertion: the stream state is correct and there is enough room in * pending_buf. */ -local void putShortMSB (s, b) - deflate_state *s; - uInt b; +local void +putShortMSB(deflate_state *s, uInt b) { put_byte(s, (Byte)(b >> 8)); put_byte(s, (Byte)(b & 0xff)); @@ -574,8 +554,8 @@ local void putShortMSB (s, b) * to avoid allocating a large strm->next_out buffer and copying into it. * (See also read_buf()). */ -local void flush_pending(strm) - z_streamp strm; +local void +flush_pending(z_streamp strm) { unsigned len = strm->state->pending; @@ -598,9 +578,8 @@ local void flush_pending(strm) } /* ========================================================================= */ -int ZEXPORT deflate (strm, flush) - z_streamp strm; - int flush; +int ZEXPORT +deflate(z_streamp strm, int flush) { int old_flush; /* value of flush param for previous deflate call */ deflate_state *s; @@ -905,8 +884,8 @@ int ZEXPORT deflate (strm, flush) } /* ========================================================================= */ -int ZEXPORT deflateEnd (strm) - z_streamp strm; +int ZEXPORT +deflateEnd(z_streamp strm) { int status; @@ -940,9 +919,8 @@ int ZEXPORT deflateEnd (strm) * To simplify the source, this is not supported for 16-bit MSDOS (which * doesn't have enough memory anyway to duplicate compression states). 
*/ -int ZEXPORT deflateCopy (dest, source) - z_streamp dest; - z_streamp source; +int ZEXPORT +deflateCopy(z_streamp dest, z_streamp source) { #ifdef MAXSEG_64K return Z_STREAM_ERROR; @@ -1002,10 +980,8 @@ int ZEXPORT deflateCopy (dest, source) * allocating a large strm->next_in buffer and copying from it. * (See also flush_pending()). */ -local int read_buf(strm, buf, size) - z_streamp strm; - Bytef *buf; - unsigned size; +local int +read_buf(z_streamp strm, Bytef *buf, unsigned size) { unsigned len = strm->avail_in; @@ -1032,8 +1008,8 @@ local int read_buf(strm, buf, size) /* =========================================================================== * Initialize the "longest match" routines for a new zlib stream */ -local void lm_init (s) - deflate_state *s; +local void +lm_init(deflate_state *s) { s->window_size = (ulg)2L*s->w_size; @@ -1072,15 +1048,15 @@ local void lm_init (s) #ifndef ASMV /* For 80x86 and 680x0, an optimized version will be provided in match.asm or * match.S. The code will be functionally equivalent. + * @param cur_match current match */ -local uInt longest_match(s, cur_match) - deflate_state *s; - IPos cur_match; /* current match */ +local uInt +longest_match(deflate_state *s, IPos cur_match) { unsigned chain_length = s->max_chain_length;/* max hash chain length */ - register Bytef *scan = s->window + s->strstart; /* current string */ - register Bytef *match; /* matched string */ - register int len; /* length of current match */ + Bytef *scan = s->window + s->strstart; /* current string */ + Bytef *match; /* matched string */ + int len; /* length of current match */ int best_len = s->prev_length; /* best match length so far */ int nice_match = s->nice_match; /* stop if match long enough */ IPos limit = s->strstart > (IPos)MAX_DIST(s) ? @@ -1095,13 +1071,13 @@ local uInt longest_match(s, cur_match) /* Compare two bytes at a time. Note: this is not always beneficial. * Try with and without -DUNALIGNED_OK to check. 
*/ - register Bytef *strend = s->window + s->strstart + MAX_MATCH - 1; - register ush scan_start = *(ushf*)scan; - register ush scan_end = *(ushf*)(scan+best_len-1); + Bytef *strend = s->window + s->strstart + MAX_MATCH - 1; + ush scan_start = *(ushf*)scan; + ush scan_end = *(ushf*)(scan+best_len-1); #else - register Bytef *strend = s->window + s->strstart + MAX_MATCH; - register Byte scan_end1 = scan[best_len-1]; - register Byte scan_end = scan[best_len]; + Bytef *strend = s->window + s->strstart + MAX_MATCH; + Byte scan_end1 = scan[best_len-1]; + Byte scan_end = scan[best_len]; #endif /* The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple of 16. @@ -1178,7 +1154,8 @@ local uInt longest_match(s, cur_match) * are always equal when the other bytes match, given that * the hash keys are equal and that HASH_BITS >= 8. */ - scan += 2, match++; + scan += 2; + match++; Assert(*scan == *match, "match[2]?"); /* We check for insufficient lookahead only every 8th comparison; @@ -1220,15 +1197,15 @@ local uInt longest_match(s, cur_match) /* --------------------------------------------------------------------------- * Optimized version for level == 1 or strategy == Z_RLE only + * @param cur_match current match */ -local uInt longest_match_fast(s, cur_match) - deflate_state *s; - IPos cur_match; /* current match */ +local uInt +longest_match_fast(deflate_state *s, IPos cur_match) { - register Bytef *scan = s->window + s->strstart; /* current string */ - register Bytef *match; /* matched string */ - register int len; /* length of current match */ - register Bytef *strend = s->window + s->strstart + MAX_MATCH; + Bytef *scan = s->window + s->strstart; /* current string */ + Bytef *match; /* matched string */ + int len; /* length of current match */ + Bytef *strend = s->window + s->strstart + MAX_MATCH; /* The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple of 16. * It is easy to get rid of this optimization if necessary. 
@@ -1251,7 +1228,8 @@ local uInt longest_match_fast(s, cur_match) * are always equal when the other bytes match, given that * the hash keys are equal and that HASH_BITS >= 8. */ - scan += 2, match += 2; + scan += 2; + match += 2; Assert(*scan == *match, "match[2]?"); /* We check for insufficient lookahead only every 8th comparison; @@ -1278,10 +1256,8 @@ local uInt longest_match_fast(s, cur_match) /* =========================================================================== * Check that the match at match_start is indeed a match. */ -local void check_match(s, start, match, length) - deflate_state *s; - IPos start, match; - int length; +local void +check_match(deflate_state *s, IPos start, IPos match, int length) { /* check that the match is indeed a match */ if (zmemcmp(s->window + match, @@ -1312,11 +1288,11 @@ local void check_match(s, start, match, length) * performed for at least two bytes (required for the zip translate_eol * option -- not supported here). */ -local void fill_window(s) - deflate_state *s; +local void +fill_window(deflate_state *s) { - register unsigned n, m; - register Posf *p; + unsigned n, m; + Posf *p; unsigned more; /* Amount of free space at the end of the window. */ uInt wsize = s->w_size; @@ -1436,9 +1412,8 @@ local void fill_window(s) * NOTE: this function should be optimized to avoid extra copying from * window to pending_buf. */ -local block_state deflate_stored(s, flush) - deflate_state *s; - int flush; +local block_state +deflate_stored(deflate_state *s, int flush) { /* Stored blocks are limited to 0xffff bytes, pending_buf is limited * to pending_buf_size, and each stored block has a 5 byte header: @@ -1494,9 +1469,8 @@ local block_state deflate_stored(s, flush) * new strings in the dictionary only for unmatched strings or for short * matches. It is used only for the fast compression options. 
*/ -local block_state deflate_fast(s, flush) - deflate_state *s; - int flush; +local block_state +deflate_fast(deflate_state *s, int flush) { IPos hash_head = NIL; /* head of the hash chain */ int bflush; /* set if current block must be flushed */ @@ -1600,9 +1574,8 @@ local block_state deflate_fast(s, flush) * evaluation for matches: a match is finally adopted only if there is * no better match at the next window position. */ -local block_state deflate_slow(s, flush) - deflate_state *s; - int flush; +local block_state +deflate_slow(deflate_state *s, int flush) { IPos hash_head = NIL; /* head of hash chain */ int bflush; /* set if current block must be flushed */ @@ -1631,7 +1604,8 @@ local block_state deflate_slow(s, flush) /* Find the longest match, discarding those <= prev_length. */ - s->prev_length = s->match_length, s->prev_match = s->match_start; + s->prev_length = s->match_length; + s->prev_match = s->match_start; s->match_length = MIN_MATCH-1; if (hash_head != NIL && s->prev_length < s->max_lazy_match && @@ -1729,9 +1703,8 @@ local block_state deflate_slow(s, flush) * one. Do not maintain a hash table. (It will be regenerated if this run of * deflate switches away from Z_RLE.) */ -local block_state deflate_rle(s, flush) - deflate_state *s; - int flush; +local block_state +deflate_rle(deflate_state *s, int flush) { int bflush; /* set if current block must be flushed */ uInt run; /* length of run */ @@ -1786,7 +1759,8 @@ local block_state deflate_rle(s, flush) #if XNU_KERNEL_PRIVATE -uLong zlib_deflate_memory_size(int wbits, int memlevel) +uLong +zlib_deflate_memory_size(int wbits, int memlevel) { return (31 + sizeof(deflate_state) + (1 << (wbits + 2)) + (1 << (memlevel + 9))); } diff --git a/libkern/zlib/gzio.c b/libkern/zlib/gzio.c index c6ca199c3..9c87cdd0f 100644 --- a/libkern/zlib/gzio.c +++ b/libkern/zlib/gzio.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2008 Apple Inc. All rights reserved. + * Copyright (c) 2008-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* gzio.c -- IO on .gz files @@ -117,10 +117,8 @@ local uLong getLong OF((gz_stream *s)); can be checked to distinguish the two cases (if errno is zero, the zlib error is Z_MEM_ERROR). */ -local gzFile gz_open (path, mode, fd) - const char *path; - const char *mode; - int fd; +local gzFile +gz_open(const char *path, const char *mode, int fd) { int err; int level = Z_DEFAULT_COMPRESSION; /* compression level */ @@ -232,9 +230,8 @@ local gzFile gz_open (path, mode, fd) /* =========================================================================== Opens a gzip (.gz) file for reading or writing. */ -gzFile ZEXPORT gzopen (path, mode) - const char *path; - const char *mode; +gzFile ZEXPORT +gzopen(const char *path, const char *mode) { return gz_open (path, mode, -1); } @@ -243,9 +240,8 @@ gzFile ZEXPORT gzopen (path, mode) Associate a gzFile with the file descriptor fd. 
fd is not dup'ed here to mimic the behavio(u)r of fdopen. */ -gzFile ZEXPORT gzdopen (fd, mode) - int fd; - const char *mode; +gzFile ZEXPORT +gzdopen(int fd, const char *mode) { char name[46]; /* allow for up to 128-bit integers */ @@ -258,10 +254,8 @@ gzFile ZEXPORT gzdopen (fd, mode) /* =========================================================================== * Update the compression level and strategy */ -int ZEXPORT gzsetparams (file, level, strategy) - gzFile file; - int level; - int strategy; +int ZEXPORT +gzsetparams(gzFile file, int level, int strategy) { gz_stream *s = (gz_stream*)file; @@ -285,8 +279,8 @@ int ZEXPORT gzsetparams (file, level, strategy) for end of file. IN assertion: the stream s has been sucessfully opened for reading. */ -local int get_byte(s) - gz_stream *s; +local int +get_byte(gz_stream *s) { if (s->z_eof) return EOF; if (s->stream.avail_in == 0) { @@ -312,8 +306,8 @@ local int get_byte(s) s->stream.avail_in is zero for the first time, but may be non-zero for concatenated .gz files. */ -local void check_header(s) - gz_stream *s; +local void +check_header(gz_stream *s) { int method; /* method byte */ int flags; /* flags byte */ @@ -379,8 +373,8 @@ local void check_header(s) * Cleanup then free the given gz_stream. Return a zlib error code. Try freeing in the reverse order of allocations. */ -local int destroy (s) - gz_stream *s; +local int +destroy(gz_stream *s) { int err = Z_OK; @@ -418,10 +412,8 @@ local int destroy (s) Reads the given number of uncompressed bytes from the compressed file. gzread returns the number of bytes actually read (0 for end of file). */ -int ZEXPORT gzread (file, buf, len) - gzFile file; - voidp buf; - unsigned len; +int ZEXPORT +gzread(gzFile file, voidp buf, unsigned len) { gz_stream *s = (gz_stream*)file; Bytef *start = (Bytef*)buf; /* starting point for crc computation */ @@ -527,8 +519,8 @@ int ZEXPORT gzread (file, buf, len) Reads one byte from the compressed file. 
gzgetc returns this byte or -1 in case of end of file or error. */ -int ZEXPORT gzgetc(file) - gzFile file; +int ZEXPORT +gzgetc(gzFile file) { unsigned char c; @@ -539,9 +531,8 @@ int ZEXPORT gzgetc(file) /* =========================================================================== Push one byte back onto the stream. */ -int ZEXPORT gzungetc(c, file) - int c; - gzFile file; +int ZEXPORT +gzungetc(int c, gzFile file) { gz_stream *s = (gz_stream*)file; @@ -564,10 +555,8 @@ int ZEXPORT gzungetc(c, file) The current implementation is not optimized at all. */ -char * ZEXPORT gzgets(file, buf, len) - gzFile file; - char *buf; - int len; +char * ZEXPORT +gzgets(gzFile file, char *buf, int len) { char *b = buf; if (buf == Z_NULL || len <= 0) return Z_NULL; @@ -583,10 +572,8 @@ char * ZEXPORT gzgets(file, buf, len) Writes the given number of uncompressed bytes into the compressed file. gzwrite returns the number of bytes actually written (0 in case of error). */ -int ZEXPORT gzwrite (file, buf, len) - gzFile file; - voidpc buf; - unsigned len; +int ZEXPORT +gzwrite(gzFile file, voidpc buf, unsigned len) { gz_stream *s = (gz_stream*)file; @@ -627,7 +614,8 @@ int ZEXPORT gzwrite (file, buf, len) #ifdef STDC #include -int ZEXPORTVA gzprintf (gzFile file, const char *format, /* args */ ...) +int ZEXPORTVA +gzprintf(gzFile file, const char *format, /* args */ ...) { char buf[Z_PRINTF_BUFSIZE]; va_list va; @@ -661,12 +649,10 @@ int ZEXPORTVA gzprintf (gzFile file, const char *format, /* args */ ...) 
} #else /* not ANSI C */ -int ZEXPORTVA gzprintf (file, format, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, - a11, a12, a13, a14, a15, a16, a17, a18, a19, a20) - gzFile file; - const char *format; - int a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, - a11, a12, a13, a14, a15, a16, a17, a18, a19, a20; +int ZEXPORTVA +gzprintf(gzFile file, const char *format, int a1, int a2, int a3, int a4, + int a5, int a6, int a7, int a8, int a9, int a10, int a11, int a12, + int a13, int a14, int a15, int a16, int a17, int a18, int a19, int a20) { char buf[Z_PRINTF_BUFSIZE]; int len; @@ -702,9 +688,8 @@ int ZEXPORTVA gzprintf (file, format, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, Writes c, converted to an unsigned char, into the compressed file. gzputc returns the value that was written, or -1 in case of error. */ -int ZEXPORT gzputc(file, c) - gzFile file; - int c; +int ZEXPORT +gzputc(gzFile file, int c) { unsigned char cc = (unsigned char) c; /* required for big endian systems */ @@ -717,9 +702,8 @@ int ZEXPORT gzputc(file, c) the terminating null character. gzputs returns the number of characters written, or -1 in case of error. */ -int ZEXPORT gzputs(file, s) - gzFile file; - const char *s; +int ZEXPORT +gzputs(gzFile file, const char *s) { return gzwrite(file, (char*)s, (unsigned)strlen(s)); } @@ -729,9 +713,8 @@ int ZEXPORT gzputs(file, s) Flushes all pending output into the compressed file. The parameter flush is as in the deflate() function. */ -local int do_flush (file, flush) - gzFile file; - int flush; +local int +do_flush(gzFile file, int flush) { uInt len; int done = 0; @@ -770,9 +753,8 @@ local int do_flush (file, flush) return s->z_err == Z_STREAM_END ? Z_OK : s->z_err; } -int ZEXPORT gzflush (file, flush) - gzFile file; - int flush; +int ZEXPORT +gzflush(gzFile file, int flush) { gz_stream *s = (gz_stream*)file; int err = do_flush (file, flush); @@ -791,10 +773,8 @@ int ZEXPORT gzflush (file, flush) SEEK_END is not implemented, returns error. 
In this version of the library, gzseek can be extremely slow. */ -z_off_t ZEXPORT gzseek (file, offset, whence) - gzFile file; - z_off_t offset; - int whence; +z_off_t ZEXPORT +gzseek(gzFile file, z_off_t offset, int whence) { gz_stream *s = (gz_stream*)file; @@ -881,8 +861,8 @@ z_off_t ZEXPORT gzseek (file, offset, whence) /* =========================================================================== Rewinds input file. */ -int ZEXPORT gzrewind (file) - gzFile file; +int ZEXPORT +gzrewind(gzFile file) { gz_stream *s = (gz_stream*)file; @@ -905,8 +885,8 @@ int ZEXPORT gzrewind (file) given compressed file. This position represents a number of bytes in the uncompressed data stream. */ -z_off_t ZEXPORT gztell (file) - gzFile file; +z_off_t ZEXPORT +gztell(gzFile file) { return gzseek(file, 0L, SEEK_CUR); } @@ -915,8 +895,8 @@ z_off_t ZEXPORT gztell (file) Returns 1 when EOF has previously been detected reading the given input stream, otherwise zero. */ -int ZEXPORT gzeof (file) - gzFile file; +int ZEXPORT +gzeof(gzFile file) { gz_stream *s = (gz_stream*)file; @@ -932,8 +912,8 @@ int ZEXPORT gzeof (file) /* =========================================================================== Returns 1 if reading and doing so transparently, otherwise zero. */ -int ZEXPORT gzdirect (file) - gzFile file; +int ZEXPORT +gzdirect(gzFile file) { gz_stream *s = (gz_stream*)file; @@ -944,9 +924,8 @@ int ZEXPORT gzdirect (file) /* =========================================================================== Outputs a long in LSB order to the given file */ -local void putLong (file, x) - FILE *file; - uLong x; +local void +putLong(FILE *file, uLong x) { int n; for (n = 0; n < 4; n++) { @@ -959,8 +938,8 @@ local void putLong (file, x) Reads a long in LSB order from the given gz_stream. Sets z_err in case of error. 
*/ -local uLong getLong (s) - gz_stream *s; +local uLong +getLong(gz_stream *s) { uLong x = (uLong)get_byte(s); int c; @@ -977,8 +956,8 @@ local uLong getLong (s) Flushes all pending output if necessary, closes the compressed file and deallocates all the (de)compression state. */ -int ZEXPORT gzclose (file) - gzFile file; +int ZEXPORT +gzclose(gzFile file) { gz_stream *s = (gz_stream*)file; @@ -1011,9 +990,8 @@ int ZEXPORT gzclose (file) errnum is set to Z_ERRNO and the application may consult errno to get the exact error code. */ -const char * ZEXPORT gzerror (file, errnum) - gzFile file; - int *errnum; +const char * ZEXPORT +gzerror(gzFile file, int *errnum) { char *m; gz_stream *s = (gz_stream*)file; @@ -1041,8 +1019,8 @@ const char * ZEXPORT gzerror (file, errnum) /* =========================================================================== Clear the error and end-of-file flags, and do the same for the real file. */ -void ZEXPORT gzclearerr (file) - gzFile file; +void ZEXPORT +gzclearerr(gzFile file) { gz_stream *s = (gz_stream*)file; diff --git a/libkern/zlib/infback.c b/libkern/zlib/infback.c index 5433556ed..f3151ac8a 100644 --- a/libkern/zlib/infback.c +++ b/libkern/zlib/infback.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2008 Apple Inc. All rights reserved. + * Copyright (c) 2008-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* infback.c -- inflate using a call-back interface @@ -52,12 +52,9 @@ local void fixedtables OF((struct inflate_state FAR *state)); windowBits is in the range 8..15, and window is a user-supplied window and output buffer that is 2**windowBits bytes. */ -int ZEXPORT inflateBackInit_(strm, windowBits, window, version, stream_size) -z_streamp strm; -int windowBits; -unsigned char FAR *window; -const char *version; -int stream_size; +int ZEXPORT +inflateBackInit_(z_streamp strm, int windowBits, unsigned char FAR *window, + const char *version, int stream_size) { struct inflate_state FAR *state; @@ -99,8 +96,8 @@ int stream_size; used for threaded applications, since the rewriting of the tables and virgin may not be thread-safe. */ -local void fixedtables(state) -struct inflate_state FAR *state; +local void +fixedtables(struct inflate_state FAR *state) { #ifdef BUILDFIXED static int virgin = 1; @@ -267,12 +264,9 @@ struct inflate_state FAR *state; inflateBack() can also return Z_STREAM_ERROR if the input parameters are not correct, i.e. strm is Z_NULL or the state was not initialized. 
*/ -int ZEXPORT inflateBack(strm, in, in_desc, out, out_desc) -z_streamp strm; -in_func in; -void FAR *in_desc; -out_func out; -void FAR *out_desc; +int ZEXPORT +inflateBack(z_streamp strm, in_func in, void FAR *in_desc, out_func out, + void FAR *out_desc) { struct inflate_state FAR *state; unsigned char FAR *next; /* next input */ @@ -640,8 +634,8 @@ void FAR *out_desc; return ret; } -int ZEXPORT inflateBackEnd(strm) -z_streamp strm; +int ZEXPORT +inflateBackEnd(z_streamp strm) { if (strm == Z_NULL || strm->state == Z_NULL || strm->zfree == (free_func)0) return Z_STREAM_ERROR; diff --git a/libkern/zlib/inffast.c b/libkern/zlib/inffast.c index cb93ddc5d..133374e40 100644 --- a/libkern/zlib/inffast.c +++ b/libkern/zlib/inffast.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2008 Apple Inc. All rights reserved. + * Copyright (c) 2008-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. 
- * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* inffast.c -- fast decoding @@ -91,10 +91,11 @@ bytes, which is the maximum length that can be coded. inflate_fast() requires strm->avail_out >= 258 for each loop to avoid checking for output space. + + @param start inflate()'s starting value for strm->avail_out */ -void inflate_fast(strm, start) -z_streamp strm; -unsigned start; /* inflate()'s starting value for strm->avail_out */ +void +inflate_fast(z_streamp strm, unsigned start) { struct inflate_state FAR *state; unsigned char FAR *in; /* local strm->next_in */ @@ -344,4 +345,3 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */ */ #endif /* !ASMINF */ - diff --git a/libkern/zlib/inflate.c b/libkern/zlib/inflate.c index fe12d1684..e5d688d3a 100644 --- a/libkern/zlib/inflate.c +++ b/libkern/zlib/inflate.c @@ -127,8 +127,8 @@ local int updatewindow OF((z_streamp strm, unsigned out)); local unsigned syncsearch OF((unsigned FAR *have, unsigned char FAR *buf, unsigned len)); -int ZEXPORT inflateReset(strm) -z_streamp strm; +int ZEXPORT +inflateReset(z_streamp strm) { struct inflate_state FAR *state; @@ -152,10 +152,8 @@ z_streamp strm; return Z_OK; } -int ZEXPORT inflatePrime(strm, bits, value) -z_streamp strm; -int bits; -int value; +int ZEXPORT +inflatePrime(z_streamp strm, int bits, int value) { struct inflate_state FAR *state; @@ -168,11 +166,9 @@ int value; return Z_OK; } -int ZEXPORT inflateInit2_(strm, windowBits, version, stream_size) -z_streamp strm; -int windowBits; -const char *version; -int stream_size; +int ZEXPORT +inflateInit2_(z_streamp strm, int windowBits, const char *version, + int stream_size) { struct inflate_state FAR *state; @@ -213,10 +209,8 @@ int stream_size; return inflateReset(strm); } -int ZEXPORT inflateInit_(strm, version, stream_size) -z_streamp strm; -const char *version; -int stream_size; +int ZEXPORT +inflateInit_(z_streamp strm, const char *version, int stream_size) { return inflateInit2_(strm, DEF_WBITS, 
version, stream_size); } @@ -231,8 +225,8 @@ int stream_size; used for threaded applications, since the rewriting of the tables and virgin may not be thread-safe. */ -local void fixedtables(state) -struct inflate_state FAR *state; +local void +fixedtables(struct inflate_state FAR *state) { #ifdef BUILDFIXED static int virgin = 1; @@ -295,7 +289,8 @@ struct inflate_state FAR *state; a.out > inffixed.h */ -void makefixed() +void +makefixed(void) { unsigned low, size; struct inflate_state state; @@ -349,9 +344,8 @@ void makefixed() output will fall in the output data, making match copies simpler and faster. The advantage may be dependent on the size of the processor's data caches. */ -local int updatewindow(strm, out) -z_streamp strm; -unsigned out; +local int +updatewindow(z_streamp strm, unsigned out) { struct inflate_state FAR *state; unsigned copy, dist; @@ -580,9 +574,8 @@ unsigned out; will return Z_BUF_ERROR if it has not reached the end of the stream. */ -int ZEXPORT inflate(strm, flush) -z_streamp strm; -int flush; +int ZEXPORT +inflate(z_streamp strm, int flush) { struct inflate_state FAR *state; unsigned char FAR *next; /* next input */ @@ -1181,8 +1174,8 @@ int flush; return ret; } -int ZEXPORT inflateEnd(strm) -z_streamp strm; +int ZEXPORT +inflateEnd(z_streamp strm) { struct inflate_state FAR *state; if (strm == Z_NULL || strm->state == Z_NULL || strm->zfree == (free_func)0) @@ -1195,10 +1188,8 @@ z_streamp strm; return Z_OK; } -int ZEXPORT inflateSetDictionary(strm, dictionary, dictLength) -z_streamp strm; -const Bytef *dictionary; -uInt dictLength; +int ZEXPORT +inflateSetDictionary(z_streamp strm, const Bytef *dictionary, uInt dictLength) { struct inflate_state FAR *state; unsigned long id; @@ -1237,9 +1228,8 @@ uInt dictLength; return Z_OK; } -int ZEXPORT inflateGetHeader(strm, head) -z_streamp strm; -gz_headerp head; +int ZEXPORT +inflateGetHeader(z_streamp strm, gz_headerp head) { struct inflate_state FAR *state; @@ -1265,10 +1255,8 @@ gz_headerp 
head; called again with more data and the *have state. *have is initialized to zero for the first call. */ -local unsigned syncsearch(have, buf, len) -unsigned FAR *have; -unsigned char FAR *buf; -unsigned len; +local unsigned +syncsearch(unsigned FAR *have, unsigned char FAR *buf, unsigned len) { unsigned got; unsigned next; @@ -1288,8 +1276,8 @@ unsigned len; return next; } -int ZEXPORT inflateSync(strm) -z_streamp strm; +int ZEXPORT +inflateSync(z_streamp strm) { unsigned len; /* number of bytes to look at or looked at */ unsigned long in, out; /* temporary to save total_in and total_out */ @@ -1339,8 +1327,8 @@ z_streamp strm; block. When decompressing, PPP checks that at the end of input packet, inflate is waiting for these length bytes. */ -int ZEXPORT inflateSyncPoint(strm) -z_streamp strm; +int ZEXPORT +inflateSyncPoint(z_streamp strm) { struct inflate_state FAR *state; @@ -1349,9 +1337,8 @@ z_streamp strm; return state->mode == STORED && state->bits == 0; } -int ZEXPORT inflateCopy(dest, source) -z_streamp dest; -z_streamp source; +int ZEXPORT +inflateCopy(z_streamp dest, z_streamp source) { struct inflate_state FAR *state; struct inflate_state FAR *copy; diff --git a/libkern/zlib/inftrees.c b/libkern/zlib/inftrees.c index 338c455c8..8d4f79594 100644 --- a/libkern/zlib/inftrees.c +++ b/libkern/zlib/inftrees.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2008 Apple Inc. All rights reserved. + * Copyright (c) 2008-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. 
- * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* inftrees.c -- generate Huffman trees for efficient decoding @@ -56,13 +56,10 @@ const char inflate_copyright[] = table index bits. It will differ if the request is greater than the longest code or if it is less than the shortest code. */ -int inflate_table(type, lens, codes, table, bits, work) -codetype type; -unsigned short FAR *lens; -unsigned codes; -code FAR * FAR *table; -unsigned FAR *bits; -unsigned short FAR *work; +int +inflate_table(codetype type, unsigned short FAR *lens, unsigned codes, + code FAR * FAR *table, unsigned FAR *bits, + unsigned short FAR *work) { unsigned len; /* a code's length in bits */ unsigned sym; /* index of code symbols */ diff --git a/libkern/zlib/trees.c b/libkern/zlib/trees.c index a64436848..21d483a7f 100644 --- a/libkern/zlib/trees.c +++ b/libkern/zlib/trees.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2008 Apple Inc. All rights reserved. + * Copyright (c) 2008-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). 
You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* trees.c -- output deflated data using Huffman coding @@ -216,10 +216,12 @@ local void gen_trees_header OF((void)); #ifdef DEBUG local void send_bits OF((deflate_state *s, int value, int length)); -local void send_bits(s, value, length) - deflate_state *s; - int value; /* value to send */ - int length; /* number of bits */ +/* + * @param value value to send + * @param length number of bits + */ +local void +send_bits(deflate_state *s, int value, int length) { Tracevv((stderr," l %2d v %4x ", length, value)); Assert(length > 0 && length <= 15, "invalid length"); @@ -262,7 +264,8 @@ local void send_bits(s, value, length) /* =========================================================================== * Initialize the various 'constant' tables. */ -local void tr_static_init() +local void +tr_static_init(void) { #if defined(GEN_TREES_H) || !defined(STDC) static int static_init_done = 0; @@ -354,7 +357,8 @@ local void tr_static_init() ((i) == (last)? "\n};\n\n" : \ ((i) % (width) == (width)-1 ? 
",\n" : ", ")) -void gen_trees_header() +void +gen_trees_header(void) { FILE *header = fopen("trees.h", "w"); int i; @@ -406,8 +410,8 @@ void gen_trees_header() /* =========================================================================== * Initialize the tree data structures for a new zlib stream. */ -void _tr_init(s) - deflate_state *s; +void +_tr_init(deflate_state *s) { tr_static_init(); @@ -435,8 +439,8 @@ void _tr_init(s) /* =========================================================================== * Initialize a new block. */ -local void init_block(s) - deflate_state *s; +local void +init_block(deflate_state *s) { int n; /* iterates over tree elements */ @@ -478,11 +482,12 @@ local void init_block(s) * exchanging a node with the smallest of its two sons if necessary, stopping * when the heap property is re-established (each father smaller than its * two sons). + * + * @param tree the tree to restore + * @param k node to move down */ -local void pqdownheap(s, tree, k) - deflate_state *s; - ct_data *tree; /* the tree to restore */ - int k; /* node to move down */ +local void +pqdownheap(deflate_state *s, ct_data *tree, int k) { int v = s->heap[k]; int j = k << 1; /* left son of k */ @@ -513,10 +518,10 @@ local void pqdownheap(s, tree, k) * array bl_count contains the frequencies for each bit length. * The length opt_len is updated; static_len is also updated if stree is * not null. 
+ * @param desc the tree descriptor */ -local void gen_bitlen(s, desc) - deflate_state *s; - tree_desc *desc; /* the tree descriptor */ +local void +gen_bitlen(deflate_state *s, tree_desc *desc) { ct_data *tree = desc->dyn_tree; int max_code = desc->max_code; @@ -541,7 +546,10 @@ local void gen_bitlen(s, desc) for (h = s->heap_max+1; h < HEAP_SIZE; h++) { n = s->heap[h]; bits = tree[tree[n].Dad].Len + 1; - if (bits > max_length) bits = max_length, overflow++; + if (bits > max_length) { + bits = max_length; + overflow++; + } tree[n].Len = (ush)bits; /* We overwrite tree[n].Dad which is no longer needed */ @@ -600,11 +608,13 @@ local void gen_bitlen(s, desc) * the given tree and the field len is set for all tree elements. * OUT assertion: the field code is set for all tree elements of non * zero code length. + * + * @param tree the tree to decorate + * @param max_code largest code with non zero frequency + * @param bl_count number of codes at each bit length */ -local void gen_codes (tree, max_code, bl_count) - ct_data *tree; /* the tree to decorate */ - int max_code; /* largest code with non zero frequency */ - ushf *bl_count; /* number of codes at each bit length */ +local void +gen_codes(ct_data *tree, int max_code, ushf *bl_count) { ush next_code[MAX_BITS+1]; /* next code value for each bit length */ ush code = 0; /* running code value */ @@ -642,10 +652,11 @@ local void gen_codes (tree, max_code, bl_count) * OUT assertions: the fields len and code are set to the optimal bit length * and corresponding code. The length opt_len is updated; static_len is * also updated if stree is not null. The field max_code is set. + * + * @param desc the tree descriptor */ -local void build_tree(s, desc) - deflate_state *s; - tree_desc *desc; /* the tree descriptor */ +local void +build_tree(deflate_state *s, tree_desc *desc) { ct_data *tree = desc->dyn_tree; const ct_data *stree = desc->stat_desc->static_tree; @@ -658,7 +669,8 @@ local void build_tree(s, desc) * heap[SMALLEST].
The sons of heap[n] are heap[2*n] and heap[2*n+1]. * heap[0] is not used. */ - s->heap_len = 0, s->heap_max = HEAP_SIZE; + s->heap_len = 0; + s->heap_max = HEAP_SIZE; for (n = 0; n < elems; n++) { if (tree[n].Freq != 0) { @@ -730,11 +742,12 @@ local void build_tree(s, desc) /* =========================================================================== * Scan a literal or distance tree to determine the frequencies of the codes * in the bit length tree. + * + * @param tree the tree to be scanned + * @param max_code and its largest code of non zero frequency */ -local void scan_tree (s, tree, max_code) - deflate_state *s; - ct_data *tree; /* the tree to be scanned */ - int max_code; /* and its largest code of non zero frequency */ +local void +scan_tree(deflate_state *s, ct_data *tree, int max_code) { int n; /* iterates over all tree elements */ int prevlen = -1; /* last emitted length */ @@ -744,7 +757,10 @@ local void scan_tree (s, tree, max_code) int max_count = 7; /* max repeat count */ int min_count = 4; /* min repeat count */ - if (nextlen == 0) max_count = 138, min_count = 3; + if (nextlen == 0) { + max_count = 138; + min_count = 3; + } tree[max_code+1].Len = (ush)0xffff; /* guard */ for (n = 0; n <= max_code; n++) { @@ -763,11 +779,14 @@ local void scan_tree (s, tree, max_code) } count = 0; prevlen = curlen; if (nextlen == 0) { - max_count = 138, min_count = 3; + max_count = 138; + min_count = 3; } else if (curlen == nextlen) { - max_count = 6, min_count = 3; + max_count = 6; + min_count = 3; } else { - max_count = 7, min_count = 4; + max_count = 7; + min_count = 4; } } } @@ -775,11 +794,12 @@ local void scan_tree (s, tree, max_code) /* =========================================================================== * Send a literal or distance tree in compressed form, using the codes in * bl_tree. 
+ * + * @param tree the tree to be scanned + * @param max_code and its largest code of non zero frequency */ -local void send_tree (s, tree, max_code) - deflate_state *s; - ct_data *tree; /* the tree to be scanned */ - int max_code; /* and its largest code of non zero frequency */ +local void +send_tree( deflate_state *s, ct_data *tree, int max_code) { int n; /* iterates over all tree elements */ int prevlen = -1; /* last emitted length */ @@ -790,7 +810,10 @@ local void send_tree (s, tree, max_code) int min_count = 4; /* min repeat count */ /* tree[max_code+1].Len = -1; */ /* guard already set */ - if (nextlen == 0) max_count = 138, min_count = 3; + if (nextlen == 0) { + max_count = 138; + min_count = 3; + } for (n = 0; n <= max_code; n++) { curlen = nextlen; nextlen = tree[n+1].Len; @@ -814,11 +837,14 @@ local void send_tree (s, tree, max_code) } count = 0; prevlen = curlen; if (nextlen == 0) { - max_count = 138, min_count = 3; + max_count = 138; + min_count = 3; } else if (curlen == nextlen) { - max_count = 6, min_count = 3; + max_count = 6; + min_count = 3; } else { - max_count = 7, min_count = 4; + max_count = 7; + min_count = 4; } } } @@ -827,8 +853,8 @@ local void send_tree (s, tree, max_code) * Construct the Huffman tree for the bit lengths and return the index in * bl_order of the last bit length code to send. */ -local int build_bl_tree(s) - deflate_state *s; +local int +build_bl_tree(deflate_state *s) { int max_blindex; /* index of last bit length code of non zero freq */ @@ -861,10 +887,13 @@ local int build_bl_tree(s) * Send the header for a block using dynamic Huffman trees: the counts, the * lengths of the bit length codes, the literal tree and the distance tree. * IN assertion: lcodes >= 257, dcodes >= 1, blcodes >= 4. 
+ * + * @param lcodes number of codes for each tree + * @param dcodes number of codes for each tree + * @param blcodes number of codes for each tree */ -local void send_all_trees(s, lcodes, dcodes, blcodes) - deflate_state *s; - int lcodes, dcodes, blcodes; /* number of codes for each tree */ +local void +send_all_trees(deflate_state *s, int lcodes, int dcodes, int blcodes) { int rank; /* index in bl_order */ @@ -890,12 +919,13 @@ local void send_all_trees(s, lcodes, dcodes, blcodes) /* =========================================================================== * Send a stored block + * + * @param buf input block + * @param stored_len length of input block + * @param eof true if this is the last block for a file */ -void _tr_stored_block(s, buf, stored_len, eof) - deflate_state *s; - charf *buf; /* input block */ - ulg stored_len; /* length of input block */ - int eof; /* true if this is the last block for a file */ +void +_tr_stored_block(deflate_state *s, charf *buf, ulg stored_len, int eof) { send_bits(s, (STORED_BLOCK<<1)+eof, 3); /* send block type */ #ifdef DEBUG @@ -916,8 +946,8 @@ void _tr_stored_block(s, buf, stored_len, eof) * To simplify the code, we assume the worst case of last real code encoded * on one bit only. */ -void _tr_align(s) - deflate_state *s; +void +_tr_align(deflate_state *s) { send_bits(s, STATIC_TREES<<1, 3); send_code(s, END_BLOCK, static_ltree); @@ -944,12 +974,13 @@ void _tr_align(s) /* =========================================================================== * Determine the best encoding for the current block: dynamic trees, static * trees or store, and output the encoded block to the zip file. 
+ * + * @param buf input block, or NULL if too old + * @param stored_len length of input block + * @param eof true if this is the last block for a file */ -void _tr_flush_block(s, buf, stored_len, eof) - deflate_state *s; - charf *buf; /* input block, or NULL if too old */ - ulg stored_len; /* length of input block */ - int eof; /* true if this is the last block for a file */ +void +_tr_flush_block(deflate_state *s, charf *buf, ulg stored_len, int eof) { ulg opt_lenb, static_lenb; /* opt_len and static_len in bytes */ int max_blindex = 0; /* index of last bit length code of non zero freq */ @@ -1045,11 +1076,12 @@ void _tr_flush_block(s, buf, stored_len, eof) /* =========================================================================== * Save the match info and tally the frequency counts. Return true if * the current block must be flushed. + * + * @param dist distance of matched string + * @param lc match length-MIN_MATCH or unmatched char (if dist==0) */ -int _tr_tally (s, dist, lc) - deflate_state *s; - unsigned dist; /* distance of matched string */ - unsigned lc; /* match length-MIN_MATCH or unmatched char (if dist==0) */ +int +_tr_tally(deflate_state *s, unsigned dist, unsigned lc) { s->d_buf[s->last_lit] = (ush)dist; s->l_buf[s->last_lit++] = (uch)lc; @@ -1095,11 +1127,12 @@ int _tr_tally (s, dist, lc) /* =========================================================================== * Send the block data compressed using the given Huffman trees + * + * @param ltree literal tree + * @param dtree distance tree */ -local void compress_block(s, ltree, dtree) - deflate_state *s; - ct_data *ltree; /* literal tree */ - ct_data *dtree; /* distance tree */ +local void +compress_block(deflate_state *s, ct_data *ltree, ct_data *dtree) { unsigned dist; /* distance of matched string */ int lc; /* match length or unmatched char (if dist == 0) */ @@ -1150,8 +1183,8 @@ local void compress_block(s, ltree, dtree) * or white spaces (9 to 13, or 32); or set it to Z_BINARY 
otherwise. * IN assertion: the fields Freq of dyn_ltree are set. */ -local void set_data_type(s) - deflate_state *s; +local void +set_data_type(deflate_state *s) { int n; @@ -1169,15 +1202,18 @@ local void set_data_type(s) * Reverse the first len bits of a code, using straightforward code (a faster * method would use a table) * IN assertion: 1 <= len <= 15 + * + * @param code the value to invert + * @param len its bit length */ -local unsigned bi_reverse(code, len) - unsigned code; /* the value to invert */ - int len; /* its bit length */ +local unsigned +bi_reverse(unsigned code, int len) { - register unsigned res = 0; + unsigned res = 0; do { res |= code & 1; - code >>= 1, res <<= 1; + code >>= 1; + res <<= 1; } while (--len > 0); return res >> 1; } @@ -1185,8 +1221,8 @@ local unsigned bi_reverse(code, len) /* =========================================================================== * Flush the bit buffer, keeping at most 7 bits in it. */ -local void bi_flush(s) - deflate_state *s; +local void +bi_flush(deflate_state *s) { if (s->bi_valid == 16) { put_short(s, s->bi_buf); @@ -1202,8 +1238,8 @@ local void bi_flush(s) /* =========================================================================== * Flush the bit buffer and align the output on a byte boundary */ -local void bi_windup(s) - deflate_state *s; +local void +bi_windup(deflate_state *s) { if (s->bi_valid > 8) { put_short(s, s->bi_buf); @@ -1220,12 +1256,13 @@ local void bi_windup(s) /* =========================================================================== * Copy a stored block, storing first the length and its * one's complement if requested. 
+ * + * @param buf the input data + * @param len its length + * @param header true if block header must be written */ -local void copy_block(s, buf, len, header) - deflate_state *s; - charf *buf; /* the input data */ - unsigned len; /* its length */ - int header; /* true if block header must be written */ +local void +copy_block(deflate_state *s, charf *buf, unsigned len, int header) { bi_windup(s); /* align on byte boundary */ s->last_eob_len = 8; /* enough lookahead for inflate */ diff --git a/libkern/zlib/uncompr.c b/libkern/zlib/uncompr.c index 00b0b7e0a..1105cbb8e 100644 --- a/libkern/zlib/uncompr.c +++ b/libkern/zlib/uncompr.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2008 Apple Inc. All rights reserved. + * Copyright (c) 2008-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. 
- * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* uncompr.c -- decompress a memory buffer @@ -54,11 +54,8 @@ enough memory, Z_BUF_ERROR if there was not enough room in the output buffer, or Z_DATA_ERROR if the input data was corrupted. */ -int ZEXPORT uncompress (dest, destLen, source, sourceLen) - Bytef *dest; - uLongf *destLen; - const Bytef *source; - uLong sourceLen; +int ZEXPORT +uncompress(Bytef *dest, uLongf *destLen, const Bytef *source, uLong sourceLen) { z_stream stream; int err; diff --git a/libkern/zlib/zutil.c b/libkern/zlib/zutil.c index f90ac37a7..81ca65fa4 100644 --- a/libkern/zlib/zutil.c +++ b/libkern/zlib/zutil.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2008 Apple Inc. All rights reserved. + * Copyright (c) 2008-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. 
- * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* zutil.c -- target dependent utility functions for the compression library @@ -146,8 +146,8 @@ uLong ZEXPORT zlibCompileFlags() # endif int z_verbose = verbose; -void z_error (m) - char *m; +void +z_error(char *m) { fprintf(stderr, "%s\n", m); exit(1); @@ -157,8 +157,8 @@ void z_error (m) /* exported to allow conversion of error code to string for compress() and * uncompress() */ -const char * ZEXPORT zError(err) - int err; +const char * ZEXPORT +zError(int err) { return ERR_MSG(err); } @@ -173,10 +173,8 @@ const char * ZEXPORT zError(err) #ifndef HAVE_MEMCPY -void zmemcpy(dest, source, len) - Bytef* dest; - const Bytef* source; - uInt len; +void +zmemcpy(Bytef* dest, const Bytef* source, uInt len) { if (len == 0) return; do { @@ -184,10 +182,8 @@ void zmemcpy(dest, source, len) } while (--len != 0); } -int zmemcmp(s1, s2, len) - const Bytef* s1; - const Bytef* s2; - uInt len; +int +zmemcmp(const Bytef* s1, const Bytef* s2, uInt len) { uInt j; @@ -197,9 +193,8 @@ int zmemcmp(s1, s2, len) return 0; } -void zmemzero(dest, len) - Bytef* dest; - uInt len; +void +zmemzero(Bytef* dest, uInt len) { if (len == 0) return; do { @@ -241,7 +236,8 @@ local ptr_table table[MAX_PTR]; * a protected system like OS/2. Use Microsoft C instead. 
*/ -voidpf zcalloc (voidpf opaque, unsigned items, unsigned size) +voidpf +zcalloc(voidpf opaque, unsigned items, unsigned size) { voidpf buf = opaque; /* just to make some compilers happy */ ulg bsize = (ulg)items*size; @@ -265,7 +261,8 @@ voidpf zcalloc (voidpf opaque, unsigned items, unsigned size) return buf; } -void zcfree (voidpf opaque, voidpf ptr) +void +zcfree(voidpf opaque, voidpf ptr) { int n; if (*(ush*)&ptr != 0) { /* object < 64K */ @@ -300,13 +297,15 @@ void zcfree (voidpf opaque, voidpf ptr) # define _hfree hfree #endif -voidpf zcalloc (voidpf opaque, unsigned items, unsigned size) +voidpf +zcalloc(voidpf opaque, unsigned items, unsigned size) { if (opaque) opaque = 0; /* to make compiler happy */ return _halloc((long)items, size); } -void zcfree (voidpf opaque, voidpf ptr) +void +zcfree(voidpf opaque, voidpf ptr) { if (opaque) opaque = 0; /* to make compiler happy */ _hfree(ptr); @@ -325,10 +324,8 @@ extern voidp calloc OF((uInt items, uInt size)); extern void free OF((voidpf ptr)); #endif -voidpf zcalloc (opaque, items, size) - voidpf opaque; - unsigned items; - unsigned size; +voidpf +zcalloc(voidpf opaque, unsigned items, unsigned size) { if (opaque) items += size - size; /* make compiler happy */ if (sizeof(uInt) > 2) { @@ -342,9 +339,8 @@ voidpf zcalloc (opaque, items, size) return (voidpf)calloc(items, size); } -void zcfree (opaque, ptr) - voidpf opaque; - voidpf ptr; +void +zcfree(voidpf opaque, voidpf ptr) { free(ptr); if (opaque) return; /* make compiler happy */ diff --git a/libsa/Makefile b/libsa/Makefile index 4e8c0e8f1..9fa2dc5ed 100644 --- a/libsa/Makefile +++ b/libsa/Makefile @@ -3,7 +3,6 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - include $(MakeInc_cmd) include $(MakeInc_def) @@ -11,5 +10,3 @@ COMP_SUBDIRS = conf include $(MakeInc_rule) include $(MakeInc_dir) - - diff --git a/libsa/bootstrap.cpp b/libsa/bootstrap.cpp 
index 02e5694d0..4c56869b8 100644 --- a/libsa/bootstrap.cpp +++ b/libsa/bootstrap.cpp @@ -29,6 +29,7 @@ extern "C" { #include #include #include + } #include @@ -111,6 +112,25 @@ static const char * sKernelComponentNames[] = { NULL }; +static int __whereIsAddr(vm_offset_t theAddr, unsigned long * segSizes, vm_offset_t *segAddrs, int segCount ); + +#define PLK_SEGMENTS 11 + +static const char * plk_segNames[] = { + "__TEXT", + "__TEXT_EXEC", + "__DATA", + "__DATA_CONST", + "__LINKEDIT", + "__PRELINK_TEXT", + "__PLK_TEXT_EXEC", + "__PRELINK_DATA", + "__PLK_DATA_CONST", + "__PLK_LINKEDIT", + "__PRELINK_INFO", + NULL +}; + #if PRAGMA_MARK #pragma mark KLDBootstrap Class #endif @@ -206,6 +226,11 @@ KLDBootstrap::readStartupExtensions(void) return; } +typedef struct kaslrPackedOffsets { + uint32_t count; /* number of offsets */ + uint32_t offsetsArray[]; /* offsets to slide */ +} kaslrPackedOffsets; + /********************************************************************* *********************************************************************/ void @@ -239,6 +264,9 @@ KLDBootstrap::readPrelinkedExtensions( bool developerDevice; bool dontLoad; #endif + OSData * kaslrOffsets = NULL; + unsigned long plk_segSizes[PLK_SEGMENTS]; + vm_offset_t plk_segAddrs[PLK_SEGMENTS]; OSKextLog(/* kext */ NULL, kOSKextLogProgressLevel | @@ -300,6 +328,14 @@ KLDBootstrap::readPrelinkedExtensions( prelinkData = (void *) prelinkTextSegment->vmaddr; prelinkLength = prelinkTextSegment->vmsize; + /* build arrays of plk info for later use */ + const char ** segNamePtr; + + for (segNamePtr = &plk_segNames[0], i = 0; *segNamePtr && i < PLK_SEGMENTS; segNamePtr++, i++) { + plk_segSizes[i] = 0; + plk_segAddrs[i] = (vm_offset_t)getsegdatafromheader(&_mh_execute_header, *segNamePtr, &plk_segSizes[i]); + } + /* Unserialize the info dictionary from the prelink info section. 
*/ @@ -331,6 +367,7 @@ KLDBootstrap::readPrelinkedExtensions( ramDiskBoot = IORamDiskBSDRoot(); #endif /* NO_KEXTD */ + infoDictArray = OSDynamicCast(OSArray, prelinkInfoDict->getObject(kPrelinkInfoDictionaryKey)); if (!infoDictArray) { @@ -339,10 +376,13 @@ KLDBootstrap::readPrelinkedExtensions( goto finish; } + /* kaslrOffsets are available use them to slide local relocations */ + kaslrOffsets = OSDynamicCast(OSData, + prelinkInfoDict->getObject(kPrelinkLinkKASLROffsetsKey)); + /* Create dictionary of excluded kexts */ OSKext::createExcludeListFromPrelinkInfo(infoDictArray); - /* Create OSKext objects for each info dictionary. */ for (i = 0; i < infoDictArray->getCount(); ++i) { @@ -376,7 +416,7 @@ KLDBootstrap::readPrelinkedExtensions( if (ramDiskOnlyBool == kOSBooleanTrue) { dontLoad = true; } - } + } if (dontLoad == true) { OSString *bundleID = OSDynamicCast(OSString, @@ -403,10 +443,47 @@ KLDBootstrap::readPrelinkedExtensions( * kext system keeps them around until explicitly removed. * Any creation/registration failures are already logged for us. */ - OSKext * newKext = OSKext::withPrelinkedInfoDict(infoDict); + OSKext * newKext = OSKext::withPrelinkedInfoDict(infoDict, (kaslrOffsets ? 
TRUE : FALSE)); OSSafeReleaseNULL(newKext); } - + + /* slide kxld relocations */ + if (kaslrOffsets && vm_kernel_slide > 0) { + int slidKextAddrCount = 0; + int badSlideAddr = 0; + int badSlideTarget = 0; + + kaslrPackedOffsets * myOffsets = NULL; + myOffsets = (kaslrPackedOffsets *) kaslrOffsets->getBytesNoCopy(); + + for (uint32_t j = 0; j < myOffsets->count; j++) { + + uint64_t slideOffset = (uint64_t) myOffsets->offsetsArray[j]; + uintptr_t * slideAddr = (uintptr_t *) ((uint64_t)prelinkData + slideOffset); + int slideAddrSegIndex = -1; + int addrToSlideSegIndex = -1; + + slideAddrSegIndex = __whereIsAddr( (vm_offset_t)slideAddr, &plk_segSizes[0], &plk_segAddrs[0], PLK_SEGMENTS ); + if (slideAddrSegIndex >= 0) { + addrToSlideSegIndex = __whereIsAddr( (vm_offset_t)(*slideAddr + vm_kernel_slide), &plk_segSizes[0], &plk_segAddrs[0], PLK_SEGMENTS ); + if (addrToSlideSegIndex < 0) { + badSlideTarget++; + continue; + } + } + else { + badSlideAddr++; + continue; + } + + slidKextAddrCount++; + *(slideAddr) += vm_kernel_slide; + } // for ... + + /* All kexts are now slid, set VM protections for them */ + OSKext::setAllVMAttributes(); + } + /* Store the number of prelinked kexts in the registry so we can tell * when the system has been started from a prelinked kernel. 
*/ @@ -420,7 +497,7 @@ KLDBootstrap::readPrelinkedExtensions( if (prelinkCountObj) { registryRoot->setProperty(kOSPrelinkKextCountKey, prelinkCountObj); } - + OSKextLog(/* kext */ NULL, kOSKextLogProgressLevel | kOSKextLogGeneralFlag | kOSKextLogKextBookkeepingFlag | @@ -445,13 +522,30 @@ KLDBootstrap::readPrelinkedExtensions( } finish: - OSSafeRelease(errorString); - OSSafeRelease(parsedXML); - OSSafeRelease(theKernel); - OSSafeRelease(prelinkCountObj); + OSSafeReleaseNULL(errorString); + OSSafeReleaseNULL(parsedXML); + OSSafeReleaseNULL(theKernel); + OSSafeReleaseNULL(prelinkCountObj); return; } +static int __whereIsAddr(vm_offset_t theAddr, unsigned long * segSizes, vm_offset_t *segAddrs, int segCount) +{ + int i; + + for (i = 0; i < segCount; i++) { + vm_offset_t myAddr = *(segAddrs + i); + unsigned long mySize = *(segSizes + i); + + if (theAddr >= myAddr && theAddr < (myAddr + mySize)) { + return i; + } + } + + return -1; +} + + /********************************************************************* *********************************************************************/ #define BOOTER_KEXT_PREFIX "Driver-" @@ -582,7 +676,7 @@ KLDBootstrap::readBooterExtensions(void) * Any creation/registration failures are already logged for us. 
*/ OSKext * newKext = OSKext::withBooterData(deviceTreeName, booterData); - OSSafeRelease(newKext); + OSSafeReleaseNULL(newKext); booterMemoryMap->removeProperty(deviceTreeName); @@ -590,11 +684,11 @@ KLDBootstrap::readBooterExtensions(void) finish: - OSSafeRelease(booterMemoryMap); - OSSafeRelease(propertyDict); - OSSafeRelease(keyIterator); - OSSafeRelease(booterData); - OSSafeRelease(aKext); + OSSafeReleaseNULL(booterMemoryMap); + OSSafeReleaseNULL(propertyDict); + OSSafeReleaseNULL(keyIterator); + OSSafeReleaseNULL(booterData); + OSSafeReleaseNULL(aKext); return; } @@ -660,8 +754,8 @@ KLDBootstrap::loadSecurityExtensions(void) } finish: - OSSafeRelease(keyIterator); - OSSafeRelease(extensionsDict); + OSSafeReleaseNULL(keyIterator); + OSSafeReleaseNULL(extensionsDict); return; } @@ -703,7 +797,7 @@ KLDBootstrap::loadKernelComponentKexts(void) } } - OSSafeRelease(theKext); + OSSafeReleaseNULL(theKext); return result; } @@ -775,8 +869,8 @@ KLDBootstrap::loadKernelExternalComponents(void) } finish: - OSSafeRelease(keyIterator); - OSSafeRelease(extensionsDict); + OSSafeReleaseNULL(keyIterator); + OSSafeReleaseNULL(extensionsDict); return; } @@ -895,10 +989,10 @@ KLDBootstrap::readBuiltinPersonalities(void) gIOCatalogue->addDrivers(allPersonalities, false); finish: - OSSafeRelease(parsedXML); - OSSafeRelease(allPersonalities); - OSSafeRelease(errorString); - OSSafeRelease(personalitiesIterator); + OSSafeReleaseNULL(parsedXML); + OSSafeReleaseNULL(allPersonalities); + OSSafeReleaseNULL(errorString); + OSSafeReleaseNULL(personalitiesIterator); return; } diff --git a/libsa/conf/Makefile b/libsa/conf/Makefile index 76db9a7d8..7bd79d9ae 100644 --- a/libsa/conf/Makefile +++ b/libsa/conf/Makefile @@ -37,7 +37,7 @@ do_all: $(TARGET)/$(CURRENT_KERNEL_CONFIG)/Makefile OBJPATH=${OBJPATH} \ build_all; -do_build_all:: do_all +do_build_all:: do_all include $(MakeInc_rule) include $(MakeInc_dir) diff --git a/libsa/conf/Makefile.template b/libsa/conf/Makefile.template index 
657ce25e7..ebfd0d2a8 100644 --- a/libsa/conf/Makefile.template +++ b/libsa/conf/Makefile.template @@ -21,7 +21,7 @@ CFLAGS+= -include meta_features.h -DLIBSA_KERNEL_PRIVATE # # Directories for mig generated files # -COMP_SUBDIRS = +COMP_SUBDIRS = # # Make sure we don't remove this by accident if interrupted at the wrong @@ -68,7 +68,7 @@ $(COMPONENT).filelist: $(OBJS) $(SEG_HACK) -n __KLD -o $${kld_file}__ $${kld_file} || exit 1; \ mv $${kld_file}__ $${kld_file} || exit 1; \ done - @echo LDFILELIST $(COMPONENT) + @echo "$(ColorL)LDFILELIST$(Color0) $(ColorLF)$(COMPONENT)$(Color0)" $(_v)for obj in ${OBJS}; do \ echo $(TARGET)/$(CURRENT_KERNEL_CONFIG)/$${obj}; \ done > $(COMPONENT).filelist diff --git a/libsyscall/Libsyscall.aliases b/libsyscall/Libsyscall.aliases deleted file mode 100644 index e69de29bb..000000000 diff --git a/libsyscall/Libsyscall.xcconfig b/libsyscall/Libsyscall.xcconfig index a1c2fc8d4..181fe1f4e 100644 --- a/libsyscall/Libsyscall.xcconfig +++ b/libsyscall/Libsyscall.xcconfig @@ -27,12 +27,12 @@ GCC_TREAT_WARNINGS_AS_ERRORS = YES GCC_WARN_ABOUT_MISSING_NEWLINE = YES CODE_SIGN_IDENTITY = - DYLIB_CURRENT_VERSION = $(RC_ProjectSourceVersion) -DYLIB_LDFLAGS = -umbrella System -all_load -Wl,-alias_list,$(SRCROOT)/Libsyscall.aliases +DYLIB_LDFLAGS = -umbrella System -all_load DYLIB_LDFLAGS[sdk=iphoneos*] = $(inherited) -Wl,-sectalign,__DATA,__data,1000 DYLIB_LDFLAGS[sdk=watchos*] = $(inherited) -Wl,-sectalign,__DATA,__data,1000 DYLIB_LDFLAGS[sdk=tvos*] = $(inherited) -Wl,-sectalign,__DATA,__data,1000 DYLIB_LDFLAGS[sdk=appletvos*] = $(inherited) -Wl,-sectalign,__DATA,__data,1000 -OTHER_LDFLAGS = +OTHER_LDFLAGS = INSTALLHDRS_SCRIPT_PHASE = YES INSTALLHDRS_COPY_PHASE = YES USE_HEADERMAP = NO diff --git a/libsyscall/Libsyscall.xcodeproj/project.pbxproj b/libsyscall/Libsyscall.xcodeproj/project.pbxproj index bdf83643f..1f54c0158 100644 --- a/libsyscall/Libsyscall.xcodeproj/project.pbxproj +++ b/libsyscall/Libsyscall.xcodeproj/project.pbxproj @@ -23,6 +23,9 
@@ isa = PBXAggregateTarget; buildConfigurationList = 249C61191194756B00ED73F3 /* Build configuration list for PBXAggregateTarget "Build" */; buildPhases = ( + BAFE90DF1C3A4D7B0012084F /* CopyFiles */, + BAFE90E11C3A4D9E0012084F /* CopyFiles */, + BAA2D2FB1C3B2CD90049DCBE /* CopyFiles */, ); dependencies = ( 249C61151194756A00ED73F3 /* PBXTargetDependency */, @@ -45,14 +48,15 @@ /* Begin PBXBuildFile section */ 030B179B135377B400DAD1F0 /* open_dprotected_np.c in Sources */ = {isa = PBXBuildFile; fileRef = 030B179A135377B400DAD1F0 /* open_dprotected_np.c */; }; + 139D584B1C7BDE41003D3B17 /* terminate_with_reason.c in Sources */ = {isa = PBXBuildFile; fileRef = 13D932CB1C7B9DE600158FA1 /* terminate_with_reason.c */; }; 13B598941A142F6400DB2D5A /* stackshot.c in Sources */ = {isa = PBXBuildFile; fileRef = 13B598931A142F5900DB2D5A /* stackshot.c */; }; + 14FE60EC1B7D3BF400ACB44C /* mach_get_times.c in Sources */ = {isa = PBXBuildFile; fileRef = 14FE60EB1B7D3BED00ACB44C /* mach_get_times.c */; }; 240BAC4C1214770F000A1719 /* memcpy.c in Sources */ = {isa = PBXBuildFile; fileRef = 24B028D511FF4FBB00CA64A9 /* memcpy.c */; }; 2419382B12135FF6003CDE41 /* chmod.c in Sources */ = {isa = PBXBuildFile; fileRef = 2419382A12135FF6003CDE41 /* chmod.c */; }; 242AB66611EBDC1200107336 /* errno.c in Sources */ = {isa = PBXBuildFile; fileRef = 242AB66511EBDC1200107336 /* errno.c */; }; 24484A7511F6178E00E10CD2 /* string.c in Sources */ = {isa = PBXBuildFile; fileRef = 24484A7411F51E9800E10CD2 /* string.c */; }; 24484A9411F61D2B00E10CD2 /* mig_reply_port.c in Sources */ = {isa = PBXBuildFile; fileRef = 24484A9311F61D1900E10CD2 /* mig_reply_port.c */; }; 24614F0411E7CB5B00E78584 /* syscalls.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 24614F0311E7CB5B00E78584 /* syscalls.a */; }; - 247A08C211F8BDC900E4693F /* _libkernel_init.c in Sources */ = {isa = PBXBuildFile; fileRef = 247A08B311F8B05900E4693F /* _libkernel_init.c */; }; 247A090011F8E18000E4693F /* abort.h in Headers */ = {isa 
= PBXBuildFile; fileRef = 247A08FF11F8E18000E4693F /* abort.h */; }; 247A091711F8E7A800E4693F /* exc_catcher.h in Headers */ = {isa = PBXBuildFile; fileRef = 247A091611F8E7A800E4693F /* exc_catcher.h */; }; 2485235511582D8F0051B413 /* mach_legacy.c in Sources */ = {isa = PBXBuildFile; fileRef = 2485235411582D8F0051B413 /* mach_legacy.c */; }; @@ -78,7 +82,6 @@ 248BA0BE121DE902008C073F /* select.c in Sources */ = {isa = PBXBuildFile; fileRef = 248BA0BC121DE902008C073F /* select.c */; }; 248BA0CD121DEBEF008C073F /* setrlimit.c in Sources */ = {isa = PBXBuildFile; fileRef = 248BA0CC121DEBEF008C073F /* setrlimit.c */; }; 249C610B1194750E00ED73F3 /* libsystem_kernel.a in Frameworks */ = {isa = PBXBuildFile; fileRef = D2AAC0630554660B00DB518D /* libsystem_kernel.a */; }; - 249C612F1194828600ED73F3 /* dylib_link.c in Sources */ = {isa = PBXBuildFile; fileRef = 249C612C1194827D00ED73F3 /* dylib_link.c */; }; 24A7C5BC11FF8DA6007669EB /* accept.c in Sources */ = {isa = PBXBuildFile; fileRef = 24A7C5AE11FF8DA6007669EB /* accept.c */; }; 24A7C5BD11FF8DA6007669EB /* bind.c in Sources */ = {isa = PBXBuildFile; fileRef = 24A7C5AF11FF8DA6007669EB /* bind.c */; }; 24A7C5BF11FF8DA6007669EB /* getattrlist.c in Sources */ = {isa = PBXBuildFile; fileRef = 24A7C5B111FF8DA6007669EB /* getattrlist.c */; }; @@ -106,22 +109,27 @@ 2BA88DCC1810A3CE00EB63F6 /* coalition.c in Sources */ = {isa = PBXBuildFile; fileRef = 2BA88DCB1810A3CE00EB63F6 /* coalition.c */; }; 374A36E314748F1300AAF39D /* varargs_wrappers.s in Sources */ = {isa = PBXBuildFile; fileRef = 374A36E214748EE400AAF39D /* varargs_wrappers.s */; }; 3F538F891A659C5600B37EFD /* persona.c in Sources */ = {isa = PBXBuildFile; fileRef = 3F538F881A659C5600B37EFD /* persona.c */; }; + 401BB71A1BCAE57B005080D3 /* os_channel.c in Sources */ = {isa = PBXBuildFile; fileRef = 401BB7161BCAE539005080D3 /* os_channel.c */; settings = {COMPILER_FLAGS = "-fno-builtin"; }; }; + 401BB71C1BCAE57B005080D3 /* os_nexus.c in Sources */ = {isa = 
PBXBuildFile; fileRef = 401BB7181BCAE539005080D3 /* os_nexus.c */; settings = {COMPILER_FLAGS = "-fno-builtin"; }; }; 435F3CAA1B06B7BA005ED9EF /* work_interval.c in Sources */ = {isa = PBXBuildFile; fileRef = 435F3CA91B06B7BA005ED9EF /* work_interval.c */; }; 467DAFD4157E8AF200CE68F0 /* guarded_open_np.c in Sources */ = {isa = PBXBuildFile; fileRef = 467DAFD3157E8AF200CE68F0 /* guarded_open_np.c */; }; 4BDD5F1D1891AB2F004BF300 /* mach_approximate_time.c in Sources */ = {isa = PBXBuildFile; fileRef = 4BDD5F1B1891AB2F004BF300 /* mach_approximate_time.c */; }; 4BDD5F1E1891AB2F004BF300 /* mach_approximate_time.s in Sources */ = {isa = PBXBuildFile; fileRef = 4BDD5F1C1891AB2F004BF300 /* mach_approximate_time.s */; }; 729B7D0A15C8938C000E2501 /* carbon_delete.c in Sources */ = {isa = PBXBuildFile; fileRef = FB50F1B315AB7DE700F814BA /* carbon_delete.c */; }; 72B1E6ED190723DB00FB3FA2 /* guarded_open_dprotected_np.c in Sources */ = {isa = PBXBuildFile; fileRef = 72B1E6EC190723DB00FB3FA2 /* guarded_open_dprotected_np.c */; }; + 72E09E941B444B19006F11A4 /* mach_continuous_time.c in Sources */ = {isa = PBXBuildFile; fileRef = 72FB18801B437F7A00181A5B /* mach_continuous_time.c */; }; 74119F46188F3B6A00C6F48F /* vm_page_size.h in Headers */ = {isa = PBXBuildFile; fileRef = 7466C923170CB99B004557CC /* vm_page_size.h */; }; 7466C924170CBA53004557CC /* vm_page_size.h in Headers */ = {isa = PBXBuildFile; fileRef = 7466C923170CB99B004557CC /* vm_page_size.h */; }; 74F3290B18EB269400B2B70E /* vm_page_size.h in CopyFiles */ = {isa = PBXBuildFile; fileRef = 7466C923170CB99B004557CC /* vm_page_size.h */; }; - 978228281B8678DC008385AC /* pselect-darwinext.c in Sources */ = {isa = PBXBuildFile; fileRef = 978228271B8678CB008385AC /* pselect-darwinext.c */; }; - 978228291B8678DF008385AC /* pselect-darwinext-cancel.c in Sources */ = {isa = PBXBuildFile; fileRef = 978228261B8678C2008385AC /* pselect-darwinext-cancel.c */; }; 7AE28FDF18AC41B1006A5626 /* csr.c in Sources */ = {isa = 
PBXBuildFile; fileRef = 7AE28FDE18AC41B1006A5626 /* csr.c */; }; - 9002401118FC9A7F00D73BFA /* rename_ext.c in Sources */ = {isa = PBXBuildFile; fileRef = 906AA2D018F74CD1001C681A /* rename_ext.c */; }; + 9002401118FC9A7F00D73BFA /* renamex.c in Sources */ = {isa = PBXBuildFile; fileRef = 906AA2D018F74CD1001C681A /* renamex.c */; }; + 925559921CBC23C300E527CE /* mach_boottime.c in Sources */ = {isa = PBXBuildFile; fileRef = 925559911CBBBBB300E527CE /* mach_boottime.c */; }; 928336A11B83ED9100873B90 /* thread_register_state.c in Sources */ = {isa = PBXBuildFile; fileRef = 928336A01B83ED7800873B90 /* thread_register_state.c */; }; 9299E14A1B841E74005B7350 /* thread_state.h in Headers */ = {isa = PBXBuildFile; fileRef = 928336A21B8412C100873B90 /* thread_state.h */; }; 9299E14B1B841F59005B7350 /* thread_state.h in Headers */ = {isa = PBXBuildFile; fileRef = 928336A21B8412C100873B90 /* thread_state.h */; }; + 929FD46F1C5711DB0087B9C8 /* mach_timebase_info.c in Sources */ = {isa = PBXBuildFile; fileRef = 929FD46E1C5711CF0087B9C8 /* mach_timebase_info.c */; }; + 978228281B8678DC008385AC /* pselect-darwinext.c in Sources */ = {isa = PBXBuildFile; fileRef = 978228271B8678CB008385AC /* pselect-darwinext.c */; }; + 978228291B8678DF008385AC /* pselect-darwinext-cancel.c in Sources */ = {isa = PBXBuildFile; fileRef = 978228261B8678C2008385AC /* pselect-darwinext-cancel.c */; }; A59CB95616669EFB00B064B3 /* stack_logging_internal.h in Headers */ = {isa = PBXBuildFile; fileRef = A59CB95516669DB700B064B3 /* stack_logging_internal.h */; }; A59CB9581666A1A200B064B3 /* munmap.c in Sources */ = {isa = PBXBuildFile; fileRef = A59CB9571666A1A200B064B3 /* munmap.c */; }; BA0D9FB1199031AD007E8A73 /* kdebug_trace.c in Sources */ = {isa = PBXBuildFile; fileRef = BA0D9FB0199031AD007E8A73 /* kdebug_trace.c */; }; @@ -131,7 +139,12 @@ BA4414B518336E3600AAE813 /* mach in Copy Files */ = {isa = PBXBuildFile; fileRef = BA4414A51833697C00AAE813 /* mach */; }; BA4414B618336E3A00AAE813 /* servers in 
Copy Files */ = {isa = PBXBuildFile; fileRef = BA4414A6183369A100AAE813 /* servers */; }; BA4414B818336E6F00AAE813 /* mach in CopyFiles */ = {isa = PBXBuildFile; fileRef = BA4414A7183369C100AAE813 /* mach */; }; + BA9973471C3B4C9A00B14D8C /* quota_obsolete.c in Sources */ = {isa = PBXBuildFile; fileRef = BA9973461C3B4C8A00B14D8C /* quota_obsolete.c */; }; + BAA2D2FC1C3B2CE90049DCBE /* libsystem_kernel.a in CopyFiles */ = {isa = PBXBuildFile; fileRef = D2AAC0630554660B00DB518D /* libsystem_kernel.a */; }; BABA36CB1A856C4700BBBCF7 /* host.c in Sources */ = {isa = PBXBuildFile; fileRef = BABA36CA1A856C4700BBBCF7 /* host.c */; }; + BAFE90DE1C3A4D2D0012084F /* _libkernel_init.c in Sources */ = {isa = PBXBuildFile; fileRef = 247A08B311F8B05900E4693F /* _libkernel_init.c */; }; + BAFE90E01C3A4D960012084F /* libsystem_kernel.a in CopyFiles */ = {isa = PBXBuildFile; fileRef = D2AAC0630554660B00DB518D /* libsystem_kernel.a */; }; + BAFE90E21C3A4DB00012084F /* libsystem_kernel.a in CopyFiles */ = {isa = PBXBuildFile; fileRef = D2AAC0630554660B00DB518D /* libsystem_kernel.a */; }; C639F0E51741C25800A39F47 /* gethostuuid.h in Headers */ = {isa = PBXBuildFile; fileRef = C639F0E41741C09A00A39F47 /* gethostuuid.h */; settings = {ATTRIBUTES = (Public, ); }; }; C6460B7C182025DF00F73CCA /* sfi.c in Sources */ = {isa = PBXBuildFile; fileRef = C6460B7B182025DF00F73CCA /* sfi.c */; }; C6AB38DB174202C10036DD9F /* gethostuuid.h in Headers */ = {isa = PBXBuildFile; fileRef = C639F0E41741C09A00A39F47 /* gethostuuid.h */; settings = {ATTRIBUTES = (Public, ); }; }; @@ -216,6 +229,8 @@ C9D9BD58114B00600000D8B9 /* thread_act.defs in Sources */ = {isa = PBXBuildFile; fileRef = C9D9BD10114B00600000D8B9 /* thread_act.defs */; }; C9D9BD59114B00600000D8B9 /* vm_map.defs in Sources */ = {isa = PBXBuildFile; fileRef = C9D9BD11114B00600000D8B9 /* vm_map.defs */; }; C9FD8508166D6BD400963B73 /* tsd.h in CopyFiles */ = {isa = PBXBuildFile; fileRef = C9EE57F51669673D00337E4B /* tsd.h */; }; + 
E214BDC81C2E358300CEE8A3 /* clonefile.c in Sources */ = {isa = PBXBuildFile; fileRef = E214BDC71C2E34E200CEE8A3 /* clonefile.c */; }; + E2A0F3341C3B17D100A11F8A /* fs_snapshot.c in Sources */ = {isa = PBXBuildFile; fileRef = E2A0F3331C3B17D100A11F8A /* fs_snapshot.c */; }; E4216C311822D404006F2632 /* mach_voucher.defs in Sources */ = {isa = PBXBuildFile; fileRef = E4216C301822D404006F2632 /* mach_voucher.defs */; }; E453AF351700FD3C00F2C94C /* getiopolicy_np.c in Sources */ = {isa = PBXBuildFile; fileRef = E453AF341700FD3C00F2C94C /* getiopolicy_np.c */; }; E453AF3617013CBF00F2C94C /* libproc.h in Headers */ = {isa = PBXBuildFile; fileRef = E4D45C2B16F868ED0002AF25 /* libproc.h */; settings = {ATTRIBUTES = (Public, ); }; }; @@ -326,6 +341,36 @@ ); runOnlyForDeploymentPostprocessing = 1; }; + BAA2D2FB1C3B2CD90049DCBE /* CopyFiles */ = { + isa = PBXCopyFilesBuildPhase; + buildActionMask = 8; + dstPath = /usr/local/lib/system; + dstSubfolderSpec = 0; + files = ( + BAA2D2FC1C3B2CE90049DCBE /* libsystem_kernel.a in CopyFiles */, + ); + runOnlyForDeploymentPostprocessing = 1; + }; + BAFE90DF1C3A4D7B0012084F /* CopyFiles */ = { + isa = PBXCopyFilesBuildPhase; + buildActionMask = 8; + dstPath = /usr/local/lib/dyld_stub; + dstSubfolderSpec = 0; + files = ( + BAFE90E01C3A4D960012084F /* libsystem_kernel.a in CopyFiles */, + ); + runOnlyForDeploymentPostprocessing = 1; + }; + BAFE90E11C3A4D9E0012084F /* CopyFiles */ = { + isa = PBXCopyFilesBuildPhase; + buildActionMask = 8; + dstPath = /usr/local/lib/loaderd; + dstSubfolderSpec = 0; + files = ( + BAFE90E21C3A4DB00012084F /* libsystem_kernel.a in CopyFiles */, + ); + runOnlyForDeploymentPostprocessing = 1; + }; C63F480B1654203800A1F78F /* CopyFiles */ = { isa = PBXCopyFilesBuildPhase; buildActionMask = 8; @@ -351,6 +396,8 @@ /* Begin PBXFileReference section */ 030B179A135377B400DAD1F0 /* open_dprotected_np.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = open_dprotected_np.c; 
sourceTree = ""; }; 13B598931A142F5900DB2D5A /* stackshot.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = stackshot.c; sourceTree = ""; }; + 13D932CB1C7B9DE600158FA1 /* terminate_with_reason.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = terminate_with_reason.c; sourceTree = ""; }; + 14FE60EB1B7D3BED00ACB44C /* mach_get_times.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = mach_get_times.c; sourceTree = ""; }; 240D716711933ED300556E97 /* mach_install_mig.sh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.script.sh; path = mach_install_mig.sh; sourceTree = ""; }; 2419382A12135FF6003CDE41 /* chmod.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = chmod.c; sourceTree = ""; }; 242AB66511EBDC1200107336 /* errno.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = errno.c; sourceTree = ""; }; @@ -388,7 +435,6 @@ 248BA0BC121DE902008C073F /* select.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = select.c; sourceTree = ""; }; 248BA0CC121DEBEF008C073F /* setrlimit.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = setrlimit.c; sourceTree = ""; }; 249C60FF1194747600ED73F3 /* libsystem_kernel.dylib */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.dylib"; includeInIndex = 0; path = libsystem_kernel.dylib; sourceTree = BUILT_PRODUCTS_DIR; }; - 249C612C1194827D00ED73F3 /* dylib_link.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = dylib_link.c; sourceTree = ""; }; 24A7C5AE11FF8DA6007669EB /* accept.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = accept.c; sourceTree = ""; }; 24A7C5AF11FF8DA6007669EB /* bind.c */ = {isa = PBXFileReference; fileEncoding 
= 4; lastKnownFileType = sourcecode.c.c; path = bind.c; sourceTree = ""; }; 24A7C5B111FF8DA6007669EB /* getattrlist.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = getattrlist.c; sourceTree = ""; }; @@ -423,7 +469,6 @@ 24D1157311E671B20063D54D /* custom.s */ = {isa = PBXFileReference; fileEncoding = 4; indentWidth = 8; lastKnownFileType = sourcecode.asm; path = custom.s; sourceTree = ""; tabWidth = 8; }; 24D1157411E671B20063D54D /* SYS.h */ = {isa = PBXFileReference; fileEncoding = 4; indentWidth = 8; lastKnownFileType = sourcecode.c.h; path = SYS.h; sourceTree = ""; tabWidth = 8; }; 24D1158C11E672270063D54D /* syscall.map */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = syscall.map; sourceTree = ""; }; - 24D1158F11E672270063D54D /* syscall.map */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = syscall.map; sourceTree = ""; }; 24D1159111E672270063D54D /* syscall.map */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = syscall.map; sourceTree = ""; }; 24D1159711E672270063D54D /* syscall.map */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = syscall.map; sourceTree = ""; }; 24D1159811E672270063D54D /* syscall.map */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = syscall.map; sourceTree = ""; }; @@ -437,18 +482,23 @@ 374A36E214748EE400AAF39D /* varargs_wrappers.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = varargs_wrappers.s; sourceTree = ""; }; 37DDFB7614748713009D3355 /* syscall.map */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = syscall.map; sourceTree = ""; }; 3F538F881A659C5600B37EFD /* persona.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = persona.c; sourceTree = ""; }; + 401BB7161BCAE539005080D3 /* os_channel.c */ = {isa = PBXFileReference; 
fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = os_channel.c; path = skywalk/os_channel.c; sourceTree = ""; }; + 401BB7181BCAE539005080D3 /* os_nexus.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = os_nexus.c; path = skywalk/os_nexus.c; sourceTree = ""; }; 435F3CA91B06B7BA005ED9EF /* work_interval.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = work_interval.c; sourceTree = ""; }; 467DAFD3157E8AF200CE68F0 /* guarded_open_np.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = guarded_open_np.c; sourceTree = ""; }; 4BDD5F1B1891AB2F004BF300 /* mach_approximate_time.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = mach_approximate_time.c; sourceTree = ""; }; 4BDD5F1C1891AB2F004BF300 /* mach_approximate_time.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = mach_approximate_time.s; sourceTree = ""; }; 72B1E6EC190723DB00FB3FA2 /* guarded_open_dprotected_np.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = guarded_open_dprotected_np.c; sourceTree = ""; }; + 72FB18801B437F7A00181A5B /* mach_continuous_time.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = mach_continuous_time.c; sourceTree = ""; }; 7466C923170CB99B004557CC /* vm_page_size.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = vm_page_size.h; sourceTree = ""; }; - 978228261B8678C2008385AC /* pselect-darwinext-cancel.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = "pselect-darwinext-cancel.c"; sourceTree = ""; }; - 978228271B8678CB008385AC /* pselect-darwinext.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = "pselect-darwinext.c"; sourceTree = ""; }; 7AE28FDE18AC41B1006A5626 /* csr.c */ = 
{isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = csr.c; sourceTree = ""; }; - 906AA2D018F74CD1001C681A /* rename_ext.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = rename_ext.c; sourceTree = ""; }; + 906AA2D018F74CD1001C681A /* renamex.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = renamex.c; sourceTree = ""; }; + 925559911CBBBBB300E527CE /* mach_boottime.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = mach_boottime.c; sourceTree = ""; }; 928336A01B83ED7800873B90 /* thread_register_state.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = thread_register_state.c; sourceTree = ""; }; 928336A21B8412C100873B90 /* thread_state.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = thread_state.h; sourceTree = ""; }; + 929FD46E1C5711CF0087B9C8 /* mach_timebase_info.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = mach_timebase_info.c; sourceTree = ""; }; + 978228261B8678C2008385AC /* pselect-darwinext-cancel.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = "pselect-darwinext-cancel.c"; sourceTree = ""; }; + 978228271B8678CB008385AC /* pselect-darwinext.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = "pselect-darwinext.c"; sourceTree = ""; }; A59CB95516669DB700B064B3 /* stack_logging_internal.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = stack_logging_internal.h; sourceTree = ""; }; A59CB9571666A1A200B064B3 /* munmap.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = munmap.c; sourceTree = ""; }; BA0D9FB0199031AD007E8A73 /* kdebug_trace.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = 
sourcecode.c.c; path = kdebug_trace.c; sourceTree = ""; }; @@ -456,6 +506,7 @@ BA4414A6183369A100AAE813 /* servers */ = {isa = PBXFileReference; lastKnownFileType = text; name = servers; path = mig_hdr/include/servers; sourceTree = BUILT_PRODUCTS_DIR; }; BA4414A7183369C100AAE813 /* mach */ = {isa = PBXFileReference; lastKnownFileType = text; name = mach; path = mig_hdr/local/include/mach; sourceTree = BUILT_PRODUCTS_DIR; }; BA5CDB4018AEBAD500E37982 /* __thread_selfusage.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = __thread_selfusage.s; sourceTree = ""; }; + BA9973461C3B4C8A00B14D8C /* quota_obsolete.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = quota_obsolete.c; sourceTree = ""; }; BABA36CA1A856C4700BBBCF7 /* host.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = host.c; sourceTree = ""; }; C639F0E41741C09A00A39F47 /* gethostuuid.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = gethostuuid.h; sourceTree = ""; }; C6460B7B182025DF00F73CCA /* sfi.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = sfi.c; sourceTree = ""; }; @@ -464,6 +515,7 @@ C6C40121174154D9000AE69F /* gethostuuid_private.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = gethostuuid_private.h; sourceTree = ""; }; C6D3F02E16542C510052CF30 /* libsystem_Libsyscall_headers_Sim.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libsystem_Libsyscall_headers_Sim.a; sourceTree = BUILT_PRODUCTS_DIR; }; C6D3F02F16542C980052CF30 /* dummy.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = dummy.c; sourceTree = ""; }; + C93B50491C487698009DD6AB /* __kdebug_trace_string.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = __kdebug_trace_string.s; sourceTree = ""; }; 
C962B16B18DBA2C80031244A /* setpriority.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = setpriority.c; sourceTree = ""; }; C962B16D18DBB43F0031244A /* thread_act.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = thread_act.c; sourceTree = ""; }; C99A4F4E1305B1B70054B7B7 /* __get_cpu_capabilities.s */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.asm; path = __get_cpu_capabilities.s; sourceTree = ""; }; @@ -531,7 +583,8 @@ C9D9BE0F114FFADC0000D8B9 /* Libsyscall.xcconfig */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.xcconfig; path = Libsyscall.xcconfig; sourceTree = ""; }; C9EE57F51669673D00337E4B /* tsd.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = tsd.h; sourceTree = ""; }; D2AAC0630554660B00DB518D /* libsystem_kernel.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libsystem_kernel.a; sourceTree = BUILT_PRODUCTS_DIR; }; - E40C845216FAFB3F00C238DD /* Libsyscall.aliases */ = {isa = PBXFileReference; lastKnownFileType = text; path = Libsyscall.aliases; sourceTree = ""; }; + E214BDC71C2E34E200CEE8A3 /* clonefile.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = clonefile.c; sourceTree = ""; }; + E2A0F3331C3B17D100A11F8A /* fs_snapshot.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = fs_snapshot.c; sourceTree = ""; }; E4216C301822D404006F2632 /* mach_voucher.defs */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.mig; path = mach_voucher.defs; sourceTree = ""; }; E453AF341700FD3C00F2C94C /* getiopolicy_np.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = getiopolicy_np.c; sourceTree = ""; }; E4D45C2116F856900002AF25 /* __commpage_gettimeofday.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = 
sourcecode.c.c; path = __commpage_gettimeofday.c; sourceTree = ""; }; @@ -580,7 +633,6 @@ isa = PBXGroup; children = ( C9D9BE0F114FFADC0000D8B9 /* Libsyscall.xcconfig */, - E40C845216FAFB3F00C238DD /* Libsyscall.aliases */, 24D1158911E672270063D54D /* Platforms */, 24D1156511E671B20063D54D /* custom */, 08FB7795FE84155DC02AAC07 /* mach */, @@ -657,7 +709,6 @@ C962B16D18DBB43F0031244A /* thread_act.c */, C9D9BD10114B00600000D8B9 /* thread_act.defs */, C9D9BD11114B00600000D8B9 /* vm_map.defs */, - 249C612C1194827D00ED73F3 /* dylib_link.c */, ); path = mach; sourceTree = ""; @@ -699,53 +750,62 @@ 247A08B011F8AF1700E4693F /* wrappers */ = { isa = PBXGroup; children = ( - 248BA04A121C8EE4008C073F /* cancelable */, - 24A7C6951200AF8A007669EB /* legacy */, - E4D45C2916F868ED0002AF25 /* libproc */, - E4D45C3B16FB20970002AF25 /* spawn */, - E4D7E55216F8776300F92D8D /* string */, - 2419382912135FE1003CDE41 /* unix03 */, E4D45C2116F856900002AF25 /* __commpage_gettimeofday.c */, - E4D45C2216F856900002AF25 /* __commpage_gettimeofday.s */, - C99A4F4E1305B1B70054B7B7 /* __get_cpu_capabilities.s */, - 24A7C5CB11FF973C007669EB /* _errno.h */, 24E47824120881DF009A384D /* _libc_funcptr.c */, 247A08B311F8B05900E4693F /* _libkernel_init.c */, - 247A08B211F8B05900E4693F /* _libkernel_init.h */, FB50F1B315AB7DE700F814BA /* carbon_delete.c */, + E214BDC71C2E34E200CEE8A3 /* clonefile.c */, 2BA88DCB1810A3CE00EB63F6 /* coalition.c */, 7AE28FDE18AC41B1006A5626 /* csr.c */, - C6C40121174154D9000AE69F /* gethostuuid_private.h */, + E2A0F3331C3B17D100A11F8A /* fs_snapshot.c */, C6C4012017415384000AE69F /* gethostuuid.c */, - C639F0E41741C09A00A39F47 /* gethostuuid.h */, E453AF341700FD3C00F2C94C /* getiopolicy_np.c */, - 467DAFD3157E8AF200CE68F0 /* guarded_open_np.c */, 72B1E6EC190723DB00FB3FA2 /* guarded_open_dprotected_np.c */, + 467DAFD3157E8AF200CE68F0 /* guarded_open_np.c */, C99A4F511305B43F0054B7B7 /* init_cpu_capabilities.c */, 248BA07F121DA36B008C073F /* ioctl.c */, + 
BA0D9FB0199031AD007E8A73 /* kdebug_trace.c */, 248BA081121DA4F3008C073F /* kill.c */, - E4D45C2316F856900002AF25 /* mach_absolute_time.s */, 4BDD5F1B1891AB2F004BF300 /* mach_approximate_time.c */, - 4BDD5F1C1891AB2F004BF300 /* mach_approximate_time.s */, + 925559911CBBBBB300E527CE /* mach_boottime.c */, + 72FB18801B437F7A00181A5B /* mach_continuous_time.c */, + 14FE60EB1B7D3BED00ACB44C /* mach_get_times.c */, + 929FD46E1C5711CF0087B9C8 /* mach_timebase_info.c */, 030B179A135377B400DAD1F0 /* open_dprotected_np.c */, 3F538F881A659C5600B37EFD /* persona.c */, C6BEE9171806840200D25AAB /* posix_sem_obsolete.c */, + BA9973461C3B4C8A00B14D8C /* quota_obsolete.c */, 24B8C2611237F53900D36CC3 /* remove-counter.c */, 248AA966122C7CDA0085F5B1 /* rename.c */, 29A59AE1183B0DE000E8B896 /* renameat.c */, - 906AA2D018F74CD1001C681A /* rename_ext.c */, + 906AA2D018F74CD1001C681A /* renamex.c */, 248AA964122C7C330085F5B1 /* rmdir.c */, 248BA090121DDD7F008C073F /* select-base.c */, C962B16B18DBA2C80031244A /* setpriority.c */, C6460B7B182025DF00F73CCA /* sfi.c */, 24B223B3121DFF12007DAEDE /* sigsuspend-base.c */, 13B598931A142F5900DB2D5A /* stackshot.c */, + 13D932CB1C7B9DE600158FA1 /* terminate_with_reason.c */, 928336A01B83ED7800873B90 /* thread_register_state.c */, 248AA962122C7B2A0085F5B1 /* unlink.c */, 29A59AE5183B110C00E8B896 /* unlinkat.c */, - 374A36E214748EE400AAF39D /* varargs_wrappers.s */, - BA0D9FB0199031AD007E8A73 /* kdebug_trace.c */, 435F3CA91B06B7BA005ED9EF /* work_interval.c */, + 24A7C5CB11FF973C007669EB /* _errno.h */, + 247A08B211F8B05900E4693F /* _libkernel_init.h */, + C6C40121174154D9000AE69F /* gethostuuid_private.h */, + C639F0E41741C09A00A39F47 /* gethostuuid.h */, + E4D45C2216F856900002AF25 /* __commpage_gettimeofday.s */, + C99A4F4E1305B1B70054B7B7 /* __get_cpu_capabilities.s */, + E4D45C2316F856900002AF25 /* mach_absolute_time.s */, + 4BDD5F1C1891AB2F004BF300 /* mach_approximate_time.s */, + 374A36E214748EE400AAF39D /* varargs_wrappers.s */, + 
248BA04A121C8EE4008C073F /* cancelable */, + 24A7C6951200AF8A007669EB /* legacy */, + E4D45C2916F868ED0002AF25 /* libproc */, + 401BB7141BCAE523005080D3 /* skywalk */, + E4D45C3B16FB20970002AF25 /* spawn */, + E4D7E55216F8776300F92D8D /* string */, + 2419382912135FE1003CDE41 /* unix03 */, ); path = wrappers; sourceTree = ""; @@ -799,10 +859,10 @@ 24D1156511E671B20063D54D /* custom */ = { isa = PBXGroup; children = ( - C6D3F02F16542C980052CF30 /* dummy.c */, 24D1156611E671B20063D54D /* __fork.s */, 24D1156711E671B20063D54D /* __getpid.s */, 24D1156811E671B20063D54D /* __gettimeofday.s */, + C93B50491C487698009DD6AB /* __kdebug_trace_string.s */, 24D1156911E671B20063D54D /* __lseek.s */, 24D1156A11E671B20063D54D /* __pipe.s */, 24D1156D11E671B20063D54D /* __ptrace.s */, @@ -813,8 +873,9 @@ BA5CDB4018AEBAD500E37982 /* __thread_selfusage.s */, 24D1157211E671B20063D54D /* __vfork.s */, 24D1157311E671B20063D54D /* custom.s */, - 24D1157411E671B20063D54D /* SYS.h */, + C6D3F02F16542C980052CF30 /* dummy.c */, 242AB66511EBDC1200107336 /* errno.c */, + 24D1157411E671B20063D54D /* SYS.h */, ); path = custom; sourceTree = ""; @@ -849,21 +910,12 @@ 24D1158D11E672270063D54D /* MacOSX */ = { isa = PBXGroup; children = ( - 24D1158E11E672270063D54D /* arm */, 24D1159011E672270063D54D /* i386 */, 24D1159611E672270063D54D /* x86_64 */, ); path = MacOSX; sourceTree = ""; }; - 24D1158E11E672270063D54D /* arm */ = { - isa = PBXGroup; - children = ( - 24D1158F11E672270063D54D /* syscall.map */, - ); - path = arm; - sourceTree = ""; - }; 24D1159011E672270063D54D /* i386 */ = { isa = PBXGroup; children = ( @@ -888,6 +940,15 @@ path = arm64; sourceTree = ""; }; + 401BB7141BCAE523005080D3 /* skywalk */ = { + isa = PBXGroup; + children = ( + 401BB7161BCAE539005080D3 /* os_channel.c */, + 401BB7181BCAE539005080D3 /* os_nexus.c */, + ); + name = skywalk; + sourceTree = ""; + }; BA4414B118336D6A00AAE813 /* Generated MIG headers */ = { isa = PBXGroup; children = ( @@ -1199,6 +1260,7 @@ files = ( 
24E4782712088267009A384D /* _libc_funcptr.c in Sources */, E4D7E56216F8776300F92D8D /* strlen.c in Sources */, + BAFE90DE1C3A4D2D0012084F /* _libkernel_init.c in Sources */, E4D7E55C16F8776300F92D8D /* index.c in Sources */, E4D7E55F16F8776300F92D8D /* strcmp.c in Sources */, E4D7E55E16F8776300F92D8D /* memset.c in Sources */, @@ -1206,7 +1268,6 @@ E4D7E56316F8776300F92D8D /* strsep.c in Sources */, E4D7E56016F8776300F92D8D /* strcpy.c in Sources */, E4D7E56116F8776300F92D8D /* strlcpy.c in Sources */, - 249C612F1194828600ED73F3 /* dylib_link.c in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; @@ -1222,6 +1283,7 @@ isa = PBXSourcesBuildPhase; buildActionMask = 2147483647; files = ( + E214BDC81C2E358300CEE8A3 /* clonefile.c in Sources */, C9D9BD19114B00600000D8B9 /* clock_priv.defs in Sources */, C9D9BD1A114B00600000D8B9 /* clock_reply.defs in Sources */, C9D9BD1C114B00600000D8B9 /* clock.defs in Sources */, @@ -1238,6 +1300,7 @@ C9D9BD53114B00600000D8B9 /* netname.defs in Sources */, C9D9BD57114B00600000D8B9 /* task.defs in Sources */, C9D9BD58114B00600000D8B9 /* thread_act.defs in Sources */, + 72E09E941B444B19006F11A4 /* mach_continuous_time.c in Sources */, 29A59AE6183B110C00E8B896 /* unlinkat.c in Sources */, C9D9BD59114B00600000D8B9 /* vm_map.defs in Sources */, C9D9BD1B114B00600000D8B9 /* clock_sleep.c in Sources */, @@ -1258,6 +1321,7 @@ C9D9BD41114B00600000D8B9 /* mig_allocate.c in Sources */, E4D45C2516F856900002AF25 /* __commpage_gettimeofday.s in Sources */, C9D9BD42114B00600000D8B9 /* mig_deallocate.c in Sources */, + BA9973471C3B4C9A00B14D8C /* quota_obsolete.c in Sources */, E4D45C2416F856900002AF25 /* __commpage_gettimeofday.c in Sources */, C9D9BD43114B00600000D8B9 /* mig_reply_setup.c in Sources */, 24484A9411F61D2B00E10CD2 /* mig_reply_port.c in Sources */, @@ -1273,7 +1337,6 @@ 2485235511582D8F0051B413 /* mach_legacy.c in Sources */, 242AB66611EBDC1200107336 /* errno.c in Sources */, E4D45C2E16F868ED0002AF25 /* libproc.c in Sources */, 
- 247A08C211F8BDC900E4693F /* _libkernel_init.c in Sources */, 24A7C5BC11FF8DA6007669EB /* accept.c in Sources */, 24A7C5BD11FF8DA6007669EB /* bind.c in Sources */, 4BDD5F1D1891AB2F004BF300 /* mach_approximate_time.c in Sources */, @@ -1281,6 +1344,7 @@ 24A7C5BF11FF8DA6007669EB /* getattrlist.c in Sources */, 24A7C5C011FF8DA6007669EB /* getpeername.c in Sources */, 24A7C5C111FF8DA6007669EB /* getsockname.c in Sources */, + 925559921CBC23C300E527CE /* mach_boottime.c in Sources */, 24A7C5C211FF8DA6007669EB /* lchown.c in Sources */, 24A7C5C311FF8DA6007669EB /* listen.c in Sources */, 24A7C5C411FF8DA6007669EB /* recvfrom.c in Sources */, @@ -1291,7 +1355,7 @@ 24A7C5C811FF8DA6007669EB /* setattrlist.c in Sources */, 24A7C5C911FF8DA6007669EB /* socketpair.c in Sources */, 928336A11B83ED9100873B90 /* thread_register_state.c in Sources */, - 9002401118FC9A7F00D73BFA /* rename_ext.c in Sources */, + 9002401118FC9A7F00D73BFA /* renamex.c in Sources */, 2419382B12135FF6003CDE41 /* chmod.c in Sources */, 248BA01D121C56BF008C073F /* connect.c in Sources */, 248BA01F121C607E008C073F /* fchmod.c in Sources */, @@ -1299,10 +1363,13 @@ 13B598941A142F6400DB2D5A /* stackshot.c in Sources */, C962B16C18DBA2C80031244A /* setpriority.c in Sources */, 248BA04F121C8F06008C073F /* fcntl.c in Sources */, + 14FE60EC1B7D3BF400ACB44C /* mach_get_times.c in Sources */, + 139D584B1C7BDE41003D3B17 /* terminate_with_reason.c in Sources */, 248BA05C121C9649008C073F /* fcntl-cancel.c in Sources */, 248BA069121D9E27008C073F /* getrlimit.c in Sources */, C6460B7C182025DF00F73CCA /* sfi.c in Sources */, 248BA080121DA36B008C073F /* ioctl.c in Sources */, + 401BB71A1BCAE57B005080D3 /* os_channel.c in Sources */, C6BEE9181806840200D25AAB /* posix_sem_obsolete.c in Sources */, 248BA082121DA4F3008C073F /* kill.c in Sources */, 248BA085121DA5E4008C073F /* kill.c in Sources */, @@ -1313,6 +1380,8 @@ 248BA08B121DAC86008C073F /* msync.c in Sources */, 248BA08D121DB0E7008C073F /* munmap.c in Sources */, 
248BA08F121DC545008C073F /* open.c in Sources */, + E2A0F3341C3B17D100A11F8A /* fs_snapshot.c in Sources */, + 929FD46F1C5711DB0087B9C8 /* mach_timebase_info.c in Sources */, 248BA093121DE369008C073F /* select.c in Sources */, 248BA095121DE565008C073F /* select-pre1050.c in Sources */, BA0D9FB1199031AD007E8A73 /* kdebug_trace.c in Sources */, @@ -1321,6 +1390,7 @@ 248BA0BE121DE902008C073F /* select.c in Sources */, 248BA0CD121DEBEF008C073F /* setrlimit.c in Sources */, 24B223B0121DFD36007DAEDE /* sigsuspend.c in Sources */, + 401BB71C1BCAE57B005080D3 /* os_nexus.c in Sources */, 24B223B2121DFE6D007DAEDE /* sigsuspend-cancel.c in Sources */, E4216C311822D404006F2632 /* mach_voucher.defs in Sources */, 24B223B5121DFF29007DAEDE /* sigsuspend.c in Sources */, @@ -1433,6 +1503,8 @@ isa = XCBuildConfiguration; baseConfigurationReference = C9D9BE0F114FFADC0000D8B9 /* Libsyscall.xcconfig */; buildSettings = { + COPY_PHASE_STRIP = NO; + INSTALLHDRS_COPY_PHASE = NO; PRODUCT_NAME = Build; STRIP_STYLE = debugging; }; diff --git a/libsyscall/custom/SYS.h b/libsyscall/custom/SYS.h index 46e3735d9..2c66bbf4d 100644 --- a/libsyscall/custom/SYS.h +++ b/libsyscall/custom/SYS.h @@ -48,15 +48,6 @@ #include -/* Binary compatibility stubs for syscalls that no longer exist */ - -#ifndef SYS_setquota -#define SYS_setquota 148 -#endif -#ifndef SYS_quota -#define SYS_quota 149 -#endif - #if defined(__i386__) #include diff --git a/libsyscall/custom/__gettimeofday.s b/libsyscall/custom/__gettimeofday.s index 0076f49ce..8712a2094 100644 --- a/libsyscall/custom/__gettimeofday.s +++ b/libsyscall/custom/__gettimeofday.s @@ -1,8 +1,8 @@ /* - * Copyright (c) 1999-2007 Apple Inc. All rights reserved. + * Copyright (c) 1999-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). 
You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,36 +22,73 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -/* Copyright 1998 Apple Computer, Inc. */ #include "SYS.h" -#if defined(__i386__) - /* - * This syscall is special cased: the timeval is returned in eax/edx. + * A third argument, of type uint64_t*, was added to the gettimeofday syscall + * for use cases that also want to know the mach_absolute_time that matches the + * time value returned. + * + * __gettimeofday takes the traditional two arguments. It will zero out the + * third argument before entering the kernel, behaving like the old + * call. + * + * __gettimeofday_with_mach will pass it through and supporting kernels will + * copy-out the mach_absolute_time. Old kernels will leave the pointed to + * value alone. */ + +.private_extern ___gettimeofday_with_mach + +#if defined(__i386__) + LABEL(___gettimeofday) + pushl $0 + pushl 12(%esp) + pushl 12(%esp) + calll ___gettimeofday_with_mach + addl $12, %esp + ret + +LABEL(___gettimeofday_with_mach) UNIX_SYSCALL_INT_NONAME(gettimeofday,0) - mov 4(%esp),%ecx - mov %eax,(%ecx) - mov %edx,4(%ecx) - xor %eax,%eax - ret + /* + * + * If eax is 0, we're on a new kernel and timeval was written by the kernel. 
+ * Otherwise, eax:edx contains the timeval and we marshal into timeval. + */ + cmp $0, %eax + je 2f + mov 4(%esp),%ecx + mov %eax,(%ecx) + mov %edx,4(%ecx) + xor %eax,%eax +2: + ret #elif defined(__x86_64__) -/* - * This syscall is special cased: the timeval is returned in rax:rdx. - */ +__SYSCALL(___gettimeofday_with_mach, gettimeofday, 3) + LABEL(___gettimeofday) + movq $0x0, %rdx // zero out third argument + UNIX_SYSCALL_NONAME(gettimeofday,0,cerror_nocancel) - movq %rax, (%rdi) - movl %edx, 8(%rdi) - xorl %eax, %eax + /* + * + * If rax is 0, we're on a new kernel and timeval was written by the kernel. + * Otherwise, rax:rdx contains the timeval and we marshal into timeval. + */ + cmp $0, %rax + je 2f + movq %rax, (%rdi) + movl %edx, 8(%rdi) + xorl %eax, %eax +2: ret #else diff --git a/libsyscall/custom/__kdebug_trace_string.s b/libsyscall/custom/__kdebug_trace_string.s new file mode 100644 index 000000000..e3543e6bf --- /dev/null +++ b/libsyscall/custom/__kdebug_trace_string.s @@ -0,0 +1,41 @@ +/* + * Copyright (c) 1999-2007 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include "SYS.h" + +#if defined(__x86_64__) + +__SYSCALL(___kdebug_trace_string, kdebug_trace_string, 3) + +#elif defined(__i386__) + +__SYSCALL_INT(___kdebug_trace_string, kdebug_trace_string, 3) + +#else +#error Unsupported architecture +#endif diff --git a/libsyscall/custom/__vfork.s b/libsyscall/custom/__vfork.s index 8f5cd224d..8449d25e4 100644 --- a/libsyscall/custom/__vfork.s +++ b/libsyscall/custom/__vfork.s @@ -112,7 +112,7 @@ LEAF(___vfork, 0) pushq %rdi // put return address back on stack for cerror movq __current_pid@GOTPCREL(%rip), %rcx lock - addq $1, (%rcx) + addl $1, (%rcx) movq %rax, %rdi BRANCH_EXTERN(_cerror) @@ -125,7 +125,7 @@ L1: L2: movq __current_pid@GOTPCREL(%rip), %rdx lock - addq $1, (%rdx) + addl $1, (%rdx) jmp *%rdi #else diff --git a/libsyscall/custom/custom.s b/libsyscall/custom/custom.s index b76b96fe7..56a95cf5b 100644 --- a/libsyscall/custom/custom.s +++ b/libsyscall/custom/custom.s @@ -1,5 +1,5 @@ /* - * Copyright (c) 1999-2011 Apple Inc. All rights reserved. + * Copyright (c) 1999-2015 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -81,6 +81,16 @@ _i386_set_ldt: jmp tramp_cerror 2: ret + ALIGN + .globl __thread_set_tsd_base +__thread_set_tsd_base: + pushl 4(%esp) + pushl $0 + movl $3,%eax + MACHDEP_SYSCALL_TRAP + addl $8,%esp + ret + #elif defined(__x86_64__) .globl _i386_get_ldt @@ -104,4 +114,16 @@ _i386_set_ldt: jmp _cerror 2: ret + ALIGN + .globl __thread_set_tsd_base +__thread_set_tsd_base: + movl $0, %esi // 0 as the second argument + movl $ SYSCALL_CONSTRUCT_MDEP(3), %eax // Machine-dependent syscall number 3 + MACHDEP_SYSCALL_TRAP + ret + +#else +#error unknown architecture #endif + +.subsections_via_symbols diff --git a/libsyscall/fixdups.ed b/libsyscall/fixdups.ed deleted file mode 100644 index 009f6d917..000000000 --- a/libsyscall/fixdups.ed +++ /dev/null @@ -1,2 +0,0 @@ -//;.+1,$g//d -w diff --git a/libsyscall/mach/host.c b/libsyscall/mach/host.c index c6587951f..a781a09ad 100644 --- a/libsyscall/mach/host.c +++ b/libsyscall/mach/host.c @@ -31,6 +31,7 @@ #include #include #include +#include kern_return_t host_get_atm_diagnostic_flag(host_t host __unused, @@ -56,3 +57,35 @@ host_check_multiuser_mode(host_t host __unused, (void)multiuser_mode; return KERN_NOT_SUPPORTED; } + +extern kern_return_t +_kernelrpc_host_create_mach_voucher(mach_port_name_t host, + mach_voucher_attr_raw_recipe_array_t recipes, + mach_voucher_attr_recipe_size_t recipesCnt, + mach_port_name_t *voucher); + +kern_return_t +host_create_mach_voucher(mach_port_name_t host, + mach_voucher_attr_raw_recipe_array_t recipes, + mach_voucher_attr_recipe_size_t recipesCnt, + mach_port_name_t *voucher) +{ + kern_return_t rv; + + rv = host_create_mach_voucher_trap(host, recipes, recipesCnt, voucher); + +#ifdef __x86_64__ + /* REMOVE once XBS kernel has new trap */ + if (rv == ((1 << 24) | 70)) /* see mach/i386/syscall_sw.h */ + rv = MACH_SEND_INVALID_DEST; +#elif defined(__i386__) + /* REMOVE once XBS kernel has new trap */ + if (rv == (kern_return_t)(-70)) + rv = 
MACH_SEND_INVALID_DEST; +#endif + + if (rv == MACH_SEND_INVALID_DEST) + rv = _kernelrpc_host_create_mach_voucher(host, recipes, recipesCnt, voucher); + + return rv; +} diff --git a/libsyscall/mach/mach_port.c b/libsyscall/mach/mach_port.c index 428ada4f2..e2cf670be 100644 --- a/libsyscall/mach/mach_port.c +++ b/libsyscall/mach/mach_port.c @@ -584,3 +584,36 @@ mach_port_unguard( } +extern kern_return_t +_kernelrpc_mach_voucher_extract_attr_recipe( + mach_port_name_t voucher, + mach_voucher_attr_key_t key, + mach_voucher_attr_raw_recipe_t recipe, + mach_msg_type_number_t *recipe_size); + +kern_return_t +mach_voucher_extract_attr_recipe( + mach_port_name_t voucher, + mach_voucher_attr_key_t key, + mach_voucher_attr_raw_recipe_t recipe, + mach_msg_type_number_t *recipe_size) +{ + kern_return_t rv; + + rv = mach_voucher_extract_attr_recipe_trap(voucher, key, recipe, recipe_size); + +#ifdef __x86_64__ + /* REMOVE once XBS kernel has new trap */ + if (rv == ((1 << 24) | 72)) /* see mach/i386/syscall_sw.h */ + rv = MACH_SEND_INVALID_DEST; +#elif defined(__i386__) + /* REMOVE once XBS kernel has new trap */ + if (rv == (kern_return_t)(-72)) + rv = MACH_SEND_INVALID_DEST; +#endif + + if (rv == MACH_SEND_INVALID_DEST) + rv = _kernelrpc_mach_voucher_extract_attr_recipe(voucher, key, recipe, recipe_size); + + return rv; +} diff --git a/libsyscall/mach/mach_vm.c b/libsyscall/mach/mach_vm.c index d31037398..7fbeae05f 100644 --- a/libsyscall/mach/mach_vm.c +++ b/libsyscall/mach/mach_vm.c @@ -317,3 +317,33 @@ vm_read( return (rv); } + +kern_return_t +mach_vm_purgable_control( + mach_port_name_t target, + mach_vm_offset_t address, + vm_purgable_t control, + int *state) +{ + kern_return_t rv; + + rv = _kernelrpc_mach_vm_purgable_control_trap(target, address, control, state); + + if (rv == MACH_SEND_INVALID_DEST) + rv = _kernelrpc_mach_vm_purgable_control(target, address, control, state); + + return (rv); +} + +kern_return_t +vm_purgable_control( + mach_port_name_t task, + vm_offset_t 
address, + vm_purgable_t control, + int *state) +{ + return mach_vm_purgable_control(task, + (mach_vm_offset_t) address, + control, + state); +} diff --git a/libsyscall/mach/mig_strncpy.c b/libsyscall/mach/mig_strncpy.c index ed17aaff2..3bc188adf 100644 --- a/libsyscall/mach/mig_strncpy.c +++ b/libsyscall/mach/mig_strncpy.c @@ -92,3 +92,59 @@ mig_strncpy( *dest = '\0'; return i; } + +/* + * mig_strncpy_zerofill -- Bounded string copy. Does what the + * library routine strncpy OUGHT to do: Copies the (null terminated) + * string in src into dest, a buffer of length len. Assures that + * the copy is still null terminated and doesn't overflow the buffer, + * truncating the copy if necessary. If the string in src is smaller + * than given length len, it will zero fill the remaining bytes in dest. + * + * Parameters: + * + * dest - Pointer to destination buffer. + * + * src - Pointer to source string. + * + * len - Length of destination buffer. + * + * Result: + * length of string copied, INCLUDING the trailing 0. + */ +int +mig_strncpy_zerofill( + char *dest, + const char *src, + int len) +{ + int i; + boolean_t terminated = FALSE; + int retval = 0; + + if (len <= 0 || dest == 0) { + return 0; + } + + if (src == 0) { + terminated = TRUE; + } + + for (i = 1; i < len; i++) { + if (!terminated) { + if (!(*dest++ = *src++)) { + retval = i; + terminated = TRUE; + } + } else { + *dest++ = '\0'; + } + } + + *dest = '\0'; + if (!terminated) { + retval = i; + } + + return retval; +} diff --git a/libsyscall/mach/panic.c b/libsyscall/mach/panic.c index dd7332742..7049b9561 100644 --- a/libsyscall/mach/panic.c +++ b/libsyscall/mach/panic.c @@ -77,7 +77,7 @@ panic(const char *s, ...) 
{ char buffer[1024]; int len = _mach_snprintf(buffer, sizeof(buffer), "panic: %s\n", s); - write(__STDERR_FILENO, buffer, len+1); + write(__STDERR_FILENO, buffer, len); #define RB_DEBUGGER 0x1000 /* enter debugger NOW */ (void) host_reboot(master_host_port, RB_DEBUGGER); diff --git a/libsyscall/mach/string.c b/libsyscall/mach/string.c index e1555b49d..8dca9927c 100644 --- a/libsyscall/mach/string.c +++ b/libsyscall/mach/string.c @@ -104,8 +104,9 @@ _mach_vsnprintf(char *buffer, int length, const char *fmt, va_list ap) } } } - *out_ptr = '\0'; - return max - length; + if (max > 0) + *out_ptr = '\0'; + return max - (length + 1); /* don't include the final NULL in the return value */ } int diff --git a/libsyscall/os/tsd.h b/libsyscall/os/tsd.h index 279f65d59..0e064b954 100644 --- a/libsyscall/os/tsd.h +++ b/libsyscall/os/tsd.h @@ -49,32 +49,62 @@ _os_cpu_number(void) return 0; } +#if defined(__i386__) || defined(__x86_64__) + +#if defined(__has_attribute) +#if __has_attribute(address_space) +#define OS_GS_RELATIVE __attribute__((address_space(256))) +#endif +#endif + +#ifdef OS_GS_RELATIVE +#define _os_tsd_get_base() ((void * OS_GS_RELATIVE *)0) +#else __attribute__((always_inline)) static __inline__ void* _os_tsd_get_direct(unsigned long slot) { void *ret; -#if defined(__i386__) || defined(__x86_64__) __asm__("mov %%gs:%1, %0" : "=r" (ret) : "m" (*(void **)(slot * sizeof(void *)))); -#endif - - return ret; } __attribute__((always_inline)) static __inline__ int -_os_tsd_set_direct(unsigned long slot, void* val) +_os_tsd_set_direct(unsigned long slot, void *val) { #if defined(__i386__) && defined(__PIC__) __asm__("movl %1, %%gs:%0" : "=m" (*(void **)(slot * sizeof(void *))) : "rn" (val)); #elif defined(__i386__) && !defined(__PIC__) __asm__("movl %1, %%gs:%0" : "=m" (*(void **)(slot * sizeof(void *))) : "ri" (val)); -#elif defined(__x86_64__) +#else __asm__("movq %1, %%gs:%0" : "=m" (*(void **)(slot * sizeof(void *))) : "rn" (val)); +#endif + return 0; +} #endif 
+#else +#error _os_tsd_get_base not implemented on this architecture +#endif + +#ifdef _os_tsd_get_base +__attribute__((always_inline)) +static __inline__ void* +_os_tsd_get_direct(unsigned long slot) +{ + return _os_tsd_get_base()[slot]; +} + +__attribute__((always_inline)) +static __inline__ int +_os_tsd_set_direct(unsigned long slot, void *val) +{ + _os_tsd_get_base()[slot] = val; return 0; } +#endif + +extern void _thread_set_tsd_base(void *tsd_base); #endif diff --git a/libsyscall/wrappers/__commpage_gettimeofday.c b/libsyscall/wrappers/__commpage_gettimeofday.c index 0bc34b4ab..4967a2f8d 100644 --- a/libsyscall/wrappers/__commpage_gettimeofday.c +++ b/libsyscall/wrappers/__commpage_gettimeofday.c @@ -21,3 +21,59 @@ * @APPLE_LICENSE_HEADER_END@ */ +#include +#include +#include + +int __commpage_gettimeofday(struct timeval *); + +__attribute__((visibility("hidden"))) +int __commpage_gettimeofday_internal(struct timeval *tp, uint64_t *tbr_out); + +#if defined(__x86_64__) || defined(__i386__) + +// XXX: must be kept in sync with __commpage_gettimeofday.s +int +__commpage_gettimeofday_internal(struct timeval *tp, uint64_t *tbr_out) +{ + volatile uint32_t *gtod_generation_p = _COMM_PAGE_GTOD_GENERATION; + volatile uint64_t *gtod_sec_base_p = _COMM_PAGE_GTOD_SEC_BASE; + volatile uint64_t *gtod_ns_base_p = _COMM_PAGE_GTOD_NS_BASE; + + uint64_t tbr, gen, tod_secs, tod_nsecs, elapsed; + while(1) { + gen = *gtod_generation_p; + tbr = mach_absolute_time(); + tod_secs = *gtod_sec_base_p; + tod_nsecs = *gtod_ns_base_p; + uint64_t gen2 = *gtod_generation_p; + if(__builtin_expect(gen, gen2) == gen2) + break; + } + if (gen == 0) return KERN_FAILURE; + elapsed = tbr - tod_nsecs; + + unsigned long secs; + uint32_t nsec; +#if defined(__x86_64__) + secs = elapsed/NSEC_PER_SEC; + nsec = elapsed % NSEC_PER_SEC; +#elif defined(__i386__) + uint32_t secs1, secs2; + secs1 = elapsed >> 32; + secs2 = elapsed; + __asm__ ( + "divl %4" + : "=a" (secs), "=d" (nsec) + : "0" (secs2), "1" 
(secs1), "rm" (NSEC_PER_SEC) + ); +#endif /* __i386 or __x86_64__ */ + tp->tv_sec = tod_secs + secs; + tp->tv_usec = nsec / NSEC_PER_USEC; + + if (tbr_out) *tbr_out = tbr; + + return KERN_SUCCESS; +} + +#endif diff --git a/libsyscall/wrappers/cancelable/fcntl-base.c b/libsyscall/wrappers/cancelable/fcntl-base.c index e421e0af4..bf6395112 100644 --- a/libsyscall/wrappers/cancelable/fcntl-base.c +++ b/libsyscall/wrappers/cancelable/fcntl-base.c @@ -66,6 +66,7 @@ fcntl(int fd, int cmd, ...) case F_ADDFILESIGS_RETURN: case F_FINDSIGS: case F_TRANSCODEKEY: + case F_CHECK_LV: arg = va_arg(ap, void *); break; default: diff --git a/libsyscall/wrappers/clonefile.c b/libsyscall/wrappers/clonefile.c new file mode 100644 index 000000000..33b6beabd --- /dev/null +++ b/libsyscall/wrappers/clonefile.c @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2016 Apple Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_LICENSE_HEADER_END@ + */ + +#include +#include + +int +clonefile(const char *old, const char *new, uint32_t flags) +{ + return (clonefileat(AT_FDCWD, old, AT_FDCWD, new, flags)); +} diff --git a/libsyscall/wrappers/fs_snapshot.c b/libsyscall/wrappers/fs_snapshot.c new file mode 100644 index 000000000..10aed0453 --- /dev/null +++ b/libsyscall/wrappers/fs_snapshot.c @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2016 Apple Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include + +extern int __fs_snapshot(uint32_t, int, const char *, const char *, void *, + uint32_t); + +int +fs_snapshot_create(int dirfd, const char *name, uint32_t flags) +{ + return __fs_snapshot(SNAPSHOT_OP_CREATE, dirfd, name, NULL, NULL, flags); +} + +int +fs_snapshot_list(int dirfd, struct attrlist *alist, void *attrbuf, size_t bufsize, + uint32_t flags) +{ + if (flags != 0) { + errno = EINVAL; + return (-1); + } + + return (getattrlistbulk(dirfd, alist, attrbuf, bufsize, + FSOPT_LIST_SNAPSHOT)); +} + +int +fs_snapshot_delete(int dirfd, const char *name, uint32_t flags) +{ + return __fs_snapshot(SNAPSHOT_OP_DELETE, dirfd, name, NULL, NULL, flags); +} + +int +fs_snapshot_rename(int dirfd, const char *old, const char *new, uint32_t flags) +{ + return __fs_snapshot(SNAPSHOT_OP_RENAME, dirfd, old, new, NULL, flags); +} + +int +fs_snapshot_revert(int dirfd, const char *name, uint32_t flags) +{ + return __fs_snapshot(SNAPSHOT_OP_REVERT, dirfd, name, NULL, NULL, flags); +} + +/* + * XXX Temporary hack to do what mount_apfs(8) does. This will be removed and + * replaced with a VFS_IOCTL based implementation in the kernel. 
+ */ +#include +#include "strings.h" + +#ifndef SNAPSHOT_OP_MOUNT +#define SNAPSHOT_OP_MOUNT 0x4 +#endif + +#define FS_MOUNT_SNAPSHOT 2 +#define MAX_SNAPSHOT_NAMELEN 256 + +struct fs_mount_options { + uint32_t fs_flags; + uint8_t _padding_[2]; +}; + +struct fs_mount_args { + char *specdev; + struct fs_mount_options options; + uint16_t mode; + uint16_t _padding_[3]; + union { + struct { // FS_MOUNT_SNAPSHOT + dev_t snap_fsys; + char snap_name[MAX_SNAPSHOT_NAMELEN]; + }; + struct { // APFS_MOUNT_FOR_CONVERSION + }; + }; +}; + +int +fs_snapshot_mount(int dirfd, const char *dir, const char *snapshot, + uint32_t flags) +{ + struct stat st; + struct fs_mount_args mnt_args; + + mnt_args.specdev = NULL; + mnt_args.mode = FS_MOUNT_SNAPSHOT; + if (fstat(dirfd, &st) == -1) + return (-1); + + mnt_args.snap_fsys = st.st_dev; + strlcpy(mnt_args.snap_name, snapshot, sizeof(mnt_args.snap_name)); + return (__fs_snapshot(SNAPSHOT_OP_MOUNT, dirfd, snapshot, dir, + (void *)&mnt_args, flags)); +} diff --git a/libsyscall/wrappers/kdebug_trace.c b/libsyscall/wrappers/kdebug_trace.c index 02f074cab..8234f4582 100644 --- a/libsyscall/wrappers/kdebug_trace.c +++ b/libsyscall/wrappers/kdebug_trace.c @@ -22,73 +22,115 @@ */ #include +#include #include +#include #include #include +#include #include +#include +#include +#include +#include +extern int __kdebug_typefilter(void** addr, size_t* size); extern int __kdebug_trace64(uint32_t code, uint64_t arg1, uint64_t arg2, - uint64_t arg3, uint64_t arg4); + uint64_t arg3, uint64_t arg4); extern uint64_t __kdebug_trace_string(uint32_t debugid, uint64_t str_id, - const char *str); + const char *str); -/* Returns non-zero if tracing is enabled. 
*/ -static int -kdebug_enabled(void) -{ - volatile uint32_t *kdebug_enable_address = - (volatile uint32_t *)(uintptr_t)(_COMM_PAGE_KDEBUG_ENABLE); +static int kdebug_signpost_internal(uint32_t debugid, uintptr_t arg1, + uintptr_t arg2, uintptr_t arg3, uintptr_t arg4); - if (*kdebug_enable_address == 0) { - return 0; +/* + * GENERAL API DESIGN NOTE! + * + * Trace APIs are expected to avoid performing checks until tracing has + * been enabled. This includes checks that might cause error codes to be + * returned. + * + * Trace invocations via wrapper and syscall must have the same behavior. + * + * Note that the userspace API is choosing to optimize fastpath, non-error + * performance by eliding validation of each debugid. This means that error + * cases which could have been caught in userspace will make a syscall + * before returning with the correct error code. This tradeoff in performance + * is intentional. + */ + +void * +kdebug_typefilter(void) +{ + static void* typefilter; + + /* We expect kdebug_typefilter_bitmap to be valid (the if is not executed) */ + if (__builtin_expect(!typefilter, 0)) { + // Map the typefilter if it can be mapped. + void* ptr = NULL; + size_t ptr_size = 0; + + if (__kdebug_typefilter(&ptr, &ptr_size) == 0) { + void* old_value = NULL; + if (ptr && !atomic_compare_exchange_strong((void* _Atomic volatile *)&typefilter, &old_value, ptr)) { + mach_vm_deallocate(mach_task_self(), (mach_vm_offset_t)ptr, KDBG_TYPEFILTER_BITMAP_SIZE); + } + } } - return 1; + return typefilter; } -static int -kdebug_validate_debugid(uint32_t debugid) +bool +kdebug_is_enabled(uint32_t debugid) { - uint8_t debugid_class; - - /* - * This filtering is also done in the kernel, but we also do it here so - * that errors are returned in all cases, not just when the system call - * is actually performed. 
- */ - debugid_class = KDBG_EXTRACT_CLASS(debugid); - switch (debugid_class) { - case DBG_TRACE: - return EPERM; + uint32_t state = *((volatile uint32_t *)(uintptr_t)(_COMM_PAGE_KDEBUG_ENABLE)); + + if (state == 0) { + return FALSE; + } + + if ((state & KDEBUG_COMMPAGE_ENABLE_TYPEFILTER) > 0) { + /* + * Typefilter rules... + * + * If no typefilter is available (even if due to error), + * debugids are allowed. + * + * The typefilter will always allow DBG_TRACE; this is a kernel + * invariant. There is no need for an explicit check here. + * + * NOTE: The typefilter will always allow DBG_TRACE, but + * it is not legal to inject DBG_TRACE via kdebug_trace. + * Attempts to do so will not be detected here, but will be + * detected in the kernel, and an error will be returned. Per + * the API design note at the top of this file, this is a + * deliberate choice. + */ + uint8_t* typefilter = kdebug_typefilter(); + if (typefilter && isset(typefilter, KDBG_EXTRACT_CSC(debugid)) == 0) { + return FALSE; + } } - return 0; + return TRUE; } int kdebug_trace(uint32_t debugid, uint64_t arg1, uint64_t arg2, uint64_t arg3, uint64_t arg4) { - int err; - - if (!kdebug_enabled()) { + if (!kdebug_is_enabled(debugid)) { return 0; } - if ((err = kdebug_validate_debugid(debugid)) != 0) { - errno = err; - return -1; - } - return __kdebug_trace64(debugid, arg1, arg2, arg3, arg4); } uint64_t kdebug_trace_string(uint32_t debugid, uint64_t str_id, const char *str) { - int err; - - if (!kdebug_enabled()) { + if (!kdebug_is_enabled(debugid)) { return 0; } @@ -102,10 +144,36 @@ kdebug_trace_string(uint32_t debugid, uint64_t str_id, const char *str) return (uint64_t)-1; } - if ((err = kdebug_validate_debugid(debugid)) != 0) { - errno = err; - return (uint64_t)-1; + return __kdebug_trace_string(debugid, str_id, str); +} + +static int +kdebug_signpost_internal(uint32_t debugid, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3, uintptr_t arg4) +{ + if (KDBG_EXTRACT_CSC(debugid) != 0) { + errno = EINVAL; 
+ return -1; } - return __kdebug_trace_string(debugid, str_id, str); + debugid |= APPSDBG_CODE(DBG_APP_SIGNPOST, 0); + + return kdebug_trace(debugid, arg1, arg2, arg3, arg4); +} + +int +kdebug_signpost(uint32_t code, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3, uintptr_t arg4) +{ + return kdebug_signpost_internal(code << KDBG_CODE_OFFSET, arg1, arg2, arg3, arg4); +} + +int +kdebug_signpost_start(uint32_t code, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3, uintptr_t arg4) +{ + return kdebug_signpost_internal((code << KDBG_CODE_OFFSET) | DBG_FUNC_START, arg1, arg2, arg3, arg4); +} + +int +kdebug_signpost_end(uint32_t code, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3, uintptr_t arg4) +{ + return kdebug_signpost_internal((code << KDBG_CODE_OFFSET) | DBG_FUNC_END, arg1, arg2, arg3, arg4); } diff --git a/libsyscall/wrappers/libproc/libproc.c b/libsyscall/wrappers/libproc/libproc.c index 730a15e41..e46dbe0c5 100644 --- a/libsyscall/wrappers/libproc/libproc.c +++ b/libsyscall/wrappers/libproc/libproc.c @@ -362,11 +362,21 @@ proc_terminate(pid_t pid, int *sig) return 0; } +/* + * XXX the _fatal() variant both checks for an existing monitor + * (with important policy effects on first party background apps) + * and validates inputs. + */ int proc_set_cpumon_params(pid_t pid, int percentage, int interval) { proc_policy_cpuusage_attr_t attr; + /* no argument validation ... + * task_set_cpuusage() ignores 0 values and squashes negative + * values into uint32_t. 
+ */ + + attr.ppattr_cpu_attr = PROC_POLICY_RSRCACT_NOTIFY_EXC; attr.ppattr_cpu_percentage = percentage; attr.ppattr_cpu_attr_interval = (uint64_t)interval; @@ -410,6 +420,16 @@ proc_set_cpumon_defaults(pid_t pid) PROC_POLICY_RUSAGE_CPU, (proc_policy_attribute_t*)&attr, pid, 0)); } +int +proc_resume_cpumon(pid_t pid) +{ + return __process_policy(PROC_POLICY_SCOPE_PROCESS, + PROC_POLICY_ACTION_ENABLE, + PROC_POLICY_RESOURCE_USAGE, + PROC_POLICY_RUSAGE_CPU, + NULL, pid, 0); +} + int proc_disable_cpumon(pid_t pid) { @@ -449,6 +469,10 @@ proc_set_cpumon_params_fatal(pid_t pid, int percentage, int interval) * already active. If either the percentage or the * interval is nonzero, then CPU monitoring is * already in use for this process. + * + * XXX: need set...() and set..fatal() to behave similarly. + * Currently, this check prevents 1st party apps (which get a + * default non-fatal monitor) from getting a fatal monitor. */ (void)proc_get_cpumon_params(pid, &current_percentage, &current_interval); if (current_percentage || current_interval) @@ -544,12 +568,18 @@ proc_list_uptrs(int pid, uint64_t *buf, uint32_t bufsz) return -1; } - struct proc_fdinfo fdlist[OPEN_MAX]; - nfds = proc_pidinfo(pid, PROC_PIDLISTFDS, 0, fdlist, OPEN_MAX*sizeof(struct proc_fdinfo)); - if (nfds <= 0 || nfds > OPEN_MAX) { + /* get the list of FDs for this process */ + struct proc_fdinfo fdlist[OPEN_MAX+1]; + nfds = proc_pidinfo(pid, PROC_PIDLISTFDS, 0, &fdlist[1], OPEN_MAX*sizeof(struct proc_fdinfo)); + if (nfds < 0 || nfds > OPEN_MAX) { return -1; } + /* Add FD -1, the implicit workq kqueue */ + fdlist[0].proc_fd = -1; + fdlist[0].proc_fdtype = PROX_FDTYPE_KQUEUE; + nfds++; + struct kevent_extinfo *kqext = malloc(knote_max * sizeof(struct kevent_extinfo)); if (!kqext) { errno = ENOMEM; diff --git a/libsyscall/wrappers/libproc/libproc_internal.h b/libsyscall/wrappers/libproc/libproc_internal.h index 182cf886f..d8fc8f1f9 100644 --- a/libsyscall/wrappers/libproc/libproc_internal.h +++ 
b/libsyscall/wrappers/libproc/libproc_internal.h @@ -27,6 +27,7 @@ #include #include +#include __BEGIN_DECLS @@ -77,18 +78,21 @@ int proc_denap_assertion_begin_with_msg(mach_msg_header_t *msg, /* drop a de-nap assertion */ int proc_denap_assertion_complete(uint64_t assertion_handle); +/* ongoing percent-over-time CPU monitor */ +int proc_set_cpumon_defaults(pid_t pid) __OSX_AVAILABLE_STARTING(__MAC_10_8, __IPHONE_6_0); int proc_set_cpumon_params(pid_t pid, int percentage, int interval) __OSX_AVAILABLE_STARTING(__MAC_10_8, __IPHONE_6_0); +int proc_set_cpumon_params_fatal(pid_t pid, int percentage, int interval) __OSX_AVAILABLE_STARTING(__MAC_10_10, __IPHONE_8_0); + int proc_get_cpumon_params(pid_t pid, int *percentage, int *interval) __OSX_AVAILABLE_STARTING(__MAC_10_8, __IPHONE_6_0); -int proc_set_cpumon_defaults(pid_t pid) __OSX_AVAILABLE_STARTING(__MAC_10_8, __IPHONE_6_0); +int proc_resume_cpumon(pid_t pid) __OSX_AVAILABLE_STARTING(__MAC_10_12, __IPHONE_10_0); int proc_disable_cpumon(pid_t pid) __OSX_AVAILABLE_STARTING(__MAC_10_8, __IPHONE_6_0); +/* ongoing wakes/second monitor */ +int proc_set_wakemon_defaults(pid_t pid) __OSX_AVAILABLE_STARTING(__MAC_10_9, __IPHONE_7_0); int proc_set_wakemon_params(pid_t pid, int rate_hz, int flags) __OSX_AVAILABLE_STARTING(__MAC_10_9, __IPHONE_7_0); int proc_get_wakemon_params(pid_t pid, int *rate_hz, int *flags) __OSX_AVAILABLE_STARTING(__MAC_10_9, __IPHONE_7_0); -int proc_set_wakemon_defaults(pid_t pid) __OSX_AVAILABLE_STARTING(__MAC_10_9, __IPHONE_7_0); int proc_disable_wakemon(pid_t pid) __OSX_AVAILABLE_STARTING(__MAC_10_9, __IPHONE_7_0); -int proc_set_cpumon_params_fatal(pid_t pid, int percentage, int interval) __OSX_AVAILABLE_STARTING(__MAC_10_10, __IPHONE_8_0); - /* request trace buffer collection */ int proc_trace_log(pid_t pid, uint64_t uniqueid) __OSX_AVAILABLE_STARTING(__MAC_10_10, __IPHONE_8_0); diff --git a/libsyscall/wrappers/mach_boottime.c b/libsyscall/wrappers/mach_boottime.c new file mode 100644 index 
000000000..23b22f93b --- /dev/null +++ b/libsyscall/wrappers/mach_boottime.c @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2016 Apple Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ +#include + +uint64_t +mach_boottime_usec(void) +{ + return *(uint64_t*)_COMM_PAGE_BOOTTIME_USEC; +} diff --git a/libsyscall/wrappers/mach_continuous_time.c b/libsyscall/wrappers/mach_continuous_time.c new file mode 100644 index 000000000..4f20664f8 --- /dev/null +++ b/libsyscall/wrappers/mach_continuous_time.c @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2015 Apple Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ +#include +#include +#include +#include +#include + +__attribute__((visibility("hidden"))) +uint64_t +_mach_continuous_time_base(void) +{ +#if !defined(__x86_64__) && !defined(__arm64__) + // Deal with the lack of 64-bit loads on arm32 (see mach_approximate_time.s) + while(1) { + volatile uint64_t *base_ptr = (volatile uint64_t*)_COMM_PAGE_CONT_TIMEBASE; + uint64_t read1, read2; + read1 = *base_ptr; +#if defined(__i386__) + __asm__ volatile("lfence" ::: "memory"); +#else +#error "unsupported arch" +#endif + read2 = *base_ptr; + + if(__builtin_expect((read1 == read2), 1)) + return read1; + } +#else // 64-bit + return *(volatile uint64_t*)_COMM_PAGE_CONT_TIMEBASE; +#endif // 64-bit +} + + +__attribute__((visibility("hidden"))) +kern_return_t +_mach_continuous_time(uint64_t* absolute_time, uint64_t* cont_time) +{ + volatile uint64_t *base_ptr = (volatile uint64_t*)_COMM_PAGE_CONT_TIMEBASE; + volatile uint64_t read1, read2; + volatile uint64_t absolute; + + do { + read1 = *base_ptr; + absolute = mach_absolute_time(); + read2 = *base_ptr; + } while (__builtin_expect((read1 != read2), 0)); + + if (absolute_time) *absolute_time = absolute; + if (cont_time) *cont_time = absolute + read1; + + return KERN_SUCCESS; +} + +uint64_t +mach_continuous_time(void) +{ + uint64_t cont_time; + _mach_continuous_time(NULL, &cont_time); + return cont_time; +} + +uint64_t +mach_continuous_approximate_time(void) +{ + /* + * No retry loop here because if we use a slightly too old 
timebase that's + * okay, we are approximate time anyway. + */ + volatile register uint64_t time_base = _mach_continuous_time_base(); + return time_base + mach_approximate_time(); +} diff --git a/libsyscall/wrappers/mach_get_times.c b/libsyscall/wrappers/mach_get_times.c new file mode 100644 index 000000000..37ddfa9fd --- /dev/null +++ b/libsyscall/wrappers/mach_get_times.c @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2015 Apple Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_LICENSE_HEADER_END@ + */ +#include +#include +#include +#include +#include +#include + +// From __commpage_gettimeofday.c +extern int __commpage_gettimeofday_internal(struct timeval *tp, uint64_t *tbr_out); +extern kern_return_t _mach_continuous_time(uint64_t* absolute_time, uint64_t* cont_time); +// From mach_continuous_time.c +extern uint64_t _mach_continuous_time_base(void); +// Underlying syscall stub +extern int __gettimeofday_with_mach(struct timeval *, struct timezone *, uint64_t *); + +kern_return_t +mach_get_times(uint64_t* absolute_time, uint64_t* cont_time, struct timespec *tp) { + if (tp == NULL) { + return _mach_continuous_time(absolute_time, cont_time); + } + + uint64_t continuous_time_base_prior = -1, continuous_time_base_post = -1; + uint64_t tbr; + struct timeval tv; + + do { + /* + * We need to capture the result of gettimeofday without our continuous + * time base changing. Once we have that, and the value for absolute + * time that was used to compute the timespec, we can just add the base + * to get the accompanying continuous time. + */ + continuous_time_base_prior = _mach_continuous_time_base(); + + /* + * This call has the necessary memory barriers for this retry loop, + * since it is implemented with a retry loop of its own. + */ + if (__commpage_gettimeofday_internal(&tv, &tbr)) { + tbr = 0; + if (__gettimeofday_with_mach(&tv, NULL, &tbr) < 0) { + return KERN_FAILURE; + } else if (tbr == 0) { + // On an old kernel, likely chroot'ed. 
(remove next year) + tbr = mach_absolute_time(); + } + } + + continuous_time_base_post = _mach_continuous_time_base(); + } while (__builtin_expect(continuous_time_base_prior != continuous_time_base_post, 0)); + + if (absolute_time) *absolute_time = tbr; + if (cont_time) *cont_time = continuous_time_base_prior + tbr; + tp->tv_sec = tv.tv_sec; + tp->tv_nsec = tv.tv_usec * NSEC_PER_USEC; + + return KERN_SUCCESS; +} diff --git a/libsyscall/wrappers/mach_timebase_info.c b/libsyscall/wrappers/mach_timebase_info.c new file mode 100644 index 000000000..80cd559ad --- /dev/null +++ b/libsyscall/wrappers/mach_timebase_info.c @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2016 Apple Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ +#include + +extern kern_return_t +mach_timebase_info_trap(mach_timebase_info_t info); + +kern_return_t +mach_timebase_info(mach_timebase_info_t info){ + static mach_timebase_info_data_t cached_info; + + /* + * This is racy, but because it is safe to initialize twice we avoid a + * barrier in the fast path by risking double initialization. 
+ */ + if (cached_info.numer == 0 || cached_info.denom == 0){ + kern_return_t kr = mach_timebase_info_trap(&cached_info); + if (kr != KERN_SUCCESS) return kr; + } + + info->numer = cached_info.numer; + info->denom = cached_info.denom; + + return KERN_SUCCESS; +} diff --git a/libsyscall/wrappers/quota_obsolete.c b/libsyscall/wrappers/quota_obsolete.c new file mode 100644 index 000000000..435339b2c --- /dev/null +++ b/libsyscall/wrappers/quota_obsolete.c @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2016 Apple Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#include +#include +#include + +/* + * system call stubs are no longer generated for these from + * syscalls.master. Instead, provide simple stubs here. 
+ */ + +extern int quota(void); +extern int setquota(void); + +int quota(void) +{ + return kill(getpid(), SIGSYS); +} + +int setquota(void) +{ + return kill(getpid(), SIGSYS); +} diff --git a/libsyscall/wrappers/rename_ext.c b/libsyscall/wrappers/renamex.c similarity index 69% rename from libsyscall/wrappers/rename_ext.c rename to libsyscall/wrappers/renamex.c index 8c7762139..8bdfdcd0d 100644 --- a/libsyscall/wrappers/rename_ext.c +++ b/libsyscall/wrappers/renamex.c @@ -2,14 +2,14 @@ * Copyright (c) 2014 Apple Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in * compliance with the License. Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this * file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -17,29 +17,32 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_LICENSE_HEADER_END@ */ - #include - -/* - * XXXXX REMOVE AND REPLACE the definition of VFS_RENAME_FLAGS_MASK below with - * appropriate header file if/when defined in a header file. 
- */ -#define VFS_RENAME_FLAGS_MASK 0x00000001 +#include +#include void __inc_remove_counter(void); -int __rename_ext(const char *old, const char *new, int flags); +int __renameatx_np(int oldfd, const char *old, int newfd, const char *new, unsigned int flags); int -rename_ext(const char *old, const char *new, unsigned int flags) +renameatx_np(int oldfd, const char *old, int newfd, const char *new, unsigned int flags) { - if (!(flags & VFS_RENAME_FLAGS_MASK)) { - errno = EINVAL; - return -1; - } - - int res = __rename_ext(old, new, flags & VFS_RENAME_FLAGS_MASK); + int res = __renameatx_np(oldfd, old, newfd, new, flags); if (res == 0) __inc_remove_counter(); return res; } + +int +renamex_np(const char *old, const char *new, unsigned int flags) +{ + return renameatx_np(AT_FDCWD, old, AT_FDCWD, new, flags); +} + +// Deprecated +int +rename_ext(const char *old, const char *new, unsigned int flags) +{ + return renamex_np(old, new, flags); +} diff --git a/libsyscall/wrappers/spawn/posix_spawn.c b/libsyscall/wrappers/spawn/posix_spawn.c index 4c36d0e90..be3e94cea 100644 --- a/libsyscall/wrappers/spawn/posix_spawn.c +++ b/libsyscall/wrappers/spawn/posix_spawn.c @@ -39,7 +39,7 @@ #include #include #include /* for COALITION_TYPE_MAX */ - +#include /* * posix_spawnattr_init @@ -1332,6 +1332,43 @@ posix_spawnattr_getcpumonitor(posix_spawnattr_t * __restrict attr, } +/* + * posix_spawnattr_setjetsam_ext + * + * Description: Set jetsam attributes for the spawn attribute object + * referred to by 'attr'. + * + * Parameters: flags The flags value to set + * priority Relative jetsam priority + * memlimit_active Value in megabytes; memory footprint + * above this level while process is + * active may result in termination. + * memlimit_inactive Value in megabytes; memory footprint + * above this level while process is + * inactive may result in termination. 
+ * + * Returns: 0 Success + */ +int +posix_spawnattr_setjetsam_ext(posix_spawnattr_t * __restrict attr, + short flags, int priority, int memlimit_active, int memlimit_inactive) +{ + _posix_spawnattr_t psattr; + + if (attr == NULL || *attr == NULL) + return EINVAL; + + psattr = *(_posix_spawnattr_t *)attr; + + psattr->psa_jetsam_flags = flags; + psattr->psa_jetsam_flags |= POSIX_SPAWN_JETSAM_SET; + psattr->psa_priority = priority; + psattr->psa_memlimit_active = memlimit_active; + psattr->psa_memlimit_inactive = memlimit_inactive; + + return (0); +} + /* * posix_spawnattr_set_importancewatch_port_np diff --git a/libsyscall/wrappers/spawn/spawn_private.h b/libsyscall/wrappers/spawn/spawn_private.h index 3513946a4..ec7f50fb6 100644 --- a/libsyscall/wrappers/spawn/spawn_private.h +++ b/libsyscall/wrappers/spawn/spawn_private.h @@ -41,6 +41,9 @@ int posix_spawnattr_getcpumonitor(posix_spawnattr_t * __restrict, uint64_t *, ui int posix_spawnattr_setcpumonitor_default(posix_spawnattr_t * __restrict) __OSX_AVAILABLE_STARTING(__MAC_10_9, __IPHONE_6_0); +int posix_spawnattr_setjetsam_ext(posix_spawnattr_t * __restrict attr, + short flags, int priority, int memlimit_active, int memlimit_inactive) __OSX_AVAILABLE_STARTING(__MAC_10_11, __IPHONE_9_0); + #define POSIX_SPAWN_IMPORTANCE_PORT_COUNT 32 int posix_spawnattr_set_importancewatch_port_np(posix_spawnattr_t * __restrict attr, int count, mach_port_t portarray[]) __OSX_AVAILABLE_STARTING(__MAC_10_9, __IPHONE_6_0); diff --git a/libsyscall/wrappers/stackshot.c b/libsyscall/wrappers/stackshot.c index c5633120b..0e065edd2 100644 --- a/libsyscall/wrappers/stackshot.c +++ b/libsyscall/wrappers/stackshot.c @@ -50,7 +50,7 @@ stackshot_config_create(void) s_config->sc_pid = -1; s_config->sc_flags = 0; - s_config->sc_since_timestamp = 0; + s_config->sc_delta_timestamp = 0; s_config->sc_buffer = 0; s_config->sc_size = 0; @@ -213,6 +213,30 @@ stackshot_config_set_size_hint(stackshot_config_t *stackshot_config, uint32_t su return 0; } 
+/* + * stackshot_config_set_delta_timestamp: set the timestamp to use as the basis for the delta stackshot + * + * This timestamp will be used along with STACKSHOT_COLLECT_DELTA_SNAPSHOT flag to collect delta stackshots + * + * Inputs: stackshot_config - a pointer to a stackshot_config_t + * delta_timestamp - timestamp in MachAbsoluteTime units to be used as the basis for a delta stackshot + * + * Outputs: -1 if the passed stackshot config is NULL or there is existing stackshot buffer set. + * 0 on success + */ +int +stackshot_config_set_delta_timestamp(stackshot_config_t *stackshot_config, uint64_t delta_timestamp) +{ + if (stackshot_config == NULL || (void *)stackshot_config->sc_buffer != NULL) { + return -1; + } + + stackshot_config->sc_delta_timestamp = delta_timestamp; + + return 0; +} + + /* * stackshot_config_dealloc_buffer: dealloc the stackshot buffer and reset the size so that a * stackshot_config_t can be used again diff --git a/libsyscall/wrappers/terminate_with_reason.c b/libsyscall/wrappers/terminate_with_reason.c new file mode 100644 index 000000000..05fdb7848 --- /dev/null +++ b/libsyscall/wrappers/terminate_with_reason.c @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2016 Apple Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ +#include +#include +#include +#include +#include + +/* System call entry points */ +int __terminate_with_payload(int pid, uint32_t reason_namespace, uint64_t reason_code, + void *payload, uint32_t payload_size, const char *reason_string, + uint64_t reason_flags); + +void __abort_with_payload(uint32_t reason_namespace, uint64_t reason_code, + void *payload, uint32_t payload_size, const char *reason_string, + uint64_t reason_flags); + +static void abort_with_payload_wrapper_internal(uint32_t reason_namespace, uint64_t reason_code, + void *payload, uint32_t payload_size, const char *reason_string, + uint64_t reason_flags) __attribute__((noreturn)); + +/* System call wrappers */ +int +terminate_with_reason(int pid, uint32_t reason_namespace, uint64_t reason_code, + const char *reason_string, uint64_t reason_flags) +{ + return __terminate_with_payload(pid, reason_namespace, reason_code, 0, 0, + reason_string, reason_flags); +} + +int +terminate_with_payload(int pid, uint32_t reason_namespace, uint64_t reason_code, + void *payload, uint32_t payload_size, + const char *reason_string, uint64_t reason_flags) +{ + return __terminate_with_payload(pid, reason_namespace, reason_code, payload, + payload_size, reason_string, reason_flags); +} + +static void abort_with_payload_wrapper_internal(uint32_t reason_namespace, uint64_t reason_code, + void *payload, uint32_t payload_size, const char *reason_string, + uint64_t reason_flags) +{ + sigset_t unmask_signal; + + /* Try to 
unmask SIGABRT before trapping to the kernel */ + sigemptyset(&unmask_signal); + sigaddset(&unmask_signal, SIGABRT); + sigprocmask(SIG_UNBLOCK, &unmask_signal, NULL); + + __abort_with_payload(reason_namespace, reason_code, payload, payload_size, + reason_string, reason_flags); + + /* If sending a SIGABRT failed, we try to fall back to SIGKILL */ + terminate_with_payload(getpid(), reason_namespace, reason_code, payload, payload_size, + reason_string, reason_flags); + + /* Last resort, let's use SIGTRAP (SIGILL on i386) */ + sigemptyset(&unmask_signal); + sigaddset(&unmask_signal, SIGTRAP); + sigaddset(&unmask_signal, SIGILL); + sigprocmask(SIG_UNBLOCK, &unmask_signal, NULL); + + __builtin_trap(); +} + +void +abort_with_reason(uint32_t reason_namespace, uint64_t reason_code, const char *reason_string, + uint64_t reason_flags) +{ + abort_with_payload_wrapper_internal(reason_namespace, reason_code, 0, 0, reason_string, reason_flags); +} + +void +abort_with_payload(uint32_t reason_namespace, uint64_t reason_code, void *payload, + uint32_t payload_size, const char *reason_string, + uint64_t reason_flags) +{ + abort_with_payload_wrapper_internal(reason_namespace, reason_code, payload, payload_size, + reason_string, reason_flags); +} + diff --git a/libsyscall/xcodescripts/create-syscalls.pl b/libsyscall/xcodescripts/create-syscalls.pl index 9c587b536..54514f454 100755 --- a/libsyscall/xcodescripts/create-syscalls.pl +++ b/libsyscall/xcodescripts/create-syscalls.pl @@ -97,26 +97,6 @@ # Moving towards storing all data in this hash, then we always know # if data is aliased or not, or promoted or not. 
my %Symbols = ( - "quota" => { - c_sym => "quota", - syscall => "quota", - asm_sym => "_quota", - is_private => undef, - is_custom => undef, - nargs => 4, - bytes => 0, - aliases => {}, - }, - "setquota" => { - c_sym => "setquota", - syscall => "setquota", - asm_sym => "_setquota", - is_private => undef, - is_custom => undef, - nargs => 2, - bytes => 0, - aliases => {}, - }, "syscall" => { c_sym => "syscall", syscall => "syscall", @@ -141,7 +121,7 @@ link linkat lseek lstat msgrcv msgsnd msync open openat - pathconf peeloff poll posix_spawn pread pwrite + pathconf peeloff poll posix_spawn pread pselect pwrite read readv recvfrom recvmsg rename renameat rename_ext __semwait_signal __sigwait diff --git a/libsyscall/xcodescripts/mach_install_mig.sh b/libsyscall/xcodescripts/mach_install_mig.sh index f90bab981..174fa8c3c 100755 --- a/libsyscall/xcodescripts/mach_install_mig.sh +++ b/libsyscall/xcodescripts/mach_install_mig.sh @@ -57,6 +57,11 @@ MIG_INTERNAL_HEADER_DST="$BUILT_PRODUCTS_DIR/internal_hdr/include/mach" MIG_PRIVATE_DEFS_INCFLAGS="-I${SDKROOT}/System/Library/Frameworks/System.framework/PrivateHeaders" FILTER_MIG="$SRCROOT/xcodescripts/filter_mig.awk" +ASROOT="" +if [ `whoami` = "root" ]; then + ASROOT="-o 0" +fi + MIGS="clock.defs clock_priv.defs clock_reply.defs @@ -109,13 +114,13 @@ MIG_FILTERS="watchos_prohibited_mig.txt tvos_prohibited_mig.txt" # install /usr/include/server headers mkdir -p $SERVER_HEADER_DST for hdr in $SERVER_HDRS; do - install -o 0 -c -m 444 $SRC/servers/$hdr $SERVER_HEADER_DST + install $ASROOT -c -m 444 $SRC/servers/$hdr $SERVER_HEADER_DST done # install /usr/include/mach headers mkdir -p $MACH_HEADER_DST for hdr in $MACH_HDRS; do - install -o 0 -c -m 444 $SRC/mach/$hdr $MACH_HEADER_DST + install $ASROOT -c -m 444 $SRC/mach/$hdr $MACH_HEADER_DST done # special case because we only have one to do here @@ -133,7 +138,7 @@ for mig in $MIGS $MIGS_DUAL_PUBLIC_PRIVATE; do $FILTER_MIG $SRC/$filter $MIG_HEADER_OBJ/$MIG_NAME.h > 
$MIG_HEADER_OBJ/$MIG_NAME.tmp.h mv $MIG_HEADER_OBJ/$MIG_NAME.tmp.h $MIG_HEADER_OBJ/$MIG_NAME.h done - install -o 0 -c -m 444 $MIG_HEADER_OBJ/$MIG_NAME.h $MIG_HEADER_DST/$MIG_NAME.h + install $ASROOT -c -m 444 $MIG_HEADER_OBJ/$MIG_NAME.h $MIG_HEADER_DST/$MIG_NAME.h done mkdir -p $MIG_PRIVATE_HEADER_DST diff --git a/makedefs/MakeInc.cmd b/makedefs/MakeInc.cmd index 06e6a30e1..0619a3324 100644 --- a/makedefs/MakeInc.cmd +++ b/makedefs/MakeInc.cmd @@ -1,6 +1,6 @@ # -*- mode: makefile;-*- # -# Copyright (C) 1999-2012 Apple Inc. All rights reserved. +# Copyright (C) 1999-2016 Apple Inc. All rights reserved. # # MakeInc.cmd contains command paths for use during # the build, as well as make fragments and text @@ -211,18 +211,18 @@ _function_create_build_configs_join = $(strip $(1))^$(strip $(2))^$(strip $(3)) # $(3) is an un-expanded machine config from a TARGET_CONFIGS_UC tuple _function_create_build_configs_do_expand = $(call _function_create_build_configs_join, \ $(if $(filter DEFAULT,$(1)), \ - $(DEFAULT_KERNEL_CONFIG), \ + $(DEFAULT_KERNEL_CONFIG), \ $(1) \ ), \ $(if $(filter DEFAULT,$(2)), \ - $(DEFAULT_ARCH_CONFIG), \ + $(DEFAULT_ARCH_CONFIG), \ $(2) \ ), \ $(if $(filter DEFAULT,$(3)), \ - $(if $(filter DEFAULT,$(2)), \ - $(DEFAULT_$(DEFAULT_ARCH_CONFIG)_MACHINE_CONFIG), \ + $(if $(filter DEFAULT,$(2)), \ + $(DEFAULT_$(DEFAULT_ARCH_CONFIG)_MACHINE_CONFIG), \ $(DEFAULT_$(strip $(2))_MACHINE_CONFIG) \ - ), \ + ), \ $(3) \ ) \ ) @@ -231,20 +231,57 @@ _function_create_build_configs_do_expand = $(call _function_create_buil # 3 elements at a time function_create_build_configs = $(sort \ $(strip \ - $(call _function_create_build_configs_do_expand, \ - $(word 1,$(1)), \ - $(word 2,$(1)), \ - $(word 3,$(1)), \ + $(call _function_create_build_configs_do_expand, \ + $(word 1,$(1)), \ + $(word 2,$(1)), \ + $(word 3,$(1)), \ ) \ $(if $(word 4,$(1)), \ $(call function_create_build_configs, \ $(wordlist 4,$(words $(1)),$(1)) \ ), \ - \ ) \ ) \ ) +# Similar to build configs, but 
alias configs are a 4-tuple + +# $(1) is an expanded kernel config from a TARGET_CONFIGS_ALIASES_UC tuple +# $(2) is an expanded arch config from a TARGET_CONFIGS_ALIASES_UC tuple +# $(3) is an expanded kernel machine config from a TARGET_CONFIGS_ALIASES_UC tuple +# $(4) is an expanded SoC platform config from a TARGET_CONFIGS_ALIASES_UC tuple, +# which should be an alias of $(3) +_function_create_alias_configs_join = $(strip $(1))^$(strip $(2))^$(strip $(3))^$(strip $(4)) + +_function_create_alias_configs_do_expand = $(call _function_create_alias_configs_join, \ + $(if $(filter DEFAULT,$(1)), \ + $(DEFAULT_KERNEL_CONFIG), \ + $(1) \ + ), \ + $(if $(filter DEFAULT,$(2)), \ + $(DEFAULT_ARCH_CONFIG), \ + $(2) \ + ), \ + $(3), \ + $(4) \ + ) + +function_create_alias_configs = $(sort \ + $(strip \ + $(call _function_create_alias_configs_do_expand, \ + $(word 1,$(1)), \ + $(word 2,$(1)), \ + $(word 3,$(1)), \ + $(word 4,$(1)), \ + ) \ + $(if $(word 5,$(1)), \ + $(call function_create_alias_configs, \ + $(wordlist 5,$(words $(1)),$(1)) \ + ), \ + ) \ + ) \ + ) + # $(1) is a fully-expanded kernel config # $(2) is a fully-expanded arch config # $(3) is a fully-expanded machine config. 
"NONE" is not represented in the objdir path @@ -252,7 +289,7 @@ function_convert_target_config_uc_to_objdir = $(if $(filter NONE,$(3)),$(strip $ # $(1) is a fully-expanded build config (like "RELEASE^X86_64^NONE") function_convert_build_config_to_objdir = $(call function_convert_target_config_uc_to_objdir, \ - $(word 1,$(subst ^, ,$(1))), \ + $(word 1,$(subst ^, ,$(1))), \ $(word 2,$(subst ^, ,$(1))), \ $(word 3,$(subst ^, ,$(1))) \ ) @@ -280,7 +317,7 @@ space := $(empty) $(empty) # Arithmetic # $(1) is the number to increment -NUM32 = x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x +NUM32 = x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x increment = $(words x $(wordlist 1,$(1),$(NUM32))) decrement = $(words $(wordlist 2,$(1),$(NUM32))) diff --git a/makedefs/MakeInc.def b/makedefs/MakeInc.def index ae513f193..b05ec67ca 100644 --- a/makedefs/MakeInc.def +++ b/makedefs/MakeInc.def @@ -1,6 +1,6 @@ # -*- mode: makefile;-*- # -# Copyright (C) 1999-2014 Apple Inc. All rights reserved. +# Copyright (C) 1999-2016 Apple Inc. All rights reserved. # # MakeInc.def contains global definitions for building, # linking, and installing files. 
@@ -12,25 +12,24 @@ SUPPORTED_ARCH_CONFIGS := X86_64 X86_64H # -# Kernel Configuration options +# Kernel Configuration options # SUPPORTED_KERNEL_CONFIGS = RELEASE DEVELOPMENT DEBUG PROFILE # -# Machine Configuration options +# Machine Configuration options # SUPPORTED_X86_64_MACHINE_CONFIGS = NONE SUPPORTED_X86_64H_MACHINE_CONFIGS = NONE - # # Setup up *_LC variables during recursive invocations # ifndef CURRENT_ARCH_CONFIG_LC - export CURRENT_ARCH_CONFIG_LC := $(shell printf "%s" "$(CURRENT_ARCH_CONFIG)" | $(TR) A-Z a-z) + export CURRENT_ARCH_CONFIG_LC := $(shell printf "%s" "$(CURRENT_ARCH_CONFIG)" | $(TR) A-Z a-z) endif ifndef CURRENT_KERNEL_CONFIG_LC @@ -41,13 +40,12 @@ ifndef CURRENT_MACHINE_CONFIG_LC export CURRENT_MACHINE_CONFIG_LC := $(shell printf "%s" "$(CURRENT_MACHINE_CONFIG)" | $(TR) A-Z a-z) endif - # # Component List # -COMPONENT_LIST = osfmk bsd libkern iokit pexpert libsa security -COMPONENT = $(if $(word 2,$(subst /, ,$(RELATIVE_SOURCE_PATH))),$(word 2,$(subst /, ,$(RELATIVE_SOURCE_PATH))),$(firstword $(subst /, ,$(RELATIVE_SOURCE_PATH)))) -COMPONENT_IMPORT_LIST = $(filter-out $(COMPONENT),$(COMPONENT_LIST)) +COMPONENT_LIST = osfmk bsd libkern iokit pexpert libsa security +COMPONENT = $(if $(word 2,$(subst /, ,$(RELATIVE_SOURCE_PATH))),$(word 2,$(subst /, ,$(RELATIVE_SOURCE_PATH))),$(firstword $(subst /, ,$(RELATIVE_SOURCE_PATH)))) +COMPONENT_IMPORT_LIST = $(filter-out $(COMPONENT),$(COMPONENT_LIST)) # @@ -92,11 +90,37 @@ GENASSYM_KCC = $(CC) # CWARNFLAGS_STD = \ - -Wall -Werror -Wno-format-y2k -Wextra -Wstrict-prototypes \ + -Weverything -Werror -Wextra -Wstrict-prototypes \ -Wmissing-prototypes -Wpointer-arith -Wreturn-type -Wcast-qual \ -Wwrite-strings -Wswitch -Wshadow -Wcast-align -Wchar-subscripts \ -Winline -Wnested-externs -Wredundant-decls -Wextra-tokens \ - -Wunreachable-code + -Wunreachable-code \ + -Wno-assign-enum \ + -Wno-bad-function-cast \ + -Wno-c++98-compat \ + -Wno-c++-compat \ + -Wno-conditional-uninitialized \ + 
-Wno-conversion \ + -Wno-covered-switch-default \ + -Wno-disabled-macro-expansion \ + -Wno-documentation-unknown-command \ + -Wno-format-non-iso \ + -Wno-format-nonliteral \ + -Wno-reserved-id-macro \ + -Wno-language-extension-token \ + -Wno-missing-variable-declarations \ + -Wno-packed \ + -Wno-padded \ + -Wno-partial-availability \ + -Wno-pedantic \ + -Wno-shift-sign-overflow \ + -Wno-switch-enum \ + -Wno-undef \ + -Wno-unused-macros \ + -Wno-used-but-marked-unused \ + -Wno-variadic-macros \ + -Wno-vla \ + -Wno-zero-length-array # Can be overridden in Makefile.template or Makefile.$arch export CWARNFLAGS ?= $(CWARNFLAGS_STD) @@ -106,10 +130,40 @@ $(1)_CWARNFLAGS_ADD += $2 endef CXXWARNFLAGS_STD = \ - -Wall -Werror -Wno-format-y2k -Wextra -Wpointer-arith -Wreturn-type \ + -Weverything -Werror -Wextra -Wpointer-arith -Wreturn-type \ -Wcast-qual -Wwrite-strings -Wswitch -Wcast-align -Wchar-subscripts \ -Wredundant-decls -Wextra-tokens \ - -Wunreachable-code + -Wunreachable-code \ + -Wno-assign-enum \ + -Wno-bad-function-cast \ + -Wno-c++98-compat \ + -Wno-c++98-compat-pedantic \ + -Wno-c++-compat \ + -Wno-conditional-uninitialized \ + -Wno-conversion \ + -Wno-covered-switch-default \ + -Wno-disabled-macro-expansion \ + -Wno-documentation-unknown-command \ + -Wno-exit-time-destructors \ + -Wno-format-non-iso \ + -Wno-format-nonliteral \ + -Wno-global-constructors \ + -Wno-reserved-id-macro \ + -Wno-language-extension-token \ + -Wno-missing-variable-declarations \ + -Wno-old-style-cast \ + -Wno-packed \ + -Wno-padded \ + -Wno-partial-availability \ + -Wno-pedantic \ + -Wno-shift-sign-overflow \ + -Wno-switch-enum \ + -Wno-undef \ + -Wno-unused-macros \ + -Wno-used-but-marked-unused \ + -Wno-variadic-macros \ + -Wno-vla \ + -Wno-zero-length-array # overloaded-virtual warnings are non-fatal (9000888) CXXWARNFLAGS_STD += -Wno-error=overloaded-virtual @@ -128,6 +182,7 @@ ARCH_FLAGS_X86_64 = -arch x86_64 ARCH_FLAGS_X86_64H = -arch x86_64h + # # Default CFLAGS # @@ -154,13 
+209,13 @@ CFLAGS_GEN = $(DEBUG_CFLAGS) -nostdinc \ -fno-builtin -fno-common \ -fsigned-bitfields $(OTHER_CFLAGS) -CFLAGS_RELEASE = -CFLAGS_DEVELOPMENT = -CFLAGS_DEBUG = -CFLAGS_PROFILE = -pg +CFLAGS_RELEASE = +CFLAGS_DEVELOPMENT = +CFLAGS_DEBUG = +CFLAGS_PROFILE = -pg CFLAGS_X86_64 = -Dx86_64 -DX86_64 -D__X86_64__ -DLP64 \ - -DPAGE_SIZE_FIXED -mkernel -msoft-float + -DPAGE_SIZE_FIXED -mkernel -msoft-float CFLAGS_X86_64H = $(CFLAGS_X86_64) @@ -183,7 +238,7 @@ CFLAGS_DEBUGARM = -O0 CFLAGS_PROFILEARM = -O2 -CFLAGS = $(CFLAGS_GEN) \ +CFLAGS = $(CFLAGS_GEN) \ $($(addsuffix $(CURRENT_MACHINE_CONFIG),MACHINE_FLAGS_$(CURRENT_ARCH_CONFIG)_)) \ $($(addsuffix $(CURRENT_ARCH_CONFIG),ARCH_FLAGS_)) \ $($(addsuffix $(CURRENT_ARCH_CONFIG),CFLAGS_)) \ @@ -199,17 +254,12 @@ CFLAGS = $(CFLAGS_GEN) \ OTHER_CXXFLAGS = -CXXFLAGS_GEN = -fapple-kext $(OTHER_CXXFLAGS) - -# For the moment, do not use gnu++11 -#CXXFLAGS_ARM = -std=gnu++11 - +CXXFLAGS_GEN = -std=gnu++11 -fapple-kext $(OTHER_CXXFLAGS) CXXFLAGS = $(CXXFLAGS_GEN) \ $($(addsuffix $(CURRENT_ARCH_CONFIG),CXXFLAGS_)) \ $($(addsuffix $(CURRENT_KERNEL_CONFIG),CXXFLAGS_)) - # # Assembler command # @@ -221,15 +271,15 @@ S_KCC = $(CC) # SFLAGS_GEN = -D__ASSEMBLER__ -DASSEMBLER $(OTHER_CFLAGS) -SFLAGS_RELEASE = -SFLAGS_DEVELOPMENT = -SFLAGS_DEBUG = -SFLAGS_PROFILE = +SFLAGS_RELEASE = +SFLAGS_DEVELOPMENT = +SFLAGS_DEBUG = +SFLAGS_PROFILE = -SFLAGS_X86_64 = $(CFLAGS_X86_64) -SFLAGS_X86_64H = $(CFLAGS_X86_64H) +SFLAGS_X86_64 = $(CFLAGS_X86_64) +SFLAGS_X86_64H = $(CFLAGS_X86_64H) -SFLAGS = $(SFLAGS_GEN) \ +SFLAGS = $(SFLAGS_GEN) \ $($(addsuffix $(CURRENT_MACHINE_CONFIG),MACHINE_FLAGS_$(CURRENT_ARCH_CONFIG)_)) \ $($(addsuffix $(CURRENT_ARCH_CONFIG),ARCH_FLAGS_)) \ $($(addsuffix $(CURRENT_ARCH_CONFIG),SFLAGS_)) \ @@ -257,9 +307,11 @@ LDFLAGS_KERNEL_GEN = \ -Wl,-sectalign,__TEXT,__text,0x1000 \ -Wl,-sectalign,__DATA,__common,0x1000 \ -Wl,-sectalign,__DATA,__bss,0x1000 \ - -Wl,-sectcreate,__PRELINK_TEXT,__text,/dev/null \ - 
-Wl,-sectcreate,__PRELINK_STATE,__kernel,/dev/null \ - -Wl,-sectcreate,__PRELINK_STATE,__kexts,/dev/null \ + -Wl,-sectcreate,__PRELINK_TEXT,__text,/dev/null \ + -Wl,-sectcreate,"__PLK_TEXT_EXEC",__text,/dev/null \ + -Wl,-sectcreate,__PRELINK_DATA,__data,/dev/null \ + -Wl,-sectcreate,"__PLK_DATA_CONST",__data,/dev/null \ + -Wl,-sectcreate,"__PLK_LINKEDIT",__data,/dev/null \ -Wl,-sectcreate,__PRELINK_INFO,__info,/dev/null \ -Wl,-new_linker \ -Wl,-pagezero_size,0x0 \ @@ -267,10 +319,12 @@ LDFLAGS_KERNEL_GEN = \ -Wl,-function_starts \ -Wl,-headerpad,152 -LDFLAGS_KERNEL_RELEASE = -LDFLAGS_KERNEL_DEVELOPMENT = -LDFLAGS_KERNEL_DEBUG = -LDFLAGS_KERNEL_PROFILE = +LDFLAGS_KERNEL_SDK = -L$(SDKROOT)/usr/local/lib/kernel -lfirehose_kernel + +LDFLAGS_KERNEL_RELEASE = +LDFLAGS_KERNEL_DEVELOPMENT = +LDFLAGS_KERNEL_DEBUG = +LDFLAGS_KERNEL_PROFILE = # KASLR static slide config: ifndef SLIDE @@ -292,8 +346,6 @@ LDFLAGS_KERNEL_RELEASEX86_64 = \ -Wl,-segaddr,__HIB,$(KERNEL_HIB_SECTION_BASE) \ -Wl,-image_base,$(KERNEL_TEXT_BASE) \ -Wl,-seg_page_size,__TEXT,0x200000 \ - -Wl,-sectalign,__DATA,__const,0x1000 \ - -Wl,-sectalign,__DATA,__sysctl_set,0x1000 \ -Wl,-sectalign,__HIB,__bootPT,0x1000 \ -Wl,-sectalign,__HIB,__desc,0x1000 \ -Wl,-sectalign,__HIB,__data,0x1000 \ @@ -305,6 +357,8 @@ LDFLAGS_KERNEL_RELEASEX86_64 = \ -Wl,-sectalign,__HIB,__llvm_prf_names,0x1000 \ -Wl,-sectalign,__HIB,__llvm_prf_data,0x1000 \ -Wl,-sectalign,__HIB,__textcoal_nt,0x1000 \ + -Wl,-rename_section,__DATA,__const,__CONST,__constdata \ + -Wl,-no_zero_fill_sections \ $(LDFLAGS_NOSTRIP_FLAG) # Define KERNEL_BASE_OFFSET so known at compile time: @@ -322,6 +376,7 @@ LDFLAGS_KERNEL_PROFILEX86_64H = $(LDFLAGS_KERNEL_RELEASEX86_64H) LDFLAGS_KERNEL = $(LDFLAGS_KERNEL_GEN) \ + $(LDFLAGS_KERNEL_SDK) \ $($(addsuffix $(CURRENT_ARCH_CONFIG),ARCH_FLAGS_)) \ $($(addsuffix $(CURRENT_ARCH_CONFIG),LDFLAGS_KERNEL_)) \ $($(addsuffix $(CURRENT_KERNEL_CONFIG),LDFLAGS_KERNEL_)) \ @@ -355,12 +410,13 @@ endif # # Default INCFLAGS # 
-INCFLAGS_IMPORT = $(patsubst %, -I$(OBJROOT)/EXPORT_HDRS/%, $(COMPONENT_IMPORT_LIST)) -INCFLAGS_EXTERN = -I$(SRCROOT)/EXTERNAL_HEADERS +INCFLAGS_IMPORT = $(patsubst %, -I$(OBJROOT)/EXPORT_HDRS/%, $(COMPONENT_IMPORT_LIST)) +INCFLAGS_EXTERN = -I$(SRCROOT)/EXTERNAL_HEADERS INCFLAGS_GEN = -I$(SRCROOT)/$(COMPONENT) -I$(OBJROOT)/EXPORT_HDRS/$(COMPONENT) INCFLAGS_LOCAL = -I. +INCFLAGS_SDK = -I$(SDKROOT)/usr/local/include/kernel -INCFLAGS = $(INCFLAGS_LOCAL) $(INCFLAGS_GEN) $(INCFLAGS_IMPORT) $(INCFLAGS_EXTERN) $(INCFLAGS_MAKEFILE) +INCFLAGS = $(INCFLAGS_LOCAL) $(INCFLAGS_GEN) $(INCFLAGS_IMPORT) $(INCFLAGS_EXTERN) $(INCFLAGS_MAKEFILE) $(INCFLAGS_SDK) # # Default MIGFLAGS @@ -368,7 +424,6 @@ INCFLAGS = $(INCFLAGS_LOCAL) $(INCFLAGS_GEN) $(INCFLAGS_IMPORT) $(INCFLAGS_EXTE MIGFLAGS = $(DEFINES) $(INCFLAGS) -novouchers $($(addsuffix $(CURRENT_ARCH_CONFIG),CFLAGS_)) $($(addsuffix $(CURRENT_ARCH_CONFIG),ARCH_FLAGS_)) \ $(DEPLOYMENT_TARGET_FLAGS) - # Support for LLVM Profile Guided Optimization (PGO) ifeq ($(BUILD_PROFILE),1) @@ -413,7 +468,7 @@ SUPPORTS_CTFCONVERT = 0 ifeq ($(USE_LTO),1) CFLAGS_GEN += -flto CXXFLAGS_GEN += -flto -LDFLAGS_KERNEL_GEN += -Wl,-mllvm,-inline-threshold=125 -Wl,-object_path_lto,$(TARGET)/lto.o # -Wl,-mllvm -Wl,-disable-fp-elim +LDFLAGS_KERNEL_GEN += -Wl,-mllvm,-inline-threshold=125 -Wl,-object_path_lto,$(TARGET)/lto.o # -Wl,-mllvm -Wl,-disable-fp-elim LDFLAGS_NOSTRIP_FLAG = -rdynamic CFLAGS_NOLTO_FLAG = -fno-lto NEEDS_CTF_MACHOS = 1 @@ -446,7 +501,7 @@ export VPATH = .:$(SOURCE) # Macros that control installation of kernel and its header files # # install flags for header files -# +# INSTALL_FLAGS = -c -S -m 0444 DATA_INSTALL_FLAGS = -c -S -m 0644 EXEC_INSTALL_FLAGS = -c -S -m 0755 @@ -493,12 +548,12 @@ EXPDIR = EXPORT_HDRS/$(COMPONENT) # # Strip Flags # -STRIP_FLAGS_RELEASE = -S -x -STRIP_FLAGS_DEVELOPMENT = -S -STRIP_FLAGS_DEBUG = -S +STRIP_FLAGS_RELEASE = -S -x +STRIP_FLAGS_DEVELOPMENT = -S +STRIP_FLAGS_DEBUG = -S STRIP_FLAGS_PROFILE = -S -x 
-STRIP_FLAGS = $($(addsuffix $(CURRENT_KERNEL_CONFIG),STRIP_FLAGS_)) +STRIP_FLAGS = $($(addsuffix $(CURRENT_KERNEL_CONFIG),STRIP_FLAGS_)) # # dsymutil flags @@ -554,6 +609,12 @@ KERNEL_FILE_NAME = $(KERNEL_FILE_NAME_PREFIX).$(CURRENT_KERNEL_CONFIG_LC).$(CURR KERNEL_LLDBBOOTSTRAP_NAME = $(KERNEL_FILE_NAME_PREFIX)_$(CURRENT_KERNEL_CONFIG_LC).py endif +CURRENT_ALIAS_MACHINE_CONFIG = $(word 4,$(subst ^, ,$(CURRENT_BUILD_CONFIG))) +CURRENT_ALIAS_MACHINE_CONFIG_LC = $(shell printf "%s" "$(CURRENT_ALIAS_MACHINE_CONFIG)" | $(TR) A-Z a-z) +ifneq ($(CURRENT_ALIAS_MACHINE_CONFIG),) +ALIAS_FILE_NAME = $(KERNEL_FILE_NAME_PREFIX).$(CURRENT_KERNEL_CONFIG_LC).$(CURRENT_ALIAS_MACHINE_CONFIG_LC) +endif + # # System.kext pseudo-kext install location # @@ -569,6 +630,7 @@ INSTALL_KERNEL_SYM_DIR = /System/Library/Extensions/KDK # INSTALL_SHARE_MISC_DIR = /usr/share/misc INSTALL_DTRACE_SCRIPTS_DIR = /usr/lib/dtrace +INSTALL_DTRACE_LIBEXEC_DIR = /usr/libexec/dtrace # # Overrides for XBS build aliases diff --git a/makedefs/MakeInc.dir b/makedefs/MakeInc.dir index 9ecadbdf1..ec79505e4 100644 --- a/makedefs/MakeInc.dir +++ b/makedefs/MakeInc.dir @@ -1,6 +1,6 @@ # -*- mode: makefile;-*- # -# Copyright (C) 1999-2012 Apple Inc. All rights reserved. +# Copyright (C) 1999-2016 Apple Inc. All rights reserved. # # MakeInc.dir contains the recursion rules for the build system. 
# For instance, the "build_installhdrs_md" target is auto-generated @@ -24,23 +24,23 @@ $(1)_recurse_target_list = $$(addprefix $(1)_recurse_into_,$(2)) .PHONY: $$($(1)_recurse_target_list) $$($(1)_recurse_target_list): - $$(_v)$$(MKDIR) $$(CURDIR)/$$(patsubst $(1)_recurse_into_%,%,$$@) - $$(_v)$${MAKE} -C $$(CURDIR)/$$(patsubst $(1)_recurse_into_%,%,$$@) \ - -f $$(SOURCE)$$(patsubst $(1)_recurse_into_%,%,$$@)/Makefile \ + $$(_v)$$(MKDIR) "$$(CURDIR)/$$(patsubst $(1)_recurse_into_%,%,$$@)" + $$(_v)$${MAKE} -C "$$(CURDIR)/$$(patsubst $(1)_recurse_into_%,%,$$@)" \ + -f "$$(SOURCE)$$(patsubst $(1)_recurse_into_%,%,$$@)/Makefile" \ CURRENT_KERNEL_CONFIG=$${CURRENT_KERNEL_CONFIG} \ CURRENT_ARCH_CONFIG=$${CURRENT_ARCH_CONFIG} \ CURRENT_MACHINE_CONFIG=$${CURRENT_MACHINE_CONFIG} \ CURRENT_BUILD_CONFIG=$${CURRENT_BUILD_CONFIG} \ - SOURCE=$$(SOURCE)$$(patsubst $(1)_recurse_into_%,%,$$@)/ \ - RELATIVE_SOURCE_PATH=$$(RELATIVE_SOURCE_PATH)/$$(patsubst $(1)_recurse_into_%,%,$$@) \ + SOURCE="$$(SOURCE)$$(patsubst $(1)_recurse_into_%,%,$$@)/" \ + RELATIVE_SOURCE_PATH="$$(RELATIVE_SOURCE_PATH)/$$(patsubst $(1)_recurse_into_%,%,$$@)" \ TARGET=$(if $(4),$${OBJPATH}/$$(COMPONENT),$$(TARGET)$$(patsubst $(1)_recurse_into_%,%,$$@)/) \ - OBJPATH=$${OBJPATH} \ + OBJPATH=$${OBJPATH} \ $(1); .PHONY: $(1) $(1): $$($(1)_recurse_target_list) - $$(_v)$${MAKE} -C $$(CURDIR) \ + $$(_v)$${MAKE} -C "$$(CURDIR)" \ -f $$(firstword $$(MAKEFILE_LIST)) \ CURRENT_KERNEL_CONFIG=$${CURRENT_KERNEL_CONFIG} \ CURRENT_ARCH_CONFIG=$${CURRENT_ARCH_CONFIG} \ @@ -49,7 +49,7 @@ $(1): $$($(1)_recurse_target_list) SOURCE=$$(SOURCE) \ RELATIVE_SOURCE_PATH=$$(RELATIVE_SOURCE_PATH) \ TARGET=$$(TARGET) \ - OBJPATH=$${OBJPATH} \ + OBJPATH=$${OBJPATH} \ $(3); endef @@ -59,22 +59,22 @@ endef $(eval $(call RECURSIVE_BUILD_RULES_template,build_setup,$(SETUP_SUBDIRS),do_build_setup,)) # -# Install machine independent kernel header files +# Install machine independent kernel header files # $(eval $(call 
RECURSIVE_BUILD_RULES_template,build_installhdrs_mi,$(INSTINC_SUBDIRS),do_installhdrs_mi,)) # -# Install machine dependent kernel header files +# Install machine dependent kernel header files # $(eval $(call RECURSIVE_BUILD_RULES_template,build_installhdrs_md,$(INSTINC_SUBDIRS_$(CURRENT_ARCH_CONFIG)),do_installhdrs_md,)) # -# Install machine independent kernel header files +# Install machine independent kernel header files # $(eval $(call RECURSIVE_BUILD_RULES_template,build_exporthdrs_mi,$(EXPINC_SUBDIRS),do_exporthdrs_mi,)) # -# Install machine dependent kernel header files +# Install machine dependent kernel header files # $(eval $(call RECURSIVE_BUILD_RULES_template,build_exporthdrs_md,$(EXPINC_SUBDIRS_$(CURRENT_ARCH_CONFIG)),do_exporthdrs_md,)) diff --git a/makedefs/MakeInc.kernel b/makedefs/MakeInc.kernel index 3b6014fee..0885b3fab 100644 --- a/makedefs/MakeInc.kernel +++ b/makedefs/MakeInc.kernel @@ -1,6 +1,6 @@ # -*- mode: makefile;-*- # -# Copyright (C) 1999-2012 Apple Inc. All rights reserved. +# Copyright (C) 1999-2016 Apple Inc. All rights reserved. 
# # MakeInc.kernel augments the single-architecture # recursive build system with rules specific @@ -56,7 +56,7 @@ endif .PHONY: do_build_kernel_dSYM -do_build_kernel_dSYM: $(TARGET)/$(KERNEL_FILE_NAME).dSYM +do_build_kernel_dSYM: $(TARGET)/$(KERNEL_FILE_NAME).dSYM @: .LDFLAGS: ALWAYS @@ -65,30 +65,30 @@ do_build_kernel_dSYM: $(TARGET)/$(KERNEL_FILE_NAME).dSYM $(_v)$(REPLACECONTENTS) $@ $(KCC) $(CFLAGS) $(INCFLAGS) $(TARGET)/$(KERNEL_FILE_NAME): $(TARGET)/$(KERNEL_FILE_NAME).unstripped - @echo STRIP $(@F) + @echo "$(ColorH)STRIP$(Color0) $(ColorLF)$(@F)$(Color0)" $(_v)$(STRIP) $(STRIP_FLAGS) $< -o $@ $(_v)$(RM) $@.ctfdata ifeq ($(DO_CTFMERGE),1) - @echo CTFMERGE $(@F) - $(_v)$(FIND) $(TARGET)/ -name \*.ctf -size +0 | \ + @echo "$(ColorH)CTFMERGE$(Color0) $(ColorLF)$(@F)$(Color0)" + $(_v)$(FIND) $(TARGET)/ -name \*.ctf -size +0 | \ $(XARGS) $(CTFMERGE) -l xnu -o $@ -Z $@.ctfdata || true endif $(_v)if [ -s $@.ctfdata ]; then \ - echo CTFINSERT $(@F); \ - $(CTFINSERT) $@ $(ARCH_FLAGS_$(CURRENT_ARCH_CONFIG)) \ + echo "$(ColorH)CTFINSERT$(Color0) $(ColorLF)$(@F)$(Color0)"; \ + $(CTFINSERT) $@ $(ARCH_FLAGS_$(CURRENT_ARCH_CONFIG)) \ $@.ctfdata -o $@; \ fi; $(_v)$(LN) $(call function_convert_build_config_to_objdir,$(CURRENT_BUILD_CONFIG))/$(KERNEL_FILE_NAME) $(OBJROOT)/$(KERNEL_FILE_NAME) $(TARGET)/$(KERNEL_FILE_NAME).dSYM: $(TARGET)/$(KERNEL_FILE_NAME).unstripped - $(_v)echo DSYMUTIL $(@F) + $(_v)echo "$(ColorH)DSYMUTIL$(Color0) $(ColorLF)$(@F)$(Color0)" $(_v)$(DSYMUTIL) $(DSYMUTIL_FLAGS) $< -o $@ $(_v)$(MV) $@/$(DSYMDWARFDIR)/$(KERNEL_FILE_NAME).unstripped $@/$(DSYMDWARFDIR)/$(KERNEL_FILE_NAME) $(_v)$(TOUCH) $@ $(TARGET)/$(KERNEL_FILE_NAME).unstripped: $(addprefix $(TARGET)/,$(foreach component,$(COMPONENT_LIST),$(component)/$(CURRENT_KERNEL_CONFIG)/$(component).filelist)) lastkerneldataconst.o lastkernelconstructor.o $(SRCROOT)/config/version.c $(SRCROOT)/config/MasterVersion .LDFLAGS $(filter %/MakeInc.kernel,$(MAKEFILE_LIST)) $(_v)${MAKE} -f $(firstword 
$(MAKEFILE_LIST)) version.o - @echo LD $(@F) + @echo "$(ColorL)LD$(Color0) $(ColorLF)$(@F)$(Color0)" $(_v)$(CAT) $(filter %.filelist,$+) < /dev/null > link.filelist $(_v)$(LD) $(LDFLAGS_KERNEL) -filelist link.filelist version.o $(filter %.o,$+) -o $@ $(LD_KERNEL_LIBS) @@ -129,7 +129,7 @@ lastkernelconstructor.o: $(SRCROOT)/libsa/lastkernelconstructor.c ${C_RULE_3} ${C_RULE_4} $(_v)for last_file in ${LAST_FILES}; \ - do \ + do \ $(SEG_HACK) -s __DATA -n __LAST -o $${last_file}__ $${last_file} || exit 1; \ mv $${last_file}__ $${last_file} || exit 1; \ done @@ -184,11 +184,11 @@ do_install_xnu_debug_files: $(DSTROOT)/$(DEVELOPER_EXTRAS_DIR)/README.DEBUG-kern $(DSTROOT)/$(INSTALL_KERNEL_DIR)/$(KERNEL_FILE_NAME): $(TARGET)/$(KERNEL_FILE_NAME) ALWAYS $(_v)$(MKDIR) $(dir $@) $(_v)if [ $(OBJROOT)/.mach_kernel.timestamp -nt $@ ]; then \ - echo INSTALL $(@F) "($(CURRENT_ARCH_CONFIG_LC) $(CURRENT_MACHINE_CONFIG_LC))"; \ + echo "$(ColorH)INSTALL$(Color0) $(ColorF)$(@F)$(Color0) \"($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0) $(ColorLF)$(CURRENT_MACHINE_CONFIG_LC)$(Color0))\""; \ $(INSTALL) $(EXEC_INSTALL_FLAGS) $< $@; \ cmdstatus=$$?; \ else \ - echo INSTALL $(@F) "($(CURRENT_ARCH_CONFIG_LC) $(CURRENT_MACHINE_CONFIG_LC))"; \ + echo "$(ColorH)INSTALL$(Color0) $(ColorF)$(@F)$(Color0) \"($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0) $(ColorLF)$(CURRENT_MACHINE_CONFIG_LC)$(Color0))\""; \ $(LIPO) -create $@ $< -output $@; \ cmdstatus=$$?; \ fi; \ @@ -197,11 +197,11 @@ $(DSTROOT)/$(INSTALL_KERNEL_DIR)/$(KERNEL_FILE_NAME): $(TARGET)/$(KERNEL_FILE_NA $(SYMROOT)/$(KERNEL_FILE_NAME): $(TARGET)/$(KERNEL_FILE_NAME).unstripped ALWAYS $(_v)$(MKDIR) $(dir $@) $(_v)if [ $(OBJROOT)/.mach_kernel.timestamp -nt $@ ]; then \ - echo INSTALLSYM $(@F) "($(CURRENT_ARCH_CONFIG_LC))"; \ + echo "$(ColorH)INSTALLSYM$(Color0) $(ColorF)$(@F)$(Color0) \"($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0))\""; \ $(INSTALL) $(EXEC_INSTALL_FLAGS) $< $@; \ cmdstatus=$$?; \ else \ - echo INSTALLSYM $(@F) 
"($(CURRENT_ARCH_CONFIG_LC))"; \ + echo "$(ColorH)INSTALLSYM$(Color0) $(ColorF)$(@F)$(Color0) \"($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0))\""; \ $(LIPO) -create $@ $< -output $@; \ cmdstatus=$$?; \ fi; \ @@ -209,35 +209,35 @@ $(SYMROOT)/$(KERNEL_FILE_NAME): $(TARGET)/$(KERNEL_FILE_NAME).unstripped ALWAYS $(SYMROOT)/$(KERNEL_FILE_NAME).dSYM/$(DSYMLLDBMACROSDIR)/lldbmacros $(DSTROOT)/$(INSTALL_KERNEL_SYM_DIR)/$(KERNEL_FILE_NAME).dSYM/$(DSYMLLDBMACROSDIR)/lldbmacros: $(TARGET)/$(KERNEL_FILE_NAME).dSYM/$(DSYMLLDBMACROSDIR)/lldbmacros $(_v)$(MKDIR) $(dir $@) - @echo INSTALLMACROS $(@F) "($(CURRENT_ARCH_CONFIG_LC))" + @echo "$(ColorH)INSTALLMACROS$(Color0) $(ColorF)$(@F)$(Color0) \"($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0))\"" $(_v)$(CP) -r $< $(dir $@) $(_v)$(TOUCH) $@ $(SYMROOT)/$(KERNEL_FILE_NAME).dSYM/$(DSYMLLDBMACROSDIR)/$(KERNEL_LLDBBOOTSTRAP_NAME) $(DSTROOT)/$(INSTALL_KERNEL_SYM_DIR)/$(KERNEL_FILE_NAME).dSYM/$(DSYMLLDBMACROSDIR)/$(KERNEL_LLDBBOOTSTRAP_NAME): $(TARGET)/$(KERNEL_FILE_NAME).dSYM/$(DSYMLLDBMACROSDIR)/$(KERNEL_LLDBBOOTSTRAP_NAME) $(_v)$(MKDIR) $(dir $@) - @echo INSTALLMACROS $(@F) "($(CURRENT_ARCH_CONFIG_LC))" + @echo "$(ColorH)INSTALLMACROS$(Color0) $(ColorF)$(@F)$(Color0) \"($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0))\"" $(_v)$(INSTALL) $(INSTALL_FLAGS) $< $@ $(DSTROOT)/$(DEVELOPER_EXTRAS_DIR)/README.DEBUG-kernel.txt: $(SRCROOT)/config/README.DEBUG-kernel.txt $(_v)$(MKDIR) $(dir $@) - @echo INSTALL $(@F) + @echo "$(ColorH)INSTALL$(Color0) $(ColorF)$(@F)$(Color0)" $(_v)$(INSTALL) $(INSTALL_FLAGS) $< $@ $(SYMROOT)/$(KERNEL_FILE_NAME).dSYM/$(DSYMINFODIR)/Info.plist $(DSTROOT)/$(INSTALL_KERNEL_SYM_DIR)/$(KERNEL_FILE_NAME).dSYM/$(DSYMINFODIR)/Info.plist: $(TARGET)/$(KERNEL_FILE_NAME).dSYM/$(DSYMINFODIR)/Info.plist $(_v)$(MKDIR) $(dir $@) - @echo INSTALLSYM dSYM $(@F) "($(CURRENT_ARCH_CONFIG_LC))" + @echo "$(ColorH)INSTALLSYM$(Color0) $(ColorL)dSYM$(Color0) $(ColorF)$(@F)$(Color0) \"($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0))\"" 
$(_v)$(INSTALL) $(INSTALL_FLAGS) $< $@ $(SYMROOT)/$(KERNEL_FILE_NAME).dSYM/$(DSYMDWARFDIR)/$(KERNEL_FILE_NAME) $(DSTROOT)/$(INSTALL_KERNEL_SYM_DIR)/$(KERNEL_FILE_NAME).dSYM/$(DSYMDWARFDIR)/$(KERNEL_FILE_NAME): $(TARGET)/$(KERNEL_FILE_NAME).dSYM/$(DSYMDWARFDIR)/$(KERNEL_FILE_NAME) ALWAYS $(_v)$(MKDIR) $(dir $@) $(_v)if [ $(OBJROOT)/.mach_kernel.timestamp -nt $@ ]; then \ - echo INSTALLSYM dSYM $(@F).dSYM "($(CURRENT_ARCH_CONFIG_LC))"; \ - $(INSTALL) $(EXEC_INSTALL_FLAGS) $< $@; \ - cmdstatus=$$?; \ + echo "$(ColorH)INSTALLSYM$(Color0) $(ColorL)dSYM$(Color0) $(ColorF)$(@F).dSYM$(ColorF) \"($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0))\""; \ + $(INSTALL) $(EXEC_INSTALL_FLAGS) $< $@; \ + cmdstatus=$$?; \ else \ - echo INSTALLSYM dSYM $(@F).dSYM "($(CURRENT_ARCH_CONFIG_LC))"; \ - $(LIPO) -create $@ $< -output $@; \ - cmdstatus=$$?; \ + echo "$(ColorH)INSTALLSYM$(Color0) $(ColorL)dSYM$(Color0) $(ColorF)$(@F).dSYM$(ColorF) \"($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0))\""; \ + $(LIPO) -create $@ $< -output $@; \ + cmdstatus=$$?; \ fi; \ exit $$cmdstatus @@ -290,5 +290,11 @@ ifeq ($(USE_BINARY_PLIST),1) $(_v)$(PLUTIL) -convert binary1 -o $@ $@ endif +$(DSTROOT)/$(INSTALL_KERNEL_DIR)/$(ALIAS_FILE_NAME): ALWAYS + $(_v)echo "$(ColorH)ALIAS$(Color0) $(ColorF)$(@F)$(Color0) \"($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0) $(ColorLF)$(CURRENT_MACHINE_CONFIG_LC)$(Color0) $(ColorLF)$(CURRENT_ALIAS_MACHINE_CONFIG_LC)$(Color0))\"" + $(_v)$(INSTALL) $(EXEC_INSTALL_FLAGS) $(DSTROOT)/$(INSTALL_KERNEL_DIR)/$(KERNEL_FILE_NAME) $@ + +install_alias: $(DSTROOT)/$(INSTALL_KERNEL_DIR)/$(ALIAS_FILE_NAME) + print_exports: $(_v)printenv | sort diff --git a/makedefs/MakeInc.rule b/makedefs/MakeInc.rule index 13371fd5c..8c1c5d0c8 100644 --- a/makedefs/MakeInc.rule +++ b/makedefs/MakeInc.rule @@ -1,6 +1,6 @@ # -*- mode: makefile;-*- # -# Copyright (C) 1999-2012 Apple Inc. All rights reserved. +# Copyright (C) 1999-2016 Apple Inc. All rights reserved. 
# # MakeInc.rule defines the targets and rules for # leaf directories once MakeInc.dir has recursed @@ -45,6 +45,47 @@ ifndef INSTALL_KF_MD_GEN_LIST INSTALL_KF_MD_GEN_LIST = $(EXPORT_MD_GEN_LIST) endif +ifeq (${XNU_LOGCOLORS},y) + LOGCOLORS ?= y +endif + +ifeq ($(LOGCOLORS),y) + # Get a nice list of device code names associated with the build platform + ifndef CDevs + #ifdef EMBEDDED_DEVICE_MAP + # export CDevs := $(shell $(EMBEDDED_DEVICE_MAP) -db $(EDM_DBPATH) -query "SELECT DISTINCT TargetType FROM Targets WHERE KernelPlatform = '$(CURRENT_MACHINE_CONFIG_LC)'" | tr '[\r\n]' ':' | sed 's,:$$,,') + #endif + endif + ifndef CMD_MC + export _MACHINE := $(CURRENT_MACHINE_CONFIG_LC) + ifeq ($(CURRENT_MACHINE_CONFIG),NONE) + export _MACHINE := $(subst Mac,,$(PLATFORM)) + endif + export CMD_MC := \\033[1m$(shell __A="$(CURRENT_ARCH_CONFIG_LC)"; \ + __As=$$((6-$${\#__A})); \ + printf "%-.6s%*.*s %9.9s" \ + "$${__A}" \ + $${__As} $${__As} " " \ + "$(_MACHINE)")\\033[m + endif + # Turn off colored output + Color0=\\033[m + # Start a host command: bold, underlined pink text + ColorH=\\033[1;4;35m + # Start a compilation-related command: bold, underlined blue text + ColorC=[$(CMD_MC)] \\033[1;4;34m + # Start a MIG command: bold, green text on light grey background + ColorM=[$(CMD_MC)] \\033[1;32;40m + # Start a linking command: bold, white text on blue background + ColorL=[$(CMD_MC)] \\033[1;37;44m + # Start a filename: bold, white text + ColorF=\\033[1;37m + # Start a linked file name: yellow text on light grey background + ColorLF=\\033[1;33;40m + # Error strings: underlined bold white text on red background + ColorErr=\033[1;4;37;41m +endif + .PHONY: ALWAYS ALWAYS: @@ -71,7 +112,7 @@ $(3)/.UNIFDEF_FLAGS: ALWAYS | $(3)_MKDIR $$(_v)$$(REPLACECONTENTS) $$@ $$(UNIFDEF) $(4) $(1): $(dir $(firstword $(1)))% : $(if $(2),%,$$(SOURCE)/%) | $(3)_MKDIR - @echo INSTALLHDR $$* + @echo "$$(ColorH)INSTALLHDR$(Color0) $$(ColorF)$$*$$(Color0)" $$(_v)$$(UNIFDEF) $(4) $$< > 
./$(3)/$$*.unifdef.$$$$$$$$; \ if [ $$$$? -eq 2 ]; then \ echo Parse failure for $$<; \ @@ -211,7 +252,7 @@ do_exporthdrs_md: $(EXPORT_MD_GEN_INC_FILES) $(EXPORT_MD_INC_FILES) # Compilation rules to generate .o from .s # -S_RULE_0=@echo AS $@ +S_RULE_0=@echo "$(ColorC)AS$(Color0) $(ColorF)$@$(Color0)" S_RULE_1A=$(_v)${S_KCC} -c ${SFLAGS} -MD -MF $(@:o=d) -MP ${$@_SFLAGS_ADD} ${INCFLAGS} ${$@_INCFLAGS} S_RULE_1B=$( /dev/null || true; fi +C_RULE_2=$(_v)if [ -z "${$@_SKIP_CTFCONVERT}" ]; then \ + ctferr=`${CTFCONVERT} -l xnu -v -o $@.ctf $@ 2>&1 > /dev/null || true`; \ + if [ ! -z "$${ctferr}" ]; then \ + echo "[$(CMD_MC)] $(ColorErr)$@$(Color0) $(ColorErr)$${ctferr}$(Color0)"; \ + fi; \ + fi else C_RULE_2= endif ifeq ($(DO_CTFMACHO), 1) -C_CTFRULE_1A=$(_v)${KCC} -o $@.non_lto -c ${filter-out ${$@_CFLAGS_RM}, ${CFLAGS} ${CWARNFLAGS}} ${$@_CFLAGS_ADD} ${$@_CWARNFLAGS_ADD} ${INCFLAGS} ${$@_INCFLAGS} $(CFLAGS_NOLTO_FLAG) +C_CTFRULE_1A=$(_v)${KCC} -o $@.non_lto -c ${filter-out ${$@_CFLAGS_RM}, ${CFLAGS} ${CWARNFLAGS}} ${$@_CFLAGS_ADD} ${$@_CWARNFLAGS_ADD} ${INCFLAGS} $(CFLAGS_NOLTO_FLAG) ${$@_INCFLAGS} C_CTFRULE_1B=$( /dev/null || true; fi +C_CTFRULE_2=$(_v)if [ -z "${$@_SKIP_CTFCONVERT}" ]; then \ + ctferr=`${CTFCONVERT} -l xnu -v -o $@.non_lto.ctf $@.non_lto 2>&1 > /dev/null || true`; \ + if [ ! 
-z "$${ctferr}" ]; then \ + echo "[$(CMD_MC)] $(ColorErr)$@$(Color0) $(ColorErr)$${ctferr}$(Color0)"; \ + fi; \ + fi else C_CTFRULE_1A=@true C_CTFRULE_1B= C_CTFRULE_2=@true endif +C_RULE_3=@true +C_RULE_4A=@true +C_RULE_4B= + # # Compilation rules to generate .o from .c for driver files # @@ -246,22 +301,29 @@ C_RULE_0_D=${C_RULE_0} C_RULE_1A_D=${C_RULE_1A} C_RULE_1B_D=${C_RULE_1B} C_RULE_2_D=${C_RULE_2} +C_RULE_3_D=${C_RULE_3} +C_RULE_4A_D=${C_RULE_4A} +C_RULE_4B_D=${C_RULE_4B} C_CTFRULE_1A_D=${C_CTFRULE_1A} C_CTFRULE_1B_D=${C_CTFRULE_1B} C_CTFRULE_2_D=${C_CTFRULE_2} +C_CTFRULE_3_D=${C_CTFRULE_3} # # Compilation rules to generate .co from .cp or .cpo from .cpp # The config tool slickly changes the last source filename char to 'o' # for the object filename. -P_RULE_0=@echo C++ $@ -P_RULE_1A=$(_v)${KC++} -o $@ -c ${CXXFLAGS} ${filter-out ${$@_CFLAGS_RM}, ${CFLAGS} ${CXXWARNFLAGS}} -MD -MF $(@:o=d) -MP ${$@_CFLAGS_ADD} ${$@_CXXWARNFLAGS_ADD} ${INCFLAGS} ${$@_INCFLAGS} +P_RULE_0=@echo "$(ColorC)C++$(Color0) $(ColorF)$@$(Color0)" +P_RULE_1A=$(_v)${KC++} -o $@ -c ${CXXFLAGS} ${filter-out ${$@_CFLAGS_RM}, ${CFLAGS} ${CXXWARNFLAGS}} -MD -MF $(@:o=d) -MP ${$@_CFLAGS_ADD} ${$@_CXXWARNFLAGS_ADD} ${INCFLAGS} ${$@_INCFLAGS} P_RULE_1B=$( /dev/null - @rm -f $(OBJROOT)/cscope.genhdrs/* 2> /dev/null || true + @rm -f $(OBJROOT)/cscope.genhdrs/* 2> /dev/null || true @rm -f TAGS 2> /dev/null - # # Build source file list for cscope database and tags # @@ -554,14 +574,13 @@ reindent: .PHONY: help help: - @cat README + @cat README.md .PHONY: print_exports print_exports: $(_v)printenv | sort - generated_top_level_print_exports = $(call TOP_LEVEL_EACH_BUILD_CONFIG_BOOTSTRAP_template,print_exports,,,,1,$(FIRST_BUILD_CONFIG)) ifeq ($(VERBOSE_GENERATED_MAKE_FRAGMENTS),YES) $(warning Generate makefile fragment: $(generated_top_level_print_exports)) diff --git a/osfmk/Makefile b/osfmk/Makefile index c39c53ccc..64af5e1e0 100644 --- a/osfmk/Makefile +++ b/osfmk/Makefile @@ -8,9 +8,10 @@ include 
$(MakeInc_def) INSTINC_SUBDIRS = \ mach \ - atm \ + atm \ corpses \ bank \ + voucher \ device \ default_pager \ mach_debug \ @@ -50,9 +51,10 @@ INSTINC_SUBDIRS_ARM64 = \ EXPINC_SUBDIRS = \ mach \ - atm \ + atm \ corpses \ bank \ + voucher \ device \ default_pager \ mach_debug \ @@ -88,7 +90,7 @@ EXPINC_SUBDIRS_ARM64 = \ arm \ arm64 -COMP_SUBDIRS = \ +COMP_SUBDIRS = \ conf include $(MakeInc_rule) diff --git a/osfmk/UserNotification/Makefile b/osfmk/UserNotification/Makefile index efde212fa..2de33166a 100644 --- a/osfmk/UserNotification/Makefile +++ b/osfmk/UserNotification/Makefile @@ -37,7 +37,7 @@ EXPORT_MI_DIR = UserNotification # # Build path -# +# INCFLAGS_MAKEFILE= -I.. MIGKSFLAGS = -DMACH_KERNEL_PRIVATE -DKERNEL_SERVER=1 @@ -55,7 +55,7 @@ MIG_KSHDRS = \ MIG_KSSRC = \ UNDReplyServer.c -COMP_FILES = ${MIG_KUSRC} ${MIG_KSSRC} +COMP_FILES = ${MIG_KUSRC} ${MIG_KSSRC} do_build_all:: $(COMP_FILES) @@ -63,7 +63,7 @@ ${COMP_FILES} : ${MIG_TYPES} ${MIG_KUSRC} : \ %.c : %.defs - @echo MIG $@ + @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)" $(_v)${MIG} ${MIGFLAGS} ${MIGKUFLAGS} \ -user $*.c \ -header $*.h \ @@ -73,7 +73,7 @@ ${MIG_KUSRC} : \ ${MIG_KSSRC}: \ %Server.c : %.defs - @echo MIG $@ + @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)" $(_v)${MIG} ${MIGFLAGS} ${MIGKSFLAGS} \ -user /dev/null \ -header /dev/null \ diff --git a/osfmk/atm/Makefile b/osfmk/atm/Makefile index ee54e0b3e..aa1f67f54 100644 --- a/osfmk/atm/Makefile +++ b/osfmk/atm/Makefile @@ -7,20 +7,20 @@ include $(MakeInc_cmd) include $(MakeInc_def) MIG_TYPES = \ - atm_types.defs + atm_types.defs MIG_DEFS = \ atm_notification.defs -MACH_PRIVATE_DEFS = +MACH_PRIVATE_DEFS = # # MIG-generated headers that are traditionally used by user # level code. 
# -MIG_USHDRS = +MIG_USHDRS = -MIG_UUHDRS = +MIG_UUHDRS = MIGINCLUDES = ${MIG_UUHDRS} ${MIG_USHDRS} @@ -43,7 +43,7 @@ INSTALL_MI_GEN_LIST = INSTALL_MI_DIR = atm EXPORT_MI_LIST = \ - ${DATAFILES} + ${DATAFILES} atm_internal.h EXPORT_MI_GEN_LIST = \ ${MIGINCLUDES} @@ -54,21 +54,21 @@ ${MIGINCLUDES} : ${MIG_TYPES} ${MIG_UUHDRS} : \ %.h : %.defs - @echo MIG $@ + @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)" $(_v)$(MIG) $(MIGFLAGS) \ -server /dev/null \ -user /dev/null \ - -header $@ \ + -header $@ \ $< ${MIG_USHDRS} : \ %_server.h : %.defs - @echo MIG $@ + @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)" $(_v)$(MIG) $(MIGFLAGS) \ -server /dev/null \ -user /dev/null \ -header /dev/null \ - -sheader $@ \ + -sheader $@ \ $< # @@ -87,11 +87,11 @@ MIG_KUHDRS = \ atm_notification.h MIG_KUSRC = \ - atm_notification_user.c + atm_notification_user.c -MIG_KSHDRS = +MIG_KSHDRS = -MIG_KSSRC = +MIG_KSSRC = COMP_FILES = ${MIG_KUSRC} ${MIG_KSSRC} @@ -101,7 +101,7 @@ ${COMP_FILES} : ${MIG_TYPES} ${MIG_KUSRC} : \ %_user.c : %.defs - @echo MIG $@ + @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)" $(_v)${MIG} ${MIGFLAGS} ${MIGKUFLAGS} \ -user $*_user.c \ -header $*.h \ @@ -110,8 +110,8 @@ ${MIG_KUSRC} : \ $< ${MIG_KSSRC}: \ - %_server.c : %.defs - @echo MIG $@ + %_server.c : %.defs + @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)" $(_v)${MIG} ${MIGFLAGS} ${MIGKSFLAGS} \ -user /dev/null \ -header /dev/null \ diff --git a/osfmk/atm/atm.c b/osfmk/atm/atm.c index 707a5be66..073aa37c2 100644 --- a/osfmk/atm/atm.c +++ b/osfmk/atm/atm.c @@ -731,9 +731,14 @@ atm_send_user_notification( /* Make sure that honor queue limit option is unset on the thread. 
*/ th->options &= (~TH_OPT_HONOR_QLIMIT); - if (kr == MACH_SEND_TIMED_OUT) { - kr = KERN_SUCCESS; + if (kr != KERN_SUCCESS) { + ipc_port_release_send(user_port); + + if (kr == MACH_SEND_TIMED_OUT) { + kr = KERN_SUCCESS; + } } + return kr; } @@ -794,8 +799,12 @@ atm_send_proc_inspect_notification( /* Make sure that honor queue limit option is unset on the thread. */ th->options &= (~TH_OPT_HONOR_QLIMIT); - if (kr == MACH_SEND_TIMED_OUT) { - kr = KERN_SUCCESS; + if (kr != KERN_SUCCESS) { + ipc_port_release_send(user_port); + + if (kr == MACH_SEND_TIMED_OUT) { + kr = KERN_SUCCESS; + } } ipc_port_release_send(memory_port); diff --git a/osfmk/atm/atm_internal.h b/osfmk/atm/atm_internal.h index 73d2e0c0d..bd5719e45 100644 --- a/osfmk/atm/atm_internal.h +++ b/osfmk/atm/atm_internal.h @@ -130,9 +130,9 @@ void atm_task_descriptor_destroy(atm_task_descriptor_t task_descriptor); kern_return_t atm_register_trace_memory(task_t task, uint64_t trace_buffer_address, uint64_t buffer_size); kern_return_t atm_send_proc_inspect_notification(task_t task, int32_t traced_pid, uint64_t traced_uniqueid); +#endif /* MACH_KERNEL_PRIVATE */ + kern_return_t atm_set_diagnostic_config(uint32_t); uint32_t atm_get_diagnostic_config(void); -#endif /* MACH_KERNEL_PRIVATE */ - #endif /* _ATM_ATM_INTERNAL_H_ */ diff --git a/osfmk/atm/atm_types.h b/osfmk/atm/atm_types.h index afdd1ce37..36f2f2dad 100644 --- a/osfmk/atm/atm_types.h +++ b/osfmk/atm/atm_types.h @@ -64,6 +64,9 @@ typedef atm_memory_descriptor_t *atm_memory_descriptor_array_t; typedef uint64_t *atm_memory_size_array_t; #define ATM_SUBAID32_MAX (UINT32_MAX) -#define ATM_TRACE_DISABLE (0x100) +#define ATM_TRACE_DISABLE (0x0100) /* OS_TRACE_MODE_DISABLE - Do not initialize the new logging*/ +#define ATM_TRACE_OFF (0x0400) /* OS_TRACE_MODE_OFF - Don't drop log messages to new log buffers */ +#define ATM_ENABLE_LEGACY_LOGGING (0x0200) /* OS_TRACE_SYSTEMMODE_LEGACY_LOGGING - Enable legacy logging */ #endif /* _ATM_ATM_TYPES_H_ */ + diff --git 
a/osfmk/bank/Makefile b/osfmk/bank/Makefile index adf9c6152..2f4246c0f 100644 --- a/osfmk/bank/Makefile +++ b/osfmk/bank/Makefile @@ -6,19 +6,19 @@ export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir include $(MakeInc_cmd) include $(MakeInc_def) -MIG_TYPES = +MIG_TYPES = -MIG_DEFS = +MIG_DEFS = -MACH_PRIVATE_DEFS = +MACH_PRIVATE_DEFS = # # MIG-generated headers that are traditionally used by user # level code. # -MIG_USHDRS = +MIG_USHDRS = -MIG_UUHDRS = +MIG_UUHDRS = MIGINCLUDES = ${MIG_UUHDRS} ${MIG_USHDRS} @@ -52,21 +52,21 @@ ${MIGINCLUDES} : ${MIG_TYPES} ${MIG_UUHDRS} : \ %.h : %.defs - @echo MIG $@ + @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)" $(_v)$(MIG) $(MIGFLAGS) \ -server /dev/null \ -user /dev/null \ - -header $@ \ + -header $@ \ $< ${MIG_USHDRS} : \ %_server.h : %.defs - @echo MIG $@ + @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)" $(_v)$(MIG) $(MIGFLAGS) \ -server /dev/null \ -user /dev/null \ -header /dev/null \ - -sheader $@ \ + -sheader $@ \ $< # @@ -81,13 +81,13 @@ MIGKUFLAGS = -DMACH_KERNEL_PRIVATE -DKERNEL_USER=1 -maxonstack 1024 # MIG-generated headers that are traditionally used by kernel # level code. # -MIG_KUHDRS = +MIG_KUHDRS = -MIG_KUSRC = +MIG_KUSRC = -MIG_KSHDRS = +MIG_KSHDRS = -MIG_KSSRC = +MIG_KSSRC = COMP_FILES = ${MIG_KUSRC} ${MIG_KSSRC} @@ -97,7 +97,7 @@ ${COMP_FILES} : ${MIG_TYPES} ${MIG_KUSRC} : \ %_user.c : %.defs - @echo MIG $@ + @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)" $(_v)${MIG} ${MIGFLAGS} ${MIGKUFLAGS} \ -user $*_user.c \ -header $*.h \ @@ -106,8 +106,8 @@ ${MIG_KUSRC} : \ $< ${MIG_KSSRC}: \ - %_server.c : %.defs - @echo MIG $@ + %_server.c : %.defs + @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)" $(_v)${MIG} ${MIGFLAGS} ${MIGKSFLAGS} \ -user /dev/null \ -header /dev/null \ diff --git a/osfmk/bank/bank.c b/osfmk/bank/bank.c index b4293d952..fa08f23ae 100644 --- a/osfmk/bank/bank.c +++ b/osfmk/bank/bank.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012-2013 Apple Inc. All rights reserved. 
+ * Copyright (c) 2012-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -267,13 +267,20 @@ bank_release_value( if (bank_element->be_type == BANK_TASK) { bank_task = CAST_TO_BANK_TASK(bank_element); - - if (bank_task->bt_made != (int)sync) { + + /* Checking of the made ref with sync and clearing of voucher ref should be done under a lock */ + lck_mtx_lock(&bank_task->bt_acc_to_pay_lock); + if (bank_task->bt_made != sync) { + lck_mtx_unlock(&bank_task->bt_acc_to_pay_lock); return KERN_FAILURE; } bank_task_made_release_num(bank_task, sync); - bank_task_dealloc(bank_task, sync); + assert(bank_task->bt_voucher_ref == 1); + bank_task->bt_voucher_ref = 0; + lck_mtx_unlock(&bank_task->bt_acc_to_pay_lock); + + bank_task_dealloc(bank_task, 1); } else if (bank_element->be_type == BANK_ACCOUNT) { bank_account = CAST_TO_BANK_ACCOUNT(bank_element); kr = bank_account_dealloc_with_sync(bank_account, sync); @@ -425,8 +432,16 @@ bank_get_value( if (bank_holder == bank_merchant && bank_holder == bank_secureoriginator && bank_holder == bank_proximateprocess) { - bank_task_reference(bank_holder); + + lck_mtx_lock(&bank_holder->bt_acc_to_pay_lock); bank_task_made_reference(bank_holder); + if (bank_holder->bt_voucher_ref == 0) { + /* Take a ref for voucher system, if voucher system does not have a ref */ + bank_task_reference(bank_holder); + bank_holder->bt_voucher_ref = 1; + } + lck_mtx_unlock(&bank_holder->bt_acc_to_pay_lock); + *out_value = BANK_ELEMENT_TO_HANDLE(bank_holder); return kr; } @@ -459,13 +474,8 @@ bank_get_value( } if (bank_element->be_type == BANK_TASK) { bank_task = CAST_TO_BANK_TASK(bank_element); - if (bank_task != get_bank_task_context(task, FALSE)) { - panic("Found a bank task of another task with bank_context: %p", bank_task); - } + panic("Found a bank task in MACH_VOUCHER_ATTR_REDEEM: %p", bank_task); - bank_task_reference(bank_task); - bank_task_made_reference(bank_task); - *out_value = BANK_ELEMENT_TO_HANDLE(bank_task); 
return kr; } else if (bank_element->be_type == BANK_ACCOUNT) { @@ -475,7 +485,6 @@ bank_get_value( panic("Found another bank task: %p as a bank merchant\n", bank_merchant); } - bank_account_reference(bank_account); bank_account_made_reference(bank_account); *out_value = BANK_ELEMENT_TO_HANDLE(bank_account); return kr; @@ -633,8 +642,6 @@ bank_command( *out_content_size = 0; return KERN_INVALID_VALUE; - break; - case BANK_PERSONA_TOKEN: if ((sizeof(struct persona_token)) > *out_content_size) { @@ -672,8 +679,6 @@ bank_command( *out_content_size = 0; return KERN_INVALID_VALUE; - break; - default: return KERN_INVALID_ARGUMENT; } @@ -712,6 +717,7 @@ bank_task_alloc_init(task_t task) return BANK_TASK_NULL; new_bank_task->bt_type = BANK_TASK; + new_bank_task->bt_voucher_ref = 0; new_bank_task->bt_refs = 1; new_bank_task->bt_made = 0; new_bank_task->bt_creditcard = NULL; @@ -792,6 +798,7 @@ bank_account_alloc_init( } new_bank_account->ba_type = BANK_ACCOUNT; + new_bank_account->ba_voucher_ref = 0; new_bank_account->ba_refs = 1; new_bank_account->ba_made = 1; new_bank_account->ba_bill = new_ledger; @@ -810,7 +817,6 @@ bank_account_alloc_init( entry_found = TRUE; /* Take a made ref, since this value would be returned to voucher system. 
*/ - bank_account_reference(bank_account); bank_account_made_reference(bank_account); break; } @@ -952,15 +958,15 @@ bank_account_dealloc_with_sync( /* Grab the acc to pay list lock and check the sync value */ lck_mtx_lock(&bank_holder->bt_acc_to_pay_lock); - if (bank_account->ba_made != (int)sync) { + if (bank_account->ba_made != sync) { lck_mtx_unlock(&bank_holder->bt_acc_to_pay_lock); return KERN_FAILURE; } bank_account_made_release_num(bank_account, sync); - if (bank_account_release_num(bank_account, sync) > (int)sync) - panic("Sync and ref value did not match for bank account %p\n", bank_account); + if (bank_account_release_num(bank_account, 1) > 1) + panic("Releasing a non zero ref bank account %p\n", bank_account); /* Grab both the acc to pay and acc to charge locks */ diff --git a/osfmk/bank/bank_internal.h b/osfmk/bank/bank_internal.h index 2f0d9353c..c733eeb88 100644 --- a/osfmk/bank/bank_internal.h +++ b/osfmk/bank/bank_internal.h @@ -50,9 +50,10 @@ typedef mach_voucher_attr_value_handle_t bank_handle_t; #define BANK_ACCOUNT 1 struct bank_element { - int be_type; /* Type of element */ + unsigned int be_type:31, /* Type of element */ + be_voucher_ref:1; /* Voucher system holds a ref */ int be_refs; /* Ref count */ - int be_made; /* Made refs for voucher, Actual ref is also taken for each Made ref */ + unsigned int be_made; /* Made refs for voucher, Actual ref is only taken for voucher ref transition (0 to 1) */ #if DEVELOPMENT || DEBUG task_t be_task; /* Customer task, do not use it since ref is not taken on task */ #endif @@ -76,6 +77,7 @@ struct bank_task { }; #define bt_type bt_elem.be_type +#define bt_voucher_ref bt_elem.be_voucher_ref #define bt_refs bt_elem.be_refs #define bt_made bt_elem.be_made @@ -105,13 +107,13 @@ typedef struct bank_task * bank_task_t; (OSAddAtomic(-(num), &(elem)->bt_refs)) #define bank_task_made_reference(elem) \ - (OSAddAtomic(1, &(elem)->bt_made)) + (hw_atomic_add(&(elem)->bt_made, 1) - 1) #define 
bank_task_made_release(elem) \ - (OSAddAtomic(-1, &(elem)->bt_made)) + (hw_atomic_sub(&(elem)->bt_made, 1) + 1) #define bank_task_made_release_num(elem, num) \ - (OSAddAtomic(-(num), &(elem)->bt_made)) + (hw_atomic_sub(&(elem)->bt_made, (num)) + (num)) struct bank_account { @@ -129,6 +131,7 @@ struct bank_account { }; #define ba_type ba_elem.be_type +#define ba_voucher_ref ba_elem.be_voucher_ref #define ba_refs ba_elem.be_refs #define ba_made ba_elem.be_made @@ -149,13 +152,13 @@ typedef struct bank_account * bank_account_t; (OSAddAtomic(-(num), &(elem)->ba_refs)) #define bank_account_made_reference(elem) \ - (OSAddAtomic(1, &(elem)->ba_made)) + (hw_atomic_add(&(elem)->ba_made, 1) - 1) #define bank_account_made_release(elem) \ - (OSAddAtomic(-1, &(elem)->ba_made)) + (hw_atomic_sub(&(elem)->ba_made, 1) + 1) #define bank_account_made_release_num(elem, num) \ - (OSAddAtomic(-(num), &(elem)->ba_made)) + (hw_atomic_sub(&(elem)->ba_made, (num)) + (num)) struct _bank_ledger_indices { int cpu_time; diff --git a/osfmk/chud/chud_thread.c b/osfmk/chud/chud_thread.c index ca78a9614..9074c6dd3 100644 --- a/osfmk/chud/chud_thread.c +++ b/osfmk/chud/chud_thread.c @@ -117,382 +117,3 @@ chudxnu_unbind_thread(thread_t thread, __unused int options) thread_bind(PROCESSOR_NULL); return KERN_SUCCESS; } - -__private_extern__ boolean_t -chudxnu_thread_get_idle(thread_t thread) { - /* - * Instantaneous snapshot of the idle state of - * a given thread. - * - * Should be called only on an interrupted or - * suspended thread to avoid a race. - */ - return ((thread->state & TH_IDLE) == TH_IDLE); -} - -__private_extern__ int -chudxnu_thread_get_scheduler_state(thread_t thread) { - /* - * Instantaneous snapshot of the scheduler state of - * a given thread. - * - * MUST ONLY be called on an interrupted or - * locked thread, to avoid a race. 
- */ - - int state = 0; - int schedulerState = (volatile int)(thread->state); - processor_t lastProcessor = (volatile processor_t)(thread->last_processor); - - if ((PROCESSOR_NULL != lastProcessor) && (thread == lastProcessor->active_thread)) { - state |= CHUDXNU_TS_RUNNING; - } - - if (schedulerState & TH_RUN) { - state |= CHUDXNU_TS_RUNNABLE; - } - - if (schedulerState & TH_WAIT) { - state |= CHUDXNU_TS_WAIT; - } - - if (schedulerState & TH_UNINT) { - state |= CHUDXNU_TS_UNINT; - } - - if (schedulerState & TH_SUSP) { - state |= CHUDXNU_TS_SUSP; - } - - if (schedulerState & TH_TERMINATE) { - state |= CHUDXNU_TS_TERMINATE; - } - - if (schedulerState & TH_IDLE) { - state |= CHUDXNU_TS_IDLE; - } - - return state; -} - -#if 0 -#pragma mark **** task and thread info **** -#endif - -__private_extern__ boolean_t -chudxnu_is_64bit_task(task_t task) -{ - return (task_has_64BitAddr(task)); -} - -// an exact copy of task_threads() except no mig conversion at the end! -static kern_return_t -chudxnu_private_task_threads( - task_t task, - thread_act_array_t *threads_out, - mach_msg_type_number_t *count) -{ - mach_msg_type_number_t actual; - thread_t *thread_list; - thread_t thread; - vm_size_t size, size_needed; - void *addr; - unsigned int i, j; - - if (task == TASK_NULL) - return (KERN_INVALID_ARGUMENT); - - size = 0; addr = NULL; - - for (;;) { - task_lock(task); - if (!task->active) { - task_unlock(task); - - if (size != 0) - kfree(addr, size); - - return (KERN_FAILURE); - } - - actual = task->thread_count; - - /* do we have the memory we need? 
*/ - size_needed = actual * sizeof (mach_port_t); - if (size_needed <= size) - break; - - /* unlock the task and allocate more memory */ - task_unlock(task); - - if (size != 0) - kfree(addr, size); - - assert(size_needed > 0); - size = size_needed; - - addr = kalloc(size); - if (addr == 0) - return (KERN_RESOURCE_SHORTAGE); - } - - /* OK, have memory and the task is locked & active */ - thread_list = (thread_t *)addr; - - i = j = 0; - - for (thread = (thread_t)queue_first(&task->threads); i < actual; - ++i, thread = (thread_t)queue_next(&thread->task_threads)) { - thread_reference_internal(thread); - thread_list[j++] = thread; - } - - assert(queue_end(&task->threads, (queue_entry_t)thread)); - - actual = j; - size_needed = actual * sizeof (mach_port_t); - - /* can unlock task now that we've got the thread refs */ - task_unlock(task); - - if (actual == 0) { - /* no threads, so return null pointer and deallocate memory */ - - *threads_out = NULL; - *count = 0; - - if (size != 0) - kfree(addr, size); - } - else { - /* if we allocated too much, must copy */ - - if (size_needed < size) { - void *newaddr; - - newaddr = kalloc(size_needed); - if (newaddr == 0) { - for (i = 0; i < actual; ++i) - thread_deallocate(thread_list[i]); - kfree(addr, size); - return (KERN_RESOURCE_SHORTAGE); - } - - bcopy(addr, newaddr, size_needed); - kfree(addr, size); - thread_list = (thread_t *)newaddr; - } - - *threads_out = thread_list; - *count = actual; - } - - return (KERN_SUCCESS); -} - - -__private_extern__ kern_return_t -chudxnu_all_tasks( - task_array_t *task_list, - mach_msg_type_number_t *count) -{ - return processor_set_things(&pset0, (void **)task_list, count, PSET_THING_TASK); -} - -__private_extern__ kern_return_t -chudxnu_free_task_list( - task_array_t *task_list, - mach_msg_type_number_t *count) -{ - vm_size_t size = (*count)*sizeof(mach_port_t); - void *addr = *task_list; - - if(addr) { - int i, maxCount = *count; - for(i=0; it_chud & T_CHUD_MARKED) != 0); - return FALSE; -} 
- -__private_extern__ boolean_t -chudxnu_thread_set_marked(thread_t thread, boolean_t new_value) -{ - boolean_t old_val; - - if(thread) { - if(new_value) { - // set the marked bit - old_val = OSBitOrAtomic(T_CHUD_MARKED, &(thread->t_chud)); - } else { - // clear the marked bit - old_val = OSBitAndAtomic(~T_CHUD_MARKED, &(thread->t_chud)); - } - return (old_val & T_CHUD_MARKED) == T_CHUD_MARKED; - } - return FALSE; -} - -/* XXX: good thing this code is experimental... */ - -/* external handler */ -extern void (*chudxnu_thread_ast_handler)(thread_t); -void (*chudxnu_thread_ast_handler)(thread_t) = NULL; - -/* AST callback to dispatch to AppleProfile */ -extern void chudxnu_thread_ast(thread_t); -void -chudxnu_thread_ast(thread_t thread) -{ -#if KPC - /* check for PMC work */ - kpc_thread_ast_handler(thread); -#endif - -#if KPERF - /* check for kperf work */ - kperf_thread_ast_handler(thread); -#endif - - /* atomicness for kdebug events */ - void (*handler)(thread_t) = chudxnu_thread_ast_handler; - if( handler ) - handler( thread ); - - thread->t_chud = 0; -} - - - -/* Get and set bits on the thread and trigger an AST handler */ -void chudxnu_set_thread_ast( thread_t thread ); -void -chudxnu_set_thread_ast( thread_t thread ) -{ - /* FIXME: only call this on current thread from an interrupt handler for now... */ - if( thread != current_thread() ) - panic( "unsafe AST set" ); - - act_set_kperf(thread); -} - -/* get and set the thread bits */ -extern uint32_t chudxnu_get_thread_bits( thread_t thread ); -extern void chudxnu_set_thread_bits( thread_t thread, uint32_t bits ); - -uint32_t -chudxnu_get_thread_bits( thread_t thread ) -{ - return thread->t_chud; -} - -void -chudxnu_set_thread_bits( thread_t thread, uint32_t bits ) -{ - thread->t_chud = bits; -} - -/* get and set thread dirty bits. so CHUD can track whether the thread - * has been dispatched since it last looked. 
caller must hold the - * thread lock - */ -boolean_t -chudxnu_thread_get_dirty(thread_t thread) -{ - if( thread->c_switch != thread->chud_c_switch ) - return TRUE; - else - return FALSE; -} - -void -chudxnu_thread_set_dirty(thread_t thread, boolean_t makedirty) -{ - if( makedirty ) - thread->chud_c_switch = thread->c_switch - 1; - else - thread->chud_c_switch = thread->c_switch; -} diff --git a/osfmk/chud/chud_xnu.h b/osfmk/chud/chud_xnu.h index 657e01fe8..56c7e6591 100644 --- a/osfmk/chud/chud_xnu.h +++ b/osfmk/chud/chud_xnu.h @@ -55,11 +55,6 @@ extern uint32_t chudxnu_version(void); // ******************************************************************************** // task // ******************************************************************************** -extern task_t chudxnu_task_for_pid(int pid); -extern int chudxnu_pid_for_task(task_t task); -extern int chudxnu_current_pid(void); -extern task_t chudxnu_current_task(void); - extern kern_return_t chudxnu_task_read(task_t task, void *kernaddr, uint64_t usraddr, vm_size_t size); extern kern_return_t chudxnu_task_write(task_t task, uint64_t useraddr, void *kernaddr, vm_size_t size); extern kern_return_t chudxnu_kern_read(void *destaddr, vm_offset_t srcaddr, vm_size_t size); @@ -73,9 +68,6 @@ extern boolean_t chudxnu_is_64bit_task(task_t task); // ******************************************************************************** // thread // ******************************************************************************** -extern thread_t chudxnu_current_thread(void); -extern task_t chudxnu_task_for_thread(thread_t thread); - extern kern_return_t chudxnu_bind_thread(thread_t thread, int cpu, int options); extern kern_return_t chudxnu_unbind_thread(thread_t thread, int options); @@ -85,34 +77,6 @@ extern kern_return_t chudxnu_thread_set_state(thread_t thread, thread_flavor_t f extern kern_return_t chudxnu_thread_get_callstack64(thread_t thread, uint64_t *callStack, mach_msg_type_number_t *count, boolean_t user_only); 
extern kern_return_t chudxnu_thread_get_callstack64_kperf(thread_t thread, uint64_t *callStack, mach_msg_type_number_t *count, boolean_t user_only); -extern kern_return_t chudxnu_all_tasks(task_array_t *task_list, mach_msg_type_number_t *count); -extern kern_return_t chudxnu_free_task_list(task_array_t *task_list, mach_msg_type_number_t *count); - -extern kern_return_t chudxnu_all_threads(thread_array_t *thread_list, mach_msg_type_number_t *count); -extern kern_return_t chudxnu_task_threads(task_t task, thread_array_t *thread_list, mach_msg_type_number_t *count); -extern kern_return_t chudxnu_free_thread_list(thread_array_t *thread_list, mach_msg_type_number_t *count); - -extern kern_return_t chudxnu_thread_info( thread_t thread, thread_flavor_t flavor, thread_info_t thread_info_out, mach_msg_type_number_t *thread_info_count); - -extern boolean_t chudxnu_thread_set_marked(thread_t thread, boolean_t marked); -extern boolean_t chudxnu_thread_get_marked(thread_t thread); -extern boolean_t chudxnu_thread_get_idle(thread_t thread); - -enum { - CHUDXNU_TS_RUNNING = 0x1, - CHUDXNU_TS_RUNNABLE = 0x2, - CHUDXNU_TS_WAIT = 0x4, - CHUDXNU_TS_UNINT = 0x8, - CHUDXNU_TS_SUSP = 0x10, - CHUDXNU_TS_TERMINATE = 0x20, - CHUDXNU_TS_IDLE = 0x40 -}; - -extern int chudxnu_thread_get_scheduler_state(thread_t thread); - -extern boolean_t chudxnu_thread_get_dirty(thread_t thread); -extern void chudxnu_thread_set_dirty(thread_t thread, boolean_t); - #if 0 #pragma mark **** memory **** #endif @@ -250,12 +214,6 @@ typedef kern_return_t (*chudxnu_syscall_callback_func_t)(uint64_t code, uint64_t extern kern_return_t chudxnu_syscall_callback_enter(chudxnu_syscall_callback_func_t func); extern kern_return_t chudxnu_syscall_callback_cancel(void); -// DTrace Triggering -typedef kern_return_t (*chudxnu_dtrace_callback_t)(uint64_t selector, uint64_t *args, uint32_t count); -extern int chudxnu_dtrace_callback(uint64_t selector, uint64_t *args, uint32_t count); -extern kern_return_t 
chudxnu_dtrace_callback_enter(chudxnu_dtrace_callback_t fn); -extern void chudxnu_dtrace_callback_cancel(void); - // ******************************************************************************** // DEPRECATED // ******************************************************************************** diff --git a/osfmk/chud/i386/chud_osfmk_callback_i386.c b/osfmk/chud/i386/chud_osfmk_callback_i386.c index 9865dc99d..a6775c24c 100644 --- a/osfmk/chud/i386/chud_osfmk_callback_i386.c +++ b/osfmk/chud/i386/chud_osfmk_callback_i386.c @@ -67,10 +67,6 @@ void chudxnu_cancel_all_callbacks(void) chudxnu_cpu_timer_callback_cancel_all(); chudxnu_interrupt_callback_cancel(); chudxnu_perfmon_ast_callback_cancel(); - chudxnu_kdebug_callback_cancel(); - chudxnu_trap_callback_cancel(); - chudxnu_syscall_callback_cancel(); - chudxnu_dtrace_callback_cancel(); } static lck_grp_t chud_request_lck_grp; @@ -218,106 +214,6 @@ chudxnu_cpu_timer_callback_cancel_all(void) return KERN_SUCCESS; } -#if 0 -#pragma mark **** trap **** -#endif -static kern_return_t chud_null_trap(uint32_t trapentry, thread_flavor_t flavor, - thread_state_t tstate, mach_msg_type_number_t count); -static chudxnu_trap_callback_func_t trap_callback_fn = chud_null_trap; - -static kern_return_t chud_null_trap(uint32_t trapentry __unused, thread_flavor_t flavor __unused, - thread_state_t tstate __unused, mach_msg_type_number_t count __unused) { - return KERN_FAILURE; -} - -static kern_return_t -chudxnu_private_trap_callback( - int trapno, - void *regs, - int unused1, - int unused2) -{ -#pragma unused (regs) -#pragma unused (unused1) -#pragma unused (unused2) - kern_return_t retval = KERN_FAILURE; - chudxnu_trap_callback_func_t fn = trap_callback_fn; - - if(fn) { - boolean_t oldlevel; - x86_thread_state_t state; - mach_msg_type_number_t count; - thread_t thread = current_thread(); - - oldlevel = ml_set_interrupts_enabled(FALSE); - - /* prevent reentry into CHUD when dtracing */ - if(thread->t_chud & T_IN_CHUD) { - /* restore 
interrupts */ - ml_set_interrupts_enabled(oldlevel); - - return KERN_FAILURE; // not handled - pass off to dtrace - } - - /* update the chud state bits */ - thread->t_chud |= T_IN_CHUD; - - count = x86_THREAD_STATE_COUNT; - - if(chudxnu_thread_get_state(thread, - x86_THREAD_STATE, - (thread_state_t)&state, - &count, - FALSE) == KERN_SUCCESS) { - - retval = (fn)( - trapno, - x86_THREAD_STATE, - (thread_state_t)&state, - count); - } - - /* no longer in CHUD */ - thread->t_chud &= ~(T_IN_CHUD); - - ml_set_interrupts_enabled(oldlevel); - } - - return retval; -} - -__private_extern__ kern_return_t -chudxnu_trap_callback_enter(chudxnu_trap_callback_func_t func) -{ - if(OSCompareAndSwapPtr(NULL, chudxnu_private_trap_callback, - (void * volatile *)&perfTrapHook)) { - - chudxnu_trap_callback_func_t old = trap_callback_fn; - while(!OSCompareAndSwapPtr(old, func, - (void * volatile *)&trap_callback_fn)) { - old = trap_callback_fn; - } - return KERN_SUCCESS; - } - return KERN_FAILURE; -} - -__private_extern__ kern_return_t -chudxnu_trap_callback_cancel(void) -{ - if(OSCompareAndSwapPtr(chudxnu_private_trap_callback, NULL, - (void * volatile *)&perfTrapHook)) { - - chudxnu_trap_callback_func_t old = trap_callback_fn; - while(!OSCompareAndSwapPtr(old, chud_null_trap, - (void * volatile *)&trap_callback_fn)) { - old = trap_callback_fn; - } - return KERN_SUCCESS; - } - return KERN_FAILURE; -} - #if 0 #pragma mark **** ast **** #endif diff --git a/osfmk/conf/Makefile b/osfmk/conf/Makefile index 76db9a7d8..7bd79d9ae 100644 --- a/osfmk/conf/Makefile +++ b/osfmk/conf/Makefile @@ -37,7 +37,7 @@ do_all: $(TARGET)/$(CURRENT_KERNEL_CONFIG)/Makefile OBJPATH=${OBJPATH} \ build_all; -do_build_all:: do_all +do_build_all:: do_all include $(MakeInc_rule) include $(MakeInc_dir) diff --git a/osfmk/conf/Makefile.template b/osfmk/conf/Makefile.template index f22798e23..c025d381d 100644 --- a/osfmk/conf/Makefile.template +++ b/osfmk/conf/Makefile.template @@ -58,7 +58,6 @@ OBJS_NO_CAST_ALIGN = \ 
wait_queue.o \ bsd_kern.o \ pmc.o \ - default_freezer.o \ status.o \ machine_routines.o \ loose_ends.o \ @@ -72,17 +71,17 @@ OBJS_NO_CAST_ALIGN = \ cchmac_final.o \ cchmac_init.o \ ccsha1.o \ - dp_memory_object.o \ ipc_object.o \ ipc_kmsg.o \ ipc_right.o \ bsd_vm.o \ vm_map_store.o \ vm_map_store_ll.o \ - vm_map_store_rb.o + vm_map_store_rb.o \ + vm_debug.o # Objects that don't want -Wsign-compare warning (15294427) -OBJS_NO_SIGN_COMPARE = \ +OBJS_NO_SIGN_COMPARE = \ atm_notification_user.o $(foreach file,$(OBJS_NO_CAST_ALIGN),$(eval $(call add_perfile_cflags,$(file),-Wno-cast-align))) @@ -155,7 +154,7 @@ $(COMPONENT).filelist: $(OBJS) $(SEG_HACK) -n __HIB -o $${hib_file}__ $${hib_file} || exit 1; \ mv $${hib_file}__ $${hib_file} || exit 1; \ done - @echo LDFILELIST $(COMPONENT) + @echo "$(ColorL)LDFILELIST$(Color0) $(ColorLF)$(COMPONENT)$(Color0)" $(_v)for obj in ${OBJS}; do \ echo $(TARGET)/$(CURRENT_KERNEL_CONFIG)/$${obj}; \ done > $(COMPONENT).filelist @@ -179,7 +178,7 @@ endif -include genassym.d genassym.o: .CFLAGS $(firstword $(MAKEFILE_LIST)) genassym.o: $(SOURCE_DIR)/$(COMPONENT)/$(GENASSYM_LOCATION)/genassym.c - @echo GENASSYM $< + @echo "[$(CMD_MC)] $(ColorH)GENASSYM$(Color0) $(ColorLF)$<$(Color0)" $(_v)${GENASSYM_KCC} ${CFLAGS} ${CFLAGS_NOLTO_FLAG} -MD -S -o ${@} ${INCFLAGS} $< assym.s: genassym.o diff --git a/osfmk/conf/files b/osfmk/conf/files index 37cd38840..e735d9e71 100644 --- a/osfmk/conf/files +++ b/osfmk/conf/files @@ -36,6 +36,7 @@ OPTIONS/mach_debug optional mach_debug # OPTIONS/mach_cluster_stats optional mach_cluster_stats OPTIONS/mach_counters optional mach_counters +OPTIONS/mach_flipc optional mach_flipc OPTIONS/mach_ipc_debug optional mach_ipc_debug OPTIONS/mach_ipc_test optional mach_ipc_test OPTIONS/mach_kdp optional mach_kdp @@ -50,7 +51,6 @@ OPTIONS/mach_page_hash_stats optional mach_page_hash_stats OPTIONS/mig_debug optional mig_debug OPTIONS/xpr_debug optional xpr_debug OPTIONS/zone_debug optional zone_debug 
-OPTIONS/zone_alias_addr optional zone_alias_addr OPTIONS/vm_cpm optional vm_cpm OPTIONS/task_swapper optional task_swapper OPTIONS/stack_usage optional stack_usage @@ -62,16 +62,6 @@ OPTIONS/config_dtrace optional config_dtrace OPTIONS/no_kextd optional no_kextd -# Default pager and system pager files, to be moved to separate component - -osfmk/vm/vm_compressor_backing_store.c standard - -osfmk/default_pager/default_pager.c standard -osfmk/default_pager/dp_backing_store.c standard -osfmk/default_pager/dp_memory_object.c standard -./default_pager/default_pager_alerts_user.c standard -./default_pager/default_pager_object_server.c standard - # # gssd files # @@ -113,6 +103,7 @@ osfmk/ipc/ipc_right.c standard osfmk/ipc/ipc_space.c standard osfmk/ipc/ipc_table.c standard osfmk/ipc/ipc_voucher.c standard +osfmk/ipc/flipc.c optional mach_flipc osfmk/ipc/mach_debug.c standard osfmk/ipc/mach_kernelrpc.c standard osfmk/ipc/mach_msg.c standard @@ -121,7 +112,9 @@ osfmk/ipc/mig_log.c optional mig_debug osfmk/kern/affinity.c standard osfmk/kern/ast.c standard osfmk/kern/audit_sessionport.c optional config_audit +osfmk/kern/backtrace.c standard osfmk/kern/btlog.c standard +osfmk/kern/build_config.c standard osfmk/kern/clock.c standard osfmk/kern/clock_oldops.c standard osfmk/kern/coalition.c optional config_coalitions @@ -141,9 +134,12 @@ osfmk/kern/ipc_sync.c standard osfmk/kern/ipc_tt.c standard osfmk/kern/kalloc.c standard osfmk/kern/kern_ecc.c optional config_ecc_logging +osfmk/kern/ktrace_background_notify.c standard osfmk/kern/ledger.c standard osfmk/kern/locks.c standard +osfmk/kern/ltable.c standard osfmk/kern/machine.c standard +osfmk/kern/mach_node.c standard osfmk/kern/mk_sp.c standard osfmk/kern/mk_timer.c standard osfmk/kern/page_decrypt.c standard @@ -189,6 +185,7 @@ osfmk/kern/hibernate.c optional hibernation ./mach/exc_server.c optional mach_bsd ./mach/host_priv_server.c standard ./mach/host_security_server.c standard +./mach/ktrace_background_user.c standard 
./mach/lock_set_server.c standard ./mach/mach_exc_user.c standard ./mach/mach_exc_server.c optional mach_bsd @@ -198,9 +195,8 @@ osfmk/kern/hibernate.c optional hibernation ./mach/mach_vm_server.c standard ./mach/mach_voucher_server.c standard ./mach/mach_voucher_attr_control_server.c standard -./mach/memory_object_server.c standard ./mach/memory_object_control_server.c standard -./mach/memory_object_default_server.c standard +./mach/resource_notify_user.c standard ./mach/upl_server.c standard ./mach/audit_triggers_user.c standard ./mach/task_access_user.c standard @@ -210,6 +206,7 @@ osfmk/kern/kern_cdata.c standard osfmk/bank/bank.c optional config_bank osfmk/atm/atm.c optional config_atm ./atm/atm_notification_user.c optional config_atm +osfmk/voucher/ipc_pthread_priority.c standard ./mach/coalition_notification_user.c optional config_coalitions ./mach/sysdiagnose_notification_user.c optional config_sysdiagnose # @@ -227,12 +224,13 @@ osfmk/atm/atm.c optional config_atm osfmk/vm/bsd_vm.c optional mach_bsd osfmk/vm/vm_compressor.c standard osfmk/vm/vm_compressor_pager.c standard +osfmk/vm/vm_compressor_backing_store.c standard +osfmk/vm/vm_compressor_algorithms.c standard +osfmk/vm/lz4.c standard osfmk/vm/vm_phantom_cache.c optional config_phantom_cache -osfmk/vm/default_freezer.c optional config_freeze osfmk/vm/device_vm.c standard osfmk/vm/memory_object.c standard osfmk/vm/vm_debug.c standard -osfmk/vm/vm_external.c optional mach_pagemap osfmk/vm/vm_fault.c standard osfmk/vm/vm_init.c standard osfmk/vm/vm_kern.c standard @@ -281,20 +279,27 @@ osfmk/kperf/kperf.c optional kperf osfmk/kperf/action.c optional kperf osfmk/kperf/callstack.c optional kperf osfmk/kperf/pet.c optional kperf -# osfmk/kperf/kperfbsd.c optional kperf # bsd/conf/files -osfmk/kperf/threadinfo.c optional kperf +osfmk/kperf/thread_samplers.c optional kperf +osfmk/kperf/task_samplers.c optional kperf osfmk/kperf/meminfo.c optional kperf -osfmk/kperf/timetrigger.c optional kperf 
+osfmk/kperf/kperf_timer.c optional kperf osfmk/kperf/kperf_kpc.c optional kperf +osfmk/kperf/kdebug_trigger.c optional kperf osfmk/kern/kpc_thread.c optional kpc osfmk/kern/kpc_common.c optional kpc osfmk/console/serial_general.c standard +osfmk/console/serial_console.c optional serial_console +osfmk/console/video_scroll.c optional video_console +osfmk/console/video_console.c optional video_console + osfmk/kern/telemetry.c optional config_telemetry # Built-in corecrypto for early_random(): osfmk/corecrypto/cc/src/cc_clear.c standard +osfmk/corecrypto/cc/src/cc_cmp_safe.c standard +osfmk/corecrypto/cc/src/cc_abort.c standard osfmk/corecrypto/ccdbrg/src/ccdrbg_nisthmac.c standard osfmk/corecrypto/ccdigest/src/ccdigest_init.c standard osfmk/corecrypto/ccdigest/src/ccdigest_update.c standard diff --git a/osfmk/conf/files.x86_64 b/osfmk/conf/files.x86_64 index 6b2389a45..6bd6977a8 100644 --- a/osfmk/conf/files.x86_64 +++ b/osfmk/conf/files.x86_64 @@ -31,6 +31,7 @@ osfmk/x86_64/bzero.s standard osfmk/x86_64/WKdmDecompress_new.s standard osfmk/x86_64/WKdmCompress_new.s standard osfmk/x86_64/WKdmData_new.s standard +osfmk/x86_64/lz4_decode_x86_64.s standard osfmk/i386/cpu.c standard osfmk/i386/cpuid.c standard osfmk/i386/cpu_threads.c standard @@ -83,10 +84,6 @@ osfmk/i386/acpi.c standard osfmk/i386/mtrr.c optional config_mtrr -osfmk/console/i386/serial_console.c optional serial_console - -osfmk/console/video_console.c optional video_console -osfmk/console/i386/video_scroll.c optional video_console #osfmk/profiling/x86_64/profile-md.c optional gprof #osfmk/profiling/x86_64/profile-asm.s optional gprof diff --git a/osfmk/console/Makefile b/osfmk/console/Makefile index 90da36439..f8d5445eb 100644 --- a/osfmk/console/Makefile +++ b/osfmk/console/Makefile @@ -3,7 +3,6 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - include $(MakeInc_cmd) include $(MakeInc_def) @@ 
-23,5 +22,3 @@ EXPORT_MI_DIR = console include $(MakeInc_rule) include $(MakeInc_dir) - - diff --git a/osfmk/console/i386/serial_console.c b/osfmk/console/i386/serial_console.c deleted file mode 100644 index 0d51045ef..000000000 --- a/osfmk/console/i386/serial_console.c +++ /dev/null @@ -1,371 +0,0 @@ -/* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static struct { - char *buffer; - int len; - int used; - char *write_ptr; - char *read_ptr; - decl_simple_lock_data(,read_lock); - decl_simple_lock_data(,write_lock); -} console_ring; - -hw_lock_data_t cnputc_lock; -static volatile long console_output = 0; - -typedef struct console_buf { - char *buf_base; - char *buf_end; - char *buf_ptr; -#define CPU_BUFFER_LEN (256 - 3*(sizeof(char*))) - char buf[CPU_BUFFER_LEN]; -} console_buf_t; - -static void _serial_putc(int, int, int); - -struct console_ops cons_ops[] = { - { - .putc = _serial_putc, - .getc = _serial_getc, - }, - { - .putc = vcputc, - .getc = vcgetc, - }, -}; - -uint32_t nconsops = (sizeof cons_ops / sizeof cons_ops[0]); - -uint32_t cons_ops_index = VC_CONS_OPS; - -/* This macro polls for pending TLB flushes while spinning on a lock - */ -#define SIMPLE_LOCK_NO_INTRS(l) \ -MACRO_BEGIN \ - boolean_t istate = ml_get_interrupts_enabled(); \ - while (!simple_lock_try((l))) \ - { \ - if (!istate) \ - handle_pending_TLB_flushes(); \ - cpu_pause(); \ - } \ -MACRO_END - -void -console_init(void) -{ - int ret; - - console_ring.len = PAGE_SIZE; - ret = kmem_alloc(kernel_map, (vm_offset_t *) &console_ring.buffer, - console_ring.len, VM_KERN_MEMORY_OSFMK); - if (ret != KERN_SUCCESS) - panic("console_ring_init() " - "failed to allocate ring buffer, error %d\n", ret); - console_ring.used = 0; - console_ring.read_ptr = console_ring.buffer; - console_ring.write_ptr = console_ring.buffer; - simple_lock_init(&console_ring.read_lock, 0); - simple_lock_init(&console_ring.write_lock, 0); - hw_lock_init(&cnputc_lock); -} - -void * -console_cpu_alloc(__unused boolean_t boot_processor) -{ - int ret; - console_buf_t *cbp; - - ret = kmem_alloc(kernel_map, (vm_offset_t *) &cbp, - sizeof(console_buf_t), VM_KERN_MEMORY_OSFMK); - if (ret != KERN_SUCCESS) { - 
printf("console_cpu_alloc() " - "failed to allocate cpu buffer, error=%d\n", ret); - return NULL; - } - - cbp->buf_base = (char *) &cbp->buf; - cbp->buf_ptr = cbp->buf_base; - cbp->buf_end = cbp->buf_base + CPU_BUFFER_LEN; - - return (void *) cbp; -} - -void -console_cpu_free(void *buf) -{ - if (buf != NULL) - kfree((void *) buf, sizeof(console_buf_t)); -} - -/* So we can re-write the serial device functions at boot-time */ -void -console_set_serial_ops( struct console_ops *newops ) -{ - cons_ops[SERIAL_CONS_OPS] = *newops; -} - -static inline int -console_ring_space(void) -{ - return console_ring.len - console_ring.used; -} - -static boolean_t -console_ring_put(char ch) -{ - if (console_ring.used < console_ring.len) { - console_ring.used++;; - *console_ring.write_ptr++ = ch; - if (console_ring.write_ptr - console_ring.buffer - == console_ring.len) - console_ring.write_ptr = console_ring.buffer; - return TRUE; - } else { - return FALSE; - } -} - -static int -console_ring_get(void) -{ - char ch = 0; - - if (console_ring.used > 0) { - console_ring.used--; - ch = *console_ring.read_ptr++; - if (console_ring.read_ptr - console_ring.buffer - == console_ring.len) - console_ring.read_ptr = console_ring.buffer; - } - return (int) ch; -} - -static inline void -cpu_buffer_put(console_buf_t *cbp, char ch) -{ - if (ch != '\0' && cbp->buf_ptr < cbp->buf_end) - *(cbp->buf_ptr++) = ch; -} - -static inline void -_cnputc(char c) -{ - /* The console device output routines are assumed to be - * non-reentrant. - */ - mp_disable_preemption(); - /* Use the maximum available spinlock timeout. Some configurations - * exhibit non-deterministic stalls across console output. - */ - if (!hw_lock_to(&cnputc_lock, UINT32_MAX)) { - /* If we timed out on the lock, and we're in the debugger, - * break the lock. 
- */ - if (debug_mode) { - /* Since hw_lock_to takes a pre-emption count...*/ - mp_enable_preemption(); - hw_lock_init(&cnputc_lock); - hw_lock_lock(&cnputc_lock); - } - else - panic("Lock acquire timeout in _cnputc()"); - } - cons_ops[cons_ops_index].putc(0, 0, c); - if (c == '\n') - cons_ops[cons_ops_index].putc(0, 0, '\r'); - hw_lock_unlock(&cnputc_lock); - mp_enable_preemption(); -} - -void cnputc_unbuffered(char c) { - _cnputc(c); -} - -void -cnputcusr(char c) -{ - /* Spin (with pre-emption enabled) waiting for console_ring_try_empty() - * to complete output. There is a small window here where we could - * end up with a stale value of console_output, but it's unlikely, - * and _cnputc(), which outputs to the console device, is internally - * synchronized. There's something of a conflict between the - * character-at-a-time (with pre-emption enabled) unbuffered - * output model here, and the buffered output from cnputc(), - * whose consumers include printf() ( which outputs a sequence - * with pre-emption disabled, and should be safe to call with - * interrupts off); we don't want to disable pre-emption indefinitely - * here, and spinlocks and mutexes are inappropriate. - */ - while (console_output != 0); - - _cnputc(c); -} - -static void -console_ring_try_empty(void) -{ - boolean_t state = ml_get_interrupts_enabled(); - /* - * Try to get the read lock on the ring buffer to empty it. - * If this fails someone else is already emptying... - */ - if (!simple_lock_try(&console_ring.read_lock)) - return; - /* Indicate that we're in the process of writing a block of data - * to the console. 
- */ - atomic_incl(&console_output, 1); - for (;;) { - char ch; - if (!state) - handle_pending_TLB_flushes(); - ml_set_interrupts_enabled(FALSE); - SIMPLE_LOCK_NO_INTRS(&console_ring.write_lock); - ch = console_ring_get(); - simple_unlock(&console_ring.write_lock); - ml_set_interrupts_enabled(state); - if (ch == 0) - break; - _cnputc(ch); - } - atomic_decl(&console_output, 1); - simple_unlock(&console_ring.read_lock); -} - -void -cnputc(char c) -{ - console_buf_t *cbp; - mp_disable_preemption(); - cbp = (console_buf_t *) current_cpu_datap()->cpu_console_buf; - if (cbp == NULL) { - mp_enable_preemption(); - /* Put directly if console ring is not initialized */ - _cnputc(c); - return; - } - - /* add to stack buf */ - if (c != '\n') { - /* XXX - cpu_buffer_put() can fail silently if the buffer - * is exhausted, as can happen if there's a long sequence - * of data with no newlines. We should, instead, attempt - * a flush. - */ - cpu_buffer_put(cbp, c); - } else { - boolean_t state; - char *cp; - - /* Here at end of printf -- time to try to output */ - - /* copy this buffer into the shared ring buffer */ - state = ml_set_interrupts_enabled(FALSE); - SIMPLE_LOCK_NO_INTRS(&console_ring.write_lock); - - /* - * Is there enough space in the shared ring buffer? - * Try to empty if not. - * Note, we want the entire local buffer to fit to - * avoid another cpu interjecting. 
- */ - while (cbp->buf_ptr-cbp->buf_base + 1 > console_ring_space()) { - simple_unlock(&console_ring.write_lock); - ml_set_interrupts_enabled(state); - console_ring_try_empty(); - state = ml_set_interrupts_enabled(FALSE); - SIMPLE_LOCK_NO_INTRS(&console_ring.write_lock); - } - for (cp = cbp->buf_base; cp < cbp->buf_ptr; cp++) - console_ring_put(*cp); - console_ring_put('\n'); - cbp->buf_ptr = cbp->buf_base; - simple_unlock(&console_ring.write_lock); - ml_set_interrupts_enabled(state); - } - console_ring_try_empty(); - mp_enable_preemption(); -} - -int _serial_getc(__unused int a, __unused int b, boolean_t wait, __unused boolean_t raw) -{ - int c; - do { - c = serial_getc(); - } while (wait && c < 0); - - return c; -} - -static void _serial_putc(__unused int a, __unused int b, int c) -{ - serial_putc(c); -} - - -int -cngetc(void) -{ - return cons_ops[cons_ops_index].getc(0, 0, - TRUE, FALSE); -} - -int -cnmaygetc(void) -{ - return cons_ops[cons_ops_index].getc(0, 0, - FALSE, FALSE); -} - -int -vcgetc(__unused int l, - __unused int u, - __unused boolean_t wait, - __unused boolean_t raw) -{ - char c; - - if( 0 == (*PE_poll_input)( 0, &c)) - return( c); - else - return( 0); -} diff --git a/osfmk/console/serial_console.c b/osfmk/console/serial_console.c new file mode 100644 index 000000000..ec139794c --- /dev/null +++ b/osfmk/console/serial_console.c @@ -0,0 +1,590 @@ +/* + * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifdef __x86_64__ +#include +#include +#include +#include +#include +#include +#include +#endif /* __x86_64__ */ + +#include +#include +#include +#include +#include +#include +#include +#include + + + +#ifndef MAX_CPU_SLOTS +#define MAX_CPU_SLOTS (MAX_CPUS) +#endif + +static struct { + char * buffer; + int len; + int used; + char * write_ptr; + char * read_ptr; + decl_simple_lock_data(, read_lock); + decl_simple_lock_data(, write_lock); +} console_ring; + +hw_lock_data_t cnputc_lock; +static volatile uint32_t console_output = 0; + +/* + * New allocation mechanism for console buffers + * Total allocation: 1 * PAGE_SIZE + * - Each cpu gets CPU_CONS_BUF_SIZE buffer + * - Kernel wide console ring gets PAGE_SIZE - MAX_CPU_SLOTS * CPU_CONS_BUF_SIZE + * + * At the return from console_init() the memory is setup as follows: + * +----------------------------+-------------+-------------+-------------+-------------+ + * |console ring buffer---------|f2eec075-----|f2eec075-----|f2eec075-----|f2eec075-----| + * 
+----------------------------+-------------+-------------+-------------+-------------+ + * Each cpu allocation will find the first (f2eec075) and use that buffer. + * + */ + +#define CPU_CONS_BUF_SIZE 256 +#define CPU_BUF_FREE_HEX 0xf2eec075 + +#define KERN_CONSOLE_BUF_SIZE vm_map_round_page(CPU_CONS_BUF_SIZE *(MAX_CPU_SLOTS + 1), PAGE_SIZE - 1) +#define KERN_CONSOLE_RING_SIZE (KERN_CONSOLE_BUF_SIZE - (CPU_CONS_BUF_SIZE * MAX_CPU_SLOTS)) + +/* + * A serial line running at 115200 bps can output ~11.5 characters per millisecond. + * Synchronous serial logging with preemption+interrupts disabled fundamentally prevents us + * from hitting expected scheduling deadlines, but we can at least tone it down a bit. + * + * TODO: IOLog should use asynchronous serial logging instead of the synchronous serial console. (26555148) + * + * Keep interrupt disabled periods shorter than 1ms + */ +#define MAX_INT_DISABLED_FLUSH_SIZE 8 +#define MAX_TOTAL_FLUSH_SIZE (MAX(2, MAX_CPU_SLOTS) * CPU_CONS_BUF_SIZE) + +typedef struct console_buf { + char * buf_base; + char * buf_end; + char * buf_ptr; +#define CPU_BUFFER_LEN (CPU_CONS_BUF_SIZE - 3 * (sizeof(char *))) + char buf[CPU_BUFFER_LEN]; +} console_buf_t; + +extern int serial_getc(void); +extern void serial_putc(char); + +static void _serial_putc(int, int, int); + +struct console_ops cons_ops[] = { + { + .putc = _serial_putc, .getc = _serial_getc, + }, + { + .putc = vcputc, .getc = vcgetc, + }, +}; + +uint32_t nconsops = (sizeof cons_ops / sizeof cons_ops[0]); + +uint32_t cons_ops_index = VC_CONS_OPS; + + +static bool console_suspended = false; + +static void +console_ring_lock_init(void) +{ + simple_lock_init(&console_ring.read_lock, 0); + simple_lock_init(&console_ring.write_lock, 0); +} + +void +console_init(void) +{ + int ret, i; + uint32_t * p; + + if (!OSCompareAndSwap(0, KERN_CONSOLE_RING_SIZE, (UInt32 *)&console_ring.len)) + return; + + assert(console_ring.len > 0); + + ret = kmem_alloc(kernel_map, (vm_offset_t 
*)&console_ring.buffer, KERN_CONSOLE_BUF_SIZE, VM_KERN_MEMORY_OSFMK); + if (ret != KERN_SUCCESS) { + panic("console_ring_init() failed to allocate ring buffer, error %d\n", ret); + } + + /* setup memory for per cpu console buffers */ + for (i = 0; i < MAX_CPU_SLOTS; i++) { + p = (uint32_t *)((uintptr_t)console_ring.buffer + console_ring.len + (i * sizeof(console_buf_t))); + *p = CPU_BUF_FREE_HEX; + } + + console_ring.used = 0; + console_ring.read_ptr = console_ring.buffer; + console_ring.write_ptr = console_ring.buffer; + console_ring_lock_init(); + hw_lock_init(&cnputc_lock); +} + +void * +console_cpu_alloc(__unused boolean_t boot_processor) +{ + console_buf_t * cbp; + int i; + uint32_t * p; + + console_init(); + assert(console_ring.buffer != NULL); + + /* select the next slot from the per cpu buffers at end of console_ring.buffer */ + for (i = 0; i < MAX_CPU_SLOTS; i++) { + p = (uint32_t *)((uintptr_t)console_ring.buffer + console_ring.len + (i * sizeof(console_buf_t))); + if (OSCompareAndSwap(CPU_BUF_FREE_HEX, 0, (UInt32 *)p)) + break; + } + assert(i < MAX_CPU_SLOTS); + + cbp = (console_buf_t *)(uintptr_t)p; + if ((uintptr_t)cbp >= (uintptr_t)console_ring.buffer + KERN_CONSOLE_BUF_SIZE) { + printf("console_cpu_alloc() failed to allocate cpu buffer\n"); + return NULL; + } + + cbp->buf_base = (char *)&cbp->buf; + cbp->buf_ptr = cbp->buf_base; + cbp->buf_end = cbp->buf_base + CPU_BUFFER_LEN; + return (void *)cbp; +} + +void +console_cpu_free(void * buf) +{ + assert((uintptr_t)buf > (uintptr_t)console_ring.buffer); + assert((uintptr_t)buf < (uintptr_t)console_ring.buffer + KERN_CONSOLE_BUF_SIZE); + if (buf != NULL) + *(uint32_t *)buf = CPU_BUF_FREE_HEX; +} + +static inline int +console_ring_space(void) +{ + return console_ring.len - console_ring.used; +} + +static boolean_t +console_ring_put(char ch) +{ + if (console_ring.used < console_ring.len) { + console_ring.used++; + *console_ring.write_ptr++ = ch; + if (console_ring.write_ptr - console_ring.buffer == 
console_ring.len) + console_ring.write_ptr = console_ring.buffer; + return TRUE; + } else { + return FALSE; + } +} + +static inline boolean_t +cpu_buffer_put(console_buf_t * cbp, char ch) +{ + if (ch != '\0' && cbp->buf_ptr < cbp->buf_end) { + *(cbp->buf_ptr++) = ch; + return TRUE; + } else { + return FALSE; + } +} + +static inline int +cpu_buffer_size(console_buf_t * cbp) +{ + return (int)(cbp->buf_ptr - cbp->buf_base); +} + +static inline void +_cnputs(char * c, int size) +{ + /* The console device output routines are assumed to be + * non-reentrant. + */ + mp_disable_preemption(); + if (!hw_lock_to(&cnputc_lock, LockTimeOut)) { + /* If we timed out on the lock, and we're in the debugger, + * copy lock data for debugging and break the lock. + */ + hw_lock_data_t _shadow_lock; + memcpy(&_shadow_lock, &cnputc_lock, sizeof(cnputc_lock)); + if (debug_mode) { + /* Since hw_lock_to takes a pre-emption count...*/ + mp_enable_preemption(); + hw_lock_init(&cnputc_lock); + hw_lock_lock(&cnputc_lock); + } else { + panic("Lock acquire timeout in _cnputs() lock=%p, lock owner thread=0x%lx, current_thread: %p\n", &_shadow_lock, + _shadow_lock.lock_data, current_thread()); + } + } + + while (size-- > 0) { + cons_ops[cons_ops_index].putc(0, 0, *c); + if (*c == '\n') + cons_ops[cons_ops_index].putc(0, 0, '\r'); + c++; + } + + hw_lock_unlock(&cnputc_lock); + mp_enable_preemption(); +} + +void +cnputc_unbuffered(char c) +{ + _cnputs(&c, 1); +} + +void +cnputcusr(char c) +{ + boolean_t state; + + /* Spin (with pre-emption enabled) waiting for console_ring_try_empty() + * to complete output. There is a small window here where we could + * end up with a stale value of console_output, but it's unlikely, + * and _cnputs(), which outputs to the console device, is internally + * synchronized. 
There's something of a conflict between the + * character-at-a-time (with pre-emption enabled) unbuffered + * output model here, and the buffered output from cnputc(), + * whose consumers include printf() ( which outputs a sequence + * with pre-emption disabled, and should be safe to call with + * interrupts off); we don't want to disable pre-emption indefinitely + * here, and spinlocks and mutexes are inappropriate. + */ + while (console_output != 0) + ; + + /* + * We disable interrupts to avoid issues caused by rendevous IPIs + * and an interruptible core holding the lock while an uninterruptible + * core wants it. Stackshot is the prime example of this. + */ + state = ml_set_interrupts_enabled(FALSE); + _cnputs(&c, 1); + ml_set_interrupts_enabled(state); +} + +static void +console_ring_try_empty(void) +{ +#ifdef __x86_64__ + boolean_t handle_tlb_flushes = (ml_get_interrupts_enabled() == FALSE); +#endif /* __x86_64__ */ + + int nchars_out = 0; + int total_chars_out = 0; + int size_before_wrap = 0; + + do { +#ifdef __x86_64__ + if (handle_tlb_flushes) + handle_pending_TLB_flushes(); +#endif /* __x86_64__ */ + + /* + * Try to get the read lock on the ring buffer to empty it. + * If this fails someone else is already emptying... + */ + if (!simple_lock_try(&console_ring.read_lock)) { + /* + * If multiple cores are spinning trying to empty the buffer, + * we may suffer lock starvation (get the read lock, but + * never the write lock, with other cores unable to get the + * read lock). As a result, insert a delay on failure, to + * let other cores have a turn. + */ + delay(1); + return; + } + + boolean_t state = ml_set_interrupts_enabled(FALSE); + + /* Indicate that we're in the process of writing a block of data to the console. 
*/ + (void)hw_atomic_add(&console_output, 1); + + simple_lock_try_lock_loop(&console_ring.write_lock); + + /* try small chunk at a time, so we allow writes from other cpus into the buffer */ + nchars_out = MIN(console_ring.used, MAX_INT_DISABLED_FLUSH_SIZE); + + /* account for data to be read before wrap around */ + size_before_wrap = (int)((console_ring.buffer + console_ring.len) - console_ring.read_ptr); + if (nchars_out > size_before_wrap) + nchars_out = size_before_wrap; + + if (nchars_out > 0) { + _cnputs(console_ring.read_ptr, nchars_out); + console_ring.read_ptr = + console_ring.buffer + ((console_ring.read_ptr - console_ring.buffer + nchars_out) % console_ring.len); + console_ring.used -= nchars_out; + total_chars_out += nchars_out; + } + + simple_unlock(&console_ring.write_lock); + + (void)hw_atomic_sub(&console_output, 1); + + simple_unlock(&console_ring.read_lock); + + ml_set_interrupts_enabled(state); + + /* + * In case we end up being the console drain thread + * for far too long, break out. Except in panic/suspend cases + * where we should clear out full buffer. 
+ */ + if (debug_mode == 0 && !console_suspended && (total_chars_out >= MAX_TOTAL_FLUSH_SIZE)) + break; + + } while (nchars_out > 0); +} + + +void +console_suspend() +{ + console_suspended = true; + console_ring_try_empty(); +} + +void +console_resume() +{ + console_suspended = false; +} + +void +console_write(char * str, int size) +{ + console_init(); + int chunk_size = size; + int i = 0; + + if (size > console_ring.len) + chunk_size = CPU_CONS_BUF_SIZE; + + while (size > 0) { + boolean_t state = ml_set_interrupts_enabled(FALSE); + + simple_lock_try_lock_loop(&console_ring.write_lock); + while (chunk_size > console_ring_space()) { + simple_unlock(&console_ring.write_lock); + ml_set_interrupts_enabled(state); + + console_ring_try_empty(); + + state = ml_set_interrupts_enabled(FALSE); + simple_lock_try_lock_loop(&console_ring.write_lock); + } + + for (i = 0; i < chunk_size; i++) + console_ring_put(str[i]); + + str = &str[i]; + size -= chunk_size; + simple_unlock(&console_ring.write_lock); + ml_set_interrupts_enabled(state); + } + + console_ring_try_empty(); +} + +void +cnputc(char c) +{ + console_buf_t * cbp; + cpu_data_t * cpu_data_p; + boolean_t state; + boolean_t needs_print = TRUE; + char * cp; + +restart: + mp_disable_preemption(); + cpu_data_p = current_cpu_datap(); + cbp = (console_buf_t *)cpu_data_p->cpu_console_buf; + if (console_suspended || cbp == NULL) { + mp_enable_preemption(); + /* Put directly if console ring is not initialized or we're heading into suspend */ + _cnputs(&c, 1); + return; + } + +#ifndef __x86_64__ + /* Is there a panic backtrace going on? */ + if (cpu_data_p->PAB_active) { + /* If another processor was in the process of emptying the + * console ring buffer when it received the panic backtrace + * signal, that processor will be spinning in DebugXCall() + * waiting for the panicking processor to finish printing + * the backtrace. 
But panicking processor will never + * be able to obtain the ring buffer lock since it is + * owned by a processor that's spinning in DebugXCall(). + * Blow away any locks that other processors may have on + * the console ring buffer so that the backtrace can + * complete. + */ + console_ring_lock_init(); + } +#endif /* __x86_64__ */ + + state = ml_set_interrupts_enabled(FALSE); + + /* + * add to stack buf + * If the cpu buffer is full, we'll flush, then try + * another put. If it fails a second time... screw + * it. + */ + if (needs_print && !cpu_buffer_put(cbp, c)) { + simple_lock_try_lock_loop(&console_ring.write_lock); + + if (cpu_buffer_size(cbp) > console_ring_space()) { + simple_unlock(&console_ring.write_lock); + ml_set_interrupts_enabled(state); + mp_enable_preemption(); + + console_ring_try_empty(); + goto restart; + } + + for (cp = cbp->buf_base; cp < cbp->buf_ptr; cp++) + console_ring_put(*cp); + cbp->buf_ptr = cbp->buf_base; + simple_unlock(&console_ring.write_lock); + + cpu_buffer_put(cbp, c); + } + + needs_print = FALSE; + + if (c != '\n') { + ml_set_interrupts_enabled(state); + mp_enable_preemption(); + return; + } + + /* We printed a newline, time to flush the CPU buffer to the global buffer */ + simple_lock_try_lock_loop(&console_ring.write_lock); + + /* + * Is there enough space in the shared ring buffer? + * Try to empty if not. + * Note, we want the entire local buffer to fit to + * avoid another cpu interjecting. 
+ */ + + if (cpu_buffer_size(cbp) > console_ring_space()) { + simple_unlock(&console_ring.write_lock); + ml_set_interrupts_enabled(state); + mp_enable_preemption(); + + console_ring_try_empty(); + + goto restart; + } + + for (cp = cbp->buf_base; cp < cbp->buf_ptr; cp++) + console_ring_put(*cp); + + cbp->buf_ptr = cbp->buf_base; + simple_unlock(&console_ring.write_lock); + ml_set_interrupts_enabled(state); + mp_enable_preemption(); + + console_ring_try_empty(); + + return; +} + +int +_serial_getc(__unused int a, __unused int b, boolean_t wait, __unused boolean_t raw) +{ + int c; + do { + c = serial_getc(); + } while (wait && c < 0); + + + return c; +} + +static void +_serial_putc(__unused int a, __unused int b, int c) +{ + serial_putc(c); +} + +int +cngetc(void) +{ + return cons_ops[cons_ops_index].getc(0, 0, TRUE, FALSE); +} + +int +cnmaygetc(void) +{ + return cons_ops[cons_ops_index].getc(0, 0, FALSE, FALSE); +} + +int +vcgetc(__unused int l, __unused int u, __unused boolean_t wait, __unused boolean_t raw) +{ + char c; + + if (0 == (*PE_poll_input)(0, &c)) + return c; + else + return 0; +} + +/* So we can re-write the serial device functions at boot-time */ +void +console_set_serial_ops(struct console_ops * newops) +{ + cons_ops[SERIAL_CONS_OPS] = *newops; +} + diff --git a/osfmk/console/video_console.c b/osfmk/console/video_console.c index e9a1555a5..5295d3c03 100644 --- a/osfmk/console/video_console.c +++ b/osfmk/console/video_console.c @@ -2973,39 +2973,6 @@ initialize_screen(PE_Video * boot_vinfo, unsigned int op) } } -void -dim_screen(void) -{ - unsigned int *p, *endp, *row; - int col, rowline, rowlongs; - register unsigned int mask; - - if(!vinfo.v_depth) - return; - - if ( vinfo.v_depth == 32 ) - mask = 0x007F7F7F; - else if ( vinfo.v_depth == 30 ) - mask = (0x1ff<<20) | (0x1ff<<10) | 0x1ff; - else if ( vinfo.v_depth == 16 ) - mask = 0x3DEF3DEF; - else - return; - - rowline = (int)(vinfo.v_rowscanbytes / 4); - rowlongs = (int)(vinfo.v_rowbytes / 4); - - p = 
(unsigned int*) vinfo.v_baseaddr; - endp = p + (rowlongs * vinfo.v_height); - - for (row = p ; row < endp ; row += rowlongs) { - for (p = &row[0], col = 0; col < rowline; col++) { - *p = (*p >> 1) & mask; - ++p; - } - } -} - void vcattach(void); /* XXX gcc 4 warning cleanup */ void diff --git a/osfmk/console/i386/video_scroll.c b/osfmk/console/video_scroll.c similarity index 74% rename from osfmk/console/i386/video_scroll.c rename to osfmk/console/video_scroll.c index 4c84776c7..466430bd9 100644 --- a/osfmk/console/i386/video_scroll.c +++ b/osfmk/console/video_scroll.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2007 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. 
- * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ @@ -31,16 +31,15 @@ extern void bcopy(const void *, void *, size_t); -void video_scroll_up(void *start, - void *end, - void *dest) +void +video_scroll_up(void * start, void * end, void * dest) { - bcopy(start, dest, ((char*)end - (char*)start) << 2);; + bcopy(start, dest, ((char *)end - (char *)start) << 2); } -void video_scroll_down(void *start, /* HIGH addr */ - void *end, /* LOW addr */ - void *dest) /* HIGH addr */ +void video_scroll_down(void * start, /* HIGH addr */ + void * end, /* LOW addr */ + void * dest) /* HIGH addr */ { - bcopy(end, dest, ((char*)start - (char*)end) << 2); + bcopy(end, dest, ((char *)start - (char *)end) << 2); } diff --git a/osfmk/corecrypto/cc/src/cc_abort.c b/osfmk/corecrypto/cc/src/cc_abort.c new file mode 100644 index 000000000..ac48bd9e3 --- /dev/null +++ b/osfmk/corecrypto/cc/src/cc_abort.c @@ -0,0 +1,36 @@ +/* + * cc_abort.c + * corecrypto + * + * Created on 7/16/2015 + * + * Copyright (c) 2014,2015 Apple Inc. All rights reserved. + * + */ + +#include + +//cc_abort() is implemented to comply with by FIPS 140-2, when DRBG produces +//two equal consecutive blocks. See radar 19129408 + +#if CC_KERNEL +#include +void cc_abort(const char * msg CC_UNUSED , ...) +{ + panic(msg); +} + +#elif CC_USE_SEPROM || CC_USE_S3 || CC_BASEBAND || CC_EFI || CC_IBOOT +void cc_abort(const char * msg CC_UNUSED, ...) +{ + //do nothing and return becasue we don't have panic() in those + //environments +} + +#else +#include +void cc_abort(const char * msg CC_UNUSED, ...) 
+{ + abort(); +} +#endif diff --git a/osfmk/corecrypto/cc/src/cc_clear.c b/osfmk/corecrypto/cc/src/cc_clear.c index 79f9d971f..a163e900d 100644 --- a/osfmk/corecrypto/cc/src/cc_clear.c +++ b/osfmk/corecrypto/cc/src/cc_clear.c @@ -10,18 +10,31 @@ #include +//rdar://problem/26986552 + +#if ( CC_HAS_MEMSET_S == 1 ) && (defined( __STDC_WANT_LIB_EXT1__ ) && ( __STDC_WANT_LIB_EXT1__ == 1 ) ) void cc_clear(size_t len, void *dst) { -#if ( CC_HAS_MEMSET_S == 1 ) && (defined( __STDC_WANT_LIB_EXT1__ ) && ( __STDC_WANT_LIB_EXT1__ == 1 ) ) memset_s(dst,len,0,len); +} +#elif defined(_WIN32) && !defined(__clang__) //Clang with Microsoft CodeGen, doesn't support SecureZeroMemory +#include +static void cc_clear(size_t len, void *dst) +{ + SecureZeroMemory(dst, len); +} #else - volatile size_t ctr=0; - volatile uint8_t *data=dst; - if (len) { - cc_zero(len,dst); - (void)data[ctr]; // Touch the buffer so that the compiler does not - // Optimize out the zeroing - } -#endif +void cc_clear(size_t len, void *dst) +{ + volatile char *vptr = (volatile char *)dst; + while (len--) + *vptr++ = '\0'; } +#endif +/* This is an altarnative for clang that should work + void cc_clear(size_t len, void *dst) __attribute__ ((optnone)) + { + cc_zero(len,dst); + } +*/ diff --git a/osfmk/corecrypto/cc/src/cc_cmp_safe.c b/osfmk/corecrypto/cc/src/cc_cmp_safe.c new file mode 100644 index 000000000..b06c8a724 --- /dev/null +++ b/osfmk/corecrypto/cc/src/cc_cmp_safe.c @@ -0,0 +1,25 @@ +/* + * cc_cmp_safe.c + * corecrypto + * + * Created on 04/22/2014 + * + * Copyright (c) 2014,2015 Apple Inc. All rights reserved. 
+ * + */ + +#include + +int cc_cmp_safe (size_t num, const void * ptr1, const void * ptr2) +{ + size_t i; + const uint8_t *s=(const uint8_t *)ptr1; + const uint8_t *t=(const uint8_t *)ptr2; + uint8_t flag=((num<=0)?1:0); // If 0 return an error + for (i=0;ivsize, state->v); + //cc_print(label, state->vsize, state->nextvptr); + cc_print(label, state->vsize, state->vptr); cc_print(label, state->keysize, state->key); } #endif + +static void done(struct ccdrbg_state *drbg); + /* NIST SP 800-90A, Rev. 1 HMAC_DRBG April 2014, p 46 @@ -120,7 +129,7 @@ static void dumpState(const char *label, struct ccdrbg_nisthmac_state *state) { 6. Return K and V. */ -// was: unsigned long providedDataLength, const void *providedData +// was: size_t providedDataLength, const void *providedData /* To handle the case where we have three strings that are concatenated, @@ -128,56 +137,72 @@ static void dumpState(const char *label, struct ccdrbg_nisthmac_state *state) { */ static int hmac_dbrg_update(struct ccdrbg_state *drbg, - unsigned long daLen, const void *da, - unsigned long dbLen, const void *db, - unsigned long dcLen, const void *dc + size_t daLen, const void *da, + size_t dbLen, const void *db, + size_t dcLen, const void *dc ) { + int rc=CCDRBG_STATUS_ERROR; struct ccdrbg_nisthmac_state *state = (struct ccdrbg_nisthmac_state *)drbg; const struct ccdigest_info *di = state->custom->di; const unsigned char cZero = 0x00; const unsigned char cOne = 0x01; + cchmac_ctx_decl(di->state_size, di->block_size, ctx); - cchmac_init(di, ctx, state->keysize, state->key); // 1. K = HMAC (K, V || 0x00 || provided_data). - cchmac_update(di, ctx, state->vsize, state->v); + cchmac_update(di, ctx, state->vsize, state->vptr); cchmac_update(di, ctx, 1, &cZero); if (da && daLen) cchmac_update(di, ctx, daLen, da); if (db && dbLen) cchmac_update(di, ctx, dbLen, db); if (dc && dcLen) cchmac_update(di, ctx, dcLen, dc); cchmac_final(di, ctx, state->key); - - // 2. V=HMAC(K,V). 
- cchmac(di, state->keysize, state->key, state->vsize, state->v, state->v); - - // 3. If (provided_data = Null), then return K and V. + // One parameter must be non-empty, or return - if (!((da && daLen) || (db && dbLen) || (dc && dcLen))) - return CCDRBG_STATUS_OK; - - // 4. K = HMAC (K, V || 0x01 || provided_data). - cchmac_init(di, ctx, state->keysize, state->key); - cchmac_update(di, ctx, state->vsize, state->v); - cchmac_update(di, ctx, 1, &cOne); - if (da && daLen) cchmac_update(di, ctx, daLen, da); - if (db && dbLen) cchmac_update(di, ctx, dbLen, db); - if (dc && dcLen) cchmac_update(di, ctx, dcLen, dc); - cchmac_final(di, ctx, state->key); - - // 5. V=HMAC(K,V). - cchmac(di, state->keysize, state->key, state->vsize, state->v, state->v); - - return CCDRBG_STATUS_OK; + if (((da && daLen) || (db && dbLen) || (dc && dcLen))) { + // 2. V=HMAC(K,V). + cchmac(di, state->keysize, state->key, state->vsize, state->vptr, state->vptr); + // 4. K = HMAC (K, V || 0x01 || provided_data). + cchmac_init(di, ctx, state->keysize, state->key); + cchmac_update(di, ctx, state->vsize, state->vptr); + cchmac_update(di, ctx, 1, &cOne); + if (da && daLen) cchmac_update(di, ctx, daLen, da); + if (db && dbLen) cchmac_update(di, ctx, dbLen, db); + if (dc && dcLen) cchmac_update(di, ctx, dcLen, dc); + cchmac_final(di, ctx, state->key); + } + // If additional data 5. V=HMAC(K,V) + // If no addtional data, this is step 2. V=HMAC(K,V). + state->bytesLeft = 0; + + // FIPS 140-2 4.9.2 Conditional Tests + // "the first n-bit block generated after power-up, initialization, or reset shall not be used, but shall be saved for comparison with the next n-bit block to be generated" + // Generate the first block and the second block. 
Compare for FIPS and discard the first block + // We keep the second block as the first set of data to be returned + cchmac(di, state->keysize, state->key, state->vsize, state->vptr, state->vptr); // First block + cchmac(di, state->keysize, state->key, state->vsize, state->vptr, state->nextvptr); // First to be returned + if (0==cc_cmp_safe(state->vsize, state->vptr, state->nextvptr)) { + //The world as we know it has come to an end + //the DRBG data structure is zeroized. subsequent calls to + //DRBG ends up in NULL dereferencing and/or unpredictable state. + //catastrophic error in SP 800-90A + done(drbg); + rc=CCDRBG_STATUS_ABORT; + cc_abort(NULL); + goto errOut; + } + rc=CCDRBG_STATUS_OK; +errOut: + return rc; } //make sure state is initialized, before calling this function static int validate_inputs(struct ccdrbg_nisthmac_state *state, - unsigned long entropyLength, - unsigned long additionalInputLength, - unsigned long psLength) + size_t entropyLength, + size_t additionalInputLength, + size_t psLength) { int rc; const struct ccdrbg_nisthmac_custom *custom=state->custom; @@ -185,7 +210,7 @@ static int validate_inputs(struct ccdrbg_nisthmac_state *state, rc =CCDRBG_STATUS_ERROR; //buffer size checks - cc_require (di->output_size<=sizeof(state->v), end); //digest size too long + cc_require (di->output_size<=sizeof(state->v)/2, end); //digest size too long cc_require (di->output_size<=sizeof(state->key), end); //digest size too long //NIST SP800 compliance checks @@ -224,9 +249,9 @@ static int validate_inputs(struct ccdrbg_nisthmac_state *state, //SP800-90 A: Required minimum entropy for instantiate and reseed=security_strength static int hmac_dbrg_instantiate_algorithm(struct ccdrbg_state *drbg, - unsigned long entropyLength, const void *entropy, - unsigned long nonceLength, const void *nonce, - unsigned long psLength, const void *ps) + size_t entropyLength, const void *entropy, + size_t nonceLength, const void *nonce, + size_t psLength, const void *ps) { // 
TODO: The NIST code passes nonce (i.e. HMAC key) to generate, but cc interface isn't set up that way struct ccdrbg_nisthmac_state *state = (struct ccdrbg_nisthmac_state *)drbg; @@ -237,7 +262,7 @@ static int hmac_dbrg_instantiate_algorithm(struct ccdrbg_state *drbg, cc_zero(state->keysize, state->key); // 3. Set V to outlen/8 bytes of 0x01. - CC_MEMSET(state->v, 0x01, state->vsize); + CC_MEMSET(state->vptr, 0x01, state->vsize); // 4. (Key, V) = HMAC_DRBG_Update (seed_material, Key, V). hmac_dbrg_update(drbg, entropyLength, entropy, nonceLength, nonce, psLength, ps); @@ -253,12 +278,10 @@ static int hmac_dbrg_instantiate_algorithm(struct ccdrbg_state *drbg, // min_entropy = NH_REQUIRED_MIN_ENTROPY(security_strength) // bytes of entropy -static void done(struct ccdrbg_state *drbg); - static int init(const struct ccdrbg_info *info, struct ccdrbg_state *drbg, - unsigned long entropyLength, const void* entropy, - unsigned long nonceLength, const void* nonce, - unsigned long psLength, const void* ps) + size_t entropyLength, const void* entropy, + size_t nonceLength, const void* nonce, + size_t psLength, const void* ps) { struct ccdrbg_nisthmac_state *state=(struct ccdrbg_nisthmac_state *)drbg; state->bytesLeft = 0; @@ -275,11 +298,13 @@ static int init(const struct ccdrbg_info *info, struct ccdrbg_state *drbg, const struct ccdigest_info *di = state->custom->di; state->vsize = di->output_size; state->keysize = di->output_size; - + state->vptr=state->v; + state->nextvptr=state->v+state->vsize; + // 7. (V, Key, reseed_counter) = HMAC_DRBG_Instantiate_algorithm (entropy_input, personalization_string). 
hmac_dbrg_instantiate_algorithm(drbg, entropyLength, entropy, nonceLength, nonce, psLength, ps); -#ifdef DEBUGFOO +#if DRBG_NISTHMAC_DEBUG dumpState("Init: ", state); #endif return CCDRBG_STATUS_OK; @@ -312,8 +337,8 @@ static int init(const struct ccdrbg_info *info, struct ccdrbg_state *drbg, static int reseed(struct ccdrbg_state *drbg, - unsigned long entropyLength, const void *entropy, - unsigned long additionalLength, const void *additional) + size_t entropyLength, const void *entropy, + size_t additionalLength, const void *additional) { struct ccdrbg_nisthmac_state *state = (struct ccdrbg_nisthmac_state *)drbg; @@ -323,7 +348,7 @@ reseed(struct ccdrbg_state *drbg, int rx = hmac_dbrg_update(drbg, entropyLength, entropy, additionalLength, additional, 0, NULL); state->reseed_counter = 1; -#ifdef DEBUGFOO +#if DRBG_NISTHMAC_DEBUG dumpState("Reseed: ", state); #endif return rx; @@ -346,12 +371,12 @@ reseed(struct ccdrbg_state *drbg, 7. Return (“Success”, pseudorandom_bits, V, Key, reseed_counter). 
*/ -static int validate_gen_params(uint64_t reseed_counter, unsigned long dataOutLength, unsigned long additionalLength) +static int validate_gen_params(uint64_t reseed_counter, size_t dataOutLength, size_t additionalLength) { int rc=CCDRBG_STATUS_PARAM_ERROR; - cc_require (dataOutLength >= 1, end); //Requested zero byte in one request + // Zero byte in one request is a valid use-case (21208820) cc_require (dataOutLength <= CCDRBG_MAX_REQUEST_SIZE, end); //Requested too many bytes in one request cc_require (additionalLength<=CCDRBG_MAX_ADDITIONALINPUT_SIZE, end); //Additional input too long @@ -365,8 +390,8 @@ static int validate_gen_params(uint64_t reseed_counter, unsigned long dataOutLe return rc; } -static int generate(struct ccdrbg_state *drbg, unsigned long dataOutLength, void *dataOut, - unsigned long additionalLength, const void *additional) +static int generate(struct ccdrbg_state *drbg, size_t dataOutLength, void *dataOut, + size_t additionalLength, const void *additional) { struct ccdrbg_nisthmac_state *state = (struct ccdrbg_nisthmac_state *)drbg; const struct ccdrbg_nisthmac_custom *custom = state->custom; @@ -384,27 +409,45 @@ static int generate(struct ccdrbg_state *drbg, unsigned long dataOutLength, void while (dataOutLength > 0) { if (!state->bytesLeft) { // 5. V=HMAC(K,V). - cchmac(di, state->keysize, state->key, state->vsize, state->v, state->v); - state->bytesLeft = di->output_size;//di->output_size; state->vsize + cchmac(di, state->keysize, state->key, state->vsize, state->nextvptr, state->vptr); // Won't be returned + // FIPS 140-2 4.9.2 Conditional Tests + // "Each subsequent generation of an n-bit block shall be compared with the previously generated block. The test shall fail if any two compared n-bit blocks are equal." + if (0==cc_cmp_safe(state->vsize, state->vptr, state->nextvptr)) { + //The world as we know it has come to an end + //the DRBG data structure is zeroized. 
subsequent calls to + //DRBG ends up in NULL dereferencing and/or unpredictable state. + //catastrophic error in SP 800-90A + done(drbg); + rc=CCDRBG_STATUS_ABORT; + cc_abort(NULL); + goto errOut; + } + CC_SWAP(state->nextvptr, state->vptr); + state->bytesLeft = state->vsize; +#if DRBG_NISTHMAC_DEBUG + cc_print("generate blk: ", state->vsize, state->vptr); +#endif } size_t outLength = dataOutLength > state->bytesLeft ? state->bytesLeft : dataOutLength; - CC_MEMCPY(outPtr, state->v, outLength); + CC_MEMCPY(outPtr, state->vptr, outLength); state->bytesLeft -= outLength; outPtr += outLength; dataOutLength -= outLength; } - + // 6. (Key, V) = HMAC_DRBG_Update (additional_input, Key, V). hmac_dbrg_update(drbg, additionalLength, additional, 0, NULL, 0, NULL); // 7. reseed_counter = reseed_counter + 1. state->reseed_counter++; -#ifdef DEBUGFOO - dumpState("generate: ", state); +#if DRBG_NISTHMAC_DEBUG + dumpState("generate end: ", state); + cc_print("generate end nxt: ", state->vsize, state->nextvptr); #endif - - return CCDRBG_STATUS_OK; + rc=CCDRBG_STATUS_OK; +errOut: + return rc; } static void done(struct ccdrbg_state *drbg) diff --git a/osfmk/corecrypto/ccdigest/src/ccdigest_update.c b/osfmk/corecrypto/ccdigest/src/ccdigest_update.c index ce652362a..4df21c38a 100644 --- a/osfmk/corecrypto/ccdigest/src/ccdigest_update.c +++ b/osfmk/corecrypto/ccdigest/src/ccdigest_update.c @@ -12,18 +12,30 @@ #include void ccdigest_update(const struct ccdigest_info *di, ccdigest_ctx_t ctx, - unsigned long len, const void *data) { + size_t len, const void *data) { const char * data_ptr = data; + size_t nblocks, nbytes; + while (len > 0) { if (ccdigest_num(di, ctx) == 0 && len > di->block_size) { - unsigned long nblocks = len / di->block_size; + //low-end processors are slow on divison + if(di->block_size == 1<<6 ){ //sha256 + nblocks = len >> 6; + nbytes = len & 0xFFFFffC0; + }else if(di->block_size == 1<<7 ){ //sha512 + nblocks = len >> 7; + nbytes = len & 0xFFFFff80; + }else { + 
nblocks = len / di->block_size; + nbytes = nblocks * di->block_size; + } + di->compress(ccdigest_state(di, ctx), nblocks, data_ptr); - unsigned long nbytes = nblocks * di->block_size; len -= nbytes; data_ptr += nbytes; ccdigest_nbits(di, ctx) += nbytes * 8; } else { - unsigned long n = di->block_size - ccdigest_num(di, ctx); + size_t n = di->block_size - ccdigest_num(di, ctx); if (len < n) n = len; CC_MEMCPY(ccdigest_data(di, ctx) + ccdigest_num(di, ctx), data_ptr, n); diff --git a/osfmk/corecrypto/cchmac/src/cchmac.c b/osfmk/corecrypto/cchmac/src/cchmac.c index eb38024db..61f859e6e 100644 --- a/osfmk/corecrypto/cchmac/src/cchmac.c +++ b/osfmk/corecrypto/cchmac/src/cchmac.c @@ -11,8 +11,8 @@ #include void cchmac(const struct ccdigest_info *di, - unsigned long key_len, const void *key, - unsigned long data_len, const void *data, unsigned char *mac) { + size_t key_len, const void *key, + size_t data_len, const void *data, unsigned char *mac) { cchmac_di_decl(di, hc); cchmac_init(di, hc, key_len, key); cchmac_update(di, hc, data_len, data); diff --git a/osfmk/corecrypto/cchmac/src/cchmac_init.c b/osfmk/corecrypto/cchmac/src/cchmac_init.c index 8d426e8c8..ffda5227e 100644 --- a/osfmk/corecrypto/cchmac/src/cchmac_init.c +++ b/osfmk/corecrypto/cchmac/src/cchmac_init.c @@ -20,11 +20,11 @@ text is the data being protected. */ void cchmac_init(const struct ccdigest_info *di, cchmac_ctx_t hc, - unsigned long key_len, const void *key_data) { + size_t key_len, const void *key_data) { const unsigned char *key = key_data; /* Set cchmac_data(di, hc) to key ^ opad. 
*/ - unsigned long byte = 0; + size_t byte = 0; if (key_len <= di->block_size) { for (;byte < key_len; ++byte) { cchmac_data(di, hc)[byte] = key[byte] ^ 0x5c; diff --git a/osfmk/corecrypto/cchmac/src/cchmac_update.c b/osfmk/corecrypto/cchmac/src/cchmac_update.c index 26abc62e0..3273d4385 100644 --- a/osfmk/corecrypto/cchmac/src/cchmac_update.c +++ b/osfmk/corecrypto/cchmac/src/cchmac_update.c @@ -11,6 +11,6 @@ #include void cchmac_update(const struct ccdigest_info *di, cchmac_ctx_t hc, - unsigned long data_len, const void *data) { + size_t data_len, const void *data) { ccdigest_update(di, cchmac_digest_ctx(di, hc), data_len, data); } diff --git a/osfmk/corecrypto/ccsha1/src/ccsha1_eay.c b/osfmk/corecrypto/ccsha1/src/ccsha1_eay.c index a709adcf6..2a9209f39 100644 --- a/osfmk/corecrypto/ccsha1/src/ccsha1_eay.c +++ b/osfmk/corecrypto/ccsha1/src/ccsha1_eay.c @@ -65,7 +65,7 @@ * [including the GNU Public Licence.] */ -#define USE_SUPER_COOL_NEW_CCOID_T + #include #include #include @@ -156,7 +156,7 @@ # define X(i) XX[i] #endif -static void sha1_compress(ccdigest_state_t s, unsigned long num, const void *buf) +static void sha1_compress(ccdigest_state_t s, size_t num, const void *buf) { const unsigned char *data=buf; register uint32_t A,B,C,D,E,T,l; diff --git a/osfmk/corpses/Makefile b/osfmk/corpses/Makefile index ded3ef9c5..6a7e314c0 100644 --- a/osfmk/corpses/Makefile +++ b/osfmk/corpses/Makefile @@ -16,6 +16,5 @@ EXPORT_MI_LIST = ${DATAFILES} EXPORT_MI_DIR = corpses - include $(MakeInc_rule) include $(MakeInc_dir) diff --git a/osfmk/corpses/corpse.c b/osfmk/corpses/corpse.c index 27a0c13f5..94b5a0973 100644 --- a/osfmk/corpses/corpse.c +++ b/osfmk/corpses/corpse.c @@ -129,23 +129,56 @@ #include #include +#if CONFIG_MACF +#include +#endif + +/* + * Exported interfaces + */ +#include + unsigned long total_corpses_count = 0; unsigned long total_corpses_created = 0; boolean_t corpse_enabled_config = TRUE; +/* bootarg to turn on corpse forking for EXC_RESOURCE */ +int 
exc_via_corpse_forking = 1; + +/* bootarg to unify corpse blob allocation */ +int unify_corpse_blob_alloc = 1; + +/* bootarg to generate corpse for fatal high memory watermark violation */ +int corpse_for_fatal_memkill = 1; + kcdata_descriptor_t task_get_corpseinfo(task_t task); -kcdata_descriptor_t task_crashinfo_alloc_init(mach_vm_address_t crash_data_p, unsigned size); -kern_return_t task_crashinfo_destroy(kcdata_descriptor_t data); +kcdata_descriptor_t task_crashinfo_alloc_init(mach_vm_address_t crash_data_p, unsigned size, int get_corpseref, unsigned flags); +kern_return_t task_crashinfo_destroy(kcdata_descriptor_t data, int release_corpseref); static kern_return_t task_crashinfo_get_ref(); static kern_return_t task_crashinfo_release_ref(); - +extern int IS_64BIT_PROCESS(void *); +extern void gather_populate_corpse_crashinfo(void *p, void *crash_info_ptr, mach_exception_data_type_t code, mach_exception_data_type_t subcode, uint64_t *udata_buffer, int num_udata); +extern void *proc_find(int pid); +extern int proc_rele(void *p); void corpses_init(){ char temp_buf[20]; + int exc_corpse_forking; + int corpse_blob_alloc; + int fatal_memkill; if (PE_parse_boot_argn("-no_corpses", temp_buf, sizeof(temp_buf))) { corpse_enabled_config = FALSE; } + if (PE_parse_boot_argn("exc_via_corpse_forking", &exc_corpse_forking, sizeof(exc_corpse_forking))) { + exc_via_corpse_forking = exc_corpse_forking; + } + if (PE_parse_boot_argn("unify_corpse_blob_alloc", &corpse_blob_alloc, sizeof(corpse_blob_alloc))) { + unify_corpse_blob_alloc = corpse_blob_alloc; + } + if (PE_parse_boot_argn("corpse_for_fatal_memkill", &fatal_memkill, sizeof(fatal_memkill))) { + corpse_for_fatal_memkill = fatal_memkill; + } } /* @@ -187,26 +220,27 @@ kern_return_t task_crashinfo_release_ref() } -kcdata_descriptor_t task_crashinfo_alloc_init(mach_vm_address_t crash_data_p, unsigned size) +kcdata_descriptor_t task_crashinfo_alloc_init(mach_vm_address_t crash_data_p, unsigned size, int get_corpseref, unsigned 
flags) { - if(KERN_SUCCESS != task_crashinfo_get_ref()) { + if(get_corpseref && KERN_SUCCESS != task_crashinfo_get_ref()) { return NULL; } - return kcdata_memory_alloc_init(crash_data_p, TASK_CRASHINFO_BEGIN, size, KCFLAG_USE_COPYOUT); + return kcdata_memory_alloc_init(crash_data_p, TASK_CRASHINFO_BEGIN, size, flags); } /* * Free up the memory associated with task_crashinfo_data */ -kern_return_t task_crashinfo_destroy(kcdata_descriptor_t data) +kern_return_t task_crashinfo_destroy(kcdata_descriptor_t data, int release_corpseref) { if (!data) { return KERN_INVALID_ARGUMENT; } - task_crashinfo_release_ref(); + if (release_corpseref) + task_crashinfo_release_ref(); return kcdata_memory_destroy(data); } @@ -225,4 +259,357 @@ kcdata_descriptor_t task_get_corpseinfo(task_t task) return retval; } +/* + * Routine: task_add_to_corpse_task_list + * params: task - task to be added to corpse task list + * returns: None. + */ +void +task_add_to_corpse_task_list(task_t corpse_task) +{ + lck_mtx_lock(&tasks_corpse_lock); + queue_enter(&corpse_tasks, corpse_task, task_t, corpse_tasks); + lck_mtx_unlock(&tasks_corpse_lock); +} + +/* + * Routine: task_remove_from_corpse_task_list + * params: task - task to be removed from corpse task list + * returns: None. + */ +void +task_remove_from_corpse_task_list(task_t corpse_task) +{ + lck_mtx_lock(&tasks_corpse_lock); + queue_remove(&corpse_tasks, corpse_task, task_t, corpse_tasks); + lck_mtx_unlock(&tasks_corpse_lock); +} + +/* + * Routine: task_purge_all_corpses + * params: None. + * returns: None. 
+ */ +void +task_purge_all_corpses(void) +{ + task_t task; + + printf("Purging corpses......\n\n"); + + lck_mtx_lock(&tasks_corpse_lock); + /* Iterate through all the corpse tasks and clear all map entries */ + queue_iterate(&corpse_tasks, task, task_t, corpse_tasks) { + vm_map_remove(task->map, + task->map->min_offset, + task->map->max_offset, + /* no unnesting on final cleanup: */ + VM_MAP_REMOVE_NO_UNNESTING); + } + + lck_mtx_unlock(&tasks_corpse_lock); +} + +/* + * Routine: task_generate_corpse + * params: task - task to fork a corpse + * corpse_task - task port of the generated corpse + * returns: KERN_SUCCESS on Success. + * KERN_FAILURE on Failure. + * KERN_NO_SUPPORTED on corpse disabled. + * KERN_RESOURCE_SHORTAGE on memory alloc failure or reaching max corpse. + */ +kern_return_t +task_generate_corpse( + task_t task, + ipc_port_t *corpse_task_port) +{ + task_t new_task; + kern_return_t kr; + thread_t thread, th_iter; + ipc_port_t corpse_port; + ipc_port_t old_notify; + + if (task == kernel_task || task == TASK_NULL || task == current_task()) { + return KERN_INVALID_ARGUMENT; + } + + task_lock(task); + if (task_is_a_corpse_fork(task)) { + task_unlock(task); + return KERN_INVALID_ARGUMENT; + } + task_unlock(task); + + /* Generate a corpse for the given task, will return with a ref on corpse task */ + kr = task_generate_corpse_internal(task, &new_task, &thread, 0, 0); + if (kr != KERN_SUCCESS) { + return kr; + } + assert(thread == THREAD_NULL); + + /* wait for all the threads in the task to terminate */ + task_lock(new_task); + task_wait_till_threads_terminate_locked(new_task); + + /* Reset thread ports of all the threads in task */ + queue_iterate(&new_task->threads, th_iter, thread_t, task_threads) + { + /* Do not reset the thread port for inactive threads */ + if (th_iter->corpse_dup == FALSE) { + ipc_thread_reset(th_iter); + } + } + task_unlock(new_task); + + /* transfer the task ref to port and arm the no-senders notification */ + corpse_port = 
convert_task_to_port(new_task); + assert(IP_NULL != corpse_port); + + ip_lock(corpse_port); + assert(ip_active(corpse_port)); + ipc_port_nsrequest(corpse_port, corpse_port->ip_mscount, ipc_port_make_sonce_locked(corpse_port), &old_notify); + /* port unlocked */ + + assert(IP_NULL == old_notify); + *corpse_task_port = corpse_port; + return KERN_SUCCESS; +} + +/* + * Routine: task_enqueue_exception_with_corpse + * params: task - task to generate a corpse and enqueue it + * code - exception code to be enqueued + * codeCnt - code array count - code and subcode + */ +void +task_enqueue_exception_with_corpse( + task_t task, + mach_exception_data_t code, + mach_msg_type_number_t codeCnt) +{ + task_t new_task = TASK_NULL; + thread_t thread = THREAD_NULL; + kern_return_t kr; + + if (codeCnt < 2) { + return; + } + + /* Generate a corpse for the given task, will return with a ref on corpse task */ + kr = task_generate_corpse_internal(task, &new_task, &thread, code[0], code[1]); + if (kr != KERN_SUCCESS) { + return; + } + + assert(thread != THREAD_NULL); + assert(new_task != TASK_NULL); + thread_exception_enqueue(new_task, thread); + + return; +} + +/* + * Routine: task_generate_corpse_internal + * params: task - task to fork a corpse + * corpse_task - task of the generated corpse + * exc_thread - equivalent thread in corpse enqueuing exception + * code - mach exception code to be passed in corpse blob + * subcode - mach excpetion subcode to be passed in corpse blob + * returns: KERN_SUCCESS on Success. + * KERN_FAILURE on Failure. + * KERN_NO_SUPPORTED on corpse disabled. + * KERN_RESOURCE_SHORTAGE on memory alloc failure or reaching max corpse. 
+ */ +kern_return_t +task_generate_corpse_internal( + task_t task, + task_t *corpse_task, + thread_t *exc_thread, + mach_exception_data_type_t code, + mach_exception_data_type_t subcode) +{ + task_t new_task = TASK_NULL; + thread_t thread = THREAD_NULL; + thread_t thread_next = THREAD_NULL; + kern_return_t kr; + struct proc *p = NULL; + int is64bit; + int t_flags; + uint64_t *udata_buffer = NULL; + int size = 0; + int num_udata = 0; + boolean_t release_corpse_ref = FALSE; + + if (!corpses_enabled()) { + return KERN_NOT_SUPPORTED; + } + + kr = task_crashinfo_get_ref(); + if (kr != KERN_SUCCESS) { + return kr; + } + release_corpse_ref = TRUE; + + /* Having a task reference does not guarantee a proc reference */ + p = proc_find(task_pid(task)); + if (p == NULL) { + kr = KERN_INVALID_TASK; + goto error_task_generate_corpse; + } + + is64bit = IS_64BIT_PROCESS(p); + t_flags = TF_CORPSE_FORK | TF_PENDING_CORPSE | TF_CORPSE | (is64bit ? TF_64B_ADDR : TF_NONE); + + /* Create a task for corpse */ + kr = task_create_internal(task, + NULL, + TRUE, + is64bit, + t_flags, + &new_task); + if (kr != KERN_SUCCESS) { + goto error_task_generate_corpse; + } + + /* Create and copy threads from task, returns a ref to thread */ + kr = task_duplicate_map_and_threads(task, p, new_task, &thread, + is64bit, &udata_buffer, &size, &num_udata); + if (kr != KERN_SUCCESS) { + goto error_task_generate_corpse; + } + + kr = task_collect_crash_info(new_task, p, TRUE); + if (kr != KERN_SUCCESS) { + goto error_task_generate_corpse; + } + + /* The corpse_info field in task in initialized, call to task_deallocate will drop corpse ref */ + release_corpse_ref = FALSE; + + kr = task_start_halt(new_task); + if (kr != KERN_SUCCESS) { + goto error_task_generate_corpse; + } + + /* terminate the ipc space */ + ipc_space_terminate(new_task->itk_space); + + /* Populate the corpse blob, use the proc struct of task instead of corpse task */ + gather_populate_corpse_crashinfo(p, task_get_corpseinfo(new_task), code, 
subcode, udata_buffer, num_udata); + + /* Add it to global corpse task list */ + task_add_to_corpse_task_list(new_task); + + *corpse_task = new_task; + *exc_thread = thread; + +error_task_generate_corpse: + /* Release the proc reference */ + if (p != NULL) { + proc_rele(p); + } + if (kr != KERN_SUCCESS) { + if (thread != THREAD_NULL) { + thread_deallocate(thread); + } + if (new_task != TASK_NULL) { + task_lock(new_task); + /* Terminate all the other threads in the task. */ + queue_iterate(&new_task->threads, thread_next, thread_t, task_threads) + { + thread_terminate_internal(thread_next); + } + /* wait for all the threads in the task to terminate */ + task_wait_till_threads_terminate_locked(new_task); + task_unlock(new_task); + + task_clear_corpse(new_task); + task_terminate_internal(new_task); + task_deallocate(new_task); + } + if (release_corpse_ref) { + task_crashinfo_release_ref(); + } + } + /* Free the udata buffer allocated in task_duplicate_map_and_threads */ + if (udata_buffer != NULL) { + kfree(udata_buffer, size); + } + + return kr; +} + +/* + * Routine: task_map_corpse_info + * params: task - Map the corpse info in task's address space + * corpse_task - task port of the corpse + * kcd_addr_begin - address of the mapped corpse info + * kcd_addr_begin - size of the mapped corpse info + * returns: KERN_SUCCESS on Success. + * KERN_FAILURE on Failure. + * KERN_INVALID_ARGUMENT on invalid arguments. + * Note: Temporary function, will be deleted soon. 
+ */ +kern_return_t +task_map_corpse_info( + task_t task, + task_t corpse_task, + vm_address_t *kcd_addr_begin, + uint32_t *kcd_size) +{ + kern_return_t kr; + mach_vm_address_t kcd_addr_begin_64; + mach_vm_size_t size_64; + + kr = task_map_corpse_info_64(task, corpse_task, &kcd_addr_begin_64, &size_64); + if (kr != KERN_SUCCESS) { + return kr; + } + + *kcd_addr_begin = (vm_address_t)kcd_addr_begin_64; + *kcd_size = (uint32_t) size_64; + return KERN_SUCCESS; +} + +/* + * Routine: task_map_corpse_info_64 + * params: task - Map the corpse info in task's address space + * corpse_task - task port of the corpse + * kcd_addr_begin - address of the mapped corpse info (takes mach_vm_address_t *) + * kcd_size - size of the mapped corpse info (takes mach_vm_size_t *) + * returns: KERN_SUCCESS on Success. + * KERN_FAILURE on Failure. + * KERN_INVALID_ARGUMENT on invalid arguments. + */ +kern_return_t +task_map_corpse_info_64( + task_t task, + task_t corpse_task, + mach_vm_address_t *kcd_addr_begin, + mach_vm_size_t *kcd_size) +{ + kern_return_t kr; + mach_vm_offset_t crash_data_ptr = 0; + mach_vm_size_t size = CORPSEINFO_ALLOCATION_SIZE; + + if (task == TASK_NULL || task_is_a_corpse_fork(task)) { + return KERN_INVALID_ARGUMENT; + } + + if (corpse_task == TASK_NULL || !task_is_a_corpse(corpse_task) || + corpse_task->corpse_info == NULL || corpse_task->corpse_info_kernel == NULL) { + return KERN_INVALID_ARGUMENT; + } + kr = mach_vm_allocate(task->map, &crash_data_ptr, size, + (VM_MAKE_TAG(VM_MEMORY_CORPSEINFO) | VM_FLAGS_ANYWHERE)); + if (kr != KERN_SUCCESS) { + return kr; + } + copyout(corpse_task->corpse_info_kernel, crash_data_ptr, size); + *kcd_addr_begin = crash_data_ptr; + *kcd_size = size; + + return KERN_SUCCESS; +} diff --git a/osfmk/corpses/task_corpse.h b/osfmk/corpses/task_corpse.h index 5a4008186..7e784e759 100644 --- a/osfmk/corpses/task_corpse.h +++ b/osfmk/corpses/task_corpse.h @@ -32,47 +32,10 @@ #include #include #include +#include typedef struct 
kcdata_item *task_crashinfo_item_t; -/* - * NOTE: Please update libkdd/kcdata/kcdtypes.c if you make any changes - * in TASK_CRASHINFO_* types. - */ - -#define TASK_CRASHINFO_BEGIN KCDATA_BUFFER_BEGIN_CRASHINFO -#define TASK_CRASHINFO_STRING_DESC KCDATA_TYPE_STRING_DESC -#define TASK_CRASHINFO_UINT32_DESC KCDATA_TYPE_UINT32_DESC -#define TASK_CRASHINFO_UINT64_DESC KCDATA_TYPE_UINT64_DESC - -#define TASK_CRASHINFO_EXTMODINFO 0x801 -#define TASK_CRASHINFO_BSDINFOWITHUNIQID 0x802 /* struct proc_uniqidentifierinfo */ -#define TASK_CRASHINFO_TASKDYLD_INFO 0x803 -#define TASK_CRASHINFO_UUID 0x804 -#define TASK_CRASHINFO_PID 0x805 -#define TASK_CRASHINFO_PPID 0x806 -#define TASK_CRASHINFO_RUSAGE 0x807 /* struct rusage */ -#define TASK_CRASHINFO_RUSAGE_INFO 0x808 /* struct rusage_info_current */ -#define TASK_CRASHINFO_PROC_NAME 0x809 /* char * */ -#define TASK_CRASHINFO_PROC_STARTTIME 0x80B /* struct timeval64 */ -#define TASK_CRASHINFO_USERSTACK 0x80C /* uint64_t */ -#define TASK_CRASHINFO_ARGSLEN 0x80D -#define TASK_CRASHINFO_EXCEPTION_CODES 0x80E /* mach_exception_data_t */ -#define TASK_CRASHINFO_PROC_PATH 0x80F /* string of len MAXPATHLEN */ -#define TASK_CRASHINFO_PROC_CSFLAGS 0x810 /* uint32_t */ -#define TASK_CRASHINFO_PROC_STATUS 0x811 /* char */ -#define TASK_CRASHINFO_UID 0x812 /* uid_t */ -#define TASK_CRASHINFO_GID 0x813 /* gid_t */ -#define TASK_CRASHINFO_PROC_ARGC 0x814 /* int */ -#define TASK_CRASHINFO_PROC_FLAGS 0x815 /* unsigned int */ -#define TASK_CRASHINFO_CPUTYPE 0x816 /* cpu_type_t */ -#define TASK_CRASHINFO_WORKQUEUEINFO 0x817 /* struct proc_workqueueinfo */ -#define TASK_CRASHINFO_RESPONSIBLE_PID 0x818 /* pid_t */ -#define TASK_CRASHINFO_DIRTY_FLAGS 0x819 /* int */ -#define TASK_CRASHINFO_CRASHED_THREADID 0x81A /* uint64_t */ - -#define TASK_CRASHINFO_END KCDATA_TYPE_BUFFER_END - /* Deprecated: use the KCDATA_* macros for all future use */ #define CRASHINFO_ITEM_TYPE(item) KCDATA_ITEM_TYPE(item) #define CRASHINFO_ITEM_SIZE(item) 
KCDATA_ITEM_SIZE(item) @@ -90,26 +53,58 @@ typedef struct kcdata_item *task_crashinfo_item_t; #ifdef XNU_KERNEL_PRIVATE -#define CORPSEINFO_ALLOCATION_SIZE (1024 * 1024 * 2) +#define CORPSEINFO_ALLOCATION_SIZE (1024 * 16) #define TOTAL_CORPSES_ALLOWED 5 extern kern_return_t task_mark_corpse(task_t task); -extern kern_return_t task_deliver_crash_notification(task_t task); +extern kern_return_t task_deliver_crash_notification(task_t task, thread_t thread, mach_exception_data_type_t subcode); extern kcdata_descriptor_t task_get_corpseinfo(task_t task); +#define GET_CORPSE_REF TRUE +#define RELEASE_CORPSE_REF TRUE + extern kcdata_descriptor_t task_crashinfo_alloc_init( mach_vm_address_t crash_data_p, - unsigned size); -extern kern_return_t task_crashinfo_destroy(kcdata_descriptor_t data); + unsigned size, + int get_corpseref, unsigned flags); +extern kern_return_t task_crashinfo_destroy(kcdata_descriptor_t data, int release_corpseref); extern void corpses_init(void); extern boolean_t corpses_enabled(void); +extern kern_return_t task_generate_corpse_internal( + task_t task, + task_t *corpse_task, + thread_t *thread, + mach_exception_data_type_t code, + mach_exception_data_type_t subcode); + +extern void task_clear_corpse(task_t task); + +extern kern_return_t task_duplicate_map_and_threads( + task_t task, + void *p, + task_t new_task, + thread_t *thread, + int is64bit, + uint64_t **udata_buffer, + int *size, + int*num_udata); + +extern void task_enqueue_exception_with_corpse( + task_t task, + mach_exception_data_t code, + mach_msg_type_number_t codeCnt); + +extern void task_add_to_corpse_task_list(task_t corpse_task); +void task_remove_from_corpse_task_list(task_t corpse_task); +void task_purge_all_corpses(void); + #endif /* XNU_KERNEL_PRIVATE */ #endif /* _TASK_CORPSE_H_ */ diff --git a/osfmk/default_pager/Makefile b/osfmk/default_pager/Makefile index 45195d298..f7cad85e2 100644 --- a/osfmk/default_pager/Makefile +++ b/osfmk/default_pager/Makefile @@ -10,21 +10,18 @@ 
MIG_TYPES = \ default_pager_types.defs MIG_DEFS = \ - default_pager_alerts.defs MIG_USHDRS = \ MIG_UUHDRS = \ - default_pager_object.h - MIGINCLUDES = ${MIG_UUHDRS} ${MIG_USHDRS} DATAFILES = \ default_pager_types.h \ - ${MIG_DEFS} + ${MIG_DEFS} -INSTALL_MI_LIST = ${DATAFILES} +INSTALL_MI_LIST = ${DATAFILES} INSTALL_MI_GEN_LIST = ${MIGINCLUDES} @@ -42,26 +39,26 @@ ${MIGINCLUDES} : ${MIG_TYPES} ${MIG_UUHDRS} : \ %.h : %.defs - @echo MIG $@ + @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)" $(_v)$(MIG) $(MIGFLAGS) \ -server /dev/null \ -user /dev/null \ - -header $@ \ + -header $@ \ $< ${MIG_USHDRS} : \ %_server.h : %.defs - @echo MIG $@ + @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)" $(_v)$(MIG) $(MIGFLAGS) \ -server /dev/null \ -user /dev/null \ -header /dev/null \ - -sheader $@ \ + -sheader $@ \ $< # # Build path -# +# INCFLAGS_MAKEFILE= -I.. MIGKSFLAGS = -DMACH_KERNEL_PRIVATE -DKERNEL_SERVER=1 @@ -72,16 +69,12 @@ MIGKUFLAGS = -DMACH_KERNEL_PRIVATE -DKERNEL_USER=1 -maxonstack 1024 # level code. # MIG_KUHDRS = \ - default_pager_alerts.h MIG_KUSRC = \ - default_pager_alerts_user.c MIG_KSHDRS = \ - default_pager_object.h MIG_KSSRC = \ - default_pager_object_server.c # # JMM - @@ -98,7 +91,7 @@ MIG_KSSRC = \ # even require that as we move towards making all the environments look # the same. 
# -COMP_FILES = ${MIG_KUSRC} ${MIG_KSSRC} +COMP_FILES = ${MIG_KUSRC} ${MIG_KSSRC} do_build_all:: $(COMP_FILES) @@ -106,7 +99,7 @@ ${COMP_FILES} : ${MIG_TYPES} ${MIG_KUSRC} : \ %_user.c : %.defs - @echo MIG $@ + @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)" $(_v)${MIG} ${MIGFLAGS} ${MIGKUFLAGS} \ -user $*_user.c \ -header $*.h \ @@ -116,7 +109,7 @@ ${MIG_KUSRC} : \ ${MIG_KSSRC}: \ %_server.c : %.defs - @echo MIG $@ + @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)" $(_v)${MIG} ${MIGFLAGS} ${MIGKSFLAGS} \ -user /dev/null \ -header /dev/null \ diff --git a/osfmk/default_pager/default_pager.c b/osfmk/default_pager/default_pager.c deleted file mode 100644 index 71e998fe9..000000000 --- a/osfmk/default_pager/default_pager.c +++ /dev/null @@ -1,428 +0,0 @@ -/* - * Copyright (c) 2000-2010 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
- * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * Mach Operating System - * Copyright (c) 1991,1990,1989 Carnegie Mellon University - * All Rights Reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ - -/* - * Default pager. - * Threads management. - * Requests handling. - */ - -#include "default_pager_internal.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -char my_name[] = "(default pager): "; - -#if DEFAULT_PAGER_DEBUG -int debug_mask = 0; -#endif /* DEFAULT_PAGER_DEBUG */ - -/* - * Use 16 Kbyte stacks instead of the default 64K. - * Use 4 Kbyte waiting stacks instead of the default 8K. - */ - -vm_size_t cthread_stack_size = 16 *1024; -extern vm_size_t cthread_wait_stack_size; - -#ifndef MACH_KERNEL -unsigned long long vm_page_mask; -int vm_page_shift; -#endif - -boolean_t verbose; - -/* task_t default_pager_self; */ /* Our task port. 
*/ -lck_mtx_t dpt_lock; /* lock for the dpt array struct */ -default_pager_thread_t **dpt_array; - -memory_object_default_t default_pager_object; /* for memory_object_create. */ - -MACH_PORT_FACE default_pager_default_set; /* Port set for "default" thread. */ -MACH_PORT_FACE default_pager_internal_set; /* Port set for internal objects. */ -MACH_PORT_FACE default_pager_external_set; /* Port set for external objects. */ - -#define DEFAULT_PAGER_INTERNAL_COUNT (4) - - -/* Memory created by default_pager_object_create should mostly be resident. */ -#define DEFAULT_PAGER_EXTERNAL_COUNT (2) - -int default_pager_internal_count = DEFAULT_PAGER_INTERNAL_COUNT; -/* Number of "internal" threads. */ -int default_pager_external_count = DEFAULT_PAGER_EXTERNAL_COUNT; -/* Number of "external" threads. */ - -/* - * Forward declarations. - */ -boolean_t default_pager_notify_server(mach_msg_header_t *, - mach_msg_header_t *); -boolean_t default_pager_demux_object(mach_msg_header_t *, - mach_msg_header_t *); -boolean_t default_pager_demux_default(mach_msg_header_t *, - mach_msg_header_t *); -default_pager_thread_t *start_default_pager_thread(int, boolean_t); -void default_pager(void); -void default_pager_thread(void *); -void default_pager_initialize(void); -boolean_t dp_parse_argument(char *); /* forward; */ -unsigned int d_to_i(char *); /* forward; */ - -extern int vstruct_def_clshift; - -struct global_stats global_stats; - -/* - * Initialize and Run the default pager - */ -void -default_pager(void) -{ - int i, id; - __unused static char here[] = "default_pager"; - default_pager_thread_t dpt; - kern_return_t kr; - - - - /* - * Give me space for the thread array and zero it. - */ - i = default_pager_internal_count + default_pager_external_count + 1; - dpt_array = (default_pager_thread_t **) - kalloc(i * sizeof(default_pager_thread_t *)); - memset(dpt_array, 0, i * sizeof(default_pager_thread_t *)); - - /* Setup my thread structure. 
*/ - id = 0; - dpt.dpt_buffer = 0; - dpt.dpt_internal = FALSE; - dpt.dpt_initialized_p = TRUE; - dpt_array[0] = &dpt; - - /* - * Now we create the threads that will actually - * manage objects. - */ - - for (i = 0; i < default_pager_internal_count; i++) { - dpt_array[id] = (default_pager_thread_t *) - kalloc(sizeof (default_pager_thread_t)); - if (dpt_array[id] == NULL) - Panic("alloc pager thread"); - kr = vm_allocate(kernel_map, &((dpt_array[id])->dpt_buffer), - vm_page_size << vstruct_def_clshift, VM_FLAGS_ANYWHERE | VM_MAKE_TAG(VM_KERN_MEMORY_OSFMK)); - if (kr != KERN_SUCCESS) - Panic("alloc thread buffer"); - kr = vm_map_wire(kernel_map, (dpt_array[id])->dpt_buffer, - ((dpt_array[id])->dpt_buffer) - +(vm_page_size << vstruct_def_clshift), - VM_PROT_DEFAULT | VM_PROT_MEMORY_TAG_MAKE(VM_KERN_MEMORY_OSFMK), - FALSE); - if (kr != KERN_SUCCESS) - Panic("wire thread buffer"); - (dpt_array[id])->dpt_internal = TRUE; - (dpt_array[id])->dpt_initialized_p = TRUE; - (dpt_array[id])->checked_out = FALSE; - id++; - } - DPT_LOCK_INIT(dpt_lock); -} - - - - - - -/* simple utility: only works for 2^n */ -int -local_log2( - unsigned int n) -{ - register int i = 0; - - if(n == 0) return 0; - - while ((n & 1) == 0) { - i++; - n >>= 1; - } - return i; -} - - - - -/* another simple utility, d_to_i(char*) supporting only decimal - * and devoid of range checking; obscure name chosen deliberately - * to avoid confusion with semantic-rich POSIX routines */ -unsigned int -d_to_i(char * arg) -{ - unsigned int rval = 0; - char ch; - - while ((ch = *arg++) && ch >= '0' && ch <= '9') { - rval *= 10; - rval += ch - '0'; - } - return(rval); -} - - - - -/* - * Check for non-disk-partition arguments of the form - * attribute=argument - * returning TRUE if one if found - */ -boolean_t dp_parse_argument(char *av) -{ - char *rhs = av; - __unused static char here[] = "dp_parse_argument"; - - /* Check for '-v' flag */ - - if (av[0] == '-' && av[1] == 'v' && av[2] == 0) { - verbose = TRUE ; - return 
TRUE; - } - - /* - * If we find a '=' followed by an argument in the string, - * check for known arguments - */ - while (*rhs && *rhs != '=') - rhs++; - if (*rhs && *++rhs) { - /* clsize=N pages */ - if (strprefix(av,"cl")) { - if (!bs_set_default_clsize(d_to_i(rhs))) - dprintf(("Bad argument (%s) - ignored\n", av)); - return(TRUE); - } - /* else if strprefix(av,"another_argument")) { - handle_another_argument(av); - return(TRUE); - } */ - } - return(FALSE); -} - -int -start_def_pager( __unused char *bs_device ) -{ -/* - MACH_PORT_FACE master_device_port; -*/ -/* - MACH_PORT_FACE security_port; -*/ - __unused static char here[] = "main"; - - - - - /* setup read buffers, etc */ - default_pager_initialize(); - -#ifndef MACH_KERNEL - default_pager(); -#endif - - if (DEFAULT_PAGER_IS_ACTIVE) { - /* start the backing store monitor, it runs on a callout thread */ - default_pager_backing_store_monitor_callout = - thread_call_allocate(default_pager_backing_store_monitor, NULL); - if (!default_pager_backing_store_monitor_callout) - panic("can't start backing store monitor thread"); - thread_call_enter(default_pager_backing_store_monitor_callout); - } - - return (0); -} - -kern_return_t -default_pager_info( - memory_object_default_t pager, - default_pager_info_t *infop) -{ - uint64_t pages_total, pages_free; - - if (pager != default_pager_object) - return KERN_INVALID_ARGUMENT; - - bs_global_info(&pages_total, &pages_free); - - infop->dpi_total_space = (vm_size_t) ptoa_64(pages_total); - infop->dpi_free_space = (vm_size_t) ptoa_64(pages_free); - infop->dpi_page_size = vm_page_size; - - return KERN_SUCCESS; -} - - -kern_return_t -default_pager_info_64( - memory_object_default_t pager, - default_pager_info_64_t *infop) -{ - uint64_t pages_total, pages_free; - - if (pager != default_pager_object) - return KERN_INVALID_ARGUMENT; - - bs_global_info(&pages_total, &pages_free); - - infop->dpi_total_space = ptoa_64(pages_total); - infop->dpi_free_space = ptoa_64(pages_free); - 
infop->dpi_page_size = vm_page_size; - infop->dpi_flags = 0; - if (dp_encryption_inited && dp_encryption == TRUE) { - infop->dpi_flags |= DPI_ENCRYPTED; - } - - return KERN_SUCCESS; -} - -lck_grp_t default_pager_lck_grp; -lck_grp_attr_t default_pager_lck_grp_attr; -lck_attr_t default_pager_lck_attr; - - - -void -default_pager_initialize(void) -{ - kern_return_t kr; - __unused static char here[] = "default_pager_initialize"; - - lck_grp_attr_setdefault(&default_pager_lck_grp_attr); - lck_grp_init(&default_pager_lck_grp, "default_pager", &default_pager_lck_grp_attr); - lck_attr_setdefault(&default_pager_lck_attr); - - /* - * Vm variables. - */ -#ifndef MACH_KERNEL - vm_page_mask = vm_page_size - 1; - assert((unsigned int) vm_page_size == vm_page_size); - vm_page_shift = local_log2((unsigned int) vm_page_size); -#endif - - /* - * List of all vstructs. - */ - vstruct_zone = zinit(sizeof(struct vstruct), - 10000 * sizeof(struct vstruct), - 8192, "vstruct zone"); - zone_change(vstruct_zone, Z_CALLERACCT, FALSE); - zone_change(vstruct_zone, Z_NOENCRYPT, TRUE); - - VSL_LOCK_INIT(); - queue_init(&vstruct_list.vsl_queue); - vstruct_list.vsl_count = 0; - - VSTATS_LOCK_INIT(&global_stats.gs_lock); - - bs_initialize(); - - /* - * Exported DMM port. - */ - default_pager_object = ipc_port_alloc_kernel(); - - - /* - * Export pager interfaces. 
- */ -#ifdef USER_PAGER - if ((kr = netname_check_in(name_server_port, "UserPager", - default_pager_self, - default_pager_object)) - != KERN_SUCCESS) { - dprintf(("netname_check_in returned 0x%x\n", kr)); - exit(1); - } -#else /* USER_PAGER */ - { - unsigned int clsize; - memory_object_default_t dmm; - - dmm = default_pager_object; - assert((unsigned int) vm_page_size == vm_page_size); - clsize = ((unsigned int) vm_page_size << vstruct_def_clshift); - kr = host_default_memory_manager(host_priv_self(), &dmm, clsize); - if ((kr != KERN_SUCCESS) || - (dmm != MEMORY_OBJECT_DEFAULT_NULL)) - Panic("default memory manager"); - - } -#endif /* USER_PAGER */ - - -} - diff --git a/osfmk/default_pager/default_pager_internal.h b/osfmk/default_pager/default_pager_internal.h deleted file mode 100644 index 7ae7452a6..000000000 --- a/osfmk/default_pager/default_pager_internal.h +++ /dev/null @@ -1,790 +0,0 @@ -/* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * Mach Operating System - * Copyright (c) 1991,1990,1989 Carnegie Mellon University - * All Rights Reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ - -/* - * Default pager. - * General definitions. - */ - -#ifndef _DEFAULT_PAGER_INTERNAL_H_ -#define _DEFAULT_PAGER_INTERNAL_H_ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * Default option settings. 
- */ -#ifndef PARALLEL -#define PARALLEL 1 -#endif - -#ifndef CHECKSUM -#define CHECKSUM 0 -#endif - -#define MACH_PORT_FACE mach_port_t - -#if CONFIG_FREEZE -#define RECLAIM_SWAP 1 -#else -#define RECLAIM_SWAP 0 -#endif - -#define USE_PRECIOUS 0 - -#ifdef USER_PAGER -#define UP(stuff) stuff -#else /* USER_PAGER */ -#define UP(stuff) -#endif /* USER_PAGER */ - -#define dprintf(args) \ - do { \ - printf("%s[KERNEL]: ", my_name); \ - printf args; \ - } while (0) - -/* - * Debug. - */ -extern char my_name[]; - -#define DEFAULT_PAGER_DEBUG 0 - -#if DEFAULT_PAGER_DEBUG - -extern int debug_mask; -#define DEBUG_MSG_EXTERNAL 0x00000001 -#define DEBUG_MSG_INTERNAL 0x00000002 -#define DEBUG_MO_EXTERNAL 0x00000100 -#define DEBUG_MO_INTERNAL 0x00000200 -#define DEBUG_VS_EXTERNAL 0x00010000 -#define DEBUG_VS_INTERNAL 0x00020000 -#define DEBUG_BS_EXTERNAL 0x01000000 -#define DEBUG_BS_INTERNAL 0x02000000 - -#define DP_DEBUG(level, args) \ - do { \ - if (debug_mask & (level)) \ - dprintf(args); \ - } while (0) - -#define ASSERT(expr) \ - do { \ - if (!(expr)) \ -#ifndef MACH_KERNEL - panic("%s[%d]%s: assertion failed in %s line %d: %s",\ - my_name, dp_thread_id(), here, \ - __FILE__, __LINE__, # expr); \ -#else - panic("%s[KERNEL]: assertion failed in %s line %d: %s",\ - my_name, __FILE__, __LINE__, # expr); \ -#endif - } while (0) - -#else /* DEFAULT_PAGER_DEBUG */ - -#define DP_DEBUG(level, args) do {} while(0) -#define ASSERT(clause) do {} while(0) - -#endif /* DEFAULT_PAGER_DEBUG */ - -#ifndef MACH_KERNEL -extern char *mach_error_string(kern_return_t); -#endif - -#define PAGER_SUCCESS 0 -#define PAGER_FULL 1 -#define PAGER_ERROR 2 - -/* - * VM and IPC globals. 
- */ -#ifdef MACH_KERNEL -#define vm_page_size PAGE_SIZE -#define vm_page_mask PAGE_MASK -#define vm_page_shift PAGE_SHIFT -#else -extern vm_object_size_t vm_page_size; -extern unsigned long long vm_page_mask; -extern int vm_page_shift; -#endif - -#ifndef MACH_KERNEL -#define ptoa(p) ((p)*vm_page_size) -#define atop(a) ((a)/vm_page_size) -#endif -#define howmany(a,b) ((((a) % (b)) == 0) ? ((a) / (b)) : (((a) / (b)) + 1)) - -extern memory_object_default_t default_pager_object; - -#ifdef MACH_KERNEL -extern lck_mtx_t dpt_lock; /* Lock for the dpt array */ -extern int default_pager_internal_count; -extern MACH_PORT_FACE default_pager_host_port; -/* extern task_t default_pager_self; */ /* dont need or want */ -extern MACH_PORT_FACE default_pager_internal_set; -extern MACH_PORT_FACE default_pager_external_set; -extern MACH_PORT_FACE default_pager_default_set; -#else -extern mach_port_t default_pager_host_port; -extern task_port_t default_pager_self; -extern mach_port_t default_pager_internal_set; -extern mach_port_t default_pager_external_set; -extern mach_port_t default_pager_default_set; -#endif - -typedef vm32_offset_t dp_offset_t; -typedef vm32_size_t dp_size_t; -typedef vm32_address_t dp_address_t; - -typedef struct default_pager_thread { -#ifndef MACH_KERNEL - cthread_t dpt_thread; /* Server thread. */ -#endif - vm_offset_t dpt_buffer; /* Read buffer. */ - boolean_t dpt_internal; /* Do we handle internal objects? */ -#ifndef MACH_KERNEL - int dpt_id; /* thread id for printf */ -#else - int checked_out; -#endif - boolean_t dpt_initialized_p; /* Thread is ready for requests. */ -} default_pager_thread_t; - -#ifdef MACH_KERNEL -extern default_pager_thread_t **dpt_array; -#endif - -/* - * Global statistics. 
- */ -struct global_stats { - unsigned int gs_pageout_calls; /* # pageout calls */ - unsigned int gs_pagein_calls; /* # pagein calls */ - unsigned int gs_pages_in; /* # pages paged in (total) */ - unsigned int gs_pages_out; /* # pages paged out (total) */ - unsigned int gs_pages_unavail; /* # zero-fill pages */ - unsigned int gs_pages_init; /* # page init requests */ - unsigned int gs_pages_init_writes; /* # page init writes */ - VSTATS_LOCK_DECL(gs_lock) -}; -extern struct global_stats global_stats; -#define GSTAT(clause) VSTATS_ACTION(&global_stats.gs_lock, (clause)) - -/* - * Cluster related definitions. - * Clusters are sized in number of pages per cluster. - * Cluster sizes must be powers of two. - * - * These numbers are related to the struct vs_map, - * defined below. - */ -#define MAX_CLUSTER_SIZE 8 -#define MAX_CLUSTER_SHIFT 3 -#define NO_CLSIZE 0 - -/* - * bit map related macros - */ -#define NBBY 8 /* bits per byte XXX */ -#define BYTEMASK 0xff -#define setbit(a,i) (*(((char *)(a)) + ((i)/NBBY)) |= 1<<((i)%NBBY)) -#define clrbit(a,i) (*(((char *)(a)) + ((i)/NBBY)) &= ~(1<<((i)%NBBY))) -#define isset(a,i) (*(((char *)(a)) + ((i)/NBBY)) & (1<<((i)%NBBY))) -#define isclr(a,i) ((*(((char *)(a)) + ((i)/NBBY)) & (1<<((i)%NBBY))) == 0) - -/* - * Default Pager. - * Backing Store Management. - */ - -#define BS_MAXPRI 4 -#define BS_MINPRI 0 -#define BS_NOPRI -1 -#define BS_FULLPRI -2 - -/* - * Quick way to access the emergency segment backing store structures - * without a full-blown search. - */ -extern MACH_PORT_FACE emergency_segment_backing_store; - -/* - * Mapping between backing store port and backing store object. 
- */ -struct backing_store { - queue_chain_t bs_links; /* link in backing_store_list */ - lck_mtx_t bs_lock; /* lock for the structure */ - MACH_PORT_FACE bs_port; /* backing store port */ - int bs_priority; - int bs_clsize; /* cluster size in pages */ - - /* statistics */ - unsigned int bs_pages_free; /* # unallocated pages */ - unsigned int bs_pages_total; /* # pages (total) */ - unsigned int bs_pages_in; /* # page read requests */ - unsigned int bs_pages_in_fail; /* # page read errors */ - unsigned int bs_pages_out; /* # page write requests */ - unsigned int bs_pages_out_fail; /* # page write errors */ -}; -typedef struct backing_store *backing_store_t; -#define BACKING_STORE_NULL ((backing_store_t) 0) -#define BS_STAT(bs, clause) VSTATS_ACTION(&(bs)->bs_lock, (clause)) - -#ifdef MACH_KERNEL -#define BS_LOCK_INIT(bs) lck_mtx_init(&(bs)->bs_lock, &default_pager_lck_grp, &default_pager_lck_attr) -#define BS_LOCK_DESTROY(bs) lck_mtx_destroy(&(bs)->bs_lock, &default_pager_lck_grp) -#define BS_LOCK(bs) lck_mtx_lock(&(bs)->bs_lock) -#define BS_UNLOCK(bs) lck_mtx_unlock(&(bs)->bs_lock) - -struct backing_store_list_head { - queue_head_t bsl_queue; - lck_mtx_t bsl_lock; -#endif -}; -extern struct backing_store_list_head backing_store_list; -extern int backing_store_release_trigger_disable; - -#define BSL_LOCK_INIT() lck_mtx_init(&backing_store_list.bsl_lock, &default_pager_lck_grp, &default_pager_lck_attr) -#define BSL_LOCK_DESTROY() lck_mtx_destroy(&backing_store_list.bsl_lock, &default_pager_lck_grp) -#define BSL_LOCK() lck_mtx_lock(&backing_store_list.bsl_lock) -#define BSL_UNLOCK() lck_mtx_unlock(&backing_store_list.bsl_lock) - -/* - * Paging segment management. - * Controls allocation of blocks within paging area. 
- */ -struct paging_segment { - /* device management */ - union { - MACH_PORT_FACE dev; /* Port to device */ - struct vnode *vnode; /* vnode for bs file */ - } storage_type; - unsigned int ps_segtype; /* file type or partition */ - MACH_PORT_FACE ps_device; /* Port to device */ - dp_offset_t ps_offset; /* Offset of segment within device */ - dp_offset_t ps_recnum; /* Number of device records in segment*/ - unsigned int ps_pgnum; /* Number of pages in segment */ - unsigned int ps_record_shift;/* Bit shift: pages to device records */ - - /* clusters and pages */ - unsigned int ps_clshift; /* Bit shift: clusters to pages */ - unsigned int ps_ncls; /* Number of clusters in segment */ - unsigned int ps_clcount; /* Number of free clusters */ - unsigned int ps_pgcount; /* Number of free pages */ - unsigned int ps_hint; /* Hint of where to look next. */ - unsigned int ps_special_clusters; /* Clusters that might come in while we've - * released the locks doing a ps_delete. - */ - - /* bitmap */ - lck_mtx_t ps_lock; /* Lock for contents of struct */ - unsigned char *ps_bmap; /* Map of used clusters */ - - /* backing store */ - backing_store_t ps_bs; /* Backing store segment belongs to */ -#define PS_CAN_USE 0x1 -#define PS_GOING_AWAY 0x2 -#define PS_EMERGENCY_SEGMENT 0x4 - unsigned int ps_state; -}; - -#define IS_PS_OK_TO_USE(ps) ((ps->ps_state & PS_CAN_USE) == PS_CAN_USE) -#define IS_PS_GOING_AWAY(ps) ((ps->ps_state & PS_GOING_AWAY) == PS_GOING_AWAY) -#define IS_PS_EMERGENCY_SEGMENT(ps) ((ps->ps_state & PS_EMERGENCY_SEGMENT) == PS_EMERGENCY_SEGMENT) - -#define ps_vnode storage_type.vnode -#define ps_device storage_type.dev -#define PS_PARTITION 1 -#define PS_FILE 2 - -typedef struct paging_segment *paging_segment_t; - -#define PAGING_SEGMENT_NULL ((paging_segment_t) 0) - -#define PS_LOCK_INIT(ps) lck_mtx_init(&(ps)->ps_lock, &default_pager_lck_grp, &default_pager_lck_attr) -#define PS_LOCK_DESTROY(ps) lck_mtx_destroy(&(ps)->ps_lock, &default_pager_lck_grp) -#define 
PS_LOCK(ps) lck_mtx_lock(&(ps)->ps_lock) -#define PS_UNLOCK(ps) lck_mtx_unlock(&(ps)->ps_lock) - -typedef unsigned int pseg_index_t; - -#define INVALID_PSEG_INDEX ((pseg_index_t)-1) -#define EMERGENCY_PSEG_INDEX ((pseg_index_t) 0) -/* - * MAX_PSEG_INDEX value is related to struct vs_map below. - * "0" is reserved for empty map entries (no segment). - */ -#define MAX_PSEG_INDEX 63 /* 0 is reserved for empty map */ -#define MAX_NUM_PAGING_SEGMENTS MAX_PSEG_INDEX - -/* paging segments array */ -extern paging_segment_t paging_segments[MAX_NUM_PAGING_SEGMENTS]; -extern lck_mtx_t paging_segments_lock; -extern int paging_segment_count; /* number of active paging segments */ -extern int paging_segment_max; /* highest used paging segment index */ -extern int ps_select_array[DEFAULT_PAGER_BACKING_STORE_MAXPRI+1]; - -#define PSL_LOCK_INIT() lck_mtx_init(&paging_segments_lock, &default_pager_lck_grp, &default_pager_lck_attr) -#define PSL_LOCK_DESTROY() lck_mtx_destroy(&paging_segments_lock, &default_pager_lck_grp) -#define PSL_LOCK() lck_mtx_lock(&paging_segments_lock) -#define PSL_UNLOCK() lck_mtx_unlock(&paging_segments_lock) - -/* - * Vstruct manipulation. The vstruct is the pager's internal - * representation of vm objects it manages. There is one vstruct allocated - * per vm object. - * - * The following data structures are defined for vstruct and vm object - * management. - */ - -/* - * vs_map - * A structure used only for temporary objects. It is the element - * contained in the vs_clmap structure, which contains information - * about which clusters and pages in an object are present on backing - * store (a paging file). - * Note that this structure and its associated constants may change - * with minimal impact on code. The only function which knows the - * internals of this structure is ps_clmap(). - * - * If it is necessary to change the maximum number of paging segments - * or pages in a cluster, then this structure is the one most - * affected. 
The constants and structures which *may* change are: - * MAX_CLUSTER_SIZE - * MAX_CLUSTER_SHIFT - * MAX_NUM_PAGING_SEGMENTS - * VSTRUCT_DEF_CLSHIFT - * struct vs_map and associated macros and constants (VSM_*) - * (only the macro definitions need change, the exported (inside the - * pager only) interfaces remain the same; the constants are for - * internal vs_map manipulation only). - * struct clbmap (below). - */ -struct vs_map { - unsigned int vsmap_entry:23, /* offset in paging segment */ - vsmap_psindex:8, /* paging segment */ - vsmap_error:1, - vsmap_bmap:16, - vsmap_alloc:16; -}; - -typedef struct vs_map *vs_map_t; - - -#define VSM_ENTRY_NULL 0x7fffff - -/* - * Exported macros for manipulating the vs_map structure -- - * checking status, getting and setting bits. - */ -#define VSCLSIZE(vs) (1U << (vs)->vs_clshift) -#define VSM_ISCLR(vsm) (((vsm).vsmap_entry == VSM_ENTRY_NULL) && \ - ((vsm).vsmap_error == 0)) -#define VSM_ISERR(vsm) ((vsm).vsmap_error) -#define VSM_SETCLOFF(vsm, val) ((vsm).vsmap_entry = (val)) -#define VSM_SETERR(vsm, err) ((vsm).vsmap_error = 1, \ - (vsm).vsmap_entry = (err)) -#define VSM_GETERR(vsm) ((vsm).vsmap_entry) -#define VSM_SETPG(vsm, page) ((vsm).vsmap_bmap |= (1 << (page))) -#define VSM_CLRPG(vsm, page) ((vsm).vsmap_bmap &= ~(1 << (page))) -#define VSM_SETPS(vsm, psindx) ((vsm).vsmap_psindex = (psindx)) -#define VSM_PSINDEX(vsm) ((vsm).vsmap_psindex) -#define VSM_PS(vsm) paging_segments[(vsm).vsmap_psindex] -#define VSM_BMAP(vsm) ((vsm).vsmap_bmap) -#define VSM_CLOFF(vsm) ((vsm).vsmap_entry) -#define VSM_CLR(vsm) ((vsm).vsmap_entry = VSM_ENTRY_NULL, \ - (vsm).vsmap_psindex = 0, \ - (vsm).vsmap_error = 0, \ - (vsm).vsmap_bmap = 0, \ - (vsm).vsmap_alloc = 0) -#define VSM_ALLOC(vsm) ((vsm).vsmap_alloc) -#define VSM_SETALLOC(vsm, page) ((vsm).vsmap_alloc |= (1 << (page))) -#define VSM_CLRALLOC(vsm, page) ((vsm).vsmap_alloc &= ~(1 << (page))) - -/* - * Constants and macros for dealing with vstruct maps, - * which comprise vs_map 
structures, which - * map vm objects to backing storage (paging files and clusters). - */ -#define CLMAP_THRESHOLD 512 /* bytes */ -#define CLMAP_ENTRIES (CLMAP_THRESHOLD/(int)sizeof(struct vs_map)) -#define CLMAP_SIZE(ncls) (ncls*(int)sizeof(struct vs_map)) - -#define INDIRECT_CLMAP_ENTRIES(ncls) (((ncls-1)/CLMAP_ENTRIES) + 1) -#define INDIRECT_CLMAP_SIZE(ncls) (INDIRECT_CLMAP_ENTRIES(ncls) * (int)sizeof(struct vs_map *)) -#define INDIRECT_CLMAP(size) (CLMAP_SIZE(size) > CLMAP_THRESHOLD) - -#define RMAPSIZE(blocks) (howmany(blocks,NBBY)) - -#define CL_FIND 1 -#define CL_ALLOC 2 - -/* - * clmap - * - * A cluster map returned by ps_clmap. It is an abstracted cluster of - * pages. It gives the caller information about the cluster - * desired. On read it tells the caller if a cluster is mapped, and if so, - * which of its pages are valid. It should not be referenced directly, - * except by ps_clmap; macros should be used. If the number of pages - * in a cluster needs to be more than 32, then the struct clbmap must - * become larger. 
- */ -struct clbmap { - unsigned int clb_map; -}; - -struct clmap { - paging_segment_t cl_ps; /* paging segment backing cluster */ - int cl_numpages; /* number of valid pages */ - struct clbmap cl_bmap; /* map of pages in cluster */ - int cl_error; /* cluster error value */ - struct clbmap cl_alloc; /* map of allocated pages in cluster */ -}; - -#define CLMAP_ERROR(clm) (clm).cl_error -#define CLMAP_PS(clm) (clm).cl_ps -#define CLMAP_NPGS(clm) (clm).cl_numpages -#define CLMAP_ISSET(clm,i) ((1<<(i))&((clm).cl_bmap.clb_map)) -#define CLMAP_ALLOC(clm) (clm).cl_alloc.clb_map -/* - * Shift off unused bits in a partial cluster - */ -#define CLMAP_SHIFT(clm,vs) \ - (clm)->cl_bmap.clb_map >>= (VSCLSIZE(vs) - (clm)->cl_numpages) -#define CLMAP_SHIFTALLOC(clm,vs) \ - (clm)->cl_alloc.clb_map >>= (VSCLSIZE(vs) - (clm)->cl_numpages) - -typedef struct vstruct_alias { - memory_object_pager_ops_t name; - struct vstruct *vs; -} vstruct_alias_t; - -#define DPT_LOCK_INIT(lock) lck_mtx_init(&(lock), &default_pager_lck_grp, &default_pager_lck_attr) -#define DPT_LOCK_DESTROY(lock) lck_mtx_destroy(&(lock), &default_pager_lck_grp) -#define DPT_LOCK(lock) lck_mtx_lock(&(lock)) -#define DPT_UNLOCK(lock) lck_mtx_unlock(&(lock)) -#define DPT_SLEEP(lock, e, i) lck_mtx_sleep(&(lock), LCK_SLEEP_DEFAULT, (event_t)(e), i) -#define VS_LOCK_TYPE hw_lock_data_t -#define VS_LOCK_INIT(vs) hw_lock_init(&(vs)->vs_lock) -#define VS_TRY_LOCK(vs) (VS_LOCK(vs),TRUE) -#define VS_LOCK(vs) hw_lock_lock(&(vs)->vs_lock) -#define VS_UNLOCK(vs) hw_lock_unlock(&(vs)->vs_lock) -#define VS_MAP_LOCK_TYPE lck_mtx_t -#define VS_MAP_LOCK_INIT(vs) lck_mtx_init(&(vs)->vs_map_lock, &default_pager_lck_grp, &default_pager_lck_attr) -#define VS_MAP_LOCK_DESTROY(vs) lck_mtx_destroy(&(vs)->vs_map_lock, &default_pager_lck_grp) -#define VS_MAP_LOCK(vs) lck_mtx_lock(&(vs)->vs_map_lock) -#define VS_MAP_TRY_LOCK(vs) lck_mtx_try_lock(&(vs)->vs_map_lock) -#define VS_MAP_UNLOCK(vs) lck_mtx_unlock(&(vs)->vs_map_lock) - - -/* - * VM Object 
Structure: This is the structure used to manage - * default pager object associations with their control counter- - * parts (VM objects). - * - * The start of this structure MUST match a "struct memory_object". - */ -typedef struct vstruct { - struct ipc_object_header vs_pager_header; /* fake ip_kotype() */ - memory_object_pager_ops_t vs_pager_ops; /* == &default_pager_ops */ - memory_object_control_t vs_control; /* our mem obj control ref */ - VS_LOCK_TYPE vs_lock; /* data for the lock */ - - /* JMM - Could combine these first two in a single pending count now */ - unsigned int vs_next_seqno; /* next sequence num to issue */ - unsigned int vs_seqno; /* Pager port sequence number */ - unsigned int vs_readers; /* Reads in progress */ - unsigned int vs_writers; /* Writes in progress */ - - unsigned int - /* boolean_t */ vs_waiting_seqno:1, /* to wait on seqno */ - /* boolean_t */ vs_waiting_read:1, /* waiting on reader? */ - /* boolean_t */ vs_waiting_write:1, /* waiting on writer? */ - /* boolean_t */ vs_waiting_async:1, /* waiting on async? */ - /* boolean_t */ vs_indirect:1, /* map indirect? */ - /* boolean_t */ vs_xfer_pending:1; /* xfer out of seg? 
*/ - - unsigned int vs_async_pending;/* pending async write count */ - unsigned int vs_errors; /* Pageout error count */ - unsigned int vs_references; /* references */ - - queue_chain_t vs_links; /* Link in pager-wide list */ - - unsigned int vs_clshift; /* Bit shift: clusters->pages */ - unsigned int vs_size; /* Object size in clusters */ - lck_mtx_t vs_map_lock; /* to protect map below */ - union { - struct vs_map *vsu_dmap; /* Direct map of clusters */ - struct vs_map **vsu_imap; /* Indirect map of clusters */ - } vs_un; -} *vstruct_t; - -#define vs_dmap vs_un.vsu_dmap -#define vs_imap vs_un.vsu_imap - -#define VSTRUCT_NULL ((vstruct_t) 0) - -__private_extern__ void vs_async_wait(vstruct_t); - -#if PARALLEL -__private_extern__ void vs_lock(vstruct_t); -__private_extern__ void vs_unlock(vstruct_t); -__private_extern__ void vs_start_read(vstruct_t); -__private_extern__ void vs_finish_read(vstruct_t); -__private_extern__ void vs_wait_for_readers(vstruct_t); -__private_extern__ void vs_start_write(vstruct_t); -__private_extern__ void vs_finish_write(vstruct_t); -__private_extern__ void vs_wait_for_writers(vstruct_t); -__private_extern__ void vs_wait_for_sync_writers(vstruct_t); -#else /* PARALLEL */ -#define vs_lock(vs) -#define vs_unlock(vs) -#define vs_start_read(vs) -#define vs_wait_for_readers(vs) -#define vs_finish_read(vs) -#define vs_start_write(vs) -#define vs_wait_for_writers(vs) -#define vs_wait_for_sync_writers(vs) -#define vs_finish_write(vs) -#endif /* PARALLEL */ - -/* - * Data structures and variables dealing with asynchronous - * completion of paging operations. - */ -/* - * vs_async - * A structure passed to ps_write_device for asynchronous completions. - * It contains enough information to complete the write and - * inform the VM of its completion. 
- */ -struct vs_async { - struct vs_async *vsa_next; /* pointer to next structure */ - vstruct_t vsa_vs; /* the vstruct for the object */ - vm_offset_t vsa_addr; /* the vaddr of the data moved */ - vm_offset_t vsa_offset; /* the object offset of the data */ - vm_size_t vsa_size; /* the number of bytes moved */ - paging_segment_t vsa_ps; /* the paging segment used */ - int vsa_flags; /* flags */ - int vsa_error; /* error, if there is one */ - MACH_PORT_FACE reply_port; /* associated reply port */ -}; - -/* - * flags values. - */ -#define VSA_READ 0x0001 -#define VSA_WRITE 0x0002 -#define VSA_TRANSFER 0x0004 - -/* - * List of all vstructs. A specific vstruct is - * found directly via its port, this list is - * only used for monitoring purposes by the - * default_pager_object* calls - */ -struct vstruct_list_head { - queue_head_t vsl_queue; - lck_mtx_t vsl_lock; - int vsl_count; /* saves code */ -}; - -extern struct vstruct_list_head vstruct_list; - -__private_extern__ void vstruct_list_insert(vstruct_t vs); -__private_extern__ void vstruct_list_delete(vstruct_t vs); - - -extern lck_grp_t default_pager_lck_grp; -extern lck_attr_t default_pager_lck_attr; - -#define VSL_LOCK_INIT() lck_mtx_init(&vstruct_list.vsl_lock, &default_pager_lck_grp, &default_pager_lck_attr) -#define VSL_LOCK_DESTROY() lck_mtx_destroy(&vstruct_list.vsl_lock, &default_pager_lck_grp) -#define VSL_LOCK() lck_mtx_lock(&vstruct_list.vsl_lock) -#define VSL_LOCK_TRY() lck_mtx_try_lock(&vstruct_list.vsl_lock) -#define VSL_UNLOCK() lck_mtx_unlock(&vstruct_list.vsl_lock) -#define VSL_SLEEP(e,i) lck_mtx_sleep(&vstruct_list.vsl_lock, LCK_SLEEP_DEFAULT, (e), (i)) - -#ifdef MACH_KERNEL -extern zone_t vstruct_zone; -#endif - -/* - * Create port alias for vstruct address. - * - * We assume that the last two bits of a vstruct address will be zero due to - * memory allocation restrictions, hence are available for use as a sanity - * check. 
- */ -#ifdef MACH_KERNEL - -extern const struct memory_object_pager_ops default_pager_ops; - -#define mem_obj_is_vs(_mem_obj_) \ - (((_mem_obj_) != NULL) && \ - ((_mem_obj_)->mo_pager_ops == &default_pager_ops)) -#define mem_obj_to_vs(_mem_obj_) \ - ((vstruct_t)(_mem_obj_)) -#define vs_to_mem_obj(_vs_) ((memory_object_t)(_vs_)) -#define vs_lookup(_mem_obj_, _vs_) \ - do { \ - if (!mem_obj_is_vs(_mem_obj_)) \ - panic("bad dp memory object"); \ - _vs_ = mem_obj_to_vs(_mem_obj_); \ - } while (0) -#define vs_lookup_safe(_mem_obj_, _vs_) \ - do { \ - if (!mem_obj_is_vs(_mem_obj_)) \ - _vs_ = VSTRUCT_NULL; \ - else \ - _vs_ = mem_obj_to_vs(_mem_obj_); \ - } while (0) -#else - -#define vs_to_port(_vs_) (((vm_offset_t)(_vs_))+1) -#define port_to_vs(_port_) ((vstruct_t)(((vm_offset_t)(_port_))&~3)) -#define port_is_vs(_port_) ((((vm_offset_t)(_port_))&3) == 1) - -#define vs_lookup(_port_, _vs_) \ - do { \ - if (!MACH_PORT_VALID(_port_) || !port_is_vs(_port_) \ - || port_to_vs(_port_)->vs_mem_obj != (_port_)) \ - Panic("bad pager port"); \ - _vs_ = port_to_vs(_port_); \ - } while (0) -#endif - -/* - * Cross-module routines declaration. 
- */ -#ifndef MACH_KERNEL -extern int dp_thread_id(void); -#endif -extern boolean_t device_reply_server(mach_msg_header_t *, - mach_msg_header_t *); -#ifdef MACH_KERNEL -extern boolean_t default_pager_no_senders(memory_object_t, - mach_port_mscount_t); -#else -extern void default_pager_no_senders(memory_object_t, - mach_port_seqno_t, - mach_port_mscount_t); -#endif - -extern int local_log2(unsigned int); -extern void bs_initialize(void); -extern void bs_global_info(uint64_t *, - uint64_t *); -extern boolean_t bs_add_device(char *, - MACH_PORT_FACE); -extern vstruct_t ps_vstruct_create(dp_size_t); -extern void ps_vstruct_dealloc(vstruct_t); -extern kern_return_t ps_vstruct_reclaim(vstruct_t, - boolean_t, - boolean_t); -extern kern_return_t pvs_cluster_read(vstruct_t, - dp_offset_t, - dp_size_t, - void *); -extern kern_return_t vs_cluster_write(vstruct_t, - upl_t, - upl_offset_t, - upl_size_t, - boolean_t, - int); -extern dp_offset_t ps_clmap(vstruct_t, - dp_offset_t, - struct clmap *, - int, - dp_size_t, - int); -extern vm_size_t ps_vstruct_allocated_size(vstruct_t); -extern unsigned int ps_vstruct_allocated_pages(vstruct_t, - default_pager_page_t *, - unsigned int); -extern boolean_t bs_set_default_clsize(unsigned int); - -extern boolean_t verbose; - -extern thread_call_t default_pager_backing_store_monitor_callout; -extern void default_pager_backing_store_monitor(thread_call_param_t, thread_call_param_t); - -extern ipc_port_t max_pages_trigger_port; -extern unsigned int dp_pages_free; -extern unsigned int maximum_pages_free; - -/* Do we know yet if swap files need to be encrypted ? */ -extern boolean_t dp_encryption_inited; -/* Should we encrypt data before writing to swap ? 
*/ -extern boolean_t dp_encryption; - -extern boolean_t dp_isssd; - -#endif /* _DEFAULT_PAGER_INTERNAL_H_ */ diff --git a/osfmk/default_pager/default_pager_object.defs b/osfmk/default_pager/default_pager_object.defs deleted file mode 100644 index 18b84c8a8..000000000 --- a/osfmk/default_pager/default_pager_object.defs +++ /dev/null @@ -1,146 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * Mach Operating System - * Copyright (c) 1991,1990,1989 Carnegie Mellon University - * All Rights Reserved. 
- * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ -/* - */ -/* - * File: mach/default_pager_object.defs - * - */ - -subsystem -#if KERNEL_USER - KernelUser -#endif /* KERNEL_USER */ -#if KERNEL_SERVER - KernelServer -#endif /* KERNEL_SERVER */ - default_pager_object 2275; - -#include -#include -#include - -type vnode_ptr_t = array[1] of int; - -routine default_pager_object_create( - default_pager : mach_port_t; - object_size : vm_size_t; - out memory_object : memory_object_t); - -routine default_pager_info( - default_pager : mach_port_t; - out info : default_pager_info_t); - -routine default_pager_objects( - default_pager : mach_port_t; - out objects : default_pager_object_array_t, - Dealloc; - out ports : mach_port_array_t = - array[] of mach_port_move_send_t, - Dealloc); - -routine default_pager_object_pages( - default_pager : mach_port_t; - memory_object : memory_object_name_t; - out pages : default_pager_page_array_t, - Dealloc); - -skip; /* default_pager_paging_file */ - -routine default_pager_backing_store_create( - default_pager : mach_port_t; - in priority : int; - in clsize : int; - out 
backing_store : mach_port_t = - MACH_MSG_TYPE_MAKE_SEND); - -routine default_pager_backing_store_delete( - backing_store : mach_port_t); - - -#ifdef PAGE_TO_DEVICE -routine default_pager_add_segment( - backing_store : mach_port_t; - in device : mach_port_t; - in offset : recnum_t; - in count : recnum_t; - in record_size : int); -#endif - -routine default_pager_backing_store_info( - backing_store : mach_port_t; - flavor : backing_store_flavor_t; - out info : backing_store_info_t, CountInOut); - -routine default_pager_add_file( - backing_store : mach_port_t; - in vnode : vnode_ptr_t; - in record_size : int; - in size : vm_size_t); - - -routine default_pager_triggers( - default_pager : mach_port_t; - in hi_wat : int; - in lo_wat : int; - in flags : int; - in trigger_port : mach_port_t); - -routine default_pager_info_64( - default_pager : mach_port_t; - out info : default_pager_info_64_t); - -/* vim: set ft=c : */ diff --git a/osfmk/default_pager/default_pager_types.h b/osfmk/default_pager/default_pager_types.h index 8bd35dc36..8ad6fedd4 100644 --- a/osfmk/default_pager/default_pager_types.h +++ b/osfmk/default_pager/default_pager_types.h @@ -41,71 +41,6 @@ #include #include -typedef memory_object_default_t default_pager_t; - -/* - * Remember to update the mig type definitions - * in default_pager_types.defs when adding/removing fields. 
- */ - -typedef struct default_pager_info { - vm_size_t dpi_total_space; /* size of backing store */ - vm_size_t dpi_free_space; /* how much of it is unused */ - vm_size_t dpi_page_size; /* the pager's vm page size */ -} default_pager_info_t; - -typedef struct default_pager_info_64 { - memory_object_size_t dpi_total_space; /* size of backing store */ - memory_object_size_t dpi_free_space; /* how much of it is unused */ - vm_size_t dpi_page_size; /* the pager's vm page size */ - int dpi_flags; -#define DPI_ENCRYPTED 0x1 /* swap files are encrypted */ -} default_pager_info_64_t; - -typedef integer_t *backing_store_info_t; -typedef int backing_store_flavor_t; -typedef int *vnode_ptr_t; - -#define BACKING_STORE_BASIC_INFO 1 -#define BACKING_STORE_BASIC_INFO_COUNT \ - (sizeof(struct backing_store_basic_info)/sizeof(integer_t)) -struct backing_store_basic_info { - natural_t pageout_calls; /* # pageout calls */ - natural_t pagein_calls; /* # pagein calls */ - natural_t pages_in; /* # pages paged in (total) */ - natural_t pages_out; /* # pages paged out (total) */ - natural_t pages_unavail; /* # zero-fill pages */ - natural_t pages_init; /* # page init requests */ - natural_t pages_init_writes; /* # page init writes */ - - natural_t bs_pages_total; /* # pages (total) */ - natural_t bs_pages_free; /* # unallocated pages */ - natural_t bs_pages_in; /* # page read requests */ - natural_t bs_pages_in_fail; /* # page read errors */ - natural_t bs_pages_out; /* # page write requests */ - natural_t bs_pages_out_fail; /* # page write errors */ - - integer_t bs_priority; - integer_t bs_clsize; -}; -typedef struct backing_store_basic_info *backing_store_basic_info_t; - - -typedef struct default_pager_object { - vm_offset_t dpo_object; /* object managed by the pager */ - vm_size_t dpo_size; /* backing store used for the object */ -} default_pager_object_t; - -typedef default_pager_object_t *default_pager_object_array_t; - -typedef struct default_pager_page { - vm_offset_t dpp_offset; 
/* offset of the page in its object */ -} default_pager_page_t; - -typedef default_pager_page_t *default_pager_page_array_t; - -#define DEFAULT_PAGER_BACKING_STORE_MAXPRI 4 - #define HI_WAT_ALERT 0x01 #define LO_WAT_ALERT 0x02 #define SWAP_ENCRYPT_ON 0x04 diff --git a/osfmk/default_pager/diag.h b/osfmk/default_pager/diag.h deleted file mode 100644 index de307fdc5..000000000 --- a/osfmk/default_pager/diag.h +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ - -#ifndef MACH_KERNEL -#ifdef ASSERTIONS -#define assert(cond) \ - ((void) ((cond) ? 
0 : panic("%sassertion: %s", my_name, # cond))) -#endif -#ifndef ASSERTIONS -#define assert(cond) do {} while(0) -#endif -#endif - -#ifndef MACH_KERNEL -#define Panic(aargh) panic("%s[%d]: %s", my_name, dp_thread_id(), aargh) -#else -#define Panic(aargh) panic("%s[KERNEL]: %s", my_name, aargh) -#endif - -#define VSTATS_ACTION(l, stmt) \ - do { VSTATS_LOCK(l); stmt; VSTATS_UNLOCK(l); } while (0) - -#if !defined(VAGUE_STATS) || (VAGUE_STATS > 0) -#define VSTATS_LOCK_DECL(name) -#define VSTATS_LOCK(l) -#define VSTATS_UNLOCK(l) -#define VSTATS_LOCK_INIT(l) -#else - -extern lck_grp_t default_pager_lck_grp; -extern lck_attr_t default_pager_lck_attr; - - -#define VSTATS_LOCK_DECL(name) struct lck_mtx_t name; -#define VSTATS_LOCK(l) lck_mtx_lock(l) -#define VSTATS_UNLOCK(l) lck_mtx_unlock(l) -#define VSTATS_LOCK_INIT(l) lck_mtx_init(l, &default_pager_lck_grp, &default_pager_lck_attr ) -#endif /* VAGUE_STATS */ - diff --git a/osfmk/default_pager/dp_backing_store.c b/osfmk/default_pager/dp_backing_store.c deleted file mode 100644 index 819af3cdc..000000000 --- a/osfmk/default_pager/dp_backing_store.c +++ /dev/null @@ -1,4634 +0,0 @@ -/* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * Mach Operating System - * Copyright (c) 1991,1990,1989 Carnegie Mellon University - * All Rights Reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ - -/* - * Default Pager. - * Paging File Management. - */ - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - - -/* todo - need large internal object support */ - -/* - * ALLOC_STRIDE... 
the maximum number of bytes allocated from - * a swap file before moving on to the next swap file... if - * all swap files reside on a single disk, this value should - * be very large (this is the default assumption)... if the - * swap files are spread across multiple disks, than this value - * should be small (128 * 1024)... - * - * This should be determined dynamically in the future - */ - -#define ALLOC_STRIDE (1024 * 1024 * 1024) -int physical_transfer_cluster_count = 0; - -#define VM_SUPER_CLUSTER 0x40000 -#define VM_SUPER_PAGES (VM_SUPER_CLUSTER / PAGE_MIN_SIZE) - -/* - * 0 means no shift to pages, so == 1 page/cluster. 1 would mean - * 2 pages/cluster, 2 means 4 pages/cluster, and so on. - */ -#define VSTRUCT_MIN_CLSHIFT 0 - -#define VSTRUCT_DEF_CLSHIFT 2 -int default_pager_clsize = 0; - -int vstruct_def_clshift = VSTRUCT_DEF_CLSHIFT; - -/* statistics */ -unsigned int clustered_writes[VM_SUPER_PAGES+1]; -unsigned int clustered_reads[VM_SUPER_PAGES+1]; - -/* - * Globals used for asynchronous paging operations: - * vs_async_list: head of list of to-be-completed I/O ops - * async_num_queued: number of pages completed, but not yet - * processed by async thread. - * async_requests_out: number of pages of requests not completed. 
- */ - -#if 0 -struct vs_async *vs_async_list; -int async_num_queued; -int async_requests_out; -#endif - - -#define VS_ASYNC_REUSE 1 -struct vs_async *vs_async_free_list; - -lck_mtx_t default_pager_async_lock; /* Protects globals above */ - - -int vs_alloc_async_failed = 0; /* statistics */ -int vs_alloc_async_count = 0; /* statistics */ -struct vs_async *vs_alloc_async(void); /* forward */ -void vs_free_async(struct vs_async *vsa); /* forward */ - - -#define VS_ALLOC_ASYNC() vs_alloc_async() -#define VS_FREE_ASYNC(vsa) vs_free_async(vsa) - -#define VS_ASYNC_LOCK() lck_mtx_lock(&default_pager_async_lock) -#define VS_ASYNC_UNLOCK() lck_mtx_unlock(&default_pager_async_lock) -#define VS_ASYNC_LOCK_INIT() lck_mtx_init(&default_pager_async_lock, &default_pager_lck_grp, &default_pager_lck_attr) -#define VS_ASYNC_LOCK_DESTROY() lck_mtx_destroy(&default_pager_async_lock, &default_pager_lck_grp) -#define VS_ASYNC_LOCK_ADDR() (&default_pager_async_lock) -/* - * Paging Space Hysteresis triggers and the target notification port - * - */ -unsigned int dp_pages_free_drift_count = 0; -unsigned int dp_pages_free_drifted_max = 0; -unsigned int minimum_pages_remaining = 0; -unsigned int maximum_pages_free = 0; -ipc_port_t min_pages_trigger_port = NULL; -ipc_port_t max_pages_trigger_port = NULL; - -#if CONFIG_FREEZE -boolean_t use_emergency_swap_file_first = TRUE; -#else -boolean_t use_emergency_swap_file_first = FALSE; -#endif -boolean_t bs_low = FALSE; -int backing_store_release_trigger_disable = 0; -boolean_t backing_store_stop_compaction = FALSE; -boolean_t backing_store_abort_compaction = FALSE; - -/* Have we decided if swap needs to be encrypted yet ? */ -boolean_t dp_encryption_inited = FALSE; -/* Should we encrypt swap ? */ -boolean_t dp_encryption = FALSE; - -boolean_t dp_isssd = FALSE; - -/* - * Object sizes are rounded up to the next power of 2, - * unless they are bigger than a given maximum size. 
- */ -vm_size_t max_doubled_size = 4 * 1024 * 1024; /* 4 meg */ - -/* - * List of all backing store and segments. - */ -MACH_PORT_FACE emergency_segment_backing_store; -struct backing_store_list_head backing_store_list; -paging_segment_t paging_segments[MAX_NUM_PAGING_SEGMENTS]; -lck_mtx_t paging_segments_lock; -int paging_segment_max = 0; -int paging_segment_count = 0; -int ps_select_array[BS_MAXPRI+1] = { -1,-1,-1,-1,-1 }; - - -/* - * Total pages free in system - * This differs from clusters committed/avail which is a measure of the - * over commitment of paging segments to backing store. An idea which is - * likely to be deprecated. - */ -unsigned int dp_pages_free = 0; -unsigned int dp_pages_reserve = 0; -unsigned int cluster_transfer_minimum = 100; - -/* - * Trim state - */ -struct ps_vnode_trim_data { - struct vnode *vp; - dp_offset_t offset; - dp_size_t length; -}; - -/* forward declarations */ -kern_return_t ps_write_file(paging_segment_t, upl_t, upl_offset_t, dp_offset_t, unsigned int, int); /* forward */ -kern_return_t ps_read_file (paging_segment_t, upl_t, upl_offset_t, dp_offset_t, unsigned int, unsigned int *, int); /* forward */ -default_pager_thread_t *get_read_buffer( void ); -kern_return_t ps_vstruct_transfer_from_segment( - vstruct_t vs, - paging_segment_t segment, - upl_t upl); -kern_return_t ps_read_device(paging_segment_t, dp_offset_t, vm_offset_t *, unsigned int, unsigned int *, int); /* forward */ -kern_return_t ps_write_device(paging_segment_t, dp_offset_t, vm_offset_t, unsigned int, struct vs_async *); /* forward */ -kern_return_t vs_cluster_transfer( - vstruct_t vs, - dp_offset_t offset, - dp_size_t cnt, - upl_t upl); -vs_map_t vs_get_map_entry( - vstruct_t vs, - dp_offset_t offset); - -kern_return_t -default_pager_backing_store_delete_internal( MACH_PORT_FACE ); - -static inline void ps_vnode_trim_init(struct ps_vnode_trim_data *data); -static inline void ps_vnode_trim_now(struct ps_vnode_trim_data *data); -static inline void 
ps_vnode_trim_more(struct ps_vnode_trim_data *data, struct vs_map *map, unsigned int shift, dp_size_t length); - -default_pager_thread_t * -get_read_buffer( void ) -{ - int i; - - DPT_LOCK(dpt_lock); - while(TRUE) { - for (i=0; ichecked_out == FALSE) { - dpt_array[i]->checked_out = TRUE; - DPT_UNLOCK(dpt_lock); - return dpt_array[i]; - } - } - DPT_SLEEP(dpt_lock, &dpt_array, THREAD_UNINT); - } -} - -void -bs_initialize(void) -{ - int i; - - /* - * List of all backing store. - */ - BSL_LOCK_INIT(); - queue_init(&backing_store_list.bsl_queue); - PSL_LOCK_INIT(); - - VS_ASYNC_LOCK_INIT(); -#if VS_ASYNC_REUSE - vs_async_free_list = NULL; -#endif /* VS_ASYNC_REUSE */ - - for (i = 0; i < VM_SUPER_PAGES + 1; i++) { - clustered_writes[i] = 0; - clustered_reads[i] = 0; - } - -} - -/* - * When things do not quite workout... - */ -void bs_no_paging_space(boolean_t); /* forward */ - -void -bs_no_paging_space( - boolean_t out_of_memory) -{ - - if (out_of_memory) - dprintf(("*** OUT OF MEMORY ***\n")); - panic("bs_no_paging_space: NOT ENOUGH PAGING SPACE"); -} - -void bs_more_space(int); /* forward */ -void bs_commit(int); /* forward */ - -boolean_t user_warned = FALSE; -unsigned int clusters_committed = 0; -unsigned int clusters_available = 0; -unsigned int clusters_committed_peak = 0; - -void -bs_more_space( - int nclusters) -{ - BSL_LOCK(); - /* - * Account for new paging space. 
- */ - clusters_available += nclusters; - - if (clusters_available >= clusters_committed) { - if (verbose && user_warned) { - printf("%s%s - %d excess clusters now.\n", - my_name, - "paging space is OK now", - clusters_available - clusters_committed); - user_warned = FALSE; - clusters_committed_peak = 0; - } - } else { - if (verbose && user_warned) { - printf("%s%s - still short of %d clusters.\n", - my_name, - "WARNING: paging space over-committed", - clusters_committed - clusters_available); - clusters_committed_peak -= nclusters; - } - } - BSL_UNLOCK(); - - return; -} - -void -bs_commit( - int nclusters) -{ - BSL_LOCK(); - clusters_committed += nclusters; - if (clusters_committed > clusters_available) { - if (verbose && !user_warned) { - user_warned = TRUE; - printf("%s%s - short of %d clusters.\n", - my_name, - "WARNING: paging space over-committed", - clusters_committed - clusters_available); - } - if (clusters_committed > clusters_committed_peak) { - clusters_committed_peak = clusters_committed; - } - } else { - if (verbose && user_warned) { - printf("%s%s - was short of up to %d clusters.\n", - my_name, - "paging space is OK now", - clusters_committed_peak - clusters_available); - user_warned = FALSE; - clusters_committed_peak = 0; - } - } - BSL_UNLOCK(); - - return; -} - -int default_pager_info_verbose = 1; - -void -bs_global_info( - uint64_t *totalp, - uint64_t *freep) -{ - uint64_t pages_total, pages_free; - paging_segment_t ps; - int i; - - PSL_LOCK(); - pages_total = pages_free = 0; - for (i = 0; i <= paging_segment_max; i++) { - ps = paging_segments[i]; - if (ps == PAGING_SEGMENT_NULL) - continue; - - /* - * no need to lock: by the time this data - * gets back to any remote requestor it - * will be obsolete anyways - */ - pages_total += ps->ps_pgnum; - pages_free += ps->ps_clcount << ps->ps_clshift; - DP_DEBUG(DEBUG_BS_INTERNAL, - ("segment #%d: %d total, %d free\n", - i, ps->ps_pgnum, ps->ps_clcount << ps->ps_clshift)); - } - *totalp = pages_total; - 
*freep = pages_free; - if (verbose && user_warned && default_pager_info_verbose) { - if (clusters_available < clusters_committed) { - printf("%s %d clusters committed, %d available.\n", - my_name, - clusters_committed, - clusters_available); - } - } - PSL_UNLOCK(); -} - -backing_store_t backing_store_alloc(void); /* forward */ - -backing_store_t -backing_store_alloc(void) -{ - backing_store_t bs; - - bs = (backing_store_t) kalloc(sizeof (struct backing_store)); - if (bs == BACKING_STORE_NULL) - panic("backing_store_alloc: no memory"); - - BS_LOCK_INIT(bs); - bs->bs_port = MACH_PORT_NULL; - bs->bs_priority = 0; - bs->bs_clsize = 0; - bs->bs_pages_total = 0; - bs->bs_pages_in = 0; - bs->bs_pages_in_fail = 0; - bs->bs_pages_out = 0; - bs->bs_pages_out_fail = 0; - - return bs; -} - -backing_store_t backing_store_lookup(MACH_PORT_FACE); /* forward */ - -/* Even in both the component space and external versions of this pager, */ -/* backing_store_lookup will be called from tasks in the application space */ -backing_store_t -backing_store_lookup( - MACH_PORT_FACE port) -{ - backing_store_t bs; - -/* - port is currently backed with a vs structure in the alias field - we could create an ISBS alias and a port_is_bs call but frankly - I see no reason for the test, the bs->port == port check below - will work properly on junk entries. - - if ((port == MACH_PORT_NULL) || port_is_vs(port)) -*/ - if (port == MACH_PORT_NULL) - return BACKING_STORE_NULL; - - BSL_LOCK(); - queue_iterate(&backing_store_list.bsl_queue, bs, backing_store_t, - bs_links) { - BS_LOCK(bs); - if (bs->bs_port == port) { - BSL_UNLOCK(); - /* Success, return it locked. 
*/ - return bs; - } - BS_UNLOCK(bs); - } - BSL_UNLOCK(); - return BACKING_STORE_NULL; -} - -void backing_store_add(backing_store_t); /* forward */ - -void -backing_store_add( - __unused backing_store_t bs) -{ -// MACH_PORT_FACE port = bs->bs_port; -// MACH_PORT_FACE pset = default_pager_default_set; - kern_return_t kr = KERN_SUCCESS; - - if (kr != KERN_SUCCESS) - panic("backing_store_add: add to set"); - -} - -/* - * Set up default page shift, but only if not already - * set and argument is within range. - */ -boolean_t -bs_set_default_clsize(unsigned int npages) -{ - switch(npages){ - case 1: - case 2: - case 4: - case 8: - if (default_pager_clsize == 0) /* if not yet set */ - vstruct_def_clshift = local_log2(npages); - return(TRUE); - } - return(FALSE); -} - -int bs_get_global_clsize(int clsize); /* forward */ - -int -bs_get_global_clsize( - int clsize) -{ - int i; - memory_object_default_t dmm; - kern_return_t kr; - - /* - * Only allow setting of cluster size once. If called - * with no cluster size (default), we use the compiled-in default - * for the duration. The same cluster size is used for all - * paging segments. - */ - if (default_pager_clsize == 0) { - /* - * Keep cluster size in bit shift because it's quicker - * arithmetic, and easier to keep at a power of 2. - */ - if (clsize != NO_CLSIZE) { - for (i = 0; (1 << i) < clsize; i++); - if (i > MAX_CLUSTER_SHIFT) - i = MAX_CLUSTER_SHIFT; - vstruct_def_clshift = i; - } - default_pager_clsize = (1 << vstruct_def_clshift); - - /* - * Let the user know the new (and definitive) cluster size. - */ - if (verbose) - printf("%scluster size = %d page%s\n", - my_name, default_pager_clsize, - (default_pager_clsize == 1) ? "" : "s"); - - /* - * Let the kernel know too, in case it hasn't used the - * default value provided in main() yet. 
- */ - dmm = default_pager_object; - clsize = default_pager_clsize * vm_page_size; /* in bytes */ - kr = host_default_memory_manager(host_priv_self(), - &dmm, - clsize); - memory_object_default_deallocate(dmm); - - if (kr != KERN_SUCCESS) { - panic("bs_get_global_cl_size:host_default_memory_manager"); - } - if (dmm != default_pager_object) { - panic("bs_get_global_cl_size:there is another default pager"); - } - } - ASSERT(default_pager_clsize > 0 && - (default_pager_clsize & (default_pager_clsize - 1)) == 0); - - return default_pager_clsize; -} - -kern_return_t -default_pager_backing_store_create( - memory_object_default_t pager, - int priority, - int clsize, /* in bytes */ - MACH_PORT_FACE *backing_store) -{ - backing_store_t bs; - MACH_PORT_FACE port; -// kern_return_t kr; - struct vstruct_alias *alias_struct; - - if (pager != default_pager_object) - return KERN_INVALID_ARGUMENT; - - bs = backing_store_alloc(); - port = ipc_port_alloc_kernel(); - ipc_port_make_send(port); - assert (port != IP_NULL); - - DP_DEBUG(DEBUG_BS_EXTERNAL, - ("priority=%d clsize=%d bs_port=0x%x\n", - priority, clsize, (int) backing_store)); - - alias_struct = (struct vstruct_alias *) - kalloc(sizeof (struct vstruct_alias)); - if(alias_struct != NULL) { - alias_struct->vs = (struct vstruct *)bs; - alias_struct->name = &default_pager_ops; - port->ip_alias = (uintptr_t) alias_struct; - } - else { - ipc_port_dealloc_kernel((MACH_PORT_FACE)(port)); - - BS_LOCK_DESTROY(bs); - kfree(bs, sizeof (struct backing_store)); - - return KERN_RESOURCE_SHORTAGE; - } - - bs->bs_port = port; - if (priority == DEFAULT_PAGER_BACKING_STORE_MAXPRI) - priority = BS_MAXPRI; - else if (priority == BS_NOPRI) - priority = BS_MAXPRI; - else - priority = BS_MINPRI; - bs->bs_priority = priority; - - bs->bs_clsize = bs_get_global_clsize(atop_32(clsize)); - - BSL_LOCK(); - queue_enter(&backing_store_list.bsl_queue, bs, backing_store_t, - bs_links); - BSL_UNLOCK(); - - backing_store_add(bs); - - *backing_store = port; - 
return KERN_SUCCESS; -} - -kern_return_t -default_pager_backing_store_info( - MACH_PORT_FACE backing_store, - backing_store_flavor_t flavour, - backing_store_info_t info, - mach_msg_type_number_t *size) -{ - backing_store_t bs; - backing_store_basic_info_t basic; - int i; - paging_segment_t ps; - - if (flavour != BACKING_STORE_BASIC_INFO || - *size < BACKING_STORE_BASIC_INFO_COUNT) - return KERN_INVALID_ARGUMENT; - - basic = (backing_store_basic_info_t)info; - *size = BACKING_STORE_BASIC_INFO_COUNT; - - VSTATS_LOCK(&global_stats.gs_lock); - basic->pageout_calls = global_stats.gs_pageout_calls; - basic->pagein_calls = global_stats.gs_pagein_calls; - basic->pages_in = global_stats.gs_pages_in; - basic->pages_out = global_stats.gs_pages_out; - basic->pages_unavail = global_stats.gs_pages_unavail; - basic->pages_init = global_stats.gs_pages_init; - basic->pages_init_writes= global_stats.gs_pages_init_writes; - VSTATS_UNLOCK(&global_stats.gs_lock); - - if ((bs = backing_store_lookup(backing_store)) == BACKING_STORE_NULL) - return KERN_INVALID_ARGUMENT; - - basic->bs_pages_total = bs->bs_pages_total; - PSL_LOCK(); - bs->bs_pages_free = 0; - for (i = 0; i <= paging_segment_max; i++) { - ps = paging_segments[i]; - if (ps != PAGING_SEGMENT_NULL && ps->ps_bs == bs) { - PS_LOCK(ps); - bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift; - PS_UNLOCK(ps); - } - } - PSL_UNLOCK(); - basic->bs_pages_free = bs->bs_pages_free; - basic->bs_pages_in = bs->bs_pages_in; - basic->bs_pages_in_fail = bs->bs_pages_in_fail; - basic->bs_pages_out = bs->bs_pages_out; - basic->bs_pages_out_fail= bs->bs_pages_out_fail; - - basic->bs_priority = bs->bs_priority; - basic->bs_clsize = ptoa_32(bs->bs_clsize); /* in bytes */ - - BS_UNLOCK(bs); - - return KERN_SUCCESS; -} - -int ps_delete(paging_segment_t); /* forward */ -boolean_t current_thread_aborted(void); - -int -ps_delete( - paging_segment_t ps) -{ - vstruct_t vs; - kern_return_t error = KERN_SUCCESS; - int vs_count; - - VSL_LOCK(); /* get the 
lock on the list of vs's */ - - /* The lock relationship and sequence is farily complicated */ - /* this code looks at a live list, locking and unlocking the list */ - /* as it traverses it. It depends on the locking behavior of */ - /* default_pager_no_senders. no_senders always locks the vstruct */ - /* targeted for removal before locking the vstruct list. However */ - /* it will remove that member of the list without locking its */ - /* neighbors. We can be sure when we hold a lock on a vstruct */ - /* it cannot be removed from the list but we must hold the list */ - /* lock to be sure that its pointers to its neighbors are valid. */ - /* Also, we can hold off destruction of a vstruct when the list */ - /* lock and the vs locks are not being held by bumping the */ - /* vs_async_pending count. */ - - - while(backing_store_release_trigger_disable != 0) { - VSL_SLEEP(&backing_store_release_trigger_disable, THREAD_UNINT); - } - - /* we will choose instead to hold a send right */ - vs_count = vstruct_list.vsl_count; - vs = (vstruct_t) queue_first((queue_entry_t)&(vstruct_list.vsl_queue)); - if(vs == (vstruct_t)&vstruct_list) { - VSL_UNLOCK(); - return KERN_SUCCESS; - } - VS_LOCK(vs); - vs_async_wait(vs); /* wait for any pending async writes */ - if ((vs_count != 0) && (vs != NULL)) - vs->vs_async_pending += 1; /* hold parties calling */ - /* vs_async_wait */ - - if (bs_low == FALSE) - backing_store_abort_compaction = FALSE; - - VS_UNLOCK(vs); - VSL_UNLOCK(); - while((vs_count != 0) && (vs != NULL)) { - /* We take the count of AMO's before beginning the */ - /* transfer of of the target segment. */ - /* We are guaranteed that the target segment cannot get */ - /* more users. We also know that queue entries are */ - /* made at the back of the list. 
If some of the entries */ - /* we would check disappear while we are traversing the */ - /* list then we will either check new entries which */ - /* do not have any backing store in the target segment */ - /* or re-check old entries. This might not be optimal */ - /* but it will always be correct. The alternative is to */ - /* take a snapshot of the list. */ - vstruct_t next_vs; - - if(dp_pages_free < cluster_transfer_minimum) - error = KERN_FAILURE; - else { - vm_object_t transfer_object; - unsigned int count; - upl_t upl; - upl_control_flags_t upl_flags; - - transfer_object = vm_object_allocate((vm_object_size_t)VM_SUPER_CLUSTER); - count = 0; - upl_flags = (UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | - UPL_SET_LITE | UPL_SET_INTERNAL); - if (dp_encryption) { - /* mark the pages as "encrypted" when they come in */ - upl_flags |= UPL_ENCRYPT; - } - error = vm_object_upl_request(transfer_object, - (vm_object_offset_t)0, VM_SUPER_CLUSTER, - &upl, NULL, &count, upl_flags); - - if(error == KERN_SUCCESS) { - error = ps_vstruct_transfer_from_segment( - vs, ps, upl); - upl_commit(upl, NULL, 0); - upl_deallocate(upl); - } else { - error = KERN_FAILURE; - } - vm_object_deallocate(transfer_object); - } - if(error || current_thread_aborted()) { - VS_LOCK(vs); - vs->vs_async_pending -= 1; /* release vs_async_wait */ - if (vs->vs_async_pending == 0 && vs->vs_waiting_async) { - vs->vs_waiting_async = FALSE; - VS_UNLOCK(vs); - thread_wakeup(&vs->vs_async_pending); - } else { - VS_UNLOCK(vs); - } - return KERN_FAILURE; - } - - VSL_LOCK(); - - while(backing_store_release_trigger_disable != 0) { - VSL_SLEEP(&backing_store_release_trigger_disable, - THREAD_UNINT); - } - - next_vs = (vstruct_t) queue_next(&(vs->vs_links)); - if((next_vs != (vstruct_t)&vstruct_list) && - (vs != next_vs) && (vs_count != 1)) { - VS_LOCK(next_vs); - vs_async_wait(next_vs); /* wait for any */ - /* pending async writes */ - next_vs->vs_async_pending += 1; /* hold parties */ - /* calling vs_async_wait */ - 
VS_UNLOCK(next_vs); - } - VSL_UNLOCK(); - VS_LOCK(vs); - vs->vs_async_pending -= 1; - if (vs->vs_async_pending == 0 && vs->vs_waiting_async) { - vs->vs_waiting_async = FALSE; - VS_UNLOCK(vs); - thread_wakeup(&vs->vs_async_pending); - } else { - VS_UNLOCK(vs); - } - if((vs == next_vs) || (next_vs == (vstruct_t)&vstruct_list)) - vs = NULL; - else - vs = next_vs; - vs_count--; - } - return KERN_SUCCESS; -} - - -kern_return_t -default_pager_backing_store_delete_internal( - MACH_PORT_FACE backing_store) -{ - backing_store_t bs; - int i; - paging_segment_t ps; - int error; - int interim_pages_removed = 0; - boolean_t dealing_with_emergency_segment = ( backing_store == emergency_segment_backing_store ); - - if ((bs = backing_store_lookup(backing_store)) == BACKING_STORE_NULL) - return KERN_INVALID_ARGUMENT; - -restart: - PSL_LOCK(); - error = KERN_SUCCESS; - for (i = 0; i <= paging_segment_max; i++) { - ps = paging_segments[i]; - if (ps != PAGING_SEGMENT_NULL && - ps->ps_bs == bs && - ! IS_PS_GOING_AWAY(ps)) { - PS_LOCK(ps); - - if( IS_PS_GOING_AWAY(ps) || !IS_PS_OK_TO_USE(ps)) { - /* - * Someone is already busy reclamining this paging segment. - * If it's the emergency segment we are looking at then check - * that someone has not already recovered it and set the right - * state i.e. online but not activated. - */ - PS_UNLOCK(ps); - continue; - } - - /* disable access to this segment */ - ps->ps_state &= ~PS_CAN_USE; - ps->ps_state |= PS_GOING_AWAY; - PS_UNLOCK(ps); - /* - * The "ps" segment is "off-line" now, - * we can try and delete it... 
- */ - if(dp_pages_free < (cluster_transfer_minimum - + ps->ps_pgcount)) { - error = KERN_FAILURE; - PSL_UNLOCK(); - } - else { - /* remove all pages associated with the */ - /* segment from the list of free pages */ - /* when transfer is through, all target */ - /* segment pages will appear to be free */ - - dp_pages_free -= ps->ps_pgcount; - interim_pages_removed += ps->ps_pgcount; - PSL_UNLOCK(); - error = ps_delete(ps); - } - if (error != KERN_SUCCESS) { - /* - * We couldn't delete the segment, - * probably because there's not enough - * virtual memory left. - * Re-enable all the segments. - */ - PSL_LOCK(); - break; - } - goto restart; - } - } - - if (error != KERN_SUCCESS) { - for (i = 0; i <= paging_segment_max; i++) { - ps = paging_segments[i]; - if (ps != PAGING_SEGMENT_NULL && - ps->ps_bs == bs && - IS_PS_GOING_AWAY(ps)) { - PS_LOCK(ps); - - if( !IS_PS_GOING_AWAY(ps)) { - PS_UNLOCK(ps); - continue; - } - /* Handle the special clusters that came in while we let go the lock*/ - if( ps->ps_special_clusters) { - dp_pages_free += ps->ps_special_clusters << ps->ps_clshift; - ps->ps_pgcount += ps->ps_special_clusters << ps->ps_clshift; - ps->ps_clcount += ps->ps_special_clusters; - if ( ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI) { - ps_select_array[ps->ps_bs->bs_priority] = 0; - } - ps->ps_special_clusters = 0; - } - /* re-enable access to this segment */ - ps->ps_state &= ~PS_GOING_AWAY; - ps->ps_state |= PS_CAN_USE; - PS_UNLOCK(ps); - } - } - dp_pages_free += interim_pages_removed; - PSL_UNLOCK(); - BS_UNLOCK(bs); - return error; - } - - for (i = 0; i <= paging_segment_max; i++) { - ps = paging_segments[i]; - if (ps != PAGING_SEGMENT_NULL && - ps->ps_bs == bs) { - if(IS_PS_GOING_AWAY(ps)) { - if(IS_PS_EMERGENCY_SEGMENT(ps)) { - PS_LOCK(ps); - ps->ps_state &= ~PS_GOING_AWAY; - ps->ps_special_clusters = 0; - ps->ps_pgcount = ps->ps_pgnum; - ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift; - dp_pages_reserve += ps->ps_pgcount; - 
PS_UNLOCK(ps); - } else { - paging_segments[i] = PAGING_SEGMENT_NULL; - paging_segment_count--; - PS_LOCK(ps); - kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls)); - kfree(ps, sizeof *ps); - } - } - } - } - - /* Scan the entire ps array separately to make certain we find the */ - /* proper paging_segment_max */ - for (i = 0; i < MAX_NUM_PAGING_SEGMENTS; i++) { - if(paging_segments[i] != PAGING_SEGMENT_NULL) - paging_segment_max = i; - } - - PSL_UNLOCK(); - - if( dealing_with_emergency_segment ) { - BS_UNLOCK(bs); - return KERN_SUCCESS; - } - - /* - * All the segments have been deleted. - * We can remove the backing store. - */ - - /* - * Disable lookups of this backing store. - */ - if((void *)bs->bs_port->ip_alias != NULL) - kfree((void *) bs->bs_port->ip_alias, - sizeof (struct vstruct_alias)); - ipc_port_dealloc_kernel((ipc_port_t) (bs->bs_port)); - bs->bs_port = MACH_PORT_NULL; - BS_UNLOCK(bs); - - /* - * Remove backing store from backing_store list. - */ - BSL_LOCK(); - queue_remove(&backing_store_list.bsl_queue, bs, backing_store_t, - bs_links); - BSL_UNLOCK(); - - /* - * Free the backing store structure. 
- */ - BS_LOCK_DESTROY(bs); - kfree(bs, sizeof *bs); - - return KERN_SUCCESS; -} - -kern_return_t -default_pager_backing_store_delete( - MACH_PORT_FACE backing_store) -{ - if( backing_store != emergency_segment_backing_store ) { - default_pager_backing_store_delete_internal(emergency_segment_backing_store); - } - return(default_pager_backing_store_delete_internal(backing_store)); -} - -int ps_enter(paging_segment_t); /* forward */ - -int -ps_enter( - paging_segment_t ps) -{ - int i; - - PSL_LOCK(); - - for (i = 0; i < MAX_NUM_PAGING_SEGMENTS; i++) { - if (paging_segments[i] == PAGING_SEGMENT_NULL) - break; - } - - if (i < MAX_NUM_PAGING_SEGMENTS) { - paging_segments[i] = ps; - if (i > paging_segment_max) - paging_segment_max = i; - paging_segment_count++; - if ((ps_select_array[ps->ps_bs->bs_priority] == BS_NOPRI) || - (ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI)) - ps_select_array[ps->ps_bs->bs_priority] = 0; - i = 0; - } else { - PSL_UNLOCK(); - return KERN_RESOURCE_SHORTAGE; - } - - PSL_UNLOCK(); - return i; -} - -#ifdef DEVICE_PAGING -kern_return_t -default_pager_add_segment( - MACH_PORT_FACE backing_store, - MACH_PORT_FACE device, - recnum_t offset, - recnum_t count, - int record_size) -{ - backing_store_t bs; - paging_segment_t ps; - int i; - int error; - - if ((bs = backing_store_lookup(backing_store)) - == BACKING_STORE_NULL) - return KERN_INVALID_ARGUMENT; - - PSL_LOCK(); - for (i = 0; i <= paging_segment_max; i++) { - ps = paging_segments[i]; - if (ps == PAGING_SEGMENT_NULL) - continue; - - /* - * Check for overlap on same device. 
- */ - if (!(ps->ps_device != device - || offset >= ps->ps_offset + ps->ps_recnum - || offset + count <= ps->ps_offset)) { - PSL_UNLOCK(); - BS_UNLOCK(bs); - return KERN_INVALID_ARGUMENT; - } - } - PSL_UNLOCK(); - - /* - * Set up the paging segment - */ - ps = (paging_segment_t) kalloc(sizeof (struct paging_segment)); - if (ps == PAGING_SEGMENT_NULL) { - BS_UNLOCK(bs); - return KERN_RESOURCE_SHORTAGE; - } - - ps->ps_segtype = PS_PARTITION; - ps->ps_device = device; - ps->ps_offset = offset; - ps->ps_record_shift = local_log2(vm_page_size / record_size); - ps->ps_recnum = count; - ps->ps_pgnum = count >> ps->ps_record_shift; - - ps->ps_pgcount = ps->ps_pgnum; - ps->ps_clshift = local_log2(bs->bs_clsize); - ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift; - ps->ps_hint = 0; - - PS_LOCK_INIT(ps); - ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls)); - if (!ps->ps_bmap) { - PS_LOCK_DESTROY(ps); - kfree(ps, sizeof *ps); - BS_UNLOCK(bs); - return KERN_RESOURCE_SHORTAGE; - } - for (i = 0; i < ps->ps_ncls; i++) { - clrbit(ps->ps_bmap, i); - } - - if(paging_segment_count == 0) { - ps->ps_state = PS_EMERGENCY_SEGMENT; - if(use_emergency_swap_file_first) { - ps->ps_state |= PS_CAN_USE; - } - } else { - ps->ps_state = PS_CAN_USE; - } - - ps->ps_bs = bs; - - if ((error = ps_enter(ps)) != 0) { - kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls)); - - PS_LOCK_DESTROY(ps); - kfree(ps, sizeof *ps); - BS_UNLOCK(bs); - return KERN_RESOURCE_SHORTAGE; - } - - bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift; - bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift; - BS_UNLOCK(bs); - - PSL_LOCK(); - if(IS_PS_OK_TO_USE(ps)) { - dp_pages_free += ps->ps_pgcount; - } else { - dp_pages_reserve += ps->ps_pgcount; - } - PSL_UNLOCK(); - - bs_more_space(ps->ps_clcount); - - DP_DEBUG(DEBUG_BS_INTERNAL, - ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n", - device, offset, count, record_size, - ps->ps_record_shift, ps->ps_pgnum)); - - return 
KERN_SUCCESS; -} - -boolean_t -bs_add_device( - char *dev_name, - MACH_PORT_FACE master) -{ - security_token_t null_security_token = { - { 0, 0 } - }; - MACH_PORT_FACE device; - int info[DEV_GET_SIZE_COUNT]; - mach_msg_type_number_t info_count; - MACH_PORT_FACE bs = MACH_PORT_NULL; - unsigned int rec_size; - recnum_t count; - int clsize; - MACH_PORT_FACE reply_port; - - if (ds_device_open_sync(master, MACH_PORT_NULL, D_READ | D_WRITE, - null_security_token, dev_name, &device)) - return FALSE; - - info_count = DEV_GET_SIZE_COUNT; - if (!ds_device_get_status(device, DEV_GET_SIZE, info, &info_count)) { - rec_size = info[DEV_GET_SIZE_RECORD_SIZE]; - count = info[DEV_GET_SIZE_DEVICE_SIZE] / rec_size; - clsize = bs_get_global_clsize(0); - if (!default_pager_backing_store_create( - default_pager_object, - DEFAULT_PAGER_BACKING_STORE_MAXPRI, - (clsize * vm_page_size), - &bs)) { - if (!default_pager_add_segment(bs, device, - 0, count, rec_size)) { - return TRUE; - } - ipc_port_release_receive(bs); - } - } - - ipc_port_release_send(device); - return FALSE; -} -#endif /* DEVICE_PAGING */ - -#if VS_ASYNC_REUSE - -struct vs_async * -vs_alloc_async(void) -{ - struct vs_async *vsa; - MACH_PORT_FACE reply_port; -// kern_return_t kr; - - VS_ASYNC_LOCK(); - if (vs_async_free_list == NULL) { - VS_ASYNC_UNLOCK(); - vsa = (struct vs_async *) kalloc(sizeof (struct vs_async)); - if (vsa != NULL) { - /* - * Try allocating a reply port named after the - * address of the vs_async structure. 
- */ - struct vstruct_alias *alias_struct; - - reply_port = ipc_port_alloc_kernel(); - alias_struct = (struct vstruct_alias *) - kalloc(sizeof (struct vstruct_alias)); - if(alias_struct != NULL) { - __IGNORE_WCASTALIGN(alias_struct->vs = (struct vstruct *)vsa); - alias_struct->name = &default_pager_ops; - reply_port->ip_alias = (uintptr_t) alias_struct; - vsa->reply_port = reply_port; - vs_alloc_async_count++; - } - else { - vs_alloc_async_failed++; - ipc_port_dealloc_kernel((MACH_PORT_FACE) - (reply_port)); - kfree(vsa, sizeof (struct vs_async)); - vsa = NULL; - } - } - } else { - vsa = vs_async_free_list; - vs_async_free_list = vs_async_free_list->vsa_next; - VS_ASYNC_UNLOCK(); - } - - return vsa; -} - -void -vs_free_async( - struct vs_async *vsa) -{ - VS_ASYNC_LOCK(); - vsa->vsa_next = vs_async_free_list; - vs_async_free_list = vsa; - VS_ASYNC_UNLOCK(); -} - -#else /* VS_ASYNC_REUSE */ - -struct vs_async * -vs_alloc_async(void) -{ - struct vs_async *vsa; - MACH_PORT_FACE reply_port; - kern_return_t kr; - - vsa = (struct vs_async *) kalloc(sizeof (struct vs_async)); - if (vsa != NULL) { - /* - * Try allocating a reply port named after the - * address of the vs_async structure. 
- */ - reply_port = ipc_port_alloc_kernel(); - alias_struct = (vstruct_alias *) - kalloc(sizeof (struct vstruct_alias)); - if(alias_struct != NULL) { - alias_struct->vs = reply_port; - alias_struct->name = &default_pager_ops; - reply_port->defpager_importance.alias = (int) vsa; - vsa->reply_port = reply_port; - vs_alloc_async_count++; - } - else { - vs_alloc_async_failed++; - ipc_port_dealloc_kernel((MACH_PORT_FACE) - (reply_port)); - kfree(vsa, sizeof (struct vs_async)); - vsa = NULL; - } - } - - return vsa; -} - -void -vs_free_async( - struct vs_async *vsa) -{ - MACH_PORT_FACE reply_port; - kern_return_t kr; - - reply_port = vsa->reply_port; - kfree(reply_port->ip_alias, sizeof (struct vstuct_alias)); - kfree(vsa, sizeof (struct vs_async)); - ipc_port_dealloc_kernel((MACH_PORT_FACE) (reply_port)); -#if 0 - VS_ASYNC_LOCK(); - vs_alloc_async_count--; - VS_ASYNC_UNLOCK(); -#endif -} - -#endif /* VS_ASYNC_REUSE */ - -zone_t vstruct_zone; - -vstruct_t -ps_vstruct_create( - dp_size_t size) -{ - vstruct_t vs; - unsigned int i; - - vs = (vstruct_t) zalloc(vstruct_zone); - if (vs == VSTRUCT_NULL) { - return VSTRUCT_NULL; - } - - VS_LOCK_INIT(vs); - - /* - * The following fields will be provided later. - */ - vs->vs_pager_ops = NULL; - vs->vs_control = MEMORY_OBJECT_CONTROL_NULL; - vs->vs_references = 1; - vs->vs_seqno = 0; - - vs->vs_waiting_seqno = FALSE; - vs->vs_waiting_read = FALSE; - vs->vs_waiting_write = FALSE; - vs->vs_waiting_async = FALSE; - - vs->vs_readers = 0; - vs->vs_writers = 0; - - vs->vs_errors = 0; - - vs->vs_clshift = local_log2(bs_get_global_clsize(0)); - vs->vs_size = ((atop_32(round_page_32(size)) - 1) >> vs->vs_clshift) + 1; - vs->vs_async_pending = 0; - - /* - * Allocate the pmap, either CLMAP_SIZE or INDIRECT_CLMAP_SIZE - * depending on the size of the memory object. 
- */ - if (INDIRECT_CLMAP(vs->vs_size)) { - vs->vs_imap = (struct vs_map **) - kalloc(INDIRECT_CLMAP_SIZE(vs->vs_size)); - vs->vs_indirect = TRUE; - } else { - vs->vs_dmap = (struct vs_map *) - kalloc(CLMAP_SIZE(vs->vs_size)); - vs->vs_indirect = FALSE; - } - vs->vs_xfer_pending = FALSE; - DP_DEBUG(DEBUG_VS_INTERNAL, - ("map=0x%x, indirect=%d\n", (int) vs->vs_dmap, vs->vs_indirect)); - - /* - * Check to see that we got the space. - */ - if (!vs->vs_dmap) { - kfree(vs, sizeof *vs); - return VSTRUCT_NULL; - } - - /* - * Zero the indirect pointers, or clear the direct pointers. - */ - if (vs->vs_indirect) - memset(vs->vs_imap, 0, - INDIRECT_CLMAP_SIZE(vs->vs_size)); - else - for (i = 0; i < vs->vs_size; i++) - VSM_CLR(vs->vs_dmap[i]); - - VS_MAP_LOCK_INIT(vs); - - bs_commit(vs->vs_size); - - return vs; -} - -paging_segment_t ps_select_segment(unsigned int, int *); /* forward */ - -paging_segment_t -ps_select_segment( - unsigned int shift, - int *psindex) -{ - paging_segment_t ps; - int i; - int j; - - /* - * Optimize case where there's only one segment. - * paging_segment_max will index the one and only segment. 
- */ - - PSL_LOCK(); - if (paging_segment_count == 1) { - paging_segment_t lps = PAGING_SEGMENT_NULL; /* used to avoid extra PS_UNLOCK */ - ipc_port_t trigger = IP_NULL; - - ps = paging_segments[paging_segment_max]; - *psindex = paging_segment_max; - PS_LOCK(ps); - if( !IS_PS_EMERGENCY_SEGMENT(ps) ) { - panic("Emergency paging segment missing\n"); - } - ASSERT(ps->ps_clshift >= shift); - if(IS_PS_OK_TO_USE(ps)) { - if (ps->ps_clcount) { - ps->ps_clcount--; - dp_pages_free -= 1 << ps->ps_clshift; - ps->ps_pgcount -= 1 << ps->ps_clshift; - if(min_pages_trigger_port && - (dp_pages_free < minimum_pages_remaining)) { - trigger = min_pages_trigger_port; - min_pages_trigger_port = NULL; - bs_low = TRUE; - backing_store_abort_compaction = TRUE; - } - lps = ps; - } - } - PS_UNLOCK(ps); - - if( lps == PAGING_SEGMENT_NULL ) { - if(dp_pages_free) { - dp_pages_free_drift_count++; - if(dp_pages_free > dp_pages_free_drifted_max) { - dp_pages_free_drifted_max = dp_pages_free; - } - dprintf(("Emergency swap segment:dp_pages_free before zeroing out: %d\n",dp_pages_free)); - } - dp_pages_free = 0; - } - - PSL_UNLOCK(); - - if (trigger != IP_NULL) { - dprintf(("ps_select_segment - send HI_WAT_ALERT\n")); - - default_pager_space_alert(trigger, HI_WAT_ALERT); - ipc_port_release_send(trigger); - } - return lps; - } - - if (paging_segment_count == 0) { - if(dp_pages_free) { - dp_pages_free_drift_count++; - if(dp_pages_free > dp_pages_free_drifted_max) { - dp_pages_free_drifted_max = dp_pages_free; - } - dprintf(("No paging segments:dp_pages_free before zeroing out: %d\n",dp_pages_free)); - } - dp_pages_free = 0; - PSL_UNLOCK(); - return PAGING_SEGMENT_NULL; - } - - for (i = BS_MAXPRI; - i >= BS_MINPRI; i--) { - int start_index; - - if ((ps_select_array[i] == BS_NOPRI) || - (ps_select_array[i] == BS_FULLPRI)) - continue; - start_index = ps_select_array[i]; - - if(!(paging_segments[start_index])) { - j = start_index+1; - physical_transfer_cluster_count = 0; - } - else if 
((physical_transfer_cluster_count+1) == (ALLOC_STRIDE >> - (((paging_segments[start_index])->ps_clshift) - + vm_page_shift))) { - physical_transfer_cluster_count = 0; - j = start_index + 1; - } else { - physical_transfer_cluster_count+=1; - j = start_index; - if(start_index == 0) - start_index = paging_segment_max; - else - start_index = start_index - 1; - } - - while (1) { - if (j > paging_segment_max) - j = 0; - if ((ps = paging_segments[j]) && - (ps->ps_bs->bs_priority == i)) { - /* - * Force the ps cluster size to be - * >= that of the vstruct. - */ - PS_LOCK(ps); - if (IS_PS_OK_TO_USE(ps)) { - if ((ps->ps_clcount) && - (ps->ps_clshift >= shift)) { - ipc_port_t trigger = IP_NULL; - - ps->ps_clcount--; - dp_pages_free -= 1 << ps->ps_clshift; - ps->ps_pgcount -= 1 << ps->ps_clshift; - if(min_pages_trigger_port && - (dp_pages_free < - minimum_pages_remaining)) { - trigger = min_pages_trigger_port; - min_pages_trigger_port = NULL; - bs_low = TRUE; - backing_store_abort_compaction = TRUE; - } - PS_UNLOCK(ps); - /* - * found one, quit looking. 
- */ - ps_select_array[i] = j; - PSL_UNLOCK(); - - if (trigger != IP_NULL) { - dprintf(("ps_select_segment - send HI_WAT_ALERT\n")); - - default_pager_space_alert( - trigger, - HI_WAT_ALERT); - ipc_port_release_send(trigger); - } - *psindex = j; - return ps; - } - } - PS_UNLOCK(ps); - } - if (j == start_index) { - /* - * none at this priority -- mark it full - */ - ps_select_array[i] = BS_FULLPRI; - break; - } - j++; - } - } - - if(dp_pages_free) { - dp_pages_free_drift_count++; - if(dp_pages_free > dp_pages_free_drifted_max) { - dp_pages_free_drifted_max = dp_pages_free; - } - dprintf(("%d Paging Segments: dp_pages_free before zeroing out: %d\n",paging_segment_count,dp_pages_free)); - } - dp_pages_free = 0; - PSL_UNLOCK(); - return PAGING_SEGMENT_NULL; -} - -dp_offset_t ps_allocate_cluster(vstruct_t, int *, paging_segment_t); /*forward*/ - -dp_offset_t -ps_allocate_cluster( - vstruct_t vs, - int *psindex, - paging_segment_t use_ps) -{ - unsigned int byte_num; - int bit_num = 0; - paging_segment_t ps; - dp_offset_t cluster; - ipc_port_t trigger = IP_NULL; - - /* - * Find best paging segment. - * ps_select_segment will decrement cluster count on ps. - * Must pass cluster shift to find the most appropriate segment. - */ - /* NOTE: The addition of paging segment delete capability threatened - * to seriously complicate the treatment of paging segments in this - * module and the ones that call it (notably ps_clmap), because of the - * difficulty in assuring that the paging segment would continue to - * exist between being unlocked and locked. This was - * avoided because all calls to this module are based in either - * dp_memory_object calls which rely on the vs lock, or by - * the transfer function which is part of the segment delete path. - * The transfer function which is part of paging segment delete is - * protected from multiple callers by the backing store lock. 
- * The paging segment delete function treats mappings to a paging - * segment on a vstruct by vstruct basis, locking the vstruct targeted - * while data is transferred to the remaining segments. This is in - * line with the view that incomplete or in-transition mappings between - * data, a vstruct, and backing store are protected by the vs lock. - * This and the ordering of the paging segment "going_away" bit setting - * protects us. - */ -retry: - if (use_ps != PAGING_SEGMENT_NULL) { - ps = use_ps; - PSL_LOCK(); - PS_LOCK(ps); - - ASSERT(ps->ps_clcount != 0); - - ps->ps_clcount--; - dp_pages_free -= 1 << ps->ps_clshift; - ps->ps_pgcount -= 1 << ps->ps_clshift; - if(min_pages_trigger_port && - (dp_pages_free < minimum_pages_remaining)) { - trigger = min_pages_trigger_port; - min_pages_trigger_port = NULL; - bs_low = TRUE; - backing_store_abort_compaction = TRUE; - } - PSL_UNLOCK(); - PS_UNLOCK(ps); - if (trigger != IP_NULL) { - dprintf(("ps_allocate_cluster - send HI_WAT_ALERT\n")); - - default_pager_space_alert(trigger, HI_WAT_ALERT); - ipc_port_release_send(trigger); - } - - } else if ((ps = ps_select_segment(vs->vs_clshift, psindex)) == - PAGING_SEGMENT_NULL) { - static clock_sec_t lastnotify = 0; - clock_sec_t now; - clock_nsec_t nanoseconds_dummy; - - /* - * Don't immediately jump to the emergency segment. Give the - * dynamic pager a chance to create it's first normal swap file. - * Unless, of course the very first normal swap file can't be - * created due to some problem and we didn't expect that problem - * i.e. use_emergency_swap_file_first was never set to true initially. - * It then gets set in the swap file creation error handling. 
- */ - if(paging_segment_count > 1 || use_emergency_swap_file_first == TRUE) { - - ps = paging_segments[EMERGENCY_PSEG_INDEX]; - if(IS_PS_EMERGENCY_SEGMENT(ps) && !IS_PS_GOING_AWAY(ps)) { - PSL_LOCK(); - PS_LOCK(ps); - - if(IS_PS_GOING_AWAY(ps)) { - /* Someone de-activated the emergency paging segment*/ - PS_UNLOCK(ps); - PSL_UNLOCK(); - - } else if(dp_pages_free) { - /* - * Someone has already activated the emergency paging segment - * OR - * Between us having rec'd a NULL segment from ps_select_segment - * and reaching here a new normal segment could have been added. - * E.g. we get NULL segment and another thread just added the - * new swap file. Hence check to see if we have more dp_pages_free - * before activating the emergency segment. - */ - PS_UNLOCK(ps); - PSL_UNLOCK(); - goto retry; - - } else if(!IS_PS_OK_TO_USE(ps) && ps->ps_clcount) { - /* - * PS_CAN_USE is only reset from the emergency segment when it's - * been successfully recovered. So it's legal to have an emergency - * segment that has PS_CAN_USE but no clusters because it's recovery - * failed. - */ - backing_store_t bs = ps->ps_bs; - ps->ps_state |= PS_CAN_USE; - if(ps_select_array[bs->bs_priority] == BS_FULLPRI || - ps_select_array[bs->bs_priority] == BS_NOPRI) { - ps_select_array[bs->bs_priority] = 0; - } - dp_pages_free += ps->ps_pgcount; - dp_pages_reserve -= ps->ps_pgcount; - PS_UNLOCK(ps); - PSL_UNLOCK(); - dprintf(("Switching ON Emergency paging segment\n")); - goto retry; - } - - PS_UNLOCK(ps); - PSL_UNLOCK(); - } - } - - /* - * Emit a notification of the low-paging resource condition - * but don't issue it more than once every five seconds. This - * prevents us from overflowing logs with thousands of - * repetitions of the message. - */ - clock_get_system_nanotime(&now, &nanoseconds_dummy); - if (paging_segment_count > 1 && (now > lastnotify + 5)) { - /* With an activated emergency paging segment we still - * didn't get any clusters. 
This could mean that the - * emergency paging segment is exhausted. - */ - dprintf(("System is out of paging space.\n")); - lastnotify = now; - } - - PSL_LOCK(); - - if(min_pages_trigger_port) { - trigger = min_pages_trigger_port; - min_pages_trigger_port = NULL; - bs_low = TRUE; - backing_store_abort_compaction = TRUE; - } - PSL_UNLOCK(); - if (trigger != IP_NULL) { - dprintf(("ps_allocate_cluster - send HI_WAT_ALERT\n")); - - default_pager_space_alert(trigger, HI_WAT_ALERT); - ipc_port_release_send(trigger); - } - return (dp_offset_t) -1; - } - - /* - * Look for an available cluster. At the end of the loop, - * byte_num is the byte offset and bit_num is the bit offset of the - * first zero bit in the paging segment bitmap. - */ - PS_LOCK(ps); - byte_num = ps->ps_hint; - for (; byte_num < howmany(ps->ps_ncls, NBBY); byte_num++) { - if (*(ps->ps_bmap + byte_num) != BYTEMASK) { - for (bit_num = 0; bit_num < NBBY; bit_num++) { - if (isclr((ps->ps_bmap + byte_num), bit_num)) - break; - } - ASSERT(bit_num != NBBY); - break; - } - } - ps->ps_hint = byte_num; - cluster = (byte_num*NBBY) + bit_num; - - /* Space was reserved, so this must be true */ - ASSERT(cluster < ps->ps_ncls); - - setbit(ps->ps_bmap, cluster); - PS_UNLOCK(ps); - - return cluster; -} - -void ps_deallocate_cluster(paging_segment_t, dp_offset_t); /* forward */ - -void -ps_deallocate_cluster( - paging_segment_t ps, - dp_offset_t cluster) -{ - - if (cluster >= ps->ps_ncls) - panic("ps_deallocate_cluster: Invalid cluster number"); - - /* - * Lock the paging segment, clear the cluster's bitmap and increment the - * number of free cluster. - */ - PSL_LOCK(); - PS_LOCK(ps); - clrbit(ps->ps_bmap, cluster); - if( IS_PS_OK_TO_USE(ps)) { - ++ps->ps_clcount; - ps->ps_pgcount += 1 << ps->ps_clshift; - dp_pages_free += 1 << ps->ps_clshift; - } else { - ps->ps_special_clusters += 1; - } - - /* - * Move the hint down to the freed cluster if it is - * less than the current hint. 
- */ - if ((cluster/NBBY) < ps->ps_hint) { - ps->ps_hint = (cluster/NBBY); - } - - - /* - * If we're freeing space on a full priority, reset the array. - */ - if ( IS_PS_OK_TO_USE(ps) && ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI) - ps_select_array[ps->ps_bs->bs_priority] = 0; - PS_UNLOCK(ps); - PSL_UNLOCK(); - - return; -} - -void ps_dealloc_vsmap(struct vs_map *, dp_size_t); /* forward */ - -void -ps_dealloc_vsmap( - struct vs_map *vsmap, - dp_size_t size) -{ - unsigned int i; - struct ps_vnode_trim_data trim_data; - - ps_vnode_trim_init(&trim_data); - - for (i = 0; i < size; i++) { - if (!VSM_ISCLR(vsmap[i]) && !VSM_ISERR(vsmap[i])) { - ps_vnode_trim_more(&trim_data, - &vsmap[i], - VSM_PS(vsmap[i])->ps_clshift, - vm_page_size << VSM_PS(vsmap[i])->ps_clshift); - ps_deallocate_cluster(VSM_PS(vsmap[i]), - VSM_CLOFF(vsmap[i])); - } else { - ps_vnode_trim_now(&trim_data); - } - } - ps_vnode_trim_now(&trim_data); -} - -void -ps_vstruct_dealloc( - vstruct_t vs) -{ - unsigned int i; -// spl_t s; - - VS_MAP_LOCK(vs); - - /* - * If this is an indirect structure, then we walk through the valid - * (non-zero) indirect pointers and deallocate the clusters - * associated with each used map entry (via ps_dealloc_vsmap). - * When all of the clusters in an indirect block have been - * freed, we deallocate the block. When all of the indirect - * blocks have been deallocated we deallocate the memory - * holding the indirect pointers. - */ - if (vs->vs_indirect) { - for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) { - if (vs->vs_imap[i] != NULL) { - ps_dealloc_vsmap(vs->vs_imap[i], CLMAP_ENTRIES); - kfree(vs->vs_imap[i], CLMAP_THRESHOLD); - } - } - kfree(vs->vs_imap, INDIRECT_CLMAP_SIZE(vs->vs_size)); - } else { - /* - * Direct map. Free used clusters, then memory. 
- */ - ps_dealloc_vsmap(vs->vs_dmap, vs->vs_size); - kfree(vs->vs_dmap, CLMAP_SIZE(vs->vs_size)); - } - VS_MAP_UNLOCK(vs); - - bs_commit(- vs->vs_size); - - VS_MAP_LOCK_DESTROY(vs); - - zfree(vstruct_zone, vs); -} - -kern_return_t -ps_vstruct_reclaim( - vstruct_t vs, - boolean_t return_to_vm, - boolean_t reclaim_backing_store) -{ - unsigned int i, j; - struct vs_map *vsmap; - boolean_t vsmap_all_clear, vsimap_all_clear; - struct vm_object_fault_info fault_info; - int clmap_off; - unsigned int vsmap_size; - kern_return_t kr = KERN_SUCCESS; - - VS_MAP_LOCK(vs); - - fault_info.cluster_size = VM_SUPER_CLUSTER; - fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL; - fault_info.user_tag = 0; - fault_info.pmap_options = 0; - fault_info.lo_offset = 0; - fault_info.hi_offset = ptoa_32(vs->vs_size << vs->vs_clshift); - fault_info.io_sync = reclaim_backing_store; - fault_info.batch_pmap_op = FALSE; - - /* - * If this is an indirect structure, then we walk through the valid - * (non-zero) indirect pointers and deallocate the clusters - * associated with each used map entry (via ps_dealloc_vsmap). - * When all of the clusters in an indirect block have been - * freed, we deallocate the block. When all of the indirect - * blocks have been deallocated we deallocate the memory - * holding the indirect pointers. 
- */ - if (vs->vs_indirect) { - vsimap_all_clear = TRUE; - for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) { - vsmap = vs->vs_imap[i]; - if (vsmap == NULL) - continue; - /* loop on clusters in this indirect map */ - clmap_off = (vm_page_size * CLMAP_ENTRIES * - VSCLSIZE(vs) * i); - if (i+1 == INDIRECT_CLMAP_ENTRIES(vs->vs_size)) - vsmap_size = vs->vs_size - (CLMAP_ENTRIES * i); - else - vsmap_size = CLMAP_ENTRIES; - vsmap_all_clear = TRUE; - if (return_to_vm) { - for (j = 0; j < vsmap_size;) { - if (VSM_ISCLR(vsmap[j]) || - VSM_ISERR(vsmap[j])) { - j++; - clmap_off += vm_page_size * VSCLSIZE(vs); - continue; - } - VS_MAP_UNLOCK(vs); - kr = pvs_cluster_read( - vs, - clmap_off, - (dp_size_t) -1, /* read whole cluster */ - &fault_info); - - VS_MAP_LOCK(vs); /* XXX what if it changed ? */ - if (kr != KERN_SUCCESS) { - vsmap_all_clear = FALSE; - vsimap_all_clear = FALSE; - - kr = KERN_MEMORY_ERROR; - goto out; - } - } - } - if (vsmap_all_clear) { - ps_dealloc_vsmap(vsmap, CLMAP_ENTRIES); - kfree(vsmap, CLMAP_THRESHOLD); - vs->vs_imap[i] = NULL; - } - } - if (vsimap_all_clear) { -// kfree(vs->vs_imap, INDIRECT_CLMAP_SIZE(vs->vs_size)); - } - } else { - /* - * Direct map. Free used clusters, then memory. - */ - vsmap = vs->vs_dmap; - if (vsmap == NULL) { - goto out; - } - vsmap_all_clear = TRUE; - /* loop on clusters in the direct map */ - if (return_to_vm) { - for (j = 0; j < vs->vs_size;) { - if (VSM_ISCLR(vsmap[j]) || - VSM_ISERR(vsmap[j])) { - j++; - continue; - } - clmap_off = vm_page_size * (j << vs->vs_clshift); - VS_MAP_UNLOCK(vs); - kr = pvs_cluster_read( - vs, - clmap_off, - (dp_size_t) -1, /* read whole cluster */ - &fault_info); - - VS_MAP_LOCK(vs); /* XXX what if it changed ? 
*/ - if (kr != KERN_SUCCESS) { - vsmap_all_clear = FALSE; - - kr = KERN_MEMORY_ERROR; - goto out; - } else { -// VSM_CLR(vsmap[j]); - } - } - } - if (vsmap_all_clear) { - ps_dealloc_vsmap(vs->vs_dmap, vs->vs_size); -// kfree(vs->vs_dmap, CLMAP_SIZE(vs->vs_size)); - } - } -out: - VS_MAP_UNLOCK(vs); - - return kr; -} - -int ps_map_extend(vstruct_t, unsigned int); /* forward */ - -int ps_map_extend( - vstruct_t vs, - unsigned int new_size) -{ - struct vs_map **new_imap; - struct vs_map *new_dmap = NULL; - int newdsize; - int i; - void *old_map = NULL; - int old_map_size = 0; - - if (vs->vs_size >= new_size) { - /* - * Someone has already done the work. - */ - return 0; - } - - /* - * If the new size extends into the indirect range, then we have one - * of two cases: we are going from indirect to indirect, or we are - * going from direct to indirect. If we are going from indirect to - * indirect, then it is possible that the new size will fit in the old - * indirect map. If this is the case, then just reset the size of the - * vstruct map and we are done. If the new size will not - * fit into the old indirect map, then we have to allocate a new - * indirect map and copy the old map pointers into this new map. - * - * If we are going from direct to indirect, then we have to allocate a - * new indirect map and copy the old direct pages into the first - * indirect page of the new map. - * NOTE: allocating memory here is dangerous, as we're in the - * pageout path. - */ - if (INDIRECT_CLMAP(new_size)) { - int new_map_size = INDIRECT_CLMAP_SIZE(new_size); - - /* - * Get a new indirect map and zero it. 
- */ - old_map_size = INDIRECT_CLMAP_SIZE(vs->vs_size); - if (vs->vs_indirect && - (new_map_size == old_map_size)) { - bs_commit(new_size - vs->vs_size); - vs->vs_size = new_size; - return 0; - } - - new_imap = (struct vs_map **)kalloc(new_map_size); - if (new_imap == NULL) { - return -1; - } - memset(new_imap, 0, new_map_size); - - if (vs->vs_indirect) { - /* Copy old entries into new map */ - memcpy(new_imap, vs->vs_imap, old_map_size); - /* Arrange to free the old map */ - old_map = (void *) vs->vs_imap; - newdsize = 0; - } else { /* Old map was a direct map */ - /* Allocate an indirect page */ - if ((new_imap[0] = (struct vs_map *) - kalloc(CLMAP_THRESHOLD)) == NULL) { - kfree(new_imap, new_map_size); - return -1; - } - new_dmap = new_imap[0]; - newdsize = CLMAP_ENTRIES; - } - } else { - new_imap = NULL; - newdsize = new_size; - /* - * If the new map is a direct map, then the old map must - * also have been a direct map. All we have to do is - * to allocate a new direct map, copy the old entries - * into it and free the old map. - */ - if ((new_dmap = (struct vs_map *) - kalloc(CLMAP_SIZE(new_size))) == NULL) { - return -1; - } - } - if (newdsize) { - - /* Free the old map */ - old_map = (void *) vs->vs_dmap; - old_map_size = CLMAP_SIZE(vs->vs_size); - - /* Copy info from the old map into the new map */ - memcpy(new_dmap, vs->vs_dmap, old_map_size); - - /* Initialize the rest of the new map */ - for (i = vs->vs_size; i < newdsize; i++) - VSM_CLR(new_dmap[i]); - } - if (new_imap) { - vs->vs_imap = new_imap; - vs->vs_indirect = TRUE; - } else - vs->vs_dmap = new_dmap; - bs_commit(new_size - vs->vs_size); - vs->vs_size = new_size; - if (old_map) - kfree(old_map, old_map_size); - return 0; -} - -dp_offset_t -ps_clmap( - vstruct_t vs, - dp_offset_t offset, - struct clmap *clmap, - int flag, - dp_size_t size, - int error) -{ - dp_offset_t cluster; /* The cluster of offset. */ - dp_offset_t newcl; /* The new cluster allocated. 
*/ - dp_offset_t newoff; - unsigned int i; - struct vs_map *vsmap; - - VS_MAP_LOCK(vs); - - ASSERT(vs->vs_dmap); - cluster = atop_32(offset) >> vs->vs_clshift; - - /* - * Initialize cluster error value - */ - clmap->cl_error = 0; - - /* - * If the object has grown, extend the page map. - */ - if (cluster >= vs->vs_size) { - if (flag == CL_FIND) { - /* Do not allocate if just doing a lookup */ - VS_MAP_UNLOCK(vs); - return (dp_offset_t) -1; - } - if (ps_map_extend(vs, cluster + 1)) { - VS_MAP_UNLOCK(vs); - return (dp_offset_t) -1; - } - } - - /* - * Look for the desired cluster. If the map is indirect, then we - * have a two level lookup. First find the indirect block, then - * find the actual cluster. If the indirect block has not yet - * been allocated, then do so. If the cluster has not yet been - * allocated, then do so. - * - * If any of the allocations fail, then return an error. - * Don't allocate if just doing a lookup. - */ - if (vs->vs_indirect) { - long ind_block = cluster/CLMAP_ENTRIES; - - /* Is the indirect block allocated? */ - vsmap = vs->vs_imap[ind_block]; - if (vsmap == NULL) { - if (flag == CL_FIND) { - VS_MAP_UNLOCK(vs); - return (dp_offset_t) -1; - } - - /* Allocate the indirect block */ - vsmap = (struct vs_map *) kalloc(CLMAP_THRESHOLD); - if (vsmap == NULL) { - VS_MAP_UNLOCK(vs); - return (dp_offset_t) -1; - } - /* Initialize the cluster offsets */ - for (i = 0; i < CLMAP_ENTRIES; i++) - VSM_CLR(vsmap[i]); - vs->vs_imap[ind_block] = vsmap; - } - } else - vsmap = vs->vs_dmap; - - ASSERT(vsmap); - vsmap += cluster%CLMAP_ENTRIES; - - /* - * At this point, vsmap points to the struct vs_map desired. - * - * Look in the map for the cluster, if there was an error on a - * previous write, flag it and return. If it is not yet - * allocated, then allocate it, if we're writing; if we're - * doing a lookup and the cluster's not allocated, return error. 
- */ - if (VSM_ISERR(*vsmap)) { - clmap->cl_error = VSM_GETERR(*vsmap); - VS_MAP_UNLOCK(vs); - return (dp_offset_t) -1; - } else if (VSM_ISCLR(*vsmap)) { - int psindex; - - if (flag == CL_FIND) { - /* - * If there's an error and the entry is clear, then - * we've run out of swap space. Record the error - * here and return. - */ - if (error) { - VSM_SETERR(*vsmap, error); - } - VS_MAP_UNLOCK(vs); - return (dp_offset_t) -1; - } else { - /* - * Attempt to allocate a cluster from the paging segment - */ - newcl = ps_allocate_cluster(vs, &psindex, - PAGING_SEGMENT_NULL); - if (newcl == (dp_offset_t) -1) { - VS_MAP_UNLOCK(vs); - return (dp_offset_t) -1; - } - VSM_CLR(*vsmap); - VSM_SETCLOFF(*vsmap, newcl); - VSM_SETPS(*vsmap, psindex); - } - } else - newcl = VSM_CLOFF(*vsmap); - - /* - * Fill in pertinent fields of the clmap - */ - clmap->cl_ps = VSM_PS(*vsmap); - clmap->cl_numpages = VSCLSIZE(vs); - clmap->cl_bmap.clb_map = (unsigned int) VSM_BMAP(*vsmap); - - /* - * Byte offset in paging segment is byte offset to cluster plus - * byte offset within cluster. It looks ugly, but should be - * relatively quick. - */ - ASSERT(trunc_page(offset) == offset); - newcl = ptoa_32(newcl) << vs->vs_clshift; - newoff = offset & ((1<<(vm_page_shift + vs->vs_clshift)) - 1); - if (flag == CL_ALLOC) { - /* - * set bits in the allocation bitmap according to which - * pages were requested. size is in bytes. - */ - i = atop_32(newoff); - while ((size > 0) && (i < VSCLSIZE(vs))) { - VSM_SETALLOC(*vsmap, i); - i++; - size -= vm_page_size; - } - } - clmap->cl_alloc.clb_map = (unsigned int) VSM_ALLOC(*vsmap); - if (newoff) { - /* - * Offset is not cluster aligned, so number of pages - * and bitmaps must be adjusted - */ - clmap->cl_numpages -= atop_32(newoff); - CLMAP_SHIFT(clmap, vs); - CLMAP_SHIFTALLOC(clmap, vs); - } - - /* - * - * The setting of valid bits and handling of write errors - * must be done here, while we hold the lock on the map. 
- * It logically should be done in ps_vs_write_complete(). - * The size and error information has been passed from - * ps_vs_write_complete(). If the size parameter is non-zero, - * then there is work to be done. If error is also non-zero, - * then the error number is recorded in the cluster and the - * entire cluster is in error. - */ - if (size && flag == CL_FIND) { - dp_offset_t off = (dp_offset_t) 0; - - if (!error) { - for (i = VSCLSIZE(vs) - clmap->cl_numpages; size > 0; - i++) { - VSM_SETPG(*vsmap, i); - size -= vm_page_size; - } - ASSERT(i <= VSCLSIZE(vs)); - } else { - BS_STAT(clmap->cl_ps->ps_bs, - clmap->cl_ps->ps_bs->bs_pages_out_fail += - atop_32(size)); - off = VSM_CLOFF(*vsmap); - VSM_SETERR(*vsmap, error); - } - /* - * Deallocate cluster if error, and no valid pages - * already present. - */ - if (off != (dp_offset_t) 0) - ps_deallocate_cluster(clmap->cl_ps, off); - VS_MAP_UNLOCK(vs); - return (dp_offset_t) 0; - } else - VS_MAP_UNLOCK(vs); - - DP_DEBUG(DEBUG_VS_INTERNAL, - ("returning 0x%X,vs=0x%X,vsmap=0x%X,flag=%d\n", - newcl+newoff, (int) vs, (int) vsmap, flag)); - DP_DEBUG(DEBUG_VS_INTERNAL, - (" clmap->cl_ps=0x%X,cl_numpages=%d,clbmap=0x%x,cl_alloc=%x\n", - (int) clmap->cl_ps, clmap->cl_numpages, - (int) clmap->cl_bmap.clb_map, (int) clmap->cl_alloc.clb_map)); - - return (newcl + newoff); -} - -void ps_clunmap(vstruct_t, dp_offset_t, dp_size_t); /* forward */ - -void -ps_clunmap( - vstruct_t vs, - dp_offset_t offset, - dp_size_t length) -{ - dp_offset_t cluster; /* The cluster number of offset */ - struct vs_map *vsmap; - struct ps_vnode_trim_data trim_data; - - ps_vnode_trim_init(&trim_data); - - VS_MAP_LOCK(vs); - - /* - * Loop through all clusters in this range, freeing paging segment - * clusters and map entries as encountered. 
- */ - while (length > 0) { - dp_offset_t newoff; - unsigned int i; - - cluster = atop_32(offset) >> vs->vs_clshift; - if (vs->vs_indirect) /* indirect map */ - vsmap = vs->vs_imap[cluster/CLMAP_ENTRIES]; - else - vsmap = vs->vs_dmap; - if (vsmap == NULL) { - ps_vnode_trim_now(&trim_data); - VS_MAP_UNLOCK(vs); - return; - } - vsmap += cluster%CLMAP_ENTRIES; - if (VSM_ISCLR(*vsmap)) { - ps_vnode_trim_now(&trim_data); - length -= vm_page_size; - offset += vm_page_size; - continue; - } - /* - * We've got a valid mapping. Clear it and deallocate - * paging segment cluster pages. - * Optimize for entire cluster cleraing. - */ - if ( (newoff = (offset&((1<<(vm_page_shift+vs->vs_clshift))-1))) ) { - /* - * Not cluster aligned. - */ - ASSERT(trunc_page(newoff) == newoff); - i = atop_32(newoff); - } else - i = 0; - while ((i < VSCLSIZE(vs)) && (length > 0)) { - VSM_CLRPG(*vsmap, i); - VSM_CLRALLOC(*vsmap, i); - length -= vm_page_size; - offset += vm_page_size; - i++; - } - - /* - * If map entry is empty, clear and deallocate cluster. - */ - if (!VSM_BMAP(*vsmap)) { - ps_vnode_trim_more(&trim_data, - vsmap, - vs->vs_clshift, - VSCLSIZE(vs) * vm_page_size); - ps_deallocate_cluster(VSM_PS(*vsmap), - VSM_CLOFF(*vsmap)); - VSM_CLR(*vsmap); - } else { - ps_vnode_trim_now(&trim_data); - } - } - ps_vnode_trim_now(&trim_data); - - VS_MAP_UNLOCK(vs); -} - -void ps_vs_write_complete(vstruct_t, dp_offset_t, dp_size_t, int); /* forward */ - -void -ps_vs_write_complete( - vstruct_t vs, - dp_offset_t offset, - dp_size_t size, - int error) -{ - struct clmap clmap; - - /* - * Get the struct vsmap for this cluster. - * Use READ, even though it was written, because the - * cluster MUST be present, unless there was an error - * in the original ps_clmap (e.g. no space), in which - * case, nothing happens. - * - * Must pass enough information to ps_clmap to allow it - * to set the vs_map structure bitmap under lock. 
- */ - (void) ps_clmap(vs, offset, &clmap, CL_FIND, size, error); -} - -void vs_cl_write_complete(vstruct_t, paging_segment_t, dp_offset_t, vm_offset_t, dp_size_t, boolean_t, int); /* forward */ - -void -vs_cl_write_complete( - vstruct_t vs, - __unused paging_segment_t ps, - dp_offset_t offset, - __unused vm_offset_t addr, - dp_size_t size, - boolean_t async, - int error) -{ -// kern_return_t kr; - - if (error) { - /* - * For internal objects, the error is recorded on a - * per-cluster basis by ps_clmap() which is called - * by ps_vs_write_complete() below. - */ - dprintf(("write failed error = 0x%x\n", error)); - /* add upl_abort code here */ - } else - GSTAT(global_stats.gs_pages_out += atop_32(size)); - /* - * Notify the vstruct mapping code, so it can do its accounting. - */ - ps_vs_write_complete(vs, offset, size, error); - - if (async) { - VS_LOCK(vs); - ASSERT(vs->vs_async_pending > 0); - vs->vs_async_pending -= size; - if (vs->vs_async_pending == 0 && vs->vs_waiting_async) { - vs->vs_waiting_async = FALSE; - VS_UNLOCK(vs); - thread_wakeup(&vs->vs_async_pending); - } else { - VS_UNLOCK(vs); - } - } -} - -#ifdef DEVICE_PAGING -kern_return_t device_write_reply(MACH_PORT_FACE, kern_return_t, io_buf_len_t); - -kern_return_t -device_write_reply( - MACH_PORT_FACE reply_port, - kern_return_t device_code, - io_buf_len_t bytes_written) -{ - struct vs_async *vsa; - - vsa = (struct vs_async *) - ((struct vstruct_alias *)(reply_port->ip_alias))->vs; - - if (device_code == KERN_SUCCESS && bytes_written != vsa->vsa_size) { - device_code = KERN_FAILURE; - } - - vsa->vsa_error = device_code; - - - ASSERT(vsa->vsa_vs != VSTRUCT_NULL); - if(vsa->vsa_flags & VSA_TRANSFER) { - /* revisit when async disk segments redone */ - if(vsa->vsa_error) { - /* need to consider error condition. re-write data or */ - /* throw it away here. 
*/ - vm_map_copy_discard((vm_map_copy_t)vsa->vsa_addr); - } - ps_vs_write_complete(vsa->vsa_vs, vsa->vsa_offset, - vsa->vsa_size, vsa->vsa_error); - } else { - vs_cl_write_complete(vsa->vsa_vs, vsa->vsa_ps, vsa->vsa_offset, - vsa->vsa_addr, vsa->vsa_size, TRUE, - vsa->vsa_error); - } - VS_FREE_ASYNC(vsa); - - return KERN_SUCCESS; -} - -kern_return_t device_write_reply_inband(MACH_PORT_FACE, kern_return_t, io_buf_len_t); -kern_return_t -device_write_reply_inband( - MACH_PORT_FACE reply_port, - kern_return_t return_code, - io_buf_len_t bytes_written) -{ - panic("device_write_reply_inband: illegal"); - return KERN_SUCCESS; -} - -kern_return_t device_read_reply(MACH_PORT_FACE, kern_return_t, io_buf_ptr_t, mach_msg_type_number_t); -kern_return_t -device_read_reply( - MACH_PORT_FACE reply_port, - kern_return_t return_code, - io_buf_ptr_t data, - mach_msg_type_number_t dataCnt) -{ - struct vs_async *vsa; - vsa = (struct vs_async *) - ((struct vstruct_alias *)(reply_port->defpager_importance.alias))->vs; - vsa->vsa_addr = (vm_offset_t)data; - vsa->vsa_size = (vm_size_t)dataCnt; - vsa->vsa_error = return_code; - thread_wakeup(&vsa); - return KERN_SUCCESS; -} - -kern_return_t device_read_reply_inband(MACH_PORT_FACE, kern_return_t, io_buf_ptr_inband_t, mach_msg_type_number_t); -kern_return_t -device_read_reply_inband( - MACH_PORT_FACE reply_port, - kern_return_t return_code, - io_buf_ptr_inband_t data, - mach_msg_type_number_t dataCnt) -{ - panic("device_read_reply_inband: illegal"); - return KERN_SUCCESS; -} - -kern_return_t device_read_reply_overwrite(MACH_PORT_FACE, kern_return_t, io_buf_len_t); -kern_return_t -device_read_reply_overwrite( - MACH_PORT_FACE reply_port, - kern_return_t return_code, - io_buf_len_t bytes_read) -{ - panic("device_read_reply_overwrite: illegal\n"); - return KERN_SUCCESS; -} - -kern_return_t device_open_reply(MACH_PORT_FACE, kern_return_t, MACH_PORT_FACE); -kern_return_t -device_open_reply( - MACH_PORT_FACE reply_port, - kern_return_t 
return_code, - MACH_PORT_FACE device_port) -{ - panic("device_open_reply: illegal\n"); - return KERN_SUCCESS; -} - -kern_return_t -ps_read_device( - paging_segment_t ps, - dp_offset_t offset, - vm_offset_t *bufferp, - unsigned int size, - unsigned int *residualp, - int flags) -{ - kern_return_t kr; - recnum_t dev_offset; - unsigned int bytes_wanted; - unsigned int bytes_read; - unsigned int total_read; - vm_offset_t dev_buffer; - vm_offset_t buf_ptr; - unsigned int records_read; - struct vs_async *vsa; - - device_t device; - vm_map_copy_t device_data = NULL; - default_pager_thread_t *dpt = NULL; - - device = dev_port_lookup(ps->ps_device); - clustered_reads[atop_32(size)]++; - - dev_offset = (ps->ps_offset + - (offset >> (vm_page_shift - ps->ps_record_shift))); - bytes_wanted = size; - total_read = 0; - *bufferp = (vm_offset_t)NULL; - - do { - vsa = VS_ALLOC_ASYNC(); - if (vsa) { - vsa->vsa_vs = NULL; - vsa->vsa_addr = 0; - vsa->vsa_offset = 0; - vsa->vsa_size = 0; - vsa->vsa_ps = NULL; - } - ip_lock(vsa->reply_port); - vsa->reply_port->ip_sorights++; - ip_reference(vsa->reply_port); - ip_unlock(vsa->reply_port); - kr = ds_device_read_common(device, - vsa->reply_port, - (mach_msg_type_name_t) - MACH_MSG_TYPE_MOVE_SEND_ONCE, - (dev_mode_t) 0, - dev_offset, - bytes_wanted, - (IO_READ | IO_CALL), - (io_buf_ptr_t *) &dev_buffer, - (mach_msg_type_number_t *) &bytes_read); - if(kr == MIG_NO_REPLY) { - assert_wait(&vsa, THREAD_UNINT); - thread_block(THREAD_CONTINUE_NULL); - - dev_buffer = vsa->vsa_addr; - bytes_read = (unsigned int)vsa->vsa_size; - kr = vsa->vsa_error; - } - VS_FREE_ASYNC(vsa); - if (kr != KERN_SUCCESS || bytes_read == 0) { - break; - } - total_read += bytes_read; - - /* - * If we got the entire range, use the returned dev_buffer. 
- */ - if (bytes_read == size) { - *bufferp = (vm_offset_t)dev_buffer; - break; - } - -#if 1 - dprintf(("read only %d bytes out of %d\n", - bytes_read, bytes_wanted)); -#endif - if(dpt == NULL) { - dpt = get_read_buffer(); - buf_ptr = dpt->dpt_buffer; - *bufferp = (vm_offset_t)buf_ptr; - } - /* - * Otherwise, copy the data into the provided buffer (*bufferp) - * and append the rest of the range as it comes in. - */ - memcpy((void *) buf_ptr, (void *) dev_buffer, bytes_read); - buf_ptr += bytes_read; - bytes_wanted -= bytes_read; - records_read = (bytes_read >> - (vm_page_shift - ps->ps_record_shift)); - dev_offset += records_read; - DP_DEBUG(DEBUG_VS_INTERNAL, - ("calling vm_deallocate(addr=0x%X,size=0x%X)\n", - dev_buffer, bytes_read)); - if (vm_deallocate(kernel_map, dev_buffer, bytes_read) - != KERN_SUCCESS) - Panic("dealloc buf"); - } while (bytes_wanted); - - *residualp = size - total_read; - if((dev_buffer != *bufferp) && (total_read != 0)) { - vm_offset_t temp_buffer; - vm_allocate(kernel_map, &temp_buffer, total_read, VM_FLAGS_ANYWHERE | VM_MAKE_TAG(VM_KERN_MEMORY_OSFMK)); - memcpy((void *) temp_buffer, (void *) *bufferp, total_read); - if(vm_map_copyin_page_list(kernel_map, temp_buffer, total_read, - VM_MAP_COPYIN_OPT_SRC_DESTROY | - VM_MAP_COPYIN_OPT_STEAL_PAGES | - VM_MAP_COPYIN_OPT_PMAP_ENTER, - (vm_map_copy_t *)&device_data, FALSE)) - panic("ps_read_device: cannot copyin locally provided buffer\n"); - } - else if((kr == KERN_SUCCESS) && (total_read != 0) && (dev_buffer != 0)){ - if(vm_map_copyin_page_list(kernel_map, dev_buffer, bytes_read, - VM_MAP_COPYIN_OPT_SRC_DESTROY | - VM_MAP_COPYIN_OPT_STEAL_PAGES | - VM_MAP_COPYIN_OPT_PMAP_ENTER, - (vm_map_copy_t *)&device_data, FALSE)) - panic("ps_read_device: cannot copyin backing store provided buffer\n"); - } - else { - device_data = NULL; - } - *bufferp = (vm_offset_t)device_data; - - if(dpt != NULL) { - /* Free the receive buffer */ - dpt->checked_out = 0; - thread_wakeup(&dpt_array); - } - return 
KERN_SUCCESS; -} - -kern_return_t -ps_write_device( - paging_segment_t ps, - dp_offset_t offset, - vm_offset_t addr, - unsigned int size, - struct vs_async *vsa) -{ - recnum_t dev_offset; - io_buf_len_t bytes_to_write, bytes_written; - recnum_t records_written; - kern_return_t kr; - MACH_PORT_FACE reply_port; - - - - clustered_writes[atop_32(size)]++; - - dev_offset = (ps->ps_offset + - (offset >> (vm_page_shift - ps->ps_record_shift))); - bytes_to_write = size; - - if (vsa) { - /* - * Asynchronous write. - */ - reply_port = vsa->reply_port; - ip_lock(reply_port); - reply_port->ip_sorights++; - ip_reference(reply_port); - ip_unlock(reply_port); - { - device_t device; - device = dev_port_lookup(ps->ps_device); - - vsa->vsa_addr = addr; - kr=ds_device_write_common(device, - reply_port, - (mach_msg_type_name_t) MACH_MSG_TYPE_MOVE_SEND_ONCE, - (dev_mode_t) 0, - dev_offset, - (io_buf_ptr_t) addr, - size, - (IO_WRITE | IO_CALL), - &bytes_written); - } - if ((kr != KERN_SUCCESS) && (kr != MIG_NO_REPLY)) { - if (verbose) - dprintf(("%s0x%x, addr=0x%x," - "size=0x%x,offset=0x%x\n", - "device_write_request returned ", - kr, addr, size, offset)); - BS_STAT(ps->ps_bs, - ps->ps_bs->bs_pages_out_fail += atop_32(size)); - /* do the completion notification to free resources */ - device_write_reply(reply_port, kr, 0); - return PAGER_ERROR; - } - } else do { - /* - * Synchronous write. 
- */ - { - device_t device; - device = dev_port_lookup(ps->ps_device); - kr=ds_device_write_common(device, - IP_NULL, 0, - (dev_mode_t) 0, - dev_offset, - (io_buf_ptr_t) addr, - size, - (IO_WRITE | IO_SYNC | IO_KERNEL_BUF), - &bytes_written); - } - if (kr != KERN_SUCCESS) { - dprintf(("%s0x%x, addr=0x%x,size=0x%x,offset=0x%x\n", - "device_write returned ", - kr, addr, size, offset)); - BS_STAT(ps->ps_bs, - ps->ps_bs->bs_pages_out_fail += atop_32(size)); - return PAGER_ERROR; - } - if (bytes_written & ((vm_page_size >> ps->ps_record_shift) - 1)) - Panic("fragmented write"); - records_written = (bytes_written >> - (vm_page_shift - ps->ps_record_shift)); - dev_offset += records_written; -#if 1 - if (bytes_written != bytes_to_write) { - dprintf(("wrote only %d bytes out of %d\n", - bytes_written, bytes_to_write)); - } -#endif - bytes_to_write -= bytes_written; - addr += bytes_written; - } while (bytes_to_write > 0); - - return PAGER_SUCCESS; -} - - -#else /* !DEVICE_PAGING */ - -kern_return_t -ps_read_device( - __unused paging_segment_t ps, - __unused dp_offset_t offset, - __unused vm_offset_t *bufferp, - __unused unsigned int size, - __unused unsigned int *residualp, - __unused int flags) -{ - panic("ps_read_device not supported"); - return KERN_FAILURE; -} - -kern_return_t -ps_write_device( - __unused paging_segment_t ps, - __unused dp_offset_t offset, - __unused vm_offset_t addr, - __unused unsigned int size, - __unused struct vs_async *vsa) -{ - panic("ps_write_device not supported"); - return KERN_FAILURE; -} - -#endif /* DEVICE_PAGING */ -void pvs_object_data_provided(vstruct_t, upl_t, upl_offset_t, upl_size_t); /* forward */ - -void -pvs_object_data_provided( - __unused vstruct_t vs, - __unused upl_t upl, - __unused upl_offset_t offset, - upl_size_t size) -{ -#if RECLAIM_SWAP - boolean_t empty; -#endif - - DP_DEBUG(DEBUG_VS_INTERNAL, - ("buffer=0x%x,offset=0x%x,size=0x%x\n", - upl, offset, size)); - - ASSERT(size > 0); - GSTAT(global_stats.gs_pages_in += 
atop_32(size)); - -/* check upl iosync flag instead of using RECLAIM_SWAP*/ -#if RECLAIM_SWAP - if (size != upl->size) { - if (size) { - ps_clunmap(vs, offset, size); - upl_commit_range(upl, 0, size, 0, NULL, 0, &empty); - } - upl_abort(upl, UPL_ABORT_ERROR); - upl_deallocate(upl); - } else { - ps_clunmap(vs, offset, size); - upl_commit(upl, NULL, 0); - upl_deallocate(upl); - } -#endif /* RECLAIM_SWAP */ - -} - -static memory_object_offset_t last_start; -static vm_size_t last_length; - -/* - * A "cnt" of 0 means that the caller just wants to check if the page at - * offset "vs_offset" exists in the backing store. That page hasn't been - * prepared, so no need to release it. - * - * A "cnt" of -1 means that the caller wants to bring back from the backing - * store all existing pages in the cluster containing "vs_offset". - */ -kern_return_t -pvs_cluster_read( - vstruct_t vs, - dp_offset_t vs_offset, - dp_size_t cnt, - void *fault_info) -{ - kern_return_t error = KERN_SUCCESS; - unsigned int size; - unsigned int residual; - unsigned int request_flags; - int io_flags = 0; - int seg_index; - int pages_in_cl; - int cl_size; - int cl_mask; - int cl_index; - unsigned int xfer_size; - dp_offset_t orig_vs_offset; - dp_offset_t ps_offset[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_MIN_CLSHIFT]; - paging_segment_t psp[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_MIN_CLSHIFT]; - struct clmap clmap; - upl_t upl; - unsigned int page_list_count; - memory_object_offset_t cluster_start; - vm_size_t cluster_length; - uint32_t io_streaming; - int i; - boolean_t io_sync = FALSE; - boolean_t reclaim_all = FALSE; - - pages_in_cl = 1 << vs->vs_clshift; - cl_size = pages_in_cl * vm_page_size; - cl_mask = cl_size - 1; - - request_flags = UPL_NO_SYNC | UPL_RET_ONLY_ABSENT | UPL_SET_LITE; - - if (cnt == (dp_size_t) -1) - reclaim_all = TRUE; - - if (reclaim_all == TRUE) { - /* - * We've been called from ps_vstruct_reclaim() to move all - * the object's swapped pages back to VM pages. 
- * This can put memory pressure on the system, so we do want - * to wait for free pages, to avoid getting in the way of the - * vm_pageout_scan() thread. - * Let's not use UPL_NOBLOCK in this case. - */ - vs_offset &= ~cl_mask; - i = pages_in_cl; - } else { - i = 1; - - /* - * if the I/O cluster size == PAGE_SIZE, we don't want to set - * the UPL_NOBLOCK since we may be trying to recover from a - * previous partial pagein I/O that occurred because we were low - * on memory and bailed early in order to honor the UPL_NOBLOCK... - * since we're only asking for a single page, we can block w/o fear - * of tying up pages while waiting for more to become available - */ - if (fault_info == NULL || ((vm_object_fault_info_t)fault_info)->cluster_size > PAGE_SIZE) - request_flags |= UPL_NOBLOCK; - } - -again: - cl_index = (vs_offset & cl_mask) / vm_page_size; - - if ((ps_clmap(vs, vs_offset & ~cl_mask, &clmap, CL_FIND, 0, 0) == (dp_offset_t)-1) || - !CLMAP_ISSET(clmap, cl_index)) { - /* - * the needed page doesn't exist in the backing store... - * we don't want to try to do any I/O, just abort the - * page and let the fault handler provide a zero-fill - */ - if (cnt == 0) { - /* - * The caller was just poking at us to see if - * the page has been paged out. No need to - * mess with the page at all. - * Just let the caller know we don't have that page. 
- */ - return KERN_FAILURE; - } - if (reclaim_all == TRUE) { - i--; - if (i == 0) { - /* no more pages in this cluster */ - return KERN_FAILURE; - } - /* try the next page in this cluster */ - vs_offset += vm_page_size; - goto again; - } - - page_list_count = 0; - - memory_object_super_upl_request(vs->vs_control, (memory_object_offset_t)vs_offset, - PAGE_SIZE, PAGE_SIZE, - &upl, NULL, &page_list_count, - request_flags | UPL_SET_INTERNAL); - upl_range_needed(upl, 0, 1); - - if (clmap.cl_error) - upl_abort(upl, UPL_ABORT_ERROR); - else - upl_abort(upl, UPL_ABORT_UNAVAILABLE); - upl_deallocate(upl); - - return KERN_SUCCESS; - } - - if (cnt == 0) { - /* - * The caller was just poking at us to see if - * the page has been paged out. No need to - * mess with the page at all. - * Just let the caller know we do have that page. - */ - return KERN_SUCCESS; - } - - if(((vm_object_fault_info_t)fault_info)->io_sync == TRUE ) { - io_sync = TRUE; - } else { -#if RECLAIM_SWAP - io_sync = TRUE; -#endif /* RECLAIM_SWAP */ - } - - if( io_sync == TRUE ) { - - io_flags |= UPL_IOSYNC | UPL_NOCOMMIT; -#if USE_PRECIOUS - request_flags |= UPL_PRECIOUS | UPL_CLEAN_IN_PLACE; -#else /* USE_PRECIOUS */ - request_flags |= UPL_REQUEST_SET_DIRTY; -#endif /* USE_PRECIOUS */ - } - - assert(dp_encryption_inited); - if (dp_encryption) { - /* - * ENCRYPTED SWAP: - * request that the UPL be prepared for - * decryption. - */ - request_flags |= UPL_ENCRYPT; - io_flags |= UPL_PAGING_ENCRYPTED; - } - orig_vs_offset = vs_offset; - - assert(cnt != 0); - cnt = VM_SUPER_CLUSTER; - cluster_start = (memory_object_offset_t) vs_offset; - cluster_length = (vm_size_t) cnt; - io_streaming = 0; - - /* - * determine how big a speculative I/O we should try for... 
- */ - if (memory_object_cluster_size(vs->vs_control, &cluster_start, &cluster_length, &io_streaming, (memory_object_fault_info_t)fault_info) == KERN_SUCCESS) { - assert(vs_offset >= (dp_offset_t) cluster_start && - vs_offset < (dp_offset_t) (cluster_start + cluster_length)); - vs_offset = (dp_offset_t) cluster_start; - cnt = (dp_size_t) cluster_length; - } else { - cluster_length = PAGE_SIZE; - cnt = PAGE_SIZE; - } - - if (io_streaming) - io_flags |= UPL_IOSTREAMING; - - last_start = cluster_start; - last_length = cluster_length; - - /* - * This loop will be executed multiple times until the entire - * range has been looked at or we issue an I/O... if the request spans cluster - * boundaries, the clusters will be checked for logical continunity, - * if contiguous the I/O request will span multiple clusters... - * at most only 1 I/O will be issued... it will encompass the original offset - */ - while (cnt && error == KERN_SUCCESS) { - int ps_info_valid; - - if ((vs_offset & cl_mask) && (cnt > (VM_SUPER_CLUSTER - (vs_offset & cl_mask)))) { - size = VM_SUPER_CLUSTER; - size -= vs_offset & cl_mask; - } else if (cnt > VM_SUPER_CLUSTER) - size = VM_SUPER_CLUSTER; - else - size = cnt; - - cnt -= size; - - ps_info_valid = 0; - seg_index = 0; - - while (size > 0 && error == KERN_SUCCESS) { - unsigned int abort_size; - unsigned int lsize; - int failed_size; - int beg_pseg; - int beg_indx; - dp_offset_t cur_offset; - - if ( !ps_info_valid) { - ps_offset[seg_index] = ps_clmap(vs, vs_offset & ~cl_mask, &clmap, CL_FIND, 0, 0); - psp[seg_index] = CLMAP_PS(clmap); - ps_info_valid = 1; - } - /* - * skip over unallocated physical segments - */ - if (ps_offset[seg_index] == (dp_offset_t) -1) { - abort_size = cl_size - (vs_offset & cl_mask); - abort_size = MIN(abort_size, size); - - size -= abort_size; - vs_offset += abort_size; - - seg_index++; - ps_info_valid = 0; - - continue; - } - cl_index = (vs_offset & cl_mask) / vm_page_size; - - for (abort_size = 0; cl_index < pages_in_cl && 
abort_size < size; cl_index++) { - /* - * skip over unallocated pages - */ - if (CLMAP_ISSET(clmap, cl_index)) - break; - abort_size += vm_page_size; - } - if (abort_size) { - size -= abort_size; - vs_offset += abort_size; - - if (cl_index == pages_in_cl) { - /* - * if we're at the end of this physical cluster - * then bump to the next one and continue looking - */ - seg_index++; - ps_info_valid = 0; - - continue; - } - if (size == 0) - break; - } - /* - * remember the starting point of the first allocated page - * for the I/O we're about to issue - */ - beg_pseg = seg_index; - beg_indx = cl_index; - cur_offset = vs_offset; - - /* - * calculate the size of the I/O that we can do... - * this may span multiple physical segments if - * they are contiguous - */ - for (xfer_size = 0; xfer_size < size; ) { - - while (cl_index < pages_in_cl && xfer_size < size) { - /* - * accumulate allocated pages within - * a physical segment - */ - if (CLMAP_ISSET(clmap, cl_index)) { - xfer_size += vm_page_size; - cur_offset += vm_page_size; - cl_index++; - - BS_STAT(psp[seg_index]->ps_bs, - psp[seg_index]->ps_bs->bs_pages_in++); - } else - break; - } - if (cl_index < pages_in_cl || xfer_size >= size) { - /* - * we've hit an unallocated page or - * the end of this request... see if - * it's time to fire the I/O - */ - break; - } - /* - * we've hit the end of the current physical - * segment and there's more to do, so try - * moving to the next one - */ - seg_index++; - - ps_offset[seg_index] = ps_clmap(vs, cur_offset & ~cl_mask, &clmap, CL_FIND, 0, 0); - psp[seg_index] = CLMAP_PS(clmap); - ps_info_valid = 1; - - if ((ps_offset[seg_index - 1] != (ps_offset[seg_index] - cl_size)) || (psp[seg_index - 1] != psp[seg_index])) { - /* - * if the physical segment we're about - * to step into is not contiguous to - * the one we're currently in, or it's - * in a different paging file, or - * it hasn't been allocated.... 
- * we stop this run and go check - * to see if it's time to fire the I/O - */ - break; - } - /* - * start with first page of the next physical - * segment - */ - cl_index = 0; - } - if (xfer_size == 0) { - /* - * no I/O to generate for this segment - */ - continue; - } - if (cur_offset <= orig_vs_offset) { - /* - * we've hit a hole in our speculative cluster - * before the offset that we're really after... - * don't issue the I/O since it doesn't encompass - * the original offset and we're looking to only - * pull in the speculative pages if they can be - * made part of a single I/O - */ - size -= xfer_size; - vs_offset += xfer_size; - - continue; - } - /* - * we have a contiguous range of allocated pages - * to read from that encompasses the original offset - */ - page_list_count = 0; - memory_object_super_upl_request(vs->vs_control, (memory_object_offset_t)vs_offset, - xfer_size, xfer_size, - &upl, NULL, &page_list_count, - request_flags | UPL_SET_INTERNAL); - - error = ps_read_file(psp[beg_pseg], - upl, (upl_offset_t) 0, - ps_offset[beg_pseg] + (beg_indx * vm_page_size), - xfer_size, &residual, io_flags); - - - /* - * Adjust counts and send response to VM. Optimize - * for the common case, i.e. no error and/or partial - * data. If there was an error, then we need to error - * the entire range, even if some data was successfully - * read. If there was a partial read we may supply some - * data and may error some as well. In all cases the - * VM must receive some notification for every page - * in the range. - */ - if ((error == KERN_SUCCESS) && (residual == 0)) { - /* - * Got everything we asked for, supply the data - * to the VM. Note that as a side effect of - * supplying the data, the buffer holding the - * supplied data is deallocated from the pager's - * address space. 
- */ - lsize = xfer_size; - failed_size = 0; - } else { - lsize = 0; - failed_size = xfer_size; - - if (error == KERN_SUCCESS) { - if (residual == xfer_size) { - /* - * If a read operation returns no error - * and no data moved, we turn it into - * an error, assuming we're reading at - * or beyong EOF. - * Fall through and error the entire range. - */ - error = KERN_FAILURE; - } else { - /* - * Otherwise, we have partial read. If - * the part read is a integral number - * of pages supply it. Otherwise round - * it up to a page boundary, zero fill - * the unread part, and supply it. - * Fall through and error the remainder - * of the range, if any. - */ - int fill; - - fill = residual & (vm_page_size - 1); - lsize = (xfer_size - residual) + fill; - - if (lsize < xfer_size) - failed_size = xfer_size - lsize; - - if (reclaim_all == FALSE) - error = KERN_FAILURE; - } - } - } - pvs_object_data_provided(vs, upl, vs_offset, lsize); - - if (failed_size) { - /* - * There was an error in some part of the range, tell - * the VM. Note that error is explicitly checked again - * since it can be modified above. - */ - BS_STAT(psp[beg_pseg]->ps_bs, - psp[beg_pseg]->ps_bs->bs_pages_in_fail += atop_32(failed_size)); - } - /* - * we've issued a single I/O that encompassed the original offset - * at this point we either met our speculative request length or - * we ran into a 'hole' (i.e. 
page not present in the cluster, cluster - * not present or not physically contiguous to the previous one), so - * we're done issuing I/O at this point - */ - return (error); - } - } - return error; -} - -int vs_do_async_write = 1; - -kern_return_t -vs_cluster_write( - vstruct_t vs, - upl_t internal_upl, - upl_offset_t offset, - upl_size_t cnt, - boolean_t dp_internal, - int flags) -{ - upl_size_t transfer_size; - int error = 0; - struct clmap clmap; - - dp_offset_t actual_offset; /* Offset within paging segment */ - paging_segment_t ps; - dp_offset_t mobj_base_addr; - dp_offset_t mobj_target_addr; - - upl_t upl; - upl_page_info_t *pl; - int page_index; - unsigned int page_max_index; - int list_size; - int pages_in_cl; - unsigned int cl_size; - int base_index; - unsigned int seg_size; - unsigned int upl_offset_in_object; - boolean_t minimal_clustering = FALSE; - boolean_t found_dirty; - - if (!dp_encryption_inited) { - /* - * ENCRYPTED SWAP: - * Once we've started using swap, we - * can't change our mind on whether - * it needs to be encrypted or - * not. - */ - dp_encryption_inited = TRUE; - } - if (dp_encryption) { - /* - * ENCRYPTED SWAP: - * the UPL will need to be encrypted... 
- */ - flags |= UPL_PAGING_ENCRYPTED; - } - - pages_in_cl = 1 << vs->vs_clshift; - cl_size = pages_in_cl * vm_page_size; - -#if CONFIG_FREEZE - minimal_clustering = TRUE; -#else - if (dp_isssd == TRUE) - minimal_clustering = TRUE; -#endif - if (!dp_internal) { - unsigned int page_list_count; - int request_flags; - unsigned int super_size; - int first_dirty; - int num_dirty; - int num_of_pages; - int seg_index; - upl_offset_t upl_offset; - upl_offset_t upl_offset_aligned; - dp_offset_t seg_offset; - dp_offset_t ps_offset[((VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_MIN_CLSHIFT) + 1]; - paging_segment_t psp[((VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_MIN_CLSHIFT) + 1]; - - - if (bs_low) - super_size = cl_size; - else - super_size = VM_SUPER_CLUSTER; - - request_flags = UPL_NOBLOCK | UPL_CLEAN_IN_PLACE | - UPL_RET_ONLY_DIRTY | UPL_COPYOUT_FROM | - UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE; - - if (dp_encryption) { - /* - * ENCRYPTED SWAP: - * request that the UPL be prepared for - * encryption. - */ - request_flags |= UPL_ENCRYPT; - flags |= UPL_PAGING_ENCRYPTED; - } - - page_list_count = 0; - memory_object_super_upl_request(vs->vs_control, - (memory_object_offset_t)offset, - cnt, super_size, - &upl, NULL, &page_list_count, - request_flags | UPL_FOR_PAGEOUT); - - /* - * The default pager does not handle objects larger than - * 4GB, so it does not deal with offset that don't fit in - * 32-bit. Cast down upl->offset now and make sure we - * did not lose any valuable bits. 
- */ - upl_offset_in_object = (unsigned int) upl->offset; - assert(upl->offset == upl_offset_in_object); - - pl = UPL_GET_INTERNAL_PAGE_LIST(upl); - - seg_size = cl_size - (upl_offset_in_object % cl_size); - upl_offset_aligned = upl_offset_in_object & ~(cl_size - 1); - page_index = 0; - page_max_index = upl->size / PAGE_SIZE; - found_dirty = TRUE; - - for (seg_index = 0, transfer_size = upl->size; transfer_size > 0; ) { - - unsigned int seg_pgcnt; - - seg_pgcnt = seg_size / PAGE_SIZE; - - if (minimal_clustering == TRUE) { - unsigned int non_dirty; - - non_dirty = 0; - found_dirty = FALSE; - - for (; non_dirty < seg_pgcnt; non_dirty++) { - if ((page_index + non_dirty) >= page_max_index) - break; - - if (UPL_DIRTY_PAGE(pl, page_index + non_dirty) || - UPL_PRECIOUS_PAGE(pl, page_index + non_dirty)) { - found_dirty = TRUE; - break; - } - } - } - if (found_dirty == TRUE) { - ps_offset[seg_index] = - ps_clmap(vs, - upl_offset_aligned, - &clmap, CL_ALLOC, - cl_size, 0); - - if (ps_offset[seg_index] == (dp_offset_t) -1) { - upl_abort(upl, 0); - upl_deallocate(upl); - - return KERN_FAILURE; - } - psp[seg_index] = CLMAP_PS(clmap); - } - if (transfer_size > seg_size) { - page_index += seg_pgcnt; - transfer_size -= seg_size; - upl_offset_aligned += cl_size; - seg_size = cl_size; - seg_index++; - } else - transfer_size = 0; - } - /* - * Ignore any non-present pages at the end of the - * UPL. 
- */ - for (page_index = upl->size / vm_page_size; page_index > 0;) { - if (UPL_PAGE_PRESENT(pl, --page_index)) { - page_index++; - break; - } - } - if (page_index == 0) { - /* - * no pages in the UPL - * abort and return - */ - upl_abort(upl, 0); - upl_deallocate(upl); - - return KERN_SUCCESS; - } - num_of_pages = page_index; - - base_index = (upl_offset_in_object % cl_size) / PAGE_SIZE; - - for (page_index = 0; page_index < num_of_pages; ) { - /* - * skip over non-dirty pages - */ - for ( ; page_index < num_of_pages; page_index++) { - if (UPL_DIRTY_PAGE(pl, page_index) - || UPL_PRECIOUS_PAGE(pl, page_index)) - /* - * this is a page we need to write - * go see if we can buddy it up with - * others that are contiguous to it - */ - break; - /* - * if the page is not-dirty, but present we - * need to commit it... This is an unusual - * case since we only asked for dirty pages - */ - if (UPL_PAGE_PRESENT(pl, page_index)) { - boolean_t empty = FALSE; - upl_commit_range(upl, - page_index * vm_page_size, - vm_page_size, - UPL_COMMIT_NOTIFY_EMPTY, - pl, - page_list_count, - &empty); - if (empty) { - assert(page_index == - num_of_pages - 1); - upl_deallocate(upl); - } - } - } - if (page_index == num_of_pages) - /* - * no more pages to look at, we're out of here - */ - break; - - /* - * gather up contiguous dirty pages... 
we have at - * least 1 * otherwise we would have bailed above - * make sure that each physical segment that we step - * into is contiguous to the one we're currently in - * if it's not, we have to stop and write what we have - */ - for (first_dirty = page_index; - page_index < num_of_pages; ) { - if ( !UPL_DIRTY_PAGE(pl, page_index) - && !UPL_PRECIOUS_PAGE(pl, page_index)) - break; - page_index++; - /* - * if we just looked at the last page in the UPL - * we don't need to check for physical segment - * continuity - */ - if (page_index < num_of_pages) { - int cur_seg; - int nxt_seg; - - cur_seg = (base_index + (page_index - 1))/pages_in_cl; - nxt_seg = (base_index + page_index)/pages_in_cl; - - if (cur_seg != nxt_seg) { - if ((ps_offset[cur_seg] != (ps_offset[nxt_seg] - cl_size)) || (psp[cur_seg] != psp[nxt_seg])) - /* - * if the segment we're about - * to step into is not - * contiguous to the one we're - * currently in, or it's in a - * different paging file.... - * we stop here and generate - * the I/O - */ - break; - } - } - } - num_dirty = page_index - first_dirty; - - if (num_dirty) { - upl_offset = first_dirty * vm_page_size; - transfer_size = num_dirty * vm_page_size; - - while (transfer_size) { - - if ((seg_size = cl_size - - ((upl_offset_in_object + - upl_offset) % cl_size)) - > transfer_size) - seg_size = transfer_size; - - ps_vs_write_complete( - vs, - (upl_offset_in_object + - upl_offset), - seg_size, error); - - transfer_size -= seg_size; - upl_offset += seg_size; - } - upl_offset = first_dirty * vm_page_size; - transfer_size = num_dirty * vm_page_size; - - seg_index = (base_index + first_dirty) / pages_in_cl; - seg_offset = (upl_offset_in_object + upl_offset) % cl_size; - - error = ps_write_file(psp[seg_index], - upl, upl_offset, - ps_offset[seg_index] - + seg_offset, - transfer_size, flags); - } - } - - } else { - assert(cnt <= (unsigned) (vm_page_size << vs->vs_clshift)); - list_size = cnt; - - page_index = 0; - /* The caller provides a mapped_data 
which is derived */ - /* from a temporary object. The targeted pages are */ - /* guaranteed to be set at offset 0 in the mapped_data */ - /* The actual offset however must still be derived */ - /* from the offset in the vs in question */ - mobj_base_addr = offset; - mobj_target_addr = mobj_base_addr; - - for (transfer_size = list_size; transfer_size != 0;) { - actual_offset = ps_clmap(vs, mobj_target_addr, - &clmap, CL_ALLOC, - transfer_size < cl_size ? - transfer_size : cl_size, 0); - if(actual_offset == (dp_offset_t) -1) { - error = 1; - break; - } - cnt = MIN(transfer_size, - (unsigned) CLMAP_NPGS(clmap) * vm_page_size); - ps = CLMAP_PS(clmap); - /* Assume that the caller has given us contiguous */ - /* pages */ - if(cnt) { - ps_vs_write_complete(vs, mobj_target_addr, - cnt, error); - error = ps_write_file(ps, internal_upl, - 0, actual_offset, - cnt, flags); - if (error) - break; - } - if (error) - break; - actual_offset += cnt; - mobj_target_addr += cnt; - transfer_size -= cnt; - cnt = 0; - - if (error) - break; - } - } - if(error) - return KERN_FAILURE; - else - return KERN_SUCCESS; -} - -vm_size_t -ps_vstruct_allocated_size( - vstruct_t vs) -{ - int num_pages; - struct vs_map *vsmap; - unsigned int i, j, k; - - num_pages = 0; - if (vs->vs_indirect) { - /* loop on indirect maps */ - for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) { - vsmap = vs->vs_imap[i]; - if (vsmap == NULL) - continue; - /* loop on clusters in this indirect map */ - for (j = 0; j < CLMAP_ENTRIES; j++) { - if (VSM_ISCLR(vsmap[j]) || - VSM_ISERR(vsmap[j])) - continue; - /* loop on pages in this cluster */ - for (k = 0; k < VSCLSIZE(vs); k++) { - if ((VSM_BMAP(vsmap[j])) & (1 << k)) - num_pages++; - } - } - } - } else { - vsmap = vs->vs_dmap; - if (vsmap == NULL) - return 0; - /* loop on clusters in the direct map */ - for (j = 0; j < CLMAP_ENTRIES; j++) { - if (VSM_ISCLR(vsmap[j]) || - VSM_ISERR(vsmap[j])) - continue; - /* loop on pages in this cluster */ - for (k = 0; k < 
VSCLSIZE(vs); k++) { - if ((VSM_BMAP(vsmap[j])) & (1 << k)) - num_pages++; - } - } - } - - return ptoa_32(num_pages); -} - -unsigned int -ps_vstruct_allocated_pages( - vstruct_t vs, - default_pager_page_t *pages, - unsigned int pages_size) -{ - unsigned int num_pages; - struct vs_map *vsmap; - dp_offset_t offset; - unsigned int i, j, k; - - num_pages = 0; - offset = 0; - if (vs->vs_indirect) { - /* loop on indirect maps */ - for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) { - vsmap = vs->vs_imap[i]; - if (vsmap == NULL) { - offset += (vm_page_size * CLMAP_ENTRIES * - VSCLSIZE(vs)); - continue; - } - /* loop on clusters in this indirect map */ - for (j = 0; j < CLMAP_ENTRIES; j++) { - if (VSM_ISCLR(vsmap[j]) || - VSM_ISERR(vsmap[j])) { - offset += vm_page_size * VSCLSIZE(vs); - continue; - } - /* loop on pages in this cluster */ - for (k = 0; k < VSCLSIZE(vs); k++) { - if ((VSM_BMAP(vsmap[j])) & (1 << k)) { - num_pages++; - if (num_pages < pages_size) - pages++->dpp_offset = - offset; - } - offset += vm_page_size; - } - } - } - } else { - vsmap = vs->vs_dmap; - if (vsmap == NULL) - return 0; - /* loop on clusters in the direct map */ - for (j = 0; j < CLMAP_ENTRIES; j++) { - if (VSM_ISCLR(vsmap[j]) || - VSM_ISERR(vsmap[j])) { - offset += vm_page_size * VSCLSIZE(vs); - continue; - } - /* loop on pages in this cluster */ - for (k = 0; k < VSCLSIZE(vs); k++) { - if ((VSM_BMAP(vsmap[j])) & (1 << k)) { - num_pages++; - if (num_pages < pages_size) - pages++->dpp_offset = offset; - } - offset += vm_page_size; - } - } - } - - return num_pages; -} - - -kern_return_t -ps_vstruct_transfer_from_segment( - vstruct_t vs, - paging_segment_t segment, - upl_t upl) -{ - struct vs_map *vsmap; -// struct vs_map old_vsmap; -// struct vs_map new_vsmap; - unsigned int i, j; - - VS_LOCK(vs); /* block all work on this vstruct */ - /* can't allow the normal multiple write */ - /* semantic because writes may conflict */ - vs->vs_xfer_pending = TRUE; - vs_wait_for_sync_writers(vs); - 
vs_start_write(vs); - vs_wait_for_readers(vs); - /* we will unlock the vs to allow other writes while transferring */ - /* and will be guaranteed of the persistance of the vs struct */ - /* because the caller of ps_vstruct_transfer_from_segment bumped */ - /* vs_async_pending */ - /* OK we now have guaranteed no other parties are accessing this */ - /* vs. Now that we are also supporting simple lock versions of */ - /* vs_lock we cannot hold onto VS_LOCK as we may block below. */ - /* our purpose in holding it before was the multiple write case */ - /* we now use the boolean xfer_pending to do that. We can use */ - /* a boolean instead of a count because we have guaranteed single */ - /* file access to this code in its caller */ - VS_UNLOCK(vs); -vs_changed: - if (vs->vs_indirect) { - unsigned int vsmap_size; - int clmap_off; - /* loop on indirect maps */ - for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) { - vsmap = vs->vs_imap[i]; - if (vsmap == NULL) - continue; - /* loop on clusters in this indirect map */ - clmap_off = (vm_page_size * CLMAP_ENTRIES * - VSCLSIZE(vs) * i); - if(i+1 == INDIRECT_CLMAP_ENTRIES(vs->vs_size)) - vsmap_size = vs->vs_size - (CLMAP_ENTRIES * i); - else - vsmap_size = CLMAP_ENTRIES; - for (j = 0; j < vsmap_size; j++) { - if (VSM_ISCLR(vsmap[j]) || - VSM_ISERR(vsmap[j]) || - (VSM_PS(vsmap[j]) != segment)) - continue; - if(vs_cluster_transfer(vs, - (vm_page_size * (j << vs->vs_clshift)) - + clmap_off, - vm_page_size << vs->vs_clshift, - upl) - != KERN_SUCCESS) { - VS_LOCK(vs); - vs->vs_xfer_pending = FALSE; - VS_UNLOCK(vs); - vs_finish_write(vs); - return KERN_FAILURE; - } - /* allow other readers/writers during transfer*/ - VS_LOCK(vs); - vs->vs_xfer_pending = FALSE; - VS_UNLOCK(vs); - vs_finish_write(vs); - - if (backing_store_abort_compaction || backing_store_stop_compaction) { - backing_store_abort_compaction = FALSE; - dprintf(("ps_vstruct_transfer_from_segment - ABORTED\n")); - return KERN_FAILURE; - } - 
vnode_pager_throttle(); - - VS_LOCK(vs); - vs->vs_xfer_pending = TRUE; - vs_wait_for_sync_writers(vs); - vs_start_write(vs); - vs_wait_for_readers(vs); - VS_UNLOCK(vs); - if (!(vs->vs_indirect)) { - goto vs_changed; - } - } - } - } else { - vsmap = vs->vs_dmap; - if (vsmap == NULL) { - VS_LOCK(vs); - vs->vs_xfer_pending = FALSE; - VS_UNLOCK(vs); - vs_finish_write(vs); - return KERN_SUCCESS; - } - /* loop on clusters in the direct map */ - for (j = 0; j < vs->vs_size; j++) { - if (VSM_ISCLR(vsmap[j]) || - VSM_ISERR(vsmap[j]) || - (VSM_PS(vsmap[j]) != segment)) - continue; - if(vs_cluster_transfer(vs, - vm_page_size * (j << vs->vs_clshift), - vm_page_size << vs->vs_clshift, - upl) != KERN_SUCCESS) { - VS_LOCK(vs); - vs->vs_xfer_pending = FALSE; - VS_UNLOCK(vs); - vs_finish_write(vs); - return KERN_FAILURE; - } - /* allow other readers/writers during transfer*/ - VS_LOCK(vs); - vs->vs_xfer_pending = FALSE; - VS_UNLOCK(vs); - vs_finish_write(vs); - VS_LOCK(vs); - vs->vs_xfer_pending = TRUE; - vs_wait_for_sync_writers(vs); - vs_start_write(vs); - vs_wait_for_readers(vs); - VS_UNLOCK(vs); - if (vs->vs_indirect) { - goto vs_changed; - } - } - } - - VS_LOCK(vs); - vs->vs_xfer_pending = FALSE; - VS_UNLOCK(vs); - vs_finish_write(vs); - return KERN_SUCCESS; -} - - - -vs_map_t -vs_get_map_entry( - vstruct_t vs, - dp_offset_t offset) -{ - struct vs_map *vsmap; - dp_offset_t cluster; - - cluster = atop_32(offset) >> vs->vs_clshift; - if (vs->vs_indirect) { - long ind_block = cluster/CLMAP_ENTRIES; - - /* Is the indirect block allocated? 
*/ - vsmap = vs->vs_imap[ind_block]; - if(vsmap == (vs_map_t) NULL) - return vsmap; - } else - vsmap = vs->vs_dmap; - vsmap += cluster%CLMAP_ENTRIES; - return vsmap; -} - -kern_return_t -vs_cluster_transfer( - vstruct_t vs, - dp_offset_t offset, - dp_size_t cnt, - upl_t upl) -{ - dp_offset_t actual_offset; - paging_segment_t ps; - struct clmap clmap; - kern_return_t error = KERN_SUCCESS; - unsigned int size, size_wanted; - int i; - unsigned int residual = 0; - unsigned int unavail_size; -// default_pager_thread_t *dpt; -// boolean_t dealloc; - struct vs_map *vsmap_ptr = NULL; - struct vs_map read_vsmap; - struct vs_map original_read_vsmap; - struct vs_map write_vsmap; -// upl_t sync_upl; -// vm_offset_t ioaddr; - - /* vs_cluster_transfer reads in the pages of a cluster and - * then writes these pages back to new backing store. The - * segment the pages are being read from is assumed to have - * been taken off-line and is no longer considered for new - * space requests. - */ - - /* - * This loop will be executed once per cluster referenced. - * Typically this means once, since it's unlikely that the - * VM system will ask for anything spanning cluster boundaries. - * - * If there are holes in a cluster (in a paging segment), we stop - * reading at the hole, then loop again, hoping to - * find valid pages later in the cluster. This continues until - * the entire range has been examined, and read, if present. The - * pages are written as they are read. If a failure occurs after - * some pages are written the unmap call at the bottom of the loop - * recovers the backing store and the old backing store remains - * in effect. 
- */ - - VSM_CLR(write_vsmap); - VSM_CLR(original_read_vsmap); - /* grab the actual object's pages to sync with I/O */ - while (cnt && (error == KERN_SUCCESS)) { - vsmap_ptr = vs_get_map_entry(vs, offset); - actual_offset = ps_clmap(vs, offset, &clmap, CL_FIND, 0, 0); - - if (actual_offset == (dp_offset_t) -1) { - - /* - * Nothing left to write in this cluster at least - * set write cluster information for any previous - * write, clear for next cluster, if there is one - */ - unsigned int local_size, clmask, clsize; - - clsize = vm_page_size << vs->vs_clshift; - clmask = clsize - 1; - local_size = clsize - (offset & clmask); - ASSERT(local_size); - local_size = MIN(local_size, cnt); - - /* This cluster has no data in it beyond what may */ - /* have been found on a previous iteration through */ - /* the loop "write_vsmap" */ - *vsmap_ptr = write_vsmap; - VSM_CLR(write_vsmap); - VSM_CLR(original_read_vsmap); - - cnt -= local_size; - offset += local_size; - continue; - } - - /* - * Count up contiguous available or unavailable - * pages. 
- */ - ps = CLMAP_PS(clmap); - ASSERT(ps); - size = 0; - unavail_size = 0; - for (i = 0; - (size < cnt) && (unavail_size < cnt) && - (i < CLMAP_NPGS(clmap)); i++) { - if (CLMAP_ISSET(clmap, i)) { - if (unavail_size != 0) - break; - size += vm_page_size; - BS_STAT(ps->ps_bs, - ps->ps_bs->bs_pages_in++); - } else { - if (size != 0) - break; - unavail_size += vm_page_size; - } - } - - if (size == 0) { - ASSERT(unavail_size); - ps_clunmap(vs, offset, unavail_size); - cnt -= unavail_size; - offset += unavail_size; - if((offset & ((vm_page_size << vs->vs_clshift) - 1)) - == 0) { - /* There is no more to transfer in this - cluster - */ - *vsmap_ptr = write_vsmap; - VSM_CLR(write_vsmap); - VSM_CLR(original_read_vsmap); - } - continue; - } - - if(VSM_ISCLR(original_read_vsmap)) - original_read_vsmap = *vsmap_ptr; - - if(ps->ps_segtype == PS_PARTITION) { - panic("swap partition not supported\n"); - /*NOTREACHED*/ - error = KERN_FAILURE; - residual = size; -/* - NEED TO ISSUE WITH SYNC & NO COMMIT - error = ps_read_device(ps, actual_offset, &buffer, - size, &residual, flags); -*/ - } else { - /* NEED TO ISSUE WITH SYNC & NO COMMIT */ - error = ps_read_file(ps, upl, (upl_offset_t) 0, actual_offset, - size, &residual, - (UPL_IOSYNC | UPL_NOCOMMIT | (dp_encryption ? UPL_PAGING_ENCRYPTED : 0))); - } - - read_vsmap = *vsmap_ptr; - - - /* - * Adjust counts and put data in new BS. Optimize for the - * common case, i.e. no error and/or partial data. - * If there was an error, then we need to error the entire - * range, even if some data was successfully read. - * - */ - if ((error == KERN_SUCCESS) && (residual == 0)) { - - /* - * Got everything we asked for, supply the data to - * the new BS. Note that as a side effect of supplying - * the data, the buffer holding the supplied data is - * deallocated from the pager's address space unless - * the write is unsuccessful. 
- */ - - /* note buffer will be cleaned up in all cases by */ - /* internal_cluster_write or if an error on write */ - /* the vm_map_copy_page_discard call */ - *vsmap_ptr = write_vsmap; - - if(vs_cluster_write(vs, upl, offset, - size, TRUE, UPL_IOSYNC | UPL_NOCOMMIT ) != KERN_SUCCESS) { - error = KERN_FAILURE; - if(!(VSM_ISCLR(*vsmap_ptr))) { - /* unmap the new backing store object */ - ps_clunmap(vs, offset, size); - } - /* original vsmap */ - *vsmap_ptr = original_read_vsmap; - VSM_CLR(write_vsmap); - } else { - if((offset + size) & - ((vm_page_size << vs->vs_clshift) - - 1)) { - /* There is more to transfer in this - cluster - */ - write_vsmap = *vsmap_ptr; - *vsmap_ptr = read_vsmap; - ps_clunmap(vs, offset, size); - } else { - /* discard the old backing object */ - write_vsmap = *vsmap_ptr; - *vsmap_ptr = read_vsmap; - ps_clunmap(vs, offset, size); - *vsmap_ptr = write_vsmap; - VSM_CLR(write_vsmap); - VSM_CLR(original_read_vsmap); - } - } - } else { - size_wanted = size; - if (error == KERN_SUCCESS) { - if (residual == size) { - /* - * If a read operation returns no error - * and no data moved, we turn it into - * an error, assuming we're reading at - * or beyond EOF. - * Fall through and error the entire - * range. - */ - error = KERN_FAILURE; - *vsmap_ptr = write_vsmap; - if(!(VSM_ISCLR(*vsmap_ptr))) { - /* unmap the new backing store object */ - ps_clunmap(vs, offset, size); - } - *vsmap_ptr = original_read_vsmap; - VSM_CLR(write_vsmap); - continue; - } else { - /* - * Otherwise, we have partial read. 
- * This is also considered an error - * for the purposes of cluster transfer - */ - error = KERN_FAILURE; - *vsmap_ptr = write_vsmap; - if(!(VSM_ISCLR(*vsmap_ptr))) { - /* unmap the new backing store object */ - ps_clunmap(vs, offset, size); - } - *vsmap_ptr = original_read_vsmap; - VSM_CLR(write_vsmap); - continue; - } - } - - } - cnt -= size; - offset += size; - - } /* END while (cnt && (error == 0)) */ - if(!VSM_ISCLR(write_vsmap)) - *vsmap_ptr = write_vsmap; - - return error; -} - -kern_return_t -default_pager_add_file( - MACH_PORT_FACE backing_store, - vnode_ptr_t vp, - int record_size, - vm_size_t size) -{ - backing_store_t bs; - paging_segment_t ps; - int i; - unsigned int j; - int error; - - if ((bs = backing_store_lookup(backing_store)) - == BACKING_STORE_NULL) - return KERN_INVALID_ARGUMENT; - - PSL_LOCK(); - for (i = 0; i <= paging_segment_max; i++) { - ps = paging_segments[i]; - if (ps == PAGING_SEGMENT_NULL) - continue; - if (ps->ps_segtype != PS_FILE) - continue; - - /* - * Check for overlap on same device. 
- */ - if (ps->ps_vnode == (struct vnode *)vp) { - PSL_UNLOCK(); - BS_UNLOCK(bs); - return KERN_INVALID_ARGUMENT; - } - } - PSL_UNLOCK(); - - /* - * Set up the paging segment - */ - ps = (paging_segment_t) kalloc(sizeof (struct paging_segment)); - if (ps == PAGING_SEGMENT_NULL) { - BS_UNLOCK(bs); - return KERN_RESOURCE_SHORTAGE; - } - - ps->ps_segtype = PS_FILE; - ps->ps_vnode = (struct vnode *)vp; - ps->ps_offset = 0; - ps->ps_record_shift = local_log2(vm_page_size / record_size); - assert((dp_size_t) size == size); - ps->ps_recnum = (dp_size_t) size; - ps->ps_pgnum = ((dp_size_t) size) >> ps->ps_record_shift; - - ps->ps_pgcount = ps->ps_pgnum; - ps->ps_clshift = local_log2(bs->bs_clsize); - ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift; - ps->ps_special_clusters = 0; - ps->ps_hint = 0; - - PS_LOCK_INIT(ps); - ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls)); - if (!ps->ps_bmap) { - PS_LOCK_DESTROY(ps); - kfree(ps, sizeof *ps); - BS_UNLOCK(bs); - return KERN_RESOURCE_SHORTAGE; - } - for (j = 0; j < ps->ps_ncls; j++) { - clrbit(ps->ps_bmap, j); - } - - if(paging_segment_count == 0) { - ps->ps_state = PS_EMERGENCY_SEGMENT; - if(use_emergency_swap_file_first) { - ps->ps_state |= PS_CAN_USE; - } - emergency_segment_backing_store = backing_store; - } else { - ps->ps_state = PS_CAN_USE; - } - - ps->ps_bs = bs; - - if ((error = ps_enter(ps)) != 0) { - kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls)); - PS_LOCK_DESTROY(ps); - kfree(ps, sizeof *ps); - BS_UNLOCK(bs); - return KERN_RESOURCE_SHORTAGE; - } - - bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift; - bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift; - PSL_LOCK(); - if(IS_PS_OK_TO_USE(ps)) { - dp_pages_free += ps->ps_pgcount; - } else { - dp_pages_reserve += ps->ps_pgcount; - } - PSL_UNLOCK(); - - BS_UNLOCK(bs); - - bs_more_space(ps->ps_clcount); - - /* - * If the paging segment being activated is not the emergency - * segment and we notice that the emergency segment is being - * 
used then we help recover it. If all goes well, the - * emergency segment will be back to its original state of - * online but not activated (till it's needed the next time). - */ -#if CONFIG_FREEZE - if (!memorystatus_freeze_enabled) -#endif - { - ps = paging_segments[EMERGENCY_PSEG_INDEX]; - if(IS_PS_EMERGENCY_SEGMENT(ps) && IS_PS_OK_TO_USE(ps)) { - if(default_pager_backing_store_delete(emergency_segment_backing_store)) { - dprintf(("Failed to recover emergency paging segment\n")); - } else { - dprintf(("Recovered emergency paging segment\n")); - } - } - } - - DP_DEBUG(DEBUG_BS_INTERNAL, - ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n", - device, offset, (dp_size_t) size, record_size, - ps->ps_record_shift, ps->ps_pgnum)); - - return KERN_SUCCESS; -} - - - -kern_return_t -ps_read_file( - paging_segment_t ps, - upl_t upl, - upl_offset_t upl_offset, - dp_offset_t offset, - upl_size_t size, - unsigned int *residualp, - int flags) -{ - vm_object_offset_t f_offset; - int error = 0; - int result; - - assert(dp_encryption_inited); - - clustered_reads[atop_32(size)]++; - - f_offset = (vm_object_offset_t)(ps->ps_offset + offset); - - /* - * for transfer case we need to pass uploffset and flags - */ - assert((upl_size_t) size == size); - error = vnode_pagein(ps->ps_vnode, upl, upl_offset, f_offset, (upl_size_t)size, flags, NULL); - - /* The vnode_pagein semantic is somewhat at odds with the existing */ - /* device_read semantic. Partial reads are not experienced at this */ - /* level. It is up to the bit map code and cluster read code to */ - /* check that requested data locations are actually backed, and the */ - /* pagein code to either read all of the requested data or return an */ - /* error. 
*/ - - if (error) - result = KERN_FAILURE; - else { - *residualp = 0; - result = KERN_SUCCESS; - } - return result; -} - -kern_return_t -ps_write_file( - paging_segment_t ps, - upl_t upl, - upl_offset_t upl_offset, - dp_offset_t offset, - unsigned int size, - int flags) -{ - vm_object_offset_t f_offset; - kern_return_t result; - - assert(dp_encryption_inited); - - clustered_writes[atop_32(size)]++; - f_offset = (vm_object_offset_t)(ps->ps_offset + offset); - - if (flags & UPL_PAGING_ENCRYPTED) { - /* - * ENCRYPTED SWAP: - * encrypt all the pages that we're going - * to pageout. - */ - upl_encrypt(upl, upl_offset, size); - } - assert((upl_size_t) size == size); - if (vnode_pageout(ps->ps_vnode, upl, upl_offset, f_offset, (upl_size_t)size, flags, NULL)) - result = KERN_FAILURE; - else - result = KERN_SUCCESS; - - return result; -} - -static inline void ps_vnode_trim_init(struct ps_vnode_trim_data *data) -{ -#pragma unused(data) -} - -static inline void ps_vnode_trim_now(struct ps_vnode_trim_data *data) -{ -#pragma unused(data) -} - -static inline void ps_vnode_trim_more(struct ps_vnode_trim_data *data, struct vs_map *map, unsigned int shift, dp_size_t length) -{ -#pragma unused(data, map, shift, length) -} - -kern_return_t -default_pager_triggers( __unused MACH_PORT_FACE default_pager, - int hi_wat, - int lo_wat, - int flags, - MACH_PORT_FACE trigger_port) -{ - MACH_PORT_FACE release = IPC_PORT_NULL; - kern_return_t kr; - clock_sec_t now; - clock_nsec_t nanoseconds_dummy; - static clock_sec_t error_notify = 0; - - PSL_LOCK(); - if (flags == SWAP_ENCRYPT_ON) { - /* ENCRYPTED SWAP: turn encryption on */ - release = trigger_port; - if (!dp_encryption_inited) { - dp_encryption_inited = TRUE; - dp_encryption = TRUE; - kr = KERN_SUCCESS; - } else { - kr = KERN_FAILURE; - } - } else if (flags == SWAP_ENCRYPT_OFF) { - /* ENCRYPTED SWAP: turn encryption off */ - release = trigger_port; - if (!dp_encryption_inited) { - dp_encryption_inited = TRUE; - dp_encryption = FALSE; - kr 
= KERN_SUCCESS; - } else { - kr = KERN_FAILURE; - } - } else if (flags == HI_WAT_ALERT) { - release = min_pages_trigger_port; -#if CONFIG_FREEZE - /* High and low water signals aren't applicable when freeze is */ - /* enabled, so release the trigger ports here and return */ - /* KERN_FAILURE. */ - if (memorystatus_freeze_enabled) { - if (IP_VALID( trigger_port )){ - ipc_port_release_send( trigger_port ); - } - min_pages_trigger_port = IPC_PORT_NULL; - kr = KERN_FAILURE; - } - else -#endif - { - min_pages_trigger_port = trigger_port; - minimum_pages_remaining = hi_wat/vm_page_size; - bs_low = FALSE; - kr = KERN_SUCCESS; - } - } else if (flags == LO_WAT_ALERT) { - release = max_pages_trigger_port; -#if CONFIG_FREEZE - if (memorystatus_freeze_enabled) { - if (IP_VALID( trigger_port )){ - ipc_port_release_send( trigger_port ); - } - max_pages_trigger_port = IPC_PORT_NULL; - kr = KERN_FAILURE; - } - else -#endif - { - max_pages_trigger_port = trigger_port; - maximum_pages_free = lo_wat/vm_page_size; - kr = KERN_SUCCESS; - } - } else if (flags == USE_EMERGENCY_SWAP_FILE_FIRST) { - use_emergency_swap_file_first = TRUE; - release = trigger_port; - kr = KERN_SUCCESS; - } else if (flags == SWAP_FILE_CREATION_ERROR) { - release = trigger_port; - kr = KERN_SUCCESS; - if( paging_segment_count == 1) { - use_emergency_swap_file_first = TRUE; - } - no_paging_space_action(); - clock_get_system_nanotime(&now, &nanoseconds_dummy); - if (now > error_notify + 5) { - dprintf(("Swap File Error.\n")); - error_notify = now; - } - } else { - release = trigger_port; - kr = KERN_INVALID_ARGUMENT; - } - PSL_UNLOCK(); - - if (IP_VALID(release)) - ipc_port_release_send(release); - - return kr; -} - -/* - * Monitor the amount of available backing store vs. the amount of - * required backing store, notify a listener (if present) when - * backing store may safely be removed. 
- * - * We attempt to avoid the situation where backing store is - * discarded en masse, as this can lead to thrashing as the - * backing store is compacted. - */ - -#define PF_INTERVAL 3 /* time between free level checks */ -#define PF_LATENCY 10 /* number of intervals before release */ - -static int dp_pages_free_low_count = 0; -thread_call_t default_pager_backing_store_monitor_callout; - -void -default_pager_backing_store_monitor(__unused thread_call_param_t p1, - __unused thread_call_param_t p2) -{ -// unsigned long long average; - ipc_port_t trigger; - uint64_t deadline; - - /* - * We determine whether it will be safe to release some - * backing store by watching the free page level. If - * it remains below the maximum_pages_free threshold for - * at least PF_LATENCY checks (taken at PF_INTERVAL seconds) - * then we deem it safe. - * - * Note that this establishes a maximum rate at which backing - * store will be released, as each notification (currently) - * only results in a single backing store object being - * released. 
- */ - if (dp_pages_free > maximum_pages_free) { - dp_pages_free_low_count++; - } else { - dp_pages_free_low_count = 0; - } - - /* decide whether to send notification */ - trigger = IP_NULL; - if (max_pages_trigger_port && - (backing_store_release_trigger_disable == 0) && - (dp_pages_free_low_count > PF_LATENCY)) { - trigger = max_pages_trigger_port; - max_pages_trigger_port = NULL; - } - - /* send notification */ - if (trigger != IP_NULL) { - VSL_LOCK(); - if(backing_store_release_trigger_disable != 0) { - assert_wait((event_t) - &backing_store_release_trigger_disable, - THREAD_UNINT); - VSL_UNLOCK(); - thread_block(THREAD_CONTINUE_NULL); - } else { - VSL_UNLOCK(); - } - dprintf(("default_pager_backing_store_monitor - send LO_WAT_ALERT\n")); - - default_pager_space_alert(trigger, LO_WAT_ALERT); - ipc_port_release_send(trigger); - dp_pages_free_low_count = 0; - } - - clock_interval_to_deadline(PF_INTERVAL, NSEC_PER_SEC, &deadline); - thread_call_enter_delayed(default_pager_backing_store_monitor_callout, deadline); -} - -#if CONFIG_FREEZE -unsigned int default_pager_swap_pages_free() { - return dp_pages_free; -} -#endif diff --git a/osfmk/default_pager/dp_memory_object.c b/osfmk/default_pager/dp_memory_object.c deleted file mode 100644 index 0c44bb801..000000000 --- a/osfmk/default_pager/dp_memory_object.c +++ /dev/null @@ -1,1193 +0,0 @@ -/* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * Mach Operating System - * Copyright (c) 1991,1990,1989 Carnegie Mellon University - * All Rights Reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 
- * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ - -/* - * Default Pager. - * Memory Object Management. - */ - -#include "default_pager_internal.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* forward declaration */ -vstruct_t vs_object_create(dp_size_t size); - -/* - * List of all vstructs. A specific vstruct is - * found directly via its port, this list is - * only used for monitoring purposes by the - * default_pager_object* calls and by ps_delete - * when abstract memory objects must be scanned - * to remove any live storage on a segment which - * is to be removed. - */ -struct vstruct_list_head vstruct_list; - -__private_extern__ void -vstruct_list_insert( - vstruct_t vs) -{ - VSL_LOCK(); - queue_enter(&vstruct_list.vsl_queue, vs, vstruct_t, vs_links); - vstruct_list.vsl_count++; - VSL_UNLOCK(); -} - - -__private_extern__ void -vstruct_list_delete( - vstruct_t vs) -{ - queue_remove(&vstruct_list.vsl_queue, vs, vstruct_t, vs_links); - vstruct_list.vsl_count--; -} - -/* - * We use the sequence numbers on requests to regulate - * our parallelism. In general, we allow multiple reads and writes - * to proceed in parallel, with the exception that reads must - * wait for previous writes to finish. (Because the kernel might - * generate a data-request for a page on the heels of a data-write - * for the same page, and we must avoid returning stale data.) - * terminate requests wait for proceeding reads and writes to finish. 
- */ - -static unsigned int default_pager_total = 0; /* debugging */ -static unsigned int default_pager_wait_seqno = 0; /* debugging */ -static unsigned int default_pager_wait_read = 0; /* debugging */ -static unsigned int default_pager_wait_write = 0; /* debugging */ - -__private_extern__ void -vs_async_wait( - vstruct_t vs) -{ - - ASSERT(vs->vs_async_pending >= 0); - while (vs->vs_async_pending > 0) { - vs->vs_waiting_async = TRUE; - assert_wait(&vs->vs_async_pending, THREAD_UNINT); - VS_UNLOCK(vs); - thread_block(THREAD_CONTINUE_NULL); - VS_LOCK(vs); - } - ASSERT(vs->vs_async_pending == 0); -} - - -#if PARALLEL -/* - * Waits for correct sequence number. Leaves pager locked. - * - * JMM - Sequence numbers guarantee ordering of requests generated - * by a single thread if the receiver is multithreaded and - * the interfaces are asynchronous (i.e. sender can generate - * more than one request before the first is received in the - * pager). Normally, IPC would generate these number in that - * case. But we are trying to avoid using IPC for the in-kernel - * scenario. Since these are actually invoked synchronously - * anyway (in-kernel), we can just fake the sequence number - * generation here (thus avoiding the dependence on IPC). - */ -__private_extern__ void -vs_lock( - vstruct_t vs) -{ - mach_port_seqno_t seqno; - - default_pager_total++; - VS_LOCK(vs); - - seqno = vs->vs_next_seqno++; - - while (vs->vs_seqno != seqno) { - default_pager_wait_seqno++; - vs->vs_waiting_seqno = TRUE; - assert_wait(&vs->vs_seqno, THREAD_UNINT); - VS_UNLOCK(vs); - thread_block(THREAD_CONTINUE_NULL); - VS_LOCK(vs); - } -} - -/* - * Increments sequence number and unlocks pager. - */ -__private_extern__ void -vs_unlock(vstruct_t vs) -{ - vs->vs_seqno++; - if (vs->vs_waiting_seqno) { - vs->vs_waiting_seqno = FALSE; - VS_UNLOCK(vs); - thread_wakeup(&vs->vs_seqno); - return; - } - VS_UNLOCK(vs); -} - -/* - * Start a read - one more reader. Pager must be locked. 
- */ -__private_extern__ void -vs_start_read( - vstruct_t vs) -{ - vs->vs_readers++; -} - -/* - * Wait for readers. Unlocks and relocks pager if wait needed. - */ -__private_extern__ void -vs_wait_for_readers( - vstruct_t vs) -{ - while (vs->vs_readers != 0) { - default_pager_wait_read++; - vs->vs_waiting_read = TRUE; - assert_wait(&vs->vs_readers, THREAD_UNINT); - VS_UNLOCK(vs); - thread_block(THREAD_CONTINUE_NULL); - VS_LOCK(vs); - } -} - -/* - * Finish a read. Pager is unlocked and returns unlocked. - */ -__private_extern__ void -vs_finish_read( - vstruct_t vs) -{ - VS_LOCK(vs); - if (--vs->vs_readers == 0 && vs->vs_waiting_read) { - vs->vs_waiting_read = FALSE; - VS_UNLOCK(vs); - thread_wakeup(&vs->vs_readers); - return; - } - VS_UNLOCK(vs); -} - -/* - * Start a write - one more writer. Pager must be locked. - */ -__private_extern__ void -vs_start_write( - vstruct_t vs) -{ - vs->vs_writers++; -} - -/* - * Wait for writers. Unlocks and relocks pager if wait needed. - */ -__private_extern__ void -vs_wait_for_writers( - vstruct_t vs) -{ - while (vs->vs_writers != 0) { - default_pager_wait_write++; - vs->vs_waiting_write = TRUE; - assert_wait(&vs->vs_writers, THREAD_UNINT); - VS_UNLOCK(vs); - thread_block(THREAD_CONTINUE_NULL); - VS_LOCK(vs); - } - vs_async_wait(vs); -} - -/* This is to be used for the transfer from segment code ONLY */ -/* The transfer code holds off vs destruction by keeping the */ -/* vs_async_wait count non-zero. It will not ocnflict with */ -/* other writers on an async basis because it only writes on */ -/* a cluster basis into fresh (as of sync time) cluster locations */ - -__private_extern__ void -vs_wait_for_sync_writers( - vstruct_t vs) -{ - while (vs->vs_writers != 0) { - default_pager_wait_write++; - vs->vs_waiting_write = TRUE; - assert_wait(&vs->vs_writers, THREAD_UNINT); - VS_UNLOCK(vs); - thread_block(THREAD_CONTINUE_NULL); - VS_LOCK(vs); - } -} - - -/* - * Finish a write. Pager is unlocked and returns unlocked. 
- */ -__private_extern__ void -vs_finish_write( - vstruct_t vs) -{ - VS_LOCK(vs); - if (--vs->vs_writers == 0 && vs->vs_waiting_write) { - vs->vs_waiting_write = FALSE; - VS_UNLOCK(vs); - thread_wakeup(&vs->vs_writers); - return; - } - VS_UNLOCK(vs); -} -#endif /* PARALLEL */ - -vstruct_t -vs_object_create( - dp_size_t size) -{ - vstruct_t vs; - - /* - * Allocate a vstruct. If there are any problems, then report them - * to the console. - */ - vs = ps_vstruct_create(size); - if (vs == VSTRUCT_NULL) { - dprintf(("vs_object_create: unable to allocate %s\n", - "-- either run swapon command or reboot")); - return VSTRUCT_NULL; - } - - return vs; -} - -#if 0 -void default_pager_add(vstruct_t, boolean_t); /* forward */ - -void -default_pager_add( - vstruct_t vs, - boolean_t internal) -{ - memory_object_t mem_obj = vs->vs_mem_obj; - mach_port_t pset; - mach_port_mscount_t sync; - mach_port_t previous; - kern_return_t kr; - static char here[] = "default_pager_add"; - - /* - * The port currently has a make-send count of zero, - * because either we just created the port or we just - * received the port in a memory_object_create request. 
- */ - - if (internal) { - /* possibly generate an immediate no-senders notification */ - sync = 0; - pset = default_pager_internal_set; - } else { - /* delay notification till send right is created */ - sync = 1; - pset = default_pager_external_set; - } - - ip_lock(mem_obj); /* unlocked in nsrequest below */ - ipc_port_make_sonce_locked(mem_obj); - ipc_port_nsrequest(mem_obj, sync, mem_obj, &previous); -} - -#endif - -const struct memory_object_pager_ops default_pager_ops = { - dp_memory_object_reference, - dp_memory_object_deallocate, - dp_memory_object_init, - dp_memory_object_terminate, - dp_memory_object_data_request, - dp_memory_object_data_return, - dp_memory_object_data_initialize, - dp_memory_object_data_unlock, - dp_memory_object_synchronize, - dp_memory_object_map, - dp_memory_object_last_unmap, - dp_memory_object_data_reclaim, - "default pager" -}; - -kern_return_t -dp_memory_object_init( - memory_object_t mem_obj, - memory_object_control_t control, - __unused memory_object_cluster_size_t pager_page_size) -{ - vstruct_t vs; - - assert(pager_page_size == vm_page_size); - - memory_object_control_reference(control); - - vs_lookup(mem_obj, vs); - vs_lock(vs); - - if (vs->vs_control != MEMORY_OBJECT_CONTROL_NULL) - Panic("bad request"); - - vs->vs_control = control; - vs_unlock(vs); - - return KERN_SUCCESS; -} - -kern_return_t -dp_memory_object_synchronize( - memory_object_t mem_obj, - memory_object_offset_t offset, - memory_object_size_t length, - __unused vm_sync_t flags) -{ - vstruct_t vs; - - vs_lookup(mem_obj, vs); - vs_lock(vs); - vs_unlock(vs); - - memory_object_synchronize_completed(vs->vs_control, offset, length); - - return KERN_SUCCESS; -} - -kern_return_t -dp_memory_object_map( - __unused memory_object_t mem_obj, - __unused vm_prot_t prot) -{ - panic("dp_memory_object_map"); - return KERN_FAILURE; -} - -kern_return_t -dp_memory_object_last_unmap( - __unused memory_object_t mem_obj) -{ - panic("dp_memory_object_last_unmap"); - return KERN_FAILURE; 
-} - -kern_return_t -dp_memory_object_data_reclaim( - memory_object_t mem_obj, - boolean_t reclaim_backing_store) -{ - vstruct_t vs; - kern_return_t retval; - - vs_lookup(mem_obj, vs); - for (;;) { - vs_lock(vs); - vs_async_wait(vs); - if (!vs->vs_xfer_pending) { - break; - } - } - vs->vs_xfer_pending = TRUE; - vs_unlock(vs); - - retval = ps_vstruct_reclaim(vs, TRUE, reclaim_backing_store); - - vs_lock(vs); - vs->vs_xfer_pending = FALSE; - vs_unlock(vs); - - return retval; -} - -kern_return_t -dp_memory_object_terminate( - memory_object_t mem_obj) -{ - memory_object_control_t control; - vstruct_t vs; - - /* - * control port is a receive right, not a send right. - */ - - vs_lookup(mem_obj, vs); - vs_lock(vs); - - /* - * Wait for read and write requests to terminate. - */ - - vs_wait_for_readers(vs); - vs_wait_for_writers(vs); - - /* - * After memory_object_terminate both memory_object_init - * and a no-senders notification are possible, so we need - * to clean up our reference to the memory_object_control - * to prepare for a new init. - */ - - control = vs->vs_control; - vs->vs_control = MEMORY_OBJECT_CONTROL_NULL; - - /* a bit of special case ugliness here. Wakeup any waiting reads */ - /* these data requests had to be removed from the seqno traffic */ - /* based on a performance bottleneck with large memory objects */ - /* the problem will right itself with the new component based */ - /* synchronous interface. The new async will be able to return */ - /* failure during its sync phase. In the mean time ... */ - - thread_wakeup(&vs->vs_writers); - thread_wakeup(&vs->vs_async_pending); - - vs_unlock(vs); - - /* - * Now we deallocate our reference on the control. 
- */ - memory_object_control_deallocate(control); - return KERN_SUCCESS; -} - -void -dp_memory_object_reference( - memory_object_t mem_obj) -{ - vstruct_t vs; - - vs_lookup_safe(mem_obj, vs); - if (vs == VSTRUCT_NULL) - return; - - VS_LOCK(vs); - assert(vs->vs_references > 0); - vs->vs_references++; - VS_UNLOCK(vs); -} - -void -dp_memory_object_deallocate( - memory_object_t mem_obj) -{ - vstruct_t vs; - mach_port_seqno_t seqno; - - /* - * Because we don't give out multiple first references - * for a memory object, there can't be a race - * between getting a deallocate call and creating - * a new reference for the object. - */ - - vs_lookup_safe(mem_obj, vs); - if (vs == VSTRUCT_NULL) - return; - - VS_LOCK(vs); - if (--vs->vs_references > 0) { - VS_UNLOCK(vs); - return; - } - - seqno = vs->vs_next_seqno++; - while (vs->vs_seqno != seqno) { - default_pager_wait_seqno++; - vs->vs_waiting_seqno = TRUE; - assert_wait(&vs->vs_seqno, THREAD_UNINT); - VS_UNLOCK(vs); - thread_block(THREAD_CONTINUE_NULL); - VS_LOCK(vs); - } - - vs_async_wait(vs); /* wait for pending async IO */ - - /* do not delete the vs structure until the referencing pointers */ - /* in the vstruct list have been expunged */ - - /* get VSL_LOCK out of order by using TRY mechanism */ - while(!VSL_LOCK_TRY()) { - VS_UNLOCK(vs); - VSL_LOCK(); - VSL_UNLOCK(); - VS_LOCK(vs); - vs_async_wait(vs); /* wait for pending async IO */ - } - - - /* - * We shouldn't get a deallocation call - * when the kernel has the object cached. - */ - if (vs->vs_control != MEMORY_OBJECT_CONTROL_NULL) - Panic("bad request"); - - /* - * Unlock the pager (though there should be no one - * waiting for it). - */ - VS_UNLOCK(vs); - - /* Lock out paging segment removal for the duration of this */ - /* call. 
We are vulnerable to losing a paging segment we rely */ - /* on as soon as we remove ourselves from the VSL and unlock */ - - /* Keep our thread from blocking on attempt to trigger backing */ - /* store release */ - backing_store_release_trigger_disable += 1; - - /* - * Remove the memory object port association, and then - * the destroy the port itself. We must remove the object - * from the port list before deallocating the pager, - * because of default_pager_objects. - */ - vstruct_list_delete(vs); - VSL_UNLOCK(); - - ps_vstruct_dealloc(vs); - - VSL_LOCK(); - backing_store_release_trigger_disable -= 1; - if(backing_store_release_trigger_disable == 0) { - thread_wakeup((event_t)&backing_store_release_trigger_disable); - } - VSL_UNLOCK(); -} - -kern_return_t -dp_memory_object_data_request( - memory_object_t mem_obj, - memory_object_offset_t offset, - memory_object_cluster_size_t length, - __unused vm_prot_t protection_required, - memory_object_fault_info_t fault_info) -{ - vstruct_t vs; - kern_return_t kr = KERN_SUCCESS; - - GSTAT(global_stats.gs_pagein_calls++); - - - /* CDY at this moment vs_lookup panics when presented with the wrong */ - /* port. As we are expanding this pager to support user interfaces */ - /* this should be changed to return kern_failure */ - vs_lookup(mem_obj, vs); - vs_lock(vs); - - /* We are going to relax the strict sequencing here for performance */ - /* reasons. 
We can do this because we know that the read and */ - /* write threads are different and we rely on synchronization */ - /* of read and write requests at the cache memory_object level */ - /* break out wait_for_writers, all of this goes away when */ - /* we get real control of seqno with the new component interface */ - - if (vs->vs_writers != 0) { - /* you can't hold on to the seqno and go */ - /* to sleep like that */ - vs_unlock(vs); /* bump internal count of seqno */ - VS_LOCK(vs); - while (vs->vs_writers != 0) { - default_pager_wait_write++; - vs->vs_waiting_write = TRUE; - assert_wait(&vs->vs_writers, THREAD_UNINT); - VS_UNLOCK(vs); - thread_block(THREAD_CONTINUE_NULL); - VS_LOCK(vs); - vs_async_wait(vs); - } - if(vs->vs_control == MEMORY_OBJECT_CONTROL_NULL) { - VS_UNLOCK(vs); - return KERN_FAILURE; - } - vs_start_read(vs); - VS_UNLOCK(vs); - } else { - vs_start_read(vs); - vs_unlock(vs); - } - - /* - * Request must be on a page boundary and a multiple of pages. - */ - if ((offset & vm_page_mask) != 0 || (length & vm_page_mask) != 0) - Panic("bad alignment"); - - assert((dp_offset_t) offset == offset); - kr = pvs_cluster_read(vs, (dp_offset_t) offset, length, fault_info); - - /* Regular data requests have a non-zero length and always return KERN_SUCCESS. - Their actual success is determined by the fact that they provide a page or not, - i.e whether we call upl_commit() or upl_abort(). A length of 0 means that the - caller is only asking if the pager has a copy of that page or not. The answer to - that question is provided by the return value. KERN_SUCCESS means that the pager - does have that page. - */ - if(length) { - kr = KERN_SUCCESS; - } - - vs_finish_read(vs); - - return kr; -} - -/* - * memory_object_data_initialize: check whether we already have each page, and - * write it if we do not. The implementation is far from optimized, and - * also assumes that the default_pager is single-threaded. 
- */ -/* It is questionable whether or not a pager should decide what is relevant */ -/* and what is not in data sent from the kernel. Data initialize has been */ -/* changed to copy back all data sent to it in preparation for its eventual */ -/* merge with data return. It is the kernel that should decide what pages */ -/* to write back. As of the writing of this note, this is indeed the case */ -/* the kernel writes back one page at a time through this interface */ - -kern_return_t -dp_memory_object_data_initialize( - memory_object_t mem_obj, - memory_object_offset_t offset, - memory_object_cluster_size_t size) -{ - vstruct_t vs; - - DP_DEBUG(DEBUG_MO_EXTERNAL, - ("mem_obj=0x%x,offset=0x%x,cnt=0x%x\n", - (int)mem_obj, (int)offset, (int)size)); - GSTAT(global_stats.gs_pages_init += atop_32(size)); - - vs_lookup(mem_obj, vs); - vs_lock(vs); - vs_start_write(vs); - vs_unlock(vs); - - /* - * Write the data via clustered writes. vs_cluster_write will - * loop if the address range specified crosses cluster - * boundaries. 
- */ - assert((upl_offset_t) offset == offset); - vs_cluster_write(vs, 0, (upl_offset_t)offset, size, FALSE, 0); - - vs_finish_write(vs); - - return KERN_SUCCESS; -} - -kern_return_t -dp_memory_object_data_unlock( - __unused memory_object_t mem_obj, - __unused memory_object_offset_t offset, - __unused memory_object_size_t size, - __unused vm_prot_t desired_access) -{ - Panic("dp_memory_object_data_unlock: illegal"); - return KERN_FAILURE; -} - - -/*ARGSUSED8*/ -kern_return_t -dp_memory_object_data_return( - memory_object_t mem_obj, - memory_object_offset_t offset, - memory_object_cluster_size_t size, - __unused memory_object_offset_t *resid_offset, - __unused int *io_error, - __unused boolean_t dirty, - __unused boolean_t kernel_copy, - __unused int upl_flags) -{ - vstruct_t vs; - - DP_DEBUG(DEBUG_MO_EXTERNAL, - ("mem_obj=0x%x,offset=0x%x,size=0x%x\n", - (int)mem_obj, (int)offset, (int)size)); - GSTAT(global_stats.gs_pageout_calls++); - - /* This routine is called by the pageout thread. The pageout thread */ - /* cannot be blocked by read activities unless the read activities */ - /* Therefore the grant of vs lock must be done on a try versus a */ - /* blocking basis. The code below relies on the fact that the */ - /* interface is synchronous. Should this interface be again async */ - /* for some type of pager in the future the pages will have to be */ - /* returned through a separate, asynchronous path. 
*/ - - vs_lookup(mem_obj, vs); - - default_pager_total++; - - /* might be unreachable if VS_TRY_LOCK is, by definition, always true */ - __unreachable_ok_push - if(!VS_TRY_LOCK(vs)) { - /* the call below will not be done by caller when we have */ - /* a synchronous interface */ - /* return KERN_LOCK_OWNED; */ - upl_t upl; - unsigned int page_list_count = 0; - memory_object_super_upl_request(vs->vs_control, - (memory_object_offset_t)offset, - size, size, - &upl, NULL, &page_list_count, - UPL_NOBLOCK | UPL_CLEAN_IN_PLACE - | UPL_NO_SYNC | UPL_COPYOUT_FROM); - upl_abort(upl,0); - upl_deallocate(upl); - return KERN_SUCCESS; - } - __unreachable_ok_pop - - if ((vs->vs_seqno != vs->vs_next_seqno++) - || (vs->vs_readers) - || (vs->vs_xfer_pending)) { - upl_t upl; - unsigned int page_list_count = 0; - - vs->vs_next_seqno--; - VS_UNLOCK(vs); - - /* the call below will not be done by caller when we have */ - /* a synchronous interface */ - /* return KERN_LOCK_OWNED; */ - memory_object_super_upl_request(vs->vs_control, - (memory_object_offset_t)offset, - size, size, - &upl, NULL, &page_list_count, - UPL_NOBLOCK | UPL_CLEAN_IN_PLACE - | UPL_NO_SYNC | UPL_COPYOUT_FROM); - upl_abort(upl,0); - upl_deallocate(upl); - return KERN_SUCCESS; - } - - if ((size % vm_page_size) != 0) - Panic("bad alignment"); - - vs_start_write(vs); - - - vs->vs_async_pending += 1; /* protect from backing store contraction */ - vs_unlock(vs); - - /* - * Write the data via clustered writes. vs_cluster_write will - * loop if the address range specified crosses cluster - * boundaries. 
- */ - assert((upl_offset_t) offset == offset); - vs_cluster_write(vs, 0, (upl_offset_t) offset, size, FALSE, 0); - - vs_finish_write(vs); - - /* temporary, need a finer lock based on cluster */ - - VS_LOCK(vs); - vs->vs_async_pending -= 1; /* release vs_async_wait */ - if (vs->vs_async_pending == 0 && vs->vs_waiting_async) { - vs->vs_waiting_async = FALSE; - VS_UNLOCK(vs); - thread_wakeup(&vs->vs_async_pending); - } else { - VS_UNLOCK(vs); - } - - - return KERN_SUCCESS; -} - -/* - * Routine: default_pager_memory_object_create - * Purpose: - * Handle requests for memory objects from the - * kernel. - * Notes: - * Because we only give out the default memory - * manager port to the kernel, we don't have to - * be so paranoid about the contents. - */ -kern_return_t -default_pager_memory_object_create( - __unused memory_object_default_t dmm, - vm_size_t new_size, - memory_object_t *new_mem_obj) -{ - vstruct_t vs; - - assert(dmm == default_pager_object); - - if ((dp_size_t) new_size != new_size) { - /* 32-bit overflow */ - return KERN_INVALID_ARGUMENT; - } - - vs = vs_object_create((dp_size_t) new_size); - if (vs == VSTRUCT_NULL) - return KERN_RESOURCE_SHORTAGE; - - vs->vs_next_seqno = 0; - - /* - * Set up associations between this memory object - * and this default_pager structure - */ - - vs->vs_pager_ops = &default_pager_ops; - vs->vs_pager_header.io_bits = IKOT_MEMORY_OBJECT; - - /* - * After this, other threads might receive requests - * for this memory object or find it in the port list. - */ - - vstruct_list_insert(vs); - *new_mem_obj = vs_to_mem_obj(vs); - return KERN_SUCCESS; -} - -/* - * Create an external object. 
- */ -kern_return_t -default_pager_object_create( - default_pager_t default_pager, - vm_size_t size, - memory_object_t *mem_objp) -{ - vstruct_t vs; - - if (default_pager != default_pager_object) - return KERN_INVALID_ARGUMENT; - - if ((dp_size_t) size != size) { - /* 32-bit overflow */ - return KERN_INVALID_ARGUMENT; - } - - vs = vs_object_create((dp_size_t) size); - if (vs == VSTRUCT_NULL) - return KERN_RESOURCE_SHORTAGE; - - /* - * Set up associations between the default pager - * and this vstruct structure - */ - vs->vs_pager_ops = &default_pager_ops; - vstruct_list_insert(vs); - *mem_objp = vs_to_mem_obj(vs); - return KERN_SUCCESS; -} - -kern_return_t -default_pager_objects( - default_pager_t default_pager, - default_pager_object_array_t *objectsp, - mach_msg_type_number_t *ocountp, - mach_port_array_t *portsp, - mach_msg_type_number_t *pcountp) -{ - vm_offset_t oaddr = 0; /* memory for objects */ - vm_size_t osize = 0; /* current size */ - default_pager_object_t * objects; - unsigned int opotential = 0; - - vm_map_copy_t pcopy = 0; /* copy handle for pagers */ - vm_size_t psize = 0; /* current size */ - memory_object_t * pagers; - unsigned int ppotential = 0; - - unsigned int actual; - unsigned int num_objects; - kern_return_t kr; - vstruct_t entry; - - if (default_pager != default_pager_object) - return KERN_INVALID_ARGUMENT; - - /* - * We will send no more than this many - */ - actual = vstruct_list.vsl_count; - - /* - * Out out-of-line port arrays are simply kalloc'ed. - */ - psize = vm_map_round_page(actual * sizeof (*pagers), - vm_map_page_mask(ipc_kernel_map)); - ppotential = (unsigned int) (psize / sizeof (*pagers)); - pagers = (memory_object_t *)kalloc(psize); - if (0 == pagers) - return KERN_RESOURCE_SHORTAGE; - - /* - * returned out of line data must be allocated out - * the ipc_kernel_map, wired down, filled in, and - * then "copied in" as if it had been sent by a - * user process. 
- */ - osize = vm_map_round_page(actual * sizeof (*objects), - vm_map_page_mask(ipc_kernel_map)); - opotential = (unsigned int) (osize / sizeof (*objects)); - kr = kmem_alloc(ipc_kernel_map, &oaddr, osize, VM_KERN_MEMORY_IPC); - if (KERN_SUCCESS != kr) { - kfree(pagers, psize); - return KERN_RESOURCE_SHORTAGE; - } - objects = (default_pager_object_t *)oaddr; - - - /* - * Now scan the list. - */ - - VSL_LOCK(); - - num_objects = 0; - queue_iterate(&vstruct_list.vsl_queue, entry, vstruct_t, vs_links) { - - memory_object_t pager; - vm_size_t size; - - if ((num_objects >= opotential) || - (num_objects >= ppotential)) { - - /* - * This should be rare. In any case, - * we will only miss recent objects, - * because they are added at the end. - */ - break; - } - - /* - * Avoid interfering with normal operations - */ - if (!VS_MAP_TRY_LOCK(entry)) - goto not_this_one; - size = ps_vstruct_allocated_size(entry); - VS_MAP_UNLOCK(entry); - - VS_LOCK(entry); - - /* - * We need a reference for our caller. Adding this - * reference through the linked list could race with - * destruction of the object. If we find the object - * has no references, just give up on it. 
- */ - VS_LOCK(entry); - if (entry->vs_references == 0) { - VS_UNLOCK(entry); - goto not_this_one; - } - pager = vs_to_mem_obj(entry); - dp_memory_object_reference(pager); - VS_UNLOCK(entry); - - /* the arrays are wired, so no deadlock worries */ - - objects[num_objects].dpo_object = (vm_offset_t) entry; - objects[num_objects].dpo_size = size; - pagers [num_objects++] = pager; - continue; - - not_this_one: - /* - * Do not return garbage - */ - objects[num_objects].dpo_object = (vm_offset_t) 0; - objects[num_objects].dpo_size = 0; - pagers[num_objects++] = MEMORY_OBJECT_NULL; - - } - - VSL_UNLOCK(); - - /* clear out any excess allocation */ - while (num_objects < opotential) { - objects[--opotential].dpo_object = (vm_offset_t) 0; - objects[opotential].dpo_size = 0; - } - while (num_objects < ppotential) { - pagers[--ppotential] = MEMORY_OBJECT_NULL; - } - - kr = vm_map_unwire(ipc_kernel_map, - vm_map_trunc_page(oaddr, - vm_map_page_mask(ipc_kernel_map)), - vm_map_round_page(oaddr + osize, - vm_map_page_mask(ipc_kernel_map)), - FALSE); - assert(KERN_SUCCESS == kr); - kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)oaddr, - (vm_map_size_t)(num_objects * sizeof(*objects)), TRUE, &pcopy); - assert(KERN_SUCCESS == kr); - - *objectsp = (default_pager_object_array_t)objects; - *ocountp = num_objects; - *portsp = (mach_port_array_t)pcopy; - *pcountp = num_objects; - - return KERN_SUCCESS; -} - -kern_return_t -default_pager_object_pages( - default_pager_t default_pager, - mach_port_t memory_object, - default_pager_page_array_t *pagesp, - mach_msg_type_number_t *countp) -{ - vm_offset_t addr = 0; /* memory for page offsets */ - vm_size_t size = 0; /* current memory size */ - vm_map_copy_t copy; - default_pager_page_t * pages = 0; - unsigned int potential; - unsigned int actual; - kern_return_t kr; - memory_object_t object; - - if (default_pager != default_pager_object) - return KERN_INVALID_ARGUMENT; - - object = (memory_object_t) memory_object; - - potential = 0; - for 
(;;) { - vstruct_t entry; - - VSL_LOCK(); - queue_iterate(&vstruct_list.vsl_queue, entry, vstruct_t, - vs_links) { - VS_LOCK(entry); - if (vs_to_mem_obj(entry) == object) { - VSL_UNLOCK(); - goto found_object; - } - VS_UNLOCK(entry); - } - VSL_UNLOCK(); - - /* did not find the object */ - if (0 != addr) - kmem_free(ipc_kernel_map, addr, size); - - return KERN_INVALID_ARGUMENT; - - found_object: - - if (!VS_MAP_TRY_LOCK(entry)) { - /* oh well bad luck */ - int wresult; - - VS_UNLOCK(entry); - - assert_wait_timeout((event_t)assert_wait_timeout, THREAD_UNINT, 1, 1000*NSEC_PER_USEC); - wresult = thread_block(THREAD_CONTINUE_NULL); - assert(wresult == THREAD_TIMED_OUT); - continue; - } - - actual = ps_vstruct_allocated_pages(entry, pages, potential); - VS_MAP_UNLOCK(entry); - VS_UNLOCK(entry); - - if (actual <= potential) - break; - - /* allocate more memory */ - if (0 != addr) - kmem_free(ipc_kernel_map, addr, size); - - size = vm_map_round_page(actual * sizeof (*pages), - vm_map_page_mask(ipc_kernel_map)); - kr = kmem_alloc(ipc_kernel_map, &addr, size, VM_KERN_MEMORY_IPC); - if (KERN_SUCCESS != kr) - return KERN_RESOURCE_SHORTAGE; - - pages = (default_pager_page_t *)addr; - potential = (unsigned int) (size / sizeof (*pages)); - } - - /* - * Clear unused memory. 
- */ - while (actual < potential) - pages[--potential].dpp_offset = 0; - - kr = vm_map_unwire(ipc_kernel_map, - vm_map_trunc_page(addr, - vm_map_page_mask(ipc_kernel_map)), - vm_map_round_page(addr + size, - vm_map_page_mask(ipc_kernel_map)), - FALSE); - assert(KERN_SUCCESS == kr); - kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)addr, - (vm_map_size_t)(actual * sizeof(*pages)), TRUE, &copy); - assert(KERN_SUCCESS == kr); - - - *pagesp = (default_pager_page_array_t)copy; - *countp = actual; - return KERN_SUCCESS; -} diff --git a/osfmk/device/Makefile b/osfmk/device/Makefile index a788cd1c3..c6e070a05 100644 --- a/osfmk/device/Makefile +++ b/osfmk/device/Makefile @@ -32,7 +32,7 @@ EXPORT_MI_DIR = device # # Build path -# +# INCFLAGS_MAKEFILE= -I.. MIGKSFLAGS = -DMACH_KERNEL_PRIVATE -DKERNEL_SERVER=1 @@ -45,7 +45,7 @@ COMP_FILES = ${DEVICE_FILES} do_build_all:: $(COMP_FILES) ${DEVICE_FILES}: device.defs - @echo MIG $@ + @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)" $(_v)${MIG} ${MIGFLAGS} ${MIGKSFLAGS} \ -header /dev/null \ -user /dev/null \ diff --git a/osfmk/device/iokit_rpc.c b/osfmk/device/iokit_rpc.c index e855d1a0a..4e5da248b 100644 --- a/osfmk/device/iokit_rpc.c +++ b/osfmk/device/iokit_rpc.c @@ -141,7 +141,7 @@ MIGEXTERN io_object_t iokit_lookup_object_port( ipc_port_t port) { - register io_object_t obj; + io_object_t obj; if (!IP_VALID(port)) return (NULL); @@ -163,7 +163,7 @@ MIGEXTERN io_object_t iokit_lookup_connect_port( ipc_port_t port) { - register io_object_t obj; + io_object_t obj; if (!IP_VALID(port)) return (NULL); @@ -258,8 +258,8 @@ MIGEXTERN ipc_port_t iokit_make_object_port( io_object_t obj ) { - register ipc_port_t port; - register ipc_port_t sendPort; + ipc_port_t port; + ipc_port_t sendPort; if( obj == NULL) return IP_NULL; @@ -280,8 +280,8 @@ MIGEXTERN ipc_port_t iokit_make_connect_port( io_object_t obj ) { - register ipc_port_t port; - register ipc_port_t sendPort; + ipc_port_t port; + ipc_port_t sendPort; if( obj == NULL) return 
IP_NULL; diff --git a/osfmk/device/subrs.c b/osfmk/device/subrs.c index c9556819f..e36267988 100644 --- a/osfmk/device/subrs.c +++ b/osfmk/device/subrs.c @@ -557,17 +557,17 @@ STRDUP(const char *string, int type) /* * Return TRUE(1) if string 2 is a prefix of string 1. - */ -int -strprefix(register const char *s1, register const char *s2) -{ - register int c; - - while ((c = *s2++) != '\0') { - if (c != *s1++) - return (0); - } - return (1); + */ +int +strprefix(const char *s1, const char *s2) +{ + int c; + + while ((c = *s2++) != '\0') { + if (c != *s1++) + return (0); + } + return (1); } char * diff --git a/osfmk/gssd/Makefile b/osfmk/gssd/Makefile index 89d2c87db..83666b479 100644 --- a/osfmk/gssd/Makefile +++ b/osfmk/gssd/Makefile @@ -12,9 +12,9 @@ PRIVATE_DATAFILES = gssd_mach_types.h ${MIG_DEFS} INSTALL_MI_LIST = -INSTALL_MI_LCL_LIST = ${PRIVATE_DATAFILES} +INSTALL_MI_LCL_LIST = ${PRIVATE_DATAFILES} -INSTALL_MI_GEN_LIST = +INSTALL_MI_GEN_LIST = INSTALL_MI_DIR = gssd @@ -27,7 +27,7 @@ EXPORT_MI_DIR = gssd # # Build path -# +# INCFLAGS_MAKEFILE= -I.. 
MIGKUFLAGS = -DMACH_KERNEL_PRIVATE -DKERNEL_USER=1 -maxonstack 1024 @@ -41,7 +41,7 @@ COMP_FILES = ${MIG_KUSRC} do_build_all:: $(COMP_FILES) ${MIG_KUSRC} : gssd_mach.defs - @echo MIG $@ + @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)" $(_v)${MIG} ${MIGFLAGS} ${MIGKUFLAGS} \ -user gssd_mach.c \ -header gssd_mach.h \ diff --git a/osfmk/gssd/gssd_mach.defs b/osfmk/gssd/gssd_mach.defs index 0c0a650e1..774f28e35 100644 --- a/osfmk/gssd/gssd_mach.defs +++ b/osfmk/gssd/gssd_mach.defs @@ -44,6 +44,7 @@ type gssd_verifier = uint64_t; type gssd_gid_list = array [*:16] of uint32_t; type gssd_ctx = uint64_t; type gssd_cred = uint64_t; +type gssd_etype_list = array [*:64] of int32_t; subsystem #if KERNEL_USER @@ -139,6 +140,29 @@ routine mach_gss_accept_sec_context_v2( out minor_stat : uint32_t ); +routine mach_gss_init_sec_context_v3( + server : mach_port_t; + in mech : gssd_mechtype; + in intoken : gssd_byte_buffer; + in uid : uint32_t; + in clnt_nt : gssd_nametype; + in clnt_princ : gssd_byte_buffer; + in svc_nt : gssd_nametype; + in svc_princ : gssd_byte_buffer; + in flags : uint32_t; + in etypes : gssd_etype_list; + inout gssd_flags : uint32_t; + inout context : gssd_ctx; + inout cred_handle : gssd_cred; + ServerAuditToken atoken : audit_token_t; + out ret_flags : uint32_t; + out key : gssd_byte_buffer, dealloc; + out outtoken : gssd_byte_buffer, dealloc; + out displayname : gssd_dstring; + out major_stat : uint32_t; + out minor_stat : uint32_t +); + routine mach_gss_hold_cred( server : mach_port_t; in mech : gssd_mechtype; diff --git a/osfmk/gssd/gssd_mach_types.h b/osfmk/gssd/gssd_mach_types.h index 6015ea89c..e1ba9a829 100644 --- a/osfmk/gssd/gssd_mach_types.h +++ b/osfmk/gssd/gssd_mach_types.h @@ -61,6 +61,7 @@ typedef uint8_t *gssd_byte_buffer; typedef uint32_t *gssd_gid_list; typedef uint64_t gssd_ctx; typedef uint64_t gssd_cred; +typedef int32_t *gssd_etype_list; /* The following need to correspond to GSS_C_*_FLAG in gssapi.h */ #define GSSD_DELEG_FLAG 1 @@ 
-81,6 +82,6 @@ typedef uint64_t gssd_cred; #define GSSD_RESTART 16 // Destroy the supplied context and start over #define GSSD_NFS_1DES 64 // Only get single DES session keys #define GSSD_WIN2K_HACK 128 // Hack for Win2K - +#define GSSD_LUCID_CONTEXT 256 // Export Lucid context #endif /* _GSSD_MACH_TYPES_H_ */ diff --git a/osfmk/i386/AT386/model_dep.c b/osfmk/i386/AT386/model_dep.c index d4a138af8..63daa8019 100644 --- a/osfmk/i386/AT386/model_dep.c +++ b/osfmk/i386/AT386/model_dep.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2012 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -105,6 +105,7 @@ #include #include +#include #include #include #include @@ -133,7 +134,6 @@ static void machine_conf(void); void panic_print_symbol_name(vm_address_t search); -extern boolean_t init_task_died; extern const char version[]; extern char osversion[]; extern int max_unsafe_quanta; @@ -807,6 +807,7 @@ int reset_mem_on_reboot = 1; /* * Halt the system or reboot. 
*/ +__attribute__((noreturn)) void halt_all_cpus(boolean_t reboot) { @@ -882,8 +883,13 @@ void DebuggerWithContext( __unused unsigned int reason, __unused void *ctx, - const char *message) + const char *message, + uint64_t debugger_options_mask) { + if (debugger_options_mask != DEBUGGER_OPTION_NONE) { + kprintf("debugger options (%llx) not supported for desktop.\n", debugger_options_mask); + } + Debugger(message); } @@ -894,8 +900,7 @@ Debugger( unsigned long pi_size = 0; void *stackptr; int cn = cpu_number(); - task_t task = current_task(); - int task_pid = pid_from_task(task); + boolean_t old_doprnt_hide_pointers = doprnt_hide_pointers; hw_atomic_add(&debug_mode, 1); @@ -928,7 +933,7 @@ Debugger( __asm__ volatile("movq %%rbp, %0" : "=m" (stackptr)); /* Print backtrace - callee is internally synchronized */ - if (task_pid == 1 && (init_task_died)) { + if (strncmp(panicstr, LAUNCHD_CRASHED_PREFIX, strlen(LAUNCHD_CRASHED_PREFIX)) == 0) { /* Special handling of launchd died panics */ print_launchd_info(); } else { @@ -993,7 +998,7 @@ Debugger( } } - if (!panicDebugging) { + if (!panicDebugging && !kdp_has_polled_corefile()) { unsigned cnum; /* Clear the MP rendezvous function lock, in the event * that a panic occurred while in that codepath. 
@@ -1344,9 +1349,10 @@ print_tasks_user_threads(task_t task) pmap = get_task_pmap(task); savestate = get_user_regs(thread); rbp = savestate->ss_64.rbp; + kdb_printf("\t0x%016llx\n", savestate->ss_64.isf.rip); print_one_backtrace(pmap, (vm_offset_t)rbp, cur_marker, TRUE, TRUE); kdb_printf("\n"); - } + } } void diff --git a/osfmk/i386/Diagnostics.c b/osfmk/i386/Diagnostics.c index 50dad3970..bd2ca2140 100644 --- a/osfmk/i386/Diagnostics.c +++ b/osfmk/i386/Diagnostics.c @@ -307,7 +307,7 @@ diagCall64(x86_saved_state_t * state) rval = 1; } break; -#if DEBUG +#if DEVELOPMENT || DEBUG case dgGzallocTest: { (void) ml_set_interrupts_enabled(TRUE); @@ -321,7 +321,7 @@ diagCall64(x86_saved_state_t * state) break; #endif -#if PERMIT_PERMCHECK +#if DEVELOPMENT || DEBUG case dgPermCheck: { (void) ml_set_interrupts_enabled(TRUE); @@ -330,7 +330,7 @@ diagCall64(x86_saved_state_t * state) (void) ml_set_interrupts_enabled(FALSE); } break; -#endif /* PERMIT_PERMCHECK */ +#endif /* DEVELOPMENT || DEBUG */ default: /* Handle invalid ones */ rval = 0; /* Return an exception */ } diff --git a/osfmk/i386/Makefile b/osfmk/i386/Makefile index f1873bb62..b01634828 100644 --- a/osfmk/i386/Makefile +++ b/osfmk/i386/Makefile @@ -6,7 +6,7 @@ export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir include $(MakeInc_cmd) include $(MakeInc_def) -EXPORT_ONLY_FILES = \ +EXPORT_ONLY_FILES = \ apic.h \ asm.h \ atomic.h \ @@ -59,5 +59,3 @@ EXPORT_MD_DIR = i386 include $(MakeInc_rule) include $(MakeInc_dir) - - diff --git a/osfmk/i386/acpi.c b/osfmk/i386/acpi.c index 19d39a9c8..fce3396d0 100644 --- a/osfmk/i386/acpi.c +++ b/osfmk/i386/acpi.c @@ -234,7 +234,13 @@ acpi_sleep_kernel(acpi_sleep_callback func, void *refcon) acpi_sleep_cpu(func, refcon); #endif - start = mach_absolute_time(); + acpi_wake_abstime = mach_absolute_time(); + /* Rebase TSC->absolute time conversion, using timestamp + * recorded before sleep. 
+ */ + rtc_nanotime_init(acpi_sleep_abstime); + acpi_wake_postrebase_abstime = start = mach_absolute_time(); + assert(start >= acpi_sleep_abstime); x86_64_post_sleep(old_cr3); @@ -302,19 +308,14 @@ acpi_sleep_kernel(acpi_sleep_callback func, void *refcon) #endif elapsed += mach_absolute_time() - start; - acpi_wake_abstime = mach_absolute_time(); - - /* let the realtime clock reset */ - rtc_sleep_wakeup(acpi_sleep_abstime); - acpi_wake_postrebase_abstime = mach_absolute_time(); - assert(mach_absolute_time() >= acpi_sleep_abstime); + rtc_decrementer_configure(); kdebug_enable = save_kdebug_enable; if (kdebug_enable == 0) { if (wake_nkdbufs) { start = mach_absolute_time(); - start_kern_tracing(wake_nkdbufs, TRUE); + kdebug_trace_start(wake_nkdbufs, NULL, TRUE); elapsed_trace_start += mach_absolute_time() - start; } } @@ -448,7 +449,7 @@ acpi_idle_kernel(acpi_sleep_callback func, void *refcon) /* Like S3 sleep, turn on tracing if trace_wake boot-arg is present */ if (kdebug_enable == 0) { if (wake_nkdbufs) - start_kern_tracing(wake_nkdbufs, TRUE); + kdebug_trace_start(wake_nkdbufs, NULL, TRUE); } IOCPURunPlatformActiveActions(); diff --git a/osfmk/i386/atomic.h b/osfmk/i386/atomic.h index 2cbeae68b..22ca1fd50 100644 --- a/osfmk/i386/atomic.h +++ b/osfmk/i386/atomic.h @@ -49,5 +49,18 @@ #endif +#ifdef ATOMIC_PRIVATE + +static boolean_t +atomic_compare_exchange(uintptr_t *target, uintptr_t oldval, uintptr_t newval, + enum memory_order ord, boolean_t wait) +{ + (void)wait; + return __c11_atomic_compare_exchange_strong((_Atomic uintptr_t *)target, &oldval, newval, ord, memory_order_relaxed); +} + +#endif // ATOMIC_PRIVATE + + #endif // _I386_ATOMIC_H_ diff --git a/osfmk/i386/bit_routines.h b/osfmk/i386/bit_routines.h index adf0d9c5e..2154ec763 100644 --- a/osfmk/i386/bit_routines.h +++ b/osfmk/i386/bit_routines.h @@ -108,7 +108,7 @@ static inline char xchgb(volatile char * cp, char new) { - register char old = new; + char old = new; __asm__ volatile (" xchgb %0,%2" : "=q" 
(old) : diff --git a/osfmk/i386/bsd_i386.c b/osfmk/i386/bsd_i386.c index a70d68ae3..4e1cf5634 100644 --- a/osfmk/i386/bsd_i386.c +++ b/osfmk/i386/bsd_i386.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -95,7 +95,8 @@ thread_userstack( thread_state_t tstate, __unused unsigned int count, mach_vm_offset_t *user_stack, - int *customstack + int *customstack, + __unused boolean_t is64bit ) { if (customstack) @@ -153,10 +154,10 @@ thread_userstack( */ kern_return_t thread_userstackdefault( - thread_t thread, - mach_vm_offset_t *default_user_stack) + mach_vm_offset_t *default_user_stack, + boolean_t is64bit) { - if (thread_is_64bit(thread)) { + if (is64bit) { *default_user_stack = VM_USRSTACK64; } else { *default_user_stack = VM_USRSTACK32; @@ -238,8 +239,7 @@ thread_set_child(thread_t child, int pid) extern long fuword(vm_offset_t); - - +__attribute__((noreturn)) void machdep_syscall(x86_saved_state_t *state) { @@ -321,7 +321,7 @@ machdep_syscall(x86_saved_state_t *state) /* NOTREACHED */ } - +__attribute__((noreturn)) void machdep_syscall64(x86_saved_state_t *state) { @@ -406,6 +406,7 @@ __private_extern__ void mach_call_munger(x86_saved_state_t *state); extern const char *mach_syscall_name_table[]; +__attribute__((noreturn)) void mach_call_munger(x86_saved_state_t *state) { @@ -494,6 +495,7 @@ mach_call_munger(x86_saved_state_t *state) __private_extern__ void mach_call_munger64(x86_saved_state_t *regs); +__attribute__((noreturn)) void mach_call_munger64(x86_saved_state_t *state) { @@ -693,21 +695,17 @@ thread_setsinglestep(thread_t thread, int on) return (KERN_SUCCESS); } - - -/* XXX this should be a struct savearea so that CHUD will work better on x86 */ void * -find_user_regs(thread_t thread) +get_user_regs(thread_t th) { - pal_register_cache_state(thread, DIRTY); - return USER_STATE(thread); + pal_register_cache_state(th, 
DIRTY); + return(USER_STATE(th)); } void * -get_user_regs(thread_t th) +find_user_regs(thread_t thread) { - pal_register_cache_state(th, DIRTY); - return(USER_STATE(th)); + return get_user_regs(thread); } #if CONFIG_DTRACE diff --git a/osfmk/i386/commpage/commpage.c b/osfmk/i386/commpage/commpage.c index 8bc648ca8..6dae08567 100644 --- a/osfmk/i386/commpage/commpage.c +++ b/osfmk/i386/commpage/commpage.c @@ -310,7 +310,6 @@ commpage_init_cpu_capabilities( void ) CPUID_LEAF7_FEATURE_MPX); setif(bits, kHasSGX, cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_SGX); - uint64_t misc_enable = rdmsr64(MSR_IA32_MISC_ENABLE); setif(bits, kHasENFSTRG, (misc_enable & 1ULL) && (cpuid_leaf7_features() & @@ -325,7 +324,7 @@ commpage_init_cpu_capabilities( void ) static void commpage_mach_approximate_time_init(void) { - char *cp = commPagePtr32; + char *cp = commPagePtr32; uint8_t supported; #ifdef CONFIG_MACH_APPROXIMATE_TIME @@ -334,17 +333,32 @@ commpage_mach_approximate_time_init(void) supported = 0; #endif if ( cp ) { - cp += (_COMM_PAGE_APPROX_TIME_SUPPORTED - _COMM_PAGE32_BASE_ADDRESS); + cp += (_COMM_PAGE_APPROX_TIME_SUPPORTED - _COMM_PAGE32_BASE_ADDRESS); *(boolean_t *)cp = supported; } - cp = commPagePtr64; + + cp = commPagePtr64; if ( cp ) { - cp += (_COMM_PAGE_APPROX_TIME_SUPPORTED - _COMM_PAGE32_START_ADDRESS); + cp += (_COMM_PAGE_APPROX_TIME_SUPPORTED - _COMM_PAGE32_START_ADDRESS); *(boolean_t *)cp = supported; } commpage_update_mach_approximate_time(0); } +static void +commpage_mach_continuous_time_init(void) +{ + commpage_update_mach_continuous_time(0); +} + +static void +commpage_boottime_init(void) +{ + clock_sec_t secs; + clock_usec_t microsecs; + clock_get_boottime_microtime(&secs, &microsecs); + commpage_update_boottime(secs * USEC_PER_SEC + microsecs); +} uint64_t _get_cpu_capabilities(void) @@ -487,8 +501,10 @@ commpage_populate( void ) commpage_update_active_cpus(); commpage_mach_approximate_time_init(); + commpage_mach_continuous_time_init(); + 
commpage_boottime_init(); rtc_nanotime_init_commpage(); - commpage_update_kdebug_enable(); + commpage_update_kdebug_state(); #if CONFIG_ATM commpage_update_atm_diagnostic_config(atm_get_diagnostic_config()); #endif @@ -724,13 +740,15 @@ commpage_update_active_cpus(void) } /* - * Update the commpage data with the value of the "kdebug_enable" - * global so that userspace can avoid trapping into the kernel - * for kdebug_trace() calls. Serialization is handled - * by the caller in bsd/kern/kdebug.c. + * Update the commpage with current kdebug state. This currently has bits for + * global trace state, and typefilter enablement. It is likely additional state + * will be tracked in the future. + * + * INVARIANT: This value will always be 0 if global tracing is disabled. This + * allows simple guard tests of "if (*_COMM_PAGE_KDEBUG_ENABLE) { ... }" */ void -commpage_update_kdebug_enable(void) +commpage_update_kdebug_state(void) { volatile uint32_t *saved_data_ptr; char *cp; @@ -739,14 +757,14 @@ commpage_update_kdebug_enable(void) if (cp) { cp += (_COMM_PAGE_KDEBUG_ENABLE - _COMM_PAGE32_BASE_ADDRESS); saved_data_ptr = (volatile uint32_t *)cp; - *saved_data_ptr = kdebug_enable; + *saved_data_ptr = kdebug_commpage_state(); } cp = commPagePtr64; - if ( cp ) { + if (cp) { cp += (_COMM_PAGE_KDEBUG_ENABLE - _COMM_PAGE32_START_ADDRESS); saved_data_ptr = (volatile uint32_t *)cp; - *saved_data_ptr = kdebug_enable; + *saved_data_ptr = kdebug_commpage_state(); } } @@ -812,6 +830,40 @@ commpage_update_mach_approximate_time(uint64_t abstime) #endif } +void +commpage_update_mach_continuous_time(uint64_t sleeptime) +{ + char *cp; + cp = commPagePtr32; + if (cp) { + cp += (_COMM_PAGE_CONT_TIMEBASE - _COMM_PAGE32_START_ADDRESS); + *(uint64_t *)cp = sleeptime; + } + + cp = commPagePtr64; + if (cp) { + cp += (_COMM_PAGE_CONT_TIMEBASE - _COMM_PAGE32_START_ADDRESS); + *(uint64_t *)cp = sleeptime; + } +} + +void +commpage_update_boottime(uint64_t boottime) +{ + char *cp; + cp = commPagePtr32; 
+ if (cp) { + cp += (_COMM_PAGE_BOOTTIME_USEC - _COMM_PAGE32_START_ADDRESS); + *(uint64_t *)cp = boottime; + } + + cp = commPagePtr64; + if (cp) { + cp += (_COMM_PAGE_BOOTTIME_USEC - _COMM_PAGE32_START_ADDRESS); + *(uint64_t *)cp = boottime; + } +} + extern user32_addr_t commpage_text32_location; extern user64_addr_t commpage_text64_location; diff --git a/osfmk/i386/commpage/commpage.h b/osfmk/i386/commpage/commpage.h index 6f2a3418d..45c001b41 100644 --- a/osfmk/i386/commpage/commpage.h +++ b/osfmk/i386/commpage/commpage.h @@ -146,7 +146,9 @@ extern void commpage_set_spin_count(unsigned int count); extern void commpage_sched_gen_inc(void); extern void commpage_update_active_cpus(void); extern void commpage_update_mach_approximate_time(uint64_t abstime); -extern void commpage_update_kdebug_enable(void); +extern void commpage_update_mach_continuous_time(uint64_t sleeptime); +extern void commpage_update_boottime(uint64_t boottime_usec); +extern void commpage_update_kdebug_state(void); extern void commpage_update_atm_diagnostic_config(uint32_t); extern uint32_t commpage_is_in_pfz32(uint32_t); diff --git a/osfmk/i386/cpu_capabilities.h b/osfmk/i386/cpu_capabilities.h index 9f33188c5..868e1b4fd 100644 --- a/osfmk/i386/cpu_capabilities.h +++ b/osfmk/i386/cpu_capabilities.h @@ -82,7 +82,7 @@ __BEGIN_DECLS extern uint64_t _get_cpu_capabilities( void ); __END_DECLS -inline static +__inline static int _NumCPUs( void ) { return (int) (_get_cpu_capabilities() & kNumCPUs) >> kNumCPUsShift; @@ -200,10 +200,14 @@ int _NumCPUs( void ) #define _COMM_PAGE_GTOD_GENERATION (_COMM_PAGE_START_ADDRESS+0x06c) /* used by gettimeofday() */ #define _COMM_PAGE_GTOD_NS_BASE (_COMM_PAGE_START_ADDRESS+0x070) /* used by gettimeofday() */ #define _COMM_PAGE_GTOD_SEC_BASE (_COMM_PAGE_START_ADDRESS+0x078) /* used by gettimeofday() */ + /* NOTE: APPROX_TIME must be aligned to 64-byte cache line size: */ #define _COMM_PAGE_APPROX_TIME (_COMM_PAGE_START_ADDRESS+0x080) /* used by mach_approximate_time() 
*/ #define _COMM_PAGE_APPROX_TIME_SUPPORTED (_COMM_PAGE_START_ADDRESS+0x088) /* used by mach_approximate_time() */ +/* Align following entries to next cache line */ +#define _COMM_PAGE_CONT_TIMEBASE (_COMM_PAGE_START_ADDRESS+0x0C0) /* used by mach_continuous_time() */ +#define _COMM_PAGE_BOOTTIME_USEC (_COMM_PAGE_START_ADDRESS+0x0C8) /* uint64_t boottime */ #define _COMM_PAGE_END (_COMM_PAGE_START_ADDRESS+0xfff) /* end of common page */ diff --git a/osfmk/i386/cpu_data.h b/osfmk/i386/cpu_data.h index 466d62f23..057fc5b1c 100644 --- a/osfmk/i386/cpu_data.h +++ b/osfmk/i386/cpu_data.h @@ -107,6 +107,13 @@ typedef uint8_t pcid_ref_t; #define CPU_RTIME_BINS (12) #define CPU_ITIME_BINS (CPU_RTIME_BINS) +#define MAXPLFRAMES (32) +typedef struct { + boolean_t pltype; + int plevel; + uint64_t plbt[MAXPLFRAMES]; +} plrecord_t; + /* * Per-cpu data. * @@ -155,6 +162,7 @@ typedef struct cpu_data volatile task_map_t cpu_task_map; volatile addr64_t cpu_task_cr3; addr64_t cpu_kernel_cr3; + boolean_t cpu_pagezero_mapped; cpu_uber_t cpu_uber; void *cpu_chud; void *cpu_console_buf; @@ -192,6 +200,7 @@ typedef struct cpu_data uint32_t cpu_pmap_pcid_enabled; pcid_t cpu_active_pcid; pcid_t cpu_last_pcid; + pcid_t cpu_kernel_pcid; volatile pcid_ref_t *cpu_pmap_pcid_coherentp; volatile pcid_ref_t *cpu_pmap_pcid_coherentp_kernel; #define PMAP_PCID_MAX_PCID (0x1000) @@ -243,7 +252,12 @@ typedef struct cpu_data int cpu_threadtype; boolean_t cpu_iflag; boolean_t cpu_boot_complete; - int cpu_hibernate; + int cpu_hibernate; +#define MAX_PREEMPTION_RECORDS (128) +#if DEVELOPMENT || DEBUG + int cpu_plri; + plrecord_t plrecords[MAX_PREEMPTION_RECORDS]; +#endif } cpu_data_t; extern cpu_data_t *cpu_data_ptr[]; @@ -351,25 +365,140 @@ get_cpu_phys_number(void) CPU_DATA_GET(cpu_phys_number,int) } +static inline cpu_data_t * +current_cpu_datap(void) { + CPU_DATA_GET(cpu_this, cpu_data_t *); +} + +/* + * Facility to diagnose preemption-level imbalances, which are otherwise + * challenging to debug. 
On each operation that enables or disables preemption, + * we record a backtrace into a per-CPU ring buffer, along with the current + * preemption level and operation type. Thus, if an imbalance is observed, + * one can examine these per-CPU records to determine which codepath failed + * to re-enable preemption, enabled premption without a corresponding + * disablement etc. The backtracer determines which stack is currently active, + * and uses that to perform bounds checks on unterminated stacks. + * To enable, sysctl -w machdep.pltrace=1 on DEVELOPMENT or DEBUG kernels (DRK '15) + * The bounds check currently doesn't account for non-default thread stack sizes. + */ +#if DEVELOPMENT || DEBUG +static inline void pltrace_bt(uint64_t *rets, int maxframes, uint64_t stacklo, uint64_t stackhi) { + uint64_t *cfp = (uint64_t *) __builtin_frame_address(0); + int plbtf; + + assert(stacklo !=0 && stackhi !=0); + + for (plbtf = 0; plbtf < maxframes; plbtf++) { + if (((uint64_t)cfp == 0) || (((uint64_t)cfp < stacklo) || ((uint64_t)cfp > stackhi))) { + rets[plbtf] = 0; + continue; + } + rets[plbtf] = *(cfp + 1); + cfp = (uint64_t *) (*cfp); + } +} + + +extern uint32_t low_intstack[]; /* bottom */ +extern uint32_t low_eintstack[]; /* top */ +extern char mp_slave_stack[PAGE_SIZE]; + +static inline void pltrace_internal(boolean_t enable) { + cpu_data_t *cdata = current_cpu_datap(); + int cpli = cdata->cpu_preemption_level; + int cplrecord = cdata->cpu_plri; + uint64_t kstackb, kstackt, *plbts; + + assert(cpli >= 0); + + cdata->plrecords[cplrecord].pltype = enable; + cdata->plrecords[cplrecord].plevel = cpli; + + plbts = &cdata->plrecords[cplrecord].plbt[0]; + + cplrecord++; + + if (cplrecord >= MAX_PREEMPTION_RECORDS) { + cplrecord = 0; + } + + cdata->cpu_plri = cplrecord; + /* Obtain the 'current' program counter, initial backtrace + * element. 
This will also indicate if we were unable to + * trace further up the stack for some reason + */ + __asm__ volatile("leaq 1f(%%rip), %%rax; mov %%rax, %0\n1:" + : "=m" (plbts[0]) + : + : "rax"); + + + thread_t cplthread = cdata->cpu_active_thread; + if (cplthread) { + uintptr_t csp; + __asm__ __volatile__ ("movq %%rsp, %0": "=r" (csp):); + /* Determine which stack we're on to populate stack bounds. + * We don't need to trace across stack boundaries for this + * routine. + */ + kstackb = cdata->cpu_active_stack; + kstackt = kstackb + KERNEL_STACK_SIZE; + if (csp < kstackb || csp > kstackt) { + kstackt = cdata->cpu_kernel_stack; + kstackb = kstackb - KERNEL_STACK_SIZE; + if (csp < kstackb || csp > kstackt) { + kstackt = cdata->cpu_int_stack_top; + kstackb = kstackt - INTSTACK_SIZE; + if (csp < kstackb || csp > kstackt) { + kstackt = (uintptr_t)low_eintstack; + kstackb = (uintptr_t)low_eintstack - INTSTACK_SIZE; + if (csp < kstackb || csp > kstackt) { + kstackb = (uintptr_t) mp_slave_stack; + kstackt = (uintptr_t) mp_slave_stack + PAGE_SIZE; + } + } + } + } + + if (kstackb) { + pltrace_bt(&plbts[1], MAXPLFRAMES - 1, kstackb, kstackt); + } + } +} + +extern int plctrace_enabled; +#endif /* DEVELOPMENT || DEBUG */ + +static inline void pltrace(boolean_t plenable) { +#if DEVELOPMENT || DEBUG + if (__improbable(plctrace_enabled != 0)) { + pltrace_internal(plenable); + } +#else + (void)plenable; +#endif +} static inline void -disable_preemption(void) -{ +disable_preemption_internal(void) { + assert(get_preemption_level() >= 0); + #if defined(__clang__) cpu_data_t GS_RELATIVE *cpu_data = (cpu_data_t GS_RELATIVE *)0UL; cpu_data->cpu_preemption_level++; #else __asm__ volatile ("incl %%gs:%P0" - : - : "i" (offsetof(cpu_data_t, cpu_preemption_level))); + : + : "i" (offsetof(cpu_data_t, cpu_preemption_level))); #endif + pltrace(FALSE); } static inline void -enable_preemption(void) -{ +enable_preemption_internal(void) { assert(get_preemption_level() > 0); - + pltrace(TRUE); #if 
defined(__clang__) cpu_data_t GS_RELATIVE *cpu_data = (cpu_data_t GS_RELATIVE *)0UL; if (0 == --cpu_data->cpu_preemption_level) @@ -390,6 +519,7 @@ enable_preemption_no_check(void) { assert(get_preemption_level() > 0); + pltrace(TRUE); #if defined(__clang__) cpu_data_t GS_RELATIVE *cpu_data = (cpu_data_t GS_RELATIVE *)0UL; cpu_data->cpu_preemption_level--; @@ -401,33 +531,53 @@ enable_preemption_no_check(void) #endif } +static inline void +_enable_preemption_no_check(void) { + enable_preemption_no_check(); +} + static inline void mp_disable_preemption(void) { - disable_preemption(); + disable_preemption_internal(); } static inline void -mp_enable_preemption(void) +_mp_disable_preemption(void) { - enable_preemption(); + disable_preemption_internal(); } static inline void -mp_enable_preemption_no_check(void) +mp_enable_preemption(void) { + enable_preemption_internal(); +} + +static inline void +_mp_enable_preemption(void) { + enable_preemption_internal(); +} + +static inline void +mp_enable_preemption_no_check(void) { enable_preemption_no_check(); } -static inline cpu_data_t * -current_cpu_datap(void) -{ - CPU_DATA_GET(cpu_this, cpu_data_t *); +static inline void +_mp_enable_preemption_no_check(void) { + enable_preemption_no_check(); } +#ifdef XNU_KERNEL_PRIVATE +#define disable_preemption() disable_preemption_internal() +#define enable_preemption() enable_preemption_internal() +#define MACHINE_PREEMPTION_MACROS (1) +#endif + + static inline cpu_data_t * -cpu_datap(int cpu) -{ +cpu_datap(int cpu) { return cpu_data_ptr[cpu]; } diff --git a/osfmk/i386/cpu_threads.c b/osfmk/i386/cpu_threads.c index 2e87fae6a..e58a9369e 100644 --- a/osfmk/i386/cpu_threads.c +++ b/osfmk/i386/cpu_threads.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2010 Apple Inc. All rights reserved. + * Copyright (c) 2003-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -970,6 +970,7 @@ cpu_thread_init(void) * Called for a cpu to halt permanently * (as opposed to halting and expecting an interrupt to awaken it). */ +__attribute__((noreturn)) void cpu_thread_halt(void) { diff --git a/osfmk/i386/cpuid.c b/osfmk/i386/cpuid.c index 584012965..db82b6eb3 100644 --- a/osfmk/i386/cpuid.c +++ b/osfmk/i386/cpuid.c @@ -757,9 +757,6 @@ cpuid_set_cpufamily(i386_cpu_info_t *info_p) switch (info_p->cpuid_family) { case 6: switch (info_p->cpuid_model) { - case 15: - cpufamily = CPUFAMILY_INTEL_MEROM; - break; case 23: cpufamily = CPUFAMILY_INTEL_PENRYN; break; @@ -850,7 +847,6 @@ cpuid_set_info(void) * (which determines whether SMT/Hyperthreading is active). */ switch (info_p->cpuid_cpufamily) { - case CPUFAMILY_INTEL_MEROM: case CPUFAMILY_INTEL_PENRYN: info_p->core_count = info_p->cpuid_cores_per_package; info_p->thread_count = info_p->cpuid_logical_per_package; diff --git a/osfmk/i386/cpuid.h b/osfmk/i386/cpuid.h index feca66997..2c1f1803a 100644 --- a/osfmk/i386/cpuid.h +++ b/osfmk/i386/cpuid.h @@ -193,8 +193,6 @@ #define CPUID_MWAIT_EXTENSION _Bit(0) /* enumeration of WMAIT extensions */ #define CPUID_MWAIT_BREAK _Bit(1) /* interrupts are break events */ -#define CPUID_MODEL_YONAH 0x0E -#define CPUID_MODEL_MEROM 0x0F #define CPUID_MODEL_PENRYN 0x17 #define CPUID_MODEL_NEHALEM 0x1A #define CPUID_MODEL_FIELDS 0x1E /* Lynnfield, Clarksfield */ diff --git a/osfmk/i386/endian.h b/osfmk/i386/endian.h index 248519588..d9bed4649 100644 --- a/osfmk/i386/endian.h +++ b/osfmk/i386/endian.h @@ -68,12 +68,12 @@ unsigned short htons(unsigned short); static __inline__ unsigned long ntohl(unsigned long); static __inline__ unsigned long -ntohl(register unsigned long value) +ntohl(unsigned long value) { #if defined(__clang__) return (unsigned long)__builtin_bswap32((unsigned int)value); #else - register unsigned long l = value; + unsigned long l = value; __asm__ volatile("bswap %0" : "=r" (l) : "0" (l)); return 
l; #endif diff --git a/osfmk/i386/fpu.c b/osfmk/i386/fpu.c index df870f71e..e9dce1877 100644 --- a/osfmk/i386/fpu.c +++ b/osfmk/i386/fpu.c @@ -665,13 +665,15 @@ fpu_dup_fxstate( /* * Make sure we`ve got the latest fp state info */ - intr = ml_set_interrupts_enabled(FALSE); - assert(current_thread() == parent); - clear_ts(); - fp_save(parent); - clear_fpu(); - - (void)ml_set_interrupts_enabled(intr); + if (current_thread() == parent) { + intr = ml_set_interrupts_enabled(FALSE); + assert(current_thread() == parent); + clear_ts(); + fp_save(parent); + clear_fpu(); + + (void)ml_set_interrupts_enabled(intr); + } if (ifps->fp_valid) { child->machine.ifps = new_ifps; diff --git a/osfmk/i386/genassym.c b/osfmk/i386/genassym.c index 1de618473..1a0c71cef 100644 --- a/osfmk/i386/genassym.c +++ b/osfmk/i386/genassym.c @@ -376,6 +376,8 @@ main( offsetof(cpu_data_t, cpu_kernel_cr3)); DECLARE("CPU_TLB_INVALID", offsetof(cpu_data_t, cpu_tlb_invalid)); + DECLARE("CPU_PAGEZERO_MAPPED", + offsetof(cpu_data_t, cpu_pagezero_mapped)); DECLARE("CPU_TASK_MAP", offsetof(cpu_data_t, cpu_task_map)); @@ -397,6 +399,9 @@ main( DECLARE("hwIntCnt", offsetof(cpu_data_t,cpu_hwIntCnt)); DECLARE("CPU_ACTIVE_PCID", offsetof(cpu_data_t, cpu_active_pcid)); + DECLARE("CPU_KERNEL_PCID", + offsetof(cpu_data_t, cpu_kernel_pcid)); + DECLARE("CPU_PCID_COHERENTP", offsetof(cpu_data_t, cpu_pmap_pcid_coherentp)); DECLARE("CPU_PCID_COHERENTP_KERNEL", diff --git a/osfmk/i386/i386_init.c b/osfmk/i386/i386_init.c index f6546ad34..d4d22ae57 100644 --- a/osfmk/i386/i386_init.c +++ b/osfmk/i386/i386_init.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2012 Apple Inc. All rights reserved. + * Copyright (c) 2003-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -320,6 +320,7 @@ Idle_PTs_init(void) * Non-bootstrap processors are called with argument boot_args_start NULL. * These processors switch immediately to the existing kernel page tables. 
*/ +__attribute__((noreturn)) void vstart(vm_offset_t boot_args_start) { @@ -422,7 +423,7 @@ i386_init(void) tsc_init(); rtclock_early_init(); /* mach_absolute_time() now functionsl */ - kernel_debug_string_simple("i386_init"); + kernel_debug_string_early("i386_init"); pstate_trace(); #if CONFIG_MCA @@ -439,10 +440,10 @@ i386_init(void) panic_init(); /* Init this in case we need debugger */ /* setup debugging output if one has been chosen */ - kernel_debug_string_simple("PE_init_kprintf"); + kernel_debug_string_early("PE_init_kprintf"); PE_init_kprintf(FALSE); - kernel_debug_string_simple("kernel_early_bootstrap"); + kernel_debug_string_early("kernel_early_bootstrap"); kernel_early_bootstrap(); if (!PE_parse_boot_argn("diag", &dgWork.dgFlags, sizeof (dgWork.dgFlags))) @@ -459,7 +460,7 @@ i386_init(void) } /* setup console output */ - kernel_debug_string_simple("PE_init_printf"); + kernel_debug_string_early("PE_init_printf"); PE_init_printf(FALSE); kprintf("version_variant = %s\n", version_variant); @@ -501,7 +502,7 @@ i386_init(void) * VM initialization, after this we're using page tables... * Thn maximum number of cpus must be set beforehand. 
*/ - kernel_debug_string_simple("i386_vm_init"); + kernel_debug_string_early("i386_vm_init"); i386_vm_init(maxmemtouse, IA32e, kernelBootArgs); /* create the console for verbose or pretty mode */ @@ -509,13 +510,13 @@ i386_init(void) PE_init_platform(TRUE, kernelBootArgs); PE_create_console(); - kernel_debug_string_simple("power_management_init"); + kernel_debug_string_early("power_management_init"); power_management_init(); processor_bootstrap(); thread_bootstrap(); pstate_trace(); - kernel_debug_string_simple("machine_startup"); + kernel_debug_string_early("machine_startup"); machine_startup(); pstate_trace(); } diff --git a/osfmk/i386/i386_lock.s b/osfmk/i386/i386_lock.s index 61355263f..f54e040a1 100644 --- a/osfmk/i386/i386_lock.s +++ b/osfmk/i386/i386_lock.s @@ -244,16 +244,6 @@ * register initially, and then either a byte or register-sized * word is loaded/stored to the pointer */ - -/* - * void hw_lock_init(hw_lock_t) - * - * Initialize a hardware lock. - */ -LEAF_ENTRY(hw_lock_init) - movq $0, (%rdi) /* clear the lock */ - LEAF_RET - /* * void hw_lock_byte_init(volatile uint8_t *) @@ -264,28 +254,6 @@ LEAF_ENTRY(hw_lock_byte_init) movb $0, (%rdi) /* clear the lock */ LEAF_RET -/* - * void hw_lock_lock(hw_lock_t) - * - * Acquire lock, spinning until it becomes available. - * MACH_RT: also return with preemption disabled. - */ -LEAF_ENTRY(hw_lock_lock) - mov %gs:CPU_ACTIVE_THREAD, %rcx /* get thread pointer */ - - PREEMPTION_DISABLE -1: - mov (%rdi), %rax - test %rax,%rax /* lock locked? 
*/ - jne 3f /* branch if so */ - lock; cmpxchg %rcx,(%rdi) /* try to acquire the HW lock */ - jne 3f - movl $1,%eax /* In case this was a timeout call */ - LEAF_RET /* if yes, then nothing left to do */ -3: - PAUSE /* pause for hyper-threading */ - jmp 1b /* try again */ - /* * void hw_lock_byte_lock(uint8_t *lock_byte) * @@ -307,93 +275,6 @@ LEAF_ENTRY(hw_lock_byte_lock) PAUSE /* pause for hyper-threading */ jmp 1b /* try again */ -/* - * unsigned int hw_lock_to(hw_lock_t, unsigned int) - * - * Acquire lock, spinning until it becomes available or timeout. - * MACH_RT: also return with preemption disabled. - */ -LEAF_ENTRY(hw_lock_to) -1: - mov %gs:CPU_ACTIVE_THREAD, %rcx - - /* - * Attempt to grab the lock immediately - * - fastpath without timeout nonsense. - */ - PREEMPTION_DISABLE - - mov (%rdi), %rax - test %rax,%rax /* lock locked? */ - jne 2f /* branch if so */ - lock; cmpxchg %rcx,(%rdi) /* try to acquire the HW lock */ - jne 2f /* branch on failure */ - movl $1,%eax - LEAF_RET - -2: -#define INNER_LOOP_COUNT 1000 - /* - * Failed to get the lock so set the timeout - * and then spin re-checking the lock but pausing - * every so many (INNER_LOOP_COUNT) spins to check for timeout. - */ - push %r9 - lfence - rdtsc /* read cyclecount into %edx:%eax */ - shlq $32, %rdx - orq %rdx, %rax /* load 64-bit quantity into %rax */ - addq %rax, %rsi /* %rsi is the timeout expiry */ - -4: - /* - * The inner-loop spin to look for the lock being freed. - */ - mov $(INNER_LOOP_COUNT),%r9 -5: - PAUSE /* pause for hyper-threading */ - mov (%rdi),%rax /* spin checking lock value in cache */ - test %rax,%rax - je 6f /* zero => unlocked, try to grab it */ - decq %r9 /* decrement inner loop count */ - jnz 5b /* time to check for timeout? 
*/ - - /* - * Here after spinning INNER_LOOP_COUNT times, check for timeout - */ - lfence - rdtsc /* cyclecount into %edx:%eax */ - shlq $32, %rdx - orq %rdx, %rax /* load 64-bit quantity into %rax */ - cmpq %rsi, %rax /* compare to timeout */ - jb 4b /* continue spinning if less, or */ - xor %rax,%rax /* with 0 return value */ - pop %r9 - LEAF_RET - -6: - /* - * Here to try to grab the lock that now appears to be free - * after contention. - */ - mov %gs:CPU_ACTIVE_THREAD, %rcx - lock; cmpxchg %rcx,(%rdi) /* try to acquire the HW lock */ - jne 4b /* no - spin again */ - movl $1,%eax /* yes */ - pop %r9 - LEAF_RET - -/* - * void hw_lock_unlock(hw_lock_t) - * - * Unconditionally release lock. - * MACH_RT: release preemption level. - */ -LEAF_ENTRY(hw_lock_unlock) - movq $0, (%rdi) /* clear the lock */ - PREEMPTION_ENABLE - LEAF_RET - /* * void hw_lock_byte_unlock(uint8_t *lock_byte) * @@ -406,41 +287,6 @@ LEAF_ENTRY(hw_lock_byte_unlock) PREEMPTION_ENABLE LEAF_RET -/* - * unsigned int hw_lock_try(hw_lock_t) - * MACH_RT: returns with preemption disabled on success. - */ -LEAF_ENTRY(hw_lock_try) - mov %gs:CPU_ACTIVE_THREAD, %rcx - PREEMPTION_DISABLE - - mov (%rdi),%rax - test %rax,%rax - jne 1f - lock; cmpxchg %rcx,(%rdi) /* try to acquire the HW lock */ - jne 1f - - movl $1,%eax /* success */ - LEAF_RET - -1: - PREEMPTION_ENABLE /* failure: release preemption... */ - xorl %eax,%eax /* ...and return failure */ - LEAF_RET - -/* - * unsigned int hw_lock_held(hw_lock_t) - * MACH_RT: doesn't change preemption state. - * N.B. Racy, of course. - */ -LEAF_ENTRY(hw_lock_held) - mov (%rdi),%rax /* check lock value */ - test %rax,%rax - movl $1,%ecx - cmovne %ecx,%eax /* 0 => unlocked, 1 => locked */ - LEAF_RET - - /* * Reader-writer lock fastpaths. 
These currently exist for the * shared lock acquire, the exclusive lock acquire, the shared to @@ -1711,184 +1557,3 @@ LEAF_ENTRY(preemption_underflow_panic) .text -LEAF_ENTRY(_disable_preemption) -#if MACH_RT - PREEMPTION_DISABLE -#endif /* MACH_RT */ - LEAF_RET - -LEAF_ENTRY(_enable_preemption) -#if MACH_RT -#if MACH_ASSERT - cmpl $0,%gs:CPU_PREEMPTION_LEVEL - jg 1f - movl %gs:CPU_PREEMPTION_LEVEL,%esi - ALIGN_STACK() - LOAD_STRING_ARG0(_enable_preemption_less_than_zero) - CALL_PANIC() - hlt - .cstring -_enable_preemption_less_than_zero: - .asciz "_enable_preemption: preemption_level(%d) < 0!" - .text -1: -#endif /* MACH_ASSERT */ - PREEMPTION_ENABLE -#endif /* MACH_RT */ - LEAF_RET - -LEAF_ENTRY(_enable_preemption_no_check) -#if MACH_RT -#if MACH_ASSERT - cmpl $0,%gs:CPU_PREEMPTION_LEVEL - jg 1f - ALIGN_STACK() - LOAD_STRING_ARG0(_enable_preemption_no_check_less_than_zero) - CALL_PANIC() - hlt - .cstring -_enable_preemption_no_check_less_than_zero: - .asciz "_enable_preemption_no_check: preemption_level <= 0!" - .text -1: -#endif /* MACH_ASSERT */ - _ENABLE_PREEMPTION_NO_CHECK -#endif /* MACH_RT */ - LEAF_RET - - -LEAF_ENTRY(_mp_disable_preemption) -#if MACH_RT - PREEMPTION_DISABLE -#endif /* MACH_RT */ - LEAF_RET - -LEAF_ENTRY(_mp_enable_preemption) -#if MACH_RT -#if MACH_ASSERT - cmpl $0,%gs:CPU_PREEMPTION_LEVEL - jg 1f - movl %gs:CPU_PREEMPTION_LEVEL,%esi - ALIGN_PANIC() - LOAD_STRING_ARG0(_mp_enable_preemption_less_than_zero) - CALL_PANIC() - hlt - .cstring -_mp_enable_preemption_less_than_zero: - .asciz "_mp_enable_preemption: preemption_level (%d) <= 0!" 
- .text -1: -#endif /* MACH_ASSERT */ - PREEMPTION_ENABLE -#endif /* MACH_RT */ - LEAF_RET - -LEAF_ENTRY(_mp_enable_preemption_no_check) -#if MACH_RT -#if MACH_ASSERT - cmpl $0,%gs:CPU_PREEMPTION_LEVEL - jg 1f - ALIGN_STACK() - LOAD_STRING_ARG0(_mp_enable_preemption_no_check_less_than_zero) - CALL_PANIC() - hlt - .cstring -_mp_enable_preemption_no_check_less_than_zero: - .asciz "_mp_enable_preemption_no_check: preemption_level <= 0!" - .text -1: -#endif /* MACH_ASSERT */ - _ENABLE_PREEMPTION_NO_CHECK -#endif /* MACH_RT */ - LEAF_RET - -/* - * Atomic primitives, prototyped in kern/simple_lock.h - */ -LEAF_ENTRY(hw_atomic_add) -#if MACH_LDEBUG - test $3, %rdi - jz 1f - ud2 -1: -#endif - movl %esi, %eax /* Load addend */ - lock xaddl %eax, (%rdi) /* Atomic exchange and add */ - addl %esi, %eax /* Calculate result */ - LEAF_RET - -LEAF_ENTRY(hw_atomic_sub) -#if MACH_LDEBUG - test $3, %rdi - jz 1f - ud2 -1: -#endif - negl %esi - movl %esi, %eax - lock xaddl %eax, (%rdi) /* Atomic exchange and add */ - addl %esi, %eax /* Calculate result */ - LEAF_RET - -LEAF_ENTRY(hw_atomic_or) -#if MACH_LDEBUG - test $3, %rdi - jz 1f - ud2 -1: -#endif - movl (%rdi), %eax -1: - movl %esi, %edx /* Load mask */ - orl %eax, %edx - lock cmpxchgl %edx, (%rdi) /* Atomic CAS */ - jne 1b - movl %edx, %eax /* Result */ - LEAF_RET -/* - * A variant of hw_atomic_or which doesn't return a value. - * The implementation is thus comparatively more efficient. - */ - -LEAF_ENTRY(hw_atomic_or_noret) -#if MACH_LDEBUG - test $3, %rdi - jz 1f - ud2 -1: -#endif - lock - orl %esi, (%rdi) /* Atomic OR */ - LEAF_RET - - -LEAF_ENTRY(hw_atomic_and) -#if MACH_LDEBUG - test $3, %rdi - jz 1f - ud2 -1: -#endif - movl (%rdi), %eax -1: - movl %esi, %edx /* Load mask */ - andl %eax, %edx - lock cmpxchgl %edx, (%rdi) /* Atomic CAS */ - jne 1b - movl %edx, %eax /* Result */ - LEAF_RET -/* - * A variant of hw_atomic_and which doesn't return a value. - * The implementation is thus comparatively more efficient. 
- */ - -LEAF_ENTRY(hw_atomic_and_noret) -#if MACH_LDEBUG - test $3, %rdi - jz 1f - ud2 -1: -#endif - lock andl %esi, (%rdi) /* Atomic OR */ - LEAF_RET - diff --git a/osfmk/i386/i386_vm_init.c b/osfmk/i386/i386_vm_init.c index 81b9d6f51..3ae00d697 100644 --- a/osfmk/i386/i386_vm_init.c +++ b/osfmk/i386/i386_vm_init.c @@ -100,6 +100,8 @@ vm_offset_t vm_kernel_top; vm_offset_t vm_kernel_stext; vm_offset_t vm_kernel_etext; vm_offset_t vm_kernel_slide; +vm_offset_t vm_kernel_slid_base; +vm_offset_t vm_kernel_slid_top; vm_offset_t vm_hib_base; vm_offset_t vm_kext_base = VM_MIN_KERNEL_AND_KEXT_ADDRESS; vm_offset_t vm_kext_top = VM_MIN_KERNEL_ADDRESS; @@ -131,7 +133,7 @@ vm_offset_t virtual_avail, virtual_end; static pmap_paddr_t avail_remaining; vm_offset_t static_memory_end = 0; -vm_offset_t sHIB, eHIB, stext, etext, sdata, edata, sconstdata, econstdata, end; +vm_offset_t sHIB, eHIB, stext, etext, sdata, edata, end, sconst, econst; /* * _mh_execute_header is the mach_header for the currently executing kernel @@ -139,16 +141,14 @@ vm_offset_t sHIB, eHIB, stext, etext, sdata, edata, sconstdata, econstdata, end; vm_offset_t segTEXTB; unsigned long segSizeTEXT; vm_offset_t segDATAB; unsigned long segSizeDATA; vm_offset_t segLINKB; unsigned long segSizeLINK; -vm_offset_t segPRELINKB; unsigned long segSizePRELINK; +vm_offset_t segPRELINKTEXTB; unsigned long segSizePRELINKTEXT; vm_offset_t segPRELINKINFOB; unsigned long segSizePRELINKINFO; vm_offset_t segHIBB; unsigned long segSizeHIB; -vm_offset_t sectCONSTB; unsigned long sectSizeConst; - -boolean_t doconstro_override = FALSE; +unsigned long segSizeConst; static kernel_segment_command_t *segTEXT, *segDATA; static kernel_section_t *cursectTEXT, *lastsectTEXT; -static kernel_section_t *sectDCONST; +static kernel_segment_command_t *segCONST; extern uint64_t firmware_Conventional_bytes; extern uint64_t firmware_RuntimeServices_bytes; @@ -251,16 +251,16 @@ i386_vm_init(uint64_t maxmem, "__LINKEDIT", &segSizeLINK); segHIBB = 
(vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__HIB", &segSizeHIB); - segPRELINKB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, - "__PRELINK_TEXT", &segSizePRELINK); - segPRELINKINFOB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, - "__PRELINK_INFO", &segSizePRELINKINFO); + segPRELINKTEXTB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, + "__PRELINK_TEXT", &segSizePRELINKTEXT); + segPRELINKINFOB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, + "__PRELINK_INFO", &segSizePRELINKINFO); segTEXT = getsegbynamefromheader(&_mh_execute_header, "__TEXT"); segDATA = getsegbynamefromheader(&_mh_execute_header, "__DATA"); - sectDCONST = getsectbynamefromheader(&_mh_execute_header, - "__DATA", "__const"); + segCONST = getsegbynamefromheader(&_mh_execute_header, + "__CONST"); cursectTEXT = lastsectTEXT = firstsect(segTEXT); /* Discover the last TEXT section within the TEXT segment */ while ((cursectTEXT = nextsect(segTEXT, cursectTEXT)) != NULL) { @@ -278,23 +278,17 @@ i386_vm_init(uint64_t maxmem, sdata = segDATAB; edata = segDATAB + segSizeDATA; - sectCONSTB = (vm_offset_t) sectDCONST->addr; - sectSizeConst = sectDCONST->size; - sconstdata = sectCONSTB; - econstdata = sectCONSTB + sectSizeConst; - - if (sectSizeConst & PAGE_MASK) { - kernel_section_t *ns = nextsect(segDATA, sectDCONST); - if (ns && !(ns->addr & PAGE_MASK)) - doconstro_override = TRUE; - } else - doconstro_override = TRUE; + sconst = segCONST->vmaddr; + segSizeConst = segCONST->vmsize; + econst = sconst + segSizeConst; + assert(((sconst|econst) & PAGE_MASK) == 0); + DBG("segTEXTB = %p\n", (void *) segTEXTB); DBG("segDATAB = %p\n", (void *) segDATAB); DBG("segLINKB = %p\n", (void *) segLINKB); DBG("segHIBB = %p\n", (void *) segHIBB); - DBG("segPRELINKB = %p\n", (void *) segPRELINKB); + DBG("segPRELINKTEXTB = %p\n", (void *) segPRELINKTEXTB); DBG("segPRELINKINFOB = %p\n", (void *) segPRELINKINFOB); DBG("sHIB = %p\n", (void *) sHIB); DBG("eHIB = %p\n", (void 
*) eHIB); @@ -302,21 +296,22 @@ i386_vm_init(uint64_t maxmem, DBG("etext = %p\n", (void *) etext); DBG("sdata = %p\n", (void *) sdata); DBG("edata = %p\n", (void *) edata); - DBG("sconstdata = %p\n", (void *) sconstdata); - DBG("econstdata = %p\n", (void *) econstdata); + DBG("sconst = %p\n", (void *) sconst); + DBG("econst = %p\n", (void *) econst); DBG("kernel_top = %p\n", (void *) &last_kernel_symbol); vm_kernel_base = sHIB; vm_kernel_top = (vm_offset_t) &last_kernel_symbol; vm_kernel_stext = stext; vm_kernel_etext = etext; - - vm_prelink_stext = segPRELINKB; - vm_prelink_etext = segPRELINKB + segSizePRELINK; - vm_prelink_sinfo = segPRELINKINFOB; - vm_prelink_einfo = segPRELINKINFOB + segSizePRELINKINFO; - vm_slinkedit = segLINKB; - vm_elinkedit = segLINKB + segSizePRELINK; + vm_prelink_stext = segPRELINKTEXTB; + vm_prelink_etext = segPRELINKTEXTB + segSizePRELINKTEXT; + vm_prelink_sinfo = segPRELINKINFOB; + vm_prelink_einfo = segPRELINKINFOB + segSizePRELINKINFO; + vm_slinkedit = segLINKB; + vm_elinkedit = segLINKB + segSizePRELINKTEXT; + vm_kernel_slid_base = vm_kext_base; + vm_kernel_slid_top = vm_elinkedit; vm_set_page_size(); diff --git a/osfmk/i386/lapic_native.c b/osfmk/i386/lapic_native.c index 1a1bc5845..a0777959e 100644 --- a/osfmk/i386/lapic_native.c +++ b/osfmk/i386/lapic_native.c @@ -958,7 +958,7 @@ lapic_get_cmci_vector(void) return cmci_vector; } -#if DEBUG +#if DEVELOPMENT || DEBUG extern void lapic_trigger_MC(void); void lapic_trigger_MC(void) diff --git a/osfmk/i386/locks.h b/osfmk/i386/locks.h index 2934da1ce..47a808c3f 100644 --- a/osfmk/i386/locks.h +++ b/osfmk/i386/locks.h @@ -71,47 +71,38 @@ typedef struct __lck_spin_t__ lck_spin_t; typedef struct _lck_mtx_ { union { struct { - volatile uintptr_t lck_mtxd_owner; + volatile uintptr_t lck_mtx_owner; union { struct { volatile uint32_t - lck_mtxd_waiters:16, - lck_mtxd_pri:8, - lck_mtxd_ilocked:1, - lck_mtxd_mlocked:1, - lck_mtxd_promoted:1, - lck_mtxd_spin:1, - lck_mtxd_is_ext:1, - 
lck_mtxd_pad3:3; + lck_mtx_waiters:16, + lck_mtx_pri:8, + lck_mtx_ilocked:1, + lck_mtx_mlocked:1, + lck_mtx_promoted:1, + lck_mtx_spin:1, + lck_mtx_is_ext:1, + lck_mtx_pad3:3; }; - uint32_t lck_mtxd_state; + uint32_t lck_mtx_state; }; /* Pad field used as a canary, initialized to ~0 */ - uint32_t lck_mtxd_pad32; - } lck_mtxd; + uint32_t lck_mtx_pad32; + }; struct { - struct _lck_mtx_ext_ *lck_mtxi_ptr; - uint32_t lck_mtxi_tag; - uint32_t lck_mtxi_pad32; - } lck_mtxi; - } lck_mtx_sw; + struct _lck_mtx_ext_ *lck_mtx_ptr; + uint32_t lck_mtx_tag; + uint32_t lck_mtx_pad32_2; + }; + }; } lck_mtx_t; -#define lck_mtx_owner lck_mtx_sw.lck_mtxd.lck_mtxd_owner -#define lck_mtx_waiters lck_mtx_sw.lck_mtxd.lck_mtxd_waiters -#define lck_mtx_pri lck_mtx_sw.lck_mtxd.lck_mtxd_pri -#define lck_mtx_promoted lck_mtx_sw.lck_mtxd.lck_mtxd_promoted -#define lck_mtx_is_ext lck_mtx_sw.lck_mtxd.lck_mtxd_is_ext - -#define lck_mtx_tag lck_mtx_sw.lck_mtxi.lck_mtxi_tag -#define lck_mtx_ptr lck_mtx_sw.lck_mtxi.lck_mtxi_ptr -#define lck_mtx_state lck_mtx_sw.lck_mtxd.lck_mtxd_state /* This pattern must subsume the interlocked, mlocked and spin bits */ #define LCK_MTX_TAG_INDIRECT 0x07ff1007 /* lock marked as Indirect */ #define LCK_MTX_TAG_DESTROYED 0x07fe2007 /* lock marked as Destroyed */ /* Adaptive spin before blocking */ -extern unsigned int MutexSpin; +extern uint64_t MutexSpin; extern int lck_mtx_lock_spinwait_x86(lck_mtx_t *mutex); extern void lck_mtx_lock_wait_x86(lck_mtx_t *mutex); extern void lck_mtx_lock_acquire_x86(lck_mtx_t *mutex); @@ -215,6 +206,21 @@ typedef struct _lck_rw_t_internal_ { #define LCK_RW_TAG_DESTROYED 0x00002007 /* lock marked as Destroyed */ +#if LOCK_PRIVATE + +#define disable_preemption_for_thread(t) ((cpu_data_t GS_RELATIVE *)0UL)->cpu_preemption_level++ + +#define LCK_MTX_THREAD_TO_STATE(t) ((uintptr_t)t) +#define PLATFORM_LCK_ILOCK 0 + +#define LOCK_SNOOP_SPINS 1000 +#define LOCK_PRETEST 1 + +/* Spinlock panic deadline, in mach_absolute_time units (ns on i386) 
*/ +#define LOCK_PANIC_TIMEOUT 0xf00000 /* 250 ms (huge) */ + +#endif // LOCK_PRIVATE + #else #ifdef KERNEL_PRIVATE #pragma pack(1) diff --git a/osfmk/i386/locks_i386.c b/osfmk/i386/locks_i386.c index 130ba126a..addf910e4 100644 --- a/osfmk/i386/locks_i386.c +++ b/osfmk/i386/locks_i386.c @@ -287,7 +287,47 @@ boolean_t lck_spin_try_lock( lck_spin_t *lck) { - return((boolean_t)usimple_lock_try((usimple_lock_t) lck)); + boolean_t lrval = (boolean_t)usimple_lock_try((usimple_lock_t) lck); +#if DEVELOPMENT || DEBUG + if (lrval) { + pltrace(FALSE); + } +#endif + return(lrval); +} + +/* + * Routine: lck_spin_assert + */ +void +lck_spin_assert(lck_spin_t *lock, unsigned int type) +{ + thread_t thread, holder; + uintptr_t state; + + if (__improbable(type != LCK_ASSERT_OWNED && type != LCK_ASSERT_NOTOWNED)) { + panic("lck_spin_assert(): invalid arg (%u)", type); + } + + state = lock->interlock; + holder = (thread_t)state; + thread = current_thread(); + if (type == LCK_ASSERT_OWNED) { + if (__improbable(holder == THREAD_NULL)) { + panic("Lock not owned %p = %lx", lock, state); + } + if (__improbable(holder != thread)) { + panic("Lock not owned by current thread %p = %lx", lock, state); + } + } else if (type == LCK_ASSERT_NOTOWNED) { + if (__improbable(holder != THREAD_NULL)) { + if (holder == thread) { + panic("Lock owned by current thread %p = %lx", lock, state); + } else { + panic("Lock %p owned by thread %p", lock, holder); + } + } + } } /* @@ -378,6 +418,10 @@ usimple_lock( panic("Spinlock acquisition timed out: lock=%p, lock owner thread=0x%lx, current_thread: %p, lock owner active on CPU 0x%x, current owner: 0x%lx", l, lowner, current_thread(), lock_cpu, (uintptr_t)l->interlock.lock_data); } } +#if DEVELOPMENT || DEBUG + pltrace(FALSE); +#endif + USLDBG(usld_lock_post(l, pc)); #else simple_lock((simple_lock_t)l); @@ -401,6 +445,9 @@ usimple_unlock( OBTAIN_PC(pc); USLDBG(usld_unlock(l, pc)); +#if DEVELOPMENT || DEBUG + pltrace(TRUE); +#endif 
hw_lock_unlock(&l->interlock); #else simple_unlock_rwmb((simple_lock_t)l); @@ -431,7 +478,10 @@ usimple_lock_try( OBTAIN_PC(pc); USLDBG(usld_lock_try_pre(l, pc)); if ((success = hw_lock_try(&l->interlock))) { - USLDBG(usld_lock_try_post(l, pc)); +#if DEVELOPMENT || DEBUG + pltrace(FALSE); +#endif + USLDBG(usld_lock_try_post(l, pc)); } return success; #else @@ -439,6 +489,22 @@ usimple_lock_try( #endif } +/* + * Acquire a usimple_lock while polling for pending TLB flushes + * and spinning on a lock. + * + */ +void +usimple_lock_try_lock_loop(usimple_lock_t l) +{ + boolean_t istate = ml_get_interrupts_enabled(); + while (!simple_lock_try((l))) { + if (!istate) + handle_pending_TLB_flushes(); + cpu_pause(); + } +} + #if USLOCK_DEBUG /* * States of a usimple_lock. The default when initializing @@ -548,7 +614,7 @@ usld_lock_post( usimple_lock_t l, pc_t pc) { - register int mycpu; + int mycpu; char caller[] = "successful usimple_lock"; @@ -585,7 +651,7 @@ usld_unlock( usimple_lock_t l, pc_t pc) { - register int mycpu; + int mycpu; char caller[] = "usimple_unlock"; @@ -650,7 +716,7 @@ usld_lock_try_post( usimple_lock_t l, pc_t pc) { - register int mycpu; + int mycpu; char caller[] = "successful usimple_lock_try"; if (!usld_lock_common_checks(l, caller)) @@ -1151,7 +1217,7 @@ lck_rw_unlock_shared( ret = lck_rw_done(lck); if (ret != LCK_RW_TYPE_SHARED) - panic("lck_rw_unlock(): lock held in mode: %d\n", ret); + panic("lck_rw_unlock_shared(): lock %p held in mode: %d\n", lck, ret); } @@ -1649,7 +1715,7 @@ lck_mtx_ext_init( lck->lck_mtx_attr |= LCK_MTX_ATTR_STAT; lck->lck_mtx.lck_mtx_is_ext = 1; - lck->lck_mtx.lck_mtx_sw.lck_mtxd.lck_mtxd_pad32 = 0xFFFFFFFF; + lck->lck_mtx.lck_mtx_pad32 = 0xFFFFFFFF; } /* @@ -1679,7 +1745,7 @@ lck_mtx_init( lck->lck_mtx_owner = 0; lck->lck_mtx_state = 0; } - lck->lck_mtx_sw.lck_mtxd.lck_mtxd_pad32 = 0xFFFFFFFF; + lck->lck_mtx_pad32 = 0xFFFFFFFF; lck_grp_reference(grp); lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX); } @@ -1709,7 +1775,7 @@ 
lck_mtx_init_ext( lck->lck_mtx_owner = 0; lck->lck_mtx_state = 0; } - lck->lck_mtx_sw.lck_mtxd.lck_mtxd_pad32 = 0xFFFFFFFF; + lck->lck_mtx_pad32 = 0xFFFFFFFF; lck_grp_reference(grp); lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX); @@ -2121,7 +2187,7 @@ kdp_lck_mtx_lock_spin_is_acquired(lck_mtx_t *lck) panic("panic: kdp_lck_mtx_lock_spin_is_acquired called outside of kernel debugger"); } - if (lck->lck_mtx_sw.lck_mtxd.lck_mtxd_ilocked || lck->lck_mtx_sw.lck_mtxd.lck_mtxd_mlocked) { + if (lck->lck_mtx_ilocked || lck->lck_mtx_mlocked) { return TRUE; } diff --git a/osfmk/i386/machine_check.c b/osfmk/i386/machine_check.c index 3e1a8cc7f..862d280fc 100644 --- a/osfmk/i386/machine_check.c +++ b/osfmk/i386/machine_check.c @@ -362,13 +362,11 @@ mca_dump(void) } +#if DEVELOPMENT || DEBUG extern void mca_exception_panic(void); extern void lapic_trigger_MC(void); void mca_exception_panic(void) { -#if DEBUG lapic_trigger_MC(); -#else - kprintf("mca_exception_panic() requires DEBUG build\n"); -#endif } +#endif diff --git a/osfmk/i386/machine_cpu.h b/osfmk/i386/machine_cpu.h index 82532f088..c193948e1 100644 --- a/osfmk/i386/machine_cpu.h +++ b/osfmk/i386/machine_cpu.h @@ -54,7 +54,7 @@ static inline void cpu_halt(void) static inline void cpu_pause(void) { - asm volatile( "rep; nop" ); + __builtin_ia32_pause(); } #endif /* _I386_MACHINE_CPU_H_ */ diff --git a/osfmk/i386/machine_routines.c b/osfmk/i386/machine_routines.c index a90d68178..334b6ff0d 100644 --- a/osfmk/i386/machine_routines.c +++ b/osfmk/i386/machine_routines.c @@ -33,10 +33,12 @@ #include #include #include -#include + #include #include #include +#include + #include #include #include @@ -56,7 +58,7 @@ #include #endif #include - +#include #if DEBUG #define DBG(x...) 
kprintf("DBG: " x) #else @@ -67,10 +69,11 @@ extern void wakeup(void *); static int max_cpus_initialized = 0; -unsigned int LockTimeOut; -unsigned int TLBTimeOut; -unsigned int LockTimeOutTSC; -unsigned int MutexSpin; +uint64_t LockTimeOut; +uint64_t TLBTimeOut; +uint64_t LockTimeOutTSC; +uint32_t LockTimeOutUsec; +uint64_t MutexSpin; uint64_t LastDebuggerEntryAllowance; uint64_t delay_spin_threshold; @@ -448,10 +451,7 @@ register_cpu( chudxnu_cpu_free(this_cpu_datap->cpu_chud); console_cpu_free(this_cpu_datap->cpu_console_buf); #if KPC - kpc_counterbuf_free(this_cpu_datap->cpu_kpc_buf[0]); - kpc_counterbuf_free(this_cpu_datap->cpu_kpc_buf[1]); - kpc_counterbuf_free(this_cpu_datap->cpu_kpc_shadow); - kpc_counterbuf_free(this_cpu_datap->cpu_kpc_reload); + kpc_unregister_cpu(this_cpu_datap); #endif return KERN_FAILURE; @@ -612,7 +612,6 @@ ml_get_max_cpus(void) (void) ml_set_interrupts_enabled(current_state); return(machine_info.max_cpus); } - /* * Routine: ml_init_lock_timeout * Function: @@ -633,10 +632,14 @@ ml_init_lock_timeout(void) if (PE_parse_boot_argn("slto_us", &slto, sizeof (slto))) default_timeout_ns = slto * NSEC_PER_USEC; - /* LockTimeOut is absolutetime, LockTimeOutTSC is in TSC ticks */ + /* + * LockTimeOut is absolutetime, LockTimeOutTSC is in TSC ticks, + * and LockTimeOutUsec is in microseconds and it's 32-bits. + */ + LockTimeOutUsec = (uint32_t) (default_timeout_ns / NSEC_PER_USEC); nanoseconds_to_absolutetime(default_timeout_ns, &abstime); - LockTimeOut = (uint32_t) abstime; - LockTimeOutTSC = (uint32_t) tmrCvt(abstime, tscFCvtn2t); + LockTimeOut = abstime; + LockTimeOutTSC = tmrCvt(abstime, tscFCvtn2t); /* * TLBTimeOut dictates the TLB flush timeout period. 
It defaults to @@ -670,7 +673,37 @@ ml_init_lock_timeout(void) nanoseconds_to_absolutetime(4ULL * NSEC_PER_SEC, &LastDebuggerEntryAllowance); if (PE_parse_boot_argn("panic_restart_timeout", &prt, sizeof (prt))) nanoseconds_to_absolutetime(prt * NSEC_PER_SEC, &panic_restart_timeout); + virtualized = ((cpuid_features() & CPUID_FEATURE_VMM) != 0); + if (virtualized) { + int vti; + + if (!PE_parse_boot_argn("vti", &vti, sizeof (vti))) + vti = 6; + printf("Timeouts adjusted for virtualization (<<%d)\n", vti); + kprintf("Timeouts adjusted for virtualization (<<%d):\n", vti); +#define VIRTUAL_TIMEOUT_INFLATE64(_timeout) \ +MACRO_BEGIN \ + kprintf("%24s: 0x%016llx ", #_timeout, _timeout); \ + _timeout <<= vti; \ + kprintf("-> 0x%016llx\n", _timeout); \ +MACRO_END +#define VIRTUAL_TIMEOUT_INFLATE32(_timeout) \ +MACRO_BEGIN \ + kprintf("%24s: 0x%08x ", #_timeout, _timeout); \ + if ((_timeout <> vti == _timeout) \ + _timeout <<= vti; \ + else \ + _timeout = ~0; /* cap rather than overflow */ \ + kprintf("-> 0x%08x\n", _timeout); \ +MACRO_END + VIRTUAL_TIMEOUT_INFLATE32(LockTimeOutUsec); + VIRTUAL_TIMEOUT_INFLATE64(LockTimeOut); + VIRTUAL_TIMEOUT_INFLATE64(LockTimeOutTSC); + VIRTUAL_TIMEOUT_INFLATE64(TLBTimeOut); + VIRTUAL_TIMEOUT_INFLATE64(MutexSpin); + } + interrupt_latency_tracker_setup(); simple_lock_init(&ml_timer_evaluation_slock, 0); } @@ -812,7 +845,7 @@ kernel_preempt_check(void) } boolean_t machine_timeout_suspended(void) { - return (virtualized || pmap_tlb_flush_timeout || spinlock_timed_out || panic_active() || mp_recent_debugger_activity() || ml_recent_wake()); + return (pmap_tlb_flush_timeout || spinlock_timed_out || panic_active() || mp_recent_debugger_activity() || ml_recent_wake()); } /* Eagerly evaluate all pending timer and thread callouts @@ -864,6 +897,11 @@ ml_entropy_collect(void) *ep = ror32(*ep, 9) ^ tsc_lo; } +uint64_t +ml_energy_stat(__unused thread_t t) { + return 0; +} + void ml_gpu_stat_update(uint64_t gpu_ns_delta) { 
current_thread()->machine.thread_gpu_ns += gpu_ns_delta; @@ -873,3 +911,17 @@ uint64_t ml_gpu_stat(thread_t t) { return t->machine.thread_gpu_ns; } + +int plctrace_enabled = 0; + +void _disable_preemption(void) { + disable_preemption_internal(); +} + +void _enable_preemption(void) { + enable_preemption_internal(); +} + +void plctrace_disable(void) { + plctrace_enabled = 0; +} diff --git a/osfmk/i386/machine_routines.h b/osfmk/i386/machine_routines.h index 6f8dc6809..a1e9c39f8 100644 --- a/osfmk/i386/machine_routines.h +++ b/osfmk/i386/machine_routines.h @@ -145,6 +145,7 @@ void ml_get_bouncepool_info( vm_size_t *size); /* Indicates if spinlock, IPI and other timeouts should be suspended */ boolean_t machine_timeout_suspended(void); +void plctrace_disable(void); #endif /* PEXPERT_KERNEL_PRIVATE || MACH_KERNEL_PRIVATE */ /* Warm up a CPU to receive an interrupt */ @@ -328,6 +329,7 @@ void timer_queue_expire_rescan(void*); void ml_timer_evaluate(void); boolean_t ml_timer_forced_evaluation(void); +uint64_t ml_energy_stat(thread_t); void ml_gpu_stat_update(uint64_t); uint64_t ml_gpu_stat(thread_t); boolean_t ml_recent_wake(void); diff --git a/osfmk/i386/machine_task.c b/osfmk/i386/machine_task.c index 342f123f5..956ad991b 100644 --- a/osfmk/i386/machine_task.c +++ b/osfmk/i386/machine_task.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -88,7 +88,6 @@ machine_task_set_state( copy_debug_state32(tstate, (x86_debug_state32_t*) task->task_debug, FALSE); return KERN_SUCCESS; - break; } case x86_DEBUG_STATE64: { @@ -107,7 +106,6 @@ machine_task_set_state( copy_debug_state64(tstate, (x86_debug_state64_t*) task->task_debug, FALSE); return KERN_SUCCESS; - break; } case x86_DEBUG_STATE: { @@ -143,13 +141,10 @@ machine_task_set_state( } else { return KERN_INVALID_ARGUMENT; } - - break; } default: { return KERN_INVALID_ARGUMENT; - break; } } } @@ -176,7 +171,6 @@ machine_task_get_state(task_t task, } return KERN_SUCCESS; - break; } case x86_DEBUG_STATE64: { @@ -193,7 +187,6 @@ machine_task_get_state(task_t task, } return KERN_SUCCESS; - break; } case x86_DEBUG_STATE: { @@ -223,12 +216,10 @@ machine_task_get_state(task_t task, } return KERN_SUCCESS; - break; } default: { return KERN_INVALID_ARGUMENT; - break; } } } diff --git a/osfmk/i386/mp.c b/osfmk/i386/mp.c index 219948221..c0a70f349 100644 --- a/osfmk/i386/mp.c +++ b/osfmk/i386/mp.c @@ -31,6 +31,7 @@ #include #include +#include #include #include @@ -110,9 +111,8 @@ void i386_cpu_IPI(int cpu); #if MACH_KDP static void mp_kdp_wait(boolean_t flush, boolean_t isNMI); #endif /* MACH_KDP */ -static void mp_rendezvous_action(void); -static void mp_broadcast_action(void); +static boolean_t mp_safe_spin_lock(usimple_lock_t lock); #if MACH_KDP static boolean_t cpu_signal_pending(int cpu, mp_event_t event); #endif /* MACH_KDP */ @@ -124,6 +124,9 @@ volatile boolean_t force_immediate_debugger_NMI = FALSE; volatile boolean_t pmap_tlb_flush_timeout = FALSE; decl_simple_lock_data(,mp_kdp_lock); +decl_simple_lock_data(,debugger_callback_lock); +struct debugger_callback *debugger_callback = NULL; + decl_lck_mtx_data(static, mp_cpu_boot_lock); lck_mtx_ext_t mp_cpu_boot_lock_ext; @@ -226,6 +229,7 @@ smp_init(void) { simple_lock_init(&mp_kdp_lock, 0); simple_lock_init(&mp_rv_lock, 0); + 
simple_lock_init(&debugger_callback_lock, 0); lck_grp_attr_setdefault(&smp_lck_grp_attr); lck_grp_init(&smp_lck_grp, "i386_smp", &smp_lck_grp_attr); lck_mtx_init_ext(&mp_cpu_boot_lock, &mp_cpu_boot_lock_ext, &smp_lck_grp, LCK_ATTR_NULL); @@ -535,14 +539,6 @@ cpu_signal_handler(x86_saved_state_t *regs) DBGLOG(cpu_handle,my_cpu,MP_TLB_FLUSH); i_bit_clear(MP_TLB_FLUSH, my_word); pmap_update_interrupt(); - } else if (i_bit(MP_RENDEZVOUS, my_word)) { - DBGLOG(cpu_handle,my_cpu,MP_RENDEZVOUS); - i_bit_clear(MP_RENDEZVOUS, my_word); - mp_rendezvous_action(); - } else if (i_bit(MP_BROADCAST, my_word)) { - DBGLOG(cpu_handle,my_cpu,MP_BROADCAST); - i_bit_clear(MP_BROADCAST, my_word); - mp_broadcast_action(); } else if (i_bit(MP_CHUD, my_word)) { DBGLOG(cpu_handle,my_cpu,MP_CHUD); i_bit_clear(MP_CHUD, my_word); @@ -614,17 +610,24 @@ NMIInterruptHandler(x86_saved_state_t *regs) mp_cpus_call_wait_timeout || panic_active()) { mp_kdp_wait(FALSE, TRUE); - } else if (virtualized && (debug_boot_arg & DB_NMI)) { + } else if (!mp_kdp_trap && + !mp_kdp_is_NMI && + virtualized && (debug_boot_arg & DB_NMI)) { /* * Under a VMM with the debug boot-arg set, drop into kdp. * Since an NMI is involved, there's a risk of contending with * a panic. And side-effects of NMIs may result in entry into, * and continuing from, the debugger being unreliable. 
*/ - kprintf_break_lock(); - kprintf("Debugger entry requested by NMI\n"); - kdp_i386_trap(T_DEBUG, saved_state64(regs), 0, 0); - printf("Debugger entry requested by NMI\n"); + if (__sync_bool_compare_and_swap(&mp_kdp_is_NMI, FALSE, TRUE)) { + kprintf_break_lock(); + kprintf("Debugger entry requested by NMI\n"); + kdp_i386_trap(T_DEBUG, saved_state64(regs), 0, 0); + printf("Debugger entry requested by NMI\n"); + mp_kdp_is_NMI = FALSE; + } else { + mp_kdp_wait(FALSE, FALSE); + } } else { mp_kdp_wait(FALSE, FALSE); } @@ -760,44 +763,6 @@ i386_signal_cpu(int cpu, mp_event_t event, mp_sync_t mode) KERNEL_DEBUG(TRACE_MP_TLB_FLUSH | DBG_FUNC_END, cpu, 0, 0, 0, 0); } -/* - * Send event to all running cpus. - * Called with the topology locked. - */ -void -i386_signal_cpus(mp_event_t event, mp_sync_t mode) -{ - unsigned int cpu; - unsigned int my_cpu = cpu_number(); - - assert(hw_lock_held((hw_lock_t)&x86_topo_lock)); - - for (cpu = 0; cpu < real_ncpus; cpu++) { - if (cpu == my_cpu || !cpu_datap(cpu)->cpu_running) - continue; - i386_signal_cpu(cpu, event, mode); - } -} - -/* - * Return the number of running cpus. - * Called with the topology locked. - */ -int -i386_active_cpus(void) -{ - unsigned int cpu; - unsigned int ncpus = 0; - - assert(hw_lock_held((hw_lock_t)&x86_topo_lock)); - - for (cpu = 0; cpu < real_ncpus; cpu++) { - if (cpu_datap(cpu)->cpu_running) - ncpus++; - } - return(ncpus); -} - /* * Helper function called when busy-waiting: panic if too long * a TSC-based time has elapsed since the start of the spin. @@ -816,14 +781,15 @@ mp_spin_timeout(uint64_t tsc_start) * unless we have serial console printing (kprintf) enabled * in which case we allow an even greater margin. */ - tsc_timeout = disable_serial_output ? (uint64_t) LockTimeOutTSC << 2 - : (uint64_t) LockTimeOutTSC << 4; + tsc_timeout = disable_serial_output ? 
LockTimeOutTSC << 2 + : LockTimeOutTSC << 4; return (rdtsc64() > tsc_start + tsc_timeout); } /* * Helper function to take a spinlock while ensuring that incoming IPIs * are still serviced if interrupts are masked while we spin. + * Returns current interrupt state. */ static boolean_t mp_safe_spin_lock(usimple_lock_t lock) @@ -867,7 +833,7 @@ mp_safe_spin_lock(usimple_lock_t lock) */ static void -mp_rendezvous_action(void) +mp_rendezvous_action(__unused void *null) { boolean_t intrs_enabled; uint64_t tsc_spin_start; @@ -948,13 +914,10 @@ mp_rendezvous(void (*setup_func)(void *), * signal other processors, which will call mp_rendezvous_action() * with interrupts disabled */ - (void) mp_safe_spin_lock(&x86_topo_lock); - mp_rv_ncpus = i386_active_cpus(); - i386_signal_cpus(MP_RENDEZVOUS, ASYNC); - simple_unlock(&x86_topo_lock); + mp_rv_ncpus = mp_cpus_call(CPUMASK_OTHERS, NOSYNC, &mp_rendezvous_action, NULL) + 1; /* call executor function on this cpu */ - mp_rendezvous_action(); + mp_rendezvous_action(NULL); /* * Spin for everyone to complete. @@ -1212,7 +1175,6 @@ mp_cpus_call( (void (*)(void *,void *))action_func, arg, NULL, - NULL, NULL); } @@ -1224,6 +1186,7 @@ mp_cpus_call_wait(boolean_t intrs_enabled, mp_call_queue_t *cqp; uint64_t tsc_spin_start; + assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0); cqp = &mp_cpus_call_head[cpu_number()]; tsc_spin_start = rdtsc64(); @@ -1253,14 +1216,12 @@ mp_cpus_call1( void (*action_func)(void *, void *), void *arg0, void *arg1, - cpumask_t *cpus_calledp, - cpumask_t *cpus_notcalledp) + cpumask_t *cpus_calledp) { - cpu_t cpu; + cpu_t cpu = 0; boolean_t intrs_enabled = FALSE; boolean_t call_self = FALSE; cpumask_t cpus_called = 0; - cpumask_t cpus_notcalled = 0; cpumask_t cpus_responded = 0; long cpus_call_count = 0; uint64_t tsc_spin_start; @@ -1318,7 +1279,6 @@ mp_cpus_call1( } else { /* * Here to queue a call to cpu and IPI. - * Spinning for request buffer unless NOSYNC. 
*/ mp_call_t *callp = NULL; mp_call_queue_t *cqp = &mp_cpus_call_head[cpu]; @@ -1328,34 +1288,23 @@ mp_cpus_call1( if (callp == NULL) callp = mp_call_alloc(); intrs_inner = mp_call_head_lock(cqp); - if (mode == NOSYNC) { - if (callp == NULL) { - cpus_notcalled |= cpu_to_cpumask(cpu); - mp_call_head_unlock(cqp, intrs_inner); - KERNEL_DEBUG_CONSTANT( - TRACE_MP_CPUS_CALL_NOBUF, - cpu, 0, 0, 0, 0); - continue; - } - callp->maskp = NULL; - } else { - if (callp == NULL) { - mp_call_head_unlock(cqp, intrs_inner); - KERNEL_DEBUG_CONSTANT( - TRACE_MP_CPUS_CALL_NOBUF, - cpu, 0, 0, 0, 0); - if (!intrs_inner) { - /* Sniffing w/o locking */ - if (!queue_empty(&cqp->queue)) - mp_cpus_call_action(); - handle_pending_TLB_flushes(); - } - if (mp_spin_timeout(tsc_spin_start)) - panic("mp_cpus_call1() timeout"); - goto queue_call; + if (callp == NULL) { + mp_call_head_unlock(cqp, intrs_inner); + KERNEL_DEBUG_CONSTANT( + TRACE_MP_CPUS_CALL_NOBUF, + cpu, 0, 0, 0, 0); + if (!intrs_inner) { + /* Sniffing w/o locking */ + if (!queue_empty(&cqp->queue)) + mp_cpus_call_action(); + handle_pending_TLB_flushes(); } - callp->maskp = &cpus_responded; + if (mp_spin_timeout(tsc_spin_start)) + panic("mp_cpus_call1() timeout start: 0x%llx, cur: 0x%llx", + tsc_spin_start, rdtsc64()); + goto queue_call; } + callp->maskp = (mode == NOSYNC) ? 
NULL : &cpus_responded; callp->func = action_func; callp->arg0 = arg0; callp->arg1 = arg1; @@ -1386,13 +1335,13 @@ mp_cpus_call1( } } - /* Safe to allow pre-emption now */ - mp_enable_preemption(); - /* For ASYNC, now wait for all signaled cpus to complete their calls */ if (mode == ASYNC) mp_cpus_call_wait(intrs_enabled, cpus_called, &cpus_responded); + /* Safe to allow pre-emption now */ + mp_enable_preemption(); + out: if (call_self){ cpus_called |= cpu_to_cpumask(cpu); @@ -1401,19 +1350,17 @@ mp_cpus_call1( if (cpus_calledp) *cpus_calledp = cpus_called; - if (cpus_notcalledp) - *cpus_notcalledp = cpus_notcalled; KERNEL_DEBUG_CONSTANT( TRACE_MP_CPUS_CALL | DBG_FUNC_END, - cpus_call_count, cpus_called, cpus_notcalled, 0, 0); + cpus_call_count, cpus_called, 0, 0, 0); return (cpu_t) cpus_call_count; } static void -mp_broadcast_action(void) +mp_broadcast_action(__unused void *null) { /* call action function */ if (mp_bc_action_func != NULL) @@ -1452,16 +1399,14 @@ mp_broadcast( /* * signal other processors, which will call mp_broadcast_action() */ - simple_lock(&x86_topo_lock); - mp_bc_ncpus = i386_active_cpus(); /* total including this cpu */ - mp_bc_count = mp_bc_ncpus; - i386_signal_cpus(MP_BROADCAST, ASYNC); + mp_bc_count = real_ncpus; /* assume max possible active */ + mp_bc_ncpus = mp_cpus_call(CPUMASK_OTHERS, NOSYNC, *mp_broadcast_action, NULL) + 1; + atomic_decl(&mp_bc_count, real_ncpus - mp_bc_ncpus); /* subtract inactive */ /* call executor function on this cpu */ - mp_broadcast_action(); - simple_unlock(&x86_topo_lock); + mp_broadcast_action(NULL); - /* block for all cpus to have run action_func */ + /* block for other cpus to have run action_func */ if (mp_bc_ncpus > 1) thread_block(THREAD_CONTINUE_NULL); else @@ -1563,6 +1508,7 @@ int pmsafe_debug = 1; #if MACH_KDP volatile boolean_t mp_kdp_trap = FALSE; +volatile boolean_t mp_kdp_is_NMI = FALSE; volatile unsigned long mp_kdp_ncpus; boolean_t mp_kdp_state; @@ -1639,10 +1585,7 @@ mp_kdp_enter(void) * 
"unsafe-to-interrupt" points such as the trampolines, * but neither do we want to lose state by waiting too long. */ - tsc_timeout = rdtsc64() + (ncpus * 1000 * 1000 * 10ULL); - - if (virtualized) - tsc_timeout = ~0ULL; + tsc_timeout = rdtsc64() + (LockTimeOutTSC); while (mp_kdp_ncpus != ncpus && rdtsc64() < tsc_timeout) { /* @@ -1658,12 +1601,22 @@ mp_kdp_enter(void) * interrupt them with an NMI via the local APIC. */ if (mp_kdp_ncpus != ncpus) { + DBG("mp_kdp_enter() timed-out on cpu %d, NMI-ing\n", my_cpu); for (cpu = 0; cpu < real_ncpus; cpu++) { if (cpu == my_cpu || !cpu_datap(cpu)->cpu_running) continue; if (cpu_signal_pending(cpu, MP_KDP)) cpu_NMI_interrupt(cpu); } + /* Wait again for the same timeout */ + tsc_timeout = rdtsc64() + (LockTimeOutTSC); + while (mp_kdp_ncpus != ncpus && rdtsc64() < tsc_timeout) { + handle_pending_TLB_flushes(); + cpu_pause(); + } + if (mp_kdp_ncpus != ncpus) { + panic("mp_kdp_enter() timed-out waiting after NMI"); + } } } else @@ -1787,6 +1740,35 @@ mp_kdp_exit(void) (void) ml_set_interrupts_enabled(mp_kdp_state); postcode(0); } + +#define TRAP_DEBUGGER __asm__ volatile("int3") + +kern_return_t +DebuggerWithCallback(kern_return_t (*callback) (void*), + void *callback_context, + boolean_t proceed_on_sync_failure) +{ + simple_lock(&debugger_callback_lock); + + struct debugger_callback callback_buf = { + .callback = callback, + .callback_context = callback_context, + .proceed_on_sync_failure = proceed_on_sync_failure, + .error = KERN_FAILURE + }; + + assert(debugger_callback == NULL); + debugger_callback = &callback_buf; + + TRAP_DEBUGGER; + + debugger_callback = NULL; + + simple_unlock(&debugger_callback_lock); + + return callback_buf.error; +} + #endif /* MACH_KDP */ boolean_t @@ -1952,3 +1934,22 @@ ml_interrupt_prewarm( return cwd.cwd_result; } } + +#if DEBUG || DEVELOPMENT +void +kernel_spin(uint64_t spin_ns) +{ + boolean_t istate; + uint64_t spin_abs; + uint64_t deadline; + + kprintf("kernel_spin(%llu) spinning 
uninterruptibly\n", spin_ns); + istate = ml_set_interrupts_enabled(FALSE); + nanoseconds_to_absolutetime(spin_ns, &spin_abs); + deadline = mach_absolute_time() + spin_ns; + while (mach_absolute_time() < deadline) + cpu_pause(); + ml_set_interrupts_enabled(istate); + kprintf("kernel_spin() continuing\n"); +} +#endif diff --git a/osfmk/i386/mp.h b/osfmk/i386/mp.h index 5fffdf0c1..892ae6337 100644 --- a/osfmk/i386/mp.h +++ b/osfmk/i386/mp.h @@ -103,6 +103,7 @@ extern int kdb_debug; extern int kdb_active[]; extern volatile boolean_t mp_kdp_trap; +extern volatile boolean_t mp_kdp_is_NMI; extern volatile boolean_t force_immediate_debugger_NMI; extern volatile boolean_t pmap_tlb_flush_timeout; extern volatile usimple_lock_t spinlock_timed_out; @@ -115,6 +116,7 @@ extern void mp_kdp_enter(void); extern void mp_kdp_exit(void); extern boolean_t mp_recent_debugger_activity(void); +extern void kernel_spin(uint64_t spin_ns); /* * All cpu rendezvous: @@ -183,8 +185,7 @@ extern cpu_t mp_cpus_call1( void (*action_func)(void *, void*), void *arg0, void *arg1, - cpumask_t *cpus_calledp, - cpumask_t *cpus_notcalledp); + cpumask_t *cpus_calledp); extern void mp_cpus_NMIPI(cpumask_t cpus); @@ -271,42 +272,6 @@ i_bit_impl(long word, long bit) { #define i_bit(bit, word) i_bit_impl((long)(*(word)), bit) #endif -#if MACH_RT - -#if defined(__x86_64__) - -#define _DISABLE_PREEMPTION \ - incl %gs:CPU_PREEMPTION_LEVEL - -#define _ENABLE_PREEMPTION \ - decl %gs:CPU_PREEMPTION_LEVEL ; \ - jne 9f ; \ - call EXT(kernel_preempt_check) ; \ -9: - -#define _ENABLE_PREEMPTION_NO_CHECK \ - decl %gs:CPU_PREEMPTION_LEVEL - -#else -#error Unsupported architecture -#endif - -/* x86_64 just calls through to the other macro directly */ -#define DISABLE_PREEMPTION _DISABLE_PREEMPTION -#define ENABLE_PREEMPTION _ENABLE_PREEMPTION -#define ENABLE_PREEMPTION_NO_CHECK _ENABLE_PREEMPTION_NO_CHECK -#define MP_DISABLE_PREEMPTION _DISABLE_PREEMPTION -#define MP_ENABLE_PREEMPTION _ENABLE_PREEMPTION -#define 
MP_ENABLE_PREEMPTION_NO_CHECK _ENABLE_PREEMPTION_NO_CHECK - -#else /* MACH_RT */ -#define DISABLE_PREEMPTION -#define ENABLE_PREEMPTION -#define ENABLE_PREEMPTION_NO_CHECK -#define MP_DISABLE_PREEMPTION -#define MP_ENABLE_PREEMPTION -#define MP_ENABLE_PREEMPTION_NO_CHECK -#endif /* MACH_RT */ #endif /* _I386_MP_H_ */ diff --git a/osfmk/i386/mp_events.h b/osfmk/i386/mp_events.h index 32fde7cc5..59c661e7a 100644 --- a/osfmk/i386/mp_events.h +++ b/osfmk/i386/mp_events.h @@ -39,11 +39,9 @@ typedef enum { MP_KDP, MP_KDB, MP_AST, - MP_RENDEZVOUS, MP_IDLE, MP_UNIDLE, MP_CHUD, - MP_BROADCAST, MP_CALL, MP_CALL_PM, MP_LAST @@ -55,11 +53,9 @@ const char *mp_event_name[] = { \ "MP_KDP", \ "MP_KDB", \ "MP_AST", \ - "MP_RENDEZVOUS", \ "MP_IDLE", \ "MP_UNIDLE", \ "MP_CHUD", \ - "MP_BROADCAST", \ "MP_CALL", \ "MP_CALL_PM", \ "MP_LAST" \ @@ -70,8 +66,6 @@ typedef enum { SYNC, ASYNC, NOSYNC } mp_sync_t; __BEGIN_DECLS extern void i386_signal_cpu(int cpu, mp_event_t event, mp_sync_t mode); -extern void i386_signal_cpus(mp_event_t event, mp_sync_t mode); -extern int i386_active_cpus(void); extern void i386_activate_cpu(void); extern void i386_deactivate_cpu(void); extern void cpu_NMI_interrupt(int /* cpu */); diff --git a/osfmk/i386/pal_native.h b/osfmk/i386/pal_native.h index 1979983a8..4a0225a05 100644 --- a/osfmk/i386/pal_native.h +++ b/osfmk/i386/pal_native.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009 Apple Inc. All rights reserved. + * Copyright (c) 2009-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -41,10 +41,10 @@ #define pal_sti() __asm__ volatile ("sti") #define pal_cli() __asm__ volatile ("cli") -static inline -void pal_stop_cpu(boolean_t cli) +static inline void +pal_stop_cpu(boolean_t cli) { - if( cli ) + if (cli) __asm__ volatile ( "cli" ); __asm__ volatile ( "wbinvd; hlt" ); } @@ -54,7 +54,6 @@ void pal_stop_cpu(boolean_t cli) #define pal_execve_return(t) #define pal_thread_terminate_self(t) #define pal_ast_check(t) -#define pal_switch_pmap(t,u,v) #define panic_display_pal_info() do { } while(0) #define pal_kernel_announce() do { } while(0) diff --git a/osfmk/i386/pal_routines.c b/osfmk/i386/pal_routines.c index 7adc8841a..b11aaf462 100644 --- a/osfmk/i386/pal_routines.c +++ b/osfmk/i386/pal_routines.c @@ -77,10 +77,18 @@ pal_serial_init(void) return serial_init(); } +void +pal_serial_putc_nocr(char c) +{ + serial_putc(c); +} + void pal_serial_putc(char c) { serial_putc(c); + if (c == '\n') + serial_putc('\r'); } int diff --git a/osfmk/i386/pal_routines.h b/osfmk/i386/pal_routines.h index 356cb79e9..4336d45dd 100644 --- a/osfmk/i386/pal_routines.h +++ b/osfmk/i386/pal_routines.h @@ -92,6 +92,7 @@ extern struct pal_apic_table *apic_table; /* serial / debug output routines */ extern int pal_serial_init(void); extern void pal_serial_putc(char); +extern void pal_serial_putc_nocr(char); extern int pal_serial_getc(void); /* Generic I386 PAL functions go here */ @@ -124,9 +125,6 @@ void pal_ast_check(thread_t thread); /* Called by sync_iss_to_iks */ extern void pal_get_kern_regs( x86_saved_state_t *state ); -/* Called by load_machfile */ -void pal_switch_pmap(thread_t, pmap_t, boolean_t); - /* * Platform-specific hlt/sti. */ diff --git a/osfmk/i386/pcb.c b/osfmk/i386/pcb.c index 84456b323..ed6e82e53 100644 --- a/osfmk/i386/pcb.c +++ b/osfmk/i386/pcb.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2012 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -77,6 +77,7 @@ #include #include #include +#include #include #include #include @@ -94,11 +95,6 @@ #include #include /* LAPIC_PMC_SWI_VECTOR */ -#if KPERF -#include -#include -#endif - #if HYPERVISOR #include #endif @@ -394,6 +390,15 @@ machine_load_context( Load_context(new); } +static inline void pmap_switch_context(thread_t ot, thread_t nt, int cnum) { + pmap_assert(ml_get_interrupts_enabled() == FALSE); + vm_map_t nmap = nt->map, omap = ot->map; + if ((omap != nmap) || (nmap->pmap->pagezero_accessible)) { + PMAP_DEACTIVATE_MAP(omap, ot, cnum); + PMAP_ACTIVATE_MAP(nmap, nt, cnum); + } +} + /* * Switch to a new thread. * Save the old thread`s kernel state or continuation, @@ -406,11 +411,11 @@ machine_switch_context( thread_t new) { #if MACH_RT - assert(current_cpu_datap()->cpu_active_stack == old->kernel_stack); -#endif -#if KPERF - kperf_kpc_cswitch(old, new); + assert(current_cpu_datap()->cpu_active_stack == old->kernel_stack); #endif + + kpc_off_cpu(old); + /* * Save FP registers if in use. */ @@ -420,7 +425,7 @@ machine_switch_context( new->machine.specFlags |= OnProc; /* - * Monitor the stack depth and report new max, + * Monitor the stack depth and report new max, * not worrying about races. */ vm_offset_t depth = current_stack_depth(); @@ -435,7 +440,7 @@ machine_switch_context( * Switch address maps if need be, even if not switching tasks. * (A server activation may be "borrowing" a client map.) 
*/ - PMAP_SWITCH_CONTEXT(old, new, cpu_number()); + pmap_switch_context(old, new, cpu_number()); /* * Load the rest of the user state for the new thread @@ -459,7 +464,7 @@ machine_processor_shutdown( vmx_suspend(); #endif fpu_save_context(thread); - PMAP_SWITCH_CONTEXT(thread, processor->idle_thread, cpu_number()); + pmap_switch_context(thread, processor->idle_thread, cpu_number()); return(Shutdown_context(thread, doshutdown, processor)); } @@ -992,8 +997,6 @@ machine_thread_set_state( return set_thread_state32(thr_act, &state->uts.ts32); } else return(KERN_INVALID_ARGUMENT); - - break; } case x86_DEBUG_STATE32: { @@ -1767,9 +1770,7 @@ machine_stack_handoff(thread_t old, assert(new); assert(old); -#if KPERF - kperf_kpc_cswitch(old, new); -#endif + kpc_off_cpu(old); stack = old->kernel_stack; if (stack == old->reserved_stack) { @@ -1789,7 +1790,7 @@ machine_stack_handoff(thread_t old, old->machine.specFlags &= ~OnProc; new->machine.specFlags |= OnProc; - PMAP_SWITCH_CONTEXT(old, new, cpu_number()); + pmap_switch_context(old, new, cpu_number()); act_machine_switch_pcb(old, new); #if HYPERVISOR diff --git a/osfmk/i386/pcb_native.c b/osfmk/i386/pcb_native.c index 6f200efcc..d77d20134 100644 --- a/osfmk/i386/pcb_native.c +++ b/osfmk/i386/pcb_native.c @@ -412,7 +412,7 @@ void machine_thread_destroy( thread_t thread) { - register pcb_t pcb = THREAD_TO_PCB(thread); + pcb_t pcb = THREAD_TO_PCB(thread); #if HYPERVISOR if (thread->hv_thread_target) { diff --git a/osfmk/i386/pmap.h b/osfmk/i386/pmap.h index 939e47174..ccad03ea7 100644 --- a/osfmk/i386/pmap.h +++ b/osfmk/i386/pmap.h @@ -96,9 +96,6 @@ #define intel_ptob(x) i386_ptob(x) #define intel_round_page(x) i386_round_page(x) #define intel_trunc_page(x) i386_trunc_page(x) -#define trunc_intel_to_vm(x) trunc_i386_to_vm(x) -#define round_intel_to_vm(x) round_i386_to_vm(x) -#define vm_to_intel(x) vm_to_i386(x) /* * i386/i486/i860 Page Table Entry @@ -170,17 +167,25 @@ typedef uint64_t pt_entry_t; typedef uint64_t 
pmap_paddr_t; -#if DEBUG +#if DEVELOPMENT || DEBUG #define PMAP_ASSERT 1 +extern int pmap_asserts_enabled; +extern int pmap_asserts_traced; #endif + #if PMAP_ASSERT -#define pmap_assert(ex) ((ex) ? (void)0 : Assert(__FILE__, __LINE__, # ex)) +#define pmap_assert(ex) (pmap_asserts_enabled ? ((ex) ? (void)0 : Assert(__FILE__, __LINE__, # ex)) : (void)0) #define pmap_assert2(ex, fmt, args...) \ do { \ - if (!(ex)) { \ - kprintf("Assertion %s failed (%s:%d, caller %p) " fmt , #ex, __FILE__, __LINE__, __builtin_return_address(0), ##args); \ - panic("Assertion %s failed (%s:%d, caller %p) " fmt , #ex, __FILE__, __LINE__, __builtin_return_address(0), ##args); \ + if (__improbable(pmap_asserts_enabled && !(ex))) { \ + if (pmap_asserts_traced) { \ + KERNEL_DEBUG_CONSTANT(0xDEAD1000, __builtin_return_address(0), __LINE__, 0, 0, 0); \ + kdebug_enable = 0; \ + } else { \ + kprintf("Assertion %s failed (%s:%d, caller %p) " fmt , #ex, __FILE__, __LINE__, __builtin_return_address(0), ##args); \ + panic("Assertion %s failed (%s:%d, caller %p) " fmt , #ex, __FILE__, __LINE__, __builtin_return_address(0), ##args); \ + } \ } \ } while(0) #else @@ -307,7 +312,17 @@ pmap_store_pte(pt_entry_t *entryp, pt_entry_t value) /* This is conservative, but suffices */ #define INTEL_PTE_RSVD ((1ULL << 10) | (1ULL << 11) | (0x1FFULL << 54)) -#define INTEL_COMPRESSED (1ULL << 62) /* marker, for invalid PTE only -- ignored by hardware for both regular/EPT entries*/ +#define INTEL_PTE_COMPRESSED (1ULL << 62) /* marker, for invalid PTE only -- ignored by hardware for both regular/EPT entries*/ +#define INTEL_PTE_COMPRESSED_ALT (1ULL << 61) /* compressed but with "alternate accounting" */ + +#define INTEL_PTE_COMPRESSED_MASK (INTEL_PTE_COMPRESSED | \ + INTEL_PTE_COMPRESSED_ALT) +#define PTE_IS_COMPRESSED(x) \ + ((((x) & INTEL_PTE_VALID) == 0) && /* PTE is not valid... 
*/ \ + ((x) & INTEL_PTE_COMPRESSED) && /* ...has "compressed" marker" */ \ + ((!((x) & ~INTEL_PTE_COMPRESSED_MASK)) || /* ...no other bits */ \ + (panic("compressed PTE %p 0x%llx has extra bits 0x%llx: corrupted?", \ + &(x), (x), (x) & ~INTEL_PTE_COMPRESSED_MASK), FALSE))) #define pa_to_pte(a) ((a) & INTEL_PTE_PFN) /* XXX */ #define pte_to_pa(p) ((p) & INTEL_PTE_PFN) /* XXX */ @@ -407,7 +422,8 @@ extern boolean_t pmap_ept_support_ad; #define PTE_READ(is_ept) ((is_ept) ? INTEL_EPT_READ : INTEL_PTE_VALID) #define PTE_WRITE(is_ept) ((is_ept) ? INTEL_EPT_WRITE : INTEL_PTE_WRITE) #define PTE_PS INTEL_PTE_PS -#define PTE_COMPRESSED INTEL_COMPRESSED +#define PTE_COMPRESSED INTEL_PTE_COMPRESSED +#define PTE_COMPRESSED_ALT INTEL_PTE_COMPRESSED_ALT #define PTE_NCACHE(is_ept) ((is_ept) ? INTEL_EPT_NCACHE : INTEL_PTE_NCACHE) #define PTE_WTHRU(is_ept) ((is_ept) ? INTEL_EPT_WTHRU : INTEL_PTE_WTHRU) #define PTE_REF(is_ept) ((is_ept) ? INTEL_EPT_REF : INTEL_PTE_REF) @@ -434,7 +450,6 @@ extern pt_entry_t *PTmap; extern pdpt_entry_t *IdlePDPT; extern pml4_entry_t *IdlePML4; extern boolean_t no_shared_cr3; -extern addr64_t kernel64_cr3; extern pd_entry_t *IdlePTD; /* physical addr of "Idle" state PTD */ extern uint64_t pmap_pv_hashlist_walks; @@ -478,6 +493,12 @@ static inline void * PHYSMAP_PTOV_check(void *paddr) { #define LOWGLOBAL_ALIAS (VM_MIN_KERNEL_ADDRESS + 0x2000) #define CPU_GDT_ALIAS(_cpu) (LOWGLOBAL_ALIAS + (0x1000*(_cpu))) +/* + * This indicates (roughly) where there is free space for the VM + * to use for the heap; this does not need to be precise. 
+ */ +#define KERNEL_PMAP_HEAP_RANGE_START VM_MIN_KERNEL_AND_KEXT_ADDRESS + #endif /*__x86_64__ */ #include @@ -491,22 +512,27 @@ static inline void * PHYSMAP_PTOV_check(void *paddr) { struct pmap { decl_simple_lock_data(,lock) /* lock on map */ pmap_paddr_t pm_cr3; /* physical addr */ - pmap_paddr_t pm_eptp; /* EPTP */ - boolean_t pm_shared; - pd_entry_t *dirbase; /* page directory pointer */ - vm_object_t pm_obj; /* object to hold pde's */ task_map_t pm_task_map; - pdpt_entry_t *pm_pdpt; /* KVA of 3rd level page */ - pml4_entry_t *pm_pml4; /* VKA of top level */ - vm_object_t pm_obj_pdpt; /* holds pdpt pages */ - vm_object_t pm_obj_pml4; /* holds pml4 pages */ + boolean_t pm_shared; + boolean_t pagezero_accessible; #define PMAP_PCID_MAX_CPUS MAX_CPUS /* Must be a multiple of 8 */ pcid_t pmap_pcid_cpus[PMAP_PCID_MAX_CPUS]; volatile uint8_t pmap_pcid_coherency_vector[PMAP_PCID_MAX_CPUS]; struct pmap_statistics stats; /* map statistics */ int ref_count; /* reference count */ int nx_enabled; + pdpt_entry_t *pm_pdpt; /* KVA of 3rd level page */ + pml4_entry_t *pm_pml4; /* VKA of top level */ + vm_object_t pm_obj; /* object to hold pde's */ + vm_object_t pm_obj_pdpt; /* holds pdpt pages */ + vm_object_t pm_obj_pml4; /* holds pml4 pages */ + pmap_paddr_t pm_eptp; /* EPTP */ + pd_entry_t *dirbase; /* page directory pointer */ ledger_t ledger; /* ledger tracking phys mappings */ +#if MACH_ASSERT + int pmap_pid; + char pmap_procname[17]; +#endif /* MACH_ASSERT */ }; static inline boolean_t @@ -569,24 +595,45 @@ extern unsigned pmap_memory_region_current; extern pmap_memory_region_t pmap_memory_regions[]; #include +#include static inline void -set_dirbase(pmap_t tpmap, __unused thread_t thread, int my_cpu) { +set_dirbase(pmap_t tpmap, thread_t thread, int my_cpu) { int ccpu = my_cpu; cpu_datap(ccpu)->cpu_task_cr3 = tpmap->pm_cr3; cpu_datap(ccpu)->cpu_task_map = tpmap->pm_task_map; + + assert((get_preemption_level() > 0) || (ml_get_interrupts_enabled() == FALSE)); + 
assert(ccpu == cpu_number()); /* * Switch cr3 if necessary * - unless running with no_shared_cr3 debugging mode * and we're not on the kernel's cr3 (after pre-empted copyio) */ + boolean_t nopagezero = tpmap->pagezero_accessible; + boolean_t priorpagezero = cpu_datap(ccpu)->cpu_pagezero_mapped; + cpu_datap(ccpu)->cpu_pagezero_mapped = nopagezero; + if (__probable(!no_shared_cr3)) { - if (get_cr3_base() != tpmap->pm_cr3) { + if (__improbable(nopagezero)) { + boolean_t copyio_active = ((thread->machine.specFlags & CopyIOActive) != 0); if (pmap_pcid_ncpus) { - pmap_pcid_activate(tpmap, ccpu); + pmap_pcid_activate(tpmap, ccpu, TRUE, copyio_active); + } else { + if (copyio_active) { + if (get_cr3_base() != tpmap->pm_cr3) { + set_cr3_raw(tpmap->pm_cr3); + } + } else if (get_cr3_base() != cpu_datap(ccpu)->cpu_kernel_cr3) { + set_cr3_raw(cpu_datap(ccpu)->cpu_kernel_cr3); + } } - else + } else if ((get_cr3_base() != tpmap->pm_cr3) || priorpagezero) { + if (pmap_pcid_ncpus) { + pmap_pcid_activate(tpmap, ccpu, FALSE, FALSE); + } else { set_cr3_raw(tpmap->pm_cr3); + } } } else { if (get_cr3_base() != cpu_datap(ccpu)->cpu_kernel_cr3) @@ -696,7 +743,7 @@ extern void pmap_pagetable_corruption_msg_log(int (*)(const char * fmt, ...)__pr #define PMAP_ACTIVATE_MAP(map, thread, my_cpu) { \ - register pmap_t tpmap; \ + pmap_t tpmap; \ \ tpmap = vm_map_pmap(map); \ set_dirbase(tpmap, thread, my_cpu); \ @@ -704,20 +751,11 @@ extern void pmap_pagetable_corruption_msg_log(int (*)(const char * fmt, ...)__pr #if defined(__x86_64__) #define PMAP_DEACTIVATE_MAP(map, thread, ccpu) \ - pmap_assert(pmap_pcid_ncpus ? (pcid_for_pmap_cpu_tuple(map->pmap, ccpu) == (get_cr3_raw() & 0xFFF)) : TRUE); + pmap_assert2((pmap_pcid_ncpus ? 
(pcid_for_pmap_cpu_tuple(map->pmap, thread, ccpu) == (get_cr3_raw() & 0xFFF)) : TRUE),"PCIDs: 0x%x, active PCID: 0x%x, CR3: 0x%lx, pmap_cr3: 0x%llx, kernel_cr3: 0x%llx, kernel pmap cr3: 0x%llx, CPU active PCID: 0x%x, CPU kernel PCID: 0x%x, specflags: 0x%x, pagezero: 0x%x", pmap_pcid_ncpus, pcid_for_pmap_cpu_tuple(map->pmap, thread, ccpu), get_cr3_raw(), map->pmap->pm_cr3, cpu_datap(ccpu)->cpu_kernel_cr3, kernel_pmap->pm_cr3, cpu_datap(ccpu)->cpu_active_pcid, cpu_datap(ccpu)->cpu_kernel_pcid, thread->machine.specFlags, map->pmap->pagezero_accessible); #else #define PMAP_DEACTIVATE_MAP(map, thread) #endif -#define PMAP_SWITCH_CONTEXT(old_th, new_th, my_cpu) { \ - \ - pmap_assert(ml_get_interrupts_enabled() == FALSE); \ - if (old_th->map != new_th->map) { \ - PMAP_DEACTIVATE_MAP(old_th->map, old_th, my_cpu); \ - PMAP_ACTIVATE_MAP(new_th->map, new_th, my_cpu); \ - } \ -} - #if NCOPY_WINDOWS > 0 #define PMAP_SWITCH_USER(th, new_map, my_cpu) { \ spl_t spl; \ @@ -821,6 +859,16 @@ extern boolean_t pmap_is_empty(pmap_t pmap, kern_return_t pmap_permissions_verify(pmap_t, vm_map_t, vm_offset_t, vm_offset_t); +#if MACH_ASSERT +extern int pmap_stats_assert; +#define PMAP_STATS_ASSERTF(args) \ + MACRO_BEGIN \ + if (pmap_stats_assert) assertf args; \ + MACRO_END +#else /* MACH_ASSERT */ +#define PMAP_STATS_ASSERTF(args) +#endif /* MACH_ASSERT */ + #endif /* ASSEMBLER */ diff --git a/osfmk/i386/pmap_common.c b/osfmk/i386/pmap_common.c index 6fe3641c3..a5cd8f269 100644 --- a/osfmk/i386/pmap_common.c +++ b/osfmk/i386/pmap_common.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -61,6 +61,9 @@ event_t mapping_replenish_event, pmap_user_pv_throttle_event; uint64_t pmap_pv_throttle_stat, pmap_pv_throttled_waiters; +int pmap_asserts_enabled = DEBUG; +int pmap_asserts_traced = 0; + unsigned int pmap_cache_attributes(ppnum_t pn) { if (pmap_get_cache_attributes(pn, FALSE) & INTEL_PTE_NCACHE) return (VM_WIMG_IO); @@ -341,7 +344,9 @@ unsigned pmap_kernel_reserve_replenish_stat; unsigned pmap_user_reserve_replenish_stat; unsigned pmap_kern_reserve_alloc_stat; -void mapping_replenish(void) +__attribute__((noreturn)) +void +mapping_replenish(void) { pv_hashed_entry_t pvh_e; pv_hashed_entry_t pvh_eh; diff --git a/osfmk/i386/pmap_internal.h b/osfmk/i386/pmap_internal.h index 4f5580dfa..3c8909968 100644 --- a/osfmk/i386/pmap_internal.h +++ b/osfmk/i386/pmap_internal.h @@ -219,7 +219,7 @@ than the original pv lists that contained all aliases for the specific ppn. typedef struct pv_rooted_entry { /* first three entries must match pv_hashed_entry_t */ queue_head_t qlink; - vm_map_offset_t va; /* virtual address for mapping */ + vm_map_offset_t va_and_flags; /* virtual address for mapping */ pmap_t pmap; /* pmap where mapping lies */ } *pv_rooted_entry_t; @@ -228,7 +228,7 @@ typedef struct pv_rooted_entry { typedef struct pv_hashed_entry { /* first three entries must match pv_rooted_entry_t */ queue_head_t qlink; - vm_map_offset_t va; + vm_map_offset_t va_and_flags; pmap_t pmap; ppnum_t ppn; struct pv_hashed_entry *nexth; @@ -236,6 +236,12 @@ typedef struct pv_hashed_entry { #define PV_HASHED_ENTRY_NULL ((pv_hashed_entry_t)0) +#define PVE_VA(pve) ((pve)->va_and_flags & ~PAGE_MASK) +#define PVE_FLAGS(pve) ((pve)->va_and_flags & PAGE_MASK) +#define PVE_IS_ALTACCT 0x001 +#define PVE_IS_ALTACCT_PAGE(pve) \ + (((pve)->va_and_flags & PVE_IS_ALTACCT) ? 
TRUE : FALSE) + //#define PV_DEBUG 1 /* uncomment to enable some PV debugging code */ #ifdef PV_DEBUG #define CHK_NPVHASH() if(0 == npvhashmask) panic("npvhash uninitialized"); @@ -379,6 +385,9 @@ static inline void pmap_pv_throttle(__unused pmap_t p) { (IS_MANAGED_PAGE(x) && (pmap_phys_attributes[x] & PHYS_INTERNAL)) #define IS_REUSABLE_PAGE(x) \ (IS_MANAGED_PAGE(x) && (pmap_phys_attributes[x] & PHYS_REUSABLE)) +#define IS_ALTACCT_PAGE(x) \ + (IS_MANAGED_PAGE((x)) && \ + (PVE_IS_ALTACCT_PAGE(&pv_head_table[(x)]))) /* * Physical page attributes. Copy bits from PTE definition. @@ -501,7 +510,7 @@ pmap_pvh_unlink(pv_hashed_entry_t pvh) int pvhash_idx; CHK_NPVHASH(); - pvhash_idx = pvhashidx(pvh->pmap, pvh->va); + pvhash_idx = pvhashidx(pvh->pmap, PVE_VA(pvh)); pprevh = pvhash(pvhash_idx); @@ -530,7 +539,7 @@ pv_hash_add(pv_hashed_entry_t pvh_e, int pvhash_idx; CHK_NPVHASH(); - pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va); + pvhash_idx = pvhashidx(pvh_e->pmap, PVE_VA(pvh_e)); LOCK_PV_HASH(pvhash_idx); insque(&pvh_e->qlink, &pv_h->qlink); hashp = pvhash(pvhash_idx); @@ -549,7 +558,7 @@ pv_hash_remove(pv_hashed_entry_t pvh_e) int pvhash_idx; CHK_NPVHASH(); - pvhash_idx = pvhashidx(pvh_e->pmap,pvh_e->va); + pvhash_idx = pvhashidx(pvh_e->pmap,PVE_VA(pvh_e)); LOCK_PV_HASH(pvhash_idx); remque(&pvh_e->qlink); pmap_pvh_unlink(pvh_e); @@ -651,7 +660,8 @@ pmap_classify_pagetable_corruption(pmap_t pmap, vm_map_offset_t vaddr, ppnum_t * pv_rooted_entry_t pv_e = pv_h; uint32_t bitdex; pmap_t pvpmap = pv_h->pmap; - vm_map_offset_t pvva = pv_h->va; + vm_map_offset_t pvva = PVE_VA(pv_h); + vm_map_offset_t pve_flags = PVE_FLAGS(pv_h); boolean_t ppcd = FALSE; boolean_t is_ept; @@ -672,10 +682,14 @@ pmap_classify_pagetable_corruption(pmap_t pmap, vm_map_offset_t vaddr, ppnum_t * * of the PV */ do { - if ((popcnt1((uintptr_t)pv_e->pmap ^ (uintptr_t)pmap) && pv_e->va == vaddr) || - (pv_e->pmap == pmap && popcnt1(pv_e->va ^ vaddr))) { + if ((popcnt1((uintptr_t)pv_e->pmap ^ 
(uintptr_t)pmap) && PVE_VA(pv_e) == vaddr) || + (pv_e->pmap == pmap && popcnt1(PVE_VA(pv_e) ^ vaddr))) { pv_e->pmap = pmap; - pv_e->va = vaddr; + if (pv_e == pv_h) { + pv_h->va_and_flags = vaddr | pve_flags; + } else { + pv_e->va_and_flags = vaddr; + } suppress_reason = PV_BITFLIP; action = PMAP_ACTION_RETRY; goto pmap_cpc_exit; @@ -690,7 +704,7 @@ pmap_classify_pagetable_corruption(pmap_t pmap, vm_map_offset_t vaddr, ppnum_t * ppnum_t npn = cpn ^ (ppnum_t) (1ULL << bitdex); if (IS_MANAGED_PAGE(npn)) { pv_rooted_entry_t npv_h = pai_to_pvh(ppn_to_pai(npn)); - if (npv_h->va == vaddr && npv_h->pmap == pmap) { + if (PVE_VA(npv_h) == vaddr && npv_h->pmap == pmap) { suppress_reason = PTE_BITFLIP; suppress_ppn = npn; action = PMAP_ACTION_RETRY_RELOCK; @@ -779,7 +793,7 @@ pmap_pv_remove(pmap_t pmap, goto pmap_pv_remove_retry; } - if (pv_h->va == vaddr && pv_h->pmap == pmap) { + if (PVE_VA(pv_h) == vaddr && pv_h->pmap == pmap) { /* * Header is the pv_rooted_entry. * We can't free that. If there is a queued @@ -789,12 +803,14 @@ pmap_pv_remove(pmap_t pmap, */ pvh_e = (pv_hashed_entry_t) queue_next(&pv_h->qlink); if (pv_h != (pv_rooted_entry_t) pvh_e) { + vm_map_offset_t pve_flags; + /* * Entry queued to root, remove this from hash * and install as new root. 
*/ CHK_NPVHASH(); - pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va); + pvhash_idx = pvhashidx(pvh_e->pmap, PVE_VA(pvh_e)); LOCK_PV_HASH(pvhash_idx); remque(&pvh_e->qlink); pprevh = pvhash(pvhash_idx); @@ -806,7 +822,9 @@ pmap_pv_remove(pmap_t pmap, pmap_pvh_unlink(pvh_e); UNLOCK_PV_HASH(pvhash_idx); pv_h->pmap = pvh_e->pmap; - pv_h->va = pvh_e->va; /* dispose of pvh_e */ + pve_flags = PVE_FLAGS(pv_h); + pv_h->va_and_flags = PVE_VA(pvh_e) | pve_flags; + /* dispose of pvh_e */ } else { /* none queued after rooted */ pv_h->pmap = PMAP_NULL; @@ -831,7 +849,7 @@ pmap_pv_remove(pmap_t pmap, while (PV_HASHED_ENTRY_NULL != pvh_e) { pv_cnt++; if (pvh_e->pmap == pmap && - pvh_e->va == vaddr && + PVE_VA(pvh_e) == vaddr && pvh_e->ppn == ppn) break; pprevh = &pvh_e->nexth; @@ -842,7 +860,7 @@ pmap_pv_remove(pmap_t pmap, pmap_pagetable_corruption_action_t pac = pmap_classify_pagetable_corruption(pmap, vaddr, ppnp, pte, ROOT_PRESENT); if (pac == PMAP_ACTION_ASSERT) - panic("Possible memory corruption: pmap_pv_remove(%p, 0x%llx, 0x%x, 0x%llx, %p, %p): pv not on hash, head: %p, 0x%llx", pmap, vaddr, ppn, *pte, ppnp, pte, pv_h->pmap, pv_h->va); + panic("Possible memory corruption: pmap_pv_remove(%p, 0x%llx, 0x%x, 0x%llx, %p, %p): pv not on hash, head: %p, 0x%llx", pmap, vaddr, ppn, *pte, ppnp, pte, pv_h->pmap, PVE_VA(pv_h)); else { UNLOCK_PV_HASH(pvhash_idx); if (pac == PMAP_ACTION_RETRY_RELOCK) { @@ -875,29 +893,13 @@ extern int pt_fake_zone_index; static inline void PMAP_ZINFO_PALLOC(pmap_t pmap, vm_size_t bytes) { - thread_t thr = current_thread(); - task_t task; - zinfo_usage_t zinfo; - pmap_ledger_credit(pmap, task_ledgers.tkm_private, bytes); - - if (pt_fake_zone_index != -1 && - (task = thr->task) != NULL && (zinfo = task->tkm_zinfo) != NULL) - OSAddAtomic64(bytes, (int64_t *)&zinfo[pt_fake_zone_index].alloc); } static inline void PMAP_ZINFO_PFREE(pmap_t pmap, vm_size_t bytes) { - thread_t thr = current_thread(); - task_t task; - zinfo_usage_t zinfo; - pmap_ledger_debit(pmap, 
task_ledgers.tkm_private, bytes); - - if (pt_fake_zone_index != -1 && - (task = thr->task) != NULL && (zinfo = task->tkm_zinfo) != NULL) - OSAddAtomic64(bytes, (int64_t *)&zinfo[pt_fake_zone_index].free); } static inline void @@ -999,7 +1001,7 @@ pmap64_pml4(pmap_t pmap, vm_map_offset_t vaddr) return (NULL); } -#if PMAP_ASSERT +#if DEBUG return PHYSMAP_PTOV(&((pml4_entry_t *)pmap->pm_cr3)[(vaddr >> PML4SHIFT) & (NPML4PG-1)]); #else return &pmap->pm_pml4[(vaddr >> PML4SHIFT) & (NPML4PG-1)]; diff --git a/osfmk/i386/pmap_pcid.h b/osfmk/i386/pmap_pcid.h index 0e16f3e2d..fc0854d7e 100644 --- a/osfmk/i386/pmap_pcid.h +++ b/osfmk/i386/pmap_pcid.h @@ -36,8 +36,8 @@ void pmap_pcid_deallocate_pcid(int, pmap_t); void pmap_destroy_pcid_sync_action(void *); void pmap_destroy_pcid_sync(pmap_t); void pmap_pcid_lazy_flush(pmap_t); -void pmap_pcid_activate(pmap_t, int); -pcid_t pcid_for_pmap_cpu_tuple(pmap_t, int); +void pmap_pcid_activate(pmap_t, int, boolean_t, boolean_t); +pcid_t pcid_for_pmap_cpu_tuple(pmap_t, thread_t, int); #define PMAP_INVALID ((pmap_t)0xDEAD7347) #define PMAP_PCID_INVALID_PCID (0xDEAD) diff --git a/osfmk/i386/pmap_x86_common.c b/osfmk/i386/pmap_x86_common.c index af1563424..1ee68dafb 100644 --- a/osfmk/i386/pmap_x86_common.c +++ b/osfmk/i386/pmap_x86_common.c @@ -30,6 +30,7 @@ #include #include +#include #include #include @@ -178,11 +179,10 @@ kern_return_t pmap_nest(pmap_t grand, pmap_t subord, addr64_t va_start, addr64_t i += (uint32_t) NPDEPG; } else { - npde = pmap_pde(subord, nstart); + npde = pmap_pde(subord, vaddr); if (npde == 0) - panic("pmap_nest: no npde, subord %p nstart 0x%llx", subord, nstart); + panic("pmap_nest: no npde, subord %p vaddr 0x%llx", subord, vaddr); tpde = *npde; - nstart += NBPDE; pde = pmap_pde(grand, vaddr); if ((0 == pde) && cpu_64bit) { PMAP_UNLOCK(grand); @@ -405,7 +405,7 @@ pmap_update_cache_attributes_locked(ppnum_t pn, unsigned attributes) { do { pmap = pv_e->pmap; - vaddr = pv_e->va; + vaddr = PVE_VA(pv_e); ptep = 
pmap_pte(pmap, vaddr); if (0 == ptep) @@ -454,7 +454,7 @@ void x86_filter_TLB_coherency_interrupts(boolean_t dofilter) { void pmap_enter( - register pmap_t pmap, + pmap_t pmap, vm_map_offset_t vaddr, ppnum_t pn, vm_prot_t prot, @@ -468,7 +468,7 @@ pmap_enter( kern_return_t pmap_enter_options( - register pmap_t pmap, + pmap_t pmap, vm_map_offset_t vaddr, ppnum_t pn, vm_prot_t prot, @@ -546,7 +546,7 @@ pmap_enter_options( * pmap is always expanded to include enough hardware * pages to map one VM page. */ - if(superpage) { + if (superpage) { while ((pte = pmap64_pde(pmap, vaddr)) == PD_ENTRY_NULL) { /* need room for another pde entry */ PMAP_UNLOCK(pmap); @@ -590,9 +590,27 @@ pmap_enter_options( old_pa_locked = FALSE; if (old_pa == 0 && - (*pte & PTE_COMPRESSED)) { + PTE_IS_COMPRESSED(*pte)) { + /* + * "pmap" should be locked at this point, so this should + * not race with another pmap_enter() or pmap_remove_range(). + */ + assert(pmap != kernel_pmap); + /* one less "compressed" */ OSAddAtomic64(-1, &pmap->stats.compressed); + pmap_ledger_debit(pmap, task_ledgers.internal_compressed, + PAGE_SIZE); + if (*pte & PTE_COMPRESSED_ALT) { + pmap_ledger_debit( + pmap, + task_ledgers.alternate_accounting_compressed, + PAGE_SIZE); + } else { + /* was part of the footprint */ + pmap_ledger_debit(pmap, task_ledgers.phys_footprint, + PAGE_SIZE); + } /* marker will be cleared below */ } @@ -753,18 +771,32 @@ pmap_enter_options( if (IS_MANAGED_PAGE(pai)) { pmap_assert(old_pa_locked == TRUE); pmap_ledger_debit(pmap, task_ledgers.phys_mem, PAGE_SIZE); - pmap_ledger_debit(pmap, task_ledgers.phys_footprint, PAGE_SIZE); assert(pmap->stats.resident_count >= 1); OSAddAtomic(-1, &pmap->stats.resident_count); if (pmap != kernel_pmap) { if (IS_REUSABLE_PAGE(pai)) { - assert(pmap->stats.reusable > 0); + PMAP_STATS_ASSERTF( + (pmap->stats.reusable > 0, + "reusable %d", + pmap->stats.reusable)); OSAddAtomic(-1, &pmap->stats.reusable); } else if (IS_INTERNAL_PAGE(pai)) { - 
assert(pmap->stats.internal > 0); + PMAP_STATS_ASSERTF( + (pmap->stats.internal > 0, + "internal %d", + pmap->stats.internal)); OSAddAtomic(-1, &pmap->stats.internal); + pmap_ledger_debit(pmap, task_ledgers.internal, PAGE_SIZE); + if (IS_ALTACCT_PAGE(pai)) { + pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, PAGE_SIZE); + } else { + pmap_ledger_debit(pmap, task_ledgers.phys_footprint, PAGE_SIZE); + } } else { - assert(pmap->stats.external > 0); + PMAP_STATS_ASSERTF( + (pmap->stats.external > 0, + "external %d", + pmap->stats.external)); OSAddAtomic(-1, &pmap->stats.external); } } @@ -833,7 +865,7 @@ pmap_enter_options( /* * No mappings yet, use rooted pv */ - pv_h->va = vaddr; + pv_h->va_and_flags = vaddr; pv_h->pmap = pmap; queue_init(&pv_h->qlink); @@ -847,6 +879,13 @@ pmap_enter_options( } else { pmap_phys_attributes[pai] &= ~PHYS_REUSABLE; } + if ((options & PMAP_OPTIONS_ALT_ACCT) && + IS_INTERNAL_PAGE(pai)) { + assert(!IS_REUSABLE_PAGE(pai)); + pv_h->va_and_flags |= PVE_IS_ALTACCT; + } else { + pv_h->va_and_flags &= ~PVE_IS_ALTACCT; + } } else { /* * Add new pv_hashed_entry after header. 
@@ -881,7 +920,7 @@ pmap_enter_options( if (PV_HASHED_ENTRY_NULL == pvh_e) panic("Mapping alias chain exhaustion, possibly induced by numerous kernel virtual double mappings"); - pvh_e->va = vaddr; + pvh_e->va_and_flags = vaddr; pvh_e->pmap = pmap; pvh_e->ppn = pn; pv_hash_add(pvh_e, pv_h); @@ -897,7 +936,6 @@ pmap_enter_options( * for 'managed memory' */ pmap_ledger_credit(pmap, task_ledgers.phys_mem, PAGE_SIZE); - pmap_ledger_credit(pmap, task_ledgers.phys_footprint, PAGE_SIZE); OSAddAtomic(+1, &pmap->stats.resident_count); if (pmap->stats.resident_count > pmap->stats.resident_max) { pmap->stats.resident_max = pmap->stats.resident_count; @@ -909,6 +947,12 @@ pmap_enter_options( } else if (IS_INTERNAL_PAGE(pai)) { OSAddAtomic(+1, &pmap->stats.internal); PMAP_STATS_PEAK(pmap->stats.internal); + pmap_ledger_credit(pmap, task_ledgers.internal, PAGE_SIZE); + if (IS_ALTACCT_PAGE(pai)) { + pmap_ledger_credit(pmap, task_ledgers.alternate_accounting, PAGE_SIZE); + } else { + pmap_ledger_credit(pmap, task_ledgers.phys_footprint, PAGE_SIZE); + } } else { OSAddAtomic(+1, &pmap->stats.external); PMAP_STATS_PEAK(pmap->stats.external); @@ -919,7 +963,6 @@ pmap_enter_options( * are determined. Consider consulting the available DRAM map. 
*/ pmap_ledger_credit(pmap, task_ledgers.phys_mem, PAGE_SIZE); - pmap_ledger_credit(pmap, task_ledgers.phys_footprint, PAGE_SIZE); OSAddAtomic(+1, &pmap->stats.resident_count); if (pmap != kernel_pmap) { #if 00 @@ -1071,8 +1114,9 @@ pmap_remove_range_options( pv_hashed_entry_t pvh_e; int pvh_cnt = 0; int num_removed, num_unwired, num_found, num_invalid; - int num_device, num_external, num_internal, num_reusable; - uint64_t num_compressed; + int num_external, num_reusable; + int num_internal, num_alt_internal; + uint64_t num_compressed, num_alt_compressed; ppnum_t pai; pmap_paddr_t pa; vm_map_offset_t vaddr; @@ -1082,11 +1126,12 @@ pmap_remove_range_options( num_unwired = 0; num_found = 0; num_invalid = 0; - num_device = 0; num_external = 0; num_internal = 0; num_reusable = 0; num_compressed = 0; + num_alt_internal = 0; + num_alt_compressed = 0; /* invalidate the PTEs first to "freeze" them */ for (cpte = spte, vaddr = start_vaddr; cpte < epte; @@ -1097,12 +1142,16 @@ pmap_remove_range_options( if (pa == 0) { if (pmap != kernel_pmap && (options & PMAP_OPTIONS_REMOVE) && - (p & PTE_COMPRESSED)) { - /* one less "compressed" */ + (PTE_IS_COMPRESSED(p))) { + /* one less "compressed"... */ num_compressed++; - /* clear marker */ + if (p & PTE_COMPRESSED_ALT) { + /* ... but it used to be "ALTACCT" */ + num_alt_compressed++; + } + /* clear marker(s) */ /* XXX probably does not need to be atomic! */ - pmap_update_pte(cpte, PTE_COMPRESSED, 0); + pmap_update_pte(cpte, INTEL_PTE_COMPRESSED_MASK, 0); } continue; } @@ -1119,7 +1168,6 @@ pmap_remove_range_options( * Just remove the mappings. */ pmap_store_pte(cpte, 0); - num_device++; continue; } @@ -1144,8 +1192,26 @@ pmap_remove_range_options( cpte++, vaddr += PAGE_SIZE_64) { pa = pte_to_pa(*cpte); - if (pa == 0) + if (pa == 0) { + check_pte_for_compressed_marker: + /* + * This PTE could have been replaced with a + * "compressed" marker after our first "freeze" + * loop above, so check again. 
+ */ + if (pmap != kernel_pmap && + (options & PMAP_OPTIONS_REMOVE) && + (PTE_IS_COMPRESSED(*cpte))) { + /* one less "compressed"... */ + num_compressed++; + if (*cpte & PTE_COMPRESSED_ALT) { + /* ... but it used to be "ALTACCT" */ + num_alt_compressed++; + } + pmap_store_pte(cpte, 0); + } continue; + } pai = pa_index(pa); @@ -1154,13 +1220,17 @@ pmap_remove_range_options( pa = pte_to_pa(*cpte); if (pa == 0) { UNLOCK_PVH(pai); - continue; + goto check_pte_for_compressed_marker; } num_removed++; if (IS_REUSABLE_PAGE(pai)) { + assert(!IS_ALTACCT_PAGE(pai)); num_reusable++; } else if (IS_INTERNAL_PAGE(pai)) { num_internal++; + if (IS_ALTACCT_PAGE(pai)) { + num_alt_internal++; + } } else { num_external++; } @@ -1211,35 +1281,67 @@ pmap_remove_range_options( panic("pmap_remove_range: resident_count"); #endif pmap_ledger_debit(pmap, task_ledgers.phys_mem, machine_ptob(num_removed)); - pmap_ledger_debit(pmap, task_ledgers.phys_footprint, machine_ptob(num_removed)); - assert(pmap->stats.resident_count >= num_removed); + PMAP_STATS_ASSERTF((pmap->stats.resident_count >= num_removed, + "pmap=%p num_removed=%d stats.resident_count=%d", + pmap, num_removed, pmap->stats.resident_count)); OSAddAtomic(-num_removed, &pmap->stats.resident_count); if (pmap != kernel_pmap) { -#if 00 - assert(pmap->stats.device >= num_device); - if (num_device) - OSAddAtomic(-num_device, &pmap->stats.device); -#endif /* 00 */ - assert(pmap->stats.external >= num_external); - if (num_external) + PMAP_STATS_ASSERTF((pmap->stats.external >= num_external, + "pmap=%p num_external=%d stats.external=%d", + pmap, num_external, pmap->stats.external)); + PMAP_STATS_ASSERTF((pmap->stats.internal >= num_internal, + "pmap=%p num_internal=%d stats.internal=%d", + pmap, num_internal, pmap->stats.internal)); + PMAP_STATS_ASSERTF((pmap->stats.reusable >= num_reusable, + "pmap=%p num_reusable=%d stats.reusable=%d", + pmap, num_reusable, pmap->stats.reusable)); + PMAP_STATS_ASSERTF((pmap->stats.compressed >= 
num_compressed, + "pmap=%p num_compressed=%lld, stats.compressed=%lld", + pmap, num_compressed, pmap->stats.compressed)); + + if (num_external) { OSAddAtomic(-num_external, &pmap->stats.external); - assert(pmap->stats.internal >= num_internal); - if (num_internal) + } + if (num_internal) { OSAddAtomic(-num_internal, &pmap->stats.internal); - assert(pmap->stats.reusable >= num_reusable); + pmap_ledger_debit(pmap, + task_ledgers.internal, + machine_ptob(num_internal)); + } + if (num_alt_internal) { + pmap_ledger_debit(pmap, + task_ledgers.alternate_accounting, + machine_ptob(num_alt_internal)); + } + if (num_alt_compressed) { + pmap_ledger_debit(pmap, + task_ledgers.alternate_accounting_compressed, + machine_ptob(num_alt_compressed)); + } if (num_reusable) OSAddAtomic(-num_reusable, &pmap->stats.reusable); - assert(pmap->stats.compressed >= num_compressed); - if (num_compressed) + if (num_compressed) { OSAddAtomic64(-num_compressed, &pmap->stats.compressed); + pmap_ledger_debit(pmap, + task_ledgers.internal_compressed, + machine_ptob(num_compressed)); + } + pmap_ledger_debit(pmap, + task_ledgers.phys_footprint, + machine_ptob((num_internal - + num_alt_internal) + + (num_compressed - + num_alt_compressed))); } #if TESTING if (pmap->stats.wired_count < num_unwired) panic("pmap_remove_range: wired_count"); #endif - assert(pmap->stats.wired_count >= num_unwired); + PMAP_STATS_ASSERTF((pmap->stats.wired_count >= num_unwired, + "pmap=%p num_unwired=%d stats.wired_count=%d", + pmap, num_unwired, pmap->stats.wired_count)); OSAddAtomic(-num_unwired, &pmap->stats.wired_count); pmap_ledger_debit(pmap, task_ledgers.wired_mem, machine_ptob(num_unwired)); @@ -1466,7 +1568,7 @@ pmap_page_protect_options( pmap = pv_e->pmap; is_ept = is_ept_pmap(pmap); - vaddr = pv_e->va; + vaddr = PVE_VA(pv_e); pte = pmap_pte(pmap, vaddr); pmap_assert2((pa_index(pte_to_pa(*pte)) == pn), @@ -1493,8 +1595,12 @@ pmap_page_protect_options( if (pmap != kernel_pmap && (options & PMAP_OPTIONS_COMPRESSOR) 
&& IS_INTERNAL_PAGE(pai)) { - /* mark this PTE as having been "reclaimed" */ + assert(!PTE_IS_COMPRESSED(*pte)); + /* mark this PTE as having been "compressed" */ new_pte_value = PTE_COMPRESSED; + if (IS_ALTACCT_PAGE(pai)) { + new_pte_value |= PTE_COMPRESSED_ALT; + } } else { new_pte_value = 0; } @@ -1513,37 +1619,35 @@ pmap_page_protect_options( pmap_update_pte(pte, PTE_VALID_MASK(is_ept), 0); PMAP_UPDATE_TLBS(pmap, vaddr, vaddr+PAGE_SIZE); + if (!is_ept) { + pmap_phys_attributes[pai] |= + *pte & (PHYS_MODIFIED|PHYS_REFERENCED); + } else { + pmap_phys_attributes[pai] |= + ept_refmod_to_physmap((*pte & (INTEL_EPT_REF | INTEL_EPT_MOD))) & (PHYS_MODIFIED | PHYS_REFERENCED); + } if ((options & PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED) && - ! (pmap_phys_attributes[pai] & - PHYS_MODIFIED) && - (*pte & PHYS_MODIFIED)) { + IS_INTERNAL_PAGE(pai) && + (pmap_phys_attributes[pai] & + PHYS_MODIFIED)) { /* * Page is actually "modified" and * will be compressed. Start * accounting for it as "compressed". 
*/ + assert(!(options & PMAP_OPTIONS_COMPRESSOR)); options &= ~PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED; options |= PMAP_OPTIONS_COMPRESSOR; + assert(new_pte_value == 0); new_pte_value = PTE_COMPRESSED; - } - if (!is_ept) { - pmap_phys_attributes[pai] |= - *pte & (PHYS_MODIFIED|PHYS_REFERENCED); - } else { - pmap_phys_attributes[pai] |= - ept_refmod_to_physmap((*pte & (INTEL_EPT_REF | INTEL_EPT_MOD))) & (PHYS_MODIFIED | PHYS_REFERENCED); + if (IS_ALTACCT_PAGE(pai)) { + new_pte_value |= PTE_COMPRESSED_ALT; + } } pmap_store_pte(pte, new_pte_value); } - if (new_pte_value == PTE_COMPRESSED) { - /* one more "compressed" page */ - OSAddAtomic64(+1, &pmap->stats.compressed); - PMAP_STATS_PEAK(pmap->stats.compressed); - pmap->stats.compressed_lifetime++; - } - #if TESTING if (pmap->stats.resident_count < 1) panic("pmap_page_protect: resident_count"); @@ -1551,16 +1655,13 @@ pmap_page_protect_options( pmap_ledger_debit(pmap, task_ledgers.phys_mem, PAGE_SIZE); assert(pmap->stats.resident_count >= 1); OSAddAtomic(-1, &pmap->stats.resident_count); + + /* + * We only ever compress internal pages. + */ if (options & PMAP_OPTIONS_COMPRESSOR) { - /* - * This removal is only being done so we can send this page to - * the compressor; therefore it mustn't affect total task footprint. 
- */ - pmap_ledger_credit(pmap, task_ledgers.internal_compressed, PAGE_SIZE); - } else { - pmap_ledger_debit(pmap, task_ledgers.phys_footprint, PAGE_SIZE); + assert(IS_INTERNAL_PAGE(pai)); } - if (pmap != kernel_pmap) { if (IS_REUSABLE_PAGE(pai)) { assert(pmap->stats.reusable > 0); @@ -1572,6 +1673,72 @@ pmap_page_protect_options( assert(pmap->stats.external > 0); OSAddAtomic(-1, &pmap->stats.external); } + if ((options & PMAP_OPTIONS_COMPRESSOR) && + IS_INTERNAL_PAGE(pai)) { + /* adjust "compressed" stats */ + OSAddAtomic64(+1, &pmap->stats.compressed); + PMAP_STATS_PEAK(pmap->stats.compressed); + pmap->stats.compressed_lifetime++; + } + if (IS_REUSABLE_PAGE(pai)) { + assert(!IS_ALTACCT_PAGE(pai)); + if (options & PMAP_OPTIONS_COMPRESSOR) { + pmap_ledger_credit(pmap, task_ledgers.internal_compressed, PAGE_SIZE); + /* was not in footprint, but is now */ + pmap_ledger_credit(pmap, task_ledgers.phys_footprint, PAGE_SIZE); + } + } else if (IS_INTERNAL_PAGE(pai)) { + pmap_ledger_debit(pmap, task_ledgers.internal, PAGE_SIZE); + /* + * Update all stats related to physical + * footprint, which only deals with + * internal pages. + */ + if (options & PMAP_OPTIONS_COMPRESSOR) { + /* + * This removal is only being + * done so we can send this page + * to the compressor; therefore + * it mustn't affect total task + * footprint. + */ + if (IS_ALTACCT_PAGE(pai)) { + /* + * We've already debited + * internal, above. + * Debit + * alternate_accounting + * here, which means the + * net change on + * phys_footprint is 0. + */ + pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, PAGE_SIZE); + pmap_ledger_credit(pmap, task_ledgers.alternate_accounting_compressed, PAGE_SIZE); + } + pmap_ledger_credit(pmap, task_ledgers.internal_compressed, PAGE_SIZE); + } else { + /* + * This internal page isn't + * going to the compressor, + * so adjust stats to keep + * phys_footprint up to date. + */ + if (IS_ALTACCT_PAGE(pai)) { + /* + * We've already debited + * internal, above. 
+ * Debit + * alternate_accounting + * here, which means + * the net change on + * phys_footprint is 0. + */ + pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, PAGE_SIZE); + } else { + pmap_ledger_debit(pmap, task_ledgers.phys_footprint, PAGE_SIZE); + } + } + } } /* @@ -1624,9 +1791,12 @@ pmap_page_protect_options( pvh_e = (pv_hashed_entry_t) queue_next(&pv_h->qlink); if (pvh_e != (pv_hashed_entry_t) pv_h) { + vm_map_offset_t pve_flags; + pv_hash_remove(pvh_e); pv_h->pmap = pvh_e->pmap; - pv_h->va = pvh_e->va; + pve_flags = pv_h->va_and_flags & PAGE_MASK; + pv_h->va_and_flags = PVE_VA(pvh_e) | pve_flags; pvh_e->qlink.next = (queue_entry_t) pvh_eh; pvh_eh = pvh_e; @@ -1722,7 +1892,7 @@ phys_attribute_clear( pmap = pv_e->pmap; is_ept = is_ept_pmap(pmap); - va = pv_e->va; + va = PVE_VA(pv_e); pte_bits = 0; if (bits) { @@ -1786,10 +1956,23 @@ phys_attribute_clear( /* one more "internal" */ OSAddAtomic(+1, &pmap->stats.internal); PMAP_STATS_PEAK(pmap->stats.internal); + assert(pmap->stats.internal > 0); + pmap_ledger_credit(pmap, + task_ledgers.internal, + PAGE_SIZE); + if (IS_ALTACCT_PAGE(pai)) { + /* no impact on footprint */ + } else { + pmap_ledger_credit( + pmap, + task_ledgers.phys_footprint, + PAGE_SIZE); + } } else { /* one more "external" */ OSAddAtomic(+1, &pmap->stats.external); PMAP_STATS_PEAK(pmap->stats.external); + assert(pmap->stats.external > 0); } } else if ((options & PMAP_OPTIONS_SET_REUSABLE) && !is_reusable && @@ -1797,10 +1980,22 @@ phys_attribute_clear( /* one more "reusable" */ OSAddAtomic(+1, &pmap->stats.reusable); PMAP_STATS_PEAK(pmap->stats.reusable); + assert(pmap->stats.reusable > 0); if (is_internal) { /* one less "internal" */ assert(pmap->stats.internal > 0); OSAddAtomic(-1, &pmap->stats.internal); + pmap_ledger_debit(pmap, + task_ledgers.internal, + PAGE_SIZE); + if (IS_ALTACCT_PAGE(pai)) { + /* no impact on footprint */ + } else { + pmap_ledger_debit( + pmap, + task_ledgers.phys_footprint, + PAGE_SIZE); + } } else { /* one 
less "external" */ assert(pmap->stats.external > 0); @@ -1905,7 +2100,7 @@ phys_attribute_test( pmap = pv_e->pmap; is_ept = is_ept_pmap(pmap); - va = pv_e->va; + va = PVE_VA(pv_e); /* * pick up modify and/or reference bits from mapping */ @@ -2109,12 +2304,80 @@ pmap_query_resident( return resident_bytes; } -#if MACH_ASSERT +kern_return_t +pmap_query_page_info( + pmap_t pmap, + vm_map_offset_t va, + int *disp_p) +{ + int disp; + boolean_t is_ept; + pmap_paddr_t pa; + ppnum_t pai; + pd_entry_t *pde; + pt_entry_t *pte; + + pmap_intr_assert(); + if (pmap == PMAP_NULL || pmap == kernel_pmap) { + *disp_p = 0; + return KERN_INVALID_ARGUMENT; + } + + disp = 0; + is_ept = is_ept_pmap(pmap); + + PMAP_LOCK(pmap); + + pde = pmap_pde(pmap, va); + if (!pde || + !(*pde & PTE_VALID_MASK(is_ept)) || + (*pde & PTE_PS)) { + goto done; + } + + pte = pmap_pte(pmap, va); + if (pte == PT_ENTRY_NULL) { + goto done; + } + + pa = pte_to_pa(*pte); + if (pa == 0) { + if (PTE_IS_COMPRESSED(*pte)) { + disp |= PMAP_QUERY_PAGE_COMPRESSED; + if (*pte & PTE_COMPRESSED_ALT) { + disp |= PMAP_QUERY_PAGE_COMPRESSED_ALTACCT; + } + } + } else { + disp |= PMAP_QUERY_PAGE_PRESENT; + pai = pa_index(pa); + if (!IS_MANAGED_PAGE(pai)) { + } else if (IS_REUSABLE_PAGE(pai)) { + disp |= PMAP_QUERY_PAGE_REUSABLE; + } else if (IS_INTERNAL_PAGE(pai)) { + disp |= PMAP_QUERY_PAGE_INTERNAL; + if (IS_ALTACCT_PAGE(pai)) { + disp |= PMAP_QUERY_PAGE_ALTACCT; + } + } + } + +done: + PMAP_UNLOCK(pmap); + *disp_p = disp; + return KERN_SUCCESS; +} + +#if DEBUG || DEVELOPMENT +void +kernel_pmap_lock(void) +{ + PMAP_LOCK(kernel_pmap); +} + void -pmap_set_process( - __unused pmap_t pmap, - __unused int pid, - __unused char *procname) +kernel_pmap_unlock(void) { + PMAP_UNLOCK(kernel_pmap); } -#endif /* MACH_ASSERT */ +#endif /* DEBUG || DEVELOPMENT */ diff --git a/osfmk/i386/rtclock.c b/osfmk/i386/rtclock.c index 7cfbf2631..6ed44cc73 100644 --- a/osfmk/i386/rtclock.c +++ b/osfmk/i386/rtclock.c @@ -145,7 +145,7 @@ 
_rtc_nanotime_init(pal_rtc_nanotime_t *rntp, uint64_t base) _pal_rtc_nanotime_store(tsc, base, rntp->scale, rntp->shift, rntp); } -static void +void rtc_nanotime_init(uint64_t base) { _rtc_nanotime_init(&pal_rtc_nanotime_info, base); @@ -265,6 +265,10 @@ rtc_sleep_wakeup( rtc_nanotime_init(base); } +void +rtc_decrementer_configure(void) { + rtc_timer->rtc_config(); +} /* * rtclock_early_init() is called very early at boot to * establish mach_absolute_time() and set it to zero. diff --git a/osfmk/i386/rtclock_protos.h b/osfmk/i386/rtclock_protos.h index 469a04cf5..b2c1bd529 100644 --- a/osfmk/i386/rtclock_protos.h +++ b/osfmk/i386/rtclock_protos.h @@ -69,5 +69,6 @@ extern rtc_timer_t *rtc_timer; extern void rtc_timer_init(void); extern void rtclock_early_init(void); - +extern void rtc_nanotime_init(uint64_t); +extern void rtc_decrementer_configure(void); #endif /* _I386_RTCLOCK_PROTOS_H_ */ diff --git a/osfmk/i386/simple_lock.h b/osfmk/i386/simple_lock.h index ce1708ab9..31032681d 100644 --- a/osfmk/i386/simple_lock.h +++ b/osfmk/i386/simple_lock.h @@ -73,8 +73,9 @@ #include #include -extern unsigned int LockTimeOutTSC; /* Lock timeout in TSC ticks */ -extern unsigned int LockTimeOut; /* Lock timeout in absolute time */ +extern uint64_t LockTimeOutTSC; /* Lock timeout in TSC ticks */ +extern uint32_t LockTimeOutUsec;/* Lock timeout in microseconds */ +extern uint64_t LockTimeOut; /* Lock timeout in absolute time */ #if MACH_LDEBUG #define USLOCK_DEBUG 1 diff --git a/osfmk/i386/thread.h b/osfmk/i386/thread.h index acefe5531..1fea8c2d8 100644 --- a/osfmk/i386/thread.h +++ b/osfmk/i386/thread.h @@ -128,6 +128,12 @@ struct machine_thread { struct pal_pcb pal_pcb; uint32_t specFlags; + /* N.B.: These "specFlags" are read-modify-written non-atomically within + * the copyio routine. So conceivably any exception that modifies the + * flags in a persistent manner could be clobbered if it occurs within + * a copyio context. 
For now, the only other flag here is OnProc which + * is not modified except at context switch. + */ #define OnProc 0x1 #define CopyIOActive 0x2 /* Checked to ensure DTrace actions do not re-enter copyio(). */ uint64_t thread_gpu_ns; diff --git a/osfmk/i386/trap.c b/osfmk/i386/trap.c index 592f75895..ace505bfa 100644 --- a/osfmk/i386/trap.c +++ b/osfmk/i386/trap.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2012 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -131,6 +131,7 @@ extern boolean_t dtrace_tally_fault(user_addr_t); extern boolean_t pmap_smep_enabled; extern boolean_t pmap_smap_enabled; +__attribute__((noreturn)) void thread_syscall_return( kern_return_t ret) @@ -512,7 +513,7 @@ kernel_trap( #if NCOPY_WINDOWS > 0 int fault_in_copy_window = -1; #endif - int is_user = 0; + int is_user; int trap_pl = get_preemption_level(); thread = current_thread(); @@ -532,6 +533,8 @@ kernel_trap( myast = ast_pending(); + is_user = (vaddr < VM_MAX_USER_PAGE_ADDRESS); + perfASTCallback astfn = perfASTHook; if (__improbable(astfn != NULL)) { if (*myast & AST_CHUD_ALL) @@ -567,7 +570,14 @@ kernel_trap( 0, 0, 0, VM_KERNEL_UNSLIDE(kern_ip), 0); return; } - + + user_addr_t kd_vaddr = is_user ? vaddr : VM_KERNEL_UNSLIDE(vaddr); + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + (MACHDBG_CODE(DBG_MACH_EXCP_KTRAP_x86, type)) | DBG_FUNC_NONE, + (unsigned)(kd_vaddr >> 32), (unsigned)kd_vaddr, is_user, + VM_KERNEL_UNSLIDE(kern_ip), 0); + + if (T_PAGE_FAULT == type) { /* * assume we're faulting in the kernel map @@ -602,13 +612,11 @@ kernel_trap( map = thread->map; fault_in_copy_window = window_index; } - is_user = -1; } #else if (__probable(vaddr < VM_MAX_USER_PAGE_ADDRESS)) { /* fault occurred in userspace */ map = thread->map; - is_user = -1; /* Intercept a potential Supervisor Mode Execute * Protection fault. 
These criteria identify @@ -617,7 +625,8 @@ kernel_trap( * (The VM could just redrive a SMEP fault, hence * the intercept). */ - if (__improbable((code == (T_PF_PROT | T_PF_EXECUTE)) && (pmap_smep_enabled) && (saved_state->isf.rip == vaddr))) { + if (__improbable((code == (T_PF_PROT | T_PF_EXECUTE)) && + (pmap_smep_enabled) && (saved_state->isf.rip == vaddr))) { goto debugger_entry; } @@ -644,17 +653,14 @@ kernel_trap( set_cr3_raw(map->pmap->pm_cr3); return; } - + if (__improbable(vaddr < PAGE_SIZE) && + ((thread->machine.specFlags & CopyIOActive) == 0)) { + goto debugger_entry; + } } #endif } } - user_addr_t kd_vaddr = is_user ? vaddr : VM_KERNEL_UNSLIDE(vaddr); - KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, - (MACHDBG_CODE(DBG_MACH_EXCP_KTRAP_x86, type)) | DBG_FUNC_NONE, - (unsigned)(kd_vaddr >> 32), (unsigned)kd_vaddr, is_user, - VM_KERNEL_UNSLIDE(kern_ip), 0); - (void) ml_set_interrupts_enabled(intr); @@ -714,8 +720,7 @@ kernel_trap( prot |= VM_PROT_EXECUTE; result = vm_fault(map, - vm_map_trunc_page(vaddr, - PAGE_MASK), + vaddr, prot, FALSE, THREAD_UNINT, NULL, 0); @@ -781,10 +786,8 @@ kernel_trap( */ sync_iss_to_iks(state); #if MACH_KDP - if (current_debugger != KDB_CUR_DB) { - if (kdp_i386_trap(type, saved_state, result, (vm_offset_t)vaddr)) - return; - } + if (kdp_i386_trap(type, saved_state, result, (vm_offset_t)vaddr)) + return; #endif } pal_cli(); @@ -801,9 +804,6 @@ set_recovery_ip(x86_saved_state64_t *saved_state, vm_offset_t ip) saved_state->isf.rip = ip; } - - - static void panic_trap(x86_saved_state64_t *regs, uint32_t pl) { @@ -1090,8 +1090,7 @@ user_trap( if (__improbable(err & T_PF_EXECUTE)) prot |= VM_PROT_EXECUTE; kret = vm_fault(thread->map, - vm_map_trunc_page(vaddr, - PAGE_MASK), + vaddr, prot, FALSE, THREAD_ABORTSAFE, NULL, 0); diff --git a/osfmk/i386/tsc.c b/osfmk/i386/tsc.c index 01452703c..e8de697d9 100644 --- a/osfmk/i386/tsc.c +++ b/osfmk/i386/tsc.c @@ -222,7 +222,6 @@ tsc_init(void) break; } - case CPUFAMILY_INTEL_MEROM: case 
CPUFAMILY_INTEL_PENRYN: { uint64_t prfsts; diff --git a/osfmk/ipc/Makefile b/osfmk/ipc/Makefile index 8aa306ba6..475de2958 100644 --- a/osfmk/ipc/Makefile +++ b/osfmk/ipc/Makefile @@ -3,7 +3,6 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - include $(MakeInc_cmd) include $(MakeInc_def) @@ -22,10 +21,9 @@ INSTALL_MI_DIR = ipc EXPORT_MI_LIST = ${DATAFILES} ${EXPORT_ONLY_FILES} -INSTALL_KF_MI_LCL_LIST = ${EXPORT_ONLY_FILES} ${EXPORT_PRIVATE_FILES} +INSTALL_KF_MI_LCL_LIST = ${EXPORT_ONLY_FILES} ${EXPORT_PRIVATE_FILES} EXPORT_MI_DIR = ipc include $(MakeInc_rule) include $(MakeInc_dir) - diff --git a/osfmk/ipc/flipc.c b/osfmk/ipc/flipc.c new file mode 100644 index 000000000..ff0143cd0 --- /dev/null +++ b/osfmk/ipc/flipc.c @@ -0,0 +1,658 @@ +/* + * Copyright (c) 2015-2016 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* File: ipc/flipc.h + * Author: Dean Reece + * Date: 2016 + * + * Implementation of fast local ipc (flipc). + */ + + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#pragma pack(4) + + +/*** FLIPC Internal Implementation (private to flipc.c) ***/ + + +zone_t flipc_port_zone; + + +/* Get the mnl_name associated with local ipc_port . + * Returns MNL_NAME_NULL if is invalid or not a flipc port. + */ +static inline mnl_name_t mnl_name_from_port(ipc_port_t lport) +{ + mnl_name_t name = MNL_NAME_NULL; + + if (IP_VALID(lport)) { + flipc_port_t fport = lport->ip_messages.data.port.fport; + if (FPORT_VALID(fport)) + name = fport->obj.name; + } + return name; +} + + +/* Lookup the ipc_port associated with mnl_name . + * Returns IP_NULL if is invalid or not a known mnl object. + */ +static inline ipc_port_t mnl_name_to_port(mnl_name_t name) +{ + ipc_port_t lport = IP_NULL; + + if (MNL_NAME_VALID(name)) { + flipc_port_t fport = (flipc_port_t)mnl_obj_lookup(name); + if (FPORT_VALID(fport)) + lport = fport->lport; + } + return lport; +} + + +/* flipc_port_create() is called to convert a regular mach port into a + * flipc port (i.e., the port has one or more rights off-node). + * must be locked on entry and is not unlocked on return. 
+ */ +static kern_return_t +flipc_port_create(ipc_port_t lport, mach_node_t node, mnl_name_t name) +{ + /* Ensure parameters are valid and not already linked */ + assert(IP_VALID(lport)); + assert(MACH_NODE_VALID(node)); + assert(MNL_NAME_VALID(name)); + assert(!FPORT_VALID(lport->ip_messages.imq_fport)); + + /* Allocate and initialize a flipc port */ + flipc_port_t fport = (flipc_port_t) zalloc(flipc_port_zone); + if (!FPORT_VALID(fport)) + return KERN_RESOURCE_SHORTAGE; + bzero(fport, sizeof(struct flipc_port)); + fport->obj.name = name; + fport->hostnode = node; + if (node == localnode) + fport->state = FPORT_STATE_PRINCIPAL; + else + fport->state = FPORT_STATE_PROXY; + + /* Link co-structures (lport is locked) */ + fport->lport = lport; + lport->ip_messages.imq_fport = fport; + + /* Add fport to the name hash table; revert link if insert fails */ + kern_return_t kr = mnl_obj_insert((mnl_obj_t)fport); + if (kr != KERN_SUCCESS) { + lport->ip_messages.imq_fport = FPORT_NULL; + fport->lport = IP_NULL; + zfree(flipc_port_zone, fport); + } + + return kr; +} + + +/* flipc_port_destroy() is called to convert a flipc port back to a + * local-only ipc port (i.e., the port has no remaining off-node rights). + * This will dispose of any undelivered flipc messages, generating NAKs if + * needed. must be locked on entry and is not unlocked on return. 
+ */ +static void +flipc_port_destroy(ipc_port_t lport) +{ + /* Ensure parameter is valid, and linked to an fport with a valid name */ + assert(IP_VALID(lport)); + ipc_mqueue_t port_mq = &lport->ip_messages; + flipc_port_t fport = port_mq->data.port.fport; + assert(FPORT_VALID(fport)); + assert(MNL_NAME_VALID(fport->obj.name)); + + /* Dispose of any undelivered messages */ + int m = port_mq->data.port.msgcount; + if (m > 0) { + ipc_kmsg_t kmsg; +#ifdef DEBUG + printf("flipc: destroying %p with %d undelivered msgs\n", lport, m); +#endif + + /* Logic was lifted from ipc_mqueue_select_on_thread() */ + while (m--) { + kmsg = ipc_kmsg_queue_first(&port_mq->imq_messages); + assert(kmsg != IKM_NULL); + ipc_kmsg_rmqueue(&port_mq->imq_messages, kmsg); + if (fport->state == FPORT_STATE_PRINCIPAL) + flipc_msg_ack(kmsg->ikm_node, port_mq, FALSE); + ipc_mqueue_release_msgcount(port_mq, NULL); + port_mq->imq_seqno++; + } + } + + /* Remove from name hash table, unlink co-structures, and free fport */ + mnl_obj_remove(fport->obj.name); + lport->ip_messages.data.port.fport = FPORT_NULL; + fport->lport = IP_NULL; + zfree(flipc_port_zone, fport); +} + + +/* + * Routine: flipc_msg_size_from_kmsg(ipc_kmsg_t kmsg) + * Purpose: + * Compute the size of the buffer needed to hold the translated flipc + * message. All identifiers are converted to flipc_names which are 64b. + * If this node's pointers are a different size, we have to allow for + * expansion of the descriptors as appropriate. + * Conditions: + * Nothing locked. + * Returns: + * size of the message as it would be sent over the flipc link. + */ +static mach_msg_size_t flipc_msg_size_from_kmsg(ipc_kmsg_t kmsg) +{ + mach_msg_size_t fsize = kmsg->ikm_header->msgh_size; + + if (kmsg->ikm_header->msgh_bits & MACH_MSGH_BITS_COMPLEX) + PE_enter_debugger("flipc_msg_size_from_kmsg(): Complex messages not supported."); + + return fsize; +} + + +/* Translate a kmsg into a flipc msg suitable to transmit over the mach node + * link. 
All in-line rights and objects are similarly processed. If the msg + * moves a receive right, then queued messages may need to be moved as a + * result, causing this function to ultimately be recursive. Returns KERN_SUCCESS with the new message stored in *fmsgp, KERN_FAILURE for complex messages (not yet supported), or KERN_RESOURCE_SHORTAGE if no flipc message could be allocated. + */ +static kern_return_t mnl_msg_from_kmsg(ipc_kmsg_t kmsg, mnl_msg_t *fmsgp) +{ + if (kmsg->ikm_header->msgh_bits & MACH_MSGH_BITS_COMPLEX) { + printf("mnl_msg_from_kmsg(): Complex messages not supported.\n"); + return KERN_FAILURE; + } + + mach_msg_size_t fsize = flipc_msg_size_from_kmsg(kmsg); + + mnl_msg_t fmsg = mnl_msg_alloc(fsize, 0); + + if (fmsg == MNL_MSG_NULL) + return KERN_RESOURCE_SHORTAGE; + + /* Setup flipc message header */ + fmsg->sub = MACH_NODE_SUB_FLIPC; + fmsg->cmd = FLIPC_CMD_IPCMESSAGE; + fmsg->node_id = localnode_id; // Message is from us + fmsg->qos = 0; // not used + fmsg->size = fsize; // Payload size (does NOT include mnl_msg header) + fmsg->object = kmsg->ikm_header->msgh_remote_port->ip_messages.data.port.fport->obj.name; + + /* Copy body of message */ + bcopy((const void*)kmsg->ikm_header, (void*)MNL_MSG_PAYLOAD(fmsg), fsize); + + // Convert port fields + mach_msg_header_t *mmsg = (mach_msg_header_t*)MNL_MSG_PAYLOAD(fmsg); + mmsg->msgh_remote_port = (mach_port_t)fmsg->object; + mmsg->msgh_local_port = (mach_port_t) + mnl_name_from_port(mmsg->msgh_local_port); + mmsg->msgh_voucher_port = (mach_port_name_t)MNL_NAME_NULL; + + *fmsgp = (mnl_msg_t)fmsg; + + return KERN_SUCCESS; +} + + +/* lifted from ipc_mig.c:mach_msg_send_from_kernel_proper() */ +static mach_msg_return_t +mach_msg_send_from_remote_kernel(mach_msg_header_t *msg, + mach_msg_size_t send_size, + mach_node_t node) +{ + ipc_kmsg_t kmsg; + mach_msg_return_t mr; + + mr = ipc_kmsg_get_from_kernel(msg, send_size, &kmsg); + if (mr != MACH_MSG_SUCCESS) + return mr; + + mr = ipc_kmsg_copyin_from_kernel(kmsg); + if (mr != 
MACH_MSG_SUCCESS) { + ipc_kmsg_free(kmsg); + return mr; + } + + kmsg->ikm_node = node; // node that needs to receive message ack + mr = ipc_kmsg_send(kmsg, +
MACH_SEND_KERNEL_DEFAULT, + MACH_MSG_TIMEOUT_NONE); + if (mr != MACH_MSG_SUCCESS) { + ipc_kmsg_destroy(kmsg); + } + + return mr; +} + + +/* Translate a flipc msg into a kmsg and post it to the appropriate + * port. node is the node that originated the message, not necessarily the + * node we received it from. This will block if the receiving port is full. + */ +static mach_msg_return_t +flipc_cmd_ipc(mnl_msg_t fmsg, + mach_node_t node, + uint32_t flags __unused) +{ + mach_msg_header_t *mmsg; + + // Convert flipc message into mach message in place to avoid alloc/copy + mmsg = (mach_msg_header_t*)MNL_MSG_PAYLOAD(fmsg); + mmsg->msgh_size = fmsg->size; + mmsg->msgh_remote_port = mnl_name_to_port(fmsg->object); + mmsg->msgh_local_port = mnl_name_to_port((mnl_name_t)mmsg->msgh_local_port); + mmsg->msgh_voucher_port = (mach_port_name_t)MACH_PORT_NULL; + mmsg->msgh_bits = MACH_MSGH_BITS(MACH_MSG_TYPE_COPY_SEND, 0); + // unchanged: msgh_id + + return mach_msg_send_from_remote_kernel(mmsg, fmsg->size, node); +} + + +/* Called when an ACKMESSAGE packet is received. fmsg->mnl.object indicates + * the flipc name of the port holding the messages to be acknowledged. + * fmsg->msg_count indicates the number of messages being acked for this node:port. NOTE(review): the mnl_obj_lookup() result is dereferenced without a validity check -- confirm an unknown or stale name cannot reach here. + */ +static void +flipc_cmd_ack(flipc_ack_msg_t fmsg, + mach_node_t node __unused, + uint32_t flags __unused) +{ + unsigned int msg_count = fmsg->msg_count; + thread_t thread = current_thread(); + boolean_t kick = FALSE; + + flipc_port_t fport = (flipc_port_t)mnl_obj_lookup(fmsg->mnl.object); + + ipc_port_t lport = fport->lport; + ip_lock(lport); + + ipc_mqueue_t lport_mq = &lport->ip_messages; + spl_t s = splsched(); + imq_lock(lport_mq); + + assert(fport->peek_count >= msg_count); // Can't ack what we haven't peeked!
+ + while (msg_count--) { + ipc_mqueue_select_on_thread(lport_mq, IMQ_NULL, 0, 0, thread); + fport->peek_count--; + kick |= ipc_kmsg_delayed_destroy(thread->ith_kmsg); + } + + imq_unlock(lport_mq); + splx(s); + ip_unlock(lport); + + if (kick) + ipc_kmsg_reap_delayed(); +} + + + +/*** FLIPC Node Management Functions (called by mach node layer) ***/ + + +/* The mach node layer calls flipc_init() once before it calls any other + * flipc entry points. Returns KERN_SUCCESS on success; otherwise flipc + * is not initialized and cannot be used. + */ +kern_return_t +flipc_init(void) +{ + /* Create zone for flipc ports. + * TODO: Pick a better max value than ipc_port_max>>4. NOTE(review): the zinit() result is not checked, so this function currently always returns KERN_SUCCESS. + */ + flipc_port_zone = zinit(sizeof(struct flipc_port), + (ipc_port_max>>4) * sizeof(struct flipc_port), + sizeof(struct flipc_port), + "flipc ports"); + + zone_change(flipc_port_zone, Z_CALLERACCT, FALSE); + zone_change(flipc_port_zone, Z_NOENCRYPT, TRUE); + return KERN_SUCCESS; +} + + +/* flipc_node_prepare() is called by mach node layer when a remote node is + * registered by a link driver, or when the bootstrap port changes for the + * local node. This is the flipc layer's opportunity to initialize per-node + * flipc state, and to convert the node's bootstrap port into a flipc port. + * Note that the node is not yet in the mach node table. + * Returns KERN_SUCCESS on success; otherwise node is not prepared. 
+ */ +kern_return_t +flipc_node_prepare(mach_node_t node) +{ + kern_return_t kr; + + assert(MACH_NODE_VALID(node)); + ipc_port_t bs_port = node->bootstrap_port; + assert(IP_VALID(bs_port)); + + ip_lock(bs_port); + + kr = flipc_port_create(bs_port, + node, + MNL_NAME_BOOTSTRAP(node->info.node_id)); + ip_unlock(bs_port); + + return kr; +} + + +/* flipc_node_retire() is called by mach node layer when a remote node is + * terminated by a link driver, or when the local node's bootstrap port + * becomes invalid.
This is the flipc layer's opportunity to free per-node + * flipc state, and to revert the node's bootstrap port to a local ipc port. + * node must be locked by the caller. + * Returns KERN_SUCCESS on success. + */ +kern_return_t +flipc_node_retire(mach_node_t node) +{ + if (!MACH_NODE_VALID(node)) + return KERN_NODE_DOWN; + + ipc_port_t bs_port = node->bootstrap_port; + if (IP_VALID(bs_port)) { + ip_lock(bs_port); + flipc_port_destroy(bs_port); + ip_unlock(bs_port); + } + + return KERN_SUCCESS; +} + + +/*** FLIPC Message Functions (called by mach node layer) ***/ + + +/* The node layer calls flipc_msg_to_remote_node() to fetch the next message + * for to_node. This function will block until a message is available or the + * node is terminated, in which case it returns MNL_MSG_NULL. + */ +mnl_msg_t +flipc_msg_to_remote_node(mach_node_t to_node, + uint32_t flags __unused) +{ + mach_port_seqno_t msgoff; + ipc_kmsg_t kmsg = IKM_NULL; + mnl_msg_t fmsg = MNL_MSG_NULL; + + assert(to_node != localnode); + assert(get_preemption_level()==0); + + ipc_mqueue_t portset_mq = &to_node->proxy_port_set->ips_messages; + ipc_mqueue_t port_mq = IMQ_NULL; + + while (!to_node->dead) { + /* Fetch next message from proxy port */ + ipc_mqueue_receive(portset_mq, MACH_PEEK_MSG, 0, 0, THREAD_ABORTSAFE); + + thread_t thread = current_thread(); + if (thread->ith_state == MACH_PEEK_READY) { + port_mq = thread->ith_peekq; + thread->ith_peekq = IMQ_NULL; + } else { + panic("Unexpected thread state %d after ipc_mqueue_receive()", + thread->ith_state); + } + + assert(get_preemption_level()==0); + imq_lock(port_mq); + + flipc_port_t fport = port_mq->data.port.fport; + + if (FPORT_VALID(fport)) { + msgoff = port_mq->data.port.fport->peek_count; + + ipc_mqueue_peek_locked(port_mq, &msgoff, NULL, NULL, NULL, &kmsg); + if (kmsg != IKM_NULL) + port_mq->data.port.fport->peek_count++; + + /* Clean up outstanding prepost on port_mq. + * This also unlocks port_mq and restores spl.
+ */ + spl_t spl = splsched(); + ipc_mqueue_release_peek_ref(port_mq, &spl); + assert(get_preemption_level()==0); + + /* DANGER: The code below must be allowed to allocate so it can't + * run under the protection of the imq_lock, but that leaves mqueue + * open for business for a small window before we examine kmsg. + * This SHOULD be OK, since we are the only thread looking. NOTE(review): the return value of mnl_msg_from_kmsg() is ignored; if it fails fmsg stays invalid and the loop iterates again even though peek_count was already incremented -- confirm this is intended. + */ + if (kmsg != IKM_NULL) + mnl_msg_from_kmsg(kmsg, (mnl_msg_t*)&fmsg); + } else { + /* Must be from the control_port, which is not a flipc port */ + assert(!FPORT_VALID(port_mq->data.port.fport)); + + /* This is a simplified copy of ipc_mqueue_select_on_thread() */ + kmsg = ipc_kmsg_queue_first(&port_mq->imq_messages); + assert(kmsg != IKM_NULL); + ipc_kmsg_rmqueue(&port_mq->imq_messages, kmsg); + ipc_mqueue_release_msgcount(port_mq, portset_mq); + imq_unlock(port_mq); + current_task()->messages_received++; + ip_release(to_node->control_port); // Should derive ref from port_mq + + /* We just pass the kmsg payload as the fmsg. + * flipc_msg_free() will notice and free the kmsg properly. + */ + mach_msg_header_t *hdr = kmsg->ikm_header; + fmsg = (mnl_msg_t)(&hdr[1]); + /* Stash kmsg pointer just before fmsg */ + *(ipc_kmsg_t*)((vm_offset_t)fmsg-sizeof(vm_offset_t)) = kmsg; + } + + if (MNL_MSG_VALID(fmsg)) + break; + } + assert(MNL_MSG_VALID(fmsg)); /* NOTE(review): fmsg is still MNL_MSG_NULL here when the loop ends because to_node->dead is set, so this assert conflicts with the documented MNL_MSG_NULL return -- confirm */ + return fmsg; +} + + +/* The mach node layer calls this to deliver an incoming message. It is the + * responsibility of the caller to release the received message buffer after + * return. 
+ */ +void +flipc_msg_from_node(mach_node_t from_node __unused, + mnl_msg_t msg, + uint32_t flags) +{ + /* Note that if flipc message forwarding is supported, the from_node arg + * may not match msg->node_id. The former is the node from which we + * received the message; the latter is the node that originated the + * message. We use the originating node, which is where the ack goes.
+ */ + assert(msg->sub == MACH_NODE_SUB_FLIPC); + mach_node_t node = mach_node_for_id_locked(msg->node_id, FALSE, FALSE); + MACH_NODE_UNLOCK(node); + + switch (msg->cmd) { + case FLIPC_CMD_IPCMESSAGE: + flipc_cmd_ipc(msg, node, flags); + break; + + case FLIPC_CMD_ACKMESSAGE: + case FLIPC_CMD_NAKMESSAGE: + flipc_cmd_ack((flipc_ack_msg_t)msg, node, flags); + break; + + default: +#if DEBUG + PE_enter_debugger("flipc_incoming(): Invalid command"); +#endif + break; + } +} + + +/* The node layer calls flipc_msg_free() to dispose of sent messages that + * originated in the FLIPC layer. This allows us to repurpose the payload + * of an ack or nak kmsg as a flipc message to avoid a copy - we detect + * such messages here and free them appropriately. + */ +void +flipc_msg_free(mnl_msg_t msg, + uint32_t flags) +{ + switch (msg->cmd) { + case FLIPC_CMD_ACKMESSAGE: // Flipc msg is a kmsg in disguise... + case FLIPC_CMD_NAKMESSAGE: // Convert back to kmsg for disposal + ipc_kmsg_free(*(ipc_kmsg_t*)((vm_offset_t)msg-sizeof(vm_offset_t))); + break; + + default: // Flipc msg is not a kmsg in disguise; dispose of normally + mnl_msg_free(msg, flags); + break; + } +} + + +/*** FLIPC Message Functions (called by mach ipc subsystem) ***/ + +/* Ack's one message sent to mqueue from node. A new kmsg is allocated + * and filled in as an ack (or nak if delivered is FALSE), then posted to the node's control port. This will + * wake the link driver (if sleeping) and cause the ack to be included with + * normal IPC traffic. + * + * This function immediately returns if the port's flipc name is invalid or the node is not active, so it + * is safe & quick to call speculatively. + * + * Called from mach ipc_mqueue.c when a flipc-originated message is consumed.
+ */ +void +flipc_msg_ack(mach_node_t node, + ipc_mqueue_t mqueue, + boolean_t delivered) +{ + flipc_port_t fport = mqueue->imq_fport; + + assert(FPORT_VALID(fport)); + assert(MACH_NODE_VALID(node)); + + mnl_name_t name = MNL_NAME_NULL; + mach_node_id_t nid = HOST_LOCAL_NODE; + ipc_port_t ack_port = IP_NULL; + + ip_lock(fport->lport); + name = fport->obj.name; + ip_unlock(fport->lport); + + if (!MNL_NAME_VALID(name)) + return; + + MACH_NODE_LOCK(node); + if (node->active) { + nid = node->info.node_id; + ack_port = node->control_port; + } + MACH_NODE_UNLOCK(node); + + if ( !IP_VALID(ack_port) || !MACH_NODE_ID_VALID(nid) ) + return; + + /* We have a valid node id & obj name, and a port to send the ack to. Allocate room for the mach header, the ack payload placed right after it (&msg[1] below), and a trailer. */ + ipc_kmsg_t kmsg = ipc_kmsg_alloc(sizeof(mach_msg_header_t) + sizeof(struct flipc_ack_msg) + MAX_TRAILER_SIZE); + assert((unsigned long long)kmsg >= 4ULL);//!= IKM_NULL); + mach_msg_header_t *msg = kmsg->ikm_header; + + /* Fill in the mach_msg_header struct; msgh_size covers the header plus the flipc_ack_msg payload that follows it (sizeof(msg) would only be the size of the pointer) */ + msg->msgh_bits = MACH_MSGH_BITS_SET(0, 0, 0, 0); + msg->msgh_size = (mach_msg_size_t)(sizeof(*msg) + sizeof(struct flipc_ack_msg)); + msg->msgh_remote_port = ack_port; + msg->msgh_local_port = MACH_PORT_NULL; + msg->msgh_voucher_port = MACH_PORT_NULL; + msg->msgh_id = FLIPC_CMD_ID; + + /* Fill in the flipc_ack_msg struct */ + flipc_ack_msg_t fmsg = (flipc_ack_msg_t)(&msg[1]); + fmsg->resend_to = HOST_LOCAL_NODE; + fmsg->msg_count = 1; // Might want to coalesce acks to a node/name pair + + /* Fill in the mnl_msg struct */ + fmsg->mnl.sub = MACH_NODE_SUB_FLIPC; + fmsg->mnl.cmd = delivered ? 
FLIPC_CMD_ACKMESSAGE : FLIPC_CMD_NAKMESSAGE; + fmsg->mnl.qos = 0; // Doesn't do anything yet + fmsg->mnl.flags = 0; + fmsg->mnl.node_id = nid; + fmsg->mnl.object = name; + fmsg->mnl.options = 0; + fmsg->mnl.size = sizeof(struct flipc_ack_msg) - sizeof(struct mnl_msg); + +#if (0) + mach_msg_return_t mmr; + spl_t s; + ipc_mqueue_t ack_mqueue; + + ip_lock(ack_port); + ack_mqueue = &ack_port->ip_messages; + s = splsched(); + imq_lock(ack_mqueue); + ip_unlock(ack_port); + + /* ipc_mqueue_send() unlocks ack_mqueue and restores splx(s) */ + mmr = ipc_mqueue_send(ack_mqueue, kmsg, 0, 0, s); +#else + kern_return_t kr; + kr = ipc_kmsg_send(kmsg, + MACH_SEND_KERNEL_DEFAULT, + MACH_MSG_TIMEOUT_NONE); +#endif +} + + diff --git a/osfmk/ipc/flipc.h b/osfmk/ipc/flipc.h new file mode 100644 index 000000000..fb89b3ff0 --- /dev/null +++ b/osfmk/ipc/flipc.h @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2015-2016 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* + * File: ipc/flipc.h + * Author: Dean Reece + * Date: 2016 + * + * Definitions for fast local ipc (flipc). + */ + +#ifndef _IPC_FLIPC_H_ +#define _IPC_FLIPC_H_ + +#if MACH_KERNEL_PRIVATE && MACH_FLIPC + +#include +#include +#include + +__BEGIN_DECLS + + +/*** FLIPC Port Declarations ***/ + +/* A FLIPC port (flipc_port_t) is a companion structure to ipc_port_t. + * Any ipc_port object that is known to the flipc layer has one of these + * structures to maintain the state of the port with respect to flipc. + * When a port reverts to a purely local object (all rights for the port exist + * on a single node) the flipc port companion structure will be de-allocated. 
+ */ + +typedef struct flipc_port { + struct mnl_obj obj; // Necessary to be in mnl_name_table[] + ipc_port_t lport; // The associated local ipc_port + mach_node_t hostnode; // Node holding the receive right + uint32_t peek_count; // How many kmsgs in mq have been peeked + uint32_t state:3; // See FPORT_STATE_* defines below +} *flipc_port_t; + +#define FPORT_NULL ((flipc_port_t) 0UL) +#define FPORT_VALID(fport) ((fport) != FPORT_NULL) + +#define FPORT_STATE_INIT (0) // Port is being initialized +#define FPORT_STATE_PROXY (1) // Principal is on another node +#define FPORT_STATE_PRINCIPAL (2) // Principal is on this node +#define FPORT_STATE_PREPRINCIPAL (3) // Principal moving to this node +#define FPORT_STATE_POSTPRINCIPAL (4) // Principal moving to other node +#define FPORT_STATE_DEAD (5) // Port is being destroyed + + +/*** FLIPC Node Management Declarations (used by mach node layer) ***/ + +extern mach_node_id_t localnode_id; // This node's FLIPC id. + +/* The mach node layer calls flipc_init() once before it calls any other + * flipc entry points. Returns KERN_SUCCESS on success; otherwise flipc + * is not initialized and cannot be used. + */ +kern_return_t flipc_init(void); + +/* flipc_node_prepare() is called by mach node layer when a remote node is + * registered by a link driver. This is the flipc layer's opportunity to + * convert it to a flipc port and hook it into any appropriate structures. + * Note that the node is not yet in the mach node table. Returns KERN_SUCCESS + * on success; otherwise node is not prepared and cannot be used. + */ +kern_return_t flipc_node_prepare(mach_node_t node); + +/* flipc_node_retire() is called by mach node layer when a remote node is + * terminated by a link driver. This is the flipc layer's opportunity to + * convert it back to a local port and unhook it from any structures. + * Returns KERN_SUCCESS on success.
+ */ +kern_return_t flipc_node_retire(mach_node_t node); + + +/*** FLIPC Message Declarations (used by mach node layer) ***/ + +/* Definition for a flipc ack/nak message. These messages are sent to the + * originating node of a message to ack or nak the message. Ack'd messages + * are destroyed by the originating node (to avoid duplicate delivery). Nak'd + * messages are re-sent to the node specified in resend_to (used when a + * receive right moves to a different node). These messages are queued onto + * the originating node's control_port and sent along with other ipc traffic. + */ + +typedef struct flipc_ack_msg { + struct mnl_msg mnl; // Flipc message starts with mnl message + mach_node_id_t resend_to; // Node ID for resends (if NAK) + uint8_t msg_count; // Number of msgs being ackd/nakd +} __attribute__((__packed__)) *flipc_ack_msg_t; + +#define FLIPC_CMD_ID (0x43504952UL) // msgh_id "RIPC" for FLIPC msgs +#define FLIPC_CMD_IPCMESSAGE (1) // IPC Msg: node_id is sender; object is dest port +#define FLIPC_CMD_ACKMESSAGE (2) // object is port being ack'd +#define FLIPC_CMD_NAKMESSAGE (3) // object is port being nak'd + + +/* The node layer calls flipc_msg_to_remote_node() to fetch the next message + * for to_node. This function will block until a message is available or the + * node is terminated, in which case it returns MNL_MSG_NULL. + */ +mnl_msg_t flipc_msg_to_remote_node(mach_node_t to_node, + uint32_t flags); + +/* The node layer calls flipc_msg_from_node() to deliver an incoming message + * received from from_node. It is the responsibility of the caller to release + * the received message buffer after return. + */ +void flipc_msg_from_node(mach_node_t from_node, + mnl_msg_t msg_arg, + uint32_t flags); + +/* The node layer calls flipc_msg_free() to dispose of sent messages that + * originated in the FLIPC layer. 
+ */ +void flipc_msg_free(mnl_msg_t msg, + uint32_t flags); + + +/*** FLIPC Message Declarations (used by mach ipc subsystem) ***/ + +/* Ack a message sent by node to mqueue.
A new kmsg is allocated and + * filled in as an ack (or nak if delivered is false), then posted to the + * node's control port. This will wake the link driver (if sleeping) and cause + * the ack to be included with normal IPC traffic. + * + * This function immediately returns if the port's flipc name is invalid or the node is not active, so it + * is safe & quick to call speculatively. + * + * Called from mach ipc_mqueue.c when a flipc-originated message is consumed. + */ +void flipc_msg_ack(mach_node_t node, + ipc_mqueue_t mqueue, + boolean_t delivered); + + +__END_DECLS + +#endif // MACH_KERNEL_PRIVATE && MACH_FLIPC +#endif // _IPC_FLIPC_H_ + diff --git a/osfmk/ipc/ipc_entry.c b/osfmk/ipc/ipc_entry.c index aabb15ada..9604a81b8 100644 --- a/osfmk/ipc/ipc_entry.c +++ b/osfmk/ipc/ipc_entry.c @@ -749,3 +749,29 @@ ipc_entry_grow_table( return KERN_SUCCESS; } + + +/* + * Routine: ipc_entry_name_mask + * Purpose: + * Ensure a mach port name has the default ipc entry + * generation bits set. This can be used to ensure that + * a name passed in by user space matches names generated + * by the kernel. + * Conditions: + * None. + * Returns: + * 'name' input with default generation bits masked or added + * as appropriate.
+ */ +mach_port_name_t +ipc_entry_name_mask(mach_port_name_t name) +{ +#ifndef NO_PORT_GEN + static mach_port_name_t null_name = MACH_PORT_MAKE(0, IE_BITS_NEW_GEN(IE_BITS_GEN_MASK)); + return name | null_name; +#else + static mach_port_name_t null_name = MACH_PORT_MAKE(0, ~(IE_BITS_NEW_GEN(IE_BITS_GEN_MASK))); + return name & ~null_name; +#endif +} diff --git a/osfmk/ipc/ipc_entry.h b/osfmk/ipc/ipc_entry.h index 4a34f110d..531434edf 100644 --- a/osfmk/ipc/ipc_entry.h +++ b/osfmk/ipc/ipc_entry.h @@ -185,4 +185,7 @@ extern kern_return_t ipc_entry_grow_table( ipc_space_t space, ipc_table_elems_t target_size); +/* mask on/off default entry generation bits */ +extern mach_port_name_t ipc_entry_name_mask( + mach_port_name_t name); #endif /* _IPC_IPC_ENTRY_H_ */ diff --git a/osfmk/ipc/ipc_importance.c b/osfmk/ipc/ipc_importance.c index 089f6afd5..3c16ac82b 100644 --- a/osfmk/ipc/ipc_importance.c +++ b/osfmk/ipc/ipc_importance.c @@ -40,6 +40,7 @@ #include #include #include +#include #include @@ -236,18 +237,52 @@ ipc_importance_inherit_link( ipc_importance_inherit_t inherit, ipc_importance_elem_t elem) { - ipc_importance_elem_t link_elem; + ipc_importance_task_t link_task; assert(IIE_NULL == inherit->iii_from_elem); - link_elem = (IIE_TYPE_INHERIT == IIE_TYPE(elem)) ? - (ipc_importance_elem_t)((ipc_importance_inherit_t)elem)->iii_to_task : - elem; + link_task = (IIE_TYPE_INHERIT == IIE_TYPE(elem)) ? + ((ipc_importance_inherit_t)elem)->iii_to_task : + (ipc_importance_task_t)elem; - queue_enter(&link_elem->iie_inherits, inherit, + queue_enter(&link_task->iit_inherits, inherit, ipc_importance_inherit_t, iii_inheritance); inherit->iii_from_elem = elem; } +/* + * Routine: ipc_importance_inherit_find + * Purpose: + * Find an existing inherit that links the from element to the + * to_task at a given nesting depth. 
As inherits from other + * inherits are actually linked off the original inherit's donation + * receiving task, we have to conduct our search from there if + * the from element is an inherit. + * Returns: + * A pointer (not a reference) to the matching inherit. + * Conditions: + * Importance lock held. + */ +static ipc_importance_inherit_t +ipc_importance_inherit_find( + ipc_importance_elem_t from, + ipc_importance_task_t to_task, + unsigned int depth) +{ + ipc_importance_task_t link_task; + ipc_importance_inherit_t inherit; + + link_task = (IIE_TYPE_INHERIT == IIE_TYPE(from)) ? + ((ipc_importance_inherit_t)from)->iii_to_task : + (ipc_importance_task_t)from; + + queue_iterate(&link_task->iit_inherits, inherit, + ipc_importance_inherit_t, iii_inheritance) { + if (inherit->iii_to_task == to_task && inherit->iii_depth == depth) + return inherit; + } + return III_NULL; +} + /* * Routine: ipc_importance_inherit_unlink * Purpose: @@ -268,13 +303,13 @@ ipc_importance_inherit_unlink( ipc_importance_elem_t elem = inherit->iii_from_elem; if (IIE_NULL != elem) { - ipc_importance_elem_t unlink_elem; + ipc_importance_task_t unlink_task; - unlink_elem = (IIE_TYPE_INHERIT == IIE_TYPE(elem)) ? - (ipc_importance_elem_t)((ipc_importance_inherit_t)elem)->iii_to_task : - elem; + unlink_task = (IIE_TYPE_INHERIT == IIE_TYPE(elem)) ? 
+ ((ipc_importance_inherit_t)elem)->iii_to_task : + (ipc_importance_task_t)elem; - queue_remove(&unlink_elem->iie_inherits, inherit, + queue_remove(&unlink_task->iit_inherits, inherit, ipc_importance_inherit_t, iii_inheritance); inherit->iii_from_elem = IIE_NULL; } @@ -308,40 +343,36 @@ ipc_importance_release_locked(ipc_importance_elem_t elem) { assert(0 < IIE_REFS(elem)); - if (0 < ipc_importance_release_internal(elem)) { - #if DEVELOPMENT || DEBUG - ipc_importance_inherit_t temp_inherit; - ipc_importance_task_t link_task; - ipc_kmsg_t temp_kmsg; - uint32_t expected = 0; - - if (0 < elem->iie_made) - expected++; - - link_task = (IIE_TYPE_INHERIT == IIE_TYPE(elem)) ? - ((ipc_importance_inherit_t)elem)->iii_to_task : - (ipc_importance_task_t)elem; - - queue_iterate(&link_task->iit_kmsgs, temp_kmsg, ipc_kmsg_t, ikm_inheritance) - if (temp_kmsg->ikm_importance == elem) - expected++; - queue_iterate(&link_task->iit_inherits, temp_inherit, - ipc_importance_inherit_t, iii_inheritance) - if (temp_inherit->iii_from_elem == elem) - expected++; - - if (IIE_REFS(elem) < expected) - panic("ipc_importance_release_locked (%p)", elem); + ipc_importance_inherit_t temp_inherit; + ipc_importance_task_t link_task; + ipc_kmsg_t temp_kmsg; + uint32_t expected = 0; + + if (0 < elem->iie_made) + expected++; + + link_task = (IIE_TYPE_INHERIT == IIE_TYPE(elem)) ? 
+ ((ipc_importance_inherit_t)elem)->iii_to_task : + (ipc_importance_task_t)elem; + + queue_iterate(&link_task->iit_kmsgs, temp_kmsg, ipc_kmsg_t, ikm_inheritance) + if (temp_kmsg->ikm_importance == elem) + expected++; + queue_iterate(&link_task->iit_inherits, temp_inherit, + ipc_importance_inherit_t, iii_inheritance) + if (temp_inherit->iii_from_elem == elem) + expected++; + if (IIE_REFS(elem) < expected + 1) + panic("ipc_importance_release_locked (%p)", elem); #endif + + if (0 < ipc_importance_release_internal(elem)) { ipc_importance_unlock(); return; } /* last ref */ - /* can't get to no refs if we contribute to something else's importance */ - assert(queue_empty(&elem->iie_kmsgs)); - assert(queue_empty(&elem->iie_inherits)); switch (IIE_TYPE(elem)) { @@ -351,6 +382,8 @@ ipc_importance_release_locked(ipc_importance_elem_t elem) ipc_importance_task_t task_elem; task_elem = (ipc_importance_task_t)elem; + + /* the task can't still hold a reference on the task importance */ assert(TASK_NULL == task_elem->iit_task); #if DEVELOPMENT || DEBUG @@ -537,7 +570,7 @@ ipc_importance_task_check_transition( #endif } else { // assert(delta <= task_imp->iit_assertcnt); - if (delta > task_imp->iit_assertcnt - IIT_EXTERN(task_imp)) { + if (task_imp->iit_assertcnt < delta + IIT_EXTERN(task_imp)) { /* TODO: Turn this back into a panic */ if (target_task != TASK_NULL) { printf("Over-release of kernel-internal importance assertions for pid %d (%s), " @@ -864,7 +897,7 @@ ipc_importance_task_process_updates( /* complete the policy update with the task unlocked */ ipc_importance_task_release(task_imp); task_unlock(target_task); - task_policy_update_complete_unlocked(target_task, THREAD_NULL, &pend_token); + task_policy_update_complete_unlocked(target_task, &pend_token); task_deallocate(target_task); ipc_importance_lock(); @@ -1467,7 +1500,7 @@ ipc_importance_task_update_live_donor(ipc_importance_task_t task_imp) before_donor = ipc_importance_task_is_marked_donor(task_imp); /* snapshot task 
live donor status - may change, but another call will accompany the change */ - task_live_donor = target_task->effective_policy.t_live_donor; + task_live_donor = target_task->effective_policy.tep_live_donor; #if IMPORTANCE_DEBUG int target_pid = task_pid(target_task); @@ -1952,11 +1985,10 @@ ipc_importance_reset_locked(ipc_importance_task_t task_imp, boolean_t donor) task_imp->iit_externdrop -= task_imp->iit_legacy_externdrop; /* assert(IIT_LEGACY_EXTERN(task_imp) <= task_imp->iit_assertcnt); */ - if (IIT_LEGACY_EXTERN(task_imp) < task_imp->iit_assertcnt) { + if (IIT_EXTERN(task_imp) < task_imp->iit_assertcnt) { task_imp->iit_assertcnt -= IIT_LEGACY_EXTERN(task_imp); } else { - assert(IIT_LEGACY_EXTERN(task_imp) == task_imp->iit_assertcnt); - task_imp->iit_assertcnt = 0; + task_imp->iit_assertcnt = IIT_EXTERN(task_imp); } task_imp->iit_legacy_externcnt = 0; task_imp->iit_legacy_externdrop = 0; @@ -2516,7 +2548,6 @@ ipc_importance_inherit_from(ipc_kmsg_t kmsg) ipc_port_t port = kmsg->ikm_header->msgh_remote_port; ipc_importance_inherit_t inherit = III_NULL; ipc_importance_inherit_t alloc = III_NULL; - ipc_importance_inherit_t temp_inherit; boolean_t cleared_self_donation = FALSE; boolean_t donating; uint32_t depth = 1; @@ -2614,14 +2645,7 @@ ipc_importance_inherit_from(ipc_kmsg_t kmsg) * check to see if we already have an inherit for this pairing */ while (III_NULL == inherit) { - queue_iterate(&from_elem->iie_inherits, temp_inherit, - ipc_importance_inherit_t, iii_inheritance) { - if (temp_inherit->iii_to_task == task_imp && - temp_inherit->iii_depth == depth) { - inherit = temp_inherit; - break; - } - } + inherit = ipc_importance_inherit_find(from_elem, task_imp, depth); /* Do we have to allocate a new inherit */ if (III_NULL == inherit) { @@ -2666,9 +2690,6 @@ ipc_importance_inherit_from(ipc_kmsg_t kmsg) /* add in a external reference for this use of the inherit */ inherit->iii_externcnt++; - if (donating) { - task_imp->iit_externcnt++; - } } else { /* initialize 
the previously allocated space */ inherit = alloc; @@ -2680,12 +2701,9 @@ ipc_importance_inherit_from(ipc_kmsg_t kmsg) inherit->iii_to_task = task_imp; inherit->iii_from_elem = IIE_NULL; queue_init(&inherit->iii_kmsgs); - queue_init(&inherit->iii_inherits); - /* If donating, reflect that in the task externcnt */ if (donating) { inherit->iii_donating = TRUE; - task_imp->iit_externcnt++; } else { inherit->iii_donating = FALSE; } @@ -2714,6 +2732,14 @@ ipc_importance_inherit_from(ipc_kmsg_t kmsg) elem = ipc_importance_kmsg_unlink(kmsg); assert(elem == from_elem); + /* If found inherit and donating, reflect that in the task externcnt */ + if (III_NULL != inherit && donating) { + task_imp->iit_externcnt++; + /* The owner of receive right might have changed, take the internal assertion */ + ipc_importance_task_hold_internal_assertion_locked(task_imp, 1); + /* may have dropped and retaken importance lock */ + } + /* If we didn't create a new inherit, we have some resources to release */ if (III_NULL == inherit || inherit != alloc) { if (IIE_NULL != from_elem) { @@ -2748,21 +2774,9 @@ ipc_importance_inherit_from(ipc_kmsg_t kmsg) * unlinked the kmsg and snapshot the donating state while holding * the importance lock */ - if (donating) { + if (donating || cleared_self_donation) { ip_lock(port); - if (III_NULL != inherit) { - /* task assertions transferred to inherit, just adjust port count */ - ipc_port_impcount_delta(port, -1, IP_NULL); - ip_unlock(port); - } else { - /* drop importance from port and destination task */ - if (ipc_port_importance_delta(port, IPID_OPTION_NORMAL, -1) == FALSE) { - ip_unlock(port); - } - } - } else if (cleared_self_donation) { - ip_lock(port); - /* drop cleared donation from port and destination task */ + /* drop importance from port and destination task */ if (ipc_port_importance_delta(port, IPID_OPTION_NORMAL, -1) == FALSE) { ip_unlock(port); } @@ -2886,18 +2900,21 @@ ipc_importance_receive( ipc_importance_task_t task_imp = 
task_self->task_imp_base; ipc_port_t port = kmsg->ikm_header->msgh_remote_port; - ip_lock(port); - ipc_port_impcount_delta(port, -1, IP_NULL); - ip_unlock(port); - - /* will user accept legacy responsibility for the importance boost */ - if (KERN_SUCCESS == ipc_importance_task_externalize_legacy_assertion(task_imp, 1, sender_pid)) { + /* The owner of receive right might have changed, take the internal assertion */ + if (KERN_SUCCESS == ipc_importance_task_hold_internal_assertion(task_imp, 1)) { + ipc_importance_task_externalize_legacy_assertion(task_imp, 1, sender_pid); impresult = 1; } else { /* The importance boost never applied to task (clear the bit) */ kmsg->ikm_header->msgh_bits &= ~MACH_MSGH_BITS_RAISEIMP; impresult = 0; } + + /* Drop the boost on the port and the owner of the receive right */ + ip_lock(port); + if (ipc_port_importance_delta(port, IPID_OPTION_NORMAL, -1) == FALSE) { + ip_unlock(port); + } } } @@ -3363,7 +3380,7 @@ ipc_importance_command( /* if not donating to a denap receiver, it was called incorrectly */ if (!ipc_importance_task_is_marked_denap_receiver(to_task)) { ipc_importance_unlock(); - return KERN_INVALID_ARGUMENT; /* keeps dispatch happy */ + return KERN_INVALID_TASK; /* keeps dispatch happy */ } /* Enough external references left to drop? 
*/ diff --git a/osfmk/ipc/ipc_importance.h b/osfmk/ipc/ipc_importance.h index 15ad62d66..2a2ac2f45 100644 --- a/osfmk/ipc/ipc_importance.h +++ b/osfmk/ipc/ipc_importance.h @@ -62,7 +62,6 @@ struct ipc_importance_elem { uint32_t iie_bits; /* type and refs */ mach_voucher_attr_value_reference_t iie_made; /* references given to vouchers */ queue_head_t iie_kmsgs; /* list of kmsgs inheriting from this */ - queue_head_t iie_inherits; /* list of inherit elems hung off this */ uint32_t iie_externcnt; /* number of externalized boosts */ uint32_t iie_externdrop; /* number of those dropped already */ #define IIE_REF_DEBUG 0 @@ -105,6 +104,7 @@ struct ipc_importance_elem { struct ipc_importance_task { struct ipc_importance_elem iit_elem; /* common element parts */ task_t iit_task; /* task associated with */ + queue_head_t iit_inherits; /* list of inherit elems hung off this */ queue_t iit_updateq; /* queue chained on for task policy updates */ queue_chain_t iit_updates; /* link on update chain */ queue_chain_t iit_props; /* link on propagation chain */ @@ -130,7 +130,6 @@ struct ipc_importance_task { #define iit_bits iit_elem.iie_bits #define iit_made iit_elem.iie_made #define iit_kmsgs iit_elem.iie_kmsgs -#define iit_inherits iit_elem.iie_inherits #define iit_externcnt iit_elem.iie_externcnt #define iit_externdrop iit_elem.iie_externdrop @@ -162,7 +161,6 @@ struct ipc_importance_inherit { #define iii_bits iii_elem.iie_bits #define iii_made iii_elem.iie_made #define iii_kmsgs iii_elem.iie_kmsgs -#define iii_inherits iii_elem.iie_inherits #define iii_externcnt iii_elem.iie_externcnt #define iii_externdrop iii_elem.iie_externdrop #define III_REFS_MAX IIE_REFS_MAX diff --git a/osfmk/ipc/ipc_kmsg.c b/osfmk/ipc/ipc_kmsg.c index 5d78a1848..479bf2fe2 100644 --- a/osfmk/ipc/ipc_kmsg.c +++ b/osfmk/ipc/ipc_kmsg.c @@ -93,6 +93,7 @@ #include #include #include +#include #include @@ -112,9 +113,15 @@ #include #include #include +#if MACH_FLIPC +#include +#include +#endif #include +#include 
+ #include #ifdef ppc @@ -538,6 +545,285 @@ MACRO_BEGIN \ } \ MACRO_END +#define KMSG_TRACE_FLAG_TRACED 0x000001 +#define KMSG_TRACE_FLAG_COMPLEX 0x000002 +#define KMSG_TRACE_FLAG_OOLMEM 0x000004 +#define KMSG_TRACE_FLAG_VCPY 0x000008 +#define KMSG_TRACE_FLAG_PCPY 0x000010 +#define KMSG_TRACE_FLAG_SND64 0x000020 +#define KMSG_TRACE_FLAG_RAISEIMP 0x000040 +#define KMSG_TRACE_FLAG_APP_SRC 0x000080 +#define KMSG_TRACE_FLAG_APP_DST 0x000100 +#define KMSG_TRACE_FLAG_DAEMON_SRC 0x000200 +#define KMSG_TRACE_FLAG_DAEMON_DST 0x000400 +#define KMSG_TRACE_FLAG_DST_NDFLTQ 0x000800 +#define KMSG_TRACE_FLAG_SRC_NDFLTQ 0x001000 +#define KMSG_TRACE_FLAG_DST_SONCE 0x002000 +#define KMSG_TRACE_FLAG_SRC_SONCE 0x004000 +#define KMSG_TRACE_FLAG_CHECKIN 0x008000 +#define KMSG_TRACE_FLAG_ONEWAY 0x010000 +#define KMSG_TRACE_FLAG_IOKIT 0x020000 +#define KMSG_TRACE_FLAG_SNDRCV 0x040000 +#define KMSG_TRACE_FLAG_DSTQFULL 0x080000 +#define KMSG_TRACE_FLAG_VOUCHER 0x100000 +#define KMSG_TRACE_FLAG_TIMER 0x200000 +#define KMSG_TRACE_FLAG_SEMA 0x400000 +#define KMSG_TRACE_FLAG_DTMPOWNER 0x800000 + +#define KMSG_TRACE_FLAGS_MASK 0xffffff +#define KMSG_TRACE_FLAGS_SHIFT 8 + +#define KMSG_TRACE_PORTS_MASK 0xff +#define KMSG_TRACE_PORTS_SHIFT 0 + +#if (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) +extern boolean_t kdebug_debugid_enabled(uint32_t debugid); + +void ipc_kmsg_trace_send(ipc_kmsg_t kmsg, + mach_msg_option_t option) +{ + task_t send_task = TASK_NULL; + ipc_port_t dst_port, src_port; + boolean_t is_task_64bit; + mach_msg_header_t *msg; + mach_msg_trailer_t *trailer; + + int kotype = 0; + uint32_t msg_size = 0; + uint32_t msg_flags = KMSG_TRACE_FLAG_TRACED; + uint32_t num_ports = 0; + uint32_t send_pid, dst_pid; + + /* + * check to see not only if ktracing is enabled, but if we will + * _actually_ emit the KMSG_INFO tracepoint. This saves us a + * significant amount of processing (and a port lock hold) in + * the non-tracing case. 
+ */ + if (__probable((kdebug_enable & KDEBUG_TRACE) == 0)) + return; + if (!kdebug_debugid_enabled(MACHDBG_CODE(DBG_MACH_IPC,MACH_IPC_KMSG_INFO))) + return; + + msg = kmsg->ikm_header; + + dst_port = (ipc_port_t)(msg->msgh_remote_port); + if (!IPC_PORT_VALID(dst_port)) + return; + + /* + * Message properties / options + */ + if ((option & (MACH_SEND_MSG|MACH_RCV_MSG)) == (MACH_SEND_MSG|MACH_RCV_MSG)) + msg_flags |= KMSG_TRACE_FLAG_SNDRCV; + + if (msg->msgh_id >= is_iokit_subsystem.start && + msg->msgh_id < is_iokit_subsystem.end + 100) + msg_flags |= KMSG_TRACE_FLAG_IOKIT; + /* magic XPC checkin message id (XPC_MESSAGE_ID_CHECKIN) from libxpc */ + else if (msg->msgh_id == 0x77303074u /* w00t */) + msg_flags |= KMSG_TRACE_FLAG_CHECKIN; + + if (msg->msgh_bits & MACH_MSGH_BITS_RAISEIMP) + msg_flags |= KMSG_TRACE_FLAG_RAISEIMP; + + if (unsafe_convert_port_to_voucher(kmsg->ikm_voucher)) + msg_flags |= KMSG_TRACE_FLAG_VOUCHER; + + /* + * Sending task / port + */ + send_task = current_task(); + send_pid = task_pid(send_task); + + if (send_pid != 0) { + if (task_is_daemon(send_task)) + msg_flags |= KMSG_TRACE_FLAG_DAEMON_SRC; + else if (task_is_app(send_task)) + msg_flags |= KMSG_TRACE_FLAG_APP_SRC; + } + + is_task_64bit = (send_task->map->max_offset > VM_MAX_ADDRESS); + if (is_task_64bit) + msg_flags |= KMSG_TRACE_FLAG_SND64; + + src_port = (ipc_port_t)(msg->msgh_local_port); + if (src_port) { + if (src_port->ip_messages.imq_qlimit != MACH_PORT_QLIMIT_DEFAULT) + msg_flags |= KMSG_TRACE_FLAG_SRC_NDFLTQ; + switch (MACH_MSGH_BITS_LOCAL(msg->msgh_bits)) { + case MACH_MSG_TYPE_MOVE_SEND_ONCE: + msg_flags |= KMSG_TRACE_FLAG_SRC_SONCE; + break; + default: + break; + } + } else { + msg_flags |= KMSG_TRACE_FLAG_ONEWAY; + } + + + /* + * Destination task / port + */ + ip_lock(dst_port); + if (!ip_active(dst_port)) { + /* dst port is being torn down */ + dst_pid = (uint32_t)0xfffffff0; + } else if (dst_port->ip_tempowner) { + msg_flags |= KMSG_TRACE_FLAG_DTMPOWNER; + if (IIT_NULL != 
dst_port->ip_imp_task) + dst_pid = task_pid(dst_port->ip_imp_task->iit_task); + else + dst_pid = (uint32_t)0xfffffff1; + } else if (dst_port->ip_receiver_name == MACH_PORT_NULL) { + /* dst_port is otherwise in-transit */ + dst_pid = (uint32_t)0xfffffff2; + } else { + if (dst_port->ip_receiver == ipc_space_kernel) { + dst_pid = 0; + } else { + ipc_space_t dst_space; + dst_space = dst_port->ip_receiver; + if (dst_space && is_active(dst_space)) { + dst_pid = task_pid(dst_space->is_task); + if (task_is_daemon(dst_space->is_task)) + msg_flags |= KMSG_TRACE_FLAG_DAEMON_DST; + else if (task_is_app(dst_space->is_task)) + msg_flags |= KMSG_TRACE_FLAG_APP_DST; + } else { + /* receiving task is being torn down */ + dst_pid = (uint32_t)0xfffffff3; + } + } + } + + if (dst_port->ip_messages.imq_qlimit != MACH_PORT_QLIMIT_DEFAULT) + msg_flags |= KMSG_TRACE_FLAG_DST_NDFLTQ; + if (imq_full(&dst_port->ip_messages)) + msg_flags |= KMSG_TRACE_FLAG_DSTQFULL; + + kotype = ip_kotype(dst_port); + + ip_unlock(dst_port); + + switch (kotype) { + case IKOT_SEMAPHORE: + msg_flags |= KMSG_TRACE_FLAG_SEMA; + break; + case IKOT_TIMER: + case IKOT_CLOCK: + msg_flags |= KMSG_TRACE_FLAG_TIMER; + break; + case IKOT_MASTER_DEVICE: + case IKOT_IOKIT_CONNECT: + case IKOT_IOKIT_OBJECT: + case IKOT_IOKIT_SPARE: + msg_flags |= KMSG_TRACE_FLAG_IOKIT; + break; + default: + break; + } + + switch(MACH_MSGH_BITS_REMOTE(msg->msgh_bits)) { + case MACH_MSG_TYPE_PORT_SEND_ONCE: + msg_flags |= KMSG_TRACE_FLAG_DST_SONCE; + break; + default: + break; + } + + + /* + * Message size / content + */ + msg_size = msg->msgh_size - sizeof(mach_msg_header_t); + + if (msg->msgh_bits & MACH_MSGH_BITS_COMPLEX) { + mach_msg_body_t *msg_body; + mach_msg_descriptor_t *kern_dsc; + int dsc_count; + + msg_flags |= KMSG_TRACE_FLAG_COMPLEX; + + msg_body = (mach_msg_body_t *)(kmsg->ikm_header + 1); + dsc_count = (int)msg_body->msgh_descriptor_count; + kern_dsc = (mach_msg_descriptor_t *)(msg_body + 1); + + /* this is gross: see 
ipc_kmsg_copyin_body()... */ + if (!is_task_64bit) + msg_size -= (dsc_count * 12); + + for (int i = 0; i < dsc_count; i++) { + switch (kern_dsc[i].type.type) { + case MACH_MSG_PORT_DESCRIPTOR: + num_ports++; + if (is_task_64bit) + msg_size -= 12; + break; + case MACH_MSG_OOL_VOLATILE_DESCRIPTOR: + case MACH_MSG_OOL_DESCRIPTOR: { + mach_msg_ool_descriptor_t *dsc; + dsc = (mach_msg_ool_descriptor_t *)&kern_dsc[i]; + msg_flags |= KMSG_TRACE_FLAG_OOLMEM; + msg_size += dsc->size; + if ((dsc->size >= MSG_OOL_SIZE_SMALL) && + (dsc->copy == MACH_MSG_PHYSICAL_COPY) && + !dsc->deallocate) + msg_flags |= KMSG_TRACE_FLAG_PCPY; + else if (dsc->size <= MSG_OOL_SIZE_SMALL) + msg_flags |= KMSG_TRACE_FLAG_PCPY; + else + msg_flags |= KMSG_TRACE_FLAG_VCPY; + if (is_task_64bit) + msg_size -= 16; + } break; + case MACH_MSG_OOL_PORTS_DESCRIPTOR: { + mach_msg_ool_ports_descriptor_t *dsc; + dsc = (mach_msg_ool_ports_descriptor_t *)&kern_dsc[i]; + num_ports += dsc->count; + if (is_task_64bit) + msg_size -= 16; + } break; + default: + break; + } + } + } + + /* + * Trailer contents + */ + trailer = (mach_msg_trailer_t *)((vm_offset_t)msg + + (vm_offset_t)msg->msgh_size); + if (trailer->msgh_trailer_size <= sizeof(mach_msg_security_trailer_t)) { + extern security_token_t KERNEL_SECURITY_TOKEN; + mach_msg_security_trailer_t *strailer; + strailer = (mach_msg_security_trailer_t *)trailer; + /* + * verify the sender PID: replies from the kernel often look + * like self-talk because the sending port is not reset. 
+ */ + if (memcmp(&strailer->msgh_sender, + &KERNEL_SECURITY_TOKEN, + sizeof(KERNEL_SECURITY_TOKEN)) == 0) { + send_pid = 0; + msg_flags &= ~(KMSG_TRACE_FLAG_APP_SRC | KMSG_TRACE_FLAG_DAEMON_SRC); + } + } + + KDBG(MACHDBG_CODE(DBG_MACH_IPC,MACH_IPC_KMSG_INFO) | DBG_FUNC_END, + (uintptr_t)send_pid, + (uintptr_t)dst_pid, + (uintptr_t)msg_size, + (uintptr_t)( + ((msg_flags & KMSG_TRACE_FLAGS_MASK) << KMSG_TRACE_FLAGS_SHIFT) | + ((num_ports & KMSG_TRACE_PORTS_MASK) << KMSG_TRACE_PORTS_SHIFT) + ) + ); +} +#endif + /* zone for cached ipc_kmsg_t structures */ zone_t ipc_kmsg_zone; @@ -730,7 +1016,100 @@ ipc_kmsg_enqueue( ipc_kmsg_queue_t queue, ipc_kmsg_t kmsg) { - ipc_kmsg_enqueue_macro(queue, kmsg); + ipc_kmsg_t first = queue->ikmq_base; + ipc_kmsg_t last; + + if (first == IKM_NULL) { + queue->ikmq_base = kmsg; + kmsg->ikm_next = kmsg; + kmsg->ikm_prev = kmsg; + } else { + last = first->ikm_prev; + kmsg->ikm_next = first; + kmsg->ikm_prev = last; + first->ikm_prev = kmsg; + last->ikm_next = kmsg; + } +} + +/* + * Routine: ipc_kmsg_enqueue_qos + * Purpose: + * Enqueue a kmsg, propagating qos + * overrides towards the head of the queue. + * + * Returns: + * whether the head of the queue had + * it's override-qos adjusted because + * of this insertion. 
+ */ + +boolean_t +ipc_kmsg_enqueue_qos( + ipc_kmsg_queue_t queue, + ipc_kmsg_t kmsg) +{ + ipc_kmsg_t first = queue->ikmq_base; + ipc_kmsg_t prev; + mach_msg_priority_t override; + + if (first == IKM_NULL) { + /* insert a first message */ + queue->ikmq_base = kmsg; + kmsg->ikm_next = kmsg; + kmsg->ikm_prev = kmsg; + return TRUE; + } + + /* insert at the tail */ + prev = first->ikm_prev; + kmsg->ikm_next = first; + kmsg->ikm_prev = prev; + first->ikm_prev = kmsg; + prev->ikm_next = kmsg; + + /* apply QoS overrides towards the head */ + override = kmsg->ikm_qos_override; + while (prev != kmsg && + override > prev->ikm_qos_override) { + prev->ikm_qos_override = override; + prev = prev->ikm_prev; + } + + /* did we adjust everything? */ + return (prev == kmsg); +} + +/* + * Routine: ipc_kmsg_override_qos + * Purpose: + * Update the override for a given kmsg already + * enqueued, propagating qos override adjustments + * towards the head of the queue. + * + * Returns: + * whether the head of the queue had + * it's override-qos adjusted because + * of this insertion. + */ + +boolean_t +ipc_kmsg_override_qos( + ipc_kmsg_queue_t queue, + ipc_kmsg_t kmsg, + mach_msg_priority_t override) +{ + ipc_kmsg_t first = queue->ikmq_base; + ipc_kmsg_t cur = kmsg; + + /* apply QoS overrides towards the head */ + while (override > cur->ikm_qos_override) { + cur->ikm_qos_override = override; + if (cur == first) + return TRUE; + cur = cur->ikm_next; + } + return FALSE; } /* @@ -748,7 +1127,7 @@ ipc_kmsg_dequeue( first = ipc_kmsg_queue_first(queue); if (first != IKM_NULL) - ipc_kmsg_rmqueue_first_macro(queue, first); + ipc_kmsg_rmqueue(queue, first); return first; } @@ -1376,7 +1755,6 @@ ipc_kmsg_send( thread_t th = current_thread(); mach_msg_return_t error = MACH_MSG_SUCCESS; boolean_t kernel_reply = FALSE; - spl_t s; /* Check if honor qlimit flag is set on thread. 
*/ if ((th->options & TH_OPT_HONOR_QLIMIT) == TH_OPT_HONOR_QLIMIT) { @@ -1398,6 +1776,7 @@ ipc_kmsg_send( /* don't allow the creation of a circular loop */ if (kmsg->ikm_header->msgh_bits & MACH_MSGH_BITS_CIRCULAR) { ipc_kmsg_destroy(kmsg); + KDBG(MACHDBG_CODE(DBG_MACH_IPC,MACH_IPC_KMSG_INFO) | DBG_FUNC_END, MACH_MSGH_BITS_CIRCULAR); return MACH_MSG_SUCCESS; } @@ -1417,9 +1796,14 @@ ipc_kmsg_send( */ if (!ip_active(port)) { ip_unlock(port); +#if MACH_FLIPC + if (MACH_NODE_VALID(kmsg->ikm_node) && FPORT_VALID(port->ip_messages.imq_fport)) + flipc_msg_ack(kmsg->ikm_node, &port->ip_messages, FALSE); +#endif ip_release(port); /* JMM - Future: release right, not just ref */ kmsg->ikm_header->msgh_remote_port = MACH_PORT_NULL; ipc_kmsg_destroy(kmsg); + KDBG(MACHDBG_CODE(DBG_MACH_IPC,MACH_IPC_KMSG_INFO) | DBG_FUNC_END, MACH_SEND_INVALID_DEST); return MACH_MSG_SUCCESS; } @@ -1440,15 +1824,19 @@ ipc_kmsg_send( /* * Call the server routine, and get the reply message to send. */ - kmsg = ipc_kobject_server(kmsg); + kmsg = ipc_kobject_server(kmsg, option); if (kmsg == IKM_NULL) return MACH_MSG_SUCCESS; + /* restart the KMSG_INFO tracing for the reply message */ + KDBG(MACHDBG_CODE(DBG_MACH_IPC,MACH_IPC_KMSG_INFO) | DBG_FUNC_START); port = (ipc_port_t) kmsg->ikm_header->msgh_remote_port; assert(IP_VALID(port)); ip_lock(port); /* fall thru with reply - same options */ kernel_reply = TRUE; + if (!ip_active(port)) + error = MACH_SEND_INVALID_DEST; } #if IMPORTANCE_INHERITANCE @@ -1464,17 +1852,20 @@ ipc_kmsg_send( } #endif /* IMPORTANCE_INHERITANCE */ - /* - * We have a valid message and a valid reference on the port. - * we can unlock the port and call mqueue_send() on its message - * queue. Lock message queue while port is locked. - */ - s = splsched(); - imq_lock(&port->ip_messages); - ip_unlock(port); + if (error != MACH_MSG_SUCCESS) { + ip_unlock(port); + } else { + /* + * We have a valid message and a valid reference on the port. 
+ * we can unlock the port and call mqueue_send() on its message + * queue. Lock message queue while port is locked. + */ + imq_lock(&port->ip_messages); + ip_unlock(port); - error = ipc_mqueue_send(&port->ip_messages, kmsg, option, - send_timeout, s); + error = ipc_mqueue_send(&port->ip_messages, kmsg, option, + send_timeout); + } #if IMPORTANCE_INHERITANCE if (did_importance == TRUE) { @@ -1519,9 +1910,14 @@ ipc_kmsg_send( * as a successful delivery (like we do for an inactive port). */ if (error == MACH_SEND_INVALID_DEST) { +#if MACH_FLIPC + if (MACH_NODE_VALID(kmsg->ikm_node) && FPORT_VALID(port->ip_messages.imq_fport)) + flipc_msg_ack(kmsg->ikm_node, &port->ip_messages, FALSE); +#endif ip_release(port); /* JMM - Future: release right, not just ref */ kmsg->ikm_header->msgh_remote_port = MACH_PORT_NULL; ipc_kmsg_destroy(kmsg); + KDBG(MACHDBG_CODE(DBG_MACH_IPC,MACH_IPC_KMSG_INFO) | DBG_FUNC_END, MACH_SEND_INVALID_DEST); return MACH_MSG_SUCCESS; } @@ -1531,9 +1927,14 @@ ipc_kmsg_send( * pseudo-receive on error conditions. We need to just treat * the message as a successful delivery. 
*/ +#if MACH_FLIPC + if (MACH_NODE_VALID(kmsg->ikm_node) && FPORT_VALID(port->ip_messages.imq_fport)) + flipc_msg_ack(kmsg->ikm_node, &port->ip_messages, FALSE); +#endif ip_release(port); /* JMM - Future: release right, not just ref */ kmsg->ikm_header->msgh_remote_port = MACH_PORT_NULL; ipc_kmsg_destroy(kmsg); + KDBG(MACHDBG_CODE(DBG_MACH_IPC,MACH_IPC_KMSG_INFO) | DBG_FUNC_END, error); return MACH_MSG_SUCCESS; } return error; @@ -1555,10 +1956,14 @@ ipc_kmsg_send( mach_msg_return_t ipc_kmsg_put( - mach_vm_address_t msg_addr, ipc_kmsg_t kmsg, - mach_msg_size_t size) + mach_msg_option_t option, + mach_vm_address_t rcv_addr, + mach_msg_size_t rcv_size, + mach_msg_size_t trailer_size, + mach_msg_size_t *sizep) { + mach_msg_size_t size = kmsg->ikm_header->msgh_size + trailer_size; mach_msg_return_t mr; DEBUG_IPC_KMSG_PRINT(kmsg, "ipc_kmsg_put()"); @@ -1614,12 +2019,29 @@ ipc_kmsg_put( kprintf("type: %d\n", ((mach_msg_type_descriptor_t *)(((mach_msg_base_t *)kmsg->ikm_header)+1))->type); } __unreachable_ok_pop - if (copyoutmsg((const char *) kmsg->ikm_header, msg_addr, size)) + + /* Re-Compute target address if using stack-style delivery */ + if (option & MACH_RCV_STACK) { + rcv_addr += rcv_size - size; + } + + if (copyoutmsg((const char *) kmsg->ikm_header, rcv_addr, size)) { mr = MACH_RCV_INVALID_DATA; - else + size = 0; + } else mr = MACH_MSG_SUCCESS; + KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_IPC,MACH_IPC_KMSG_LINK) | DBG_FUNC_NONE, + (rcv_addr >= VM_MIN_KERNEL_AND_KEXT_ADDRESS || + rcv_addr + size >= VM_MIN_KERNEL_AND_KEXT_ADDRESS) ? 
(uintptr_t)0 : (uintptr_t)rcv_addr, + VM_KERNEL_ADDRPERM((uintptr_t)kmsg), + 1 /* this is on the receive/copyout path */, + 0, + 0); ipc_kmsg_free(kmsg); + + if (sizep) + *sizep = size; return mr; } @@ -1644,6 +2066,33 @@ ipc_kmsg_put_to_kernel( ipc_kmsg_free(kmsg); } +unsigned long pthread_priority_canonicalize(unsigned long priority, boolean_t propagation); + +static void +ipc_kmsg_set_qos( + ipc_kmsg_t kmsg, + mach_msg_option_t options, + mach_msg_priority_t override) +{ + kern_return_t kr; + + kr = ipc_get_pthpriority_from_kmsg_voucher(kmsg, &kmsg->ikm_qos); + if (kr != KERN_SUCCESS) { + kmsg->ikm_qos = MACH_MSG_PRIORITY_UNSPECIFIED; + } + kmsg->ikm_qos_override = kmsg->ikm_qos; + + if (options & MACH_SEND_OVERRIDE) { + unsigned long canonical; + mach_msg_priority_t canon; + + canonical = pthread_priority_canonicalize(override, TRUE); + canon = (mach_msg_priority_t)canonical; + if (canon > kmsg->ikm_qos) + kmsg->ikm_qos_override = canon; + } +} + /* * Routine: ipc_kmsg_copyin_header * Purpose: @@ -1672,6 +2121,7 @@ mach_msg_return_t ipc_kmsg_copyin_header( ipc_kmsg_t kmsg, ipc_space_t space, + mach_msg_priority_t override, mach_msg_option_t *optionp) { mach_msg_header_t *msg = kmsg->ikm_header; @@ -1974,16 +2424,6 @@ ipc_kmsg_copyin_header( voucher_entry = IE_NULL; } - /* - * No room to store voucher port in in-kernel msg header, - * so we store it back in the kmsg itself. 
- */ - if (IP_VALID(voucher_port)) { - assert(ip_active(voucher_port)); - kmsg->ikm_voucher = voucher_port; - voucher_type = MACH_MSG_TYPE_MOVE_SEND; - } - dest_type = ipc_object_copyin_type(dest_type); reply_type = ipc_object_copyin_type(reply_type); @@ -2004,12 +2444,16 @@ ipc_kmsg_copyin_header( if (ip_full(dport)) { #if IMPORTANCE_INHERITANCE needboost = ipc_port_request_sparm(dport, dest_name, - dest_entry->ie_request, - (*optionp & MACH_SEND_NOIMPORTANCE)); + dest_entry->ie_request, + *optionp, + override); if (needboost == FALSE) ip_unlock(dport); #else - ipc_port_request_sparm(dport, dest_name, dest_entry->ie_request); + ipc_port_request_sparm(dport, dest_name, + dest_entry->ie_request, + *optionp, + override); ip_unlock(dport); #endif /* IMPORTANCE_INHERITANCE */ } else { @@ -2048,6 +2492,21 @@ ipc_kmsg_copyin_header( if (voucher_soright != IP_NULL) { ipc_notify_port_deleted(voucher_soright, voucher_name); } + + /* + * No room to store voucher port in in-kernel msg header, + * so we store it back in the kmsg itself. Extract the + * qos, and apply any override before we enqueue the kmsg. 
+ */ + if (IP_VALID(voucher_port)) { + + kmsg->ikm_voucher = voucher_port; + voucher_type = MACH_MSG_TYPE_MOVE_SEND; + } + + /* capture the qos value(s) for the kmsg */ + ipc_kmsg_set_qos(kmsg, *optionp, override); + msg->msgh_bits = MACH_MSGH_BITS_SET(dest_type, reply_type, voucher_type, mbits); msg->msgh_remote_port = (ipc_port_t)dest_port; msg->msgh_local_port = (ipc_port_t)reply_port; @@ -2637,13 +3096,14 @@ ipc_kmsg_copyin( ipc_kmsg_t kmsg, ipc_space_t space, vm_map_t map, + mach_msg_priority_t override, mach_msg_option_t *optionp) { mach_msg_return_t mr; kmsg->ikm_header->msgh_bits &= MACH_MSGH_BITS_USER; - mr = ipc_kmsg_copyin_header(kmsg, space, optionp); + mr = ipc_kmsg_copyin_header(kmsg, space, override, optionp); if (mr != MACH_MSG_SUCCESS) return mr; @@ -3440,13 +3900,13 @@ ipc_kmsg_copyout_ool_descriptor(mach_msg_ool_descriptor_t *dsc, mach_msg_descrip vm_map_copy_t copy; vm_map_address_t rcv_addr; mach_msg_copy_options_t copy_options; - mach_msg_size_t size; + vm_map_size_t size; mach_msg_descriptor_type_t dsc_type; //SKIP_PORT_DESCRIPTORS(saddr, sdsc_count); - copy = (vm_map_copy_t) dsc->address; - size = dsc->size; + copy = (vm_map_copy_t)dsc->address; + size = (vm_map_size_t)dsc->size; copy_options = dsc->copy; assert(copy_options != MACH_MSG_KALLOC_COPY_T); dsc_type = dsc->type; @@ -3455,10 +3915,10 @@ ipc_kmsg_copyout_ool_descriptor(mach_msg_ool_descriptor_t *dsc, mach_msg_descrip kern_return_t kr; rcv_addr = 0; - if (vm_map_copy_validate_size(map, copy, (vm_map_size_t)size) == FALSE) + if (vm_map_copy_validate_size(map, copy, &size) == FALSE) panic("Inconsistent OOL/copyout size on %p: expected %d, got %lld @%p", - dsc, size, (unsigned long long)copy->size, copy); - kr = vm_map_copyout(map, &rcv_addr, copy); + dsc, dsc->size, (unsigned long long)copy->size, copy); + kr = vm_map_copyout_size(map, &rcv_addr, copy, size); if (kr != KERN_SUCCESS) { if (kr == KERN_RESOURCE_SHORTAGE) *mr |= MACH_MSG_VM_KERNEL; @@ -3489,7 +3949,7 @@ 
ipc_kmsg_copyout_ool_descriptor(mach_msg_ool_descriptor_t *dsc, mach_msg_descrip TRUE : FALSE; user_ool_dsc->copy = copy_options; user_ool_dsc->type = dsc_type; - user_ool_dsc->size = size; + user_ool_dsc->size = (mach_msg_size_t)size; user_dsc = (typeof(user_dsc))user_ool_dsc; } else if (is_64bit) { @@ -3501,7 +3961,7 @@ ipc_kmsg_copyout_ool_descriptor(mach_msg_ool_descriptor_t *dsc, mach_msg_descrip TRUE : FALSE; user_ool_dsc->copy = copy_options; user_ool_dsc->type = dsc_type; - user_ool_dsc->size = size; + user_ool_dsc->size = (mach_msg_size_t)size; user_dsc = (typeof(user_dsc))user_ool_dsc; } else { @@ -3509,7 +3969,7 @@ ipc_kmsg_copyout_ool_descriptor(mach_msg_ool_descriptor_t *dsc, mach_msg_descrip user_ool_dsc--; user_ool_dsc->address = CAST_DOWN_EXPLICIT(uint32_t, rcv_addr); - user_ool_dsc->size = size; + user_ool_dsc->size = (mach_msg_size_t)size; user_ool_dsc->deallocate = (copy_options == MACH_MSG_VIRTUAL_COPY) ? TRUE : FALSE; user_ool_dsc->copy = copy_options; diff --git a/osfmk/ipc/ipc_kmsg.h b/osfmk/ipc/ipc_kmsg.h index c020a3d39..6e11487c5 100644 --- a/osfmk/ipc/ipc_kmsg.h +++ b/osfmk/ipc/ipc_kmsg.h @@ -80,6 +80,7 @@ #include #include #include +#include /* * This structure is only the header for a kmsg buffer; @@ -94,14 +95,19 @@ */ struct ipc_kmsg { - mach_msg_size_t ikm_size; - struct ipc_kmsg *ikm_next; /* next message on port/discard queue */ - struct ipc_kmsg *ikm_prev; /* prev message on port/discard queue */ - mach_msg_header_t *ikm_header; - ipc_port_t ikm_prealloc; /* port we were preallocated from */ - ipc_port_t ikm_voucher; /* voucher port carried */ - struct ipc_importance_elem *ikm_importance;/* inherited from */ - queue_chain_t ikm_inheritance;/* inherited from link */ + mach_msg_size_t ikm_size; + struct ipc_kmsg *ikm_next; /* next message on port/discard queue */ + struct ipc_kmsg *ikm_prev; /* prev message on port/discard queue */ + mach_msg_header_t *ikm_header; + ipc_port_t ikm_prealloc; /* port we were preallocated from */ + 
ipc_port_t ikm_voucher; /* voucher port carried */ + mach_msg_priority_t ikm_qos; /* qos of this kmsg */ + mach_msg_priority_t ikm_qos_override; /* qos override on this kmsg */ + struct ipc_importance_elem *ikm_importance; /* inherited from */ + queue_chain_t ikm_inheritance; /* inherited from link */ +#if MACH_FLIPC + struct mach_node *ikm_node; /* Originating node - needed for ack */ +#endif }; #if defined(__i386__) || defined(__arm__) @@ -146,13 +152,22 @@ MACRO_BEGIN \ (kmsg)->ikm_prealloc = IP_NULL; \ MACRO_END -#define ikm_init(kmsg, size) \ -MACRO_BEGIN \ - (kmsg)->ikm_size = (size); \ - (kmsg)->ikm_prealloc = IP_NULL; \ - (kmsg)->ikm_voucher = IP_NULL; \ - (kmsg)->ikm_importance = IIE_NULL; \ - assert((kmsg)->ikm_prev = (kmsg)->ikm_next = IKM_BOGUS); \ +#if MACH_FLIPC +#define ikm_flipc_init(kmsg) (kmsg)->ikm_node = MACH_NODE_NULL +#else +#define ikm_flipc_init(kmsg) +#endif + +#define ikm_init(kmsg, size) \ +MACRO_BEGIN \ + (kmsg)->ikm_size = (size); \ + (kmsg)->ikm_prealloc = IP_NULL; \ + (kmsg)->ikm_voucher = IP_NULL; \ + (kmsg)->ikm_importance = IIE_NULL; \ + (kmsg)->ikm_qos = MACH_MSG_PRIORITY_UNSPECIFIED; \ + (kmsg)->ikm_qos_override = MACH_MSG_PRIORITY_UNSPECIFIED; \ + ikm_flipc_init(kmsg); \ + assert((kmsg)->ikm_prev = (kmsg)->ikm_next = IKM_BOGUS); \ MACRO_END #define ikm_check_init(kmsg, size) \ @@ -193,6 +208,15 @@ extern void ipc_kmsg_enqueue( ipc_kmsg_queue_t queue, ipc_kmsg_t kmsg); +extern boolean_t ipc_kmsg_enqueue_qos( + ipc_kmsg_queue_t queue, + ipc_kmsg_t kmsg); + +extern boolean_t ipc_kmsg_override_qos( + ipc_kmsg_queue_t queue, + ipc_kmsg_t kmsg, + mach_msg_priority_t override); + /* Dequeue and return a kmsg */ extern ipc_kmsg_t ipc_kmsg_dequeue( ipc_kmsg_queue_t queue); @@ -202,6 +226,11 @@ extern void ipc_kmsg_rmqueue( ipc_kmsg_queue_t queue, ipc_kmsg_t kmsg); +/* Pull the (given) first kmsg out of a queue */ +extern void ipc_kmsg_rmqueue_first( + ipc_kmsg_queue_t queue, + ipc_kmsg_t kmsg); + #define ipc_kmsg_queue_first(queue) 
((queue)->ikmq_base) /* Return the kmsg following the given kmsg */ @@ -209,46 +238,6 @@ extern ipc_kmsg_t ipc_kmsg_queue_next( ipc_kmsg_queue_t queue, ipc_kmsg_t kmsg); -#define ipc_kmsg_rmqueue_first_macro(queue, kmsg) \ -MACRO_BEGIN \ - register ipc_kmsg_t _next; \ - \ - assert((queue)->ikmq_base == (kmsg)); \ - \ - _next = (kmsg)->ikm_next; \ - if (_next == (kmsg)) { \ - assert((kmsg)->ikm_prev == (kmsg)); \ - (queue)->ikmq_base = IKM_NULL; \ - } else { \ - register ipc_kmsg_t _prev = (kmsg)->ikm_prev; \ - \ - (queue)->ikmq_base = _next; \ - _next->ikm_prev = _prev; \ - _prev->ikm_next = _next; \ - } \ - /* XXX Debug paranoia ASSIGNMENTS */ \ - assert(kmsg->ikm_next = IKM_BOGUS); \ - assert(kmsg->ikm_prev = IKM_BOGUS); \ -MACRO_END - -#define ipc_kmsg_enqueue_macro(queue, kmsg) \ -MACRO_BEGIN \ - register ipc_kmsg_t _first = (queue)->ikmq_base; \ - \ - if (_first == IKM_NULL) { \ - (queue)->ikmq_base = (kmsg); \ - (kmsg)->ikm_next = (kmsg); \ - (kmsg)->ikm_prev = (kmsg); \ - } else { \ - register ipc_kmsg_t _last = _first->ikm_prev; \ - \ - (kmsg)->ikm_next = _first; \ - (kmsg)->ikm_prev = _last; \ - _first->ikm_prev = (kmsg); \ - _last->ikm_next = (kmsg); \ - } \ -MACRO_END - /* Allocate a kernel message */ extern ipc_kmsg_t ipc_kmsg_alloc( mach_msg_size_t size); @@ -302,9 +291,12 @@ extern mach_msg_return_t ipc_kmsg_send( /* Copy a kernel message buffer to a user message */ extern mach_msg_return_t ipc_kmsg_put( - mach_vm_address_t msg_addr, ipc_kmsg_t kmsg, - mach_msg_size_t size); + mach_msg_option_t option, + mach_vm_address_t rcv_addr, + mach_msg_size_t rcv_size, + mach_msg_size_t trailer_size, + mach_msg_size_t *size); /* Copy a kernel message buffer to a kernel message */ extern void ipc_kmsg_put_to_kernel( @@ -316,6 +308,7 @@ extern void ipc_kmsg_put_to_kernel( extern mach_msg_return_t ipc_kmsg_copyin_header( ipc_kmsg_t kmsg, ipc_space_t space, + mach_msg_priority_t override, mach_msg_option_t *optionp); /* Copyin port rights and out-of-line memory 
from a user message */ @@ -323,6 +316,7 @@ extern mach_msg_return_t ipc_kmsg_copyin( ipc_kmsg_t kmsg, ipc_space_t space, vm_map_t map, + mach_msg_priority_t override, mach_msg_option_t *optionp); /* Copyin port rights and out-of-line memory from a kernel message */ @@ -397,5 +391,12 @@ ipc_kmsg_add_trailer(ipc_kmsg_t kmsg, ipc_space_t space, mach_port_seqno_t seqno, boolean_t minimal_trailer, mach_vm_offset_t context); +#if (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) +extern void ipc_kmsg_trace_send(ipc_kmsg_t kmsg, + mach_msg_option_t option); +#else +#define ipc_kmsg_trace_send(a,b) do { } while (0) +#endif + #endif /* _IPC_IPC_KMSG_H_ */ diff --git a/osfmk/ipc/ipc_mqueue.c b/osfmk/ipc/ipc_mqueue.c index 1b5d82c19..c85da50bc 100644 --- a/osfmk/ipc/ipc_mqueue.c +++ b/osfmk/ipc/ipc_mqueue.c @@ -90,15 +90,27 @@ #include #include +#if MACH_FLIPC +#include +#endif + #ifdef __LP64__ #include #endif +#include + +extern char *proc_name_address(void *p); + int ipc_mqueue_full; /* address is event for queue space */ int ipc_mqueue_rcv; /* address is event for message arrival */ /* forward declarations */ void ipc_mqueue_receive_results(wait_result_t result); +static void ipc_mqueue_peek_on_thread( + ipc_mqueue_t port_mq, + mach_msg_option_t option, + thread_t thread); /* * Routine: ipc_mqueue_init @@ -113,16 +125,20 @@ ipc_mqueue_init( { if (is_set) { waitq_set_init(&mqueue->imq_set_queue, - SYNC_POLICY_FIFO|SYNC_POLICY_PREPOST|SYNC_POLICY_DISABLE_IRQ, - reserved_link); + SYNC_POLICY_FIFO|SYNC_POLICY_PREPOST, + reserved_link, NULL); } else { - waitq_init(&mqueue->imq_wait_queue, SYNC_POLICY_FIFO|SYNC_POLICY_DISABLE_IRQ); + waitq_init(&mqueue->imq_wait_queue, SYNC_POLICY_FIFO); ipc_kmsg_queue_init(&mqueue->imq_messages); mqueue->imq_seqno = 0; mqueue->imq_msgcount = 0; mqueue->imq_qlimit = MACH_PORT_QLIMIT_DEFAULT; mqueue->imq_fullwaiters = FALSE; +#if MACH_FLIPC + mqueue->imq_fport = FPORT_NULL; +#endif } + klist_init(&mqueue->imq_klist); } void ipc_mqueue_deinit( @@ -146,10 
+162,10 @@ void ipc_mqueue_deinit( * mq is unlocked */ void -imq_reserve_and_lock(ipc_mqueue_t mq, uint64_t *reserved_prepost, spl_t *spl) +imq_reserve_and_lock(ipc_mqueue_t mq, uint64_t *reserved_prepost) { *reserved_prepost = waitq_prepost_reserve(&mq->imq_wait_queue, 0, - WAITQ_KEEP_LOCKED, spl); + WAITQ_KEEP_LOCKED); } @@ -163,11 +179,10 @@ imq_reserve_and_lock(ipc_mqueue_t mq, uint64_t *reserved_prepost, spl_t *spl) * mq is locked */ void -imq_release_and_unlock(ipc_mqueue_t mq, uint64_t reserved_prepost, spl_t spl) +imq_release_and_unlock(ipc_mqueue_t mq, uint64_t reserved_prepost) { assert(imq_held(mq)); waitq_unlock(&mq->imq_wait_queue); - splx(spl); waitq_prepost_release_reserve(reserved_prepost); } @@ -220,14 +235,20 @@ ipc_mqueue_remove( * Remove the mqueue from all the sets it is a member of * Conditions: * Nothing locked. + * Returns: + * mqueue unlocked and set links deallocated */ void ipc_mqueue_remove_from_all(ipc_mqueue_t mqueue) { struct waitq *mq_waitq = &mqueue->imq_wait_queue; + kern_return_t kr; - waitq_unlink_all(mq_waitq); - return; + imq_lock(mqueue); + + assert(waitq_valid(mq_waitq)); + kr = waitq_unlink_all_unlock(mq_waitq); + /* mqueue unlocked and set links deallocated */ } /* @@ -237,13 +258,18 @@ ipc_mqueue_remove_from_all(ipc_mqueue_t mqueue) * Also removes the queue from any containing sets. * Conditions: * Nothing locked. 
+ * Returns: + * mqueue unlocked all set links deallocated */ void ipc_mqueue_remove_all(ipc_mqueue_t mqueue) { struct waitq_set *mq_setq = &mqueue->imq_set_queue; - waitq_set_unlink_all(mq_setq); - return; + + imq_lock(mqueue); + assert(waitqs_is_set(mq_setq)); + waitq_set_unlink_all_unlock(mq_setq); + /* mqueue unlocked set links deallocated */ } @@ -270,11 +296,9 @@ ipc_mqueue_add( ipc_kmsg_queue_t kmsgq; ipc_kmsg_t kmsg, next; kern_return_t kr; - spl_t s; assert(reserved_link && *reserved_link != 0); - s = splsched(); imq_lock(port_mqueue); /* @@ -284,7 +308,6 @@ ipc_mqueue_add( kr = waitq_link(port_waitq, set_waitq, WAITQ_ALREADY_LOCKED, reserved_link); if (kr != KERN_SUCCESS) { imq_unlock(port_mqueue); - splx(s); return kr; } @@ -304,11 +327,12 @@ ipc_mqueue_add( mach_msg_size_t msize; spl_t th_spl; - th = waitq_wakeup64_identity_locked( + th = waitq_wakeup64_identify_locked( port_waitq, IPC_MQUEUE_RECEIVE, THREAD_AWAKENED, &th_spl, - reserved_prepost, WAITQ_KEEP_LOCKED); + reserved_prepost, WAITQ_ALL_PRIORITIES, + WAITQ_KEEP_LOCKED); /* waitq/mqueue still locked, thread locked */ if (th == THREAD_NULL) @@ -321,9 +345,20 @@ ipc_mqueue_add( * go look for another thread that can. */ if (th->ith_state != MACH_RCV_IN_PROGRESS) { - thread_unlock(th); - splx(th_spl); - continue; + if (th->ith_state == MACH_PEEK_IN_PROGRESS) { + /* + * wakeup the peeking thread, but + * continue to loop over the threads + * waiting on the port's mqueue to see + * if there are any actual receivers + */ + ipc_mqueue_peek_on_thread(port_mqueue, + th->ith_option, + th); + } + thread_unlock(th); + splx(th_spl); + continue; } /* @@ -335,7 +370,7 @@ ipc_mqueue_add( * just move onto the next. */ msize = ipc_kmsg_copyout_size(kmsg, th->map); - if (th->ith_msize < + if (th->ith_rsize < (msize + REQUESTED_TRAILER_SIZE(thread_is_64bit(th), th->ith_option))) { th->ith_state = MACH_RCV_TOO_LARGE; th->ith_msize = msize; @@ -359,18 +394,24 @@ ipc_mqueue_add( * so give it to him. 
*/ ipc_kmsg_rmqueue(kmsgq, kmsg); +#if MACH_FLIPC + mach_node_t node = kmsg->ikm_node; +#endif ipc_mqueue_release_msgcount(port_mqueue, IMQ_NULL); th->ith_kmsg = kmsg; th->ith_seqno = port_mqueue->imq_seqno++; thread_unlock(th); splx(th_spl); +#if MACH_FLIPC + if (MACH_NODE_VALID(node) && FPORT_VALID(port_mqueue->imq_fport)) + flipc_msg_ack(node, port_mqueue, TRUE); +#endif break; /* go to next message */ } } leave: imq_unlock(port_mqueue); - splx(s); return KERN_SUCCESS; } @@ -386,6 +427,9 @@ void ipc_mqueue_changed( ipc_mqueue_t mqueue) { + /* Indicate that this message queue is vanishing */ + knote_vanish(&mqueue->imq_klist); + waitq_wakeup64_all_locked(&mqueue->imq_wait_queue, IPC_MQUEUE_RECEIVE, THREAD_RESTART, @@ -419,8 +463,7 @@ ipc_mqueue_send( ipc_mqueue_t mqueue, ipc_kmsg_t kmsg, mach_msg_option_t option, - mach_msg_timeout_t send_timeout, - spl_t s) + mach_msg_timeout_t send_timeout) { int wresult; @@ -438,7 +481,6 @@ ipc_mqueue_send( mqueue->imq_msgcount++; assert(mqueue->imq_msgcount > 0); imq_unlock(mqueue); - splx(s); } else { thread_t cur_thread = current_thread(); uint64_t deadline; @@ -448,16 +490,14 @@ ipc_mqueue_send( */ if ((option & MACH_SEND_TIMEOUT) && (send_timeout == 0)) { imq_unlock(mqueue); - splx(s); return MACH_SEND_TIMED_OUT; } if (imq_full_kernel(mqueue)) { imq_unlock(mqueue); - splx(s); return MACH_SEND_NO_BUFFER; } mqueue->imq_fullwaiters = TRUE; - thread_lock(cur_thread); + if (option & MACH_SEND_TIMEOUT) clock_interval_to_deadline(send_timeout, 1000*NSEC_PER_USEC, &deadline); else @@ -469,9 +509,8 @@ ipc_mqueue_send( TIMEOUT_URGENCY_USER_NORMAL, deadline, TIMEOUT_NO_LEEWAY, cur_thread); - thread_unlock(cur_thread); + imq_unlock(mqueue); - splx(s); if (wresult == THREAD_WAITING) { wresult = thread_block(THREAD_CONTINUE_NULL); @@ -503,10 +542,58 @@ ipc_mqueue_send( } } - ipc_mqueue_post(mqueue, kmsg); + ipc_mqueue_post(mqueue, kmsg, option); return MACH_MSG_SUCCESS; } +/* + * Routine: ipc_mqueue_override_send + * Purpose: + * Set 
an override qos on the first message in the queue + * (if the queue is full). This is a send-possible override + * that will go away as soon as we drain a message from the + * queue. + * + * Conditions: + * The message queue is not locked. + * The caller holds a reference on the message queue. + */ +extern void ipc_mqueue_override_send( + ipc_mqueue_t mqueue, + mach_msg_priority_t override) +{ + boolean_t __unused full_queue_empty = FALSE; + + imq_lock(mqueue); + assert(imq_valid(mqueue)); + assert(!imq_is_set(mqueue)); + + if (imq_full(mqueue)) { + ipc_kmsg_t first = ipc_kmsg_queue_first(&mqueue->imq_messages); + + if (first && ipc_kmsg_override_qos(&mqueue->imq_messages, first, override)) + KNOTE(&mqueue->imq_klist, 0); + if (!first) + full_queue_empty = TRUE; + } + imq_unlock(mqueue); + +#if DEVELOPMENT || DEBUG + if (full_queue_empty) { + ipc_port_t port = ip_from_mq(mqueue); + int dst_pid = 0; + if (ip_active(port) && !port->ip_tempowner && + port->ip_receiver_name && port->ip_receiver && + port->ip_receiver != ipc_space_kernel) { + dst_pid = task_pid(port->ip_receiver->is_task); + } + printf("%s[%d] could not override mqueue (dst:%d) with 0x%x: " + "queue slots are full, but there are no messages!\n", + proc_name_address(current_task()->bsd_info), + task_pid(current_task()), dst_pid, override); + } +#endif +} /* * Routine: ipc_mqueue_release_msgcount @@ -552,7 +639,7 @@ ipc_mqueue_release_msgcount(ipc_mqueue_t port_mq, ipc_mqueue_t set_mq) if (ipc_kmsg_queue_empty(&port_mq->imq_messages)) { /* no more msgs: invalidate the port's prepost object */ - waitq_clear_prepost_locked(&port_mq->imq_wait_queue, NULL); + waitq_clear_prepost_locked(&port_mq->imq_wait_queue); } } @@ -569,11 +656,14 @@ ipc_mqueue_release_msgcount(ipc_mqueue_t port_mq, ipc_mqueue_t set_mq) */ void ipc_mqueue_post( - register ipc_mqueue_t mqueue, - register ipc_kmsg_t kmsg) + ipc_mqueue_t mqueue, + ipc_kmsg_t kmsg, + mach_msg_option_t __unused option) { - spl_t s; uint64_t reserved_prepost = 
0; + boolean_t destroy_msg = FALSE; + + ipc_kmsg_trace_send(kmsg, option); /* * While the msg queue is locked, we have control of the @@ -581,28 +671,49 @@ ipc_mqueue_post( * * Check for a receiver for the message. */ - imq_reserve_and_lock(mqueue, &reserved_prepost, &s); + imq_reserve_and_lock(mqueue, &reserved_prepost); + + /* we may have raced with port destruction! */ + if (!imq_valid(mqueue)) { + destroy_msg = TRUE; + goto out_unlock; + } + for (;;) { struct waitq *waitq = &mqueue->imq_wait_queue; spl_t th_spl; thread_t receiver; mach_msg_size_t msize; - receiver = waitq_wakeup64_identity_locked(waitq, - IPC_MQUEUE_RECEIVE, - THREAD_AWAKENED, - &th_spl, - &reserved_prepost, - WAITQ_KEEP_LOCKED); + receiver = waitq_wakeup64_identify_locked(waitq, + IPC_MQUEUE_RECEIVE, + THREAD_AWAKENED, + &th_spl, + &reserved_prepost, + WAITQ_ALL_PRIORITIES, + WAITQ_KEEP_LOCKED); /* waitq still locked, thread locked */ if (receiver == THREAD_NULL) { - + /* - * no receivers; queue kmsg if space still reserved. + * no receivers; queue kmsg if space still reserved + * Reservations are cancelled when the port goes inactive. + * note that this will enqueue the message for any + * "peeking" receivers. + * + * Also, post the knote to wake up any threads waiting + * on that style of interface if this insertion is of + * note (first insertion, or adjusted override qos all + * the way to the head of the queue). + * + * This is just for ports. portset knotes are stay-active, + * and their threads get awakened through the !MACH_RCV_IN_PROGRESS + * logic below). */ if (mqueue->imq_msgcount > 0) { - ipc_kmsg_enqueue_macro(&mqueue->imq_messages, kmsg); + if (ipc_kmsg_enqueue_qos(&mqueue->imq_messages, kmsg)) + KNOTE(&mqueue->imq_klist, 0); break; } @@ -610,24 +721,38 @@ ipc_mqueue_post( * Otherwise, the message queue must belong to an inactive * port, so just destroy the message and pretend it was posted. 
*/ - /* clear the waitq boost we may have been given */ - waitq_clear_promotion_locked(waitq, current_thread()); - imq_release_and_unlock(mqueue, reserved_prepost, s); - ipc_kmsg_destroy(kmsg); - current_task()->messages_sent++; - return; + destroy_msg = TRUE; + goto out_unlock; } /* - * If the receiver waited with a facility not directly - * related to Mach messaging, then it isn't prepared to get - * handed the message directly. Just set it running, and - * go look for another thread that can. + * If a thread is attempting a "peek" into the message queue + * (MACH_PEEK_IN_PROGRESS), then we enqueue the message and set the + * thread running. A successful peek is essentially the same as + * message delivery since the peeking thread takes responsibility + * for delivering the message and (eventually) removing it from + * the mqueue. Only one thread can successfully use the peek + * facility on any given port, so we exit the waitq loop after + * encountering such a thread. + */ + if (receiver->ith_state == MACH_PEEK_IN_PROGRESS && mqueue->imq_msgcount > 0) { + ipc_kmsg_enqueue_qos(&mqueue->imq_messages, kmsg); + ipc_mqueue_peek_on_thread(mqueue, receiver->ith_option, receiver); + thread_unlock(receiver); + splx(th_spl); + break; /* Message was posted, so break out of loop */ + } + + /* + * If the receiver waited with a facility not directly related + * to Mach messaging, then it isn't prepared to get handed the + * message directly. Just set it running, and go look for + * another thread that can. */ if (receiver->ith_state != MACH_RCV_IN_PROGRESS) { - thread_unlock(receiver); - splx(th_spl); - continue; + thread_unlock(receiver); + splx(th_spl); + continue; } @@ -637,7 +762,7 @@ ipc_mqueue_post( * the thread we wake up will get that as its status. 
*/ msize = ipc_kmsg_copyout_size(kmsg, receiver->map); - if (receiver->ith_msize < + if (receiver->ith_rsize < (msize + REQUESTED_TRAILER_SIZE(thread_is_64bit(receiver), receiver->ith_option))) { receiver->ith_msize = msize; receiver->ith_state = MACH_RCV_TOO_LARGE; @@ -652,14 +777,21 @@ ipc_mqueue_post( */ if ((receiver->ith_state == MACH_MSG_SUCCESS) || !(receiver->ith_option & MACH_RCV_LARGE)) { - receiver->ith_kmsg = kmsg; receiver->ith_seqno = mqueue->imq_seqno++; +#if MACH_FLIPC + mach_node_t node = kmsg->ikm_node; +#endif thread_unlock(receiver); splx(th_spl); /* we didn't need our reserved spot in the queue */ ipc_mqueue_release_msgcount(mqueue, IMQ_NULL); + +#if MACH_FLIPC + if (MACH_NODE_VALID(node) && FPORT_VALID(mqueue->imq_fport)) + flipc_msg_ack(node, mqueue, TRUE); +#endif break; } @@ -675,10 +807,13 @@ ipc_mqueue_post( splx(th_spl); } +out_unlock: /* clear the waitq boost we may have been given */ waitq_clear_promotion_locked(&mqueue->imq_wait_queue, current_thread()); - imq_release_and_unlock(mqueue, reserved_prepost, s); - + imq_release_and_unlock(mqueue, reserved_prepost); + if (destroy_msg) + ipc_kmsg_destroy(kmsg); + current_task()->messages_sent++; return; } @@ -727,6 +862,7 @@ ipc_mqueue_receive_results(wait_result_t saved_wait_result) } case MACH_MSG_SUCCESS: + case MACH_PEEK_READY: return; default: @@ -752,22 +888,17 @@ ipc_mqueue_receive_continue( * Purpose: * Receive a message from a message queue. * - * If continuation is non-zero, then we might discard - * our kernel stack when we block. We will continue - * after unblocking by executing continuation. - * - * If resume is true, then we are resuming a receive - * operation after a blocked receive discarded our stack. * Conditions: * Our caller must hold a reference for the port or port set * to which this queue belongs, to keep the queue * from being deallocated. * * The kmsg is returned with clean header fields - * and with the circular bit turned off. 
+ * and with the circular bit turned off through the ith_kmsg + * field of the thread's receive continuation state. * Returns: - * MACH_MSG_SUCCESS Message returned in kmsgp. - * MACH_RCV_TOO_LARGE Message size returned in kmsgp. + * MACH_MSG_SUCCESS Message returned in ith_kmsg. + * MACH_RCV_TOO_LARGE Message size returned in ith_msize. * MACH_RCV_TIMED_OUT No message obtained. * MACH_RCV_INTERRUPTED No message obtained. * MACH_RCV_PORT_DIED Port/set died; no message. @@ -784,13 +915,15 @@ ipc_mqueue_receive( int interruptible) { wait_result_t wresult; - thread_t self = current_thread(); - - wresult = ipc_mqueue_receive_on_thread(mqueue, option, max_size, - rcv_timeout, interruptible, - self); - if (wresult == THREAD_NOT_WAITING) - return; + thread_t self = current_thread(); + + imq_lock(mqueue); + wresult = ipc_mqueue_receive_on_thread(mqueue, option, max_size, + rcv_timeout, interruptible, + self); + /* mqueue unlocked */ + if (wresult == THREAD_NOT_WAITING) + return; if (wresult == THREAD_WAITING) { counter((interruptible == THREAD_ABORTSAFE) ? @@ -832,9 +965,21 @@ static int mqueue_process_prepost_receive(void *ctx, struct waitq *waitq, return WQ_ITERATE_BREAK_KEEP_LOCKED; } +/* + * Routine: ipc_mqueue_receive_on_thread + * Purpose: + * Receive a message from a message queue using a specified thread. + * If no message available, assert_wait on the appropriate waitq. + * + * Conditions: + * Assumes thread is self. + * Called with mqueue locked. + * Returns with mqueue unlocked. + * May have assert-waited. Caller must block in those cases. 
+ */ wait_result_t ipc_mqueue_receive_on_thread( - ipc_mqueue_t mqueue, + ipc_mqueue_t mqueue, mach_msg_option_t option, mach_msg_size_t max_size, mach_msg_timeout_t rcv_timeout, @@ -843,20 +988,28 @@ ipc_mqueue_receive_on_thread( { wait_result_t wresult; uint64_t deadline; - spl_t s; - s = splsched(); - imq_lock(mqueue); + /* called with mqueue locked */ + /* no need to reserve anything: we never prepost to anyone */ + + if (!imq_valid(mqueue)) { + /* someone raced us to destroy this mqueue/port! */ + imq_unlock(mqueue); + /* + * ipc_mqueue_receive_results updates the thread's ith_state + * TODO: differentiate between rights being moved and + * rights/ports being destroyed (21885327) + */ + return THREAD_RESTART; + } if (imq_is_set(mqueue)) { ipc_mqueue_t port_mq = IMQ_NULL; - spl_t set_spl; (void)waitq_set_iterate_preposts(&mqueue->imq_set_queue, &port_mq, - mqueue_process_prepost_receive, - &set_spl); + mqueue_process_prepost_receive); if (port_mq != IMQ_NULL) { /* @@ -870,27 +1023,20 @@ ipc_mqueue_receive_on_thread( */ imq_unlock(mqueue); - /* TODO: if/when port mqueues become non irq safe, - * we won't need this spl, and we should be - * able to call splx(s) (if that's even - * necessary). - * For now, we've still disabled interrupts via - * imq_reserve_and_lock(); - */ - splx(set_spl); - /* * Continue on to handling the message with just * the port mqueue locked. 
*/ - ipc_mqueue_select_on_thread(port_mq, mqueue, option, - max_size, thread); + if (option & MACH_PEEK_MSG) + ipc_mqueue_peek_on_thread(port_mq, option, thread); + else + ipc_mqueue_select_on_thread(port_mq, mqueue, option, + max_size, thread); imq_unlock(port_mq); - splx(s); return THREAD_NOT_WAITING; } - } else { + } else if (imq_is_queue(mqueue)) { ipc_kmsg_queue_t kmsgs; /* @@ -898,12 +1044,17 @@ ipc_mqueue_receive_on_thread( */ kmsgs = &mqueue->imq_messages; if (ipc_kmsg_queue_first(kmsgs) != IKM_NULL) { - ipc_mqueue_select_on_thread(mqueue, IMQ_NULL, option, - max_size, thread); + if (option & MACH_PEEK_MSG) + ipc_mqueue_peek_on_thread(mqueue, option, thread); + else + ipc_mqueue_select_on_thread(mqueue, IMQ_NULL, option, + max_size, thread); imq_unlock(mqueue); - splx(s); return THREAD_NOT_WAITING; } + } else { + panic("Unknown mqueue type 0x%x: likely memory corruption!\n", + mqueue->imq_wait_queue.waitq_type); } /* @@ -914,17 +1065,19 @@ ipc_mqueue_receive_on_thread( if (option & MACH_RCV_TIMEOUT) { if (rcv_timeout == 0) { imq_unlock(mqueue); - splx(s); thread->ith_state = MACH_RCV_TIMED_OUT; return THREAD_NOT_WAITING; } } - /* NOTE: need splsched() here if mqueue no longer needs irq disabled */ - thread_lock(thread); - thread->ith_state = MACH_RCV_IN_PROGRESS; thread->ith_option = option; - thread->ith_msize = max_size; + thread->ith_rsize = max_size; + thread->ith_msize = 0; + + if (option & MACH_PEEK_MSG) + thread->ith_state = MACH_PEEK_IN_PROGRESS; + else + thread->ith_state = MACH_RCV_IN_PROGRESS; if (option & MACH_RCV_TIMEOUT) clock_interval_to_deadline(rcv_timeout, 1000*NSEC_PER_USEC, &deadline); @@ -942,13 +1095,45 @@ ipc_mqueue_receive_on_thread( if (wresult == THREAD_AWAKENED) panic("ipc_mqueue_receive_on_thread: sleep walking"); - thread_unlock(thread); imq_unlock(mqueue); - splx(s); + return wresult; } +/* + * Routine: ipc_mqueue_peek_on_thread + * Purpose: + * A receiver discovered that there was a message on the queue + * before he had to 
block. Tell a thread about the message queue, + * but don't pick off any messages. + * Conditions: + * port_mq locked + * at least one message on port_mq's message queue + * + * Returns: (on thread->ith_state) + * MACH_PEEK_READY ith_peekq contains a message queue + */ +void +ipc_mqueue_peek_on_thread( + ipc_mqueue_t port_mq, + mach_msg_option_t option, + thread_t thread) +{ + (void)option; + assert(option & MACH_PEEK_MSG); + assert(ipc_kmsg_queue_first(&port_mq->imq_messages) != IKM_NULL); + + /* + * Take a reference on the mqueue's associated port: + * the peeking thread will be responsible to release this reference + * using ip_release_mq() + */ + ip_reference_mq(port_mq); + thread->ith_peekq = port_mq; + thread->ith_state = MACH_PEEK_READY; +} + /* * Routine: ipc_mqueue_select_on_thread * Purpose: @@ -975,7 +1160,7 @@ ipc_mqueue_select_on_thread( { ipc_kmsg_t kmsg; mach_msg_return_t mr = MACH_MSG_SUCCESS; - mach_msg_size_t rcv_size; + mach_msg_size_t msize; /* * Do some sanity checking of our ability to receive @@ -990,20 +1175,24 @@ ipc_mqueue_select_on_thread( * the queue, instead return the appropriate error * (and size needed). 
*/ - rcv_size = ipc_kmsg_copyout_size(kmsg, thread->map); - if (rcv_size + REQUESTED_TRAILER_SIZE(thread_is_64bit(thread), option) > max_size) { + msize = ipc_kmsg_copyout_size(kmsg, thread->map); + if (msize + REQUESTED_TRAILER_SIZE(thread_is_64bit(thread), option) > max_size) { mr = MACH_RCV_TOO_LARGE; if (option & MACH_RCV_LARGE) { thread->ith_receiver_name = port_mq->imq_receiver_name; thread->ith_kmsg = IKM_NULL; - thread->ith_msize = rcv_size; + thread->ith_msize = msize; thread->ith_seqno = 0; thread->ith_state = mr; return; } } - ipc_kmsg_rmqueue_first_macro(&port_mq->imq_messages, kmsg); + ipc_kmsg_rmqueue(&port_mq->imq_messages, kmsg); +#if MACH_FLIPC + if (MACH_NODE_VALID(kmsg->ikm_node) && FPORT_VALID(port_mq->imq_fport)) + flipc_msg_ack(kmsg->ikm_node, port_mq, TRUE); +#endif ipc_mqueue_release_msgcount(port_mq, set_mq); thread->ith_seqno = port_mq->imq_seqno++; thread->ith_kmsg = kmsg; @@ -1014,7 +1203,7 @@ ipc_mqueue_select_on_thread( } /* - * Routine: ipc_mqueue_peek + * Routine: ipc_mqueue_peek_locked * Purpose: * Peek at a (non-set) message queue to see if it has a message * matching the sequence number provided (if zero, then the @@ -1022,27 +1211,25 @@ ipc_mqueue_select_on_thread( * message. * * Conditions: - * Locks may be held by callers, so this routine cannot block. + * The ipc_mqueue_t is locked by callers. + * Other locks may be held by callers, so this routine cannot block. * Caller holds reference on the message queue. 
*/ unsigned -ipc_mqueue_peek(ipc_mqueue_t mq, - mach_port_seqno_t * seqnop, - mach_msg_size_t * msg_sizep, - mach_msg_id_t * msg_idp, - mach_msg_max_trailer_t * msg_trailerp) +ipc_mqueue_peek_locked(ipc_mqueue_t mq, + mach_port_seqno_t * seqnop, + mach_msg_size_t * msg_sizep, + mach_msg_id_t * msg_idp, + mach_msg_max_trailer_t * msg_trailerp, + ipc_kmsg_t *kmsgp) { ipc_kmsg_queue_t kmsgq; ipc_kmsg_t kmsg; mach_port_seqno_t seqno, msgoff; - int res = 0; - spl_t s; + unsigned res = 0; assert(!imq_is_set(mq)); - s = splsched(); - imq_lock(mq); - seqno = 0; if (seqnop != NULL) seqno = *seqnop; @@ -1077,14 +1264,80 @@ ipc_mqueue_peek(ipc_mqueue_t mq, (mach_msg_max_trailer_t *)((vm_offset_t)kmsg->ikm_header + round_msg(kmsg->ikm_header->msgh_size)), sizeof(mach_msg_max_trailer_t)); + if (kmsgp != NULL) + *kmsgp = kmsg; + res = 1; - out: +out: + return res; +} + + +/* + * Routine: ipc_mqueue_peek + * Purpose: + * Peek at a (non-set) message queue to see if it has a message + * matching the sequence number provided (if zero, then the + * first message in the queue) and return vital info about the + * message. + * + * Conditions: + * The ipc_mqueue_t is unlocked. + * Locks may be held by callers, so this routine cannot block. + * Caller holds reference on the message queue. + */ +unsigned +ipc_mqueue_peek(ipc_mqueue_t mq, + mach_port_seqno_t * seqnop, + mach_msg_size_t * msg_sizep, + mach_msg_id_t * msg_idp, + mach_msg_max_trailer_t * msg_trailerp, + ipc_kmsg_t *kmsgp) +{ + unsigned res; + + imq_lock(mq); + + res = ipc_mqueue_peek_locked(mq, seqnop, msg_sizep, msg_idp, + msg_trailerp, kmsgp); + imq_unlock(mq); - splx(s); return res; } +/* + * Routine: ipc_mqueue_release_peek_ref + * Purpose: + * Release the reference on an mqueue's associated port which was + * granted to a thread in ipc_mqueue_peek_on_thread (on the + * MACH_PEEK_MSG thread wakeup path). + * + * Conditions: + * The ipc_mqueue_t should be locked on entry. 
+ * The ipc_mqueue_t will be _unlocked_ on return + * (and potentially invalid!) + * + */ +void ipc_mqueue_release_peek_ref(ipc_mqueue_t mq) +{ + assert(!imq_is_set(mq)); + assert(imq_held(mq)); + + /* + * clear any preposts this mq may have generated + * (which would cause subsequent immediate wakeups) + */ + waitq_clear_prepost_locked(&mq->imq_wait_queue); + + imq_unlock(mq); + + /* + * release the port reference: we need to do this outside the lock + * because we might be holding the last port reference! + **/ + ip_release_mq(mq); +} /* * peek at the contained port message queues, break prepost iteration as soon @@ -1121,19 +1374,23 @@ static int mqueue_peek_iterator(void *ctx, struct waitq *waitq, unsigned ipc_mqueue_set_peek(ipc_mqueue_t mq) { - spl_t s; int ret; - assert(imq_is_set(mq)); - - s = splsched(); imq_lock(mq); + /* + * We may have raced with port destruction where the mqueue is marked + * as invalid. In that case, even though we don't have messages, we + * have an end-of-life event to deliver. + */ + if (!imq_is_valid(mq)) + return 1; + ret = waitq_set_iterate_preposts(&mq->imq_set_queue, NULL, - mqueue_peek_iterator, NULL); + mqueue_peek_iterator); imq_unlock(mq); - splx(s); + return (ret == WQ_ITERATE_BREAK); } @@ -1211,29 +1468,24 @@ ipc_mqueue_set_gather_member_names( /* - * Routine: ipc_mqueue_destroy + * Routine: ipc_mqueue_destroy_locked * Purpose: * Destroy a (non-set) message queue. * Set any blocked senders running. * Destroy the kmsgs in the queue. * Conditions: - * Nothing locked. 
+ * mqueue locked * Receivers were removed when the receive right was "changed" */ -void -ipc_mqueue_destroy( - ipc_mqueue_t mqueue) +boolean_t +ipc_mqueue_destroy_locked(ipc_mqueue_t mqueue) { ipc_kmsg_queue_t kmqueue; ipc_kmsg_t kmsg; boolean_t reap = FALSE; - spl_t s; assert(!imq_is_set(mqueue)); - s = splsched(); - imq_lock(mqueue); - /* * rouse all blocked senders * (don't boost anyone - we're tearing this queue down) @@ -1253,6 +1505,10 @@ ipc_mqueue_destroy( */ kmqueue = &mqueue->imq_messages; while ((kmsg = ipc_kmsg_dequeue(kmqueue)) != IKM_NULL) { +#if MACH_FLIPC + if (MACH_NODE_VALID(kmsg->ikm_node) && FPORT_VALID(mqueue->imq_fport)) + flipc_msg_ack(kmsg->ikm_node, mqueue, TRUE); +#endif boolean_t first; first = ipc_kmsg_delayed_destroy(kmsg); if (first) @@ -1267,26 +1523,20 @@ ipc_mqueue_destroy( */ mqueue->imq_msgcount = 0; - /* clear out any preposting we may have done */ - waitq_clear_prepost_locked(&mqueue->imq_wait_queue, &s); + /* invalidate the waitq for subsequent mqueue operations */ + waitq_invalidate_locked(&mqueue->imq_wait_queue); - imq_unlock(mqueue); - splx(s); + /* clear out any preposting we may have done */ + waitq_clear_prepost_locked(&mqueue->imq_wait_queue); /* - * assert that we're destroying a queue that's not a - * member of any other queue + * assert that we are destroying / invalidating a queue that's + * not a member of any other queue. */ - assert(mqueue->imq_wait_queue.waitq_prepost_id == 0); - assert(mqueue->imq_wait_queue.waitq_set_id == 0); - + assert(mqueue->imq_preposts == 0); + assert(mqueue->imq_in_pset == 0); - /* - * Destroy the messages we enqueued if we aren't nested - * inside some other attempt to drain the same queue. 
- */ - if (reap) - ipc_kmsg_reap_delayed(); + return reap; } /* @@ -1303,12 +1553,10 @@ ipc_mqueue_set_qlimit( ipc_mqueue_t mqueue, mach_port_msgcount_t qlimit) { - spl_t s; assert(qlimit <= MACH_PORT_QLIMIT_MAX); /* wake up senders allowed by the new qlimit */ - s = splsched(); imq_lock(mqueue); if (qlimit > mqueue->imq_qlimit) { mach_port_msgcount_t i, wakeup; @@ -1338,7 +1586,6 @@ ipc_mqueue_set_qlimit( } mqueue->imq_qlimit = qlimit; imq_unlock(mqueue); - splx(s); } /* @@ -1353,13 +1600,9 @@ ipc_mqueue_set_seqno( ipc_mqueue_t mqueue, mach_port_seqno_t seqno) { - spl_t s; - - s = splsched(); imq_lock(mqueue); mqueue->imq_seqno = seqno; imq_unlock(mqueue); - splx(s); } @@ -1426,7 +1669,6 @@ ipc_mqueue_copyin( ips_lock(pset); assert(ips_active(pset)); - assert(pset->ips_local_name == name); is_read_unlock(space); mqueue = &pset->ips_messages; diff --git a/osfmk/ipc/ipc_mqueue.h b/osfmk/ipc/ipc_mqueue.h index 401a3cae3..b3fbbb2e6 100644 --- a/osfmk/ipc/ipc_mqueue.h +++ b/osfmk/ipc/ipc_mqueue.h @@ -91,12 +91,15 @@ typedef struct ipc_mqueue { mach_port_name_t receiver_name; uint16_t msgcount; uint16_t qlimit; +#if MACH_FLIPC + struct flipc_port *fport; // Null for local port, or ptr to flipc port +#endif } __attribute__((__packed__)) port; struct { struct waitq_set setq; - mach_port_name_t local_name; } __attribute__((__packed__)) pset; } data; + struct klist imq_klist; } *ipc_mqueue_t; #define IMQ_NULL ((ipc_mqueue_t) 0) @@ -107,6 +110,9 @@ typedef struct ipc_mqueue { #define imq_qlimit data.port.qlimit #define imq_seqno data.port.seqno #define imq_receiver_name data.port.receiver_name +#if MACH_FLIPC +#define imq_fport data.port.fport +#endif /* * we can use the 'eventmask' bits of the waitq b/c @@ -114,21 +120,24 @@ typedef struct ipc_mqueue { */ #define imq_fullwaiters data.port.waitq.waitq_eventmask #define imq_in_pset data.port.waitq.waitq_set_id +#define imq_preposts data.port.waitq.waitq_prepost_id #define imq_set_queue data.pset.setq -#define imq_local_name 
data.pset.local_name #define imq_is_set(mq) waitqs_is_set(&(mq)->imq_set_queue) +#define imq_is_queue(mq) waitq_is_queue(&(mq)->imq_wait_queue) +#define imq_is_valid(mq) waitq_is_valid(&(mq)->imq_wait_queue) #define imq_lock(mq) waitq_lock(&(mq)->imq_wait_queue) #define imq_lock_try(mq) waitq_lock_try(&(mq)->imq_wait_queue) #define imq_unlock(mq) waitq_unlock(&(mq)->imq_wait_queue) #define imq_held(mq) waitq_held(&(mq)->imq_wait_queue) +#define imq_valid(mq) waitq_valid(&(mq)->imq_wait_queue) extern void imq_reserve_and_lock(ipc_mqueue_t mq, - uint64_t *reserved_prepost, spl_t *spl); + uint64_t *reserved_prepost); extern void imq_release_and_unlock(ipc_mqueue_t mq, - uint64_t reserved_prepost, spl_t spl); + uint64_t reserved_prepost); #define imq_full(mq) ((mq)->imq_msgcount >= (mq)->imq_qlimit) #define imq_full_kernel(mq) ((mq)->imq_msgcount >= MACH_PORT_QLIMIT_KERNEL) @@ -154,7 +163,7 @@ extern void ipc_mqueue_deinit( ipc_mqueue_t mqueue); /* destroy an mqueue */ -extern void ipc_mqueue_destroy( +extern boolean_t ipc_mqueue_destroy_locked( ipc_mqueue_t mqueue); /* Wake up receivers waiting in a message queue */ @@ -191,8 +200,7 @@ extern mach_msg_return_t ipc_mqueue_send( ipc_mqueue_t mqueue, ipc_kmsg_t kmsg, mach_msg_option_t option, - mach_msg_timeout_t timeout_val, - spl_t s); + mach_msg_timeout_t timeout_val); /* check for queue send queue full of a port */ extern mach_msg_return_t ipc_mqueue_preflight_send( @@ -201,10 +209,16 @@ extern mach_msg_return_t ipc_mqueue_preflight_send( mach_msg_option_t option, mach_msg_timeout_t timeout_val); +/* Set a [send-possible] override on the mqueue */ +extern void ipc_mqueue_override_send( + ipc_mqueue_t mqueue, + mach_msg_priority_t override); + /* Deliver message to message queue or waiting receiver */ extern void ipc_mqueue_post( ipc_mqueue_t mqueue, - ipc_kmsg_t kmsg); + ipc_kmsg_t kmsg, + mach_msg_option_t option); /* Receive a message from a message queue */ extern void ipc_mqueue_receive( @@ -242,12 +256,26 @@ 
extern unsigned ipc_mqueue_peek( mach_port_seqno_t *msg_seqnop, mach_msg_size_t *msg_sizep, mach_msg_id_t *msg_idp, - mach_msg_max_trailer_t *msg_trailerp); + mach_msg_max_trailer_t *msg_trailerp, + ipc_kmsg_t *kmsgp); + +/* Peek into a locked messaqe queue to see if there are messages */ +extern unsigned ipc_mqueue_peek_locked( + ipc_mqueue_t mqueue, + mach_port_seqno_t *msg_seqnop, + mach_msg_size_t *msg_sizep, + mach_msg_id_t *msg_idp, + mach_msg_max_trailer_t *msg_trailerp, + ipc_kmsg_t *kmsgp); /* Peek into a messaqe queue set to see if there are queues with messages */ extern unsigned ipc_mqueue_set_peek( ipc_mqueue_t mqueue); +/* Release an mqueue/port reference that was granted by MACH_PEEK_MSG */ +extern void ipc_mqueue_release_peek_ref( + ipc_mqueue_t mqueue); + /* Gather the names of member port for a given set */ extern void ipc_mqueue_set_gather_member_names( ipc_space_t space, diff --git a/osfmk/ipc/ipc_object.c b/osfmk/ipc/ipc_object.c index 166c9d6da..f972fcd91 100644 --- a/osfmk/ipc/ipc_object.c +++ b/osfmk/ipc/ipc_object.c @@ -612,7 +612,8 @@ ipc_object_copyin_from_kernel( ip_lock(port); if (ip_active(port)) { assert(port->ip_receiver_name != MACH_PORT_NULL); - assert(port->ip_receiver == ipc_space_kernel); + assert((port->ip_receiver == ipc_space_kernel) || + (port->ip_receiver->is_node_id != HOST_LOCAL_NODE)); port->ip_mscount++; } diff --git a/osfmk/ipc/ipc_port.c b/osfmk/ipc/ipc_port.c index 9a580b9e6..25010f1fc 100644 --- a/osfmk/ipc/ipc_port.c +++ b/osfmk/ipc/ipc_port.c @@ -78,6 +78,7 @@ #include #include #include +#include #include #include #include @@ -347,20 +348,13 @@ ipc_port_request_grow( * (or armed with importance in that version). 
*/ -#if IMPORTANCE_INHERITANCE boolean_t ipc_port_request_sparm( ipc_port_t port, __assert_only mach_port_name_t name, ipc_port_request_index_t index, - mach_msg_option_t option) -#else -boolean_t -ipc_port_request_sparm( - ipc_port_t port, - __assert_only mach_port_name_t name, - ipc_port_request_index_t index) -#endif /* IMPORTANCE_INHERITANCE */ + mach_msg_option_t option, + mach_msg_priority_t override) { if (index != IE_REQ_NONE) { ipc_port_request_t ipr, table; @@ -373,9 +367,16 @@ ipc_port_request_sparm( ipr = &table[index]; assert(ipr->ipr_name == name); + /* Is there a valid destination? */ if (IPR_SOR_SPREQ(ipr->ipr_soright)) { ipr->ipr_soright = IPR_SOR_MAKE(ipr->ipr_soright, IPR_SOR_SPARM_MASK); port->ip_sprequests = 1; + + if (option & MACH_SEND_OVERRIDE) { + /* apply override to message queue */ + ipc_mqueue_override_send(&port->ip_messages, override); + } + #if IMPORTANCE_INHERITANCE if (((option & MACH_SEND_NOIMPORTANCE) == 0) && (port->ip_impdonation != 0) && @@ -538,21 +539,31 @@ ipc_port_nsrequest( /* * Routine: ipc_port_clear_receiver * Purpose: - * Prepares a receive right for transmission/destruction. + * Prepares a receive right for transmission/destruction, + * optionally performs mqueue destruction (with port lock held) + * * Conditions: * The port is locked and active. + * Returns: + * If should_destroy is TRUE, then the return value indicates + * whether the caller needs to reap kmsg structures that should + * be destroyed (by calling ipc_kmsg_reap_delayed) + * + * If should_destroy is FALSE, this always returns FALSE */ -void +boolean_t ipc_port_clear_receiver( - ipc_port_t port) + ipc_port_t port, + boolean_t should_destroy) { - spl_t s; - - assert(ip_active(port)); + ipc_mqueue_t mqueue = &port->ip_messages; + boolean_t reap_messages = FALSE; /* - * pull ourselves from any sets. + * Pull ourselves out of any sets to which we belong. 
+ * We hold the port locked, so even though this acquires and releases + * the mqueue lock, we know we won't be added to any other sets. */ if (port->ip_in_pset != 0) { ipc_pset_remove_from_all(port); @@ -563,14 +574,26 @@ ipc_port_clear_receiver( * Send anyone waiting on the port's queue directly away. * Also clear the mscount and seqno. */ - s = splsched(); - imq_lock(&port->ip_messages); - ipc_mqueue_changed(&port->ip_messages); - ipc_port_set_mscount(port, 0); - port->ip_messages.imq_seqno = 0; + imq_lock(mqueue); + ipc_mqueue_changed(mqueue); + port->ip_mscount = 0; + mqueue->imq_seqno = 0; port->ip_context = port->ip_guarded = port->ip_strict_guard = 0; + + if (should_destroy) { + /* + * Mark the mqueue invalid, preventing further send/receive + * operations from succeeding. It's important for this to be + * done under the same lock hold as the ipc_mqueue_changed + * call to avoid additional threads blocking on an mqueue + * that's being destroyed. + */ + reap_messages = ipc_mqueue_destroy_locked(mqueue); + } + imq_unlock(&port->ip_messages); - splx(s); + + return reap_messages; } /* @@ -728,9 +751,6 @@ ipc_port_spnotify( { ipc_port_request_index_t index = 0; ipc_table_elems_t size = 0; -#if IMPORTANCE_INHERITANCE - boolean_t dropassert = FALSE; -#endif /* IMPORTANCE_INHERITANCE */ /* * If the port has no send-possible request @@ -744,15 +764,15 @@ ipc_port_spnotify( #if IMPORTANCE_INHERITANCE if (port->ip_spimportant != 0) { port->ip_spimportant = 0; - if (ipc_port_impcount_delta(port, -1, IP_NULL) == -1) { - dropassert = TRUE; + if (ipc_port_importance_delta(port, IPID_OPTION_NORMAL, -1) == TRUE) { + ip_lock(port); } } #endif /* IMPORTANCE_INHERITANCE */ if (port->ip_sprequests == 0) { ip_unlock(port); - goto out; + return; } port->ip_sprequests = 0; @@ -791,13 +811,6 @@ ipc_port_spnotify( } } ip_unlock(port); -out: -#if IMPORTANCE_INHERITANCE - if (dropassert == TRUE && ipc_importance_task_is_any_receiver_type(current_task()->task_imp_base)) { - /* drop 
internal assertion */ - ipc_importance_task_drop_internal_assertion(current_task()->task_imp_base, 1); - } -#endif /* IMPORTANCE_INHERITANCE */ return; } @@ -850,8 +863,7 @@ ipc_port_dnnotify( */ void -ipc_port_destroy( - ipc_port_t port) +ipc_port_destroy(ipc_port_t port) { ipc_port_t pdrequest, nsrequest; ipc_mqueue_t mqueue; @@ -867,8 +879,6 @@ ipc_port_destroy( assert(ip_active(port)); /* port->ip_receiver_name is garbage */ /* port->ip_receiver/port->ip_destination is garbage */ - assert(port->ip_in_pset == 0); - assert(port->ip_mscount == 0); /* check for a backup port */ pdrequest = port->ip_pdrequest; @@ -895,6 +905,11 @@ ipc_port_destroy( #endif /* IMPORTANCE_INHERITANCE */ if (pdrequest != IP_NULL) { + /* clear receiver, don't destroy the port */ + (void)ipc_port_clear_receiver(port, FALSE); + assert(port->ip_in_pset == 0); + assert(port->ip_mscount == 0); + /* we assume the ref for pdrequest */ port->ip_pdrequest = IP_NULL; @@ -909,17 +924,32 @@ ipc_port_destroy( goto drop_assertions; } - /* once port is dead, we don't need to keep it locked */ - port->ip_object.io_bits &= ~IO_BITS_ACTIVE; port->ip_timestamp = ipc_port_timestamp(); nsrequest = port->ip_nsrequest; + /* + * The mach_msg_* paths don't hold a port lock, they only hold a + * reference to the port object. If a thread raced us and is now + * blocked waiting for message reception on this mqueue (or waiting + * for ipc_mqueue_full), it will never be woken up. We call + * ipc_port_clear_receiver() here, _after_ the port has been marked + * inactive, to wakeup any threads which may be blocked and ensure + * that no other thread can get lost waiting for a wake up on a + * port/mqueue that's been destroyed. + */ + boolean_t reap_msgs = FALSE; + reap_msgs = ipc_port_clear_receiver(port, TRUE); /* marks mqueue inactive */ + assert(port->ip_in_pset == 0); + assert(port->ip_mscount == 0); + /* * If the port has a preallocated message buffer and that buffer * is not inuse, free it. 
If it has an inuse one, then the kmsg * free will detect that we freed the association and it can free it * like a normal buffer. + * + * Once the port is marked inactive we don't need to keep it locked. */ if (IP_PREALLOC(port)) { ipc_port_t inuse_port; @@ -942,9 +972,14 @@ ipc_port_destroy( if (nsrequest != IP_NULL) ipc_notify_send_once(nsrequest); /* consumes ref */ - /* destroy any queued messages */ + /* + * Reap any kmsg objects waiting to be destroyed. + * This must be done after we've released the port lock. + */ + if (reap_msgs) + ipc_kmsg_reap_delayed(); + mqueue = &port->ip_messages; - ipc_mqueue_destroy(mqueue); /* cleanup waitq related resources */ ipc_mqueue_deinit(mqueue); diff --git a/osfmk/ipc/ipc_port.h b/osfmk/ipc/ipc_port.h index 92bb0e70a..cde159cd0 100644 --- a/osfmk/ipc/ipc_port.h +++ b/osfmk/ipc/ipc_port.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -72,7 +72,7 @@ #ifndef _IPC_IPC_PORT_H_ #define _IPC_IPC_PORT_H_ -#if MACH_KERNEL_PRIVATE +#ifdef MACH_KERNEL_PRIVATE #include #include @@ -120,15 +120,6 @@ struct ipc_port { struct ipc_object ip_object; struct ipc_mqueue ip_messages; - natural_t ip_sprequests:1, /* send-possible requests outstanding */ - ip_spimportant:1, /* ... 
at least one is importance donating */ - ip_impdonation:1, /* port supports importance donation */ - ip_tempowner:1, /* dont give donations to current receiver */ - ip_guarded:1, /* port guarded (use context value as guard) */ - ip_strict_guard:1, /* Strict guarding; Prevents user manipulation of context values directly */ - ip_reserved:2, - ip_impcount:24; /* number of importance donations in nested queue */ - union { struct ipc_space *receiver; struct ipc_port *destination; @@ -148,6 +139,15 @@ struct ipc_port { mach_vm_address_t ip_context; + natural_t ip_sprequests:1, /* send-possible requests outstanding */ + ip_spimportant:1, /* ... at least one is importance donating */ + ip_impdonation:1, /* port supports importance donation */ + ip_tempowner:1, /* dont give donations to current receiver */ + ip_guarded:1, /* port guarded (use context value as guard) */ + ip_strict_guard:1, /* Strict guarding; Prevents user manipulation of context values directly */ + ip_reserved:2, + ip_impcount:24; /* number of importance donations in nested queue */ + mach_port_mscount_t ip_mscount; mach_port_rights_t ip_srights; mach_port_rights_t ip_sorights; @@ -191,6 +191,15 @@ struct ipc_port { #define ip_reference(port) io_reference(&(port)->ip_object) #define ip_release(port) io_release(&(port)->ip_object) +/* get an ipc_port pointer from an ipc_mqueue pointer */ +#define ip_from_mq(mq) ((struct ipc_port *)((void *)( \ + (char *)(mq) - \ + __offsetof(struct ipc_port, ip_messages)) \ + )) + +#define ip_reference_mq(mq) ip_reference(ip_from_mq(mq)) +#define ip_release_mq(mq) ip_release(ip_from_mq(mq)) + #define ip_kotype(port) io_kotype(&(port)->ip_object) #define ip_full_kernel(port) imq_full_kernel(&(port)->ip_messages) @@ -340,18 +349,12 @@ extern ipc_port_t ipc_port_request_cancel( ipc_port_request_index_t index); /* Arm any delayed send-possible notification */ -#if IMPORTANCE_INHERITANCE extern boolean_t ipc_port_request_sparm( - ipc_port_t port, - mach_port_name_t name, - 
ipc_port_request_index_t index, - mach_msg_option_t option); -#else -extern boolean_t ipc_port_request_sparm( - ipc_port_t port, - mach_port_name_t name, - ipc_port_request_index_t index); -#endif /* IMPORTANCE_INHERITANCE */ + ipc_port_t port, + mach_port_name_t name, + ipc_port_request_index_t index, + mach_msg_option_t option, + mach_msg_priority_t override); /* Macros for manipulating a port's dead name notificaiton requests */ #define ipc_port_request_rename(port, index, oname, nname) \ @@ -391,8 +394,9 @@ MACRO_BEGIN \ MACRO_END /* Prepare a receive right for transmission/destruction */ -extern void ipc_port_clear_receiver( - ipc_port_t port); +extern boolean_t ipc_port_clear_receiver( + ipc_port_t port, + boolean_t should_destroy); /* Initialize a newly-allocated port */ extern void ipc_port_init( @@ -499,7 +503,7 @@ extern void ipc_port_release( #endif /* KERNEL_PRIVATE */ -#if MACH_KERNEL_PRIVATE +#ifdef MACH_KERNEL_PRIVATE /* Make a naked send-once right from a locked and active receive right */ extern ipc_port_t ipc_port_make_sonce_locked( diff --git a/osfmk/ipc/ipc_pset.c b/osfmk/ipc/ipc_pset.c index de389c969..f6772f2c7 100644 --- a/osfmk/ipc/ipc_pset.c +++ b/osfmk/ipc/ipc_pset.c @@ -114,7 +114,6 @@ ipc_pset_alloc( } /* pset and space are locked */ - pset->ips_local_name = name; ipc_mqueue_init(&pset->ips_messages, TRUE /* set */, &reserved_link); is_write_unlock(space); @@ -161,7 +160,6 @@ ipc_pset_alloc_name( } /* pset is locked */ - pset->ips_local_name = name; ipc_mqueue_init(&pset->ips_messages, TRUE /* set */, &reserved_link); waitq_link_release(reserved_link); @@ -170,6 +168,47 @@ ipc_pset_alloc_name( return KERN_SUCCESS; } + +/* + * Routine: ipc_pset_alloc_special + * Purpose: + * Allocate a port set in a special space. + * The new port set is returned with one ref. + * If unsuccessful, IPS_NULL is returned. + * Conditions: + * Nothing locked. 
+ */ +ipc_pset_t +ipc_pset_alloc_special( + __assert_only ipc_space_t space) +{ + ipc_pset_t pset; + uint64_t reserved_link; + + assert(space != IS_NULL); + assert(space->is_table == IE_NULL); + assert(!is_active(space)); + + reserved_link = waitq_link_reserve(NULL); + + __IGNORE_WCASTALIGN(pset = (ipc_pset_t)io_alloc(IOT_PORT_SET)); + if (pset == IPS_NULL) + return IPS_NULL; + + bzero((char *)pset, sizeof(*pset)); + + io_lock_init(&pset->ips_object); + pset->ips_references = 1; + pset->ips_object.io_bits = io_makebits(TRUE, IOT_PORT_SET, 0); + + ipc_mqueue_init(&pset->ips_messages, TRUE /* set */, &reserved_link); + + waitq_link_release(reserved_link); + + return pset; +} + + /* * Routine: ipc_pset_member * Purpose: @@ -257,8 +296,6 @@ kern_return_t ipc_pset_remove_from_all( ipc_port_t port) { - assert(ip_active(port)); - if (port->ip_in_pset == 0) return KERN_NOT_IN_SET; @@ -314,18 +351,21 @@ ipc_pset_destroy( /* Kqueue EVFILT_MACHPORT support */ +#include #include static int filt_machportattach(struct knote *kn); static void filt_machportdetach(struct knote *kn); static int filt_machport(struct knote *kn, long hint); -static void filt_machporttouch(struct knote *kn, struct kevent_internal_s *kev, long type); +static int filt_machporttouch(struct knote *kn, struct kevent_internal_s *kev); +static int filt_machportprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev); static unsigned filt_machportpeek(struct knote *kn); struct filterops machport_filtops = { .f_attach = filt_machportattach, .f_detach = filt_machportdetach, .f_event = filt_machport, .f_touch = filt_machporttouch, + .f_process = filt_machportprocess, .f_peek = filt_machportpeek, }; @@ -333,94 +373,252 @@ static int filt_machportattach( struct knote *kn) { - mach_port_name_t name = (mach_port_name_t)kn->kn_kevent.ident; - uint64_t wq_link_id = waitq_link_reserve(NULL); - ipc_pset_t pset = IPS_NULL; - int result = ENOSYS; - kern_return_t kr; - - kr = 
ipc_object_translate(current_space(), name, - MACH_PORT_RIGHT_PORT_SET, - (ipc_object_t *)&pset); - if (kr != KERN_SUCCESS) { - waitq_link_release(wq_link_id); - return (kr == KERN_INVALID_NAME ? ENOENT : ENOTSUP); - } - /* We've got a lock on pset */ + mach_port_name_t name = (mach_port_name_t)kn->kn_kevent.ident; + uint64_t wq_link_id = waitq_link_reserve(NULL); + ipc_space_t space = current_space(); + ipc_kmsg_t first; + + spl_t s; + int error; + int result = 0; + kern_return_t kr; + ipc_entry_t entry; + ipc_mqueue_t mqueue; + + kr = ipc_right_lookup_read(space, name, &entry); + if (kr == KERN_SUCCESS) { + /* space is read-locked and active */ + + if (entry->ie_bits & MACH_PORT_TYPE_PORT_SET) { + ipc_pset_t pset; + + __IGNORE_WCASTALIGN(pset = (ipc_pset_t)entry->ie_object); + mqueue = &pset->ips_messages; + + s = splsched(); + imq_lock(mqueue); + + /* + * Bind the portset wait queue directly to knote/kqueue. + * This allows us to just use wait_queue foo to effect a wakeup, + * rather than having to call knote() from the Mach code on each + * message. We still attach the knote to the mqueue klist for + * NOTE_REVOKE purposes only. + */ + error = knote_link_waitq(kn, &mqueue->imq_wait_queue, &wq_link_id); + if (!error) { + ips_reference(pset); + kn->kn_ptr.p_mqueue = mqueue; + KNOTE_ATTACH(&mqueue->imq_klist, kn); + } + imq_unlock(mqueue); + splx(s); + + is_read_unlock(space); + + /* + * linked knotes are marked stay-active and therefore don't + * need an indication of their fired state to be returned + * from the attach operation. + */ + + } else if (entry->ie_bits & MACH_PORT_TYPE_RECEIVE) { + ipc_port_t port; + + __IGNORE_WCASTALIGN(port = (ipc_port_t)entry->ie_object); + mqueue = &port->ip_messages; + ip_reference(port); + + /* + * attach knote to port and determine result + * If the filter requested direct message receipt, + * we may need to adjust the qos of the knote to + * reflect the requested and override qos of the + * first message in the queue. 
+ */ + s = splsched(); + imq_lock(mqueue); + kn->kn_ptr.p_mqueue = mqueue; + KNOTE_ATTACH(&mqueue->imq_klist, kn); + if ((first = ipc_kmsg_queue_first(&mqueue->imq_messages)) != IKM_NULL) { + if (kn->kn_sfflags & MACH_RCV_MSG) + knote_adjust_qos(kn, first->ikm_qos, first->ikm_qos_override); + result = 1; + } + imq_unlock(mqueue); + splx(s); + + is_read_unlock(space); + error = 0; + } else { + is_read_unlock(space); + error = ENOTSUP; + } + } else { + error = ENOENT; + } - /* - * Bind the portset wait queue directly to knote/kqueue. - * This allows us to just use wait_queue foo to effect a wakeup, - * rather than having to call knote() from the Mach code on each - * message. - */ - result = knote_link_waitq(kn, &pset->ips_messages.imq_wait_queue, &wq_link_id); - if (result == 0) { - waitq_link_release(wq_link_id); - /* keep a reference for the knote */ - kn->kn_ptr.p_pset = pset; - ips_reference(pset); - ips_unlock(pset); + waitq_link_release(wq_link_id); + + /* bail out on errors */ + if (error) { + kn->kn_flags |= EV_ERROR; + kn->kn_data = error; return 0; } - ips_unlock(pset); - waitq_link_release(wq_link_id); return result; } +/* NOT proud of these - we should have a stricter relationship between mqueue and ipc object */ +#define mqueue_to_pset(mq) ((ipc_pset_t)((uintptr_t)mq-offsetof(struct ipc_pset, ips_messages))) +#define mqueue_to_port(mq) ((ipc_port_t)((uintptr_t)mq-offsetof(struct ipc_port, ip_messages))) +#define mqueue_to_object(mq) (((ipc_object_t)(mq)) - 1) + + static void filt_machportdetach( - struct knote *kn) + struct knote *kn) { - ipc_pset_t pset = kn->kn_ptr.p_pset; + ipc_mqueue_t mqueue = kn->kn_ptr.p_mqueue; + ipc_object_t object = mqueue_to_object(mqueue); + spl_t s; - /* - * Unlink the portset wait queue from knote/kqueue, - * and release our reference on the portset. 
- */ - ips_lock(pset); - (void)knote_unlink_waitq(kn, &pset->ips_messages.imq_wait_queue); - kn->kn_ptr.p_pset = IPS_NULL; - ips_unlock(pset); - ips_release(pset); + s = splsched(); + imq_lock(mqueue); + KNOTE_DETACH(&mqueue->imq_klist, kn); + kn->kn_ptr.p_mqueue = IMQ_NULL; + imq_unlock(mqueue); + splx(s); + + if (io_otype(object) == IOT_PORT_SET) { + /* + * Unlink the portset wait queue from knote/kqueue. + * JMM - Does this need to be atomic under the mq lock? + */ + (void)knote_unlink_waitq(kn, &mqueue->imq_wait_queue); + } + io_release(object); } +/* + * filt_machport - deliver events into the mach port filter + * + * Mach port message arrival events are currently only posted via the + * kqueue filter routine for ports. Port sets are marked stay-active + * and the wait queue code will break any kqueue waiters out to go + * poll the stay-queued knotes again. + * + * If there is a message at the head of the queue, + * we indicate that the knote should go active. If + * the message is to be direct-received, we adjust the + * QoS of the knote according to the requested and override + * QoS of that first message. + * + * NOTE_REVOKE events are a legacy way to indicate that the port/portset + * was deallocated or left the current Mach portspace (modern technique + * is with an EV_VANISHED protocol). If we see NOTE_REVOKE, deliver an + * EV_EOF event for these changes (hopefully it will get delivered before + * the port name recycles to the same generation count and someone tries + * to re-register a kevent for it or the events are udata-specific - + * avoiding a conflict). 
+ */ static int filt_machport( - struct knote *kn, - __unused long hint) + struct knote *kn, + long hint) +{ + ipc_mqueue_t mqueue = kn->kn_ptr.p_mqueue; + ipc_kmsg_t first; + int result = 0; + + /* mqueue locked by caller */ + assert(imq_held(mqueue)); + + if (hint == NOTE_REVOKE) { + kn->kn_flags |= EV_EOF | EV_ONESHOT; + result = 1; + } else if (imq_is_valid(mqueue)) { + assert(!imq_is_set(mqueue)); + if ((first = ipc_kmsg_queue_first(&mqueue->imq_messages)) != IKM_NULL) { + if (kn->kn_sfflags & MACH_RCV_MSG) + knote_adjust_qos(kn, first->ikm_qos, first->ikm_qos_override); + result = 1; + } + } + + return result; +} + +static int +filt_machporttouch( + struct knote *kn, + struct kevent_internal_s *kev) { - mach_port_name_t name = (mach_port_name_t)kn->kn_kevent.ident; - ipc_pset_t pset = IPS_NULL; - wait_result_t wresult; - thread_t self = current_thread(); - kern_return_t kr; - mach_msg_option_t option; - mach_msg_size_t size; + ipc_mqueue_t mqueue = kn->kn_ptr.p_mqueue; + ipc_kmsg_t first; + int result = 0; + spl_t s; + + s = splsched(); + imq_lock(mqueue); - /* never called from below */ - assert(hint == 0); + /* copy in new settings and save off new input fflags */ + kn->kn_sfflags = kev->fflags; + kn->kn_ext[0] = kev->ext[0]; + kn->kn_ext[1] = kev->ext[1]; + if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0) + kn->kn_udata = kev->udata; /* - * called from user context. Have to validate the - * name. If it changed, we have an EOF situation. + * If the mqueue is a valid port and there is a message + * that will be direct-received from the knote, update + * the knote qos based on the first message and trigger + * the event. If there are no more messages, reset the + * QoS to the value provided by the kevent. 
*/ - kr = ipc_object_translate(current_space(), name, - MACH_PORT_RIGHT_PORT_SET, - (ipc_object_t *)&pset); - if (kr != KERN_SUCCESS || pset != kn->kn_ptr.p_pset || !ips_active(pset)) { - kn->kn_data = 0; - kn->kn_flags |= (EV_EOF | EV_ONESHOT); - if (pset != IPS_NULL) { - ips_unlock(pset); - } - return(1); - } + if (imq_is_valid(mqueue) && !imq_is_set(mqueue) && + (first = ipc_kmsg_queue_first(&mqueue->imq_messages)) != IKM_NULL) { + if (kn->kn_sfflags & MACH_RCV_MSG) + knote_adjust_qos(kn, first->ikm_qos, first->ikm_qos_override); + result = 1; + } else if (kn->kn_sfflags & MACH_RCV_MSG) { + knote_adjust_qos(kn, + MACH_MSG_PRIORITY_UNSPECIFIED, + MACH_MSG_PRIORITY_UNSPECIFIED); + } + imq_unlock(mqueue); + splx(s); - /* just use the reference from here on out */ - ips_reference(pset); - ips_unlock(pset); + return result; +} + +static int +filt_machportprocess( + struct knote *kn, + struct filt_process_s *process_data, + struct kevent_internal_s *kev) +{ + ipc_mqueue_t mqueue = kn->kn_ptr.p_mqueue; + ipc_object_t object = mqueue_to_object(mqueue); + thread_t self = current_thread(); + boolean_t used_filtprocess_data = FALSE; + + wait_result_t wresult; + mach_msg_option_t option; + mach_vm_address_t addr; + mach_msg_size_t size; + + imq_lock(mqueue); + + /* Capture current state */ + *kev = kn->kn_kevent; + + /* If already deallocated/moved return one last EOF event */ + if (kev->flags & EV_EOF) { + imq_unlock(mqueue); + return 1; + } /* * Only honor supported receive options. If no options are @@ -428,16 +626,35 @@ filt_machport( * name of the port and sizeof the waiting message. 
*/ option = kn->kn_sfflags & (MACH_RCV_MSG|MACH_RCV_LARGE|MACH_RCV_LARGE_IDENTITY| - MACH_RCV_TRAILER_MASK|MACH_RCV_VOUCHER); + MACH_RCV_TRAILER_MASK|MACH_RCV_VOUCHER); + if (option & MACH_RCV_MSG) { - self->ith_msg_addr = (mach_vm_address_t) kn->kn_ext[0]; - size = (mach_msg_size_t)kn->kn_ext[1]; + addr = (mach_vm_address_t) kn->kn_ext[0]; + size = (mach_msg_size_t) kn->kn_ext[1]; + + /* + * If the kevent didn't specify a buffer and length, carve a buffer + * from the filter processing data according to the flags. + */ + if (size == 0 && process_data != NULL) { + used_filtprocess_data = TRUE; + + addr = (mach_vm_address_t)process_data->fp_data_out; + size = (mach_msg_size_t)process_data->fp_data_resid; + option |= (MACH_RCV_LARGE | MACH_RCV_LARGE_IDENTITY); + if (process_data->fp_flags & KEVENT_FLAG_STACK_DATA) + option |= MACH_RCV_STACK; + } } else { + /* just detect the port name (if a set) and size of the first message */ option = MACH_RCV_LARGE; - self->ith_msg_addr = 0; + addr = 0; size = 0; } + /* just use the reference from here on out */ + io_reference(object); + /* * Set up to receive a message or the notification of a * too large message. But never allow this call to wait. @@ -445,8 +662,10 @@ filt_machport( * options, pass those through here. But we don't support * scatter lists through this interface. 
*/ - self->ith_object = (ipc_object_t)pset; - self->ith_msize = size; + self->ith_object = object; + self->ith_msg_addr = addr; + self->ith_rsize = size; + self->ith_msize = 0; self->ith_option = option; self->ith_receiver_name = MACH_PORT_NULL; self->ith_continuation = NULL; @@ -454,24 +673,26 @@ filt_machport( self->ith_state = MACH_RCV_IN_PROGRESS; wresult = ipc_mqueue_receive_on_thread( - &pset->ips_messages, + mqueue, option, size, /* max_size */ 0, /* immediate timeout */ THREAD_INTERRUPTIBLE, self); - assert(wresult == THREAD_NOT_WAITING); - assert(self->ith_state != MACH_RCV_IN_PROGRESS); + /* mqueue unlocked */ /* - * If we timed out, just release the reference on the - * portset and return zero. + * If we timed out, or the process is exiting, just release the + * reference on the ipc_object and return zero. */ - if (self->ith_state == MACH_RCV_TIMED_OUT) { - ips_release(pset); + if (wresult == THREAD_RESTART || self->ith_state == MACH_RCV_TIMED_OUT) { + io_release(object); return 0; } + assert(wresult == THREAD_NOT_WAITING); + assert(self->ith_state != MACH_RCV_IN_PROGRESS); + /* * If we weren't attempting to receive a message * directly, we need to return the port name in @@ -480,8 +701,8 @@ filt_machport( if ((option & MACH_RCV_MSG) != MACH_RCV_MSG) { assert(self->ith_state == MACH_RCV_TOO_LARGE); assert(self->ith_kmsg == IKM_NULL); - kn->kn_data = self->ith_receiver_name; - ips_release(pset); + kev->data = self->ith_receiver_name; + io_release(object); return 1; } @@ -489,56 +710,66 @@ filt_machport( * Attempt to receive the message directly, returning * the results in the fflags field. 
*/ - assert(option & MACH_RCV_MSG); - kn->kn_ext[1] = self->ith_msize; - kn->kn_data = MACH_PORT_NULL; - kn->kn_fflags = mach_msg_receive_results(); - /* kmsg and pset reference consumed */ + kev->fflags = mach_msg_receive_results(&size); + + /* kmsg and object reference consumed */ /* * if the user asked for the identity of ports containing a * a too-large message, return it in the data field (as we * do for messages we didn't try to receive). */ - if ((kn->kn_fflags == MACH_RCV_TOO_LARGE) && - (option & MACH_RCV_LARGE_IDENTITY)) - kn->kn_data = self->ith_receiver_name; - - return 1; -} + if (kev->fflags == MACH_RCV_TOO_LARGE) { + kev->ext[1] = self->ith_msize; + if (option & MACH_RCV_LARGE_IDENTITY) + kev->data = self->ith_receiver_name; + else + kev->data = MACH_PORT_NULL; + } else { + kev->ext[1] = size; + kev->data = MACH_PORT_NULL; + } -static void -filt_machporttouch(struct knote *kn, struct kevent_internal_s *kev, long type) -{ - switch (type) { - case EVENT_REGISTER: - kn->kn_sfflags = kev->fflags; - kn->kn_sdata = kev->data; - kn->kn_ext[0] = kev->ext[0]; - kn->kn_ext[1] = kev->ext[1]; - break; - case EVENT_PROCESS: - *kev = kn->kn_kevent; - if (kn->kn_flags & EV_CLEAR) { - kn->kn_data = 0; - kn->kn_fflags = 0; + /* + * If we used a data buffer carved out from the filt_process data, + * store the address used in the knote and adjust the residual and + * other parameters for future use. + */ + if (used_filtprocess_data) { + assert(process_data->fp_data_resid >= size); + process_data->fp_data_resid -= size; + if ((process_data->fp_flags & KEVENT_FLAG_STACK_DATA) == 0) { + kev->ext[0] = process_data->fp_data_out; + process_data->fp_data_out += size; + } else { + assert(option & MACH_RCV_STACK); + kev->ext[0] = process_data->fp_data_out + + process_data->fp_data_resid; } - break; - default: - panic("filt_machporttouch() - invalid type (%ld)", type); - break; - } + } + + /* + * Apply message-based QoS values to output kevent as prescribed. 
+ * The kev->qos field gets max(msg-qos, kn->kn_qos). + * The kev->ext[2] field gets (msg-qos << 32) | (override-qos). + * + * The mach_msg_receive_results() call saved off the message + * QoS values in the continuation save area on successful receive. + */ + if (kev->fflags == MACH_MSG_SUCCESS) { + kev->qos = mach_msg_priority_combine(self->ith_qos, kn->kn_qos); + kev->ext[2] = ((uint64_t)self->ith_qos << 32) | + (uint64_t)self->ith_qos_override; + } + + return 1; } /* - * Peek to see if the portset associated with the knote has any + * Peek to see if the message queue associated with the knote has any * events. This pre-hook is called when a filter uses the stay- - * on-queue mechanism (as the knote_link_waitq mechanism - * does). - * - * This is called with the kqueue that the knote belongs to still - * locked (thus holding a reference on the knote, but restricting - * also restricting our ability to take other locks). + * on-queue mechanism (as the knote_link_waitq mechanism does for + * portsets) and someone calls select() against the containing kqueue. * * Just peek at the pre-post status of the portset's wait queue * to determine if it has anything interesting. 
We can do it @@ -550,8 +781,7 @@ filt_machporttouch(struct knote *kn, struct kevent_internal_s *kev, long type) static unsigned filt_machportpeek(struct knote *kn) { - ipc_pset_t pset = kn->kn_ptr.p_pset; - ipc_mqueue_t set_mq = &pset->ips_messages; + ipc_mqueue_t mqueue = kn->kn_ptr.p_mqueue; - return (ipc_mqueue_set_peek(set_mq)); + return (ipc_mqueue_set_peek(mqueue)); } diff --git a/osfmk/ipc/ipc_pset.h b/osfmk/ipc/ipc_pset.h index b6f56fffe..863faedcf 100644 --- a/osfmk/ipc/ipc_pset.h +++ b/osfmk/ipc/ipc_pset.h @@ -84,8 +84,6 @@ struct ipc_pset { }; #define ips_references ips_object.io_references -#define ips_local_name ips_messages.imq_local_name - #define ips_active(pset) io_active(&(pset)->ips_object) #define ips_lock(pset) io_lock(&(pset)->ips_object) @@ -106,6 +104,10 @@ extern kern_return_t ipc_pset_alloc_name( mach_port_name_t name, ipc_pset_t *psetp); +/* Allocate a port set in a special space */ +extern ipc_pset_t ipc_pset_alloc_special( + ipc_space_t space); + /* Add a port to a port set */ extern kern_return_t ipc_pset_add( ipc_pset_t pset, diff --git a/osfmk/ipc/ipc_right.c b/osfmk/ipc/ipc_right.c index bf655396e..eb9c04544 100644 --- a/osfmk/ipc/ipc_right.c +++ b/osfmk/ipc/ipc_right.c @@ -416,15 +416,15 @@ ipc_right_request_alloc( break; } + kr = (entry->ie_bits & MACH_PORT_TYPE_PORT_OR_DEAD) ? 
+ KERN_INVALID_ARGUMENT : KERN_INVALID_RIGHT; + is_write_unlock(space); if (port != IP_NULL) ip_release(port); - if (entry->ie_bits & MACH_PORT_TYPE_PORT_OR_DEAD) - return KERN_INVALID_ARGUMENT; - else - return KERN_INVALID_RIGHT; + return kr; } *previousp = previous; @@ -662,8 +662,7 @@ ipc_right_terminate( assert(port->ip_receiver_name == name); assert(port->ip_receiver == space); - ipc_port_clear_receiver(port); - ipc_port_destroy(port); /* consumes our ref, unlocks */ + ipc_port_destroy(port); /* clears receiver, consumes our ref, unlocks */ } else if (type & MACH_PORT_TYPE_SEND_ONCE) { assert(port->ip_sorights > 0); @@ -807,8 +806,7 @@ ipc_right_destroy( assert(ip_active(port)); assert(port->ip_receiver == space); - ipc_port_clear_receiver(port); - ipc_port_destroy(port); /* consumes our ref, unlocks */ + ipc_port_destroy(port); /* clears receiver, consumes our ref, unlocks */ } else if (type & MACH_PORT_TYPE_SEND_ONCE) { assert(port->ip_sorights > 0); @@ -1188,8 +1186,7 @@ ipc_right_delta( } is_write_unlock(space); - ipc_port_clear_receiver(port); - ipc_port_destroy(port); /* consumes ref, unlocks */ + ipc_port_destroy(port); /* clears receiver, consumes ref, unlocks */ if (request != IP_NULL) ipc_notify_port_deleted(request, name); @@ -1552,8 +1549,7 @@ ipc_right_destruct( if (nsrequest != IP_NULL) ipc_notify_no_senders(nsrequest, mscount); - ipc_port_clear_receiver(port); - ipc_port_destroy(port); /* consumes ref, unlocks */ + ipc_port_destroy(port); /* clears receiver, consumes ref, unlocks */ if (request != IP_NULL) ipc_notify_port_deleted(request, name); @@ -1836,7 +1832,7 @@ ipc_right_copyin( entry->ie_bits = bits &~ MACH_PORT_TYPE_RECEIVE; ipc_entry_modified(space, name, entry); - ipc_port_clear_receiver(port); + (void)ipc_port_clear_receiver(port, FALSE); /* don't destroy the port/mqueue */ port->ip_receiver_name = MACH_PORT_NULL; port->ip_destination = IP_NULL; @@ -2663,9 +2659,7 @@ ipc_right_rename( ips_lock(pset); assert(ips_active(pset)); - 
assert(pset->ips_local_name == oname); - pset->ips_local_name = nname; ips_unlock(pset); break; } diff --git a/osfmk/ipc/ipc_space.c b/osfmk/ipc/ipc_space.c index 7d0305e32..4c38dbab7 100644 --- a/osfmk/ipc/ipc_space.c +++ b/osfmk/ipc/ipc_space.c @@ -170,6 +170,7 @@ ipc_space_create( space->is_task = NULL; space->is_low_mod = new_size; space->is_high_mod = 0; + space->is_node_id = HOST_LOCAL_NODE; /* HOST_LOCAL_NODE, except proxy spaces */ *spacep = space; return KERN_SUCCESS; @@ -208,6 +209,7 @@ ipc_space_create_special( space->is_table_next = 0; space->is_low_mod = 0; space->is_high_mod = 0; + space->is_node_id = HOST_LOCAL_NODE; /* HOST_LOCAL_NODE, except proxy spaces */ *spacep = space; return KERN_SUCCESS; diff --git a/osfmk/ipc/ipc_space.h b/osfmk/ipc/ipc_space.h index ff7588ed0..136d9181a 100644 --- a/osfmk/ipc/ipc_space.h +++ b/osfmk/ipc/ipc_space.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Computer, Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -81,7 +81,7 @@ #include #ifdef __APPLE_API_PRIVATE -#if MACH_KERNEL_PRIVATE +#ifdef MACH_KERNEL_PRIVATE #include #include #include @@ -120,6 +120,7 @@ struct ipc_space { struct ipc_table_size *is_table_next; /* info for larger table */ ipc_entry_num_t is_low_mod; /* lowest modified entry during growth */ ipc_entry_num_t is_high_mod; /* highest modified entry during growth */ + int is_node_id; /* HOST_LOCAL_NODE, or remote node if proxy space */ }; #define IS_NULL ((ipc_space_t) 0) diff --git a/osfmk/ipc/ipc_voucher.c b/osfmk/ipc/ipc_voucher.c index f914dec6f..ab0858c99 100644 --- a/osfmk/ipc/ipc_voucher.c +++ b/osfmk/ipc/ipc_voucher.c @@ -27,6 +27,7 @@ */ #include +#include #include #include #include @@ -42,6 +43,7 @@ #include #include #include +#include /* * Sysctl variable; enable and disable tracing of voucher contents @@ -96,6 +98,7 @@ static lck_spin_t ivgt_lock_data; ipc_voucher_t iv_alloc(iv_index_t entries); void iv_dealloc(ipc_voucher_t iv, boolean_t unhash); +extern int thread_qos_from_pthread_priority(unsigned long, unsigned long *); static inline iv_refs_t iv_reference(ipc_voucher_t iv) @@ -722,8 +725,9 @@ ipc_voucher_attr_control_notify(mach_msg_header_t *msg) ip_unlock(port); ivac_release(ivac); + } else { + ip_unlock(port); } - ip_unlock(port); } /* @@ -765,7 +769,7 @@ convert_voucher_attr_control_to_port(ipc_voucher_attr_control_t control) assert(IP_NULL == port->ip_nsrequest); ipc_port_nsrequest(port, port->ip_mscount, ipc_port_make_sonce_locked(port), &old_notify); assert(IP_NULL == old_notify); - ip_unlock(port); + /* ipc_port_nsrequest unlocks the port */ } else { /* piggyback on the existing port reference, so consume ours */ ip_unlock(port); @@ -1081,8 +1085,6 @@ static void ivace_release( * re-drive the release. 
*/ if (ivace->ivace_made != made) { - assert(made < ivace->ivace_made); - if (KERN_SUCCESS == kr) ivace->ivace_made -= made; @@ -1700,7 +1702,7 @@ iv_dedup(ipc_voucher_t new_iv) #define PAYLOAD_PER_TRACEPOINT (4 * sizeof(uintptr_t)) #define PAYLOAD_SIZE 1024 - _Static_assert(PAYLOAD_SIZE % PAYLOAD_PER_TRACEPOINT == 0, "size invariant violated"); + static_assert(PAYLOAD_SIZE % PAYLOAD_PER_TRACEPOINT == 0, "size invariant violated"); mach_voucher_attr_raw_recipe_array_size_t payload_size = PAYLOAD_SIZE; uintptr_t payload[PAYLOAD_SIZE / sizeof(uintptr_t)]; @@ -2608,6 +2610,49 @@ host_register_mach_voucher_attr_manager( return KERN_NOT_SUPPORTED; } +/* + * Routine: ipc_get_pthpriority_from_kmsg_voucher + * Purpose: + * Get the canonicalized pthread priority from the voucher attached in the kmsg. + */ +kern_return_t +ipc_get_pthpriority_from_kmsg_voucher( + ipc_kmsg_t kmsg, + ipc_pthread_priority_value_t *canonicalize_priority_value) +{ + ipc_voucher_t pthread_priority_voucher; + mach_voucher_attr_raw_recipe_size_t content_size = + sizeof(mach_voucher_attr_recipe_data_t) + sizeof(ipc_pthread_priority_value_t); + uint8_t content_data[content_size]; + mach_voucher_attr_recipe_t cur_content; + kern_return_t kr = KERN_SUCCESS; + + if (!IP_VALID(kmsg->ikm_voucher)) { + return KERN_FAILURE; + } + + pthread_priority_voucher = (ipc_voucher_t)kmsg->ikm_voucher->ip_kobject; + kr = mach_voucher_extract_attr_recipe(pthread_priority_voucher, + MACH_VOUCHER_ATTR_KEY_PTHPRIORITY, + content_data, + &content_size); + if (kr != KERN_SUCCESS) { + return kr; + } + + /* return KERN_INVALID_VALUE for default value */ + if (content_size < sizeof(mach_voucher_attr_recipe_t)) { + return KERN_INVALID_VALUE; + } + + cur_content = (mach_voucher_attr_recipe_t) (void *) &content_data[0]; + assert(cur_content->content_size == sizeof(ipc_pthread_priority_value_t)); + memcpy(canonicalize_priority_value, cur_content->content, sizeof(ipc_pthread_priority_value_t)); + + return KERN_SUCCESS; +} + + /* * 
Routine: ipc_voucher_send_preprocessing * Purpose: @@ -2786,6 +2831,47 @@ ipc_voucher_prepare_processing_recipe( return KERN_SUCCESS; } +/* + * Activity id Generation + */ +uint64_t voucher_activity_id; + +#define generate_activity_id(x) \ + ((uint64_t)OSAddAtomic64((x), (int64_t *)&voucher_activity_id)) + +/* + * Routine: mach_init_activity_id + * Purpose: + * Initialize voucher activity id. + */ +void +mach_init_activity_id(void) +{ + voucher_activity_id = 1; +} + +/* + * Routine: mach_generate_activity_id + * Purpose: + * Generate a system wide voucher activity id. + */ +kern_return_t +mach_generate_activity_id( + struct mach_generate_activity_id_args *args) +{ + uint64_t activity_id; + kern_return_t kr = KERN_SUCCESS; + + if (args->count <= 0 || args->count > MACH_ACTIVITY_ID_COUNT_MAX) { + return KERN_INVALID_ARGUMENT; + } + + activity_id = generate_activity_id(args->count); + kr = copyout(&activity_id, args->activity_id, sizeof (activity_id)); + + return (kr); +} + #if defined(MACH_VOUCHER_ATTR_KEY_USER_DATA) || defined(MACH_VOUCHER_ATTR_KEY_TEST) /* diff --git a/osfmk/ipc/ipc_voucher.h b/osfmk/ipc/ipc_voucher.h index bc8061e31..3f637856c 100644 --- a/osfmk/ipc/ipc_voucher.h +++ b/osfmk/ipc/ipc_voucher.h @@ -38,6 +38,7 @@ #include #include #include +#include /* locking */ extern lck_grp_t ipc_lck_grp; @@ -158,7 +159,8 @@ typedef ipc_voucher_attr_control_t iv_attr_control_t; extern ipc_voucher_attr_control_t ivac_alloc(iv_index_t); extern void ipc_voucher_receive_postprocessing(ipc_kmsg_t kmsg, mach_msg_option_t option); extern void ipc_voucher_send_preprocessing(ipc_kmsg_t kmsg); - +extern void mach_init_activity_id(void); +extern kern_return_t ipc_get_pthpriority_from_kmsg_voucher(ipc_kmsg_t kmsg, ipc_pthread_priority_value_t *qos); #define ivac_lock_init(ivac) \ lck_spin_init(&(ivac)->ivac_lock_data, &ipc_lck_grp, &ipc_lck_attr) #define ivac_lock_destroy(ivac) \ diff --git a/osfmk/ipc/mach_debug.c b/osfmk/ipc/mach_debug.c index 25c472e15..972073a71 100644 
--- a/osfmk/ipc/mach_debug.c +++ b/osfmk/ipc/mach_debug.c @@ -188,7 +188,7 @@ mach_port_space_info( if (space == IS_NULL) return KERN_INVALID_TASK; -#if !(DEVELOPMENT | DEBUG) +#if !(DEVELOPMENT || DEBUG) && CONFIG_MACF const boolean_t dbg_ok = (mac_task_check_expose_task(kernel_task) == 0); #else const boolean_t dbg_ok = TRUE; diff --git a/osfmk/ipc/mach_kernelrpc.c b/osfmk/ipc/mach_kernelrpc.c index 9971c84b5..af9223254 100644 --- a/osfmk/ipc/mach_kernelrpc.c +++ b/osfmk/ipc/mach_kernelrpc.c @@ -30,9 +30,12 @@ #include #include #include +#include +#include #include #include #include +#include #include int @@ -118,6 +121,33 @@ _kernelrpc_mach_vm_map_trap(struct _kernelrpc_mach_vm_map_trap_args *args) return (rv); } +int +_kernelrpc_mach_vm_purgable_control_trap( + struct _kernelrpc_mach_vm_purgable_control_trap_args *args) +{ + int state; + task_t task = port_name_to_task(args->target); + int rv = MACH_SEND_INVALID_DEST; + + if (task != current_task()) + goto done; + + if (copyin(args->state, (char *)&state, sizeof (state))) + goto done; + + rv = mach_vm_purgable_control(task->map, + args->address, + args->control, + &state); + if (rv == KERN_SUCCESS) + rv = copyout(&state, args->state, sizeof (state)); + +done: + if (task) + task_deallocate(task); + return (rv); +} + int _kernelrpc_mach_port_allocate_trap(struct _kernelrpc_mach_port_allocate_args *args) { @@ -345,3 +375,116 @@ _kernelrpc_mach_port_unguard_trap(struct _kernelrpc_mach_port_unguard_args *args return (rv); } +kern_return_t +host_create_mach_voucher_trap(struct host_create_mach_voucher_args *args) +{ + host_t host = port_name_to_host(args->host); + ipc_voucher_t new_voucher = IV_NULL; + ipc_port_t voucher_port = IPC_PORT_NULL; + mach_port_name_t voucher_name = 0; + kern_return_t kr = 0; + + if (host == HOST_NULL) + return MACH_SEND_INVALID_DEST; + + if (args->recipes_size < 0) + return KERN_INVALID_ARGUMENT; + else if (args->recipes_size > MACH_VOUCHER_ATTR_MAX_RAW_RECIPE_ARRAY_SIZE) + return 
MIG_ARRAY_TOO_LARGE; + + if (args->recipes_size < MACH_VOUCHER_TRAP_STACK_LIMIT) { + /* keep small recipes on the stack for speed */ + uint8_t krecipes[args->recipes_size]; + if (copyin(args->recipes, (void *)krecipes, args->recipes_size)) { + kr = KERN_MEMORY_ERROR; + goto done; + } + kr = host_create_mach_voucher(host, krecipes, args->recipes_size, &new_voucher); + } else { + uint8_t *krecipes = kalloc((vm_size_t)args->recipes_size); + if (!krecipes) { + kr = KERN_RESOURCE_SHORTAGE; + goto done; + } + + if (copyin(args->recipes, (void *)krecipes, args->recipes_size)) { + kfree(krecipes, (vm_size_t)args->recipes_size); + kr = KERN_MEMORY_ERROR; + goto done; + } + + kr = host_create_mach_voucher(host, krecipes, args->recipes_size, &new_voucher); + kfree(krecipes, (vm_size_t)args->recipes_size); + } + + if (kr == 0) { + voucher_port = convert_voucher_to_port(new_voucher); + voucher_name = ipc_port_copyout_send(voucher_port, current_space()); + + kr = copyout(&voucher_name, args->voucher, sizeof(voucher_name)); + } + +done: + return kr; +} + +kern_return_t +mach_voucher_extract_attr_recipe_trap(struct mach_voucher_extract_attr_recipe_args *args) +{ + ipc_voucher_t voucher = IV_NULL; + kern_return_t kr = KERN_SUCCESS; + mach_msg_type_number_t sz = 0; + + if (copyin(args->recipe_size, (void *)&sz, sizeof(sz))) + return KERN_MEMORY_ERROR; + + if (sz > MACH_VOUCHER_ATTR_MAX_RAW_RECIPE_ARRAY_SIZE) + return MIG_ARRAY_TOO_LARGE; + + voucher = convert_port_name_to_voucher(args->voucher_name); + if (voucher == IV_NULL) + return MACH_SEND_INVALID_DEST; + + mach_msg_type_number_t __assert_only max_sz = sz; + + if (sz < MACH_VOUCHER_TRAP_STACK_LIMIT) { + /* keep small recipes on the stack for speed */ + uint8_t krecipe[sz]; + if (copyin(args->recipe, (void *)krecipe, sz)) { + kr = KERN_MEMORY_ERROR; + goto done; + } + kr = mach_voucher_extract_attr_recipe(voucher, args->key, + (mach_voucher_attr_raw_recipe_t)krecipe, &sz); + assert(sz <= max_sz); + + if (kr == KERN_SUCCESS && sz 
> 0) + kr = copyout(krecipe, (void *)args->recipe, sz); + } else { + uint8_t *krecipe = kalloc((vm_size_t)sz); + if (!krecipe) { + kr = KERN_RESOURCE_SHORTAGE; + goto done; + } + + if (copyin(args->recipe, (void *)krecipe, args->recipe_size)) { + kfree(krecipe, (vm_size_t)sz); + kr = KERN_MEMORY_ERROR; + goto done; + } + + kr = mach_voucher_extract_attr_recipe(voucher, args->key, + (mach_voucher_attr_raw_recipe_t)krecipe, &sz); + assert(sz <= max_sz); + + if (kr == KERN_SUCCESS && sz > 0) + kr = copyout(krecipe, (void *)args->recipe, sz); + kfree(krecipe, (vm_size_t)sz); + } + + kr = copyout(&sz, args->recipe_size, sizeof(sz)); + +done: + ipc_voucher_release(voucher); + return kr; +} diff --git a/osfmk/ipc/mach_msg.c b/osfmk/ipc/mach_msg.c index 6c4e472dc..6a5241f71 100644 --- a/osfmk/ipc/mach_msg.c +++ b/osfmk/ipc/mach_msg.c @@ -138,10 +138,12 @@ mach_msg_return_t mach_msg_receive( mach_msg_return_t msg_receive_error( ipc_kmsg_t kmsg, - mach_vm_address_t msg_addr, mach_msg_option_t option, + mach_vm_address_t rcv_addr, + mach_msg_size_t rcv_size, mach_port_seqno_t seqno, - ipc_space_t space); + ipc_space_t space, + mach_msg_size_t *out_size); security_token_t KERNEL_SECURITY_TOKEN = KERNEL_SECURITY_TOKEN_VALUE; audit_token_t KERNEL_AUDIT_TOKEN = KERNEL_AUDIT_TOKEN_VALUE; @@ -185,7 +187,7 @@ mach_msg_send( mach_msg_option_t option, mach_msg_size_t send_size, mach_msg_timeout_t send_timeout, - __unused mach_port_name_t notify) + mach_msg_priority_t override) { ipc_space_t space = current_space(); vm_map_t map = current_map(); @@ -200,13 +202,22 @@ mach_msg_send( if (send_size > MACH_MSG_SIZE_MAX - MAX_TRAILER_SIZE) return MACH_SEND_TOO_LARGE; + KDBG(MACHDBG_CODE(DBG_MACH_IPC,MACH_IPC_KMSG_INFO) | DBG_FUNC_START); + msg_and_trailer_size = send_size + MAX_TRAILER_SIZE; kmsg = ipc_kmsg_alloc(msg_and_trailer_size); - if (kmsg == IKM_NULL) + if (kmsg == IKM_NULL) { + KDBG(MACHDBG_CODE(DBG_MACH_IPC,MACH_IPC_KMSG_INFO) | DBG_FUNC_END, MACH_SEND_NO_BUFFER); return 
MACH_SEND_NO_BUFFER; + } + KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_IPC,MACH_IPC_KMSG_LINK) | DBG_FUNC_NONE, + (uintptr_t)0, /* this should only be called from the kernel! */ + VM_KERNEL_ADDRPERM((uintptr_t)kmsg), + 0, 0, + 0); (void) memcpy((void *) kmsg->ikm_header, (const void *) msg, send_size); kmsg->ikm_header->msgh_size = send_size; @@ -223,10 +234,11 @@ mach_msg_send( trailer->msgh_trailer_type = MACH_MSG_TRAILER_FORMAT_0; trailer->msgh_trailer_size = MACH_MSG_TRAILER_MINIMUM_SIZE; - mr = ipc_kmsg_copyin(kmsg, space, map, &option); + mr = ipc_kmsg_copyin(kmsg, space, map, override, &option); if (mr != MACH_MSG_SUCCESS) { ipc_kmsg_free(kmsg); + KDBG(MACHDBG_CODE(DBG_MACH_IPC,MACH_IPC_KMSG_INFO) | DBG_FUNC_END, mr); return mr; } @@ -237,6 +249,7 @@ mach_msg_send( (void) memcpy((void *) msg, (const void *) kmsg->ikm_header, kmsg->ikm_header->msgh_size); ipc_kmsg_free(kmsg); + KDBG(MACHDBG_CODE(DBG_MACH_IPC,MACH_IPC_KMSG_INFO) | DBG_FUNC_END, mr); } return mr; @@ -278,7 +291,8 @@ typedef struct */ mach_msg_return_t -mach_msg_receive_results(void) +mach_msg_receive_results( + mach_msg_size_t *sizep) { thread_t self = current_thread(); ipc_space_t space = current_space(); @@ -286,47 +300,68 @@ mach_msg_receive_results(void) ipc_object_t object = self->ith_object; mach_msg_return_t mr = self->ith_state; - mach_vm_address_t msg_addr = self->ith_msg_addr; + mach_vm_address_t rcv_addr = self->ith_msg_addr; + mach_msg_size_t rcv_size = self->ith_rsize; mach_msg_option_t option = self->ith_option; ipc_kmsg_t kmsg = self->ith_kmsg; mach_port_seqno_t seqno = self->ith_seqno; + mach_msg_trailer_size_t trailer_size; + mach_msg_size_t size = 0; io_release(object); if (mr != MACH_MSG_SUCCESS) { - if (mr == MACH_RCV_TOO_LARGE ) { - if (option & MACH_RCV_LARGE) { - /* - * We need to inform the user-level code that it needs more - * space. The value for how much space was returned in the - * msize save area instead of the message (which was left on - * the queue). 
- */ - if (option & MACH_RCV_LARGE_IDENTITY) { - if (copyout((char *) &self->ith_receiver_name, - msg_addr + offsetof(mach_msg_user_header_t, msgh_local_port), - sizeof(mach_port_name_t))) - mr = MACH_RCV_INVALID_DATA; - } - if (copyout((char *) &self->ith_msize, - msg_addr + offsetof(mach_msg_user_header_t, msgh_size), - sizeof(mach_msg_size_t))) - mr = MACH_RCV_INVALID_DATA; - } else { - - /* discard importance in message */ - ipc_importance_clean(kmsg); - - if (msg_receive_error(kmsg, msg_addr, option, seqno, space) - == MACH_RCV_INVALID_DATA) - mr = MACH_RCV_INVALID_DATA; - } - } - return mr; + if (mr == MACH_RCV_TOO_LARGE) { + + /* + * If the receive operation occurs with MACH_RCV_LARGE set + * then no message was extracted from the queue, and the size + * and (optionally) receiver names were the only thing captured. + * Just copyout the size (and optional port name) in a fake + * header. + */ + if (option & MACH_RCV_LARGE) { + + if ((option & MACH_RCV_STACK) == 0 && + rcv_size >= offsetof(mach_msg_user_header_t, msgh_reserved)) { + + /* + * We need to inform the user-level code that it needs more + * space. The value for how much space was returned in the + * msize save area instead of the message (which was left on + * the queue). 
+ */ + if (option & MACH_RCV_LARGE_IDENTITY) { + if (copyout((char *) &self->ith_receiver_name, + rcv_addr + offsetof(mach_msg_user_header_t, msgh_local_port), + sizeof(mach_port_name_t))) + mr = MACH_RCV_INVALID_DATA; + } + if (copyout((char *) &self->ith_msize, + rcv_addr + offsetof(mach_msg_user_header_t, msgh_size), + sizeof(mach_msg_size_t))) + mr = MACH_RCV_INVALID_DATA; + } + } else { + + /* discard importance in message */ + ipc_importance_clean(kmsg); + + if (msg_receive_error(kmsg, option, rcv_addr, rcv_size, seqno, space, &size) + == MACH_RCV_INVALID_DATA) + mr = MACH_RCV_INVALID_DATA; + } + } + + if (sizep) + *sizep = size; + return mr; } + /* MACH_MSG_SUCCESS */ + #if IMPORTANCE_INHERITANCE /* adopt/transform any importance attributes carried in the message */ @@ -339,31 +374,61 @@ mach_msg_receive_results(void) trailer_size = ipc_kmsg_add_trailer(kmsg, space, option, self, seqno, FALSE, kmsg->ikm_header->msgh_remote_port->ip_context); + mr = ipc_kmsg_copyout(kmsg, space, map, MACH_MSG_BODY_NULL, option); if (mr != MACH_MSG_SUCCESS) { + /* already received importance, so have to undo that here */ ipc_importance_unreceive(kmsg, option); + /* if we had a body error copyout what we have, otherwise a simple header/trailer */ if ((mr &~ MACH_MSG_MASK) == MACH_RCV_BODY_ERROR) { - if (ipc_kmsg_put(msg_addr, kmsg, kmsg->ikm_header->msgh_size + - trailer_size) == MACH_RCV_INVALID_DATA) + if (ipc_kmsg_put(kmsg, option, rcv_addr, rcv_size, trailer_size, &size) == MACH_RCV_INVALID_DATA) mr = MACH_RCV_INVALID_DATA; - } - else { - if (msg_receive_error(kmsg, msg_addr, option, seqno, space) + } else { + if (msg_receive_error(kmsg, option, rcv_addr, rcv_size, seqno, space, &size) == MACH_RCV_INVALID_DATA) mr = MACH_RCV_INVALID_DATA; } } else { - mr = ipc_kmsg_put(msg_addr, - kmsg, - kmsg->ikm_header->msgh_size + - trailer_size); + /* capture ksmg QoS values to the thread continuation state */ + self->ith_qos = kmsg->ikm_qos; + self->ith_qos_override = 
kmsg->ikm_qos_override; + mr = ipc_kmsg_put(kmsg, option, rcv_addr, rcv_size, trailer_size, &size); } + if (sizep) + *sizep = size; return mr; } +#ifndef _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG +#define _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG 0x02000000 /* pthread event manager bit */ +#endif +#ifndef _PTHREAD_PRIORITY_OVERCOMMIT_FLAG +#define _PTHREAD_PRIORITY_OVERCOMMIT_FLAG 0x80000000 /* request overcommit threads */ +#endif +#ifndef _PTHREAD_PRIORITY_QOS_CLASS_MASK +#define _PTHREAD_PRIORITY_QOS_CLASS_MASK 0x003fff00 /* QoS class mask */ +#endif + +/* JMM - this needs to invoke a pthread function to compute this */ +mach_msg_priority_t +mach_msg_priority_combine(mach_msg_priority_t msg_qos, + mach_msg_priority_t recv_qos) +{ + mach_msg_priority_t overcommit; + mach_msg_priority_t no_oc_qos; + mach_msg_priority_t res; + + assert(msg_qos < _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG); + overcommit = recv_qos & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG; + no_oc_qos = recv_qos & ~overcommit; + res = (no_oc_qos > msg_qos) ? 
no_oc_qos : msg_qos; + res |= overcommit; + + return res; +} /* * Routine: mach_msg_receive [Kernel Internal] @@ -406,22 +471,28 @@ mach_msg_receive( self->ith_msg_addr = CAST_DOWN(mach_vm_address_t, msg); self->ith_object = object; - self->ith_msize = rcv_size; + self->ith_rsize = rcv_size; + self->ith_msize = 0; self->ith_option = option; self->ith_continuation = continuation; ipc_mqueue_receive(mqueue, option, rcv_size, rcv_timeout, THREAD_ABORTSAFE); if ((option & MACH_RCV_TIMEOUT) && rcv_timeout == 0) thread_poll_yield(self); - return mach_msg_receive_results(); + return mach_msg_receive_results(NULL); } void mach_msg_receive_continue(void) { + mach_msg_return_t mr; thread_t self = current_thread(); - (*self->ith_continuation)(mach_msg_receive_results()); + if (self->ith_state == MACH_PEEK_READY) + mr = MACH_PEEK_READY; + else + mr = mach_msg_receive_results(NULL); + (*self->ith_continuation)(mr); } @@ -445,7 +516,7 @@ mach_msg_overwrite_trap( mach_msg_size_t rcv_size = args->rcv_size; mach_port_name_t rcv_name = args->rcv_name; mach_msg_timeout_t msg_timeout = args->timeout; - __unused mach_port_name_t notify = args->notify; + mach_msg_priority_t override = args->override; mach_vm_address_t rcv_msg_addr = args->rcv_msg; __unused mach_port_seqno_t temp_seqno = 0; @@ -459,15 +530,26 @@ mach_msg_overwrite_trap( ipc_space_t space = current_space(); ipc_kmsg_t kmsg; + KDBG(MACHDBG_CODE(DBG_MACH_IPC,MACH_IPC_KMSG_INFO) | DBG_FUNC_START); + mr = ipc_kmsg_get(msg_addr, send_size, &kmsg); - if (mr != MACH_MSG_SUCCESS) + if (mr != MACH_MSG_SUCCESS) { + KDBG(MACHDBG_CODE(DBG_MACH_IPC,MACH_IPC_KMSG_INFO) | DBG_FUNC_END, mr); return mr; + } - mr = ipc_kmsg_copyin(kmsg, space, map, &option); + KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_IPC,MACH_IPC_KMSG_LINK) | DBG_FUNC_NONE, + (uintptr_t)msg_addr, + VM_KERNEL_ADDRPERM((uintptr_t)kmsg), + 0, 0, + 0); + + mr = ipc_kmsg_copyin(kmsg, space, map, override, &option); if (mr != MACH_MSG_SUCCESS) { ipc_kmsg_free(kmsg); + 
KDBG(MACHDBG_CODE(DBG_MACH_IPC,MACH_IPC_KMSG_INFO) | DBG_FUNC_END, mr); return mr; } @@ -475,7 +557,8 @@ mach_msg_overwrite_trap( if (mr != MACH_MSG_SUCCESS) { mr |= ipc_kmsg_copyout_pseudo(kmsg, space, map, MACH_MSG_BODY_NULL); - (void) ipc_kmsg_put(msg_addr, kmsg, kmsg->ikm_header->msgh_size); + (void) ipc_kmsg_put(kmsg, option, msg_addr, send_size, 0, NULL); + KDBG(MACHDBG_CODE(DBG_MACH_IPC,MACH_IPC_KMSG_INFO) | DBG_FUNC_END, mr); return mr; } @@ -498,7 +581,8 @@ mach_msg_overwrite_trap( else self->ith_msg_addr = msg_addr; self->ith_object = object; - self->ith_msize = rcv_size; + self->ith_rsize = rcv_size; + self->ith_msize = 0; self->ith_option = option; self->ith_receiver_name = MACH_PORT_NULL; self->ith_continuation = thread_syscall_return; @@ -506,7 +590,7 @@ mach_msg_overwrite_trap( ipc_mqueue_receive(mqueue, option, rcv_size, msg_timeout, THREAD_ABORTSAFE); if ((option & MACH_RCV_TIMEOUT) && msg_timeout == 0) thread_poll_yield(self); - return mach_msg_receive_results(); + return mach_msg_receive_results(NULL); } return MACH_MSG_SUCCESS; @@ -542,6 +626,8 @@ mach_msg_trap( * MACH_RCV_TOO_LARGE or MACH_RCV_BODY_ERROR error. * Conditions: * Nothing locked. + * size - maximum buffer size on input, + * actual copied-out size on output * Returns: * MACH_MSG_SUCCESS minimal header/trailer copied * MACH_RCV_INVALID_DATA copyout to user buffer failed @@ -550,10 +636,12 @@ mach_msg_trap( mach_msg_return_t msg_receive_error( ipc_kmsg_t kmsg, - mach_vm_address_t msg_addr, mach_msg_option_t option, + mach_vm_address_t rcv_addr, + mach_msg_size_t rcv_size, mach_port_seqno_t seqno, - ipc_space_t space) + ipc_space_t space, + mach_msg_size_t *sizep) { mach_vm_address_t context; mach_msg_trailer_size_t trailer_size; @@ -583,10 +671,11 @@ msg_receive_error( TRUE, context); /* - * Copy the message to user space + * Copy the message to user space and return the size + * (note that ipc_kmsg_put may also adjust the actual + * size copied out to user-space). 
*/ - if (ipc_kmsg_put(msg_addr, kmsg, kmsg->ikm_header->msgh_size + - trailer_size) == MACH_RCV_INVALID_DATA) + if (ipc_kmsg_put(kmsg, option, rcv_addr, rcv_size, trailer_size, sizep) == MACH_RCV_INVALID_DATA) return(MACH_RCV_INVALID_DATA); else return(MACH_MSG_SUCCESS); diff --git a/osfmk/ipc/mach_port.c b/osfmk/ipc/mach_port.c index c606f8147..d15991ebf 100644 --- a/osfmk/ipc/mach_port.c +++ b/osfmk/ipc/mach_port.c @@ -250,7 +250,7 @@ mach_port_names( vm_map_copy_t memory2; /* copied-in memory, for types */ /* safe simplifying assumption */ - assert_static(sizeof(mach_port_name_t) == sizeof(mach_port_type_t)); + static_assert(sizeof(mach_port_name_t) == sizeof(mach_port_type_t)); if (space == IS_NULL) return KERN_INVALID_TASK; @@ -1022,7 +1022,7 @@ mach_port_peek( /* Port locked and active */ found = ipc_mqueue_peek(&port->ip_messages, seqnop, - msg_sizep, msg_idp, &max_trailer); + msg_sizep, msg_idp, &max_trailer, NULL); ip_unlock(port); if (found != TRUE) @@ -1390,8 +1390,7 @@ mach_port_move_member( */ wq_link_id = waitq_link_reserve(NULL); wq_reserved_prepost = waitq_prepost_reserve(NULL, 10, - WAITQ_DONT_LOCK, - NULL); + WAITQ_DONT_LOCK); } kr = ipc_right_lookup_read(space, member, &entry); @@ -1428,6 +1427,7 @@ mach_port_move_member( assert(nset != IPS_NULL); } ip_lock(port); + assert(ip_active(port)); ipc_pset_remove_from_all(port); if (nset != IPS_NULL) { @@ -2042,7 +2042,7 @@ mach_port_insert_member( wq_link_id = waitq_link_reserve(NULL); wq_reserved_prepost = waitq_prepost_reserve(NULL, 10, - WAITQ_DONT_LOCK, NULL); + WAITQ_DONT_LOCK); kr = ipc_object_translate_two(space, name, MACH_PORT_RIGHT_RECEIVE, &obj, @@ -2221,25 +2221,14 @@ mach_port_unguard_locked( */ kern_return_t mach_port_guard_exception( - mach_port_name_t name, - uint64_t inguard, - uint64_t portguard, - unsigned reason) + mach_port_name_t name, + __unused uint64_t inguard, + uint64_t portguard, + unsigned reason) { thread_t t = current_thread(); uint64_t code, subcode; - /* Log exception 
info to syslog */ - printf( "Mach Port Guard Exception - " - "Thread: 0x%x, " - "Port Name: 0x%x, " - "Expected Guard: 0x%x, " - "Received Guard: 0x%x\n", - (unsigned)VM_KERNEL_UNSLIDE_OR_PERM(t), - (unsigned)name, - (unsigned)portguard, - (unsigned)inguard); - /* * EXC_GUARD namespace for mach ports * diff --git a/osfmk/kdp/Makefile b/osfmk/kdp/Makefile index b36e5dce9..7c1535007 100644 --- a/osfmk/kdp/Makefile +++ b/osfmk/kdp/Makefile @@ -20,5 +20,3 @@ EXPORT_MI_DIR = kdp include $(MakeInc_rule) include $(MakeInc_dir) - - diff --git a/osfmk/kdp/kdp.c b/osfmk/kdp/kdp.c index a397a9843..3875ba4e6 100644 --- a/osfmk/kdp/kdp.c +++ b/osfmk/kdp/kdp.c @@ -143,9 +143,13 @@ kdp_packet( size_t plen = *len; kdp_req_t req; boolean_t ret; - + #if DO_ALIGN - bcopy((char *)pkt, (char *)rd, sizeof(aligned_pkt)); + if (plen > sizeof(aligned_pkt)) { + printf("kdp_packet bad len %lu\n", plen); + return FALSE; + } + bcopy((char *)pkt, (char *)rd, plen); #else rd = (kdp_pkt_t *)pkt; #endif diff --git a/osfmk/kdp/kdp_core.c b/osfmk/kdp/kdp_core.c index 8820b2a73..02febf2f1 100644 --- a/osfmk/kdp/kdp_core.c +++ b/osfmk/kdp/kdp_core.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -51,6 +52,9 @@ #endif /* defined(__i386__) || defined(__x86_64__) */ +#if WITH_CONSISTENT_DBG +#include +#endif /* WITH_CONSISTENT_DBG */ typedef int (*pmap_traverse_callback)(vm_map_offset_t start, vm_map_offset_t end, @@ -117,6 +121,51 @@ static size_t kdp_core_zsize; static size_t kdp_core_zoffset; static z_stream kdp_core_zs; +static uint64_t kdp_core_total_size; +static uint64_t kdp_core_total_size_sent_uncomp; +#if WITH_CONSISTENT_DBG +struct xnu_hw_shmem_dbg_command_info *hwsd_info = NULL; + +#define KDP_CORE_HW_SHMEM_DBG_NUM_BUFFERS 2 +#define KDP_CORE_HW_SHMEM_DBG_TOTAL_BUF_SIZE 64 * 1024 + +/* + * Astris can read up to 4064 bytes at a time over + * the probe, so we should try to make our buffer + * size a multiple of this to make reads by astris + * (the 
bottleneck) most efficient. + */ +#define OPTIMAL_ASTRIS_READSIZE 4064 + +struct kdp_hw_shmem_dbg_buf_elm { + vm_offset_t khsd_buf; + uint32_t khsd_data_length; + STAILQ_ENTRY(kdp_hw_shmem_dbg_buf_elm) khsd_elms; +}; + +static STAILQ_HEAD(, kdp_hw_shmem_dbg_buf_elm) free_hw_shmem_dbg_bufs = + STAILQ_HEAD_INITIALIZER(free_hw_shmem_dbg_bufs); +static STAILQ_HEAD(, kdp_hw_shmem_dbg_buf_elm) hw_shmem_dbg_bufs_to_flush = + STAILQ_HEAD_INITIALIZER(hw_shmem_dbg_bufs_to_flush); + +static struct kdp_hw_shmem_dbg_buf_elm *currently_filling_buf = NULL; +static struct kdp_hw_shmem_dbg_buf_elm *currently_flushing_buf = NULL; + +static uint32_t kdp_hw_shmem_dbg_bufsize = 0; + +static uint32_t kdp_hw_shmem_dbg_seq_no = 0; +static uint64_t kdp_hw_shmem_dbg_contact_deadline = 0; +static uint64_t kdp_hw_shmem_dbg_contact_deadline_interval = 0; + +#define KDP_HW_SHMEM_DBG_TIMEOUT_DEADLINE_SECS 30 +#endif /* WITH_CONSISTENT_DBG */ + +/* + * These variables will be modified by the BSD layer if the root device is + * a RAMDisk. + */ +uint64_t kdp_core_ramdisk_addr = 0; +uint64_t kdp_core_ramdisk_size = 0; #define DEBG kdb_printf @@ -125,6 +174,255 @@ boolean_t kdp_has_polled_corefile(void) return (NULL != gIOPolledCoreFileVars); } +#if WITH_CONSISTENT_DBG +/* + * Whenever we start a coredump, make sure the buffers + * are all on the free queue and the state is as expected. + * The buffers may have been left in a different state if + * a previous coredump attempt failed. 
+ */ +static void +kern_dump_hw_shmem_dbg_reset() +{ + struct kdp_hw_shmem_dbg_buf_elm *cur_elm = NULL, *tmp_elm = NULL; + + STAILQ_FOREACH(cur_elm, &free_hw_shmem_dbg_bufs, khsd_elms) { + cur_elm->khsd_data_length = 0; + } + + if (currently_filling_buf != NULL) { + currently_filling_buf->khsd_data_length = 0; + + STAILQ_INSERT_HEAD(&free_hw_shmem_dbg_bufs, currently_filling_buf, khsd_elms); + currently_filling_buf = NULL; + } + + if (currently_flushing_buf != NULL) { + currently_flushing_buf->khsd_data_length = 0; + + STAILQ_INSERT_HEAD(&free_hw_shmem_dbg_bufs, currently_flushing_buf, khsd_elms); + currently_flushing_buf = NULL; + } + + STAILQ_FOREACH_SAFE(cur_elm, &hw_shmem_dbg_bufs_to_flush, khsd_elms, tmp_elm) { + cur_elm->khsd_data_length = 0; + + STAILQ_REMOVE(&hw_shmem_dbg_bufs_to_flush, cur_elm, kdp_hw_shmem_dbg_buf_elm, khsd_elms); + STAILQ_INSERT_HEAD(&free_hw_shmem_dbg_bufs, cur_elm, khsd_elms); + } + + hwsd_info->xhsdci_status = XHSDCI_COREDUMP_BUF_EMPTY; + kdp_hw_shmem_dbg_seq_no = 0; + hwsd_info->xhsdci_buf_phys_addr = 0; + hwsd_info->xhsdci_buf_data_length = 0; + hwsd_info->xhsdci_coredump_total_size_uncomp = 0; + hwsd_info->xhsdci_coredump_total_size_sent_uncomp = 0; + hwsd_info->xhsdci_page_size = PAGE_SIZE; + FlushPoC_DcacheRegion((vm_offset_t) hwsd_info, sizeof(*hwsd_info)); + + kdp_hw_shmem_dbg_contact_deadline = mach_absolute_time() + kdp_hw_shmem_dbg_contact_deadline_interval; +} + +/* + * Tries to move buffers forward in 'progress'. If + * the hardware debugger is done consuming the current buffer, we + * can put the next one on it and move the current + * buffer back to the free queue. 
+ */ +static int +kern_dump_hw_shmem_dbg_process_buffers() +{ + FlushPoC_DcacheRegion((vm_offset_t) hwsd_info, sizeof(*hwsd_info)); + if (hwsd_info->xhsdci_status == XHSDCI_COREDUMP_ERROR) { + kdb_printf("Detected remote error, terminating...\n"); + return -1; + } else if (hwsd_info->xhsdci_status == XHSDCI_COREDUMP_BUF_EMPTY) { + if (hwsd_info->xhsdci_seq_no != (kdp_hw_shmem_dbg_seq_no + 1)) { + kdb_printf("Detected stale/invalid seq num. Expected: %d, received %d\n", + (kdp_hw_shmem_dbg_seq_no + 1), hwsd_info->xhsdci_seq_no); + hwsd_info->xhsdci_status = XHSDCI_COREDUMP_ERROR; + FlushPoC_DcacheRegion((vm_offset_t) hwsd_info, sizeof(*hwsd_info)); + return -1; + } + + kdp_hw_shmem_dbg_seq_no = hwsd_info->xhsdci_seq_no; + + if (currently_flushing_buf != NULL) { + currently_flushing_buf->khsd_data_length = 0; + STAILQ_INSERT_TAIL(&free_hw_shmem_dbg_bufs, currently_flushing_buf, khsd_elms); + } + + currently_flushing_buf = STAILQ_FIRST(&hw_shmem_dbg_bufs_to_flush); + if (currently_flushing_buf != NULL) { + STAILQ_REMOVE_HEAD(&hw_shmem_dbg_bufs_to_flush, khsd_elms); + + FlushPoC_DcacheRegion((vm_offset_t) hwsd_info, sizeof(*hwsd_info)); + hwsd_info->xhsdci_buf_phys_addr = kvtophys(currently_flushing_buf->khsd_buf); + hwsd_info->xhsdci_buf_data_length = currently_flushing_buf->khsd_data_length; + hwsd_info->xhsdci_coredump_total_size_uncomp = kdp_core_total_size; + hwsd_info->xhsdci_coredump_total_size_sent_uncomp = kdp_core_total_size_sent_uncomp; + FlushPoC_DcacheRegion((vm_offset_t) hwsd_info, KDP_CORE_HW_SHMEM_DBG_TOTAL_BUF_SIZE); + hwsd_info->xhsdci_seq_no = ++kdp_hw_shmem_dbg_seq_no; + hwsd_info->xhsdci_status = XHSDCI_COREDUMP_BUF_READY; + FlushPoC_DcacheRegion((vm_offset_t) hwsd_info, sizeof(*hwsd_info)); + } + + kdp_hw_shmem_dbg_contact_deadline = mach_absolute_time() + + kdp_hw_shmem_dbg_contact_deadline_interval; + + return 0; + } else if (mach_absolute_time() > kdp_hw_shmem_dbg_contact_deadline) { + kdb_printf("Kernel timed out waiting for hardware debugger 
to update handshake structure."); + kdb_printf(" No contact in %d seconds\n", KDP_HW_SHMEM_DBG_TIMEOUT_DEADLINE_SECS); + + hwsd_info->xhsdci_status = XHSDCI_COREDUMP_ERROR; + FlushPoC_DcacheRegion((vm_offset_t) hwsd_info, sizeof(*hwsd_info)); + return -1; + } + + return 0; +} + +/* + * Populates currently_filling_buf with a new buffer + * once one becomes available. Returns 0 on success + * or the value returned by kern_dump_hw_shmem_dbg_process_buffers() + * if it is non-zero (an error). + */ +static int +kern_dump_hw_shmem_dbg_get_buffer() +{ + int ret = 0; + + assert(currently_filling_buf == NULL); + + while (STAILQ_EMPTY(&free_hw_shmem_dbg_bufs)) { + ret = kern_dump_hw_shmem_dbg_process_buffers(); + if (ret) { + return ret; + } + } + + currently_filling_buf = STAILQ_FIRST(&free_hw_shmem_dbg_bufs); + STAILQ_REMOVE_HEAD(&free_hw_shmem_dbg_bufs, khsd_elms); + + assert(currently_filling_buf->khsd_data_length == 0); + return ret; +} + +/* + * Output procedure for hardware shared memory core dumps + * + * Tries to fill up the buffer completely before flushing + */ +static int +kern_dump_hw_shmem_dbg_buffer_proc(unsigned int request, __unused char *corename, + uint64_t length, void * data) +{ + int ret = 0; + + assert(length < UINT32_MAX); + uint32_t bytes_remaining = (uint32_t) length; + uint32_t bytes_to_copy; + + if (request == KDP_EOF) { + assert(currently_filling_buf == NULL); + + /* + * Wait until we've flushed all the buffers + * before setting the connection status to done. + */ + while (!STAILQ_EMPTY(&hw_shmem_dbg_bufs_to_flush) || + currently_flushing_buf != NULL) { + ret = kern_dump_hw_shmem_dbg_process_buffers(); + if (ret) { + return ret; + } + } + + /* + * If the last status we saw indicates that the buffer was + * empty and we didn't flush any new data since then, we expect + * the sequence number to still match the last we saw. + */ + if (hwsd_info->xhsdci_seq_no < kdp_hw_shmem_dbg_seq_no) { + kdb_printf("EOF Flush: Detected stale/invalid seq num. 
Expected: %d, received %d\n", + kdp_hw_shmem_dbg_seq_no, hwsd_info->xhsdci_seq_no); + return -1; + } + + kdp_hw_shmem_dbg_seq_no = hwsd_info->xhsdci_seq_no; + + kdb_printf("Setting coredump status as done!\n"); + hwsd_info->xhsdci_seq_no = ++kdp_hw_shmem_dbg_seq_no; + hwsd_info->xhsdci_status = XHSDCI_COREDUMP_STATUS_DONE; + FlushPoC_DcacheRegion((vm_offset_t) hwsd_info, sizeof(*hwsd_info)); + + return ret; + } + + assert(request == KDP_DATA); + + /* + * The output procedure is called with length == 0 and data == NULL + * to flush any remaining output at the end of the coredump before + * we call it a final time to mark the dump as done. + */ + if (length == 0) { + assert(data == NULL); + + if (currently_filling_buf != NULL) { + STAILQ_INSERT_TAIL(&hw_shmem_dbg_bufs_to_flush, currently_filling_buf, khsd_elms); + currently_filling_buf = NULL; + } + + /* + * Move the current buffer along if possible. + */ + ret = kern_dump_hw_shmem_dbg_process_buffers(); + return ret; + } + + while (bytes_remaining != 0) { + /* + * Make sure we have a buffer to work with. + */ + while (currently_filling_buf == NULL) { + ret = kern_dump_hw_shmem_dbg_get_buffer(); + if (ret) { + return ret; + } + } + + assert(kdp_hw_shmem_dbg_bufsize >= currently_filling_buf->khsd_data_length); + bytes_to_copy = MIN(bytes_remaining, kdp_hw_shmem_dbg_bufsize - + currently_filling_buf->khsd_data_length); + bcopy(data, (void *)(currently_filling_buf->khsd_buf + currently_filling_buf->khsd_data_length), + bytes_to_copy); + + currently_filling_buf->khsd_data_length += bytes_to_copy; + + if (currently_filling_buf->khsd_data_length == kdp_hw_shmem_dbg_bufsize) { + STAILQ_INSERT_TAIL(&hw_shmem_dbg_bufs_to_flush, currently_filling_buf, khsd_elms); + currently_filling_buf = NULL; + + /* + * Move it along if possible. 
+ */ + ret = kern_dump_hw_shmem_dbg_process_buffers(); + if (ret) { + return ret; + } + } + + bytes_remaining -= bytes_to_copy; + data = (void *) ((uintptr_t)data + bytes_to_copy); + } + + return ret; +} +#endif /* WITH_CONSISTENT_DBG */ + static IOReturn kern_dump_disk_proc(unsigned int request, __unused char *corename, uint64_t length, void * data) @@ -163,6 +461,9 @@ kern_dump_disk_proc(unsigned int request, __unused char *corename, return (err); } +/* + * flushes any data to the output proc immediately + */ static int kdp_core_zoutput(z_streamp strm, Bytef *buf, unsigned len) { @@ -183,6 +484,9 @@ kdp_core_zoutput(z_streamp strm, Bytef *buf, unsigned len) return (len); } +/* + * tries to fill the buffer with data before flushing it via the output proc. + */ static int kdp_core_zoutputbuf(z_streamp strm, Bytef *inbuf, unsigned inlen) { @@ -200,10 +504,14 @@ kdp_core_zoutputbuf(z_streamp strm, Bytef *inbuf, unsigned inlen) { chunk = vars->outremain; if (chunk > remain) chunk = remain; - bcopy(inbuf, &vars->outbuf[vars->outlen - vars->outremain], chunk); + if (!inbuf) bzero(&vars->outbuf[vars->outlen - vars->outremain], chunk); + else + { + bcopy(inbuf, &vars->outbuf[vars->outlen - vars->outremain], chunk); + inbuf += chunk; + } vars->outremain -= chunk; remain -= chunk; - inbuf += chunk; if (vars->outremain && !flush) break; if ((ret = (*vars->outproc)(KDP_DATA, NULL, @@ -227,7 +535,7 @@ static int kdp_core_zinput(z_streamp strm, Bytef *buf, unsigned size) { struct kdp_core_out_vars * vars = (typeof(vars)) strm->opaque; - uint64_t percent; + uint64_t percent, total_in = 0; unsigned len; len = strm->avail_in; @@ -244,11 +552,14 @@ kdp_core_zinput(z_streamp strm, Bytef *buf, unsigned size) if (0 == (511 & vars->writes++)) { - percent = (strm->total_in * 100) / vars->totalbytes; + total_in = strm->total_in; + kdp_core_total_size_sent_uncomp = strm->total_in; + + percent = (total_in * 100) / vars->totalbytes; if ((percent - vars->lastpercent) >= 10) { 
vars->lastpercent = percent; - DEBG("%lld..", percent); + DEBG("%lld..\n", percent); } } @@ -256,55 +567,84 @@ kdp_core_zinput(z_streamp strm, Bytef *buf, unsigned size) } static IOReturn -kdp_core_stream_output(struct kdp_core_out_vars * vars, uint64_t length, void * data) +kdp_core_stream_output_chunk(struct kdp_core_out_vars * vars, unsigned length, void * data) { z_stream * zs; int zr; boolean_t flush; - flush = (!length && !data); - zr = Z_OK; - zs = &kdp_core_zs; - assert(!zs->avail_in); - while (vars->error >= 0) + if (kdp_corezip_disabled) { - if (!zs->avail_in && !flush) - { - if (!length) break; - zs->next_in = data ? data : (Bytef *) zs /* zero marker */; - zs->avail_in = (uInt)length; - length = 0; - } - if (!zs->avail_out) - { - zs->next_out = (Bytef *) zs; - zs->avail_out = UINT32_MAX; - } - zr = deflate(zs, flush ? Z_FINISH : Z_NO_FLUSH); - if (Z_STREAM_END == zr) break; - if (zr != Z_OK) + (*vars->zoutput)(zs, data, length); + } + else + { + + flush = (!length && !data); + zr = Z_OK; + + assert(!zs->avail_in); + + while (vars->error >= 0) { - DEBG("ZERR %d\n", zr); - vars->error = zr; + if (!zs->avail_in && !flush) + { + if (!length) break; + zs->next_in = data ? data : (Bytef *) zs /* zero marker */; + zs->avail_in = length; + length = 0; + } + if (!zs->avail_out) + { + zs->next_out = (Bytef *) zs; + zs->avail_out = UINT32_MAX; + } + zr = deflate(zs, flush ? 
Z_FINISH : Z_NO_FLUSH); + if (Z_STREAM_END == zr) break; + if (zr != Z_OK) + { + DEBG("ZERR %d\n", zr); + vars->error = zr; + } } - } - if (flush) (*vars->zoutput)(zs, NULL, 0); + if (flush) (*vars->zoutput)(zs, NULL, 0); + } return (vars->error); } +static IOReturn +kdp_core_stream_output(struct kdp_core_out_vars * vars, uint64_t length, void * data) +{ + IOReturn err; + unsigned int chunk; + enum { kMaxZLibChunk = 1024*1024*1024 }; + + do + { + if (length <= kMaxZLibChunk) chunk = (typeof(chunk)) length; + else chunk = kMaxZLibChunk; + err = kdp_core_stream_output_chunk(vars, chunk, data); + + length -= chunk; + if (data) data = (void *) (((uintptr_t) data) + chunk); + } + while (length && (kIOReturnSuccess == err)); + + return (err); +} + extern vm_offset_t c_buffers; extern vm_size_t c_buffers_size; ppnum_t -kernel_pmap_present_mapping(uint64_t vaddr, uint64_t * pvincr) +kernel_pmap_present_mapping(uint64_t vaddr, uint64_t * pvincr, uintptr_t * pvphysaddr) { - ppnum_t ppn; - uint64_t vincr; - vincr = PAGE_SIZE_64; + ppnum_t ppn = 0; + uint64_t vincr = PAGE_SIZE_64; assert(!(vaddr & PAGE_MASK_64)); @@ -321,10 +661,23 @@ kernel_pmap_present_mapping(uint64_t vaddr, uint64_t * pvincr) ppn = 0; vincr = kdp_core_zsize; } + else if ((kdp_core_ramdisk_addr != 0) && (vaddr == kdp_core_ramdisk_addr)) + { + ppn = 0; + vincr = kdp_core_ramdisk_size; + } else ppn = pmap_find_phys(kernel_pmap, vaddr); - *pvincr = vincr; + *pvincr = round_page_64(vincr); + + if (ppn && pvphysaddr) + { + uint64_t phys = ptoa_64(ppn); + if (physmap_enclosed(phys)) *pvphysaddr = PHYSMAP_PTOV(phys); + else ppn = 0; + } + return (ppn); } @@ -358,7 +711,7 @@ pmap_traverse_present_mappings(pmap_t __unused pmap, for (vcur = vcurstart = start; (ret == KERN_SUCCESS) && (vcur < end); ) { ppnum_t ppn; - ppn = kernel_pmap_present_mapping(vcur, &vincr); + ppn = kernel_pmap_present_mapping(vcur, &vincr, NULL); if (ppn != 0) { if (((vcur < debug_start) || (vcur >= debug_end)) @@ -477,7 +830,7 @@ 
kern_dump_pmap_traverse_send_segdata_callback(vm_map_offset_t start, kdc->region_count++; kdc->dumpable_bytes += size; - if ((ret = kdp_core_stream_output(kdc->outvars, (unsigned int)size, (caddr_t)(uintptr_t)start)) != kIOReturnSuccess) { + if ((ret = kdp_core_stream_output(kdc->outvars, size, (caddr_t)(uintptr_t)start)) != kIOReturnSuccess) { DEBG("kdp_core_stream_output(0x%x)\n", ret); goto out; } @@ -488,77 +841,92 @@ kern_dump_pmap_traverse_send_segdata_callback(vm_map_offset_t start, } static int -do_kern_dump(kern_dump_output_proc outproc, bool local) +do_kern_dump(kern_dump_output_proc outproc, enum kern_dump_type kd_variant) { - struct kern_dump_preflight_context kdc_preflight; - struct kern_dump_send_context kdc_sendseg; - struct kern_dump_send_context kdc_send; - struct kdp_core_out_vars outvars; - struct mach_core_fileheader hdr; - kernel_mach_header_t mh; - uint32_t segment_count, tstate_count; - size_t command_size = 0, header_size = 0, tstate_size = 0; - uint64_t hoffset, foffset; - int ret; - char * log_start; - uint64_t log_length; - uint64_t new_logs; - boolean_t opened; - - opened = false; - log_start = debug_buf_ptr; - log_length = 0; - if (log_start >= debug_buf_addr) - { - log_length = log_start - debug_buf_addr; - if (log_length <= debug_buf_size) log_length = debug_buf_size - log_length; - else log_length = 0; - } + struct kern_dump_preflight_context kdc_preflight = { }; + struct kern_dump_send_context kdc_sendseg = { }; + struct kern_dump_send_context kdc_send = { }; + struct kdp_core_out_vars outvars = { }; + struct mach_core_fileheader hdr = { }; + struct ident_command ident = { }; + kernel_mach_header_t mh = { }; + + uint32_t segment_count = 0, tstate_count = 0; + size_t command_size = 0, header_size = 0, tstate_size = 0; + uint64_t hoffset = 0, foffset = 0; + int ret = 0; + char * log_start; + char * buf; + size_t log_size; + uint64_t new_logs = 0; + boolean_t opened; + + opened = false; + log_start = debug_buf_ptr; + log_size = 
debug_buf_ptr - debug_buf_addr; + assert (log_size <= debug_buf_size); + if (debug_buf_stackshot_start) + { + assert(debug_buf_stackshot_end >= debug_buf_stackshot_start); + log_size -= (debug_buf_stackshot_end - debug_buf_stackshot_start); + } - if (local) - { - if ((ret = (*outproc)(KDP_WRQ, NULL, 0, &hoffset)) != kIOReturnSuccess) { - DEBG("KDP_WRQ(0x%x)\n", ret); - goto out; + if (kd_variant == KERN_DUMP_DISK) + { + if ((ret = (*outproc)(KDP_WRQ, NULL, 0, &hoffset)) != kIOReturnSuccess) { + DEBG("KDP_WRQ(0x%x)\n", ret); + goto out; + } + } + opened = true; + + // init gzip + bzero(&outvars, sizeof(outvars)); + bzero(&hdr, sizeof(hdr)); + outvars.outproc = outproc; + + /* + * Initialize zstream variables that point to input and output + * buffer info. + */ + kdp_core_zs.avail_in = 0; + kdp_core_zs.next_in = NULL; + kdp_core_zs.avail_out = 0; + kdp_core_zs.next_out = NULL; + kdp_core_zs.opaque = &outvars; + kdc_sendseg.outvars = &outvars; + kdc_send.outvars = &outvars; + + enum { kHdrOffset = 4096, kMaxCoreLog = 16384 }; + + if (kd_variant == KERN_DUMP_DISK) { + outvars.outbuf = NULL; + outvars.outlen = 0; + outvars.outremain = 0; + outvars.zoutput = kdp_core_zoutput; + // space for file header, panic log, core log + foffset = (kHdrOffset + log_size + kMaxCoreLog + 4095) & ~4095ULL; + hdr.log_offset = kHdrOffset; + hdr.gzip_offset = foffset; + if ((ret = (*outproc)(KDP_SEEK, NULL, sizeof(foffset), &foffset)) != kIOReturnSuccess) { + DEBG("KDP_SEEK(0x%x)\n", ret); + goto out; + } + } else if (kd_variant == KERN_DUMP_NET) { + outvars.outbuf = (Bytef *) (kdp_core_zmem + kdp_core_zoffset); + assert((kdp_core_zoffset + kdp_crashdump_pkt_size) <= kdp_core_zsize); + outvars.outlen = kdp_crashdump_pkt_size; + outvars.outremain = outvars.outlen; + outvars.zoutput = kdp_core_zoutputbuf; +#if WITH_CONSISTENT_DBG + } else { /* KERN_DUMP_HW_SHMEM_DBG */ + outvars.outbuf = NULL; + outvars.outlen = 0; + outvars.outremain = 0; + outvars.zoutput = kdp_core_zoutput; + 
kern_dump_hw_shmem_dbg_reset(); +#endif } - } - opened = true; - - // init gzip - bzero(&outvars, sizeof(outvars)); - bzero(&hdr, sizeof(hdr)); - outvars.outproc = outproc; - kdp_core_zs.avail_in = 0; - kdp_core_zs.next_in = NULL; - kdp_core_zs.avail_out = 0; - kdp_core_zs.next_out = NULL; - kdp_core_zs.opaque = &outvars; - kdc_sendseg.outvars = &outvars; - kdc_send.outvars = &outvars; - - if (local) - { - outvars.outbuf = NULL; - outvars.outlen = 0; - outvars.outremain = 0; - outvars.zoutput = kdp_core_zoutput; - // space for file header & log - foffset = (4096 + log_length + 4095) & ~4095ULL; - hdr.log_offset = 4096; - hdr.gzip_offset = foffset; - if ((ret = (*outproc)(KDP_SEEK, NULL, sizeof(foffset), &foffset)) != kIOReturnSuccess) { - DEBG("KDP_SEEK(0x%x)\n", ret); - goto out; - } - } - else - { - outvars.outbuf = (Bytef *) (kdp_core_zmem + kdp_core_zoffset); - assert((kdp_core_zoffset + kdp_crashdump_pkt_size) <= kdp_core_zsize); - outvars.outlen = kdp_crashdump_pkt_size; - outvars.outremain = outvars.outlen; - outvars.zoutput = kdp_core_zoutputbuf; - } deflateResetWithIO(&kdp_core_zs, kdp_core_zinput, outvars.zoutput); @@ -581,9 +949,14 @@ do_kern_dump(kern_dump_output_proc outproc, bool local) assert(outvars.totalbytes); segment_count = kdc_preflight.region_count; + kdp_core_total_size = outvars.totalbytes; + kdp_core_total_size_sent_uncomp = 0; + kern_collectth_state_size(&tstate_count, &tstate_size); - command_size = segment_count * sizeof(kernel_segment_command_t) + tstate_count * tstate_size; + command_size = segment_count * sizeof(kernel_segment_command_t) + + tstate_count * tstate_size + + sizeof(struct ident_command) + sizeof(kdp_kernelversion_string); header_size = command_size + sizeof(kernel_mach_header_t); @@ -595,7 +968,7 @@ do_kern_dump(kern_dump_output_proc outproc, bool local) mh.cputype = _mh_execute_header.cputype;; mh.cpusubtype = _mh_execute_header.cpusubtype; mh.filetype = MH_CORE; - mh.ncmds = segment_count + tstate_count; + mh.ncmds = 
segment_count + tstate_count + 1; mh.sizeofcmds = (uint32_t)command_size; mh.flags = 0; #if defined(__LP64__) @@ -615,7 +988,7 @@ do_kern_dump(kern_dump_output_proc outproc, bool local) hoffset += sizeof(kernel_mach_header_t); - DEBG("%s", local ? "Writing local kernel core..." : + DEBG("%s", (kd_variant == KERN_DUMP_DISK) ? "Writing local kernel core..." : "Transmitting kernel state, please wait:\n"); kdc_sendseg.region_count = 0; @@ -659,6 +1032,17 @@ do_kern_dump(kern_dump_output_proc outproc, bool local) while (iter); } + ident.cmd = LC_IDENT; + ident.cmdsize = (uint32_t) (sizeof(struct ident_command) + sizeof(kdp_kernelversion_string)); + if ((ret = kdp_core_stream_output(&outvars, sizeof(ident), &ident)) != kIOReturnSuccess) { + DEBG("kdp_core_stream_output(0x%x)\n", ret); + goto out; + } + if ((ret = kdp_core_stream_output(&outvars, sizeof(kdp_kernelversion_string), &kdp_kernelversion_string[0])) != kIOReturnSuccess) { + DEBG("kdp_core_stream_output(0x%x)\n", ret); + goto out; + } + kdc_send.region_count = 0; kdc_send.dumpable_bytes = 0; foffset = (uint64_t) round_page(header_size); /* offset into file */ @@ -667,11 +1051,11 @@ do_kern_dump(kern_dump_output_proc outproc, bool local) foffset = round_page_64(header_size) - header_size; if (foffset) { - // zero fill to page align - if ((ret = kdp_core_stream_output(&outvars, foffset, NULL)) != kIOReturnSuccess) { - DEBG("kdp_core_stream_output(0x%x)\n", ret); - goto out; - } + // zero fill to page align + if ((ret = kdp_core_stream_output(&outvars, foffset, NULL)) != kIOReturnSuccess) { + DEBG("kdp_core_stream_output(0x%x)\n", ret); + goto out; + } } ret = pmap_traverse_present_mappings(kernel_pmap, @@ -697,27 +1081,37 @@ do_kern_dump(kern_dump_output_proc outproc, bool local) DEBG("Region counts: [%u, %u, %u]\n", kdc_preflight.region_count, kdc_sendseg.region_count, kdc_send.region_count); - DEBG("Byte counts : [%llu, %llu, %llu, %lu, %llu]\n", kdc_preflight.dumpable_bytes, + DEBG("Byte counts : [%llu, %llu, 
%llu, %lu, %lu]\n", kdc_preflight.dumpable_bytes, kdc_sendseg.dumpable_bytes, kdc_send.dumpable_bytes, - outvars.zipped, log_length); - if (local && opened) + outvars.zipped, + (long) (debug_buf_ptr - debug_buf_addr)); + if ((kd_variant == KERN_DUMP_DISK) && opened) { // write debug log - foffset = 4096; + foffset = kHdrOffset; if ((ret = (*outproc)(KDP_SEEK, NULL, sizeof(foffset), &foffset)) != kIOReturnSuccess) { DEBG("KDP_SEEK(0x%x)\n", ret); goto exit; - } + } - new_logs = debug_buf_ptr - log_start; - if (new_logs > log_length) new_logs = log_length; - - if ((ret = (*outproc)(KDP_DATA, NULL, new_logs, log_start)) != kIOReturnSuccess) - { - DEBG("KDP_DATA(0x%x)\n", ret); - goto exit; - } + new_logs = debug_buf_ptr - log_start; + if (new_logs > kMaxCoreLog) new_logs = kMaxCoreLog; + buf = debug_buf_addr; + if (debug_buf_stackshot_start) + { + if ((ret = (*outproc)(KDP_DATA, NULL, (debug_buf_stackshot_start - debug_buf_addr), debug_buf_addr)) != kIOReturnSuccess) + { + DEBG("KDP_DATA(0x%x)\n", ret); + goto exit; + } + buf = debug_buf_stackshot_end; + } + if ((ret = (*outproc)(KDP_DATA, NULL, (log_start + new_logs - buf), buf)) != kIOReturnSuccess) + { + DEBG("KDP_DATA(0x%x)\n", ret); + goto exit; + } // write header @@ -728,8 +1122,8 @@ do_kern_dump(kern_dump_output_proc outproc, bool local) } hdr.signature = MACH_CORE_FILEHEADER_SIGNATURE; - hdr.log_length = new_logs; - hdr.gzip_length = outvars.zipped; + hdr.log_length = new_logs + log_size; + hdr.gzip_length = outvars.zipped; if ((ret = (*outproc)(KDP_DATA, NULL, sizeof(hdr), &hdr)) != kIOReturnSuccess) { @@ -740,28 +1134,32 @@ do_kern_dump(kern_dump_output_proc outproc, bool local) exit: /* close / last packet */ - if ((ret = (*outproc)(KDP_EOF, NULL, 0, ((void *) 0))) != kIOReturnSuccess) + if (opened && (ret = (*outproc)(KDP_EOF, NULL, 0, ((void *) 0))) != kIOReturnSuccess) { - DEBG("KDP_EOF(0x%x)\n", ret); - } + DEBG("KDP_EOF(0x%x)\n", ret); + } return (ret); } int -kern_dump(boolean_t local) 
+kern_dump(enum kern_dump_type kd_variant) { - static boolean_t dumped_local; - if (local) { - if (dumped_local) return (0); - dumped_local = TRUE; - return (do_kern_dump(&kern_dump_disk_proc, true)); - } + static boolean_t dumped_local; + if (kd_variant == KERN_DUMP_DISK) { + if (dumped_local) return (0); + dumped_local = TRUE; + return (do_kern_dump(&kern_dump_disk_proc, KERN_DUMP_DISK)); +#if WITH_CONSISTENT_DBG + } else if (kd_variant == KERN_DUMP_HW_SHMEM_DBG) { + return (do_kern_dump(&kern_dump_hw_shmem_dbg_buffer_proc, KERN_DUMP_HW_SHMEM_DBG)); +#endif + } #if CONFIG_KDP_INTERACTIVE_DEBUGGING - return (do_kern_dump(&kdp_send_crashdump_data, false)); + return (do_kern_dump(&kdp_send_crashdump_data, KERN_DUMP_NET)); #else - return (-1); + return (-1); #endif } @@ -787,27 +1185,85 @@ kdp_core_zfree(void * __unused ref, void * __unused ptr) {} void kdp_core_init(void) { - int wbits = 12; - int memlevel = 3; - kern_return_t kr; - - if (kdp_core_zs.zalloc) return; - kdp_core_zsize = round_page(NETBUF + zlib_deflate_memory_size(wbits, memlevel)); - printf("kdp_core zlib memory 0x%lx\n", kdp_core_zsize); - kr = kmem_alloc(kernel_map, &kdp_core_zmem, kdp_core_zsize, VM_KERN_MEMORY_DIAG); - assert (KERN_SUCCESS == kr); - - kdp_core_zoffset = 0; - kdp_core_zs.zalloc = kdp_core_zalloc; - kdp_core_zs.zfree = kdp_core_zfree; - - if (deflateInit2(&kdp_core_zs, LEVEL, Z_DEFLATED, - wbits + 16 /*gzip mode*/, memlevel, Z_DEFAULT_STRATEGY)) - { - /* Allocation failed */ - bzero(&kdp_core_zs, sizeof(kdp_core_zs)); + int wbits = 12; + int memlevel = 3; + kern_return_t kr; +#if WITH_CONSISTENT_DBG + int i = 0; + vm_offset_t kdp_core_hw_shmem_buf = 0; + struct kdp_hw_shmem_dbg_buf_elm *cur_elm = NULL; +#endif + + if (kdp_core_zs.zalloc) return; + kdp_core_zsize = round_page(NETBUF + zlib_deflate_memory_size(wbits, memlevel)); + printf("kdp_core zlib memory 0x%lx\n", kdp_core_zsize); + kr = kmem_alloc(kernel_map, &kdp_core_zmem, kdp_core_zsize, VM_KERN_MEMORY_DIAG); + assert 
(KERN_SUCCESS == kr); + kdp_core_zoffset = 0; - } + kdp_core_zs.zalloc = kdp_core_zalloc; + kdp_core_zs.zfree = kdp_core_zfree; + + if (deflateInit2(&kdp_core_zs, LEVEL, Z_DEFLATED, + wbits + 16 /*gzip mode*/, memlevel, Z_DEFAULT_STRATEGY)) { + /* Allocation failed */ + bzero(&kdp_core_zs, sizeof(kdp_core_zs)); + kdp_core_zoffset = 0; + } + +#if WITH_CONSISTENT_DBG + if (!PE_consistent_debug_enabled()) { + return; + } + + /* + * We need to allocate physically contiguous memory since astris isn't capable + * of doing address translations while the CPUs are running. + */ + kdp_hw_shmem_dbg_bufsize = KDP_CORE_HW_SHMEM_DBG_TOTAL_BUF_SIZE; + kr = kmem_alloc_contig(kernel_map, &kdp_core_hw_shmem_buf, kdp_hw_shmem_dbg_bufsize, VM_MAP_PAGE_MASK(kernel_map), + 0, 0, KMA_KOBJECT, VM_KERN_MEMORY_DIAG); + assert(KERN_SUCCESS == kr); + + /* + * Put the connection info structure at the beginning of this buffer and adjust + * the buffer size accordingly. + */ + hwsd_info = (struct xnu_hw_shmem_dbg_command_info *) kdp_core_hw_shmem_buf; + hwsd_info->xhsdci_status = XHSDCI_STATUS_NONE; + hwsd_info->xhsdci_seq_no = 0; + hwsd_info->xhsdci_buf_phys_addr = 0; + hwsd_info->xhsdci_buf_data_length = 0; + hwsd_info->xhsdci_coredump_total_size_uncomp = 0; + hwsd_info->xhsdci_coredump_total_size_sent_uncomp = 0; + hwsd_info->xhsdci_page_size = PAGE_SIZE; + + kdp_core_hw_shmem_buf += sizeof(*hwsd_info); + kdp_hw_shmem_dbg_bufsize -= sizeof(*hwsd_info); + kdp_hw_shmem_dbg_bufsize = (kdp_hw_shmem_dbg_bufsize / KDP_CORE_HW_SHMEM_DBG_NUM_BUFFERS); + kdp_hw_shmem_dbg_bufsize -= (kdp_hw_shmem_dbg_bufsize % OPTIMAL_ASTRIS_READSIZE); + + STAILQ_INIT(&free_hw_shmem_dbg_bufs); + STAILQ_INIT(&hw_shmem_dbg_bufs_to_flush); + + for (i = 0; i < KDP_CORE_HW_SHMEM_DBG_NUM_BUFFERS; i++) { + cur_elm = kalloc(sizeof(*cur_elm)); + assert(cur_elm != NULL); + + cur_elm->khsd_buf = kdp_core_hw_shmem_buf; + cur_elm->khsd_data_length = 0; + + kdp_core_hw_shmem_buf += kdp_hw_shmem_dbg_bufsize; + + 
STAILQ_INSERT_HEAD(&free_hw_shmem_dbg_bufs, cur_elm, khsd_elms); + } + + nanoseconds_to_absolutetime(KDP_HW_SHMEM_DBG_TIMEOUT_DEADLINE_SECS * NSEC_PER_SEC, + &kdp_hw_shmem_dbg_contact_deadline_interval); + + PE_consistent_debug_register(kDbgIdAstrisConnection, kvtophys((vm_offset_t) hwsd_info), sizeof(pmap_paddr_t)); + PE_consistent_debug_register(kDbgIdAstrisConnectionVers, CUR_XNU_HWSDCI_STRUCT_VERS, sizeof(uint32_t)); +#endif /* WITH_CONSISTENT_DBG */ } #endif /* CONFIG_KDP_INTERACTIVE_DEBUGGING */ diff --git a/osfmk/kdp/kdp_core.h b/osfmk/kdp/kdp_core.h index 6192d2ded..6db5cba8a 100644 --- a/osfmk/kdp/kdp_core.h +++ b/osfmk/kdp/kdp_core.h @@ -80,6 +80,43 @@ struct corehdr { #define CORE_REMOTE_PORT 1069 /* hardwired, we can't really query the services file */ +#if WITH_CONSISTENT_DBG +/* + * xnu shared memory hardware debugger support + * + * A hardware debugger can connect, read the consistent debug + * header to determine the physical location of the handshake + * structure and communicate using commands in the structure as + * defined below. + * + * Currently used for sending compressed coredumps to + * astris. 
+ */ +struct xnu_hw_shmem_dbg_command_info { + volatile uint32_t xhsdci_status; + uint32_t xhsdci_seq_no; + volatile uint64_t xhsdci_buf_phys_addr; + volatile uint32_t xhsdci_buf_data_length; + /* end of version 0 structure */ + uint64_t xhsdci_coredump_total_size_uncomp; + uint64_t xhsdci_coredump_total_size_sent_uncomp; + uint32_t xhsdci_page_size; +} __attribute__((packed)); + +#define CUR_XNU_HWSDCI_STRUCT_VERS 1 + +#define XHSDCI_STATUS_NONE 0 /* default status */ +#define XHSDCI_STATUS_KERNEL_BUSY 1 /* kernel is busy with other procedure */ +#define XHSDCI_STATUS_KERNEL_READY 2 /* kernel ready to begin command */ +#define XHSDCI_COREDUMP_BEGIN 3 /* indicates hardware debugger is ready to begin consuming coredump info */ +#define XHSDCI_COREDUMP_BUF_READY 4 /* indicates the kernel has populated the buffer */ +#define XHSDCI_COREDUMP_BUF_EMPTY 5 /* indicates hardware debugger is done consuming the current data */ +#define XHSDCI_COREDUMP_STATUS_DONE 6 /* indicates last compressed data is in buffer */ +#define XHSDCI_COREDUMP_ERROR 7 /* indicates an error was encountered */ +#define XHSDCI_COREDUMP_REMOTE_DONE 8 /* indicates that hardware debugger is done */ + +#endif /* WITH_CONSISTENT_DBG */ + void kdp_panic_dump (void); void abort_panic_transfer (void); void kdp_set_dump_info(const uint32_t flags, const char *file, const char *destip, @@ -87,7 +124,15 @@ void kdp_set_dump_info(const uint32_t flags, const char *file, const char *desti void kdp_get_dump_info(uint32_t *flags, char *file, char *destip, char *routerip, uint32_t *port); -extern int kern_dump(boolean_t local); +enum kern_dump_type { + KERN_DUMP_DISK, /* local, on device core dump */ + KERN_DUMP_NET, /* kdp network core dump */ +#if WITH_CONSISTENT_DBG + KERN_DUMP_HW_SHMEM_DBG, /* coordinated hardware shared memory debugger core dump */ +#endif +}; + +extern int kern_dump(enum kern_dump_type kd_variant); struct corehdr *create_panic_header(unsigned int request, const char *corename, unsigned length, 
unsigned block); @@ -105,6 +150,8 @@ boolean_t kdp_has_polled_corefile(void); void kdp_core_init(void); +extern boolean_t kdp_corezip_disabled; + #define KDP_CRASHDUMP_POLL_COUNT (2500) #endif /* __KDP_CORE_H */ diff --git a/osfmk/kdp/kdp_dyld.h b/osfmk/kdp/kdp_dyld.h index fc7e8f3a0..314d220b5 100644 --- a/osfmk/kdp/kdp_dyld.h +++ b/osfmk/kdp/kdp_dyld.h @@ -31,18 +31,6 @@ * for each binary image not loaded from the shared cache during stackshots. */ -/* From dyld/include/dyld_images.h */ - -struct user32_dyld_uuid_info { - user32_addr_t imageLoadAddress; /* base address image is mapped into */ - uuid_t imageUUID; /* UUID of image */ -}; - -struct user64_dyld_uuid_info { - user64_addr_t imageLoadAddress; /* base address image is mapped into */ - uuid_t imageUUID; /* UUID of image */ -}; - /* Re-use dyld format for kext load addresses */ #if __LP64__ typedef struct user64_dyld_uuid_info kernel_uuid_info; @@ -64,15 +52,15 @@ struct user64_dyld_image_info { // FIXME: dyld is in C++, and some of the fields in dyld_all_image_infos are C++ // native booleans. There must be a better way... 
-typedef uint8_t bool; +typedef uint8_t dyld_bool; struct user32_dyld_all_image_infos { uint32_t version; uint32_t infoArrayCount; user32_addr_t infoArray; user32_addr_t notification; - bool processDetachedFromSharedRegion; - bool libSystemInitialized; + dyld_bool processDetachedFromSharedRegion; + dyld_bool libSystemInitialized; user32_addr_t dyldImageLoadAddress; user32_addr_t jitInfo; user32_addr_t dyldVersion; @@ -80,9 +68,25 @@ struct user32_dyld_all_image_infos { user32_addr_t terminationFlags; user32_addr_t coreSymbolicationShmPage; user32_addr_t systemOrderFlag; - user32_size_t uuidArrayCount; // dyld defines this as a uintptr_t despite it being a count - user32_addr_t uuidArray; - user32_addr_t dyldAllImageInfosAddress; + user32_size_t uuidArrayCount; // dyld defines this as a uintptr_t despite it being a count + user32_addr_t uuidArray; + user32_addr_t dyldAllImageInfosAddress; + + /* the following field is only in version 10 (Mac OS X 10.7, iOS 4.2) and later */ + user32_addr_t initialImageCount; + /* the following field is only in version 11 (Mac OS X 10.7, iOS 4.2) and later */ + user32_addr_t errorKind; + user32_addr_t errorClientOfDylibPath; + user32_addr_t errorTargetDylibPath; + user32_addr_t errorSymbol; + /* the following field is only in version 12 (Mac OS X 10.7, iOS 4.3) and later */ + user32_addr_t sharedCacheSlide; + /* the following field is only in version 13 (Mac OS X 10.9, iOS 7.0) and later */ + uint8_t sharedCacheUUID[16]; + /* the following field is only in version 14 (Mac OS X 10.9, iOS 7.0) and later */ + user32_addr_t reserved[16]; + /* the following field is only in version 15 (Mac OS X 10.12, iOS 10.0) and later */ + uint64_t timestamp; }; struct user64_dyld_all_image_infos { @@ -90,8 +94,8 @@ struct user64_dyld_all_image_infos { uint32_t infoArrayCount; user64_addr_t infoArray; user64_addr_t notification; - bool processDetachedFromSharedRegion; - bool libSystemInitialized; + dyld_bool processDetachedFromSharedRegion; + dyld_bool 
libSystemInitialized; user64_addr_t dyldImageLoadAddress; user64_addr_t jitInfo; user64_addr_t dyldVersion; @@ -99,7 +103,23 @@ struct user64_dyld_all_image_infos { user64_addr_t terminationFlags; user64_addr_t coreSymbolicationShmPage; user64_addr_t systemOrderFlag; - user64_size_t uuidArrayCount; // dyld defines this as a uintptr_t despite it being a count - user64_addr_t uuidArray; - user64_addr_t dyldAllImageInfosAddress; + user64_size_t uuidArrayCount; // dyld defines this as a uintptr_t despite it being a count + user64_addr_t uuidArray; + user64_addr_t dyldAllImageInfosAddress; + + /* the following field is only in version 10 (Mac OS X 10.7, iOS 4.2) and later */ + user64_addr_t initialImageCount; + /* the following field is only in version 11 (Mac OS X 10.7, iOS 4.2) and later */ + user64_addr_t errorKind; + user64_addr_t errorClientOfDylibPath; + user64_addr_t errorTargetDylibPath; + user64_addr_t errorSymbol; + /* the following field is only in version 12 (Mac OS X 10.7, iOS 4.3) and later */ + user64_addr_t sharedCacheSlide; + /* the following field is only in version 13 (Mac OS X 10.9, iOS 7.0) and later */ + uint8_t sharedCacheUUID[16]; + /* the following field is only in version 14 (Mac OS X 10.9, iOS 7.0) and later */ + user64_addr_t reserved[16]; + /* the following field is only in version 15 (Mac OS X 10.12, iOS 10.0) and later */ + uint64_t timestamp; }; diff --git a/osfmk/kdp/kdp_internal.h b/osfmk/kdp/kdp_internal.h index f73d34582..8c0e39a80 100644 --- a/osfmk/kdp/kdp_internal.h +++ b/osfmk/kdp/kdp_internal.h @@ -33,6 +33,8 @@ #include #include #include +#include +#include #include typedef struct { @@ -54,6 +56,8 @@ extern kdp_glob_t kdp; extern volatile int kdp_flag; extern int noresume_on_disconnect; +extern char kdp_kernelversion_string[256]; + #define KDP_READY 0x1 #define KDP_ARP 0x2 #define KDP_BP_DIS 0x4 @@ -71,6 +75,15 @@ typedef boolean_t unsigned short * ); +struct debugger_callback { + kern_return_t (*callback) (void*); + void 
*callback_context; + boolean_t proceed_on_sync_failure; + kern_return_t error; +}; + +extern struct debugger_callback *debugger_callback; + extern boolean_t kdp_packet( diff --git a/osfmk/kdp/kdp_private.h b/osfmk/kdp/kdp_private.h index bcd2f3399..07e5123ff 100644 --- a/osfmk/kdp/kdp_private.h +++ b/osfmk/kdp/kdp_private.h @@ -29,7 +29,6 @@ /* * Private functions for kdp.c */ -extern char kdp_kernelversion_string[]; static boolean_t kdp_unknown( diff --git a/osfmk/kdp/kdp_protocol.h b/osfmk/kdp/kdp_protocol.h index eb86bae10..8cc612288 100644 --- a/osfmk/kdp/kdp_protocol.h +++ b/osfmk/kdp/kdp_protocol.h @@ -36,6 +36,7 @@ #ifdef MACH_KERNEL_PRIVATE #include +#include #include #endif diff --git a/osfmk/kdp/kdp_serial.c b/osfmk/kdp/kdp_serial.c index bc8f1369b..0bf85a9b3 100644 --- a/osfmk/kdp/kdp_serial.c +++ b/osfmk/kdp/kdp_serial.c @@ -26,6 +26,9 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ #include "kdp_serial.h" +#include +#include +#include #define SKDP_START_CHAR 0xFA #define SKDP_END_CHAR 0xFB @@ -34,26 +37,47 @@ static enum {DS_WAITSTART, DS_READING, DS_ESCAPED} dsState; static unsigned char dsBuffer[1518]; static int dsPos; +static uint32_t dsCRC; +static bool dsHaveCRC; + + +static void kdp_serial_out(unsigned char byte, void (*outFunc)(char)) +{ + //need to escape '\n' because the kernel serial output turns it into a cr/lf + if(byte == SKDP_START_CHAR || byte == SKDP_END_CHAR || byte == SKDP_ESC_CHAR || byte == '\n') + { + outFunc(SKDP_ESC_CHAR); + byte = ~byte; + } + outFunc(byte); +} void kdp_serialize_packet(unsigned char *packet, unsigned int len, void (*outFunc)(char)) { - unsigned int index; + unsigned int index; + unsigned char byte; + uint32_t crc; + + // insert the CRC between back to back STARTs which is compatible with old clients + crc = (uint32_t) z_crc32(0, packet, len); + outFunc(SKDP_START_CHAR); + kdp_serial_out((crc >> 0), outFunc); + kdp_serial_out((crc >> 8), outFunc); + kdp_serial_out((crc >> 16), outFunc); + kdp_serial_out((crc >> 
24), outFunc); + outFunc(SKDP_START_CHAR); for (index = 0; index < len; index++) { - unsigned char byte = *packet++; - //need to escape '\n' because the kernel serial output turns it into a cr/lf - if(byte == SKDP_START_CHAR || byte == SKDP_END_CHAR || byte == SKDP_ESC_CHAR || byte == '\n') - { - outFunc(SKDP_ESC_CHAR); - byte = ~byte; - } - outFunc(byte); + byte = *packet++; + kdp_serial_out(byte, outFunc); } outFunc(SKDP_END_CHAR); } unsigned char *kdp_unserialize_packet(unsigned char byte, unsigned int *len) { + uint32_t crc; + switch(dsState) { case DS_WAITSTART: @@ -63,6 +87,7 @@ unsigned char *kdp_unserialize_packet(unsigned char byte, unsigned int *len) dsState = DS_READING; dsPos = 0; *len = SERIALIZE_READING; + dsHaveCRC = false; return 0; } *len = SERIALIZE_WAIT_START; @@ -76,7 +101,12 @@ unsigned char *kdp_unserialize_packet(unsigned char byte, unsigned int *len) } if(byte == SKDP_START_CHAR) { -// printf("unexpected start char, resetting\n"); + if (dsPos >= 4) + { + dsHaveCRC = true; + dsCRC = dsBuffer[0] | (dsBuffer[1] << 8) | (dsBuffer[2] << 16) | (dsBuffer[3] << 24); + } + //else printf("unexpected start char, resetting\n"); dsPos = 0; *len = SERIALIZE_READING; return 0; @@ -84,6 +114,17 @@ unsigned char *kdp_unserialize_packet(unsigned char byte, unsigned int *len) if(byte == SKDP_END_CHAR) { dsState = DS_WAITSTART; + if (dsHaveCRC) + { + crc = (uint32_t) z_crc32(0, &dsBuffer[0], dsPos); + if (crc != dsCRC) + { +// printf("bad packet crc 0x%x != 0x%x\n", crc, dsCRC); + dsPos = 0; + *len = SERIALIZE_WAIT_START; + return 0; + } + } *len = dsPos; dsPos = 0; return dsBuffer; diff --git a/osfmk/kdp/kdp_udp.c b/osfmk/kdp/kdp_udp.c index c93876ac8..ec0c072de 100644 --- a/osfmk/kdp/kdp_udp.c +++ b/osfmk/kdp/kdp_udp.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2013 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -72,6 +72,8 @@ #include #include +#include + extern unsigned int not_in_kdp; extern int kdp_snapshot; extern void do_stackshot(void); @@ -217,36 +219,37 @@ struct kdp_ether_arp { #define KDP_MAXPACKET (ETHERHDRSIZE + ETHERMTU + ETHERCRC) static struct { - unsigned char data[KDP_MAXPACKET]; - unsigned int off, len; - boolean_t input; + unsigned char data[KDP_MAXPACKET]; + unsigned int off, len; + boolean_t input; } pkt, saved_reply; struct kdp_manual_pkt manual_pkt; struct { - struct { - struct kdp_in_addr in; - struct kdp_ether_addr ea; - } loc; - struct { - struct kdp_in_addr in; - struct kdp_ether_addr ea; - } rmt; + struct { + struct kdp_in_addr in; + struct kdp_ether_addr ea; + } loc; + struct { + struct kdp_in_addr in; + struct kdp_ether_addr ea; + } rmt; } adr; static const char *exception_message[] = { - "Unknown", - "Memory access", /* EXC_BAD_ACCESS */ - "Failed instruction", /* EXC_BAD_INSTRUCTION */ - "Arithmetic", /* EXC_ARITHMETIC */ - "Emulation", /* EXC_EMULATION */ - "Software", /* EXC_SOFTWARE */ - "Breakpoint" /* EXC_BREAKPOINT */ + "Unknown", + "Memory access", /* EXC_BAD_ACCESS */ + "Failed instruction", /* EXC_BAD_INSTRUCTION */ + "Arithmetic", /* EXC_ARITHMETIC */ + "Emulation", /* EXC_EMULATION */ + "Software", /* EXC_SOFTWARE */ + "Breakpoint" /* EXC_BREAKPOINT */ }; volatile int kdp_flag = 0; +boolean_t kdp_corezip_disabled = 0; kdp_send_t kdp_en_send_pkt; static kdp_receive_t kdp_en_recv_pkt; @@ -300,7 +303,7 @@ uint32_t kdp_crashdump_pkt_size = 512; #define KDP_LARGE_CRASHDUMP_PKT_SIZE (1440 - 6 - sizeof(struct kdp_udpiphdr)) static char panicd_ip_str[20]; static char router_ip_str[20]; -static char corename_str[50]; +static char corename_str[100]; static unsigned int panic_block = 0; volatile unsigned int kdp_trigger_core_dump = 0; @@ -330,6 +333,12 @@ uint32_t kdp_feature_large_crashdumps, kdp_feature_large_pkt_size; char kdp_kernelversion_string[256]; static boolean_t gKDPDebug = FALSE; 
+ +#if WITH_CONSISTENT_DBG +#include +extern volatile struct xnu_hw_shmem_dbg_command_info *hwsd_info; +#endif + #define KDP_DEBUG(...) if (gKDPDebug) printf(__VA_ARGS__); #define SBLOCKSZ (2048) @@ -353,49 +362,53 @@ kdp_timer_callout_init(void) { /* only send/receive data if the link is up */ -inline static void wait_for_link(void) +inline static void +wait_for_link(void) { - static int first = 0; + static int first = 0; - if (!kdp_en_linkstatus) - return; + if (!kdp_en_linkstatus) + return; - while (((*kdp_en_linkstatus)() & LINK_UP_STATUS) != LINK_UP_STATUS) { - if (first) - continue; + while (((*kdp_en_linkstatus)() & LINK_UP_STATUS) != LINK_UP_STATUS) { + if (first) + continue; - first = 1; - printf("Waiting for link to become available.\n"); - kprintf("Waiting for link to become available.\n"); - } + first = 1; + printf("Waiting for link to become available.\n"); + kprintf("Waiting for link to become available.\n"); + } } -inline static void kdp_send_data(void *packet, unsigned int len) +inline static void +kdp_send_data(void *packet, unsigned int len) { - wait_for_link(); - (*kdp_en_send_pkt)(packet, len); + wait_for_link(); + (*kdp_en_send_pkt)(packet, len); } -inline static void kdp_receive_data(void *packet, unsigned int *len, - unsigned int timeout) +inline static void +kdp_receive_data(void *packet, unsigned int *len, unsigned int timeout) { - wait_for_link(); - (*kdp_en_recv_pkt)(packet, len, timeout); + wait_for_link(); + (*kdp_en_recv_pkt)(packet, len, timeout); } -void kdp_register_link(kdp_link_t link, kdp_mode_t mode) +void +kdp_register_link(kdp_link_t link, kdp_mode_t mode) { - kdp_en_linkstatus = link; - kdp_en_setmode = mode; + kdp_en_linkstatus = link; + kdp_en_setmode = mode; } -void kdp_unregister_link(__unused kdp_link_t link, __unused kdp_mode_t mode) +void +kdp_unregister_link(__unused kdp_link_t link, __unused kdp_mode_t mode) { - kdp_en_linkstatus = NULL; - kdp_en_setmode = NULL; + kdp_en_linkstatus = NULL; + kdp_en_setmode = NULL; } 
void @@ -432,6 +445,8 @@ kdp_register_send_receive( if (debug & DB_PANICLOG_DUMP) kdp_flag |= PANIC_LOG_DUMP; + kdp_corezip_disabled = (0 != (debug & DB_DISABLE_GZIP_CORE)); + if (PE_parse_boot_argn("_panicd_ip", panicd_ip_str, sizeof (panicd_ip_str))) panicd_specified = TRUE; @@ -449,8 +464,7 @@ kdp_register_send_receive( kdp_flag |= KDP_READY; - if (current_debugger == NO_CUR_DB) - current_debugger = KDP_CUR_DB; + current_debugger = KDP_CUR_DB; if ((kdp_current_ip_address != 0) && halt_in_debugger) { kdp_call(); halt_in_debugger=0; @@ -587,67 +601,68 @@ kdp_send( unsigned short remote_port ) { - struct kdp_udpiphdr aligned_ui, *ui = &aligned_ui; - struct kdp_ip aligned_ip, *ip = &aligned_ip; - struct kdp_ether_header *eh; - - if (pkt.input) - kdp_panic("kdp_send"); + struct kdp_udpiphdr aligned_ui, *ui = &aligned_ui; + struct kdp_ip aligned_ip, *ip = &aligned_ip; + struct kdp_ether_header *eh; - pkt.off -= (unsigned int)sizeof (struct kdp_udpiphdr); + if (pkt.input) + kdp_panic("kdp_send"); + + pkt.off -= (unsigned int)sizeof (struct kdp_udpiphdr); #if DO_ALIGN - bcopy((char *)&pkt.data[pkt.off], (char *)ui, sizeof(*ui)); + bcopy((char *)&pkt.data[pkt.off], (char *)ui, sizeof(*ui)); #else - ui = (struct kdp_udpiphdr *)&pkt.data[pkt.off]; + ui = (struct kdp_udpiphdr *)&pkt.data[pkt.off]; #endif - ui->ui_next = ui->ui_prev = 0; - ui->ui_x1 = 0; - ui->ui_pr = IPPROTO_UDP; - ui->ui_len = htons((u_short)pkt.len + sizeof (struct kdp_udphdr)); - ui->ui_src = adr.loc.in; - ui->ui_dst = adr.rmt.in; - ui->ui_sport = htons(KDP_REMOTE_PORT); - ui->ui_dport = remote_port; - ui->ui_ulen = ui->ui_len; - ui->ui_sum = 0; + ui->ui_next = ui->ui_prev = 0; + ui->ui_x1 = 0; + ui->ui_pr = IPPROTO_UDP; + ui->ui_len = htons((u_short)pkt.len + sizeof (struct kdp_udphdr)); + ui->ui_src = adr.loc.in; + ui->ui_dst = adr.rmt.in; + ui->ui_sport = htons(KDP_REMOTE_PORT); + ui->ui_dport = remote_port; + ui->ui_ulen = ui->ui_len; + ui->ui_sum = 0; #if DO_ALIGN - bcopy((char *)ui, (char 
*)&pkt.data[pkt.off], sizeof(*ui)); - bcopy((char *)&pkt.data[pkt.off], (char *)ip, sizeof(*ip)); + bcopy((char *)ui, (char *)&pkt.data[pkt.off], sizeof(*ui)); + bcopy((char *)&pkt.data[pkt.off], (char *)ip, sizeof(*ip)); #else - ip = (struct kdp_ip *)&pkt.data[pkt.off]; + ip = (struct kdp_ip *)&pkt.data[pkt.off]; #endif - ip->ip_len = htons(sizeof (struct kdp_udpiphdr) + pkt.len); - ip->ip_v = IPVERSION; - ip->ip_id = htons(ip_id++); - ip->ip_hl = sizeof (struct kdp_ip) >> 2; - ip->ip_ttl = udp_ttl; - ip->ip_sum = 0; - ip->ip_sum = htons(~ip_sum((unsigned char *)ip, ip->ip_hl)); + ip->ip_len = htons(sizeof (struct kdp_udpiphdr) + pkt.len); + ip->ip_v = IPVERSION; + ip->ip_id = htons(ip_id++); + ip->ip_hl = sizeof (struct kdp_ip) >> 2; + ip->ip_ttl = udp_ttl; + ip->ip_sum = 0; + ip->ip_sum = htons(~ip_sum((unsigned char *)ip, ip->ip_hl)); #if DO_ALIGN - bcopy((char *)ip, (char *)&pkt.data[pkt.off], sizeof(*ip)); + bcopy((char *)ip, (char *)&pkt.data[pkt.off], sizeof(*ip)); #endif - - pkt.len += (unsigned int)sizeof (struct kdp_udpiphdr); - - pkt.off -= (unsigned int)sizeof (struct kdp_ether_header); - - eh = (struct kdp_ether_header *)&pkt.data[pkt.off]; - enaddr_copy(&adr.loc.ea, eh->ether_shost); - enaddr_copy(&adr.rmt.ea, eh->ether_dhost); - eh->ether_type = htons(ETHERTYPE_IP); - - pkt.len += (unsigned int)sizeof (struct kdp_ether_header); - kdp_send_data(&pkt.data[pkt.off], pkt.len); + + pkt.len += (unsigned int)sizeof (struct kdp_udpiphdr); + + pkt.off -= (unsigned int)sizeof (struct kdp_ether_header); + + eh = (struct kdp_ether_header *)&pkt.data[pkt.off]; + enaddr_copy(&adr.loc.ea, eh->ether_shost); + enaddr_copy(&adr.rmt.ea, eh->ether_dhost); + eh->ether_type = htons(ETHERTYPE_IP); + + pkt.len += (unsigned int)sizeof (struct kdp_ether_header); + kdp_send_data(&pkt.data[pkt.off], pkt.len); } -inline static void debugger_if_necessary(void) +inline static void +debugger_if_necessary(void) { - if ((current_debugger == KDP_CUR_DB) && halt_in_debugger) { - 
kdp_call(); - halt_in_debugger=0; - } + if ((current_debugger == KDP_CUR_DB) && halt_in_debugger) { + kdp_call(); + halt_in_debugger=0; + } } @@ -711,60 +726,60 @@ kdp_set_ip_and_mac_addresses( struct kdp_in_addr *ipaddr, struct kdp_ether_addr *macaddr) { - static uint64_t last_time = (uint64_t) -1; - static uint64_t throttle_val = 0; - uint64_t cur_time; - char addr[16]; - - if (kdp_current_ip_address == ipaddr->s_addr) - goto done; - - /* don't replace if serial debugging is configured */ - if (!KDP_SERIAL_ENABLED() || - (kdp_current_ip_address != KDP_SERIAL_IPADDR)) { - kdp_current_mac_address = *macaddr; - kdp_current_ip_address = ipaddr->s_addr; - } + static uint64_t last_time = (uint64_t) -1; + static uint64_t throttle_val = 0; + uint64_t cur_time; + char addr[16]; + + if (kdp_current_ip_address == ipaddr->s_addr) + goto done; + + /* don't replace if serial debugging is configured */ + if (!KDP_SERIAL_ENABLED() || + (kdp_current_ip_address != KDP_SERIAL_IPADDR)) { + kdp_current_mac_address = *macaddr; + kdp_current_ip_address = ipaddr->s_addr; + } - if (save_ip_in_nvram == FALSE) - goto done; + if (save_ip_in_nvram == FALSE) + goto done; - if (inet_ntoa_r(*ipaddr, addr, sizeof(addr)) == NULL) - goto done; + if (inet_ntoa_r(*ipaddr, addr, sizeof(addr)) == NULL) + goto done; - /* throttle writes if needed */ - if (!throttle_val) - nanoseconds_to_absolutetime(KDP_THROTTLE_VALUE, &throttle_val); + /* throttle writes if needed */ + if (!throttle_val) + nanoseconds_to_absolutetime(KDP_THROTTLE_VALUE, &throttle_val); - cur_time = mach_absolute_time(); - if (last_time == (uint64_t) -1 || - ((cur_time - last_time) > throttle_val)) { - PEWriteNVRAMProperty("_kdp_ipstr", addr, - (const unsigned int) strlen(addr)); - } - last_time = cur_time; + cur_time = mach_absolute_time(); + if (last_time == (uint64_t) -1 || + ((cur_time - last_time) > throttle_val)) { + PEWriteNVRAMProperty("_kdp_ipstr", addr, + (const unsigned int) strlen(addr)); + } + last_time = cur_time; done: - 
debugger_if_necessary(); + debugger_if_necessary(); } void kdp_set_gateway_mac(void *gatewaymac) { - router_mac = *(struct kdp_ether_addr *)gatewaymac; - flag_router_mac_initialized = TRUE; + router_mac = *(struct kdp_ether_addr *)gatewaymac; + flag_router_mac_initialized = TRUE; } struct kdp_ether_addr kdp_get_mac_addr(void) { - return kdp_current_mac_address; + return kdp_current_mac_address; } unsigned int kdp_get_ip_address(void) { - return (unsigned int)kdp_current_ip_address; + return (unsigned int)kdp_current_ip_address; } void @@ -889,14 +904,11 @@ kdp_poll(void) if (pkt.len == 0) return; - if (pkt.len >= sizeof(struct kdp_ether_header)) - { + if (pkt.len >= sizeof(struct kdp_ether_header)) { eh = (struct kdp_ether_header *)&pkt.data[pkt.off]; - if (kdp_flag & KDP_ARP) - { - if (ntohs(eh->ether_type) == ETHERTYPE_ARP) - { + if (kdp_flag & KDP_ARP) { + if (ntohs(eh->ether_type) == ETHERTYPE_ARP) { kdp_arp_dispatch(); return; } @@ -941,8 +953,7 @@ kdp_poll(void) * enter the debugger if not told otherwise. */ else - if (flag_panic_dump_in_progress) - { + if (flag_panic_dump_in_progress) { if (!flag_dont_abort_panic_dump) { abort_panic_transfer(); } @@ -964,6 +975,7 @@ kdp_poll(void) pkt.input = TRUE; } + /* Create and transmit an ARP resolution request for the target IP address. * This is modeled on ether_inet_arp()/RFC 826. 
*/ @@ -1034,106 +1046,105 @@ kdp_arp_resolve(uint32_t arp_target_ip, struct kdp_ether_addr *resolved_MAC) return TRUE; } - if (!flag_panic_dump_in_progress || pkt.input) /* we received a debugging packet, bail*/ - { + if (!flag_panic_dump_in_progress || pkt.input) { /* we received a debugging packet, bail*/ printf("Received a debugger packet,transferring control to debugger\n"); /* Indicate that we should wait in the debugger when we return */ kdp_flag |= DBG_POST_CORE; pkt.input = FALSE; return FALSE; - } - else /* We timed out */ + } else { /* We timed out */ if (0 == poll_count) { poll_count = 256; goto TRANSMIT_RETRY; } + } return FALSE; } static void kdp_handler( - void *saved_state + void *saved_state ) { - unsigned short reply_port; - kdp_hdr_t aligned_hdr, *hdr = &aligned_hdr; + unsigned short reply_port; + kdp_hdr_t aligned_hdr, *hdr = &aligned_hdr; - kdp.saved_state = saved_state; // see comment in kdp_raise_exception + kdp.saved_state = saved_state; // see comment in kdp_raise_exception + + do { + while (!pkt.input) + kdp_poll(); - do { - while (!pkt.input) - kdp_poll(); - #if DO_ALIGN - bcopy((char *)&pkt.data[pkt.off], (char *)hdr, sizeof(*hdr)); + bcopy((char *)&pkt.data[pkt.off], (char *)hdr, sizeof(*hdr)); #else - hdr = (kdp_hdr_t *)&pkt.data[pkt.off]; + hdr = (kdp_hdr_t *)&pkt.data[pkt.off]; #endif - // ignore replies -- we're not expecting them anyway. - if (hdr->is_reply) { - goto again; - } - - if (hdr->request == KDP_REATTACH) - exception_seq = hdr->seq; - - // check for retransmitted request - if (hdr->seq == (exception_seq - 1)) { - /* retransmit last reply */ - kdp_send_data(&saved_reply.data[saved_reply.off], - saved_reply.len); - goto again; - } else if ((hdr->seq != exception_seq) && - (hdr->request != KDP_CONNECT)) { - printf("kdp: bad sequence %d (want %d)\n", - hdr->seq, exception_seq); - goto again; - } - - /* This is a manual side-channel to the main KDP protocol. 
- * A client like GDB/kgmacros can manually construct - * a request, set the input flag, issue a dummy KDP request, - * and then manually collect the result - */ - if (manual_pkt.input) { - kdp_hdr_t *manual_hdr = (kdp_hdr_t *)&manual_pkt.data; - unsigned short manual_port_unused = 0; - if (!manual_hdr->is_reply) { - /* process */ - kdp_packet((unsigned char *)&manual_pkt.data, - (int *)&manual_pkt.len, - &manual_port_unused); - } - manual_pkt.input = 0; - } - - if (kdp_packet((unsigned char*)&pkt.data[pkt.off], - (int *)&pkt.len, - (unsigned short *)&reply_port)) { - boolean_t sideband = FALSE; - - /* if it's an already connected error message, - * send a sideband reply for that. for successful connects, - * make sure the sequence number is correct. */ - if (hdr->request == KDP_CONNECT) { - kdp_connect_reply_t *rp = - (kdp_connect_reply_t *) &pkt.data[pkt.off]; - kdp_error_t err = rp->error; - - if (err == KDPERR_NO_ERROR) { - exception_seq = hdr->seq; - } else if (err == KDPERR_ALREADY_CONNECTED) { - sideband = TRUE; - } - } + // ignore replies -- we're not expecting them anyway. + if (hdr->is_reply) { + goto again; + } - kdp_reply(reply_port, sideband); - } + if (hdr->request == KDP_REATTACH) + exception_seq = hdr->seq; + + // check for retransmitted request + if (hdr->seq == (exception_seq - 1)) { + /* retransmit last reply */ + kdp_send_data(&saved_reply.data[saved_reply.off], + saved_reply.len); + goto again; + } else if ((hdr->seq != exception_seq) && + (hdr->request != KDP_CONNECT)) { + printf("kdp: bad sequence %d (want %d)\n", + hdr->seq, exception_seq); + goto again; + } + + /* This is a manual side-channel to the main KDP protocol. 
+ * A client like GDB/kgmacros can manually construct + * a request, set the input flag, issue a dummy KDP request, + * and then manually collect the result + */ + if (manual_pkt.input) { + kdp_hdr_t *manual_hdr = (kdp_hdr_t *)&manual_pkt.data; + unsigned short manual_port_unused = 0; + if (!manual_hdr->is_reply) { + /* process */ + kdp_packet((unsigned char *)&manual_pkt.data, + (int *)&manual_pkt.len, + &manual_port_unused); + } + manual_pkt.input = 0; + } + + if (kdp_packet((unsigned char*)&pkt.data[pkt.off], + (int *)&pkt.len, + (unsigned short *)&reply_port)) { + boolean_t sideband = FALSE; + + /* if it's an already connected error message, + * send a sideband reply for that. for successful connects, + * make sure the sequence number is correct. */ + if (hdr->request == KDP_CONNECT) { + kdp_connect_reply_t *rp = + (kdp_connect_reply_t *) &pkt.data[pkt.off]; + kdp_error_t err = rp->error; + + if (err == KDPERR_NO_ERROR) { + exception_seq = hdr->seq; + } else if (err == KDPERR_ALREADY_CONNECTED) { + sideband = TRUE; + } + } + + kdp_reply(reply_port, sideband); + } again: - pkt.input = FALSE; - } while (kdp.is_halted); + pkt.input = FALSE; + } while (kdp.is_halted); } static void @@ -1149,53 +1160,53 @@ kdp_connection_wait(void) * the panic.log */ - if (KDP_SERIAL_ENABLED()) { - printf("Using serial KDP.\n"); - kprintf("Using serial KDP.\n"); - } else { - printf( "ethernet MAC address: %02x:%02x:%02x:%02x:%02x:%02x\n", - kdp_mac_addr.ether_addr_octet[0] & 0xff, - kdp_mac_addr.ether_addr_octet[1] & 0xff, - kdp_mac_addr.ether_addr_octet[2] & 0xff, - kdp_mac_addr.ether_addr_octet[3] & 0xff, - kdp_mac_addr.ether_addr_octet[4] & 0xff, - kdp_mac_addr.ether_addr_octet[5] & 0xff); - - kprintf( "ethernet MAC address: %02x:%02x:%02x:%02x:%02x:%02x\n", - kdp_mac_addr.ether_addr_octet[0] & 0xff, - kdp_mac_addr.ether_addr_octet[1] & 0xff, - kdp_mac_addr.ether_addr_octet[2] & 0xff, - kdp_mac_addr.ether_addr_octet[3] & 0xff, - kdp_mac_addr.ether_addr_octet[4] & 0xff, - 
kdp_mac_addr.ether_addr_octet[5] & 0xff); - - printf( "ip address: %d.%d.%d.%d\n", - (ip_addr & 0xff000000) >> 24, - (ip_addr & 0xff0000) >> 16, - (ip_addr & 0xff00) >> 8, - (ip_addr & 0xff)); - - kprintf( "ip address: %d.%d.%d.%d\n", - (ip_addr & 0xff000000) >> 24, - (ip_addr & 0xff0000) >> 16, - (ip_addr & 0xff00) >> 8, - (ip_addr & 0xff)); - } + if (KDP_SERIAL_ENABLED()) { + printf("Using serial KDP.\n"); + kprintf("Using serial KDP.\n"); + } else { + printf("ethernet MAC address: %02x:%02x:%02x:%02x:%02x:%02x\n", + kdp_mac_addr.ether_addr_octet[0] & 0xff, + kdp_mac_addr.ether_addr_octet[1] & 0xff, + kdp_mac_addr.ether_addr_octet[2] & 0xff, + kdp_mac_addr.ether_addr_octet[3] & 0xff, + kdp_mac_addr.ether_addr_octet[4] & 0xff, + kdp_mac_addr.ether_addr_octet[5] & 0xff); + + kprintf("ethernet MAC address: %02x:%02x:%02x:%02x:%02x:%02x\n", + kdp_mac_addr.ether_addr_octet[0] & 0xff, + kdp_mac_addr.ether_addr_octet[1] & 0xff, + kdp_mac_addr.ether_addr_octet[2] & 0xff, + kdp_mac_addr.ether_addr_octet[3] & 0xff, + kdp_mac_addr.ether_addr_octet[4] & 0xff, + kdp_mac_addr.ether_addr_octet[5] & 0xff); + + printf("ip address: %d.%d.%d.%d\n", + (ip_addr & 0xff000000) >> 24, + (ip_addr & 0xff0000) >> 16, + (ip_addr & 0xff00) >> 8, + (ip_addr & 0xff)); + + kprintf("ip address: %d.%d.%d.%d\n", + (ip_addr & 0xff000000) >> 24, + (ip_addr & 0xff0000) >> 16, + (ip_addr & 0xff00) >> 8, + (ip_addr & 0xff)); + } printf("\nWaiting for remote debugger connection.\n"); kprintf("\nWaiting for remote debugger connection.\n"); if (reattach_wait == 0) { - if((kdp_flag & KDP_GETC_ENA) && (0 != kdp_getc())) - { + if((kdp_flag & KDP_GETC_ENA) && (0 != kdp_getc())) { printf("Options..... Type\n"); printf("------------ ----\n"); printf("continue.... 'c'\n"); printf("reboot...... 
'r'\n"); } - } else + } else { reattach_wait = 0; + } exception_seq = 0; @@ -1230,8 +1241,8 @@ kdp_connection_wait(void) } if (((hdr->request == KDP_CONNECT) || (hdr->request == KDP_REATTACH)) && !hdr->is_reply && (hdr->seq == exception_seq)) { - if (kdp_packet((unsigned char *)&pkt.data[pkt.off], - (int *)&pkt.len, + if (kdp_packet((unsigned char *)&pkt.data[pkt.off], + (int *)&pkt.len, (unsigned short *)&reply_port)) kdp_reply(reply_port, FALSE); if (hdr->request == KDP_REATTACH) { @@ -1252,168 +1263,171 @@ kdp_connection_wait(void) static void kdp_send_exception( - unsigned int exception, - unsigned int code, - unsigned int subcode + unsigned int exception, + unsigned int code, + unsigned int subcode ) { - unsigned short remote_port; - unsigned int timeout_count = 100; - unsigned int poll_timeout; - - do { - pkt.off = sizeof (struct kdp_ether_header) + sizeof (struct kdp_udpiphdr); - kdp_exception((unsigned char *)&pkt.data[pkt.off], - (int *)&pkt.len, - (unsigned short *)&remote_port, - (unsigned int)exception, - (unsigned int)code, - (unsigned int)subcode); - - kdp_send(remote_port); - - poll_timeout = 50; - while(!pkt.input && poll_timeout) - { - kdp_poll(); - poll_timeout--; - } + unsigned short remote_port; + unsigned int timeout_count = 100; + unsigned int poll_timeout; + + do { + pkt.off = sizeof (struct kdp_ether_header) + sizeof (struct kdp_udpiphdr); + kdp_exception((unsigned char *)&pkt.data[pkt.off], + (int *)&pkt.len, + (unsigned short *)&remote_port, + (unsigned int)exception, + (unsigned int)code, + (unsigned int)subcode); + + kdp_send(remote_port); + + poll_timeout = 50; + while (!pkt.input && poll_timeout) { + kdp_poll(); + poll_timeout--; + } + + if (pkt.input) { + if (!kdp_exception_ack(&pkt.data[pkt.off], pkt.len)) { + pkt.input = FALSE; + } + } - if (pkt.input) { - if (!kdp_exception_ack(&pkt.data[pkt.off], pkt.len)) { pkt.input = FALSE; - } - } - pkt.input = FALSE; + if (kdp.exception_ack_needed) + kdp_us_spin(250000); - if 
(kdp.exception_ack_needed) - kdp_us_spin(250000); + } while (kdp.exception_ack_needed && timeout_count--); - } while (kdp.exception_ack_needed && timeout_count--); - - if (kdp.exception_ack_needed) { - // give up & disconnect - printf("kdp: exception ack timeout\n"); - if (current_debugger == KDP_CUR_DB) - active_debugger=0; - kdp_reset(); - } + if (kdp.exception_ack_needed) { + // give up & disconnect + printf("kdp: exception ack timeout\n"); + if (current_debugger == KDP_CUR_DB) + active_debugger=0; + kdp_reset(); + } } static void kdp_debugger_loop( - unsigned int exception, - unsigned int code, - unsigned int subcode, - void *saved_state) + unsigned int exception, + unsigned int code, + unsigned int subcode, + void *saved_state) { - int index; - - if (saved_state == 0) - printf("kdp_raise_exception with NULL state\n"); - - index = exception; - if (exception != EXC_BREAKPOINT) { - if (exception > EXC_BREAKPOINT || exception < EXC_BAD_ACCESS) { - index = 0; - } - printf("%s exception (%x,%x,%x)\n", - exception_message[index], - exception, code, subcode); - } - - kdp_sync_cache(); - - /* XXX WMG it seems that sometimes it doesn't work to let kdp_handler - * do this. I think the client and the host can get out of sync. 
- */ - kdp.saved_state = saved_state; - kdp.kdp_cpu = cpu_number(); - kdp.kdp_thread = current_thread(); - - if (kdp_en_setmode) - (*kdp_en_setmode)(TRUE); /* enabling link mode */ - - if (pkt.input) - kdp_panic("kdp_raise_exception"); - - if (((kdp_flag & KDP_PANIC_DUMP_ENABLED) || (kdp_flag & PANIC_LOG_DUMP) || kdp_has_polled_corefile()) - && (panicstr != (char *) 0)) { - kdp_panic_dump(); - if (kdp_flag & REBOOT_POST_CORE) - kdp_machine_reboot(); - } - else - if ((kdp_flag & PANIC_CORE_ON_NMI) && (panicstr == (char *) 0) && - !kdp.is_conn) { - - disable_debug_output = disableConsoleOutput = FALSE; - kdp_panic_dump(); - - if (!(kdp_flag & DBG_POST_CORE)) - goto exit_debugger_loop; - } - - again: - if (!kdp.is_conn) - kdp_connection_wait(); - else { - kdp_send_exception(exception, code, subcode); - if (kdp.exception_ack_needed) { - kdp.exception_ack_needed = FALSE; - kdp_remove_all_breakpoints(); - printf("Remote debugger disconnected.\n"); - } - } - - if (kdp.is_conn) { - kdp.is_halted = TRUE; /* XXX */ - kdp_handler(saved_state); - if (!kdp.is_conn) - { - kdp_remove_all_breakpoints(); - printf("Remote debugger disconnected.\n"); - } - } - /* Allow triggering a panic core dump when connected to the machine - * Continuing after setting kdp_trigger_core_dump should do the - * trick. - */ - - if (1 == kdp_trigger_core_dump) { - kdp_flag |= KDP_PANIC_DUMP_ENABLED; - kdp_panic_dump(); - if (kdp_flag & REBOOT_POST_CORE) - kdp_machine_reboot(); - kdp_trigger_core_dump = 0; - } + int index; -/* Trigger a reboot if the user has set this flag through the - * debugger.Ideally, this would be done through the HOSTREBOOT packet - * in the protocol,but that will need gdb support,and when it's - * available, it should work automatically. 
- */ - if (1 == flag_kdp_trigger_reboot) { - kdp_machine_reboot(); - /* If we're still around, reset the flag */ - flag_kdp_trigger_reboot = 0; - } + if (saved_state == 0) + printf("kdp_raise_exception with NULL state\n"); - if (kdp_reentry_deadline) { - kdp_schedule_debugger_reentry(kdp_reentry_deadline); - printf("Debugger re-entry scheduled in %d milliseconds\n", kdp_reentry_deadline); - kdp_reentry_deadline = 0; - } + index = exception; + if (exception != EXC_BREAKPOINT) { + if (exception > EXC_BREAKPOINT || exception < EXC_BAD_ACCESS) { + index = 0; + } + printf("%s exception (%x,%x,%x)\n", + exception_message[index], + exception, code, subcode); + } + + kdp_sync_cache(); + + /* XXX WMG it seems that sometimes it doesn't work to let kdp_handler + * do this. I think the client and the host can get out of sync. + */ + kdp.saved_state = saved_state; + kdp.kdp_cpu = cpu_number(); + kdp.kdp_thread = current_thread(); - kdp_sync_cache(); + if (kdp_en_setmode) + (*kdp_en_setmode)(TRUE); /* enabling link mode */ - if (reattach_wait == 1) - goto again; + if (pkt.input) + kdp_panic("kdp_raise_exception"); + + if (((kdp_flag & KDP_PANIC_DUMP_ENABLED) + || (kdp_flag & PANIC_LOG_DUMP) + || kdp_has_polled_corefile()) + && (panicstr != (char *) 0)) { + kdp_panic_dump(); + if (kdp_flag & REBOOT_POST_CORE) + kdp_machine_reboot(); + } else { + if ((kdp_flag & PANIC_CORE_ON_NMI) && (panicstr == (char *) 0) + && !kdp.is_conn) { + + disable_debug_output = disableConsoleOutput = FALSE; + kdp_panic_dump(); + if (kdp_flag & REBOOT_POST_CORE) + kdp_machine_reboot(); + + if (!(kdp_flag & DBG_POST_CORE)) + goto exit_debugger_loop; + } + } + +again: + if (!kdp.is_conn) { + kdp_connection_wait(); + } else { + kdp_send_exception(exception, code, subcode); + if (kdp.exception_ack_needed) { + kdp.exception_ack_needed = FALSE; + kdp_remove_all_breakpoints(); + printf("Remote debugger disconnected.\n"); + } + } + + if (kdp.is_conn) { + kdp.is_halted = TRUE; /* XXX */ + 
kdp_handler(saved_state); + if (!kdp.is_conn) + { + kdp_remove_all_breakpoints(); + printf("Remote debugger disconnected.\n"); + } + } + /* Allow triggering a panic core dump when connected to the machine + * Continuing after setting kdp_trigger_core_dump should do the + * trick. + */ + + if (1 == kdp_trigger_core_dump) { + kdp_flag |= KDP_PANIC_DUMP_ENABLED; + kdp_panic_dump(); + if (kdp_flag & REBOOT_POST_CORE) + kdp_machine_reboot(); + kdp_trigger_core_dump = 0; + } + + /* Trigger a reboot if the user has set this flag through the + * debugger.Ideally, this would be done through the HOSTREBOOT packet + * in the protocol,but that will need gdb support,and when it's + * available, it should work automatically. + */ + if (1 == flag_kdp_trigger_reboot) { + kdp_machine_reboot(); + /* If we're still around, reset the flag */ + flag_kdp_trigger_reboot = 0; + } + + if (kdp_reentry_deadline) { + kdp_schedule_debugger_reentry(kdp_reentry_deadline); + printf("Debugger re-entry scheduled in %d milliseconds\n", kdp_reentry_deadline); + kdp_reentry_deadline = 0; + } + + kdp_sync_cache(); + + if (reattach_wait == 1) + goto again; exit_debugger_loop: - if (kdp_en_setmode) - (*kdp_en_setmode)(FALSE); /* link cleanup */ + if (kdp_en_setmode) + (*kdp_en_setmode)(FALSE); /* link cleanup */ } void @@ -1429,7 +1443,7 @@ kdp_reset(void) struct corehdr * create_panic_header(unsigned int request, const char *corename, - unsigned length, unsigned int block) + unsigned length, unsigned int block) { struct kdp_udpiphdr aligned_ui, *ui = &aligned_ui; struct kdp_ip aligned_ip, *ip = &aligned_ip; @@ -1484,8 +1498,7 @@ create_panic_header(unsigned int request, const char *corename, coreh = (struct corehdr *) &pkt.data[pkt.off]; coreh->th_opcode = htons((u_short)request); - if (request == KDP_WRQ) - { + if (request == KDP_WRQ) { char *cp; cp = coreh->th_u.tu_rpl; @@ -1500,9 +1513,7 @@ create_panic_header(unsigned int request, const char *corename, PE_parse_boot_argn("kdp_crashdump_pkt_size", 
&kdp_crashdump_pkt_size, sizeof(kdp_crashdump_pkt_size)); cp += sizeof(kdp_crashdump_feature_mask); *(uint32_t *)cp = htonl(kdp_crashdump_pkt_size); - } - else - { + } else { coreh->th_block = htonl((unsigned int) block); } @@ -1518,7 +1529,8 @@ create_panic_header(unsigned int request, const char *corename, return coreh; } -static int kdp_send_crashdump_seek(char *corename, uint64_t seek_off) +static int +kdp_send_crashdump_seek(char *corename, uint64_t seek_off) { int panic_error; @@ -1541,7 +1553,8 @@ static int kdp_send_crashdump_seek(char *corename, uint64_t seek_off) return KERN_SUCCESS; } -int kdp_send_crashdump_data(unsigned int request, char *corename, +int +kdp_send_crashdump_data(unsigned int request, char *corename, uint64_t length, void * txstart) { int panic_error = 0; @@ -1566,7 +1579,7 @@ uint32_t kdp_crashdump_short_pkt; int kdp_send_crashdump_pkt(unsigned int request, char *corename, - uint64_t length, void *panic_data) + uint64_t length, void *panic_data) { int poll_count; struct corehdr *th = NULL; @@ -1639,7 +1652,6 @@ kdp_send_crashdump_pkt(unsigned int request, char *corename, } if (pkt.input) { - pkt.input = FALSE; th = (struct corehdr *) &pkt.data[pkt.off]; @@ -1660,36 +1672,31 @@ kdp_send_crashdump_pkt(unsigned int request, char *corename, } } if (ntohs(th->th_opcode) == KDP_ACK && ntohl(th->th_block) == panic_block) { - } - else + } else { if (ntohs(th->th_opcode) == KDP_ERROR) { printf("Panic server returned error %d, retrying\n", ntohl(th->th_code)); poll_count = 1000; goto TRANSMIT_RETRY; + } else if (ntohl(th->th_block) == (panic_block - 1)) { + printf("RX retry "); + if (++rretries > 1) + goto TRANSMIT_RETRY; + else + goto RECEIVE_RETRY; } - else - if (ntohl(th->th_block) == (panic_block - 1)) { - printf("RX retry "); - if (++rretries > 1) - goto TRANSMIT_RETRY; - else - goto RECEIVE_RETRY; - } - } - else - if (!flag_panic_dump_in_progress) /* we received a debugging packet, bail*/ - { + } + } else if (!flag_panic_dump_in_progress) { 
/* we received a debugging packet, bail*/ printf("Received a debugger packet,transferring control to debugger\n"); /* Configure that if not set ..*/ kdp_flag |= DBG_POST_CORE; return (-2); - } - else /* We timed out */ + } else { /* We timed out */ if (0 == poll_count) { poll_count = 1000; kdp_us_spin ((tretries%4) * panic_timeout); /* capped linear backoff */ goto TRANSMIT_RETRY; } + } if (!(++panic_block % SBLOCKSZ)) { uint64_t ctime; @@ -1715,7 +1722,7 @@ kdp_send_crashdump_pkt(unsigned int request, char *corename, static int isdigit (char c) { - return ((c > 47) && (c < 58)); + return ((c > 47) && (c < 58)); } /* Horrid hack to extract xnu version if possible - a much cleaner approach @@ -1891,7 +1898,7 @@ kdp_panic_dump(void) /* try a local disk dump */ if (kdp_has_polled_corefile()) { flag_panic_dump_in_progress = TRUE; - kern_dump(TRUE); + kern_dump(KERN_DUMP_DISK); abort_panic_transfer(); } @@ -1918,48 +1925,49 @@ kdp_panic_dump(void) kdp_get_xnu_version((char *) &pkt.data[0]); - if (!corename_specified) { - coresuffix[0] = 0; - /* Panic log bit takes precedence over core dump bit */ - if ((panicstr != (char *) 0) && (kdp_flag & PANIC_LOG_DUMP)) - strlcpy(coreprefix, "paniclog", sizeof(coreprefix)); - else if (kdp_flag & SYSTEM_LOG_DUMP) - strlcpy(coreprefix, "systemlog", sizeof(coreprefix)); - else { - strlcpy(coreprefix, "core", sizeof(coreprefix)); - strlcpy(coresuffix, ".gz", sizeof(coresuffix)); - } - - abstime = mach_absolute_time(); - pkt.data[20] = '\0'; - snprintf (corename_str, sizeof(corename_str), "%s-%s-%d.%d.%d.%d-%x%s", - coreprefix, &pkt.data[0], - (current_ip & 0xff000000) >> 24, - (current_ip & 0xff0000) >> 16, - (current_ip & 0xff00) >> 8, - (current_ip & 0xff), - (unsigned int) (abstime & 0xffffffff), - coresuffix); - } + if (!corename_specified) { + coresuffix[0] = 0; + /* Panic log bit takes precedence over core dump bit */ + if ((panicstr != (char *) 0) && (kdp_flag & PANIC_LOG_DUMP)) + strlcpy(coreprefix, "paniclog", 
sizeof(coreprefix)); + else if (kdp_flag & SYSTEM_LOG_DUMP) + strlcpy(coreprefix, "systemlog", sizeof(coreprefix)); + else { + strlcpy(coreprefix, "core", sizeof(coreprefix)); + if (!kdp_corezip_disabled) strlcpy(coresuffix, ".gz", sizeof(coresuffix)); + } + + abstime = mach_absolute_time(); + pkt.data[20] = '\0'; + snprintf (corename_str, + sizeof(corename_str), + "%s-%s-%d.%d.%d.%d-%x%s", + coreprefix, &pkt.data[0], + (current_ip & 0xff000000) >> 24, + (current_ip & 0xff0000) >> 16, + (current_ip & 0xff00) >> 8, + (current_ip & 0xff), + (unsigned int) (abstime & 0xffffffff), + coresuffix); + } if (0 == inet_aton(panicd_ip_str, (struct kdp_in_addr *) &panic_server_ip)) { kdb_printf("inet_aton() failed interpreting %s as a panic server IP\n", panicd_ip_str); - } - else + } else { kdb_printf("Attempting connection to panic server configured at IP %s, port %d\n", panicd_ip_str, panicd_port); + } destination_mac = router_mac; if (kdp_arp_resolve(panic_server_ip, &temp_mac)) { kdb_printf("Resolved %s's (or proxy's) link level address\n", panicd_ip_str); destination_mac = temp_mac; - } - else { + } else { if (!flag_panic_dump_in_progress) goto panic_dump_exit; if (router_specified) { - if (0 == inet_aton(router_ip_str, (struct kdp_in_addr *) &parsed_router_ip)) + if (0 == inet_aton(router_ip_str, (struct kdp_in_addr *) &parsed_router_ip)) { kdb_printf("inet_aton() failed interpreting %s as an IP\n", router_ip_str); - else { + } else { router_ip = parsed_router_ip; if (kdp_arp_resolve(router_ip, &temp_mac)) { destination_mac = temp_mac; @@ -2024,7 +2032,7 @@ kdp_panic_dump(void) } /* We want a core dump if we're here */ - kern_dump(FALSE); + kern_dump(KERN_DUMP_NET); panic_dump_exit: abort_panic_transfer(); @@ -2048,7 +2056,7 @@ static void kdp_serial_send(void *rpkt, unsigned int rpkt_len) { // printf("tx\n"); - kdp_serialize_packet((unsigned char *)rpkt, rpkt_len, pal_serial_putc); + kdp_serialize_packet((unsigned char *)rpkt, rpkt_len, pal_serial_putc_nocr); } static 
void @@ -2060,15 +2068,12 @@ kdp_serial_receive(void *rpkt, unsigned int *rpkt_len, unsigned int timeout) clock_interval_to_deadline(timeout, 1000 * 1000 /* milliseconds */, &deadline); // printf("rx\n"); - for(clock_get_uptime(&now); now < deadline; clock_get_uptime(&now)) - { + for(clock_get_uptime(&now); now < deadline; clock_get_uptime(&now)) { readkar = pal_serial_getc(); - if(readkar >= 0) - { + if(readkar >= 0) { unsigned char *packet; // printf("got char %02x\n", readkar); - if((packet = kdp_unserialize_packet(readkar,rpkt_len))) - { + if((packet = kdp_unserialize_packet(readkar,rpkt_len))) { memcpy(rpkt, packet, *rpkt_len); return; } @@ -2080,39 +2085,41 @@ kdp_serial_receive(void *rpkt, unsigned int *rpkt_len, unsigned int timeout) static boolean_t kdp_serial_setmode(boolean_t active) { - if (active == FALSE) /* leaving KDP */ - return TRUE; + if (active == FALSE) /* leaving KDP */ + return TRUE; if (!needs_serial_init) - return TRUE; + return TRUE; - pal_serial_init(); - needs_serial_init = FALSE; - return TRUE; + pal_serial_init(); + needs_serial_init = FALSE; + return TRUE; } static void kdp_serial_callout(__unused void *arg, kdp_event_t event) { - /* When we stop KDP, set the bit to re-initialize the console serial port - * the next time we send/receive a KDP packet. We don't do it on - * KDP_EVENT_ENTER directly because it also gets called when we trap to KDP - * for non-external debugging, i.e., stackshot or core dumps. - * - * Set needs_serial_init on exit (and initialization, see above) and not - * enter because enter is sent multiple times and causes excess reinitialization. - */ - - switch (event) - { + /* + * When we stop KDP, set the bit to re-initialize the console serial + * port the next time we send/receive a KDP packet. We don't do it on + * KDP_EVENT_ENTER directly because it also gets called when we trap to + * KDP for non-external debugging, i.e., stackshot or core dumps. 
+ * + * Set needs_serial_init on exit (and initialization, see above) and not + * enter because enter is sent multiple times and causes excess + * reinitialization. + */ + + switch (event) + { case KDP_EVENT_PANICLOG: case KDP_EVENT_ENTER: break; case KDP_EVENT_EXIT: needs_serial_init = TRUE; break; - } + } } #endif /* CONFIG_SERIAL_KDP */ @@ -2159,11 +2166,19 @@ kdp_init(void) struct kdp_in_addr ipaddr; struct kdp_ether_addr macaddr; + // serial must be explicitly requested + if(!PE_parse_boot_argn("kdp_match_name", kdpname, sizeof(kdpname)) || strncmp(kdpname, "serial", sizeof(kdpname)) != 0) + return; - // serial must be explicitly requested - if(!PE_parse_boot_argn("kdp_match_name", kdpname, sizeof(kdpname)) || strncmp(kdpname, "serial", sizeof(kdpname)) != 0) +#if WITH_CONSISTENT_DBG + if (PE_consistent_debug_enabled() && debug_boot_arg) { + current_debugger = HW_SHM_CUR_DB; return; - + } else { + printf("Consistent debug disabled or debug boot arg not present, falling through to serial for debugger\n"); + } +#endif /* WITH_CONSISTENT_DBG */ + kprintf("Initializing serial KDP\n"); kdp_register_callout(kdp_serial_callout, NULL); @@ -2190,79 +2205,110 @@ kdp_init(void) } #endif /* CONFIG_KDP_INTERACTIVE_DEBUGGING */ -#if defined(__arm64__) || !CONFIG_KDP_INTERACTIVE_DEBUGGING +#if !CONFIG_KDP_INTERACTIVE_DEBUGGING +__attribute__((noreturn)) static void -panic_spin_forever() +panic_spin_forever() { kdb_printf("\nPlease go to https://panic.apple.com to report this panic\n"); + for (;;) { } } #endif +#if WITH_CONSISTENT_DBG && CONFIG_KDP_INTERACTIVE_DEBUGGING +__attribute__((noreturn)) +static void +panic_spin_shmcon() +{ + kdb_printf("\nPlease go to https://panic.apple.com to report this panic\n"); + kdb_printf("Waiting for hardware shared memory debugger, handshake structure is at virt: %p, phys %p\n", + hwsd_info, (void *)kvtophys((vm_offset_t)hwsd_info)); + + assert(hwsd_info != NULL); + hwsd_info->xhsdci_status = XHSDCI_STATUS_KERNEL_READY; + 
hwsd_info->xhsdci_seq_no = 0; + FlushPoC_DcacheRegion((vm_offset_t) hwsd_info, sizeof(*hwsd_info)); + + for (;;) { + FlushPoC_DcacheRegion((vm_offset_t) hwsd_info, sizeof(*hwsd_info)); + if (hwsd_info->xhsdci_status == XHSDCI_COREDUMP_BEGIN) { + kern_dump(KERN_DUMP_HW_SHMEM_DBG); + } + + if ((hwsd_info->xhsdci_status == XHSDCI_COREDUMP_REMOTE_DONE) || + (hwsd_info->xhsdci_status == XHSDCI_COREDUMP_ERROR)) { + hwsd_info->xhsdci_status = XHSDCI_STATUS_KERNEL_READY; + hwsd_info->xhsdci_seq_no = 0; + FlushPoC_DcacheRegion((vm_offset_t) hwsd_info, sizeof(*hwsd_info)); + } + } +} +#endif /* WITH_CONSISTENT_DBG && CONFIG_KDP_INTERACTIVE_DEBUGGING */ + +#if !CONFIG_KDP_INTERACTIVE_DEBUGGING +__attribute__((noreturn)) void kdp_raise_exception( - unsigned int exception, - unsigned int code, - unsigned int subcode, - void *saved_state -) + __unused unsigned int exception, + __unused unsigned int code, + __unused unsigned int subcode, + __unused void *saved_state + ) +#else +void +kdp_raise_exception( + unsigned int exception, + unsigned int code, + unsigned int subcode, + void *saved_state + ) +#endif { - unsigned int initial_not_in_kdp = not_in_kdp; - not_in_kdp = 0; - /* Was a system trace requested ? */ - if (kdp_snapshot && (!panic_active()) && (panic_caller == 0)) { - do_stackshot(); - not_in_kdp = initial_not_in_kdp; - return; - } +#if CONFIG_KDP_INTERACTIVE_DEBUGGING + unsigned int initial_not_in_kdp = not_in_kdp; + not_in_kdp = 0; -#if CONFIG_KDP_INTERACTIVE_DEBUGGING + disable_preemption(); - disable_preemption(); - /* - * On ARM64, KDP debugging is disabled by default. - * It is compiled into the kernel for DEVELOPMENT and DEBUG, - * but still hidden behind a boot arg (thus PE_i_can_has_kdp()). - * For RELEASE, it is not compiled. 
- */ - if ( - (current_debugger != KDP_CUR_DB) - ) - { - /* try a local disk dump */ - if (kdp_has_polled_corefile()) { - flag_panic_dump_in_progress = TRUE; - kern_dump(TRUE); - abort_panic_transfer(); - } - } - - if (current_debugger != KDP_CUR_DB) { - kdb_printf("\nDebugger not configured. Hanging.\n"); - for (;;) { } - } - - kdp_debugger_loop(exception, code, subcode, saved_state); - not_in_kdp = initial_not_in_kdp; - enable_preemption(); + if (current_debugger != KDP_CUR_DB) { + /* try a local disk dump */ + if (kdp_has_polled_corefile()) { +#if WITH_CONSISTENT_DBG + if (current_debugger == HW_SHM_CUR_DB) { + hwsd_info->xhsdci_status = XHSDCI_STATUS_KERNEL_BUSY; + } +#endif /* WITH_CONSISTENT_DBG */ + flag_panic_dump_in_progress = TRUE; + kern_dump(KERN_DUMP_DISK); + abort_panic_transfer(); + } +#if WITH_CONSISTENT_DBG + if (current_debugger == HW_SHM_CUR_DB) { + panic_spin_shmcon(); + } +#endif /* WITH_CONSISTENT_DBG */ + + + if (!panicDebugging) { + kdp_machine_reboot(); + } + } + + kdp_debugger_loop(exception, code, subcode, saved_state); + not_in_kdp = initial_not_in_kdp; + enable_preemption(); #else /* CONFIG_KDP_INTERACTIVE_DEBUGGING */ - assert(current_debugger != KDP_CUR_DB); - - /* - * If kernel debugging is enabled via boot-args, but KDP debugging - * is not compiled into the kernel, spin here waiting for debugging - * via another method. Why here? Because we want to have watchdog - * disabled (via KDP callout) while sitting waiting to be debugged. - */ - panic_spin_forever(); - - (void)exception; - (void)code; - (void)subcode; - (void)saved_state; + assert(current_debugger != KDP_CUR_DB); + + /* + * If kernel debugging is enabled via boot-args, but KDP debugging + * is not compiled into the kernel, spin here waiting for debugging + * via another method. Why here? Because we want to have watchdog + * disabled (via KDP callout) while sitting waiting to be debugged. 
+ */ + panic_spin_forever(); #endif /* CONFIG_KDP_INTERACTIVE_DEBUGGING */ } - - diff --git a/osfmk/kdp/ml/i386/kdp_x86_common.c b/osfmk/kdp/ml/i386/kdp_x86_common.c index 3ce3b191d..b576ec666 100644 --- a/osfmk/kdp/ml/i386/kdp_x86_common.c +++ b/osfmk/kdp/ml/i386/kdp_x86_common.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008 Apple Inc. All rights reserved. + * Copyright (c) 2008-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -66,22 +66,22 @@ boolean_t kdp_read_io; boolean_t kdp_trans_off; -addr64_t kdp_vtophys(pmap_t pmap, addr64_t va); +pmap_paddr_t kdp_vtophys(pmap_t pmap, vm_offset_t va); pmap_t kdp_pmap = 0; -addr64_t +pmap_paddr_t kdp_vtophys( pmap_t pmap, - addr64_t va) + vm_offset_t va) { - addr64_t pa; + pmap_paddr_t pa; ppnum_t pp; pp = pmap_find_phys(pmap, va); if(!pp) return 0; - pa = ((addr64_t)pp << PAGE_SHIFT) | (va & PAGE_MASK); + pa = ((pmap_paddr_t)pp << PAGE_SHIFT) | (va & PAGE_MASK); return(pa); } @@ -333,7 +333,6 @@ kdp_machine_ioport_read(kdp_readioport_req_t *rq, caddr_t data, uint16_t lcpu) break; default: return KDPERR_BADFLAVOR; - break; } return KDPERR_NO_ERROR; @@ -362,7 +361,6 @@ kdp_machine_ioport_write(kdp_writeioport_req_t *rq, caddr_t data, uint16_t lcpu) break; default: return KDPERR_BADFLAVOR; - break; } return KDPERR_NO_ERROR; diff --git a/osfmk/kdp/ml/x86_64/kdp_machdep.c b/osfmk/kdp/ml/x86_64/kdp_machdep.c index 91019f56a..1b8a111d0 100644 --- a/osfmk/kdp/ml/x86_64/kdp_machdep.c +++ b/osfmk/kdp/ml/x86_64/kdp_machdep.c @@ -59,6 +59,7 @@ extern cpu_subtype_t cpuid_cpusubtype(void); extern vm_offset_t machine_trace_thread_get_kva(vm_offset_t cur_target_addr, vm_map_t map, uint32_t *thread_trace_flags); extern void machine_trace_thread_clear_validation_cache(void); +extern vm_map_t kernel_map; void print_saved_state(void *); void kdp_call(void); @@ -66,16 +67,8 @@ int kdp_getc(void); boolean_t kdp_call_kdb(void); void kdp_getstate(x86_thread_state64_t *); void kdp_setstate(x86_thread_state64_t 
*); -void kdp_print_phys(int); - -int -machine_trace_thread(thread_t thread, char *tracepos, char *tracebound, int nframes, boolean_t user_p, uint32_t *thread_trace_flags); - -int -machine_trace_thread64(thread_t thread, char *tracepos, char *tracebound, int nframes, boolean_t user_p, uint32_t *thread_trace_flags); - -unsigned -machine_read64(addr64_t srcaddr, caddr_t dstaddr, uint32_t len); +void kdp_print_phys(int); +unsigned machine_read64(addr64_t srcaddr, caddr_t dstaddr, uint32_t len); static void kdp_callouts(kdp_event_t event); @@ -475,7 +468,15 @@ kdp_i386_trap( saved_state = current_cpu_datap()->cpu_fatal_trap_state; } - kdp_raise_exception(exception, code, subcode, saved_state); + if (debugger_callback) { + unsigned int initial_not_in_kdp = not_in_kdp; + not_in_kdp = 0; + debugger_callback->error = debugger_callback->callback(debugger_callback->callback_context); + not_in_kdp = initial_not_in_kdp; + } else { + kdp_raise_exception(exception, code, subcode, saved_state); + } + /* If the instruction single step bit is set, disable kernel preemption */ if (saved_state->isf.rflags & EFL_TF) { @@ -505,53 +506,56 @@ kdp_machine_get_breakinsn( *size = 1; } -extern pmap_t kdp_pmap; - #define RETURN_OFFSET 4 int -machine_trace_thread(thread_t thread, char *tracepos, char *tracebound, int nframes, boolean_t user_p, uint32_t *thread_trace_flags) +machine_trace_thread(thread_t thread, + char * tracepos, + char * tracebound, + int nframes, + boolean_t user_p, + boolean_t trace_fp, + uint32_t * thread_trace_flags) { - uint32_t *tracebuf = (uint32_t *)tracepos; - uint32_t fence = 0; - uint32_t stackptr = 0; - uint32_t stacklimit = 0xfc000000; - int framecount = 0; - uint32_t init_eip = 0; - uint32_t prevsp = 0; - uint32_t framesize = 2 * sizeof(vm_offset_t); + uint32_t * tracebuf = (uint32_t *)tracepos; + uint32_t framesize = (trace_fp ? 
2 : 1) * sizeof(uint32_t); + + uint32_t fence = 0; + uint32_t stackptr = 0; + uint32_t stacklimit = 0xfc000000; + int framecount = 0; + uint32_t prev_eip = 0; + uint32_t prevsp = 0; vm_offset_t kern_virt_addr = 0; + vm_map_t bt_vm_map = VM_MAP_NULL; + + nframes = (tracebound > tracepos) ? MIN(nframes, (int)((tracebound - tracepos) / framesize)) : 0; + + if (thread->machine.iss == NULL) { + // no register states to backtrace, probably thread is terminating + return 0; + } if (user_p) { - x86_saved_state32_t *iss32; + x86_saved_state32_t *iss32; iss32 = USER_REGS32(thread); - init_eip = iss32->eip; + prev_eip = iss32->eip; stackptr = iss32->ebp; stacklimit = 0xffffffff; - kdp_pmap = thread->task->map->pmap; + bt_vm_map = thread->task->map; } else panic("32-bit trace attempted on 64-bit kernel"); - /* bounds check before we start advancing tracebuf */ - if ((tracebound - ((char *)tracebuf)) < (4 * framesize)) { - machine_trace_thread_clear_validation_cache(); - kdp_pmap = 0; - return 0; - } - - *tracebuf++ = init_eip; - for (framecount = 0; framecount < nframes; framecount++) { - if ((tracebound - ((char *)tracebuf)) < (4 * framesize)) { - tracebuf--; - break; + *tracebuf++ = prev_eip; + if (trace_fp) { + *tracebuf++ = stackptr; } - *tracebuf++ = stackptr; /* Invalid frame, or hit fence */ if (!stackptr || (stackptr == fence)) { break; @@ -570,7 +574,7 @@ machine_trace_thread(thread_t thread, char *tracepos, char *tracebound, int nfra break; } - kern_virt_addr = machine_trace_thread_get_kva(stackptr + RETURN_OFFSET, thread->task->map, thread_trace_flags); + kern_virt_addr = machine_trace_thread_get_kva(stackptr + RETURN_OFFSET, bt_vm_map, thread_trace_flags); if (!kern_virt_addr) { if (thread_trace_flags) { @@ -579,27 +583,23 @@ machine_trace_thread(thread_t thread, char *tracepos, char *tracebound, int nfra break; } - *tracebuf = *(uint32_t *)kern_virt_addr; - tracebuf++; + prev_eip = *(uint32_t *)kern_virt_addr; prevsp = stackptr; - kern_virt_addr = 
machine_trace_thread_get_kva(stackptr, thread->task->map, thread_trace_flags); - if (!kern_virt_addr) { + kern_virt_addr = machine_trace_thread_get_kva(stackptr, bt_vm_map, thread_trace_flags); + + if (kern_virt_addr) { + stackptr = *(uint32_t *)kern_virt_addr; + } else { + stackptr = 0; if (thread_trace_flags) { *thread_trace_flags |= kThreadTruncatedBT; } - - /* We need to fill in a complete LR/FP record, even if we couldn't find a FP */ - *tracebuf++ = 0; - break; } - - stackptr = *(uint32_t *)kern_virt_addr; } machine_trace_thread_clear_validation_cache(); - kdp_pmap = 0; return (uint32_t) (((char *) tracebuf) - tracepos); } @@ -614,62 +614,64 @@ machine_read64(addr64_t srcaddr, caddr_t dstaddr, uint32_t len) } int -machine_trace_thread64(thread_t thread, char *tracepos, char *tracebound, int nframes, boolean_t user_p, uint32_t *thread_trace_flags) +machine_trace_thread64(thread_t thread, + char * tracepos, + char * tracebound, + int nframes, + boolean_t user_p, + boolean_t trace_fp, + uint32_t * thread_trace_flags) { - uint64_t *tracebuf = (uint64_t *)tracepos; - uint32_t fence = 0; - addr64_t stackptr = 0; - int framecount = 0; - addr64_t init_rip = 0; - addr64_t prevsp = 0; - unsigned framesize = 2 * sizeof(addr64_t); + uint64_t * tracebuf = (uint64_t *)tracepos; + unsigned framesize = (trace_fp ? 2 : 1) * sizeof(addr64_t); + + uint32_t fence = 0; + addr64_t stackptr = 0; + int framecount = 0; + addr64_t prev_rip = 0; + addr64_t prevsp = 0; vm_offset_t kern_virt_addr = 0; + vm_map_t bt_vm_map = VM_MAP_NULL; + + if (thread->machine.iss == NULL) { + // no register states to backtrace, probably thread is terminating + return 0; + } + + nframes = (tracebound > tracepos) ? 
MIN(nframes, (int)((tracebound - tracepos) / framesize)) : 0; if (user_p) { x86_saved_state64_t *iss64; iss64 = USER_REGS64(thread); - init_rip = iss64->isf.rip; + prev_rip = iss64->isf.rip; stackptr = iss64->rbp; - kdp_pmap = thread->task->map->pmap; + bt_vm_map = thread->task->map; } else { stackptr = STACK_IKS(thread->kernel_stack)->k_rbp; - init_rip = STACK_IKS(thread->kernel_stack)->k_rip; - init_rip = VM_KERNEL_UNSLIDE(init_rip); - kdp_pmap = NULL; + prev_rip = STACK_IKS(thread->kernel_stack)->k_rip; + prev_rip = VM_KERNEL_UNSLIDE(prev_rip); + bt_vm_map = kernel_map; } - /* bounds check before we start advancing tracebuf */ - if ((uint32_t)(tracebound - ((char *)tracebuf)) < (4 * framesize)) { - machine_trace_thread_clear_validation_cache(); - kdp_pmap = NULL; - return 0; - } - *tracebuf++ = init_rip; - for (framecount = 0; framecount < nframes; framecount++) { - if ((uint32_t)(tracebound - ((char *)tracebuf)) < (4 * framesize)) { - tracebuf--; - break; + *tracebuf++ = prev_rip; + if (trace_fp) { + *tracebuf++ = stackptr; } - *tracebuf++ = stackptr; - - if (!stackptr || (stackptr == fence)){ + if (!stackptr || (stackptr == fence)) { break; } - if (stackptr & 0x0000007) { break; } - if (stackptr <= prevsp) { break; } - kern_virt_addr = machine_trace_thread_get_kva(stackptr + RETURN_OFFSET64, thread->task->map, thread_trace_flags); - + kern_virt_addr = machine_trace_thread_get_kva(stackptr + RETURN_OFFSET64, bt_vm_map, thread_trace_flags); if (!kern_virt_addr) { if (thread_trace_flags) { *thread_trace_flags |= kThreadTruncatedBT; @@ -677,30 +679,26 @@ machine_trace_thread64(thread_t thread, char *tracepos, char *tracebound, int nf break; } - *tracebuf = *(uint64_t *)kern_virt_addr; - if (!user_p) - *tracebuf = VM_KERNEL_UNSLIDE(*tracebuf); - - tracebuf++; + prev_rip = *(uint64_t *)kern_virt_addr; + if (!user_p) { + prev_rip = VM_KERNEL_UNSLIDE(prev_rip); + } prevsp = stackptr; - kern_virt_addr = machine_trace_thread_get_kva(stackptr, thread->task->map, 
thread_trace_flags); - if (!kern_virt_addr) { + kern_virt_addr = machine_trace_thread_get_kva(stackptr, bt_vm_map, thread_trace_flags); + + if (kern_virt_addr) { + stackptr = *(uint64_t *)kern_virt_addr; + } else { + stackptr = 0; if (thread_trace_flags) { *thread_trace_flags |= kThreadTruncatedBT; } - - /* We need to fill in a complete LR/FP record, even if we couldn't find a FP */ - *tracebuf++ = 0; - break; } - - stackptr = *(uint64_t *)kern_virt_addr; } machine_trace_thread_clear_validation_cache(); - kdp_pmap = NULL; return (uint32_t) (((char *) tracebuf) - tracepos); } diff --git a/osfmk/kern/Makefile b/osfmk/kern/Makefile index d04e183e6..70c638b1f 100644 --- a/osfmk/kern/Makefile +++ b/osfmk/kern/Makefile @@ -8,7 +8,8 @@ include $(MakeInc_def) DATAFILES = \ exc_resource.h \ - kern_cdata.h + kern_cdata.h \ + kcdata.h PRIVATE_DATAFILES = \ debug.h \ @@ -18,6 +19,8 @@ EXPORT_FILES = \ affinity.h \ assert.h \ audit_sessionport.h \ + backtrace.h \ + bits.h \ call_entry.h \ clock.h \ coalition.h \ @@ -40,6 +43,7 @@ EXPORT_FILES = \ macro_help.h \ page_decrypt.h \ pms.h \ + policy_internal.h \ processor.h \ queue.h \ sched_prim.h \ @@ -54,11 +58,15 @@ EXPORT_FILES = \ waitq.h \ zalloc.h +PRIVATE_EXPORT_FILES = \ + build_config.h \ + mach_node_link.h + INSTALL_MI_LIST = ${DATAFILES} INSTALL_MI_LCL_LIST = ${PRIVATE_DATAFILES} -INSTALL_KF_MI_LCL_LIST = ${DATAFILES} ${PRIVATE_DATAFILES} ${EXPORT_FILES} +INSTALL_KF_MI_LCL_LIST = ${DATAFILES} ${PRIVATE_DATAFILES} ${EXPORT_FILES} ${PRIVATE_EXPORT_FILES} INSTALL_MI_DIR = kern @@ -68,5 +76,3 @@ EXPORT_MI_DIR = kern include $(MakeInc_rule) include $(MakeInc_dir) - - diff --git a/osfmk/kern/assert.h b/osfmk/kern/assert.h index c5e2f4516..e0ffbaf9a 100644 --- a/osfmk/kern/assert.h +++ b/osfmk/kern/assert.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -75,7 +75,9 @@ extern void Assert( int line, const char *expression) __attribute__((noinline)); -#if CONFIG_NO_PANIC_STRINGS +extern int kext_assertions_enable; + +#ifdef CONFIG_NO_PANIC_STRINGS #define Assert(file, line, ex) (Assert)("", line, "") #define __Panic(fmt, args...) panic("", ##args) #else /* CONFIG_NO_PANIC_STRINGS */ @@ -84,24 +86,65 @@ extern void Assert( __END_DECLS -#if MACH_ASSERT +#ifndef APPLE_KEXT_ASSERTIONS +#define APPLE_KEXT_ASSERTIONS 0 +#endif + +#if MACH_ASSERT #define assert(ex) \ (__builtin_expect(!!((long)(ex)), 1L) ? (void)0 : Assert(__FILE__, __LINE__, # ex)) -#define assert_static(ex) _Static_assert((ex), #ex) #define assertf(ex, fmt, args...) \ (__builtin_expect(!!((long)(ex)), 1L) ? (void)0 : __Panic("%s:%d Assertion failed: %s : " fmt, __FILE__, __LINE__, # ex, ##args)) +#define __assert_only + +#elif APPLE_KEXT_ASSERTIONS && !XNU_KERNEL_PRIVATE /* MACH_ASSERT */ +#define assert(ex) \ + (__builtin_expect(!!((long)((!kext_assertions_enable) || (ex))), 1L) ? (void)0 : Assert(__FILE__, __LINE__, # ex)) +#define assertf(ex, fmt, args...) \ + (__builtin_expect(!!((long)((!kext_assertions_enable) || (ex))), 1L) ? (void)0 : __Panic("%s:%d Assertion failed: %s : " fmt, __FILE__, __LINE__, # ex, ##args)) #define __assert_only -#else /* MACH_ASSERT */ +#else /* APPLE_KEXT_ASSERTIONS && !XNU_KERNEL_PRIVATE */ #define assert(ex) ((void)0) -#define assert_static(ex) _Static_assert((ex), #ex) #define assertf(ex, fmt, args...) ((void)0) - #define __assert_only __unused #endif /* MACH_ASSERT */ +/* + * static_assert is a C11 / C++0x / C++1z feature. + * + * Beginning with C++0x, it is a keyword and should not be #defined + * + * static_assert is not disabled by MACH_ASSERT or NDEBUG + */ + +#ifndef __cplusplus + #if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L + #define _STATIC_ASSERT_OVERLOADED_MACRO(_1, _2, NAME, ...) NAME + #define static_assert(...) 
_STATIC_ASSERT_OVERLOADED_MACRO(__VA_ARGS__, _static_assert_2_args, _static_assert_1_arg)(__VA_ARGS__) + + #define _static_assert_2_args(ex, str) _Static_assert((ex), str) + #define _static_assert_1_arg(ex) _Static_assert((ex), #ex) + #endif +#else + #if !defined(__cpp_static_assert) + /* pre C++11 support */ + #define _STATIC_ASSERT_OVERLOADED_MACRO(_1, _2, NAME, ...) NAME + #define static_assert(...) _STATIC_ASSERT_OVERLOADED_MACRO(__VA_ARGS__, _static_assert_2_args, _static_assert_1_arg)(__VA_ARGS__) + + #define _static_assert_2_args(ex, str) _Static_assert((ex), str) + #define _static_assert_1_arg(ex) _Static_assert((ex), #ex) + #else + /* + * C++11 only supports the 2 argument version of static_assert. + * C++1z has added support for the 1 argument version. + */ + #define _static_assert_1_arg(ex) static_assert((ex), #ex) + #endif +#endif + #endif /* _KERN_ASSERT_H_ */ diff --git a/osfmk/kern/ast.c b/osfmk/kern/ast.c index f2ceba343..62f060d65 100644 --- a/osfmk/kern/ast.c +++ b/osfmk/kern/ast.c @@ -80,6 +80,7 @@ #endif #include #include +#include #include #include // for CHUD AST hook #include @@ -93,7 +94,9 @@ ast_init(void) { } -extern void chudxnu_thread_ast(thread_t); // XXX this should probably be in a header... +#ifdef CONFIG_DTRACE +extern void dtrace_ast(void); +#endif /* * Called at splsched. @@ -152,6 +155,12 @@ ast_taken( if (!preempt_trap) { ml_set_interrupts_enabled(enable); +#if CONFIG_DTRACE + if (reasons & AST_DTRACE) { + dtrace_ast(); + } +#endif + #ifdef MACH_BSD /* * Handle BSD hook. 
@@ -175,14 +184,14 @@ ast_taken( */ if (reasons & AST_APC) { thread_ast_clear(thread, AST_APC); - special_handler(thread); + thread_apc_ast(thread); } - + if (reasons & AST_GUARD) { thread_ast_clear(thread, AST_GUARD); guard_ast(thread); } - + if (reasons & AST_LEDGER) { thread_ast_clear(thread, AST_LEDGER); ledger_ast(thread); @@ -193,19 +202,19 @@ ast_taken( */ if (reasons & AST_KPERF) { thread_ast_clear(thread, AST_KPERF); - chudxnu_thread_ast(thread); + kperf_kpc_thread_ast(thread); } #if CONFIG_TELEMETRY if (reasons & AST_TELEMETRY_ALL) { boolean_t interrupted_userspace = FALSE; - boolean_t is_windowed = FALSE; + boolean_t io_telemetry = FALSE; assert((reasons & AST_TELEMETRY_ALL) != AST_TELEMETRY_ALL); /* only one is valid at a time */ interrupted_userspace = (reasons & AST_TELEMETRY_USER) ? TRUE : FALSE; - is_windowed = ((reasons & AST_TELEMETRY_WINDOWED) ? TRUE : FALSE); + io_telemetry = ((reasons & AST_TELEMETRY_IO) ? TRUE : FALSE); thread_ast_clear(thread, AST_TELEMETRY_ALL); - telemetry_ast(thread, interrupted_userspace, is_windowed); + telemetry_ast(thread, interrupted_userspace, io_telemetry); } #endif @@ -309,4 +318,9 @@ ast_context(thread_t thread) *pending_ast = ((*pending_ast & ~AST_PER_THREAD) | thread->ast); } +void +ast_dtrace_on(void) +{ + ast_on(AST_DTRACE); +} diff --git a/osfmk/kern/ast.h b/osfmk/kern/ast.h index c6ecb5efa..bd9dd0bb5 100644 --- a/osfmk/kern/ast.h +++ b/osfmk/kern/ast.h @@ -118,8 +118,9 @@ typedef uint32_t ast_t; #define AST_GUARD 0x1000 #define AST_TELEMETRY_USER 0x2000 /* telemetry sample requested on interrupt from userspace */ #define AST_TELEMETRY_KERNEL 0x4000 /* telemetry sample requested on interrupt from kernel */ -#define AST_TELEMETRY_WINDOWED 0x8000 /* telemetry sample meant for the window buffer */ #define AST_SFI 0x10000 /* Evaluate if SFI wait is needed before return to userspace */ +#define AST_DTRACE 0x20000 +#define AST_TELEMETRY_IO 0x40000 /* telemetry sample requested for I/O */ #define AST_NONE 0x00 
#define AST_ALL (~AST_NONE) @@ -128,10 +129,10 @@ typedef uint32_t ast_t; #define AST_PREEMPTION (AST_PREEMPT | AST_QUANTUM | AST_URGENT) #define AST_CHUD_ALL (AST_CHUD_URGENT|AST_CHUD) -#define AST_TELEMETRY_ALL (AST_TELEMETRY_USER | AST_TELEMETRY_KERNEL | AST_TELEMETRY_WINDOWED) +#define AST_TELEMETRY_ALL (AST_TELEMETRY_USER | AST_TELEMETRY_KERNEL | AST_TELEMETRY_IO) /* Per-thread ASTs follow the thread at context-switch time. */ -#define AST_PER_THREAD (AST_APC | AST_BSD | AST_MACF | AST_LEDGER | AST_GUARD | AST_TELEMETRY_USER | AST_TELEMETRY_KERNEL | AST_TELEMETRY_WINDOWED) +#define AST_PER_THREAD (AST_APC | AST_BSD | AST_MACF | AST_LEDGER | AST_GUARD | AST_TELEMETRY_ALL ) /* Initialize module */ extern void ast_init(void); @@ -176,4 +177,8 @@ extern void bsd_ast(thread_t); #endif /* MACH_BSD */ +#ifdef CONFIG_DTRACE +extern void ast_dtrace_on(void); +#endif /* CONFIG_DTRACE */ + #endif /* _KERN_AST_H_ */ diff --git a/osfmk/kern/backtrace.c b/osfmk/kern/backtrace.c new file mode 100644 index 000000000..19dbe70ff --- /dev/null +++ b/osfmk/kern/backtrace.c @@ -0,0 +1,279 @@ +/* + * Copyright (c) 2016 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include + +#include +#include +#include +#include +#include + + +uint32_t __attribute__((noinline)) +backtrace(uintptr_t *bt, uint32_t max_frames) +{ + return backtrace_frame(bt, max_frames, __builtin_frame_address(0)); +} + +/* + * This function captures a backtrace from the current stack and returns the + * number of frames captured, limited by max_frames and starting at start_frame. + * It's fast because it does no checking to make sure there isn't bad data. + * Since it's only called from threads that we're going to keep executing, + * if there's bad data we were going to die eventually. If this function is + * inlined, it doesn't record the frame of the function it's inside (because + * there's no stack frame). + */ +uint32_t __attribute__((noinline,not_tail_called)) +backtrace_frame(uintptr_t *bt, uint32_t max_frames, void *start_frame) +{ + thread_t thread = current_thread(); + uintptr_t *fp; + uintptr_t *next_fp; + uint32_t frame_index = 0; + uintptr_t top, bottom; + + assert(bt != NULL); + assert(max_frames > 0); + + fp = start_frame; + bottom = thread->kernel_stack; + top = bottom + kernel_stack_size; + + if ((uintptr_t)fp >= top || (uintptr_t)fp < bottom) { + fp = NULL; + } + + while (fp != NULL && frame_index < max_frames) { + next_fp = (uintptr_t *)*fp; + + /* + * If the frame pointer is 0, backtracing has reached the top of + * the stack and there is no return address. 
Some stacks might not + * have set this up, so bounds check, as well. + */ + if (next_fp == NULL || + (uintptr_t)next_fp >= top || + (uintptr_t)next_fp < bottom) + { + break; + } + + /* return address is one word higher than frame pointer */ + bt[frame_index++] = *(fp + 1); + + /* stacks grow down; backtracing should be moving to higher addresses */ + if (next_fp <= fp) { + break; + } + fp = next_fp; + } + + return frame_index; +} + +#if defined(__x86_64__) + +static kern_return_t +interrupted_kernel_pc_fp(uintptr_t *pc, uintptr_t *fp) +{ + x86_saved_state_t *state; + bool state_64; + uint64_t cs; + + state = current_cpu_datap()->cpu_int_state; + if (!state) { + return KERN_FAILURE; + } + + state_64 = is_saved_state64(state); + + if (state_64) { + cs = saved_state64(state)->isf.cs; + } else { + cs = saved_state32(state)->cs; + } + /* return early if interrupted a thread in user space */ + if ((cs & SEL_PL) == SEL_PL_U) { + return KERN_FAILURE; + } + + if (state_64) { + *pc = saved_state64(state)->isf.rip; + *fp = saved_state64(state)->rbp; + } else { + *pc = saved_state32(state)->eip; + *fp = saved_state32(state)->ebp; + } + return KERN_SUCCESS; +} + +#else /* defined(__arm__) */ +#error "interrupted_kernel_pc_fp: unsupported architecture" +#endif /* !defined(__arm__) */ + +uint32_t +backtrace_interrupted(uintptr_t *bt, uint32_t max_frames) +{ + uintptr_t pc; + uintptr_t *fp; + kern_return_t kr; + + assert(bt != NULL); + assert(max_frames > 0); + assert(ml_at_interrupt_context() == TRUE); + + kr = interrupted_kernel_pc_fp(&pc, (uintptr_t *)&fp); /* callee stores the saved frame pointer value into fp */ + if (kr != KERN_SUCCESS) { + return 0; + } + + bt[0] = pc; + if (max_frames == 1) { + return 1; + } + + return backtrace_frame(bt + 1, max_frames - 1, fp); +} + +int +backtrace_user(uintptr_t *bt, uint32_t max_frames, uint32_t *frames_out, + bool *user_64_out) +{ + return backtrace_thread_user(current_thread(), bt, max_frames, frames_out, + user_64_out); +} + +int +backtrace_thread_user(void *thread, uintptr_t *bt, uint32_t
max_frames, + uint32_t *frames_out, bool *user_64_out) +{ + bool user_64; + uintptr_t pc, fp, next_fp; + vm_map_t map, old_map; + uint32_t frame_index = 0; + int err = 0; + size_t frame_size; + + assert(ml_get_interrupts_enabled() == TRUE); + if (!ml_get_interrupts_enabled()) { + return EINVAL; + } + + assert(bt != NULL); + assert(max_frames > 0); + assert(frames_out != NULL); + assert(user_64_out != NULL); + +#if defined(__x86_64__) + + /* don't allow a malformed user stack to copyin arbitrary kernel data */ +#define INVALID_USER_FP(FP) ((FP) == 0 || !IS_USERADDR64_CANONICAL((FP))) + + x86_saved_state_t *state = get_user_regs(thread); + + if (!state) { + return EINVAL; + } + + user_64 = is_saved_state64(state); + if (user_64) { + pc = saved_state64(state)->isf.rip; + fp = saved_state64(state)->rbp; + } else { + pc = saved_state32(state)->eip; + fp = saved_state32(state)->ebp; + } + +#else /* defined(__arm__) */ +#error "backtrace_thread_user: unsupported architecture" +#endif /* !defined(__arm__) */ + + /* switch to the correct map, for copyin */ + if (thread != current_thread()) { + map = get_task_map_reference(get_threadtask(thread)); + if (map == NULL) { + return EINVAL; + } + old_map = vm_map_switch(map); + } else { + map = NULL; + } + + union { + struct { + uint64_t fp; + uint64_t ret; + } u64; + struct { + uint32_t fp; + uint32_t ret; + } u32; + } frame; + frame_size = 2 * (user_64 ? sizeof(uint64_t) : sizeof(uint32_t)); + + bt[frame_index++] = pc; + + if (INVALID_USER_FP(fp)) { + goto out; + } + + while (fp != 0 && frame_index < max_frames) { + err = copyin(fp, (char *)&frame, frame_size); + if (err) { + goto out; + } + + next_fp = user_64 ? frame.u64.fp : frame.u32.fp; + + if (INVALID_USER_FP(next_fp)) { + break; + } + + bt[frame_index++] = user_64 ? 
frame.u64.ret : frame.u32.ret; + + /* stacks grow down; backtracing should be moving to higher addresses */ + if (next_fp <= fp) { + break; + } + fp = next_fp; + } + +out: + if (map) { + (void)vm_map_switch(old_map); + vm_map_deallocate(map); + } + + *user_64_out = user_64; + *frames_out = frame_index; + return err; +#undef INVALID_USER_FP +} diff --git a/osfmk/kern/backtrace.h b/osfmk/kern/backtrace.h new file mode 100644 index 000000000..246ca5a83 --- /dev/null +++ b/osfmk/kern/backtrace.h @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2016 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef BACKTRACE_H +#define BACKTRACE_H + +#include +#include +#include + +__BEGIN_DECLS + +/* + * Backtrace the current thread, storing up to max_frames return addresses in + * bt. Returns the number of return addresses stored. + */ +uint32_t backtrace(uintptr_t *bt, uint32_t max_frames) + __attribute__((noinline)); + +/* + * Backtrace the current thread starting at the frame pointer start_frame, + * storing up to max_frames return addresses in bt. Returns the number of + * return addresses stored. + */ +uint32_t backtrace_frame(uintptr_t *bt, uint32_t max_frames, void *start_frame) + __attribute__((noinline,not_tail_called)); + +/* + * Backtrace the kernel stack of the context that was interrupted, storing up + * to max_frames return addresses in bt. Returns the number of return addresses + * stored (there is no frames_out parameter); 0 means the interrupted context + * could not be backtraced, e.g. the interrupt arrived in user space. + * + * Must be called from interrupt context. + */ +uint32_t backtrace_interrupted(uintptr_t *bt, uint32_t max_frames); + +/* + * Backtrace the user stack of the current thread, storing up to max_frames + * return addresses in bt. Returns 0 on success, and non-zero otherwise. On + * success, the number of frames written is stored at the value pointed to by + * frames_out and the value pointed to by user_64_out is set true if the user + * space thread was running in 64-bit mode, and false otherwise. + * + * Must not be called from interrupt context or with interrupts disabled. + */ +int backtrace_user(uintptr_t *bt, uint32_t max_frames, uint32_t *frames_out, + bool *user_64_out); + +/* + * Backtrace the user stack of the given thread, storing up to max_frames return + * addresses in bt. Returns 0 on success, and non-zero otherwise. 
On success, + * the number of frames written is stored at the value pointed to by frames_out + * and the value pointed to by user_64_out is set true if the user space thread + * was running in 64-bit mode, and false otherwise. + * + * Must not be called from interrupt context or with interrupts disabled. + */ +int backtrace_thread_user(void *thread, uintptr_t *bt, uint32_t max_frames, + uint32_t *frames_out, bool *user_64_out); + +__END_DECLS + +#endif /* !defined(BACKTRACE_H) */ diff --git a/osfmk/kern/bits.c b/osfmk/kern/bits.c deleted file mode 100644 index b6cfb2043..000000000 --- a/osfmk/kern/bits.c +++ /dev/null @@ -1,135 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * HISTORY - * - * Revision 1.1.1.1 1998/09/22 21:05:35 wsanchez - * Import of Mac OS X kernel (~semeria) - * - * Revision 1.2 1998/04/29 17:35:55 mburg - * MK7.3 merger - * - * Revision 1.1.24.1 1998/02/03 09:27:19 gdt - * Merge up to MK7.3 - * [1998/02/03 09:12:57 gdt] - * - * Revision 1.1.21.1 1996/11/29 16:57:21 stephen - * nmklinux_1.0b3_shared into pmk1.1 - * Added explanatory note. - * [1996/04/10 16:54:46 emcmanus] - * - * Revision 1.1.22.1 1997/06/17 02:57:05 devrcs - * Added `testbit()' routine. - * [1996/03/18 15:21:50 rkc] - * - * Revision 1.1.7.3 1995/01/10 05:10:36 devrcs - * mk6 CR801 - copyright marker not FREE_ - * [1994/12/01 19:24:54 dwm] - * - * Revision 1.1.7.1 1994/06/14 16:59:49 bolinger - * Merge up to NMK17.2. - * [1994/06/14 16:53:29 bolinger] - * - * Revision 1.1.5.1 1994/04/11 09:36:31 bernadat - * Checked in NMK16_2 revision - * [94/03/15 bernadat] - * - * Revision 1.1.3.1 1993/12/23 08:53:13 bernadat - * Checked in bolinger_860ci revision. - * [93/11/29 bernadat] - * - * Revision 1.1.1.2 1993/09/12 15:44:20 bolinger - * Initial checkin of 860 modifications; MD files from NMK14.8. - * - * $EndLog$ - */ -/* - * C version of bit manipulation routines now required by kernel. - * Should be replaced with assembler versions in any real port. - * - * Note that these routines use little-endian numbering for bits (i.e., - * the bit number corresponds to the associated power-of-2). - */ -#include /* for BYTE_SIZE */ - -#define INT_SIZE (BYTE_SIZE * sizeof (int)) - -/* - * Set indicated bit in bit string. - */ -void -setbit(int bitno, int *s) -{ - for ( ; INT_SIZE <= bitno; bitno -= INT_SIZE, ++s) - ; - *s |= 1 << bitno; -} - -/* - * Clear indicated bit in bit string. - */ -void -clrbit(int bitno, int *s) -{ - for ( ; INT_SIZE <= bitno; bitno -= INT_SIZE, ++s) - ; - *s &= ~(1 << bitno); -} - -/* - * Find first bit set in bit string. 
- */ -int -ffsbit(int *s) -{ - int offset, mask; - - for (offset = 0; !*s; offset += INT_SIZE, ++s) - ; - for (mask = 1; mask; mask <<= 1, ++offset) - if (mask & *s) - return (offset); - /* - * Shouldn't get here - */ - return (0); -} - -/* - * Test if indicated bit is set in bit string. - */ -int -testbit(int bitno, int *s) -{ - for ( ; INT_SIZE <= bitno; bitno -= INT_SIZE, ++s) - ; - return(*s & (1 << bitno)); -} diff --git a/osfmk/kern/bits.h b/osfmk/kern/bits.h new file mode 100644 index 000000000..0305208a8 --- /dev/null +++ b/osfmk/kern/bits.h @@ -0,0 +1,295 @@ +/* + * Copyright (c) 2015-2016 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + * + * Bit manipulation functions + */ + +#ifndef __BITS_H__ +#define __BITS_H__ + +#include +#include +#include + +typedef unsigned int uint; + +#define BIT(b) (1ULL << (b)) + +#define mask(width) (BIT(width) - 1) +#define extract(x, shift, width) ((((uint64_t)(x)) >> (shift)) & mask(width)) +#define bits(x, hi, lo) extract((x), (lo), (hi) - (lo) + 1) + +#define bit_set(x, b) ((x) |= BIT(b)) +#define bit_clear(x, b) ((x) &= ~BIT(b)) +#define bit_test(x, b) ((bool)((x) & BIT(b))) + +/* Returns the most significant '1' bit, or -1 if all zeros */ +inline static int +bit_first(uint64_t bitmap) +{ + return (bitmap == 0) ? -1 : 63 - __builtin_clzll(bitmap); +} + + +inline static int +__bit_next(uint64_t bitmap, int previous_bit) +{ + uint64_t mask = previous_bit ? mask(previous_bit) : ~0ULL; + + return bit_first(bitmap & mask); +} + +/* Returns the most significant '1' bit that is less significant than previous_bit, + * or -1 if no such bit exists. + */ +inline static int +bit_next(uint64_t bitmap, int previous_bit) +{ + if (previous_bit == 0) { + return -1; + } else { + return __bit_next(bitmap, previous_bit); + } +} + +/* Returns the least significant '1' bit, or -1 if all zeros */ +inline static int +lsb_first(uint64_t bitmap) +{ + return __builtin_ffsll(bitmap) - 1; +} + +/* Returns the least significant '1' bit that is more significant than previous_bit, + * or -1 if no such bit exists. 
+ * previous_bit may be -1, in which case this is equivalent to lsb_first() + */ +inline static int +lsb_next(uint64_t bitmap, int previous_bit) +{ + uint64_t mask = mask(previous_bit + 1); + + return lsb_first(bitmap & ~mask); +} + +inline static int +bit_count(uint64_t x) +{ + return __builtin_popcountll(x); +} + +/* Return the highest power of 2 that is <= n, or -1 if n == 0 */ +inline static int +bit_floor(uint64_t n) +{ + return bit_first(n); +} + +/* Return the lowest power of 2 that is >= n, or -1 if n == 0 */ +inline static int +bit_ceiling(uint64_t n) +{ + if (n == 0) { + return -1; + } + return bit_first(n - 1) + 1; +} + +/* If n is a power of 2, bit_log2(n) == bit_floor(n) == bit_ceiling(n) */ +#define bit_log2(n) bit_floor((uint64_t)(n)) + +typedef _Atomic uint64_t bitmap_t; + + +inline static bool +atomic_bit_set(bitmap_t *map, int n, int mem_order) +{ + bitmap_t prev; + prev = __c11_atomic_fetch_or(map, BIT(n), mem_order); + return bit_test(prev, n); +} + +inline static bool +atomic_bit_clear(bitmap_t *map, int n, int mem_order) +{ + bitmap_t prev; + prev = __c11_atomic_fetch_and(map, ~BIT(n), mem_order); + return bit_test(prev, n); +} + + +#define BITMAP_LEN(n) (((uint)(n) + 63) >> 6) /* Round to 64bit bitmap_t */ +#define BITMAP_SIZE(n) (size_t)(BITMAP_LEN(n) << 3) /* Round to 64bit bitmap_t, then convert to bytes */ +#define bitmap_bit(n) bits(n, 5, 0) +#define bitmap_index(n) bits(n, 63, 6) + +inline static bitmap_t * +bitmap_zero(bitmap_t *map, uint nbits) +{ + return (bitmap_t *)memset((void *)map, 0, BITMAP_SIZE(nbits)); +} + +inline static bitmap_t * +bitmap_full(bitmap_t *map, uint nbits) +{ + return (bitmap_t *)memset((void *)map, ~0, BITMAP_SIZE(nbits)); +} + +inline static bitmap_t * +bitmap_alloc(uint nbits) +{ + assert(nbits > 0); + bitmap_t *map = (bitmap_t *)kalloc(BITMAP_SIZE(nbits)); + if (map) { + bitmap_zero(map, nbits); + } + return map; +} + +inline static void +bitmap_free(bitmap_t *map, uint nbits) +{ + assert(nbits > 0); + 
kfree(map, BITMAP_SIZE(nbits)); +} + +inline static void +bitmap_set(bitmap_t *map, uint n) +{ + bit_set(map[bitmap_index(n)], bitmap_bit(n)); +} + +inline static void +bitmap_clear(bitmap_t *map, uint n) +{ + bit_clear(map[bitmap_index(n)], bitmap_bit(n)); +} + +inline static bool +atomic_bitmap_set(bitmap_t *map, uint n, int mem_order) +{ + return atomic_bit_set(&map[bitmap_index(n)], bitmap_bit(n), mem_order); +} + +inline static bool +atomic_bitmap_clear(bitmap_t *map, uint n, int mem_order) +{ + return atomic_bit_clear(&map[bitmap_index(n)], bitmap_bit(n), mem_order); +} + +inline static bool +bitmap_test(bitmap_t *map, uint n) +{ + return bit_test(map[bitmap_index(n)], bitmap_bit(n)); +} + +inline static int +bitmap_first(bitmap_t *map, uint nbits) +{ + for (int i = (int)bitmap_index(nbits - 1); i >= 0; i--) { + if (map[i] == 0) { + continue; + } + return (i << 6) + bit_first(map[i]); + } + + return -1; +} + +inline static int +bitmap_and_not_mask_first(bitmap_t *map, bitmap_t *mask, uint nbits) +{ + for (int i = (int)bitmap_index(nbits - 1); i >= 0; i--) { + if ((map[i] & ~mask[i]) == 0) { + continue; + } + return (i << 6) + bit_first(map[i] & ~mask[i]); + } + + return -1; +} + +inline static int +bitmap_lsb_first(bitmap_t *map, uint nbits) +{ + for (uint i = 0; i <= bitmap_index(nbits - 1); i++) { + if (map[i] == 0) { + continue; + } + return (int)((i << 6) + (uint32_t)lsb_first(map[i])); + } + + return -1; +} + +inline static int +bitmap_next(bitmap_t *map, uint prev) +{ + if (prev == 0) { + return -1; + } + + int64_t i = bitmap_index(prev - 1); + int res = __bit_next(map[i], bits(prev, 5, 0)); + if (res >= 0) { + return (int)(res + (i << 6)); + } + + for (i = i - 1; i >= 0; i--) { + if (map[i] == 0) { + continue; + } + return (int)((i << 6) + bit_first(map[i])); + } + + return -1; +} + +inline static int +bitmap_lsb_next(bitmap_t *map, uint nbits, uint prev) +{ + if ((prev + 1) >= nbits) { + return -1; + } + + uint64_t i = bitmap_index(prev + 1); + uint b 
= bits((prev + 1), 5, 0) - 1; + int32_t res = lsb_next((uint64_t)map[i], (int)b); + if (res >= 0) { + return (int)((uint64_t)res + (i << 6)); + } + + for (i = i + 1; i <= bitmap_index(nbits - 1); i++) { + if (map[i] == 0) { + continue; + } + return (int)((i << 6) + (uint64_t)lsb_first(map[i])); + } + + return -1; +} + +#endif diff --git a/osfmk/kern/bsd_kern.c b/osfmk/kern/bsd_kern.c index 713466a53..69aaa1ceb 100644 --- a/osfmk/kern/bsd_kern.c +++ b/osfmk/kern/bsd_kern.c @@ -50,7 +50,6 @@ /* BSD KERN COMPONENT INTERFACE */ task_t bsd_init_task = TASK_NULL; -boolean_t init_task_died; extern unsigned int not_in_kdp; /* Skip acquiring locks if we're in kdp */ thread_t get_firstthread(task_t); @@ -59,13 +58,15 @@ int get_thread_userstop(thread_t); boolean_t current_thread_aborted(void); void task_act_iterate_wth_args(task_t, void(*)(thread_t, void *), void *); kern_return_t get_signalact(task_t , thread_t *, int); -int get_vmsubmap_entries(vm_map_t, vm_object_offset_t, vm_object_offset_t); int fill_task_rusage(task_t task, rusage_info_current *ri); int fill_task_io_rusage(task_t task, rusage_info_current *ri); int fill_task_qos_rusage(task_t task, rusage_info_current *ri); void fill_task_billed_usage(task_t task, rusage_info_current *ri); void task_bsdtask_kill(task_t); +extern uint64_t get_dispatchqueue_serialno_offset_from_proc(void *p); +extern uint64_t proc_uniqueid(void *p); + #if MACH_BSD extern void psignal(void *, int); #endif @@ -319,8 +320,11 @@ swap_task_map(task_t task, thread_t thread, vm_map_t map, boolean_t doswitch) task_lock(task); mp_disable_preemption(); + old_map = task->map; thread->map = task->map = map; + vm_commit_pagezero_status(map); + if (doswitch) { pmap_switch(map->pmap); } @@ -428,6 +432,126 @@ uint64_t get_task_phys_footprint_max(task_t task) return 0; } +/* + * + */ +uint64_t get_task_phys_footprint_limit(task_t task) +{ + kern_return_t ret; + ledger_amount_t max; + + ret = ledger_get_limit(task->ledger, task_ledgers.phys_footprint, 
&max); + if (KERN_SUCCESS == ret) { + return max; + } + + return 0; +} + +uint64_t get_task_internal(task_t task) +{ + kern_return_t ret; + ledger_amount_t credit, debit; + + ret = ledger_get_entries(task->ledger, task_ledgers.internal, &credit, &debit); + if (KERN_SUCCESS == ret) { + return (credit - debit); + } + + return 0; +} + +uint64_t get_task_internal_compressed(task_t task) +{ + kern_return_t ret; + ledger_amount_t credit, debit; + + ret = ledger_get_entries(task->ledger, task_ledgers.internal_compressed, &credit, &debit); + if (KERN_SUCCESS == ret) { + return (credit - debit); + } + + return 0; +} + +uint64_t get_task_purgeable_nonvolatile(task_t task) +{ + kern_return_t ret; + ledger_amount_t credit, debit; + + ret = ledger_get_entries(task->ledger, task_ledgers.purgeable_nonvolatile, &credit, &debit); + if (KERN_SUCCESS == ret) { + return (credit - debit); + } + + return 0; +} + +uint64_t get_task_purgeable_nonvolatile_compressed(task_t task) +{ + kern_return_t ret; + ledger_amount_t credit, debit; + + ret = ledger_get_entries(task->ledger, task_ledgers.purgeable_nonvolatile_compressed, &credit, &debit); + if (KERN_SUCCESS == ret) { + return (credit - debit); + } + + return 0; +} + +uint64_t get_task_alternate_accounting(task_t task) +{ + kern_return_t ret; + ledger_amount_t credit, debit; + + ret = ledger_get_entries(task->ledger, task_ledgers.alternate_accounting, &credit, &debit); + if (KERN_SUCCESS == ret) { + return (credit - debit); + } + + return 0; +} + +uint64_t get_task_alternate_accounting_compressed(task_t task) +{ + kern_return_t ret; + ledger_amount_t credit, debit; + + ret = ledger_get_entries(task->ledger, task_ledgers.alternate_accounting_compressed, &credit, &debit); + if (KERN_SUCCESS == ret) { + return (credit - debit); + } + + return 0; +} + +uint64_t get_task_page_table(task_t task) +{ + kern_return_t ret; + ledger_amount_t credit, debit; + + ret = ledger_get_entries(task->ledger, task_ledgers.page_table, &credit, &debit); + if 
(KERN_SUCCESS == ret) { + return (credit - debit); + } + + return 0; +} + +uint64_t get_task_iokit_mapped(task_t task) +{ + kern_return_t ret; + ledger_amount_t credit, debit; + + ret = ledger_get_entries(task->ledger, task_ledgers.iokit_mapped, &credit, &debit); + if (KERN_SUCCESS == ret) { + return (credit - debit); + } + + return 0; +} + uint64_t get_task_cpu_time(task_t task) { kern_return_t ret; @@ -475,7 +599,9 @@ get_vmmap_size( return(map->size); } -int +#if CONFIG_COREDUMP + +static int get_vmsubmap_entries( vm_map_t map, vm_object_offset_t start, @@ -537,6 +663,7 @@ get_vmmap_entries( vm_map_unlock(map); return(total_entries); } +#endif /* CONFIG_COREDUMP */ /* * @@ -822,7 +949,7 @@ fill_task_rusage(task_t task, rusage_info_current *ri) assert(task != TASK_NULL); task_lock(task); - task_power_info_locked(task, &powerinfo, NULL); + task_power_info_locked(task, &powerinfo, NULL, NULL); ri->ri_pkg_idle_wkups = powerinfo.task_platform_idle_wakeups; ri->ri_interrupt_wkups = powerinfo.task_interrupt_wakeups; ri->ri_user_time = powerinfo.total_user; @@ -845,8 +972,8 @@ void fill_task_billed_usage(task_t task __unused, rusage_info_current *ri) { #if CONFIG_BANK - ri->ri_billed_system_time = bank_billed_time(task->bank_context); - ri->ri_serviced_system_time = bank_serviced_time(task->bank_context); + ri->ri_billed_system_time = bank_billed_time_safe(task); + ri->ri_serviced_system_time = bank_serviced_time_safe(task); #else ri->ri_billed_system_time = 0; ri->ri_serviced_system_time = 0; @@ -884,10 +1011,7 @@ fill_task_qos_rusage(task_t task, rusage_info_current *ri) if (thread->options & TH_OPT_IDLE_THREAD) continue; - thread_mtx_lock(thread); - thread_update_qos_cpu_time(thread, TRUE); - thread_mtx_unlock(thread); - + thread_update_qos_cpu_time(thread); } ri->ri_cpu_time_qos_default = task->cpu_time_qos_stats.cpu_time_qos_default; ri->ri_cpu_time_qos_maintenance = task->cpu_time_qos_stats.cpu_time_qos_maintenance; @@ -900,3 +1024,39 @@ 
fill_task_qos_rusage(task_t task, rusage_info_current *ri) task_unlock(task); return (0); } + +uint64_t +get_task_dispatchqueue_serialno_offset(task_t task) +{ + uint64_t dq_serialno_offset = 0; + + if (task->bsd_info) { + dq_serialno_offset = get_dispatchqueue_serialno_offset_from_proc(task->bsd_info); + } + + return dq_serialno_offset; +} + +uint64_t +get_task_uniqueid(task_t task) +{ + if (task->bsd_info) { + return proc_uniqueid(task->bsd_info); + } else { + return UINT64_MAX; + } +} + +#if CONFIG_MACF +struct label * +get_task_crash_label(task_t task) +{ + return task->crash_label; +} + +void +set_task_crash_label(task_t task, struct label *label) +{ + task->crash_label = label; +} +#endif diff --git a/osfmk/kern/btlog.c b/osfmk/kern/btlog.c index c39dd166d..80a479961 100644 --- a/osfmk/kern/btlog.c +++ b/osfmk/kern/btlog.c @@ -33,6 +33,9 @@ #include #include #include +#define _SYS_TYPES_H_ +#include +#include /* * Since all records are located contiguously in memory, @@ -40,38 +43,92 @@ * and to maintain the linked list of active records * in chronological order. */ -typedef uint32_t btlog_recordindex_t; /* only 24 bits used */ -#define BTLOG_RECORDINDEX_NONE (0xFFFFFF) #define BTLOG_MAX_RECORDS (0xFFFFFF /* 16777215 */) +#define BTLOG_RECORDINDEX_NONE (0xFFFFFF) + +/* + * Each record is a stack with a reference count and a list of + * log elements that refer to it. + * + * Each log element is placed in a hash bucket that is contained + * within the btlog structure. It contains the index to the record + * that it references. + * + * So you can go from an address to the corresp. stack by hashing the address, + * finding the hash head and traversing the chain of log elements + * till you find the hash bucket with an address that matches your + * address (if it exists) or creating a new bucket to hold this new address. 
+ */ + +#define ELEMENT_HASH_BUCKET_COUNT (256) +#define BTLOG_HASHELEMINDEX_NONE BTLOG_RECORDINDEX_NONE + +#define ZELEMS_DEFAULT (8000) +size_t zelems_count = 0; + +typedef uint32_t btlog_recordindex_t; /* only 24 bits used */ + +/* + * Queue head for the queue of elements connected to a particular record (stack). + * For quick removal of the oldest element referencing the least popular stack. Useful for LEAKS mode. + */ +TAILQ_HEAD(_element_record_queue, btlog_element); + +/* + * Queue head for the queue of elements that hash to the same bucket. + * For quick removal of the oldest element ever logged. Useful for CORRUPTION mode where we use only bucket i.e. FIFO. + */ +TAILQ_HEAD(_element_hash_queue, btlog_element); typedef struct btlog_record { - btlog_recordindex_t next:24; - uint8_t operation; -#if __LP64__ - uint32_t _pad; -#endif - void *element; - void *bt[]; /* variable sized, based on btlog_t params */ + btlog_recordindex_t next:24, + operation:8; + uint32_t ref_count; + uint32_t bthash; + struct _element_record_queue element_record_queue; + void *bt[]; /* variable sized, based on btlog_t params */ } btlog_record_t; +typedef struct btlog_element { + btlog_recordindex_t recindex:24, + operation:8; + uintptr_t elem; + TAILQ_ENTRY(btlog_element) element_record_link; /* Links to other elements pointing to the same stack. */ + + TAILQ_ENTRY(btlog_element) element_hash_link; /* Links to other elements in the same hash chain. + * During LEAKS mode, this is used as a singly-linked list because + * we don't want to initialize ELEMENT_HASH_BUCKET_COUNT heads. + * + * During CORRUPTION mode with a single hash chain, this is used as a doubly-linked list. 
+ */ +} btlog_element_t; + struct btlog { vm_address_t btlog_buffer; /* all memory for this btlog_t */ vm_size_t btlog_buffersize; - btlog_lock_t lock_callback; /* caller-provided locking */ - btlog_unlock_t unlock_callback; - void *callback_context; - uintptr_t btrecords; /* use btlog_recordindex_t to lookup */ - size_t btrecord_count; size_t btrecord_btdepth; /* BT entries per record */ size_t btrecord_size; btlog_recordindex_t head; /* active record list */ btlog_recordindex_t tail; - size_t activecount; - - btlog_recordindex_t freelist; + btlog_recordindex_t activerecord; + btlog_recordindex_t freelist_records; + + size_t active_record_count; + size_t active_element_count; + btlog_element_t *freelist_elements; + union { + btlog_element_t **elem_recindex_hashtbl; /* LEAKS mode: We use an array of ELEMENT_HASH_BUCKET_COUNT buckets. */ + struct _element_hash_queue *element_hash_queue; /* CORRUPTION mode: We use a single hash bucket i.e. queue */ + } elem_linkage_un; + + decl_simple_lock_data(,btlog_lock); + boolean_t caller_will_remove_entries_for_element; /* If TRUE, this means that the caller is interested in keeping track of abandoned / leaked elements. + * And so they want to be in charge of explicitly removing elements. Depending on this variable we + * will choose what kind of data structure to use for the elem_linkage_un union above. 
+ */ }; extern boolean_t vm_kernel_ready; @@ -80,19 +137,109 @@ extern boolean_t kmem_alloc_ready; #define lookup_btrecord(btlog, index) \ ((btlog_record_t *)(btlog->btrecords + index * btlog->btrecord_size)) +uint32_t calculate_hashidx_for_element(uintptr_t elem, btlog_t *btlog); +uint32_t lookup_btrecord_byhash(btlog_t *btlog, uint32_t md5_hash, void *bt[], size_t btcount); + +void btlog_add_elem_to_freelist(btlog_t *btlog, btlog_element_t *hash_elem); +btlog_element_t* btlog_get_elem_from_freelist(btlog_t *btlog); + +uint32_t +lookup_btrecord_byhash(btlog_t *btlog, uint32_t md5_hash, void *bt[], size_t btcount) +{ + btlog_recordindex_t recindex = BTLOG_RECORDINDEX_NONE; + btlog_record_t *record = NULL; + size_t i = 0; + boolean_t stack_matched = TRUE; + + assert(btcount); + assert(bt); + + recindex = btlog->head; + record = lookup_btrecord(btlog, recindex); + while (recindex != BTLOG_RECORDINDEX_NONE) { + assert(record->bthash); + assert(! TAILQ_EMPTY(&record->element_record_queue)); + if (record->bthash == md5_hash) { + + /* + * Make sure that the incoming stack actually matches the + * stack in this record. Since we only save off a + * part of the md5 hash there can be collisions sometimes. + * This comparison isn't costly because, in case of collisions, + * usually the first few frames are different. + */ + + stack_matched = TRUE; + + if (btcount < btlog->btrecord_btdepth) { + if (record->bt[btcount] != NULL) { + /* + * If the stack depth passed in is smaller than + * the recorded stack and we have a valid pointer + * in the recorded stack at that depth, then we + * don't need to do any further checks. 
+ */ + stack_matched = FALSE; + goto next; + } + } + + for (i=0; i < MIN(btcount, btlog->btrecord_btdepth); i++) { + if (record->bt[i] != bt[i]) { + stack_matched = FALSE; + goto next; + } + } + + if (stack_matched == TRUE) { + break; + } + } +next: + recindex = record->next; + record = lookup_btrecord(btlog, recindex); + } + + return recindex; +} + +uint32_t +calculate_hashidx_for_element(uintptr_t elem, btlog_t *btlog) +{ + if (btlog->caller_will_remove_entries_for_element) { + uint32_t addr = 0; + + addr = (uint32_t) ((elem & 0xFF00) >> 0x8); + + return addr; + } else { + return 0; + } +} + +static void +btlog_lock(btlog_t *btlog) +{ + simple_lock(&btlog->btlog_lock); +} +static void +btlog_unlock(btlog_t *btlog) +{ + simple_unlock(&btlog->btlog_lock); +} + btlog_t * btlog_create(size_t numrecords, - size_t record_btdepth, - btlog_lock_t lock_callback, - btlog_unlock_t unlock_callback, - void *callback_context) + size_t record_btdepth, + boolean_t caller_will_remove_entries_for_element) { btlog_t *btlog; - vm_size_t buffersize_needed; - vm_address_t buffer = 0; - size_t i; + vm_size_t buffersize_needed = 0, elemsize_needed = 0; + vm_address_t buffer = 0, elem_buffer = 0, elem_hash_buffer = 0; + size_t i = 0; kern_return_t ret; - size_t btrecord_size; + size_t btrecord_size = 0; + uintptr_t free_elem = 0, next_free_elem = 0; if (vm_kernel_ready && !kmem_alloc_ready) return NULL; @@ -106,16 +253,26 @@ btlog_create(size_t numrecords, if (record_btdepth > BTLOG_MAX_DEPTH) return NULL; - if ((lock_callback && !unlock_callback) || - (!lock_callback && unlock_callback)) - return NULL; - /* btlog_record_t is variable-sized, calculate needs now */ btrecord_size = sizeof(btlog_record_t) + sizeof(void *) * record_btdepth; buffersize_needed = sizeof(btlog_t) + numrecords * btrecord_size; buffersize_needed = round_page(buffersize_needed); + + if (zelems_count == 0) { + zelems_count = ((max_mem + (1024*1024*1024) /*GB*/) >> 30) * ZELEMS_DEFAULT; + + if 
(PE_parse_boot_argn("zelems", &zelems_count, sizeof(zelems_count)) == TRUE) { + /* + * Need a max? With this scheme, it should be possible to tune the default + * so that we don't need a boot-arg to request more elements. + */ + printf("Set number of log elements per btlog to: %ld\n", zelems_count); + } + } + elemsize_needed = sizeof(btlog_element_t) * zelems_count; + elemsize_needed = round_page(elemsize_needed); /* since rounding to a page size might hold more, recalculate */ numrecords = MIN(BTLOG_MAX_RECORDS, @@ -123,38 +280,90 @@ btlog_create(size_t numrecords, if (kmem_alloc_ready) { ret = kmem_alloc(kernel_map, &buffer, buffersize_needed, VM_KERN_MEMORY_DIAG); + if (ret != KERN_SUCCESS) + return NULL; + + ret = kmem_alloc(kernel_map, &elem_buffer, elemsize_needed, VM_KERN_MEMORY_DIAG); + if (ret != KERN_SUCCESS) { + kmem_free(kernel_map, buffer, buffersize_needed); + buffer = 0; + return NULL; + } + + if (caller_will_remove_entries_for_element == TRUE) { + ret = kmem_alloc(kernel_map, &elem_hash_buffer, ELEMENT_HASH_BUCKET_COUNT * sizeof(btlog_element_t*), VM_KERN_MEMORY_DIAG); + } else { + ret = kmem_alloc(kernel_map, &elem_hash_buffer, 2 * sizeof(btlog_element_t*), VM_KERN_MEMORY_DIAG); + } + + if (ret != KERN_SUCCESS) { + kmem_free(kernel_map, buffer, buffersize_needed); + buffer = 0; + + kmem_free(kernel_map, elem_buffer, elemsize_needed); + elem_buffer = 0; + return NULL; + } + } else { buffer = (vm_address_t)pmap_steal_memory(buffersize_needed); + elem_buffer = (vm_address_t)pmap_steal_memory(elemsize_needed); + if (caller_will_remove_entries_for_element == TRUE) { + elem_hash_buffer = (vm_address_t)pmap_steal_memory(ELEMENT_HASH_BUCKET_COUNT * sizeof(btlog_element_t*)); + } else { + elem_hash_buffer = (vm_address_t)pmap_steal_memory(2 * sizeof(btlog_element_t*)); + } ret = KERN_SUCCESS; } - if (ret != KERN_SUCCESS) - return NULL; btlog = (btlog_t *)buffer; btlog->btlog_buffer = buffer; btlog->btlog_buffersize = buffersize_needed; + 
btlog->freelist_elements = (btlog_element_t *)elem_buffer; - btlog->lock_callback = lock_callback; - btlog->unlock_callback = unlock_callback; - btlog->callback_context = callback_context; + simple_lock_init(&btlog->btlog_lock, 0); + + btlog->caller_will_remove_entries_for_element = caller_will_remove_entries_for_element; + + if (caller_will_remove_entries_for_element == TRUE) { + btlog->elem_linkage_un.elem_recindex_hashtbl = (btlog_element_t **)elem_hash_buffer; + } else { + btlog->elem_linkage_un.element_hash_queue = (struct _element_hash_queue*) elem_hash_buffer; + TAILQ_INIT(btlog->elem_linkage_un.element_hash_queue); + } btlog->btrecords = (uintptr_t)(buffer + sizeof(btlog_t)); - btlog->btrecord_count = numrecords; btlog->btrecord_btdepth = record_btdepth; btlog->btrecord_size = btrecord_size; btlog->head = BTLOG_RECORDINDEX_NONE; btlog->tail = BTLOG_RECORDINDEX_NONE; - btlog->activecount = 0; + btlog->active_record_count = 0; + btlog->activerecord = BTLOG_RECORDINDEX_NONE; + + for (i=0; caller_will_remove_entries_for_element == TRUE && i < ELEMENT_HASH_BUCKET_COUNT; i++) { /* only the hash-table mode owns ELEMENT_HASH_BUCKET_COUNT buckets; the single-queue mode requested room for just two pointers and was set up by TAILQ_INIT above */ + btlog->elem_linkage_un.elem_recindex_hashtbl[i]=0; + } - /* populate freelist with all records in order */ - btlog->freelist = 0; + /* populate freelist_records with all records in order */ + btlog->freelist_records = 0; for (i=0; i < (numrecords - 1); i++) { btlog_record_t *rec = lookup_btrecord(btlog, i); rec->next = (btlog_recordindex_t)(i + 1); } lookup_btrecord(btlog, i)->next = BTLOG_RECORDINDEX_NONE; /* terminate */ + /* populate freelist_elements with all elements in order */ + free_elem = (uintptr_t)btlog->freelist_elements; + + for (i=0; i < (zelems_count - 1); i++) { + + next_free_elem = free_elem + sizeof(btlog_element_t); + *(uintptr_t*)free_elem = next_free_elem; + free_elem = next_free_elem; + } + *(uintptr_t*)free_elem = BTLOG_HASHELEMINDEX_NONE; /* terminate; free_elem is the last element even when the loop body never ran, whereas next_free_elem would still be 0 */ + return btlog; } @@ -162,38 +371,197 @@ btlog_create(size_t numrecords, static btlog_recordindex_t btlog_get_record_from_freelist(btlog_t *btlog) { - btlog_recordindex_t 
recindex = btlog->freelist; + btlog_recordindex_t recindex = btlog->freelist_records; if (recindex == BTLOG_RECORDINDEX_NONE) { /* nothing on freelist */ return BTLOG_RECORDINDEX_NONE; } else { - /* remove the head of the freelist */ + /* remove the head of the freelist_records */ btlog_record_t *record = lookup_btrecord(btlog, recindex); - btlog->freelist = record->next; + btlog->freelist_records = record->next; return recindex; } } +static void +btlog_add_record_to_freelist(btlog_t *btlog, btlog_recordindex_t recindex) +{ + btlog_recordindex_t precindex = BTLOG_RECORDINDEX_NONE; + btlog_record_t *precord = NULL, *record = NULL; + + record = lookup_btrecord(btlog, recindex); + + assert(TAILQ_EMPTY(&record->element_record_queue)); + + record->bthash = 0; + + precindex = btlog->head; + precord = lookup_btrecord(btlog, precindex); + + if (precindex == recindex) { + btlog->head = precord->next; + btlog->active_record_count--; + + record->next = btlog->freelist_records; + btlog->freelist_records = recindex; + + if (btlog->head == BTLOG_RECORDINDEX_NONE) { + /* active list is now empty, update tail */ + btlog->tail = BTLOG_RECORDINDEX_NONE; + assert(btlog->active_record_count == 0); + } + } else { + while (precindex != BTLOG_RECORDINDEX_NONE) { + if (precord->next == recindex) { + precord->next = record->next; + btlog->active_record_count--; + + record->next = btlog->freelist_records; + btlog->freelist_records = recindex; + + if (btlog->tail == recindex) { + btlog->tail = precindex; + } + break; + } else { + precindex = precord->next; + precord = lookup_btrecord(btlog, precindex); + } + } + } +} + + /* Assumes btlog is already locked */ -static btlog_recordindex_t -btlog_evict_record_from_activelist(btlog_t *btlog) +static void +btlog_evict_elements_from_record(btlog_t *btlog, int num_elements_to_evict) { btlog_recordindex_t recindex = btlog->head; + btlog_record_t *record = NULL; + btlog_element_t *recelem = NULL; if (recindex == BTLOG_RECORDINDEX_NONE) { /* nothing on 
active list */ - return BTLOG_RECORDINDEX_NONE; + panic("BTLog: Eviction requested on btlog (0x%lx) with an empty active list.\n", (uintptr_t) btlog); } else { - /* remove the head of the active list */ - btlog_record_t *record = lookup_btrecord(btlog, recindex); - btlog->head = record->next; - btlog->activecount--; - if (btlog->head == BTLOG_RECORDINDEX_NONE) { - /* active list is now empty, update tail */ - btlog->tail = BTLOG_RECORDINDEX_NONE; + + while (num_elements_to_evict) { + /* + * LEAKS: reap the oldest element within the record with the lowest refs. + * CORRUPTION: reap the oldest element overall and drop its reference on the record + */ + + if (btlog->caller_will_remove_entries_for_element) { + uint32_t max_refs_threshold = UINT32_MAX; + btlog_recordindex_t precindex = 0, prev_evictindex = 0, evict_index = 0; + + prev_evictindex = evict_index = btlog->head; + precindex = recindex = btlog->head; + + while (recindex != BTLOG_RECORDINDEX_NONE) { + + record = lookup_btrecord(btlog, recindex); + + if (btlog->activerecord == recindex || record->ref_count > max_refs_threshold) { + /* skip this record */ + } else { + prev_evictindex = precindex; + evict_index = recindex; + max_refs_threshold = record->ref_count; + } + + if (record->next != BTLOG_RECORDINDEX_NONE) { + precindex = recindex; + } + + recindex = record->next; + } + + recindex = evict_index; + assert(recindex != BTLOG_RECORDINDEX_NONE); + record = lookup_btrecord(btlog, recindex); + + recelem = TAILQ_LAST(&record->element_record_queue, _element_record_queue); + } else { + + recelem = TAILQ_LAST(btlog->elem_linkage_un.element_hash_queue, _element_hash_queue); + recindex = recelem->recindex; + record = lookup_btrecord(btlog, recindex); + } + + /* + * Here we have the element to drop (recelem), its record and the record index. 
+ */ + + while (recelem && num_elements_to_evict) { + + TAILQ_REMOVE(&record->element_record_queue, recelem, element_record_link); + + if (btlog->caller_will_remove_entries_for_element) { + + btlog_element_t *prev_hashelem = NULL, *hashelem = NULL; + uint32_t hashidx = 0; + + hashidx = calculate_hashidx_for_element(~recelem->elem, btlog); + + prev_hashelem = hashelem = btlog->elem_linkage_un.elem_recindex_hashtbl[hashidx]; + while (hashelem != NULL) { + if (hashelem == recelem) + break; + else { + prev_hashelem = hashelem; + hashelem = TAILQ_NEXT(hashelem, element_hash_link); + } + } + + if (hashelem == NULL) { + panic("BTLog: Missing hashelem for element list of record 0x%lx\n", (uintptr_t) record); + } + + if (prev_hashelem != hashelem) { + TAILQ_NEXT(prev_hashelem, element_hash_link) = TAILQ_NEXT(hashelem, element_hash_link); + } else { + btlog->elem_linkage_un.elem_recindex_hashtbl[hashidx] = TAILQ_NEXT(hashelem, element_hash_link); + } + } else { + + TAILQ_REMOVE(btlog->elem_linkage_un.element_hash_queue, recelem, element_hash_link); + } + + btlog_add_elem_to_freelist(btlog, recelem); + btlog->active_element_count--; + + num_elements_to_evict--; + + assert(record->ref_count); + + record->ref_count--; + + if (record->ref_count == 0) { + + btlog_add_record_to_freelist(btlog, recindex); + + /* + * LEAKS: All done with this record. Need the next least popular record. + * CORRUPTION: We don't care about records. We'll just pick the next oldest element. 
+ */ + + if (btlog->caller_will_remove_entries_for_element) { + break; + } + } + + if (btlog->caller_will_remove_entries_for_element) { + recelem = TAILQ_LAST(&record->element_record_queue, _element_record_queue); + } else { + + recelem = TAILQ_LAST(btlog->elem_linkage_un.element_hash_queue, _element_hash_queue); + recindex = recelem->recindex; + record = lookup_btrecord(btlog, recindex); + } + } } - return recindex; } } @@ -201,6 +569,9 @@ btlog_evict_record_from_activelist(btlog_t *btlog) static void btlog_append_record_to_activelist(btlog_t *btlog, btlog_recordindex_t recindex) { + + assert(recindex != BTLOG_RECORDINDEX_NONE); + if (btlog->head == BTLOG_RECORDINDEX_NONE) { /* empty active list, update both head and tail */ btlog->head = btlog->tail = recindex; @@ -209,7 +580,39 @@ btlog_append_record_to_activelist(btlog_t *btlog, btlog_recordindex_t recindex) record->next = recindex; btlog->tail = recindex; } - btlog->activecount++; + btlog->active_record_count++; +} + +btlog_element_t* +btlog_get_elem_from_freelist(btlog_t *btlog) +{ + btlog_element_t *free_elem = NULL; + +retry: + free_elem = btlog->freelist_elements; + + if ((uintptr_t)free_elem == BTLOG_HASHELEMINDEX_NONE) { + /* nothing on freelist */ + btlog_evict_elements_from_record(btlog, 1); + goto retry; + } else { + /* remove the head of the freelist */ + uintptr_t next_elem = *(uintptr_t*)free_elem; + btlog->freelist_elements = (btlog_element_t *)next_elem; + return free_elem; + } +} + +void +btlog_add_elem_to_freelist(btlog_t *btlog, btlog_element_t *elem) +{ + btlog_element_t *free_elem = btlog->freelist_elements; + + TAILQ_NEXT(elem, element_hash_link) = (btlog_element_t *) BTLOG_HASHELEMINDEX_NONE; + TAILQ_NEXT(elem, element_record_link) = (btlog_element_t *) BTLOG_HASHELEMINDEX_NONE; + + *(uintptr_t*)elem = (uintptr_t)free_elem; + btlog->freelist_elements = elem; } void @@ -219,116 +622,219 @@ btlog_add_entry(btlog_t *btlog, void *bt[], size_t btcount) { - btlog_recordindex_t recindex; - 
btlog_record_t *record; - size_t i; + btlog_recordindex_t recindex = 0; + btlog_record_t *record = NULL; + size_t i; + u_int32_t md5_buffer[4]; + MD5_CTX btlog_ctx; + uint32_t hashidx = 0; - if (btlog->lock_callback) - btlog->lock_callback(btlog->callback_context); + btlog_element_t *hashelem = NULL; - /* If there's a free record, use it */ - recindex = btlog_get_record_from_freelist(btlog); - if (recindex == BTLOG_RECORDINDEX_NONE) { - /* Use the first active record (FIFO age-out) */ - recindex = btlog_evict_record_from_activelist(btlog); - assert(recindex != BTLOG_RECORDINDEX_NONE); - } + if (g_crypto_funcs == NULL) + return; - record = lookup_btrecord(btlog, recindex); + btlog_lock(btlog); - /* we always add to the tail, so there is no next pointer */ - record->next = BTLOG_RECORDINDEX_NONE; - record->operation = operation; - record->element = element; + MD5Init(&btlog_ctx); for (i=0; i < MIN(btcount, btlog->btrecord_btdepth); i++) { - record->bt[i] = bt[i]; + MD5Update(&btlog_ctx, (u_char *) &bt[i], sizeof(bt[i])); } - for (; i < btlog->btrecord_btdepth; i++) { - record->bt[i] = NULL; + MD5Final((u_char *) &md5_buffer, &btlog_ctx); + + recindex = lookup_btrecord_byhash(btlog, md5_buffer[0], bt, btcount); + + if (recindex != BTLOG_RECORDINDEX_NONE) { + + record = lookup_btrecord(btlog, recindex); + record->ref_count++; + assert(record->operation == operation); + } else { +retry: + /* If there's a free record, use it */ + recindex = btlog_get_record_from_freelist(btlog); + if (recindex == BTLOG_RECORDINDEX_NONE) { + /* Use the first active record (FIFO age-out) */ + btlog_evict_elements_from_record(btlog, ((2 * sizeof(btlog_record_t))/sizeof(btlog_element_t))); + goto retry; + } + + record = lookup_btrecord(btlog, recindex); + + /* we always add to the tail, so there is no next pointer */ + record->next = BTLOG_RECORDINDEX_NONE; + record->operation = operation; + record->bthash = md5_buffer[0]; + record->ref_count = 1; + TAILQ_INIT(&record->element_record_queue); 
+ + for (i=0; i < MIN(btcount, btlog->btrecord_btdepth); i++) { + record->bt[i] = bt[i]; + } + + for (; i < btlog->btrecord_btdepth; i++) { + record->bt[i] = NULL; + } + + btlog_append_record_to_activelist(btlog, recindex); } - btlog_append_record_to_activelist(btlog, recindex); + btlog->activerecord = recindex; + + hashidx = calculate_hashidx_for_element((uintptr_t)element, btlog); + hashelem = btlog_get_elem_from_freelist(btlog); + + assert(record->bthash); - if (btlog->unlock_callback) - btlog->unlock_callback(btlog->callback_context); + hashelem->elem = ~((uintptr_t)element); + hashelem->operation = record->operation; + hashelem->recindex = recindex; + + TAILQ_INSERT_HEAD(&record->element_record_queue, hashelem, element_record_link); + + if (btlog->caller_will_remove_entries_for_element) { + TAILQ_NEXT(hashelem, element_hash_link) = btlog->elem_linkage_un.elem_recindex_hashtbl[hashidx]; + btlog->elem_linkage_un.elem_recindex_hashtbl[hashidx] = hashelem; + + } else { + TAILQ_INSERT_HEAD(btlog->elem_linkage_un.element_hash_queue, hashelem, element_hash_link); + } + + btlog->active_element_count++; + + btlog->activerecord = BTLOG_RECORDINDEX_NONE; + + btlog_unlock(btlog); } void btlog_remove_entries_for_element(btlog_t *btlog, void *element) { - btlog_recordindex_t recindex; - btlog_record_t *record; - - if (btlog->lock_callback) - btlog->lock_callback(btlog->callback_context); - - /* - * Since the btlog_t anchors the active - * list with a pointer to the head of - * the list, first loop making sure - * the head is correct (and doesn't - * match the element being removed). 
- */ - recindex = btlog->head; - record = lookup_btrecord(btlog, recindex); - while (recindex != BTLOG_RECORDINDEX_NONE) { - if (record->element == element) { - /* remove head of active list */ - btlog->head = record->next; - btlog->activecount--; - - /* add to freelist */ - record->next = btlog->freelist; - btlog->freelist = recindex; - - /* check the new head */ - recindex = btlog->head; - record = lookup_btrecord(btlog, recindex); - } else { - /* head didn't match, so we can move on */ + btlog_recordindex_t recindex = BTLOG_RECORDINDEX_NONE; + btlog_record_t *record = NULL; + uint32_t hashidx = 0; + + btlog_element_t *prev_hashelem = NULL, *hashelem = NULL; + + if (btlog->caller_will_remove_entries_for_element == FALSE) { + panic("Explicit removal of entry is not permitted for this btlog (%p).\n", btlog); + } + + if (g_crypto_funcs == NULL) + return; + + btlog_lock(btlog); + + hashidx = calculate_hashidx_for_element((uintptr_t) element, btlog); + prev_hashelem = hashelem = btlog->elem_linkage_un.elem_recindex_hashtbl[hashidx]; + + while (hashelem != NULL) { + if (~hashelem->elem == (uintptr_t)element) break; + else { + prev_hashelem = hashelem; + hashelem = TAILQ_NEXT(hashelem, element_hash_link); } } - if (recindex == BTLOG_RECORDINDEX_NONE) { - /* we iterated over the entire active list removing the element */ - btlog->tail = BTLOG_RECORDINDEX_NONE; - } else { - /* the head of the active list is stable, now remove other entries */ - btlog_recordindex_t precindex = recindex; - btlog_record_t *precord = record; + if (hashelem) { + + btlog_element_t *recelem = NULL; + + if (prev_hashelem != hashelem) { + TAILQ_NEXT(prev_hashelem, element_hash_link) = TAILQ_NEXT(hashelem, element_hash_link); + } else { - recindex = precord->next; + btlog->elem_linkage_un.elem_recindex_hashtbl[hashidx] = TAILQ_NEXT(hashelem, element_hash_link); + } + + recindex = hashelem->recindex; record = lookup_btrecord(btlog, recindex); - while (recindex != BTLOG_RECORDINDEX_NONE) { - if 
(record->element == element) { - /* remove in place */ - precord->next = record->next; - btlog->activecount--; + + recelem = hashelem; + TAILQ_REMOVE(&record->element_record_queue, recelem, element_record_link); - /* add to freelist */ - record->next = btlog->freelist; - btlog->freelist = recindex; + btlog_add_elem_to_freelist(btlog, hashelem); + btlog->active_element_count--; - /* check the next record */ - recindex = precord->next; - record = lookup_btrecord(btlog, recindex); - } else { - /* check the next record */ - precindex = recindex; - precord = record; + assert(record->ref_count); - recindex = record->next; - record = lookup_btrecord(btlog, recindex); - } - } + record->ref_count--; - /* We got to the end of the active list. Update the tail */ - btlog->tail = precindex; + if (record->ref_count == 0) { + btlog_add_record_to_freelist(btlog, recindex); + } } - if (btlog->unlock_callback) - btlog->unlock_callback(btlog->callback_context); + btlog_unlock(btlog); +} + +#if DEBUG || DEVELOPMENT +void +btlog_copy_backtraces_for_elements(btlog_t * btlog, + uintptr_t * instances, + uint32_t * countp, + uint32_t zoneSize, + leak_site_proc proc, + void * refCon) +{ + btlog_recordindex_t recindex; + btlog_record_t * record; + btlog_element_t * hashelem; + uint32_t hashidx, idx, dups, numSites, siteCount; + uintptr_t element, site; + uint32_t count; + + btlog_lock(btlog); + + count = *countp; + for (numSites = 0, idx = 0; idx < count; idx++) + { + element = instances[idx]; + + if (kInstanceFlagReferenced & element) continue; + element = INSTANCE_PUT(element) & ~kInstanceFlags; + + site = 0; + hashidx = calculate_hashidx_for_element(element, btlog); + hashelem = btlog->elem_linkage_un.elem_recindex_hashtbl[hashidx]; + while (hashelem != NULL) + { + if (~hashelem->elem == element) break; + hashelem = TAILQ_NEXT(hashelem, element_hash_link); + } + if (hashelem) + { + recindex = hashelem->recindex; + site = (uintptr_t) lookup_btrecord(btlog, recindex); + } + if (site) 
element = (site | kInstanceFlagReferenced); + instances[numSites] = INSTANCE_PUT(element); + numSites++; + } + + for (idx = 0; idx < numSites; idx++) + { + site = instances[idx]; + if (!site) continue; + if (!(kInstanceFlagReferenced & site)) continue; + for (siteCount = 1, dups = (idx + 1); dups < numSites; dups++) + { + if (instances[dups] == site) + { + siteCount++; + instances[dups] = 0; + } + } + record = (typeof(record)) (INSTANCE_PUT(site) & ~kInstanceFlags); + (*proc)(refCon, siteCount, zoneSize, (uintptr_t *) &record->bt[0], (uint32_t) btlog->btrecord_btdepth); + } + + *countp = numSites; + + btlog_unlock(btlog); } + +#endif /* DEBUG || DEVELOPMENT */ diff --git a/osfmk/kern/btlog.h b/osfmk/kern/btlog.h index 7bbc12a5d..c9e937b60 100644 --- a/osfmk/kern/btlog.h +++ b/osfmk/kern/btlog.h @@ -30,6 +30,7 @@ #define _KERN_BTLOG_H_ #include +#include #include #include @@ -66,14 +67,9 @@ struct btlog; typedef struct btlog btlog_t; -typedef void (*btlog_lock_t)(void *context); -typedef void (*btlog_unlock_t)(void *context); - extern btlog_t *btlog_create(size_t numrecords, size_t record_btdepth, - btlog_lock_t lock_callback, - btlog_unlock_t unlock_callback, - void *callback_context); + boolean_t caller_will_remove_entries_for_element); extern void btlog_add_entry(btlog_t *btlog, void *element, @@ -84,6 +80,15 @@ extern void btlog_add_entry(btlog_t *btlog, extern void btlog_remove_entries_for_element(btlog_t *btlog, void *element); +#if DEBUG || DEVELOPMENT +void btlog_copy_backtraces_for_elements(btlog_t * btlog, + uintptr_t * instances, + uint32_t * count, + uint32_t zoneSize, + leak_site_proc proc, + void * refCon); +#endif /* DEBUG || DEVELOPMENT */ + #endif /* XNU_KERNEL_PRIVATE */ #endif /* _KERN_BTLOG_H_ */ diff --git a/osfmk/kern/build_config.c b/osfmk/kern/build_config.c new file mode 100644 index 000000000..a1261d715 --- /dev/null +++ b/osfmk/kern/build_config.c @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2016 Apple Inc. All rights reserved. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include + +bool +kern_config_is_development(void) +{ +#if DEVELOPMENT || DEBUG + return true; +#else + return false; +#endif +} + diff --git a/osfmk/kern/build_config.h b/osfmk/kern/build_config.h new file mode 100644 index 000000000..895437a83 --- /dev/null +++ b/osfmk/kern/build_config.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2016 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _KERN_BUILD_CONFIG_H +#define _KERN_BUILD_CONFIG_H + +#include +#include + +bool kern_config_is_development(void) OS_CONST; + +#endif /* _KERN_BUILD_CONFIG_H */ diff --git a/osfmk/kern/clock.c b/osfmk/kern/clock.c index f493edc67..6173d89b6 100644 --- a/osfmk/kern/clock.c +++ b/osfmk/kern/clock.c @@ -38,6 +38,8 @@ #include #include #include +#include +#include #include @@ -62,6 +64,12 @@ decl_simple_lock_data(,clock_lock) #define clock_lock_init() \ simple_lock_init(&clock_lock, 0) +#ifdef kdp_simple_lock_is_acquired +boolean_t kdp_clock_is_locked() +{ + return kdp_simple_lock_is_acquired(&clock_lock); +} +#endif /* * Time of day (calendar) variables. 
@@ -130,7 +138,9 @@ void _clock_delay_until_deadline_with_leeway(uint64_t interval, uint64_t deadline, uint64_t leeway); -static uint64_t clock_boottime; /* Seconds boottime epoch */ +/* Seconds boottime epoch */ +static uint64_t clock_boottime; +static uint32_t clock_boottime_usec; #define TIME_ADD(rsecs, secs, rfrac, frac, unit) \ MACRO_BEGIN \ @@ -235,34 +245,20 @@ clock_get_calendar_microtime( clock_get_calendar_absolute_and_microtime(secs, microsecs, NULL); } -/* - * clock_get_calendar_absolute_and_microtime: - * - * Returns the current calendar value, - * microseconds as the fraction. Also - * returns mach_absolute_time if abstime - * is not NULL. - */ -void -clock_get_calendar_absolute_and_microtime( +static void +clock_get_calendar_absolute_and_microtime_locked( clock_sec_t *secs, clock_usec_t *microsecs, uint64_t *abstime) { - uint64_t now; - spl_t s; - - s = splclock(); - clock_lock(); - - now = mach_absolute_time(); + uint64_t now = mach_absolute_time(); if (abstime) *abstime = now; if (clock_calend.adjdelta < 0) { uint32_t t32; - /* + /* * Since offset is decremented during a negative adjustment, * ensure that time increases monotonically without going * temporarily backwards. @@ -286,6 +282,28 @@ clock_get_calendar_absolute_and_microtime( absolutetime_to_microtime(now, secs, microsecs); *secs += (clock_sec_t)clock_calend.epoch; +} + +/* + * clock_get_calendar_absolute_and_microtime: + * + * Returns the current calendar value, + * microseconds as the fraction. Also + * returns mach_absolute_time if abstime + * is not NULL. 
+ */ +void +clock_get_calendar_absolute_and_microtime( + clock_sec_t *secs, + clock_usec_t *microsecs, + uint64_t *abstime) +{ + spl_t s; + + s = splclock(); + clock_lock(); + + clock_get_calendar_absolute_and_microtime_locked(secs, microsecs, abstime); clock_unlock(); splx(s); @@ -306,35 +324,15 @@ clock_get_calendar_nanotime( clock_sec_t *secs, clock_nsec_t *nanosecs) { - uint64_t now; spl_t s; s = splclock(); clock_lock(); - now = mach_absolute_time(); - - if (clock_calend.adjdelta < 0) { - uint32_t t32; - - if (now > clock_calend.adjstart) { - t32 = (uint32_t)(now - clock_calend.adjstart); - - if (t32 > clock_calend.adjoffset) - now -= clock_calend.adjoffset; - else - now = clock_calend.adjstart; - } - } - - now += clock_calend.offset; - - absolutetime_to_microtime(now, secs, nanosecs); + clock_get_calendar_absolute_and_microtime_locked(secs, nanosecs, NULL); *nanosecs *= NSEC_PER_USEC; - *secs += (clock_sec_t)clock_calend.epoch; - clock_unlock(); splx(s); } @@ -354,6 +352,15 @@ void clock_gettimeofday( clock_sec_t *secs, clock_usec_t *microsecs) +{ + clock_gettimeofday_and_absolute_time(secs, microsecs, NULL); +} + +void +clock_gettimeofday_and_absolute_time( + clock_sec_t *secs, + clock_usec_t *microsecs, + uint64_t *mach_time) { uint64_t now; spl_t s; @@ -387,6 +394,10 @@ clock_gettimeofday( clock_unlock(); splx(s); + + if (mach_time) { + *mach_time = now; + } } /* @@ -408,8 +419,12 @@ clock_set_calendar_microtime( { clock_sec_t sys; clock_usec_t microsys; + uint64_t absolutesys; clock_sec_t newsecs; + clock_sec_t oldsecs; clock_usec_t newmicrosecs; + clock_usec_t oldmicrosecs; + uint64_t commpage_value; spl_t s; newsecs = secs; @@ -421,16 +436,28 @@ clock_set_calendar_microtime( commpage_disable_timestamp(); /* - * Calculate the new calendar epoch based on - * the new value and the system clock. + * Adjust the boottime based on the delta. 
 */ - clock_get_system_microtime(&sys, &microsys); - TIME_SUB(secs, sys, microsecs, microsys, USEC_PER_SEC); + clock_get_calendar_absolute_and_microtime_locked(&oldsecs, &oldmicrosecs, &absolutesys); + if (oldsecs < secs || (oldsecs == secs && oldmicrosecs < microsecs)){ + // moving forwards + long deltasecs = secs, deltamicrosecs = microsecs; + TIME_SUB(deltasecs, oldsecs, deltamicrosecs, oldmicrosecs, USEC_PER_SEC); + TIME_ADD(clock_boottime, deltasecs, clock_boottime_usec, deltamicrosecs, USEC_PER_SEC); + } else { + // moving backwards + long deltasecs = oldsecs, deltamicrosecs = oldmicrosecs; + TIME_SUB(deltasecs, secs, deltamicrosecs, microsecs, USEC_PER_SEC); + TIME_SUB(clock_boottime, deltasecs, clock_boottime_usec, deltamicrosecs, USEC_PER_SEC); + } + commpage_value = clock_boottime * USEC_PER_SEC + clock_boottime_usec; /* - * Adjust the boottime based on the delta. + * Calculate the new calendar epoch based on + * the new value and the system clock. */ - clock_boottime += secs - clock_calend.epoch; + absolutetime_to_microtime(absolutesys, &sys, &microsys); + TIME_SUB(secs, sys, microsecs, microsys, USEC_PER_SEC); /* * Set the new calendar epoch. @@ -456,11 +483,14 @@ clock_set_calendar_microtime( splx(s); + commpage_update_boottime(commpage_value); + /* * Send host notifications. 
 */ host_notify_calendar_change(); - + host_notify_calendar_set(); + #if CONFIG_DTRACE clock_track_calend_nowait(); #endif @@ -482,12 +512,16 @@ uint64_t mach_absolutetime_last_sleep; void clock_initialize_calendar(void) { - clock_sec_t sys, secs; - clock_usec_t microsys, microsecs; - uint64_t new_epoch; + clock_sec_t sys; // sleepless time since boot in seconds + clock_sec_t secs; // Current UTC time + clock_sec_t utc_offset_secs; // Difference in current UTC time and sleepless time since boot + clock_usec_t microsys; + clock_usec_t microsecs; + clock_usec_t utc_offset_microsecs; + uint64_t new_epoch; // utc_offset_secs in mach absolute time units spl_t s; - PEGetUTCTimeOfDay(&secs, &microsecs); + PEGetUTCTimeOfDay(&secs, &microsecs); s = splclock(); clock_lock(); @@ -498,37 +532,56 @@ clock_initialize_calendar(void) /* * Initialize the boot time based on the platform clock. */ - if (clock_boottime == 0) + if (clock_boottime == 0){ clock_boottime = secs; + clock_boottime_usec = microsecs; + commpage_update_boottime(clock_boottime * USEC_PER_SEC + clock_boottime_usec); + } /* * Calculate the new calendar epoch based on * the platform clock and the system clock. */ clock_get_system_microtime(&sys, &microsys); - TIME_SUB(secs, sys, microsecs, microsys, USEC_PER_SEC); + utc_offset_secs = secs; + utc_offset_microsecs = microsecs; + + // This macro mutates utc_offset_secs and micro_utc_offset + TIME_SUB(utc_offset_secs, sys, utc_offset_microsecs, microsys, USEC_PER_SEC); /* * Set the new calendar epoch. 
*/ - clock_calend.epoch = secs; + clock_calend.epoch = utc_offset_secs; - nanoseconds_to_absolutetime((uint64_t)microsecs * NSEC_PER_USEC, &clock_calend.offset); + nanoseconds_to_absolutetime((uint64_t)utc_offset_microsecs * NSEC_PER_USEC, &clock_calend.offset); - clock_interval_to_absolutetime_interval((uint32_t) secs, NSEC_PER_SEC, &new_epoch); + clock_interval_to_absolutetime_interval((uint32_t) utc_offset_secs, NSEC_PER_SEC, &new_epoch); new_epoch += clock_calend.offset; if (clock_calend.epoch_absolute) { - mach_absolutetime_last_sleep = new_epoch - clock_calend.epoch_absolute; + /* new_epoch is the difference between absolute_time and utc_time + * this value will remain constant until the system sleeps. + * Then, difference between values would go up by the time the system sleeps. + * epoch_absolute is the last difference between the two values + * so the difference in the differences would be the time of the last sleep + */ + + if(new_epoch > clock_calend.epoch_absolute) { + mach_absolutetime_last_sleep = new_epoch - clock_calend.epoch_absolute; + } + else { + mach_absolutetime_last_sleep = 0; + } mach_absolutetime_asleep += mach_absolutetime_last_sleep; KERNEL_DEBUG_CONSTANT( MACHDBG_CODE(DBG_MACH_CLOCK,MACH_EPOCH_CHANGE) | DBG_FUNC_NONE, - (uintptr_t) mach_absolutetime_last_sleep, - (uintptr_t) mach_absolutetime_asleep, - (uintptr_t) (mach_absolutetime_last_sleep >> 32), - (uintptr_t) (mach_absolutetime_asleep >> 32), + (uintptr_t) mach_absolutetime_last_sleep, + (uintptr_t) mach_absolutetime_asleep, + (uintptr_t) (mach_absolutetime_last_sleep >> 32), + (uintptr_t) (mach_absolutetime_asleep >> 32), 0); } clock_calend.epoch_absolute = new_epoch; @@ -539,6 +592,9 @@ clock_initialize_calendar(void) calend_adjtotal = clock_calend.adjdelta = 0; } + commpage_update_mach_continuous_time(mach_absolutetime_asleep); + adjust_cont_time_thread_calls(); + clock_unlock(); splx(s); @@ -568,7 +624,29 @@ clock_get_boottime_nanotime( clock_lock(); *secs = 
(clock_sec_t)clock_boottime; - *nanosecs = 0; + *nanosecs = (clock_nsec_t)clock_boottime_usec * NSEC_PER_USEC; + + clock_unlock(); + splx(s); +} + +/* + * clock_get_boottime_nanotime: + * + * Return the boottime, used by sysctl. + */ +void +clock_get_boottime_microtime( + clock_sec_t *secs, + clock_usec_t *microsecs) +{ + spl_t s; + + s = splclock(); + clock_lock(); + + *secs = (clock_sec_t)clock_boottime; + *microsecs = (clock_nsec_t)clock_boottime_usec; clock_unlock(); splx(s); @@ -952,6 +1030,14 @@ clock_absolutetime_interval_to_deadline( *result = mach_absolute_time() + abstime; } +void +clock_continuoustime_interval_to_deadline( + uint64_t conttime, + uint64_t *result) +{ + *result = mach_continuous_time() + conttime; +} + void clock_get_uptime( uint64_t *result) @@ -978,6 +1064,61 @@ clock_deadline_for_periodic_event( } } +uint64_t +mach_continuous_time(void) +{ + while(1) { + uint64_t read1 = mach_absolutetime_asleep; + uint64_t absolute = mach_absolute_time(); + OSMemoryBarrier(); + uint64_t read2 = mach_absolutetime_asleep; + + if(__builtin_expect(read1 == read2, 1)) { + return absolute + read1; + } + } +} + +uint64_t +mach_continuous_approximate_time(void) +{ + while(1) { + uint64_t read1 = mach_absolutetime_asleep; + uint64_t absolute = mach_approximate_time(); + OSMemoryBarrier(); + uint64_t read2 = mach_absolutetime_asleep; + + if(__builtin_expect(read1 == read2, 1)) { + return absolute + read1; + } + } +} + +/* + * continuoustime_to_absolutetime + * Must be called with interrupts disabled + * Returned value is only valid until the next update to + * mach_continuous_time + */ +uint64_t +continuoustime_to_absolutetime(uint64_t conttime) { + if (conttime <= mach_absolutetime_asleep) + return 0; + else + return conttime - mach_absolutetime_asleep; +} + +/* + * absolutetime_to_continuoustime + * Must be called with interrupts disabled + * Returned value is only valid until the next update to + * mach_continuous_time + */ +uint64_t 
+absolutetime_to_continuoustime(uint64_t abstime) { + return abstime + mach_absolutetime_asleep; +} + #if CONFIG_DTRACE /* diff --git a/osfmk/kern/clock.h b/osfmk/kern/clock.h index 8918ca3c1..7a7b44328 100644 --- a/osfmk/kern/clock.h +++ b/osfmk/kern/clock.h @@ -39,6 +39,7 @@ #include #include #include +#include #include @@ -145,6 +146,12 @@ extern void clock_gettimeofday( clock_sec_t *secs, clock_usec_t *microsecs); +extern void clock_gettimeofday_and_absolute_time( + clock_sec_t *secs, + clock_usec_t *microsecs, + uint64_t *absolute_time); + + extern void clock_set_calendar_microtime( clock_sec_t secs, clock_usec_t microsecs); @@ -153,6 +160,10 @@ extern void clock_get_boottime_nanotime( clock_sec_t *secs, clock_nsec_t *nanosecs); +extern void clock_get_boottime_microtime( + clock_sec_t *secs, + clock_nsec_t *microsecs); + extern void absolutetime_to_microtime( uint64_t abstime, clock_sec_t *secs, @@ -171,6 +182,8 @@ extern void clock_get_calendar_nanotime_nowait( #endif /* CONFIG_DTRACE */ +boolean_t kdp_clock_is_locked(void); + #endif /* XNU_KERNEL_PRIVATE */ extern void clock_get_calendar_microtime( @@ -214,6 +227,10 @@ extern void clock_absolutetime_interval_to_deadline( uint64_t abstime, uint64_t *result); +extern void clock_continuoustime_interval_to_deadline( + uint64_t abstime, + uint64_t *result); + extern void clock_delay_until( uint64_t deadline); @@ -221,10 +238,28 @@ extern void absolutetime_to_nanoseconds( uint64_t abstime, uint64_t *result); -extern void nanoseconds_to_absolutetime( +extern void nanoseconds_to_absolutetime( uint64_t nanoseconds, uint64_t *result); +/* + * Absolute <-> Continuous Time conversion routines + * + * It is the caller's responsibility to ensure that these functions are + * synchronized with respect to updates to the continuous timebase. The + * returned value is only valid until the next update to the continuous + * timebase. 
+ * + * If the value to be returned by continuoustime_to_absolutetime would be + * negative, zero is returned. This occurs when the provided continuous time + * is less the amount of the time the system spent asleep and /must/ be + * handled. + */ +extern uint64_t absolutetime_to_continuoustime( + uint64_t abstime); +extern uint64_t continuoustime_to_absolutetime( + uint64_t conttime); + extern uint64_t mach_absolutetime_asleep; extern uint64_t mach_absolutetime_last_sleep; @@ -285,6 +320,10 @@ extern void delay_for_interval_with_leeway( uint32_t leeway, uint32_t scale_factor); +#ifdef XNU_KERNEL_PRIVATE +extern void delay(int usec); +#endif /* XNU_KERNEL_PRIVATE */ + #endif /* KERNEL_PRIVATE */ __END_DECLS diff --git a/osfmk/kern/clock_oldops.c b/osfmk/kern/clock_oldops.c index cdb1bf670..a3debc0d9 100644 --- a/osfmk/kern/clock_oldops.c +++ b/osfmk/kern/clock_oldops.c @@ -172,7 +172,7 @@ void clock_oldconfig(void) { clock_t clock; - register int i; + int i; simple_lock_init(&alarm_lock, 0); thread_call_setup(&alarm_done_call, (thread_call_func_t)alarm_done, NULL); @@ -197,7 +197,7 @@ void clock_oldinit(void) { clock_t clock; - register int i; + int i; /* * Initialize basic clock structures. @@ -216,7 +216,7 @@ void clock_service_create(void) { clock_t clock; - register int i; + int i; /* * Initialize ipc clock services. 
@@ -620,8 +620,8 @@ static void alarm_expire(void) { clock_t clock; - register alarm_t alrm1; - register alarm_t alrm2; + alarm_t alrm1; + alarm_t alrm2; mach_timespec_t clock_time; mach_timespec_t *alarm_time; spl_t s; @@ -687,7 +687,7 @@ alarm_expire(void) static void alarm_done(void) { - register alarm_t alrm; + alarm_t alrm; kern_return_t code; spl_t s; @@ -726,7 +726,7 @@ static void post_alarm( alarm_t alarm) { - register alarm_t alrm1, alrm2; + alarm_t alrm1, alrm2; mach_timespec_t *alarm_time; mach_timespec_t *queue_time; diff --git a/osfmk/kern/coalition.c b/osfmk/kern/coalition.c index fd2afa858..7b9885612 100644 --- a/osfmk/kern/coalition.c +++ b/osfmk/kern/coalition.c @@ -80,6 +80,7 @@ static uint64_t coalition_next_id = 1; static queue_head_t coalitions_q; coalition_t init_coalition[COALITION_NUM_TYPES]; +coalition_t corpse_coalition[COALITION_NUM_TYPES]; zone_t coalition_zone; @@ -168,6 +169,7 @@ struct i_resource_coalition { ledger_t ledger; uint64_t bytesread; uint64_t byteswritten; + uint64_t energy; uint64_t gpu_time; uint64_t logical_immediate_writes; uint64_t logical_deferred_writes; @@ -297,6 +299,7 @@ coalition_notify_user(uint64_t id, uint32_t flags) } coalition_notification(user_port, id, flags); + ipc_port_release_send(user_port); } /* @@ -442,10 +445,18 @@ coalition_resource_usage_internal(coalition_t coal, struct coalition_resource_us { kern_return_t kr; ledger_amount_t credit, debit; + int i; if (coal->type != COALITION_TYPE_RESOURCE) return KERN_INVALID_ARGUMENT; + /* Return KERN_INVALID_ARGUMENT for Corpse coalition */ + for (i = 0; i < COALITION_NUM_TYPES; i++) { + if (coal == corpse_coalition[i]) { + return KERN_INVALID_ARGUMENT; + } + } + ledger_t sum_ledger = ledger_instantiate(task_ledger_template, LEDGER_CREATE_ACTIVE_ENTRIES); if (sum_ledger == LEDGER_NULL) return KERN_RESOURCE_SHORTAGE; @@ -460,6 +471,7 @@ coalition_resource_usage_internal(coalition_t coal, struct coalition_resource_us uint64_t bytesread = coal->r.bytesread; 
uint64_t byteswritten = coal->r.byteswritten; uint64_t gpu_time = coal->r.gpu_time; + uint64_t energy = coal->r.energy; uint64_t logical_immediate_writes = coal->r.logical_immediate_writes; uint64_t logical_deferred_writes = coal->r.logical_deferred_writes; uint64_t logical_invalidated_writes = coal->r.logical_invalidated_writes; @@ -539,6 +551,7 @@ coalition_resource_usage_internal(coalition_t coal, struct coalition_resource_us cru_out->bytesread = bytesread; cru_out->byteswritten = byteswritten; cru_out->gpu_time = gpu_time; + cru_out->energy = energy; cru_out->logical_immediate_writes = logical_immediate_writes; cru_out->logical_deferred_writes = logical_deferred_writes; cru_out->logical_invalidated_writes = logical_invalidated_writes; @@ -1173,6 +1186,18 @@ coalitions_adopt_init_task(task_t task) return kr; } +/* Used for forked corpses. */ +kern_return_t +coalitions_adopt_corpse_task(task_t task) +{ + kern_return_t kr; + kr = coalitions_adopt_task(corpse_coalition, task); + if (kr != KERN_SUCCESS) { + panic("failed to adopt task %p into corpse coalition: %d", task, kr); + } + return kr; +} + /* * coalition_adopt_task_internal * Condition: Coalition must be referenced and unlocked. 
Will fail if coalition @@ -1313,8 +1338,11 @@ task_release_coalitions(task_t task) { int i; for (i = 0; i < COALITION_NUM_TYPES; i++) { - if (task->coalition[i]) + if (task->coalition[i]) { coalition_release(task->coalition[i]); + } else if (i == COALITION_TYPE_RESOURCE) { + panic("deallocating task %p was not a member of a resource coalition", task); + } } } @@ -1517,6 +1545,10 @@ coalitions_init(void) if (kr != KERN_SUCCESS) panic("%s: could not create init %s coalition: kr:%d", __func__, coal_type_str(i), kr); + kr = coalition_create_internal(ctype->type, FALSE, &corpse_coalition[ctype->type]); + if (kr != KERN_SUCCESS) + panic("%s: could not create corpse %s coalition: kr:%d", + __func__, coal_type_str(i), kr); } /* "Leak" our reference to the global object */ diff --git a/osfmk/kern/coalition.h b/osfmk/kern/coalition.h index 0bd9d2d88..87962bd8b 100644 --- a/osfmk/kern/coalition.h +++ b/osfmk/kern/coalition.h @@ -43,6 +43,7 @@ void coalitions_init(void); */ kern_return_t coalitions_adopt_task(coalition_t *coaltions, task_t task); kern_return_t coalitions_adopt_init_task(task_t task); +kern_return_t coalitions_adopt_corpse_task(task_t task); /* Currently, no error conditions. If task is not already in a coalition, * KERN_SUCCESS is returned because removing it did not fail. diff --git a/osfmk/kern/cpu_data.h b/osfmk/kern/cpu_data.h index 373297d0e..58d4ecad8 100644 --- a/osfmk/kern/cpu_data.h +++ b/osfmk/kern/cpu_data.h @@ -48,9 +48,10 @@ __BEGIN_DECLS extern void _disable_preemption(void); extern void _enable_preemption(void); +#ifndef MACHINE_PREEMPTION_MACROS #define disable_preemption() _disable_preemption() #define enable_preemption() _enable_preemption() - +#endif __END_DECLS diff --git a/osfmk/kern/debug.c b/osfmk/kern/debug.c index cd9b5bb23..e031b95fd 100644 --- a/osfmk/kern/debug.c +++ b/osfmk/kern/debug.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2013 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -68,6 +68,7 @@ #include #include #include +#include #include #include #include @@ -90,10 +91,13 @@ #include #include +#include + #if (defined(__arm64__) || defined(NAND_PANIC_DEVICE)) && !defined(LEGACY_PANIC_LOGS) #include /* For gPanicBase */ #endif + unsigned int halt_in_debugger = 0; unsigned int switch_debugger = 0; unsigned int current_debugger = 0; @@ -104,6 +108,7 @@ unsigned int systemLogDiags = FALSE; unsigned int panicDebugging = FALSE; unsigned int logPanicDataToScreen = FALSE; unsigned int kdebug_serial = FALSE; +boolean_t lock_panic_mode = FALSE; int mach_assert = 1; @@ -130,6 +135,9 @@ char *debug_buf_ptr = debug_buf; unsigned int debug_buf_size = sizeof(debug_buf); #endif +char *debug_buf_stackshot_start; +char *debug_buf_stackshot_end; + static char model_name[64]; unsigned char *kernel_uuid; /* uuid_string_t */ char kernel_uuid_string[37]; @@ -151,10 +159,17 @@ struct pasc { typedef struct pasc pasc_t; /* Prevent CPP from breaking the definition below */ -#if CONFIG_NO_PANIC_STRINGS +#ifdef CONFIG_NO_PANIC_STRINGS #undef Assert #endif +int kext_assertions_enable = +#if DEBUG || DEVELOPMENT + TRUE; +#else + FALSE; +#endif + void __attribute__((noinline)) Assert( const char *file, @@ -282,6 +297,8 @@ panic_prologue(const char *str) s = splhigh(); disable_preemption(); + /* Locking code should relax some checks at panic time */ + lock_panic_mode = TRUE; #if defined(__i386__) || defined(__x86_64__) /* Attempt to display the unparsed panic string */ @@ -316,6 +333,7 @@ panic_prologue(const char *str) } else { nestedpanic +=1; PANIC_UNLOCK(); + // Other cores will not be resumed on double panic Debugger("double panic"); // a printf statement here was removed to avoid a panic-loop caused // by a panic from printf @@ -328,12 +346,21 @@ panic_prologue(const char *str) panicwait = 1; PANIC_UNLOCK(); + + // halt other cores now in anticipation of the debugger call return(s); } - +#if DEVELOPMENT || DEBUG 
static void panic_epilogue(spl_t s) +#else +#if !defined(__i386__) && !defined(__x86_64__) +__attribute__((noreturn)) +#endif +static void +panic_epilogue(__unused spl_t s) +#endif { /* * Release panicstr so that we can handle normally other panics. @@ -344,19 +371,21 @@ panic_epilogue(spl_t s) #if DEVELOPMENT || DEBUG if (return_on_panic) { + // resume other cores as we are returning panic_normal(); enable_preemption(); splx(s); return; } -#else - (void)s; #endif kdb_printf("panic: We are hanging here...\n"); panic_stop(); /* NOTREACHED */ } +#if !DEVELOPMENT && !DEBUG && !defined(__i386__) && !defined(__x86_64__) +__attribute__((noreturn)) +#endif void panic(const char *str, ...) { @@ -364,11 +393,14 @@ panic(const char *str, ...) spl_t s; boolean_t old_doprnt_hide_pointers = doprnt_hide_pointers; - +#if defined (__x86_64__) + plctrace_disable(); +#endif /* panic_caller is initialized to 0. If set, don't change it */ if ( ! panic_caller ) panic_caller = (unsigned long)(char *)__builtin_return_address(0); - + + s = panic_prologue(str); /* Never hide pointers from panic logs. */ @@ -393,6 +425,44 @@ panic(const char *str, ...) panic_epilogue(s); } +/* + * panic_with_options: wraps the panic call in a way that allows us to pass + * a bitmask of specific debugger options. + */ +#if !DEVELOPMENT && !DEBUG && !defined(__i386__) && !defined(__x86_64__) +__attribute__((noreturn)) +#endif +void +panic_with_options(unsigned int reason, void *ctx, uint64_t debugger_options_mask, const char *str, ...) +{ + va_list listp; + spl_t s; + + + /* panic_caller is initialized to 0. If set, don't change it */ + if ( ! 
panic_caller ) + panic_caller = (unsigned long)(char *)__builtin_return_address(0); + + s = panic_prologue(str); + kdb_printf("panic(cpu %d caller 0x%lx): ", (unsigned) paniccpu, panic_caller); + if (str) { + va_start(listp, str); + _doprnt(str, &listp, consdebug_putc, 0); + va_end(listp); + } + kdb_printf("\n"); + + /* + * Release panicwait indicator so that other cpus may call Debugger(). + */ + panicwait = 0; + DebuggerWithContext(reason, ctx, "panic", debugger_options_mask); + panic_epilogue(s); +} + +#if !DEVELOPMENT && !DEBUG && !defined(__i386__) && !defined(__x86_64__) +__attribute__((noreturn)) +#endif void panic_context(unsigned int reason, void *ctx, const char *str, ...) { @@ -417,27 +487,53 @@ panic_context(unsigned int reason, void *ctx, const char *str, ...) * Release panicwait indicator so that other cpus may call Debugger(). */ panicwait = 0; - DebuggerWithContext(reason, ctx, "panic"); + DebuggerWithContext(reason, ctx, "panic", DEBUGGER_OPTION_NONE); panic_epilogue(s); } -void -log(__unused int level, char *fmt, ...) +__attribute__((noinline,not_tail_called)) +void log(__unused int level, char *fmt, ...) { + void *caller = __builtin_return_address(0); va_list listp; + va_list listp2; + #ifdef lint level++; #endif /* lint */ #ifdef MACH_BSD - disable_preemption(); va_start(listp, fmt); - _doprnt(fmt, &listp, conslog_putc, 0); - va_end(listp); + va_copy(listp2, listp); + + disable_preemption(); + _doprnt(fmt, &listp, cons_putc_locked, 0); enable_preemption(); + + va_end(listp); + + os_log_with_args(OS_LOG_DEFAULT, OS_LOG_TYPE_DEFAULT, fmt, listp2, caller); + va_end(listp2); #endif } +/* + * Skip appending log messages to the new logging infrastructure in contexts + * where safety is uncertain. These contexts include: + * - When we're in the debugger + * - We're in a panic + * - Interrupts are disabled + * - Or Pre-emption is disabled + * In all the above cases, it is potentially unsafe to log messages. 
+ */ + +boolean_t oslog_is_safe(void) { + return (debug_mode == 0 && + not_in_kdp == 1 && + get_preemption_level() == 0 && + ml_get_interrupts_enabled() == TRUE); +} + void debug_putc(char c) { @@ -590,8 +686,7 @@ __private_extern__ void panic_display_system_configuration(void) { } } -extern zone_t first_zone; -extern unsigned int num_zones, stack_total; +extern unsigned int stack_total; extern unsigned long long stack_allocs; #if defined(__i386__) || defined (__x86_64__) @@ -611,22 +706,12 @@ __private_extern__ void panic_display_zprint() struct zone zone_copy; kdb_printf("%-20s %10s %10s\n", "Zone Name", "Cur Size", "Free Size"); - if(first_zone!=NULL) { - if(ml_nofault_copy((vm_offset_t)first_zone, (vm_offset_t)&zone_copy, sizeof(struct zone)) == sizeof(struct zone)) { - for (i = 0; i < num_zones; i++) { - if(zone_copy.cur_size > (1024*1024)) { - kdb_printf("%-20s %10lu %10lu\n",zone_copy.zone_name, (uintptr_t)zone_copy.cur_size,(uintptr_t)(zone_copy.countfree * zone_copy.elem_size)); - } - - if(zone_copy.next_zone == NULL) { - break; - } - - if(ml_nofault_copy((vm_offset_t)zone_copy.next_zone, (vm_offset_t)&zone_copy, sizeof(struct zone)) != sizeof(struct zone)) { - break; - } + for (i = 0; i < num_zones; i++) { + if(ml_nofault_copy((vm_offset_t)(&zone_array[i]), (vm_offset_t)&zone_copy, sizeof(struct zone)) == sizeof(struct zone)) { + if(zone_copy.cur_size > (1024*1024)) { + kdb_printf("%-20s %10lu %10lu\n",zone_copy.zone_name, (uintptr_t)zone_copy.cur_size,(uintptr_t)(zone_copy.countfree * zone_copy.elem_size)); } - } + } } kdb_printf("%-20s %10lu\n", "Kernel Stacks", (uintptr_t)(kernel_stack_size * stack_total)); @@ -709,7 +794,7 @@ void kdp_set_gateway_mac(void *); void kdp_set_interface(void *); void kdp_register_send_receive(void *, void *); void kdp_unregister_send_receive(void *, void *); -void kdp_snapshot_preflight(int, void *, uint32_t, uint32_t, kcdata_descriptor_t, boolean_t enable_faulting); + int kdp_stack_snapshot_geterror(void); uint32_t 
kdp_stack_snapshot_bytes_traced(void); diff --git a/osfmk/kern/debug.h b/osfmk/kern/debug.h index 14917ddc1..dccba1842 100644 --- a/osfmk/kern/debug.h +++ b/osfmk/kern/debug.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -29,10 +29,13 @@ #ifndef _KERN_DEBUG_H_ #define _KERN_DEBUG_H_ +#include + #include #include #include #include +#include #ifndef XNU_KERNEL_PRIVATE #include @@ -41,11 +44,6 @@ #ifdef __APPLE_API_PRIVATE #ifdef __APPLE_API_UNSTABLE -/* This value must always match IO_NUM_PRIORITIES defined in thread_info.h */ -#define STACKSHOT_IO_NUM_PRIORITIES 4 -/* This value must always match MAXTHREADNAMESIZE used in bsd */ -#define STACKSHOT_MAX_THREAD_NAME_SIZE 64 - struct thread_snapshot { uint32_t snapshot_magic; uint32_t nkern_frames; @@ -90,33 +88,12 @@ struct thread_snapshot { uint64_t total_syscalls; char pth_name[STACKSHOT_MAX_THREAD_NAME_SIZE]; -} __attribute__ ((packed)); - -struct thread_snapshot_v2 { - uint64_t ths_thread_id; - uint64_t ths_wait_event; - uint64_t ths_continuation; - uint64_t ths_total_syscalls; - uint64_t ths_voucher_identifier; - uint64_t ths_dqserialnum; - uint64_t ths_user_time; - uint64_t ths_sys_time; - uint64_t ths_ss_flags; - uint64_t ths_last_run_time; - uint64_t ths_last_made_runnable_time; - uint32_t ths_state; - uint32_t ths_sched_flags; - int16_t ths_base_priority; - int16_t ths_sched_priority; - uint8_t ths_eqos; - uint8_t ths_rqos; - uint8_t ths_rqos_override; - uint8_t ths_io_tier; -} __attribute__ ((packed)); +} __attribute__((packed)); +/* old, non kcdata format */ struct task_snapshot { - uint32_t snapshot_magic; - int32_t pid; + uint32_t snapshot_magic; + int32_t pid; uint64_t uniqueid; uint64_t user_time_in_terminated_threads; uint64_t system_time_in_terminated_threads; @@ -166,48 +143,7 @@ struct task_snapshot { } __attribute__ ((packed)); -struct io_stats_snapshot -{ - 
/* - * I/O Statistics - * XXX: These fields must be together. - */ - uint64_t ss_disk_reads_count; - uint64_t ss_disk_reads_size; - uint64_t ss_disk_writes_count; - uint64_t ss_disk_writes_size; - uint64_t ss_io_priority_count[STACKSHOT_IO_NUM_PRIORITIES]; - uint64_t ss_io_priority_size[STACKSHOT_IO_NUM_PRIORITIES]; - uint64_t ss_paging_count; - uint64_t ss_paging_size; - uint64_t ss_non_paging_count; - uint64_t ss_non_paging_size; - uint64_t ss_data_count; - uint64_t ss_data_size; - uint64_t ss_metadata_count; - uint64_t ss_metadata_size; - /* XXX: I/O Statistics end */ - -} __attribute__ ((packed)); -struct task_snapshot_v2 { - uint64_t ts_unique_pid; - uint64_t ts_ss_flags; - uint64_t ts_user_time_in_terminated_threads; - uint64_t ts_system_time_in_terminated_threads; - uint64_t ts_p_start_sec; - uint64_t ts_task_size; - uint64_t ts_max_resident_size; - uint32_t ts_suspend_count; - uint32_t ts_faults; - uint32_t ts_pageins; - uint32_t ts_cow_faults; - uint32_t ts_was_throttled; - uint32_t ts_did_throttle; - uint32_t ts_latency_qos; - int32_t ts_pid; - char ts_p_comm[32]; -} __attribute__ ((packed)); struct micro_snapshot { uint32_t snapshot_magic; @@ -218,34 +154,7 @@ struct micro_snapshot { uint16_t ms_opaque_flags; /* managed by external entity, e.g. fdrmicrod */ } __attribute__ ((packed)); -struct mem_and_io_snapshot { - uint32_t snapshot_magic; - uint32_t free_pages; - uint32_t active_pages; - uint32_t inactive_pages; - uint32_t purgeable_pages; - uint32_t wired_pages; - uint32_t speculative_pages; - uint32_t throttled_pages; - uint32_t filebacked_pages; - uint32_t compressions; - uint32_t decompressions; - uint32_t compressor_size; - int busy_buffer_count; - uint32_t pages_wanted; - uint32_t pages_reclaimed; - uint8_t pages_wanted_reclaimed_valid; // did mach_vm_pressure_monitor succeed? 
-} __attribute__((packed)); - -struct stack_snapshot_frame32 { - uint32_t lr; - uint32_t sp; -}; -struct stack_snapshot_frame64 { - uint64_t lr; - uint64_t sp; -}; struct _dyld_cache_header { @@ -264,20 +173,12 @@ struct _dyld_cache_header uint8_t uuid[16]; // unique value for each shared cache file }; -struct dyld_uuid_info_32 { - uint32_t imageLoadAddress; /* base address image is mapped at */ - uuid_t imageUUID; -}; - -struct dyld_uuid_info_64 { - uint64_t imageLoadAddress; /* base address image is mapped at */ - uuid_t imageUUID; -}; enum micro_snapshot_flags { kInterruptRecord = 0x1, kTimerArmingRecord = 0x2, - kUserMode = 0x4, /* interrupted usermode, or armed by usermode */ + kUserMode = 0x4, /* interrupted usermode, or armed by usermode */ + kIORecord = 0x8, }; /* @@ -288,81 +189,55 @@ enum generic_snapshot_flags { kKernel64_p = 0x2 }; -enum task_snapshot_flags { - kTaskRsrcFlagged = 0x4, // In the EXC_RESOURCE danger zone? - kTerminatedSnapshot = 0x8, - kPidSuspended = 0x10, // true for suspended task - kFrozen = 0x20, // true for hibernated task (along with pidsuspended) - kTaskDarwinBG = 0x40, - kTaskExtDarwinBG = 0x80, - kTaskVisVisible = 0x100, - kTaskVisNonvisible = 0x200, - kTaskIsForeground = 0x400, - kTaskIsBoosted = 0x800, - kTaskIsSuppressed = 0x1000, - kTaskIsTimerThrottled = 0x2000, /* deprecated */ - kTaskIsImpDonor = 0x4000, - kTaskIsLiveImpDonor = 0x8000 -}; - -enum thread_snapshot_flags { - kHasDispatchSerial = 0x4, - kStacksPCOnly = 0x8, /* Stack traces have no frame pointers. 
*/ - kThreadDarwinBG = 0x10, /* Thread is darwinbg */ - kThreadIOPassive = 0x20, /* Thread uses passive IO */ - kThreadSuspended = 0x40, /* Thread is suspended */ - kThreadTruncatedBT = 0x80, /* Unmapped pages caused truncated backtrace */ - kGlobalForcedIdle = 0x100, /* Thread performs global forced idle */ - kThreadDecompressedBT = 0x200, /* Some thread stack pages were decompressed as part of BT */ - kThreadFaultedBT = 0x400 /* Some thread stack pages were faulted in as part of BT */ -}; #define VM_PRESSURE_TIME_WINDOW 5 /* seconds */ enum { - STACKSHOT_GET_DQ = 0x01, - STACKSHOT_SAVE_LOADINFO = 0x02, - STACKSHOT_GET_GLOBAL_MEM_STATS = 0x04, - STACKSHOT_SAVE_KEXT_LOADINFO = 0x08, - STACKSHOT_GET_MICROSTACKSHOT = 0x10, - STACKSHOT_GLOBAL_MICROSTACKSHOT_ENABLE = 0x20, - STACKSHOT_GLOBAL_MICROSTACKSHOT_DISABLE = 0x40, - STACKSHOT_SET_MICROSTACKSHOT_MARK = 0x80, - STACKSHOT_SAVE_KERNEL_FRAMES_ONLY = 0x100, - STACKSHOT_GET_BOOT_PROFILE = 0x200, - STACKSHOT_GET_WINDOWED_MICROSTACKSHOTS = 0x400, - STACKSHOT_WINDOWED_MICROSTACKSHOTS_ENABLE = 0x800, - STACKSHOT_WINDOWED_MICROSTACKSHOTS_DISABLE = 0x1000, - STACKSHOT_SAVE_IMP_DONATION_PIDS = 0x2000, - STACKSHOT_SAVE_IN_KERNEL_BUFFER = 0x4000, - STACKSHOT_RETRIEVE_EXISTING_BUFFER = 0x8000, - STACKSHOT_KCDATA_FORMAT = 0x10000, - STACKSHOT_ENABLE_FAULTING = 0x20000 + STACKSHOT_GET_DQ = 0x01, + STACKSHOT_SAVE_LOADINFO = 0x02, + STACKSHOT_GET_GLOBAL_MEM_STATS = 0x04, + STACKSHOT_SAVE_KEXT_LOADINFO = 0x08, + STACKSHOT_GET_MICROSTACKSHOT = 0x10, + STACKSHOT_GLOBAL_MICROSTACKSHOT_ENABLE = 0x20, + STACKSHOT_GLOBAL_MICROSTACKSHOT_DISABLE = 0x40, + STACKSHOT_SET_MICROSTACKSHOT_MARK = 0x80, + STACKSHOT_ACTIVE_KERNEL_THREADS_ONLY = 0x100, + STACKSHOT_GET_BOOT_PROFILE = 0x200, + STACKSHOT_SAVE_IMP_DONATION_PIDS = 0x2000, + STACKSHOT_SAVE_IN_KERNEL_BUFFER = 0x4000, + STACKSHOT_RETRIEVE_EXISTING_BUFFER = 0x8000, + STACKSHOT_KCDATA_FORMAT = 0x10000, + STACKSHOT_ENABLE_BT_FAULTING = 0x20000, + STACKSHOT_COLLECT_DELTA_SNAPSHOT = 0x40000, + 
/* + * STACKSHOT_TAILSPIN flips on several features aimed at minimizing the size + * of stackshots. It is meant to be used only by the tailspin daemon. Its + * behavior may be changed at any time to suit the needs of the tailspin + * daemon. Seriously, if you are not the tailspin daemon, don't use this + * flag. If you need these features, ask us to add a stable SPI for what + * you need. That being said, the features it turns on are: + * + * minimize_uuids: If the set of loaded dylibs or kexts has not changed in + * the delta period, do then not report them. + * + * iostats: do not include io statistics. + * + * trace_fp: do not include the frame pointers in stack traces. + * + * minimize_nonrunnables: Do not report detailed information about threads + * which were not runnable in the delta period. + */ + STACKSHOT_TAILSPIN = 0x80000, + /* + * Kernel consumers of stackshot (via stack_snapshot_from_kernel) can ask + * that we try to take the stackshot lock, and fail if we don't get it. + */ + STACKSHOT_TRYLOCK = 0x100000, + STACKSHOT_ENABLE_UUID_FAULTING = 0x200000, + STACKSHOT_FROM_PANIC = 0x400000, + STACKSHOT_NO_IO_STATS = 0x800000, }; -/* - * NOTE: Please update libkdd/kcdata/kcdtypes.c if you make any changes - * in STACKSHOT_KCTYPE_* types. 
- */ -#define STACKSHOT_KCTYPE_IOSTATS 0x901 /* io_stats_snapshot */ -#define STACKSHOT_KCTYPE_GLOBAL_MEM_STATS 0x902 /* struct mem_and_io_snapshot */ -#define STACKSHOT_KCCONTAINER_TASK 0x903 -#define STACKSHOT_KCCONTAINER_THREAD 0x904 -#define STACKSHOT_KCTYPE_TASK_SNAPSHOT 0x905 /* task_snapshot_v2 */ -#define STACKSHOT_KCTYPE_THREAD_SNAPSHOT 0x906 /* thread_snapshot_v2 */ -#define STASKSHOT_KCTYPE_DONATING_PIDS 0x907 /* int[] */ -#define STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO 0x908 /* same as KCDATA_TYPE_LIBRARY_LOADINFO64 */ -#define STACKSHOT_KCTYPE_THREAD_NAME 0x909 /* char[] */ -#define STACKSHOT_KCTYPE_KERN_STACKFRAME 0x90A /* struct stack_snapshot_frame32 */ -#define STACKSHOT_KCTYPE_KERN_STACKFRAME64 0x90B /* struct stack_snapshot_frame64 */ -#define STACKSHOT_KCTYPE_USER_STACKFRAME 0x90C /* struct stack_snapshot_frame32 */ -#define STACKSHOT_KCTYPE_USER_STACKFRAME64 0x90D /* struct stack_snapshot_frame64 */ -#define STACKSHOT_KCTYPE_BOOTARGS 0x90E /* boot args string */ -#define STACKSHOT_KCTYPE_OSVERSION 0x90F /* os version string */ -#define STACKSHOT_KCTYPE_KERN_PAGE_SIZE 0x910 /* kernel page size in uint32_t */ -#define STACKSHOT_KCTYPE_JETSAM_LEVEL 0x911 /* jetsam level in uint32_t */ - - #define STACKSHOT_THREAD_SNAPSHOT_MAGIC 0xfeedface #define STACKSHOT_TASK_SNAPSHOT_MAGIC 0xdecafbad #define STACKSHOT_MEM_AND_IO_SNAPSHOT_MAGIC 0xbfcabcde @@ -378,6 +253,9 @@ extern unsigned int systemLogDiags; extern char debug_buf[]; #endif extern char *debug_buf_addr; +extern char *debug_buf_stackshot_start; +extern char *debug_buf_stackshot_end; + extern unsigned int debug_boot_arg; extern unsigned char *kernel_uuid; extern char kernel_uuid_string[]; @@ -393,7 +271,8 @@ extern unsigned int switch_debugger; extern unsigned int current_debugger; #define NO_CUR_DB 0x0 #define KDP_CUR_DB 0x1 -#define KDB_CUR_DB 0x2 +//#define KDB_CUR_DB 0x2 +#define HW_SHM_CUR_DB 0x3 extern unsigned int active_debugger; extern unsigned int debug_mode; @@ -466,7 +345,8 @@ void 
panic_display_ecc_errors(void); */ #define DB_NMI_BTN_ENA 0x8000 /* Enable button to directly trigger NMI */ #define DB_PRT_KDEBUG 0x10000 /* kprintf KDEBUG traces */ -#define DB_DISABLE_LOCAL_CORE 0x20000 /* ignore local core dump support */ +#define DB_DISABLE_LOCAL_CORE 0x20000 /* ignore local kernel core dump support */ +#define DB_DISABLE_GZIP_CORE 0x40000 /* don't gzip kernel core dumps */ #if DEBUG /* @@ -515,6 +395,63 @@ enum { #endif /* KERNEL_PRIVATE */ + +#ifdef XNU_KERNEL_PRIVATE + +/* + * @var not_in_kdp + * + * @abstract True if we're in normal kernel operation, False if we're in a + * single-core debugger context. + */ +extern unsigned int not_in_kdp; + +/* + * @function DebuggerWithCallback + * + * @abstract Enter single-core debugger context and call a callback function. + * + * @param proceed_on_sync_failure If true, then go ahead and try to debug even + * if we can't synch with the other cores. This is inherently unsafe and should + * only be used if the kernel is going down in flames anyway. + * + * @result returns KERN_OPERATION_TIMED_OUT if synchronization times out and + * proceed_on_sync_failure is false. Otherwise return the return value of the + * callback. + */ +kern_return_t +DebuggerWithCallback(kern_return_t (*callback) (void*), + void *callback_context, + boolean_t proceed_on_sync_failure); + +boolean_t oslog_is_safe(void); + +/* + * @function stack_snapshot_from_kernel + * + * @abstract Stackshot function for kernel consumers who have their own buffer. 
+ * + * @param pid the PID to be traced or -1 for the whole system + * @param buf a pointer to the buffer where the stackshot should be written + * @param size the size of the buffer + * @param flags flags to be passed to the stackshot + * @param delta_since_timestamp start time for delta period + * @bytes_traced a pointer to be filled with the length of the stackshot + * + */ +#ifdef __cplusplus +extern "C" { +#endif +kern_return_t +stack_snapshot_from_kernel(int pid, void *buf, uint32_t size, uint32_t flags, + uint64_t delta_since_timestamp, unsigned *bytes_traced); +#ifdef __cplusplus +} +#endif + + +#endif /* XNU_KERNEL_PRIVATE */ + #ifdef KERNEL __BEGIN_DECLS @@ -524,9 +461,19 @@ extern void panic(const char *string, ...) __printflike(1,2); #if KERNEL_PRIVATE void _consume_panic_args(int, ...); void panic_context(unsigned int reason, void *ctx, const char *string, ...); +void panic_with_options(unsigned int reason, void *ctx, uint64_t debugger_options_mask, const char *str, ...); + +/* launchd crashed prefix in message to signal special panic handling */ +#define LAUNCHD_CRASHED_PREFIX "initproc exited" + +/* + * Values for a 64-bit mask that's passed to the debugger. + */ +#define DEBUGGER_OPTION_NONE 0x0ULL +#define DEBUGGER_OPTION_PANICLOGANDREBOOT 0x1ULL /* capture a panic log and then reboot immediately */ #endif -#if CONFIG_NO_PANIC_STRINGS +#ifdef CONFIG_NO_PANIC_STRINGS #if KERNEL_PRIVATE #define panic_plain(x, ...) _consume_panic_args( 0, ## __VA_ARGS__ ) #define panic(x, ...) 
_consume_panic_args( 0, ## __VA_ARGS__ ) @@ -549,6 +496,33 @@ void populate_model_name(char *); unsigned panic_active(void); #endif + +#if XNU_KERNEL_PRIVATE +#if DEBUG || DEVELOPMENT +/* leak pointer scan definitions */ + +enum +{ + kInstanceFlagAddress = 0x01UL, + kInstanceFlagReferenced = 0x02UL, + kInstanceFlags = 0x03UL +}; + +#define INSTANCE_GET(x) ((x) & ~kInstanceFlags) +#define INSTANCE_PUT(x) ((x) ^ ~kInstanceFlags) + +typedef void (*leak_site_proc)(void * refCon, uint32_t siteCount, uint32_t zoneSize, + uintptr_t * backtrace, uint32_t btCount); + +extern kern_return_t +zone_leaks(const char * zoneName, uint32_t nameLen, leak_site_proc proc, void * refCon); + +extern void +zone_leaks_scan(uintptr_t * instances, uint32_t count, uint32_t zoneSize, uint32_t * found); + +#endif /* DEBUG || DEVELOPMENT */ +#endif /* XNU_KERNEL_PRIVATE */ + __END_DECLS #endif /* KERNEL */ diff --git a/osfmk/kern/exc_resource.h b/osfmk/kern/exc_resource.h index 336c41327..c90fafc61 100644 --- a/osfmk/kern/exc_resource.h +++ b/osfmk/kern/exc_resource.h @@ -61,6 +61,7 @@ #define RESOURCE_TYPE_CPU 1 #define RESOURCE_TYPE_WAKEUPS 2 #define RESOURCE_TYPE_MEMORY 3 +#define RESOURCE_TYPE_IO 4 /* RESOURCE_TYPE_CPU flavors */ #define FLAVOR_CPU_MONITOR 1 @@ -159,6 +160,40 @@ #define EXC_RESOURCE_HWM_DECODE_LIMIT(code) \ ((code) & 0x1FFFULL) +/* RESOURCE_TYPE_IO flavors */ +#define FLAVOR_IO_PHYSICAL_WRITES 1 +#define FLAVOR_IO_LOGICAL_WRITES 2 + +/* + * RESOURCE_TYPE_IO exception code & subcode. + * + * This is sent by the kernel when a task crosses its + * I/O limits. 
+ * + * code: + * +-----------------------------------------------+ + * |[63:61] RESOURCE |[60:58] FLAVOR_IO_ |[57:32] | + * |_TYPE_IO |PHYSICAL/LOGICAL |Unused | + * +-----------------------------------------------+ + * |[31:15] Interval (sec) | [14:0] Limit (MB) | + * +-----------------------------------------------+ + * + * subcode: + * +-----------------------------------------------+ + * | | [14:0] I/O Count | + * | | (in MB) | + * +-----------------------------------------------+ + * + */ + +/* RESOURCE_TYPE_IO decoding macros */ +#define EXC_RESOURCE_IO_DECODE_INTERVAL(code) \ + (((code) >> 15) & 0x1FFFFULL) +#define EXC_RESOURCE_IO_DECODE_LIMIT(code) \ + ((code) & 0x7FFFULL) +#define EXC_RESOURCE_IO_OBSERVED(subcode) \ + ((subcode) & 0x7FFFULL) + #ifdef KERNEL @@ -186,6 +221,14 @@ #define EXC_RESOURCE_HWM_ENCODE_LIMIT(code, num) \ ((code) |= ((uint64_t)(num) & 0x1FFFULL)) +/* RESOURCE_TYPE_IO::FLAVOR_IO_PHYSICAL_WRITES/FLAVOR_IO_LOGICAL_WRITES specific encoding macros */ +#define EXC_RESOURCE_IO_ENCODE_INTERVAL(code, interval) \ + ((code) |= (((uint64_t)(interval) & 0x1FFFFULL) << 15)) +#define EXC_RESOURCE_IO_ENCODE_LIMIT(code, limit) \ + ((code) |= (((uint64_t)(limit) & 0x7FFFULL))) +#define EXC_RESOURCE_IO_ENCODE_OBSERVED(subcode, num) \ + ((subcode) |= (((uint64_t)(num) & 0x7FFFULL))) + #endif /* KERNEL */ diff --git a/osfmk/kern/exception.c b/osfmk/kern/exception.c index a47544027..a93f38ca5 100644 --- a/osfmk/kern/exception.c +++ b/osfmk/kern/exception.c @@ -83,6 +83,7 @@ #include #include #include +#include #include #include @@ -144,6 +145,9 @@ exception_deliver( int behavior; int flavor; kern_return_t kr; + int use_fast_retrieve = TRUE; + task_t task; + ipc_port_t thread_port = NULL, task_port = NULL; /* * Save work if we are terminating. 
@@ -200,6 +204,32 @@ exception_deliver( small_code[1] = CAST_DOWN_EXPLICIT(exception_data_type_t, code[1]); } + task = thread->task; + +#if CONFIG_MACF + /* Now is a reasonably good time to check if the exception action is + * permitted for this process, because after this point we will send + * the message out almost certainly. + * As with other failures, exception_triage_thread will go on + * to the next level. + */ + if (mac_exc_action_check_exception_send(task, excp) != 0) { + return KERN_FAILURE; + } +#endif + + if ((thread != current_thread() || exception == EXC_CORPSE_NOTIFY) + && behavior != EXCEPTION_STATE) { + use_fast_retrieve = FALSE; + + task_reference(task); + task_port = convert_task_to_port(task); + /* task ref consumed */ + thread_reference(thread); + thread_port = convert_thread_to_port(thread); + /* thread ref consumed */ + + } switch (behavior) { case EXCEPTION_STATE: { @@ -241,15 +271,19 @@ exception_deliver( c_thr_exc_raise++; if (code64) { kr = mach_exception_raise(exc_port, - retrieve_thread_self_fast(thread), - retrieve_task_self_fast(thread->task), + use_fast_retrieve ? retrieve_thread_self_fast(thread) : + thread_port, + use_fast_retrieve ? retrieve_task_self_fast(thread->task) : + task_port, exception, code, codeCnt); } else { kr = exception_raise(exc_port, - retrieve_thread_self_fast(thread), - retrieve_task_self_fast(thread->task), + use_fast_retrieve ? retrieve_thread_self_fast(thread) : + thread_port, + use_fast_retrieve ? retrieve_task_self_fast(thread->task) : + task_port, exception, small_code, codeCnt); @@ -270,8 +304,10 @@ exception_deliver( if (code64) { kr = mach_exception_raise_state_identity( exc_port, - retrieve_thread_self_fast(thread), - retrieve_task_self_fast(thread->task), + use_fast_retrieve ? retrieve_thread_self_fast(thread) : + thread_port, + use_fast_retrieve ? 
retrieve_task_self_fast(thread->task) : + task_port, exception, code, codeCnt, @@ -280,8 +316,10 @@ exception_deliver( state, &state_cnt); } else { kr = exception_raise_state_identity(exc_port, - retrieve_thread_self_fast(thread), - retrieve_task_self_fast(thread->task), + use_fast_retrieve ? retrieve_thread_self_fast(thread) : + thread_port, + use_fast_retrieve ? retrieve_task_self_fast(thread->task) : + task_port, exception, small_code, codeCnt, @@ -340,10 +378,11 @@ check_exc_receiver_dependency( return retval; } + /* - * Routine: exception_triage + * Routine: exception_triage_thread * Purpose: - * The current thread caught an exception. + * The thread caught an exception. * We make an up-call to the thread's exception server. * Conditions: * Nothing locked and no resources held. @@ -354,12 +393,12 @@ check_exc_receiver_dependency( * KERN_SUCCESS if exception is handled by any of the handlers. */ kern_return_t -exception_triage( +exception_triage_thread( exception_type_t exception, mach_exception_data_t code, - mach_msg_type_number_t codeCnt) + mach_msg_type_number_t codeCnt, + thread_t thread) { - thread_t thread; task_t task; host_priv_t host_priv; lck_mtx_t *mutex; @@ -379,8 +418,6 @@ exception_triage( panic("called exception_triage when it was forbidden by the boot environment"); } - thread = current_thread(); - /* * Try to raise the exception at the activation level. */ @@ -395,8 +432,8 @@ exception_triage( /* * Maybe the task level will handle it. */ - task = current_task(); - mutex = &task->lock; + task = thread->task; + mutex = &task->itk_lock_data; if (KERN_SUCCESS == check_exc_receiver_dependency(exception, task->exc_actions, mutex)) { kr = exception_deliver(thread, exception, code, codeCnt, task->exc_actions, mutex); @@ -424,6 +461,29 @@ exception_triage( return kr; } +/* + * Routine: exception_triage + * Purpose: + * The current thread caught an exception. + * We make an up-call to the thread's exception server. 
+ * Conditions: + * Nothing locked and no resources held. + * Called from an exception context, so + * thread_exception_return and thread_kdb_return + * are possible. + * Returns: + * KERN_SUCCESS if exception is handled by any of the handlers. + */ +kern_return_t +exception_triage( + exception_type_t exception, + mach_exception_data_t code, + mach_msg_type_number_t codeCnt) +{ + thread_t thread = current_thread(); + return exception_triage_thread(exception, code, codeCnt, thread); +} + kern_return_t bsd_exception( exception_type_t exception, @@ -439,7 +499,7 @@ bsd_exception( * Maybe the task level will handle it. */ task = current_task(); - mutex = &task->lock; + mutex = &task->itk_lock_data; kr = exception_deliver(self, exception, code, codeCnt, task->exc_actions, mutex); diff --git a/osfmk/kern/exception.h b/osfmk/kern/exception.h index 94786e7be..7ab0fcca3 100644 --- a/osfmk/kern/exception.h +++ b/osfmk/kern/exception.h @@ -36,6 +36,7 @@ #include #include #include +#include /* * Common storage for exception actions. 
@@ -46,6 +47,7 @@ struct exception_action { thread_state_flavor_t flavor; /* state flavor to send */ exception_behavior_t behavior; /* exception type to raise */ boolean_t privileged; /* survives ipc_task_reset */ + struct label *label; /* MAC label associated with action */ }; /* Make an up-call to a thread's exception server */ @@ -54,6 +56,12 @@ extern kern_return_t exception_triage( mach_exception_data_t code, mach_msg_type_number_t codeCnt); +extern kern_return_t exception_triage_thread( + exception_type_t exception, + mach_exception_data_t code, + mach_msg_type_number_t codeCnt, + thread_t thread); + /* Notify system performance monitor */ extern kern_return_t sys_perf_notify(thread_t thread, int pid); diff --git a/osfmk/kern/gzalloc.c b/osfmk/kern/gzalloc.c index 8db705a72..5062baa78 100644 --- a/osfmk/kern/gzalloc.c +++ b/osfmk/kern/gzalloc.c @@ -62,7 +62,6 @@ */ #include -#include #include #include @@ -172,33 +171,21 @@ void gzalloc_configure(void) { if (PE_parse_boot_argn("-gzalloc_mode", temp_buf, sizeof (temp_buf))) { gzalloc_mode = TRUE; gzalloc_min = GZALLOC_MIN_DEFAULT; -#if ZONE_DEBUG - gzalloc_min += (typeof(gzalloc_min))ZONE_DEBUG_OFFSET; -#endif gzalloc_max = ~0U; } if (PE_parse_boot_argn("gzalloc_min", &gzalloc_min, sizeof(gzalloc_min))) { -#if ZONE_DEBUG - gzalloc_min += (typeof(gzalloc_min))ZONE_DEBUG_OFFSET; -#endif gzalloc_mode = TRUE; gzalloc_max = ~0U; } if (PE_parse_boot_argn("gzalloc_max", &gzalloc_max, sizeof(gzalloc_max))) { -#if ZONE_DEBUG - gzalloc_max += (typeof(gzalloc_min))ZONE_DEBUG_OFFSET; -#endif gzalloc_mode = TRUE; if (gzalloc_min == ~0U) gzalloc_min = 0; } if (PE_parse_boot_argn("gzalloc_size", &gzalloc_size, sizeof(gzalloc_size))) { -#if ZONE_DEBUG - gzalloc_size += (typeof(gzalloc_min))ZONE_DEBUG_OFFSET; -#endif gzalloc_min = gzalloc_max = gzalloc_size; gzalloc_mode = TRUE; } @@ -268,7 +255,7 @@ gzalloc_alloc(zone_t zone, boolean_t canblock) { vm_offset_t rounded_size = round_page(zone->elem_size + GZHEADER_SIZE); 
vm_offset_t residue = rounded_size - zone->elem_size; vm_offset_t gzaddr = 0; - gzhdr_t *gzh; + gzhdr_t *gzh, *gzhcopy = NULL; if (!kmem_ready || (vm_page_zone == ZONE_NULL)) { /* Early allocations are supplied directly from the @@ -287,7 +274,7 @@ gzalloc_alloc(zone_t zone, boolean_t canblock) { else { kern_return_t kr = kernel_memory_allocate(gzalloc_map, &gzaddr, rounded_size + (1*PAGE_SIZE), - 0, KMA_KOBJECT | gzalloc_guard, + 0, KMA_KOBJECT | KMA_ATOMIC | gzalloc_guard, VM_KERN_MEMORY_OSFMK); if (kr != KERN_SUCCESS) panic("gzalloc: kernel_memory_allocate for size 0x%llx failed with %d", (uint64_t)rounded_size, kr); @@ -301,6 +288,7 @@ gzalloc_alloc(zone_t zone, boolean_t canblock) { */ gzh = (gzhdr_t *) (gzaddr + zone->elem_size); addr = gzaddr; + gzhcopy = (gzhdr_t *) (gzaddr + rounded_size - sizeof(gzhdr_t)); } else { gzh = (gzhdr_t *) (gzaddr + residue - GZHEADER_SIZE); addr = (gzaddr + residue); @@ -322,6 +310,14 @@ gzalloc_alloc(zone_t zone, boolean_t canblock) { gzh->gzsize = (uint32_t) zone->elem_size; gzh->gzsig = GZALLOC_SIGNATURE; + /* In underflow detection mode, stash away a copy of the + * metadata at the edge of the allocated range, for + * retrieval by gzalloc_element_size() + */ + if (gzhcopy) { + *gzhcopy = *gzh; + } + lock_zone(zone); zone->count++; zone->sum_count++; @@ -438,3 +434,47 @@ boolean_t gzalloc_free(zone_t zone, void *addr) { } return gzfreed; } + +boolean_t gzalloc_element_size(void *gzaddr, zone_t *z, vm_size_t *gzsz) { + uintptr_t a = (uintptr_t)gzaddr; + if (__improbable(gzalloc_mode && (a >= gzalloc_map_min) && (a <= gzalloc_map_max))) { + gzhdr_t *gzh; + + /* Locate the gzalloc metadata adjoining the element */ + if (gzalloc_uf_mode == TRUE) { + boolean_t vmef; + vm_map_entry_t gzvme = NULL; + + /* In underflow detection mode, locate the map entry describing + * the element, and then locate the copy of the gzalloc + * header at the trailing edge of the range. 
+ */ + vm_map_lock_read(gzalloc_map); + vmef = vm_map_lookup_entry(gzalloc_map, (vm_map_offset_t)a, &gzvme); + vm_map_unlock(gzalloc_map); + if (vmef == FALSE) { + panic("GZALLOC: unable to locate map entry for %p\n", (void *)a); + } + assertf(gzvme->vme_atomic != 0, "GZALLOC: VM map entry inconsistency, vme: %p, start: %llu end: %llu", gzvme, gzvme->vme_start, gzvme->vme_end); + gzh = (gzhdr_t *)(gzvme->vme_end - GZHEADER_SIZE); + } else { + gzh = (gzhdr_t *)(a - GZHEADER_SIZE); + } + + if (gzh->gzsig != GZALLOC_SIGNATURE) { + panic("GZALLOC signature mismatch for element %p, expected 0x%x, found 0x%x", (void *)a, GZALLOC_SIGNATURE, gzh->gzsig); + } + + *gzsz = gzh->gzone->elem_size; + if ((*gzsz < gzalloc_min) || (*gzsz > gzalloc_max)) { + panic("GZALLOC: invalid element size %lu\n", *gzsz); + } + + if (z) { + *z = gzh->gzone; + } + return TRUE; + } else { + return FALSE; + } +} diff --git a/osfmk/kern/hibernate.c b/osfmk/kern/hibernate.c index c15eb172e..0bbc73d49 100644 --- a/osfmk/kern/hibernate.c +++ b/osfmk/kern/hibernate.c @@ -34,7 +34,6 @@ #include #include #include -#include #include #include @@ -106,14 +105,13 @@ hibernate_setup(IOHibernateImageHeader * header, hibernate_reset_stats(); - if (vmflush && (COMPRESSED_PAGER_IS_ACTIVE || dp_isssd)) { + if (vmflush && VM_CONFIG_COMPRESSOR_IS_PRESENT) { sync_internal(); - if (COMPRESSED_PAGER_IS_ACTIVE) { - vm_decompressor_lock(); - need_to_unlock_decompressor = TRUE; - } + vm_decompressor_lock(); + need_to_unlock_decompressor = TRUE; + hibernate_flush_memory(); } @@ -147,7 +145,7 @@ hibernate_teardown(hibernate_page_list_t * page_list, if (page_list_pal) kfree(page_list_pal, page_list_pal->list_size); - if (COMPRESSED_PAGER_IS_ACTIVE) { + if (VM_CONFIG_COMPRESSOR_IS_PRESENT) { if (need_to_unlock_decompressor == TRUE) { need_to_unlock_decompressor = FALSE; vm_decompressor_unlock(); diff --git a/osfmk/kern/host.c b/osfmk/kern/host.c index dba639a8c..2e5852334 100644 --- a/osfmk/kern/host.c +++ 
b/osfmk/kern/host.c @@ -89,11 +89,13 @@ #include #include #include +#include // mach_node_port_changed() #include #include #include + #if CONFIG_ATM #include #endif @@ -102,6 +104,8 @@ #include #endif +#include + host_data_t realhost; vm_extmod_statistics_data_t host_extmod_statistics; @@ -109,7 +113,7 @@ vm_extmod_statistics_data_t host_extmod_statistics; kern_return_t host_processors(host_priv_t host_priv, processor_array_t * out_array, mach_msg_type_number_t * countp) { - register processor_t processor, *tp; + processor_t processor, *tp; void * addr; unsigned int count, i; @@ -156,8 +160,8 @@ host_info(host_t host, host_flavor_t flavor, host_info_t info, mach_msg_type_num switch (flavor) { case HOST_BASIC_INFO: { - register host_basic_info_t basic_info; - register int master_id; + host_basic_info_t basic_info; + int master_id; /* * Basic information about this host. @@ -191,7 +195,7 @@ host_info(host_t host, host_flavor_t flavor, host_info_t info, mach_msg_type_num } case HOST_SCHED_INFO: { - register host_sched_info_t sched_info; + host_sched_info_t sched_info; uint32_t quantum_time; uint64_t quantum_ns; @@ -225,7 +229,7 @@ host_info(host_t host, host_flavor_t flavor, host_info_t info, mach_msg_type_num } case HOST_PRIORITY_INFO: { - register host_priority_info_t priority_info; + host_priority_info_t priority_info; if (*count < HOST_PRIORITY_INFO_COUNT) return (KERN_FAILURE); @@ -255,6 +259,19 @@ host_info(host_t host, host_flavor_t flavor, host_info_t info, mach_msg_type_num return (KERN_SUCCESS); } + case HOST_CAN_HAS_DEBUGGER: { + host_can_has_debugger_info_t can_has_debugger_info; + + if (*count < HOST_CAN_HAS_DEBUGGER_COUNT) + return (KERN_FAILURE); + + can_has_debugger_info = (host_can_has_debugger_info_t)info; + can_has_debugger_info->can_has_debugger = PE_i_can_has_debugger(NULL); + *count = HOST_CAN_HAS_DEBUGGER_COUNT; + + return KERN_SUCCESS; + } + case HOST_VM_PURGABLE: { if (*count < HOST_VM_PURGABLE_COUNT) return (KERN_FAILURE); @@ -321,8 +338,8 @@ 
host_statistics(host_t host, host_flavor_t flavor, host_info_t info, mach_msg_ty } case HOST_VM_INFO: { - register processor_t processor; - register vm_statistics64_t stat; + processor_t processor; + vm_statistics64_t stat; vm_statistics64_data_t host_vm_stat; vm_statistics_t stat32; mach_msg_type_number_t original_count; @@ -404,7 +421,7 @@ host_statistics(host_t host, host_flavor_t flavor, host_info_t info, mach_msg_ty } case HOST_CPU_LOAD_INFO: { - register processor_t processor; + processor_t processor; host_cpu_load_info_t cpu_load_info; if (*count < HOST_CPU_LOAD_INFO_COUNT) @@ -504,8 +521,8 @@ host_statistics64(host_t host, host_flavor_t flavor, host_info64_t info, mach_ms switch (flavor) { case HOST_VM_INFO64: /* We were asked to get vm_statistics64 */ { - register processor_t processor; - register vm_statistics64_t stat; + processor_t processor; + vm_statistics64_t stat; vm_statistics64_data_t host_vm_stat; mach_msg_type_number_t original_count; unsigned int local_q_internal_count; @@ -861,15 +878,38 @@ kernel_set_special_port(host_priv_t host_priv, int id, ipc_port_t port) { ipc_port_t old_port; +#if !MACH_FLIPC + if (id == HOST_NODE_PORT) + return (KERN_NOT_SUPPORTED); +#endif + host_lock(host_priv); old_port = host_priv->special[id]; host_priv->special[id] = port; host_unlock(host_priv); + +#if MACH_FLIPC + if (id == HOST_NODE_PORT) + mach_node_port_changed(); +#endif + if (IP_VALID(old_port)) ipc_port_release_send(old_port); return (KERN_SUCCESS); } +/* + * Kernel interface for retrieving a special port. + */ +kern_return_t +kernel_get_special_port(host_priv_t host_priv, int id, ipc_port_t * portp) +{ + host_lock(host_priv); + *portp = host_priv->special[id]; + host_unlock(host_priv); + return (KERN_SUCCESS); +} + /* * User interface for setting a special port. 
* diff --git a/osfmk/kern/host_notify.c b/osfmk/kern/host_notify.c index 83826c191..a69f109b4 100644 --- a/osfmk/kern/host_notify.c +++ b/osfmk/kern/host_notify.c @@ -48,7 +48,8 @@ static zone_t host_notify_zone; static queue_head_t host_notify_queue[HOST_NOTIFY_TYPE_MAX+1]; static mach_msg_id_t host_notify_replyid[HOST_NOTIFY_TYPE_MAX+1] = - { HOST_CALENDAR_CHANGED_REPLYID }; + { HOST_CALENDAR_CHANGED_REPLYID, + HOST_CALENDAR_SET_REPLYID }; struct host_notify_entry { queue_chain_t entries; @@ -206,3 +207,11 @@ host_notify_calendar_change(void) host_notify_all(HOST_NOTIFY_CALENDAR_CHANGE, &msg.Head, sizeof (msg)); } + +void +host_notify_calendar_set(void) +{ + __Request__host_calendar_set_t msg; + + host_notify_all(HOST_NOTIFY_CALENDAR_SET, &msg.Head, sizeof (msg)); +} diff --git a/osfmk/kern/host_notify.h b/osfmk/kern/host_notify.h index 85bfa608d..12846b3dd 100644 --- a/osfmk/kern/host_notify.h +++ b/osfmk/kern/host_notify.h @@ -44,6 +44,7 @@ void host_notify_port_destroy( ipc_port_t port); void host_notify_calendar_change(void); +void host_notify_calendar_set(void); void host_notify_init(void); diff --git a/osfmk/kern/ipc_host.c b/osfmk/kern/ipc_host.c index 88e629de4..b68bd0a09 100644 --- a/osfmk/kern/ipc_host.c +++ b/osfmk/kern/ipc_host.c @@ -133,6 +133,10 @@ void ipc_host_init(void) for (i = FIRST_EXCEPTION; i < EXC_TYPES_COUNT; i++) { realhost.exc_actions[i].port = IP_NULL; + realhost.exc_actions[i].label = NULL; + /* The mac framework is not yet initialized, so we defer + * initializing the labels to later, when they are set + * for the first time. 
*/ }/* for */ /* @@ -274,15 +278,12 @@ convert_port_to_host( host_t host = HOST_NULL; if (IP_VALID(port)) { - ip_lock(port); - if (ip_active(port) && - ((ip_kotype(port) == IKOT_HOST) || - (ip_kotype(port) == IKOT_HOST_PRIV) - )) + if (ip_kotype(port) == IKOT_HOST || + ip_kotype(port) == IKOT_HOST_PRIV) { host = (host_t) port->ip_kobject; - ip_unlock(port); + assert(ip_active(port)); + } } - return host; } @@ -543,7 +544,7 @@ host_set_exception_ports( exception_behavior_t new_behavior, thread_state_flavor_t new_flavor) { - register int i; + int i; ipc_port_t old_port[EXC_TYPES_COUNT]; if (host_priv == HOST_PRIV_NULL) { @@ -583,14 +584,27 @@ host_set_exception_ports( host_lock(host_priv); for (i = FIRST_EXCEPTION; i < EXC_TYPES_COUNT; i++) { - if (exception_mask & (1 << i)) { +#if CONFIG_MACF + if (host_priv->exc_actions[i].label == NULL) { + // Lazy initialization (see ipc_port_init). + mac_exc_action_label_init(host_priv->exc_actions + i); + } +#endif + + if ((exception_mask & (1 << i)) +#if CONFIG_MACF + && mac_exc_action_label_update(current_task(), host_priv->exc_actions + i) == 0 +#endif + ) { old_port[i] = host_priv->exc_actions[i].port; + host_priv->exc_actions[i].port = ipc_port_copy_send(new_port); host_priv->exc_actions[i].behavior = new_behavior; host_priv->exc_actions[i].flavor = new_flavor; - } else + } else { old_port[i] = IP_NULL; + } }/* for */ /* @@ -650,6 +664,13 @@ host_get_exception_ports( count = 0; for (i = FIRST_EXCEPTION; i < EXC_TYPES_COUNT; i++) { +#if CONFIG_MACF + if (host_priv->exc_actions[i].label == NULL) { + // Lazy initialization (see ipc_port_init). 
+ mac_exc_action_label_init(host_priv->exc_actions + i); + } +#endif + if (exception_mask & (1 << i)) { for (j = 0; j < count; j++) { /* @@ -731,7 +752,18 @@ host_swap_exception_ports( assert(EXC_TYPES_COUNT > FIRST_EXCEPTION); for (count=0, i = FIRST_EXCEPTION; i < EXC_TYPES_COUNT && count < *CountCnt; i++) { - if (exception_mask & (1 << i)) { +#if CONFIG_MACF + if (host_priv->exc_actions[i].label == NULL) { + // Lazy initialization (see ipc_port_init). + mac_exc_action_label_init(host_priv->exc_actions + i); + } +#endif + + if ((exception_mask & (1 << i)) +#if CONFIG_MACF + && mac_exc_action_label_update(current_task(), host_priv->exc_actions + i) == 0 +#endif + ) { for (j = 0; j < count; j++) { /* * search for an identical entry, if found diff --git a/osfmk/kern/ipc_kobject.c b/osfmk/kern/ipc_kobject.c index 7aa0466db..73e7a084a 100644 --- a/osfmk/kern/ipc_kobject.c +++ b/osfmk/kern/ipc_kobject.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -91,8 +91,6 @@ #include #include #include -#include -#include #include #include #include @@ -100,7 +98,7 @@ #include #include #include -#if VM32_SUPPORT +#ifdef VM32_SUPPORT #include #endif #include @@ -187,11 +185,10 @@ const struct mig_subsystem *mig_e[] = { (const struct mig_subsystem *)&lock_set_subsystem, (const struct mig_subsystem *)&task_subsystem, (const struct mig_subsystem *)&thread_act_subsystem, -#if VM32_SUPPORT +#ifdef VM32_SUPPORT (const struct mig_subsystem *)&vm32_map_subsystem, #endif (const struct mig_subsystem *)&UNDReply_subsystem, - (const struct mig_subsystem *)&default_pager_object_subsystem, (const struct mig_subsystem *)&mach_voucher_subsystem, (const struct mig_subsystem *)&mach_voucher_attr_control_subsystem, @@ -260,7 +257,8 @@ mig_init(void) ipc_kmsg_t ipc_kobject_server( - ipc_kmsg_t request) + ipc_kmsg_t request, + mach_msg_option_t __unused option) { mach_msg_size_t reply_size; ipc_kmsg_t reply; @@ -268,19 +266,22 @@ ipc_kobject_server( ipc_port_t *destp; ipc_port_t replyp = IPC_PORT_NULL; mach_msg_format_0_trailer_t *trailer; - register mig_hash_t *ptr; + mig_hash_t *ptr; + task_t task = TASK_NULL; + uint32_t exec_token; + boolean_t exec_token_changed = FALSE; /* * Find out corresponding mig_hash entry if any */ { - register int key = request->ikm_header->msgh_id; - register int i = MIG_HASH(key); - register int max_iter = mig_table_max_displ; - - do + int key = request->ikm_header->msgh_id; + unsigned int i = (unsigned int)MIG_HASH(key); + int max_iter = mig_table_max_displ; + + do { ptr = &mig_buckets[i++ % MAX_MIG_ENTRIES]; - while (key != ptr->num && ptr->num && --max_iter); + } while (key != ptr->num && ptr->num && --max_iter); if (!ptr->routine || key != ptr->num) { ptr = (mig_hash_t *)0; @@ -299,6 +300,7 @@ ipc_kobject_server( if (reply == IKM_NULL) { printf("ipc_kobject_server: dropping request\n"); + ipc_kmsg_trace_send(request, option); ipc_kmsg_destroy(request); return 
IKM_NULL; } @@ -335,9 +337,28 @@ ipc_kobject_server( * Find the routine to call, and call it * to perform the kernel function */ + ipc_kmsg_trace_send(request, option); { - if (ptr) { + if (ptr) { + /* + * Check if the port is a task port, if its a task port then + * snapshot the task exec token before the mig routine call. + */ + ipc_port_t port = request->ikm_header->msgh_remote_port; + if (IP_VALID(port) && ip_kotype(port) == IKOT_TASK) { + task = convert_port_to_task_with_exec_token(port, &exec_token); + } + (*ptr->routine)(request->ikm_header, reply->ikm_header); + + /* Check if the exec token changed during the mig routine */ + if (task != TASK_NULL) { + if (exec_token != task->exec_token) { + exec_token_changed = TRUE; + } + task_deallocate(task); + } + kernel_task->messages_received++; } else { @@ -453,6 +474,52 @@ ipc_kobject_server( return IKM_NULL; } + /* Fail the MIG call if the task exec token changed during the call */ + if (kr == KERN_SUCCESS && exec_token_changed) { + /* + * Create a new reply msg with error and destroy the old reply msg. + */ + ipc_kmsg_t new_reply = ipc_kmsg_alloc(reply_size); + + if (new_reply == IKM_NULL) { + printf("ipc_kobject_server: dropping request\n"); + ipc_kmsg_destroy(reply); + return IKM_NULL; + } + /* + * Initialize the new reply message. 
+ */ + { +#define OutP_new ((mig_reply_error_t *) new_reply->ikm_header) +#define OutP_old ((mig_reply_error_t *) reply->ikm_header) + + bzero((void *)OutP_new, reply_size); + + OutP_new->NDR = OutP_old->NDR; + OutP_new->Head.msgh_size = sizeof(mig_reply_error_t); + OutP_new->Head.msgh_bits = OutP_old->Head.msgh_bits & ~MACH_MSGH_BITS_COMPLEX; + OutP_new->Head.msgh_remote_port = OutP_old->Head.msgh_remote_port; + OutP_new->Head.msgh_local_port = MACH_PORT_NULL; + OutP_new->Head.msgh_voucher_port = MACH_PORT_NULL; + OutP_new->Head.msgh_id = OutP_old->Head.msgh_id; + + /* Set the error as KERN_INVALID_TASK */ + OutP_new->RetCode = KERN_INVALID_TASK; + +#undef OutP_new +#undef OutP_old + } + + /* + * Destroy everything in reply except the reply port right, + * which is needed in the new reply message. + */ + reply->ikm_header->msgh_remote_port = MACH_PORT_NULL; + ipc_kmsg_destroy(reply); + + reply = new_reply; + } + trailer = (mach_msg_format_0_trailer_t *) ((vm_offset_t)reply->ikm_header + (int)reply->ikm_header->msgh_size); @@ -572,6 +639,10 @@ ipc_kobject_notify( case IKOT_SEMAPHORE: semaphore_notify(request_header); return TRUE; + + case IKOT_TASK: + task_port_notify(request_header); + return TRUE; case IKOT_NAMED_ENTRY: ip_lock(port); diff --git a/osfmk/kern/ipc_kobject.h b/osfmk/kern/ipc_kobject.h index 557c7a0c0..daff8a51b 100644 --- a/osfmk/kern/ipc_kobject.h +++ b/osfmk/kern/ipc_kobject.h @@ -143,7 +143,8 @@ typedef natural_t ipc_kobject_type_t; /* Dispatch a kernel server function */ extern ipc_kmsg_t ipc_kobject_server( - ipc_kmsg_t request); + ipc_kmsg_t request, + mach_msg_option_t option); /* Make a port represent a kernel object of the given type */ extern void ipc_kobject_set( diff --git a/osfmk/kern/ipc_mig.c b/osfmk/kern/ipc_mig.c index 5dff6a1a2..3530f97ce 100644 --- a/osfmk/kern/ipc_mig.c +++ b/osfmk/kern/ipc_mig.c @@ -116,13 +116,18 @@ mach_msg_send_from_kernel( ipc_kmsg_t kmsg; mach_msg_return_t mr; + 
KDBG(MACHDBG_CODE(DBG_MACH_IPC,MACH_IPC_KMSG_INFO) | DBG_FUNC_START); + mr = ipc_kmsg_get_from_kernel(msg, send_size, &kmsg); - if (mr != MACH_MSG_SUCCESS) + if (mr != MACH_MSG_SUCCESS) { + KDBG(MACHDBG_CODE(DBG_MACH_IPC,MACH_IPC_KMSG_INFO) | DBG_FUNC_END, mr); return mr; + } mr = ipc_kmsg_copyin_from_kernel_legacy(kmsg); if (mr != MACH_MSG_SUCCESS) { ipc_kmsg_free(kmsg); + KDBG(MACHDBG_CODE(DBG_MACH_IPC,MACH_IPC_KMSG_INFO) | DBG_FUNC_END, mr); return mr; } @@ -138,6 +143,7 @@ mach_msg_send_from_kernel( mr = ipc_kmsg_send(kmsg, option, MACH_MSG_TIMEOUT_NONE); if (mr != MACH_MSG_SUCCESS) { ipc_kmsg_destroy(kmsg); + KDBG(MACHDBG_CODE(DBG_MACH_IPC,MACH_IPC_KMSG_INFO) | DBG_FUNC_END, mr); } return mr; @@ -153,13 +159,18 @@ mach_msg_send_from_kernel_proper( ipc_kmsg_t kmsg; mach_msg_return_t mr; + KDBG(MACHDBG_CODE(DBG_MACH_IPC,MACH_IPC_KMSG_INFO) | DBG_FUNC_START); + mr = ipc_kmsg_get_from_kernel(msg, send_size, &kmsg); - if (mr != MACH_MSG_SUCCESS) + if (mr != MACH_MSG_SUCCESS) { + KDBG(MACHDBG_CODE(DBG_MACH_IPC,MACH_IPC_KMSG_INFO) | DBG_FUNC_END, mr); return mr; + } mr = ipc_kmsg_copyin_from_kernel(kmsg); if (mr != MACH_MSG_SUCCESS) { ipc_kmsg_free(kmsg); + KDBG(MACHDBG_CODE(DBG_MACH_IPC,MACH_IPC_KMSG_INFO) | DBG_FUNC_END, mr); return mr; } @@ -175,6 +186,7 @@ mach_msg_send_from_kernel_proper( mr = ipc_kmsg_send(kmsg, option, MACH_MSG_TIMEOUT_NONE); if (mr != MACH_MSG_SUCCESS) { ipc_kmsg_destroy(kmsg); + KDBG(MACHDBG_CODE(DBG_MACH_IPC,MACH_IPC_KMSG_INFO) | DBG_FUNC_END, mr); } return mr; @@ -190,13 +202,18 @@ mach_msg_send_from_kernel_with_options( ipc_kmsg_t kmsg; mach_msg_return_t mr; + KDBG(MACHDBG_CODE(DBG_MACH_IPC,MACH_IPC_KMSG_INFO) | DBG_FUNC_START); + mr = ipc_kmsg_get_from_kernel(msg, send_size, &kmsg); - if (mr != MACH_MSG_SUCCESS) + if (mr != MACH_MSG_SUCCESS) { + KDBG(MACHDBG_CODE(DBG_MACH_IPC,MACH_IPC_KMSG_INFO) | DBG_FUNC_END, mr); return mr; + } mr = ipc_kmsg_copyin_from_kernel(kmsg); if (mr != MACH_MSG_SUCCESS) { ipc_kmsg_free(kmsg); + 
KDBG(MACHDBG_CODE(DBG_MACH_IPC,MACH_IPC_KMSG_INFO) | DBG_FUNC_END, mr); return mr; } @@ -217,6 +234,7 @@ mach_msg_send_from_kernel_with_options( if (mr != MACH_MSG_SUCCESS) { ipc_kmsg_destroy(kmsg); + KDBG(MACHDBG_CODE(DBG_MACH_IPC,MACH_IPC_KMSG_INFO) | DBG_FUNC_END, mr); } return mr; @@ -235,13 +253,18 @@ mach_msg_send_from_kernel_with_options_legacy( ipc_kmsg_t kmsg; mach_msg_return_t mr; + KDBG(MACHDBG_CODE(DBG_MACH_IPC,MACH_IPC_KMSG_INFO) | DBG_FUNC_START); + mr = ipc_kmsg_get_from_kernel(msg, send_size, &kmsg); - if (mr != MACH_MSG_SUCCESS) + if (mr != MACH_MSG_SUCCESS) { + KDBG(MACHDBG_CODE(DBG_MACH_IPC,MACH_IPC_KMSG_INFO) | DBG_FUNC_END, mr); return mr; + } mr = ipc_kmsg_copyin_from_kernel_legacy(kmsg); if (mr != MACH_MSG_SUCCESS) { ipc_kmsg_free(kmsg); + KDBG(MACHDBG_CODE(DBG_MACH_IPC,MACH_IPC_KMSG_INFO) | DBG_FUNC_END, mr); return mr; } @@ -260,6 +283,7 @@ mach_msg_send_from_kernel_with_options_legacy( if (mr != MACH_MSG_SUCCESS) { ipc_kmsg_destroy(kmsg); + KDBG(MACHDBG_CODE(DBG_MACH_IPC,MACH_IPC_KMSG_INFO) | DBG_FUNC_END, mr); } return mr; @@ -332,9 +356,13 @@ mach_msg_rpc_from_kernel_body( assert(msg->msgh_local_port == MACH_PORT_NULL); + KDBG(MACHDBG_CODE(DBG_MACH_IPC,MACH_IPC_KMSG_INFO) | DBG_FUNC_START); + mr = ipc_kmsg_get_from_kernel(msg, send_size, &kmsg); - if (mr != MACH_MSG_SUCCESS) + if (mr != MACH_MSG_SUCCESS) { + KDBG(MACHDBG_CODE(DBG_MACH_IPC,MACH_IPC_KMSG_INFO) | DBG_FUNC_END, mr); return mr; + } reply = self->ith_rpc_reply; if (reply == IP_NULL) { @@ -360,6 +388,7 @@ mach_msg_rpc_from_kernel_body( #endif if (mr != MACH_MSG_SUCCESS) { ipc_kmsg_free(kmsg); + KDBG(MACHDBG_CODE(DBG_MACH_IPC,MACH_IPC_KMSG_INFO) | DBG_FUNC_END, mr); return mr; } @@ -375,6 +404,7 @@ mach_msg_rpc_from_kernel_body( mr = ipc_kmsg_send(kmsg, option, MACH_MSG_TIMEOUT_NONE); if (mr != MACH_MSG_SUCCESS) { ipc_kmsg_destroy(kmsg); + KDBG(MACHDBG_CODE(DBG_MACH_IPC,MACH_IPC_KMSG_INFO) | DBG_FUNC_END, mr); return mr; } @@ -488,7 +518,7 @@ mach_msg_overwrite( mach_msg_size_t 
rcv_size, mach_port_name_t rcv_name, __unused mach_msg_timeout_t msg_timeout, - __unused mach_port_name_t notify, + mach_msg_priority_t override, __unused mach_msg_header_t *rcv_msg, __unused mach_msg_size_t rcv_msg_size) { @@ -509,12 +539,21 @@ mach_msg_overwrite( if (send_size > MACH_MSG_SIZE_MAX - MAX_TRAILER_SIZE) return MACH_SEND_TOO_LARGE; + KDBG(MACHDBG_CODE(DBG_MACH_IPC,MACH_IPC_KMSG_INFO) | DBG_FUNC_START); + msg_and_trailer_size = send_size + MAX_TRAILER_SIZE; kmsg = ipc_kmsg_alloc(msg_and_trailer_size); - if (kmsg == IKM_NULL) + if (kmsg == IKM_NULL) { + KDBG(MACHDBG_CODE(DBG_MACH_IPC,MACH_IPC_KMSG_INFO) | DBG_FUNC_END, MACH_SEND_NO_BUFFER); return MACH_SEND_NO_BUFFER; + } + KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_IPC,MACH_IPC_KMSG_LINK) | DBG_FUNC_NONE, + (uintptr_t)0, /* this should only be called from the kernel! */ + VM_KERNEL_ADDRPERM((uintptr_t)kmsg), + 0, 0, + 0); (void) memcpy((void *) kmsg->ikm_header, (const void *) msg, send_size); kmsg->ikm_header->msgh_size = send_size; @@ -531,10 +570,11 @@ mach_msg_overwrite( max_trailer->msgh_trailer_type = MACH_MSG_TRAILER_FORMAT_0; max_trailer->msgh_trailer_size = MACH_MSG_TRAILER_MINIMUM_SIZE; - mr = ipc_kmsg_copyin(kmsg, space, map, &option); + mr = ipc_kmsg_copyin(kmsg, space, map, override, &option); if (mr != MACH_MSG_SUCCESS) { ipc_kmsg_free(kmsg); + KDBG(MACHDBG_CODE(DBG_MACH_IPC,MACH_IPC_KMSG_INFO) | DBG_FUNC_END, mr); return mr; } @@ -571,10 +611,10 @@ mach_msg_overwrite( io_release(object); } while (mr == MACH_RCV_INTERRUPTED); + if (mr != MACH_MSG_SUCCESS) return mr; - trailer_size = ipc_kmsg_add_trailer(kmsg, space, option, current_thread(), seqno, TRUE, kmsg->ikm_header->msgh_remote_port->ip_context); @@ -678,6 +718,59 @@ mig_strncpy( return i; } +/* + * mig_strncpy_zerofill -- Bounded string copy. Does what the + * library routine strncpy OUGHT to do: Copies the (null terminated) + * string in src into dest, a buffer of length len. 
Assures that + * the copy is still null terminated and doesn't overflow the buffer, + * truncating the copy if necessary. If the string in src is smaller + * than given length len, it will zero fill the remaining bytes in dest. + * + * Parameters: + * + * dest - Pointer to destination buffer. + * + * src - Pointer to source string. + * + * len - Length of destination buffer. + */ +int +mig_strncpy_zerofill( + char *dest, + const char *src, + int len) +{ + int i = 0; + boolean_t terminated = FALSE; + int retval = 0; + + if (len <= 0 || dest == NULL) { + return 0; + } + + if (src == NULL) { + terminated = TRUE; + } + + for (i = 1; i < len; i++) { + if (!terminated) { + if (!(*dest++ = *src++)) { + retval = i; + terminated = TRUE; + } + } else { + *dest++ = '\0'; + } + } + + *dest = '\0'; + if (!terminated) { + retval = i; + } + + return retval; +} + char * mig_user_allocate( vm_size_t size) diff --git a/osfmk/kern/ipc_tt.c b/osfmk/kern/ipc_tt.c index 9a43453bd..45da8cbaf 100644 --- a/osfmk/kern/ipc_tt.c +++ b/osfmk/kern/ipc_tt.c @@ -143,7 +143,15 @@ ipc_task_init( task->itk_self = kport; task->itk_nself = nport; task->itk_resume = IP_NULL; /* Lazily allocated on-demand */ - task->itk_sself = ipc_port_make_send(kport); + if (task_is_a_corpse_fork(task)) { + /* + * No sender's notification for corpse would not + * work with a naked send right in kernel. 
+ */ + task->itk_sself = IP_NULL; + } else { + task->itk_sself = ipc_port_make_send(kport); + } task->itk_debug_control = IP_NULL; task->itk_space = space; @@ -152,6 +160,9 @@ ipc_task_init( for (i = FIRST_EXCEPTION; i < EXC_TYPES_COUNT; i++) { task->exc_actions[i].port = IP_NULL; +#if CONFIG_MACF + mac_exc_action_label_init(task->exc_actions + i); +#endif }/* for */ kr = host_get_host_port(host_priv_self(), &port); @@ -186,6 +197,9 @@ ipc_task_init( parent->exc_actions[i].behavior; task->exc_actions[i].privileged = parent->exc_actions[i].privileged; +#if CONFIG_MACF + mac_exc_action_label_inherit(parent->exc_actions + i, task->exc_actions + i); +#endif }/* for */ task->itk_host = ipc_port_copy_send(parent->itk_host); @@ -320,6 +334,9 @@ ipc_task_terminate( if (IP_VALID(task->exc_actions[i].port)) { ipc_port_release_send(task->exc_actions[i].port); } +#if CONFIG_MACF + mac_exc_action_label_destroy(task->exc_actions + i); +#endif } if (IP_VALID(task->itk_host)) @@ -392,7 +409,13 @@ ipc_task_reset( task->itk_self = new_kport; old_sself = task->itk_sself; task->itk_sself = ipc_port_make_send(new_kport); - ipc_kobject_set(old_kport, IKO_NULL, IKOT_NONE); + + /* Set the old kport to IKOT_NONE and update the exec token while under the port lock */ + ip_lock(old_kport); + ipc_kobject_set_atomically(old_kport, IKO_NULL, IKOT_NONE); + task->exec_token += 1; + ip_unlock(old_kport); + ipc_kobject_set(new_kport, (ipc_kobject_t) task, IKOT_TASK); for (i = FIRST_EXCEPTION; i < EXC_TYPES_COUNT; i++) { @@ -403,6 +426,9 @@ ipc_task_reset( } if (!task->exc_actions[i].privileged) { +#if CONFIG_MACF + mac_exc_action_label_reset(task->exc_actions + i); +#endif old_exc_actions[i] = task->exc_actions[i].port; task->exc_actions[i].port = IP_NULL; } @@ -471,6 +497,12 @@ ipc_thread_init_exc_actions( thread->exc_actions = kalloc(sizeof(struct exception_action) * EXC_TYPES_COUNT); bzero(thread->exc_actions, sizeof(struct exception_action) * EXC_TYPES_COUNT); + +#if CONFIG_MACF + for (size_t i 
= 0; i < EXC_TYPES_COUNT; ++i) { + mac_exc_action_label_init(thread->exc_actions + i); + } +#endif } void @@ -478,6 +510,12 @@ ipc_thread_destroy_exc_actions( thread_t thread) { if (thread->exc_actions != NULL) { +#if CONFIG_MACF + for (size_t i = 0; i < EXC_TYPES_COUNT; ++i) { + mac_exc_action_label_destroy(thread->exc_actions + i); + } +#endif + kfree(thread->exc_actions, sizeof(struct exception_action) * EXC_TYPES_COUNT); thread->exc_actions = NULL; @@ -594,6 +632,9 @@ ipc_thread_reset( if (thread->exc_actions[i].privileged) { old_exc_actions[i] = IP_NULL; } else { +#if CONFIG_MACF + mac_exc_action_label_reset(thread->exc_actions + i); +#endif old_exc_actions[i] = thread->exc_actions[i].port; thread->exc_actions[i].port = IP_NULL; } @@ -633,9 +674,9 @@ ipc_thread_reset( ipc_port_t retrieve_task_self_fast( - register task_t task) + task_t task) { - register ipc_port_t port; + ipc_port_t port; assert(task == current_task()); @@ -673,7 +714,7 @@ ipc_port_t retrieve_thread_self_fast( thread_t thread) { - register ipc_port_t port; + ipc_port_t port; assert(thread == current_thread()); @@ -909,36 +950,36 @@ task_get_special_port( } switch (which) { - case TASK_KERNEL_PORT: + case TASK_KERNEL_PORT: port = ipc_port_copy_send(task->itk_sself); break; - case TASK_NAME_PORT: + case TASK_NAME_PORT: port = ipc_port_make_send(task->itk_nself); break; - case TASK_HOST_PORT: + case TASK_HOST_PORT: port = ipc_port_copy_send(task->itk_host); break; - case TASK_BOOTSTRAP_PORT: + case TASK_BOOTSTRAP_PORT: port = ipc_port_copy_send(task->itk_bootstrap); break; - case TASK_SEATBELT_PORT: + case TASK_SEATBELT_PORT: port = ipc_port_copy_send(task->itk_seatbelt); break; - case TASK_ACCESS_PORT: + case TASK_ACCESS_PORT: port = ipc_port_copy_send(task->itk_task_access); break; - case TASK_DEBUG_CONTROL_PORT: + case TASK_DEBUG_CONTROL_PORT: port = ipc_port_copy_send(task->itk_debug_control); break; - default: - itk_unlock(task); + default: + itk_unlock(task); return KERN_INVALID_ARGUMENT; 
} itk_unlock(task); @@ -976,33 +1017,32 @@ task_set_special_port( return KERN_INVALID_ARGUMENT; switch (which) { - case TASK_KERNEL_PORT: - whichp = &task->itk_sself; - break; + case TASK_KERNEL_PORT: + whichp = &task->itk_sself; + break; - case TASK_HOST_PORT: - whichp = &task->itk_host; - break; + case TASK_HOST_PORT: + whichp = &task->itk_host; + break; - case TASK_BOOTSTRAP_PORT: - whichp = &task->itk_bootstrap; - break; + case TASK_BOOTSTRAP_PORT: + whichp = &task->itk_bootstrap; + break; - case TASK_SEATBELT_PORT: - whichp = &task->itk_seatbelt; - break; + case TASK_SEATBELT_PORT: + whichp = &task->itk_seatbelt; + break; - case TASK_ACCESS_PORT: - whichp = &task->itk_task_access; - break; - - case TASK_DEBUG_CONTROL_PORT: - whichp = &task->itk_debug_control; - break; + case TASK_ACCESS_PORT: + whichp = &task->itk_task_access; + break; + case TASK_DEBUG_CONTROL_PORT: + whichp = &task->itk_debug_control; + break; - default: - return KERN_INVALID_ARGUMENT; + default: + return KERN_INVALID_ARGUMENT; }/* switch */ itk_lock(task); @@ -1220,6 +1260,24 @@ convert_port_to_locked_task(ipc_port_t port) task_t convert_port_to_task( ipc_port_t port) +{ + return convert_port_to_task_with_exec_token(port, NULL); +} + +/* + * Routine: convert_port_to_task_with_exec_token + * Purpose: + * Convert from a port to a task and return + * the exec token stored in the task. + * Doesn't consume the port ref; produces a task ref, + * which may be null. + * Conditions: + * Nothing locked. + */ +task_t +convert_port_to_task_with_exec_token( + ipc_port_t port, + uint32_t *exec_token) { task_t task = TASK_NULL; @@ -1231,6 +1289,9 @@ convert_port_to_task( task = (task_t)port->ip_kobject; assert(task != TASK_NULL); + if (exec_token) { + *exec_token = task->exec_token; + } task_reference_internal(task); } @@ -1460,6 +1521,33 @@ port_name_to_task( return task; } +/* + * Routine: port_name_to_host + * Purpose: + * Convert from a port name to a host pointer. 
+ * NOTE: This does _not_ return a +1 reference to the host_t + * Conditions: + * Nothing locked. + */ +host_t +port_name_to_host( + mach_port_name_t name) +{ + + host_t host = HOST_NULL; + kern_return_t kr; + ipc_port_t port; + + if (MACH_PORT_VALID(name)) { + kr = ipc_port_translate_send(current_space(), name, &port); + if (kr == KERN_SUCCESS) { + host = convert_port_to_host(port); + ip_unlock(port); + } + } + return host; +} + /* * Routine: convert_task_to_port * Purpose: @@ -1673,7 +1761,11 @@ thread_set_exception_ports( ipc_thread_init_exc_actions(thread); } for (i = FIRST_EXCEPTION; i < EXC_TYPES_COUNT; ++i) { - if (exception_mask & (1 << i)) { + if ((exception_mask & (1 << i)) +#if CONFIG_MACF + && mac_exc_action_label_update(current_task(), thread->exc_actions + i) == 0 +#endif + ) { old_port[i] = thread->exc_actions[i].port; thread->exc_actions[i].port = ipc_port_copy_send(new_port); thread->exc_actions[i].behavior = new_behavior; @@ -1744,7 +1836,11 @@ task_set_exception_ports( } for (i = FIRST_EXCEPTION; i < EXC_TYPES_COUNT; ++i) { - if (exception_mask & (1 << i)) { + if ((exception_mask & (1 << i)) +#if CONFIG_MACF + && mac_exc_action_label_update(current_task(), task->exc_actions + i) == 0 +#endif + ) { old_port[i] = task->exc_actions[i].port; task->exc_actions[i].port = ipc_port_copy_send(new_port); @@ -1848,7 +1944,11 @@ thread_swap_exception_ports( assert(EXC_TYPES_COUNT > FIRST_EXCEPTION); for (count = 0, i = FIRST_EXCEPTION; i < EXC_TYPES_COUNT && count < *CountCnt; ++i) { - if (exception_mask & (1 << i)) { + if ((exception_mask & (1 << i)) +#if CONFIG_MACF + && mac_exc_action_label_update(current_task(), thread->exc_actions + i) == 0 +#endif + ) { for (j = 0; j < count; ++j) { /* * search for an identical entry, if found @@ -1945,7 +2045,11 @@ task_swap_exception_ports( assert(EXC_TYPES_COUNT > FIRST_EXCEPTION); for (count = 0, i = FIRST_EXCEPTION; i < EXC_TYPES_COUNT && count < *CountCnt; ++i) { - if (exception_mask & (1 << i)) { + if 
((exception_mask & (1 << i)) +#if CONFIG_MACF + && mac_exc_action_label_update(current_task(), task->exc_actions + i) == 0 +#endif + ) { for (j = 0; j < count; j++) { /* * search for an identical entry, if found diff --git a/osfmk/kern/ipc_tt.h b/osfmk/kern/ipc_tt.h index cbf75eb27..895eb2544 100644 --- a/osfmk/kern/ipc_tt.h +++ b/osfmk/kern/ipc_tt.h @@ -131,9 +131,17 @@ extern task_name_t convert_port_to_task_name( extern task_t convert_port_to_task( ipc_port_t port); +extern task_t +convert_port_to_task_with_exec_token( + ipc_port_t port, + uint32_t *exec_token); + extern task_t port_name_to_task( mach_port_name_t name); +extern host_t port_name_to_host( + mach_port_name_t name); + extern boolean_t ref_task_port_locked( ipc_port_t port, task_t *ptask); diff --git a/osfmk/kern/kalloc.c b/osfmk/kern/kalloc.c index 2ac827b63..ac6d89b50 100644 --- a/osfmk/kern/kalloc.c +++ b/osfmk/kern/kalloc.c @@ -77,6 +77,7 @@ #include #include #include +#include #ifdef MACH_BSD zone_t kalloc_zone(vm_size_t); @@ -114,28 +115,14 @@ static void KALLOC_ZINFO_SALLOC(vm_size_t bytes) { thread_t thr = current_thread(); - task_t task; - zinfo_usage_t zinfo; - ledger_debit(thr->t_ledger, task_ledgers.tkm_shared, bytes); - - if (kalloc_fake_zone_index != -1 && - (task = thr->task) != NULL && (zinfo = task->tkm_zinfo) != NULL) - zinfo[kalloc_fake_zone_index].alloc += bytes; } static void KALLOC_ZINFO_SFREE(vm_size_t bytes) { thread_t thr = current_thread(); - task_t task; - zinfo_usage_t zinfo; - ledger_credit(thr->t_ledger, task_ledgers.tkm_shared, bytes); - - if (kalloc_fake_zone_index != -1 && - (task = thr->task) != NULL && (zinfo = task->tkm_zinfo) != NULL) - zinfo[kalloc_fake_zone_index].free += bytes; } /* @@ -165,11 +152,11 @@ KALLOC_ZINFO_SFREE(vm_size_t bytes) 80, \ 96, \ /* 6 */ 128, \ - 160, \ + 160, 192, \ 256, \ /* 9 */ 288, \ - 512, \ - 1024, \ + 512, 576, \ + 1024, 1152, \ /* C */ 1280, \ 2048, \ 4096 @@ -183,10 +170,13 @@ KALLOC_ZINFO_SFREE(vm_size_t bytes) "kalloc.96", \ /* 
6 */ "kalloc.128", \ "kalloc.160", \ + "kalloc.192", \ "kalloc.256", \ /* 9 */ "kalloc.288", \ "kalloc.512", \ + "kalloc.576", \ "kalloc.1024", \ + "kalloc.1152", \ /* C */ "kalloc.1280", \ "kalloc.2048", \ "kalloc.4096" @@ -204,9 +194,9 @@ KALLOC_ZINFO_SFREE(vm_size_t bytes) /* 6 */ 64, 72, 88, 112, \ 128, 192, \ 256, 288, 384, 440, \ -/* 9 */ 512, 768, \ +/* 9 */ 512, 576, 768, \ 1024, 1152, 1536, \ - 2048, 3072, \ + 2048, 2128, 3072, \ 4096, 6144 #define K_ZONE_NAMES \ @@ -216,9 +206,9 @@ KALLOC_ZINFO_SFREE(vm_size_t bytes) /* 6 */ "kalloc.64", "kalloc.72", "kalloc.88", "kalloc.112", \ "kalloc.128", "kalloc.192", \ "kalloc.256", "kalloc.288", "kalloc.384", "kalloc.440", \ -/* 9 */ "kalloc.512", "kalloc.768", \ +/* 9 */ "kalloc.512", "kalloc.576", "kalloc.768", \ "kalloc.1024", "kalloc.1152", "kalloc.1536", \ - "kalloc.2048", "kalloc.3072", \ + "kalloc.2048", "kalloc.2128", "kalloc.3072", \ "kalloc.4096", "kalloc.6144" #else @@ -309,7 +299,7 @@ kalloc_init( kern_return_t retval; vm_offset_t min; vm_size_t size, kalloc_map_size; - register int i; + int i; /* * Scale the kalloc_map_size to physical memory size: stay below @@ -452,13 +442,134 @@ get_zone_search(vm_size_t size, int zindex) return (k_zone[zindex]); } +static vm_size_t +vm_map_lookup_kalloc_entry_locked( + vm_map_t map, + void *addr) +{ + boolean_t ret; + vm_map_entry_t vm_entry = NULL; + + ret = vm_map_lookup_entry(map, (vm_map_offset_t)addr, &vm_entry); + if (!ret) { + panic("Attempting to lookup/free an address not allocated via kalloc! (vm_map_lookup_entry() failed map: %p, addr: %p)\n", + map, addr); + } + if (vm_entry->vme_start != (vm_map_offset_t)addr) { + panic("Attempting to lookup/free the middle of a kalloc'ed element! (map: %p, addr: %p, entry: %p)\n", + map, addr, vm_entry); + } + if (!vm_entry->vme_atomic) { + panic("Attempting to lookup/free an address not managed by kalloc! 
(map: %p, addr: %p, entry: %p)\n", + map, addr, vm_entry); + } + return (vm_entry->vme_end - vm_entry->vme_start); +} + +vm_size_t +kalloc_size( + void *addr) +{ + vm_map_t map; + vm_size_t size; + + size = zone_element_size(addr, NULL); + if (size) { + return size; + } + if (((vm_offset_t)addr >= kalloc_map_min) && ((vm_offset_t)addr < kalloc_map_max)) { + map = kalloc_map; + } else { + map = kernel_map; + } + vm_map_lock_read(map); + size = vm_map_lookup_kalloc_entry_locked(map, addr); + vm_map_unlock_read(map); + return size; +} + +vm_size_t +kalloc_bucket_size( + vm_size_t size) +{ + zone_t z; + vm_map_t map; + + if (size < MAX_SIZE_ZDLUT) { + z = get_zone_dlut(size); + return z->elem_size; + } + + if (size < kalloc_max_prerounded) { + z = get_zone_search(size, k_zindex_start); + return z->elem_size; + } + + if (size >= kalloc_kernmap_size) + map = kernel_map; + else + map = kalloc_map; + + return vm_map_round_page(size, VM_MAP_PAGE_MASK(map)); +} + +vm_size_t +kfree_addr( + void *addr) +{ + vm_map_t map; + vm_size_t size = 0; + kern_return_t ret; + zone_t z; + + size = zone_element_size(addr, &z); + if (size) { + zfree(z, addr); + return size; + } + + if (((vm_offset_t)addr >= kalloc_map_min) && ((vm_offset_t)addr < kalloc_map_max)) { + map = kalloc_map; + } else { + map = kernel_map; + } + if ((vm_offset_t)addr < VM_MIN_KERNEL_AND_KEXT_ADDRESS) { + panic("kfree on an address not in the kernel & kext address range! addr: %p\n", addr); + } + + vm_map_lock(map); + size = vm_map_lookup_kalloc_entry_locked(map, addr); + ret = vm_map_remove_locked(map, + vm_map_trunc_page((vm_map_offset_t)addr, + VM_MAP_PAGE_MASK(map)), + vm_map_round_page((vm_map_offset_t)addr + size, + VM_MAP_PAGE_MASK(map)), + VM_MAP_REMOVE_KUNWIRE); + if (ret != KERN_SUCCESS) { + panic("vm_map_remove_locked() failed for kalloc vm_entry! 
addr: %p, map: %p ret: %d\n", + addr, map, ret); + } + vm_map_unlock(map); + + kalloc_spin_lock(); + kalloc_large_total -= size; + kalloc_large_inuse--; + kalloc_unlock(); + + KALLOC_ZINFO_SFREE(size); + return size; +} + void * kalloc_canblock( - vm_size_t size, + vm_size_t * psize, boolean_t canblock, vm_allocation_site_t * site) { zone_t z; + vm_size_t size; + + size = *psize; if (size < MAX_SIZE_ZDLUT) z = get_zone_dlut(size); @@ -486,12 +597,12 @@ kalloc_canblock( vm_tag_t tag; tag = (site ? tag = vm_tag_alloc(site) : VM_KERN_MEMORY_KALLOC); - if (kmem_alloc(alloc_map, (vm_offset_t *)&addr, size, tag) != KERN_SUCCESS) { + if (kmem_alloc_flags(alloc_map, (vm_offset_t *)&addr, size, tag, KMA_ATOMIC) != KERN_SUCCESS) { if (alloc_map != kernel_map) { if (kalloc_fallback_count++ == 0) { printf("%s: falling back to kernel_map\n", __func__); } - if (kmem_alloc(kernel_map, (vm_offset_t *)&addr, size, tag) != KERN_SUCCESS) + if (kmem_alloc_flags(kernel_map, (vm_offset_t *)&addr, size, tag, KMA_ATOMIC) != KERN_SUCCESS) addr = NULL; } else @@ -518,6 +629,7 @@ kalloc_canblock( KALLOC_ZINFO_SALLOC(size); } + *psize = round_page(size); return(addr); } #ifdef KALLOC_DEBUG @@ -526,7 +638,9 @@ kalloc_canblock( z, z->zone_name, (unsigned long)size); #endif assert(size <= z->elem_size); - return zalloc_canblock(z, canblock); + *psize = z->elem_size; + void *addr = zalloc_canblock(z, canblock); + return addr; } void * @@ -584,7 +698,6 @@ kfree( return; } kmem_free(alloc_map, (vm_offset_t)data, size); - kalloc_spin_lock(); kalloc_large_total -= size; @@ -799,3 +912,11 @@ OSFree( OSMalloc_Tagrele(tag); } + +uint32_t +OSMalloc_size( + void *addr) +{ + return (uint32_t)kalloc_size(addr); +} + diff --git a/osfmk/kern/kalloc.h b/osfmk/kern/kalloc.h index caad32a3b..5a12d09ee 100644 --- a/osfmk/kern/kalloc.h +++ b/osfmk/kern/kalloc.h @@ -70,33 +70,84 @@ __BEGIN_DECLS extern void * kalloc_canblock( - vm_size_t size, + vm_size_t * size, boolean_t canblock, vm_allocation_site_t * site); 
-#define kalloc(size) \ +extern vm_size_t +kalloc_size( + void * addr); + +extern vm_size_t +kfree_addr( + void * addr); + +extern vm_size_t +kalloc_bucket_size( + vm_size_t size); + +#define kalloc(size) \ ({ static vm_allocation_site_t site __attribute__((section("__DATA, __data"))); \ - kalloc_canblock((size), TRUE, &site); }) + vm_size_t tsize = (size); \ + kalloc_canblock(&tsize, TRUE, &site); }) #define kalloc_tag(size, tag) \ ({ static vm_allocation_site_t site __attribute__((section("__DATA, __data"))) \ - = { (tag), 0 } ; \ - kalloc_canblock((size), TRUE, &site); }) + = { (tag), 0 }; \ + vm_size_t tsize = (size); \ + kalloc_canblock(&tsize, TRUE, &site); }) #define kalloc_tag_bt(size, tag) \ ({ static vm_allocation_site_t site __attribute__((section("__DATA, __data"))) \ - = { (tag), VM_TAG_BT }; \ - kalloc_canblock((size), TRUE, &site); }) + = { (tag), VM_TAG_BT }; \ + vm_size_t tsize = (size); \ + kalloc_canblock(&tsize, TRUE, &site); }) #define kalloc_noblock(size) \ ({ static vm_allocation_site_t site __attribute__((section("__DATA, __data"))); \ - kalloc_canblock((size), FALSE, &site); }) + vm_size_t tsize = (size); \ + kalloc_canblock(&tsize, FALSE, &site); }) + +#define kalloc_noblock_tag(size, tag) \ + ({ static vm_allocation_site_t site __attribute__((section("__DATA, __data"))) \ + = { (tag), 0 }; \ + vm_size_t tsize = (size); \ + kalloc_canblock(&tsize, FALSE, &site); }) #define kalloc_noblock_tag_bt(size, tag) \ ({ static vm_allocation_site_t site __attribute__((section("__DATA, __data"))) \ - = { (tag), VM_TAG_BT }; \ + = { (tag), VM_TAG_BT }; \ + vm_size_t tsize = (size); \ + kalloc_canblock(&tsize, FALSE, &site); }) + + +/* these versions update the size reference with the actual size allocated */ + +#define kallocp(size) \ + ({ static vm_allocation_site_t site __attribute__((section("__DATA, __data"))); \ + kalloc_canblock((size), TRUE, &site); }) + +#define kallocp_tag(size, tag) \ + ({ static vm_allocation_site_t site 
__attribute__((section("__DATA, __data"))) \ + = { (tag), 0 }; \ + kalloc_canblock((size), TRUE, &site); }) + +#define kallocp_tag_bt(size, tag) \ + ({ static vm_allocation_site_t site __attribute__((section("__DATA, __data"))) \ + = { (tag), VM_TAG_BT }; \ + kalloc_canblock((size), TRUE, &site); }) + +#define kallocp_noblock(size) \ + ({ static vm_allocation_site_t site __attribute__((section("__DATA, __data"))); \ + kalloc_canblock((size), FALSE, &site); }) + +#define kallocp_noblock_tag_bt(size, tag) \ + ({ static vm_allocation_site_t site __attribute__((section("__DATA, __data"))) \ + = { (tag), VM_TAG_BT }; \ kalloc_canblock((size), FALSE, &site); }) + + extern void kfree(void *data, vm_size_t size); diff --git a/osfmk/kern/kcdata.h b/osfmk/kern/kcdata.h new file mode 100644 index 000000000..3e1c76d31 --- /dev/null +++ b/osfmk/kern/kcdata.h @@ -0,0 +1,1061 @@ +/* + * Copyright (c) 2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + + +/* + * + * THE KCDATA MANIFESTO + * + * Kcdata is a self-describing data serialization format. It is meant to get + * nested data structures out of xnu with minimum fuss, but also for that data + * to be easy to parse. It is also meant to allow us to add new fields and + * evolve the data format without breaking old parsers. + * + * Kcdata is a permanent data format suitable for long-term storage including + * in files. It is very important that we continue to be able to parse old + * versions of kcdata-based formats. To this end, there are several + * invariants you MUST MAINTAIN if you alter this file. + * + * * None of the magic numbers should ever be a byteswap of themselves or + * of any of the other magic numbers. + * + * * Never remove any type. + * + * * All kcdata structs must be packed, and must exclusively use fixed-size + * types. + * + * * Never change the definition of any type, except to add new fields to + * the end. + * + * * If you do add new fields to the end of a type, do not actually change + * the definition of the old structure. Instead, define a new structure + * with the new fields. See thread_snapshot_v3 as an example. This + * provides source compatibility for old readers, and also documents where + * the potential size cutoffs are. + * + * * If you change libkdd, or kcdata.py run the unit tests under libkdd. 
+ * + * * If you add a type or extend an existing one, add a sample test to + * libkdd/tests so future changes to libkdd will always parse your struct + * correctly. + * + * For example to add a field to this: + * + * struct foobar { + * uint32_t baz; + * uint32_t quux; + * } __attribute__ ((packed)); + * + * Make it look like this: + * + * struct foobar { + * uint32_t baz; + * uint32_t quux; + * ///////// end version 1 of foobar. sizeof(struct foobar) was 8 //////// + * uint32_t frozzle; + * } __attribute__ ((packed)); + * + * If you are parsing kcdata formats, you MUST + * + * * Check the length field of each struct, including array elements. If the + * struct is longer than you expect, you must ignore the extra data. + * + * * Ignore any data types you do not understand. + * + * Additionally, we want to be as forward compatible as we can. Meaning old + * tools should still be able to use new data whenever possible. To this end, + * you should: + * + * * Try not to add new versions of types that supplant old ones. Instead + * extend the length of existing types or add supplemental types. + * + * * Try not to remove information from existing kcdata formats, unless + * removal was explicitly asked for. For example it is fine to add a + * stackshot flag to remove unwanted information, but you should not + * remove it from the default stackshot if the new flag is absent. + * + * * (TBD) If you do break old readers by removing information or + * supplanting old structs, then increase the major version number. + * + * + * + * The following is a description of the kcdata format. 
+ * + * + * The format for data is set up in a generic format as follows + * + * Layout of data structure: + * + * | 8 - bytes | + * | type = MAGIC | LENGTH | + * | 0 | + * | type | size | + * | flags | + * | data | + * |___________data____________| + * | type | size | + * | flags | + * |___________data____________| + * | type = END | size=0 | + * | 0 | + * + * + * The type field describes what kind of data is passed. For example type = TASK_CRASHINFO_UUID means the following data is a uuid. + * These types need to be defined in task_corpses.h for easy consumption by userspace inspection tools. + * + * Some range of types is reserved for special types like ints, longs etc. A cool new functionality made possible with this + * extensible data format is that the kernel can decide to put more information as required without requiring user space tools to + * re-compile to be compatible. The case of rusage struct versions could be introduced without breaking existing tools. + * + * Feature description: Generic data with description + * ------------------- + * Furthermore, generic data with description is very much possible now. For example + * + * - kcdata_add_uint64_with_description(cdatainfo, 0x700, "NUM MACH PORTS"); + * - and more functions that allow adding description. + * The userspace tools can then look at the description and print the data even if they are not compiled with knowledge of the field a priori. + * + * Example data: + * 0000 57 f1 ad de 00 00 00 00 00 00 00 00 00 00 00 00 W............... + * 0010 01 00 00 00 00 00 00 00 30 00 00 00 00 00 00 00 ........0....... + * 0020 50 49 44 00 00 00 00 00 00 00 00 00 00 00 00 00 PID............. + * 0030 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + * 0040 9c 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + * 0050 01 00 00 00 00 00 00 00 30 00 00 00 00 00 00 00 ........0....... + * 0060 50 41 52 45 4e 54 20 50 49 44 00 00 00 00 00 00 PARENT PID......
+ * 0070 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + * 0080 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + * 0090 ed 58 91 f1 + * + * Feature description: Container markers for compound data + * ------------------ + * If a given kernel data type is complex and requires adding multiple optional fields inside a container + * object for a consumer to understand arbitrary data, we package it using container markers. + * + * For example, the stackshot code gathers information and describes the state of a given task with respect + * to many subsystems. It includes data such as io stats, vm counters, process names/flags and syscall counts. + * + * kcdata_add_container_marker(kcdata_p, KCDATA_TYPE_CONTAINER_BEGIN, STACKSHOT_KCCONTAINER_TASK, task_uniqueid); + * // add multiple data, or add_<type>_with_description()s here + * + * kcdata_add_container_marker(kcdata_p, KCDATA_TYPE_CONTAINER_END, STACKSHOT_KCCONTAINER_TASK, task_uniqueid); + * + * Feature description: Custom Data formats on demand + * -------------------- + * With the self-describing nature of the format, the kernel provider can describe a data type (uniquely identified by a number) and use + * it in the buffer for sending data. The consumer can parse the type information and have knowledge of describing incoming data. + * Following is an example of how we can describe a kernel-specific struct sample_disk_io_stats in the buffer.
+ * + * struct sample_disk_io_stats { + * uint64_t disk_reads_count; + * uint64_t disk_reads_size; + * uint64_t io_priority_count[4]; + * uint64_t io_priority_size; + * } __attribute__ ((packed)); + * + * + * struct kcdata_subtype_descriptor disk_io_stats_def[] = { + * {KCS_SUBTYPE_FLAGS_NONE, KC_ST_UINT64, 0 * sizeof(uint64_t), sizeof(uint64_t), "disk_reads_count"}, + * {KCS_SUBTYPE_FLAGS_NONE, KC_ST_UINT64, 1 * sizeof(uint64_t), sizeof(uint64_t), "disk_reads_size"}, + * {KCS_SUBTYPE_FLAGS_ARRAY, KC_ST_UINT64, 2 * sizeof(uint64_t), KCS_SUBTYPE_PACK_SIZE(4, sizeof(uint64_t)), "io_priority_count"}, + * {KCS_SUBTYPE_FLAGS_NONE, KC_ST_UINT64, (2 + 4) * sizeof(uint64_t), sizeof(uint64_t), "io_priority_size"}, + * }; + * + * Now you can add this custom type definition into the buffer as + * kcdata_add_type_definition(kcdata_p, KCTYPE_SAMPLE_DISK_IO_STATS, "sample_disk_io_stats", + * &disk_io_stats_def[0], sizeof(disk_io_stats_def)/sizeof(struct kcdata_subtype_descriptor)); + * + */ + + +#ifndef _KCDATA_H_ +#define _KCDATA_H_ + +#include +#include +#include + +#define KCDATA_DESC_MAXLEN 32 /* including NUL byte at end */ + +#define KCDATA_FLAGS_STRUCT_PADDING_MASK 0xf +#define KCDATA_FLAGS_STRUCT_HAS_PADDING 0x80 + +/* + * kcdata aligns elements to 16 byte boundaries. + */ +#define KCDATA_ALIGNMENT_SIZE 0x10 + +struct kcdata_item { + uint32_t type; + uint32_t size; /* len(data) */ + /* flags. + * + * For structures: + * padding = flags & 0xf + * has_padding = (flags & 0x80) >> 7 + * + * has_padding is needed to disambiguate cases such as + * thread_snapshot_v2 and thread_snapshot_v3. Their + * respective sizes are 0x68 and 0x70, and thread_snapshot_v2 + * was emitted by old kernels *before* we started recording + * padding. Since legacy thread_snapshot_v2 and modern + * thread_snapshot_v3 will both record 0 for the padding + * flags, we need some other bit which will be nonzero in the + * flags to disambiguate.
+ * + * This is why we hardcode a special case for + * STACKSHOT_KCTYPE_THREAD_SNAPSHOT into the iterator + * functions below. There is only a finite number of such + * hardcodings which will ever be needed. They can occur + * when: + * + * * We have a legacy structure that predates padding flags + * + * * which we want to extend without changing the kcdata type + * + * * by only so many bytes as would fit in the space that + * was previously unused padding. + * + * For containers: + * container_id = flags + * + * For arrays: + * element_count = flags & UINT32_MAX + * element_type = (flags >> 32) & UINT32_MAX + */ + uint64_t flags; + char data[]; /* must be at the end */ +}; + +typedef struct kcdata_item * kcdata_item_t; + +enum KCDATA_SUBTYPE_TYPES { KC_ST_CHAR = 1, KC_ST_INT8, KC_ST_UINT8, KC_ST_INT16, KC_ST_UINT16, KC_ST_INT32, KC_ST_UINT32, KC_ST_INT64, KC_ST_UINT64 }; +typedef enum KCDATA_SUBTYPE_TYPES kctype_subtype_t; + +/* + * A subtype description structure that defines + * how compound data is laid out in memory. This + * provides on-the-fly definition of types and consumption + * by the parser. + */ +struct kcdata_subtype_descriptor { + uint8_t kcs_flags; +#define KCS_SUBTYPE_FLAGS_NONE 0x0 +#define KCS_SUBTYPE_FLAGS_ARRAY 0x1 +/* Force struct type even if only one element. + * + * Normally a kcdata_type_definition is treated as a structure if it has + * more than one subtype descriptor. Otherwise it is treated as a simple + * type. For example libkdd will represent a simple integer 42 as simply + * 42, but it will represent a structure containing an integer 42 as + * {"field_name": 42}. + * + * If a kcdata_type_definition has only a single subtype, then it will be + * treated as a structure iff KCS_SUBTYPE_FLAGS_STRUCT is set. If it has + * multiple subtypes, it will always be treated as a structure. + * + * KCS_SUBTYPE_FLAGS_MERGE has the opposite effect.
If this flag is used then + * even if there are multiple elements, they will all be treated as individual + * properties of the parent dictionary. + */ +#define KCS_SUBTYPE_FLAGS_STRUCT 0x2 /* force struct type even if only one element */ +#define KCS_SUBTYPE_FLAGS_MERGE 0x4 /* treat as multiple elements of parents instead of struct */ + uint8_t kcs_elem_type; /* restricted to kctype_subtype_t */ + uint16_t kcs_elem_offset; /* offset in struct where data is found */ + uint32_t kcs_elem_size; /* size of element (or) packed state for array type */ + char kcs_name[KCDATA_DESC_MAXLEN]; /* max 31 bytes for name of field */ +}; + +typedef struct kcdata_subtype_descriptor * kcdata_subtype_descriptor_t; + +/* + * In case of array of basic c types in kctype_subtype_t, + * size is packed in lower 16 bits and + * count is packed in upper 16 bits of kcs_elem_size field. + */ +#define KCS_SUBTYPE_PACK_SIZE(e_count, e_size) (((e_count)&0xffffu) << 16 | ((e_size)&0xffffu)) + +static inline uint32_t +kcs_get_elem_size(kcdata_subtype_descriptor_t d) +{ + if (d->kcs_flags & KCS_SUBTYPE_FLAGS_ARRAY) { + /* size is composed as ((count &0xffff)<<16 | (elem_size & 0xffff)) */ + return (uint32_t)((d->kcs_elem_size & 0xffff) * ((d->kcs_elem_size & 0xffff0000)>>16)); + } + return d->kcs_elem_size; +} + +static inline uint32_t +kcs_get_elem_count(kcdata_subtype_descriptor_t d) +{ + if (d->kcs_flags & KCS_SUBTYPE_FLAGS_ARRAY) + return (d->kcs_elem_size >> 16) & 0xffff; + return 1; +} + +static inline int +kcs_set_elem_size(kcdata_subtype_descriptor_t d, uint32_t size, uint32_t count) +{ + if (count > 1) { + /* means we are setting up an array */ + if (size > 0xffff || count > 0xffff) + return -1; //invalid argument + d->kcs_elem_size = ((count & 0xffff) << 16 | (size & 0xffff)); + } + else + { + d->kcs_elem_size = size; + } + return 0; +} + +struct kcdata_type_definition { + uint32_t kct_type_identifier; + uint32_t kct_num_elements; + char kct_name[KCDATA_DESC_MAXLEN]; + struct 
kcdata_subtype_descriptor kct_elements[]; +}; + + +/* chunk type definitions. 0 - 0x7ff are reserved and defined here + * NOTE: Please update kcdata/libkdd/kcdtypes.c if you make any changes + * in STACKSHOT_KCTYPE_* types. + */ + +/* + * Types with description value. + * these will have KCDATA_DESC_MAXLEN-1 length string description + * and rest of kcdata_iter_size() - KCDATA_DESC_MAXLEN bytes as data + */ +#define KCDATA_TYPE_INVALID 0x0u +#define KCDATA_TYPE_STRING_DESC 0x1u +#define KCDATA_TYPE_UINT32_DESC 0x2u +#define KCDATA_TYPE_UINT64_DESC 0x3u +#define KCDATA_TYPE_INT32_DESC 0x4u +#define KCDATA_TYPE_INT64_DESC 0x5u +#define KCDATA_TYPE_BINDATA_DESC 0x6u + +/* + * Compound type definitions + */ +#define KCDATA_TYPE_ARRAY 0x11u /* Array of data OBSOLETE DONT USE THIS*/ +#define KCDATA_TYPE_TYPEDEFINTION 0x12u /* Meta type that describes a type on the fly. */ +#define KCDATA_TYPE_CONTAINER_BEGIN \ + 0x13u /* Container type which has corresponding CONTAINER_END header. \ + * KCDATA_TYPE_CONTAINER_BEGIN has type in the data segment. \ + * Both headers have (uint64_t) ID for matching up nested data. 
\ + */ +#define KCDATA_TYPE_CONTAINER_END 0x14u + +#define KCDATA_TYPE_ARRAY_PAD0 0x20u /* Array of data with 0 byte of padding*/ +#define KCDATA_TYPE_ARRAY_PAD1 0x21u /* Array of data with 1 byte of padding*/ +#define KCDATA_TYPE_ARRAY_PAD2 0x22u /* Array of data with 2 byte of padding*/ +#define KCDATA_TYPE_ARRAY_PAD3 0x23u /* Array of data with 3 byte of padding*/ +#define KCDATA_TYPE_ARRAY_PAD4 0x24u /* Array of data with 4 byte of padding*/ +#define KCDATA_TYPE_ARRAY_PAD5 0x25u /* Array of data with 5 byte of padding*/ +#define KCDATA_TYPE_ARRAY_PAD6 0x26u /* Array of data with 6 byte of padding*/ +#define KCDATA_TYPE_ARRAY_PAD7 0x27u /* Array of data with 7 byte of padding*/ +#define KCDATA_TYPE_ARRAY_PAD8 0x28u /* Array of data with 8 byte of padding*/ +#define KCDATA_TYPE_ARRAY_PAD9 0x29u /* Array of data with 9 byte of padding*/ +#define KCDATA_TYPE_ARRAY_PADa 0x2au /* Array of data with a byte of padding*/ +#define KCDATA_TYPE_ARRAY_PADb 0x2bu /* Array of data with b byte of padding*/ +#define KCDATA_TYPE_ARRAY_PADc 0x2cu /* Array of data with c byte of padding*/ +#define KCDATA_TYPE_ARRAY_PADd 0x2du /* Array of data with d byte of padding*/ +#define KCDATA_TYPE_ARRAY_PADe 0x2eu /* Array of data with e byte of padding*/ +#define KCDATA_TYPE_ARRAY_PADf 0x2fu /* Array of data with f byte of padding*/ + +/* + * Generic data types that are most commonly used + */ +#define KCDATA_TYPE_LIBRARY_LOADINFO 0x30u /* struct dyld_uuid_info_32 */ +#define KCDATA_TYPE_LIBRARY_LOADINFO64 0x31u /* struct dyld_uuid_info_64 */ +#define KCDATA_TYPE_TIMEBASE 0x32u /* struct mach_timebase_info */ +#define KCDATA_TYPE_MACH_ABSOLUTE_TIME 0x33u /* uint64_t */ +#define KCDATA_TYPE_TIMEVAL 0x34u /* struct timeval64 */ +#define KCDATA_TYPE_USECS_SINCE_EPOCH 0x35u /* time in usecs uint64_t */ +#define KCDATA_TYPE_PID 0x36u /* int32_t */ +#define KCDATA_TYPE_PROCNAME 0x37u /* char * */ +#define KCDATA_TYPE_NESTED_KCDATA 0x38u /* nested kcdata buffer */ + +#define 
KCDATA_TYPE_BUFFER_END 0xF19158EDu + +/* MAGIC numbers defined for each class of chunked data + * + * To future-proof against big-endian arches, make sure none of these magic + * numbers are byteswaps of each other + */ + +#define KCDATA_BUFFER_BEGIN_CRASHINFO 0xDEADF157u /* owner: corpses/task_corpse.h */ + /* type-range: 0x800 - 0x8ff */ +#define KCDATA_BUFFER_BEGIN_STACKSHOT 0x59a25807u /* owner: sys/stackshot.h */ + /* type-range: 0x900 - 0x93f */ +#define KCDATA_BUFFER_BEGIN_DELTA_STACKSHOT 0xDE17A59Au /* owner: sys/stackshot.h */ + /* type-range: 0x940 - 0x9ff */ +#define KCDATA_BUFFER_BEGIN_OS_REASON 0x53A20900u /* owner: sys/reason.h */ + /* type-range: 0x1000-0x103f */ +#define KCDATA_BUFFER_BEGIN_XNUPOST_CONFIG 0x1e21c09fu /* owner: osfmk/tests/kernel_tests.c */ + /* type-range: 0x1040-0x105f */ + +/* next type range number available 0x1060 */ +/**************** definitions for XNUPOST *********************/ +#define XNUPOST_KCTYPE_TESTCONFIG 0x1040 + +/**************** definitions for stackshot *********************/ + +/* This value must always match IO_NUM_PRIORITIES defined in thread_info.h */ +#define STACKSHOT_IO_NUM_PRIORITIES 4 +/* This value must always match MAXTHREADNAMESIZE used in bsd */ +#define STACKSHOT_MAX_THREAD_NAME_SIZE 64 + +/* + * NOTE: Please update kcdata/libkdd/kcdtypes.c if you make any changes + * in STACKSHOT_KCTYPE_* types. 
+ */ +#define STACKSHOT_KCTYPE_IOSTATS 0x901u /* io_stats_snapshot */ +#define STACKSHOT_KCTYPE_GLOBAL_MEM_STATS 0x902u /* struct mem_and_io_snapshot */ +#define STACKSHOT_KCCONTAINER_TASK 0x903u +#define STACKSHOT_KCCONTAINER_THREAD 0x904u +#define STACKSHOT_KCTYPE_TASK_SNAPSHOT 0x905u /* task_snapshot_v2 */ +#define STACKSHOT_KCTYPE_THREAD_SNAPSHOT 0x906u /* thread_snapshot_v2, thread_snapshot_v3 */ +#define STACKSHOT_KCTYPE_DONATING_PIDS 0x907u /* int[] */ +#define STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO 0x908u /* same as KCDATA_TYPE_LIBRARY_LOADINFO64 */ +#define STACKSHOT_KCTYPE_THREAD_NAME 0x909u /* char[] */ +#define STACKSHOT_KCTYPE_KERN_STACKFRAME 0x90Au /* struct stack_snapshot_frame32 */ +#define STACKSHOT_KCTYPE_KERN_STACKFRAME64 0x90Bu /* struct stack_snapshot_frame64 */ +#define STACKSHOT_KCTYPE_USER_STACKFRAME 0x90Cu /* struct stack_snapshot_frame32 */ +#define STACKSHOT_KCTYPE_USER_STACKFRAME64 0x90Du /* struct stack_snapshot_frame64 */ +#define STACKSHOT_KCTYPE_BOOTARGS 0x90Eu /* boot args string */ +#define STACKSHOT_KCTYPE_OSVERSION 0x90Fu /* os version string */ +#define STACKSHOT_KCTYPE_KERN_PAGE_SIZE 0x910u /* kernel page size in uint32_t */ +#define STACKSHOT_KCTYPE_JETSAM_LEVEL 0x911u /* jetsam level in uint32_t */ +#define STACKSHOT_KCTYPE_DELTA_SINCE_TIMESTAMP 0x912u /* timestamp used for the delta stackshot */ + +#define STACKSHOT_KCTYPE_TASK_DELTA_SNAPSHOT 0x940u /* task_delta_snapshot_v2 */ +#define STACKSHOT_KCTYPE_THREAD_DELTA_SNAPSHOT 0x941u /* thread_delta_snapshot_v2 */ + +#define STACKSHOT_KCTYPE_KERN_STACKLR 0x913u /* uint32_t */ +#define STACKSHOT_KCTYPE_KERN_STACKLR64 0x914u /* uint64_t */ +#define STACKSHOT_KCTYPE_USER_STACKLR 0x915u /* uint32_t */ +#define STACKSHOT_KCTYPE_USER_STACKLR64 0x916u /* uint64_t */ +#define STACKSHOT_KCTYPE_NONRUNNABLE_TIDS 0x917u /* uint64_t */ +#define STACKSHOT_KCTYPE_NONRUNNABLE_TASKS 0x918u /* uint64_t */ +#define STACKSHOT_KCTYPE_CPU_TIMES 0x919u /* struct stackshot_cpu_times */ +#define 
STACKSHOT_KCTYPE_STACKSHOT_DURATION 0x91au /* struct stackshot_duration */ +#define STACKSHOT_KCTYPE_STACKSHOT_FAULT_STATS 0x91bu /* struct stackshot_fault_stats */ +#define STACKSHOT_KCTYPE_KERNELCACHE_LOADINFO 0x91cu /* kernelcache UUID -- same as KCDATA_TYPE_LIBRARY_LOADINFO64 */ + +struct stack_snapshot_frame32 { + uint32_t lr; + uint32_t sp; +}; + +struct stack_snapshot_frame64 { + uint64_t lr; + uint64_t sp; +}; + +struct dyld_uuid_info_32 { + uint32_t imageLoadAddress; /* base address image is mapped at */ + uuid_t imageUUID; +}; + +struct dyld_uuid_info_64 { + uint64_t imageLoadAddress; /* XXX image slide */ + uuid_t imageUUID; +}; + +struct dyld_uuid_info_64_v2 { + uint64_t imageLoadAddress; /* XXX image slide */ + uuid_t imageUUID; + /* end of version 1 of dyld_uuid_info_64. sizeof v1 was 24 */ + uint64_t imageSlidBaseAddress; /* slid base address of image */ +}; + +struct user32_dyld_uuid_info { + uint32_t imageLoadAddress; /* base address image is mapped into */ + uuid_t imageUUID; /* UUID of image */ +}; + +struct user64_dyld_uuid_info { + uint64_t imageLoadAddress; /* base address image is mapped into */ + uuid_t imageUUID; /* UUID of image */ +}; + +enum task_snapshot_flags { + kTaskRsrcFlagged = 0x4, // In the EXC_RESOURCE danger zone? 
+ kTerminatedSnapshot = 0x8, + kPidSuspended = 0x10, // true for suspended task + kFrozen = 0x20, // true for hibernated task (along with pidsuspended) + kTaskDarwinBG = 0x40, + kTaskExtDarwinBG = 0x80, + kTaskVisVisible = 0x100, + kTaskVisNonvisible = 0x200, + kTaskIsForeground = 0x400, + kTaskIsBoosted = 0x800, + kTaskIsSuppressed = 0x1000, + kTaskIsTimerThrottled = 0x2000, /* deprecated */ + kTaskIsImpDonor = 0x4000, + kTaskIsLiveImpDonor = 0x8000, + kTaskIsDirty = 0x10000, + kTaskWqExceededConstrainedThreadLimit = 0x20000, + kTaskWqExceededTotalThreadLimit = 0x40000, + kTaskWqFlagsAvailable = 0x80000, + kTaskUUIDInfoFaultedIn = 0x100000, /* successfully faulted in some UUID info */ + kTaskUUIDInfoMissing = 0x200000, /* some UUID info was paged out */ + kTaskUUIDInfoTriedFault = 0x400000, /* tried to fault in UUID info */ + kTaskSharedRegionInfoUnavailable = 0x800000, /* shared region info unavailable */ +}; + +enum thread_snapshot_flags { + kHasDispatchSerial = 0x4, + kStacksPCOnly = 0x8, /* Stack traces have no frame pointers. 
*/ + kThreadDarwinBG = 0x10, /* Thread is darwinbg */ + kThreadIOPassive = 0x20, /* Thread uses passive IO */ + kThreadSuspended = 0x40, /* Thread is suspended */ + kThreadTruncatedBT = 0x80, /* Unmapped pages caused truncated backtrace */ + kGlobalForcedIdle = 0x100, /* Thread performs global forced idle */ + kThreadFaultedBT = 0x200, /* Some thread stack pages were faulted in as part of BT */ + kThreadTriedFaultBT = 0x400, /* We tried to fault in thread stack pages as part of BT */ + kThreadOnCore = 0x800, /* Thread was on-core when we entered debugger context */ + kThreadIdleWorker = 0x1000, /* Thread is an idle libpthread worker thread */ +}; + +struct mem_and_io_snapshot { + uint32_t snapshot_magic; + uint32_t free_pages; + uint32_t active_pages; + uint32_t inactive_pages; + uint32_t purgeable_pages; + uint32_t wired_pages; + uint32_t speculative_pages; + uint32_t throttled_pages; + uint32_t filebacked_pages; + uint32_t compressions; + uint32_t decompressions; + uint32_t compressor_size; + int32_t busy_buffer_count; + uint32_t pages_wanted; + uint32_t pages_reclaimed; + uint8_t pages_wanted_reclaimed_valid; // did mach_vm_pressure_monitor succeed? 
+} __attribute__((packed)); + +/* SS_TH_* macros are for ths_state */ +#define SS_TH_WAIT 0x01 /* queued for waiting */ +#define SS_TH_SUSP 0x02 /* stopped or requested to stop */ +#define SS_TH_RUN 0x04 /* running or on runq */ +#define SS_TH_UNINT 0x08 /* waiting uninteruptibly */ +#define SS_TH_TERMINATE 0x10 /* halted at termination */ +#define SS_TH_TERMINATE2 0x20 /* added to termination queue */ +#define SS_TH_IDLE 0x80 /* idling processor */ + +struct thread_snapshot_v2 { + uint64_t ths_thread_id; + uint64_t ths_wait_event; + uint64_t ths_continuation; + uint64_t ths_total_syscalls; + uint64_t ths_voucher_identifier; + uint64_t ths_dqserialnum; + uint64_t ths_user_time; + uint64_t ths_sys_time; + uint64_t ths_ss_flags; + uint64_t ths_last_run_time; + uint64_t ths_last_made_runnable_time; + uint32_t ths_state; + uint32_t ths_sched_flags; + int16_t ths_base_priority; + int16_t ths_sched_priority; + uint8_t ths_eqos; + uint8_t ths_rqos; + uint8_t ths_rqos_override; + uint8_t ths_io_tier; +} __attribute__((packed)); + +struct thread_snapshot_v3 { + uint64_t ths_thread_id; + uint64_t ths_wait_event; + uint64_t ths_continuation; + uint64_t ths_total_syscalls; + uint64_t ths_voucher_identifier; + uint64_t ths_dqserialnum; + uint64_t ths_user_time; + uint64_t ths_sys_time; + uint64_t ths_ss_flags; + uint64_t ths_last_run_time; + uint64_t ths_last_made_runnable_time; + uint32_t ths_state; + uint32_t ths_sched_flags; + int16_t ths_base_priority; + int16_t ths_sched_priority; + uint8_t ths_eqos; + uint8_t ths_rqos; + uint8_t ths_rqos_override; + uint8_t ths_io_tier; + uint64_t ths_thread_t; +} __attribute__((packed)); + +struct thread_delta_snapshot_v2 { + uint64_t tds_thread_id; + uint64_t tds_voucher_identifier; + uint64_t tds_ss_flags; + uint64_t tds_last_made_runnable_time; + uint32_t tds_state; + uint32_t tds_sched_flags; + int16_t tds_base_priority; + int16_t tds_sched_priority; + uint8_t tds_eqos; + uint8_t tds_rqos; + uint8_t tds_rqos_override; + uint8_t 
tds_io_tier; +} __attribute__ ((packed)); + +struct io_stats_snapshot +{ + /* + * I/O Statistics + * XXX: These fields must be together. + */ + uint64_t ss_disk_reads_count; + uint64_t ss_disk_reads_size; + uint64_t ss_disk_writes_count; + uint64_t ss_disk_writes_size; + uint64_t ss_io_priority_count[STACKSHOT_IO_NUM_PRIORITIES]; + uint64_t ss_io_priority_size[STACKSHOT_IO_NUM_PRIORITIES]; + uint64_t ss_paging_count; + uint64_t ss_paging_size; + uint64_t ss_non_paging_count; + uint64_t ss_non_paging_size; + uint64_t ss_data_count; + uint64_t ss_data_size; + uint64_t ss_metadata_count; + uint64_t ss_metadata_size; + /* XXX: I/O Statistics end */ + +} __attribute__ ((packed)); + +struct task_snapshot_v2 { + uint64_t ts_unique_pid; + uint64_t ts_ss_flags; + uint64_t ts_user_time_in_terminated_threads; + uint64_t ts_system_time_in_terminated_threads; + uint64_t ts_p_start_sec; + uint64_t ts_task_size; + uint64_t ts_max_resident_size; + uint32_t ts_suspend_count; + uint32_t ts_faults; + uint32_t ts_pageins; + uint32_t ts_cow_faults; + uint32_t ts_was_throttled; + uint32_t ts_did_throttle; + uint32_t ts_latency_qos; + int32_t ts_pid; + char ts_p_comm[32]; +} __attribute__ ((packed)); + +struct task_delta_snapshot_v2 { + uint64_t tds_unique_pid; + uint64_t tds_ss_flags; + uint64_t tds_user_time_in_terminated_threads; + uint64_t tds_system_time_in_terminated_threads; + uint64_t tds_task_size; + uint64_t tds_max_resident_size; + uint32_t tds_suspend_count; + uint32_t tds_faults; + uint32_t tds_pageins; + uint32_t tds_cow_faults; + uint32_t tds_was_throttled; + uint32_t tds_did_throttle; + uint32_t tds_latency_qos; +} __attribute__ ((packed)); + +struct stackshot_cpu_times { + uint64_t user_usec; + uint64_t system_usec; +} __attribute__((packed)); + +struct stackshot_duration { + uint64_t stackshot_duration; + uint64_t stackshot_duration_outer; +} __attribute__((packed)); + +struct stackshot_fault_stats { + uint32_t sfs_pages_faulted_in; /* number of pages faulted in using 
KDP fault path */ + uint64_t sfs_time_spent_faulting; /* MATUs spent faulting */ + uint64_t sfs_system_max_fault_time; /* MATUs fault time limit per stackshot */ + uint8_t sfs_stopped_faulting; /* we stopped decompressing because we hit the limit */ +} __attribute__((packed)); + +/**************** definitions for crashinfo *********************/ + +/* + * NOTE: Please update kcdata/libkdd/kcdtypes.c if you make any changes + * in TASK_CRASHINFO_* types. + */ + +/* FIXME some of these types aren't clean (fixed width, packed, and defined *here*) */ + +#define TASK_CRASHINFO_BEGIN KCDATA_BUFFER_BEGIN_CRASHINFO +#define TASK_CRASHINFO_STRING_DESC KCDATA_TYPE_STRING_DESC +#define TASK_CRASHINFO_UINT32_DESC KCDATA_TYPE_UINT32_DESC +#define TASK_CRASHINFO_UINT64_DESC KCDATA_TYPE_UINT64_DESC + +#define TASK_CRASHINFO_EXTMODINFO 0x801 +#define TASK_CRASHINFO_BSDINFOWITHUNIQID 0x802 /* struct proc_uniqidentifierinfo */ +#define TASK_CRASHINFO_TASKDYLD_INFO 0x803 +#define TASK_CRASHINFO_UUID 0x804 +#define TASK_CRASHINFO_PID 0x805 +#define TASK_CRASHINFO_PPID 0x806 +#define TASK_CRASHINFO_RUSAGE 0x807 /* struct rusage DEPRECATED do not use. 
+ This struct has longs in it */ +#define TASK_CRASHINFO_RUSAGE_INFO 0x808 /* struct rusage_info_v3 from resource.h */ +#define TASK_CRASHINFO_PROC_NAME 0x809 /* char * */ +#define TASK_CRASHINFO_PROC_STARTTIME 0x80B /* struct timeval64 */ +#define TASK_CRASHINFO_USERSTACK 0x80C /* uint64_t */ +#define TASK_CRASHINFO_ARGSLEN 0x80D +#define TASK_CRASHINFO_EXCEPTION_CODES 0x80E /* mach_exception_data_t */ +#define TASK_CRASHINFO_PROC_PATH 0x80F /* string of len MAXPATHLEN */ +#define TASK_CRASHINFO_PROC_CSFLAGS 0x810 /* uint32_t */ +#define TASK_CRASHINFO_PROC_STATUS 0x811 /* char */ +#define TASK_CRASHINFO_UID 0x812 /* uid_t */ +#define TASK_CRASHINFO_GID 0x813 /* gid_t */ +#define TASK_CRASHINFO_PROC_ARGC 0x814 /* int */ +#define TASK_CRASHINFO_PROC_FLAGS 0x815 /* unsigned int */ +#define TASK_CRASHINFO_CPUTYPE 0x816 /* cpu_type_t */ +#define TASK_CRASHINFO_WORKQUEUEINFO 0x817 /* struct proc_workqueueinfo */ +#define TASK_CRASHINFO_RESPONSIBLE_PID 0x818 /* pid_t */ +#define TASK_CRASHINFO_DIRTY_FLAGS 0x819 /* int */ +#define TASK_CRASHINFO_CRASHED_THREADID 0x81A /* uint64_t */ +#define TASK_CRASHINFO_COALITION_ID 0x81B /* uint64_t */ +#define TASK_CRASHINFO_UDATA_PTRS 0x81C /* uint64_t */ +#define TASK_CRASHINFO_MEMORY_LIMIT 0x81D /* uint64_t */ + +#define TASK_CRASHINFO_END KCDATA_TYPE_BUFFER_END + +/**************** definitions for os reasons *********************/ + +#define EXIT_REASON_SNAPSHOT 0x1001 +#define EXIT_REASON_USER_DESC 0x1002 /* string description of reason */ +#define EXIT_REASON_USER_PAYLOAD 0x1003 /* user payload data */ +#define EXIT_REASON_CODESIGNING_INFO 0x1004 + +struct exit_reason_snapshot { + uint32_t ers_namespace; + uint64_t ers_code; + /* end of version 1 of exit_reason_snapshot. 
sizeof v1 was 12 */ + uint64_t ers_flags; +} __attribute__((packed)); + +#define EXIT_REASON_CODESIG_PATH_MAX 1024 + +struct codesigning_exit_reason_info { + uint64_t ceri_virt_addr; + uint64_t ceri_file_offset; + char ceri_pathname[EXIT_REASON_CODESIG_PATH_MAX]; + char ceri_filename[EXIT_REASON_CODESIG_PATH_MAX]; + uint64_t ceri_codesig_modtime_secs; + uint64_t ceri_codesig_modtime_nsecs; + uint64_t ceri_page_modtime_secs; + uint64_t ceri_page_modtime_nsecs; + uint8_t ceri_path_truncated; + uint8_t ceri_object_codesigned; + uint8_t ceri_page_codesig_validated; + uint8_t ceri_page_codesig_tainted; + uint8_t ceri_page_codesig_nx; + uint8_t ceri_page_wpmapped; + uint8_t ceri_page_slid; + uint8_t ceri_page_dirty; + uint32_t ceri_page_shadow_depth; +} __attribute__((packed)); + +#define EXIT_REASON_USER_DESC_MAX_LEN 1024 +#define EXIT_REASON_PAYLOAD_MAX_LEN 2048 +/**************** safe iterators *********************/ + +typedef struct kcdata_iter { + kcdata_item_t item; + void *end; +} kcdata_iter_t; + + +static inline +kcdata_iter_t kcdata_iter(void *buffer, unsigned long size) { + kcdata_iter_t iter; + iter.item = (kcdata_item_t) buffer; + iter.end = (void*) (((uintptr_t)buffer) + size); + return iter; +} + +static inline +kcdata_iter_t kcdata_iter_unsafe(void *buffer) __attribute__((deprecated)); + +static inline +kcdata_iter_t kcdata_iter_unsafe(void *buffer) { + kcdata_iter_t iter; + iter.item = (kcdata_item_t) buffer; + iter.end = (void*) (uintptr_t) ~0; + return iter; +} + +static const kcdata_iter_t kcdata_invalid_iter = { .item = 0, .end = 0 }; + +static inline +int kcdata_iter_valid(kcdata_iter_t iter) { + return + ( (uintptr_t)iter.item + sizeof(struct kcdata_item) <= (uintptr_t)iter.end ) && + ( (uintptr_t)iter.item + sizeof(struct kcdata_item) + iter.item->size <= (uintptr_t)iter.end); +} + + +static inline +kcdata_iter_t kcdata_iter_next(kcdata_iter_t iter) { + iter.item = (kcdata_item_t) (((uintptr_t)iter.item) + sizeof(struct kcdata_item) + 
(iter.item->size)); + return iter; +} + +static inline uint32_t +kcdata_iter_type(kcdata_iter_t iter) +{ + if ((iter.item->type & ~0xfu) == KCDATA_TYPE_ARRAY_PAD0) + return KCDATA_TYPE_ARRAY; + else + return iter.item->type; +} + +static inline uint32_t +kcdata_calc_padding(uint32_t size) +{ + /* calculate number of bytes to add to size to get something divisible by 16 */ + return (-size) & 0xf; +} + +static inline uint32_t +kcdata_flags_get_padding(uint64_t flags) +{ + return flags & KCDATA_FLAGS_STRUCT_PADDING_MASK; +} + +/* see comment above about has_padding */ +static inline int +kcdata_iter_is_legacy_item(kcdata_iter_t iter, uint32_t legacy_size) +{ + uint32_t legacy_size_padded = legacy_size + kcdata_calc_padding(legacy_size); + return (iter.item->size == legacy_size_padded && + (iter.item->flags & (KCDATA_FLAGS_STRUCT_PADDING_MASK | KCDATA_FLAGS_STRUCT_HAS_PADDING)) == 0); + +} + +static inline uint32_t +kcdata_iter_size(kcdata_iter_t iter) +{ + uint32_t legacy_size = 0; + + switch (kcdata_iter_type(iter)) { + case KCDATA_TYPE_ARRAY: + case KCDATA_TYPE_CONTAINER_BEGIN: + return iter.item->size; + case STACKSHOT_KCTYPE_THREAD_SNAPSHOT: { + legacy_size = sizeof(struct thread_snapshot_v2); + if (kcdata_iter_is_legacy_item(iter, legacy_size)) { + return legacy_size; + } + + goto not_legacy; + } + case STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO: { + legacy_size = sizeof(struct dyld_uuid_info_64); + if (kcdata_iter_is_legacy_item(iter, legacy_size)) { + return legacy_size; + } + + goto not_legacy; + } +not_legacy: + default: + if (iter.item->size < kcdata_flags_get_padding(iter.item->flags)) + return 0; + else + return iter.item->size - kcdata_flags_get_padding(iter.item->flags); + } +} + +static inline uint64_t +kcdata_iter_flags(kcdata_iter_t iter) +{ + return iter.item->flags; +} + +static inline +void * kcdata_iter_payload(kcdata_iter_t iter) { + return &iter.item->data; +} + + +static inline +uint32_t kcdata_iter_array_elem_type(kcdata_iter_t iter) { + return
(iter.item->flags >> 32) & UINT32_MAX; +} + +static inline +uint32_t kcdata_iter_array_elem_count(kcdata_iter_t iter) { + return (iter.item->flags) & UINT32_MAX; +} + +/* KCDATA_TYPE_ARRAY is ambiguous about the size of the array elements. Size is + * calculated as total_size / elements_count, but total size got padded out to a + * 16 byte alignment. New kernels will generate KCDATA_TYPE_ARRAY_PAD* instead + * to explicitly tell us how much padding was used. Here we have a fixed, never + * to be altered list of the sizes of array elements that were used before I + * discovered this issue. If you find a KCDATA_TYPE_ARRAY that is not one of + * these types, treat it as invalid data. */ + +static inline +uint32_t +kcdata_iter_array_size_switch(kcdata_iter_t iter) { + switch(kcdata_iter_array_elem_type(iter)) { + case KCDATA_TYPE_LIBRARY_LOADINFO: + return sizeof(struct dyld_uuid_info_32); + case KCDATA_TYPE_LIBRARY_LOADINFO64: + return sizeof(struct dyld_uuid_info_64); + case STACKSHOT_KCTYPE_KERN_STACKFRAME: + case STACKSHOT_KCTYPE_USER_STACKFRAME: + return sizeof(struct stack_snapshot_frame32); + case STACKSHOT_KCTYPE_KERN_STACKFRAME64: + case STACKSHOT_KCTYPE_USER_STACKFRAME64: + return sizeof(struct stack_snapshot_frame64); + case STACKSHOT_KCTYPE_DONATING_PIDS: + return sizeof(int32_t); + case STACKSHOT_KCTYPE_THREAD_DELTA_SNAPSHOT: + return sizeof(struct thread_delta_snapshot_v2); + // This one is only here to make some unit tests work. It should be OK to + // remove. 
+ case TASK_CRASHINFO_CRASHED_THREADID: + return sizeof(uint64_t); + default: + return 0; + } +} + +static inline +int kcdata_iter_array_valid(kcdata_iter_t iter) { + if (!kcdata_iter_valid(iter)) + return 0; + if (kcdata_iter_type(iter) != KCDATA_TYPE_ARRAY) + return 0; + if (kcdata_iter_array_elem_count(iter) == 0) + return iter.item->size == 0; + if (iter.item->type == KCDATA_TYPE_ARRAY) { + uint32_t elem_size = kcdata_iter_array_size_switch(iter); + if (elem_size == 0) + return 0; + /* sizes get aligned to the nearest 16. */ + return + kcdata_iter_array_elem_count(iter) <= iter.item->size / elem_size && + iter.item->size % kcdata_iter_array_elem_count(iter) < 16; + } else { + return + (iter.item->type & 0xf) <= iter.item->size && + kcdata_iter_array_elem_count(iter) <= iter.item->size - (iter.item->type & 0xf) && + (iter.item->size - (iter.item->type & 0xf)) % kcdata_iter_array_elem_count(iter) == 0; + } +} + + +static inline +uint32_t kcdata_iter_array_elem_size(kcdata_iter_t iter) { + if (iter.item->type == KCDATA_TYPE_ARRAY) + return kcdata_iter_array_size_switch(iter); + if (kcdata_iter_array_elem_count(iter) == 0) + return 0; + return (iter.item->size - (iter.item->type & 0xf)) / kcdata_iter_array_elem_count(iter); +} + +static inline +int kcdata_iter_container_valid(kcdata_iter_t iter) { + return + kcdata_iter_valid(iter) && + kcdata_iter_type(iter) == KCDATA_TYPE_CONTAINER_BEGIN && + iter.item->size >= sizeof(uint32_t); +} + +static inline +uint32_t kcdata_iter_container_type(kcdata_iter_t iter) { + return * (uint32_t *) kcdata_iter_payload(iter); +} + +static inline +uint64_t kcdata_iter_container_id(kcdata_iter_t iter) { + return iter.item->flags; +} + + +#define KCDATA_ITER_FOREACH(iter) for(; kcdata_iter_valid(iter) && iter.item->type != KCDATA_TYPE_BUFFER_END; iter = kcdata_iter_next(iter)) +#define KCDATA_ITER_FOREACH_FAILED(iter) (!kcdata_iter_valid(iter) || (iter).item->type != KCDATA_TYPE_BUFFER_END) + +static inline +kcdata_iter_t 
+kcdata_iter_find_type(kcdata_iter_t iter, uint32_t type) +{ + KCDATA_ITER_FOREACH(iter) + { + if (kcdata_iter_type(iter) == type) + return iter; + } + return kcdata_invalid_iter; +} + +static inline +int kcdata_iter_data_with_desc_valid(kcdata_iter_t iter, uint32_t minsize) { + return + kcdata_iter_valid(iter) && + kcdata_iter_size(iter) >= KCDATA_DESC_MAXLEN + minsize && + ((char*)kcdata_iter_payload(iter))[KCDATA_DESC_MAXLEN-1] == 0; +} + +static inline +char *kcdata_iter_string(kcdata_iter_t iter, uint32_t offset) { + if (offset > kcdata_iter_size(iter)) { + return NULL; + } + uint32_t maxlen = kcdata_iter_size(iter) - offset; + char *s = ((char*)kcdata_iter_payload(iter)) + offset; + if (strnlen(s, maxlen) < maxlen) { + return s; + } else { + return NULL; + } +} + +static inline void kcdata_iter_get_data_with_desc(kcdata_iter_t iter, char **desc_ptr, void **data_ptr, uint32_t *size_ptr) { + if (desc_ptr) + *desc_ptr = (char *)kcdata_iter_payload(iter); + if (data_ptr) + *data_ptr = (void *)((uintptr_t)kcdata_iter_payload(iter) + KCDATA_DESC_MAXLEN); + if (size_ptr) + *size_ptr = kcdata_iter_size(iter) - KCDATA_DESC_MAXLEN; +} + +#endif diff --git a/osfmk/kern/kern_cdata.c b/osfmk/kern/kern_cdata.c index 503032ae7..46499f452 100644 --- a/osfmk/kern/kern_cdata.c +++ b/osfmk/kern/kern_cdata.c @@ -38,94 +38,26 @@ #include #include +static kern_return_t kcdata_get_memory_addr_with_flavor(kcdata_descriptor_t data, uint32_t type, uint32_t size, uint64_t flags, mach_vm_address_t *user_addr); + /* + * Estimates how large of a buffer that should be allocated for a buffer that will contain + * num_items items of known types with overall length payload_size. 
* - * The format for data is setup in a generic format as follows - * - * Layout of data structure: - * - * | 8 - bytes | - * | type = MAGIC | LENGTH | - * | 0 | - * | type | size | - * | flags | - * | data | - * |___________data____________| - * | type | size | - * | flags | - * |___________data____________| - * | type = END | size=0 | - * | 0 | - * - * - * The type field describes what kind of data is passed. For example type = TASK_CRASHINFO_UUID means the following data is a uuid. - * These types need to be defined in task_corpses.h for easy consumption by userspace inspection tools. - * - * Some range of types is reserved for special types like ints, longs etc. A cool new functionality made possible with this - * extensible data format is that kernel can decide to put more information as required without requiring user space tools to - * re-compile to be compatible. The case of rusage struct versions could be introduced without breaking existing tools. - * - * Feature description: Generic data with description - * ------------------- - * Further more generic data with description is very much possible now. For example - * - * - kcdata_add_uint64_with_description(cdatainfo, 0x700, "NUM MACH PORTS"); - * - and more functions that allow adding description. - * The userspace tools can then look at the description and print the data even if they are not compiled with knowledge of the field apriori. - * - * Example data: - * 0000 57 f1 ad de 00 00 00 00 00 00 00 00 00 00 00 00 W............... - * 0010 01 00 00 00 00 00 00 00 30 00 00 00 00 00 00 00 ........0....... - * 0020 50 49 44 00 00 00 00 00 00 00 00 00 00 00 00 00 PID............. - * 0030 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ - * 0040 9c 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ - * 0050 01 00 00 00 00 00 00 00 30 00 00 00 00 00 00 00 ........0....... - * 0060 50 41 52 45 4e 54 20 50 49 44 00 00 00 00 00 00 PARENT PID...... 
- * 0070 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ - * 0080 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ - * 0090 ed 58 91 f1 - * - * Feature description: Container markers for compound data - * ------------------ - * If a given kernel data type is complex and requires adding multiple optional fields inside a container - * object for a consumer to understand arbitrary data, we package it using container markers. - * - * For example, the stackshot code gathers information and describes the state of a given task with respect - * to many subsystems. It includes data such as io stats, vm counters, process names/flags and syscall counts. - * - * kcdata_add_container_marker(kcdata_p, KCDATA_TYPE_CONTAINER_BEGIN, STACKSHOT_KCCONTAINER_TASK, task_uniqueid); - * // add multiple data, or add__with_description()s here - * - * kcdata_add_container_marker(kcdata_p, KCDATA_TYPE_CONTAINER_END, STACKSHOT_KCCONTAINER_TASK, task_uniqueid); - * - * Feature description: Custom Data formats on demand - * -------------------- - * With the self describing nature of format, the kernel provider can describe a data type (uniquely identified by a number) and use - * it in the buffer for sending data. The consumer can parse the type information and have knowledge of describing incoming data. - * Following is an example of how we can describe a kernel specific struct sample_disk_io_stats in buffer. 
- * - * struct sample_disk_io_stats { - * uint64_t disk_reads_count; - * uint64_t disk_reads_size; - * uint64_t io_priority_count[4]; - * uint64_t io_priority_size; - * } __attribute__ ((packed)); - * - * - * struct kcdata_subtype_descriptor disk_io_stats_def[] = { - * {KCS_SUBTYPE_FLAGS_NONE, KC_ST_UINT64, 0 * sizeof(uint64_t), sizeof(uint64_t), "disk_reads_count"}, - * {KCS_SUBTYPE_FLAGS_NONE, KC_ST_UINT64, 1 * sizeof(uint64_t), sizeof(uint64_t), "disk_reads_size"}, - * {KCS_SUBTYPE_FLAGS_ARRAY, KC_ST_UINT64, 2 * sizeof(uint64_t), KCS_SUBTYPE_PACK_SIZE(4, sizeof(uint64_t)), "io_priority_count"}, - * {KCS_SUBTYPE_FLAGS_ARRAY, KC_ST_UINT64, (2 + 4) * sizeof(uint64_t), sizeof(uint64_t), "io_priority_size"}, - * }; - * - * Now you can add this custom type definition into the buffer as - * kcdata_add_type_definition(kcdata_p, KCTYPE_SAMPLE_DISK_IO_STATS, "sample_disk_io_stats", - * &disk_io_stats_def[0], sizeof(disk_io_stats_def)/sizeof(struct kcdata_subtype_descriptor)); - * + * NOTE: This function will not give an accurate estimate for buffers that will + * contain unknown types (those with string descriptions). 
*/ +uint32_t kcdata_estimate_required_buffer_size(uint32_t num_items, uint32_t payload_size) +{ + /* + * In the worst case each item will need (KCDATA_ALIGNMENT_SIZE - 1) padding + */ + uint32_t max_padding_bytes = num_items * (KCDATA_ALIGNMENT_SIZE - 1); + uint32_t item_description_bytes = num_items * sizeof(struct kcdata_item); + uint32_t begin_and_end_marker_bytes = 2 * sizeof(struct kcdata_item); -static kern_return_t kcdata_get_memory_addr_with_flavor(kcdata_descriptor_t data, uint32_t type, uint32_t size, uint64_t flags, mach_vm_address_t *user_addr); + return max_padding_bytes + item_description_bytes + begin_and_end_marker_bytes + payload_size; +} kcdata_descriptor_t kcdata_memory_alloc_init(mach_vm_address_t buffer_addr_p, unsigned data_type, unsigned size, unsigned flags) { @@ -196,21 +128,40 @@ kern_return_t kcdata_memory_destroy(kcdata_descriptor_t data) /* * Routine: kcdata_get_memory_addr * Desc: get memory address in the userspace memory for corpse info - * NOTE: The caller is responsible to zero the resulting memory or - * user other means to mark memory if it has failed populating the + * NOTE: The caller is responsible for zeroing the resulting memory or + * using other means to mark memory if it has failed populating the * data in middle of operation. * params: data - pointer describing the crash info allocation * type - type of data to be put. See corpse.h for defined types * size - size requested. The header describes this size * returns: mach_vm_address_t address in user memory for copyout(). 
*/ -kern_return_t kcdata_get_memory_addr( - kcdata_descriptor_t data, - uint32_t type, - uint32_t size, - mach_vm_address_t *user_addr) +kern_return_t +kcdata_get_memory_addr(kcdata_descriptor_t data, uint32_t type, uint32_t size, mach_vm_address_t * user_addr) +{ + /* record number of padding bytes as lower 4 bits of flags */ + uint64_t flags = (KCDATA_FLAGS_STRUCT_PADDING_MASK & kcdata_calc_padding(size)) | KCDATA_FLAGS_STRUCT_HAS_PADDING; + return kcdata_get_memory_addr_with_flavor(data, type, size, flags, user_addr); +} + +/* + * Routine: kcdata_write_buffer_end + * + * Desc: Write buffer end marker. This does not advance the end pointer in the + * kcdata_descriptor_t, so it may be used conservatively before additional data + * is added, as long as it is at least called after the last time data is added. + * + * params: data - pointer describing the crash info allocation + */ + +kern_return_t +kcdata_write_buffer_end(kcdata_descriptor_t data) { - return kcdata_get_memory_addr_with_flavor(data, type, size, 0, user_addr); + struct kcdata_item info; + bzero(&info, sizeof(info)); + info.type = KCDATA_TYPE_BUFFER_END; + info.size = 0; + return kcdata_memcpy(data, data->kcd_addr_end, &info, sizeof(info)); } /* @@ -233,14 +184,12 @@ static kern_return_t kcdata_get_memory_addr_with_flavor( } /* make sure 16 byte aligned */ - if (size & 0xf) { - size += (0x10 - (size & 0xf)); - } + size += kcdata_calc_padding(size); bzero(&info, sizeof(info)); - KCDATA_ITEM_TYPE(&info) = type; - KCDATA_ITEM_SIZE(&info) = size; - KCDATA_ITEM_FLAGS(&info) = flags; + info.type = type; + info.size = size; + info.flags = flags; total_size = size + sizeof(info); /* check available memory, including trailer size for KCDATA_TYPE_BUFFER_END */ @@ -259,19 +208,12 @@ static kern_return_t kcdata_get_memory_addr_with_flavor( *user_addr = data->kcd_addr_end; data->kcd_addr_end += size; - /* setup the end header as well */ - bzero(&info, sizeof(info)); - KCDATA_ITEM_TYPE(&info) = KCDATA_TYPE_BUFFER_END;
- KCDATA_ITEM_SIZE(&info) = 0; - - if (data->kcd_flags & KCFLAG_USE_COPYOUT) { - if (copyout(&info, data->kcd_addr_end, sizeof(info))) - return KERN_NO_ACCESS; + if (!(data->kcd_flags & KCFLAG_NO_AUTO_ENDBUFFER)) { + /* setup the end header as well */ + return kcdata_write_buffer_end(data); } else { - memcpy((void *)data->kcd_addr_end, &info, sizeof(info)); + return KERN_SUCCESS; } - - return KERN_SUCCESS; } /* @@ -294,10 +236,14 @@ kern_return_t kcdata_get_memory_addr_for_array( uint32_t count, mach_vm_address_t *user_addr) { - uint64_t flags = type_of_element; - flags = (flags << 32) | count; + /* for arrays we record the number of padding bytes as the low-order 4 bits + * of the type field. KCDATA_TYPE_ARRAY_PAD{x} means x bytes of pad. */ + uint64_t flags = type_of_element; + flags = (flags << 32) | count; uint32_t total_size = count * size_of_element; - return kcdata_get_memory_addr_with_flavor(data, KCDATA_TYPE_ARRAY, total_size, flags, user_addr); + uint32_t pad = kcdata_calc_padding(total_size); + + return kcdata_get_memory_addr_with_flavor(data, KCDATA_TYPE_ARRAY_PAD0 | pad, total_size, flags, user_addr); } /* @@ -329,6 +275,28 @@ kern_return_t kcdata_add_container_marker( return kr; } +/* + * Routine: kcdata_undo_add_container_begin + * Desc: call this after adding a container begin but before adding anything else to revert. + */ +kern_return_t +kcdata_undo_add_container_begin(kcdata_descriptor_t data) +{ + /* + * the payload of a container begin is a single uint64_t. It is padded out + * to 16 bytes.
+ */ + const mach_vm_address_t padded_payload_size = 16; + data->kcd_addr_end -= sizeof(struct kcdata_item) + padded_payload_size; + + if (!(data->kcd_flags & KCFLAG_NO_AUTO_ENDBUFFER)) { + /* setup the end header as well */ + return kcdata_write_buffer_end(data); + } else { + return KERN_SUCCESS; + } +} + /* * Routine: kcdata_memcpy * Desc: a common function to copy data out based on either copyout or memcopy flags @@ -373,6 +341,7 @@ kern_return_t kcdata_add_type_definition( struct kcdata_type_definition kc_type_definition; mach_vm_address_t user_addr; uint32_t total_size = sizeof(struct kcdata_type_definition); + bzero(&kc_type_definition, sizeof(kc_type_definition)); if (strnlen(type_name, KCDATA_DESC_MAXLEN + 1) >= KCDATA_DESC_MAXLEN) return KERN_INVALID_ARGUMENT; @@ -381,7 +350,9 @@ kern_return_t kcdata_add_type_definition( kc_type_definition.kct_type_identifier = type_id; total_size += elements_count * sizeof(struct kcdata_subtype_descriptor); - if (KERN_SUCCESS != (kr = kcdata_get_memory_addr_with_flavor(data, KCDATA_TYPE_TYPEDEFINTION, total_size, 0, &user_addr))) + /* record number of padding bytes as lower 4 bits of flags */ + if (KERN_SUCCESS != (kr = kcdata_get_memory_addr_with_flavor(data, KCDATA_TYPE_TYPEDEFINTION, total_size, + kcdata_calc_padding(total_size), &user_addr))) return kr; if (KERN_SUCCESS != (kr = kcdata_memcpy(data, user_addr, (void *)&kc_type_definition, sizeof(struct kcdata_type_definition)))) return kr; @@ -406,10 +377,8 @@ struct _uint32_with_description_data { #pragma pack() -kern_return_t kcdata_add_uint64_with_description( - kcdata_descriptor_t data_desc, - uint64_t data, - const char *description) +kern_return_t +kcdata_add_uint64_with_description(kcdata_descriptor_t data_desc, uint64_t data, const char * description) { if (strnlen(description, KCDATA_DESC_MAXLEN + 1) >= KCDATA_DESC_MAXLEN) return KERN_INVALID_ARGUMENT; diff --git a/osfmk/kern/kern_cdata.h b/osfmk/kern/kern_cdata.h index ac02b62d7..fd7543342 100644 --- 
a/osfmk/kern/kern_cdata.h +++ b/osfmk/kern/kern_cdata.h @@ -29,218 +29,80 @@ #ifndef _KERN_CDATA_H_ #define _KERN_CDATA_H_ -#include +#include #include -#define KCDATA_DESC_MAXLEN 32 /* including NULL byte at end */ - -struct kcdata_item { - uint32_t type; - uint32_t size; /* len(data) */ - uint64_t flags; -#ifndef KERNEL - char data[]; /* must be at the end */ -#endif -}; - -typedef struct kcdata_item * kcdata_item_t; - -enum KCDATA_SUBTYPE_TYPES { KC_ST_CHAR = 1, KC_ST_INT8, KC_ST_UINT8, KC_ST_INT16, KC_ST_UINT16, KC_ST_INT32, KC_ST_UINT32, KC_ST_INT64, KC_ST_UINT64 }; -typedef enum KCDATA_SUBTYPE_TYPES kctype_subtype_t; - -/* - * A subtype description structure that defines - * how a compound data is laid out in memory. This - * provides on the fly definition of types and consumption - * by the parser. - */ -struct kcdata_subtype_descriptor { - uint8_t kcs_flags; -#define KCS_SUBTYPE_FLAGS_NONE 0x0 -#define KCS_SUBTYPE_FLAGS_ARRAY 0x1 - uint8_t kcs_elem_type; /* restricted to kctype_subtype_t */ - uint16_t kcs_elem_offset; /* offset in struct where data is found */ - uint32_t kcs_elem_size; /* size of element (or) packed state for array type */ - char kcs_name[KCDATA_DESC_MAXLEN]; /* max 31 bytes for name of field */ -}; - -typedef struct kcdata_subtype_descriptor * kcdata_subtype_descriptor_t; - /* - * In case of array of basic c types in kctype_subtype_t, - * size is packed in lower 16 bits and - * count is packed in upper 16 bits of kcs_elem_size field. 
- */ -#define KCS_SUBTYPE_PACK_SIZE(e_count,e_size) (((e_count) & 0xffff) << 16 | ((e_size) & 0xffff)) - -static inline uint32_t -kcs_get_elem_size(kcdata_subtype_descriptor_t d) -{ - if (d->kcs_flags & KCS_SUBTYPE_FLAGS_ARRAY) { - /* size is composed as ((count &0xffff)<<16 | (elem_size & 0xffff)) */ - return (uint32_t)((d->kcs_elem_size & 0xffff) * ((d->kcs_elem_size & 0xffff0000)>>16)); - } - return d->kcs_elem_size; -} - -static inline uint32_t -kcs_get_elem_count(kcdata_subtype_descriptor_t d) -{ - if (d->kcs_flags & KCS_SUBTYPE_FLAGS_ARRAY) - return (d->kcs_elem_size >> 16) & 0xffff; - return 1; -} - -static inline kern_return_t -kcs_set_elem_size(kcdata_subtype_descriptor_t d, uint32_t size, uint32_t count) -{ - if (count > 1) { - /* means we are setting up an array */ - if (size > 0xffff || count > 0xffff) - return KERN_INVALID_ARGUMENT; - d->kcs_elem_size = ((count & 0xffff) << 16 | (size & 0xffff)); - } - else - { - d->kcs_elem_size = size; - } - return KERN_SUCCESS; -} - -struct kcdata_type_definition { - uint32_t kct_type_identifier; - uint32_t kct_num_elements; - char kct_name[KCDATA_DESC_MAXLEN]; -#ifndef KERNEL - struct kcdata_subtype_descriptor kct_elements[]; -#endif -}; - -/* chunk type definitions. 0 - 0x7ff are reserved and defined here - * NOTE: Please update libkdd/kcdata/kcdtypes.c if you make any changes - * in STACKSHOT_KCTYPE_* types. - */ - -/* - * Types with description value. 
- * these will have KCDATA_DESC_MAXLEN-1 length string description - * and rest of KCDATA_ITEM_SIZE() - KCDATA_DESC_MAXLEN bytes as data - */ -#define KCDATA_TYPE_INVALID 0x0 -#define KCDATA_TYPE_STRING_DESC 0x1 -#define KCDATA_TYPE_UINT32_DESC 0x2 -#define KCDATA_TYPE_UINT64_DESC 0x3 -#define KCDATA_TYPE_INT32_DESC 0x4 -#define KCDATA_TYPE_INT64_DESC 0x5 -#define KCDATA_TYPE_BINDATA_DESC 0x6 - -/* - * Compound type definitions - */ -#define KCDATA_TYPE_ARRAY 0x11 /* Array of data */ -#define KCDATA_TYPE_TYPEDEFINTION 0x12 /* Meta type that describes a type on the fly. */ -#define KCDATA_TYPE_CONTAINER_BEGIN 0x13 /* Container type which has corresponding CONTAINER_END header. - * KCDATA_TYPE_CONTAINER_BEGIN has type in the data segment. - * Both headers have (uint64_t) ID for matching up nested data. - */ -#define KCDATA_TYPE_CONTAINER_END 0x14 - - -/* - * Generic data types that are most commonly used + * Do not use these macros! + * + * Instead, you should use kcdata_iter_* functions defined in kcdata.h. These + * macros have no idea where the kcdata buffer ends, so they are all unsafe.
*/ -#define KCDATA_TYPE_LIBRARY_LOADINFO 0x30 /* struct dyld_uuid_info_32 */ -#define KCDATA_TYPE_LIBRARY_LOADINFO64 0x31 /* struct dyld_uuid_info_64 */ -#define KCDATA_TYPE_TIMEBASE 0x32 /* struct mach_timebase_info */ -#define KCDATA_TYPE_MACH_ABSOLUTE_TIME 0x33 /* uint64_t */ -#define KCDATA_TYPE_TIMEVAL 0x34 /* struct timeval64 */ -#define KCDATA_TYPE_USECS_SINCE_EPOCH 0x35 /* time in usecs uint64_t */ - -#define KCDATA_TYPE_BUFFER_END 0xF19158ED - -/* MAGIC numbers defined for each class of chunked data */ -#define KCDATA_BUFFER_BEGIN_CRASHINFO 0xDEADF157 /* owner: corpses/task_corpse.h */ - /* type-range: 0x800 - 0x8ff */ -#define KCDATA_BUFFER_BEGIN_STACKSHOT 0x59a25807 /* owner: sys/stackshot.h */ - /* type-range: 0x900 - 0x9ff */ - -/* next type range number available 0x1000 */ - -/* Common MACROS and library functions */ -/* make header = sizeof(type, flags, size) */ #define KCDATA_ITEM_HEADER_SIZE (sizeof(uint32_t) + sizeof(uint32_t) + sizeof(uint64_t)) -#define KCDATA_ITEM_TYPE(item) (((kcdata_item_t)(item))->type) -#define KCDATA_ITEM_SIZE(item) (((kcdata_item_t)(item))->size) -#define KCDATA_ITEM_FLAGS(item) (((kcdata_item_t)(item))->flags) - -#define KCDATA_ITEM_ARRAY_GET_EL_TYPE(item) ((KCDATA_ITEM_FLAGS(item) >> 32) & UINT32_MAX) -#define KCDATA_ITEM_ARRAY_GET_EL_COUNT(item) (KCDATA_ITEM_FLAGS(item) & UINT32_MAX) -#define KCDATA_ITEM_ARRAY_GET_EL_SIZE(item) (KCDATA_ITEM_SIZE(item) / KCDATA_ITEM_ARRAY_GET_EL_COUNT(item)) - -#define KCDATA_CONTAINER_ID(item) ((uint64_t)KCDATA_ITEM_FLAGS(item)) - -#define KCDATA_ITEM_NEXT_HEADER(item) ((kcdata_item_t)((uint64_t)((uintptr_t)(item)) + KCDATA_ITEM_HEADER_SIZE + KCDATA_ITEM_SIZE(item))) - -#define KCDATA_ITEM_FOREACH(head) for (; KCDATA_ITEM_TYPE(head) != KCDATA_TYPE_BUFFER_END; (head) = KCDATA_ITEM_NEXT_HEADER(head)) - -static inline kcdata_item_t -KCDATA_ITEM_FIND_TYPE(kcdata_item_t head, uint32_t type) -{ - KCDATA_ITEM_FOREACH(head) - { - if (KCDATA_ITEM_TYPE(head) == type) { - break; - } - } - return 
(KCDATA_ITEM_TYPE(head) == type) ? (kcdata_item_t)head : 0; -} - -#ifndef KERNEL -#define KCDATA_ITEM_DATA_PTR(item) (&((kcdata_item_t)(item))->data) - -static inline uint32_t kcdata_get_container_type(kcdata_item_t buffer) { - if (KCDATA_ITEM_TYPE(buffer) == KCDATA_TYPE_CONTAINER_BEGIN) - return *(uint32_t *)KCDATA_ITEM_DATA_PTR(buffer); - return 0; -} - -static inline void kcdata_get_data_with_desc(kcdata_item_t buffer, char **desc_ptr, void **data_ptr) { - if (desc_ptr) - *desc_ptr = (char *)KCDATA_ITEM_DATA_PTR(buffer); - if (data_ptr) - *data_ptr = (void *)((uintptr_t)KCDATA_ITEM_DATA_PTR(buffer) + KCDATA_DESC_MAXLEN); -} -#endif /* KERNEL */ +#define KCDATA_ITEM_ITER(item) kcdata_iter_unsafe((void*)(item)) +#define KCDATA_ITEM_TYPE(item) kcdata_iter_type(KCDATA_ITEM_ITER(item)) +#define KCDATA_ITEM_SIZE(item) kcdata_iter_size(KCDATA_ITEM_ITER(item)) +#define KCDATA_ITEM_FLAGS(item) kcdata_iter_flags(KCDATA_ITEM_ITER(item)) +#define KCDATA_ITEM_ARRAY_GET_EL_TYPE(item) kcdata_iter_array_elem_type(KCDATA_ITEM_ITER(item)) +#define KCDATA_ITEM_ARRAY_GET_EL_COUNT(item) kcdata_iter_array_elem_count(KCDATA_ITEM_ITER(item)) +#define KCDATA_ITEM_ARRAY_GET_EL_SIZE(item) kcdata_iter_array_elem_size(KCDATA_ITEM_ITER(item)) +#define KCDATA_CONTAINER_ID(item) kcdata_iter_container_id(KCDATA_ITEM_ITER(item)) +#define KCDATA_ITEM_NEXT_HEADER(itemx) (kcdata_iter_next(KCDATA_ITEM_ITER(itemx)).item) +#define KCDATA_ITEM_FOREACH(head) for (; KCDATA_ITEM_TYPE(head) != KCDATA_TYPE_BUFFER_END; (head) = KCDATA_ITEM_NEXT_HEADER(head)) +#define KCDATA_ITEM_DATA_PTR(item) kcdata_iter_payload(KCDATA_ITEM_ITER(item)) +#define KCDATA_ITEM_FIND_TYPE(itemx, type) (kcdata_iter_find_type(KCDATA_ITEM_ITER(itemx), type).item) +#define kcdata_get_container_type(buffer) kcdata_iter_container_type(KCDATA_ITEM_ITER(buffer)) +#define kcdata_get_data_with_desc(buf,desc,data) kcdata_iter_get_data_with_desc(KCDATA_ITEM_ITER(buf),desc,data,NULL) +/* Do not use these macros! 
*/ + +#ifdef KERNEL #ifdef XNU_KERNEL_PRIVATE /* Structure to save information about corpse data */ struct kcdata_descriptor { uint32_t kcd_length; - uint32_t kcd_flags; -#define KCFLAG_USE_MEMCOPY 0x0 -#define KCFLAG_USE_COPYOUT 0x1 - mach_vm_address_t kcd_addr_begin; - mach_vm_address_t kcd_addr_end; + uint32_t kcd_flags; +#define KCFLAG_USE_MEMCOPY 0x0 +#define KCFLAG_USE_COPYOUT 0x1 +#define KCFLAG_NO_AUTO_ENDBUFFER 0x2 + mach_vm_address_t kcd_addr_begin; + mach_vm_address_t kcd_addr_end; }; typedef struct kcdata_descriptor * kcdata_descriptor_t; kcdata_descriptor_t kcdata_memory_alloc_init(mach_vm_address_t crash_data_p, unsigned data_type, unsigned size, unsigned flags); -kern_return_t kcdata_memory_static_init(kcdata_descriptor_t data, mach_vm_address_t buffer_addr_p, unsigned data_type, unsigned size, unsigned flags); +kern_return_t kcdata_memory_static_init( + kcdata_descriptor_t data, mach_vm_address_t buffer_addr_p, unsigned data_type, unsigned size, unsigned flags); kern_return_t kcdata_memory_destroy(kcdata_descriptor_t data); -uint64_t kcdata_memory_get_used_bytes(kcdata_descriptor_t kcd); -kern_return_t kcdata_memcpy(kcdata_descriptor_t data, mach_vm_address_t dst_addr, void *src_addr, uint32_t size); +kern_return_t +kcdata_add_container_marker(kcdata_descriptor_t data, uint32_t header_type, uint32_t container_type, uint64_t identifier); +kern_return_t kcdata_add_type_definition(kcdata_descriptor_t data, + uint32_t type_id, + char * type_name, + struct kcdata_subtype_descriptor * elements_array_addr, + uint32_t elements_count); + +kern_return_t kcdata_add_uint64_with_description(kcdata_descriptor_t crashinfo, uint64_t data, const char * description); +kern_return_t kcdata_add_uint32_with_description(kcdata_descriptor_t crashinfo, uint32_t data, const char * description); -kern_return_t kcdata_get_memory_addr(kcdata_descriptor_t data, uint32_t type, uint32_t size, mach_vm_address_t *user_addr); -kern_return_t 
kcdata_get_memory_addr_for_array(kcdata_descriptor_t data, uint32_t type_of_element, uint32_t size_of_element, uint32_t count, mach_vm_address_t *user_addr); -kern_return_t kcdata_add_container_marker(kcdata_descriptor_t data, uint32_t header_type, uint32_t container_type, uint64_t identifier); -kern_return_t kcdata_add_type_definition(kcdata_descriptor_t data, uint32_t type_id, char *type_name, struct kcdata_subtype_descriptor *elements_array_addr, uint32_t elements_count); +kern_return_t kcdata_undo_add_container_begin(kcdata_descriptor_t data); +kern_return_t kcdata_write_buffer_end(kcdata_descriptor_t data); -kern_return_t kcdata_add_uint64_with_description(kcdata_descriptor_t crashinfo, uint64_t data, const char *description); -kern_return_t kcdata_add_uint32_with_description(kcdata_descriptor_t crashinfo, uint32_t data, const char *description); +#else /* XNU_KERNEL_PRIVATE */ + +typedef void * kcdata_descriptor_t; #endif /* XNU_KERNEL_PRIVATE */ +uint32_t kcdata_estimate_required_buffer_size(uint32_t num_items, uint32_t payload_size); +uint64_t kcdata_memory_get_used_bytes(kcdata_descriptor_t kcd); +kern_return_t kcdata_memcpy(kcdata_descriptor_t data, mach_vm_address_t dst_addr, void * src_addr, uint32_t size); +kern_return_t kcdata_get_memory_addr(kcdata_descriptor_t data, uint32_t type, uint32_t size, mach_vm_address_t * user_addr); +kern_return_t kcdata_get_memory_addr_for_array( + kcdata_descriptor_t data, uint32_t type_of_element, uint32_t size_of_element, uint32_t count, mach_vm_address_t * user_addr); + +#endif /* KERNEL */ #endif /* _KERN_CDATA_H_ */ diff --git a/osfmk/kern/kern_stackshot.c b/osfmk/kern/kern_stackshot.c index fd20ff2a9..41272304c 100644 --- a/osfmk/kern/kern_stackshot.c +++ b/osfmk/kern/kern_stackshot.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #ifdef IMPORTANCE_INHERITANCE @@ -48,8 +49,10 @@ #include #include +#include #include #include +#include #include #include #include @@ -63,16 +66,12 @@ extern 
unsigned int not_in_kdp; -/* - * TODO: Even hackier than the other pieces. This should really - * be moved off of kdp_pmap, and we should probably separate - * machine_trace_thread out of the kdp code. - */ -extern pmap_t kdp_pmap; + extern addr64_t kdp_vtophys(pmap_t pmap, addr64_t va); +extern void * proc_get_uthread_uu_threadlist(void * uthread_v); -int kdp_snapshot = 0; -static int stack_snapshot_ret = 0; +int kdp_snapshot = 0; +static kern_return_t stack_snapshot_ret = 0; static uint32_t stack_snapshot_bytes_traced = 0; static kcdata_descriptor_t stackshot_kcdata_p = NULL; @@ -80,70 +79,92 @@ static void *stack_snapshot_buf; static uint32_t stack_snapshot_bufsize; int stack_snapshot_pid; static uint32_t stack_snapshot_flags; -static unsigned int old_debugger; -static boolean_t stack_enable_faulting; +static uint64_t stack_snapshot_delta_since_timestamp; +static boolean_t panic_stackshot; + +static boolean_t stack_enable_faulting = FALSE; +static struct stackshot_fault_stats fault_stats; -void *kernel_stackshot_buf = NULL; /* Pointer to buffer for stackshots triggered from the kernel and retrieved later */ -int kernel_stackshot_buf_size = 0; +static uint64_t * stackshot_duration_outer; +static uint64_t stackshot_microsecs; -void *stackshot_snapbuf = NULL; /* Used by stack_snapshot2 (to be removed) */ +void * kernel_stackshot_buf = NULL; /* Pointer to buffer for stackshots triggered from the kernel and retrieved later */ +int kernel_stackshot_buf_size = 0; -__private_extern__ void stackshot_lock_init( void ); +void * stackshot_snapbuf = NULL; /* Used by stack_snapshot2 (to be removed) */ + +__private_extern__ void stackshot_init( void ); static boolean_t memory_iszero(void *addr, size_t size); -kern_return_t stack_snapshot2(int pid, user_addr_t tracebuf, uint32_t tracebuf_size, uint32_t flags, int32_t *retval); -kern_return_t stack_snapshot_from_kernel_internal(int pid, void *buf, uint32_t size, uint32_t flags, unsigned *bytes_traced); #if CONFIG_TELEMETRY 
kern_return_t stack_microstackshot(user_addr_t tracebuf, uint32_t tracebuf_size, uint32_t flags, int32_t *retval); #endif uint32_t get_stackshot_estsize(uint32_t prev_size_hint); kern_return_t kern_stack_snapshot_internal(int stackshot_config_version, void *stackshot_config, size_t stackshot_config_size, boolean_t stackshot_from_user); -void do_stackshot(void); -void kdp_snapshot_preflight(int pid, void * tracebuf, uint32_t tracebuf_size, uint32_t flags, kcdata_descriptor_t data_p, boolean_t enable_faulting); -void kdp_snapshot_postflight(void); -static int kdp_stackshot(int pid, void *tracebuf, uint32_t tracebuf_size, uint32_t flags, uint32_t *pbytesTraced); +kern_return_t do_stackshot(void *); +void kdp_snapshot_preflight(int pid, void * tracebuf, uint32_t tracebuf_size, uint32_t flags, kcdata_descriptor_t data_p, uint64_t since_timestamp); +boolean_t stackshot_thread_is_idle_worker_unsafe(thread_t thread); static int kdp_stackshot_kcdata_format(int pid, uint32_t trace_flags, uint32_t *pBytesTraced); -int kdp_stack_snapshot_geterror(void); +kern_return_t kdp_stack_snapshot_geterror(void); uint32_t kdp_stack_snapshot_bytes_traced(void); -int kdp_stackshot(int pid, void *tracebuf, uint32_t tracebuf_size, uint32_t trace_flags, uint32_t *pbytesTraced); static int pid_from_task(task_t task); -static uint64_t proc_uniqueid_from_task(task_t task); static void kdp_mem_and_io_snapshot(struct mem_and_io_snapshot *memio_snap); -static boolean_t kdp_copyin(pmap_t p, uint64_t uaddr, void *dest, size_t size); +static boolean_t kdp_copyin(vm_map_t map, uint64_t uaddr, void *dest, size_t size, boolean_t try_fault, uint32_t *kdp_fault_result); +static boolean_t kdp_copyin_word(task_t task, uint64_t addr, uint64_t *result, boolean_t try_fault, uint32_t *kdp_fault_results); static uint64_t proc_was_throttled_from_task(task_t task); +extern uint32_t workqueue_get_pwq_state_kdp(void *proc); + extern int proc_pid(void *p); extern uint64_t proc_uniqueid(void *p); extern uint64_t 
proc_was_throttled(void *p); extern uint64_t proc_did_throttle(void *p); -static uint64_t proc_did_throttle_from_task(task_t task); -extern void proc_name_kdp(task_t task, char *buf, int size); -extern int proc_threadname_kdp(void *uth, char *buf, size_t size); -extern void proc_starttime_kdp(void *p, uint64_t *tv_sec, uint64_t *tv_usec); -extern uint64_t get_dispatchqueue_serialno_offset_from_proc(void *p); -static uint64_t proc_dispatchqueue_serialno_offset_from_task(task_t task); +static uint64_t proc_did_throttle_from_task(task_t task); +extern void proc_name_kdp(task_t task, char * buf, int size); +extern int proc_threadname_kdp(void * uth, char * buf, size_t size); +extern void proc_starttime_kdp(void * p, uint64_t * tv_sec, uint64_t * tv_usec, uint64_t * abstime); extern int memorystatus_get_pressure_status_kdp(void); +extern boolean_t memorystatus_proc_is_dirty_unsafe(void * v); + +extern int count_busy_buffers(void); /* must track with declaration in bsd/sys/buf_internal.h */ +extern void bcopy_phys(addr64_t, addr64_t, vm_size_t); + +#if CONFIG_TELEMETRY +extern kern_return_t stack_microstackshot(user_addr_t tracebuf, uint32_t tracebuf_size, uint32_t flags, int32_t *retval); +#endif /* CONFIG_TELEMETRY */ -extern int count_busy_buffers(void); /* must track with declaration in bsd/sys/buf_internal.h */ -extern void bcopy_phys(addr64_t, addr64_t, vm_size_t); -extern int machine_trace_thread(thread_t thread, char *tracepos, char *tracebound, int nframes, boolean_t user_p, uint32_t *thread_trace_flags); -extern int machine_trace_thread64(thread_t thread, char *tracepos, char *tracebound, int nframes, boolean_t user_p, uint32_t *thread_trace_flags); +extern kern_return_t kern_stack_snapshot_with_reason(char* reason); +extern kern_return_t kern_stack_snapshot_internal(int stackshot_config_version, void *stackshot_config, size_t stackshot_config_size, boolean_t stackshot_from_user); -/* Validates that the given address is both a valid page and has - * default 
caching attributes for the current kdp_pmap. Returns +/* + * Validates that the given address is both a valid page and has + * default caching attributes for the current map. Returns * 0 if the address is invalid, and a kernel virtual address for * the given address if it is valid. */ vm_offset_t machine_trace_thread_get_kva(vm_offset_t cur_target_addr, vm_map_t map, uint32_t *thread_trace_flags); +#define KDP_FAULT_RESULT_PAGED_OUT 0x1 /* some data was unable to be retrieved */ +#define KDP_FAULT_RESULT_TRIED_FAULT 0x2 /* tried to fault in data */ +#define KDP_FAULT_RESULT_FAULTED_IN 0x4 /* successfully faulted in data */ + +/* + * Looks up the physical translation for the given address in the target map, attempting + * to fault data in if requested and it is not resident. Populates kdp_fault_results if requested + * as well. + */ +vm_offset_t kdp_find_phys(vm_map_t map, vm_offset_t target_addr, boolean_t try_fault, uint32_t *kdp_fault_results); + +static size_t stackshot_strlcpy(char *dst, const char *src, size_t maxlen); +static void stackshot_memcpy(void *dst, const void *src, size_t len); + /* Clears caching information used by the above validation routine - * (in case the kdp_pmap has been changed or cleared). + * (in case the current map has been changed or cleared).
*/ void machine_trace_thread_clear_validation_cache(void); #define MAX_FRAMES 1000 #define MAX_LOADINFOS 500 -#define USECSPERSEC 1000000 #define TASK_IMP_WALK_LIMIT 20 typedef struct thread_snapshot *thread_snapshot_t; @@ -169,17 +190,30 @@ static lck_attr_t *stackshot_subsys_lck_attr; static lck_mtx_t stackshot_subsys_mutex; #define STACKSHOT_SUBSYS_LOCK() lck_mtx_lock(&stackshot_subsys_mutex) +#define STACKSHOT_SUBSYS_TRY_LOCK() lck_mtx_try_lock(&stackshot_subsys_mutex) #define STACKSHOT_SUBSYS_UNLOCK() lck_mtx_unlock(&stackshot_subsys_mutex) -#if defined(__i386__) || defined (__x86_64__) -#define TRAP_DEBUGGER __asm__ volatile("int3") -#else -#error No TRAP_DEBUGGER definition for this architecture -#endif -/* Initialize the mutex governing access to the stack snapshot subsystem */ +#define SANE_BOOTPROFILE_TRACEBUF_SIZE (64 * 1024 * 1024) +#define SANE_TRACEBUF_SIZE (8 * 1024 * 1024) + +/* + * We currently set a ceiling of 3 milliseconds spent in the kdp fault path + * for non-panic stackshots where faulting is requested. + */ +#define KDP_FAULT_PATH_MAX_TIME_PER_STACKSHOT_NSECS (3 * NSEC_PER_MSEC) + +#define STACKSHOT_SUPP_SIZE (16 * 1024) /* Minimum stackshot size */ +#define TASK_UUID_AVG_SIZE (16 * sizeof(uuid_t)) /* Average space consumed by UUIDs/task */ + +/* + * Initialize the mutex governing access to the stack snapshot subsystem + * and other stackshot related bits. 
+ */ __private_extern__ void -stackshot_lock_init( void ) +stackshot_init( void ) { + mach_timebase_info_data_t timebase; + stackshot_subsys_lck_grp_attr = lck_grp_attr_alloc_init(); stackshot_subsys_lck_grp = lck_grp_alloc_init("stackshot_subsys_lock", stackshot_subsys_lck_grp_attr); @@ -187,13 +221,10 @@ stackshot_lock_init( void ) stackshot_subsys_lck_attr = lck_attr_alloc_init(); lck_mtx_init(&stackshot_subsys_mutex, stackshot_subsys_lck_grp, stackshot_subsys_lck_attr); -} - -#define SANE_BOOTPROFILE_TRACEBUF_SIZE (64 * 1024 * 1024) -#define SANE_TRACEBUF_SIZE (8 * 1024 * 1024) -#define STACKSHOT_SUPP_SIZE (16 * 1024) /* Minimum stackshot size */ -#define TASK_UUID_AVG_SIZE (16 * sizeof(uuid_t)) /* Average space consumed by UUIDs/task */ + clock_timebase_info(&timebase); + fault_stats.sfs_system_max_fault_time = ((KDP_FAULT_PATH_MAX_TIME_PER_STACKSHOT_NSECS * timebase.denom)/ timebase.numer); +} /* * Method for grabbing timer values safely, in the sense that no infinite loop will occur @@ -216,158 +247,10 @@ static uint64_t safe_grab_timer_value(struct timer *t) #endif } -/* - * Old, inefficient stackshot call. This will be removed in the next release and is being replaced with - * two syscalls -- stack_snapshot_with_config and stack_microsnapshot. 
- */ -kern_return_t -stack_snapshot2(int pid, user_addr_t tracebuf, uint32_t tracebuf_size, uint32_t flags, int32_t *retval) -{ - boolean_t istate; - int error = KERN_SUCCESS; - unsigned bytesTraced = 0; - -#if CONFIG_TELEMETRY - if (flags & STACKSHOT_GLOBAL_MICROSTACKSHOT_ENABLE) { - telemetry_global_ctl(1); - *retval = 0; - return (0); - } else if (flags & STACKSHOT_GLOBAL_MICROSTACKSHOT_DISABLE) { - telemetry_global_ctl(0); - *retval = 0; - return (0); - } - - if (flags & STACKSHOT_WINDOWED_MICROSTACKSHOTS_ENABLE) { - error = telemetry_enable_window(); - - if (error != KERN_SUCCESS) { - /* We are probably out of memory */ - *retval = -1; - return KERN_RESOURCE_SHORTAGE; - } - - *retval = 0; - return (0); - } else if (flags & STACKSHOT_WINDOWED_MICROSTACKSHOTS_DISABLE) { - telemetry_disable_window(); - *retval = 0; - return (0); - } -#endif - - *retval = -1; - /* Serialize tracing */ - STACKSHOT_SUBSYS_LOCK(); - - if (tracebuf_size <= 0) { - error = KERN_INVALID_ARGUMENT; - goto error_exit; - } - -#if CONFIG_TELEMETRY - if (flags & STACKSHOT_GET_MICROSTACKSHOT) { - - if (tracebuf_size > SANE_TRACEBUF_SIZE) { - error = KERN_INVALID_ARGUMENT; - goto error_exit; - } - - bytesTraced = tracebuf_size; - error = telemetry_gather(tracebuf, &bytesTraced, - (flags & STACKSHOT_SET_MICROSTACKSHOT_MARK) ? 
TRUE : FALSE); - *retval = (int)bytesTraced; - goto error_exit; - } - - if (flags & STACKSHOT_GET_WINDOWED_MICROSTACKSHOTS) { - - if (tracebuf_size > SANE_TRACEBUF_SIZE) { - error = KERN_INVALID_ARGUMENT; - goto error_exit; - } - - bytesTraced = tracebuf_size; - error = telemetry_gather_windowed(tracebuf, &bytesTraced); - *retval = (int)bytesTraced; - goto error_exit; - } - - if (flags & STACKSHOT_GET_BOOT_PROFILE) { - - if (tracebuf_size > SANE_BOOTPROFILE_TRACEBUF_SIZE) { - error = KERN_INVALID_ARGUMENT; - goto error_exit; - } - - bytesTraced = tracebuf_size; - error = bootprofile_gather(tracebuf, &bytesTraced); - *retval = (int)bytesTraced; - goto error_exit; - } -#endif - - if (tracebuf_size > SANE_TRACEBUF_SIZE) { - error = KERN_INVALID_ARGUMENT; - goto error_exit; - } - - assert(stackshot_snapbuf == NULL); - if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&stackshot_snapbuf, tracebuf_size, VM_KERN_MEMORY_DIAG) != KERN_SUCCESS) { - error = KERN_RESOURCE_SHORTAGE; - goto error_exit; - } - - if (panic_active()) { - error = KERN_RESOURCE_SHORTAGE; - goto error_exit; - } - - istate = ml_set_interrupts_enabled(FALSE); - /* Preload trace parameters */ - kdp_snapshot_preflight(pid, stackshot_snapbuf, tracebuf_size, flags, NULL, FALSE); - - /* Trap to the debugger to obtain a coherent stack snapshot; this populates - * the trace buffer - */ - - TRAP_DEBUGGER; - - ml_set_interrupts_enabled(istate); - - bytesTraced = kdp_stack_snapshot_bytes_traced(); - - if (bytesTraced > 0) { - if ((error = copyout(stackshot_snapbuf, tracebuf, - ((bytesTraced < tracebuf_size) ? 
- bytesTraced : tracebuf_size)))) - goto error_exit; - *retval = bytesTraced; - } - else { - error = KERN_NOT_IN_SET; - goto error_exit; - } - - error = kdp_stack_snapshot_geterror(); - if (error == -1) { - error = KERN_NO_SPACE; - *retval = -1; - goto error_exit; - } - -error_exit: - if (stackshot_snapbuf != NULL) - kmem_free(kernel_map, (vm_offset_t) stackshot_snapbuf, tracebuf_size); - stackshot_snapbuf = NULL; - STACKSHOT_SUBSYS_UNLOCK(); - return error; -} - kern_return_t -stack_snapshot_from_kernel_internal(int pid, void *buf, uint32_t size, uint32_t flags, unsigned *bytes_traced) +stack_snapshot_from_kernel(int pid, void *buf, uint32_t size, uint32_t flags, uint64_t delta_since_timestamp, unsigned *bytes_traced) { - int error = 0; + kern_return_t error = KERN_SUCCESS; boolean_t istate; if ((buf == NULL) || (size <= 0) || (bytes_traced == NULL)) { @@ -380,17 +263,33 @@ stack_snapshot_from_kernel_internal(int pid, void *buf, uint32_t size, uint32_t } /* Serialize tracing */ - STACKSHOT_SUBSYS_LOCK(); + if (flags & STACKSHOT_TRYLOCK) { + if (!STACKSHOT_SUBSYS_TRY_LOCK()) { + return KERN_LOCK_OWNED; + } + } else { + STACKSHOT_SUBSYS_LOCK(); + } + istate = ml_set_interrupts_enabled(FALSE); + struct kcdata_descriptor kcdata; + uint32_t hdr_tag = (flags & STACKSHOT_COLLECT_DELTA_SNAPSHOT) ? 
+ KCDATA_BUFFER_BEGIN_DELTA_STACKSHOT : KCDATA_BUFFER_BEGIN_STACKSHOT; + + error = kcdata_memory_static_init(&kcdata, (mach_vm_address_t)buf, hdr_tag, size, + KCFLAG_USE_MEMCOPY | KCFLAG_NO_AUTO_ENDBUFFER); + if (error) { + goto out; + } /* Preload trace parameters*/ - kdp_snapshot_preflight(pid, buf, size, flags, NULL, FALSE); + kdp_snapshot_preflight(pid, buf, size, flags, &kcdata, delta_since_timestamp); /* Trap to the debugger to obtain a coherent stack snapshot; this populates * the trace buffer */ - TRAP_DEBUGGER; + stack_snapshot_ret = DebuggerWithCallback(do_stackshot, NULL, FALSE); ml_set_interrupts_enabled(istate); @@ -398,8 +297,8 @@ stack_snapshot_from_kernel_internal(int pid, void *buf, uint32_t size, uint32_t error = kdp_stack_snapshot_geterror(); +out: STACKSHOT_SUBSYS_UNLOCK(); - return error; } @@ -425,26 +324,6 @@ stack_microstackshot(user_addr_t tracebuf, uint32_t tracebuf_size, uint32_t flag goto exit; } - if (flags & STACKSHOT_WINDOWED_MICROSTACKSHOTS_ENABLE) { - error = telemetry_enable_window(); - - if (error != KERN_SUCCESS) { - /* - * We are probably out of memory - */ - *retval = -1; - error = KERN_RESOURCE_SHORTAGE; - goto exit; - } - - *retval = 0; - goto exit; - } else if (flags & STACKSHOT_WINDOWED_MICROSTACKSHOTS_DISABLE) { - telemetry_disable_window(); - *retval = 0; - goto exit; - } - /* * Data related operations */ @@ -470,19 +349,6 @@ stack_microstackshot(user_addr_t tracebuf, uint32_t tracebuf_size, uint32_t flag goto unlock_exit; } - if (flags & STACKSHOT_GET_WINDOWED_MICROSTACKSHOTS) { - - if (tracebuf_size > SANE_TRACEBUF_SIZE) { - error = KERN_INVALID_ARGUMENT; - goto unlock_exit; - } - - bytes_traced = tracebuf_size; - error = telemetry_gather_windowed(tracebuf, &bytes_traced); - *retval = (int)bytes_traced; - goto unlock_exit; - } - if (flags & STACKSHOT_GET_BOOT_PROFILE) { if (tracebuf_size > SANE_BOOTPROFILE_TRACEBUF_SIZE) { @@ -588,7 +454,6 @@ kern_stack_snapshot_internal(int stackshot_config_version, void 
*stackshot_confi int pid = -1; uint32_t flags; uint64_t since_timestamp; - boolean_t enable_faulting = FALSE; uint32_t size_hint = 0; if(stackshot_config == NULL) { @@ -605,29 +470,33 @@ kern_stack_snapshot_internal(int stackshot_config_version, void *stackshot_confi out_size_addr = config->sc_out_size_addr; pid = config->sc_pid; flags = config->sc_flags; - since_timestamp = config->sc_since_timestamp; + since_timestamp = config->sc_delta_timestamp; if (config->sc_size <= SANE_TRACEBUF_SIZE) { size_hint = config->sc_size; } break; default: return KERN_NOT_SUPPORTED; - } - - /* - * Currently saving a kernel buffer is only supported from the internal/KEXT API. - */ - if (stackshot_from_user) { - if (flags & STACKSHOT_SAVE_IN_KERNEL_BUFFER) { - return KERN_NO_ACCESS; - } - } else { + } + + /* + * Currently saving a kernel buffer and trylock are only supported from the + * internal/KEXT API. + */ + if (stackshot_from_user) { + if (flags & (STACKSHOT_TRYLOCK | STACKSHOT_SAVE_IN_KERNEL_BUFFER | STACKSHOT_FROM_PANIC)) { + return KERN_NO_ACCESS; + } + } else { if (!(flags & STACKSHOT_SAVE_IN_KERNEL_BUFFER)) { return KERN_NOT_SUPPORTED; } } - if (flags & STACKSHOT_ENABLE_FAULTING) { + /* + * We only support the KDP fault path and delta snapshots and tailspin mode with the kcdata format + */ + if (!(flags & STACKSHOT_KCDATA_FORMAT)) { return KERN_NOT_SUPPORTED; } @@ -638,8 +507,8 @@ kern_stack_snapshot_internal(int stackshot_config_version, void *stackshot_confi return KERN_INVALID_ARGUMENT; } - if (since_timestamp != 0) { - return KERN_NOT_SUPPORTED; + if (since_timestamp != 0 && ((flags & STACKSHOT_COLLECT_DELTA_SNAPSHOT) == 0)) { + return KERN_INVALID_ARGUMENT; } STACKSHOT_SUBSYS_LOCK(); @@ -674,6 +543,20 @@ kern_stack_snapshot_internal(int stackshot_config_version, void *stackshot_confi goto error_exit; } + if (flags & STACKSHOT_GET_BOOT_PROFILE) { + void *bootprofile = NULL; + uint32_t len = 0; +#if CONFIG_TELEMETRY + bootprofile_get(&bootprofile, &len); +#endif + if 
(!bootprofile || !len) { + error = KERN_NOT_IN_SET; + goto error_exit; + } + error = stackshot_remap_buffer(bootprofile, len, out_buffer_addr, out_size_addr); + goto error_exit; + } + stackshotbuf_size = get_stackshot_estsize(size_hint); for (; stackshotbuf_size <= SANE_TRACEBUF_SIZE; stackshotbuf_size <<= 1) { @@ -690,10 +573,12 @@ kern_stack_snapshot_internal(int stackshot_config_version, void *stackshot_confi goto error_exit; } - if (flags & STACKSHOT_KCDATA_FORMAT) { - kcdata_p = kcdata_memory_alloc_init((mach_vm_address_t)stackshotbuf, KCDATA_BUFFER_BEGIN_STACKSHOT, stackshotbuf_size, KCFLAG_USE_MEMCOPY); - } + uint32_t hdr_tag = (flags & STACKSHOT_COLLECT_DELTA_SNAPSHOT) ? KCDATA_BUFFER_BEGIN_DELTA_STACKSHOT : KCDATA_BUFFER_BEGIN_STACKSHOT; + kcdata_p = kcdata_memory_alloc_init((mach_vm_address_t)stackshotbuf, hdr_tag, stackshotbuf_size, + KCFLAG_USE_MEMCOPY | KCFLAG_NO_AUTO_ENDBUFFER); + stackshot_duration_outer = NULL; + uint64_t time_start = mach_absolute_time(); /* * Disable interrupts and save the current interrupt state. @@ -703,20 +588,24 @@ kern_stack_snapshot_internal(int stackshot_config_version, void *stackshot_confi /* * Load stackshot parameters. */ - kdp_snapshot_preflight(pid, stackshotbuf, stackshotbuf_size, flags, kcdata_p, enable_faulting); + kdp_snapshot_preflight(pid, stackshotbuf, stackshotbuf_size, flags, kcdata_p, since_timestamp); /* * Trap to the debugger to obtain a stackshot (this will populate the buffer). */ - TRAP_DEBUGGER; + stack_snapshot_ret = DebuggerWithCallback(do_stackshot, NULL, FALSE); ml_set_interrupts_enabled(prev_interrupt_state); - /* - * If we didn't allocate a big enough buffer, deallocate and try again. 
- */ + /* record the duration that interrupts were disabled */ + + uint64_t time_end = mach_absolute_time(); + if (stackshot_duration_outer) { + *stackshot_duration_outer = time_end - time_start; + } + error = kdp_stack_snapshot_geterror(); - if (error == -1) { + if (error != KERN_SUCCESS) { if (kcdata_p != NULL) { kcdata_memory_destroy(kcdata_p); kcdata_p = NULL; @@ -724,13 +613,20 @@ kern_stack_snapshot_internal(int stackshot_config_version, void *stackshot_confi } kmem_free(kernel_map, (vm_offset_t)stackshotbuf, stackshotbuf_size); stackshotbuf = NULL; - continue; + if (error == KERN_INSUFFICIENT_BUFFER_SIZE) { + /* + * If we didn't allocate a big enough buffer, deallocate and try again. + */ + continue; + } else { + goto error_exit; + } } bytes_traced = kdp_stack_snapshot_bytes_traced(); if (bytes_traced <= 0) { - error = KERN_NOT_IN_SET; + error = KERN_ABORTED; goto error_exit; } @@ -787,41 +683,26 @@ kern_stack_snapshot_internal(int stackshot_config_version, void *stackshot_confi /* Cache stack snapshot parameters in preparation for a trace */ void kdp_snapshot_preflight(int pid, void * tracebuf, uint32_t tracebuf_size, uint32_t flags, - kcdata_descriptor_t data_p, boolean_t enable_faulting) + kcdata_descriptor_t data_p, uint64_t since_timestamp) { + uint64_t microsecs = 0, secs = 0; + clock_get_calendar_microtime((clock_sec_t *)&secs, (clock_usec_t *)&microsecs); + + stackshot_microsecs = microsecs + (secs * USEC_PER_SEC); stack_snapshot_pid = pid; stack_snapshot_buf = tracebuf; stack_snapshot_bufsize = tracebuf_size; stack_snapshot_flags = flags; - stack_enable_faulting = enable_faulting; + stack_snapshot_delta_since_timestamp = since_timestamp; + + panic_stackshot = ((flags & STACKSHOT_FROM_PANIC) != 0); + if (data_p != NULL) { stackshot_kcdata_p = data_p; } - kdp_snapshot++; - /* Mark this debugger as active, since the polled mode driver that - * ordinarily does this may not be enabled (yet), or since KDB may be - * the primary debugger.
- */ - old_debugger = current_debugger; - if (old_debugger != KDP_CUR_DB) { - current_debugger = KDP_CUR_DB; - } -} - -void -kdp_snapshot_postflight(void) -{ - kdp_snapshot--; -#if CONFIG_KDP_INTERACTIVE_DEBUGGING - if ( - (kdp_en_send_pkt == NULL) || (old_debugger == KDB_CUR_DB)) - current_debugger = old_debugger; -#else - current_debugger = old_debugger; -#endif } -int +kern_return_t kdp_stack_snapshot_geterror(void) { return stack_snapshot_ret; @@ -843,588 +724,858 @@ static boolean_t memory_iszero(void *addr, size_t size) return TRUE; } -static int -kdp_stackshot_kcdata_format(int pid, uint32_t trace_flags, uint32_t *pBytesTraced) -{ - /* convenience macros specific only for this function */ #define kcd_end_address(kcd) ((void *)((uint64_t)((kcd)->kcd_addr_begin) + kcdata_memory_get_used_bytes((kcd)))) #define kcd_max_address(kcd) ((void *)((kcd)->kcd_addr_begin + (kcd)->kcd_length)) -#define kcd_exit_on_error(action) \ - do { \ - if (KERN_SUCCESS != (error = (action))) { \ - if (error == KERN_RESOURCE_SHORTAGE) { \ - error = -1; \ - } \ - goto error_exit; \ - } \ +/* + * Use of the kcd_exit_on_error(action) macro requires a local + * 'kern_return_t error' variable and 'error_exit' label. 
+ */ +#define kcd_exit_on_error(action) \ + do { \ + if (KERN_SUCCESS != (error = (action))) { \ + if (error == KERN_RESOURCE_SHORTAGE) { \ + error = KERN_INSUFFICIENT_BUFFER_SIZE; \ + } \ + goto error_exit; \ + } \ } while (0); /* end kcd_exit_on_error */ - int error = 0; +static uint64_t +kcdata_get_task_ss_flags(task_t task) +{ + uint64_t ss_flags = 0; + boolean_t task64 = task_has_64BitAddr(task); + + if (task64) + ss_flags |= kUser64_p; + if (!task->active || task_is_a_corpse(task)) + ss_flags |= kTerminatedSnapshot; + if (task->pidsuspended) + ss_flags |= kPidSuspended; + if (task->frozen) + ss_flags |= kFrozen; + if (task->effective_policy.tep_darwinbg == 1) + ss_flags |= kTaskDarwinBG; + if (task->requested_policy.trp_role == TASK_FOREGROUND_APPLICATION) + ss_flags |= kTaskIsForeground; + if (task->requested_policy.trp_boosted == 1) + ss_flags |= kTaskIsBoosted; + if (task->effective_policy.tep_sup_active == 1) + ss_flags |= kTaskIsSuppressed; +#if CONFIG_MEMORYSTATUS + if (memorystatus_proc_is_dirty_unsafe(task->bsd_info)) + ss_flags |= kTaskIsDirty; +#endif + + ss_flags |= (0x7 & workqueue_get_pwq_state_kdp(task->bsd_info)) << 17; + +#if IMPORTANCE_INHERITANCE + if (task->task_imp_base) { + if (task->task_imp_base->iit_donor) + ss_flags |= kTaskIsImpDonor; + if (task->task_imp_base->iit_live_donor) + ss_flags |= kTaskIsLiveImpDonor; + } +#endif + + return ss_flags; +} + +static kern_return_t +kcdata_record_shared_cache_info(kcdata_descriptor_t kcd, task_t task, struct dyld_uuid_info_64_v2 *sys_shared_cache_loadinfo, uint32_t trace_flags, uint64_t *task_snap_ss_flags) +{ + kern_return_t error = KERN_SUCCESS; mach_vm_address_t out_addr = 0; - uint64_t abs_time; - struct task_snapshot_v2 *cur_tsnap; - uint64_t system_state_flags = 0; - int saved_count = 0; - task_t task = TASK_NULL; - thread_t thread = THREAD_NULL; - mach_timebase_info_data_t timebase = {0, 0}; - uint64_t microsecs = 0, secs = 0; - uint32_t length_to_copy, tmp32; - abs_time = 
mach_absolute_time(); - clock_get_calendar_microtime((clock_sec_t*)&secs, (clock_usec_t*)&microsecs); + uint8_t shared_cache_identifier[16]; + uint64_t shared_cache_slide = 0; + uint64_t shared_cache_base_address = 0; + int task_pid = pid_from_task(task); + boolean_t should_fault = (trace_flags & STACKSHOT_ENABLE_UUID_FAULTING); + uint32_t kdp_fault_results = 0; - /* process the flags */ - boolean_t dispatch_p = ((trace_flags & STACKSHOT_GET_DQ) != 0); - boolean_t save_loadinfo_p = ((trace_flags & STACKSHOT_SAVE_LOADINFO) != 0); - boolean_t save_kextloadinfo_p = ((trace_flags & STACKSHOT_SAVE_KEXT_LOADINFO) != 0); - boolean_t save_userframes_p = ((trace_flags & STACKSHOT_SAVE_KERNEL_FRAMES_ONLY) == 0); - boolean_t save_donating_pids_p = ((trace_flags & STACKSHOT_SAVE_IMP_DONATION_PIDS) != 0); + assert(task_snap_ss_flags != NULL); - if (sizeof(void *) == 8) - system_state_flags |= kKernel64_p; + if (task->shared_region && ml_validate_nofault((vm_offset_t)task->shared_region, sizeof(struct vm_shared_region))) { + struct vm_shared_region *sr = task->shared_region; + shared_cache_base_address = sr->sr_base_address + sr->sr_first_mapping; + } else { + *task_snap_ss_flags |= kTaskSharedRegionInfoUnavailable; + } - if (stackshot_kcdata_p == NULL || pBytesTraced == NULL) { - error = -1; + if (!shared_cache_base_address || + !kdp_copyin(task->map, shared_cache_base_address + offsetof(struct _dyld_cache_header, uuid), + shared_cache_identifier, sizeof(shared_cache_identifier), should_fault, &kdp_fault_results)) { goto error_exit; } - /* begin saving data into the buffer */ - *pBytesTraced = 0; - kcd_exit_on_error(kcdata_add_uint32_with_description(stackshot_kcdata_p, trace_flags, "stackshot_in_flags")); - kcd_exit_on_error(kcdata_add_uint32_with_description(stackshot_kcdata_p, (uint32_t)pid, "stackshot_in_pid")); - kcd_exit_on_error(kcdata_add_uint64_with_description(stackshot_kcdata_p, system_state_flags, "system_state_flags")); - tmp32 = PAGE_SIZE; - 
kcd_exit_on_error(kcdata_get_memory_addr(stackshot_kcdata_p, STACKSHOT_KCTYPE_KERN_PAGE_SIZE, sizeof(uint32_t), &out_addr)); - memcpy((void *)out_addr, &tmp32, sizeof(tmp32)); + if (task->shared_region) { + /* + * No refcounting here, but we are in debugger + * context, so that should be safe. + */ + shared_cache_slide = task->shared_region->sr_slide_info.slide; + } else { + shared_cache_slide = 0; + } -#if CONFIG_JETSAM - tmp32 = memorystatus_get_pressure_status_kdp(); - kcd_exit_on_error(kcdata_get_memory_addr(stackshot_kcdata_p, STACKSHOT_KCTYPE_JETSAM_LEVEL, sizeof(uint32_t), &out_addr)); - memcpy((void *)out_addr, &tmp32, sizeof(tmp32)); -#endif + if (sys_shared_cache_loadinfo) { + if (task_pid == 1) { + /* save launchd's shared cache info as system level */ + stackshot_memcpy(sys_shared_cache_loadinfo->imageUUID, shared_cache_identifier, sizeof(sys_shared_cache_loadinfo->imageUUID)); + sys_shared_cache_loadinfo->imageLoadAddress = shared_cache_slide; + sys_shared_cache_loadinfo->imageSlidBaseAddress = shared_cache_slide + task->shared_region->sr_base_address; - /* save boot-args and osversion string */ - length_to_copy = MIN((uint32_t)(strlen(version) + 1), OSVERSIZE); - kcd_exit_on_error(kcdata_get_memory_addr(stackshot_kcdata_p, STACKSHOT_KCTYPE_OSVERSION, length_to_copy, &out_addr)); - strlcpy((char*)out_addr, &version[0], length_to_copy); + goto error_exit; + } else { + if (shared_cache_slide == sys_shared_cache_loadinfo->imageLoadAddress && + 0 == memcmp(shared_cache_identifier, sys_shared_cache_loadinfo->imageUUID, + sizeof(sys_shared_cache_loadinfo->imageUUID))) { + /* skip adding shared cache info. 
its same as system level one */ + goto error_exit; + } + } + } - length_to_copy = MIN((uint32_t)(strlen(PE_boot_args()) + 1), OSVERSIZE); - kcd_exit_on_error(kcdata_get_memory_addr(stackshot_kcdata_p, STACKSHOT_KCTYPE_BOOTARGS, length_to_copy, &out_addr)); - strlcpy((char*)out_addr, PE_boot_args(), length_to_copy); + kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO, sizeof(struct dyld_uuid_info_64_v2), &out_addr)); + struct dyld_uuid_info_64_v2 *shared_cache_data = (struct dyld_uuid_info_64_v2 *)out_addr; + shared_cache_data->imageLoadAddress = shared_cache_slide; + stackshot_memcpy(shared_cache_data->imageUUID, shared_cache_identifier, sizeof(shared_cache_data->imageUUID)); + shared_cache_data->imageSlidBaseAddress = shared_cache_base_address; - /* setup mach_absolute_time and timebase info */ - clock_timebase_info(&timebase); - kcd_exit_on_error(kcdata_get_memory_addr(stackshot_kcdata_p, KCDATA_TYPE_TIMEBASE, sizeof(timebase), &out_addr)); - memcpy((void *)out_addr, &timebase, sizeof(timebase)); +error_exit: + if (kdp_fault_results & KDP_FAULT_RESULT_PAGED_OUT) { + *task_snap_ss_flags |= kTaskUUIDInfoMissing; + } - kcd_exit_on_error(kcdata_get_memory_addr(stackshot_kcdata_p, KCDATA_TYPE_MACH_ABSOLUTE_TIME, sizeof(uint64_t), &out_addr)); - memcpy((void *)out_addr, &abs_time, sizeof(uint64_t)); + if (kdp_fault_results & KDP_FAULT_RESULT_TRIED_FAULT) { + *task_snap_ss_flags |= kTaskUUIDInfoTriedFault; + } - microsecs = microsecs + (secs * USECSPERSEC); - kcd_exit_on_error(kcdata_get_memory_addr(stackshot_kcdata_p, KCDATA_TYPE_USECS_SINCE_EPOCH, sizeof(uint64_t), &out_addr)); - memcpy((void *)out_addr, &microsecs, sizeof(uint64_t)); + if (kdp_fault_results & KDP_FAULT_RESULT_FAULTED_IN) { + *task_snap_ss_flags |= kTaskUUIDInfoFaultedIn; + } - /* reserve space of system level shared cache load info */ - struct dyld_uuid_info_64 *sys_shared_cache_loadinfo; - kcd_exit_on_error(kcdata_get_memory_addr(stackshot_kcdata_p, 
STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO, sizeof(kernel_uuid_info), &out_addr)); - sys_shared_cache_loadinfo = (struct dyld_uuid_info_64 *)out_addr; - bzero((void *)sys_shared_cache_loadinfo, sizeof(struct dyld_uuid_info_64)); + return error; +} - /* Add requested information first */ - if (trace_flags & STACKSHOT_GET_GLOBAL_MEM_STATS) { - kcd_exit_on_error(kcdata_get_memory_addr(stackshot_kcdata_p, STACKSHOT_KCTYPE_GLOBAL_MEM_STATS, sizeof(struct mem_and_io_snapshot), &out_addr)); - kdp_mem_and_io_snapshot((struct mem_and_io_snapshot *)out_addr); - } +static kern_return_t +kcdata_record_uuid_info(kcdata_descriptor_t kcd, task_t task, uint32_t trace_flags, boolean_t have_pmap, uint64_t *task_snap_ss_flags) +{ + boolean_t save_loadinfo_p = ((trace_flags & STACKSHOT_SAVE_LOADINFO) != 0); + boolean_t save_kextloadinfo_p = ((trace_flags & STACKSHOT_SAVE_KEXT_LOADINFO) != 0); + boolean_t collect_delta_stackshot = ((trace_flags & STACKSHOT_COLLECT_DELTA_SNAPSHOT) != 0); + boolean_t minimize_uuids = collect_delta_stackshot && ((trace_flags & STACKSHOT_TAILSPIN) != 0); + boolean_t should_fault = (trace_flags & STACKSHOT_ENABLE_UUID_FAULTING); - /* Iterate over tasks */ - queue_head_t *task_list = &tasks; - queue_iterate(task_list, task, task_t, tasks) { - int task_pid; - if ((task == NULL) || !ml_validate_nofault((vm_offset_t) task, sizeof(struct task))) - goto error_exit; + kern_return_t error = KERN_SUCCESS; + mach_vm_address_t out_addr = 0; - task_pid = pid_from_task(task); - if (!task->active) { - /* - * Not interested in terminated tasks without threads, and - * at the moment, stackshot can't handle a task without a name. 
- */ - if (queue_empty(&task->threads) || task_pid == -1) { - continue; + uint32_t uuid_info_count = 0; + mach_vm_address_t uuid_info_addr = 0; + uint64_t uuid_info_timestamp = 0; + uint32_t kdp_fault_results = 0; + + assert(task_snap_ss_flags != NULL); + + int task_pid = pid_from_task(task); + boolean_t task64 = task_has_64BitAddr(task); + + if (save_loadinfo_p && have_pmap && task->active && task_pid > 0) { + /* Read the dyld_all_image_infos struct from the task memory to get UUID array count and location */ + if (task64) { + struct user64_dyld_all_image_infos task_image_infos; + if (kdp_copyin(task->map, task->all_image_info_addr, &task_image_infos, + sizeof(struct user64_dyld_all_image_infos), should_fault, &kdp_fault_results)) { + uuid_info_count = (uint32_t)task_image_infos.uuidArrayCount; + uuid_info_addr = task_image_infos.uuidArray; + if (task_image_infos.version >= 15) { + uuid_info_timestamp = task_image_infos.timestamp; + } + } + } else { + struct user32_dyld_all_image_infos task_image_infos; + if (kdp_copyin(task->map, task->all_image_info_addr, &task_image_infos, + sizeof(struct user32_dyld_all_image_infos), should_fault, &kdp_fault_results)) { + uuid_info_count = task_image_infos.uuidArrayCount; + uuid_info_addr = task_image_infos.uuidArray; + if (task_image_infos.version >= 15) { + uuid_info_timestamp = task_image_infos.timestamp; + } } } - /* Trace everything, unless a process was specified */ - if ((pid == -1) || (pid == task_pid)) { + /* + * If we get a NULL uuid_info_addr (which can happen when we catch dyld in the middle of updating + * this data structure), we zero the uuid_info_count so that we won't even try to save load info + * for this task. 
+ */ + if (!uuid_info_addr) { + uuid_info_count = 0; + } + } - uint64_t task_uniqueid = proc_uniqueid_from_task(task); - boolean_t task64 = task_has_64BitAddr(task); - boolean_t have_map = (task->map != NULL) && (ml_validate_nofault((vm_offset_t)(task->map), sizeof(struct _vm_map))); - boolean_t have_pmap = have_map && (task->map->pmap != NULL) && (ml_validate_nofault((vm_offset_t)(task->map->pmap), sizeof(struct pmap))); + if (have_pmap && task_pid == 0) { + if (save_kextloadinfo_p && ml_validate_nofault((vm_offset_t)(gLoadedKextSummaries), sizeof(OSKextLoadedKextSummaryHeader))) { + uuid_info_count = gLoadedKextSummaries->numSummaries + 1; /* include main kernel UUID */ + } else { + uuid_info_count = 1; /* include kernelcache UUID (embedded) or kernel UUID (desktop) */ + } + } - /* add task snapshot marker */ - kcd_exit_on_error(kcdata_add_container_marker(stackshot_kcdata_p, KCDATA_TYPE_CONTAINER_BEGIN, STACKSHOT_KCCONTAINER_TASK, task_uniqueid)); + if (task_pid > 0 && uuid_info_count > 0 && uuid_info_count < MAX_LOADINFOS) { + if (minimize_uuids && uuid_info_timestamp != 0 && uuid_info_timestamp < stack_snapshot_delta_since_timestamp) + goto error_exit; - /* add task_snapshot_v2 struct data */ - kcd_exit_on_error(kcdata_get_memory_addr(stackshot_kcdata_p, STACKSHOT_KCTYPE_TASK_SNAPSHOT, sizeof(struct task_snapshot_v2), &out_addr)); - cur_tsnap = (struct task_snapshot_v2 *)out_addr; - bzero(cur_tsnap, sizeof(struct task_snapshot_v2)); + uint32_t uuid_info_size = (uint32_t)(task64 ? sizeof(struct user64_dyld_uuid_info) : sizeof(struct user32_dyld_uuid_info)); + uint32_t uuid_info_array_size = uuid_info_count * uuid_info_size; - cur_tsnap->ts_pid = task_pid; - cur_tsnap->ts_unique_pid = task_uniqueid; + kcd_exit_on_error(kcdata_get_memory_addr_for_array(kcd, (task64 ? 
KCDATA_TYPE_LIBRARY_LOADINFO64 : KCDATA_TYPE_LIBRARY_LOADINFO), + uuid_info_size, uuid_info_count, &out_addr)); - /* Add the BSD process identifiers */ - if (task_pid != -1 && task->bsd_info != NULL) - proc_name_kdp(task, cur_tsnap->ts_p_comm, sizeof(cur_tsnap->ts_p_comm)); - else { - cur_tsnap->ts_p_comm[0] = '\0'; -#if IMPORTANCE_INHERITANCE && (DEVELOPMENT || DEBUG) - if (task->task_imp_base != NULL) { - strlcpy(cur_tsnap->ts_p_comm, &task->task_imp_base->iit_procname[0], - MIN((int)sizeof(task->task_imp_base->iit_procname), (int)sizeof(cur_tsnap->ts_p_comm))); - } -#endif - } + /* Copy in the UUID info array + * It may be nonresident, in which case just fix up nloadinfos to 0 in the task_snap + */ + if (have_pmap && !kdp_copyin(task->map, uuid_info_addr, (void *)out_addr, uuid_info_array_size, should_fault, &kdp_fault_results)) { + bzero((void *)out_addr, uuid_info_array_size); + } - if (task64) - cur_tsnap->ts_ss_flags |= kUser64_p; - if (!task->active || task_is_a_corpse(task)) - cur_tsnap->ts_ss_flags |= kTerminatedSnapshot; - if (task->pidsuspended) - cur_tsnap->ts_ss_flags |= kPidSuspended; - if (task->frozen) - cur_tsnap->ts_ss_flags |= kFrozen; - if (task->effective_policy.darwinbg == 1) - cur_tsnap->ts_ss_flags |= kTaskDarwinBG; - if (task->requested_policy.t_role == TASK_FOREGROUND_APPLICATION) - cur_tsnap->ts_ss_flags |= kTaskIsForeground; - if (task->requested_policy.t_boosted == 1) - cur_tsnap->ts_ss_flags |= kTaskIsBoosted; - if (task->effective_policy.t_sup_active == 1) - cur_tsnap->ts_ss_flags |= kTaskIsSuppressed; + } else if (task_pid == 0 && uuid_info_count > 0 && uuid_info_count < MAX_LOADINFOS) { + if (minimize_uuids && gLoadedKextSummaries != 0 && gLoadedKextSummariesTimestamp < stack_snapshot_delta_since_timestamp) + goto error_exit; -#if IMPORTANCE_INHERITANCE - if (task->task_imp_base) { - if (task->task_imp_base->iit_donor) - cur_tsnap->ts_ss_flags |= kTaskIsImpDonor; - if (task->task_imp_base->iit_live_donor) - cur_tsnap->ts_ss_flags 
|= kTaskIsLiveImpDonor; + uintptr_t image_load_address; + + do { + + + if (!kernel_uuid || !ml_validate_nofault((vm_offset_t)kernel_uuid, sizeof(uuid_t))) { + /* Kernel UUID not found or inaccessible */ + break; } -#endif - cur_tsnap->ts_latency_qos = (task->effective_policy.t_latency_qos == LATENCY_QOS_TIER_UNSPECIFIED) ? - LATENCY_QOS_TIER_UNSPECIFIED : ((0xFF << 16) | task->effective_policy.t_latency_qos); - cur_tsnap->ts_suspend_count = task->suspend_count; - cur_tsnap->ts_p_start_sec = 0; - proc_starttime_kdp(task->bsd_info, &cur_tsnap->ts_p_start_sec, NULL); - - cur_tsnap->ts_task_size = have_pmap ? (pmap_resident_count(task->map->pmap) * PAGE_SIZE) : 0; - cur_tsnap->ts_max_resident_size = get_task_resident_max(task); - cur_tsnap->ts_faults = task->faults; - cur_tsnap->ts_pageins = task->pageins; - cur_tsnap->ts_cow_faults = task->cow_faults; - cur_tsnap->ts_user_time_in_terminated_threads = task->total_user_time; - cur_tsnap->ts_system_time_in_terminated_threads = task->total_system_time; - cur_tsnap->ts_was_throttled = (uint32_t) proc_was_throttled_from_task(task); - cur_tsnap->ts_did_throttle = (uint32_t) proc_did_throttle_from_task(task); - - /* Check for shared cache information */ - do { - uint8_t shared_cache_identifier[16]; - uint64_t shared_cache_slide; - uint64_t shared_cache_base_address = 0; - boolean_t found_shared_cache_info = TRUE; - - if (task->shared_region && ml_validate_nofault((vm_offset_t)task->shared_region, sizeof(struct vm_shared_region))) { - struct vm_shared_region *sr = task->shared_region; - shared_cache_base_address = sr->sr_base_address + sr->sr_first_mapping; + kcd_exit_on_error(kcdata_get_memory_addr_for_array( + kcd, (sizeof(kernel_uuid_info) == sizeof(struct user64_dyld_uuid_info)) ? 
KCDATA_TYPE_LIBRARY_LOADINFO64 + : KCDATA_TYPE_LIBRARY_LOADINFO, + sizeof(kernel_uuid_info), uuid_info_count, &out_addr)); + kernel_uuid_info *uuid_info_array = (kernel_uuid_info *)out_addr; + image_load_address = (uintptr_t)VM_KERNEL_UNSLIDE(vm_kernel_stext); + uuid_info_array[0].imageLoadAddress = image_load_address; + stackshot_memcpy(&uuid_info_array[0].imageUUID, kernel_uuid, sizeof(uuid_t)); + + if (save_kextloadinfo_p && + ml_validate_nofault((vm_offset_t)(gLoadedKextSummaries), sizeof(OSKextLoadedKextSummaryHeader)) && + ml_validate_nofault((vm_offset_t)(&gLoadedKextSummaries->summaries[0]), + gLoadedKextSummaries->entry_size * gLoadedKextSummaries->numSummaries)) { + uint32_t kexti; + for (kexti=0 ; kexti < gLoadedKextSummaries->numSummaries; kexti++) { + image_load_address = (uintptr_t)VM_KERNEL_UNSLIDE(gLoadedKextSummaries->summaries[kexti].address); + uuid_info_array[kexti + 1].imageLoadAddress = image_load_address; + stackshot_memcpy(&uuid_info_array[kexti + 1].imageUUID, &gLoadedKextSummaries->summaries[kexti].uuid, sizeof(uuid_t)); } + } + } while(0); + } - if (!shared_cache_base_address || - !kdp_copyin(task->map->pmap, shared_cache_base_address + offsetof(struct _dyld_cache_header, uuid), shared_cache_identifier, sizeof(shared_cache_identifier)) - ) { - found_shared_cache_info = FALSE; - } +error_exit: + if (kdp_fault_results & KDP_FAULT_RESULT_PAGED_OUT) { + *task_snap_ss_flags |= kTaskUUIDInfoMissing; + } - if (task->shared_region) { - /* - * No refcounting here, but we are in debugger - * context, so that should be safe. 
- */ - shared_cache_slide = task->shared_region->sr_slide_info.slide; - } else { - shared_cache_slide = 0; - } + if (kdp_fault_results & KDP_FAULT_RESULT_TRIED_FAULT) { + *task_snap_ss_flags |= kTaskUUIDInfoTriedFault; + } - if (found_shared_cache_info == FALSE) - break; + if (kdp_fault_results & KDP_FAULT_RESULT_FAULTED_IN) { + *task_snap_ss_flags |= kTaskUUIDInfoFaultedIn; + } - if (task_pid == 1) { - /* save launchd's shared cache info as system level */ - bcopy(shared_cache_identifier, sys_shared_cache_loadinfo->imageUUID, sizeof(sys_shared_cache_loadinfo->imageUUID)); - sys_shared_cache_loadinfo->imageLoadAddress = shared_cache_slide; - break; - } else { - if (shared_cache_slide == sys_shared_cache_loadinfo->imageLoadAddress && - 0 == memcmp(shared_cache_identifier, sys_shared_cache_loadinfo->imageUUID, sizeof(sys_shared_cache_loadinfo->imageUUID))) { - /* skip adding shared cache info. its same as system level one */ - break; - } - } + return error; +} - kcd_exit_on_error(kcdata_get_memory_addr(stackshot_kcdata_p, STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO, sizeof(struct dyld_uuid_info_64), &out_addr)); - struct dyld_uuid_info_64 *shared_cache_data = (struct dyld_uuid_info_64 *)out_addr; - shared_cache_data->imageLoadAddress = shared_cache_slide; - bcopy(shared_cache_identifier, shared_cache_data->imageUUID, sizeof(shared_cache_data->imageUUID)); - - } while(0); - - /* I/O Statistics if any counters are non zero */ - assert(IO_NUM_PRIORITIES == STACKSHOT_IO_NUM_PRIORITIES); - if (task->task_io_stats && !memory_iszero(task->task_io_stats, sizeof(struct io_stat_info))) { - kcd_exit_on_error(kcdata_get_memory_addr(stackshot_kcdata_p, STACKSHOT_KCTYPE_IOSTATS, sizeof(struct io_stats_snapshot), &out_addr)); - struct io_stats_snapshot *_iostat = (struct io_stats_snapshot *)out_addr; - _iostat->ss_disk_reads_count = task->task_io_stats->disk_reads.count; - _iostat->ss_disk_reads_size = task->task_io_stats->disk_reads.size; - _iostat->ss_disk_writes_count = 
(task->task_io_stats->total_io.count - task->task_io_stats->disk_reads.count); - _iostat->ss_disk_writes_size = (task->task_io_stats->total_io.size - task->task_io_stats->disk_reads.size); - _iostat->ss_paging_count = task->task_io_stats->paging.count; - _iostat->ss_paging_size = task->task_io_stats->paging.size; - _iostat->ss_non_paging_count = (task->task_io_stats->total_io.count - task->task_io_stats->paging.count); - _iostat->ss_non_paging_size = (task->task_io_stats->total_io.size - task->task_io_stats->paging.size); - _iostat->ss_metadata_count = task->task_io_stats->metadata.count; - _iostat->ss_metadata_size = task->task_io_stats->metadata.size; - _iostat->ss_data_count = (task->task_io_stats->total_io.count - task->task_io_stats->metadata.count); - _iostat->ss_data_size = (task->task_io_stats->total_io.size - task->task_io_stats->metadata.size); - for(int i = 0; i < IO_NUM_PRIORITIES; i++) { - _iostat->ss_io_priority_count[i] = task->task_io_stats->io_priority[i].count; - _iostat->ss_io_priority_size[i] = task->task_io_stats->io_priority[i].size; - } - } +static kern_return_t +kcdata_record_task_iostats(kcdata_descriptor_t kcd, task_t task) +{ + kern_return_t error = KERN_SUCCESS; + mach_vm_address_t out_addr = 0; -#if IMPORTANCE_INHERITANCE - if (save_donating_pids_p) { - kcd_exit_on_error(((((mach_vm_address_t) kcd_end_address(stackshot_kcdata_p) + (TASK_IMP_WALK_LIMIT * sizeof(int32_t))) - < (mach_vm_address_t) kcd_max_address(stackshot_kcdata_p)) ? 
KERN_SUCCESS : KERN_RESOURCE_SHORTAGE)); - saved_count = task_importance_list_pids(task, TASK_IMP_LIST_DONATING_PIDS, (void *)kcd_end_address(stackshot_kcdata_p), TASK_IMP_WALK_LIMIT); - if (saved_count > 0) - kcd_exit_on_error(kcdata_get_memory_addr_for_array(stackshot_kcdata_p, STASKSHOT_KCTYPE_DONATING_PIDS, sizeof(int32_t), saved_count, &out_addr)); - } + /* I/O Statistics if any counters are non zero */ + assert(IO_NUM_PRIORITIES == STACKSHOT_IO_NUM_PRIORITIES); + if (task->task_io_stats && !memory_iszero(task->task_io_stats, sizeof(struct io_stat_info))) { + kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_IOSTATS, sizeof(struct io_stats_snapshot), &out_addr)); + struct io_stats_snapshot *_iostat = (struct io_stats_snapshot *)out_addr; + _iostat->ss_disk_reads_count = task->task_io_stats->disk_reads.count; + _iostat->ss_disk_reads_size = task->task_io_stats->disk_reads.size; + _iostat->ss_disk_writes_count = (task->task_io_stats->total_io.count - task->task_io_stats->disk_reads.count); + _iostat->ss_disk_writes_size = (task->task_io_stats->total_io.size - task->task_io_stats->disk_reads.size); + _iostat->ss_paging_count = task->task_io_stats->paging.count; + _iostat->ss_paging_size = task->task_io_stats->paging.size; + _iostat->ss_non_paging_count = (task->task_io_stats->total_io.count - task->task_io_stats->paging.count); + _iostat->ss_non_paging_size = (task->task_io_stats->total_io.size - task->task_io_stats->paging.size); + _iostat->ss_metadata_count = task->task_io_stats->metadata.count; + _iostat->ss_metadata_size = task->task_io_stats->metadata.size; + _iostat->ss_data_count = (task->task_io_stats->total_io.count - task->task_io_stats->metadata.count); + _iostat->ss_data_size = (task->task_io_stats->total_io.size - task->task_io_stats->metadata.size); + for(int i = 0; i < IO_NUM_PRIORITIES; i++) { + _iostat->ss_io_priority_count[i] = task->task_io_stats->io_priority[i].count; + _iostat->ss_io_priority_size[i] = 
task->task_io_stats->io_priority[i].size; + } + } + +error_exit: + return error; +} + +static kern_return_t +kcdata_record_task_snapshot(kcdata_descriptor_t kcd, task_t task, uint32_t trace_flags, boolean_t have_pmap, uint64_t **task_snap_ss_flags) +{ + boolean_t collect_delta_stackshot = ((trace_flags & STACKSHOT_COLLECT_DELTA_SNAPSHOT) != 0); + boolean_t collect_iostats = !collect_delta_stackshot && !(trace_flags & STACKSHOT_TAILSPIN) && !(trace_flags & STACKSHOT_NO_IO_STATS); + + kern_return_t error = KERN_SUCCESS; + mach_vm_address_t out_addr = 0; + struct task_snapshot_v2 * cur_tsnap = NULL; + + assert(task_snap_ss_flags != NULL); + + int task_pid = pid_from_task(task); + uint64_t task_uniqueid = get_task_uniqueid(task); + + kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_TASK_SNAPSHOT, sizeof(struct task_snapshot_v2), &out_addr)); + + cur_tsnap = (struct task_snapshot_v2 *)out_addr; + + cur_tsnap->ts_unique_pid = task_uniqueid; + cur_tsnap->ts_ss_flags = kcdata_get_task_ss_flags(task); + *task_snap_ss_flags = &cur_tsnap->ts_ss_flags; + cur_tsnap->ts_user_time_in_terminated_threads = task->total_user_time; + cur_tsnap->ts_system_time_in_terminated_threads = task->total_system_time; + + cur_tsnap->ts_p_start_sec = 0; + proc_starttime_kdp(task->bsd_info, &cur_tsnap->ts_p_start_sec, NULL, NULL); + + cur_tsnap->ts_task_size = have_pmap ? (pmap_resident_count(task->map->pmap) * PAGE_SIZE) : 0; + cur_tsnap->ts_max_resident_size = get_task_resident_max(task); + cur_tsnap->ts_suspend_count = task->suspend_count; + cur_tsnap->ts_faults = task->faults; + cur_tsnap->ts_pageins = task->pageins; + cur_tsnap->ts_cow_faults = task->cow_faults; + cur_tsnap->ts_was_throttled = (uint32_t) proc_was_throttled_from_task(task); + cur_tsnap->ts_did_throttle = (uint32_t) proc_did_throttle_from_task(task); + cur_tsnap->ts_latency_qos = (task->effective_policy.tep_latency_qos == LATENCY_QOS_TIER_UNSPECIFIED) ? 
+ LATENCY_QOS_TIER_UNSPECIFIED : ((0xFF << 16) | task->effective_policy.tep_latency_qos); + cur_tsnap->ts_pid = task_pid; + + /* Add the BSD process identifiers */ + if (task_pid != -1 && task->bsd_info != NULL) + proc_name_kdp(task, cur_tsnap->ts_p_comm, sizeof(cur_tsnap->ts_p_comm)); + else { + cur_tsnap->ts_p_comm[0] = '\0'; +#if IMPORTANCE_INHERITANCE && (DEVELOPMENT || DEBUG) + if (task->task_imp_base != NULL) { + stackshot_strlcpy(cur_tsnap->ts_p_comm, &task->task_imp_base->iit_procname[0], + MIN((int)sizeof(task->task_imp_base->iit_procname), (int)sizeof(cur_tsnap->ts_p_comm))); + } #endif + } - /* place load info and libraries now */ - uint32_t uuid_info_count = 0; - mach_vm_address_t uuid_info_addr = 0; - if (save_loadinfo_p && have_pmap && task->active && task_pid > 0) { - /* Read the dyld_all_image_infos struct from the task memory to get UUID array count and location */ - if (task64) { - struct user64_dyld_all_image_infos task_image_infos; - if (kdp_copyin(task->map->pmap, task->all_image_info_addr, &task_image_infos, sizeof(struct user64_dyld_all_image_infos))) { - uuid_info_count = (uint32_t)task_image_infos.uuidArrayCount; - uuid_info_addr = task_image_infos.uuidArray; - } - } else { - struct user32_dyld_all_image_infos task_image_infos; - if (kdp_copyin(task->map->pmap, task->all_image_info_addr, &task_image_infos, sizeof(struct user32_dyld_all_image_infos))) { - uuid_info_count = task_image_infos.uuidArrayCount; - uuid_info_addr = task_image_infos.uuidArray; - } - } + if (collect_iostats) { + kcd_exit_on_error(kcdata_record_task_iostats(kcd, task)); + } - /* - * If we get a NULL uuid_info_addr (which can happen when we catch dyld in the middle of updating - * this data structure), we zero the uuid_info_count so that we won't even try to save load info - * for this task. 
- */ - if (!uuid_info_addr) { - uuid_info_count = 0; - } - } +error_exit: + return error; +} - if (have_pmap && task_pid == 0) { - if (save_kextloadinfo_p && ml_validate_nofault((vm_offset_t)(gLoadedKextSummaries), sizeof(OSKextLoadedKextSummaryHeader))) { - uuid_info_count = gLoadedKextSummaries->numSummaries + 1; /* include main kernel UUID */ - } else { - uuid_info_count = 1; /* atleast include kernel uuid */ - } - } +static kern_return_t +kcdata_record_task_delta_snapshot(kcdata_descriptor_t kcd, task_t task, boolean_t have_pmap, uint64_t **task_snap_ss_flags) +{ + kern_return_t error = KERN_SUCCESS; + struct task_delta_snapshot_v2 * cur_tsnap = NULL; + mach_vm_address_t out_addr = 0; - if (task_pid > 0 && uuid_info_count > 0 && uuid_info_count < MAX_LOADINFOS) { - uint32_t uuid_info_size = (uint32_t)(task64 ? sizeof(struct user64_dyld_uuid_info) : sizeof(struct user32_dyld_uuid_info)); - uint32_t uuid_info_array_size = uuid_info_count * uuid_info_size; + uint64_t task_uniqueid = get_task_uniqueid(task); + assert(task_snap_ss_flags != NULL); - kcd_exit_on_error(kcdata_get_memory_addr_for_array(stackshot_kcdata_p, - (task64 ? 
KCDATA_TYPE_LIBRARY_LOADINFO64 : KCDATA_TYPE_LIBRARY_LOADINFO), - uuid_info_size, - uuid_info_count, - &out_addr)); + kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_TASK_DELTA_SNAPSHOT, sizeof(struct task_delta_snapshot_v2), &out_addr)); + cur_tsnap = (struct task_delta_snapshot_v2 *)out_addr; - /* Copy in the UUID info array - * It may be nonresident, in which case just fix up nloadinfos to 0 in the task_snap - */ - if (have_pmap && !kdp_copyin(task->map->pmap, uuid_info_addr, (void *)out_addr, uuid_info_array_size)) { - bzero((void *)out_addr, uuid_info_array_size); - } + cur_tsnap->tds_unique_pid = task_uniqueid; + cur_tsnap->tds_ss_flags = kcdata_get_task_ss_flags(task); + *task_snap_ss_flags = &cur_tsnap->tds_ss_flags; - } else if (task_pid == 0 && uuid_info_count > 0 && uuid_info_count < MAX_LOADINFOS) { - uintptr_t image_load_address; + cur_tsnap->tds_user_time_in_terminated_threads = task->total_user_time; + cur_tsnap->tds_system_time_in_terminated_threads = task->total_system_time; - do { - if (!kernel_uuid || !ml_validate_nofault((vm_offset_t)kernel_uuid, sizeof(uuid_t))) { - /* Kernel UUID not found or inaccessible */ - break; - } - kcd_exit_on_error(kcdata_get_memory_addr_for_array(stackshot_kcdata_p, - (sizeof(kernel_uuid_info) == sizeof(struct user64_dyld_uuid_info))? 
KCDATA_TYPE_LIBRARY_LOADINFO64: KCDATA_TYPE_LIBRARY_LOADINFO, - sizeof(kernel_uuid_info), uuid_info_count, &out_addr) - ); - kernel_uuid_info *uuid_info_array = (kernel_uuid_info *)out_addr; - image_load_address = (uintptr_t)VM_KERNEL_UNSLIDE(vm_kernel_stext); - uuid_info_array[0].imageLoadAddress = image_load_address; - memcpy(&uuid_info_array[0].imageUUID, kernel_uuid, sizeof(uuid_t)); - - if (save_kextloadinfo_p && ml_validate_nofault((vm_offset_t)(&gLoadedKextSummaries->summaries[0]), - gLoadedKextSummaries->entry_size * gLoadedKextSummaries->numSummaries)) { - uint32_t kexti; - for (kexti=0 ; kexti < gLoadedKextSummaries->numSummaries; kexti++) { - image_load_address = (uintptr_t)VM_KERNEL_UNSLIDE(gLoadedKextSummaries->summaries[kexti].address); - uuid_info_array[kexti + 1].imageLoadAddress = image_load_address; - memcpy(&uuid_info_array[kexti + 1].imageUUID, &gLoadedKextSummaries->summaries[kexti].uuid, sizeof(uuid_t)); - } - } - } while(0); - } + cur_tsnap->tds_task_size = have_pmap ? (pmap_resident_count(task->map->pmap) * PAGE_SIZE) : 0; - /* Iterate over task threads */ - queue_iterate(&task->threads, thread, thread_t, task_threads){ - uint64_t tval; - uint64_t thread_uniqueid = 0; - char cur_thread_name[STACKSHOT_MAX_THREAD_NAME_SIZE]; + cur_tsnap->tds_max_resident_size = get_task_resident_max(task); + cur_tsnap->tds_suspend_count = task->suspend_count; + cur_tsnap->tds_faults = task->faults; + cur_tsnap->tds_pageins = task->pageins; + cur_tsnap->tds_cow_faults = task->cow_faults; + cur_tsnap->tds_was_throttled = (uint32_t)proc_was_throttled_from_task(task); + cur_tsnap->tds_did_throttle = (uint32_t)proc_did_throttle_from_task(task); + cur_tsnap->tds_latency_qos = (task-> effective_policy.tep_latency_qos == LATENCY_QOS_TIER_UNSPECIFIED) + ? 
LATENCY_QOS_TIER_UNSPECIFIED + : ((0xFF << 16) | task-> effective_policy.tep_latency_qos); - if ((thread == NULL) || !ml_validate_nofault((vm_offset_t) thread, sizeof(struct thread))) - goto error_exit; +error_exit: + return error; +} - if (!save_userframes_p && thread->kernel_stack == 0) - continue; +static kern_return_t +kcdata_record_thread_iostats(kcdata_descriptor_t kcd, thread_t thread) +{ + kern_return_t error = KERN_SUCCESS; + mach_vm_address_t out_addr = 0; - thread_uniqueid = thread_tid(thread); + /* I/O Statistics */ + assert(IO_NUM_PRIORITIES == STACKSHOT_IO_NUM_PRIORITIES); + if (thread->thread_io_stats && !memory_iszero(thread->thread_io_stats, sizeof(struct io_stat_info))) { + kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_IOSTATS, sizeof(struct io_stats_snapshot), &out_addr)); + struct io_stats_snapshot *_iostat = (struct io_stats_snapshot *)out_addr; + _iostat->ss_disk_reads_count = thread->thread_io_stats->disk_reads.count; + _iostat->ss_disk_reads_size = thread->thread_io_stats->disk_reads.size; + _iostat->ss_disk_writes_count = (thread->thread_io_stats->total_io.count - thread->thread_io_stats->disk_reads.count); + _iostat->ss_disk_writes_size = (thread->thread_io_stats->total_io.size - thread->thread_io_stats->disk_reads.size); + _iostat->ss_paging_count = thread->thread_io_stats->paging.count; + _iostat->ss_paging_size = thread->thread_io_stats->paging.size; + _iostat->ss_non_paging_count = (thread->thread_io_stats->total_io.count - thread->thread_io_stats->paging.count); + _iostat->ss_non_paging_size = (thread->thread_io_stats->total_io.size - thread->thread_io_stats->paging.size); + _iostat->ss_metadata_count = thread->thread_io_stats->metadata.count; + _iostat->ss_metadata_size = thread->thread_io_stats->metadata.size; + _iostat->ss_data_count = (thread->thread_io_stats->total_io.count - thread->thread_io_stats->metadata.count); + _iostat->ss_data_size = (thread->thread_io_stats->total_io.size - 
thread->thread_io_stats->metadata.size); + for(int i = 0; i < IO_NUM_PRIORITIES; i++) { + _iostat->ss_io_priority_count[i] = thread->thread_io_stats->io_priority[i].count; + _iostat->ss_io_priority_size[i] = thread->thread_io_stats->io_priority[i].size; + } + } - /* add thread marker */ - kcd_exit_on_error(kcdata_add_container_marker(stackshot_kcdata_p, KCDATA_TYPE_CONTAINER_BEGIN, STACKSHOT_KCCONTAINER_THREAD, thread_uniqueid)); - kcd_exit_on_error(kcdata_get_memory_addr(stackshot_kcdata_p, STACKSHOT_KCTYPE_THREAD_SNAPSHOT, sizeof(struct thread_snapshot_v2), &out_addr)); - struct thread_snapshot_v2 * cur_thread_snap = (struct thread_snapshot_v2 *)out_addr; - - /* Populate the thread snapshot header */ - cur_thread_snap->ths_thread_id = thread_uniqueid; - cur_thread_snap->ths_state = thread->state; - cur_thread_snap->ths_ss_flags = 0; - cur_thread_snap->ths_base_priority = thread->base_pri; - cur_thread_snap->ths_sched_priority = thread->sched_pri; - cur_thread_snap->ths_sched_flags = thread->sched_flags; - cur_thread_snap->ths_wait_event = VM_KERNEL_UNSLIDE_OR_PERM(thread->wait_event); - cur_thread_snap->ths_continuation = VM_KERNEL_UNSLIDE(thread->continuation); - cur_thread_snap->ths_last_run_time = thread->last_run_time; - cur_thread_snap->ths_last_made_runnable_time = thread->last_made_runnable_time; - cur_thread_snap->ths_io_tier = proc_get_effective_thread_policy(thread, TASK_POLICY_IO); - cur_thread_snap->ths_eqos = thread->effective_policy.thep_qos; - cur_thread_snap->ths_rqos = thread->requested_policy.thrp_qos; - cur_thread_snap->ths_rqos_override = thread->requested_policy.thrp_qos_override; - cur_thread_snap->ths_total_syscalls = thread->syscalls_mach + thread->syscalls_unix; - cur_thread_snap->ths_dqserialnum = 0; - - tval = safe_grab_timer_value(&thread->user_timer); - cur_thread_snap->ths_user_time = tval; - tval = safe_grab_timer_value(&thread->system_timer); - - if (thread->precise_user_kernel_time) { - cur_thread_snap->ths_sys_time = tval; - } 
else { - cur_thread_snap->ths_user_time += tval; - cur_thread_snap->ths_sys_time = 0; - } +error_exit: + return error; +} - if (thread->effective_policy.darwinbg) - cur_thread_snap->ths_ss_flags |= kThreadDarwinBG; - if (proc_get_effective_thread_policy(thread, TASK_POLICY_PASSIVE_IO)) - cur_thread_snap->ths_ss_flags |= kThreadIOPassive; - if (thread->suspend_count > 0) - cur_thread_snap->ths_ss_flags |= kThreadSuspended; +static kern_return_t +kcdata_record_thread_snapshot( + kcdata_descriptor_t kcd, thread_t thread, task_t task, uint32_t trace_flags, boolean_t have_pmap, boolean_t thread_on_core) +{ + boolean_t dispatch_p = ((trace_flags & STACKSHOT_GET_DQ) != 0); + boolean_t active_kthreads_only_p = ((trace_flags & STACKSHOT_ACTIVE_KERNEL_THREADS_ONLY) != 0); + boolean_t trace_fp_p = ((trace_flags & STACKSHOT_TAILSPIN) == 0); + boolean_t collect_delta_stackshot = ((trace_flags & STACKSHOT_COLLECT_DELTA_SNAPSHOT) != 0); + boolean_t collect_iostats = !collect_delta_stackshot && !(trace_flags & STACKSHOT_TAILSPIN) && !(trace_flags & STACKSHOT_NO_IO_STATS); - if (thread->options & TH_OPT_GLOBAL_FORCED_IDLE) { - cur_thread_snap->ths_ss_flags |= kGlobalForcedIdle; + kern_return_t error = KERN_SUCCESS; + mach_vm_address_t out_addr = 0; + int saved_count = 0; + + struct thread_snapshot_v3 * cur_thread_snap = NULL; + char cur_thread_name[STACKSHOT_MAX_THREAD_NAME_SIZE]; + uint64_t tval = 0; + boolean_t task64 = task_has_64BitAddr(task); + + kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_THREAD_SNAPSHOT, sizeof(struct thread_snapshot_v3), &out_addr)); + cur_thread_snap = (struct thread_snapshot_v3 *)out_addr; + + /* Populate the thread snapshot header */ + cur_thread_snap->ths_thread_id = thread_tid(thread); + cur_thread_snap->ths_wait_event = VM_KERNEL_UNSLIDE_OR_PERM(thread->wait_event); + cur_thread_snap->ths_continuation = VM_KERNEL_UNSLIDE(thread->continuation); + cur_thread_snap->ths_total_syscalls = thread->syscalls_mach + thread->syscalls_unix; + + 
if (IPC_VOUCHER_NULL != thread->ith_voucher) + cur_thread_snap->ths_voucher_identifier = VM_KERNEL_ADDRPERM(thread->ith_voucher); + else + cur_thread_snap->ths_voucher_identifier = 0; + + cur_thread_snap->ths_dqserialnum = 0; + if (dispatch_p && (task != kernel_task) && (task->active) && have_pmap) { + uint64_t dqkeyaddr = thread_dispatchqaddr(thread); + if (dqkeyaddr != 0) { + uint64_t dqaddr = 0; + boolean_t copyin_ok = kdp_copyin_word(task, dqkeyaddr, &dqaddr, FALSE, NULL); + if (copyin_ok && dqaddr != 0) { + uint64_t dqserialnumaddr = dqaddr + get_task_dispatchqueue_serialno_offset(task); + uint64_t dqserialnum = 0; + copyin_ok = kdp_copyin_word(task, dqserialnumaddr, &dqserialnum, FALSE, NULL); + if (copyin_ok) { + cur_thread_snap->ths_ss_flags |= kHasDispatchSerial; + cur_thread_snap->ths_dqserialnum = dqserialnum; } + } + } + } - if (IPC_VOUCHER_NULL != thread->ith_voucher) - cur_thread_snap->ths_voucher_identifier = VM_KERNEL_ADDRPERM(thread->ith_voucher); - if (dispatch_p && (task != kernel_task) && (task->active) && have_pmap) { - uint64_t dqkeyaddr = thread_dispatchqaddr(thread); - if (dqkeyaddr != 0) { - uint64_t dqaddr = 0; - if (kdp_copyin(task->map->pmap, dqkeyaddr, &dqaddr, (task64 ? 8 : 4)) && (dqaddr != 0)) { - uint64_t dqserialnumaddr = dqaddr + proc_dispatchqueue_serialno_offset_from_task(task); - uint64_t dqserialnum = 0; - if (kdp_copyin(task->map->pmap, dqserialnumaddr, &dqserialnum, (task64 ? 
8 : 4))) { - cur_thread_snap->ths_ss_flags |= kHasDispatchSerial; - cur_thread_snap->ths_dqserialnum = dqserialnum; - } - } - } - } + tval = safe_grab_timer_value(&thread->user_timer); + cur_thread_snap->ths_user_time = tval; + tval = safe_grab_timer_value(&thread->system_timer); - /* if there is thread name then add to buffer */ - cur_thread_name[0] = '\0'; - proc_threadname_kdp(thread->uthread, cur_thread_name, STACKSHOT_MAX_THREAD_NAME_SIZE); - if (strnlen(cur_thread_name, STACKSHOT_MAX_THREAD_NAME_SIZE) > 0) { - kcd_exit_on_error(kcdata_get_memory_addr(stackshot_kcdata_p, STACKSHOT_KCTYPE_THREAD_NAME, sizeof(cur_thread_name), &out_addr)); - bcopy((void *)cur_thread_name, (void *)out_addr, sizeof(cur_thread_name)); - } + if (thread->precise_user_kernel_time) { + cur_thread_snap->ths_sys_time = tval; + } else { + cur_thread_snap->ths_user_time += tval; + cur_thread_snap->ths_sys_time = 0; + } - /* I/O Statistics */ - assert(IO_NUM_PRIORITIES == STACKSHOT_IO_NUM_PRIORITIES); - if (thread->thread_io_stats && !memory_iszero(thread->thread_io_stats, sizeof(struct io_stat_info))) { - kcd_exit_on_error(kcdata_get_memory_addr(stackshot_kcdata_p, STACKSHOT_KCTYPE_IOSTATS, sizeof(struct io_stats_snapshot), &out_addr)); - struct io_stats_snapshot *_iostat = (struct io_stats_snapshot *)out_addr; - _iostat->ss_disk_reads_count = thread->thread_io_stats->disk_reads.count; - _iostat->ss_disk_reads_size = thread->thread_io_stats->disk_reads.size; - _iostat->ss_disk_writes_count = (thread->thread_io_stats->total_io.count - thread->thread_io_stats->disk_reads.count); - _iostat->ss_disk_writes_size = (thread->thread_io_stats->total_io.size - thread->thread_io_stats->disk_reads.size); - _iostat->ss_paging_count = thread->thread_io_stats->paging.count; - _iostat->ss_paging_size = thread->thread_io_stats->paging.size; - _iostat->ss_non_paging_count = (thread->thread_io_stats->total_io.count - thread->thread_io_stats->paging.count); - _iostat->ss_non_paging_size = 
(thread->thread_io_stats->total_io.size - thread->thread_io_stats->paging.size); - _iostat->ss_metadata_count = thread->thread_io_stats->metadata.count; - _iostat->ss_metadata_size = thread->thread_io_stats->metadata.size; - _iostat->ss_data_count = (thread->thread_io_stats->total_io.count - thread->thread_io_stats->metadata.count); - _iostat->ss_data_size = (thread->thread_io_stats->total_io.size - thread->thread_io_stats->metadata.size); - for(int i = 0; i < IO_NUM_PRIORITIES; i++) { - _iostat->ss_io_priority_count[i] = thread->thread_io_stats->io_priority[i].count; - _iostat->ss_io_priority_size[i] = thread->thread_io_stats->io_priority[i].size; - } - } + cur_thread_snap->ths_ss_flags = 0; + if (thread->effective_policy.thep_darwinbg) + cur_thread_snap->ths_ss_flags |= kThreadDarwinBG; + if (proc_get_effective_thread_policy(thread, TASK_POLICY_PASSIVE_IO)) + cur_thread_snap->ths_ss_flags |= kThreadIOPassive; + if (thread->suspend_count > 0) + cur_thread_snap->ths_ss_flags |= kThreadSuspended; + if (thread->options & TH_OPT_GLOBAL_FORCED_IDLE) + cur_thread_snap->ths_ss_flags |= kGlobalForcedIdle; + if (thread_on_core) + cur_thread_snap->ths_ss_flags |= kThreadOnCore; + if (stackshot_thread_is_idle_worker_unsafe(thread)) + cur_thread_snap->ths_ss_flags |= kThreadIdleWorker; + + /* make sure state flags defined in kcdata.h still match internal flags */ + static_assert(SS_TH_WAIT == TH_WAIT); + static_assert(SS_TH_SUSP == TH_SUSP); + static_assert(SS_TH_RUN == TH_RUN); + static_assert(SS_TH_UNINT == TH_UNINT); + static_assert(SS_TH_TERMINATE == TH_TERMINATE); + static_assert(SS_TH_TERMINATE2 == TH_TERMINATE2); + static_assert(SS_TH_IDLE == TH_IDLE); + + cur_thread_snap->ths_last_run_time = thread->last_run_time; + cur_thread_snap->ths_last_made_runnable_time = thread->last_made_runnable_time; + cur_thread_snap->ths_state = thread->state; + cur_thread_snap->ths_sched_flags = thread->sched_flags; + cur_thread_snap->ths_base_priority = thread->base_pri; + 
cur_thread_snap->ths_sched_priority = thread->sched_pri; + cur_thread_snap->ths_eqos = thread->effective_policy.thep_qos; + cur_thread_snap->ths_rqos = thread->requested_policy.thrp_qos; + cur_thread_snap->ths_rqos_override = thread->requested_policy.thrp_qos_override; + cur_thread_snap->ths_io_tier = proc_get_effective_thread_policy(thread, TASK_POLICY_IO); + cur_thread_snap->ths_thread_t = VM_KERNEL_ADDRPERM(thread); + + /* if there is thread name then add to buffer */ + cur_thread_name[0] = '\0'; + proc_threadname_kdp(thread->uthread, cur_thread_name, STACKSHOT_MAX_THREAD_NAME_SIZE); + if (strnlen(cur_thread_name, STACKSHOT_MAX_THREAD_NAME_SIZE) > 0) { + kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_THREAD_NAME, sizeof(cur_thread_name), &out_addr)); + stackshot_memcpy((void *)out_addr, (void *)cur_thread_name, sizeof(cur_thread_name)); + } - /* Trace user stack, if any */ - if (save_userframes_p && task->active && thread->task->map != kernel_map) { - uint32_t thread_snapshot_flags = 0; - /* 64-bit task? 
*/ - if (task_has_64BitAddr(thread->task)) { - out_addr = (mach_vm_address_t)kcd_end_address(stackshot_kcdata_p); - saved_count = machine_trace_thread64(thread, (char *)out_addr, (char *)kcd_max_address(stackshot_kcdata_p), MAX_FRAMES, TRUE, &thread_snapshot_flags); - if (saved_count > 0) { - kcd_exit_on_error(kcdata_get_memory_addr_for_array(stackshot_kcdata_p, - STACKSHOT_KCTYPE_USER_STACKFRAME64, - sizeof(struct stack_snapshot_frame64), - saved_count/sizeof(struct stack_snapshot_frame64), - &out_addr)); - cur_thread_snap->ths_ss_flags |= kUser64_p; - } - } - else { - out_addr = (mach_vm_address_t)kcd_end_address(stackshot_kcdata_p); - saved_count = machine_trace_thread(thread, (char *)out_addr, (char *)kcd_max_address(stackshot_kcdata_p), MAX_FRAMES, TRUE, &thread_snapshot_flags); - if (saved_count > 0) { - kcd_exit_on_error(kcdata_get_memory_addr_for_array(stackshot_kcdata_p, - STACKSHOT_KCTYPE_USER_STACKFRAME, - sizeof(struct stack_snapshot_frame32), - saved_count/sizeof(struct stack_snapshot_frame32), - &out_addr)); - } - } + /* record system and user cpu times */ + time_value_t user_time; + time_value_t system_time; + thread_read_times(thread, &user_time, &system_time); + kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_CPU_TIMES, sizeof(struct stackshot_cpu_times), &out_addr)); + struct stackshot_cpu_times * stackshot_cpu_times = (struct stackshot_cpu_times *)out_addr; + stackshot_cpu_times->user_usec = ((uint64_t)user_time.seconds) * USEC_PER_SEC + user_time.microseconds; + stackshot_cpu_times->system_usec = ((uint64_t)system_time.seconds) * USEC_PER_SEC + system_time.microseconds; + + /* Trace user stack, if any */ + if (!active_kthreads_only_p && task->active && thread->task->map != kernel_map) { + uint32_t thread_snapshot_flags = 0; + /* 64-bit task? 
*/ + if (task64) { + out_addr = (mach_vm_address_t)kcd_end_address(kcd); + saved_count = machine_trace_thread64(thread, (char *)out_addr, (char *)kcd_max_address(kcd), MAX_FRAMES, TRUE, + trace_fp_p, &thread_snapshot_flags); + if (saved_count > 0) { + int frame_size = trace_fp_p ? sizeof(struct stack_snapshot_frame64) : sizeof(uint64_t); + kcd_exit_on_error(kcdata_get_memory_addr_for_array(kcd, trace_fp_p ? STACKSHOT_KCTYPE_USER_STACKFRAME64 + : STACKSHOT_KCTYPE_USER_STACKLR64, + frame_size, saved_count / frame_size, &out_addr)); + cur_thread_snap->ths_ss_flags |= kUser64_p; + } + } else { + out_addr = (mach_vm_address_t)kcd_end_address(kcd); + saved_count = machine_trace_thread(thread, (char *)out_addr, (char *)kcd_max_address(kcd), MAX_FRAMES, TRUE, trace_fp_p, + &thread_snapshot_flags); + if (saved_count > 0) { + int frame_size = trace_fp_p ? sizeof(struct stack_snapshot_frame32) : sizeof(uint32_t); + kcd_exit_on_error(kcdata_get_memory_addr_for_array(kcd, trace_fp_p ? STACKSHOT_KCTYPE_USER_STACKFRAME + : STACKSHOT_KCTYPE_USER_STACKLR, + frame_size, saved_count / frame_size, &out_addr)); + } + } - if (thread_snapshot_flags != 0) { - cur_thread_snap->ths_ss_flags |= thread_snapshot_flags; - } - } + if (thread_snapshot_flags != 0) { + cur_thread_snap->ths_ss_flags |= thread_snapshot_flags; + } + } - /* Call through to the machine specific trace routines - * Frames are added past the snapshot header. - */ - if (thread->kernel_stack != 0) { - uint32_t thread_snapshot_flags = 0; + /* Call through to the machine specific trace routines + * Frames are added past the snapshot header. 
+ */ + if (thread->kernel_stack != 0) { + uint32_t thread_snapshot_flags = 0; #if defined(__LP64__) - out_addr = (mach_vm_address_t)kcd_end_address(stackshot_kcdata_p); - saved_count = machine_trace_thread64(thread, (char *)out_addr, (char *)kcd_max_address(stackshot_kcdata_p), MAX_FRAMES, FALSE, &thread_snapshot_flags); - if (saved_count > 0){ - cur_thread_snap->ths_ss_flags |= kKernel64_p; - kcd_exit_on_error(kcdata_get_memory_addr_for_array(stackshot_kcdata_p, - STACKSHOT_KCTYPE_KERN_STACKFRAME64, - sizeof(struct stack_snapshot_frame64), - saved_count/sizeof(struct stack_snapshot_frame64), - &out_addr)); - } + out_addr = (mach_vm_address_t)kcd_end_address(kcd); + saved_count = machine_trace_thread64(thread, (char *)out_addr, (char *)kcd_max_address(kcd), MAX_FRAMES, FALSE, trace_fp_p, + &thread_snapshot_flags); + if (saved_count > 0) { + int frame_size = trace_fp_p ? sizeof(struct stack_snapshot_frame64) : sizeof(uint64_t); + cur_thread_snap->ths_ss_flags |= kKernel64_p; + kcd_exit_on_error(kcdata_get_memory_addr_for_array(kcd, trace_fp_p ? STACKSHOT_KCTYPE_KERN_STACKFRAME64 + : STACKSHOT_KCTYPE_KERN_STACKLR64, + frame_size, saved_count / frame_size, &out_addr)); + } #else - out_addr = (mach_vm_address_t)kcd_end_address(stackshot_kcdata_p); - saved_count = machine_trace_thread(thread, (char *)out_addr, (char *)kcd_max_address(stackshot_kcdata_p), MAX_FRAMES, FALSE, &thread_snapshot_flags); - if (saved_count > 0) { - kcd_exit_on_error(kcdata_get_memory_addr_for_array(stackshot_kcdata_p, - STACKSHOT_KCTYPE_KERN_STACKFRAME, - sizeof(struct stack_snapshot_frame32), - saved_count/sizeof(struct stack_snapshot_frame32), - &out_addr)); - } + out_addr = (mach_vm_address_t)kcd_end_address(kcd); + saved_count = machine_trace_thread(thread, (char *)out_addr, (char *)kcd_max_address(kcd), MAX_FRAMES, FALSE, trace_fp_p, + &thread_snapshot_flags); + if (saved_count > 0) { + int frame_size = trace_fp_p ? 
sizeof(struct stack_snapshot_frame32) : sizeof(uint32_t); + kcd_exit_on_error( + kcdata_get_memory_addr_for_array(kcd, trace_fp_p ? STACKSHOT_KCTYPE_KERN_STACKFRAME : STACKSHOT_KCTYPE_KERN_STACKLR, + frame_size, saved_count / frame_size, &out_addr)); + } #endif - if (thread_snapshot_flags != 0) { - cur_thread_snap->ths_ss_flags |= thread_snapshot_flags; - } - } - /* mark end of thread snapshot data */ - kcd_exit_on_error(kcdata_add_container_marker(stackshot_kcdata_p, KCDATA_TYPE_CONTAINER_END, STACKSHOT_KCCONTAINER_THREAD, thread_uniqueid)); - } - /* mark end of task snapshot data */ - kcd_exit_on_error(kcdata_add_container_marker(stackshot_kcdata_p, KCDATA_TYPE_CONTAINER_END, STACKSHOT_KCCONTAINER_TASK, task_uniqueid)); + if (thread_snapshot_flags != 0) { + cur_thread_snap->ths_ss_flags |= thread_snapshot_flags; } } - /* === END of populating stackshot data === */ + if (collect_iostats) { + kcd_exit_on_error(kcdata_record_thread_iostats(kcd, thread)); + } - *pBytesTraced = (uint32_t) kcdata_memory_get_used_bytes(stackshot_kcdata_p); error_exit: - /* Release stack snapshot wait indicator */ - kdp_snapshot_postflight(); - return error; } static int -kdp_stackshot(int pid, void *tracebuf, uint32_t tracebuf_size, uint32_t trace_flags, uint32_t *pbytesTraced) +kcdata_record_thread_delta_snapshot(struct thread_delta_snapshot_v2 * cur_thread_snap, thread_t thread, boolean_t thread_on_core) +{ + cur_thread_snap->tds_thread_id = thread_tid(thread); + if (IPC_VOUCHER_NULL != thread->ith_voucher) + cur_thread_snap->tds_voucher_identifier = VM_KERNEL_ADDRPERM(thread->ith_voucher); + else + cur_thread_snap->tds_voucher_identifier = 0; + + cur_thread_snap->tds_ss_flags = 0; + if (thread->effective_policy.thep_darwinbg) + cur_thread_snap->tds_ss_flags |= kThreadDarwinBG; + if (proc_get_effective_thread_policy(thread, TASK_POLICY_PASSIVE_IO)) + cur_thread_snap->tds_ss_flags |= kThreadIOPassive; + if (thread->suspend_count > 0) + cur_thread_snap->tds_ss_flags |= kThreadSuspended; 
+ if (thread->options & TH_OPT_GLOBAL_FORCED_IDLE) + cur_thread_snap->tds_ss_flags |= kGlobalForcedIdle; + if (thread_on_core) + cur_thread_snap->tds_ss_flags |= kThreadOnCore; + if (stackshot_thread_is_idle_worker_unsafe(thread)) + cur_thread_snap->tds_ss_flags |= kThreadIdleWorker; + + cur_thread_snap->tds_last_made_runnable_time = thread->last_made_runnable_time; + cur_thread_snap->tds_state = thread->state; + cur_thread_snap->tds_sched_flags = thread->sched_flags; + cur_thread_snap->tds_base_priority = thread->base_pri; + cur_thread_snap->tds_sched_priority = thread->sched_pri; + cur_thread_snap->tds_eqos = thread->effective_policy.thep_qos; + cur_thread_snap->tds_rqos = thread->requested_policy.thrp_qos; + cur_thread_snap->tds_rqos_override = thread->requested_policy.thrp_qos_override; + cur_thread_snap->tds_io_tier = proc_get_effective_thread_policy(thread, TASK_POLICY_IO); + + return 0; +} + +/* + * Why 12? 12 strikes a decent balance between allocating a large array on + * the stack and having large kcdata item overheads for recording nonrunable + * tasks. 
+ */ +#define UNIQUEIDSPERFLUSH 12 + +struct saved_uniqueids { + uint64_t ids[UNIQUEIDSPERFLUSH]; + unsigned count; +}; + +static kern_return_t +flush_nonrunnable_tasks(struct saved_uniqueids * ids) +{ + if (ids->count == 0) + return KERN_SUCCESS; + mach_vm_address_t out_addr = 0; + kern_return_t ret = kcdata_get_memory_addr_for_array(stackshot_kcdata_p, STACKSHOT_KCTYPE_NONRUNNABLE_TASKS, sizeof(uint64_t), + ids->count, &out_addr); + if (ret != KERN_SUCCESS) { + return ret; + } + stackshot_memcpy((void *)out_addr, ids->ids, sizeof(uint64_t) * ids->count); + ids->count = 0; + return ret; +} + +static kern_return_t +handle_nonrunnable_task(struct saved_uniqueids * ids, uint64_t pid) +{ + kern_return_t ret = KERN_SUCCESS; + ids->ids[ids->count] = pid; + ids->count++; + assert(ids->count <= UNIQUEIDSPERFLUSH); + if (ids->count == UNIQUEIDSPERFLUSH) + ret = flush_nonrunnable_tasks(ids); + return ret; +} + +enum thread_classification { + tc_full_snapshot, /* take a full snapshot */ + tc_delta_snapshot, /* take a delta snapshot */ + tc_nonrunnable, /* only report id */ +}; + +static enum thread_classification +classify_thread(thread_t thread, boolean_t * thread_on_core_p, uint32_t trace_flags) { - char *tracepos = (char *) tracebuf; - char *tracebound = tracepos + tracebuf_size; - uint32_t tracebytes = 0; - int error = 0, i; + boolean_t collect_delta_stackshot = ((trace_flags & STACKSHOT_COLLECT_DELTA_SNAPSHOT) != 0); + boolean_t minimize_nonrunnables = ((trace_flags & STACKSHOT_TAILSPIN) != 0); + processor_t last_processor = thread->last_processor; + + boolean_t thread_on_core = + (last_processor != PROCESSOR_NULL && last_processor->state == PROCESSOR_RUNNING && last_processor->active_thread == thread); + + *thread_on_core_p = thread_on_core; + + /* Capture the full thread snapshot if this is not a delta stackshot or if the thread has run subsequent to the + * previous full stackshot */ + if (!collect_delta_stackshot || thread_on_core || (thread->last_run_time > 
stack_snapshot_delta_since_timestamp)) { + return tc_full_snapshot; + } else { + if (minimize_nonrunnables && !(thread->state & TH_RUN)) { + return tc_nonrunnable; + } else { + return tc_delta_snapshot; + } + } +} + +static kern_return_t +kdp_stackshot_kcdata_format(int pid, uint32_t trace_flags, uint32_t * pBytesTraced) +{ + kern_return_t error = KERN_SUCCESS; + mach_vm_address_t out_addr = 0; + uint64_t abs_time = 0, abs_time_end = 0; + uint64_t *abs_time_addr = NULL; + uint64_t system_state_flags = 0; + int saved_count = 0; task_t task = TASK_NULL; thread_t thread = THREAD_NULL; - unsigned framesize = 2 * sizeof(vm_offset_t); + mach_timebase_info_data_t timebase = {0, 0}; + uint32_t length_to_copy = 0, tmp32 = 0; - queue_head_t *task_list = &tasks; - boolean_t is_active_list = TRUE; - - boolean_t dispatch_p = ((trace_flags & STACKSHOT_GET_DQ) != 0); - boolean_t save_loadinfo_p = ((trace_flags & STACKSHOT_SAVE_LOADINFO) != 0); - boolean_t save_kextloadinfo_p = ((trace_flags & STACKSHOT_SAVE_KEXT_LOADINFO) != 0); - boolean_t save_userframes_p = ((trace_flags & STACKSHOT_SAVE_KERNEL_FRAMES_ONLY) == 0); - boolean_t save_donating_pids_p = ((trace_flags & STACKSHOT_SAVE_IMP_DONATION_PIDS) != 0); - - if(trace_flags & STACKSHOT_GET_GLOBAL_MEM_STATS) { - if(tracepos + sizeof(struct mem_and_io_snapshot) > tracebound) { - error = -1; - goto error_exit; - } - kdp_mem_and_io_snapshot((struct mem_and_io_snapshot *)tracepos); - tracepos += sizeof(struct mem_and_io_snapshot); + abs_time = mach_absolute_time(); + + /* process the flags */ + boolean_t active_kthreads_only_p = ((trace_flags & STACKSHOT_ACTIVE_KERNEL_THREADS_ONLY) != 0); + boolean_t save_donating_pids_p = ((trace_flags & STACKSHOT_SAVE_IMP_DONATION_PIDS) != 0); + boolean_t collect_delta_stackshot = ((trace_flags & STACKSHOT_COLLECT_DELTA_SNAPSHOT) != 0); + boolean_t minimize_nonrunnables = ((trace_flags & STACKSHOT_TAILSPIN) != 0); + boolean_t use_fault_path = ((trace_flags & (STACKSHOT_ENABLE_UUID_FAULTING | 
STACKSHOT_ENABLE_BT_FAULTING)) != 0); + + stack_enable_faulting = (trace_flags & (STACKSHOT_ENABLE_BT_FAULTING)); + + + struct saved_uniqueids saved_uniqueids = {.count = 0}; + + if (use_fault_path) { + fault_stats.sfs_pages_faulted_in = 0; + fault_stats.sfs_time_spent_faulting = 0; + fault_stats.sfs_stopped_faulting = (uint8_t) FALSE; + } + + if (sizeof(void *) == 8) + system_state_flags |= kKernel64_p; + + if (stackshot_kcdata_p == NULL || pBytesTraced == NULL) { + error = KERN_INVALID_ARGUMENT; + goto error_exit; + } + + /* setup mach_absolute_time and timebase info -- copy out in some cases and needed to convert since_timestamp to seconds for proc start time */ + clock_timebase_info(&timebase); + + /* begin saving data into the buffer */ + *pBytesTraced = 0; + kcd_exit_on_error(kcdata_add_uint32_with_description(stackshot_kcdata_p, trace_flags, "stackshot_in_flags")); + kcd_exit_on_error(kcdata_add_uint32_with_description(stackshot_kcdata_p, (uint32_t)pid, "stackshot_in_pid")); + kcd_exit_on_error(kcdata_add_uint64_with_description(stackshot_kcdata_p, system_state_flags, "system_state_flags")); + +#if CONFIG_JETSAM + tmp32 = memorystatus_get_pressure_status_kdp(); + kcd_exit_on_error(kcdata_get_memory_addr(stackshot_kcdata_p, STACKSHOT_KCTYPE_JETSAM_LEVEL, sizeof(uint32_t), &out_addr)); + stackshot_memcpy((void *)out_addr, &tmp32, sizeof(tmp32)); +#endif + + if (!collect_delta_stackshot) { + tmp32 = PAGE_SIZE; + kcd_exit_on_error(kcdata_get_memory_addr(stackshot_kcdata_p, STACKSHOT_KCTYPE_KERN_PAGE_SIZE, sizeof(uint32_t), &out_addr)); + stackshot_memcpy((void *)out_addr, &tmp32, sizeof(tmp32)); + + /* save boot-args and osversion string */ + length_to_copy = MIN((uint32_t)(strlen(version) + 1), OSVERSIZE); + kcd_exit_on_error(kcdata_get_memory_addr(stackshot_kcdata_p, STACKSHOT_KCTYPE_OSVERSION, length_to_copy, &out_addr)); + stackshot_strlcpy((char*)out_addr, &version[0], length_to_copy); + + length_to_copy = MIN((uint32_t)(strlen(PE_boot_args()) + 1), 
OSVERSIZE); + kcd_exit_on_error(kcdata_get_memory_addr(stackshot_kcdata_p, STACKSHOT_KCTYPE_BOOTARGS, length_to_copy, &out_addr)); + stackshot_strlcpy((char*)out_addr, PE_boot_args(), length_to_copy); + + kcd_exit_on_error(kcdata_get_memory_addr(stackshot_kcdata_p, KCDATA_TYPE_TIMEBASE, sizeof(timebase), &out_addr)); + stackshot_memcpy((void *)out_addr, &timebase, sizeof(timebase)); + } else { + kcd_exit_on_error(kcdata_get_memory_addr(stackshot_kcdata_p, STACKSHOT_KCTYPE_DELTA_SINCE_TIMESTAMP, sizeof(uint64_t), &out_addr)); + stackshot_memcpy((void*)out_addr, &stack_snapshot_delta_since_timestamp, sizeof(stack_snapshot_delta_since_timestamp)); + } + + kcd_exit_on_error(kcdata_get_memory_addr(stackshot_kcdata_p, KCDATA_TYPE_MACH_ABSOLUTE_TIME, sizeof(uint64_t), &out_addr)); + abs_time_addr = (uint64_t *)out_addr; + stackshot_memcpy((void *)abs_time_addr, &abs_time, sizeof(uint64_t)); + + kcd_exit_on_error(kcdata_get_memory_addr(stackshot_kcdata_p, KCDATA_TYPE_USECS_SINCE_EPOCH, sizeof(uint64_t), &out_addr)); + stackshot_memcpy((void *)out_addr, &stackshot_microsecs, sizeof(uint64_t)); + + /* reserve space of system level shared cache load info */ + struct dyld_uuid_info_64_v2 * sys_shared_cache_loadinfo = NULL; + if (!collect_delta_stackshot) { + kcd_exit_on_error(kcdata_get_memory_addr(stackshot_kcdata_p, STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO, + sizeof(struct dyld_uuid_info_64_v2), &out_addr)); + sys_shared_cache_loadinfo = (struct dyld_uuid_info_64_v2 *)out_addr; + bzero((void *)sys_shared_cache_loadinfo, sizeof(struct dyld_uuid_info_64_v2)); } - -walk_list: + /* Add requested information first */ + if (trace_flags & STACKSHOT_GET_GLOBAL_MEM_STATS) { + kcd_exit_on_error(kcdata_get_memory_addr(stackshot_kcdata_p, STACKSHOT_KCTYPE_GLOBAL_MEM_STATS, sizeof(struct mem_and_io_snapshot), &out_addr)); + kdp_mem_and_io_snapshot((struct mem_and_io_snapshot *)out_addr); + } + + /* Iterate over tasks */ + queue_head_t *task_list = &tasks; queue_iterate(task_list, task, 
task_t, tasks) { - if ((task == NULL) || !ml_validate_nofault((vm_offset_t) task, sizeof(struct task))) + int task_pid = 0; + uint64_t task_uniqueid = 0; + int num_delta_thread_snapshots = 0; + int num_nonrunnable_threads = 0; + uint64_t task_start_abstime = 0; + boolean_t task_delta_stackshot = FALSE; + boolean_t task64 = FALSE, have_map = FALSE, have_pmap = FALSE; + boolean_t some_thread_ran = FALSE; + uint64_t *task_snap_ss_flags = NULL; + + if ((task == NULL) || !ml_validate_nofault((vm_offset_t)task, sizeof(struct task))) { + error = KERN_FAILURE; goto error_exit; + } + + have_map = (task->map != NULL) && (ml_validate_nofault((vm_offset_t)(task->map), sizeof(struct _vm_map))); + have_pmap = have_map && (task->map->pmap != NULL) && (ml_validate_nofault((vm_offset_t)(task->map->pmap), sizeof(struct pmap))); - int task_pid = pid_from_task(task); - uint64_t task_uniqueid = proc_uniqueid_from_task(task); - boolean_t task64 = task_has_64BitAddr(task); + task_pid = pid_from_task(task); + task_uniqueid = get_task_uniqueid(task); + task64 = task_has_64BitAddr(task); if (!task->active || task_is_a_corpse(task)) { /* @@ -1436,408 +1587,218 @@ kdp_stackshot(int pid, void *tracebuf, uint32_t tracebuf_size, uint32_t trace_fl } } + if (collect_delta_stackshot) { + proc_starttime_kdp(task->bsd_info, NULL, NULL, &task_start_abstime); + } + /* Trace everything, unless a process was specified */ if ((pid == -1) || (pid == task_pid)) { - task_snapshot_t task_snap; - thread_snapshot_t tsnap = NULL; - uint32_t uuid_info_count = 0; - mach_vm_address_t uuid_info_addr = 0; - boolean_t have_map = (task->map != NULL) && - (ml_validate_nofault((vm_offset_t)(task->map), sizeof(struct _vm_map))); - boolean_t have_pmap = have_map && (task->map->pmap != NULL) && - (ml_validate_nofault((vm_offset_t)(task->map->pmap), sizeof(struct pmap))); - uint64_t shared_cache_base_address = 0; - - if (have_pmap && task->active && save_loadinfo_p && task_pid > 0) { - // Read the dyld_all_image_infos struct 
from the task memory to get UUID array count and location - if (task64) { - struct user64_dyld_all_image_infos task_image_infos; - if (kdp_copyin(task->map->pmap, task->all_image_info_addr, &task_image_infos, sizeof(struct user64_dyld_all_image_infos))) { - uuid_info_count = (uint32_t)task_image_infos.uuidArrayCount; - uuid_info_addr = task_image_infos.uuidArray; - } - } else { - struct user32_dyld_all_image_infos task_image_infos; - if (kdp_copyin(task->map->pmap, task->all_image_info_addr, &task_image_infos, sizeof(struct user32_dyld_all_image_infos))) { - uuid_info_count = task_image_infos.uuidArrayCount; - uuid_info_addr = task_image_infos.uuidArray; - } - } +#if DEBUG || DEVELOPMENT + /* we might want to call kcdata_undo_add_container_begin(), which is + * only safe if we call it after kcdata_add_container_marker() but + * before adding any other kcdata items. In development kernels, + * we'll remember where the buffer end was and confirm after calling + * kcdata_undo_add_container_begin() that it's in exactly the same + * place.*/ + mach_vm_address_t revert_addr = stackshot_kcdata_p->kcd_addr_end; +#endif - // If we get a NULL uuid_info_addr (which can happen when we catch dyld in the middle of updating - // this data structure), we zero the uuid_info_count so that we won't even try to save load info - // for this task. 
- if (!uuid_info_addr) { - uuid_info_count = 0; - } - } + /* add task snapshot marker */ + kcd_exit_on_error(kcdata_add_container_marker(stackshot_kcdata_p, KCDATA_TYPE_CONTAINER_BEGIN, + STACKSHOT_KCCONTAINER_TASK, task_uniqueid)); - if (have_pmap && task_pid == 0) { - if (save_kextloadinfo_p && ml_validate_nofault((vm_offset_t)(gLoadedKextSummaries), sizeof(OSKextLoadedKextSummaryHeader))) { - uuid_info_count = gLoadedKextSummaries->numSummaries + 1; /* include main kernel UUID */ - }else { - uuid_info_count = 1; /* atleast include kernel uuid */ + if (!collect_delta_stackshot || (task_start_abstime == 0) || + (task_start_abstime > stack_snapshot_delta_since_timestamp)) { + kcd_exit_on_error(kcdata_record_task_snapshot(stackshot_kcdata_p, task, trace_flags, have_pmap, &task_snap_ss_flags)); + } else { + task_delta_stackshot = TRUE; + if (minimize_nonrunnables) { + // delay taking the task snapshot. If there are no runnable threads we'll skip it. + } else { + kcd_exit_on_error(kcdata_record_task_delta_snapshot(stackshot_kcdata_p, task, have_pmap, &task_snap_ss_flags)); } } - if (tracepos + sizeof(struct task_snapshot) > tracebound) { - error = -1; - goto error_exit; - } - - task_snap = (task_snapshot_t) tracepos; - task_snap->snapshot_magic = STACKSHOT_TASK_SNAPSHOT_MAGIC; - task_snap->pid = task_pid; - task_snap->uniqueid = task_uniqueid; - task_snap->nloadinfos = uuid_info_count; - task_snap->donating_pid_count = 0; - - /* Add the BSD process identifiers */ - if (task_pid != -1) - proc_name_kdp(task, task_snap->p_comm, sizeof(task_snap->p_comm)); - else - task_snap->p_comm[0] = '\0'; - task_snap->ss_flags = 0; - if (task64) - task_snap->ss_flags |= kUser64_p; - if (task64 && task_pid == 0) - task_snap->ss_flags |= kKernel64_p; - if (!task->active || task_is_a_corpse(task)) - task_snap->ss_flags |= kTerminatedSnapshot; - if(task->pidsuspended) task_snap->ss_flags |= kPidSuspended; - if(task->frozen) task_snap->ss_flags |= kFrozen; - - if 
(task->effective_policy.darwinbg == 1) { - task_snap->ss_flags |= kTaskDarwinBG; - } - - if (task->requested_policy.t_role == TASK_FOREGROUND_APPLICATION) { - task_snap->ss_flags |= kTaskIsForeground; - } + /* Iterate over task threads */ + queue_iterate(&task->threads, thread, thread_t, task_threads) + { + uint64_t thread_uniqueid; - if (task->requested_policy.t_boosted == 1) { - task_snap->ss_flags |= kTaskIsBoosted; - } + if ((thread == NULL) || !ml_validate_nofault((vm_offset_t)thread, sizeof(struct thread))) { + error = KERN_FAILURE; + goto error_exit; + } - if (task->effective_policy.t_sup_active == 1) - task_snap->ss_flags |= kTaskIsSuppressed; -#if IMPORTANCE_INHERITANCE - if (task->task_imp_base) { - if (task->task_imp_base->iit_donor) { - task_snap->ss_flags |= kTaskIsImpDonor; -} + if (active_kthreads_only_p && thread->kernel_stack == 0) + continue; - if (task->task_imp_base->iit_live_donor) { - task_snap->ss_flags |= kTaskIsLiveImpDonor; - } - } -#endif + thread_uniqueid = thread_tid(thread); - task_snap->latency_qos = (task->effective_policy.t_latency_qos == LATENCY_QOS_TIER_UNSPECIFIED) ? - LATENCY_QOS_TIER_UNSPECIFIED : ((0xFF << 16) | task->effective_policy.t_latency_qos); + boolean_t thread_on_core; + enum thread_classification thread_classification = classify_thread(thread, &thread_on_core, trace_flags); - task_snap->suspend_count = task->suspend_count; - task_snap->task_size = have_pmap ? 
pmap_resident_count(task->map->pmap) : 0; - task_snap->faults = task->faults; - task_snap->pageins = task->pageins; - task_snap->cow_faults = task->cow_faults; + switch (thread_classification) { + case tc_full_snapshot: + /* add thread marker */ + kcd_exit_on_error(kcdata_add_container_marker(stackshot_kcdata_p, KCDATA_TYPE_CONTAINER_BEGIN, + STACKSHOT_KCCONTAINER_THREAD, thread_uniqueid)); + kcd_exit_on_error( + kcdata_record_thread_snapshot(stackshot_kcdata_p, thread, task, trace_flags, have_pmap, thread_on_core)); - task_snap->user_time_in_terminated_threads = task->total_user_time; - task_snap->system_time_in_terminated_threads = task->total_system_time; - /* - * The throttling counters are maintained as 64-bit counters in the proc - * structure. However, we reserve 32-bits (each) for them in the task_snapshot - * struct to save space and since we do not expect them to overflow 32-bits. If we - * find these values overflowing in the future, the fix would be to simply - * upgrade these counters to 64-bit in the task_snapshot struct - */ - task_snap->was_throttled = (uint32_t) proc_was_throttled_from_task(task); - task_snap->did_throttle = (uint32_t) proc_did_throttle_from_task(task); + /* mark end of thread snapshot data */ + kcd_exit_on_error(kcdata_add_container_marker(stackshot_kcdata_p, KCDATA_TYPE_CONTAINER_END, + STACKSHOT_KCCONTAINER_THREAD, thread_uniqueid)); - /* fetch some useful BSD info: */ - task_snap->p_start_sec = task_snap->p_start_usec = 0; - proc_starttime_kdp(task->bsd_info, &task_snap->p_start_sec, &task_snap->p_start_usec); - if (task->shared_region && ml_validate_nofault((vm_offset_t)task->shared_region, - sizeof(struct vm_shared_region))) { - struct vm_shared_region *sr = task->shared_region; + some_thread_ran = TRUE; + break; - shared_cache_base_address = sr->sr_base_address + sr->sr_first_mapping; - } - if (!shared_cache_base_address - || !kdp_copyin(task->map->pmap, shared_cache_base_address + offsetof(struct _dyld_cache_header, uuid), 
task_snap->shared_cache_identifier, sizeof(task_snap->shared_cache_identifier))) { - memset(task_snap->shared_cache_identifier, 0x0, sizeof(task_snap->shared_cache_identifier)); - } - if (task->shared_region) { - /* - * No refcounting here, but we are in debugger - * context, so that should be safe. - */ - task_snap->shared_cache_slide = task->shared_region->sr_slide_info.slide; - } else { - task_snap->shared_cache_slide = 0; - } + case tc_delta_snapshot: + num_delta_thread_snapshots++; + break; - /* I/O Statistics */ - assert(IO_NUM_PRIORITIES == STACKSHOT_IO_NUM_PRIORITIES); - - if (task->task_io_stats) { - task_snap->disk_reads_count = task->task_io_stats->disk_reads.count; - task_snap->disk_reads_size = task->task_io_stats->disk_reads.size; - task_snap->disk_writes_count = (task->task_io_stats->total_io.count - task->task_io_stats->disk_reads.count); - task_snap->disk_writes_size = (task->task_io_stats->total_io.size - task->task_io_stats->disk_reads.size); - for(i = 0; i < IO_NUM_PRIORITIES; i++) { - task_snap->io_priority_count[i] = task->task_io_stats->io_priority[i].count; - task_snap->io_priority_size[i] = task->task_io_stats->io_priority[i].size; + case tc_nonrunnable: + num_nonrunnable_threads++; + break; } - task_snap->paging_count = task->task_io_stats->paging.count; - task_snap->paging_size = task->task_io_stats->paging.size; - task_snap->non_paging_count = (task->task_io_stats->total_io.count - task->task_io_stats->paging.count); - task_snap->non_paging_size = (task->task_io_stats->total_io.size - task->task_io_stats->paging.size); - task_snap->metadata_count = task->task_io_stats->metadata.count; - task_snap->metadata_size = task->task_io_stats->metadata.size; - task_snap->data_count = (task->task_io_stats->total_io.count - task->task_io_stats->metadata.count); - task_snap->data_size = (task->task_io_stats->total_io.size - task->task_io_stats->metadata.size); - } else { - /* zero from disk_reads_count to end of structure */ - 
memset(&task_snap->disk_reads_count, 0, offsetof(struct task_snapshot, metadata_size) - offsetof(struct task_snapshot, disk_reads_count)); } - tracepos += sizeof(struct task_snapshot); - - if (task_pid > 0 && uuid_info_count > 0) { - uint32_t uuid_info_size = (uint32_t)(task64 ? sizeof(struct user64_dyld_uuid_info) : sizeof(struct user32_dyld_uuid_info)); - uint32_t uuid_info_array_size = uuid_info_count * uuid_info_size; - if (tracepos + uuid_info_array_size > tracebound) { - error = -1; - goto error_exit; - } + if (task_delta_stackshot && minimize_nonrunnables) { + if (some_thread_ran || num_delta_thread_snapshots > 0) { + kcd_exit_on_error(kcdata_record_task_delta_snapshot(stackshot_kcdata_p, task, have_pmap, &task_snap_ss_flags)); + } else { + kcd_exit_on_error(kcdata_undo_add_container_begin(stackshot_kcdata_p)); - // Copy in the UUID info array - // It may be nonresident, in which case just fix up nloadinfos to 0 in the task_snap - if (have_pmap && !kdp_copyin(task->map->pmap, uuid_info_addr, tracepos, uuid_info_array_size)) - task_snap->nloadinfos = 0; - else - tracepos += uuid_info_array_size; - } else if (task_pid == 0 && uuid_info_count > 0) { - uint32_t uuid_info_size = (uint32_t)sizeof(kernel_uuid_info); - uint32_t uuid_info_array_size = uuid_info_count * uuid_info_size; - uint32_t uuid_offset = offsetof(kernel_uuid_info, imageUUID); - uintptr_t image_load_address; - - if (tracepos + uuid_info_array_size > tracebound) { - error = -1; - goto error_exit; +#if DEBUG || DEVELOPMENT + mach_vm_address_t undo_addr = stackshot_kcdata_p->kcd_addr_end; + if (revert_addr != undo_addr) { + panic("tried to revert a container begin but we already moved past it. 
revert=%p undo=%p", + (void *)revert_addr, (void *)undo_addr); + } +#endif + kcd_exit_on_error(handle_nonrunnable_task(&saved_uniqueids, task_uniqueid)); + continue; } + } - do { + struct thread_delta_snapshot_v2 * delta_snapshots = NULL; + int current_delta_snapshot_index = 0; - if (!kernel_uuid || !ml_validate_nofault((vm_offset_t)kernel_uuid, sizeof(uuid_t))) { - /* Kernel UUID not found or inaccessible */ - task_snap->nloadinfos = 0; - break; - } - image_load_address = (uintptr_t)VM_KERNEL_UNSLIDE(vm_kernel_stext); - memcpy(tracepos, &image_load_address, sizeof(uintptr_t)); - memcpy((tracepos + uuid_offset), kernel_uuid, sizeof(uuid_t)); - tracepos += uuid_info_size; - - if (save_kextloadinfo_p && ml_validate_nofault((vm_offset_t)(&gLoadedKextSummaries->summaries[0]), - gLoadedKextSummaries->entry_size * gLoadedKextSummaries->numSummaries)) { - uint32_t kexti; - for (kexti=0 ; kexti < gLoadedKextSummaries->numSummaries; kexti++) { - image_load_address = (uintptr_t)VM_KERNEL_UNSLIDE(gLoadedKextSummaries->summaries[kexti].address); - memcpy(tracepos, &image_load_address, sizeof(uintptr_t)); - memcpy((tracepos + uuid_offset), &gLoadedKextSummaries->summaries[kexti].uuid, sizeof(uuid_t)); - tracepos += uuid_info_size; - } - } else { - /* kext summary invalid, but kernel UUID was copied */ - task_snap->nloadinfos = 1; - break; - } - } while(0); + if (num_delta_thread_snapshots > 0) { + kcd_exit_on_error(kcdata_get_memory_addr_for_array(stackshot_kcdata_p, STACKSHOT_KCTYPE_THREAD_DELTA_SNAPSHOT, + sizeof(struct thread_delta_snapshot_v2), + num_delta_thread_snapshots, &out_addr)); + delta_snapshots = (struct thread_delta_snapshot_v2 *)out_addr; } - - if (save_donating_pids_p) { - if (tracepos + (TASK_IMP_WALK_LIMIT * sizeof(int32_t)) > tracebound) { - error = -1; - goto error_exit; - } - task_snap->donating_pid_count = task_importance_list_pids(task, TASK_IMP_LIST_DONATING_PIDS, tracepos, TASK_IMP_WALK_LIMIT); - tracepos += sizeof(int) * task_snap->donating_pid_count; 
+ uint64_t * nonrunnable_tids = NULL; + int current_nonrunnable_index = 0; + + if (num_nonrunnable_threads > 0) { + kcd_exit_on_error(kcdata_get_memory_addr_for_array(stackshot_kcdata_p, STACKSHOT_KCTYPE_NONRUNNABLE_TIDS, + sizeof(uint64_t), num_nonrunnable_threads, &out_addr)); + nonrunnable_tids = (uint64_t *)out_addr; } - queue_iterate(&task->threads, thread, thread_t, task_threads){ - uint64_t tval; + if (num_delta_thread_snapshots > 0 || num_nonrunnable_threads > 0) { + queue_iterate(&task->threads, thread, thread_t, task_threads) + { + if (active_kthreads_only_p && thread->kernel_stack == 0) + continue; - if ((thread == NULL) || !ml_validate_nofault((vm_offset_t) thread, sizeof(struct thread))) - goto error_exit; + boolean_t thread_on_core; + enum thread_classification thread_classification = classify_thread(thread, &thread_on_core, trace_flags); - if (((tracepos + 4 * sizeof(struct thread_snapshot)) > tracebound)) { - error = -1; - goto error_exit; - } - if (!save_userframes_p && thread->kernel_stack == 0) - continue; - - /* Populate the thread snapshot header */ - tsnap = (thread_snapshot_t) tracepos; - tsnap->thread_id = thread_tid(thread); - tsnap->state = thread->state; - tsnap->priority = thread->base_pri; - tsnap->sched_pri = thread->sched_pri; - tsnap->sched_flags = thread->sched_flags; - tsnap->wait_event = VM_KERNEL_UNSLIDE_OR_PERM(thread->wait_event); - tsnap->continuation = VM_KERNEL_UNSLIDE(thread->continuation); - tval = safe_grab_timer_value(&thread->user_timer); - tsnap->user_time = tval; - tval = safe_grab_timer_value(&thread->system_timer); - if (thread->precise_user_kernel_time) { - tsnap->system_time = tval; - } else { - tsnap->user_time += tval; - tsnap->system_time = 0; - } - tsnap->snapshot_magic = STACKSHOT_THREAD_SNAPSHOT_MAGIC; - bzero(&tsnap->pth_name, STACKSHOT_MAX_THREAD_NAME_SIZE); - proc_threadname_kdp(thread->uthread, &tsnap->pth_name[0], STACKSHOT_MAX_THREAD_NAME_SIZE); - tracepos += sizeof(struct thread_snapshot); - 
tsnap->ss_flags = 0; - /* I/O Statistics */ - assert(IO_NUM_PRIORITIES == STACKSHOT_IO_NUM_PRIORITIES); - if (thread->thread_io_stats) { - tsnap->disk_reads_count = thread->thread_io_stats->disk_reads.count; - tsnap->disk_reads_size = thread->thread_io_stats->disk_reads.size; - tsnap->disk_writes_count = (thread->thread_io_stats->total_io.count - thread->thread_io_stats->disk_reads.count); - tsnap->disk_writes_size = (thread->thread_io_stats->total_io.size - thread->thread_io_stats->disk_reads.size); - for(i = 0; i < IO_NUM_PRIORITIES; i++) { - tsnap->io_priority_count[i] = thread->thread_io_stats->io_priority[i].count; - tsnap->io_priority_size[i] = thread->thread_io_stats->io_priority[i].size; - } - tsnap->paging_count = thread->thread_io_stats->paging.count; - tsnap->paging_size = thread->thread_io_stats->paging.size; - tsnap->non_paging_count = (thread->thread_io_stats->total_io.count - thread->thread_io_stats->paging.count); - tsnap->non_paging_size = (thread->thread_io_stats->total_io.size - thread->thread_io_stats->paging.size); - tsnap->metadata_count = thread->thread_io_stats->metadata.count; - tsnap->metadata_size = thread->thread_io_stats->metadata.size; - tsnap->data_count = (thread->thread_io_stats->total_io.count - thread->thread_io_stats->metadata.count); - tsnap->data_size = (thread->thread_io_stats->total_io.size - thread->thread_io_stats->metadata.size); - } else { - /* zero from disk_reads_count to end of structure */ - memset(&tsnap->disk_reads_count, 0, - offsetof(struct thread_snapshot, metadata_size) - offsetof(struct thread_snapshot, disk_reads_count)); - } + switch (thread_classification) { + case tc_full_snapshot: + /* full thread snapshot captured above */ + continue; - if (thread->effective_policy.darwinbg) { - tsnap->ss_flags |= kThreadDarwinBG; - } - - tsnap->io_tier = proc_get_effective_thread_policy(thread, TASK_POLICY_IO); - if (proc_get_effective_thread_policy(thread, TASK_POLICY_PASSIVE_IO)) { - tsnap->ss_flags |= 
kThreadIOPassive; - } - - if (thread->suspend_count > 0) { - tsnap->ss_flags |= kThreadSuspended; - } + case tc_delta_snapshot: + kcd_exit_on_error(kcdata_record_thread_delta_snapshot(&delta_snapshots[current_delta_snapshot_index++], + thread, thread_on_core)); + break; - if (thread->options & TH_OPT_GLOBAL_FORCED_IDLE) { - tsnap->ss_flags |= kGlobalForcedIdle; + case tc_nonrunnable: + nonrunnable_tids[current_nonrunnable_index++] = thread_tid(thread); + continue; + } } - if (IPC_VOUCHER_NULL != thread->ith_voucher) { - tsnap->voucher_identifier = VM_KERNEL_ADDRPERM(thread->ith_voucher); +#if DEBUG || DEVELOPMENT + if (current_delta_snapshot_index != num_delta_thread_snapshots) { + panic("delta thread snapshot count mismatch while capturing snapshots for task %p. expected %d, found %d", task, + num_delta_thread_snapshots, current_delta_snapshot_index); } - - tsnap->ts_qos = thread->effective_policy.thep_qos; - tsnap->ts_rqos = thread->requested_policy.thrp_qos; - tsnap->ts_rqos_override = thread->requested_policy.thrp_qos_override; - /* zero out unused data. */ - tsnap->_reserved[0] = 0; - tsnap->_reserved[1] = 0; - tsnap->_reserved[2] = 0; - tsnap->total_syscalls = thread->syscalls_mach + thread->syscalls_unix; - - if (dispatch_p && (task != kernel_task) && (task->active) && have_pmap) { - uint64_t dqkeyaddr = thread_dispatchqaddr(thread); - if (dqkeyaddr != 0) { - uint64_t dqaddr = 0; - if (kdp_copyin(task->map->pmap, dqkeyaddr, &dqaddr, (task64 ? 8 : 4)) && (dqaddr != 0)) { - uint64_t dqserialnumaddr = dqaddr + proc_dispatchqueue_serialno_offset_from_task(task); - uint64_t dqserialnum = 0; - if (kdp_copyin(task->map->pmap, dqserialnumaddr, &dqserialnum, (task64 ? 8 : 4))) { - tsnap->ss_flags |= kHasDispatchSerial; - memcpy(tracepos, &dqserialnum, sizeof(dqserialnum)); - tracepos += 8; - } - } - } + if (current_nonrunnable_index != num_nonrunnable_threads) { + panic("delta thread snapshot count mismatch while capturing snapshots for task %p. 
expected %d, found %d", task, + num_nonrunnable_threads, current_nonrunnable_index); } -/* Call through to the machine specific trace routines - * Frames are added past the snapshot header. - */ - tracebytes = 0; - if (thread->kernel_stack != 0) { - uint32_t thread_snapshot_flags = 0; -#if defined(__LP64__) - tracebytes = machine_trace_thread64(thread, tracepos, tracebound, MAX_FRAMES, FALSE, &thread_snapshot_flags); - tsnap->ss_flags |= kKernel64_p; - framesize = 16; -#else - tracebytes = machine_trace_thread(thread, tracepos, tracebound, MAX_FRAMES, FALSE, &thread_snapshot_flags); - framesize = 8; #endif - if (thread_snapshot_flags != 0) { - tsnap->ss_flags |= thread_snapshot_flags; - } - } - tsnap->nkern_frames = tracebytes/framesize; - tracepos += tracebytes; - tracebytes = 0; - /* Trace user stack, if any */ - if (save_userframes_p && task->active && thread->task->map != kernel_map) { - uint32_t thread_snapshot_flags = 0; - /* 64-bit task? */ - if (task_has_64BitAddr(thread->task)) { - tracebytes = machine_trace_thread64(thread, tracepos, tracebound, MAX_FRAMES, TRUE, &thread_snapshot_flags); - tsnap->ss_flags |= kUser64_p; - framesize = 16; - } - else { - tracebytes = machine_trace_thread(thread, tracepos, tracebound, MAX_FRAMES, TRUE, &thread_snapshot_flags); - framesize = 8; - } - if (thread_snapshot_flags != 0) { - tsnap->ss_flags |= thread_snapshot_flags; - } - } - tsnap->nuser_frames = tracebytes/framesize; - tracepos += tracebytes; - tracebytes = 0; } - if (!save_userframes_p && tsnap == NULL) { - /* - * No thread info is collected due to lack of kernel frames. - * Remove information about this task also - */ - tracepos = (char *)task_snap; - } +#if IMPORTANCE_INHERITANCE + if (save_donating_pids_p) { + kcd_exit_on_error( + ((((mach_vm_address_t)kcd_end_address(stackshot_kcdata_p) + (TASK_IMP_WALK_LIMIT * sizeof(int32_t))) < + (mach_vm_address_t)kcd_max_address(stackshot_kcdata_p)) + ? 
KERN_SUCCESS + : KERN_RESOURCE_SHORTAGE)); + saved_count = task_importance_list_pids(task, TASK_IMP_LIST_DONATING_PIDS, + (void *)kcd_end_address(stackshot_kcdata_p), TASK_IMP_WALK_LIMIT); + if (saved_count > 0) + kcd_exit_on_error(kcdata_get_memory_addr_for_array(stackshot_kcdata_p, STACKSHOT_KCTYPE_DONATING_PIDS, + sizeof(int32_t), saved_count, &out_addr)); + } +#endif + + if (!collect_delta_stackshot || (num_delta_thread_snapshots != task->thread_count) || !task_delta_stackshot) { + /* + * Collect shared cache info and UUID info in these scenarios + * 1) a full stackshot + * 2) a delta stackshot where the task started after the previous full stackshot OR + * any thread from the task has run since the previous full stackshot + */ + + kcd_exit_on_error(kcdata_record_shared_cache_info(stackshot_kcdata_p, task, sys_shared_cache_loadinfo, trace_flags, task_snap_ss_flags)); + kcd_exit_on_error(kcdata_record_uuid_info(stackshot_kcdata_p, task, trace_flags, have_pmap, task_snap_ss_flags)); + } + /* mark end of task snapshot data */ + kcd_exit_on_error(kcdata_add_container_marker(stackshot_kcdata_p, KCDATA_TYPE_CONTAINER_END, STACKSHOT_KCCONTAINER_TASK, + task_uniqueid)); } } - if (is_active_list) { - is_active_list = FALSE; - task_list = &terminated_tasks; - goto walk_list; + if (minimize_nonrunnables) { + flush_nonrunnable_tasks(&saved_uniqueids); + } + + if (use_fault_path) { + kcd_exit_on_error(kcdata_get_memory_addr(stackshot_kcdata_p, STACKSHOT_KCTYPE_STACKSHOT_FAULT_STATS, + sizeof(struct stackshot_fault_stats), &out_addr)); + stackshot_memcpy((void*)out_addr, &fault_stats, sizeof(struct stackshot_fault_stats)); } + /* update timestamp of the stackshot */ + abs_time_end = mach_absolute_time(); +#if DEVELOPMENT || DEBUG + kcd_exit_on_error(kcdata_get_memory_addr(stackshot_kcdata_p, STACKSHOT_KCTYPE_STACKSHOT_DURATION, + sizeof(struct stackshot_duration), &out_addr)); + struct stackshot_duration * stackshot_duration = (struct stackshot_duration *)out_addr; + 
stackshot_duration->stackshot_duration = (abs_time_end - abs_time); + stackshot_duration->stackshot_duration_outer = 0; + stackshot_duration_outer = &stackshot_duration->stackshot_duration_outer; +#endif + stackshot_memcpy((void *)abs_time_addr, &abs_time_end, sizeof(uint64_t)); + + + kcd_exit_on_error(kcdata_write_buffer_end(stackshot_kcdata_p)); + + /* === END of populating stackshot data === */ + + *pBytesTraced = (uint32_t) kcdata_memory_get_used_bytes(stackshot_kcdata_p); error_exit: - /* Release stack snapshot wait indicator */ - kdp_snapshot_postflight(); - *pbytesTraced = (uint32_t)(tracepos - (char *) tracebuf); + stack_enable_faulting = FALSE; return error; } @@ -1855,17 +1816,6 @@ static int pid_from_task(task_t task) return pid; } -static uint64_t -proc_uniqueid_from_task(task_t task) -{ - uint64_t uniqueid = ~(0ULL); - - if (task->bsd_info) - uniqueid = proc_uniqueid(task->bsd_info); - - return uniqueid; -} - static uint64_t proc_was_throttled_from_task(task_t task) { @@ -1888,18 +1838,6 @@ proc_did_throttle_from_task(task_t task) return did_throttle; } -static uint64_t -proc_dispatchqueue_serialno_offset_from_task(task_t task) -{ - uint64_t dq_serialno_offset = 0; - - if (task->bsd_info) { - dq_serialno_offset = get_dispatchqueue_serialno_offset_from_proc(task->bsd_info); - } - - return dq_serialno_offset; -} - static void kdp_mem_and_io_snapshot(struct mem_and_io_snapshot *memio_snap) { @@ -1958,8 +1896,107 @@ kdp_mem_and_io_snapshot(struct mem_and_io_snapshot *memio_snap) } } +void +stackshot_memcpy(void *dst, const void *src, size_t len) +{ + memcpy(dst, src, len); +} + +size_t +stackshot_strlcpy(char *dst, const char *src, size_t maxlen) +{ + const size_t srclen = strlen(src); + + if (srclen < maxlen) { + stackshot_memcpy(dst, src, srclen+1); + } else if (maxlen != 0) { + stackshot_memcpy(dst, src, maxlen-1); + dst[maxlen-1] = '\0'; + } + + return srclen; +} + + +/* + * Returns the physical address of the specified map:target address, + * using 
the kdp fault path if requested and the page is not resident. + */ +vm_offset_t +kdp_find_phys(vm_map_t map, vm_offset_t target_addr, boolean_t try_fault, uint32_t *kdp_fault_results) +{ + vm_offset_t cur_phys_addr; + unsigned cur_wimg_bits; + uint64_t fault_start_time = 0; + + if (map == VM_MAP_NULL) { + return 0; + } + + cur_phys_addr = kdp_vtophys(map->pmap, target_addr); + if (!pmap_valid_page((ppnum_t) atop(cur_phys_addr))) { + if (!try_fault || fault_stats.sfs_stopped_faulting) { + if (kdp_fault_results) + *kdp_fault_results |= KDP_FAULT_RESULT_PAGED_OUT; + + return 0; + } + + /* + * The pmap doesn't have a valid page so we start at the top level + * vm map and try a lightweight fault. Update fault path usage stats. + */ + fault_start_time = mach_absolute_time(); + cur_phys_addr = kdp_lightweight_fault(map, (target_addr & ~PAGE_MASK)); + fault_stats.sfs_time_spent_faulting += (mach_absolute_time() - fault_start_time); + + if ((fault_stats.sfs_time_spent_faulting >= fault_stats.sfs_system_max_fault_time) && !panic_stackshot) { + fault_stats.sfs_stopped_faulting = (uint8_t) TRUE; + } + + cur_phys_addr += (target_addr & PAGE_MASK); + + if (!pmap_valid_page((ppnum_t) atop(cur_phys_addr))) { + if (kdp_fault_results) + *kdp_fault_results |= (KDP_FAULT_RESULT_TRIED_FAULT | KDP_FAULT_RESULT_PAGED_OUT); + + return 0; + } + + if (kdp_fault_results) + *kdp_fault_results |= KDP_FAULT_RESULT_FAULTED_IN; + + fault_stats.sfs_pages_faulted_in++; + } else { + /* + * This check is done in kdp_lightweight_fault for the fault path. 
+ */ + cur_wimg_bits = pmap_cache_attributes((ppnum_t) atop(cur_phys_addr)); + + if ((cur_wimg_bits & VM_WIMG_MASK) != VM_WIMG_DEFAULT) { + return 0; + } + } + + return cur_phys_addr; +} + boolean_t -kdp_copyin(pmap_t p, uint64_t uaddr, void *dest, size_t size) +kdp_copyin_word( + task_t task, uint64_t addr, uint64_t *result, boolean_t try_fault, uint32_t *kdp_fault_results) +{ + if (task_has_64BitAddr(task)) { + return kdp_copyin(task->map, addr, result, sizeof(uint64_t), try_fault, kdp_fault_results); + } else { + uint32_t buf; + boolean_t r = kdp_copyin(task->map, addr, &buf, sizeof(uint32_t), try_fault, kdp_fault_results); + *result = buf; + return r; + } +} + +boolean_t +kdp_copyin(vm_map_t map, uint64_t uaddr, void *dest, size_t size, boolean_t try_fault, uint32_t *kdp_fault_results) { size_t rem = size; char *kvaddr = dest; @@ -1970,51 +2007,43 @@ kdp_copyin(pmap_t p, uint64_t uaddr, void *dest, size_t size) if (((vm_offset_t)dest + size) >= (gPanicBase + gPanicSize)) { return FALSE; } - ppnum_t upn = pmap_find_phys(p, uaddr); - uint64_t phys_src = ptoa_64(upn) | (uaddr & PAGE_MASK); - void *src_va = (void*)phystokv(phys_src); - if (upn && pmap_valid_page(upn)) { - bcopy(src_va, kvaddr, size); - return TRUE; - } - return FALSE; } #endif while (rem) { - ppnum_t upn = pmap_find_phys(p, uaddr); - uint64_t phys_src = ptoa_64(upn) | (uaddr & PAGE_MASK); + uint64_t phys_src = kdp_find_phys(map, uaddr, try_fault, kdp_fault_results); uint64_t phys_dest = kvtophys((vm_offset_t)kvaddr); uint64_t src_rem = PAGE_SIZE - (phys_src & PAGE_MASK); uint64_t dst_rem = PAGE_SIZE - (phys_dest & PAGE_MASK); size_t cur_size = (uint32_t) MIN(src_rem, dst_rem); cur_size = MIN(cur_size, rem); - if (upn && pmap_valid_page(upn) && phys_dest) { - bcopy_phys(phys_src, phys_dest, cur_size); - } - else + if (phys_src && phys_dest) { + bcopy_phys(phys_src, phys_dest, cur_size); + } else { break; + } + uaddr += cur_size; kvaddr += cur_size; rem -= cur_size; } + return (rem == 0); } -void 
-do_stackshot() +kern_return_t +do_stackshot(void *context) { - if (stack_snapshot_flags & STACKSHOT_KCDATA_FORMAT) { - stack_snapshot_ret = kdp_stackshot_kcdata_format(stack_snapshot_pid, +#pragma unused(context) + kdp_snapshot++; + + stack_snapshot_ret = kdp_stackshot_kcdata_format(stack_snapshot_pid, stack_snapshot_flags, &stack_snapshot_bytes_traced); - } - else { - stack_snapshot_ret = kdp_stackshot(stack_snapshot_pid, - stack_snapshot_buf, stack_snapshot_bufsize, - stack_snapshot_flags, &stack_snapshot_bytes_traced); - } + + kdp_snapshot--; + return stack_snapshot_ret; } /* @@ -2025,7 +2054,7 @@ do_stackshot() * machine_trace_thread and its relatives tend to throw at us. * * Please zero the nasty global this uses after a bulk lookup; - * this isn't safe across a switch of the kdp_pmap or changes + * this isn't safe across a switch of the map or changes * to a pmap. * * This also means that if zero is a valid KVA, we are @@ -2035,50 +2064,41 @@ do_stackshot() vm_offset_t machine_trace_thread_get_kva(vm_offset_t cur_target_addr, vm_map_t map, uint32_t *thread_trace_flags) { - unsigned cur_wimg_bits; vm_offset_t cur_target_page; vm_offset_t cur_phys_addr; vm_offset_t kern_virt_target_addr; + uint32_t kdp_fault_results = 0; cur_target_page = atop(cur_target_addr); if ((cur_target_page != prev_target_page) || validate_next_addr) { + /* * Alright; it wasn't our previous page. So * we must validate that there is a page * table entry for this address under the - * current kdp_pmap, and that it has default + * current pmap, and that it has default * cache attributes (otherwise it may not be * safe to access it). */ - cur_phys_addr = kdp_vtophys(kdp_pmap ? 
kdp_pmap : kernel_pmap, cur_target_addr); - - if (!pmap_valid_page((ppnum_t) atop(cur_phys_addr))) { - - if (!stack_enable_faulting) { - return 0; + cur_phys_addr = kdp_find_phys(map, cur_target_addr, stack_enable_faulting, &kdp_fault_results); + if (thread_trace_flags) { + if (kdp_fault_results & KDP_FAULT_RESULT_PAGED_OUT) { + *thread_trace_flags |= kThreadTruncatedBT; } - /* - * The pmap doesn't have a valid page so we start at the top level - * vm map and try a lightweight fault. - */ - cur_phys_addr = kdp_lightweight_fault(map, (cur_target_addr & ~PAGE_MASK), thread_trace_flags); - cur_phys_addr += (cur_target_addr & PAGE_MASK); - - if (!pmap_valid_page((ppnum_t) atop(cur_phys_addr))) - return 0; - } else { - /* - * This check is done in kdp_lightweight_fault for the fault path. - */ - cur_wimg_bits = pmap_cache_attributes((ppnum_t) atop(cur_phys_addr)); + if (kdp_fault_results & KDP_FAULT_RESULT_TRIED_FAULT) { + *thread_trace_flags |= kThreadTriedFaultBT; + } - if ((cur_wimg_bits & VM_WIMG_MASK) != VM_WIMG_DEFAULT) { - return 0; + if (kdp_fault_results & KDP_FAULT_RESULT_FAULTED_IN) { + *thread_trace_flags |= kThreadFaultedBT; } } + if (cur_phys_addr == 0) { + return 0; + } #if __x86_64__ kern_virt_target_addr = (vm_offset_t) PHYSMAP_PTOV(cur_phys_addr); #else @@ -2101,3 +2121,18 @@ machine_trace_thread_clear_validation_cache(void) validate_next_addr = TRUE; } +boolean_t +stackshot_thread_is_idle_worker_unsafe(thread_t thread) +{ + /* When the pthread kext puts a worker thread to sleep, it will call + * assert_wait on the thread's own threadlist. see parkit() in + * kern_support.c. 
+ */ + struct uthread * uthread = get_bsdthread_info(thread); + event64_t threadlist = (event64_t)proc_get_uthread_uu_threadlist(uthread); + event64_t wait_event = thread->wait_event; + return uthread && + (thread->state & TH_WAIT) && + wait_event && + threadlist == wait_event; +} diff --git a/osfmk/kern/kern_types.h b/osfmk/kern/kern_types.h index f73eff9e3..c13795669 100644 --- a/osfmk/kern/kern_types.h +++ b/osfmk/kern/kern_types.h @@ -182,6 +182,11 @@ typedef int wait_timeout_urgency_t; #ifdef KERNEL_PRIVATE +/* + * n.b. this is defined in thread_call.h, but in the TIMEOUT_URGENCY flags space: + * #define THREAD_CALL_CONTINUOUS 0x100 + */ + #ifdef MACH_KERNEL_PRIVATE #include diff --git a/osfmk/kern/kpc.h b/osfmk/kern/kpc.h index 461dd6704..7c79c6953 100644 --- a/osfmk/kern/kpc.h +++ b/osfmk/kern/kpc.h @@ -2,7 +2,7 @@ * Copyright (c) 2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,18 +22,20 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. 
- * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -#ifndef __KERN_KPC_H__ -#define __KERN_KPC_H__ +#ifndef KERN_KPC_H +#define KERN_KPC_H /* Kernel interfaces to KPC PMC infrastructure. */ #include #include /* thread_* */ +__BEGIN_DECLS + /* cross-platform class constants */ #define KPC_CLASS_FIXED (0) #define KPC_CLASS_CONFIGURABLE (1) @@ -87,6 +89,7 @@ typedef void (*kpc_pm_handler_t)(boolean_t); */ struct cpu_data; extern boolean_t kpc_register_cpu(struct cpu_data *cpu_data); +extern void kpc_unregister_cpu(struct cpu_data *cpu_data); /* bootstrap */ extern void kpc_init(void); @@ -159,8 +162,24 @@ extern int kpc_threads_counting; /* AST callback for KPC */ extern void kpc_thread_ast_handler( thread_t thread ); -/* context switch accounting between two threads */ -extern void kpc_switch_context( thread_t old_thread, thread_t new_thread ); +#ifdef MACH_KERNEL_PRIVATE + +/* context switch callback for KPC */ + +extern boolean_t kpc_off_cpu_active; + +extern void kpc_off_cpu_internal(thread_t thread); +extern void kpc_off_cpu_update(void); + +static inline void +kpc_off_cpu(thread_t thread) +{ + if (__improbable(kpc_off_cpu_active)) { + kpc_off_cpu_internal(thread); + } +} + +#endif /* defined(MACH_KERNEL_PRIVATE) */ /* acquire/release the counters used by the Power Manager */ extern int kpc_force_all_ctrs( task_t task, int val ); @@ -323,4 +342,6 @@ struct kpc_driver int (*set_period)(uint32_t classes, uint64_t *period); }; -#endif /* __KERN_KPC_H__ */ +__END_DECLS + +#endif /* !defined(KERN_KPC_H) */ diff --git a/osfmk/kern/kpc_common.c b/osfmk/kern/kpc_common.c index 26bdae8f2..38f06c48a 100644 --- a/osfmk/kern/kpc_common.c +++ b/osfmk/kern/kpc_common.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include @@ -62,6 +63,8 @@ static kpc_pm_handler_t kpc_pm_handler; static boolean_t kpc_pm_has_custom_config; static uint64_t kpc_pm_pmc_mask; +boolean_t kpc_context_switch_active = FALSE; + void kpc_common_init(void); void kpc_common_init(void) @@ 
-109,14 +112,33 @@ kpc_register_cpu(struct cpu_data *cpu_data) return TRUE; error: - kfree(cpu_data->cpu_kpc_buf[0], COUNTERBUF_SIZE_PER_CPU); - kfree(cpu_data->cpu_kpc_buf[1], COUNTERBUF_SIZE_PER_CPU); - kfree(cpu_data->cpu_kpc_shadow, COUNTERBUF_SIZE_PER_CPU); - kfree(cpu_data->cpu_kpc_reload, COUNTERBUF_SIZE_PER_CPU); - + kpc_unregister_cpu(cpu_data); return FALSE; } +void +kpc_unregister_cpu(struct cpu_data *cpu_data) +{ + assert(cpu_data); + if (cpu_data->cpu_kpc_buf[0] != NULL) { + kfree(cpu_data->cpu_kpc_buf[0], COUNTERBUF_SIZE_PER_CPU); + cpu_data->cpu_kpc_buf[0] = NULL; + } + if (cpu_data->cpu_kpc_buf[1] != NULL) { + kfree(cpu_data->cpu_kpc_buf[1], COUNTERBUF_SIZE_PER_CPU); + cpu_data->cpu_kpc_buf[1] = NULL; + } + if (cpu_data->cpu_kpc_shadow != NULL) { + kfree(cpu_data->cpu_kpc_shadow, COUNTERBUF_SIZE_PER_CPU); + cpu_data->cpu_kpc_shadow = NULL; + } + if (cpu_data->cpu_kpc_reload != NULL) { + kfree(cpu_data->cpu_kpc_reload, COUNTERBUF_SIZE_PER_CPU); + cpu_data->cpu_kpc_reload = NULL; + } +} + + static void kpc_task_set_forced_all_ctrs(task_t task, boolean_t state) { @@ -472,24 +494,19 @@ kpc_sample_kperf(uint32_t actionid) { struct kperf_sample sbuf; struct kperf_context ctx; - task_t task = NULL; - int r; - BUF_DATA1(PERF_KPC_HNDLR | DBG_FUNC_START, 0); + BUF_DATA(PERF_KPC_HNDLR | DBG_FUNC_START); ctx.cur_pid = 0; ctx.cur_thread = current_thread(); - - task = chudxnu_task_for_thread(ctx.cur_thread); - if (task) - ctx.cur_pid = chudxnu_pid_for_task(task); + ctx.cur_pid = task_pid(current_task()); ctx.trigger_type = TRIGGER_TYPE_PMI; ctx.trigger_id = 0; - r = kperf_sample(&sbuf, &ctx, actionid, SAMPLE_FLAG_PEND_USER); + int r = kperf_sample(&sbuf, &ctx, actionid, SAMPLE_FLAG_PEND_USER); - BUF_INFO1(PERF_KPC_HNDLR | DBG_FUNC_END, r); + BUF_INFO(PERF_KPC_HNDLR | DBG_FUNC_END, r); } @@ -786,4 +803,3 @@ kpc_get_configurable_pmc_mask(uint32_t classes) return cfg_mask | pwr_mask; } - diff --git a/osfmk/kern/kpc_thread.c b/osfmk/kern/kpc_thread.c index 
1ac250c39..aa8f261d4 100644 --- a/osfmk/kern/kpc_thread.c +++ b/osfmk/kern/kpc_thread.c @@ -45,6 +45,9 @@ /* global for whether to read PMCs on context switch */ int kpc_threads_counting = 0; +/* whether to call into KPC when a thread goes off CPU */ +boolean_t kpc_off_cpu_active = FALSE; + /* current config and number of counters in that config */ static uint32_t kpc_thread_classes = 0; static uint32_t kpc_thread_classes_count = 0; @@ -111,12 +114,12 @@ kpc_set_thread_counting(uint32_t classes) /* and schedule an AST for this thread... */ if( !current_thread()->kpc_buf ) { - current_thread()->t_chud |= T_KPC_ALLOC; + current_thread()->kperf_flags |= T_KPC_ALLOC; act_set_kperf(current_thread()); - } + } } - kperf_kpc_cswitch_callback_update(); + kpc_off_cpu_update(); lck_mtx_unlock(&kpc_thread_lock); return 0; @@ -141,13 +144,12 @@ kpc_update_thread_counters( thread_t thread ) for( i = 0; i < kpc_thread_classes_count; i++ ) thread->kpc_buf[i] += cpu->cpu_kpc_buf[1][i] - cpu->cpu_kpc_buf[0][i]; - /* schedule any necessary allocations */ if( !current_thread()->kpc_buf ) { - current_thread()->t_chud |= T_KPC_ALLOC; + current_thread()->kperf_flags |= T_KPC_ALLOC; act_set_kperf(current_thread()); - } + } /* 3. 
switch the PMC block pointers */ tmp = cpu->cpu_kpc_buf[1]; @@ -155,12 +157,6 @@ kpc_update_thread_counters( thread_t thread ) cpu->cpu_kpc_buf[0] = tmp; } -void -kpc_switch_context( thread_t old, thread_t new __unused ) -{ - kpc_update_thread_counters( old ); -} - /* get counter values for a thread */ int kpc_get_curthread_counters(uint32_t *inoutcount, uint64_t *buf) @@ -191,6 +187,19 @@ kpc_get_curthread_counters(uint32_t *inoutcount, uint64_t *buf) return 0; } +void +kpc_off_cpu_update(void) +{ + kpc_off_cpu_active = kpc_threads_counting; +} + +void +kpc_off_cpu_internal(thread_t thread) +{ + if (kpc_threads_counting) { + kpc_update_thread_counters(thread); + } +} void kpc_thread_create(thread_t thread) @@ -223,6 +232,6 @@ void kpc_thread_ast_handler( thread_t thread ) { /* see if we want an alloc */ - if( thread->t_chud & T_KPC_ALLOC ) + if( thread->kperf_flags & T_KPC_ALLOC ) thread->kpc_buf = kpc_counterbuf_alloc(); } diff --git a/osfmk/kern/ktrace_background_notify.c b/osfmk/kern/ktrace_background_notify.c new file mode 100644 index 000000000..f3af54ea5 --- /dev/null +++ b/osfmk/kern/ktrace_background_notify.c @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include + +#include + +kern_return_t ktrace_background_available_notify_user(void); + +/* + * If user space has registered for background notifications, send one. + */ +kern_return_t +ktrace_background_available_notify_user(void) +{ + mach_port_t user_port; + kern_return_t kr; + + kr = host_get_ktrace_background_port(host_priv_self(), &user_port); + if (kr != KERN_SUCCESS || !IPC_PORT_VALID(user_port)) { + return KERN_FAILURE; + } + + kr = send_ktrace_background_available(user_port); + ipc_port_release_send(user_port); + return kr; +} diff --git a/osfmk/kern/ledger.c b/osfmk/kern/ledger.c index 1f90ef24c..983e51b9c 100644 --- a/osfmk/kern/ledger.c +++ b/osfmk/kern/ledger.c @@ -38,6 +38,8 @@ #include #include #include +#include + #include #include @@ -56,6 +58,7 @@ #define LF_WARNED 0x2000 /* callback was called for balance warning */ #define LF_TRACKING_MAX 0x4000 /* track max balance over user-specfied time */ #define LF_PANIC_ON_NEGATIVE 0x8000 /* panic if it goes negative */ +#define LF_TRACK_CREDIT_ONLY 0x10000 /* only update "credit" */ /* Determine whether a ledger entry exists and has been initialized and active */ #define ENTRY_VALID(l, e) \ @@ -156,11 +159,11 @@ struct ledger_entry { } __attribute__((aligned(8))); struct ledger { - int l_id; + uint64_t l_id; + int32_t l_refs; + int32_t l_size; struct ledger_template *l_template; - int l_refs; - int l_size; - struct ledger_entry *l_entries; + 
struct ledger_entry l_entries[0] __attribute__((aligned(8))); }; static int ledger_cnt = 0; @@ -170,6 +173,9 @@ static kern_return_t ledger_perform_blocking(ledger_t l); static uint32_t flag_set(volatile uint32_t *flags, uint32_t bit); static uint32_t flag_clear(volatile uint32_t *flags, uint32_t bit); +static void ledger_entry_check_new_balance(ledger_t ledger, int entry, + struct ledger_entry *le); + #if 0 static void debug_callback(const void *p0, __unused const void *p1) @@ -345,30 +351,27 @@ ledger_t ledger_instantiate(ledger_template_t template, int entry_type) { ledger_t ledger; - size_t sz; + size_t cnt, sz; int i; - ledger = (ledger_t)kalloc(sizeof (struct ledger)); - if (ledger == NULL) - return (LEDGER_NULL); - - ledger->l_template = template; - ledger->l_id = ledger_cnt++; - ledger->l_refs = 1; - template_lock(template); template->lt_refs++; - ledger->l_size = template->lt_cnt; + cnt = template->lt_cnt; template_unlock(template); - sz = ledger->l_size * sizeof (struct ledger_entry); - ledger->l_entries = kalloc(sz); - if (sz && (ledger->l_entries == NULL)) { + sz = sizeof(*ledger) + (cnt * sizeof(struct ledger_entry)); + + ledger = (ledger_t)kalloc(sz); + if (ledger == NULL) { ledger_template_dereference(template); - kfree(ledger, sizeof(struct ledger)); - return (LEDGER_NULL); + return LEDGER_NULL; } + ledger->l_template = template; + ledger->l_id = ledger_cnt++; + ledger->l_refs = 1; + ledger->l_size = (int32_t)cnt; + template_lock(template); assert(ledger->l_size <= template->lt_cnt); for (i = 0; i < ledger->l_size; i++) { @@ -447,9 +450,8 @@ ledger_dereference(ledger_t ledger) /* Just released the last reference. Free it. 
*/ if (v == 1) { - kfree(ledger->l_entries, - ledger->l_size * sizeof (struct ledger_entry)); - kfree(ledger, sizeof (*ledger)); + kfree(ledger, + sizeof(*ledger) + ledger->l_size * sizeof(struct ledger_entry)); } return (KERN_SUCCESS); @@ -463,7 +465,11 @@ warn_level_exceeded(struct ledger_entry *le) { ledger_amount_t balance; - assert((le->le_credit >= 0) && (le->le_debit >= 0)); + if (le->le_flags & LF_TRACK_CREDIT_ONLY) { + assert(le->le_debit == 0); + } else { + assert((le->le_credit >= 0) && (le->le_debit >= 0)); + } /* * XXX - Currently, we only support warnings for ledgers which @@ -483,7 +489,11 @@ limit_exceeded(struct ledger_entry *le) { ledger_amount_t balance; - assert((le->le_credit >= 0) && (le->le_debit >= 0)); + if (le->le_flags & LF_TRACK_CREDIT_ONLY) { + assert(le->le_debit == 0); + } else { + assert((le->le_credit >= 0) && (le->le_debit >= 0)); + } balance = le->le_credit - le->le_debit; if ((le->le_limit <= 0) && (balance < le->le_limit)) @@ -535,10 +545,17 @@ ledger_refill(uint64_t now, ledger_t ledger, int entry) struct ledger_entry *le; ledger_amount_t balance, due; + assert(entry >= 0 && entry < ledger->l_size); + le = &ledger->l_entries[entry]; assert(le->le_limit != LEDGER_LIMIT_INFINITY); + if (le->le_flags & LF_TRACK_CREDIT_ONLY) { + assert(le->le_debit == 0); + return; + } + /* * If another thread is handling the refill already, we're not * needed. @@ -653,11 +670,9 @@ ledger_refill(uint64_t now, ledger_t ledger, int entry) #define TOCKSTAMP_IS_STALE(now, tock) ((((now) - (tock)) < NTOCKS) ? 
FALSE : TRUE) void -ledger_check_new_balance(ledger_t ledger, int entry) +ledger_entry_check_new_balance(ledger_t ledger, int entry, struct ledger_entry *le) { - struct ledger_entry *le; - - le = &ledger->l_entries[entry]; + ledger_amount_t credit, debit; if (le->le_flags & LF_TRACKING_MAX) { ledger_amount_t balance = le->le_credit - le->le_debit; @@ -749,13 +764,29 @@ ledger_check_new_balance(ledger_t ledger, int entry) } } + credit = le->le_credit; + debit = le->le_debit; if ((le->le_flags & LF_PANIC_ON_NEGATIVE) && - (le->le_credit < le->le_debit)) { - panic("ledger_check_new_balance(%p,%d): negative ledger %p balance:%lld\n", - ledger, entry, le, le->le_credit - le->le_debit); + ((credit < debit) || + (le->le_credit < le->le_debit))) { + panic("ledger_entry_check_new_balance(%p,%d): negative ledger %p credit:%lld/%lld debit:%lld/%lld balance:%lld/%lld\n", + ledger, entry, le, + credit, le->le_credit, + debit, le->le_debit, + credit - debit, le->le_credit - le->le_debit); } } +void +ledger_check_new_balance(ledger_t ledger, int entry) +{ + struct ledger_entry *le; + assert(entry >= 0 && entry < ledger->l_size); /* l_entries[] holds l_size entries, indexed from 0 */ + le = &ledger->l_entries[entry]; + ledger_entry_check_new_balance(ledger, entry, le); +} + + /* * Add value to an entry in a ledger. 
*/ @@ -776,7 +807,7 @@ ledger_credit(ledger_t ledger, int entry, ledger_amount_t amount) old = OSAddAtomic64(amount, &le->le_credit); new = old + amount; lprintf(("%p Credit %lld->%lld\n", current_thread(), old, new)); - ledger_check_new_balance(ledger, entry); + ledger_entry_check_new_balance(ledger, entry, le); return (KERN_SUCCESS); } @@ -825,7 +856,13 @@ ledger_zero_balance(ledger_t ledger, int entry) le = &ledger->l_entries[entry]; top: - if (le->le_credit > le->le_debit) { + if (le->le_flags & LF_TRACK_CREDIT_ONLY) { + assert(le->le_debit == 0); + if (!OSCompareAndSwap64(le->le_credit, 0, &le->le_credit)) { + goto top; + } + lprintf(("%p zeroed %lld->%lld\n", current_thread(), le->le_credit, 0)); + } else if (le->le_credit > le->le_debit) { if (!OSCompareAndSwap64(le->le_debit, le->le_credit, &le->le_debit)) goto top; lprintf(("%p zeroed %lld->%lld\n", current_thread(), le->le_debit, le->le_credit)); @@ -963,16 +1000,34 @@ ledger_panic_on_negative(ledger_template_t template, int entry) template_lock(template); if ((entry < 0) || (entry >= template->lt_cnt)) { - template_unlock(template); + template_unlock(template); return (KERN_INVALID_VALUE); } template->lt_entries[entry].et_flags |= LF_PANIC_ON_NEGATIVE; - template_unlock(template); + template_unlock(template); return (KERN_SUCCESS); } + +kern_return_t +ledger_track_credit_only(ledger_template_t template, int entry) +{ + template_lock(template); + + if ((entry < 0) || (entry >= template->lt_cnt)) { + template_unlock(template); + return (KERN_INVALID_VALUE); + } + + template->lt_entries[entry].et_flags |= LF_TRACK_CREDIT_ONLY; + + template_unlock(template); + + return (KERN_SUCCESS); +} + /* * Add a callback to be executed when the resource goes into deficit. 
*/ @@ -1165,11 +1220,17 @@ ledger_debit(ledger_t ledger, int entry, ledger_amount_t amount) le = &ledger->l_entries[entry]; - old = OSAddAtomic64(amount, &le->le_debit); - new = old + amount; - + if (le->le_flags & LF_TRACK_CREDIT_ONLY) { + assert(le->le_debit == 0); + old = OSAddAtomic64(-amount, &le->le_credit); + new = old - amount; + } else { + old = OSAddAtomic64(amount, &le->le_debit); + new = old + amount; + } lprintf(("%p Debit %lld->%lld\n", thread, old, new)); - ledger_check_new_balance(ledger, entry); + + ledger_entry_check_new_balance(ledger, entry, le); return (KERN_SUCCESS); } @@ -1446,7 +1507,11 @@ ledger_get_balance(ledger_t ledger, int entry, ledger_amount_t *balance) le = &ledger->l_entries[entry]; - assert((le->le_credit >= 0) && (le->le_debit >= 0)); + if (le->le_flags & LF_TRACK_CREDIT_ONLY) { + assert(le->le_debit == 0); + } else { + assert((le->le_credit >= 0) && (le->le_debit >= 0)); + } *balance = le->le_credit - le->le_debit; @@ -1550,11 +1615,11 @@ ledger_get_entry_info(ledger_t ledger, assert(ledger != NULL); assert(lei != NULL); - assert(entry < ledger->l_size); - struct ledger_entry *le = &ledger->l_entries[entry]; - - ledger_fill_entry_info(le, lei, now); + if (entry >= 0 && entry < ledger->l_size) { + struct ledger_entry *le = &ledger->l_entries[entry]; + ledger_fill_entry_info(le, lei, now); + } } int diff --git a/osfmk/kern/ledger.h b/osfmk/kern/ledger.h index 2ebb8facf..689c1d277 100644 --- a/osfmk/kern/ledger.h +++ b/osfmk/kern/ledger.h @@ -32,6 +32,8 @@ #ifndef _KERN_LEDGER_H_ #define _KERN_LEDGER_H_ +#include /* ledger_t */ + #define LEDGER_INFO 0 #define LEDGER_ENTRY_INFO 1 #define LEDGER_TEMPLATE_INFO 2 @@ -54,11 +56,11 @@ struct ledger_template_info { }; struct ledger_entry_info { - int64_t lei_balance; - int64_t lei_credit; - int64_t lei_debit; - uint64_t lei_limit; - uint64_t lei_refill_period; /* In milliseconds */ + int64_t lei_balance; + int64_t lei_credit; + int64_t lei_debit; + uint64_t lei_limit; + uint64_t 
lei_refill_period; /* In nanoseconds */ uint64_t lei_last_refill; /* Time since last refill */ }; @@ -100,6 +102,8 @@ extern kern_return_t ledger_track_maximum(ledger_template_t template, int entry, int period_in_secs); extern kern_return_t ledger_panic_on_negative(ledger_template_t template, int entry); +extern kern_return_t ledger_track_credit_only(ledger_template_t template, + int entry); extern int ledger_key_lookup(ledger_template_t template, const char *key); /* value of entry type */ diff --git a/osfmk/kern/locks.c b/osfmk/kern/locks.c index c81671591..5141535e3 100644 --- a/osfmk/kern/locks.c +++ b/osfmk/kern/locks.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -53,6 +53,10 @@ * any improvements or extensions that they make and grant Carnegie Mellon * the rights to redistribute these changes. */ + +#define ATOMIC_PRIVATE 1 +#define LOCK_PRIVATE 1 + #include #include @@ -67,6 +71,8 @@ #include #include #include +#include +#include #include @@ -87,6 +93,27 @@ #define LCK_MTX_LCK_WAIT_CODE 2 #define LCK_MTX_UNLCK_WAKEUP_CODE 3 +#if MACH_LDEBUG +#define ALIGN_TEST(p,t) do{if((uintptr_t)p&(sizeof(t)-1)) __builtin_trap();}while(0) +#else +#define ALIGN_TEST(p,t) do{}while(0) +#endif + +/* Silence the volatile to _Atomic cast warning */ +#define ATOMIC_CAST(t,p) ((_Atomic t*)(uintptr_t)(p)) + +/* Enforce program order of loads and stores. 
*/ +#define ordered_load(target, type) \ + __c11_atomic_load((_Atomic type *)(target), memory_order_relaxed) +#define ordered_store(target, type, value) \ + __c11_atomic_store((_Atomic type *)(target), value, memory_order_relaxed) + +#define ordered_load_hw(lock) ordered_load(&(lock)->lock_data, uintptr_t) +#define ordered_store_hw(lock, value) ordered_store(&(lock)->lock_data, uintptr_t, (value)) + +#define NOINLINE __attribute__((noinline)) + + static queue_head_t lck_grp_queue; static unsigned int lck_grp_cnt; @@ -218,6 +245,9 @@ lck_grp_alloc_init( void lck_grp_init(lck_grp_t * grp, const char * grp_name, lck_grp_attr_t * attr) { + /* make sure locking infrastructure has been initialized */ + assert(lck_grp_cnt > 0); + bzero((void *)grp, sizeof(lck_grp_t)); (void)strlcpy(grp->lck_grp_name, grp_name, LCK_GRP_MAX_NAME); @@ -315,6 +345,7 @@ lck_grp_lckcnt_decr( lck_type_t lck_type) { unsigned int *lckcnt; + int updated; switch (lck_type) { case LCK_TYPE_SPIN: @@ -327,10 +358,12 @@ lck_grp_lckcnt_decr( lckcnt = &grp->lck_grp_rwcnt; break; default: - return panic("lck_grp_lckcnt_decr(): invalid lock type: %d\n", lck_type); + panic("lck_grp_lckcnt_decr(): invalid lock type: %d\n", lck_type); + return; } - (void)hw_atomic_sub(lckcnt, 1); + updated = (int)hw_atomic_sub(lckcnt, 1); + assert(updated >= 0); } /* @@ -415,6 +448,212 @@ lck_attr_free( kfree(attr, sizeof(lck_attr_t)); } +/* + * Routine: hw_lock_init + * + * Initialize a hardware lock. + */ +void +hw_lock_init(hw_lock_t lock) +{ + ordered_store_hw(lock, 0); +} + +/* + * Routine: hw_lock_lock_contended + * + * Spin until lock is acquired or timeout expires. + * timeout is in mach_absolute_time ticks. + * MACH_RT: called with preemption disabled. 
+ */ + +#if __SMP__ +static unsigned int NOINLINE +hw_lock_lock_contended(hw_lock_t lock, uintptr_t data, uint64_t timeout, boolean_t do_panic) +{ + uint64_t end = 0; + uintptr_t holder = lock->lock_data; + int i; + + if (timeout == 0) + timeout = LOCK_PANIC_TIMEOUT; + + for ( ; ; ) { + for (i = 0; i < LOCK_SNOOP_SPINS; i++) { + boolean_t wait = FALSE; + + cpu_pause(); +#if (!__ARM_ENABLE_WFE_) || (LOCK_PRETEST) + holder = ordered_load_hw(lock); + if (holder != 0) + continue; +#endif +#if __ARM_ENABLE_WFE_ + wait = TRUE; // Wait for event +#endif + if (atomic_compare_exchange(&lock->lock_data, 0, data, + memory_order_acquire_smp, wait)) + return 1; + } + if (end == 0) + end = ml_get_timebase() + timeout; + else if (ml_get_timebase() >= end) + break; + } + if (do_panic) { + // Capture the actual time spent blocked, which may be higher than the timeout + // if a misbehaving interrupt stole this thread's CPU time. + panic("Spinlock timeout after %llu ticks, %p = %lx", + (ml_get_timebase() - end + timeout), lock, holder); + } + return 0; +} +#endif // __SMP__ + +/* + * Routine: hw_lock_lock + * + * Acquire lock, spinning until it becomes available. + * MACH_RT: also return with preemption disabled. + */ +void +hw_lock_lock(hw_lock_t lock) +{ + thread_t thread; + uintptr_t state; + + thread = current_thread(); + disable_preemption_for_thread(thread); + state = LCK_MTX_THREAD_TO_STATE(thread) | PLATFORM_LCK_ILOCK; +#if __SMP__ +#if LOCK_PRETEST + if (ordered_load_hw(lock)) + goto contended; +#endif // LOCK_PRETEST + if (atomic_compare_exchange(&lock->lock_data, 0, state, + memory_order_acquire_smp, TRUE)) + return; +#if LOCK_PRETEST +contended: +#endif // LOCK_PRETEST + hw_lock_lock_contended(lock, state, 0, TRUE); +#else // __SMP__ + if (lock->lock_data) + panic("Spinlock held %p", lock); + lock->lock_data = state; +#endif // __SMP__ + return; +} + +/* + * Routine: hw_lock_to + * + * Acquire lock, spinning until it becomes available or timeout. 
+ * timeout is in mach_absolute_time ticks. + * MACH_RT: also return with preemption disabled. + */ +unsigned int +hw_lock_to(hw_lock_t lock, uint64_t timeout) +{ + thread_t thread; + uintptr_t state; + + thread = current_thread(); + disable_preemption_for_thread(thread); + state = LCK_MTX_THREAD_TO_STATE(thread) | PLATFORM_LCK_ILOCK; +#if __SMP__ +#if LOCK_PRETEST + if (ordered_load_hw(lock)) + goto contended; +#endif // LOCK_PRETEST + if (atomic_compare_exchange(&lock->lock_data, 0, state, + memory_order_acquire_smp, TRUE)) + return 1; +#if LOCK_PRETEST +contended: +#endif // LOCK_PRETEST + return hw_lock_lock_contended(lock, state, timeout, FALSE); +#else // __SMP__ + (void)timeout; + if (ordered_load_hw(lock) == 0) { + ordered_store_hw(lock, state); + return 1; + } + return 0; +#endif // __SMP__ +} + +/* + * Routine: hw_lock_try + * MACH_RT: returns with preemption disabled on success. + */ +unsigned int +hw_lock_try(hw_lock_t lock) +{ + thread_t thread = current_thread(); + int success = 0; +#if LOCK_TRY_DISABLE_INT + long intmask; + + intmask = disable_interrupts(); +#else + disable_preemption_for_thread(thread); +#endif // LOCK_TRY_DISABLE_INT + +#if __SMP__ +#if LOCK_PRETEST + if (ordered_load_hw(lock)) + goto failed; +#endif // LOCK_PRETEST + success = atomic_compare_exchange(&lock->lock_data, 0, LCK_MTX_THREAD_TO_STATE(thread) | PLATFORM_LCK_ILOCK, + memory_order_acquire_smp, FALSE); +#else + if (lock->lock_data == 0) { + lock->lock_data = LCK_MTX_THREAD_TO_STATE(thread) | PLATFORM_LCK_ILOCK; + success = 1; + } +#endif // __SMP__ + +#if LOCK_TRY_DISABLE_INT + if (success) + disable_preemption_for_thread(thread); +#if LOCK_PRETEST +failed: +#endif // LOCK_PRETEST + restore_interrupts(intmask); +#else +#if LOCK_PRETEST +failed: +#endif // LOCK_PRETEST + if (!success) + enable_preemption(); +#endif // LOCK_TRY_DISABLE_INT + return success; +} + +/* + * Routine: hw_lock_unlock + * + * Unconditionally release lock. + * MACH_RT: release preemption level. 
+ */ +void +hw_lock_unlock(hw_lock_t lock) +{ + __c11_atomic_store((_Atomic uintptr_t *)&lock->lock_data, 0, memory_order_release_smp); + enable_preemption(); +} + +/* + * Routine: hw_lock_held + * MACH_RT: doesn't change preemption state. + * N.B. Racy, of course. + */ +unsigned int +hw_lock_held(hw_lock_t lock) +{ + return (ordered_load_hw(lock) != 0); +} /* * Routine: lck_spin_sleep @@ -665,9 +904,11 @@ lck_mtx_lock_wait ( priority = MIN(priority, MAXPRI_PROMOTE); thread_lock(holder); - if (mutex->lck_mtx_pri == 0) + if (mutex->lck_mtx_pri == 0) { holder->promotions++; - holder->sched_flags |= TH_SFLAG_PROMOTED; + holder->sched_flags |= TH_SFLAG_PROMOTED; + } + if (mutex->lck_mtx_pri < priority && holder->sched_pri < priority) { KERNEL_DEBUG_CONSTANT( MACHDBG_CODE(DBG_MACH_SCHED,MACH_PROMOTE) | DBG_FUNC_NONE, @@ -1118,6 +1359,41 @@ void lck_rw_clear_promotion(thread_t thread) splx(s); } +/* + * Callout from context switch if the thread goes + * off core with a positive rwlock_count + * + * Called at splsched with the thread locked + */ +void +lck_rw_set_promotion_locked(thread_t thread) +{ + if (LcksOpts & disLkRWPrio) + return; + + integer_t priority; + + priority = thread->sched_pri; + + if (priority < thread->base_pri) + priority = thread->base_pri; + if (priority < BASEPRI_BACKGROUND) + priority = BASEPRI_BACKGROUND; + + if ((thread->sched_pri < priority) || + !(thread->sched_flags & TH_SFLAG_RW_PROMOTED)) { + KERNEL_DEBUG_CONSTANT( + MACHDBG_CODE(DBG_MACH_SCHED, MACH_RW_PROMOTE) | DBG_FUNC_NONE, + (uintptr_t)thread_tid(thread), thread->sched_pri, + thread->base_pri, priority, 0); + + thread->sched_flags |= TH_SFLAG_RW_PROMOTED; + + if (thread->sched_pri < priority) + set_sched_pri(thread, priority); + } +} + kern_return_t host_lockgroup_info( host_t host, @@ -1202,3 +1478,58 @@ host_lockgroup_info( return(KERN_SUCCESS); } +/* + * Atomic primitives, prototyped in kern/simple_lock.h + * Noret versions are more efficient on some architectures + */ + +uint32_t 
+hw_atomic_add(volatile uint32_t *dest, uint32_t delt) +{ + ALIGN_TEST(dest,uint32_t); + return __c11_atomic_fetch_add(ATOMIC_CAST(uint32_t,dest), delt, memory_order_relaxed) + delt; +} + +uint32_t +hw_atomic_sub(volatile uint32_t *dest, uint32_t delt) +{ + ALIGN_TEST(dest,uint32_t); + return __c11_atomic_fetch_sub(ATOMIC_CAST(uint32_t,dest), delt, memory_order_relaxed) - delt; +} + +uint32_t +hw_atomic_or(volatile uint32_t *dest, uint32_t mask) +{ + ALIGN_TEST(dest,uint32_t); + return __c11_atomic_fetch_or(ATOMIC_CAST(uint32_t,dest), mask, memory_order_relaxed) | mask; +} + +void +hw_atomic_or_noret(volatile uint32_t *dest, uint32_t mask) +{ + ALIGN_TEST(dest,uint32_t); + __c11_atomic_fetch_or(ATOMIC_CAST(uint32_t,dest), mask, memory_order_relaxed); +} + +uint32_t +hw_atomic_and(volatile uint32_t *dest, uint32_t mask) +{ + ALIGN_TEST(dest,uint32_t); + return __c11_atomic_fetch_and(ATOMIC_CAST(uint32_t,dest), mask, memory_order_relaxed) & mask; +} + +void +hw_atomic_and_noret(volatile uint32_t *dest, uint32_t mask) +{ + ALIGN_TEST(dest,uint32_t); + __c11_atomic_fetch_and(ATOMIC_CAST(uint32_t,dest), mask, memory_order_relaxed); +} + +uint32_t +hw_compare_and_store(uint32_t oldval, uint32_t newval, volatile uint32_t *dest) +{ + ALIGN_TEST(dest,uint32_t); + return __c11_atomic_compare_exchange_strong(ATOMIC_CAST(uint32_t,dest), &oldval, newval, + memory_order_acq_rel_smp, memory_order_relaxed); +} + diff --git a/osfmk/kern/locks.h b/osfmk/kern/locks.h index c8768fe6d..5a29c2e14 100644 --- a/osfmk/kern/locks.h +++ b/osfmk/kern/locks.h @@ -323,35 +323,29 @@ extern void mutex_pause(uint32_t); extern void lck_mtx_yield ( lck_mtx_t *lck); -#if defined(__i386__) || defined(__x86_64__) extern boolean_t lck_mtx_try_lock_spin( lck_mtx_t *lck); -extern boolean_t lck_mtx_try_lock_spin_always( +extern void lck_mtx_lock_spin( lck_mtx_t *lck); -extern void lck_mtx_lock_spin_always( +extern boolean_t kdp_lck_mtx_lock_spin_is_acquired( lck_mtx_t *lck); -extern void lck_mtx_lock_spin( 
+extern void lck_mtx_convert_spin( lck_mtx_t *lck); -extern void lck_mtx_convert_spin( +extern void lck_mtx_lock_spin_always( lck_mtx_t *lck); -extern boolean_t kdp_lck_mtx_lock_spin_is_acquired( +extern boolean_t lck_mtx_try_lock_spin_always( lck_mtx_t *lck); + #define lck_mtx_unlock_always(l) lck_mtx_unlock(l) -#else -#define lck_mtx_try_lock_spin(l) lck_mtx_try_lock(l) -#define lck_mtx_lock_spin(l) lck_mtx_lock(l) -#define lck_mtx_try_lock_spin_always(l) lck_spin_try_lock(l) -#define lck_mtx_lock_spin_always(l) lck_spin_lock(l) -#define kdp_lck_mtx_lock_spin_is_acquired(l) kdp_lck_spin_is_acquired(l) -#define lck_mtx_unlock_always(l) lck_spin_unlock(l) -#define lck_mtx_convert_spin(l) do {} while (0) -#endif +extern void lck_spin_assert( + lck_spin_t *lck, + unsigned int type); extern boolean_t kdp_lck_rw_lock_is_acquired_exclusive( lck_rw_t *lck); @@ -362,10 +356,25 @@ extern void lck_mtx_assert( lck_mtx_t *lck, unsigned int type); +#if MACH_ASSERT +#define LCK_MTX_ASSERT(lck,type) lck_mtx_assert((lck),(type)) +#else /* MACH_ASSERT */ +#define LCK_MTX_ASSERT(lck,type) +#endif /* MACH_ASSERT */ + +#if DEBUG +#define LCK_MTX_ASSERT_DEBUG(lck,type) lck_mtx_assert((lck),(type)) +#else /* DEBUG */ +#define LCK_MTX_ASSERT_DEBUG(lck,type) +#endif /* DEBUG */ + __END_DECLS -#define LCK_MTX_ASSERT_OWNED 0x01 -#define LCK_MTX_ASSERT_NOTOWNED 0x02 +#define LCK_ASSERT_OWNED 1 +#define LCK_ASSERT_NOTOWNED 2 + +#define LCK_MTX_ASSERT_OWNED LCK_ASSERT_OWNED +#define LCK_MTX_ASSERT_NOTOWNED LCK_ASSERT_NOTOWNED #ifdef MACH_KERNEL_PRIVATE extern void lck_mtx_lock_wait( @@ -446,6 +455,7 @@ extern void lck_rw_assert( extern void lck_rw_clear_promotion( thread_t thread); +extern void lck_rw_set_promotion_locked(thread_t thread); #endif #ifdef KERNEL_PRIVATE diff --git a/osfmk/kern/ltable.c b/osfmk/kern/ltable.c new file mode 100644 index 000000000..4aedca0d4 --- /dev/null +++ b/osfmk/kern/ltable.c @@ -0,0 +1,999 @@ +/* + * Copyright (c) 2016 Apple Inc. All rights reserved. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#include +#include +#include +#include +#include +#include +#include +#include + + +#define P2ROUNDUP(x, align) (-(-((uint32_t)(x)) & -(align))) +#define ROUNDDOWN(x,y) (((x)/(y))*(y)) + +/* ---------------------------------------------------------------------- + * + * Lockless Link Table Interface + * + * ---------------------------------------------------------------------- */ + +vm_size_t g_lt_max_tbl_size; +static lck_grp_t g_lt_lck_grp; + +/* default VA space for link tables (zone allocated) */ +#define DEFAULT_MAX_TABLE_SIZE P2ROUNDUP(8 * 1024 * 1024, PAGE_SIZE) + +#if defined(DEVELOPMENT) || defined(DEBUG) +/* global for lldb macros */ +uint64_t g_lt_idx_max = LT_IDX_MAX; +#endif + + +/* construct a link table element from an offset and mask into a slab */ +#define lt_elem_ofst_slab(slab, slab_msk, ofst) \ + /* cast through 'void *' to avoid compiler alignment warning messages */ \ + ((struct lt_elem *)((void *)((uintptr_t)(slab) + ((ofst) & (slab_msk))))) + +#if defined(CONFIG_LTABLE_STATS) +/* version that makes no assumption on waste within a slab */ +static inline struct lt_elem * +lt_elem_idx(struct link_table *table, uint32_t idx) +{ + int slab_idx = idx / table->slab_elem; + struct lt_elem *slab = table->table[slab_idx]; + if (!slab) + panic("Invalid index:%d slab:%d (NULL) for table:%p\n", + idx, slab_idx, table); + assert(slab->lt_id.idx <= idx && (slab->lt_id.idx + table->slab_elem) > idx); + return lt_elem_ofst_slab(slab, table->slab_msk, (idx - slab->lt_id.idx) * table->elem_sz); +} +#else /* !CONFIG_LTABLE_STATS */ +/* version that assumes 100% utilization of slabs (no waste) */ +static inline struct lt_elem * +lt_elem_idx(struct link_table *table, uint32_t idx) +{ + uint32_t ofst = idx * table->elem_sz; + struct lt_elem *slab = table->table[ofst >> table->slab_shift]; + if (!slab) + panic("Invalid index:%d slab:%d (NULL) for table:%p\n", + idx, (ofst >> table->slab_shift), table); + 
assert(slab->lt_id.idx <= idx && (slab->lt_id.idx + table->slab_elem) > idx); + return lt_elem_ofst_slab(slab, table->slab_msk, ofst); +} +#endif /* !CONFIG_LTABLE_STATS */ + +static int __assert_only +lt_elem_in_range(struct lt_elem *elem, struct link_table *table) +{ + struct lt_elem **base = table->table; + uintptr_t e = (uintptr_t)elem; + assert(base != NULL); + while (*base != NULL) { + uintptr_t b = (uintptr_t)(*base); + if (e >= b && e < b + table->slab_sz) + return 1; + base++; + if ((uintptr_t)base >= (uintptr_t)table->table + PAGE_SIZE) + return 0; + } + return 0; +} + + +/** + * lt_elem_invalidate: mark 'elem' as invalid + * + * NOTE: this does _not_ get or put a reference on 'elem' + */ +void lt_elem_invalidate(struct lt_elem *elem) +{ + uint32_t __assert_only old = OSBitAndAtomic(~LT_BITS_VALID, &elem->lt_bits); + OSMemoryBarrier(); + assert(((lt_bits_type(old) != LT_RESERVED) && (old & LT_BITS_VALID)) || + ((lt_bits_type(old) == LT_RESERVED) && !(old & LT_BITS_VALID))); +} + +/** + * lt_elem_mkvalid: mark 'elem' as valid + * + * NOTE: this does _not_ get or put a reference on 'elem' + */ +void lt_elem_mkvalid(struct lt_elem *elem) +{ + uint32_t __assert_only old = OSBitOrAtomic(LT_BITS_VALID, &elem->lt_bits); + OSMemoryBarrier(); + assert(!(old & LT_BITS_VALID)); +} + +static void lt_elem_set_type(struct lt_elem *elem, int type) +{ + uint32_t old_bits, new_bits; + do { + old_bits = elem->lt_bits; + new_bits = (old_bits & ~LT_BITS_TYPE) | + ((type & LT_BITS_TYPE_MASK) << LT_BITS_TYPE_SHIFT); + } while (OSCompareAndSwap(old_bits, new_bits, &elem->lt_bits) == FALSE); + OSMemoryBarrier(); +} + + +/** + * ltable_bootstrap: bootstrap a link table + * + * Called once at system boot + */ +void ltable_bootstrap(void) +{ + static int s_is_bootstrapped = 0; + + uint32_t tmp32 = 0; + + if (s_is_bootstrapped) + return; + s_is_bootstrapped = 1; + + g_lt_max_tbl_size = DEFAULT_MAX_TABLE_SIZE; + if (PE_parse_boot_argn("lt_tbl_size", &tmp32, sizeof(tmp32)) == TRUE) + 
g_lt_max_tbl_size = (vm_size_t)P2ROUNDUP(tmp32, PAGE_SIZE); + + lck_grp_init(&g_lt_lck_grp, "link_table_locks", LCK_GRP_ATTR_NULL); +} + +/** + * ltable_init: initialize a link table with given parameters + * + */ +void ltable_init(struct link_table *table, const char *name, + uint32_t max_tbl_elem, uint32_t elem_sz, + ltable_poison_func poison) +{ + kern_return_t kr; + uint32_t slab_sz, slab_shift, slab_msk, slab_elem; + zone_t slab_zone; + size_t max_tbl_sz; + struct lt_elem *e, **base; + +#ifndef CONFIG_LTABLE_STATS + /* the element size _must_ be a power of two! */ + if ((elem_sz & (elem_sz - 1)) != 0) + panic("elem_sz:%d for table:'%s' must be a power of two!", + elem_sz, name); +#endif + + /* + * First, allocate a single page of memory to act as the base + * for the table's element slabs + */ + kr = kernel_memory_allocate(kernel_map, (vm_offset_t *)&base, + PAGE_SIZE, 0, KMA_NOPAGEWAIT, VM_KERN_MEMORY_LTABLE); + if (kr != KERN_SUCCESS) + panic("Cannot initialize %s table: " + "kernel_memory_allocate failed:%d\n", name, kr); + memset(base, 0, PAGE_SIZE); + + /* + * Based on the maximum table size, calculate the slab size: + * we allocate 1 page of slab pointers for the table, and we need to + * index elements of 'elem_sz', this gives us the slab size based on + * the maximum size the table should grow. 
+ */ + max_tbl_sz = (max_tbl_elem * elem_sz); + max_tbl_sz = P2ROUNDUP(max_tbl_sz, PAGE_SIZE); + + /* system maximum table size divided by number of slots in a page */ + slab_sz = (uint32_t)(max_tbl_sz / (PAGE_SIZE / (sizeof(void *)))); + if (slab_sz < PAGE_SIZE) + slab_sz = PAGE_SIZE; + + /* make sure the slab size is a power of two */ + slab_shift = 0; + slab_msk = ~0; + for (uint32_t i = 0; i < 31; i++) { + uint32_t bit = (1 << i); + if ((slab_sz & bit) == slab_sz) { + slab_shift = i; + slab_msk = 0; + for (uint32_t j = 0; j < i; j++) + slab_msk |= (1 << j); + break; + } + slab_sz &= ~bit; + } + slab_elem = slab_sz / elem_sz; + + /* initialize the table's slab zone (for table growth) */ + ltdbg("Initializing %s zone: slab:%d (%d,0x%x) max:%ld", + name, slab_sz, slab_shift, slab_msk, max_tbl_sz); + slab_zone = zinit(slab_sz, max_tbl_sz, slab_sz, name); + assert(slab_zone != ZONE_NULL); + + /* allocate the first slab and populate it */ + base[0] = (struct lt_elem *)zalloc(slab_zone); + if (base[0] == NULL) + panic("Can't allocate a %s table slab from zone:%p", + name, slab_zone); + + memset(base[0], 0, slab_sz); + + /* setup the initial freelist */ + ltdbg("initializing %d links (%d bytes each)...", slab_elem, elem_sz); + for (unsigned l = 0; l < slab_elem; l++) { + e = lt_elem_ofst_slab(base[0], slab_msk, l * elem_sz); + e->lt_id.idx = l; + /* + * setting generation to 0 ensures that a setid of 0 is + * invalid because the generation will be incremented before + * each element's allocation. 
+ */ + e->lt_id.generation = 0; + e->lt_next_idx = l + 1; + } + + /* make sure the last free element points to a never-valid idx */ + e = lt_elem_ofst_slab(base[0], slab_msk, (slab_elem - 1) * elem_sz); + e->lt_next_idx = LT_IDX_MAX; + + lck_mtx_init(&table->lock, &g_lt_lck_grp, LCK_ATTR_NULL); + + table->slab_sz = slab_sz; + table->slab_shift = slab_shift; + table->slab_msk = slab_msk; + table->slab_elem = slab_elem; + table->slab_zone = slab_zone; + + table->elem_sz = elem_sz; + table->nelem = slab_elem; + table->used_elem = 0; + table->elem_sz = elem_sz; + table->poison = poison; + + table->table = base; + table->next_free_slab = &base[1]; + table->free_list.id = base[0]->lt_id.id; + +#if CONFIG_LTABLE_STATS + table->nslabs = 1; + table->nallocs = 0; + table->nreallocs = 0; + table->npreposts = 0; + table->nreservations = 0; + table->nreserved_releases = 0; + + table->max_used = 0; + table->avg_used = 0; + table->max_reservations = 0; + table->avg_reservations = 0; +#endif +} + + +/** + * ltable_grow: grow a link table by adding another 'slab' of table elements + * + * Conditions: + * table mutex is unlocked + * calling thread can block + */ +void ltable_grow(struct link_table *table, uint32_t min_free) +{ + struct lt_elem *slab, **slot; + struct lt_elem *e = NULL, *first_new_elem, *last_new_elem; + struct ltable_id free_id; + uint32_t free_elem; + + assert(get_preemption_level() == 0); + assert(table && table->slab_zone); + + lck_mtx_lock(&table->lock); + + free_elem = table->nelem - table->used_elem; + + /* + * If the caller just wanted to ensure a minimum number of elements, + * do that (and don't just blindly grow the table). Also, don't grow + * the table unnecessarily - we could have been beaten by a higher + * priority thread who acquired the lock and grew the table before we + * got here. 
+ */ + if (free_elem > min_free) { + lck_mtx_unlock(&table->lock); + return; + } + + /* we are now committed to table growth */ + ltdbg_v("BEGIN"); + + if (table->next_free_slab == NULL) { + /* + * before we panic, check one more time to see if any other + * threads have free'd from space in the table. + */ + if ((table->nelem - table->used_elem) > 0) { + /* there's at least 1 free element: don't panic yet */ + lck_mtx_unlock(&table->lock); + return; + } + panic("No more room to grow table: %p (nelem: %d, used: %d)", + table, table->nelem, table->used_elem); + } + slot = table->next_free_slab; + table->next_free_slab++; + if ((uintptr_t)table->next_free_slab >= (uintptr_t)table->table + PAGE_SIZE) + table->next_free_slab = NULL; + + assert(*slot == NULL); + + /* allocate another slab */ + slab = (struct lt_elem *)zalloc(table->slab_zone); + if (slab == NULL) + panic("Can't allocate a %s table (%p) slab from zone:%p", + table->slab_zone->zone_name, table, table->slab_zone); + + memset(slab, 0, table->slab_sz); + + /* put the new elements into a freelist */ + ltdbg_v(" init %d new links...", table->slab_elem); + for (unsigned l = 0; l < table->slab_elem; l++) { + uint32_t idx = l + table->nelem; + if (idx >= (LT_IDX_MAX - 1)) + break; /* the last element of the last slab */ + e = lt_elem_ofst_slab(slab, table->slab_msk, l * table->elem_sz); + e->lt_id.idx = idx; + e->lt_next_idx = idx + 1; + } + last_new_elem = e; + assert(last_new_elem != NULL); + + first_new_elem = lt_elem_ofst_slab(slab, table->slab_msk, 0); + + /* update table book keeping, and atomically swap the freelist head */ + *slot = slab; + if (table->nelem + table->slab_elem >= LT_IDX_MAX) + table->nelem = LT_IDX_MAX - 1; + else + table->nelem += table->slab_elem; + +#if CONFIG_LTABLE_STATS + table->nslabs += 1; +#endif + + /* + * The atomic swap of the free list head marks the end of table + * growth. 
Incoming requests may now use the newly allocated slab + * of table elements + */ + free_id = table->free_list; + /* connect the existing free list to the end of the new free list */ + last_new_elem->lt_next_idx = free_id.idx; + while (OSCompareAndSwap64(free_id.id, first_new_elem->lt_id.id, + &table->free_list.id) == FALSE) { + OSMemoryBarrier(); + free_id = table->free_list; + last_new_elem->lt_next_idx = free_id.idx; + } + OSMemoryBarrier(); + + lck_mtx_unlock(&table->lock); + + return; +} + + +/** + * ltable_alloc_elem: allocate one or more elements from a given table + * + * The returned element(s) will be of type 'type', but will remain invalid. + * + * If the caller has disabled preemption, then this function may (rarely) spin + * waiting either for another thread to either release 'nelem' table elements, + * or grow the table. + * + * If the caller can block, then this function may (rarely) block while + * the table grows to meet the demand for 'nelem' element(s). + */ +__attribute__((noinline)) +struct lt_elem *ltable_alloc_elem(struct link_table *table, int type, + int nelem, int nattempts) +{ + int nspins = 0, ntries = 0, nalloc = 0; + uint32_t table_size; + struct lt_elem *elem = NULL; + struct ltable_id free_id, next_id; + + static const int max_retries = 500; + + if (type != LT_ELEM && type != LT_LINK && type != LT_RESERVED) + panic("link_table_aloc of invalid elem type:%d from table @%p", + type, table); + + assert(nelem > 0); + + /* + * If the callers only wants to try a certain number of times, make it + * look like we've already made (MAX - nattempts) tries at allocation + */ + if (nattempts > 0 && nattempts <= max_retries) { + ntries = max_retries - nattempts; + } + +try_again: + elem = NULL; + if (ntries++ > max_retries) { + struct lt_elem *tmp; + if (nattempts > 0) { + /* + * The caller specified a particular number of + * attempts before failure, so it's expected that + * they're prepared to handle a NULL return. 
+ */ + return NULL; + } + + if (table->used_elem + nelem >= table_size) + panic("No more room to grow table: 0x%p size:%d, used:%d, requested elem:%d", + table, table_size, table->used_elem, nelem); + if (nelem == 1) + panic("Too many alloc retries: %d, table:%p, type:%d, nelem:%d", + ntries, table, type, nelem); + /* don't panic: try allocating one-at-a-time */ + while (nelem > 0) { + tmp = ltable_alloc_elem(table, type, 1, nattempts); + if (elem) + lt_elem_list_link(table, tmp, elem); + elem = tmp; + --nelem; + } + assert(elem != NULL); + return elem; + } + + nalloc = 0; + table_size = table->nelem; + + if (table->used_elem + nelem >= table_size) { + if (get_preemption_level() != 0) { +#if CONFIG_LTABLE_STATS + table->nspins += 1; +#endif + /* + * We may have just raced with table growth: check + * again to make sure there really isn't any space. + */ + if (++nspins > 4) + panic("Can't grow table %p with preemption" + " disabled!", table); + delay(1); + goto try_again; + } + ltable_grow(table, nelem); + goto try_again; + } + + /* read this value only once before the CAS */ + free_id = table->free_list; + if (free_id.idx >= table_size) + goto try_again; + + /* + * Find the item on the free list which will become the new free list + * head, but be careful not to modify any memory (read only)! Other + * threads can alter table state at any time up until the CAS. We + * don't modify any memory until we've successfully swapped out the + * free list head with the one we've investigated. 
+ */ + for (struct lt_elem *next_elem = lt_elem_idx(table, free_id.idx); + nalloc < nelem; + nalloc++) { + elem = next_elem; + next_id.generation = 0; + next_id.idx = next_elem->lt_next_idx; + if (next_id.idx < table->nelem) { + next_elem = lt_elem_idx(table, next_id.idx); + next_id.id = next_elem->lt_id.id; + } else { + goto try_again; + } + } + /* 'elem' points to the last element being allocated */ + + if (OSCompareAndSwap64(free_id.id, next_id.id, + &table->free_list.id) == FALSE) + goto try_again; + + /* load barrier */ + OSMemoryBarrier(); + + /* + * After the CAS, we know that we own free_id, and it points to a + * valid table entry (checked above). Grab the table pointer and + * reset some values. + */ + OSAddAtomic(nelem, &table->used_elem); + + /* end the list of allocated elements */ + elem->lt_next_idx = LT_IDX_MAX; + /* reset 'elem' to point to the first allocated element */ + elem = lt_elem_idx(table, free_id.idx); + + /* + * Update the generation count, and return the element(s) + * with a single reference (and no valid bit). If the + * caller immediately calls _put() on any element, then + * it will be released back to the free list. If the caller + * subsequently marks the element as valid, then the put + * will simply drop the reference. 
+ */ + for (struct lt_elem *tmp = elem; ; ) { + assert(!lt_bits_valid(tmp->lt_bits) && + (lt_bits_refcnt(tmp->lt_bits) == 0)); + --nalloc; + tmp->lt_id.generation += 1; + tmp->lt_bits = 1; + lt_elem_set_type(tmp, type); + if (tmp->lt_next_idx == LT_IDX_MAX) + break; + assert(tmp->lt_next_idx != LT_IDX_MAX); + tmp = lt_elem_idx(table, tmp->lt_next_idx); + } + assert(nalloc == 0); + +#if CONFIG_LTABLE_STATS + uint64_t nreservations; + table->nallocs += nelem; + if (type == LT_RESERVED) + OSIncrementAtomic64(&table->nreservations); + nreservations = table->nreservations; + if (table->used_elem > table->max_used) + table->max_used = table->used_elem; + if (nreservations > table->max_reservations) + table->max_reservations = nreservations; + table->avg_used = (table->avg_used + table->used_elem) / 2; + table->avg_reservations = (table->avg_reservations + nreservations) / 2; +#endif + + return elem; +} + + +/** + * ltable_realloc_elem: convert a reserved element to a particular type + * + * This funciton is used to convert reserved elements (not yet marked valid) + * to the given 'type'. The generation of 'elem' is incremented, the element + * is disconnected from any list to which it belongs, and its type is set to + * 'type'. + */ +void ltable_realloc_elem(struct link_table *table, struct lt_elem *elem, int type) +{ + (void)table; + assert(lt_elem_in_range(elem, table) && + !lt_bits_valid(elem->lt_bits)); + +#if CONFIG_LTABLE_STATS + table->nreallocs += 1; + if (lt_bits_type(elem->lt_bits) == LT_RESERVED && type != LT_RESERVED) { + /* + * This isn't under any lock, so we'll clamp it. + * the stats are meant to be informative, not perfectly + * accurate + */ + OSDecrementAtomic64(&table->nreservations); + } + table->avg_reservations = (table->avg_reservations + table->nreservations) / 2; +#endif + + /* + * Return the same element with a new generation count, and a + * (potentially) new type. 
Don't touch the refcount: the caller + * is responsible for getting that (and the valid bit) correct. + */ + elem->lt_id.generation += 1; + elem->lt_next_idx = LT_IDX_MAX; + lt_elem_set_type(elem, type); + + return; +} + + +/** + * ltable_free_elem: release an element back to a link table + * + * Do not call this function directly: use ltable_[get|put]_elem! + * + * Conditions: + * 'elem' was originally allocated from 'table' + * 'elem' is _not_ marked valid + * 'elem' has a reference count of 0 + */ +static void ltable_free_elem(struct link_table *table, struct lt_elem *elem) +{ + struct ltable_id next_id; + + assert(lt_elem_in_range(elem, table) && + !lt_bits_valid(elem->lt_bits) && + (lt_bits_refcnt(elem->lt_bits) == 0)); + + OSDecrementAtomic(&table->used_elem); + +#if CONFIG_LTABLE_STATS + table->avg_used = (table->avg_used + table->used_elem) / 2; + if (lt_bits_type(elem->lt_bits) == LT_RESERVED) + OSDecrementAtomic64(&table->nreservations); + table->avg_reservations = (table->avg_reservations + table->nreservations) / 2; +#endif + + elem->lt_bits = 0; + + if (table->poison) + (table->poison)(table, elem); + +again: + next_id = table->free_list; + if (next_id.idx >= table->nelem) + elem->lt_next_idx = LT_IDX_MAX; + else + elem->lt_next_idx = next_id.idx; + + /* store barrier */ + OSMemoryBarrier(); + if (OSCompareAndSwap64(next_id.id, elem->lt_id.id, + &table->free_list.id) == FALSE) + goto again; +} + + +/** + * ltable_get_elem: get a reference to a table element identified by 'id' + * + * Returns a reference to the table element associated with the given 'id', or + * NULL if the 'id' was invalid or does not exist in 'table'. The caller is + * responsible to release the reference using ltable_put_elem(). + * + * NOTE: if the table element pointed to by 'id' is marked as invalid, + * this function will return NULL. 
+ */
+struct lt_elem *ltable_get_elem(struct link_table *table, uint64_t id)
+{
+	struct lt_elem *elem;
+	uint32_t idx, bits, new_bits;
+
+	/*
+	 * Here we have a reference to the table which is guaranteed to remain
+	 * valid until we drop the reference
+	 */
+
+	idx = ((struct ltable_id *)&id)->idx;	/* table index encoded in 'id' */
+
+	if (idx >= table->nelem)
+		panic("id:0x%llx : idx:%d > %d", id, idx, table->nelem);	/* NOTE(review): message prints '>' although the test is '>=' */
+
+	elem = lt_elem_idx(table, idx);
+
+	/* verify the validity by taking a reference on the table object */
+	bits = elem->lt_bits;
+	if (!lt_bits_valid(bits))
+		return NULL;
+
+	/*
+	 * do a pre-verify on the element ID to potentially
+	 * avoid 2 compare-and-swaps
+	 */
+	if (elem->lt_id.id != id)
+		return NULL;
+
+	new_bits = bits + 1;	/* the refcount occupies the low-order bits of lt_bits */
+
+	/* check for overflow */
+	assert(lt_bits_refcnt(new_bits) > 0);
+
+	while (OSCompareAndSwap(bits, new_bits, &elem->lt_bits) == FALSE) {
+		/*
+		 * either the element became invalid,
+		 * or someone else grabbed/removed a reference.
+		 */
+		bits = elem->lt_bits;
+		if (!lt_bits_valid(bits)) {
+			/* don't return invalid elements */
+			return NULL;
+		}
+		new_bits = bits + 1;
+		assert(lt_bits_refcnt(new_bits) > 0);
+	}
+
+	/* load barrier */
+	OSMemoryBarrier();
+
+	/* check to see that our reference is to the same generation! */
+	if (elem->lt_id.id != id) {
+		/*
+		ltdbg("ID:0x%llx table generation (%d) != %d",
+		      id, elem->lt_id.generation,
+		      ((struct ltable_id *)&id)->generation);
+		 */
+		ltable_put_elem(table, elem);	/* drop the reference taken by the CAS above */
+		return NULL;
+	}
+
+	/* We now have a reference on a valid object */
+	return elem;
+}
+
+/**
+ * ltable_put_elem: release a reference to table element
+ *
+ * This function releases a reference taken on a table element via
+ * ltable_get_elem(). This function will release the element back to 'table'
+ * when the reference count goes to 0 AND the element has been marked as
+ * invalid.
+ */
+void ltable_put_elem(struct link_table *table, struct lt_elem *elem)
+{
+	uint32_t cur, dropped;
+
+	assert(lt_elem_in_range(elem, table));
+
+	/*
+	 * Atomically decrement the reference count held in the low-order
+	 * bits of lt_bits: sample, compute, and retry until the
+	 * compare-and-swap wins.
+	 */
+	do {
+		cur = elem->lt_bits;
+		dropped = cur - 1;
+		/* catch underflow */
+		assert(lt_bits_refcnt(dropped) < LT_BITS_REFCNT_MASK);
+	} while (OSCompareAndSwap(cur, dropped, &elem->lt_bits) == FALSE);
+
+	/* load barrier */
+	OSMemoryBarrier();
+
+	/* still valid, or still referenced: nothing more to do */
+	if (lt_bits_valid(dropped) || lt_bits_refcnt(dropped) != 0)
+		return;
+
+	/*
+	 * that was the last reference on an element already marked invalid:
+	 * hand it back to the free list
+	 */
+	ltable_free_elem(table, elem);
+}
+
+
+/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+ *
+ * API: lt_elem_list_...
+ *
+ * Reuse the free list linkage member, 'lt_next_idx' of a table element
+ * in a slightly more generic singly-linked list. All members of this
+ * list have been allocated from a table, but have not been made valid.
+ *
+ * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -*/
+
+/**
+ * lt_elem_list_link: link a child onto a parent
+ *
+ * Note that if 'parent' is the head of a list, this function will follow that
+ * list and attach 'child' to the end of it.
In the simplest case, this + * results in: parent->child + * however this could also result in: parent->...->child + */ +int lt_elem_list_link(struct link_table *table, struct lt_elem *parent, struct lt_elem *child) +{ + int nelem = 1; + + assert(lt_elem_in_range(parent, table)); + + /* find the end of the parent's list */ + while (parent->lt_next_idx != LT_IDX_MAX) { + assert(parent->lt_next_idx < table->nelem); + parent = lt_elem_idx(table, parent->lt_next_idx); + nelem++; + } + + if (child) { + assert(lt_elem_in_range(child, table)); + parent->lt_next_idx = child->lt_id.idx; + } + + return nelem; +} + + +/** + * lt_elem_list_first: obtain a pointer to the first element of a list. + * + * This function converts the head of a singly-linked list, 'id', into a real + * lt_elem object and returns a pointer to the object. + * + * It does _not_ take an extra reference on the object: the list implicitly + * holds that reference. + */ +struct lt_elem *lt_elem_list_first(struct link_table *table, uint64_t id) +{ + uint32_t idx; + struct lt_elem *elem = NULL; + + if (id == 0) + return NULL; + + idx = ((struct ltable_id *)&id)->idx; + + if (idx > table->nelem) + panic("Invalid element for id:0x%llx", id); + elem = lt_elem_idx(table, idx); + + /* invalid element: reserved ID was probably already reallocated */ + if (elem->lt_id.id != id) + return NULL; + + /* the returned element should _not_ be marked valid! */ + if (lt_bits_valid(elem->lt_bits) || + lt_bits_type(elem->lt_bits) != LT_RESERVED || + lt_bits_refcnt(elem->lt_bits) != 1) { + panic("Valid/unreserved element %p (0x%x) in reserved list", + elem, elem->lt_bits); + } + + return elem; +} + + +/** + * lt_elem_list_next: return the item subsequent to 'elem' in a list + * + * Note that this will return NULL if 'elem' is actually the end of the list. 
+ */ +struct lt_elem *lt_elem_list_next(struct link_table *table, struct lt_elem *head) +{ + struct lt_elem *elem; + + if (!head) + return NULL; + if (head->lt_next_idx >= table->nelem) + return NULL; + + elem = lt_elem_idx(table, head->lt_next_idx); + assert(lt_elem_in_range(elem, table)); + + return elem; +} + + +/** + * lt_elem_list_break: break a list in two around 'elem' + * + * This function will reset the next_idx field of 'elem' (making it the end of + * the list), and return the element subsequent to 'elem' in the list + * (which could be NULL) + */ +struct lt_elem *lt_elem_list_break(struct link_table *table, struct lt_elem *elem) +{ + struct lt_elem *next; + + if (!elem) + return NULL; + next = lt_elem_list_next(table, elem); + elem->lt_next_idx = LT_IDX_MAX; + + return next; +} + + +/** + * lt_elem_list_pop: pop an item off the head of a list + * + * The list head is pointed to by '*id', the element corresponding to '*id' is + * returned by this function, and the new list head is returned in the in/out + * parameter, '*id'. The caller is responsible for the reference on the + * returned object. A realloc is done to reset the type of the object, but it + * is still left invalid. + */ +struct lt_elem *lt_elem_list_pop(struct link_table *table, uint64_t *id, int type) +{ + struct lt_elem *first, *next; + + if (!id || *id == 0) + return NULL; + + /* pop an item off the reserved stack */ + + first = lt_elem_list_first(table, *id); + if (!first) { + *id = 0; + return NULL; + } + + next = lt_elem_list_next(table, first); + if (next) + *id = next->lt_id.id; + else + *id = 0; + + ltable_realloc_elem(table, first, type); + + return first; +} + +/** + * lt_elem_list_release: free an entire list of reserved elements + * + * All elements in the list whose first member is 'head' will be released back + * to 'table' as free elements. The 'type' parameter is used in development + * kernels to assert that all elements on the list are of the given type. 
+ */
+int lt_elem_list_release(struct link_table *table, struct lt_elem *head,
+			 int __assert_only type)
+{
+	struct lt_elem *elem;
+	struct ltable_id free_id;
+	int nelem = 0;	/* number of elements released; also the return value */
+
+	if (!head)
+		return 0;
+
+	for (elem = head; ; ) {	/* walk to the tail, clearing and poisoning each element on the way */
+		assert(lt_elem_in_range(elem, table));
+		assert(!lt_bits_valid(elem->lt_bits) && (lt_bits_refcnt(elem->lt_bits) == 1));
+		assert(lt_bits_type(elem->lt_bits) == type);
+
+		nelem++;
+		elem->lt_bits = 0;
+		if (table->poison)
+			(table->poison)(table, elem);
+
+		if (elem->lt_next_idx == LT_IDX_MAX)
+			break;
+		assert(elem->lt_next_idx < table->nelem);
+		elem = lt_elem_idx(table, elem->lt_next_idx);
+	}
+
+	/*
+	 * 'elem' now points to the end of our list, and 'head' points to the
+	 * beginning. We want to atomically swap the free list pointer with
+	 * the 'head' and ensure that 'elem' points to the previous free list
+	 * head.
+	 */
+
+again:
+	free_id = table->free_list;
+	if (free_id.idx >= table->nelem)
+		elem->lt_next_idx = LT_IDX_MAX;
+	else
+		elem->lt_next_idx = free_id.idx;
+
+	/* store barrier */
+	OSMemoryBarrier();
+	if (OSCompareAndSwap64(free_id.id, head->lt_id.id,
+			       &table->free_list.id) == FALSE)
+		goto again;	/* free list head changed: re-link the tail and retry */
+
+	OSAddAtomic(-nelem, &table->used_elem);	/* used_elem is adjusted only after the list has been published */
+	return nelem;
+}
diff --git a/osfmk/kern/ltable.h b/osfmk/kern/ltable.h
new file mode 100644
index 000000000..aa62edfb9
--- /dev/null
+++ b/osfmk/kern/ltable.h
@@ -0,0 +1,347 @@
+/*
+ * Copyright (c) 2016 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License.
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#ifdef XNU_KERNEL_PRIVATE + +#include +#include + +#if CONFIG_LTABLE_DEBUG +#define ltdbg(fmt,...) \ + printf("LT[%s]: " fmt "\n", __func__, ## __VA_ARGS__) +#else +#define ltdbg(fmt,...) do { } while (0) +#endif + +#ifdef LTABLE_VERBOSE_DEBUG +#define ltdbg_v(fmt,...) \ + printf("LT[v:%s]: " fmt "\n", __func__, ## __VA_ARGS__) +#else +#define ltdbg_v(fmt,...) do { } while (0) +#endif + +#define ltinfo(fmt,...) \ + printf("LT[%s]: " fmt "\n", __func__, ## __VA_ARGS__) + +#define lterr(fmt,...) 
\
+	printf("LT[%s] ERROR: " fmt "\n", __func__, ## __VA_ARGS__)
+
+
+
+/* ----------------------------------------------------------------------
+ *
+ * Lockless Link Table Interface
+ *
+ * ---------------------------------------------------------------------- */
+
+struct ltable_id {
+	union {
+		uint64_t id;	/* index and generation viewed as one 64-bit value (what the CAS operations compare) */
+		struct {
+			/*
+			 * this bitfield is OK because we don't need to
+			 * enforce a particular memory layout
+			 */
+			uint64_t idx:18, /* allows indexing up to 8MB of 32byte objects */
+				generation:46;
+		};
+	};
+};
+
+/* this _must_ match the idx bitfield definition in struct ltable_id */
+#define LT_IDX_MAX (0x3ffff)
+
+extern vm_size_t g_lt_max_tbl_size;
+
+
+struct lt_elem {
+	struct ltable_id lt_id;
+	uint32_t lt_bits;	/* refcount | type | valid, see the LT_BITS_* masks below */
+	uint32_t lt_next_idx;	/* index of the next list element; LT_IDX_MAX terminates a list */
+};
+
+/* reference count bits should _always_ be the low-order bits */
+#define LT_BITS_REFCNT_MASK (0x1FFFFFFF)
+#define LT_BITS_REFCNT_SHIFT (0)
+#define LT_BITS_REFCNT (LT_BITS_REFCNT_MASK << LT_BITS_REFCNT_SHIFT)
+
+#define LT_BITS_TYPE_MASK (0x3)
+#define LT_BITS_TYPE_SHIFT (29)
+#define LT_BITS_TYPE (LT_BITS_TYPE_MASK << LT_BITS_TYPE_SHIFT)
+
+#define LT_BITS_VALID_MASK (0x1)
+#define LT_BITS_VALID_SHIFT (31)
+#define LT_BITS_VALID (LT_BITS_VALID_MASK << LT_BITS_VALID_SHIFT) /* NOTE(review): 0x1 << 31 shifts into the sign bit of a signed int; an unsigned constant (0x1u) looks safer - confirm */
+
+#define lt_bits_refcnt(bits) \
+	(((bits) >> LT_BITS_REFCNT_SHIFT) & LT_BITS_REFCNT_MASK)
+
+#define lt_bits_type(bits) \
+	(((bits) >> LT_BITS_TYPE_SHIFT) & LT_BITS_TYPE_MASK)
+
+#define lt_bits_valid(bits) \
+	((bits) & LT_BITS_VALID)
+
+enum lt_elem_type {
+	LT_FREE = 0,
+	LT_ELEM = 1,
+	LT_LINK = 2,
+	LT_RESERVED = 3,
+};
+
+struct link_table;
+typedef void (*ltable_poison_func)(struct link_table *, struct lt_elem *);
+
+/*
+ * link_table structure
+ *
+ * A link table is a container for slabs of elements. Each slab is 'slab_sz'
+ * bytes and contains 'slab_sz/elem_sz' elements (of 'elem_sz' bytes each).
+ * These slabs allow the table to be broken up into potentially dis-contiguous
+ * VA space. On 32-bit platforms with large amounts of physical RAM, this is
+ * quite important. Keeping slabs like this slightly complicates retrieval of
+ * table elements, but not by much.
+ */
+struct link_table {
+	struct lt_elem **table; /* an array of 'slabs' of elements */
+	struct lt_elem **next_free_slab;	/* next unused slot in 'table'; NULL once the slot page is full */
+	struct ltable_id free_list __attribute__((aligned(8)));	/* id of the element at the head of the free list */
+
+	uint32_t elem_sz; /* size of a table element (bytes) */
+	uint32_t slab_shift;	/* bit position of the (power of two) slab size */
+	uint32_t slab_msk;	/* mask of the bits below slab_shift: byte offset within a slab */
+	uint32_t slab_elem;	/* elements per slab (slab_sz / elem_sz) */
+	uint32_t slab_sz; /* size of a table 'slab' object (bytes) */
+
+	uint32_t nelem;	/* elements currently provided by the allocated slabs */
+	uint32_t used_elem;	/* elements currently allocated out of the table */
+	zone_t slab_zone;	/* zone that slabs are allocated from */
+
+	ltable_poison_func poison;	/* optional hook invoked on each element as it is freed */
+
+	lck_mtx_t lock;	/* held by ltable_grow() while a slab is added */
+	uint32_t state;
+
+#if CONFIG_LTABLE_STATS
+	uint32_t nslabs;
+
+	uint64_t nallocs;
+	uint64_t nreallocs;
+	uint64_t npreposts;
+	int64_t nreservations;
+	uint64_t nreserved_releases;
+	uint64_t nspins;
+
+	uint64_t max_used;
+	uint64_t avg_used;
+	uint64_t max_reservations;
+	uint64_t avg_reservations;
+#endif
+} __attribute__((aligned(8)));
+
+
+/**
+ * ltable_bootstrap: bootstrap a link table
+ *
+ * Called once at system boot
+ */
+extern void ltable_bootstrap(void);
+
+
+/**
+ * ltable_init: initialize a link table with given parameters
+ *
+ */
+extern void ltable_init(struct link_table *table, const char *name,
+			uint32_t max_tbl_elem, uint32_t elem_sz,
+			ltable_poison_func poison);
+
+
+/**
+ * ltable_grow: grow a link table by adding another 'slab' of table elements
+ *
+ * Conditions:
+ *	table mutex is unlocked
+ *	calling thread can block
+ */
+extern void ltable_grow(struct link_table *table, uint32_t min_free);
+
+
+/**
+ * ltable_alloc_elem: allocate one or more elements from a given table
+ *
+ * The returned element(s) will be of type 'type', but will remain invalid.
+ *
+ * If the caller has disabled preemption, then this function may (rarely) spin
+ * waiting either for another thread to either release 'nelem' table elements,
+ * or grow the table.
+ * + * If the caller can block, then this function may (rarely) block while + * the table grows to meet the demand for 'nelem' element(s). + */ +extern __attribute__((noinline)) +struct lt_elem *ltable_alloc_elem(struct link_table *table, int type, + int nelem, int nattempts); + + +/** + * ltable_realloc_elem: convert a reserved element to a particular type + * + * This function is used to convert reserved elements (not yet marked valid) + * to the given 'type'. The generation of 'elem' is incremented, the element + * is disconnected from any list to which it belongs, and its type is set to + * 'type'. + */ +extern void ltable_realloc_elem(struct link_table *table, + struct lt_elem *elem, int type); + + +/** + * ltable_get_elem: get a reference to a table element identified by 'id' + * + * Returns a reference to the table element associated with the given 'id', or + * NULL if the 'id' was invalid or does not exist in 'table'. The caller is + * responsible to release the reference using ltable_put_elem(). + * + * NOTE: if the table element pointed to by 'id' is marked as invalid, + * this function will return NULL. + */ +extern struct lt_elem *ltable_get_elem(struct link_table *table, uint64_t id); + + +/** + * ltable_put_elem: release a reference to table element + * + * This function releases a reference taken on a table element via + * ltable_get_elem(). This function will release the element back to 'table' + * when the reference count goes to 0 AND the element has been marked as + * invalid. 
+ */ +extern void ltable_put_elem(struct link_table *table, struct lt_elem *elem); + + +/** + * lt_elem_invalidate: mark 'elem' as invalid + * + * NOTE: this does _not_ get or put a reference on 'elem' + */ +extern void lt_elem_invalidate(struct lt_elem *elem); + + +/** + * lt_elem_mkvalid: mark 'elem' as valid + * + * NOTE: this does _not_ get or put a reference on 'elem' + */ +extern void lt_elem_mkvalid(struct lt_elem *elem); + + +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + * + * API: lt_elem_list_* + * + * Reuse the free list linkage member, 'lt_next_idx' of a link table element + * in a slightly more generic singly-linked list. All members of this list + * have been allocated from a table, but have not been made valid. + * + * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -*/ + +/** + * lt_elem_list_link: link a child onto a parent + * + * Note that if 'parent' is the head of a list, this function will follow that + * list and attach 'child' to the end of it. In the simplest case, this + * results in: parent->child + * however this could also result in: parent->...->child + */ +extern int lt_elem_list_link(struct link_table *table, + struct lt_elem *parent, struct lt_elem *child); + + +/** + * lt_elem_list_first: obtain a pointer to the first element of a list. + * + * This function converts the head of a singly-linked list, 'id', into a real + * lt_elem object and returns a pointer to the object. + * + * It does _not_ take an extra reference on the object: the list implicitly + * holds that reference. + */ +extern struct lt_elem *lt_elem_list_first(struct link_table *table, uint64_t id); + + +/** + * lt_elem_list_next: return the item subsequent to 'elem' in a list + * + * Note that this will return NULL if 'elem' is actually the end of the list. 
+ */ +extern struct lt_elem *lt_elem_list_next(struct link_table *table, + struct lt_elem *elem); + + +/** + * lt_elem_list_break: break a list in two around 'elem' + * + * This function will reset the next_idx field of 'elem' (making it the end of + * the list), and return the element subsequent to 'elem' in the list + * (which could be NULL) + */ +extern struct lt_elem *lt_elem_list_break(struct link_table *table, + struct lt_elem *elem); + + +/** + * lt_elem_list_pop: pop an item off the head of a list + * + * The list head is pointed to by '*id', the element corresponding to '*id' is + * returned by this function, and the new list head is returned in the in/out + * parameter, '*id'. The caller is responsible for the reference on the + * returned object. A realloc is done to reset the type of the object, but it + * is still left invalid. + */ +extern struct lt_elem *lt_elem_list_pop(struct link_table *table, + uint64_t *id, int type); + + +/** + * lt_elem_list_release: free an entire list of reserved elements + * + * All elements in the list whose first member is 'head' will be released back + * to 'table' as free elements. The 'type' parameter is used in development + * kernels to assert that all elements on the list are of the given type. + */ +extern int lt_elem_list_release(struct link_table *table, + struct lt_elem *head, + int __assert_only type); + +static inline int lt_elem_list_release_id(struct link_table *table, + uint64_t id, int type) +{ + return lt_elem_list_release(table, lt_elem_list_first(table, id), type); +} + +#endif /* XNU_KERNEL_PRIVATE */ diff --git a/osfmk/kern/mach_node.c b/osfmk/kern/mach_node.c new file mode 100644 index 000000000..4a0d96dc8 --- /dev/null +++ b/osfmk/kern/mach_node.c @@ -0,0 +1,902 @@ +/* + * Copyright (c) 2015-2016 Apple Inc. All rights reserved. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* File: kern/mach_node.c + * Author: Dean Reece + * Date: 2016 + * + * Implementation of mach node support. + * This is the basis for flipc, which provides inter-node communication. 
+ */ + + +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include // mach_msg_send_from_kernel_proper() + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include // OSAddAtomic64(), OSCompareAndSwap() +#include // OSHostByteOrder() + +#pragma pack(4) + +#define MNL_NAME_TABLE_SIZE (256) // Hash is evenly distributed, so ^2 is ok +#define MNL_NAME_HASH(name) (name % MNL_NAME_TABLE_SIZE) + +/*** Visible outside mach_node layer ***/ +mach_node_id_t localnode_id = -1; // This node's FLIPC id. +#if MACH_FLIPC +mach_node_t localnode; // This node's mach_node_t struct + + +/*** Private to mach_node layer ***/ +static int mach_nodes_to_publish; +static mach_node_t mach_node_table[MACH_NODES_MAX]; +static lck_spin_t mach_node_table_lock_data; +#define MACH_NODE_TABLE_LOCK() lck_spin_lock(&mach_node_table_lock_data) +#define MACH_NODE_TABLE_UNLOCK() lck_spin_unlock(&mach_node_table_lock_data) +#define MACH_NODE_TABLE_LOCK_INIT() lck_spin_init(&mach_node_table_lock_data, \ + &ipc_lck_grp, &ipc_lck_attr) + +static volatile SInt64 mnl_name_next; +static queue_head_t mnl_name_table[MNL_NAME_TABLE_SIZE]; +static lck_spin_t mnl_name_table_lock_data; +#define MNL_NAME_TABLE_LOCK() lck_spin_lock(&mnl_name_table_lock_data) +#define MNL_NAME_TABLE_UNLOCK() lck_spin_unlock(&mnl_name_table_lock_data) +#define MNL_NAME_TABLE_LOCK_INIT() lck_spin_init(&mnl_name_table_lock_data, \ + &ipc_lck_grp, &ipc_lck_attr) + +static void mach_node_init(void); +static void mnl_name_table_init(void); +static void mach_node_table_init(void); +static void mach_node_publish(mach_node_t node); + +static mach_node_t mach_node_alloc_init(mach_node_id_t node_id); +static kern_return_t mach_node_register(mach_node_t node); + + +/* mach_node_init() is run lazily when a node link driver registers + * or the node special port is set. + * The variable localnode_id is used to determine if init has already run. 
+ */ +void +mach_node_init(void) +{ + mach_node_id_t node_id = 0; // TODO: Read from device tree? + if (OSCompareAndSwap((UInt32)(HOST_LOCAL_NODE), + (UInt32)node_id, + &localnode_id)) { + printf("mach_node_init(): localnode_id=%d of %d\n", + localnode_id, MACH_NODES_MAX); + mach_node_table_init(); + mnl_name_table_init(); + flipc_init(); + } // TODO: else block until init is finished (init completion race) +} + +void +mach_node_table_init(void) +{ + MACH_NODE_TABLE_LOCK_INIT(); + MACH_NODE_TABLE_LOCK(); + + /* Start with an enpty node table. */ + bzero(mach_node_table, sizeof(mach_node_t) * MACH_NODES_MAX); + mach_nodes_to_publish = 0; + + /* Allocate localnode's struct */ + localnode = mach_node_for_id_locked(localnode_id, 1, 1); + assert(MACH_NODE_VALID(localnode)); + + MACH_NODE_TABLE_UNLOCK(); + + /* Set up localnode's struct */ + bzero(localnode, sizeof(localnode)); + localnode->info.datamodel = LOCAL_DATA_MODEL; + localnode->info.byteorder = OSHostByteOrder(); + localnode->info.proto_vers_min = MNL_PROTOCOL_V1; + localnode->info.proto_vers_max = MNL_PROTOCOL_V1; + localnode->proto_vers = MNL_PROTOCOL_V1; + localnode->published = 0; + localnode->active = 1; + + MACH_NODE_UNLOCK(localnode); +} + +/* Sends a publication message to the local node's bootstrap server. + * This function is smart and will only send a notification if one as really + * needed - it can be called speculatively on any node at any time. + * + * Note: MUST be called with the node table lock held. + */ + +void +mach_node_publish(mach_node_t node) +{ + kern_return_t kr; + + if (!MACH_NODE_VALID(node) || (!node->active) || (node->published)) + return; // node is invalid or not suitable for publication + + ipc_port_t bs_port = localnode->bootstrap_port; + if (!IP_VALID(bs_port)) + return; // No bootstrap server to notify! 
+ + /* Node is suitable and server is present, so make registration message */ + struct mach_node_server_register_msg msg; + + msg.node_header.header.msgh_remote_port = bs_port; + msg.node_header.header.msgh_size = sizeof(msg); + msg.node_header.header.msgh_local_port = MACH_PORT_NULL; + msg.node_header.header.msgh_voucher_port = MACH_PORT_NULL; + msg.node_header.header.msgh_id = MACH_NODE_SERVER_MSG_ID; + msg.node_header.node_id = node->info.node_id; + msg.node_header.options = 0; + msg.datamodel = node->info.datamodel; + msg.byteorder = node->info.byteorder; + + if (node == localnode) { + msg.node_header.identifier = MACH_NODE_SM_REG_LOCAL; + msg.node_header.header.msgh_bits = + MACH_MSGH_BITS_SET(MACH_MSG_TYPE_COPY_SEND, 0, 0, 0); + } else { + msg.node_header.identifier = MACH_NODE_SM_REG_REMOTE; + msg.node_header.header.msgh_local_port = node->bootstrap_port; + msg.node_header.header.msgh_bits = MACH_MSGH_BITS_SET + (MACH_MSG_TYPE_COPY_SEND, MACH_MSG_TYPE_MAKE_SEND, 0, 0); + } + + kr = mach_msg_send_from_kernel_proper(&msg.node_header.header, + sizeof (msg)); + if (kr == KERN_SUCCESS) { + node->published = 1; + mach_nodes_to_publish--; + } + printf("mach_node_publish(%d)=%d\n", node->info.node_id, kr); +} + +/* Called whenever the node special port changes */ +void +mach_node_port_changed(void) +{ + ipc_port_t bs_port; + + mach_node_init(); // Lazy init of mach_node layer + + /* Cleanup previous bootstrap port if necessary */ + MACH_NODE_LOCK(localnode); + flipc_node_retire(localnode); + bs_port = localnode->bootstrap_port; + if (IP_VALID(bs_port)) { + localnode->bootstrap_port = IP_NULL; + // TODO: destroy send right to outgoing bs_port + } + + kernel_get_special_port(host_priv_self(), HOST_NODE_PORT, &bs_port); + assert(IP_VALID(bs_port)); + localnode->bootstrap_port = bs_port; + flipc_node_prepare(localnode); + MACH_NODE_UNLOCK(localnode); + + /* Cleanup the publication state of all nodes in the table */ + MACH_NODE_TABLE_LOCK(); + // TODO: Signup for 
bootstrap port death notifications + localnode->active = 1; + + mach_nodes_to_publish = 0; + + int n; + for (n=0; npublished = 0; + if (np->active == 1) + mach_nodes_to_publish++; + } + + mach_node_publish(localnode); // Always publish local node first + + for (n=0; ninfo.node_id = node_id; + } + return node; +} + + +/* This function takes a mach_node struct with a completed info field and + * registers it with the mach_node and flipc (if flipc is enabled) layers. + */ +kern_return_t +mach_node_register(mach_node_t node) +{ + assert(MACH_NODE_VALID(node)); + mach_node_id_t nid = node->info.node_id; + assert(MACH_NODE_ID_VALID(nid)); + + kern_return_t kr; + ipc_space_t proxy_space = IS_NULL; + ipc_pset_t pp_set = IPS_NULL; // pset for proxy ports + ipc_port_t bs_port = MACH_PORT_NULL; + ipc_port_t ack_port = MACH_PORT_NULL; + + printf("mach_node_register(%d)\n", nid); + + /* TODO: Support non-native byte order and data models */ + if ((node->info.byteorder != OSHostByteOrder()) || + (node->info.datamodel != LOCAL_DATA_MODEL)) { + printf("mach_node_register: unsupported byte order (%d) or width (%d)", + node->info.byteorder, node->info.datamodel); + return KERN_INVALID_ARGUMENT; + } + + /* Create the space that holds all local rights assigned to */ + kr = ipc_space_create_special(&proxy_space); + if (kr != KERN_SUCCESS) + goto out; + proxy_space->is_node_id = nid; + + /* Create the bootstrap proxy port for this remote node */ + bs_port = ipc_port_alloc_special(proxy_space); + if (bs_port == MACH_PORT_NULL) { + kr = KERN_RESOURCE_SHORTAGE; + goto out; + } + + /* Create the control (ack) port for this remote node */ + ack_port = ipc_port_alloc_special(proxy_space); + if (ack_port == MACH_PORT_NULL) { + kr = KERN_RESOURCE_SHORTAGE; + goto out; + } + + /* Create the set that holds all proxy ports for this remote node */ + pp_set = ipc_pset_alloc_special(proxy_space); + if (pp_set == IPS_NULL) { + kr = KERN_RESOURCE_SHORTAGE; + goto out; + } + + /* Add the bootstrap port 
to the proxy port set */ + uint64_t wq_link_id = waitq_link_reserve(NULL); + uint64_t wq_reserved_prepost = waitq_prepost_reserve(NULL, 10, + WAITQ_DONT_LOCK); + ips_lock(pp_set); + ip_lock(bs_port); + ipc_pset_add(pp_set, + bs_port, + &wq_link_id, + &wq_reserved_prepost); + ip_unlock(bs_port); + ips_unlock(pp_set); + + waitq_link_release(wq_link_id); + waitq_prepost_release_reserve(wq_reserved_prepost); + + /* Add the control port to the proxy port set */ + wq_link_id = waitq_link_reserve(NULL); + wq_reserved_prepost = waitq_prepost_reserve(NULL, 10, + WAITQ_DONT_LOCK); + ips_lock(pp_set); + ip_lock(ack_port); + ipc_pset_add(pp_set, + ack_port, + &wq_link_id, + &wq_reserved_prepost); + ip_unlock(ack_port); + ips_unlock(pp_set); + + waitq_link_release(wq_link_id); + waitq_prepost_release_reserve(wq_reserved_prepost); + + // Setup mach_node struct + node->published = 0; + node->active = 1; + node->proxy_space = proxy_space; + node->proxy_port_set = pp_set; + node->bootstrap_port = bs_port; + node->proto_vers = node->info.proto_vers_max; + node->control_port = ack_port; + + // Place new mach_node struct into node table + MACH_NODE_TABLE_LOCK(); + + mach_node_t old_node = mach_node_table[nid]; + if (!MACH_NODE_VALID(old_node) || (old_node->dead)) { + node->antecedent = old_node; + flipc_node_prepare(node); + mach_node_table[nid] = node; + mach_nodes_to_publish++; + mach_node_publish(node); + kr = KERN_SUCCESS; + } else { + printf("mach_node_register: id %d already active!", nid); + kr = KERN_FAILURE; + } + MACH_NODE_TABLE_UNLOCK(); + +out: + if (kr != KERN_SUCCESS) { // Dispose of whatever we allocated + if (pp_set) { + ips_lock(pp_set); + ipc_pset_destroy(pp_set); + } + + if (bs_port) + ipc_port_dealloc_special(bs_port, proxy_space); + + if (ack_port) + ipc_port_dealloc_special(ack_port, proxy_space); + + if (proxy_space) + ipc_space_terminate(proxy_space); + } + + return kr; +} + + +/* Gets or allocates a locked mach_node struct for the specified . 
+ * The current node is locked and returned if it is not dead, or if it is dead + * and 'alloc_if_dead' is false. A new node struct is allocated, locked and + * returned if the node is dead and 'alloc_if_dead' is true, or if the node + * is absent and 'alloc_if_absent' is true. MACH_NODE_NULL is returned if + * the node is absent and 'alloc_if_absent' is false. MACH_NODE_NULL is also + * returned if a new node structure was not able to be allocated. + * + * Note: This function must be called with the node table lock held! + */ +mach_node_t +mach_node_for_id_locked(mach_node_id_t node_id, + boolean_t alloc_if_dead, + boolean_t alloc_if_absent) +{ + if ((node_id < 0) || (node_id >= MACH_NODES_MAX)) + return MACH_NODE_NULL; + + mach_node_t node = mach_node_table[node_id]; + + if ( (!MACH_NODE_VALID(node) && alloc_if_absent) || + (MACH_NODE_VALID(node) && node->dead && alloc_if_dead) ) { + node = mach_node_alloc_init(node_id); + if (MACH_NODE_VALID(node)) { + node->antecedent = mach_node_table[node_id]; + mach_node_table[node_id] = node; + } + } + + if (MACH_NODE_VALID(node)) + MACH_NODE_LOCK(node); + + return node; +} + + + +/*** Mach Node Link Name and Hash Table Implementation ***/ + +/* Allocate a new unique name and return it. + * Dispose of this with mnl_name_free(). + * Returns MNL_NAME_NULL on failure. + */ +mnl_name_t +mnl_name_alloc(void) +{ + return (mnl_name_t)OSAddAtomic64(MACH_NODES_MAX, &mnl_name_next); +} + + +/* Deallocate a unique name that was allocated via mnl_name_alloc(). + */ +void +mnl_name_free(mnl_name_t name __unused) +{ + ; // Nothing to do for now since we don't recycle mnl names. +} + + +/* Called once from mach_node_init(), this sets up the hash table structures. 
+ */ +void +mnl_name_table_init(void) +{ + MNL_NAME_TABLE_LOCK_INIT(); + MNL_NAME_TABLE_LOCK(); + + // Set the first name to this node's bootstrap name + mnl_name_next = localnode_id + MACH_NODES_MAX; + + for (int i=0; ilinks); + obj->name = MNL_NAME_NULL; +} + + +/* Search the local node's hash table for the object associated with a + * mnl_name_t and return it. Returns MNL_NAME_NULL on failure. + */ +mnl_obj_t +mnl_obj_lookup(mnl_name_t name) +{ + mnl_obj_t obj = MNL_OBJ_NULL; + + if (name != MNL_NAME_NULL) { + qe_foreach_element(obj, &mnl_name_table[MNL_NAME_HASH(name)], links) { + if (obj->name == name) + break; + } + } + return obj; +} + + +/* Search the local node's hash table for the object associated with a + * mnl_name_t and remove it. The pointer to the removed object is returned so + * that the caller can appropriately dispose of the object. + * Returns MNL_NAME_NULL on failure. + */ +mnl_obj_t +mnl_obj_remove(mnl_name_t name) +{ + mnl_obj_t obj = MNL_OBJ_NULL; + + if (name != MNL_NAME_NULL) { + qe_foreach_element_safe(obj, &mnl_name_table[MNL_NAME_HASH(name)], links) { + if (obj->name == name) + remqueue(&obj->links); + } + } + return obj; +} + + +/* Insert an object into the local node's hash table. If the name of the + * provided object is MNL_NAME_NULL then a new mnl_name is allocated and + * assigned to the object. 
+ * Returns KERN_SUCCESS if obj was added to hash table + * Returns KERN_INVALID_ARGUMENT if obj is invalid + * Returns KERN_NAME_EXISTS if obj's name already exists in hash table + */ +kern_return_t +mnl_obj_insert(mnl_obj_t obj) +{ + if (!MNL_OBJ_VALID(obj)) + return KERN_INVALID_ARGUMENT; + + MNL_NAME_TABLE_LOCK(); + + if (!MNL_NAME_VALID(obj->name)) { + // obj is unnammed, so lets allocate a fresh one + obj->name = mnl_name_alloc(); + } + + enqueue(&mnl_name_table[MNL_NAME_HASH(obj->name)], &obj->links); + MNL_NAME_TABLE_UNLOCK(); + + if(obj->name >= (MACH_NODES_MAX<<1)) + panic("Unexpected MNL_NAME %lld in obj %p", obj->name, obj); + + return KERN_SUCCESS; +} + + +/*** Mach Node Link Driver Interface Implementation ***/ + +/* Allocate a mnl_msg struct plus additional payload. Link drivers are not + * required to use this to allocate messages; any wired and mapped kernel + * memory is acceptable. + * + * Arguments: + * payload Number of additional bytes to allocate for message payload + * flags Currently unused; 0 should be passed + * + * Return values: + * MNL_MSG_NULL: Allocation failed + * *: Pointer to new mnl_msg struct of requested size + */ +mnl_msg_t +mnl_msg_alloc(int payload, + uint32_t flags __unused) +{ + mnl_msg_t msg = kalloc(MNL_MSG_SIZE + payload); + + if (MNL_MSG_VALID(msg)) { + bzero(msg, MNL_MSG_SIZE); // Only zero the header + msg->size = payload; + } + + return msg; +} + + +/* Free a mnl_msg struct allocated by mnl_msg_alloc(). + * + * Arguments: + * msg Pointer to the message buffer to be freed + * flags Currently unused; 0 should be passed + */ +void +mnl_msg_free(mnl_msg_t msg, + uint32_t flags __unused) +{ + if (MNL_MSG_VALID(msg)) + kfree(msg, MNL_MSG_SIZE + msg->size); +} + + +/* The link driver calls this to setup a new (or restarted) node, and to get + * an mnl_node_info struct for use as a parameter to other mnl functions. + * If MNL_NODE_NULL is returned, the operation failed. 
Otherwise, a pointer + * to a new mnl_node struct is returned. The caller should set all fields + * in the structure, then call mnl_register() to complete node registration. + * + * Arguments: + * nid The id of the node to be instantiated + * flags Currently unused; 0 should be passed + * + * Return values: + * MNL_NODE_NULL: Operation failed + * *: Pointer to a new mnl_node struct + */ +mnl_node_info_t +mnl_instantiate(mach_node_id_t nid, + uint32_t flags __unused) +{ + mach_node_init(); // Lazy init of mach_node layer + + if ((nid==localnode_id) || !MACH_NODE_ID_VALID(nid)) + return MNL_NODE_NULL; + + return (mnl_node_info_t)mach_node_alloc_init(nid); +} + +/* The link driver calls mnl_register() to complete the node registration + * process. KERN_SUCCESS is returned if registration succeeded, otherwise + * an error is returned. + * + * Arguments: + * node Pointer to the node's mnl_node structure + * flags Currently unused; 0 should be passed + * + * Return values: + * KERN_SUCCESS: Registration succeeded + * KERN_INVALID_ARGUMENT: Field(s) in contained unacceptable values + * KERN_*: Values returned from underlying functions + */ +kern_return_t +mnl_register(mnl_node_info_t node, + uint32_t flags __unused) +{ + if (MNL_NODE_VALID(node) && (node->node_id != localnode_id)) + return mach_node_register((mach_node_t)node); + + return KERN_INVALID_ARGUMENT; +} + + +/* The link driver calls this to report that the link has been raised in one + * or both directions. If the link is two uni-directional channels, each link + * driver will independently call this function, each only raising the link + * they are responsible for. The mach_node layer will not communicate with + * the remote node until both rx and tx links are up. 
+ * + * Arguments: + * node Pointer to the node's mnl_node structure + * link Indicates which link(s) are up (see MNL_LINK_* defines) + * flags Currently unused; 0 should be passed + * + * Return values: + * KERN_SUCCESS: Link state changed successfully. + * KERN_INVALID_ARGUMENT: An argument value was not allowed. + * KERN_*: Values returned from underlying functions. + */ +kern_return_t +mnl_set_link_state(mnl_node_info_t node, + int link, + uint32_t flags __unused) +{ + kern_return_t kr; + mach_node_t mnode = (mach_node_t)node; + + /* NOTE(review): mnode->link is tested below before MACH_NODE_LOCK() is taken; confirm that this unlocked read is intended. */ + if (!MACH_NODE_VALID(mnode) || !(link & MNL_LINK_UP) || (link & mnode->link)) + return KERN_INVALID_ARGUMENT; // bad node, or bad link argument + + MACH_NODE_LOCK(mnode); + + if (mnode->dead) { + kr = KERN_NODE_DOWN; + } else { + mnode->link |= link; + kr = KERN_SUCCESS; + } + + MACH_NODE_UNLOCK(mnode); + + return kr; +} + +/* The link driver calls this to indicate a node has terminated and is no + * longer available for messaging. This may be due to a crash or an orderly + * shutdown, but either way the remote node no longer retains any state about + * the remaining nodes. References held on behalf of the terminated node + * will be cleaned up. After this is called, both the rx and tx links are + * marked as down. If the remote node restarts, the link driver can bring + * up the link using mnl_instantiate() again. + * + * Arguments: + * node Pointer to the node's mnl_node structure + * flags Currently unused; 0 should be passed + * + * Return values: + * KERN_SUCCESS: Node was terminated. + * KERN_INVALID_ARGUMENT: Node id was invalid or non-existent. + * KERN_*: Values returned from underlying functions. 
+ */ +kern_return_t +mnl_terminate(mnl_node_info_t node, + uint32_t flags __unused) +{ + kern_return_t kr = KERN_SUCCESS; + mach_node_t mnode = (mach_node_t)node; + + if (!MACH_NODE_VALID(mnode)) + return KERN_INVALID_ARGUMENT; // bad node + + MACH_NODE_LOCK(mnode); + if (mnode->dead) { + kr = KERN_NODE_DOWN; // node is already terminated + goto unlock; + } + + mnode->link = MNL_LINK_DOWN; + mnode->active = 0; + mnode->suspended = 0; + mnode->dead = 1; + + flipc_node_retire(mnode); + + // Wake any threads sleeping on the proxy port set + if (mnode->proxy_port_set != IPS_NULL) { + ips_lock(mnode->proxy_port_set); + ipc_pset_destroy(mnode->proxy_port_set); + mnode->proxy_port_set = IPS_NULL; + } + + // TODO: Inform node name server (if registered) of termination + +unlock: + MACH_NODE_UNLOCK(mnode); + return kr; +} + + +/* The link driver calls this to deliver an incoming message. Note that the + * link driver must dispose of the memory pointed to by after the + * function call returns. + * + * Arguments: + * node Pointer to the node's mnl_node structure + * msg Pointer to the message buffer + * flags Currently unused; 0 should be passed + */ +void +mnl_msg_from_node(mnl_node_info_t node __unused, + mnl_msg_t msg, + uint32_t flags __unused) +{ + assert(MNL_MSG_VALID(msg)); + assert(MACH_NODE_ID_VALID(msg->node_id)); + assert(MNL_NODE_VALID(node)); + + /* If node message forwarding is supported, the from_node_id arg may not + * match fmsg->info.node_id. The former is the node from which we received + * the message; the latter is the node that generated the message originally. + * We always use fmsg->info.node_id, which is where the ack needs to go. + */ + + switch (msg->sub) { + + case MACH_NODE_SUB_FLIPC: + flipc_msg_from_node((mach_node_t)node, msg, flags); + break; + + default: +#if DEBUG + PE_enter_debugger("mnl_msg_from_node(): Invalid subsystem"); +#endif + break; + } +} + + +/* The link driver calls this to fetch the next message to transmit. 
+ * This function will block until a message is available, or will return + * FLIPC_MSG_NULL if the link is to be terminated. After the caller has + * completed the transmission and no longer needs the msg buffer, it should + * call mnl_msg_complete(). + * + * Arguments: + * node Pointer to the node's mnl_node structure + * flags Currently unused; 0 should be passed + */ +mnl_msg_t +mnl_msg_to_node(mnl_node_info_t node __unused, + uint32_t flags __unused) +{ + assert(MNL_NODE_VALID(node)); + +#if DEBUG + thread_set_thread_name(current_thread(), "MNL_Link"); +#endif + + return flipc_msg_to_remote_node((mach_node_t)node, 0); +} + + +/* The link driver calls this to indicate that the specified msg buffer has + * been sent over the link and can be deallocated. + * + * Arguments: + * node Pointer to the node's mnl_node structure + * msg Pointer to the message buffer + * flags Currently unused; 0 should be passed + */ +void +mnl_msg_complete(mnl_node_info_t node __unused, + mnl_msg_t msg, + uint32_t flags) +{ + switch (msg->sub) { + case MACH_NODE_SUB_NODE: + mnl_msg_free(msg, flags); + break; + + case MACH_NODE_SUB_FLIPC: + flipc_msg_free(msg, flags); + break; + + default: +#if DEBUG + PE_enter_debugger("mnl_msg_complete(): Invalid subsystem"); +#endif + break; + } +} + +#else // MACH_FLIPC not configured, so provide KPI stubs + +mnl_msg_t +mnl_msg_alloc(int payload __unused, uint32_t flags __unused) +{ + return MNL_MSG_NULL; +} + +void +mnl_msg_free(mnl_msg_t msg __unused, uint32_t flags __unused) +{ + return; +} + +mnl_node_info_t +mnl_instantiate(mach_node_id_t nid __unused, uint32_t flags __unused) +{ + return MNL_NODE_NULL; +} + +kern_return_t +mnl_register(mnl_node_info_t node __unused, uint32_t flags __unused) +{ + return KERN_FAILURE; +} + +kern_return_t +mnl_set_link_state(mnl_node_info_t node __unused, + int link __unused, + uint32_t flags __unused) +{ + return KERN_FAILURE; +} + +kern_return_t +mnl_terminate(mnl_node_info_t node __unused, uint32_t flags 
__unused) +{ + return KERN_FAILURE; +} + +void +mnl_msg_from_node(mnl_node_info_t node __unused, + mnl_msg_t msg __unused, + uint32_t flags __unused) +{ + return; +} + +mnl_msg_t +mnl_msg_to_node(mnl_node_info_t node __unused, uint32_t flags __unused) +{ + return MNL_MSG_NULL; +} + +void +mnl_msg_complete(mnl_node_info_t node __unused, + mnl_msg_t msg __unused, + uint32_t flags __unused) +{ + return; +} + +#endif // MACH_FLIPC diff --git a/osfmk/kern/mach_node.h b/osfmk/kern/mach_node.h new file mode 100644 index 000000000..99bf01128 --- /dev/null +++ b/osfmk/kern/mach_node.h @@ -0,0 +1,259 @@ +/* + * Copyright (c) 2015-2016 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* + * File: kern/mach_node.h + * Author: Dean Reece + * Date: 2016 + * + * Definitions for mach internode communication (used by flipc). + * This header is intended for use inside the kernel only. + */ + +#ifndef _KERN_MACH_NODE_H_ +#define _KERN_MACH_NODE_H_ + +#if defined(MACH_KERNEL_PRIVATE) || defined(__APPLE_API_PRIVATE) + +/*** Mach Node Name Server Section + * Definitions shared by the mach_node layer in the kernel and the + * node's bootstrap server (noded). + */ + +/* This structure describes messages sent from the mach_node layer to the + * node bootstrap server. + */ +#pragma pack(4) +typedef struct mach_node_server_msg { + mach_msg_header_t header; + uint32_t identifier; // See MACH_NODE_SM_* defines + uint32_t options; // Currently unused + uint32_t node_id; // Node number +} *mach_node_server_msg_t; +#pragma pack() + +/* This structure describes node registration messages sent from the mach_node + * layer to the node bootstrap server. 
+ */ +typedef struct mach_node_server_register_msg { + struct mach_node_server_msg node_header; + uint8_t datamodel; // 1==ILP32, 2==LP64; matches dtrace + uint8_t byteorder; // Uses defines from libkern/OSByteOrder.h +} *mach_node_server_register_msg_t; +#pragma pack() + +#define MACH_NODE_SERVER_MSG_ID (0x45444f4eUL) // msgh_id "NODE" for Node msgs +#define MACH_NODE_SM_REG_LOCAL (0UL) // Register the local node +#define MACH_NODE_SM_REG_REMOTE (1UL) // Register a remote node + +#if defined(__LP64__) +#define LOCAL_DATA_MODEL (2) // Native data model is LP64 +#else +#define LOCAL_DATA_MODEL (1) // Native data model is ILP32 +#endif + +#endif + + +#if MACH_FLIPC && defined(MACH_KERNEL_PRIVATE) + +#include +#include + +#include + +__BEGIN_DECLS + +#define MACH_NODES_MAX (2) // Must be a power-of-2 +#define MACH_NODE_ID_VALID(nid) (((nid) >= 0) && ((nid) < MACH_NODES_MAX)) + +typedef struct flipc_node *flipc_node_t; // Defined in ipc/flipc.h + + +/*** Mach Node Section + * + * An instance of mach_node is allocated for each node known to mach. + * In-kernel interfaces use a pointer to this structure to refer to a node. + * External interfaces and protocols refer to node by id (mach_node_id_t). 
+ */ +typedef struct mach_node *mach_node_t; + +struct mach_node { + /* Static node details, provided by the link driver at registration */ + struct mnl_node_info info; + + lck_spin_t node_lock_data; + + /* Flags and status word */ + uint32_t link:2; // See MNL_LINK* defines + uint32_t published:1; // True if node server has send-right + uint32_t active:1; // True if node is up and ready + uint32_t suspended:1; // True if node is active but sleeping + uint32_t dead:1; // True if node is dead + uint32_t _reserved:26; // Fill out the 32b flags field + + /* port/space/set */ + ipc_space_t proxy_space; // Kernel special space for proxy rights + ipc_pset_t proxy_port_set; // All proxy ports are in this set + ipc_port_t bootstrap_port; // Port for which "noded" holds rcv right + ipc_port_t control_port; // For control & ack/nak messages + + /* Misc */ + int proto_vers; // Protocol version in use for this node + mach_node_t antecedent; // Pointer to prior incarnation of this node id +}; + +extern mach_node_t localnode; // This node's mach_node_t struct + +#define MACH_NODE_NULL ((mach_node_t) 0UL) +#define MACH_NODE_SIZE ((vm_offset_t)sizeof(struct mach_node)) +#define MACH_NODE_VALID(node) ((node) != MACH_NODE_NULL) +#define MACH_NODE_ALLOC() ((mach_node_t)kalloc(MACH_NODE_SIZE)) +#define MACH_NODE_FREE(node) kfree(node, MACH_NODE_SIZE) + +#define MACH_NODE_LOCK_INIT(np) lck_spin_init(&(np)->node_lock_data, \ + &ipc_lck_grp, &ipc_lck_attr) +#define MACH_NODE_LOCK_DESTROY(np) lck_spin_destroy(&(np)->node_lock_data, \ + &ipc_lck_grp) +#define MACH_NODE_LOCK(np) lck_spin_lock(&(np)->node_lock_data) +#define MACH_NODE_UNLOCK(np) lck_spin_unlock(&(np)->node_lock_data) + +/* Gets or allocates a locked mach_node struct for the specified 'node_id'. + * The current node is locked and returned if it is not dead, or if it is dead + * and 'alloc_if_dead' is false. A new node struct is allocated, locked and + * returned if the node is dead and 'alloc_if_dead' is true, or if the node + * is absent and 'alloc_if_absent' is true. 
MACH_NODE_NULL is returned if + * the node is absent and is false. MACH_NODE_NULL is also + * returned if a new node structure was not able to be allocated. + */ +mach_node_t +mach_node_for_id_locked(mach_node_id_t node_id, + boolean_t alloc_if_dead, + boolean_t alloc_if_absent); + + +/*** Mach Node Link Name Section + * + * A node link name (mnl_name_t) is an opaque value guaranteed unique across + * kernel instances on all nodes. This guarantee requires that node ids not + * be recycled. + * + * Names 0..(MACH_NODES_MAX-1) represent null (invalid) names + * Names MACH_NODES_MAX..(MACH_NODES_MAX*2-1) represent bootstrap names + * Names >=(MACH_NODES_MAX*2) represent normal names. + */ + +/* Allocate a new unique name and return it. + * Dispose of this with mnl_name_free(). + * Returns MNL_NAME_NULL on failure. + */ +extern mnl_name_t mnl_name_alloc(void); + +/* Deallocate a unique name that was allocated via mnl_name_alloc(). + */ +extern void mnl_name_free(mnl_name_t name); + +/* This macro is used to convert a node id to a bootstrap port name. + */ +#define MNL_NAME_BOOTSTRAP(nid) ((mnl_name_t) MACH_NODES_MAX | (nid)) +#define MNL_NAME_NULL ((mnl_name_t) 0UL) +#define MNL_NAME_VALID(obj) ((obj) >= MACH_NODES_MAX) + + +/* The mnl hash table may optionally be used by clients to associate mnl_names + * with objects. Objects to be stored in the hash table must start with an + * instance of struct mnl_obj. It is up to clients of the hash table to + * allocate and free the actual objects being stored. + */ +typedef struct mnl_obj { + queue_chain_t links; // List of mnk_name_obj (See kern/queue.h "Method 1") + mnl_name_t name; // Unique mnl_name +} *mnl_obj_t; + +#define MNL_OBJ_NULL ((mnl_obj_t) 0UL) +#define MNL_OBJ_VALID(obj) ((obj) != MNL_OBJ_NULL) + + +/* Initialize the data structures in the mnl_obj structure at the head of the + * provided object. This should be called on an object before it is passed to + * any other mnl_obj* routine. 
+ */ +void mnl_obj_init(mnl_obj_t obj); + +/* Search the local node's hash table for the object associated with a + * mnl_name_t and return it. Returns MNL_OBJ_NULL on failure. + */ +mnl_obj_t mnl_obj_lookup(mnl_name_t name); + +/* Search the local node's hash table for the object associated with a + * mnl_name_t and remove it. The pointer to the removed object is returned so + * that the caller can appropriately dispose of the object. + * Returns MNL_OBJ_NULL on failure. + */ +mnl_obj_t mnl_obj_remove(mnl_name_t name); + +/* Insert an object into the local node's hash table. If the name of the + * provided object is MNL_NAME_NULL then a new mnl_name is allocated and + * assigned to the object. Returns KERN_SUCCESS, or KERN_NAME_EXISTS if + * an object associated with that name is already in the hash table. + */ +kern_return_t mnl_obj_insert(mnl_obj_t obj); + + +/*** Mach Node Link Message Section *** + * + * Struct mnl_msg is only the header for a mnl_msg buffer; + * the actual buffer is normally larger. The rest of the buffer + * holds the body of the message to be transmitted over the link. + * + * Note: A mnl_msg received over a link will be in the byte-order of the + * node that sent it. fname and size must be corrected to the host's native + * byte order by the link driver before it is sent up to the flipc layer. + * However, the link driver should not attempt to adjust the data model or + * byte order of the payload that follows the mnl_msg header - that will + * be done by the flipc layer. 
+ */ + + +/* Values for mnl_msg.sub + */ +#define MACH_NODE_SUB_INVALID (0) // Never sent +#define MACH_NODE_SUB_NODE (1) // MNL msg is for node management +#define MACH_NODE_SUB_FLIPC (2) // MNL msg is for FLIPC subsystem +#define MACH_NODE_SUB_VMSYS (3) // MNL msg is for VM subsystem + + +/* Called whenever the node special port changes + */ +void mach_node_port_changed(void); + + +__END_DECLS + +#endif // MACH_FLIPC && MACH_KERNEL_PRIVATE +#endif // _KERN_MACH_NODE_H_ + diff --git a/osfmk/kern/mach_node_link.h b/osfmk/kern/mach_node_link.h new file mode 100644 index 000000000..b5da5334d --- /dev/null +++ b/osfmk/kern/mach_node_link.h @@ -0,0 +1,265 @@ +/* + * Copyright (c) 2015-2016 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* + * File: kern/mach_node_link.h + * Author: Dean Reece + * Date: 2016 + * + * This header provides definitions required by Mach Node Link (MNL) drivers. + * MNL drivers pass messages between nodes within a host. + * + * The constructs available at the node link level are very basic: + * Node IDs (mach_node_id_t) uniquely identify nodes within a host. + * MNL Info (mnl_node_info) describe the static characteristics of a node. + * MNL Names (mnl_name_t) uniquely identify objects across all nodes. + * MNL Messages (mnl_msg) are passed between nodes (kernels) within a host. + */ + +#ifndef _KERN_MACH_NODE_LINK_H_ +#define _KERN_MACH_NODE_LINK_H_ + +#if KERNEL_PRIVATE + +#include + +__BEGIN_DECLS + + +/*** Node Info Section ***/ + +typedef int mach_node_id_t; // Used to uniquely identify a node +extern mach_node_id_t localnode_id; // This node's unique id. + +/* An mnl_node struct describes static characteristics of a node. The link + * driver requests this structure from the mach_node layer and fills out + * the fields. All fields must be filled in (non-zero) before both rx and tx + * links are brought up. + */ +typedef struct mnl_node_info { + mach_node_id_t node_id; // The node ID of this node + uint8_t datamodel; // 1==ILP32, 2==LP64 (matches dtrace) + uint8_t byteorder; // See libkern/OSByteOrder.h + uint32_t proto_vers_min; // Oldest MNL protocol vers node can accept + uint32_t proto_vers_max; // Newest MNL protocol vers node can accept +} __attribute__ ((aligned (8))) *mnl_node_info_t; + +#define MNL_NODE_NULL ((mnl_node_info_t) 0UL) +#define MNL_NODE_VALID(n) ((n) != MNL_NODE_NULL) +#define MNL_PROTOCOL_V1 (1UL) // Current Node Link Protocol Version + +/*** Mach Node Link Name Section + * + * A node link name (mnl_name_t) is an opaque value guaranteed unique across + * kernel instances on all nodes. 
+ */ +typedef uint64_t mnl_name_t; + +/*** Mach Node Link Message Section ***/ + +/* This structure is the header for an MNL Message buffer; the actual buffer + * is normally larger, and holds this header followed by the body of the + * message to be transmitted over the link. + * + * Note: The and fields are in host-native byte order when + * passed to mnl_msg_from_node() and from mnl_msg_to_node(). + * The byte order of these fields as sent over the link is left to the link + * specification. The link drivers on both sides must translate these fields + * between the link's byte order and host-native byte order. + * + * The body of the message, however, is treated as a byte-stream and passed + * to/from the mach_node layer without any introspection or byte reordering. + */ +typedef struct mnl_msg { + uint8_t sub; // 8b subsystem code + uint8_t cmd; // 8b command code + uint8_t qos; // 8b TODO: Doesn't do anything yet + uint8_t flags; // 8b Command-specific flag byte + uint32_t node_id; // 32b id of node that originated message + mnl_name_t object; // 64b object ref (use is determined by sub & cmd) + uint32_t options; // 32b Currently unused + uint32_t size; // 32b Number of bytes that follow mnl_msg header +} __attribute__((__packed__)) *mnl_msg_t; + + +/* Allocate a mnl_msg struct plus additional payload. Link drivers are not + * required to use this to allocate messages; any wired and mapped kernel + * memory is acceptable. + * + * Arguments: + * payload Number of additional bytes to allocate for message payload + * flags Currently unused; 0 should be passed + * + * Return values: + * MNL_MSG_NULL: Allocation failed + * *: Pointer to new mnl_msg struct of requested size + */ +mnl_msg_t mnl_msg_alloc(int payload, uint32_t flags); + + +/* Free a mnl_msg struct allocated by mnl_msg_alloc(). 
+ * + * Arguments: + * msg Pointer to the message buffer to be freed + * flags Currently unused; 0 should be passed + */ +void mnl_msg_free(mnl_msg_t msg, uint32_t flags); + +#define MNL_MSG_NULL ((mnl_msg_t) 0UL) +#define MNL_MSG_VALID(msg) ((msg) != MNL_MSG_NULL) +#define MNL_MSG_SIZE ((vm_offset_t)sizeof(struct mnl_msg)) +#define MNL_MSG_PAYLOAD(msg) ((vm_offset_t)(msg) + MNL_MSG_SIZE) + + +/*** Mach Node Link Driver Interface Section ***/ + +/* The link driver calls this to set up a new (or restarted) node, and to get + * an mnl_node_info struct for use as a parameter to other mnl functions. + * If MNL_NODE_NULL is returned, the operation failed. Otherwise, a pointer + * to a new mnl_node struct is returned. The caller should set all fields + * in the structure, then call mnl_register() to complete node registration. + * + * Arguments: + * nid The id of the node to be instantiated + * flags Currently unused; 0 should be passed + * + * Return values: + * MNL_NODE_NULL: Operation failed + * *: Pointer to a new mnl_node struct + */ +mnl_node_info_t mnl_instantiate(mach_node_id_t nid, + uint32_t flags); + + +/* The link driver calls mnl_register() to complete the node registration + * process. KERN_SUCCESS is returned if registration succeeded, otherwise + * an error is returned. + * + * Arguments: + * node Pointer to the node's mnl_node structure + * flags Currently unused; 0 should be passed + * + * Return values: + * KERN_SUCCESS: Registration succeeded + * KERN_INVALID_ARGUMENT: Field(s) in node contained unacceptable values + * KERN_*: Values returned from underlying functions + */ +kern_return_t mnl_register(mnl_node_info_t node, + uint32_t flags); + + +/* The link driver calls this to report that the link has been raised in one + * or both directions. If the link is two uni-directional channels, each link + * driver will independently call this function, each only raising the link + * they are responsible for. 
The mach_node layer will not communicate with + * the remote node until both rx and tx links are up. + * + * Arguments: + * node Pointer to the node's mnl_node structure + * link Indicates which link(s) are up (see MNL_LINK_* defines) + * flags Currently unused; 0 should be passed + * + * Return values: + * KERN_SUCCESS: Link state changed successfully. + * KERN_INVALID_ARGUMENT: An argument value was not allowed. + * KERN_*: Values returned from underlying functions. + */ +kern_return_t mnl_set_link_state(mnl_node_info_t node, + int link, + uint32_t flags); + +#define MNL_LINK_DOWN (0UL) +#define MNL_LINK_RX (1UL) +#define MNL_LINK_TX (2UL) +#define MNL_LINK_UP (MNL_LINK_RX|MNL_LINK_TX) + + +/* The link driver calls this to indicate a node has terminated and is no + * longer available for messaging. This may be due to a crash or an orderly + * shutdown, but either way the remote node no longer retains any state about + * the remaining nodes. References held on behalf of the terminated node + * will be cleaned up. After this is called, both the rx and tx links are + * marked as down. If the remote node restarts, the link driver can bring + * up the link using mnl_instantiate() again. + * + * Arguments: + * node Pointer to the node's mnl_node structure + * flags Currently unused; 0 should be passed + * + * Return values: + * KERN_SUCCESS: Node was terminated. + * KERN_INVALID_ARGUMENT: Node id was invalid or non-existent. + * KERN_*: Values returned from underlying functions. + */ +kern_return_t mnl_terminate(mnl_node_info_t node, + uint32_t flags); + + +/* The link driver calls this to deliver an incoming message. Note that the + * link driver must dispose of the memory pointed to by msg after the + * function call returns. 
+ * + * Arguments: + * node Pointer to the node's mnl_node structure + * msg Pointer to the message buffer + * flags Currently unused; 0 should be passed + */ +void mnl_msg_from_node(mnl_node_info_t node, + mnl_msg_t msg, + uint32_t flags); + + +/* The link driver calls this to fetch the next message to transmit. + * This function will block until a message is available, or will return + * MNL_MSG_NULL if the link is to be terminated. After the caller has + * completed the transmission and no longer needs the msg buffer, it should + * call mnl_msg_complete(). + * + * Arguments: + * node Pointer to the node's mnl_node structure + * flags Currently unused; 0 should be passed + */ +mnl_msg_t mnl_msg_to_node(mnl_node_info_t node, + uint32_t flags); + + +/* The link driver calls this to indicate that the specified msg buffer has + * been sent over the link and can be deallocated. + * + * Arguments: + * node Pointer to the node's mnl_node structure + * msg Pointer to the message buffer + * flags Currently unused; 0 should be passed + */ +void mnl_msg_complete(mnl_node_info_t node, + mnl_msg_t msg, + uint32_t flags); + +__END_DECLS + +#endif /* KERNEL_PRIVATE */ +#endif /* _KERN_MACH_NODE_LINK_H_ */ diff --git a/osfmk/kern/machine.c b/osfmk/kern/machine.c index f6b498fb4..31ab34932 100644 --- a/osfmk/kern/machine.c +++ b/osfmk/kern/machine.c @@ -329,6 +329,8 @@ processor_offline( processor->active_thread = new_thread; processor->current_pri = IDLEPRI; processor->current_thmode = TH_MODE_NONE; + processor->starting_pri = IDLEPRI; + processor->current_sfi_class = SFI_CLASS_KERNEL; processor->deadline = UINT64_MAX; new_thread->last_processor = processor; diff --git a/osfmk/kern/misc_protos.h b/osfmk/kern/misc_protos.h index 010661a22..355316d22 100644 --- a/osfmk/kern/misc_protos.h +++ b/osfmk/kern/misc_protos.h @@ -83,6 +83,18 @@ extern int copyin( char *kernel_addr, vm_size_t nbytes); +/* Move an aligned 32 or 64-bit word from user space to kernel space + * using a 
single read instruction + * + * when reading a 32-bit word, the value is 0-extended into the kernel space + * 64-bit buffer passed as `kernel_addr` + * (think `*kernel_addr = *(uint32_t *)user_addr`) + */ +extern int copyin_word( + const user_addr_t user_addr, + uint64_t *kernel_addr, + vm_size_t nbytes); + /* Move a NUL-terminated string from a user space to kernel space */ extern int copyinstr( const user_addr_t user_addr, @@ -121,6 +133,7 @@ extern int sscanf(const char *input, const char *fmt, ...) __scanflike(2,3); extern integer_t sprintf(char *buf, const char *fmt, ...) __deprecated; extern int printf(const char *format, ...) __printflike(1,2); +extern int vprintf(const char *format, va_list ap); #if KERNEL_PRIVATE int _consume_printf_args(int, ...); @@ -148,21 +161,21 @@ extern void log(int level, char *fmt, ...); void _doprnt( - register const char *fmt, + const char *fmt, va_list *argp, void (*putc)(char), int radix); void _doprnt_log( - register const char *fmt, + const char *fmt, va_list *argp, void (*putc)(char), int radix); int __doprnt( - register const char *fmt, + const char *fmt, va_list argp, void (*putc)(int, void *), void *arg, @@ -189,6 +202,12 @@ extern void cnputc(char); extern void cnputc_unbuffered(char); +extern void console_write(char *, int); + +extern void console_suspend(void); + +extern void console_resume(void); + extern int cngetc(void); extern int cnmaygetc(void); @@ -202,17 +221,24 @@ extern int _longjmp( extern void bootstrap_create(void); +/* + * Halt other cores before invoking debugger + * Halting other cores as early as possible helps preserve + * the current system state for debugging + */ +extern void DebuggerHaltOtherCores(void); + +/* Resume other cores */ +extern void DebuggerResumeOtherCores(void); + extern void Debugger( const char * message); extern void DebuggerWithContext( unsigned int reason, void *ctx, - const char *message); - -extern void delay( - int n); - + const char *message, + uint64_t 
debugger_options_mask); #if DIPC @@ -225,6 +251,11 @@ extern kern_return_t kernel_set_special_port( int which, ipc_port_t port); +extern kern_return_t kernel_get_special_port( + host_priv_t host_priv, + int which, + ipc_port_t *portp); + user_addr_t get_useraddr(void); /* symbol lookup */ diff --git a/osfmk/kern/mk_sp.c b/osfmk/kern/mk_sp.c index f902e916c..af9f05cb8 100644 --- a/osfmk/kern/mk_sp.c +++ b/osfmk/kern/mk_sp.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include @@ -87,8 +88,6 @@ thread_set_policy( if (invalid_policy(policy)) return(KERN_INVALID_ARGUMENT); - thread_mtx_lock(thread); - switch (policy) { case POLICY_RR: @@ -160,8 +159,6 @@ thread_set_policy( } if (result != KERN_SUCCESS) { - thread_mtx_unlock(thread); - return (result); } @@ -170,8 +167,6 @@ thread_set_policy( result = thread_set_mode_and_absolute_pri(thread, policy, bas); } - thread_mtx_unlock(thread); - return (result); } diff --git a/osfmk/kern/policy_internal.h b/osfmk/kern/policy_internal.h new file mode 100644 index 000000000..59a7b9a79 --- /dev/null +++ b/osfmk/kern/policy_internal.h @@ -0,0 +1,397 @@ +/* + * Copyright (c) 2015-2016 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _KERN_POLICY_INTERNAL_H_ +#define _KERN_POLICY_INTERNAL_H_ + +/* + * Interfaces for functionality implemented in task_ or thread_policy subsystem + */ + +#ifdef XNU_KERNEL_PRIVATE + +#include +#include +#include +#include +#include +#include + +/* + ****************************** + * XNU-internal functionality + ****************************** + */ + +/* + * Get effective policy + * Only for use by relevant subsystem, should never be passed into a setter! + */ +extern int proc_get_effective_task_policy(task_t task, int flavor); +extern int proc_get_effective_thread_policy(thread_t thread, int flavor); + +/* Set task 'nice' value */ +extern kern_return_t task_importance(task_t task, integer_t importance); + +/* value */ +#define TASK_POLICY_DISABLE 0x0 +#define TASK_POLICY_ENABLE 0x1 + +/* category */ +#define TASK_POLICY_INTERNAL 0x0 +#define TASK_POLICY_EXTERNAL 0x1 +#define TASK_POLICY_ATTRIBUTE 0x2 + +/* for tracing */ +#define TASK_POLICY_TASK 0x4 +#define TASK_POLICY_THREAD 0x8 + +/* flavors (also DBG_IMPORTANCE subclasses 0x20 - 0x3F) */ + +/* internal or external, thread or task */ +#define TASK_POLICY_DARWIN_BG 0x21 +#define TASK_POLICY_IOPOL 0x22 +#define TASK_POLICY_IO 0x23 +#define TASK_POLICY_PASSIVE_IO 0x24 + +/* internal, task only */ +#define TASK_POLICY_DARWIN_BG_IOPOL 0x27 + +/* task-only attributes */ +#define TASK_POLICY_TAL 0x28 +#define TASK_POLICY_BOOST 0x29 +#define TASK_POLICY_ROLE 0x2A +/* unused 0x2B */ +#define 
TASK_POLICY_TERMINATED 0x2C +#define TASK_POLICY_NEW_SOCKETS_BG 0x2D +/* unused 0x2E */ +#define TASK_POLICY_LATENCY_QOS 0x2F +#define TASK_POLICY_THROUGH_QOS 0x30 +#define TASK_POLICY_WATCHERS_BG 0x31 + +#define TASK_POLICY_SFI_MANAGED 0x34 +#define TASK_POLICY_ALL_SOCKETS_BG 0x37 + +#define TASK_POLICY_BASE_LATENCY_AND_THROUGHPUT_QOS 0x39 /* latency as value1, throughput as value2 */ +#define TASK_POLICY_OVERRIDE_LATENCY_AND_THROUGHPUT_QOS 0x3A /* latency as value1, throughput as value2 */ + +/* thread-only attributes */ +#define TASK_POLICY_PIDBIND_BG 0x32 +/* unused 0x33 */ +#define TASK_POLICY_QOS 0x35 +#define TASK_POLICY_QOS_OVERRIDE 0x36 +#define TASK_POLICY_QOS_AND_RELPRIO 0x38 /* QoS as value1, relative priority as value2 */ +#define TASK_POLICY_QOS_PROMOTE 0x3C +#define TASK_POLICY_QOS_IPC_OVERRIDE 0x3D + +#define TASK_POLICY_MAX 0x3F + +/* The main entrance to task policy is this function */ +extern void proc_set_task_policy(task_t task, int category, int flavor, int value); +extern int proc_get_task_policy(task_t task, int category, int flavor); + +extern void proc_set_thread_policy(thread_t thread, int category, int flavor, int value); +extern int proc_get_thread_policy(thread_t thread, int category, int flavor); + +/* For use when you don't already hold a reference on the target thread */ +extern void proc_set_thread_policy_with_tid(task_t task, uint64_t tid, int category, int flavor, int value); + + +/* Functions used by kern_resource.c */ +extern boolean_t thread_has_qos_policy(thread_t thread); +extern kern_return_t thread_remove_qos_policy(thread_t thread); + +extern int proc_darwin_role_to_task_role(int darwin_role, int* task_role); +extern int proc_task_role_to_darwin_role(int task_role); + +/* Functions used by kern_exec.c */ +extern void task_set_main_thread_qos(task_t task, thread_t main_thread); +extern void proc_set_task_spawnpolicy(task_t task, int apptype, int qos_clamp, int role, + ipc_port_t * portwatch_ports, int portwatch_count); + 
+/* IO Throttle tiers */ +#define THROTTLE_LEVEL_NONE -1 +#define THROTTLE_LEVEL_TIER0 0 /* IOPOL_NORMAL, IOPOL_DEFAULT, IOPOL_PASSIVE */ + +#define THROTTLE_LEVEL_THROTTLED 1 +#define THROTTLE_LEVEL_TIER1 1 /* IOPOL_STANDARD */ +#define THROTTLE_LEVEL_TIER2 2 /* IOPOL_UTILITY */ +#define THROTTLE_LEVEL_TIER3 3 /* IOPOL_THROTTLE */ + +#define THROTTLE_LEVEL_START 0 +#define THROTTLE_LEVEL_END 3 + +#define THROTTLE_LEVEL_COMPRESSOR_TIER0 THROTTLE_LEVEL_TIER0 +#define THROTTLE_LEVEL_COMPRESSOR_TIER1 THROTTLE_LEVEL_TIER1 +#define THROTTLE_LEVEL_COMPRESSOR_TIER2 THROTTLE_LEVEL_TIER2 + +#define THROTTLE_LEVEL_PAGEOUT_THROTTLED THROTTLE_LEVEL_TIER2 +#define THROTTLE_LEVEL_PAGEOUT_UNTHROTTLED THROTTLE_LEVEL_TIER1 + +#if CONFIG_IOSCHED +#define IOSCHED_METADATA_TIER THROTTLE_LEVEL_TIER1 +#endif /* CONFIG_IOSCHED */ + +extern int proc_get_darwinbgstate(task_t task, uint32_t *flagsp); +extern int task_get_apptype(task_t); + +#ifdef MACH_BSD +extern void proc_apply_task_networkbg(void * bsd_info, thread_t thread); +#endif /* MACH_BSD */ + +/* Functions used by pthread_shims.c */ +extern boolean_t proc_thread_qos_add_override(task_t task, thread_t thread, uint64_t tid, + int override_qos, boolean_t first_override_for_resource, + user_addr_t resource, int resource_type); +extern int proc_thread_qos_add_override_check_owner(thread_t thread, int override_qos, + boolean_t first_override_for_resource, user_addr_t resource, int resource_type, + user_addr_t user_lock_addr, mach_port_name_t user_lock_owner); +extern boolean_t proc_thread_qos_remove_override(task_t task, thread_t thread, uint64_t tid, + user_addr_t resource, int resource_type); +extern boolean_t proc_thread_qos_reset_override(task_t task, thread_t thread, uint64_t tid, + user_addr_t resource, int resource_type); +extern int proc_thread_qos_squash_override(thread_t thread, user_addr_t resource, int resource_type); + +extern kern_return_t +thread_set_workq_qos(thread_t thread, int qos_tier, int relprio); +extern 
kern_return_t +thread_set_workq_pri(thread_t thread, integer_t priority, integer_t policy); + +extern int +task_get_default_manager_qos(task_t task); + +extern void proc_thread_qos_deallocate(thread_t thread); + +extern int task_clear_cpuusage(task_t task, int cpumon_entitled); + + +/* Importance inheritance functions not under IMPORTANCE_INHERITANCE */ +extern void task_importance_mark_donor(task_t task, boolean_t donating); +extern void task_importance_reset(task_t task); + +#if IMPORTANCE_INHERITANCE +extern boolean_t task_is_importance_donor(task_t task); +extern boolean_t task_is_importance_receiver_type(task_t task); + +extern int task_importance_hold_file_lock_assertion(task_t target_task, uint32_t count); +extern int task_importance_drop_file_lock_assertion(task_t target_task, uint32_t count); + +extern int task_importance_hold_legacy_external_assertion(task_t target_task, uint32_t count); +extern int task_importance_drop_legacy_external_assertion(task_t target_task, uint32_t count); +#endif /* IMPORTANCE_INHERITANCE */ + +/* Functions used by process_policy.c */ +extern boolean_t proc_task_is_tal(task_t task); + +/* Arguments to proc_set_task_ruse_cpu */ +#define TASK_POLICY_RESOURCE_ATTRIBUTE_NONE 0x00 +#define TASK_POLICY_RESOURCE_ATTRIBUTE_THROTTLE 0x01 +#define TASK_POLICY_RESOURCE_ATTRIBUTE_SUSPEND 0x02 +#define TASK_POLICY_RESOURCE_ATTRIBUTE_TERMINATE 0x03 +#define TASK_POLICY_RESOURCE_ATTRIBUTE_NOTIFY_KQ 0x04 +#define TASK_POLICY_RESOURCE_ATTRIBUTE_NOTIFY_EXC 0x05 +#define TASK_POLICY_RESOURCE_ATTRIBUTE_DEFAULT TASK_POLICY_RESOURCE_ATTRIBUTE_NONE + +extern int proc_get_task_ruse_cpu(task_t task, uint32_t *policyp, uint8_t *percentagep, + uint64_t *intervalp, uint64_t *deadlinep); +extern int proc_set_task_ruse_cpu(task_t task, uint32_t policy, uint8_t percentage, + uint64_t interval, uint64_t deadline, int cpumon_entitled); +extern int task_suspend_cpumon(task_t task); +extern int task_resume_cpumon(task_t task); +extern int 
proc_clear_task_ruse_cpu(task_t task, int cpumon_entitled); + +extern int proc_apply_resource_actions(void * p, int type, int action); +extern int proc_restore_resource_actions(void * p, int type, int action); + +/* VM/Jetsam importance callouts */ +extern int task_low_mem_privileged_listener(task_t task, boolean_t new_value, boolean_t *old_value); +extern boolean_t task_has_been_notified(task_t task, int pressurelevel); +extern boolean_t task_used_for_purging(task_t task, int pressurelevel); +extern void task_mark_has_been_notified(task_t task, int pressurelevel); +extern void task_mark_used_for_purging(task_t task, int pressurelevel); +extern void task_clear_has_been_notified(task_t task, int pressurelevel); +extern void task_clear_used_for_purging(task_t task); +extern int task_importance_estimate(task_t task); + +/* + * Allocate/assign a single work interval ID for a thread, + * and support deallocating it. + */ +extern kern_return_t thread_policy_create_work_interval(thread_t thread, uint64_t *work_interval_id); +extern kern_return_t thread_policy_destroy_work_interval(thread_t thread, uint64_t work_interval_id); + +extern kern_return_t thread_policy_set_internal(thread_t thread, thread_policy_flavor_t flavor, + thread_policy_t policy_info, mach_msg_type_number_t count); + +struct promote_token { + uint16_t pt_basepri; + uint16_t pt_qos; +}; + +#define PROMOTE_TOKEN_INIT ((struct promote_token){.pt_basepri = 0, .pt_qos = 0}) + +extern void thread_user_promotion_add(thread_t thread, thread_t promoter, struct promote_token* promote_token); +extern void thread_user_promotion_update(thread_t thread, thread_t promoter, struct promote_token* promote_token); +extern void thread_user_promotion_drop(thread_t thread); + +/* for IPC override management */ +extern void thread_add_ipc_override(thread_t thread, uint32_t qos_override); +extern void thread_update_ipc_override(thread_t thread, uint32_t qos_override); +extern void thread_drop_ipc_override(thread_t thread); 
+extern uint32_t thread_get_ipc_override(thread_t thread); + +/* + ****************************** + * Mach-internal functionality + ****************************** + */ + +#ifdef MACH_KERNEL_PRIVATE + +/* + * this exports the internal policy update calls + * for IPC importance hooks into task policy + */ + +typedef struct task_pend_token { + uint32_t tpt_update_sockets :1, + tpt_update_timers :1, + tpt_update_watchers :1, + tpt_update_live_donor :1, + tpt_update_coal_sfi :1, + tpt_update_throttle :1, + tpt_update_thread_sfi :1, + tpt_force_recompute_pri :1; +} *task_pend_token_t; + +extern void task_policy_update_complete_unlocked(task_t task, task_pend_token_t pend_token); +extern void task_update_boost_locked(task_t task, boolean_t boost_active, task_pend_token_t pend_token); + +extern void thread_policy_update_locked(thread_t thread, task_pend_token_t pend_token); +extern void thread_policy_update_complete_unlocked(thread_t task, task_pend_token_t pend_token); + +typedef struct { + int qos_pri[THREAD_QOS_LAST]; + int qos_iotier[THREAD_QOS_LAST]; + uint32_t qos_through_qos[THREAD_QOS_LAST]; + uint32_t qos_latency_qos[THREAD_QOS_LAST]; +} qos_policy_params_t; + +extern const qos_policy_params_t thread_qos_policy_params; + +/* for task policy tracepoints */ +/* Convenience functions for munging a policy bitfield into a tracepoint */ +uintptr_t threquested_0(thread_t thread); +uintptr_t threquested_1(thread_t thread); +uintptr_t theffective_0(thread_t thread); +uintptr_t theffective_1(thread_t thread); +extern uint32_t tpending(task_pend_token_t pend_token); + +extern void proc_iopol_to_tier(int iopolicy, int *tier, int *passive); +extern int proc_tier_to_iopol(int tier, int passive); + +extern void set_thread_iotier_override(thread_t, int policy); + +extern integer_t task_grab_latency_qos(task_t task); +extern void task_policy_create(task_t task, task_t parent_task); +extern void thread_policy_create(thread_t thread); + +extern boolean_t task_is_daemon(task_t task); 
+extern boolean_t task_is_app(task_t task); + + +#if IMPORTANCE_INHERITANCE +extern boolean_t task_is_marked_importance_donor(task_t task); +extern boolean_t task_is_marked_importance_receiver(task_t task); + +extern boolean_t task_is_marked_importance_denap_receiver(task_t task); +#endif /* IMPORTANCE_INHERITANCE */ + +/* flags for rusage_cpu_flags */ +#define TASK_RUSECPU_FLAGS_PROC_LIMIT 0x01 +#define TASK_RUSECPU_FLAGS_PERTHR_LIMIT 0x02 +#define TASK_RUSECPU_FLAGS_DEADLINE 0x04 +#define TASK_RUSECPU_FLAGS_FATAL_CPUMON 0x08 /* CPU usage monitor violations are fatal */ +#define TASK_RUSECPU_FLAGS_FATAL_WAKEUPSMON 0x10 /* wakeups monitor violations are fatal */ +#define TASK_RUSECPU_FLAGS_PHYS_FOOTPRINT_EXCEPTION 0x20 /* exceeding physical footprint generates EXC_RESOURCE */ + +extern void proc_init_cpumon_params(void); +extern void thread_policy_init(void); + +int task_compute_main_thread_qos(task_t task); + +/* thread policy internals */ +extern void thread_policy_reset(thread_t thread); +extern kern_return_t thread_set_mode_and_absolute_pri(thread_t thread, integer_t policy, integer_t priority); + +extern void thread_policy_update_tasklocked(thread_t thread, integer_t priority, integer_t max_priority, task_pend_token_t pend_token); + +#include "mach/resource_notify.h" /* from MIG */ + +/*! @function send_resource_violation + @abstract send usage monitor violation notification + + @param violator the task (process) violating its CPU budget + @param ledger_info the entry tracking the resource limit + @param flags see constants for type in sys/reason.h + + @result KERN_SUCCESS if the message was sent + + @discussion + send_resource_violation() calls the corresponding MIG routine + over the host special RESOURCE_NOTIFY port. +*/ +kern_return_t send_resource_violation(typeof(send_cpu_usage_violation), + task_t violator, + struct ledger_entry_info *ledger_info, + resource_notify_flags_t flags); + +/*! 
@function trace_resource_violation + @abstract trace violations on K32/64 + + @param code the (K64) DBG_MACH_RESOURCE trace code + @param ledger_info the entry tracking the resource limit + + @discussion + Trace observed usage and corresponding limit on K32 or K64. On + K32, a pair of trace points are used. The low nibble of the K32 + trace points must start at double the low nibble of the provided + K64 trace point. For example: + #define LOGWRITES_VIOLATED 0x022 + ... + #define LOGWRITES_VIOLATED_K32A 0x024 + #define LOGWRITES_VIOLATED_K32B 0x025 +*/ +void trace_resource_violation(uint16_t code, + struct ledger_entry_info *ledger_info); + +#endif /* MACH_KERNEL_PRIVATE */ + +#endif /* XNU_KERNEL_PRIVATE */ + +#endif /* _KERN_POLICY_INTERNAL_H_ */ diff --git a/osfmk/kern/printf.c b/osfmk/kern/printf.c index 82ad32bb9..c93df6826 100644 --- a/osfmk/kern/printf.c +++ b/osfmk/kern/printf.c @@ -168,6 +168,7 @@ #include #endif #include +#include #define isdigit(d) ((d) >= '0' && (d) <= '9') #define Ctod(c) ((c) - '0') @@ -339,9 +340,9 @@ __doprnt( case 'b': case 'B': { - register char *p; + char *p; boolean_t any; - register int i; + int i; if (long_long) { u = va_arg(argp, unsigned long long); @@ -363,7 +364,7 @@ __doprnt( /* * Bit field */ - register int j; + int j; if (any) (*putc)(',', arg); else { @@ -414,8 +415,8 @@ __doprnt( case 's': { - register const char *p; - register const char *p2; + const char *p; + const char *p2; if (prec == -1) prec = 0x7fffffff; /* MAXINT */ @@ -567,7 +568,7 @@ __doprnt( print_num: { char buf[MAXBUF]; /* build number here */ - register char * p = &buf[MAXBUF-1]; + char * p = &buf[MAXBUF-1]; static char digits[] = "0123456789abcdef0123456789ABCDEF"; const char *prefix = NULL; @@ -669,7 +670,7 @@ dummy_putc(int ch, void *arg) void _doprnt( - register const char *fmt, + const char *fmt, va_list *argp, /* character output routine */ void (*putc)(char), @@ -680,7 +681,7 @@ _doprnt( void _doprnt_log( - register const char *fmt, + const char 
*fmt, va_list *argp, /* character output routine */ void (*putc)(char), @@ -693,14 +694,22 @@ _doprnt_log( boolean_t new_printf_cpu_number = FALSE; #endif /* MP_PRINTF */ - decl_simple_lock_data(,printf_lock) decl_simple_lock_data(,bsd_log_spinlock) + +/* + * Defined here to allow lock group to be statically allocated. + */ +static lck_grp_t oslog_stream_lock_grp; +decl_lck_spin_data(,oslog_stream_lock) +void oslog_lock_init(void); + extern void bsd_log_init(void); void bsd_log_lock(void); void bsd_log_unlock(void); void + printf_init(void) { /* @@ -723,14 +732,21 @@ bsd_log_unlock(void) simple_unlock(&bsd_log_spinlock); } +void +oslog_lock_init(void) +{ + lck_grp_init(&oslog_stream_lock_grp, "oslog stream", LCK_GRP_ATTR_NULL); + lck_spin_init(&oslog_stream_lock, &oslog_stream_lock_grp, LCK_ATTR_NULL); +} + /* derived from boot_gets */ void safe_gets( char *str, int maxlen) { - register char *lp; - register int c; + char *lp; + int c; char *strmax = str + maxlen - 1; /* allow space for trailing 0 */ lp = str; @@ -795,21 +811,47 @@ cons_putc_locked( cnputc(c); } -int -printf(const char *fmt, ...) +static int +vprintf_internal(const char *fmt, va_list ap_in, void *caller) { - va_list listp; - if (fmt) { + va_list ap; + va_copy(ap, ap_in); + disable_preemption(); - va_start(listp, fmt); - _doprnt_log(fmt, &listp, conslog_putc, 16); - va_end(listp); + _doprnt_log(fmt, &ap, cons_putc_locked, 16); enable_preemption(); + + va_end(ap); + + if (debug_mode == 0) { + os_log_with_args(OS_LOG_DEFAULT, OS_LOG_TYPE_DEFAULT, fmt, ap_in, caller); + } } return 0; } +__attribute__((noinline,not_tail_called)) +int +printf(const char *fmt, ...) 
+{ + int ret; + + va_list ap; + va_start(ap, fmt); + ret = vprintf_internal(fmt, ap, __builtin_return_address(0)); + va_end(ap); + + return ret; +} + +__attribute__((noinline,not_tail_called)) +int +vprintf(const char *fmt, va_list ap) +{ + return vprintf_internal(fmt, ap, __builtin_return_address(0)); +} + void consdebug_putc(char c) { diff --git a/osfmk/kern/priority.c b/osfmk/kern/priority.c index f50696079..f4f5b1cc8 100644 --- a/osfmk/kern/priority.c +++ b/osfmk/kern/priority.c @@ -81,6 +81,8 @@ #include /* for commpage_update_mach_approximate_time */ #endif +static void sched_update_thread_bucket(thread_t thread); + /* * thread_quantum_expire: * @@ -114,6 +116,9 @@ thread_quantum_expire( * Because this balance adjustment could potentially attempt to wake this very * thread, we must credit the ledger before taking the thread lock. The ledger * pointers are only manipulated by the thread itself at the ast boundary. + * + * TODO: This fails to account for the time between when the timer was armed and when it fired. + * It should be based on the system_timer and running a thread_timer_event operation here. 
*/ ledger_credit(thread->t_ledger, task_ledgers.cpu_time, thread->quantum_remaining); ledger_credit(thread->t_threadledger, thread_ledgers.cpu_time, thread->quantum_remaining); @@ -235,15 +240,16 @@ thread_quantum_expire( void sched_set_thread_base_priority(thread_t thread, int priority) { - int old_priority = thread->base_pri; + assert(priority >= MINPRI); + + if (thread->sched_mode == TH_MODE_REALTIME) + assert(priority <= BASEPRI_RTQUEUES); + else + assert(priority < BASEPRI_RTQUEUES); + thread->base_pri = priority; - /* A thread is 'throttled' when its base priority is at or below MAXPRI_THROTTLE */ - if ((priority > MAXPRI_THROTTLE) && (old_priority <= MAXPRI_THROTTLE)) { - sched_set_thread_throttled(thread, FALSE); - } else if ((priority <= MAXPRI_THROTTLE) && (old_priority > MAXPRI_THROTTLE)) { - sched_set_thread_throttled(thread, TRUE); - } + sched_update_thread_bucket(thread); thread_recompute_sched_pri(thread, FALSE); } @@ -413,20 +419,16 @@ can_update_priority( */ void update_priority( - register thread_t thread) + thread_t thread) { - register unsigned ticks; - register uint32_t delta; + uint32_t ticks, delta; ticks = sched_tick - thread->sched_stamp; assert(ticks != 0); + thread->sched_stamp += ticks; - if (sched_use_combined_fgbg_decay) - thread->pri_shift = sched_combined_fgbg_pri_shift; - else if (thread->sched_flags & TH_SFLAG_THROTTLED) - thread->pri_shift = sched_background_pri_shift; - else - thread->pri_shift = sched_pri_shift; + + thread->pri_shift = sched_pri_shifts[thread->th_sched_bucket]; /* If requested, accelerate aging of sched_usage */ if (sched_decay_usage_age_factor > 1) @@ -437,8 +439,6 @@ update_priority( */ thread_timer_delta(thread, delta); if (ticks < SCHED_DECAY_TICKS) { - register struct shift_data *shiftp; - /* * Accumulate timesharing usage only * during contention for processor @@ -450,25 +450,20 @@ update_priority( thread->cpu_usage += delta + thread->cpu_delta; thread->cpu_delta = 0; - shiftp = &sched_decay_shifts[ticks]; 
+ struct shift_data *shiftp = &sched_decay_shifts[ticks]; + if (shiftp->shift2 > 0) { - thread->cpu_usage = - (thread->cpu_usage >> shiftp->shift1) + - (thread->cpu_usage >> shiftp->shift2); - thread->sched_usage = - (thread->sched_usage >> shiftp->shift1) + - (thread->sched_usage >> shiftp->shift2); - } - else { - thread->cpu_usage = - (thread->cpu_usage >> shiftp->shift1) - - (thread->cpu_usage >> -(shiftp->shift2)); - thread->sched_usage = - (thread->sched_usage >> shiftp->shift1) - - (thread->sched_usage >> -(shiftp->shift2)); + thread->cpu_usage = (thread->cpu_usage >> shiftp->shift1) + + (thread->cpu_usage >> shiftp->shift2); + thread->sched_usage = (thread->sched_usage >> shiftp->shift1) + + (thread->sched_usage >> shiftp->shift2); + } else { + thread->cpu_usage = (thread->cpu_usage >> shiftp->shift1) - + (thread->cpu_usage >> -(shiftp->shift2)); + thread->sched_usage = (thread->sched_usage >> shiftp->shift1) - + (thread->sched_usage >> -(shiftp->shift2)); } - } - else { + } else { thread->cpu_usage = thread->cpu_delta = 0; thread->sched_usage = 0; } @@ -516,86 +511,96 @@ update_priority( #endif /* CONFIG_SCHED_TIMESHARE_CORE */ -#if MACH_ASSERT -/* sched_mode == TH_MODE_TIMESHARE controls whether a thread has a timeshare count when it has a run count */ -void sched_share_incr(thread_t thread) { - assert((thread->state & (TH_RUN|TH_IDLE)) == TH_RUN); - assert(thread->sched_mode == TH_MODE_TIMESHARE); - assert(thread->SHARE_COUNT == 0); - thread->SHARE_COUNT++; - (void)hw_atomic_add(&sched_share_count, 1); +/* + * TH_BUCKET_RUN is a count of *all* runnable non-idle threads. + * Each other bucket is a count of the runnable non-idle threads + * with that property. 
+ */ +volatile uint32_t sched_run_buckets[TH_BUCKET_MAX]; + +static void +sched_incr_bucket(sched_bucket_t bucket) +{ + assert(bucket >= TH_BUCKET_FIXPRI && + bucket <= TH_BUCKET_SHARE_BG); + + hw_atomic_add(&sched_run_buckets[bucket], 1); } -void sched_share_decr(thread_t thread) { - assert((thread->state & (TH_RUN|TH_IDLE)) != TH_RUN || thread->sched_mode != TH_MODE_TIMESHARE); - assert(thread->SHARE_COUNT == 1); - (void)hw_atomic_sub(&sched_share_count, 1); - thread->SHARE_COUNT--; +static void +sched_decr_bucket(sched_bucket_t bucket) +{ + assert(bucket >= TH_BUCKET_FIXPRI && + bucket <= TH_BUCKET_SHARE_BG); + + assert(sched_run_buckets[bucket] > 0); + + hw_atomic_sub(&sched_run_buckets[bucket], 1); } -/* TH_SFLAG_THROTTLED controls whether a thread has a background count when it has a run count and a share count */ +/* TH_RUN & !TH_IDLE controls whether a thread has a run count */ -void sched_background_incr(thread_t thread) { +uint32_t +sched_run_incr(thread_t thread) +{ assert((thread->state & (TH_RUN|TH_IDLE)) == TH_RUN); - assert(thread->sched_mode == TH_MODE_TIMESHARE); - assert((thread->sched_flags & TH_SFLAG_THROTTLED) == TH_SFLAG_THROTTLED); - assert(thread->BG_COUNT == 0); - thread->BG_COUNT++; - int val = hw_atomic_add(&sched_background_count, 1); - assert(val >= 0); + uint32_t new_count = hw_atomic_add(&sched_run_buckets[TH_BUCKET_RUN], 1); - /* Always do the background change while holding a share count */ - assert(thread->SHARE_COUNT == 1); -} + sched_incr_bucket(thread->th_sched_bucket); -void sched_background_decr(thread_t thread) { - if ((thread->state & (TH_RUN|TH_IDLE)) == TH_RUN && thread->sched_mode == TH_MODE_TIMESHARE) - assert((thread->sched_flags & TH_SFLAG_THROTTLED) != TH_SFLAG_THROTTLED); - assert(thread->BG_COUNT == 1); - int val = hw_atomic_sub(&sched_background_count, 1); - thread->BG_COUNT--; - assert(val >= 0); - assert(thread->BG_COUNT == 0); - - /* Always do the background change while holding a share count */ - 
assert(thread->SHARE_COUNT == 1); + return new_count; } +uint32_t +sched_run_decr(thread_t thread) +{ + assert((thread->state & (TH_RUN|TH_IDLE)) != TH_RUN); -void -assert_thread_sched_count(thread_t thread) { - /* Only 0 or 1 are acceptable values */ - assert(thread->BG_COUNT == 0 || thread->BG_COUNT == 1); - assert(thread->SHARE_COUNT == 0 || thread->SHARE_COUNT == 1); - - /* BG is only allowed when you already have a share count */ - if (thread->BG_COUNT == 1) - assert(thread->SHARE_COUNT == 1); - if (thread->SHARE_COUNT == 0) - assert(thread->BG_COUNT == 0); - - if ((thread->state & (TH_RUN|TH_IDLE)) != TH_RUN || - (thread->sched_mode != TH_MODE_TIMESHARE)) - assert(thread->SHARE_COUNT == 0); - - if ((thread->state & (TH_RUN|TH_IDLE)) == TH_RUN && - (thread->sched_mode == TH_MODE_TIMESHARE)) - assert(thread->SHARE_COUNT == 1); - - if ((thread->state & (TH_RUN|TH_IDLE)) != TH_RUN || - (thread->sched_mode != TH_MODE_TIMESHARE) || - !(thread->sched_flags & TH_SFLAG_THROTTLED)) - assert(thread->BG_COUNT == 0); - - if ((thread->state & (TH_RUN|TH_IDLE)) == TH_RUN && - (thread->sched_mode == TH_MODE_TIMESHARE) && - (thread->sched_flags & TH_SFLAG_THROTTLED)) - assert(thread->BG_COUNT == 1); + sched_decr_bucket(thread->th_sched_bucket); + + uint32_t new_count = hw_atomic_sub(&sched_run_buckets[TH_BUCKET_RUN], 1); + + return new_count; } -#endif /* MACH_ASSERT */ +static void +sched_update_thread_bucket(thread_t thread) +{ + sched_bucket_t old_bucket = thread->th_sched_bucket; + sched_bucket_t new_bucket = TH_BUCKET_RUN; + + switch (thread->sched_mode) { + case TH_MODE_FIXED: + case TH_MODE_REALTIME: + new_bucket = TH_BUCKET_FIXPRI; + break; + + case TH_MODE_TIMESHARE: + if (thread->base_pri > BASEPRI_UTILITY) + new_bucket = TH_BUCKET_SHARE_FG; + else if (thread->base_pri > MAXPRI_THROTTLE) + new_bucket = TH_BUCKET_SHARE_UT; + else + new_bucket = TH_BUCKET_SHARE_BG; + break; + + default: + panic("unexpected mode: %d", thread->sched_mode); + break; + } + + if 
(old_bucket != new_bucket) { + thread->th_sched_bucket = new_bucket; + thread->pri_shift = sched_pri_shifts[new_bucket]; + + if ((thread->state & (TH_RUN|TH_IDLE)) == TH_RUN) { + sched_decr_bucket(old_bucket); + sched_incr_bucket(new_bucket); + } + } +} /* * Set the thread's true scheduling mode @@ -607,43 +612,22 @@ assert_thread_sched_count(thread_t thread) { void sched_set_thread_mode(thread_t thread, sched_mode_t new_mode) { - assert_thread_sched_count(thread); assert(thread->runq == PROCESSOR_NULL); - sched_mode_t old_mode = thread->sched_mode; - - thread->sched_mode = new_mode; - switch (new_mode) { - case TH_MODE_FIXED: - case TH_MODE_REALTIME: - if (old_mode == TH_MODE_TIMESHARE) { - if ((thread->state & (TH_RUN|TH_IDLE)) == TH_RUN) { - if (thread->sched_flags & TH_SFLAG_THROTTLED) - sched_background_decr(thread); - - sched_share_decr(thread); - } - } - break; - - case TH_MODE_TIMESHARE: - if (old_mode != TH_MODE_TIMESHARE) { - if ((thread->state & (TH_RUN|TH_IDLE)) == TH_RUN) { - sched_share_incr(thread); - - if (thread->sched_flags & TH_SFLAG_THROTTLED) - sched_background_incr(thread); - } - } - break; - - default: - panic("unexpected mode: %d", new_mode); - break; + case TH_MODE_FIXED: + case TH_MODE_REALTIME: + case TH_MODE_TIMESHARE: + break; + + default: + panic("unexpected mode: %d", new_mode); + break; } - assert_thread_sched_count(thread); + thread->sched_mode = new_mode; + + sched_update_thread_bucket(thread); } /* @@ -654,7 +638,6 @@ sched_thread_mode_demote(thread_t thread, uint32_t reason) { assert(reason & TH_SFLAG_DEMOTED_MASK); assert((thread->sched_flags & reason) != reason); - assert_thread_sched_count(thread); if (thread->policy_reset) return; @@ -679,8 +662,6 @@ sched_thread_mode_demote(thread_t thread, uint32_t reason) if (removed) thread_run_queue_reinsert(thread, SCHED_TAILQ); - - assert_thread_sched_count(thread); } /* @@ -695,8 +676,6 @@ sched_thread_mode_undemote(thread_t thread, uint32_t reason) assert(thread->sched_mode == 
TH_MODE_TIMESHARE); assert(thread->policy_reset == 0); - assert_thread_sched_count(thread); - thread->sched_flags &= ~reason; if (thread->sched_flags & TH_SFLAG_DEMOTED_MASK) { @@ -716,34 +695,4 @@ sched_thread_mode_undemote(thread_t thread, uint32_t reason) thread_run_queue_reinsert(thread, SCHED_TAILQ); } -/* - * Set the thread to be categorized as 'background' - * Called with thread mutex and thread lock held - * - * TODO: Eventually, 'background' should be a true sched_mode. - */ -void -sched_set_thread_throttled(thread_t thread, boolean_t wants_throttle) -{ - if (thread->policy_reset) - return; - - assert(((thread->sched_flags & TH_SFLAG_THROTTLED) ? TRUE : FALSE) != wants_throttle); - - assert_thread_sched_count(thread); - - if (wants_throttle) { - thread->sched_flags |= TH_SFLAG_THROTTLED; - if ((thread->state & (TH_RUN|TH_IDLE)) == TH_RUN && thread->sched_mode == TH_MODE_TIMESHARE) { - sched_background_incr(thread); - } - } else { - thread->sched_flags &= ~TH_SFLAG_THROTTLED; - if ((thread->state & (TH_RUN|TH_IDLE)) == TH_RUN && thread->sched_mode == TH_MODE_TIMESHARE) { - sched_background_decr(thread); - } - } - - assert_thread_sched_count(thread); -} diff --git a/osfmk/kern/processor.c b/osfmk/kern/processor.c index 2c2dae409..b0a13fb08 100644 --- a/osfmk/kern/processor.c +++ b/osfmk/kern/processor.c @@ -92,11 +92,13 @@ decl_simple_lock_data(static,pset_node_lock) queue_head_t tasks; queue_head_t terminated_tasks; /* To be used ONLY for stackshot. 
*/ +queue_head_t corpse_tasks; int tasks_count; int terminated_tasks_count; queue_head_t threads; int threads_count; decl_lck_mtx_data(,tasks_threads_lock) +decl_lck_mtx_data(,tasks_corpse_lock) processor_t processor_list; unsigned int processor_count; @@ -120,6 +122,7 @@ processor_bootstrap(void) queue_init(&tasks); queue_init(&terminated_tasks); queue_init(&threads); + queue_init(&corpse_tasks); simple_lock_init(&processor_list_lock, 0); @@ -151,6 +154,8 @@ processor_init( processor->processor_set = pset; processor->current_pri = MINPRI; processor->current_thmode = TH_MODE_NONE; + processor->current_sfi_class = SFI_CLASS_KERNEL; + processor->starting_pri = MINPRI; processor->cpu_id = cpu_id; timer_call_setup(&processor->quantum_timer, thread_quantum_expire, processor); processor->quantum_end = UINT64_MAX; @@ -159,7 +164,7 @@ processor_init( processor->processor_primary = processor; /* no SMT relationship known at this point */ processor->processor_secondary = NULL; processor->is_SMT = FALSE; - processor->is_recommended = TRUE; + processor->is_recommended = (pset->recommended_bitmask & (1ULL << cpu_id)) ? 
TRUE : FALSE; processor->processor_self = IP_NULL; processor_data_init(processor); processor->processor_list = NULL; @@ -267,6 +272,7 @@ pset_init( pset->online_processor_count = 0; pset->cpu_set_low = pset->cpu_set_hi = 0; pset->cpu_set_count = 0; + pset->recommended_bitmask = ~0ULL; pset->pending_AST_cpu_mask = 0; #if defined(CONFIG_SCHED_DEFERRED_AST) pset->pending_deferred_AST_cpu_mask = 0; @@ -303,13 +309,13 @@ processor_info_count( kern_return_t processor_info( - register processor_t processor, + processor_t processor, processor_flavor_t flavor, host_t *host, processor_info_t info, mach_msg_type_number_t *count) { - register int cpu_id, state; + int cpu_id, state; kern_return_t result; if (processor == PROCESSOR_NULL) @@ -321,7 +327,7 @@ processor_info( case PROCESSOR_BASIC_INFO: { - register processor_basic_info_t basic_info; + processor_basic_info_t basic_info; if (*count < PROCESSOR_BASIC_INFO_COUNT) return (KERN_FAILURE); @@ -603,7 +609,7 @@ processor_set_info( return(KERN_INVALID_ARGUMENT); if (flavor == PROCESSOR_SET_BASIC_INFO) { - register processor_set_basic_info_t basic_info; + processor_set_basic_info_t basic_info; if (*count < PROCESSOR_SET_BASIC_INFO_COUNT) return(KERN_FAILURE); @@ -617,7 +623,7 @@ processor_set_info( return(KERN_SUCCESS); } else if (flavor == PROCESSOR_SET_TIMESHARE_DEFAULT) { - register policy_timeshare_base_t ts_base; + policy_timeshare_base_t ts_base; if (*count < POLICY_TIMESHARE_BASE_COUNT) return(KERN_FAILURE); @@ -630,7 +636,7 @@ processor_set_info( return(KERN_SUCCESS); } else if (flavor == PROCESSOR_SET_FIFO_DEFAULT) { - register policy_fifo_base_t fifo_base; + policy_fifo_base_t fifo_base; if (*count < POLICY_FIFO_BASE_COUNT) return(KERN_FAILURE); @@ -643,7 +649,7 @@ processor_set_info( return(KERN_SUCCESS); } else if (flavor == PROCESSOR_SET_RR_DEFAULT) { - register policy_rr_base_t rr_base; + policy_rr_base_t rr_base; if (*count < POLICY_RR_BASE_COUNT) return(KERN_FAILURE); @@ -657,7 +663,7 @@ processor_set_info( 
return(KERN_SUCCESS); } else if (flavor == PROCESSOR_SET_TIMESHARE_LIMITS) { - register policy_timeshare_limit_t ts_limit; + policy_timeshare_limit_t ts_limit; if (*count < POLICY_TIMESHARE_LIMIT_COUNT) return(KERN_FAILURE); @@ -670,7 +676,7 @@ processor_set_info( return(KERN_SUCCESS); } else if (flavor == PROCESSOR_SET_FIFO_LIMITS) { - register policy_fifo_limit_t fifo_limit; + policy_fifo_limit_t fifo_limit; if (*count < POLICY_FIFO_LIMIT_COUNT) return(KERN_FAILURE); @@ -683,7 +689,7 @@ processor_set_info( return(KERN_SUCCESS); } else if (flavor == PROCESSOR_SET_RR_LIMITS) { - register policy_rr_limit_t rr_limit; + policy_rr_limit_t rr_limit; if (*count < POLICY_RR_LIMIT_COUNT) return(KERN_FAILURE); @@ -696,7 +702,7 @@ processor_set_info( return(KERN_SUCCESS); } else if (flavor == PROCESSOR_SET_ENABLED_POLICIES) { - register int *enabled; + int *enabled; if (*count < (sizeof(*enabled)/sizeof(int))) return(KERN_FAILURE); @@ -730,7 +736,7 @@ processor_set_statistics( return (KERN_INVALID_PROCESSOR_SET); if (flavor == PROCESSOR_SET_LOAD_INFO) { - register processor_set_load_info_t load_info; + processor_set_load_info_t load_info; if (*count < PROCESSOR_SET_LOAD_INFO_COUNT) return(KERN_FAILURE); @@ -807,7 +813,7 @@ processor_set_things( mach_msg_type_number_t *count, int type) { - unsigned int i , j, used; + unsigned int i; task_t task; thread_t thread; @@ -926,6 +932,8 @@ processor_set_things( lck_mtx_unlock(&tasks_threads_lock); #if CONFIG_MACF + unsigned int j, used; + /* for each task, make sure we are allowed to examine it */ for (i = used = 0; i < actual_tasks; i++) { if (mac_task_check_expose_task(task_list[i])) { diff --git a/osfmk/kern/processor.h b/osfmk/kern/processor.h index dd4586138..abf663053 100644 --- a/osfmk/kern/processor.h +++ b/osfmk/kern/processor.h @@ -91,6 +91,7 @@ struct processor_set { int cpu_set_low, cpu_set_hi; int cpu_set_count; + uint64_t recommended_bitmask; #if __SMP__ decl_simple_lock_data(,sched_lock) /* lock for above */ @@ -142,9 
+143,10 @@ struct pset_node { extern struct pset_node pset_node0; -extern queue_head_t tasks, terminated_tasks, threads; /* Terminated tasks are ONLY for stackshot */ +extern queue_head_t tasks, terminated_tasks, threads, corpse_tasks; /* Terminated tasks are ONLY for stackshot */ extern int tasks_count, terminated_tasks_count, threads_count; decl_lck_mtx_data(extern,tasks_threads_lock) +decl_lck_mtx_data(extern,tasks_corpse_lock) struct processor { queue_chain_t processor_queue;/* idle/active queue link, @@ -162,6 +164,7 @@ struct processor { int current_pri; /* priority of current thread */ sched_mode_t current_thmode; /* sched mode of current thread */ sfi_class_id_t current_sfi_class; /* SFI class of current thread */ + int starting_pri; /* priority of current thread as it was when scheduled */ int cpu_id; /* platform numeric id */ timer_call_data_t quantum_timer; /* timer for quantum expiration */ diff --git a/osfmk/kern/queue.h b/osfmk/kern/queue.h index f45899ce2..dc99d000f 100644 --- a/osfmk/kern/queue.h +++ b/osfmk/kern/queue.h @@ -136,7 +136,7 @@ __BEGIN_DECLS * [1] remqueue * [1] insque * [1] remque - * [1] re_queue + * [1] re_queue_head * [1] re_queue_tail * [1] movqueue * [1] qe_element @@ -520,6 +520,46 @@ re_queue_tail(queue_t que, queue_entry_t elt) &((elt)->field) != (head); \ elt = _nelt, _nelt = qe_element((elt)->field.next, typeof(*(elt)), field)) \ +#ifdef XNU_KERNEL_PRIVATE + +/* Dequeue an element from head, or return NULL if the queue is empty */ +#define qe_dequeue_head(head, type, field) ({ \ + queue_entry_t _tmp_entry = dequeue_head((head)); \ + type *_tmp_element = (type*) NULL; \ + if (_tmp_entry != (queue_entry_t) NULL) \ + _tmp_element = qe_element(_tmp_entry, type, field); \ + _tmp_element; \ +}) + +/* Dequeue an element from tail, or return NULL if the queue is empty */ +#define qe_dequeue_tail(head, type, field) ({ \ + queue_entry_t _tmp_entry = dequeue_tail((head)); \ + type *_tmp_element = (type*) NULL; \ + if (_tmp_entry != 
(queue_entry_t) NULL) \ + _tmp_element = qe_element(_tmp_entry, type, field); \ + _tmp_element; \ +}) + +/* Peek at the first element, or return NULL if the queue is empty */ +#define qe_queue_first(head, type, field) ({ \ + queue_entry_t _tmp_entry = queue_first((head)); \ + type *_tmp_element = (type*) NULL; \ + if (_tmp_entry != (queue_entry_t) head) \ + _tmp_element = qe_element(_tmp_entry, type, field); \ + _tmp_element; \ +}) + +/* Peek at the last element, or return NULL if the queue is empty */ +#define qe_queue_last(head, type, field) ({ \ + queue_entry_t _tmp_entry = queue_last((head)); \ + type *_tmp_element = (type*) NULL; \ + if (_tmp_entry != (queue_entry_t) head) \ + _tmp_element = qe_element(_tmp_entry, type, field); \ + _tmp_element; \ +}) + +#endif /* XNU_KERNEL_PRIVATE */ + /* * Macro: queue_init * Function: @@ -983,11 +1023,9 @@ struct mpqueue_head { struct queue_entry head; /* header for queue */ uint64_t earliest_soft_deadline; uint64_t count; -#if defined(__i386__) || defined(__x86_64__) lck_mtx_t lock_data; +#if defined(__i386__) || defined(__x86_64__) lck_mtx_ext_t lock_data_ext; -#else - lck_spin_t lock_data; #endif }; @@ -1014,7 +1052,7 @@ MACRO_END #define mpqueue_init(q, lck_grp, lck_attr) \ MACRO_BEGIN \ queue_init(&(q)->head); \ - lck_spin_init(&(q)->lock_data, \ + lck_mtx_init(&(q)->lock_data, \ lck_grp, \ lck_attr); \ MACRO_END diff --git a/osfmk/kern/sched.h b/osfmk/kern/sched.h index d8f470150..f1225c9d2 100644 --- a/osfmk/kern/sched.h +++ b/osfmk/kern/sched.h @@ -73,13 +73,15 @@ #include #include #include +#include +#include #define NRQS 128 /* 128 levels per run queue */ -#define NRQBM (NRQS / 32) /* number of words per bit map */ #define MAXPRI (NRQS-1) -#define MINPRI IDLEPRI /* lowest legal priority schedulable */ -#define IDLEPRI 0 /* idle thread priority */ +#define MINPRI 0 /* lowest legal priority schedulable */ +#define IDLEPRI MINPRI /* idle thread priority */ +#define NOPRI -1 /* * High-level priority assignments @@ 
-142,15 +144,15 @@ #define BASEPRI_REALTIME (MAXPRI - (NRQS / 4) + 1) /* 96 */ #define MAXPRI_KERNEL (BASEPRI_REALTIME - 1) /* 95 */ -#define BASEPRI_PREEMPT (MAXPRI_KERNEL - 2) /* 93 */ -#define BASEPRI_KERNEL (MINPRI_KERNEL + 1) /* 81 */ -#define MINPRI_KERNEL (MAXPRI_KERNEL - (NRQS / 8) + 1) /* 80 */ +#define BASEPRI_PREEMPT (MAXPRI_KERNEL - 2) /* 93 */ +#define BASEPRI_KERNEL (MINPRI_KERNEL + 1) /* 81 */ +#define MINPRI_KERNEL (MAXPRI_KERNEL - (NRQS / 8) + 1) /* 80 */ -#define MAXPRI_RESERVED (MINPRI_KERNEL - 1) /* 79 */ +#define MAXPRI_RESERVED (MINPRI_KERNEL - 1) /* 79 */ #define BASEPRI_GRAPHICS (MAXPRI_RESERVED - 3) /* 76 */ -#define MINPRI_RESERVED (MAXPRI_RESERVED - (NRQS / 8) + 1) /* 64 */ +#define MINPRI_RESERVED (MAXPRI_RESERVED - (NRQS / 8) + 1) /* 64 */ -#define MAXPRI_USER (MINPRI_RESERVED - 1) /* 63 */ +#define MAXPRI_USER (MINPRI_RESERVED - 1) /* 63 */ #define BASEPRI_CONTROL (BASEPRI_DEFAULT + 17) /* 48 */ #define BASEPRI_FOREGROUND (BASEPRI_DEFAULT + 16) /* 47 */ #define BASEPRI_BACKGROUND (BASEPRI_DEFAULT + 15) /* 46 */ @@ -158,10 +160,10 @@ #define BASEPRI_DEFAULT (MAXPRI_USER - (NRQS / 4)) /* 31 */ #define MAXPRI_SUPPRESSED (BASEPRI_DEFAULT - 3) /* 28 */ #define BASEPRI_UTILITY (BASEPRI_DEFAULT - 11) /* 20 */ -#define MAXPRI_THROTTLE (MINPRI + 4) /* 4 */ -#define MINPRI_USER MINPRI /* 0 */ +#define MAXPRI_THROTTLE (MINPRI + 4) /* 4 */ +#define MINPRI_USER MINPRI /* 0 */ -#define DEPRESSPRI MINPRI /* depress priority */ +#define DEPRESSPRI MINPRI /* depress priority */ #define MAXPRI_PROMOTE (MAXPRI_KERNEL) /* ceiling for mutex promotion */ /* Type used for thread->sched_mode and saved_mode */ @@ -172,6 +174,16 @@ typedef enum { TH_MODE_TIMESHARE, /* use timesharing algorithm */ } sched_mode_t; +/* Buckets used for load calculation */ +typedef enum { + TH_BUCKET_RUN = 0, /* All runnable threads */ + TH_BUCKET_FIXPRI, /* Fixed-priority */ + TH_BUCKET_SHARE_FG, /* Timeshare thread above BASEPRI_UTILITY */ + TH_BUCKET_SHARE_UT, /* Timeshare 
thread between BASEPRI_UTILITY and MAXPRI_THROTTLE */ + TH_BUCKET_SHARE_BG, /* Timeshare thread between MAXPRI_THROTTLE and MINPRI */ + TH_BUCKET_MAX, +} sched_bucket_t; + /* * Macro to check for invalid priorities. */ @@ -186,7 +198,7 @@ struct runq_stats { struct run_queue { int highq; /* highest runnable queue */ - int bitmap[NRQBM]; /* run queue bitmap array */ + bitmap_t bitmap[BITMAP_LEN(NRQS)]; /* run queue bitmap array */ int count; /* # of threads total */ int urgency; /* level of preemption urgency */ queue_head_t queues[NRQS]; /* one for each priority */ @@ -194,6 +206,20 @@ struct run_queue { struct runq_stats runq_stats; }; +inline static void +rq_bitmap_set(bitmap_t *map, u_int n) +{ + assert(n < NRQS); + bitmap_set(map, n); +} + +inline static void +rq_bitmap_clear(bitmap_t *map, u_int n) +{ + assert(n < NRQS); + bitmap_clear(map, n); +} + #endif /* defined(CONFIG_SCHED_TIMESHARE_CORE) || defined(CONFIG_SCHED_PROTO) */ struct rt_queue { @@ -314,9 +340,6 @@ extern void compute_stack_target( extern void compute_memory_pressure( void *arg); -extern void compute_zone_gc_throttle( - void *arg); - extern void compute_pageout_gc_throttle( void *arg); @@ -328,13 +351,12 @@ extern void compute_pmap_gc_throttle( * to priority. 
*/ #if defined(CONFIG_SCHED_TIMESHARE_CORE) -extern uint32_t sched_pri_shift; -extern uint32_t sched_background_pri_shift; -extern uint32_t sched_combined_fgbg_pri_shift; + +#define MAX_LOAD (NRQS - 1) +extern uint32_t sched_pri_shifts[TH_BUCKET_MAX]; extern uint32_t sched_fixed_shift; extern int8_t sched_load_shifts[NRQS]; extern uint32_t sched_decay_usage_age_factor; -extern uint32_t sched_use_combined_fgbg_decay; void sched_timeshare_consider_maintenance(uint64_t ctime); #endif /* CONFIG_SCHED_TIMESHARE_CORE */ @@ -343,7 +365,6 @@ void sched_consider_recommended_cores(uint64_t ctime, thread_t thread); extern int32_t sched_poll_yield_shift; extern uint64_t sched_safe_duration; -extern uint32_t sched_run_count, sched_share_count, sched_background_count; extern uint32_t sched_load_average, sched_mach_factor; extern uint32_t avenrun[3], mach_factor[3]; @@ -351,49 +372,10 @@ extern uint32_t avenrun[3], mach_factor[3]; extern uint64_t max_unsafe_computation; extern uint64_t max_poll_computation; -/* TH_RUN & !TH_IDLE controls whether a thread has a run count */ -#define sched_run_incr(th) \ - hw_atomic_add(&sched_run_count, 1) \ - -#define sched_run_decr(th) \ - hw_atomic_sub(&sched_run_count, 1) \ - -#if MACH_ASSERT -extern void sched_share_incr(thread_t thread); -extern void sched_share_decr(thread_t thread); -extern void sched_background_incr(thread_t thread); -extern void sched_background_decr(thread_t thread); - -extern void assert_thread_sched_count(thread_t thread); - -#else /* MACH_ASSERT */ -/* sched_mode == TH_MODE_TIMESHARE controls whether a thread has a timeshare count when it has a run count */ -#define sched_share_incr(th) \ -MACRO_BEGIN \ - (void)hw_atomic_add(&sched_share_count, 1); \ -MACRO_END - -#define sched_share_decr(th) \ -MACRO_BEGIN \ - (void)hw_atomic_sub(&sched_share_count, 1); \ -MACRO_END - -/* TH_SFLAG_THROTTLED controls whether a thread has a background count when it has a run count and a share count */ -#define 
sched_background_incr(th) \ -MACRO_BEGIN \ - hw_atomic_add(&sched_background_count, 1); \ -MACRO_END - -#define sched_background_decr(th) \ -MACRO_BEGIN \ - hw_atomic_sub(&sched_background_count, 1); \ -MACRO_END - -#define assert_thread_sched_count(th) \ -MACRO_BEGIN \ -MACRO_END +extern volatile uint32_t sched_run_buckets[TH_BUCKET_MAX]; -#endif /* !MACH_ASSERT */ +extern uint32_t sched_run_incr(thread_t thread); +extern uint32_t sched_run_decr(thread_t thread); /* * thread_timer_delta macro takes care of both thread timers. diff --git a/osfmk/kern/sched_average.c b/osfmk/kern/sched_average.c index 411dfb47c..cf9520915 100644 --- a/osfmk/kern/sched_average.c +++ b/osfmk/kern/sched_average.c @@ -77,6 +77,8 @@ uint32_t avenrun[3] = {0, 0, 0}; uint32_t mach_factor[3] = {0, 0, 0}; +uint32_t sched_load_average, sched_mach_factor; + #if defined(CONFIG_SCHED_TIMESHARE_CORE) /* * Values are scaled by LOAD_SCALE, defined in processor_info.h @@ -109,7 +111,6 @@ static struct sched_average { { compute_averunnable, &sched_nrun, 5, 0 }, { compute_stack_target, NULL, 5, 1 }, { compute_memory_pressure, NULL, 1, 0 }, - { compute_zone_gc_throttle, NULL, 60, 0 }, { compute_pageout_gc_throttle, NULL, 1, 0 }, { compute_pmap_gc_throttle, NULL, 60, 0 }, #if CONFIG_TELEMETRY @@ -120,6 +121,8 @@ static struct sched_average { typedef struct sched_average *sched_average_t; +uint32_t load_now[TH_BUCKET_MAX]; + /* The "stdelta" parameter represents the number of scheduler maintenance * "ticks" that have elapsed since the last invocation, subject to * integer division imprecision. @@ -128,119 +131,122 @@ typedef struct sched_average *sched_average_t; void compute_averages(uint64_t stdelta) { - int ncpus, nthreads, nshared, nbackground, nshared_non_bg; - uint32_t factor_now, average_now, load_now = 0, background_load_now = 0, combined_fgbg_load_now = 0; - sched_average_t avg; - uint64_t abstime, index; - /* - * Retrieve counts, ignoring - * the current thread. 
+ * Retrieve a snapshot of the current run counts. + * + * Why not a bcopy()? Because we need atomic word-sized reads of sched_run_buckets, + * not byte-by-byte copy. */ - ncpus = processor_avail_count; - nthreads = sched_run_count - 1; - nshared = sched_share_count; - nbackground = sched_background_count; + uint32_t ncpus = processor_avail_count; - /* - * Load average and mach factor calculations for - * those which ask about these things. - */ - average_now = nthreads * LOAD_SCALE; + load_now[TH_BUCKET_RUN] = sched_run_buckets[TH_BUCKET_RUN]; + load_now[TH_BUCKET_FIXPRI] = sched_run_buckets[TH_BUCKET_FIXPRI]; + load_now[TH_BUCKET_SHARE_FG] = sched_run_buckets[TH_BUCKET_SHARE_FG]; + load_now[TH_BUCKET_SHARE_UT] = sched_run_buckets[TH_BUCKET_SHARE_UT]; + load_now[TH_BUCKET_SHARE_BG] = sched_run_buckets[TH_BUCKET_SHARE_BG]; - if (nthreads > ncpus) - factor_now = (ncpus * LOAD_SCALE) / (nthreads + 1); - else - factor_now = (ncpus - nthreads) * LOAD_SCALE; + assert(load_now[TH_BUCKET_RUN] >= 0); + assert(load_now[TH_BUCKET_FIXPRI] >= 0); + + /* Ignore the current thread, which is a running fixpri thread */ + + uint32_t nthreads = load_now[TH_BUCKET_RUN] - 1; + uint32_t nfixpri = load_now[TH_BUCKET_FIXPRI] - 1; + + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_LOAD) | DBG_FUNC_NONE, + load_now[TH_BUCKET_FIXPRI] - 1, load_now[TH_BUCKET_SHARE_FG], + load_now[TH_BUCKET_SHARE_BG], load_now[TH_BUCKET_SHARE_UT], 0); - /* For those statistics that formerly relied on being recomputed - * on timer ticks, advance by the approximate number of corresponding - * elapsed intervals, thus compensating for potential idle intervals. - */ - for (index = 0; index < stdelta; index++) { - sched_mach_factor = ((sched_mach_factor << 2) + factor_now) / 5; - sched_load_average = ((sched_load_average << 2) + average_now) / 5; - } /* * Compute the timeshare priority conversion factor based on loading. 
* Because our counters may be incremented and accessed * concurrently with respect to each other, we may have - * windows where the invariant nthreads >= nshared >= nbackground + * windows where the invariant (nthreads - nfixpri) == (fg + bg + ut) * is broken, so truncate values in these cases. */ - if (nshared > nthreads) - nshared = nthreads; - - if (nbackground > nshared) - nbackground = nshared; + uint32_t timeshare_threads = (nthreads - nfixpri); - nshared_non_bg = nshared - nbackground; + for (uint32_t i = TH_BUCKET_SHARE_FG; i <= TH_BUCKET_SHARE_BG ; i++) { + if (load_now[i] > timeshare_threads) + load_now[i] = timeshare_threads; + } - if (nshared_non_bg > ncpus) { - if (ncpus > 1) - load_now = nshared_non_bg / ncpus; - else - load_now = nshared_non_bg; + /* + * Utility threads contribute up to NCPUS of load to FG threads + */ + if (load_now[TH_BUCKET_SHARE_UT] <= ncpus) { + load_now[TH_BUCKET_SHARE_FG] += load_now[TH_BUCKET_SHARE_UT]; + } else { + load_now[TH_BUCKET_SHARE_FG] += ncpus; + } - if (load_now > NRQS - 1) - load_now = NRQS - 1; + /* + * FG and UT should notice there's one thread of competition from BG, + * but no more. + */ + if (load_now[TH_BUCKET_SHARE_BG] > 0) { + load_now[TH_BUCKET_SHARE_FG] += 1; + load_now[TH_BUCKET_SHARE_UT] += 1; } - if (nbackground > ncpus) { - if (ncpus > 1) - background_load_now = nbackground / ncpus; - else - background_load_now = nbackground; + /* + * The conversion factor consists of two components: + * a fixed value based on the absolute time unit (sched_fixed_shift), + * and a dynamic portion based on load (sched_load_shifts). + * + * Zero load results in a out of range shift count. 
+ */ - if (background_load_now > NRQS - 1) - background_load_now = NRQS - 1; - } + for (uint32_t i = TH_BUCKET_SHARE_FG; i <= TH_BUCKET_SHARE_BG ; i++) { + uint32_t bucket_load = 0; - if (nshared > ncpus) { - if (ncpus > 1) - combined_fgbg_load_now = nshared / ncpus; - else - combined_fgbg_load_now = nshared; + if (load_now[i] > ncpus) { + if (ncpus > 1) + bucket_load = load_now[i] / ncpus; + else + bucket_load = load_now[i]; - if (combined_fgbg_load_now > NRQS - 1) - combined_fgbg_load_now = NRQS - 1; - } + if (bucket_load > MAX_LOAD) + bucket_load = MAX_LOAD; + } - KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, - MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_LOAD) | DBG_FUNC_NONE, - (nthreads - nshared), (nshared - nbackground), nbackground, 0, 0); + sched_pri_shifts[i] = sched_fixed_shift - sched_load_shifts[bucket_load]; + } /* - * Sample total running threads. + * Sample total running threads for the load average calculation. */ sched_nrun = nthreads; - -#if defined(CONFIG_SCHED_TIMESHARE_CORE) /* - * The conversion factor consists of - * two components: a fixed value based - * on the absolute time unit, and a - * dynamic portion based on loading. - * - * Zero loading results in a out of range - * shift count. Accumulated usage is ignored - * during conversion and new usage deltas - * are discarded. + * Load average and mach factor calculations for + * those which ask about these things. */ - sched_pri_shift = sched_fixed_shift - sched_load_shifts[load_now]; - sched_background_pri_shift = sched_fixed_shift - sched_load_shifts[background_load_now]; - sched_combined_fgbg_pri_shift = sched_fixed_shift - sched_load_shifts[combined_fgbg_load_now]; + uint32_t average_now = nthreads * LOAD_SCALE; + uint32_t factor_now; + + if (nthreads > ncpus) + factor_now = (ncpus * LOAD_SCALE) / (nthreads + 1); + else + factor_now = (ncpus - nthreads) * LOAD_SCALE; /* - * Compute old-style Mach load averages. 
+ * For those statistics that formerly relied on being recomputed + * on timer ticks, advance by the approximate number of corresponding + * elapsed intervals, thus compensating for potential idle intervals. */ + for (uint32_t index = 0; index < stdelta; index++) { + sched_mach_factor = ((sched_mach_factor << 2) + factor_now) / 5; + sched_load_average = ((sched_load_average << 2) + average_now) / 5; + } - for (index = 0; index < stdelta; index++) { - register int i; - - for (i = 0; i < 3; i++) { + /* + * Compute old-style Mach load averages. + */ + for (uint32_t index = 0; index < stdelta; index++) { + for (uint32_t i = 0; i < 3; i++) { mach_factor[i] = ((mach_factor[i] * fract[i]) + (factor_now * (LOAD_SCALE - fract[i]))) / LOAD_SCALE; @@ -248,13 +254,13 @@ compute_averages(uint64_t stdelta) (average_now * (LOAD_SCALE - fract[i]))) / LOAD_SCALE; } } -#endif /* CONFIG_SCHED_TIMESHARE_CORE */ /* - * Compute averages in other components. + * Compute averages in other components. */ - abstime = mach_absolute_time(); - for (avg = sched_average; avg->comp != NULL; ++avg) { + uint64_t abstime = mach_absolute_time(); + + for (sched_average_t avg = sched_average; avg->comp != NULL; ++avg) { if (abstime >= avg->deadline) { uint64_t period_abs = (avg->period * sched_one_second_interval); uint64_t ninvokes = 1; @@ -262,7 +268,7 @@ compute_averages(uint64_t stdelta) ninvokes += (abstime - avg->deadline) / period_abs; ninvokes = MIN(ninvokes, SCHED_TICK_MAX_DELTA); - for (index = 0; index < ninvokes; index++) { + for (uint32_t index = 0; index < ninvokes; index++) { (*avg->comp)(avg->param); } avg->deadline = abstime + period_abs; diff --git a/osfmk/kern/sched_dualq.c b/osfmk/kern/sched_dualq.c index f7cbccb40..48ff5a038 100644 --- a/osfmk/kern/sched_dualq.c +++ b/osfmk/kern/sched_dualq.c @@ -268,7 +268,13 @@ sched_dualq_processor_queue_has_priority(processor_t processor, int priority, boolean_t gte) { - int qpri = MAX(dualq_main_runq(processor)->highq, 
dualq_bound_runq(processor)->highq); + run_queue_t main_runq = dualq_main_runq(processor); + run_queue_t bound_runq = dualq_bound_runq(processor); + + if (main_runq->count == 0 && bound_runq->count == 0) + return FALSE; + + int qpri = MAX(main_runq->highq, bound_runq->highq); if (gte) return qpri >= priority; @@ -316,12 +322,15 @@ sched_dualq_processor_queue_shutdown(processor_t processor) while (rq->count > 0) { thread = run_queue_dequeue(rq, SCHED_HEADQ); - enqueue_tail(&tqueue, (queue_entry_t)thread); + enqueue_tail(&tqueue, &thread->runq_links); } pset_unlock(pset); - while ((thread = (thread_t)(void*)dequeue_head(&tqueue)) != THREAD_NULL) { + qe_foreach_element_safe(thread, &tqueue, runq_links) { + + remqueue(&thread->runq_links); + thread_lock(thread); thread_setrun(thread, SCHED_TAILQ); diff --git a/osfmk/kern/sched_grrr.c b/osfmk/kern/sched_grrr.c index 8b70499e8..10441edd3 100644 --- a/osfmk/kern/sched_grrr.c +++ b/osfmk/kern/sched_grrr.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2009 Apple Inc. All rights reserved. + * Copyright (c) 2009-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ @@ -70,7 +70,7 @@ static boolean_t grrr_enqueue( grrr_run_queue_t rq, thread_t thread); - + static thread_t grrr_select( grrr_run_queue_t rq); @@ -237,7 +237,7 @@ sched_grrr_init(void) if (default_preemption_rate < 1) default_preemption_rate = 100; grrr_quantum_us = (1000 * 1000) / default_preemption_rate; - + printf("standard grrr timeslicing quantum is %d us\n", grrr_quantum_us); grrr_priority_mapping_init(); @@ -253,11 +253,11 @@ sched_grrr_timebase_init(void) grrr_quantum_us, NSEC_PER_USEC, &abstime); assert((abstime >> 32) == 0 && (uint32_t)abstime != 0); grrr_quantum = (uint32_t)abstime; - + thread_depress_time = 1 * grrr_quantum; default_timeshare_computation = grrr_quantum / 2; default_timeshare_constraint = grrr_quantum; - + max_unsafe_computation = max_unsafe_quanta * grrr_quantum; sched_safe_duration = 2 * max_unsafe_quanta * grrr_quantum; @@ -278,43 +278,41 @@ static void sched_grrr_maintenance_continuation(void) { uint64_t abstime = mach_absolute_time(); - + grrr_rescale_tick++; - + /* * Compute various averages. 
*/ compute_averages(1); - + if (sched_grrr_tick_deadline == 0) sched_grrr_tick_deadline = abstime; - + clock_deadline_for_periodic_event(10*sched_one_second_interval, abstime, &sched_grrr_tick_deadline); - + assert_wait_deadline((event_t)sched_grrr_maintenance_continuation, THREAD_UNINT, sched_grrr_tick_deadline); thread_block((thread_continue_t)sched_grrr_maintenance_continuation); /*NOTREACHED*/ } - static thread_t sched_grrr_choose_thread(processor_t processor, int priority __unused, ast_t reason __unused) { grrr_run_queue_t rq = &processor->grrr_runq; - - return grrr_select(rq); + + return grrr_select(rq); } static thread_t sched_grrr_steal_thread(processor_set_t pset) { pset_unlock(pset); - - return (THREAD_NULL); - + + return THREAD_NULL; } static int @@ -339,11 +337,11 @@ sched_grrr_processor_enqueue( { grrr_run_queue_t rq = &processor->grrr_runq; boolean_t result; - + result = grrr_enqueue(rq, thread); - + thread->runq = processor; - + return result; } @@ -354,29 +352,29 @@ sched_grrr_processor_queue_shutdown( processor_set_t pset = processor->processor_set; thread_t thread; queue_head_t tqueue, bqueue; - + queue_init(&tqueue); queue_init(&bqueue); - + while ((thread = sched_grrr_choose_thread(processor, IDLEPRI, AST_NONE)) != THREAD_NULL) { if (thread->bound_processor == PROCESSOR_NULL) { enqueue_tail(&tqueue, (queue_entry_t)thread); } else { - enqueue_tail(&bqueue, (queue_entry_t)thread); + enqueue_tail(&bqueue, (queue_entry_t)thread); } } - + while ((thread = (thread_t)(void *)dequeue_head(&bqueue)) != THREAD_NULL) { sched_grrr_processor_enqueue(processor, thread, SCHED_TAILQ); - } - + } + pset_unlock(pset); - + while ((thread = (thread_t)(void *)dequeue_head(&tqueue)) != THREAD_NULL) { thread_lock(thread); - + thread_setrun(thread, SCHED_TAILQ); - + thread_unlock(thread); } } @@ -386,11 +384,10 @@ sched_grrr_processor_queue_remove( processor_t processor, thread_t thread) { - void * rqlock; - - rqlock = &processor->processor_set->sched_lock; - 
simple_lock(rqlock); - + processor_set_t pset = processor->processor_set; + + pset_lock(pset); + if (processor == thread->runq) { /* * Thread is on a run queue and we have a lock on @@ -402,24 +399,24 @@ sched_grrr_processor_queue_remove( } else { /* * The thread left the run queue before we could - * lock the run queue. + * lock the run queue. */ assert(thread->runq == PROCESSOR_NULL); - processor = PROCESSOR_NULL; + processor = PROCESSOR_NULL; } - - simple_unlock(rqlock); - - return (processor != PROCESSOR_NULL); + + pset_unlock(pset); + + return (processor != PROCESSOR_NULL); } - + static boolean_t sched_grrr_processor_queue_empty(processor_t processor __unused) { boolean_t result; - + result = (processor->grrr_runq.count == 0); - + return result; } @@ -434,10 +431,10 @@ sched_grrr_processor_queue_has_priority(processor_t processor, i = grrr_group_mapping[grrr_priority_mapping[priority]]; for ( ; i < NUM_GRRR_GROUPS; i++) { if (rq->groups[i].count > 0) - return (TRUE); + return TRUE; } - - return (FALSE); + + return FALSE; } /* Implement sched_preempt_pri in code */ @@ -446,13 +443,13 @@ sched_grrr_priority_is_urgent(int priority) { if (priority <= BASEPRI_FOREGROUND) return FALSE; - + if (priority < MINPRI_KERNEL) return TRUE; if (priority >= BASEPRI_PREEMPT) return TRUE; - + return FALSE; } @@ -460,14 +457,12 @@ static ast_t sched_grrr_processor_csw_check(processor_t processor) { int count; - + count = sched_grrr_processor_runq_count(processor); - - if (count > 0) { - + + if (count > 0) return AST_PREEMPT; - } - + return AST_NONE; } @@ -483,7 +478,7 @@ sched_grrr_initial_thread_sched_mode(task_t parent_task) if (parent_task == kernel_task) return TH_MODE_FIXED; else - return TH_MODE_TIMESHARE; + return TH_MODE_TIMESHARE; } static boolean_t @@ -495,7 +490,7 @@ sched_grrr_can_update_priority(thread_t thread __unused) static void sched_grrr_update_priority(thread_t thread __unused) { - + return; } static void @@ -525,7 +520,7 @@ 
sched_grrr_processor_bound_count(__unused processor_t processor) static void sched_grrr_thread_update_scan(__unused sched_update_scan_context_t scan_context) { - + return; } #endif /* defined(CONFIG_SCHED_GRRR) */ @@ -536,34 +531,33 @@ static void grrr_priority_mapping_init(void) { unsigned int i; - + /* Map 0->0 up to 10->20 */ for (i=0; i <= 10; i++) { grrr_priority_mapping[i] = 2*i; } - + /* Map user priorities 11->33 up to 51 -> 153 */ for (i=11; i <= 51; i++) { grrr_priority_mapping[i] = 3*i; } - + /* Map high priorities 52->180 up to 127->255 */ for (i=52; i <= 127; i++) { grrr_priority_mapping[i] = 128 + i; } - + for (i = 0; i < NUM_GRRR_PROPORTIONAL_PRIORITIES; i++) { - -#if 0 + +#if 0 unsigned j, k; /* Calculate log(i); */ for (j=0, k=1; k <= i; j++, k *= 2); #endif - + /* Groups of 4 */ grrr_group_mapping[i] = i >> 2; } - } static thread_t @@ -574,21 +568,21 @@ grrr_intragroup_schedule(grrr_group_t group) if (group->count == 0) { return THREAD_NULL; } - + thread = group->current_client; if (thread == THREAD_NULL) { thread = (thread_t)(void *)queue_first(&group->clients); } - + if (1 /* deficit */) { group->current_client = (thread_t)(void *)queue_next((queue_entry_t)thread); if (queue_end(&group->clients, (queue_entry_t)group->current_client)) { group->current_client = (thread_t)(void *)queue_first(&group->clients); } - + thread = group->current_client; } - + return thread; } @@ -597,31 +591,31 @@ grrr_intergroup_schedule(grrr_run_queue_t rq) { thread_t thread; grrr_group_t group; - + if (rq->count == 0) { return THREAD_NULL; } - + group = rq->current_group; - + if (group == GRRR_GROUP_NULL) { group = (grrr_group_t)queue_first(&rq->sorted_group_list); } - + thread = grrr_intragroup_schedule(group); - + if ((group->work >= (UINT32_MAX-256)) || (rq->last_rescale_tick != grrr_rescale_tick)) { grrr_rescale_work(rq); } group->work++; - + if (queue_end(&rq->sorted_group_list, queue_next((queue_entry_t)group))) { /* last group, go back to beginning */ group = 
(grrr_group_t)queue_first(&rq->sorted_group_list); } else { grrr_group_t nextgroup = (grrr_group_t)queue_next((queue_entry_t)group); uint64_t orderleft, orderright; - + /* * The well-ordering condition for intergroup selection is: * @@ -638,9 +632,9 @@ grrr_intergroup_schedule(grrr_run_queue_t rq) group = (grrr_group_t)queue_first(&rq->sorted_group_list); } } - + rq->current_group = group; - + return thread; } @@ -648,9 +642,9 @@ static void grrr_runqueue_init(grrr_run_queue_t runq) { grrr_group_index_t index; - + runq->count = 0; - + for (index = 0; index < NUM_GRRR_GROUPS; index++) { unsigned int prisearch; @@ -662,7 +656,7 @@ grrr_runqueue_init(grrr_run_queue_t runq) break; } } - + runq->groups[index].index = index; queue_init(&runq->groups[index].clients); @@ -671,7 +665,7 @@ grrr_runqueue_init(grrr_run_queue_t runq) runq->groups[index].work = 0; runq->groups[index].current_client = THREAD_NULL; } - + queue_init(&runq->sorted_group_list); runq->weight = 0; runq->current_group = GRRR_GROUP_NULL; @@ -694,7 +688,7 @@ static boolean_t grrr_enqueue( grrr_run_queue_t rq, thread_t thread) -{ +{ grrr_proportional_priority_t gpriority; grrr_group_index_t gindex; grrr_group_t group; @@ -706,7 +700,7 @@ grrr_enqueue( #if 0 thread->grrr_deficit = 0; #endif - + if (group->count == 0) { /* Empty group, this is the first client */ enqueue_tail(&group->clients, (queue_entry_t)thread); @@ -728,13 +722,13 @@ grrr_enqueue( /* Since there was already a client, this is on the per-processor sorted list already */ remqueue((queue_entry_t)group); } - + grrr_sorted_list_insert_group(rq, group); rq->count++; rq->weight += gpriority; - - return (FALSE); + + return FALSE; } static thread_t @@ -747,11 +741,11 @@ grrr_select(grrr_run_queue_t rq) grrr_proportional_priority_t gpriority; grrr_group_index_t gindex; grrr_group_t group; - + gpriority = grrr_priority_mapping[thread->sched_pri]; gindex = grrr_group_mapping[gpriority]; group = &rq->groups[gindex]; - + 
remqueue((queue_entry_t)thread); SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count); group->count--; @@ -759,7 +753,7 @@ grrr_select(grrr_run_queue_t rq) if (group->current_client == thread) { group->current_client = THREAD_NULL; } - + remqueue((queue_entry_t)group); if (group->count == 0) { if (rq->current_group == group) { @@ -769,30 +763,29 @@ grrr_select(grrr_run_queue_t rq) /* Need to re-insert in sorted location */ grrr_sorted_list_insert_group(rq, group); } - + rq->count--; rq->weight -= gpriority; - + thread->runq = PROCESSOR_NULL; - } - - - return (thread); + } + + return thread; } static void grrr_remove( grrr_run_queue_t rq, thread_t thread) -{ +{ grrr_proportional_priority_t gpriority; grrr_group_index_t gindex; grrr_group_t group; - + gpriority = grrr_priority_mapping[thread->sched_pri]; gindex = grrr_group_mapping[gpriority]; group = &rq->groups[gindex]; - + remqueue((queue_entry_t)thread); SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count); group->count--; @@ -800,7 +793,7 @@ grrr_remove( if (group->current_client == thread) { group->current_client = THREAD_NULL; } - + remqueue((queue_entry_t)group); if (group->count == 0) { if (rq->current_group == group) { @@ -810,10 +803,10 @@ grrr_remove( /* Need to re-insert in sorted location */ grrr_sorted_list_insert_group(rq, group); } - + rq->count--; rq->weight -= gpriority; - + thread->runq = PROCESSOR_NULL; } @@ -826,13 +819,13 @@ grrr_sorted_list_insert_group(grrr_run_queue_t rq, enqueue_tail(&rq->sorted_group_list, (queue_entry_t)group); } else { grrr_group_t search_group; - + /* Start searching from the head (heaviest weight) for the first * element less than us, so we can insert before it */ search_group = (grrr_group_t)queue_first(&rq->sorted_group_list); while (!queue_end(&rq->sorted_group_list, (queue_entry_t)search_group) ) { - + if (search_group->weight < group->weight) { /* we should be before this */ search_group = (grrr_group_t)queue_prev((queue_entry_t)search_group); @@ -844,11 +837,11 @@ 
grrr_sorted_list_insert_group(grrr_run_queue_t rq, break; } } - + /* otherwise, our weight is too small, keep going */ search_group = (grrr_group_t)queue_next((queue_entry_t)search_group); } - + if (queue_end(&rq->sorted_group_list, (queue_entry_t)search_group)) { enqueue_tail(&rq->sorted_group_list, (queue_entry_t)group); } else { diff --git a/osfmk/kern/sched_multiq.c b/osfmk/kern/sched_multiq.c index ac1cc6d24..d37de1f3c 100644 --- a/osfmk/kern/sched_multiq.c +++ b/osfmk/kern/sched_multiq.c @@ -119,7 +119,7 @@ * or can we get away with putting the entry in either one or the other pset? * * Consider the right way to handle runq count - I don't want to iterate groups. - * Perhaps keep a global counter. sched_run_count will not work. + * Perhaps keep a global counter. * Alternate option - remove it from choose_processor. It doesn't add much value * now that we have global runq. * @@ -175,7 +175,7 @@ #endif typedef struct sched_entry { - queue_chain_t links; + queue_chain_t entry_links; int16_t sched_pri; /* scheduled (current) priority */ int16_t runq; int32_t pad; @@ -462,7 +462,10 @@ __attribute__((always_inline)) static inline sched_group_t group_for_entry(sched_entry_t entry) { +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wcast-align" sched_group_t group = (sched_group_t)(entry - entry->sched_pri); +#pragma clang diagnostic pop return group; } @@ -472,9 +475,9 @@ entry_queue_first_entry(entry_queue_t rq) { assert(rq->count != 0); - queue_t queue = rq->queues + rq->highq; + queue_t queue = &rq->queues[rq->highq]; - sched_entry_t entry = (sched_entry_t)queue_first(queue); + sched_entry_t entry = qe_queue_first(queue, struct sched_entry, entry_links); assert(entry->sched_pri == rq->highq); @@ -505,11 +508,12 @@ group_first_thread(sched_group_t group) assert(rq->count != 0); - queue_t queue = rq->queues + rq->highq; + queue_t queue = &rq->queues[rq->highq]; - thread_t thread = (thread_t)(void*)queue_first(queue); + thread_t thread = 
qe_queue_first(queue, struct thread, runq_links); assert(thread != THREAD_NULL); + assert_thread_magic(thread); assert(thread->sched_group == group); @@ -526,12 +530,12 @@ entry_queue_check_entry(entry_queue_t runq, sched_entry_t entry, int expected_pr queue_t q; sched_entry_t elem; - assert(queue_chain_linked(&entry->links)); + assert(queue_chain_linked(&entry->entry_links)); assert(entry->runq == MULTIQ_ERUNQ); q = &runq->queues[expected_pri]; - queue_iterate(q, elem, sched_entry_t, links) { + qe_foreach_element(elem, q, entry_links) { if (elem == entry) return; } @@ -551,7 +555,7 @@ sched_group_check_thread(sched_group_t group, thread_t thread) q = &group->runq.queues[pri]; - queue_iterate(q, elem, thread_t, links) { + qe_foreach_element(elem, q, runq_links) { if (elem == thread) return; } @@ -608,12 +612,12 @@ static sched_entry_t entry_queue_dequeue_entry(entry_queue_t rq) { sched_entry_t sched_entry; - queue_t queue = rq->queues + rq->highq; + queue_t queue = &rq->queues[rq->highq]; assert(rq->count > 0); assert(!queue_empty(queue)); - sched_entry = (sched_entry_t)dequeue_head(queue); + sched_entry = qe_dequeue_head(queue, struct sched_entry, entry_links); SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count); rq->count--; @@ -621,9 +625,8 @@ entry_queue_dequeue_entry(entry_queue_t rq) rq->urgency--; assert(rq->urgency >= 0); } if (queue_empty(queue)) { - if (rq->highq != IDLEPRI) - clrbit(MAXPRI - rq->highq, rq->bitmap); - rq->highq = MAXPRI - ffsbit(rq->bitmap); + rq_bitmap_clear(rq->bitmap, rq->highq); + rq->highq = bitmap_first(rq->bitmap, NRQS); } sched_entry->runq = 0; @@ -641,24 +644,24 @@ entry_queue_enqueue_entry( integer_t options) { int sched_pri = entry->sched_pri; - queue_t queue = rq->queues + sched_pri; + queue_t queue = &rq->queues[sched_pri]; boolean_t result = FALSE; assert(entry->runq == 0); if (queue_empty(queue)) { - enqueue_tail(queue, (queue_entry_t)entry); + enqueue_tail(queue, &entry->entry_links); - setbit(MAXPRI - sched_pri, 
rq->bitmap); + rq_bitmap_set(rq->bitmap, sched_pri); if (sched_pri > rq->highq) { rq->highq = sched_pri; result = TRUE; } } else { if (options & SCHED_TAILQ) - enqueue_tail(queue, (queue_entry_t)entry); + enqueue_tail(queue, &entry->entry_links); else - enqueue_head(queue, (queue_entry_t)entry); + enqueue_head(queue, &entry->entry_links); } if (SCHED(priority_is_urgent)(sched_pri)) rq->urgency++; @@ -686,7 +689,7 @@ entry_queue_remove_entry( } #endif - remqueue((queue_entry_t)entry); + remqueue(&entry->entry_links); SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count); rq->count--; @@ -694,11 +697,10 @@ entry_queue_remove_entry( rq->urgency--; assert(rq->urgency >= 0); } - if (queue_empty(rq->queues + sched_pri)) { + if (queue_empty(&rq->queues[sched_pri])) { /* update run queue status */ - if (sched_pri != IDLEPRI) - clrbit(MAXPRI - sched_pri, rq->bitmap); - rq->highq = MAXPRI - ffsbit(rq->bitmap); + rq_bitmap_clear(rq->bitmap, sched_pri); + rq->highq = bitmap_first(rq->bitmap, NRQS); } entry->runq = 0; @@ -711,19 +713,18 @@ entry_queue_change_entry( integer_t options) { int sched_pri = entry->sched_pri; - queue_t queue = rq->queues + sched_pri; + queue_t queue = &rq->queues[sched_pri]; #if defined(MULTIQ_SANITY_CHECK) if (multiq_sanity_check) { entry_queue_check_entry(rq, entry, sched_pri); } #endif - remqueue((queue_entry_t)entry); if (options & SCHED_TAILQ) - enqueue_tail(queue, (queue_entry_t)entry); + re_queue_tail(queue, &entry->entry_links); else - enqueue_head(queue, (queue_entry_t)entry); + re_queue_head(queue, &entry->entry_links); } /* * The run queue must not be empty. 
@@ -737,14 +738,15 @@ group_run_queue_dequeue_thread( boolean_t *queue_empty) { thread_t thread; - queue_t queue = rq->queues + rq->highq; + queue_t queue = &rq->queues[rq->highq]; assert(rq->count > 0); assert(!queue_empty(queue)); *thread_pri = rq->highq; - thread = (thread_t)(void*)dequeue_head(queue); + thread = qe_dequeue_head(queue, struct thread, runq_links); + assert_thread_magic(thread); SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count); rq->count--; @@ -752,15 +754,14 @@ group_run_queue_dequeue_thread( rq->urgency--; assert(rq->urgency >= 0); } if (queue_empty(queue)) { - if (rq->highq != IDLEPRI) - clrbit(MAXPRI - rq->highq, rq->bitmap); - rq->highq = MAXPRI - ffsbit(rq->bitmap); + rq_bitmap_clear(rq->bitmap, rq->highq); + rq->highq = bitmap_first(rq->bitmap, NRQS); *queue_empty = TRUE; } else { *queue_empty = FALSE; } - return (thread); + return thread; } /* @@ -774,24 +775,25 @@ group_run_queue_enqueue_thread( integer_t thread_pri, integer_t options) { - queue_t queue = rq->queues + thread_pri; + queue_t queue = &rq->queues[thread_pri]; boolean_t result = FALSE; assert(thread->runq == PROCESSOR_NULL); + assert_thread_magic(thread); if (queue_empty(queue)) { - enqueue_tail(queue, (queue_entry_t)thread); + enqueue_tail(queue, &thread->runq_links); - setbit(MAXPRI - thread_pri, rq->bitmap); + rq_bitmap_set(rq->bitmap, thread_pri); if (thread_pri > rq->highq) { rq->highq = thread_pri; } result = TRUE; } else { if (options & SCHED_TAILQ) - enqueue_tail(queue, (queue_entry_t)thread); + enqueue_tail(queue, &thread->runq_links); else - enqueue_head(queue, (queue_entry_t)thread); + enqueue_head(queue, &thread->runq_links); } if (SCHED(priority_is_urgent)(thread_pri)) rq->urgency++; @@ -813,9 +815,10 @@ group_run_queue_remove_thread( { boolean_t result = FALSE; + assert_thread_magic(thread); assert(thread->runq != PROCESSOR_NULL); - remqueue((queue_entry_t)thread); + remqueue(&thread->runq_links); SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count); 
rq->count--; @@ -823,11 +826,10 @@ group_run_queue_remove_thread( rq->urgency--; assert(rq->urgency >= 0); } - if (queue_empty(rq->queues + thread_pri)) { + if (queue_empty(&rq->queues[thread_pri])) { /* update run queue status */ - if (thread_pri != IDLEPRI) - clrbit(MAXPRI - thread_pri, rq->bitmap); - rq->highq = MAXPRI - ffsbit(rq->bitmap); + rq_bitmap_clear(rq->bitmap, thread_pri); + rq->highq = bitmap_first(rq->bitmap, NRQS); result = TRUE; } @@ -1213,7 +1215,13 @@ sched_multiq_processor_queue_has_priority( int priority, boolean_t gte) { - int qpri = MAX(multiq_main_entryq(processor)->highq, multiq_bound_runq(processor)->highq); + run_queue_t main_runq = multiq_main_entryq(processor); + run_queue_t bound_runq = multiq_bound_runq(processor); + + if (main_runq->count == 0 && bound_runq->count == 0) + return FALSE; + + int qpri = MAX(main_runq->highq, bound_runq->highq); if (gte) return qpri >= priority; @@ -1278,12 +1286,15 @@ sched_multiq_processor_queue_shutdown(processor_t processor) while (main_entryq->count > 0) { thread = sched_global_dequeue_thread(main_entryq); - enqueue_tail(&tqueue, (queue_entry_t)thread); + enqueue_tail(&tqueue, &thread->runq_links); } pset_unlock(pset); - while ((thread = (thread_t)(void*)dequeue_head(&tqueue)) != THREAD_NULL) { + qe_foreach_element_safe(thread, &tqueue, runq_links) { + + remqueue(&thread->runq_links); + thread_lock(thread); thread_setrun(thread, SCHED_TAILQ); @@ -1346,27 +1357,36 @@ sched_multiq_steal_thread(processor_set_t pset) * Scan the global queue for candidate groups, and scan those groups for * candidate threads. * + * TODO: This iterates every group runq in its entirety for each entry it has in the runq, which is O(N^2) + * Instead, iterate only the queue in the group runq matching the priority of the entry. + * * Returns TRUE if retry is needed. 
*/ static boolean_t group_scan(entry_queue_t runq, sched_update_scan_context_t scan_context) { - int count; - queue_t q; - sched_group_t group; - sched_entry_t entry; - - if ((count = runq->count) > 0) { - q = runq->queues + runq->highq; - while (count > 0) { - queue_iterate(q, entry, sched_entry_t, links) { - group = group_for_entry(entry); - if (group->runq.count > 0) { - if (runq_scan(&group->runq, scan_context)) - return (TRUE); - } - count--; + int count = runq->count; + int queue_index; + + assert(count >= 0); + + if (count == 0) + return FALSE; + + for (queue_index = bitmap_first(runq->bitmap, NRQS); + queue_index >= 0; + queue_index = bitmap_next(runq->bitmap, queue_index)) { + + sched_entry_t entry; + + qe_foreach_element(entry, &runq->queues[queue_index], entry_links) { + assert(count > 0); + + sched_group_t group = group_for_entry(entry); + if (group->runq.count > 0) { + if (runq_scan(&group->runq, scan_context)) + return (TRUE); } - q--; + count--; } } diff --git a/osfmk/kern/sched_prim.c b/osfmk/kern/sched_prim.c index 8c70db47d..2b2a98d68 100644 --- a/osfmk/kern/sched_prim.c +++ b/osfmk/kern/sched_prim.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2012 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -82,6 +82,7 @@ #endif #include +#include #include #include #include @@ -102,6 +103,7 @@ #include #include #include +#include #include #include @@ -110,13 +112,11 @@ #include #include +#include +#include #include -#if defined(CONFIG_TELEMETRY) && defined(CONFIG_SCHED_TIMESHARE_CORE) -#include -#endif - struct rt_queue rt_runq; uintptr_t sched_thread_on_rt_queue = (uintptr_t)0xDEAFBEE0; @@ -175,15 +175,9 @@ uint32_t min_rt_quantum; unsigned sched_tick; uint32_t sched_tick_interval; -#if defined(CONFIG_TELEMETRY) -uint32_t sched_telemetry_interval; -#endif /* CONFIG_TELEMETRY */ -uint32_t sched_pri_shift = INT8_MAX; -uint32_t sched_background_pri_shift = INT8_MAX; -uint32_t sched_combined_fgbg_pri_shift = INT8_MAX; +uint32_t sched_pri_shifts[TH_BUCKET_MAX]; uint32_t sched_fixed_shift; -uint32_t sched_use_combined_fgbg_decay = 0; uint32_t sched_decay_usage_age_factor = 1; /* accelerate 5/8^n usage aging */ @@ -207,9 +201,6 @@ thread_t sched_maintenance_thread; uint64_t sched_one_second_interval; -uint32_t sched_run_count, sched_share_count, sched_background_count; -uint32_t sched_load_average, sched_mach_factor; - /* Forwards */ #if defined(CONFIG_SCHED_TIMESHARE_CORE) @@ -270,7 +261,7 @@ sched_vm_group_maintenance(void); #if defined(CONFIG_SCHED_TIMESHARE_CORE) int8_t sched_load_shifts[NRQS]; -int sched_preempt_pri[NRQBM]; +bitmap_t sched_preempt_pri[BITMAP_LEN(NRQS)]; #endif /* CONFIG_SCHED_TIMESHARE_CORE */ const struct sched_dispatch_table *sched_current_dispatch = NULL; @@ -465,21 +456,17 @@ sched_timeshare_timebase_init(void) abstime >>= 1; sched_fixed_shift = shift; + for (uint32_t i = 0 ; i < TH_BUCKET_MAX ; i++) + sched_pri_shifts[i] = INT8_MAX; + max_unsafe_computation = ((uint64_t)max_unsafe_quanta) * std_quantum; sched_safe_duration = 2 * ((uint64_t)max_unsafe_quanta) * std_quantum; - + max_poll_computation = ((uint64_t)max_poll_quanta) * std_quantum; thread_depress_time = 1 * std_quantum; 
default_timeshare_computation = std_quantum / 2; default_timeshare_constraint = std_quantum; -#if defined(CONFIG_TELEMETRY) - /* interval for high frequency telemetry */ - clock_interval_to_absolutetime_interval(10, NSEC_PER_MSEC, &abstime); - assert((abstime >> 32) == 0 && (uint32_t)abstime != 0); - sched_telemetry_interval = (uint32_t)abstime; -#endif - } #endif /* CONFIG_SCHED_TIMESHARE_CORE */ @@ -533,10 +520,6 @@ load_shift_init(void) kprintf("Overriding scheduler decay usage age factor %u\n", sched_decay_usage_age_factor); } - if (PE_parse_boot_argn("sched_use_combined_fgbg_decay", &sched_use_combined_fgbg_decay, sizeof (sched_use_combined_fgbg_decay))) { - kprintf("Overriding schedule fg/bg decay calculation: %u\n", sched_use_combined_fgbg_decay); - } - if (sched_decay_penalty == 0) { /* * There is no penalty for timeshare threads for using too much @@ -569,13 +552,13 @@ load_shift_init(void) static void preempt_pri_init(void) { - int i, *p = sched_preempt_pri; + bitmap_t *p = sched_preempt_pri; - for (i = BASEPRI_FOREGROUND; i < MINPRI_KERNEL; ++i) - setbit(i, p); + for (int i = BASEPRI_FOREGROUND; i < MINPRI_KERNEL; ++i) + bitmap_set(p, i); - for (i = BASEPRI_PREEMPT; i <= MAXPRI; ++i) - setbit(i, p); + for (int i = BASEPRI_PREEMPT; i <= MAXPRI; ++i) + bitmap_set(p, i); } #endif /* CONFIG_SCHED_TIMESHARE_CORE */ @@ -591,6 +574,8 @@ thread_timer_expire( thread_t thread = p0; spl_t s; + assert_thread_magic(thread); + s = splsched(); thread_lock(thread); if (--thread->wait_timer_active == 0) { @@ -651,19 +636,12 @@ thread_unblock( (*thread->sched_call)(SCHED_CALL_UNBLOCK, thread); - /* - * Update run counts. - */ + /* Update the runnable thread count */ new_run_count = sched_run_incr(thread); - if (thread->sched_mode == TH_MODE_TIMESHARE) { - sched_share_incr(thread); - - if (thread->sched_flags & TH_SFLAG_THROTTLED) - sched_background_incr(thread); - } } else { /* - * Signal if idling on another processor. 
+ * Either the thread is idling in place on another processor, + * or it hasn't finished context switching yet. */ #if CONFIG_SCHED_IDLE_IN_PLACE if (thread->state & TH_IDLE) { @@ -675,8 +653,11 @@ thread_unblock( #else assert((thread->state & TH_IDLE) == 0); #endif - - new_run_count = sched_run_count; /* updated in thread_select_idle() */ + /* + * The run count is only dropped after the context switch completes + * and the thread is still waiting, so we should not run_incr here + */ + new_run_count = sched_run_buckets[TH_BUCKET_RUN]; } @@ -745,7 +726,8 @@ thread_unblock( KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED,MACH_MAKE_RUNNABLE) | DBG_FUNC_NONE, - (uintptr_t)thread_tid(thread), thread->sched_pri, thread->wait_result, new_run_count, 0); + (uintptr_t)thread_tid(thread), thread->sched_pri, thread->wait_result, + sched_run_buckets[TH_BUCKET_RUN], 0); DTRACE_SCHED2(wakeup, struct thread *, thread, struct proc *, thread->task->bsd_info); @@ -770,6 +752,8 @@ thread_go( thread_t thread, wait_result_t wresult) { + assert_thread_magic(thread); + assert(thread->at_safe_point == FALSE); assert(thread->wait_event == NO_EVENT64); assert(thread->waitq == NULL); @@ -778,8 +762,13 @@ thread_go( assert(thread->state & TH_WAIT); - if (thread_unblock(thread, wresult)) + if (thread_unblock(thread, wresult)) { +#if SCHED_TRACE_THREAD_WAKEUPS + backtrace(&thread->thread_wakeup_bt[0], + (sizeof(thread->thread_wakeup_bt)/sizeof(uintptr_t))); +#endif thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ); + } return (KERN_SUCCESS); } @@ -801,7 +790,6 @@ thread_mark_wait_locked( { boolean_t at_safe_point; - assert(thread == current_thread()); assert(!(thread->state & (TH_WAIT|TH_IDLE|TH_UNINT|TH_TERMINATE2))); /* @@ -905,6 +893,18 @@ assert_wait( return waitq_assert_wait64(waitq, CAST_EVENT64_T(event), interruptible, TIMEOUT_WAIT_FOREVER); } +/* + * assert_wait_queue: + * + * Return the global waitq for the specified event + */ +struct waitq * +assert_wait_queue( + 
event_t event) +{ + return global_eventq(event); +} + wait_result_t assert_wait_timeout( event_t event, @@ -925,7 +925,6 @@ assert_wait_timeout( s = splsched(); waitq_lock(waitq); - thread_lock(thread); clock_interval_to_deadline(interval, scale_factor, &deadline); @@ -939,7 +938,6 @@ assert_wait_timeout( deadline, TIMEOUT_NO_LEEWAY, thread); - thread_unlock(thread); waitq_unlock(waitq); splx(s); return wresult; @@ -976,7 +974,6 @@ assert_wait_timeout_with_leeway( s = splsched(); waitq_lock(waitq); - thread_lock(thread); KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE, @@ -987,7 +984,6 @@ assert_wait_timeout_with_leeway( urgency, deadline, slop, thread); - thread_unlock(thread); waitq_unlock(waitq); splx(s); return wresult; @@ -1011,7 +1007,6 @@ assert_wait_deadline( s = splsched(); waitq_lock(waitq); - thread_lock(thread); KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE, @@ -1021,7 +1016,6 @@ assert_wait_deadline( interruptible, TIMEOUT_URGENCY_SYS_NORMAL, deadline, TIMEOUT_NO_LEEWAY, thread); - thread_unlock(thread); waitq_unlock(waitq); splx(s); return wresult; @@ -1047,7 +1041,6 @@ assert_wait_deadline_with_leeway( s = splsched(); waitq_lock(waitq); - thread_lock(thread); KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE, @@ -1057,8 +1050,6 @@ assert_wait_deadline_with_leeway( interruptible, urgency, deadline, leeway, thread); - - thread_unlock(thread); waitq_unlock(waitq); splx(s); return wresult; @@ -1311,21 +1302,19 @@ clear_wait_internal( thread_t thread, wait_result_t wresult) { - uint32_t i = LockTimeOut; + uint32_t i = LockTimeOutUsec; struct waitq *waitq = thread->waitq; - + do { if (wresult == THREAD_INTERRUPTED && (thread->state & TH_UNINT)) return (KERN_FAILURE); if (waitq != NULL) { - assert(waitq_irq_safe(waitq)); //irqs are already disabled! 
- if (waitq_lock_try(waitq)) { - waitq_pull_thread_locked(waitq, thread); - waitq_unlock(waitq); - } else { + if (!waitq_pull_thread_locked(waitq, thread)) { thread_unlock(thread); delay(1); + if (i > 0 && !machine_timeout_suspended()) + i--; thread_lock(thread); if (waitq != thread->waitq) return KERN_NOT_WAITING; @@ -1338,7 +1327,7 @@ clear_wait_internal( return (thread_go(thread, wresult)); else return (KERN_NOT_WAITING); - } while ((--i > 0) || machine_timeout_suspended()); + } while (i > 0); panic("clear_wait_internal: deadlock: thread=%p, wq=%p, cpu=%d\n", thread, waitq, cpu_number()); @@ -1383,33 +1372,72 @@ clear_wait( */ kern_return_t thread_wakeup_prim( - event_t event, - boolean_t one_thread, - wait_result_t result) + event_t event, + boolean_t one_thread, + wait_result_t result) { - return (thread_wakeup_prim_internal(event, one_thread, result, -1)); + if (__improbable(event == NO_EVENT)) + panic("%s() called with NO_EVENT", __func__); + + struct waitq *wq = global_eventq(event); + + if (one_thread) + return waitq_wakeup64_one(wq, CAST_EVENT64_T(event), result, WAITQ_ALL_PRIORITIES); + else + return waitq_wakeup64_all(wq, CAST_EVENT64_T(event), result, WAITQ_ALL_PRIORITIES); } +/* + * Wakeup a specified thread if and only if it's waiting for this event + */ +kern_return_t +thread_wakeup_thread( + event_t event, + thread_t thread) +{ + if (__improbable(event == NO_EVENT)) + panic("%s() called with NO_EVENT", __func__); + + struct waitq *wq = global_eventq(event); + + return waitq_wakeup64_thread(wq, CAST_EVENT64_T(event), thread, THREAD_AWAKENED); +} +/* + * Wakeup a thread waiting on an event and promote it to a priority. + * + * Requires woken thread to un-promote itself when done. 
+ */ kern_return_t -thread_wakeup_prim_internal( - event_t event, - boolean_t one_thread, - wait_result_t result, - int priority) +thread_wakeup_one_with_pri( + event_t event, + int priority) { if (__improbable(event == NO_EVENT)) panic("%s() called with NO_EVENT", __func__); - struct waitq *wq; + struct waitq *wq = global_eventq(event); - wq = global_eventq(event); - priority = (priority == -1 ? WAITQ_ALL_PRIORITIES : priority); + return waitq_wakeup64_one(wq, CAST_EVENT64_T(event), THREAD_AWAKENED, priority); +} - if (one_thread) - return waitq_wakeup64_one(wq, CAST_EVENT64_T(event), result, priority); - else - return waitq_wakeup64_all(wq, CAST_EVENT64_T(event), result, priority); +/* + * Wakeup a thread waiting on an event, + * promote it to a priority, + * and return a reference to the woken thread. + * + * Requires woken thread to un-promote itself when done. + */ +thread_t +thread_wakeup_identify(event_t event, + int priority) +{ + if (__improbable(event == NO_EVENT)) + panic("%s() called with NO_EVENT", __func__); + + struct waitq *wq = global_eventq(event); + + return waitq_wakeup64_identify(wq, CAST_EVENT64_T(event), THREAD_AWAKENED, priority); } /* @@ -1665,9 +1693,7 @@ sched_SMT_balance(processor_t cprocessor, processor_set_t cpset) { processor_t sprocessor; - sprocessor = (processor_t)queue_first(&cpset->active_queue); - - while (!queue_end(&cpset->active_queue, (queue_entry_t)sprocessor)) { + qe_foreach_element(sprocessor, &cpset->active_queue, processor_queue) { if ((sprocessor->state == PROCESSOR_RUNNING) && (sprocessor->processor_primary != sprocessor) && (sprocessor->processor_primary->state == PROCESSOR_RUNNING) && @@ -1677,7 +1703,6 @@ sched_SMT_balance(processor_t cprocessor, processor_set_t cpset) { ast_processor = sprocessor; break; } - sprocessor = (processor_t)queue_next((queue_entry_t)sprocessor); } smt_balance_exit: @@ -1769,9 +1794,7 @@ thread_select( */ if (thread->sched_pri >= BASEPRI_RTQUEUES && processor->first_timeslice) { if 
(rt_runq.count > 0) { - thread_t next_rt; - - next_rt = (thread_t)queue_first(&rt_runq.queue); + thread_t next_rt = qe_queue_first(&rt_runq.queue, struct thread, runq_links); assert(next_rt->runq == THREAD_ON_RT_RUNQ); @@ -1806,14 +1829,14 @@ thread_select( /* OK, so we're not going to run the current thread. Look at the RT queue. */ if (rt_runq.count > 0) { - thread_t next_rt = (thread_t)queue_first(&rt_runq.queue); + thread_t next_rt = qe_queue_first(&rt_runq.queue, struct thread, runq_links); assert(next_rt->runq == THREAD_ON_RT_RUNQ); if (__probable((next_rt->bound_processor == PROCESSOR_NULL || (next_rt->bound_processor == processor)))) { pick_new_rt_thread: - new_thread = (thread_t)dequeue_head(&rt_runq.queue); + new_thread = qe_dequeue_head(&rt_runq.queue, struct thread, runq_links); new_thread->runq = PROCESSOR_NULL; SCHED_STATS_RUNQ_CHANGE(&rt_runq.runq_stats, rt_runq.count); @@ -1865,14 +1888,12 @@ thread_select( * was running. */ if (processor->state == PROCESSOR_RUNNING) { - remqueue((queue_entry_t)processor); processor->state = PROCESSOR_IDLE; if (processor->processor_primary == processor) { - enqueue_head(&pset->idle_queue, (queue_entry_t)processor); - } - else { - enqueue_head(&pset->idle_secondary_queue, (queue_entry_t)processor); + re_queue_head(&pset->idle_queue, &processor->processor_queue); + } else { + re_queue_head(&pset->idle_secondary_queue, &processor->processor_queue); } } @@ -1933,12 +1954,6 @@ thread_select_idle( uint64_t arg1, arg2; int urgency; - if (thread->sched_mode == TH_MODE_TIMESHARE) { - if (thread->sched_flags & TH_SFLAG_THROTTLED) - sched_background_decr(thread); - - sched_share_decr(thread); - } sched_run_decr(thread); thread->state |= TH_IDLE; @@ -2011,12 +2026,6 @@ thread_select_idle( thread_tell_urgency(urgency, arg1, arg2, 0, new_thread); sched_run_incr(thread); - if (thread->sched_mode == TH_MODE_TIMESHARE) { - sched_share_incr(thread); - - if (thread->sched_flags & TH_SFLAG_THROTTLED) - sched_background_incr(thread); - 
} return (new_thread); } @@ -2063,12 +2072,14 @@ thread_invoke( sched_timeshare_consider_maintenance(ctime); #endif + assert_thread_magic(self); assert(self == current_thread()); assert(self->runq == PROCESSOR_NULL); assert((self->state & (TH_RUN|TH_TERMINATE2)) == TH_RUN); thread_lock(thread); + assert_thread_magic(thread); assert((thread->state & (TH_RUN|TH_WAIT|TH_UNINT|TH_TERMINATE|TH_TERMINATE2)) == TH_RUN); assert(thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == current_processor()); assert(thread->runq == PROCESSOR_NULL); @@ -2153,6 +2164,10 @@ thread_invoke( DTRACE_SCHED(on__cpu); +#if KPERF + kperf_on_cpu(thread, continuation, NULL); +#endif /* KPERF */ + thread_dispatch(self, thread); thread->continuation = thread->parameter = NULL; @@ -2172,6 +2187,10 @@ thread_invoke( thread_unlock(self); +#if KPERF + kperf_on_cpu(thread, continuation, NULL); +#endif /* KPERF */ + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED) | DBG_FUNC_NONE, self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0); @@ -2281,6 +2300,10 @@ thread_invoke( DTRACE_SCHED(on__cpu); +#if KPERF + kperf_on_cpu(self, NULL, __builtin_frame_address(0)); +#endif /* KPERF */ + /* * We have been resumed and are set to run. */ @@ -2318,7 +2341,7 @@ pset_cancel_deferred_dispatch( uint32_t sampled_sched_run_count; pset_lock(pset); - sampled_sched_run_count = (volatile uint32_t) sched_run_count; + sampled_sched_run_count = (volatile uint32_t) sched_run_buckets[TH_BUCKET_RUN]; /* * If we have emptied the run queue, and our current thread is runnable, we @@ -2375,7 +2398,7 @@ pset_cancel_deferred_dispatch( * The tail? At the (relative) old position in the * queue? Or something else entirely? 
*/ - re_queue_head(&pset->idle_queue, (queue_entry_t)active_processor); + re_queue_head(&pset->idle_queue, &active_processor->processor_queue); assert(active_processor->next_thread == THREAD_NULL); @@ -2431,8 +2454,9 @@ thread_dispatch( if (thread->state & TH_IDLE) { KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, - MACHDBG_CODE(DBG_MACH_SCHED,MACH_DISPATCH) | DBG_FUNC_NONE, - (uintptr_t)thread_tid(thread), 0, thread->state, sched_run_count, 0); + MACHDBG_CODE(DBG_MACH_SCHED,MACH_DISPATCH) | DBG_FUNC_NONE, + (uintptr_t)thread_tid(thread), 0, thread->state, + sched_run_buckets[TH_BUCKET_RUN], 0); } else { int64_t consumed; int64_t remainder = 0; @@ -2467,9 +2491,24 @@ thread_dispatch( thread_lock(thread); /* - * Compute remainder of current quantum. + * Apply a priority floor if the thread holds a kernel resource + * Do this before checking starting_pri to avoid overpenalizing + * repeated rwlock blockers. + */ + if (__improbable(thread->rwlock_count != 0)) + lck_rw_set_promotion_locked(thread); + + boolean_t keep_quantum = processor->first_timeslice; + + /* + * Treat a thread which has dropped priority since it got on core + * as having expired its quantum. */ - if (processor->first_timeslice && + if (processor->starting_pri > thread->sched_pri) + keep_quantum = FALSE; + + /* Compute remainder of current quantum. 
*/ + if (keep_quantum && processor->quantum_end > processor->last_dispatch) thread->quantum_remaining = (uint32_t)remainder; else @@ -2523,28 +2562,6 @@ thread_dispatch( thread->computation_metered += (processor->last_dispatch - thread->computation_epoch); - if ((thread->rwlock_count != 0) && !(LcksOpts & disLkRWPrio)) { - integer_t priority; - - priority = thread->sched_pri; - - if (priority < thread->base_pri) - priority = thread->base_pri; - if (priority < BASEPRI_BACKGROUND) - priority = BASEPRI_BACKGROUND; - - if ((thread->sched_pri < priority) || !(thread->sched_flags & TH_SFLAG_RW_PROMOTED)) { - KERNEL_DEBUG_CONSTANT( - MACHDBG_CODE(DBG_MACH_SCHED, MACH_RW_PROMOTE) | DBG_FUNC_NONE, - (uintptr_t)thread_tid(thread), thread->sched_pri, thread->base_pri, priority, 0); - - thread->sched_flags |= TH_SFLAG_RW_PROMOTED; - - if (thread->sched_pri < priority) - set_sched_pri(thread, priority); - } - } - if (!(thread->state & TH_WAIT)) { /* * Still runnable. @@ -2561,8 +2578,9 @@ thread_dispatch( thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ); KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, - MACHDBG_CODE(DBG_MACH_SCHED,MACH_DISPATCH) | DBG_FUNC_NONE, - (uintptr_t)thread_tid(thread), thread->reason, thread->state, sched_run_count, 0); + MACHDBG_CODE(DBG_MACH_SCHED,MACH_DISPATCH) | DBG_FUNC_NONE, + (uintptr_t)thread_tid(thread), thread->reason, thread->state, + sched_run_buckets[TH_BUCKET_RUN], 0); if (thread->wake_active) { thread->wake_active = FALSE; @@ -2594,12 +2612,6 @@ thread_dispatch( thread->last_made_runnable_time = ~0ULL; thread->chosen_processor = PROCESSOR_NULL; - if (thread->sched_mode == TH_MODE_TIMESHARE) { - if (thread->sched_flags & TH_SFLAG_THROTTLED) - sched_background_decr(thread); - - sched_share_decr(thread); - } new_run_count = sched_run_decr(thread); #if CONFIG_SCHED_SFI @@ -2613,8 +2625,9 @@ thread_dispatch( machine_thread_going_off_core(thread, should_terminate); KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, - MACHDBG_CODE(DBG_MACH_SCHED,MACH_DISPATCH) 
| DBG_FUNC_NONE, - (uintptr_t)thread_tid(thread), thread->reason, thread->state, new_run_count, 0); + MACHDBG_CODE(DBG_MACH_SCHED,MACH_DISPATCH) | DBG_FUNC_NONE, + (uintptr_t)thread_tid(thread), thread->reason, thread->state, + new_run_count, 0); (*thread->sched_call)(SCHED_CALL_BLOCK, thread); @@ -2652,7 +2665,7 @@ thread_dispatch( } #endif - assert(processor->last_dispatch >= self->last_made_runnable_time); + assertf(processor->last_dispatch >= self->last_made_runnable_time, "Non-monotonic time? dispatch at 0x%llx, runnable at 0x%llx", processor->last_dispatch, self->last_made_runnable_time); latency = processor->last_dispatch - self->last_made_runnable_time; urgency = thread_get_urgency(self, &arg1, &arg2); @@ -2685,6 +2698,7 @@ thread_dispatch( self->computation_epoch = processor->last_dispatch; self->reason = AST_NONE; + processor->starting_pri = self->sched_pri; thread_unlock(self); @@ -2693,7 +2707,7 @@ thread_dispatch( * TODO: Can we state that redispatching our old thread is also * uninteresting? 
*/ - if ((((volatile uint32_t)sched_run_count) == 1) && + if ((((volatile uint32_t)sched_run_buckets[TH_BUCKET_RUN]) == 1) && !(self->state & TH_IDLE)) { pset_cancel_deferred_dispatch(processor->processor_set, processor); } @@ -2839,6 +2853,10 @@ thread_continue( continuation = self->continuation; parameter = self->parameter; +#if KPERF + kperf_on_cpu(self, continuation, NULL); +#endif + thread_dispatch(thread, self); self->continuation = self->parameter = NULL; @@ -2864,10 +2882,10 @@ thread_quantum_init(thread_t thread) uint32_t sched_timeshare_initial_quantum_size(thread_t thread) { - if ((thread == THREAD_NULL) || !(thread->sched_flags & TH_SFLAG_THROTTLED)) - return std_quantum; - else + if ((thread != THREAD_NULL) && thread->th_sched_bucket == TH_BUCKET_SHARE_BG) return bg_quantum; + else + return std_quantum; } /* @@ -2879,14 +2897,11 @@ void run_queue_init( run_queue_t rq) { - int i; - - rq->highq = IDLEPRI; - for (i = 0; i < NRQBM; i++) + rq->highq = NOPRI; + for (u_int i = 0; i < BITMAP_LEN(NRQS); i++) rq->bitmap[i] = 0; - setbit(MAXPRI - IDLEPRI, rq->bitmap); rq->urgency = rq->count = 0; - for (i = 0; i < NRQS; i++) + for (int i = 0; i < NRQS; i++) queue_init(&rq->queues[i]); } @@ -2901,19 +2916,21 @@ run_queue_init( */ thread_t run_queue_dequeue( - run_queue_t rq, - integer_t options) + run_queue_t rq, + integer_t options) { - thread_t thread; - queue_t queue = rq->queues + rq->highq; + thread_t thread; + queue_t queue = &rq->queues[rq->highq]; if (options & SCHED_HEADQ) { - thread = (thread_t)dequeue_head(queue); - } - else { - thread = (thread_t)dequeue_tail(queue); + thread = qe_dequeue_head(queue, struct thread, runq_links); + } else { + thread = qe_dequeue_tail(queue, struct thread, runq_links); } + assert(thread != THREAD_NULL); + assert_thread_magic(thread); + thread->runq = PROCESSOR_NULL; SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count); rq->count--; @@ -2921,12 +2938,11 @@ run_queue_dequeue( rq->urgency--; assert(rq->urgency >= 0); } if 
(queue_empty(queue)) { - if (rq->highq != IDLEPRI) - clrbit(MAXPRI - rq->highq, rq->bitmap); - rq->highq = MAXPRI - ffsbit(rq->bitmap); + bitmap_clear(rq->bitmap, rq->highq); + rq->highq = bitmap_first(rq->bitmap, NRQS); } - return (thread); + return thread; } /* @@ -2939,34 +2955,35 @@ run_queue_dequeue( */ boolean_t run_queue_enqueue( - run_queue_t rq, - thread_t thread, - integer_t options) + run_queue_t rq, + thread_t thread, + integer_t options) { - queue_t queue = rq->queues + thread->sched_pri; - boolean_t result = FALSE; - + queue_t queue = &rq->queues[thread->sched_pri]; + boolean_t result = FALSE; + + assert_thread_magic(thread); + if (queue_empty(queue)) { - enqueue_tail(queue, (queue_entry_t)thread); - - setbit(MAXPRI - thread->sched_pri, rq->bitmap); + enqueue_tail(queue, &thread->runq_links); + + rq_bitmap_set(rq->bitmap, thread->sched_pri); if (thread->sched_pri > rq->highq) { rq->highq = thread->sched_pri; result = TRUE; } } else { if (options & SCHED_TAILQ) - enqueue_tail(queue, (queue_entry_t)thread); + enqueue_tail(queue, &thread->runq_links); else - enqueue_head(queue, (queue_entry_t)thread); + enqueue_head(queue, &thread->runq_links); } if (SCHED(priority_is_urgent)(thread->sched_pri)) rq->urgency++; SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count); rq->count++; - + return (result); - } /* @@ -2978,24 +2995,25 @@ run_queue_enqueue( */ void run_queue_remove( - run_queue_t rq, - thread_t thread) + run_queue_t rq, + thread_t thread) { + assert(thread->runq != PROCESSOR_NULL); + assert_thread_magic(thread); - remqueue((queue_entry_t)thread); + remqueue(&thread->runq_links); SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count); rq->count--; if (SCHED(priority_is_urgent)(thread->sched_pri)) { rq->urgency--; assert(rq->urgency >= 0); } - - if (queue_empty(rq->queues + thread->sched_pri)) { + + if (queue_empty(&rq->queues[thread->sched_pri])) { /* update run queue status */ - if (thread->sched_pri != IDLEPRI) - clrbit(MAXPRI - thread->sched_pri, 
rq->bitmap); - rq->highq = MAXPRI - ffsbit(rq->bitmap); + bitmap_clear(rq->bitmap, thread->sched_pri); + rq->highq = bitmap_first(rq->bitmap, NRQS); } - + thread->runq = PROCESSOR_NULL; } @@ -3009,7 +3027,7 @@ rt_runq_scan(sched_update_scan_context_t scan_context) s = splsched(); rt_lock_lock(); - qe_foreach_element_safe(thread, &rt_runq.queue, links) { + qe_foreach_element_safe(thread, &rt_runq.queue, runq_links) { if (thread->last_made_runnable_time < scan_context->earliest_rt_make_runnable_time) { scan_context->earliest_rt_make_runnable_time = thread->last_made_runnable_time; } @@ -3026,36 +3044,34 @@ rt_runq_scan(sched_update_scan_context_t scan_context) * Enqueue a thread for realtime execution. */ static boolean_t -realtime_queue_insert( - thread_t thread) +realtime_queue_insert(thread_t thread) { - queue_t queue = &rt_runq.queue; - uint64_t deadline = thread->realtime.deadline; - boolean_t preempt = FALSE; + queue_t queue = &rt_runq.queue; + uint64_t deadline = thread->realtime.deadline; + boolean_t preempt = FALSE; rt_lock_lock(); if (queue_empty(queue)) { - enqueue_tail(queue, (queue_entry_t)thread); + enqueue_tail(queue, &thread->runq_links); preempt = TRUE; - } - else { - register thread_t entry = (thread_t)queue_first(queue); - - while (TRUE) { - if ( queue_end(queue, (queue_entry_t)entry) || - deadline < entry->realtime.deadline ) { - entry = (thread_t)queue_prev((queue_entry_t)entry); + } else { + /* Insert into rt_runq in thread deadline order */ + queue_entry_t iter; + qe_foreach(iter, queue) { + thread_t iter_thread = qe_element(iter, struct thread, runq_links); + assert_thread_magic(iter_thread); + + if (deadline < iter_thread->realtime.deadline) { + if (iter == queue_first(queue)) + preempt = TRUE; + insque(&thread->runq_links, queue_prev(iter)); + break; + } else if (iter == queue_last(queue)) { + enqueue_tail(queue, &thread->runq_links); break; } - - entry = (thread_t)queue_next((queue_entry_t)entry); } - - if ((queue_entry_t)entry == queue) - 
preempt = TRUE; - - insque((queue_entry_t)thread, (queue_entry_t)entry); } thread->runq = THREAD_ON_RT_RUNQ; @@ -3095,8 +3111,7 @@ realtime_setrun( */ if ( (thread->bound_processor == processor) && processor->state == PROCESSOR_IDLE) { - remqueue((queue_entry_t)processor); - enqueue_tail(&pset->active_queue, (queue_entry_t)processor); + re_queue_tail(&pset->active_queue, &processor->processor_queue); processor->next_thread = thread; processor->current_pri = thread->sched_pri; @@ -3131,8 +3146,8 @@ realtime_setrun( if (preempt != AST_NONE) { if (processor->state == PROCESSOR_IDLE) { - remqueue((queue_entry_t)processor); - enqueue_tail(&pset->active_queue, (queue_entry_t)processor); + re_queue_tail(&pset->active_queue, &processor->processor_queue); + processor->next_thread = THREAD_NULL; processor->current_pri = thread->sched_pri; processor->current_thmode = thread->sched_mode; @@ -3185,7 +3200,7 @@ realtime_setrun( boolean_t priority_is_urgent(int priority) { - return testbit(priority, sched_preempt_pri) ? TRUE : FALSE; + return bitmap_test(sched_preempt_pri, priority) ? 
TRUE : FALSE; } #endif /* CONFIG_SCHED_TIMESHARE_CORE */ @@ -3220,8 +3235,8 @@ processor_setrun( if ( (SCHED(direct_dispatch_to_idle_processors) || thread->bound_processor == processor) && processor->state == PROCESSOR_IDLE) { - remqueue((queue_entry_t)processor); - enqueue_tail(&pset->active_queue, (queue_entry_t)processor); + + re_queue_tail(&pset->active_queue, &processor->processor_queue); processor->next_thread = thread; processor->current_pri = thread->sched_pri; @@ -3268,8 +3283,8 @@ processor_setrun( if (preempt != AST_NONE) { if (processor->state == PROCESSOR_IDLE) { - remqueue((queue_entry_t)processor); - enqueue_tail(&pset->active_queue, (queue_entry_t)processor); + re_queue_tail(&pset->active_queue, &processor->processor_queue); + processor->next_thread = THREAD_NULL; processor->current_pri = thread->sched_pri; processor->current_thmode = thread->sched_mode; @@ -3300,8 +3315,8 @@ processor_setrun( ipi_action = eInterruptRunning; } else if ( processor->state == PROCESSOR_IDLE && processor != current_processor() ) { - remqueue((queue_entry_t)processor); - enqueue_tail(&pset->active_queue, (queue_entry_t)processor); + re_queue_tail(&pset->active_queue, &processor->processor_queue); + processor->next_thread = THREAD_NULL; processor->current_pri = thread->sched_pri; processor->current_thmode = thread->sched_mode; @@ -3417,7 +3432,9 @@ choose_processor( thread_t thread) { processor_set_t nset, cset = pset; - + + assert(thread->sched_pri <= BASEPRI_RTQUEUES); + /* * Prefer the hinted processor, when appropriate. */ @@ -3468,7 +3485,6 @@ choose_processor( * the "least cost idle" processor above. 
*/ return (processor); - break; case PROCESSOR_RUNNING: case PROCESSOR_DISPATCHING: /* @@ -3603,12 +3619,12 @@ choose_processor( if (thread->sched_pri > lowest_unpaired_primary_priority) { /* Move to end of active queue so that the next thread doesn't also pick it */ - re_queue_tail(&cset->active_queue, (queue_entry_t)lp_unpaired_primary_processor); + re_queue_tail(&cset->active_queue, &lp_unpaired_primary_processor->processor_queue); return lp_unpaired_primary_processor; } if (thread->sched_pri > lowest_priority) { /* Move to end of active queue so that the next thread doesn't also pick it */ - re_queue_tail(&cset->active_queue, (queue_entry_t)lp_processor); + re_queue_tail(&cset->active_queue, &lp_processor->processor_queue); return lp_processor; } if (thread->realtime.deadline < furthest_deadline) @@ -3624,12 +3640,12 @@ choose_processor( if (thread->sched_pri > lowest_unpaired_primary_priority) { /* Move to end of active queue so that the next thread doesn't also pick it */ - re_queue_tail(&cset->active_queue, (queue_entry_t)lp_unpaired_primary_processor); + re_queue_tail(&cset->active_queue, &lp_unpaired_primary_processor->processor_queue); return lp_unpaired_primary_processor; } if (thread->sched_pri > lowest_priority) { /* Move to end of active queue so that the next thread doesn't also pick it */ - re_queue_tail(&cset->active_queue, (queue_entry_t)lp_processor); + re_queue_tail(&cset->active_queue, &lp_processor->processor_queue); return lp_processor; } @@ -4102,7 +4118,7 @@ thread_run_queue_remove( assert(thread->runq == THREAD_ON_RT_RUNQ); - remqueue((queue_entry_t)thread); + remqueue(&thread->runq_links); SCHED_STATS_RUNQ_CHANGE(&rt_runq.runq_stats, rt_runq.count); rt_runq.count--; @@ -4159,7 +4175,6 @@ thread_get_urgency(thread_t thread, uint64_t *arg1, uint64_t *arg2) ((thread->sched_pri <= MAXPRI_THROTTLE) && (thread->base_pri <= MAXPRI_THROTTLE))) { /* * Background urgency applied when thread priority is MAXPRI_THROTTLE or lower and thread is not 
promoted - * TODO: Use TH_SFLAG_THROTTLED instead? */ *arg1 = thread->sched_pri; *arg2 = thread->base_pri; @@ -4169,9 +4184,9 @@ thread_get_urgency(thread_t thread, uint64_t *arg1, uint64_t *arg2) /* For otherwise unclassified threads, report throughput QoS * parameters */ - *arg1 = thread->effective_policy.t_through_qos; - *arg2 = thread->task->effective_policy.t_through_qos; - + *arg1 = proc_get_effective_thread_policy(thread, TASK_POLICY_THROUGH_QOS); + *arg2 = proc_get_effective_task_policy(thread->task, TASK_POLICY_THROUGH_QOS); + return (THREAD_URGENCY_NORMAL); } } @@ -4303,22 +4318,19 @@ processor_idle( KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END, (uintptr_t)thread_tid(thread), state, (uintptr_t)thread_tid(new_thread), 0, 0); - + return (new_thread); - } - else - if (state == PROCESSOR_IDLE) { - remqueue((queue_entry_t)processor); + + } else if (state == PROCESSOR_IDLE) { + re_queue_tail(&pset->active_queue, &processor->processor_queue); processor->state = PROCESSOR_RUNNING; processor->current_pri = IDLEPRI; processor->current_thmode = TH_MODE_FIXED; processor->current_sfi_class = SFI_CLASS_KERNEL; processor->deadline = UINT64_MAX; - enqueue_tail(&pset->active_queue, (queue_entry_t)processor); - } - else - if (state == PROCESSOR_SHUTDOWN) { + + } else if (state == PROCESSOR_SHUTDOWN) { /* * Going off-line. Force a * reschedule. 
@@ -4424,6 +4436,8 @@ sched_startup(void) thread_deallocate(thread); + assert_thread_magic(thread); + /* * Yield to the sched_init_thread once, to * initialize our own thread after being switched @@ -4438,9 +4452,6 @@ sched_startup(void) #if defined(CONFIG_SCHED_TIMESHARE_CORE) static volatile uint64_t sched_maintenance_deadline; -#if defined(CONFIG_TELEMETRY) -static volatile uint64_t sched_telemetry_deadline = 0; -#endif static uint64_t sched_tick_last_abstime; static uint64_t sched_tick_delta; uint64_t sched_tick_max_delta; @@ -4489,17 +4500,13 @@ sched_timeshare_maintenance_continue(void) } KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_MAINTENANCE)|DBG_FUNC_START, - sched_tick_delta, - late_time, - 0, - 0, - 0); + sched_tick_delta, late_time, 0, 0, 0); /* Add a number of pseudo-ticks corresponding to the elapsed interval * This could be greater than 1 if substantial intervals where * all processors are idle occur, which rarely occurs in practice. */ - + sched_tick += sched_tick_delta; /* @@ -4509,7 +4516,8 @@ sched_timeshare_maintenance_continue(void) /* * Scan the run queues for threads which - * may need to be updated. + * may need to be updated, and find the earliest runnable thread on the runqueue + * to report its latency. */ SCHED(thread_update_scan)(&scan_context); @@ -4517,9 +4525,16 @@ sched_timeshare_maintenance_continue(void) uint64_t ctime = mach_absolute_time(); - machine_max_runnable_latency(ctime > scan_context.earliest_bg_make_runnable_time ? ctime - scan_context.earliest_bg_make_runnable_time : 0, - ctime > scan_context.earliest_normal_make_runnable_time ? ctime - scan_context.earliest_normal_make_runnable_time : 0, - ctime > scan_context.earliest_rt_make_runnable_time ? ctime - scan_context.earliest_rt_make_runnable_time : 0); + uint64_t bg_max_latency = (ctime > scan_context.earliest_bg_make_runnable_time) ? 
+ ctime - scan_context.earliest_bg_make_runnable_time : 0; + + uint64_t default_max_latency = (ctime > scan_context.earliest_normal_make_runnable_time) ? + ctime - scan_context.earliest_normal_make_runnable_time : 0; + + uint64_t realtime_max_latency = (ctime > scan_context.earliest_rt_make_runnable_time) ? + ctime - scan_context.earliest_rt_make_runnable_time : 0; + + machine_max_runnable_latency(bg_max_latency, default_max_latency, realtime_max_latency); /* * Check to see if the special sched VM group needs attention. @@ -4527,12 +4542,9 @@ sched_timeshare_maintenance_continue(void) sched_vm_group_maintenance(); - KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_MAINTENANCE)|DBG_FUNC_END, - sched_pri_shift, - sched_background_pri_shift, - 0, - 0, - 0); + KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_MAINTENANCE) | DBG_FUNC_END, + sched_pri_shifts[TH_BUCKET_SHARE_FG], sched_pri_shifts[TH_BUCKET_SHARE_BG], + sched_pri_shifts[TH_BUCKET_SHARE_UT], 0, 0); assert_wait((event_t)sched_timeshare_maintenance_continue, THREAD_UNINT); thread_block((thread_continue_t)sched_timeshare_maintenance_continue); @@ -4567,26 +4579,6 @@ sched_timeshare_consider_maintenance(uint64_t ctime) { sched_maintenance_wakeups++; } } - -#if defined(CONFIG_TELEMETRY) - /* - * Windowed telemetry is driven by the scheduler. It should be safe - * to call compute_telemetry_windowed() even when windowed telemetry - * is disabled, but we should try to avoid doing extra work for no - * reason. 
- */ - if (telemetry_window_enabled) { - deadline = sched_telemetry_deadline; - - if (__improbable(ctime >= deadline)) { - ndeadline = ctime + sched_telemetry_interval; - - if (__probable(__sync_bool_compare_and_swap(&sched_telemetry_deadline, deadline, ndeadline))) { - compute_telemetry_windowed(); - } - } - } -#endif /* CONFIG_TELEMETRY */ } #endif /* CONFIG_SCHED_TIMESHARE_CORE */ @@ -4598,6 +4590,8 @@ sched_init_thread(void (*continuation)(void)) thread_t thread = current_thread(); + thread_set_thread_name(thread, "sched_maintenance_thread"); + sched_maintenance_thread = thread; continuation(); @@ -4625,8 +4619,8 @@ sched_init_thread(void (*continuation)(void)) #define THREAD_UPDATE_SIZE 128 -static thread_t thread_update_array[THREAD_UPDATE_SIZE]; -static int thread_update_count = 0; +static thread_t thread_update_array[THREAD_UPDATE_SIZE]; +static uint32_t thread_update_count = 0; /* Returns TRUE if thread was added, FALSE if thread_update_array is full */ boolean_t @@ -4643,14 +4637,16 @@ thread_update_add_thread(thread_t thread) void thread_update_process_threads(void) { - while (thread_update_count > 0) { - spl_t s; - thread_t thread = thread_update_array[--thread_update_count]; - thread_update_array[thread_update_count] = THREAD_NULL; + assert(thread_update_count <= THREAD_UPDATE_SIZE); - s = splsched(); + for (uint32_t i = 0 ; i < thread_update_count ; i++) { + thread_t thread = thread_update_array[i]; + assert_thread_magic(thread); + thread_update_array[i] = THREAD_NULL; + + spl_t s = splsched(); thread_lock(thread); - if (!(thread->state & (TH_WAIT)) && (SCHED(can_update_priority)(thread))) { + if (!(thread->state & (TH_WAIT)) && thread->sched_stamp != sched_tick) { SCHED(update_priority)(thread); } thread_unlock(thread); @@ -4658,6 +4654,8 @@ thread_update_process_threads(void) thread_deallocate(thread); } + + thread_update_count = 0; } /* @@ -4667,41 +4665,48 @@ thread_update_process_threads(void) */ boolean_t runq_scan( - run_queue_t runq, - 
sched_update_scan_context_t scan_context) + run_queue_t runq, + sched_update_scan_context_t scan_context) { - register int count; - register queue_t q; - register thread_t thread; - - if ((count = runq->count) > 0) { - q = runq->queues + runq->highq; - while (count > 0) { - queue_iterate(q, thread, thread_t, links) { - if ( thread->sched_stamp != sched_tick && - (thread->sched_mode == TH_MODE_TIMESHARE) ) { - if (thread_update_add_thread(thread) == FALSE) - return (TRUE); - } + int count = runq->count; + int queue_index; - if (cpu_throttle_enabled && ((thread->sched_pri <= MAXPRI_THROTTLE) && (thread->base_pri <= MAXPRI_THROTTLE))) { - if (thread->last_made_runnable_time < scan_context->earliest_bg_make_runnable_time) { - scan_context->earliest_bg_make_runnable_time = thread->last_made_runnable_time; - } - } else { - if (thread->last_made_runnable_time < scan_context->earliest_normal_make_runnable_time) { - scan_context->earliest_normal_make_runnable_time = thread->last_made_runnable_time; - } - } + assert(count >= 0); + + if (count == 0) + return FALSE; + + for (queue_index = bitmap_first(runq->bitmap, NRQS); + queue_index >= 0; + queue_index = bitmap_next(runq->bitmap, queue_index)) { + + thread_t thread; + queue_t queue = &runq->queues[queue_index]; - count--; + qe_foreach_element(thread, queue, runq_links) { + assert(count > 0); + assert_thread_magic(thread); + + if (thread->sched_stamp != sched_tick && + thread->sched_mode == TH_MODE_TIMESHARE) { + if (thread_update_add_thread(thread) == FALSE) + return TRUE; } - q--; + if (cpu_throttle_enabled && ((thread->sched_pri <= MAXPRI_THROTTLE) && (thread->base_pri <= MAXPRI_THROTTLE))) { + if (thread->last_made_runnable_time < scan_context->earliest_bg_make_runnable_time) { + scan_context->earliest_bg_make_runnable_time = thread->last_made_runnable_time; + } + } else { + if (thread->last_made_runnable_time < scan_context->earliest_normal_make_runnable_time) { + scan_context->earliest_normal_make_runnable_time = 
thread->last_made_runnable_time; + } + } + count--; } } - return (FALSE); + return FALSE; } #endif /* CONFIG_SCHED_TIMESHARE_CORE */ diff --git a/osfmk/kern/sched_prim.h b/osfmk/kern/sched_prim.h index 2522592e0..a42ecb7b0 100644 --- a/osfmk/kern/sched_prim.h +++ b/osfmk/kern/sched_prim.h @@ -141,10 +141,6 @@ extern void sched_set_thread_base_priority( thread_t thread, int priority); -/* Set the thread to be categorized as 'background' */ -extern void sched_set_thread_throttled(thread_t thread, - boolean_t wants_throttle); - /* Set the thread's true scheduling mode */ extern void sched_set_thread_mode(thread_t thread, sched_mode_t mode); @@ -402,6 +398,17 @@ extern char sched_string[SCHED_STRING_MAX_LENGTH]; extern kern_return_t sched_work_interval_notify(thread_t thread, uint64_t work_interval_id, uint64_t start, uint64_t finish, uint64_t deadline, uint64_t next_start, uint32_t flags); +extern thread_t port_name_to_thread_for_ulock(mach_port_name_t thread_name); + +/* Attempt to context switch to a specific runnable thread */ +extern wait_result_t thread_handoff(thread_t thread); + +extern struct waitq *assert_wait_queue(event_t event); + +extern kern_return_t thread_wakeup_one_with_pri(event_t event, int priority); + +extern thread_t thread_wakeup_identify(event_t event, int priority); + #endif /* XNU_KERNEL_PRIVATE */ /* Context switch */ @@ -452,13 +459,6 @@ extern kern_return_t thread_wakeup_prim( boolean_t one_thread, wait_result_t result); -extern kern_return_t thread_wakeup_prim_internal( - event_t event, - boolean_t one_thread, - wait_result_t result, - int priority); - - #define thread_wakeup(x) \ thread_wakeup_prim((x), FALSE, THREAD_AWAKENED) #define thread_wakeup_with_result(x, z) \ @@ -466,12 +466,10 @@ extern kern_return_t thread_wakeup_prim_internal( #define thread_wakeup_one(x) \ thread_wakeup_prim((x), TRUE, THREAD_AWAKENED) -#ifdef MACH_KERNEL_PRIVATE -#define thread_wakeup_one_with_pri(x, pri) \ - thread_wakeup_prim_internal((x), TRUE, 
THREAD_AWAKENED, pri) -#endif +/* Wakeup the specified thread if it is waiting on this event */ +extern kern_return_t thread_wakeup_thread(event_t event, thread_t thread); -extern boolean_t preemption_enabled(void); +extern boolean_t preemption_enabled(void); #ifdef MACH_KERNEL_PRIVATE diff --git a/osfmk/kern/sched_proto.c b/osfmk/kern/sched_proto.c index d3a5bf688..e0d3c14ff 100644 --- a/osfmk/kern/sched_proto.c +++ b/osfmk/kern/sched_proto.c @@ -326,9 +326,8 @@ sched_proto_choose_thread(processor_t processor, SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count); rq->count--; if (queue_empty(queue)) { - if (pri != IDLEPRI) - clrbit(MAXPRI - pri, rq->bitmap); - rq->highq = MAXPRI - ffsbit(rq->bitmap); + bitmap_clear(rq->bitmap, pri); + rq->highq = bitmap_first(rq->bitmap, NRQS); } simple_unlock(&global_runq_lock); @@ -435,9 +434,8 @@ sched_proto_processor_queue_remove( if (queue_empty(rq->queues + thread->sched_pri)) { /* update run queue status */ - if (thread->sched_pri != IDLEPRI) - clrbit(MAXPRI - thread->sched_pri, rq->bitmap); - rq->highq = MAXPRI - ffsbit(rq->bitmap); + bitmap_clear(rq->bitmap, thread->sched_pri); + rq->highq = bitmap_first(rq->bitmap, NRQS); } thread->runq = PROCESSOR_NULL; @@ -475,7 +473,9 @@ sched_proto_processor_queue_has_priority(processor_t processor __unused, simple_lock(&global_runq_lock); - if (gte) + if (global_runq->count == 0) + result = FALSE; + else if (gte) result = global_runq->highq >= priority; else result = global_runq->highq >= priority; diff --git a/osfmk/kern/sched_traditional.c b/osfmk/kern/sched_traditional.c index 79d94ffd6..80f950feb 100644 --- a/osfmk/kern/sched_traditional.c +++ b/osfmk/kern/sched_traditional.c @@ -317,9 +317,8 @@ sched_traditional_choose_thread_from_runq( rq->urgency--; assert(rq->urgency >= 0); } if (queue_empty(queue)) { - if (pri != IDLEPRI) - clrbit(MAXPRI - pri, rq->bitmap); - rq->highq = MAXPRI - ffsbit(rq->bitmap); + bitmap_clear(rq->bitmap, pri); + rq->highq = bitmap_first(rq->bitmap, 
NRQS); } return (thread); @@ -432,7 +431,11 @@ sched_traditional_processor_queue_has_priority(processor_t processor, int priority, boolean_t gte) { - if (gte) + run_queue_t runq = runq_for_processor(processor); + + if (runq->count == 0) + return FALSE; + else if (gte) return runq_for_processor(processor)->highq >= priority; else return runq_for_processor(processor)->highq > priority; @@ -503,9 +506,8 @@ sched_traditional_processor_queue_shutdown(processor_t processor) rq->urgency--; assert(rq->urgency >= 0); } if (queue_empty(queue)) { - if (pri != IDLEPRI) - clrbit(MAXPRI - pri, rq->bitmap); - rq->highq = MAXPRI - ffsbit(rq->bitmap); + bitmap_clear(rq->bitmap, pri); + rq->highq = bitmap_first(rq->bitmap, NRQS); } enqueue_tail(&tqueue, (queue_entry_t)thread); @@ -628,9 +630,8 @@ sched_traditional_steal_processor_thread(processor_t processor) rq->urgency--; assert(rq->urgency >= 0); } if (queue_empty(queue)) { - if (pri != IDLEPRI) - clrbit(MAXPRI - pri, rq->bitmap); - rq->highq = MAXPRI - ffsbit(rq->bitmap); + bitmap_clear(rq->bitmap, pri); + rq->highq = bitmap_first(rq->bitmap, NRQS); } return (thread); diff --git a/osfmk/kern/sfi.c b/osfmk/kern/sfi.c index b43fe2db1..8a67ec034 100644 --- a/osfmk/kern/sfi.c +++ b/osfmk/kern/sfi.c @@ -42,6 +42,8 @@ #include #include #include +#include + #include #include @@ -94,6 +96,8 @@ extern sched_call_t workqueue_get_sched_callback(void); * * The pset lock may also be taken, but not while any other locks are held. * + * The task and thread mutex may also be held while reevaluating sfi state. 
+ * * splsched ---> sfi_lock ---> waitq ---> thread_lock * \ \ \__ thread_lock (*) * \ \__ pset_lock @@ -738,13 +742,15 @@ sfi_class_id_t sfi_thread_classify(thread_t thread) task_t task = thread->task; boolean_t is_kernel_thread = (task == kernel_task); sched_mode_t thmode = thread->sched_mode; - int latency_qos = proc_get_effective_task_policy(task, TASK_POLICY_LATENCY_QOS); - int task_role = proc_get_effective_task_policy(task, TASK_POLICY_ROLE); - int thread_bg = proc_get_effective_thread_policy(thread, TASK_POLICY_DARWIN_BG); - int managed_task = proc_get_effective_task_policy(task, TASK_POLICY_SFI_MANAGED); - int thread_qos = proc_get_effective_thread_policy(thread, TASK_POLICY_QOS); boolean_t focal = FALSE; + int task_role = proc_get_effective_task_policy(task, TASK_POLICY_ROLE); + int latency_qos = proc_get_effective_task_policy(task, TASK_POLICY_LATENCY_QOS); + int managed_task = proc_get_effective_task_policy(task, TASK_POLICY_SFI_MANAGED); + + int thread_qos = proc_get_effective_thread_policy(thread, TASK_POLICY_QOS); + int thread_bg = proc_get_effective_thread_policy(thread, TASK_POLICY_DARWIN_BG); + /* kernel threads never reach the user AST boundary, and are in a separate world for SFI */ if (is_kernel_thread) { return SFI_CLASS_KERNEL; @@ -928,7 +934,6 @@ void sfi_ast(thread_t thread) uint64_t tid; thread_continue_t continuation; sched_call_t workq_callback = workqueue_get_sched_callback(); - boolean_t did_clear_wq = FALSE; s = splsched(); @@ -964,7 +969,8 @@ void sfi_ast(thread_t thread) /* Optimistically clear workq callback while thread is already locked */ if (workq_callback && (thread->sched_call == workq_callback)) { thread_sched_call(thread, NULL); - did_clear_wq = TRUE; + } else { + workq_callback = NULL; } thread_unlock(thread); @@ -991,15 +997,9 @@ void sfi_ast(thread_t thread) splx(s); if (did_wait) { - thread_block_reason(continuation, did_clear_wq ? 
workq_callback : NULL, AST_SFI); - } else { - if (did_clear_wq) { - s = splsched(); - thread_lock(thread); - thread_sched_call(thread, workq_callback); - thread_unlock(thread); - splx(s); - } + thread_block_reason(continuation, workq_callback, AST_SFI); + } else if (workq_callback) { + thread_reenable_sched_call(thread, workq_callback); } } diff --git a/osfmk/kern/simple_lock.h b/osfmk/kern/simple_lock.h index c1a191adb..8ef311a88 100644 --- a/osfmk/kern/simple_lock.h +++ b/osfmk/kern/simple_lock.h @@ -89,7 +89,7 @@ extern void hw_lock_unlock( extern unsigned int hw_lock_to( hw_lock_t, - unsigned int); + uint64_t); extern unsigned int hw_lock_try( hw_lock_t); @@ -166,6 +166,9 @@ extern void usimple_unlock( extern unsigned int usimple_lock_try( usimple_lock_t); +extern void usimple_lock_try_lock_loop( + usimple_lock_t); + __END_DECLS #define ETAP_NO_TRACE 0 @@ -181,6 +184,7 @@ __END_DECLS #define simple_lock(l) usimple_lock(l) #define simple_unlock(l) usimple_unlock(l) #define simple_lock_try(l) usimple_lock_try(l) +#define simple_lock_try_lock_loop(l) usimple_lock_try_lock_loop(l) #define simple_lock_addr(l) (&(l)) #endif /* !defined(simple_lock_init) */ diff --git a/osfmk/kern/stack.c b/osfmk/kern/stack.c index 400cedcf9..0cb793286 100644 --- a/osfmk/kern/stack.c +++ b/osfmk/kern/stack.c @@ -83,29 +83,13 @@ vm_offset_t kernel_stack_depth_max; static inline void STACK_ZINFO_PALLOC(thread_t thread) { - task_t task; - zinfo_usage_t zinfo; - ledger_credit(thread->t_ledger, task_ledgers.tkm_private, kernel_stack_size); - - if (stack_fake_zone_index != -1 && - (task = thread->task) != NULL && (zinfo = task->tkm_zinfo) != NULL) - OSAddAtomic64(kernel_stack_size, - (int64_t *)&zinfo[stack_fake_zone_index].alloc); } static inline void STACK_ZINFO_PFREE(thread_t thread) { - task_t task; - zinfo_usage_t zinfo; - ledger_debit(thread->t_ledger, task_ledgers.tkm_private, kernel_stack_size); - - if (stack_fake_zone_index != -1 && - (task = thread->task) != NULL && (zinfo = 
task->tkm_zinfo) != NULL) - OSAddAtomic64(kernel_stack_size, - (int64_t *)&zinfo[stack_fake_zone_index].free); } static inline void @@ -113,19 +97,6 @@ STACK_ZINFO_HANDOFF(thread_t from, thread_t to) { ledger_debit(from->t_ledger, task_ledgers.tkm_private, kernel_stack_size); ledger_credit(to->t_ledger, task_ledgers.tkm_private, kernel_stack_size); - - if (stack_fake_zone_index != -1) { - task_t task; - zinfo_usage_t zinfo; - - if ((task = from->task) != NULL && (zinfo = task->tkm_zinfo) != NULL) - OSAddAtomic64(kernel_stack_size, - (int64_t *)&zinfo[stack_fake_zone_index].free); - - if ((task = to->task) != NULL && (zinfo = task->tkm_zinfo) != NULL) - OSAddAtomic64(kernel_stack_size, - (int64_t *)&zinfo[stack_fake_zone_index].alloc); - } } /* @@ -518,8 +489,8 @@ processor_set_stack_usage( vm_size_t maxusage; vm_offset_t maxstack; - register thread_t *thread_list; - register thread_t thread; + thread_t *thread_list; + thread_t thread; unsigned int actual; /* this many things */ unsigned int i; diff --git a/osfmk/kern/startup.c b/osfmk/kern/startup.c index ac6acbc04..8c3b99f09 100644 --- a/osfmk/kern/startup.c +++ b/osfmk/kern/startup.c @@ -116,8 +116,12 @@ #include #include #include +#include +#include #include +#include +#include #if CONFIG_ATM @@ -148,10 +152,6 @@ #include #endif -#if KPERF -#include -#endif - #if HYPERVISOR #include #endif @@ -181,8 +181,9 @@ extern void OSKextRemoveKextBootstrap(void); void scale_setup(void); extern void bsd_scale_setup(int); extern unsigned int semaphore_max; -extern void stackshot_lock_init(void); -extern void console_init(void); +extern void stackshot_init(void); +extern void ktrace_init(void); +extern void oslog_init(void); /* * Running in virtual memory, on the interrupt stack. 
@@ -194,25 +195,25 @@ extern int serverperfmode; unsigned int new_nkdbufs = 0; unsigned int wake_nkdbufs = 0; unsigned int write_trace_on_panic = 0; -unsigned int trace_typefilter = 0; -boolean_t trace_serial = FALSE; +static char trace_typefilter[64] = { 0 }; +boolean_t trace_serial = FALSE; +boolean_t oslog_early_boot_complete = FALSE; /* mach leak logging */ int log_leaks = 0; -int turn_on_log_leaks = 0; static inline void kernel_bootstrap_log(const char *message) { // kprintf("kernel_bootstrap: %s\n", message); - kernel_debug_string_simple(message); + kernel_debug_string_early(message); } static inline void kernel_bootstrap_thread_log(const char *message) { // kprintf("kernel_bootstrap_thread: %s\n", message); - kernel_debug_string_simple(message); + kernel_debug_string_early(message); } void @@ -251,12 +252,12 @@ kernel_bootstrap(void) printf("%s\n", version); /* log kernel version */ if (PE_parse_boot_argn("-l", namep, sizeof (namep))) /* leaks logging */ - turn_on_log_leaks = 1; + log_leaks = 1; PE_parse_boot_argn("trace", &new_nkdbufs, sizeof (new_nkdbufs)); PE_parse_boot_argn("trace_wake", &wake_nkdbufs, sizeof (wake_nkdbufs)); PE_parse_boot_argn("trace_panic", &write_trace_on_panic, sizeof(write_trace_on_panic)); - PE_parse_boot_argn("trace_typefilter", &trace_typefilter, sizeof(trace_typefilter)); + PE_parse_boot_arg_str("trace_typefilter", trace_typefilter, sizeof(trace_typefilter)); scale_setup(); @@ -274,6 +275,7 @@ kernel_bootstrap(void) machine_info.major_version = version_major; machine_info.minor_version = version_minor; + oslog_init(); #if CONFIG_TELEMETRY kernel_bootstrap_log("telemetry_init"); @@ -293,12 +295,15 @@ kernel_bootstrap(void) kernel_bootstrap_log("console_init"); console_init(); - kernel_bootstrap_log("stackshot_lock_init"); - stackshot_lock_init(); + kernel_bootstrap_log("stackshot_init"); + stackshot_init(); kernel_bootstrap_log("sched_init"); sched_init(); + kernel_bootstrap_log("ltable_bootstrap"); + ltable_bootstrap(); + 
kernel_bootstrap_log("waitq_bootstrap"); waitq_bootstrap(); @@ -350,13 +355,18 @@ kernel_bootstrap(void) kernel_bootstrap_log("atm_init"); atm_init(); #endif + kernel_bootstrap_log("mach_init_activity_id"); + mach_init_activity_id(); #if CONFIG_BANK /* Initialize the BANK Manager. */ kernel_bootstrap_log("bank_init"); bank_init(); #endif - + + kernel_bootstrap_log("ipc_pthread_priority_init"); + ipc_pthread_priority_init(); + /* initialize the corpse config based on boot-args */ corpses_init(); @@ -430,6 +440,7 @@ kernel_bootstrap_thread(void) kernel_bootstrap_thread_log("thread_bind"); thread_bind(processor); + /* * Initialize ipc thread call support. */ @@ -454,7 +465,7 @@ kernel_bootstrap_thread(void) device_service_create(); kth_started = 1; - + #if (defined(__i386__) || defined(__x86_64__)) && NCOPY_WINDOWS > 0 /* * Create and initialize the physical copy window for processor 0 @@ -482,10 +493,6 @@ kernel_bootstrap_thread(void) ecc_log_init(); #endif -#if KPERF - kperf_bootstrap(); -#endif - #if HYPERVISOR hv_support_init(); #endif @@ -499,33 +506,32 @@ kernel_bootstrap_thread(void) vmx_init(); #endif -#if (defined(__i386__) || defined(__x86_64__)) - if (kdebug_serial) { - new_nkdbufs = 1; - if (trace_typefilter == 0) - trace_typefilter = 1; - } - if (turn_on_log_leaks && !new_nkdbufs) - new_nkdbufs = 200000; - if (trace_typefilter) - start_kern_tracing_with_typefilter(new_nkdbufs, - FALSE, - trace_typefilter); - else - start_kern_tracing(new_nkdbufs, FALSE); - if (turn_on_log_leaks) - log_leaks = 1; + kernel_bootstrap_thread_log("ktrace_init"); + ktrace_init(); -#endif + if (new_nkdbufs > 0 || kdebug_serial || log_leaks) { + kdebug_boot_trace(new_nkdbufs, trace_typefilter); + } kernel_bootstrap_log("prng_init"); prng_cpu_init(master_cpu); +#ifdef MACH_BSD + kernel_bootstrap_log("bsd_early_init"); + bsd_early_init(); +#endif + #ifdef IOKIT + kernel_bootstrap_log("PE_init_iokit"); PE_init_iokit(); #endif assert(ml_get_interrupts_enabled() == FALSE); + + // Set 
this flag to indicate that it is now okay to start testing + // for interrupts / preemeption disabled while logging + oslog_early_boot_complete = TRUE; + (void) spllo(); /* Allow interruptions */ #if (defined(__i386__) || defined(__x86_64__)) && NCOPY_WINDOWS > 0 @@ -539,17 +545,6 @@ kernel_bootstrap_thread(void) cpu_userwindow_init(0); #endif -#if (!defined(__i386__) && !defined(__x86_64__)) - if (turn_on_log_leaks && !new_nkdbufs) - new_nkdbufs = 200000; - if (trace_typefilter) - start_kern_tracing_with_typefilter(new_nkdbufs, FALSE, trace_typefilter); - else - start_kern_tracing(new_nkdbufs, FALSE); - if (turn_on_log_leaks) - log_leaks = 1; -#endif - /* * Initialize the shared region module. */ @@ -714,6 +709,9 @@ load_context( processor->active_thread = thread; processor->current_pri = thread->sched_pri; processor->current_thmode = thread->sched_mode; + processor->current_sfi_class = SFI_CLASS_KERNEL; + processor->starting_pri = thread->sched_pri; + processor->deadline = UINT64_MAX; thread->last_processor = processor; diff --git a/osfmk/kern/startup.h b/osfmk/kern/startup.h index 6e1864df6..4555687dc 100644 --- a/osfmk/kern/startup.h +++ b/osfmk/kern/startup.h @@ -64,6 +64,7 @@ extern void device_service_create(void); /* BSD subsystem initialization */ extern void bsd_init(void); +extern void bsd_early_init(void); /* codesigning subsystem initialization */ extern void cs_init(void); diff --git a/osfmk/kern/sync_sema.c b/osfmk/kern/sync_sema.c index 4304559e6..fc09bb646 100644 --- a/osfmk/kern/sync_sema.c +++ b/osfmk/kern/sync_sema.c @@ -687,7 +687,6 @@ semaphore_wait_internal( thread_t self = current_thread(); wait_semaphore->count = -1; /* we don't keep an actual count */ - thread_lock(self); (void)waitq_assert_wait64_locked( &wait_semaphore->waitq, SEMAPHORE_EVENT, @@ -695,7 +694,6 @@ semaphore_wait_internal( TIMEOUT_URGENCY_USER_NORMAL, deadline, TIMEOUT_NO_LEEWAY, self); - thread_unlock(self); } semaphore_unlock(wait_semaphore); splx(spl_level); diff --git 
a/osfmk/kern/syscall_subr.c b/osfmk/kern/syscall_subr.c index 1ebf39e4f..e413d30c2 100644 --- a/osfmk/kern/syscall_subr.c +++ b/osfmk/kern/syscall_subr.c @@ -68,6 +68,8 @@ #include #include #include +#include + #include #include @@ -104,7 +106,7 @@ __unused struct pfz_exit_args *args) static void swtch_continue(void) { - register processor_t myprocessor; + processor_t myprocessor; boolean_t result; disable_preemption(); @@ -120,7 +122,7 @@ boolean_t swtch( __unused struct swtch_args *args) { - register processor_t myprocessor; + processor_t myprocessor; boolean_t result; disable_preemption(); @@ -147,7 +149,7 @@ swtch( static void swtch_pri_continue(void) { - register processor_t myprocessor; + processor_t myprocessor; boolean_t result; thread_depress_abort_internal(current_thread()); @@ -165,7 +167,7 @@ boolean_t swtch_pri( __unused struct swtch_pri_args *args) { - register processor_t myprocessor; + processor_t myprocessor; boolean_t result; disable_preemption(); @@ -193,38 +195,24 @@ __unused struct swtch_pri_args *args) return (result); } -static int +static boolean_t thread_switch_disable_workqueue_sched_callback(void) { sched_call_t callback = workqueue_get_sched_callback(); - thread_t self = current_thread(); - if (!callback || self->sched_call != callback) { - return FALSE; - } - spl_t s = splsched(); - thread_lock(self); - thread_sched_call(self, NULL); - thread_unlock(self); - splx(s); - return TRUE; + return thread_disable_sched_call(current_thread(), callback) != NULL; } static void thread_switch_enable_workqueue_sched_callback(void) { sched_call_t callback = workqueue_get_sched_callback(); - thread_t self = current_thread(); - spl_t s = splsched(); - thread_lock(self); - thread_sched_call(self, callback); - thread_unlock(self); - splx(s); + thread_reenable_sched_call(current_thread(), callback); } static void thread_switch_continue(void) { - register thread_t self = current_thread(); + thread_t self = current_thread(); int option = 
self->saved.swtch.option; boolean_t reenable_workq_callback = self->saved.swtch.reenable_workq_callback; @@ -397,6 +385,88 @@ thread_switch( return (KERN_SUCCESS); } +/* Returns a +1 thread reference */ +thread_t +port_name_to_thread_for_ulock(mach_port_name_t thread_name) +{ + thread_t thread = THREAD_NULL; + thread_t self = current_thread(); + + /* + * Translate the port name if supplied. + */ + if (thread_name != MACH_PORT_NULL) { + ipc_port_t port; + + if (ipc_port_translate_send(self->task->itk_space, + thread_name, &port) == KERN_SUCCESS) { + ip_reference(port); + ip_unlock(port); + + thread = convert_port_to_thread(port); + ip_release(port); + + if (thread == THREAD_NULL) { + return thread; + } + + if ((thread == self) || (thread->task != self->task)) { + thread_deallocate(thread); + thread = THREAD_NULL; + } + } + } + + return thread; +} + +/* This function is called after an assert_wait(), therefore it must not + * cause another wait until after the thread_run() or thread_block() + * + * Consumes a ref on thread + */ +wait_result_t +thread_handoff(thread_t thread) +{ + thread_t deallocate_thread = THREAD_NULL; + thread_t self = current_thread(); + + /* + * Try to handoff if supplied. + */ + if (thread != THREAD_NULL) { + spl_t s = splsched(); + + thread_t pulled_thread = thread_run_queue_remove_for_handoff(thread); + + KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED_THREAD_SWITCH)|DBG_FUNC_NONE, + thread_tid(thread), thread->state, + pulled_thread ? 
TRUE : FALSE, 0, 0); + + if (pulled_thread != THREAD_NULL) { + /* We can't be dropping the last ref here */ + thread_deallocate_safe(thread); + + int result = thread_run(self, THREAD_CONTINUE_NULL, NULL, pulled_thread); + + splx(s); + return result; + } + + splx(s); + + deallocate_thread = thread; + thread = THREAD_NULL; + } + + int result = thread_block(THREAD_CONTINUE_NULL); + if (deallocate_thread != THREAD_NULL) { + thread_deallocate(deallocate_thread); + } + + return result; +} + /* * Depress thread's priority to lowest possible for the specified interval, * with a value of zero resulting in no timeout being scheduled. @@ -405,7 +475,7 @@ void thread_depress_abstime( uint64_t interval) { - register thread_t self = current_thread(); + thread_t self = current_thread(); uint64_t deadline; spl_t s; @@ -568,3 +638,40 @@ thread_yield_internal( thread_depress_abort_internal(current_thread()); } +/* + * This yields to a possible non-urgent preemption pending on the current processor. + * + * This is useful when doing a long computation in the kernel without returning to userspace. + * + * As opposed to other yielding mechanisms, this does not drop the priority of the current thread. + */ +void +thread_yield_to_preemption() +{ + /* + * ast_pending() should ideally be called with interrupts disabled, but + * the check here is fine because csw_check() will do the right thing. 
+ */ + ast_t *pending_ast = ast_pending(); + ast_t ast = AST_NONE; + processor_t p; + + if (*pending_ast & AST_PREEMPT) { + thread_t self = current_thread(); + + spl_t s = splsched(); + + p = current_processor(); + thread_lock(self); + ast = csw_check(p, AST_YIELD); + ast_on(ast); + thread_unlock(self); + + if (ast != AST_NONE) { + (void)thread_block_reason(THREAD_CONTINUE_NULL, NULL, ast); + } + + splx(s); + } +} + diff --git a/osfmk/kern/syscall_sw.c b/osfmk/kern/syscall_sw.c index e086346e3..f5d963b11 100644 --- a/osfmk/kern/syscall_sw.c +++ b/osfmk/kern/syscall_sw.c @@ -113,7 +113,7 @@ const mach_trap_t mach_trap_table[MACH_TRAP_TABLE_COUNT] = { /* 8 */ MACH_TRAP(kern_invalid, 0, 0, NULL), /* 9 */ MACH_TRAP(kern_invalid, 0, 0, NULL), /* 10 */ MACH_TRAP(_kernelrpc_mach_vm_allocate_trap, 4, 5, munge_wwlw), -/* 11 */ MACH_TRAP(kern_invalid, 0, 0, NULL), +/* 11 */ MACH_TRAP(_kernelrpc_mach_vm_purgable_control_trap, 4, 5, munge_wlww), /* 12 */ MACH_TRAP(_kernelrpc_mach_vm_deallocate_trap, 3, 5, munge_wll), /* 13 */ MACH_TRAP(kern_invalid, 0, 0, NULL), /* 14 */ MACH_TRAP(_kernelrpc_mach_vm_protect_trap, 5, 7, munge_wllww), @@ -145,7 +145,7 @@ const mach_trap_t mach_trap_table[MACH_TRAP_TABLE_COUNT] = { /* 40 */ MACH_TRAP(kern_invalid, 0, 0, NULL), /* 41 */ MACH_TRAP(_kernelrpc_mach_port_guard_trap, 4, 5, munge_wwlw), /* 42 */ MACH_TRAP(_kernelrpc_mach_port_unguard_trap, 3, 4, munge_wwl), -/* 43 */ MACH_TRAP(kern_invalid, 0, 0, NULL), +/* 43 */ MACH_TRAP(mach_generate_activity_id, 3, 3, munge_www), /* 44 */ MACH_TRAP(task_name_for_pid, 3, 3, munge_www), /* 45 */ MACH_TRAP(task_for_pid, 3, 3, munge_www), /* 46 */ MACH_TRAP(pid_for_task, 2, 2, munge_ww), @@ -173,9 +173,9 @@ const mach_trap_t mach_trap_table[MACH_TRAP_TABLE_COUNT] = { /* 67 */ MACH_TRAP(kern_invalid, 0, 0, NULL), /* 68 */ MACH_TRAP(kern_invalid, 0, 0, NULL), /* 69 */ MACH_TRAP(kern_invalid, 0, 0, NULL), -/* 70 */ MACH_TRAP(kern_invalid, 0, 0, NULL), +/* 70 */ MACH_TRAP(host_create_mach_voucher_trap, 4, 
4, munge_wwww), /* 71 */ MACH_TRAP(kern_invalid, 0, 0, NULL), -/* 72 */ MACH_TRAP(kern_invalid, 0, 0, NULL), +/* 72 */ MACH_TRAP(mach_voucher_extract_attr_recipe_trap, 4, 4, munge_wwww), /* 73 */ MACH_TRAP(kern_invalid, 0, 0, NULL), /* 74 */ MACH_TRAP(kern_invalid, 0, 0, NULL), /* 75 */ MACH_TRAP(kern_invalid, 0, 0, NULL), @@ -280,7 +280,7 @@ const char * mach_syscall_name_table[MACH_TRAP_TABLE_COUNT] = { /* 40 */ "kern_invalid", /* 41 */ "_kernelrpc_mach_port_guard_trap", /* 42 */ "_kernelrpc_mach_port_unguard_trap", -/* 43 */ "kern_invalid", +/* 43 */ "mach_generate_activity_id", /* 44 */ "task_name_for_pid", /* 45 */ "task_for_pid", /* 46 */ "pid_for_task", @@ -308,9 +308,9 @@ const char * mach_syscall_name_table[MACH_TRAP_TABLE_COUNT] = { /* 67 */ "kern_invalid", /* 68 */ "kern_invalid", /* 69 */ "kern_invalid", -/* 70 */ "kern_invalid", +/* 70 */ "host_create_mach_voucher_trap", /* 71 */ "kern_invalid", -/* 72 */ "kern_invalid", +/* 72 */ "mach_voucher_extract_attr_recipe_trap", /* 73 */ "kern_invalid", /* 74 */ "kern_invalid", /* 75 */ "kern_invalid", diff --git a/osfmk/kern/sysdiagnose.c b/osfmk/kern/sysdiagnose.c index 5f66f7e88..a2a147207 100644 --- a/osfmk/kern/sysdiagnose.c +++ b/osfmk/kern/sysdiagnose.c @@ -28,6 +28,7 @@ #include #include #include +#include #include @@ -54,5 +55,7 @@ sysdiagnose_notify_user(uint32_t keycode) KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SYSDIAGNOSE, SYSDIAGNOSE_NOTIFY_USER) | DBG_FUNC_START, 0, 0, 0, 0, 0); - return send_sysdiagnose_notification(user_port, keycode); + kr = send_sysdiagnose_notification(user_port, keycode); + ipc_port_release_send(user_port); + return kr; } diff --git a/osfmk/kern/task.c b/osfmk/kern/task.c index d40fd6fc1..cc8159895 100644 --- a/osfmk/kern/task.c +++ b/osfmk/kern/task.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010, 2015 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -95,6 +95,7 @@ #include #include #include +#include #include #include @@ -122,6 +123,8 @@ #include #include #include +#include + #include #if CONFIG_TELEMETRY #include @@ -155,20 +158,26 @@ #include #endif -#include +#include /* picks up ledger.h */ + +#if CONFIG_MACF +#include +#endif #if KPERF extern int kpc_force_all_ctrs(task_t, int); #endif -uint32_t qos_override_mode; - task_t kernel_task; zone_t task_zone; lck_attr_t task_lck_attr; lck_grp_t task_lck_grp; lck_grp_attr_t task_lck_grp_attr; +extern int exc_via_corpse_forking; +extern int unify_corpse_blob_alloc; +extern int corpse_for_fatal_memkill; + /* Flag set by core audio when audio is playing. Used to stifle EXC_RESOURCE generation when active. */ int audio_active = 0; @@ -182,11 +191,12 @@ lck_spin_t dead_task_statistics_lock; ledger_template_t task_ledger_template = NULL; struct _task_ledger_indices task_ledgers __attribute__((used)) = - {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, { 0 /* initialized at runtime */}, #ifdef CONFIG_BANK -1, -1, #endif + -1, -1, }; /* System sleep state */ @@ -196,17 +206,20 @@ boolean_t tasks_suspend_state; void init_task_ledgers(void); void task_footprint_exceeded(int warning, __unused const void *param0, __unused const void *param1); void task_wakeups_rate_exceeded(int warning, __unused const void *param0, __unused const void *param1); -void __attribute__((noinline)) THIS_PROCESS_IS_CAUSING_TOO_MANY_WAKEUPS__SENDING_EXC_RESOURCE(void); -void __attribute__((noinline)) PROC_CROSSED_HIGH_WATERMARK__SEND_EXC_RESOURCE_AND_SUSPEND(int max_footprint_mb); +void task_io_rate_exceeded(int warning, const void *param0, __unused const void *param1); +void __attribute__((noinline)) SENDING_NOTIFICATION__THIS_PROCESS_IS_CAUSING_TOO_MANY_WAKEUPS(void); +void __attribute__((noinline)) 
PROC_CROSSED_HIGH_WATERMARK__SEND_EXC_RESOURCE_AND_SUSPEND(int max_footprint_mb, boolean_t is_fatal); +void __attribute__((noinline)) SENDING_NOTIFICATION__THIS_PROCESS_IS_CAUSING_TOO_MUCH_IO(int flavor); kern_return_t task_suspend_internal(task_t); kern_return_t task_resume_internal(task_t); static kern_return_t task_start_halt_locked(task_t task, boolean_t should_mark_corpse); +int proc_list_uptrs(void *p, uint64_t *udata_buffer, int size); extern kern_return_t iokit_task_terminate(task_t task); -void proc_init_cpumon_params(void); extern kern_return_t exception_deliver(thread_t, exception_type_t, mach_exception_data_t, mach_msg_type_number_t, struct exception_action *, lck_mtx_t *); +extern void bsd_copythreadname(void *dst_uth, void *src_uth); // Warn tasks when they hit 80% of their memory limit. #define PHYS_FOOTPRINT_WARNING_LEVEL 80 @@ -230,15 +243,30 @@ int task_wakeups_monitor_ustackshots_trigger_pct; /* Percentage. Level at which int disable_exc_resource; /* Global override to supress EXC_RESOURCE for resource monitor violations. 
*/ ledger_amount_t max_task_footprint = 0; /* Per-task limit on physical memory consumption in bytes */ +int max_task_footprint_warning_level = 0; /* Per-task limit warning percentage */ int max_task_footprint_mb = 0; /* Per-task limit on physical memory consumption in megabytes */ +/* I/O Monitor Limits */ +#define IOMON_DEFAULT_LIMIT (20480ull) /* MB of logical/physical I/O */ +#define IOMON_DEFAULT_INTERVAL (86400ull) /* in seconds */ + +uint64_t task_iomon_limit_mb; /* Per-task I/O monitor limit in MBs */ +uint64_t task_iomon_interval_secs; /* Per-task I/O monitor interval in secs */ + +#define IO_TELEMETRY_DEFAULT_LIMIT (10ll * 1024ll * 1024ll) +int64_t io_telemetry_limit; /* Threshold to take a microstackshot (0 indicated I/O telemetry is turned off) */ +int64_t global_logical_writes_count = 0; /* Global count for logical writes */ +static boolean_t global_update_logical_writes(int64_t); + #if MACH_ASSERT int pmap_ledgers_panic = 1; #endif /* MACH_ASSERT */ int task_max = CONFIG_TASK_MAX; /* Max number of tasks */ +#if CONFIG_COREDUMP int hwm_user_cores = 0; /* high watermark violations generate user core files */ +#endif #ifdef MACH_BSD extern void proc_getexecutableuuid(void *, unsigned char *, unsigned long); @@ -246,31 +274,22 @@ extern int proc_pid(struct proc *p); extern int proc_selfpid(void); extern char *proc_name_address(struct proc *p); extern uint64_t get_dispatchqueue_offset_from_proc(void *); -#if CONFIG_JETSAM + +#if CONFIG_MEMORYSTATUS extern void proc_memstat_terminated(struct proc* p, boolean_t set); -extern void memorystatus_on_ledger_footprint_exceeded(int warning, const int max_footprint_mb); -#endif -#endif -#if MACH_ASSERT -extern int pmap_ledgers_panic; -#endif /* MACH_ASSERT */ +extern boolean_t memorystatus_turnoff_exception_and_get_fatalness(boolean_t warning, const int max_footprint_mb); +extern void memorystatus_on_ledger_footprint_exceeded(int warning, boolean_t is_fatal); +#endif /* CONFIG_MEMORYSTATUS */ + +#endif /* MACH_BSD 
*/ /* Forwards */ -void task_hold_locked( - task_t task); -void task_wait_locked( - task_t task, - boolean_t until_not_runnable); -void task_release_locked( - task_t task); -void task_free( - task_t task ); -void task_synchronizer_destroy_all( - task_t task); - -int check_for_tasksuspend( - task_t task); +static void task_hold_locked(task_t task); +static void task_wait_locked(task_t task, boolean_t until_not_runnable); +static void task_release_locked(task_t task); + +static void task_synchronizer_destroy_all(task_t task); void task_backing_store_privileged( @@ -315,6 +334,26 @@ task_set_64bit( thread_mtx_lock(thread); machine_thread_switch_addrmode(thread); thread_mtx_unlock(thread); + + if (thread == current_thread()) { + uint64_t arg1, arg2; + int urgency; + spl_t spl = splsched(); + /* + * This call tell that the current thread changed it's 32bitness. + * Other thread were no more on core when 32bitness was changed, + * but current_thread() is on core and the previous call to + * machine_thread_going_on_core() gave 32bitness which is now wrong. + * + * This is needed for bring-up, a different callback should be used + * in the future. 
+ */ + thread_lock(thread); + urgency = thread_get_urgency(thread, &arg1, &arg2); + machine_thread_going_on_core(thread, urgency, 0); + thread_unlock(thread); + splx(spl); + } } #endif /* defined(__i386__) || defined(__x86_64__) || defined(__arm64__) */ @@ -375,24 +414,13 @@ task_bank_init(__unused task_t task) { #if TASK_REFERENCE_LEAK_DEBUG #include -decl_simple_lock_data(static,task_ref_lock); static btlog_t *task_ref_btlog; #define TASK_REF_OP_INCR 0x1 #define TASK_REF_OP_DECR 0x2 +#define TASK_REF_NUM_RECORDS 100000 #define TASK_REF_BTDEPTH 7 -static void -task_ref_lock_lock(void *context) -{ - simple_lock((simple_lock_t)context); -} -static void -task_ref_lock_unlock(void *context) -{ - simple_unlock((simple_lock_t)context); -} - void task_reference_internal(task_t task) { @@ -429,6 +457,7 @@ task_init(void) lck_grp_init(&task_lck_grp, "task", &task_lck_grp_attr); lck_attr_setdefault(&task_lck_attr); lck_mtx_init(&tasks_threads_lock, &task_lck_grp, &task_lck_attr); + lck_mtx_init(&tasks_corpse_lock, &task_lck_grp, &task_lck_attr); task_zone = zinit( sizeof(struct task), @@ -438,6 +467,7 @@ task_init(void) zone_change(task_zone, Z_NOENCRYPT, TRUE); + /* * Configure per-task memory limit. * The boot-arg is interpreted as Megabytes, @@ -459,7 +489,7 @@ task_init(void) } if (max_task_footprint_mb != 0) { -#if CONFIG_JETSAM +#if CONFIG_MEMORYSTATUS if (max_task_footprint_mb < 50) { printf("Warning: max_task_pmem %d below minimum.\n", max_task_footprint_mb); @@ -469,9 +499,43 @@ task_init(void) max_task_footprint_mb); max_task_footprint = (ledger_amount_t)max_task_footprint_mb * 1024 * 1024; // Convert MB to bytes + + /* + * Configure the per-task memory limit warning level. + * This is computed as a percentage. + */ + max_task_footprint_warning_level = 0; + + if (max_mem < 0x40000000) { + /* + * On devices with < 1GB of memory: + * -- set warnings to 50MB below the per-task limit. 
+ */ + if (max_task_footprint_mb > 50) { + max_task_footprint_warning_level = ((max_task_footprint_mb - 50) * 100) / max_task_footprint_mb; + } + } else { + /* + * On devices with >= 1GB of memory: + * -- set warnings to 100MB below the per-task limit. + */ + if (max_task_footprint_mb > 100) { + max_task_footprint_warning_level = ((max_task_footprint_mb - 100) * 100) / max_task_footprint_mb; + } + } + + /* + * Never allow warning level to land below the default. + */ + if (max_task_footprint_warning_level < PHYS_FOOTPRINT_WARNING_LEVEL) { + max_task_footprint_warning_level = PHYS_FOOTPRINT_WARNING_LEVEL; + } + + printf("Limiting task physical memory warning to %d%%\n", max_task_footprint_warning_level); + #else - printf("Warning: max_task_footprint specified, but jetsam not configured; ignoring.\n"); -#endif + printf("Warning: max_task_pmem specified, but jetsam not configured; ignoring.\n"); +#endif /* CONFIG_MEMORYSTATUS */ } #if MACH_ASSERT @@ -479,16 +543,12 @@ task_init(void) sizeof (pmap_ledgers_panic)); #endif /* MACH_ASSERT */ +#if CONFIG_COREDUMP if (!PE_parse_boot_argn("hwm_user_cores", &hwm_user_cores, sizeof (hwm_user_cores))) { hwm_user_cores = 0; } - - if (PE_parse_boot_argn("qos_override_mode", &qos_override_mode, sizeof(qos_override_mode))) { - printf("QOS override mode: 0x%08x\n", qos_override_mode); - } else { - qos_override_mode = QOS_OVERRIDE_MODE_FINE_GRAINED_OVERRIDE_BUT_SINGLE_MUTEX_OVERRIDE; - } +#endif proc_init_cpumon_params(); @@ -510,6 +570,18 @@ task_init(void) disable_exc_resource = 0; } + if (!PE_parse_boot_argn("task_iomon_limit_mb", &task_iomon_limit_mb, sizeof (task_iomon_limit_mb))) { + task_iomon_limit_mb = IOMON_DEFAULT_LIMIT; + } + + if (!PE_parse_boot_argn("task_iomon_interval_secs", &task_iomon_interval_secs, sizeof (task_iomon_interval_secs))) { + task_iomon_interval_secs = IOMON_DEFAULT_INTERVAL; + } + + if (!PE_parse_boot_argn("io_telemetry_limit", &io_telemetry_limit, sizeof (io_telemetry_limit))) { + io_telemetry_limit = 
IO_TELEMETRY_DEFAULT_LIMIT; + } + /* * If we have coalitions, coalition_init() will call init_task_ledgers() as it * sets up the ledgers for the default coalition. If we don't have coalitions, @@ -522,12 +594,7 @@ task_init(void) #endif /* CONFIG_COALITIONS */ #if TASK_REFERENCE_LEAK_DEBUG - simple_lock_init(&task_ref_lock, 0); - task_ref_btlog = btlog_create(100000, - TASK_REF_BTDEPTH, - task_ref_lock_lock, - task_ref_lock_unlock, - &task_ref_lock); + task_ref_btlog = btlog_create(TASK_REF_NUM_RECORDS, TASK_REF_BTDEPTH, TRUE /* caller_will_remove_entries_for_element? */); assert(task_ref_btlog); #endif @@ -535,16 +602,15 @@ task_init(void) * Create the kernel task as the first task. */ #ifdef __LP64__ - if (task_create_internal(TASK_NULL, NULL, FALSE, TRUE, &kernel_task) != KERN_SUCCESS) + if (task_create_internal(TASK_NULL, NULL, FALSE, TRUE, TF_NONE, &kernel_task) != KERN_SUCCESS) #else - if (task_create_internal(TASK_NULL, NULL, FALSE, FALSE, &kernel_task) != KERN_SUCCESS) + if (task_create_internal(TASK_NULL, NULL, FALSE, FALSE, TF_NONE, &kernel_task) != KERN_SUCCESS) #endif panic("task_init\n"); vm_map_deallocate(kernel_task->map); kernel_task->map = kernel_map; lck_spin_init(&dead_task_statistics_lock, &task_lck_grp, &task_lck_attr); - } /* @@ -614,6 +680,7 @@ host_security_create_task_token( * + iokit_mapped * + purgeable_nonvolatile * + purgeable_nonvolatile_compressed + * + page_table * * internal * The task's anonymous memory, which on iOS is always resident. 
@@ -641,6 +708,11 @@ init_task_ledgers(void) assert(task_ledger_template == NULL); assert(kernel_task == TASK_NULL); +#if MACH_ASSERT + PE_parse_boot_argn("pmap_ledgers_panic", &pmap_ledgers_panic, + sizeof (pmap_ledgers_panic)); +#endif /* MACH_ASSERT */ + if ((t = ledger_template_create("Per-task ledger")) == NULL) panic("couldn't create task ledger template"); @@ -661,6 +733,8 @@ init_task_ledgers(void) "bytes"); task_ledgers.alternate_accounting_compressed = ledger_entry_add(t, "alternate_accounting_compressed", "physmem", "bytes"); + task_ledgers.page_table = ledger_entry_add(t, "page_table", "physmem", + "bytes"); task_ledgers.phys_footprint = ledger_entry_add(t, "phys_footprint", "physmem", "bytes"); task_ledgers.internal_compressed = ledger_entry_add(t, "internal_compressed", "physmem", @@ -707,6 +781,9 @@ init_task_ledgers(void) task_ledgers.cpu_time_billed_to_me = ledger_entry_add(t, "cpu_time_billed_to_me", "sched", "ns"); task_ledgers.cpu_time_billed_to_others = ledger_entry_add(t, "cpu_time_billed_to_others", "sched", "ns"); #endif + task_ledgers.physical_writes = ledger_entry_add(t, "physical_writes", "res", "bytes"); + task_ledgers.logical_writes = ledger_entry_add(t, "logical_writes", "res", "bytes"); + if ((task_ledgers.cpu_time < 0) || (task_ledgers.tkm_private < 0) || (task_ledgers.tkm_shared < 0) || @@ -716,6 +793,7 @@ init_task_ledgers(void) (task_ledgers.iokit_mapped < 0) || (task_ledgers.alternate_accounting < 0) || (task_ledgers.alternate_accounting_compressed < 0) || + (task_ledgers.page_table < 0) || (task_ledgers.phys_footprint < 0) || (task_ledgers.internal_compressed < 0) || (task_ledgers.purgeable_volatile < 0) || @@ -723,18 +801,32 @@ init_task_ledgers(void) (task_ledgers.purgeable_volatile_compressed < 0) || (task_ledgers.purgeable_nonvolatile_compressed < 0) || (task_ledgers.platform_idle_wakeups < 0) || - (task_ledgers.interrupt_wakeups < 0) + (task_ledgers.interrupt_wakeups < 0) || #ifdef CONFIG_BANK - || 
(task_ledgers.cpu_time_billed_to_me < 0) || (task_ledgers.cpu_time_billed_to_others < 0) + (task_ledgers.cpu_time_billed_to_me < 0) || (task_ledgers.cpu_time_billed_to_others < 0) || #endif + (task_ledgers.physical_writes < 0) || + (task_ledgers.logical_writes < 0) ) { panic("couldn't create entries for task ledger template"); } + ledger_track_credit_only(t, task_ledgers.phys_footprint); + ledger_track_credit_only(t, task_ledgers.internal); + ledger_track_credit_only(t, task_ledgers.internal_compressed); + ledger_track_credit_only(t, task_ledgers.iokit_mapped); + ledger_track_credit_only(t, task_ledgers.alternate_accounting); + ledger_track_credit_only(t, task_ledgers.alternate_accounting_compressed); + ledger_track_credit_only(t, task_ledgers.purgeable_volatile); + ledger_track_credit_only(t, task_ledgers.purgeable_nonvolatile); + ledger_track_credit_only(t, task_ledgers.purgeable_volatile_compressed); + ledger_track_credit_only(t, task_ledgers.purgeable_nonvolatile_compressed); + ledger_track_maximum(t, task_ledgers.phys_footprint, 60); #if MACH_ASSERT if (pmap_ledgers_panic) { ledger_panic_on_negative(t, task_ledgers.phys_footprint); + ledger_panic_on_negative(t, task_ledgers.page_table); ledger_panic_on_negative(t, task_ledgers.internal); ledger_panic_on_negative(t, task_ledgers.internal_compressed); ledger_panic_on_negative(t, task_ledgers.iokit_mapped); @@ -747,13 +839,14 @@ init_task_ledgers(void) } #endif /* MACH_ASSERT */ -#if CONFIG_JETSAM +#if CONFIG_MEMORYSTATUS ledger_set_callback(t, task_ledgers.phys_footprint, task_footprint_exceeded, NULL, NULL); -#endif +#endif /* CONFIG_MEMORYSTATUS */ ledger_set_callback(t, task_ledgers.interrupt_wakeups, task_wakeups_rate_exceeded, NULL, NULL); - + ledger_set_callback(t, task_ledgers.physical_writes, task_io_rate_exceeded, (void *)FLAVOR_IO_PHYSICAL_WRITES, NULL); + ledger_set_callback(t, task_ledgers.logical_writes, task_io_rate_exceeded, (void *)FLAVOR_IO_LOGICAL_WRITES, NULL); task_ledger_template = t; } @@ 
-763,6 +856,7 @@ task_create_internal( coalition_t *parent_coalitions __unused, boolean_t inherit_memory, boolean_t is_64bit, + uint32_t t_flags, task_t *child_task) /* OUT */ { task_t new_task; @@ -792,8 +886,8 @@ task_create_internal( #endif /* if inherit_memory is true, parent_task MUST not be NULL */ - if (inherit_memory) - new_task->map = vm_map_fork(ledger, parent_task->map); + if (!(t_flags & TF_CORPSE_FORK) && inherit_memory) + new_task->map = vm_map_fork(ledger, parent_task->map, 0); else new_task->map = vm_map_create(pmap_create(ledger, 0, is_64bit), (vm_map_offset_t)(VM_MIN_ADDRESS), @@ -813,17 +907,11 @@ task_create_internal( new_task->active = TRUE; new_task->halting = FALSE; new_task->user_data = NULL; - new_task->faults = 0; - new_task->cow_faults = 0; - new_task->pageins = 0; - new_task->messages_sent = 0; - new_task->messages_received = 0; - new_task->syscalls_mach = 0; new_task->priv_flags = 0; - new_task->syscalls_unix=0; - new_task->c_switch = new_task->p_switch = new_task->ps_switch = 0; - new_task->t_flags = 0; + new_task->t_flags = t_flags; new_task->importance = 0; + new_task->corpse_info_kernel = NULL; + new_task->exec_token = 0; #if CONFIG_ATM new_task->atm_context = NULL; @@ -832,18 +920,20 @@ task_create_internal( new_task->bank_context = NULL; #endif - zinfo_task_init(new_task); - #ifdef MACH_BSD new_task->bsd_info = NULL; new_task->corpse_info = NULL; #endif /* MACH_BSD */ -#if CONFIG_JETSAM +#if CONFIG_MACF + new_task->crash_label = NULL; +#endif + +#if CONFIG_MEMORYSTATUS if (max_task_footprint != 0) { ledger_set_limit(ledger, task_ledgers.phys_footprint, max_task_footprint, PHYS_FOOTPRINT_WARNING_LEVEL); } -#endif +#endif /* CONFIG_MEMORYSTATUS */ if (task_wakeups_monitor_rate != 0) { uint32_t flags = WAKEMON_ENABLE | WAKEMON_SET_DEFAULTS; @@ -851,20 +941,26 @@ task_create_internal( task_wakeups_monitor_ctl(new_task, &flags, &rate); } +#if CONFIG_IO_ACCOUNTING + uint32_t flags = IOMON_ENABLE; + task_io_monitor_ctl(new_task, &flags); 
+#endif /* CONFIG_IO_ACCOUNTING */ + #if defined(__i386__) || defined(__x86_64__) new_task->i386_ldt = 0; #endif new_task->task_debug = NULL; +#if DEVELOPMENT || DEBUG + new_task->task_unnested = FALSE; + new_task->task_disconnected_count = 0; +#endif queue_init(&new_task->semaphore_list); new_task->semaphores_owned = 0; ipc_task_init(new_task, parent_task); - new_task->total_user_time = 0; - new_task->total_system_time = 0; - new_task->vtimers = 0; new_task->shared_region = NULL; @@ -888,11 +984,6 @@ task_create_internal( #endif /* HYPERVISOR */ - new_task->low_mem_notified_warn = 0; - new_task->low_mem_notified_critical = 0; - new_task->low_mem_privileged_listener = 0; - new_task->purged_memory_warn = 0; - new_task->purged_memory_critical = 0; new_task->mem_notify_reserved = 0; #if IMPORTANCE_INHERITANCE new_task->task_imp_base = NULL; @@ -904,7 +995,6 @@ task_create_internal( new_task->requested_policy = default_task_requested_policy; new_task->effective_policy = default_task_effective_policy; - new_task->pended_policy = default_task_pended_policy; if (parent_task != TASK_NULL) { new_task->sec_token = parent_task->sec_token; @@ -959,19 +1049,7 @@ task_create_internal( new_task->priority = BASEPRI_DEFAULT; new_task->max_priority = MAXPRI_USER; - new_task->requested_policy.t_apptype = parent_task->requested_policy.t_apptype; - - new_task->requested_policy.int_darwinbg = parent_task->requested_policy.int_darwinbg; - new_task->requested_policy.ext_darwinbg = parent_task->requested_policy.ext_darwinbg; - new_task->requested_policy.int_iotier = parent_task->requested_policy.int_iotier; - new_task->requested_policy.ext_iotier = parent_task->requested_policy.ext_iotier; - new_task->requested_policy.int_iopassive = parent_task->requested_policy.int_iopassive; - new_task->requested_policy.ext_iopassive = parent_task->requested_policy.ext_iopassive; - new_task->requested_policy.bg_iotier = parent_task->requested_policy.bg_iotier; - new_task->requested_policy.terminated = 
parent_task->requested_policy.terminated; - new_task->requested_policy.t_qos_clamp = parent_task->requested_policy.t_qos_clamp; - - task_policy_create(new_task, parent_task->requested_policy.t_boosted); + task_policy_create(new_task, parent_task); } else { new_task->sec_token = KERNEL_SECURITY_TOKEN; new_task->audit_token = KERNEL_AUDIT_TOKEN; @@ -1001,32 +1079,90 @@ task_create_internal( new_task->task_io_stats = (io_stat_info_t)kalloc(sizeof(struct io_stat_info)); assert(new_task->task_io_stats != NULL); bzero(new_task->task_io_stats, sizeof(struct io_stat_info)); - new_task->task_immediate_writes = 0; - new_task->task_deferred_writes = 0; - new_task->task_invalidated_writes = 0; - new_task->task_metadata_writes = 0; bzero(&(new_task->cpu_time_qos_stats), sizeof(struct _cpu_time_qos_stats)); bzero(&new_task->extmod_statistics, sizeof(new_task->extmod_statistics)); - new_task->task_timer_wakeups_bin_1 = new_task->task_timer_wakeups_bin_2 = 0; - new_task->task_gpu_ns = 0; -#if CONFIG_COALITIONS + /* Copy resource acc. info from Parent for Corpe Forked task. 
*/ + if (parent_task != NULL && (t_flags & TF_CORPSE_FORK)) { + new_task->total_user_time = parent_task->total_user_time; + new_task->total_system_time = parent_task->total_system_time; + ledger_rollup(new_task->ledger, parent_task->ledger); + new_task->faults = parent_task->faults; + new_task->pageins = parent_task->pageins; + new_task->cow_faults = parent_task->cow_faults; + new_task->messages_sent = parent_task->messages_sent; + new_task->messages_received = parent_task->messages_received; + new_task->syscalls_mach = parent_task->syscalls_mach; + new_task->syscalls_unix = parent_task->syscalls_unix; + new_task->c_switch = parent_task->c_switch; + new_task->p_switch = parent_task->p_switch; + new_task->ps_switch = parent_task->ps_switch; + new_task->extmod_statistics = parent_task->extmod_statistics; + new_task->low_mem_notified_warn = parent_task->low_mem_notified_warn; + new_task->low_mem_notified_critical = parent_task->low_mem_notified_critical; + new_task->purged_memory_warn = parent_task->purged_memory_warn; + new_task->purged_memory_critical = parent_task->purged_memory_critical; + new_task->low_mem_privileged_listener = parent_task->low_mem_privileged_listener; + *new_task->task_io_stats = *parent_task->task_io_stats; + new_task->cpu_time_qos_stats = parent_task->cpu_time_qos_stats; + new_task->task_timer_wakeups_bin_1 = parent_task->task_timer_wakeups_bin_1; + new_task->task_timer_wakeups_bin_2 = parent_task->task_timer_wakeups_bin_2; + new_task->task_gpu_ns = parent_task->task_gpu_ns; + new_task->task_immediate_writes = parent_task->task_immediate_writes; + new_task->task_deferred_writes = parent_task->task_deferred_writes; + new_task->task_invalidated_writes = parent_task->task_invalidated_writes; + new_task->task_metadata_writes = parent_task->task_metadata_writes; + new_task->task_energy = parent_task->task_energy; + } else { + /* Initialize to zero for standard fork/spawn case */ + new_task->total_user_time = 0; + new_task->total_system_time = 0; + 
new_task->faults = 0; + new_task->pageins = 0; + new_task->cow_faults = 0; + new_task->messages_sent = 0; + new_task->messages_received = 0; + new_task->syscalls_mach = 0; + new_task->syscalls_unix = 0; + new_task->c_switch = 0; + new_task->p_switch = 0; + new_task->ps_switch = 0; + new_task->low_mem_notified_warn = 0; + new_task->low_mem_notified_critical = 0; + new_task->purged_memory_warn = 0; + new_task->purged_memory_critical = 0; + new_task->low_mem_privileged_listener = 0; + new_task->task_timer_wakeups_bin_1 = 0; + new_task->task_timer_wakeups_bin_2 = 0; + new_task->task_gpu_ns = 0; + new_task->task_immediate_writes = 0; + new_task->task_deferred_writes = 0; + new_task->task_invalidated_writes = 0; + new_task->task_metadata_writes = 0; + new_task->task_energy = 0; + } - /* TODO: there is no graceful failure path here... */ - if (parent_coalitions && parent_coalitions[COALITION_TYPE_RESOURCE]) { - coalitions_adopt_task(parent_coalitions, new_task); - } else if (parent_task && parent_task->coalition[COALITION_TYPE_RESOURCE]) { - /* - * all tasks at least have a resource coalition, so - * if the parent has one then inherit all coalitions - * the parent is a part of - */ - coalitions_adopt_task(parent_task->coalition, new_task); + +#if CONFIG_COALITIONS + if (!(t_flags & TF_CORPSE_FORK)) { + /* TODO: there is no graceful failure path here... 
*/ + if (parent_coalitions && parent_coalitions[COALITION_TYPE_RESOURCE]) { + coalitions_adopt_task(parent_coalitions, new_task); + } else if (parent_task && parent_task->coalition[COALITION_TYPE_RESOURCE]) { + /* + * all tasks at least have a resource coalition, so + * if the parent has one then inherit all coalitions + * the parent is a part of + */ + coalitions_adopt_task(parent_task->coalition, new_task); + } else { + /* TODO: assert that new_task will be PID 1 (launchd) */ + coalitions_adopt_init_task(new_task); + } } else { - /* TODO: assert that new_task will be PID 1 (launchd) */ - coalitions_adopt_init_task(new_task); + coalitions_adopt_corpse_task(new_task); } if (new_task->coalition[COALITION_TYPE_RESOURCE] == COALITION_NULL) { @@ -1047,6 +1183,12 @@ task_create_internal( new_task->task_purgeable_disowning = FALSE; new_task->task_purgeable_disowned = FALSE; +#if CONFIG_SECLUDED_MEMORY + new_task->task_can_use_secluded_mem = FALSE; + new_task->task_could_use_secluded_mem = FALSE; + new_task->task_could_also_use_secluded_mem = FALSE; +#endif /* CONFIG_SECLUDED_MEMORY */ + queue_init(&new_task->io_user_clients); ipc_task_enable(new_task); @@ -1193,15 +1335,12 @@ task_deallocate( OSAddAtomic64(debit, (int64_t *)&tasks_tkm_shared.free); } ledger_dereference(task->ledger); - zinfo_task_free(task); #if TASK_REFERENCE_LEAK_DEBUG btlog_remove_entries_for_element(task_ref_btlog, task); #endif #if CONFIG_COALITIONS - if (!task->coalition[COALITION_TYPE_RESOURCE]) - panic("deallocating task was not a member of a resource coalition"); task_release_coalitions(task); #endif /* CONFIG_COALITIONS */ @@ -1210,10 +1349,19 @@ task_deallocate( #if MACH_BSD /* clean up collected information since last reference to task is gone */ if (task->corpse_info) { - task_crashinfo_destroy(task->corpse_info); + task_crashinfo_destroy(task->corpse_info, RELEASE_CORPSE_REF); task->corpse_info = NULL; } #endif + if (task->corpse_info_kernel) { + kfree(task->corpse_info_kernel, 
CORPSEINFO_ALLOCATION_SIZE); + } + +#if CONFIG_MACF + if (task->crash_label) { + mac_exc_action_label_task_destroy(task); + } +#endif zfree(task_zone, task); } @@ -1249,46 +1397,73 @@ task_suspension_token_deallocate( * collect crash info from bsd and mach based data */ kern_return_t -task_collect_crash_info(task_t task) +task_collect_crash_info(task_t task, struct proc *proc, int is_corpse_fork) { kern_return_t kr = KERN_SUCCESS; kcdata_descriptor_t crash_data = NULL; kcdata_descriptor_t crash_data_release = NULL; mach_msg_type_number_t size = CORPSEINFO_ALLOCATION_SIZE; - mach_vm_offset_t crash_data_user_ptr = 0; + mach_vm_offset_t crash_data_ptr = 0; + void *crash_data_kernel = NULL; + void *crash_data_kernel_release = NULL; + int corpse_blob_kernel_alloc = (is_corpse_fork || unify_corpse_blob_alloc); if (!corpses_enabled()) { return KERN_NOT_SUPPORTED; } task_lock(task); - assert(task->bsd_info != NULL); - if (task->corpse_info == NULL && task->bsd_info != NULL) { + + assert(is_corpse_fork || task->bsd_info != NULL); + if (task->corpse_info == NULL && (is_corpse_fork || task->bsd_info != NULL)) { +#if CONFIG_MACF + /* Update the corpse label, used by the exception delivery mac hook */ + mac_exc_action_label_task_update(task, proc); +#endif task_unlock(task); - /* map crash data memory in task's vm map */ - kr = mach_vm_allocate(task->map, &crash_data_user_ptr, size, (VM_MAKE_TAG(VM_MEMORY_CORPSEINFO) | VM_FLAGS_ANYWHERE)); + if (!corpse_blob_kernel_alloc) { + /* map crash data memory in task's vm map */ + kr = mach_vm_allocate(task->map, &crash_data_ptr, size, (VM_MAKE_TAG(VM_MEMORY_CORPSEINFO) | VM_FLAGS_ANYWHERE)); + } else { + crash_data_kernel = (void *) kalloc(CORPSEINFO_ALLOCATION_SIZE); + if (crash_data_kernel == 0) + kr = KERN_RESOURCE_SHORTAGE; + bzero(crash_data_kernel, CORPSEINFO_ALLOCATION_SIZE); + crash_data_ptr = (mach_vm_offset_t) crash_data_kernel; + } if (kr != KERN_SUCCESS) goto out_no_lock; - crash_data = 
task_crashinfo_alloc_init((mach_vm_address_t)crash_data_user_ptr, size); + /* Do not get a corpse ref for corpse fork */ + crash_data = task_crashinfo_alloc_init((mach_vm_address_t)crash_data_ptr, size, is_corpse_fork ? !GET_CORPSE_REF : GET_CORPSE_REF, corpse_blob_kernel_alloc ? KCFLAG_USE_MEMCOPY: KCFLAG_USE_COPYOUT); if (crash_data) { task_lock(task); crash_data_release = task->corpse_info; + crash_data_kernel_release = task->corpse_info_kernel; task->corpse_info = crash_data; + task->corpse_info_kernel = crash_data_kernel; + task_unlock(task); kr = KERN_SUCCESS; } else { /* if failed to create corpse info, free the mapping */ - if (KERN_SUCCESS != mach_vm_deallocate(task->map, crash_data_user_ptr, size)) { - printf("mach_vm_deallocate failed to clear corpse_data for pid %d.\n", task_pid(task)); + if (!corpse_blob_kernel_alloc) { + if (KERN_SUCCESS != mach_vm_deallocate(task->map, crash_data_ptr, size)) { + printf("mach_vm_deallocate failed to clear corpse_data for pid %d.\n", task_pid(task)); + } + } else { + kfree(crash_data_kernel, CORPSEINFO_ALLOCATION_SIZE); } kr = KERN_FAILURE; } if (crash_data_release != NULL) { - task_crashinfo_destroy(crash_data_release); + task_crashinfo_destroy(crash_data_release, is_corpse_fork ? !RELEASE_CORPSE_REF : RELEASE_CORPSE_REF); + } + if (crash_data_kernel_release != NULL) { + kfree(crash_data_kernel_release, CORPSEINFO_ALLOCATION_SIZE); } } else { task_unlock(task); @@ -1304,58 +1479,66 @@ task_collect_crash_info(task_t task) * Makes outcall to registered host port for a corpse. 
*/ kern_return_t -task_deliver_crash_notification(task_t task) +task_deliver_crash_notification(task_t task, thread_t thread, mach_exception_data_type_t subcode) { kcdata_descriptor_t crash_info = task->corpse_info; thread_t th_iter = NULL; kern_return_t kr = KERN_SUCCESS; wait_interrupt_t wsave; mach_exception_data_type_t code[EXCEPTION_CODE_MAX]; + ipc_port_t task_port, old_notify; if (crash_info == NULL) return KERN_FAILURE; - code[0] = crash_info->kcd_addr_begin; - code[1] = crash_info->kcd_length; - task_lock(task); + if (task_is_a_corpse_fork(task)) { + /* Populate code with EXC_RESOURCE for corpse fork */ + code[0] = EXC_RESOURCE; + code[1] = subcode; + } else if (unify_corpse_blob_alloc) { + /* Populate code with EXC_CRASH for corpses */ + code[0] = EXC_CRASH; + code[1] = 0; + /* Update the code[1] if the boot-arg corpse_for_fatal_memkill is set */ + if (corpse_for_fatal_memkill) { + code[1] = subcode; + } + } else { + /* Populate code with address and length for EXC_CRASH */ + code[0] = crash_info->kcd_addr_begin; + code[1] = crash_info->kcd_length; + } queue_iterate(&task->threads, th_iter, thread_t, task_threads) { - ipc_thread_reset(th_iter); + if (th_iter->corpse_dup == FALSE) { + ipc_thread_reset(th_iter); + } } task_unlock(task); + /* Arm the no-sender notification for taskport */ + task_reference(task); + task_port = convert_task_to_port(task); + ip_lock(task_port); + assert(ip_active(task_port)); + ipc_port_nsrequest(task_port, task_port->ip_mscount, ipc_port_make_sonce_locked(task_port), &old_notify); + /* port unlocked */ + assert(IP_NULL == old_notify); + wsave = thread_interrupt_level(THREAD_UNINT); - kr = exception_triage(EXC_CORPSE_NOTIFY, code, EXCEPTION_CODE_MAX); + kr = exception_triage_thread(EXC_CORPSE_NOTIFY, code, EXCEPTION_CODE_MAX, thread); if (kr != KERN_SUCCESS) { printf("Failed to send exception EXC_CORPSE_NOTIFY. error code: %d for pid %d\n", kr, task_pid(task)); } - /* - * crash reporting is done. 
Now release threads - * for reaping by thread_terminate_daemon - */ - task_lock(task); - assert(task->active_thread_count == 0); - queue_iterate(&task->threads, th_iter, thread_t, task_threads) - { - thread_mtx_lock(th_iter); - assert(th_iter->inspection == TRUE); - th_iter->inspection = FALSE; - /* now that the corpse has been autopsied, dispose of the thread name */ - uthread_cleanup_name(th_iter->uthread); - thread_mtx_unlock(th_iter); - } - - thread_terminate_crashed_threads(); - /* remove the pending corpse report flag */ - task_clear_corpse_pending_report(task); - - task_unlock(task); - (void)thread_interrupt_level(wsave); - task_terminate_internal(task); + /* + * Drop the send right on task port, will fire the + * no-sender notification if exception deliver failed. + */ + ipc_port_release_send(task_port); return kr; } @@ -1415,7 +1598,7 @@ task_mark_corpse(task_t task) assert(task == current_task()); assert(!task_is_a_corpse(task)); - kr = task_collect_crash_info(task); + kr = task_collect_crash_info(task, (struct proc*)task->bsd_info, FALSE); if (kr != KERN_SUCCESS) { return kr; } @@ -1430,20 +1613,265 @@ task_mark_corpse(task_t task) kr = task_start_halt_locked(task, TRUE); assert(kr == KERN_SUCCESS); + ipc_task_reset(task); + /* Remove the naked send right for task port, needed to arm no sender notification */ + task_set_special_port(task, TASK_KERNEL_PORT, IPC_PORT_NULL); ipc_task_enable(task); task_unlock(task); /* terminate the ipc space */ ipc_space_terminate(task->itk_space); + + /* Add it to global corpse task list */ + task_add_to_corpse_task_list(task); task_start_halt(task); thread_terminate_internal(self_thread); + (void) thread_interrupt_level(wsave); assert(task->halting == TRUE); return kr; } +/* + * task_clear_corpse + * + * Clears the corpse pending bit on task. + * Removes inspection bit on the threads. 
+ */ +void +task_clear_corpse(task_t task) +{ + thread_t th_iter = NULL; + + task_lock(task); + queue_iterate(&task->threads, th_iter, thread_t, task_threads) + { + thread_mtx_lock(th_iter); + th_iter->inspection = FALSE; + thread_mtx_unlock(th_iter); + } + + thread_terminate_crashed_threads(); + /* remove the pending corpse report flag */ + task_clear_corpse_pending_report(task); + + task_unlock(task); +} + +/* + * task_port_notify + * + * Called whenever the Mach port system detects no-senders on + * the task port of a corpse. + * Each notification that comes in should terminate the task (corpse). + */ +void +task_port_notify(mach_msg_header_t *msg) +{ + mach_no_senders_notification_t *notification = (void *)msg; + ipc_port_t port = notification->not_header.msgh_remote_port; + task_t task; + + assert(ip_active(port)); + assert(IKOT_TASK == ip_kotype(port)); + task = (task_t) port->ip_kobject; + + assert(task_is_a_corpse(task)); + + /* Remove the task from global corpse task list */ + task_remove_from_corpse_task_list(task); + + task_clear_corpse(task); + task_terminate_internal(task); +} + +/* + * task_wait_till_threads_terminate_locked + * + * Wait till all the threads in the task are terminated. + * Might release the task lock and re-acquire it. + */ +void +task_wait_till_threads_terminate_locked(task_t task) +{ + /* wait for all the threads in the task to terminate */ + while (task->active_thread_count != 0) { + assert_wait((event_t)&task->active_thread_count, THREAD_UNINT); + task_unlock(task); + thread_block(THREAD_CONTINUE_NULL); + + task_lock(task); + } +} + +/* + * task_duplicate_map_and_threads + * + * Copy vmmap of source task. + * Copy active threads from source task to destination task. + * Source task would be suspended during the copy. 
+ */ +kern_return_t +task_duplicate_map_and_threads( + task_t task, + void *p, + task_t new_task, + thread_t *thread_ret, + int is64bit, + uint64_t **udata_buffer, + int *size, + int *num_udata) +{ + kern_return_t kr = KERN_SUCCESS; + int active; + thread_t thread, self, thread_return = THREAD_NULL; + thread_t new_thread = THREAD_NULL; + thread_t *thread_array; + uint32_t active_thread_count = 0, array_count = 0, i; + vm_map_t oldmap; + uint64_t *buffer = NULL; + int buf_size = 0; + int est_knotes = 0, num_knotes = 0; + + self = current_thread(); + + /* + * Suspend the task to copy thread state, use the internal + * variant so that no user-space process can resume + * the task from under us + */ + kr = task_suspend_internal(task); + if (kr != KERN_SUCCESS) { + return kr; + } + + if (task->map->disable_vmentry_reuse == TRUE) { + /* + * Quite likely GuardMalloc (or some debugging tool) + * is being used on this task. And it has gone through + * its limit. Making a corpse will likely encounter + * a lot of VM entries that will need COW. + * + * Skip it. 
+ */ + task_resume_internal(task); + return KERN_FAILURE; + } + + /* Setup new task's vmmap, switch from parent task's map to it COW map */ + oldmap = new_task->map; + new_task->map = vm_map_fork(new_task->ledger, + task->map, + (VM_MAP_FORK_SHARE_IF_INHERIT_NONE | + VM_MAP_FORK_PRESERVE_PURGEABLE)); + vm_map_deallocate(oldmap); + + if (is64bit) { + vm_map_set_64bit(get_task_map(new_task)); + } else { + vm_map_set_32bit(get_task_map(new_task)); + } + + /* Get all the udata pointers from kqueue */ + est_knotes = proc_list_uptrs(p, NULL, 0); + if (est_knotes > 0) { + buf_size = (est_knotes + 32) * sizeof(uint64_t); + buffer = (uint64_t *) kalloc(buf_size); + num_knotes = proc_list_uptrs(p, buffer, buf_size); + if (num_knotes > est_knotes + 32) { + num_knotes = est_knotes + 32; + } + } + + active_thread_count = task->active_thread_count; + if (active_thread_count == 0) { + if (buffer != NULL) { + kfree(buffer, buf_size); + } + task_resume_internal(task); + return KERN_FAILURE; + } + + thread_array = (thread_t *) kalloc(sizeof(thread_t) * active_thread_count); + + /* Iterate all the threads and drop the task lock before calling thread_create_with_continuation */ + task_lock(task); + queue_iterate(&task->threads, thread, thread_t, task_threads) { + /* Skip inactive threads */ + active = thread->active; + if (!active) { + continue; + } + + if (array_count >= active_thread_count) { + break; + } + + thread_array[array_count++] = thread; + thread_reference(thread); + } + task_unlock(task); + + for (i = 0; i < array_count; i++) { + + kr = thread_create_with_continuation(new_task, &new_thread, (thread_continue_t)thread_corpse_continue); + if (kr != KERN_SUCCESS) { + break; + } + + /* Equivalent of current thread in corpse */ + if (thread_array[i] == self) { + thread_return = new_thread; + } else { + /* drop the extra ref returned by thread_create_with_continuation */ + thread_deallocate(new_thread); + } + + kr = thread_dup2(thread_array[i], new_thread); + if (kr != 
KERN_SUCCESS) { + thread_mtx_lock(new_thread); + new_thread->corpse_dup = TRUE; + thread_mtx_unlock(new_thread); + continue; + } + + /* Copy thread name */ + bsd_copythreadname(new_thread->uthread, thread_array[i]->uthread); + thread_copy_resource_info(new_thread, thread_array[i]); + } + + task_resume_internal(task); + + for (i = 0; i < array_count; i++) { + thread_deallocate(thread_array[i]); + } + kfree(thread_array, sizeof(thread_t) * active_thread_count); + + if (kr == KERN_SUCCESS) { + *thread_ret = thread_return; + *udata_buffer = buffer; + *size = buf_size; + *num_udata = num_knotes; + } else { + if (thread_return != THREAD_NULL) { + thread_deallocate(thread_return); + } + if (buffer != NULL) { + kfree(buffer, buf_size); + } + } + + return kr; +} + +#if CONFIG_SECLUDED_MEMORY +extern void task_set_can_use_secluded_mem_locked( + task_t task, + boolean_t can_use_secluded_mem); +#endif /* CONFIG_SECLUDED_MEMORY */ + kern_return_t task_terminate_internal( task_t task) @@ -1474,6 +1902,14 @@ task_terminate_internal( task_lock(task); } +#if CONFIG_SECLUDED_MEMORY + if (task->task_can_use_secluded_mem) { + task_set_can_use_secluded_mem_locked(task, FALSE); + } + task->task_could_use_secluded_mem = FALSE; + task->task_could_also_use_secluded_mem = FALSE; +#endif /* CONFIG_SECLUDED_MEMORY */ + if (!task->active) { /* * Task is already being terminated. 
@@ -1545,7 +1981,7 @@ task_terminate_internal( task_unlock(task); - proc_set_task_policy(task, THREAD_NULL, TASK_POLICY_ATTRIBUTE, + proc_set_task_policy(task, TASK_POLICY_ATTRIBUTE, TASK_POLICY_TERMINATED, TASK_POLICY_ENABLE); /* Early object reap phase */ @@ -1698,7 +2134,7 @@ task_start_halt_locked(task_t task, boolean_t should_mark_corpse) self = current_thread(); - if (task != self->task) + if (task != self->task && !task_is_a_corpse_fork(task)) return (KERN_INVALID_ARGUMENT); if (task->halting || !task->active || !self->active) { @@ -1798,6 +2234,12 @@ task_complete_halt(task_t task) /* no unnesting on final cleanup: */ VM_MAP_REMOVE_NO_UNNESTING); + /* + * Kick out any IOKitUser handles to the task. At best they're stale, + * at worst someone is racing a SUID exec. + */ + iokit_task_terminate(task); + task->halting = FALSE; } @@ -1812,9 +2254,9 @@ task_complete_halt(task_t task) */ void task_hold_locked( - register task_t task) + task_t task) { - register thread_t thread; + thread_t thread; assert(task->active); @@ -1844,7 +2286,7 @@ task_hold_locked( */ kern_return_t task_hold( - register task_t task) + task_t task) { if (task == TASK_NULL) return (KERN_INVALID_ARGUMENT); @@ -1895,10 +2337,10 @@ task_wait( */ void task_wait_locked( - register task_t task, + task_t task, boolean_t until_not_runnable) { - register thread_t thread, self; + thread_t thread, self; assert(task->active); assert(task->suspend_count > 0); @@ -1925,9 +2367,9 @@ task_wait_locked( */ void task_release_locked( - register task_t task) + task_t task) { - register thread_t thread; + thread_t thread; assert(task->active); assert(task->suspend_count > 0); @@ -2087,13 +2529,18 @@ task_threads( static kern_return_t place_task_hold ( - register task_t task, + task_t task, int mode) { - if (!task->active) { + if (!task->active && !task_is_a_corpse(task)) { return (KERN_FAILURE); } + /* Return success for corpse task */ + if (task_is_a_corpse(task)) { + return KERN_SUCCESS; + } + 
KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_IPC,MACH_TASK_SUSPEND) | DBG_FUNC_NONE, task_pid(task), ((thread_t)queue_first(&task->threads))->thread_id, @@ -2128,14 +2575,19 @@ place_task_hold ( static kern_return_t release_task_hold ( - register task_t task, + task_t task, int mode) { - register boolean_t release = FALSE; + boolean_t release = FALSE; - if (!task->active) { + if (!task->active && !task_is_a_corpse(task)) { return (KERN_FAILURE); } + + /* Return success for corpse task */ + if (task_is_a_corpse(task)) { + return KERN_SUCCESS; + } if (mode == TASK_HOLD_PIDSUSPEND) { if (task->pidsuspended == FALSE) { @@ -2206,7 +2658,7 @@ release_task_hold ( */ kern_return_t task_suspend( - register task_t task) + task_t task) { kern_return_t kr; mach_port_t port, send, old_notify; @@ -2280,7 +2732,7 @@ task_suspend( */ kern_return_t task_resume( - register task_t task) + task_t task) { kern_return_t kr; mach_port_name_t resume_port_name; @@ -2344,7 +2796,7 @@ task_suspend_internal(task_t task) */ kern_return_t task_suspend2( - register task_t task, + task_t task, task_suspension_token_t *suspend_token) { kern_return_t kr; @@ -2372,7 +2824,7 @@ task_suspend2( */ kern_return_t task_resume_internal( - register task_suspension_token_t task) + task_suspension_token_t task) { kern_return_t kr; @@ -2390,7 +2842,7 @@ task_resume_internal( */ kern_return_t task_resume2( - register task_suspension_token_t task) + task_suspension_token_t task) { kern_return_t kr; @@ -2485,7 +2937,7 @@ task_pidsuspend_locked(task_t task) */ kern_return_t task_pidsuspend( - register task_t task) + task_t task) { kern_return_t kr; @@ -2501,9 +2953,6 @@ task_pidsuspend( return (kr); } -/* If enabled, we bring all the frozen pages back in prior to resumption; otherwise, they're faulted back in on demand */ -#define THAW_ON_RESUME 1 - /* * task_pidresume: * Resumes a previously suspended task. 
@@ -2513,7 +2962,7 @@ task_pidsuspend( */ kern_return_t task_pidresume( - register task_t task) + task_t task) { kern_return_t kr; @@ -2522,7 +2971,7 @@ task_pidresume( task_lock(task); -#if (CONFIG_FREEZE && THAW_ON_RESUME) +#if CONFIG_FREEZE while (task->changing_freeze_state) { @@ -2539,17 +2988,8 @@ task_pidresume( task_unlock(task); -#if (CONFIG_FREEZE && THAW_ON_RESUME) - if ((kr == KERN_SUCCESS) && (task->frozen == TRUE)) { - - if (COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) { - - kr = KERN_SUCCESS; - } else { +#if CONFIG_FREEZE - kr = vm_map_thaw(task->map); - } - } task_lock(task); if (kr == KERN_SUCCESS) @@ -2563,6 +3003,84 @@ task_pidresume( return (kr); } + +#if DEVELOPMENT || DEBUG + +extern void IOSleep(int); + +kern_return_t +task_disconnect_page_mappings(task_t task) +{ + int n; + + if (task == TASK_NULL || task == kernel_task) + return (KERN_INVALID_ARGUMENT); + + /* + * this function is used to strip all of the mappings from + * the pmap for the specified task to force the task to + * re-fault all of the pages it is actively using... this + * allows us to approximate the true working set of the + * specified task. We only engage if at least 1 of the + * threads in the task is runnable, but we want to continuously + * sweep (at least for a while - I've arbitrarily set the limit at + * 100 sweeps to be re-looked at as we gain experience) to get a better + * view into what areas within a page are being visited (as opposed to only + * seeing the first fault of a page after the task becomes + * runnable)... 
in the future I may + * try to block until awakened by a thread in this task + * being made runnable, but for now we'll periodically poll from the + * user level debug tool driving the sysctl + */ + for (n = 0; n < 100; n++) { + thread_t thread; + boolean_t runnable; + boolean_t do_unnest; + int page_count; + + runnable = FALSE; + do_unnest = FALSE; + + task_lock(task); + + queue_iterate(&task->threads, thread, thread_t, task_threads) { + + if (thread->state & TH_RUN) { + runnable = TRUE; + break; + } + } + if (n == 0) + task->task_disconnected_count++; + + if (task->task_unnested == FALSE) { + if (runnable == TRUE) { + task->task_unnested = TRUE; + do_unnest = TRUE; + } + } + task_unlock(task); + + if (runnable == FALSE) + break; + + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_DISCONNECT_TASK_PAGE_MAPPINGS)) | DBG_FUNC_START, + task, do_unnest, task->task_disconnected_count, 0, 0); + + page_count = vm_map_disconnect_page_mappings(task->map, do_unnest); + + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_DISCONNECT_TASK_PAGE_MAPPINGS)) | DBG_FUNC_END, + task, page_count, 0, 0, 0); + + if ((n % 5) == 4) + IOSleep(1); + } + return (KERN_SUCCESS); +} + +#endif + + #if CONFIG_FREEZE /* @@ -2578,7 +3096,7 @@ extern queue_head_t c_swapout_list_head; kern_return_t task_freeze( - register task_t task, + task_t task, uint32_t *purgeable_count, uint32_t *wired_count, uint32_t *clean_count, @@ -2587,7 +3105,7 @@ task_freeze( boolean_t *shared, boolean_t walk_only) { - kern_return_t kr; + kern_return_t kr = KERN_SUCCESS; if (task == TASK_NULL || task == kernel_task) return (KERN_INVALID_ARGUMENT); @@ -2611,7 +3129,7 @@ task_freeze( task_unlock(task); if (walk_only) { - kr = vm_map_freeze_walk(task->map, purgeable_count, wired_count, clean_count, dirty_count, dirty_budget, shared); + panic("task_freeze - walk_only == TRUE"); } else { kr = vm_map_freeze(task->map, purgeable_count, wired_count, clean_count, 
dirty_count, dirty_budget, shared); } @@ -2625,7 +3143,7 @@ task_freeze( task_unlock(task); - if (COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) { + if (VM_CONFIG_COMPRESSOR_IS_PRESENT) { vm_wake_compactor_swapper(); /* * We do an explicit wakeup of the swapout thread here @@ -2650,10 +3168,8 @@ task_freeze( */ kern_return_t task_thaw( - register task_t task) + task_t task) { - kern_return_t kr; - if (task == TASK_NULL || task == kernel_task) return (KERN_INVALID_ARGUMENT); @@ -2671,32 +3187,11 @@ task_thaw( task_unlock(task); return (KERN_FAILURE); } - task->changing_freeze_state = TRUE; - - if (DEFAULT_PAGER_IS_ACTIVE || DEFAULT_FREEZER_IS_ACTIVE) { - task_unlock(task); - - kr = vm_map_thaw(task->map); - - task_lock(task); - - if (kr == KERN_SUCCESS) - task->frozen = FALSE; - } else { - task->frozen = FALSE; - kr = KERN_SUCCESS; - } - - task->changing_freeze_state = FALSE; - thread_wakeup(&task->changing_freeze_state); + task->frozen = FALSE; task_unlock(task); - if (COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) { - vm_wake_compactor_swapper(); - } - - return (kr); + return (KERN_SUCCESS); } #endif /* CONFIG_FREEZE */ @@ -2785,7 +3280,6 @@ task_set_info( mem_info->user_memory_address, mem_info->buffer_size); return kr; - break; } #endif @@ -2804,10 +3298,12 @@ task_info( mach_msg_type_number_t *task_info_count) { kern_return_t error = KERN_SUCCESS; + mach_msg_type_number_t original_task_info_count; if (task == TASK_NULL) return (KERN_INVALID_ARGUMENT); + original_task_info_count = *task_info_count; task_lock(task); if ((task != current_task()) && (!task->active)) { @@ -2948,8 +3444,8 @@ task_info( case TASK_THREAD_TIMES_INFO: { - register task_thread_times_info_t times_info; - register thread_t thread; + task_thread_times_info_t times_info; + thread_t thread; if (*task_info_count < TASK_THREAD_TIMES_INFO_COUNT) { error = KERN_INVALID_ARGUMENT; @@ -2982,7 +3478,7 @@ task_info( case TASK_ABSOLUTETIME_INFO: { 
task_absolutetime_info_t info; - register thread_t thread; + thread_t thread; if (*task_info_count < TASK_ABSOLUTETIME_INFO_COUNT) { error = KERN_INVALID_ARGUMENT; @@ -3169,7 +3665,7 @@ task_info( /* OBSOLETE */ case TASK_SCHED_RR_INFO: { - register policy_rr_base_t rr_base; + policy_rr_base_t rr_base; uint32_t quantum_time; uint64_t quantum_ns; @@ -3199,7 +3695,7 @@ task_info( /* OBSOLETE */ case TASK_SCHED_TIMESHARE_INFO: { - register policy_timeshare_base_t ts_base; + policy_timeshare_base_t ts_base; if (*task_info_count < POLICY_TIMESHARE_BASE_COUNT) { error = KERN_INVALID_ARGUMENT; @@ -3221,7 +3717,7 @@ task_info( case TASK_SECURITY_TOKEN: { - register security_token_t *sec_token_p; + security_token_t *sec_token_p; if (*task_info_count < TASK_SECURITY_TOKEN_COUNT) { error = KERN_INVALID_ARGUMENT; @@ -3238,7 +3734,7 @@ task_info( case TASK_AUDIT_TOKEN: { - register audit_token_t *audit_token_p; + audit_token_t *audit_token_p; if (*task_info_count < TASK_AUDIT_TOKEN_COUNT) { error = KERN_INVALID_ARGUMENT; @@ -3259,8 +3755,8 @@ task_info( case TASK_EVENTS_INFO: { - register task_events_info_t events_info; - register thread_t thread; + task_events_info_t events_info; + thread_t thread; if (*task_info_count < TASK_EVENTS_INFO_COUNT) { error = KERN_INVALID_ARGUMENT; @@ -3307,7 +3803,7 @@ task_info( break; } - task_power_info_locked(task, (task_power_info_t)task_info_out, NULL); + task_power_info_locked(task, (task_power_info_t)task_info_out, NULL, NULL); break; } @@ -3318,7 +3814,9 @@ task_info( break; } task_power_info_v2_t tpiv2 = (task_power_info_v2_t) task_info_out; - task_power_info_locked(task, &tpiv2->cpu_energy, &tpiv2->gpu_energy); + + uint64_t *task_energy = NULL; + task_power_info_locked(task, &tpiv2->cpu_energy, &tpiv2->gpu_energy, task_energy); break; } @@ -3417,14 +3915,22 @@ task_info( volatile_virtual_size; } } - vm_map_unlock_read(map); } + *task_info_count = TASK_VM_INFO_REV0_COUNT; - if (*task_info_count >= TASK_VM_INFO_COUNT) { - 
vm_info->phys_footprint = 0; - *task_info_count = TASK_VM_INFO_COUNT; - } else { - *task_info_count = TASK_VM_INFO_REV0_COUNT; + if (original_task_info_count >= TASK_VM_INFO_REV1_COUNT) { + vm_info->phys_footprint = + (mach_vm_size_t) get_task_phys_footprint(task); + *task_info_count = TASK_VM_INFO_REV1_COUNT; + } + if (original_task_info_count >= TASK_VM_INFO_REV2_COUNT) { + vm_info->min_address = map->min_offset; + vm_info->max_address = map->max_offset; + *task_info_count = TASK_VM_INFO_REV2_COUNT; + } + + if (task != kernel_task) { + vm_map_unlock_read(map); } break; @@ -3564,7 +4070,8 @@ void task_power_info_locked( task_t task, task_power_info_t info, - gpu_energy_data_t ginfo) + gpu_energy_data_t ginfo, + uint64_t *task_energy) { thread_t thread; ledger_amount_t tmp; @@ -3582,6 +4089,10 @@ task_power_info_locked( info->total_user = task->total_user_time; info->total_system = task->total_system_time; + if (task_energy) { + *task_energy = task->task_energy; + } + if (ginfo) { ginfo->task_gpu_utilisation = task->task_gpu_ns; } @@ -3599,6 +4110,10 @@ task_power_info_locked( info->task_timer_wakeups_bin_1 += thread->thread_timer_wakeups_bin_1; info->task_timer_wakeups_bin_2 += thread->thread_timer_wakeups_bin_2; + if (task_energy) { + *task_energy += ml_energy_stat(thread); + } + tval = timer_grab(&thread->user_timer); info->total_user += tval; @@ -3647,6 +4162,35 @@ task_gpu_utilisation( return gpu_time; } +/* + * task_energy + * + * Returns the total energy used by the all the threads of the task + * (both dead and alive) + */ +uint64_t +task_energy( + task_t task) +{ + uint64_t energy = 0; + thread_t thread; + + task_lock(task); + energy += task->task_energy; + + queue_iterate(&task->threads, thread, thread_t, task_threads) { + spl_t x; + x = splsched(); + thread_lock(thread); + energy += ml_energy_stat(thread); + thread_unlock(thread); + splx(x); + } + + task_unlock(task); + return energy; +} + kern_return_t task_purgable_info( task_t task, @@ -3670,8 +4214,6 
@@ task_vtimer_set( thread_t thread; spl_t x; - /* assert(task == current_task()); */ /* bogus assert 4803227 4807483 */ - task_lock(task); task->vtimers |= which; @@ -3739,15 +4281,20 @@ __unused uint32_t *microsecs) { thread_t thread = current_thread(); - uint32_t tdelt; - clock_sec_t secs; + uint32_t tdelt = 0; + clock_sec_t secs = 0; uint64_t tsum; assert(task == current_task()); - assert(task->vtimers & which); + spl_t s = splsched(); + thread_lock(thread); - secs = tdelt = 0; + if ((task->vtimers & which) != (uint32_t)which) { + thread_unlock(thread); + splx(s); + return; + } switch (which) { @@ -3781,6 +4328,8 @@ __unused break; } + thread_unlock(thread); + splx(s); } /* @@ -3957,11 +4506,11 @@ task_get_state( return ret; } -#if CONFIG_JETSAM +#if CONFIG_MEMORYSTATUS #define HWM_USERCORE_MINSPACE 250 // free space (in MB) required *after* core file creation void __attribute__((noinline)) -PROC_CROSSED_HIGH_WATERMARK__SEND_EXC_RESOURCE_AND_SUSPEND(int max_footprint_mb) +PROC_CROSSED_HIGH_WATERMARK__SEND_EXC_RESOURCE_AND_SUSPEND(int max_footprint_mb, boolean_t is_fatal) { task_t task = current_task(); int pid = 0; @@ -3982,7 +4531,7 @@ PROC_CROSSED_HIGH_WATERMARK__SEND_EXC_RESOURCE_AND_SUSPEND(int max_footprint_mb) if (task->bsd_info != NULL) procname = proc_name_address(current_task()->bsd_info); #endif - +#if CONFIG_COREDUMP if (hwm_user_cores) { int error; uint64_t starttime, end; @@ -4008,6 +4557,7 @@ PROC_CROSSED_HIGH_WATERMARK__SEND_EXC_RESOURCE_AND_SUSPEND(int max_footprint_mb) printf("coredump of %s[%d] taken in %d secs %d microsecs\n", proc_name_address(current_task()->bsd_info), pid, (int)secs, microsecs); } +#endif /* CONFIG_COREDUMP */ if (disable_exc_resource) { printf("process %s[%d] crossed memory high watermark (%d MB); EXC_RESOURCE " @@ -4031,13 +4581,21 @@ PROC_CROSSED_HIGH_WATERMARK__SEND_EXC_RESOURCE_AND_SUSPEND(int max_footprint_mb) EXC_RESOURCE_ENCODE_FLAVOR(code[0], FLAVOR_HIGH_WATERMARK); EXC_RESOURCE_HWM_ENCODE_LIMIT(code[0], 
max_footprint_mb); - /* - * Use the _internal_ variant so that no user-space - * process can resume our task from under us. - */ - task_suspend_internal(task); - exception_triage(EXC_RESOURCE, code, EXCEPTION_CODE_MAX); - task_resume_internal(task); + /* Do not generate a corpse fork if the violation is a fatal one */ + if (is_fatal || exc_via_corpse_forking == 0) { + /* Do not send a EXC_RESOURCE is corpse_for_fatal_memkill is set */ + if (corpse_for_fatal_memkill == 0) { + /* + * Use the _internal_ variant so that no user-space + * process can resume our task from under us. + */ + task_suspend_internal(task); + exception_triage(EXC_RESOURCE, code, EXCEPTION_CODE_MAX); + task_resume_internal(task); + } + } else { + task_enqueue_exception_with_corpse(task, code, EXCEPTION_CODE_MAX); + } /* * After the EXC_RESOURCE has been handled, we must clear the @@ -4054,8 +4612,9 @@ void task_footprint_exceeded(int warning, __unused const void *param0, __unused const void *param1) { ledger_amount_t max_footprint, max_footprint_mb; - ledger_amount_t footprint_after_purge; task_t task; + boolean_t is_fatal; + boolean_t trigger_exception; if (warning == LEDGER_WARNING_DIPPED_BELOW) { /* @@ -4070,35 +4629,22 @@ task_footprint_exceeded(int warning, __unused const void *param0, __unused const max_footprint_mb = max_footprint >> 20; /* - * Try and purge all "volatile" memory in that task first. + * Capture the trigger exception flag before turning off the exception. */ - (void) task_purge_volatile_memory(task); - /* are we still over the limit ? */ - ledger_get_balance(task->ledger, - task_ledgers.phys_footprint, - &footprint_after_purge); - if ((!warning && - footprint_after_purge <= max_footprint) || - (warning && - footprint_after_purge <= ((max_footprint * - PHYS_FOOTPRINT_WARNING_LEVEL) / 100))) { - /* all better now */ - ledger_reset_callback_state(task->ledger, - task_ledgers.phys_footprint); - return; - } - /* still over the limit after purging... 
*/ + trigger_exception = task->rusage_cpu_flags & TASK_RUSECPU_FLAGS_PHYS_FOOTPRINT_EXCEPTION ? TRUE : FALSE; + + is_fatal = memorystatus_turnoff_exception_and_get_fatalness((warning == LEDGER_WARNING_ROSE_ABOVE) ? TRUE : FALSE, (int)max_footprint_mb); /* * If this an actual violation (not a warning), * generate a non-fatal high watermark EXC_RESOURCE. */ - if ((warning == 0) && (task->rusage_cpu_flags & TASK_RUSECPU_FLAGS_PHYS_FOOTPRINT_EXCEPTION)) { - PROC_CROSSED_HIGH_WATERMARK__SEND_EXC_RESOURCE_AND_SUSPEND((int)max_footprint_mb); + if ((warning == 0) && trigger_exception) { + PROC_CROSSED_HIGH_WATERMARK__SEND_EXC_RESOURCE_AND_SUSPEND((int)max_footprint_mb, is_fatal); } memorystatus_on_ledger_footprint_exceeded((warning == LEDGER_WARNING_ROSE_ABOVE) ? TRUE : FALSE, - (int)max_footprint_mb); + is_fatal); } extern int proc_check_footprint_priv(void); @@ -4167,7 +4713,7 @@ task_set_phys_footprint_limit_internal( */ ledger_set_limit(task->ledger, task_ledgers.phys_footprint, max_task_footprint ? max_task_footprint : LEDGER_LIMIT_INFINITY, - max_task_footprint ? PHYS_FOOTPRINT_WARNING_LEVEL : 0); + max_task_footprint ? 
max_task_footprint_warning_level : 0); return (KERN_SUCCESS); } @@ -4213,7 +4759,7 @@ task_get_phys_footprint_limit( return (KERN_SUCCESS); } -#else /* CONFIG_JETSAM */ +#else /* CONFIG_MEMORYSTATUS */ kern_return_t task_set_phys_footprint_limit( __unused task_t task, @@ -4230,7 +4776,7 @@ task_get_phys_footprint_limit( { return (KERN_FAILURE); } -#endif /* CONFIG_JETSAM */ +#endif /* CONFIG_MEMORYSTATUS */ /* * We need to export some functions to other components that @@ -4245,14 +4791,9 @@ boolean_t is_kerneltask(task_t t) return (FALSE); } -int -check_for_tasksuspend(task_t task) +boolean_t is_corpsetask(task_t t) { - - if (task == TASK_NULL) - return (0); - - return (task->suspend_count > 0); + return (task_is_a_corpse(t)); } #undef current_task @@ -4283,23 +4824,47 @@ int task_pid(task_t task) } -/* - * This routine is called always with task lock held. - * And it returns a thread handle without reference as the caller - * operates on it under the task lock held. +/* + * This routine finds a thread in a task by its unique id + * Returns a referenced thread or THREAD_NULL if the thread was not found + * + * TODO: This is super inefficient - it's an O(threads in task) list walk! 
+ * We should make a tid hash, or transition all tid clients to thread ports + * + * Precondition: No locks held (will take task lock) */ thread_t task_findtid(task_t task, uint64_t tid) { - thread_t thread= THREAD_NULL; + thread_t self = current_thread(); + thread_t found_thread = THREAD_NULL; + thread_t iter_thread = THREAD_NULL; - queue_iterate(&task->threads, thread, thread_t, task_threads) { - if (thread->thread_id == tid) - return(thread); + /* Short-circuit the lookup if we're looking up ourselves */ + if (tid == self->thread_id || tid == TID_NULL) { + assert(self->task == task); + + thread_reference(self); + + return self; } - return(THREAD_NULL); + + task_lock(task); + + queue_iterate(&task->threads, iter_thread, thread_t, task_threads) { + if (iter_thread->thread_id == tid) { + found_thread = iter_thread; + thread_reference(found_thread); + break; + } + } + + task_unlock(task); + + return (found_thread); } + /* * Control the CPU usage monitor for a task. */ @@ -4365,7 +4930,7 @@ task_wakeups_monitor_ctl(task_t task, uint32_t *flags, int32_t *rate_hz) } #endif /* CONFIG_NOMONITORS */ - if (*rate_hz < 0) { + if (*rate_hz <= 0) { task_unlock(task); return KERN_INVALID_ARGUMENT; } @@ -4417,21 +4982,22 @@ task_wakeups_rate_exceeded(int warning, __unused const void *param0, __unused co #endif if (warning == 0) { - THIS_PROCESS_IS_CAUSING_TOO_MANY_WAKEUPS__SENDING_EXC_RESOURCE(); + SENDING_NOTIFICATION__THIS_PROCESS_IS_CAUSING_TOO_MANY_WAKEUPS(); } } void __attribute__((noinline)) -THIS_PROCESS_IS_CAUSING_TOO_MANY_WAKEUPS__SENDING_EXC_RESOURCE(void) +SENDING_NOTIFICATION__THIS_PROCESS_IS_CAUSING_TOO_MANY_WAKEUPS(void) { - task_t task = current_task(); - int pid = 0; - const char *procname = "unknown"; - uint64_t observed_wakeups_rate; - uint64_t permitted_wakeups_rate; - uint64_t observation_interval; - mach_exception_data_type_t code[EXCEPTION_CODE_MAX]; - struct ledger_entry_info lei; + task_t task = current_task(); + int pid = 0; + const char *procname = 
"unknown"; + boolean_t fatal; + kern_return_t kr; +#ifdef EXC_RESOURCE_MONITORS + mach_exception_data_type_t code[EXCEPTION_CODE_MAX]; +#endif /* EXC_RESOURCE_MONITORS */ + struct ledger_entry_info lei; #ifdef MACH_BSD pid = proc_selfpid(); @@ -4444,14 +5010,30 @@ THIS_PROCESS_IS_CAUSING_TOO_MANY_WAKEUPS__SENDING_EXC_RESOURCE(void) /* * Disable the exception notification so we don't overwhelm * the listener with an endless stream of redundant exceptions. + * TODO: detect whether another thread is already reporting the violation. */ uint32_t flags = WAKEMON_DISABLE; task_wakeups_monitor_ctl(task, &flags, NULL); - observed_wakeups_rate = (lei.lei_balance * (int64_t)NSEC_PER_SEC) / lei.lei_last_refill; - permitted_wakeups_rate = lei.lei_limit / task_wakeups_monitor_interval; - observation_interval = lei.lei_refill_period / NSEC_PER_SEC; - + fatal = task->rusage_cpu_flags & TASK_RUSECPU_FLAGS_FATAL_WAKEUPSMON; + trace_resource_violation(RMON_CPUWAKES_VIOLATED, &lei); + printf("process %s[%d] caught waking the CPU %llu times " + "over ~%llu seconds, averaging %llu wakes / second and " + "violating a %slimit of %llu wakes over %llu seconds.\n", + procname, pid, + lei.lei_balance, lei.lei_last_refill / NSEC_PER_SEC, + lei.lei_last_refill == 0 ? 0 : + (NSEC_PER_SEC * lei.lei_balance / lei.lei_last_refill), + fatal ? "FATAL " : "", + lei.lei_limit, lei.lei_refill_period / NSEC_PER_SEC); + + kr = send_resource_violation(send_cpu_wakes_violation, task, &lei, + fatal ? kRNFatalLimitFlag : 0); + if (kr) { + printf("send_resource_violation(CPU wakes, ...): error %#x\n", kr); + } + +#ifdef EXC_RESOURCE_MONITORS if (disable_exc_resource) { printf("process %s[%d] caught causing excessive wakeups. EXC_RESOURCE " "supressed by a boot-arg\n", procname, pid); @@ -4462,56 +5044,184 @@ THIS_PROCESS_IS_CAUSING_TOO_MANY_WAKEUPS__SENDING_EXC_RESOURCE(void) "supressed due to audio playback\n", procname, pid); return; } - printf("process %s[%d] caught causing excessive wakeups. 
Observed wakeups rate " - "(per sec): %lld; Maximum permitted wakeups rate (per sec): %lld; Observation " - "period: %lld seconds; Task lifetime number of wakeups: %lld\n", - procname, pid, observed_wakeups_rate, permitted_wakeups_rate, - observation_interval, lei.lei_credit); + if (lei.lei_last_refill == 0) { + printf("process %s[%d] caught causing excessive wakeups. EXC_RESOURCE " + "supressed due to lei.lei_last_refill = 0 \n", procname, pid); + } code[0] = code[1] = 0; EXC_RESOURCE_ENCODE_TYPE(code[0], RESOURCE_TYPE_WAKEUPS); EXC_RESOURCE_ENCODE_FLAVOR(code[0], FLAVOR_WAKEUPS_MONITOR); - EXC_RESOURCE_CPUMONITOR_ENCODE_WAKEUPS_PERMITTED(code[0], task_wakeups_monitor_rate); - EXC_RESOURCE_CPUMONITOR_ENCODE_OBSERVATION_INTERVAL(code[0], observation_interval); - EXC_RESOURCE_CPUMONITOR_ENCODE_WAKEUPS_OBSERVED(code[1], lei.lei_balance * (int64_t)NSEC_PER_SEC / lei.lei_last_refill); + EXC_RESOURCE_CPUMONITOR_ENCODE_WAKEUPS_PERMITTED(code[0], + NSEC_PER_SEC * lei.lei_limit / lei.lei_refill_period); + EXC_RESOURCE_CPUMONITOR_ENCODE_OBSERVATION_INTERVAL(code[0], + lei.lei_last_refill); + EXC_RESOURCE_CPUMONITOR_ENCODE_WAKEUPS_OBSERVED(code[1], + NSEC_PER_SEC * lei.lei_balance / lei.lei_last_refill); exception_triage(EXC_RESOURCE, code, EXCEPTION_CODE_MAX); +#endif /* EXC_RESOURCE_MONITORS */ - if (task->rusage_cpu_flags & TASK_RUSECPU_FLAGS_FATAL_WAKEUPSMON) { + if (fatal) { task_terminate_internal(task); } } -kern_return_t -task_purge_volatile_memory( - task_t task) +static boolean_t +global_update_logical_writes(int64_t io_delta) { - vm_map_t map; - int num_object_purged; - - if (task == TASK_NULL) - return KERN_INVALID_TASK; + int64_t old_count, new_count; + boolean_t needs_telemetry; + + do { + new_count = old_count = global_logical_writes_count; + new_count += io_delta; + if (new_count >= io_telemetry_limit) { + new_count = 0; + needs_telemetry = TRUE; + } else { + needs_telemetry = FALSE; + } + } while(!OSCompareAndSwap64(old_count, new_count, 
&global_logical_writes_count)); + return needs_telemetry; +} - task_lock(task); +void task_update_logical_writes(task_t task, uint32_t io_size, int flags, void *vp) +{ + int64_t io_delta = 0; + boolean_t needs_telemetry = FALSE; - if (!task->active) { - task_unlock(task); - return KERN_INVALID_TASK; + if ((!task) || (!io_size) || (!vp)) + return; + + KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_DATA_WRITE)) | DBG_FUNC_NONE, + task_pid(task), io_size, flags, (uintptr_t)VM_KERNEL_ADDRPERM(vp), 0); + DTRACE_IO4(logical_writes, struct task *, task, uint32_t, io_size, int, flags, vnode *, vp); + switch(flags) { + case TASK_WRITE_IMMEDIATE: + OSAddAtomic64(io_size, (SInt64 *)&(task->task_immediate_writes)); + ledger_credit(task->ledger, task_ledgers.logical_writes, io_size); + break; + case TASK_WRITE_DEFERRED: + OSAddAtomic64(io_size, (SInt64 *)&(task->task_deferred_writes)); + ledger_credit(task->ledger, task_ledgers.logical_writes, io_size); + break; + case TASK_WRITE_INVALIDATED: + OSAddAtomic64(io_size, (SInt64 *)&(task->task_invalidated_writes)); + ledger_debit(task->ledger, task_ledgers.logical_writes, io_size); + break; + case TASK_WRITE_METADATA: + OSAddAtomic64(io_size, (SInt64 *)&(task->task_metadata_writes)); + ledger_credit(task->ledger, task_ledgers.logical_writes, io_size); + break; } - map = task->map; - if (map == VM_MAP_NULL) { - task_unlock(task); - return KERN_INVALID_TASK; + + io_delta = (flags == TASK_WRITE_INVALIDATED) ? ((int64_t)io_size * -1ll) : ((int64_t)io_size); + if (io_telemetry_limit != 0) { + /* If io_telemetry_limit is 0, disable global updates and I/O telemetry */ + needs_telemetry = global_update_logical_writes(io_delta); + if (needs_telemetry) { + act_set_io_telemetry_ast(current_thread()); + } } - vm_map_reference(task->map); +} - task_unlock(task); +/* + * Control the I/O monitor for a task. 
+ */ +kern_return_t +task_io_monitor_ctl(task_t task, uint32_t *flags) +{ + ledger_t ledger = task->ledger; - num_object_purged = vm_map_purge(map); - vm_map_deallocate(map); + task_lock(task); + if (*flags & IOMON_ENABLE) { + /* Configure the physical I/O ledger */ + ledger_set_limit(ledger, task_ledgers.physical_writes, (task_iomon_limit_mb * 1024 * 1024), 0); + ledger_set_period(ledger, task_ledgers.physical_writes, (task_iomon_interval_secs * NSEC_PER_SEC)); + + /* Configure the logical I/O ledger */ + ledger_set_limit(ledger, task_ledgers.logical_writes, (task_iomon_limit_mb * 1024 * 1024), 0); + ledger_set_period(ledger, task_ledgers.logical_writes, (task_iomon_interval_secs * NSEC_PER_SEC)); + + } else if (*flags & IOMON_DISABLE) { + /* + * Caller wishes to disable I/O monitor on the task. + */ + ledger_disable_refill(ledger, task_ledgers.physical_writes); + ledger_disable_callback(ledger, task_ledgers.physical_writes); + ledger_disable_refill(ledger, task_ledgers.logical_writes); + ledger_disable_callback(ledger, task_ledgers.logical_writes); + } + task_unlock(task); return KERN_SUCCESS; } +void +task_io_rate_exceeded(int warning, const void *param0, __unused const void *param1) +{ + if (warning == 0) { + SENDING_NOTIFICATION__THIS_PROCESS_IS_CAUSING_TOO_MUCH_IO((int)param0); + } +} + +void __attribute__((noinline)) SENDING_NOTIFICATION__THIS_PROCESS_IS_CAUSING_TOO_MUCH_IO(int flavor) +{ + int pid = 0; + task_t task = current_task(); +#ifdef EXC_RESOURCE_MONITORS + mach_exception_data_type_t code[EXCEPTION_CODE_MAX]; +#endif /* EXC_RESOURCE_MONITORS */ + struct ledger_entry_info lei; + kern_return_t kr; + +#ifdef MACH_BSD + pid = proc_selfpid(); +#endif + /* + * Get the ledger entry info. We need to do this before disabling the exception + * to get correct values for all fields. 
+ */ + switch(flavor) { + case FLAVOR_IO_PHYSICAL_WRITES: + ledger_get_entry_info(task->ledger, task_ledgers.physical_writes, &lei); + break; + case FLAVOR_IO_LOGICAL_WRITES: + ledger_get_entry_info(task->ledger, task_ledgers.logical_writes, &lei); + break; + } + + + /* + * Disable the exception notification so we don't overwhelm + * the listener with an endless stream of redundant exceptions. + * TODO: detect whether another thread is already reporting the violation. + */ + uint32_t flags = IOMON_DISABLE; + task_io_monitor_ctl(task, &flags); + + if (flavor == FLAVOR_IO_LOGICAL_WRITES) { + trace_resource_violation(RMON_LOGWRITES_VIOLATED, &lei); + } + printf("process [%d] caught causing excessive I/O (flavor: %d). Task I/O: %lld MB. [Limit : %lld MB per %lld secs]\n", + pid, flavor, (lei.lei_balance / (1024 * 1024)), (lei.lei_limit / (1024 * 1024)), (lei.lei_refill_period / NSEC_PER_SEC)); + + kr = send_resource_violation(send_disk_writes_violation, task, &lei, kRNFlagsNone); + if (kr) { + printf("send_resource_violation(disk_writes, ...): error %#x\n", kr); + } + +#ifdef EXC_RESOURCE_MONITORS + code[0] = code[1] = 0; + EXC_RESOURCE_ENCODE_TYPE(code[0], RESOURCE_TYPE_IO); + EXC_RESOURCE_ENCODE_FLAVOR(code[0], flavor); + EXC_RESOURCE_IO_ENCODE_INTERVAL(code[0], (lei.lei_refill_period / NSEC_PER_SEC)); + EXC_RESOURCE_IO_ENCODE_LIMIT(code[0], (lei.lei_limit / (1024 * 1024))); + EXC_RESOURCE_IO_ENCODE_OBSERVED(code[1], (lei.lei_balance / (1024 * 1024))); + exception_triage(EXC_RESOURCE, code, EXCEPTION_CODE_MAX); +#endif /* EXC_RESOURCE_MONITORS */ +} + /* Placeholders for the task set/get voucher interfaces */ kern_return_t task_get_mach_voucher( @@ -4569,28 +5279,215 @@ boolean_t task_is_gpu_denied(task_t task) return (task->t_flags & TF_GPU_DENIED) ? 
TRUE : FALSE; } -void task_update_logical_writes(task_t task, uint32_t io_size, int flags) + +uint64_t get_task_memory_region_count(task_t task) { - KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_DATA_WRITE)) | DBG_FUNC_NONE, task_pid(task), io_size, flags, 0, 0); - switch(flags) { - case TASK_WRITE_IMMEDIATE: - OSAddAtomic64(io_size, (SInt64 *)&(task->task_immediate_writes)); - break; - case TASK_WRITE_DEFERRED: - OSAddAtomic64(io_size, (SInt64 *)&(task->task_deferred_writes)); - break; - case TASK_WRITE_INVALIDATED: - OSAddAtomic64(io_size, (SInt64 *)&(task->task_invalidated_writes)); - break; - case TASK_WRITE_METADATA: - OSAddAtomic64(io_size, (SInt64 *)&(task->task_metadata_writes)); - break; + vm_map_t map; + map = (task == kernel_task) ? kernel_map: task->map; + return((uint64_t)get_map_nentries(map)); +} + +static void +kdebug_trace_dyld_internal(uint32_t base_code, + struct dyld_kernel_image_info *info) +{ + static_assert(sizeof(info->uuid) >= 16); + +#if defined(__LP64__) + uint64_t *uuid = (uint64_t *)&(info->uuid); + + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + KDBG_EVENTID(DBG_DYLD, DBG_DYLD_UUID, base_code), uuid[0], + uuid[1], info->load_addr, + (uint64_t)info->fsid.val[0] | ((uint64_t)info->fsid.val[1] << 32), + 0); + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + KDBG_EVENTID(DBG_DYLD, DBG_DYLD_UUID, base_code + 1), + (uint64_t)info->fsobjid.fid_objno | + ((uint64_t)info->fsobjid.fid_generation << 32), + 0, 0, 0, 0); +#else /* defined(__LP64__) */ + uint32_t *uuid = (uint32_t *)&(info->uuid); + + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + KDBG_EVENTID(DBG_DYLD, DBG_DYLD_UUID, base_code + 2), uuid[0], + uuid[1], uuid[2], uuid[3], 0); + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + KDBG_EVENTID(DBG_DYLD, DBG_DYLD_UUID, base_code + 3), + (uint32_t)info->load_addr, info->fsid.val[0], info->fsid.val[1], + info->fsobjid.fid_objno, 0); + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + KDBG_EVENTID(DBG_DYLD, DBG_DYLD_UUID, base_code + 4), + 
info->fsobjid.fid_generation, 0, 0, 0, 0); +#endif /* !defined(__LP64__) */ +} + +static kern_return_t +kdebug_trace_dyld(task_t task, uint32_t base_code, + vm_map_copy_t infos_copy, mach_msg_type_number_t infos_len) +{ + kern_return_t kr; + dyld_kernel_image_info_array_t infos; + vm_map_offset_t map_data; + vm_offset_t data; + + assert(infos_copy != NULL); + + if (task == NULL || task != current_task()) { + return KERN_INVALID_TASK; } - return; + + kr = vm_map_copyout(ipc_kernel_map, &map_data, (vm_map_copy_t)infos_copy); + if (kr != KERN_SUCCESS) { + return kr; + } + + infos = CAST_DOWN(dyld_kernel_image_info_array_t, map_data); + + for (mach_msg_type_number_t i = 0; i < infos_len; i++) { + kdebug_trace_dyld_internal(base_code, &(infos[i])); + } + + data = CAST_DOWN(vm_offset_t, map_data); + mach_vm_deallocate(ipc_kernel_map, data, infos_len * sizeof(infos[0])); + return KERN_SUCCESS; +} + +kern_return_t +task_register_dyld_image_infos(task_t task, + dyld_kernel_image_info_array_t infos_copy, + mach_msg_type_number_t infos_len) +{ + return kdebug_trace_dyld(task, DBG_DYLD_UUID_MAP_A, + (vm_map_copy_t)infos_copy, infos_len); +} + +kern_return_t +task_unregister_dyld_image_infos(task_t task, + dyld_kernel_image_info_array_t infos_copy, + mach_msg_type_number_t infos_len) +{ + return kdebug_trace_dyld(task, DBG_DYLD_UUID_UNMAP_A, + (vm_map_copy_t)infos_copy, infos_len); +} + +kern_return_t +task_get_dyld_image_infos(__unused task_t task, + __unused dyld_kernel_image_info_array_t * dyld_images, + __unused mach_msg_type_number_t * dyld_imagesCnt) +{ + return KERN_NOT_SUPPORTED; +} + +kern_return_t +task_register_dyld_shared_cache_image_info(task_t task, + dyld_kernel_image_info_t cache_img, + __unused boolean_t no_cache, + __unused boolean_t private_cache) +{ + if (task == NULL || task != current_task()) { + return KERN_INVALID_TASK; + } + + kdebug_trace_dyld_internal(DBG_DYLD_UUID_SHARED_CACHE_A, &cache_img); + return KERN_SUCCESS; +} + +kern_return_t 
+task_register_dyld_set_dyld_state(__unused task_t task, + __unused uint8_t dyld_state) +{ + return KERN_NOT_SUPPORTED; +} + +kern_return_t +task_register_dyld_get_process_state(__unused task_t task, + __unused dyld_kernel_process_info_t * dyld_process_state) +{ + return KERN_NOT_SUPPORTED; +} + +#if CONFIG_SECLUDED_MEMORY +int num_tasks_can_use_secluded_mem = 0; + +void +task_set_can_use_secluded_mem( + task_t task, + boolean_t can_use_secluded_mem) +{ + if (!task->task_could_use_secluded_mem) { + return; + } + task_lock(task); + task_set_can_use_secluded_mem_locked(task, can_use_secluded_mem); + task_unlock(task); +} + +void +task_set_can_use_secluded_mem_locked( + task_t task, + boolean_t can_use_secluded_mem) +{ + assert(task->task_could_use_secluded_mem); + if (can_use_secluded_mem && + secluded_for_apps && /* global boot-arg */ + !task->task_can_use_secluded_mem) { + assert(num_tasks_can_use_secluded_mem >= 0); + OSAddAtomic(+1, + (volatile SInt32 *)&num_tasks_can_use_secluded_mem); + task->task_can_use_secluded_mem = TRUE; + } else if (!can_use_secluded_mem && + task->task_can_use_secluded_mem) { + assert(num_tasks_can_use_secluded_mem > 0); + OSAddAtomic(-1, + (volatile SInt32 *)&num_tasks_can_use_secluded_mem); + task->task_can_use_secluded_mem = FALSE; + } +} + +void +task_set_could_use_secluded_mem( + task_t task, + boolean_t could_use_secluded_mem) +{ + task->task_could_use_secluded_mem = could_use_secluded_mem; +} + +void +task_set_could_also_use_secluded_mem( + task_t task, + boolean_t could_also_use_secluded_mem) +{ + task->task_could_also_use_secluded_mem = could_also_use_secluded_mem; +} + +boolean_t +task_can_use_secluded_mem( + task_t task) +{ + if (task->task_can_use_secluded_mem) { + assert(task->task_could_use_secluded_mem); + assert(num_tasks_can_use_secluded_mem > 0); + return TRUE; + } + if (task->task_could_also_use_secluded_mem && + num_tasks_can_use_secluded_mem > 0) { + assert(num_tasks_can_use_secluded_mem > 0); + return TRUE; + } + 
return FALSE; +} + +boolean_t +task_could_use_secluded_mem( + task_t task) +{ + return task->task_could_use_secluded_mem; } +#endif /* CONFIG_SECLUDED_MEMORY */ queue_head_t * task_io_user_clients(task_t task) { - return (&task->io_user_clients); + return (&task->io_user_clients); } diff --git a/osfmk/kern/task.h b/osfmk/kern/task.h index 63cd3d3d0..3449f26c7 100644 --- a/osfmk/kern/task.h +++ b/osfmk/kern/task.h @@ -95,6 +95,12 @@ #include #include +#ifdef XNU_KERNEL_PRIVATE +#include +#include +#include +#endif /* XNU_KERNEL_PRIVATE */ + #ifdef MACH_KERNEL_PRIVATE #include @@ -113,40 +119,6 @@ #include #include #include -#endif /* MACH_KERNEL_PRIVATE */ - -#ifdef XNU_KERNEL_PRIVATE - -#include -#include -#include - -/* defns for task->rsu_controldata */ -#define TASK_POLICY_CPU_RESOURCE_USAGE 0 -#define TASK_POLICY_WIREDMEM_RESOURCE_USAGE 1 -#define TASK_POLICY_VIRTUALMEM_RESOURCE_USAGE 2 -#define TASK_POLICY_DISK_RESOURCE_USAGE 3 -#define TASK_POLICY_NETWORK_RESOURCE_USAGE 4 -#define TASK_POLICY_POWER_RESOURCE_USAGE 5 - -#define TASK_POLICY_RESOURCE_USAGE_COUNT 6 - -#define TASK_POLICY_CPUMON_DISABLE 0xFF -#define TASK_POLICY_CPUMON_DEFAULTS 0xFE - -/* Resource usage/low resource attributes */ -#define TASK_POLICY_RESOURCE_ATTRIBUTE_NONE 0x00 -#define TASK_POLICY_RESOURCE_ATTRIBUTE_THROTTLE 0x01 -#define TASK_POLICY_RESOURCE_ATTRIBUTE_SUSPEND 0x02 -#define TASK_POLICY_RESOURCE_ATTRIBUTE_TERMINATE 0x03 -#define TASK_POLICY_RESOURCE_ATTRIBUTE_NOTIFY_KQ 0x04 -#define TASK_POLICY_RESOURCE_ATTRIBUTE_NOTIFY_EXC 0x05 -#define TASK_POLICY_RESOURCE_ATTRIBUTE_DEFAULT TASK_POLICY_RESOURCE_ATTRIBUTE_NONE - -#endif /* XNU_KERNEL_PRIVATE */ - -#ifdef MACH_KERNEL_PRIVATE - #include #include @@ -256,14 +228,18 @@ struct task { uint32_t p_switch; /* total processor switches */ uint32_t ps_switch; /* total pset switches */ - zinfo_usage_t tkm_zinfo; /* per-task, per-zone usage statistics */ - #ifdef MACH_BSD void *bsd_info; #endif kcdata_descriptor_t corpse_info; + void * 
corpse_info_kernel; + queue_chain_t corpse_tasks; +#ifdef CONFIG_MACF + struct label * crash_label; +#endif struct vm_shared_region *shared_region; volatile uint32_t t_flags; /* general-purpose task flags protected by task_lock (TL) */ +#define TF_NONE 0 #define TF_64B_ADDR 0x00000001 /* task has 64-bit addressing */ #define TF_64B_DATA 0x00000002 /* task has 64-bit data registers */ #define TF_CPUMON_WARNING 0x00000004 /* task has at least one thread in CPU usage warning zone */ @@ -272,6 +248,7 @@ struct task { #define TF_GPU_DENIED 0x00000010 /* task is not allowed to access the GPU */ #define TF_CORPSE 0x00000020 /* task is a corpse */ #define TF_PENDING_CORPSE 0x00000040 /* task corpse has not been reported yet */ +#define TF_CORPSE_FORK 0x00000080 /* task is a forked corpse */ #define task_has_64BitAddr(task) \ (((task)->t_flags & TF_64B_ADDR) != 0) @@ -297,6 +274,9 @@ struct task { #define task_clear_corpse_pending_report(task) \ ((task)->t_flags &= ~TF_PENDING_CORPSE) +#define task_is_a_corpse_fork(task) \ + (((task)->t_flags & TF_CORPSE_FORK) != 0) + mach_vm_address_t all_image_info_addr; /* dyld __all_image_info */ mach_vm_size_t all_image_info_size; /* section location and size */ @@ -341,7 +321,6 @@ struct task { struct task_requested_policy requested_policy; struct task_effective_policy effective_policy; - struct task_pended_policy pended_policy; /* * Can be merged with imp_donor bits, once the IMPORTANCE_INHERITANCE macro goes away. 
@@ -368,6 +347,7 @@ struct task { uint32_t task_timer_wakeups_bin_1; uint32_t task_timer_wakeups_bin_2; uint64_t task_gpu_ns; + uint64_t task_energy; /* # of purgeable volatile VM objects owned by this task: */ int task_volatile_objects; @@ -387,11 +367,23 @@ struct task { queue_chain_t task_coalition[COALITION_NUM_TYPES]; uint64_t dispatchqueue_offset; +#if DEVELOPMENT || DEBUG + boolean_t task_unnested; + int task_disconnected_count; +#endif + #if HYPERVISOR void *hv_task_target; /* hypervisor virtual machine object associated with this task */ #endif /* HYPERVISOR */ +#if CONFIG_SECLUDED_MEMORY + boolean_t task_can_use_secluded_mem; + boolean_t task_could_use_secluded_mem; + boolean_t task_could_also_use_secluded_mem; +#endif /* CONFIG_SECLUDED_MEMORY */ + queue_head_t io_user_clients; + uint32_t exec_token; }; #define task_lock(task) lck_mtx_lock(&(task)->lock) @@ -441,13 +433,6 @@ extern void init_task_ledgers(void); extern lck_attr_t task_lck_attr; extern lck_grp_t task_lck_grp; -#define QOS_OVERRIDE_MODE_OVERHANG_PEAK 0 -#define QOS_OVERRIDE_MODE_IGNORE_OVERRIDE 1 -#define QOS_OVERRIDE_MODE_FINE_GRAINED_OVERRIDE 2 -#define QOS_OVERRIDE_MODE_FINE_GRAINED_OVERRIDE_BUT_IGNORE_DISPATCH 3 -#define QOS_OVERRIDE_MODE_FINE_GRAINED_OVERRIDE_BUT_SINGLE_MUTEX_OVERRIDE 4 - -extern uint32_t qos_override_mode; #else /* MACH_KERNEL_PRIVATE */ @@ -457,6 +442,8 @@ extern task_t current_task(void); extern void task_reference(task_t task); +#define TF_NONE 0 + __END_DECLS #endif /* MACH_KERNEL_PRIVATE */ @@ -497,6 +484,12 @@ extern kern_return_t task_send_trace_memory( uint32_t pid, uint64_t uniqueid); +#if DEVELOPMENT || DEBUG + +extern kern_return_t task_disconnect_page_mappings( + task_t task); +#endif + extern void tasks_system_suspend(boolean_t suspend); #if CONFIG_FREEZE @@ -534,20 +527,22 @@ extern kern_return_t task_create_internal( coalition_t *parent_coalitions, boolean_t inherit_memory, boolean_t is_64bit, + uint32_t flags, task_t *child_task); /* OUT */ -extern 
kern_return_t task_importance( - task_t task, - integer_t importance); extern void task_power_info_locked( task_t task, task_power_info_t info, - gpu_energy_data_t gpu_energy); + gpu_energy_data_t gpu_energy, + uint64_t *task_power); extern uint64_t task_gpu_utilisation( task_t task); +extern uint64_t task_energy( + task_t task); + extern void task_vtimer_set( task_t task, integer_t which); @@ -582,7 +577,9 @@ extern int get_task_numacts( task_t task); extern int get_task_numactivethreads(task_t task); -extern kern_return_t task_collect_crash_info(task_t task); +extern kern_return_t task_collect_crash_info(task_t task, struct proc *p, int is_corpse_fork); +void task_port_notify(mach_msg_header_t *msg); +void task_wait_till_threads_terminate_locked(task_t task); /* JMM - should just be temporary (implementation in bsd_kern still) */ extern void set_bsdtask_info(task_t,void *); @@ -594,15 +591,29 @@ extern uint64_t get_task_compressed(task_t); extern uint64_t get_task_resident_max(task_t); extern uint64_t get_task_phys_footprint(task_t); extern uint64_t get_task_phys_footprint_max(task_t); +extern uint64_t get_task_phys_footprint_limit(task_t); extern uint64_t get_task_purgeable_size(task_t); extern uint64_t get_task_cpu_time(task_t); extern uint64_t get_task_dispatchqueue_offset(task_t); +extern uint64_t get_task_dispatchqueue_serialno_offset(task_t); +extern uint64_t get_task_uniqueid(task_t); + +extern uint64_t get_task_internal(task_t); +extern uint64_t get_task_internal_compressed(task_t); +extern uint64_t get_task_purgeable_nonvolatile(task_t); +extern uint64_t get_task_purgeable_nonvolatile_compressed(task_t); +extern uint64_t get_task_iokit_mapped(task_t); +extern uint64_t get_task_alternate_accounting(task_t); +extern uint64_t get_task_alternate_accounting_compressed(task_t); +extern uint64_t get_task_memory_region_count(task_t); +extern uint64_t get_task_page_table(task_t); extern kern_return_t task_convert_phys_footprint_limit(int, int *); extern 
kern_return_t task_set_phys_footprint_limit_internal(task_t, int, int *, boolean_t); extern kern_return_t task_get_phys_footprint_limit(task_t task, int *limit_mb); extern boolean_t is_kerneltask(task_t task); +extern boolean_t is_corpsetask(task_t task); extern kern_return_t check_actforsig(task_t task, thread_t thread, int setast); @@ -630,6 +641,7 @@ struct _task_ledger_indices { int iokit_mapped; int alternate_accounting; int alternate_accounting_compressed; + int page_table; int phys_footprint; int internal_compressed; int purgeable_volatile; @@ -645,229 +657,29 @@ struct _task_ledger_indices { int cpu_time_billed_to_me; int cpu_time_billed_to_others; #endif + int physical_writes; + int logical_writes; }; extern struct _task_ledger_indices task_ledgers; -/* Begin task_policy */ - -/* value */ -#define TASK_POLICY_DISABLE 0x0 -#define TASK_POLICY_ENABLE 0x1 - -/* category */ -#define TASK_POLICY_INTERNAL 0x0 -#define TASK_POLICY_EXTERNAL 0x1 -#define TASK_POLICY_ATTRIBUTE 0x2 - -/* for tracing */ -#define TASK_POLICY_TASK 0x4 -#define TASK_POLICY_THREAD 0x8 - -/* flavors (also DBG_IMPORTANCE subclasses 0x20 - 0x3F) */ - -/* internal or external, thread or task */ -#define TASK_POLICY_DARWIN_BG 0x21 -#define TASK_POLICY_IOPOL 0x22 -#define TASK_POLICY_IO 0x23 -#define TASK_POLICY_PASSIVE_IO 0x24 - -/* internal, task only */ -#define TASK_POLICY_DARWIN_BG_IOPOL 0x27 - -/* task-only attributes */ -#define TASK_POLICY_TAL 0x28 -#define TASK_POLICY_BOOST 0x29 -#define TASK_POLICY_ROLE 0x2A -#define TASK_POLICY_SUPPRESSED_CPU 0x2B -#define TASK_POLICY_TERMINATED 0x2C -#define TASK_POLICY_NEW_SOCKETS_BG 0x2D -#define TASK_POLICY_LOWPRI_CPU 0x2E -#define TASK_POLICY_LATENCY_QOS 0x2F -#define TASK_POLICY_THROUGH_QOS 0x30 -#define TASK_POLICY_WATCHERS_BG 0x31 - -#define TASK_POLICY_SFI_MANAGED 0x34 -#define TASK_POLICY_ALL_SOCKETS_BG 0x37 - -#define TASK_POLICY_BASE_LATENCY_AND_THROUGHPUT_QOS 0x39 /* latency as value1, throughput as value2 */ -#define 
TASK_POLICY_OVERRIDE_LATENCY_AND_THROUGHPUT_QOS 0x3A /* latency as value1, throughput as value2 */ - -/* thread-only attributes */ -#define TASK_POLICY_PIDBIND_BG 0x32 -#define TASK_POLICY_WORKQ_BG 0x33 -#define TASK_POLICY_QOS 0x35 -#define TASK_POLICY_QOS_OVERRIDE 0x36 -#define TASK_POLICY_QOS_AND_RELPRIO 0x38 /* QoS as value1, relative priority as value2 */ - -#define TASK_POLICY_MAX 0x3F - -/* The main entrance to task policy is this function */ -extern void proc_set_task_policy(task_t task, thread_t thread, int category, int flavor, int value); -extern int proc_get_task_policy(task_t task, thread_t thread, int category, int flavor); - -/* For attributes that have two scalars as input/output */ -extern void proc_set_task_policy2(task_t task, thread_t thread, int category, int flavor, int value1, int value2); -extern void proc_get_task_policy2(task_t task, thread_t thread, int category, int flavor, int *value1, int *value2); - -/* For use by kernel threads and others who don't hold a reference on the target thread */ -extern void proc_set_task_policy_thread(task_t task, uint64_t tid, int category, int flavor, int value); - -extern void proc_set_task_spawnpolicy(task_t task, int apptype, int qos_clamp, int role, - ipc_port_t * portwatch_ports, int portwatch_count); - -extern void task_set_main_thread_qos(task_t task, thread_t main_thread); - -extern int proc_darwin_role_to_task_role(int darwin_role, int* task_role); -extern int proc_task_role_to_darwin_role(int task_role); - - -/* IO Throttle tiers */ -#define THROTTLE_LEVEL_NONE -1 -#define THROTTLE_LEVEL_TIER0 0 /* IOPOL_NORMAL, IOPOL_DEFAULT, IOPOL_PASSIVE */ - -#define THROTTLE_LEVEL_THROTTLED 1 -#define THROTTLE_LEVEL_TIER1 1 /* IOPOL_STANDARD */ -#define THROTTLE_LEVEL_TIER2 2 /* IOPOL_UTILITY */ -#define THROTTLE_LEVEL_TIER3 3 /* IOPOL_THROTTLE */ - -#define THROTTLE_LEVEL_START 0 -#define THROTTLE_LEVEL_END 3 - -#define THROTTLE_LEVEL_COMPRESSOR_TIER0 THROTTLE_LEVEL_TIER0 -#define 
THROTTLE_LEVEL_COMPRESSOR_TIER1 THROTTLE_LEVEL_TIER1 -#define THROTTLE_LEVEL_COMPRESSOR_TIER2 THROTTLE_LEVEL_TIER2 - -#define THROTTLE_LEVEL_PAGEOUT_THROTTLED THROTTLE_LEVEL_TIER2 -#define THROTTLE_LEVEL_PAGEOUT_UNTHROTTLED THROTTLE_LEVEL_TIER1 - -#if CONFIG_IOSCHED -#define IOSCHED_METADATA_TIER THROTTLE_LEVEL_TIER1 -#endif /* CONFIG_IOSCHED */ - -extern int proc_apply_workq_bgthreadpolicy(thread_t thread); -extern int proc_restore_workq_bgthreadpolicy(thread_t thread); - -extern int proc_get_darwinbgstate(task_t task, uint32_t *flagsp); -extern boolean_t proc_task_is_tal(task_t task); -extern int task_get_apptype(task_t); -extern integer_t task_grab_latency_qos(task_t task); -extern void task_policy_create(task_t task, int parent_boosted); -extern void thread_policy_create(thread_t thread); - -/* - * for IPC importance hooks into task policy - */ -typedef struct task_pend_token { - uint32_t tpt_update_sockets :1, - tpt_update_timers :1, - tpt_update_watchers :1, - tpt_update_live_donor :1, - tpt_update_coal_sfi :1; -} *task_pend_token_t; - -extern void task_policy_update_complete_unlocked(task_t task, thread_t thread, task_pend_token_t pend_token); -extern void task_update_boost_locked(task_t task, boolean_t boost_active, task_pend_token_t pend_token); -extern void task_set_boost_locked(task_t task, boolean_t boost_active); - -/* - * Get effective policy - * Only for use by relevant subsystem, should never be passed into a setter! 
- */ - -extern int proc_get_effective_task_policy(task_t task, int flavor); -extern int proc_get_effective_thread_policy(thread_t thread, int flavor); - -/* temporary compatibility */ -int proc_setthread_saved_importance(thread_t thread, int importance); - -int proc_get_task_ruse_cpu(task_t task, uint32_t *policyp, uint8_t *percentagep, uint64_t *intervalp, uint64_t *deadlinep); -int proc_set_task_ruse_cpu(task_t task, uint32_t policy, uint8_t percentage, uint64_t interval, uint64_t deadline, int cpumon_entitled); -int proc_clear_task_ruse_cpu(task_t task, int cpumon_entitled); -thread_t task_findtid(task_t, uint64_t); -void set_thread_iotier_override(thread_t, int policy); - -boolean_t proc_thread_qos_add_override(task_t task, thread_t thread, uint64_t tid, int override_qos, boolean_t first_override_for_resource, user_addr_t resource, int resource_type); -boolean_t proc_thread_qos_remove_override(task_t task, thread_t thread, uint64_t tid, user_addr_t resource, int resource_type); -boolean_t proc_thread_qos_reset_override(task_t task, thread_t thread, uint64_t tid, user_addr_t resource, int resource_type); -void proc_thread_qos_deallocate(thread_t thread); - -#define TASK_RUSECPU_FLAGS_PROC_LIMIT 0x01 -#define TASK_RUSECPU_FLAGS_PERTHR_LIMIT 0x02 -#define TASK_RUSECPU_FLAGS_DEADLINE 0x04 -#define TASK_RUSECPU_FLAGS_FATAL_CPUMON 0x08 /* CPU usage monitor violations are fatal */ -#define TASK_RUSECPU_FLAGS_FATAL_WAKEUPSMON 0x10 /* wakeups monitor violations are fatal */ -#define TASK_RUSECPU_FLAGS_PHYS_FOOTPRINT_EXCEPTION 0x20 /* exceeding physical footprint generates EXC_RESOURCE */ - -/* BSD call back functions */ -extern int proc_apply_resource_actions(void * p, int type, int action); -extern int proc_restore_resource_actions(void * p, int type, int action); -extern int task_restore_resource_actions(task_t task, int type); - -extern int task_clear_cpuusage(task_t task, int cpumon_entitled); +/* requires task to be unlocked, returns a referenced thread */ 
+thread_t task_findtid(task_t task, uint64_t tid); extern kern_return_t task_wakeups_monitor_ctl(task_t task, uint32_t *rate_hz, int32_t *flags); extern kern_return_t task_cpu_usage_monitor_ctl(task_t task, uint32_t *flags); +extern kern_return_t task_io_monitor_ctl(task_t task, uint32_t *flags); - -extern void task_importance_mark_donor(task_t task, boolean_t donating); -extern void task_importance_mark_live_donor(task_t task, boolean_t donating); -extern void task_importance_mark_receiver(task_t task, boolean_t receiving); -extern void task_importance_mark_denap_receiver(task_t task, boolean_t denap); -extern void task_importance_reset(task_t task); extern void task_atm_reset(task_t task); extern void task_bank_reset(task_t task); extern void task_bank_init(task_t task); -#if IMPORTANCE_INHERITANCE - -extern boolean_t task_is_importance_donor(task_t task); -extern boolean_t task_is_marked_importance_donor(task_t task); -extern boolean_t task_is_marked_live_importance_donor(task_t task); - -extern boolean_t task_is_importance_receiver(task_t task); -extern boolean_t task_is_marked_importance_receiver(task_t task); - -extern boolean_t task_is_importance_denap_receiver(task_t task); -extern boolean_t task_is_marked_importance_denap_receiver(task_t task); - -extern boolean_t task_is_importance_receiver_type(task_t task); - -extern int task_importance_hold_watchport_assertion(task_t target_task, uint32_t count); -extern int task_importance_hold_internal_assertion(task_t target_task, uint32_t count); -extern int task_importance_drop_internal_assertion(task_t target_task, uint32_t count); - -extern int task_importance_hold_file_lock_assertion(task_t target_task, uint32_t count); -extern int task_importance_drop_file_lock_assertion(task_t target_task, uint32_t count); - -extern int task_importance_hold_legacy_external_assertion(task_t target_task, uint32_t count); -extern int task_importance_drop_legacy_external_assertion(task_t target_task, uint32_t count); - -#endif /* 
IMPORTANCE_INHERITANCE */ - -extern int task_low_mem_privileged_listener(task_t task, boolean_t new_value, boolean_t *old_value); -extern boolean_t task_has_been_notified(task_t task, int pressurelevel); -extern boolean_t task_used_for_purging(task_t task, int pressurelevel); -extern void task_mark_has_been_notified(task_t task, int pressurelevel); -extern void task_mark_used_for_purging(task_t task, int pressurelevel); -extern void task_clear_has_been_notified(task_t task, int pressurelevel); -extern void task_clear_used_for_purging(task_t task); -extern int task_importance_estimate(task_t task); - extern int task_pid(task_t task); - +extern boolean_t task_has_assertions(task_t task); /* End task_policy */ -extern kern_return_t task_purge_volatile_memory(task_t task); - extern void task_set_gpu_denied(task_t task, boolean_t denied); extern boolean_t task_is_gpu_denied(task_t task); -#define TASK_WRITE_IMMEDIATE 0x1 -#define TASK_WRITE_DEFERRED 0x2 -#define TASK_WRITE_INVALIDATED 0x4 -#define TASK_WRITE_METADATA 0x8 -extern void task_update_logical_writes(task_t task, uint32_t io_size, int flags); - extern queue_head_t * task_io_user_clients(task_t task); #endif /* XNU_KERNEL_PRIVATE */ @@ -893,6 +705,31 @@ extern task_suspension_token_t convert_port_to_task_suspension_token(ipc_port_t extern boolean_t task_suspension_notify(mach_msg_header_t *); +#define TASK_WRITE_IMMEDIATE 0x1 +#define TASK_WRITE_DEFERRED 0x2 +#define TASK_WRITE_INVALIDATED 0x4 +#define TASK_WRITE_METADATA 0x8 +extern void task_update_logical_writes(task_t task, uint32_t io_size, int flags, void *vp); + +#if CONFIG_SECLUDED_MEMORY +extern void task_set_can_use_secluded_mem( + task_t task, + boolean_t can_use_secluded_mem); +extern void task_set_could_use_secluded_mem( + task_t task, + boolean_t could_use_secluded_mem); +extern void task_set_could_also_use_secluded_mem( + task_t task, + boolean_t could_also_use_secluded_mem); +extern boolean_t task_can_use_secluded_mem(task_t task); +extern 
boolean_t task_could_use_secluded_mem(task_t task); +#endif /* CONFIG_SECLUDED_MEMORY */ + +#if CONFIG_MACF +extern struct label *get_task_crash_label(task_t task); +extern void set_task_crash_label(task_t task, struct label *label); +#endif /* CONFIG_MACF */ + #endif /* KERNEL_PRIVATE */ extern task_t kernel_task; diff --git a/osfmk/kern/task_policy.c b/osfmk/kern/task_policy.c index 101197a51..6d9f28919 100644 --- a/osfmk/kern/task_policy.c +++ b/osfmk/kern/task_policy.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,13 +22,19 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. 
- * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ +#include +#include + #include #include +#include /* host_priv_self() */ +#include /* host_get_special_port() */ +#include /* RESOURCE_NOTIFY_PORT */ #include #include #include @@ -61,17 +67,19 @@ * centralized here in one state machine to simplify the implementation of those interactions. * * Architecture: - * Threads and tasks have three policy fields: requested, effective, and pending. + * Threads and tasks have two policy fields: requested, effective. * Requested represents the wishes of each interface that influences task policy. * Effective represents the distillation of that policy into a set of behaviors. - * Pending represents updates that haven't been applied yet. + * + * Each thread making a modification in the policy system passes a 'pending' struct, + * which tracks updates that will be applied after dropping the policy engine lock. * * Each interface that has an input into the task policy state machine controls a field in requested. * If the interface has a getter, it returns what is in the field in requested, but that is * not necessarily what is actually in effect. * * All kernel subsystems that behave differently based on task policy call into - * the get_effective_policy function, which returns the decision of the task policy state machine + * the proc_get_effective_(task|thread)_policy functions, which return the decision of the task policy state machine * for that subsystem by querying only the 'effective' field. * * Policy change operations: @@ -86,10 +94,10 @@ * subsystems which cannot be touched while holding the task lock. * * To add a new requested policy, add the field in the requested struct, the flavor in task.h, - * the setter and getter in proc_(set|get)_task_policy*, and dump the state in task_requested_bitfield, + * the setter and getter in proc_(set|get)_task_policy*, * then set up the effects of that behavior in task_policy_update*. 
If the policy manifests * itself as a distinct effective policy, add it to the effective struct and add it to the - * proc_get_effective_policy accessor. + * proc_get_effective_task_policy accessor. * * Most policies are set via proc_set_task_policy, but policies that don't fit that interface * roll their own lock/set/update/unlock/complete code inside this file. @@ -106,79 +114,88 @@ * * Locking * - * Changing task policy on a task or thread takes the task lock, and not the thread lock. - * TODO: Should changing policy on a thread take the thread lock instead? + * Changing task policy on a task takes the task lock. + * Changing task policy on a thread takes the thread mutex. + * Task policy changes that affect threads will take each thread's mutex to update it if necessary. + * + * Querying the effective policy does not take a lock, because callers + * may run in interrupt context or other place where locks are not OK. * - * Querying the effective policy does not take the task lock, to prevent deadlocks or slowdown in sensitive code. * This means that any notification of state change needs to be externally synchronized. + * We do this by idempotent callouts after the state has changed to ask + * other subsystems to update their view of the world. 
* + * TODO: Move all cpu/wakes/io monitor code into a separate file + * TODO: Move all importance code over to importance subsystem + * TODO: Move all taskwatch code into a separate file + * TODO: Move all VM importance code into a separate file */ -extern const qos_policy_params_t thread_qos_policy_params; - -/* for task holds without dropping the lock */ -extern void task_hold_locked(task_t task); -extern void task_release_locked(task_t task); -extern void task_wait_locked(task_t task, boolean_t until_not_runnable); - -extern void thread_recompute_qos(thread_t thread); - /* Task policy related helper functions */ -static void proc_set_task_policy_locked(task_t task, thread_t thread, int category, int flavor, int value); -static void proc_set_task_policy2_locked(task_t task, thread_t thread, int category, int flavor, int value1, int value2); +static void proc_set_task_policy_locked(task_t task, int category, int flavor, int value, int value2); -static void task_policy_update_locked(task_t task, thread_t thread, task_pend_token_t pend_token); -static void task_policy_update_internal_locked(task_t task, thread_t thread, boolean_t in_create, task_pend_token_t pend_token); -static void task_policy_update_task_locked(task_t task, boolean_t update_throttle, boolean_t update_bg_throttle, boolean_t update_sfi); -static void task_policy_update_thread_locked(thread_t thread, int update_cpu, boolean_t update_throttle, boolean_t update_sfi, boolean_t update_qos); +static void task_policy_update_locked(task_t task, task_pend_token_t pend_token); +static void task_policy_update_internal_locked(task_t task, boolean_t in_create, task_pend_token_t pend_token); + +/* For attributes that have two scalars as input/output */ +static void proc_set_task_policy2(task_t task, int category, int flavor, int value1, int value2); +static void proc_get_task_policy2(task_t task, int category, int flavor, int *value1, int *value2); #if CONFIG_SCHED_SFI static boolean_t 
task_policy_update_coalition_focal_tasks(task_t task, int prev_role, int next_role); #endif -static int proc_get_effective_policy(task_t task, thread_t thread, int policy); - -static void proc_iopol_to_tier(int iopolicy, int *tier, int *passive); -static int proc_tier_to_iopol(int tier, int passive); +static uint64_t task_requested_bitfield(task_t task); +static uint64_t task_effective_bitfield(task_t task); -static uintptr_t trequested_0(task_t task, thread_t thread); -static uintptr_t trequested_1(task_t task, thread_t thread); -static uintptr_t teffective_0(task_t task, thread_t thread); -static uintptr_t teffective_1(task_t task, thread_t thread); -static uint32_t tpending(task_pend_token_t pend_token); -static uint64_t task_requested_bitfield(task_t task, thread_t thread); -static uint64_t task_effective_bitfield(task_t task, thread_t thread); +/* Convenience functions for munging a policy bitfield into a tracepoint */ +static uintptr_t trequested_0(task_t task); +static uintptr_t trequested_1(task_t task); +static uintptr_t teffective_0(task_t task); +static uintptr_t teffective_1(task_t task); -void proc_get_thread_policy(thread_t thread, thread_policy_state_t info); - -/* CPU Limits related helper functions */ +/* CPU limits helper functions */ +static int task_set_cpuusage(task_t task, uint8_t percentage, uint64_t interval, uint64_t deadline, int scope, int entitled); static int task_get_cpuusage(task_t task, uint8_t *percentagep, uint64_t *intervalp, uint64_t *deadlinep, int *scope); -int task_set_cpuusage(task_t task, uint8_t percentage, uint64_t interval, uint64_t deadline, int scope, int entitled); +static int task_enable_cpumon_locked(task_t task); +static int task_disable_cpumon(task_t task); static int task_clear_cpuusage_locked(task_t task, int cpumon_entitled); -int task_disable_cpumon(task_t task); static int task_apply_resource_actions(task_t task, int type); -void task_action_cpuusage(thread_call_param_t param0, thread_call_param_t param1); 
-void proc_init_cpumon_params(void); +static void task_action_cpuusage(thread_call_param_t param0, thread_call_param_t param1); #ifdef MACH_BSD -int proc_pid(void *proc); -extern int proc_selfpid(void); -extern char * proc_name_address(void *p); -extern void rethrottle_thread(void * uthread); -extern void proc_apply_task_networkbg(void * bsd_info, thread_t thread); +typedef struct proc * proc_t; +int proc_pid(void *proc); +extern int proc_selfpid(void); +extern char * proc_name_address(void *p); +extern char * proc_best_name(proc_t proc); + +extern int proc_pidpathinfo_internal(proc_t p, uint64_t arg, + char *buffer, uint32_t buffersize, + int32_t *retval); #endif /* MACH_BSD */ -extern zone_t thread_qos_override_zone; -static boolean_t _proc_thread_qos_remove_override_internal(task_t task, thread_t thread, uint64_t tid, user_addr_t resource, int resource_type, boolean_t reset); /* Importance Inheritance related helper functions */ #if IMPORTANCE_INHERITANCE +static void task_importance_mark_live_donor(task_t task, boolean_t donating); +static void task_importance_mark_receiver(task_t task, boolean_t receiving); +static void task_importance_mark_denap_receiver(task_t task, boolean_t denap); + +static boolean_t task_is_marked_live_importance_donor(task_t task); +static boolean_t task_is_importance_receiver(task_t task); +static boolean_t task_is_importance_denap_receiver(task_t task); + +static int task_importance_hold_internal_assertion(task_t target_task, uint32_t count); + static void task_add_importance_watchport(task_t task, mach_port_t port, int *boostp); static void task_importance_update_live_donor(task_t target_task); +static void task_set_boost_locked(task_t task, boolean_t boost_active); + #endif /* IMPORTANCE_INHERITANCE */ #if IMPORTANCE_DEBUG @@ -193,18 +210,6 @@ static void task_importance_update_live_donor(task_t target_task); #define __imp_only __unused #endif -#define TASK_LOCKED 1 -#define TASK_UNLOCKED 0 - -#define DO_LOWPRI_CPU 1 -#define 
UNDO_LOWPRI_CPU 2 - -/* Macros for making tracing simpler */ - -#define tpriority(task, thread) ((uintptr_t)(thread == THREAD_NULL ? (task->priority) : (thread->base_pri))) -#define tisthread(thread) (thread == THREAD_NULL ? TASK_POLICY_TASK : TASK_POLICY_THREAD) -#define targetid(task, thread) ((uintptr_t)(thread == THREAD_NULL ? (task_pid(task)) : (thread->thread_id))) - /* * Default parameters for certain policies */ @@ -219,10 +224,9 @@ const int proc_default_bg_iotier = THROTTLE_LEVEL_TIER2; /* Latency/throughput QoS fields remain zeroed, i.e. TIER_UNSPECIFIED at creation */ const struct task_requested_policy default_task_requested_policy = { - .bg_iotier = proc_default_bg_iotier + .trp_bg_iotier = proc_default_bg_iotier }; const struct task_effective_policy default_task_effective_policy = {}; -const struct task_pended_policy default_task_pended_policy = {}; /* * Default parameters for CPU usage monitor. @@ -235,6 +239,7 @@ const struct task_pended_policy default_task_pended_policy = {}; uint8_t proc_max_cpumon_percentage; uint64_t proc_max_cpumon_interval; + kern_return_t qos_latency_policy_validate(task_latency_qos_t ltier) { if ((ltier != LATENCY_QOS_TIER_UNSPECIFIED) && @@ -314,7 +319,7 @@ task_policy_set( case TASK_FOREGROUND_APPLICATION: case TASK_BACKGROUND_APPLICATION: case TASK_DEFAULT_APPLICATION: - proc_set_task_policy(task, THREAD_NULL, + proc_set_task_policy(task, TASK_POLICY_ATTRIBUTE, TASK_POLICY_ROLE, info->role); break; @@ -323,7 +328,7 @@ task_policy_set( if (task != current_task() || task->sec_token.val[0] != 0) result = KERN_INVALID_ARGUMENT; else - proc_set_task_policy(task, THREAD_NULL, + proc_set_task_policy(task, TASK_POLICY_ATTRIBUTE, TASK_POLICY_ROLE, info->role); break; @@ -333,7 +338,7 @@ task_policy_set( if (task != current_task() || task->sec_token.val[0] != 0) result = KERN_INVALID_ARGUMENT; else - proc_set_task_policy(task, THREAD_NULL, + proc_set_task_policy(task, TASK_POLICY_ATTRIBUTE, TASK_POLICY_ROLE, info->role); break; @@ 
-359,7 +364,7 @@ task_policy_set( uint32_t lqos = qos_extract(qosinfo->task_latency_qos_tier); uint32_t tqos = qos_extract(qosinfo->task_throughput_qos_tier); - proc_set_task_policy2(task, THREAD_NULL, TASK_POLICY_ATTRIBUTE, + proc_set_task_policy2(task, TASK_POLICY_ATTRIBUTE, flavor == TASK_BASE_QOS_POLICY ? TASK_POLICY_BASE_LATENCY_AND_THROUGHPUT_QOS : TASK_POLICY_OVERRIDE_LATENCY_AND_THROUGHPUT_QOS, lqos, tqos); } @@ -375,7 +380,7 @@ task_policy_set( uint32_t lqos = qos_extract(qosinfo->task_latency_qos_tier); - proc_set_task_policy(task, NULL, TASK_POLICY_ATTRIBUTE, TASK_BASE_LATENCY_QOS_POLICY, lqos); + proc_set_task_policy(task, TASK_POLICY_ATTRIBUTE, TASK_BASE_LATENCY_QOS_POLICY, lqos); } break; @@ -389,7 +394,7 @@ task_policy_set( uint32_t tqos = qos_extract(qosinfo->task_throughput_qos_tier); - proc_set_task_policy(task, NULL, TASK_POLICY_ATTRIBUTE, TASK_BASE_THROUGHPUT_QOS_POLICY, tqos); + proc_set_task_policy(task, TASK_POLICY_ATTRIBUTE, TASK_BASE_THROUGHPUT_QOS_POLICY, tqos); } break; @@ -421,29 +426,27 @@ task_policy_set( KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (IMPORTANCE_CODE(IMP_TASK_SUPPRESSION, info->active)) | DBG_FUNC_START, - proc_selfpid(), task_pid(task), trequested_0(task, THREAD_NULL), - trequested_1(task, THREAD_NULL), 0); - - task->requested_policy.t_sup_active = (info->active) ? 1 : 0; - task->requested_policy.t_sup_lowpri_cpu = (info->lowpri_cpu) ? 1 : 0; - task->requested_policy.t_sup_timer = qos_extract(info->timer_throttle); - task->requested_policy.t_sup_disk = (info->disk_throttle) ? 1 : 0; - task->requested_policy.t_sup_cpu_limit = (info->cpu_limit) ? 1 : 0; - task->requested_policy.t_sup_suspend = (info->suspend) ? 1 : 0; - task->requested_policy.t_sup_throughput = qos_extract(info->throughput_qos); - task->requested_policy.t_sup_cpu = (info->suppressed_cpu) ? 1 : 0; - task->requested_policy.t_sup_bg_sockets = (info->background_sockets) ? 
1 : 0; - - task_policy_update_locked(task, THREAD_NULL, &pend_token); + proc_selfpid(), task_pid(task), trequested_0(task), + trequested_1(task), 0); - task_unlock(task); + task->requested_policy.trp_sup_active = (info->active) ? 1 : 0; + task->requested_policy.trp_sup_lowpri_cpu = (info->lowpri_cpu) ? 1 : 0; + task->requested_policy.trp_sup_timer = qos_extract(info->timer_throttle); + task->requested_policy.trp_sup_disk = (info->disk_throttle) ? 1 : 0; + task->requested_policy.trp_sup_throughput = qos_extract(info->throughput_qos); + task->requested_policy.trp_sup_cpu = (info->suppressed_cpu) ? 1 : 0; + task->requested_policy.trp_sup_bg_sockets = (info->background_sockets) ? 1 : 0; - task_policy_update_complete_unlocked(task, THREAD_NULL, &pend_token); + task_policy_update_locked(task, &pend_token); KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (IMPORTANCE_CODE(IMP_TASK_SUPPRESSION, info->active)) | DBG_FUNC_END, - proc_selfpid(), task_pid(task), trequested_0(task, THREAD_NULL), - trequested_1(task, THREAD_NULL), 0); + proc_selfpid(), task_pid(task), trequested_0(task), + trequested_1(task), 0); + + task_unlock(task); + + task_policy_update_complete_unlocked(task, &pend_token); break; @@ -482,13 +485,14 @@ task_importance( task->importance = importance; - /* TODO: tracepoint? 
*/ + struct task_pend_token pend_token = {}; - /* Redrive only the task priority calculation */ - task_policy_update_task_locked(task, FALSE, FALSE, FALSE); + task_policy_update_locked(task, &pend_token); task_unlock(task); + task_policy_update_complete_unlocked(task, &pend_token); + return (KERN_SUCCESS); } @@ -515,7 +519,7 @@ task_policy_get( if (*get_default) info->role = TASK_UNSPECIFIED; else - info->role = proc_get_task_policy(task, THREAD_NULL, TASK_POLICY_ATTRIBUTE, TASK_POLICY_ROLE); + info->role = proc_get_task_policy(task, TASK_POLICY_ATTRIBUTE, TASK_POLICY_ROLE); break; } @@ -533,7 +537,7 @@ task_policy_get( } else if (flavor == TASK_BASE_QOS_POLICY) { int value1, value2; - proc_get_task_policy2(task, THREAD_NULL, TASK_POLICY_ATTRIBUTE, TASK_POLICY_BASE_LATENCY_AND_THROUGHPUT_QOS, &value1, &value2); + proc_get_task_policy2(task, TASK_POLICY_ATTRIBUTE, TASK_POLICY_BASE_LATENCY_AND_THROUGHPUT_QOS, &value1, &value2); info->task_latency_qos_tier = qos_latency_policy_package(value1); info->task_throughput_qos_tier = qos_throughput_policy_package(value2); @@ -541,7 +545,7 @@ task_policy_get( } else if (flavor == TASK_OVERRIDE_QOS_POLICY) { int value1, value2; - proc_get_task_policy2(task, THREAD_NULL, TASK_POLICY_ATTRIBUTE, TASK_POLICY_OVERRIDE_LATENCY_AND_THROUGHPUT_QOS, &value1, &value2); + proc_get_task_policy2(task, TASK_POLICY_ATTRIBUTE, TASK_POLICY_OVERRIDE_LATENCY_AND_THROUGHPUT_QOS, &value1, &value2); info->task_latency_qos_tier = qos_latency_policy_package(value1); info->task_throughput_qos_tier = qos_throughput_policy_package(value2); @@ -572,10 +576,13 @@ task_policy_get( } else { task_lock(task); - info->requested = task_requested_bitfield(task, THREAD_NULL); - info->effective = task_effective_bitfield(task, THREAD_NULL); + info->requested = task_requested_bitfield(task); + info->effective = task_effective_bitfield(task); info->pending = 0; - + + info->tps_requested_policy = *(uint64_t*)(&task->requested_policy); + info->tps_effective_policy = 
*(uint64_t*)(&task->effective_policy); + info->flags = 0; if (task->task_imp_base != NULL) { info->imp_assertcnt = task->task_imp_base->iit_assertcnt; @@ -593,9 +600,6 @@ task_policy_get( task_unlock(task); } - info->reserved[0] = 0; - info->reserved[1] = 0; - break; } @@ -618,15 +622,15 @@ task_policy_get( info->throughput_qos = 0; info->suppressed_cpu = 0; } else { - info->active = task->requested_policy.t_sup_active; - info->lowpri_cpu = task->requested_policy.t_sup_lowpri_cpu; - info->timer_throttle = qos_latency_policy_package(task->requested_policy.t_sup_timer); - info->disk_throttle = task->requested_policy.t_sup_disk; - info->cpu_limit = task->requested_policy.t_sup_cpu_limit; - info->suspend = task->requested_policy.t_sup_suspend; - info->throughput_qos = qos_throughput_policy_package(task->requested_policy.t_sup_throughput); - info->suppressed_cpu = task->requested_policy.t_sup_cpu; - info->background_sockets = task->requested_policy.t_sup_bg_sockets; + info->active = task->requested_policy.trp_sup_active; + info->lowpri_cpu = task->requested_policy.trp_sup_lowpri_cpu; + info->timer_throttle = qos_latency_policy_package(task->requested_policy.trp_sup_timer); + info->disk_throttle = task->requested_policy.trp_sup_disk; + info->cpu_limit = 0; + info->suspend = 0; + info->throughput_qos = qos_throughput_policy_package(task->requested_policy.trp_sup_throughput); + info->suppressed_cpu = task->requested_policy.trp_sup_cpu; + info->background_sockets = task->requested_policy.trp_sup_bg_sockets; } task_unlock(task); @@ -646,66 +650,60 @@ task_policy_get( * The threads, etc will inherit from the task as they get created. 
*/ void -task_policy_create(task_t task, int parent_boosted) -{ - if (task->requested_policy.t_apptype == TASK_APPTYPE_DAEMON_ADAPTIVE) { - if (parent_boosted) { - task->requested_policy.t_apptype = TASK_APPTYPE_DAEMON_INTERACTIVE; +task_policy_create(task_t task, task_t parent_task) +{ + task->requested_policy.trp_apptype = parent_task->requested_policy.trp_apptype; + + task->requested_policy.trp_int_darwinbg = parent_task->requested_policy.trp_int_darwinbg; + task->requested_policy.trp_ext_darwinbg = parent_task->requested_policy.trp_ext_darwinbg; + task->requested_policy.trp_int_iotier = parent_task->requested_policy.trp_int_iotier; + task->requested_policy.trp_ext_iotier = parent_task->requested_policy.trp_ext_iotier; + task->requested_policy.trp_int_iopassive = parent_task->requested_policy.trp_int_iopassive; + task->requested_policy.trp_ext_iopassive = parent_task->requested_policy.trp_ext_iopassive; + task->requested_policy.trp_bg_iotier = parent_task->requested_policy.trp_bg_iotier; + task->requested_policy.trp_terminated = parent_task->requested_policy.trp_terminated; + task->requested_policy.trp_qos_clamp = parent_task->requested_policy.trp_qos_clamp; + + if (task->requested_policy.trp_apptype == TASK_APPTYPE_DAEMON_ADAPTIVE) { + if (parent_task->requested_policy.trp_boosted) { + task->requested_policy.trp_apptype = TASK_APPTYPE_DAEMON_INTERACTIVE; task_importance_mark_donor(task, TRUE); } else { - task->requested_policy.t_apptype = TASK_APPTYPE_DAEMON_BACKGROUND; + task->requested_policy.trp_apptype = TASK_APPTYPE_DAEMON_BACKGROUND; task_importance_mark_receiver(task, FALSE); } } KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, - (IMPORTANCE_CODE(IMP_UPDATE, (IMP_UPDATE_TASK_CREATE | TASK_POLICY_TASK))) | DBG_FUNC_START, - task_pid(task), teffective_0(task, THREAD_NULL), - teffective_1(task, THREAD_NULL), tpriority(task, THREAD_NULL), 0); + (IMPORTANCE_CODE(IMP_UPDATE, (IMP_UPDATE_TASK_CREATE | TASK_POLICY_TASK))) | DBG_FUNC_START, + task_pid(task), 
teffective_0(task), + teffective_1(task), task->priority, 0); - task_policy_update_internal_locked(task, THREAD_NULL, TRUE, NULL); + task_policy_update_internal_locked(task, TRUE, NULL); KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, - (IMPORTANCE_CODE(IMP_UPDATE, (IMP_UPDATE_TASK_CREATE | TASK_POLICY_TASK))) | DBG_FUNC_END, - task_pid(task), teffective_0(task, THREAD_NULL), - teffective_1(task, THREAD_NULL), tpriority(task, THREAD_NULL), 0); + (IMPORTANCE_CODE(IMP_UPDATE, (IMP_UPDATE_TASK_CREATE | TASK_POLICY_TASK))) | DBG_FUNC_END, + task_pid(task), teffective_0(task), + teffective_1(task), task->priority, 0); task_importance_update_live_donor(task); - task_policy_update_task_locked(task, FALSE, FALSE, FALSE); } -void -thread_policy_create(thread_t thread) -{ - task_t task = thread->task; - - KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, - (IMPORTANCE_CODE(IMP_UPDATE, (IMP_UPDATE_TASK_CREATE | TASK_POLICY_THREAD))) | DBG_FUNC_START, - targetid(task, thread), teffective_0(task, thread), - teffective_1(task, thread), tpriority(task, thread), 0); - - task_policy_update_internal_locked(task, thread, TRUE, NULL); - - KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, - (IMPORTANCE_CODE(IMP_UPDATE, (IMP_UPDATE_TASK_CREATE | TASK_POLICY_THREAD))) | DBG_FUNC_END, - targetid(task, thread), teffective_0(task, thread), - teffective_1(task, thread), tpriority(task, thread), 0); -} static void -task_policy_update_locked(task_t task, thread_t thread, task_pend_token_t pend_token) +task_policy_update_locked(task_t task, task_pend_token_t pend_token) { KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, - (IMPORTANCE_CODE(IMP_UPDATE, tisthread(thread)) | DBG_FUNC_START), - targetid(task, thread), teffective_0(task, thread), - teffective_1(task, thread), tpriority(task, thread), 0); + (IMPORTANCE_CODE(IMP_UPDATE, TASK_POLICY_TASK) | DBG_FUNC_START), + task_pid(task), teffective_0(task), + teffective_1(task), task->priority, 0); - task_policy_update_internal_locked(task, thread, FALSE, pend_token); + 
task_policy_update_internal_locked(task, FALSE, pend_token); KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, - (IMPORTANCE_CODE(IMP_UPDATE, tisthread(thread))) | DBG_FUNC_END, - targetid(task, thread), teffective_0(task, thread), - teffective_1(task, thread), tpriority(task, thread), 0); + (IMPORTANCE_CODE(IMP_UPDATE, TASK_POLICY_TASK)) | DBG_FUNC_END, + task_pid(task), teffective_0(task), + teffective_1(task), task->priority, 0); } /* @@ -721,134 +719,76 @@ task_policy_update_locked(task_t task, thread_t thread, task_pend_token_t pend_t */ static void -task_policy_update_internal_locked(task_t task, thread_t thread, boolean_t in_create, task_pend_token_t pend_token) +task_policy_update_internal_locked(task_t task, boolean_t in_create, task_pend_token_t pend_token) { - boolean_t on_task = (thread == THREAD_NULL) ? TRUE : FALSE; - /* * Step 1: * Gather requested policy */ - struct task_requested_policy requested = - (on_task) ? task->requested_policy : thread->requested_policy; - + struct task_requested_policy requested = task->requested_policy; /* * Step 2: * Calculate new effective policies from requested policy and task state * Rules: - * If in an 'on_task' block, must only look at and set fields starting with t_ - * If operating on a task, don't touch anything starting with th_ - * If operating on a thread, don't touch anything starting with t_ * Don't change requested, it won't take effect */ struct task_effective_policy next = {}; - struct task_effective_policy task_effective; - - /* Calculate QoS policies */ - - if (on_task) { - /* Update task role */ - next.t_role = requested.t_role; - - /* Set task qos clamp and ceiling */ - next.t_qos_clamp = requested.t_qos_clamp; - - if (requested.t_apptype == TASK_APPTYPE_APP_DEFAULT || - requested.t_apptype == TASK_APPTYPE_APP_TAL) { - - switch (next.t_role) { - case TASK_FOREGROUND_APPLICATION: - /* Foreground apps get urgent scheduler priority */ - next.qos_ui_is_urgent = 1; - next.t_qos_ceiling = THREAD_QOS_UNSPECIFIED; - 
break; - - case TASK_BACKGROUND_APPLICATION: - /* This is really 'non-focal but on-screen' */ - next.t_qos_ceiling = THREAD_QOS_UNSPECIFIED; - break; - - case TASK_DEFAULT_APPLICATION: - /* This is 'may render UI but we don't know if it's focal/nonfocal' */ - next.t_qos_ceiling = THREAD_QOS_UNSPECIFIED; - break; - - case TASK_NONUI_APPLICATION: - /* i.e. 'off-screen' */ - next.t_qos_ceiling = THREAD_QOS_LEGACY; - break; - - case TASK_CONTROL_APPLICATION: - case TASK_GRAPHICS_SERVER: - next.qos_ui_is_urgent = 1; - next.t_qos_ceiling = THREAD_QOS_UNSPECIFIED; - break; - - case TASK_THROTTLE_APPLICATION: - /* i.e. 'TAL launch' */ - next.t_qos_ceiling = THREAD_QOS_UTILITY; - break; - - case TASK_UNSPECIFIED: - default: - /* Apps that don't have an application role get - * USER_INTERACTIVE and USER_INITIATED squashed to LEGACY */ - next.t_qos_ceiling = THREAD_QOS_LEGACY; - break; - } - } else { - /* Daemons get USER_INTERACTIVE squashed to USER_INITIATED */ - next.t_qos_ceiling = THREAD_QOS_USER_INITIATED; - } - } else { - /* - * Set thread qos tier - * Note that an override only overrides the QoS field, not other policy settings. 
- * A thread must already be participating in QoS for override to take effect - */ - /* Snapshot the task's effective policy */ - task_effective = task->effective_policy; + /* Update task role */ + next.tep_role = requested.trp_role; - next.qos_ui_is_urgent = task_effective.qos_ui_is_urgent; + /* Set task qos clamp and ceiling */ + next.tep_qos_clamp = requested.trp_qos_clamp; - if ((requested.thrp_qos_override != THREAD_QOS_UNSPECIFIED) && (requested.thrp_qos != THREAD_QOS_UNSPECIFIED)) - next.thep_qos = MAX(requested.thrp_qos_override, requested.thrp_qos); - else - next.thep_qos = requested.thrp_qos; + if (requested.trp_apptype == TASK_APPTYPE_APP_DEFAULT || + requested.trp_apptype == TASK_APPTYPE_APP_TAL) { - /* A task clamp will result in an effective QoS even when requested is UNSPECIFIED */ - if (task_effective.t_qos_clamp != THREAD_QOS_UNSPECIFIED) { - if (next.thep_qos != THREAD_QOS_UNSPECIFIED) - next.thep_qos = MIN(task_effective.t_qos_clamp, next.thep_qos); - else - next.thep_qos = task_effective.t_qos_clamp; - } + switch (next.tep_role) { + case TASK_FOREGROUND_APPLICATION: + /* Foreground apps get urgent scheduler priority */ + next.tep_qos_ui_is_urgent = 1; + next.tep_qos_ceiling = THREAD_QOS_UNSPECIFIED; + break; - /* The ceiling only applies to threads that are in the QoS world */ - if (task_effective.t_qos_ceiling != THREAD_QOS_UNSPECIFIED && - next.thep_qos != THREAD_QOS_UNSPECIFIED) { - next.thep_qos = MIN(task_effective.t_qos_ceiling, next.thep_qos); - } + case TASK_BACKGROUND_APPLICATION: + /* This is really 'non-focal but on-screen' */ + next.tep_qos_ceiling = THREAD_QOS_UNSPECIFIED; + break; - /* - * The QoS relative priority is only applicable when the original programmer's - * intended (requested) QoS is in effect. When the QoS is clamped (e.g. - * USER_INITIATED-13REL clamped to UTILITY), the relative priority is not honored, - * since otherwise it would be lower than unclamped threads. 
Similarly, in the - * presence of boosting, the programmer doesn't know what other actors - * are boosting the thread. - */ - if ((requested.thrp_qos != THREAD_QOS_UNSPECIFIED) && - (requested.thrp_qos == next.thep_qos) && - (requested.thrp_qos_override == THREAD_QOS_UNSPECIFIED)) { - next.thep_qos_relprio = requested.thrp_qos_relprio; - } else { - next.thep_qos_relprio = 0; + case TASK_DEFAULT_APPLICATION: + /* This is 'may render UI but we don't know if it's focal/nonfocal' */ + next.tep_qos_ceiling = THREAD_QOS_UNSPECIFIED; + break; + + case TASK_NONUI_APPLICATION: + /* i.e. 'off-screen' */ + next.tep_qos_ceiling = THREAD_QOS_LEGACY; + break; + + case TASK_CONTROL_APPLICATION: + case TASK_GRAPHICS_SERVER: + next.tep_qos_ui_is_urgent = 1; + next.tep_qos_ceiling = THREAD_QOS_UNSPECIFIED; + break; + + case TASK_THROTTLE_APPLICATION: + /* i.e. 'TAL launch' */ + next.tep_qos_ceiling = THREAD_QOS_UTILITY; + break; + + case TASK_UNSPECIFIED: + default: + /* Apps that don't have an application role get + * USER_INTERACTIVE and USER_INITIATED squashed to LEGACY */ + next.tep_qos_ceiling = THREAD_QOS_LEGACY; + break; } + } else { + /* Daemons get USER_INTERACTIVE squashed to USER_INITIATED */ + next.tep_qos_ceiling = THREAD_QOS_USER_INITIATED; } /* Calculate DARWIN_BG */ @@ -862,67 +802,48 @@ task_policy_update_internal_locked(task_t task, thread_t thread, boolean_t in_cr * * Backgrounding due to apptype does. 
*/ - if (requested.int_darwinbg || requested.ext_darwinbg) + if (requested.trp_int_darwinbg || requested.trp_ext_darwinbg) wants_watchersbg = wants_all_sockets_bg = wants_darwinbg = TRUE; - if (on_task) { - /* Background TAL apps are throttled when TAL is enabled */ - if (requested.t_apptype == TASK_APPTYPE_APP_TAL && - requested.t_role == TASK_BACKGROUND_APPLICATION && - requested.t_tal_enabled == 1) { - next.t_tal_engaged = 1; - } - - if ((requested.t_apptype == TASK_APPTYPE_APP_DEFAULT || - requested.t_apptype == TASK_APPTYPE_APP_TAL) && - requested.t_role == TASK_THROTTLE_APPLICATION) { - next.t_tal_engaged = 1; - } - - /* Adaptive daemons are DARWIN_BG unless boosted, and don't get network throttled. */ - if (requested.t_apptype == TASK_APPTYPE_DAEMON_ADAPTIVE && - requested.t_boosted == 0) - wants_darwinbg = TRUE; + /* Background TAL apps are throttled when TAL is enabled */ + if (requested.trp_apptype == TASK_APPTYPE_APP_TAL && + requested.trp_role == TASK_BACKGROUND_APPLICATION && + requested.trp_tal_enabled == 1) { + next.tep_tal_engaged = 1; + } - /* Background daemons are always DARWIN_BG, no exceptions, and don't get network throttled. */ - if (requested.t_apptype == TASK_APPTYPE_DAEMON_BACKGROUND) - wants_darwinbg = TRUE; + if ((requested.trp_apptype == TASK_APPTYPE_APP_DEFAULT || + requested.trp_apptype == TASK_APPTYPE_APP_TAL) && + requested.trp_role == TASK_THROTTLE_APPLICATION) { + next.tep_tal_engaged = 1; + } - if (next.t_qos_clamp == THREAD_QOS_BACKGROUND || next.t_qos_clamp == THREAD_QOS_MAINTENANCE) - wants_darwinbg = TRUE; - } else { - if (requested.th_pidbind_bg) - wants_all_sockets_bg = wants_darwinbg = TRUE; + /* Adaptive daemons are DARWIN_BG unless boosted, and don't get network throttled. 
*/ + if (requested.trp_apptype == TASK_APPTYPE_DAEMON_ADAPTIVE && + requested.trp_boosted == 0) + wants_darwinbg = TRUE; - if (requested.th_workq_bg) - wants_darwinbg = TRUE; + /* Background daemons are always DARWIN_BG, no exceptions, and don't get network throttled. */ + if (requested.trp_apptype == TASK_APPTYPE_DAEMON_BACKGROUND) + wants_darwinbg = TRUE; - if (next.thep_qos == THREAD_QOS_BACKGROUND || next.thep_qos == THREAD_QOS_MAINTENANCE) - wants_darwinbg = TRUE; - } + if (next.tep_qos_clamp == THREAD_QOS_BACKGROUND || next.tep_qos_clamp == THREAD_QOS_MAINTENANCE) + wants_darwinbg = TRUE; /* Calculate side effects of DARWIN_BG */ if (wants_darwinbg) { - next.darwinbg = 1; - /* darwinbg threads/tasks always create bg sockets, but we don't always loop over all sockets */ - next.new_sockets_bg = 1; - next.lowpri_cpu = 1; + next.tep_darwinbg = 1; + /* darwinbg tasks always create bg sockets, but we don't always loop over all sockets */ + next.tep_new_sockets_bg = 1; + next.tep_lowpri_cpu = 1; } if (wants_all_sockets_bg) - next.all_sockets_bg = 1; - - if (on_task && wants_watchersbg) - next.t_watchers_bg = 1; + next.tep_all_sockets_bg = 1; - /* darwinbg on either task or thread implies background QOS (or lower) */ - if (!on_task && - (wants_darwinbg || task_effective.darwinbg) && - (next.thep_qos > THREAD_QOS_BACKGROUND || next.thep_qos == THREAD_QOS_UNSPECIFIED)){ - next.thep_qos = THREAD_QOS_BACKGROUND; - next.thep_qos_relprio = 0; - } + if (wants_watchersbg) + next.tep_watchers_bg = 1; /* Calculate low CPU priority */ @@ -931,133 +852,119 @@ task_policy_update_internal_locked(task_t task, thread_t thread, boolean_t in_cr if (wants_darwinbg) wants_lowpri_cpu = TRUE; - if (next.t_tal_engaged) + if (next.tep_tal_engaged) wants_lowpri_cpu = TRUE; - if (on_task && requested.t_sup_lowpri_cpu && requested.t_boosted == 0) + if (requested.trp_sup_lowpri_cpu && requested.trp_boosted == 0) wants_lowpri_cpu = TRUE; if (wants_lowpri_cpu) - next.lowpri_cpu = 1; + 
next.tep_lowpri_cpu = 1; /* Calculate IO policy */ /* Update BG IO policy (so we can see if it has changed) */ - next.bg_iotier = requested.bg_iotier; + next.tep_bg_iotier = requested.trp_bg_iotier; int iopol = THROTTLE_LEVEL_TIER0; if (wants_darwinbg) - iopol = MAX(iopol, requested.bg_iotier); - - if (on_task) { - if (requested.t_apptype == TASK_APPTYPE_DAEMON_STANDARD) - iopol = MAX(iopol, proc_standard_daemon_tier); + iopol = MAX(iopol, requested.trp_bg_iotier); - if (requested.t_sup_disk && requested.t_boosted == 0) - iopol = MAX(iopol, proc_suppressed_disk_tier); + if (requested.trp_apptype == TASK_APPTYPE_DAEMON_STANDARD) + iopol = MAX(iopol, proc_standard_daemon_tier); - if (next.t_tal_engaged) - iopol = MAX(iopol, proc_tal_disk_tier); + if (requested.trp_sup_disk && requested.trp_boosted == 0) + iopol = MAX(iopol, proc_suppressed_disk_tier); - if (next.t_qos_clamp != THREAD_QOS_UNSPECIFIED) - iopol = MAX(iopol, thread_qos_policy_params.qos_iotier[next.t_qos_clamp]); + if (next.tep_tal_engaged) + iopol = MAX(iopol, proc_tal_disk_tier); - } else { - /* Look up the associated IO tier value for the QoS class */ - iopol = MAX(iopol, thread_qos_policy_params.qos_iotier[next.thep_qos]); - } + if (next.tep_qos_clamp != THREAD_QOS_UNSPECIFIED) + iopol = MAX(iopol, thread_qos_policy_params.qos_iotier[next.tep_qos_clamp]); - iopol = MAX(iopol, requested.int_iotier); - iopol = MAX(iopol, requested.ext_iotier); + iopol = MAX(iopol, requested.trp_int_iotier); + iopol = MAX(iopol, requested.trp_ext_iotier); - next.io_tier = iopol; + next.tep_io_tier = iopol; /* Calculate Passive IO policy */ - if (requested.ext_iopassive || requested.int_iopassive) - next.io_passive = 1; - - /* Calculate miscellaneous policy */ - - if (on_task) { - /* Calculate suppression-active flag */ - if (requested.t_sup_active && requested.t_boosted == 0) - next.t_sup_active = 1; + if (requested.trp_ext_iopassive || requested.trp_int_iopassive) + next.tep_io_passive = 1; - /* Calculate suspend 
policy */ - if (requested.t_sup_suspend && requested.t_boosted == 0) - next.t_suspended = 1; + /* Calculate suppression-active flag */ + if (requested.trp_sup_active && requested.trp_boosted == 0) + next.tep_sup_active = 1; - /* Calculate timer QOS */ - int latency_qos = requested.t_base_latency_qos; + /* Calculate timer QOS */ + int latency_qos = requested.trp_base_latency_qos; - if (requested.t_sup_timer && requested.t_boosted == 0) - latency_qos = requested.t_sup_timer; + if (requested.trp_sup_timer && requested.trp_boosted == 0) + latency_qos = requested.trp_sup_timer; - if (next.t_qos_clamp != THREAD_QOS_UNSPECIFIED) - latency_qos = MAX(latency_qos, (int)thread_qos_policy_params.qos_latency_qos[next.t_qos_clamp]); + if (next.tep_qos_clamp != THREAD_QOS_UNSPECIFIED) + latency_qos = MAX(latency_qos, (int)thread_qos_policy_params.qos_latency_qos[next.tep_qos_clamp]); - if (requested.t_over_latency_qos != 0) - latency_qos = requested.t_over_latency_qos; + if (requested.trp_over_latency_qos != 0) + latency_qos = requested.trp_over_latency_qos; - /* Treat the windowserver special */ - if (requested.t_role == TASK_GRAPHICS_SERVER) - latency_qos = proc_graphics_timer_qos; + /* Treat the windowserver special */ + if (requested.trp_role == TASK_GRAPHICS_SERVER) + latency_qos = proc_graphics_timer_qos; - next.t_latency_qos = latency_qos; + next.tep_latency_qos = latency_qos; - /* Calculate throughput QOS */ - int through_qos = requested.t_base_through_qos; + /* Calculate throughput QOS */ + int through_qos = requested.trp_base_through_qos; - if (requested.t_sup_throughput && requested.t_boosted == 0) - through_qos = requested.t_sup_throughput; + if (requested.trp_sup_throughput && requested.trp_boosted == 0) + through_qos = requested.trp_sup_throughput; - if (next.t_qos_clamp != THREAD_QOS_UNSPECIFIED) - through_qos = MAX(through_qos, (int)thread_qos_policy_params.qos_through_qos[next.t_qos_clamp]); + if (next.tep_qos_clamp != THREAD_QOS_UNSPECIFIED) + through_qos = 
MAX(through_qos, (int)thread_qos_policy_params.qos_through_qos[next.tep_qos_clamp]); - if (requested.t_over_through_qos != 0) - through_qos = requested.t_over_through_qos; + if (requested.trp_over_through_qos != 0) + through_qos = requested.trp_over_through_qos; - next.t_through_qos = through_qos; + next.tep_through_qos = through_qos; - /* Calculate suppressed CPU priority */ - if (requested.t_sup_cpu && requested.t_boosted == 0) - next.t_suppressed_cpu = 1; + /* Calculate suppressed CPU priority */ + if (requested.trp_sup_cpu && requested.trp_boosted == 0) + next.tep_suppressed_cpu = 1; - /* - * Calculate background sockets - * Don't take into account boosting to limit transition frequency. - */ - if (requested.t_sup_bg_sockets){ - next.all_sockets_bg = 1; - next.new_sockets_bg = 1; - } + /* + * Calculate background sockets + * Don't take into account boosting to limit transition frequency. + */ + if (requested.trp_sup_bg_sockets){ + next.tep_all_sockets_bg = 1; + next.tep_new_sockets_bg = 1; + } - /* Apply SFI Managed class bit */ - next.t_sfi_managed = requested.t_sfi_managed; + /* Apply SFI Managed class bit */ + next.tep_sfi_managed = requested.trp_sfi_managed; - /* Calculate 'live donor' status for live importance */ - switch (requested.t_apptype) { - case TASK_APPTYPE_APP_TAL: - case TASK_APPTYPE_APP_DEFAULT: - if (requested.ext_darwinbg == 0) - next.t_live_donor = 1; - else - next.t_live_donor = 0; - break; + /* Calculate 'live donor' status for live importance */ + switch (requested.trp_apptype) { + case TASK_APPTYPE_APP_TAL: + case TASK_APPTYPE_APP_DEFAULT: + if (requested.trp_ext_darwinbg == 0) + next.tep_live_donor = 1; + else + next.tep_live_donor = 0; + break; - case TASK_APPTYPE_DAEMON_INTERACTIVE: - case TASK_APPTYPE_DAEMON_STANDARD: - case TASK_APPTYPE_DAEMON_ADAPTIVE: - case TASK_APPTYPE_DAEMON_BACKGROUND: - default: - next.t_live_donor = 0; - break; - } + case TASK_APPTYPE_DAEMON_INTERACTIVE: + case TASK_APPTYPE_DAEMON_STANDARD: + case 
TASK_APPTYPE_DAEMON_ADAPTIVE: + case TASK_APPTYPE_DAEMON_BACKGROUND: + default: + next.tep_live_donor = 0; + break; } - if (requested.terminated) { + if (requested.trp_terminated) { /* * Shoot down the throttles that slow down exit or response to SIGTERM * We don't need to shoot down: @@ -1066,23 +973,16 @@ task_policy_update_internal_locked(task_t task, thread_t thread, boolean_t in_cr * new_sockets_bg (doesn't matter for exiting process) * pidsuspend (jetsam-ed BG process shouldn't run again) * watchers_bg (watcher threads don't need to be unthrottled) - * t_latency_qos (affects userspace timers only) + * latency_qos (affects userspace timers only) */ - next.terminated = 1; - next.darwinbg = 0; - next.lowpri_cpu = 0; - next.io_tier = THROTTLE_LEVEL_TIER0; - if (on_task) { - next.t_tal_engaged = 0; - next.t_role = TASK_UNSPECIFIED; - next.t_suppressed_cpu = 0; - - /* TODO: This should only be shot down on SIGTERM, not exit */ - next.t_suspended = 0; - } else { - next.thep_qos = THREAD_QOS_UNSPECIFIED; - } + next.tep_terminated = 1; + next.tep_darwinbg = 0; + next.tep_lowpri_cpu = 0; + next.tep_io_tier = THROTTLE_LEVEL_TIER0; + next.tep_tal_engaged = 0; + next.tep_role = TASK_UNSPECIFIED; + next.tep_suppressed_cpu = 0; } /* @@ -1090,143 +990,153 @@ task_policy_update_internal_locked(task_t task, thread_t thread, boolean_t in_cr * Swap out old policy for new policy */ - if (!on_task) { - /* Acquire thread mutex to synchronize against - * thread_policy_set(). Consider reworking to separate qos - * fields, or locking the task in thread_policy_set. - * A more efficient model would be to make the thread bits - * authoritative. - */ - thread_mtx_lock(thread); - } - - struct task_effective_policy prev = - (on_task) ? 
task->effective_policy : thread->effective_policy; - - /* - * Check for invalid transitions here for easier debugging - * TODO: dump the structs as hex in the panic string - */ - if (task == kernel_task && prev.all_sockets_bg != next.all_sockets_bg) - panic("unexpected network change for kernel task"); + struct task_effective_policy prev = task->effective_policy; /* This is the point where the new values become visible to other threads */ - if (on_task) - task->effective_policy = next; - else { - /* Preserve thread specific latency/throughput QoS modified via - * thread_policy_set(). Inelegant in the extreme, to be reworked. - * - * If thread QoS class is set, we don't need to preserve the previously set values. - * We should ensure to not accidentally preserve previous thread QoS values if you set a thread - * back to default QoS. - */ - uint32_t lqos = thread->effective_policy.t_latency_qos, tqos = thread->effective_policy.t_through_qos; - - if (prev.thep_qos == THREAD_QOS_UNSPECIFIED && next.thep_qos == THREAD_QOS_UNSPECIFIED) { - next.t_latency_qos = lqos; - next.t_through_qos = tqos; - } else if (prev.thep_qos != THREAD_QOS_UNSPECIFIED && next.thep_qos == THREAD_QOS_UNSPECIFIED) { - next.t_latency_qos = 0; - next.t_through_qos = 0; - } else { - next.t_latency_qos = thread_qos_policy_params.qos_latency_qos[next.thep_qos]; - next.t_through_qos = thread_qos_policy_params.qos_through_qos[next.thep_qos]; - } + task->effective_policy = next; - thread_update_qos_cpu_time(thread, TRUE); - thread->effective_policy = next; - thread_mtx_unlock(thread); - } - - /* Don't do anything further to a half-formed task or thread */ + /* Don't do anything further to a half-formed task */ if (in_create) return; + if (task == kernel_task) + panic("Attempting to set task policy on kernel_task"); + /* * Step 4: * Pend updates that can't be done while holding the task lock */ - if (prev.all_sockets_bg != next.all_sockets_bg) + if (prev.tep_all_sockets_bg != next.tep_all_sockets_bg) 
pend_token->tpt_update_sockets = 1; - if (on_task) { - /* Only re-scan the timer list if the qos level is getting less strong */ - if (prev.t_latency_qos > next.t_latency_qos) - pend_token->tpt_update_timers = 1; + /* Only re-scan the timer list if the qos level is getting less strong */ + if (prev.tep_latency_qos > next.tep_latency_qos) + pend_token->tpt_update_timers = 1; - if (prev.t_live_donor != next.t_live_donor) - pend_token->tpt_update_live_donor = 1; - } + if (prev.tep_live_donor != next.tep_live_donor) + pend_token->tpt_update_live_donor = 1; /* * Step 5: * Update other subsystems as necessary if something has changed */ - boolean_t update_throttle = (prev.io_tier != next.io_tier) ? TRUE : FALSE; - - if (on_task) { - if (prev.t_suspended == 0 && next.t_suspended == 1 && task->active) { - task_hold_locked(task); - task_wait_locked(task, FALSE); - } - if (prev.t_suspended == 1 && next.t_suspended == 0 && task->active) { - task_release_locked(task); - } - - boolean_t update_threads = FALSE; - boolean_t update_sfi = FALSE; + boolean_t update_threads = FALSE, update_sfi = FALSE; - if (prev.bg_iotier != next.bg_iotier || - prev.terminated != next.terminated || - prev.t_qos_clamp != next.t_qos_clamp || - prev.t_qos_ceiling != next.t_qos_ceiling || - prev.qos_ui_is_urgent != next.qos_ui_is_urgent || - prev.darwinbg != next.darwinbg) - update_threads = TRUE; + /* + * Check for the attributes that thread_policy_update_internal_locked() consults, + * and trigger thread policy re-evaluation. 
+ */ + if (prev.tep_io_tier != next.tep_io_tier || + prev.tep_bg_iotier != next.tep_bg_iotier || + prev.tep_io_passive != next.tep_io_passive || + prev.tep_darwinbg != next.tep_darwinbg || + prev.tep_qos_clamp != next.tep_qos_clamp || + prev.tep_qos_ceiling != next.tep_qos_ceiling || + prev.tep_qos_ui_is_urgent != next.tep_qos_ui_is_urgent || + prev.tep_latency_qos != next.tep_latency_qos || + prev.tep_through_qos != next.tep_through_qos || + prev.tep_lowpri_cpu != next.tep_lowpri_cpu || + prev.tep_new_sockets_bg != next.tep_new_sockets_bg || + prev.tep_terminated != next.tep_terminated ) + update_threads = TRUE; - /* - * A bit of a layering violation. We know what task policy attributes - * sfi_thread_classify() consults, so if they change, trigger SFI - * re-evaluation. - */ - if ((prev.t_latency_qos != next.t_latency_qos) || - (prev.t_role != next.t_role) || - (prev.darwinbg != next.darwinbg) || - (prev.t_sfi_managed != next.t_sfi_managed)) - update_sfi = TRUE; + /* + * Check for the attributes that sfi_thread_classify() consults, + * and trigger SFI re-evaluation. 
+ */ + if (prev.tep_latency_qos != next.tep_latency_qos || + prev.tep_role != next.tep_role || + prev.tep_sfi_managed != next.tep_sfi_managed ) + update_sfi = TRUE; #if CONFIG_SCHED_SFI - if (prev.t_role != next.t_role && task_policy_update_coalition_focal_tasks(task, prev.t_role, next.t_role)) { + /* Reflect task role transitions into the coalition role counters */ + if (prev.tep_role != next.tep_role) { + if (task_policy_update_coalition_focal_tasks(task, prev.tep_role, next.tep_role)) { update_sfi = TRUE; pend_token->tpt_update_coal_sfi = 1; } + } #endif /* !CONFIG_SCHED_SFI */ - task_policy_update_task_locked(task, update_throttle, update_threads, update_sfi); + boolean_t update_priority = FALSE; + + int priority = BASEPRI_DEFAULT; + int max_priority = MAXPRI_USER; + + if (next.tep_lowpri_cpu) { + priority = MAXPRI_THROTTLE; + max_priority = MAXPRI_THROTTLE; + } else if (next.tep_suppressed_cpu) { + priority = MAXPRI_SUPPRESSED; + max_priority = MAXPRI_SUPPRESSED; } else { - int update_cpu = 0; - boolean_t update_sfi = FALSE; - boolean_t update_qos = FALSE; + switch (next.tep_role) { + case TASK_CONTROL_APPLICATION: + priority = BASEPRI_CONTROL; + break; + case TASK_GRAPHICS_SERVER: + priority = BASEPRI_GRAPHICS; + max_priority = MAXPRI_RESERVED; + break; + default: + break; + } - if (prev.lowpri_cpu != next.lowpri_cpu) - update_cpu = (next.lowpri_cpu ? 
DO_LOWPRI_CPU : UNDO_LOWPRI_CPU); + /* factor in 'nice' value */ + priority += task->importance; - if (prev.darwinbg != next.darwinbg || - prev.thep_qos != next.thep_qos) - update_sfi = TRUE; + if (task->effective_policy.tep_qos_clamp != THREAD_QOS_UNSPECIFIED) { + int qos_clamp_priority = thread_qos_policy_params.qos_pri[task->effective_policy.tep_qos_clamp]; - if (prev.thep_qos != next.thep_qos || - prev.thep_qos_relprio != next.thep_qos_relprio || - prev.qos_ui_is_urgent != next.qos_ui_is_urgent || - prev.terminated != next.terminated) { - update_qos = TRUE; + priority = MIN(priority, qos_clamp_priority); + max_priority = MIN(max_priority, qos_clamp_priority); } - task_policy_update_thread_locked(thread, update_cpu, update_throttle, update_sfi, update_qos); + if (priority > max_priority) + priority = max_priority; + else if (priority < MINPRI) + priority = MINPRI; + } + + assert(priority <= max_priority); + + /* avoid extra work if priority isn't changing */ + if (priority != task->priority || + max_priority != task->max_priority ) { + /* update the scheduling priority for the task */ + task->max_priority = max_priority; + task->priority = priority; + update_priority = TRUE; + } + + /* Loop over the threads in the task: + * only once + * only if necessary + * with one thread mutex hold per thread + */ + if (update_threads || update_priority || update_sfi) { + thread_t thread; + + queue_iterate(&task->threads, thread, thread_t, task_threads) { + struct task_pend_token thread_pend_token = {}; + + if (update_sfi) + thread_pend_token.tpt_update_thread_sfi = 1; + + if (update_priority || update_threads) + thread_policy_update_tasklocked(thread, + task->priority, task->max_priority, + &thread_pend_token); + + assert(!thread_pend_token.tpt_update_sockets); + + // Slightly risky, as we still hold the task lock... 
+ thread_policy_update_complete_unlocked(thread, &thread_pend_token); + } } } @@ -1262,280 +1172,95 @@ task_policy_update_coalition_focal_tasks(task_t task, return sfi_transition; } -#endif /* CONFIG_SCHED_SFI */ -/* Despite the name, the thread's task is locked, the thread is not */ -void -task_policy_update_thread_locked(thread_t thread, - int update_cpu, - boolean_t update_throttle, - boolean_t update_sfi, - boolean_t update_qos) +/* coalition object is locked */ +static void +task_sfi_reevaluate_cb(coalition_t coal, void *ctx, task_t task) { - thread_precedence_policy_data_t policy; + thread_t thread; - if (update_throttle) { - rethrottle_thread(thread->uthread); - } + /* unused for now */ + (void)coal; - if (update_sfi) { - sfi_reevaluate(thread); - } + /* skip the task we're re-evaluating on behalf of: it's already updated */ + if (task == (task_t)ctx) + return; - /* - * TODO: pidbind needs to stuff remembered importance into saved_importance - * properly deal with bg'ed threads being pidbound and unbging while pidbound - * - * TODO: A BG thread's priority is 0 on desktop and 4 on embedded. Need to reconcile this. - * */ - if (update_cpu == DO_LOWPRI_CPU) { - thread->saved_importance = thread->importance; - policy.importance = INT_MIN; - } else if (update_cpu == UNDO_LOWPRI_CPU) { - policy.importance = thread->saved_importance; - thread->saved_importance = 0; - } + task_lock(task); - /* Takes thread lock and thread mtx lock */ - if (update_cpu) - thread_policy_set_internal(thread, THREAD_PRECEDENCE_POLICY, - (thread_policy_t)&policy, - THREAD_PRECEDENCE_POLICY_COUNT); + queue_iterate(&task->threads, thread, thread_t, task_threads) { + sfi_reevaluate(thread); + } - if (update_qos) - thread_recompute_qos(thread); + task_unlock(task); } +#endif /* CONFIG_SCHED_SFI */ /* - * Calculate priority on a task, loop through its threads, and tell them about - * priority changes and throttle changes. 
+ * Called with task unlocked to do things that can't be done while holding the task lock */ void -task_policy_update_task_locked(task_t task, - boolean_t update_throttle, - boolean_t update_threads, - boolean_t update_sfi) +task_policy_update_complete_unlocked(task_t task, task_pend_token_t pend_token) { - boolean_t update_priority = FALSE; +#ifdef MACH_BSD + if (pend_token->tpt_update_sockets) + proc_apply_task_networkbg(task->bsd_info, THREAD_NULL); +#endif /* MACH_BSD */ - if (task == kernel_task) - panic("Attempting to set task policy on kernel_task"); + /* The timer throttle has been removed or reduced, we need to look for expired timers and fire them */ + if (pend_token->tpt_update_timers) + ml_timer_evaluate(); - int priority = BASEPRI_DEFAULT; - int max_priority = MAXPRI_USER; - if (proc_get_effective_task_policy(task, TASK_POLICY_LOWPRI_CPU)) { - priority = MAXPRI_THROTTLE; - max_priority = MAXPRI_THROTTLE; - } else if (proc_get_effective_task_policy(task, TASK_POLICY_SUPPRESSED_CPU)) { - priority = MAXPRI_SUPPRESSED; - max_priority = MAXPRI_SUPPRESSED; - } else { - switch (proc_get_effective_task_policy(task, TASK_POLICY_ROLE)) { - case TASK_CONTROL_APPLICATION: - priority = BASEPRI_CONTROL; - break; - case TASK_GRAPHICS_SERVER: - priority = BASEPRI_GRAPHICS; - max_priority = MAXPRI_RESERVED; - break; - default: - break; - } + if (pend_token->tpt_update_live_donor) + task_importance_update_live_donor(task); - /* factor in 'nice' value */ - priority += task->importance; +#if CONFIG_SCHED_SFI + /* use the resource coalition for SFI re-evaluation */ + if (pend_token->tpt_update_coal_sfi) + coalition_for_each_task(task->coalition[COALITION_TYPE_RESOURCE], + (void *)task, task_sfi_reevaluate_cb); +#endif /* CONFIG_SCHED_SFI */ +} - if (task->effective_policy.t_qos_clamp != THREAD_QOS_UNSPECIFIED) { - int qos_clamp_priority = thread_qos_policy_params.qos_pri[task->effective_policy.t_qos_clamp]; +/* + * Initiate a task policy state transition + * + * Everything 
that modifies requested except functions that need to hold the task lock + * should use this function + * + * Argument validation should be performed before reaching this point. + * + * TODO: Do we need to check task->active? + */ +void +proc_set_task_policy(task_t task, + int category, + int flavor, + int value) +{ + struct task_pend_token pend_token = {}; - priority = MIN(priority, qos_clamp_priority); - max_priority = MIN(max_priority, qos_clamp_priority); - } - } - - /* avoid extra work if priority isn't changing */ - if (task->priority != priority || task->max_priority != max_priority) { - update_priority = TRUE; - - /* update the scheduling priority for the task */ - task->max_priority = max_priority; - - if (priority > task->max_priority) - priority = task->max_priority; - else if (priority < MINPRI) - priority = MINPRI; - - task->priority = priority; - } - - /* Loop over the threads in the task only once, and only if necessary */ - if (update_threads || update_throttle || update_priority || update_sfi ) { - thread_t thread; - - queue_iterate(&task->threads, thread, thread_t, task_threads) { - if (update_priority) { - thread_mtx_lock(thread); - - thread_task_priority(thread, priority, max_priority); - - thread_mtx_unlock(thread); - } - - if (update_throttle) { - rethrottle_thread(thread->uthread); - } - - if (update_sfi) { - sfi_reevaluate(thread); - } - - if (update_threads) { - thread->requested_policy.bg_iotier = task->effective_policy.bg_iotier; - thread->requested_policy.terminated = task->effective_policy.terminated; - - task_policy_update_internal_locked(task, thread, FALSE, NULL); - /* The thread policy must not emit any completion actions due to this change. 
*/ - } - } - } -} - -#if CONFIG_SCHED_SFI -/* coalition object is locked */ -static void -task_sfi_reevaluate_cb(coalition_t coal, void *ctx, task_t task) -{ - thread_t thread; - - /* unused for now */ - (void)coal; - - /* skip the task we're re-evaluating on behalf of: it's already updated */ - if (task == (task_t)ctx) - return; - - task_lock(task); - - queue_iterate(&task->threads, thread, thread_t, task_threads) { - sfi_reevaluate(thread); - } - - task_unlock(task); -} -#endif /* CONFIG_SCHED_SFI */ - -/* - * Called with task unlocked to do things that can't be done while holding the task lock - */ -void -task_policy_update_complete_unlocked(task_t task, thread_t thread, task_pend_token_t pend_token) -{ - boolean_t on_task = (thread == THREAD_NULL) ? TRUE : FALSE; - -#ifdef MACH_BSD - if (pend_token->tpt_update_sockets) - proc_apply_task_networkbg(task->bsd_info, thread); -#endif /* MACH_BSD */ - - if (on_task) { - /* The timer throttle has been removed or reduced, we need to look for expired timers and fire them */ - if (pend_token->tpt_update_timers) - ml_timer_evaluate(); - - - if (pend_token->tpt_update_live_donor) - task_importance_update_live_donor(task); - -#if CONFIG_SCHED_SFI - /* use the resource coalition for SFI re-evaluation */ - if (pend_token->tpt_update_coal_sfi) - coalition_for_each_task(task->coalition[COALITION_TYPE_RESOURCE], - (void *)task, task_sfi_reevaluate_cb); -#endif /* CONFIG_SCHED_SFI */ - } -} - -/* - * Initiate a task policy state transition - * - * Everything that modifies requested except functions that need to hold the task lock - * should use this function - * - * Argument validation should be performed before reaching this point. - * - * TODO: Do we need to check task->active or thread->active? 
- */ -void -proc_set_task_policy(task_t task, - thread_t thread, - int category, - int flavor, - int value) -{ - struct task_pend_token pend_token = {}; - task_lock(task); KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, - (IMPORTANCE_CODE(flavor, (category | tisthread(thread)))) | DBG_FUNC_START, - targetid(task, thread), trequested_0(task, thread), trequested_1(task, thread), value, 0); + (IMPORTANCE_CODE(flavor, (category | TASK_POLICY_TASK))) | DBG_FUNC_START, + task_pid(task), trequested_0(task), + trequested_1(task), value, 0); - proc_set_task_policy_locked(task, thread, category, flavor, value); + proc_set_task_policy_locked(task, category, flavor, value, 0); - task_policy_update_locked(task, thread, &pend_token); - - task_unlock(task); + task_policy_update_locked(task, &pend_token); - KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, - (IMPORTANCE_CODE(flavor, (category | tisthread(thread)))) | DBG_FUNC_END, - targetid(task, thread), trequested_0(task, thread), trequested_1(task, thread), tpending(&pend_token), 0); - - task_policy_update_complete_unlocked(task, thread, &pend_token); -} - -/* - * Initiate a task policy state transition on a thread with its TID - * Useful if you cannot guarantee the thread won't get terminated - */ -void -proc_set_task_policy_thread(task_t task, - uint64_t tid, - int category, - int flavor, - int value) -{ - thread_t thread; - thread_t self = current_thread(); - struct task_pend_token pend_token = {}; - - task_lock(task); - - if (tid == TID_NULL || tid == self->thread_id) - thread = self; - else - thread = task_findtid(task, tid); - - if (thread == THREAD_NULL) { - task_unlock(task); - return; - } KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, - (IMPORTANCE_CODE(flavor, (category | TASK_POLICY_THREAD))) | DBG_FUNC_START, - targetid(task, thread), trequested_0(task, thread), trequested_1(task, thread), value, 0); - - proc_set_task_policy_locked(task, thread, category, flavor, value); - - task_policy_update_locked(task, thread, &pend_token); + 
(IMPORTANCE_CODE(flavor, (category | TASK_POLICY_TASK))) | DBG_FUNC_END, + task_pid(task), trequested_0(task), + trequested_1(task), tpending(&pend_token), 0); task_unlock(task); - KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, - (IMPORTANCE_CODE(flavor, (category | TASK_POLICY_THREAD))) | DBG_FUNC_END, - targetid(task, thread), trequested_0(task, thread), trequested_1(task, thread), tpending(&pend_token), 0); - - task_policy_update_complete_unlocked(task, thread, &pend_token); + task_policy_update_complete_unlocked(task, &pend_token); } /* @@ -1543,27 +1268,33 @@ proc_set_task_policy_thread(task_t task, * Same locking rules apply. */ void -proc_set_task_policy2(task_t task, thread_t thread, int category, int flavor, int value1, int value2) +proc_set_task_policy2(task_t task, + int category, + int flavor, + int value, + int value2) { struct task_pend_token pend_token = {}; - + task_lock(task); KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, - (IMPORTANCE_CODE(flavor, (category | tisthread(thread)))) | DBG_FUNC_START, - targetid(task, thread), trequested_0(task, thread), trequested_1(task, thread), value1, 0); - - proc_set_task_policy2_locked(task, thread, category, flavor, value1, value2); + (IMPORTANCE_CODE(flavor, (category | TASK_POLICY_TASK))) | DBG_FUNC_START, + task_pid(task), trequested_0(task), + trequested_1(task), value, 0); - task_policy_update_locked(task, thread, &pend_token); + proc_set_task_policy_locked(task, category, flavor, value, value2); - task_unlock(task); + task_policy_update_locked(task, &pend_token); KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, - (IMPORTANCE_CODE(flavor, (category | tisthread(thread)))) | DBG_FUNC_END, - targetid(task, thread), trequested_0(task, thread), trequested_0(task, thread), tpending(&pend_token), 0); + (IMPORTANCE_CODE(flavor, (category | TASK_POLICY_TASK))) | DBG_FUNC_END, + task_pid(task), trequested_0(task), + trequested_1(task), tpending(&pend_token), 0); - task_policy_update_complete_unlocked(task, thread, &pend_token); + 
task_unlock(task); + + task_policy_update_complete_unlocked(task, &pend_token); } /* @@ -1574,247 +1305,168 @@ proc_set_task_policy2(task_t task, thread_t thread, int category, int flavor, in */ static void proc_set_task_policy_locked(task_t task, - thread_t thread, int category, int flavor, - int value) + int value, + int value2) { - boolean_t on_task = (thread == THREAD_NULL) ? TRUE : FALSE; - int tier, passive; - struct task_requested_policy requested = - (on_task) ? task->requested_policy : thread->requested_policy; + struct task_requested_policy requested = task->requested_policy; switch (flavor) { - /* Category: EXTERNAL and INTERNAL, thread and task */ + /* Category: EXTERNAL and INTERNAL */ case TASK_POLICY_DARWIN_BG: if (category == TASK_POLICY_EXTERNAL) - requested.ext_darwinbg = value; + requested.trp_ext_darwinbg = value; else - requested.int_darwinbg = value; + requested.trp_int_darwinbg = value; break; case TASK_POLICY_IOPOL: proc_iopol_to_tier(value, &tier, &passive); if (category == TASK_POLICY_EXTERNAL) { - requested.ext_iotier = tier; - requested.ext_iopassive = passive; + requested.trp_ext_iotier = tier; + requested.trp_ext_iopassive = passive; } else { - requested.int_iotier = tier; - requested.int_iopassive = passive; + requested.trp_int_iotier = tier; + requested.trp_int_iopassive = passive; } break; case TASK_POLICY_IO: if (category == TASK_POLICY_EXTERNAL) - requested.ext_iotier = value; + requested.trp_ext_iotier = value; else - requested.int_iotier = value; + requested.trp_int_iotier = value; break; case TASK_POLICY_PASSIVE_IO: if (category == TASK_POLICY_EXTERNAL) - requested.ext_iopassive = value; + requested.trp_ext_iopassive = value; else - requested.int_iopassive = value; + requested.trp_int_iopassive = value; break; - /* Category: INTERNAL, task only */ + /* Category: INTERNAL */ case TASK_POLICY_DARWIN_BG_IOPOL: - assert(on_task && category == TASK_POLICY_INTERNAL); + assert(category == TASK_POLICY_INTERNAL); 
proc_iopol_to_tier(value, &tier, &passive); - requested.bg_iotier = tier; + requested.trp_bg_iotier = tier; break; - /* Category: ATTRIBUTE, task only */ + /* Category: ATTRIBUTE */ case TASK_POLICY_TAL: - assert(on_task && category == TASK_POLICY_ATTRIBUTE); - requested.t_tal_enabled = value; + assert(category == TASK_POLICY_ATTRIBUTE); + requested.trp_tal_enabled = value; break; case TASK_POLICY_BOOST: - assert(on_task && category == TASK_POLICY_ATTRIBUTE); - requested.t_boosted = value; + assert(category == TASK_POLICY_ATTRIBUTE); + requested.trp_boosted = value; break; case TASK_POLICY_ROLE: - assert(on_task && category == TASK_POLICY_ATTRIBUTE); - requested.t_role = value; + assert(category == TASK_POLICY_ATTRIBUTE); + requested.trp_role = value; break; case TASK_POLICY_TERMINATED: - assert(on_task && category == TASK_POLICY_ATTRIBUTE); - requested.terminated = value; + assert(category == TASK_POLICY_ATTRIBUTE); + requested.trp_terminated = value; break; - case TASK_BASE_LATENCY_QOS_POLICY: - assert(on_task && category == TASK_POLICY_ATTRIBUTE); - requested.t_base_latency_qos = value; - break; - case TASK_BASE_THROUGHPUT_QOS_POLICY: - assert(on_task && category == TASK_POLICY_ATTRIBUTE); - requested.t_base_through_qos = value; - break; - case TASK_POLICY_SFI_MANAGED: - assert(on_task && category == TASK_POLICY_ATTRIBUTE); - requested.t_sfi_managed = value; - break; - - /* Category: ATTRIBUTE, thread only */ - case TASK_POLICY_PIDBIND_BG: - assert(!on_task && category == TASK_POLICY_ATTRIBUTE); - requested.th_pidbind_bg = value; - break; - - case TASK_POLICY_WORKQ_BG: - assert(!on_task && category == TASK_POLICY_ATTRIBUTE); - requested.th_workq_bg = value; - break; - - case TASK_POLICY_QOS: - assert(!on_task && category == TASK_POLICY_ATTRIBUTE); - requested.thrp_qos = value; + case TASK_BASE_LATENCY_QOS_POLICY: + assert(category == TASK_POLICY_ATTRIBUTE); + requested.trp_base_latency_qos = value; break; - case TASK_POLICY_QOS_OVERRIDE: - assert(!on_task && 
category == TASK_POLICY_ATTRIBUTE); - requested.thrp_qos_override = value; + case TASK_BASE_THROUGHPUT_QOS_POLICY: + assert(category == TASK_POLICY_ATTRIBUTE); + requested.trp_base_through_qos = value; break; - default: - panic("unknown task policy: %d %d %d", category, flavor, value); + case TASK_POLICY_SFI_MANAGED: + assert(category == TASK_POLICY_ATTRIBUTE); + requested.trp_sfi_managed = value; break; - } - - if (on_task) - task->requested_policy = requested; - else - thread->requested_policy = requested; -} - -/* - * Variant of proc_set_task_policy_locked() that sets two scalars in the requested policy structure. - */ -static void -proc_set_task_policy2_locked(task_t task, - thread_t thread, - int category, - int flavor, - int value1, - int value2) -{ - boolean_t on_task = (thread == THREAD_NULL) ? TRUE : FALSE; - - struct task_requested_policy requested = - (on_task) ? task->requested_policy : thread->requested_policy; - - switch (flavor) { - - /* Category: ATTRIBUTE, task only */ case TASK_POLICY_BASE_LATENCY_AND_THROUGHPUT_QOS: - assert(on_task && category == TASK_POLICY_ATTRIBUTE); - requested.t_base_latency_qos = value1; - requested.t_base_through_qos = value2; + assert(category == TASK_POLICY_ATTRIBUTE); + requested.trp_base_latency_qos = value; + requested.trp_base_through_qos = value2; break; case TASK_POLICY_OVERRIDE_LATENCY_AND_THROUGHPUT_QOS: - assert(on_task && category == TASK_POLICY_ATTRIBUTE); - requested.t_over_latency_qos = value1; - requested.t_over_through_qos = value2; - break; - - /* Category: ATTRIBUTE, thread only */ - - case TASK_POLICY_QOS_AND_RELPRIO: - - assert(!on_task && category == TASK_POLICY_ATTRIBUTE); - requested.thrp_qos = value1; - requested.thrp_qos_relprio = value2; - DTRACE_BOOST3(qos_set, uint64_t, thread->thread_id, int, requested.thrp_qos, int, requested.thrp_qos_relprio); + assert(category == TASK_POLICY_ATTRIBUTE); + requested.trp_over_latency_qos = value; + requested.trp_over_through_qos = value2; break; default: - 
panic("unknown task policy: %d %d %d %d", category, flavor, value1, value2); + panic("unknown task policy: %d %d %d %d", category, flavor, value, value2); break; } - if (on_task) - task->requested_policy = requested; - else - thread->requested_policy = requested; + task->requested_policy = requested; } - /* * Gets what you set. Effective values may be different. */ int proc_get_task_policy(task_t task, - thread_t thread, int category, int flavor) { - boolean_t on_task = (thread == THREAD_NULL) ? TRUE : FALSE; - int value = 0; task_lock(task); - struct task_requested_policy requested = - (on_task) ? task->requested_policy : thread->requested_policy; + struct task_requested_policy requested = task->requested_policy; switch (flavor) { case TASK_POLICY_DARWIN_BG: if (category == TASK_POLICY_EXTERNAL) - value = requested.ext_darwinbg; + value = requested.trp_ext_darwinbg; else - value = requested.int_darwinbg; + value = requested.trp_int_darwinbg; break; case TASK_POLICY_IOPOL: if (category == TASK_POLICY_EXTERNAL) - value = proc_tier_to_iopol(requested.ext_iotier, - requested.ext_iopassive); + value = proc_tier_to_iopol(requested.trp_ext_iotier, + requested.trp_ext_iopassive); else - value = proc_tier_to_iopol(requested.int_iotier, - requested.int_iopassive); + value = proc_tier_to_iopol(requested.trp_int_iotier, + requested.trp_int_iopassive); break; case TASK_POLICY_IO: if (category == TASK_POLICY_EXTERNAL) - value = requested.ext_iotier; + value = requested.trp_ext_iotier; else - value = requested.int_iotier; + value = requested.trp_int_iotier; break; case TASK_POLICY_PASSIVE_IO: if (category == TASK_POLICY_EXTERNAL) - value = requested.ext_iopassive; + value = requested.trp_ext_iopassive; else - value = requested.int_iopassive; + value = requested.trp_int_iopassive; break; case TASK_POLICY_DARWIN_BG_IOPOL: - assert(on_task && category == TASK_POLICY_ATTRIBUTE); - value = proc_tier_to_iopol(requested.bg_iotier, 0); + assert(category == TASK_POLICY_ATTRIBUTE); + 
value = proc_tier_to_iopol(requested.trp_bg_iotier, 0); break; case TASK_POLICY_ROLE: - assert(on_task && category == TASK_POLICY_ATTRIBUTE); - value = requested.t_role; + assert(category == TASK_POLICY_ATTRIBUTE); + value = requested.trp_role; break; case TASK_POLICY_SFI_MANAGED: - assert(on_task && category == TASK_POLICY_ATTRIBUTE); - value = requested.t_sfi_managed; - break; - case TASK_POLICY_QOS: - assert(!on_task && category == TASK_POLICY_ATTRIBUTE); - value = requested.thrp_qos; - break; - case TASK_POLICY_QOS_OVERRIDE: - assert(!on_task && category == TASK_POLICY_ATTRIBUTE); - value = requested.thrp_qos_override; + assert(category == TASK_POLICY_ATTRIBUTE); + value = requested.trp_sfi_managed; break; default: panic("unknown policy_flavor %d", flavor); @@ -1830,34 +1482,27 @@ proc_get_task_policy(task_t task, * Variant of proc_get_task_policy() that returns two scalar outputs. */ void -proc_get_task_policy2(task_t task, thread_t thread, int category __unused, int flavor, int *value1, int *value2) +proc_get_task_policy2(task_t task, + __assert_only int category, + int flavor, + int *value1, + int *value2) { - boolean_t on_task = (thread == THREAD_NULL) ? TRUE : FALSE; - task_lock(task); - struct task_requested_policy requested = - (on_task) ? 
task->requested_policy : thread->requested_policy; + struct task_requested_policy requested = task->requested_policy; switch (flavor) { - /* TASK attributes */ case TASK_POLICY_BASE_LATENCY_AND_THROUGHPUT_QOS: - assert(on_task && category == TASK_POLICY_ATTRIBUTE); - *value1 = requested.t_base_latency_qos; - *value2 = requested.t_base_through_qos; + assert(category == TASK_POLICY_ATTRIBUTE); + *value1 = requested.trp_base_latency_qos; + *value2 = requested.trp_base_through_qos; break; case TASK_POLICY_OVERRIDE_LATENCY_AND_THROUGHPUT_QOS: - assert(on_task && category == TASK_POLICY_ATTRIBUTE); - *value1 = requested.t_over_latency_qos; - *value2 = requested.t_over_through_qos; - break; - - /* THREAD attributes */ - case TASK_POLICY_QOS_AND_RELPRIO: - assert(!on_task && category == TASK_POLICY_ATTRIBUTE); - *value1 = requested.thrp_qos; - *value2 = requested.thrp_qos_relprio; + assert(category == TASK_POLICY_ATTRIBUTE); + *value1 = requested.trp_over_latency_qos; + *value2 = requested.trp_over_through_qos; break; default: @@ -1868,91 +1513,34 @@ proc_get_task_policy2(task_t task, thread_t thread, int category __unused, int f task_unlock(task); } - -/* - * Functions for querying effective state for relevant subsystems - * ONLY the relevant subsystem should query these. - * NEVER take a value from one of the 'effective' functions and stuff it into a setter. - */ - -int -proc_get_effective_task_policy(task_t task, int flavor) -{ - return proc_get_effective_policy(task, THREAD_NULL, flavor); -} - -int -proc_get_effective_thread_policy(thread_t thread, int flavor) -{ - return proc_get_effective_policy(thread->task, thread, flavor); -} - /* + * Function for querying effective state for relevant subsystems * Gets what is actually in effect, for subsystems which pull policy instead of receive updates. * + * ONLY the relevant subsystem should query this. + * NEVER take a value from the 'effective' function and stuff it into a setter. 
+ * * NOTE: This accessor does not take the task lock. * Notifications of state updates need to be externally synchronized with state queries. * This routine *MUST* remain interrupt safe, as it is potentially invoked * within the context of a timer interrupt. It is also called in KDP context for stackshot. */ -static int -proc_get_effective_policy(task_t task, - thread_t thread, - int flavor) +int +proc_get_effective_task_policy(task_t task, + int flavor) { - boolean_t on_task = (thread == THREAD_NULL) ? TRUE : FALSE; int value = 0; switch (flavor) { case TASK_POLICY_DARWIN_BG: /* * This backs the KPI call proc_pidbackgrounded to find - * out if a pid is backgrounded, - * as well as proc_get_effective_thread_policy. - * Its main use is within the timer layer, as well as + * out if a pid is backgrounded. + * It is used to communicate state to the VM system, as well as * prioritizing requests to the graphics system. * Returns 1 for background mode, 0 for normal mode */ - if (on_task) - value = task->effective_policy.darwinbg; - else - value = (task->effective_policy.darwinbg || - thread->effective_policy.darwinbg) ? 1 : 0; - break; - case TASK_POLICY_IO: - /* - * The I/O system calls here to find out what throttling tier to apply to an operation. - * Returns THROTTLE_LEVEL_* values. Some userspace spinlock operations can apply - * a temporary iotier override to make the I/O more aggressive to get the lock - * owner to release the spinlock. - */ - if (on_task) - value = task->effective_policy.io_tier; - else { - value = MAX(task->effective_policy.io_tier, - thread->effective_policy.io_tier); - if (thread->iotier_override != THROTTLE_LEVEL_NONE) - value = MIN(value, thread->iotier_override); - } - break; - case TASK_POLICY_PASSIVE_IO: - /* - * The I/O system calls here to find out whether an operation should be passive. - * (i.e. not cause operations with lower throttle tiers to be throttled) - * Returns 1 for passive mode, 0 for normal mode. 
- * If a userspace spinlock has applied an override, that I/O should always - * be passive to avoid self-throttling when the override is removed and lower - * iotier I/Os are issued. - */ - if (on_task) - value = task->effective_policy.io_passive; - else { - int io_tier = MAX(task->effective_policy.io_tier, thread->effective_policy.io_tier); - boolean_t override_in_effect = (thread->iotier_override != THROTTLE_LEVEL_NONE) && (thread->iotier_override < io_tier); - - value = (task->effective_policy.io_passive || - thread->effective_policy.io_passive || override_in_effect) ? 1 : 0; - } + value = task->effective_policy.tep_darwinbg; break; case TASK_POLICY_ALL_SOCKETS_BG: /* @@ -1962,73 +1550,41 @@ proc_get_effective_policy(task_t task, * This consults both thread and task so un-DBGing a thread while the task is BG * doesn't get you out of the network throttle. */ - if (on_task) - value = task->effective_policy.all_sockets_bg; - else - value = (task->effective_policy.all_sockets_bg || - thread->effective_policy.all_sockets_bg) ? 1 : 0; - break; - case TASK_POLICY_NEW_SOCKETS_BG: - /* - * socreate() calls this to determine if it should mark a new socket as background - * Returns 1 for background mode, 0 for normal mode - */ - if (on_task) - value = task->effective_policy.new_sockets_bg; - else - value = (task->effective_policy.new_sockets_bg || - thread->effective_policy.new_sockets_bg) ? 1 : 0; - break; - case TASK_POLICY_LOWPRI_CPU: - /* - * Returns 1 for low priority cpu mode, 0 for normal mode - */ - if (on_task) - value = task->effective_policy.lowpri_cpu; - else - value = (task->effective_policy.lowpri_cpu || - thread->effective_policy.lowpri_cpu) ? 
1 : 0; - break; - case TASK_POLICY_SUPPRESSED_CPU: - /* - * Returns 1 for suppressed cpu mode, 0 for normal mode - */ - assert(on_task); - value = task->effective_policy.t_suppressed_cpu; + value = task->effective_policy.tep_all_sockets_bg; break; case TASK_POLICY_LATENCY_QOS: /* * timer arming calls into here to find out the timer coalescing level * Returns a QoS tier (0-6) */ - if (on_task) { - value = task->effective_policy.t_latency_qos; - } else { - value = MAX(task->effective_policy.t_latency_qos, thread->effective_policy.t_latency_qos); - } + value = task->effective_policy.tep_latency_qos; break; case TASK_POLICY_THROUGH_QOS: /* + * This value is passed into the urgency callout from the scheduler + * to the performance management subsystem. * Returns a QoS tier (0-6) */ - assert(on_task); - value = task->effective_policy.t_through_qos; + value = task->effective_policy.tep_through_qos; break; case TASK_POLICY_ROLE: - assert(on_task); - value = task->effective_policy.t_role; + /* + * This controls various things that ask whether a process is foreground, + * like SFI, VM, access to GPU, etc + */ + value = task->effective_policy.tep_role; break; case TASK_POLICY_WATCHERS_BG: - assert(on_task); - value = task->effective_policy.t_watchers_bg; + /* + * This controls whether or not a thread watching this process should be BG. + */ + value = task->effective_policy.tep_watchers_bg; break; case TASK_POLICY_SFI_MANAGED: - assert(on_task); - value = task->effective_policy.t_sfi_managed; - break; - case TASK_POLICY_QOS: - assert(!on_task); - value = thread->effective_policy.thep_qos; + /* + * This controls whether or not a process is targeted for specific control by thermald. + */ + value = task->effective_policy.tep_sfi_managed; break; default: panic("unknown policy_flavor %d", flavor); @@ -2045,7 +1601,7 @@ proc_get_effective_policy(task_t task, * Note that it is possible to support e.g. 
IOPOL_PASSIVE_STANDARD in the future */ -static void +void proc_iopol_to_tier(int iopolicy, int *tier, int *passive) { *passive = 0; @@ -2069,532 +1625,93 @@ proc_iopol_to_tier(int iopolicy, int *tier, int *passive) break; default: panic("unknown I/O policy %d", iopolicy); - break; - } -} - -static int -proc_tier_to_iopol(int tier, int passive) -{ - if (passive == 1) { - switch (tier) { - case THROTTLE_LEVEL_TIER0: - return IOPOL_PASSIVE; - break; - default: - panic("unknown passive tier %d", tier); - return IOPOL_DEFAULT; - break; - } - } else { - switch (tier) { - case THROTTLE_LEVEL_NONE: - case THROTTLE_LEVEL_TIER0: - return IOPOL_DEFAULT; - break; - case THROTTLE_LEVEL_TIER1: - return IOPOL_STANDARD; - break; - case THROTTLE_LEVEL_TIER2: - return IOPOL_UTILITY; - break; - case THROTTLE_LEVEL_TIER3: - return IOPOL_THROTTLE; - break; - default: - panic("unknown tier %d", tier); - return IOPOL_DEFAULT; - break; - } - } -} - -int -proc_darwin_role_to_task_role(int darwin_role, int* task_role) -{ - integer_t role = TASK_UNSPECIFIED; - - switch (darwin_role) { - case PRIO_DARWIN_ROLE_DEFAULT: - role = TASK_UNSPECIFIED; - break; - case PRIO_DARWIN_ROLE_UI_FOCAL: - role = TASK_FOREGROUND_APPLICATION; - break; - case PRIO_DARWIN_ROLE_UI: - role = TASK_DEFAULT_APPLICATION; - break; - case PRIO_DARWIN_ROLE_NON_UI: - role = TASK_NONUI_APPLICATION; - break; - case PRIO_DARWIN_ROLE_UI_NON_FOCAL: - role = TASK_BACKGROUND_APPLICATION; - break; - case PRIO_DARWIN_ROLE_TAL_LAUNCH: - role = TASK_THROTTLE_APPLICATION; - break; - default: - return EINVAL; - } - - *task_role = role; - - return 0; -} - -int -proc_task_role_to_darwin_role(int task_role) -{ - switch (task_role) { - case TASK_FOREGROUND_APPLICATION: - return PRIO_DARWIN_ROLE_UI_FOCAL; - case TASK_BACKGROUND_APPLICATION: - return PRIO_DARWIN_ROLE_UI; - case TASK_NONUI_APPLICATION: - return PRIO_DARWIN_ROLE_NON_UI; - case TASK_DEFAULT_APPLICATION: - return PRIO_DARWIN_ROLE_UI_NON_FOCAL; - case TASK_THROTTLE_APPLICATION: 
- return PRIO_DARWIN_ROLE_TAL_LAUNCH; - case TASK_UNSPECIFIED: - default: - return PRIO_DARWIN_ROLE_DEFAULT; - } -} - - -/* apply internal backgrounding for workqueue threads */ -int -proc_apply_workq_bgthreadpolicy(thread_t thread) -{ - if (thread == THREAD_NULL) - return ESRCH; - - proc_set_task_policy(thread->task, thread, TASK_POLICY_ATTRIBUTE, - TASK_POLICY_WORKQ_BG, TASK_POLICY_ENABLE); - - return(0); -} - -/* - * remove internal backgrounding for workqueue threads - * does NOT go find sockets created while BG and unbackground them - */ -int -proc_restore_workq_bgthreadpolicy(thread_t thread) -{ - if (thread == THREAD_NULL) - return ESRCH; - - proc_set_task_policy(thread->task, thread, TASK_POLICY_ATTRIBUTE, - TASK_POLICY_WORKQ_BG, TASK_POLICY_DISABLE); - - return(0); -} - -/* here for temporary compatibility */ -int -proc_setthread_saved_importance(__unused thread_t thread, __unused int importance) -{ - return(0); -} - -/* - * Set an override on the thread which is consulted with a - * higher priority than the task/thread policy. This should - * only be set for temporary grants until the thread - * returns to the userspace boundary - * - * We use atomic operations to swap in the override, with - * the assumption that the thread itself can - * read the override and clear it on return to userspace. - * - * No locking is performed, since it is acceptable to see - * a stale override for one loop through throttle_lowpri_io(). - * However a thread reference must be held on the thread. 
- */ - -void set_thread_iotier_override(thread_t thread, int policy) -{ - int current_override; - - /* Let most aggressive I/O policy win until user boundary */ - do { - current_override = thread->iotier_override; - - if (current_override != THROTTLE_LEVEL_NONE) - policy = MIN(current_override, policy); - - if (current_override == policy) { - /* no effective change */ - return; - } - } while (!OSCompareAndSwap(current_override, policy, &thread->iotier_override)); - - /* - * Since the thread may be currently throttled, - * re-evaluate tiers and potentially break out - * of an msleep - */ - rethrottle_thread(thread->uthread); -} - -/* - * Userspace synchronization routines (like pthread mutexes, pthread reader-writer locks, - * semaphores, dispatch_sync) may result in priority inversions where a higher priority - * (i.e. scheduler priority, I/O tier, QoS tier) is waiting on a resource owned by a lower - * priority thread. In these cases, we attempt to propagate the priority token, as long - * as the subsystem informs us of the relationships between the threads. The userspace - * synchronization subsystem should maintain the information of owner->resource and - * resource->waiters itself. - */ - -/* - * This helper canonicalizes the resource/resource_type given the current qos_override_mode - * in effect. Note that wildcards (THREAD_QOS_OVERRIDE_RESOURCE_WILDCARD) may need - * to be handled specially in the future, but for now it's fine to slam - * *resource to USER_ADDR_NULL even if it was previously a wildcard. 
- */ -static void _canonicalize_resource_and_type(user_addr_t *resource, int *resource_type) { - if (qos_override_mode == QOS_OVERRIDE_MODE_OVERHANG_PEAK || qos_override_mode == QOS_OVERRIDE_MODE_IGNORE_OVERRIDE) { - /* Map all input resource/type to a single one */ - *resource = USER_ADDR_NULL; - *resource_type = THREAD_QOS_OVERRIDE_TYPE_UNKNOWN; - } else if (qos_override_mode == QOS_OVERRIDE_MODE_FINE_GRAINED_OVERRIDE) { - /* no transform */ - } else if (qos_override_mode == QOS_OVERRIDE_MODE_FINE_GRAINED_OVERRIDE_BUT_IGNORE_DISPATCH) { - /* Map all dispatch overrides to a single one, to avoid memory overhead */ - if (*resource_type == THREAD_QOS_OVERRIDE_TYPE_DISPATCH_ASYNCHRONOUS_OVERRIDE) { - *resource = USER_ADDR_NULL; - } - } else if (qos_override_mode == QOS_OVERRIDE_MODE_FINE_GRAINED_OVERRIDE_BUT_SINGLE_MUTEX_OVERRIDE) { - /* Map all mutex overrides to a single one, to avoid memory overhead */ - if (*resource_type == THREAD_QOS_OVERRIDE_TYPE_PTHREAD_MUTEX) { - *resource = USER_ADDR_NULL; - } - } -} - -/* This helper routine finds an existing override if known. 
Locking should be done by caller */ -static struct thread_qos_override *_find_qos_override(thread_t thread, user_addr_t resource, int resource_type) { - struct thread_qos_override *override; - - override = thread->overrides; - while (override) { - if (override->override_resource == resource && - override->override_resource_type == resource_type) { - return override; - } - - override = override->override_next; - } - - return NULL; -} - -static void _find_and_decrement_qos_override(thread_t thread, user_addr_t resource, int resource_type, boolean_t reset, struct thread_qos_override **free_override_list) { - struct thread_qos_override *override, *override_prev; - - override_prev = NULL; - override = thread->overrides; - while (override) { - struct thread_qos_override *override_next = override->override_next; - - if ((THREAD_QOS_OVERRIDE_RESOURCE_WILDCARD == resource || override->override_resource == resource) && - override->override_resource_type == resource_type) { - if (reset) { - override->override_contended_resource_count = 0; - } else { - override->override_contended_resource_count--; - } - - if (override->override_contended_resource_count == 0) { - if (override_prev == NULL) { - thread->overrides = override_next; - } else { - override_prev->override_next = override_next; - } - - /* Add to out-param for later zfree */ - override->override_next = *free_override_list; - *free_override_list = override; - } else { - override_prev = override; - } - - if (THREAD_QOS_OVERRIDE_RESOURCE_WILDCARD != resource) { - return; - } - } else { - override_prev = override; - } - - override = override_next; - } -} - -/* This helper recalculates the current requested override using the policy selected at boot */ -static int _calculate_requested_qos_override(thread_t thread) -{ - if (qos_override_mode == QOS_OVERRIDE_MODE_IGNORE_OVERRIDE) { - return THREAD_QOS_UNSPECIFIED; - } - - /* iterate over all overrides and calculate MAX */ - struct thread_qos_override *override; - int 
qos_override = THREAD_QOS_UNSPECIFIED; - - override = thread->overrides; - while (override) { - if (qos_override_mode != QOS_OVERRIDE_MODE_FINE_GRAINED_OVERRIDE_BUT_IGNORE_DISPATCH || - override->override_resource_type != THREAD_QOS_OVERRIDE_TYPE_DISPATCH_ASYNCHRONOUS_OVERRIDE) { - qos_override = MAX(qos_override, override->override_qos); - } - - override = override->override_next; - } - - return qos_override; -} - -boolean_t proc_thread_qos_add_override(task_t task, thread_t thread, uint64_t tid, int override_qos, boolean_t first_override_for_resource, user_addr_t resource, int resource_type) -{ - thread_t self = current_thread(); - struct task_pend_token pend_token = {}; - - /* XXX move to thread mutex when thread policy does */ - task_lock(task); - - /* - * If thread is passed, it is assumed to be most accurate, since the caller must have an explicit (or implicit) reference - * to the thread - */ - - if (thread != THREAD_NULL) { - assert(task == thread->task); - } else { - if (tid == self->thread_id) { - thread = self; - } else { - thread = task_findtid(task, tid); - - if (thread == THREAD_NULL) { - KERNEL_DEBUG_CONSTANT((IMPORTANCE_CODE(IMP_USYNCH_QOS_OVERRIDE, IMP_USYNCH_ADD_OVERRIDE)) | DBG_FUNC_NONE, - tid, 0, 0xdead, 0, 0); - task_unlock(task); - return FALSE; - } - } - } - - KERNEL_DEBUG_CONSTANT((IMPORTANCE_CODE(IMP_USYNCH_QOS_OVERRIDE, IMP_USYNCH_ADD_OVERRIDE)) | DBG_FUNC_START, - thread_tid(thread), override_qos, first_override_for_resource ? 
1 : 0, 0, 0); - - DTRACE_BOOST5(qos_add_override_pre, uint64_t, tid, uint64_t, thread->requested_policy.thrp_qos, - uint64_t, thread->effective_policy.thep_qos, int, override_qos, boolean_t, first_override_for_resource); - - struct task_requested_policy requested = thread->requested_policy; - struct thread_qos_override *override; - struct thread_qos_override *deferred_free_override = NULL; - int new_qos_override, prev_qos_override; - int new_effective_qos; - boolean_t has_thread_reference = FALSE; - - _canonicalize_resource_and_type(&resource, &resource_type); - - if (first_override_for_resource) { - override = _find_qos_override(thread, resource, resource_type); - if (override) { - override->override_contended_resource_count++; - } else { - struct thread_qos_override *override_new; - - /* We need to allocate a new object. Drop the task lock and recheck afterwards in case someone else added the override */ - thread_reference(thread); - has_thread_reference = TRUE; - task_unlock(task); - override_new = zalloc(thread_qos_override_zone); - task_lock(task); - - override = _find_qos_override(thread, resource, resource_type); - if (override) { - /* Someone else already allocated while the task lock was dropped */ - deferred_free_override = override_new; - override->override_contended_resource_count++; - } else { - override = override_new; - override->override_next = thread->overrides; - override->override_contended_resource_count = 1 /* since first_override_for_resource was TRUE */; - override->override_resource = resource; - override->override_resource_type = resource_type; - override->override_qos = THREAD_QOS_UNSPECIFIED; - thread->overrides = override; - } - } - } else { - override = _find_qos_override(thread, resource, resource_type); - } - - if (override) { - if (override->override_qos == THREAD_QOS_UNSPECIFIED) - override->override_qos = override_qos; - else - override->override_qos = MAX(override->override_qos, override_qos); - } - - /* Determine how to combine 
the various overrides into a single current requested override */ - prev_qos_override = requested.thrp_qos_override; - new_qos_override = _calculate_requested_qos_override(thread); - - if (new_qos_override != prev_qos_override) { - requested.thrp_qos_override = new_qos_override; - - thread->requested_policy = requested; - - task_policy_update_locked(task, thread, &pend_token); - - if (!has_thread_reference) { - thread_reference(thread); - } - - task_unlock(task); - - task_policy_update_complete_unlocked(task, thread, &pend_token); - - new_effective_qos = thread->effective_policy.thep_qos; - - thread_deallocate(thread); - } else { - new_effective_qos = thread->effective_policy.thep_qos; - - task_unlock(task); - - if (has_thread_reference) { - thread_deallocate(thread); - } - } - - if (deferred_free_override) { - zfree(thread_qos_override_zone, deferred_free_override); - } - - DTRACE_BOOST3(qos_add_override_post, int, prev_qos_override, int, new_qos_override, - int, new_effective_qos); - - KERNEL_DEBUG_CONSTANT((IMPORTANCE_CODE(IMP_USYNCH_QOS_OVERRIDE, IMP_USYNCH_ADD_OVERRIDE)) | DBG_FUNC_END, - new_qos_override, resource, resource_type, 0, 0); - - return TRUE; + break; + } } - -static boolean_t _proc_thread_qos_remove_override_internal(task_t task, thread_t thread, uint64_t tid, user_addr_t resource, int resource_type, boolean_t reset) +int +proc_tier_to_iopol(int tier, int passive) { - thread_t self = current_thread(); - struct task_pend_token pend_token = {}; - - /* XXX move to thread mutex when thread policy does */ - task_lock(task); - - /* - * If thread is passed, it is assumed to be most accurate, since the caller must have an explicit (or implicit) reference - * to the thread - */ - if (thread != THREAD_NULL) { - assert(task == thread->task); - } else { - if (tid == self->thread_id) { - thread = self; - } else { - thread = task_findtid(task, tid); - - if (thread == THREAD_NULL) { - KERNEL_DEBUG_CONSTANT((IMPORTANCE_CODE(IMP_USYNCH_QOS_OVERRIDE, 
IMP_USYNCH_REMOVE_OVERRIDE)) | DBG_FUNC_NONE, - tid, 0, 0xdead, 0, 0); - task_unlock(task); - return FALSE; - } + if (passive == 1) { + switch (tier) { + case THROTTLE_LEVEL_TIER0: + return IOPOL_PASSIVE; + default: + panic("unknown passive tier %d", tier); + return IOPOL_DEFAULT; } - } - - struct task_requested_policy requested = thread->requested_policy; - struct thread_qos_override *deferred_free_override_list = NULL; - int new_qos_override, prev_qos_override; - - _canonicalize_resource_and_type(&resource, &resource_type); - - _find_and_decrement_qos_override(thread, resource, resource_type, reset, &deferred_free_override_list); - - KERNEL_DEBUG_CONSTANT((IMPORTANCE_CODE(IMP_USYNCH_QOS_OVERRIDE, IMP_USYNCH_REMOVE_OVERRIDE)) | DBG_FUNC_START, - thread_tid(thread), resource, reset, 0, 0); - - /* Determine how to combine the various overrides into a single current requested override */ - prev_qos_override = requested.thrp_qos_override; - new_qos_override = _calculate_requested_qos_override(thread); - - if (new_qos_override != prev_qos_override) { - requested.thrp_qos_override = new_qos_override; - - thread->requested_policy = requested; - - task_policy_update_locked(task, thread, &pend_token); - - thread_reference(thread); - - task_unlock(task); - - task_policy_update_complete_unlocked(task, thread, &pend_token); - - thread_deallocate(thread); } else { - task_unlock(task); - } - - while (deferred_free_override_list) { - struct thread_qos_override *override_next = deferred_free_override_list->override_next; - - zfree(thread_qos_override_zone, deferred_free_override_list); - deferred_free_override_list = override_next; + switch (tier) { + case THROTTLE_LEVEL_NONE: + case THROTTLE_LEVEL_TIER0: + return IOPOL_DEFAULT; + case THROTTLE_LEVEL_TIER1: + return IOPOL_STANDARD; + case THROTTLE_LEVEL_TIER2: + return IOPOL_UTILITY; + case THROTTLE_LEVEL_TIER3: + return IOPOL_THROTTLE; + default: + panic("unknown tier %d", tier); + return IOPOL_DEFAULT; + } } - - 
KERNEL_DEBUG_CONSTANT((IMPORTANCE_CODE(IMP_USYNCH_QOS_OVERRIDE, IMP_USYNCH_REMOVE_OVERRIDE)) | DBG_FUNC_END, - 0, 0, 0, 0, 0); - - return TRUE; } -boolean_t proc_thread_qos_remove_override(task_t task, thread_t thread, uint64_t tid, user_addr_t resource, int resource_type) +int +proc_darwin_role_to_task_role(int darwin_role, int* task_role) { - return _proc_thread_qos_remove_override_internal(task, thread, tid, resource, resource_type, FALSE); + integer_t role = TASK_UNSPECIFIED; -} + switch (darwin_role) { + case PRIO_DARWIN_ROLE_DEFAULT: + role = TASK_UNSPECIFIED; + break; + case PRIO_DARWIN_ROLE_UI_FOCAL: + role = TASK_FOREGROUND_APPLICATION; + break; + case PRIO_DARWIN_ROLE_UI: + role = TASK_DEFAULT_APPLICATION; + break; + case PRIO_DARWIN_ROLE_NON_UI: + role = TASK_NONUI_APPLICATION; + break; + case PRIO_DARWIN_ROLE_UI_NON_FOCAL: + role = TASK_BACKGROUND_APPLICATION; + break; + case PRIO_DARWIN_ROLE_TAL_LAUNCH: + role = TASK_THROTTLE_APPLICATION; + break; + default: + return EINVAL; + } -boolean_t proc_thread_qos_reset_override(task_t task, thread_t thread, uint64_t tid, user_addr_t resource, int resource_type) -{ - return _proc_thread_qos_remove_override_internal(task, thread, tid, resource, resource_type, TRUE); + *task_role = role; + + return 0; } -/* Deallocate before thread termination */ -void proc_thread_qos_deallocate(thread_t thread) +int +proc_task_role_to_darwin_role(int task_role) { - task_t task = thread->task; - struct thread_qos_override *override; - - /* XXX move to thread mutex when thread policy does */ - task_lock(task); - override = thread->overrides; - thread->overrides = NULL; /* task policy re-evaluation needed? 
*/ - thread->requested_policy.thrp_qos_override = THREAD_QOS_UNSPECIFIED; - task_unlock(task); - - while (override) { - struct thread_qos_override *override_next = override->override_next; - - zfree(thread_qos_override_zone, override); - override = override_next; + switch (task_role) { + case TASK_FOREGROUND_APPLICATION: + return PRIO_DARWIN_ROLE_UI_FOCAL; + case TASK_BACKGROUND_APPLICATION: + return PRIO_DARWIN_ROLE_UI_NON_FOCAL; + case TASK_NONUI_APPLICATION: + return PRIO_DARWIN_ROLE_NON_UI; + case TASK_DEFAULT_APPLICATION: + return PRIO_DARWIN_ROLE_UI; + case TASK_THROTTLE_APPLICATION: + return PRIO_DARWIN_ROLE_TAL_LAUNCH; + case TASK_UNSPECIFIED: + default: + return PRIO_DARWIN_ROLE_DEFAULT; } } + /* TODO: remove this variable when interactive daemon audit period is over */ extern boolean_t ipc_importance_interactive_receiver; @@ -2611,7 +1728,7 @@ proc_set_task_spawnpolicy(task_t task, int apptype, int qos_clamp, int role, KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (IMPORTANCE_CODE(IMP_TASK_APPTYPE, apptype)) | DBG_FUNC_START, - task_pid(task), trequested_0(task, THREAD_NULL), trequested_1(task, THREAD_NULL), + task_pid(task), trequested_0(task), trequested_1(task), apptype, 0); switch (apptype) { @@ -2687,62 +1804,49 @@ proc_set_task_spawnpolicy(task_t task, int apptype, int qos_clamp, int role, if (apptype == TASK_APPTYPE_APP_TAL) { /* TAL starts off enabled by default */ - task->requested_policy.t_tal_enabled = 1; + task->requested_policy.trp_tal_enabled = 1; } if (apptype != TASK_APPTYPE_NONE) { - task->requested_policy.t_apptype = apptype; + task->requested_policy.trp_apptype = apptype; } if (role != TASK_UNSPECIFIED) { - task->requested_policy.t_role = role; + task->requested_policy.trp_role = role; } if (qos_clamp != THREAD_QOS_UNSPECIFIED) { - task->requested_policy.t_qos_clamp = qos_clamp; + task->requested_policy.trp_qos_clamp = qos_clamp; } - task_policy_update_locked(task, THREAD_NULL, &pend_token); + task_policy_update_locked(task, &pend_token); 
task_unlock(task); /* Ensure the donor bit is updated to be in sync with the new live donor status */ pend_token.tpt_update_live_donor = 1; - task_policy_update_complete_unlocked(task, THREAD_NULL, &pend_token); + task_policy_update_complete_unlocked(task, &pend_token); KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (IMPORTANCE_CODE(IMP_TASK_APPTYPE, apptype)) | DBG_FUNC_END, - task_pid(task), trequested_0(task, THREAD_NULL), trequested_1(task, THREAD_NULL), + task_pid(task), trequested_0(task), trequested_1(task), task_is_importance_receiver(task), 0); } extern task_t bsd_init_task; -/* Set up the primordial thread's QoS */ -void -task_set_main_thread_qos(task_t task, thread_t main_thread) { - struct task_pend_token pend_token = {}; - - assert(main_thread->task == task); - - task_lock(task); - - KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, - (IMPORTANCE_CODE(IMP_MAIN_THREAD_QOS, 0)) | DBG_FUNC_START, - task_pid(task), trequested_0(task, THREAD_NULL), trequested_1(task, THREAD_NULL), - main_thread->requested_policy.thrp_qos, 0); - +/* + * Compute the default main thread qos for a task + */ +int +task_compute_main_thread_qos(task_t task) +{ int primordial_qos = THREAD_QOS_UNSPECIFIED; - int qos_clamp = task->requested_policy.t_qos_clamp; - - if (task == bsd_init_task) { - /* PID 1 gets a special case */ - primordial_qos = THREAD_QOS_USER_INITIATED; - } + int qos_clamp = task->requested_policy.trp_qos_clamp; - switch (task->requested_policy.t_apptype) { + switch (task->requested_policy.trp_apptype) { case TASK_APPTYPE_APP_TAL: case TASK_APPTYPE_APP_DEFAULT: primordial_qos = THREAD_QOS_USER_INTERACTIVE; @@ -2759,6 +1863,11 @@ task_set_main_thread_qos(task_t task, thread_t main_thread) { break; } + if (task == bsd_init_task) { + /* PID 1 gets a special case */ + primordial_qos = MAX(primordial_qos, THREAD_QOS_USER_INITIATED); + } + if (qos_clamp != THREAD_QOS_UNSPECIFIED) { if (primordial_qos != THREAD_QOS_UNSPECIFIED) { primordial_qos = MIN(qos_clamp, primordial_qos); @@ -2767,31 
+1876,47 @@ task_set_main_thread_qos(task_t task, thread_t main_thread) { } } - main_thread->requested_policy.thrp_qos = primordial_qos; - - task_policy_update_locked(task, main_thread, &pend_token); - - task_unlock(task); - - task_policy_update_complete_unlocked(task, main_thread, &pend_token); - - KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, - (IMPORTANCE_CODE(IMP_MAIN_THREAD_QOS, 0)) | DBG_FUNC_END, - task_pid(task), trequested_0(task, THREAD_NULL), trequested_1(task, THREAD_NULL), - primordial_qos, 0); + return primordial_qos; } + /* for process_policy to check before attempting to set */ boolean_t proc_task_is_tal(task_t task) { - return (task->requested_policy.t_apptype == TASK_APPTYPE_APP_TAL) ? TRUE : FALSE; + return (task->requested_policy.trp_apptype == TASK_APPTYPE_APP_TAL) ? TRUE : FALSE; } int task_get_apptype(task_t task) { - return task->requested_policy.t_apptype; + return task->requested_policy.trp_apptype; +} + +boolean_t +task_is_daemon(task_t task) +{ + switch (task->requested_policy.trp_apptype) { + case TASK_APPTYPE_DAEMON_INTERACTIVE: + case TASK_APPTYPE_DAEMON_STANDARD: + case TASK_APPTYPE_DAEMON_ADAPTIVE: + case TASK_APPTYPE_DAEMON_BACKGROUND: + return TRUE; + default: + return FALSE; + } +} + +boolean_t +task_is_app(task_t task) +{ + switch (task->requested_policy.trp_apptype) { + case TASK_APPTYPE_APP_DEFAULT: + case TASK_APPTYPE_APP_TAL: + return TRUE; + default: + return FALSE; + } } /* for telemetry */ @@ -2805,44 +1930,33 @@ task_grab_latency_qos(task_t task) int proc_get_darwinbgstate(task_t task, uint32_t * flagsp) { - if (task->requested_policy.ext_darwinbg) + if (task->requested_policy.trp_ext_darwinbg) *flagsp |= PROC_FLAG_EXT_DARWINBG; - if (task->requested_policy.int_darwinbg) + if (task->requested_policy.trp_int_darwinbg) *flagsp |= PROC_FLAG_DARWINBG; - if (task->requested_policy.t_apptype == TASK_APPTYPE_APP_DEFAULT || - task->requested_policy.t_apptype == TASK_APPTYPE_APP_TAL) + if (task->requested_policy.trp_apptype == 
TASK_APPTYPE_APP_DEFAULT || + task->requested_policy.trp_apptype == TASK_APPTYPE_APP_TAL) *flagsp |= PROC_FLAG_APPLICATION; - if (task->requested_policy.t_apptype == TASK_APPTYPE_DAEMON_ADAPTIVE) + if (task->requested_policy.trp_apptype == TASK_APPTYPE_DAEMON_ADAPTIVE) *flagsp |= PROC_FLAG_ADAPTIVE; - if (task->requested_policy.t_apptype == TASK_APPTYPE_DAEMON_ADAPTIVE && task->requested_policy.t_boosted == 1) + if (task->requested_policy.trp_apptype == TASK_APPTYPE_DAEMON_ADAPTIVE && + task->requested_policy.trp_boosted == 1) *flagsp |= PROC_FLAG_ADAPTIVE_IMPORTANT; if (task_is_importance_donor(task)) *flagsp |= PROC_FLAG_IMPORTANCE_DONOR; - if (task->effective_policy.t_sup_active) + if (task->effective_policy.tep_sup_active) *flagsp |= PROC_FLAG_SUPPRESSED; return(0); } -/* All per-thread state is in the first 32-bits of the bitfield */ -void -proc_get_thread_policy(thread_t thread, thread_policy_state_t info) -{ - task_t task = thread->task; - task_lock(task); - info->requested = (integer_t)task_requested_bitfield(task, thread); - info->effective = (integer_t)task_effective_bitfield(task, thread); - info->pending = 0; - task_unlock(task); -} - /* * Tracepoint data... Reading the tracepoint data can be somewhat complicated. * The current scheme packs as much data into a single tracepoint as it can. @@ -2886,138 +2000,113 @@ proc_get_thread_policy(thread_t thread, thread_policy_state_t info) */ static uintptr_t -trequested_0(task_t task, thread_t thread) +trequested_0(task_t task) { - assert(task); - _Static_assert(sizeof(struct task_requested_policy) == sizeof(uint64_t), "size invariant violated"); - _Static_assert(sizeof(task->requested_policy) == sizeof(thread->requested_policy), "size invariant violated"); + static_assert(sizeof(struct task_requested_policy) == sizeof(uint64_t), "size invariant violated"); + + uintptr_t* raw = (uintptr_t*)&task->requested_policy; - uintptr_t* raw = (uintptr_t*)((thread == THREAD_NULL) ? 
&task->requested_policy : &thread->requested_policy); return raw[0]; } static uintptr_t -trequested_1(task_t task, thread_t thread) +trequested_1(task_t task) { - assert(task); - _Static_assert(sizeof(struct task_requested_policy) == sizeof(uint64_t), "size invariant violated"); - _Static_assert(sizeof(task->requested_policy) == sizeof(thread->requested_policy), "size invariant violated"); - #if defined __LP64__ - return (thread == NULL) ? 0 : *(uintptr_t*)&thread->requested_policy; + (void)task; + return 0; #else - uintptr_t* raw = (uintptr_t*)((thread == THREAD_NULL) ? &task->requested_policy : &thread->requested_policy); + uintptr_t* raw = (uintptr_t*)(&task->requested_policy); return raw[1]; #endif } static uintptr_t -teffective_0(task_t task, thread_t thread) +teffective_0(task_t task) { - assert(task); - _Static_assert(sizeof(struct task_effective_policy) == sizeof(uint64_t), "size invariant violated"); - _Static_assert(sizeof(task->effective_policy) == sizeof(thread->effective_policy), "size invariant violated"); + uintptr_t* raw = (uintptr_t*)&task->effective_policy; - uintptr_t* raw = (uintptr_t*)((thread == THREAD_NULL) ? &task->effective_policy : &thread->effective_policy); return raw[0]; } static uintptr_t -teffective_1(task_t task, thread_t thread) +teffective_1(task_t task) { - assert(task); - _Static_assert(sizeof(struct task_effective_policy) == sizeof(uint64_t), "size invariant violated"); - _Static_assert(sizeof(task->effective_policy) == sizeof(thread->effective_policy), "size invariant violated"); - #if defined __LP64__ - return (thread == NULL) ? 0 : *(uintptr_t*)&thread->effective_policy; + (void)task; + return 0; #else - uintptr_t* raw = (uintptr_t*)((thread == THREAD_NULL) ? 
&task->effective_policy : &thread->effective_policy); + uintptr_t* raw = (uintptr_t*)(&task->effective_policy); return raw[1]; #endif } /* dump pending for tracepoint */ -static uint32_t tpending(task_pend_token_t pend_token) { return *(uint32_t*)(void*)(pend_token); } +uint32_t tpending(task_pend_token_t pend_token) { return *(uint32_t*)(void*)(pend_token); } uint64_t -task_requested_bitfield(task_t task, thread_t thread) +task_requested_bitfield(task_t task) { uint64_t bits = 0; - struct task_requested_policy requested = - (thread == THREAD_NULL) ? task->requested_policy : thread->requested_policy; - - bits |= (requested.int_darwinbg ? POLICY_REQ_INT_DARWIN_BG : 0); - bits |= (requested.ext_darwinbg ? POLICY_REQ_EXT_DARWIN_BG : 0); - bits |= (requested.int_iotier ? (((uint64_t)requested.int_iotier) << POLICY_REQ_INT_IO_TIER_SHIFT) : 0); - bits |= (requested.ext_iotier ? (((uint64_t)requested.ext_iotier) << POLICY_REQ_EXT_IO_TIER_SHIFT) : 0); - bits |= (requested.int_iopassive ? POLICY_REQ_INT_PASSIVE_IO : 0); - bits |= (requested.ext_iopassive ? POLICY_REQ_EXT_PASSIVE_IO : 0); - bits |= (requested.bg_iotier ? (((uint64_t)requested.bg_iotier) << POLICY_REQ_BG_IOTIER_SHIFT) : 0); - bits |= (requested.terminated ? POLICY_REQ_TERMINATED : 0); - - bits |= (requested.th_pidbind_bg ? POLICY_REQ_PIDBIND_BG : 0); - bits |= (requested.th_workq_bg ? POLICY_REQ_WORKQ_BG : 0); - - if (thread != THREAD_NULL) { - bits |= (requested.thrp_qos ? (((uint64_t)requested.thrp_qos) << POLICY_REQ_TH_QOS_SHIFT) : 0); - bits |= (requested.thrp_qos_override ? (((uint64_t)requested.thrp_qos_override) << POLICY_REQ_TH_QOS_OVER_SHIFT) : 0); - } - - bits |= (requested.t_boosted ? POLICY_REQ_BOOSTED : 0); - bits |= (requested.t_tal_enabled ? POLICY_REQ_TAL_ENABLED : 0); - bits |= (requested.t_apptype ? (((uint64_t)requested.t_apptype) << POLICY_REQ_APPTYPE_SHIFT) : 0); - bits |= (requested.t_role ? 
(((uint64_t)requested.t_role) << POLICY_REQ_ROLE_SHIFT) : 0); - - bits |= (requested.t_sup_active ? POLICY_REQ_SUP_ACTIVE : 0); - bits |= (requested.t_sup_lowpri_cpu ? POLICY_REQ_SUP_LOWPRI_CPU : 0); - bits |= (requested.t_sup_cpu ? POLICY_REQ_SUP_CPU : 0); - bits |= (requested.t_sup_timer ? (((uint64_t)requested.t_sup_timer) << POLICY_REQ_SUP_TIMER_THROTTLE_SHIFT) : 0); - bits |= (requested.t_sup_throughput ? (((uint64_t)requested.t_sup_throughput) << POLICY_REQ_SUP_THROUGHPUT_SHIFT) : 0); - bits |= (requested.t_sup_disk ? POLICY_REQ_SUP_DISK_THROTTLE : 0); - bits |= (requested.t_sup_cpu_limit ? POLICY_REQ_SUP_CPU_LIMIT : 0); - bits |= (requested.t_sup_suspend ? POLICY_REQ_SUP_SUSPEND : 0); - bits |= (requested.t_sup_bg_sockets ? POLICY_REQ_SUP_BG_SOCKETS : 0); - bits |= (requested.t_base_latency_qos ? (((uint64_t)requested.t_base_latency_qos) << POLICY_REQ_BASE_LATENCY_QOS_SHIFT) : 0); - bits |= (requested.t_over_latency_qos ? (((uint64_t)requested.t_over_latency_qos) << POLICY_REQ_OVER_LATENCY_QOS_SHIFT) : 0); - bits |= (requested.t_base_through_qos ? (((uint64_t)requested.t_base_through_qos) << POLICY_REQ_BASE_THROUGH_QOS_SHIFT) : 0); - bits |= (requested.t_over_through_qos ? (((uint64_t)requested.t_over_through_qos) << POLICY_REQ_OVER_THROUGH_QOS_SHIFT) : 0); - bits |= (requested.t_sfi_managed ? POLICY_REQ_SFI_MANAGED : 0); - bits |= (requested.t_qos_clamp ? (((uint64_t)requested.t_qos_clamp) << POLICY_REQ_QOS_CLAMP_SHIFT) : 0); + struct task_requested_policy requested = task->requested_policy; + + bits |= (requested.trp_int_darwinbg ? POLICY_REQ_INT_DARWIN_BG : 0); + bits |= (requested.trp_ext_darwinbg ? POLICY_REQ_EXT_DARWIN_BG : 0); + bits |= (requested.trp_int_iotier ? (((uint64_t)requested.trp_int_iotier) << POLICY_REQ_INT_IO_TIER_SHIFT) : 0); + bits |= (requested.trp_ext_iotier ? (((uint64_t)requested.trp_ext_iotier) << POLICY_REQ_EXT_IO_TIER_SHIFT) : 0); + bits |= (requested.trp_int_iopassive ? 
POLICY_REQ_INT_PASSIVE_IO : 0); + bits |= (requested.trp_ext_iopassive ? POLICY_REQ_EXT_PASSIVE_IO : 0); + bits |= (requested.trp_bg_iotier ? (((uint64_t)requested.trp_bg_iotier) << POLICY_REQ_BG_IOTIER_SHIFT) : 0); + bits |= (requested.trp_terminated ? POLICY_REQ_TERMINATED : 0); + + bits |= (requested.trp_boosted ? POLICY_REQ_BOOSTED : 0); + bits |= (requested.trp_tal_enabled ? POLICY_REQ_TAL_ENABLED : 0); + bits |= (requested.trp_apptype ? (((uint64_t)requested.trp_apptype) << POLICY_REQ_APPTYPE_SHIFT) : 0); + bits |= (requested.trp_role ? (((uint64_t)requested.trp_role) << POLICY_REQ_ROLE_SHIFT) : 0); + + bits |= (requested.trp_sup_active ? POLICY_REQ_SUP_ACTIVE : 0); + bits |= (requested.trp_sup_lowpri_cpu ? POLICY_REQ_SUP_LOWPRI_CPU : 0); + bits |= (requested.trp_sup_cpu ? POLICY_REQ_SUP_CPU : 0); + bits |= (requested.trp_sup_timer ? (((uint64_t)requested.trp_sup_timer) << POLICY_REQ_SUP_TIMER_THROTTLE_SHIFT) : 0); + bits |= (requested.trp_sup_throughput ? (((uint64_t)requested.trp_sup_throughput) << POLICY_REQ_SUP_THROUGHPUT_SHIFT) : 0); + bits |= (requested.trp_sup_disk ? POLICY_REQ_SUP_DISK_THROTTLE : 0); + bits |= (requested.trp_sup_bg_sockets ? POLICY_REQ_SUP_BG_SOCKETS : 0); + + bits |= (requested.trp_base_latency_qos ? (((uint64_t)requested.trp_base_latency_qos) << POLICY_REQ_BASE_LATENCY_QOS_SHIFT) : 0); + bits |= (requested.trp_over_latency_qos ? (((uint64_t)requested.trp_over_latency_qos) << POLICY_REQ_OVER_LATENCY_QOS_SHIFT) : 0); + bits |= (requested.trp_base_through_qos ? (((uint64_t)requested.trp_base_through_qos) << POLICY_REQ_BASE_THROUGH_QOS_SHIFT) : 0); + bits |= (requested.trp_over_through_qos ? (((uint64_t)requested.trp_over_through_qos) << POLICY_REQ_OVER_THROUGH_QOS_SHIFT) : 0); + bits |= (requested.trp_sfi_managed ? POLICY_REQ_SFI_MANAGED : 0); + bits |= (requested.trp_qos_clamp ? 
(((uint64_t)requested.trp_qos_clamp) << POLICY_REQ_QOS_CLAMP_SHIFT) : 0); return bits; } uint64_t -task_effective_bitfield(task_t task, thread_t thread) +task_effective_bitfield(task_t task) { uint64_t bits = 0; - struct task_effective_policy effective = - (thread == THREAD_NULL) ? task->effective_policy : thread->effective_policy; - - bits |= (effective.io_tier ? (((uint64_t)effective.io_tier) << POLICY_EFF_IO_TIER_SHIFT) : 0); - bits |= (effective.io_passive ? POLICY_EFF_IO_PASSIVE : 0); - bits |= (effective.darwinbg ? POLICY_EFF_DARWIN_BG : 0); - bits |= (effective.lowpri_cpu ? POLICY_EFF_LOWPRI_CPU : 0); - bits |= (effective.terminated ? POLICY_EFF_TERMINATED : 0); - bits |= (effective.all_sockets_bg ? POLICY_EFF_ALL_SOCKETS_BG : 0); - bits |= (effective.new_sockets_bg ? POLICY_EFF_NEW_SOCKETS_BG : 0); - bits |= (effective.bg_iotier ? (((uint64_t)effective.bg_iotier) << POLICY_EFF_BG_IOTIER_SHIFT) : 0); - bits |= (effective.qos_ui_is_urgent ? POLICY_EFF_QOS_UI_IS_URGENT : 0); - - if (thread != THREAD_NULL) - bits |= (effective.thep_qos ? (((uint64_t)effective.thep_qos) << POLICY_EFF_TH_QOS_SHIFT) : 0); - - bits |= (effective.t_tal_engaged ? POLICY_EFF_TAL_ENGAGED : 0); - bits |= (effective.t_suspended ? POLICY_EFF_SUSPENDED : 0); - bits |= (effective.t_watchers_bg ? POLICY_EFF_WATCHERS_BG : 0); - bits |= (effective.t_sup_active ? POLICY_EFF_SUP_ACTIVE : 0); - bits |= (effective.t_suppressed_cpu ? POLICY_EFF_SUP_CPU : 0); - bits |= (effective.t_role ? (((uint64_t)effective.t_role) << POLICY_EFF_ROLE_SHIFT) : 0); - bits |= (effective.t_latency_qos ? (((uint64_t)effective.t_latency_qos) << POLICY_EFF_LATENCY_QOS_SHIFT) : 0); - bits |= (effective.t_through_qos ? (((uint64_t)effective.t_through_qos) << POLICY_EFF_THROUGH_QOS_SHIFT) : 0); - bits |= (effective.t_sfi_managed ? POLICY_EFF_SFI_MANAGED : 0); - bits |= (effective.t_qos_ceiling ? 
(((uint64_t)effective.t_qos_ceiling) << POLICY_EFF_QOS_CEILING_SHIFT) : 0); + struct task_effective_policy effective = task->effective_policy; + + bits |= (effective.tep_io_tier ? (((uint64_t)effective.tep_io_tier) << POLICY_EFF_IO_TIER_SHIFT) : 0); + bits |= (effective.tep_io_passive ? POLICY_EFF_IO_PASSIVE : 0); + bits |= (effective.tep_darwinbg ? POLICY_EFF_DARWIN_BG : 0); + bits |= (effective.tep_lowpri_cpu ? POLICY_EFF_LOWPRI_CPU : 0); + bits |= (effective.tep_terminated ? POLICY_EFF_TERMINATED : 0); + bits |= (effective.tep_all_sockets_bg ? POLICY_EFF_ALL_SOCKETS_BG : 0); + bits |= (effective.tep_new_sockets_bg ? POLICY_EFF_NEW_SOCKETS_BG : 0); + bits |= (effective.tep_bg_iotier ? (((uint64_t)effective.tep_bg_iotier) << POLICY_EFF_BG_IOTIER_SHIFT) : 0); + bits |= (effective.tep_qos_ui_is_urgent ? POLICY_EFF_QOS_UI_IS_URGENT : 0); + + bits |= (effective.tep_tal_engaged ? POLICY_EFF_TAL_ENGAGED : 0); + bits |= (effective.tep_watchers_bg ? POLICY_EFF_WATCHERS_BG : 0); + bits |= (effective.tep_sup_active ? POLICY_EFF_SUP_ACTIVE : 0); + bits |= (effective.tep_suppressed_cpu ? POLICY_EFF_SUP_CPU : 0); + bits |= (effective.tep_role ? (((uint64_t)effective.tep_role) << POLICY_EFF_ROLE_SHIFT) : 0); + bits |= (effective.tep_latency_qos ? (((uint64_t)effective.tep_latency_qos) << POLICY_EFF_LATENCY_QOS_SHIFT) : 0); + bits |= (effective.tep_through_qos ? (((uint64_t)effective.tep_through_qos) << POLICY_EFF_THROUGH_QOS_SHIFT) : 0); + bits |= (effective.tep_sfi_managed ? POLICY_EFF_SFI_MANAGED : 0); + bits |= (effective.tep_qos_ceiling ? 
(((uint64_t)effective.tep_qos_ceiling) << POLICY_EFF_QOS_CEILING_SHIFT) : 0); return bits; } @@ -3186,6 +2275,16 @@ proc_set_task_ruse_cpu(task_t task, uint32_t policy, uint8_t percentage, uint64_ return(error); } +/* TODO: get rid of these */ +#define TASK_POLICY_CPU_RESOURCE_USAGE 0 +#define TASK_POLICY_WIREDMEM_RESOURCE_USAGE 1 +#define TASK_POLICY_VIRTUALMEM_RESOURCE_USAGE 2 +#define TASK_POLICY_DISK_RESOURCE_USAGE 3 +#define TASK_POLICY_NETWORK_RESOURCE_USAGE 4 +#define TASK_POLICY_POWER_RESOURCE_USAGE 5 + +#define TASK_POLICY_RESOURCE_USAGE_COUNT 6 + int proc_clear_task_ruse_cpu(task_t task, int cpumon_entitled) { @@ -3271,7 +2370,7 @@ task_apply_resource_actions(task_t task, int type) * "scopes" will not be accessible via this API. We could change it to pass in the scope of interest * to the caller, and prefer that, but there's no need for that at the moment. */ -int +static int task_get_cpuusage(task_t task, uint8_t *percentagep, uint64_t *intervalp, uint64_t *deadlinep, int *scope) { *percentagep = 0; @@ -3297,17 +2396,18 @@ task_get_cpuusage(task_t task, uint8_t *percentagep, uint64_t *intervalp, uint64 } /* - * Disable the CPU usage monitor for the task. Return value indicates + * Suspend the CPU usage monitor for the task. Return value indicates * if the mechanism was actually enabled. */ int -task_disable_cpumon(task_t task) { +task_suspend_cpumon(task_t task) +{ thread_t thread; task_lock_assert_owned(task); if ((task->rusage_cpu_flags & TASK_RUSECPU_FLAGS_PERTHR_LIMIT) == 0) { - return (KERN_INVALID_ARGUMENT); + return KERN_INVALID_ARGUMENT; } #if CONFIG_TELEMETRY @@ -3319,22 +2419,80 @@ task_disable_cpumon(task_t task) { #endif /* - * Disable the monitor for the task, and propagate that change to each thread. + * Suspend monitoring for the task, and propagate that change to each thread. 
*/ task->rusage_cpu_flags &= ~(TASK_RUSECPU_FLAGS_PERTHR_LIMIT | TASK_RUSECPU_FLAGS_FATAL_CPUMON); queue_iterate(&task->threads, thread, thread_t, task_threads) { set_astledger(thread); } + + return KERN_SUCCESS; +} + +/* + * Remove all traces of the CPU monitor. + */ +int +task_disable_cpumon(task_t task) +{ + int kret; + + task_lock_assert_owned(task); + + kret = task_suspend_cpumon(task); + if (kret) return kret; + + /* Once we clear these values, the monitor can't be resumed */ task->rusage_cpu_perthr_percentage = 0; task->rusage_cpu_perthr_interval = 0; return (KERN_SUCCESS); } + +static int +task_enable_cpumon_locked(task_t task) +{ + thread_t thread; + task_lock_assert_owned(task); + + if (task->rusage_cpu_perthr_percentage == 0 || + task->rusage_cpu_perthr_interval == 0) { + return KERN_INVALID_ARGUMENT; + } + + task->rusage_cpu_flags |= TASK_RUSECPU_FLAGS_PERTHR_LIMIT; + queue_iterate(&task->threads, thread, thread_t, task_threads) { + set_astledger(thread); + } + + return KERN_SUCCESS; +} + int +task_resume_cpumon(task_t task) +{ + kern_return_t kret; + + if (!task) { + return EINVAL; + } + + task_lock(task); + kret = task_enable_cpumon_locked(task); + task_unlock(task); + + return kret; +} + + +/* duplicate values from bsd/sys/process_policy.h */ +#define PROC_POLICY_CPUMON_DISABLE 0xFF +#define PROC_POLICY_CPUMON_DEFAULTS 0xFE + +static int task_set_cpuusage(task_t task, uint8_t percentage, uint64_t interval, uint64_t deadline, int scope, int cpumon_entitled) { - thread_t thread; uint64_t abstime = 0; uint64_t limittime = 0; @@ -3354,10 +2512,11 @@ task_set_cpuusage(task_t task, uint8_t percentage, uint64_t interval, uint64_t d * exceeds the limit. 
*/ - if (percentage == TASK_POLICY_CPUMON_DISABLE) { + if (percentage == PROC_POLICY_CPUMON_DISABLE) { if (cpumon_entitled) { + /* 25095698 - task_disable_cpumon() should be reliable */ task_disable_cpumon(task); - return (0); + return 0; } /* @@ -3369,10 +2528,10 @@ task_set_cpuusage(task_t task, uint8_t percentage, uint64_t interval, uint64_t d * back to the defaults. */ warn = TRUE; - percentage = TASK_POLICY_CPUMON_DEFAULTS; + percentage = PROC_POLICY_CPUMON_DEFAULTS; } - if (percentage == TASK_POLICY_CPUMON_DEFAULTS) { + if (percentage == PROC_POLICY_CPUMON_DEFAULTS) { percentage = proc_max_cpumon_percentage; interval = proc_max_cpumon_interval; } @@ -3424,12 +2583,12 @@ task_set_cpuusage(task_t task, uint8_t percentage, uint64_t interval, uint64_t d " (missing required entitlement).\n", procname, pid); } - task->rusage_cpu_flags |= TASK_RUSECPU_FLAGS_PERTHR_LIMIT; + /* configure the limit values */ task->rusage_cpu_perthr_percentage = percentage; task->rusage_cpu_perthr_interval = interval; - queue_iterate(&task->threads, thread, thread_t, task_threads) { - set_astledger(thread); - } + + /* and enable the CPU monitor */ + (void)task_enable_cpumon_locked(task); } else if (scope == TASK_RUSECPU_FLAGS_PROC_LIMIT) { /* * Currently, a proc-wide CPU limit always blocks if the limit is @@ -3489,7 +2648,7 @@ task_clear_cpuusage(task_t task, int cpumon_entitled) return(retval); } -int +static int task_clear_cpuusage_locked(task_t task, int cpumon_entitled) { thread_call_t savecallt; @@ -3525,8 +2684,8 @@ task_clear_cpuusage_locked(task_t task, int cpumon_entitled) return(0); } -/* called by ledger unit to enforce action due to resource usage criteria being met */ -void +/* called by ledger unit to enforce action due to resource usage criteria being met */ +static void task_action_cpuusage(thread_call_param_t param0, __unused thread_call_param_t param1) { task_t task = (task_t)param0; @@ -3635,15 +2794,15 @@ task_importance_reset(__imp_only task_t task) * * Task lock 
must be held. */ -void +static void task_set_boost_locked(task_t task, boolean_t boost_active) { #if IMPORTANCE_DEBUG KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (IMPORTANCE_CODE(IMP_BOOST, (boost_active ? IMP_BOOSTED : IMP_UNBOOSTED)) | DBG_FUNC_START), - proc_selfpid(), task_pid(task), trequested_0(task, THREAD_NULL), trequested_1(task, THREAD_NULL), 0); + proc_selfpid(), task_pid(task), trequested_0(task), trequested_1(task), 0); #endif - task->requested_policy.t_boosted = boost_active; + task->requested_policy.trp_boosted = boost_active; #if IMPORTANCE_DEBUG if (boost_active == TRUE){ @@ -3653,7 +2812,7 @@ task_set_boost_locked(task_t task, boolean_t boost_active) } KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (IMPORTANCE_CODE(IMP_BOOST, (boost_active ? IMP_BOOSTED : IMP_UNBOOSTED)) | DBG_FUNC_END), proc_selfpid(), task_pid(task), - trequested_0(task, THREAD_NULL), trequested_1(task, THREAD_NULL), 0); + trequested_0(task), trequested_1(task), 0); #endif } @@ -3667,7 +2826,7 @@ task_update_boost_locked(task_t task, boolean_t boost_active, task_pend_token_t { task_set_boost_locked(task, boost_active); - task_policy_update_locked(task, THREAD_NULL, pend_token); + task_policy_update_locked(task, pend_token); } /* @@ -3772,20 +2931,6 @@ task_is_importance_receiver_type(task_t task) * Assertions are changed from internal to external via task_importance_externalize_assertion */ -int -task_importance_hold_watchport_assertion(task_t target_task, uint32_t count) -{ - ipc_importance_task_t task_imp; - kern_return_t ret; - - /* must already have set up an importance */ - task_imp = target_task->task_imp_base; - assert(IIT_NULL != task_imp); - - ret = ipc_importance_task_hold_internal_assertion(task_imp, count); - return (KERN_SUCCESS != ret) ? ENOTSUP : 0; -} - int task_importance_hold_internal_assertion(task_t target_task, uint32_t count) { @@ -3835,21 +2980,6 @@ task_importance_hold_legacy_external_assertion(task_t target_task, uint32_t coun return (KERN_SUCCESS != ret) ? 
ENOTSUP : 0; } -int -task_importance_drop_internal_assertion(task_t target_task, uint32_t count) -{ - ipc_importance_task_t task_imp; - kern_return_t ret; - - /* must already have set up an importance */ - task_imp = target_task->task_imp_base; - if (IIT_NULL == task_imp) { - return EOVERFLOW; - } - ret = ipc_importance_task_drop_internal_assertion(target_task->task_imp_base, count); - return (KERN_SUCCESS != ret) ? ENOTSUP : 0; -} - int task_importance_drop_file_lock_assertion(task_t target_task, uint32_t count) { @@ -4126,3 +3256,122 @@ task_importance_estimate(task_t task) return task_importance; } +boolean_t +task_has_assertions(task_t task) +{ + return (task->task_imp_base->iit_assertcnt? TRUE : FALSE); +} + + +kern_return_t +send_resource_violation(typeof(send_cpu_usage_violation) sendfunc, + task_t violator, + struct ledger_entry_info *linfo, + resource_notify_flags_t flags) +{ +#ifndef MACH_BSD + return KERN_NOT_SUPPORTED; +#else + kern_return_t kr = KERN_SUCCESS; + proc_t proc = NULL; + posix_path_t proc_path = ""; + proc_name_t procname = ""; + int pid = -1; + clock_sec_t secs; + clock_nsec_t nsecs; + mach_timespec_t timestamp; + thread_t curthread = current_thread(); + ipc_port_t dstport = MACH_PORT_NULL; + + if (!violator) { + kr = KERN_INVALID_ARGUMENT; goto finish; + } + + /* extract violator information */ + task_lock(violator); + if (!(proc = get_bsdtask_info(violator))) { + task_unlock(violator); + kr = KERN_INVALID_ARGUMENT; goto finish; + } + (void)mig_strncpy(procname, proc_best_name(proc), sizeof(procname)); + pid = task_pid(violator); + if (flags & kRNFatalLimitFlag) { + kr = proc_pidpathinfo_internal(proc, 0, proc_path, + sizeof(proc_path), NULL); + } + task_unlock(violator); + if (kr) goto finish; + + /* violation time ~ now */ + clock_get_calendar_nanotime(&secs, &nsecs); + timestamp.tv_sec = (int32_t)secs; + timestamp.tv_nsec = (int32_t)nsecs; + /* 25567702 tracks widening mach_timespec_t */ + + /* send message */ + kr = 
host_get_special_port(host_priv_self(), HOST_LOCAL_NODE, + HOST_RESOURCE_NOTIFY_PORT, &dstport); + if (kr) goto finish; + + /* TH_OPT_HONOR_QLIMIT causes ipc_kmsg_send() to respect the + * queue limit. It also unsets this flag, but this code also + * unsets it for clarity and in case that code changes. */ + curthread->options |= TH_OPT_HONOR_QLIMIT; + kr = sendfunc(dstport, + procname, pid, proc_path, timestamp, + linfo->lei_balance, linfo->lei_last_refill, + linfo->lei_limit, linfo->lei_refill_period, + flags); + curthread->options &= (~TH_OPT_HONOR_QLIMIT); + + ipc_port_release_send(dstport); + +finish: + return kr; +#endif /* MACH_BSD */ +} + + +/* + * Resource violations trace four 64-bit integers. For K32, two additional + * codes are allocated, the first with the low nibble doubled. So if the K64 + * code is 0x042, the K32 codes would be 0x044 and 0x45. + */ +#ifdef __LP64__ +void +trace_resource_violation(uint16_t code, + struct ledger_entry_info *linfo) +{ + KERNEL_DBG_IST_SANE(KDBG_CODE(DBG_MACH, DBG_MACH_RESOURCE, code), + linfo->lei_balance, linfo->lei_last_refill, + linfo->lei_limit, linfo->lei_refill_period); +} +#else /* K32 */ +/* TODO: create/find a trace_two_LLs() for K32 systems */ +#define MASK32 0xffffffff +void +trace_resource_violation(uint16_t code, + struct ledger_entry_info *linfo) +{ + int8_t lownibble = (code & 0x3) * 2; + int16_t codeA = (code & 0xffc) | lownibble; + int16_t codeB = codeA + 1; + + int32_t balance_high = (linfo->lei_balance >> 32) & MASK32; + int32_t balance_low = linfo->lei_balance & MASK32; + int32_t last_refill_high = (linfo->lei_last_refill >> 32) & MASK32; + int32_t last_refill_low = linfo->lei_last_refill & MASK32; + + int32_t limit_high = (linfo->lei_limit >> 32) & MASK32; + int32_t limit_low = linfo->lei_limit & MASK32; + int32_t refill_period_high = (linfo->lei_refill_period >> 32) & MASK32; + int32_t refill_period_low = linfo->lei_refill_period & MASK32; + + KERNEL_DBG_IST_SANE(KDBG_CODE(DBG_MACH, 
DBG_MACH_RESOURCE, codeA), + balance_high, balance_low, + last_refill_high, last_refill_low); + KERNEL_DBG_IST_SANE(KDBG_CODE(DBG_MACH, DBG_MACH_RESOURCE, codeB), + limit_high, limit_low, + refill_period_high, refill_period_low); +} +#endif /* K64/K32 */ diff --git a/osfmk/kern/telemetry.c b/osfmk/kern/telemetry.c index cda0bebc6..bb58493d4 100644 --- a/osfmk/kern/telemetry.c +++ b/osfmk/kern/telemetry.c @@ -42,6 +42,8 @@ #include #include #include +#include +#include #include @@ -63,7 +65,6 @@ extern char *proc_name_address(void *p); extern uint64_t proc_uniqueid(void *p); extern uint64_t proc_was_throttled(void *p); extern uint64_t proc_did_throttle(void *p); -extern uint64_t get_dispatchqueue_serialno_offset_from_proc(void *p); extern int proc_selfpid(void); struct micro_snapshot_buffer { @@ -77,7 +78,6 @@ void telemetry_take_sample(thread_t thread, uint8_t microsnapshot_flags, struct int telemetry_buffer_gather(user_addr_t buffer, uint32_t *length, boolean_t mark, struct micro_snapshot_buffer * current_buffer); #define TELEMETRY_DEFAULT_SAMPLE_RATE (1) /* 1 sample every 1 second */ -#define TELEMETRY_DEFAULT_WINDOW_BUFFER_SIZE (512*1024) /* Should hopefully provide 10 seconds worth of samples */ #define TELEMETRY_DEFAULT_BUFFER_SIZE (16*1024) #define TELEMETRY_MAX_BUFFER_SIZE (64*1024) @@ -86,17 +86,8 @@ int telemetry_buffer_gather(user_addr_t buffer, uint32_t *length, boolean_t mark uint32_t telemetry_sample_rate = 0; volatile boolean_t telemetry_needs_record = FALSE; -volatile boolean_t telemetry_windowed_record = FALSE; volatile boolean_t telemetry_needs_timer_arming_record = FALSE; -/* - * Tells the scheduler that we want it to invoke - * compute_telemetry_windowed(); it is still our responsibility - * to ensure that we do not panic if someone disables the window - * buffer immediately after the scheduler does so. - */ -volatile boolean_t telemetry_window_enabled = FALSE; - /* * If TRUE, record micro-stackshot samples for all tasks. 
* If FALSE, only sample tasks which are marked for telemetry. @@ -107,17 +98,12 @@ uint32_t telemetry_active_tasks = 0; // Number of tasks opted into telemetry uint32_t telemetry_timestamp = 0; /* - * We have two buffers. The telemetry_buffer is responsible + * The telemetry_buffer is responsible * for timer samples and interrupt samples that are driven by * compute_averages(). It will notify its client (if one * exists) when it has enough data to be worth flushing. - * - * The window_buffer contains only interrupt_samples that are - * driven by the scheduler. Its intent is to provide a - * window of recent activity on the cpu(s). */ struct micro_snapshot_buffer telemetry_buffer = {0, 0, 0, 0}; -struct micro_snapshot_buffer window_buffer = {0, 0, 0, 0}; int telemetry_bytes_since_last_mark = -1; // How much data since buf was last marked? int telemetry_buffer_notify_at = 0; @@ -250,83 +236,6 @@ telemetry_task_ctl_locked(task_t task, uint32_t reasons, int enable_disable) } } -/* - * Enable the window_buffer, and do any associated setup. - */ -kern_return_t -telemetry_enable_window(void) -{ - kern_return_t ret = KERN_SUCCESS; - vm_offset_t kern_buffer = 0; - vm_size_t kern_buffer_size = TELEMETRY_DEFAULT_WINDOW_BUFFER_SIZE; - - /* - * We have no guarantee we won't allocate the buffer, take - * the lock, and then discover someone beat us to the punch, - * but we would prefer to avoid blocking while holding the - * lock. - */ - ret = kmem_alloc(kernel_map, &kern_buffer, kern_buffer_size, VM_KERN_MEMORY_DIAG); - - TELEMETRY_LOCK(); - - if (!window_buffer.buffer) { - if (ret == KERN_SUCCESS) { - /* No existing buffer was found, so... */ - window_buffer.end_point = 0; - window_buffer.current_position = 0; - - /* Hand off the buffer, and... 
*/ - window_buffer.size = (uint32_t) kern_buffer_size; - window_buffer.buffer = kern_buffer; - kern_buffer = 0; - kern_buffer_size = 0; - bzero((void *) window_buffer.buffer, window_buffer.size); - - /* Let the scheduler know it should drive windowed samples */ - telemetry_window_enabled = TRUE; - } - } else { - /* We already have a buffer, so we have "succeeded" */ - ret = KERN_SUCCESS; - } - - TELEMETRY_UNLOCK(); - - if (kern_buffer) - kmem_free(kernel_map, kern_buffer, kern_buffer_size); - - return ret; -} - -/* - * Disable the window_buffer, and do any associated teardown. - */ -void -telemetry_disable_window(void) -{ - vm_offset_t kern_buffer = 0; - vm_size_t kern_buffer_size = 0; - - TELEMETRY_LOCK(); - - if (window_buffer.buffer) { - /* We have a window buffer, so tear it down */ - telemetry_window_enabled = FALSE; - kern_buffer = window_buffer.buffer; - kern_buffer_size = window_buffer.size; - window_buffer.buffer = 0; - window_buffer.size = 0; - window_buffer.current_position = 0; - window_buffer.end_point = 0; - } - - TELEMETRY_UNLOCK(); - - if (kern_buffer) - kmem_free(kernel_map, kern_buffer, kern_buffer_size); -} - /* * Determine if the current thread is eligible for telemetry: * @@ -389,11 +298,6 @@ void telemetry_mark_curthread(boolean_t interrupted_userspace) ast_bits |= (interrupted_userspace ? AST_TELEMETRY_USER : AST_TELEMETRY_KERNEL); - if (telemetry_windowed_record) { - ast_bits |= AST_TELEMETRY_WINDOWED; - } - - telemetry_windowed_record = FALSE; telemetry_needs_record = FALSE; thread_ast_set(thread, ast_bits); ast_propagate(thread->ast); @@ -409,32 +313,6 @@ void compute_telemetry(void *arg __unused) } } -void compute_telemetry_windowed(void) -{ - if (telemetry_sample_all_tasks || (telemetry_active_tasks > 0)) { - /* - * Due to the relationship between the two fields here, - * a request for a windowed record will "squash" a - * request for a regular interrupt record. 
We hedge - * against this by doing a quick check for an existing - * request. compute_telemetry doesn't hedge because - * a regular request cannot squash a windowed request - * (due to the implementation). - * - * If we really want to do this properly, we could make - * telemetry_needs_record a bitfield, and process one - * request per telemetry_mark_curthread... but that - * would be more expensive (atomics). This should be - * robust enough for now (although it biases in favor - * of the regular records). - */ - if (!telemetry_needs_record) { - telemetry_needs_record = TRUE; - telemetry_windowed_record = TRUE; - } - } -} - /* * If userland has registered a port for telemetry notifications, send one now. */ @@ -451,20 +329,20 @@ telemetry_notify_user(void) } telemetry_notification(user_port, flags); + ipc_port_release_send(user_port); } -void telemetry_ast(thread_t thread, boolean_t interrupted_userspace, boolean_t is_windowed) +void telemetry_ast(thread_t thread, boolean_t interrupted_userspace, boolean_t io_telemetry) { uint8_t microsnapshot_flags = kInterruptRecord; + if (io_telemetry == TRUE) { + microsnapshot_flags = kIORecord; + } if (interrupted_userspace) microsnapshot_flags |= kUserMode; - if (is_windowed) { - telemetry_take_sample(thread, microsnapshot_flags, &window_buffer); - } else { - telemetry_take_sample(thread, microsnapshot_flags, &telemetry_buffer); - } + telemetry_take_sample(thread, microsnapshot_flags, &telemetry_buffer); } void telemetry_take_sample(thread_t thread, uint8_t microsnapshot_flags, struct micro_snapshot_buffer * current_buffer) @@ -521,6 +399,7 @@ void telemetry_take_sample(thread_t thread, uint8_t microsnapshot_flags, struct * buffer with the global telemetry lock held -- so we must do our (possibly faulting) * copies from userland here, before taking the lock. 
*/ + cs.nframes = MAX_CALLSTACK_FRAMES; kperf_ucallstack_sample(&cs, &ctx); if (!(cs.flags & CALLSTACK_VALID)) return; @@ -618,7 +497,7 @@ void telemetry_take_sample(thread_t thread, uint8_t microsnapshot_flags, struct uint64_t dqkeyaddr = thread_dispatchqaddr(thread); if (dqkeyaddr != 0) { uint64_t dqaddr = 0; - uint64_t dq_serialno_offset = get_dispatchqueue_serialno_offset_from_proc(task->bsd_info); + uint64_t dq_serialno_offset = get_task_dispatchqueue_serialno_offset(task); if ((copyin(dqkeyaddr, (char *)&dqaddr, (task_has_64BitAddr(task) ? 8 : 4)) == 0) && (dqaddr != 0) && (dq_serialno_offset != 0)) { uint64_t dqserialnumaddr = dqaddr + dq_serialno_offset; @@ -633,7 +512,7 @@ void telemetry_take_sample(thread_t thread, uint8_t microsnapshot_flags, struct TELEMETRY_LOCK(); /* - * For the benefit of the window buffer; if our buffer is not backed by anything, + * If our buffer is not backed by anything, * then we cannot take the sample. Meant to allow us to deallocate the window * buffer if it is disabled. 
*/ @@ -712,13 +591,13 @@ void telemetry_take_sample(thread_t thread, uint8_t microsnapshot_flags, struct tsnap->ss_flags |= kTaskRsrcFlagged; } - if (task->effective_policy.darwinbg == 1) { + if (proc_get_effective_task_policy(task, TASK_POLICY_DARWIN_BG)) { tsnap->ss_flags |= kTaskDarwinBG; } proc_get_darwinbgstate(task, &tmp); - if (task->requested_policy.t_role == TASK_FOREGROUND_APPLICATION) { + if (proc_get_effective_task_policy(task, TASK_POLICY_ROLE) == TASK_FOREGROUND_APPLICATION) { tsnap->ss_flags |= kTaskIsForeground; } @@ -797,7 +676,7 @@ void telemetry_take_sample(thread_t thread, uint8_t microsnapshot_flags, struct thsnap->ts_rqos = thread->requested_policy.thrp_qos; thsnap->ts_rqos_override = thread->requested_policy.thrp_qos_override; - if (thread->effective_policy.darwinbg) { + if (proc_get_effective_thread_policy(thread, TASK_POLICY_DARWIN_BG)) { thsnap->ss_flags |= kThreadDarwinBG; } @@ -931,11 +810,6 @@ int telemetry_gather(user_addr_t buffer, uint32_t *length, boolean_t mark) return telemetry_buffer_gather(buffer, length, mark, &telemetry_buffer); } -int telemetry_gather_windowed(user_addr_t buffer, uint32_t *length) -{ - return telemetry_buffer_gather(buffer, length, 0, &window_buffer); -} - int telemetry_buffer_gather(user_addr_t buffer, uint32_t *length, boolean_t mark, struct micro_snapshot_buffer * current_buffer) { int result = 0; @@ -1069,14 +943,16 @@ vm_offset_t bootprofile_buffer = 0; uint32_t bootprofile_buffer_size = 0; uint32_t bootprofile_buffer_current_position = 0; uint32_t bootprofile_interval_ms = 0; +uint32_t bootprofile_stackshot_flags = 0; uint64_t bootprofile_interval_abs = 0; uint64_t bootprofile_next_deadline = 0; uint32_t bootprofile_all_procs = 0; char bootprofile_proc_name[17]; - +uint64_t bootprofile_delta_since_timestamp = 0; lck_grp_t bootprofile_lck_grp; lck_mtx_t bootprofile_mtx; + enum { kBootProfileDisabled = 0, kBootProfileStartTimerAtBoot, @@ -1094,9 +970,6 @@ static void bootprofile_timer_call( 
timer_call_param_t param0, timer_call_param_t param1); -extern int -stack_snapshot_from_kernel(int pid, void *buf, uint32_t size, uint32_t flags, unsigned *retbytes); - void bootprofile_init(void) { kern_return_t ret; @@ -1116,6 +989,10 @@ void bootprofile_init(void) bootprofile_interval_ms = 0; } + if (!PE_parse_boot_argn("bootprofile_stackshot_flags", &bootprofile_stackshot_flags, sizeof(bootprofile_stackshot_flags))) { + bootprofile_stackshot_flags = 0; + } + if (!PE_parse_boot_argn("bootprofile_proc_name", &bootprofile_proc_name, sizeof(bootprofile_proc_name))) { bootprofile_all_procs = 1; bootprofile_proc_name[0] = '\0'; @@ -1180,7 +1057,8 @@ bootprofile_wake_from_sleep(void) } -static void bootprofile_timer_call( +static void +bootprofile_timer_call( timer_call_param_t param0 __unused, timer_call_param_t param1 __unused) { @@ -1215,13 +1093,46 @@ static void bootprofile_timer_call( /* initiate a stackshot with whatever portion of the buffer is left */ if (bootprofile_buffer_current_position < bootprofile_buffer_size) { - stack_snapshot_from_kernel( - pid_to_profile, - (void *)(bootprofile_buffer + bootprofile_buffer_current_position), - bootprofile_buffer_size - bootprofile_buffer_current_position, - STACKSHOT_SAVE_LOADINFO | STACKSHOT_SAVE_KEXT_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS, - &retbytes - ); + + uint32_t flags = STACKSHOT_KCDATA_FORMAT | STACKSHOT_TRYLOCK | STACKSHOT_SAVE_LOADINFO + | STACKSHOT_GET_GLOBAL_MEM_STATS; +#if __x86_64__ + flags |= STACKSHOT_SAVE_KEXT_LOADINFO; +#endif /* __x86_64__ */ + + + /* OR on flags specified in boot-args */ + flags |= bootprofile_stackshot_flags; + if ((flags & STACKSHOT_COLLECT_DELTA_SNAPSHOT) && (bootprofile_delta_since_timestamp == 0)) { + /* Can't take deltas until the first one */ + flags &= ~ STACKSHOT_COLLECT_DELTA_SNAPSHOT; + } + + uint64_t timestamp = 0; + if (bootprofile_stackshot_flags & STACKSHOT_COLLECT_DELTA_SNAPSHOT) { + timestamp = mach_absolute_time(); + } + + kern_return_t r = 
stack_snapshot_from_kernel( + pid_to_profile, (void *)(bootprofile_buffer + bootprofile_buffer_current_position), + bootprofile_buffer_size - bootprofile_buffer_current_position, + flags, bootprofile_delta_since_timestamp, &retbytes); + + /* + * We call with STACKSHOT_TRYLOCK because the stackshot lock is coarser + * than the bootprofile lock. If someone else has the lock we'll just + * try again later. + */ + + if (r == KERN_LOCK_OWNED) { + BOOTPROFILE_UNLOCK(); + goto reprogram; + } + + if (bootprofile_stackshot_flags & STACKSHOT_COLLECT_DELTA_SNAPSHOT && + r == KERN_SUCCESS) { + bootprofile_delta_since_timestamp = timestamp; + } bootprofile_buffer_current_position += retbytes; } @@ -1251,6 +1162,14 @@ static void bootprofile_timer_call( FALSE); } +void bootprofile_get(void **buffer, uint32_t *length) +{ + BOOTPROFILE_LOCK(); + *buffer = (void*) bootprofile_buffer; + *length = bootprofile_buffer_current_position; + BOOTPROFILE_UNLOCK(); +} + int bootprofile_gather(user_addr_t buffer, uint32_t *length) { int result = 0; diff --git a/osfmk/kern/telemetry.h b/osfmk/kern/telemetry.h index beca74be2..4cbd028ba 100644 --- a/osfmk/kern/telemetry.h +++ b/osfmk/kern/telemetry.h @@ -37,17 +37,14 @@ __BEGIN_DECLS extern volatile boolean_t telemetry_needs_record; -extern volatile boolean_t telemetry_window_enabled; extern void telemetry_init(void); extern void compute_telemetry(void *); -extern void compute_telemetry_windowed(void); -extern void telemetry_ast(thread_t, boolean_t interrupted_userspace, boolean_t is_windowed); +extern void telemetry_ast(thread_t, boolean_t interrupted_userspace, boolean_t io_telemetry); extern int telemetry_gather(user_addr_t buffer, uint32_t *length, boolean_t mark); -extern int telemetry_gather_windowed(user_addr_t buffer, uint32_t *length); extern void telemetry_mark_curthread(boolean_t interrupted_userspace); @@ -55,9 +52,6 @@ extern void telemetry_task_ctl(task_t task, uint32_t reason, int enable_disable) extern void 
telemetry_task_ctl_locked(task_t task, uint32_t reason, int enable_disable); extern void telemetry_global_ctl(int enable_disable); -extern kern_return_t telemetry_enable_window(void); -extern void telemetry_disable_window(void); - extern int telemetry_timer_event(uint64_t deadline, uint64_t interval, uint64_t leeway); #define TELEMETRY_CMD_TIMER_EVENT 1 @@ -67,6 +61,7 @@ extern int telemetry_timer_event(uint64_t deadline, uint64_t interval, uint64_t extern void bootprofile_init(void); extern void bootprofile_wake_from_sleep(void); +extern void bootprofile_get(void **buffer, uint32_t *length); extern int bootprofile_gather(user_addr_t buffer, uint32_t *length); __END_DECLS diff --git a/osfmk/kern/thread.c b/osfmk/kern/thread.c index cc8e391b1..5a703f62f 100644 --- a/osfmk/kern/thread.c +++ b/osfmk/kern/thread.c @@ -117,6 +117,8 @@ #include #include #include +#include + #include #if KPC #include @@ -141,6 +143,7 @@ #include #include #include +#include static struct zone *thread_zone; static lck_grp_attr_t thread_lck_grp_attr; @@ -157,6 +160,15 @@ static queue_head_t thread_terminate_queue; static queue_head_t crashed_threads_queue; +decl_simple_lock_data(static,thread_exception_lock) +static queue_head_t thread_exception_queue; + +struct thread_exception_elt { + queue_chain_t elt; + task_t exception_task; + thread_t exception_thread; +}; + static struct thread thread_template, init_thread; static void sched_call_null( @@ -165,6 +177,7 @@ static void sched_call_null( #ifdef MACH_BSD extern void proc_exit(void *); +extern mach_exception_data_type_t proc_encode_exit_exception_code(void *); extern uint64_t get_dispatchqueue_offset_from_proc(void *); extern int proc_selfpid(void); extern char * proc_name_address(void *p); @@ -180,8 +193,7 @@ static uint64_t thread_unique_id = 100; struct _thread_ledger_indices thread_ledgers = { -1 }; static ledger_template_t thread_ledger_template = NULL; -void init_thread_ledgers(void); -int task_disable_cpumon(task_t task); +static 
void init_thread_ledgers(void); #if CONFIG_JETSAM void jetsam_on_ledger_cpulimit_exceeded(void); @@ -196,7 +208,7 @@ void jetsam_on_ledger_cpulimit_exceeded(void); #define CPUMON_USTACKSHOTS_TRIGGER_DEFAULT_PCT 70 int cpumon_ustackshots_trigger_pct; /* Percentage. Level at which we start gathering telemetry. */ -void __attribute__((noinline)) THIS_THREAD_IS_CONSUMING_TOO_MUCH_CPU__SENDING_EXC_RESOURCE(void); +void __attribute__((noinline)) SENDING_NOTIFICATION__THIS_THREAD_IS_CONSUMING_TOO_MUCH_CPU(void); /* * The smallest interval over which we support limiting CPU consumption is 1ms @@ -210,6 +222,10 @@ thread_bootstrap(void) * Fill in a template thread for fast initialization. */ +#if MACH_ASSERT + thread_template.thread_magic = THREAD_MAGIC; +#endif /* MACH_ASSERT */ + thread_template.runq = PROCESSOR_NULL; thread_template.ref_count = 2; @@ -230,6 +246,7 @@ thread_bootstrap(void) thread_template.sched_flags = 0; thread_template.saved_mode = TH_MODE_NONE; thread_template.safe_release = 0; + thread_template.th_sched_bucket = TH_BUCKET_RUN; thread_template.sfi_class = SFI_CLASS_UNSPECIFIED; thread_template.sfi_wait_class = SFI_CLASS_UNSPECIFIED; @@ -249,10 +266,6 @@ thread_bootstrap(void) thread_template.pending_promoter[1] = NULL; thread_template.rwlock_count = 0; -#if MACH_ASSERT - thread_template.SHARE_COUNT = 0; - thread_template.BG_COUNT = 0; -#endif /* MACH_ASSERT */ thread_template.realtime.deadline = UINT64_MAX; @@ -283,6 +296,7 @@ thread_bootstrap(void) thread_template.vtimer_user_save = 0; thread_template.vtimer_prof_save = 0; thread_template.vtimer_rlim_save = 0; + thread_template.vtimer_qos_save = 0; #if CONFIG_SCHED_SFI thread_template.wait_sfi_begin_time = 0; @@ -303,6 +317,13 @@ thread_bootstrap(void) thread_template.t_dtrace_tracing = 0; #endif /* CONFIG_DTRACE */ +#if KPERF + thread_template.kperf_flags = 0; + thread_template.kperf_pet_gen = 0; + thread_template.kperf_c_switch = 0; + thread_template.kperf_pet_cnt = 0; +#endif + #if KPC 
thread_template.kpc_buf = NULL; #endif @@ -311,8 +332,6 @@ thread_bootstrap(void) thread_template.hv_thread_target = NULL; #endif /* HYPERVISOR */ - thread_template.t_chud = 0; - #if (DEVELOPMENT || DEBUG) thread_template.t_page_creation_throttled_hard = 0; thread_template.t_page_creation_throttled_soft = 0; @@ -333,9 +352,8 @@ thread_bootstrap(void) thread_template.t_deduct_bank_ledger_time = 0; #endif - thread_template.requested_policy = default_task_requested_policy; - thread_template.effective_policy = default_task_effective_policy; - thread_template.pended_policy = default_task_pended_policy; + thread_template.requested_policy = (struct thread_requested_policy) {}; + thread_template.effective_policy = (struct thread_effective_policy) {}; bzero(&thread_template.overrides, sizeof(thread_template.overrides)); @@ -381,9 +399,11 @@ thread_init(void) lck_grp_attr_setdefault(&thread_lck_grp_attr); lck_grp_init(&thread_lck_grp, "thread", &thread_lck_grp_attr); lck_attr_setdefault(&thread_lck_attr); - + stack_init(); + thread_policy_init(); + /* * Initialize any machine-dependent * per-thread structures necessary. @@ -400,6 +420,19 @@ thread_init(void) init_thread_ledgers(); } +void +thread_corpse_continue(void) +{ + thread_t thread = current_thread(); + + thread_terminate_internal(thread); + ml_set_interrupts_enabled(FALSE); + ast_taken(AST_APC, TRUE); + + panic("thread_corpse_continue"); + /*NOTREACHED*/ +} + static void thread_terminate_continue(void) { @@ -425,14 +458,12 @@ thread_terminate_self(void) thread_mtx_lock(thread); ipc_thread_disable(thread); - + thread_mtx_unlock(thread); s = splsched(); thread_lock(thread); - assert_thread_sched_count(thread); - /* * Cancel priority depression, wait for concurrent expirations * on other processors. @@ -470,14 +501,37 @@ thread_terminate_self(void) thread_mtx_unlock(thread); task = thread->task; - uthread_cleanup(task, thread->uthread, task->bsd_info, thread->inspection == 1 ? 
TRUE : FALSE); + uthread_cleanup(task, thread->uthread, task->bsd_info); threadcnt = hw_atomic_sub(&task->active_thread_count, 1); + if (task->bsd_info) { + /* trace out pid before we sign off */ + long dbg_arg1 = 0; + + kdbg_trace_data(thread->task->bsd_info, &dbg_arg1); + + KERNEL_DEBUG_CONSTANT(TRACE_DATA_THREAD_TERMINATE_PID | DBG_FUNC_NONE, + dbg_arg1, 0, 0, 0, 0); + } + /* * If we are the last thread to terminate and the task is * associated with a BSD process, perform BSD process exit. */ if (threadcnt == 0 && task->bsd_info != NULL) { + mach_exception_data_type_t subcode = 0; + { + /* since we're the last thread in this process, trace out the command name too */ + long dbg_arg1 = 0, dbg_arg2 = 0, dbg_arg3 = 0, dbg_arg4 = 0; + + kdbg_trace_string(thread->task->bsd_info, &dbg_arg1, &dbg_arg2, &dbg_arg3, &dbg_arg4); + + KERNEL_DEBUG_CONSTANT(TRACE_STRING_PROC_EXIT | DBG_FUNC_NONE, + dbg_arg1, dbg_arg2, dbg_arg3, dbg_arg4, 0); + } + + /* Get the exit reason before proc_exit */ + subcode = proc_encode_exit_exception_code(task->bsd_info); proc_exit(task->bsd_info); /* * if there is crash info in task @@ -485,9 +539,18 @@ thread_terminate_self(void) * last thread for this task. 
*/ if (task->corpse_info) { - task_deliver_crash_notification(task); + task_deliver_crash_notification(task, current_thread(), subcode); + } + } + + if (threadcnt == 0) { + task_lock(task); + if (task_is_a_corpse_fork(task)) { + thread_wakeup((event_t)&task->active_thread_count); } + task_unlock(task); } + uthread_cred_free(thread->uthread); s = splsched(); @@ -542,8 +605,12 @@ thread_terminate_self(void) void thread_deallocate_safe(thread_t thread) { - if (__improbable(hw_atomic_sub(&(thread)->ref_count, 1) == 0)) - panic("bad thread refcount!"); + assert_thread_magic(thread); + + uint32_t old_refcount = hw_atomic_sub(&(thread)->ref_count, 1) + 1; + + if (__improbable(old_refcount <= 1)) + panic("bad thread refcount: %d", old_refcount); } void @@ -555,6 +622,9 @@ thread_deallocate( if (thread == THREAD_NULL) return; + assert_thread_magic(thread); + assert(thread->ref_count > 0); + if (__probable(hw_atomic_sub(&(thread)->ref_count, 1) > 0)) return; @@ -563,6 +633,8 @@ thread_deallocate( assert(thread->runq == PROCESSOR_NULL); + assert(thread->user_promotions == 0); + #if KPC kpc_thread_destroy(thread); #endif @@ -601,9 +673,107 @@ thread_deallocate( task_deallocate(task); +#if MACH_ASSERT + assert_thread_magic(thread); + thread->thread_magic = 0; +#endif /* MACH_ASSERT */ + zfree(thread_zone, thread); } +/* + * thread_exception_daemon: + * + * Deliver EXC_RESOURCE exception + */ +static void +thread_exception_daemon(void) +{ + struct thread_exception_elt *elt; + task_t task; + thread_t thread; + + simple_lock(&thread_exception_lock); + while ((elt = (struct thread_exception_elt *)dequeue_head(&thread_exception_queue)) != NULL) { + simple_unlock(&thread_exception_lock); + + task = elt->exception_task; + thread = elt->exception_thread; + assert_thread_magic(thread); + + kfree(elt, sizeof(struct thread_exception_elt)); + + /* wait for all the threads in the task to terminate */ + task_lock(task); + task_wait_till_threads_terminate_locked(task); + task_unlock(task); + 
+ /* Consumes the task ref returned by task_generate_corpse_internal */ + task_deallocate(task); + /* Consumes the thread ref returned by task_generate_corpse_internal */ + thread_deallocate(thread); + + /* Deliver the EXC_RESOURCE notification, also clears the corpse. */ + task_deliver_crash_notification(task, thread, 0); + + simple_lock(&thread_exception_lock); + } + + assert_wait((event_t)&thread_exception_queue, THREAD_UNINT); + simple_unlock(&thread_exception_lock); + + thread_block((thread_continue_t)thread_exception_daemon); +} + +/* + * thread_exception_enqueue: + * + * Enqueue a corpse port to be delivered an EXC_RESOURCE. + */ +void +thread_exception_enqueue( + task_t task, + thread_t thread) +{ + struct thread_exception_elt *elt = (struct thread_exception_elt*) kalloc( + sizeof(struct thread_exception_elt)); + + elt->exception_task = task; + elt->exception_thread = thread; + + simple_lock(&thread_exception_lock); + enqueue_tail(&thread_exception_queue, (queue_entry_t)elt); + simple_unlock(&thread_exception_lock); + + thread_wakeup((event_t)&thread_exception_queue); +} + +/* + * thread_copy_resource_info + * + * Copy the resource info counters from source + * thread to destination thread. 
+ */ +void +thread_copy_resource_info( + thread_t dst_thread, + thread_t src_thread) +{ + dst_thread->thread_tag = src_thread->thread_tag; + dst_thread->c_switch = src_thread->c_switch; + dst_thread->p_switch = src_thread->p_switch; + dst_thread->ps_switch = src_thread->ps_switch; + dst_thread->precise_user_kernel_time = src_thread->precise_user_kernel_time; + dst_thread->user_timer = src_thread->user_timer; + dst_thread->user_timer_save = src_thread->user_timer_save; + dst_thread->system_timer_save = src_thread->system_timer_save; + dst_thread->syscalls_unix = src_thread->syscalls_unix; + dst_thread->syscalls_mach = src_thread->syscalls_mach; + ledger_rollup(dst_thread->t_threadledger, src_thread->t_threadledger); + *dst_thread->thread_io_stats = *src_thread->thread_io_stats; + +} + /* * thread_terminate_daemon: * @@ -621,7 +791,8 @@ thread_terminate_daemon(void) (void)splsched(); simple_lock(&thread_terminate_lock); - while ((thread = (thread_t)dequeue_head(&thread_terminate_queue)) != THREAD_NULL) { + while ((thread = qe_dequeue_head(&thread_terminate_queue, struct thread, runq_links)) != THREAD_NULL) { + assert_thread_magic(thread); /* * if marked for crash reporting, skip reaping. 
@@ -629,16 +800,13 @@ thread_terminate_daemon(void) * for reaping when done */ if (thread->inspection){ - enqueue_tail(&crashed_threads_queue, (queue_entry_t)thread); + enqueue_tail(&crashed_threads_queue, &thread->runq_links); continue; } simple_unlock(&thread_terminate_lock); (void)spllo(); - assert(thread->SHARE_COUNT == 0); - assert(thread->BG_COUNT == 0); - task = thread->task; task_lock(task); @@ -659,8 +827,10 @@ thread_terminate_daemon(void) task->task_timer_wakeups_bin_1 += thread->thread_timer_wakeups_bin_1; task->task_timer_wakeups_bin_2 += thread->thread_timer_wakeups_bin_2; task->task_gpu_ns += ml_gpu_stat(thread); - - thread_update_qos_cpu_time(thread, FALSE); + task->task_energy += ml_energy_stat(thread); + + thread_update_qos_cpu_time(thread); + queue_remove(&task->threads, thread, thread_t, task_threads); task->thread_count--; @@ -707,7 +877,7 @@ thread_terminate_enqueue( KERNEL_DEBUG_CONSTANT(TRACE_DATA_THREAD_TERMINATE | DBG_FUNC_NONE, thread->thread_id, 0, 0, 0, 0); simple_lock(&thread_terminate_lock); - enqueue_tail(&thread_terminate_queue, (queue_entry_t)thread); + enqueue_tail(&thread_terminate_queue, &thread->runq_links); simple_unlock(&thread_terminate_lock); thread_wakeup((event_t)&thread_terminate_queue); @@ -715,13 +885,13 @@ thread_terminate_enqueue( /* * thread_terminate_crashed_threads: - * walk the list of crashed therds and put back set of threads + * walk the list of crashed threads and put back set of threads * who are no longer being inspected. 
*/ void thread_terminate_crashed_threads() { - thread_t th_iter, th_remove; + thread_t th_remove; boolean_t should_wake_terminate_queue = FALSE; simple_lock(&thread_terminate_lock); @@ -729,16 +899,13 @@ thread_terminate_crashed_threads() * loop through the crashed threads queue * to put any threads that are not being inspected anymore */ - th_iter = (thread_t)queue_first(&crashed_threads_queue); - while (!queue_end(&crashed_threads_queue, (queue_entry_t)th_iter)) { - th_remove = th_iter; - th_iter = (thread_t)queue_next(&th_iter->links); + qe_foreach_element_safe(th_remove, &crashed_threads_queue, runq_links) { /* make sure current_thread is never in crashed queue */ assert(th_remove != current_thread()); - if (th_remove->inspection != TRUE){ - remque((queue_entry_t)th_remove); - enqueue_tail(&thread_terminate_queue, (queue_entry_t)th_remove); + + if (th_remove->inspection == FALSE) { + re_queue_tail(&thread_terminate_queue, &th_remove->runq_links); should_wake_terminate_queue = TRUE; } } @@ -764,7 +931,9 @@ thread_stack_daemon(void) s = splsched(); simple_lock(&thread_stack_lock); - while ((thread = (thread_t)dequeue_head(&thread_stack_queue)) != THREAD_NULL) { + while ((thread = qe_dequeue_head(&thread_stack_queue, struct thread, runq_links)) != THREAD_NULL) { + assert_thread_magic(thread); + simple_unlock(&thread_stack_lock); splx(s); @@ -801,9 +970,10 @@ thread_stack_enqueue( thread_t thread) { KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED,MACH_STACK_WAIT) | DBG_FUNC_START, thread_tid(thread), 0, 0, 0, 0); + assert_thread_magic(thread); simple_lock(&thread_stack_lock); - enqueue_tail(&thread_stack_queue, (queue_entry_t)thread); + enqueue_tail(&thread_stack_queue, &thread->runq_links); simple_unlock(&thread_stack_lock); thread_wakeup((event_t)&thread_stack_queue); @@ -833,11 +1003,21 @@ thread_daemon_init(void) panic("thread_daemon_init: thread_stack_daemon"); thread_deallocate(thread); + + simple_lock_init(&thread_exception_lock, 0); + 
queue_init(&thread_exception_queue); + + result = kernel_thread_start_priority((thread_continue_t)thread_exception_daemon, NULL, MINPRI_KERNEL, &thread); + if (result != KERN_SUCCESS) + panic("thread_daemon_init: thread_exception_daemon"); + + thread_deallocate(thread); } #define TH_OPTION_NONE 0x00 #define TH_OPTION_NOCRED 0x01 #define TH_OPTION_NOSUSP 0x02 + /* * Create a new thread. * Doesn't start the thread running. @@ -871,6 +1051,10 @@ thread_create_internal( #ifdef MACH_BSD new_thread->uthread = uthread_alloc(parent_task, new_thread, (options & TH_OPTION_NOCRED) != 0); if (new_thread->uthread == NULL) { +#if MACH_ASSERT + new_thread->thread_magic = 0; +#endif /* MACH_ASSERT */ + zfree(thread_zone, new_thread); return (KERN_RESOURCE_SHORTAGE); } @@ -882,11 +1066,15 @@ thread_create_internal( new_thread->uthread = NULL; /* cred free may not be necessary */ - uthread_cleanup(parent_task, ut, parent_task->bsd_info, FALSE); + uthread_cleanup(parent_task, ut, parent_task->bsd_info); uthread_cred_free(ut); uthread_zone_free(ut); #endif /* MACH_BSD */ +#if MACH_ASSERT + new_thread->thread_magic = 0; +#endif /* MACH_ASSERT */ + zfree(thread_zone, new_thread); return (KERN_FAILURE); } @@ -915,11 +1103,13 @@ thread_create_internal( lck_mtx_lock(&tasks_threads_lock); task_lock(parent_task); - if ( !parent_task->active || parent_task->halting || - ((options & TH_OPTION_NOSUSP) != 0 && - parent_task->suspend_count > 0) || - (parent_task->thread_count >= task_threadmax && - parent_task != kernel_task) ) { + /* + * Fail thread creation if parent task is being torn down or has too many threads + * If the caller asked for TH_OPTION_NOSUSP, also fail if the parent task is suspended + */ + if (parent_task->active == 0 || parent_task->halting || + (parent_task->suspend_count > 0 && (options & TH_OPTION_NOSUSP) != 0) || + (parent_task->thread_count >= task_threadmax && parent_task != kernel_task)) { task_unlock(parent_task); lck_mtx_unlock(&tasks_threads_lock); @@ -928,7 +1118,7 
@@ thread_create_internal( void *ut = new_thread->uthread; new_thread->uthread = NULL; - uthread_cleanup(parent_task, ut, parent_task->bsd_info, FALSE); + uthread_cleanup(parent_task, ut, parent_task->bsd_info); /* cred free may not be necessary */ uthread_cred_free(ut); uthread_zone_free(ut); @@ -963,7 +1153,6 @@ thread_create_internal( ledger_entry_setactive(new_thread->t_threadledger, thread_ledgers.cpu_time); } - new_thread->cpu_time_last_qos = 0; #ifdef CONFIG_BANK new_thread->t_bankledger = LEDGER_NULL; new_thread->t_deduct_bank_ledger_time = 0; @@ -987,19 +1176,9 @@ thread_create_internal( #if KPC kpc_thread_create(new_thread); #endif - - /* Only need to update policies pushed from task to thread */ - new_thread->requested_policy.bg_iotier = parent_task->effective_policy.bg_iotier; - new_thread->requested_policy.terminated = parent_task->effective_policy.terminated; /* Set the thread's scheduling parameters */ -#if defined(CONFIG_SCHED_TIMESHARE_CORE) - new_thread->sched_stamp = sched_tick; - new_thread->pri_shift = sched_pri_shift; -#endif /* defined(CONFIG_SCHED_TIMESHARE_CORE) */ - new_thread->sched_mode = SCHED(initial_thread_sched_mode)(parent_task); - new_thread->sched_flags = 0; new_thread->max_priority = parent_task->max_priority; new_thread->task_priority = parent_task->priority; @@ -1009,10 +1188,14 @@ thread_create_internal( new_priority = new_thread->max_priority; new_thread->importance = new_priority - new_thread->task_priority; - new_thread->saved_importance = new_thread->importance; sched_set_thread_base_priority(new_thread, new_priority); +#if defined(CONFIG_SCHED_TIMESHARE_CORE) + new_thread->sched_stamp = sched_tick; + new_thread->pri_shift = sched_pri_shifts[new_thread->th_sched_bucket]; +#endif /* defined(CONFIG_SCHED_TIMESHARE_CORE) */ + thread_policy_create(new_thread); @@ -1030,7 +1213,13 @@ thread_create_internal( threads_count++; new_thread->active = TRUE; - new_thread->inspection = FALSE; + if (task_is_a_corpse_fork(parent_task)) { 
+ /* Set the inspection bit if the task is a corpse fork */ + new_thread->inspection = TRUE; + } else { + new_thread->inspection = FALSE; + } + new_thread->corpse_dup = FALSE; *out_thread = new_thread; { @@ -1039,14 +1228,14 @@ thread_create_internal( kdbg_trace_data(parent_task->bsd_info, &dbg_arg2); KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, - TRACEDBG_CODE(DBG_TRACE_DATA, 1) | DBG_FUNC_NONE, + TRACE_DATA_NEWTHREAD | DBG_FUNC_NONE, (vm_address_t)(uintptr_t)thread_tid(new_thread), dbg_arg2, 0, 0, 0); kdbg_trace_string(parent_task->bsd_info, &dbg_arg1, &dbg_arg2, &dbg_arg3, &dbg_arg4); KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, - TRACEDBG_CODE(DBG_TRACE_STRING, 1) | DBG_FUNC_NONE, + TRACE_STRING_NEWTHREAD | DBG_FUNC_NONE, dbg_arg1, dbg_arg2, dbg_arg3, dbg_arg4, 0); } @@ -1121,14 +1310,14 @@ thread_create_with_continuation( static kern_return_t thread_create_running_internal2( - register task_t task, + task_t task, int flavor, thread_state_t new_state, mach_msg_type_number_t new_state_count, thread_t *new_thread, boolean_t from_user) { - register kern_return_t result; + kern_return_t result; thread_t thread; if (task == TASK_NULL || task == kernel_task) @@ -1138,6 +1327,9 @@ thread_create_running_internal2( if (result != KERN_SUCCESS) return (result); + if (task->suspend_count > 0) + thread_hold(thread); + result = machine_thread_set_state(thread, flavor, new_state, new_state_count); if (result != KERN_SUCCESS) { task_unlock(task); @@ -1149,7 +1341,7 @@ thread_create_running_internal2( } thread_mtx_lock(thread); - thread_start_internal(thread); + thread_start(thread); thread_mtx_unlock(thread); if (from_user) @@ -1166,7 +1358,7 @@ thread_create_running_internal2( /* Prototype, see justification above */ kern_return_t thread_create_running( - register task_t task, + task_t task, int flavor, thread_state_t new_state, mach_msg_type_number_t new_state_count, @@ -1174,7 +1366,7 @@ thread_create_running( kern_return_t thread_create_running( - register task_t task, + task_t 
task, int flavor, thread_state_t new_state, mach_msg_type_number_t new_state_count, @@ -1187,7 +1379,7 @@ thread_create_running( kern_return_t thread_create_running_from_user( - register task_t task, + task_t task, int flavor, thread_state_t new_state, mach_msg_type_number_t new_state_count, @@ -1227,6 +1419,39 @@ thread_create_workq( return (KERN_SUCCESS); } +kern_return_t +thread_create_workq_waiting( + task_t task, + thread_continue_t thread_return, + event_t event, + thread_t *new_thread) +{ + thread_t thread; + kern_return_t result; + + if (task == TASK_NULL || task == kernel_task) + return KERN_INVALID_ARGUMENT; + + result = thread_create_internal(task, -1, thread_return, TH_OPTION_NOCRED | TH_OPTION_NOSUSP, &thread); + + if (result != KERN_SUCCESS) + return result; + + if (task->suspend_count > 0) + thread_hold(thread); + + thread_mtx_lock(thread); + thread_start_in_assert_wait(thread, event, THREAD_INTERRUPTIBLE); + thread_mtx_unlock(thread); + + task_unlock(task); + lck_mtx_unlock(&tasks_threads_lock); + + *new_thread = thread; + + return result; +} + /* * kernel_thread_create: * @@ -1281,7 +1506,7 @@ kernel_thread_start_priority( *new_thread = thread; thread_mtx_lock(thread); - thread_start_internal(thread); + thread_start(thread); thread_mtx_unlock(thread); return (result); @@ -1373,7 +1598,7 @@ retrieve_thread_basic_info(thread_t thread, thread_basic_info_t basic_info) kern_return_t thread_info_internal( - register thread_t thread, + thread_t thread, thread_flavor_t flavor, thread_info_t thread_info_out, /* ptr to OUT array */ mach_msg_type_number_t *thread_info_count) /*IN/OUT*/ @@ -1402,7 +1627,7 @@ thread_info_internal( } else if (flavor == THREAD_IDENTIFIER_INFO) { - register thread_identifier_info_t identifier_info; + thread_identifier_info_t identifier_info; if (*thread_info_count < THREAD_IDENTIFIER_INFO_COUNT) return (KERN_INVALID_ARGUMENT); @@ -1710,6 +1935,12 @@ thread_wire( } +boolean_t +is_vm_privileged(void) +{ + return 
current_thread()->options & TH_OPT_VMPRIV ? TRUE : FALSE; +} + boolean_t set_vm_privilege(boolean_t privileged) { @@ -1810,12 +2041,12 @@ thread_cputime_callback(int warning, __unused const void *arg0, __unused const v #endif if (warning == 0) { - THIS_THREAD_IS_CONSUMING_TOO_MUCH_CPU__SENDING_EXC_RESOURCE(); + SENDING_NOTIFICATION__THIS_THREAD_IS_CONSUMING_TOO_MUCH_CPU(); } } void __attribute__((noinline)) -THIS_THREAD_IS_CONSUMING_TOO_MUCH_CPU__SENDING_EXC_RESOURCE(void) +SENDING_NOTIFICATION__THIS_THREAD_IS_CONSUMING_TOO_MUCH_CPU(void) { int pid = 0; task_t task = current_task(); @@ -1827,41 +2058,45 @@ THIS_THREAD_IS_CONSUMING_TOO_MUCH_CPU__SENDING_EXC_RESOURCE(void) time_value_t thread_user_time; int action; uint8_t percentage; - uint32_t limit_percent; - uint32_t usage_percent; + uint32_t usage_percent = 0; uint32_t interval_sec; uint64_t interval_ns; uint64_t balance_ns; boolean_t fatal = FALSE; + boolean_t send_exc_resource = TRUE; /* in addition to RESOURCE_NOTIFY */ + kern_return_t kr; +#ifdef EXC_RESOURCE_MONITORS mach_exception_data_type_t code[EXCEPTION_CODE_MAX]; +#endif /* EXC_RESOURCE_MONITORS */ struct ledger_entry_info lei; assert(thread->t_threadledger != LEDGER_NULL); /* - * Now that a thread has tripped the monitor, disable it for the entire task. + * Extract the fatal bit and suspend the monitor (which clears the bit). */ task_lock(task); - if ((task->rusage_cpu_flags & TASK_RUSECPU_FLAGS_PERTHR_LIMIT) == 0) { - /* - * The CPU usage monitor has been disabled on our task, so some other - * thread must have gotten here first. We only send one exception per - * task lifetime, so there's nothing left for us to do here. - */ - task_unlock(task); - return; - } if (task->rusage_cpu_flags & TASK_RUSECPU_FLAGS_FATAL_CPUMON) { fatal = TRUE; + send_exc_resource = TRUE; } - task_disable_cpumon(task); + /* Only one thread can be here at a time. Whichever makes it through + first will successfully suspend the monitor and proceed to send the + notification. 
Other threads will get an error trying to suspend the + monitor and give up on sending the notification. In the first release, + the monitor won't be resumed for a number of seconds, but we may + eventually need to handle low-latency resume. + */ + kr = task_suspend_cpumon(task); task_unlock(task); + if (kr == KERN_INVALID_ARGUMENT) return; #ifdef MACH_BSD pid = proc_selfpid(); - if (task->bsd_info != NULL) + if (task->bsd_info != NULL) { procname = proc_name_address(task->bsd_info); + } #endif thread_get_cpulimit(&action, &percentage, &interval_ns); @@ -1871,58 +2106,80 @@ THIS_THREAD_IS_CONSUMING_TOO_MUCH_CPU__SENDING_EXC_RESOURCE(void) thread_read_times(thread, &thread_user_time, &thread_system_time); time_value_add(&thread_total_time, &thread_user_time); time_value_add(&thread_total_time, &thread_system_time); - ledger_get_entry_info(thread->t_threadledger, thread_ledgers.cpu_time, &lei); + /* credit/debit/balance/limit are in absolute time units; + the refill info is in nanoseconds. */ absolutetime_to_nanoseconds(lei.lei_balance, &balance_ns); - usage_percent = (uint32_t) ((balance_ns * 100ULL) / lei.lei_last_refill); - - /* Show refill period in the same units as balance, limit, etc */ - nanoseconds_to_absolutetime(lei.lei_refill_period, &lei.lei_refill_period); + if (lei.lei_last_refill > 0) { + usage_percent = (uint32_t)((balance_ns*100ULL) / lei.lei_last_refill); + } - limit_percent = (uint32_t) ((lei.lei_limit * 100ULL) / lei.lei_refill_period); + /* TODO: show task total runtime (via TASK_ABSOLUTETIME_INFO)? */ + printf("process %s[%d] thread %llu caught burning CPU! " + "It used more than %d%% CPU over %u seconds " + "(actual recent usage: %d%% over ~%llu seconds). 
" + "Thread lifetime cpu usage %d.%06ds, (%d.%06d user, %d.%06d sys) " + "ledger balance: %lld mabs credit: %lld mabs debit: %lld mabs " + "limit: %llu mabs period: %llu ns last refill: %llu ns%s.\n", + procname, pid, tid, + percentage, interval_sec, + usage_percent, + (lei.lei_last_refill + NSEC_PER_SEC/2) / NSEC_PER_SEC, + thread_total_time.seconds, thread_total_time.microseconds, + thread_user_time.seconds, thread_user_time.microseconds, + thread_system_time.seconds,thread_system_time.microseconds, + lei.lei_balance, lei.lei_credit, lei.lei_debit, + lei.lei_limit, lei.lei_refill_period, lei.lei_last_refill, + (fatal ? " [fatal violation]" : "")); - /* TODO: show task total runtime as well? see TASK_ABSOLUTETIME_INFO */ + /* + For now, send RESOURCE_NOTIFY in parallel with EXC_RESOURCE. Once + we have logging parity, we will stop sending EXC_RESOURCE (24508922). + */ - if (disable_exc_resource) { - printf("process %s[%d] thread %llu caught burning CPU!; EXC_RESOURCE " - "supressed by a boot-arg\n", procname, pid, tid); - return; + /* RESOURCE_NOTIFY MIG specifies nanoseconds of CPU time */ + lei.lei_balance = balance_ns; + absolutetime_to_nanoseconds(lei.lei_limit, &lei.lei_limit); + trace_resource_violation(RMON_CPUUSAGE_VIOLATED, &lei); + kr = send_resource_violation(send_cpu_usage_violation, task, &lei, + fatal ? kRNFatalLimitFlag : 0); + if (kr) { + printf("send_resource_violation(CPU usage, ...): error %#x\n", kr); } - if (audio_active) { - printf("process %s[%d] thread %llu caught burning CPU!; EXC_RESOURCE " - "supressed due to audio playback\n", procname, pid, tid); - return; +#ifdef EXC_RESOURCE_MONITORS + if (send_exc_resource) { + if (disable_exc_resource) { + printf("process %s[%d] thread %llu caught burning CPU! " + "EXC_RESOURCE%s supressed by a boot-arg\n", + procname, pid, tid, fatal ? " (and termination)" : ""); + return; + } + + if (audio_active) { + printf("process %s[%d] thread %llu caught burning CPU! 
" + "EXC_RESOURCE & termination supressed due to audio playback\n", + procname, pid, tid); + return; + } } - printf("process %s[%d] thread %llu caught burning CPU! " - "It used more than %d%% CPU (Actual recent usage: %d%%) over %d seconds. " - "thread lifetime cpu usage %d.%06d seconds, (%d.%06d user, %d.%06d system) " - "ledger info: balance: %lld credit: %lld debit: %lld limit: %llu (%d%%) " - "period: %llu time since last refill (ns): %llu %s\n", - procname, pid, tid, - percentage, usage_percent, interval_sec, - thread_total_time.seconds, thread_total_time.microseconds, - thread_user_time.seconds, thread_user_time.microseconds, - thread_system_time.seconds, thread_system_time.microseconds, - lei.lei_balance, - lei.lei_credit, lei.lei_debit, - lei.lei_limit, limit_percent, - lei.lei_refill_period, lei.lei_last_refill, - (fatal ? "[fatal violation]" : "")); - - - code[0] = code[1] = 0; - EXC_RESOURCE_ENCODE_TYPE(code[0], RESOURCE_TYPE_CPU); - if (fatal) { - EXC_RESOURCE_ENCODE_FLAVOR(code[0], FLAVOR_CPU_MONITOR_FATAL); - }else { - EXC_RESOURCE_ENCODE_FLAVOR(code[0], FLAVOR_CPU_MONITOR); + + + if (send_exc_resource) { + code[0] = code[1] = 0; + EXC_RESOURCE_ENCODE_TYPE(code[0], RESOURCE_TYPE_CPU); + if (fatal) { + EXC_RESOURCE_ENCODE_FLAVOR(code[0], FLAVOR_CPU_MONITOR_FATAL); + }else { + EXC_RESOURCE_ENCODE_FLAVOR(code[0], FLAVOR_CPU_MONITOR); + } + EXC_RESOURCE_CPUMONITOR_ENCODE_INTERVAL(code[0], interval_sec); + EXC_RESOURCE_CPUMONITOR_ENCODE_PERCENTAGE(code[0], percentage); + EXC_RESOURCE_CPUMONITOR_ENCODE_PERCENTAGE(code[1], usage_percent); + exception_triage(EXC_RESOURCE, code, EXCEPTION_CODE_MAX); } - EXC_RESOURCE_CPUMONITOR_ENCODE_INTERVAL(code[0], interval_sec); - EXC_RESOURCE_CPUMONITOR_ENCODE_PERCENTAGE(code[0], limit_percent); - EXC_RESOURCE_CPUMONITOR_ENCODE_PERCENTAGE(code[1], usage_percent); - exception_triage(EXC_RESOURCE, code, EXCEPTION_CODE_MAX); +#endif /* EXC_RESOURCE_MONITORS */ if (fatal) { #if CONFIG_JETSAM @@ -1965,9 +2222,13 @@ void 
thread_update_io_stats(thread_t thread, int size, int io_flags) UPDATE_IO_STATS(thread->thread_io_stats->total_io, size); UPDATE_IO_STATS_ATOMIC(thread->task->task_io_stats->total_io, size); + if (!(io_flags & DKIO_READ)) { + DTRACE_IO3(physical_writes, struct task *, thread->task, uint32_t, size, int, io_flags); + ledger_credit(thread->task->ledger, task_ledgers.physical_writes, size); + } } -void +static void init_thread_ledgers(void) { ledger_template_t t; int idx; @@ -2153,6 +2414,39 @@ thread_sched_call( thread->sched_call = (call != NULL)? call: sched_call_null; } +sched_call_t +thread_disable_sched_call( + thread_t thread, + sched_call_t call) +{ + if (call) { + spl_t s = splsched(); + thread_lock(thread); + if (thread->sched_call == call) { + thread->sched_call = sched_call_null; + } else { + call = NULL; + } + thread_unlock(thread); + splx(s); + } + return call; +} + +void +thread_reenable_sched_call( + thread_t thread, + sched_call_t call) +{ + if (call) { + spl_t s = splsched(); + thread_lock(thread); + thread_sched_call(thread, call); + thread_unlock(thread); + splx(s); + } +} + void thread_static_param( thread_t thread, @@ -2181,19 +2475,24 @@ uint64_t thread_dispatchqaddr( thread_t thread) { - uint64_t dispatchqueue_addr = 0; - uint64_t thread_handle = 0; + uint64_t dispatchqueue_addr; + uint64_t thread_handle; - if (thread != THREAD_NULL) { - thread_handle = thread->machine.cthread_self; - - if (thread->inspection == TRUE) - dispatchqueue_addr = thread_handle + get_task_dispatchqueue_offset(thread->task); - else if (thread->task->bsd_info) - dispatchqueue_addr = thread_handle + get_dispatchqueue_offset_from_proc(thread->task->bsd_info); - } + if (thread == THREAD_NULL) + return 0; + + thread_handle = thread->machine.cthread_self; + if (thread_handle == 0) + return 0; + + if (thread->inspection == TRUE) + dispatchqueue_addr = thread_handle + get_task_dispatchqueue_offset(thread->task); + else if (thread->task->bsd_info) + dispatchqueue_addr = 
thread_handle + get_dispatchqueue_offset_from_proc(thread->task->bsd_info); + else + dispatchqueue_addr = 0; - return (dispatchqueue_addr); + return dispatchqueue_addr; } /* @@ -2509,6 +2808,28 @@ thread_get_current_voucher_origin_pid( return kr; } +boolean_t +thread_has_thread_name(thread_t th) +{ + if ((th) && (th->uthread)) { + return bsd_hasthreadname(th->uthread); + } + + /* + * This is an odd case; clients may set the thread name based on the lack of + * a name, but in this context there is no uthread to attach the name to. + */ + return FALSE; +} + +void +thread_set_thread_name(thread_t th, const char* name) +{ + if ((th) && (th->uthread) && name) { + bsd_setthreadname(th->uthread, name); + } +} + /* * thread_enable_send_importance - set/clear the SEND_IMPORTANCE thread option bit. */ @@ -2633,6 +2954,8 @@ void dtrace_thread_bootstrap(void) if (thread->t_dtrace_flags & TH_DTRACE_EXECSUCCESS) { thread->t_dtrace_flags &= ~TH_DTRACE_EXECSUCCESS; DTRACE_PROC(exec__success); + KDBG(BSDDBG_CODE(DBG_BSD_PROC,BSD_PROC_EXEC), + task_pid(task)); } DTRACE_PROC(start); } diff --git a/osfmk/kern/thread.h b/osfmk/kern/thread.h index 44918bbcf..c27489677 100644 --- a/osfmk/kern/thread.h +++ b/osfmk/kern/thread.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2015 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -127,7 +127,15 @@ #include #include + struct thread { + +#if MACH_ASSERT +#define THREAD_MAGIC 0x1234ABCDDCBA4321ULL + /* Ensure nothing uses &thread as a queue entry */ + uint64_t thread_magic; +#endif /* MACH_ASSERT */ + /* * NOTE: The runq field in the thread structure has an unusual * locking protocol. If its value is PROCESSOR_NULL, then it is @@ -140,11 +148,16 @@ struct thread { * New waitq APIs allow the 'links' and 'runq' fields to be * anywhere in the thread structure. 
*/ - /* Items examined often, modified infrequently */ - queue_chain_t links; /* run/wait queue links */ - processor_t runq; /* run queue assignment */ - event64_t wait_event; /* wait queue event */ - struct waitq *waitq; + union { + queue_chain_t runq_links; /* run queue links */ + queue_chain_t wait_links; /* wait queue links */ + }; + + processor_t runq; /* run queue assignment */ + + event64_t wait_event; /* wait queue event */ + struct waitq *waitq; /* wait queue this thread is enqueued on */ + /* Data updated during assert_wait/thread_wakeup */ #if __SMP__ decl_simple_lock_data(,sched_lock) /* scheduling lock (thread_lock()) */ @@ -195,14 +208,18 @@ struct thread { sched_mode_t sched_mode; /* scheduling mode */ sched_mode_t saved_mode; /* saved mode during forced mode demotion */ + /* This thread's contribution to global sched counters */ + sched_bucket_t th_sched_bucket; + sfi_class_id_t sfi_class; /* SFI class (XXX Updated on CSW/QE/AST) */ sfi_class_id_t sfi_wait_class; /* Currently in SFI wait for this class, protected by sfi_lock */ - + + uint32_t sched_flags; /* current flag bits */ /* TH_SFLAG_FAIRSHARE_TRIPPED (unused) 0x0001 */ #define TH_SFLAG_FAILSAFE 0x0002 /* fail-safe has tripped */ -#define TH_SFLAG_THROTTLED 0x0004 /* thread treated as background for scheduler decay purposes */ -#define TH_SFLAG_DEMOTED_MASK (TH_SFLAG_THROTTLE_DEMOTED | TH_SFLAG_FAILSAFE) /* saved_mode contains previous sched_mode */ +#define TH_SFLAG_THROTTLED 0x0004 /* throttled thread forced to timeshare mode (may be applied in addition to failsafe) */ +#define TH_SFLAG_DEMOTED_MASK (TH_SFLAG_THROTTLED | TH_SFLAG_FAILSAFE) /* saved_mode contains previous sched_mode */ #define TH_SFLAG_PROMOTED 0x0008 /* sched pri has been promoted */ #define TH_SFLAG_ABORT 0x0010 /* abort interruptible waits */ @@ -211,10 +228,10 @@ struct thread { #define TH_SFLAG_DEPRESS 0x0040 /* normal depress yield */ #define TH_SFLAG_POLLDEPRESS 0x0080 /* polled depress yield */ #define 
TH_SFLAG_DEPRESSED_MASK (TH_SFLAG_DEPRESS | TH_SFLAG_POLLDEPRESS) -#define TH_SFLAG_PRI_UPDATE 0x0100 /* Updating priority */ +/* unused TH_SFLAG_PRI_UPDATE 0x0100 */ #define TH_SFLAG_EAGERPREEMPT 0x0200 /* Any preemption of this thread should be treated as if AST_URGENT applied */ #define TH_SFLAG_RW_PROMOTED 0x0400 /* sched pri has been promoted due to blocking with RW lock held */ -#define TH_SFLAG_THROTTLE_DEMOTED 0x0800 /* throttled thread forced to timeshare mode (may be applied in addition to failsafe) */ +/* unused TH_SFLAG_THROTTLE_DEMOTED 0x0800 */ #define TH_SFLAG_WAITQ_PROMOTED 0x1000 /* sched pri promoted from waitq wakeup (generally for IPC receive) */ #define TH_SFLAG_PROMOTED_MASK (TH_SFLAG_PROMOTED | TH_SFLAG_RW_PROMOTED | TH_SFLAG_WAITQ_PROMOTED) @@ -238,10 +255,6 @@ struct thread { uint32_t rwlock_count; /* Number of lck_rw_t locks held by thread */ -#if MACH_ASSERT - uint32_t SHARE_COUNT, BG_COUNT; /* This thread's contribution to global sched counters (temporary debugging) */ -#endif /* MACH_ASSERT */ - integer_t importance; /* task-relative importance */ uint32_t was_promoted_on_wakeup; @@ -305,6 +318,7 @@ struct thread { uint64_t vtimer_user_save; /* saved values for vtimers */ uint64_t vtimer_prof_save; uint64_t vtimer_rlim_save; + uint64_t vtimer_qos_save; #if CONFIG_SCHED_SFI /* Timing for wait state */ @@ -331,10 +345,18 @@ struct thread { mach_port_seqno_t seqno; /* seqno of recvd message */ ipc_object_t object; /* object received on */ mach_vm_address_t msg_addr; /* receive buffer pointer */ - mach_msg_size_t msize; /* max size for recvd msg */ + mach_msg_size_t rsize; /* max size for recvd msg */ + mach_msg_size_t msize; /* actual size for recvd msg */ mach_msg_option_t option; /* options for receive */ mach_port_name_t receiver_name; /* the receive port name */ - struct ipc_kmsg *kmsg; /* received message */ + union { + struct ipc_kmsg *kmsg; /* received message */ + struct ipc_mqueue *peekq; /* mqueue to peek at */ + struct { + 
mach_msg_priority_t qos; /* received message qos */ + mach_msg_priority_t oqos; /* override qos for message */ + } received_qos; + }; mach_msg_continue_t continuation; } receive; struct { @@ -390,13 +412,15 @@ struct thread { /* Miscellaneous bits guarded by mutex */ uint32_t - active:1, /* Thread is active and has not been terminated */ - started:1, /* Thread has been started after creation */ - static_param:1, /* Disallow policy parameter changes */ - inspection:1, /* TRUE when task is being inspected by crash reporter */ - policy_reset:1, /* Disallow policy parameter changes on terminating threads */ + active:1, /* Thread is active and has not been terminated */ + started:1, /* Thread has been started after creation */ + static_param:1, /* Disallow policy parameter changes */ + inspection:1, /* TRUE when task is being inspected by crash reporter */ + policy_reset:1, /* Disallow policy parameter changes on terminating threads */ + suspend_parked:1, /* thread parked in thread_suspended */ + corpse_dup:1, /* TRUE when thread is an inactive duplicate in a corpse */ :0; - + /* Ports associated with this thread */ struct ipc_port *ith_self; /* not a right, doesn't hold ref */ struct ipc_port *ith_sself; /* a send right */ @@ -422,32 +446,30 @@ struct thread { uint64_t t_page_creation_throttled_soft; #endif /* DEVELOPMENT || DEBUG */ -#define T_CHUD_MARKED 0x01 /* this thread is marked by CHUD */ -#define T_IN_CHUD 0x02 /* this thread is already in a CHUD handler */ -#define THREAD_PMC_FLAG 0x04 /* Bit in "t_chud" signifying PMC interest */ -#define T_AST_CALLSTACK 0x08 /* Thread scheduled to dump a - * callstack on its next - * AST */ -#define T_AST_NAME 0x10 /* Thread scheduled to dump - * its name on its next - * AST */ -#define T_NAME_DONE 0x20 /* Thread has previously - * recorded its name */ -#define T_KPC_ALLOC 0x40 /* Thread needs a kpc_buf */ - - uint32_t t_chud; /* CHUD flags, used for Shark */ - uint32_t chud_c_switch; /* last dispatch detection */ +#ifdef 
KPERF +/* The high 7 bits are the number of frames to sample of a user callstack. */ +#define T_KPERF_CALLSTACK_DEPTH_OFFSET (25) +#define T_KPERF_SET_CALLSTACK_DEPTH(DEPTH) (((uint32_t)(DEPTH)) << T_KPERF_CALLSTACK_DEPTH_OFFSET) +#define T_KPERF_GET_CALLSTACK_DEPTH(FLAGS) ((FLAGS) >> T_KPERF_CALLSTACK_DEPTH_OFFSET) +#endif + +#define T_KPERF_AST_CALLSTACK (1U << 0) /* dump a callstack on thread's next AST */ +#define T_KPERF_AST_DISPATCH (1U << 1) /* dump a name on thread's next AST */ +#define T_KPC_ALLOC (1U << 2) /* thread needs a kpc_buf allocated */ +/* only go up to T_KPERF_CALLSTACK_DEPTH_OFFSET - 1 */ + +#ifdef KPERF + uint32_t kperf_flags; + uint32_t kperf_pet_gen; /* last generation of PET that sampled this thread*/ + uint32_t kperf_c_switch; /* last dispatch detection */ + uint32_t kperf_pet_cnt; /* how many times a thread has been sampled by PET */ +#endif #ifdef KPC /* accumulated performance counters for this thread */ uint64_t *kpc_buf; #endif -#ifdef KPERF - /* count of how many times a thread has been sampled since it was last scheduled */ - uint64_t kperf_pet_cnt; -#endif - #if HYPERVISOR /* hypervisor virtual CPU object associated with this thread */ void *hv_thread_target; @@ -460,16 +482,14 @@ struct thread { uint32_t syscalls_mach; ledger_t t_ledger; ledger_t t_threadledger; /* per thread ledger */ - uint64_t cpu_time_last_qos; #ifdef CONFIG_BANK ledger_t t_bankledger; /* ledger to charge someone */ uint64_t t_deduct_bank_ledger_time; /* cpu time to be deducted from bank ledger */ #endif - /* policy is protected by the task lock */ - struct task_requested_policy requested_policy; - struct task_effective_policy effective_policy; - struct task_pended_policy pended_policy; + /* policy is protected by the thread mutex */ + struct thread_requested_policy requested_policy; + struct thread_effective_policy effective_policy; /* usynch override is protected by the task lock, eventually will be thread mutex */ struct thread_qos_override { @@ -480,8 
+500,11 @@ struct thread { user_addr_t override_resource; } *overrides; + uint32_t ipc_overrides; + uint32_t user_promotions; + uint16_t user_promotion_basepri; + int iotier_override; /* atomic operations to set, cleared on ret to user */ - integer_t saved_importance; /* saved task-relative importance */ io_stat_info_t thread_io_stats; /* per-thread I/O statistics */ @@ -506,23 +529,39 @@ struct thread { /*** Machine-dependent state ***/ struct machine_thread machine; + +#if SCHED_TRACE_THREAD_WAKEUPS + uintptr_t thread_wakeup_bt[64]; +#endif }; -#define ith_state saved.receive.state -#define ith_object saved.receive.object -#define ith_msg_addr saved.receive.msg_addr -#define ith_msize saved.receive.msize -#define ith_option saved.receive.option -#define ith_receiver_name saved.receive.receiver_name -#define ith_continuation saved.receive.continuation -#define ith_kmsg saved.receive.kmsg -#define ith_seqno saved.receive.seqno - -#define sth_waitsemaphore saved.sema.waitsemaphore -#define sth_signalsemaphore saved.sema.signalsemaphore -#define sth_options saved.sema.options -#define sth_result saved.sema.result -#define sth_continuation saved.sema.continuation +#define ith_state saved.receive.state +#define ith_object saved.receive.object +#define ith_msg_addr saved.receive.msg_addr +#define ith_rsize saved.receive.rsize +#define ith_msize saved.receive.msize +#define ith_option saved.receive.option +#define ith_receiver_name saved.receive.receiver_name +#define ith_continuation saved.receive.continuation +#define ith_kmsg saved.receive.kmsg +#define ith_peekq saved.receive.peekq +#define ith_qos saved.receive.received_qos.qos +#define ith_qos_override saved.receive.received_qos.oqos +#define ith_seqno saved.receive.seqno + +#define sth_waitsemaphore saved.sema.waitsemaphore +#define sth_signalsemaphore saved.sema.signalsemaphore +#define sth_options saved.sema.options +#define sth_result saved.sema.result +#define sth_continuation saved.sema.continuation + +#if 
MACH_ASSERT +#define assert_thread_magic(thread) assertf((thread)->thread_magic == THREAD_MAGIC, \ + "bad thread magic 0x%llx for thread %p, expected 0x%llx", \ + (thread)->thread_magic, (thread), THREAD_MAGIC) +#else +#define assert_thread_magic(thread) do { (void)(thread); } while (0) +#endif extern void thread_bootstrap(void); @@ -550,12 +589,25 @@ extern void thread_terminate_self(void); extern kern_return_t thread_terminate_internal( thread_t thread); -extern void thread_start_internal( +extern void thread_start( thread_t thread) __attribute__ ((noinline)); +extern void thread_start_in_assert_wait( + thread_t thread, + event_t event, + wait_interrupt_t interruptible) __attribute__ ((noinline)); + extern void thread_terminate_enqueue( thread_t thread); +extern void thread_exception_enqueue( + task_t task, + thread_t thread); + +extern void thread_copy_resource_info( + thread_t dst_thread, + thread_t src_thread); + extern void thread_terminate_crashed_threads(void); extern void thread_stack_enqueue( @@ -567,6 +619,8 @@ extern void thread_hold( extern void thread_release( thread_t thread); +extern void thread_corpse_continue(void); + /* Locking for scheduler state, always acquired with interrupts disabled (splsched()) */ #if __SMP__ #define thread_lock_init(th) simple_lock_init(&(th)->sched_lock, 0) @@ -615,18 +669,7 @@ extern kern_return_t thread_info_internal( thread_info_t thread_info_out, mach_msg_type_number_t *thread_info_count); -extern void thread_task_priority( - thread_t thread, - integer_t priority, - integer_t max_priority); -extern kern_return_t thread_set_mode_and_absolute_pri( - thread_t thread, - integer_t policy, - integer_t priority); - -extern void thread_policy_reset( - thread_t thread); extern kern_return_t kernel_thread_create( thread_continue_t continuation, @@ -662,9 +705,6 @@ extern void machine_load_context( extern kern_return_t machine_thread_state_initialize( thread_t thread); -extern kern_return_t machine_thread_neon_state_initialize( 
- thread_t thread); - extern kern_return_t machine_thread_set_state( thread_t thread, thread_flavor_t flavor, @@ -713,16 +753,9 @@ extern kern_return_t machine_thread_set_tsd_base( #define thread_mtx_try(thread) lck_mtx_try_lock(&(thread)->mutex) #define thread_mtx_unlock(thread) lck_mtx_unlock(&(thread)->mutex) -extern void install_special_handler( - thread_t thread); +extern void thread_apc_ast(thread_t thread); -extern void special_handler( - thread_t thread); - -extern void -thread_update_qos_cpu_time( - thread_t thread, - boolean_t lock_needed); +extern void thread_update_qos_cpu_time(thread_t thread); void act_machine_sv_free(thread_t, int); @@ -737,12 +770,6 @@ static inline uint16_t thread_get_tag_internal(thread_t thread) { return thread->thread_tag; } -typedef struct { - int qos_pri[THREAD_QOS_LAST]; - int qos_iotier[THREAD_QOS_LAST]; - uint32_t qos_through_qos[THREAD_QOS_LAST]; - uint32_t qos_latency_qos[THREAD_QOS_LAST]; -} qos_policy_params_t; extern void thread_set_options(uint32_t thopt); @@ -793,21 +820,12 @@ __BEGIN_DECLS #define THREAD_TAG_CALLOUT 0x2 #define THREAD_TAG_IOWORKLOOP 0x4 +#define THREAD_TAG_PTHREAD 0x10 +#define THREAD_TAG_WORKQUEUE 0x20 + uint16_t thread_set_tag(thread_t, uint16_t); uint16_t thread_get_tag(thread_t); -/* - * Allocate/assign a single work interval ID for a thread, - * and support deallocating it. 
- */ -extern kern_return_t thread_policy_create_work_interval( - thread_t thread, - uint64_t *work_interval_id); - -extern kern_return_t thread_policy_destroy_work_interval( - thread_t thread, - uint64_t work_interval_id); - extern kern_return_t thread_state_initialize( thread_t thread); @@ -833,9 +851,17 @@ extern kern_return_t thread_create_workq( thread_continue_t thread_return, thread_t *new_thread); +extern kern_return_t thread_create_workq_waiting( + task_t task, + thread_continue_t thread_return, + event_t event, + thread_t *new_thread); + extern void thread_yield_internal( mach_msg_timeout_t interval); +extern void thread_yield_to_preemption(void); + /* * Thread-private CPU limits: apply a private CPU limit to this thread only. Available actions are: * @@ -889,7 +915,8 @@ extern kern_return_t thread_userstack( thread_state_t, unsigned int, mach_vm_offset_t *, - int *); + int *, + boolean_t); extern kern_return_t thread_entrypoint( thread_t, @@ -899,8 +926,8 @@ extern kern_return_t thread_entrypoint( mach_vm_offset_t *); extern kern_return_t thread_userstackdefault( - thread_t, - mach_vm_offset_t *); + mach_vm_offset_t *, + boolean_t); extern kern_return_t thread_wire_internal( host_priv_t host_priv, @@ -911,6 +938,8 @@ extern kern_return_t thread_wire_internal( extern kern_return_t thread_dup(thread_t); +extern kern_return_t thread_dup2(thread_t, thread_t); + typedef void (*sched_call_t)( int type, thread_t thread); @@ -922,6 +951,14 @@ extern void thread_sched_call( thread_t thread, sched_call_t call); +extern sched_call_t thread_disable_sched_call( + thread_t thread, + sched_call_t call); + +extern void thread_reenable_sched_call( + thread_t thread, + sched_call_t call); + extern void thread_static_param( thread_t thread, boolean_t state); @@ -929,16 +966,6 @@ extern void thread_static_param( extern boolean_t thread_is_static_param( thread_t thread); -extern kern_return_t thread_policy_set_internal( - thread_t thread, - thread_policy_flavor_t flavor, - 
thread_policy_t policy_info, - mach_msg_type_number_t count); - -extern boolean_t thread_has_qos_policy(thread_t thread); - -extern kern_return_t thread_remove_qos_policy(thread_t thread); - extern task_t get_threadtask(thread_t); #define thread_is_64bit(thd) \ task_has_64BitAddr(get_threadtask(thd)) @@ -948,7 +975,7 @@ extern void *get_bsdthread_info(thread_t); extern void set_bsdthread_info(thread_t, void *); extern void *uthread_alloc(task_t, thread_t, int); extern void uthread_cleanup_name(void *uthread); -extern void uthread_cleanup(task_t, void *, void *, boolean_t); +extern void uthread_cleanup(task_t, void *, void *); extern void uthread_zone_free(void *); extern void uthread_cred_free(void *); @@ -968,6 +995,7 @@ extern int is_64signalregset(void); extern void act_set_kperf(thread_t); extern void set_astledger(thread_t thread); +extern void act_set_io_telemetry_ast(thread_t); extern uint32_t dtrace_get_thread_predcache(thread_t); extern int64_t dtrace_get_thread_vtime(thread_t); @@ -1010,8 +1038,47 @@ extern kern_return_t thread_get_current_voucher_origin_pid(int32_t *pid); extern void set_thread_rwlock_boost(void); extern void clear_thread_rwlock_boost(void); +/*! @function thread_has_thread_name + @abstract Checks if a thread has a name. + @discussion This function takes one input, a thread, and returns a boolean value indicating if that thread already has a name associated with it. + @param th The thread to inspect. + @result TRUE if the thread has a name, FALSE otherwise. +*/ +extern boolean_t thread_has_thread_name(thread_t th); + +/*! @function thread_set_thread_name + @abstract Set a thread's name. + @discussion This function takes two input parameters: a thread to name, and the name to apply to the thread. The name will be attached to the thread in order to better identify the thread. + @param th The thread to be named. + @param name The name to apply to the thread. 
+*/ +extern void thread_set_thread_name(thread_t th, const char* name); + extern void thread_enable_send_importance(thread_t thread, boolean_t enable); +/* Get a backtrace for a threads kernel or user stack (user_p), with pc and optionally + * frame pointer (getfp). Returns bytes added to buffer, and kThreadTruncatedBT in + * thread_trace_flags if a user page is not present after kdp_lightweight_fault() is + * called. + */ + +extern int machine_trace_thread( + thread_t thread, + char *tracepos, + char *tracebound, + int nframes, + boolean_t user_p, + boolean_t getfp, + uint32_t *thread_trace_flags); + +extern int machine_trace_thread64(thread_t thread, + char *tracepos, + char *tracebound, + int nframes, + boolean_t user_p, + boolean_t getfp, + uint32_t *thread_trace_flags); + #endif /* XNU_KERNEL_PRIVATE */ @@ -1032,6 +1099,7 @@ extern kern_return_t kernel_thread_start( void thread_set_eager_preempt(thread_t thread); void thread_clear_eager_preempt(thread_t thread); extern ipc_port_t convert_thread_to_port(thread_t); +extern boolean_t is_vm_privileged(void); extern boolean_t set_vm_privilege(boolean_t); #endif /* KERNEL_PRIVATE */ diff --git a/osfmk/kern/thread_act.c b/osfmk/kern/thread_act.c index 1d1376e81..1047d59af 100644 --- a/osfmk/kern/thread_act.c +++ b/osfmk/kern/thread_act.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -51,8 +51,6 @@ */ #include #include -#include -#include #include #include @@ -76,32 +74,77 @@ #include #include -#include - #include -void act_abort(thread_t); -void install_special_handler_locked(thread_t); -void special_handler_continue(void); +static void act_abort(thread_t thread); + +static void thread_suspended(void *arg, wait_result_t result); +static void thread_set_apc_ast(thread_t thread); +static void thread_set_apc_ast_locked(thread_t thread); /* * Internal routine to mark a thread as started. * Always called with the thread mutex locked. - * - * Note: function intentionally declared with the noinline attribute to - * prevent multiple declaration of probe symbols in this file; we would - * prefer "#pragma noinline", but gcc does not support it. - * PR-6385749 -- the lwp-start probe should fire from within the context - * of the newly created thread. Commented out for now, in case we - * turn it into a dead code probe. */ void -thread_start_internal( +thread_start( thread_t thread) { clear_wait(thread, THREAD_AWAKENED); thread->started = TRUE; - // DTRACE_PROC1(lwp__start, thread_t, thread); +} + +/* + * Internal routine to mark a thread as waiting + * right after it has been created. The caller + * is responsible to call wakeup()/thread_wakeup() + * or thread_terminate() to get it going. + * + * Always called with the thread mutex locked. + * + * Task and task_threads mutexes also held + * (so nobody can set the thread running before + * this point) + * + * Converts TH_UNINT wait to THREAD_INTERRUPTIBLE + * to allow termination from this point forward. 
+ */ +void +thread_start_in_assert_wait( + thread_t thread, + event_t event, + wait_interrupt_t interruptible) +{ + struct waitq *waitq = assert_wait_queue(event); + wait_result_t wait_result; + spl_t spl; + + spl = splsched(); + waitq_lock(waitq); + + /* clear out startup condition (safe because thread not started yet) */ + thread_lock(thread); + assert(!thread->started); + assert((thread->state & (TH_WAIT | TH_UNINT)) == (TH_WAIT | TH_UNINT)); + thread->state &= ~(TH_WAIT | TH_UNINT); + thread_unlock(thread); + + /* assert wait interruptibly forever */ + wait_result = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event), + interruptible, + TIMEOUT_URGENCY_SYS_NORMAL, + TIMEOUT_WAIT_FOREVER, + TIMEOUT_NO_LEEWAY, + thread); + assert (wait_result == THREAD_WAITING); + + /* mark thread started while we still hold the waitq lock */ + thread_lock(thread); + thread->started = TRUE; + thread_unlock(thread); + + waitq_unlock(waitq); + splx(spl); } /* @@ -124,7 +167,7 @@ thread_terminate_internal( if (thread->started) clear_wait(thread, THREAD_INTERRUPTED); else { - thread_start_internal(thread); + thread_start(thread); } } else @@ -162,8 +205,7 @@ thread_terminate( /* * If a kernel thread is terminating itself, force an AST here. * Kernel threads don't normally pass through the AST checking - * code - and all threads finish their own termination in the - * special handler APC. + * code - and all threads finish their own termination in mach_apc_ast. */ if (thread->task == kernel_task) { ml_set_interrupts_enabled(FALSE); @@ -182,13 +224,11 @@ thread_terminate( * Called with thread mutex held. 
*/ void -thread_hold( - register thread_t thread) +thread_hold(thread_t thread) { if (thread->suspend_count++ == 0) { - install_special_handler(thread); - if (thread->started) - thread_wakeup_one(&thread->suspend_count); + thread_set_apc_ast(thread); + assert(thread->suspend_parked == FALSE); } } @@ -196,28 +236,34 @@ thread_hold( * Decrement internal suspension count, setting thread * runnable when count falls to zero. * + * Because the wait is abortsafe, we can't be guaranteed that the thread + * is currently actually waiting even if suspend_parked is set. + * * Called with thread mutex held. */ void -thread_release( - register thread_t thread) +thread_release(thread_t thread) { - if ( thread->suspend_count > 0 && - --thread->suspend_count == 0 ) { - if (thread->started) - thread_wakeup_one(&thread->suspend_count); - else { - thread_start_internal(thread); + assertf(thread->suspend_count > 0, "thread %p over-resumed", thread); + + /* fail-safe on non-assert builds */ + if (thread->suspend_count == 0) + return; + + if (--thread->suspend_count == 0) { + if (!thread->started) { + thread_start(thread); + } else if (thread->suspend_parked) { + thread->suspend_parked = FALSE; + thread_wakeup_thread(&thread->suspend_count, thread); } } } kern_return_t -thread_suspend( - register thread_t thread) +thread_suspend(thread_t thread) { - thread_t self = current_thread(); - kern_return_t result = KERN_SUCCESS; + kern_return_t result = KERN_SUCCESS; if (thread == THREAD_NULL || thread->task == kernel_task) return (KERN_INVALID_ARGUMENT); @@ -225,29 +271,24 @@ thread_suspend( thread_mtx_lock(thread); if (thread->active) { - if ( thread->user_stop_count++ == 0 && - thread->suspend_count++ == 0 ) { - install_special_handler(thread); - if (thread != self) - thread_wakeup_one(&thread->suspend_count); - } - } - else + if (thread->user_stop_count++ == 0) + thread_hold(thread); + } else { result = KERN_TERMINATED; + } thread_mtx_unlock(thread); - if (thread != self && result == 
KERN_SUCCESS) + if (thread != current_thread() && result == KERN_SUCCESS) thread_wait(thread, FALSE); return (result); } kern_return_t -thread_resume( - register thread_t thread) +thread_resume(thread_t thread) { - kern_return_t result = KERN_SUCCESS; + kern_return_t result = KERN_SUCCESS; if (thread == THREAD_NULL || thread->task == kernel_task) return (KERN_INVALID_ARGUMENT); @@ -256,20 +297,14 @@ thread_resume( if (thread->active) { if (thread->user_stop_count > 0) { - if ( --thread->user_stop_count == 0 && - --thread->suspend_count == 0 ) { - if (thread->started) - thread_wakeup_one(&thread->suspend_count); - else { - thread_start_internal(thread); - } - } - } - else + if (--thread->user_stop_count == 0) + thread_release(thread); + } else { result = KERN_FAILURE; - } - else + } + } else { result = KERN_TERMINATED; + } thread_mtx_unlock(thread); @@ -283,7 +318,7 @@ thread_resume( */ kern_return_t thread_depress_abort( - register thread_t thread) + thread_t thread) { kern_return_t result; @@ -304,12 +339,12 @@ thread_depress_abort( /* - * Indicate that the activation should run its - * special handler to detect a condition. + * Indicate that the thread should run the AST_APC callback + * to detect an abort condition. * * Called with thread mutex held. 
*/ -void +static void act_abort( thread_t thread) { @@ -319,18 +354,18 @@ act_abort( if (!(thread->sched_flags & TH_SFLAG_ABORT)) { thread->sched_flags |= TH_SFLAG_ABORT; - install_special_handler_locked(thread); - } - else + thread_set_apc_ast_locked(thread); + } else { thread->sched_flags &= ~TH_SFLAG_ABORTSAFELY; + } thread_unlock(thread); splx(s); } - + kern_return_t thread_abort( - register thread_t thread) + thread_t thread) { kern_return_t result = KERN_SUCCESS; @@ -370,15 +405,15 @@ thread_abort_safely( clear_wait_internal(thread, THREAD_INTERRUPTED) != KERN_SUCCESS) { if (!(thread->sched_flags & TH_SFLAG_ABORT)) { thread->sched_flags |= TH_SFLAG_ABORTED_MASK; - install_special_handler_locked(thread); + thread_set_apc_ast_locked(thread); } } thread_unlock(thread); splx(s); - } - else + } else { result = KERN_TERMINATED; - + } + thread_mtx_unlock(thread); return (result); @@ -416,7 +451,7 @@ thread_info( kern_return_t thread_get_state( - register thread_t thread, + thread_t thread, int flavor, thread_state_t state, /* pointer to OUT array */ mach_msg_type_number_t *state_count) /*IN/OUT*/ @@ -470,7 +505,7 @@ thread_get_state( */ static kern_return_t thread_set_state_internal( - register thread_t thread, + thread_t thread, int flavor, thread_state_t state, mach_msg_type_number_t state_count, @@ -520,14 +555,14 @@ thread_set_state_internal( /* No prototype, since thread_act_server.h has the _from_user version if KERNEL_SERVER */ kern_return_t thread_set_state( - register thread_t thread, + thread_t thread, int flavor, thread_state_t state, mach_msg_type_number_t state_count); kern_return_t thread_set_state( - register thread_t thread, + thread_t thread, int flavor, thread_state_t state, mach_msg_type_number_t state_count) @@ -537,7 +572,7 @@ thread_set_state( kern_return_t thread_set_state_from_user( - register thread_t thread, + thread_t thread, int flavor, thread_state_t state, mach_msg_type_number_t state_count) @@ -554,7 +589,7 @@ 
thread_set_state_from_user( */ kern_return_t thread_state_initialize( - register thread_t thread) + thread_t thread) { kern_return_t result = KERN_SUCCESS; @@ -595,7 +630,7 @@ thread_state_initialize( kern_return_t thread_dup( - register thread_t target) + thread_t target) { thread_t self = current_thread(); kern_return_t result = KERN_SUCCESS; @@ -633,6 +668,54 @@ thread_dup( } +kern_return_t +thread_dup2( + thread_t source, + thread_t target) +{ + kern_return_t result = KERN_SUCCESS; + uint32_t active = 0; + + if (source == THREAD_NULL || target == THREAD_NULL || target == source) + return (KERN_INVALID_ARGUMENT); + + thread_mtx_lock(source); + active = source->active; + thread_mtx_unlock(source); + + if (!active) { + return KERN_TERMINATED; + } + + thread_mtx_lock(target); + + if (target->active || target->inspection) { + thread_hold(target); + + thread_mtx_unlock(target); + + if (thread_stop(target, TRUE)) { + thread_mtx_lock(target); + result = machine_thread_dup(source, target); + if (source->affinity_set != AFFINITY_SET_NULL) + thread_affinity_dup(source, target); + thread_unstop(target); + } + else { + thread_mtx_lock(target); + result = KERN_ABORTED; + } + + thread_release(target); + } + else + result = KERN_TERMINATED; + + thread_mtx_unlock(target); + + return (result); +} + /* * thread_setstatus: * @@ -641,7 +724,7 @@ thread_dup( */ kern_return_t thread_setstatus( - register thread_t thread, + thread_t thread, int flavor, thread_state_t tstate, mach_msg_type_number_t count) @@ -657,7 +740,7 @@ thread_setstatus( */ kern_return_t thread_getstatus( - register thread_t thread, + thread_t thread, int flavor, thread_state_t tstate, mach_msg_type_number_t *count) @@ -711,56 +794,57 @@ thread_set_tsd_base( } /* - * install_special_handler: + * thread_set_apc_ast: * - * Install the special returnhandler that handles suspension and - * termination, if it hasn't been installed already. 
+ * Register the AST_APC callback that handles suspension and + * termination, if it hasn't been installed already. * - * Called with the thread mutex held. + * Called with the thread mutex held. */ -void -install_special_handler( - thread_t thread) +static void +thread_set_apc_ast(thread_t thread) { - spl_t s = splsched(); + spl_t s = splsched(); thread_lock(thread); - install_special_handler_locked(thread); + thread_set_apc_ast_locked(thread); thread_unlock(thread); + splx(s); } /* - * install_special_handler_locked: + * thread_set_apc_ast_locked: * - * Do the work of installing the special_handler. + * Do the work of registering for the AST_APC callback. * - * Called with the thread mutex and scheduling lock held. + * Called with the thread mutex and scheduling lock held. */ -void -install_special_handler_locked( - thread_t thread) +static void +thread_set_apc_ast_locked(thread_t thread) { - /* * Temporarily undepress, so target has * a chance to do locking required to - * block itself in special_handler(). + * block itself in thread_suspended. + * + * Leaves the depress flag set so we can reinstate when it's blocked. */ if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) thread_recompute_sched_pri(thread, TRUE); thread_ast_set(thread, AST_APC); - if (thread == current_thread()) + if (thread == current_thread()) { ast_propagate(thread->ast); - else { - processor_t processor = thread->last_processor; + } else { + processor_t processor = thread->last_processor; - if ( processor != PROCESSOR_NULL && - processor->state == PROCESSOR_RUNNING && - processor->active_thread == thread ) + if (processor != PROCESSOR_NULL && + processor->state == PROCESSOR_RUNNING && + processor->active_thread == thread) { cause_ast_check(processor); + } } } @@ -770,31 +854,36 @@ install_special_handler_locked( */ /* - * special_handler_continue + * thread_suspended * - * Continuation routine for the special handler blocks. It checks + * Continuation routine for thread suspension. 
It checks * to see whether there has been any new suspensions. If so, it - * installs the special handler again. Otherwise, it checks to see + * installs the AST_APC handler again. Otherwise, it checks to see * if the current depression needs to be re-instated (it may have * been temporarily removed in order to get to this point in a hurry). */ -void -special_handler_continue(void) +__attribute__((noreturn)) +static void +thread_suspended(__unused void *parameter, wait_result_t result) { - thread_t thread = current_thread(); + thread_t thread = current_thread(); thread_mtx_lock(thread); - if (thread->suspend_count > 0) - install_special_handler(thread); - else { - spl_t s = splsched(); + if (result == THREAD_INTERRUPTED) + thread->suspend_parked = FALSE; + else + assert(thread->suspend_parked == FALSE); + + if (thread->suspend_count > 0) { + thread_set_apc_ast(thread); + } else { + spl_t s = splsched(); thread_lock(thread); if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) { - processor_t myprocessor = thread->last_processor; - thread->sched_pri = DEPRESSPRI; + thread->last_processor->current_pri = thread->sched_pri; KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHANGE_PRIORITY), (uintptr_t)thread_tid(thread), @@ -802,8 +891,6 @@ special_handler_continue(void) thread->sched_pri, 0, /* eventually, 'reason' */ 0); - - myprocessor->current_pri = thread->sched_pri; } thread_unlock(thread); splx(s); @@ -816,41 +903,44 @@ special_handler_continue(void) } /* - * special_handler - handles suspension, termination. Called - * with nothing locked. Returns (if it returns) the same way. + * thread_apc_ast - handles AST_APC and drives thread suspension and termination. + * Called with nothing locked. Returns (if it returns) the same way. 
*/ void -special_handler( - thread_t thread) +thread_apc_ast(thread_t thread) { - spl_t s; - thread_mtx_lock(thread); - s = splsched(); + assert(thread->suspend_parked == FALSE); + + spl_t s = splsched(); thread_lock(thread); + + /* TH_SFLAG_POLLDEPRESS is OK to have here */ + assert((thread->sched_flags & TH_SFLAG_DEPRESS) == 0); + thread->sched_flags &= ~TH_SFLAG_ABORTED_MASK; thread_unlock(thread); splx(s); - /* - * If we're suspended, go to sleep and wait for someone to wake us up. - */ - if (thread->active) { - if (thread->suspend_count > 0) { - assert_wait(&thread->suspend_count, THREAD_ABORTSAFE); - thread_mtx_unlock(thread); - thread_block((thread_continue_t)special_handler_continue); - /*NOTREACHED*/ - } - } - else { + if (!thread->active) { + /* Thread is ready to terminate, time to tear it down */ thread_mtx_unlock(thread); thread_terminate_self(); /*NOTREACHED*/ } + /* If we're suspended, go to sleep and wait for someone to wake us up. */ + if (thread->suspend_count > 0) { + thread->suspend_parked = TRUE; + assert_wait(&thread->suspend_count, THREAD_ABORTSAFE); + thread_mtx_unlock(thread); + + thread_block(thread_suspended); + /*NOTREACHED*/ + } + thread_mtx_unlock(thread); } @@ -963,4 +1053,9 @@ set_astledger(thread_t thread) act_set_ast(thread, AST_LEDGER); } +void +act_set_io_telemetry_ast(thread_t thread) +{ + act_set_ast(thread, AST_TELEMETRY_IO); +} diff --git a/osfmk/kern/thread_call.c b/osfmk/kern/thread_call.c index 6e406e8b7..f84423716 100644 --- a/osfmk/kern/thread_call.c +++ b/osfmk/kern/thread_call.c @@ -81,15 +81,28 @@ typedef struct thread_call_group *thread_call_group_t; #define TCG_PARALLEL 0x01 #define TCG_DEALLOC_ACTIVE 0x02 +#define TCG_CONTINUOUS 0x04 -#define THREAD_CALL_GROUP_COUNT 4 +#define THREAD_CALL_PRIO_COUNT 4 +#define THREAD_CALL_ABSTIME_COUNT 4 +#define THREAD_CALL_CONTTIME_COUNT 4 +#define THREAD_CALL_GROUP_COUNT (THREAD_CALL_CONTTIME_COUNT + THREAD_CALL_ABSTIME_COUNT) #define THREAD_CALL_THREAD_MIN 4 #define 
INTERNAL_CALL_COUNT 768 #define THREAD_CALL_DEALLOC_INTERVAL_NS (5 * 1000 * 1000) /* 5 ms */ #define THREAD_CALL_ADD_RATIO 4 #define THREAD_CALL_MACH_FACTOR_CAP 3 -static struct thread_call_group thread_call_groups[THREAD_CALL_GROUP_COUNT]; +#define IS_CONT_GROUP(group) \ + (((group)->flags & TCG_CONTINUOUS) ? TRUE : FALSE) + +// groups [0..4]: thread calls in mach_absolute_time +// groups [4..8]: thread calls in mach_continuous_time +static struct thread_call_group thread_call_groups[THREAD_CALL_GROUP_COUNT]; + +static struct thread_call_group *abstime_thread_call_groups; +static struct thread_call_group *conttime_thread_call_groups; + static boolean_t thread_call_daemon_awake; static thread_call_data_t internal_call_storage[INTERNAL_CALL_COUNT]; static queue_head_t thread_call_internal_queue; @@ -109,7 +122,7 @@ static void thread_call_daemon(void *arg); static void thread_call_thread(thread_call_group_t group, wait_result_t wres); extern void thread_call_delayed_timer(timer_call_param_t p0, timer_call_param_t p1); static void thread_call_dealloc_timer(timer_call_param_t p0, timer_call_param_t p1); -static void thread_call_group_setup(thread_call_group_t group, thread_call_priority_t pri, uint32_t target_thread_count, boolean_t parallel); +static void thread_call_group_setup(thread_call_group_t group, thread_call_priority_t pri, uint32_t target_thread_count, boolean_t parallel, boolean_t continuous); static void sched_call_thread(int type, thread_t thread); static void thread_call_start_deallocate_timer(thread_call_group_t group); static void thread_call_wait_locked(thread_call_t call); @@ -127,11 +140,7 @@ lck_grp_t thread_call_lck_grp; lck_attr_t thread_call_lck_attr; lck_grp_attr_t thread_call_lck_grp_attr; -#if defined(__i386__) || defined(__x86_64__) lck_mtx_t thread_call_lock_data; -#else -lck_spin_t thread_call_lock_data; -#endif #define thread_call_lock_spin() \ @@ -237,7 +246,16 @@ thread_call_get_group( pri == THREAD_CALL_PRIORITY_KERNEL || pri == 
THREAD_CALL_PRIORITY_HIGH); - return &thread_call_groups[pri]; + thread_call_group_t group; + + if(call->tc_flags & THREAD_CALL_CONTINUOUS) { + group = &conttime_thread_call_groups[pri]; + } else { + group = &abstime_thread_call_groups[pri]; + } + + assert(IS_CONT_GROUP(group) == ((call->tc_flags & THREAD_CALL_CONTINUOUS) ? TRUE : FALSE)); + return group; } static void @@ -245,7 +263,8 @@ thread_call_group_setup( thread_call_group_t group, thread_call_priority_t pri, uint32_t target_thread_count, - boolean_t parallel) + boolean_t parallel, + boolean_t continuous) { queue_init(&group->pending_queue); queue_init(&group->delayed_queue); @@ -262,7 +281,11 @@ thread_call_group_setup( if (parallel) { group->flags |= TCG_PARALLEL; group->sched_call = NULL; - } + } + + if(continuous) { + group->flags |= TCG_CONTINUOUS; + } } /* @@ -314,24 +337,25 @@ thread_call_initialize(void) zone_change(thread_call_zone, Z_CALLERACCT, FALSE); zone_change(thread_call_zone, Z_NOENCRYPT, TRUE); + abstime_thread_call_groups = &thread_call_groups[0]; + conttime_thread_call_groups = &thread_call_groups[THREAD_CALL_ABSTIME_COUNT]; + lck_attr_setdefault(&thread_call_lck_attr); lck_grp_attr_setdefault(&thread_call_lck_grp_attr); lck_grp_init(&thread_call_queues_lck_grp, "thread_call_queues", &thread_call_lck_grp_attr); lck_grp_init(&thread_call_lck_grp, "thread_call", &thread_call_lck_grp_attr); - -#if defined(__i386__) || defined(__x86_64__) - lck_mtx_init(&thread_call_lock_data, &thread_call_lck_grp, &thread_call_lck_attr); -#else - lck_spin_init(&thread_call_lock_data, &thread_call_lck_grp, &thread_call_lck_attr); -#endif - + lck_mtx_init(&thread_call_lock_data, &thread_call_lck_grp, &thread_call_lck_attr); nanotime_to_absolutetime(0, THREAD_CALL_DEALLOC_INTERVAL_NS, &thread_call_dealloc_interval_abs); - waitq_init(&daemon_waitq, SYNC_POLICY_FIFO); + waitq_init(&daemon_waitq, SYNC_POLICY_DISABLE_IRQ | SYNC_POLICY_FIFO); - thread_call_group_setup(&thread_call_groups[THREAD_CALL_PRIORITY_LOW], 
THREAD_CALL_PRIORITY_LOW, 0, TRUE); - thread_call_group_setup(&thread_call_groups[THREAD_CALL_PRIORITY_USER], THREAD_CALL_PRIORITY_USER, 0, TRUE); - thread_call_group_setup(&thread_call_groups[THREAD_CALL_PRIORITY_KERNEL], THREAD_CALL_PRIORITY_KERNEL, 1, TRUE); - thread_call_group_setup(&thread_call_groups[THREAD_CALL_PRIORITY_HIGH], THREAD_CALL_PRIORITY_HIGH, THREAD_CALL_THREAD_MIN, FALSE); + thread_call_group_setup(&abstime_thread_call_groups[THREAD_CALL_PRIORITY_LOW], THREAD_CALL_PRIORITY_LOW, 0, TRUE, FALSE); + thread_call_group_setup(&abstime_thread_call_groups[THREAD_CALL_PRIORITY_USER], THREAD_CALL_PRIORITY_USER, 0, TRUE, FALSE); + thread_call_group_setup(&abstime_thread_call_groups[THREAD_CALL_PRIORITY_KERNEL], THREAD_CALL_PRIORITY_KERNEL, 1, TRUE, FALSE); + thread_call_group_setup(&abstime_thread_call_groups[THREAD_CALL_PRIORITY_HIGH], THREAD_CALL_PRIORITY_HIGH, THREAD_CALL_THREAD_MIN, FALSE, FALSE); + thread_call_group_setup(&conttime_thread_call_groups[THREAD_CALL_PRIORITY_LOW], THREAD_CALL_PRIORITY_LOW, 0, TRUE, TRUE); + thread_call_group_setup(&conttime_thread_call_groups[THREAD_CALL_PRIORITY_USER], THREAD_CALL_PRIORITY_USER, 0, TRUE, TRUE); + thread_call_group_setup(&conttime_thread_call_groups[THREAD_CALL_PRIORITY_KERNEL], THREAD_CALL_PRIORITY_KERNEL, 0, TRUE, TRUE); + thread_call_group_setup(&conttime_thread_call_groups[THREAD_CALL_PRIORITY_HIGH], THREAD_CALL_PRIORITY_HIGH, 1, FALSE, TRUE); s = disable_ints_and_lock(); @@ -436,6 +460,9 @@ _pending_call_enqueue( if (old_queue == NULL) { call->tc_submit_count++; + } else if (old_queue != &group->pending_queue && + old_queue != &group->delayed_queue){ + panic("tried to move a thread call (%p) between groups (old_queue: %p)", call, old_queue); } group->pending_count++; @@ -467,10 +494,15 @@ _delayed_call_enqueue( old_queue = call_entry_enqueue_deadline(CE(call), &group->delayed_queue, deadline); - if (old_queue == &group->pending_queue) + if (old_queue == &group->pending_queue) { group->pending_count--; 
- else if (old_queue == NULL) + } else if (old_queue == NULL) { call->tc_submit_count++; + } else if (old_queue == &group->delayed_queue) { + // we did nothing, and that's fine + } else { + panic("tried to move a thread call (%p) between groups (old_queue: %p)", call, old_queue); + } return (old_queue != NULL); } @@ -515,13 +547,20 @@ _set_delayed_call_timer( thread_call_t call, thread_call_group_t group) { - uint64_t leeway; + uint64_t leeway, fire_at; assert((call->tc_soft_deadline != 0) && ((call->tc_soft_deadline <= call->tc_call.deadline))); + assert(IS_CONT_GROUP(group) == ((call->tc_flags & THREAD_CALL_CONTINUOUS) ? TRUE : FALSE)); + + fire_at = call->tc_soft_deadline; + + if (IS_CONT_GROUP(group)) { + fire_at = continuoustime_to_absolutetime(fire_at); + } leeway = call->tc_call.deadline - call->tc_soft_deadline; timer_call_enter_with_leeway(&group->delayed_timer, NULL, - call->tc_soft_deadline, leeway, + fire_at, leeway, TIMER_CALL_SYS_CRITICAL|TIMER_CALL_LEEWAY, ((call->tc_flags & THREAD_CALL_RATELIMITED) == THREAD_CALL_RATELIMITED)); } @@ -545,7 +584,7 @@ _remove_from_pending_queue( { boolean_t call_removed = FALSE; thread_call_t call; - thread_call_group_t group = &thread_call_groups[THREAD_CALL_PRIORITY_HIGH]; + thread_call_group_t group = &abstime_thread_call_groups[THREAD_CALL_PRIORITY_HIGH]; call = TC(queue_first(&group->pending_queue)); @@ -590,7 +629,7 @@ _remove_from_delayed_queue( { boolean_t call_removed = FALSE; thread_call_t call; - thread_call_group_t group = &thread_call_groups[THREAD_CALL_PRIORITY_HIGH]; + thread_call_group_t group = &abstime_thread_call_groups[THREAD_CALL_PRIORITY_HIGH]; call = TC(queue_first(&group->delayed_queue)); @@ -670,6 +709,8 @@ thread_call_func_cancel( boolean_t result; spl_t s; + assert(func != NULL); + s = splsched(); thread_call_lock_spin(); @@ -780,25 +821,7 @@ boolean_t thread_call_enter( thread_call_t call) { - boolean_t result = TRUE; - thread_call_group_t group; - spl_t s; - - group = 
thread_call_get_group(call); - - s = splsched(); - thread_call_lock_spin(); - - if (call->tc_call.queue != &group->pending_queue) { - result = _pending_call_enqueue(call, group); - } - - call->tc_call.param1 = 0; - - thread_call_unlock(); - splx(s); - - return (result); + return thread_call_enter1(call, 0); } boolean_t @@ -810,6 +833,8 @@ thread_call_enter1( thread_call_group_t group; spl_t s; + assert(call->tc_call.func != NULL); + group = thread_call_get_group(call); s = splsched(); @@ -841,7 +866,7 @@ thread_call_enter_delayed( thread_call_t call, uint64_t deadline) { - assert(call); + assert(call != NULL); return thread_call_enter_delayed_internal(call, NULL, 0, 0, deadline, 0, 0); } @@ -851,7 +876,7 @@ thread_call_enter1_delayed( thread_call_param_t param1, uint64_t deadline) { - assert(call); + assert(call != NULL); return thread_call_enter_delayed_internal(call, NULL, 0, param1, deadline, 0, 0); } @@ -863,7 +888,7 @@ thread_call_enter_delayed_with_leeway( uint64_t leeway, unsigned int flags) { - assert(call); + assert(call != NULL); return thread_call_enter_delayed_internal(call, NULL, 0, param1, deadline, leeway, flags); } @@ -880,6 +905,8 @@ thread_call_enter_delayed_with_leeway( * leeway - timer slack represented as delta of deadline. * flags - THREAD_CALL_DELAY_XXX : classification of caller's desires wrt timer coalescing. * THREAD_CALL_DELAY_LEEWAY : value in leeway is used for timer coalescing. + * THREAD_CALL_CONTINUOUS: thread call will be called according to mach_continuous_time rather + * than mach_absolute_time */ boolean_t thread_call_enter_delayed_internal( @@ -894,8 +921,9 @@ thread_call_enter_delayed_internal( boolean_t result = TRUE; thread_call_group_t group; spl_t s; - uint64_t abstime, sdeadline, slop; + uint64_t abstime, conttime, sdeadline, slop; uint32_t urgency; + const boolean_t is_cont_time = (flags & THREAD_CALL_CONTINUOUS) ? 
TRUE : FALSE; /* direct mapping between thread_call, timer_call, and timeout_urgency values */ urgency = (flags & TIMEOUT_URGENCY_MASK); @@ -908,15 +936,21 @@ thread_call_enter_delayed_internal( call = _internal_call_allocate(alt_func, alt_param0); } + if (is_cont_time) { + call->tc_flags |= THREAD_CALL_CONTINUOUS; + } + + assert(call->tc_call.func != NULL); group = thread_call_get_group(call); abstime = mach_absolute_time(); + conttime = absolutetime_to_continuoustime(abstime); call->tc_flags |= THREAD_CALL_DELAYED; call->tc_soft_deadline = sdeadline = deadline; boolean_t ratelimited = FALSE; - slop = timer_call_slop(deadline, abstime, urgency, current_thread(), &ratelimited); + slop = timer_call_slop(deadline, is_cont_time ? conttime : abstime, urgency, current_thread(), &ratelimited); if ((flags & THREAD_CALL_DELAY_LEEWAY) != 0 && leeway > slop) slop = leeway; @@ -934,16 +968,24 @@ thread_call_enter_delayed_internal( call->tc_call.param1 = param1; - call->ttd = (sdeadline > abstime) ? (sdeadline - abstime) : 0; + + if(is_cont_time) { + call->ttd = (sdeadline > conttime) ? (sdeadline - conttime) : 0; + } + else { + call->ttd = (sdeadline > abstime) ? 
(sdeadline - abstime) : 0; + } result = _delayed_call_enqueue(call, group, deadline); - if (queue_first(&group->delayed_queue) == qe(call)) + if (queue_first(&group->delayed_queue) == qe(call)) { _set_delayed_call_timer(call, group); + } #if CONFIG_DTRACE DTRACE_TMR5(thread_callout__create, thread_call_func_t, call->tc_call.func, uint64_t, (deadline - sdeadline), uint64_t, (call->ttd >> 32), (unsigned) (call->ttd & 0xFFFFFFFF), call); #endif + thread_call_unlock(); splx(s); @@ -1059,7 +1101,7 @@ thread_call_wake( if (group->idle_count == 0) { timer_call_cancel(&group->dealloc_timer); - group->flags &= TCG_DEALLOC_ACTIVE; + group->flags &= ~TCG_DEALLOC_ACTIVE; } } else { if (!thread_call_daemon_awake && thread_call_group_should_add_thread(group)) { @@ -1182,6 +1224,7 @@ thread_call_thread( thread_call_param_t param0, param1; call = TC(dequeue_head(&group->pending_queue)); + assert(call != NULL); group->pending_count--; func = call->tc_call.func; @@ -1391,11 +1434,20 @@ thread_call_delayed_timer( thread_call_lock_spin(); - timestamp = mach_absolute_time(); + const boolean_t is_cont_time = IS_CONT_GROUP(group) ? 
TRUE : FALSE; + + if (is_cont_time) { + timestamp = mach_continuous_time(); + } + else { + timestamp = mach_absolute_time(); + } call = TC(queue_first(&group->delayed_queue)); while (!queue_end(&group->delayed_queue, qe(call))) { + assert((!is_cont_time) || (call->tc_flags & THREAD_CALL_CONTINUOUS)); + if (call->tc_soft_deadline <= timestamp) { if ((call->tc_flags & THREAD_CALL_RATELIMITED) && (CE(call)->deadline > timestamp) && @@ -1410,17 +1462,17 @@ thread_call_delayed_timer( call = TC(queue_first(&group->delayed_queue)); } - if (!queue_end(&group->delayed_queue, qe(call))) + if (!queue_end(&group->delayed_queue, qe(call))) { _set_delayed_call_timer(call, group); + } thread_call_unlock(); } static void -thread_call_delayed_timer_rescan(timer_call_param_t p0, __unused timer_call_param_t p1) +thread_call_delayed_timer_rescan(thread_call_group_t group) { thread_call_t call; - thread_call_group_t group = p0; uint64_t timestamp; boolean_t istate; @@ -1428,7 +1480,12 @@ thread_call_delayed_timer_rescan(timer_call_param_t p0, __unused timer_call_par thread_call_lock_spin(); assert(ml_timer_forced_evaluation() == TRUE); - timestamp = mach_absolute_time(); + + if (IS_CONT_GROUP(group)) { + timestamp = mach_continuous_time(); + } else { + timestamp = mach_absolute_time(); + } call = TC(queue_first(&group->delayed_queue)); @@ -1460,10 +1517,10 @@ thread_call_delayed_timer_rescan(timer_call_param_t p0, __unused timer_call_par void thread_call_delayed_timer_rescan_all(void) { - thread_call_delayed_timer_rescan((timer_call_param_t)&thread_call_groups[THREAD_CALL_PRIORITY_LOW], NULL); - thread_call_delayed_timer_rescan((timer_call_param_t)&thread_call_groups[THREAD_CALL_PRIORITY_USER], NULL); - thread_call_delayed_timer_rescan((timer_call_param_t)&thread_call_groups[THREAD_CALL_PRIORITY_KERNEL], NULL); - thread_call_delayed_timer_rescan((timer_call_param_t)&thread_call_groups[THREAD_CALL_PRIORITY_HIGH], NULL); + int i; + for(i = 0; i < THREAD_CALL_GROUP_COUNT; i++) { + 
thread_call_delayed_timer_rescan(&thread_call_groups[i]); + } } /* @@ -1571,3 +1628,29 @@ thread_call_isactive(thread_call_t call) return active; } + +/* + * adjust_cont_time_thread_calls + * on wake, reenqueue delayed call timer for continuous time thread call groups + */ +void +adjust_cont_time_thread_calls(void) +{ + thread_call_group_t group; + + spl_t s; + int i; + s = disable_ints_and_lock(); + + for (i = 0; i < THREAD_CALL_CONTTIME_COUNT; i++) { + // only the continuous thread call groups + group = &conttime_thread_call_groups[i]; + assert(IS_CONT_GROUP(group)); + + if (!queue_empty(&group->delayed_queue)) { + _set_delayed_call_timer(TC(queue_first(&group->delayed_queue)), group); + } + } + + enable_ints_and_unlock(s); +} diff --git a/osfmk/kern/thread_call.h b/osfmk/kern/thread_call.h index c44561f63..5b486cbbf 100644 --- a/osfmk/kern/thread_call.h +++ b/osfmk/kern/thread_call.h @@ -64,10 +64,10 @@ typedef void (*thread_call_func_t)( @constant THREAD_CALL_PRIORITY_LOW Very low importance. */ typedef enum { - THREAD_CALL_PRIORITY_HIGH = 0, - THREAD_CALL_PRIORITY_KERNEL = 1, - THREAD_CALL_PRIORITY_USER = 2, - THREAD_CALL_PRIORITY_LOW = 3 + THREAD_CALL_PRIORITY_HIGH = 0, + THREAD_CALL_PRIORITY_KERNEL = 1, + THREAD_CALL_PRIORITY_USER = 2, + THREAD_CALL_PRIORITY_LOW = 3 } thread_call_priority_t; __BEGIN_DECLS @@ -163,6 +163,13 @@ extern boolean_t thread_call_enter1_delayed( */ #define THREAD_CALL_DELAY_LEEWAY TIMEOUT_URGENCY_LEEWAY +/* + * Indicates that the time parameters should be interpreted as + * mach_continuous_time values, rather than mach_absolute_time and the timer + * be programmed to fire based on continuous time. + */ +#define THREAD_CALL_CONTINUOUS 0x100 + /*! @function thread_call_enter_delayed_with_leeway @abstract Submit a thread call to be executed at some point in the future. 
@@ -279,7 +286,7 @@ __END_DECLS #include struct thread_call { - struct call_entry tc_call; /* Must be first */ + struct call_entry tc_call; /* Must be first */ uint64_t tc_submit_count; uint64_t tc_finish_count; uint64_t ttd; /* Time to deadline at creation */ @@ -287,12 +294,12 @@ struct thread_call { thread_call_priority_t tc_pri; uint32_t tc_flags; int32_t tc_refs; -}; +}; -#define THREAD_CALL_ALLOC 0x01 -#define THREAD_CALL_WAIT 0x02 -#define THREAD_CALL_DELAYED 0x04 -#define THREAD_CALL_RATELIMITED TIMEOUT_URGENCY_RATELIMITED +#define THREAD_CALL_ALLOC 0x01 +#define THREAD_CALL_WAIT 0x02 +#define THREAD_CALL_DELAYED 0x04 +#define THREAD_CALL_RATELIMITED TIMEOUT_URGENCY_RATELIMITED typedef struct thread_call thread_call_data_t; @@ -333,6 +340,12 @@ extern boolean_t thread_call_func_cancel( thread_call_func_t func, thread_call_param_t param, boolean_t cancel_all); + +/* + * Called on the wake path to adjust the thread callouts running in mach_continuous_time + */ +void adjust_cont_time_thread_calls(void); + __END_DECLS #endif /* XNU_KERNEL_PRIVATE */ diff --git a/osfmk/kern/thread_policy.c b/osfmk/kern/thread_policy.c index a7043c78b..f30639e47 100644 --- a/osfmk/kern/thread_policy.c +++ b/osfmk/kern/thread_policy.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2015 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ @@ -35,11 +35,32 @@ #include #include #include +#include +#include +#include #include +#ifdef MACH_BSD +extern int proc_selfpid(void); +extern char * proc_name_address(void *p); +extern void rethrottle_thread(void * uthread); +#endif /* MACH_BSD */ + #define QOS_EXTRACT(q) ((q) & 0xff) +uint32_t qos_override_mode; +#define QOS_OVERRIDE_MODE_OVERHANG_PEAK 0 +#define QOS_OVERRIDE_MODE_IGNORE_OVERRIDE 1 +#define QOS_OVERRIDE_MODE_FINE_GRAINED_OVERRIDE 2 +#define QOS_OVERRIDE_MODE_FINE_GRAINED_OVERRIDE_BUT_IGNORE_DISPATCH 3 +#define QOS_OVERRIDE_MODE_FINE_GRAINED_OVERRIDE_BUT_SINGLE_MUTEX_OVERRIDE 4 + +extern zone_t thread_qos_override_zone; + +static boolean_t +proc_thread_qos_remove_override_internal(thread_t thread, user_addr_t resource, int resource_type, boolean_t reset, boolean_t squash); + /* * THREAD_QOS_UNSPECIFIED is assigned the highest tier available, so it does not provide a limit * to threads that don't have a QoS class set. 
@@ -98,28 +119,79 @@ thread_set_user_sched_mode_and_recompute_pri(thread_t thread, sched_mode_t mode) static int thread_qos_scaled_relative_priority(int qos, int qos_relprio); +static void +proc_get_thread_policy_bitfield(thread_t thread, thread_policy_state_t info); + +static void +proc_set_thread_policy_locked(thread_t thread, int category, int flavor, int value, int value2, task_pend_token_t pend_token); + +static void +proc_set_thread_policy_spinlocked(thread_t thread, int category, int flavor, int value, int value2, task_pend_token_t pend_token); + +static void +thread_set_requested_policy_spinlocked(thread_t thread, int category, int flavor, int value, int value2); + +static int +thread_get_requested_policy_spinlocked(thread_t thread, int category, int flavor, int* value2); + +static int +proc_get_thread_policy_locked(thread_t thread, int category, int flavor, int* value2); + +static void +thread_policy_update_spinlocked(thread_t thread, boolean_t recompute_priority, task_pend_token_t pend_token); + +static void +thread_policy_update_internal_spinlocked(thread_t thread, boolean_t recompute_priority, task_pend_token_t pend_token); -extern void proc_get_thread_policy(thread_t thread, thread_policy_state_t info); +void +thread_policy_init(void) { + if (PE_parse_boot_argn("qos_override_mode", &qos_override_mode, sizeof(qos_override_mode))) { + printf("QOS override mode: 0x%08x\n", qos_override_mode); + } else { + qos_override_mode = QOS_OVERRIDE_MODE_FINE_GRAINED_OVERRIDE_BUT_SINGLE_MUTEX_OVERRIDE; + } +} boolean_t thread_has_qos_policy(thread_t thread) { - return (proc_get_task_policy(thread->task, thread, TASK_POLICY_ATTRIBUTE, TASK_POLICY_QOS) != THREAD_QOS_UNSPECIFIED) ? TRUE : FALSE; + return (proc_get_thread_policy(thread, TASK_POLICY_ATTRIBUTE, TASK_POLICY_QOS) != THREAD_QOS_UNSPECIFIED) ? 
TRUE : FALSE; } -kern_return_t -thread_remove_qos_policy(thread_t thread) + +static void +thread_remove_qos_policy_locked(thread_t thread, + task_pend_token_t pend_token) { - thread_qos_policy_data_t unspec_qos; - unspec_qos.qos_tier = THREAD_QOS_UNSPECIFIED; - unspec_qos.tier_importance = 0; __unused int prev_qos = thread->requested_policy.thrp_qos; DTRACE_PROC2(qos__remove, thread_t, thread, int, prev_qos); - return thread_policy_set_internal(thread, THREAD_QOS_POLICY, (thread_policy_t)&unspec_qos, THREAD_QOS_POLICY_COUNT); + proc_set_thread_policy_locked(thread, TASK_POLICY_ATTRIBUTE, TASK_POLICY_QOS_AND_RELPRIO, + THREAD_QOS_UNSPECIFIED, 0, pend_token); +} + +kern_return_t +thread_remove_qos_policy(thread_t thread) +{ + struct task_pend_token pend_token = {}; + + thread_mtx_lock(thread); + if (!thread->active) { + thread_mtx_unlock(thread); + return KERN_TERMINATED; + } + + thread_remove_qos_policy_locked(thread, &pend_token); + + thread_mtx_unlock(thread); + + thread_policy_update_complete_unlocked(thread, &pend_token); + + return KERN_SUCCESS; } + boolean_t thread_is_static_param(thread_t thread) { @@ -205,7 +277,7 @@ thread_policy_set( if (thread_is_static_param(thread)) return (KERN_POLICY_STATIC); - if (flavor == THREAD_QOS_POLICY || flavor == THREAD_QOS_POLICY_OVERRIDE) + if (flavor == THREAD_QOS_POLICY) return (KERN_INVALID_ARGUMENT); } @@ -236,13 +308,13 @@ thread_policy_set( kern_return_t thread_policy_set_internal( - thread_t thread, - thread_policy_flavor_t flavor, - thread_policy_t policy_info, - mach_msg_type_number_t count) + thread_t thread, + thread_policy_flavor_t flavor, + thread_policy_t policy_info, + mach_msg_type_number_t count) { - kern_return_t result = KERN_SUCCESS; - spl_t s; + kern_return_t result = KERN_SUCCESS; + struct task_pend_token pend_token = {}; thread_mtx_lock(thread); if (!thread->active) { @@ -255,10 +327,10 @@ thread_policy_set_internal( case THREAD_EXTENDED_POLICY: { - boolean_t timeshare = TRUE; + boolean_t timeshare = 
TRUE; if (count >= THREAD_EXTENDED_POLICY_COUNT) { - thread_extended_policy_t info; + thread_extended_policy_t info; info = (thread_extended_policy_t)policy_info; timeshare = info->timeshare; @@ -266,7 +338,7 @@ thread_policy_set_internal( sched_mode_t mode = (timeshare == TRUE) ? TH_MODE_TIMESHARE : TH_MODE_FIXED; - s = splsched(); + spl_t s = splsched(); thread_lock(thread); thread_set_user_sched_mode_and_recompute_pri(thread, mode); @@ -274,14 +346,14 @@ thread_policy_set_internal( thread_unlock(thread); splx(s); - sfi_reevaluate(thread); + pend_token.tpt_update_thread_sfi = 1; break; } case THREAD_TIME_CONSTRAINT_POLICY: { - thread_time_constraint_policy_t info; + thread_time_constraint_policy_t info; if (count < THREAD_TIME_CONSTRAINT_POLICY_COUNT) { result = KERN_INVALID_ARGUMENT; @@ -289,34 +361,34 @@ thread_policy_set_internal( } info = (thread_time_constraint_policy_t)policy_info; - if ( info->constraint < info->computation || - info->computation > max_rt_quantum || - info->computation < min_rt_quantum ) { + if (info->constraint < info->computation || + info->computation > max_rt_quantum || + info->computation < min_rt_quantum ) { result = KERN_INVALID_ARGUMENT; break; } - s = splsched(); + spl_t s = splsched(); thread_lock(thread); - thread->realtime.period = info->period; - thread->realtime.computation = info->computation; - thread->realtime.constraint = info->constraint; - thread->realtime.preemptible = info->preemptible; + thread->realtime.period = info->period; + thread->realtime.computation = info->computation; + thread->realtime.constraint = info->constraint; + thread->realtime.preemptible = info->preemptible; thread_set_user_sched_mode_and_recompute_pri(thread, TH_MODE_REALTIME); thread_unlock(thread); splx(s); - sfi_reevaluate(thread); + pend_token.tpt_update_thread_sfi = 1; break; } case THREAD_PRECEDENCE_POLICY: { - thread_precedence_policy_t info; + thread_precedence_policy_t info; if (count < THREAD_PRECEDENCE_POLICY_COUNT) { result = 
KERN_INVALID_ARGUMENT; @@ -324,7 +396,7 @@ thread_policy_set_internal( } info = (thread_precedence_policy_t)policy_info; - s = splsched(); + spl_t s = splsched(); thread_lock(thread); thread->importance = info->importance; @@ -339,7 +411,7 @@ thread_policy_set_internal( case THREAD_AFFINITY_POLICY: { - thread_affinity_policy_t info; + thread_affinity_policy_t info; if (!thread_affinity_is_supported()) { result = KERN_NOT_SUPPORTED; @@ -361,53 +433,50 @@ thread_policy_set_internal( return thread_affinity_set(thread, info->affinity_tag); } + case THREAD_THROUGHPUT_QOS_POLICY: { thread_throughput_qos_policy_t info = (thread_throughput_qos_policy_t) policy_info; - int tqos; - - if (count < THREAD_LATENCY_QOS_POLICY_COUNT) { + thread_throughput_qos_t tqos; + + if (count < THREAD_THROUGHPUT_QOS_POLICY_COUNT) { result = KERN_INVALID_ARGUMENT; break; } - if ((result = qos_throughput_policy_validate(info->thread_throughput_qos_tier)) != - KERN_SUCCESS) { + if ((result = qos_throughput_policy_validate(info->thread_throughput_qos_tier)) != KERN_SUCCESS) break; - } tqos = qos_extract(info->thread_throughput_qos_tier); - thread->effective_policy.t_through_qos = tqos; - } + + proc_set_thread_policy_locked(thread, TASK_POLICY_ATTRIBUTE, + TASK_POLICY_THROUGH_QOS, tqos, 0, &pend_token); + break; + } case THREAD_LATENCY_QOS_POLICY: { thread_latency_qos_policy_t info = (thread_latency_qos_policy_t) policy_info; - int lqos; - - if (count < THREAD_THROUGHPUT_QOS_POLICY_COUNT) { + thread_latency_qos_t lqos; + + if (count < THREAD_LATENCY_QOS_POLICY_COUNT) { result = KERN_INVALID_ARGUMENT; break; } - if ((result = qos_latency_policy_validate(info->thread_latency_qos_tier)) != - KERN_SUCCESS) { + if ((result = qos_latency_policy_validate(info->thread_latency_qos_tier)) != KERN_SUCCESS) break; - } lqos = qos_extract(info->thread_latency_qos_tier); -/* The expected use cases (opt-in) of per-thread latency QoS would seem to - * preclude any requirement at present to re-evaluate timers on a 
thread level - * latency QoS change. - */ - thread->effective_policy.t_latency_qos = lqos; - } + proc_set_thread_policy_locked(thread, TASK_POLICY_ATTRIBUTE, + TASK_POLICY_LATENCY_QOS, lqos, 0, &pend_token); + break; + } case THREAD_QOS_POLICY: - case THREAD_QOS_POLICY_OVERRIDE: { thread_qos_policy_t info = (thread_qos_policy_t)policy_info; @@ -431,41 +500,9 @@ thread_policy_set_internal( break; } - /* - * Going into task policy requires the task mutex, - * because of the way synchronization against the IO policy - * subsystem works. - * - * We need to move thread policy to the thread mutex instead. - * separate thread policy from task policy - */ - - if (flavor == THREAD_QOS_POLICY_OVERRIDE) { - int strongest_override = info->qos_tier; - - if (info->qos_tier != THREAD_QOS_UNSPECIFIED && - thread->requested_policy.thrp_qos_override != THREAD_QOS_UNSPECIFIED) - strongest_override = MAX(thread->requested_policy.thrp_qos_override, info->qos_tier); - - thread_mtx_unlock(thread); - - /* There is a race here. To be closed in separate thread policy from task policy */ - - proc_set_task_policy(thread->task, thread, TASK_POLICY_ATTRIBUTE, TASK_POLICY_QOS_OVERRIDE, strongest_override); - - return (result); - } - - thread_mtx_unlock(thread); - - proc_set_task_policy2(thread->task, thread, TASK_POLICY_ATTRIBUTE, TASK_POLICY_QOS_AND_RELPRIO, info->qos_tier, -info->tier_importance); + proc_set_thread_policy_locked(thread, TASK_POLICY_ATTRIBUTE, TASK_POLICY_QOS_AND_RELPRIO, + info->qos_tier, -info->tier_importance, &pend_token); - thread_mtx_lock(thread); - if (!thread->active) { - thread_mtx_unlock(thread); - return (KERN_TERMINATED); - } - break; } @@ -475,98 +512,167 @@ thread_policy_set_internal( } thread_mtx_unlock(thread); + + thread_policy_update_complete_unlocked(thread, &pend_token); + return (result); } /* - * thread_set_mode_and_absolute_pri: - * - * Set scheduling policy & absolute priority for thread, for deprecated - * thread_set_policy and thread_policy interfaces. 
- * * Note that there is no implemented difference between POLICY_RR and POLICY_FIFO. * Both result in FIXED mode scheduling. - * - * Called with thread mutex locked. */ -kern_return_t -thread_set_mode_and_absolute_pri( - thread_t thread, - integer_t policy, - integer_t priority) -{ - spl_t s; - sched_mode_t mode; - kern_return_t kr = KERN_SUCCESS; - - if (thread_is_static_param(thread)) - return (KERN_POLICY_STATIC); - - if (thread->policy_reset) - return (KERN_SUCCESS); - - /* Setting legacy policies on threads kills the current QoS */ - if (thread->requested_policy.thrp_qos != THREAD_QOS_UNSPECIFIED) { - thread_mtx_unlock(thread); - - kr = thread_remove_qos_policy(thread); - - thread_mtx_lock(thread); - if (!thread->active) { - return (KERN_TERMINATED); - } - } - +static sched_mode_t +convert_policy_to_sched_mode(integer_t policy) { switch (policy) { case POLICY_TIMESHARE: - mode = TH_MODE_TIMESHARE; - break; + return TH_MODE_TIMESHARE; case POLICY_RR: case POLICY_FIFO: - mode = TH_MODE_FIXED; - break; + return TH_MODE_FIXED; default: panic("unexpected sched policy: %d", policy); - break; + return TH_MODE_NONE; } +} - s = splsched(); +/* + * Called either with the thread mutex locked + * or from the pthread kext in a 'safe place'. + */ +static kern_return_t +thread_set_mode_and_absolute_pri_internal(thread_t thread, + sched_mode_t mode, + integer_t priority, + task_pend_token_t pend_token) +{ + kern_return_t kr = KERN_SUCCESS; + + spl_t s = splsched(); thread_lock(thread); /* This path isn't allowed to change a thread out of realtime. */ - if ((thread->sched_mode != TH_MODE_REALTIME) && - (thread->saved_mode != TH_MODE_REALTIME)) { + if ((thread->sched_mode == TH_MODE_REALTIME) || + (thread->saved_mode == TH_MODE_REALTIME)) { + kr = KERN_FAILURE; + goto unlock; + } - /* - * Reverse engineer and apply the correct importance value - * from the requested absolute priority value. 
- */ + if (thread->policy_reset) { + kr = KERN_SUCCESS; + goto unlock; + } - if (priority >= thread->max_priority) - priority = thread->max_priority - thread->task_priority; - else if (priority >= MINPRI_KERNEL) - priority -= MINPRI_KERNEL; - else if (priority >= MINPRI_RESERVED) - priority -= MINPRI_RESERVED; - else - priority -= BASEPRI_DEFAULT; + sched_mode_t old_mode = thread->sched_mode; - priority += thread->task_priority; + /* + * Reverse engineer and apply the correct importance value + * from the requested absolute priority value. + * + * TODO: Store the absolute priority value instead + */ - if (priority > thread->max_priority) - priority = thread->max_priority; - else if (priority < MINPRI) - priority = MINPRI; + if (priority >= thread->max_priority) + priority = thread->max_priority - thread->task_priority; + else if (priority >= MINPRI_KERNEL) + priority -= MINPRI_KERNEL; + else if (priority >= MINPRI_RESERVED) + priority -= MINPRI_RESERVED; + else + priority -= BASEPRI_DEFAULT; - thread->importance = priority - thread->task_priority; + priority += thread->task_priority; - thread_set_user_sched_mode_and_recompute_pri(thread, mode); - } + if (priority > thread->max_priority) + priority = thread->max_priority; + else if (priority < MINPRI) + priority = MINPRI; + + thread->importance = priority - thread->task_priority; + + thread_set_user_sched_mode_and_recompute_pri(thread, mode); + + if (mode != old_mode) + pend_token->tpt_update_thread_sfi = 1; +unlock: thread_unlock(thread); splx(s); - sfi_reevaluate(thread); + return kr; +} + +/* + * KPI for pthread kext + * + * Set scheduling policy & absolute priority for thread + * May be called from waitqueue callout context with spinlocks held + * Thread mutex lock is not held + */ +kern_return_t +thread_set_workq_pri(thread_t thread, + integer_t priority, + integer_t policy) +{ + struct task_pend_token pend_token = {}; + sched_mode_t mode = convert_policy_to_sched_mode(policy); + + assert(thread->static_param); 
+ if (!thread->static_param) + return KERN_FAILURE; + + /* Concern: this doesn't hold the mutex... */ + if (!thread->active) + return KERN_TERMINATED; + + kern_return_t kr = thread_set_mode_and_absolute_pri_internal(thread, mode, priority, &pend_token); + + if (pend_token.tpt_update_thread_sfi) + sfi_reevaluate(thread); + + return kr; +} + +/* + * thread_set_mode_and_absolute_pri: + * + * Set scheduling policy & absolute priority for thread, for deprecated + * thread_set_policy and thread_policy interfaces. + * + * Called with nothing locked. + */ +kern_return_t +thread_set_mode_and_absolute_pri(thread_t thread, + integer_t policy, + integer_t priority) +{ + kern_return_t kr = KERN_SUCCESS; + struct task_pend_token pend_token = {}; + + sched_mode_t mode = convert_policy_to_sched_mode(policy); + + thread_mtx_lock(thread); + + if (!thread->active) { + kr = KERN_TERMINATED; + goto unlock; + } + + if (thread_is_static_param(thread)) { + kr = KERN_POLICY_STATIC; + goto unlock; + } + + /* Setting legacy policies on threads kills the current QoS */ + if (thread->requested_policy.thrp_qos != THREAD_QOS_UNSPECIFIED) + thread_remove_qos_policy_locked(thread, &pend_token); + + kr = thread_set_mode_and_absolute_pri_internal(thread, mode, priority, &pend_token); + +unlock: + thread_mtx_unlock(thread); + + thread_policy_update_complete_unlocked(thread, &pend_token); return (kr); } @@ -603,159 +709,67 @@ thread_set_user_sched_mode_and_recompute_pri(thread_t thread, sched_mode_t mode) thread_run_queue_reinsert(thread, SCHED_TAILQ); } -/* called with task lock locked */ -void -thread_recompute_qos(thread_t thread) { - spl_t s; - - thread_mtx_lock(thread); - - if (!thread->active) { - thread_mtx_unlock(thread); - return; - } - - s = splsched(); - thread_lock(thread); - - thread_recompute_priority(thread); - - thread_unlock(thread); - splx(s); - - thread_mtx_unlock(thread); -} - -/* called with task lock locked and thread_mtx_lock locked */ -void -thread_update_qos_cpu_time(thread_t 
thread, boolean_t lock_needed) +/* called at splsched with thread lock locked */ +static void +thread_update_qos_cpu_time_locked(thread_t thread) { - uint64_t last_qos_change_balance; - ledger_amount_t thread_balance_credit; - ledger_amount_t thread_balance_debit; - ledger_amount_t effective_qos_time; - uint64_t ctime; - uint64_t remainder = 0, consumed = 0; - processor_t processor; - spl_t s; - kern_return_t kr; + task_t task = thread->task; + uint64_t timer_sum, timer_delta; - if (lock_needed) { - s = splsched(); - thread_lock(thread); - } - - /* - * Calculation of time elapsed by the thread in the current qos. - * Following is the timeline which shows all the variables used in the calculation below. - * - * thread ledger thread ledger - * cpu_time_last_qos cpu_time - * | |<- consumed ->|<- remainder ->| - * timeline -----------------------------------------------------------> - * | | | - * thread_dispatch ctime quantum end - * - * |<----- effective qos time ----->| - */ - - /* - * Calculate time elapsed since last qos change on this thread. - * For cpu time on thread ledger, do not use ledger_get_balance, - * only use credit field of ledger, since - * debit is used by per thread cpu limits and is not zero. - */ - kr = ledger_get_entries(thread->t_threadledger, thread_ledgers.cpu_time, &thread_balance_credit, &thread_balance_debit); - if (kr != KERN_SUCCESS) - goto out; - last_qos_change_balance = thread->cpu_time_last_qos; - - /* - * If thread running on CPU, calculate time elapsed since this thread was last dispatched on cpu. - * The thread ledger is only updated at context switch, the time since last context swicth is not - * updated in the thread ledger cpu time. 
- */ - processor = thread->last_processor; - if ((processor != PROCESSOR_NULL) && (processor->state == PROCESSOR_RUNNING) && - (processor->active_thread == thread)) { - ctime = mach_absolute_time(); - - if (processor->quantum_end > ctime) - remainder = processor->quantum_end - ctime; - - consumed = thread->quantum_remaining - remainder; - } /* - * There can be multiple qos change in a quantum and in that case the cpu_time_last_qos will - * lie between cpu_time marker and ctime marker shown below. The output of - * thread_balance - last_qos_change_balance will be negative in such case, but overall outcome - * when consumed is added to it would be positive. + * This is only as accurate as the distance between + * last context switch (embedded) or last user/kernel boundary transition (desktop) + * because user_timer and system_timer are only updated then. * - * thread ledger - * cpu_time - * |<------------ consumed --------->|<- remainder ->| - * timeline -----------------------------------------------------------> - * | | | | - * thread_dispatch thread ledger ctime quantum end - * cpu_time_last_qos + * TODO: Consider running a thread_timer_event operation here to update it first. + * Maybe doable with interrupts disabled from current thread. + * If the thread is on a different core, may not be easy to get right. * - * |<-effective qos time->| + * TODO: There should be a function for this in timer.c */ - effective_qos_time = (ledger_amount_t) consumed; - effective_qos_time += thread_balance_credit - last_qos_change_balance; - if (lock_needed) { - thread_unlock(thread); - splx(s); - } + timer_sum = timer_grab(&thread->user_timer); + timer_sum += timer_grab(&thread->system_timer); + timer_delta = timer_sum - thread->vtimer_qos_save; - if (effective_qos_time < 0) - return; + thread->vtimer_qos_save = timer_sum; - thread->cpu_time_last_qos += (uint64_t)effective_qos_time; + uint64_t* task_counter = NULL; - /* - * Update the task-level qos stats. 
Its safe to perform operations on these fields, since we - * hold the task lock. - */ + /* Update the task-level qos stats atomically, because we don't have the task lock. */ switch (thread->effective_policy.thep_qos) { - - case THREAD_QOS_DEFAULT: - thread->task->cpu_time_qos_stats.cpu_time_qos_default += effective_qos_time; - break; - - case THREAD_QOS_MAINTENANCE: - thread->task->cpu_time_qos_stats.cpu_time_qos_maintenance += effective_qos_time; - break; + case THREAD_QOS_DEFAULT: task_counter = &task->cpu_time_qos_stats.cpu_time_qos_default; break; + case THREAD_QOS_MAINTENANCE: task_counter = &task->cpu_time_qos_stats.cpu_time_qos_maintenance; break; + case THREAD_QOS_BACKGROUND: task_counter = &task->cpu_time_qos_stats.cpu_time_qos_background; break; + case THREAD_QOS_UTILITY: task_counter = &task->cpu_time_qos_stats.cpu_time_qos_utility; break; + case THREAD_QOS_LEGACY: task_counter = &task->cpu_time_qos_stats.cpu_time_qos_legacy; break; + case THREAD_QOS_USER_INITIATED: task_counter = &task->cpu_time_qos_stats.cpu_time_qos_user_initiated; break; + case THREAD_QOS_USER_INTERACTIVE: task_counter = &task->cpu_time_qos_stats.cpu_time_qos_user_interactive; break; + default: + panic("unknown effective QoS: %d", thread->effective_policy.thep_qos); + } - case THREAD_QOS_BACKGROUND: - thread->task->cpu_time_qos_stats.cpu_time_qos_background += effective_qos_time; - break; + OSAddAtomic64(timer_delta, task_counter); +} - case THREAD_QOS_UTILITY: - thread->task->cpu_time_qos_stats.cpu_time_qos_utility += effective_qos_time; - break; +/* + * called with no thread locks held + * may hold task lock + */ +void +thread_update_qos_cpu_time(thread_t thread) +{ + thread_mtx_lock(thread); - case THREAD_QOS_LEGACY: - thread->task->cpu_time_qos_stats.cpu_time_qos_legacy += effective_qos_time; - break; - - case THREAD_QOS_USER_INITIATED: - thread->task->cpu_time_qos_stats.cpu_time_qos_user_initiated += effective_qos_time; - break; + spl_t s = splsched(); + thread_lock(thread); - 
case THREAD_QOS_USER_INTERACTIVE: - thread->task->cpu_time_qos_stats.cpu_time_qos_user_interactive += effective_qos_time; - break; - } + thread_update_qos_cpu_time_locked(thread); - return; + thread_unlock(thread); + splx(s); -out: - if (lock_needed) { - thread_unlock(thread); - splx(s); - } + thread_mtx_unlock(thread); } /* @@ -777,7 +791,7 @@ thread_recompute_priority( return; } else if (thread->effective_policy.thep_qos != THREAD_QOS_UNSPECIFIED) { int qos = thread->effective_policy.thep_qos; - int qos_ui_is_urgent = thread->effective_policy.qos_ui_is_urgent; + int qos_ui_is_urgent = thread->effective_policy.thep_qos_ui_is_urgent; int qos_relprio = -(thread->effective_policy.thep_qos_relprio); /* stored in task policy inverted */ int qos_scaled_relprio; @@ -792,6 +806,8 @@ thread_recompute_priority( qos_scaled_relprio += 1; } + /* TODO: factor in renice priority here? */ + priority += qos_scaled_relprio; } else { if (thread->importance > MAXPRI) @@ -804,38 +820,60 @@ thread_recompute_priority( priority += thread->task_priority; } - if (thread->saved_mode == TH_MODE_REALTIME && - thread->sched_flags & TH_SFLAG_FAILSAFE) - priority = DEPRESSPRI; - - if (thread->effective_policy.terminated == TRUE && priority < thread->task_priority) { - priority = thread->task_priority; - } + priority = MAX(priority, thread->user_promotion_basepri); + /* + * Clamp priority back into the allowed range for this task. + * The initial priority value could be out of this range due to: + * Task clamped to BG or Utility (max-pri is 4, or 20) + * Task is user task (max-pri is 63) + * Task is kernel task (max-pri is 95) + * Note that thread->importance is user-settable to any integer + * via THREAD_PRECEDENCE_POLICY. 
+ */ if (priority > thread->max_priority) priority = thread->max_priority; else if (priority < MINPRI) priority = MINPRI; + if (thread->saved_mode == TH_MODE_REALTIME && + thread->sched_flags & TH_SFLAG_FAILSAFE) + priority = DEPRESSPRI; + + if (thread->effective_policy.thep_terminated == TRUE) { + /* + * We temporarily want to override the expected priority to + * ensure that the thread exits in a timely manner. + * Note that this is allowed to exceed thread->max_priority + * so that the thread is no longer clamped to background + * during the final exit phase. + */ + if (priority < thread->task_priority) + priority = thread->task_priority; + if (priority < BASEPRI_DEFAULT) + priority = BASEPRI_DEFAULT; + } + sched_set_thread_base_priority(thread, priority); } -/* Called with the thread mutex held */ +/* Called with the task lock held, but not the thread mutex or spinlock */ void -thread_task_priority( - thread_t thread, - integer_t priority, - integer_t max_priority) +thread_policy_update_tasklocked( + thread_t thread, + integer_t priority, + integer_t max_priority, + task_pend_token_t pend_token) { - spl_t s; - - assert(thread != THREAD_NULL); + thread_mtx_lock(thread); - if (!thread->active || thread->policy_reset) + if (!thread->active || thread->policy_reset) { + thread_mtx_unlock(thread); return; + } - s = splsched(); + spl_t s = splsched(); thread_lock(thread); __unused @@ -845,10 +883,12 @@ thread_task_priority( thread->max_priority = max_priority; - thread_recompute_priority(thread); + thread_policy_update_spinlocked(thread, TRUE, pend_token); thread_unlock(thread); splx(s); + + thread_mtx_unlock(thread); } /* @@ -868,24 +908,11 @@ thread_policy_reset( s = splsched(); thread_lock(thread); - assert_thread_sched_count(thread); - if (thread->sched_flags & TH_SFLAG_FAILSAFE) sched_thread_mode_undemote(thread, TH_SFLAG_FAILSAFE); - assert_thread_sched_count(thread); - - if (thread->sched_flags & TH_SFLAG_THROTTLE_DEMOTED) - sched_thread_mode_undemote(thread, 
TH_SFLAG_THROTTLE_DEMOTED); - - assert_thread_sched_count(thread); - if (thread->sched_flags & TH_SFLAG_THROTTLED) - sched_set_thread_throttled(thread, FALSE); - - assert_thread_sched_count(thread); - - assert(thread->BG_COUNT == 0); + sched_thread_mode_undemote(thread, TH_SFLAG_THROTTLED); /* At this point, the various demotions should be inactive */ assert(!(thread->sched_flags & TH_SFLAG_DEMOTED_MASK)); @@ -904,9 +931,6 @@ thread_policy_reset( sched_set_thread_base_priority(thread, thread->task_priority); - assert(thread->BG_COUNT == 0); - assert_thread_sched_count(thread); - thread_unlock(thread); splx(s); } @@ -920,7 +944,6 @@ thread_policy_get( boolean_t *get_default) { kern_return_t result = KERN_SUCCESS; - spl_t s; if (thread == THREAD_NULL) return (KERN_INVALID_ARGUMENT); @@ -939,7 +962,7 @@ thread_policy_get( boolean_t timeshare = TRUE; if (!(*get_default)) { - s = splsched(); + spl_t s = splsched(); thread_lock(thread); if ( (thread->sched_mode != TH_MODE_REALTIME) && @@ -978,7 +1001,7 @@ thread_policy_get( info = (thread_time_constraint_policy_t)policy_info; if (!(*get_default)) { - s = splsched(); + spl_t s = splsched(); thread_lock(thread); if ( (thread->sched_mode == TH_MODE_REALTIME) || @@ -1017,7 +1040,7 @@ thread_policy_get( info = (thread_precedence_policy_t)policy_info; if (!(*get_default)) { - s = splsched(); + spl_t s = splsched(); thread_lock(thread); info->importance = thread->importance; @@ -1069,21 +1092,27 @@ thread_policy_get( break; } - info = (thread_policy_state_t)policy_info; + info = (thread_policy_state_t)(void*)policy_info; if (!(*get_default)) { info->flags = 0; + spl_t s = splsched(); + thread_lock(thread); + info->flags |= (thread->static_param ? THREAD_POLICY_STATE_FLAG_STATIC_PARAM : 0); - /* - * Unlock the thread mutex and directly return. - * This is necessary because proc_get_thread_policy() - * takes the task lock. 
- */ - thread_mtx_unlock(thread); - proc_get_thread_policy(thread, info); - return (result); + info->thps_requested_policy = *(uint64_t*)(void*)(&thread->requested_policy); + info->thps_effective_policy = *(uint64_t*)(void*)(&thread->effective_policy); + + info->thps_user_promotions = thread->user_promotions; + info->thps_user_promotion_basepri = thread->user_promotion_basepri; + info->thps_ipc_overrides = thread->ipc_overrides; + + proc_get_thread_policy_bitfield(thread, info); + + thread_unlock(thread); + splx(s); } else { info->requested = 0; info->effective = 0; @@ -1096,7 +1125,7 @@ thread_policy_get( case THREAD_LATENCY_QOS_POLICY: { thread_latency_qos_policy_t info = (thread_latency_qos_policy_t) policy_info; - uint32_t plqos; + thread_latency_qos_t plqos; if (*count < THREAD_LATENCY_QOS_POLICY_COUNT) { result = KERN_INVALID_ARGUMENT; @@ -1106,7 +1135,7 @@ thread_policy_get( if (*get_default) { plqos = 0; } else { - plqos = thread->effective_policy.t_latency_qos; + plqos = proc_get_thread_policy_locked(thread, TASK_POLICY_ATTRIBUTE, TASK_POLICY_LATENCY_QOS, NULL); } info->thread_latency_qos_tier = qos_latency_policy_package(plqos); @@ -1116,7 +1145,7 @@ thread_policy_get( case THREAD_THROUGHPUT_QOS_POLICY: { thread_throughput_qos_policy_t info = (thread_throughput_qos_policy_t) policy_info; - uint32_t ptqos; + thread_throughput_qos_t ptqos; if (*count < THREAD_THROUGHPUT_QOS_POLICY_COUNT) { result = KERN_INVALID_ARGUMENT; @@ -1126,7 +1155,7 @@ thread_policy_get( if (*get_default) { ptqos = 0; } else { - ptqos = thread->effective_policy.t_through_qos; + ptqos = proc_get_thread_policy_locked(thread, TASK_POLICY_ATTRIBUTE, TASK_POLICY_THROUGH_QOS, NULL); } info->thread_throughput_qos_tier = qos_throughput_policy_package(ptqos); @@ -1134,7 +1163,6 @@ thread_policy_get( break; case THREAD_QOS_POLICY: - case THREAD_QOS_POLICY_OVERRIDE: { thread_qos_policy_t info = (thread_qos_policy_t)policy_info; @@ -1144,14 +1172,11 @@ thread_policy_get( } if (!(*get_default)) { 
- if (flavor == THREAD_QOS_POLICY_OVERRIDE) { - info->qos_tier = thread->requested_policy.thrp_qos_override; - /* TODO: handle importance overrides */ - info->tier_importance = 0; - } else { - info->qos_tier = thread->requested_policy.thrp_qos; - info->tier_importance = thread->importance; - } + int relprio_value = 0; + info->qos_tier = proc_get_thread_policy_locked(thread, TASK_POLICY_ATTRIBUTE, + TASK_POLICY_QOS_AND_RELPRIO, &relprio_value); + + info->tier_importance = -relprio_value; } else { info->qos_tier = THREAD_QOS_UNSPECIFIED; info->tier_importance = 0; @@ -1208,3 +1233,1709 @@ thread_policy_destroy_work_interval( thread_mtx_unlock(thread); return KERN_SUCCESS; } + +void +thread_policy_create(thread_t thread) +{ + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + (IMPORTANCE_CODE(IMP_UPDATE, (IMP_UPDATE_TASK_CREATE | TASK_POLICY_THREAD))) | DBG_FUNC_START, + thread_tid(thread), theffective_0(thread), + theffective_1(thread), thread->base_pri, 0); + + /* We pass a pend token but ignore it */ + struct task_pend_token pend_token = {}; + + thread_policy_update_internal_spinlocked(thread, TRUE, &pend_token); + + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + (IMPORTANCE_CODE(IMP_UPDATE, (IMP_UPDATE_TASK_CREATE | TASK_POLICY_THREAD))) | DBG_FUNC_END, + thread_tid(thread), theffective_0(thread), + theffective_1(thread), thread->base_pri, 0); +} + +static void +thread_policy_update_spinlocked(thread_t thread, boolean_t recompute_priority, task_pend_token_t pend_token) +{ + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + (IMPORTANCE_CODE(IMP_UPDATE, TASK_POLICY_THREAD) | DBG_FUNC_START), + thread_tid(thread), theffective_0(thread), + theffective_1(thread), thread->base_pri, 0); + + thread_policy_update_internal_spinlocked(thread, recompute_priority, pend_token); + + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + (IMPORTANCE_CODE(IMP_UPDATE, TASK_POLICY_THREAD)) | DBG_FUNC_END, + thread_tid(thread), theffective_0(thread), + theffective_1(thread), thread->base_pri, 0); +} + + + +/* + * One 
thread state update function TO RULE THEM ALL
+ *
+ * This function updates the thread effective policy fields
+ * and pushes the results to the relevant subsystems.
+ *
+ * Returns TRUE if a pended action needs to be run.
+ *
+ * Called with thread spinlock locked, task may be locked, thread mutex may be locked
+ */
+static void
+thread_policy_update_internal_spinlocked(thread_t thread, boolean_t recompute_priority,
+                                         task_pend_token_t pend_token)
+{
+	/*
+	 * Step 1:
+	 *  Gather requested policy and effective task state
+	 *  (both structures are copied by value into locals)
+	 *
+	 * NOTE(review): the header comment says this returns TRUE when a pended
+	 * action is needed, but the function is declared void; pended work is
+	 * reported only through the flags set in *pend_token (see Step 4).
+	 */
+
+	struct thread_requested_policy requested = thread->requested_policy;
+	struct task_effective_policy task_effective = thread->task->effective_policy;
+
+	/*
+	 * Step 2:
+	 *  Calculate new effective policies from requested policy, task and thread state
+	 *  Rules:
+	 *      Don't change requested, it won't take effect
+	 */
+
+	struct thread_effective_policy next = {};
+
+	next.thep_qos_ui_is_urgent = task_effective.tep_qos_ui_is_urgent;
+
+	uint32_t next_qos = requested.thrp_qos;
+
+	if (requested.thrp_qos != THREAD_QOS_UNSPECIFIED) { /* override/promote/ipc_override are only folded in when a base QoS was requested */
+		if (requested.thrp_qos_override != THREAD_QOS_UNSPECIFIED)
+			next_qos = MAX(requested.thrp_qos_override, next_qos);
+
+		if (requested.thrp_qos_promote != THREAD_QOS_UNSPECIFIED)
+			next_qos = MAX(requested.thrp_qos_promote, next_qos);
+
+		if (requested.thrp_qos_ipc_override != THREAD_QOS_UNSPECIFIED)
+			next_qos = MAX(requested.thrp_qos_ipc_override, next_qos);
+	}
+
+	next.thep_qos = next_qos;
+
+	/* A task clamp will result in an effective QoS even when requested is UNSPECIFIED */
+	if (task_effective.tep_qos_clamp != THREAD_QOS_UNSPECIFIED) {
+		if (next.thep_qos != THREAD_QOS_UNSPECIFIED)
+			next.thep_qos = MIN(task_effective.tep_qos_clamp, next.thep_qos);
+		else
+			next.thep_qos = task_effective.tep_qos_clamp;
+	}
+
+	/*
+	 * Extract outbound-promotion QoS before applying task ceiling or BG clamp
+	 * This allows QoS promotions to work properly even after the process is unclamped.
+	 */
+	next.thep_qos_promote = next.thep_qos;
+
+	/* The ceiling only applies to threads that are in the QoS world */
+	if (task_effective.tep_qos_ceiling != THREAD_QOS_UNSPECIFIED &&
+	    next.thep_qos                  != THREAD_QOS_UNSPECIFIED) {
+		next.thep_qos = MIN(task_effective.tep_qos_ceiling, next.thep_qos);
+	}
+
+	/*
+	 * The QoS relative priority is only applicable when the original programmer's
+	 * intended (requested) QoS is in effect.  When the QoS is clamped (e.g.
+	 * USER_INITIATED-13REL clamped to UTILITY), the relative priority is not honored,
+	 * since otherwise it would be lower than unclamped threads. Similarly, in the
+	 * presence of boosting, the programmer doesn't know what other actors
+	 * are boosting the thread.
+	 */
+	if ((requested.thrp_qos != THREAD_QOS_UNSPECIFIED) &&
+	    (requested.thrp_qos == next.thep_qos) &&
+	    (requested.thrp_qos_override == THREAD_QOS_UNSPECIFIED)) {
+		next.thep_qos_relprio = requested.thrp_qos_relprio;
+	} else {
+		next.thep_qos_relprio = 0;
+	}
+
+	/* Calculate DARWIN_BG */
+	boolean_t wants_darwinbg        = FALSE;
+	boolean_t wants_all_sockets_bg  = FALSE; /* Do I want my existing sockets to be bg */
+
+	/*
+	 * If DARWIN_BG has been requested at either level, it's engaged.
+	 * darwinbg threads always create bg sockets,
+	 * but only some types of darwinbg change the sockets
+	 * after they're created
+	 */
+	if (requested.thrp_int_darwinbg || requested.thrp_ext_darwinbg)
+		wants_all_sockets_bg = wants_darwinbg = TRUE;
+
+	if (requested.thrp_pidbind_bg)
+		wants_all_sockets_bg = wants_darwinbg = TRUE;
+
+	if (task_effective.tep_darwinbg)
+		wants_darwinbg = TRUE;
+
+	if (next.thep_qos == THREAD_QOS_BACKGROUND ||
+	    next.thep_qos == THREAD_QOS_MAINTENANCE)
+		wants_darwinbg = TRUE;
+
+	/* Calculate side effects of DARWIN_BG */
+
+	if (wants_darwinbg)
+		next.thep_darwinbg = 1;
+
+	if (next.thep_darwinbg || task_effective.tep_new_sockets_bg)
+		next.thep_new_sockets_bg = 1;
+
+	/* Don't use task_effective.tep_all_sockets_bg here */
+	if (wants_all_sockets_bg)
+		next.thep_all_sockets_bg = 1;
+
+	/* darwinbg implies background QOS (or lower) */
+	if (next.thep_darwinbg &&
+	    (next.thep_qos > THREAD_QOS_BACKGROUND || next.thep_qos == THREAD_QOS_UNSPECIFIED)) {
+		next.thep_qos = THREAD_QOS_BACKGROUND;
+		next.thep_qos_relprio = 0;
+	}
+
+	/* Calculate IO policy: the largest tier value among all contributors below wins */
+
+	int iopol = THROTTLE_LEVEL_TIER0;
+
+	/* Factor in the task's IO policy */
+	if (next.thep_darwinbg)
+		iopol = MAX(iopol, task_effective.tep_bg_iotier);
+
+	iopol = MAX(iopol, task_effective.tep_io_tier);
+
+	/* Look up the associated IO tier value for the QoS class */
+	iopol = MAX(iopol, thread_qos_policy_params.qos_iotier[next.thep_qos]);
+
+	iopol = MAX(iopol, requested.thrp_int_iotier);
+	iopol = MAX(iopol, requested.thrp_ext_iotier);
+
+	next.thep_io_tier = iopol;
+
+	/*
+	 * If a QoS override is causing IO to go into a lower tier, we also set
+	 * the passive bit so that a thread doesn't end up stuck in its own throttle
+	 * window when the override goes away.
+	 */
+	boolean_t qos_io_override_active = FALSE;
+	if (thread_qos_policy_params.qos_iotier[next.thep_qos] <
+	    thread_qos_policy_params.qos_iotier[requested.thrp_qos])
+		qos_io_override_active = TRUE;
+
+	/* Calculate Passive IO policy */
+	if (requested.thrp_ext_iopassive ||
+	    requested.thrp_int_iopassive ||
+	    qos_io_override_active ||
+	    task_effective.tep_io_passive )
+		next.thep_io_passive = 1;
+
+	/* Calculate timer QOS */
+	uint32_t latency_qos = requested.thrp_latency_qos;
+
+	latency_qos = MAX(latency_qos, task_effective.tep_latency_qos);
+	latency_qos = MAX(latency_qos, thread_qos_policy_params.qos_latency_qos[next.thep_qos]);
+
+	next.thep_latency_qos = latency_qos;
+
+	/* Calculate throughput QOS */
+	uint32_t through_qos = requested.thrp_through_qos;
+
+	through_qos = MAX(through_qos, task_effective.tep_through_qos);
+	through_qos = MAX(through_qos, thread_qos_policy_params.qos_through_qos[next.thep_qos]);
+
+	next.thep_through_qos = through_qos;
+
+	if (task_effective.tep_terminated || requested.thrp_terminated) {
+		/* Shoot down the throttles that slow down exit or response to SIGTERM */
+		next.thep_terminated    = 1;
+		next.thep_darwinbg      = 0;
+		next.thep_io_tier       = THROTTLE_LEVEL_TIER0;
+		next.thep_qos           = THREAD_QOS_UNSPECIFIED;
+		next.thep_latency_qos   = LATENCY_QOS_TIER_UNSPECIFIED;
+		next.thep_through_qos   = THROUGHPUT_QOS_TIER_UNSPECIFIED;
+	}
+
+	/*
+	 * Step 3:
+	 *  Swap out old policy for new policy
+	 *  (the old value is kept in 'prev' so Steps 4 and 5 can diff against it)
+	 */
+
+	struct thread_effective_policy prev = thread->effective_policy;
+
+	thread_update_qos_cpu_time_locked(thread); /* called before the new policy is stored below */
+
+	/* This is the point where the new values become visible to other threads */
+	thread->effective_policy = next;
+
+	/*
+	 * Step 4:
+	 *  Pend updates that can't be done while holding the thread lock
+	 *  (only flags in *pend_token are set here; nothing is executed)
+	 */
+
+	if (prev.thep_all_sockets_bg != next.thep_all_sockets_bg)
+		pend_token->tpt_update_sockets = 1;
+
+	/* TODO: Doesn't this only need to be done if the throttle went up? */
+	if (prev.thep_io_tier != next.thep_io_tier)
+		pend_token->tpt_update_throttle = 1;
+
+	/*
+	 * Check for the attributes that sfi_thread_classify() consults,
+	 *  and trigger SFI re-evaluation.
+	 */
+	if (prev.thep_qos      != next.thep_qos         ||
+	    prev.thep_darwinbg != next.thep_darwinbg    )
+		pend_token->tpt_update_thread_sfi = 1;
+
+	/*
+	 * Step 5:
+	 *  Update other subsystems as necessary if something has changed
+	 */
+
+	/* Check for the attributes that thread_recompute_priority() consults */
+	if (prev.thep_qos                != next.thep_qos                ||
+	    prev.thep_qos_relprio        != next.thep_qos_relprio        ||
+	    prev.thep_qos_ui_is_urgent   != next.thep_qos_ui_is_urgent   ||
+	    prev.thep_terminated         != next.thep_terminated         ||
+	    pend_token->tpt_force_recompute_pri == 1                     ||
+	    recompute_priority) { /* callers can force the recompute via this argument or the pend token */
+		thread_recompute_priority(thread);
+	}
+}
+
+
+/*
+ * Initiate a thread policy state transition on a thread with its TID
+ * Useful if you cannot guarantee the thread won't get terminated
+ * Precondition: No locks are held
+ * Will take task lock - using the non-tid variant is faster
+ * if you already have a thread ref.
+ *
+ * Does nothing if task_findtid() returns THREAD_NULL; otherwise the
+ * reference it returned is dropped with thread_deallocate() after
+ * proc_set_thread_policy() has run.
+ */
+void
+proc_set_thread_policy_with_tid(task_t     task,
+                                uint64_t   tid,
+                                int        category,
+                                int        flavor,
+                                int        value)
+{
+	/* takes task lock, returns ref'ed thread or NULL */
+	thread_t thread = task_findtid(task, tid);
+
+	if (thread == THREAD_NULL)
+		return;
+
+	proc_set_thread_policy(thread, category, flavor, value);
+
+	thread_deallocate(thread);
+}
+
+/*
+ * Initiate a thread policy transition on a thread
+ * This path supports networking transitions (i.e.
darwinbg transitions)
+ * Precondition: No locks are held
+ */
+void
+proc_set_thread_policy(thread_t   thread,
+                       int        category,
+                       int        flavor,
+                       int        value)
+{
+	struct task_pend_token pend_token = {}; /* filled in while locked, consumed after the mutex is dropped */
+
+	thread_mtx_lock(thread);
+
+	proc_set_thread_policy_locked(thread, category, flavor, value, 0, &pend_token); /* value2 is passed as 0 */
+
+	thread_mtx_unlock(thread);
+
+	thread_policy_update_complete_unlocked(thread, &pend_token);
+}
+
+/*
+ * KPI for pthread kext to call to set thread base QoS values during a workq wakeup
+ * May be called with interrupts disabled and workqueue/waitqueue/kqueue locks held
+ *
+ * Does NOT do update completion, so the thread MUST be in a safe place WRT
+ * IO throttling and SFI.
+ *
+ * TODO: Can I assert 'it must be in a safe place'?
+ *
+ * Returns KERN_FAILURE when qos_tier is outside [0, THREAD_QOS_LAST], when
+ * relprio is outside [THREAD_QOS_MIN_TIER_IMPORTANCE, 0], when relprio is
+ * non-zero together with THREAD_QOS_UNSPECIFIED, or when thread->static_param
+ * is not set; returns KERN_SUCCESS otherwise.  Each condition is asserted
+ * first and then re-checked explicitly (presumably so builds where assert()
+ * compiles out still reject bad input - confirm).
+ *
+ * relprio is negated before being stored as the requested relative priority.
+ */
+kern_return_t
+thread_set_workq_qos(thread_t   thread,
+                     int        qos_tier,
+                     int        relprio) /* relprio is -16 to 0 */
+{
+	assert(qos_tier >= 0 && qos_tier <= THREAD_QOS_LAST);
+	assert(relprio  <= 0 && relprio  >= THREAD_QOS_MIN_TIER_IMPORTANCE);
+
+	if (!(qos_tier >= 0 && qos_tier <= THREAD_QOS_LAST))
+		return KERN_FAILURE;
+	if (!(relprio  <= 0 && relprio  >= THREAD_QOS_MIN_TIER_IMPORTANCE))
+		return KERN_FAILURE;
+
+	if (qos_tier == THREAD_QOS_UNSPECIFIED) {
+		assert(relprio == 0);
+		if (relprio != 0)
+			return KERN_FAILURE;
+	}
+
+	assert(thread->static_param);
+	if (!thread->static_param) {
+		return KERN_FAILURE;
+	}
+
+	/* Concern: this doesn't hold the mutex... */
+	//if (!thread->active)
+	//	return KERN_TERMINATED;
+
+	struct task_pend_token pend_token = {};
+
+	proc_set_thread_policy_locked(thread, TASK_POLICY_ATTRIBUTE, TASK_POLICY_QOS_AND_RELPRIO, qos_tier, -relprio, &pend_token);
+
+	assert(pend_token.tpt_update_sockets == 0);
+	/* we don't need to update throttle or sfi because pthread kext promises the thread is in a safe place */
+	/* TODO: Do we need to update SFI to ensure it gets tagged with the AST? */
+
+	return KERN_SUCCESS;
+}
+
+
+/*
+ * Do the things that can't be done while holding a thread mutex.
+ * These are set up to call back into thread policy to get the latest value,
+ * so they don't have to be synchronized with the update.
+ * The only required semantic is 'call this sometime after updating effective policy'
+ *
+ * Precondition: Thread mutex is not held
+ *
+ * This may be called with the task lock held, but in that case it won't be
+ * called with tpt_update_sockets set.
+ */
+void
+thread_policy_update_complete_unlocked(thread_t thread, task_pend_token_t pend_token)
+{
+#ifdef MACH_BSD
+	if (pend_token->tpt_update_sockets) /* socket update is only compiled in under MACH_BSD */
+		proc_apply_task_networkbg(thread->task->bsd_info, thread);
+#endif /* MACH_BSD */
+
+	if (pend_token->tpt_update_throttle)
+		rethrottle_thread(thread->uthread);
+
+	if (pend_token->tpt_update_thread_sfi)
+		sfi_reevaluate(thread);
+}
+
+/*
+ * Set and update thread policy
+ * Thread mutex might be held
+ *
+ * Raises to splsched() and takes the thread spinlock around
+ * proc_set_thread_policy_spinlocked(), then releases both in reverse order.
+ * Follow-up work is reported through *pend_token; nothing is completed here.
+ */
+static void
+proc_set_thread_policy_locked(thread_t          thread,
+                              int               category,
+                              int               flavor,
+                              int               value,
+                              int               value2,
+                              task_pend_token_t pend_token)
+{
+	spl_t s = splsched();
+	thread_lock(thread);
+
+	proc_set_thread_policy_spinlocked(thread, category, flavor, value, value2, pend_token);
+
+	thread_unlock(thread);
+	splx(s);
+}
+
+/*
+ * Set and update thread policy
+ * Thread spinlock is held
+ *
+ * Stores the new requested value, then recomputes the effective policy with
+ * recompute_priority == FALSE.  The two steps are bracketed by
+ * DBG_FUNC_START / DBG_FUNC_END trace points.
+ */
+static void
+proc_set_thread_policy_spinlocked(thread_t          thread,
+                                  int               category,
+                                  int               flavor,
+                                  int               value,
+                                  int               value2,
+                                  task_pend_token_t pend_token)
+{
+	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
+	                          (IMPORTANCE_CODE(flavor, (category | TASK_POLICY_THREAD))) | DBG_FUNC_START,
+	                          thread_tid(thread), threquested_0(thread),
+	                          threquested_1(thread), value, 0);
+
+	thread_set_requested_policy_spinlocked(thread, category, flavor, value, value2);
+
+	thread_policy_update_spinlocked(thread, FALSE, pend_token);
+
+	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
+	                          (IMPORTANCE_CODE(flavor, (category | TASK_POLICY_THREAD))) | DBG_FUNC_END,
+	                          thread_tid(thread), threquested_0(thread),
+	                          threquested_1(thread), tpending(pend_token), 0);
+}
+
+/*
+
* Set the requested state for a specific flavor to a specific value.
+ */
+static void
+thread_set_requested_policy_spinlocked(thread_t     thread,
+                                       int          category,
+                                       int          flavor,
+                                       int          value,
+                                       int          value2)
+{
+	int tier, passive;
+
+	struct thread_requested_policy requested = thread->requested_policy; /* edit a local copy; written back after the switch */
+
+	switch (flavor) {
+
+	/* Category: EXTERNAL and INTERNAL, thread and task */
+	/* (any category other than TASK_POLICY_EXTERNAL lands in the internal field) */
+
+		case TASK_POLICY_DARWIN_BG:
+			if (category == TASK_POLICY_EXTERNAL)
+				requested.thrp_ext_darwinbg = value;
+			else
+				requested.thrp_int_darwinbg = value;
+			break;
+
+		case TASK_POLICY_IOPOL:
+			proc_iopol_to_tier(value, &tier, &passive); /* one iopol value sets both the tier and the passive field */
+			if (category == TASK_POLICY_EXTERNAL) {
+				requested.thrp_ext_iotier  = tier;
+				requested.thrp_ext_iopassive = passive;
+			} else {
+				requested.thrp_int_iotier  = tier;
+				requested.thrp_int_iopassive = passive;
+			}
+			break;
+
+		case TASK_POLICY_IO:
+			if (category == TASK_POLICY_EXTERNAL)
+				requested.thrp_ext_iotier = value;
+			else
+				requested.thrp_int_iotier = value;
+			break;
+
+		case TASK_POLICY_PASSIVE_IO:
+			if (category == TASK_POLICY_EXTERNAL)
+				requested.thrp_ext_iopassive = value;
+			else
+				requested.thrp_int_iopassive = value;
+			break;
+
+	/* Category: ATTRIBUTE, thread only */
+
+		case TASK_POLICY_PIDBIND_BG:
+			assert(category == TASK_POLICY_ATTRIBUTE);
+			requested.thrp_pidbind_bg = value;
+			break;
+
+		case TASK_POLICY_LATENCY_QOS:
+			assert(category == TASK_POLICY_ATTRIBUTE);
+			requested.thrp_latency_qos = value;
+			break;
+
+		case TASK_POLICY_THROUGH_QOS:
+			assert(category == TASK_POLICY_ATTRIBUTE);
+			requested.thrp_through_qos = value;
+			break;
+
+		case TASK_POLICY_QOS:
+			assert(category == TASK_POLICY_ATTRIBUTE);
+			requested.thrp_qos = value;
+			break;
+
+		case TASK_POLICY_QOS_OVERRIDE:
+			assert(category == TASK_POLICY_ATTRIBUTE);
+			requested.thrp_qos_override = value;
+			break;
+
+		case TASK_POLICY_QOS_AND_RELPRIO: /* the only flavor that consumes value2 */
+			assert(category == TASK_POLICY_ATTRIBUTE);
+			requested.thrp_qos = value;
+			requested.thrp_qos_relprio = value2;
+			DTRACE_BOOST3(qos_set, uint64_t, thread->thread_id, int, requested.thrp_qos, int, requested.thrp_qos_relprio);
+			break;
+
+		case TASK_POLICY_QOS_PROMOTE:
+			assert(category == TASK_POLICY_ATTRIBUTE);
+			requested.thrp_qos_promote = value;
+			break;
+
+		case TASK_POLICY_QOS_IPC_OVERRIDE:
+			assert(category == TASK_POLICY_ATTRIBUTE);
+			requested.thrp_qos_ipc_override = value;
+			break;
+
+		case TASK_POLICY_TERMINATED:
+			assert(category == TASK_POLICY_ATTRIBUTE);
+			requested.thrp_terminated = value;
+			break;
+
+		default:
+			panic("unknown task policy: %d %d %d", category, flavor, value);
+			break;
+	}
+
+	thread->requested_policy = requested;
+}
+
+/*
+ * Gets what you set. Effective values may be different.
+ * Precondition: No locks are held
+ *
+ * Takes the thread mutex and passes NULL as value2, so it must not be used
+ * with TASK_POLICY_QOS_AND_RELPRIO: thread_get_requested_policy_spinlocked()
+ * asserts value2 != NULL and writes through it for that flavor.
+ */
+int
+proc_get_thread_policy(thread_t   thread,
+                       int        category,
+                       int        flavor)
+{
+	int value = 0;
+	thread_mtx_lock(thread);
+	value = proc_get_thread_policy_locked(thread, category, flavor, NULL);
+	thread_mtx_unlock(thread);
+	return value;
+}
+
+static int
+proc_get_thread_policy_locked(thread_t   thread,
+                              int        category,
+                              int        flavor,
+                              int*       value2)
+{
+	int value = 0;
+
+	spl_t s = splsched(); /* the requested policy is read under splsched + thread spinlock */
+	thread_lock(thread);
+
+	value = thread_get_requested_policy_spinlocked(thread, category, flavor, value2);
+
+	thread_unlock(thread);
+	splx(s);
+
+	return value;
+}
+
+/*
+ * Gets what you set. Effective values may be different.
+ */
+static int
+thread_get_requested_policy_spinlocked(thread_t thread,
+                                       int      category,
+                                       int      flavor,
+                                       int*     value2)
+{
+	int value = 0;
+
+	struct thread_requested_policy requested = thread->requested_policy; /* read from a by-value snapshot */
+
+	switch (flavor) {
+		case TASK_POLICY_DARWIN_BG:
+			if (category == TASK_POLICY_EXTERNAL)
+				value = requested.thrp_ext_darwinbg;
+			else
+				value = requested.thrp_int_darwinbg;
+			break;
+		case TASK_POLICY_IOPOL:
+			if (category == TASK_POLICY_EXTERNAL)
+				value = proc_tier_to_iopol(requested.thrp_ext_iotier,
+				                           requested.thrp_ext_iopassive);
+			else
+				value = proc_tier_to_iopol(requested.thrp_int_iotier,
+				                           requested.thrp_int_iopassive);
+			break;
+		case TASK_POLICY_IO:
+			if (category == TASK_POLICY_EXTERNAL)
+				value = requested.thrp_ext_iotier;
+			else
+				value = requested.thrp_int_iotier;
+			break;
+		case TASK_POLICY_PASSIVE_IO:
+			if (category == TASK_POLICY_EXTERNAL)
+				value = requested.thrp_ext_iopassive;
+			else
+				value = requested.thrp_int_iopassive;
+			break;
+		case TASK_POLICY_QOS:
+			assert(category == TASK_POLICY_ATTRIBUTE);
+			value = requested.thrp_qos;
+			break;
+		case TASK_POLICY_QOS_OVERRIDE:
+			assert(category == TASK_POLICY_ATTRIBUTE);
+			value = requested.thrp_qos_override;
+			break;
+		case TASK_POLICY_LATENCY_QOS:
+			assert(category == TASK_POLICY_ATTRIBUTE);
+			value = requested.thrp_latency_qos;
+			break;
+		case TASK_POLICY_THROUGH_QOS:
+			assert(category == TASK_POLICY_ATTRIBUTE);
+			value = requested.thrp_through_qos;
+			break;
+		case TASK_POLICY_QOS_AND_RELPRIO: /* the only flavor that writes *value2; callers must pass a non-NULL pointer */
+			assert(category == TASK_POLICY_ATTRIBUTE);
+			assert(value2 != NULL);
+			value = requested.thrp_qos;
+			*value2 = requested.thrp_qos_relprio;
+			break;
+		case TASK_POLICY_QOS_PROMOTE:
+			assert(category == TASK_POLICY_ATTRIBUTE);
+			value = requested.thrp_qos_promote;
+			break;
+		case TASK_POLICY_QOS_IPC_OVERRIDE:
+			assert(category == TASK_POLICY_ATTRIBUTE);
+			value = requested.thrp_qos_ipc_override;
+			break;
+		case TASK_POLICY_TERMINATED:
+			assert(category == TASK_POLICY_ATTRIBUTE);
+			value = requested.thrp_terminated;
+			break;
+
+		default: /* NOTE(review): TASK_POLICY_PIDBIND_BG is accepted by thread_set_requested_policy_spinlocked() but has no case here, so reading it back would panic - confirm that is intended */
+			panic("unknown policy_flavor %d", flavor);
+			break;
+	}
+
+	return value;
+}
+
+/*
+ * Gets what is actually in effect, for subsystems which pull policy instead of receive updates.
+ *
+ * NOTE: This accessor does not take the task or thread lock.
+ * Notifications of state updates need to be externally synchronized with state queries.
+ * This routine *MUST* remain interrupt safe, as it is potentially invoked
+ * within the context of a timer interrupt.
+ *
+ * TODO: I think we can get away with architecting this such that we don't need to look at the task ever.
+ *      Is that a good idea? Maybe it's best to avoid evaluate-all-the-threads updates.
+ *      I don't think that cost is worth not having the right answer.
+ *
+ * Unknown flavors panic.  Besides thread->effective_policy, the IO and
+ * PASSIVE_IO flavors also read thread->iotier_override, and
+ * ALL_SOCKETS_BG also reads the task's effective policy.
+ */
+int
+proc_get_effective_thread_policy(thread_t thread,
+                                 int      flavor)
+{
+	int value = 0;
+
+	switch (flavor) {
+		case TASK_POLICY_DARWIN_BG:
+			/*
+			 * This call is used within the timer layer, as well as
+			 * prioritizing requests to the graphics system.
+			 * It also informs SFI and originator-bg-state.
+			 * Returns 1 for background mode, 0 for normal mode
+			 */
+
+			value = thread->effective_policy.thep_darwinbg ? 1 : 0;
+			break;
+		case TASK_POLICY_IO:
+			/*
+			 * The I/O system calls here to find out what throttling tier to apply to an operation.
+			 * Returns THROTTLE_LEVEL_* values
+			 * (an iotier_override other than THROTTLE_LEVEL_NONE can only lower the result, via MIN)
+			 */
+			value = thread->effective_policy.thep_io_tier;
+			if (thread->iotier_override != THROTTLE_LEVEL_NONE)
+				value = MIN(value, thread->iotier_override);
+			break;
+		case TASK_POLICY_PASSIVE_IO:
+			/*
+			 * The I/O system calls here to find out whether an operation should be passive.
+			 * (i.e. not cause operations with lower throttle tiers to be throttled)
+			 * Returns 1 for passive mode, 0 for normal mode
+			 *
+			 * If an override is causing IO to go into a lower tier, we also set
+			 * the passive bit so that a thread doesn't end up stuck in its own throttle
+			 * window when the override goes away.
+			 */
+			value = thread->effective_policy.thep_io_passive ? 1 : 0;
+			if (thread->iotier_override != THROTTLE_LEVEL_NONE &&
+			    thread->iotier_override < thread->effective_policy.thep_io_tier)
+				value = 1;
+			break;
+		case TASK_POLICY_ALL_SOCKETS_BG:
+			/*
+			 * do_background_socket() calls this to determine whether
+			 * it should change the thread's sockets
+			 * Returns 1 for background mode, 0 for normal mode
+			 * This consults both thread and task so un-DBGing a thread while the task is BG
+			 * doesn't get you out of the network throttle.
+			 */
+			value = (thread->effective_policy.thep_all_sockets_bg ||
+			         thread->task->effective_policy.tep_all_sockets_bg) ? 1 : 0;
+			break;
+		case TASK_POLICY_NEW_SOCKETS_BG:
+			/*
+			 * socreate() calls this to determine if it should mark a new socket as background
+			 * Returns 1 for background mode, 0 for normal mode
+			 */
+			value = thread->effective_policy.thep_new_sockets_bg ? 1 : 0;
+			break;
+		case TASK_POLICY_LATENCY_QOS:
+			/*
+			 * timer arming calls into here to find out the timer coalescing level
+			 * Returns a latency QoS tier (0-6)
+			 */
+			value = thread->effective_policy.thep_latency_qos;
+			break;
+		case TASK_POLICY_THROUGH_QOS:
+			/*
+			 * This value is passed into the urgency callout from the scheduler
+			 * to the performance management subsystem.
+			 *
+			 * Returns a throughput QoS tier (0-6)
+			 */
+			value = thread->effective_policy.thep_through_qos;
+			break;
+		case TASK_POLICY_QOS:
+			/*
+			 * This is communicated to the performance management layer and SFI.
+			 *
+			 * Returns a QoS policy tier
+			 */
+			value = thread->effective_policy.thep_qos;
+			break;
+		default:
+			panic("unknown thread policy flavor %d", flavor);
+			break;
+	}
+
+	return value;
+}
+
+
+/*
+ * (integer_t) casts limit the number of bits we can fit here
+ * this interface is deprecated and replaced by the _EXT struct ?
+ */
+static void
+proc_get_thread_policy_bitfield(thread_t thread, thread_policy_state_t info)
+{
+	uint64_t bits = 0; /* accumulated as 64 bits, narrowed by the (integer_t) casts below */
+	struct thread_requested_policy requested = thread->requested_policy;
+
+	bits |= (requested.thrp_int_darwinbg    ? POLICY_REQ_INT_DARWIN_BG  : 0);
+	bits |= (requested.thrp_ext_darwinbg    ? POLICY_REQ_EXT_DARWIN_BG  : 0);
+	bits |= (requested.thrp_int_iotier      ? (((uint64_t)requested.thrp_int_iotier) << POLICY_REQ_INT_IO_TIER_SHIFT) : 0);
+	bits |= (requested.thrp_ext_iotier      ? (((uint64_t)requested.thrp_ext_iotier) << POLICY_REQ_EXT_IO_TIER_SHIFT) : 0);
+	bits |= (requested.thrp_int_iopassive   ? POLICY_REQ_INT_PASSIVE_IO : 0);
+	bits |= (requested.thrp_ext_iopassive   ? POLICY_REQ_EXT_PASSIVE_IO : 0);
+
+	bits |= (requested.thrp_qos             ? (((uint64_t)requested.thrp_qos) << POLICY_REQ_TH_QOS_SHIFT) : 0);
+	bits |= (requested.thrp_qos_override    ? (((uint64_t)requested.thrp_qos_override) << POLICY_REQ_TH_QOS_OVER_SHIFT)   : 0);
+
+	bits |= (requested.thrp_pidbind_bg      ? POLICY_REQ_PIDBIND_BG     : 0);
+
+	bits |= (requested.thrp_latency_qos     ? (((uint64_t)requested.thrp_latency_qos) << POLICY_REQ_BASE_LATENCY_QOS_SHIFT) : 0);
+	bits |= (requested.thrp_through_qos     ? (((uint64_t)requested.thrp_through_qos) << POLICY_REQ_BASE_THROUGH_QOS_SHIFT) : 0);
+
+	info->requested = (integer_t) bits;
+	bits = 0; /* reuse the accumulator for the effective-policy word */
+
+	struct thread_effective_policy effective = thread->effective_policy;
+
+	bits |= (effective.thep_darwinbg        ? POLICY_EFF_DARWIN_BG      : 0);
+
+	bits |= (effective.thep_io_tier         ? (((uint64_t)effective.thep_io_tier) << POLICY_EFF_IO_TIER_SHIFT) : 0);
+	bits |= (effective.thep_io_passive      ? POLICY_EFF_IO_PASSIVE     : 0);
+	bits |= (effective.thep_all_sockets_bg  ? POLICY_EFF_ALL_SOCKETS_BG : 0);
+	bits |= (effective.thep_new_sockets_bg  ? POLICY_EFF_NEW_SOCKETS_BG : 0);
+
+	bits |= (effective.thep_qos             ? (((uint64_t)effective.thep_qos) << POLICY_EFF_TH_QOS_SHIFT) : 0);
+
+	bits |= (effective.thep_latency_qos     ? (((uint64_t)effective.thep_latency_qos) << POLICY_EFF_LATENCY_QOS_SHIFT) : 0);
+	bits |= (effective.thep_through_qos     ? (((uint64_t)effective.thep_through_qos) << POLICY_EFF_THROUGH_QOS_SHIFT) : 0);
+
+	info->effective = (integer_t)bits;
+	bits = 0;
+
+	info->pending = 0; /* this routine always reports pending as zero */
+}
+
+/*
+ * Sneakily trace either the task and thread requested
+ * or just the thread requested, depending on if we have enough room.
+ * We do have room on LP64. On LP32, we have to split it between two uintptr_t's.
+ *
+ *                                LP32            LP64
+ * threquested_0(thread)          thread[0]       thread[0]
+ * threquested_1(thread)          thread[1]       task[0]
+ *
+ * (threquested_0 always returns the first word of the thread's requested
+ * policy; only threquested_1 differs between LP32 and LP64.  The
+ * theffective_* helpers follow the same layout for the effective policy.)
+ */
+
+uintptr_t
+threquested_0(thread_t thread)
+{
+	static_assert(sizeof(struct thread_requested_policy) == sizeof(uint64_t), "size invariant violated");
+
+	uintptr_t* raw = (uintptr_t*)(void*)&thread->requested_policy;
+
+	return raw[0];
+}
+
+uintptr_t
+threquested_1(thread_t thread)
+{
+#if defined __LP64__
+	return *(uintptr_t*)&thread->task->requested_policy; /* LP64: first word of the task's requested policy */
+#else
+	uintptr_t* raw = (uintptr_t*)(void*)&thread->requested_policy;
+	return raw[1];
+#endif
+}
+
+uintptr_t
+theffective_0(thread_t thread)
+{
+	static_assert(sizeof(struct thread_effective_policy) == sizeof(uint64_t), "size invariant violated");
+
+	uintptr_t* raw = (uintptr_t*)(void*)&thread->effective_policy;
+	return raw[0];
+}
+
+uintptr_t
+theffective_1(thread_t thread)
+{
+#if defined __LP64__
+	return *(uintptr_t*)&thread->task->effective_policy; /* LP64: first word of the task's effective policy */
+#else
+	uintptr_t* raw = (uintptr_t*)(void*)&thread->effective_policy;
+	return raw[1];
+#endif
+}
+
+
+/*
+ * Set an override on the thread which is consulted with a
+ * higher priority than the task/thread policy. This should
+ * only be set for temporary grants until the thread
+ * returns to the userspace boundary
+ *
+ * We use atomic operations to swap in the override, with
+ * the assumption that the thread itself can
+ * read the override and clear it on return to userspace.
+ *
+ * No locking is performed, since it is acceptable to see
+ * a stale override for one loop through throttle_lowpri_io().
+ * However a thread reference must be held on the thread.
+ */
+
+void set_thread_iotier_override(thread_t thread, int policy)
+{
+	int current_override;
+
+	/*
+	 * Let most aggressive I/O policy win until user boundary:
+	 * when an override is already present, the numerically smaller of it
+	 * and the requested policy is the value that gets installed.
+	 */
+	do {
+		current_override = thread->iotier_override;
+
+		if (current_override != THROTTLE_LEVEL_NONE)
+			policy = MIN(current_override, policy);
+
+		if (current_override == policy) {
+			/* no effective change */
+			return; /* returns without calling rethrottle_thread() */
+		}
+	} while (!OSCompareAndSwap(current_override, policy, &thread->iotier_override)); /* re-read and retry until the swap succeeds */
+
+	/*
+	 * Since the thread may be currently throttled,
+	 * re-evaluate tiers and potentially break out
+	 * of an msleep
+	 */
+	rethrottle_thread(thread->uthread);
+}
+
+/*
+ * Userspace synchronization routines (like pthread mutexes, pthread reader-writer locks,
+ * semaphores, dispatch_sync) may result in priority inversions where a higher priority
+ * (i.e. scheduler priority, I/O tier, QoS tier) is waiting on a resource owned by a lower
+ * priority thread. In these cases, we attempt to propagate the priority token, as long
+ * as the subsystem informs us of the relationships between the threads. The userspace
+ * synchronization subsystem should maintain the information of owner->resource and
+ * resource->waiters itself.
+ */
+
+/*
+ * This helper canonicalizes the resource/resource_type given the current qos_override_mode
+ * in effect. Note that wildcards (THREAD_QOS_OVERRIDE_RESOURCE_WILDCARD) may need
+ * to be handled specially in the future, but for now it's fine to slam
+ * *resource to USER_ADDR_NULL even if it was previously a wildcard.
+ */
+static void canonicalize_resource_and_type(user_addr_t *resource, int *resource_type) { /* both arguments are in/out and are rewritten in place */
+	if (qos_override_mode == QOS_OVERRIDE_MODE_OVERHANG_PEAK || qos_override_mode == QOS_OVERRIDE_MODE_IGNORE_OVERRIDE) {
+		/* Map all input resource/type to a single one (both the resource and the type are overwritten) */
+		*resource = USER_ADDR_NULL;
+		*resource_type = THREAD_QOS_OVERRIDE_TYPE_UNKNOWN;
+	} else if (qos_override_mode == QOS_OVERRIDE_MODE_FINE_GRAINED_OVERRIDE) {
+		/* no transform */
+	} else if (qos_override_mode == QOS_OVERRIDE_MODE_FINE_GRAINED_OVERRIDE_BUT_IGNORE_DISPATCH) {
+		/* Map all dispatch overrides to a single one, to avoid memory overhead (only *resource is cleared; the type is kept) */
+		if (*resource_type == THREAD_QOS_OVERRIDE_TYPE_DISPATCH_ASYNCHRONOUS_OVERRIDE) {
+			*resource = USER_ADDR_NULL;
+		}
+	} else if (qos_override_mode == QOS_OVERRIDE_MODE_FINE_GRAINED_OVERRIDE_BUT_SINGLE_MUTEX_OVERRIDE) {
+		/* Map all mutex overrides to a single one, to avoid memory overhead (only *resource is cleared; the type is kept) */
+		if (*resource_type == THREAD_QOS_OVERRIDE_TYPE_PTHREAD_MUTEX) {
+			*resource = USER_ADDR_NULL;
+		}
+	} /* any other qos_override_mode value leaves both arguments untouched */
+}
+
+/* This helper routine finds an existing override if known.
Locking should be done by caller */ +static struct thread_qos_override * +find_qos_override(thread_t thread, + user_addr_t resource, + int resource_type) +{ + struct thread_qos_override *override; + + override = thread->overrides; + while (override) { + if (override->override_resource == resource && + override->override_resource_type == resource_type) { + return override; + } + + override = override->override_next; + } + + return NULL; +} + +static void +find_and_decrement_qos_override(thread_t thread, + user_addr_t resource, + int resource_type, + boolean_t reset, + struct thread_qos_override **free_override_list) +{ + struct thread_qos_override *override, *override_prev; + + override_prev = NULL; + override = thread->overrides; + while (override) { + struct thread_qos_override *override_next = override->override_next; + + if ((THREAD_QOS_OVERRIDE_RESOURCE_WILDCARD == resource || override->override_resource == resource) && + (THREAD_QOS_OVERRIDE_TYPE_WILDCARD == resource_type || override->override_resource_type == resource_type)) { + + if (reset) { + override->override_contended_resource_count = 0; + } else { + override->override_contended_resource_count--; + } + + if (override->override_contended_resource_count == 0) { + if (override_prev == NULL) { + thread->overrides = override_next; + } else { + override_prev->override_next = override_next; + } + + /* Add to out-param for later zfree */ + override->override_next = *free_override_list; + *free_override_list = override; + } else { + override_prev = override; + } + + if (THREAD_QOS_OVERRIDE_RESOURCE_WILDCARD != resource) { + return; + } + } else { + override_prev = override; + } + + override = override_next; + } +} + +/* This helper recalculates the current requested override using the policy selected at boot */ +static int +calculate_requested_qos_override(thread_t thread) +{ + if (qos_override_mode == QOS_OVERRIDE_MODE_IGNORE_OVERRIDE) { + return THREAD_QOS_UNSPECIFIED; + } + + /* iterate over all overrides and 
calculate MAX */ + struct thread_qos_override *override; + int qos_override = THREAD_QOS_UNSPECIFIED; + + override = thread->overrides; + while (override) { + if (qos_override_mode != QOS_OVERRIDE_MODE_FINE_GRAINED_OVERRIDE_BUT_IGNORE_DISPATCH || + override->override_resource_type != THREAD_QOS_OVERRIDE_TYPE_DISPATCH_ASYNCHRONOUS_OVERRIDE) { + qos_override = MAX(qos_override, override->override_qos); + } + + override = override->override_next; + } + + return qos_override; +} + +/* + * Returns: + * - 0 on success + * - EINVAL if some invalid input was passed + * - EFAULT if user_lock_addr != NULL and needs to be faulted (userland has to + * fault and retry) + * - ESTALE if user_lock_addr != NULL && + * ulock_owner_value_to_port_name(*user_lock_addr) != user_lock_owner + */ +static int +proc_thread_qos_add_override_internal(thread_t thread, + int override_qos, + boolean_t first_override_for_resource, + user_addr_t resource, + int resource_type, + user_addr_t user_lock_addr, + mach_port_name_t user_lock_owner) +{ + struct task_pend_token pend_token = {}; + int rc = 0; + + thread_mtx_lock(thread); + + KERNEL_DEBUG_CONSTANT((IMPORTANCE_CODE(IMP_USYNCH_QOS_OVERRIDE, IMP_USYNCH_ADD_OVERRIDE)) | DBG_FUNC_START, + thread_tid(thread), override_qos, first_override_for_resource ? 1 : 0, 0, 0); + + DTRACE_BOOST5(qos_add_override_pre, uint64_t, thread_tid(thread), + uint64_t, thread->requested_policy.thrp_qos, + uint64_t, thread->effective_policy.thep_qos, + int, override_qos, boolean_t, first_override_for_resource); + + struct thread_qos_override *override; + struct thread_qos_override *override_new = NULL; + int new_qos_override, prev_qos_override; + int new_effective_qos; + + canonicalize_resource_and_type(&resource, &resource_type); + + override = find_qos_override(thread, resource, resource_type); + if (first_override_for_resource && !override) { + /* We need to allocate a new object. 
Drop the thread lock and + * recheck afterwards in case someone else added the override + */ + thread_mtx_unlock(thread); + override_new = zalloc(thread_qos_override_zone); + thread_mtx_lock(thread); + override = find_qos_override(thread, resource, resource_type); + } + if (user_lock_addr) { + uint64_t val; + /* Workaround lack of explicit support for 'no-fault copyin' + * , as disabling preemption prevents paging in + */ + disable_preemption(); + rc = copyin_word(user_lock_addr, &val, sizeof(user_lock_owner)); + enable_preemption(); + if (rc == 0 && ulock_owner_value_to_port_name((uint32_t)val) != user_lock_owner) { + rc = ESTALE; + } + if (rc) { + prev_qos_override = proc_get_thread_policy_locked(thread, + TASK_POLICY_ATTRIBUTE, TASK_POLICY_QOS_OVERRIDE, NULL); + new_qos_override = prev_qos_override; + new_effective_qos = proc_get_effective_thread_policy(thread, TASK_POLICY_QOS); + thread_mtx_unlock(thread); + goto out; + } + } + if (first_override_for_resource && override) { + /* Someone else already allocated while the thread lock was dropped */ + override->override_contended_resource_count++; + } else if (!override && override_new) { + override = override_new; + override_new = NULL; + override->override_next = thread->overrides; + /* since first_override_for_resource was TRUE */ + override->override_contended_resource_count = 1; + override->override_resource = resource; + override->override_resource_type = resource_type; + override->override_qos = THREAD_QOS_UNSPECIFIED; + thread->overrides = override; + } + + if (override) { + if (override->override_qos == THREAD_QOS_UNSPECIFIED) + override->override_qos = override_qos; + else + override->override_qos = MAX(override->override_qos, override_qos); + } + + /* Determine how to combine the various overrides into a single current + * requested override + */ + new_qos_override = calculate_requested_qos_override(thread); + + prev_qos_override = proc_get_thread_policy_locked(thread, + TASK_POLICY_ATTRIBUTE, 
TASK_POLICY_QOS_OVERRIDE, NULL); + + if (new_qos_override != prev_qos_override) { + proc_set_thread_policy_locked(thread, TASK_POLICY_ATTRIBUTE, + TASK_POLICY_QOS_OVERRIDE, + new_qos_override, 0, &pend_token); + } + + new_effective_qos = proc_get_effective_thread_policy(thread, TASK_POLICY_QOS); + + thread_mtx_unlock(thread); + + thread_policy_update_complete_unlocked(thread, &pend_token); + +out: + if (override_new) { + zfree(thread_qos_override_zone, override_new); + } + + DTRACE_BOOST4(qos_add_override_post, int, prev_qos_override, + int, new_qos_override, int, new_effective_qos, int, rc); + + KERNEL_DEBUG_CONSTANT((IMPORTANCE_CODE(IMP_USYNCH_QOS_OVERRIDE, IMP_USYNCH_ADD_OVERRIDE)) | DBG_FUNC_END, + new_qos_override, resource, resource_type, 0, 0); + + return rc; +} + +int +proc_thread_qos_add_override_check_owner(thread_t thread, + int override_qos, + boolean_t first_override_for_resource, + user_addr_t resource, + int resource_type, + user_addr_t user_lock_addr, + mach_port_name_t user_lock_owner) +{ + return proc_thread_qos_add_override_internal(thread, override_qos, + first_override_for_resource, resource, resource_type, + user_lock_addr, user_lock_owner); +} + +boolean_t +proc_thread_qos_add_override(task_t task, + thread_t thread, + uint64_t tid, + int override_qos, + boolean_t first_override_for_resource, + user_addr_t resource, + int resource_type) +{ + boolean_t has_thread_reference = FALSE; + int rc = 0; + + if (thread == THREAD_NULL) { + thread = task_findtid(task, tid); + /* returns referenced thread */ + + if (thread == THREAD_NULL) { + KERNEL_DEBUG_CONSTANT((IMPORTANCE_CODE(IMP_USYNCH_QOS_OVERRIDE, IMP_USYNCH_ADD_OVERRIDE)) | DBG_FUNC_NONE, + tid, 0, 0xdead, 0, 0); + return FALSE; + } + has_thread_reference = TRUE; + } else { + assert(thread->task == task); + } + rc = proc_thread_qos_add_override_internal(thread, override_qos, + first_override_for_resource, resource, resource_type, 0, 0); + if (has_thread_reference) { + thread_deallocate(thread); + 
} + + return rc == 0; +} + +static int +proc_thread_qos_remove_override_internal(thread_t thread, + user_addr_t resource, + int resource_type, + boolean_t reset, + boolean_t squash) +{ + struct task_pend_token pend_token = {}; + + struct thread_qos_override *deferred_free_override_list = NULL; + int new_qos_override, prev_qos_override, new_effective_qos, prev_qos; + int new_qos = THREAD_QOS_UNSPECIFIED; + + thread_mtx_lock(thread); + + canonicalize_resource_and_type(&resource, &resource_type); + + find_and_decrement_qos_override(thread, resource, resource_type, reset, &deferred_free_override_list); + + KERNEL_DEBUG_CONSTANT((IMPORTANCE_CODE(IMP_USYNCH_QOS_OVERRIDE, IMP_USYNCH_REMOVE_OVERRIDE)) | DBG_FUNC_START, + thread_tid(thread), resource, reset, 0, 0); + + DTRACE_BOOST3(qos_remove_override_pre, uint64_t, thread_tid(thread), + uint64_t, thread->requested_policy.thrp_qos, + uint64_t, thread->effective_policy.thep_qos); + + /* Determine how to combine the various overrides into a single current requested override */ + new_qos_override = calculate_requested_qos_override(thread); + + spl_t s = splsched(); + thread_lock(thread); + + /* + * The override chain and therefore the value of the current override is locked with thread mutex, + * so we can do a get/set without races. However, the rest of thread policy is locked under the spinlock. + * This means you can't change the current override from a spinlock-only setter. + */ + prev_qos_override = thread_get_requested_policy_spinlocked(thread, TASK_POLICY_ATTRIBUTE, TASK_POLICY_QOS_OVERRIDE, NULL); + + if (squash) { + /* + * Remove the specified overrides, and set the current override as the new base QoS. + * Return the new QoS value. 
+ */ + prev_qos = thread_get_requested_policy_spinlocked(thread, TASK_POLICY_ATTRIBUTE, TASK_POLICY_QOS, NULL); + + new_qos = MAX(prev_qos, prev_qos_override); + if (new_qos != prev_qos) + proc_set_thread_policy_spinlocked(thread, TASK_POLICY_ATTRIBUTE, TASK_POLICY_QOS, new_qos, 0, &pend_token); + } + + if (new_qos_override != prev_qos_override) + proc_set_thread_policy_spinlocked(thread, TASK_POLICY_ATTRIBUTE, TASK_POLICY_QOS_OVERRIDE, new_qos_override, 0, &pend_token); + + new_effective_qos = proc_get_effective_thread_policy(thread, TASK_POLICY_QOS); + + thread_unlock(thread); + splx(s); + + thread_mtx_unlock(thread); + + thread_policy_update_complete_unlocked(thread, &pend_token); + + while (deferred_free_override_list) { + struct thread_qos_override *override_next = deferred_free_override_list->override_next; + + zfree(thread_qos_override_zone, deferred_free_override_list); + deferred_free_override_list = override_next; + } + + DTRACE_BOOST3(qos_remove_override_post, int, prev_qos_override, + int, new_qos_override, int, new_effective_qos); + + KERNEL_DEBUG_CONSTANT((IMPORTANCE_CODE(IMP_USYNCH_QOS_OVERRIDE, IMP_USYNCH_REMOVE_OVERRIDE)) | DBG_FUNC_END, + thread_tid(thread), squash, 0, 0, 0); + + return new_qos; +} + +boolean_t +proc_thread_qos_remove_override(task_t task, + thread_t thread, + uint64_t tid, + user_addr_t resource, + int resource_type) +{ + boolean_t has_thread_reference = FALSE; + + if (thread == THREAD_NULL) { + thread = task_findtid(task, tid); + /* returns referenced thread */ + + if (thread == THREAD_NULL) { + KERNEL_DEBUG_CONSTANT((IMPORTANCE_CODE(IMP_USYNCH_QOS_OVERRIDE, IMP_USYNCH_REMOVE_OVERRIDE)) | DBG_FUNC_NONE, + tid, 0, 0xdead, 0, 0); + return FALSE; + } + has_thread_reference = TRUE; + } else { + assert(task == thread->task); + } + + proc_thread_qos_remove_override_internal(thread, resource, resource_type, FALSE, FALSE); + + if (has_thread_reference) + thread_deallocate(thread); + + return TRUE; +} + +boolean_t 
+proc_thread_qos_reset_override(task_t task, + thread_t thread, + uint64_t tid, + user_addr_t resource, + int resource_type) + +{ + boolean_t has_thread_reference = FALSE; + + if (thread == THREAD_NULL) { + thread = task_findtid(task, tid); + /* returns referenced thread */ + + if (thread == THREAD_NULL) { + KERNEL_DEBUG_CONSTANT((IMPORTANCE_CODE(IMP_USYNCH_QOS_OVERRIDE, IMP_USYNCH_REMOVE_OVERRIDE)) | DBG_FUNC_NONE, + tid, 0, 0xdead, 0, 0); + return FALSE; + } + has_thread_reference = TRUE; + } else { + assert(task == thread->task); + } + + proc_thread_qos_remove_override_internal(thread, resource, resource_type, TRUE, FALSE); + + if (has_thread_reference) + thread_deallocate(thread); + + return TRUE; +} + +/* + * Clears the requested overrides, and replaces the current QoS with the max + * of the current QoS and the current override, then returns the new QoS. + * + * This is useful in order to reset overrides before parking a workqueue thread, + * but avoid dropping priority and getting preempted right before parking. + * + * Called without any locks held. + */ +int +proc_thread_qos_squash_override(thread_t thread, user_addr_t resource, int resource_type) +{ + return proc_thread_qos_remove_override_internal(thread, resource, resource_type, TRUE, TRUE); +} + +/* Deallocate before thread termination */ +void proc_thread_qos_deallocate(thread_t thread) +{ + /* + * There are no more references to this thread, + * therefore this thread must not own any more locks, + * therefore there must not be any more user promotions. + */ + assert(thread->user_promotions == 0); + assert(thread->requested_policy.thrp_qos_promote == THREAD_QOS_UNSPECIFIED); + assert(thread->user_promotion_basepri == 0); + + /* This thread must have no more IPC overrides. */ + assert(thread->ipc_overrides == 0); + assert(thread->requested_policy.thrp_qos_ipc_override == THREAD_QOS_UNSPECIFIED); + + /* + * Clear out any lingering override objects. 
+ */ + struct thread_qos_override *override; + + thread_mtx_lock(thread); + override = thread->overrides; + thread->overrides = NULL; + thread->requested_policy.thrp_qos_override = THREAD_QOS_UNSPECIFIED; + /* We don't need to re-evaluate thread policy here because the thread has already exited */ + thread_mtx_unlock(thread); + + while (override) { + struct thread_qos_override *override_next = override->override_next; + + zfree(thread_qos_override_zone, override); + override = override_next; + } +} + +/* + * Set up the primordial thread's QoS + */ +void +task_set_main_thread_qos(task_t task, thread_t thread) { + struct task_pend_token pend_token = {}; + + assert(thread->task == task); + + thread_mtx_lock(thread); + + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + (IMPORTANCE_CODE(IMP_MAIN_THREAD_QOS, 0)) | DBG_FUNC_START, + thread_tid(thread), threquested_0(thread), threquested_1(thread), + thread->requested_policy.thrp_qos, 0); + + int primordial_qos = task_compute_main_thread_qos(task); + + proc_set_thread_policy_locked(thread, TASK_POLICY_ATTRIBUTE, TASK_POLICY_QOS, + primordial_qos, 0, &pend_token); + + thread_mtx_unlock(thread); + + thread_policy_update_complete_unlocked(thread, &pend_token); + + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + (IMPORTANCE_CODE(IMP_MAIN_THREAD_QOS, 0)) | DBG_FUNC_END, + thread_tid(thread), threquested_0(thread), threquested_1(thread), + primordial_qos, 0); +} + +/* + * KPI for pthread kext + * + * Return a good guess at what the initial manager QoS will be + * Dispatch can override this in userspace if it so chooses + */ +int +task_get_default_manager_qos(task_t task) +{ + int primordial_qos = task_compute_main_thread_qos(task); + + if (primordial_qos == THREAD_QOS_LEGACY) + primordial_qos = THREAD_QOS_USER_INITIATED; + + return primordial_qos; +} + + +/* + * Promote thread with the user level properties of 'promoter' + * Mutexes may be held, but it's OK to take the throttle lock + * + * if 'new_promotion' is TRUE, this is a new 
promotion. + * if FALSE, we are updating an existing promotion. + */ +static void +thread_user_promotion_promote(thread_t thread, + thread_t promoter, + struct promote_token* promote_token, + boolean_t new_promotion) +{ + struct task_pend_token pend_token = {}; + + uint32_t promoter_base_pri = 0, promoter_qos = THREAD_QOS_UNSPECIFIED; + + spl_t s = splsched(); + thread_lock(promoter); + + /* + * We capture the 'promotion qos' here, which is captured + * before task-level clamping. + * + * This means that if the process gets unclamped while a promotion, + * is in effect, the owning thread ends up with the correct QoS. + * + * This does NOT work correctly across processes, as the correct QoS + * in one is not necessarily the correct QoS in another. + * When we add support for multi-process ulock boosting, we need to + * do something more complex. + */ + promoter_qos = promoter->effective_policy.thep_qos_promote; + + /* TODO: extract 'effective unclamped base pri' instead */ + promoter_base_pri = promoter->base_pri; + + thread_unlock(promoter); + splx(s); + + /* clamp out realtime to max user pri */ + promoter_base_pri = MIN(promoter_base_pri, MAXPRI_USER); + + /* add in the saved promotion token */ + assert(promote_token->pt_basepri <= MAXPRI_USER); + + promoter_base_pri = MAX(promoter_base_pri, promote_token->pt_basepri); + promoter_qos = MAX(promoter_qos, promote_token->pt_qos); + + /* save the max for later */ + promote_token->pt_basepri = promoter_base_pri; + promote_token->pt_qos = promoter_qos; + + s = splsched(); + thread_lock(thread); + + if (new_promotion) { + if (thread->user_promotions == 0) { + assert(thread->requested_policy.thrp_qos_promote == THREAD_QOS_UNSPECIFIED); + assert(thread->user_promotion_basepri == 0); + } + + thread->user_promotions++; + } else { + assert(thread->user_promotions > 0); + } + + uint32_t thread_qos = thread->requested_policy.thrp_qos_promote; + uint32_t thread_basepri = thread->user_promotion_basepri; + + uint32_t new_qos = 
MAX(thread_qos, promoter_qos); + uint32_t new_basepri = MAX(thread_basepri, promoter_base_pri); + + /* TODO: Fast path the 'new is lower than effective' case to avoid full reevaluation */ + if (thread_qos != new_qos || thread_basepri != new_basepri) { + + thread->user_promotion_basepri = new_basepri; + + pend_token.tpt_force_recompute_pri = 1; + + proc_set_thread_policy_spinlocked(thread, TASK_POLICY_ATTRIBUTE, + TASK_POLICY_QOS_PROMOTE, new_qos, + 0, &pend_token); + } + + thread_unlock(thread); + splx(s); + + thread_policy_update_complete_unlocked(thread, &pend_token); +} + +/* Add a user promotion to thread */ +void +thread_user_promotion_add(thread_t thread, + thread_t promoter, + struct promote_token* promote_token) +{ + thread_user_promotion_promote(thread, promoter, promote_token, TRUE); +} + +/* Update an existing user promotion on thread */ +void +thread_user_promotion_update(thread_t thread, + thread_t promoter, + struct promote_token* promote_token) +{ + thread_user_promotion_promote(thread, promoter, promote_token, FALSE); +} + +/* + * Drop a user promotion on thread + * Mutexes may be held, but it's OK to take the throttle lock + */ +void +thread_user_promotion_drop(thread_t thread) +{ + struct task_pend_token pend_token = {}; + + spl_t s = splsched(); + thread_lock(thread); + + assert(thread->user_promotions > 0); + + if (--thread->user_promotions == 0) { + thread->requested_policy.thrp_qos_promote = THREAD_QOS_UNSPECIFIED; + thread->user_promotion_basepri = 0; + + pend_token.tpt_force_recompute_pri = 1; + + proc_set_thread_policy_spinlocked(thread, TASK_POLICY_ATTRIBUTE, + TASK_POLICY_QOS_PROMOTE, THREAD_QOS_UNSPECIFIED, + 0, &pend_token); + } + + thread_unlock(thread); + splx(s); + + thread_policy_update_complete_unlocked(thread, &pend_token); +} + + +/* + * Set the thread's QoS IPC override + * Owned by the IPC subsystem + * + * May be called with spinlocks held, but not spinlocks + * that may deadlock against the thread lock, the throttle lock, or 
the SFI lock. + * + * One 'add' must be balanced by one 'drop'. + * Between 'add' and 'drop', the override QoS value may be updated with an 'update'. + * Before the thread is deallocated, there must be 0 remaining overrides. + */ +static void +thread_ipc_override(thread_t thread, + uint32_t qos_override, + boolean_t is_new_override) +{ + struct task_pend_token pend_token = {}; + + spl_t s = splsched(); + thread_lock(thread); + + uint32_t old_override = thread->requested_policy.thrp_qos_ipc_override; + + if (is_new_override) { + if (thread->ipc_overrides++ == 0) { + /* This add is the first override for this thread */ + assert(old_override == THREAD_QOS_UNSPECIFIED); + } else { + /* There are already other overrides in effect for this thread */ + assert(old_override > THREAD_QOS_UNSPECIFIED); + } + } else { + /* There must be at least one override (the previous add call) in effect */ + assert(thread->ipc_overrides > 0); + assert(old_override > THREAD_QOS_UNSPECIFIED); + } + + uint32_t new_override = MAX(old_override, qos_override); + + proc_set_thread_policy_spinlocked(thread, TASK_POLICY_ATTRIBUTE, + TASK_POLICY_QOS_IPC_OVERRIDE, + new_override, 0, &pend_token); + + assert(pend_token.tpt_update_sockets == 0); + + thread_unlock(thread); + splx(s); + + /* + * this is only safe after rethrottle_thread supports + * being called from spinlock context + */ + thread_policy_update_complete_unlocked(thread, &pend_token); +} + +void +thread_add_ipc_override(thread_t thread, + uint32_t qos_override) +{ + thread_ipc_override(thread, qos_override, TRUE); +} + +void +thread_update_ipc_override(thread_t thread, + uint32_t qos_override) +{ + thread_ipc_override(thread, qos_override, FALSE); +} + +void +thread_drop_ipc_override(thread_t thread) +{ + struct task_pend_token pend_token = {}; + + spl_t s = splsched(); + thread_lock(thread); + + assert(thread->ipc_overrides > 0); + + if (--thread->ipc_overrides == 0) { + /* + * There are no more overrides for this thread, so we should + 
* clear out the saturated override value + */ + + proc_set_thread_policy_spinlocked(thread, TASK_POLICY_ATTRIBUTE, + TASK_POLICY_QOS_IPC_OVERRIDE, THREAD_QOS_UNSPECIFIED, + 0, &pend_token); + } + + thread_unlock(thread); + splx(s); + + /* + * this is only safe after rethrottle_thread supports + * being called from spinlock context + */ + thread_policy_update_complete_unlocked(thread, &pend_token); +} + +/* Get current IPC override, may be called from spinlock context */ +uint32_t +thread_get_ipc_override(thread_t thread) +{ + return proc_get_thread_policy_locked(thread, TASK_POLICY_ATTRIBUTE, TASK_POLICY_QOS_IPC_OVERRIDE, NULL); +} + diff --git a/osfmk/kern/timer_call.c b/osfmk/kern/timer_call.c index fead4d663..047d6951e 100644 --- a/osfmk/kern/timer_call.c +++ b/osfmk/kern/timer_call.c @@ -38,6 +38,7 @@ #include #include #include +#include #include @@ -563,6 +564,7 @@ timer_call_enter_internal( uint32_t urgency; uint64_t sdeadline, ttd; + assert(call->call_entry.func != NULL); s = splclock(); sdeadline = deadline; @@ -740,7 +742,7 @@ timer_queue_shutdown( s = splclock(); /* Note comma operator in while expression re-locking each iteration */ - while (timer_queue_lock_spin(queue), !queue_empty(&queue->head)) { + while ((void)timer_queue_lock_spin(queue), !queue_empty(&queue->head)) { call = TIMER_CALL(queue_first(&queue->head)); if (!simple_lock_try(&call->lock)) { diff --git a/osfmk/kern/waitq.c b/osfmk/kern/waitq.c index fad26e5ba..884f3f647 100644 --- a/osfmk/kern/waitq.c +++ b/osfmk/kern/waitq.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015 Apple Inc. All rights reserved. + * Copyright (c) 2015-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -54,7 +54,9 @@ * the rights to redistribute these changes. 
*/ #include +#include #include +#include #include #include #include @@ -62,12 +64,23 @@ #include #include #include +#include + #include #include #include #include +#if defined(CONFIG_WAITQ_LINK_STATS) || defined(CONFIG_WAITQ_PREPOST_STATS) +# if !defined(CONFIG_LTABLE_STATS) +# error "You must configure LTABLE_STATS to use WAITQ_[LINK|PREPOST]_STATS" +# endif +# if !defined(CONFIG_WAITQ_STATS) +# error "You must configure WAITQ_STATS to use WAITQ_[LINK|PREPOST]_STATS" +# endif +#endif + #if CONFIG_WAITQ_DEBUG #define wqdbg(fmt,...) \ printf("WQ[%s]: " fmt "\n", __func__, ## __VA_ARGS__) @@ -85,982 +98,60 @@ #define wqinfo(fmt,...) \ printf("WQ[%s]: " fmt "\n", __func__, ## __VA_ARGS__) -#define wqerr(fmt,...) \ - printf("WQ[%s] ERROR: " fmt "\n", __func__, ## __VA_ARGS__) - - -/* - * un-comment the following lines to debug the link/prepost tables - * NOTE: this expands each element by ~40 bytes - */ -//#define CONFIG_WAITQ_LINK_STATS -//#define CONFIG_WAITQ_PREPOST_STATS - -/* - * file-static functions / data - */ -static thread_t waitq_select_one_locked(struct waitq *waitq, event64_t event, - uint64_t *reserved_preposts, - int priority, spl_t *spl); - -static kern_return_t waitq_select_thread_locked(struct waitq *waitq, - event64_t event, - thread_t thread, spl_t *spl); - -#define WAITQ_SET_MAX (task_max * 3) -static zone_t waitq_set_zone; - - -#define P2ROUNDUP(x, align) (-(-((uint32_t)(x)) & -(align))) -#define ROUNDDOWN(x,y) (((x)/(y))*(y)) - - -#ifdef CONFIG_WAITQ_STATS -static __inline__ void waitq_grab_backtrace(uintptr_t bt[NWAITQ_BTFRAMES], int skip); -#endif - - -/* ---------------------------------------------------------------------- - * - * Wait Queue Link/Prepost Table Implementation - * - * ---------------------------------------------------------------------- */ -#define DEFAULT_MIN_FREE_TABLE_ELEM 100 -static uint32_t g_min_free_table_elem; -static uint32_t g_min_free_cache; - -static vm_size_t g_wqt_max_tbl_size; -static lck_grp_t g_wqt_lck_grp; - 
-/* 1 prepost table, 1 setid link table */ -#define NUM_WQ_TABLES 2 - -/* default VA space for waitq tables (zone allocated) */ -#define DEFAULT_MAX_TABLE_SIZE P2ROUNDUP(8 * 1024 * 1024, PAGE_SIZE) - -struct wq_id { - union { - uint64_t id; - struct { - /* - * this bitfied is OK because we don't need to - * enforce a particular memory layout - */ - uint64_t idx:18, /* allows indexing up to 8MB of 32byte link objects */ - generation:46; - }; - }; -}; - -enum wqt_elem_type { - WQT_FREE = 0, - WQT_ELEM = 1, - WQT_LINK = 2, - WQT_RESERVED = 3, -}; - -struct wqt_elem { - uint32_t wqt_bits; - - uint32_t wqt_next_idx; - - struct wq_id wqt_id; -}; - -/* this _must_ match the idx bitfield definition in struct wq_id */ -#define WQT_IDX_MAX (0x3ffff) -#if defined(DEVELOPMENT) || defined(DEBUG) -/* global for lldb macros */ -uint64_t g_wqt_idx_max = WQT_IDX_MAX; -#endif - -/* reference count bits should _always_ be the low-order bits */ -#define WQT_BITS_REFCNT_MASK (0x1FFFFFFF) -#define WQT_BITS_REFCNT_SHIFT (0) -#define WQT_BITS_REFCNT (WQT_BITS_REFCNT_MASK << WQT_BITS_REFCNT_SHIFT) - -#define WQT_BITS_TYPE_MASK (0x3) -#define WQT_BITS_TYPE_SHIFT (29) -#define WQT_BITS_TYPE (WQT_BITS_TYPE_MASK << WQT_BITS_TYPE_SHIFT) - -#define WQT_BITS_VALID_MASK (0x1) -#define WQT_BITS_VALID_SHIFT (31) -#define WQT_BITS_VALID (WQT_BITS_VALID_MASK << WQT_BITS_VALID_SHIFT) - -#define wqt_bits_refcnt(bits) \ - (((bits) >> WQT_BITS_REFCNT_SHIFT) & WQT_BITS_REFCNT_MASK) - -#define wqt_bits_type(bits) \ - (((bits) >> WQT_BITS_TYPE_SHIFT) & WQT_BITS_TYPE_MASK) - -#define wqt_bits_valid(bits) \ - ((bits) & WQT_BITS_VALID) - -struct wq_table; -typedef void (*wq_table_poison_func)(struct wq_table *, struct wqt_elem *); - -/* - * A table is a container for slabs of elements. Each slab is 'slab_sz' bytes - * and contains 'slab_sz/elem_sz' elements (of 'elem_sz' bytes each). These - * slabs allow the table to be broken up into potentially dis-contiguous VA - * space. 
On 32-bit platforms with large amounts of physical RAM, this is - * quite important. Keeping slabs like this slightly complicates retrieval of - * table elements, but not by much. - */ -struct wq_table { - struct wqt_elem **table; /* an array of 'slabs' of elements */ - struct wqt_elem **next_free_slab; - struct wq_id free_list __attribute__((aligned(8))); - - uint32_t nelem; - uint32_t used_elem; - uint32_t elem_sz; /* size of a table element (bytes) */ - - uint32_t slab_sz; /* size of a table 'slab' object (bytes) */ - uint32_t slab_shift; - uint32_t slab_msk; - uint32_t slab_elem; - zone_t slab_zone; - - wq_table_poison_func poison; - - lck_mtx_t lock; - uint32_t state; - -#if CONFIG_WAITQ_STATS - uint32_t nslabs; - - uint64_t nallocs; - uint64_t nreallocs; - uint64_t npreposts; - int64_t nreservations; - uint64_t nreserved_releases; - uint64_t nspins; - - uint64_t max_used; - uint64_t avg_used; - uint64_t max_reservations; - uint64_t avg_reservations; -#endif -} __attribute__((aligned(8))); - -#define wqt_elem_ofst_slab(slab, slab_msk, ofst) \ - /* cast through 'void *' to avoid compiler alignment warning messages */ \ - ((struct wqt_elem *)((void *)((uintptr_t)(slab) + ((ofst) & (slab_msk))))) - -#if defined(CONFIG_WAITQ_LINK_STATS) || defined(CONFIG_WAITQ_PREPOST_STATS) -/* version that makes no assumption on waste within a slab */ -static inline struct wqt_elem * -wqt_elem_idx(struct wq_table *table, uint32_t idx) -{ - int slab_idx = idx / table->slab_elem; - struct wqt_elem *slab = table->table[slab_idx]; - if (!slab) - panic("Invalid index:%d slab:%d (NULL) for table:%p\n", - idx, slab_idx, table); - assert(slab->wqt_id.idx <= idx && (slab->wqt_id.idx + table->slab_elem) > idx); - return wqt_elem_ofst_slab(slab, table->slab_msk, (idx - slab->wqt_id.idx) * table->elem_sz); -} -#else /* !CONFIG_WAITQ_[LINK|PREPOST]_STATS */ -/* verion that assumes 100% ultilization of slabs (no waste) */ -static inline struct wqt_elem * -wqt_elem_idx(struct wq_table *table, 
uint32_t idx) -{ - uint32_t ofst = idx * table->elem_sz; - struct wqt_elem *slab = table->table[ofst >> table->slab_shift]; - if (!slab) - panic("Invalid index:%d slab:%d (NULL) for table:%p\n", - idx, (ofst >> table->slab_shift), table); - assert(slab->wqt_id.idx <= idx && (slab->wqt_id.idx + table->slab_elem) > idx); - return wqt_elem_ofst_slab(slab, table->slab_msk, ofst); -} -#endif /* !CONFIG_WAITQ_[LINK|PREPOST]_STATS */ - -static int __assert_only wqt_elem_in_range(struct wqt_elem *elem, - struct wq_table *table) -{ - struct wqt_elem **base = table->table; - uintptr_t e = (uintptr_t)elem; - assert(base != NULL); - while (*base != NULL) { - uintptr_t b = (uintptr_t)(*base); - if (e >= b && e < b + table->slab_sz) - return 1; - base++; - if ((uintptr_t)base >= (uintptr_t)table->table + PAGE_SIZE) - return 0; - } - return 0; -} - -static struct wqt_elem *wq_table_get_elem(struct wq_table *table, uint64_t id); -static void wq_table_put_elem(struct wq_table *table, struct wqt_elem *elem); -static int wqt_elem_list_link(struct wq_table *table, struct wqt_elem *parent, - struct wqt_elem *child); - -static void wqt_elem_invalidate(struct wqt_elem *elem) -{ - uint32_t __assert_only old = OSBitAndAtomic(~WQT_BITS_VALID, &elem->wqt_bits); - OSMemoryBarrier(); - assert(((wqt_bits_type(old) != WQT_RESERVED) && (old & WQT_BITS_VALID)) || - ((wqt_bits_type(old) == WQT_RESERVED) && !(old & WQT_BITS_VALID))); -} - -static void wqt_elem_mkvalid(struct wqt_elem *elem) -{ - uint32_t __assert_only old = OSBitOrAtomic(WQT_BITS_VALID, &elem->wqt_bits); - OSMemoryBarrier(); - assert(!(old & WQT_BITS_VALID)); -} - -static void wqt_elem_set_type(struct wqt_elem *elem, int type) -{ - uint32_t old_bits, new_bits; - do { - old_bits = elem->wqt_bits; - new_bits = (old_bits & ~WQT_BITS_TYPE) | - ((type & WQT_BITS_TYPE_MASK) << WQT_BITS_TYPE_SHIFT); - } while (OSCompareAndSwap(old_bits, new_bits, &elem->wqt_bits) == FALSE); - OSMemoryBarrier(); -} - - -static void wq_table_bootstrap(void) 
-{ - uint32_t tmp32 = 0; - - g_min_free_cache = 0; - g_min_free_table_elem = DEFAULT_MIN_FREE_TABLE_ELEM; - if (PE_parse_boot_argn("wqt_min_free", &tmp32, sizeof(tmp32)) == TRUE) - g_min_free_table_elem = tmp32; - wqdbg("Minimum free table elements: %d", tmp32); - - g_wqt_max_tbl_size = DEFAULT_MAX_TABLE_SIZE; - if (PE_parse_boot_argn("wqt_tbl_size", &tmp32, sizeof(tmp32)) == TRUE) - g_wqt_max_tbl_size = (vm_size_t)P2ROUNDUP(tmp32, PAGE_SIZE); - - lck_grp_init(&g_wqt_lck_grp, "waitq_table_locks", LCK_GRP_ATTR_NULL); -} - -static void wq_table_init(struct wq_table *table, const char *name, - uint32_t max_tbl_elem, uint32_t elem_sz, - wq_table_poison_func poison) -{ - kern_return_t kr; - uint32_t slab_sz, slab_shift, slab_msk, slab_elem; - zone_t slab_zone; - size_t max_tbl_sz; - struct wqt_elem *e, **base; - - /* - * First, allocate a single page of memory to act as the base - * for the table's element slabs - */ - kr = kernel_memory_allocate(kernel_map, (vm_offset_t *)&base, - PAGE_SIZE, 0, KMA_NOPAGEWAIT, VM_KERN_MEMORY_WAITQ); - if (kr != KERN_SUCCESS) - panic("Cannot initialize %s table: " - "kernel_memory_allocate failed:%d\n", name, kr); - memset(base, 0, PAGE_SIZE); - - /* - * Based on the maximum table size, calculate the slab size: - * we allocate 1 page of slab pointers for the table, and we need to - * index elements of 'elem_sz', this gives us the slab size based on - * the maximum size the table should grow. 
- */ - max_tbl_sz = (max_tbl_elem * elem_sz); - max_tbl_sz = P2ROUNDUP(max_tbl_sz, PAGE_SIZE); - - /* system maximum table size divided by number of slots in a page */ - slab_sz = (uint32_t)(max_tbl_sz / (PAGE_SIZE / (sizeof(void *)))); - if (slab_sz < PAGE_SIZE) - slab_sz = PAGE_SIZE; - - /* make sure the slab size is a power of two */ - slab_shift = 0; - slab_msk = ~0; - for (uint32_t i = 0; i < 31; i++) { - uint32_t bit = (1 << i); - if ((slab_sz & bit) == slab_sz) { - slab_shift = i; - slab_msk = 0; - for (uint32_t j = 0; j < i; j++) - slab_msk |= (1 << j); - break; - } - slab_sz &= ~bit; - } - slab_elem = slab_sz / elem_sz; - - /* initialize the table's slab zone (for table growth) */ - wqdbg("Initializing %s zone: slab:%d (%d,0x%x) max:%ld", - name, slab_sz, slab_shift, slab_msk, max_tbl_sz); - slab_zone = zinit(slab_sz, max_tbl_sz, slab_sz, name); - assert(slab_zone != ZONE_NULL); - - /* allocate the first slab and populate it */ - base[0] = (struct wqt_elem *)zalloc(slab_zone); - if (base[0] == NULL) - panic("Can't allocate a %s table slab from zone:%p", - name, slab_zone); - - memset(base[0], 0, slab_sz); - - /* setup the initial freelist */ - wqdbg("initializing %d links (%d bytes each)...", slab_elem, elem_sz); - for (unsigned l = 0; l < slab_elem; l++) { - e = wqt_elem_ofst_slab(base[0], slab_msk, l * elem_sz); - e->wqt_id.idx = l; - /* - * setting generation to 0 ensures that a setid of 0 is - * invalid because the generation will be incremented before - * each element's allocation. 
- */ - e->wqt_id.generation = 0; - e->wqt_next_idx = l + 1; - } - - /* make sure the last free element points to a never-valid idx */ - e = wqt_elem_ofst_slab(base[0], slab_msk, (slab_elem - 1) * elem_sz); - e->wqt_next_idx = WQT_IDX_MAX; - - lck_mtx_init(&table->lock, &g_wqt_lck_grp, LCK_ATTR_NULL); - - table->slab_sz = slab_sz; - table->slab_shift = slab_shift; - table->slab_msk = slab_msk; - table->slab_elem = slab_elem; - table->slab_zone = slab_zone; - - table->elem_sz = elem_sz; - table->nelem = slab_elem; - table->used_elem = 0; - table->elem_sz = elem_sz; - table->poison = poison; - - table->table = base; - table->next_free_slab = &base[1]; - table->free_list.id = base[0]->wqt_id.id; - -#if CONFIG_WAITQ_STATS - table->nslabs = 1; - table->nallocs = 0; - table->nreallocs = 0; - table->npreposts = 0; - table->nreservations = 0; - table->nreserved_releases = 0; - - table->max_used = 0; - table->avg_used = 0; - table->max_reservations = 0; - table->avg_reservations = 0; -#endif -} - -/** - * grow a waitq table by adding another 'slab' of table elements - * - * Conditions: - * table mutex is unlocked - * calling thread can block - */ -static void wq_table_grow(struct wq_table *table, uint32_t min_free) -{ - struct wqt_elem *slab, **slot; - struct wqt_elem *e = NULL, *first_new_elem, *last_new_elem; - struct wq_id free_id; - uint32_t free_elem; - - assert(get_preemption_level() == 0); - assert(table && table->slab_zone); - - lck_mtx_lock(&table->lock); - - free_elem = table->nelem - table->used_elem; - - /* - * If the caller just wanted to ensure a minimum number of elements, - * do that (and don't just blindly grow the table). Also, don't grow - * the table unnecessarily - we could have been beaten by a higher - * priority thread who acquired the lock and grew the table before we - * got here. 
- */ - if (free_elem > min_free) { - lck_mtx_unlock(&table->lock); - return; - } - - /* we are now committed to table growth */ - wqdbg_v("BEGIN"); - - if (table->next_free_slab == NULL) { - /* - * before we panic, check one more time to see if any other - * threads have free'd from space in the table. - */ - if ((table->nelem - table->used_elem) > 0) { - /* there's at least 1 free element: don't panic yet */ - lck_mtx_unlock(&table->lock); - return; - } - panic("No more room to grow table: %p (nelem: %d, used: %d)", - table, table->nelem, table->used_elem); - } - slot = table->next_free_slab; - table->next_free_slab++; - if ((uintptr_t)table->next_free_slab >= (uintptr_t)table->table + PAGE_SIZE) - table->next_free_slab = NULL; - - assert(*slot == NULL); - - /* allocate another slab */ - slab = (struct wqt_elem *)zalloc(table->slab_zone); - if (slab == NULL) - panic("Can't allocate a %s table (%p) slab from zone:%p", - table->slab_zone->zone_name, table, table->slab_zone); - - memset(slab, 0, table->slab_sz); - - /* put the new elements into a freelist */ - wqdbg_v(" init %d new links...", table->slab_elem); - for (unsigned l = 0; l < table->slab_elem; l++) { - uint32_t idx = l + table->nelem; - if (idx >= (WQT_IDX_MAX - 1)) - break; /* the last element of the last slab */ - e = wqt_elem_ofst_slab(slab, table->slab_msk, l * table->elem_sz); - e->wqt_id.idx = idx; - e->wqt_next_idx = idx + 1; - } - last_new_elem = e; - assert(last_new_elem != NULL); - - first_new_elem = wqt_elem_ofst_slab(slab, table->slab_msk, 0); - - /* update table book keeping, and atomically swap the freelist head */ - *slot = slab; - if (table->nelem + table->slab_elem >= WQT_IDX_MAX) - table->nelem = WQT_IDX_MAX - 1; - else - table->nelem += table->slab_elem; - -#if CONFIG_WAITQ_STATS - table->nslabs += 1; -#endif - - /* - * The atomic swap of the free list head marks the end of table - * growth. 
Incoming requests may now use the newly allocated slab - * of table elements - */ - free_id = table->free_list; - /* connect the existing free list to the end of the new free list */ - last_new_elem->wqt_next_idx = free_id.idx; - while (OSCompareAndSwap64(free_id.id, first_new_elem->wqt_id.id, - &table->free_list.id) == FALSE) { - OSMemoryBarrier(); - free_id = table->free_list; - last_new_elem->wqt_next_idx = free_id.idx; - } - OSMemoryBarrier(); - - lck_mtx_unlock(&table->lock); - - return; -} - -static __attribute__((noinline)) -struct wqt_elem *wq_table_alloc_elem(struct wq_table *table, int type, int nelem) -{ - int nspins = 0, ntries = 0, nalloc = 0; - uint32_t table_size; - struct wqt_elem *elem = NULL; - struct wq_id free_id, next_id; - - static const int max_retries = 500; - - if (type != WQT_ELEM && type != WQT_LINK && type != WQT_RESERVED) - panic("wq_table_aloc of invalid elem type:%d from table @%p", - type, table); - - assert(nelem > 0); - -try_again: - elem = NULL; - if (ntries++ > max_retries) { - struct wqt_elem *tmp; - if (table->used_elem + nelem >= table_size) - panic("No more room to grow table: 0x%p size:%d, used:%d, requested elem:%d", - table, table_size, table->used_elem, nelem); - if (nelem == 1) - panic("Too many alloc retries: %d, table:%p, type:%d, nelem:%d", - ntries, table, type, nelem); - /* don't panic: try allocating one-at-a-time */ - while (nelem > 0) { - tmp = wq_table_alloc_elem(table, type, 1); - if (elem) - wqt_elem_list_link(table, tmp, elem); - elem = tmp; - --nelem; - } - assert(elem != NULL); - return elem; - } - - nalloc = 0; - table_size = table->nelem; - - if (table->used_elem + nelem >= table_size) { - if (get_preemption_level() != 0) { -#if CONFIG_WAITQ_STATS - table->nspins += 1; -#endif - /* - * We may have just raced with table growth: check - * again to make sure there really isn't any space. 
- */ - if (++nspins > 4) - panic("Can't grow table %p with preemption" - " disabled!", table); - delay(1); - goto try_again; - } - wq_table_grow(table, nelem); - goto try_again; - } - - /* read this value only once before the CAS */ - free_id = table->free_list; - if (free_id.idx >= table_size) - goto try_again; - - /* - * Find the item on the free list which will become the new free list - * head, but be careful not to modify any memory (read only)! Other - * threads can alter table state at any time up until the CAS. We - * don't modify any memory until we've successfully swapped out the - * free list head with the one we've investigated. - */ - for (struct wqt_elem *next_elem = wqt_elem_idx(table, free_id.idx); - nalloc < nelem; - nalloc++) { - elem = next_elem; - next_id.generation = 0; - next_id.idx = next_elem->wqt_next_idx; - if (next_id.idx < table->nelem) { - next_elem = wqt_elem_idx(table, next_id.idx); - next_id.id = next_elem->wqt_id.id; - } else { - goto try_again; - } - } - /* 'elem' points to the last element being allocated */ - - if (OSCompareAndSwap64(free_id.id, next_id.id, - &table->free_list.id) == FALSE) - goto try_again; - - /* load barrier */ - OSMemoryBarrier(); - - /* - * After the CAS, we know that we own free_id, and it points to a - * valid table entry (checked above). Grab the table pointer and - * reset some values. - */ - OSAddAtomic(nelem, &table->used_elem); - - /* end the list of allocated elements */ - elem->wqt_next_idx = WQT_IDX_MAX; - /* reset 'elem' to point to the first allocated element */ - elem = wqt_elem_idx(table, free_id.idx); - - /* - * Update the generation count, and return the element(s) - * with a single reference (and no valid bit). If the - * caller immediately calls _put() on any element, then - * it will be released back to the free list. If the caller - * subsequently marks the element as valid, then the put - * will simply drop the reference. 
- */ - for (struct wqt_elem *tmp = elem; ; ) { - assert(!wqt_bits_valid(tmp->wqt_bits) && - (wqt_bits_refcnt(tmp->wqt_bits) == 0)); - --nalloc; - tmp->wqt_id.generation += 1; - tmp->wqt_bits = 1; - wqt_elem_set_type(tmp, type); - if (tmp->wqt_next_idx == WQT_IDX_MAX) - break; - assert(tmp->wqt_next_idx != WQT_IDX_MAX); - tmp = wqt_elem_idx(table, tmp->wqt_next_idx); - } - assert(nalloc == 0); - -#if CONFIG_WAITQ_STATS - uint64_t nreservations; - table->nallocs += nelem; - if (type == WQT_RESERVED) - OSIncrementAtomic64(&table->nreservations); - nreservations = table->nreservations; - if (table->used_elem > table->max_used) - table->max_used = table->used_elem; - if (nreservations > table->max_reservations) - table->max_reservations = nreservations; - table->avg_used = (table->avg_used + table->used_elem) / 2; - table->avg_reservations = (table->avg_reservations + nreservations) / 2; -#endif - - return elem; -} - -static void wq_table_realloc_elem(struct wq_table *table, struct wqt_elem *elem, int type) -{ - (void)table; - assert(wqt_elem_in_range(elem, table) && - !wqt_bits_valid(elem->wqt_bits)); - -#if CONFIG_WAITQ_STATS - table->nreallocs += 1; - if (wqt_bits_type(elem->wqt_bits) == WQT_RESERVED && type != WQT_RESERVED) { - /* - * This isn't under any lock, so we'll clamp it. - * the stats are meant to be informative, not perfectly - * accurate - */ - OSDecrementAtomic64(&table->nreservations); - } - table->avg_reservations = (table->avg_reservations + table->nreservations) / 2; -#endif - - /* - * Return the same element with a new generation count, and a - * (potentially) new type. Don't touch the refcount: the caller - * is responsible for getting that (and the valid bit) correct. 
- */ - elem->wqt_id.generation += 1; - elem->wqt_next_idx = WQT_IDX_MAX; - wqt_elem_set_type(elem, type); - - return; -} - -static void wq_table_free_elem(struct wq_table *table, struct wqt_elem *elem) -{ - struct wq_id next_id; - - assert(wqt_elem_in_range(elem, table) && - !wqt_bits_valid(elem->wqt_bits) && - (wqt_bits_refcnt(elem->wqt_bits) == 0)); - - OSDecrementAtomic(&table->used_elem); - -#if CONFIG_WAITQ_STATS - table->avg_used = (table->avg_used + table->used_elem) / 2; - if (wqt_bits_type(elem->wqt_bits) == WQT_RESERVED) - OSDecrementAtomic64(&table->nreservations); - table->avg_reservations = (table->avg_reservations + table->nreservations) / 2; -#endif - - elem->wqt_bits = 0; - - if (table->poison) - (table->poison)(table, elem); - -again: - next_id = table->free_list; - if (next_id.idx >= table->nelem) - elem->wqt_next_idx = WQT_IDX_MAX; - else - elem->wqt_next_idx = next_id.idx; - - /* store barrier */ - OSMemoryBarrier(); - if (OSCompareAndSwap64(next_id.id, elem->wqt_id.id, - &table->free_list.id) == FALSE) - goto again; -} - -/* get a reference to a table element identified by 'id' */ -static struct wqt_elem *wq_table_get_elem(struct wq_table *table, uint64_t id) -{ - struct wqt_elem *elem; - uint32_t idx, bits, new_bits; - - /* - * Here we have a reference to the table which is guaranteed to remain - * valid until we drop the reference - */ - - idx = ((struct wq_id *)&id)->idx; - - if (idx >= table->nelem) - panic("id:0x%llx : idx:%d > %d", id, idx, table->nelem); - - elem = wqt_elem_idx(table, idx); - - /* verify the validity by taking a reference on the table object */ - bits = elem->wqt_bits; - if (!wqt_bits_valid(bits)) - return NULL; - - /* - * do a pre-verify on the element ID to potentially - * avoid 2 compare-and-swaps - */ - if (elem->wqt_id.id != id) - return NULL; - - new_bits = bits + 1; - - /* check for overflow */ - assert(wqt_bits_refcnt(new_bits) > 0); - - while (OSCompareAndSwap(bits, new_bits, &elem->wqt_bits) == FALSE) { - /* - 
* either the element became invalid, - * or someone else grabbed/removed a reference. - */ - bits = elem->wqt_bits; - if (!wqt_bits_valid(bits)) { - /* don't return invalid elements */ - return NULL; - } - new_bits = bits + 1; - assert(wqt_bits_refcnt(new_bits) > 0); - } - - /* load barrier */ - OSMemoryBarrier(); - - /* check to see that our reference is to the same generation! */ - if (elem->wqt_id.id != id) { - /* - wqdbg("ID:0x%llx table generation (%d) != %d", - id, elem->wqt_id.generation, - ((struct wq_id *)&id)->generation); - */ - wq_table_put_elem(table, elem); - return NULL; - } - - /* We now have a reference on a valid object */ - return elem; -} - -/* release a ref to table element - puts it back on free list as appropriate */ -static void wq_table_put_elem(struct wq_table *table, struct wqt_elem *elem) -{ - uint32_t bits, new_bits; - - assert(wqt_elem_in_range(elem, table)); - - bits = elem->wqt_bits; - new_bits = bits - 1; - - /* check for underflow */ - assert(wqt_bits_refcnt(new_bits) < WQT_BITS_REFCNT_MASK); - - while (OSCompareAndSwap(bits, new_bits, &elem->wqt_bits) == FALSE) { - bits = elem->wqt_bits; - new_bits = bits - 1; - /* catch underflow */ - assert(wqt_bits_refcnt(new_bits) < WQT_BITS_REFCNT_MASK); - } - - /* load barrier */ - OSMemoryBarrier(); - - /* - * if this was the last reference, and it was marked as invalid, - * then we can add this link object back to the free list - */ - if (!wqt_bits_valid(new_bits) && (wqt_bits_refcnt(new_bits) == 0)) - wq_table_free_elem(table, elem); - - return; -} - - -/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - * - * API: wqt_elem_list_... - * - * Reuse the free list linkage member, 'wqt_next_idx' of a table element - * in a slightly more generic singly-linked list. All members of this - * list have been allocated from a table, but have not been made valid. 
- * - * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -*/ - -/* link parent->child */ -static int wqt_elem_list_link(struct wq_table *table, struct wqt_elem *parent, struct wqt_elem *child) -{ - int nelem = 1; - - assert(wqt_elem_in_range(parent, table)); - - /* find the end of the parent's list */ - while (parent->wqt_next_idx != WQT_IDX_MAX) { - assert(parent->wqt_next_idx < table->nelem); - parent = wqt_elem_idx(table, parent->wqt_next_idx); - nelem++; - } - - if (child) { - assert(wqt_elem_in_range(child, table)); - parent->wqt_next_idx = child->wqt_id.idx; - } - - return nelem; -} - -static struct wqt_elem *wqt_elem_list_next(struct wq_table *table, struct wqt_elem *head) -{ - struct wqt_elem *elem; - - if (!head) - return NULL; - if (head->wqt_next_idx >= table->nelem) - return NULL; - - elem = wqt_elem_idx(table, head->wqt_next_idx); - assert(wqt_elem_in_range(elem, table)); - - return elem; -} - -/* - * Obtain a pointer to the first element of a list. Don't take an extra - * reference on the object - the list implicitly holds that reference. - * - * This function is used to convert the head of a singly-linked list - * to a real wqt_elem object. - */ -static struct wqt_elem *wqt_elem_list_first(struct wq_table *table, uint64_t id) -{ - uint32_t idx; - struct wqt_elem *elem = NULL; - - if (id == 0) - return NULL; - - idx = ((struct wq_id *)&id)->idx; - - if (idx > table->nelem) - panic("Invalid element for id:0x%llx", id); - elem = wqt_elem_idx(table, idx); - - /* invalid element: reserved ID was probably already reallocated */ - if (elem->wqt_id.id != id) - return NULL; - - /* the returned element should _not_ be marked valid! 
*/ - if (wqt_bits_valid(elem->wqt_bits) || - wqt_bits_type(elem->wqt_bits) != WQT_RESERVED || - wqt_bits_refcnt(elem->wqt_bits) != 1) { - panic("Valid/unreserved element %p (0x%x) in reserved list", - elem, elem->wqt_bits); - } - - return elem; -} - -static void wqt_elem_reset_next(struct wq_table *table, struct wqt_elem *wqp) -{ - (void)table; - - if (!wqp) - return; - assert(wqt_elem_in_range(wqp, table)); - - wqp->wqt_next_idx = WQT_IDX_MAX; -} +#define wqerr(fmt,...) \ + printf("WQ[%s] ERROR: " fmt "\n", __func__, ## __VA_ARGS__) + /* - * Pop an item off the list. - * New list head returned in *id, caller responsible for reference on returned - * object. We do a realloc here to reset the type of the object, but still - * leave it invalid. + * un-comment the following lines to debug the link/prepost tables + * NOTE: this expands each element by ~40 bytes */ -static struct wqt_elem *wqt_elem_list_pop(struct wq_table *table, uint64_t *id, int type) -{ - struct wqt_elem *first, *next; - - if (!id || *id == 0) - return NULL; +//#define CONFIG_WAITQ_LINK_STATS +//#define CONFIG_WAITQ_PREPOST_STATS - /* pop an item off the reserved stack */ +/* + * file-static functions / data + */ +static thread_t waitq_select_one_locked(struct waitq *waitq, event64_t event, + uint64_t *reserved_preposts, + int priority, spl_t *spl); - first = wqt_elem_list_first(table, *id); - if (!first) { - *id = 0; - return NULL; - } +static kern_return_t waitq_select_thread_locked(struct waitq *waitq, + event64_t event, + thread_t thread, spl_t *spl); - next = wqt_elem_list_next(table, first); - if (next) - *id = next->wqt_id.id; - else - *id = 0; +#define WAITQ_SET_MAX (task_max * 3) +static zone_t waitq_set_zone; - wq_table_realloc_elem(table, first, type); - return first; -} +#define P2ROUNDUP(x, align) (-(-((uint32_t)(x)) & -(align))) +#define ROUNDDOWN(x,y) (((x)/(y))*(y)) -/* - * Free an entire list of linked/reserved elements - */ -static int wqt_elem_list_release(struct wq_table *table, 
- struct wqt_elem *head, - int __assert_only type) -{ - struct wqt_elem *elem; - struct wq_id free_id; - int nelem = 0; - if (!head) - return 0; +#if defined(CONFIG_LTABLE_STATS) || defined(CONFIG_WAITQ_STATS) +static __inline__ void waitq_grab_backtrace(uintptr_t bt[NWAITQ_BTFRAMES], int skip); +#endif - for (elem = head; ; ) { - assert(wqt_elem_in_range(elem, table)); - assert(!wqt_bits_valid(elem->wqt_bits) && (wqt_bits_refcnt(elem->wqt_bits) == 1)); - assert(wqt_bits_type(elem->wqt_bits) == type); - nelem++; - elem->wqt_bits = 0; - if (table->poison) - (table->poison)(table, elem); +#define waitq_lock_to(wq,to) \ + (hw_lock_to(&(wq)->waitq_interlock, to)) - if (elem->wqt_next_idx == WQT_IDX_MAX) - break; - assert(elem->wqt_next_idx < table->nelem); - elem = wqt_elem_idx(table, elem->wqt_next_idx); - } +#define waitq_lock_unlock(wq) \ + (hw_lock_unlock(&(wq)->waitq_interlock)) - /* - * 'elem' now points to the end of our list, and 'head' points to the - * beginning. We want to atomically swap the free list pointer with - * the 'head' and ensure that 'elem' points to the previous free list - * head. 
- */ +#define waitq_lock_init(wq) \ + (hw_lock_init(&(wq)->waitq_interlock)) -again: - free_id = table->free_list; - if (free_id.idx >= table->nelem) - elem->wqt_next_idx = WQT_IDX_MAX; - else - elem->wqt_next_idx = free_id.idx; - /* store barrier */ - OSMemoryBarrier(); - if (OSCompareAndSwap64(free_id.id, head->wqt_id.id, - &table->free_list.id) == FALSE) - goto again; +/* + * Prepost callback function for specially marked waitq sets + * (prepost alternative) + */ +extern void waitq_set__CALLING_PREPOST_HOOK__(void *ctx, void *memberctx, int priority); - OSAddAtomic(-nelem, &table->used_elem); - return nelem; -} +#define DEFAULT_MIN_FREE_TABLE_ELEM 100 +static uint32_t g_min_free_table_elem; +static uint32_t g_min_free_cache; /* ---------------------------------------------------------------------- @@ -1068,30 +159,30 @@ static int wqt_elem_list_release(struct wq_table *table, * SetID Link Table Implementation * * ---------------------------------------------------------------------- */ -static struct wq_table g_linktable; +static struct link_table g_wqlinktable; -enum setid_link_type { - SLT_ALL = -1, - SLT_FREE = WQT_FREE, - SLT_WQS = WQT_ELEM, - SLT_LINK = WQT_LINK, +enum wq_link_type { + WQL_ALL = -1, + WQL_FREE = LT_FREE, + WQL_WQS = LT_ELEM, + WQL_LINK = LT_LINK, }; -struct setid_link { - struct wqt_elem wqte; +struct waitq_link { + struct lt_elem wqte; union { - /* wqt_type == SLT_WQS (WQT_ELEM) */ + /* wqt_type == WQL_WQS (LT_ELEM) */ struct { - struct waitq_set *sl_set; + struct waitq_set *wql_set; /* uint64_t sl_prepost_id; */ - } sl_wqs; + } wql_wqs; - /* wqt_type == SLT_LINK (WQT_LINK) */ + /* wqt_type == WQL_LINK (LT_LINK) */ struct { - uint64_t sl_left_setid; - uint64_t sl_right_setid; - } sl_link; + uint64_t left_setid; + uint64_t right_setid; + } wql_link; }; #ifdef CONFIG_WAITQ_LINK_STATS thread_t sl_alloc_th; @@ -1106,64 +197,64 @@ struct setid_link { #endif }; #if !defined(CONFIG_WAITQ_LINK_STATS) -_Static_assert((sizeof(struct setid_link) & 
(sizeof(struct setid_link) - 1)) == 0, - "setid_link struct must be a power of two!"); +static_assert((sizeof(struct waitq_link) & (sizeof(struct waitq_link) - 1)) == 0, + "waitq_link struct must be a power of two!"); #endif -#define sl_refcnt(link) \ - (wqt_bits_refcnt((link)->wqte.wqt_bits)) +#define wql_refcnt(link) \ + (lt_bits_refcnt((link)->wqte.lt_bits)) -#define sl_type(link) \ - (wqt_bits_type((link)->wqte.wqt_bits)) +#define wql_type(link) \ + (lt_bits_type((link)->wqte.lt_bits)) -#define sl_set_valid(link) \ +#define wql_mkvalid(link) \ do { \ - wqt_elem_mkvalid(&(link)->wqte); \ - lt_do_mkvalid_stats(&(link)->wqte); \ + lt_elem_mkvalid(&(link)->wqte); \ + wql_do_mkvalid_stats(&(link)->wqte); \ } while (0) -#define sl_is_valid(link) \ - wqt_bits_valid((link)->wqte.wqt_bits) +#define wql_is_valid(link) \ + lt_bits_valid((link)->wqte.lt_bits) -#define sl_set_id wqte.wqt_id +#define wql_setid wqte.lt_id -#define SLT_WQS_POISON ((void *)(0xf00df00d)) -#define SLT_LINK_POISON (0x0bad0badffffffffull) +#define WQL_WQS_POISON ((void *)(0xf00df00d)) +#define WQL_LINK_POISON (0x0bad0badffffffffull) -static void lt_poison(struct wq_table *table, struct wqt_elem *elem) +static void wql_poison(struct link_table *table, struct lt_elem *elem) { - struct setid_link *sl_link = (struct setid_link *)elem; + struct waitq_link *link = (struct waitq_link *)elem; (void)table; - switch (sl_type(sl_link)) { - case SLT_WQS: - sl_link->sl_wqs.sl_set = SLT_WQS_POISON; + switch (wql_type(link)) { + case WQL_WQS: + link->wql_wqs.wql_set = WQL_WQS_POISON; break; - case SLT_LINK: - sl_link->sl_link.sl_left_setid = SLT_LINK_POISON; - sl_link->sl_link.sl_right_setid = SLT_LINK_POISON; + case WQL_LINK: + link->wql_link.left_setid = WQL_LINK_POISON; + link->wql_link.right_setid = WQL_LINK_POISON; break; default: break; } #ifdef CONFIG_WAITQ_LINK_STATS - memset(sl_link->sl_alloc_bt, 0, sizeof(sl_link->sl_alloc_bt)); - sl_link->sl_alloc_ts = 0; - memset(sl_link->sl_mkvalid_bt, 0, 
sizeof(sl_link->sl_mkvalid_bt)); - sl_link->sl_mkvalid_ts = 0; + memset(link->sl_alloc_bt, 0, sizeof(link->sl_alloc_bt)); + link->sl_alloc_ts = 0; + memset(link->sl_mkvalid_bt, 0, sizeof(link->sl_mkvalid_bt)); + link->sl_mkvalid_ts = 0; - sl_link->sl_alloc_th = THREAD_NULL; + link->sl_alloc_th = THREAD_NULL; /* leave the sl_alloc_task in place for debugging */ - sl_link->sl_free_ts = mach_absolute_time(); + link->sl_free_ts = mach_absolute_time(); #endif } #ifdef CONFIG_WAITQ_LINK_STATS -static __inline__ void lt_do_alloc_stats(struct wqt_elem *elem) +static __inline__ void wql_do_alloc_stats(struct lt_elem *elem) { if (elem) { - struct setid_link *link = (struct setid_link *)elem; + struct waitq_link *link = (struct waitq_link *)elem; memset(link->sl_alloc_bt, 0, sizeof(link->sl_alloc_bt)); waitq_grab_backtrace(link->sl_alloc_bt, 0); link->sl_alloc_th = current_thread(); @@ -1177,9 +268,9 @@ static __inline__ void lt_do_alloc_stats(struct wqt_elem *elem) } } -static __inline__ void lt_do_invalidate_stats(struct wqt_elem *elem) +static __inline__ void wql_do_invalidate_stats(struct lt_elem *elem) { - struct setid_link *link = (struct setid_link *)elem; + struct waitq_link *link = (struct waitq_link *)elem; if (!elem) return; @@ -1191,9 +282,9 @@ static __inline__ void lt_do_invalidate_stats(struct wqt_elem *elem) waitq_grab_backtrace(link->sl_invalidate_bt, 0); } -static __inline__ void lt_do_mkvalid_stats(struct wqt_elem *elem) +static __inline__ void wql_do_mkvalid_stats(struct lt_elem *elem) { - struct setid_link *link = (struct setid_link *)elem; + struct waitq_link *link = (struct waitq_link *)elem; if (!elem) return; @@ -1203,107 +294,107 @@ static __inline__ void lt_do_mkvalid_stats(struct wqt_elem *elem) waitq_grab_backtrace(link->sl_mkvalid_bt, 0); } #else -#define lt_do_alloc_stats(e) -#define lt_do_invalidate_stats(e) -#define lt_do_mkvalid_stats(e) +#define wql_do_alloc_stats(e) +#define wql_do_invalidate_stats(e) +#define wql_do_mkvalid_stats(e) #endif 
/* CONFIG_WAITQ_LINK_STATS */ -static void lt_init(void) +static void wql_init(void) { uint32_t tablesz = 0, max_links = 0; if (PE_parse_boot_argn("wql_tsize", &tablesz, sizeof(tablesz)) != TRUE) - tablesz = (uint32_t)g_wqt_max_tbl_size; + tablesz = (uint32_t)g_lt_max_tbl_size; tablesz = P2ROUNDUP(tablesz, PAGE_SIZE); - max_links = tablesz / sizeof(struct setid_link); + max_links = tablesz / sizeof(struct waitq_link); assert(max_links > 0 && tablesz > 0); /* we have a restricted index range */ - if (max_links > (WQT_IDX_MAX + 1)) - max_links = WQT_IDX_MAX + 1; + if (max_links > (LT_IDX_MAX + 1)) + max_links = LT_IDX_MAX + 1; wqinfo("init linktable with max:%d elements (%d bytes)", max_links, tablesz); - wq_table_init(&g_linktable, "wqslab.links", max_links, - sizeof(struct setid_link), lt_poison); + ltable_init(&g_wqlinktable, "wqslab.wql", max_links, + sizeof(struct waitq_link), wql_poison); } -static void lt_ensure_free_space(void) +static void wql_ensure_free_space(void) { - if (g_linktable.nelem - g_linktable.used_elem < g_min_free_table_elem) { + if (g_wqlinktable.nelem - g_wqlinktable.used_elem < g_min_free_table_elem) { /* * we don't hold locks on these values, so check for underflow */ - if (g_linktable.used_elem <= g_linktable.nelem) { + if (g_wqlinktable.used_elem <= g_wqlinktable.nelem) { wqdbg_v("Forcing table growth: nelem=%d, used=%d, min_free=%d", - g_linktable.nelem, g_linktable.used_elem, + g_wqlinktable.nelem, g_wqlinktable.used_elem, g_min_free_table_elem); - wq_table_grow(&g_linktable, g_min_free_table_elem); + ltable_grow(&g_wqlinktable, g_min_free_table_elem); } } } -static struct setid_link *lt_alloc_link(int type) +static struct waitq_link *wql_alloc_link(int type) { - struct wqt_elem *elem; + struct lt_elem *elem; - elem = wq_table_alloc_elem(&g_linktable, type, 1); - lt_do_alloc_stats(elem); - return (struct setid_link *)elem; + elem = ltable_alloc_elem(&g_wqlinktable, type, 1, 0); + wql_do_alloc_stats(elem); + return (struct waitq_link 
*)elem; } -static void lt_realloc_link(struct setid_link *link, int type) +static void wql_realloc_link(struct waitq_link *link, int type) { - wq_table_realloc_elem(&g_linktable, &link->wqte, type); + ltable_realloc_elem(&g_wqlinktable, &link->wqte, type); #ifdef CONFIG_WAITQ_LINK_STATS memset(link->sl_alloc_bt, 0, sizeof(link->sl_alloc_bt)); link->sl_alloc_ts = 0; - lt_do_alloc_stats(&link->wqte); + wql_do_alloc_stats(&link->wqte); memset(link->sl_invalidate_bt, 0, sizeof(link->sl_invalidate_bt)); link->sl_invalidate_ts = 0; #endif } -static void lt_invalidate(struct setid_link *link) +static void wql_invalidate(struct waitq_link *link) { - wqt_elem_invalidate(&link->wqte); - lt_do_invalidate_stats(&link->wqte); + lt_elem_invalidate(&link->wqte); + wql_do_invalidate_stats(&link->wqte); } -static struct setid_link *lt_get_link(uint64_t setid) +static struct waitq_link *wql_get_link(uint64_t setid) { - struct wqt_elem *elem; + struct lt_elem *elem; - elem = wq_table_get_elem(&g_linktable, setid); - return (struct setid_link *)elem; + elem = ltable_get_elem(&g_wqlinktable, setid); + return (struct waitq_link *)elem; } -static void lt_put_link(struct setid_link *link) +static void wql_put_link(struct waitq_link *link) { if (!link) return; - wq_table_put_elem(&g_linktable, (struct wqt_elem *)link); + ltable_put_elem(&g_wqlinktable, (struct lt_elem *)link); } -static struct setid_link *lt_get_reserved(uint64_t setid, int type) +static struct waitq_link *wql_get_reserved(uint64_t setid, int type) { - struct wqt_elem *elem; + struct lt_elem *elem; - elem = wqt_elem_list_first(&g_linktable, setid); + elem = lt_elem_list_first(&g_wqlinktable, setid); if (!elem) return NULL; - wq_table_realloc_elem(&g_linktable, elem, type); - return (struct setid_link *)elem; + ltable_realloc_elem(&g_wqlinktable, elem, type); + return (struct waitq_link *)elem; } static inline int waitq_maybe_remove_link(struct waitq *waitq, uint64_t setid, - struct setid_link *parent, - struct setid_link 
*left, - struct setid_link *right); + struct waitq_link *parent, + struct waitq_link *left, + struct waitq_link *right); enum { LINK_WALK_ONE_LEVEL = 0, @@ -1311,11 +402,11 @@ enum { LINK_WALK_FULL_DAG_UNLOCKED = 2, }; -typedef int (*lt_callback_func)(struct waitq *waitq, void *ctx, - struct setid_link *link); +typedef int (*wql_callback_func)(struct waitq *waitq, void *ctx, + struct waitq_link *link); /** - * walk all table elements (of type 'link_type') pointed to by 'setid' + * walk_waitq_links: walk all table elements (of type 'link_type') pointed to by 'setid' * * Conditions: * waitq is locked (or NULL) @@ -1364,30 +455,30 @@ typedef int (*lt_callback_func)(struct waitq *waitq, void *ctx, * the associated waitq set object and recursively walk all sets to * which that set belongs. This is a DFS of the tree structure. * *) recurse down the left side of the tree (following the - * 'sl_left_setid' pointer in the link object + * 'left_setid' pointer in the link object * *) recurse down the right side of the tree (following the - * 'sl_right_setid' pointer in the link object + * 'right_setid' pointer in the link object */ static __attribute__((noinline)) -int walk_setid_links(int walk_type, struct waitq *waitq, +int walk_waitq_links(int walk_type, struct waitq *waitq, uint64_t setid, int link_type, - void *ctx, lt_callback_func cb) + void *ctx, wql_callback_func cb) { - struct setid_link *link; + struct waitq_link *link; uint64_t nextid; - int sl_type; + int wqltype; - link = lt_get_link(setid); + link = wql_get_link(setid); /* invalid link */ if (!link) return WQ_ITERATE_CONTINUE; setid = nextid = 0; - sl_type = sl_type(link); - if (sl_type == SLT_LINK) { - setid = link->sl_link.sl_left_setid; - nextid = link->sl_link.sl_right_setid; + wqltype = wql_type(link); + if (wqltype == WQL_LINK) { + setid = link->wql_link.left_setid; + nextid = link->wql_link.right_setid; } /* @@ -1396,16 +487,16 @@ int walk_setid_links(int walk_type, struct waitq *waitq, * invalid. 
The only valid thing we can do is put our * reference to it (which may put it back on the free list) */ - if (link_type == SLT_ALL || link_type == sl_type) { + if (link_type == WQL_ALL || link_type == wqltype) { /* allow the callback to early-out */ int ret = cb(waitq, ctx, link); if (ret != WQ_ITERATE_CONTINUE) { - lt_put_link(link); + wql_put_link(link); return ret; } } - if (sl_type == SLT_WQS && + if (wqltype == WQL_WQS && (walk_type == LINK_WALK_FULL_DAG || walk_type == LINK_WALK_FULL_DAG_UNLOCKED)) { /* @@ -1413,19 +504,13 @@ int walk_setid_links(int walk_type, struct waitq *waitq, * added. We do this just before we put our reference to * the link object (which may free it). */ - struct waitq_set *wqset = link->sl_wqs.sl_set; + struct waitq_set *wqset = link->wql_wqs.wql_set; int ret = WQ_ITERATE_CONTINUE; - int get_spl = 0; int should_unlock = 0; uint64_t wqset_setid = 0; - spl_t set_spl; if (waitq_set_is_valid(wqset) && walk_type == LINK_WALK_FULL_DAG) { - if ((!waitq || !waitq_irq_safe(waitq)) && - waitq_irq_safe(&wqset->wqset_q)) { - get_spl = 1; - set_spl = splsched(); - } + assert(!waitq_irq_safe(&wqset->wqset_q)); waitq_set_lock(wqset); should_unlock = 1; } @@ -1434,45 +519,41 @@ int walk_setid_links(int walk_type, struct waitq *waitq, * verify the linked waitq set as it could have been * invalidated before we grabbed the lock! 
*/ - if (wqset->wqset_id != link->sl_set_id.id) { + if (wqset->wqset_id != link->wql_setid.id) { /*This is the bottom of the tree: just get out */ if (should_unlock) { waitq_set_unlock(wqset); - if (get_spl) - splx(set_spl); } - lt_put_link(link); + wql_put_link(link); return WQ_ITERATE_CONTINUE; } wqset_setid = wqset->wqset_q.waitq_set_id; if (wqset_setid > 0) - ret = walk_setid_links(walk_type, &wqset->wqset_q, + ret = walk_waitq_links(walk_type, &wqset->wqset_q, wqset_setid, link_type, ctx, cb); if (should_unlock) { waitq_set_unlock(wqset); - if (get_spl) - splx(set_spl); } if (ret != WQ_ITERATE_CONTINUE) { - lt_put_link(link); + wql_put_link(link); return ret; } } - lt_put_link(link); + wql_put_link(link); /* recurse down left side of the tree */ if (setid) { - int ret = walk_setid_links(walk_type, waitq, setid, link_type, ctx, cb); + int ret = walk_waitq_links(walk_type, waitq, setid, link_type, ctx, cb); if (ret != WQ_ITERATE_CONTINUE) return ret; } /* recurse down right side of the tree */ if (nextid) - return walk_setid_links(walk_type, waitq, nextid, link_type, ctx, cb); + return walk_waitq_links(walk_type, waitq, nextid, link_type, ctx, cb); return WQ_ITERATE_CONTINUE; } @@ -1482,23 +563,23 @@ int walk_setid_links(int walk_type, struct waitq *waitq, * Prepost Link Table Implementation * * ---------------------------------------------------------------------- */ -static struct wq_table g_prepost_table; +static struct link_table g_prepost_table; enum wq_prepost_type { - WQP_FREE = WQT_FREE, - WQP_WQ = WQT_ELEM, - WQP_POST = WQT_LINK, + WQP_FREE = LT_FREE, + WQP_WQ = LT_ELEM, + WQP_POST = LT_LINK, }; struct wq_prepost { - struct wqt_elem wqte; + struct lt_elem wqte; union { - /* wqt_type == WQP_WQ (WQT_ELEM) */ + /* wqt_type == WQP_WQ (LT_ELEM) */ struct { struct waitq *wqp_wq_ptr; } wqp_wq; - /* wqt_type == WQP_POST (WQT_LINK) */ + /* wqt_type == WQP_POST (LT_LINK) */ struct { uint64_t wqp_next_id; uint64_t wqp_wq_id; @@ -1511,28 +592,28 @@ struct 
wq_prepost { #endif }; #if !defined(CONFIG_WAITQ_PREPOST_STATS) -_Static_assert((sizeof(struct wq_prepost) & (sizeof(struct wq_prepost) - 1)) == 0, +static_assert((sizeof(struct wq_prepost) & (sizeof(struct wq_prepost) - 1)) == 0, "wq_prepost struct must be a power of two!"); #endif #define wqp_refcnt(wqp) \ - (wqt_bits_refcnt((wqp)->wqte.wqt_bits)) + (lt_bits_refcnt((wqp)->wqte.lt_bits)) #define wqp_type(wqp) \ - (wqt_bits_type((wqp)->wqte.wqt_bits)) + (lt_bits_type((wqp)->wqte.lt_bits)) #define wqp_set_valid(wqp) \ - wqt_elem_mkvalid(&(wqp)->wqte) + lt_elem_mkvalid(&(wqp)->wqte) #define wqp_is_valid(wqp) \ - wqt_bits_valid((wqp)->wqte.wqt_bits) + lt_bits_valid((wqp)->wqte.lt_bits) -#define wqp_prepostid wqte.wqt_id +#define wqp_prepostid wqte.lt_id #define WQP_WQ_POISON (0x0bad0badffffffffull) #define WQP_POST_POISON (0xf00df00df00df00d) -static void wqp_poison(struct wq_table *table, struct wqt_elem *elem) +static void wqp_poison(struct link_table *table, struct lt_elem *elem) { struct wq_prepost *wqp = (struct wq_prepost *)elem; (void)table; @@ -1550,28 +631,24 @@ static void wqp_poison(struct wq_table *table, struct wqt_elem *elem) } #ifdef CONFIG_WAITQ_PREPOST_STATS -static __inline__ void wqp_do_alloc_stats(struct wqt_elem *elem) +static __inline__ void wqp_do_alloc_stats(struct lt_elem *elem) { - if (elem) { - struct wq_prepost *wqp = (struct wq_prepost *)elem; - - /* be sure the take stats for _all_ allocated objects */ - for (;;) { - uint32_t next_idx; + if (!elem) + return; - memset(wqp->wqp_alloc_bt, 0, sizeof(wqp->wqp_alloc_bt)); - waitq_grab_backtrace(wqp->wqp_alloc_bt, 4); - wqp->wqp_alloc_th = current_thread(); - wqp->wqp_alloc_task = current_task(); - next_idx = wqp->wqte.wqt_next_idx; + struct wq_prepost *wqp = (struct wq_prepost *)elem; + uintptr_t alloc_bt[sizeof(wqp->wqp_alloc_bt)]; - if (next_idx == WQT_IDX_MAX) - break; - assert(next_idx < g_prepost_table.nelem); + waitq_grab_backtrace(alloc_bt, NWAITQ_BTFRAMES); - wqp = (struct wq_prepost 
*)wqt_elem_idx(&g_prepost_table, - next_idx); - } + /* be sure the take stats for _all_ allocated objects */ + for (;;) { + memcpy(wqp->wqp_alloc_bt, alloc_bt, sizeof(alloc_bt)); + wqp->wqp_alloc_th = current_thread(); + wqp->wqp_alloc_task = current_task(); + wqp = (struct wq_prepost *)lt_elem_list_next(&g_prepost_table, &wqp->wqte); + if (!wqp) + break; } } #else @@ -1583,20 +660,20 @@ static void wqp_init(void) uint32_t tablesz = 0, max_wqp = 0; if (PE_parse_boot_argn("wqp_tsize", &tablesz, sizeof(tablesz)) != TRUE) - tablesz = (uint32_t)g_wqt_max_tbl_size; + tablesz = (uint32_t)g_lt_max_tbl_size; tablesz = P2ROUNDUP(tablesz, PAGE_SIZE); max_wqp = tablesz / sizeof(struct wq_prepost); assert(max_wqp > 0 && tablesz > 0); /* we have a restricted index range */ - if (max_wqp > (WQT_IDX_MAX + 1)) - max_wqp = WQT_IDX_MAX + 1; + if (max_wqp > (LT_IDX_MAX + 1)) + max_wqp = LT_IDX_MAX + 1; wqinfo("init prepost table with max:%d elements (%d bytes)", max_wqp, tablesz); - wq_table_init(&g_prepost_table, "wqslab.prepost", max_wqp, - sizeof(struct wq_prepost), wqp_poison); + ltable_init(&g_prepost_table, "wqslab.prepost", max_wqp, + sizeof(struct wq_prepost), wqp_poison); } /* @@ -1604,29 +681,37 @@ static void wqp_init(void) */ static void wq_prepost_refill_cpu_cache(uint32_t nalloc) { - struct wqt_elem *new_head, *old_head; + struct lt_elem *new_head, *old_head; struct wqp_cache *cache; /* require preemption enabled to allocate elements */ if (get_preemption_level() != 0) return; - new_head = wq_table_alloc_elem(&g_prepost_table, - WQT_RESERVED, nalloc); + new_head = ltable_alloc_elem(&g_prepost_table, + LT_RESERVED, nalloc, 1); if (new_head == NULL) return; disable_preemption(); cache = &PROCESSOR_DATA(current_processor(), wqp_cache); + + /* check once more before putting these elements on the list */ + if (cache->avail >= WQP_CACHE_MAX) { + lt_elem_list_release(&g_prepost_table, new_head, LT_RESERVED); + enable_preemption(); + return; + } + cache->avail += nalloc; - if 
(cache->head == 0 || cache->head == WQT_IDX_MAX) { - cache->head = new_head->wqt_id.id; + if (cache->head == 0 || cache->head == LT_IDX_MAX) { + cache->head = new_head->lt_id.id; goto out; } - old_head = wqt_elem_list_first(&g_prepost_table, cache->head); - (void)wqt_elem_list_link(&g_prepost_table, new_head, old_head); - cache->head = new_head->wqt_id.id; + old_head = lt_elem_list_first(&g_prepost_table, cache->head); + (void)lt_elem_list_link(&g_prepost_table, new_head, old_head); + cache->head = new_head->lt_id.id; out: enable_preemption(); @@ -1666,18 +751,18 @@ static void wq_prepost_ensure_free_space(void) wqdbg_v("Forcing table growth: nelem=%d, used=%d, min_free=%d+%d", g_prepost_table.nelem, g_prepost_table.used_elem, g_min_free_table_elem, g_min_free_cache); - wq_table_grow(&g_prepost_table, min_free); + ltable_grow(&g_prepost_table, min_free); } } } static struct wq_prepost *wq_prepost_alloc(int type, int nelem) { - struct wqt_elem *elem; + struct lt_elem *elem; struct wq_prepost *wqp; struct wqp_cache *cache; - if (type != WQT_RESERVED) + if (type != LT_RESERVED) goto do_alloc; if (nelem == 0) return NULL; @@ -1689,30 +774,30 @@ static struct wq_prepost *wq_prepost_alloc(int type, int nelem) disable_preemption(); cache = &PROCESSOR_DATA(current_processor(), wqp_cache); if (nelem <= (int)cache->avail) { - struct wqt_elem *first, *next = NULL; + struct lt_elem *first, *next = NULL; int nalloc = nelem; cache->avail -= nelem; /* grab the first element */ - first = wqt_elem_list_first(&g_prepost_table, cache->head); + first = lt_elem_list_first(&g_prepost_table, cache->head); /* find the last element and re-adjust the cache head */ for (elem = first; elem != NULL && nalloc > 0; elem = next) { - next = wqt_elem_list_next(&g_prepost_table, elem); + next = lt_elem_list_next(&g_prepost_table, elem); if (--nalloc == 0) { /* terminate the allocated list */ - elem->wqt_next_idx = WQT_IDX_MAX; + elem->lt_next_idx = LT_IDX_MAX; break; } } assert(nalloc == 0); if 
(!next) - cache->head = WQT_IDX_MAX; + cache->head = LT_IDX_MAX; else - cache->head = next->wqt_id.id; + cache->head = next->lt_id.id; /* assert that we don't have mis-matched book keeping */ - assert(!(cache->head == WQT_IDX_MAX && cache->avail > 0)); + assert(!(cache->head == LT_IDX_MAX && cache->avail > 0)); enable_preemption(); elem = first; goto out; @@ -1721,7 +806,7 @@ static struct wq_prepost *wq_prepost_alloc(int type, int nelem) do_alloc: /* fall-back to standard table allocation */ - elem = wq_table_alloc_elem(&g_prepost_table, type, nelem); + elem = ltable_alloc_elem(&g_prepost_table, type, nelem, 0); if (!elem) return NULL; @@ -1731,55 +816,48 @@ static struct wq_prepost *wq_prepost_alloc(int type, int nelem) return wqp; } -/* -static void wq_prepost_realloc(struct wq_prepost *wqp, int type) -{ - wq_table_realloc_elem(&g_prepost_table, &wqp->wqte, type); -} -*/ - static void wq_prepost_invalidate(struct wq_prepost *wqp) { - wqt_elem_invalidate(&wqp->wqte); + lt_elem_invalidate(&wqp->wqte); } static struct wq_prepost *wq_prepost_get(uint64_t wqp_id) { - struct wqt_elem *elem; + struct lt_elem *elem; - elem = wq_table_get_elem(&g_prepost_table, wqp_id); + elem = ltable_get_elem(&g_prepost_table, wqp_id); return (struct wq_prepost *)elem; } static void wq_prepost_put(struct wq_prepost *wqp) { - wq_table_put_elem(&g_prepost_table, (struct wqt_elem *)wqp); + ltable_put_elem(&g_prepost_table, (struct lt_elem *)wqp); } static int wq_prepost_rlink(struct wq_prepost *parent, struct wq_prepost *child) { - return wqt_elem_list_link(&g_prepost_table, &parent->wqte, &child->wqte); + return lt_elem_list_link(&g_prepost_table, &parent->wqte, &child->wqte); } static struct wq_prepost *wq_prepost_get_rnext(struct wq_prepost *head) { - struct wqt_elem *elem; + struct lt_elem *elem; struct wq_prepost *wqp; uint64_t id; - elem = wqt_elem_list_next(&g_prepost_table, &head->wqte); + elem = lt_elem_list_next(&g_prepost_table, &head->wqte); if (!elem) return NULL; - id = 
elem->wqt_id.id; - elem = wq_table_get_elem(&g_prepost_table, id); + id = elem->lt_id.id; + elem = ltable_get_elem(&g_prepost_table, id); if (!elem) return NULL; wqp = (struct wq_prepost *)elem; - if (elem->wqt_id.id != id || + if (elem->lt_id.id != id || wqp_type(wqp) != WQP_POST || wqp->wqp_post.wqp_next_id != head->wqp_prepostid.id) { - wq_table_put_elem(&g_prepost_table, elem); + ltable_put_elem(&g_prepost_table, elem); return NULL; } @@ -1788,7 +866,7 @@ static struct wq_prepost *wq_prepost_get_rnext(struct wq_prepost *head) static void wq_prepost_reset_rnext(struct wq_prepost *wqp) { - wqt_elem_reset_next(&g_prepost_table, &wqp->wqte); + (void)lt_elem_list_break(&g_prepost_table, &wqp->wqte); } @@ -1813,6 +891,7 @@ static int wq_prepost_remove(struct waitq_set *wqset, struct wq_prepost *prev_wqp, *next_wqp; assert(wqp_type(wqp) == WQP_POST); + assert(wqset->wqset_q.waitq_prepost == 1); if (next_id == wqp_id) { /* the list is singular and becoming empty */ @@ -1867,16 +946,16 @@ static int wq_prepost_remove(struct waitq_set *wqset, static struct wq_prepost *wq_prepost_rfirst(uint64_t id) { - struct wqt_elem *elem; - elem = wqt_elem_list_first(&g_prepost_table, id); + struct lt_elem *elem; + elem = lt_elem_list_first(&g_prepost_table, id); wqp_do_alloc_stats(elem); return (struct wq_prepost *)(void *)elem; } static struct wq_prepost *wq_prepost_rpop(uint64_t *id, int type) { - struct wqt_elem *elem; - elem = wqt_elem_list_pop(&g_prepost_table, id, type); + struct lt_elem *elem; + elem = lt_elem_list_pop(&g_prepost_table, id, type); wqp_do_alloc_stats(elem); return (struct wq_prepost *)(void *)elem; } @@ -1885,7 +964,7 @@ static void wq_prepost_release_rlist(struct wq_prepost *wqp) { int nelem = 0; struct wqp_cache *cache; - struct wqt_elem *elem; + struct lt_elem *elem; if (!wqp) return; @@ -1899,11 +978,11 @@ static void wq_prepost_release_rlist(struct wq_prepost *wqp) disable_preemption(); cache = &PROCESSOR_DATA(current_processor(), wqp_cache); if 
(cache->avail < WQP_CACHE_MAX) { - struct wqt_elem *tmp = NULL; - if (cache->head != WQT_IDX_MAX) - tmp = wqt_elem_list_first(&g_prepost_table, cache->head); - nelem = wqt_elem_list_link(&g_prepost_table, elem, tmp); - cache->head = elem->wqt_id.id; + struct lt_elem *tmp = NULL; + if (cache->head != LT_IDX_MAX) + tmp = lt_elem_list_first(&g_prepost_table, cache->head); + nelem = lt_elem_list_link(&g_prepost_table, elem, tmp); + cache->head = elem->lt_id.id; cache->avail += nelem; enable_preemption(); return; @@ -1911,7 +990,7 @@ static void wq_prepost_release_rlist(struct wq_prepost *wqp) enable_preemption(); /* release these elements back to the main table */ - nelem = wqt_elem_list_release(&g_prepost_table, elem, WQT_RESERVED); + nelem = lt_elem_list_release(&g_prepost_table, elem, LT_RESERVED); #if CONFIG_WAITQ_STATS g_prepost_table.nreserved_releases += 1; @@ -1938,10 +1017,12 @@ typedef int (*wqp_callback_func)(struct waitq_set *wqset, static int wq_prepost_foreach_locked(struct waitq_set *wqset, void *ctx, wqp_callback_func cb) { - int ret; + int ret = WQ_ITERATE_SUCCESS; struct wq_prepost *wqp, *tmp_wqp; - if (!wqset || !wqset->wqset_prepost_id) + assert(cb != NULL); + + if (!wqset || !waitq_set_maybe_preposted(wqset)) return WQ_ITERATE_SUCCESS; restart: @@ -1957,8 +1038,8 @@ static int wq_prepost_foreach_locked(struct waitq_set *wqset, if (wqp_type(wqp) == WQP_WQ) { uint64_t __assert_only wqp_id = wqp->wqp_prepostid.id; - if (cb) - ret = cb(wqset, ctx, wqp, wqp->wqp_wq.wqp_wq_ptr); + + ret = cb(wqset, ctx, wqp, wqp->wqp_wq.wqp_wq_ptr); switch (ret) { case WQ_ITERATE_INVALIDATE_CONTINUE: @@ -2024,9 +1105,7 @@ static int wq_prepost_foreach_locked(struct waitq_set *wqset, * drop the lock on our waitq set. We need to re-validate * our state when this function returns. 
*/ - if (cb) - ret = cb(wqset, ctx, wqp, - tmp_wqp->wqp_wq.wqp_wq_ptr); + ret = cb(wqset, ctx, wqp, tmp_wqp->wqp_wq.wqp_wq_ptr); wq_prepost_put(tmp_wqp); switch (ret) { @@ -2253,6 +1332,7 @@ static struct wq_prepost *wq_get_prepost_obj(uint64_t *reserved, int type) */ if (reserved && *reserved) { wqp = wq_prepost_rpop(reserved, type); + assert(wqp->wqte.lt_id.idx < g_prepost_table.nelem); } else { /* * TODO: if in interrupt context, grab from a special @@ -2273,7 +1353,7 @@ static struct wq_prepost *wq_get_prepost_obj(uint64_t *reserved, int type) * Parameters: * wqset The set onto which waitq will be preposted * waitq The waitq that's preposting - * reserved List (wqt_elem_list_ style) of pre-allocated prepost elements + * reserved List (lt_elem_list_ style) of pre-allocated prepost elements * Could be NULL * * Conditions: @@ -2311,7 +1391,7 @@ static void wq_prepost_do_post_locked(struct waitq_set *wqset, wq_prepost_put(wqp); } -#if CONFIG_WAITQ_STATS +#if CONFIG_LTABLE_STATS g_prepost_table.npreposts += 1; #endif @@ -2440,8 +1520,8 @@ static void wq_prepost_do_post_locked(struct waitq_set *wqset, * Stats collection / reporting * * ---------------------------------------------------------------------- */ -#if CONFIG_WAITQ_STATS -static void wq_table_stats(struct wq_table *table, struct wq_table_stats *stats) +#if defined(CONFIG_LTABLE_STATS) && defined(CONFIG_WAITQ_STATS) +static void wq_table_stats(struct link_table *table, struct wq_table_stats *stats) { stats->version = WAITQ_STATS_VERSION; stats->table_elements = table->nelem; @@ -2464,7 +1544,7 @@ void waitq_link_stats(struct wq_table_stats *stats) { if (!stats) return; - wq_table_stats(&g_linktable, stats); + wq_table_stats(&g_wqlinktable, stats); } void waitq_prepost_stats(struct wq_table_stats *stats) @@ -2489,25 +1569,9 @@ static uint32_t g_num_waitqs = 1; */ #define _CAST_TO_EVENT_MASK(event) ((uintptr_t)(event) & ((1ul << _EVENT_MASK_BITS) - 1ul)) -/* - * The Jenkins "one at a time" hash. 
- * TBD: There may be some value to unrolling here, - * depending on the architecture. - */ static __inline__ uint32_t waitq_hash(char *key, size_t length) { - uint32_t hash = 0; - size_t i; - - for (i = 0; i < length; i++) { - hash += key[i]; - hash += (hash << 10); - hash ^= (hash >> 6); - } - - hash += (hash << 3); - hash ^= (hash >> 11); - hash += (hash << 15); + uint32_t hash = jenkins_hash(key, length); hash &= (g_num_waitqs - 1); return hash; @@ -2526,11 +1590,9 @@ struct waitq *global_waitq(int index) } -#if CONFIG_WAITQ_STATS +#if defined(CONFIG_LTABLE_STATS) || defined(CONFIG_WAITQ_STATS) /* this global is for lldb */ const uint32_t g_nwaitq_btframes = NWAITQ_BTFRAMES; -struct wq_stats g_boot_stats; -struct wq_stats *g_waitq_stats = &g_boot_stats; static __inline__ void waitq_grab_backtrace(uintptr_t bt[NWAITQ_BTFRAMES], int skip) { @@ -2538,9 +1600,17 @@ static __inline__ void waitq_grab_backtrace(uintptr_t bt[NWAITQ_BTFRAMES], int s if (skip < 0) skip = 0; memset(buf, 0, (NWAITQ_BTFRAMES + skip) * sizeof(uintptr_t)); - fastbacktrace(buf, g_nwaitq_btframes + skip); + backtrace(buf, g_nwaitq_btframes + skip); memcpy(&bt[0], &buf[skip], NWAITQ_BTFRAMES * sizeof(uintptr_t)); } +#else /* no stats */ +#define waitq_grab_backtrace(...) 
+#endif + +#if CONFIG_WAITQ_STATS + +struct wq_stats g_boot_stats; +struct wq_stats *g_waitq_stats = &g_boot_stats; static __inline__ struct wq_stats *waitq_global_stats(struct waitq *waitq) { struct wq_stats *wqs; @@ -2591,7 +1661,7 @@ static __inline__ void waitq_stats_count_fail(struct waitq *waitq) waitq_grab_backtrace(wqs->last_failed_wakeup, 2); } } -#else +#else /* !CONFIG_WAITQ_STATS */ #define waitq_stats_count_wait(q) do { } while (0) #define waitq_stats_count_wakeup(q) do { } while (0) #define waitq_stats_count_clear_wakeup(q) do { } while (0) @@ -2600,12 +1670,12 @@ static __inline__ void waitq_stats_count_fail(struct waitq *waitq) int waitq_is_valid(struct waitq *waitq) { - return (waitq != NULL) && ((waitq->waitq_type & ~1) == WQT_QUEUE); + return (waitq != NULL) && waitq->waitq_isvalid && ((waitq->waitq_type & ~1) == WQT_QUEUE); } int waitq_set_is_valid(struct waitq_set *wqset) { - return (wqset != NULL) && waitqs_is_set(wqset); + return (wqset != NULL) && wqset->wqset_q.waitq_isvalid && waitqs_is_set(wqset); } int waitq_is_global(struct waitq *waitq) @@ -2628,7 +1698,7 @@ static uint32_t waitq_hash_size(void) if (PE_parse_boot_argn("wqsize", &hsize, sizeof(hsize))) return (hsize); - queues = thread_max / 11; + queues = thread_max / 5; hsize = P2ROUNDUP(queues * sizeof(struct waitq), PAGE_SIZE); return hsize; @@ -2637,11 +1707,12 @@ static uint32_t waitq_hash_size(void) void waitq_bootstrap(void) { kern_return_t kret; - uint32_t whsize, qsz; + uint32_t whsize, qsz, tmp32; - wq_table_bootstrap(); - lt_init(); - wqp_init(); + g_min_free_table_elem = DEFAULT_MIN_FREE_TABLE_ELEM; + if (PE_parse_boot_argn("wqt_min_free", &tmp32, sizeof(tmp32)) == TRUE) + g_min_free_table_elem = tmp32; + wqdbg("Minimum free table elements: %d", tmp32); /* * Determine the amount of memory we're willing to reserve for @@ -2690,12 +1761,17 @@ void waitq_bootstrap(void) waitq_init(&global_waitqs[i], SYNC_POLICY_FIFO|SYNC_POLICY_DISABLE_IRQ); } - waitq_set_zone = 
zinit(sizeof(struct waitq_set), WAITQ_SET_MAX * sizeof(struct waitq_set), sizeof(struct waitq_set), "waitq sets"); zone_change(waitq_set_zone, Z_NOENCRYPT, TRUE); + + /* initialize the global waitq link table */ + wql_init(); + + /* initialize the global waitq prepost table */ + wqp_init(); } @@ -2720,19 +1796,13 @@ void waitq_bootstrap(void) void waitq_lock(struct waitq *wq) { - if (__improbable(hw_lock_to(&(wq)->waitq_interlock, + if (__improbable(waitq_lock_to(wq, hwLockTimeOut * 2) == 0)) { boolean_t wql_acquired = FALSE; while (machine_timeout_suspended()) { -#if defined(__i386__) || defined(__x86_64__) - /* - * i386/x86_64 return with preemption disabled on a - * timeout for diagnostic purposes. - */ mp_enable_preemption(); -#endif - wql_acquired = hw_lock_to(&(wq)->waitq_interlock, + wql_acquired = waitq_lock_to(wq, hwLockTimeOut * 2); if (wql_acquired) break; @@ -2741,13 +1811,19 @@ void waitq_lock(struct waitq *wq) panic("waitq deadlock - waitq=%p, cpu=%d\n", wq, cpu_number()); } +#if defined(__x86_64__) + pltrace(FALSE); +#endif assert(waitq_held(wq)); } void waitq_unlock(struct waitq *wq) { assert(waitq_held(wq)); - hw_lock_unlock(&(wq)->waitq_interlock); +#if defined(__x86_64__) + pltrace(TRUE); +#endif + waitq_lock_unlock(wq); } @@ -2802,30 +1878,27 @@ static void do_waitq_select_n_locked(struct waitq_select_args *args); * onto the waitq set pointed to by 'link'. 
*/ static int waitq_select_walk_cb(struct waitq *waitq, void *ctx, - struct setid_link *link) + struct waitq_link *link) { int ret = WQ_ITERATE_CONTINUE; struct waitq_select_args args = *((struct waitq_select_args *)ctx); struct waitq_set *wqset; - int get_spl = 0; - spl_t set_spl; (void)waitq; - assert(sl_type(link) == SLT_WQS); + assert(wql_type(link) == WQL_WQS); - wqset = link->sl_wqs.sl_set; + wqset = link->wql_wqs.wql_set; args.waitq = &wqset->wqset_q; - if (!waitq_irq_safe(waitq) && waitq_irq_safe(&wqset->wqset_q)) { - get_spl = 1; - set_spl = splsched(); - } + assert(!waitq_irq_safe(waitq)); + assert(!waitq_irq_safe(&wqset->wqset_q)); + waitq_set_lock(wqset); /* * verify that the link wasn't invalidated just before * we were able to take the lock. */ - if (wqset->wqset_id != link->sl_set_id.id) + if (wqset->wqset_id != link->wql_setid.id) goto out_unlock; /* @@ -2849,20 +1922,24 @@ static int waitq_select_walk_cb(struct waitq *waitq, void *ctx, * if wqset can handle preposts and the event is set to 0. * We also make sure to not post waitq sets to other sets. * - * In the future, we may consider an optimization to prepost - * 'args.posted_waitq' directly to 'wqset' to avoid - * unnecessary data structure manipulations in the kqueue path + * If the set doesn't support preposts, but does support + * prepost callout/hook interaction, invoke the predefined + * callout function and pass the set's 'prepost_hook.' This + * could potentially release another thread to handle events. 
*/ - if (args.event == NO_EVENT64 && waitq_set_can_prepost(wqset)) { - wq_prepost_do_post_locked(wqset, waitq, - args.reserved_preposts); + if (args.event == NO_EVENT64) { + if (waitq_set_can_prepost(wqset)) { + wq_prepost_do_post_locked( + wqset, waitq, args.reserved_preposts); + } else if (waitq_set_has_prepost_hook(wqset)) { + waitq_set__CALLING_PREPOST_HOOK__( + wqset->wqset_prepost_hook, waitq, 0); + } } } out_unlock: waitq_set_unlock(wqset); - if (get_spl) - splx(set_spl); return ret; } @@ -2887,66 +1964,87 @@ static void do_waitq_select_n_locked(struct waitq_select_args *args) struct waitq *waitq = args->waitq; int max_threads = args->max_threads; thread_t thread = THREAD_NULL, first_thread = THREAD_NULL; - int global_q = 0; - unsigned long eventmask = 0; + struct waitq *safeq; + uint32_t remaining_eventmask = 0; + uint32_t eventmask; int *nthreads = args->nthreads; + spl_t spl = 0; assert(max_threads != 0); - global_q = waitq_is_global(waitq); - if (global_q) { + if (!waitq_irq_safe(waitq)) { + /* JMM - add flag to waitq to avoid global lookup if no waiters */ + eventmask = _CAST_TO_EVENT_MASK(waitq); + safeq = global_eventq(waitq); + if (*nthreads == 0) + spl = splsched(); + waitq_lock(safeq); + } else { eventmask = _CAST_TO_EVENT_MASK(args->event); - /* make sure this waitq accepts this event mask */ - if ((waitq->waitq_eventmask & eventmask) != eventmask) - return; - eventmask = 0; + safeq = waitq; } - /* look through each thread waiting directly on the waitq */ - qe_foreach_element_safe(thread, &waitq->waitq_queue, links) { - thread_t t = THREAD_NULL; - assert(thread->waitq == waitq); - if (thread->wait_event == args->event) { - t = thread; - if (first_thread == THREAD_NULL) - first_thread = thread; - - /* allow the caller to futher refine the selection */ - if (args->select_cb) - t = args->select_cb(args->select_ctx, waitq, - global_q, thread); - if (t != THREAD_NULL) { - *nthreads += 1; - if (args->threadq) { - if (*nthreads == 1) - *(args->spl) = 
splsched(); - thread_lock(t); - thread_clear_waitq_state(t); - /* put locked thread on output queue */ - re_queue_tail(args->threadq, &t->links); + /* + * If the safeq doesn't have an eventmask (not global) or the event + * we're looking for IS set in its eventmask, then scan the threads + * in that queue for ones that match the original pair. + */ + if (!waitq_is_global(safeq) || + (safeq->waitq_eventmask & eventmask) == eventmask) { + + /* look through each thread waiting directly on the safeq */ + qe_foreach_element_safe(thread, &safeq->waitq_queue, wait_links) { + thread_t t = THREAD_NULL; + assert_thread_magic(thread); + + if (thread->waitq == waitq && thread->wait_event == args->event) { + t = thread; + if (first_thread == THREAD_NULL) + first_thread = thread; + + /* allow the caller to futher refine the selection */ + if (args->select_cb) + t = args->select_cb(args->select_ctx, waitq, + waitq_is_global(waitq), thread); + if (t != THREAD_NULL) { + *nthreads += 1; + if (args->threadq) { + if (*nthreads == 1) + *(args->spl) = (safeq != waitq) ? spl : splsched(); + thread_lock(t); + thread_clear_waitq_state(t); + /* put locked thread on output queue */ + re_queue_tail(args->threadq, &t->wait_links); + } + /* only enqueue up to 'max' threads */ + if (*nthreads >= max_threads && max_threads > 0) + break; } - /* only enqueue up to 'max' threads */ - if (*nthreads >= max_threads && max_threads > 0) - break; + } + /* thread wasn't selected so track it's event */ + if (t == THREAD_NULL) { + remaining_eventmask |= (thread->waitq != safeq) ? 
+ _CAST_TO_EVENT_MASK(thread->waitq): + _CAST_TO_EVENT_MASK(thread->wait_event); } } - /* thread wasn't selected, and the waitq is global */ - if (t == THREAD_NULL && global_q) - eventmask |= _CAST_TO_EVENT_MASK(thread->wait_event); - } - /* - * Update the eventmask of global queues: - * - If we selected all the threads in the queue, or we selected zero - * threads on the queue, set the eventmask to the calculated value - * (potentially 0 if we selected them all) - * - If we just pulled out a subset of threads from the queue, then we - * can't assume the calculated mask is complete (because we may not - * have made it through all the threads in the queue), so we have to - * leave it alone. - */ - if (global_q && (queue_empty(&waitq->waitq_queue) || *nthreads == 0)) - waitq->waitq_eventmask = (typeof(waitq->waitq_eventmask))eventmask; + /* + * Update the eventmask of global queues we just scanned: + * - If we selected all the threads in the queue, we can clear its + * eventmask. + * + * - If we didn't find enough threads to fill our needs, then we can + * assume we looked at every thread in the queue and the mask we + * computed is complete - so reset it. + */ + if (waitq_is_global(safeq)) { + if (queue_empty(&safeq->waitq_queue)) + safeq->waitq_eventmask = 0; + else if (max_threads < 0 || *nthreads < max_threads) + safeq->waitq_eventmask = remaining_eventmask; + } + } /* * Grab the first thread in the queue if no other thread was selected. @@ -2956,16 +2054,23 @@ static void do_waitq_select_n_locked(struct waitq_select_args *args) if (*nthreads == 0 && first_thread != THREAD_NULL && args->threadq) { /* we know this is the first (and only) thread */ ++(*nthreads); - *(args->spl) = splsched(); + *(args->spl) = (safeq != waitq) ? 
spl : splsched(); thread_lock(first_thread); thread_clear_waitq_state(first_thread); - re_queue_tail(args->threadq, &first_thread->links); + re_queue_tail(args->threadq, &first_thread->wait_links); - /* update the eventmask on global queues */ - if (global_q && queue_empty(&waitq->waitq_queue)) - waitq->waitq_eventmask = 0; + /* update the eventmask on [now] empty global queues */ + if (waitq_is_global(safeq) && queue_empty(&safeq->waitq_queue)) + safeq->waitq_eventmask = 0; } + /* unlock the safe queue if we locked one above */ + if (safeq != waitq) { + waitq_unlock(safeq); + if (*nthreads == 0) + splx(spl); + } + if (max_threads > 0 && *nthreads >= max_threads) return; @@ -2977,14 +2082,14 @@ static void do_waitq_select_n_locked(struct waitq_select_args *args) return; /* check to see if the set ID for this wait queue is valid */ - struct setid_link *link = lt_get_link(waitq->waitq_set_id); + struct waitq_link *link = wql_get_link(waitq->waitq_set_id); if (!link) { /* the waitq set to which this waitq belonged, has been invalidated */ waitq->waitq_set_id = 0; return; } - lt_put_link(link); + wql_put_link(link); /* * If this waitq is a member of any wait queue sets, we need to look @@ -2994,8 +2099,8 @@ static void do_waitq_select_n_locked(struct waitq_select_args *args) * Note that we do a local walk of this waitq's links - we manually * recurse down wait queue set's with non-zero wqset_q.waitq_set_id */ - (void)walk_setid_links(LINK_WALK_ONE_LEVEL, waitq, waitq->waitq_set_id, - SLT_WQS, (void *)args, waitq_select_walk_cb); + (void)walk_waitq_links(LINK_WALK_ONE_LEVEL, waitq, waitq->waitq_set_id, + WQL_WQS, (void *)args, waitq_select_walk_cb); } /** @@ -3100,8 +2205,10 @@ static thread_t waitq_select_one_cb(void *ctx, struct waitq *waitq, return THREAD_NULL; } + + /** - * select a single thread from a waitq that's waiting for a given event + * select from a waitq a single thread waiting for a given event * * Conditions: * 'waitq' is locked @@ -3115,21 +2222,20 @@ 
static thread_t waitq_select_one_locked(struct waitq *waitq, event64_t event, uint64_t *reserved_preposts, int priority, spl_t *spl) { + (void)priority; int nthreads; queue_head_t threadq; - (void)priority; - queue_init(&threadq); nthreads = waitq_select_n_locked(waitq, event, waitq_select_one_cb, NULL, - reserved_preposts, &threadq, 1, spl); + reserved_preposts, &threadq, 1, spl); /* if we selected a thread, return it (still locked) */ if (!queue_empty(&threadq)) { thread_t t; queue_entry_t qe = dequeue_head(&threadq); - t = qe_element(qe, struct thread, links); + t = qe_element(qe, struct thread, wait_links); assert(queue_empty(&threadq)); /* there should be 1 entry */ /* t has been locked and removed from all queues */ return t; @@ -3138,6 +2244,94 @@ static thread_t waitq_select_one_locked(struct waitq *waitq, event64_t event, return THREAD_NULL; } +struct find_max_pri_ctx { + integer_t max_sched_pri; + integer_t max_base_pri; + thread_t highest_thread; +}; + +/** + * callback function that finds the max priority thread + * + * Conditions: + * 'waitq' is locked + * 'thread' is not locked + */ +static thread_t +waitq_find_max_pri_cb(void *ctx_in, + __unused struct waitq *waitq, + __unused int is_global, + thread_t thread) +{ + struct find_max_pri_ctx *ctx = (struct find_max_pri_ctx *)ctx_in; + + /* + * thread is not locked, use pri as a hint only + * wake up the highest base pri, and find the highest sched pri at that base pri + */ + integer_t sched_pri = *(volatile int16_t *)&thread->sched_pri; + integer_t base_pri = *(volatile int16_t *)&thread->base_pri; + + if (ctx->highest_thread == THREAD_NULL || + (base_pri > ctx->max_base_pri) || + (base_pri == ctx->max_base_pri && sched_pri > ctx->max_sched_pri)) { + /* don't select the thread, just update ctx */ + + ctx->max_sched_pri = sched_pri; + ctx->max_base_pri = base_pri; + ctx->highest_thread = thread; + } + + return THREAD_NULL; +} + +/** + * select from a waitq the highest priority thread waiting for a given 
event + * + * Conditions: + * 'waitq' is locked + * + * Returns: + * A locked thread that's been removed from the waitq, but has not + * yet been put on a run queue. Caller is responsible to call splx + * with the '*spl' value. + */ +static thread_t +waitq_select_max_locked(struct waitq *waitq, event64_t event, + uint64_t *reserved_preposts, + spl_t *spl) +{ + __assert_only int nthreads; + assert(!waitq->waitq_set_id); /* doesn't support recursive sets */ + + struct find_max_pri_ctx ctx = { + .max_sched_pri = 0, + .max_base_pri = 0, + .highest_thread = THREAD_NULL, + }; + + /* + * Scan the waitq to find the highest priority thread. + * This doesn't remove any thread from the queue + */ + nthreads = waitq_select_n_locked(waitq, event, waitq_find_max_pri_cb, &ctx, + reserved_preposts, NULL, 1, spl); + + assert(nthreads == 0); + + if (ctx.highest_thread != THREAD_NULL) { + __assert_only kern_return_t ret; + + /* Remove only the thread we just found */ + ret = waitq_select_thread_locked(waitq, event, ctx.highest_thread, spl); + + assert(ret == KERN_SUCCESS); + return ctx.highest_thread; + } + + return THREAD_NULL; +} + struct select_thread_ctx { thread_t thread; @@ -3160,46 +2354,58 @@ struct select_thread_ctx { * in ctx->spl. 
*/ static int waitq_select_thread_cb(struct waitq *waitq, void *ctx, - struct setid_link *link) + struct waitq_link *link) { struct select_thread_ctx *stctx = (struct select_thread_ctx *)ctx; struct waitq_set *wqset; + struct waitq *wqsetq; + struct waitq *safeq; + spl_t s; (void)waitq; - + thread_t thread = stctx->thread; event64_t event = stctx->event; - if (sl_type(link) != SLT_WQS) + if (wql_type(link) != WQL_WQS) return WQ_ITERATE_CONTINUE; - wqset = link->sl_wqs.sl_set; + wqset = link->wql_wqs.wql_set; + wqsetq = &wqset->wqset_q; - if (!waitq_irq_safe(waitq) && waitq_irq_safe(&wqset->wqset_q)) { - *(stctx->spl) = splsched(); - waitq_set_lock(wqset); - thread_lock(thread); - } else { - waitq_set_lock(wqset); - *(stctx->spl) = splsched(); - thread_lock(thread); - } + assert(!waitq_irq_safe(waitq)); + assert(!waitq_irq_safe(wqsetq)); + + waitq_set_lock(wqset); + + s = splsched(); - if ((thread->waitq == &wqset->wqset_q) - && (thread->wait_event == event)) { - remqueue(&thread->links); + /* find and lock the interrupt-safe waitq the thread is thought to be on */ + safeq = global_eventq(wqsetq); + waitq_lock(safeq); + + thread_lock(thread); + + if ((thread->waitq == wqsetq) && (thread->wait_event == event)) { + remqueue(&thread->wait_links); + if (queue_empty(&safeq->waitq_queue)) { + safeq->waitq_eventmask = 0; + } thread_clear_waitq_state(thread); + waitq_unlock(safeq); + waitq_set_unlock(wqset); /* * thread still locked, * return non-zero to break out of WQS walk */ - waitq_set_unlock(wqset); + *(stctx->spl) = s; return WQ_ITERATE_FOUND; } thread_unlock(thread); waitq_set_unlock(wqset); - splx(*(stctx->spl)); + waitq_unlock(safeq); + splx(s); return WQ_ITERATE_CONTINUE; } @@ -3216,28 +2422,47 @@ static kern_return_t waitq_select_thread_locked(struct waitq *waitq, event64_t event, thread_t thread, spl_t *spl) { - struct setid_link *link; + struct waitq *safeq; + struct waitq_link *link; struct select_thread_ctx ctx; kern_return_t kr; + spl_t s; + + s = 
splsched(); + + /* Find and lock the interrupts disabled queue the thread is actually on */ + if (!waitq_irq_safe(waitq)) { + safeq = global_eventq(waitq); + waitq_lock(safeq); + } else { + safeq = waitq; + } - *spl = splsched(); thread_lock(thread); if ((thread->waitq == waitq) && (thread->wait_event == event)) { - remqueue(&thread->links); + remqueue(&thread->wait_links); + if (queue_empty(&safeq->waitq_queue)) { + safeq->waitq_eventmask = 0; + } thread_clear_waitq_state(thread); + *spl = s; /* thread still locked */ return KERN_SUCCESS; } thread_unlock(thread); - splx(*spl); + + if (safeq != waitq) + waitq_unlock(safeq); + + splx(s); if (!waitq->waitq_set_id) return KERN_NOT_WAITING; /* check to see if the set ID for this wait queue is valid */ - link = lt_get_link(waitq->waitq_set_id); + link = wql_get_link(waitq->waitq_set_id); if (!link) { /* the waitq to which this set belonged, has been invalidated */ waitq->waitq_set_id = 0; @@ -3253,10 +2478,10 @@ static kern_return_t waitq_select_thread_locked(struct waitq *waitq, ctx.thread = thread; ctx.event = event; ctx.spl = spl; - kr = walk_setid_links(LINK_WALK_FULL_DAG, waitq, waitq->waitq_set_id, - SLT_WQS, (void *)&ctx, waitq_select_thread_cb); + kr = walk_waitq_links(LINK_WALK_FULL_DAG, waitq, waitq->waitq_set_id, + WQL_WQS, (void *)&ctx, waitq_select_thread_cb); - lt_put_link(link); + wql_put_link(link); /* we found a thread, return success */ if (kr == WQ_ITERATE_FOUND) @@ -3279,7 +2504,6 @@ static int prepost_exists_cb(struct waitq_set __unused *wqset, * * Conditions: * 'waitq' is locked - * 'thread' is locked */ wait_result_t waitq_assert_wait64_locked(struct waitq *waitq, event64_t wait_event, @@ -3291,11 +2515,16 @@ wait_result_t waitq_assert_wait64_locked(struct waitq *waitq, { wait_result_t wait_result; int realtime = 0; + struct waitq *safeq; + uintptr_t eventmask; + spl_t s; + /* * Warning: Do _not_ place debugging print statements here. - * The thread is locked! + * The waitq is locked! 
*/ + assert(!thread->started || thread == current_thread()); if (thread->waitq != NULL) panic("thread already waiting on %p", thread->waitq); @@ -3316,12 +2545,34 @@ wait_result_t waitq_assert_wait64_locked(struct waitq *waitq, ret = wq_prepost_foreach_locked(wqset, NULL, prepost_exists_cb); if (ret == WQ_ITERATE_FOUND) { + s = splsched(); + thread_lock(thread); thread->wait_result = THREAD_AWAKENED; + thread_unlock(thread); + splx(s); return THREAD_AWAKENED; } } } + s = splsched(); + + /* + * If already dealing with an irq safe wait queue, we are all set. + * Otherwise, determine a global queue to use and lock it. + */ + if (!waitq_irq_safe(waitq)) { + safeq = global_eventq(waitq); + eventmask = _CAST_TO_EVENT_MASK(waitq); + waitq_lock(safeq); + } else { + safeq = waitq; + eventmask = _CAST_TO_EVENT_MASK(wait_event); + } + + /* lock the thread now that we have the irq-safe waitq locked */ + thread_lock(thread); + /* * Realtime threads get priority for wait queue placements. * This allows wait_queue_wakeup_one to prefer a waiting @@ -3342,17 +2593,20 @@ wait_result_t waitq_assert_wait64_locked(struct waitq *waitq, wait_result = thread_mark_wait_locked(thread, interruptible); /* thread->wait_result has been set */ if (wait_result == THREAD_WAITING) { - if (!waitq->waitq_fifo + + if (!safeq->waitq_fifo || (thread->options & TH_OPT_VMPRIV) || realtime) - enqueue_head(&waitq->waitq_queue, &thread->links); + enqueue_head(&safeq->waitq_queue, &thread->wait_links); else - enqueue_tail(&waitq->waitq_queue, &thread->links); + enqueue_tail(&safeq->waitq_queue, &thread->wait_links); + /* mark the event and real waitq, even if enqueued on a global safeq */ thread->wait_event = wait_event; thread->waitq = waitq; if (deadline != 0) { boolean_t act; + act = timer_call_enter_with_leeway(&thread->wait_timer, NULL, deadline, leeway, @@ -3362,13 +2616,22 @@ wait_result_t waitq_assert_wait64_locked(struct waitq *waitq, thread->wait_timer_is_set = TRUE; } - if (waitq_is_global(waitq)) 
- waitq->waitq_eventmask = waitq->waitq_eventmask - | _CAST_TO_EVENT_MASK(wait_event); + if (waitq_is_global(safeq)) + safeq->waitq_eventmask |= eventmask; waitq_stats_count_wait(waitq); } + /* unlock the thread */ + thread_unlock(thread); + + /* unlock the safeq if we locked it here */ + if (safeq != waitq) { + waitq_unlock(safeq); + } + + splx(s); + return wait_result; } @@ -3376,7 +2639,6 @@ wait_result_t waitq_assert_wait64_locked(struct waitq *waitq, * remove 'thread' from its current blocking state on 'waitq' * * Conditions: - * 'waitq' is locked * 'thread' is locked * * Notes: @@ -3384,18 +2646,37 @@ wait_result_t waitq_assert_wait64_locked(struct waitq *waitq, * sched_prim.c from the thread timer wakeup path * (i.e. the thread was waiting on 'waitq' with a timeout that expired) */ -void waitq_pull_thread_locked(struct waitq *waitq, thread_t thread) +int waitq_pull_thread_locked(struct waitq *waitq, thread_t thread) { - (void)waitq; + struct waitq *safeq; + + assert_thread_magic(thread); assert(thread->waitq == waitq); - remqueue(&thread->links); + /* Find the interrupts disabled queue thread is waiting on */ + if (!waitq_irq_safe(waitq)) { + safeq = global_eventq(waitq); + } else { + safeq = waitq; + } + + /* thread is already locked so have to try for the waitq lock */ + if (!waitq_lock_try(safeq)) + return 0; + + remqueue(&thread->wait_links); thread_clear_waitq_state(thread); waitq_stats_count_clear_wakeup(waitq); /* clear the global event mask if this was the last thread there! 
*/ - if (waitq_is_global(waitq) && queue_empty(&waitq->waitq_queue)) - waitq->waitq_eventmask = 0; + if (waitq_is_global(safeq) && queue_empty(&safeq->waitq_queue)) { + safeq->waitq_eventmask = 0; + /* JMM - also mark no-waiters on waitq (if not the same as the safeq) */ + } + + waitq_unlock(safeq); + + return 1; } @@ -3518,14 +2799,15 @@ kern_return_t waitq_wakeup64_all_locked(struct waitq *waitq, ret = KERN_NOT_WAITING; #if CONFIG_WAITQ_STATS - qe_foreach_element(thread, &wakeup_queue, links) + qe_foreach_element(thread, &wakeup_queue, wait_links) waitq_stats_count_wakeup(waitq); #endif if (lock_state == WAITQ_UNLOCK) waitq_unlock(waitq); - qe_foreach_element_safe(thread, &wakeup_queue, links) { - remqueue(&thread->links); + qe_foreach_element_safe(thread, &wakeup_queue, wait_links) { + assert_thread_magic(thread); + remqueue(&thread->wait_links); maybe_adjust_thread_pri(thread, priority); ret = thread_go(thread, result); assert(ret == KERN_SUCCESS); @@ -3560,9 +2842,16 @@ kern_return_t waitq_wakeup64_one_locked(struct waitq *waitq, assert(waitq_held(waitq)); - thread = waitq_select_one_locked(waitq, wake_event, - reserved_preposts, - priority, &th_spl); + if (priority == WAITQ_SELECT_MAX_PRI) { + thread = waitq_select_max_locked(waitq, wake_event, + reserved_preposts, + &th_spl); + } else { + thread = waitq_select_one_locked(waitq, wake_event, + reserved_preposts, + priority, &th_spl); + } + if (thread != THREAD_NULL) waitq_stats_count_wakeup(waitq); @@ -3596,20 +2885,28 @@ kern_return_t waitq_wakeup64_one_locked(struct waitq *waitq, * been disabled, and the caller is responsible to call * splx() with the returned '*spl' value. 
*/ -thread_t waitq_wakeup64_identity_locked(struct waitq *waitq, - event64_t wake_event, - wait_result_t result, - spl_t *spl, - uint64_t *reserved_preposts, - waitq_lock_state_t lock_state) +thread_t +waitq_wakeup64_identify_locked(struct waitq *waitq, + event64_t wake_event, + wait_result_t result, + spl_t *spl, + uint64_t *reserved_preposts, + int priority, + waitq_lock_state_t lock_state) { thread_t thread; assert(waitq_held(waitq)); - thread = waitq_select_one_locked(waitq, wake_event, - reserved_preposts, - WAITQ_ALL_PRIORITIES, spl); + if (priority == WAITQ_SELECT_MAX_PRI) { + thread = waitq_select_max_locked(waitq, wake_event, + reserved_preposts, + spl); + } else { + thread = waitq_select_one_locked(waitq, wake_event, + reserved_preposts, + priority, spl); + } if (thread != THREAD_NULL) waitq_stats_count_wakeup(waitq); @@ -3652,6 +2949,7 @@ kern_return_t waitq_wakeup64_thread_locked(struct waitq *waitq, spl_t th_spl; assert(waitq_held(waitq)); + assert_thread_magic(thread); /* * See if the thread was still waiting there. If so, it got @@ -3706,9 +3004,10 @@ kern_return_t waitq_init(struct waitq *waitq, int policy) waitq->waitq_set_id = 0; waitq->waitq_prepost_id = 0; - hw_lock_init(&waitq->waitq_interlock); + waitq_lock_init(waitq); queue_init(&waitq->waitq_queue); + waitq->waitq_isvalid = 1; return KERN_SUCCESS; } @@ -3721,26 +3020,24 @@ static int waitq_unlink_prepost_cb(struct waitq_set __unused *wqset, void *ctx, struct wq_prepost *wqp, struct waitq *waitq); /** - * walk_setid_links callback to invalidate 'link' parameter + * walk_waitq_links callback to invalidate 'link' parameter * * Conditions: - * Called from walk_setid_links. + * Called from walk_waitq_links. * Note that unlink other callbacks, this one make no assumptions about * the 'waitq' parameter, specifically it does not have to be locked or * even valid. 
*/ static int waitq_unlink_all_cb(struct waitq *waitq, void *ctx, - struct setid_link *link) + struct waitq_link *link) { (void)waitq; (void)ctx; - if (sl_type(link) == SLT_LINK && sl_is_valid(link)) - lt_invalidate(link); + if (wql_type(link) == WQL_LINK && wql_is_valid(link)) + wql_invalidate(link); - if (sl_type(link) == SLT_WQS) { + if (wql_type(link) == WQL_WQS) { struct waitq_set *wqset; - int do_spl = 0; - spl_t spl; struct wq_unlink_ctx ulctx; /* @@ -3752,14 +3049,10 @@ static int waitq_unlink_all_cb(struct waitq *waitq, void *ctx, if (waitq->waitq_prepost_id == 0) goto out; - wqset = link->sl_wqs.sl_set; + wqset = link->wql_wqs.wql_set; assert(wqset != NULL); + assert(!waitq_irq_safe(&wqset->wqset_q)); - if (waitq_set_is_valid(wqset) && - waitq_irq_safe(&wqset->wqset_q)) { - spl = splsched(); - do_spl = 1; - } waitq_set_lock(wqset); if (!waitq_set_is_valid(wqset)) { @@ -3775,8 +3068,6 @@ static int waitq_unlink_all_cb(struct waitq *waitq, void *ctx, waitq_unlink_prepost_cb); out_unlock: waitq_set_unlock(wqset); - if (do_spl) - splx(spl); } out: @@ -3789,32 +3080,41 @@ static int waitq_unlink_all_cb(struct waitq *waitq, void *ctx, */ void waitq_deinit(struct waitq *waitq) { - uint64_t setid = 0; spl_t s; - if (!waitq_valid(waitq)) + if (!waitq || !waitq_is_queue(waitq)) return; if (waitq_irq_safe(waitq)) s = splsched(); waitq_lock(waitq); - if (!waitq_valid(waitq)) - goto out; + if (!waitq_valid(waitq)) { + waitq_unlock(waitq); + if (waitq_irq_safe(waitq)) + splx(s); + return; + } - waitq_unlink_all_locked(waitq, &setid, &s, NULL); waitq->waitq_type = WQT_INVALID; - assert(queue_empty(&waitq->waitq_queue)); + waitq->waitq_isvalid = 0; -out: - waitq_unlock(waitq); - if (waitq_irq_safe(waitq)) + if (!waitq_irq_safe(waitq)) { + waitq_unlink_all_unlock(waitq); + /* waitq unlocked and set links deallocated */ + } else { + waitq_unlock(waitq); splx(s); + } - if (setid) - (void)walk_setid_links(LINK_WALK_ONE_LEVEL, waitq, setid, - SLT_ALL, NULL, 
waitq_unlink_all_cb); + assert(queue_empty(&waitq->waitq_queue)); } +void waitq_invalidate_locked(struct waitq *waitq) +{ + assert(waitq_held(waitq)); + assert(waitq_is_valid(waitq)); + waitq->waitq_isvalid = 0; +} /** * invalidate the given wq_prepost object @@ -3843,7 +3143,7 @@ static int wqset_clear_prepost_chain_cb(struct waitq_set __unused *wqset, * allocated / initialized waitq_set object * NULL on failure */ -struct waitq_set *waitq_set_alloc(int policy) +struct waitq_set *waitq_set_alloc(int policy, void *prepost_hook) { struct waitq_set *wqset; @@ -3852,7 +3152,7 @@ struct waitq_set *waitq_set_alloc(int policy) panic("Can't allocate a new waitq set from zone %p", waitq_set_zone); kern_return_t ret; - ret = waitq_set_init(wqset, policy, NULL); + ret = waitq_set_init(wqset, policy, NULL, prepost_hook); if (ret != KERN_SUCCESS) { zfree(waitq_set_zone, wqset); wqset = NULL; @@ -3869,9 +3169,10 @@ struct waitq_set *waitq_set_alloc(int policy) * no 'reserved_link' object is passed. */ kern_return_t waitq_set_init(struct waitq_set *wqset, - int policy, uint64_t *reserved_link) + int policy, uint64_t *reserved_link, + void *prepost_hook) { - struct setid_link *link; + struct waitq_link *link; kern_return_t ret; memset(wqset, 0, sizeof(*wqset)); @@ -3881,27 +3182,30 @@ kern_return_t waitq_set_init(struct waitq_set *wqset, return ret; wqset->wqset_q.waitq_type = WQT_SET; - if (policy & SYNC_POLICY_PREPOST) + if (policy & SYNC_POLICY_PREPOST) { wqset->wqset_q.waitq_prepost = 1; - else + wqset->wqset_prepost_id = 0; + assert(prepost_hook == NULL); + } else { wqset->wqset_q.waitq_prepost = 0; + wqset->wqset_prepost_hook = prepost_hook; + } if (reserved_link && *reserved_link != 0) { - link = lt_get_reserved(*reserved_link, SLT_WQS); + link = wql_get_reserved(*reserved_link, WQL_WQS); /* always consume the caller's reference */ *reserved_link = 0; } else { - link = lt_alloc_link(SLT_WQS); + link = wql_alloc_link(WQL_WQS); } if (!link) panic("Can't allocate link object 
for waitq set: %p", wqset); - link->sl_wqs.sl_set = wqset; - sl_set_valid(link); + link->wql_wqs.wql_set = wqset; + wql_mkvalid(link); - wqset->wqset_id = link->sl_set_id.id; - wqset->wqset_prepost_id = 0; - lt_put_link(link); + wqset->wqset_id = link->wql_setid.id; + wql_put_link(link); return KERN_SUCCESS; } @@ -3917,48 +3221,35 @@ kern_return_t waitq_set_init(struct waitq_set *wqset, */ void waitq_set_deinit(struct waitq_set *wqset) { - struct setid_link *link = NULL; - uint64_t set_id, set_links_id, prepost_id; - int do_spl = 0; - spl_t s; + struct waitq_link *link = NULL; + uint64_t set_id, prepost_id; if (!waitqs_is_set(wqset)) panic("trying to de-initialize an invalid wqset @%p", wqset); - if (waitq_irq_safe(&wqset->wqset_q)) { - s = splsched(); - do_spl = 1; - } + assert(!waitq_irq_safe(&wqset->wqset_q)); waitq_set_lock(wqset); set_id = wqset->wqset_id; /* grab the set's link object */ - link = lt_get_link(set_id); + link = wql_get_link(set_id); if (link) - lt_invalidate(link); + wql_invalidate(link); /* someone raced us to deinit */ - if (!link || wqset->wqset_id != set_id || set_id != link->sl_set_id.id) { + if (!link || wqset->wqset_id != set_id || set_id != link->wql_setid.id) { if (link) - lt_put_link(link); + wql_put_link(link); waitq_set_unlock(wqset); - if (do_spl) - splx(s); return; } /* every wait queue set should have a valid link object */ - assert(link != NULL && sl_type(link) == SLT_WQS); + assert(link != NULL && wql_type(link) == WQL_WQS); wqset->wqset_id = 0; - wqset->wqset_q.waitq_type = WQT_INVALID; - wqset->wqset_q.waitq_fifo = 0; - wqset->wqset_q.waitq_prepost = 0; - /* don't clear the 'waitq_irq' bit: it's used in locking! */ - wqset->wqset_q.waitq_eventmask = 0; - /* * This set may have a lot of preposts, or may have been a member of * many other sets. To minimize spinlock hold times, we clear out the @@ -3966,37 +3257,39 @@ void waitq_set_deinit(struct waitq_set *wqset) * table objects. 
We keep handles to the prepost and set linkage * objects and free those outside the critical section. */ - prepost_id = wqset->wqset_prepost_id; + prepost_id = 0; + if (wqset->wqset_q.waitq_prepost && wqset->wqset_prepost_id) + prepost_id = wqset->wqset_prepost_id; + /* else { TODO: notify kqueue subsystem? } */ wqset->wqset_prepost_id = 0; - set_links_id = 0; - waitq_unlink_all_locked(&wqset->wqset_q, &set_links_id, &s, NULL); + wqset->wqset_q.waitq_type = WQT_INVALID; + wqset->wqset_q.waitq_fifo = 0; + wqset->wqset_q.waitq_prepost = 0; + wqset->wqset_q.waitq_isvalid = 0; - waitq_set_unlock(wqset); - if (do_spl) - splx(s); + /* don't clear the 'waitq_irq' bit: it's used in locking! */ + wqset->wqset_q.waitq_eventmask = 0; + + waitq_unlink_all_unlock(&wqset->wqset_q); + /* wqset->wqset_q unlocked and set links deallocated */ /* - * walk_setid_links may race with us for access to the waitq set. - * If walk_setid_links has a reference to the set, then we should wait + * walk_waitq_links may race with us for access to the waitq set. + * If walk_waitq_links has a reference to the set, then we should wait * until the link's refcount goes to 1 (our reference) before we exit * this function. That way we ensure that the waitq set memory will * remain valid even though it's been cleared out. */ - while (sl_refcnt(link) > 1) + while (wql_refcnt(link) > 1) delay(1); - lt_put_link(link); - - /* - * release all the set link objects - * (links to other sets to which this set was previously added) - */ - if (set_links_id) - (void)walk_setid_links(LINK_WALK_ONE_LEVEL, NULL, set_links_id, - SLT_ALL, NULL, waitq_unlink_all_cb); + wql_put_link(link); /* drop / unlink all the prepost table objects */ - (void)wq_prepost_iterate(prepost_id, NULL, wqset_clear_prepost_chain_cb); + /* JMM - can this happen before the delay? 
*/ + if (prepost_id) + (void)wq_prepost_iterate(prepost_id, NULL, + wqset_clear_prepost_chain_cb); } /** @@ -4058,11 +3351,13 @@ struct waitq *wqset_waitq(struct waitq_set *wqset) * The return value of the function indicates whether or not this * happened: 1 == lock was dropped, 0 == lock held */ -int waitq_clear_prepost_locked(struct waitq *waitq, spl_t *s) +int waitq_clear_prepost_locked(struct waitq *waitq) { struct wq_prepost *wqp; int dropped_lock = 0; + assert(!waitq_irq_safe(waitq)); + if (waitq->waitq_prepost_id == 0) return 0; @@ -4074,7 +3369,6 @@ int waitq_clear_prepost_locked(struct waitq *waitq, spl_t *s) wqp->wqp_prepostid.id, wqp_refcnt(wqp)); wq_prepost_invalidate(wqp); while (wqp_refcnt(wqp) > 1) { - int do_spl = waitq_irq_safe(waitq); /* * Some other thread must have raced us to grab a link @@ -4093,16 +3387,13 @@ int waitq_clear_prepost_locked(struct waitq *waitq, spl_t *s) disable_preemption(); waitq_unlock(waitq); - if (s && do_spl) - splx(*s); dropped_lock = 1; /* * don't yield here, just spin and assume the other * consumer is already on core... 
*/ delay(1); - if (s && do_spl) - *s = splsched(); + waitq_lock(waitq); enable_preemption(); @@ -4123,19 +3414,13 @@ int waitq_clear_prepost_locked(struct waitq *waitq, spl_t *s) */ void waitq_clear_prepost(struct waitq *waitq) { - spl_t s; - int do_spl = waitq_irq_safe(waitq); - assert(waitq_valid(waitq)); + assert(!waitq_irq_safe(waitq)); - if (do_spl) - s = splsched(); waitq_lock(waitq); /* it doesn't matter to us if the lock is dropped here */ - (void)waitq_clear_prepost_locked(waitq, &s); + (void)waitq_clear_prepost_locked(waitq); waitq_unlock(waitq); - if (do_spl) - splx(s); } /** @@ -4148,13 +3433,12 @@ uint64_t waitq_get_prepost_id(struct waitq *waitq) { struct wq_prepost *wqp; uint64_t wqp_id = 0; - spl_t s; if (!waitq_valid(waitq)) return 0; + + assert(!waitq_irq_safe(waitq)); - if (waitq_irq_safe(waitq)) - s = splsched(); waitq_lock(waitq); if (!waitq_valid(waitq)) @@ -4167,16 +3451,12 @@ uint64_t waitq_get_prepost_id(struct waitq *waitq) /* don't hold a spinlock while allocating a prepost object */ waitq_unlock(waitq); - if (waitq_irq_safe(waitq)) - splx(s); wqp = wq_prepost_alloc(WQP_WQ, 1); if (!wqp) return 0; /* re-acquire the waitq lock */ - if (waitq_irq_safe(waitq)) - s = splsched(); waitq_lock(waitq); if (!waitq_valid(waitq)) { @@ -4202,30 +3482,28 @@ uint64_t waitq_get_prepost_id(struct waitq *waitq) out_unlock: waitq_unlock(waitq); - if (waitq_irq_safe(waitq)) - splx(s); return wqp_id; } -static int waitq_inset_cb(struct waitq *waitq, void *ctx, struct setid_link *link) +static int waitq_inset_cb(struct waitq *waitq, void *ctx, struct waitq_link *link) { uint64_t setid = *(uint64_t *)ctx; - int ltype = sl_type(link); + int wqltype = wql_type(link); (void)waitq; - if (ltype == SLT_WQS && link->sl_set_id.id == setid) { + if (wqltype == WQL_WQS && link->wql_setid.id == setid) { wqdbg_v(" waitq already in set 0x%llx", setid); return WQ_ITERATE_FOUND; - } else if (ltype == SLT_LINK) { + } else if (wqltype == WQL_LINK) { /* * break out early if we 
see a link that points to the setid * in question. This saves us a step in the * iteration/recursion */ - wqdbg_v(" waitq already in set 0x%llx (SLT_LINK)", setid); - if (link->sl_link.sl_left_setid == setid || - link->sl_link.sl_right_setid == setid) + wqdbg_v(" waitq already in set 0x%llx (WQL_LINK)", setid); + if (link->wql_link.left_setid == setid || + link->wql_link.right_setid == setid) return WQ_ITERATE_FOUND; } @@ -4243,16 +3521,14 @@ boolean_t waitq_member(struct waitq *waitq, struct waitq_set *wqset) { kern_return_t kr = WQ_ITERATE_SUCCESS; uint64_t setid; - spl_t s; if (!waitq_valid(waitq)) panic("Invalid waitq: %p", waitq); + assert(!waitq_irq_safe(waitq)); if (!waitqs_is_set(wqset)) return FALSE; - - if (waitq_irq_safe(waitq)) - s = splsched(); + waitq_lock(waitq); setid = wqset->wqset_id; @@ -4262,23 +3538,16 @@ boolean_t waitq_member(struct waitq *waitq, struct waitq_set *wqset) /* fast path: most waitqs are members of only 1 set */ if (waitq->waitq_set_id == setid) { waitq_unlock(waitq); - if (waitq_irq_safe(waitq)) - splx(s); return TRUE; } /* walk the link table and look for the Set ID of wqset */ - kr = walk_setid_links(LINK_WALK_ONE_LEVEL, waitq, waitq->waitq_set_id, - SLT_ALL, (void *)&setid, waitq_inset_cb); + kr = walk_waitq_links(LINK_WALK_ONE_LEVEL, waitq, waitq->waitq_set_id, + WQL_ALL, (void *)&setid, waitq_inset_cb); out_unlock: waitq_unlock(waitq); - if (waitq_irq_safe(waitq)) - splx(s); - - if (kr == WQ_ITERATE_FOUND) - return TRUE; - return FALSE; + return (kr == WQ_ITERATE_FOUND); } /** @@ -4286,22 +3555,22 @@ boolean_t waitq_member(struct waitq *waitq, struct waitq_set *wqset) */ boolean_t waitq_in_set(struct waitq *waitq) { - struct setid_link *link; + struct waitq_link *link; boolean_t inset = FALSE; - spl_t s; if (waitq_irq_safe(waitq)) - s = splsched(); + return FALSE; + waitq_lock(waitq); if (!waitq->waitq_set_id) goto out_unlock; - link = lt_get_link(waitq->waitq_set_id); + link = wql_get_link(waitq->waitq_set_id); if (link) { 
/* if we get here, the waitq is in _at_least_one_ set */ inset = TRUE; - lt_put_link(link); + wql_put_link(link); } else { /* we can just optimize this for next time */ waitq->waitq_set_id = 0; @@ -4309,8 +3578,6 @@ boolean_t waitq_in_set(struct waitq *waitq) out_unlock: waitq_unlock(waitq); - if (waitq_irq_safe(waitq)) - splx(s); return inset; } @@ -4324,7 +3591,7 @@ boolean_t waitq_in_set(struct waitq *waitq) */ uint64_t waitq_link_reserve(struct waitq *waitq) { - struct setid_link *link; + struct waitq_link *link; uint64_t reserved_id = 0; assert(get_preemption_level() == 0 && waitq_wait_possible(current_thread())); @@ -4333,14 +3600,14 @@ uint64_t waitq_link_reserve(struct waitq *waitq) * We've asserted that the caller can block, so we enforce a * minimum-free table element policy here. */ - lt_ensure_free_space(); + wql_ensure_free_space(); (void)waitq; - link = lt_alloc_link(WQT_RESERVED); + link = wql_alloc_link(LT_RESERVED); if (!link) return 0; - reserved_id = link->sl_set_id.id; + reserved_id = link->wql_setid.id; return reserved_id; } @@ -4350,23 +3617,23 @@ uint64_t waitq_link_reserve(struct waitq *waitq) */ void waitq_link_release(uint64_t id) { - struct setid_link *link; + struct waitq_link *link; if (id == 0) return; - link = lt_get_reserved(id, SLT_LINK); + link = wql_get_reserved(id, WQL_LINK); if (!link) return; /* * if we successfully got a link object, then we know * it's not been marked valid, and can be released with - * a standard lt_put_link() which should free the element. + * a standard wql_put_link() which should free the element. 
*/ - lt_put_link(link); -#if CONFIG_WAITQ_STATS - g_linktable.nreserved_releases += 1; + wql_put_link(link); +#if CONFIG_LTABLE_STATS + g_wqlinktable.nreserved_releases += 1; #endif } @@ -4378,9 +3645,9 @@ void waitq_link_release(uint64_t id) * caller should have a reference to the 'link' object */ static kern_return_t waitq_link_internal(struct waitq *waitq, - uint64_t setid, struct setid_link *link) + uint64_t setid, struct waitq_link *link) { - struct setid_link *qlink; + struct waitq_link *qlink; kern_return_t kr; assert(waitq_held(waitq)); @@ -4395,7 +3662,7 @@ static kern_return_t waitq_link_internal(struct waitq *waitq, return KERN_SUCCESS; } - qlink = lt_get_link(waitq->waitq_set_id); + qlink = wql_get_link(waitq->waitq_set_id); if (!qlink) { /* * The set to which this wait queue belonged has been @@ -4404,15 +3671,15 @@ static kern_return_t waitq_link_internal(struct waitq *waitq, waitq->waitq_set_id = setid; return KERN_SUCCESS; } - lt_put_link(qlink); + wql_put_link(qlink); /* * Check to see if it's already a member of the set. * * TODO: check for cycles! */ - kr = walk_setid_links(LINK_WALK_ONE_LEVEL, waitq, waitq->waitq_set_id, - SLT_ALL, (void *)&setid, waitq_inset_cb); + kr = walk_waitq_links(LINK_WALK_ONE_LEVEL, waitq, waitq->waitq_set_id, + WQL_ALL, (void *)&setid, waitq_inset_cb); if (kr == WQ_ITERATE_FOUND) return kr; @@ -4425,11 +3692,11 @@ static kern_return_t waitq_link_internal(struct waitq *waitq, * this link object. That's OK because the next time we use that * object we'll just ignore it. 
*/ - link->sl_link.sl_left_setid = setid; - link->sl_link.sl_right_setid = waitq->waitq_set_id; - sl_set_valid(link); + link->wql_link.left_setid = setid; + link->wql_link.right_setid = waitq->waitq_set_id; + wql_mkvalid(link); - waitq->waitq_set_id = link->sl_set_id.id; + waitq->waitq_set_id = link->wql_setid.id; return KERN_SUCCESS; } @@ -4452,11 +3719,10 @@ kern_return_t waitq_link(struct waitq *waitq, struct waitq_set *wqset, waitq_lock_state_t lock_state, uint64_t *reserved_link) { kern_return_t kr; - struct setid_link *link; + struct waitq_link *link; int should_lock = (lock_state == WAITQ_SHOULD_LOCK); - spl_t s; - if (!waitq_valid(waitq)) + if (!waitq_valid(waitq) || waitq_irq_safe(waitq)) panic("Invalid waitq: %p", waitq); if (!waitqs_is_set(wqset)) @@ -4465,39 +3731,26 @@ kern_return_t waitq_link(struct waitq *waitq, struct waitq_set *wqset, wqdbg_v("Link waitq %p to wqset 0x%llx", (void *)VM_KERNEL_UNSLIDE_OR_PERM(waitq), wqset->wqset_id); - if (waitq_irq_safe(waitq) && (!reserved_link || *reserved_link == 0)) { - /* - * wait queues that need IRQs disabled cannot block waiting - * for table growth to complete. Even though this is rare, - * we require all these waitqs to pass in a reserved link - * object to avoid the potential to block. - */ - panic("Global/IRQ-safe waitq %p cannot link to %p without" - "reserved object!", waitq, wqset); - } - /* * We _might_ need a new link object here, so we'll grab outside * the lock because the alloc call _might_ block. * - * If the caller reserved a link beforehand, then lt_get_link + * If the caller reserved a link beforehand, then wql_get_link * is guaranteed not to block because the caller holds an extra * reference to the link which, in turn, hold a reference to the * link table. 
*/ if (reserved_link && *reserved_link != 0) { - link = lt_get_reserved(*reserved_link, SLT_LINK); + link = wql_get_reserved(*reserved_link, WQL_LINK); /* always consume the caller's reference */ *reserved_link = 0; } else { - link = lt_alloc_link(SLT_LINK); + link = wql_alloc_link(WQL_LINK); } if (!link) return KERN_NO_SPACE; if (should_lock) { - if (waitq_irq_safe(waitq)) - s = splsched(); waitq_lock(waitq); } @@ -4505,11 +3758,9 @@ kern_return_t waitq_link(struct waitq *waitq, struct waitq_set *wqset, if (should_lock) { waitq_unlock(waitq); - if (waitq_irq_safe(waitq)) - splx(s); } - lt_put_link(link); + wql_put_link(link); return kr; } @@ -4519,7 +3770,7 @@ kern_return_t waitq_link(struct waitq *waitq, struct waitq_set *wqset, * this function also prunes invalid objects from the tree * * Conditions: - * MUST be called from walk_setid_links link table walk + * MUST be called from walk_waitq_links link table walk * 'waitq' is locked * * Notes: @@ -4529,9 +3780,9 @@ kern_return_t waitq_link(struct waitq *waitq, struct waitq_set *wqset, */ static inline int waitq_maybe_remove_link(struct waitq *waitq, uint64_t setid, - struct setid_link *parent, - struct setid_link *left, - struct setid_link *right) + struct waitq_link *parent, + struct waitq_link *left, + struct waitq_link *right) { uint64_t *wq_setid = &waitq->waitq_set_id; @@ -4555,31 +3806,31 @@ static inline int waitq_maybe_remove_link(struct waitq *waitq, * waitq_set_id of the original waitq to point to the side of the * parent that is still valid. We then discard the parent link object. 
*/ - if (*wq_setid == parent->sl_set_id.id) { + if (*wq_setid == parent->wql_setid.id) { if (!left && !right) { /* completely invalid children */ - lt_invalidate(parent); + wql_invalidate(parent); wqdbg_v("S1, L+R"); *wq_setid = 0; return WQ_ITERATE_INVALID; - } else if (!left || left->sl_set_id.id == setid) { + } else if (!left || left->wql_setid.id == setid) { /* * left side matches we know it points either to the * WQS we're unlinking, or to an invalid object: * no need to invalidate it */ - *wq_setid = right ? right->sl_set_id.id : 0; - lt_invalidate(parent); + *wq_setid = right ? right->wql_setid.id : 0; + wql_invalidate(parent); wqdbg_v("S1, L"); return left ? WQ_ITERATE_UNLINKED : WQ_ITERATE_INVALID; - } else if (!right || right->sl_set_id.id == setid) { + } else if (!right || right->wql_setid.id == setid) { /* * if right side matches we know it points either to the * WQS we're unlinking, or to an invalid object: * no need to invalidate it */ - *wq_setid = left ? left->sl_set_id.id : 0; - lt_invalidate(parent); + *wq_setid = left ? left->wql_setid.id : 0; + wql_invalidate(parent); wqdbg_v("S1, R"); return right ? WQ_ITERATE_UNLINKED : WQ_ITERATE_INVALID; } @@ -4614,78 +3865,78 @@ static inline int waitq_maybe_remove_link(struct waitq *waitq, * middle link (left or right) and point the parent link directly to * the remaining leaf node. */ - if (left && sl_type(left) == SLT_LINK) { + if (left && wql_type(left) == WQL_LINK) { uint64_t Ll, Lr; - struct setid_link *linkLl, *linkLr; - assert(left->sl_set_id.id != setid); - Ll = left->sl_link.sl_left_setid; - Lr = left->sl_link.sl_right_setid; - linkLl = lt_get_link(Ll); - linkLr = lt_get_link(Lr); + struct waitq_link *linkLl, *linkLr; + assert(left->wql_setid.id != setid); + Ll = left->wql_link.left_setid; + Lr = left->wql_link.right_setid; + linkLl = wql_get_link(Ll); + linkLr = wql_get_link(Lr); if (!linkLl && !linkLr) { /* * The left object points to two invalid objects! 
* We can invalidate the left w/o touching the parent. */ - lt_invalidate(left); + wql_invalidate(left); wqdbg_v("S2, Ll+Lr"); return WQ_ITERATE_INVALID; } else if (!linkLl || Ll == setid) { /* Ll is invalid and/or the wait queue set we're looking for */ - parent->sl_link.sl_left_setid = Lr; - lt_invalidate(left); - lt_put_link(linkLl); - lt_put_link(linkLr); + parent->wql_link.left_setid = Lr; + wql_invalidate(left); + wql_put_link(linkLl); + wql_put_link(linkLr); wqdbg_v("S2, Ll"); return linkLl ? WQ_ITERATE_UNLINKED : WQ_ITERATE_INVALID; } else if (!linkLr || Lr == setid) { /* Lr is invalid and/or the wait queue set we're looking for */ - parent->sl_link.sl_left_setid = Ll; - lt_invalidate(left); - lt_put_link(linkLr); - lt_put_link(linkLl); + parent->wql_link.left_setid = Ll; + wql_invalidate(left); + wql_put_link(linkLr); + wql_put_link(linkLl); wqdbg_v("S2, Lr"); return linkLr ? WQ_ITERATE_UNLINKED : WQ_ITERATE_INVALID; } - lt_put_link(linkLl); - lt_put_link(linkLr); + wql_put_link(linkLl); + wql_put_link(linkLr); } - if (right && sl_type(right) == SLT_LINK) { + if (right && wql_type(right) == WQL_LINK) { uint64_t Rl, Rr; - struct setid_link *linkRl, *linkRr; - assert(right->sl_set_id.id != setid); - Rl = right->sl_link.sl_left_setid; - Rr = right->sl_link.sl_right_setid; - linkRl = lt_get_link(Rl); - linkRr = lt_get_link(Rr); + struct waitq_link *linkRl, *linkRr; + assert(right->wql_setid.id != setid); + Rl = right->wql_link.left_setid; + Rr = right->wql_link.right_setid; + linkRl = wql_get_link(Rl); + linkRr = wql_get_link(Rr); if (!linkRl && !linkRr) { /* * The right object points to two invalid objects! * We can invalidate the right w/o touching the parent. 
*/ - lt_invalidate(right); + wql_invalidate(right); wqdbg_v("S2, Rl+Rr"); return WQ_ITERATE_INVALID; } else if (!linkRl || Rl == setid) { /* Rl is invalid and/or the wait queue set we're looking for */ - parent->sl_link.sl_right_setid = Rr; - lt_invalidate(right); - lt_put_link(linkRl); - lt_put_link(linkRr); + parent->wql_link.right_setid = Rr; + wql_invalidate(right); + wql_put_link(linkRl); + wql_put_link(linkRr); wqdbg_v("S2, Rl"); return linkRl ? WQ_ITERATE_UNLINKED : WQ_ITERATE_INVALID; } else if (!linkRr || Rr == setid) { /* Rr is invalid and/or the wait queue set we're looking for */ - parent->sl_link.sl_right_setid = Rl; - lt_invalidate(right); - lt_put_link(linkRl); - lt_put_link(linkRr); + parent->wql_link.right_setid = Rl; + wql_invalidate(right); + wql_put_link(linkRl); + wql_put_link(linkRr); wqdbg_v("S2, Rr"); return linkRr ? WQ_ITERATE_UNLINKED : WQ_ITERATE_INVALID; } - lt_put_link(linkRl); - lt_put_link(linkRr); + wql_put_link(linkRl); + wql_put_link(linkRr); } return WQ_ITERATE_CONTINUE; @@ -4695,7 +3946,7 @@ static inline int waitq_maybe_remove_link(struct waitq *waitq, * link table walk callback that unlinks 'waitq' from 'ctx->setid' * * Conditions: - * called from walk_setid_links + * called from walk_waitq_links * 'waitq' is locked * * Notes: @@ -4703,25 +3954,25 @@ static inline int waitq_maybe_remove_link(struct waitq *waitq, * perform the actual unlinking */ static int waitq_unlink_cb(struct waitq *waitq, void *ctx, - struct setid_link *link) + struct waitq_link *link) { uint64_t setid = *((uint64_t *)ctx); - struct setid_link *right, *left; + struct waitq_link *right, *left; int ret = 0; - if (sl_type(link) != SLT_LINK) + if (wql_type(link) != WQL_LINK) return WQ_ITERATE_CONTINUE; do { - left = lt_get_link(link->sl_link.sl_left_setid); - right = lt_get_link(link->sl_link.sl_right_setid); + left = wql_get_link(link->wql_link.left_setid); + right = wql_get_link(link->wql_link.right_setid); ret = waitq_maybe_remove_link(waitq, setid, link, 
left, right); - lt_put_link(left); - lt_put_link(right); + wql_put_link(left); + wql_put_link(right); - if (!sl_is_valid(link)) + if (!wql_is_valid(link)) return WQ_ITERATE_INVALID; /* A ret value of UNLINKED will break us out of table walk */ } while (ret == WQ_ITERATE_INVALID); @@ -4777,12 +4028,13 @@ static int waitq_unlink_prepost_cb(struct waitq_set __unused *wqset, void *ctx, * (see waitq_clear_prepost_locked) */ static kern_return_t waitq_unlink_locked(struct waitq *waitq, - struct waitq_set *wqset, - spl_t *s) + struct waitq_set *wqset) { uint64_t setid; kern_return_t kr; + assert(!waitq_irq_safe(waitq)); + setid = wqset->wqset_id; if (waitq->waitq_set_id == 0) { @@ -4793,7 +4045,7 @@ static kern_return_t waitq_unlink_locked(struct waitq *waitq, * they prepost into select sets... */ if (waitq->waitq_prepost_id != 0) - (void)waitq_clear_prepost_locked(waitq, s); + (void)waitq_clear_prepost_locked(waitq); return KERN_NOT_IN_SET; } @@ -4805,7 +4057,7 @@ static kern_return_t waitq_unlink_locked(struct waitq *waitq, * matter if this function drops and re-acquires the lock * because we're not manipulating waitq state any more. */ - (void)waitq_clear_prepost_locked(waitq, s); + (void)waitq_clear_prepost_locked(waitq); return KERN_SUCCESS; } @@ -4824,19 +4076,20 @@ static kern_return_t waitq_unlink_locked(struct waitq *waitq, * A, and set A belonged to set B. You can't remove the waitq * from set B. 
*/ - kr = walk_setid_links(LINK_WALK_ONE_LEVEL, waitq, waitq->waitq_set_id, - SLT_LINK, (void *)&setid, waitq_unlink_cb); + kr = walk_waitq_links(LINK_WALK_ONE_LEVEL, waitq, waitq->waitq_set_id, + WQL_LINK, (void *)&setid, waitq_unlink_cb); if (kr == WQ_ITERATE_UNLINKED) { struct wq_unlink_ctx ulctx; - int do_spl = 0; kr = KERN_SUCCESS; /* found it and dis-associated it */ - if (!waitq_irq_safe(waitq) && waitq_irq_safe(&wqset->wqset_q)) { - *s = splsched(); - do_spl = 1; - } + /* don't look for preposts if it's not prepost-enabled */ + if (!wqset->wqset_q.waitq_prepost) + goto out; + + assert(!waitq_irq_safe(&wqset->wqset_q)); + waitq_set_lock(wqset); /* * clear out any prepost from waitq into wqset @@ -4848,12 +4101,11 @@ static kern_return_t waitq_unlink_locked(struct waitq *waitq, (void)wq_prepost_iterate(wqset->wqset_prepost_id, (void *)&ulctx, waitq_unlink_prepost_cb); waitq_set_unlock(wqset); - if (do_spl) - splx(*s); } else { kr = KERN_NOT_IN_SET; /* waitq is _not_ associated with wqset */ } +out: return kr; } @@ -4869,7 +4121,6 @@ static kern_return_t waitq_unlink_locked(struct waitq *waitq, kern_return_t waitq_unlink(struct waitq *waitq, struct waitq_set *wqset) { kern_return_t kr = KERN_SUCCESS; - spl_t s; assert(waitqs_is_set(wqset)); @@ -4883,16 +4134,13 @@ kern_return_t waitq_unlink(struct waitq *waitq, struct waitq_set *wqset) wqdbg_v("unlink waitq %p from set 0x%llx", (void *)VM_KERNEL_UNSLIDE_OR_PERM(waitq), wqset->wqset_id); - if (waitq_irq_safe(waitq)) - s = splsched(); + assert(!waitq_irq_safe(waitq)); + waitq_lock(waitq); - kr = waitq_unlink_locked(waitq, wqset, &s); + kr = waitq_unlink_locked(waitq, wqset); waitq_unlock(waitq); - if (waitq_irq_safe(waitq)) - splx(s); - return kr; } @@ -4911,7 +4159,6 @@ void waitq_unlink_by_prepost_id(uint64_t wqp_id, struct waitq_set *wqset) wqp = wq_prepost_get(wqp_id); if (wqp) { struct waitq *wq; - spl_t s; wq = wqp->wqp_wq.wqp_wq_ptr; @@ -4922,26 +4169,22 @@ void waitq_unlink_by_prepost_id(uint64_t wqp_id, 
struct waitq_set *wqset) * complete the unlink operation atomically to avoid a race * with waitq_unlink[_all]. */ - if (waitq_irq_safe(wq)) - s = splsched(); + assert(!waitq_irq_safe(wq)); + waitq_lock(wq); wq_prepost_put(wqp); if (!waitq_valid(wq)) { /* someone already tore down this waitq! */ waitq_unlock(wq); - if (waitq_irq_safe(wq)) - splx(s); enable_preemption(); return; } /* this _may_ drop the wq lock, but that's OK */ - waitq_unlink_locked(wq, wqset, &s); + waitq_unlink_locked(wq, wqset); waitq_unlock(wq); - if (waitq_irq_safe(wq)) - splx(s); } enable_preemption(); return; @@ -4952,25 +4195,26 @@ void waitq_unlink_by_prepost_id(uint64_t wqp_id, struct waitq_set *wqset) * unlink 'waitq' from all sets to which it belongs * * Conditions: - * 'waitq' is locked + * 'waitq' is locked on entry + * returns with waitq lock dropped * * Notes: - * may drop and re-acquire the waitq lock * may (rarely) spin (see waitq_clear_prepost_locked) */ -kern_return_t waitq_unlink_all_locked(struct waitq *waitq, uint64_t *old_set_id, - spl_t *s, int *dropped_lock) +kern_return_t waitq_unlink_all_unlock(struct waitq *waitq) { + uint64_t old_set_id = 0; wqdbg_v("unlink waitq %p from all sets", (void *)VM_KERNEL_UNSLIDE_OR_PERM(waitq)); - - *old_set_id = 0; + assert(!waitq_irq_safe(waitq)); /* it's not a member of any sets */ - if (waitq->waitq_set_id == 0) + if (waitq->waitq_set_id == 0) { + waitq_unlock(waitq); return KERN_SUCCESS; + } - *old_set_id = waitq->waitq_set_id; + old_set_id = waitq->waitq_set_id; waitq->waitq_set_id = 0; /* @@ -4979,9 +4223,19 @@ kern_return_t waitq_unlink_all_locked(struct waitq *waitq, uint64_t *old_set_id, * if it was added to another set and preposted to that set in the * time we drop the lock, the state will remain consistent. 
*/ - int dropped = waitq_clear_prepost_locked(waitq, s); - if (dropped_lock) - *dropped_lock = dropped; + (void)waitq_clear_prepost_locked(waitq); + + waitq_unlock(waitq); + + if (old_set_id) { + /* + * Walk the link table and invalidate each LINK object that + * used to connect this waitq to one or more sets: this works + * because WQL_LINK objects are private to each wait queue + */ + (void)walk_waitq_links(LINK_WALK_ONE_LEVEL, waitq, old_set_id, + WQL_LINK, NULL, waitq_unlink_all_cb); + } return KERN_SUCCESS; } @@ -4998,31 +4252,20 @@ kern_return_t waitq_unlink_all_locked(struct waitq *waitq, uint64_t *old_set_id, kern_return_t waitq_unlink_all(struct waitq *waitq) { kern_return_t kr = KERN_SUCCESS; - uint64_t setid = 0; - spl_t s; if (!waitq_valid(waitq)) panic("Invalid waitq: %p", waitq); - if (waitq_irq_safe(waitq)) - s = splsched(); + assert(!waitq_irq_safe(waitq)); waitq_lock(waitq); - if (waitq_valid(waitq)) - kr = waitq_unlink_all_locked(waitq, &setid, &s, NULL); - waitq_unlock(waitq); - if (waitq_irq_safe(waitq)) - splx(s); - - if (setid) { - /* - * Walk the link table and invalidate each LINK object that - * used to connect this waitq to one or more sets: this works - * because SLT_LINK objects are private to each wait queue - */ - (void)walk_setid_links(LINK_WALK_ONE_LEVEL, waitq, setid, - SLT_LINK, NULL, waitq_unlink_all_cb); + if (!waitq_valid(waitq)) { + waitq_unlock(waitq); + return KERN_SUCCESS; } + kr = waitq_unlink_all_unlock(waitq); + /* waitq unlocked and set links deallocated */ + return kr; } @@ -5031,16 +4274,16 @@ kern_return_t waitq_unlink_all(struct waitq *waitq) * unlink all waitqs from 'wqset' * * Conditions: - * 'wqset' is not locked + * 'wqset' is locked on entry + * 'wqset' is unlocked on exit and spl is restored + * + * Note: * may (rarely) spin/block (see waitq_clear_prepost_locked) */ -kern_return_t waitq_set_unlink_all(struct waitq_set *wqset) +kern_return_t waitq_set_unlink_all_unlock(struct waitq_set *wqset) { - struct 
setid_link *link; - uint64_t prepost_id, set_links_id = 0; - spl_t spl; - - assert(waitqs_is_set(wqset)); + struct waitq_link *link; + uint64_t prepost_id; wqdbg_v("unlink all queues from set 0x%llx", wqset->wqset_id); @@ -5048,12 +4291,9 @@ kern_return_t waitq_set_unlink_all(struct waitq_set *wqset) * This operation does not require interaction with any of the set's * constituent wait queues. All we have to do is invalidate the SetID */ - if (waitq_irq_safe(&wqset->wqset_q)) - spl = splsched(); - waitq_set_lock(wqset); /* invalidate and re-alloc the link object first */ - link = lt_get_link(wqset->wqset_id); + link = wql_get_link(wqset->wqset_id); /* we may have raced with a waitq_set_deinit: handle this */ if (!link) { @@ -5061,18 +4301,21 @@ kern_return_t waitq_set_unlink_all(struct waitq_set *wqset) return KERN_SUCCESS; } - lt_invalidate(link); + wql_invalidate(link); /* re-alloc the object to get a new generation ID */ - lt_realloc_link(link, SLT_WQS); - link->sl_wqs.sl_set = wqset; + wql_realloc_link(link, WQL_WQS); + link->wql_wqs.wql_set = wqset; - wqset->wqset_id = link->sl_set_id.id; - sl_set_valid(link); - lt_put_link(link); + wqset->wqset_id = link->wql_setid.id; + wql_mkvalid(link); + wql_put_link(link); /* clear any preposts attached to this set */ - prepost_id = wqset->wqset_prepost_id; + prepost_id = 0; + if (wqset->wqset_q.waitq_prepost && wqset->wqset_prepost_id) + prepost_id = wqset->wqset_prepost_id; + /* else { TODO: notify kqueue subsystem? } */ wqset->wqset_prepost_id = 0; /* @@ -5080,23 +4323,11 @@ kern_return_t waitq_set_unlink_all(struct waitq_set *wqset) * waitq sets may prepost to other sets if, for example, they are * associated with a kqueue which is in a select set. * - * This may drop and re-acquire the set lock, but that's OK because - * the resulting state will remain consistent. 
- */ - waitq_unlink_all_locked(&wqset->wqset_q, &set_links_id, &spl, NULL); - - waitq_set_unlock(wqset); - if (waitq_irq_safe(&wqset->wqset_q)) - splx(spl); - - /* - * release all the set link objects + * This releases all the set link objects * (links to other sets to which this set was previously added) */ - if (set_links_id) - (void)walk_setid_links(LINK_WALK_ONE_LEVEL, &wqset->wqset_q, - set_links_id, SLT_LINK, NULL, - waitq_unlink_all_cb); + waitq_unlink_all_unlock(&wqset->wqset_q); + /* wqset->wqset_q unlocked */ /* drop / unlink all the prepost table objects */ if (prepost_id) @@ -5106,9 +4337,25 @@ kern_return_t waitq_set_unlink_all(struct waitq_set *wqset) return KERN_SUCCESS; } +/** + * unlink all waitqs from 'wqset' + * + * Conditions: + * 'wqset' is not locked + * may (rarely) spin/block (see waitq_clear_prepost_locked) + */ +kern_return_t waitq_set_unlink_all(struct waitq_set *wqset) +{ + assert(waitqs_is_set(wqset)); + assert(!waitq_irq_safe(&wqset->wqset_q)); + + waitq_set_lock(wqset); + return waitq_set_unlink_all_unlock(wqset); + /* wqset unlocked and set links and preposts deallocated */ +} static int waitq_prepost_reserve_cb(struct waitq *waitq, void *ctx, - struct setid_link *link) + struct waitq_link *link) { uint32_t *num = (uint32_t *)ctx; (void)waitq; @@ -5118,20 +4365,19 @@ static int waitq_prepost_reserve_cb(struct waitq *waitq, void *ctx, * per waitq set (if the set was already preposted by another * waitq). 
*/ - if (sl_type(link) == SLT_WQS) { + if (wql_type(link) == WQL_WQS) { /* * check to see if the associated waitq actually supports * preposting */ - if (waitq_set_can_prepost(link->sl_wqs.sl_set)) + if (waitq_set_can_prepost(link->wql_wqs.wql_set)) *num += 2; } return WQ_ITERATE_CONTINUE; } static int waitq_alloc_prepost_reservation(int nalloc, struct waitq *waitq, - spl_t *s, int *did_unlock, - struct wq_prepost **wqp) + int *did_unlock, struct wq_prepost **wqp) { struct wq_prepost *tmp; struct wqp_cache *cache; @@ -5154,12 +4400,10 @@ static int waitq_alloc_prepost_reservation(int nalloc, struct waitq *waitq, /* unlock the waitq to perform the allocation */ *did_unlock = 1; waitq_unlock(waitq); - if (waitq_irq_safe(waitq)) - splx(*s); } do_alloc: - tmp = wq_prepost_alloc(WQT_RESERVED, nalloc); + tmp = wq_prepost_alloc(LT_RESERVED, nalloc); if (!tmp) panic("Couldn't reserve %d preposts for waitq @%p (wqp@%p)", nalloc, waitq, *wqp); @@ -5186,8 +4430,6 @@ static int waitq_alloc_prepost_reservation(int nalloc, struct waitq *waitq, enable_preemption(); } else { /* otherwise: re-lock the waitq */ - if (waitq_irq_safe(waitq)) - *s = splsched(); waitq_lock(waitq); } } @@ -5225,9 +4467,9 @@ static int waitq_count_prepost_reservation(struct waitq *waitq, int extra, int k * situation is no worse than before and we've alleviated lock * contention on any sets to which this waitq belongs. */ - (void)walk_setid_links(LINK_WALK_FULL_DAG_UNLOCKED, + (void)walk_waitq_links(LINK_WALK_FULL_DAG_UNLOCKED, waitq, waitq->waitq_set_id, - SLT_WQS, (void *)&npreposts, + WQL_WQS, (void *)&npreposts, waitq_prepost_reserve_cb); } @@ -5265,10 +4507,7 @@ static int waitq_count_prepost_reservation(struct waitq *waitq, int extra, int k * * Notes: * If 'lock_state' is WAITQ_KEEP_LOCKED, this function performs the pre-allocation - * atomically and returns 'waitq' locked. 
If the waitq requires - * interrupts to be disabled, then the output parameter 's' is set to the - * previous interrupt state (from splsched), and the caller is - * responsible to call splx(). + * atomically and returns 'waitq' locked. * * This function attempts to pre-allocate precisely enough prepost * objects based on the current set membership of 'waitq'. If the @@ -5277,7 +4516,7 @@ static int waitq_count_prepost_reservation(struct waitq *waitq, int extra, int k * any (rare) blocking in the wakeup path. */ uint64_t waitq_prepost_reserve(struct waitq *waitq, int extra, - waitq_lock_state_t lock_state, spl_t *s) + waitq_lock_state_t lock_state) { uint64_t reserved = 0; uint64_t prev_setid = 0, prev_prepostid = 0; @@ -5286,9 +4525,6 @@ uint64_t waitq_prepost_reserve(struct waitq *waitq, int extra, int keep_locked = (lock_state == WAITQ_KEEP_LOCKED); int unlocked = 0; - if (s) - *s = 0; - wqdbg_v("Attempting to reserve prepost linkages for waitq %p (extra:%d)", (void *)VM_KERNEL_UNSLIDE_OR_PERM(waitq), extra); @@ -5299,7 +4535,7 @@ uint64_t waitq_prepost_reserve(struct waitq *waitq, int extra, * and the set itself may need a new POST object in addition * to the number of preposts requested by the caller */ - nalloc = waitq_alloc_prepost_reservation(extra + 2, NULL, NULL, + nalloc = waitq_alloc_prepost_reservation(extra + 2, NULL, &unlocked, &wqp); assert(nalloc == extra + 2); return wqp->wqp_prepostid.id; @@ -5307,16 +4543,9 @@ uint64_t waitq_prepost_reserve(struct waitq *waitq, int extra, assert(lock_state == WAITQ_KEEP_LOCKED || lock_state == WAITQ_UNLOCK); - if (waitq_irq_safe(waitq)) - *s = splsched(); - waitq_lock(waitq); + assert(!waitq_irq_safe(waitq)); - /* global queues are never part of any sets */ - if (waitq_is_global(waitq)) { - if (keep_locked) - goto out; - goto out_unlock; - } + waitq_lock(waitq); /* remember the set ID that we started with */ prev_setid = waitq->waitq_set_id; @@ -5341,7 +4570,7 @@ uint64_t waitq_prepost_reserve(struct waitq 
*waitq, int extra, try_alloc: /* this _may_ unlock and relock the waitq! */ - nalloc = waitq_alloc_prepost_reservation(npreposts, waitq, s, + nalloc = waitq_alloc_prepost_reservation(npreposts, waitq, &unlocked, &wqp); if (!unlocked) { @@ -5387,8 +4616,6 @@ uint64_t waitq_prepost_reserve(struct waitq *waitq, int extra, out_unlock: waitq_unlock(waitq); - if (waitq_irq_safe(waitq)) - splx(*s); out: if (wqp) reserved = wqp->wqp_prepostid.id; @@ -5429,6 +4656,9 @@ void waitq_set_clear_preposts(struct waitq_set *wqset) assert(waitqs_is_set(wqset)); + if (!wqset->wqset_q.waitq_prepost || !wqset->wqset_prepost_id) + return; + wqdbg_v("Clearing all preposted queues on waitq_set: 0x%llx", wqset->wqset_id); @@ -5458,38 +4688,31 @@ struct wq_it_ctx { void *input; void *ctx; waitq_iterator_t it; - - spl_t *spl; }; static int waitq_iterate_sets_cb(struct waitq *waitq, void *ctx, - struct setid_link *link) + struct waitq_link *link) { struct wq_it_ctx *wctx = (struct wq_it_ctx *)(ctx); struct waitq_set *wqset; int ret; - spl_t spl; (void)waitq; - assert(sl_type(link) == SLT_WQS); + assert(!waitq_irq_safe(waitq)); + assert(wql_type(link) == WQL_WQS); /* * the waitq is locked, so we can just take the set lock * and call the iterator function */ - wqset = link->sl_wqs.sl_set; + wqset = link->wql_wqs.wql_set; assert(wqset != NULL); - - if (!waitq_irq_safe(waitq) && waitq_irq_safe(&wqset->wqset_q)) - spl = splsched(); + assert(!waitq_irq_safe(&wqset->wqset_q)); waitq_set_lock(wqset); ret = wctx->it(wctx->ctx, (struct waitq *)wctx->input, wqset); waitq_set_unlock(wqset); - if (!waitq_irq_safe(waitq) && waitq_irq_safe(&wqset->wqset_q)) - splx(spl); - return ret; } @@ -5506,7 +4729,6 @@ static int wqset_iterate_prepost_cb(struct waitq_set *wqset, void *ctx, struct wq_it_ctx *wctx = (struct wq_it_ctx *)(ctx); uint64_t wqp_id; int ret; - spl_t s; (void)wqp; @@ -5520,14 +4742,11 @@ static int wqset_iterate_prepost_cb(struct waitq_set *wqset, void *ctx, * to go. 
If not, we need to back off, check that the 'wqp' hasn't * been invalidated, and try to re-take the locks. */ - if (waitq_irq_safe(waitq)) - s = splsched(); + assert(!waitq_irq_safe(waitq)); + if (waitq_lock_try(waitq)) goto call_iterator; - if (waitq_irq_safe(waitq)) - splx(s); - if (!wqp_is_valid(wqp)) return WQ_ITERATE_RESTART; @@ -5570,16 +4789,11 @@ static int wqset_iterate_prepost_cb(struct waitq_set *wqset, void *ctx, if (ret == WQ_ITERATE_BREAK_KEEP_LOCKED) { ret = WQ_ITERATE_BREAK; - if (wctx->spl) - *(wctx->spl) = s; goto out; } out_unlock: waitq_unlock(waitq); - if (waitq_irq_safe(waitq)) - splx(s); - out: return ret; } @@ -5601,8 +4815,8 @@ int waitq_iterate_sets(struct waitq *waitq, void *ctx, waitq_iterator_t it) if (!it || !waitq) return KERN_INVALID_ARGUMENT; - ret = walk_setid_links(LINK_WALK_ONE_LEVEL, waitq, waitq->waitq_set_id, - SLT_WQS, (void *)&wctx, waitq_iterate_sets_cb); + ret = walk_waitq_links(LINK_WALK_ONE_LEVEL, waitq, waitq->waitq_set_id, + WQL_WQS, (void *)&wctx, waitq_iterate_sets_cb); if (ret == WQ_ITERATE_CONTINUE) ret = WQ_ITERATE_SUCCESS; return ret; @@ -5615,13 +4829,12 @@ int waitq_iterate_sets(struct waitq *waitq, void *ctx, waitq_iterator_t it) * 'wqset' is locked */ int waitq_set_iterate_preposts(struct waitq_set *wqset, - void *ctx, waitq_iterator_t it, spl_t *s) + void *ctx, waitq_iterator_t it) { struct wq_it_ctx wctx = { .input = (void *)wqset, .ctx = ctx, .it = it, - .spl = s, }; if (!it || !wqset) return WQ_ITERATE_INVALID; @@ -5639,20 +4852,20 @@ int waitq_set_iterate_preposts(struct waitq_set *wqset, * * ---------------------------------------------------------------------- */ + /** * declare a thread's intent to wait on 'waitq' for 'wait_event' * * Conditions: * 'waitq' is not locked - * will disable and re-enable interrupts while locking current_thread() */ wait_result_t waitq_assert_wait64(struct waitq *waitq, event64_t wait_event, wait_interrupt_t interruptible, - uint64_t deadline) + uint64_t deadline) { - 
wait_result_t ret; thread_t thread = current_thread(); + wait_result_t ret; spl_t s; if (!waitq_valid(waitq)) @@ -5660,20 +4873,15 @@ wait_result_t waitq_assert_wait64(struct waitq *waitq, if (waitq_irq_safe(waitq)) s = splsched(); - waitq_lock(waitq); - - if (!waitq_irq_safe(waitq)) - s = splsched(); - thread_lock(thread); + waitq_lock(waitq); ret = waitq_assert_wait64_locked(waitq, wait_event, interruptible, TIMEOUT_URGENCY_SYS_NORMAL, deadline, TIMEOUT_NO_LEEWAY, thread); - - thread_unlock(thread); waitq_unlock(waitq); - splx(s); + if (waitq_irq_safe(waitq)) + splx(s); return ret; } @@ -5701,19 +4909,14 @@ wait_result_t waitq_assert_wait64_leeway(struct waitq *waitq, if (waitq_irq_safe(waitq)) s = splsched(); - waitq_lock(waitq); - - if (!waitq_irq_safe(waitq)) - s = splsched(); - thread_lock(thread); + waitq_lock(waitq); ret = waitq_assert_wait64_locked(waitq, wait_event, interruptible, urgency, deadline, leeway, thread); - - thread_unlock(thread); waitq_unlock(waitq); - splx(s); + if (waitq_irq_safe(waitq)) + splx(s); return ret; } @@ -5739,9 +4942,13 @@ kern_return_t waitq_wakeup64_one(struct waitq *waitq, event64_t wake_event, if (!waitq_valid(waitq)) panic("Invalid waitq: %p", waitq); - /* NOTE: this will _not_ reserve anything if waitq is global */ - reserved_preposts = waitq_prepost_reserve(waitq, 0, - WAITQ_KEEP_LOCKED, &spl); + if (!waitq_irq_safe(waitq)) { + /* reserve preposts in addition to locking the waitq */ + reserved_preposts = waitq_prepost_reserve(waitq, 0, WAITQ_KEEP_LOCKED); + } else { + spl = splsched(); + waitq_lock(waitq); + } /* waitq is locked upon return */ kr = waitq_wakeup64_one_locked(waitq, wake_event, result, @@ -5779,12 +4986,14 @@ kern_return_t waitq_wakeup64_all(struct waitq *waitq, if (!waitq_valid(waitq)) panic("Invalid waitq: %p", waitq); - /* keep waitq locked upon return */ - /* NOTE: this will _not_ reserve anything if waitq is global */ - reserved_preposts = waitq_prepost_reserve(waitq, 0, - WAITQ_KEEP_LOCKED, &s); - - 
/* waitq is locked */ + if (!waitq_irq_safe(waitq)) { + /* reserve preposts in addition to locking waitq */ + reserved_preposts = waitq_prepost_reserve(waitq, 0, + WAITQ_KEEP_LOCKED); + } else { + s = splsched(); + waitq_lock(waitq); + } ret = waitq_wakeup64_all_locked(waitq, wake_event, result, &reserved_preposts, priority, @@ -5844,3 +5053,60 @@ kern_return_t waitq_wakeup64_thread(struct waitq *waitq, return ret; } + +/** + * wakeup a single thread from a waitq that's waiting for a given event + * and return a reference to that thread + * returns THREAD_NULL if no thread was waiting + * + * Conditions: + * 'waitq' is not locked + * may (rarely) block if 'waitq' is non-global and a member of 1 or more sets + * may disable and re-enable interrupts + * + * Notes: + * will _not_ block if waitq is global (or not a member of any set) + */ +thread_t +waitq_wakeup64_identify(struct waitq *waitq, + event64_t wake_event, + wait_result_t result, + int priority) +{ + uint64_t reserved_preposts = 0; + spl_t thread_spl = 0; + thread_t thread; + spl_t spl; + + if (!waitq_valid(waitq)) + panic("Invalid waitq: %p", waitq); + + if (!waitq_irq_safe(waitq)) { + /* reserve preposts in addition to locking waitq */ + reserved_preposts = waitq_prepost_reserve(waitq, 0, WAITQ_KEEP_LOCKED); + } else { + spl = splsched(); + waitq_lock(waitq); + } + + thread = waitq_wakeup64_identify_locked(waitq, wake_event, result, + &thread_spl, &reserved_preposts, + priority, WAITQ_UNLOCK); + /* waitq is unlocked, thread is locked */ + + if (thread != THREAD_NULL) { + thread_reference(thread); + thread_unlock(thread); + splx(thread_spl); + } + + if (waitq_irq_safe(waitq)) + splx(spl); + + /* release any left-over prepost object (won't block/lock anything) */ + waitq_prepost_release_reserve(reserved_preposts); + + /* returns +1 ref to running thread or THREAD_NULL */ + return thread; +} + diff --git a/osfmk/kern/waitq.h b/osfmk/kern/waitq.h index 92751fa64..af9bb9eed 100644 --- a/osfmk/kern/waitq.h +++ 
b/osfmk/kern/waitq.h @@ -44,6 +44,7 @@ */ #define WAITQ_ALL_PRIORITIES (-1) #define WAITQ_PROMOTE_PRIORITY (-2) +#define WAITQ_SELECT_MAX_PRI (-3) typedef enum e_waitq_lock_state { WAITQ_KEEP_LOCKED = 0x01, @@ -53,24 +54,48 @@ typedef enum e_waitq_lock_state { WAITQ_DONT_LOCK = 0x10, } waitq_lock_state_t; +/* + * The Jenkins "one at a time" hash. + * TBD: There may be some value to unrolling here, + * depending on the architecture. + */ +static __inline__ uint32_t +jenkins_hash(char *key, size_t length) +{ + uint32_t hash = 0; + size_t i; + + for (i = 0; i < length; i++) { + hash += (uint32_t)key[i]; + hash += (hash << 10); + hash ^= (hash >> 6); + } + + hash += (hash << 3); + hash ^= (hash >> 11); + hash += (hash << 15); + + return hash; +} + +/* Opaque sizes and alignment used for struct verification */ +#if __x86_64__ + #define WQ_OPAQUE_ALIGN 8 + #define WQS_OPAQUE_ALIGN 8 + #define WQ_OPAQUE_SIZE 48 + #define WQS_OPAQUE_SIZE 64 +#else + #error Unknown size requirement +#endif + #ifndef MACH_KERNEL_PRIVATE /* * The opaque waitq structure is here mostly for AIO and selinfo, * but could potentially be used by other BSD subsystems. */ -#ifndef __LP64__ - struct waitq { char opaque[32]; }; - struct waitq_set { char opaque[48]; }; -#else - #if defined(__x86_64__) - struct waitq { char opaque[48]; }; - struct waitq_set { char opaque[64]; }; - #else - struct waitq { char opaque[40]; }; - struct waitq_set { char opaque[56]; }; - #endif /* !x86_64 */ -#endif /* __LP64__ */ +struct waitq { char opaque[WQ_OPAQUE_SIZE]; } __attribute__((aligned(WQ_OPAQUE_ALIGN))); +struct waitq_set { char opaque[WQS_OPAQUE_SIZE]; } __attribute__((aligned(WQS_OPAQUE_ALIGN))); #else /* MACH_KERNEL_PRIVATE */ @@ -90,7 +115,7 @@ typedef enum e_waitq_lock_state { * New plan: this is an optimization anyway, so I'm stealing 32bits * from the mask to shrink the waitq object even further. 
*/ -#define _EVENT_MASK_BITS ((sizeof(uint32_t) * 8) - 5) +#define _EVENT_MASK_BITS ((sizeof(uint32_t) * 8) - 6) #define WAITQ_BOOST_PRIORITY 31 @@ -135,6 +160,7 @@ struct waitq { waitq_fifo:1, /* fifo wakeup policy? */ waitq_prepost:1, /* waitq supports prepost? */ waitq_irq:1, /* waitq requires interrupts disabled */ + waitq_isvalid:1, /* waitq structure is valid */ waitq_eventmask:_EVENT_MASK_BITS; /* the wait queue set (set-of-sets) to which this queue belongs */ hw_lock_data_t waitq_interlock; /* interlock */ @@ -144,6 +170,9 @@ struct waitq { queue_head_t waitq_queue; /* queue of elements */ }; +static_assert(sizeof(struct waitq) == WQ_OPAQUE_SIZE, "waitq structure size mismatch"); +static_assert(__alignof(struct waitq) == WQ_OPAQUE_ALIGN, "waitq structure alignment mismatch"); + /* * struct waitq_set * @@ -152,9 +181,15 @@ struct waitq { struct waitq_set { struct waitq wqset_q; uint64_t wqset_id; - uint64_t wqset_prepost_id; + union { + uint64_t wqset_prepost_id; + void *wqset_prepost_hook; + }; }; +static_assert(sizeof(struct waitq_set) == WQS_OPAQUE_SIZE, "waitq_set structure size mismatch"); +static_assert(__alignof(struct waitq_set) == WQS_OPAQUE_ALIGN, "waitq_set structure alignment mismatch"); + extern void waitq_bootstrap(void); #define waitq_is_queue(wq) \ @@ -167,17 +202,26 @@ extern void waitq_bootstrap(void); (((wqs)->wqset_q.waitq_type == WQT_SET) && ((wqs)->wqset_id != 0)) #define waitq_valid(wq) \ - ((wq) != NULL && ((wq)->waitq_type & ~1) == WQT_QUEUE) + ((wq) != NULL && (wq)->waitq_isvalid && ((wq)->waitq_type & ~1) == WQT_QUEUE) + +/* + * Invalidate a waitq. 
The only valid waitq functions to call after this are: + * waitq_deinit() + * waitq_set_deinit() + */ +extern void waitq_invalidate_locked(struct waitq *wq); #define waitq_empty(wq) \ (queue_empty(&(wq)->waitq_queue)) + #define waitq_held(wq) \ (hw_lock_held(&(wq)->waitq_interlock)) #define waitq_lock_try(wq) \ (hw_lock_try(&(wq)->waitq_interlock)) + #define waitq_wait_possible(thread) \ ((thread)->waitq == NULL) @@ -191,6 +235,9 @@ extern void waitq_unlock(struct waitq *wq); (wqs)->wqset_q.waitq_prepost) #define waitq_set_maybe_preposted(wqs) ((wqs)->wqset_q.waitq_prepost && \ (wqs)->wqset_prepost_id > 0) +#define waitq_set_has_prepost_hook(wqs) (waitqs_is_set(wqs) && \ + !((wqs)->wqset_q.waitq_prepost) && \ + (wqs)->wqset_prepost_hook) /* assert intent to wait on a locked wait queue */ extern wait_result_t waitq_assert_wait64_locked(struct waitq *waitq, @@ -202,7 +249,7 @@ extern wait_result_t waitq_assert_wait64_locked(struct waitq *waitq, thread_t thread); /* pull a thread from its wait queue */ -extern void waitq_pull_thread_locked(struct waitq *waitq, thread_t thread); +extern int waitq_pull_thread_locked(struct waitq *waitq, thread_t thread); /* wakeup all threads waiting for a particular event on locked queue */ extern kern_return_t waitq_wakeup64_all_locked(struct waitq *waitq, @@ -221,12 +268,14 @@ extern kern_return_t waitq_wakeup64_one_locked(struct waitq *waitq, waitq_lock_state_t lock_state); /* return identity of a thread awakened for a particular */ -extern thread_t waitq_wakeup64_identity_locked(struct waitq *waitq, - event64_t wake_event, - wait_result_t result, - spl_t *spl, - uint64_t *reserved_preposts, - waitq_lock_state_t lock_state); +extern thread_t +waitq_wakeup64_identify_locked(struct waitq *waitq, + event64_t wake_event, + wait_result_t result, + spl_t *spl, + uint64_t *reserved_preposts, + int priority, + waitq_lock_state_t lock_state); /* wakeup thread iff its still waiting for a particular event on locked queue */ extern 
kern_return_t waitq_wakeup64_thread_locked(struct waitq *waitq, @@ -236,16 +285,18 @@ extern kern_return_t waitq_wakeup64_thread_locked(struct waitq *waitq, waitq_lock_state_t lock_state); /* clear all preposts generated by the given waitq */ -extern int waitq_clear_prepost_locked(struct waitq *waitq, spl_t *s); +extern int waitq_clear_prepost_locked(struct waitq *waitq); /* clear all preposts from the given wait queue set */ extern void waitq_set_clear_preposts_locked(struct waitq_set *wqset); -/* unlink the given waitq from all sets */ -extern kern_return_t waitq_unlink_all_locked(struct waitq *waitq, - uint64_t *old_set_id, - spl_t *s, - int *dropped_lock); +/* unlink the given waitq from all sets - returns unlocked */ +extern kern_return_t waitq_unlink_all_unlock(struct waitq *waitq); + +/* unlink the given waitq set from all waitqs and waitq sets - returns unlocked */ +extern kern_return_t waitq_set_unlink_all_unlock(struct waitq_set *wqset); + + /* * clear a thread's boosted priority @@ -283,13 +334,13 @@ extern int waitq_iterate_sets(struct waitq *waitq, void *ctx, /* iterator over all waitqs that have preposted to wqset */ extern int waitq_set_iterate_preposts(struct waitq_set *wqset, - void *ctx, waitq_iterator_t it, spl_t *s); + void *ctx, waitq_iterator_t it); /* * prepost reservation */ extern uint64_t waitq_prepost_reserve(struct waitq *waitq, int extra, - waitq_lock_state_t lock_state, spl_t *s); + waitq_lock_state_t lock_state); extern void waitq_prepost_release_reserve(uint64_t id); @@ -315,10 +366,11 @@ extern struct waitq *global_waitq(int index); /* * set alloc/init/free */ -extern struct waitq_set *waitq_set_alloc(int policy); +extern struct waitq_set *waitq_set_alloc(int policy, void *prepost_hook); extern kern_return_t waitq_set_init(struct waitq_set *wqset, - int policy, uint64_t *reserved_link); + int policy, uint64_t *reserved_link, + void *prepost_hook); extern void waitq_set_deinit(struct waitq_set *wqset); @@ -358,7 +410,6 @@ extern 
kern_return_t waitq_unlink_all(struct waitq *waitq); extern kern_return_t waitq_set_unlink_all(struct waitq_set *wqset); - /* * preposts */ @@ -441,11 +492,23 @@ extern kern_return_t waitq_wakeup64_all(struct waitq *waitq, wait_result_t result, int priority); +#ifdef XNU_KERNEL_PRIVATE + /* wakeup a specified thread iff it's waiting on pair */ extern kern_return_t waitq_wakeup64_thread(struct waitq *waitq, event64_t wake_event, thread_t thread, wait_result_t result); + +/* return a reference to the thread that was woken up */ +extern thread_t +waitq_wakeup64_identify(struct waitq *waitq, + event64_t wake_event, + wait_result_t result, + int priority); + +#endif /* XNU_KERNEL_PRIVATE */ + __END_DECLS #endif /* KERNEL_PRIVATE */ diff --git a/osfmk/kern/xpr.c b/osfmk/kern/xpr.c index 8fbaca22d..a08724be9 100644 --- a/osfmk/kern/xpr.c +++ b/osfmk/kern/xpr.c @@ -93,7 +93,7 @@ xpr( long arg5) { spl_t s; - register struct xprbuf *x; + struct xprbuf *x; /* If we aren't initialized, ignore trace request */ if (!xprenable || (xprptr == 0)) diff --git a/osfmk/kern/zalloc.c b/osfmk/kern/zalloc.c index 351268d34..78ace5f99 100644 --- a/osfmk/kern/zalloc.c +++ b/osfmk/kern/zalloc.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2014 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -63,7 +63,6 @@ * data blocks for which quick allocation/deallocation is possible. */ #include -#include #include #include @@ -76,6 +75,7 @@ #include #include +#include #include #include #include @@ -85,7 +85,6 @@ #include #include #include -#include #include #include @@ -102,51 +101,12 @@ #include /* - * ZONE_ALIAS_ADDR - * - * With this option enabled, zones with alloc_size <= PAGE_SIZE allocate - * a virtual page from the zone_map, but before zcram-ing the allocated memory - * into the zone, the page is translated to use the alias address of the page - * in the static kernel region. 
zone_gc reverses that translation when - * scanning the freelist to collect free pages so that it can look up the page - * in the zone_page_table, and free it to kmem_free. - * - * The static kernel region is a flat 1:1 mapping of physical memory passed - * to xnu by the booter. It is mapped to the range: - * [gVirtBase, gVirtBase + gPhysSize] - * - * Accessing memory via the static kernel region is faster due to the - * entire region being mapped via large pages, cutting down - * on TLB misses. - * - * zinit favors using PAGE_SIZE backing allocations for a zone unless it would - * waste more than 10% space to use a single page, in order to take advantage - * of the speed benefit for as many zones as possible. - * - * Zones with > PAGE_SIZE allocations can't take advantage of this - * because kernel_memory_allocate doesn't give out physically contiguous pages. - * - * zone_virtual_addr() - * - translates an address from the static kernel region to the zone_map - * - returns the same address if it's not from the static kernel region - * It relies on the fact that a physical page mapped to the - * zone_map is not mapped anywhere else (except the static kernel region). 
- * - * zone_alias_addr() - * - translates a virtual memory address from the zone_map to the - * corresponding address in the static kernel region - * + * ZONE_ALIAS_ADDR (deprecated) */ -#if !ZONE_ALIAS_ADDR #define from_zone_map(addr, size) \ ((vm_offset_t)(addr) >= zone_map_min_address && \ ((vm_offset_t)(addr) + size - 1) < zone_map_max_address ) -#else -#define from_zone_map(addr, size) \ - ((vm_offset_t)(zone_virtual_addr((vm_map_address_t)(uintptr_t)addr)) >= zone_map_min_address && \ - ((vm_offset_t)(zone_virtual_addr((vm_map_address_t)(uintptr_t)addr)) + size -1) < zone_map_max_address ) -#endif /* * Zone Corruption Debugging @@ -352,9 +312,6 @@ zp_init(void) #endif } -/* zone_map page count for page table structure */ -uint64_t zone_map_table_page_count = 0; - /* * These macros are used to keep track of the number * of pages being used by the zone currently. The @@ -370,6 +327,8 @@ uint64_t zone_map_table_page_count = 0; OSAddAtomic64(-count, &(z->page_count)); \ } +vm_map_t zone_map = VM_MAP_NULL; + /* for is_sane_zone_element and garbage collection */ vm_offset_t zone_map_min_address = 0; /* initialized in zone_init */ @@ -382,6 +341,13 @@ static unsigned int bool_gen_seed[RANDOM_BOOL_GEN_SEED_COUNT]; static unsigned int bool_gen_global = 0; decl_simple_lock_data(, bool_gen_lock) +/* VM region for all metadata structures */ +vm_offset_t zone_metadata_region_min = 0; +vm_offset_t zone_metadata_region_max = 0; +decl_lck_mtx_data(static ,zone_metadata_region_lck) +lck_attr_t zone_metadata_lock_attr; +lck_mtx_ext_t zone_metadata_region_lck_ext; + /* Helpful for walking through a zone's free element list. 
*/ struct zone_free_element { struct zone_free_element *next; @@ -389,14 +355,126 @@ struct zone_free_element { /* void *backup_ptr; */ }; +/* + * Protects num_zones, zone_array and zone_array_index + */ +decl_simple_lock_data(, all_zones_lock) +unsigned int num_zones; + +#define MAX_ZONES 256 +struct zone zone_array[MAX_ZONES]; +static int zone_array_index = 0; + +#define MULTIPAGE_METADATA_MAGIC (0xff) + +#define PAGE_METADATA_GET_ZINDEX(page_meta) \ + (page_meta->zindex) + +#define PAGE_METADATA_GET_ZONE(page_meta) \ + (&(zone_array[page_meta->zindex])) + +#define PAGE_METADATA_SET_ZINDEX(page_meta, index) \ + page_meta->zindex = (index); + struct zone_page_metadata { - queue_chain_t pages; - struct zone_free_element *elements; - zone_t zone; - uint16_t alloc_count; - uint16_t free_count; + queue_chain_t pages; /* linkage pointer for metadata lists */ + + /* Union for maintaining start of element free list and real metadata (for multipage allocations) */ + union { + /* + * The start of the freelist can be maintained as a 32-bit offset instead of a pointer because + * the free elements would be at max ZONE_MAX_ALLOC_SIZE bytes away from the metadata. Offset + * from start of the allocation chunk to free element list head. + */ + uint32_t freelist_offset; + /* + * This field is used to lookup the real metadata for multipage allocations, where we mark the + * metadata for all pages except the first as "fake" metadata using MULTIPAGE_METADATA_MAGIC. + * Offset from this fake metadata to real metadata of allocation chunk (-ve offset). + */ + uint32_t real_metadata_offset; + }; + + /* + * For the first page in the allocation chunk, this represents the total number of free elements in + * the chunk. 
+ * For all other pages, it represents the number of free elements on that page (used + * for garbage collection of zones with large multipage allocation size) + */ + uint16_t free_count; + uint8_t zindex; /* Zone index within the zone_array */ + uint8_t page_count; /* Count of pages within the allocation chunk */ }; +/* Macro to get page index (within zone_map) of page containing element */ +#define PAGE_INDEX_FOR_ELEMENT(element) \ + (((vm_offset_t)trunc_page(element) - zone_map_min_address) / PAGE_SIZE) + +/* Macro to get metadata structure given a page index in zone_map */ +#define PAGE_METADATA_FOR_PAGE_INDEX(index) \ + (zone_metadata_region_min + ((index) * sizeof(struct zone_page_metadata))) + +/* Macro to get index (within zone_map) for given metadata */ +#define PAGE_INDEX_FOR_METADATA(page_meta) \ + (((vm_offset_t)page_meta - zone_metadata_region_min) / sizeof(struct zone_page_metadata)) + +/* Macro to get page for given page index in zone_map */ +#define PAGE_FOR_PAGE_INDEX(index) \ + (zone_map_min_address + (PAGE_SIZE * (index))) + +/* Macro to get the actual metadata for a given address */ +#define PAGE_METADATA_FOR_ELEMENT(element) \ + (struct zone_page_metadata *)(PAGE_METADATA_FOR_PAGE_INDEX(PAGE_INDEX_FOR_ELEMENT(element))) + +/* Magic value to indicate empty element free list */ +#define PAGE_METADATA_EMPTY_FREELIST ((uint32_t)(~0)) + +static inline void * +page_metadata_get_freelist(struct zone_page_metadata *page_meta) +{ + assert(PAGE_METADATA_GET_ZINDEX(page_meta) != MULTIPAGE_METADATA_MAGIC); + if (page_meta->freelist_offset == PAGE_METADATA_EMPTY_FREELIST) + return NULL; + else { + if (from_zone_map(page_meta, sizeof(struct zone_page_metadata))) + return (void *)(PAGE_FOR_PAGE_INDEX(PAGE_INDEX_FOR_METADATA(page_meta)) + page_meta->freelist_offset); + else + return (void *)((vm_offset_t)page_meta + page_meta->freelist_offset); + } +} + +static inline void +page_metadata_set_freelist(struct zone_page_metadata *page_meta, void *addr) +{ + 
assert(PAGE_METADATA_GET_ZINDEX(page_meta) != MULTIPAGE_METADATA_MAGIC); + if (addr == NULL) + page_meta->freelist_offset = PAGE_METADATA_EMPTY_FREELIST; + else { + if (from_zone_map(page_meta, sizeof(struct zone_page_metadata))) + page_meta->freelist_offset = (uint32_t)((vm_offset_t)(addr) - PAGE_FOR_PAGE_INDEX(PAGE_INDEX_FOR_METADATA(page_meta))); + else + page_meta->freelist_offset = (uint32_t)((vm_offset_t)(addr) - (vm_offset_t)page_meta); + } +} + +static inline struct zone_page_metadata * +page_metadata_get_realmeta(struct zone_page_metadata *page_meta) +{ + assert(PAGE_METADATA_GET_ZINDEX(page_meta) == MULTIPAGE_METADATA_MAGIC); + return (struct zone_page_metadata *)((vm_offset_t)page_meta - page_meta->real_metadata_offset); +} + +static inline void +page_metadata_set_realmeta(struct zone_page_metadata *page_meta, struct zone_page_metadata *real_meta) +{ + assert(PAGE_METADATA_GET_ZINDEX(page_meta) == MULTIPAGE_METADATA_MAGIC); + assert(PAGE_METADATA_GET_ZINDEX(real_meta) != MULTIPAGE_METADATA_MAGIC); + assert((vm_offset_t)page_meta > (vm_offset_t)real_meta); + vm_offset_t offset = (vm_offset_t)page_meta - (vm_offset_t)real_meta; + assert(offset <= UINT32_MAX); + page_meta->real_metadata_offset = (uint32_t)offset; +} + /* The backup pointer is stored in the last pointer-sized location in an element. */ static inline vm_offset_t * get_backup_ptr(vm_size_t elem_size, @@ -405,10 +483,97 @@ get_backup_ptr(vm_size_t elem_size, return (vm_offset_t *) ((vm_offset_t)element + elem_size - sizeof(vm_offset_t)); } +/* + * Routine to populate a page backing metadata in the zone_metadata_region. + * Must be called without the zone lock held as it might potentially block. 
+ */ +static inline void +zone_populate_metadata_page(struct zone_page_metadata *page_meta) +{ + vm_offset_t page_metadata_begin = trunc_page(page_meta); + vm_offset_t page_metadata_end = trunc_page((vm_offset_t)page_meta + sizeof(struct zone_page_metadata)); + + for(;page_metadata_begin <= page_metadata_end; page_metadata_begin += PAGE_SIZE) { + if (pmap_find_phys(kernel_pmap, (vm_map_address_t)page_metadata_begin)) + continue; + /* All updates to the zone_metadata_region are done under the zone_metadata_region_lck */ + lck_mtx_lock(&zone_metadata_region_lck); + if (0 == pmap_find_phys(kernel_pmap, (vm_map_address_t)page_metadata_begin)) { + kernel_memory_populate(zone_map, + page_metadata_begin, + PAGE_SIZE, + KMA_KOBJECT, + VM_KERN_MEMORY_OSFMK); + } + lck_mtx_unlock(&zone_metadata_region_lck); + } + return; +} + +static inline uint16_t +get_metadata_alloc_count(struct zone_page_metadata *page_meta) +{ + assert(PAGE_METADATA_GET_ZINDEX(page_meta) != MULTIPAGE_METADATA_MAGIC); + struct zone *z = PAGE_METADATA_GET_ZONE(page_meta); + return ((page_meta->page_count * PAGE_SIZE) / z->elem_size); +} + +/* + * Routine to lookup metadata for any given address. + * If init is marked as TRUE, this should be called without holding the zone lock + * since the initialization might block. + */ static inline struct zone_page_metadata * -get_zone_page_metadata(struct zone_free_element *element) +get_zone_page_metadata(struct zone_free_element *element, boolean_t init) +{ + struct zone_page_metadata *page_meta = 0; + + if (from_zone_map(element, sizeof(struct zone_free_element))) { + page_meta = (struct zone_page_metadata *)(PAGE_METADATA_FOR_ELEMENT(element)); + if (init) + zone_populate_metadata_page(page_meta); + } else { + page_meta = (struct zone_page_metadata *)(trunc_page((vm_offset_t)element)); + } + if (init) + bzero((char *)page_meta, sizeof(struct zone_page_metadata)); + return ((PAGE_METADATA_GET_ZINDEX(page_meta) != MULTIPAGE_METADATA_MAGIC) ? 
page_meta : page_metadata_get_realmeta(page_meta)); +} + +/* Routine to get the page for a given metadata */ +static inline vm_offset_t +get_zone_page(struct zone_page_metadata *page_meta) { - return (struct zone_page_metadata *)(trunc_page((vm_offset_t)element)); + if (from_zone_map(page_meta, sizeof(struct zone_page_metadata))) + return (vm_offset_t)(PAGE_FOR_PAGE_INDEX(PAGE_INDEX_FOR_METADATA(page_meta))); + else + return (vm_offset_t)(trunc_page(page_meta)); +} + +/* Routine to get the size of a zone allocated address. If the address doesnt belong to the + * zone_map, returns 0. + */ +vm_size_t +zone_element_size(void *addr, zone_t *z) +{ + struct zone *src_zone; + if (from_zone_map(addr, sizeof(void *))) { + struct zone_page_metadata *page_meta = get_zone_page_metadata((struct zone_free_element *)addr, FALSE); + src_zone = PAGE_METADATA_GET_ZONE(page_meta); + if (z) { + *z = src_zone; + } + return (src_zone->elem_size); + } else { +#if CONFIG_GZALLOC + vm_size_t gzsize; + if (gzalloc_element_size(addr, z, &gzsize)) { + return gzsize; + } +#endif /* CONFIG_GZALLOC */ + + return 0; + } } /* @@ -435,23 +600,6 @@ is_sane_zone_ptr(zone_t zone, * zone using foreign memory is properly tagged with allows_foreign */ if (zone->collectable && !zone->allows_foreign) { -#if ZONE_ALIAS_ADDR - /* - * If this address is in the static kernel region, it might be - * the alias address of a valid zone element. - * If we tried to find the zone_virtual_addr() of an invalid - * address in the static kernel region, it will panic, so don't - * check addresses in this region. 
- * - * TODO: Use a safe variant of zone_virtual_addr to - * make this check more accurate - * - * The static kernel region is mapped at: - * [gVirtBase, gVirtBase + gPhysSize] - */ - if ((addr - gVirtBase) < gPhysSize) - return TRUE; -#endif /* check if addr is from zone map */ if (addr >= zone_map_min_address && (addr + obj_size - 1) < zone_map_max_address ) @@ -516,9 +664,11 @@ backup_ptr_mismatch_panic(zone_t zone, vm_offset_t backup) { vm_offset_t likely_backup; + vm_offset_t likely_primary; + likely_primary = primary ^ zp_nopoison_cookie; boolean_t sane_backup; - boolean_t sane_primary = is_sane_zone_element(zone, primary); + boolean_t sane_primary = is_sane_zone_element(zone, likely_primary); boolean_t element_was_poisoned = (backup & 0x1) ? TRUE : FALSE; #if defined(__LP64__) @@ -539,7 +689,7 @@ backup_ptr_mismatch_panic(zone_t zone, /* The primary is definitely the corrupted one */ if (!sane_primary && sane_backup) - zone_element_was_modified_panic(zone, element, primary, likely_backup, 0); + zone_element_was_modified_panic(zone, element, primary, (likely_backup ^ zp_nopoison_cookie), 0); /* The backup is definitely the corrupted one */ if (sane_primary && !sane_backup) @@ -560,58 +710,6 @@ backup_ptr_mismatch_panic(zone_t zone, zone_element_was_modified_panic(zone, element, primary, likely_backup, 0); } -/* - * Sets the next element of tail to elem. - * elem can be NULL. - * Preserves the poisoning state of the element. 
- */ -static inline void -append_zone_element(zone_t zone, - struct zone_free_element *tail, - struct zone_free_element *elem) -{ - vm_offset_t *backup = get_backup_ptr(zone->elem_size, (vm_offset_t *) tail); - - vm_offset_t old_backup = *backup; - - vm_offset_t old_next = (vm_offset_t) tail->next; - vm_offset_t new_next = (vm_offset_t) elem; - - if (old_next == (old_backup ^ zp_nopoison_cookie)) - *backup = new_next ^ zp_nopoison_cookie; - else if (old_next == (old_backup ^ zp_poisoned_cookie)) - *backup = new_next ^ zp_poisoned_cookie; - else - backup_ptr_mismatch_panic(zone, - (vm_offset_t) tail, - old_next, - old_backup); - - tail->next = elem; -} - - -/* - * Insert a linked list of elements (delineated by head and tail) at the head of - * the zone free list. Every element in the list being added has already gone - * through append_zone_element, so their backup pointers are already - * set properly. - * Precondition: There should be no elements after tail - */ -static inline void -add_list_to_zone(zone_t zone, - struct zone_free_element *head, - struct zone_free_element *tail) -{ - assert(tail->next == NULL); - assert(!zone->use_page_list); - - append_zone_element(zone, tail, zone->free_elements); - - zone->free_elements = head; -} - - /* * Adds the element to the head of the zone's free list * Keeps a backup next-pointer at the end of the element @@ -627,13 +725,9 @@ free_to_zone(zone_t zone, vm_offset_t *primary = (vm_offset_t *) element; vm_offset_t *backup = get_backup_ptr(zone->elem_size, primary); - if (zone->use_page_list) { - page_meta = get_zone_page_metadata((struct zone_free_element *)element); - assert(page_meta->zone == zone); - old_head = (vm_offset_t)page_meta->elements; - } else { - old_head = (vm_offset_t)zone->free_elements; - } + page_meta = get_zone_page_metadata((struct zone_free_element *)element, FALSE); + assert(PAGE_METADATA_GET_ZONE(page_meta) == zone); + old_head = (vm_offset_t)page_metadata_get_freelist(page_meta); #if MACH_ASSERT if 
(__improbable(!is_sane_zone_element(zone, old_head))) @@ -654,30 +748,28 @@ free_to_zone(zone_t zone, *backup = old_head ^ (poison ? zp_poisoned_cookie : zp_nopoison_cookie); - /* Insert this element at the head of the free list */ - *primary = old_head; - if (zone->use_page_list) { - page_meta->elements = (struct zone_free_element *)element; - page_meta->free_count++; - if (zone->allows_foreign && !from_zone_map(element, zone->elem_size)) { - if (page_meta->free_count == 1) { - /* first foreign element freed on page, move from all_used */ - remqueue((queue_entry_t)page_meta); - enqueue_tail(&zone->pages.any_free_foreign, (queue_entry_t)page_meta); - } else { - /* no other list transitions */ - } - } else if (page_meta->free_count == page_meta->alloc_count) { - /* whether the page was on the intermediate or all_used, queue, move it to free */ - remqueue((queue_entry_t)page_meta); - enqueue_tail(&zone->pages.all_free, (queue_entry_t)page_meta); - } else if (page_meta->free_count == 1) { - /* first free element on page, move from all_used */ - remqueue((queue_entry_t)page_meta); - enqueue_tail(&zone->pages.intermediate, (queue_entry_t)page_meta); + /* + * Insert this element at the head of the free list. We also xor the + * primary pointer with the zp_nopoison_cookie to make sure a free + * element does not provide the location of the next free element directly. 
+ */ + *primary = old_head ^ zp_nopoison_cookie; + page_metadata_set_freelist(page_meta, (struct zone_free_element *)element); + page_meta->free_count++; + if (zone->allows_foreign && !from_zone_map(element, zone->elem_size)) { + if (page_meta->free_count == 1) { + /* first foreign element freed on page, move from all_used */ + re_queue_tail(&zone->pages.any_free_foreign, &(page_meta->pages)); + } else { + /* no other list transitions */ } - } else { - zone->free_elements = (struct zone_free_element *)element; + } else if (page_meta->free_count == get_metadata_alloc_count(page_meta)) { + /* whether the page was on the intermediate or all_used, queue, move it to free */ + re_queue_tail(&zone->pages.all_free, &(page_meta->pages)); + zone->count_all_free_pages += page_meta->page_count; + } else if (page_meta->free_count == 1) { + /* first free element on page, move from all_used */ + re_queue_tail(&zone->pages.intermediate, &(page_meta->pages)); } zone->count--; zone->countfree++; @@ -699,40 +791,37 @@ try_alloc_from_zone(zone_t zone, *check_poison = FALSE; /* if zone is empty, bail */ - if (zone->use_page_list) { - if (zone->allows_foreign && !queue_empty(&zone->pages.any_free_foreign)) - page_meta = (struct zone_page_metadata *)queue_first(&zone->pages.any_free_foreign); - else if (!queue_empty(&zone->pages.intermediate)) - page_meta = (struct zone_page_metadata *)queue_first(&zone->pages.intermediate); - else if (!queue_empty(&zone->pages.all_free)) - page_meta = (struct zone_page_metadata *)queue_first(&zone->pages.all_free); - else { - return 0; - } - - /* Check if page_meta passes is_sane_zone_element */ - if (__improbable(!is_sane_zone_page_metadata(zone, (vm_offset_t)page_meta))) - panic("zalloc: invalid metadata structure %p for freelist of zone %s\n", - (void *) page_meta, zone->zone_name); - assert(page_meta->zone == zone); - element = (vm_offset_t)page_meta->elements; + if (zone->allows_foreign && !queue_empty(&zone->pages.any_free_foreign)) + page_meta = 
(struct zone_page_metadata *)queue_first(&zone->pages.any_free_foreign); + else if (!queue_empty(&zone->pages.intermediate)) + page_meta = (struct zone_page_metadata *)queue_first(&zone->pages.intermediate); + else if (!queue_empty(&zone->pages.all_free)) { + page_meta = (struct zone_page_metadata *)queue_first(&zone->pages.all_free); + assert(zone->count_all_free_pages >= page_meta->page_count); + zone->count_all_free_pages -= page_meta->page_count; } else { - if (zone->free_elements == NULL) - return 0; - - element = (vm_offset_t)zone->free_elements; + return 0; } + /* Check if page_meta passes is_sane_zone_element */ + if (__improbable(!is_sane_zone_page_metadata(zone, (vm_offset_t)page_meta))) + panic("zalloc: invalid metadata structure %p for freelist of zone %s\n", + (void *) page_meta, zone->zone_name); + assert(PAGE_METADATA_GET_ZONE(page_meta) == zone); + element = (vm_offset_t)page_metadata_get_freelist(page_meta); -#if MACH_ASSERT - if (__improbable(!is_sane_zone_element(zone, element))) + if (__improbable(!is_sane_zone_ptr(zone, element, zone->elem_size))) panic("zfree: invalid head pointer %p for freelist of zone %s\n", (void *) element, zone->zone_name); -#endif vm_offset_t *primary = (vm_offset_t *) element; vm_offset_t *backup = get_backup_ptr(zone->elem_size, primary); - vm_offset_t next_element = *primary; + /* + * Since the primary next pointer is xor'ed with zp_nopoison_cookie + * for obfuscation, retrieve the original value back + */ + vm_offset_t next_element = *primary ^ zp_nopoison_cookie; + vm_offset_t next_element_primary = *primary; vm_offset_t next_element_backup = *backup; /* @@ -740,7 +829,7 @@ try_alloc_from_zone(zone_t zone, * should have been, and print it appropriately */ if (__improbable(!is_sane_zone_element(zone, next_element))) - backup_ptr_mismatch_panic(zone, element, next_element, next_element_backup); + backup_ptr_mismatch_panic(zone, element, next_element_primary, next_element_backup); /* Check the backup pointer for the 
regular cookie */ if (__improbable(next_element != (next_element_backup ^ zp_nopoison_cookie))) { @@ -748,7 +837,7 @@ try_alloc_from_zone(zone_t zone, /* Check for the poisoned cookie instead */ if (__improbable(next_element != (next_element_backup ^ zp_poisoned_cookie))) /* Neither cookie is valid, corruption has occurred */ - backup_ptr_mismatch_panic(zone, element, next_element, next_element_backup); + backup_ptr_mismatch_panic(zone, element, next_element_primary, next_element_backup); /* * Element was marked as poisoned, so check its integrity before using it. @@ -756,46 +845,32 @@ try_alloc_from_zone(zone_t zone, *check_poison = TRUE; } - if (zone->use_page_list) { - - /* Make sure the page_meta is at the correct offset from the start of page */ - if (__improbable(page_meta != get_zone_page_metadata((struct zone_free_element *)element))) - panic("zalloc: metadata located at incorrect location on page of zone %s\n", - zone->zone_name); - - /* Make sure next_element belongs to the same page as page_meta */ - if (next_element) { - if (__improbable(page_meta != get_zone_page_metadata((struct zone_free_element *)next_element))) - panic("zalloc: next element pointer %p for element %p points to invalid element for zone %s\n", - (void *)next_element, (void *)element, zone->zone_name); - } + /* Make sure the page_meta is at the correct offset from the start of page */ + if (__improbable(page_meta != get_zone_page_metadata((struct zone_free_element *)element, FALSE))) + panic("zalloc: Incorrect metadata %p found in zone %s page queue. 
Expected metadata: %p\n", + page_meta, zone->zone_name, get_zone_page_metadata((struct zone_free_element *)element, FALSE)); + + /* Make sure next_element belongs to the same page as page_meta */ + if (next_element) { + if (__improbable(page_meta != get_zone_page_metadata((struct zone_free_element *)next_element, FALSE))) + panic("zalloc: next element pointer %p for element %p points to invalid element for zone %s\n", + (void *)next_element, (void *)element, zone->zone_name); } /* Remove this element from the free list */ - if (zone->use_page_list) { + page_metadata_set_freelist(page_meta, (struct zone_free_element *)next_element); + page_meta->free_count--; - page_meta->elements = (struct zone_free_element *)next_element; - page_meta->free_count--; - - if (zone->allows_foreign && !from_zone_map(element, zone->elem_size)) { - if (page_meta->free_count == 0) { - /* move to all used */ - remqueue((queue_entry_t)page_meta); - enqueue_tail(&zone->pages.all_used, (queue_entry_t)page_meta); - } else { - /* no other list transitions */ + if (page_meta->free_count == 0) { + /* move to all used */ + re_queue_tail(&zone->pages.all_used, &(page_meta->pages)); + } else { + if (!zone->allows_foreign || from_zone_map(element, zone->elem_size)) { + if (get_metadata_alloc_count(page_meta) == page_meta->free_count + 1) { + /* remove from free, move to intermediate */ + re_queue_tail(&zone->pages.intermediate, &(page_meta->pages)); } - } else if (page_meta->free_count == 0) { - /* remove from intermediate or free, move to all_used */ - remqueue((queue_entry_t)page_meta); - enqueue_tail(&zone->pages.all_used, (queue_entry_t)page_meta); - } else if (page_meta->alloc_count == page_meta->free_count + 1) { - /* remove from free, move to intermediate */ - remqueue((queue_entry_t)page_meta); - enqueue_tail(&zone->pages.intermediate, (queue_entry_t)page_meta); } - } else { - zone->free_elements = (struct zone_free_element *)next_element; } zone->countfree--; zone->count++; @@ -804,91 
+879,14 @@ try_alloc_from_zone(zone_t zone, return element; } - /* * End of zone poisoning */ -/* - * Fake zones for things that want to report via zprint but are not actually zones. - */ -struct fake_zone_info { - const char* name; - void (*init)(int); - void (*query)(int *, - vm_size_t *, vm_size_t *, vm_size_t *, vm_size_t *, - uint64_t *, int *, int *, int *); -}; - -static const struct fake_zone_info fake_zones[] = { -}; -static const unsigned int num_fake_zones = - sizeof (fake_zones) / sizeof (fake_zones[0]); - /* * Zone info options */ -boolean_t zinfo_per_task = FALSE; /* enabled by -zinfop in boot-args */ -#define ZINFO_SLOTS 200 /* for now */ -#define ZONES_MAX (ZINFO_SLOTS - num_fake_zones - 1) - -/* - * Support for garbage collection of unused zone pages - * - * The kernel virtually allocates the "zone map" submap of the kernel - * map. When an individual zone needs more storage, memory is allocated - * out of the zone map, and the two-level "zone_page_table" is - * on-demand expanded so that it has entries for those pages. - * zone_page_init()/zone_page_alloc() initialize "alloc_count" - * to the number of zone elements that occupy the zone page (which may - * be a minimum of 1, including if a zone element spans multiple - * pages). - * - * Asynchronously, the zone_gc() logic attempts to walk zone free - * lists to see if all the elements on a zone page are free. If - * "collect_count" (which it increments during the scan) matches - * "alloc_count", the zone page is a candidate for collection and the - * physical page is returned to the VM system. During this process, the - * first word of the zone page is re-used to maintain a linked list of - * to-be-collected zone pages. 
- */ -typedef uint32_t zone_page_index_t; -#define ZONE_PAGE_INDEX_INVALID ((zone_page_index_t)0xFFFFFFFFU) - -struct zone_page_table_entry { - volatile uint16_t alloc_count; - volatile uint16_t collect_count; -}; - -#define ZONE_PAGE_USED 0 -#define ZONE_PAGE_UNUSED 0xffff - -/* Forwards */ -void zone_page_init( - vm_offset_t addr, - vm_size_t size); - -void zone_page_alloc( - vm_offset_t addr, - vm_size_t size); - -void zone_page_free_element( - zone_page_index_t *free_page_head, - zone_page_index_t *free_page_tail, - vm_offset_t addr, - vm_size_t size); - -void zone_page_collect( - vm_offset_t addr, - vm_size_t size); - -boolean_t zone_page_collectable( - vm_offset_t addr, - vm_size_t size); - -void zone_page_keep( - vm_offset_t addr, - vm_size_t size); +#define ZINFO_SLOTS MAX_ZONES /* for now */ void zone_display_zprint(void); @@ -913,20 +911,6 @@ void zalloc_async( static thread_call_data_t call_async_alloc; -vm_map_t zone_map = VM_MAP_NULL; - -zone_t zone_zone = ZONE_NULL; /* the zone containing other zones */ - -zone_t zinfo_zone = ZONE_NULL; /* zone of per-task zone info */ - -/* - * The VM system gives us an initial chunk of memory. - * It has to be big enough to allocate the zone_zone - * all the way through the pmap zone. - */ - -vm_offset_t zdata; -vm_size_t zdata_size; /* * Align elements that use the zone page list to 32 byte boundaries. 
*/ @@ -954,22 +938,6 @@ MACRO_END #define lock_try_zone(zone) lck_mtx_try_lock_spin(&zone->lock) -/* - * Garbage collection map information - */ -#define ZONE_PAGE_TABLE_FIRST_LEVEL_SIZE (32) -struct zone_page_table_entry * volatile zone_page_table[ZONE_PAGE_TABLE_FIRST_LEVEL_SIZE]; -vm_size_t zone_page_table_used_size; -unsigned int zone_pages; -unsigned int zone_page_table_second_level_size; /* power of 2 */ -unsigned int zone_page_table_second_level_shift_amount; - -#define zone_page_table_first_level_slot(x) ((x) >> zone_page_table_second_level_shift_amount) -#define zone_page_table_second_level_slot(x) ((x) & (zone_page_table_second_level_size - 1)) - -void zone_page_table_expand(zone_page_index_t pindex); -struct zone_page_table_entry *zone_page_table_lookup(zone_page_index_t pindex); - /* * Exclude more than one concurrent garbage collection */ @@ -980,19 +948,8 @@ lck_grp_t zone_gc_lck_grp; lck_grp_attr_t zone_gc_lck_grp_attr; lck_mtx_ext_t zone_gc_lck_ext; -/* - * Protects first_zone, last_zone, num_zones, - * and the next_zone field of zones. - */ -decl_simple_lock_data(, all_zones_lock) -zone_t first_zone; -zone_t *last_zone; -unsigned int num_zones; - boolean_t zone_gc_allowed = TRUE; -boolean_t zone_gc_forced = FALSE; boolean_t panic_include_zprint = FALSE; -boolean_t zone_gc_allowed_by_time_throttle = TRUE; vm_offset_t panic_kext_memory_info = 0; vm_size_t panic_kext_memory_size = 0; @@ -1031,30 +988,39 @@ uint32_t zalloc_debug = 0; * corrupted to examine its history. This should lead to the source of the corruption. 
*/ +static boolean_t log_records_init = FALSE; static int log_records; /* size of the log, expressed in number of records */ +#define MAX_NUM_ZONES_ALLOWED_LOGGING 5 /* Maximum 5 zones can be logged at once */ + +static int max_num_zones_to_log = MAX_NUM_ZONES_ALLOWED_LOGGING; +static int num_zones_logged = 0; + #define MAX_ZONE_NAME 32 /* max length of a zone name we can take from the boot-args */ static char zone_name_to_log[MAX_ZONE_NAME] = ""; /* the zone name we're logging, if any */ /* Log allocations and frees to help debug a zone element corruption */ boolean_t corruption_debug_flag = FALSE; /* enabled by "-zc" boot-arg */ +/* Making pointer scanning leaks detection possible for all zones */ + +#if DEBUG || DEVELOPMENT +boolean_t leak_scan_debug_flag = FALSE; /* enabled by "-zl" boot-arg */ +#endif /* DEBUG || DEVELOPMENT */ + /* * The number of records in the log is configurable via the zrecs parameter in boot-args. Set this to - * the number of records you want in the log. For example, "zrecs=1000" sets it to 1000 records. Note - * that the larger the size of the log, the slower the system will run due to linear searching in the log, - * but one doesn't generally care about performance when tracking down a leak. The log is capped at 8000 - * records since going much larger than this tends to make the system unresponsive and unbootable on small - * memory configurations. The default value is 4000 records. + * the number of records you want in the log. For example, "zrecs=10" sets it to 10 records. Since this + * is the number of stacks suspected of leaking, we don't need many records. 
*/ #if defined(__LP64__) -#define ZRECORDS_MAX 128000 /* Max records allowed in the log */ +#define ZRECORDS_MAX 2560 /* Max records allowed in the log */ #else -#define ZRECORDS_MAX 8000 /* Max records allowed in the log */ +#define ZRECORDS_MAX 1536 /* Max records allowed in the log */ #endif -#define ZRECORDS_DEFAULT 4000 /* default records in log if zrecs is not specificed in boot-args */ +#define ZRECORDS_DEFAULT 1024 /* default records in log if zrecs is not specificed in boot-args */ /* * Each record in the log contains a pointer to the zone element it refers to, @@ -1072,12 +1038,6 @@ boolean_t corruption_debug_flag = FALSE; /* enabled by "-zc" boot-ar #define ZOP_ALLOC 1 #define ZOP_FREE 0 -/* - * The allocation log and all the related variables are protected by the zone lock for the zone_of_interest - */ -static btlog_t *zlog_btlog; /* the log itself, dynamically allocated when logging is enabled */ -static zone_t zone_of_interest = NULL; /* the zone being watched; corresponds to zone_name_to_log */ - /* * Decide if we want to log this zone by doing a string compare between a zone name and the name * of the zone to log. Return true if the strings are equal, false otherwise. Because it's not @@ -1124,7 +1084,7 @@ log_this_zone(const char *zonename, const char *logname) * the buffer for the records has been allocated. */ -#define DO_LOGGING(z) (zlog_btlog && (z) == zone_of_interest) +#define DO_LOGGING(z) (z->zone_logging == TRUE && z->zlog_btlog) extern boolean_t kmem_alloc_ready; @@ -1525,65 +1485,6 @@ zleak_free(uintptr_t addr, * mbuf.c for mbuf leak-detection. This is why they lack the z_ prefix. */ -/* - * This function captures a backtrace from the current stack and - * returns the number of frames captured, limited by max_frames. - * It's fast because it does no checking to make sure there isn't bad data. - * Since it's only called from threads that we're going to keep executing, - * if there's bad data we were going to die eventually. 
- * If this function is inlined, it doesn't record the frame of the function it's inside. - * (because there's no stack frame!) - */ - -uint32_t -fastbacktrace(uintptr_t* bt, uint32_t max_frames) -{ - uintptr_t* frameptr = NULL, *frameptr_next = NULL; - uintptr_t retaddr = 0; - uint32_t frame_index = 0, frames = 0; - uintptr_t kstackb, kstackt; - thread_t cthread = current_thread(); - - if (__improbable(cthread == NULL)) - return 0; - - kstackb = cthread->kernel_stack; - kstackt = kstackb + kernel_stack_size; - /* Load stack frame pointer (EBP on x86) into frameptr */ - frameptr = __builtin_frame_address(0); - if (((uintptr_t)frameptr > kstackt) || ((uintptr_t)frameptr < kstackb)) - frameptr = NULL; - - while (frameptr != NULL && frame_index < max_frames ) { - /* Next frame pointer is pointed to by the previous one */ - frameptr_next = (uintptr_t*) *frameptr; - - /* Bail if we see a zero in the stack frame, that means we've reached the top of the stack */ - /* That also means the return address is worthless, so don't record it */ - if (frameptr_next == NULL) - break; - /* Verify thread stack bounds */ - if (((uintptr_t)frameptr_next > kstackt) || ((uintptr_t)frameptr_next < kstackb)) - break; - /* Pull return address from one spot above the frame pointer */ - retaddr = *(frameptr + 1); - - /* Store it in the backtrace array */ - bt[frame_index++] = retaddr; - - frameptr = frameptr_next; - } - - /* Save the number of frames captured for return value */ - frames = frame_index; - - /* Fill in the rest of the backtrace with zeros */ - while (frame_index < max_frames) - bt[frame_index++] = 0; - - return frames; -} - /* "Thomas Wang's 32/64 bit mix functions." 
http://www.concentric.net/~Ttwang/tech/inthash.htm */ uintptr_t hash_mix(uintptr_t x) @@ -1646,6 +1547,9 @@ hashaddr(uintptr_t pt, uint32_t max_size) /* End of all leak-detection code */ #pragma mark - +#define ZONE_MAX_ALLOC_SIZE (32 * 1024) +#define ZONE_ALLOC_FRAG_PERCENT(alloc_size, ele_size) (((alloc_size % ele_size) * 100) / alloc_size) + /* * zinit initializes a new zone. The zone data structures themselves * are stored in a zone, which is initially a static structure that @@ -1659,17 +1563,12 @@ zinit( const char *name) /* a name for the zone */ { zone_t z; - boolean_t use_page_list = FALSE; - - if (zone_zone == ZONE_NULL) { - z = (struct zone *)zdata; - /* special handling in zcram() because the first element is being used */ - } else - z = (zone_t) zalloc(zone_zone); - - if (z == ZONE_NULL) - return(ZONE_NULL); + simple_lock(&all_zones_lock); + z = &(zone_array[zone_array_index]); + zone_array_index++; + assert(zone_array_index != MAX_ZONES); + simple_unlock(&all_zones_lock); /* Zone elements must fit both a next pointer and a backup pointer */ vm_size_t minimum_element_size = sizeof(vm_offset_t) * 2; @@ -1689,70 +1588,17 @@ zinit( alloc = round_page(alloc); max = round_page(max); - /* - * we look for an allocation size with less than 1% waste - * up to 5 pages in size... 
- * otherwise, we look for an allocation size with least fragmentation - * in the range of 1 - 5 pages - * This size will be used unless - * the user suggestion is larger AND has less fragmentation - */ -#if ZONE_ALIAS_ADDR - /* Favor PAGE_SIZE allocations unless we waste >10% space */ - if ((size < PAGE_SIZE) && (PAGE_SIZE % size <= PAGE_SIZE / 10)) - alloc = PAGE_SIZE; - else -#endif -#if defined(__LP64__) - if (((alloc % size) != 0) || (alloc > PAGE_SIZE * 8)) -#endif - { - vm_size_t best, waste; unsigned int i; - best = PAGE_SIZE; - waste = best % size; - - for (i = 1; i <= 5; i++) { - vm_size_t tsize, twaste; - - tsize = i * PAGE_SIZE; - - if ((tsize % size) < (tsize / 100)) { - alloc = tsize; - goto use_this_allocation; - } - twaste = tsize % size; - if (twaste < waste) - best = tsize, waste = twaste; + vm_size_t best_alloc = PAGE_SIZE; + vm_size_t alloc_size; + for (alloc_size = (2 * PAGE_SIZE); alloc_size <= ZONE_MAX_ALLOC_SIZE; alloc_size += PAGE_SIZE) { + if (ZONE_ALLOC_FRAG_PERCENT(alloc_size, size) < ZONE_ALLOC_FRAG_PERCENT(best_alloc, size)) { + best_alloc = alloc_size; } - if (alloc <= best || (alloc % size >= waste)) - alloc = best; } -use_this_allocation: + alloc = best_alloc; if (max && (max < alloc)) max = alloc; - /* - * Opt into page list tracking if we can reliably map an allocation - * to its page_metadata, and if the wastage in the tail of - * the allocation is not too large - */ - - /* zone_zone can't use page metadata since the page metadata will overwrite zone metadata */ - if (alloc == PAGE_SIZE && zone_zone != ZONE_NULL) { - vm_offset_t first_element_offset; - size_t zone_page_metadata_size = sizeof(struct zone_page_metadata); - - if (zone_page_metadata_size % ZONE_ELEMENT_ALIGNMENT == 0) { - first_element_offset = zone_page_metadata_size; - } else { - first_element_offset = zone_page_metadata_size + (ZONE_ELEMENT_ALIGNMENT - (zone_page_metadata_size % ZONE_ELEMENT_ALIGNMENT)); - } - - if (((PAGE_SIZE - first_element_offset) % size) <= 
PAGE_SIZE / 100) { - use_page_list = TRUE; - } - } - z->free_elements = NULL; queue_init(&z->pages.any_free_foreign); queue_init(&z->pages.all_free); @@ -1766,10 +1612,10 @@ zinit( z->zone_name = name; z->count = 0; z->countfree = 0; + z->count_all_free_pages = 0; z->sum_count = 0LL; z->doing_alloc_without_vm_priv = FALSE; z->doing_alloc_with_vm_priv = FALSE; - z->doing_gc = FALSE; z->exhaustible = FALSE; z->collectable = TRUE; z->allows_foreign = FALSE; @@ -1782,81 +1628,156 @@ zinit( z->async_prio_refill = FALSE; z->gzalloc_exempt = FALSE; z->alignment_required = FALSE; - z->use_page_list = use_page_list; + z->zone_replenishing = FALSE; z->prio_refill_watermark = 0; z->zone_replenish_thread = NULL; z->zp_count = 0; + #if CONFIG_ZLEAKS z->zleak_capture = 0; z->zleak_on = FALSE; #endif /* CONFIG_ZLEAKS */ -#if ZONE_DEBUG - z->active_zones.next = z->active_zones.prev = NULL; - zone_debug_enable(z); -#endif /* ZONE_DEBUG */ lock_zone_init(z); /* * Add the zone to the all-zones list. - * If we are tracking zone info per task, and we have - * already used all the available stat slots, then keep - * using the overflow zone slot. */ - z->next_zone = ZONE_NULL; simple_lock(&all_zones_lock); - *last_zone = z; - last_zone = &z->next_zone; z->index = num_zones; - if (zinfo_per_task) { - if (num_zones > ZONES_MAX) - z->index = ZONES_MAX; - } num_zones++; simple_unlock(&all_zones_lock); /* - * Check if we should be logging this zone. If so, remember the zone pointer. + * Check for and set up zone leak detection if requested via boot-args. We recognized two + * boot-args: + * + * zlog= + * zrecs= + * + * The zlog arg is used to specify the zone name that should be logged, and zrecs is used to + * control the size of the log. If zrecs is not specified, a default value is used. */ - if (log_this_zone(z->zone_name, zone_name_to_log)) { - zone_of_interest = z; - } - /* - * If we want to log a zone, see if we need to allocate buffer space for the log. 
Some vm related zones are - * zinit'ed before we can do a kmem_alloc, so we have to defer allocation in that case. kmem_alloc_ready is set to - * TRUE once enough of the VM system is up and running to allow a kmem_alloc to work. If we want to log one - * of the VM related zones that's set up early on, we will skip allocation of the log until zinit is called again - * later on some other zone. So note we may be allocating a buffer to log a zone other than the one being initialized - * right now. - */ - if (zone_of_interest != NULL && zlog_btlog == NULL && kmem_alloc_ready) { - zlog_btlog = btlog_create(log_records, MAX_ZTRACE_DEPTH, NULL, NULL, NULL); - if (zlog_btlog) { - printf("zone: logging started for zone %s\n", zone_of_interest->zone_name); - } else { - printf("zone: couldn't allocate memory for zrecords, turning off zleak logging\n"); - zone_of_interest = NULL; - } - } -#if CONFIG_GZALLOC - gzalloc_zone_init(z); -#endif - return(z); -} -unsigned zone_replenish_loops, zone_replenish_wakeups, zone_replenish_wakeups_initiated, zone_replenish_throttle_count; + if (num_zones_logged < max_num_zones_to_log) { -static void zone_replenish_thread(zone_t); + int i = 1; /* zlog0 isn't allowed. */ + boolean_t zone_logging_enabled = FALSE; + char zlog_name[MAX_ZONE_NAME] = ""; /* Temp. buffer to create the strings zlog1, zlog2 etc... */ -/* High priority VM privileged thread used to asynchronously refill a designated - * zone, such as the reserved VM map entry zone. 
- */ -static void zone_replenish_thread(zone_t z) { - vm_size_t free_size; - current_thread()->options |= TH_OPT_VMPRIV; + while (i <= max_num_zones_to_log) { - for (;;) { + snprintf(zlog_name, MAX_ZONE_NAME, "zlog%d", i); + + if (PE_parse_boot_argn(zlog_name, zone_name_to_log, sizeof(zone_name_to_log)) == TRUE) { + if (log_this_zone(z->zone_name, zone_name_to_log)) { + z->zone_logging = TRUE; + zone_logging_enabled = TRUE; + num_zones_logged++; + break; + } + } + i++; + } + + if (zone_logging_enabled == FALSE) { + /* + * Backwards compat. with the old boot-arg used to specify single zone logging i.e. zlog + * Needs to happen after the newer zlogn checks because the prefix will match all the zlogn + * boot-args. + */ + if (PE_parse_boot_argn("zlog", zone_name_to_log, sizeof(zone_name_to_log)) == TRUE) { + if (log_this_zone(z->zone_name, zone_name_to_log)) { + z->zone_logging = TRUE; + zone_logging_enabled = TRUE; + num_zones_logged++; + } + } + } + + if (log_records_init == FALSE && zone_logging_enabled == TRUE) { + if (PE_parse_boot_argn("zrecs", &log_records, sizeof(log_records)) == TRUE) { + /* + * Don't allow more than ZRECORDS_MAX records even if the user asked for more. + * This prevents accidentally hogging too much kernel memory and making the system + * unusable. + */ + + log_records = MIN(ZRECORDS_MAX, log_records); + log_records_init = TRUE; + } else { + log_records = ZRECORDS_DEFAULT; + log_records_init = TRUE; + } + } + + /* + * If we want to log a zone, see if we need to allocate buffer space for the log. Some vm related zones are + * zinit'ed before we can do a kmem_alloc, so we have to defer allocation in that case. kmem_alloc_ready is set to + * TRUE once enough of the VM system is up and running to allow a kmem_alloc to work. If we want to log one + * of the VM related zones that's set up early on, we will skip allocation of the log until zinit is called again + * later on some other zone. 
So note we may be allocating a buffer to log a zone other than the one being initialized + * right now. + */ + if (kmem_alloc_ready) { + + zone_t curr_zone = NULL; + unsigned int max_zones = 0, zone_idx = 0; + + simple_lock(&all_zones_lock); + max_zones = num_zones; + simple_unlock(&all_zones_lock); + + for (zone_idx = 0; zone_idx < max_zones; zone_idx++) { + + curr_zone = &(zone_array[zone_idx]); + + /* + * We work with the zone unlocked here because we could end up needing the zone lock to + * enable logging for this zone e.g. need a VM object to allocate memory to enable logging for the + * VM objects zone. + * + * We don't expect these zones to be needed at this early a time in boot and so take this chance. + */ + if (curr_zone->zone_logging && curr_zone->zlog_btlog == NULL) { + + curr_zone->zlog_btlog = btlog_create(log_records, MAX_ZTRACE_DEPTH, (corruption_debug_flag == FALSE) /* caller_will_remove_entries_for_element? */); + + if (curr_zone->zlog_btlog) { + + printf("zone: logging started for zone %s\n", curr_zone->zone_name); + } else { + printf("zone: couldn't allocate memory for zrecords, turning off zleak logging\n"); + curr_zone->zone_logging = FALSE; + } + } + + } + } + } + +#if CONFIG_GZALLOC + gzalloc_zone_init(z); +#endif + return(z); +} +unsigned zone_replenish_loops, zone_replenish_wakeups, zone_replenish_wakeups_initiated, zone_replenish_throttle_count; + +static void zone_replenish_thread(zone_t); + +/* High priority VM privileged thread used to asynchronously refill a designated + * zone, such as the reserved VM map entry zone. 
+ */ +__attribute__((noreturn)) +static void +zone_replenish_thread(zone_t z) +{ + vm_size_t free_size; + current_thread()->options |= TH_OPT_VMPRIV; + + for (;;) { lock_zone(z); + z->zone_replenishing = TRUE; assert(z->prio_refill_watermark != 0); while ((free_size = (z->cur_size - (z->count * z->elem_size))) < (z->prio_refill_watermark * z->elem_size)) { assert(z->doing_alloc_without_vm_priv == FALSE); @@ -1879,20 +1800,12 @@ static void zone_replenish_thread(zone_t z) { kr = kernel_memory_allocate(zone_map, &space, alloc_size, 0, zflags, VM_KERN_MEMORY_ZONE); if (kr == KERN_SUCCESS) { -#if ZONE_ALIAS_ADDR - if (alloc_size == PAGE_SIZE) - space = zone_alias_addr(space); -#endif zcram(z, space, alloc_size); } else if (kr == KERN_RESOURCE_SHORTAGE) { VM_PAGE_WAIT(); } else if (kr == KERN_NO_SPACE) { kr = kernel_memory_allocate(kernel_map, &space, alloc_size, 0, zflags, VM_KERN_MEMORY_ZONE); if (kr == KERN_SUCCESS) { -#if ZONE_ALIAS_ADDR - if (alloc_size == PAGE_SIZE) - space = zone_alias_addr(space); -#endif zcram(z, space, alloc_size); } else { assert_wait_timeout(&z->zone_replenish_thread, THREAD_UNINT, 1, 100 * NSEC_PER_USEC); @@ -1904,13 +1817,14 @@ static void zone_replenish_thread(zone_t z) { zone_replenish_loops++; } - unlock_zone(z); + z->zone_replenishing = FALSE; /* Signal any potential throttled consumers, terminating * their timer-bounded waits. */ thread_wakeup(z); assert_wait(&z->zone_replenish_thread, THREAD_UNINT); + unlock_zone(z); thread_block(THREAD_CONTINUE_NULL); zone_replenish_wakeups++; } @@ -1931,6 +1845,27 @@ zone_prio_refill_configure(zone_t z, vm_size_t low_water_mark) { thread_deallocate(z->zone_replenish_thread); } +/* Initialize the metadata for an allocation chunk */ +static inline void +zcram_metadata_init(vm_offset_t newmem, vm_size_t size, struct zone_page_metadata *chunk_metadata) +{ + struct zone_page_metadata *page_metadata; + + /* The first page is the real metadata for this allocation chunk. 
We mark the others as fake metadata */ + size -= PAGE_SIZE; + newmem += PAGE_SIZE; + + for (; size > 0; newmem += PAGE_SIZE, size -= PAGE_SIZE) { + page_metadata = get_zone_page_metadata((struct zone_free_element *)newmem, TRUE); + assert(page_metadata != chunk_metadata); + PAGE_METADATA_SET_ZINDEX(page_metadata, MULTIPAGE_METADATA_MAGIC); + page_metadata_set_realmeta(page_metadata, chunk_metadata); + page_metadata->free_count = 0; + } + return; +} + + /* * Boolean Random Number Generator for generating booleans to randomize * the order of elements in newly zcram()'ed memory. The algorithm is a @@ -1979,7 +1914,6 @@ random_free_to_zone( vm_offset_t newmem, vm_offset_t first_element_offset, int element_count, - boolean_t from_zm, int *entropy_buffer) { vm_offset_t last_element_offset; @@ -1991,7 +1925,11 @@ random_free_to_zone( last_element_offset = first_element_offset + ((element_count * elem_size) - elem_size); for (index = 0; index < element_count; index++) { assert(first_element_offset <= last_element_offset); - if (random_bool_gen(entropy_buffer, index, MAX_ENTROPY_PER_ZCRAM)) { + if ( +#if DEBUG || DEVELOPMENT + leak_scan_debug_flag || +#endif /* DEBUG || DEVELOPMENT */ + random_bool_gen(entropy_buffer, index, MAX_ENTROPY_PER_ZCRAM)) { element_addr = newmem + first_element_offset; first_element_offset += elem_size; } else { @@ -2002,9 +1940,6 @@ random_free_to_zone( zone->count++; /* compensate for free_to_zone */ free_to_zone(zone, element_addr, FALSE); } - if (!zone->use_page_list && from_zm) { - zone_page_alloc(element_addr, elem_size); - } zone->cur_size += elem_size; } } @@ -2020,7 +1955,6 @@ zcram( { vm_size_t elem_size; boolean_t from_zm = FALSE; - vm_offset_t first_element_offset; int element_count; int entropy_buffer[MAX_ENTROPY_PER_ZCRAM]; @@ -2036,51 +1970,65 @@ zcram( if (from_zone_map(newmem, size)) from_zm = TRUE; + if (!from_zm) { + /* We cannot support elements larger than page size for foreign memory because we + * put metadata on the page 
itself for each page of foreign memory. We need to do + * this in order to be able to reach the metadata when any element is freed + */ + assert((zone->allows_foreign == TRUE) && (zone->elem_size <= (PAGE_SIZE - sizeof(struct zone_page_metadata)))); + } + if (zalloc_debug & ZALLOC_DEBUG_ZCRAM) kprintf("zcram(%p[%s], 0x%lx%s, 0x%lx)\n", zone, zone->zone_name, (unsigned long)newmem, from_zm ? "" : "[F]", (unsigned long)size); - if (from_zm && !zone->use_page_list) - zone_page_init(newmem, size); - ZONE_PAGE_COUNT_INCR(zone, (size / PAGE_SIZE)); random_bool_gen_entropy(entropy_buffer, MAX_ENTROPY_PER_ZCRAM); - lock_zone(zone); + /* + * Initialize the metadata for all pages. We dont need the zone lock + * here because we are not manipulating any zone related state yet. + */ - if (zone->use_page_list) { - struct zone_page_metadata *page_metadata; - size_t zone_page_metadata_size = sizeof(struct zone_page_metadata); + struct zone_page_metadata *chunk_metadata; + size_t zone_page_metadata_size = sizeof(struct zone_page_metadata); - assert((newmem & PAGE_MASK) == 0); - assert((size & PAGE_MASK) == 0); - for (; size > 0; newmem += PAGE_SIZE, size -= PAGE_SIZE) { + assert((newmem & PAGE_MASK) == 0); + assert((size & PAGE_MASK) == 0); - page_metadata = (struct zone_page_metadata *)(newmem); - - page_metadata->pages.next = NULL; - page_metadata->pages.prev = NULL; - page_metadata->elements = NULL; - page_metadata->zone = zone; - page_metadata->alloc_count = 0; - page_metadata->free_count = 0; + chunk_metadata = get_zone_page_metadata((struct zone_free_element *)newmem, TRUE); + chunk_metadata->pages.next = NULL; + chunk_metadata->pages.prev = NULL; + page_metadata_set_freelist(chunk_metadata, 0); + PAGE_METADATA_SET_ZINDEX(chunk_metadata, zone->index); + chunk_metadata->free_count = 0; + chunk_metadata->page_count = (size / PAGE_SIZE); - enqueue_tail(&zone->pages.all_used, (queue_entry_t)page_metadata); + zcram_metadata_init(newmem, size, chunk_metadata); + lock_zone(zone); + 
enqueue_tail(&zone->pages.all_used, &(chunk_metadata->pages)); + + if (!from_zm) { + /* We cannot support elements larger than page size for foreign memory because we + * put metadata on the page itself for each page of foreign memory. We need to do + * this in order to be able to reach the metadata when any element is freed + */ + + for (; size > 0; newmem += PAGE_SIZE, size -= PAGE_SIZE) { + vm_offset_t first_element_offset = 0; if (zone_page_metadata_size % ZONE_ELEMENT_ALIGNMENT == 0){ first_element_offset = zone_page_metadata_size; } else { first_element_offset = zone_page_metadata_size + (ZONE_ELEMENT_ALIGNMENT - (zone_page_metadata_size % ZONE_ELEMENT_ALIGNMENT)); } element_count = (int)((PAGE_SIZE - first_element_offset) / elem_size); - page_metadata->alloc_count += element_count; - random_free_to_zone(zone, newmem, first_element_offset, element_count, from_zm, entropy_buffer); + random_free_to_zone(zone, newmem, first_element_offset, element_count, entropy_buffer); } - } else { - first_element_offset = 0; - element_count = (int)((size - first_element_offset) / elem_size); - random_free_to_zone(zone, newmem, first_element_offset, element_count, from_zm, entropy_buffer); + } else { + element_count = (int)(size / elem_size); + random_free_to_zone(zone, newmem, 0, element_count, entropy_buffer); } unlock_zone(zone); @@ -2088,24 +2036,6 @@ zcram( } - -/* - * Steal memory for the zone package. Called from - * vm_page_bootstrap(). - */ -void -zone_steal_memory(void) -{ -#if CONFIG_GZALLOC - gzalloc_configure(); -#endif - /* Request enough early memory to get to the pmap zone */ - zdata_size = 12 * sizeof(struct zone); - zdata_size = round_page(zdata_size); - zdata = (vm_offset_t)pmap_steal_memory(zdata_size); -} - - /* * Fill a zone with enough memory to contain at least nelem elements. * Memory is obtained with kmem_alloc_kobject from the kernel_map. 
@@ -2151,10 +2081,6 @@ zone_bootstrap(void) char temp_buf[16]; unsigned int i; - if (PE_parse_boot_argn("-zinfop", temp_buf, sizeof(temp_buf))) { - zinfo_per_task = TRUE; - } - if (!PE_parse_boot_argn("zalloc_debug", &zalloc_debug, sizeof(zalloc_debug))) zalloc_debug = 0; @@ -2172,89 +2098,26 @@ zone_bootstrap(void) corruption_debug_flag = TRUE; } - /* - * Check for and set up zone leak detection if requested via boot-args. We recognized two - * boot-args: - * - * zlog= - * zrecs= - * - * The zlog arg is used to specify the zone name that should be logged, and zrecs is used to - * control the size of the log. If zrecs is not specified, a default value is used. - */ - - if (PE_parse_boot_argn("zlog", zone_name_to_log, sizeof(zone_name_to_log)) == TRUE) { - if (PE_parse_boot_argn("zrecs", &log_records, sizeof(log_records)) == TRUE) { - - /* - * Don't allow more than ZRECORDS_MAX records even if the user asked for more. - * This prevents accidentally hogging too much kernel memory and making the system - * unusable. 
- */ - - log_records = MIN(ZRECORDS_MAX, log_records); - - } else { - log_records = ZRECORDS_DEFAULT; - } +#if DEBUG || DEVELOPMENT + /* disable element location randomization in a page */ + if (PE_parse_boot_argn("-zl", temp_buf, sizeof(temp_buf))) { + leak_scan_debug_flag = TRUE; } +#endif simple_lock_init(&all_zones_lock, 0); - first_zone = ZONE_NULL; - last_zone = &first_zone; num_zones = 0; thread_call_setup(&call_async_alloc, zalloc_async, NULL); - /* assertion: nobody else called zinit before us */ - assert(zone_zone == ZONE_NULL); - /* initializing global lock group for zones */ lck_grp_attr_setdefault(&zone_locks_grp_attr); lck_grp_init(&zone_locks_grp, "zone_locks", &zone_locks_grp_attr); - zone_zone = zinit(sizeof(struct zone), 128 * sizeof(struct zone), - sizeof(struct zone), "zones"); - zone_change(zone_zone, Z_COLLECT, FALSE); - zone_change(zone_zone, Z_CALLERACCT, FALSE); - zone_change(zone_zone, Z_NOENCRYPT, TRUE); - - zcram(zone_zone, zdata, zdata_size); - VM_PAGE_MOVE_STOLEN(atop_64(zdata_size)); - - /* initialize fake zones and zone info if tracking by task */ - if (zinfo_per_task) { - vm_size_t zisize = sizeof(zinfo_usage_store_t) * ZINFO_SLOTS; - - for (i = 0; i < num_fake_zones; i++) - fake_zones[i].init(ZINFO_SLOTS - num_fake_zones + i); - zinfo_zone = zinit(zisize, zisize * CONFIG_TASK_MAX, - zisize, "per task zinfo"); - zone_change(zinfo_zone, Z_CALLERACCT, FALSE); - } -} - -void -zinfo_task_init(task_t task) -{ - if (zinfo_per_task) { - task->tkm_zinfo = zalloc(zinfo_zone); - memset(task->tkm_zinfo, 0, sizeof(zinfo_usage_store_t) * ZINFO_SLOTS); - } else { - task->tkm_zinfo = NULL; - } + lck_attr_setdefault(&zone_metadata_lock_attr); + lck_mtx_init_ext(&zone_metadata_region_lck, &zone_metadata_region_lck_ext, &zone_locks_grp, &zone_metadata_lock_attr); } -void -zinfo_task_free(task_t task) -{ - assert(task != kernel_task); - if (task->tkm_zinfo != NULL) { - zfree(zinfo_zone, task->tkm_zinfo); - task->tkm_zinfo = NULL; - } -} - /* Global 
initialization of Zone Allocator. * Runs after zone_bootstrap. */ @@ -2265,6 +2128,8 @@ zone_init( kern_return_t retval; vm_offset_t zone_min; vm_offset_t zone_max; + vm_offset_t zone_metadata_space; + unsigned int zone_pages; retval = kmem_suballoc(kernel_map, &zone_min, max_zonemap_size, FALSE, VM_FLAGS_ANYWHERE | VM_FLAGS_PERMANENT | VM_MAKE_TAG(VM_KERN_MEMORY_ZONE), @@ -2282,35 +2147,27 @@ zone_init( zone_map_min_address = zone_min; zone_map_max_address = zone_max; + zone_pages = (unsigned int)atop_kernel(zone_max - zone_min); + zone_metadata_space = round_page(zone_pages * sizeof(struct zone_page_metadata)); + retval = kernel_memory_allocate(zone_map, &zone_metadata_region_min, zone_metadata_space, + 0, KMA_KOBJECT | KMA_VAONLY | KMA_PERMANENT, VM_KERN_MEMORY_OSFMK); + if (retval != KERN_SUCCESS) + panic("zone_init: zone_metadata_region initialization failed!"); + zone_metadata_region_max = zone_metadata_region_min + zone_metadata_space; + #if defined(__LP64__) /* * ensure that any vm_page_t that gets created from * the vm_page zone can be packed properly (see vm_page.h * for the packing requirements */ - if (VM_PAGE_UNPACK_PTR(VM_PAGE_PACK_PTR(zone_map_min_address)) != (vm_page_t)zone_map_min_address) - panic("VM_PAGE_PACK_PTR failed on zone_map_min_address - %p", (void *)zone_map_min_address); + if ((vm_page_t)(VM_PAGE_UNPACK_PTR(VM_PAGE_PACK_PTR(zone_metadata_region_max))) != (vm_page_t)zone_metadata_region_max) + panic("VM_PAGE_PACK_PTR failed on zone_metadata_region_max - %p", (void *)zone_metadata_region_max); - if (VM_PAGE_UNPACK_PTR(VM_PAGE_PACK_PTR(zone_map_max_address)) != (vm_page_t)zone_map_max_address) + if ((vm_page_t)(VM_PAGE_UNPACK_PTR(VM_PAGE_PACK_PTR(zone_map_max_address))) != (vm_page_t)zone_map_max_address) panic("VM_PAGE_PACK_PTR failed on zone_map_max_address - %p", (void *)zone_map_max_address); #endif - zone_pages = (unsigned int)atop_kernel(zone_max - zone_min); - zone_page_table_used_size = sizeof(zone_page_table); - - 
zone_page_table_second_level_size = 1; - zone_page_table_second_level_shift_amount = 0; - - /* - * Find the power of 2 for the second level that allows - * the first level to fit in ZONE_PAGE_TABLE_FIRST_LEVEL_SIZE - * slots. - */ - while ((zone_page_table_first_level_slot(zone_pages-1)) >= ZONE_PAGE_TABLE_FIRST_LEVEL_SIZE) { - zone_page_table_second_level_size <<= 1; - zone_page_table_second_level_shift_amount++; - } - lck_grp_attr_setdefault(&zone_gc_lck_grp_attr); lck_grp_init(&zone_gc_lck_grp, "zone_gc", &zone_gc_lck_grp_attr); lck_attr_setdefault(&zone_gc_lck_attr); @@ -2324,70 +2181,6 @@ zone_init( #endif /* CONFIG_ZLEAKS */ } -void -zone_page_table_expand(zone_page_index_t pindex) -{ - unsigned int first_index; - struct zone_page_table_entry * volatile * first_level_ptr; - - assert(pindex < zone_pages); - - first_index = zone_page_table_first_level_slot(pindex); - first_level_ptr = &zone_page_table[first_index]; - - if (*first_level_ptr == NULL) { - /* - * We were able to verify the old first-level slot - * had NULL, so attempt to populate it. - */ - - vm_offset_t second_level_array = 0; - vm_size_t second_level_size = round_page(zone_page_table_second_level_size * sizeof(struct zone_page_table_entry)); - zone_page_index_t i; - struct zone_page_table_entry *entry_array; - - if (kmem_alloc_kobject(zone_map, &second_level_array, - second_level_size, VM_KERN_MEMORY_OSFMK) != KERN_SUCCESS) { - panic("zone_page_table_expand"); - } - zone_map_table_page_count += (second_level_size / PAGE_SIZE); - - /* - * zone_gc() may scan the "zone_page_table" directly, - * so make sure any slots have a valid unused state. 
- */ - entry_array = (struct zone_page_table_entry *)second_level_array; - for (i=0; i < zone_page_table_second_level_size; i++) { - entry_array[i].alloc_count = ZONE_PAGE_UNUSED; - entry_array[i].collect_count = 0; - } - - if (OSCompareAndSwapPtr(NULL, entry_array, first_level_ptr)) { - /* Old slot was NULL, replaced with expanded level */ - OSAddAtomicLong(second_level_size, &zone_page_table_used_size); - } else { - /* Old slot was not NULL, someone else expanded first */ - kmem_free(zone_map, second_level_array, second_level_size); - zone_map_table_page_count -= (second_level_size / PAGE_SIZE); - } - } else { - /* Old slot was not NULL, already been expanded */ - } -} - -struct zone_page_table_entry * -zone_page_table_lookup(zone_page_index_t pindex) -{ - unsigned int first_index = zone_page_table_first_level_slot(pindex); - struct zone_page_table_entry *second_level = zone_page_table[first_index]; - - if (second_level) { - return &second_level[zone_page_table_second_level_slot(pindex)]; - } - - return NULL; -} - extern volatile SInt32 kfree_nop_count; #pragma mark - @@ -2407,7 +2200,7 @@ zalloc_internal( uintptr_t zbt[MAX_ZTRACE_DEPTH]; /* used in zone leak logging and zone leak detection */ int numsaved = 0; boolean_t zone_replenish_wakeup = FALSE, zone_alloc_throttle = FALSE; -#if CONFIG_GZALLOC || ZONE_DEBUG +#if CONFIG_GZALLOC boolean_t did_gzalloc = FALSE; #endif thread_t thr = current_thread(); @@ -2439,7 +2232,7 @@ zalloc_internal( if (__improbable(zone->zleak_on && sample_counter(&zone->zleak_capture, zleak_sample_factor) == TRUE)) { /* Avoid backtracing twice if zone logging is on */ if (numsaved == 0) - zleak_tracedepth = fastbacktrace(zbt, MAX_ZTRACE_DEPTH); + zleak_tracedepth = backtrace(zbt, MAX_ZTRACE_DEPTH); else zleak_tracedepth = numsaved; } @@ -2456,12 +2249,11 @@ zalloc_internal( if (zone_replenish_wakeup) { zone_replenish_wakeups_initiated++; - unlock_zone(zone); /* Signal the potentially waiting * refill thread. 
*/ thread_wakeup(&zone->zone_replenish_thread); - + unlock_zone(zone); /* Scheduling latencies etc. may prevent * the refill thread from keeping up * with demand. Throttle consumers @@ -2506,16 +2298,6 @@ zalloc_internal( */ zone->waiting = TRUE; zone_sleep(zone); - } else if (zone->doing_gc) { - /* - * zone_gc() is running. Since we need an element - * from the free list that is currently being - * collected, set the waiting bit and - * wait for the GC process to finish - * before trying again - */ - zone->waiting = TRUE; - zone_sleep(zone); } else { vm_offset_t space; vm_size_t alloc_size; @@ -2548,6 +2330,14 @@ zalloc_internal( panic("zalloc: zone \"%s\" empty.", zone->zone_name); } } + /* + * It is possible that a BG thread is refilling/expanding the zone + * and gets pre-empted during that operation. That blocks all other + * threads from making progress leading to a watchdog timeout. To + * avoid that, boost the thread priority using the rwlock boost + */ + set_thread_rwlock_boost(); + if ((thr->options & TH_OPT_VMPRIV)) { zone->doing_alloc_with_vm_priv = TRUE; set_doing_alloc_with_vm_priv = TRUE; @@ -2570,11 +2360,6 @@ zalloc_internal( retval = kernel_memory_allocate(zone_map, &space, alloc_size, 0, zflags, VM_KERN_MEMORY_ZONE); if (retval == KERN_SUCCESS) { -#if ZONE_ALIAS_ADDR - if (alloc_size == PAGE_SIZE) - space = zone_alias_addr(space); -#endif - #if CONFIG_ZLEAKS if ((zleak_state & (ZLEAK_STATE_ENABLED | ZLEAK_STATE_ACTIVE)) == ZLEAK_STATE_ENABLED) { if (zone_map->size >= zleak_global_tracking_threshold) { @@ -2600,7 +2385,7 @@ zalloc_internal( retry++; if (retry == 2) { - zone_gc(TRUE); + zone_gc(); printf("zalloc did gc\n"); zone_display_zprint(); } @@ -2635,6 +2420,8 @@ zalloc_internal( zone->waiting = FALSE; zone_wakeup(zone); } + clear_thread_rwlock_boost(); + addr = try_alloc_from_zone(zone, &check_poison); if (addr == 0 && retval == KERN_RESOURCE_SHORTAGE) { @@ -2672,28 +2459,14 @@ zalloc_internal( addr = try_alloc_from_zone(zone, 
&check_poison); } - /* - * See if we should be logging allocations in this zone. Logging is rarely done except when a leak is - * suspected, so this code rarely executes. We need to do this code while still holding the zone lock - * since it protects the various log related data structures. - */ - - if (__improbable(DO_LOGGING(zone) && addr)) { - btlog_add_entry(zlog_btlog, (void *)addr, ZOP_ALLOC, (void **)zbt, numsaved); - } - vm_offset_t inner_size = zone->elem_size; - -#if ZONE_DEBUG - if (!did_gzalloc && addr && zone_debug_enabled(zone)) { - enqueue_tail(&zone->active_zones, (queue_entry_t)addr); - addr += ZONE_DEBUG_OFFSET; - inner_size -= ZONE_DEBUG_OFFSET; - } -#endif unlock_zone(zone); + if (__improbable(DO_LOGGING(zone) && addr)) { + btlog_add_entry(zone->zlog_btlog, (void *)addr, ZOP_ALLOC, (void **)zbt, numsaved); + } + if (__improbable(check_poison && addr)) { vm_offset_t *element_cursor = ((vm_offset_t *) addr) + 1; vm_offset_t *backup = get_backup_ptr(inner_size, (vm_offset_t *) addr); @@ -2718,23 +2491,20 @@ zalloc_internal( *primary = ZP_POISON; *backup = ZP_POISON; + +#if DEBUG || DEVELOPMENT + if (__improbable(leak_scan_debug_flag && !(zone->elem_size & (sizeof(uintptr_t) - 1)))) { + int count, idx; + /* Fill element, from tail, with backtrace in reverse order */ + if (numsaved == 0) numsaved = backtrace(zbt, MAX_ZTRACE_DEPTH); + count = (int) (zone->elem_size / sizeof(uintptr_t)); + if (count >= numsaved) count = numsaved - 1; + for (idx = 0; idx < count; idx++) ((uintptr_t *)addr)[count - 1 - idx] = zbt[idx + 1]; + } +#endif /* DEBUG || DEVELOPMENT */ } TRACE_MACHLEAKS(ZALLOC_CODE, ZALLOC_CODE_2, zone->elem_size, addr); - - if (addr) { - task_t task; - zinfo_usage_t zinfo; - vm_size_t sz = zone->elem_size; - - if (zone->caller_acct) - ledger_credit(thr->t_ledger, task_ledgers.tkm_private, sz); - else - ledger_credit(thr->t_ledger, task_ledgers.tkm_shared, sz); - - if ((task = thr->task) != NULL && (zinfo = task->tkm_zinfo) != NULL) - 
OSAddAtomic64(sz, (int64_t *)&zinfo[zone->index].alloc); - } return((void *)addr); } @@ -2769,17 +2539,16 @@ zalloc_async( __unused thread_call_param_t p0, __unused thread_call_param_t p1) { - zone_t current_z = NULL, head_z; + zone_t current_z = NULL; unsigned int max_zones, i; void *elt = NULL; boolean_t pending = FALSE; simple_lock(&all_zones_lock); - head_z = first_zone; max_zones = num_zones; simple_unlock(&all_zones_lock); - current_z = head_z; for (i = 0; i < max_zones; i++) { + current_z = &(zone_array[i]); lock_zone(current_z); if (current_z->async_pending == TRUE) { current_z->async_pending = FALSE; @@ -2792,165 +2561,66 @@ zalloc_async( zfree(current_z, elt); pending = FALSE; } - /* - * This is based on assumption that zones never get - * freed once allocated and linked. - * Hence a read outside of lock is OK. - */ - current_z = current_z->next_zone; } } /* * zget returns an element from the specified zone * and immediately returns nothing if there is nothing there. - * - * This form should be used when you can not block (like when - * processing an interrupt). - * - * XXX: It seems like only vm_page_grab_fictitious_common uses this, and its - * friend vm_page_more_fictitious can block, so it doesn't seem like - * this is used for interrupts any more.... */ void * zget( - register zone_t zone) + zone_t zone) { - vm_offset_t addr; - boolean_t check_poison = FALSE; - -#if CONFIG_ZLEAKS - uintptr_t zbt[MAX_ZTRACE_DEPTH]; /* used for zone leak detection */ - uint32_t zleak_tracedepth = 0; /* log this allocation if nonzero */ -#endif /* CONFIG_ZLEAKS */ - - assert( zone != ZONE_NULL ); + return zalloc_internal(zone, FALSE, TRUE); +} -#if CONFIG_ZLEAKS - /* - * Zone leak detection: capture a backtrace - */ - if (__improbable(zone->zleak_on && sample_counter(&zone->zleak_capture, zleak_sample_factor) == TRUE)) { - zleak_tracedepth = fastbacktrace(zbt, MAX_ZTRACE_DEPTH); - } -#endif /* CONFIG_ZLEAKS */ +/* Keep this FALSE by default. 
Large memory machine run orders of magnitude + slower in debug mode when true. Use debugger to enable if needed */ +/* static */ boolean_t zone_check = FALSE; - if (!lock_try_zone(zone)) - return NULL; - - addr = try_alloc_from_zone(zone, &check_poison); +static void zone_check_freelist(zone_t zone, vm_offset_t elem) +{ + struct zone_free_element *this; + struct zone_page_metadata *thispage; - vm_offset_t inner_size = zone->elem_size; - -#if ZONE_DEBUG - if (addr && zone_debug_enabled(zone)) { - enqueue_tail(&zone->active_zones, (queue_entry_t)addr); - addr += ZONE_DEBUG_OFFSET; - inner_size -= ZONE_DEBUG_OFFSET; + if (zone->allows_foreign) { + for (thispage = (struct zone_page_metadata *)queue_first(&zone->pages.any_free_foreign); + !queue_end(&zone->pages.any_free_foreign, &(thispage->pages)); + thispage = (struct zone_page_metadata *)queue_next(&(thispage->pages))) { + for (this = page_metadata_get_freelist(thispage); + this != NULL; + this = this->next) { + if (!is_sane_zone_element(zone, (vm_address_t)this) || (vm_address_t)this == elem) + panic("zone_check_freelist"); + } + } } -#endif /* ZONE_DEBUG */ - -#if CONFIG_ZLEAKS - /* - * Zone leak detection: record the allocation - */ - if (zone->zleak_on && zleak_tracedepth > 0 && addr) { - /* Sampling can fail if another sample is happening at the same time in a different zone. */ - if (!zleak_log(zbt, addr, zleak_tracedepth, zone->elem_size)) { - /* If it failed, roll back the counter so we sample the next allocation instead. 
*/ - zone->zleak_capture = zleak_sample_factor; + for (thispage = (struct zone_page_metadata *)queue_first(&zone->pages.all_free); + !queue_end(&zone->pages.all_free, &(thispage->pages)); + thispage = (struct zone_page_metadata *)queue_next(&(thispage->pages))) { + for (this = page_metadata_get_freelist(thispage); + this != NULL; + this = this->next) { + if (!is_sane_zone_element(zone, (vm_address_t)this) || (vm_address_t)this == elem) + panic("zone_check_freelist"); } } -#endif /* CONFIG_ZLEAKS */ - - unlock_zone(zone); - - if (__improbable(check_poison && addr)) { - vm_offset_t *element_cursor = ((vm_offset_t *) addr) + 1; - vm_offset_t *backup = get_backup_ptr(inner_size, (vm_offset_t *) addr); - - for ( ; element_cursor < backup ; element_cursor++) - if (__improbable(*element_cursor != ZP_POISON)) - zone_element_was_modified_panic(zone, - addr, - *element_cursor, - ZP_POISON, - ((vm_offset_t)element_cursor) - addr); - } - - if (addr) { - /* - * Clear out the old next pointer and backup to avoid leaking the cookie - * and so that only values on the freelist have a valid cookie - */ - vm_offset_t *primary = (vm_offset_t *) addr; - vm_offset_t *backup = get_backup_ptr(inner_size, primary); - - *primary = ZP_POISON; - *backup = ZP_POISON; - } - - return((void *) addr); -} - -/* Keep this FALSE by default. Large memory machine run orders of magnitude - slower in debug mode when true. 
Use debugger to enable if needed */ -/* static */ boolean_t zone_check = FALSE; - -static void zone_check_freelist(zone_t zone, vm_offset_t elem) -{ - struct zone_free_element *this; - struct zone_page_metadata *thispage; - - if (zone->use_page_list) { - if (zone->allows_foreign) { - for (thispage = (struct zone_page_metadata *)queue_first(&zone->pages.any_free_foreign); - !queue_end(&zone->pages.any_free_foreign, (queue_entry_t)thispage); - thispage = (struct zone_page_metadata *)queue_next((queue_chain_t *)thispage)) { - for (this = thispage->elements; - this != NULL; - this = this->next) { - if (!is_sane_zone_element(zone, (vm_address_t)this) || (vm_address_t)this == elem) - panic("zone_check_freelist"); - } - } - } - for (thispage = (struct zone_page_metadata *)queue_first(&zone->pages.all_free); - !queue_end(&zone->pages.all_free, (queue_entry_t)thispage); - thispage = (struct zone_page_metadata *)queue_next((queue_chain_t *)thispage)) { - for (this = thispage->elements; - this != NULL; - this = this->next) { - if (!is_sane_zone_element(zone, (vm_address_t)this) || (vm_address_t)this == elem) - panic("zone_check_freelist"); - } - } - for (thispage = (struct zone_page_metadata *)queue_first(&zone->pages.intermediate); - !queue_end(&zone->pages.intermediate, (queue_entry_t)thispage); - thispage = (struct zone_page_metadata *)queue_next((queue_chain_t *)thispage)) { - for (this = thispage->elements; - this != NULL; - this = this->next) { - if (!is_sane_zone_element(zone, (vm_address_t)this) || (vm_address_t)this == elem) - panic("zone_check_freelist"); - } - } - } else { - for (this = zone->free_elements; - this != NULL; - this = this->next) { + for (thispage = (struct zone_page_metadata *)queue_first(&zone->pages.intermediate); + !queue_end(&zone->pages.intermediate, &(thispage->pages)); + thispage = (struct zone_page_metadata *)queue_next(&(thispage->pages))) { + for (this = page_metadata_get_freelist(thispage); + this != NULL; + this = this->next) { if 
(!is_sane_zone_element(zone, (vm_address_t)this) || (vm_address_t)this == elem) panic("zone_check_freelist"); } } } -static zone_t zone_last_bogus_zone = ZONE_NULL; -static vm_offset_t zone_last_bogus_elem = 0; - void zfree( - register zone_t zone, + zone_t zone, void *addr) { vm_offset_t elem = (vm_offset_t) addr; @@ -2961,42 +2631,6 @@ zfree( assert(zone != ZONE_NULL); -#if 1 - if (zone->use_page_list) { - struct zone_page_metadata *page_meta = get_zone_page_metadata((struct zone_free_element *)addr); - if (zone != page_meta->zone) { - /* - * Something bad has happened. Someone tried to zfree a pointer but the metadata says it is from - * a different zone (or maybe it's from a zone that doesn't use page free lists at all). We can repair - * some cases of this, if: - * 1) The specified zone had use_page_list, and the true zone also has use_page_list set. In that case - * we can swap the zone_t - * 2) The specified zone had use_page_list, but the true zone does not. In this case page_meta is garbage, - * and dereferencing page_meta->zone might panic. - * To distinguish the two, we enumerate the zone list to match it up. - * We do not handle the case where an incorrect zone is passed that does not have use_page_list set, - * even if the true zone did have this set. - */ - zone_t fixed_zone = NULL; - int fixed_i, max_zones; - - simple_lock(&all_zones_lock); - max_zones = num_zones; - fixed_zone = first_zone; - simple_unlock(&all_zones_lock); - - for (fixed_i=0; fixed_i < max_zones; fixed_i++, fixed_zone = fixed_zone->next_zone) { - if (fixed_zone == page_meta->zone && fixed_zone->use_page_list) { - /* we can fix this */ - printf("Fixing incorrect zfree from zone %s to zone %s\n", zone->zone_name, fixed_zone->zone_name); - zone = fixed_zone; - break; - } - } - } - } -#endif - /* * If zone logging is turned on and this is the zone we're tracking, grab a backtrace. 
*/ @@ -3008,25 +2642,24 @@ zfree( /* Basic sanity checks */ if (zone == ZONE_NULL || elem == (vm_offset_t)0) panic("zfree: NULL"); - /* zone_gc assumes zones are never freed */ - if (zone == zone_zone) - panic("zfree: freeing to zone_zone breaks zone_gc!"); #endif #if CONFIG_GZALLOC gzfreed = gzalloc_free(zone, addr); #endif + if (!gzfreed) { + struct zone_page_metadata *page_meta = get_zone_page_metadata((struct zone_free_element *)addr, FALSE); + if (zone != PAGE_METADATA_GET_ZONE(page_meta)) { + panic("Element %p from zone %s caught being freed to wrong zone %s\n", addr, PAGE_METADATA_GET_ZONE(page_meta)->zone_name, zone->zone_name); + } + } + TRACE_MACHLEAKS(ZFREE_CODE, ZFREE_CODE_2, zone->elem_size, (uintptr_t)addr); if (__improbable(!gzfreed && zone->collectable && !zone->allows_foreign && !from_zone_map(elem, zone->elem_size))) { -#if MACH_ASSERT panic("zfree: non-allocated memory in collectable zone!"); -#endif - zone_last_bogus_zone = zone; - zone_last_bogus_elem = elem; - return; } if ((zp_factor != 0 || zp_tiny_zone_limit != 0) && !gzfreed) { @@ -3040,11 +2673,6 @@ zfree( vm_offset_t inner_size = zone->elem_size; -#if ZONE_DEBUG - if (!gzfreed && zone_debug_enabled(zone)) { - inner_size -= ZONE_DEBUG_OFFSET; - } -#endif uint32_t sample_factor = zp_factor + (((uint32_t)inner_size) >> zp_scale); if (inner_size <= zp_tiny_zone_limit) @@ -3064,8 +2692,6 @@ zfree( } } - lock_zone(zone); - /* * See if we're doing logging on this zone. There are two styles of logging used depending on * whether we're trying to catch a leak or corruption. See comments above in zalloc for details. @@ -3077,7 +2703,7 @@ zfree( * We're logging to catch a corruption. Add a record of this zfree operation * to log. */ - btlog_add_entry(zlog_btlog, (void *)addr, ZOP_FREE, (void **)zbt, numsaved); + btlog_add_entry(zone->zlog_btlog, (void *)addr, ZOP_FREE, (void **)zbt, numsaved); } else { /* * We're logging to catch a leak. 
Remove any record we might have for this @@ -3085,29 +2711,12 @@ zfree( * overflowed and that's OK. Since the log is of a limited size, old records * get overwritten if there are more zallocs than zfrees. */ - btlog_remove_entries_for_element(zlog_btlog, (void *)addr); + btlog_remove_entries_for_element(zone->zlog_btlog, (void *)addr); } } -#if ZONE_DEBUG - if (!gzfreed && zone_debug_enabled(zone)) { - queue_t tmp_elem; - - elem -= ZONE_DEBUG_OFFSET; - if (zone_check) { - /* check the zone's consistency */ + lock_zone(zone); - for (tmp_elem = queue_first(&zone->active_zones); - !queue_end(tmp_elem, &zone->active_zones); - tmp_elem = queue_next(tmp_elem)) - if (elem == (vm_offset_t)tmp_elem) - break; - if (elem != (vm_offset_t)tmp_elem) - panic("zfree()ing element from wrong zone"); - } - remqueue((queue_t) elem); - } -#endif /* ZONE_DEBUG */ if (zone_check) { zone_check_freelist(zone, elem); } @@ -3131,31 +2740,7 @@ zfree( } #endif /* CONFIG_ZLEAKS */ - /* - * If elements have one or more pages, and memory is low, - * request to run the garbage collection in the zone the next - * time the pageout thread runs. - */ - if (zone->elem_size >= PAGE_SIZE && - vm_pool_low()){ - zone_gc_forced = TRUE; - } unlock_zone(zone); - - { - thread_t thr = current_thread(); - task_t task; - zinfo_usage_t zinfo; - vm_size_t sz = zone->elem_size; - - if (zone->caller_acct) - ledger_debit(thr->t_ledger, task_ledgers.tkm_private, sz); - else - ledger_debit(thr->t_ledger, task_ledgers.tkm_shared, sz); - - if ((task = thr->task) != NULL && (zinfo = task->tkm_zinfo) != NULL) - OSAddAtomic64(sz, (int64_t *)&zinfo[zone->index].free); - } } @@ -3201,16 +2786,6 @@ zone_change( break; case Z_ALIGNMENT_REQUIRED: zone->alignment_required = value; - /* - * Disable the page list optimization here to provide - * more of an alignment guarantee. This prevents - * the alignment from being modified by the metadata stored - * at the beginning of the page. 
- */ - zone->use_page_list = FALSE; -#if ZONE_DEBUG - zone_debug_disable(zone); -#endif #if CONFIG_GZALLOC gzalloc_reconfigure(zone); #endif @@ -3242,225 +2817,6 @@ zone_free_count(zone_t zone) return(free_count); } -/* - * Zone garbage collection subroutines - */ - -boolean_t -zone_page_collectable( - vm_offset_t addr, - vm_size_t size) -{ - struct zone_page_table_entry *zp; - zone_page_index_t i, j; - -#if ZONE_ALIAS_ADDR - addr = zone_virtual_addr(addr); -#endif -#if MACH_ASSERT - if (!from_zone_map(addr, size)) - panic("zone_page_collectable"); -#endif - - i = (zone_page_index_t)atop_kernel(addr-zone_map_min_address); - j = (zone_page_index_t)atop_kernel((addr+size-1) - zone_map_min_address); - - for (; i <= j; i++) { - zp = zone_page_table_lookup(i); - if (zp->collect_count == zp->alloc_count) - return (TRUE); - } - - return (FALSE); -} - -void -zone_page_keep( - vm_offset_t addr, - vm_size_t size) -{ - struct zone_page_table_entry *zp; - zone_page_index_t i, j; - -#if ZONE_ALIAS_ADDR - addr = zone_virtual_addr(addr); -#endif -#if MACH_ASSERT - if (!from_zone_map(addr, size)) - panic("zone_page_keep"); -#endif - - i = (zone_page_index_t)atop_kernel(addr-zone_map_min_address); - j = (zone_page_index_t)atop_kernel((addr+size-1) - zone_map_min_address); - - for (; i <= j; i++) { - zp = zone_page_table_lookup(i); - zp->collect_count = 0; - } -} - -void -zone_page_collect( - vm_offset_t addr, - vm_size_t size) -{ - struct zone_page_table_entry *zp; - zone_page_index_t i, j; - -#if ZONE_ALIAS_ADDR - addr = zone_virtual_addr(addr); -#endif -#if MACH_ASSERT - if (!from_zone_map(addr, size)) - panic("zone_page_collect"); -#endif - - i = (zone_page_index_t)atop_kernel(addr-zone_map_min_address); - j = (zone_page_index_t)atop_kernel((addr+size-1) - zone_map_min_address); - - for (; i <= j; i++) { - zp = zone_page_table_lookup(i); - ++zp->collect_count; - } -} - -void -zone_page_init( - vm_offset_t addr, - vm_size_t size) -{ - struct zone_page_table_entry *zp; - 
zone_page_index_t i, j; - -#if ZONE_ALIAS_ADDR - addr = zone_virtual_addr(addr); -#endif -#if MACH_ASSERT - if (!from_zone_map(addr, size)) - panic("zone_page_init"); -#endif - - i = (zone_page_index_t)atop_kernel(addr-zone_map_min_address); - j = (zone_page_index_t)atop_kernel((addr+size-1) - zone_map_min_address); - - for (; i <= j; i++) { - /* make sure entry exists before marking unused */ - zone_page_table_expand(i); - - zp = zone_page_table_lookup(i); - assert(zp); - zp->alloc_count = ZONE_PAGE_UNUSED; - zp->collect_count = 0; - } -} - -void -zone_page_alloc( - vm_offset_t addr, - vm_size_t size) -{ - struct zone_page_table_entry *zp; - zone_page_index_t i, j; - -#if ZONE_ALIAS_ADDR - addr = zone_virtual_addr(addr); -#endif -#if MACH_ASSERT - if (!from_zone_map(addr, size)) - panic("zone_page_alloc"); -#endif - - i = (zone_page_index_t)atop_kernel(addr-zone_map_min_address); - j = (zone_page_index_t)atop_kernel((addr+size-1) - zone_map_min_address); - - for (; i <= j; i++) { - zp = zone_page_table_lookup(i); - assert(zp); - - /* - * Set alloc_count to ZONE_PAGE_USED if - * it was previously set to ZONE_PAGE_UNUSED. 
- */ - if (zp->alloc_count == ZONE_PAGE_UNUSED) - zp->alloc_count = ZONE_PAGE_USED; - - ++zp->alloc_count; - } -} - -void -zone_page_free_element( - zone_page_index_t *free_page_head, - zone_page_index_t *free_page_tail, - vm_offset_t addr, - vm_size_t size) -{ - struct zone_page_table_entry *zp; - zone_page_index_t i, j; - -#if ZONE_ALIAS_ADDR - addr = zone_virtual_addr(addr); -#endif -#if MACH_ASSERT - if (!from_zone_map(addr, size)) - panic("zone_page_free_element"); -#endif - - /* Clear out the old next and backup pointers */ - vm_offset_t *primary = (vm_offset_t *) addr; - vm_offset_t *backup = get_backup_ptr(size, primary); - - *primary = ZP_POISON; - *backup = ZP_POISON; - - i = (zone_page_index_t)atop_kernel(addr-zone_map_min_address); - j = (zone_page_index_t)atop_kernel((addr+size-1) - zone_map_min_address); - - for (; i <= j; i++) { - zp = zone_page_table_lookup(i); - - if (zp->collect_count > 0) - --zp->collect_count; - if (--zp->alloc_count == 0) { - vm_address_t free_page_address; - vm_address_t prev_free_page_address; - - zp->alloc_count = ZONE_PAGE_UNUSED; - zp->collect_count = 0; - - - /* - * This element was the last one on this page, re-use the page's - * storage for a page freelist - */ - free_page_address = zone_map_min_address + PAGE_SIZE * ((vm_size_t)i); - *(zone_page_index_t *)free_page_address = ZONE_PAGE_INDEX_INVALID; - - if (*free_page_head == ZONE_PAGE_INDEX_INVALID) { - *free_page_head = i; - *free_page_tail = i; - } else { - prev_free_page_address = zone_map_min_address + PAGE_SIZE * ((vm_size_t)(*free_page_tail)); - *(zone_page_index_t *)prev_free_page_address = i; - *free_page_tail = i; - } - } - } -} - - -#define ZONEGC_SMALL_ELEMENT_SIZE 4096 - -struct { - uint64_t zgc_invoked; - uint64_t zgc_bailed; - uint32_t pgs_freed; - - uint32_t elems_collected, - elems_freed, - elems_kept; -} zgc_stats; - /* Zone garbage collection * * zone_gc will walk through all the free elements in all the @@ -3468,459 +2824,139 @@ struct { * pages. 
zone_gc is called by consider_zone_gc when the system * begins to run out of memory. */ +extern zone_t vm_map_entry_reserved_zone; +uint64_t zone_gc_bailed = 0; + void -zone_gc(boolean_t all_zones) +zone_gc(void) { unsigned int max_zones; zone_t z; unsigned int i; - uint32_t old_pgs_freed; - zone_page_index_t zone_free_page_head; - zone_page_index_t zone_free_page_tail; - thread_t mythread = current_thread(); + zone_t zres = vm_map_entry_reserved_zone; lck_mtx_lock(&zone_gc_lock); - zgc_stats.zgc_invoked++; - old_pgs_freed = zgc_stats.pgs_freed; - simple_lock(&all_zones_lock); max_zones = num_zones; - z = first_zone; simple_unlock(&all_zones_lock); if (zalloc_debug & ZALLOC_DEBUG_ZONEGC) - kprintf("zone_gc(all_zones=%s) starting...\n", all_zones ? "TRUE" : "FALSE"); - - /* - * it's ok to allow eager kernel preemption while - * while holding a zone lock since it's taken - * as a spin lock (which prevents preemption) - */ - thread_set_eager_preempt(mythread); - -#if MACH_ASSERT - for (i = 0; i < zone_pages; i++) { - struct zone_page_table_entry *zp; - - zp = zone_page_table_lookup(i); - assert(!zp || (zp->collect_count == 0)); - } -#endif /* MACH_ASSERT */ + kprintf("zone_gc() starting...\n"); - for (i = 0; i < max_zones; i++, z = z->next_zone) { - unsigned int n, m; - vm_size_t elt_size, size_freed; - struct zone_free_element *elt, *base_elt, *base_prev, *prev, *scan, *keep, *tail; - int kmem_frees = 0, total_freed_pages = 0; - struct zone_page_metadata *page_meta; - queue_head_t page_meta_head; + for (i = 0; i < max_zones; i++) { + z = &(zone_array[i]); + vm_size_t elt_size, size_freed; + int total_freed_pages = 0; + struct zone_page_metadata *page_meta; + queue_head_t page_meta_head; assert(z != ZONE_NULL); if (!z->collectable) continue; - - if (all_zones == FALSE && z->elem_size < ZONEGC_SMALL_ELEMENT_SIZE && !z->use_page_list) + + if (queue_empty(&z->pages.all_free)) { continue; + } + + /* + * Since kmem_free() might use VM entries from the reserved VM entries 
zone, we should bail from zone_gc() if we + * are below the critical threshold for that zone. Otherwise, there could be a deadlock between the zone_gc + * thread and the zone_replenish thread for the VM entries zone on the zone_map lock. + */ + if (zres->zone_replenishing) { + zone_gc_bailed++; + break; + } lock_zone(z); - elt_size = z->elem_size; - /* - * Do a quick feasibility check before we scan the zone: - * skip unless there is likelihood of getting pages back - * (i.e we need a whole allocation block's worth of free - * elements before we can garbage collect) and - * the zone has more than 10 percent of it's elements free - * or the element size is a multiple of the PAGE_SIZE - */ - if ((elt_size & PAGE_MASK) && - !z->use_page_list && - (((z->cur_size - z->count * elt_size) <= (2 * z->alloc_size)) || - ((z->cur_size - z->count * elt_size) <= (z->cur_size / 10)))) { - unlock_zone(z); + if (queue_empty(&z->pages.all_free)) { + unlock_zone(z); continue; } - z->doing_gc = TRUE; - /* * Snatch all of the free elements away from the zone. 
*/ + uint64_t old_all_free_count = z->count_all_free_pages; + queue_new_head(&z->pages.all_free, &page_meta_head, struct zone_page_metadata *, pages); + queue_init(&z->pages.all_free); + z->count_all_free_pages = 0; + unlock_zone(z); - if (z->use_page_list) { - queue_new_head(&z->pages.all_free, &page_meta_head, struct zone_page_metadata *, pages); - queue_init(&z->pages.all_free); - } else { - scan = (void *)z->free_elements; - z->free_elements = 0; + /* Iterate through all elements to find out size and count of elements we snatched */ + size_freed = 0; + queue_iterate(&page_meta_head, page_meta, struct zone_page_metadata *, pages) { + assert(from_zone_map((vm_address_t)page_meta, sizeof(*page_meta))); /* foreign elements should be in any_free_foreign */ + size_freed += elt_size * page_meta->free_count; } + /* Update the zone size and free element count */ + lock_zone(z); + z->cur_size -= size_freed; + z->countfree -= size_freed/elt_size; unlock_zone(z); - if (z->use_page_list) { - /* - * For zones that maintain page lists (which in turn - * track free elements on those pages), zone_gc() - * is incredibly easy, and we bypass all the logic - * for scanning elements and mapping them to - * collectable pages - */ - - size_freed = 0; - - queue_iterate(&page_meta_head, page_meta, struct zone_page_metadata *, pages) { - assert(from_zone_map((vm_address_t)page_meta, sizeof(*page_meta))); /* foreign elements should be in any_free_foreign */ + while ((page_meta = (struct zone_page_metadata *)dequeue_head(&page_meta_head)) != NULL) { + vm_address_t free_page_address; + if (zres->zone_replenishing) + break; + /* Free the pages for metadata and account for them */ + free_page_address = get_zone_page(page_meta); + ZONE_PAGE_COUNT_DECR(z, page_meta->page_count); + total_freed_pages += page_meta->page_count; + old_all_free_count -= page_meta->page_count; + size_freed -= (elt_size * page_meta->free_count); + kmem_free(zone_map, free_page_address, (page_meta->page_count * 
PAGE_SIZE)); + thread_yield_to_preemption(); + } + if (page_meta != NULL) { + /* + * We bailed because the VM entry reserved zone is replenishing. Put the remaining + * metadata objects back on the all_free list and bail. + */ + queue_entry_t qe; + enqueue_head(&page_meta_head, &(page_meta->pages)); + zone_gc_bailed++; - zgc_stats.elems_freed += page_meta->free_count; - size_freed += elt_size * page_meta->free_count; - zgc_stats.elems_collected += page_meta->free_count; - } - lock_zone(z); - - if (size_freed > 0) { - z->cur_size -= size_freed; - z->countfree -= size_freed/elt_size; - } - - z->doing_gc = FALSE; - if (z->waiting) { - z->waiting = FALSE; - zone_wakeup(z); + qe_foreach_safe(qe, &page_meta_head) { + re_queue_tail(&z->pages.all_free, qe); } - + z->count_all_free_pages += (int)old_all_free_count; + z->cur_size += size_freed; + z->countfree += size_freed/elt_size; unlock_zone(z); - - if (queue_empty(&page_meta_head)) - continue; - - thread_clear_eager_preempt(mythread); - - while ((page_meta = (struct zone_page_metadata *)dequeue_head(&page_meta_head)) != NULL) { - vm_address_t free_page_address; - - free_page_address = trunc_page((vm_address_t)page_meta); -#if ZONE_ALIAS_ADDR - free_page_address = zone_virtual_addr(free_page_address); -#endif - kmem_free(zone_map, free_page_address, PAGE_SIZE); - ZONE_PAGE_COUNT_DECR(z, 1); - total_freed_pages++; - zgc_stats.pgs_freed += 1; - - if (++kmem_frees == 32) { - thread_yield_internal(1); - kmem_frees = 0; - } - } - if (zalloc_debug & ZALLOC_DEBUG_ZONEGC) - kprintf("zone_gc() of zone %s freed %lu elements, %d pages\n", z->zone_name, (unsigned long)size_freed/elt_size, total_freed_pages); - - thread_set_eager_preempt(mythread); - continue; /* go to next zone */ + kprintf("zone_gc() bailed due to VM entry zone replenishing (zone_gc_bailed: %lld)\n", zone_gc_bailed); + break; } + + /* We freed all the pages from the all_free list for this zone */ + assert(old_all_free_count == 0); - /* - * Pass 1: - * - * Determine 
which elements we can attempt to collect - * and count them up in the page table. Foreign elements - * are returned to the zone. - */ - - prev = (void *)&scan; - elt = scan; - n = 0; tail = keep = NULL; + if (zalloc_debug & ZALLOC_DEBUG_ZONEGC) + kprintf("zone_gc() of zone %s freed %lu elements, %d pages\n", z->zone_name, (unsigned long)size_freed/elt_size, total_freed_pages); + } - zone_free_page_head = ZONE_PAGE_INDEX_INVALID; - zone_free_page_tail = ZONE_PAGE_INDEX_INVALID; + lck_mtx_unlock(&zone_gc_lock); +} +extern vm_offset_t kmapoff_kaddr; +extern unsigned int kmapoff_pgcnt; - while (elt != NULL) { - if (from_zone_map(elt, elt_size)) { - zone_page_collect((vm_offset_t)elt, elt_size); - - prev = elt; - elt = elt->next; - - ++zgc_stats.elems_collected; - } - else { - if (keep == NULL) - keep = tail = elt; - else { - append_zone_element(z, tail, elt); - tail = elt; - } - - append_zone_element(z, prev, elt->next); - elt = elt->next; - append_zone_element(z, tail, NULL); - } - - /* - * Dribble back the elements we are keeping. - * If there are none, give some elements that we haven't looked at yet - * back to the freelist so that others waiting on the zone don't get stuck - * for too long. This might prevent us from recovering some memory, - * but allows us to avoid having to allocate new memory to serve requests - * while zone_gc has all the free memory tied up. 
- * - */ - - if (++n >= 50) { - if (z->waiting == TRUE) { - /* z->waiting checked without lock held, rechecked below after locking */ - lock_zone(z); - - if (keep != NULL) { - add_list_to_zone(z, keep, tail); - tail = keep = NULL; - } else { - m =0; - base_elt = elt; - base_prev = prev; - while ((elt != NULL) && (++m < 50)) { - prev = elt; - elt = elt->next; - } - if (m !=0 ) { - /* Extract the elements from the list and - * give them back */ - append_zone_element(z, prev, NULL); - add_list_to_zone(z, base_elt, prev); - append_zone_element(z, base_prev, elt); - prev = base_prev; - } - } - - if (z->waiting) { - z->waiting = FALSE; - zone_wakeup(z); - } - - unlock_zone(z); - } - n =0; - } - } - - /* - * Return any remaining elements. - */ - - if (keep != NULL) { - lock_zone(z); - - add_list_to_zone(z, keep, tail); - - if (z->waiting) { - z->waiting = FALSE; - zone_wakeup(z); - } - - unlock_zone(z); - } - - /* - * Pass 2: - * - * Determine which pages we can reclaim and - * free those elements. - */ - - size_freed = 0; - elt = scan; - n = 0; tail = keep = NULL; - - while (elt != NULL) { - if (zone_page_collectable((vm_offset_t)elt, elt_size)) { - struct zone_free_element *next_elt = elt->next; - - size_freed += elt_size; - - /* - * If this is the last allocation on the page(s), - * we may use their storage to maintain the linked - * list of free-able pages. So store elt->next because - * "elt" may be scribbled over. - */ - zone_page_free_element(&zone_free_page_head, &zone_free_page_tail, (vm_offset_t)elt, elt_size); - - elt = next_elt; - - ++zgc_stats.elems_freed; - } - else { - zone_page_keep((vm_offset_t)elt, elt_size); - - if (keep == NULL) - keep = tail = elt; - else { - append_zone_element(z, tail, elt); - tail = elt; - } - - elt = elt->next; - append_zone_element(z, tail, NULL); - - ++zgc_stats.elems_kept; - } - - /* - * Dribble back the elements we are keeping, - * and update the zone size info. 
- */ - - if (++n >= 50) { - lock_zone(z); - - z->cur_size -= size_freed; - z->countfree -= size_freed/elt_size; - size_freed = 0; - - if (keep != NULL) { - add_list_to_zone(z, keep, tail); - } - - if (z->waiting) { - z->waiting = FALSE; - zone_wakeup(z); - } - - unlock_zone(z); - - n = 0; tail = keep = NULL; - } - } - - /* - * Return any remaining elements, and update - * the zone size info. - */ - - lock_zone(z); - - if (size_freed > 0 || keep != NULL) { - - z->cur_size -= size_freed; - z->countfree -= size_freed/elt_size; - - if (keep != NULL) { - add_list_to_zone(z, keep, tail); - } - - } - - z->doing_gc = FALSE; - if (z->waiting) { - z->waiting = FALSE; - zone_wakeup(z); - } - unlock_zone(z); - - if (zone_free_page_head == ZONE_PAGE_INDEX_INVALID) - continue; - - /* - * we don't want to allow eager kernel preemption while holding the - * various locks taken in the kmem_free path of execution - */ - thread_clear_eager_preempt(mythread); - - - /* - * This loop counts the number of pages that should be freed by the - * next loop that tries to coalesce the kmem_frees() - */ - uint32_t pages_to_free_count = 0; - vm_address_t fpa; - zone_page_index_t index; - for (index = zone_free_page_head; index != ZONE_PAGE_INDEX_INVALID;) { - pages_to_free_count++; - fpa = zone_map_min_address + PAGE_SIZE * ((vm_size_t)index); - index = *(zone_page_index_t *)fpa; - } - - /* - * Reclaim the pages we are freeing. 
- */ - while (zone_free_page_head != ZONE_PAGE_INDEX_INVALID) { - zone_page_index_t zind = zone_free_page_head; - vm_address_t free_page_address; - int page_count; - - /* - * Use the first word of the page about to be freed to find the next free page - */ - free_page_address = zone_map_min_address + PAGE_SIZE * ((vm_size_t)zind); - zone_free_page_head = *(zone_page_index_t *)free_page_address; - - page_count = 1; - total_freed_pages++; - - while (zone_free_page_head != ZONE_PAGE_INDEX_INVALID) { - zone_page_index_t next_zind = zone_free_page_head; - vm_address_t next_free_page_address; - - next_free_page_address = zone_map_min_address + PAGE_SIZE * ((vm_size_t)next_zind); - - if (next_free_page_address == (free_page_address - PAGE_SIZE)) { - free_page_address = next_free_page_address; - } else if (next_free_page_address != (free_page_address + (PAGE_SIZE * page_count))) - break; - - zone_free_page_head = *(zone_page_index_t *)next_free_page_address; - page_count++; - total_freed_pages++; - } - kmem_free(zone_map, free_page_address, page_count * PAGE_SIZE); - ZONE_PAGE_COUNT_DECR(z, page_count); - zgc_stats.pgs_freed += page_count; - pages_to_free_count -= page_count; - - if (++kmem_frees == 32) { - thread_yield_internal(1); - kmem_frees = 0; - } - } - - /* Check that we actually free the exact number of pages we were supposed to */ - assert(pages_to_free_count == 0); - - if (zalloc_debug & ZALLOC_DEBUG_ZONEGC) - kprintf("zone_gc() of zone %s freed %lu elements, %d pages\n", z->zone_name, (unsigned long)size_freed/elt_size, total_freed_pages); - - thread_set_eager_preempt(mythread); - } - - if (old_pgs_freed == zgc_stats.pgs_freed) - zgc_stats.zgc_bailed++; - - thread_clear_eager_preempt(mythread); - - lck_mtx_unlock(&zone_gc_lock); - -} - -extern vm_offset_t kmapoff_kaddr; -extern unsigned int kmapoff_pgcnt; - -/* - * consider_zone_gc: - * - * Called by the pageout daemon when the system needs more free pages. 
- */ +/* + * consider_zone_gc: + * + * Called by the pageout daemon when the system needs more free pages. + */ void -consider_zone_gc(boolean_t force) +consider_zone_gc(void) { - boolean_t all_zones = FALSE; - if (kmapoff_kaddr != 0) { /* * One-time reclaim of kernel_map resources we allocated in @@ -3931,206 +2967,21 @@ consider_zone_gc(boolean_t force) kmapoff_kaddr = 0; } - if (zone_gc_allowed && - (zone_gc_allowed_by_time_throttle || - zone_gc_forced || - force)) { - if (zone_gc_allowed_by_time_throttle == TRUE) { - zone_gc_allowed_by_time_throttle = FALSE; - all_zones = TRUE; - } - zone_gc_forced = FALSE; - - zone_gc(all_zones); - } -} - -/* - * By default, don't attempt zone GC more frequently - * than once / 1 minutes. - */ -void -compute_zone_gc_throttle(void *arg __unused) -{ - zone_gc_allowed_by_time_throttle = TRUE; + if (zone_gc_allowed) + zone_gc(); } - -#if CONFIG_TASK_ZONE_INFO - kern_return_t task_zone_info( - task_t task, - mach_zone_name_array_t *namesp, - mach_msg_type_number_t *namesCntp, - task_zone_info_array_t *infop, - mach_msg_type_number_t *infoCntp) -{ - mach_zone_name_t *names; - vm_offset_t names_addr; - vm_size_t names_size; - task_zone_info_t *info; - vm_offset_t info_addr; - vm_size_t info_size; - unsigned int max_zones, i; - zone_t z; - mach_zone_name_t *zn; - task_zone_info_t *zi; - kern_return_t kr; - - vm_size_t used; - vm_map_copy_t copy; - - - if (task == TASK_NULL) - return KERN_INVALID_TASK; - - /* - * We assume that zones aren't freed once allocated. - * We won't pick up any zones that are allocated later. 
- */ - - simple_lock(&all_zones_lock); - max_zones = (unsigned int)(num_zones + num_fake_zones); - z = first_zone; - simple_unlock(&all_zones_lock); - - names_size = round_page(max_zones * sizeof *names); - kr = kmem_alloc_pageable(ipc_kernel_map, - &names_addr, names_size, VM_KERN_MEMORY_IPC); - if (kr != KERN_SUCCESS) - return kr; - names = (mach_zone_name_t *) names_addr; - - info_size = round_page(max_zones * sizeof *info); - kr = kmem_alloc_pageable(ipc_kernel_map, - &info_addr, info_size, VM_KERN_MEMORY_IPC); - if (kr != KERN_SUCCESS) { - kmem_free(ipc_kernel_map, - names_addr, names_size); - return kr; - } - - info = (task_zone_info_t *) info_addr; - - zn = &names[0]; - zi = &info[0]; - - for (i = 0; i < max_zones - num_fake_zones; i++) { - struct zone zcopy; - - assert(z != ZONE_NULL); - - lock_zone(z); - zcopy = *z; - unlock_zone(z); - - simple_lock(&all_zones_lock); - z = z->next_zone; - simple_unlock(&all_zones_lock); - - /* assuming here the name data is static */ - (void) strncpy(zn->mzn_name, zcopy.zone_name, - sizeof zn->mzn_name); - zn->mzn_name[sizeof zn->mzn_name - 1] = '\0'; - - zi->tzi_count = (uint64_t)zcopy.count; - zi->tzi_cur_size = ptoa_64(zcopy.page_count); - zi->tzi_max_size = (uint64_t)zcopy.max_size; - zi->tzi_elem_size = (uint64_t)zcopy.elem_size; - zi->tzi_alloc_size = (uint64_t)zcopy.alloc_size; - zi->tzi_sum_size = zcopy.sum_count * zcopy.elem_size; - zi->tzi_exhaustible = (uint64_t)zcopy.exhaustible; - zi->tzi_collectable = (uint64_t)zcopy.collectable; - zi->tzi_caller_acct = (uint64_t)zcopy.caller_acct; - if (task->tkm_zinfo != NULL) { - zi->tzi_task_alloc = task->tkm_zinfo[zcopy.index].alloc; - zi->tzi_task_free = task->tkm_zinfo[zcopy.index].free; - } else { - zi->tzi_task_alloc = 0; - zi->tzi_task_free = 0; - } - zn++; - zi++; - } - - /* - * loop through the fake zones and fill them using the specialized - * functions - */ - for (i = 0; i < num_fake_zones; i++) { - int count, collectable, exhaustible, caller_acct, index; - 
vm_size_t cur_size, max_size, elem_size, alloc_size; - uint64_t sum_size; - - strncpy(zn->mzn_name, fake_zones[i].name, sizeof zn->mzn_name); - zn->mzn_name[sizeof zn->mzn_name - 1] = '\0'; - fake_zones[i].query(&count, &cur_size, - &max_size, &elem_size, - &alloc_size, &sum_size, - &collectable, &exhaustible, &caller_acct); - zi->tzi_count = (uint64_t)count; - zi->tzi_cur_size = (uint64_t)cur_size; - zi->tzi_max_size = (uint64_t)max_size; - zi->tzi_elem_size = (uint64_t)elem_size; - zi->tzi_alloc_size = (uint64_t)alloc_size; - zi->tzi_sum_size = sum_size; - zi->tzi_collectable = (uint64_t)collectable; - zi->tzi_exhaustible = (uint64_t)exhaustible; - zi->tzi_caller_acct = (uint64_t)caller_acct; - if (task->tkm_zinfo != NULL) { - index = ZINFO_SLOTS - num_fake_zones + i; - zi->tzi_task_alloc = task->tkm_zinfo[index].alloc; - zi->tzi_task_free = task->tkm_zinfo[index].free; - } else { - zi->tzi_task_alloc = 0; - zi->tzi_task_free = 0; - } - zn++; - zi++; - } - - used = max_zones * sizeof *names; - if (used != names_size) - bzero((char *) (names_addr + used), names_size - used); - - kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)names_addr, - (vm_map_size_t)used, TRUE, ©); - assert(kr == KERN_SUCCESS); - - *namesp = (mach_zone_name_t *) copy; - *namesCntp = max_zones; - - used = max_zones * sizeof *info; - - if (used != info_size) - bzero((char *) (info_addr + used), info_size - used); - - kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)info_addr, - (vm_map_size_t)used, TRUE, ©); - assert(kr == KERN_SUCCESS); - - *infop = (task_zone_info_t *) copy; - *infoCntp = max_zones; - - return KERN_SUCCESS; -} - -#else /* CONFIG_TASK_ZONE_INFO */ - -kern_return_t -task_zone_info( - __unused task_t task, - __unused mach_zone_name_array_t *namesp, + __unused task_t task, + __unused mach_zone_name_array_t *namesp, __unused mach_msg_type_number_t *namesCntp, - __unused task_zone_info_array_t *infop, + __unused task_zone_info_array_t *infop, __unused 
mach_msg_type_number_t *infoCntp) { return KERN_FAILURE; } -#endif /* CONFIG_TASK_ZONE_INFO */ - kern_return_t mach_zone_info( host_priv_t host, @@ -4142,6 +2993,18 @@ mach_zone_info( return (mach_memory_info(host, namesp, namesCntp, infop, infoCntp, NULL, NULL)); } + +kern_return_t +host_zone_info( + host_priv_t host, + zone_name_array_t *namesp, + mach_msg_type_number_t *namesCntp, + zone_info_array_t *infop, + mach_msg_type_number_t *infoCntp) +{ + return (mach_memory_info(host, (mach_zone_name_array_t *)namesp, namesCntp, (mach_zone_info_array_t *)infop, infoCntp, NULL, NULL)); +} + kern_return_t mach_memory_info( host_priv_t host, @@ -4174,7 +3037,7 @@ mach_memory_info( vm_size_t used; vm_map_copy_t copy; - + uint64_t zones_collectable_bytes = 0; if (host == HOST_NULL) return KERN_INVALID_HOST; @@ -4189,8 +3052,7 @@ mach_memory_info( */ simple_lock(&all_zones_lock); - max_zones = (unsigned int)(num_zones + num_fake_zones); - z = first_zone; + max_zones = (unsigned int)(num_zones); simple_unlock(&all_zones_lock); names_size = round_page(max_zones * sizeof *names); @@ -4210,50 +3072,18 @@ mach_memory_info( } info = (mach_zone_info_t *) info_addr; - num_sites = 0; - memory_info_addr = 0; - if (memoryInfop && memoryInfoCntp) - { - num_sites = VM_KERN_MEMORY_COUNT + VM_KERN_COUNTER_COUNT; - memory_info_size = num_sites * sizeof(*info); - memory_info_vmsize = round_page(memory_info_size); - kr = kmem_alloc_pageable(ipc_kernel_map, - &memory_info_addr, memory_info_vmsize, VM_KERN_MEMORY_IPC); - if (kr != KERN_SUCCESS) { - kmem_free(ipc_kernel_map, - names_addr, names_size); - kmem_free(ipc_kernel_map, - info_addr, info_size); - return kr; - } - - kr = vm_map_wire(ipc_kernel_map, memory_info_addr, memory_info_addr + memory_info_vmsize, - VM_PROT_READ|VM_PROT_WRITE|VM_PROT_MEMORY_TAG_MAKE(VM_KERN_MEMORY_IPC), FALSE); - assert(kr == KERN_SUCCESS); - - memory_info = (mach_memory_info_t *) memory_info_addr; - vm_page_diagnose(memory_info, num_sites); - - kr = 
vm_map_unwire(ipc_kernel_map, memory_info_addr, memory_info_addr + memory_info_vmsize, FALSE); - assert(kr == KERN_SUCCESS); - } - zn = &names[0]; zi = &info[0]; - for (i = 0; i < max_zones - num_fake_zones; i++) { + for (i = 0; i < max_zones; i++) { struct zone zcopy; - + z = &(zone_array[i]); assert(z != ZONE_NULL); lock_zone(z); zcopy = *z; unlock_zone(z); - simple_lock(&all_zones_lock); - z = z->next_zone; - simple_unlock(&all_zones_lock); - /* assuming here the name data is static */ (void) strncpy(zn->mzn_name, zcopy.zone_name, sizeof zn->mzn_name); @@ -4267,34 +3097,7 @@ mach_memory_info( zi->mzi_sum_size = zcopy.sum_count * zcopy.elem_size; zi->mzi_exhaustible = (uint64_t)zcopy.exhaustible; zi->mzi_collectable = (uint64_t)zcopy.collectable; - zn++; - zi++; - } - - /* - * loop through the fake zones and fill them using the specialized - * functions - */ - for (i = 0; i < num_fake_zones; i++) { - int count, collectable, exhaustible, caller_acct; - vm_size_t cur_size, max_size, elem_size, alloc_size; - uint64_t sum_size; - - strncpy(zn->mzn_name, fake_zones[i].name, sizeof zn->mzn_name); - zn->mzn_name[sizeof zn->mzn_name - 1] = '\0'; - fake_zones[i].query(&count, &cur_size, - &max_size, &elem_size, - &alloc_size, &sum_size, - &collectable, &exhaustible, &caller_acct); - zi->mzi_count = (uint64_t)count; - zi->mzi_cur_size = (uint64_t)cur_size; - zi->mzi_max_size = (uint64_t)max_size; - zi->mzi_elem_size = (uint64_t)elem_size; - zi->mzi_alloc_size = (uint64_t)alloc_size; - zi->mzi_sum_size = sum_size; - zi->mzi_collectable = (uint64_t)collectable; - zi->mzi_exhaustible = (uint64_t)exhaustible; - + zones_collectable_bytes += ((uint64_t)zcopy.count_all_free_pages * PAGE_SIZE); zn++; zi++; } @@ -4321,9 +3124,35 @@ mach_memory_info( *infop = (mach_zone_info_t *) copy; *infoCntp = max_zones; + + num_sites = 0; + memory_info_addr = 0; if (memoryInfop && memoryInfoCntp) { + num_sites = VM_KERN_MEMORY_COUNT + VM_KERN_COUNTER_COUNT; + memory_info_size = num_sites * 
sizeof(*info); + memory_info_vmsize = round_page(memory_info_size); + kr = kmem_alloc_pageable(ipc_kernel_map, + &memory_info_addr, memory_info_vmsize, VM_KERN_MEMORY_IPC); + if (kr != KERN_SUCCESS) { + kmem_free(ipc_kernel_map, + names_addr, names_size); + kmem_free(ipc_kernel_map, + info_addr, info_size); + return kr; + } + + kr = vm_map_wire(ipc_kernel_map, memory_info_addr, memory_info_addr + memory_info_vmsize, + VM_PROT_READ|VM_PROT_WRITE|VM_PROT_MEMORY_TAG_MAKE(VM_KERN_MEMORY_IPC), FALSE); + assert(kr == KERN_SUCCESS); + + memory_info = (mach_memory_info_t *) memory_info_addr; + vm_page_diagnose(memory_info, num_sites, zones_collectable_bytes); + + kr = vm_map_unwire(ipc_kernel_map, memory_info_addr, memory_info_addr + memory_info_vmsize, FALSE); + assert(kr == KERN_SUCCESS); + kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)memory_info_addr, (vm_map_size_t)memory_info_size, TRUE, ©); assert(kr == KERN_SUCCESS); @@ -4335,152 +3164,6 @@ mach_memory_info( return KERN_SUCCESS; } -/* - * host_zone_info - LEGACY user interface for Mach zone information - * Should use mach_zone_info() instead! - */ -kern_return_t -host_zone_info( - host_priv_t host, - zone_name_array_t *namesp, - mach_msg_type_number_t *namesCntp, - zone_info_array_t *infop, - mach_msg_type_number_t *infoCntp) -{ - zone_name_t *names; - vm_offset_t names_addr; - vm_size_t names_size; - zone_info_t *info; - vm_offset_t info_addr; - vm_size_t info_size; - unsigned int max_zones, i; - zone_t z; - zone_name_t *zn; - zone_info_t *zi; - kern_return_t kr; - - vm_size_t used; - vm_map_copy_t copy; - - - if (host == HOST_NULL) - return KERN_INVALID_HOST; -#if CONFIG_DEBUGGER_FOR_ZONE_INFO - if (!PE_i_can_has_debugger(NULL)) - return KERN_INVALID_HOST; -#endif - -#if defined(__LP64__) - if (!thread_is_64bit(current_thread())) - return KERN_NOT_SUPPORTED; -#else - if (thread_is_64bit(current_thread())) - return KERN_NOT_SUPPORTED; -#endif - - /* - * We assume that zones aren't freed once allocated. 
- * We won't pick up any zones that are allocated later. - */ - - simple_lock(&all_zones_lock); - max_zones = (unsigned int)(num_zones + num_fake_zones); - z = first_zone; - simple_unlock(&all_zones_lock); - - names_size = round_page(max_zones * sizeof *names); - kr = kmem_alloc_pageable(ipc_kernel_map, - &names_addr, names_size, VM_KERN_MEMORY_IPC); - if (kr != KERN_SUCCESS) - return kr; - names = (zone_name_t *) names_addr; - - info_size = round_page(max_zones * sizeof *info); - kr = kmem_alloc_pageable(ipc_kernel_map, - &info_addr, info_size, VM_KERN_MEMORY_IPC); - if (kr != KERN_SUCCESS) { - kmem_free(ipc_kernel_map, - names_addr, names_size); - return kr; - } - - info = (zone_info_t *) info_addr; - - zn = &names[0]; - zi = &info[0]; - - for (i = 0; i < max_zones - num_fake_zones; i++) { - struct zone zcopy; - - assert(z != ZONE_NULL); - - lock_zone(z); - zcopy = *z; - unlock_zone(z); - - simple_lock(&all_zones_lock); - z = z->next_zone; - simple_unlock(&all_zones_lock); - - /* assuming here the name data is static */ - (void) strncpy(zn->zn_name, zcopy.zone_name, - sizeof zn->zn_name); - zn->zn_name[sizeof zn->zn_name - 1] = '\0'; - - zi->zi_count = zcopy.count; - zi->zi_cur_size = ptoa(zcopy.page_count); - zi->zi_max_size = zcopy.max_size; - zi->zi_elem_size = zcopy.elem_size; - zi->zi_alloc_size = zcopy.alloc_size; - zi->zi_exhaustible = zcopy.exhaustible; - zi->zi_collectable = zcopy.collectable; - - zn++; - zi++; - } - - /* - * loop through the fake zones and fill them using the specialized - * functions - */ - for (i = 0; i < num_fake_zones; i++) { - int caller_acct; - uint64_t sum_space; - strncpy(zn->zn_name, fake_zones[i].name, sizeof zn->zn_name); - zn->zn_name[sizeof zn->zn_name - 1] = '\0'; - fake_zones[i].query(&zi->zi_count, &zi->zi_cur_size, - &zi->zi_max_size, &zi->zi_elem_size, - &zi->zi_alloc_size, &sum_space, - &zi->zi_collectable, &zi->zi_exhaustible, &caller_acct); - zn++; - zi++; - } - - used = max_zones * sizeof *names; - if (used != 
names_size) - bzero((char *) (names_addr + used), names_size - used); - - kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)names_addr, - (vm_map_size_t)used, TRUE, ©); - assert(kr == KERN_SUCCESS); - - *namesp = (zone_name_t *) copy; - *namesCntp = max_zones; - - used = max_zones * sizeof *info; - if (used != info_size) - bzero((char *) (info_addr + used), info_size - used); - - kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)info_addr, - (vm_map_size_t)used, TRUE, ©); - assert(kr == KERN_SUCCESS); - - *infop = (zone_info_t *) copy; - *infoCntp = max_zones; - - return KERN_SUCCESS; -} - kern_return_t mach_zone_force_gc( host_t host) @@ -4489,7 +3172,7 @@ mach_zone_force_gc( if (host == HOST_NULL) return KERN_INVALID_HOST; - consider_zone_gc(TRUE); + consider_zone_gc(); return (KERN_SUCCESS); } @@ -4507,21 +3190,12 @@ void zone_display_zprint() unsigned int i; zone_t the_zone; - if(first_zone!=NULL) { - the_zone = first_zone; - for (i = 0; i < num_zones; i++) { - if(the_zone->cur_size > (1024*1024)) { - printf("%.20s:\t%lu\n",the_zone->zone_name,(uintptr_t)the_zone->cur_size); - } - - if(the_zone->next_zone == NULL) { - break; - } - - the_zone = the_zone->next_zone; + for (i = 0; i < num_zones; i++) { + the_zone = &(zone_array[i]); + if(the_zone->cur_size > (1024*1024)) { + printf("%.20s:\t%lu\n",the_zone->zone_name,(uintptr_t)the_zone->cur_size); } } - printf("Kernel Stacks:\t%lu\n",(uintptr_t)(kernel_stack_size * stack_total)); #if defined(__i386__) || defined (__x86_64__) @@ -4540,21 +3214,15 @@ zone_find_largest(void) zone_t zone_largest; simple_lock(&all_zones_lock); - the_zone = first_zone; max_zones = num_zones; simple_unlock(&all_zones_lock); - zone_largest = the_zone; + zone_largest = &(zone_array[0]); for (i = 0; i < max_zones; i++) { + the_zone = &(zone_array[i]); if (the_zone->cur_size > zone_largest->cur_size) { zone_largest = the_zone; } - - if (the_zone->next_zone == NULL) { - break; - } - - the_zone = the_zone->next_zone; } return 
zone_largest; } @@ -4568,26 +3236,175 @@ zone_find_largest(void) || !queue_empty(&z->pages.intermediate) \ || (z->allows_foreign && !queue_empty(&z->pages.any_free_foreign))) -void -zone_debug_enable( - zone_t z) -{ - if (zone_debug_enabled(z) || zone_in_use(z) || - z->alloc_size < (z->elem_size + ZONE_DEBUG_OFFSET)) - return; - queue_init(&z->active_zones); - z->elem_size += ZONE_DEBUG_OFFSET; + +#endif /* ZONE_DEBUG */ + + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +#if DEBUG || DEVELOPMENT + +static uintptr_t * +zone_copy_all_allocations_inqueue(zone_t z, queue_head_t * queue, uintptr_t * elems) +{ + struct zone_page_metadata *page_meta; + vm_offset_t free, elements; + vm_offset_t idx, numElements, freeCount, bytesAvail, metaSize; + + queue_iterate(queue, page_meta, struct zone_page_metadata *, pages) + { + elements = get_zone_page(page_meta); + bytesAvail = ptoa(page_meta->page_count); + freeCount = 0; + if (z->allows_foreign && !from_zone_map(elements, z->elem_size)) + { + metaSize = (sizeof(struct zone_page_metadata) + ZONE_ELEMENT_ALIGNMENT - 1) & ~(ZONE_ELEMENT_ALIGNMENT - 1); + bytesAvail -= metaSize; + elements += metaSize; + } + numElements = bytesAvail / z->elem_size; + // construct array of all possible elements + for (idx = 0; idx < numElements; idx++) + { + elems[idx] = INSTANCE_PUT(elements + idx * z->elem_size); + } + // remove from the array all free elements + free = (vm_offset_t)page_metadata_get_freelist(page_meta); + while (free) + { + // find idx of free element + for (idx = 0; (idx < numElements) && (elems[idx] != INSTANCE_PUT(free)); idx++) {} + assert(idx < numElements); + // remove it + bcopy(&elems[idx + 1], &elems[idx], (numElements - (idx + 1)) * sizeof(elems[0])); + numElements--; + freeCount++; + // next free element + vm_offset_t *primary = (vm_offset_t *) free; + free = *primary ^ zp_nopoison_cookie; + } + elems += numElements; + } + + return (elems); +} + +kern_return_t 
+zone_leaks(const char * zoneName, uint32_t nameLen, leak_site_proc proc, void * refCon) +{ + uintptr_t zbt[MAX_ZTRACE_DEPTH]; + zone_t zone; + uintptr_t * array; + uintptr_t * next; + uintptr_t element, bt; + uint32_t idx, count, found; + uint32_t btidx, btcount, nobtcount, btfound; + uint32_t elemSize; + uint64_t maxElems; + kern_return_t kr; + + for (idx = 0; idx < num_zones; idx++) + { + if (!strncmp(zoneName, zone_array[idx].zone_name, nameLen)) break; + } + if (idx >= num_zones) return (KERN_INVALID_NAME); + zone = &zone_array[idx]; + + elemSize = (uint32_t) zone->elem_size; + maxElems = ptoa(zone->page_count) / elemSize; + + if ((zone->alloc_size % elemSize) + && !leak_scan_debug_flag) return (KERN_INVALID_CAPABILITY); + + kr = kmem_alloc_kobject(kernel_map, (vm_offset_t *) &array, + maxElems * sizeof(uintptr_t), VM_KERN_MEMORY_DIAG); + if (KERN_SUCCESS != kr) return (kr); + + lock_zone(zone); + + next = array; + next = zone_copy_all_allocations_inqueue(zone, &zone->pages.any_free_foreign, next); + next = zone_copy_all_allocations_inqueue(zone, &zone->pages.intermediate, next); + next = zone_copy_all_allocations_inqueue(zone, &zone->pages.all_used, next); + count = (uint32_t)(next - array); + + unlock_zone(zone); + + zone_leaks_scan(array, count, (uint32_t)zone->elem_size, &found); + assert(found <= count); + + for (idx = 0; idx < count; idx++) + { + element = array[idx]; + if (kInstanceFlagReferenced & element) continue; + element = INSTANCE_PUT(element) & ~kInstanceFlags; + } + + if (zone->zlog_btlog && !corruption_debug_flag) + { + // btlog_copy_backtraces_for_elements will set kInstanceFlagReferenced on elements it found + btlog_copy_backtraces_for_elements(zone->zlog_btlog, array, &count, elemSize, proc, refCon); + } + + for (nobtcount = idx = 0; idx < count; idx++) + { + element = array[idx]; + if (!element) continue; + if (kInstanceFlagReferenced & element) continue; + element = INSTANCE_PUT(element) & ~kInstanceFlags; + + // see if we can find any 
backtrace left in the element + btcount = (typeof(btcount)) (zone->elem_size / sizeof(uintptr_t)); + if (btcount >= MAX_ZTRACE_DEPTH) btcount = MAX_ZTRACE_DEPTH - 1; + for (btfound = btidx = 0; btidx < btcount; btidx++) + { + bt = ((uintptr_t *)element)[btcount - 1 - btidx]; + if (!VM_KERNEL_IS_SLID(bt)) break; + zbt[btfound++] = bt; + } + if (btfound) (*proc)(refCon, 1, elemSize, &zbt[0], btfound); + else nobtcount++; + } + if (nobtcount) + { + // fake backtrace when we found nothing + zbt[0] = (uintptr_t) &zalloc; + (*proc)(refCon, nobtcount, elemSize, &zbt[0], 1); + } + + kmem_free(kernel_map, (vm_offset_t) array, maxElems * sizeof(uintptr_t)); + + return (KERN_SUCCESS); } void -zone_debug_disable( - zone_t z) +kern_wired_diagnose(void) { - if (!zone_debug_enabled(z) || zone_in_use(z)) - return; - z->elem_size -= ZONE_DEBUG_OFFSET; - z->active_zones.next = z->active_zones.prev = NULL; -} + unsigned int count = VM_KERN_MEMORY_COUNT + VM_KERN_COUNTER_COUNT; + mach_memory_info_t info[count]; + unsigned int idx; + uint64_t total_zone, total_wired, top_wired, osfmk_wired; + if (KERN_SUCCESS != vm_page_diagnose(info, count, 0)) return; -#endif /* ZONE_DEBUG */ + total_zone = total_wired = top_wired = osfmk_wired = 0; + for (idx = 0; idx < num_zones; idx++) + { + total_zone += ptoa_64(zone_array[idx].page_count); + } + total_wired = total_zone; + + for (idx = 0; idx < count; idx++) + { + if (VM_KERN_COUNT_WIRED == info[idx].site) top_wired = info[idx].size; + if (VM_KERN_MEMORY_OSFMK == info[idx].site) osfmk_wired = info[idx].size; + if (VM_KERN_SITE_HIDE & info[idx].flags) continue; + if (!(VM_KERN_SITE_WIRED & info[idx].flags)) continue; + total_wired += info[idx].size; + } + + printf("top 0x%qx, total 0x%qx, zone 0x%qx, osfmk 0x%qx\n", + top_wired, total_wired, total_zone, osfmk_wired); +} + +#endif /* DEBUG || DEVELOPMENT */ diff --git a/osfmk/kern/zalloc.h b/osfmk/kern/zalloc.h index 544689055..5b04f6e83 100644 --- a/osfmk/kern/zalloc.h +++ b/osfmk/kern/zalloc.h 
@@ -77,6 +77,7 @@ #include #include #include +#include #if CONFIG_GZALLOC typedef struct gzalloc_data { @@ -106,6 +107,7 @@ struct zone { } pages; /* list of zone_page_metadata structs, which maintain per-page free element lists */ int count; /* Number of elements used now */ int countfree; /* Number of free elements */ + int count_all_free_pages; /* Number of pages collectable by GC */ lck_attr_t lock_attr; /* zone lock attribute */ decl_lck_mtx_data(,lock) /* zone lock */ lck_mtx_ext_t lock_ext; /* placeholder for indirect mutex */ @@ -126,21 +128,17 @@ struct zone { /* boolean_t */ async_pending :1, /* asynchronous allocation pending? */ /* boolean_t */ zleak_on :1, /* Are we collecting allocation information? */ /* boolean_t */ caller_acct :1, /* do we account allocation/free to the caller? */ - /* boolean_t */ doing_gc :1, /* garbage collect in progress? */ /* boolean_t */ noencrypt :1, /* boolean_t */ no_callout :1, /* boolean_t */ async_prio_refill :1, /* boolean_t */ gzalloc_exempt :1, /* boolean_t */ alignment_required :1, - /* boolean_t */ use_page_list :1, + /* boolean_t */ zone_logging :1, /* Enable zone logging for this zone. */ + /* boolean_t */ zone_replenishing :1, /* future */ _reserved :15; int index; /* index into zone_info arrays for this zone */ - struct zone *next_zone; /* Link for all-zones list */ const char *zone_name; /* a name for the zone */ -#if ZONE_DEBUG - queue_head_t active_zones; /* active elements */ -#endif /* ZONE_DEBUG */ #if CONFIG_ZLEAKS uint32_t zleak_capture; /* per-zone counter for capturing every N allocations */ @@ -151,6 +149,8 @@ struct zone { #if CONFIG_GZALLOC gzalloc_data_t gz; #endif /* CONFIG_GZALLOC */ + + btlog_t *zlog_btlog; /* zone logging structure to hold stacks and element references to those stacks. 
*/ }; /* @@ -162,13 +162,9 @@ typedef struct zinfo_usage_store_t { uint64_t alloc __attribute__((aligned(8))); /* allocation counter */ uint64_t free __attribute__((aligned(8))); /* free counter */ } zinfo_usage_store_t; -typedef zinfo_usage_store_t *zinfo_usage_t; -extern void zone_gc(boolean_t); -extern void consider_zone_gc(boolean_t); - -/* Steal memory for zone module */ -extern void zone_steal_memory(void); +extern void zone_gc(void); +extern void consider_zone_gc(void); /* Bootstrap zone module (create zone zone) */ extern void zone_bootstrap(void); @@ -177,11 +173,6 @@ extern void zone_bootstrap(void); extern void zone_init( vm_size_t map_size); -/* Handle per-task zone info */ -extern void zinfo_task_init(task_t task); -extern void zinfo_task_free(task_t task); - - /* Stack use statistics */ extern void stack_fake_zone_init(int zone_index); extern void stack_fake_zone_info( @@ -208,6 +199,9 @@ extern void zone_debug_disable( #define ZONE_DEBUG_OFFSET ROUNDUP(sizeof(queue_chain_t),16) #endif /* ZONE_DEBUG */ +extern unsigned int num_zones; +extern struct zone zone_array[]; + #endif /* MACH_KERNEL_PRIVATE */ __BEGIN_DECLS @@ -290,6 +284,10 @@ extern void zprealloc( extern integer_t zone_free_count( zone_t zone); +extern vm_size_t zone_element_size( + void *addr, + zone_t *z); + /* * MAX_ZTRACE_DEPTH configures how deep of a stack trace is taken on each zalloc in the zone of interest. 
15 * levels is usually enough to get past all the layers of code in kalloc and IOKit and see who the actual @@ -327,7 +325,6 @@ extern int get_zleak_state(void); #endif /* CONFIG_ZLEAKS */ /* These functions used for leak detection both in zalloc.c and mbuf.c */ -extern uint32_t fastbacktrace(uintptr_t* bt, uint32_t max_frames) __attribute__((noinline)); extern uintptr_t hash_mix(uintptr_t); extern uint32_t hashbacktrace(uintptr_t *, uint32_t, uint32_t); extern uint32_t hashaddr(uintptr_t, uint32_t); @@ -351,8 +348,13 @@ boolean_t gzalloc_enabled(void); vm_offset_t gzalloc_alloc(zone_t, boolean_t); boolean_t gzalloc_free(zone_t, void *); +boolean_t gzalloc_element_size(void *, zone_t *, vm_size_t *); #endif /* CONFIG_GZALLOC */ +/* Callbacks for btlog lock/unlock */ +void zlog_btlog_lock(__unused void *); +void zlog_btlog_unlock(__unused void *); + #endif /* XNU_KERNEL_PRIVATE */ __END_DECLS diff --git a/osfmk/kextd/Makefile b/osfmk/kextd/Makefile index 74e1d8067..a49df09c0 100644 --- a/osfmk/kextd/Makefile +++ b/osfmk/kextd/Makefile @@ -3,7 +3,6 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - include $(MakeInc_cmd) include $(MakeInc_def) @@ -16,7 +15,7 @@ KERNELFILES = ${MIG_DEFS} INSTALL_MI_LIST = INSTALL_MI_LCL_LIST = ${PRIVATE_DATAFILES} -INSTALL_MI_GEN_LIST = +INSTALL_MI_GEN_LIST = INSTALL_MI_DIR = kextd @@ -28,7 +27,7 @@ EXPORT_MI_DIR = kextd # # Build path -# +# INCFLAGS_MAKEFILE= -I.. 
MIGKUFLAGS = -DMACH_KERNEL_PRIVATE -DKERNEL_USER=1 -maxonstack 1024 @@ -42,7 +41,7 @@ COMP_FILES = ${MIG_KUSRC} do_build_all:: $(COMP_FILES) ${MIG_KUSRC} : kextd_mach.defs - @echo MIG $@ + @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)" $(_v)${MIG} ${MIGFLAGS} ${MIGKUFLAGS} \ -user kextd_mach.c \ -header kextd_mach.h \ diff --git a/osfmk/kperf/Makefile b/osfmk/kperf/Makefile index 699416570..385bc0520 100644 --- a/osfmk/kperf/Makefile +++ b/osfmk/kperf/Makefile @@ -3,17 +3,17 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - include $(MakeInc_cmd) include $(MakeInc_def) -EXPORT_ONLY_FILES = \ - context.h \ - timetrigger.h \ - pet.h \ - kperfbsd.h \ - action.h \ - kperf.h +EXPORT_ONLY_FILES = \ + action.h \ + context.h \ + kperf.h \ + kperfbsd.h \ + kperf_timer.h \ + kdebug_trigger.h \ + pet.h EXPORT_MI_DIR = kperf @@ -29,5 +29,3 @@ INSTALL_KF_MI_LIST = $(empty) include $(MakeInc_rule) include $(MakeInc_dir) - - diff --git a/osfmk/kperf/action.c b/osfmk/kperf/action.c index 1bfe48847..db437d771 100644 --- a/osfmk/kperf/action.c +++ b/osfmk/kperf/action.c @@ -33,31 +33,33 @@ #include #include -// #include #include #include /* panic */ #include #include +#include -#include -#include - +#include +#include #include -#include -#include #include -#include -#include #include -#include +#include +#include #include +#include +#include +#include +#include -#define ACTION_MAX 32 +#define ACTION_MAX (32) /* the list of different actions to take */ struct action { uint32_t sample; + uint32_t ucallstack_depth; + uint32_t kcallstack_depth; uint32_t userdata; int pid_filter; }; @@ -66,37 +68,17 @@ struct action static unsigned actionc = 0; static struct action *actionv = NULL; -/* manage callbacks from system */ - -/* callback set for kdebug */ -static int kperf_kdbg_callback_set = 0; -/* whether to record callstacks on kdebug events */ -static int kdebug_callstacks = 0; 
-/* the action ID to trigger on signposts */ -static int kperf_signpost_action = 0; - -/* callback set for context-switch */ -int kperf_cswitch_callback_set = 0; /* should emit tracepoint on context switch */ -static int kdebug_cswitch = 0; -/* the action ID to trigger on context switches */ -static int kperf_cswitch_action = 0; - -/* indirect hooks to play nice with CHUD for the transition to kperf */ -kern_return_t chudxnu_kdebug_callback_enter(chudxnu_kdebug_callback_func_t fn); -kern_return_t chudxnu_kdebug_callback_cancel(void); +int kperf_kdebug_cswitch = 0; -/* Do the real work! */ -/* this can be called in any context ... right? */ static kern_return_t kperf_sample_internal(struct kperf_sample *sbuf, struct kperf_context *context, unsigned sample_what, unsigned sample_flags, - unsigned actionid) + unsigned actionid, uint32_t ucallstack_depth) { - boolean_t enabled; - int did_ucallstack = 0, did_tinfo_extra = 0; - uint32_t userdata; + int pended_ucallstack = 0; + int pended_th_dispatch = 0; /* not much point continuing here, but what to do ? return * Shutdown? cut a tracepoint and continue? 
@@ -105,11 +87,27 @@ kperf_sample_internal(struct kperf_sample *sbuf, return SAMPLE_CONTINUE; } - int is_kernel = (context->cur_pid == 0); + /* callstacks should be explicitly ignored */ + if (sample_flags & SAMPLE_FLAG_EMPTY_CALLSTACK) { + sample_what &= ~(SAMPLER_KSTACK | SAMPLER_USTACK); + } + + context->cur_thread->kperf_pet_gen = kperf_pet_gen; + boolean_t is_kernel = (context->cur_pid == 0); + + if (actionid && actionid <= actionc) { + sbuf->kcallstack.nframes = actionv[actionid - 1].kcallstack_depth; + } else { + sbuf->kcallstack.nframes = MAX_CALLSTACK_FRAMES; + } + + if (ucallstack_depth) { + sbuf->ucallstack.nframes = ucallstack_depth; + } else { + sbuf->ucallstack.nframes = MAX_CALLSTACK_FRAMES; + } - sbuf->kcallstack.nframes = 0; sbuf->kcallstack.flags = CALLSTACK_VALID; - sbuf->ucallstack.nframes = 0; sbuf->ucallstack.flags = CALLSTACK_VALID; /* an event occurred. Sample everything and dump it in a @@ -117,19 +115,35 @@ kperf_sample_internal(struct kperf_sample *sbuf, */ /* collect data from samplers */ - if (sample_what & SAMPLER_TINFO) { - kperf_threadinfo_sample(&sbuf->threadinfo, context); + if (sample_what & SAMPLER_TH_INFO) { + kperf_thread_info_sample(&sbuf->th_info, context); /* See if we should drop idle thread samples */ if (!(sample_flags & SAMPLE_FLAG_IDLE_THREADS)) { - if (sbuf->threadinfo.runmode & 0x40) { + if (sbuf->th_info.kpthi_runmode & 0x40) { return SAMPLE_CONTINUE; } } } - if ((sample_what & SAMPLER_KSTACK) && !(sample_flags & SAMPLE_FLAG_EMPTY_CALLSTACK)) { - kperf_kcallstack_sample(&(sbuf->kcallstack), context); + if (sample_what & SAMPLER_TH_SNAPSHOT) { + kperf_thread_snapshot_sample(&(sbuf->th_snapshot), context); + } + if (sample_what & SAMPLER_TH_SCHEDULING) { + kperf_thread_scheduling_sample(&(sbuf->th_scheduling), context); + } + if (sample_what & SAMPLER_KSTACK) { + if (sample_flags & SAMPLE_FLAG_CONTINUATION) { + kperf_continuation_sample(&(sbuf->kcallstack), context); + /* outside of interrupt context, backtrace the 
current thread */ + } else if (sample_flags & SAMPLE_FLAG_NON_INTERRUPT) { + kperf_backtrace_sample(&(sbuf->kcallstack), context); + } else { + kperf_kcallstack_sample(&(sbuf->kcallstack), context); + } + } + if (sample_what & SAMPLER_TK_SNAPSHOT) { + kperf_task_snapshot_sample(&(sbuf->tk_snapshot), context); } /* sensitive ones */ @@ -139,25 +153,20 @@ kperf_sample_internal(struct kperf_sample *sbuf, } if (sample_flags & SAMPLE_FLAG_PEND_USER) { - if ((sample_what & SAMPLER_USTACK) - && !(sample_flags & SAMPLE_FLAG_EMPTY_CALLSTACK)) - { - did_ucallstack = kperf_ucallstack_pend(context); + if (sample_what & SAMPLER_USTACK) { + pended_ucallstack = kperf_ucallstack_pend(context, sbuf->ucallstack.nframes); } - if (sample_what & SAMPLER_TINFOEX) { - did_tinfo_extra = kperf_threadinfo_extra_pend(context); + if (sample_what & SAMPLER_TH_DISPATCH) { + pended_th_dispatch = kperf_thread_dispatch_pend(context); } } else { - if ((sample_what & SAMPLER_USTACK) - && !(sample_flags & SAMPLE_FLAG_EMPTY_CALLSTACK)) - { + if (sample_what & SAMPLER_USTACK) { kperf_ucallstack_sample(&(sbuf->ucallstack), context); } - if (sample_what & SAMPLER_TINFOEX) { - kperf_threadinfo_extra_sample(&(sbuf->tinfo_ex), - context); + if (sample_what & SAMPLER_TH_DISPATCH) { + kperf_thread_dispatch_sample(&(sbuf->th_dispatch), context); } } } @@ -169,28 +178,42 @@ kperf_sample_internal(struct kperf_sample *sbuf, } /* lookup the user tag, if any */ + uint32_t userdata; if (actionid && (actionid <= actionc)) { userdata = actionv[actionid - 1].userdata; } else { userdata = actionid; } + /* avoid logging if this sample only pended samples */ + if (sample_flags & SAMPLE_FLAG_PEND_USER && + !(sample_what & ~(SAMPLER_USTACK | SAMPLER_TH_DISPATCH))) + { + return SAMPLE_CONTINUE; + } + /* stash the data into the buffer * interrupts off to ensure we don't get split */ - enabled = ml_set_interrupts_enabled(FALSE); + boolean_t enabled = ml_set_interrupts_enabled(FALSE); BUF_DATA(PERF_GEN_EVENT | DBG_FUNC_START, 
sample_what, actionid, userdata, sample_flags); - /* dump threadinfo */ - if (sample_what & SAMPLER_TINFO) { - kperf_threadinfo_log( &sbuf->threadinfo ); + if (sample_what & SAMPLER_TH_INFO) { + kperf_thread_info_log(&sbuf->th_info); + } + if (sample_what & SAMPLER_TH_SCHEDULING) { + kperf_thread_scheduling_log(&(sbuf->th_scheduling)); + } + if (sample_what & SAMPLER_TH_SNAPSHOT) { + kperf_thread_snapshot_log(&(sbuf->th_snapshot)); } - - /* dump kcallstack */ if (sample_what & SAMPLER_KSTACK) { - kperf_kcallstack_log( &sbuf->kcallstack ); + kperf_kcallstack_log(&sbuf->kcallstack); + } + if (sample_what & SAMPLER_TK_SNAPSHOT) { + kperf_task_snapshot_log(&(sbuf->tk_snapshot)); } /* dump user stuff */ @@ -201,20 +224,20 @@ kperf_sample_internal(struct kperf_sample *sbuf, } if (sample_flags & SAMPLE_FLAG_PEND_USER) { - if (did_ucallstack) { - BUF_INFO1(PERF_CS_UPEND, 0); + if (pended_ucallstack) { + BUF_INFO(PERF_CS_UPEND); } - if (did_tinfo_extra) { - BUF_INFO1(PERF_TI_XPEND, 0); + if (pended_th_dispatch) { + BUF_INFO(PERF_TI_DISPPEND); } } else { if (sample_what & SAMPLER_USTACK) { kperf_ucallstack_log(&(sbuf->ucallstack)); } - if (sample_what & SAMPLER_TINFOEX) { - kperf_threadinfo_extra_log(&(sbuf->tinfo_ex)); + if (sample_what & SAMPLER_TH_DISPATCH) { + kperf_thread_dispatch_log(&(sbuf->th_dispatch)); } } } @@ -225,7 +248,7 @@ kperf_sample_internal(struct kperf_sample *sbuf, kperf_kpc_cpu_log(&(sbuf->kpcdata)); } - BUF_DATA1(PERF_GEN_EVENT | DBG_FUNC_END, sample_what); + BUF_DATA(PERF_GEN_EVENT | DBG_FUNC_END, sample_what); /* intrs back on */ ml_set_interrupts_enabled(enabled); @@ -239,9 +262,6 @@ kperf_sample(struct kperf_sample *sbuf, struct kperf_context *context, unsigned actionid, unsigned sample_flags) { - unsigned sample_what = 0; - int pid_filter; - /* work out what to sample, if anything */ if ((actionid > actionc) || (actionid == 0)) { return SAMPLE_SHUTDOWN; @@ -250,337 +270,147 @@ kperf_sample(struct kperf_sample *sbuf, /* check the pid filter against 
the context's current pid. * filter pid == -1 means any pid */ - pid_filter = actionv[actionid - 1].pid_filter; + int pid_filter = actionv[actionid - 1].pid_filter; if ((pid_filter != -1) && (pid_filter != context->cur_pid)) { return SAMPLE_CONTINUE; } /* the samplers to run */ - sample_what = actionv[actionid - 1].sample; + unsigned int sample_what = actionv[actionid - 1].sample; /* do the actual sample operation */ return kperf_sample_internal(sbuf, context, sample_what, - sample_flags, actionid); + sample_flags, actionid, + actionv[actionid - 1].ucallstack_depth); } -/* ast callback on a thread */ void -kperf_thread_ast_handler(thread_t thread) +kperf_kdebug_handler(uint32_t debugid, uintptr_t *starting_fp) { - int r; - uint32_t t_chud; - unsigned sample_what = 0; - /* we know we're on a thread, so let's do stuff */ - task_t task = NULL; - - BUF_INFO1(PERF_AST_HNDLR | DBG_FUNC_START, thread); - - /* use ~2kb of the stack for the sample, should be ok since we're in the ast */ - struct kperf_sample sbuf; - memset(&sbuf, 0, sizeof(struct kperf_sample)); - - /* make a context, take a sample */ + uint32_t sample_flags = SAMPLE_FLAG_PEND_USER; struct kperf_context ctx; - ctx.cur_thread = thread; - ctx.cur_pid = -1; + struct kperf_sample *sample = NULL; + kern_return_t kr = KERN_SUCCESS; + int s; - task = chudxnu_task_for_thread(thread); - if (task) { - ctx.cur_pid = chudxnu_pid_for_task(task); + if (!kperf_kdebug_should_trigger(debugid)) { + return; } - /* decode the chud bits so we know what to sample */ - t_chud = kperf_get_thread_bits(thread); + BUF_VERB(PERF_KDBG_HNDLR | DBG_FUNC_START, debugid); - if (t_chud & T_AST_NAME) { - sample_what |= SAMPLER_TINFOEX; - } + ctx.cur_thread = current_thread(); + ctx.cur_pid = task_pid(get_threadtask(ctx.cur_thread)); + ctx.trigger_type = TRIGGER_TYPE_KDEBUG; + ctx.trigger_id = 0; - if (t_chud & T_AST_CALLSTACK) { - sample_what |= SAMPLER_USTACK; - sample_what |= SAMPLER_TINFO; - } + s = ml_set_interrupts_enabled(0); - /* do 
the sample, just of the user stuff */ - r = kperf_sample_internal(&sbuf, &ctx, sample_what, 0, 0); + sample = kperf_intr_sample_buffer(); - BUF_INFO1(PERF_AST_HNDLR | DBG_FUNC_END, r); -} - -/* register AST bits */ -int -kperf_ast_pend(thread_t cur_thread, uint32_t check_bits, - uint32_t set_bits) -{ - /* pend on the thread */ - uint32_t t_chud, set_done = 0; - - /* can only pend on the current thread */ - if (cur_thread != chudxnu_current_thread()) { - panic("pending to non-current thread"); + if (!ml_at_interrupt_context()) { + sample_flags |= SAMPLE_FLAG_NON_INTERRUPT; + ctx.starting_fp = starting_fp; } - /* get our current bits */ - t_chud = kperf_get_thread_bits(cur_thread); - - /* see if it's already been done or pended */ - if (!(t_chud & check_bits)) { - /* set the bit on the thread */ - t_chud |= set_bits; - kperf_set_thread_bits(cur_thread, t_chud); - - /* set the actual AST */ - kperf_set_thread_ast(cur_thread); - - set_done = 1; - } + kr = kperf_sample(sample, &ctx, kperf_kdebug_get_action(), sample_flags); - return set_done; + ml_set_interrupts_enabled(s); + BUF_VERB(PERF_KDBG_HNDLR | DBG_FUNC_END, kr); } /* - * kdebug callback & stack management + * This function allocates >2.3KB of the stack. Prevent the compiler from + * inlining this function into ast_taken and ensure the stack memory is only + * allocated for the kperf AST. 
*/ - -#define IS_END(debugid) ((debugid & 3) == DBG_FUNC_END) -#define IS_MIG(debugid) (IS_END(debugid) && ((debugid & 0xff000000U) == KDBG_CLASS_ENCODE((unsigned)DBG_MIG, 0U))) -#define IS_MACH_SYSCALL(debugid) (IS_END(debugid) && (KDBG_CLASS_DECODE(debugid) == KDBG_CLASS_ENCODE(DBG_MACH, DBG_MACH_EXCP_SC))) -#define IS_VM_FAULT(debugid) (IS_END(debugid) && (KDBG_CLASS_DECODE(debugid) == KDBG_CLASS_ENCODE(DBG_MACH, DBG_MACH_VM))) -#define IS_BSD_SYSCTLL(debugid) (IS_END(debugid) && (KDBG_CLASS_DECODE(debugid) == KDBG_CLASS_ENCODE(DBG_BSD, DBG_BSD_EXCP_SC))) -#define IS_APPS_SIGNPOST(debugid) (KDBG_CLASS_DECODE(debugid) == KDBG_CLASS_ENCODE(DBG_APPS, DBG_MACH_CHUD)) -#define IS_MACH_SIGNPOST(debugid) (KDBG_CLASS_DECODE(debugid) == KDBG_CLASS_ENCODE(DBG_MACH, DBG_MACH_CHUD)) -#define IS_ENERGYTRACE(debugid) ((debugid & 0xff000000U) == KDBG_CLASS_ENCODE((unsigned)DBG_ENERGYTRACE, 0U)) - +__attribute__((noinline)) void -kperf_kdebug_callback(uint32_t debugid) +kperf_thread_ast_handler(thread_t thread) { - int cur_pid = 0; - task_t task = NULL; + BUF_INFO(PERF_AST_HNDLR | DBG_FUNC_START, thread, kperf_get_thread_flags(thread)); - if (!kdebug_callstacks && !kperf_signpost_action) { - return; - } - - /* if we're looking at a kperf tracepoint, don't recurse */ - if ((debugid & 0xff000000) == KDBG_CLASS_ENCODE(DBG_PERF, 0)) { - return; - } - - /* ensure interrupts are already off thanks to kdebug */ - if (ml_get_interrupts_enabled()) { - return; - } + /* ~2KB of the stack for the sample since this is called from AST */ + struct kperf_sample sbuf; + memset(&sbuf, 0, sizeof(struct kperf_sample)); - /* make sure we're not being called recursively. 
*/ -#if NOTYET - if (kperf_kdbg_recurse(KPERF_RECURSE_IN)) { - return; - } -#endif + task_t task = get_threadtask(thread); - /* check the happy list of trace codes */ - if(!(IS_MIG(debugid) - || IS_MACH_SYSCALL(debugid) - || IS_VM_FAULT(debugid) - || IS_BSD_SYSCTLL(debugid) - || IS_MACH_SIGNPOST(debugid) - || IS_ENERGYTRACE(debugid) - || IS_APPS_SIGNPOST(debugid))) - { - return; - } - - /* check for kernel */ - thread_t thread = chudxnu_current_thread(); - task = chudxnu_task_for_thread(thread); - if (task) { - cur_pid = chudxnu_pid_for_task(task); - } - if (!cur_pid) { - return; - } + /* make a context, take a sample */ + struct kperf_context ctx; + ctx.cur_thread = thread; + ctx.cur_pid = task_pid(task); - if (kdebug_callstacks) { - /* dicing with death */ - BUF_INFO2(PERF_KDBG_HNDLR, debugid, cur_pid); + /* decode the flags to determine what to sample */ + unsigned int sample_what = 0; + uint32_t flags = kperf_get_thread_flags(thread); - /* pend the AST */ - kperf_ast_pend( thread, T_AST_CALLSTACK, T_AST_CALLSTACK ); + if (flags & T_KPERF_AST_DISPATCH) { + sample_what |= SAMPLER_TH_DISPATCH; } - - if (kperf_signpost_action && (IS_MACH_SIGNPOST(debugid) - || IS_APPS_SIGNPOST(debugid))) - { -#if NOTYET - /* make sure we're not being called recursively. */ - if(kperf_kdbg_recurse(KPERF_RECURSE_IN)) { - return; - } -#endif - - /* setup a context */ - struct kperf_context ctx; - struct kperf_sample *intbuf = NULL; - BUF_INFO2(PERF_SIGNPOST_HNDLR | DBG_FUNC_START, debugid, cur_pid); - - ctx.cur_thread = thread; - ctx.cur_pid = cur_pid; - ctx.trigger_type = TRIGGER_TYPE_TRACE; - ctx.trigger_id = 0; - - /* CPU sample buffer -- only valid with interrupts off (above) - * Technically this isn't true -- tracepoints can, and often - * are, cut from interrupt handlers, but none of those tracepoints - * should make it this far. 
- */ - intbuf = kperf_intr_sample_buffer(); - - /* do the sample */ - kperf_sample(intbuf, &ctx, kperf_signpost_action, - SAMPLE_FLAG_PEND_USER); - - BUF_INFO2(PERF_SIGNPOST_HNDLR | DBG_FUNC_END, debugid, cur_pid); -#if NOTYET - /* no longer recursive */ - kperf_kdbg_recurse(KPERF_RECURSE_OUT); -#endif + if (flags & T_KPERF_AST_CALLSTACK) { + sample_what |= SAMPLER_USTACK; + sample_what |= SAMPLER_TH_INFO; } -} -static void -kperf_kdbg_callback_update(void) -{ - unsigned old_callback_set = kperf_kdbg_callback_set; + uint32_t ucallstack_depth = T_KPERF_GET_CALLSTACK_DEPTH(flags); - /* compute new callback state */ - kperf_kdbg_callback_set = kdebug_callstacks || kperf_signpost_action; + int r = kperf_sample_internal(&sbuf, &ctx, sample_what, 0, 0, ucallstack_depth); - if (old_callback_set && !kperf_kdbg_callback_set) { - /* callback should no longer be set */ - chudxnu_kdebug_callback_cancel(); - } else if (!old_callback_set && kperf_kdbg_callback_set) { - /* callback must now be set */ - chudxnu_kdebug_callback_enter(NULL); - } -} - -int -kperf_kdbg_get_stacks(void) -{ - return kdebug_callstacks; + BUF_INFO(PERF_AST_HNDLR | DBG_FUNC_END, r); } +/* register AST bits */ int -kperf_kdbg_set_stacks(int newval) +kperf_ast_pend(thread_t thread, uint32_t set_flags) { - kdebug_callstacks = newval; - kperf_kdbg_callback_update(); + /* can only pend on the current thread */ + if (thread != current_thread()) { + panic("pending to non-current thread"); + } - return 0; -} + /* get our current bits */ + uint32_t flags = kperf_get_thread_flags(thread); -int -kperf_signpost_action_get(void) -{ - return kperf_signpost_action; -} + /* see if it's already been done or pended */ + if (!(flags & set_flags)) { + /* set the bit on the thread */ + flags |= set_flags; + kperf_set_thread_flags(thread, flags); -int -kperf_signpost_action_set(int newval) -{ - kperf_signpost_action = newval; - kperf_kdbg_callback_update(); + /* set the actual AST */ + act_set_kperf(thread); + return 1; + } 
return 0; } -/* - * Thread switch - */ - -/* called from context switch handler */ void -kperf_switch_context(__unused thread_t old, thread_t new) +kperf_ast_set_callstack_depth(thread_t thread, uint32_t depth) { - task_t task = get_threadtask(new); - int pid = chudxnu_pid_for_task(task); + uint32_t ast_flags = kperf_get_thread_flags(thread); + uint32_t existing_callstack_depth = T_KPERF_GET_CALLSTACK_DEPTH(ast_flags); - /* cut a tracepoint to tell us what the new thread's PID is - * for Instruments - */ - BUF_DATA2(PERF_TI_CSWITCH, thread_tid(new), pid); - - /* trigger action after counters have been updated */ - if (kperf_cswitch_action) { - struct kperf_sample sbuf; - struct kperf_context ctx; - int r; + if (existing_callstack_depth != depth) { + ast_flags &= ~T_KPERF_SET_CALLSTACK_DEPTH(depth); + ast_flags |= T_KPERF_SET_CALLSTACK_DEPTH(depth); - BUF_DATA1(PERF_CSWITCH_HNDLR | DBG_FUNC_START, 0); - - ctx.cur_pid = 0; - ctx.cur_thread = old; - - /* get PID for context */ - task_t old_task = chudxnu_task_for_thread(ctx.cur_thread); - if (old_task) { - ctx.cur_pid = chudxnu_pid_for_task(old_task); - } - - ctx.trigger_type = TRIGGER_TYPE_CSWITCH; - ctx.trigger_id = 0; - - r = kperf_sample(&sbuf, &ctx, kperf_cswitch_action, - SAMPLE_FLAG_PEND_USER); - - BUF_INFO1(PERF_CSWITCH_HNDLR | DBG_FUNC_END, r); + kperf_set_thread_flags(thread, ast_flags); } } -static void -kperf_cswitch_callback_update(void) -{ - unsigned old_callback_set = kperf_cswitch_callback_set; - - unsigned new_callback_set = kdebug_cswitch || kperf_cswitch_action; - - if (old_callback_set && !new_callback_set) { - kperf_cswitch_callback_set = 0; - } else if (!old_callback_set && new_callback_set) { - kperf_cswitch_callback_set = 1; - } else { - return; - } - - kperf_kpc_cswitch_callback_update(); -} - int kperf_kdbg_cswitch_get(void) { - return kdebug_cswitch; + return kperf_kdebug_cswitch; } int kperf_kdbg_cswitch_set(int newval) { - kdebug_cswitch = newval; - kperf_cswitch_callback_update(); - - 
return 0; -} - -int -kperf_cswitch_action_get(void) -{ - return kperf_cswitch_action; -} - -int -kperf_cswitch_action_set(int newval) -{ - kperf_cswitch_action = newval; - kperf_cswitch_callback_update(); + kperf_kdebug_cswitch = newval; + kperf_on_cpu_update(); return 0; } @@ -588,7 +418,7 @@ kperf_cswitch_action_set(int newval) /* * Action configuration */ -unsigned +unsigned int kperf_action_get_count(void) { return actionc; @@ -684,11 +514,23 @@ kperf_action_get_filter(unsigned actionid, int *pid_out) return 0; } +void +kperf_action_reset(void) +{ + for (unsigned int i = 0; i < actionc; i++) { + kperf_action_set_samplers(i + 1, 0); + kperf_action_set_userdata(i + 1, 0); + kperf_action_set_filter(i + 1, -1); + kperf_action_set_ucallstack_depth(i + 1, MAX_CALLSTACK_FRAMES); + kperf_action_set_kcallstack_depth(i + 1, MAX_CALLSTACK_FRAMES); + } +} + int kperf_action_set_count(unsigned count) { struct action *new_actionv = NULL, *old_actionv = NULL; - unsigned old_count, i; + unsigned old_count; /* easy no-op */ if (count == actionc) { @@ -710,15 +552,13 @@ kperf_action_set_count(unsigned count) */ if (actionc == 0) { int r; - r = kperf_init(); - - if (r != 0) { + if ((r = kperf_init())) { return r; } } /* create a new array */ - new_actionv = kalloc(count * sizeof(*new_actionv)); + new_actionv = kalloc_tag(count * sizeof(*new_actionv), VM_KERN_MEMORY_DIAG); if (new_actionv == NULL) { return ENOMEM; } @@ -732,8 +572,10 @@ kperf_action_set_count(unsigned count) memset(&(new_actionv[actionc]), 0, (count - old_count) * sizeof(*actionv)); - for (i = old_count; i < count; i++) { + for (unsigned int i = old_count; i < count; i++) { new_actionv[i].pid_filter = -1; + new_actionv[i].ucallstack_depth = MAX_CALLSTACK_FRAMES; + new_actionv[i].kcallstack_depth = MAX_CALLSTACK_FRAMES; } actionv = new_actionv; @@ -745,3 +587,71 @@ kperf_action_set_count(unsigned count) return 0; } + +int +kperf_action_set_ucallstack_depth(unsigned action_id, uint32_t depth) +{ + if ((action_id > 
actionc) || (action_id == 0)) { + return EINVAL; + } + + if (depth > MAX_CALLSTACK_FRAMES) { + return EINVAL; + } + + actionv[action_id - 1].ucallstack_depth = depth; + + return 0; +} + +int +kperf_action_set_kcallstack_depth(unsigned action_id, uint32_t depth) +{ + if ((action_id > actionc) || (action_id == 0)) { + return EINVAL; + } + + if (depth > MAX_CALLSTACK_FRAMES) { + return EINVAL; + } + + actionv[action_id - 1].kcallstack_depth = depth; + + return 0; +} + +int +kperf_action_get_ucallstack_depth(unsigned action_id, uint32_t * depth_out) +{ + if ((action_id > actionc)) { + return EINVAL; + } + + assert(depth_out); + + if (action_id == 0) { + *depth_out = MAX_CALLSTACK_FRAMES; + } else { + *depth_out = actionv[action_id - 1].ucallstack_depth; + } + + return 0; +} + +int +kperf_action_get_kcallstack_depth(unsigned action_id, uint32_t * depth_out) +{ + if ((action_id > actionc)) { + return EINVAL; + } + + assert(depth_out); + + if (action_id == 0) { + *depth_out = MAX_CALLSTACK_FRAMES; + } else { + *depth_out = actionv[action_id - 1].kcallstack_depth; + } + + return 0; +} diff --git a/osfmk/kperf/action.h b/osfmk/kperf/action.h index 01f103f5c..1233da6d0 100644 --- a/osfmk/kperf/action.h +++ b/osfmk/kperf/action.h @@ -1,8 +1,8 @@ /* - * Copyright (c) 2011 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2016 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. 
- * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,34 +22,50 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ +#ifndef KPERF_ACTION_H +#define KPERF_ACTION_H + +#include + /* fwd decl */ struct kperf_sample; struct kperf_context; /* bits for defining what to do on an action */ -#define SAMPLER_TINFO (1<<0) -#define SAMPLER_TINFOEX (1<<1) -#define SAMPLER_KSTACK (1<<2) -#define SAMPLER_USTACK (1<<3) -#define SAMPLER_PMC_THREAD (1<<4) -#define SAMPLER_PMC_CPU (1<<5) -#define SAMPLER_PMC_CONFIG (1<<6) -#define SAMPLER_MEMINFO (1<<7) +#define SAMPLER_TH_INFO (1U << 0) +#define SAMPLER_TH_SNAPSHOT (1U << 1) +#define SAMPLER_KSTACK (1U << 2) +#define SAMPLER_USTACK (1U << 3) +#define SAMPLER_PMC_THREAD (1U << 4) +#define SAMPLER_PMC_CPU (1U << 5) +#define SAMPLER_PMC_CONFIG (1U << 6) +#define SAMPLER_MEMINFO (1U << 7) +#define SAMPLER_TH_SCHEDULING (1U << 8) +#define SAMPLER_TH_DISPATCH (1U << 9) +#define SAMPLER_TK_SNAPSHOT (1U << 10) + +/* flags for sample calls */ -/* flags for sample calls*/ -#define SAMPLE_FLAG_PEND_USER (1<<0) -#define SAMPLE_FLAG_IDLE_THREADS (1<<1) -#define SAMPLE_FLAG_EMPTY_CALLSTACK (1<<2) +/* pend certain samplers until AST boundary, instead of sampling them */ +#define SAMPLE_FLAG_PEND_USER (1U << 0) +/* sample idle threads */ +#define SAMPLE_FLAG_IDLE_THREADS (1U << 1) +/* do not sample callstacks */ +#define SAMPLE_FLAG_EMPTY_CALLSTACK (1U << 2) +/* use the continuation as a kernel backtrace */ +#define SAMPLE_FLAG_CONTINUATION (1U << 3) +/* sample is occurring 
outside of interrupt context */ +#define SAMPLE_FLAG_NON_INTERRUPT (1U << 4) /* Take a sample into "sbuf" using current thread "cur_thread" */ -extern kern_return_t kperf_sample(struct kperf_sample *sbuf, - struct kperf_context*, - unsigned actionid, - unsigned sample_flags); +kern_return_t kperf_sample(struct kperf_sample *sbuf, + struct kperf_context *ctx, + unsigned actionid, + unsigned sample_flags); /* return codes from taking a sample * either keep trigger, or something went wrong (or we're shutting down) @@ -59,26 +75,24 @@ extern kern_return_t kperf_sample(struct kperf_sample *sbuf, #define SAMPLE_SHUTDOWN (1) #define SAMPLE_OFF (2) -/* Get the sample buffer to use from interrupt handler context. Only - * valid in interrupt contexts. - */ -extern struct kperf_sample* kperf_intr_sample_buffer(void); +void kperf_action_reset(void); /* Interface functions */ -extern unsigned kperf_action_get_count(void); -extern int kperf_action_set_count(unsigned count); +unsigned kperf_action_get_count(void); +int kperf_action_set_count(unsigned count); + +int kperf_action_set_samplers(unsigned int actionid, uint32_t samplers); +int kperf_action_get_samplers(unsigned int actionid, uint32_t *samplers_out); + +int kperf_action_set_userdata(unsigned int actionid, uint32_t userdata); +int kperf_action_get_userdata(unsigned int actionid, uint32_t *userdata_out); -extern int kperf_action_set_samplers(unsigned actionid, - uint32_t samplers); -extern int kperf_action_get_samplers(unsigned actionid, - uint32_t *samplers_out); +int kperf_action_set_ucallstack_depth(unsigned int actionid, uint32_t depth); +int kperf_action_get_ucallstack_depth(unsigned int actionid, uint32_t * depth_out); +int kperf_action_set_kcallstack_depth(unsigned int actionid, uint32_t depth); +int kperf_action_get_kcallstack_depth(unsigned int actionid, uint32_t * depth_out); -extern int kperf_action_set_userdata(unsigned actionid, - uint32_t userdata); -extern int kperf_action_get_userdata(unsigned actionid, - 
uint32_t *userdata_out); +int kperf_action_set_filter(unsigned int actionid, int pid); +int kperf_action_get_filter(unsigned int actionid, int *pid_out); -extern int kperf_action_set_filter(unsigned actionid, - int pid); -extern int kperf_action_get_filter(unsigned actionid, - int *pid_out); +#endif /* !defined(KPERF_ACTION_H) */ diff --git a/osfmk/kperf/ast.h b/osfmk/kperf/ast.h index 897d549c4..65d5044bc 100644 --- a/osfmk/kperf/ast.h +++ b/osfmk/kperf/ast.h @@ -27,4 +27,5 @@ */ /* pend ast bits on a thread */ -extern int kperf_ast_pend( thread_t, uint32_t, uint32_t ); +extern int kperf_ast_pend(thread_t thread, uint32_t flags); +extern void kperf_ast_set_callstack_depth(thread_t thread, uint32_t depth); diff --git a/osfmk/kperf/buffer.h b/osfmk/kperf/buffer.h index b46d46cec..31aee62ce 100644 --- a/osfmk/kperf/buffer.h +++ b/osfmk/kperf/buffer.h @@ -1,8 +1,8 @@ /* - * Copyright (c) 2011 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2011-2016 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
* Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ @@ -30,11 +30,11 @@ #include -/* KDEBUG codes */ +/* kdebug codes */ #define PERF_CODE(SubClass, code) KDBG_CODE(DBG_PERF, SubClass, code) /* broad sub-classes */ -#define PERF_GENERIC (0) +#define PERF_GENERIC (0) #define PERF_THREADINFO (1) #define PERF_CALLSTACK (2) #define PERF_TIMER (3) @@ -42,21 +42,38 @@ #define PERF_AST (5) #define PERF_KPC (6) #define PERF_KDBG (7) -#define PERF_CSWITCH (8) -#define PERF_SIGNPOST (9) +#define PERF_TASK (8) +/* 9 unused */ #define PERF_MEMINFO (10) +/* helpers for 32-bit */ +#define UPPER_32(U64) ((U64) >> 32) +#define LOWER_32(U64) ((U64) & (UINT32_MAX)) +#define ENCODE_UPPER_64(U32) (((uint64_t)(U32)) << 32) +#define ENCODE_LOWER_64(U32) (((uint64_t)(U32)) & (UINT32_MAX)) + /* sub-class codes */ #define PERF_GEN_CODE(code) PERF_CODE(PERF_GENERIC, code) #define PERF_GEN_EVENT PERF_GEN_CODE(0) -#define PERF_TI_CODE(code) PERF_CODE(PERF_THREADINFO, code) -#define PERF_TI_SAMPLE PERF_TI_CODE(0) -#define PERF_TI_DATA PERF_TI_CODE(1) -#define PERF_TI_XSAMPLE PERF_TI_CODE(2) -#define PERF_TI_XPEND PERF_TI_CODE(3) -#define PERF_TI_XDATA PERF_TI_CODE(4) -#define PERF_TI_CSWITCH PERF_TI_CODE(5) +#define PERF_TI_CODE(code) PERF_CODE(PERF_THREADINFO, code) +#define PERF_TI_SAMPLE PERF_TI_CODE(0) +#define PERF_TI_DATA PERF_TI_CODE(1) +#define PERF_TI_XSAMPLE PERF_TI_CODE(2) +#define PERF_TI_XPEND PERF_TI_CODE(3) +#define PERF_TI_XDATA PERF_TI_CODE(4) +#define PERF_TI_CSWITCH PERF_TI_CODE(5) +#define PERF_TI_SCHEDSAMPLE PERF_TI_CODE(6) +#define PERF_TI_SCHEDDATA PERF_TI_CODE(7) +#define PERF_TI_SNAPSAMPLE PERF_TI_CODE(8) +#define PERF_TI_SNAPDATA PERF_TI_CODE(9) +#define PERF_TI_DISPSAMPLE PERF_TI_CODE(10) +#define PERF_TI_DISPDATA PERF_TI_CODE(11) +#define PERF_TI_DISPPEND PERF_TI_CODE(12) +#define PERF_TI_SNAPDATA_32 PERF_TI_CODE(13) +#define PERF_TI_DISPDATA_32 PERF_TI_CODE(14) 
+#define PERF_TI_SCHEDDATA1_32 PERF_TI_CODE(15) +#define PERF_TI_SCHEDDATA2_32 PERF_TI_CODE(16) #define PERF_CS_CODE(code) PERF_CODE(PERF_CALLSTACK, code) #define PERF_CS_KSAMPLE PERF_CS_CODE(0) @@ -67,21 +84,25 @@ #define PERF_CS_KHDR PERF_CS_CODE(5) #define PERF_CS_UHDR PERF_CS_CODE(6) #define PERF_CS_ERROR PERF_CS_CODE(7) +#define PERF_CS_BACKTRACE PERF_CS_CODE(8) +#define PERF_CS_LOG PERF_CS_CODE(9) #define PERF_TM_CODE(code) PERF_CODE(PERF_TIMER, code) -#define PERF_TM_ASCHED PERF_TM_CODE(0) +#define PERF_TM_FIRE PERF_TM_CODE(0) #define PERF_TM_SCHED PERF_TM_CODE(1) #define PERF_TM_HNDLR PERF_TM_CODE(2) -#define PERF_PET_CODE(code) PERF_CODE(PERF_PET, code) -#define PERF_PET_THREAD PERF_PET_CODE(0) -#define PERF_PET_ERROR PERF_PET_CODE(1) -#define PERF_PET_RUN PERF_PET_CODE(2) -#define PERF_PET_PAUSE PERF_PET_CODE(3) -#define PERF_PET_IDLE PERF_PET_CODE(4) -#define PERF_PET_SAMPLE PERF_PET_CODE(5) -#define PERF_PET_SCHED PERF_PET_CODE(6) -#define PERF_PET_END PERF_PET_CODE(7) +#define PERF_PET_CODE(code) PERF_CODE(PERF_PET, code) +#define PERF_PET_THREAD PERF_PET_CODE(0) +#define PERF_PET_ERROR PERF_PET_CODE(1) +#define PERF_PET_RUN PERF_PET_CODE(2) +#define PERF_PET_PAUSE PERF_PET_CODE(3) +#define PERF_PET_IDLE PERF_PET_CODE(4) +#define PERF_PET_SAMPLE PERF_PET_CODE(5) +#define PERF_PET_SCHED PERF_PET_CODE(6) +#define PERF_PET_END PERF_PET_CODE(7) +#define PERF_PET_SAMPLE_TASK PERF_PET_CODE(8) +#define PERF_PET_SAMPLE_THREAD PERF_PET_CODE(9) #define PERF_AST_CODE(code) PERF_CODE(PERF_AST, code) #define PERF_AST_HNDLR PERF_AST_CODE(0) @@ -98,15 +119,17 @@ #define PERF_KPC_CFG_REG32 PERF_KPC_CODE(7) #define PERF_KPC_DATA_THREAD PERF_KPC_CODE(8) #define PERF_KPC_DATA_THREAD32 PERF_KPC_CODE(9) +#define PERF_KPC_CPU_SAMPLE PERF_KPC_CODE(10) +#define PERF_KPC_THREAD_SAMPLE PERF_KPC_CODE(11) #define PERF_KDBG_CODE(code) PERF_CODE(PERF_KDBG, code) #define PERF_KDBG_HNDLR PERF_KDBG_CODE(0) -#define PERF_CSWITCH_CODE(code) PERF_CODE(PERF_CSWITCH, code) -#define 
PERF_CSWITCH_HNDLR PERF_CSWITCH_CODE(0) - -#define PERF_SIGNPOST_CODE(code) PERF_CODE(PERF_SIGNPOST, code) -#define PERF_SIGNPOST_HNDLR PERF_SIGNPOST_CODE(0) +#define PERF_TK_CODE(code) PERF_CODE(PERF_TASK, code) +#define PERF_TK_SNAP_SAMPLE PERF_TK_CODE(0) +#define PERF_TK_SNAP_DATA PERF_TK_CODE(1) +#define PERF_TK_SNAP_DATA1_32 PERF_TK_CODE(2) +#define PERF_TK_SNAP_DATA2_32 PERF_TK_CODE(3) #define PERF_MI_CODE(code) PERF_CODE(PERF_MEMINFO, code) #define PERF_MI_SAMPLE PERF_MI_CODE(0) @@ -123,23 +146,59 @@ enum ERR_NOMEM, }; -/* level of trace debug */ +/* level of kperf's logging to kdebug */ #define KPERF_DEBUG_DATA 0 #define KPERF_DEBUG_INFO 1 #define KPERF_DEBUG_VERBOSE 2 extern int kperf_debug_level; -/* for logging information / debugging -- optional */ -#define BUF_INFO( id, a0, a1, a2, a3) if (kperf_debug_level >= KPERF_DEBUG_INFO) KERNEL_DEBUG_CONSTANT_IST(~KDEBUG_ENABLE_PPT, id,a0,a1,a2,a3,0) +/* BUF_DATA tracepoints are for logging actual kperf results. */ + +#define BUF_DATA_INT(EVENTID, A0, A1, A2, A3) KERNEL_DEBUG_CONSTANT_IST(~KDEBUG_ENABLE_PPT, EVENTID, A0, A1, A2, A3, 0) + +#define BUF_DATA(EVENTID, ...) BUF_DATA_(EVENTID, ## __VA_ARGS__, 4, 3, 2, 1, 0) +#define BUF_DATA_(EVENTID, A1, A2, A3, A4, N_ARGS, ...) BUF_DATA##N_ARGS(EVENTID, A1, A2, A3, A4) +#define BUF_DATA0(EVENTID, A1, A2, A3, A4) BUF_DATA_INT(EVENTID, 0, 0, 0, 0) +#define BUF_DATA1(EVENTID, A1, A2, A3, A4) BUF_DATA_INT(EVENTID, A1, 0, 0, 0) +#define BUF_DATA2(EVENTID, A1, A2, A3, A4) BUF_DATA_INT(EVENTID, A1, A2, 0, 0) +#define BUF_DATA3(EVENTID, A1, A2, A3, A4) BUF_DATA_INT(EVENTID, A1, A2, A3, 0) +#define BUF_DATA4(EVENTID, A1, A2, A3, A4) BUF_DATA_INT(EVENTID, A1, A2, A3, A4) + +/* + * BUF_INFO tracepoints are for logging debugging information relevant to + * testing kperf's internal functions. 
+ */ + +#if (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) +#define BUF_INFO_INT(EVENTID, A1, A2, A3, A4) if (__improbable(kperf_debug_level >= KPERF_DEBUG_INFO)) KERNEL_DEBUG_CONSTANT(EVENTID, A1, A2, A3, A4, 0) +#else +#define BUF_INFO_INT(EVENTID, A1, A2, A3, A4) do { (void)(EVENTID); (void)(A1); (void)(A2); (void)(A3); (void)(A4); } while ((0)) +#endif + +#define BUF_INFO(EVENTID, ...) BUF_INFO_(EVENTID, ## __VA_ARGS__, 4, 3, 2, 1, 0) +#define BUF_INFO_(EVENTID, A1, A2, A3, A4, N_ARGS, ...) BUF_INFO##N_ARGS(EVENTID, A1, A2, A3, A4) +#define BUF_INFO0(EVENTID, A1, A2, A3, A4) BUF_INFO_INT(EVENTID, 0, 0, 0, 0) +#define BUF_INFO1(EVENTID, A1, A2, A3, A4) BUF_INFO_INT(EVENTID, A1, 0, 0, 0) +#define BUF_INFO2(EVENTID, A1, A2, A3, A4) BUF_INFO_INT(EVENTID, A1, A2, 0, 0) +#define BUF_INFO3(EVENTID, A1, A2, A3, A4) BUF_INFO_INT(EVENTID, A1, A2, A3, 0) +#define BUF_INFO4(EVENTID, A1, A2, A3, A4) BUF_INFO_INT(EVENTID, A1, A2, A3, A4) + +/* + * BUF_VERB tracepoints are for logging precise details of kperf's + * internal functions, like timing information for samplers. + */ -#define BUF_INFO1( id, a0 ) BUF_INFO(id, a0, 0, 0, 0 ) -#define BUF_INFO2( id, a0, a1 ) BUF_INFO(id, a0, a1, 0, 0 ) -#define BUF_INFO3( id, a0, a1, a2 ) BUF_INFO(id, a0, a1, a2, 0 ) +#if (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) +#define BUF_VERB_INT(EVENTID, A1, A2, A3, A4) if (__improbable(kperf_debug_level >= KPERF_DEBUG_VERBOSE)) KERNEL_DEBUG_CONSTANT(EVENTID, A1, A2, A3, A4, 0) +#else +#define BUF_VERB_INT(EVENTID, A1, A2, A3, A4) do { (void)(EVENTID); (void)(A1); (void)(A2); (void)(A3); (void)(A4); } while ((0)) +#endif -/* for logging actual data -- never compiled out */ -#define BUF_DATA( id, a0, a1, a2, a3) KERNEL_DEBUG_CONSTANT_IST(~KDEBUG_ENABLE_PPT, id,a0,a1,a2,a3,0) +#define BUF_VERB(EVENTID, ...) BUF_VERB_(EVENTID, ## __VA_ARGS__, 4, 3, 2, 1, 0) +#define BUF_VERB_(EVENTID, A1, A2, A3, A4, N_ARGS, ...) 
BUF_VERB##N_ARGS(EVENTID, A1, A2, A3, A4) +#define BUF_VERB0(EVENTID, A1, A2, A3, A4) BUF_VERB_INT(EVENTID, 0, 0, 0, 0) +#define BUF_VERB1(EVENTID, A1, A2, A3, A4) BUF_VERB_INT(EVENTID, A1, 0, 0, 0) +#define BUF_VERB2(EVENTID, A1, A2, A3, A4) BUF_VERB_INT(EVENTID, A1, A2, 0, 0) +#define BUF_VERB3(EVENTID, A1, A2, A3, A4) BUF_VERB_INT(EVENTID, A1, A2, A3, 0) +#define BUF_VERB4(EVENTID, A1, A2, A3, A4) BUF_VERB_INT(EVENTID, A1, A2, A3, A4) -/* code neatness */ -#define BUF_DATA1( id, a0 ) BUF_DATA(id, a0, 0, 0, 0 ) -#define BUF_DATA2( id, a0, a1 ) BUF_DATA(id, a0, a1, 0, 0 ) -#define BUF_DATA3( id, a0, a1, a2 ) BUF_DATA(id, a0, a1, a2, 0 ) diff --git a/osfmk/kperf/callstack.c b/osfmk/kperf/callstack.c index 89bf40f72..b45c3f0e7 100644 --- a/osfmk/kperf/callstack.c +++ b/osfmk/kperf/callstack.c @@ -28,140 +28,345 @@ /* Collect kernel callstacks */ +#include #include -#include /* XXX: remove me */ #include - -#include - +#include +#include #include #include #include #include +#include + static void -callstack_sample( struct callstack *cs, - struct kperf_context *context, - uint32_t is_user ) +callstack_fixup_user(struct callstack *cs, thread_t thread) { - kern_return_t kr; - mach_msg_type_number_t nframes; /* WTF with the type? 
*/ - uint32_t code; + uint64_t fixup_val = 0; + assert(cs->nframes < MAX_CALLSTACK_FRAMES); + +#if defined(__x86_64__) + user_addr_t sp_user; + bool user_64; + x86_saved_state_t *state; - if( is_user ) - code = PERF_CS_USAMPLE; - else - code = PERF_CS_KSAMPLE; + state = get_user_regs(thread); + if (!state) { + goto out; + } - BUF_INFO1( code, (uintptr_t)thread_tid(context->cur_thread) ); + user_64 = is_saved_state64(state); + if (user_64) { + sp_user = saved_state64(state)->isf.rsp; + } else { + sp_user = saved_state32(state)->uesp; + } - /* fill out known flags */ - cs->flags = 0; - if( !is_user ) - { - cs->flags |= CALLSTACK_KERNEL; -#ifdef __LP64__ - cs->flags |= CALLSTACK_64BIT; + if (thread == current_thread()) { + (void)copyin(sp_user, (char *)&fixup_val, + user_64 ? sizeof(uint64_t) : sizeof(uint32_t)); + } else { + (void)vm_map_read_user(get_task_map(get_threadtask(thread)), sp_user, + &fixup_val, user_64 ? sizeof(uint64_t) : sizeof(uint32_t)); + } + +#else +#error "callstack_fixup_user: unsupported architecture" #endif + +out: + cs->frames[cs->nframes++] = fixup_val; +} + +#if defined(__x86_64__) + +__attribute__((used)) +static kern_return_t +interrupted_kernel_sp_value(uintptr_t *sp_val) +{ + x86_saved_state_t *state; + uintptr_t sp; + bool state_64; + uint64_t cs; + uintptr_t top, bottom; + + state = current_cpu_datap()->cpu_int_state; + if (!state) { + return KERN_FAILURE; } - else - { - /* FIXME: detect 32 vs 64-bit? 
*/ + + state_64 = is_saved_state64(state); + + if (state_64) { + cs = saved_state64(state)->isf.cs; + } else { + cs = saved_state32(state)->cs; + } + /* return early if interrupted a thread in user space */ + if ((cs & SEL_PL) == SEL_PL_U) { + return KERN_FAILURE; } - /* collect the callstack */ - nframes = MAX_CALLSTACK_FRAMES; - kr = chudxnu_thread_get_callstack64_kperf( context->cur_thread, - cs->frames, - &nframes, - is_user ); + if (state_64) { + sp = saved_state64(state)->isf.rsp; + } else { + sp = saved_state32(state)->uesp; + } - /* check for overflow */ - if( kr == KERN_SUCCESS ) - { - cs->flags |= CALLSTACK_VALID; - cs->nframes = nframes; + /* make sure the stack pointer is pointing somewhere in this stack */ + bottom = current_thread()->kernel_stack; + top = bottom + kernel_stack_size; + if (sp >= bottom && sp < top) { + return KERN_FAILURE; } - else if( kr == KERN_RESOURCE_SHORTAGE ) - { - /* FIXME: more here */ - cs->flags |= CALLSTACK_TRUNCATED; + + *sp_val = *(uintptr_t *)sp; + return KERN_SUCCESS; +} + +#else /* defined(__arm__) */ +#error "interrupted_kernel_{sp,lr}: unsupported architecture" +#endif /* !defined(__arm__) */ + + +static void +callstack_fixup_interrupted(struct callstack *cs) +{ + uintptr_t fixup_val = 0; + assert(cs->nframes < MAX_CALLSTACK_FRAMES); + + /* + * Only provide arbitrary data on development or debug kernels. + */ +#if DEVELOPMENT || DEBUG +#if defined(__x86_64__) + (void)interrupted_kernel_sp_value(&fixup_val); +#endif /* defined(__x86_64__) */ +#endif /* DEVELOPMENT || DEBUG */ + + cs->frames[cs->nframes++] = fixup_val ? 
+ VM_KERNEL_UNSLIDE_OR_PERM(fixup_val) : 0; +} + +void +kperf_continuation_sample(struct callstack *cs, struct kperf_context *context) +{ + thread_t thread; + + assert(cs != NULL); + assert(context != NULL); + + thread = context->cur_thread; + assert(thread != NULL); + assert(thread->continuation != NULL); + + cs->flags = CALLSTACK_CONTINUATION | CALLSTACK_VALID | CALLSTACK_KERNEL; +#ifdef __LP64__ + cs->flags |= CALLSTACK_64BIT; +#endif + + cs->nframes = 1; + cs->frames[0] = VM_KERNEL_UNSLIDE(thread->continuation); +} + +void +kperf_backtrace_sample(struct callstack *cs, struct kperf_context *context) +{ + assert(cs != NULL); + assert(context != NULL); + assert(context->cur_thread == current_thread()); + + cs->flags = CALLSTACK_KERNEL | CALLSTACK_KERNEL_WORDS; +#ifdef __LP64__ + cs->flags |= CALLSTACK_64BIT; +#endif + + BUF_VERB(PERF_CS_BACKTRACE | DBG_FUNC_START, 1); + + cs->nframes = backtrace_frame((uintptr_t *)&(cs->frames), cs->nframes - 1, + context->starting_fp); + if (cs->nframes > 0) { cs->flags |= CALLSTACK_VALID; - cs->nframes = nframes; + /* + * Fake the value pointed to by the stack pointer or the link + * register for symbolicators. 
+ */ + cs->frames[cs->nframes + 1] = 0; + cs->nframes += 1; } - else - { - BUF_INFO2(PERF_CS_ERROR, ERR_GETSTACK, kr); - cs->nframes = 0; + + BUF_VERB(PERF_CS_BACKTRACE | DBG_FUNC_END, cs->nframes); +} + +void +kperf_kcallstack_sample(struct callstack *cs, struct kperf_context *context) +{ + thread_t thread; + + assert(cs != NULL); + assert(context != NULL); + assert(cs->nframes <= MAX_CALLSTACK_FRAMES); + + thread = context->cur_thread; + assert(thread != NULL); + + BUF_INFO(PERF_CS_KSAMPLE | DBG_FUNC_START, (uintptr_t)thread_tid(thread), + cs->nframes); + + cs->flags = CALLSTACK_KERNEL; + +#ifdef __LP64__ + cs->flags |= CALLSTACK_64BIT; +#endif + + if (ml_at_interrupt_context()) { + assert(thread == current_thread()); + cs->flags |= CALLSTACK_KERNEL_WORDS; + cs->nframes = backtrace_interrupted((uintptr_t *)cs->frames, + cs->nframes - 1); + if (cs->nframes != 0) { + callstack_fixup_interrupted(cs); + } + } else { + /* + * Rely on legacy CHUD backtracer to backtrace kernel stacks on + * other threads. + */ + kern_return_t kr; + kr = chudxnu_thread_get_callstack64_kperf(thread, cs->frames, + &cs->nframes, FALSE); + if (kr == KERN_SUCCESS) { + cs->flags |= CALLSTACK_VALID; + } else if (kr == KERN_RESOURCE_SHORTAGE) { + cs->flags |= CALLSTACK_VALID; + cs->flags |= CALLSTACK_TRUNCATED; + } else { + cs->nframes = 0; + } } - if( cs->nframes > MAX_CALLSTACK_FRAMES ) - { - /* necessary? 
*/ - BUF_INFO1(PERF_CS_ERROR, ERR_FRAMES); - cs->nframes = 0; + if (cs->nframes == 0) { + BUF_INFO(PERF_CS_ERROR, ERR_GETSTACK); } + BUF_INFO(PERF_CS_KSAMPLE | DBG_FUNC_END, (uintptr_t)thread_tid(thread), cs->flags, cs->nframes); } void -kperf_kcallstack_sample( struct callstack *cs, struct kperf_context *context ) +kperf_ucallstack_sample(struct callstack *cs, struct kperf_context *context) { - callstack_sample( cs, context, 0 ); + thread_t thread; + bool user_64 = false; + int err; + + assert(cs != NULL); + assert(context != NULL); + assert(cs->nframes <= MAX_CALLSTACK_FRAMES); + assert(ml_get_interrupts_enabled() == TRUE); + + thread = context->cur_thread; + assert(thread != NULL); + + BUF_INFO(PERF_CS_USAMPLE | DBG_FUNC_START, (uintptr_t)thread_tid(thread), + cs->nframes); + + cs->flags = 0; + + err = backtrace_thread_user(thread, (uintptr_t *)cs->frames, + cs->nframes - 1, &cs->nframes, &user_64); + cs->flags |= CALLSTACK_KERNEL_WORDS; + if (user_64) { + cs->flags |= CALLSTACK_64BIT; + } + + if (!err || err == EFAULT) { + callstack_fixup_user(cs, thread); + cs->flags |= CALLSTACK_VALID; + } else { + cs->nframes = 0; + BUF_INFO(PERF_CS_ERROR, ERR_GETSTACK, err); + } + + BUF_INFO(PERF_CS_USAMPLE | DBG_FUNC_END, (uintptr_t)thread_tid(thread), + cs->flags, cs->nframes); } -void -kperf_ucallstack_sample( struct callstack *cs, struct kperf_context *context ) +static inline uintptr_t +scrub_kernel_frame(uintptr_t *bt, int n_frames, int frame) { - callstack_sample( cs, context, 1 ); + if (frame < n_frames) { + return VM_KERNEL_UNSLIDE(bt[frame]); + } else { + return 0; + } +} + +static inline uintptr_t +scrub_frame(uint64_t *bt, int n_frames, int frame) +{ + if (frame < n_frames) { + return (uintptr_t)(bt[frame]); + } else { + return 0; + } } static void -callstack_log( struct callstack *cs, uint32_t hcode, uint32_t dcode ) +callstack_log(struct callstack *cs, uint32_t hcode, uint32_t dcode) { - unsigned int i, j, n, of = 4; + BUF_VERB(PERF_CS_LOG | DBG_FUNC_START, 
cs->flags, cs->nframes); - /* Header on the stack */ - BUF_DATA2( hcode, cs->flags, cs->nframes ); + /* framing information for the stack */ + BUF_DATA(hcode, cs->flags, cs->nframes); - /* look for how many batches of 4 */ - n = cs->nframes / 4; - of = cs->nframes % 4; - if( of != 0 ) + /* how many batches of 4 */ + unsigned int n = cs->nframes / 4; + unsigned int ovf = cs->nframes % 4; + if (ovf != 0) { n++; + } - /* print all the stack data, and zero the overflow */ - for( i = 0; i < n; i++ ) - { -#define SCRUB_FRAME(x) (((x)nframes)?cs->frames[x]:0) - j = i * 4; - BUF_DATA ( dcode, - SCRUB_FRAME(j+0), - SCRUB_FRAME(j+1), - SCRUB_FRAME(j+2), - SCRUB_FRAME(j+3) ); -#undef SCRUB_FRAME + if (cs->flags & CALLSTACK_KERNEL_WORDS) { + for (unsigned int i = 0; i < n; i++) { + unsigned int j = i * 4; + BUF_DATA(dcode, + scrub_kernel_frame((uintptr_t *)cs->frames, cs->nframes, j + 0), + scrub_kernel_frame((uintptr_t *)cs->frames, cs->nframes, j + 1), + scrub_kernel_frame((uintptr_t *)cs->frames, cs->nframes, j + 2), + scrub_kernel_frame((uintptr_t *)cs->frames, cs->nframes, j + 3)); + } + } else { + for (unsigned int i = 0; i < n; i++) { + unsigned int j = i * 4; + BUF_DATA(dcode, + scrub_frame(cs->frames, cs->nframes, j + 0), + scrub_frame(cs->frames, cs->nframes, j + 1), + scrub_frame(cs->frames, cs->nframes, j + 2), + scrub_frame(cs->frames, cs->nframes, j + 3)); + } } + + BUF_VERB(PERF_CS_LOG | DBG_FUNC_END, cs->flags, cs->nframes); } void kperf_kcallstack_log( struct callstack *cs ) { - callstack_log( cs, PERF_CS_KHDR, PERF_CS_KDATA ); + callstack_log(cs, PERF_CS_KHDR, PERF_CS_KDATA); } void kperf_ucallstack_log( struct callstack *cs ) { - callstack_log( cs, PERF_CS_UHDR, PERF_CS_UDATA ); + callstack_log(cs, PERF_CS_UHDR, PERF_CS_UDATA); } int -kperf_ucallstack_pend( struct kperf_context * context ) +kperf_ucallstack_pend(struct kperf_context * context, uint32_t depth) { - return kperf_ast_pend( context->cur_thread, T_AST_CALLSTACK, - T_AST_CALLSTACK ); -} + int 
did_pend = kperf_ast_pend(context->cur_thread, T_KPERF_AST_CALLSTACK); + kperf_ast_set_callstack_depth(context->cur_thread, depth); -// kr = chudxnu_thread_get_callstack(context->generic->threadID, -// (uint32_t*)frames, &frameCount, !collectingSupervisorStack); + return did_pend; +} diff --git a/osfmk/kperf/callstack.h b/osfmk/kperf/callstack.h index 3bfd96422..aa2ec165e 100644 --- a/osfmk/kperf/callstack.h +++ b/osfmk/kperf/callstack.h @@ -2,7 +2,7 @@ * Copyright (c) 2011 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,23 +22,31 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. 
- * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -#ifndef __AP_CALLSTACK_H__ -#define __AP_CALLSTACK_H__ +#ifndef KPERF_CALLSTACK_H +#define KPERF_CALLSTACK_H #define MAX_CALLSTACK_FRAMES (128) -#define CALLSTACK_VALID (1<<0) -#define CALLSTACK_DEFERRED (1<<1) -#define CALLSTACK_64BIT (1<<2) -#define CALLSTACK_KERNEL (1<<3) -#define CALLSTACK_TRUNCATED (1<<4) +/* the callstack contains valid data */ +#define CALLSTACK_VALID (1U << 0) +/* the callstack has been deferred */ +#define CALLSTACK_DEFERRED (1U << 1) +/* the callstack is 64-bit */ +#define CALLSTACK_64BIT (1U << 2) +/* the callstack is from the kernel */ +#define CALLSTACK_KERNEL (1U << 3) +/* the callstack was cut off */ +#define CALLSTACK_TRUNCATED (1U << 4) +/* the callstack is only holding a continuation "frame" */ +#define CALLSTACK_CONTINUATION (1U << 5) +/* the frames field is filled with uintptr_t, not uint64_t */ +#define CALLSTACK_KERNEL_WORDS (1U << 6) -struct callstack -{ +struct callstack { uint32_t flags; uint32_t nframes; uint64_t frames[MAX_CALLSTACK_FRAMES]; @@ -46,12 +54,13 @@ struct callstack struct kperf_context; -extern void kperf_kcallstack_sample( struct callstack *cs, struct kperf_context * ); -extern void kperf_kcallstack_log( struct callstack *cs ); - -extern void kperf_ucallstack_sample( struct callstack *cs, struct kperf_context * ); -extern int kperf_ucallstack_pend( struct kperf_context * ); -extern void kperf_ucallstack_log( struct callstack *cs ); +void kperf_kcallstack_sample(struct callstack *cs, struct kperf_context *); +void kperf_kcallstack_log(struct callstack *cs); +void kperf_continuation_sample(struct callstack *cs, struct kperf_context *); +void kperf_backtrace_sample(struct callstack *cs, struct kperf_context *context); +void kperf_ucallstack_sample(struct callstack *cs, struct kperf_context *); +int kperf_ucallstack_pend(struct kperf_context *, uint32_t depth); +void kperf_ucallstack_log(struct callstack *cs); -#endif /* __AP_CALLSTACK_H__ */ +#endif /* 
!defined(KPERF_CALLSTACK_H) */ diff --git a/osfmk/kperf/context.h b/osfmk/kperf/context.h index f1e232aa1..14eadfe7a 100644 --- a/osfmk/kperf/context.h +++ b/osfmk/kperf/context.h @@ -2,7 +2,7 @@ * Copyright (c) 2011 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,20 +22,25 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ +#ifndef KPERF_CONTEXT_H +#define KPERF_CONTEXT_H + #include /* context of what we're looking at */ -struct kperf_context -{ +struct kperf_context { /* who was running during the event */ int cur_pid; thread_t cur_thread; + uintptr_t *starting_fp; /* who caused the event */ unsigned trigger_type; unsigned trigger_id; }; + +#endif /* !defined(KPERF_CONTEXT_H) */ diff --git a/osfmk/kperf/kdebug_trigger.c b/osfmk/kperf/kdebug_trigger.c new file mode 100644 index 000000000..7c343631b --- /dev/null +++ b/osfmk/kperf/kdebug_trigger.c @@ -0,0 +1,253 @@ +/* + * Copyright (c) 2016 Apple Computer, Inc. All rights reserved. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * kperf's kdebug trigger is a precise mechanism for taking samples of the + * thread tracing a kdebug event. + * + * The filter used by kperf differs from kdebug's typefilter. kperf's filter + * is small -- only around 140 bytes, as opposed to kdebug's 8KB filter. It + * can also target precise debug IDs, instead of only being able to specify + * an entire subclass in a kdebug typefilter. Function specifiers can be + * provided to match against along with a class or subclass. For instance, this + * allows the kperf filter to only trigger a sample if an ending syscall event + * (DBG_BSD, DBG_BSD_EXCP_SC) occurs. 
+ * + * The tradeoff for this flexibility is that only KPERF_KDEBUG_DEBUGIDS_MAX (32) + * classes, subclasses, or exact debug IDs can be filtered at one time. + * + * The filter consists of up to 32 debug IDs and an array of 2-bit type codes + * packed into a 64-bit value. To determine if a given debug ID should trigger + * a kperf sample, each debug ID is checked. The type code is unpacked from the + * 64-bit value to apply a mask to the debug ID. Then, a sample occurs if the + * masked debug ID is equal to the debug ID in the filter's list. + */ + +#include +#include +#include +#include +#include +#include +#include + +boolean_t kperf_kdebug_active = FALSE; +static void kperf_kdebug_update(void); + +static uint8_t kperf_kdebug_action = 0; + +static struct kperf_kdebug_filter { + uint64_t types[2]; + uint32_t debugids[KPERF_KDEBUG_DEBUGIDS_MAX]; + uint8_t n_debugids; +} __attribute__((packed)) *kperf_kdebug_filter = NULL; + +enum kperf_kdebug_filter_type { + KPERF_KDEBUG_FILTER_CLASS, + KPERF_KDEBUG_FILTER_CLASS_FN, + KPERF_KDEBUG_FILTER_CSC, + KPERF_KDEBUG_FILTER_CSC_FN, + KPERF_KDEBUG_FILTER_DEBUGID, + KPERF_KDEBUG_FILTER_DEBUGID_FN +}; + +const static uint32_t debugid_masks[] = { + [KPERF_KDEBUG_FILTER_CLASS] = KDBG_CLASS_MASK, + [KPERF_KDEBUG_FILTER_CLASS_FN] = KDBG_CLASS_MASK | KDBG_FUNC_MASK, + [KPERF_KDEBUG_FILTER_CSC] = KDBG_CSC_MASK, + [KPERF_KDEBUG_FILTER_CSC_FN] = KDBG_CSC_MASK | KDBG_FUNC_MASK, + [KPERF_KDEBUG_FILTER_DEBUGID] = KDBG_EVENTID_MASK, + [KPERF_KDEBUG_FILTER_DEBUGID_FN] = UINT32_MAX, +}; + +/* + * Types are packed into 2 64-bit fields in the filter, with 4-bits for each + * type. Only 3 bits are strictly necessary, but using 4 simplifies the + * unpacking. 
+ */ + +/* UNSAFE */ +#define DECODE_TYPE(TYPES, I) ((((uint8_t *)(TYPES))[(I) / 2] >> ((I) % 2) * 4) & 0xf) + +int +kperf_kdebug_init(void) +{ + kperf_kdebug_filter = kalloc_tag(sizeof(*kperf_kdebug_filter), + VM_KERN_MEMORY_DIAG); + if (kperf_kdebug_filter == NULL) { + return ENOMEM; + } + bzero(kperf_kdebug_filter, sizeof(*kperf_kdebug_filter)); + + return 0; +} + +void +kperf_kdebug_reset(void) +{ + int err; + + if ((err = kperf_init())) { + return; + } + + kperf_kdebug_action = 0; + bzero(kperf_kdebug_filter, sizeof(*kperf_kdebug_filter)); + kperf_kdebug_update(); +} + +boolean_t +kperf_kdebug_should_trigger(uint32_t debugid) +{ + /* ignore kperf events */ + if (KDBG_EXTRACT_CLASS(debugid) == DBG_PERF) { + return FALSE; + } + + /* + * Search linearly through list of debugids and masks. If the filter + * gets larger than 128 bytes, change this to either a binary search or + * a sparse bitmap on the uint32_t range, depending on the new size. + */ + for (uint8_t i = 0; i < kperf_kdebug_filter->n_debugids; i++) { + uint32_t check_debugid = + kperf_kdebug_filter->debugids[i]; + uint32_t mask = debugid_masks[DECODE_TYPE(kperf_kdebug_filter->types, i)]; + + if ((debugid & mask) == check_debugid) { + return TRUE; + } + } + + return FALSE; +} + +int +kperf_kdebug_set_filter(user_addr_t user_filter, uint32_t user_size) +{ + uint32_t n_debugids_provided = 0; + int err = 0; + + if ((err = kperf_init())) { + return err; + } + + /* detect disabling the filter completely */ + if (user_filter == USER_ADDR_NULL || user_size == 0) { + bzero(kperf_kdebug_filter, sizeof(*kperf_kdebug_filter)); + goto out; + } + + n_debugids_provided = (uint32_t)KPERF_KDEBUG_N_DEBUGIDS(user_size); + + if ((err = kperf_kdebug_set_n_debugids(n_debugids_provided))) { + goto out; + } + + if ((err = copyin(user_filter, (char *)kperf_kdebug_filter, + KPERF_KDEBUG_FILTER_SIZE(n_debugids_provided)))) + { + bzero(kperf_kdebug_filter, sizeof(*kperf_kdebug_filter)); + goto out; + } + +out: + 
kperf_kdebug_update(); + + return err; +} + +uint32_t +kperf_kdebug_get_filter(struct kperf_kdebug_filter **filter) +{ + int err; + + if ((err = kperf_init())) { + return 0; + } + + assert(filter != NULL); + + *filter = kperf_kdebug_filter; + return kperf_kdebug_filter->n_debugids; +} + +int +kperf_kdebug_set_n_debugids(uint32_t n_debugids_in) +{ + int err; + + if ((err = kperf_init())) { + return EINVAL; + } + + if (n_debugids_in > KPERF_KDEBUG_DEBUGIDS_MAX) { + return EINVAL; + } + + kperf_kdebug_filter->n_debugids = n_debugids_in; + + return 0; +} + +int +kperf_kdebug_set_action(int action_id) +{ + if (action_id < 0 || (unsigned int)action_id > kperf_action_get_count()) { + return EINVAL; + } + + kperf_kdebug_action = action_id; + kperf_kdebug_update(); + + return 0; +} + +int +kperf_kdebug_get_action(void) +{ + return kperf_kdebug_action; +} + +static void +kperf_kdebug_update(void) +{ + int err; + + if ((err = kperf_init())) { + return; + } + + if (kperf_kdebug_action != 0 && + kperf_kdebug_filter->n_debugids != 0) + { + kperf_kdebug_active = TRUE; + } else { + kperf_kdebug_active = FALSE; + } +} diff --git a/osfmk/default_pager/default_pager_types.defs b/osfmk/kperf/kdebug_trigger.h similarity index 56% rename from osfmk/default_pager/default_pager_types.defs rename to osfmk/kperf/kdebug_trigger.h index 7d98c1ebf..6cfc254dc 100644 --- a/osfmk/default_pager/default_pager_types.defs +++ b/osfmk/kperf/kdebug_trigger.h @@ -1,8 +1,8 @@ /* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2016 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). 
You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,33 +22,36 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -/* - * @OSF_COPYRIGHT@ - */ -#ifndef _MACH_DEFAULT_PAGER_TYPES_DEFS_ -#define _MACH_DEFAULT_PAGER_TYPES_DEFS_ +#ifndef KPERF_KDEBUG_TRIGGER_H +#define KPERF_KDEBUG_TRIGGER_H -#include +#include +#include +#define KPERF_KDEBUG_DEBUGIDS_MAX (32) -type default_pager_info_t = struct[3] of natural_t; -type default_pager_info_64_t = struct[6] of natural_t; +struct kperf_kdebug_filter; -type default_pager_object_t = struct[2] of natural_t; -type default_pager_object_array_t = array[] of default_pager_object_t; +#define KPERF_KDEBUG_FILTER_SIZE(N_DEBUGIDS) ((2 * sizeof(uint64_t)) + ((N_DEBUGIDS) * sizeof(uint32_t))) +/* UNSAFE */ +#define KPERF_KDEBUG_N_DEBUGIDS(FILTER_SIZE) \ + (((FILTER_SIZE) <= (2 * sizeof(uint64_t))) ? 
0 : \ + (((FILTER_SIZE) - (2 * sizeof(uint64_t))) / sizeof(uint32_t))) -type default_pager_page_t = struct[1] of natural_t; -type default_pager_page_array_t = array[] of default_pager_page_t; +int kperf_kdebug_init(void); +void kperf_kdebug_reset(void); -type backing_store_flavor_t = integer_t; -type backing_store_info_t = array[*:20] of integer_t; +boolean_t kperf_kdebug_should_trigger(uint32_t debugid); -import ; +int kperf_kdebug_set_action(int action_id); +int kperf_kdebug_get_action(void); -#endif /* _MACH_DEFAULT_PAGER_TYPES_DEFS_ */ +int kperf_kdebug_set_n_debugids(uint32_t n_debugids_in); +int kperf_kdebug_set_filter(user_addr_t user_filter, uint32_t user_size); +uint32_t kperf_kdebug_get_filter(struct kperf_kdebug_filter **filter); -/* vim: set ft=c : */ +#endif /* !defined(KPERF_KDEBUG_TRIGGER_H) */ diff --git a/osfmk/kperf/kperf.c b/osfmk/kperf/kperf.c index b1d9d4cb4..45d441e5f 100644 --- a/osfmk/kperf/kperf.c +++ b/osfmk/kperf/kperf.c @@ -25,183 +25,167 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -#include +#include /* port_name_to_task */ #include #include #include +#include #include +#include -#include -#include #include +#include +#include #include -#include - -#include /* port_name_to_task */ +#include +#include +#include -/** misc functions **/ -#include /* XXX: should bust this out */ +lck_grp_t kperf_lck_grp; /* thread on CPUs before starting the PET thread */ thread_t *kperf_thread_on_cpus = NULL; -/* interupt sample buffers -- one wired per CPU */ -static struct kperf_sample *intr_samplev = NULL; -static unsigned intr_samplec = 0; +/* one wired sample buffer per CPU */ +static struct kperf_sample *intr_samplev; +static unsigned int intr_samplec = 0; -/* track recursion in the trace code */ -static struct -{ - int active; - int pad[64 / sizeof(int)]; -} *kpdbg_recursev; -static unsigned kpdbg_recursec = 0; - -/* Curren sampling status */ +/* current sampling status */ static unsigned sampling_status = KPERF_SAMPLING_OFF; -/* Make sure we 
only init once */ -static unsigned kperf_initted = 0; +/* only init once */ +static boolean_t kperf_initted = FALSE; -extern void (*chudxnu_thread_ast_handler)(thread_t); +/* whether or not to callback to kperf on context switch */ +boolean_t kperf_on_cpu_active = FALSE; -struct kperf_sample* +struct kperf_sample * kperf_intr_sample_buffer(void) { - unsigned ncpu = chudxnu_cpu_number(); + unsigned ncpu = cpu_number(); - // XXX: assert? - if( ncpu >= intr_samplec ) - return NULL; + assert(ml_get_interrupts_enabled() == FALSE); + assert(ncpu < intr_samplec); - return &intr_samplev[ncpu]; -} - -int -kperf_kdbg_recurse(int step) -{ - unsigned ncpu = chudxnu_cpu_number(); - - // XXX: assert? - if( ncpu >= kpdbg_recursec ) - return 1; - - /* recursing in, available */ - if( (step > 0) - && (kpdbg_recursev[ncpu].active == 0) ) - { - kpdbg_recursev[ncpu].active = 1; - return 0; - } - - /* recursing in, unavailable */ - if( (step > 0) - && (kpdbg_recursev[ncpu].active != 0) ) - { - return 1; - } - - /* recursing out, unavailable */ - if( (step < 0) - && (kpdbg_recursev[ncpu].active != 0) ) - { - kpdbg_recursev[ncpu].active = 0; - return 0; - } - - /* recursing out, available */ - if( (step < 0) - && (kpdbg_recursev[ncpu].active == 0) ) - panic( "return from non-recursed kperf kdebug call" ); - - panic( "unknown kperf kdebug call" ); - return 1; + return &(intr_samplev[ncpu]); } /* setup interrupt sample buffers */ int kperf_init(void) { + static lck_grp_attr_t lck_grp_attr; + + lck_mtx_assert(ktrace_lock, LCK_MTX_ASSERT_OWNED); + unsigned ncpus = 0; int err; - if( kperf_initted ) + if (kperf_initted) { return 0; + } + + lck_grp_attr_setdefault(&lck_grp_attr); + lck_grp_init(&kperf_lck_grp, "kperf", &lck_grp_attr); - /* get number of cpus */ ncpus = machine_info.logical_cpu_max; - kperf_thread_on_cpus = kalloc( ncpus * sizeof(*kperf_thread_on_cpus) ); - if( kperf_thread_on_cpus == NULL ) - { + /* create buffers to remember which threads don't need to be sampled by PET */ + 
kperf_thread_on_cpus = kalloc_tag(ncpus * sizeof(*kperf_thread_on_cpus), + VM_KERN_MEMORY_DIAG); + if (kperf_thread_on_cpus == NULL) { err = ENOMEM; goto error; } + bzero(kperf_thread_on_cpus, ncpus * sizeof(*kperf_thread_on_cpus)); - /* clear it */ - bzero( kperf_thread_on_cpus, ncpus * sizeof(*kperf_thread_on_cpus) ); - - /* make the CPU array - * FIXME: cache alignment - */ - intr_samplev = kalloc( ncpus * sizeof(*intr_samplev)); + /* create the interrupt buffers */ intr_samplec = ncpus; - - if( intr_samplev == NULL ) - { + intr_samplev = kalloc_tag(ncpus * sizeof(*intr_samplev), + VM_KERN_MEMORY_DIAG); + if (intr_samplev == NULL) { err = ENOMEM; goto error; } + bzero(intr_samplev, ncpus * sizeof(*intr_samplev)); - /* clear it */ - bzero( intr_samplev, ncpus * sizeof(*intr_samplev) ); + /* create kdebug trigger filter buffers */ + if ((err = kperf_kdebug_init())) { + goto error; + } - /* make the recursion array */ - kpdbg_recursev = kalloc( ncpus * sizeof(*kpdbg_recursev)); - kpdbg_recursec = ncpus; + kperf_initted = TRUE; + return 0; - /* clear it */ - bzero( kpdbg_recursev, ncpus * sizeof(*kpdbg_recursev) ); +error: + if (intr_samplev) { + kfree(intr_samplev, ncpus * sizeof(*intr_samplev)); + intr_samplev = NULL; + intr_samplec = 0; + } - /* we're done */ - kperf_initted = 1; + if (kperf_thread_on_cpus) { + kfree(kperf_thread_on_cpus, ncpus * sizeof(*kperf_thread_on_cpus)); + kperf_thread_on_cpus = NULL; + } - return 0; -error: - if( intr_samplev ) - kfree( intr_samplev, ncpus * sizeof(*intr_samplev) ); - if( kperf_thread_on_cpus ) - kfree( kperf_thread_on_cpus, ncpus * sizeof(*kperf_thread_on_cpus) ); return err; } -/* random misc-ish functions */ -uint32_t -kperf_get_thread_bits( thread_t thread ) +void +kperf_reset(void) { - return thread->t_chud; + lck_mtx_assert(ktrace_lock, LCK_MTX_ASSERT_OWNED); + + /* turn off sampling first */ + (void)kperf_sampling_disable(); + + /* cleanup miscellaneous configuration first */ + (void)kperf_kdbg_cswitch_set(0); + 
(void)kperf_set_lightweight_pet(0); + kperf_kdebug_reset(); + + /* timers, which require actions, first */ + kperf_timer_reset(); + kperf_action_reset(); } void -kperf_set_thread_bits( thread_t thread, uint32_t bits ) +kperf_on_cpu_internal(thread_t thread, thread_continue_t continuation, + uintptr_t *starting_fp) { - thread->t_chud = bits; + if (kperf_kdebug_cswitch) { + /* trace the new thread's PID for Instruments */ + int pid = task_pid(get_threadtask(thread)); + + BUF_DATA(PERF_TI_CSWITCH, thread_tid(thread), pid); + } + if (kperf_lightweight_pet_active) { + kperf_pet_on_cpu(thread, continuation, starting_fp); + } } -/* mark an AST to fire on a thread */ void -kperf_set_thread_ast( thread_t thread ) +kperf_on_cpu_update(void) { - /* FIXME: only call this on current thread from an interrupt - * handler for now... - */ - if( thread != current_thread() ) - panic( "unsafe AST set" ); + kperf_on_cpu_active = kperf_kdebug_cswitch || + kperf_lightweight_pet_active; +} + +/* random misc-ish functions */ +uint32_t +kperf_get_thread_flags(thread_t thread) +{ + return thread->kperf_flags; +} - act_set_kperf(thread); +void +kperf_set_thread_flags(thread_t thread, uint32_t flags) +{ + thread->kperf_flags = flags; } -unsigned +unsigned int kperf_sampling_status(void) { return sampling_status; @@ -210,20 +194,22 @@ kperf_sampling_status(void) int kperf_sampling_enable(void) { - /* already running! 
*/ - if( sampling_status == KPERF_SAMPLING_ON ) + if (sampling_status == KPERF_SAMPLING_ON) { return 0; + } - if ( sampling_status != KPERF_SAMPLING_OFF ) - panic( "kperf: sampling wasn't off" ); + if (sampling_status != KPERF_SAMPLING_OFF) { + panic("kperf: sampling was %d when asked to enable", sampling_status); + } /* make sure interrupt tables and actions are initted */ - if( !kperf_initted - || (kperf_action_get_count() == 0) ) + if (!kperf_initted || (kperf_action_get_count() == 0)) { return ECANCELED; + } /* mark as running */ sampling_status = KPERF_SAMPLING_ON; + kperf_lightweight_pet_active_update(); /* tell timers to enable */ kperf_timer_go(); @@ -234,8 +220,9 @@ kperf_sampling_enable(void) int kperf_sampling_disable(void) { - if( sampling_status != KPERF_SAMPLING_ON ) + if (sampling_status != KPERF_SAMPLING_ON) { return 0; + } /* mark a shutting down */ sampling_status = KPERF_SAMPLING_SHUTDOWN; @@ -245,26 +232,44 @@ kperf_sampling_disable(void) /* mark as off */ sampling_status = KPERF_SAMPLING_OFF; + kperf_lightweight_pet_active_update(); return 0; } +boolean_t +kperf_thread_get_dirty(thread_t thread) +{ + return (thread->c_switch != thread->kperf_c_switch); +} + +void +kperf_thread_set_dirty(thread_t thread, boolean_t dirty) +{ + if (dirty) { + thread->kperf_c_switch = thread->c_switch - 1; + } else { + thread->kperf_c_switch = thread->c_switch; + } +} + int kperf_port_to_pid(mach_port_name_t portname) { task_t task; int pid; - if( !MACH_PORT_VALID(portname) ) + if (!MACH_PORT_VALID(portname)) { return -1; + } task = port_name_to_task(portname); - - if( task == TASK_NULL ) - return -1; + if (task == TASK_NULL) { + return -1; + } - pid = chudxnu_pid_for_task(task); + pid = task_pid(task); task_deallocate_internal(task); diff --git a/osfmk/kperf/kperf.h b/osfmk/kperf/kperf.h index 01c976d73..ec0ab45db 100644 --- a/osfmk/kperf/kperf.h +++ b/osfmk/kperf/kperf.h @@ -26,80 +26,119 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -#ifndef __KPERF_H__ 
-#define __KPERF_H__ +#ifndef KPERF_H +#define KPERF_H #include +#include -/* The various trigger types supported by kperf */ -#define TRIGGER_TYPE_TIMER (0) -#define TRIGGER_TYPE_PMI (1) -#define TRIGGER_TYPE_TRACE (2) -#define TRIGGER_TYPE_CSWITCH (3) +extern lck_grp_t kperf_lck_grp; -/* Helpers to get and set AST bits on a thread */ -extern uint32_t kperf_get_thread_bits( thread_t thread ); -extern void kperf_set_thread_bits( thread_t thread, uint32_t bits ); -extern void kperf_set_thread_ast( thread_t thread ); +/* the trigger types supported by kperf */ +#define TRIGGER_TYPE_TIMER (0) +#define TRIGGER_TYPE_PMI (1) +#define TRIGGER_TYPE_KDEBUG (2) -/* Possible states of kperf sampling */ -#define KPERF_SAMPLING_OFF 0 -#define KPERF_SAMPLING_ON 1 -#define KPERF_SAMPLING_SHUTDOWN 2 +/* helpers to get and set AST flags on a thread */ +uint32_t kperf_get_thread_flags(thread_t thread); +void kperf_set_thread_flags(thread_t thread, uint32_t flags); -/* Init kperf module. Must be called before use, can be called as many - * times as you like. +/* + * Get and set dirtiness of thread, so kperf can track whether the thread + * has been dispatched since it last looked. + */ +boolean_t kperf_thread_get_dirty(thread_t thread); +void kperf_thread_set_dirty(thread_t thread, boolean_t dirty); + +/* possible states of kperf sampling */ +#define KPERF_SAMPLING_OFF (0) +#define KPERF_SAMPLING_ON (1) +#define KPERF_SAMPLING_SHUTDOWN (2) + +/* + * Initialize kperf. Must be called before use and can be called multiple times. 
*/ extern int kperf_init(void); -/* Get and set sampling status */ +/* get and set sampling status */ extern unsigned kperf_sampling_status(void); extern int kperf_sampling_enable(void); extern int kperf_sampling_disable(void); -/* kperf AST handler +/* get a per-CPU sample buffer */ +struct kperf_sample *kperf_intr_sample_buffer(void); + +/* + * kperf AST handler + */ +extern __attribute__((noinline)) void kperf_thread_ast_handler(thread_t thread); + +/* + * thread on core callback + */ + +/* controls whether the callback is called on context switch */ +extern boolean_t kperf_on_cpu_active; + +/* update whether the callback is set */ +void kperf_on_cpu_update(void); + +/* handle a thread being switched on */ +void kperf_on_cpu_internal(thread_t thread, thread_continue_t continuation, + uintptr_t *starting_fp); + +/* for scheduler threads switching threads on */ +static inline void +kperf_on_cpu(thread_t thread, thread_continue_t continuation, + uintptr_t *starting_fp) +{ + if (__improbable(kperf_on_cpu_active)) { + kperf_on_cpu_internal(thread, continuation, starting_fp); + } +} + +/* + * kdebug callback */ -extern void kperf_thread_ast_handler( thread_t thread ); -/* kperf kdebug callback +/* controls whether the kdebug callback is called */ +extern boolean_t kperf_kdebug_active; + +/* handle the kdebug event */ +void kperf_kdebug_callback_internal(uint32_t debugid); + +/* handle a kdebug event */ +void kperf_kdebug_handler(uint32_t debugid, uintptr_t *starting_fp); + +/* called inside of kernel_debug_internal */ +static inline void +kperf_kdebug_callback(uint32_t debugid, uintptr_t *starting_fp) +{ + if (__improbable(kperf_kdebug_active)) { + kperf_kdebug_handler(debugid, starting_fp); + } +} + +/* + * Used by ktrace to reset kperf. ktrace_lock must be held. 
*/ -extern void kperf_kdebug_callback(uint32_t debugid); +extern void kperf_reset(void); /* get and set whether we're recording stacks on interesting kdebug events */ extern int kperf_kdbg_get_stacks(void); extern int kperf_kdbg_set_stacks(int); -/* get and set whether to trigger an action on signposts */ -extern int kperf_signpost_action_get(void); -extern int kperf_signpost_action_set(int newval); +extern int kperf_kdebug_cswitch; -extern int kperf_cswitch_callback_set; +#if DEVELOPMENT || DEBUG +extern _Atomic long long kperf_pending_ipis; +#endif /* DEVELOPMENT || DEBUG */ /* get and set whether to output tracepoints on context-switch */ extern int kperf_kdbg_cswitch_get(void); extern int kperf_kdbg_cswitch_set(int newval); -/* get and set whether to trigger an action on context-switch */ -extern int kperf_cswitch_action_get(void); -extern int kperf_cswitch_action_set(int newval); - /* given a task port, find out its pid */ int kperf_port_to_pid(mach_port_name_t portname); -/* Check whether the current process has been blessed to allow access - * to kperf facilities. - */ -extern int kperf_access_check(void); - -/* track recursion on kdebug tracepoint tracking */ -extern int kperf_kdbg_recurse(int step); -#define KPERF_RECURSE_IN (1) -#define KPERF_RECURSE_OUT (-1) - -/* context switch tracking */ -extern void kperf_switch_context( thread_t old, thread_t new ); - -/* bootstrap */ -extern void kperf_bootstrap(void); - -#endif /* __KPERF_H__ */ +#endif /* !defined(KPERF_H) */ diff --git a/osfmk/kperf/kperf_arch.h b/osfmk/kperf/kperf_arch.h index 7142c3fe9..44719601e 100644 --- a/osfmk/kperf/kperf_arch.h +++ b/osfmk/kperf/kperf_arch.h @@ -1,8 +1,8 @@ /* - * Copyright (c) 2011 Apple Inc. All rights reserved. + * Copyright (c) 2011-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,22 +22,16 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -#ifndef _KPERF_ARCH_H -#define _KPERF_ARCH_H +#ifndef KPERF_ARCH_H +#define KPERF_ARCH_H -/* per-arch header */ -#if defined(__x86_64__) -#include "kperf/x86_64/kperf_arch.h" -#else -#error architecture not supported -#endif +struct kperf_timer; +void kperf_mp_broadcast_running(struct kperf_timer *trigger); -/* common definitions */ -extern int kperf_mp_broadcast( void (*func)(void*), void *arg ); -extern int kperf_mp_signal(void); -extern kern_return_t kperf_get_phys_footprint(task_t, uint64_t *); +void kperf_signal_handler(void); +kern_return_t kperf_get_phys_footprint(task_t, uint64_t *); -#endif /* _KPERF_ARCH_H */ +#endif /* KPERF_ARCH_H */ diff --git a/osfmk/kperf/kperf_kpc.c b/osfmk/kperf/kperf_kpc.c index b0a4c0a7a..5090e8c8c 100644 --- a/osfmk/kperf/kperf_kpc.c +++ b/osfmk/kperf/kperf_kpc.c @@ -2,7 +2,7 @@ * Copyright (c) 2013 Apple Computer, Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,41 +22,35 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. 
- * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -/* Sample KPC data into kperf and manage a shared context-switch handler */ +/* + * Sample KPC data into kperf and manage shared context-switch and AST handlers + */ #include #include #include +#include #include #include /* kpc_cswitch_context, kpc_threads_counting */ -unsigned kperf_kpc_cswitch_set = 0; - void -kperf_kpc_switch_context(thread_t old, thread_t new) +kperf_kpc_thread_ast(thread_t thread) { - if (kpc_threads_counting) { - kpc_switch_context(old, new); - } - if (kperf_cswitch_callback_set) { - kperf_switch_context(old, new); - } -} + kpc_thread_ast_handler(thread); + kperf_thread_ast_handler(thread); -void -kperf_kpc_cswitch_callback_update(void) -{ - kperf_kpc_cswitch_set = kperf_cswitch_callback_set || - kpc_threads_counting; + thread->kperf_flags = 0; } void kperf_kpc_thread_sample(struct kpcdata *kpcd, int sample_config) { + BUF_INFO(PERF_KPC_THREAD_SAMPLE | DBG_FUNC_START, sample_config); + kpcd->running = kpc_get_running(); /* let kpc_get_curthread_counters set the correct count */ kpcd->counterc = KPC_MAX_COUNTERS; @@ -66,18 +60,22 @@ kperf_kpc_thread_sample(struct kpcdata *kpcd, int sample_config) memset(kpcd->counterv, 0, sizeof(uint64_t) * kpcd->counterc); } - /* help out Instruments */ + /* help out Instruments by sampling KPC's config */ if (!sample_config) { kpcd->configc = 0; } else { kpcd->configc = kpc_get_config_count(kpcd->running); kpc_get_config(kpcd->running, kpcd->configv); } + + BUF_INFO(PERF_KPC_THREAD_SAMPLE | DBG_FUNC_END, kpcd->running, kpcd->counterc); } void kperf_kpc_cpu_sample(struct kpcdata *kpcd, int sample_config) { + BUF_INFO(PERF_KPC_CPU_SAMPLE | DBG_FUNC_START, sample_config); + kpcd->running = kpc_get_running(); kpcd->counterc = kpc_get_cpu_counters(0, kpcd->running, &kpcd->curcpu, @@ -88,6 +86,8 @@ kperf_kpc_cpu_sample(struct kpcdata *kpcd, int sample_config) kpcd->configc = kpc_get_config_count(kpcd->running); kpc_get_config(kpcd->running, kpcd->configv); } + + 
BUF_INFO(PERF_KPC_CPU_SAMPLE | DBG_FUNC_END, kpcd->running, kpcd->counterc); } static void diff --git a/osfmk/kperf/kperf_kpc.h b/osfmk/kperf/kperf_kpc.h index d4dc8d814..84ec687b9 100644 --- a/osfmk/kperf/kperf_kpc.h +++ b/osfmk/kperf/kperf_kpc.h @@ -33,22 +33,7 @@ #include /* KPC_MAX_COUNTERS */ #endif -/* controls whether a context-switch handler is invoked */ -extern unsigned kperf_kpc_cswitch_set; - -void kperf_kpc_switch_context(thread_t old, thread_t new); -void kperf_kpc_cswitch_callback_update(void); - -/* for osfmk/platform/pcb.c context switches */ -static inline void -kperf_kpc_cswitch(thread_t old, thread_t new) -{ - if (!kperf_kpc_cswitch_set) { - return; - } - - kperf_kpc_switch_context(old, new); -} +void kperf_kpc_thread_ast(thread_t thread); /* KPC sample data */ struct kpcdata diff --git a/osfmk/kperf/kperf_timer.c b/osfmk/kperf/kperf_timer.c new file mode 100644 index 000000000..cfa429f2f --- /dev/null +++ b/osfmk/kperf/kperf_timer.c @@ -0,0 +1,485 @@ +/* + * Copyright (c) 2011 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* Manage timers */ + +#include +#include /* current_thread() */ +#include +#include +#include +#include + +#include +#if defined(__x86_64__) +#include +#endif /* defined(__x86_64__) */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* the list of timers */ +struct kperf_timer *kperf_timerv = NULL; +unsigned int kperf_timerc = 0; + +static unsigned int pet_timer_id = 999; + +/* maximum number of timers we can construct */ +#define TIMER_MAX (16) + +#if defined(__x86_64__) + +#define MIN_PERIOD_NS (20 * NSEC_PER_USEC) +#define MIN_PERIOD_BG_NS (10 * NSEC_PER_MSEC) +#define MIN_PERIOD_PET_NS (2 * NSEC_PER_MSEC) +#define MIN_PERIOD_PET_BG_NS (10 * NSEC_PER_MSEC) + +#else /* defined(__x86_64__) */ +#error "unsupported architecture" +#endif /* defined(__x86_64__) */ + +static uint64_t min_period_abstime; +static uint64_t min_period_bg_abstime; +static uint64_t min_period_pet_abstime; +static uint64_t min_period_pet_bg_abstime; + +static uint64_t +kperf_timer_min_period_abstime(void) +{ + if (ktrace_background_active()) { + return min_period_bg_abstime; + } else { + return min_period_abstime; + } +} + +static uint64_t +kperf_timer_min_pet_period_abstime(void) +{ + if (ktrace_background_active()) { + return min_period_pet_bg_abstime; + } else { + return min_period_pet_abstime; + } +} + +static void +kperf_timer_schedule(struct kperf_timer *timer, uint64_t now) +{ + BUF_INFO(PERF_TM_SCHED, timer->period); + + /* if 
we re-programmed the timer to zero, just drop it */ + if (timer->period == 0) { + return; + } + + /* calculate deadline */ + uint64_t deadline = now + timer->period; + + /* re-schedule the timer, making sure we don't apply slop */ + timer_call_enter(&timer->tcall, deadline, TIMER_CALL_SYS_CRITICAL); +} + +void +kperf_ipi_handler(void *param) +{ + struct kperf_context ctx; + struct kperf_timer *timer = param; + + assert(timer != NULL); + + /* Always cut a tracepoint to show a sample event occurred */ + BUF_DATA(PERF_TM_HNDLR | DBG_FUNC_START, 0); + + int ncpu = cpu_number(); + + struct kperf_sample *intbuf = kperf_intr_sample_buffer(); + + /* On a timer, we can see the "real" current thread */ + ctx.cur_thread = current_thread(); + ctx.cur_pid = task_pid(get_threadtask(ctx.cur_thread)); + + /* who fired */ + ctx.trigger_type = TRIGGER_TYPE_TIMER; + ctx.trigger_id = (unsigned int)(timer - kperf_timerv); + + if (ctx.trigger_id == pet_timer_id && ncpu < machine_info.logical_cpu_max) { + kperf_thread_on_cpus[ncpu] = ctx.cur_thread; + } + + /* make sure sampling is on */ + unsigned int status = kperf_sampling_status(); + if (status == KPERF_SAMPLING_OFF) { + BUF_INFO(PERF_TM_HNDLR | DBG_FUNC_END, SAMPLE_OFF); + return; + } else if (status == KPERF_SAMPLING_SHUTDOWN) { + BUF_INFO(PERF_TM_HNDLR | DBG_FUNC_END, SAMPLE_SHUTDOWN); + return; + } + + /* call the action -- kernel-only from interrupt, pend user */ + int r = kperf_sample(intbuf, &ctx, timer->actionid, SAMPLE_FLAG_PEND_USER); + + /* end tracepoint is informational */ + BUF_INFO(PERF_TM_HNDLR | DBG_FUNC_END, r); + +#if defined(__x86_64__) + (void)atomic_bit_clear(&(timer->pending_cpus), ncpu, __ATOMIC_RELAXED); +#endif /* defined(__x86_64__) */ +} + +static void +kperf_timer_handler(void *param0, __unused void *param1) +{ + struct kperf_timer *timer = param0; + unsigned int ntimer = (unsigned int)(timer - kperf_timerv); + unsigned int ncpus = machine_info.logical_cpu_max; + + timer->active = 1; + + /* along the 
lines of do not ipi if we are all shutting down */ + if (kperf_sampling_status() == KPERF_SAMPLING_SHUTDOWN) { + goto deactivate; + } + + BUF_DATA(PERF_TM_FIRE, ntimer, ntimer == pet_timer_id, timer->period, + timer->actionid); + + if (ntimer == pet_timer_id) { + kperf_pet_fire_before(); + + /* clean-up the thread-on-CPUs cache */ + bzero(kperf_thread_on_cpus, ncpus * sizeof(*kperf_thread_on_cpus)); + } + + /* ping all CPUs */ + kperf_mp_broadcast_running(timer); + + /* release the pet thread? */ + if (ntimer == pet_timer_id) { + /* PET mode is responsible for rearming the timer */ + kperf_pet_fire_after(); + } else { + /* + * FIXME: Get the current time from elsewhere. The next + * timer's period now includes the time taken to reach this + * point. This causes a bias towards longer sampling periods + * than requested. + */ + kperf_timer_schedule(timer, mach_absolute_time()); + } + +deactivate: + timer->active = 0; +} + +/* program the timer from the PET thread */ +void +kperf_timer_pet_rearm(uint64_t elapsed_ticks) +{ + struct kperf_timer *timer = NULL; + uint64_t period = 0; + uint64_t deadline; + + /* + * If the pet_timer_id is invalid, it has been disabled, so this should + * do nothing. 
+ */ + if (pet_timer_id >= kperf_timerc) { + return; + } + + unsigned int status = kperf_sampling_status(); + /* do not reprogram the timer if it has been shutdown or sampling is off */ + if (status == KPERF_SAMPLING_OFF) { + BUF_INFO(PERF_PET_END, SAMPLE_OFF); + return; + } else if (status == KPERF_SAMPLING_SHUTDOWN) { + BUF_INFO(PERF_PET_END, SAMPLE_SHUTDOWN); + return; + } + + timer = &(kperf_timerv[pet_timer_id]); + + /* if we re-programmed the timer to zero, just drop it */ + if (!timer->period) { + return; + } + + /* subtract the time the pet sample took being careful not to underflow */ + if (timer->period > elapsed_ticks) { + period = timer->period - elapsed_ticks; + } + + /* make sure we don't set the next PET sample to happen too soon */ + if (period < min_period_pet_abstime) { + period = min_period_pet_abstime; + } + + /* we probably took so long in the PET thread, it makes sense to take + * the time again. + */ + deadline = mach_absolute_time() + period; + + BUF_INFO(PERF_PET_SCHED, timer->period, period, elapsed_ticks, deadline); + + /* re-schedule the timer, making sure we don't apply slop */ + timer_call_enter(&(timer->tcall), deadline, TIMER_CALL_SYS_CRITICAL); + + return; +} + +/* turn on all the timers */ +void +kperf_timer_go(void) +{ + /* get the PET thread going */ + if (pet_timer_id < kperf_timerc) { + kperf_pet_config(kperf_timerv[pet_timer_id].actionid); + } + + uint64_t now = mach_absolute_time(); + + for (unsigned int i = 0; i < kperf_timerc; i++) { + if (kperf_timerv[i].period == 0) { + continue; + } + + kperf_timer_schedule(&(kperf_timerv[i]), now); + } +} + +void +kperf_timer_stop(void) +{ + for (unsigned int i = 0; i < kperf_timerc; i++) { + if (kperf_timerv[i].period == 0) { + continue; + } + + /* wait for the timer to stop */ + while (kperf_timerv[i].active); + + timer_call_cancel(&(kperf_timerv[i].tcall)); + } + + /* wait for PET to stop, too */ + kperf_pet_config(0); +} + +unsigned int +kperf_timer_get_petid(void) +{ + return 
pet_timer_id; +} + +int +kperf_timer_set_petid(unsigned int timerid) +{ + if (timerid < kperf_timerc) { + uint64_t min_period; + + min_period = kperf_timer_min_pet_period_abstime(); + if (kperf_timerv[timerid].period < min_period) { + kperf_timerv[timerid].period = min_period; + } + kperf_pet_config(kperf_timerv[timerid].actionid); + } else { + /* clear the PET trigger if it's a bogus ID */ + kperf_pet_config(0); + } + + pet_timer_id = timerid; + + return 0; +} + +int +kperf_timer_get_period(unsigned int timerid, uint64_t *period_abstime) +{ + if (timerid >= kperf_timerc) { + return EINVAL; + } + + *period_abstime = kperf_timerv[timerid].period; + return 0; +} + +int +kperf_timer_set_period(unsigned int timerid, uint64_t period_abstime) +{ + uint64_t min_period; + + if (timerid >= kperf_timerc) { + return EINVAL; + } + + if (pet_timer_id == timerid) { + min_period = kperf_timer_min_pet_period_abstime(); + } else { + min_period = kperf_timer_min_period_abstime(); + } + + if (period_abstime > 0 && period_abstime < min_period) { + period_abstime = min_period; + } + + kperf_timerv[timerid].period = period_abstime; + + /* FIXME: re-program running timers? 
*/ + + return 0; +} + +int +kperf_timer_get_action(unsigned int timerid, uint32_t *action) +{ + if (timerid >= kperf_timerc) { + return EINVAL; + } + + *action = kperf_timerv[timerid].actionid; + return 0; +} + +int +kperf_timer_set_action(unsigned int timerid, uint32_t action) +{ + if (timerid >= kperf_timerc) { + return EINVAL; + } + + kperf_timerv[timerid].actionid = action; + return 0; +} + +unsigned int +kperf_timer_get_count(void) +{ + return kperf_timerc; +} + +void +kperf_timer_reset(void) +{ + kperf_timer_set_petid(999); + kperf_set_pet_idle_rate(KPERF_PET_DEFAULT_IDLE_RATE); + kperf_set_lightweight_pet(0); + for (unsigned int i = 0; i < kperf_timerc; i++) { + kperf_timerv[i].period = 0; + kperf_timerv[i].actionid = 0; +#if defined(__x86_64__) + kperf_timerv[i].pending_cpus = 0; +#endif /* defined(__x86_64__) */ + } +} + +extern int +kperf_timer_set_count(unsigned int count) +{ + struct kperf_timer *new_timerv = NULL, *old_timerv = NULL; + unsigned int old_count; + + if (min_period_abstime == 0) { + nanoseconds_to_absolutetime(MIN_PERIOD_NS, &min_period_abstime); + nanoseconds_to_absolutetime(MIN_PERIOD_BG_NS, &min_period_bg_abstime); + nanoseconds_to_absolutetime(MIN_PERIOD_PET_NS, &min_period_pet_abstime); + nanoseconds_to_absolutetime(MIN_PERIOD_PET_BG_NS, + &min_period_pet_bg_abstime); + assert(min_period_abstime > 0); + } + + if (count == kperf_timerc) { + return 0; + } + if (count > TIMER_MAX) { + return EINVAL; + } + + /* TODO: allow shrinking? */ + if (count < kperf_timerc) { + return EINVAL; + } + + /* + * Make sure kperf is initialized when creating the array for the first + * time. + */ + if (kperf_timerc == 0) { + int r; + + /* main kperf */ + if ((r = kperf_init())) { + return r; + } + } + + /* + * Shut down any running timers since we will be messing with the timer + * call structures. 
+ */ + kperf_timer_stop(); + + /* create a new array */ + new_timerv = kalloc_tag(count * sizeof(struct kperf_timer), + VM_KERN_MEMORY_DIAG); + if (new_timerv == NULL) { + return ENOMEM; + } + old_timerv = kperf_timerv; + old_count = kperf_timerc; + + if (old_timerv != NULL) { + bcopy(kperf_timerv, new_timerv, + kperf_timerc * sizeof(struct kperf_timer)); + } + + /* zero the new entries */ + bzero(&(new_timerv[kperf_timerc]), + (count - old_count) * sizeof(struct kperf_timer)); + + /* (re-)setup the timer call info for all entries */ + for (unsigned int i = 0; i < count; i++) { + timer_call_setup(&(new_timerv[i].tcall), kperf_timer_handler, &(new_timerv[i])); + } + + kperf_timerv = new_timerv; + kperf_timerc = count; + + if (old_timerv != NULL) { + kfree(old_timerv, old_count * sizeof(struct kperf_timer)); + } + + return 0; +} diff --git a/osfmk/kperf/timetrigger.h b/osfmk/kperf/kperf_timer.h similarity index 52% rename from osfmk/kperf/timetrigger.h rename to osfmk/kperf/kperf_timer.h index 81196f129..4229b4d9f 100644 --- a/osfmk/kperf/timetrigger.h +++ b/osfmk/kperf/kperf_timer.h @@ -1,3 +1,5 @@ +#ifndef KPERF_TIMER_H +#define KPERF_TIMER_H /* * Copyright (c) 2011 Apple Computer, Inc. All rights reserved. * @@ -26,30 +28,55 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -// extern uint64_t timer_period; -extern void kperf_timer_reprogram(void); -extern void kperf_timer_reprogram_all(void); +#include +#include +struct kperf_timer { + struct timer_call tcall; + uint64_t period; + unsigned actionid; + volatile unsigned active; + + /* + * A bitmap of CPUs that have a pending timer to service. On Intel, it + * allows the core responding to the timer interrupt to not queue up + * cross-calls on cores that haven't yet responded. On ARM, it allows + * the signal handler to multiplex simultaneous fires of different + * timers. 
+ */ + bitmap_t pending_cpus; +}; + +extern struct kperf_timer *kperf_timerv; +extern unsigned int kperf_timerc; + +void kperf_timer_reprogram(void); +void kperf_timer_reprogram_all(void); + +void kperf_ipi_handler(void *param); // return values from the action #define TIMER_REPROGRAM (0) -#define TIMER_STOP (1) +#define TIMER_STOP (1) /* getters and setters on timers */ -extern unsigned kperf_timer_get_count(void); -extern int kperf_timer_set_count(unsigned count); +unsigned kperf_timer_get_count(void); +int kperf_timer_set_count(unsigned int count); -extern int kperf_timer_get_period( unsigned timer, uint64_t *period ); -extern int kperf_timer_set_period( unsigned timer, uint64_t period ); +int kperf_timer_get_period(unsigned int timer, uint64_t *period); +int kperf_timer_set_period(unsigned int timer, uint64_t period); -extern int kperf_timer_get_action( unsigned timer, uint32_t *action ); -extern int kperf_timer_set_action( unsigned timer, uint32_t action ); +int kperf_timer_get_action(unsigned int timer, uint32_t *action); +int kperf_timer_set_action(unsigned int timer, uint32_t action); -extern int kperf_timer_go(void); -extern int kperf_timer_stop(void); +void kperf_timer_go(void); +void kperf_timer_stop(void); +void kperf_timer_reset(void); -extern unsigned kperf_timer_get_petid(void); -extern int kperf_timer_set_petid(unsigned count); +unsigned int kperf_timer_get_petid(void); +int kperf_timer_set_petid(unsigned int count); /* so PET thread can re-arm the timer */ -extern int kperf_timer_pet_set( unsigned timer, uint64_t elapsed_ticks ); +void kperf_timer_pet_rearm(uint64_t elapsed_ticks); + +#endif /* !defined(KPERF_TIMER_H) */ diff --git a/osfmk/kperf/kperfbsd.c b/osfmk/kperf/kperfbsd.c index 2a6554ab6..1b3ab5f5d 100644 --- a/osfmk/kperf/kperfbsd.c +++ b/osfmk/kperf/kperfbsd.c @@ -2,7 +2,7 @@ * Copyright (c) 2011 Apple Computer, Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,556 +22,474 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. 
- * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* sysctl interface for paramters from user-land */ +#include +#include +#include #include #include #include #include #include -#include -#include -#include -#include #include -#include -#include -#include +#include +#include #include +#include +#include +#include - -/* a pid which is allowed to control kperf without requiring root access */ -static pid_t blessed_pid = -1; -static boolean_t blessed_preempt = FALSE; +#include /* IDs for dispatch from SYSCTL macros */ -#define REQ_SAMPLING (1) -#define REQ_ACTION_COUNT (2) -#define REQ_ACTION_SAMPLERS (3) -#define REQ_TIMER_COUNT (4) -#define REQ_TIMER_PERIOD (5) -#define REQ_TIMER_PET (6) -#define REQ_TIMER_ACTION (7) -#define REQ_BLESS (8) -#define REQ_ACTION_USERDATA (9) -#define REQ_ACTION_FILTER_BY_TASK (10) -#define REQ_ACTION_FILTER_BY_PID (11) -#define REQ_KDBG_CALLSTACKS (12) -#define REQ_PET_IDLE_RATE (13) -#define REQ_BLESS_PREEMPT (14) -#define REQ_KDBG_CSWITCH (15) -#define REQ_CSWITCH_ACTION (16) -#define REQ_SIGNPOST_ACTION (17) - -/* simple state variables */ -int kperf_debug_level = 0; +#define REQ_SAMPLING (1) +#define REQ_ACTION_COUNT (2) +#define REQ_ACTION_SAMPLERS (3) +#define REQ_TIMER_COUNT (4) +#define REQ_TIMER_PERIOD (5) +#define REQ_TIMER_PET (6) +#define REQ_TIMER_ACTION (7) +#define REQ_BLESS (8) +#define REQ_ACTION_USERDATA (9) +#define REQ_ACTION_FILTER_BY_TASK (10) +#define REQ_ACTION_FILTER_BY_PID (11) +/* 12 unused */ +#define REQ_PET_IDLE_RATE (13) +#define REQ_BLESS_PREEMPT (14) +#define REQ_KDBG_CSWITCH (15) +#define REQ_RESET (16) +/* 17 unused */ +#define REQ_ACTION_UCALLSTACK_DEPTH (18) +#define REQ_ACTION_KCALLSTACK_DEPTH (19) +#define REQ_LIGHTWEIGHT_PET (20) +#define REQ_KDEBUG_ACTION (21) +#define REQ_KDEBUG_FILTER (22) -static lck_grp_attr_t *kperf_cfg_lckgrp_attr = NULL; -static lck_grp_t *kperf_cfg_lckgrp = NULL; -static lck_mtx_t kperf_cfg_lock; -static boolean_t kperf_cfg_initted = FALSE; +int kperf_debug_level = 0; 
-void kdbg_swap_global_state_pid(pid_t old_pid, pid_t new_pid); /* bsd/kern/kdebug.c */ +#if DEVELOPMENT || DEBUG +_Atomic long long kperf_pending_ipis = 0; +#endif /* DEVELOPMENT || DEBUG */ -/*************************** +/* + * kperf has a different sysctl model than others. + * + * For simple queries like the number of actions, the normal sysctl style + * of get/set works well. * - * lock init + * However, when requesting information about something specific, like an + * action, user space needs to provide some contextual information. This + * information is stored in a uint64_t array that includes the context, like + * the action ID it is interested in. If user space is getting the value from + * the kernel, then the get side of the sysctl is valid. If it is setting the + * value, then the get pointers are left NULL. * - ***************************/ + * These functions handle marshalling and unmarshalling data from sysctls. + */ -void -kperf_bootstrap(void) +static int +kperf_sysctl_get_set_uint32(struct sysctl_req *req, + uint32_t (*get)(void), int (*set)(uint32_t)) { - kperf_cfg_lckgrp_attr = lck_grp_attr_alloc_init(); - kperf_cfg_lckgrp = lck_grp_alloc_init("kperf cfg", - kperf_cfg_lckgrp_attr); - lck_mtx_init(&kperf_cfg_lock, kperf_cfg_lckgrp, LCK_ATTR_NULL); + assert(req != NULL); + assert(get != NULL); + assert(set != NULL); - kperf_cfg_initted = TRUE; -} + uint32_t value = 0; + if (req->oldptr) { + value = get(); + } -/*************************** - * - * sysctl handlers - * - ***************************/ + int error = sysctl_io_number(req, value, sizeof(value), &value, NULL); + + if (error || !req->newptr) { + return error; + } + + return set(value); +} static int -sysctl_timer_period( __unused struct sysctl_oid *oidp, struct sysctl_req *req ) +kperf_sysctl_get_set_int(struct sysctl_req *req, + int (*get)(void), int (*set)(int)) { - int error = 0; - uint64_t inputs[2], retval; - unsigned timer, set = 0; - - /* get 2x 64-bit words */ - error = SYSCTL_IN( 
req, inputs, 2*sizeof(inputs[0]) ); - if(error) - return (error); - - /* setup inputs */ - timer = (unsigned) inputs[0]; - if( inputs[1] != ~0ULL ) - set = 1; - - if( set ) - { - error = kperf_timer_set_period( timer, inputs[1] ); - if( error ) - return error; - } + assert(req != NULL); + assert(get != NULL); + assert(set != NULL); - error = kperf_timer_get_period(timer, &retval); - if(error) - return (error); + int value = 0; + if (req->oldptr) { + value = get(); + } - inputs[1] = retval; - - if( error == 0 ) - error = SYSCTL_OUT( req, inputs, 2*sizeof(inputs[0]) ); + int error = sysctl_io_number(req, value, sizeof(value), &value, NULL); - return error; + if (error || !req->newptr) { + return error; + } + + return set(value); } static int -sysctl_timer_action( __unused struct sysctl_oid *oidp, struct sysctl_req *req ) +kperf_sysctl_get_set_unsigned_uint32(struct sysctl_req *req, + int (*get)(unsigned int, uint32_t *), int (*set)(unsigned int, uint32_t)) { - int error = 0; - uint64_t inputs[2]; - uint32_t retval; - unsigned timer, set = 0; - - /* get 2x 64-bit words */ - error = SYSCTL_IN( req, inputs, 2*sizeof(inputs[0]) ); - if(error) - return (error); - - /* setup inputs */ - timer = (unsigned) inputs[0]; - if( inputs[1] != ~0ULL ) - set = 1; - - if( set ) - { - error = kperf_timer_set_action( timer, inputs[1] ); - if( error ) - return error; - } + assert(req != NULL); + assert(get != NULL); + assert(set != NULL); + + int error; + uint64_t inputs[2]; + if ((error = SYSCTL_IN(req, inputs, sizeof(inputs)))) { + return error; + } + + unsigned int action_id = (unsigned int)inputs[0]; + uint32_t new_value = (uint32_t)inputs[1]; - error = kperf_timer_get_action(timer, &retval); - if(error) - return (error); + if (req->oldptr != USER_ADDR_NULL) { + uint32_t value_out = 0; + if ((error = get(action_id, &value_out))) { + return error; + } - inputs[1] = retval; - - if( error == 0 ) - error = SYSCTL_OUT( req, inputs, 2*sizeof(inputs[0]) ); + inputs[1] = value_out; + } else 
{ + if ((error = set(action_id, new_value))) { + return error; + } + } - return error; + if (req->oldptr != USER_ADDR_NULL) { + error = SYSCTL_OUT(req, inputs, sizeof(inputs)); + return error; + } else { + return 0; + } } +/* + * These functions are essentially the same as the generic + * kperf_sysctl_get_set_unsigned_uint32, except they have unique input sizes. + */ + static int -sysctl_action_samplers( __unused struct sysctl_oid *oidp, - struct sysctl_req *req ) +sysctl_timer_period(struct sysctl_req *req) { - int error = 0; - uint64_t inputs[3]; - uint32_t retval; - unsigned actionid, set = 0; - - /* get 3x 64-bit words */ - error = SYSCTL_IN( req, inputs, 3*sizeof(inputs[0]) ); - if(error) - return (error); - - /* setup inputs */ - set = (unsigned) inputs[0]; - actionid = (unsigned) inputs[1]; - - if( set ) - { - error = kperf_action_set_samplers( actionid, inputs[2] ); - if( error ) - return error; - } + assert(req != NULL); - error = kperf_action_get_samplers(actionid, &retval); - if(error) - return (error); + int error; + uint64_t inputs[2]; + if ((error = SYSCTL_IN(req, inputs, sizeof(inputs)))) { + return error; + } + + unsigned int timer = (unsigned int)inputs[0]; + uint64_t new_period = inputs[1]; - inputs[2] = retval; - - if( error == 0 ) - error = SYSCTL_OUT( req, inputs, 3*sizeof(inputs[0]) ); + if (req->oldptr != USER_ADDR_NULL) { + uint64_t period_out = 0; + if ((error = kperf_timer_get_period(timer, &period_out))) { + return error; + } + + inputs[1] = period_out; + } else { + if ((error = kperf_timer_set_period(timer, new_period))) { + return error; + } + } - return error; + return SYSCTL_OUT(req, inputs, sizeof(inputs)); } static int -sysctl_action_userdata( __unused struct sysctl_oid *oidp, - struct sysctl_req *req ) +sysctl_action_filter(struct sysctl_req *req, boolean_t is_task_t) { - int error = 0; - uint64_t inputs[3]; - uint32_t retval; - unsigned actionid, set = 0; - - /* get 3x 64-bit words */ - error = SYSCTL_IN( req, inputs, 
3*sizeof(inputs[0]) ); - if(error) - return (error); - - /* setup inputs */ - set = (unsigned) inputs[0]; - actionid = (unsigned) inputs[1]; - - if( set ) - { - error = kperf_action_set_userdata( actionid, inputs[2] ); - if( error ) - return error; - } + assert(req != NULL); + + int error; + uint64_t inputs[2]; + if ((error = SYSCTL_IN(req, inputs, sizeof(inputs)))) { + return error; + } + + unsigned int actionid = (unsigned int)inputs[0]; + int new_filter = (int)inputs[1]; - error = kperf_action_get_userdata(actionid, &retval); - if(error) - return (error); + if (req->oldptr != USER_ADDR_NULL) { + int filter_out; + if ((error = kperf_action_get_filter(actionid, &filter_out))) { + return error; + } + + inputs[1] = filter_out; + } else { + int pid = is_task_t ? kperf_port_to_pid((mach_port_name_t)new_filter) + : new_filter; - inputs[2] = retval; - - if( error == 0 ) - error = SYSCTL_OUT( req, inputs, 3*sizeof(inputs[0]) ); + if ((error = kperf_action_set_filter(actionid, pid))) { + return error; + } + } - return error; + return SYSCTL_OUT(req, inputs, sizeof(inputs)); } static int -sysctl_action_filter( __unused struct sysctl_oid *oidp, - struct sysctl_req *req, int is_task_t ) +sysctl_bless(struct sysctl_req *req) { - int error = 0; - uint64_t inputs[3]; - int retval; - unsigned actionid, set = 0; - mach_port_name_t portname; - int pid; - - /* get 3x 64-bit words */ - error = SYSCTL_IN( req, inputs, 3*sizeof(inputs[0]) ); - if(error) - return (error); - - /* setup inputs */ - set = (unsigned) inputs[0]; - actionid = (unsigned) inputs[1]; - - if( set ) - { - if( is_task_t ) - { - portname = (mach_port_name_t) inputs[2]; - pid = kperf_port_to_pid(portname); - } - else - pid = (int) inputs[2]; - - error = kperf_action_set_filter( actionid, pid ); - if( error ) - return error; - } - - error = kperf_action_get_filter(actionid, &retval); - if(error) - return (error); + int value = ktrace_get_owning_pid(); + int error = sysctl_io_number(req, value, sizeof(value), &value, 
NULL); - inputs[2] = retval; - - if( error == 0 ) - error = SYSCTL_OUT( req, inputs, 3*sizeof(inputs[0]) ); + if (error || !req->newptr) { + return error; + } - return error; + return ktrace_set_owning_pid(value); } +/* sysctl handlers that use the generic functions */ + static int -sysctl_sampling( struct sysctl_oid *oidp, struct sysctl_req *req ) +sysctl_action_samplers(struct sysctl_req *req) { - int error = 0; - uint32_t value = 0; - - /* get the old value and process it */ - value = kperf_sampling_status(); - - /* copy out the old value, get the new value */ - error = sysctl_handle_int(oidp, &value, 0, req); - if (error || !req->newptr) - return (error); - - /* if that worked, and we're writing... */ - if( value ) - error = kperf_sampling_enable(); - else - error = kperf_sampling_disable(); - - return error; + return kperf_sysctl_get_set_unsigned_uint32(req, + kperf_action_get_samplers, kperf_action_set_samplers); } static int -sysctl_action_count( struct sysctl_oid *oidp, struct sysctl_req *req ) +sysctl_action_userdata(struct sysctl_req *req) { - int error = 0; - uint32_t value = 0; - - /* get the old value and process it */ - value = kperf_action_get_count(); - - /* copy out the old value, get the new value */ - error = sysctl_handle_int(oidp, &value, 0, req); - if (error || !req->newptr) - return (error); - - /* if that worked, and we're writing... */ - return kperf_action_set_count(value); + return kperf_sysctl_get_set_unsigned_uint32(req, + kperf_action_get_userdata, kperf_action_set_userdata); } static int -sysctl_timer_count( struct sysctl_oid *oidp, struct sysctl_req *req ) +sysctl_action_ucallstack_depth(struct sysctl_req *req) { - int error = 0; - uint32_t value = 0; - - /* get the old value and process it */ - value = kperf_timer_get_count(); - - /* copy out the old value, get the new value */ - error = sysctl_handle_int(oidp, &value, 0, req); - if (error || !req->newptr) - return (error); - - /* if that worked, and we're writing... 
*/ - return kperf_timer_set_count(value); + return kperf_sysctl_get_set_unsigned_uint32(req, + kperf_action_get_ucallstack_depth, kperf_action_set_ucallstack_depth); } static int -sysctl_timer_pet( struct sysctl_oid *oidp, struct sysctl_req *req ) +sysctl_action_kcallstack_depth(struct sysctl_req *req) { - int error = 0; - uint32_t value = 0; - - /* get the old value and process it */ - value = kperf_timer_get_petid(); - - /* copy out the old value, get the new value */ - error = sysctl_handle_int(oidp, &value, 0, req); - if (error || !req->newptr) - return (error); - - /* if that worked, and we're writing... */ - return kperf_timer_set_petid(value); + return kperf_sysctl_get_set_unsigned_uint32(req, + kperf_action_get_kcallstack_depth, kperf_action_set_kcallstack_depth); } static int -sysctl_bless( struct sysctl_oid *oidp, struct sysctl_req *req ) +sysctl_kdebug_action(struct sysctl_req *req) { - int error = 0; - int value = 0; - - /* get the old value and process it */ - value = blessed_pid; - - /* copy out the old value, get the new value */ - error = sysctl_handle_int(oidp, &value, 0, req); - if (error || !req->newptr) - return (error); - - /* if that worked, and we're writing... 
*/ - error = kperf_bless_pid(value); - - return error; + return kperf_sysctl_get_set_int(req, kperf_kdebug_get_action, + kperf_kdebug_set_action); } static int -sysctl_bless_preempt( struct sysctl_oid *oidp, struct sysctl_req *req ) +sysctl_kdebug_filter(struct sysctl_req *req) { - int error = 0; - int value = 0; + assert(req != NULL); - /* get the old value and process it */ - value = blessed_preempt; + if (req->oldptr != USER_ADDR_NULL) { + struct kperf_kdebug_filter *filter = NULL; + uint32_t n_debugids = kperf_kdebug_get_filter(&filter); + size_t filter_size = KPERF_KDEBUG_FILTER_SIZE(n_debugids); - /* copy out the old value, get the new value */ - error = sysctl_handle_int(oidp, &value, 0, req); - if (error || !req->newptr) - return (error); + if (n_debugids == 0) { + return EINVAL; + } - /* if that worked, and we're writing... */ - blessed_preempt = value ? TRUE : FALSE; + return SYSCTL_OUT(req, filter, filter_size); + } - return 0; + return kperf_kdebug_set_filter(req->newptr, (uint32_t)req->newlen); } - static int -sysctl_kdbg_callstacks( struct sysctl_oid *oidp, struct sysctl_req *req ) +kperf_sampling_set(uint32_t sample_start) { - int error = 0; - int value = 0; - - /* get the old value and process it */ - value = kperf_kdbg_get_stacks(); - - /* copy out the old value, get the new value */ - error = sysctl_handle_int(oidp, &value, 0, req); - if (error || !req->newptr) - return (error); - - /* if that worked, and we're writing... 
*/ - error = kperf_kdbg_set_stacks(value); - - return error; + if (sample_start) { + return kperf_sampling_enable(); + } else { + return kperf_sampling_disable(); + } } static int -sysctl_pet_idle_rate( struct sysctl_oid *oidp, struct sysctl_req *req ) +sysctl_sampling(struct sysctl_req *req) { - int error = 0; - int value = 0; - - /* get the old value and process it */ - value = kperf_get_pet_idle_rate(); - - /* copy out the old value, get the new value */ - error = sysctl_handle_int(oidp, &value, 0, req); - if (error || !req->newptr) - return (error); + return kperf_sysctl_get_set_uint32(req, kperf_sampling_status, + kperf_sampling_set); +} - /* if that worked, and we're writing... */ - kperf_set_pet_idle_rate(value); +static int +sysctl_action_count(struct sysctl_req *req) +{ + return kperf_sysctl_get_set_uint32(req, kperf_action_get_count, + kperf_action_set_count); +} - return error; +static int +sysctl_timer_count(struct sysctl_req *req) +{ + return kperf_sysctl_get_set_uint32(req, kperf_timer_get_count, + kperf_timer_set_count); } static int -sysctl_kdbg_cswitch( struct sysctl_oid *oidp, struct sysctl_req *req ) +sysctl_timer_action(struct sysctl_req *req) { - int value = kperf_kdbg_cswitch_get(); - int error = sysctl_handle_int(oidp, &value, 0, req); + return kperf_sysctl_get_set_unsigned_uint32(req, kperf_timer_get_action, + kperf_timer_set_action); +} - if (error || !req->newptr) { - return error; - } +static int +sysctl_timer_pet(struct sysctl_req *req) +{ + return kperf_sysctl_get_set_uint32(req, kperf_timer_get_petid, + kperf_timer_set_petid); +} - return kperf_kdbg_cswitch_set(value); +static int +sysctl_bless_preempt(struct sysctl_req *req) +{ + return sysctl_io_number(req, ktrace_root_set_owner_allowed, + sizeof(ktrace_root_set_owner_allowed), + &ktrace_root_set_owner_allowed, NULL); } static int -sysctl_cswitch_action( struct sysctl_oid *oidp, struct sysctl_req *req ) +sysctl_kperf_reset(struct sysctl_req *req) { - int value = 
kperf_cswitch_action_get(); - int error = sysctl_handle_int(oidp, &value, 0, req); + int should_reset = 0; - if (error || !req->newptr) { - return error; - } + int error = sysctl_io_number(req, should_reset, sizeof(should_reset), + &should_reset, NULL); + if (error) { + return error; + } - return kperf_cswitch_action_set(value); + if (should_reset) { + ktrace_reset(KTRACE_KPERF); + } + return 0; } static int -sysctl_signpost_action( struct sysctl_oid *oidp, struct sysctl_req *req ) +sysctl_pet_idle_rate(struct sysctl_req *req) { - int value = kperf_signpost_action_get(); - int error = sysctl_handle_int(oidp, &value, 0, req); + return kperf_sysctl_get_set_int(req, kperf_get_pet_idle_rate, + kperf_set_pet_idle_rate); +} - if (error || !req->newptr) { - return error; - } +static int +sysctl_lightweight_pet(struct sysctl_req *req) +{ + return kperf_sysctl_get_set_int(req, kperf_get_lightweight_pet, + kperf_set_lightweight_pet); +} - return kperf_signpost_action_set(value); +static int +sysctl_kdbg_cswitch(struct sysctl_req *req) +{ + return kperf_sysctl_get_set_int(req, kperf_kdbg_cswitch_get, + kperf_kdbg_cswitch_set); } -/* - * #define SYSCTL_HANDLER_ARGS (struct sysctl_oid *oidp, \ - * void *arg1, int arg2, \ - * struct sysctl_req *req ) - */ static int kperf_sysctl SYSCTL_HANDLER_ARGS { +#pragma unused(oidp, arg2) int ret; + uintptr_t type = (uintptr_t)arg1; - // __unused struct sysctl_oid *unused_oidp = oidp; - (void)arg2; + lck_mtx_lock(ktrace_lock); - if ( !kperf_cfg_initted ) - panic("kperf_bootstrap not called"); - - ret = kperf_access_check(); - if (ret) { - return ret; + if (req->oldptr == USER_ADDR_NULL && req->newptr != USER_ADDR_NULL) { + if ((ret = ktrace_configure(KTRACE_KPERF))) { + lck_mtx_unlock(ktrace_lock); + return ret; + } + } else { + if ((ret = ktrace_read_check())) { + lck_mtx_unlock(ktrace_lock); + return ret; + } } - lck_mtx_lock(&kperf_cfg_lock); - /* which request */ - switch( (uintptr_t) arg1 ) - { + switch (type) { case REQ_ACTION_COUNT: 
- ret = sysctl_action_count( oidp, req ); + ret = sysctl_action_count(req); break; case REQ_ACTION_SAMPLERS: - ret = sysctl_action_samplers( oidp, req ); + ret = sysctl_action_samplers(req); break; case REQ_ACTION_USERDATA: - ret = sysctl_action_userdata( oidp, req ); + ret = sysctl_action_userdata(req); break; case REQ_TIMER_COUNT: - ret = sysctl_timer_count( oidp, req ); + ret = sysctl_timer_count(req); break; case REQ_TIMER_PERIOD: - ret = sysctl_timer_period( oidp, req ); + ret = sysctl_timer_period(req); break; case REQ_TIMER_PET: - ret = sysctl_timer_pet( oidp, req ); + ret = sysctl_timer_pet(req); break; case REQ_TIMER_ACTION: - ret = sysctl_timer_action( oidp, req ); + ret = sysctl_timer_action(req); break; case REQ_SAMPLING: - ret = sysctl_sampling( oidp, req ); - break; - case REQ_KDBG_CALLSTACKS: - ret = sysctl_kdbg_callstacks( oidp, req ); + ret = sysctl_sampling(req); break; case REQ_KDBG_CSWITCH: - ret = sysctl_kdbg_cswitch( oidp, req ); + ret = sysctl_kdbg_cswitch(req); break; case REQ_ACTION_FILTER_BY_TASK: - ret = sysctl_action_filter( oidp, req, 1 ); + ret = sysctl_action_filter(req, TRUE); break; case REQ_ACTION_FILTER_BY_PID: - ret = sysctl_action_filter( oidp, req, 0 ); + ret = sysctl_action_filter(req, FALSE); + break; + case REQ_KDEBUG_ACTION: + ret = sysctl_kdebug_action(req); + break; + case REQ_KDEBUG_FILTER: + ret = sysctl_kdebug_filter(req); break; case REQ_PET_IDLE_RATE: - ret = sysctl_pet_idle_rate( oidp, req ); + ret = sysctl_pet_idle_rate(req); break; case REQ_BLESS_PREEMPT: - ret = sysctl_bless_preempt( oidp, req ); + ret = sysctl_bless_preempt(req); + break; + case REQ_RESET: + ret = sysctl_kperf_reset(req); break; - case REQ_CSWITCH_ACTION: - ret = sysctl_cswitch_action( oidp, req ); + case REQ_ACTION_UCALLSTACK_DEPTH: + ret = sysctl_action_ucallstack_depth(req); break; - case REQ_SIGNPOST_ACTION: - ret = sysctl_signpost_action( oidp, req ); + case REQ_ACTION_KCALLSTACK_DEPTH: + ret = sysctl_action_kcallstack_depth(req); break; + 
case REQ_LIGHTWEIGHT_PET: + ret = sysctl_lightweight_pet(req); + break; default: ret = ENOENT; break; } - lck_mtx_unlock(&kperf_cfg_lock); + lck_mtx_unlock(ktrace_lock); return ret; } @@ -579,218 +497,186 @@ kperf_sysctl SYSCTL_HANDLER_ARGS static int kperf_sysctl_bless_handler SYSCTL_HANDLER_ARGS { +#pragma unused(oidp, arg2) int ret; - // __unused struct sysctl_oid *unused_oidp = oidp; - (void)arg2; - - if ( !kperf_cfg_initted ) - panic("kperf_bootstrap not called"); - - lck_mtx_lock(&kperf_cfg_lock); - - /* which request */ - if ( (uintptr_t) arg1 == REQ_BLESS ) - ret = sysctl_bless( oidp, req ); - else - ret = ENOENT; - - lck_mtx_unlock(&kperf_cfg_lock); - - return ret; -} - -/*************************** - * - * Access control - * - ***************************/ -/* Validate whether the current process has priviledges to access - * kperf (and by extension, trace). Returns 0 if access is granted. - */ -int -kperf_access_check(void) -{ - proc_t p = current_proc(); - proc_t blessed_p; - int ret = 0; - boolean_t pid_gone = FALSE; - - /* check if the pid that held the lock is gone */ - blessed_p = proc_find(blessed_pid); - - if ( blessed_p != NULL ) - proc_rele(blessed_p); - else - pid_gone = TRUE; - - if ( blessed_pid == -1 || pid_gone ) { - /* check for root */ - ret = suser(kauth_cred_get(), &p->p_acflag); - if( !ret ) + lck_mtx_lock(ktrace_lock); + + /* if setting a new "blessed pid" (ktrace owning pid) */ + if (req->newptr != USER_ADDR_NULL) { + /* + * root can bypass the ktrace check when a flag is set (for + * backwards compatibility) or when ownership is maintained over + * subsystems resets (to allow the user space process that set + * ownership to unset it). 
+ */ + if (!((ktrace_root_set_owner_allowed || + ktrace_keep_ownership_on_reset) && + kauth_cred_issuser(kauth_cred_get()))) + { + if ((ret = ktrace_configure(KTRACE_KPERF))) { + lck_mtx_unlock(ktrace_lock); + return ret; + } + } + } else { + if ((ret = ktrace_read_check())) { + lck_mtx_unlock(ktrace_lock); return ret; - } - - /* check against blessed pid */ - if( p->p_pid != blessed_pid ) - return EACCES; - - /* access granted. */ - return 0; -} - -/* specify a pid as being able to access kperf/trace, depiste not - * being root - */ -int -kperf_bless_pid(pid_t newpid) -{ - proc_t p = NULL; - pid_t current_pid; - - p = current_proc(); - current_pid = p->p_pid; - - /* are we allowed to preempt? */ - if ( (newpid != -1) && (blessed_pid != -1) && - (blessed_pid != current_pid) && !blessed_preempt ) { - /* check if the pid that held the lock is gone */ - p = proc_find(blessed_pid); - - if ( p != NULL ) { - proc_rele(p); - return EACCES; } } - /* validate new pid */ - if ( newpid != -1 ) { - p = proc_find(newpid); - - if ( p == NULL ) - return EINVAL; - - proc_rele(p); + /* which request */ + if ((uintptr_t)arg1 == REQ_BLESS) { + ret = sysctl_bless(req); + } else { + ret = ENOENT; } - /* take trace facility as well */ - kdbg_swap_global_state_pid(blessed_pid, newpid); + lck_mtx_unlock(ktrace_lock); - blessed_pid = newpid; - blessed_preempt = FALSE; - - return 0; + return ret; } -/*************************** - * - * sysctl hooks - * - ***************************/ - /* root kperf node */ -SYSCTL_NODE(, OID_AUTO, kperf, CTLFLAG_RW|CTLFLAG_LOCKED, 0, + +SYSCTL_NODE(, OID_AUTO, kperf, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "kperf"); -/* action sub-section */ -SYSCTL_NODE(_kperf, OID_AUTO, action, CTLFLAG_RW|CTLFLAG_LOCKED, 0, +/* actions */ + +SYSCTL_NODE(_kperf, OID_AUTO, action, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "action"); SYSCTL_PROC(_kperf_action, OID_AUTO, count, - CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_ANYBODY, - (void*)REQ_ACTION_COUNT, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | 
CTLFLAG_LOCKED, + (void *)REQ_ACTION_COUNT, sizeof(int), kperf_sysctl, "I", "Number of actions"); SYSCTL_PROC(_kperf_action, OID_AUTO, samplers, - CTLFLAG_RW|CTLFLAG_ANYBODY, - (void*)REQ_ACTION_SAMPLERS, - 3*sizeof(uint64_t), kperf_sysctl, "UQ", - "What to sample what a trigger fires an action"); + CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_MASKED | CTLFLAG_LOCKED, + (void *)REQ_ACTION_SAMPLERS, + 3 * sizeof(uint64_t), kperf_sysctl, "UQ", + "What to sample when a trigger fires an action"); SYSCTL_PROC(_kperf_action, OID_AUTO, userdata, - CTLFLAG_RW|CTLFLAG_ANYBODY, - (void*)REQ_ACTION_USERDATA, - 3*sizeof(uint64_t), kperf_sysctl, "UQ", + CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_MASKED | CTLFLAG_LOCKED, + (void *)REQ_ACTION_USERDATA, + 3 * sizeof(uint64_t), kperf_sysctl, "UQ", "User data to attribute to action"); SYSCTL_PROC(_kperf_action, OID_AUTO, filter_by_task, - CTLFLAG_RW|CTLFLAG_ANYBODY, - (void*)REQ_ACTION_FILTER_BY_TASK, - 3*sizeof(uint64_t), kperf_sysctl, "UQ", + CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_MASKED | CTLFLAG_LOCKED, + (void *)REQ_ACTION_FILTER_BY_TASK, + 3 * sizeof(uint64_t), kperf_sysctl, "UQ", "Apply a task filter to the action"); SYSCTL_PROC(_kperf_action, OID_AUTO, filter_by_pid, - CTLFLAG_RW|CTLFLAG_ANYBODY, - (void*)REQ_ACTION_FILTER_BY_PID, - 3*sizeof(uint64_t), kperf_sysctl, "UQ", + CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_MASKED | CTLFLAG_LOCKED, + (void *)REQ_ACTION_FILTER_BY_PID, + 3 * sizeof(uint64_t), kperf_sysctl, "UQ", "Apply a pid filter to the action"); -/* timer sub-section */ -SYSCTL_NODE(_kperf, OID_AUTO, timer, CTLFLAG_RW|CTLFLAG_LOCKED, 0, +SYSCTL_PROC(_kperf_action, OID_AUTO, ucallstack_depth, + CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_MASKED | CTLFLAG_LOCKED, + (void *)REQ_ACTION_UCALLSTACK_DEPTH, + sizeof(int), kperf_sysctl, "I", + "Maximum number of frames to include in user callstacks"); + +SYSCTL_PROC(_kperf_action, OID_AUTO, kcallstack_depth, + CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_MASKED | CTLFLAG_LOCKED, + (void 
*)REQ_ACTION_KCALLSTACK_DEPTH, + sizeof(int), kperf_sysctl, "I", + "Maximum number of frames to include in kernel callstacks"); + +/* timers */ + +SYSCTL_NODE(_kperf, OID_AUTO, timer, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "timer"); SYSCTL_PROC(_kperf_timer, OID_AUTO, count, - CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_ANYBODY, - (void*)REQ_TIMER_COUNT, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED, + (void *)REQ_TIMER_COUNT, sizeof(int), kperf_sysctl, "I", "Number of time triggers"); SYSCTL_PROC(_kperf_timer, OID_AUTO, period, - CTLFLAG_RW|CTLFLAG_ANYBODY, - (void*)REQ_TIMER_PERIOD, - 2*sizeof(uint64_t), kperf_sysctl, "UQ", "Timer number and period"); + CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_MASKED | CTLFLAG_LOCKED, + (void *)REQ_TIMER_PERIOD, + 2 * sizeof(uint64_t), kperf_sysctl, "UQ", + "Timer number and period"); SYSCTL_PROC(_kperf_timer, OID_AUTO, action, - CTLFLAG_RW|CTLFLAG_ANYBODY, - (void*)REQ_TIMER_ACTION, - 2*sizeof(uint64_t), kperf_sysctl, "UQ", "Timer number and actionid"); + CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_MASKED | CTLFLAG_LOCKED, + (void *)REQ_TIMER_ACTION, + 2 * sizeof(uint64_t), kperf_sysctl, "UQ", + "Timer number and actionid"); SYSCTL_PROC(_kperf_timer, OID_AUTO, pet_timer, - CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_ANYBODY, - (void*)REQ_TIMER_PET, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED, + (void *)REQ_TIMER_PET, sizeof(int), kperf_sysctl, "I", "Which timer ID does PET"); +/* kdebug trigger */ + +SYSCTL_NODE(_kperf, OID_AUTO, kdebug, CTLFLAG_RW | CTLFLAG_LOCKED, 0, + "kdebug"); + +SYSCTL_PROC(_kperf_kdebug, OID_AUTO, action, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED, + (void*)REQ_KDEBUG_ACTION, + sizeof(int), kperf_sysctl, "I", "ID of action to trigger on kdebug events"); + +SYSCTL_PROC(_kperf_kdebug, OID_AUTO, filter, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_MASKED | CTLFLAG_LOCKED, + (void*)REQ_KDEBUG_FILTER, + sizeof(int), kperf_sysctl, "P", "The filter that determines which kdebug events 
trigger a sample"); + /* misc */ + SYSCTL_PROC(_kperf, OID_AUTO, sampling, - CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_ANYBODY, - (void*)REQ_SAMPLING, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED, + (void *)REQ_SAMPLING, sizeof(int), kperf_sysctl, "I", "Sampling running"); +SYSCTL_PROC(_kperf, OID_AUTO, reset, + CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_MASKED | CTLFLAG_LOCKED, + (void *)REQ_RESET, + 0, kperf_sysctl, "-", "Reset kperf"); + SYSCTL_PROC(_kperf, OID_AUTO, blessed_pid, - CTLTYPE_INT|CTLFLAG_RW, /* must be root */ - (void*)REQ_BLESS, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, /* must be root */ + (void *)REQ_BLESS, sizeof(int), kperf_sysctl_bless_handler, "I", "Blessed pid"); SYSCTL_PROC(_kperf, OID_AUTO, blessed_preempt, - CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_ANYBODY, - (void*)REQ_BLESS_PREEMPT, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED, + (void *)REQ_BLESS_PREEMPT, sizeof(int), kperf_sysctl, "I", "Blessed preemption"); -SYSCTL_PROC(_kperf, OID_AUTO, kdbg_callstacks, - CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_ANYBODY, - (void*)REQ_KDBG_CALLSTACKS, - sizeof(int), kperf_sysctl, "I", "Generate kdbg callstacks"); - SYSCTL_PROC(_kperf, OID_AUTO, kdbg_cswitch, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED, (void *)REQ_KDBG_CSWITCH, sizeof(int), kperf_sysctl, "I", "Generate context switch info"); SYSCTL_PROC(_kperf, OID_AUTO, pet_idle_rate, - CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_ANYBODY, - (void*)REQ_PET_IDLE_RATE, - sizeof(int), kperf_sysctl, "I", "Rate at which unscheduled threads are forced to be sampled in PET mode"); - -SYSCTL_PROC(_kperf, OID_AUTO, cswitch_action, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY, - (void*)REQ_CSWITCH_ACTION, - sizeof(int), kperf_sysctl, "I", "ID of action to trigger on context-switch"); - -SYSCTL_PROC(_kperf, OID_AUTO, signpost_action, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY, - (void*)REQ_SIGNPOST_ACTION, - sizeof(int), kperf_sysctl, "I", "ID of action to 
trigger on signposts"); + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED, + (void *)REQ_PET_IDLE_RATE, + sizeof(int), kperf_sysctl, "I", + "Rate at which unscheduled threads are forced to be sampled in " + "PET mode"); + +SYSCTL_PROC(_kperf, OID_AUTO, lightweight_pet, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED, + (void *)REQ_LIGHTWEIGHT_PET, + sizeof(int), kperf_sysctl, "I", + "Status of lightweight PET mode"); /* debug */ -SYSCTL_INT(_kperf, OID_AUTO, debug_level, CTLFLAG_RW, +SYSCTL_INT(_kperf, OID_AUTO, debug_level, CTLFLAG_RW | CTLFLAG_LOCKED, &kperf_debug_level, 0, "debug level"); +#if DEVELOPMENT || DEBUG +SYSCTL_QUAD(_kperf, OID_AUTO, already_pending_ipis, + CTLFLAG_RD | CTLFLAG_LOCKED, + &kperf_pending_ipis, ""); +#endif /* DEVELOPMENT || DEBUG */ diff --git a/osfmk/kperf/meminfo.c b/osfmk/kperf/meminfo.c index b7910aba0..15de26436 100644 --- a/osfmk/kperf/meminfo.c +++ b/osfmk/kperf/meminfo.c @@ -52,7 +52,7 @@ kperf_meminfo_sample(struct meminfo *mi, struct kperf_context *context) thread_t thread = context->cur_thread; - BUF_INFO1(PERF_MI_SAMPLE, (uintptr_t)thread_tid(thread)); + BUF_INFO(PERF_MI_SAMPLE | DBG_FUNC_START, (uintptr_t)thread_tid(thread)); task = get_threadtask(thread); @@ -79,13 +79,15 @@ kperf_meminfo_sample(struct meminfo *mi, struct kperf_context *context) } else { mi->purgeable_volatile_compressed = UINT64_MAX; } + + BUF_INFO(PERF_MI_SAMPLE | DBG_FUNC_END, (uintptr_t)thread_tid(thread)); } /* log an existing sample into the buffer */ void kperf_meminfo_log(struct meminfo *mi) { - BUF_DATA3(PERF_MI_DATA, mi->phys_footprint, mi->purgeable_volatile, - mi->purgeable_volatile_compressed); + BUF_DATA(PERF_MI_DATA, mi->phys_footprint, mi->purgeable_volatile, + mi->purgeable_volatile_compressed); } diff --git a/osfmk/kperf/pet.c b/osfmk/kperf/pet.c index e00f6a045..52d2909ce 100644 --- a/osfmk/kperf/pet.c +++ b/osfmk/kperf/pet.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2011 Apple Computer, Inc. All rights reserved. 
+ * Copyright (c) 2011-2016 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,363 +22,600 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* all thread states code */ #include -#include -#include #include -#include - +#include #include #include #include #include #include -#include +#include #include +#include + +/* action ID to call for each sample + * + * Address is used as the sync point for waiting. + */ +static unsigned int pet_action_id = 0; + +static lck_mtx_t *pet_lock; +static boolean_t pet_initted = FALSE; +static boolean_t pet_running = FALSE; -/* timer id to call back on */ -static unsigned pet_timerid = 0; +/* number of callstack samples to skip for idle threads */ +static uint32_t pet_idle_rate = KPERF_PET_DEFAULT_IDLE_RATE; -/* aciton ID to call - * We also use this as the sync point for waiting, for no good reason +/* + * Lightweight PET mode samples the system less-intrusively than normal PET + * mode. 
Instead of iterating tasks and threads on each sample, it increments + * a global generation count, kperf_pet_gen, which is checked as threads are + * context switched on-core. If the thread's local generation count is older + * than the global generation, the thread samples itself. + * + * | | + * thread A +--+---------| + * | | + * thread B |--+---------------| + * | | + * thread C | | |------------------------------------- + * | | | + * thread D | | | |------------------------------- + * | | | | + * +--+---------+-----+--------------------------------> time + * | │ | + * | +-----+--- threads sampled when they come on-core in + * | kperf_pet_switch_context + * | + * +--- PET timer fire, sample on-core threads A and B, + * increment kperf_pet_gen */ -static unsigned pet_actionid = 0; +static boolean_t lightweight_pet = FALSE; -/* the actual thread pointer */ -static thread_t pet_thread = NULL; +/* + * Whether or not lightweight PET and sampling is active. + */ +boolean_t kperf_lightweight_pet_active = FALSE; -/* Lock on which to synchronise */ -static IOLock *pet_lock = NULL; +uint32_t kperf_pet_gen = 0; -/* where to sample data to */ -static struct kperf_sample pet_sample_buf; +static struct kperf_sample *pet_sample; -static int pet_idle_rate = 15; +/* thread lifecycle */ -/* sample an actual, honest to god thread! 
*/ -static void -pet_sample_thread( thread_t thread ) -{ - struct kperf_context ctx; - task_t task; - unsigned skip_callstack; +static kern_return_t pet_init(void); +static void pet_start(void); +static void pet_stop(void); - /* work out the context */ - ctx.cur_thread = thread; - ctx.cur_pid = 0; +/* PET thread-only */ - task = chudxnu_task_for_thread(thread); - if(task) - ctx.cur_pid = chudxnu_pid_for_task(task); +static void pet_thread_loop(void *param, wait_result_t wr); +static void pet_thread_idle(void); +static void pet_thread_work_unit(void); - skip_callstack = (chudxnu_thread_get_dirty(thread) == TRUE) || ((thread->kperf_pet_cnt % (uint64_t)pet_idle_rate) == 0) ? 0 : SAMPLE_FLAG_EMPTY_CALLSTACK; +/* listing things to sample */ - /* do the actual sample */ - kperf_sample( &pet_sample_buf, &ctx, pet_actionid, - SAMPLE_FLAG_IDLE_THREADS | skip_callstack ); +static task_array_t pet_tasks = NULL; +static vm_size_t pet_tasks_size = 0; +static vm_size_t pet_tasks_count = 0; - if (!skip_callstack) - chudxnu_thread_set_dirty(thread, FALSE); +static thread_array_t pet_threads = NULL; +static vm_size_t pet_threads_size = 0; +static vm_size_t pet_threads_count = 0; - thread->kperf_pet_cnt++; -} +static kern_return_t pet_tasks_prepare(void); +static kern_return_t pet_tasks_prepare_internal(void); -/* given a list of threads, preferably stopped, sample 'em! */ -static void -pet_sample_thread_list( mach_msg_type_number_t threadc, thread_array_t threadv ) -{ - unsigned int i; - int ncpu; +static kern_return_t pet_threads_prepare(task_t task); - for( i = 0; i < threadc; i++ ) - { - thread_t thread = threadv[i]; +/* sampling */ - if( !thread ) - /* XXX? 
*/ - continue; +static void pet_sample_all_tasks(uint32_t idle_rate); +static void pet_sample_task(task_t task, uint32_t idle_rate); +static void pet_sample_thread(int pid, thread_t thread, uint32_t idle_rate); - for (ncpu = 0; ncpu < machine_info.logical_cpu_max; ++ncpu) - { - thread_t candidate = kperf_thread_on_cpus[ncpu]; - if (candidate && candidate->thread_id == thread->thread_id) - break; - } +/* functions called by other areas of kperf */ - /* the thread was not on a CPU */ - if (ncpu == machine_info.logical_cpu_max) - pet_sample_thread( thread ); +void +kperf_pet_fire_before(void) +{ + if (!pet_initted || !pet_running) { + return; + } + + if (lightweight_pet) { + BUF_INFO(PERF_PET_SAMPLE); + OSIncrementAtomic(&kperf_pet_gen); } } -/* given a task (preferably stopped), sample all the threads in it */ -static void -pet_sample_task( task_t task ) +void +kperf_pet_fire_after(void) { - mach_msg_type_number_t threadc; - thread_array_t threadv; - kern_return_t kr; - - kr = chudxnu_task_threads(task, &threadv, &threadc); - if( kr != KERN_SUCCESS ) - { - BUF_INFO2(PERF_PET_ERROR, ERR_THREAD, kr); + if (!pet_initted || !pet_running) { return; } - pet_sample_thread_list( threadc, threadv ); - - chudxnu_free_thread_list(&threadv, &threadc); + if (lightweight_pet) { + kperf_timer_pet_rearm(0); + } else { + thread_wakeup(&pet_action_id); + } } -/* given a list of tasks, sample all the threads in 'em */ -static void -pet_sample_task_list( int taskc, task_array_t taskv ) +void +kperf_pet_on_cpu(thread_t thread, thread_continue_t continuation, + uintptr_t *starting_fp) { - int i; - - for( i = 0; i < taskc; i++ ) - { - kern_return_t kr; - task_t task = taskv[i]; - - /* FIXME: necessary? 
old code did this, our hacky - * filtering code does, too + assert(thread != NULL); + assert(ml_get_interrupts_enabled() == FALSE); + + if (thread->kperf_pet_gen != kperf_pet_gen) { + BUF_VERB(PERF_PET_SAMPLE_THREAD | DBG_FUNC_START, kperf_pet_gen, thread->kperf_pet_gen); + + struct kperf_context ctx = { + .cur_thread = thread, + .cur_pid = task_pid(get_threadtask(thread)), + .starting_fp = starting_fp, + }; + /* + * Use a per-CPU interrupt buffer, since this is only called + * while interrupts are disabled, from the scheduler. */ - if(!task) { - continue; + struct kperf_sample *sample = kperf_intr_sample_buffer(); + if (!sample) { + BUF_VERB(PERF_PET_SAMPLE_THREAD | DBG_FUNC_END, 1); + return; } - /* try and stop any task other than the kernel task */ - if( task != kernel_task ) - { - kr = task_suspend_internal( task ); - - /* try the next task */ - if( kr != KERN_SUCCESS ) - continue; + unsigned int flags = SAMPLE_FLAG_NON_INTERRUPT | SAMPLE_FLAG_PEND_USER; + if (continuation != NULL) { + flags |= SAMPLE_FLAG_CONTINUATION; } + kperf_sample(sample, &ctx, pet_action_id, flags); - /* sample it */ - pet_sample_task( task ); + BUF_VERB(PERF_PET_SAMPLE_THREAD | DBG_FUNC_END); + } else { + BUF_VERB(PERF_PET_SAMPLE_THREAD, kperf_pet_gen, thread->kperf_pet_gen); + } +} - /* if it wasn't the kernel, resume it */ - if( task != kernel_task ) - (void) task_resume_internal(task); +void +kperf_pet_config(unsigned int action_id) +{ + kern_return_t kr = pet_init(); + if (kr != KERN_SUCCESS) { + return; } + + lck_mtx_lock(pet_lock); + + BUF_INFO(PERF_PET_THREAD, 3, action_id); + + if (action_id == 0) { + pet_stop(); + } else { + pet_start(); + } + + pet_action_id = action_id; + + lck_mtx_unlock(pet_lock); } -static void -pet_sample_all_tasks(void) +/* handle resource allocation */ + +void +pet_start(void) { - task_array_t taskv = NULL; - mach_msg_type_number_t taskc = 0; - kern_return_t kr; + lck_mtx_assert(pet_lock, LCK_MTX_ASSERT_OWNED); - kr = chudxnu_all_tasks(&taskv, &taskc); 
+ if (pet_running) { + return; + } - if( kr != KERN_SUCCESS ) - { - BUF_INFO2(PERF_PET_ERROR, ERR_TASK, kr); + pet_sample = kalloc(sizeof(struct kperf_sample)); + if (!pet_sample) { return; } - pet_sample_task_list( taskc, taskv ); - chudxnu_free_task_list(&taskv, &taskc); + pet_running = TRUE; } -#if 0 -static void -pet_sample_pid_filter(void) +void +pet_stop(void) { - task_t *taskv = NULL; - int *pidv, pidc, i; - vm_size_t asize; - - kperf_filter_pid_list( &pidc, &pidv ); - if( pidc == 0 ) - { - BUF_INFO2(PERF_PET_ERROR, ERR_PID, 0); + lck_mtx_assert(pet_lock, LCK_MTX_ASSERT_OWNED); + + if (!pet_initted) { return; } - asize = pidc * sizeof(task_t); - taskv = kalloc( asize ); + if (pet_tasks != NULL) { + assert(pet_tasks_size != 0); + kfree(pet_tasks, pet_tasks_size); - if( taskv == NULL ) - goto out; - - /* convert the pid list into a task list */ - for( i = 0; i < pidc; i++ ) - { - int pid = pidv[i]; - if( pid == -1 ) - taskv[i] = NULL; - else - taskv[i] = chudxnu_task_for_pid(pid); + pet_tasks = NULL; + pet_tasks_size = 0; + pet_tasks_count = 0; } - /* now sample the task list */ - pet_sample_task_list( pidc, taskv ); + if (pet_threads != NULL) { + assert(pet_threads_size != 0); + kfree(pet_threads, pet_threads_size); + + pet_threads = NULL; + pet_threads_size = 0; + pet_threads_count = 0; + } - kfree(taskv, asize); + if (pet_sample != NULL) { + kfree(pet_sample, sizeof(struct kperf_sample)); + pet_sample = NULL; + } -out: - kperf_filter_free_pid_list( &pidc, &pidv ); + pet_running = FALSE; } -#endif -/* do the pet sample */ -static void -pet_work_unit(void) +/* + * Lazily initialize PET. The PET thread never exits once PET has been used + * once. 
+ */ +static kern_return_t +pet_init(void) { - int pid_filter; + if (pet_initted) { + return KERN_SUCCESS; + } - /* check if we're filtering on pid */ - // pid_filter = kperf_filter_on_pid(); - pid_filter = 0; // FIXME + /* make the sync point */ + pet_lock = lck_mtx_alloc_init(&kperf_lck_grp, NULL); + assert(pet_lock); -#if 0 - if( pid_filter ) - { - BUF_INFO1(PERF_PET_SAMPLE | DBG_FUNC_START, 1); - pet_sample_pid_filter(); - } - else -#endif - { - /* otherwise filter everything */ - BUF_INFO1(PERF_PET_SAMPLE | DBG_FUNC_START, 0); - pet_sample_all_tasks(); + /* create the thread */ + + BUF_INFO(PERF_PET_THREAD, 0); + thread_t t; + kern_return_t kr = kernel_thread_start(pet_thread_loop, NULL, &t); + if (kr != KERN_SUCCESS) { + lck_mtx_free(pet_lock, &kperf_lck_grp); + return kr; } - BUF_INFO1(PERF_PET_SAMPLE | DBG_FUNC_END, 0); + thread_set_thread_name(t, "kperf sampling"); + /* let the thread hold the only reference */ + thread_deallocate(t); + + pet_initted = TRUE; + return KERN_SUCCESS; } -/* sleep indefinitely */ -static void -pet_idle(void) +/* called by PET thread only */ + +static void +pet_thread_work_unit(void) { - IOLockSleep(pet_lock, &pet_actionid, THREAD_UNINT); + pet_sample_all_tasks(pet_idle_rate); } -/* loop between sampling and waiting */ static void -pet_thread_loop( __unused void *param, __unused wait_result_t wr ) +pet_thread_idle(void) { + lck_mtx_assert(pet_lock, LCK_MTX_ASSERT_OWNED); + + (void)lck_mtx_sleep(pet_lock, LCK_SLEEP_DEFAULT, &pet_action_id, + THREAD_UNINT); +} + +__attribute__((noreturn)) +static void +pet_thread_loop(void *param, wait_result_t wr) +{ +#pragma unused(param, wr) uint64_t work_unit_ticks; - BUF_INFO1(PERF_PET_THREAD, 1); + BUF_INFO(PERF_PET_THREAD, 1); - IOLockLock(pet_lock); - while(1) - { - BUF_INFO1(PERF_PET_IDLE, 0); - pet_idle(); + lck_mtx_lock(pet_lock); + for (;;) { + BUF_INFO(PERF_PET_IDLE); + pet_thread_idle(); - BUF_INFO1(PERF_PET_RUN, 0); + BUF_INFO(PERF_PET_RUN); /* measure how long the work unit takes 
*/ work_unit_ticks = mach_absolute_time(); - pet_work_unit(); + pet_thread_work_unit(); work_unit_ticks = mach_absolute_time() - work_unit_ticks; /* re-program the timer */ - kperf_timer_pet_set( pet_timerid, work_unit_ticks ); + kperf_timer_pet_rearm(work_unit_ticks); + } +} - /* FIXME: break here on a condition? */ +/* sampling */ + +static void +pet_sample_thread(int pid, thread_t thread, uint32_t idle_rate) +{ + lck_mtx_assert(pet_lock, LCK_MTX_ASSERT_OWNED); + + uint32_t sample_flags = SAMPLE_FLAG_IDLE_THREADS; + + BUF_VERB(PERF_PET_SAMPLE_THREAD | DBG_FUNC_START); + + /* work out the context */ + struct kperf_context ctx = { + .cur_thread = thread, + .cur_pid = pid, + }; + + boolean_t thread_dirty = kperf_thread_get_dirty(thread); + + /* + * Clean a dirty thread and skip callstack sample if the thread was not + * dirty and thread has skipped less than pet_idle_rate samples. + */ + if (thread_dirty) { + kperf_thread_set_dirty(thread, FALSE); + } else if ((thread->kperf_pet_cnt % idle_rate) != 0) { + sample_flags |= SAMPLE_FLAG_EMPTY_CALLSTACK; } + thread->kperf_pet_cnt++; + + kperf_sample(pet_sample, &ctx, pet_action_id, sample_flags); + + BUF_VERB(PERF_PET_SAMPLE_THREAD | DBG_FUNC_END); } -/* make sure the thread takes a new period value */ -void -kperf_pet_timer_config( unsigned timerid, unsigned actionid ) +static kern_return_t +pet_threads_prepare(task_t task) { - if( !pet_lock ) - return; + lck_mtx_assert(pet_lock, LCK_MTX_ASSERT_OWNED); + + vm_size_t threads_size_needed; + + if (task == TASK_NULL) { + return KERN_INVALID_ARGUMENT; + } + + for (;;) { + task_lock(task); + + if (!task->active) { + task_unlock(task); - /* hold the lock so pet thread doesn't run while we do this */ - IOLockLock(pet_lock); + return KERN_FAILURE; + } + + /* do we have the memory we need? 
*/ + threads_size_needed = task->thread_count * sizeof(thread_t); + if (threads_size_needed <= pet_threads_size) { + break; + } - BUF_INFO1(PERF_PET_THREAD, 3); + /* not enough memory, unlock the task and increase allocation */ + task_unlock(task); - /* set values */ - pet_timerid = timerid; - pet_actionid = actionid; + if (pet_threads_size != 0) { + kfree(pet_threads, pet_threads_size); + } + + assert(threads_size_needed > 0); + pet_threads_size = threads_size_needed; - /* done */ - IOLockUnlock(pet_lock); + pet_threads = kalloc(pet_threads_size); + if (pet_threads == NULL) { + pet_threads_size = 0; + return KERN_RESOURCE_SHORTAGE; + } + } + + /* have memory and the task is locked and active */ + thread_t thread; + pet_threads_count = 0; + queue_iterate(&(task->threads), thread, thread_t, task_threads) { + thread_reference_internal(thread); + pet_threads[pet_threads_count++] = thread; + } + + /* can unlock task now that threads are referenced */ + task_unlock(task); + + return (pet_threads_count == 0) ? KERN_FAILURE : KERN_SUCCESS; } -/* make the thread run! */ -void -kperf_pet_thread_go(void) +static void +pet_sample_task(task_t task, uint32_t idle_rate) { - if( !pet_lock ) + lck_mtx_assert(pet_lock, LCK_MTX_ASSERT_OWNED); + + BUF_VERB(PERF_PET_SAMPLE_TASK | DBG_FUNC_START); + + kern_return_t kr = pet_threads_prepare(task); + if (kr != KERN_SUCCESS) { + BUF_INFO(PERF_PET_ERROR, ERR_THREAD, kr); + BUF_VERB(PERF_PET_SAMPLE_TASK | DBG_FUNC_END, 1); return; + } + + int pid = task_pid(task); - /* Make the thread go */ - IOLockWakeup(pet_lock, &pet_actionid, FALSE); + for (unsigned int i = 0; i < pet_threads_count; i++) { + thread_t thread = pet_threads[i]; + int cpu; + assert(thread); + + /* do not sample the thread if it was on a CPU during the IPI. 
*/ + for (cpu = 0; cpu < machine_info.logical_cpu_max; cpu++) { + thread_t candidate = kperf_thread_on_cpus[cpu]; + if (candidate && (thread_tid(candidate) == thread_tid(thread))) { + break; + } + } + + /* the thread was not on a CPU */ + if (cpu == machine_info.logical_cpu_max) { + pet_sample_thread(pid, thread, idle_rate); + } + + thread_deallocate(pet_threads[i]); + } + + BUF_VERB(PERF_PET_SAMPLE_TASK | DBG_FUNC_END, pet_threads_count); } +static kern_return_t +pet_tasks_prepare_internal(void) +{ + lck_mtx_assert(pet_lock, LCK_MTX_ASSERT_OWNED); -/* wait for the pet thread to finish a run */ -void -kperf_pet_thread_wait(void) + vm_size_t tasks_size_needed = 0; + + for (;;) { + lck_mtx_lock(&tasks_threads_lock); + + /* do we have the memory we need? */ + tasks_size_needed = tasks_count * sizeof(task_t); + if (tasks_size_needed <= pet_tasks_size) { + break; + } + + /* unlock and allocate more memory */ + lck_mtx_unlock(&tasks_threads_lock); + + /* grow task array */ + if (tasks_size_needed > pet_tasks_size) { + if (pet_tasks_size != 0) { + kfree(pet_tasks, pet_tasks_size); + } + + assert(tasks_size_needed > 0); + pet_tasks_size = tasks_size_needed; + + pet_tasks = (task_array_t)kalloc(pet_tasks_size); + if (pet_tasks == NULL) { + pet_tasks_size = 0; + return KERN_RESOURCE_SHORTAGE; + } + } + } + + return KERN_SUCCESS; +} + +static kern_return_t +pet_tasks_prepare(void) { - if( !pet_lock ) - return; + lck_mtx_assert(pet_lock, LCK_MTX_ASSERT_OWNED); + + /* allocate space and take the tasks_threads_lock */ + kern_return_t kr = pet_tasks_prepare_internal(); + if (KERN_SUCCESS != kr) { + return kr; + } + lck_mtx_assert(&tasks_threads_lock, LCK_MTX_ASSERT_OWNED); + + /* make sure the tasks are not deallocated after dropping the lock */ + task_t task; + pet_tasks_count = 0; + queue_iterate(&tasks, task, task_t, tasks) { + if (task != kernel_task) { + task_reference_internal(task); + pet_tasks[pet_tasks_count++] = task; + } + } + + lck_mtx_unlock(&tasks_threads_lock); - 
/* acquire the lock to ensure the thread is parked. */ - IOLockLock(pet_lock); - IOLockUnlock(pet_lock); + return KERN_SUCCESS; } -/* keep the pet thread around while we run */ -int -kperf_pet_init(void) +static void +pet_sample_all_tasks(uint32_t idle_rate) { - kern_return_t rc; - thread_t t; + lck_mtx_assert(pet_lock, LCK_MTX_ASSERT_OWNED); - if( pet_thread != NULL ) - return 0; + BUF_INFO(PERF_PET_SAMPLE | DBG_FUNC_START); - /* make the sync poing */ - pet_lock = IOLockAlloc(); - if( pet_lock == NULL ) - return ENOMEM; + kern_return_t kr = pet_tasks_prepare(); + if (kr != KERN_SUCCESS) { + BUF_INFO(PERF_PET_ERROR, ERR_TASK, kr); + BUF_INFO(PERF_PET_SAMPLE | DBG_FUNC_END, 0); + return; + } - /* create the thread */ - BUF_INFO1(PERF_PET_THREAD, 0); - rc = kernel_thread_start( pet_thread_loop, NULL, &t ); - if( rc != KERN_SUCCESS ) - { - IOLockFree( pet_lock ); - pet_lock = NULL; - return ENOMEM; + for (unsigned int i = 0; i < pet_tasks_count; i++) { + task_t task = pet_tasks[i]; + + if (task != kernel_task) { + kr = task_suspend_internal(task); + if (kr != KERN_SUCCESS) { + continue; + } + } + + pet_sample_task(task, idle_rate); + + if (task != kernel_task) { + task_resume_internal(task); + } } - /* OK! 
*/ - return 0; + for(unsigned int i = 0; i < pet_tasks_count; i++) { + task_deallocate(pet_tasks[i]); + } + + BUF_INFO(PERF_PET_SAMPLE | DBG_FUNC_END, pet_tasks_count); } +/* support sysctls */ + int -kperf_get_pet_idle_rate( void ) +kperf_get_pet_idle_rate(void) { return pet_idle_rate; } -void -kperf_set_pet_idle_rate( int val ) +int +kperf_set_pet_idle_rate(int val) { pet_idle_rate = val; + + return 0; +} + +int +kperf_get_lightweight_pet(void) +{ + return lightweight_pet; +} + +int +kperf_set_lightweight_pet(int val) +{ + if (kperf_sampling_status() == KPERF_SAMPLING_ON) { + return EBUSY; + } + + lightweight_pet = (val == 1); + kperf_lightweight_pet_active_update(); + + return 0; +} + +void +kperf_lightweight_pet_active_update(void) +{ + kperf_lightweight_pet_active = (kperf_sampling_status() && lightweight_pet); + kperf_on_cpu_update(); } diff --git a/osfmk/kperf/pet.h b/osfmk/kperf/pet.h index 03c411a01..f8c6c1720 100644 --- a/osfmk/kperf/pet.h +++ b/osfmk/kperf/pet.h @@ -26,21 +26,30 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ +#include -/* create the pet thread */ -extern int kperf_pet_init(void); +#define KPERF_PET_DEFAULT_IDLE_RATE (15) -/* Kick the pet thread so it runs a sample of all threads */ -extern void kperf_pet_thread_go(void); +extern boolean_t kperf_lightweight_pet_active; +extern uint32_t kperf_pet_gen; -/* ensure the pet thread has stopped sampling */ -extern void kperf_pet_thread_wait(void); +/* prepare PET to be able to fire action with given ID, or disable PET */ +void kperf_pet_config(unsigned int action_id); -/* tell pet the timer parameters */ -extern void kperf_pet_timer_config( unsigned timerid, unsigned actionid ); +/* fire off a PET sample, both before and after on-core samples */ +void kperf_pet_fire_before(void); +void kperf_pet_fire_after(void); -/* get/set rate at which PET forces threads to be sampled */ -extern int kperf_get_pet_idle_rate( void ); -extern void kperf_set_pet_idle_rate( int val ); +/* notify PET of new threads 
switching on */ +void kperf_pet_on_cpu(thread_t thread, thread_continue_t continuation, + uintptr_t *starting_frame); +/* get/set rate at which idle threads are sampled by PET */ +int kperf_get_pet_idle_rate(void); +int kperf_set_pet_idle_rate(int val); +/* get/set whether lightweight PET is enabled */ +int kperf_get_lightweight_pet(void); +int kperf_set_lightweight_pet(int val); + +void kperf_lightweight_pet_active_update(void); diff --git a/osfmk/kperf/sample.h b/osfmk/kperf/sample.h index fb6df72d2..42637fa31 100644 --- a/osfmk/kperf/sample.h +++ b/osfmk/kperf/sample.h @@ -2,7 +2,7 @@ * Copyright (c) 2011 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,33 +22,37 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. 
- * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -#include "threadinfo.h" +#ifndef KPERF_SAMPLE_H +#define KPERF_SAMPLE_H + +#include +#include #include "callstack.h" #include "kperf_kpc.h" #include "meminfo.h" -#ifndef __KPERF_SAMPLE_H__ -#define __KPERF_SAMPLE_H__ +struct kperf_sample { + struct kperf_thread_info th_info; + struct kperf_thread_scheduling th_scheduling; + struct kperf_thread_snapshot th_snapshot; + struct kperf_thread_dispatch th_dispatch; + + struct kperf_task_snapshot tk_snapshot; -// what goes in a sample -struct kperf_sample -{ - struct threadinfo threadinfo; - struct tinfo_ex tinfo_ex; - struct callstack kcallstack; - struct callstack ucallstack; - struct meminfo meminfo; + struct callstack kcallstack; + struct callstack ucallstack; + struct meminfo meminfo; #if KPC struct kpcdata kpcdata; #endif }; -// cache of thread on CPUs during the IPI +/* cache of threads on each CPU during a timer fire */ extern thread_t *kperf_thread_on_cpus; -#endif /* __KPERF_SAMPLE_H__ */ +#endif /* !defined(KPERF_SAMPLE_H) */ diff --git a/osfmk/kperf/task_samplers.c b/osfmk/kperf/task_samplers.c new file mode 100644 index 000000000..ae49da4b4 --- /dev/null +++ b/osfmk/kperf/task_samplers.c @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2016 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. 
+ * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include + +#include + +extern boolean_t workqueue_get_pwq_exceeded(void *v, boolean_t *exceeded_total, + boolean_t *exceeded_constrained); +extern boolean_t memorystatus_proc_is_dirty_unsafe(void *v); + +void +kperf_task_snapshot_sample(struct kperf_task_snapshot *tksn, + struct kperf_context *ctx) +{ + thread_t thread; + task_t task; + boolean_t wq_state_available = FALSE; + boolean_t exceeded_total, exceeded_constrained; + + BUF_INFO(PERF_TK_SNAP_SAMPLE | DBG_FUNC_START); + + assert(tksn != NULL); + assert(ctx != NULL); + + thread = ctx->cur_thread; + task = get_threadtask(thread); + + tksn->kptksn_flags = 0; + if (task->effective_policy.tep_darwinbg) { + tksn->kptksn_flags |= KPERF_TASK_FLAG_DARWIN_BG; + } + if (task->requested_policy.trp_role == TASK_FOREGROUND_APPLICATION) { + tksn->kptksn_flags |= KPERF_TASK_FLAG_FOREGROUND; + } + if (task->requested_policy.trp_boosted == 1) { + tksn->kptksn_flags |= KPERF_TASK_FLAG_BOOSTED; + } +#if CONFIG_MEMORYSTATUS + if (memorystatus_proc_is_dirty_unsafe(task->bsd_info)) { + tksn->kptksn_flags |= KPERF_TASK_FLAG_DIRTY; + } +#endif + + if (task->bsd_info) { + wq_state_available = + workqueue_get_pwq_exceeded(task->bsd_info, &exceeded_total, + &exceeded_constrained); + } + if (wq_state_available) { + tksn->kptksn_flags |= 
KPERF_TASK_FLAG_WQ_FLAGS_VALID; + + if (exceeded_total) { + tksn->kptksn_flags |= KPERF_TASK_FLAG_WQ_EXCEEDED_TOTAL; + } + if (exceeded_constrained) { + tksn->kptksn_flags |= KPERF_TASK_FLAG_WQ_EXCEEDED_CONSTRAINED; + } + } + + tksn->kptksn_suspend_count = task->suspend_count; + tksn->kptksn_pageins = task->pageins; + tksn->kptksn_user_time_in_terminated_threads = task->total_user_time; + tksn->kptksn_system_time_in_terminated_threads = task->total_system_time; + + BUF_INFO(PERF_TK_SNAP_SAMPLE | DBG_FUNC_END); +} + +void +kperf_task_snapshot_log(struct kperf_task_snapshot *tksn) +{ + assert(tksn != NULL); + +#if defined(__LP64__) + BUF_DATA(PERF_TK_SNAP_DATA, tksn->kptksn_flags, + ENCODE_UPPER_64(tksn->kptksn_suspend_count) | + ENCODE_LOWER_64(tksn->kptksn_pageins), + tksn->kptksn_user_time_in_terminated_threads, + tksn->kptksn_system_time_in_terminated_threads); +#else + BUF_DATA(PERF_TK_SNAP_DATA1_32, UPPER_32(tksn->kptksn_flags), + LOWER_32(tksn->kptksn_flags), + tksn->kptksn_suspend_count, + tksn->kptksn_pageins); + BUF_DATA(PERF_TK_SNAP_DATA2_32, UPPER_32(tksn->kptksn_user_time_in_terminated_threads), + LOWER_32(tksn->kptksn_user_time_in_terminated_threads), + UPPER_32(tksn->kptksn_system_time_in_terminated_threads), + LOWER_32(tksn->kptksn_system_time_in_terminated_threads)); +#endif /* defined(__LP64__) */ +} diff --git a/osfmk/kperf/threadinfo.h b/osfmk/kperf/task_samplers.h similarity index 55% rename from osfmk/kperf/threadinfo.h rename to osfmk/kperf/task_samplers.h index e7bcaafb2..ebebeb552 100644 --- a/osfmk/kperf/threadinfo.h +++ b/osfmk/kperf/task_samplers.h @@ -1,8 +1,8 @@ /* - * Copyright (c) 2011 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2016 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). 
You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,35 +22,33 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -#ifndef __AP_THREADINFO_H__ -#define __AP_THREADINFO_H__ +#ifndef KPERF_TASK_SAMPLERS_H +#define KPERF_TASK_SAMPLERS_H -/* 'live' threadinfo */ -struct threadinfo -{ - uint64_t pid; - uint64_t tid; - uint64_t dq_addr; - uint64_t runmode; -}; +#include -/* extra info we sample out of bounds */ -#define CHUD_MAXPCOMM 16 /* copy from kernel somewhere :P */ -struct tinfo_ex -{ - char p_comm[CHUD_MAXPCOMM+1]; /* XXX: 16 + 1 */ +struct kperf_task_snapshot { + uint64_t kptksn_flags; + uint64_t kptksn_user_time_in_terminated_threads; + uint64_t kptksn_system_time_in_terminated_threads; + int kptksn_suspend_count; + int kptksn_pageins; }; -struct kperf_context; -extern void kperf_threadinfo_sample(struct threadinfo *ti, struct kperf_context *); -extern void kperf_threadinfo_log(struct threadinfo *ti); +#define KPERF_TASK_FLAG_DARWIN_BG (1U << 0) +#define KPERF_TASK_FLAG_FOREGROUND (1U << 1) +#define KPERF_TASK_FLAG_BOOSTED (1U << 2) +#define KPERF_TASK_FLAG_DIRTY (1U << 3) +#define KPERF_TASK_FLAG_WQ_FLAGS_VALID (1U << 4) +#define KPERF_TASK_FLAG_WQ_EXCEEDED_TOTAL (1U << 5) +#define KPERF_TASK_FLAG_WQ_EXCEEDED_CONSTRAINED (1U << 6) 
-extern void kperf_threadinfo_extra_sample(struct tinfo_ex *, struct kperf_context *); -extern int kperf_threadinfo_extra_pend(struct kperf_context *); -extern void kperf_threadinfo_extra_log(struct tinfo_ex *); +void kperf_task_snapshot_sample(struct kperf_task_snapshot *tksn, + struct kperf_context *ctx); +void kperf_task_snapshot_log(struct kperf_task_snapshot *tksn); -#endif /* __AP_THREADINFO_H__ */ +#endif /* !defined(KPERF_TASK_SAMPLERS_H) */ diff --git a/osfmk/kperf/thread_samplers.c b/osfmk/kperf/thread_samplers.c new file mode 100644 index 000000000..2442a40ed --- /dev/null +++ b/osfmk/kperf/thread_samplers.c @@ -0,0 +1,317 @@ +/* + * Copyright (c) 2011 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* Sample thread data */ + +#include /* panic */ +#include /* thread_* */ +#include /* timer_data_t */ +#include /* TASK_POLICY_* */ +#include + +#include +#include +#include +#include +#include + +extern boolean_t stackshot_thread_is_idle_worker_unsafe(thread_t thread); + +/* + * XXX Deprecated, use thread scheduling sampler instead. + * + * Taken from AppleProfileGetRunModeOfThread and CHUD. Still here for + * backwards compatibility. + */ + +#define KPERF_TI_RUNNING (1U << 0) +#define KPERF_TI_RUNNABLE (1U << 1) +#define KPERF_TI_WAIT (1U << 2) +#define KPERF_TI_UNINT (1U << 3) +#define KPERF_TI_SUSP (1U << 4) +#define KPERF_TI_TERMINATE (1U << 5) +#define KPERF_TI_IDLE (1U << 6) + +static uint32_t +kperf_thread_info_runmode_legacy(thread_t thread) +{ + uint32_t kperf_state = 0; + int sched_state = thread->state; + processor_t last_processor = thread->last_processor; + + if ((last_processor != PROCESSOR_NULL) && (thread == last_processor->active_thread)) { + kperf_state |= KPERF_TI_RUNNING; + } + if (sched_state & TH_RUN) { + kperf_state |= KPERF_TI_RUNNABLE; + } + if (sched_state & TH_WAIT) { + kperf_state |= KPERF_TI_WAIT; + } + if (sched_state & TH_UNINT) { + kperf_state |= KPERF_TI_UNINT; + } + if (sched_state & TH_SUSP) { + kperf_state |= KPERF_TI_SUSP; + } + if (sched_state & TH_TERMINATE) { + kperf_state |= KPERF_TI_TERMINATE; + } + if (sched_state & TH_IDLE) { + kperf_state |= KPERF_TI_IDLE; + } + + /* on desktop, if state is blank, leave not idle set */ + if (kperf_state == 0) { + return (TH_IDLE << 16); + } + + /* high two bytes are inverted mask, low two bytes are normal */ + return (((~kperf_state & 0xffff) << 16) | (kperf_state & 0xffff)); +} + +void +kperf_thread_info_sample(struct kperf_thread_info *ti, struct kperf_context *context) +{ + thread_t cur_thread = context->cur_thread; + + BUF_INFO(PERF_TI_SAMPLE, (uintptr_t)thread_tid(cur_thread)); + + ti->kpthi_pid = context->cur_pid; + 
ti->kpthi_tid = thread_tid(cur_thread); + ti->kpthi_dq_addr = thread_dispatchqaddr(cur_thread); + ti->kpthi_runmode = kperf_thread_info_runmode_legacy(cur_thread); + + BUF_VERB(PERF_TI_SAMPLE | DBG_FUNC_END); +} + +void +kperf_thread_info_log(struct kperf_thread_info *ti) +{ + BUF_DATA(PERF_TI_DATA, ti->kpthi_pid, ti->kpthi_tid /* K64-only */, + ti->kpthi_dq_addr, ti->kpthi_runmode); +} + +/* + * Scheduling information reports inputs and outputs of the scheduler state for + * a thread. + */ + +void +kperf_thread_scheduling_sample(struct kperf_thread_scheduling *thsc, + struct kperf_context *context) +{ + assert(thsc != NULL); + assert(context != NULL); + + thread_t thread = context->cur_thread; + + BUF_INFO(PERF_TI_SCHEDSAMPLE | DBG_FUNC_START, (uintptr_t)thread_tid(thread)); + + thsc->kpthsc_user_time = timer_grab(&(thread->user_timer)); + uint64_t system_time = timer_grab(&(thread->system_timer)); + + if (thread->precise_user_kernel_time) { + thsc->kpthsc_system_time = system_time; + } else { + thsc->kpthsc_user_time += system_time; + thsc->kpthsc_system_time = 0; + } + + thsc->kpthsc_state = thread->state; + thsc->kpthsc_base_priority = thread->base_pri; + thsc->kpthsc_sched_priority = thread->sched_pri; + thsc->kpthsc_effective_qos = thread->effective_policy.thep_qos; + thsc->kpthsc_requested_qos = thread->requested_policy.thrp_qos; + thsc->kpthsc_requested_qos_override = thread->requested_policy.thrp_qos_override; + thsc->kpthsc_effective_latency_qos = thread->effective_policy.thep_latency_qos; + + BUF_INFO(PERF_TI_SCHEDSAMPLE | DBG_FUNC_END); +} + + +void +kperf_thread_scheduling_log(struct kperf_thread_scheduling *thsc) +{ + assert(thsc != NULL); +#if defined(__LP64__) + BUF_DATA(PERF_TI_SCHEDDATA, thsc->kpthsc_user_time, + thsc->kpthsc_system_time, + (((uint64_t)thsc->kpthsc_base_priority) << 48) + | ((uint64_t)thsc->kpthsc_sched_priority << 32) + | ((uint64_t)(thsc->kpthsc_state & 0xff) << 24) + | (thsc->kpthsc_effective_qos << 6) + | 
(thsc->kpthsc_requested_qos << 3) + | thsc->kpthsc_requested_qos_override, + ((uint64_t)thsc->kpthsc_effective_latency_qos << 61)); +#else + BUF_DATA(PERF_TI_SCHEDDATA1_32, UPPER_32(thsc->kpthsc_user_time), + LOWER_32(thsc->kpthsc_user_time), + UPPER_32(thsc->kpthsc_system_time), + LOWER_32(thsc->kpthsc_system_time)); + BUF_DATA(PERF_TI_SCHEDDATA2_32, (((uint32_t)thsc->kpthsc_base_priority) << 16) + | thsc->kpthsc_sched_priority, + ((thsc->kpthsc_state & 0xff) << 24) + | (thsc->kpthsc_effective_qos << 6) + | (thsc->kpthsc_requested_qos << 3) + | thsc->kpthsc_requested_qos_override, + (uint32_t)thsc->kpthsc_effective_latency_qos << 29); +#endif /* defined(__LP64__) */ +} + +/* + * Snapshot information maintains parity with stackshot information for other, + * miscellaneous information about threads. + */ + +#define KPERF_THREAD_SNAPSHOT_DARWIN_BG (1U << 0); +#define KPERF_THREAD_SNAPSHOT_PASSIVE_IO (1U << 1); +#define KPERF_THREAD_SNAPSHOT_GFI (1U << 2); +#define KPERF_THREAD_SNAPSHOT_IDLE_WQ (1U << 3); +/* max is 1U << 7 */ + +void +kperf_thread_snapshot_sample(struct kperf_thread_snapshot *thsn, + struct kperf_context *context) +{ + assert(thsn != NULL); + assert(context != NULL); + + thread_t thread = context->cur_thread; + + BUF_INFO(PERF_TI_SNAPSAMPLE | DBG_FUNC_START, (uintptr_t)thread_tid(thread)); + + thsn->kpthsn_last_made_runnable_time = thread->last_made_runnable_time; + + thsn->kpthsn_flags = 0; + if (thread->effective_policy.thep_darwinbg) { + thsn->kpthsn_flags |= KPERF_THREAD_SNAPSHOT_DARWIN_BG; + } + if (proc_get_effective_thread_policy(thread, TASK_POLICY_PASSIVE_IO)) { + thsn->kpthsn_flags |= KPERF_THREAD_SNAPSHOT_PASSIVE_IO; + } + if (thread->options & TH_OPT_GLOBAL_FORCED_IDLE) { + thsn->kpthsn_flags |= KPERF_THREAD_SNAPSHOT_GFI + } + if (stackshot_thread_is_idle_worker_unsafe(thread)) { + thsn->kpthsn_flags |= KPERF_THREAD_SNAPSHOT_IDLE_WQ; + } + + thsn->kpthsn_suspend_count = thread->suspend_count; + thsn->kpthsn_io_tier = 
proc_get_effective_thread_policy(thread, TASK_POLICY_IO); + + BUF_VERB(PERF_TI_SNAPSAMPLE | DBG_FUNC_END); +} + +void +kperf_thread_snapshot_log(struct kperf_thread_snapshot *thsn) +{ + assert(thsn != NULL); +#if defined(__LP64__) + BUF_DATA(PERF_TI_SNAPDATA, thsn->kpthsn_flags | ((uint32_t)(thsn->kpthsn_suspend_count) << 8) + | (thsn->kpthsn_io_tier << 24), + thsn->kpthsn_last_made_runnable_time); +#else + BUF_DATA(PERF_TI_SNAPDATA_32, thsn->kpthsn_flags | ((uint32_t)(thsn->kpthsn_suspend_count) << 8) + | (thsn->kpthsn_io_tier << 24), + UPPER_32(thsn->kpthsn_last_made_runnable_time), + LOWER_32(thsn->kpthsn_last_made_runnable_time)); +#endif /* defined(__LP64__) */ +} + +/* + * Dispatch information only contains the dispatch queue serial number from + * libdispatch. + * + * It's a separate sampler because queue data must be copied in from user space. + */ + +void +kperf_thread_dispatch_sample(struct kperf_thread_dispatch *thdi, + struct kperf_context *context) +{ + assert(thdi != NULL); + assert(context != NULL); + + thread_t thread = context->cur_thread; + + BUF_INFO(PERF_TI_DISPSAMPLE | DBG_FUNC_START, (uintptr_t)thread_tid(thread)); + + task_t task = thread->task; + boolean_t task_64 = task_has_64BitAddr(task); + size_t user_addr_size = task_64 ? 
8 : 4; + + assert(thread->task != kernel_task); + uint64_t user_dq_key_addr = thread_dispatchqaddr(thread); + if (user_dq_key_addr == 0) { + goto error; + } + + uint64_t user_dq_addr; + if ((copyin((user_addr_t)user_dq_key_addr, + (char *)&user_dq_addr, + user_addr_size) != 0) || + (user_dq_addr == 0)) + { + goto error; + } + + uint64_t user_dq_serialno_addr = + user_dq_addr + get_task_dispatchqueue_serialno_offset(task); + + if (copyin((user_addr_t)user_dq_serialno_addr, + (char *)&(thdi->kpthdi_dq_serialno), + user_addr_size) == 0) + { + goto out; + } + +error: + thdi->kpthdi_dq_serialno = 0; + +out: + BUF_VERB(PERF_TI_DISPSAMPLE | DBG_FUNC_END); +} + +int +kperf_thread_dispatch_pend(struct kperf_context *context) +{ + return kperf_ast_pend(context->cur_thread, T_KPERF_AST_DISPATCH); +} + +void +kperf_thread_dispatch_log(struct kperf_thread_dispatch *thdi) +{ + assert(thdi != NULL); +#if defined(__LP64__) + BUF_DATA(PERF_TI_DISPDATA, thdi->kpthdi_dq_serialno); +#else + BUF_DATA(PERF_TI_DISPDATA_32, UPPER_32(thdi->kpthdi_dq_serialno), + LOWER_32(thdi->kpthdi_dq_serialno)); +#endif /* defined(__LP64__) */ +} diff --git a/osfmk/kperf/thread_samplers.h b/osfmk/kperf/thread_samplers.h new file mode 100644 index 000000000..38195a629 --- /dev/null +++ b/osfmk/kperf/thread_samplers.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2011 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef KPERF_THREAD_SAMPLERS_H +#define KPERF_THREAD_SAMPLERS_H + +#include + +/* legacy thread info */ +struct kperf_thread_info { + uint64_t kpthi_pid; + uint64_t kpthi_tid; + uint64_t kpthi_dq_addr; + uint64_t kpthi_runmode; +}; + +void kperf_thread_info_sample(struct kperf_thread_info *, + struct kperf_context *); +void kperf_thread_info_log(struct kperf_thread_info *); + +/* scheduling information */ +struct kperf_thread_scheduling { + uint64_t kpthsc_user_time; + uint64_t kpthsc_system_time; + unsigned int kpthsc_state; + uint16_t kpthsc_base_priority; + uint16_t kpthsc_sched_priority; + unsigned int kpthsc_effective_qos : 3; + unsigned int kpthsc_requested_qos : 3; + unsigned int kpthsc_requested_qos_override : 3; + unsigned int kpthsc_effective_latency_qos : 3; +}; + +void kperf_thread_scheduling_sample(struct kperf_thread_scheduling *, + struct kperf_context *); +void kperf_thread_scheduling_log(struct kperf_thread_scheduling *); + +/* thread snapshot information */ +struct 
kperf_thread_snapshot { + uint64_t kpthsn_last_made_runnable_time; + int16_t kpthsn_suspend_count; + uint8_t kpthsn_io_tier; + uint8_t kpthsn_flags; +}; + +void kperf_thread_snapshot_sample(struct kperf_thread_snapshot *, + struct kperf_context *); +void kperf_thread_snapshot_log(struct kperf_thread_snapshot *); + +/* libdispatch information */ +struct kperf_thread_dispatch { + uint64_t kpthdi_dq_serialno; +}; + +void kperf_thread_dispatch_sample(struct kperf_thread_dispatch *, + struct kperf_context *); +int kperf_thread_dispatch_pend(struct kperf_context *); +void kperf_thread_dispatch_log(struct kperf_thread_dispatch *); + +#endif /* !defined(KPERF_THREAD_SAMPLERS_H) */ diff --git a/osfmk/kperf/threadinfo.c b/osfmk/kperf/threadinfo.c deleted file mode 100644 index 382a05304..000000000 --- a/osfmk/kperf/threadinfo.c +++ /dev/null @@ -1,225 +0,0 @@ -/* - * Copyright (c) 2011 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - - -/* Sample thread data */ - -#include -#include /* thread_* */ -#include /* panic */ -// #include - -#include -#include - -#include -#include -#include -#include - -// kAppleProfileTriggerClientThreadModeIdle = 0x40, // TH_IDLE -// #define TH_IDLE 0x40 - -//kAppleProfileTriggerClientThreadModeNotIdle = kAppleProfileTriggerClientThreadModeIdle << 16, // !TH_IDLE -#define TH_IDLE_N (TH_IDLE << 16) - -static uint64_t -make_runmode(thread_t thread) -{ - /* CEG: This is a translation of - * AppleProfileGetRunModeOfThread below... kinda magic :/ - */ - const int mode = chudxnu_thread_get_scheduler_state(thread); - - if( 0 == mode) - { - return (chudxnu_thread_get_idle(thread) ? TH_IDLE : TH_IDLE_N); - } - else - { - // Today we happen to know there's a one-to-one mapping. - return ((mode & 0xffff) | ((~mode & 0xffff) << 16)); - } -} - - -/* code to collect current thread info */ -void -kperf_threadinfo_sample(struct threadinfo *ti, struct kperf_context *context) -{ - thread_t cur_thread = context->cur_thread; - BUF_INFO1( PERF_TI_SAMPLE, (uintptr_t)thread_tid(cur_thread) ); - - // fill out the fields - ti->pid = context->cur_pid; - ti->tid = thread_tid(cur_thread); - ti->dq_addr = thread_dispatchqaddr(cur_thread); - ti->runmode = make_runmode(cur_thread); -} - -/* log an existing sample into the buffer */ -void -kperf_threadinfo_log(struct threadinfo *ti) -{ - /* XXX: K64 only? 
*/ - BUF_DATA( PERF_TI_DATA, ti->pid, ti->tid, ti->dq_addr, ti->runmode ); -} - -/* 'extra' thread-info functions that are deferred 'til thread-context - * time - */ -void -kperf_threadinfo_extra_sample(struct tinfo_ex *tex, struct kperf_context *context) -{ - thread_t cur_thread = context->cur_thread; - uint32_t t_chud; - - /* can only pend on the current thread */ - /* this is valid from PET mode... */ - /* - if( cur_thread != chudxnu_current_thread() ) - panic("pending to non-current thread"); - */ - - /* get our current bits */ - t_chud = kperf_get_thread_bits(cur_thread); - - /* check if there's anything for us to do */ - if( t_chud & T_AST_NAME ) - { - BUF_INFO1( PERF_TI_XSAMPLE, (uintptr_t)thread_tid(cur_thread) ); - - /* get the name out */ -#ifdef FIXME - /* need kperfbsd.c? */ - proc_name( context->cur_pid, - &tex->p_comm[0], CHUD_MAXPCOMM ); -#endif - - /* mark that it's done */ - t_chud &= ~T_AST_NAME; - t_chud |= T_NAME_DONE; - - kperf_set_thread_bits(cur_thread, t_chud); - } - else - /* empty string */ - tex->p_comm[0] = '\0'; - -} - -/* log it if there's anyting useful there */ -void -kperf_threadinfo_extra_log(struct tinfo_ex *tex) -{ - /* no data */ - if( tex->p_comm[0] == '\0' ) - return; - - /* FIXME: log more */ - BUF_DATA1( PERF_TI_XDATA, (uintptr_t)*(uintptr_t*)&tex->p_comm[0] ); -} - -/* pend a flag on a thread */ -int -kperf_threadinfo_extra_pend(struct kperf_context *context) -{ - return kperf_ast_pend( context->cur_thread, T_NAME_DONE | T_AST_NAME, - T_AST_NAME ); -} - - -#if 0 - -/* transalted from the APF */ - -APTIAKernelEntry_t *threadInfo = (APTIAKernelEntry_t*)(threadInfos + account->offset); - -context->timeStamp = mach_absolute_time(); -context->cpuNum = chudxnu_cpu_number(); - -// record the process info from the callback context -context->pid = chudxnu_current_pid(); -threadInfo->pid = context->generic->pid; - -// thread_tid is a thread_t to ID function in the kernel -context->threadID = chudxnu_current_thread(); -threadInfo->tid 
= thread_tid(context->generic->threadID); - -// also a kernel function -threadInfo->dispatch_queue_addr = thread_dispatchqaddr(context->generic->threadID); - -// see below -threadInfo->runMode = AppleProfileGetRunModeOfThread(context->generic->threadID); - - -/****** WTF is this?! *******/ - -/*!enum AppleProfileTriggerClientThreadRunMode - * - * Specifies the thread mode in which to record samples. - */ -typedef enum { // Target Thread State - can be OR'd - // Basic Building Blocks: - // for Time Profile, use kAppleProfileTriggerClientThreadModeRunning (optionally with kAppleProfileTriggerClientThreadModeNotIdle). - // for Time Profile (All Thread States), use kAppleProfileTriggerClientThreadModeAny (or just don't specify any thread mode filters). - // for Time Profile (Blocked Threads), use kIOProfileTriggerClientThreadModeBlocked. - // etc... - - kAppleProfileTriggerClientThreadModeNone = 0x0, - - kAppleProfileTriggerClientThreadModeRunning = 0x1, // On a core - kAppleProfileTriggerClientThreadModeRunnable = 0x2, // TH_RUN - kAppleProfileTriggerClientThreadModeBlocked = 0x4, // TH_WAIT - kAppleProfileTriggerClientThreadModeUninterruptible = 0x8, // TH_UNINT - kAppleProfileTriggerClientThreadModeSuspended = 0x10, // TH_SUSP - kAppleProfileTriggerClientThreadModeTerminating = 0x20, // TH_TERMINATE - kAppleProfileTriggerClientThreadModeIdle = 0x40, // TH_IDLE - - kAppleProfileTriggerClientThreadModeNotRunning = kAppleProfileTriggerClientThreadModeRunning << 16, // Not on a core - kAppleProfileTriggerClientThreadModeNotRunnable = kAppleProfileTriggerClientThreadModeRunnable << 16, // !TH_RUN - kAppleProfileTriggerClientThreadModeNotBlocked = kAppleProfileTriggerClientThreadModeBlocked << 16, // !TH_WAIT - kAppleProfileTriggerClientThreadModeNotUninterruptible = kAppleProfileTriggerClientThreadModeUninterruptible << 16, // !TH_UNINT - kAppleProfileTriggerClientThreadModeNotSuspended = kAppleProfileTriggerClientThreadModeSuspended << 16, // !TH_SUSP - 
kAppleProfileTriggerClientThreadModeNotTerminating = kAppleProfileTriggerClientThreadModeTerminating << 16, // !TH_TERMINATE - kAppleProfileTriggerClientThreadModeNotIdle = kAppleProfileTriggerClientThreadModeIdle << 16, // !TH_IDLE - - kAppleProfileTriggerClientThreadModeAny = ( kAppleProfileTriggerClientThreadModeRunning - | kAppleProfileTriggerClientThreadModeNotRunning), -} AppleProfileTriggerClientThreadRunMode; - -extern "C" AppleProfileTriggerClientThreadRunMode AppleProfileGetRunModeOfThread(thread_t thread) { - const int mode = chudxnu_thread_get_scheduler_state(thread); - - if (0 == mode) { - return (chudxnu_thread_get_idle(thread) ? kAppleProfileTriggerClientThreadModeIdle : kAppleProfileTriggerClientThreadModeNotIdle); - } else - return (AppleProfileTriggerClientThreadRunMode)((mode & 0xffff) | ((~mode & 0xffff) << 16)); // Today we happen to know there's a one-to-one mapping. -} - -#endif diff --git a/osfmk/kperf/timetrigger.c b/osfmk/kperf/timetrigger.c deleted file mode 100644 index bc43fd423..000000000 --- a/osfmk/kperf/timetrigger.c +++ /dev/null @@ -1,492 +0,0 @@ -/* - * Copyright (c) 2011 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -/* Manage time triggers */ - -#include -#include /* current_thread() */ -#include -#include - -#include - -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -/* make up for arm signal deficiencies */ -void kperf_signal_handler(void); - -/* represents a periodic timer */ -struct time_trigger -{ - struct timer_call tcall; - uint64_t period; - unsigned actionid; - volatile unsigned active; - -#ifdef USE_SIMPLE_SIGNALS - /* firing accounting */ - uint64_t fire_count; - uint64_t last_cpu_fire[MAX_CPUS]; -#endif -}; - -/* the list of timers */ -static unsigned timerc = 0; -static struct time_trigger *timerv; -static unsigned pet_timer = 999; - -/* maximum number of timers we can construct */ -#define TIMER_MAX 16 - -/* minimal interval for a timer (10usec in nsec) */ -#define MIN_TIMER_NS (10000) -/* minimal interval for pet timer (2msec in nsec) */ -#define MIN_PET_TIMER_NS (2000000) - -static void -kperf_timer_schedule( struct time_trigger *trigger, uint64_t now ) -{ - uint64_t deadline; - - BUF_INFO1(PERF_TM_SCHED, trigger->period); - - /* if we re-programmed the timer to zero, just drop it */ - if( !trigger->period ) - return; - - /* calculate deadline */ - deadline = now + trigger->period; - - /* re-schedule the timer, making sure we don't apply slop */ - timer_call_enter( &trigger->tcall, deadline, TIMER_CALL_SYS_CRITICAL); -} - -static void -kperf_ipi_handler( void *param ) -{ - int r; - int ncpu; - 
struct kperf_sample *intbuf = NULL; - struct kperf_context ctx; - struct time_trigger *trigger = param; - task_t task = NULL; - - /* Always cut a tracepoint to show a sample event occurred */ - BUF_DATA1(PERF_TM_HNDLR | DBG_FUNC_START, 0); - - /* In an interrupt, get the interrupt buffer for this CPU */ - intbuf = kperf_intr_sample_buffer(); - - /* On a timer, we can see the "real" current thread */ - ctx.cur_pid = 0; /* remove this? */ - ctx.cur_thread = current_thread(); - - task = chudxnu_task_for_thread(ctx.cur_thread); - if (task) - ctx.cur_pid = chudxnu_pid_for_task(task); - - /* who fired */ - ctx.trigger_type = TRIGGER_TYPE_TIMER; - ctx.trigger_id = (unsigned)(trigger-timerv); /* computer timer number */ - - ncpu = chudxnu_cpu_number(); - if (ctx.trigger_id == pet_timer && ncpu < machine_info.logical_cpu_max) - kperf_thread_on_cpus[ncpu] = ctx.cur_thread; - - /* check samppling is on */ - if( kperf_sampling_status() == KPERF_SAMPLING_OFF ) { - BUF_INFO1(PERF_TM_HNDLR | DBG_FUNC_END, SAMPLE_OFF); - return; - } else if( kperf_sampling_status() == KPERF_SAMPLING_SHUTDOWN ) { - BUF_INFO1(PERF_TM_HNDLR | DBG_FUNC_END, SAMPLE_SHUTDOWN); - return; - } - - /* call the action -- kernel-only from interrupt, pend user */ - r = kperf_sample( intbuf, &ctx, trigger->actionid, SAMPLE_FLAG_PEND_USER ); - - /* end tracepoint is informational */ - BUF_INFO1(PERF_TM_HNDLR | DBG_FUNC_END, r); -} - -#ifdef USE_SIMPLE_SIGNALS -/* if we can't pass a (function, arg) pair through a signal properly, - * we do it the simple way. When a timer fires, we increment a counter - * in the time trigger and broadcast a generic signal to all cores. Cores - * search the time trigger list for any triggers for which their last seen - * firing counter is lower than the current one. 
- */ -void -kperf_signal_handler(void) -{ - int i, cpu; - struct time_trigger *tr = NULL; - - OSMemoryBarrier(); - - cpu = chudxnu_cpu_number(); - for( i = 0; i < (int) timerc; i++ ) - { - tr = &timerv[i]; - if( tr->fire_count <= tr->last_cpu_fire[cpu] ) - continue; /* this trigger hasn't fired */ - - /* fire the trigger! */ - tr->last_cpu_fire[cpu] = tr->fire_count; - kperf_ipi_handler( tr ); - } -} -#else -void -kperf_signal_handler(void) -{ - // so we can link... -} -#endif - -static void -kperf_timer_handler( void *param0, __unused void *param1 ) -{ - struct time_trigger *trigger = param0; - unsigned ntimer = (unsigned)(trigger - timerv); - unsigned ncpus = machine_info.logical_cpu_max; - - trigger->active = 1; - - /* along the lines of do not ipi if we are all shutting down */ - if( kperf_sampling_status() == KPERF_SAMPLING_SHUTDOWN ) - goto deactivate; - - /* clean-up the thread-on-CPUs cache */ - bzero(kperf_thread_on_cpus, ncpus * sizeof(*kperf_thread_on_cpus)); - - /* ping all CPUs */ -#ifndef USE_SIMPLE_SIGNALS - kperf_mp_broadcast( kperf_ipi_handler, trigger ); -#else - trigger->fire_count++; - OSMemoryBarrier(); - kperf_mp_signal(); -#endif - - /* release the pet thread? 
*/ - if( ntimer == pet_timer ) - { - /* timer re-enabled when thread done */ - kperf_pet_thread_go(); - } - else - { - /* re-enable the timer - * FIXME: get the current time from elsewhere - */ - uint64_t now = mach_absolute_time(); - kperf_timer_schedule( trigger, now ); - } - -deactivate: - trigger->active = 0; -} - -/* program the timer from the pet thread */ -int -kperf_timer_pet_set( unsigned timer, uint64_t elapsed_ticks ) -{ - static uint64_t pet_min_ticks = 0; - - uint64_t now; - struct time_trigger *trigger = NULL; - uint64_t period = 0; - uint64_t deadline; - - /* compute ns -> ticks */ - if( pet_min_ticks == 0 ) - nanoseconds_to_absolutetime(MIN_PET_TIMER_NS, &pet_min_ticks); - - if( timer != pet_timer ) - panic( "PET setting with bogus ID\n" ); - - if( timer >= timerc ) - return EINVAL; - - if( kperf_sampling_status() == KPERF_SAMPLING_OFF ) { - BUF_INFO1(PERF_PET_END, SAMPLE_OFF); - return 0; - } - - // don't repgram the timer if it's been shutdown - if( kperf_sampling_status() == KPERF_SAMPLING_SHUTDOWN ) { - BUF_INFO1(PERF_PET_END, SAMPLE_SHUTDOWN); - return 0; - } - - /* CHECKME: we probably took so damn long in the PET thread, - * it makes sense to take the time again. 
- */ - now = mach_absolute_time(); - trigger = &timerv[timer]; - - /* if we re-programmed the timer to zero, just drop it */ - if( !trigger->period ) - return 0; - - /* subtract the time the pet sample took being careful not to underflow */ - if ( trigger->period > elapsed_ticks ) - period = trigger->period - elapsed_ticks; - - /* make sure we don't set the next PET sample to happen too soon */ - if ( period < pet_min_ticks ) - period = pet_min_ticks; - - /* calculate deadline */ - deadline = now + period; - - BUF_INFO(PERF_PET_SCHED, trigger->period, period, elapsed_ticks, deadline); - - /* re-schedule the timer, making sure we don't apply slop */ - timer_call_enter( &trigger->tcall, deadline, TIMER_CALL_SYS_CRITICAL); - - return 0; -} - - -/* turn on all the timers */ -extern int -kperf_timer_go(void) -{ - unsigned i; - uint64_t now = mach_absolute_time(); - - for( i = 0; i < timerc; i++ ) - { - if( timerv[i].period == 0 ) - continue; - - kperf_timer_schedule( &timerv[i], now ); - } - - return 0; -} - - -extern int -kperf_timer_stop(void) -{ - unsigned i; - - for( i = 0; i < timerc; i++ ) - { - if( timerv[i].period == 0 ) - continue; - - while (timerv[i].active) - ; - - timer_call_cancel( &timerv[i].tcall ); - } - - /* wait for PET to stop, too */ - kperf_pet_thread_wait(); - - return 0; -} - -unsigned -kperf_timer_get_petid(void) -{ - return pet_timer; -} - -int -kperf_timer_set_petid(unsigned timerid) -{ - struct time_trigger *trigger = NULL; - - /* they can program whatever... 
*/ - pet_timer = timerid; - - /* clear them if it's a bogus ID */ - if( pet_timer >= timerc ) - { - kperf_pet_timer_config( 0, 0 ); - - return 0; - } - - /* update the values */ - trigger = &timerv[pet_timer]; - kperf_pet_timer_config( pet_timer, trigger->actionid ); - - return 0; -} - -int -kperf_timer_get_period( unsigned timer, uint64_t *period ) -{ - if( timer >= timerc ) - return EINVAL; - - *period = timerv[timer].period; - - return 0; -} - -int -kperf_timer_set_period( unsigned timer, uint64_t period ) -{ - static uint64_t min_timer_ticks = 0; - - if( timer >= timerc ) - return EINVAL; - - /* compute us -> ticks */ - if( min_timer_ticks == 0 ) - nanoseconds_to_absolutetime(MIN_TIMER_NS, &min_timer_ticks); - - /* check actual timer */ - if( period && (period < min_timer_ticks) ) - period = min_timer_ticks; - - timerv[timer].period = period; - - /* FIXME: re-program running timers? */ - - return 0; -} - -int -kperf_timer_get_action( unsigned timer, uint32_t *action ) -{ - if( timer >= timerc ) - return EINVAL; - - *action = timerv[timer].actionid; - - return 0; -} - -int -kperf_timer_set_action( unsigned timer, uint32_t action ) -{ - if( timer >= timerc ) - return EINVAL; - - timerv[timer].actionid = action; - - return 0; -} - -unsigned -kperf_timer_get_count(void) -{ - return timerc; -} - -static void -setup_timer_call( struct time_trigger *trigger ) -{ - timer_call_setup( &trigger->tcall, kperf_timer_handler, trigger ); -} - -extern int -kperf_timer_set_count(unsigned count) -{ - struct time_trigger *new_timerv = NULL, *old_timerv = NULL; - unsigned old_count, i; - - /* easy no-op */ - if( count == timerc ) - return 0; - - /* TODO: allow shrinking? */ - if( count < timerc ) - return EINVAL; - - /* cap it for good measure */ - if( count > TIMER_MAX ) - return EINVAL; - - /* creating the action arror for the first time. create a few - * more things, too. 
- */ - if( timerc == 0 ) - { - int r; - - /* main kperf */ - r = kperf_init(); - if( r ) - return r; - - /* get the PET thread going */ - r = kperf_pet_init(); - if( r ) - return r; - } - - /* first shut down any running timers since we will be messing - * with the timer call structures - */ - if( kperf_timer_stop() ) - return EBUSY; - - /* create a new array */ - new_timerv = kalloc( count * sizeof(*new_timerv) ); - if( new_timerv == NULL ) - return ENOMEM; - - old_timerv = timerv; - old_count = timerc; - - if( old_timerv != NULL ) - bcopy( timerv, new_timerv, timerc * sizeof(*timerv) ); - - /* zero the new entries */ - bzero( &new_timerv[timerc], (count - old_count) * sizeof(*new_timerv) ); - - /* (re-)setup the timer call info for all entries */ - for( i = 0; i < count; i++ ) - setup_timer_call( &new_timerv[i] ); - - timerv = new_timerv; - timerc = count; - - if( old_timerv != NULL ) - kfree( old_timerv, old_count * sizeof(*timerv) ); - - return 0; -} diff --git a/osfmk/kperf/x86_64/kperf_mp.c b/osfmk/kperf/x86_64/kperf_mp.c index d4a1e8b99..d9a292aa9 100644 --- a/osfmk/kperf/x86_64/kperf_mp.c +++ b/osfmk/kperf/x86_64/kperf_mp.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2011 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2011-2016 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,19 +22,47 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ #include +#include #include +#include #include +#include -int -kperf_mp_broadcast( void (*func)(void*), void *arg ) +void +kperf_mp_broadcast_running(struct kperf_timer *trigger) { - mp_cpus_call( CPUMASK_ALL, ASYNC, func, arg ); + int ncpus = machine_info.logical_cpu_max; + cpumask_t cpu_mask = 0; + assert(ncpus < 64); + + for (int i = 0; i < ncpus; i++) { + /* do not IPI processors that are not scheduling threads */ + processor_t processor = cpu_to_processor(i); + if (processor == PROCESSOR_NULL || + processor->state != PROCESSOR_RUNNING || + processor->active_thread == THREAD_NULL) + { + continue; + } + + /* nor processors that have not responded to the last IPI */ + bool already_pending = atomic_bit_set(&(trigger->pending_cpus), i, + __ATOMIC_RELAXED); + if (already_pending) { +#if DEVELOPMENT || DEBUG + __c11_atomic_fetch_add(&kperf_pending_ipis, 1, __ATOMIC_RELAXED); +#endif + continue; + } + + cpu_mask |= cpu_to_cpumask(i); + } - return 0; + mp_cpus_call(cpu_mask, NOSYNC, kperf_ipi_handler, trigger); } diff --git a/osfmk/libsa/Makefile b/osfmk/libsa/Makefile index ea0f4cb80..1eb9e3345 100644 --- a/osfmk/libsa/Makefile +++ b/osfmk/libsa/Makefile @@ -7,17 +7,15 @@ include $(MakeInc_cmd) include $(MakeInc_def) DATAFILES = \ - string.h + string.h -INSTALL_MI_LIST = +INSTALL_MI_LIST = -INSTALL_MI_DIR = +INSTALL_MI_DIR = EXPORT_MI_LIST = ${DATAFILES} -EXPORT_MI_DIR = +EXPORT_MI_DIR = include $(MakeInc_rule) include $(MakeInc_dir) - - diff --git a/osfmk/lockd/Makefile b/osfmk/lockd/Makefile index 
d42e5bcb1..8ad03c5eb 100644 --- a/osfmk/lockd/Makefile +++ b/osfmk/lockd/Makefile @@ -16,7 +16,7 @@ KERNELFILES = ${PRIVATE_DATAFILES} INSTALL_MI_LIST = INSTALL_MI_LCL_LIST = ${PRIVATE_DATAFILES} -INSTALL_MI_GEN_LIST = +INSTALL_MI_GEN_LIST = INSTALL_MI_DIR = lockd @@ -28,7 +28,7 @@ EXPORT_MI_DIR = lockd # # Build path -# +# INCFLAGS_MAKEFILE= -I.. MIGKUFLAGS = -DMACH_KERNEL_PRIVATE -DKERNEL_USER=1 -maxonstack 1024 @@ -42,7 +42,7 @@ COMP_FILES = ${MIG_KUSRC} do_build_all:: $(COMP_FILES) ${MIG_KUSRC} : lockd_mach.defs - @echo MIG $@ + @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)" $(_v)${MIG} ${MIGFLAGS} ${MIGKUFLAGS} \ -user lockd_mach.c \ -header lockd_mach.h \ diff --git a/osfmk/mach/Makefile b/osfmk/mach/Makefile index b14ee43e4..c948506ae 100644 --- a/osfmk/mach/Makefile +++ b/osfmk/mach/Makefile @@ -48,11 +48,10 @@ MIG_DEFS = \ thread_act.defs \ vm_map.defs - MACH_PRIVATE_DEFS = \ coalition_notification.defs \ + ktrace_background.defs \ mach_notify.defs \ - memory_object.defs \ memory_object_control.defs \ memory_object_default.defs \ sysdiagnose_notification.defs \ @@ -69,7 +68,6 @@ MIG_USHDRS = \ coalition_notification_server.h \ exc_server.h \ mach_exc_server.h \ - memory_object_server.h \ memory_object_default_server.h \ notify_server.h \ task_access_server.h \ @@ -81,6 +79,7 @@ MIG_UUHDRS = \ clock_priv.h \ host_priv.h \ host_security.h \ + ktrace_background.h \ lock_set.h \ mach_host.h \ mach_port.h \ @@ -101,6 +100,7 @@ MIGINCLUDES = ${MIG_UUHDRS} ${MIG_USHDRS} DATAFILES = \ boolean.h \ clock_types.h \ + dyld_kernel.h \ error.h \ exception.h \ exception_types.h \ @@ -122,6 +122,7 @@ DATAFILES = \ mig.h \ mig_errors.h \ mig_voucher_support.h \ + mig_strncpy_zerofill_support.h \ ndr.h \ notify.h \ policy.h \ @@ -161,16 +162,20 @@ INSTALL_MI_LIST = \ bootstrap.h \ ${DATAFILES} +# installed into System.framework's PrivateHeaders/mach subdirectory PRIVATE_DATAFILES = \ bootstrap.h \ coalition.h \ coalition_notification.defs \ host_info.h \ + 
ktrace_background.defs \ mach_host.defs \ mach_traps.h \ memory_object_types.h \ mig.h \ processor_info.h \ + resource_notify.defs \ + resource_monitors.h \ semaphore.h \ sfi_class.h \ syscall_sw.h \ @@ -200,6 +205,7 @@ EXPORT_MI_LIST = \ branch_predicates.h \ coalition.h \ mach_interface.h \ + resource_monitors.h \ sfi_class.h \ ${DATAFILES} @@ -212,21 +218,21 @@ ${MIGINCLUDES} : ${MIG_TYPES} ${MIG_UUHDRS} : \ %.h : %.defs - @echo MIG $@ + @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)" $(_v)$(MIG) $(MIGFLAGS) \ -server /dev/null \ -user /dev/null \ - -header $@ \ + -header $@ \ $< ${MIG_USHDRS} : \ %_server.h : %.defs - @echo MIG $@ + @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)" $(_v)$(MIG) $(MIGFLAGS) \ -server /dev/null \ -user /dev/null \ -header /dev/null \ - -sheader $@ \ + -sheader $@ \ $< # @@ -241,31 +247,38 @@ MIGKUFLAGS = -DMACH_KERNEL_PRIVATE -DKERNEL_USER=1 -maxonstack 1024 # MIG-generated headers that are traditionally used by kernel # level code. # + +# sender-side ("user") headers generated by MIG from corresponding .defs MIG_KUHDRS = \ audit_triggers.h \ clock_reply.h \ exc.h \ host_notify_reply.h \ + ktrace_background.h \ mach_exc.h \ mach_notify.h \ memory_object.h \ memory_object_control.h \ memory_object_default.h \ + resource_notify.h \ task_access.h \ upl.h \ vm_map.h +# sender-side ("user") source files generated by MIG from corresponding .defs MIG_KUSRC = \ audit_triggers_user.c \ clock_reply_user.c \ coalition_notification_user.c \ exc_user.c \ host_notify_reply_user.c \ + ktrace_background_user.c \ mach_exc_user.c \ mach_notify_user.c \ memory_object_user.c \ memory_object_control_user.c \ memory_object_default_user.c \ + resource_notify_user.c \ task_access_user.c \ telemetry_notification_user.c \ upl_user.c \ @@ -286,7 +299,6 @@ MIG_KSHDRS = \ mach_vm_server.h \ mach_voucher_server.h \ mach_voucher_attr_control_server.h \ - memory_object_server.h \ memory_object_control_server.h \ memory_object_default_server.h \ 
processor_server.h \ @@ -311,7 +323,6 @@ MIG_KSSRC = \ mach_vm_server.c \ mach_voucher_server.c \ mach_voucher_attr_control_server.c \ - memory_object_server.c \ memory_object_control_server.c \ memory_object_default_server.c \ processor_server.c \ @@ -346,7 +357,7 @@ ${COMP_FILES} : ${MIG_TYPES} ${MIG_KUSRC} : \ %_user.c : %.defs - @echo MIG $@ + @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)" $(_v)${MIG} ${MIGFLAGS} ${MIGKUFLAGS} \ -user $*_user.c \ -header $*.h \ @@ -356,7 +367,7 @@ ${MIG_KUSRC} : \ ${MIG_KSSRC}: \ %_server.c : %.defs - @echo MIG $@ + @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)" $(_v)${MIG} ${MIGFLAGS} ${MIGKSFLAGS} \ -user /dev/null \ -header /dev/null \ diff --git a/osfmk/mach/alert.h b/osfmk/mach/alert.h deleted file mode 100644 index 4365af96a..000000000 --- a/osfmk/mach/alert.h +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_FREE_COPYRIGHT@ - */ -/* - * HISTORY - * - * Revision 1.1.1.1 1998/09/22 21:05:31 wsanchez - * Import of Mac OS X kernel (~semeria) - * - * Revision 1.1.1.1 1998/03/07 02:25:45 wsanchez - * Import of OSF Mach kernel (~mburg) - * - * Revision 1.1.5.2 1995/01/18 18:35:06 ezf - * updated Utah CR notice - * [1995/01/18 18:30:38 ezf] - * - * Revision 1.1.5.1 1994/09/23 02:33:53 ezf - * change marker to not FREE - * [1994/09/22 21:38:56 ezf] - * - * Revision 1.1.2.1 1994/01/12 17:56:03 dwm - * Coloc: initial restructuring to follow Utah model. - * Alert bit definitions - * [1994/01/12 17:30:19 dwm] - * - * $EndLog$ - */ -/* - * Copyright (c) 1993 The University of Utah and - * the Computer Systems Laboratory (CSL). All rights reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * THE UNIVERSITY OF UTAH AND CSL ALLOW FREE USE OF THIS SOFTWARE IN ITS "AS - * IS" CONDITION. THE UNIVERSITY OF UTAH AND CSL DISCLAIM ANY LIABILITY OF - * ANY KIND FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 
- * - * CSL requests users of this software to return to csl-dist@cs.utah.edu any - * improvements that they make and grant CSL redistribution rights. - * - */ - -#ifndef _MACH_ALERT_H_ -#define _MACH_ALERT_H_ - -#define ALERT_BITS 32 /* Minimum; more may actually be available */ - -/* Request to abort _all_ operations */ -#define ALERT_ABORT_STRONG 0x00000001 - -/* Request to abort restartable operations */ -#define ALERT_ABORT_SAFE 0x00000002 - -/* User-defined alert bits */ -#define ALERT_USER 0xffff0000 - -#endif /* _MACH_ALERT_H_ */ diff --git a/osfmk/mach/coalition.h b/osfmk/mach/coalition.h index 6b2038de4..e548852dd 100644 --- a/osfmk/mach/coalition.h +++ b/osfmk/mach/coalition.h @@ -88,6 +88,7 @@ struct coalition_resource_usage { uint64_t gpu_time; uint64_t cpu_time_billed_to_me; uint64_t cpu_time_billed_to_others; + uint64_t energy; uint64_t logical_immediate_writes; uint64_t logical_deferred_writes; uint64_t logical_invalidated_writes; diff --git a/osfmk/mach/dyld_kernel.h b/osfmk/mach/dyld_kernel.h new file mode 100644 index 000000000..b28e45f19 --- /dev/null +++ b/osfmk/mach/dyld_kernel.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2016 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _MACH_DYLIB_INFO_H_ +#define _MACH_DYLIB_INFO_H_ + +#include +#include +#include +#include +#include +#include + +/* These definitions must be kept in sync with the ones in + * osfmk/mach/mach_types.defs. + */ + +struct dyld_kernel_image_info { + uuid_t uuid; + fsobj_id_t fsobjid; + fsid_t fsid; + uint64_t load_addr; +}; + +struct dyld_kernel_process_info { + struct dyld_kernel_image_info cache_image_info; + uint64_t timestamp; // mach_absolute_time of last time dyld change to image list + uint32_t imageCount; // number of images currently loaded into process + uint32_t initialImageCount; // number of images statically loaded into process (before any dlopen() calls) + uint8_t dyldState; // one of dyld_process_state_* values + boolean_t no_cache; // process is running without a dyld cache + boolean_t private_cache; // process is using a private copy of its dyld cache +}; + +/* typedefs so our MIG is sane */ + +typedef struct dyld_kernel_image_info dyld_kernel_image_info_t; +typedef struct dyld_kernel_process_info dyld_kernel_process_info_t; +typedef dyld_kernel_image_info_t *dyld_kernel_image_info_array_t; + +#endif /* _MACH_DYLIB_INFO_H_ */ diff --git a/osfmk/mach/host_info.h b/osfmk/mach/host_info.h index 428a9e18b..d5f8c6b1f 100644 --- a/osfmk/mach/host_info.h +++ b/osfmk/mach/host_info.h @@ -99,6 +99,7 @@ typedef integer_t host_flavor_t; #define HOST_MACH_MSG_TRAP 8 /* Has mach_msg_trap */ #define HOST_VM_PURGABLE 9 /* purg'e'able 
memory info */ #define HOST_DEBUG_INFO_INTERNAL 10 /* Used for kernel internal development tests only */ +#define HOST_CAN_HAS_DEBUGGER 11 #ifdef MACH_KERNEL_PRIVATE struct host_basic_info_old { @@ -115,6 +116,14 @@ typedef struct host_basic_info_old *host_basic_info_old_t; (sizeof(host_basic_info_data_old_t)/sizeof(integer_t))) #endif /* MACH_KERNEL_PRIVATE */ +struct host_can_has_debugger_info { + boolean_t can_has_debugger; +}; +typedef struct host_can_has_debugger_info host_can_has_debugger_info_data_t; +typedef struct host_can_has_debugger_info *host_can_has_debugger_info_t; +#define HOST_CAN_HAS_DEBUGGER_COUNT ((mach_msg_type_number_t) \ + (sizeof(host_can_has_debugger_info_data_t)/sizeof(integer_t))) + #pragma pack(4) struct host_basic_info { diff --git a/osfmk/mach/host_notify.h b/osfmk/mach/host_notify.h index 6c0ca74b9..0a15991d2 100644 --- a/osfmk/mach/host_notify.h +++ b/osfmk/mach/host_notify.h @@ -30,8 +30,10 @@ #define _MACH_HOST_NOTIFY_H_ #define HOST_NOTIFY_CALENDAR_CHANGE 0 -#define HOST_NOTIFY_TYPE_MAX 0 +#define HOST_NOTIFY_CALENDAR_SET 1 +#define HOST_NOTIFY_TYPE_MAX 1 #define HOST_CALENDAR_CHANGED_REPLYID 950 +#define HOST_CALENDAR_SET_REPLYID 951 #endif /* _MACH_HOST_NOTIFY_H_ */ diff --git a/osfmk/mach/host_notify_reply.defs b/osfmk/mach/host_notify_reply.defs index 85437f821..469777cb5 100644 --- a/osfmk/mach/host_notify_reply.defs +++ b/osfmk/mach/host_notify_reply.defs @@ -37,4 +37,7 @@ subsystem simpleroutine host_calendar_changed( notify_port : mach_port_move_send_once_t); +simpleroutine host_calendar_set( + notify_port : mach_port_move_send_once_t); + /* vim: set ft=c : */ diff --git a/osfmk/mach/host_special_ports.h b/osfmk/mach/host_special_ports.h index 84461ae8b..52a4a7449 100644 --- a/osfmk/mach/host_special_ports.h +++ b/osfmk/mach/host_special_ports.h @@ -85,6 +85,7 @@ #define HOST_USER_NOTIFICATION_PORT (3 + HOST_MAX_SPECIAL_KERNEL_PORT) #define HOST_AUTOMOUNTD_PORT (4 + HOST_MAX_SPECIAL_KERNEL_PORT) #define HOST_LOCKD_PORT (5 
+ HOST_MAX_SPECIAL_KERNEL_PORT) +#define HOST_KTRACE_BACKGROUND_PORT (6 + HOST_MAX_SPECIAL_KERNEL_PORT) #define HOST_SEATBELT_PORT (7 + HOST_MAX_SPECIAL_KERNEL_PORT) #define HOST_KEXTD_PORT (8 + HOST_MAX_SPECIAL_KERNEL_PORT) #define HOST_CHUD_PORT (9 + HOST_MAX_SPECIAL_KERNEL_PORT) @@ -97,8 +98,11 @@ #define HOST_SYSDIAGNOSE_PORT (16 + HOST_MAX_SPECIAL_KERNEL_PORT) #define HOST_XPC_EXCEPTION_PORT (17 + HOST_MAX_SPECIAL_KERNEL_PORT) #define HOST_CONTAINERD_PORT (18 + HOST_MAX_SPECIAL_KERNEL_PORT) -#define HOST_MAX_SPECIAL_PORT HOST_CONTAINERD_PORT - /* See rdar://19421223 */ +#define HOST_NODE_PORT (19 + HOST_MAX_SPECIAL_KERNEL_PORT) +#define HOST_RESOURCE_NOTIFY_PORT (20 + HOST_MAX_SPECIAL_KERNEL_PORT) + +#define HOST_MAX_SPECIAL_PORT HOST_RESOURCE_NOTIFY_PORT + /* MAX = last since rdar://19421223 */ /* * Special node identifier to always represent the local node. @@ -160,6 +164,12 @@ #define host_set_lockd_port(host, port) \ (host_set_special_port((host), HOST_LOCKD_PORT, (port))) +#define host_get_ktrace_background_port(host, port) \ + (host_get_special_port((host), \ + HOST_LOCAL_NODE, HOST_KTRACE_BACKGROUND_PORT, (port))) +#define host_set_ktrace_background_port(host, port) \ + (host_set_special_port((host), HOST_KTRACE_BACKGROUND_PORT, (port))) + #define host_get_kextd_port(host, port) \ (host_get_special_port((host), \ HOST_LOCAL_NODE, HOST_KEXTD_PORT, (port))) @@ -220,4 +230,14 @@ #define host_set_container_port(host, port) \ (host_set_special_port((host), HOST_CONTAINERD_PORT, (port))) +#define host_get_node_port(host, port) \ + (host_get_special_port((host), \ + HOST_LOCAL_NODE, HOST_NODE_PORT, (port))) +#define host_set_node_port(host, port) \ + (host_set_special_port((host), HOST_NODE_PORT, (port))) + +/* HOST_RESOURCE_NOTIFY_PORT doesn't #define these conveniences.
+ All lookups go through send_resource_violation() + */ + #endif /* _MACH_HOST_SPECIAL_PORTS_H_ */ diff --git a/osfmk/mach/i386/Makefile b/osfmk/mach/i386/Makefile index 26a04f650..6d9affad1 100644 --- a/osfmk/mach/i386/Makefile +++ b/osfmk/mach/i386/Makefile @@ -23,11 +23,11 @@ INSTALL_MD_LIST = ${DATAFILES} INSTALL_MD_LCL_LIST = ${PRIVATE_DATAFILES} INSTALL_MD_GEN_LIST = \ - asm.h + asm.h INSTALL_MD_DIR = mach/i386 -EXPORT_MD_LIST = ${DATAFILES} +EXPORT_MD_LIST = ${DATAFILES} EXPORT_MD_GEN_LIST = \ asm.h @@ -36,5 +36,3 @@ EXPORT_MD_DIR = mach/i386 include $(MakeInc_rule) include $(MakeInc_dir) - - diff --git a/osfmk/mach/i386/thread_status.h b/osfmk/mach/i386/thread_status.h index 9c682ee72..03cc0f91b 100644 --- a/osfmk/mach/i386/thread_status.h +++ b/osfmk/mach/i386/thread_status.h @@ -356,12 +356,13 @@ struct x86_64_intr_stack_frame { uint64_t ss; }; typedef struct x86_64_intr_stack_frame x86_64_intr_stack_frame_t; -/* Note: sizeof(x86_64_intr_stack_frame_t) must be a multiple of 16 bytes */ +_Static_assert((sizeof(x86_64_intr_stack_frame_t) % 16) == 0, + "interrupt stack frame size must be a multiple of 16 bytes"); /* * thread state format for task running in 64bit long mode * in long mode, the same hardware frame is always pushed regardless - * of whether there was a change in privlege level... therefore, there + * of whether there was a change in privilege level... 
therefore, there * is no need for an x86_saved_state64_from_kernel variant */ struct x86_saved_state64 { diff --git a/osfmk/mach/i386/vm_param.h b/osfmk/mach/i386/vm_param.h index b4edac3cf..965e596ae 100644 --- a/osfmk/mach/i386/vm_param.h +++ b/osfmk/mach/i386/vm_param.h @@ -252,7 +252,7 @@ #define PMAP_SET_CACHE_ATTR(mem, object, cache_attr, batch_pmap_op) \ MACRO_BEGIN \ - pmap_set_cache_attributes((mem)->phys_page, (cache_attr)); \ + pmap_set_cache_attributes(VM_PAGE_GET_PHYS_PAGE(mem), (cache_attr)); \ (object)->set_cache_attr = TRUE; \ (void) batch_pmap_op; \ MACRO_END diff --git a/osfmk/mach/i386/vm_types.h b/osfmk/mach/i386/vm_types.h index ecdc42070..7e590842a 100644 --- a/osfmk/mach/i386/vm_types.h +++ b/osfmk/mach/i386/vm_types.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -133,7 +133,7 @@ typedef mach_vm_address_t mach_port_context_t; #ifdef MACH_KERNEL_PRIVATE -#if VM32_SUPPORT +#ifdef VM32_SUPPORT /* * These are types used internal to Mach to implement the diff --git a/osfmk/default_pager/default_pager_alerts.defs b/osfmk/mach/ktrace_background.defs similarity index 78% rename from osfmk/default_pager/default_pager_alerts.defs rename to osfmk/mach/ktrace_background.defs index 5629769c3..553c7b26d 100644 --- a/osfmk/default_pager/default_pager_alerts.defs +++ b/osfmk/mach/ktrace_background.defs @@ -1,8 +1,8 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2015 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). 
You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,29 +22,28 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ + /* - * File: default_pager/default_pager_alerts.defs - * + * Interface definition for background available notifications for the + * kernel trace facility. */ -subsystem -#if KERNEL_USER - KernelUser +subsystem +#if KERNEL_USER + KernelUser #endif /* KERNEL_USER */ -#if KERNEL_SERVER - KernelServer -#endif /* KERNEL_SERVER */ - default_pager_alerts 2295; + ktrace_background 670; #include #include +serverprefix receive_; +userprefix send_; -simpleroutine default_pager_space_alert( - alert_port : mach_port_t; - in flags : int); +simpleroutine ktrace_background_available( + ktrace_background_port : mach_port_t); /* vim: set ft=c : */ diff --git a/osfmk/mach/mach_host.defs b/osfmk/mach/mach_host.defs index 1e4bb3d39..04c44fde2 100644 --- a/osfmk/mach/mach_host.defs +++ b/osfmk/mach/mach_host.defs @@ -275,7 +275,11 @@ skip; * Create a new voucher by running a series of commands against * pairs of resource attributes. 
*/ +#if !KERNEL && !LIBSYSCALL_INTERFACE +routine _kernelrpc_host_create_mach_voucher( +#else routine host_create_mach_voucher( +#endif host : host_t; recipes : mach_voucher_attr_raw_recipe_array_t; out voucher : ipc_voucher_t); diff --git a/osfmk/mach/mach_time.h b/osfmk/mach/mach_time.h index 16805896a..e4c703174 100644 --- a/osfmk/mach/mach_time.h +++ b/osfmk/mach/mach_time.h @@ -30,8 +30,8 @@ #define _MACH_MACH_TIME_H_ #include - #include +#include struct mach_timebase_info { uint32_t numer; @@ -53,7 +53,44 @@ kern_return_t mach_wait_until( #endif /* KERNEL */ uint64_t mach_absolute_time(void); + +__OSX_AVAILABLE_STARTING(__MAC_10_9, __IPHONE_8_0) uint64_t mach_approximate_time(void); + +/* + * like mach_absolute_time, but advances during sleep + */ +__OSX_AVAILABLE_STARTING(__MAC_10_12, __IPHONE_10_0) +__TVOS_AVAILABLE(__TVOS_10_0) +__WATCHOS_AVAILABLE(__WATCHOS_3_0) +uint64_t mach_continuous_time(void); + +/* + * like mach_approximate_time, but advances during sleep + */ +__OSX_AVAILABLE_STARTING(__MAC_10_12, __IPHONE_10_0) +__TVOS_AVAILABLE(__TVOS_10_0) +__WATCHOS_AVAILABLE(__WATCHOS_3_0) +uint64_t mach_continuous_approximate_time(void); + +#if !defined(KERNEL) && defined(PRIVATE) +// Forward declaration because this is a BSD value +struct timespec; + +__OSX_AVAILABLE_STARTING(__MAC_10_12, __IPHONE_10_0) +__TVOS_AVAILABLE(__TVOS_10_0) +__WATCHOS_AVAILABLE(__WATCHOS_3_0) +kern_return_t mach_get_times(uint64_t* absolute_time, + uint64_t* continuous_time, + struct timespec *tp); + +__OSX_AVAILABLE_STARTING(__MAC_10_12, __IPHONE_10_0) +__TVOS_AVAILABLE(__TVOS_10_0) +__WATCHOS_AVAILABLE(__WATCHOS_3_0) +uint64_t mach_boottime_usec(void); + +#endif /* !defined(KERNEL) && defined(PRIVATE) */ + __END_DECLS #endif /* _MACH_MACH_TIME_H_ */ diff --git a/osfmk/mach/mach_traps.h b/osfmk/mach/mach_traps.h index c33a308f4..e28dba07e 100644 --- a/osfmk/mach/mach_traps.h +++ b/osfmk/mach/mach_traps.h @@ -107,7 +107,7 @@ extern mach_msg_return_t mach_msg_overwrite_trap( mach_msg_size_t rcv_size, 
mach_port_name_t rcv_name, mach_msg_timeout_t timeout, - mach_port_name_t notify, + mach_msg_priority_t override, mach_msg_header_t *rcv_msg, mach_msg_size_t rcv_limit); @@ -177,6 +177,12 @@ extern kern_return_t _kernelrpc_mach_vm_map_trap( vm_prot_t cur_protection ); +extern kern_return_t _kernelrpc_mach_vm_purgable_control_trap( + mach_port_name_t target, + mach_vm_offset_t address, + vm_purgable_t control, + int *state); + extern kern_return_t _kernelrpc_mach_port_allocate_trap( mach_port_name_t target, mach_port_right_t right, @@ -253,6 +259,12 @@ extern kern_return_t _kernelrpc_mach_port_unguard_trap( uint64_t guard ); +extern kern_return_t mach_generate_activity_id( + mach_port_name_t target, + int count, + uint64_t *activity_id +); + extern kern_return_t macx_swapon( uint64_t filename, int flags, @@ -286,6 +298,18 @@ extern kern_return_t thread_switch( extern mach_port_name_t task_self_trap(void); +extern kern_return_t host_create_mach_voucher_trap( + mach_port_name_t host, + mach_voucher_attr_raw_recipe_array_t recipes, + int recipes_size, + mach_port_name_t *voucher); + +extern kern_return_t mach_voucher_extract_attr_recipe_trap( + mach_port_name_t voucher_name, + mach_voucher_attr_key_t key, + mach_voucher_attr_raw_recipe_t recipe, + mach_msg_type_number_t *recipe_size); + /* * Obsolete interfaces. 
*/ @@ -378,7 +402,7 @@ struct mach_msg_overwrite_trap_args { PAD_ARG_(mach_msg_size_t, rcv_size); PAD_ARG_(mach_port_name_t, rcv_name); PAD_ARG_(mach_msg_timeout_t, timeout); - PAD_ARG_(mach_port_name_t, notify); + PAD_ARG_(mach_msg_priority_t, override); PAD_ARG_8 PAD_ARG_(user_addr_t, rcv_msg); /* Unused on mach_msg_trap */ }; @@ -610,6 +634,16 @@ struct _kernelrpc_mach_vm_map_trap_args { extern kern_return_t _kernelrpc_mach_vm_map_trap( struct _kernelrpc_mach_vm_map_trap_args *args); +struct _kernelrpc_mach_vm_purgable_control_trap_args { + PAD_ARG_(mach_port_name_t, target); /* 1 word */ + PAD_ARG_(mach_vm_offset_t, address); /* 2 words */ + PAD_ARG_(vm_purgable_t, control); /* 1 word */ + PAD_ARG_(user_addr_t, state); /* 1 word */ +}; /* Total: 5 */ + +extern kern_return_t _kernelrpc_mach_vm_purgable_control_trap( + struct _kernelrpc_mach_vm_purgable_control_trap_args *args); + struct _kernelrpc_mach_port_allocate_args { PAD_ARG_(mach_port_name_t, target); PAD_ARG_(mach_port_right_t, right); @@ -710,6 +744,37 @@ struct _kernelrpc_mach_port_unguard_args { extern kern_return_t _kernelrpc_mach_port_unguard_trap( struct _kernelrpc_mach_port_unguard_args *args); +struct mach_generate_activity_id_args { + PAD_ARG_(mach_port_name_t, target); + PAD_ARG_(int, count); + PAD_ARG_(user_addr_t, activity_id); +}; +extern kern_return_t mach_generate_activity_id( + struct mach_generate_activity_id_args *args); + +/* + * Voucher trap interfaces + */ + +struct host_create_mach_voucher_args { + PAD_ARG_(mach_port_name_t, host); + PAD_ARG_(mach_voucher_attr_raw_recipe_array_t, recipes); + PAD_ARG_(int, recipes_size); + PAD_ARG_(user_addr_t, voucher); +}; +extern kern_return_t host_create_mach_voucher_trap( + struct host_create_mach_voucher_args *args); + +struct mach_voucher_extract_attr_recipe_args { + PAD_ARG_(mach_port_name_t, voucher_name); + PAD_ARG_(mach_voucher_attr_key_t, key); + PAD_ARG_(mach_voucher_attr_raw_recipe_t, recipe); + PAD_ARG_(user_addr_t, recipe_size); +}; + 
+extern kern_return_t mach_voucher_extract_attr_recipe_trap( + struct mach_voucher_extract_attr_recipe_args *args); + /* not published to LP64 clients yet */ struct iokit_user_client_trap_args { diff --git a/osfmk/mach/mach_types.defs b/osfmk/mach/mach_types.defs index 28d867651..4c4f7287b 100644 --- a/osfmk/mach/mach_types.defs +++ b/osfmk/mach/mach_types.defs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010, 2015 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -462,7 +462,7 @@ type vm_page_info_t = array[*:32] of int; type mach_vm_read_entry_t = array[512] of mach_vm_offset_t; type vm_read_entry_t = array[512] of vm_offset_t; -#if VM32_SUPPORT +#ifdef VM32_SUPPORT type vm32_read_entry_t = array[512] of vm32_offset_t; #endif @@ -574,6 +574,11 @@ type kmod_args_t = ^array[] of MACH_MSG_TYPE_BYTE type io_master_t = mach_port_t; type UNDServerRef = mach_port_t; +/* These must be kept in sync with definitions in osfmk/mach/dyld_kernel.h */ +type dyld_kernel_image_info_t = struct[40] of MACH_MSG_TYPE_BYTE; +type dyld_kernel_image_info_array_t = ^array[] of dyld_kernel_image_info_t; +type dyld_kernel_process_info_t = struct[64] of MACH_MSG_TYPE_BYTE; + #if KERNEL_SERVER #ifdef MACH_KERNEL_PRIVATE simport ; /* for voucher conversions */ diff --git a/osfmk/mach/mach_types.h b/osfmk/mach/mach_types.h index 8bfb9c4d1..ac870a2b5 100644 --- a/osfmk/mach/mach_types.h +++ b/osfmk/mach/mach_types.h @@ -107,6 +107,7 @@ #include #include #include +#include #ifdef KERNEL diff --git a/osfmk/mach/mach_vm.defs b/osfmk/mach/mach_vm.defs index 806215525..e0c7828e6 100644 --- a/osfmk/mach/mach_vm.defs +++ b/osfmk/mach/mach_vm.defs @@ -472,9 +472,9 @@ routine mach_make_memory_entry_64( * definition of the routine. 
*/ #if !defined(_MACH_VM_PUBLISH_AS_LOCAL_) -routine mach_vm_purgable_control( +routine PREFIX(mach_vm_purgable_control) ( #else -routine vm_purgable_control( +routine PREFIX(vm_purgable_control) ( #endif target_task : vm_map_t; address : mach_vm_address_t; diff --git a/osfmk/mach/mach_voucher.defs b/osfmk/mach/mach_voucher.defs index 6d370a5b2..3decdd8aa 100644 --- a/osfmk/mach/mach_voucher.defs +++ b/osfmk/mach/mach_voucher.defs @@ -42,7 +42,11 @@ routine mach_voucher_extract_attr_content( out content : mach_voucher_attr_content_t, CountInOut); /* extract a recipe to reconstitue a pair item in a future voucher */ +#if !KERNEL && !LIBSYSCALL_INTERFACE +routine _kernelrpc_mach_voucher_extract_attr_recipe( +#else routine mach_voucher_extract_attr_recipe( +#endif voucher : ipc_voucher_t; key : mach_voucher_attr_key_t; out recipe : mach_voucher_attr_raw_recipe_t, CountInOut); diff --git a/osfmk/mach/mach_voucher_types.h b/osfmk/mach/mach_voucher_types.h index 0c5a4b516..3eb982d5f 100644 --- a/osfmk/mach/mach_voucher_types.h +++ b/osfmk/mach/mach_voucher_types.h @@ -98,6 +98,7 @@ typedef mach_voucher_attr_key_t *mach_voucher_attr_key_array_t; #define MACH_VOUCHER_ATTR_KEY_ATM ((mach_voucher_attr_key_t)1) #define MACH_VOUCHER_ATTR_KEY_IMPORTANCE ((mach_voucher_attr_key_t)2) #define MACH_VOUCHER_ATTR_KEY_BANK ((mach_voucher_attr_key_t)3) +#define MACH_VOUCHER_ATTR_KEY_PTHPRIORITY ((mach_voucher_attr_key_t)4) #define MACH_VOUCHER_ATTR_KEY_USER_DATA ((mach_voucher_attr_key_t)7) #define MACH_VOUCHER_ATTR_KEY_BITS MACH_VOUCHER_ATTR_KEY_USER_DATA /* deprecated */ @@ -175,6 +176,9 @@ typedef mach_voucher_attr_raw_recipe_t mach_voucher_attr_raw_recipe_array_t; typedef mach_msg_type_number_t mach_voucher_attr_raw_recipe_size_t; typedef mach_msg_type_number_t mach_voucher_attr_raw_recipe_array_size_t; +#define MACH_VOUCHER_ATTR_MAX_RAW_RECIPE_ARRAY_SIZE 5120 +#define MACH_VOUCHER_TRAP_STACK_LIMIT 256 + #pragma pack() /* @@ -249,4 +253,9 @@ typedef uint32_t 
mach_voucher_attr_control_flags_t; #define MACH_VOUCHER_IMPORTANCE_ATTR_DROP_EXTERNAL 2 /* Drop some number of external refs */ typedef uint32_t mach_voucher_attr_importance_refs; +/* + * Activity id Generation defines + */ +#define MACH_ACTIVITY_ID_COUNT_MAX 16 + #endif /* _MACH_VOUCHER_TYPES_H_ */ diff --git a/osfmk/mach/machine.h b/osfmk/mach/machine.h index 09ea8bb8a..c057fb799 100644 --- a/osfmk/mach/machine.h +++ b/osfmk/mach/machine.h @@ -385,8 +385,6 @@ __END_DECLS #define CPUFAMILY_POWERPC_G4 0x77c184ae #define CPUFAMILY_POWERPC_G5 0xed76d8aa #define CPUFAMILY_INTEL_6_13 0xaa33392b -#define CPUFAMILY_INTEL_YONAH 0x73d67300 -#define CPUFAMILY_INTEL_MEROM 0x426f69ef #define CPUFAMILY_INTEL_PENRYN 0x78ea4fbc #define CPUFAMILY_INTEL_NEHALEM 0x6b5a4cd2 #define CPUFAMILY_INTEL_WESTMERE 0x573b5eec @@ -398,7 +396,7 @@ __END_DECLS #define CPUFAMILY_ARM_9 0xe73283ae #define CPUFAMILY_ARM_11 0x8ff620d8 #define CPUFAMILY_ARM_XSCALE 0x53b005f5 -#define CPUFAMILY_ARM_12 0xbd1b0ae9 +#define CPUFAMILY_ARM_12 0xbd1b0ae9 #define CPUFAMILY_ARM_13 0x0cc90e64 #define CPUFAMILY_ARM_14 0x96077ef1 #define CPUFAMILY_ARM_15 0xa8511bca @@ -406,15 +404,11 @@ __END_DECLS #define CPUFAMILY_ARM_CYCLONE 0x37a09642 #define CPUFAMILY_ARM_TYPHOON 0x2c91a47e #define CPUFAMILY_ARM_TWISTER 0x92fb37c8 +#define CPUFAMILY_ARM_HURRICANE 0x67ceee93 /* The following synonyms are deprecated: */ -#define CPUFAMILY_INTEL_6_14 CPUFAMILY_INTEL_YONAH -#define CPUFAMILY_INTEL_6_15 CPUFAMILY_INTEL_MEROM #define CPUFAMILY_INTEL_6_23 CPUFAMILY_INTEL_PENRYN #define CPUFAMILY_INTEL_6_26 CPUFAMILY_INTEL_NEHALEM -#define CPUFAMILY_INTEL_CORE CPUFAMILY_INTEL_YONAH -#define CPUFAMILY_INTEL_CORE2 CPUFAMILY_INTEL_MEROM - #endif /* _MACH_MACHINE_H_ */ diff --git a/osfmk/mach/machine/Makefile b/osfmk/mach/machine/Makefile index 615a37e59..5034c34d0 100644 --- a/osfmk/mach/machine/Makefile +++ b/osfmk/mach/machine/Makefile @@ -3,11 +3,9 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export 
MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - include $(MakeInc_cmd) include $(MakeInc_def) - DATAFILES = \ asm.h boolean.h exception.h kern_return.h ndr_def.h rpc.h \ processor_info.h thread_state.h thread_status.h \ @@ -29,5 +27,3 @@ EXPORT_MI_DIR = mach/machine include $(MakeInc_rule) include $(MakeInc_dir) - - diff --git a/osfmk/mach/machine/sdt.h b/osfmk/mach/machine/sdt.h index 3bdba92a9..599d6b944 100644 --- a/osfmk/mach/machine/sdt.h +++ b/osfmk/mach/machine/sdt.h @@ -151,7 +151,20 @@ DTRACE_CALL10ARGS(provider, name) \ } +#else +#define DTRACE_PROBE(provider, name) do {} while(0) +#define DTRACE_PROBE1(provider, name, arg0) do {} while(0) +#define DTRACE_PROBE2(provider, name, arg0, arg1) do {} while(0) +#define DTRACE_PROBE3(provider, name, arg0, arg1, arg2) do {} while(0) +#define DTRACE_PROBE4(provider, name, arg0, arg1, arg2, arg3) do {} while(0) +#define DTRACE_PROBE5(provider, name, arg0, arg1, arg2, arg3, arg4) do {} while(0) +#define DTRACE_PROBE6(provider, name, arg0, arg1, arg2, arg3, arg4, arg5) do {} while(0) +#define DTRACE_PROBE7(provider, name, arg0, arg1, arg2, arg3, arg4, arg5, arg6) do {} while(0) +#define DTRACE_PROBE8(provider, name, arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7) do {} while(0) +#define DTRACE_PROBE9(provider, name, arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8) do {} while(0) +#define DTRACE_PROBE10(provider, name, arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9) do {} while(0) +#endif /* CONFIG_DTRACE */ #define DTRACE_SCHED(name) \ DTRACE_PROBE(__sched_, name); @@ -205,6 +218,16 @@ type3, arg3, type4, arg4, type5, arg5) \ DTRACE_PROBE5(__sdt_, name, arg1, arg2, arg3, arg4, arg5); +#define DTRACE_MEMORYSTATUS2(name, type1, arg1, type2, arg2) \ + DTRACE_PROBE2(__sdt_, name, arg1, arg2); + +#define DTRACE_MEMORYSTATUS3(name, type1, arg1, type2, arg2, type3, arg3) \ + DTRACE_PROBE3(__sdt_, name, arg1, arg2, arg3); + +#define 
DTRACE_MEMORYSTATUS6(name, type1, arg1, type2, arg2, \ + type3, arg3, type4, arg4, type5, arg5, type6, arg6) \ + DTRACE_PROBE6(__vminfo_, name, arg1, arg2, arg3, arg4, arg5, arg6) + #define DTRACE_TMR3(name, type1, arg1, type2, arg2, type3, arg3) \ DTRACE_PROBE3(__sdt_, name, arg1, arg2, arg3); @@ -243,6 +266,10 @@ type3, arg3, type4, arg4, type5, arg5) \ DTRACE_PROBE5(__vminfo_, name, arg1, arg2, arg3, arg4, arg5) +#define DTRACE_VM6(name, type1, arg1, type2, arg2, \ + type3, arg3, type4, arg4, type5, arg5, type6, arg6) \ + DTRACE_PROBE6(__vminfo_, name, arg1, arg2, arg3, arg4, arg5, arg6) + #define DTRACE_IP(name) \ DTRACE_PROBE(__ip_, name) @@ -351,71 +378,8 @@ type3, arg3, type4, arg4, type5, arg5, type6, arg6) \ DTRACE_PROBE6(__boost_, name, arg1, arg2, arg3, arg4, arg5, arg6); -#else /* CONFIG_DTRACE */ - -#define DTRACE_SCHED(name) do {} while (0) -#define DTRACE_SCHED1(name, type1, arg1) do {} while (0) -#define DTRACE_SCHED2(name, type1, arg1, type2, arg2) do {} while (0) -#define DTRACE_SCHED3(name, type1, arg1, type2, arg2, type3, arg3) do {} while (0) -#define DTRACE_SCHED4(name, type1, arg1, type2, arg2, type3, arg3, type4, arg4) do {} while (0) - -#define DTRACE_PROC(name) do {} while(0) -#define DTRACE_PROC1(name, type1, arg1) do {} while(0) -#define DTRACE_PROC2(name, type1, arg1, type2, arg2) do {} while (0) -#define DTRACE_PROC3(name, type1, arg1, type2, arg2, type3, arg3) do {} while (0) -#define DTRACE_PROC4(name, type1, arg1, type2, arg2, type3, arg3, type4, arg4) do {} while(0) -#define DTRACE_IO(name) do {} while(0) -#define DTRACE_IO1(name, type1, arg1) do {} while(0) -#define DTRACE_IO2(name, type1, arg1, type2, arg2) do {} while(0) -#define DTRACE_IO3(name, type1, arg1, type2, arg2, type3, arg3) do {} while(0) -#define DTRACE_IO4(name, type1, arg1, type2, arg2, type3, arg3, type4, arg4) do {} while(0) -#define DTRACE_INT5(name, type1, arg1, type2, arg2, type3, arg3, type4, arg4, type5, arg5) do {} while(0) -#define DTRACE_TMR3(name, type1, 
arg1, type2, arg2, type3, arg3) do {} while(0) - -#define DTRACE_VM(name) do {} while(0) -#define DTRACE_VM1(name, type1, arg1) do {} while(0) -#define DTRACE_VM2(name, type1, arg1, type2, arg2) do {} while(0) -#define DTRACE_VM3(name, type1, arg1, type2, arg2, type3, arg3) do {} while(0) -#define DTRACE_VM4(name, type1, arg1, type2, arg2, type3, arg3, type4, arg4) do {} while(0) -#define DTRACE_VM5(name, type1, arg1, type2, arg2, type3, arg3, type4, arg4, type5, arg5) do {} while(0) -#define DTRACE_IP(name) do {} while(0) -#define DTRACE_IP1(name, type1, arg1) do {} while(0) -#define DTRACE_IP2(name, type1, arg1, type2, arg2) do {} while(0) -#define DTRACE_IP3(name, type1, arg1, type2, arg2, type3, arg3) do {} while(0) -#define DTRACE_IP4(name, type1, arg1, type2, arg2, type3, arg3, type4, arg4) do {} while(0) -#define DTRACE_IP5(name, type1, arg1, type2, arg2, type3, arg3, type4, arg4, type5, arg5) do {} while(0) -#define DTRACE_IP6(name, type1, arg1, type2, arg2, type3, arg3, type4, arg4, type5, arg5, type6, arg6) do {} while(0) -#define DTRACE_IP7(name, type1, arg1, type2, arg2, type3, arg3, type4, arg4, type5, arg5, \ - type6, arg6, type7, arg7) do {} while(0) - -#define DTRACE_TCP(name) do {} while(0) -#define DTRACE_TCP1(name, type1, arg1) do {} while(0) -#define DTRACE_TCP2(name, type1, arg1, type2, arg2) do {} while(0) -#define DTRACE_TCP3(name, type1, arg1, type2, arg2, type3, arg3) do {} while(0) -#define DTRACE_TCP4(name, type1, arg1, type2, arg2, type3, arg3, type4, arg4) do {} while(0) -#define DTRACE_TCP5(name, type1, arg1, type2, arg2, type3, arg3, type4, arg4, type5, arg5) do {} while(0) - -#define DTRACE_MPTCP(name) do {} while(0) -#define DTRACE_MPTCP1(name, type1, arg1) do {} while(0) -#define DTRACE_MPTCP2(name, type1, arg1, type2, arg2) do {} while(0) -#define DTRACE_MPTCP3(name, type1, arg1, type2, arg2, type3, arg3) do {} while(0) -#define DTRACE_MPTCP4(name, type1, arg1, type2, arg2, type3, arg3, type4, arg4) do {} while(0) -#define 
DTRACE_MPTCP5(name, type1, arg1, type2, arg2, type3, arg3, type4, arg4, type5, arg5) do {} while(0) -#define DTRACE_MPTCP6(name, type1, arg1, type2, arg2, type3, arg3, type4, arg4, type5, arg5, type6, arg6) do {} while(0) -#define DTRACE_MPTCP7(name, type1, arg1, type2, arg2, type3, arg3, type4, arg4, type5, arg5, type6, arg6, type7, arg7) do {} while(0) - -#define DTRACE_FSINFO(name, type, vp) do {} while(0) -#define DTRACE_FSINFO_IO(name, type1, vp, type2, size) do {} while (0) - -#define DTRACE_BOOST(name) do {} while(0) -#define DTRACE_BOOST1(name, type1, arg1) do {} while(0) -#define DTRACE_BOOST2(name, type1, arg1, type2, arg2) do {} while(0) -#define DTRACE_BOOST3(name, type1, arg1, type2, arg2, type3, arg3) do {} while(0) -#define DTRACE_BOOST4(name, type1, arg1, type2, arg2, type3, arg3, type4, arg4) do {} while(0) -#define DTRACE_BOOST5(name, type1, arg1, type2, arg2, type3, arg3, type4, arg4, type5, arg5) do {} while(0) -#define DTRACE_BOOST6(name, type1, arg1, type2, arg2, type3, arg3, type4, arg4, type5, arg5, type6, arg6) do {} while(0) - -#endif /* CONFIG_DTRACE */ +#if PRIVATE +#endif /* PRIVATE */ #endif /* KERNEL */ diff --git a/osfmk/mach/memory_object_types.h b/osfmk/mach/memory_object_types.h index 2adf2b77a..433dda23d 100644 --- a/osfmk/mach/memory_object_types.h +++ b/osfmk/mach/memory_object_types.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Computer, Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -380,6 +380,7 @@ typedef struct memory_object_attr_info memory_object_attr_info_data_t; & 0xFF000000) | ((flags) & 0xFFFFFF)); /* leave room for vm_prot bits */ +#define MAP_MEM_GRAB_SECLUDED 0x008000 /* can grab secluded pages */ #define MAP_MEM_ONLY 0x010000 /* change processor caching */ #define MAP_MEM_NAMED_CREATE 0x020000 /* create extant object */ #define MAP_MEM_PURGABLE 0x040000 /* create a purgable VM object */ @@ -411,7 +412,7 @@ struct upl_page_info { ppnum_t phys_addr; /* physical page index number */ unsigned int #ifdef XNU_KERNEL_PRIVATE - pageout:1, /* page is to be removed on commit */ + free_when_done:1,/* page is to be freed on commit */ absent:1, /* No valid data in this page */ dirty:1, /* Page must be cleaned (O) */ precious:1, /* must be cleaned, we have only copy */ @@ -669,15 +670,15 @@ typedef uint64_t upl_control_flags_t; (((upl)[(index)].phys_addr != 0) ? (!((upl)[(index)].absent)) : FALSE) #define UPL_PAGEOUT_PAGE(upl, index) \ - (((upl)[(index)].phys_addr != 0) ? ((upl)[(index)].pageout) : FALSE) + (((upl)[(index)].phys_addr != 0) ? ((upl)[(index)].free_when_done) : FALSE) #define UPL_SET_PAGE_FREE_ON_COMMIT(upl, index) \ (((upl)[(index)].phys_addr != 0) ? \ - ((upl)[(index)].pageout = TRUE) : FALSE) + ((upl)[(index)].free_when_done = TRUE) : FALSE) #define UPL_CLR_PAGE_FREE_ON_COMMIT(upl, index) \ (((upl)[(index)].phys_addr != 0) ? 
\ - ((upl)[(index)].pageout = FALSE) : FALSE) + ((upl)[(index)].free_when_done = FALSE) : FALSE) #define UPL_REPRIO_INFO_BLKNO(upl, index) \ (((upl)->upl_reprio_info[(index)]) & UPL_REPRIO_INFO_MASK) diff --git a/osfmk/mach/message.h b/osfmk/mach/message.h index 9b483dd5a..74fe65b24 100644 --- a/osfmk/mach/message.h +++ b/osfmk/mach/message.h @@ -222,20 +222,23 @@ typedef unsigned int mach_msg_bits_t; typedef natural_t mach_msg_size_t; typedef integer_t mach_msg_id_t; - #define MACH_MSG_SIZE_NULL (mach_msg_size_t *) 0 +typedef unsigned int mach_msg_priority_t; + +#define MACH_MSG_PRIORITY_UNSPECIFIED (mach_msg_priority_t) 0 + typedef unsigned int mach_msg_type_name_t; -#define MACH_MSG_TYPE_MOVE_RECEIVE 16 /* Must hold receive right */ -#define MACH_MSG_TYPE_MOVE_SEND 17 /* Must hold send right(s) */ -#define MACH_MSG_TYPE_MOVE_SEND_ONCE 18 /* Must hold sendonce right */ -#define MACH_MSG_TYPE_COPY_SEND 19 /* Must hold send right(s) */ -#define MACH_MSG_TYPE_MAKE_SEND 20 /* Must hold receive right */ -#define MACH_MSG_TYPE_MAKE_SEND_ONCE 21 /* Must hold receive right */ -#define MACH_MSG_TYPE_COPY_RECEIVE 22 /* NOT VALID */ -#define MACH_MSG_TYPE_DISPOSE_RECEIVE 24 /* must hold receive right */ -#define MACH_MSG_TYPE_DISPOSE_SEND 25 /* must hold send right(s) */ +#define MACH_MSG_TYPE_MOVE_RECEIVE 16 /* Must hold receive right */ +#define MACH_MSG_TYPE_MOVE_SEND 17 /* Must hold send right(s) */ +#define MACH_MSG_TYPE_MOVE_SEND_ONCE 18 /* Must hold sendonce right */ +#define MACH_MSG_TYPE_COPY_SEND 19 /* Must hold send right(s) */ +#define MACH_MSG_TYPE_MAKE_SEND 20 /* Must hold receive right */ +#define MACH_MSG_TYPE_MAKE_SEND_ONCE 21 /* Must hold receive right */ +#define MACH_MSG_TYPE_COPY_RECEIVE 22 /* NOT VALID */ +#define MACH_MSG_TYPE_DISPOSE_RECEIVE 24 /* must hold receive right */ +#define MACH_MSG_TYPE_DISPOSE_SEND 25 /* must hold send right(s) */ #define MACH_MSG_TYPE_DISPOSE_SEND_ONCE 26 /* must hold sendonce right */ typedef unsigned int 
mach_msg_copy_options_t; @@ -658,6 +661,7 @@ typedef integer_t mach_msg_option_t; #define MACH_RCV_LARGE_IDENTITY 0x00000008 /* identify source of large messages */ #define MACH_SEND_TIMEOUT 0x00000010 /* timeout value applies to send */ +#define MACH_SEND_OVERRIDE 0x00000020 /* priority override for send */ #define MACH_SEND_INTERRUPT 0x00000040 /* don't restart interrupted sends */ #define MACH_SEND_NOTIFY 0x00000080 /* arm send-possible notify */ #define MACH_SEND_ALWAYS 0x00010000 /* ignore qlimits - kernel only */ @@ -673,6 +677,20 @@ typedef integer_t mach_msg_option_t; #define MACH_RCV_VOUCHER 0x00000800 /* willing to receive voucher port */ #define MACH_RCV_OVERWRITE 0x00001000 /* scatter receive (deprecated) */ +#ifdef XNU_KERNEL_PRIVATE + +#define MACH_RCV_STACK 0x00002000 /* receive into highest addr of buffer */ + +/* + * NOTE: + * This internal-only flag is intended for use by a single thread per-port/set! + * If more than one thread attempts to MACH_PEEK_MSG on a port or set, one of + * the threads may miss messages (in fact, it may never wake up). 
+ */ +#define MACH_PEEK_MSG 0x00100000 /* receive, but leave msgs queued */ + +#endif + /* * NOTE: a 0x00------ RCV mask implies to ask for * a MACH_MSG_TRAILER_FORMAT_0 with 0 Elements, @@ -698,13 +716,13 @@ typedef integer_t mach_msg_option_t; #ifdef MACH_KERNEL_PRIVATE /* The options that the kernel honors when passed from user space */ -#define MACH_SEND_USER (MACH_SEND_MSG | \ - MACH_SEND_TIMEOUT | MACH_SEND_NOTIFY | \ - MACH_SEND_TRAILER | MACH_SEND_NOIMPORTANCE ) +#define MACH_SEND_USER (MACH_SEND_MSG | MACH_SEND_TIMEOUT | \ + MACH_SEND_NOTIFY | MACH_SEND_OVERRIDE | \ + MACH_SEND_TRAILER | MACH_SEND_NOIMPORTANCE ) -#define MACH_RCV_USER (MACH_RCV_MSG | MACH_RCV_TIMEOUT | \ - MACH_RCV_LARGE | MACH_RCV_LARGE_IDENTITY | \ - MACH_RCV_VOUCHER | MACH_RCV_TRAILER_MASK) +#define MACH_RCV_USER (MACH_RCV_MSG | MACH_RCV_TIMEOUT | \ + MACH_RCV_LARGE | MACH_RCV_LARGE_IDENTITY | \ + MACH_RCV_VOUCHER | MACH_RCV_TRAILER_MASK) #define MACH_MSG_OPTION_USER (MACH_SEND_USER | MACH_RCV_USER) @@ -853,6 +871,13 @@ typedef kern_return_t mach_msg_return_t; #define MACH_RCV_IN_PROGRESS_TIMED 0x10004011 /* Waiting for receive with timeout. (Internal use only.) */ +#ifdef XNU_KERNEL_PRIVATE +#define MACH_PEEK_IN_PROGRESS 0x10008001 + /* Waiting for a peek. (Internal use only.) */ +#define MACH_PEEK_READY 0x10008002 + /* Waiting for a peek. (Internal use only.) 
*/ +#endif + __BEGIN_DECLS @@ -916,8 +941,10 @@ extern kern_return_t mach_voucher_deallocate( #elif defined(MACH_KERNEL_PRIVATE) -extern mach_msg_return_t mach_msg_receive_results(void); +extern mach_msg_return_t mach_msg_receive_results(mach_msg_size_t *size); +extern mach_msg_priority_t mach_msg_priority_combine(mach_msg_priority_t msg_qos, + mach_msg_priority_t recv_qos); #endif /* KERNEL */ __END_DECLS diff --git a/osfmk/mach/mig.h b/osfmk/mach/mig.h index 3d7655076..f6bf29a70 100644 --- a/osfmk/mach/mig.h +++ b/osfmk/mach/mig.h @@ -273,6 +273,7 @@ extern void mig_put_reply_port(mach_port_t reply_port); /* Bounded string copy */ extern int mig_strncpy(char *dest, const char *src, int len); +extern int mig_strncpy_zerofill(char *dest, const char *src, int len); #ifdef KERNEL_PRIVATE diff --git a/osfmk/mach/mig_strncpy_zerofill_support.h b/osfmk/mach/mig_strncpy_zerofill_support.h new file mode 100644 index 000000000..47cdc6159 --- /dev/null +++ b/osfmk/mach/mig_strncpy_zerofill_support.h @@ -0,0 +1,8 @@ +//This dummy header file is created for mig to check when to call mig_strncpy_zerofill. +//Mig checks if this file is available to include and knows that Libsyscall has the new mig_strncpy_zerofill symbols to link to. +//Do not delete this file, mig will stop calling mig_strncpy_zerofill. + +#ifndef __MACH_MIG_STRNCPY_ZEROFILL_SUPPORT__ +#define __MACH_MIG_STRNCPY_ZEROFILL_SUPPORT__ + +#endif // __MACH_MIG_STRNCPY_ZEROFILL_SUPPORT__ diff --git a/bsd/hfs/hfs_encodings.h b/osfmk/mach/resource_monitors.h similarity index 52% rename from bsd/hfs/hfs_encodings.h rename to osfmk/mach/resource_monitors.h index f93ed4666..a6bad0b6c 100644 --- a/bsd/hfs/hfs_encodings.h +++ b/osfmk/mach/resource_monitors.h @@ -1,8 +1,8 @@ /* - * Copyright (c) 2000-2002, 2005 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,50 +22,53 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ + +#ifndef _MACH_RESOURCE_MONITORS_H_ +#define _MACH_RESOURCE_MONITORS_H_ + +#include /* PATH_MAX */ +#ifndef XNU_KERNEL_PRIVATE +#include +#endif + +__BEGIN_DECLS + /* - * Copyright (c) 1997-2000 Apple Computer, Inc. All Rights Reserved + * resource_notify_flags_t + * The top 32 bits are common flags, the bottom for per-call flags. */ +typedef uint64_t resource_notify_flags_t; +#define kRNFlagsNone 0 + +/* Flags applicable to any monitors. */ +#define kRNFatalLimitFlag (1ULL << 32) -#ifndef _HFS_ENCODINGS_H_ -#define _HFS_ENCODINGS_H_ +/* For the disk writes I/O monitor. + The default is logical writes. 
*/ +#define kRNPhysicalWritesFlag (1ULL << 1) + +/* TEMPORARY compatibility, to be removed */ +#define kCPUTriggerFatalFlag kRNFatalLimitFlag -#include -#ifdef __APPLE_API_UNSTABLE -#define CTL_HFS_NAMES { \ - { 0, 0 }, \ - { "encodingbias", CTLTYPE_INT }, \ -} /* - * HFS Filename Encoding Converters Interface - * - * Private Interface for adding hfs filename - * encoding converters. These are not needed - * for HFS Plus volumes (since they already - * have Unicode filenames). + * Process name types for proc_internal.h. + * proc_name_t is used by resource_notify.defs clients in user space. * - * Used by HFS Encoding Converter Kernel Modules - * (like HFS_Japanese.kmod) to register their - * encoding conversion routines. + * MAXCOMLEN is defined in bsd/sys/param.h which we can neither include + * (type conflicts) nor modify (POSIX). */ +#define MAXCOMLEN 16 -typedef int (* hfs_to_unicode_func_t)(const Str31 hfs_str, UniChar *uni_str, - u_int32_t maxCharLen, u_int32_t *usedCharLen); - -typedef int (* unicode_to_hfs_func_t)(UniChar *uni_str, u_int32_t unicodeChars, - Str31 hfs_str); - -int hfs_addconverter(int kmod_id, u_int32_t encoding, - hfs_to_unicode_func_t get_unicode, - unicode_to_hfs_func_t get_hfsname); - -int hfs_remconverter(int kmod_id, u_int32_t encoding); +typedef char command_t[MAXCOMLEN+1]; +typedef char proc_name_t[2*MAXCOMLEN+1]; +typedef char posix_path_t[PATH_MAX]; -#endif /* __APPLE_API_UNSTABLE */ +__END_DECLS -#endif /* ! _HFS_ENCODINGS_H_ */ +#endif /* _MACH_RESOURCE_MONITORS_H_ */ diff --git a/osfmk/mach/resource_notify.defs b/osfmk/mach/resource_notify.defs new file mode 100644 index 000000000..e4153c2f2 --- /dev/null +++ b/osfmk/mach/resource_notify.defs @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2016 Apple Inc. All rights reserved. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include // mach_port_t +#include // mach_timespec_t + +#if KERNEL_USER +import ; +import ; +#else +import ; +import ; +#endif + +// match struct proc.p_name / proc_name_t as of January 2016 +#define MAXCOMLEN 16 +type proc_name_t = array [2*MAXCOMLEN+1] of char; +type posix_path_t = array [512] of char; // ?? can't compile w/1024? +type resource_notify_flags_t = uint64_t; + +/* The kernel sends the message, so we compile with KernelUser when + building in the kernel. 
*/ +subsystem +#if KERNEL_USER + KernelUser +#endif + resource_notify 827800; /* 'R''N'00 */ + +UserPrefix send_; +ServerPrefix receive_; + + +SimpleRoutine cpu_usage_violation( + receiver : mach_port_t; + + /* violator */ + procname : proc_name_t; + pid : int; + killed_proc_path : posix_path_t; /* filled in if fatal */ + + /* violation */ + timestamp : mach_timespec_t; /* 32b time, see 25567702 */ + observed_cpu_nsecs : int64_t; + observation_nsecs : int64_t; /* time it took to hit limit */ + + /* threshold crossed: calculated from proc_set_cpumon_params() */ + cpu_nsecs_allowed : int64_t; + limit_window_nsecs : int64_t; /* over this period */ + + flags : resource_notify_flags_t +); + +SimpleRoutine cpu_wakes_violation( + receiver : mach_port_t; + + /* violator */ + procname : proc_name_t; + pid : int; + killed_proc_path : posix_path_t; /* filled in if fatal */ + + /* violation */ + timestamp : mach_timespec_t; + observed_cpu_wakes : int64_t; + observation_nsecs : int64_t; /* time it took to hit limit */ + + /* threshold crossed: calculated from proc_set_wakemon_params() */ + cpu_wakes_allowed : int64_t; + limit_window_nsecs : int64_t; /* over this period */ + + flags : resource_notify_flags_t +); + +SimpleRoutine disk_writes_violation( + receiver : mach_port_t; + + /* violator */ + procname : proc_name_t; + pid : int; + killed_proc_path : posix_path_t; /* filled in if fatal */ + + /* violation */ + timestamp : mach_timespec_t; + observed_bytes_dirtied : int64_t; + observation_nsecs : int64_t; /* time it took to hit limit */ + + /* threshold */ + bytes_dirtied_allowed : int64_t; + limit_window_nsecs : int64_t; /* over this period */ + + flags : resource_notify_flags_t +); diff --git a/osfmk/mach/shared_region.h b/osfmk/mach/shared_region.h index 7f1d3fbb0..e460db001 100644 --- a/osfmk/mach/shared_region.h +++ b/osfmk/mach/shared_region.h @@ -44,8 +44,8 @@ #define SHARED_REGION_BASE_X86_64 0x00007FFF70000000ULL #define SHARED_REGION_SIZE_X86_64 
0x000000008FE00000ULL -#define SHARED_REGION_NESTING_BASE_X86_64 0x00007FFF80000000ULL -#define SHARED_REGION_NESTING_SIZE_X86_64 0x0000000040000000ULL +#define SHARED_REGION_NESTING_BASE_X86_64 0x00007FFF70000000ULL +#define SHARED_REGION_NESTING_SIZE_X86_64 0x000000008FE00000ULL #define SHARED_REGION_NESTING_MIN_X86_64 0x0000000000200000ULL #define SHARED_REGION_NESTING_MAX_X86_64 0xFFFFFFFFFFE00000ULL @@ -63,10 +63,10 @@ #define SHARED_REGION_NESTING_MIN_PPC64 0x0000000010000000ULL #define SHARED_REGION_NESTING_MAX_PPC64 0x0000000010000000ULL -#define SHARED_REGION_BASE_ARM 0x20000000ULL -#define SHARED_REGION_SIZE_ARM 0x20000000ULL -#define SHARED_REGION_NESTING_BASE_ARM 0x20000000ULL -#define SHARED_REGION_NESTING_SIZE_ARM 0x20000000ULL +#define SHARED_REGION_BASE_ARM 0x1A000000ULL +#define SHARED_REGION_SIZE_ARM 0x26000000ULL +#define SHARED_REGION_NESTING_BASE_ARM 0x1A000000ULL +#define SHARED_REGION_NESTING_SIZE_ARM 0x26000000ULL #define SHARED_REGION_NESTING_MIN_ARM ? #define SHARED_REGION_NESTING_MAX_ARM ? @@ -74,9 +74,9 @@ /* ARM64_TODO: move to higher memory */ #endif #define SHARED_REGION_BASE_ARM64 0x180000000ULL -#define SHARED_REGION_SIZE_ARM64 0x28000000ULL +#define SHARED_REGION_SIZE_ARM64 0x40000000ULL #define SHARED_REGION_NESTING_BASE_ARM64 0x180000000ULL -#define SHARED_REGION_NESTING_SIZE_ARM64 0x28000000ULL +#define SHARED_REGION_NESTING_SIZE_ARM64 0x40000000ULL #define SHARED_REGION_NESTING_MIN_ARM64 ? #define SHARED_REGION_NESTING_MAX_ARM64 ? diff --git a/osfmk/mach/std_types.h b/osfmk/mach/std_types.h index f5b838ab2..2ad966c71 100644 --- a/osfmk/mach/std_types.h +++ b/osfmk/mach/std_types.h @@ -69,4 +69,7 @@ #include #include +#include +#include + #endif /* _MACH_STD_TYPES_H_ */ diff --git a/osfmk/mach/syscall_sw.h b/osfmk/mach/syscall_sw.h index b326683d6..8ed6e7a68 100644 --- a/osfmk/mach/syscall_sw.h +++ b/osfmk/mach/syscall_sw.h @@ -84,6 +84,7 @@ * procedure call standard; we pad for 64-bit args. 
*/ kernel_trap(_kernelrpc_mach_vm_allocate_trap,-10,5) /* 4 args, +1 for mach_vm_size_t */ +kernel_trap(_kernelrpc_mach_vm_purgable_control_trap,-11,5) /* 4 args, +1 for mach_vm_offset_t */ kernel_trap(_kernelrpc_mach_vm_deallocate_trap,-12,5) /* 3 args, +2 for mach_vm_size_t and mach_vm_address_t */ kernel_trap(_kernelrpc_mach_vm_protect_trap,-14,7) /* 5 args, +2 for mach_vm_address_t and mach_vm_size_t */ kernel_trap(_kernelrpc_mach_vm_map_trap,-15,9) @@ -115,6 +116,7 @@ kernel_trap(semaphore_timedwait_signal_trap,-39,4) kernel_trap(_kernelrpc_mach_port_guard_trap,-41,5) kernel_trap(_kernelrpc_mach_port_unguard_trap,-42,4) +kernel_trap(mach_generate_activity_id, -43, 3) kernel_trap(task_name_for_pid,-44,3) kernel_trap(task_for_pid,-45,3) @@ -139,7 +141,15 @@ kernel_trap(swtch,-60,0) kernel_trap(syscall_thread_switch,-61,3) kernel_trap(clock_sleep_trap,-62,5) -kernel_trap(mach_timebase_info,-89,1) +/* voucher traps */ +kernel_trap(host_create_mach_voucher_trap,-70,4) +/* mach_voucher_extract_attr_content */ +kernel_trap(mach_voucher_extract_attr_recipe_trap,-72,4) +/* mach_voucher_extract_all_attr_recipes */ +/* mach_voucher_attr_command */ +/* mach_voucher_debug_info */ + +kernel_trap(mach_timebase_info_trap,-89,1) #if defined(__LP64__) /* unit64_t arguments passed in one register in LP64 */ diff --git a/osfmk/mach/task.defs b/osfmk/mach/task.defs index 4d4db4dfe..df4e65ab5 100644 --- a/osfmk/mach/task.defs +++ b/osfmk/mach/task.defs @@ -450,5 +450,47 @@ routine task_swap_mach_voucher( new_voucher : ipc_voucher_t; inout old_voucher : ipc_voucher_t); +routine task_generate_corpse( + task :task_t; + out corpse_task_port:mach_port_t); + +routine task_map_corpse_info( + task :task_t; + corpse_task :task_t; + out kcd_addr_begin :vm_address_t; + out kcd_size :uint32_t); + +routine task_register_dyld_image_infos( + task :task_t; + dyld_images :dyld_kernel_image_info_array_t); + +routine task_unregister_dyld_image_infos( + task :task_t; + dyld_images 
:dyld_kernel_image_info_array_t); + +routine task_get_dyld_image_infos( + task :task_t; + out dyld_images :dyld_kernel_image_info_array_t); + +routine task_register_dyld_shared_cache_image_info( + task :task_t; + dyld_cache_image :dyld_kernel_image_info_t; + no_cache :boolean_t; + private_cache :boolean_t); + +routine task_register_dyld_set_dyld_state( + task :task_t; + dyld_state :uint8_t); + +routine task_register_dyld_get_process_state( + task :task_t; + out dyld_process_state :dyld_kernel_process_info_t); + +routine task_map_corpse_info_64( + task :task_t; + corpse_task :task_t; + out kcd_addr_begin :mach_vm_address_t; + out kcd_size :mach_vm_size_t); + /* vim: set ft=c : */ diff --git a/osfmk/mach/task_info.h b/osfmk/mach/task_info.h index 3311e3c15..ebb290871 100644 --- a/osfmk/mach/task_info.h +++ b/osfmk/mach/task_info.h @@ -324,14 +324,20 @@ struct task_vm_info { /* added for rev1 */ mach_vm_size_t phys_footprint; + + /* added for rev2 */ + mach_vm_address_t min_address; + mach_vm_address_t max_address; }; typedef struct task_vm_info task_vm_info_data_t; typedef struct task_vm_info *task_vm_info_t; #define TASK_VM_INFO_COUNT ((mach_msg_type_number_t) \ (sizeof (task_vm_info_data_t) / sizeof (natural_t))) +#define TASK_VM_INFO_REV2_COUNT TASK_VM_INFO_COUNT +#define TASK_VM_INFO_REV1_COUNT /* doesn't include min and max address */ \ + ((mach_msg_type_number_t) (TASK_VM_INFO_REV2_COUNT - 4)) #define TASK_VM_INFO_REV0_COUNT /* doesn't include phys_footprint */ \ - ((mach_msg_type_number_t) \ - (TASK_VM_INFO_COUNT - 2)) + ((mach_msg_type_number_t) (TASK_VM_INFO_REV1_COUNT - 2)) typedef struct vm_purgeable_info task_purgable_info_t; diff --git a/osfmk/mach/task_policy.h b/osfmk/mach/task_policy.h index c6852fe52..9ad6d0798 100644 --- a/osfmk/mach/task_policy.h +++ b/osfmk/mach/task_policy.h @@ -185,106 +185,81 @@ typedef struct task_qos_policy *task_qos_policy_t; #ifdef PRIVATE +/* + * Internal bitfields are privately exported for *revlocked* + * tools like msa 
to decode tracepoints and taskinfo to dump state + * + * These struct definitions *will* change in the future. + * When they do, we will update TASK_POLICY_INTERNAL_STRUCT_VERSION. + */ + +#define TASK_POLICY_INTERNAL_STRUCT_VERSION 1 + struct task_requested_policy { - /* Task and thread policy (inherited) */ - uint64_t int_darwinbg :1, /* marked as darwinbg via setpriority */ - ext_darwinbg :1, - int_iotier :2, /* IO throttle tier */ - ext_iotier :2, - int_iopassive :1, /* should IOs cause lower tiers to be throttled */ - ext_iopassive :1, - bg_iotier :2, /* what IO throttle tier should apply to me when I'm darwinbg? (pushed to threads) */ - terminated :1, /* all throttles should be removed for quick exit or SIGTERM handling */ - - /* Thread only policy */ - th_pidbind_bg :1, /* thread only: task i'm bound to is marked 'watchbg' */ - th_workq_bg :1, /* thread only: currently running a background priority workqueue */ - thrp_qos :3, /* thread only: thread qos class */ - thrp_qos_relprio :4, /* thread only: thread qos relative priority (store as inverse, -10 -> 0xA) */ - thrp_qos_override :3, /* thread only: thread qos class override */ - - /* Task only policy */ - t_apptype :3, /* What apptype did launchd tell us this was (inherited) */ - t_boosted :1, /* Has a non-zero importance assertion count */ - t_int_gpu_deny :1, /* don't allow access to GPU */ - t_ext_gpu_deny :1, - t_role :3, /* task's system role */ - t_tal_enabled :1, /* TAL mode is enabled */ - t_base_latency_qos :3, /* Timer latency QoS */ - t_over_latency_qos :3, /* Timer latency QoS override */ - t_base_through_qos :3, /* Computation throughput QoS */ - t_over_through_qos :3, /* Computation throughput QoS override */ - t_sfi_managed :1, /* SFI Managed task */ - t_qos_clamp :3, /* task qos clamp */ - - /* Task only: suppression policies (non-embedded only) */ - t_sup_active :1, /* Suppression is on */ - t_sup_lowpri_cpu :1, /* Wants low priority CPU (MAXPRI_THROTTLE) */ - t_sup_timer :3, /* Wanted 
timer throttling QoS tier */ - t_sup_disk :1, /* Wants disk throttling */ - t_sup_cpu_limit :1, /* Wants CPU limit (not hooked up yet)*/ - t_sup_suspend :1, /* Wants to be suspended */ - t_sup_throughput :3, /* Wants throughput QoS tier */ - t_sup_cpu :1, /* Wants suppressed CPU priority (MAXPRI_SUPPRESSED) */ - t_sup_bg_sockets :1, /* Wants background sockets */ - - reserved :2; + uint64_t trp_int_darwinbg :1, /* marked as darwinbg via setpriority */ + trp_ext_darwinbg :1, + trp_int_iotier :2, /* IO throttle tier */ + trp_ext_iotier :2, + trp_int_iopassive :1, /* should IOs cause lower tiers to be throttled */ + trp_ext_iopassive :1, + trp_bg_iotier :2, /* what IO throttle tier should apply to me when I'm darwinbg? (pushed to threads) */ + trp_terminated :1, /* all throttles should be removed for quick exit or SIGTERM handling */ + trp_base_latency_qos :3, /* Timer latency QoS */ + trp_base_through_qos :3, /* Computation throughput QoS */ + + trp_apptype :3, /* What apptype did launchd tell us this was (inherited) */ + trp_boosted :1, /* Has a non-zero importance assertion count */ + trp_role :3, /* task's system role */ + trp_tal_enabled :1, /* TAL mode is enabled */ + trp_over_latency_qos :3, /* Timer latency QoS override */ + trp_over_through_qos :3, /* Computation throughput QoS override */ + trp_sfi_managed :1, /* SFI Managed task */ + trp_qos_clamp :3, /* task qos clamp */ + + /* suppression policies (non-embedded only) */ + trp_sup_active :1, /* Suppression is on */ + trp_sup_lowpri_cpu :1, /* Wants low priority CPU (MAXPRI_THROTTLE) */ + trp_sup_timer :3, /* Wanted timer throttling QoS tier */ + trp_sup_disk :1, /* Wants disk throttling */ + trp_sup_throughput :3, /* Wants throughput QoS tier */ + trp_sup_cpu :1, /* Wants suppressed CPU priority (MAXPRI_SUPPRESSED) */ + trp_sup_bg_sockets :1, /* Wants background sockets */ + + trp_reserved :18; }; struct task_effective_policy { - /* Task and thread policy */ - uint64_t darwinbg :1, /* marked as 
'background', and sockets are marked bg when created */ - lowpri_cpu :1, /* cpu priority == MAXPRI_THROTTLE */ - io_tier :2, /* effective throttle tier */ - io_passive :1, /* should IOs cause lower tiers to be throttled */ - all_sockets_bg :1, /* All existing sockets in process are marked as bg (thread: all created by thread) */ - new_sockets_bg :1, /* Newly created sockets should be marked as bg */ - bg_iotier :2, /* What throttle tier should I be in when darwinbg is set? */ - terminated :1, /* all throttles have been removed for quick exit or SIGTERM handling */ - qos_ui_is_urgent :1, /* bump UI-Interactive QoS up to the urgent preemption band */ - - /* Thread only policy */ - thep_qos :3, /* thread only: thread qos class */ - thep_qos_relprio :4, /* thread only: thread qos relative priority (store as inverse, -10 -> 0xA) */ - - /* Task only policy */ - t_gpu_deny :1, /* not allowed to access GPU */ - t_tal_engaged :1, /* TAL mode is in effect */ - t_suspended :1, /* task_suspend-ed due to suppression */ - t_watchers_bg :1, /* watchers are BG-ed */ - t_latency_qos :3, /* Timer latency QoS level */ - t_through_qos :3, /* Computation throughput QoS level */ - t_sup_active :1, /* suppression behaviors are in effect */ - t_role :3, /* task's system role */ - t_suppressed_cpu :1, /* cpu priority == MAXPRI_SUPPRESSED (trumped by lowpri_cpu) */ - t_sfi_managed :1, /* SFI Managed task */ - t_live_donor :1, /* task is a live importance boost donor */ - t_qos_clamp :3, /* task qos clamp (applies to qos-disabled threads too) */ - t_qos_ceiling :3, /* task qos ceiling (applies to only qos-participating threads) */ - - reserved :23; + uint64_t tep_darwinbg :1, /* marked as 'background', and sockets are marked bg when created */ + tep_lowpri_cpu :1, /* cpu priority == MAXPRI_THROTTLE */ + tep_io_tier :2, /* effective throttle tier */ + tep_io_passive :1, /* should IOs cause lower tiers to be throttled */ + tep_all_sockets_bg :1, /* All existing sockets in process are marked as 
bg (thread: all created by thread) */ + tep_new_sockets_bg :1, /* Newly created sockets should be marked as bg */ + tep_bg_iotier :2, /* What throttle tier should I be in when darwinbg is set? */ + tep_terminated :1, /* all throttles have been removed for quick exit or SIGTERM handling */ + tep_qos_ui_is_urgent :1, /* bump UI-Interactive QoS up to the urgent preemption band */ + tep_latency_qos :3, /* Timer latency QoS level */ + tep_through_qos :3, /* Computation throughput QoS level */ + + tep_tal_engaged :1, /* TAL mode is in effect */ + tep_watchers_bg :1, /* watchers are BG-ed */ + tep_sup_active :1, /* suppression behaviors are in effect */ + tep_role :3, /* task's system role */ + tep_suppressed_cpu :1, /* cpu priority == MAXPRI_SUPPRESSED (trumped by lowpri_cpu) */ + tep_sfi_managed :1, /* SFI Managed task */ + tep_live_donor :1, /* task is a live importance boost donor */ + tep_qos_clamp :3, /* task qos clamp (applies to qos-disabled threads too) */ + tep_qos_ceiling :3, /* task qos ceiling (applies to only qos-participating threads) */ + + tep_reserved :32; }; -struct task_pended_policy { - uint64_t t_updating_policy :1, /* Busy bit for task to prevent concurrent 'complete' operations */ - - /* Task and thread policy */ - update_sockets :1, - - /* Task only policy */ - t_update_timers :1, - t_update_watchers :1, - - reserved :60; -}; - -#endif +#endif /* PRIVATE */ #ifdef MACH_KERNEL_PRIVATE extern const struct task_requested_policy default_task_requested_policy; extern const struct task_effective_policy default_task_effective_policy; -extern const struct task_pended_policy default_task_pended_policy; extern kern_return_t qos_latency_policy_validate(task_latency_qos_t); @@ -333,7 +308,8 @@ struct task_policy_state { uint32_t imp_externcnt; uint64_t flags; uint64_t imp_transitions; - uint64_t reserved[2]; + uint64_t tps_requested_policy; + uint64_t tps_effective_policy; }; typedef struct task_policy_state *task_policy_state_t; @@ -382,7 +358,7 @@ typedef 
struct task_policy_state *task_policy_state_t; /* thread requested policy */ #define POLICY_REQ_PIDBIND_BG 0x00000400 -#define POLICY_REQ_WORKQ_BG 0x00000800 +#define POLICY_REQ_WORKQ_BG 0x00000800 /* deprecated */ #define POLICY_REQ_TH_QOS_MASK 0x07000000 /* 3 bits (overlaps with ROLE) */ #define POLICY_REQ_TH_QOS_SHIFT 24 #define POLICY_REQ_TH_QOS_OVER_MASK 0x70000000 /* 3 bits (overlaps with TAL and SFI) */ diff --git a/osfmk/mach/task_special_ports.h b/osfmk/mach/task_special_ports.h index 635c9134a..66fd7ed05 100644 --- a/osfmk/mach/task_special_ports.h +++ b/osfmk/mach/task_special_ports.h @@ -88,7 +88,9 @@ typedef int task_special_port_t; #define TASK_ACCESS_PORT 9 /* Permission check for task_for_pid. */ -#define TASK_DEBUG_CONTROL_PORT 10 /* debug control port */ +#define TASK_DEBUG_CONTROL_PORT 10 /* debug control port */ + +#define TASK_RESOURCE_NOTIFY_PORT 11 /* overrides host special RN port */ /* * Definitions for ease of use diff --git a/osfmk/mach/thread_policy.h b/osfmk/mach/thread_policy.h index 252714389..0badb50ac 100644 --- a/osfmk/mach/thread_policy.h +++ b/osfmk/mach/thread_policy.h @@ -225,6 +225,8 @@ struct thread_background_policy { integer_t priority; }; +#define THREAD_BACKGROUND_POLICY_DARWIN_BG 0x1000 + typedef struct thread_background_policy thread_background_policy_data_t; typedef struct thread_background_policy *thread_background_policy_t; @@ -272,7 +274,13 @@ struct thread_policy_state { integer_t effective; integer_t pending; integer_t flags; - integer_t reserved[12]; + uint64_t thps_requested_policy; + uint64_t thps_effective_policy; + uint32_t thps_user_promotions; + uint32_t thps_user_promotion_basepri; + uint32_t thps_ipc_overrides; + uint32_t reserved32; + uint64_t reserved[2]; }; typedef struct thread_policy_state thread_policy_state_data_t; @@ -339,13 +347,18 @@ typedef struct thread_policy_state *thread_policy_state_t; * issue a reset-all in its outermost scope before deciding whether it * should return to dequeueing work 
from the global concurrent queues, * or return to the kernel. + * + * THREAD_QOS_OVERRIDE_TYPE_WILDCARD is a catch-all which will reset every + * resource matching the resource value. Passing + * THREAD_QOS_OVERRIDE_RESOURCE_WILDCARD as well will reset everything. */ -#define THREAD_QOS_OVERRIDE_TYPE_UNKNOWN (0) -#define THREAD_QOS_OVERRIDE_TYPE_PTHREAD_MUTEX (1) -#define THREAD_QOS_OVERRIDE_TYPE_PTHREAD_RWLOCK (2) -#define THREAD_QOS_OVERRIDE_TYPE_PTHREAD_EXPLICIT_OVERRIDE (3) -#define THREAD_QOS_OVERRIDE_TYPE_DISPATCH_ASYNCHRONOUS_OVERRIDE (4) +#define THREAD_QOS_OVERRIDE_TYPE_UNKNOWN (0) +#define THREAD_QOS_OVERRIDE_TYPE_PTHREAD_MUTEX (1) +#define THREAD_QOS_OVERRIDE_TYPE_PTHREAD_RWLOCK (2) +#define THREAD_QOS_OVERRIDE_TYPE_PTHREAD_EXPLICIT_OVERRIDE (3) +#define THREAD_QOS_OVERRIDE_TYPE_DISPATCH_ASYNCHRONOUS_OVERRIDE (4) +#define THREAD_QOS_OVERRIDE_TYPE_WILDCARD (5) /* A special resource value to indicate a resource wildcard */ #define THREAD_QOS_OVERRIDE_RESOURCE_WILDCARD (~((user_addr_t)0)) @@ -363,4 +376,57 @@ typedef struct thread_qos_policy *thread_qos_policy_t; #endif /* PRIVATE */ +#ifdef PRIVATE + +/* + * Internal bitfields are privately exported for revlocked tracing tools like msa to decode tracepoints. + * + * These struct definitions *will* change in the future. + * When they do, we will update THREAD_POLICY_INTERNAL_STRUCT_VERSION. 
+ */ + +#define THREAD_POLICY_INTERNAL_STRUCT_VERSION 4 + +struct thread_requested_policy { + uint64_t thrp_int_darwinbg :1, /* marked as darwinbg via setpriority */ + thrp_ext_darwinbg :1, + thrp_int_iotier :2, /* IO throttle tier */ + thrp_ext_iotier :2, + thrp_int_iopassive :1, /* should IOs cause lower tiers to be throttled */ + thrp_ext_iopassive :1, + thrp_latency_qos :3, /* Timer latency QoS */ + thrp_through_qos :3, /* Computation throughput QoS */ + + thrp_pidbind_bg :1, /* task i'm bound to is marked 'watchbg' */ + thrp_qos :3, /* thread qos class */ + thrp_qos_relprio :4, /* thread qos relative priority (store as inverse, -10 -> 0xA) */ + thrp_qos_override :3, /* thread qos class override */ + thrp_qos_promote :3, /* thread qos class from promotion */ + thrp_qos_ipc_override :3, /* thread qos class from ipc override */ + thrp_terminated :1, /* heading for termination */ + + thrp_reserved :32; +}; + +struct thread_effective_policy { + uint64_t thep_darwinbg :1, /* marked as 'background', and sockets are marked bg when created */ + thep_io_tier :2, /* effective throttle tier */ + thep_io_passive :1, /* should IOs cause lower tiers to be throttled */ + thep_all_sockets_bg :1, /* All existing sockets in process are marked as bg (thread: all created by thread) */ + thep_new_sockets_bg :1, /* Newly created sockets should be marked as bg */ + thep_terminated :1, /* all throttles have been removed for quick exit or SIGTERM handling */ + thep_qos_ui_is_urgent :1, /* bump UI-Interactive QoS up to the urgent preemption band */ + thep_latency_qos :3, /* Timer latency QoS level */ + thep_through_qos :3, /* Computation throughput QoS level */ + + thep_qos :3, /* thread qos class */ + thep_qos_relprio :4, /* thread qos relative priority (store as inverse, -10 -> 0xA) */ + thep_qos_promote :3, /* thread qos class used for promotion */ + + thep_reserved :40; +}; + +#endif /* PRIVATE */ + + #endif /* _MACH_THREAD_POLICY_H_ */ diff --git a/osfmk/mach/vm32_map.defs 
b/osfmk/mach/vm32_map.defs index d28be8de8..04e7c0ad5 100644 --- a/osfmk/mach/vm32_map.defs +++ b/osfmk/mach/vm32_map.defs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -73,7 +73,7 @@ serverprefix vm32_; #include #include -#if VM32_SUPPORT +#ifdef VM32_SUPPORT /* See vm_map.defs for more information */ diff --git a/osfmk/mach/vm_map.defs b/osfmk/mach/vm_map.defs index 8ab1fcc4e..54f85c7e5 100644 --- a/osfmk/mach/vm_map.defs +++ b/osfmk/mach/vm_map.defs @@ -489,7 +489,7 @@ skip; /* was vm_upl_unmap */ * specified. See the routine implementation for a complete * definition of the routine. */ -routine vm_purgable_control( +routine PREFIX(vm_purgable_control) ( target_task : vm_map_t; address : vm_address_t; control : vm_purgable_t; diff --git a/osfmk/mach/vm_param.h b/osfmk/mach/vm_param.h index 1fa361488..b76a10b21 100644 --- a/osfmk/mach/vm_param.h +++ b/osfmk/mach/vm_param.h @@ -75,6 +75,9 @@ #include #endif /* ASSEMBLER */ +#include +#include + /* * The machine independent pages are refered to as PAGES. A page * is some number of hardware pages, depending on the target machine. 
@@ -119,6 +122,18 @@ #define mach_vm_round_page(x) (((mach_vm_offset_t)(x) + PAGE_MASK) & ~((signed)PAGE_MASK)) #define mach_vm_trunc_page(x) ((mach_vm_offset_t)(x) & ~((signed)PAGE_MASK)) +#define round_page_overflow(in, out) __os_warn_unused(({ \ + bool __ovr = os_add_overflow(in, (__typeof__(*out))PAGE_MASK, out); \ + *out &= ~((__typeof__(*out))PAGE_MASK); \ + __ovr; \ + })) + +static inline int OS_WARN_RESULT +mach_vm_round_page_overflow(mach_vm_offset_t in, mach_vm_offset_t *out) +{ + return round_page_overflow(in, out); +} + #define memory_object_round_page(x) (((memory_object_offset_t)(x) + PAGE_MASK) & ~((signed)PAGE_MASK)) #define memory_object_trunc_page(x) ((memory_object_offset_t)(x) & ~((signed)PAGE_MASK)) @@ -240,41 +255,21 @@ extern addr64_t vm_last_addr; /* Highest kernel virtual address known to the VM extern const vm_offset_t vm_min_kernel_address; extern const vm_offset_t vm_max_kernel_address; -extern vm_offset_t vm_kernel_stext; -extern vm_offset_t vm_kernel_etext; -extern vm_offset_t vm_kernel_base; -extern vm_offset_t vm_kernel_top; +extern vm_offset_t vm_kernel_stext; +extern vm_offset_t vm_kernel_etext; +extern vm_offset_t vm_kernel_slid_base; +extern vm_offset_t vm_kernel_slid_top; extern vm_offset_t vm_kernel_slide; -extern vm_offset_t vm_hib_base; extern vm_offset_t vm_kernel_addrperm; - extern vm_offset_t vm_kext_base; extern vm_offset_t vm_kext_top; -extern vm_offset_t vm_prelink_stext; -extern vm_offset_t vm_prelink_etext; -extern vm_offset_t vm_prelink_sinfo; -extern vm_offset_t vm_prelink_einfo; -extern vm_offset_t vm_slinkedit; -extern vm_offset_t vm_elinkedit; +extern vm_offset_t vm_kernel_base; +extern vm_offset_t vm_kernel_top; +extern vm_offset_t vm_hib_base; #define VM_KERNEL_IS_SLID(_o) \ - (((vm_offset_t)(_o) >= vm_kernel_base) && \ - ((vm_offset_t)(_o) <= vm_kernel_top)) -#define VM_KERNEL_IS_KEXT(_o) \ - (((vm_offset_t)(_o) >= vm_kext_base) && \ - ((vm_offset_t)(_o) < vm_kext_top)) - -#define VM_KERNEL_IS_PRELINKTEXT(_o) 
\ - (((vm_offset_t)(_o) >= vm_prelink_stext) && \ - ((vm_offset_t)(_o) < vm_prelink_etext)) - -#define VM_KERNEL_IS_PRELINKINFO(_o) \ - (((vm_offset_t)(_o) >= vm_prelink_sinfo) && \ - ((vm_offset_t)(_o) < vm_prelink_einfo)) - -#define VM_KERNEL_IS_KEXT_LINKEDIT(_o) \ - (((vm_offset_t)(_o) >= vm_slinkedit) && \ - ((vm_offset_t)(_o) < vm_elinkedit)) + (((vm_offset_t)(_o) >= vm_kernel_slid_base) && \ + ((vm_offset_t)(_o) < vm_kernel_slid_top)) #define VM_KERNEL_SLIDE(_u) \ ((vm_offset_t)(_u) + vm_kernel_slide) @@ -314,13 +309,9 @@ extern vm_offset_t vm_elinkedit; * * Nesting of these macros should be considered invalid. */ -#define VM_KERNEL_UNSLIDE(_v) \ - ((VM_KERNEL_IS_SLID(_v) || \ - VM_KERNEL_IS_KEXT(_v) || \ - VM_KERNEL_IS_PRELINKTEXT(_v) || \ - VM_KERNEL_IS_PRELINKINFO(_v) || \ - VM_KERNEL_IS_KEXT_LINKEDIT(_v)) ? \ - (vm_offset_t)(_v) - vm_kernel_slide : \ +#define VM_KERNEL_UNSLIDE(_v) \ + ((VM_KERNEL_IS_SLID(_v)) ? \ + (vm_offset_t)(_v) - vm_kernel_slide : \ (vm_offset_t)(_v)) #define VM_KERNEL_ADDRPERM(_v) \ @@ -329,11 +320,7 @@ extern vm_offset_t vm_elinkedit; (vm_offset_t)(_v) + vm_kernel_addrperm) #define VM_KERNEL_UNSLIDE_OR_PERM(_v) \ - ((VM_KERNEL_IS_SLID(_v) || \ - VM_KERNEL_IS_KEXT(_v) || \ - VM_KERNEL_IS_PRELINKTEXT(_v) || \ - VM_KERNEL_IS_PRELINKINFO(_v) || \ - VM_KERNEL_IS_KEXT_LINKEDIT(_v)) ? \ + ((VM_KERNEL_IS_SLID(_v)) ? \ (vm_offset_t)(_v) - vm_kernel_slide : \ ((vm_offset_t)(_v) >= VM_MIN_KERNEL_AND_KEXT_ADDRESS ? VM_KERNEL_ADDRPERM(_v) : (vm_offset_t)(_v))) diff --git a/osfmk/mach/vm_prot.h b/osfmk/mach/vm_prot.h index 039390c26..0d4d5bf3e 100644 --- a/osfmk/mach/vm_prot.h +++ b/osfmk/mach/vm_prot.h @@ -145,6 +145,18 @@ typedef int vm_prot_t; */ #define VM_PROT_IS_MASK ((vm_prot_t) 0x40) +/* + * Another invalid protection value to support execute-only protection. + * VM_PROT_STRIP_READ is a special marker that tells mprotect to not + * set VM_PROT_READ. 
We have to do it this way because existing code + * expects the system to set VM_PROT_READ if VM_PROT_EXECUTE is set. + * VM_PROT_EXECUTE_ONLY is just a convenience value to indicate that + * the memory should be executable and explicitly not readable. It will + * be ignored on platforms that do not support this type of protection. + */ +#define VM_PROT_STRIP_READ ((vm_prot_t) 0x80) +#define VM_PROT_EXECUTE_ONLY (VM_PROT_EXECUTE|VM_PROT_STRIP_READ) + #define VM_PROT_MEMORY_TAG_MASK 0xFF000000 #define VM_PROT_MEMORY_TAG_SHIFT 24 diff --git a/osfmk/mach/vm_region.h b/osfmk/mach/vm_region.h index 729c7d81f..22744725a 100644 --- a/osfmk/mach/vm_region.h +++ b/osfmk/mach/vm_region.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -134,7 +134,7 @@ typedef struct vm_region_basic_info vm_region_basic_info_data_t; * back. */ -#if MACH_KERNEL_PRIVATE +#ifdef MACH_KERNEL_PRIVATE #define VM_REGION_EXTENDED_INFO__legacy 11 struct vm_region_extended_info__legacy { vm_prot_t protection; @@ -326,7 +326,7 @@ struct vm_read_entry { vm_size_t size; }; -#if VM32_SUPPORT +#ifdef VM32_SUPPORT struct vm32_read_entry { vm32_address_t address; vm32_size_t size; @@ -338,7 +338,7 @@ struct vm32_read_entry { typedef struct mach_vm_read_entry mach_vm_read_entry_t[VM_MAP_ENTRY_MAX]; typedef struct vm_read_entry vm_read_entry_t[VM_MAP_ENTRY_MAX]; -#if VM32_SUPPORT +#ifdef VM32_SUPPORT typedef struct vm32_read_entry vm32_read_entry_t[VM_MAP_ENTRY_MAX]; #endif diff --git a/osfmk/mach/vm_statistics.h b/osfmk/mach/vm_statistics.h index bd74e1ef6..eac764903 100644 --- a/osfmk/mach/vm_statistics.h +++ b/osfmk/mach/vm_statistics.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -295,10 +295,12 @@ typedef struct pmap_statistics *pmap_statistics_t; #define VM_FLAGS_PURGABLE 0x0002 #ifdef KERNEL_PRIVATE #endif /* KERNEL_PRIVATE */ +#define VM_FLAGS_RANDOM_ADDR 0x0008 #define VM_FLAGS_NO_CACHE 0x0010 #define VM_FLAGS_RESILIENT_CODESIGN 0x0020 #define VM_FLAGS_RESILIENT_MEDIA 0x0040 #ifdef KERNEL_PRIVATE +#define VM_FLAGS_ATOMIC_ENTRY 0x0080 #define VM_FLAGS_PERMANENT 0x0100 /* mapping can NEVER be unmapped */ #define VM_FLAGS_GUARD_AFTER 0x0200 /* guard page after the mapping */ #define VM_FLAGS_GUARD_BEFORE 0x0400 /* guard page before the mapping */ @@ -336,6 +338,7 @@ typedef struct pmap_statistics *pmap_statistics_t; #define VM_FLAGS_USER_ALLOCATE (VM_FLAGS_FIXED | \ VM_FLAGS_ANYWHERE | \ VM_FLAGS_PURGABLE | \ + VM_FLAGS_RANDOM_ADDR | \ VM_FLAGS_NO_CACHE | \ VM_FLAGS_OVERWRITE | \ VM_FLAGS_SUPERPAGE_MASK | \ @@ -345,6 +348,7 @@ typedef struct pmap_statistics *pmap_statistics_t; VM_FLAGS_RETURN_DATA_ADDR) #define VM_FLAGS_USER_REMAP (VM_FLAGS_FIXED | \ VM_FLAGS_ANYWHERE | \ + VM_FLAGS_RANDOM_ADDR | \ VM_FLAGS_OVERWRITE| \ VM_FLAGS_RETURN_DATA_ADDR |\ VM_FLAGS_RESILIENT_CODESIGN) @@ -413,6 +417,9 @@ typedef struct pmap_statistics *pmap_statistics_t; /* Window backing stores, custom shadow data, and compressed backing stores */ #define VM_MEMORY_COREGRAPHICS_BACKINGSTORES 57 +/* x-alloc'd memory */ +#define VM_MEMORY_COREGRAPHICS_XALLOC 58 + /* catch-all for other uses, such as the read-only shared data page */ #define VM_MEMORY_COREGRAPHICS_MISC VM_MEMORY_COREGRAPHICS @@ -478,6 +485,21 @@ typedef struct pmap_statistics *pmap_statistics_t; /* Apple System Logger (ASL) messages */ #define VM_MEMORY_ASL 81 +/* Swift runtime */ +#define VM_MEMORY_SWIFT_RUNTIME 82 + +/* Swift metadata */ +#define VM_MEMORY_SWIFT_METADATA 83 + +/* DHMM data */ +#define VM_MEMORY_DHMM 84 + +/* memory allocated by SceneKit.framework */ +#define VM_MEMORY_SCENEKIT 86 + +/* memory allocated by skywalk networking */ 
+#define VM_MEMORY_SKYWALK 87 + /* Reserve 240-255 for application */ #define VM_MEMORY_APPLICATION_SPECIFIC_1 240 #define VM_MEMORY_APPLICATION_SPECIFIC_16 255 @@ -489,6 +511,7 @@ typedef struct pmap_statistics *pmap_statistics_t; #if KERNEL_PRIVATE /* kernel map tags */ +/* please add new definition strings to zprint */ #define VM_KERN_MEMORY_NONE 0 @@ -516,8 +539,11 @@ typedef struct pmap_statistics *pmap_statistics_t; #define VM_KERN_MEMORY_UBC 22 #define VM_KERN_MEMORY_SECURITY 23 #define VM_KERN_MEMORY_MLOCK 24 -// -#define VM_KERN_MEMORY_FIRST_DYNAMIC 25 +#define VM_KERN_MEMORY_REASON 25 +#define VM_KERN_MEMORY_SKYWALK 26 +#define VM_KERN_MEMORY_LTABLE 27 + +#define VM_KERN_MEMORY_FIRST_DYNAMIC 28 /* out of tags: */ #define VM_KERN_MEMORY_ANY 255 #define VM_KERN_MEMORY_COUNT 256 diff --git a/osfmk/mach_debug/Makefile b/osfmk/mach_debug/Makefile index d0cbf88d6..3aed683df 100644 --- a/osfmk/mach_debug/Makefile +++ b/osfmk/mach_debug/Makefile @@ -24,5 +24,3 @@ EXPORT_MI_DIR = mach_debug include $(MakeInc_rule) include $(MakeInc_dir) - - diff --git a/osfmk/mach_debug/zone_info.h b/osfmk/mach_debug/zone_info.h index facfe2a4f..937b594ab 100644 --- a/osfmk/mach_debug/zone_info.h +++ b/osfmk/mach_debug/zone_info.h @@ -140,7 +140,8 @@ typedef struct mach_memory_info { uint64_t size; uint64_t free; uint64_t largest; - uint64_t _resv[3]; + uint64_t collectable_bytes; + uint64_t _resv[2]; } mach_memory_info_t; typedef mach_memory_info_t *mach_memory_info_array_t; diff --git a/osfmk/machine/Makefile b/osfmk/machine/Makefile index add08c56c..669d20515 100644 --- a/osfmk/machine/Makefile +++ b/osfmk/machine/Makefile @@ -3,7 +3,6 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - include $(MakeInc_cmd) include $(MakeInc_def) @@ -17,9 +16,9 @@ KERNELFILES = \ io_map_entries.h \ lock.h \ locks.h \ - machine_cpuid.h \ + machine_cpuid.h \ machine_routines.h \ - 
machine_kpc.h \ + machine_kpc.h \ pal_routines.h \ pal_hibernate.h \ simple_lock.h @@ -34,5 +33,3 @@ EXPORT_MI_DIR = machine include $(MakeInc_rule) include $(MakeInc_dir) - - diff --git a/osfmk/machine/lowglobals.h b/osfmk/machine/lowglobals.h new file mode 100644 index 000000000..319b758b4 --- /dev/null +++ b/osfmk/machine/lowglobals.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#ifndef _MACHINE_LOWGLOBALS_H +#define _MACHINE_LOWGLOBALS_H + +#if defined (__x86_64__) +#include "x86_64/lowglobals.h" +#else +#error architecture not supported +#endif + +#endif /* _MACHINE_LOWGLOBALS_H */ diff --git a/osfmk/prng/Makefile b/osfmk/prng/Makefile index f795dae64..d2234ec25 100644 --- a/osfmk/prng/Makefile +++ b/osfmk/prng/Makefile @@ -3,11 +3,10 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - include $(MakeInc_cmd) include $(MakeInc_def) -EXPORT_ONLY_FILES = \ +EXPORT_ONLY_FILES = \ random.h EXPORT_MI_DIR = prng @@ -24,5 +23,3 @@ INSTALL_KF_MI_LIST = $(empty) include $(MakeInc_rule) include $(MakeInc_dir) - - diff --git a/osfmk/prng/YarrowCoreLib/port/smf.c b/osfmk/prng/YarrowCoreLib/port/smf.c index 83025e57a..5cb4a3664 100644 --- a/osfmk/prng/YarrowCoreLib/port/smf.c +++ b/osfmk/prng/YarrowCoreLib/port/smf.c @@ -39,11 +39,6 @@ /* Shim emulating _MALLOC */ -struct _mhead { - size_t mlen; - char dat[0]; -}; - SMFAPI void mmInit( void ) { return; @@ -51,25 +46,18 @@ SMFAPI void mmInit( void ) SMFAPI MMPTR mmMalloc(DWORD request) { - struct _mhead *hdr = NULL; - size_t memsize = sizeof (*hdr) + request; - - hdr = (void *) kalloc(memsize); - if (hdr == NULL) + void *addr; + + addr = (void *) kalloc(request); + if (addr == NULL) return NULL; - hdr->mlen = memsize; - return (MMPTR) hdr->dat; + return (MMPTR) addr; } SMFAPI void mmFree(MMPTR ptrnum) { - // get the size of the pointer back - struct _mhead *hdr; - - hdr = ptrnum; - hdr--; - kfree(hdr, hdr->mlen); + kfree_addr(ptrnum); } SMFAPI LPVOID mmGetPtr(MMPTR ptrnum) diff --git a/osfmk/prng/YarrowCoreLib/src/prng.c b/osfmk/prng/YarrowCoreLib/src/prng.c index 754935bc6..5c1d6ad40 100644 --- a/osfmk/prng/YarrowCoreLib/src/prng.c +++ b/osfmk/prng/YarrowCoreLib/src/prng.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1999, 2000-2013 Apple Inc. 
All rights reserved. + * Copyright (c) 1999, 2000-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -490,8 +490,6 @@ prngStretch(BYTE *inbuf,UINT inbuflen,BYTE *outbuf,UINT outbuflen) { return PRNG_SUCCESS; } - - return PRNG_ERR_PROGRAM_FLOW; } @@ -585,8 +583,6 @@ prngAllowReseed(PRNG *p, LONGLONG ticks) return prngForceReseed(p, ticks); else return PRNG_ERR_NOT_ENOUGH_ENTROPY; - - return PRNG_ERR_PROGRAM_FLOW; } #if SLOW_POLL_ENABLE diff --git a/osfmk/prng/fips_sha1.c b/osfmk/prng/fips_sha1.c index ccf0d72bb..93a006804 100644 --- a/osfmk/prng/fips_sha1.c +++ b/osfmk/prng/fips_sha1.c @@ -87,7 +87,7 @@ typedef int Boolean; */ #if (defined(__i386__) || defined(__x86_64__)) && defined(__GNUC__) #define FETCH_32(p) ({ \ - register u_int32_t l = (u_int32_t)*((const u_int32_t *)(p)); \ + u_int32_t l = (u_int32_t)*((const u_int32_t *)(p)); \ __asm__ __volatile__("bswap %0" : "=r" (l) : "0" (l)); \ l; \ }) @@ -267,8 +267,8 @@ SHA1Transform(u_int32_t a, u_int32_t b, u_int32_t c, u_int32_t d, u_int32_t e, const u_int8_t block[64], SHA1_CTX *context) { /* Register (instead of array) is a win in most cases */ - register u_int32_t w0, w1, w2, w3, w4, w5, w6, w7; - register u_int32_t w8, w9, w10, w11, w12, w13, w14, w15; + u_int32_t w0, w1, w2, w3, w4, w5, w6, w7; + u_int32_t w8, w9, w10, w11, w12, w13, w14, w15; w15 = FETCH_32(block + 60); w14 = FETCH_32(block + 56); diff --git a/osfmk/prng/random.h b/osfmk/prng/random.h index 60a46b70a..2f721e288 100644 --- a/osfmk/prng/random.h +++ b/osfmk/prng/random.h @@ -50,7 +50,7 @@ extern entropy_data_t EntropyData; /* * Early_random implementation params: */ #define EARLY_RANDOM_SEED_SIZE (16) -#define EARLY_RANDOM_STATE_STATIC_SIZE (256) +#define EARLY_RANDOM_STATE_STATIC_SIZE (264) #if defined (__x86_64__) #define current_prng_context() (current_cpu_datap()->cpu_prng) diff --git a/osfmk/profiling/Makefile b/osfmk/profiling/Makefile index b3bbc168d..b7dc25208 100644 --- a/osfmk/profiling/Makefile 
+++ b/osfmk/profiling/Makefile @@ -3,7 +3,6 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - include $(MakeInc_cmd) include $(MakeInc_def) @@ -54,5 +53,3 @@ EXPORT_MI_DIR = profile include $(MakeInc_rule) include $(MakeInc_dir) - - diff --git a/osfmk/profiling/i386/Makefile b/osfmk/profiling/i386/Makefile index 14a36756a..1253a004b 100644 --- a/osfmk/profiling/i386/Makefile +++ b/osfmk/profiling/i386/Makefile @@ -3,7 +3,6 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - include $(MakeInc_cmd) include $(MakeInc_def) @@ -20,5 +19,3 @@ EXPORT_MD_DIR = profile/i386 include $(MakeInc_rule) include $(MakeInc_dir) - - diff --git a/osfmk/profiling/machine/Makefile b/osfmk/profiling/machine/Makefile index e881a130b..3ee985875 100644 --- a/osfmk/profiling/machine/Makefile +++ b/osfmk/profiling/machine/Makefile @@ -3,11 +3,9 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - include $(MakeInc_cmd) include $(MakeInc_def) - DATAFILES = \ profile-md.h @@ -21,5 +19,3 @@ EXPORT_MI_DIR = profile/machine include $(MakeInc_rule) include $(MakeInc_dir) - - diff --git a/osfmk/vm/Makefile b/osfmk/vm/Makefile index dbdfa765e..0453c363e 100644 --- a/osfmk/vm/Makefile +++ b/osfmk/vm/Makefile @@ -17,6 +17,7 @@ EXPORT_ONLY_FILES = \ vm_pageout.h \ vm_protos.h \ vm_shared_region.h \ + vm_compressor_algorithms.h \ WKdm_new.h INSTALL_MI_LIST = ${DATAFILES} @@ -29,5 +30,3 @@ EXPORT_MI_DIR = vm include $(MakeInc_rule) include $(MakeInc_dir) - - diff --git a/osfmk/vm/WKdm_new.h b/osfmk/vm/WKdm_new.h index 6a11289d4..3713b2d8b 100644 --- a/osfmk/vm/WKdm_new.h +++ b/osfmk/vm/WKdm_new.h @@ -69,7 +69,7 @@ extern "C" { #include -#define WKdm_SCRATCH_BUF_SIZE PAGE_SIZE +#define 
WKdm_SCRATCH_BUF_SIZE_INTERNAL PAGE_SIZE typedef unsigned int WK_word; diff --git a/osfmk/vm/bsd_vm.c b/osfmk/vm/bsd_vm.c index 2dbe896b4..78e39a753 100644 --- a/osfmk/vm/bsd_vm.c +++ b/osfmk/vm/bsd_vm.c @@ -42,13 +42,11 @@ #include #include #include +#include #include #include -#include -#include - #include #include #include @@ -123,9 +121,6 @@ typedef struct vnode_pager { #define pager_ikot pager_header.io_bits -ipc_port_t -trigger_name_to_port( /* forward */ - mach_port_t); kern_return_t vnode_pager_cluster_read( /* forward */ @@ -153,6 +148,10 @@ vnode_pager_t vnode_pager_lookup( /* forward */ memory_object_t); +struct vnode * +vnode_pager_lookup_vnode( /* forward */ + memory_object_t); + zone_t vnode_pager_zone; @@ -183,175 +182,6 @@ extern int proc_resetpcontrol(int); extern unsigned long vm_cs_validated_resets; #endif -/* - * Routine: mach_macx_triggers - * Function: - * Syscall interface to set the call backs for low and - * high water marks. - */ -int -mach_macx_triggers( - struct macx_triggers_args *args) -{ - int hi_water = args->hi_water; - int low_water = args->low_water; - int flags = args->flags; - mach_port_t trigger_name = args->alert_port; - kern_return_t kr; - memory_object_default_t default_pager; - ipc_port_t trigger_port; - - default_pager = MEMORY_OBJECT_DEFAULT_NULL; - kr = host_default_memory_manager(host_priv_self(), - &default_pager, 0); - if(kr != KERN_SUCCESS) { - return EINVAL; - } - - if (((flags & SWAP_ENCRYPT_ON) && (flags & SWAP_ENCRYPT_OFF)) || - ((flags & SWAP_COMPACT_ENABLE) && (flags & SWAP_COMPACT_DISABLE))) { - /* can't have it both ways */ - return EINVAL; - } - - if (default_pager_init_flag == 0) { - start_def_pager(NULL); - default_pager_init_flag = 1; - } - - if (flags & SWAP_ENCRYPT_ON) { - /* ENCRYPTED SWAP: tell default_pager to encrypt */ - default_pager_triggers(default_pager, - 0, 0, - SWAP_ENCRYPT_ON, - IP_NULL); - } else if (flags & SWAP_ENCRYPT_OFF) { - /* ENCRYPTED SWAP: tell default_pager not to encrypt */ - 
default_pager_triggers(default_pager, - 0, 0, - SWAP_ENCRYPT_OFF, - IP_NULL); - } - - if (flags & USE_EMERGENCY_SWAP_FILE_FIRST) { - /* - * Time to switch to the emergency segment. - */ - return default_pager_triggers(default_pager, - 0, 0, - USE_EMERGENCY_SWAP_FILE_FIRST, - IP_NULL); - } - - if (flags & SWAP_FILE_CREATION_ERROR) { - /* - * For some reason, the dynamic pager failed to create a swap file. - */ - trigger_port = trigger_name_to_port(trigger_name); - if(trigger_port == NULL) { - return EINVAL; - } - /* trigger_port is locked and active */ - ipc_port_make_send_locked(trigger_port); - ip_unlock(trigger_port); - default_pager_triggers(default_pager, - 0, 0, - SWAP_FILE_CREATION_ERROR, - trigger_port); - } - - if (flags & HI_WAT_ALERT) { - trigger_port = trigger_name_to_port(trigger_name); - if(trigger_port == NULL) { - return EINVAL; - } - /* trigger_port is locked and active */ - ipc_port_make_send_locked(trigger_port); - ip_unlock(trigger_port); - default_pager_triggers(default_pager, - hi_water, low_water, - HI_WAT_ALERT, trigger_port); - } - - if (flags & LO_WAT_ALERT) { - trigger_port = trigger_name_to_port(trigger_name); - if(trigger_port == NULL) { - return EINVAL; - } - /* trigger_port is locked and active */ - ipc_port_make_send_locked(trigger_port); - ip_unlock(trigger_port); - default_pager_triggers(default_pager, - hi_water, low_water, - LO_WAT_ALERT, trigger_port); - } - - - if (flags & PROC_RESUME) { - - /* - * For this call, hi_water is used to pass in the pid of the process we want to resume - * or unthrottle. This is of course restricted to the superuser (checked inside of - * proc_resetpcontrol). - */ - - return proc_resetpcontrol(hi_water); - } - - /* - * Set thread scheduling priority and policy for the current thread - * it is assumed for the time being that the thread setting the alert - * is the same one which will be servicing it. 
- * - * XXX This does not belong in the kernel XXX - */ - if (flags & HI_WAT_ALERT) { - thread_precedence_policy_data_t pre; - thread_extended_policy_data_t ext; - - ext.timeshare = FALSE; - pre.importance = INT32_MAX; - - thread_policy_set(current_thread(), - THREAD_EXTENDED_POLICY, - (thread_policy_t)&ext, - THREAD_EXTENDED_POLICY_COUNT); - - thread_policy_set(current_thread(), - THREAD_PRECEDENCE_POLICY, - (thread_policy_t)&pre, - THREAD_PRECEDENCE_POLICY_COUNT); - - current_thread()->options |= TH_OPT_VMPRIV; - } - - if (flags & (SWAP_COMPACT_DISABLE | SWAP_COMPACT_ENABLE)) { - return macx_backing_store_compaction(flags & (SWAP_COMPACT_DISABLE | SWAP_COMPACT_ENABLE)); - } - - return 0; -} - -/* - * - */ -ipc_port_t -trigger_name_to_port( - mach_port_t trigger_name) -{ - ipc_port_t trigger_port; - ipc_space_t space; - - if (trigger_name == 0) - return (NULL); - - space = current_space(); - if(ipc_port_translate_receive(space, CAST_MACH_PORT_TO_NAME(trigger_name), - &trigger_port) != KERN_SUCCESS) - return (NULL); - return trigger_port; -} - extern int uiomove64(addr64_t, int, void *); #define MAX_RUN 32 @@ -424,11 +254,9 @@ memory_object_control_uiomove( PAGE_SLEEP(object, dst_page, THREAD_UNINT); continue; } - if (dst_page->laundry) { - dst_page->pageout = FALSE; - + if (dst_page->laundry) vm_pageout_steal_laundry(dst_page, FALSE); - } + /* * this routine is only called when copying * to/from real files... 
no need to consider @@ -451,7 +279,7 @@ memory_object_control_uiomove( #if DEVELOPMENT || DEBUG vm_cs_validated_resets++; #endif - pmap_disconnect(dst_page->phys_page); + pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page)); } } dst_page->busy = TRUE; @@ -477,7 +305,7 @@ memory_object_control_uiomove( if ((xsize = PAGE_SIZE - start_offset) > io_requested) xsize = io_requested; - if ( (retval = uiomove64((addr64_t)(((addr64_t)(dst_page->phys_page) << PAGE_SHIFT) + start_offset), xsize, uio)) ) + if ( (retval = uiomove64((addr64_t)(((addr64_t)(VM_PAGE_GET_PHYS_PAGE(dst_page)) << PAGE_SHIFT) + start_offset), xsize, uio)) ) break; io_requested -= xsize; @@ -521,8 +349,9 @@ memory_object_control_uiomove( } orig_offset = 0; } + if (object->pager) + task_update_logical_writes(current_task(), (dirty_count * PAGE_SIZE), TASK_WRITE_DEFERRED, vnode_pager_lookup_vnode(object->pager)); vm_object_unlock(object); - task_update_logical_writes(current_task(), (dirty_count * PAGE_SIZE), TASK_WRITE_DEFERRED); return (retval); } @@ -533,7 +362,7 @@ memory_object_control_uiomove( void vnode_pager_bootstrap(void) { - register vm_size_t size; + vm_size_t size; size = (vm_size_t) sizeof(struct vnode_pager); vnode_pager_zone = zinit(size, (vm_size_t) MAX_VNODE*size, @@ -623,7 +452,7 @@ vnode_pager_data_return( __unused boolean_t kernel_copy, int upl_flags) { - register vnode_pager_t vnode_object; + vnode_pager_t vnode_object; vnode_object = vnode_pager_lookup(mem_obj); @@ -764,24 +593,6 @@ vnode_pager_get_object_mtime( cs_mtime); } -kern_return_t -vnode_pager_get_object_cs_blobs( - memory_object_t mem_obj, - void **blobs) -{ - vnode_pager_t vnode_object; - - if (mem_obj == MEMORY_OBJECT_NULL || - mem_obj->mo_pager_ops != &vnode_pager_ops) { - return KERN_INVALID_ARGUMENT; - } - - vnode_object = vnode_pager_lookup(mem_obj); - - return vnode_pager_get_cs_blobs(vnode_object->vnode_handle, - blobs); -} - #if CHECK_CS_VALIDATION_BITMAP kern_return_t vnode_pager_cs_check_validation_bitmap( @@ -838,7 
+649,7 @@ void vnode_pager_reference( memory_object_t mem_obj) { - register vnode_pager_t vnode_object; + vnode_pager_t vnode_object; unsigned int new_ref_count; vnode_object = vnode_pager_lookup(mem_obj); @@ -853,7 +664,7 @@ void vnode_pager_deallocate( memory_object_t mem_obj) { - register vnode_pager_t vnode_object; + vnode_pager_t vnode_object; PAGER_DEBUG(PAGER_ALL, ("vnode_pager_deallocate: %p\n", mem_obj)); @@ -893,7 +704,7 @@ vnode_pager_synchronize( memory_object_size_t length, __unused vm_sync_t sync_flags) { - register vnode_pager_t vnode_object; + vnode_pager_t vnode_object; PAGER_DEBUG(PAGER_ALL, ("vnode_pager_synchronize: %p\n", mem_obj)); @@ -935,7 +746,7 @@ kern_return_t vnode_pager_last_unmap( memory_object_t mem_obj) { - register vnode_pager_t vnode_object; + vnode_pager_t vnode_object; PAGER_DEBUG(PAGER_ALL, ("vnode_pager_last_unmap: %p\n", mem_obj)); @@ -1113,7 +924,7 @@ vnode_pager_t vnode_object_create( struct vnode *vp) { - register vnode_pager_t vnode_object; + vnode_pager_t vnode_object; vnode_object = (struct vnode_pager *) zalloc(vnode_pager_zone); if (vnode_object == VNODE_PAGER_NULL) @@ -1150,6 +961,18 @@ vnode_pager_lookup( } +struct vnode * +vnode_pager_lookup_vnode( + memory_object_t name) +{ + vnode_pager_t vnode_object; + vnode_object = (vnode_pager_t)name; + if(vnode_object->pager_ops == &vnode_pager_ops) + return (vnode_object->vnode_handle); + else + return NULL; +} + /*********************** proc_info implementation *************/ #include diff --git a/osfmk/vm/default_freezer.c b/osfmk/vm/default_freezer.c deleted file mode 100644 index d6601a227..000000000 --- a/osfmk/vm/default_freezer.c +++ /dev/null @@ -1,878 +0,0 @@ -/* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). 
You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#if CONFIG_FREEZE - -#ifndef CONFIG_MEMORYSTATUS -#error "CONFIG_FREEZE defined without matching CONFIG_MEMORYSTATUS" -#endif - -#include - -/* - * Indicates that a page has been faulted back in. - */ -#define FREEZER_OFFSET_ABSENT ((vm_object_offset_t)(-1)) - -lck_grp_attr_t default_freezer_handle_lck_grp_attr; -lck_grp_t default_freezer_handle_lck_grp; - -void -default_freezer_init(void) -{ - lck_grp_attr_setdefault(&default_freezer_handle_lck_grp_attr); - lck_grp_init(&default_freezer_handle_lck_grp, "default_freezer_handle", - &default_freezer_handle_lck_grp_attr); - -} - - -/* - * Create the mapping table that will - * tell us the object/offset pair that - * corresponds to the page being sent - * out or being brought back in. 
- */ - -default_freezer_mapping_table_t -default_freezer_mapping_create(vm_object_t object, vm_offset_t offset) -{ - default_freezer_mapping_table_t table; - - table = kalloc(sizeof(struct default_freezer_mapping_table)); - if (table) { - memset(table, 0, sizeof(*table)); - } else { - panic("Could not allocate mapping table\n"); - } - - table->object = object; - table->offset = offset; - - return table; -} - -/* - * Table modifications/lookup are done behind - * the compact_object lock. - */ - -void -default_freezer_mapping_free(default_freezer_mapping_table_t *table_p, boolean_t all) -{ - default_freezer_mapping_table_t freezer_table = *table_p; - assert(freezer_table); - - if (all) { - do { - default_freezer_mapping_table_t next = freezer_table->next; - kfree(freezer_table, sizeof(*freezer_table)); - freezer_table = next; - } while (freezer_table); - } else { - kfree(freezer_table, sizeof(*freezer_table)); - } -} - -kern_return_t -default_freezer_mapping_store( - default_freezer_mapping_table_t table, - memory_object_offset_t table_offset, - memory_object_t memory_object, - memory_object_offset_t offset) -{ - default_freezer_mapping_table_entry_t entry; - uint32_t index; - - assert(table); - - while (table->next) { - table = table->next; - } - - if (table->index >= MAX_FREEZE_TABLE_ENTRIES) { - vm_object_t compact_object = table->object; - default_freezer_mapping_table_t next; - - next = default_freezer_mapping_create(compact_object, table_offset); - if (!next) { - return KERN_FAILURE; - } - table->next = next; - } - - index = (table)->index++; - entry = &(table)->entry[index]; - - entry->memory_object = memory_object; - entry->offset = offset; - - return KERN_SUCCESS; -} - -kern_return_t -default_freezer_mapping_update( - default_freezer_mapping_table_t table, - memory_object_t memory_object, - memory_object_offset_t offset, - memory_object_offset_t *table_offset, /*OUT: contains the offset into the compact object*/ - boolean_t remove_entry) -{ - - kern_return_t 
kr = KERN_SUCCESS; - vm_object_offset_t compact_offset; - default_freezer_mapping_table_entry_t entry; - uint32_t index = 0; - - if (table == NULL){ - return KERN_FAILURE; - } - - compact_offset = table->offset; - - while (1) { - if (index >= table->index) { - if (table->next) { - table = table->next; - index = 0; - } else { - /* End of tables and we didn't find our candidate entry */ - kr = KERN_FAILURE; - break; - } - } - - entry = &table->entry[index]; - - if (memory_object == entry->memory_object && offset == entry->offset) { - if (remove_entry == TRUE) { - /* - * Mark the page absent whilst retaining the object - * for cleanup during thaw. - */ - entry->offset = FREEZER_OFFSET_ABSENT; - } - if (table_offset != NULL) { - *table_offset = compact_offset; - } - break; - } - - index++; - compact_offset += PAGE_SIZE; - } - return kr; -} - - - -/* - * Create a freezer memory object for this - * vm object. This will be one of the vm - * objects that will pack the compact object. - */ -void -default_freezer_memory_object_create( - vm_object_t object, - default_freezer_handle_t df_handle) -{ - - default_freezer_memory_object_t fo = NULL; - - fo = kalloc(sizeof(struct default_freezer_memory_object)); - - if (fo) { - memory_object_control_t control = NULL; - - memset(fo, 0, sizeof(*fo)); - - control = memory_object_control_allocate(object); - assert (control != MEMORY_OBJECT_CONTROL_NULL); - - df_memory_object_init((memory_object_t)fo, control, 0); - fo->fo_df_handle = df_handle; - - default_freezer_handle_reference_locked(fo->fo_df_handle); - - object->pager = (memory_object_t)fo; - object->pager_created = TRUE; - object->pager_initialized = TRUE; - object->pager_ready = TRUE; - object->pager_trusted = TRUE; - object->pager_control = control; - } else { - panic(" Could not allocate freezer object\n"); - } -} - -kern_return_t -default_freezer_pack( - unsigned int *purgeable_count, - unsigned int *wired_count, - unsigned int *clean_count, - unsigned int *dirty_count, - 
unsigned int dirty_budget, - boolean_t *shared, - vm_object_t src_object, - default_freezer_handle_t df_handle) -{ - kern_return_t kr = KERN_SUCCESS; - - if (df_handle) { - default_freezer_handle_lock(df_handle); - } - - kr = vm_object_pack(purgeable_count, wired_count, clean_count, dirty_count, dirty_budget, shared, src_object, df_handle); - - if (df_handle) { - default_freezer_handle_unlock(df_handle); - } - - return kr; -} - -/* - * Called with freezer_handle locked. - * default_freezer_pack locks the handle, calls - * vm_object_pack which, in turn, will call - * default_freezer_pack_page(). - */ -void -default_freezer_pack_page( - vm_page_t p, - default_freezer_handle_t df_handle) -{ - - default_freezer_mapping_table_t freeze_table = NULL; - memory_object_t memory_object = NULL; - vm_object_t compact_object = VM_OBJECT_NULL; - - assert(df_handle); - - compact_object = df_handle->dfh_compact_object; - - assert(compact_object); - - freeze_table = df_handle->dfh_table; - memory_object = p->object->pager; - - if (memory_object == NULL) { - default_freezer_memory_object_create(p->object, df_handle); - memory_object = p->object->pager; - } else { - assert(df_handle == ((default_freezer_memory_object_t)memory_object)->fo_df_handle); - } - - vm_object_lock(compact_object); - default_freezer_mapping_store(freeze_table, df_handle->dfh_compact_offset, memory_object, p->offset + p->object->paging_offset); - vm_page_rename(p, compact_object, df_handle->dfh_compact_offset, FALSE); - vm_object_unlock(compact_object); - - df_handle->dfh_compact_offset += PAGE_SIZE; -} - - -kern_return_t -default_freezer_unpack( - default_freezer_handle_t df_handle) -{ - - vm_page_t compact_page = VM_PAGE_NULL, src_page = VM_PAGE_NULL; - uint32_t index = 0; - vm_object_t src_object = VM_OBJECT_NULL; - vm_object_t compact_object = VM_OBJECT_NULL; - memory_object_t src_mem_object = MEMORY_OBJECT_NULL; - memory_object_offset_t src_offset = 0; - vm_object_offset_t compact_offset = 0; - 
default_freezer_memory_object_t fo = NULL; - default_freezer_mapping_table_t freeze_table = NULL; - boolean_t should_unlock_handle = FALSE; - kern_return_t kr; - - assert(df_handle); - - default_freezer_handle_lock(df_handle); - should_unlock_handle = TRUE; - - freeze_table = df_handle->dfh_table; - compact_object = df_handle->dfh_compact_object; - - assert(compact_object); - assert(compact_object->alive); - assert(!compact_object->terminating); - assert(compact_object->pager_ready); - - /* Bring the pages back in */ - if ((kr = vm_object_pagein(compact_object)) != KERN_SUCCESS) { - if (should_unlock_handle) { - default_freezer_handle_unlock(df_handle); - } - return (kr); - } - - vm_object_lock(compact_object); - - for (index = 0, compact_offset = 0; ; index++, compact_offset += PAGE_SIZE){ - if (index >= freeze_table->index) { - default_freezer_mapping_table_t table_next; - - table_next = freeze_table->next; - - /* Free the tables as we go along */ - default_freezer_mapping_free(&freeze_table, FALSE); - - if (table_next == NULL){ - break; - } - - freeze_table = table_next; - index = 0; - } - - /* - * Skip slots that represent deallocated memory objects. - */ - src_mem_object = freeze_table->entry[index].memory_object; - if (src_mem_object == MEMORY_OBJECT_NULL) - continue; - - /* - * Skip slots that represent faulted pages. - */ - src_offset = freeze_table->entry[index].offset; - if (src_offset != FREEZER_OFFSET_ABSENT) { - - compact_page = vm_page_lookup(compact_object, compact_offset); - assert(compact_page); - - fo = (default_freezer_memory_object_t)src_mem_object; - - src_object = memory_object_control_to_vm_object(fo->fo_pager_control); - - /* Move back over from the freeze object to the original */ - vm_object_lock(src_object); - src_page = vm_page_lookup(src_object, src_offset - src_object->paging_offset); - if (src_page != VM_PAGE_NULL){ - /* - * We might be racing with a VM fault. - * So handle that gracefully. 
- */ - assert(src_page->absent == TRUE); - VM_PAGE_FREE(src_page); - } - vm_page_rename(compact_page, src_object, src_offset - src_object->paging_offset, FALSE); - vm_object_unlock(src_object); - } - - } - - vm_object_unlock(compact_object); - - vm_object_deallocate(compact_object); - - if (should_unlock_handle) { - df_handle->dfh_table = NULL; - df_handle->dfh_compact_object = VM_OBJECT_NULL; - df_handle->dfh_compact_offset = 0; - default_freezer_handle_unlock(df_handle); - } - return (KERN_SUCCESS); -} - -void -df_memory_object_reference(__unused memory_object_t mem_obj) -{ - - /* No-op */ -} - -void -df_memory_object_deallocate(memory_object_t mem_obj) -{ - - default_freezer_memory_object_t fo = (default_freezer_memory_object_t)mem_obj; - - assert(fo); - - if (fo->fo_df_handle != NULL) { - - default_freezer_mapping_table_t table = NULL; - default_freezer_mapping_table_entry_t entry; - boolean_t found = FALSE; - uint32_t index = 0; - vm_object_t compact_object = VM_OBJECT_NULL; - - default_freezer_handle_lock(fo->fo_df_handle); - - compact_object = fo->fo_df_handle->dfh_compact_object; - table = fo->fo_df_handle->dfh_table; - - if (compact_object == VM_OBJECT_NULL || table == NULL) { - /*Nothing to do. 
A thaw must have cleared it all out.*/ - } else { - vm_object_lock(compact_object); - - /* Remove from table */ - while (1) { - if (index >= table->index) { - if (table->next) { - table = table->next; - index = 0; - } else { - /* End of tables */ - break; - } - } - - entry = &table->entry[index]; - if (mem_obj == entry->memory_object) { - /* It matches, so clear the entry */ - if (!found) { - found = TRUE; - } - entry->memory_object = MEMORY_OBJECT_NULL; - entry->offset = 0; - } else if (MEMORY_OBJECT_NULL != entry->memory_object) { - /* We have a different valid object; we're done */ - if (found) { - break; - } - } - - index++; - } - - vm_object_unlock(compact_object); - } - - if (default_freezer_handle_deallocate_locked(fo->fo_df_handle)) { - default_freezer_handle_unlock(fo->fo_df_handle); - } - } - - kfree(fo, sizeof(*fo)); -} - -kern_return_t -df_memory_object_init( - memory_object_t mem_obj, - memory_object_control_t control, - __unused memory_object_cluster_size_t pager_page_size) -{ - - default_freezer_memory_object_t fo = (default_freezer_memory_object_t)mem_obj; - assert(fo); - - fo->fo_pager_ops = &default_freezer_ops; - fo->fo_pager_header.io_bits = IKOT_MEMORY_OBJECT; - fo->fo_pager_control = control; - - return KERN_SUCCESS; -} - -kern_return_t -df_memory_object_terminate(memory_object_t mem_obj) -{ - - default_freezer_memory_object_t fo = (default_freezer_memory_object_t)mem_obj; - assert(fo); - memory_object_control_deallocate(fo->fo_pager_control); - return KERN_SUCCESS; -} - - -kern_return_t -df_memory_object_data_request( - memory_object_t mem_obj, - memory_object_offset_t offset, - memory_object_cluster_size_t length, - vm_prot_t protection_required, - memory_object_fault_info_t fault_info) -{ - - vm_object_t src_object = VM_OBJECT_NULL, compact_object = VM_OBJECT_NULL; - memory_object_offset_t compact_offset = 0; - memory_object_t pager = NULL; - kern_return_t kr = KERN_SUCCESS; - boolean_t drop_object_ref = FALSE; - vm_page_t compact_page, 
dst_page; - - default_freezer_memory_object_t fo = (default_freezer_memory_object_t)mem_obj; - default_freezer_handle_t df_handle = NULL; - - df_handle = fo->fo_df_handle; - - if (df_handle == NULL) { - kr = KERN_FAILURE; - } else { - default_freezer_handle_lock(df_handle); - - src_object = memory_object_control_to_vm_object(fo->fo_pager_control); - compact_object = fo->fo_df_handle->dfh_compact_object; - - if (compact_object == NULL) { - kr = KERN_FAILURE; - } else { - vm_object_lock(compact_object); - vm_object_reference_locked(compact_object); - drop_object_ref = TRUE; - - kr = default_freezer_mapping_update(fo->fo_df_handle->dfh_table, - mem_obj, - offset, - &compact_offset, - FALSE); - vm_object_unlock(compact_object); - } - default_freezer_handle_unlock(df_handle); - } - - - if (length == 0){ - /*Caller is just querying to see if we have the page*/ - if (drop_object_ref) { - vm_object_deallocate(compact_object); - } - return kr; - } - - if (kr != KERN_SUCCESS){ - - unsigned int request_flags; - upl_t upl; - unsigned int page_list_count = 0; - - request_flags = UPL_NO_SYNC | UPL_RET_ONLY_ABSENT | UPL_SET_LITE | UPL_SET_INTERNAL; - /* - * Should we decide to activate USE_PRECIOUS (from default_pager_internal.h) - * here, then the request_flags will need to add these to the ones above: - * - * request_flags |= UPL_PRECIOUS | UPL_CLEAN_IN_PLACE - */ - request_flags |= UPL_REQUEST_SET_DIRTY; - - memory_object_super_upl_request(fo->fo_pager_control, - (memory_object_offset_t)offset, - PAGE_SIZE, PAGE_SIZE, - &upl, NULL, &page_list_count, - request_flags); - upl_range_needed(upl, 0, 1); - - upl_abort(upl, UPL_ABORT_UNAVAILABLE); - upl_deallocate(upl); - - if (drop_object_ref) { - vm_object_deallocate(compact_object); - } - - return KERN_SUCCESS; - } - vm_object_lock(compact_object); - - assert(compact_object->alive); - assert(!compact_object->terminating); - - /* - * note that the activity_in_progress could be non-zero, but - * the pager has not yet been created 
since the activity_in_progress - * count is bumped via vm_pageout_cluster, while the pager isn't created - * until the pageout thread runs and starts to process the pages - * placed on the I/O queue... once the processing of the compact object - * proceeds to the point where it's placed the first page on the I/O - * queue, we need to wait until the entire freeze operation has completed. - */ - vm_object_paging_wait(compact_object, THREAD_UNINT); - - if (compact_object->pager_ready) { - vm_object_paging_begin(compact_object); - - compact_object->blocked_access = TRUE; - pager = (memory_object_t)compact_object->pager; - - vm_object_unlock(compact_object); - - ((vm_object_fault_info_t) fault_info)->io_sync = TRUE; - - /* - * We have a reference on both the default_freezer - * memory object handle and the compact object. - */ - kr = dp_memory_object_data_request(pager, - compact_offset, - length, - protection_required, - fault_info); - if (kr != KERN_SUCCESS) - panic("%d: default_freezer TOC pointed us to default_pager incorrectly\n", kr); - - vm_object_lock(compact_object); - - compact_object->blocked_access = FALSE; - vm_object_paging_end(compact_object); - } - vm_object_lock(src_object); - - if ((compact_page = vm_page_lookup(compact_object, compact_offset)) != VM_PAGE_NULL){ - - dst_page = vm_page_lookup(src_object, offset - src_object->paging_offset); - - if (dst_page && !dst_page->absent){ - /* - * Someone raced us here and unpacked - * the object behind us. - * So cleanup before we return. 
- */ - VM_PAGE_FREE(compact_page); - } else { - if (dst_page != NULL) { - VM_PAGE_FREE(dst_page); - } - vm_page_rename(compact_page, src_object, offset - src_object->paging_offset, FALSE); - - if (default_freezer_mapping_update(fo->fo_df_handle->dfh_table, - mem_obj, - offset, - NULL, - TRUE) != KERN_SUCCESS) { - printf("Page for object: 0x%lx at offset: 0x%lx not found in table\n", (uintptr_t)src_object, (uintptr_t)offset); - } - - PAGE_WAKEUP_DONE(compact_page); - } - } else { - printf("%d: default_freezer: compact_object doesn't have the page for object 0x%lx at offset 0x%lx \n", kr, (uintptr_t)compact_object, (uintptr_t)compact_offset); - kr = KERN_SUCCESS; - } - vm_object_unlock(src_object); - vm_object_unlock(compact_object); - vm_object_deallocate(compact_object); - - return kr; -} - -kern_return_t -df_memory_object_data_return( - __unused memory_object_t mem_obj, - __unused memory_object_offset_t offset, - __unused memory_object_cluster_size_t size, - __unused memory_object_offset_t *resid_offset, - __unused int *io_error, - __unused boolean_t dirty, - __unused boolean_t kernel_copy, - __unused int upl_flags) -{ - - panic(" default_freezer: df_memory_object_data_return should not be called\n"); - return KERN_SUCCESS; -} - -kern_return_t -df_memory_object_data_initialize( - __unused memory_object_t mem_obj, - __unused memory_object_offset_t offset, - __unused memory_object_cluster_size_t size) -{ - - panic(" default_freezer: df_memory_object_data_initialize should not be called\n"); - return KERN_SUCCESS; -} - -kern_return_t -df_memory_object_data_unlock( - __unused memory_object_t mem_obj, - __unused memory_object_offset_t offset, - __unused memory_object_size_t length, - __unused vm_prot_t prot) -{ - - panic(" default_freezer: df_memory_object_data_unlock should not be called\n"); - return KERN_FAILURE; -} - -kern_return_t -df_memory_object_synchronize( - __unused memory_object_t mem_obj, - __unused memory_object_offset_t offset, - __unused 
memory_object_size_t length, - __unused vm_sync_t flags) -{ - - panic(" default_freezer: df_memory_object_synchronize should not be called\n"); - return KERN_FAILURE; -} - -kern_return_t -df_memory_object_map( - __unused memory_object_t mem_obj, - __unused vm_prot_t prot) -{ - - panic(" default_freezer: df_memory_object_map should not be called\n"); - return KERN_FAILURE; -} - -kern_return_t -df_memory_object_last_unmap(__unused memory_object_t mem_obj) -{ - - panic(" default_freezer: df_memory_object_last_unmap should not be called\n"); - return KERN_FAILURE; -} - - -kern_return_t -df_memory_object_data_reclaim( - __unused memory_object_t mem_obj, - __unused boolean_t reclaim_backing_store) -{ - - panic("df_memory_object_data_reclaim\n"); - return KERN_SUCCESS; -} - - -/* - * The freezer handle is used to make sure that - * we don't race against the lookup and termination - * of the compact object. - */ - -void -default_freezer_handle_lock(default_freezer_handle_t df_handle) { - lck_rw_lock_exclusive(&df_handle->dfh_lck); -} - -void -default_freezer_handle_unlock(default_freezer_handle_t df_handle) { - lck_rw_done(&df_handle->dfh_lck); -} - -default_freezer_handle_t -default_freezer_handle_allocate(void) -{ - - default_freezer_handle_t df_handle = NULL; - df_handle = kalloc(sizeof(struct default_freezer_handle)); - - if (df_handle) { - memset(df_handle, 0, sizeof(struct default_freezer_handle)); - lck_rw_init(&df_handle->dfh_lck, &default_freezer_handle_lck_grp, NULL); - /* No one knows of this handle yet so no need to lock it. 
*/ - default_freezer_handle_reference_locked(df_handle); - } else { - panic("Failed to allocated default_freezer_handle structure\n"); - } - return df_handle; -} - -kern_return_t -default_freezer_handle_init( - default_freezer_handle_t df_handle) -{ - kern_return_t kr = KERN_SUCCESS; - vm_object_t compact_object = VM_OBJECT_NULL; - - if (df_handle == NULL || df_handle->dfh_table != NULL) { - kr = KERN_FAILURE; - } else { - /* Create our compact object */ - compact_object = vm_object_allocate((vm_map_offset_t)(VM_MAX_ADDRESS) - (vm_map_offset_t)(VM_MIN_ADDRESS)); - if (!compact_object) { - kr = KERN_FAILURE; - } else { - df_handle->dfh_compact_object = compact_object; - df_handle->dfh_compact_offset = 0; - df_handle->dfh_table = default_freezer_mapping_create(df_handle->dfh_compact_object, df_handle->dfh_compact_offset); - if (!df_handle->dfh_table) { - kr = KERN_FAILURE; - } - } - } - - return kr; -} - -void -default_freezer_handle_reference_locked( - default_freezer_handle_t df_handle) -{ - assert(df_handle); - df_handle->dfh_ref_count++; -} - -void -default_freezer_handle_deallocate( - default_freezer_handle_t df_handle) -{ - assert(df_handle); - default_freezer_handle_lock(df_handle); - if (default_freezer_handle_deallocate_locked(df_handle)) { - default_freezer_handle_unlock(df_handle); - } -} - -boolean_t -default_freezer_handle_deallocate_locked( - default_freezer_handle_t df_handle) -{ - boolean_t should_unlock = TRUE; - - assert(df_handle); - df_handle->dfh_ref_count--; - if (df_handle->dfh_ref_count == 0) { - - if (df_handle->dfh_compact_object) { - vm_object_deallocate(df_handle->dfh_compact_object); - df_handle->dfh_compact_object = NULL; - df_handle->dfh_compact_offset = 0; - } - - if (df_handle->dfh_table) { - default_freezer_mapping_free(&df_handle->dfh_table, TRUE); - df_handle->dfh_table = NULL; - } - - lck_rw_done(&df_handle->dfh_lck); - lck_rw_destroy(&df_handle->dfh_lck, &default_freezer_handle_lck_grp); - should_unlock = FALSE; - - 
kfree(df_handle, sizeof(struct default_freezer_handle)); - } - return should_unlock; -} - -void -default_freezer_pageout( - default_freezer_handle_t df_handle) -{ - assert(df_handle); - - vm_object_pageout(df_handle->dfh_compact_object); -} - -#endif /* CONFIG_FREEZE */ diff --git a/osfmk/vm/default_freezer.h b/osfmk/vm/default_freezer.h deleted file mode 100644 index f08de63a5..000000000 --- a/osfmk/vm/default_freezer.h +++ /dev/null @@ -1,173 +0,0 @@ -/* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#ifndef _DEFAULT_FREEZER_H_ -#define _DEFAULT_FREEZER_H_ - -#if CONFIG_FREEZE - -#ifdef MACH_KERNEL - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -/* - * Begin declaration for default_freezer_ops. -*/ -extern void df_memory_object_reference(memory_object_t); -extern void df_memory_object_deallocate(memory_object_t); -extern kern_return_t df_memory_object_init(memory_object_t, - memory_object_control_t, - memory_object_cluster_size_t); -extern kern_return_t df_memory_object_terminate(memory_object_t); -extern kern_return_t df_memory_object_data_request(memory_object_t, - memory_object_offset_t, - memory_object_cluster_size_t, - vm_prot_t, - memory_object_fault_info_t); -extern kern_return_t df_memory_object_data_return(memory_object_t, - memory_object_offset_t, - memory_object_cluster_size_t, - memory_object_offset_t *, - int *, - boolean_t, - boolean_t, - int); -extern kern_return_t df_memory_object_data_initialize(memory_object_t, - memory_object_offset_t, - memory_object_cluster_size_t); -extern kern_return_t df_memory_object_data_unlock(memory_object_t, - memory_object_offset_t, - memory_object_size_t, - vm_prot_t); -extern kern_return_t df_memory_object_synchronize(memory_object_t, - memory_object_offset_t, - memory_object_size_t, - vm_sync_t); -extern kern_return_t df_memory_object_map(memory_object_t, - vm_prot_t); -extern kern_return_t df_memory_object_last_unmap(memory_object_t); - -extern kern_return_t df_memory_object_data_reclaim( memory_object_t, - boolean_t); -/* - * End declaration for default_freezer_ops. 
-*/ - -const struct memory_object_pager_ops default_freezer_ops = { - df_memory_object_reference, - df_memory_object_deallocate, - df_memory_object_init, - df_memory_object_terminate, - df_memory_object_data_request, - df_memory_object_data_return, - df_memory_object_data_initialize, - df_memory_object_data_unlock, - df_memory_object_synchronize, - df_memory_object_map, - df_memory_object_last_unmap, - df_memory_object_data_reclaim, - "default freezer" -}; - -#define MAX_FREEZE_TABLE_ENTRIES 128 - -struct default_freezer_mapping_table_entry { - memory_object_t memory_object; /* memory object will lead us to the most current VM object */ - memory_object_offset_t offset; -}; -typedef struct default_freezer_mapping_table *default_freezer_mapping_table_t; - -struct default_freezer_mapping_table { - struct default_freezer_mapping_table *next; - vm_object_t object; /* packed object */ - vm_object_offset_t offset; - unsigned int index; - struct default_freezer_mapping_table_entry entry[MAX_FREEZE_TABLE_ENTRIES]; -}; -typedef struct default_freezer_mapping_table_entry *default_freezer_mapping_table_entry_t; - -struct default_freezer_handle { - lck_rw_t dfh_lck; - uint32_t dfh_ref_count; - default_freezer_mapping_table_t dfh_table; - vm_object_t dfh_compact_object; - vm_object_offset_t dfh_compact_offset; -}; -typedef struct default_freezer_handle *default_freezer_handle_t; - -struct default_freezer_memory_object{ - struct ipc_object_header fo_pager_header; /* fake ip_kotype() */ - memory_object_pager_ops_t fo_pager_ops; /* == &default_freezer_ops */ - memory_object_control_t fo_pager_control; - default_freezer_handle_t fo_df_handle; -}; -typedef struct default_freezer_memory_object *default_freezer_memory_object_t; - - -__private_extern__ void default_freezer_handle_lock(default_freezer_handle_t); -__private_extern__ void default_freezer_handle_unlock(default_freezer_handle_t); - -extern lck_grp_attr_t default_freezer_handle_lck_grp_attr; -extern lck_grp_t 
default_freezer_handle_lck_grp; - -__private_extern__ default_freezer_mapping_table_t default_freezer_mapping_create(vm_object_t, vm_offset_t); - -__private_extern__ void default_freezer_mapping_free(default_freezer_mapping_table_t *table_p, boolean_t all); - -__private_extern__ kern_return_t default_freezer_mapping_store( default_freezer_mapping_table_t , - memory_object_offset_t, - memory_object_t, - memory_object_offset_t ); - -__private_extern__ kern_return_t default_freezer_mapping_update( default_freezer_mapping_table_t, - memory_object_t, - memory_object_offset_t, - memory_object_offset_t *, - boolean_t ); - -__private_extern__ void default_freezer_handle_reference_locked(default_freezer_handle_t); - -__private_extern__ boolean_t default_freezer_handle_deallocate_locked(default_freezer_handle_t); - -__private_extern__ void default_freezer_memory_object_create(vm_object_t, default_freezer_handle_t); - -#endif /* MACH_KERNEL */ -#endif /* CONFIG_FREEZE */ -#endif /* DEFAULT_FREEZER_H */ diff --git a/osfmk/vm/device_vm.c b/osfmk/vm/device_vm.c index 7b4c8f161..abe786665 100644 --- a/osfmk/vm/device_vm.c +++ b/osfmk/vm/device_vm.c @@ -122,7 +122,7 @@ zone_t device_pager_zone; void device_pager_bootstrap(void) { - register vm_size_t size; + vm_size_t size; size = (vm_size_t) sizeof(struct device_pager); device_pager_zone = zinit(size, (vm_size_t) MAX_DNODE*size, @@ -450,7 +450,7 @@ device_pager_last_unmap( device_pager_t device_object_create(void) { - register device_pager_t device_object; + device_pager_t device_object; device_object = (struct device_pager *) zalloc(device_pager_zone); if (device_object == DEVICE_PAGER_NULL) @@ -464,3 +464,11 @@ device_object_create(void) return(device_object); } +boolean_t +is_device_pager_ops(const struct memory_object_pager_ops *pager_ops) +{ + if (pager_ops == &device_pager_ops) { + return TRUE; + } + return FALSE; +} diff --git a/osfmk/vm/lz4.c b/osfmk/vm/lz4.c new file mode 100644 index 000000000..7cb4e365f --- /dev/null 
+++ b/osfmk/vm/lz4.c @@ -0,0 +1,541 @@ +/* + * Copyright (c) 2016-2016 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +// LZ4_RAW buffer API +// EB May 2015 +// Mar 2016 Imported from the Compression project, with minor optimisations and +// early abort detection (Derek Kumar) + +#include "lz4.h" + +size_t lz4raw_decode_buffer(uint8_t * __restrict dst_buffer, size_t dst_size, + const uint8_t * __restrict src_buffer, size_t src_size, + void * __restrict work __attribute__((unused))) +{ + const uint8_t * src = src_buffer; + uint8_t * dst = dst_buffer; + + // Go fast if we can, keeping away from the end of buffers +#if LZ4_ENABLE_ASSEMBLY_DECODE + if (dst_size > LZ4_GOFAST_SAFETY_MARGIN && src_size > LZ4_GOFAST_SAFETY_MARGIN) + { + if (lz4_decode_asm(&dst, dst_buffer, dst_buffer + dst_size - LZ4_GOFAST_SAFETY_MARGIN, &src, src_buffer + src_size - LZ4_GOFAST_SAFETY_MARGIN)) + return 0; // FAIL + } +#endif +//DRKTODO: Can the 'C' "safety" decode be eliminated for 4/16K fixed-sized buffers? + + // Finish safe + if (lz4_decode(&dst, dst_buffer, dst_buffer + dst_size, &src, src_buffer + src_size)) + return 0; // FAIL + + return (size_t)(dst - dst_buffer); // bytes produced +} +// Debug flags +#if LZ4DEBUG +#define DEBUG_LZ4_ENCODE_ERRORS (1) +#define DEBUG_LZ4_DECODE_ERRORS (1) +#endif + +#if DEBUG_LZ4_ENCODE_ERRORS +#endif + +#if !LZ4_ENABLE_ASSEMBLY_ENCODE + +#if defined(__x86_64__) || defined(__x86_64h__) +# define LZ4_MATCH_SEARCH_INIT_SIZE 32 +# define LZ4_MATCH_SEARCH_LOOP_SIZE 32 +#else +# define LZ4_MATCH_SEARCH_INIT_SIZE 8 +# define LZ4_MATCH_SEARCH_LOOP_SIZE 8 +#endif + +// Return hash for 4-byte sequence X +static inline uint32_t lz4_hash(uint32_t x) { return (x * 2654435761U) >> (32 - LZ4_COMPRESS_HASH_BITS); } + +// Store 0xfff..fff at *PTR +static inline void lz4_fill16(uint8_t * ptr) +{ + store8(ptr,-1); + store8(ptr+8,-1); +} + +// Return number of matching bytes 0..4 at positions A and B. 
+static inline size_t lz4_nmatch4(const uint8_t * a,const uint8_t * b) +{ + uint32_t x = load4(a) ^ load4(b); + return (x == 0)?4:(__builtin_ctzl(x) >> 3); +} + +// Return number of matching bytes 0..8 at positions A and B. +static inline size_t lz4_nmatch8(const uint8_t * a,const uint8_t * b) +{ + uint64_t x = load8(a) ^ load8(b); + return (x == 0)?8:(__builtin_ctzll(x) >> 3); +} + +// Return number of matching bytes 0..16 at positions A and B. +static inline size_t lz4_nmatch16(const uint8_t * a,const uint8_t * b) +{ + size_t n = lz4_nmatch8(a,b); + return (n == 8)?(8 + lz4_nmatch8(a+8,b+8)):n; +} + +// Return number of matching bytes 0..32 at positions A and B. +static inline size_t lz4_nmatch32(const uint8_t * a,const uint8_t * b) +{ + size_t n = lz4_nmatch16(a,b); + return (n == 16)?(16 + lz4_nmatch16(a+16,b+16)):n; +} + +// Return number of matching bytes 0..64 at positions A and B. +static inline size_t lz4_nmatch64(const uint8_t * a,const uint8_t * b) +{ + size_t n = lz4_nmatch32(a,b); + return (n == 32)?(32 + lz4_nmatch32(a+32,b+32)):n; +} + +// Compile-time selection, return number of matching bytes 0..N at positions A and B. +static inline size_t lz4_nmatch(int N, const uint8_t * a, const uint8_t * b) +{ + switch (N) { + case 4: return lz4_nmatch4(a,b); + case 8: return lz4_nmatch8(a,b); + case 16: return lz4_nmatch16(a,b); + case 32: return lz4_nmatch32(a,b); + case 64: return lz4_nmatch64(a,b); + } + __builtin_trap(); // FAIL +} + +// Store LENGTH in DST using the literal_length/match_length extension scheme: X is the sum of all bytes until we reach a byte < 0xff. +// We are allowed to access a constant number of bytes above DST_END. 
+// Return incremented DST pointer on success, and 0 on failure +static inline uint8_t *lz4_store_length(uint8_t * dst, const uint8_t * const end, uint32_t L) { + (void)end; + while (L >= 17*255) { + lz4_fill16(dst); + dst += 16; + L -= 16*255; + } + lz4_fill16(dst); + //DRKTODO verify these modulos/divisions are optimally handled by clang + dst += L/255; + *dst++ = L%255; + return dst; +} + +static inline uint32_t clamp(uint32_t x, uint32_t max) __attribute__((overloadable)) { return x > max ? max : x; } + +static inline uint8_t *copy_literal(uint8_t *dst, const uint8_t * restrict src, uint32_t L) { + uint8_t *end = dst + L; + { copy16(dst, src); dst += 16; src += 16; } + while (dst < end) { copy32(dst, src); dst += 32; src += 32; } + return end; +} + +static uint8_t *lz4_emit_match(uint32_t L, uint32_t M, uint32_t D, + uint8_t * restrict dst, + const uint8_t * const end, + const uint8_t * restrict src) { + // The LZ4 encoding scheme requires that M is at least 4, because + // the actual value stored by the encoding is M - 4. Check this + // requirement for debug builds. + assert(M >= 4 && "LZ4 encoding requires that M is at least 4"); + // Having checked that M >= 4, translate M by four. + M -= 4; + // Similarly, we must have D < 2**16, because we use only two bytes + // to represent the value of D in the encoding. + assert(D <= USHRT_MAX && "LZ4 encoding requries that D can be stored in two bytes."); + // Construct the command byte by clamping both L and M to 0 ... 15 + // and packing them into a single byte, and store it. + *dst++ = clamp(L, 15) << 4 | clamp(M, 15); + // If L is 15 or greater, we need to encode extra literal length bytes. + if (L >= 15) { + dst = lz4_store_length(dst, end, L - 15); + if (dst == 0 || dst + L >= end) return NULL; + } + // Copy the literal itself from src to dst. + dst = copy_literal(dst, src, L); + // Store match distance. + store2(dst, D); dst += 2; + // If M is 15 or greater, we need to encode extra match length bytes. 
+ if (M >= 15) { + dst = lz4_store_length(dst, end, M - 15); + if (dst == 0) return NULL; + } + return dst; +} + +/* #ifndef LZ4_EARLY_ABORT */ +/* #define LZ4_EARLY_ABORT (1) */ +/* #endif */ + +#if LZ4_EARLY_ABORT +int lz4_do_early_abort = 1; +int lz4_early_aborts = 0; +#define LZ4_EARLY_ABORT_EVAL (448) +#define LZ4_EARLY_ABORT_MIN_COMPRESSION_FACTOR (20) +#endif /* LZ4_EARLY_ABORT */ + +void lz4_encode_2gb(uint8_t ** dst_ptr, + size_t dst_size, + const uint8_t ** src_ptr, + const uint8_t * src_begin, + size_t src_size, + lz4_hash_entry_t hash_table[LZ4_COMPRESS_HASH_ENTRIES], + int skip_final_literals) +{ + uint8_t *dst = *dst_ptr; // current output stream position + uint8_t *end = dst + dst_size - LZ4_GOFAST_SAFETY_MARGIN; + const uint8_t *src = *src_ptr; // current input stream literal to encode + const uint8_t *src_end = src + src_size - LZ4_GOFAST_SAFETY_MARGIN; + const uint8_t *match_begin = 0; // first byte of matched sequence + const uint8_t *match_end = 0; // first byte after matched sequence +#if LZ4_EARLY_ABORT + uint8_t * const dst_begin = dst; + uint32_t lz4_do_abort_eval = lz4_do_early_abort; +#endif + + while (dst < end) + { + ptrdiff_t match_distance = 0; + for (match_begin = src; match_begin < src_end; match_begin += 1) { + const uint32_t pos = (uint32_t)(match_begin - src_begin); + const uint32_t w0 = load4(match_begin); + const uint32_t w1 = load4(match_begin + 1); + const uint32_t w2 = load4(match_begin + 2); + const uint32_t w3 = load4(match_begin + 3); + const int i0 = lz4_hash(w0); + const int i1 = lz4_hash(w1); + const int i2 = lz4_hash(w2); + const int i3 = lz4_hash(w3); + const uint8_t *c0 = src_begin + hash_table[i0].offset; + const uint8_t *c1 = src_begin + hash_table[i1].offset; + const uint8_t *c2 = src_begin + hash_table[i2].offset; + const uint8_t *c3 = src_begin + hash_table[i3].offset; + const uint32_t m0 = hash_table[i0].word; + const uint32_t m1 = hash_table[i1].word; + const uint32_t m2 = hash_table[i2].word; + const uint32_t 
m3 = hash_table[i3].word; + hash_table[i0].offset = pos; + hash_table[i0].word = w0; + hash_table[i1].offset = pos + 1; + hash_table[i1].word = w1; + + hash_table[i2].offset = pos + 2; + hash_table[i2].word = w2; + hash_table[i3].offset = pos + 3; + hash_table[i3].word = w3; + + match_distance = (match_begin - c0); + if (w0 == m0 && match_distance < 0x10000 && match_distance > 0) { + match_end = match_begin + 4; + goto EXPAND_FORWARD; + } + + match_begin++; + match_distance = (match_begin - c1); + if (w1 == m1 && match_distance < 0x10000 && match_distance > 0) { + match_end = match_begin + 4; + goto EXPAND_FORWARD; + } + + match_begin++; + match_distance = (match_begin - c2); + if (w2 == m2 && match_distance < 0x10000 && match_distance > 0) { + match_end = match_begin + 4; + goto EXPAND_FORWARD; + } + + match_begin++; + match_distance = (match_begin - c3); + if (w3 == m3 && match_distance < 0x10000 && match_distance > 0) { + match_end = match_begin + 4; + goto EXPAND_FORWARD; + } + +#if LZ4_EARLY_ABORT + //DRKTODO: Evaluate unrolling further. 2xunrolling had some modest benefits + if (lz4_do_abort_eval && ((pos) >= LZ4_EARLY_ABORT_EVAL)) { + ptrdiff_t dstd = dst - dst_begin; + + if (dstd == 0) { + lz4_early_aborts++; + return; + } + +/* if (dstd >= pos) { */ +/* return; */ +/* } */ +/* ptrdiff_t cbytes = pos - dstd; */ +/* if ((cbytes * LZ4_EARLY_ABORT_MIN_COMPRESSION_FACTOR) > pos) { */ +/* return; */ +/* } */ + lz4_do_abort_eval = 0; + } +#endif + } + + if (skip_final_literals) { *src_ptr = src; *dst_ptr = dst; return; } // do not emit the final literal sequence + + // Emit a trailing literal that covers the remainder of the source buffer, + // if we can do so without exceeding the bounds of the destination buffer. 
+ size_t src_remaining = src_end + LZ4_GOFAST_SAFETY_MARGIN - src; + if (src_remaining < 15) { + *dst++ = (uint8_t)(src_remaining << 4); + memcpy(dst, src, 16); dst += src_remaining; + } else { + *dst++ = 0xf0; + dst = lz4_store_length(dst, end, (uint32_t)(src_remaining - 15)); + if (dst == 0 || dst + src_remaining >= end) return; + memcpy(dst, src, src_remaining); dst += src_remaining; + } + *dst_ptr = dst; + *src_ptr = src + src_remaining; + return; + + EXPAND_FORWARD: + + // Expand match forward + { + const uint8_t * ref_end = match_end - match_distance; + while (match_end < src_end) + { + size_t n = lz4_nmatch(LZ4_MATCH_SEARCH_LOOP_SIZE, ref_end, match_end); + if (n < LZ4_MATCH_SEARCH_LOOP_SIZE) { match_end += n; break; } + match_end += LZ4_MATCH_SEARCH_LOOP_SIZE; + ref_end += LZ4_MATCH_SEARCH_LOOP_SIZE; + } + } + + // Expand match backward + { + // match_begin_min = max(src_begin + match_distance,literal) + const uint8_t * match_begin_min = src_begin + match_distance; + match_begin_min = (match_begin_min < src)?src:match_begin_min; + const uint8_t * ref_begin = match_begin - match_distance; + + while (match_begin > match_begin_min && ref_begin[-1] == match_begin[-1] ) { match_begin -= 1; ref_begin -= 1; } + } + + // Emit match + dst = lz4_emit_match((uint32_t)(match_begin - src), (uint32_t)(match_end - match_begin), (uint32_t)match_distance, dst, end, src); + if (!dst) return; + + // Update state + src = match_end; + + // Update return values to include the last fully encoded match + *dst_ptr = dst; + *src_ptr = src; + } +} + +#endif + +size_t lz4raw_encode_buffer(uint8_t * __restrict dst_buffer, size_t dst_size, + const uint8_t * __restrict src_buffer, size_t src_size, + lz4_hash_entry_t hash_table[LZ4_COMPRESS_HASH_ENTRIES]) +{ + // Initialize hash table + const lz4_hash_entry_t HASH_FILL = { .offset = 0x80000000, .word = 0x0 }; + + const uint8_t * src = src_buffer; + uint8_t * dst = dst_buffer; + + // We need several blocks because our base function is 
limited to 2GB input + const size_t BLOCK_SIZE = 0x7ffff000; + while (src_size > 0) + { + //DRKTODO either implement pattern4 or figure out optimal unroll + //DRKTODO: bizarrely, with plain O3 the compiler generates a single + //DRKTODO: scalar STP per loop iteration with the stock loop + //DRKTODO If hand unrolled, it switches to NEON store pairs + // Reset hash table for each block +/* #if __STDC_HOSTED__ */ +/* memset_pattern8(hash_table, &HASH_FILL, lz4_encode_scratch_size); */ +/* #else */ +/* for (int i=0;i BLOCK_SIZE ? BLOCK_SIZE : src_size; + + // Run the encoder, only the last block emits final literals. Allows concatenation of encoded payloads. + // Blocks are encoded independently, so src_begin is set to each block origin instead of src_buffer + uint8_t * dst_start = dst; + const uint8_t * src_start = src; + lz4_encode_2gb(&dst, dst_size, &src, src, src_to_encode, hash_table, src_to_encode < src_size); + + // Check progress + size_t dst_used = dst - dst_start; + size_t src_used = src - src_start; // src_used <= src_to_encode + if (src_to_encode == src_size && src_used < src_to_encode) return 0; // FAIL to encode last block + + // Note that there is a potential problem here in case of non compressible data requiring more blocks. + // We may end up here with src_used very small, or even 0, and will not be able to make progress during + // compression. We FAIL unless the length of literals remaining at the end is small enough. 
+ if (src_to_encode < src_size && src_to_encode - src_used >= (1<<16)) return 0; // FAIL too many literals + + // Update counters (SRC and DST already have been updated) + src_size -= src_used; + dst_size -= dst_used; + } + + return (size_t)(dst - dst_buffer); // bytes produced +} + +#define likely(expr) __builtin_expect((expr) != 0, 1) +#define unlikely(expr) __builtin_expect((expr) != 0, 0) +typedef uint32_t lz4_uint128 __attribute__((ext_vector_type(4))) __attribute__((__aligned__(1))); + +int lz4_decode(uint8_t ** dst_ptr, + uint8_t * dst_begin, + uint8_t * dst_end, + const uint8_t ** src_ptr, + const uint8_t * src_end) +{ + uint8_t * dst = *dst_ptr; + const uint8_t * src = *src_ptr; + + // Require dst_end > dst. + if (dst_end <= dst) goto OUT_FULL; + + while (src < src_end) + { + // Keep last good position + *src_ptr = src; + *dst_ptr = dst; + + uint8_t cmd = *src++; // 1 byte encoding literal+(match-4) length: LLLLMMMM + uint32_t literalLength = (cmd >> 4) & 15; // 0..15 + uint32_t matchLength = 4 + (cmd & 15); // 4..19 + + // extra bytes for literalLength + if (unlikely(literalLength == 15)) + { + uint8_t s; + do { +#if DEBUG_LZ4_DECODE_ERRORS + if (unlikely(src >= src_end)) printf("Truncated SRC literal length\n"); +#endif + if (unlikely(src >= src_end)) goto IN_FAIL; // unexpected end of input (1 byte needed) + s = *src++; + literalLength += s; + } while (unlikely(s == 255)); + } + + // copy literal +#if DEBUG_LZ4_DECODE_ERRORS + if (unlikely(literalLength > (size_t)(src_end - src))) printf("Truncated SRC literal\n"); +#endif + if (unlikely(literalLength > (size_t)(src_end - src))) goto IN_FAIL; + if (unlikely(literalLength > (size_t)(dst_end - dst))) { + // literal will take us past the end of the destination buffer, + // so we can only copy part of it. 
+ literalLength = (uint32_t)(dst_end - dst); + memcpy(dst, src, literalLength); + dst += literalLength; + goto OUT_FULL; + } + memcpy(dst,src,literalLength); + src += literalLength; + dst += literalLength; + + if (unlikely(src >= src_end)) goto OUT_FULL; // valid end of stream +#if DEBUG_LZ4_DECODE_ERRORS + if (unlikely(2 > (size_t)(src_end - src))) printf("Truncated SRC distance\n"); +#endif + if (unlikely(2 > (size_t)(src_end - src))) goto IN_FAIL; // unexpected end of input (2 bytes needed) + + //DRKTODO: this causes an alignment increase warning (legitimate?) + //DRKTODO: cast of char * to uint16_t* + #pragma clang diagnostic push + #pragma clang diagnostic ignored "-Wcast-align" + + // match distance + uint64_t matchDistance = *(const uint16_t *)src; // 0x0000 <= matchDistance <= 0xffff + #pragma clang diagnostic pop + src += 2; +#if DEBUG_LZ4_DECODE_ERRORS + if (matchDistance == 0) printf("Invalid match distance D = 0\n"); +#endif + if (unlikely(matchDistance == 0)) goto IN_FAIL; // 0x0000 invalid + uint8_t * ref = dst - matchDistance; +#if DEBUG_LZ4_DECODE_ERRORS + if (unlikely(ref < dst_begin)) printf("Invalid reference D=0x%llx dst_begin=%p dst=%p dst_end=%p\n",matchDistance,dst_begin,dst,dst_end); +#endif + if (unlikely(ref < dst_begin)) goto OUT_FAIL; // out of range + + // extra bytes for matchLength + if (unlikely(matchLength == 19)) + { + uint8_t s; + do { +#if DEBUG_LZ4_DECODE_ERRORS + if (unlikely(src >= src_end)) printf("Truncated SRC match length\n"); +#endif + if (unlikely(src >= src_end)) goto IN_FAIL; // unexpected end of input (1 byte needed) + s = *src++; + matchLength += s; + } while (unlikely(s == 255)); + } + + // copy match (may overlap) + if (unlikely(matchLength > (size_t)(dst_end - dst))) { + // match will take us past the end of the destination buffer, + // so we can only copy part of it. 
+ matchLength = (uint32_t)(dst_end - dst); + for (uint32_t i=0; i +#include +#include +#include +#include "lz4_assembly_select.h" +#include "lz4_constants.h" + +#define memcpy __builtin_memcpy + +#pragma mark - Building blocks + +// Represents a position in the input stream +typedef struct { uint32_t offset; uint32_t word; } lz4_hash_entry_t; +static const size_t lz4_hash_table_size = LZ4_COMPRESS_HASH_ENTRIES*sizeof(lz4_hash_entry_t); + +// Worker function for lz4 encode. Underlies both the buffer and stream encode operations. +// Performs lz4 encoding of up to 2gb of data, updates dst_ptr and src_ptr to point to the +// first byte of output and input that couldn't be completely processed, respectively. +// +// If skip_final_literals is 0, the entire src buffer is encoded, by emitting a final sequence of literals +// at the end of the compressed payload. +// +// If skip_final_literals is not 0, this final literal sequence is not emitted, and the src buffer is +// partially encoded (the length of this literal sequence varies). 
+extern void lz4_encode_2gb(uint8_t **dst_ptr, size_t dst_size, + const uint8_t **src_ptr, const uint8_t *src_begin, size_t src_size, + lz4_hash_entry_t hash_table[LZ4_COMPRESS_HASH_ENTRIES],int skip_final_literals); + +extern int lz4_decode(uint8_t **dst_ptr, uint8_t *dst_begin, uint8_t *dst_end, + const uint8_t **src_ptr, const uint8_t *src_end); + +#if LZ4_ENABLE_ASSEMBLY_DECODE +extern int lz4_decode_asm(uint8_t **dst_ptr, uint8_t *dst_begin, uint8_t *dst_end, + const uint8_t **src_ptr, const uint8_t *src_end); +#endif + +#pragma mark - Buffer interfaces + +static const size_t lz4_encode_scratch_size = lz4_hash_table_size; +static const size_t lz4_decode_scratch_size = 0; + +#pragma mark - Buffer interfaces (LZ4 RAW) + +size_t lz4raw_encode_buffer(uint8_t * __restrict dst_buffer, size_t dst_size, + const uint8_t * __restrict src_buffer, size_t src_size, + lz4_hash_entry_t hash_table[LZ4_COMPRESS_HASH_ENTRIES]); + +size_t lz4raw_decode_buffer(uint8_t * __restrict dst_buffer, size_t dst_size, + const uint8_t * __restrict src_buffer, size_t src_size, + void * __restrict work __attribute__((unused))); + +typedef __attribute__((__ext_vector_type__(8))) uint8_t vector_uchar8; +typedef __attribute__((__ext_vector_type__(16))) uint8_t vector_uchar16; +typedef __attribute__((__ext_vector_type__(32))) uint8_t vector_uchar32; +typedef __attribute__((__ext_vector_type__(64))) uint8_t vector_uchar64; +typedef __attribute__((__ext_vector_type__(16),__aligned__(1))) uint8_t packed_uchar16; +typedef __attribute__((__ext_vector_type__(32),__aligned__(1))) uint8_t packed_uchar32; +typedef __attribute__((__ext_vector_type__(64),__aligned__(1))) uint8_t packed_uchar64; + +typedef __attribute__((__ext_vector_type__(4))) uint16_t vector_ushort4; +typedef __attribute__((__ext_vector_type__(4),__aligned__(2))) uint16_t packed_ushort4; + +typedef __attribute__((__ext_vector_type__(2))) int32_t vector_int2; +typedef __attribute__((__ext_vector_type__(4))) int32_t vector_int4; +typedef 
__attribute__((__ext_vector_type__(8))) int32_t vector_int8; + +typedef __attribute__((__ext_vector_type__(4))) uint32_t vector_uint4; + +#define UTIL_FUNCTION static inline __attribute__((__always_inline__)) __attribute__((__overloadable__)) + +// Load N bytes from unaligned location PTR +UTIL_FUNCTION uint16_t load2(const void * ptr) { uint16_t data; memcpy(&data,ptr,sizeof data); return data; } +UTIL_FUNCTION uint32_t load4(const void * ptr) { uint32_t data; memcpy(&data,ptr,sizeof data); return data; } +UTIL_FUNCTION uint64_t load8(const void * ptr) { uint64_t data; memcpy(&data,ptr,sizeof data); return data; } +UTIL_FUNCTION vector_uchar16 load16(const void * ptr) { return (const vector_uchar16)*(const packed_uchar16 *)ptr; } +UTIL_FUNCTION vector_uchar32 load32(const void * ptr) { return (const vector_uchar32)*(const packed_uchar32 *)ptr; } +UTIL_FUNCTION vector_uchar64 load64(const void * ptr) { return (const vector_uchar64)*(const packed_uchar64 *)ptr; } + +// Store N bytes to unaligned location PTR +UTIL_FUNCTION void store2(void * ptr,uint16_t data) { memcpy(ptr,&data,sizeof data); } +UTIL_FUNCTION void store4(void * ptr,uint32_t data) { memcpy(ptr,&data,sizeof data); } +UTIL_FUNCTION void store8(void * ptr,uint64_t data) { memcpy(ptr,&data,sizeof data); } +UTIL_FUNCTION void store16(void * ptr,vector_uchar16 data) { *(packed_uchar16 *)ptr = (packed_uchar16)data; } +UTIL_FUNCTION void store32(void * ptr,vector_uchar32 data) { *(packed_uchar32 *)ptr = (packed_uchar32)data; } +UTIL_FUNCTION void store64(void * ptr,vector_uchar64 data) { *(packed_uchar64 *)ptr = (packed_uchar64)data; } + +// Load+Store N bytes from unaligned locations SRC to DST. No overlap allowed. 
+UTIL_FUNCTION void copy8(void * dst,const void * src) { store8(dst,load8(src)); } +UTIL_FUNCTION void copy16(void * dst,const void * src) { *(packed_uchar16 *)dst = *(const packed_uchar16 *)src; } +UTIL_FUNCTION void copy32(void * dst,const void * src) { *(packed_uchar32 *)dst = *(const packed_uchar32 *)src; } diff --git a/bsd/kern/vm_pressure.h b/osfmk/vm/lz4_assembly_select.h similarity index 65% rename from bsd/kern/vm_pressure.h rename to osfmk/vm/lz4_assembly_select.h index 402283583..7971711f4 100644 --- a/bsd/kern/vm_pressure.h +++ b/osfmk/vm/lz4_assembly_select.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009 Apple Inc. All rights reserved. + * Copyright (c) 2016-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -26,24 +26,19 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -#ifndef VM_PRESSURE_H -#define VM_PRESSURE_H +#pragma once -#include -#include +// Common header to enable/disable the assembly code paths +// Rule: one define for each assembly source file -void vm_pressure_init(lck_grp_t *grp, lck_attr_t *attr); +// To enable assembly +#if defined __ARM_NEON__ +#define LZ4_ENABLE_ASSEMBLY_ENCODE_ARMV7 1 +#define LZ4_ENABLE_ASSEMBLY_DECODE_ARMV7 1 +#elif defined __x86_64__ +#define LZ4_ENABLE_ASSEMBLY_DECODE_X86_64 1 +#endif -int vm_knote_register(struct knote *); -void vm_knote_unregister(struct knote *); - -void consider_vm_pressure_events(void); -void vm_pressure_proc_cleanup(proc_t); - -#if VM_PRESSURE_EVENTS -void vm_find_pressure_foreground_candidates(void); -void vm_find_pressure_candidate(void); -boolean_t vm_dispatch_pressure_note_to_pid(pid_t pid, boolean_t locked); -#endif /* VM_PRESSURE_EVENTS */ - -#endif /* VM_PRESSURE_H */ +// To disable C +#define LZ4_ENABLE_ASSEMBLY_ENCODE ((LZ4_ENABLE_ASSEMBLY_ENCODE_ARMV7) || (LZ4_ENABLE_ASSEMBLY_ENCODE_ARM64)) +#define LZ4_ENABLE_ASSEMBLY_DECODE (LZ4_ENABLE_ASSEMBLY_DECODE_ARM64 || LZ4_ENABLE_ASSEMBLY_DECODE_ARMV7 || LZ4_ENABLE_ASSEMBLY_DECODE_X86_64) diff --git 
a/osfmk/chud/chud_dtrace.h b/osfmk/vm/lz4_constants.h similarity index 71% rename from osfmk/chud/chud_dtrace.h rename to osfmk/vm/lz4_constants.h index 1cc06a511..a3d9a5256 100644 --- a/osfmk/chud/chud_dtrace.h +++ b/osfmk/vm/lz4_constants.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007 Apple Inc. All rights reserved. + * Copyright (c) 2016-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -26,16 +26,14 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -#ifndef __CHUD_DTRACE_H__ -#define __CHUD_DTRACE_H__ +#pragma once -/* Definitions for arguments to the chud() dtrace builtin */ +// Tunables +#define LZ4_COMPRESS_HASH_BITS 10 +#define LZ4_COMPRESS_HASH_ENTRIES (1 << LZ4_COMPRESS_HASH_BITS) +#define LZ4_COMPRESS_HASH_MULTIPLY 2654435761U +#define LZ4_COMPRESS_HASH_SHIFT (32 - LZ4_COMPRESS_HASH_BITS) -#define CHUD_DTRACE_START_SHARK ((uint64_t)0x1ULL) -#define CHUD_DTRACE_STOP_SHARK ((uint64_t)0x2ULL) -#define CHUD_DTRACE_RECORD_SAMPLE ((uint64_t)0x3ULL) -#define CHUD_DTRACE_SIGNPOST_POINT ((uint64_t)0x4ULL) -#define CHUD_DTRACE_SIGNPOST_START ((uint64_t)0x5ULL) -#define CHUD_DTRACE_SIGNPOST_END ((uint64_t)0x6ULL) - -#endif /* __CHUD_DTRACE_H__ */ +// Not tunables +#define LZ4_GOFAST_SAFETY_MARGIN 128 +#define LZ4_DISTANCE_BOUND 65536 diff --git a/osfmk/vm/memory_object.c b/osfmk/vm/memory_object.c index 848b1eea8..d4bf4dcd8 100644 --- a/osfmk/vm/memory_object.c +++ b/osfmk/vm/memory_object.c @@ -129,7 +129,7 @@ decl_lck_mtx_data(, memory_manager_default_lock) #define memory_object_should_return_page(m, should_return) \ (should_return != MEMORY_OBJECT_RETURN_NONE && \ - (((m)->dirty || ((m)->dirty = pmap_is_modified((m)->phys_page))) || \ + (((m)->dirty || ((m)->dirty = pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(m)))) || \ ((m)->precious && (should_return) == MEMORY_OBJECT_RETURN_ALL) || \ (should_return) == MEMORY_OBJECT_RETURN_ANYTHING)) @@ -212,7 +212,7 @@ memory_object_lock_page( * for the page to go from the clean to the dirty state * 
after we've made our decision */ - if (pmap_disconnect(m->phys_page) & VM_MEM_MODIFIED) { + if (pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)) & VM_MEM_MODIFIED) { SET_PAGE_DIRTY(m, FALSE); } } else { @@ -222,7 +222,7 @@ memory_object_lock_page( * (pmap_page_protect may not increase protection). */ if (prot != VM_PROT_NO_CHANGE) - pmap_page_protect(m->phys_page, VM_PROT_ALL & ~prot); + pmap_page_protect(VM_PAGE_GET_PHYS_PAGE(m), VM_PROT_ALL & ~prot); } /* * Handle returning dirty or precious pages @@ -238,7 +238,7 @@ memory_object_lock_page( * faulted back into an address space * * if (!should_flush) - * pmap_disconnect(m->phys_page); + * pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)); */ return (MEMORY_OBJECT_LOCK_RESULT_MUST_RETURN); } @@ -513,7 +513,8 @@ MACRO_BEGIN \ } \ MACRO_END - +extern struct vnode * +vnode_pager_lookup_vnode(memory_object_t); static int vm_object_update_extent( @@ -624,7 +625,7 @@ vm_object_update_extent( /* * add additional state for the flush */ - m->pageout = TRUE; + m->free_when_done = TRUE; } /* * we use to remove the page from the queues at this @@ -651,9 +652,8 @@ vm_object_update_extent( } } - if (dirty_count) { - task_update_logical_writes(current_task(), (dirty_count * PAGE_SIZE), TASK_WRITE_INVALIDATED); - } + if (object->pager) + task_update_logical_writes(current_task(), (dirty_count * PAGE_SIZE), TASK_WRITE_INVALIDATED, vnode_pager_lookup_vnode(object->pager)); /* * We have completed the scan for applicable pages. * Clean any pages that have been saved. 
@@ -844,18 +844,17 @@ vm_object_update( case VM_FAULT_SUCCESS: if (top_page) { vm_fault_cleanup( - page->object, top_page); + VM_PAGE_OBJECT(page), top_page); vm_object_lock(copy_object); vm_object_paging_begin(copy_object); } - if (!page->active && - !page->inactive && - !page->throttled) { + if (( !VM_PAGE_NON_SPECULATIVE_PAGEABLE(page))) { + vm_page_lockspin_queues(); - if (!page->active && - !page->inactive && - !page->throttled) + + if (( !VM_PAGE_NON_SPECULATIVE_PAGEABLE(page))) { vm_page_deactivate(page); + } vm_page_unlock_queues(); } PAGE_WAKEUP_DONE(page); @@ -904,6 +903,7 @@ vm_object_update( } if (copy_object != VM_OBJECT_NULL && copy_object != object) { if ((flags & MEMORY_OBJECT_DATA_PURGE)) { + vm_object_lock_assert_exclusive(copy_object); copy_object->shadow_severed = TRUE; copy_object->shadowed = FALSE; copy_object->shadow = NULL; @@ -955,10 +955,10 @@ vm_object_update( num_of_extents = 0; e_mask = ~((vm_object_size_t)(EXTENT_SIZE - 1)); - m = (vm_page_t) queue_first(&object->memq); + m = (vm_page_t) vm_page_queue_first(&object->memq); - while (!queue_end(&object->memq, (queue_entry_t) m)) { - next = (vm_page_t) queue_next(&m->listq); + while (!vm_page_queue_end(&object->memq, (vm_page_queue_entry_t) m)) { + next = (vm_page_t) vm_page_queue_next(&m->listq); if ((m->offset >= start) && (m->offset < end)) { /* @@ -1950,6 +1950,41 @@ memory_object_mark_io_tracking( } } +#if CONFIG_SECLUDED_MEMORY +void +memory_object_mark_eligible_for_secluded( + memory_object_control_t control, + boolean_t eligible_for_secluded) +{ + vm_object_t object; + + if (control == NULL) + return; + object = memory_object_control_to_vm_object(control); + + if (object == VM_OBJECT_NULL) { + return; + } + + vm_object_lock(object); + if (eligible_for_secluded && + secluded_for_filecache && /* global boot-arg */ + !object->eligible_for_secluded) { + object->eligible_for_secluded = TRUE; + vm_page_secluded.eligible_for_secluded += object->resident_page_count; + } else if 
(!eligible_for_secluded && + object->eligible_for_secluded) { + object->eligible_for_secluded = FALSE; + vm_page_secluded.eligible_for_secluded -= object->resident_page_count; + if (object->resident_page_count) { + /* XXX FBDP TODO: flush pages from secluded queue? */ + // printf("FBDP TODO: flush %d pages from %p from secluded queue\n", object->resident_page_count, object); + } + } + vm_object_unlock(object); +} +#endif /* CONFIG_SECLUDED_MEMORY */ + kern_return_t memory_object_pages_resident( memory_object_control_t control, @@ -2323,19 +2358,6 @@ kern_return_t memory_object_data_reclaim reclaim_backing_store); } -/* Routine memory_object_create */ -kern_return_t memory_object_create -( - memory_object_default_t default_memory_manager, - vm_size_t new_memory_object_size, - memory_object_t *new_memory_object -) -{ - return default_pager_memory_object_create(default_memory_manager, - new_memory_object_size, - new_memory_object); -} - upl_t convert_port_to_upl( ipc_port_t port) diff --git a/osfmk/vm/memory_object.h b/osfmk/vm/memory_object.h index 35a35591c..2b26870ea 100644 --- a/osfmk/vm/memory_object.h +++ b/osfmk/vm/memory_object.h @@ -147,4 +147,10 @@ extern void memory_object_mark_unused( extern void memory_object_mark_io_tracking( memory_object_control_t control); +#if CONFIG_SECLUDED_MEMORY +extern void memory_object_mark_eligible_for_secluded( + memory_object_control_t control, + boolean_t eligible_for_secluded); +#endif /* CONFIG_SECLUDED_MEMORY */ + #endif /* _VM_MEMORY_OBJECT_H_ */ diff --git a/osfmk/vm/pmap.h b/osfmk/vm/pmap.h index 8a4b26961..d907093ac 100644 --- a/osfmk/vm/pmap.h +++ b/osfmk/vm/pmap.h @@ -413,17 +413,19 @@ extern kern_return_t (pmap_attribute)( /* Get/Set special memory pmap_t __pmap = (pmap); \ vm_page_t __page = (page); \ int __options = 0; \ + vm_object_t __obj; \ \ PMAP_ENTER_CHECK(__pmap, __page) \ - if (__page->object->internal) { \ + __obj = VM_PAGE_OBJECT(__page); \ + if (__obj->internal) { \ __options |= 
PMAP_OPTIONS_INTERNAL; \ } \ - if (__page->reusable || __page->object->all_reusable) { \ + if (__page->reusable || __obj->all_reusable) { \ __options |= PMAP_OPTIONS_REUSABLE; \ } \ (void) pmap_enter_options(__pmap, \ (virtual_address), \ - __page->phys_page, \ + VM_PAGE_GET_PHYS_PAGE(__page), \ (protection), \ (fault_type), \ (flags), \ @@ -440,17 +442,19 @@ extern kern_return_t (pmap_attribute)( /* Get/Set special memory pmap_t __pmap = (pmap); \ vm_page_t __page = (page); \ int __extra_options = 0; \ + vm_object_t __obj; \ \ PMAP_ENTER_CHECK(__pmap, __page) \ - if (__page->object->internal) { \ + __obj = VM_PAGE_OBJECT(__page); \ + if (__obj->internal) { \ __extra_options |= PMAP_OPTIONS_INTERNAL; \ } \ - if (__page->reusable || __page->object->all_reusable) { \ + if (__page->reusable || __obj->all_reusable) { \ __extra_options |= PMAP_OPTIONS_REUSABLE; \ } \ result = pmap_enter_options(__pmap, \ (virtual_address), \ - __page->phys_page, \ + VM_PAGE_GET_PHYS_PAGE(__page), \ (protection), \ (fault_type), \ (flags), \ @@ -464,7 +468,7 @@ extern kern_return_t (pmap_attribute)( /* Get/Set special memory #define PMAP_SET_CACHE_ATTR(mem, object, cache_attr, batch_pmap_op) \ MACRO_BEGIN \ if (!batch_pmap_op) { \ - pmap_set_cache_attributes(mem->phys_page, cache_attr); \ + pmap_set_cache_attributes(VM_PAGE_GET_PHYS_PAGE(mem), cache_attr); \ object->set_cache_attr = TRUE; \ } \ MACRO_END @@ -575,6 +579,7 @@ extern kern_return_t pmap_unnest_options(pmap_t, uint64_t, unsigned int); extern boolean_t pmap_adjust_unnest_parameters(pmap_t, vm_map_offset_t *, vm_map_offset_t *); +extern void pmap_advise_pagezero_range(pmap_t, uint64_t); #endif /* MACH_KERNEL_PRIVATE */ extern boolean_t pmap_is_noencrypt(ppnum_t); @@ -663,6 +668,17 @@ mach_vm_size_t pmap_query_resident(pmap_t pmap, vm_map_offset_t e, mach_vm_size_t *compressed_bytes_p); +#define PMAP_QUERY_PAGE_PRESENT 0x01 +#define PMAP_QUERY_PAGE_REUSABLE 0x02 +#define PMAP_QUERY_PAGE_INTERNAL 0x04 +#define 
PMAP_QUERY_PAGE_ALTACCT 0x08 +#define PMAP_QUERY_PAGE_COMPRESSED 0x10 +#define PMAP_QUERY_PAGE_COMPRESSED_ALTACCT 0x20 +extern kern_return_t pmap_query_page_info( + pmap_t pmap, + vm_map_offset_t va, + int *disp); + #if CONFIG_PGTRACE int pmap_pgtrace_add_page(pmap_t pmap, vm_map_offset_t start, vm_map_offset_t end); int pmap_pgtrace_delete_page(pmap_t pmap, vm_map_offset_t start, vm_map_offset_t end); diff --git a/osfmk/vm/vm32_user.c b/osfmk/vm/vm32_user.c index 607e8d4b5..73e0b2bea 100644 --- a/osfmk/vm/vm32_user.c +++ b/osfmk/vm/vm32_user.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008 Apple Inc. All rights reserved. + * Copyright (c) 2008-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -90,7 +90,7 @@ #include #include -#if VM32_SUPPORT +#ifdef VM32_SUPPORT /* * See vm_user.c for the real implementation of all of these functions. diff --git a/osfmk/vm/vm_apple_protect.c b/osfmk/vm/vm_apple_protect.c index 91208ca11..50be9b657 100644 --- a/osfmk/vm/vm_apple_protect.c +++ b/osfmk/vm/vm_apple_protect.c @@ -45,19 +45,18 @@ #include #include #include +#include #include #include -#include -#include - #include #include #include #include #include #include +#include /* @@ -277,6 +276,12 @@ apple_protect_pager_init( panic("apple_protect_pager_init: " "memory_object_change_attributes() failed"); +#if CONFIG_SECLUDED_MEMORY + if (secluded_for_filecache) { + memory_object_mark_eligible_for_secluded(control, TRUE); + } +#endif /* CONFIG_SECLUDED_MEMORY */ + return KERN_SUCCESS; } @@ -347,7 +352,7 @@ apple_protect_pager_data_request( upl_size_t upl_size; upl_page_info_t *upl_pl; unsigned int pl_count; - vm_object_t src_object, dst_object; + vm_object_t src_top_object, src_page_object, dst_object; kern_return_t kr, retval; vm_map_offset_t kernel_mapping; vm_offset_t src_vaddr, dst_vaddr; @@ -363,7 +368,8 @@ apple_protect_pager_data_request( PAGER_DEBUG(PAGER_ALL, ("apple_protect_pager_data_request: %p, %llx, %x, %x\n", mem_obj, offset, 
length, protection_required)); retval = KERN_SUCCESS; - src_object = VM_OBJECT_NULL; + src_top_object = VM_OBJECT_NULL; + src_page_object = VM_OBJECT_NULL; kernel_mapping = 0; upl = NULL; upl_pl = NULL; @@ -440,9 +446,9 @@ apple_protect_pager_data_request( * backing VM object (itself backed by the encrypted file via * the vnode pager). */ - src_object = pager->backing_object; - assert(src_object != VM_OBJECT_NULL); - vm_object_reference(src_object); /* to keep the source object alive */ + src_top_object = pager->backing_object; + assert(src_top_object != VM_OBJECT_NULL); + vm_object_reference(src_top_object); /* keep the source object alive */ /* * Fill in the contents of the pages requested by VM. @@ -462,15 +468,15 @@ apple_protect_pager_data_request( /* * Map the source (encrypted) page in the kernel's * virtual address space. - * We already hold a reference on the src_object. + * We already hold a reference on the src_top_object. */ retry_src_fault: - vm_object_lock(src_object); - vm_object_paging_begin(src_object); + vm_object_lock(src_top_object); + vm_object_paging_begin(src_top_object); error_code = 0; prot = VM_PROT_READ; src_page = VM_PAGE_NULL; - kr = vm_fault_page(src_object, + kr = vm_fault_page(src_top_object, pager->backing_offset + offset + cur_offset, VM_PROT_READ, FALSE, @@ -498,8 +504,8 @@ apple_protect_pager_data_request( goto done; case VM_FAULT_SUCCESS_NO_VM_PAGE: /* success but no VM page: fail */ - vm_object_paging_end(src_object); - vm_object_unlock(src_object); + vm_object_paging_end(src_top_object); + vm_object_unlock(src_top_object); /*FALLTHROUGH*/ case VM_FAULT_MEMORY_ERROR: /* the page is not there ! 
*/ @@ -517,13 +523,11 @@ apple_protect_pager_data_request( assert(src_page != VM_PAGE_NULL); assert(src_page->busy); - if (!src_page->active && - !src_page->inactive && - !src_page->throttled) { + if (( !VM_PAGE_NON_SPECULATIVE_PAGEABLE(src_page))) { + vm_page_lockspin_queues(); - if (!src_page->active && - !src_page->inactive && - !src_page->throttled) { + + if (( !VM_PAGE_NON_SPECULATIVE_PAGEABLE(src_page))) { vm_page_deactivate(src_page); } vm_page_unlock_queues(); @@ -535,12 +539,12 @@ apple_protect_pager_data_request( */ #if __x86_64__ src_vaddr = (vm_map_offset_t) - PHYSMAP_PTOV((pmap_paddr_t)src_page->phys_page + PHYSMAP_PTOV((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(src_page) << PAGE_SHIFT); #else pmap_enter(kernel_pmap, src_vaddr, - src_page->phys_page, + VM_PAGE_GET_PHYS_PAGE(src_page), VM_PROT_READ, VM_PROT_NONE, 0, @@ -567,11 +571,12 @@ apple_protect_pager_data_request( 0, TRUE); #endif + src_page_object = VM_PAGE_OBJECT(src_page); /* * Validate the original page... */ - if (src_page->object->code_signed) { + if (src_page_object->code_signed) { vm_page_validate_cs_mapped( src_page, (const void *) src_vaddr); @@ -594,8 +599,8 @@ apple_protect_pager_data_request( * to unlock the object here. 
*/ assert(src_page->busy); - assert(src_page->object->paging_in_progress > 0); - vm_object_unlock(src_page->object); + assert(src_page_object->paging_in_progress > 0); + vm_object_unlock(src_page_object); /* * Decrypt the encrypted contents of the source page @@ -633,7 +638,7 @@ apple_protect_pager_data_request( offset_in_page), *(uint64_t *)(dst_vaddr+ offset_in_page+8), - src_page->object->code_signed, + src_page_object->code_signed, src_page->cs_validated, src_page->cs_tainted, src_page->cs_nx); @@ -679,7 +684,7 @@ apple_protect_pager_data_request( (uint64_t) offset_in_page, *(uint64_t *)(dst_vaddr+offset_in_page), *(uint64_t *)(dst_vaddr+offset_in_page+8), - src_page->object->code_signed, + src_page_object->code_signed, src_page->cs_validated, src_page->cs_tainted, src_page->cs_nx, @@ -696,9 +701,10 @@ apple_protect_pager_data_request( retval = KERN_ABORTED; } + assert(VM_PAGE_OBJECT(src_page) == src_page_object); assert(src_page->busy); - assert(src_page->object->paging_in_progress > 0); - vm_object_lock(src_page->object); + assert(src_page_object->paging_in_progress > 0); + vm_object_lock(src_page_object); #if __x86_64__ || __arm__ || __arm64__ /* we used the 1-to-1 mapping of physical memory */ @@ -717,17 +723,38 @@ apple_protect_pager_data_request( /* * Cleanup the result of vm_fault_page() of the source page. 
*/ - PAGE_WAKEUP_DONE(src_page); - vm_object_paging_end(src_page->object); - vm_object_unlock(src_page->object); + if (retval == KERN_SUCCESS && + src_page->busy && + !VM_PAGE_WIRED(src_page) && + !src_page->dirty && + !src_page->precious && + !src_page->laundry && + !src_page->cleaning) { + int refmod_state; + + refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(src_page)); + + if (refmod_state & VM_MEM_MODIFIED) { + SET_PAGE_DIRTY(src_page, FALSE); + } + if (!src_page->dirty) { + vm_page_free_unlocked(src_page, TRUE); + src_page = VM_PAGE_NULL; + } else { + PAGE_WAKEUP_DONE(src_page); + } + } else { + PAGE_WAKEUP_DONE(src_page); + } + src_page = VM_PAGE_NULL; + vm_object_paging_end(src_page_object); + vm_object_unlock(src_page_object); if (top_page != VM_PAGE_NULL) { - vm_object_t top_object; - - top_object = top_page->object; - vm_object_lock(top_object); + assert(VM_PAGE_OBJECT(top_page) == src_top_object); + vm_object_lock(src_top_object); VM_PAGE_FREE(top_page); - vm_object_paging_end(top_object); - vm_object_unlock(top_object); + vm_object_paging_end(src_top_object); + vm_object_unlock(src_top_object); } } @@ -796,8 +823,8 @@ apple_protect_pager_data_request( src_vaddr = 0; dst_vaddr = 0; } - if (src_object != VM_OBJECT_NULL) { - vm_object_deallocate(src_object); + if (src_top_object != VM_OBJECT_NULL) { + vm_object_deallocate(src_top_object); } return retval; diff --git a/osfmk/vm/vm_compressor.c b/osfmk/vm/vm_compressor.c index b4f2df76b..bf333dca4 100644 --- a/osfmk/vm/vm_compressor.c +++ b/osfmk/vm/vm_compressor.c @@ -35,14 +35,14 @@ #include #include #include +#include +#include #include /* for host_info() */ #include +#include #include -#include -#include - #include /* @@ -52,7 +52,6 @@ * the boot-arg & device-tree code. 
*/ - int vm_compressor_mode = VM_PAGER_COMPRESSOR_WITH_SWAP; int vm_scale = 16; @@ -61,9 +60,7 @@ int vm_compressor_is_active = 0; int vm_compression_limit = 0; int vm_compressor_available = 0; -extern boolean_t vm_swap_up; extern void vm_pageout_io_throttle(void); -extern int not_in_kdp; #if CHECKSUM_THE_DATA || CHECKSUM_THE_SWAP || CHECKSUM_THE_COMPRESSED_DATA extern unsigned int hash_string(char *cp, int len); @@ -105,13 +102,13 @@ typedef struct c_slot_mapping *c_slot_mapping_t; union c_segu { c_segment_t c_seg; - uint32_t c_segno; + uintptr_t c_segno; }; -#define C_SLOT_PACK_PTR(ptr) (((uintptr_t)ptr - (uintptr_t) VM_MIN_KERNEL_AND_KEXT_ADDRESS) >> 2) -#define C_SLOT_UNPACK_PTR(cslot) ((uintptr_t)(cslot->c_packed_ptr << 2) + (uintptr_t) VM_MIN_KERNEL_AND_KEXT_ADDRESS) +#define C_SLOT_PACK_PTR(ptr) (((uintptr_t)ptr - (uintptr_t) KERNEL_PMAP_HEAP_RANGE_START) >> 2) +#define C_SLOT_UNPACK_PTR(cslot) ((uintptr_t)(cslot->c_packed_ptr << 2) + (uintptr_t) KERNEL_PMAP_HEAP_RANGE_START) uint32_t c_segment_count = 0; @@ -196,13 +193,7 @@ uint32_t vm_compressor_catchup_threshold_divisor = 10; lck_grp_attr_t vm_compressor_lck_grp_attr; lck_attr_t vm_compressor_lck_attr; lck_grp_t vm_compressor_lck_grp; - -#if __i386__ || __x86_64__ lck_mtx_t *c_list_lock; -#else /* __i386__ || __x86_64__ */ -lck_spin_t *c_list_lock; -#endif /* __i386__ || __x86_64__ */ - lck_rw_t c_master_lock; boolean_t decompressions_blocked = FALSE; @@ -270,7 +261,6 @@ boolean_t c_seg_major_compact_ok(c_segment_t, c_segment_t); int c_seg_minor_compaction_and_unlock(c_segment_t, boolean_t); int c_seg_do_minor_compaction_and_unlock(c_segment_t, boolean_t, boolean_t, boolean_t); void c_seg_try_minor_compaction_and_unlock(c_segment_t c_seg); -void c_seg_need_delayed_compaction(c_segment_t); void c_seg_move_to_sparse_list(c_segment_t); void c_seg_insert_into_q(queue_head_t *, c_segment_t); @@ -278,7 +268,26 @@ void c_seg_insert_into_q(queue_head_t *, c_segment_t); uint64_t vm_available_memory(void); 
uint64_t vm_compressor_pages_compressed(void); -extern unsigned int dp_pages_free, dp_pages_reserve; +/* + * indicate the need to do a major compaction if + * the overall set of in-use compression segments + * becomes sparse... on systems that support pressure + * driven swapping, this will also cause swapouts to + * be initiated. + */ +static inline boolean_t vm_compressor_needs_to_major_compact() +{ + uint32_t incore_seg_count; + + incore_seg_count = c_segment_count - c_swappedout_count - c_swappedout_sparse_count; + + if ((c_segment_count >= (c_segments_nearing_limit / 8)) && + ((incore_seg_count * C_SEG_MAX_PAGES) - VM_PAGE_COMPRESSOR_COUNT) > + ((incore_seg_count / 8) * C_SEG_MAX_PAGES)) + return (1); + return (0); +} + uint64_t vm_available_memory(void) @@ -294,19 +303,6 @@ vm_compressor_pages_compressed(void) } -boolean_t -vm_compression_available(void) -{ - if ( !(COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE)) - return (FALSE); - - if (c_segments_available >= c_segments_limit || c_segment_pages_compressed >= c_segment_pages_compressed_limit) - return (FALSE); - - return (TRUE); -} - - boolean_t vm_compressor_low_on_space(void) { @@ -324,17 +320,10 @@ vm_wants_task_throttled(task_t task) if (task == kernel_task) return (0); - if (COMPRESSED_PAGER_IS_SWAPLESS || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_SWAPLESS) - return (0); - - if (COMPRESSED_PAGER_IS_SWAPBACKED || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_SWAPBACKED) { + if (VM_CONFIG_SWAP_IS_ACTIVE) { if ((vm_compressor_low_on_space() || HARD_THROTTLE_LIMIT_REACHED()) && (unsigned int)pmap_compressed(task->map->pmap) > (c_segment_pages_compressed / 4)) return (1); - } else { - if (((dp_pages_free + dp_pages_reserve < 2000) && VM_DYNAMIC_PAGING_ENABLED(memory_manager_default)) && - get_task_resident_size(task) > (((AVAILABLE_NON_COMPRESSED_MEMORY) * PAGE_SIZE) / 5)) - return (1); } return (0); } @@ -361,7 +350,6 @@ vm_compressor_take_paging_space_action(void) } - void 
vm_compressor_init_locks(void) { @@ -395,7 +383,18 @@ vm_decompressor_unlock(void) thread_wakeup((event_t)&decompressions_blocked); } +static inline void cslot_copy(c_slot_t cdst, c_slot_t csrc) { +#if CHECKSUM_THE_DATA + cdst->c_hash_data = csrc->c_hash_data; +#endif +#if CHECKSUM_THE_COMPRESSED_DATA + cdst->c_hash_compressed_data = csrc->c_hash_compressed_data; +#endif + cdst->c_size = csrc->c_size; + cdst->c_packed_ptr = csrc->c_packed_ptr; +} +vm_map_t compressor_map; void vm_compressor_init(void) @@ -405,6 +404,12 @@ vm_compressor_init(void) c_slot_t cs = &cs_dummy; int c_segment_min_size; int c_segment_padded_size; + kern_return_t retval = KERN_SUCCESS; + vm_offset_t start_addr = 0; + vm_size_t c_segments_arr_size = 0, compressor_submap_size = 0; +#if RECORD_THE_COMPRESSED_DATA + vm_size_t c_compressed_record_sbuf_size = 0; +#endif /* RECORD_THE_COMPRESSED_DATA */ /* * ensure that any pointer that gets created from @@ -442,12 +447,7 @@ vm_compressor_init(void) * use PAGE_REPLACEMENT_ALLOWED to coordinate with the compressor. 
*/ -#if __i386__ || __x86_64__ c_list_lock = lck_mtx_alloc_init(&vm_compressor_lck_grp, &vm_compressor_lck_attr); -#else /* __i386__ || __x86_64__ */ - c_list_lock = lck_spin_alloc_init(&vm_compressor_lck_grp, &vm_compressor_lck_attr); -#endif /* __i386__ || __x86_64__ */ - queue_init(&c_bad_list_head); queue_init(&c_age_list_head); @@ -497,13 +497,36 @@ vm_compressor_init(void) c_segments_busy = FALSE; - if (kernel_memory_allocate(kernel_map, (vm_offset_t *)(&c_segments), (sizeof(union c_segu) * c_segments_limit), 0, KMA_KOBJECT | KMA_VAONLY | KMA_PERMANENT, VM_KERN_MEMORY_COMPRESSOR) != KERN_SUCCESS) + /* + * Submap needs space for: + * - c_segments + * - c_buffers + * - swap reclaimations -- C_SEG_BUFSIZE + */ + c_segments_arr_size = vm_map_round_page((sizeof(union c_segu) * c_segments_limit),VM_MAP_PAGE_MASK(kernel_map)); + c_buffers_size = vm_map_round_page(((vm_size_t)C_SEG_ALLOCSIZE * (vm_size_t)c_segments_limit), VM_MAP_PAGE_MASK(kernel_map)); + + compressor_submap_size = c_segments_arr_size + c_buffers_size + C_SEG_BUFSIZE; + +#if RECORD_THE_COMPRESSED_DATA + c_compressed_record_sbuf_size = (vm_size_t)C_SEG_ALLOCSIZE + (PAGE_SIZE * 2); + compressor_submap_size += c_compressed_record_sbuf_size; +#endif /* RECORD_THE_COMPRESSED_DATA */ + + retval = kmem_suballoc(kernel_map, &start_addr, compressor_submap_size, + FALSE, VM_FLAGS_ANYWHERE | VM_FLAGS_PERMANENT | VM_MAKE_TAG(0), + &compressor_map); + + if (retval != KERN_SUCCESS) + panic("vm_compressor_init: kmem_suballoc failed"); + + if (kernel_memory_allocate(compressor_map, (vm_offset_t *)(&c_segments), (sizeof(union c_segu) * c_segments_limit), 0, KMA_KOBJECT | KMA_VAONLY | KMA_PERMANENT, VM_KERN_MEMORY_COMPRESSOR) != KERN_SUCCESS) panic("vm_compressor_init: kernel_memory_allocate failed - c_segments\n"); - c_buffers_size = (vm_size_t)C_SEG_ALLOCSIZE * (vm_size_t)c_segments_limit; - if (kernel_memory_allocate(kernel_map, &c_buffers, c_buffers_size, 0, KMA_COMPRESSOR | KMA_VAONLY | KMA_PERMANENT, 
VM_KERN_MEMORY_COMPRESSOR) != KERN_SUCCESS) + if (kernel_memory_allocate(compressor_map, &c_buffers, c_buffers_size, 0, KMA_COMPRESSOR | KMA_VAONLY | KMA_PERMANENT, VM_KERN_MEMORY_COMPRESSOR) != KERN_SUCCESS) panic("vm_compressor_init: kernel_memory_allocate failed - c_buffers\n"); c_segments_next_page = (caddr_t)c_segments; + vm_compressor_algorithm_init(); { host_basic_info_data_t hinfo; @@ -513,24 +536,23 @@ vm_compressor_init(void) host_info((host_t)BSD_HOST, HOST_BASIC_INFO, (host_info_t)&hinfo, &count); compressor_cpus = hinfo.max_cpus; + compressor_scratch_bufs = kalloc_tag(compressor_cpus * vm_compressor_get_decode_scratch_size(), VM_KERN_MEMORY_COMPRESSOR); - compressor_scratch_bufs = kalloc_tag(compressor_cpus * WKdm_SCRATCH_BUF_SIZE, VM_KERN_MEMORY_COMPRESSOR); - - kdp_compressor_scratch_buf = kalloc_tag(WKdm_SCRATCH_BUF_SIZE, VM_KERN_MEMORY_COMPRESSOR); + kdp_compressor_scratch_buf = kalloc_tag(vm_compressor_get_decode_scratch_size(), VM_KERN_MEMORY_COMPRESSOR); kdp_compressor_decompressed_page = kalloc_tag(PAGE_SIZE, VM_KERN_MEMORY_COMPRESSOR); kdp_compressor_decompressed_page_paddr = kvtophys((vm_offset_t)kdp_compressor_decompressed_page); kdp_compressor_decompressed_page_ppnum = (ppnum_t) atop(kdp_compressor_decompressed_page_paddr); } -#if CONFIG_FREEZE - freezer_compressor_scratch_buf = kalloc_tag(WKdm_SCRATCH_BUF_SIZE, VM_KERN_MEMORY_COMPRESSOR); +#if CONFIG_FREEZE + freezer_compressor_scratch_buf = kalloc_tag(vm_compressor_get_encode_scratch_size(), VM_KERN_MEMORY_COMPRESSOR); #endif #if RECORD_THE_COMPRESSED_DATA - if (kernel_memory_allocate(kernel_map, (vm_offset_t *)&c_compressed_record_sbuf, (vm_size_t)C_SEG_ALLOCSIZE + (PAGE_SIZE * 2), 0, KMA_KOBJECT, VM_KERN_MEMORY_COMPRESSOR) != KERN_SUCCESS) + if (kernel_memory_allocate(compressor_map, (vm_offset_t *)&c_compressed_record_sbuf, c_compressed_record_sbuf_size, 0, KMA_KOBJECT, VM_KERN_MEMORY_COMPRESSOR) != KERN_SUCCESS) panic("vm_compressor_init: kernel_memory_allocate failed - 
c_compressed_record_sbuf\n"); c_compressed_record_cptr = c_compressed_record_sbuf; - c_compressed_record_ebuf = c_compressed_record_sbuf + C_SEG_ALLOCSIZE + (PAGE_SIZE * 2); + c_compressed_record_ebuf = c_compressed_record_sbuf + c_compressed_record_sbuf_size; #endif if (kernel_thread_start_priority((thread_continue_t)vm_compressor_swap_trigger_thread, NULL, @@ -539,22 +561,19 @@ vm_compressor_init(void) } thread_deallocate(thread); - assert(default_pager_init_flag == 0); - if (vm_pageout_internal_start() != KERN_SUCCESS) { panic("vm_compressor_init: Failed to start the internal pageout thread.\n"); } - if (COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) + if (VM_CONFIG_SWAP_IS_PRESENT) vm_compressor_swap_init(); - if (COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_SWAPBACKED) + if (VM_CONFIG_COMPRESSOR_IS_ACTIVE) vm_compressor_is_active = 1; #if CONFIG_FREEZE memorystatus_freeze_enabled = TRUE; #endif /* CONFIG_FREEZE */ - default_pager_init_flag = 1; vm_compressor_available = 1; vm_page_reactivate_all_throttled(); @@ -620,18 +639,20 @@ c_seg_validate(c_segment_t c_seg, boolean_t must_be_compact) void -c_seg_need_delayed_compaction(c_segment_t c_seg) +c_seg_need_delayed_compaction(c_segment_t c_seg, boolean_t c_list_lock_held) { boolean_t clear_busy = FALSE; - if ( !lck_mtx_try_lock_spin_always(c_list_lock)) { - C_SEG_BUSY(c_seg); + if (c_list_lock_held == FALSE) { + if ( !lck_mtx_try_lock_spin_always(c_list_lock)) { + C_SEG_BUSY(c_seg); - lck_mtx_unlock_always(&c_seg->c_lock); - lck_mtx_lock_spin_always(c_list_lock); - lck_mtx_lock_spin_always(&c_seg->c_lock); + lck_mtx_unlock_always(&c_seg->c_lock); + lck_mtx_lock_spin_always(c_list_lock); + lck_mtx_lock_spin_always(&c_seg->c_lock); - clear_busy = TRUE; + clear_busy = TRUE; + } } assert(c_seg->c_state != C_IS_FILLING); @@ -640,7 +661,8 @@ c_seg_need_delayed_compaction(c_segment_t c_seg) c_seg->c_on_minorcompact_q = 1; c_minor_count++; } - 
lck_mtx_unlock_always(c_list_lock); + if (c_list_lock_held == FALSE) + lck_mtx_unlock_always(c_list_lock); if (clear_busy == TRUE) C_SEG_WAKEUP_DONE(c_seg); @@ -801,12 +823,10 @@ c_seg_switch_state(c_segment_t c_seg, int new_state, boolean_t insert_head) { int old_state = c_seg->c_state; -#if DEVELOPMENT || DEBUG #if __i386__ || __x86_64__ if (new_state != C_IS_FILLING) - lck_mtx_assert(&c_seg->c_lock, LCK_MTX_ASSERT_OWNED); - lck_mtx_assert(c_list_lock, LCK_MTX_ASSERT_OWNED); -#endif + LCK_MTX_ASSERT(&c_seg->c_lock, LCK_MTX_ASSERT_OWNED); + LCK_MTX_ASSERT(c_list_lock, LCK_MTX_ASSERT_OWNED); #endif switch (old_state) { @@ -848,7 +868,8 @@ c_seg_switch_state(c_segment_t c_seg, int new_state, boolean_t insert_head) break; case C_ON_SWAPPEDOUT_Q: - assert(new_state == C_ON_SWAPPEDIN_Q || new_state == C_ON_SWAPPEDOUTSPARSE_Q || + assert(new_state == C_ON_SWAPPEDIN_Q || new_state == C_ON_AGE_Q || + new_state == C_ON_SWAPPEDOUTSPARSE_Q || new_state == C_ON_BAD_Q || new_state == C_IS_EMPTY || new_state == C_IS_FREE); queue_remove(&c_swappedout_list_head, c_seg, c_segment_t, c_age_list); @@ -856,7 +877,7 @@ c_seg_switch_state(c_segment_t c_seg, int new_state, boolean_t insert_head) break; case C_ON_SWAPPEDOUTSPARSE_Q: - assert(new_state == C_ON_SWAPPEDIN_Q || + assert(new_state == C_ON_SWAPPEDIN_Q || new_state == C_ON_AGE_Q || new_state == C_ON_BAD_Q || new_state == C_IS_EMPTY || new_state == C_IS_FREE); queue_remove(&c_swappedout_sparse_list_head, c_seg, c_segment_t, c_age_list); @@ -901,13 +922,20 @@ c_seg_switch_state(c_segment_t c_seg, int new_state, boolean_t insert_head) break; case C_ON_AGE_Q: - assert(old_state == C_IS_FILLING || old_state == C_ON_SWAPPEDIN_Q || - old_state == C_ON_MAJORCOMPACT_Q || old_state == C_ON_SWAPOUT_Q); + assert(old_state == C_IS_FILLING || old_state == C_ON_SWAPPEDIN_Q || old_state == C_ON_SWAPOUT_Q || + old_state == C_ON_MAJORCOMPACT_Q || old_state == C_ON_SWAPPEDOUT_Q || old_state == C_ON_SWAPPEDOUTSPARSE_Q); if (old_state == 
C_IS_FILLING) queue_enter(&c_age_list_head, c_seg, c_segment_t, c_age_list); - else - c_seg_insert_into_q(&c_age_list_head, c_seg); + else { + if (!queue_empty(&c_age_list_head)) { + c_segment_t c_first; + + c_first = (c_segment_t)queue_first(&c_age_list_head); + c_seg->c_creation_ts = c_first->c_creation_ts; + } + queue_enter_first(&c_age_list_head, c_seg, c_segment_t, c_age_list); + } c_age_count++; break; @@ -944,7 +972,11 @@ c_seg_switch_state(c_segment_t c_seg, int new_state, boolean_t insert_head) case C_ON_SWAPPEDOUTSPARSE_Q: assert(c_seg->c_state == C_ON_SWAPOUT_Q || c_seg->c_state == C_ON_SWAPPEDOUT_Q); - c_seg_insert_into_q(&c_swappedout_sparse_list_head, c_seg); + if (insert_head == TRUE) + queue_enter_first(&c_swappedout_sparse_list_head, c_seg, c_segment_t, c_age_list); + else + queue_enter(&c_swappedout_sparse_list_head, c_seg, c_segment_t, c_age_list); + c_swappedout_sparse_count++; break; @@ -1024,7 +1056,7 @@ c_seg_free_locked(c_segment_t c_seg) if (c_buffer) { if (pages_populated) - kernel_memory_depopulate(kernel_map, (vm_offset_t) c_buffer, pages_populated * PAGE_SIZE, KMA_COMPRESSOR); + kernel_memory_depopulate(compressor_map, (vm_offset_t) c_buffer, pages_populated * PAGE_SIZE, KMA_COMPRESSOR); } else if (c_swap_handle) { /* @@ -1053,11 +1085,7 @@ c_seg_free_locked(c_segment_t c_seg) lck_mtx_unlock_always(c_list_lock); -#if __i386__ || __x86_64__ lck_mtx_destroy(&c_seg->c_lock, &vm_compressor_lck_grp); -#else /* __i386__ || __x86_64__ */ - lck_spin_destroy(&c_seg->c_lock, &vm_compressor_lck_grp); -#endif /* __i386__ || __x86_64__ */ if (c_seg->c_slot_var_array_len) kfree(c_seg->c_slot_var_array, sizeof(struct c_slot) * c_seg->c_slot_var_array_len); @@ -1123,7 +1151,6 @@ c_seg_minor_compaction_and_unlock(c_segment_t c_seg, boolean_t clear_busy) int i; c_slot_t c_dst; c_slot_t c_src; - boolean_t need_unlock = TRUE; assert(c_seg->c_busy); @@ -1134,9 +1161,15 @@ c_seg_minor_compaction_and_unlock(c_segment_t c_seg, boolean_t clear_busy) 
c_seg_free(c_seg); return (1); } + lck_mtx_unlock_always(&c_seg->c_lock); + if (c_seg->c_firstemptyslot >= c_seg->c_nextslot || C_SEG_UNUSED_BYTES(c_seg) < PAGE_SIZE) goto done; +#if DEVELOPMENT || DEBUG + C_SEG_MAKE_WRITEABLE(c_seg); +#endif + #if VALIDATE_C_SEGMENTS c_seg->c_was_minor_compacted++; #endif @@ -1155,23 +1188,16 @@ c_seg_minor_compaction_and_unlock(c_segment_t c_seg, boolean_t clear_busy) if (c_size == 0) continue; - memcpy(&c_seg->c_store.c_buffer[c_offset], &c_seg->c_store.c_buffer[c_src->c_offset], c_size); + c_rounded_size = (c_size + C_SEG_OFFSET_ALIGNMENT_MASK) & ~C_SEG_OFFSET_ALIGNMENT_MASK; -#if CHECKSUM_THE_DATA - c_dst->c_hash_data = c_src->c_hash_data; -#endif -#if CHECKSUM_THE_COMPRESSED_DATA - c_dst->c_hash_compressed_data = c_src->c_hash_compressed_data; -#endif - c_dst->c_size = c_src->c_size; - c_dst->c_packed_ptr = c_src->c_packed_ptr; + memcpy(&c_seg->c_store.c_buffer[c_offset], &c_seg->c_store.c_buffer[c_src->c_offset], c_rounded_size); + + cslot_copy(c_dst, c_src); c_dst->c_offset = c_offset; slot_ptr = (c_slot_mapping_t)C_SLOT_UNPACK_PTR(c_dst); slot_ptr->s_cindx = c_indx; - c_rounded_size = (c_size + C_SEG_OFFSET_ALIGNMENT_MASK) & ~C_SEG_OFFSET_ALIGNMENT_MASK; - c_offset += C_SEG_BYTES_TO_OFFSET(c_rounded_size); PACK_C_SIZE(c_src, 0); c_indx++; @@ -1187,7 +1213,6 @@ c_seg_minor_compaction_and_unlock(c_segment_t c_seg, boolean_t clear_busy) #if VALIDATE_C_SEGMENTS c_seg_validate(c_seg, TRUE); #endif - if (old_populated_offset > c_seg->c_populated_offset) { uint32_t gc_size; int32_t *gc_ptr; @@ -1195,20 +1220,17 @@ c_seg_minor_compaction_and_unlock(c_segment_t c_seg, boolean_t clear_busy) gc_size = C_SEG_OFFSET_TO_BYTES(old_populated_offset - c_seg->c_populated_offset); gc_ptr = &c_seg->c_store.c_buffer[c_seg->c_populated_offset]; - lck_mtx_unlock_always(&c_seg->c_lock); + kernel_memory_depopulate(compressor_map, (vm_offset_t)gc_ptr, gc_size, KMA_COMPRESSOR); + } - kernel_memory_depopulate(kernel_map, (vm_offset_t)gc_ptr, gc_size, 
KMA_COMPRESSOR); +#if DEVELOPMENT || DEBUG + C_SEG_WRITE_PROTECT(c_seg); +#endif - if (clear_busy == TRUE) - lck_mtx_lock_spin_always(&c_seg->c_lock); - else - need_unlock = FALSE; - } done: - if (need_unlock == TRUE) { - if (clear_busy == TRUE) - C_SEG_WAKEUP_DONE(c_seg); - + if (clear_busy == TRUE) { + lck_mtx_lock_spin_always(&c_seg->c_lock); + C_SEG_WAKEUP_DONE(c_seg); lck_mtx_unlock_always(&c_seg->c_lock); } return (0); @@ -1313,6 +1335,9 @@ c_seg_major_compact( * from c_seg_src to c_seg_dst and update both c_segment's * state w/o holding the master lock */ +#if DEVELOPMENT || DEBUG + C_SEG_MAKE_WRITEABLE(c_seg_dst); +#endif #if VALIDATE_C_SEGMENTS c_seg_dst->c_was_major_compacted++; @@ -1347,7 +1372,7 @@ c_seg_major_compact( if (size_to_populate > C_SEG_MAX_POPULATE_SIZE) size_to_populate = C_SEG_MAX_POPULATE_SIZE; - kernel_memory_populate(kernel_map, + kernel_memory_populate(compressor_map, (vm_offset_t) &c_seg_dst->c_store.c_buffer[c_seg_dst->c_populated_offset], size_to_populate, KMA_COMPRESSOR, @@ -1367,14 +1392,7 @@ c_seg_major_compact( c_seg_major_compact_stats.moved_slots++; c_seg_major_compact_stats.moved_bytes += c_size; -#if CHECKSUM_THE_DATA - c_dst->c_hash_data = c_src->c_hash_data; -#endif -#if CHECKSUM_THE_COMPRESSED_DATA - c_dst->c_hash_compressed_data = c_src->c_hash_compressed_data; -#endif - c_dst->c_size = c_src->c_size; - c_dst->c_packed_ptr = c_src->c_packed_ptr; + cslot_copy(c_dst, c_src); c_dst->c_offset = c_seg_dst->c_nextoffset; if (c_seg_dst->c_firstemptyslot == c_seg_dst->c_nextslot) @@ -1395,6 +1413,9 @@ c_seg_major_compact( break; } } +#if DEVELOPMENT || DEBUG + C_SEG_WRITE_PROTECT(c_seg_dst); +#endif if (dst_slot < c_seg_dst->c_nextslot) { PAGE_REPLACEMENT_ALLOWED(TRUE); @@ -1566,9 +1587,9 @@ compute_swapout_target_age(void) } -int compaction_swapper_inited = 0; int compaction_swapper_init_now = 0; int compaction_swapper_running = 0; +int compaction_swapper_awakened = 0; int compaction_swapper_abort = 0; @@ -1606,7 +1627,7 @@ 
compressor_needs_to_swap(void) if (age >= vm_ripe_target_age) return (TRUE); } - if ((vm_compressor_mode == VM_PAGER_COMPRESSOR_WITH_SWAP) && vm_swap_up == TRUE) { + if (VM_CONFIG_SWAP_IS_ACTIVE) { if (COMPRESSOR_NEEDS_TO_SWAP()) { return (TRUE); } @@ -1668,7 +1689,7 @@ compressor_needs_to_swap(void) if (should_swap == FALSE) { /* - * COMPRESSOR_NEEDS_TO_MAJOR_COMPACT returns true only if we're + * vm_compressor_needs_to_major_compact returns true only if we're * about to run out of available compressor segments... in this * case, we absolutely need to run a major compaction even if * we've just kicked off a jetsam or we don't otherwise need to @@ -1676,7 +1697,7 @@ compressor_needs_to_swap(void) * pages back to the uncompressed cache, but does not guarantee * that we will free up even a single compression segment */ - should_swap = COMPRESSOR_NEEDS_TO_MAJOR_COMPACT(); + should_swap = vm_compressor_needs_to_major_compact(); } /* @@ -1717,26 +1738,76 @@ vm_thrashing_jetsam_done(void) #endif /* CONFIG_JETSAM */ uint32_t vm_wake_compactor_swapper_calls = 0; +uint32_t vm_run_compactor_already_running = 0; +uint32_t vm_run_compactor_empty_minor_q = 0; +uint32_t vm_run_compactor_did_compact = 0; +uint32_t vm_run_compactor_waited = 0; + +void +vm_run_compactor(void) +{ + if (c_segment_count == 0) + return; + + lck_mtx_lock_spin_always(c_list_lock); + + if (c_minor_count == 0) { + vm_run_compactor_empty_minor_q++; + + lck_mtx_unlock_always(c_list_lock); + return; + } + if (compaction_swapper_running) { + + if (vm_restricted_to_single_processor == FALSE) { + vm_run_compactor_already_running++; + + lck_mtx_unlock_always(c_list_lock); + return; + } + vm_run_compactor_waited++; + + assert_wait((event_t)&compaction_swapper_running, THREAD_UNINT); + + lck_mtx_unlock_always(c_list_lock); + + thread_block(THREAD_CONTINUE_NULL); + + return; + } + vm_run_compactor_did_compact++; + + fastwake_warmup = FALSE; + compaction_swapper_running = 1; + + 
vm_compressor_do_delayed_compactions(FALSE); + + compaction_swapper_running = 0; + + lck_mtx_unlock_always(c_list_lock); + + thread_wakeup((event_t)&compaction_swapper_running); +} + void vm_wake_compactor_swapper(void) { - if (compaction_swapper_running || c_segment_count == 0) + if (compaction_swapper_running || compaction_swapper_awakened || c_segment_count == 0) return; - if (c_minor_count || COMPRESSOR_NEEDS_TO_MAJOR_COMPACT()) { + if (c_minor_count || vm_compressor_needs_to_major_compact()) { lck_mtx_lock_spin_always(c_list_lock); fastwake_warmup = FALSE; - if (compaction_swapper_running == 0) { + if (compaction_swapper_running == 0 && compaction_swapper_awakened == 0) { vm_wake_compactor_swapper_calls++; + compaction_swapper_awakened = 1; thread_wakeup((event_t)&c_compressor_swap_trigger); - - compaction_swapper_running = 1; } lck_mtx_unlock_always(c_list_lock); } @@ -1750,6 +1821,7 @@ vm_consider_swapping() clock_sec_t now; clock_nsec_t nsec; + assert(VM_CONFIG_SWAP_IS_PRESENT); lck_mtx_lock_spin_always(c_list_lock); @@ -1800,6 +1872,8 @@ vm_consider_swapping() vm_swapout_ripe_segments = FALSE; lck_mtx_unlock_always(c_list_lock); + + thread_wakeup((event_t)&compaction_swapper_running); } @@ -1808,10 +1882,10 @@ vm_consider_waking_compactor_swapper(void) { boolean_t need_wakeup = FALSE; - if (compaction_swapper_running) + if (c_segment_count == 0) return; - if (c_segment_count == 0) + if (compaction_swapper_running || compaction_swapper_awakened) return; if (!compaction_swapper_inited && !compaction_swapper_init_now) { @@ -1841,12 +1915,11 @@ vm_consider_waking_compactor_swapper(void) fastwake_warmup = FALSE; - if (compaction_swapper_running == 0) { + if (compaction_swapper_running == 0 && compaction_swapper_awakened == 0) { memoryshot(VM_WAKEUP_COMPACTOR_SWAPPER, DBG_FUNC_NONE); + compaction_swapper_awakened = 1; thread_wakeup((event_t)&c_compressor_swap_trigger); - - compaction_swapper_running = 1; } lck_mtx_unlock_always(c_list_lock); } @@ -1864,7 +1937,7 
@@ vm_compressor_do_delayed_compactions(boolean_t flush_all) boolean_t needs_to_swap = FALSE; - lck_mtx_assert(c_list_lock, LCK_MTX_ASSERT_OWNED); + LCK_MTX_ASSERT(c_list_lock, LCK_MTX_ASSERT_OWNED); while (!queue_empty(&c_minor_list_head) && needs_to_swap == FALSE) { @@ -1884,7 +1957,7 @@ vm_compressor_do_delayed_compactions(boolean_t flush_all) c_seg_do_minor_compaction_and_unlock(c_seg, TRUE, FALSE, TRUE); - if (vm_swap_up == TRUE && (number_compacted++ > DELAYED_COMPACTIONS_PER_PASS)) { + if (VM_CONFIG_SWAP_IS_ACTIVE && (number_compacted++ > DELAYED_COMPACTIONS_PER_PASS)) { if ((flush_all == TRUE || compressor_needs_to_swap() == TRUE) && c_swapout_count < C_SWAPOUT_LIMIT) needs_to_swap = TRUE; @@ -1923,6 +1996,14 @@ vm_compressor_age_swapped_in_segments(boolean_t flush_all) } +extern int vm_num_swap_files; +extern int vm_num_pinned_swap_files; +extern int vm_swappin_enabled; + +extern unsigned int vm_swapfile_total_segs_used; +extern unsigned int vm_swapfile_total_segs_alloced; + + void vm_compressor_flush(void) { @@ -1987,18 +2068,19 @@ vm_compressor_flush(void) lck_mtx_unlock_always(c_list_lock); + thread_wakeup((event_t)&compaction_swapper_running); + clock_get_uptime(&endTime); SUB_ABSOLUTETIME(&endTime, &startTime); absolutetime_to_nanoseconds(endTime, &nsec); - HIBLOG("vm_compressor_flush completed - took %qd msecs\n", nsec / 1000000ULL); + HIBLOG("vm_compressor_flush completed - took %qd msecs - vm_num_swap_files = %d, vm_num_pinned_swap_files = %d, vm_swappin_enabled = %d\n", + nsec / 1000000ULL, vm_num_swap_files, vm_num_pinned_swap_files, vm_swappin_enabled); } -extern void vm_swap_file_set_tuneables(void); int compaction_swap_trigger_thread_awakened = 0; - static void vm_compressor_swap_trigger_thread(void) { @@ -2014,25 +2096,31 @@ vm_compressor_swap_trigger_thread(void) * be operating on the correct directory (in case the default * of /var/vm/ is overridden by the dymanic_pager */ - if (compaction_swapper_init_now && !compaction_swapper_inited) { - 
if (vm_compressor_mode == VM_PAGER_COMPRESSOR_WITH_SWAP) - vm_swap_file_set_tuneables(); + if (compaction_swapper_init_now) { + vm_compaction_swapper_do_init(); if (vm_restricted_to_single_processor == TRUE) thread_vm_bind_group_add(); - compaction_swapper_inited = 1; + compaction_swapper_init_now = 0; } lck_mtx_lock_spin_always(c_list_lock); compaction_swap_trigger_thread_awakened++; + compaction_swapper_awakened = 0; - vm_compressor_compact_and_swap(FALSE); + if (compaction_swapper_running == 0) { + compaction_swapper_running = 1; + + vm_compressor_compact_and_swap(FALSE); + + compaction_swapper_running = 0; + } assert_wait((event_t)&c_compressor_swap_trigger, THREAD_UNINT); - compaction_swapper_running = 0; - thread_wakeup((event_t)&compaction_swapper_running); + if (compaction_swapper_running == 0) + thread_wakeup((event_t)&compaction_swapper_running); lck_mtx_unlock_always(c_list_lock); @@ -2089,7 +2177,7 @@ vm_compressor_record_warmup_end(void) } -#define DELAY_TRIM_ON_WAKE_SECS 4 +#define DELAY_TRIM_ON_WAKE_SECS 25 void vm_compressor_delay_trim(void) @@ -2114,10 +2202,11 @@ vm_compressor_do_warmup(void) return; } - if (compaction_swapper_running == 0) { + if (compaction_swapper_running == 0 && compaction_swapper_awakened == 0) { fastwake_warmup = TRUE; - compaction_swapper_running = 1; + + compaction_swapper_awakened = 1; thread_wakeup((event_t)&c_compressor_swap_trigger); } lck_mtx_unlock_always(c_list_lock); @@ -2127,7 +2216,6 @@ vm_compressor_do_warmup(void) void do_fastwake_warmup(void) { - uint64_t my_thread_id; c_segment_t c_seg = NULL; AbsoluteTime startTime, endTime; uint64_t nsec; @@ -2139,9 +2227,8 @@ do_fastwake_warmup(void) lck_mtx_unlock_always(c_list_lock); - my_thread_id = current_thread()->thread_id; - proc_set_task_policy_thread(kernel_task, my_thread_id, - TASK_POLICY_INTERNAL, TASK_POLICY_IO, THROTTLE_LEVEL_COMPRESSOR_TIER2); + proc_set_thread_policy(current_thread(), + TASK_POLICY_INTERNAL, TASK_POLICY_IO, 
THROTTLE_LEVEL_COMPRESSOR_TIER2); PAGE_REPLACEMENT_DISALLOWED(TRUE); @@ -2166,9 +2253,8 @@ do_fastwake_warmup(void) c_seg_wait_on_busy(c_seg); PAGE_REPLACEMENT_DISALLOWED(TRUE); } else { - c_seg_swapin(c_seg, TRUE); - - lck_mtx_unlock_always(&c_seg->c_lock); + if (c_seg_swapin(c_seg, TRUE, FALSE) == 0) + lck_mtx_unlock_always(&c_seg->c_lock); c_segment_warmup_count++; PAGE_REPLACEMENT_DISALLOWED(FALSE); @@ -2181,8 +2267,8 @@ do_fastwake_warmup(void) PAGE_REPLACEMENT_DISALLOWED(FALSE); - proc_set_task_policy_thread(kernel_task, my_thread_id, - TASK_POLICY_INTERNAL, TASK_POLICY_IO, THROTTLE_LEVEL_COMPRESSOR_TIER0); + proc_set_thread_policy(current_thread(), + TASK_POLICY_INTERNAL, TASK_POLICY_IO, THROTTLE_LEVEL_COMPRESSOR_TIER0); clock_get_uptime(&endTime); SUB_ABSOLUTETIME(&endTime, &startTime); @@ -2250,6 +2336,15 @@ vm_compressor_compact_and_swap(boolean_t flush_all) HIBLOG("vm_compressor_flush - out of swap space\n"); break; } + if (vm_swap_files_pinned() == FALSE) { + HIBLOG("vm_compressor_flush - unpinned swap files\n"); + break; + } + if (hibernate_in_progress_with_pinned_swap == TRUE && + (vm_swapfile_total_segs_alloced == vm_swapfile_total_segs_used)) { + HIBLOG("vm_compressor_flush - out of pinned swap space\n"); + break; + } clock_get_system_nanotime(&sec, &nsec); if (sec > hibernate_flushing_deadline) { @@ -2410,27 +2505,25 @@ vm_compressor_compact_and_swap(boolean_t flush_all) assert(c_seg->c_busy); assert(!c_seg->c_on_minorcompact_q); - if (vm_swap_up == TRUE) { + if (VM_CONFIG_SWAP_IS_ACTIVE) { /* * This mode of putting a generic c_seg on the swapout list is - * only supported when we have general swap ON i.e. - * we compress pages into c_segs as we process them off - * the paging queues in vm_pageout_scan(). 
+ * only supported when we have general swapping enabled */ - if (COMPRESSED_PAGER_IS_SWAPBACKED) - c_seg_switch_state(c_seg, C_ON_SWAPOUT_Q, FALSE); - else { - if ((vm_swapout_ripe_segments == TRUE && c_overage_swapped_count < c_overage_swapped_limit)) { - /* - * we are running compressor sweeps with swap-behind - * make sure the c_seg has aged enough before swapping it - * out... - */ - if ((now - c_seg->c_creation_ts) >= vm_ripe_target_age) { - c_seg->c_overage_swap = TRUE; - c_overage_swapped_count++; - c_seg_switch_state(c_seg, C_ON_SWAPOUT_Q, FALSE); - } + c_seg_switch_state(c_seg, C_ON_SWAPOUT_Q, FALSE); + } else { + if ((vm_swapout_ripe_segments == TRUE && c_overage_swapped_count < c_overage_swapped_limit)) { + + assert(VM_CONFIG_SWAP_IS_PRESENT); + /* + * we are running compressor sweeps with swap-behind + * make sure the c_seg has aged enough before swapping it + * out... + */ + if ((now - c_seg->c_creation_ts) >= vm_ripe_target_age) { + c_seg->c_overage_swap = TRUE; + c_overage_swapped_count++; + c_seg_switch_state(c_seg, C_ON_SWAPOUT_Q, FALSE); } } } @@ -2496,7 +2589,7 @@ c_seg_allocate(c_segment_t *current_chead) c_segments_busy = TRUE; lck_mtx_unlock_always(c_list_lock); - kernel_memory_populate(kernel_map, (vm_offset_t)c_segments_next_page, + kernel_memory_populate(compressor_map, (vm_offset_t)c_segments_next_page, PAGE_SIZE, KMA_KOBJECT, VM_KERN_MEMORY_COMPRESSOR); c_segments_next_page += PAGE_SIZE; @@ -2520,7 +2613,7 @@ c_seg_allocate(c_segment_t *current_chead) c_segno = c_free_segno_head; assert(c_segno >= 0 && c_segno < c_segments_limit); - c_free_segno_head = c_segments[c_segno].c_segno; + c_free_segno_head = (uint32_t)c_segments[c_segno].c_segno; /* * do the rest of the bookkeeping now while we're still behind @@ -2538,11 +2631,7 @@ c_seg_allocate(c_segment_t *current_chead) c_seg->c_store.c_buffer = (int32_t *)C_SEG_BUFFER_ADDRESS(c_segno); -#if __i386__ || __x86_64__ lck_mtx_init(&c_seg->c_lock, &vm_compressor_lck_grp, 
&vm_compressor_lck_attr); -#else /* __i386__ || __x86_64__ */ - lck_spin_init(&c_seg->c_lock, &vm_compressor_lck_grp, &vm_compressor_lck_attr); -#endif /* __i386__ || __x86_64__ */ c_seg->c_state = C_IS_EMPTY; c_seg->c_firstemptyslot = C_SLOT_MAX_INDEX; @@ -2552,9 +2641,15 @@ c_seg_allocate(c_segment_t *current_chead) c_empty_count++; c_seg_switch_state(c_seg, C_IS_FILLING, FALSE); c_segments[c_segno].c_seg = c_seg; + assert(c_segments[c_segno].c_segno > c_segments_available); lck_mtx_unlock_always(c_list_lock); *current_chead = c_seg; + +#if DEVELOPMENT || DEBUG + C_SEG_MAKE_WRITEABLE(c_seg); +#endif + } c_seg_alloc_nextslot(c_seg); @@ -2569,7 +2664,7 @@ c_seg_allocate(c_segment_t *current_chead) if (size_to_populate > C_SEG_MAX_POPULATE_SIZE) size_to_populate = C_SEG_MAX_POPULATE_SIZE; - kernel_memory_populate(kernel_map, + kernel_memory_populate(compressor_map, (vm_offset_t) &c_seg->c_store.c_buffer[c_seg->c_populated_offset], size_to_populate, KMA_COMPRESSOR, @@ -2609,7 +2704,7 @@ c_current_seg_filled(c_segment_t c_seg, c_segment_t *current_chead) lck_mtx_unlock_always(&c_seg->c_lock); kernel_memory_depopulate( - kernel_map, + compressor_map, (vm_offset_t) &c_seg->c_store.c_buffer[offset_to_depopulate], unused_bytes, KMA_COMPRESSOR); @@ -2620,8 +2715,30 @@ c_current_seg_filled(c_segment_t c_seg, c_segment_t *current_chead) } assert(C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset) <= C_SEG_BUFSIZE); +#if DEVELOPMENT || DEBUG + { + boolean_t c_seg_was_busy = FALSE; + + if ( !c_seg->c_busy) + C_SEG_BUSY(c_seg); + else + c_seg_was_busy = TRUE; + + lck_mtx_unlock_always(&c_seg->c_lock); + + C_SEG_WRITE_PROTECT(c_seg); + + lck_mtx_lock_spin_always(&c_seg->c_lock); + + if (c_seg_was_busy == FALSE) + C_SEG_WAKEUP_DONE(c_seg); + } +#endif + #if CONFIG_FREEZE - if (current_chead == (c_segment_t*)&freezer_chead && DEFAULT_FREEZER_COMPRESSED_PAGER_IS_SWAPBACKED && + if (current_chead == (c_segment_t*)&freezer_chead && + VM_CONFIG_SWAP_IS_PRESENT && + 
VM_CONFIG_FREEZER_SWAP_IS_ACTIVE && c_freezer_swapout_count < VM_MAX_FREEZER_CSEG_SWAP_COUNT) { new_state = C_ON_SWAPOUT_Q; } @@ -2640,6 +2757,9 @@ c_current_seg_filled(c_segment_t c_seg, c_segment_t *current_chead) c_seg->c_generation_id = c_generation_id++; c_seg_switch_state(c_seg, new_state, FALSE); + if (c_seg->c_state == C_ON_AGE_Q && C_SEG_UNUSED_BYTES(c_seg) >= PAGE_SIZE) + c_seg_need_delayed_compaction(c_seg, TRUE); + lck_mtx_unlock_always(c_list_lock); #if CONFIG_FREEZE @@ -2647,17 +2767,15 @@ c_current_seg_filled(c_segment_t c_seg, c_segment_t *current_chead) thread_wakeup((event_t)&c_swapout_list_head); #endif /* CONFIG_FREEZE */ - if (c_seg->c_state == C_ON_AGE_Q && C_SEG_UNUSED_BYTES(c_seg) >= PAGE_SIZE) - c_seg_need_delayed_compaction(c_seg); - *current_chead = NULL; } + /* * returns with c_seg locked */ void -c_seg_swapin_requeue(c_segment_t c_seg, boolean_t has_data) +c_seg_swapin_requeue(c_segment_t c_seg, boolean_t has_data, boolean_t minor_compact_ok, boolean_t age_on_swapin_q) { clock_sec_t sec; clock_nsec_t nsec; @@ -2667,6 +2785,9 @@ c_seg_swapin_requeue(c_segment_t c_seg, boolean_t has_data) lck_mtx_lock_spin_always(c_list_lock); lck_mtx_lock_spin_always(&c_seg->c_lock); + assert(c_seg->c_busy_swapping); + assert(c_seg->c_busy); + c_seg->c_busy_swapping = 0; if (c_seg->c_overage_swap == TRUE) { @@ -2674,7 +2795,13 @@ c_seg_swapin_requeue(c_segment_t c_seg, boolean_t has_data) c_seg->c_overage_swap = FALSE; } if (has_data == TRUE) { - c_seg_switch_state(c_seg, C_ON_SWAPPEDIN_Q, FALSE); + if (age_on_swapin_q == TRUE) + c_seg_switch_state(c_seg, C_ON_SWAPPEDIN_Q, FALSE); + else + c_seg_switch_state(c_seg, C_ON_AGE_Q, FALSE); + + if (minor_compact_ok == TRUE && !c_seg->c_on_minorcompact_q && C_SEG_UNUSED_BYTES(c_seg) >= PAGE_SIZE) + c_seg_need_delayed_compaction(c_seg, TRUE); } else { c_seg->c_store.c_buffer = (int32_t*) NULL; c_seg->c_populated_offset = C_SEG_BYTES_TO_OFFSET(0); @@ -2689,12 +2816,13 @@ c_seg_swapin_requeue(c_segment_t c_seg, 
boolean_t has_data) /* - * c_seg has to be locked and is returned locked. + * c_seg has to be locked and is returned locked if the c_seg isn't freed * PAGE_REPLACMENT_DISALLOWED has to be TRUE on entry and is returned TRUE + * c_seg_swapin returns 1 if the c_seg was freed, 0 otherwise */ -void -c_seg_swapin(c_segment_t c_seg, boolean_t force_minor_compaction) +int +c_seg_swapin(c_segment_t c_seg, boolean_t force_minor_compaction, boolean_t age_on_swapin_q) { vm_offset_t addr = 0; uint32_t io_size = 0; @@ -2724,15 +2852,18 @@ c_seg_swapin(c_segment_t c_seg, boolean_t force_minor_compaction) PAGE_REPLACEMENT_DISALLOWED(FALSE); addr = (vm_offset_t)C_SEG_BUFFER_ADDRESS(c_seg->c_mysegno); + c_seg->c_store.c_buffer = (int32_t*) addr; - kernel_memory_populate(kernel_map, addr, io_size, KMA_COMPRESSOR, VM_KERN_MEMORY_COMPRESSOR); + kernel_memory_populate(compressor_map, addr, io_size, KMA_COMPRESSOR, VM_KERN_MEMORY_COMPRESSOR); - if (vm_swap_get(addr, f_offset, io_size) != KERN_SUCCESS) { + if (vm_swap_get(c_seg, f_offset, io_size) != KERN_SUCCESS) { PAGE_REPLACEMENT_DISALLOWED(TRUE); - kernel_memory_depopulate(kernel_map, addr, io_size, KMA_COMPRESSOR); + c_seg->c_store.c_swap_handle = f_offset; - c_seg_swapin_requeue(c_seg, FALSE); + kernel_memory_depopulate(compressor_map, addr, io_size, KMA_COMPRESSOR); + + c_seg_swapin_requeue(c_seg, FALSE, TRUE, age_on_swapin_q); } else { c_seg->c_store.c_buffer = (int32_t*) addr; #if ENCRYPTED_SWAP @@ -2750,14 +2881,22 @@ c_seg_swapin(c_segment_t c_seg, boolean_t force_minor_compaction) PAGE_REPLACEMENT_DISALLOWED(TRUE); + c_seg_swapin_requeue(c_seg, TRUE, force_minor_compaction == TRUE ? FALSE : TRUE, age_on_swapin_q); + + OSAddAtomic64(c_seg->c_bytes_used, &compressor_bytes_used); + if (force_minor_compaction == TRUE) { + if (c_seg_minor_compaction_and_unlock(c_seg, FALSE)) { + /* + * Drop the rwlock_count so that the thread priority + * is returned back to where it is supposed to be. 
+ */ + clear_thread_rwlock_boost(); + return (1); + } + lck_mtx_lock_spin_always(&c_seg->c_lock); - - c_seg_minor_compaction_and_unlock(c_seg, FALSE); } - OSAddAtomic64(c_seg->c_bytes_used, &compressor_bytes_used); - - c_seg_swapin_requeue(c_seg, TRUE); } C_SEG_WAKEUP_DONE(c_seg); @@ -2766,6 +2905,8 @@ c_seg_swapin(c_segment_t c_seg, boolean_t force_minor_compaction) * is returned back to where it is supposed to be. */ clear_thread_rwlock_boost(); + + return (0); } @@ -2890,8 +3031,11 @@ c_compress_page(char *src, c_slot_mapping_t slot_ptr, c_segment_t *current_chead cs->c_hash_data = hash_string(src, PAGE_SIZE); #endif + if (vm_compressor_algorithm() != VM_COMPRESSOR_DEFAULT_CODEC) { + } else { c_size = WKdm_compress_new((const WK_word *)(uintptr_t)src, (WK_word *)(uintptr_t)&c_seg->c_store.c_buffer[cs->c_offset], (WK_word *)(uintptr_t)scratch_buf, max_csize - 4); + } assert(c_size <= (max_csize - 4) && c_size >= -1); if (c_size == -1) { @@ -2982,12 +3126,46 @@ c_compress_page(char *src, c_slot_mapping_t slot_ptr, c_segment_t *current_chead return (0); } +static inline void sv_decompress(int32_t *ddst, int32_t pattern) { +#if __x86_64__ + memset_word(ddst, pattern, PAGE_SIZE / sizeof(int32_t)); +#else + size_t i; + + /* Unroll the pattern fill loop 4x to encourage the + * compiler to emit NEON stores, cf. + * Loop autovectorization + * anomalies. + * We use separate loops for each PAGE_SIZE + * to allow the autovectorizer to engage, as PAGE_SIZE + * is currently not a constant. 
+ */ + + if (PAGE_SIZE == 4096) { + for (i = 0; i < (4096U / sizeof(int32_t)); i += 4) { + *ddst++ = pattern; + *ddst++ = pattern; + *ddst++ = pattern; + *ddst++ = pattern; + } + } else { + assert(PAGE_SIZE == 16384); + for (i = 0; i < (int)(16384U / sizeof(int32_t)); i += 4) { + *ddst++ = pattern; + *ddst++ = pattern; + *ddst++ = pattern; + *ddst++ = pattern; + } + } +#endif +} static int c_decompress_page(char *dst, volatile c_slot_mapping_t slot_ptr, int flags, int *zeroslot) { c_slot_t cs; c_segment_t c_seg; + uint32_t c_segno; int c_indx; int c_rounded_size; uint32_t c_size; @@ -2996,7 +3174,7 @@ c_decompress_page(char *dst, volatile c_slot_mapping_t slot_ptr, int flags, int boolean_t consider_defragmenting = FALSE; boolean_t kdp_mode = FALSE; - if (flags & C_KDP) { + if (__improbable(flags & C_KDP)) { if (not_in_kdp) { panic("C_KDP passed to decompress page from outside of debugger context"); } @@ -3009,10 +3187,11 @@ c_decompress_page(char *dst, volatile c_slot_mapping_t slot_ptr, int flags, int } kdp_mode = TRUE; + *zeroslot = 0; } ReTry: - if (!kdp_mode) { + if (__probable(!kdp_mode)) { PAGE_REPLACEMENT_DISALLOWED(TRUE); } else { if (kdp_lck_rw_lock_is_acquired_exclusive(&c_master_lock)) { @@ -3033,7 +3212,7 @@ c_decompress_page(char *dst, volatile c_slot_mapping_t slot_ptr, int flags, int if (dst && decompressions_blocked == TRUE) { if (flags & C_DONT_BLOCK) { - if (!kdp_mode) { + if (__probable(!kdp_mode)) { PAGE_REPLACEMENT_DISALLOWED(FALSE); } @@ -3056,9 +3235,19 @@ c_decompress_page(char *dst, volatile c_slot_mapping_t slot_ptr, int flags, int } #endif /* s_cseg is actually "segno+1" */ - c_seg = c_segments[slot_ptr->s_cseg - 1].c_seg; + c_segno = slot_ptr->s_cseg - 1; + + if (__improbable(c_segno >= c_segments_available)) + panic("c_decompress_page: c_segno %d >= c_segments_available %d, slot_ptr(%p), slot_data(%x)", + c_segno, c_segments_available, slot_ptr, *(int *)((void *)slot_ptr)); + + if (__improbable(c_segments[c_segno].c_segno < 
c_segments_available)) + panic("c_decompress_page: c_segno %d is free, slot_ptr(%p), slot_data(%x)", + c_segno, slot_ptr, *(int *)((void *)slot_ptr)); - if (!kdp_mode) { + c_seg = c_segments[c_segno].c_seg; + + if (__probable(!kdp_mode)) { lck_mtx_lock_spin_always(&c_seg->c_lock); } else { if (kdp_lck_mtx_lock_spin_is_acquired(&c_seg->c_lock)) { @@ -3068,6 +3257,11 @@ c_decompress_page(char *dst, volatile c_slot_mapping_t slot_ptr, int flags, int assert(c_seg->c_state != C_IS_EMPTY && c_seg->c_state != C_IS_FREE); + if (dst == NULL && c_seg->c_busy_swapping) { + assert(c_seg->c_busy); + + goto bypass_busy_check; + } if (flags & C_DONT_BLOCK) { if (c_seg->c_busy || (C_SEG_IS_ONDISK(c_seg) && dst)) { *zeroslot = 0; @@ -3084,12 +3278,22 @@ c_decompress_page(char *dst, volatile c_slot_mapping_t slot_ptr, int flags, int goto ReTry; } +bypass_busy_check: + c_indx = slot_ptr->s_cindx; + if (__improbable(c_indx >= c_seg->c_nextslot)) + panic("c_decompress_page: c_indx %d >= c_nextslot %d, c_seg(%p), slot_ptr(%p), slot_data(%x)", + c_indx, c_seg->c_nextslot, c_seg, slot_ptr, *(int *)((void *)slot_ptr)); + cs = C_SEG_SLOT_FROM_INDEX(c_seg, c_indx); c_size = UNPACK_C_SIZE(cs); + if (__improbable(c_size == 0)) + panic("c_decompress_page: c_size == 0, c_seg(%p), slot_ptr(%p), slot_data(%x)", + c_seg, slot_ptr, *(int *)((void *)slot_ptr)); + c_rounded_size = (c_size + C_SEG_OFFSET_ALIGNMENT_MASK) & ~C_SEG_OFFSET_ALIGNMENT_MASK; if (dst) { @@ -3099,7 +3303,8 @@ c_decompress_page(char *dst, volatile c_slot_mapping_t slot_ptr, int flags, int if (C_SEG_IS_ONDISK(c_seg)) { assert(kdp_mode == FALSE); - c_seg_swapin(c_seg, FALSE); + retval = c_seg_swapin(c_seg, FALSE, TRUE); + assert(retval == 0); retval = 1; } @@ -3111,7 +3316,7 @@ c_decompress_page(char *dst, volatile c_slot_mapping_t slot_ptr, int flags, int } #if CHECKSUM_THE_COMPRESSED_DATA if (cs->c_hash_compressed_data != hash_string((char *)&c_seg->c_store.c_buffer[cs->c_offset], c_size)) - panic("compressed data doesn't match 
original"); + panic("compressed data doesn't match original hash: 0x%x, seg: %p, offset: %d, c_size: %d", cs->c_hash_compressed_data, c_seg, cs->c_offset, c_size); #endif if (c_rounded_size == PAGE_SIZE) { /* @@ -3130,21 +3335,12 @@ c_decompress_page(char *dst, volatile c_slot_mapping_t slot_ptr, int flags, int */ dptr = (int32_t *)(uintptr_t)dst; data = *(int32_t *)(&c_seg->c_store.c_buffer[cs->c_offset]); -#if __x86_64__ - memset_word(dptr, data, PAGE_SIZE / sizeof(int32_t)); -#else - { - int i; - - for (i = 0; i < (int)(PAGE_SIZE / sizeof(int32_t)); i++) - *dptr++ = data; - } -#endif + sv_decompress(dptr, data); } else { uint32_t my_cpu_no; char *scratch_buf; - if (!kdp_mode) { + if (__probable(!kdp_mode)) { /* * we're behind the c_seg lock held in spin mode * which means pre-emption is disabled... therefore @@ -3154,24 +3350,28 @@ c_decompress_page(char *dst, volatile c_slot_mapping_t slot_ptr, int flags, int assert(my_cpu_no < compressor_cpus); - scratch_buf = &compressor_scratch_bufs[my_cpu_no * WKdm_SCRATCH_BUF_SIZE]; + scratch_buf = &compressor_scratch_bufs[my_cpu_no * vm_compressor_get_decode_scratch_size()]; } else { scratch_buf = kdp_compressor_scratch_buf; } + + if (vm_compressor_algorithm() != VM_COMPRESSOR_DEFAULT_CODEC) { + } else { WKdm_decompress_new((WK_word *)(uintptr_t)&c_seg->c_store.c_buffer[cs->c_offset], (WK_word *)(uintptr_t)dst, (WK_word *)(uintptr_t)scratch_buf, c_size); + } } #if CHECKSUM_THE_DATA if (cs->c_hash_data != hash_string(dst, PAGE_SIZE)) - panic("decompressed data doesn't match original"); + panic("decompressed data doesn't match original cs: %p, hash: %d, offset: %d, c_size: %d", cs, cs->c_hash_data, cs->c_offset, c_size); + #endif if (c_seg->c_swappedin_ts == 0 && !kdp_mode) { clock_get_system_nanotime(&cur_ts_sec, &cur_ts_nsec); age_of_cseg = (uint32_t)cur_ts_sec - c_seg->c_creation_ts; - if (age_of_cseg < DECOMPRESSION_SAMPLE_MAX_AGE) OSAddAtomic(1, &age_of_decompressions_during_sample_period[age_of_cseg]); else @@ -3186,8 
+3386,8 @@ c_decompress_page(char *dst, volatile c_slot_mapping_t slot_ptr, int flags, int *zeroslot = 0; goto done; } - assert(kdp_mode == FALSE); + c_seg->c_bytes_unused += c_rounded_size; c_seg->c_bytes_used -= c_rounded_size; PACK_C_SIZE(cs, 0); @@ -3204,6 +3404,17 @@ c_decompress_page(char *dst, volatile c_slot_mapping_t slot_ptr, int flags, int */ OSAddAtomic64(-c_rounded_size, &compressor_bytes_used); } + if (c_seg->c_busy_swapping) { + /* + * bypass case for c_busy_swapping... + * let the swapin/swapout paths deal with putting + * the c_seg on the minor compaction queue if needed + */ + assert(c_seg->c_busy); + goto done; + } + assert(!c_seg->c_busy); + if (c_seg->c_state != C_IS_FILLING) { if (c_seg->c_bytes_used == 0) { if ( !(C_SEG_IS_ONDISK(c_seg))) { @@ -3220,16 +3431,20 @@ c_decompress_page(char *dst, volatile c_slot_mapping_t slot_ptr, int flags, int C_SEG_BUSY(c_seg); lck_mtx_unlock_always(&c_seg->c_lock); - kernel_memory_depopulate(kernel_map, (vm_offset_t) c_seg->c_store.c_buffer, pages_populated * PAGE_SIZE, KMA_COMPRESSOR); + kernel_memory_depopulate(compressor_map, (vm_offset_t) c_seg->c_store.c_buffer, pages_populated * PAGE_SIZE, KMA_COMPRESSOR); lck_mtx_lock_spin_always(&c_seg->c_lock); C_SEG_WAKEUP_DONE(c_seg); } - if (!c_seg->c_on_minorcompact_q) - c_seg_need_delayed_compaction(c_seg); - } else - assert(c_seg->c_state == C_ON_SWAPPEDOUTSPARSE_Q); + if (!c_seg->c_on_minorcompact_q && c_seg->c_state != C_ON_SWAPOUT_Q) + c_seg_need_delayed_compaction(c_seg, FALSE); + } else { + if (c_seg->c_state != C_ON_SWAPPEDOUTSPARSE_Q) { + c_seg_move_to_sparse_list(c_seg); + consider_defragmenting = TRUE; + } + } } else if (c_seg->c_on_minorcompact_q) { assert(c_seg->c_state != C_ON_BAD_Q); @@ -3241,7 +3456,7 @@ c_decompress_page(char *dst, volatile c_slot_mapping_t slot_ptr, int flags, int } else if ( !(C_SEG_IS_ONDISK(c_seg))) { if (c_seg->c_state != C_ON_BAD_Q && c_seg->c_state != C_ON_SWAPOUT_Q && C_SEG_UNUSED_BYTES(c_seg) >= PAGE_SIZE) { - 
c_seg_need_delayed_compaction(c_seg); + c_seg_need_delayed_compaction(c_seg, FALSE); } } else if (c_seg->c_state != C_ON_SWAPPEDOUTSPARSE_Q && C_SEG_ONDISK_IS_SPARSE(c_seg)) { @@ -3250,7 +3465,7 @@ c_decompress_page(char *dst, volatile c_slot_mapping_t slot_ptr, int flags, int } } done: - if (kdp_mode) { + if (__improbable(kdp_mode)) { return retval; } @@ -3615,15 +3830,7 @@ vm_compressor_relocate( c_rounded_size = (c_size + C_SEG_OFFSET_ALIGNMENT_MASK) & ~C_SEG_OFFSET_ALIGNMENT_MASK; -#if CHECKSUM_THE_DATA - c_dst->c_hash_data = c_src->c_hash_data; -#endif -#if CHECKSUM_THE_COMPRESSED_DATA - c_dst->c_hash_compressed_data = c_src->c_hash_compressed_data; -#endif - - c_dst->c_size = c_src->c_size; - c_dst->c_packed_ptr = c_src->c_packed_ptr; + cslot_copy(c_dst, c_src); c_dst->c_offset = c_seg_dst->c_nextoffset; if (c_seg_dst->c_firstemptyslot == c_seg_dst->c_nextslot) @@ -3662,7 +3869,7 @@ vm_compressor_relocate( if (c_seg_src->c_bytes_used == 0 && c_seg_src->c_state != C_IS_FILLING) { if (!c_seg_src->c_on_minorcompact_q) - c_seg_need_delayed_compaction(c_seg_src); + c_seg_need_delayed_compaction(c_seg_src, FALSE); } lck_mtx_unlock_always(&c_seg_src->c_lock); diff --git a/osfmk/vm/vm_compressor.h b/osfmk/vm/vm_compressor.h index 45b110892..6c877bbe6 100644 --- a/osfmk/vm/vm_compressor.h +++ b/osfmk/vm/vm_compressor.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2013 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -33,12 +33,12 @@ #include #include #include +#include #include #include #include - #define C_SEG_OFFSET_BITS 16 #define C_SEG_BUFSIZE (1024 * 256) #define C_SEG_MAX_PAGES (C_SEG_BUFSIZE / PAGE_SIZE) @@ -47,19 +47,24 @@ #define C_SEG_ALLOCSIZE (C_SEG_BUFSIZE) #define C_SEG_MAX_POPULATE_SIZE (4 * PAGE_SIZE) +#if DEBUG || COMPRESSOR_INTEGRITY_CHECKS +#define ENABLE_SWAP_CHECKS 1 +#define ENABLE_COMPRESSOR_CHECKS 1 +#else +#define ENABLE_SWAP_CHECKS 0 +#define ENABLE_COMPRESSOR_CHECKS 0 +#endif -#define CHECKSUM_THE_SWAP 0 /* Debug swap data */ -#define CHECKSUM_THE_DATA 0 /* Debug compressor/decompressor data */ -#define CHECKSUM_THE_COMPRESSED_DATA 0 /* Debug compressor/decompressor compressed data */ -#define VALIDATE_C_SEGMENTS 0 /* Debug compaction */ +#define CHECKSUM_THE_SWAP ENABLE_SWAP_CHECKS /* Debug swap data */ +#define CHECKSUM_THE_DATA ENABLE_COMPRESSOR_CHECKS /* Debug compressor/decompressor data */ +#define CHECKSUM_THE_COMPRESSED_DATA ENABLE_COMPRESSOR_CHECKS /* Debug compressor/decompressor compressed data */ +#define VALIDATE_C_SEGMENTS ENABLE_COMPRESSOR_CHECKS /* Debug compaction */ #define RECORD_THE_COMPRESSED_DATA 0 - - struct c_slot { uint64_t c_offset:C_SEG_OFFSET_BITS, - c_size:12, + c_size:12, c_packed_ptr:36; #if CHECKSUM_THE_DATA unsigned int c_hash_data; @@ -83,11 +88,7 @@ struct c_slot { struct c_segment { -#if __i386__ || __x86_64__ lck_mtx_t c_lock; -#else /* __i386__ || __x86_64__ */ - lck_spin_t c_lock; -#endif /* __i386__ || __x86_64__ */ queue_chain_t c_age_list; queue_chain_t c_list; @@ -181,12 +182,34 @@ extern vm_offset_t c_buffers; MACRO_END +#if DEVELOPMENT || DEBUG +extern vm_map_t compressor_map; + +#define C_SEG_MAKE_WRITEABLE(cseg) \ + MACRO_BEGIN \ + vm_map_protect(compressor_map, \ + (vm_map_offset_t)cseg->c_store.c_buffer, \ + (vm_map_offset_t)&cseg->c_store.c_buffer[C_SEG_BYTES_TO_OFFSET(C_SEG_ALLOCSIZE)],\ + VM_PROT_READ | VM_PROT_WRITE, \ + 0); \ + MACRO_END + +#define 
C_SEG_WRITE_PROTECT(cseg) \ + MACRO_BEGIN \ + vm_map_protect(compressor_map, \ + (vm_map_offset_t)cseg->c_store.c_buffer, \ + (vm_map_offset_t)&cseg->c_store.c_buffer[C_SEG_BYTES_TO_OFFSET(C_SEG_ALLOCSIZE)],\ + VM_PROT_READ, \ + 0); \ + MACRO_END +#endif typedef struct c_segment *c_segment_t; typedef struct c_slot *c_slot_t; uint64_t vm_compressor_total_compressions(void); void vm_wake_compactor_swapper(void); +void vm_run_compactor(void); void vm_thrashing_jetsam_done(void); void vm_consider_waking_compactor_swapper(void); void vm_consider_swapping(void); @@ -194,6 +217,7 @@ void vm_compressor_flush(void); void c_seg_free(c_segment_t); void c_seg_free_locked(c_segment_t); void c_seg_insert_into_age_q(c_segment_t); +void c_seg_need_delayed_compaction(c_segment_t, boolean_t); void vm_decompressor_lock(void); void vm_decompressor_unlock(void); @@ -204,8 +228,8 @@ void vm_compressor_record_warmup_start(void); void vm_compressor_record_warmup_end(void); int vm_wants_task_throttled(task_t); -boolean_t vm_compression_available(void); +extern void vm_compaction_swapper_do_init(void); extern void vm_compressor_swap_init(void); extern void vm_compressor_init_locks(void); extern lck_rw_t c_master_lock; @@ -215,17 +239,18 @@ extern void vm_swap_decrypt(c_segment_t); #endif /* ENCRYPTED_SWAP */ extern int vm_swap_low_on_space(void); -extern kern_return_t vm_swap_get(vm_offset_t, uint64_t, uint64_t); +extern kern_return_t vm_swap_get(c_segment_t, uint64_t, uint64_t); extern void vm_swap_free(uint64_t); extern void vm_swap_consider_defragmenting(void); -extern void c_seg_swapin_requeue(c_segment_t, boolean_t); -extern void c_seg_swapin(c_segment_t, boolean_t); +extern void c_seg_swapin_requeue(c_segment_t, boolean_t, boolean_t, boolean_t); +extern int c_seg_swapin(c_segment_t, boolean_t, boolean_t); extern void c_seg_wait_on_busy(c_segment_t); extern void c_seg_trim_tail(c_segment_t); extern void c_seg_switch_state(c_segment_t, int, boolean_t); extern boolean_t 
fastwake_recording_in_progress; +extern int compaction_swapper_inited; extern int compaction_swapper_running; extern uint64_t vm_swap_put_failures; @@ -248,6 +273,7 @@ extern uint64_t first_c_segment_to_warm_generation_id; extern uint64_t last_c_segment_to_warm_generation_id; extern boolean_t hibernate_flushing; extern boolean_t hibernate_no_swapspace; +extern boolean_t hibernate_in_progress_with_pinned_swap; extern uint32_t swapout_target_age; extern void c_seg_insert_into_q(queue_head_t *, c_segment_t); @@ -264,6 +290,9 @@ extern uint64_t vm_compressor_compute_elapsed_msecs(clock_sec_t, clock_nsec_t, c #define AVAILABLE_NON_COMPRESSED_MEMORY (vm_page_active_count + vm_page_inactive_count + vm_page_free_count + vm_page_speculative_count) #define AVAILABLE_MEMORY (AVAILABLE_NON_COMPRESSED_MEMORY + VM_PAGE_COMPRESSOR_COUNT) +/* TODO, there may be a minor optimisation opportunity to replace these divisions + * with multiplies and shifts + */ #define VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD (((AVAILABLE_MEMORY) * 10) / (vm_compressor_minorcompact_threshold_divisor ? vm_compressor_minorcompact_threshold_divisor : 1)) #define VM_PAGE_COMPRESSOR_SWAP_THRESHOLD (((AVAILABLE_MEMORY) * 10) / (vm_compressor_majorcompact_threshold_divisor ? vm_compressor_majorcompact_threshold_divisor : 1)) @@ -279,31 +308,17 @@ extern uint64_t vm_compressor_compute_elapsed_msecs(clock_sec_t, clock_nsec_t, c #define SWAPPER_NEEDS_TO_UNTHROTTLE() ((AVAILABLE_NON_COMPRESSED_MEMORY < VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD) ? 1 : 0) #define COMPRESSOR_NEEDS_TO_MINOR_COMPACT() ((AVAILABLE_NON_COMPRESSED_MEMORY < VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD) ? 1 : 0) -/* - * indicate the need to do a major compaction if - * the overall set of in-use compression segments - * becomes sparse... on systems that support pressure - * driven swapping, this will also cause swapouts to - * be initiated. 
- */ -#define COMPRESSOR_NEEDS_TO_MAJOR_COMPACT() (((c_segment_count >= (c_segments_nearing_limit / 8)) && \ - ((c_segment_count * C_SEG_MAX_PAGES) - VM_PAGE_COMPRESSOR_COUNT) > \ - ((c_segment_count / 8) * C_SEG_MAX_PAGES)) \ - ? 1 : 0) #define COMPRESSOR_FREE_RESERVED_LIMIT 128 -#define COMPRESSOR_SCRATCH_BUF_SIZE WKdm_SCRATCH_BUF_SIZE +uint32_t vm_compressor_get_encode_scratch_size(void); +uint32_t vm_compressor_get_decode_scratch_size(void); +#define COMPRESSOR_SCRATCH_BUF_SIZE vm_compressor_get_encode_scratch_size() #if RECORD_THE_COMPRESSED_DATA extern void c_compressed_record_init(void); extern void c_compressed_record_write(char *, int); #endif - -#if __i386__ || __x86_64__ extern lck_mtx_t *c_list_lock; -#else /* __i386__ || __x86_64__ */ -extern lck_spin_t *c_list_lock; -#endif /* __i386__ || __x86_64__ */ diff --git a/osfmk/vm/vm_compressor_algorithms.c b/osfmk/vm/vm_compressor_algorithms.c new file mode 100644 index 000000000..e7135a7fc --- /dev/null +++ b/osfmk/vm/vm_compressor_algorithms.c @@ -0,0 +1,361 @@ +/* + * Copyright (c) 2010-2016 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* This module implements a hybrid/adaptive compression scheme, using WKdm where + * profitable and, currently, an LZ4 variant elsewhere. + * (Created 2016, Derek Kumar) + */ +#include "lz4.h" +#include "WKdm_new.h" +#include +#include + +#define MZV_MAGIC (17185) +#define LZ4_SCRATCH_ALIGN (64) +#define WKC_SCRATCH_ALIGN (64) + +#define LZ4_SCRATCH_ALIGN (64) +#define WKC_SCRATCH_ALIGN (64) + +#define memcpy_T_NT memcpy +#define memcpy_NT_T memcpy + +typedef union { + uint8_t lz4state[lz4_encode_scratch_size]__attribute((aligned(LZ4_SCRATCH_ALIGN))); + uint8_t wkscratch[0] __attribute((aligned(WKC_SCRATCH_ALIGN))); // TODO +} compressor_encode_scratch_t; + +typedef union { + uint8_t lz4decodestate[lz4_encode_scratch_size]__attribute((aligned(64))); + uint8_t wkdecompscratch[0] __attribute((aligned(64))); +} compressor_decode_scratch_t; + +typedef struct { + uint16_t lz4_selection_run; + uint16_t lz4_run_length; + uint16_t lz4_preselects; + uint32_t lz4_total_preselects; + uint16_t lz4_failure_skips; + uint32_t lz4_total_failure_skips; + uint16_t lz4_failure_run_length; + uint16_t lz4_total_unprofitables; + uint32_t lz4_total_negatives; + uint32_t lz4_total_failures; +} compressor_state_t; + +compressor_tuneables_t vmctune = { + .lz4_threshold = 2048, + .wkdm_reeval_threshold = 1536, + .lz4_max_failure_skips = 0, + .lz4_max_failure_run_length = ~0U, + .lz4_max_preselects = 0, + .lz4_run_preselection_threshold = ~0U, + .lz4_run_continue_bytes = 0, 
+ .lz4_profitable_bytes = 0, +}; + +compressor_state_t vmcstate = { + .lz4_selection_run = 0, + .lz4_run_length = 0, + .lz4_preselects = 0, + .lz4_total_preselects = 0, + .lz4_failure_skips = 0, + .lz4_total_failure_skips = 0, + .lz4_failure_run_length = 0, + .lz4_total_unprofitables = 0, + .lz4_total_negatives = 0, +}; + +compressor_stats_t compressor_stats; + +enum compressor_preselect_t { + CPRESELLZ4 = 0, + CSKIPLZ4 = 1, + CPRESELWK = 2, +}; + +vm_compressor_mode_t vm_compressor_current_codec = VM_COMPRESSOR_DEFAULT_CODEC; + +boolean_t verbose = FALSE; + +#if DEVELOPMENT || DEBUG +#define VERBOSE(x...) \ + do { \ + if (verbose) \ + printf(x); \ + } while(0) +#define VM_COMPRESSOR_STAT(x...) \ + do { \ + (x); \ + } while(0) +//TODO make atomic where needed, decompression paths +#define VM_DECOMPRESSOR_STAT(x...) \ + do { \ + (x); \ + } while(0) +#else +#define VERBOSE(x...) \ + do { \ + }while (0) +#define VM_COMPRESSOR_STAT(x...) \ + do { \ + }while (0) +#define VM_DECOMPRESSOR_STAT(x...) 
\ + do { \ + }while (0) +#endif + +static inline enum compressor_preselect_t compressor_preselect(void) { + if (vmcstate.lz4_failure_skips >= vmctune.lz4_max_failure_skips) { + vmcstate.lz4_failure_skips = 0; + vmcstate.lz4_failure_run_length = 0; + } + + if (vmcstate.lz4_failure_run_length >= vmctune.lz4_max_failure_run_length) { + vmcstate.lz4_failure_skips++; + vmcstate.lz4_total_failure_skips++; + return CSKIPLZ4; + } + + if (vmcstate.lz4_preselects >= vmctune.lz4_max_preselects) { + vmcstate.lz4_preselects = 0; + return CPRESELWK; + } + + if (vmcstate.lz4_run_length >= vmctune.lz4_run_preselection_threshold) { + vmcstate.lz4_preselects++; + vmcstate.lz4_total_preselects++; + return CPRESELLZ4; + } + return CPRESELWK; +} + +static inline void compressor_selector_update(int lz4sz, int didwk, int wksz) { + VM_COMPRESSOR_STAT(compressor_stats.lz4_compressions++); + + if (lz4sz == 0) { + VM_COMPRESSOR_STAT(compressor_stats.lz4_compressed_bytes+=PAGE_SIZE); + VM_COMPRESSOR_STAT(compressor_stats.lz4_compression_failures++); + vmcstate.lz4_failure_run_length++; + VM_COMPRESSOR_STAT(vmcstate.lz4_total_failures++); + vmcstate.lz4_run_length = 0; + } else { + vmcstate.lz4_failure_run_length = 0; + + VM_COMPRESSOR_STAT(compressor_stats.lz4_compressed_bytes+=lz4sz); + + if (lz4sz <= vmctune.wkdm_reeval_threshold) { + vmcstate.lz4_run_length = 0; + } else { + if (!didwk) { + vmcstate.lz4_run_length++; + } + } + + if (didwk) { + if (__probable(wksz > lz4sz)) { + uint32_t lz4delta = wksz - lz4sz; + VM_COMPRESSOR_STAT(compressor_stats.lz4_wk_compression_delta+=lz4delta); + if (lz4delta >= vmctune.lz4_run_continue_bytes) { + vmcstate.lz4_run_length++; + } else if (lz4delta <= vmctune.lz4_profitable_bytes) { + vmcstate.lz4_failure_run_length++; + VM_COMPRESSOR_STAT(vmcstate.lz4_total_unprofitables++); + vmcstate.lz4_run_length = 0; + } else { + vmcstate.lz4_run_length = 0; + } + } else { + VM_COMPRESSOR_STAT(compressor_stats.lz4_wk_compression_negative_delta+=(lz4sz-wksz)); + 
vmcstate.lz4_failure_run_length++; + VM_COMPRESSOR_STAT(vmcstate.lz4_total_negatives++); + vmcstate.lz4_run_length = 0; + } + } + } +} + +//todo fix clang diagnostic +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wincompatible-pointer-types" + +static inline void WKdmD(WK_word* src_buf, WK_word* dest_buf, WK_word* scratch, unsigned int bytes) { +#if DEVELOPMENT || DEBUG + uint32_t *inw = (uint32_t *) src_buf; + if (*inw != MZV_MAGIC) { + if ((*inw | *(inw+1) | *(inw+2)) & 0xFFFF0000) { + panic("WKdmDecompress: invalid header 0x%x 0x%x 0x%x\n", *inw, *(inw +1), *(inw+2)); + } + } +#endif /* DEVELOPMENT || DEBUG */ + WKdm_decompress_new(src_buf, dest_buf, scratch, bytes); +} + +static inline int WKdmC(WK_word* src_buf, WK_word* dest_buf, WK_word* scratch, unsigned int limit) { + return WKdm_compress_new(src_buf, dest_buf, scratch, limit); +} + + +int metacompressor(const uint8_t *in, uint8_t *cdst, int32_t outbufsz, uint16_t *codec, void *cscratchin) { + int sz = -1; + int dowk = FALSE, dolz4 = FALSE, skiplz4 = FALSE; + int insize = PAGE_SIZE; + compressor_encode_scratch_t *cscratch = cscratchin; + + if (vm_compressor_current_codec == CMODE_WK) { + dowk = TRUE; + } else if (vm_compressor_current_codec == CMODE_LZ4) { + dolz4 = TRUE; + } else if (vm_compressor_current_codec == CMODE_HYB) { + enum compressor_preselect_t presel = compressor_preselect(); + if (presel == CPRESELLZ4) { + dolz4 = TRUE; + goto lz4compress; + } else if (presel == CSKIPLZ4) { + dowk = TRUE; + skiplz4 = TRUE; + } else { + assert(presel == CPRESELWK); + dowk = TRUE; + } + } + + if (dowk) { + *codec = CCWK; + sz = WKdmC(in, cdst, &cscratch->wkscratch[0], outbufsz); + VM_COMPRESSOR_STAT(compressor_stats.wk_compressions++); + + VERBOSE("WKDm Compress: %d\n", sz); + if (sz == -1) { + VM_COMPRESSOR_STAT(compressor_stats.wk_compressed_bytes_total+=PAGE_SIZE); + VM_COMPRESSOR_STAT(compressor_stats.wk_compression_failures++); + + if (vm_compressor_current_codec == CMODE_HYB) { + goto 
lz4eval; + } + goto cexit; + } else if (sz == 0) { + VM_COMPRESSOR_STAT(compressor_stats.wk_sv_compressions++); + VM_COMPRESSOR_STAT(compressor_stats.wk_compressed_bytes_total+=8); + } else { + VM_COMPRESSOR_STAT(compressor_stats.wk_compressed_bytes_total+=sz); + } + } +lz4eval: + if (vm_compressor_current_codec == CMODE_HYB) { + if (((sz == -1) || (sz >= vmctune.lz4_threshold)) && (skiplz4 == FALSE)) { + dolz4 = TRUE; + } else { + __unused int wkc = (sz == -1) ? PAGE_SIZE : sz; + VM_COMPRESSOR_STAT(compressor_stats.wk_compressions_exclusive++); + VM_COMPRESSOR_STAT(compressor_stats.wk_compressed_bytes_exclusive+=wkc); + goto cexit; + } + } + +lz4compress: + + if (dolz4) { + if (sz == -1) { + sz = PAGE_SIZE; + } + int wksz = sz; + *codec = CCLZ4; + + sz = (int) lz4raw_encode_buffer(cdst, outbufsz, in, insize, &cscratch->lz4state[0]); + + VERBOSE("LZ4 Compress: %d\n", sz); + compressor_selector_update(sz, dowk, wksz); + if (sz == 0) { + sz = -1; + goto cexit; + } + } +cexit: + return sz; +} + +void metadecompressor(const uint8_t *source, uint8_t *dest, uint32_t csize, uint16_t ccodec, void *compressor_dscratchin) { + int dolz4 = (ccodec == CCLZ4); + int rval; + compressor_decode_scratch_t *compressor_dscratch = compressor_dscratchin; + + if (dolz4) { + rval = (int)lz4raw_decode_buffer(dest, PAGE_SIZE, source, csize, &compressor_dscratch->lz4decodestate[0]); + VM_DECOMPRESSOR_STAT(compressor_stats.lz4_decompressions+=1); + VM_DECOMPRESSOR_STAT(compressor_stats.lz4_decompressed_bytes+=csize); + + assertf(rval == PAGE_SIZE, "LZ4 decode: size != pgsize %d", rval); + + } else { + assert(ccodec == CCWK); + WKdmD(source, dest, &compressor_dscratch->wkdecompscratch[0], csize); + VM_DECOMPRESSOR_STAT(compressor_stats.wk_decompressions+=1); + VM_DECOMPRESSOR_STAT(compressor_stats.wk_decompressed_bytes+=csize); + } +} +#pragma clang diagnostic pop + +uint32_t vm_compressor_get_encode_scratch_size(void) { + if (vm_compressor_current_codec != VM_COMPRESSOR_DEFAULT_CODEC) { + 
return MAX(sizeof(compressor_encode_scratch_t), WKdm_SCRATCH_BUF_SIZE_INTERNAL); + } else { + return WKdm_SCRATCH_BUF_SIZE_INTERNAL; + } +} + +uint32_t vm_compressor_get_decode_scratch_size(void) { + if (vm_compressor_current_codec != VM_COMPRESSOR_DEFAULT_CODEC) { + return MAX(sizeof(compressor_decode_scratch_t), WKdm_SCRATCH_BUF_SIZE_INTERNAL); + } else { + return WKdm_SCRATCH_BUF_SIZE_INTERNAL; + } +} + + +int vm_compressor_algorithm(void) { + return vm_compressor_current_codec; +} + +void vm_compressor_algorithm_init(void) { /* Parse and validate the codec boot-args. NOTE(review): new_codec is never written to vm_compressor_current_codec in this function -- confirm that is intended. */ + vm_compressor_mode_t new_codec = VM_COMPRESSOR_DEFAULT_CODEC; + + + PE_parse_boot_argn("vm_compressor_codec", &new_codec, sizeof(new_codec)); + assertf(((new_codec == VM_COMPRESSOR_DEFAULT_CODEC) || (new_codec == CMODE_WK) || + (new_codec == CMODE_LZ4) || (new_codec == CMODE_HYB)), + "Invalid VM compression codec: %u", new_codec); + + + if (PE_parse_boot_argn("-vm_compressor_wk", &new_codec, sizeof(new_codec))) { + new_codec = VM_COMPRESSOR_DEFAULT_CODEC; + } else if (PE_parse_boot_argn("-vm_compressor_hybrid", &new_codec, sizeof(new_codec))) { + new_codec = CMODE_HYB; + } + +} +//TODO check open-sourceability of lz4 diff --git a/osfmk/vm/vm_compressor_algorithms.h b/osfmk/vm/vm_compressor_algorithms.h new file mode 100644 index 000000000..dce1ea150 --- /dev/null +++ b/osfmk/vm/vm_compressor_algorithms.h @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License.
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#pragma once + +#if XNU_KERNEL_PRIVATE +//DRKTODO: the decompression side stats should be either made optional or +//per-CPU to avoid cacheline contention + +typedef struct { + uint64_t lz4_compressions; + uint64_t lz4_compression_failures; + uint64_t lz4_compressed_bytes; + uint64_t lz4_wk_compression_delta; + uint64_t lz4_wk_compression_negative_delta; + uint64_t lz4_post_wk_compressions; + + uint64_t wk_compressions; + uint64_t wk_sv_compressions; + uint64_t wk_mzv_compressions; + uint64_t wk_compression_failures; + uint64_t wk_compressed_bytes_total; + uint64_t wk_compressions_exclusive; + uint64_t wk_compressed_bytes_exclusive; + + uint64_t lz4_decompressions; + uint64_t lz4_decompressed_bytes; + uint64_t uc_decompressions; + + uint64_t wk_decompressions; + uint64_t wk_decompressed_bytes; + uint64_t wk_sv_decompressions; +} compressor_stats_t; + +extern compressor_stats_t compressor_stats; + +typedef struct { + uint32_t lz4_selection_max; + int32_t wkdm_reeval_threshold; + int32_t 
lz4_threshold; + uint32_t lz4_max_failure_skips; + uint32_t lz4_max_failure_run_length; + uint32_t lz4_max_preselects; + uint32_t lz4_run_preselection_threshold; + uint32_t lz4_run_continue_bytes; + uint32_t lz4_profitable_bytes; +} compressor_tuneables_t; + +extern compressor_tuneables_t vmctune; + +int metacompressor(const uint8_t *in, uint8_t *cdst, int32_t outbufsz, uint16_t *codec, void *cscratch); +void metadecompressor(const uint8_t *source, uint8_t *dest, uint32_t csize, uint16_t ccodec, void *compressor_dscratch); + +typedef enum { + CCWK = 0, // must be 0 or 1 + CCLZ4 = 1, //must be 0 or 1 + CINVALID = 0xFFFF +} vm_compressor_codec_t; + +typedef enum { + CMODE_WK = 0, + CMODE_LZ4 = 1, + CMODE_HYB = 2, + VM_COMPRESSOR_DEFAULT_CODEC = 3, + CMODE_INVALID = 4 +} vm_compressor_mode_t; + +void vm_compressor_algorithm_init(void); +int vm_compressor_algorithm(void); +#endif /* XNU_KERNEL_PRIVATE */ diff --git a/osfmk/vm/vm_compressor_backing_store.c b/osfmk/vm/vm_compressor_backing_store.c index 33f4f7cc4..7015971be 100644 --- a/osfmk/vm/vm_compressor_backing_store.c +++ b/osfmk/vm/vm_compressor_backing_store.c @@ -31,9 +31,9 @@ #include +#include boolean_t compressor_store_stop_compaction = FALSE; -boolean_t vm_swap_up = FALSE; boolean_t vm_swapfile_create_needed = FALSE; boolean_t vm_swapfile_gc_needed = FALSE; @@ -44,6 +44,7 @@ uint64_t vm_swapout_thread_id; uint64_t vm_swap_put_failures = 0; uint64_t vm_swap_get_failures = 0; int vm_num_swap_files = 0; +int vm_num_pinned_swap_files = 0; int vm_swapout_thread_processed_segments = 0; int vm_swapout_thread_awakened = 0; int vm_swapfile_create_thread_awakened = 0; @@ -52,9 +53,12 @@ int vm_swapfile_gc_thread_awakened = 0; int vm_swapfile_gc_thread_running = 0; int64_t vm_swappin_avail = 0; +boolean_t vm_swappin_enabled = FALSE; unsigned int vm_swapfile_total_segs_alloced = 0; unsigned int vm_swapfile_total_segs_used = 0; +extern vm_map_t compressor_map; + #define SWAP_READY 0x1 /* Swap file is ready to be used */ 
#define SWAP_RECLAIM 0x2 /* Swap file is marked to be reclaimed */ @@ -91,6 +95,8 @@ clock_sec_t vm_swapfile_last_successful_create_ts = 0; int vm_swapfile_can_be_created = FALSE; boolean_t delayed_trim_handling_in_progress = FALSE; +boolean_t hibernate_in_progress_with_pinned_swap = FALSE; + static void vm_swapout_thread_throttle_adjust(void); static void vm_swap_free_now(struct swapfile *swf, uint64_t f_offset); static void vm_swapout_thread(void); @@ -143,16 +149,14 @@ extern unsigned long vm_page_decrypt_counter; #endif /* ENCRYPTED_SWAP */ extern void vm_pageout_io_throttle(void); -extern void vm_pageout_reinit_tuneables(void); -extern void vm_swap_file_set_tuneables(void); -struct swapfile *vm_swapfile_for_handle(uint64_t); +static struct swapfile *vm_swapfile_for_handle(uint64_t); /* * Called with the vm_swap_data_lock held. */ -struct swapfile * +static struct swapfile * vm_swapfile_for_handle(uint64_t f_offset) { @@ -220,11 +224,11 @@ vm_compressor_swap_init() } thread_deallocate(thread); - proc_set_task_policy_thread(kernel_task, thread->thread_id, - TASK_POLICY_INTERNAL, TASK_POLICY_IO, THROTTLE_LEVEL_COMPRESSOR_TIER2); - proc_set_task_policy_thread(kernel_task, thread->thread_id, - TASK_POLICY_INTERNAL, TASK_POLICY_PASSIVE_IO, TASK_POLICY_ENABLE); - + proc_set_thread_policy_with_tid(kernel_task, thread->thread_id, + TASK_POLICY_INTERNAL, TASK_POLICY_IO, THROTTLE_LEVEL_COMPRESSOR_TIER2); + proc_set_thread_policy_with_tid(kernel_task, thread->thread_id, + TASK_POLICY_INTERNAL, TASK_POLICY_PASSIVE_IO, TASK_POLICY_ENABLE); + #if ENCRYPTED_SWAP if (swap_crypt_ctx_initialized == FALSE) { swap_crypt_ctx_initialize(); @@ -233,9 +237,7 @@ vm_compressor_swap_init() memset(swapfilename, 0, MAX_SWAPFILENAME_LEN + 1); - vm_swap_up = TRUE; - - printf("VM Swap Subsystem is %s\n", (vm_swap_up == TRUE) ? 
"ON" : "OFF"); + printf("VM Swap Subsystem is ON\n"); } @@ -261,45 +263,67 @@ c_compressed_record_write(char *buf, int size) #endif +int compaction_swapper_inited = 0; void -vm_swap_file_set_tuneables() +vm_compaction_swapper_do_init(void) { struct vnode *vp; char *pathname; int namelen; - if (strlen(swapfilename) == 0) { - /* - * If no swapfile name has been set, we'll - * use the default name. - * - * Also, this function is only called from the vm_pageout_scan thread - * via vm_consider_waking_compactor_swapper, - * so we don't need to worry about a race in checking/setting the name here. - */ - strlcpy(swapfilename, SWAP_FILE_NAME, MAX_SWAPFILENAME_LEN); + if (compaction_swapper_inited) + return; + + if (vm_compressor_mode != VM_PAGER_COMPRESSOR_WITH_SWAP) { + compaction_swapper_inited = 1; + return; } - namelen = (int)strlen(swapfilename) + SWAPFILENAME_INDEX_LEN + 1; - pathname = (char*)kalloc(namelen); - memset(pathname, 0, namelen); - snprintf(pathname, namelen, "%s%d", swapfilename, 0); + lck_mtx_lock(&vm_swap_data_lock); - vm_swapfile_open(pathname, &vp); + if ( !compaction_swapper_inited) { - if (vp == NULL) - goto done; + if (strlen(swapfilename) == 0) { + /* + * If no swapfile name has been set, we'll + * use the default name. + * + * Also, this function is only called from the vm_pageout_scan thread + * via vm_consider_waking_compactor_swapper, + * so we don't need to worry about a race in checking/setting the name here. 
+ */ + strlcpy(swapfilename, SWAP_FILE_NAME, MAX_SWAPFILENAME_LEN); + } + namelen = (int)strlen(swapfilename) + SWAPFILENAME_INDEX_LEN + 1; + pathname = (char*)kalloc(namelen); + memset(pathname, 0, namelen); + snprintf(pathname, namelen, "%s%d", swapfilename, 0); - if (vnode_pager_isSSD(vp) == FALSE) - vm_pageout_reinit_tuneables(); - vnode_setswapmount(vp); - vm_swappin_avail = vnode_getswappin_avail(vp); - vm_swapfile_close((uint64_t)pathname, vp); -done: - kfree(pathname, namelen); + vm_swapfile_open(pathname, &vp); + + if (vp) { + + if (vnode_pager_isSSD(vp) == FALSE) { + vm_compressor_minorcompact_threshold_divisor = 18; + vm_compressor_majorcompact_threshold_divisor = 22; + vm_compressor_unthrottle_threshold_divisor = 32; + } + vnode_setswapmount(vp); + vm_swappin_avail = vnode_getswappin_avail(vp); + + if (vm_swappin_avail) + vm_swappin_enabled = TRUE; + vm_swapfile_close((uint64_t)pathname, vp); + } + kfree(pathname, namelen); + + compaction_swapper_inited = 1; + } + lck_mtx_unlock(&vm_swap_data_lock); } + #if ENCRYPTED_SWAP void vm_swap_encrypt(c_segment_t c_seg) @@ -314,6 +338,9 @@ vm_swap_encrypt(c_segment_t c_seg) assert(swap_crypt_ctx_initialized); +#if DEVELOPMENT || DEBUG + C_SEG_MAKE_WRITEABLE(c_seg); +#endif bzero(&encrypt_iv.aes_iv[0], sizeof (encrypt_iv.aes_iv)); encrypt_iv.c_seg = (void*)c_seg; @@ -338,6 +365,10 @@ vm_swap_encrypt(c_segment_t c_seg) &swap_crypt_ctx.encrypt); vm_page_encrypt_counter += (size/PAGE_SIZE_64); + +#if DEVELOPMENT || DEBUG + C_SEG_WRITE_PROTECT(c_seg); +#endif } void @@ -355,6 +386,9 @@ vm_swap_decrypt(c_segment_t c_seg) assert(swap_crypt_ctx_initialized); +#if DEVELOPMENT || DEBUG + C_SEG_MAKE_WRITEABLE(c_seg); +#endif /* * Prepare an "initial vector" for the decryption. 
* It has to be the same as the "initial vector" we @@ -384,6 +418,10 @@ vm_swap_decrypt(c_segment_t c_seg) &swap_crypt_ctx.decrypt); vm_page_decrypt_counter += (size/PAGE_SIZE_64); + +#if DEVELOPMENT || DEBUG + C_SEG_WRITE_PROTECT(c_seg); +#endif } #endif /* ENCRYPTED_SWAP */ @@ -465,8 +503,8 @@ vm_swap_defragment() } else { lck_mtx_unlock_always(c_list_lock); - c_seg_swapin(c_seg, TRUE); - lck_mtx_unlock_always(&c_seg->c_lock); + if (c_seg_swapin(c_seg, TRUE, FALSE) == 0) + lck_mtx_unlock_always(&c_seg->c_lock); vm_swap_defragment_swapin++; } @@ -513,6 +551,9 @@ vm_swapfile_create_thread(void) lck_mtx_lock(&vm_swap_data_lock); + if (hibernate_in_progress_with_pinned_swap == TRUE) + break; + clock_get_system_nanotime(&sec, &nsec); if (VM_SWAP_SHOULD_CREATE(sec) == 0) @@ -529,6 +570,9 @@ vm_swapfile_create_thread(void) } vm_swapfile_create_thread_running = 0; + if (hibernate_in_progress_with_pinned_swap == TRUE) + thread_wakeup((event_t)&hibernate_in_progress_with_pinned_swap); + assert_wait((event_t)&vm_swapfile_create_needed, THREAD_UNINT); lck_mtx_unlock(&vm_swap_data_lock); @@ -539,8 +583,59 @@ vm_swapfile_create_thread(void) } +#if HIBERNATION + +kern_return_t +hibernate_pin_swap(boolean_t start) +{ + vm_compaction_swapper_do_init(); + + if (start == FALSE) { + + lck_mtx_lock(&vm_swap_data_lock); + hibernate_in_progress_with_pinned_swap = FALSE; + lck_mtx_unlock(&vm_swap_data_lock); + + return (KERN_SUCCESS); + } + if (vm_swappin_enabled == FALSE) + return (KERN_SUCCESS); + + lck_mtx_lock(&vm_swap_data_lock); + + hibernate_in_progress_with_pinned_swap = TRUE; + + while (vm_swapfile_create_thread_running || vm_swapfile_gc_thread_running) { + + assert_wait((event_t)&hibernate_in_progress_with_pinned_swap, THREAD_UNINT); + + lck_mtx_unlock(&vm_swap_data_lock); + + thread_block(THREAD_CONTINUE_NULL); + + lck_mtx_lock(&vm_swap_data_lock); + } + if (vm_num_swap_files > vm_num_pinned_swap_files) { + hibernate_in_progress_with_pinned_swap = FALSE; + 
lck_mtx_unlock(&vm_swap_data_lock); + + HIBLOG("hibernate_pin_swap failed - vm_num_swap_files = %d, vm_num_pinned_swap_files = %d\n", + vm_num_swap_files, vm_num_pinned_swap_files); + return (KERN_FAILURE); + } + lck_mtx_unlock(&vm_swap_data_lock); + + while (VM_SWAP_SHOULD_PIN(MAX_SWAP_FILE_SIZE)) { + if (vm_swap_create_file() == FALSE) + break; + } + return (KERN_SUCCESS); +} +#endif + static void vm_swapfile_gc_thread(void) + { boolean_t need_defragment; boolean_t need_reclaim; @@ -552,6 +647,9 @@ vm_swapfile_gc_thread(void) lck_mtx_lock(&vm_swap_data_lock); + if (hibernate_in_progress_with_pinned_swap == TRUE) + break; + if (VM_SWAP_BUSY() || compressor_store_stop_compaction == TRUE) break; @@ -577,6 +675,9 @@ vm_swapfile_gc_thread(void) } vm_swapfile_gc_thread_running = 0; + if (hibernate_in_progress_with_pinned_swap == TRUE) + thread_wakeup((event_t)&hibernate_in_progress_with_pinned_swap); + assert_wait((event_t)&vm_swapfile_gc_needed, THREAD_UNINT); lck_mtx_unlock(&vm_swap_data_lock); @@ -652,10 +753,10 @@ vm_swapout_thread_throttle_adjust(void) } done: if (swapper_throttle != swapper_throttle_new) { - proc_set_task_policy_thread(kernel_task, vm_swapout_thread_id, - TASK_POLICY_INTERNAL, TASK_POLICY_IO, swapper_throttle_new); - proc_set_task_policy_thread(kernel_task, vm_swapout_thread_id, - TASK_POLICY_INTERNAL, TASK_POLICY_PASSIVE_IO, TASK_POLICY_ENABLE); + proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id, + TASK_POLICY_INTERNAL, TASK_POLICY_IO, swapper_throttle_new); + proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id, + TASK_POLICY_INTERNAL, TASK_POLICY_PASSIVE_IO, TASK_POLICY_ENABLE); swapper_throttle = swapper_throttle_new; } @@ -664,7 +765,6 @@ vm_swapout_thread_throttle_adjust(void) int vm_swapout_found_empty = 0; - static void vm_swapout_thread(void) { @@ -704,6 +804,9 @@ vm_swapout_thread(void) if (size == 0) { assert(c_seg->c_bytes_used == 0); + if (!c_seg->c_on_minorcompact_q) + 
c_seg_need_delayed_compaction(c_seg, TRUE); + c_seg_switch_state(c_seg, C_IS_EMPTY, FALSE); lck_mtx_unlock_always(&c_seg->c_lock); lck_mtx_unlock_always(c_list_lock); @@ -736,8 +839,13 @@ vm_swapout_thread(void) PAGE_REPLACEMENT_DISALLOWED(TRUE); if (kr == KERN_SUCCESS) { - kernel_memory_depopulate(kernel_map, (vm_offset_t) addr, size, KMA_COMPRESSOR); + kernel_memory_depopulate(compressor_map, (vm_offset_t) addr, size, KMA_COMPRESSOR); } +#if ENCRYPTED_SWAP + else { + vm_swap_decrypt(c_seg); + } +#endif /* ENCRYPTED_SWAP */ lck_mtx_lock_spin_always(c_list_lock); lck_mtx_lock_spin_always(&c_seg->c_lock); @@ -761,18 +869,21 @@ vm_swapout_thread(void) if (c_seg->c_bytes_used) OSAddAtomic64(-c_seg->c_bytes_used, &compressor_bytes_used); } else { -#if ENCRYPTED_SWAP - vm_swap_decrypt(c_seg); -#endif /* ENCRYPTED_SWAP */ if (c_seg->c_overage_swap == TRUE) { c_seg->c_overage_swap = FALSE; c_overage_swapped_count--; } c_seg_switch_state(c_seg, C_ON_AGE_Q, FALSE); + + if (!c_seg->c_on_minorcompact_q && C_SEG_UNUSED_BYTES(c_seg) >= PAGE_SIZE) + c_seg_need_delayed_compaction(c_seg, TRUE); } - lck_mtx_unlock_always(c_list_lock); + assert(c_seg->c_busy_swapping); + assert(c_seg->c_busy); c_seg->c_busy_swapping = 0; + lck_mtx_unlock_always(c_list_lock); + C_SEG_WAKEUP_DONE(c_seg); lck_mtx_unlock_always(&c_seg->c_lock); @@ -805,6 +916,15 @@ vm_swap_create_file() boolean_t swap_file_pin = FALSE; struct swapfile *swf = NULL; + /* + * make sure we've got all the info we need + * to potentially pin a swap file... we could + * be swapping out due to hibernation w/o ever + * having run vm_pageout_scan, which is normally + * the trigger to do the init + */ + vm_compaction_swapper_do_init(); + /* * Any swapfile structure ready for re-use? 
*/ @@ -911,6 +1031,7 @@ vm_swap_create_file() vm_swapfile_total_segs_alloced += swf->swp_nsegs; if (swap_file_pin == TRUE) { + vm_num_pinned_swap_files++; swf->swp_flags |= SWAP_PINNED; vm_swappin_avail -= swf->swp_size; } @@ -940,15 +1061,13 @@ vm_swap_create_file() kern_return_t -vm_swap_get(vm_offset_t addr, uint64_t f_offset, uint64_t size) +vm_swap_get(c_segment_t c_seg, uint64_t f_offset, uint64_t size) { struct swapfile *swf = NULL; uint64_t file_offset = 0; int retval = 0; - if (addr == 0) { - return KERN_FAILURE; - } + assert(c_seg->c_store.c_buffer); lck_mtx_lock(&vm_swap_data_lock); @@ -962,9 +1081,15 @@ vm_swap_get(vm_offset_t addr, uint64_t f_offset, uint64_t size) lck_mtx_unlock(&vm_swap_data_lock); +#if DEVELOPMENT || DEBUG + C_SEG_MAKE_WRITEABLE(c_seg); +#endif file_offset = (f_offset & SWAP_SLOT_MASK); - retval = vm_swapfile_io(swf->swp_vp, file_offset, addr, (int)(size / PAGE_SIZE_64), SWAP_READ); + retval = vm_swapfile_io(swf->swp_vp, file_offset, c_seg->c_store.c_buffer, (int)(size / PAGE_SIZE_64), SWAP_READ); +#if DEVELOPMENT || DEBUG + C_SEG_WRITE_PROTECT(c_seg); +#endif if (retval == 0) VM_STAT_INCR_BY(swapins, size >> PAGE_SHIFT); else @@ -1357,7 +1482,7 @@ vm_swap_reclaim(void) c_segment_t c_seg = NULL; - if (kernel_memory_allocate(kernel_map, (vm_offset_t *)(&addr), C_SEG_BUFSIZE, 0, KMA_KOBJECT, VM_KERN_MEMORY_COMPRESSOR) != KERN_SUCCESS) { + if (kernel_memory_allocate(compressor_map, (vm_offset_t *)(&addr), C_SEG_BUFSIZE, 0, KMA_KOBJECT, VM_KERN_MEMORY_COMPRESSOR) != KERN_SUCCESS) { panic("vm_swap_reclaim: kernel_memory_allocate failed\n"); } @@ -1500,7 +1625,7 @@ vm_swap_reclaim(void) * reading the data back in failed, so convert c_seg * to a swapped in c_segment that contains no data */ - c_seg_swapin_requeue(c_seg, FALSE); + c_seg_swapin_requeue(c_seg, FALSE, TRUE, FALSE); /* * returns with c_busy_swapping cleared */ @@ -1519,7 +1644,7 @@ vm_swap_reclaim(void) */ c_buffer = (vm_offset_t)C_SEG_BUFFER_ADDRESS(c_seg->c_mysegno); - 
kernel_memory_populate(kernel_map, c_buffer, c_size, KMA_COMPRESSOR, VM_KERN_MEMORY_COMPRESSOR); + kernel_memory_populate(compressor_map, c_buffer, c_size, KMA_COMPRESSOR, VM_KERN_MEMORY_COMPRESSOR); memcpy((char *)c_buffer, (char *)addr, c_size); @@ -1527,7 +1652,7 @@ vm_swap_reclaim(void) #if ENCRYPTED_SWAP vm_swap_decrypt(c_seg); #endif /* ENCRYPTED_SWAP */ - c_seg_swapin_requeue(c_seg, TRUE); + c_seg_swapin_requeue(c_seg, TRUE, TRUE, FALSE); /* * returns with c_busy_swapping cleared */ @@ -1544,8 +1669,11 @@ vm_swap_reclaim(void) * The c_seg will now know about the new location on disk. */ c_seg->c_store.c_swap_handle = f_offset; + + assert(c_seg->c_busy_swapping); c_seg->c_busy_swapping = 0; swap_io_failed: + assert(c_seg->c_busy); C_SEG_WAKEUP_DONE(c_seg); lck_mtx_unlock_always(&c_seg->c_lock); @@ -1581,6 +1709,7 @@ vm_swap_reclaim(void) lck_mtx_lock(&vm_swap_data_lock); if (swf->swp_flags & SWAP_PINNED) { + vm_num_pinned_swap_files--; vm_swappin_avail += swf->swp_size; } @@ -1594,7 +1723,7 @@ vm_swap_reclaim(void) thread_wakeup((event_t) &swf->swp_flags); lck_mtx_unlock(&vm_swap_data_lock); - kmem_free(kernel_map, (vm_offset_t) addr, C_SEG_BUFSIZE); + kmem_free(compressor_map, (vm_offset_t) addr, C_SEG_BUFSIZE); } @@ -1642,3 +1771,16 @@ vm_swap_low_on_space(void) } return (0); } + +boolean_t +vm_swap_files_pinned(void) +{ + boolean_t result; + + if (vm_swappin_enabled == FALSE) + return(TRUE); + + result = (vm_num_pinned_swap_files == vm_num_swap_files); + + return (result); +} diff --git a/osfmk/vm/vm_compressor_backing_store.h b/osfmk/vm/vm_compressor_backing_store.h index dc22fe7e6..ea3075050 100644 --- a/osfmk/vm/vm_compressor_backing_store.h +++ b/osfmk/vm/vm_compressor_backing_store.h @@ -62,7 +62,6 @@ char swapfilename[MAX_SWAPFILENAME_LEN + 1]; #define SWAP_DEVICE_SHIFT 33 extern int vm_num_swap_files; -extern boolean_t vm_swap_up; struct swapfile; lck_grp_attr_t vm_swap_data_lock_grp_attr; diff --git a/osfmk/vm/vm_compressor_pager.c 
b/osfmk/vm/vm_compressor_pager.c index f4a1124ce..73e4dc5cc 100644 --- a/osfmk/vm/vm_compressor_pager.c +++ b/osfmk/vm/vm_compressor_pager.c @@ -61,10 +61,10 @@ #include #include +#include #include #include -#include #include #include @@ -828,6 +828,8 @@ vm_compressor_pager_state_clr( compressor_slot_t *slot_p; unsigned int num_slots_freed; + assert(VM_CONFIG_COMPRESSOR_IS_PRESENT); + compressor_pager_stats.state_clr++; if ((uint32_t)(offset/PAGE_SIZE) != (offset/PAGE_SIZE)) { @@ -859,6 +861,8 @@ vm_compressor_pager_state_get( { compressor_pager_t pager; compressor_slot_t *slot_p; + + assert(VM_CONFIG_COMPRESSOR_IS_PRESENT); compressor_pager_stats.state_get++; diff --git a/osfmk/vm/vm_compressor_pager.h b/osfmk/vm/vm_compressor_pager.h index 729a30ca0..e723c9012 100644 --- a/osfmk/vm/vm_compressor_pager.h +++ b/osfmk/vm/vm_compressor_pager.h @@ -62,9 +62,7 @@ extern vm_external_state_t vm_compressor_pager_state_get( memory_object_offset_t offset); #define VM_COMPRESSOR_PAGER_STATE_GET(object, offset) \ - (((COMPRESSED_PAGER_IS_ACTIVE || \ - DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) && \ - (object)->internal && \ + (((object)->internal && \ (object)->pager != NULL && \ !(object)->terminating && \ (object)->alive) \ @@ -74,9 +72,7 @@ extern vm_external_state_t vm_compressor_pager_state_get( #define VM_COMPRESSOR_PAGER_STATE_CLR(object, offset) \ MACRO_BEGIN \ - if ((COMPRESSED_PAGER_IS_ACTIVE || \ - DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) && \ - (object)->internal && \ + if ((object)->internal && \ (object)->pager != NULL && \ !(object)->terminating && \ (object)->alive) { \ diff --git a/osfmk/vm/vm_debug.c b/osfmk/vm/vm_debug.c index 1abeb5164..e29eed60f 100644 --- a/osfmk/vm/vm_debug.c +++ b/osfmk/vm/vm_debug.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -91,7 +91,7 @@ #define __DEBUG_ONLY #endif /* !MACH_VM_DEBUG */ -#if VM32_SUPPORT +#ifdef VM32_SUPPORT #include #include diff --git a/osfmk/vm/vm_external.c b/osfmk/vm/vm_external.c deleted file mode 100644 index db0c32d09..000000000 --- a/osfmk/vm/vm_external.c +++ /dev/null @@ -1,292 +0,0 @@ -/* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * Mach Operating System - * Copyright (c) 1991,1990,1989 Carnegie Mellon University - * All Rights Reserved. 
- * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ -/* - */ - -/* - * This module maintains information about the presence of - * pages not in memory. Since an external memory object - * must maintain a complete knowledge of its contents, this - * information takes the form of hints. - */ -#include /* for memcpy()/memset() */ - -#include -#include -#include -#include -#include - -/* - * The implementation uses bit arrays to record whether - * a page has been written to external storage. For - * convenience, these bit arrays come in various sizes. - * For example, a map N bytes long can record: - * - * 16 bytes = 128 pages = (@ 4KB/page) 512KB - * 1024 bytes = 8192 pages = (@ 4KB/page) 32MB - * 4096 bytes = 32768 pages = (@ 4KB/page) 128MB - * - * For a 32-bit machine with 4KB pages, the largest size - * would be 128KB = 32 pages. Machines with a larger page - * size are more efficient. - * - * This subsystem must be very careful about memory allocation, - * since vm_external_create() is almost always called with - * vm_privilege set. 
The largest map to be allocated must be less - * than or equal to a single page, and the kalloc subsystem must - * never allocate more than a single page in response to a kalloc() - * request. Also, vm_external_destroy() must not take any blocking - * locks, since it is called with a vm_object lock held. This - * implies that kfree() MUST be implemented in terms of zfree() - * NOT kmem_free() for all request sizes that this subsystem uses. - * - * For efficiency, this subsystem knows that the kalloc() subsystem - * is implemented in terms of power-of-2 allocation, and that the - * minimum allocation unit is KALLOC_MINSIZE - * - * XXXO - * Should consider using existence_map to hold bits directly - * when existence_size <= 4 bytes (i.e., 32 pages). - */ - -#define SMALL_SIZE KALLOC_MINSIZE -#define LARGE_SIZE PAGE_SIZE - -static vm_object_size_t power_of_2(vm_object_size_t size); - -static vm_object_size_t -power_of_2(vm_object_size_t size) -{ - vm_object_size_t power; - - power = 2 * SMALL_SIZE; - while (power < size) { - power <<= 1; - } - return(power); -} - -vm_external_map_t -vm_external_create( - vm_object_offset_t size) -{ - vm_object_size_t bytes; - vm_external_map_t result = VM_EXTERNAL_NULL; - - bytes = stob(size); - if (bytes <= SMALL_SIZE) { - result = (vm_external_map_t)kalloc(SMALL_SIZE); - if (result != NULL) { - memset(result, 0, SMALL_SIZE); - } - } else if (bytes <= LARGE_SIZE) { - bytes = power_of_2(bytes); - - assert((vm_size_t) bytes == bytes); - result = (vm_external_map_t)kalloc((vm_size_t)bytes); - if (result != NULL) { - assert((size_t) bytes == bytes); - memset(result, 0, (size_t) bytes); - } - } - return(result); -} - -void -vm_external_destroy( - vm_external_map_t map, - vm_object_size_t size) -{ - vm_object_size_t bytes; - - if (map == VM_EXTERNAL_NULL) - return; - - bytes = stob(size); - if (bytes <= SMALL_SIZE) { - bytes = SMALL_SIZE; - } else { - bytes = power_of_2(bytes); - } - assert((vm_size_t) bytes == bytes); - kfree(map, 
(vm_size_t) bytes); -} - -/* - * Return the number of bytes needed for a vm_external_map given the - * size of the object to be mapped, i.e. the size of the map that was - * created by vm_external_create. - */ -vm_object_size_t -vm_external_map_size( - vm_object_size_t size) -{ - vm_object_size_t bytes; - - bytes = stob(size); - if (bytes != 0) { - if (bytes <= SMALL_SIZE) { - bytes = SMALL_SIZE; - } else { - bytes = power_of_2(bytes); - } - } - return bytes; -} - -void -vm_external_copy( - vm_external_map_t old_map, - vm_object_size_t old_size, - vm_external_map_t new_map) -{ - vm_object_size_t bytes; - - /* - * Cannot copy non-existent maps - */ - if ((old_map == VM_EXTERNAL_NULL) || (new_map == VM_EXTERNAL_NULL)) - return; - - /* - * Copy old map to new - */ - bytes = stob(old_size); - assert((size_t) bytes == bytes); - memcpy(new_map, old_map, (size_t) bytes); -} - -boolean_t -vm_external_within( - vm_object_size_t new_size, - vm_object_size_t old_size) -{ - vm_object_size_t new_bytes; - vm_object_size_t old_bytes; - - assert(new_size >= old_size); - - /* - * "old_bytes" is calculated to be the actual amount of space - * allocated for a map of size "old_size". - */ - old_bytes = stob(old_size); - if (old_bytes <= SMALL_SIZE) old_bytes = SMALL_SIZE; - else if (old_bytes <= LARGE_SIZE) old_bytes = power_of_2(old_bytes); - - /* - * "new_bytes" is the map size required to map the "new_size" object. 
- * Since the rounding algorithms are the same, we needn't actually - * round up new_bytes to get the correct answer - */ - new_bytes = stob(new_size); - - return(new_bytes <= old_bytes); -} - -vm_external_state_t -_vm_external_state_get( - vm_external_map_t map, - vm_object_offset_t offset) -{ - uint64_t bit, byte; - - assert (map != VM_EXTERNAL_NULL); - - bit = atop_64(offset); - byte = bit >> 3; - if (map[byte] & (1 << (bit & 07))) { - return VM_EXTERNAL_STATE_EXISTS; - } else { - return VM_EXTERNAL_STATE_ABSENT; - } -} - -void -vm_external_state_set( - vm_external_map_t map, - vm_object_offset_t offset) -{ - uint64_t bit, byte; - - if (map == VM_EXTERNAL_NULL) - return; - - bit = atop_64(offset); - byte = bit >> 3; - map[byte] |= (1 << (bit & 07)); -} - -void -vm_external_state_clr( - vm_external_map_t map, - vm_object_offset_t offset) -{ - uint64_t bit, byte; - - if (map == VM_EXTERNAL_NULL) - return; - - bit = atop_64(offset); - byte = bit >> 3; - map[byte] &= ~(1 << (bit & 07)); -} - -void -vm_external_module_initialize(void) -{ -} diff --git a/osfmk/vm/vm_external.h b/osfmk/vm/vm_external.h index 803c9753a..eb7d71692 100644 --- a/osfmk/vm/vm_external.h +++ b/osfmk/vm/vm_external.h @@ -63,17 +63,6 @@ #include #include -/* - * External page management hint technology - * - * The data structure exported by this module maintains - * a (potentially incomplete) map of the pages written - * to external storage for a range of virtual memory. - */ - -typedef char *vm_external_map_t; -#define VM_EXTERNAL_NULL ((char *) 0) - /* * The states that may be recorded for a page of external storage. */ @@ -83,65 +72,4 @@ typedef int vm_external_state_t; #define VM_EXTERNAL_STATE_UNKNOWN 2 #define VM_EXTERNAL_STATE_ABSENT 3 -/* - * Useful macros - */ -#define stob(s) ((atop_64((s)) + 07) >> 3) - -/* - * Routines exported by this module. 
- */ - /* Initialize the module */ -extern void vm_external_module_initialize(void); - - -extern vm_external_map_t vm_external_create( - /* Create a vm_external_map_t */ - vm_object_size_t size); - -extern void vm_external_destroy( - /* Destroy one */ - vm_external_map_t map, - vm_object_size_t size); - -extern vm_object_size_t vm_external_map_size( - /* Return size of map in bytes */ - vm_object_size_t size); - -extern void vm_external_copy( - /* Copy one into another */ - vm_external_map_t old_map, - vm_object_size_t old_size, - vm_external_map_t new_map); - -extern void vm_external_state_set( - /* Set state of a page to - * VM_EXTERNAL_STATE_EXISTS */ - vm_external_map_t map, - vm_object_offset_t offset); - -extern void vm_external_state_clr( - /* clear page state - */ - vm_external_map_t map, - vm_object_offset_t offset); - -#define vm_external_state_get(map, offset) \ - (((map) != VM_EXTERNAL_NULL) ? \ - _vm_external_state_get((map), (offset)) : \ - VM_EXTERNAL_STATE_UNKNOWN) - /* Retrieve the state for a - * given page, if known. 
*/ - -extern vm_external_state_t _vm_external_state_get( - /* HIDDEN routine */ - vm_external_map_t map, - vm_object_offset_t offset); - -boolean_t vm_external_within( - /* Check if new object size - * fits in current map */ - vm_object_size_t new_size, - vm_object_size_t old_size); - #endif /* VM_VM_EXTERNAL_H_ */ diff --git a/osfmk/vm/vm_fault.c b/osfmk/vm/vm_fault.c index 8a0d7b95f..b7cd1bacc 100644 --- a/osfmk/vm/vm_fault.c +++ b/osfmk/vm/vm_fault.c @@ -87,6 +87,7 @@ #include #include #include +#include #include #include @@ -104,6 +105,8 @@ #include #include +#include +#include #include /* for struct timespec */ @@ -129,6 +132,8 @@ unsigned int vm_object_pagein_throttle = 16; extern void throttle_lowpri_io(int); +extern struct vnode *vnode_pager_lookup_vnode(memory_object_t); + uint64_t vm_hard_throttle_threshold; @@ -193,7 +198,6 @@ unsigned long vm_cs_bitmap_validated = 0; void vm_pre_fault(vm_map_offset_t); -extern int not_in_kdp; extern char *kdp_compressor_decompressed_page; extern addr64_t kdp_compressor_decompressed_page_paddr; extern ppnum_t kdp_compressor_decompressed_page_ppnum; @@ -272,14 +276,14 @@ vm_fault_init(void) */ void vm_fault_cleanup( - register vm_object_t object, - register vm_page_t top_page) + vm_object_t object, + vm_page_t top_page) { vm_object_paging_end(object); vm_object_unlock(object); if (top_page != VM_PAGE_NULL) { - object = top_page->object; + object = VM_PAGE_OBJECT(top_page); vm_object_lock(object); VM_PAGE_FREE(top_page); @@ -536,7 +540,7 @@ vm_fault_deactivate_behind( for (n = 0; n < max_pages_in_run; n++) { m = vm_page_lookup(object, offset + run_offset + (n * pg_offset)); - if (m && !m->laundry && !m->busy && !m->no_cache && !m->throttled && !m->fictitious && !m->absent) { + if (m && !m->laundry && !m->busy && !m->no_cache && (m->vm_page_q_state != VM_PAGE_ON_THROTTLED_Q) && !m->fictitious && !m->absent) { page_run[pages_in_run++] = m; /* @@ -550,7 +554,7 @@ vm_fault_deactivate_behind( * in the past (TLB caches don't 
hang around for very long), and of course could just as easily * have happened before we did the deactivate_behind. */ - pmap_clear_refmod_options(m->phys_page, VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL); + pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(m), VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL); } } if (pages_in_run) { @@ -607,7 +611,7 @@ vm_page_throttled(boolean_t page_kept) return (HARD_THROTTLE_DELAY); } - if ((vm_page_free_count < vm_page_throttle_limit || ((COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) && SWAPPER_NEEDS_TO_UNTHROTTLE())) && + if ((vm_page_free_count < vm_page_throttle_limit || (VM_CONFIG_COMPRESSOR_IS_PRESENT && SWAPPER_NEEDS_TO_UNTHROTTLE())) && thread->t_page_creation_count > (VM_PAGE_CREATION_THROTTLE_PERIOD_SECS * VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC)) { if (vm_page_free_wanted == 0 && vm_page_free_wanted_privileged == 0) { @@ -640,7 +644,7 @@ vm_page_throttled(boolean_t page_kept) thread->t_page_creation_throttled = 1; - if ((COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) && HARD_THROTTLE_LIMIT_REACHED()) { + if (VM_CONFIG_COMPRESSOR_IS_PRESENT && HARD_THROTTLE_LIMIT_REACHED()) { #if (DEVELOPMENT || DEBUG) thread->t_page_creation_throttled_hard++; OSAddAtomic(1, &vm_page_creation_throttled_hard); @@ -752,6 +756,9 @@ static int vm_fault_zero_page(vm_page_t m, boolean_t no_zero_fill) { int my_fault = DBG_ZERO_FILL_FAULT; + vm_object_t object; + + object = VM_PAGE_OBJECT(m); /* * This is is a zero-fill page fault... 
@@ -786,29 +793,27 @@ vm_fault_zero_page(vm_page_t m, boolean_t no_zero_fill) DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL); } assert(!m->laundry); - assert(m->object != kernel_object); - //assert(m->pageq.next == NULL && m->pageq.prev == NULL); + assert(object != kernel_object); + //assert(m->pageq.next == 0 && m->pageq.prev == 0); - if (!VM_DYNAMIC_PAGING_ENABLED(memory_manager_default) && - (m->object->purgable == VM_PURGABLE_DENY || - m->object->purgable == VM_PURGABLE_NONVOLATILE || - m->object->purgable == VM_PURGABLE_VOLATILE )) { + if (!VM_DYNAMIC_PAGING_ENABLED() && + (object->purgable == VM_PURGABLE_DENY || + object->purgable == VM_PURGABLE_NONVOLATILE || + object->purgable == VM_PURGABLE_VOLATILE )) { vm_page_lockspin_queues(); - if (!VM_DYNAMIC_PAGING_ENABLED(memory_manager_default)) { + if (!VM_DYNAMIC_PAGING_ENABLED()) { assert(!VM_PAGE_WIRED(m)); /* * can't be on the pageout queue since we don't * have a pager to try and clean to */ - assert(!m->pageout_queue); - - vm_page_queues_remove(m); + vm_page_queues_remove(m, TRUE); vm_page_check_pageable_safe(m); - queue_enter(&vm_page_queue_throttled, m, vm_page_t, pageq); - m->throttled = TRUE; + vm_page_queue_enter(&vm_page_queue_throttled, m, vm_page_t, pageq); + m->vm_page_q_state = VM_PAGE_ON_THROTTLED_Q; vm_page_throttled_count++; } vm_page_unlock_queues(); @@ -916,50 +921,25 @@ vm_fault_page( int external_state = VM_EXTERNAL_STATE_UNKNOWN; memory_object_t pager; vm_fault_return_t retval; + int grab_options; /* - * MACH page map - an optional optimization where a bit map is maintained - * by the VM subsystem for internal objects to indicate which pages of - * the object currently reside on backing store. This existence map - * duplicates information maintained by the vnode pager. It is - * created at the time of the first pageout against the object, i.e. - * at the same time pager for the object is created. 
The optimization - * is designed to eliminate pager interaction overhead, if it is - * 'known' that the page does not exist on backing store. - * * MUST_ASK_PAGER() evaluates to TRUE if the page specified by object/offset is - * either marked as paged out in the existence map for the object or no - * existence map exists for the object. MUST_ASK_PAGER() is one of the - * criteria in the decision to invoke the pager. It is also used as one - * of the criteria to terminate the scan for adjacent pages in a clustered - * pagein operation. Note that MUST_ASK_PAGER() always evaluates to TRUE for - * permanent objects. Note also that if the pager for an internal object + * marked as paged out in the compressor pager or the pager doesn't exist. + * Note also that if the pager for an internal object * has not been created, the pager is not invoked regardless of the value - * of MUST_ASK_PAGER() and that clustered pagein scans are only done on an object - * for which a pager has been created. + * of MUST_ASK_PAGER(). * * PAGED_OUT() evaluates to TRUE if the page specified by the object/offset - * is marked as paged out in the existence map for the object. PAGED_OUT() + * is marked as paged out in the compressor pager. * PAGED_OUT() is used to determine if a page has already been pushed * into a copy object in order to avoid a redundant page out operation. 
*/ -#if MACH_PAGEMAP -#define MUST_ASK_PAGER(o, f, s) \ - ((vm_external_state_get((o)->existence_map, (f)) \ - != VM_EXTERNAL_STATE_ABSENT) && \ - (s = (VM_COMPRESSOR_PAGER_STATE_GET((o), (f)))) \ - != VM_EXTERNAL_STATE_ABSENT) -#define PAGED_OUT(o, f) \ - ((vm_external_state_get((o)->existence_map, (f)) \ - == VM_EXTERNAL_STATE_EXISTS) || \ - (VM_COMPRESSOR_PAGER_STATE_GET((o), (f)) \ - == VM_EXTERNAL_STATE_EXISTS)) -#else /* MACH_PAGEMAP */ #define MUST_ASK_PAGER(o, f, s) \ ((s = VM_COMPRESSOR_PAGER_STATE_GET((o), (f))) != VM_EXTERNAL_STATE_ABSENT) + #define PAGED_OUT(o, f) \ (VM_COMPRESSOR_PAGER_STATE_GET((o), (f)) == VM_EXTERNAL_STATE_EXISTS) -#endif /* MACH_PAGEMAP */ /* * Recovery actions @@ -967,16 +947,16 @@ vm_fault_page( #define RELEASE_PAGE(m) \ MACRO_BEGIN \ PAGE_WAKEUP_DONE(m); \ - if (!m->active && !m->inactive && !m->throttled) { \ - vm_page_lockspin_queues(); \ - if (!m->active && !m->inactive && !m->throttled) { \ - if (COMPRESSED_PAGER_IS_ACTIVE) \ - vm_page_deactivate(m); \ - else \ - vm_page_activate(m); \ - } \ - vm_page_unlock_queues(); \ - } \ + if ( !VM_PAGE_PAGEABLE(m)) { \ + vm_page_lockspin_queues(); \ + if ( !VM_PAGE_PAGEABLE(m)) { \ + if (VM_CONFIG_COMPRESSOR_IS_ACTIVE) \ + vm_page_deactivate(m); \ + else \ + vm_page_activate(m); \ + } \ + vm_page_unlock_queues(); \ + } \ MACRO_END #if TRACEFAULTPAGE @@ -1031,6 +1011,14 @@ vm_fault_page( #if TRACEFAULTPAGE dbgTrace(0xBEEF0003, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */ #endif + + grab_options = 0; +#if CONFIG_SECLUDED_MEMORY + if (object->can_grab_secluded) { + grab_options |= VM_PAGE_GRAB_SECLUDED; + } +#endif /* CONFIG_SECLUDED_MEMORY */ + if (!object->alive) { /* * object is no longer valid @@ -1122,12 +1110,12 @@ vm_fault_page( continue; } if (m->laundry) { - m->pageout = FALSE; + m->free_when_done = FALSE; if (!m->cleaning) vm_pageout_steal_laundry(m, FALSE); } - if (m->phys_page == vm_page_guard_addr) { + if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) { /* * 
Guard page: off limits ! */ @@ -1284,10 +1272,7 @@ vm_fault_page( m->busy = TRUE; vm_page_lockspin_queues(); - - assert(!m->pageout_queue); - vm_page_queues_remove(m); - + vm_page_queues_remove(m, FALSE); vm_page_unlock_queues(); } XPR(XPR_VM_FAULT, @@ -1364,7 +1349,7 @@ vm_fault_page( return (VM_FAULT_RETRY); } } - if (type_of_fault == NULL && m->speculative && + if (type_of_fault == NULL && (m->vm_page_q_state == VM_PAGE_ON_SPECULATIVE_Q) && !(fault_info != NULL && fault_info->stealth)) { /* * If we were passed a non-NULL pointer for @@ -1382,10 +1367,11 @@ vm_fault_page( * the page in the speculative queue. */ vm_page_lockspin_queues(); - if (m->speculative) - vm_page_queues_remove(m); + if (m->vm_page_q_state == VM_PAGE_ON_SPECULATIVE_Q) + vm_page_queues_remove(m, FALSE); vm_page_unlock_queues(); } + assert(object == VM_PAGE_OBJECT(m)); if (m->encrypted) { /* @@ -1398,7 +1384,6 @@ vm_fault_page( */ m->busy = TRUE; vm_page_decrypt(m, 0); - assert(object == m->object); assert(m->busy); PAGE_WAKEUP_DONE(m); @@ -1411,7 +1396,7 @@ vm_fault_page( } ASSERT_PAGE_DECRYPTED(m); - if (m->object->code_signed) { + if (object->code_signed) { /* * CODE SIGNING: * We just paged in a page from a signed @@ -1469,10 +1454,7 @@ vm_fault_page( goto dont_look_for_page; } - -#if !MACH_PAGEMAP data_supply = FALSE; -#endif /* !MACH_PAGEMAP */ look_for_page = (object->pager_created && (MUST_ASK_PAGER(object, offset, external_state) == TRUE) && !data_supply); @@ -1483,7 +1465,7 @@ vm_fault_page( /* * Allocate a new page for this object/offset pair as a placeholder */ - m = vm_page_grab(); + m = vm_page_grab_options(grab_options); #if TRACEFAULTPAGE dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */ #endif @@ -1586,16 +1568,16 @@ vm_fault_page( return (VM_FAULT_RETRY); } } - if (object->internal && - (COMPRESSED_PAGER_IS_ACTIVE - || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE)) { + if (object->internal) { int compressed_count_delta; + 
assert(VM_CONFIG_COMPRESSOR_IS_PRESENT); + if (m == VM_PAGE_NULL) { /* * Allocate a new page for this object/offset pair as a placeholder */ - m = vm_page_grab(); + m = vm_page_grab_options(grab_options); #if TRACEFAULTPAGE dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */ #endif @@ -1625,7 +1607,7 @@ vm_fault_page( rc = vm_compressor_pager_get( pager, offset + object->paging_offset, - m->phys_page, + VM_PAGE_GET_PHYS_PAGE(m), &my_fault_type, 0, &compressed_count_delta); @@ -1658,7 +1640,7 @@ vm_fault_page( case KERN_SUCCESS: m->absent = FALSE; m->dirty = TRUE; - if ((m->object->wimg_bits & + if ((object->wimg_bits & VM_WIMG_MASK) != VM_WIMG_USE_DEFAULT) { /* @@ -1668,7 +1650,7 @@ vm_fault_page( * after the decompression. */ pmap_sync_page_attributes_phys( - m->phys_page); + VM_PAGE_GET_PHYS_PAGE(m)); } else { m->written_by_kernel = TRUE; } @@ -1786,7 +1768,7 @@ vm_fault_page( * the fault w/o having to go through memory_object_data_request again */ assert(first_m != VM_PAGE_NULL); - assert(first_m->object == first_object); + assert(VM_PAGE_OBJECT(first_m) == first_object); vm_object_lock(first_object); VM_PAGE_FREE(first_m); @@ -1931,7 +1913,7 @@ vm_fault_page( vm_object_lock(object); } m = first_m; - assert(m->object == object); + assert(VM_PAGE_OBJECT(m) == object); first_m = VM_PAGE_NULL; /* @@ -1947,7 +1929,7 @@ vm_fault_page( return (error); if (m == VM_PAGE_NULL) { - m = vm_page_grab(); + m = vm_page_grab_options(grab_options); if (m == VM_PAGE_NULL) { vm_fault_cleanup(object, VM_PAGE_NULL); @@ -2010,7 +1992,7 @@ vm_fault_page( assert(m->busy && !m->absent); assert((first_m == VM_PAGE_NULL) || (first_m->busy && !first_m->absent && - !first_m->active && !first_m->inactive)); + !first_m->active && !first_m->inactive && !first_m->secluded)); #endif /* EXTRA_ASSERTIONS */ /* @@ -2083,7 +2065,7 @@ vm_fault_page( /* * Allocate a page for the copy */ - copy_m = vm_page_grab(); + copy_m = vm_page_grab_options(grab_options); if (copy_m 
== VM_PAGE_NULL) { RELEASE_PAGE(m); @@ -2113,7 +2095,7 @@ vm_fault_page( * avoid the pmap_disconnect() call. */ if (m->pmapped) - pmap_disconnect(m->phys_page); + pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)); if (m->clustered) { VM_PAGE_COUNT_AS_PAGEIN(m); @@ -2126,6 +2108,16 @@ vm_fault_page( */ RELEASE_PAGE(m); + /* + * This check helps with marking the object as having a sequential pattern + * Normally we'll miss doing this below because this fault is about COW to + * the first_object i.e. bring page in from disk, push to object above but + * don't update the file object's sequential pattern. + */ + if (object->internal == FALSE) { + vm_fault_is_sequential(object, offset, fault_info->behavior); + } + vm_object_paging_end(object); vm_object_unlock(object); @@ -2345,7 +2337,7 @@ vm_fault_page( * pmaps use it.) */ if (m->pmapped) - pmap_disconnect(m->phys_page); + pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)); if (m->clustered) { VM_PAGE_COUNT_AS_PAGEIN(m); @@ -2357,11 +2349,8 @@ vm_fault_page( * option. Else, we use the copy. */ if ((!copy_object->pager_ready) -#if MACH_PAGEMAP - || vm_external_state_get(copy_object->existence_map, copy_offset) == VM_EXTERNAL_STATE_ABSENT -#endif || VM_COMPRESSOR_PAGER_STATE_GET(copy_object, copy_offset) == VM_EXTERNAL_STATE_ABSENT - ) { + ) { vm_page_lockspin_queues(); assert(!m->cleaning); @@ -2371,69 +2360,6 @@ vm_fault_page( SET_PAGE_DIRTY(copy_m, TRUE); PAGE_WAKEUP_DONE(copy_m); - } else if (copy_object->internal && - (DEFAULT_PAGER_IS_ACTIVE || DEFAULT_FREEZER_IS_ACTIVE)) { - /* - * For internal objects check with the pager to see - * if the page already exists in the backing store. - * If yes, then we can drop the copy page. If not, - * then we'll activate it, mark it dirty and keep it - * around. 
- */ - - kern_return_t kr = KERN_SUCCESS; - - memory_object_t copy_pager = copy_object->pager; - assert(copy_pager != MEMORY_OBJECT_NULL); - vm_object_paging_begin(copy_object); - - vm_object_unlock(copy_object); - - kr = memory_object_data_request( - copy_pager, - copy_offset + copy_object->paging_offset, - 0, /* Only query the pager. */ - VM_PROT_READ, - NULL); - - vm_object_lock(copy_object); - - vm_object_paging_end(copy_object); - - /* - * Since we dropped the copy_object's lock, - * check whether we'll have to deallocate - * the hard way. - */ - if ((copy_object->shadow != object) || (copy_object->ref_count == 1)) { - vm_object_unlock(copy_object); - vm_object_deallocate(copy_object); - vm_object_lock(object); - - continue; - } - if (kr == KERN_SUCCESS) { - /* - * The pager has the page. We don't want to overwrite - * that page by sending this one out to the backing store. - * So we drop the copy page. - */ - VM_PAGE_FREE(copy_m); - - } else { - /* - * The pager doesn't have the page. We'll keep this one - * around in the copy object. It might get sent out to - * the backing store under memory pressure. - */ - vm_page_lockspin_queues(); - assert(!m->cleaning); - vm_page_activate(copy_m); - vm_page_unlock_queues(); - - SET_PAGE_DIRTY(copy_m, TRUE); - PAGE_WAKEUP_DONE(copy_m); - } } else { assert(copy_m->busy == TRUE); @@ -2521,13 +2447,15 @@ vm_fault_page( object, offset, m, first_m, 0); if (m != VM_PAGE_NULL) { + assert(VM_PAGE_OBJECT(m) == object); + retval = VM_FAULT_SUCCESS; if (my_fault == DBG_PAGEIN_FAULT) { VM_PAGE_COUNT_AS_PAGEIN(m); - if (m->object->internal) + if (object->internal) my_fault = DBG_PAGEIND_FAULT; else my_fault = DBG_PAGEINV_FAULT; @@ -2579,10 +2507,10 @@ vm_fault_page( * 3. the page belongs to a code-signed object * 4. the page has not been validated yet or has been mapped for write. 
*/ -#define VM_FAULT_NEED_CS_VALIDATION(pmap, page) \ +#define VM_FAULT_NEED_CS_VALIDATION(pmap, page, page_obj) \ ((pmap) != kernel_pmap /*1*/ && \ !(page)->cs_tainted /*2*/ && \ - (page)->object->code_signed /*3*/ && \ + (page_obj)->code_signed /*3*/ && \ (!(page)->cs_validated || (page)->wpmapped /*4*/)) @@ -2595,6 +2523,7 @@ vm_fault_page( * careful not to modify the VM object in any way that is not * legal under a shared lock... */ +extern int panic_on_cs_killed; extern int proc_selfpid(void); extern char *proc_name_address(void *p); unsigned long cs_enter_tainted_rejected = 0; @@ -2620,24 +2549,24 @@ vm_fault_enter(vm_page_t m, boolean_t map_is_switched, map_is_switch_protected; int cs_enforcement_enabled; vm_prot_t fault_type; + vm_object_t object; fault_type = change_wiring ? VM_PROT_NONE : caller_prot; + object = VM_PAGE_OBJECT(m); - vm_object_lock_assert_held(m->object); -#if DEBUG - lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED); -#endif /* DEBUG */ + vm_object_lock_assert_held(object); + LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED); - if (m->phys_page == vm_page_guard_addr) { + if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) { assert(m->fictitious); return KERN_SUCCESS; } if (*type_of_fault == DBG_ZERO_FILL_FAULT) { - vm_object_lock_assert_exclusive(m->object); + vm_object_lock_assert_exclusive(object); - } else if ((fault_type & VM_PROT_WRITE) == 0) { + } else if ((fault_type & VM_PROT_WRITE) == 0 && !m->wpmapped) { /* * This is not a "write" fault, so we * might not have taken the object lock @@ -2661,7 +2590,7 @@ vm_fault_enter(vm_page_t m, * so it must have come in as part of * a cluster... account 1 pagein against it */ - if (m->object->internal) + if (object->internal) *type_of_fault = DBG_PAGEIND_FAULT; else *type_of_fault = DBG_PAGEINV_FAULT; @@ -2681,8 +2610,8 @@ vm_fault_enter(vm_page_t m, } /* Validate code signature if necessary. 
*/ - if (VM_FAULT_NEED_CS_VALIDATION(pmap, m)) { - vm_object_lock_assert_exclusive(m->object); + if (VM_FAULT_NEED_CS_VALIDATION(pmap, m, object)) { + vm_object_lock_assert_exclusive(object); if (m->cs_validated) { vm_cs_revalidates++; @@ -2725,6 +2654,27 @@ vm_fault_enter(vm_page_t m, return KERN_CODESIGN_ERROR; } + if (cs_enforcement_enabled && + !m->cs_validated && + (prot & VM_PROT_EXECUTE) && + !(caller_prot & VM_PROT_EXECUTE)) { + /* + * FOURK PAGER: + * This page has not been validated and will not be + * allowed to be mapped for "execute". + * But the caller did not request "execute" access for this + * fault, so we should not raise a code-signing violation + * (and possibly kill the process) below. + * Instead, let's just remove the "execute" access request. + * + * This can happen on devices with a 4K page size if a 16K + * page contains a mix of signed&executable and + * unsigned&non-executable 4K pages, making the whole 16K + * mapping "executable". + */ + prot &= ~VM_PROT_EXECUTE; + } + /* A page could be tainted, or pose a risk of being tainted later. * Check whether the receiving process wants it, and make it feel * the consequences (that hapens in cs_invalid_page()). @@ -2764,7 +2714,7 @@ vm_fault_enter(vm_page_t m, * There is no point in invalidating the switching process since * it will not be executing from the map. So we don't call * cs_invalid_page() in that case. */ - boolean_t reject_page; + boolean_t reject_page, cs_killed; if(map_is_switched) { assert(pmap==vm_map_pmap(current_thread()->map)); assert(!(prot & VM_PROT_WRITE) || (map_is_switch_protected == FALSE)); @@ -2772,13 +2722,13 @@ vm_fault_enter(vm_page_t m, } else { if (cs_debug > 5) printf("vm_fault: signed: %s validate: %s tainted: %s wpmapped: %s slid: %s prot: 0x%x\n", - m->object->code_signed ? "yes" : "no", + object->code_signed ? "yes" : "no", m->cs_validated ? "yes" : "no", m->cs_tainted ? "yes" : "no", m->wpmapped ? "yes" : "no", m->slid ? 
"yes" : "no", (int)prot); - reject_page = cs_invalid_page((addr64_t) vaddr); + reject_page = cs_invalid_page((addr64_t) vaddr, &cs_killed); } if (reject_page) { @@ -2793,6 +2743,8 @@ vm_fault_enter(vm_page_t m, boolean_t truncated_path; #define __PATH_MAX 1024 struct timespec mtime, cs_mtime; + int shadow_depth; + os_reason_t codesigning_exit_reason = OS_REASON_NULL; kr = KERN_CODESIGN_ERROR; cs_enter_tainted_rejected++; @@ -2805,13 +2757,15 @@ vm_fault_enter(vm_page_t m, procname = proc_name_address(task->bsd_info); /* get file's VM object */ - file_object = m->object; + file_object = object; file_offset = m->offset; - for (shadow = file_object->shadow; + for (shadow = file_object->shadow, + shadow_depth = 0; shadow != VM_OBJECT_NULL; - shadow = file_object->shadow) { + shadow = file_object->shadow, + shadow_depth++) { vm_object_lock_shared(shadow); - if (file_object != m->object) { + if (file_object != object) { vm_object_unlock(file_object); } file_offset += file_object->vo_shadow_offset; @@ -2857,8 +2811,8 @@ vm_fault_enter(vm_page_t m, "rejecting invalid page at address 0x%llx " "from offset 0x%llx in file \"%s%s%s\" " "(cs_mtime:%lu.%ld %s mtime:%lu.%ld) " - "(signed:%d validated:%d tainted:%d " - "wpmapped:%d slid:%d)\n", + "(signed:%d validated:%d tainted:%d nx:%d " + "wpmapped:%d slid:%d dirty:%d depth:%d)\n", pid, procname, (addr64_t) vaddr, file_offset, (pathname ? pathname : ""), @@ -2870,12 +2824,108 @@ vm_fault_enter(vm_page_t m, ? "==" : "!="), mtime.tv_sec, mtime.tv_nsec, - m->object->code_signed, + object->code_signed, m->cs_validated, m->cs_tainted, + m->cs_nx, m->wpmapped, - m->slid); - if (file_object != m->object) { + m->slid, + m->dirty, + shadow_depth); + + /* + * We currently only generate an exit reason if cs_invalid_page directly killed a process. 
If cs_invalid_page + * did not kill the process (more the case on desktop), vm_fault_enter will not satisfy the fault and whether the + * process dies is dependent on whether there is a signal handler registered for SIGSEGV and how that handler + * will deal with the segmentation fault. + */ + if (cs_killed) { + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) | DBG_FUNC_NONE, + pid, OS_REASON_CODESIGNING, CODESIGNING_EXIT_REASON_INVALID_PAGE, 0, 0); + + codesigning_exit_reason = os_reason_create(OS_REASON_CODESIGNING, CODESIGNING_EXIT_REASON_INVALID_PAGE); + if (codesigning_exit_reason == NULL) { + printf("vm_fault_enter: failed to allocate codesigning exit reason\n"); + } else { + mach_vm_address_t data_addr = 0; + struct codesigning_exit_reason_info *ceri = NULL; + uint32_t reason_buffer_size_estimate = kcdata_estimate_required_buffer_size(1, sizeof(*ceri)); + + if (os_reason_alloc_buffer(codesigning_exit_reason, reason_buffer_size_estimate)) { + printf("vm_fault_enter: failed to allocate buffer for codesigning exit reason\n"); + } else { + if (KERN_SUCCESS == kcdata_get_memory_addr(&codesigning_exit_reason->osr_kcd_descriptor, + EXIT_REASON_CODESIGNING_INFO, sizeof(*ceri), &data_addr)) { + ceri = (struct codesigning_exit_reason_info *)data_addr; + static_assert(__PATH_MAX == sizeof(ceri->ceri_pathname)); + + ceri->ceri_virt_addr = vaddr; + ceri->ceri_file_offset = file_offset; + if (pathname) + strncpy((char *)&ceri->ceri_pathname, pathname, sizeof(ceri->ceri_pathname)); + else + ceri->ceri_pathname[0] = '\0'; + if (filename) + strncpy((char *)&ceri->ceri_filename, filename, sizeof(ceri->ceri_filename)); + else + ceri->ceri_filename[0] = '\0'; + ceri->ceri_path_truncated = (truncated_path); + ceri->ceri_codesig_modtime_secs = cs_mtime.tv_sec; + ceri->ceri_codesig_modtime_nsecs = cs_mtime.tv_nsec; + ceri->ceri_page_modtime_secs = mtime.tv_sec; + ceri->ceri_page_modtime_nsecs = mtime.tv_nsec; + ceri->ceri_object_codesigned = 
(object->code_signed); + ceri->ceri_page_codesig_validated = (m->cs_validated); + ceri->ceri_page_codesig_tainted = (m->cs_tainted); + ceri->ceri_page_codesig_nx = (m->cs_nx); + ceri->ceri_page_wpmapped = (m->wpmapped); + ceri->ceri_page_slid = (m->slid); + ceri->ceri_page_dirty = (m->dirty); + ceri->ceri_page_shadow_depth = shadow_depth; + } else { +#if DEBUG || DEVELOPMENT + panic("vm_fault_enter: failed to allocate kcdata for codesigning exit reason"); +#else + printf("vm_fault_enter: failed to allocate kcdata for codesigning exit reason\n"); +#endif /* DEBUG || DEVELOPMENT */ + /* Free the buffer */ + os_reason_alloc_buffer(codesigning_exit_reason, 0); + } + } + } + + set_thread_exit_reason(current_thread(), codesigning_exit_reason, FALSE); + } + if (panic_on_cs_killed && + object->object_slid) { + panic("CODE SIGNING: process %d[%s]: " + "rejecting invalid page at address 0x%llx " + "from offset 0x%llx in file \"%s%s%s\" " + "(cs_mtime:%lu.%ld %s mtime:%lu.%ld) " + "(signed:%d validated:%d tainted:%d nx:%d" + "wpmapped:%d slid:%d dirty:%d depth:%d)\n", + pid, procname, (addr64_t) vaddr, + file_offset, + (pathname ? pathname : ""), + (truncated_path ? "/.../" : ""), + (truncated_path ? filename : ""), + cs_mtime.tv_sec, cs_mtime.tv_nsec, + ((cs_mtime.tv_sec == mtime.tv_sec && + cs_mtime.tv_nsec == mtime.tv_nsec) + ? "==" + : "!="), + mtime.tv_sec, mtime.tv_nsec, + object->code_signed, + m->cs_validated, + m->cs_tainted, + m->cs_nx, + m->wpmapped, + m->slid, + m->dirty, + shadow_depth); + } + + if (file_object != object) { vm_object_unlock(file_object); } if (pathname_len != 0) { @@ -2887,7 +2937,7 @@ vm_fault_enter(vm_page_t m, /* proceed with the invalid page */ kr = KERN_SUCCESS; if (!m->cs_validated && - !m->object->code_signed) { + !object->code_signed) { /* * This page has not been (fully) validated but * does not belong to a code-signed object @@ -2959,13 +3009,17 @@ MACRO_END * the page queues. Change wiring * case is obvious. 
*/ - assert(m->compressor || m->object != compressor_object); - if (m->compressor) { + assert((m->vm_page_q_state == VM_PAGE_USED_BY_COMPRESSOR) || object != compressor_object); + +#if CONFIG_BACKGROUND_QUEUE + vm_page_update_background_state(m); +#endif + if (m->vm_page_q_state == VM_PAGE_USED_BY_COMPRESSOR) { /* * Compressor pages are neither wired * nor pageable and should never change. */ - assert(m->object == compressor_object); + assert(object == compressor_object); } else if (change_wiring) { __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED(); @@ -2979,24 +3033,33 @@ MACRO_END /* we keep the page queues lock, if we need it later */ } else { + if (object->internal == TRUE) { + /* + * don't allow anonymous pages on + * the speculative queues + */ + no_cache = FALSE; + } if (kr != KERN_SUCCESS) { __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED(); vm_page_deactivate(m); /* we keep the page queues lock, if we need it later */ - } else if (((!m->active && !m->inactive) || - m->clean_queue || - no_cache) && - !VM_PAGE_WIRED(m) && !m->throttled) { + } else if (((m->vm_page_q_state == VM_PAGE_NOT_ON_Q) || + (m->vm_page_q_state == VM_PAGE_ON_SPECULATIVE_Q) || + (m->vm_page_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) || + ((m->vm_page_q_state != VM_PAGE_ON_THROTTLED_Q) && no_cache)) && + !VM_PAGE_WIRED(m)) { - if (vm_page_local_q && - !no_cache && + if (vm_page_local_q && (*type_of_fault == DBG_COW_FAULT || *type_of_fault == DBG_ZERO_FILL_FAULT) ) { struct vpl *lq; uint32_t lid; + assert(m->vm_page_q_state == VM_PAGE_NOT_ON_Q); + __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED(); - vm_object_lock_assert_exclusive(m->object); + vm_object_lock_assert_exclusive(object); /* * we got a local queue to stuff this @@ -3009,7 +3072,7 @@ MACRO_END * we'll use the current cpu number to * select the queue note that we don't * need to disable preemption... 
we're - * going to behind the local queue's + * going to be behind the local queue's * lock to do the real work */ lid = cpu_number(); @@ -3019,13 +3082,13 @@ MACRO_END VPL_LOCK(&lq->vpl_lock); vm_page_check_pageable_safe(m); - queue_enter(&lq->vpl_queue, m, - vm_page_t, pageq); - m->local = TRUE; + vm_page_queue_enter(&lq->vpl_queue, m, + vm_page_t, pageq); + m->vm_page_q_state = VM_PAGE_ON_ACTIVE_LOCAL_Q; m->local_id = lid; lq->vpl_count++; - if (m->object->internal) + if (object->internal) lq->vpl_internal_count++; else lq->vpl_external_count++; @@ -3061,16 +3124,15 @@ MACRO_END * page queue lock */ if (!VM_PAGE_WIRED(m)) { - if (m->clean_queue) { - vm_page_queues_remove(m); + if (m->vm_page_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) { + vm_page_queues_remove(m, FALSE); vm_pageout_cleaned_reactivated++; vm_pageout_cleaned_fault_reactivated++; } - if ((!m->active && - !m->inactive) || - no_cache) { + if ( !VM_PAGE_ACTIVE_OR_INACTIVE(m) || + no_cache) { /* * If this is a no_cache mapping * and the page has never been @@ -3089,12 +3151,10 @@ MACRO_END m->no_cache)) { m->no_cache = TRUE; - if (!m->speculative) + if (m->vm_page_q_state != VM_PAGE_ON_SPECULATIVE_Q) vm_page_speculate(m, FALSE); - } else if (!m->active && - !m->inactive) { - + } else if ( !VM_PAGE_ACTIVE_OR_INACTIVE(m)) { vm_page_activate(m); } } @@ -3114,7 +3174,6 @@ MACRO_END * now so those processes can take note. 
*/ if (kr == KERN_SUCCESS) { - /* * NOTE: we may only hold the vm_object lock SHARED * at this point, so we need the phys_page lock to @@ -3122,8 +3181,9 @@ MACRO_END * xpmapped bits */ if ((prot & VM_PROT_EXECUTE) && !m->xpmapped) { + ppnum_t phys_page = VM_PAGE_GET_PHYS_PAGE(m); - pmap_lock_phys_page(m->phys_page); + pmap_lock_phys_page(phys_page); /* * go ahead and take the opportunity * to set 'pmapped' here so that we don't @@ -3136,14 +3196,13 @@ MACRO_END m->xpmapped = TRUE; - pmap_unlock_phys_page(m->phys_page); + pmap_unlock_phys_page(phys_page); - if (!m->object->internal) + if (!object->internal) OSAddAtomic(1, &vm_page_xpmapped_external_count); - if ((COMPRESSED_PAGER_IS_ACTIVE) && - m->object->internal && - m->object->pager != NULL) { + if (object->internal && + object->pager != NULL) { /* * This page could have been * uncompressed by the @@ -3155,21 +3214,24 @@ MACRO_END * make sure the icache is in * sync. */ - pmap_sync_page_data_phys(m->phys_page); + assert(VM_CONFIG_COMPRESSOR_IS_PRESENT); + pmap_sync_page_data_phys(phys_page); } } else - pmap_unlock_phys_page(m->phys_page); + pmap_unlock_phys_page(phys_page); } else { if (m->pmapped == FALSE) { - pmap_lock_phys_page(m->phys_page); + ppnum_t phys_page = VM_PAGE_GET_PHYS_PAGE(m); + + pmap_lock_phys_page(phys_page); m->pmapped = TRUE; - pmap_unlock_phys_page(m->phys_page); + pmap_unlock_phys_page(phys_page); } } if (vm_page_is_slideable(m)) { boolean_t was_busy = m->busy; - vm_object_lock_assert_exclusive(m->object); + vm_object_lock_assert_exclusive(object); m->busy = TRUE; kr = vm_page_slide(m, 0); @@ -3191,9 +3253,10 @@ MACRO_END if (fault_type & VM_PROT_WRITE) { if (m->wpmapped == FALSE) { - vm_object_lock_assert_exclusive(m->object); - if (!m->object->internal) - task_update_logical_writes(current_task(), PAGE_SIZE, TASK_WRITE_DEFERRED); + vm_object_lock_assert_exclusive(object); + if (!object->internal && object->pager) { + task_update_logical_writes(current_task(), PAGE_SIZE, 
TASK_WRITE_DEFERRED, vnode_pager_lookup_vnode(object->pager)); + } m->wpmapped = TRUE; } if (must_disconnect) { @@ -3202,7 +3265,7 @@ MACRO_END * because of the CSE logic */ assert(cs_enforcement_enabled); - pmap_disconnect(m->phys_page); + pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)); /* * If we are faulting for a write, we can clear * the execute bit - that will ensure the page is @@ -3217,6 +3280,7 @@ MACRO_END } } } + assert(VM_PAGE_OBJECT(m) == object); /* Prevent a deadlock by not * holding the object lock if we need to wait for a page in @@ -3251,17 +3315,19 @@ MACRO_END * the page busy and unlocking the object */ boolean_t was_busy = m->busy; - vm_object_lock_assert_exclusive(m->object); + vm_object_lock_assert_exclusive(object); m->busy = TRUE; - vm_object_unlock(m->object); + vm_object_unlock(object); PMAP_ENTER_OPTIONS(pmap, vaddr, m, prot, fault_type, 0, wired, pmap_options, pe_result); + assert(VM_PAGE_OBJECT(m) == object); + /* Take the object lock again. */ - vm_object_lock(m->object); + vm_object_lock(object); /* If the page was busy, someone else will wake it up. * Otherwise, we have to do it now. 
*/ @@ -3308,6 +3374,7 @@ vm_pre_fault(vm_map_offset_t vaddr) */ extern int _map_enter_debug; +extern uint64_t get_current_unique_pid(void); unsigned long vm_fault_collapse_total = 0; unsigned long vm_fault_collapse_skipped = 0; @@ -3353,6 +3420,7 @@ vm_fault_internal( vm_page_t m; /* Fast access to result_page */ kern_return_t error_code; vm_object_t cur_object; + vm_object_t m_object = NULL; vm_object_offset_t cur_offset; vm_page_t cur_m; vm_object_t new_object; @@ -3372,7 +3440,11 @@ vm_fault_internal( vm_object_t top_object = VM_OBJECT_NULL; int throttle_delay; int compressed_count_delta; + vm_map_offset_t real_vaddr; + int grab_options; + real_vaddr = vaddr; + vaddr = vm_map_trunc_page(vaddr, PAGE_MASK); KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_START, @@ -3518,6 +3590,13 @@ vm_fault_internal( cur_object = object; cur_offset = offset; + grab_options = 0; +#if CONFIG_SECLUDED_MEMORY + if (object->can_grab_secluded) { + grab_options |= VM_PAGE_GRAB_SECLUDED; + } +#endif /* CONFIG_SECLUDED_MEMORY */ + while (TRUE) { if (!cur_object->pager_created && cur_object->phys_contiguous) /* superpage */ @@ -3532,8 +3611,11 @@ vm_fault_internal( } m = vm_page_lookup(cur_object, cur_offset); + m_object = NULL; if (m != VM_PAGE_NULL) { + m_object = cur_object; + if (m->busy) { wait_result_t result; @@ -3583,7 +3665,7 @@ vm_fault_internal( continue; } } - if (m->pageout_queue && m->object->internal && COMPRESSED_PAGER_IS_ACTIVE) { + if ((m->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q) && m_object->internal) { /* * m->busy == TRUE and the object is locked exclusively * if m->pageout_queue == TRUE after we acquire the @@ -3593,9 +3675,11 @@ vm_fault_internal( * NOTE: this is only true for the internal pageout queue * in the compressor world */ + assert(VM_CONFIG_COMPRESSOR_IS_PRESENT); + vm_page_lock_queues(); - if (m->pageout_queue) { + if (m->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q) { vm_pageout_throttle_up(m); vm_page_unlock_queues(); @@ 
-3661,12 +3745,10 @@ vm_fault_internal( continue; } } - m->pageout = FALSE; - vm_pageout_steal_laundry(m, FALSE); } - if (m->phys_page == vm_page_guard_addr) { + if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) { /* * Guard page: let the slow path deal with it */ @@ -3678,7 +3760,7 @@ vm_fault_internal( */ break; } - if (VM_OBJECT_PURGEABLE_FAULT_ERROR(m->object)) { + if (VM_OBJECT_PURGEABLE_FAULT_ERROR(m_object)) { if (object != cur_object) vm_object_unlock(object); vm_map_unlock_read(map); @@ -3789,8 +3871,9 @@ vm_fault_internal( goto RetryFault; } } + assert(m_object == VM_PAGE_OBJECT(m)); - if (VM_FAULT_NEED_CS_VALIDATION(map->pmap, m) || + if (VM_FAULT_NEED_CS_VALIDATION(map->pmap, m, m_object) || (physpage_p != NULL && (prot & VM_PROT_WRITE))) { upgrade_for_validation: /* @@ -3846,6 +3929,8 @@ vm_fault_internal( if ((fault_type & VM_PROT_WRITE) == 0) { + prot &= ~VM_PROT_WRITE; + if (object != cur_object) { /* * We still need to hold the top object @@ -3874,6 +3959,8 @@ vm_fault_internal( object_lock_type = cur_object_lock_type; } FastPmapEnter: + assert(m_object == VM_PAGE_OBJECT(m)); + /* * prepare for the pmap_enter... 
* object and map are both locked @@ -3916,14 +4003,28 @@ vm_fault_internal( need_retry_ptr, &type_of_fault); } +#if DEVELOPMENT || DEBUG + { + int event_code = 0; + + if (m_object->internal) + event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_INTERNAL)); + else if (m_object->object_slid) + event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_SHAREDCACHE)); + else + event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_EXTERNAL)); + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, event_code, real_vaddr, (fault_info.user_tag << 16) | (caller_prot << 8) | type_of_fault, m->offset, get_current_unique_pid(), 0); + + DTRACE_VM6(real_fault, vm_map_offset_t, real_vaddr, vm_map_offset_t, m->offset, int, event_code, int, caller_prot, int, type_of_fault, int, fault_info.user_tag); + } +#endif if (kr == KERN_SUCCESS && physpage_p != NULL) { /* for vm_map_wire_and_extract() */ - *physpage_p = m->phys_page; + *physpage_p = VM_PAGE_GET_PHYS_PAGE(m); if (prot & VM_PROT_WRITE) { - vm_object_lock_assert_exclusive( - m->object); + vm_object_lock_assert_exclusive(m_object); m->dirty = TRUE; } } @@ -4011,9 +4112,10 @@ vm_fault_internal( * fault -- it requires a copy up the shadow * chain. 
*/ + assert(m_object == VM_PAGE_OBJECT(m)); if ((cur_object_lock_type == OBJECT_LOCK_SHARED) && - VM_FAULT_NEED_CS_VALIDATION(NULL, m)) { + VM_FAULT_NEED_CS_VALIDATION(NULL, m, m_object)) { goto upgrade_for_validation; } @@ -4030,7 +4132,8 @@ vm_fault_internal( * the page has been copied and inserted */ cur_m = m; - m = vm_page_grab(); + m = vm_page_grab_options(grab_options); + m_object = NULL; if (m == VM_PAGE_NULL) { /* @@ -4047,17 +4150,19 @@ vm_fault_internal( */ vm_page_copy(cur_m, m); vm_page_insert(m, object, offset); + m_object = object; SET_PAGE_DIRTY(m, FALSE); /* * Now cope with the source page and object */ if (object->ref_count > 1 && cur_m->pmapped) - pmap_disconnect(cur_m->phys_page); + pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(cur_m)); if (cur_m->clustered) { VM_PAGE_COUNT_AS_PAGEIN(cur_m); VM_PAGE_CONSUME_CLUSTERED(cur_m); + vm_fault_is_sequential(cur_object, cur_offset, fault_info.behavior); } need_collapse = TRUE; @@ -4198,7 +4303,8 @@ vm_fault_internal( continue; } } - m = vm_page_grab(); + m = vm_page_grab_options(grab_options); + m_object = NULL; if (m == VM_PAGE_NULL) { /* @@ -4227,7 +4333,7 @@ vm_fault_internal( cur_object->pager, (cur_offset + cur_object->paging_offset), - m->phys_page, + VM_PAGE_GET_PHYS_PAGE(m), &my_fault_type, c_flags, &compressed_count_delta); @@ -4239,7 +4345,8 @@ vm_fault_internal( cur_object); if (kr != KERN_SUCCESS) { - vm_page_release(m); + vm_page_release(m, FALSE); + m = VM_PAGE_NULL; break; } m->dirty = TRUE; @@ -4292,18 +4399,20 @@ vm_fault_internal( if (insert_cur_object) { vm_page_insert(m, cur_object, cur_offset); + m_object = cur_object; } else { vm_page_insert(m, object, offset); + m_object = object; } - if ((m->object->wimg_bits & VM_WIMG_MASK) != VM_WIMG_USE_DEFAULT) { + if ((m_object->wimg_bits & VM_WIMG_MASK) != VM_WIMG_USE_DEFAULT) { /* * If the page is not cacheable, * we can't let its contents * linger in the data cache * after the decompression. 
*/ - pmap_sync_page_attributes_phys(m->phys_page); + pmap_sync_page_attributes_phys(VM_PAGE_GET_PHYS_PAGE(m)); } type_of_fault = my_fault_type; @@ -4336,8 +4445,10 @@ vm_fault_internal( * inserted into the original object. */ if (cur_object->shadow_severed || - VM_OBJECT_PURGEABLE_FAULT_ERROR(cur_object)) - { + VM_OBJECT_PURGEABLE_FAULT_ERROR(cur_object) || + cur_object == compressor_object || + cur_object == kernel_object || + cur_object == vm_submap_object) { if (object != cur_object) vm_object_unlock(cur_object); vm_object_unlock(object); @@ -4383,6 +4494,7 @@ vm_fault_internal( } } m = vm_page_alloc(object, offset); + m_object = NULL; if (m == VM_PAGE_NULL) { /* @@ -4391,6 +4503,7 @@ vm_fault_internal( */ break; } + m_object = object; /* * Now zero fill page... @@ -4457,6 +4570,10 @@ vm_fault_internal( if (real_map != map) vm_map_unlock(real_map); + assert(object != compressor_object); + assert(object != kernel_object); + assert(object != vm_submap_object); + /* * Make a reference to this object to * prevent its disposal while we are messing with @@ -4530,11 +4647,13 @@ vm_fault_internal( } } m = result_page; + m_object = NULL; if (m != VM_PAGE_NULL) { + m_object = VM_PAGE_OBJECT(m); assert((change_wiring && !wired) ? - (top_page == VM_PAGE_NULL) : - ((top_page == VM_PAGE_NULL) == (m->object == object))); + (top_page == VM_PAGE_NULL) : + ((top_page == VM_PAGE_NULL) == (m_object == object))); } /* @@ -4544,12 +4663,12 @@ vm_fault_internal( #define RELEASE_PAGE(m) \ MACRO_BEGIN \ PAGE_WAKEUP_DONE(m); \ - if (!m->active && !m->inactive && !m->throttled) { \ - vm_page_lockspin_queues(); \ - if (!m->active && !m->inactive && !m->throttled) \ - vm_page_activate(m); \ - vm_page_unlock_queues(); \ - } \ + if ( !VM_PAGE_PAGEABLE(m)) { \ + vm_page_lockspin_queues(); \ + if ( !VM_PAGE_PAGEABLE(m)) \ + vm_page_activate(m); \ + vm_page_unlock_queues(); \ + } \ MACRO_END /* @@ -4557,8 +4676,8 @@ vm_fault_internal( * since our last lookup. 
*/ if (m != VM_PAGE_NULL) { - old_copy_object = m->object->copy; - vm_object_unlock(m->object); + old_copy_object = m_object->copy; + vm_object_unlock(m_object); } else { old_copy_object = VM_OBJECT_NULL; vm_object_unlock(object); @@ -4597,17 +4716,19 @@ vm_fault_internal( vm_map_unlock_read(map); if (m != VM_PAGE_NULL) { + assert(VM_PAGE_OBJECT(m) == m_object); + /* * retake the lock so that * we can drop the paging reference * in vm_fault_cleanup and do the * PAGE_WAKEUP_DONE in RELEASE_PAGE */ - vm_object_lock(m->object); + vm_object_lock(m_object); RELEASE_PAGE(m); - vm_fault_cleanup(m->object, top_page); + vm_fault_cleanup(m_object, top_page); } else { /* * retake the lock so that @@ -4631,17 +4752,19 @@ vm_fault_internal( vm_map_unlock(real_map); if (m != VM_PAGE_NULL) { + assert(VM_PAGE_OBJECT(m) == m_object); + /* * retake the lock so that * we can drop the paging reference * in vm_fault_cleanup and do the * PAGE_WAKEUP_DONE in RELEASE_PAGE */ - vm_object_lock(m->object); + vm_object_lock(m_object); RELEASE_PAGE(m); - vm_fault_cleanup(m->object, top_page); + vm_fault_cleanup(m_object, top_page); } else { /* * retake the lock so that @@ -4663,9 +4786,9 @@ vm_fault_internal( prot &= retry_prot; } if (m != VM_PAGE_NULL) { - vm_object_lock(m->object); + vm_object_lock(m_object); - if (m->object->copy != old_copy_object) { + if (m_object->copy != old_copy_object) { /* * The copy object changed while the top-level object * was unlocked, so take away write permission. 
@@ -4686,9 +4809,11 @@ vm_fault_internal( vm_map_unlock(real_map); if (m != VM_PAGE_NULL) { + assert(VM_PAGE_OBJECT(m) == m_object); + RELEASE_PAGE(m); - vm_fault_cleanup(m->object, top_page); + vm_fault_cleanup(m_object, top_page); } else vm_fault_cleanup(object, top_page); @@ -4733,21 +4858,39 @@ vm_fault_internal( NULL, &type_of_fault); } + assert(VM_PAGE_OBJECT(m) == m_object); + +#if DEVELOPMENT || DEBUG + { + int event_code = 0; + + if (m_object->internal) + event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_INTERNAL)); + else if (m_object->object_slid) + event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_SHAREDCACHE)); + else + event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_EXTERNAL)); + + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, event_code, real_vaddr, (fault_info.user_tag << 16) | (caller_prot << 8) | type_of_fault, m->offset, get_current_unique_pid(), 0); + + DTRACE_VM6(real_fault, vm_map_offset_t, real_vaddr, vm_map_offset_t, m->offset, int, event_code, int, caller_prot, int, type_of_fault, int, fault_info.user_tag); + } +#endif if (kr != KERN_SUCCESS) { /* abort this page fault */ vm_map_verify_done(map, &version); if (real_map != map) vm_map_unlock(real_map); PAGE_WAKEUP_DONE(m); - vm_fault_cleanup(m->object, top_page); + vm_fault_cleanup(m_object, top_page); vm_object_deallocate(object); goto done; } if (physpage_p != NULL) { /* for vm_map_wire_and_extract() */ - *physpage_p = m->phys_page; + *physpage_p = VM_PAGE_GET_PHYS_PAGE(m); if (prot & VM_PROT_WRITE) { - vm_object_lock_assert_exclusive(m->object); + vm_object_lock_assert_exclusive(m_object); m->dirty = TRUE; } } @@ -4881,9 +5024,11 @@ vm_fault_internal( vm_map_unlock(real_map); if (m != VM_PAGE_NULL) { + assert(VM_PAGE_OBJECT(m) == m_object); + PAGE_WAKEUP_DONE(m); - vm_fault_cleanup(m->object, top_page); + vm_fault_cleanup(m_object, top_page); } else vm_fault_cleanup(object, top_page); @@ -4942,10 +5087,9 @@ vm_fault_wire( vm_map_offset_t 
pmap_addr, ppnum_t *physpage_p) { - - register vm_map_offset_t va; - register vm_map_offset_t end_addr = entry->vme_end; - register kern_return_t rc; + vm_map_offset_t va; + vm_map_offset_t end_addr = entry->vme_end; + kern_return_t rc; assert(entry->in_transition); @@ -5012,8 +5156,8 @@ vm_fault_unwire( pmap_t pmap, vm_map_offset_t pmap_addr) { - register vm_map_offset_t va; - register vm_map_offset_t end_addr = entry->vme_end; + vm_map_offset_t va; + vm_map_offset_t end_addr = entry->vme_end; vm_object_t object; struct vm_object_fault_info fault_info; @@ -5123,15 +5267,15 @@ vm_fault_unwire( if (result != VM_FAULT_SUCCESS) panic("vm_fault_unwire: failure"); - result_object = result_page->object; + result_object = VM_PAGE_OBJECT(result_page); if (deallocate) { - assert(result_page->phys_page != + assert(VM_PAGE_GET_PHYS_PAGE(result_page) != vm_page_fictitious_addr); - pmap_disconnect(result_page->phys_page); + pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(result_page)); VM_PAGE_FREE(result_page); } else { - if ((pmap) && (result_page->phys_page != vm_page_guard_addr)) + if ((pmap) && (VM_PAGE_GET_PHYS_PAGE(result_page) != vm_page_guard_addr)) pmap_change_wiring(pmap, pmap_addr + (va - entry->vme_start), FALSE); @@ -5142,7 +5286,7 @@ vm_fault_unwire( vm_page_unlock_queues(); } if(entry->zero_wired_pages) { - pmap_zero_page(result_page->phys_page); + pmap_zero_page(VM_PAGE_GET_PHYS_PAGE(result_page)); entry->zero_wired_pages = FALSE; } @@ -5195,7 +5339,7 @@ vm_fault_wire_fast( { vm_object_t object; vm_object_offset_t offset; - register vm_page_t m; + vm_page_t m; vm_prot_t prot; thread_t thread = current_thread(); int type_of_fault; @@ -5294,7 +5438,7 @@ vm_fault_wire_fast( ASSERT_PAGE_DECRYPTED(m); if (m->fictitious && - m->phys_page == vm_page_guard_addr) { + VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) { /* * Guard pages are fictitious pages and are never * entered into a pmap, so let's say it's been wired... 
@@ -5347,6 +5491,10 @@ vm_fault_wire_fast( : 0), NULL, &type_of_fault); + if (kr != KERN_SUCCESS) { + RELEASE_PAGE(m); + GIVE_UP; + } done: /* @@ -5356,9 +5504,10 @@ vm_fault_wire_fast( if (physpage_p) { /* for vm_map_wire_and_extract() */ if (kr == KERN_SUCCESS) { - *physpage_p = m->phys_page; + assert(object == VM_PAGE_OBJECT(m)); + *physpage_p = VM_PAGE_GET_PHYS_PAGE(m); if (prot & VM_PROT_WRITE) { - vm_object_lock_assert_exclusive(m->object); + vm_object_lock_assert_exclusive(object); m->dirty = TRUE; } } else { @@ -5384,14 +5533,15 @@ vm_fault_copy_cleanup( vm_page_t page, vm_page_t top_page) { - vm_object_t object = page->object; + vm_object_t object = VM_PAGE_OBJECT(page); vm_object_lock(object); PAGE_WAKEUP_DONE(page); - if (!page->active && !page->inactive && !page->throttled) { + if ( !VM_PAGE_PAGEABLE(page)) { vm_page_lockspin_queues(); - if (!page->active && !page->inactive && !page->throttled) + if ( !VM_PAGE_PAGEABLE(page)) { vm_page_activate(page); + } vm_page_unlock_queues(); } vm_fault_cleanup(object, top_page); @@ -5404,7 +5554,7 @@ vm_fault_copy_dst_cleanup( vm_object_t object; if (page != VM_PAGE_NULL) { - object = page->object; + object = VM_PAGE_OBJECT(page); vm_object_lock(object); vm_page_lockspin_queues(); vm_page_unwire(page, TRUE); @@ -5464,6 +5614,7 @@ vm_fault_copy( vm_map_size_t amount_left; vm_object_t old_copy_object; + vm_object_t result_page_object = NULL; kern_return_t error = 0; vm_fault_return_t result; @@ -5572,7 +5723,8 @@ vm_fault_copy( } assert ((dst_prot & VM_PROT_WRITE) != VM_PROT_NONE); - old_copy_object = dst_page->object->copy; + assert(dst_object == VM_PAGE_OBJECT(dst_page)); + old_copy_object = dst_object->copy; /* * There exists the possiblity that the source and @@ -5590,7 +5742,7 @@ vm_fault_copy( vm_page_wire(dst_page, VM_KERN_MEMORY_OSFMK, TRUE); vm_page_unlock_queues(); PAGE_WAKEUP_DONE(dst_page); - vm_object_unlock(dst_page->object); + vm_object_unlock(dst_object); if (dst_top_page != VM_PAGE_NULL) { 
vm_object_lock(dst_object); @@ -5670,12 +5822,12 @@ vm_fault_copy( "vm_fault_page()\n", result); } - + result_page_object = VM_PAGE_OBJECT(result_page); assert((src_top_page == VM_PAGE_NULL) == - (result_page->object == src_object)); + (result_page_object == src_object)); } assert ((src_prot & VM_PROT_READ) != VM_PROT_NONE); - vm_object_unlock(result_page->object); + vm_object_unlock(result_page_object); } if (!vm_map_verify(dst_map, dst_version)) { @@ -5684,18 +5836,19 @@ vm_fault_copy( vm_fault_copy_dst_cleanup(dst_page); break; } + assert(dst_object == VM_PAGE_OBJECT(dst_page)); - vm_object_lock(dst_page->object); + vm_object_lock(dst_object); - if (dst_page->object->copy != old_copy_object) { - vm_object_unlock(dst_page->object); + if (dst_object->copy != old_copy_object) { + vm_object_unlock(dst_object); vm_map_verify_done(dst_map, dst_version); if (result_page != VM_PAGE_NULL && src_page != dst_page) vm_fault_copy_cleanup(result_page, src_top_page); vm_fault_copy_dst_cleanup(dst_page); break; } - vm_object_unlock(dst_page->object); + vm_object_unlock(dst_object); /* * Copy the page, and note that it is dirty @@ -5739,7 +5892,7 @@ vm_fault_copy( if(!dst_page->dirty){ vm_object_lock(dst_object); SET_PAGE_DIRTY(dst_page, TRUE); - vm_object_unlock(dst_page->object); + vm_object_unlock(dst_object); } } @@ -5749,14 +5902,14 @@ vm_fault_copy( if (result_page == VM_PAGE_NULL) vm_page_zero_fill(dst_page); else{ - vm_object_lock(result_page->object); + vm_object_lock(result_page_object); vm_page_copy(result_page, dst_page); - vm_object_unlock(result_page->object); + vm_object_unlock(result_page_object); if(!dst_page->dirty){ vm_object_lock(dst_object); SET_PAGE_DIRTY(dst_page, TRUE); - vm_object_unlock(dst_page->object); + vm_object_unlock(dst_object); } } @@ -5869,12 +6022,8 @@ vm_fault_classify_init(void) #endif /* VM_FAULT_CLASSIFY */ vm_offset_t -kdp_lightweight_fault(vm_map_t map, vm_offset_t cur_target_addr, uint32_t *fault_results) 
+kdp_lightweight_fault(vm_map_t map, vm_offset_t cur_target_addr) { -#pragma unused(map, cur_target_addr, fault_results) - - return 0; -#if 0 vm_map_entry_t entry; vm_object_t object; vm_offset_t object_offset; @@ -5884,7 +6033,6 @@ kdp_lightweight_fault(vm_map_t map, vm_offset_t cur_target_addr, uint32_t *fault int my_fault_type = VM_PROT_READ; kern_return_t kr; - if (not_in_kdp) { panic("kdp_lightweight_fault called from outside of debugger context"); } @@ -5933,7 +6081,7 @@ kdp_lightweight_fault(vm_map_t map, vm_offset_t cur_target_addr, uint32_t *fault return 0; } - if (m->laundry || m->busy || m->pageout || m->absent || m->error || m->cleaning || + if (m->laundry || m->busy || m->free_when_done || m->absent || m->error || m->cleaning || m->overwriting || m->restart || m->unusual) { return 0; } @@ -5958,15 +6106,12 @@ kdp_lightweight_fault(vm_map_t map, vm_offset_t cur_target_addr, uint32_t *fault return 0; } - assert(!m->compressor); - if (m->compressor) { + assert(m->vm_page_q_state != VM_PAGE_USED_BY_COMPRESSOR); + if (m->vm_page_q_state == VM_PAGE_USED_BY_COMPRESSOR) { return 0; } - if (fault_results) { - *fault_results |= kThreadFaultedBT; - } - return ptoa(m->phys_page); + return ptoa(VM_PAGE_GET_PHYS_PAGE(m)); } compressor_external_state = VM_EXTERNAL_STATE_UNKNOWN; @@ -5977,9 +6122,6 @@ kdp_lightweight_fault(vm_map_t map, vm_offset_t cur_target_addr, uint32_t *fault kdp_compressor_decompressed_page_ppnum, &my_fault_type, compressor_flags, &compressed_count_delta); if (kr == KERN_SUCCESS) { - if (fault_results) { - *fault_results |= kThreadDecompressedBT; - } return kdp_compressor_decompressed_page_paddr; } else { return 0; @@ -5994,27 +6136,24 @@ kdp_lightweight_fault(vm_map_t map, vm_offset_t cur_target_addr, uint32_t *fault object_offset += object->vo_shadow_offset; object = object->shadow; } -#endif /* 0 */ -} +} -#define CODE_SIGNING_CHUNK_SIZE 4096 void vm_page_validate_cs_mapped( vm_page_t page, const void *kaddr) { vm_object_t object; - 
vm_object_offset_t offset, offset_in_page; - kern_return_t kr; + vm_object_offset_t offset; memory_object_t pager; - void *blobs; + struct vnode *vnode; boolean_t validated; unsigned tainted; - int num_chunks, num_chunks_validated; assert(page->busy); - vm_object_lock_assert_exclusive(page->object); + object = VM_PAGE_OBJECT(page); + vm_object_lock_assert_exclusive(object); if (page->wpmapped && !page->cs_tainted) { /* @@ -6030,7 +6169,7 @@ vm_page_validate_cs_mapped( printf("CODESIGNING: vm_page_validate_cs: " "page %p obj %p off 0x%llx " "was modified\n", - page, page->object, page->offset); + page, object, page->offset); } vm_cs_validated_dirtied++; } @@ -6041,7 +6180,6 @@ vm_page_validate_cs_mapped( vm_cs_validates++; - object = page->object; assert(object->code_signed); offset = page->offset; @@ -6063,37 +6201,26 @@ vm_page_validate_cs_mapped( pager = object->pager; assert(object->paging_in_progress); - kr = vnode_pager_get_object_cs_blobs(pager, &blobs); - if (kr != KERN_SUCCESS) { - blobs = NULL; - } + vnode = vnode_pager_lookup_vnode(pager); /* verify the SHA1 hash for this page */ - num_chunks_validated = 0; - for (offset_in_page = 0, num_chunks = 0; - offset_in_page < PAGE_SIZE_64; - offset_in_page += CODE_SIGNING_CHUNK_SIZE, num_chunks++) { - tainted = 0; - validated = cs_validate_page(blobs, - pager, - (object->paging_offset + - offset + - offset_in_page), - (const void *)((const char *)kaddr - + offset_in_page), - &tainted); - if (validated) { - num_chunks_validated++; - } - if (tainted & CS_VALIDATE_TAINTED) { - page->cs_tainted = TRUE; - } - if (tainted & CS_VALIDATE_NX) { - page->cs_nx = TRUE; - } + tainted = 0; + validated = cs_validate_range(vnode, + pager, + (object->paging_offset + + offset), + (const void *)((const char *)kaddr), + PAGE_SIZE_64, + &tainted); + + if (tainted & CS_VALIDATE_TAINTED) { + page->cs_tainted = TRUE; + } + if (tainted & CS_VALIDATE_NX) { + page->cs_nx = TRUE; } - /* page is validated only if all its chunks are */ - if 
(num_chunks_validated == num_chunks) { + + if (validated) { page->cs_validated = TRUE; } } @@ -6111,10 +6238,11 @@ vm_page_validate_cs( boolean_t busy_page; boolean_t need_unmap; - vm_object_lock_assert_held(page->object); + object = VM_PAGE_OBJECT(page); + vm_object_lock_assert_held(object); if (page->wpmapped && !page->cs_tainted) { - vm_object_lock_assert_exclusive(page->object); + vm_object_lock_assert_exclusive(object); /* * This page was mapped for "write" access sometime in the @@ -6129,7 +6257,7 @@ vm_page_validate_cs( printf("CODESIGNING: vm_page_validate_cs: " "page %p obj %p off 0x%llx " "was modified\n", - page, page->object, page->offset); + page, object, page->offset); } vm_cs_validated_dirtied++; } @@ -6144,16 +6272,15 @@ vm_page_validate_cs( assert(!page->slid); #if CHECK_CS_VALIDATION_BITMAP - if ( vnode_pager_cs_check_validation_bitmap( page->object->pager, trunc_page(page->offset + page->object->paging_offset), CS_BITMAP_CHECK ) == KERN_SUCCESS) { + if ( vnode_pager_cs_check_validation_bitmap( object->pager, trunc_page(page->offset + object->paging_offset), CS_BITMAP_CHECK ) == KERN_SUCCESS) { page->cs_validated = TRUE; page->cs_tainted = FALSE; vm_cs_bitmap_validated++; return; } #endif - vm_object_lock_assert_exclusive(page->object); + vm_object_lock_assert_exclusive(object); - object = page->object; assert(object->code_signed); offset = page->offset; @@ -6196,7 +6323,7 @@ vm_page_validate_cs( } #endif assert(page->busy); - assert(object == page->object); + assert(object == VM_PAGE_OBJECT(page)); vm_object_lock_assert_exclusive(object); if (!busy_page) { @@ -6217,14 +6344,14 @@ vm_page_validate_cs_mapped_chunk( vm_page_t page, const void *kaddr, vm_offset_t chunk_offset, + vm_size_t chunk_size, boolean_t *validated_p, unsigned *tainted_p) { vm_object_t object; vm_object_offset_t offset, offset_in_page; - kern_return_t kr; memory_object_t pager; - void *blobs; + struct vnode *vnode; boolean_t validated; unsigned tainted; @@ -6232,9 +6359,9 @@ 
vm_page_validate_cs_mapped_chunk( *tainted_p = 0; assert(page->busy); - vm_object_lock_assert_exclusive(page->object); + object = VM_PAGE_OBJECT(page); + vm_object_lock_assert_exclusive(object); - object = page->object; assert(object->code_signed); offset = page->offset; @@ -6256,25 +6383,22 @@ vm_page_validate_cs_mapped_chunk( pager = object->pager; assert(object->paging_in_progress); - kr = vnode_pager_get_object_cs_blobs(pager, &blobs); - if (kr != KERN_SUCCESS) { - blobs = NULL; - } + vnode = vnode_pager_lookup_vnode(pager); /* verify the signature for this chunk */ offset_in_page = chunk_offset; assert(offset_in_page < PAGE_SIZE); - assert((offset_in_page & (CODE_SIGNING_CHUNK_SIZE-1)) == 0); tainted = 0; - validated = cs_validate_page(blobs, - pager, - (object->paging_offset + - offset + - offset_in_page), - (const void *)((const char *)kaddr + validated = cs_validate_range(vnode, + pager, + (object->paging_offset + + offset + + offset_in_page), + (const void *)((const char *)kaddr + offset_in_page), - &tainted); + chunk_size, + &tainted); if (validated) { *validated_p = TRUE; } diff --git a/osfmk/vm/vm_fault.h b/osfmk/vm/vm_fault.h index d6824c4fd..d5e8ed44a 100644 --- a/osfmk/vm/vm_fault.h +++ b/osfmk/vm/vm_fault.h @@ -176,8 +176,7 @@ extern kern_return_t vm_fault_enter( extern vm_offset_t kdp_lightweight_fault( vm_map_t map, - vm_offset_t cur_target_addr, - uint32_t *fault_results); + vm_offset_t cur_target_addr); #endif /* MACH_KERNEL_PRIVATE */ diff --git a/osfmk/vm/vm_fourk_pager.c b/osfmk/vm/vm_fourk_pager.c index 57f3ed726..785bbf5f8 100644 --- a/osfmk/vm/vm_fourk_pager.c +++ b/osfmk/vm/vm_fourk_pager.c @@ -45,19 +45,18 @@ #include #include #include +#include #include #include -#include -#include - #include #include #include #include #include #include +#include /* @@ -267,6 +266,12 @@ fourk_pager_init( panic("fourk_pager_init: " "memory_object_change_attributes() failed"); +#if CONFIG_SECLUDED_MEMORY + if (secluded_for_filecache) { + 
memory_object_mark_eligible_for_secluded(control, TRUE); + } +#endif /* CONFIG_SECLUDED_MEMORY */ + return KERN_SUCCESS; } @@ -942,6 +947,7 @@ fourk_pager_data_request( memory_object_offset_t src_offset; vm_offset_t offset_in_src_page; kern_return_t error_code; + vm_object_t src_page_object; vm_page_t src_page; vm_page_t top_page; vm_prot_t prot; @@ -1065,16 +1071,12 @@ fourk_pager_data_request( assert(src_page != VM_PAGE_NULL); assert(src_page->busy); - if (!src_page->active && - !src_page->inactive && - !src_page->speculative && - !src_page->throttled && + src_page_object = VM_PAGE_OBJECT(src_page); + + if (( !VM_PAGE_PAGEABLE(src_page)) && !VM_PAGE_WIRED(src_page)) { vm_page_lockspin_queues(); - if (!src_page->active && - !src_page->inactive && - !src_page->speculative && - !src_page->throttled && + if (( !VM_PAGE_PAGEABLE(src_page)) && !VM_PAGE_WIRED(src_page)) { vm_page_deactivate(src_page); } @@ -1083,7 +1085,7 @@ fourk_pager_data_request( #if __x86_64__ src_vaddr = (vm_map_offset_t) - PHYSMAP_PTOV((pmap_paddr_t)src_page->phys_page + PHYSMAP_PTOV((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(src_page) << PAGE_SHIFT); #else /* @@ -1092,7 +1094,7 @@ fourk_pager_data_request( */ pmap_enter(kernel_pmap, src_vaddr, - src_page->phys_page, + VM_PAGE_GET_PHYS_PAGE(src_page), VM_PROT_READ, VM_PROT_NONE, 0, @@ -1105,11 +1107,12 @@ fourk_pager_data_request( */ subpg_validated = FALSE; subpg_tainted = 0; - if (src_page->object->code_signed) { + if (src_page_object->code_signed) { vm_page_validate_cs_mapped_chunk( src_page, (const void *) src_vaddr, offset_in_src_page, + FOURK_PAGE_SIZE, &subpg_validated, &subpg_tainted); num_subpg_signed++; @@ -1155,7 +1158,7 @@ fourk_pager_data_request( pager, offset, cur_offset, (sub_page-sub_page_idx)*FOURK_PAGE_SIZE, - src_page->object, + src_page_object, src_page->offset + offset_in_src_page, *(uint64_t *)(dst_vaddr + ((sub_page-sub_page_idx) * @@ -1164,7 +1167,7 @@ fourk_pager_data_request( ((sub_page-sub_page_idx) * FOURK_PAGE_SIZE) + 8), - 
src_page->object->code_signed, + src_page_object->code_signed, subpg_validated, !!(subpg_tainted & CS_VALIDATE_TAINTED), !!(subpg_tainted & CS_VALIDATE_NX)); @@ -1188,9 +1191,8 @@ fourk_pager_data_request( * Cleanup the result of vm_fault_page(). */ if (src_page) { - vm_object_t src_page_object; + assert(VM_PAGE_OBJECT(src_page) == src_page_object); - src_page_object = src_page->object; PAGE_WAKEUP_DONE(src_page); src_page = VM_PAGE_NULL; vm_object_paging_end(src_page_object); @@ -1198,7 +1200,7 @@ fourk_pager_data_request( if (top_page) { vm_object_t top_object; - top_object = top_page->object; + top_object = VM_PAGE_OBJECT(top_page); vm_object_lock(top_object); VM_PAGE_FREE(top_page); top_page = VM_PAGE_NULL; diff --git a/osfmk/vm/vm_init.c b/osfmk/vm/vm_init.c index b5796e35e..bfbb2e54e 100644 --- a/osfmk/vm/vm_init.c +++ b/osfmk/vm/vm_init.c @@ -97,11 +97,12 @@ boolean_t zlog_ready = FALSE; vm_offset_t kmapoff_kaddr; unsigned int kmapoff_pgcnt; + static inline void vm_mem_bootstrap_log(const char *message) { // kprintf("vm_mem_bootstrap: %s\n", message); - kernel_debug_string_simple(message); + kernel_debug_string_early(message); } /* @@ -156,6 +157,7 @@ vm_mem_bootstrap(void) kmapoff_pgcnt * PAGE_SIZE_64, VM_FLAGS_ANYWHERE | VM_MAKE_TAG(VM_KERN_MEMORY_OSFMK)) != KERN_SUCCESS) panic("cannot vm_allocate %u kernel_map pages", kmapoff_pgcnt); + vm_mem_bootstrap_log("pmap_init"); pmap_init(); diff --git a/osfmk/vm/vm_kern.c b/osfmk/vm/vm_kern.c index c75b23835..8d37f4cf0 100644 --- a/osfmk/vm/vm_kern.c +++ b/osfmk/vm/vm_kern.c @@ -93,9 +93,9 @@ extern boolean_t vm_kernel_ready; * Forward declarations for internal functions. 
*/ extern kern_return_t kmem_alloc_pages( - register vm_object_t object, - register vm_object_offset_t offset, - register vm_object_size_t size); + vm_object_t object, + vm_object_offset_t offset, + vm_object_size_t size); kern_return_t kmem_alloc_contig( @@ -237,10 +237,10 @@ kmem_alloc_contig( kern_return_t kernel_memory_allocate( - register vm_map_t map, - register vm_offset_t *addrp, - register vm_size_t size, - register vm_offset_t mask, + vm_map_t map, + vm_offset_t *addrp, + vm_size_t size, + vm_offset_t mask, int flags, vm_tag_t tag) { @@ -342,7 +342,7 @@ kernel_memory_allocate( } vm_page_more_fictitious(); } - mem->pageq.next = (queue_entry_t)guard_page_list; + mem->snext = guard_page_list; guard_page_list = mem; } @@ -375,7 +375,7 @@ kernel_memory_allocate( } VM_PAGE_WAIT(); } - mem->pageq.next = (queue_entry_t)wired_page_list; + mem->snext = wired_page_list; wired_page_list = mem; } } @@ -394,6 +394,9 @@ kernel_memory_allocate( object = vm_object_allocate(map_size); } + if (flags & KMA_ATOMIC) + vm_alloc_flags |= VM_FLAGS_ATOMIC_ENTRY; + kr = vm_map_find_space(map, &map_addr, fill_size, map_mask, vm_alloc_flags, &entry); @@ -429,8 +432,8 @@ kernel_memory_allocate( panic("kernel_memory_allocate: guard_page_list == NULL"); mem = guard_page_list; - guard_page_list = (vm_page_t)mem->pageq.next; - mem->pageq.next = NULL; + guard_page_list = mem->snext; + mem->snext = NULL; vm_page_insert(mem, object, offset + pg_offset); @@ -448,9 +451,18 @@ kernel_memory_allocate( panic("kernel_memory_allocate: wired_page_list == NULL"); mem = wired_page_list; - wired_page_list = (vm_page_t)mem->pageq.next; - mem->pageq.next = NULL; + wired_page_list = mem->snext; + mem->snext = NULL; + + assert(mem->wire_count == 0); + assert(mem->vm_page_q_state == VM_PAGE_NOT_ON_Q); + + mem->vm_page_q_state = VM_PAGE_IS_WIRED; mem->wire_count++; + if (__improbable(mem->wire_count == 0)) { + panic("kernel_memory_allocate(%p): wire_count overflow", + mem); + } vm_page_insert_wired(mem, 
object, offset + pg_offset, tag); @@ -473,7 +485,7 @@ kernel_memory_allocate( if (flags & KMA_NOENCRYPT) { bzero(CAST_DOWN(void *, (map_addr + pg_offset)), PAGE_SIZE); - pmap_set_noencrypt(mem->phys_page); + pmap_set_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem)); } } } @@ -482,8 +494,8 @@ kernel_memory_allocate( panic("kernel_memory_allocate: guard_page_list == NULL"); mem = guard_page_list; - guard_page_list = (vm_page_t)mem->pageq.next; - mem->pageq.next = NULL; + guard_page_list = mem->snext; + mem->snext = NULL; vm_page_insert(mem, object, offset + pg_offset); @@ -557,13 +569,13 @@ kernel_memory_populate( VM_PAGE_WAIT(); } - mem->pageq.next = (queue_entry_t) page_list; + mem->snext = page_list; page_list = mem; pg_offset -= PAGE_SIZE_64; kr = pmap_enter_options(kernel_pmap, - addr + pg_offset, mem->phys_page, + addr + pg_offset, VM_PAGE_GET_PHYS_PAGE(mem), VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE, 0, TRUE, PMAP_OPTIONS_INTERNAL, NULL); assert(kr == KERN_SUCCESS); @@ -580,8 +592,8 @@ kernel_memory_populate( pg_offset += PAGE_SIZE_64) { mem = page_list; - page_list = (vm_page_t) mem->pageq.next; - mem->pageq.next = NULL; + page_list = mem->snext; + mem->snext = NULL; vm_page_insert(mem, object, offset + pg_offset); assert(mem->busy); @@ -589,7 +601,7 @@ kernel_memory_populate( mem->busy = FALSE; mem->pmapped = TRUE; mem->wpmapped = TRUE; - mem->compressor = TRUE; + mem->vm_page_q_state = VM_PAGE_USED_BY_COMPRESSOR; } vm_object_unlock(object); @@ -617,7 +629,7 @@ kernel_memory_populate( } VM_PAGE_WAIT(); } - mem->pageq.next = (queue_entry_t) page_list; + mem->snext = page_list; page_list = mem; } if (flags & KMA_KOBJECT) { @@ -647,10 +659,16 @@ kernel_memory_populate( panic("kernel_memory_populate: page_list == NULL"); mem = page_list; - page_list = (vm_page_t) mem->pageq.next; - mem->pageq.next = NULL; + page_list = mem->snext; + mem->snext = NULL; + assert(mem->vm_page_q_state == VM_PAGE_NOT_ON_Q); + mem->vm_page_q_state = VM_PAGE_IS_WIRED; mem->wire_count++; + if 
(__improbable(mem->wire_count == 0)) { + panic("kernel_memory_populate(%p): wire_count overflow", + mem); + } vm_page_insert_wired(mem, object, offset + pg_offset, tag); @@ -675,7 +693,7 @@ kernel_memory_populate( } if (flags & KMA_NOENCRYPT) { bzero(CAST_DOWN(void *, (addr + pg_offset)), PAGE_SIZE); - pmap_set_noencrypt(mem->phys_page); + pmap_set_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem)); } } vm_page_lock_queues(); @@ -741,8 +759,9 @@ kernel_memory_depopulate( mem = vm_page_lookup(object, offset + pg_offset); assert(mem); - - pmap_disconnect(mem->phys_page); + + if (mem->vm_page_q_state != VM_PAGE_USED_BY_COMPRESSOR) + pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(mem)); mem->busy = TRUE; @@ -750,9 +769,12 @@ kernel_memory_depopulate( vm_page_remove(mem, TRUE); assert(mem->busy); - assert(mem->pageq.next == NULL && - mem->pageq.prev == NULL); - mem->pageq.next = (queue_entry_t)local_freeq; + assert(mem->pageq.next == 0 && mem->pageq.prev == 0); + assert((mem->vm_page_q_state == VM_PAGE_USED_BY_COMPRESSOR) || + (mem->vm_page_q_state == VM_PAGE_NOT_ON_Q)); + + mem->vm_page_q_state = VM_PAGE_NOT_ON_Q; + mem->snext = local_freeq; local_freeq = mem; } vm_object_unlock(object); @@ -777,14 +799,26 @@ kmem_alloc_external( return (kmem_alloc(map, addrp, size, vm_tag_bt())); } + kern_return_t kmem_alloc( vm_map_t map, vm_offset_t *addrp, vm_size_t size, - vm_tag_t tag) + vm_tag_t tag) +{ + return kmem_alloc_flags(map, addrp, size, tag, 0); +} + +kern_return_t +kmem_alloc_flags( + vm_map_t map, + vm_offset_t *addrp, + vm_size_t size, + vm_tag_t tag, + int flags) { - kern_return_t kr = kernel_memory_allocate(map, addrp, size, 0, 0, tag); + kern_return_t kr = kernel_memory_allocate(map, addrp, size, 0, flags, tag); TRACE_MACHLEAKS(KMEM_ALLOC_CODE, KMEM_ALLOC_CODE_2, size, *addrp); return kr; } @@ -1050,16 +1084,16 @@ kmem_free( kern_return_t kmem_alloc_pages( - register vm_object_t object, - register vm_object_offset_t offset, - register vm_object_size_t size) + vm_object_t object, + 
vm_object_offset_t offset, + vm_object_size_t size) { vm_object_size_t alloc_size; alloc_size = vm_object_round_page(size); vm_object_lock(object); while (alloc_size) { - register vm_page_t mem; + vm_page_t mem; /* @@ -1445,9 +1479,7 @@ vm_kernel_unslide_or_perm_external( vm_offset_t addr, vm_offset_t *up_addr) { - if (VM_KERNEL_IS_SLID(addr) || VM_KERNEL_IS_KEXT(addr) || - VM_KERNEL_IS_PRELINKTEXT(addr) || VM_KERNEL_IS_PRELINKINFO(addr) || - VM_KERNEL_IS_KEXT_LINKEDIT(addr)) { + if (VM_KERNEL_IS_SLID(addr)) { *up_addr = addr - vm_kernel_slide; return; } diff --git a/osfmk/vm/vm_kern.h b/osfmk/vm/vm_kern.h index 435dae135..69afb3548 100644 --- a/osfmk/vm/vm_kern.h +++ b/osfmk/vm/vm_kern.h @@ -74,6 +74,8 @@ #ifdef XNU_KERNEL_PRIVATE +#include + extern kern_return_t kernel_memory_allocate( vm_map_t map, vm_offset_t *addrp, @@ -94,6 +96,13 @@ extern kern_return_t kernel_memory_allocate( #define KMA_KSTACK 0x100 #define KMA_VAONLY 0x200 #define KMA_COMPRESSOR 0x400 /* Pages belonging to the compressor are not on the paging queues, nor are they counted as wired. 
*/ +#define KMA_ATOMIC 0x800 + +extern kern_return_t kmem_alloc( + vm_map_t map, + vm_offset_t *addrp, + vm_size_t size, + vm_tag_t tag); extern kern_return_t kmem_alloc_contig( vm_map_t map, @@ -105,11 +114,12 @@ extern kern_return_t kmem_alloc_contig( int flags, vm_tag_t tag); -extern kern_return_t kmem_alloc( +extern kern_return_t kmem_alloc_flags( vm_map_t map, vm_offset_t *addrp, vm_size_t size, - vm_tag_t tag); + vm_tag_t tag, + int flags); extern kern_return_t kmem_alloc_pageable( vm_map_t map, @@ -174,7 +184,11 @@ extern kern_return_t memory_object_iopl_request( struct mach_memory_info; extern kern_return_t vm_page_diagnose(struct mach_memory_info * sites, - unsigned int num_sites); + unsigned int num_sites, uint64_t zones_collectable_bytes); + +#if DEBUG || DEVELOPMENT +extern void kern_wired_diagnose(void); +#endif /* DEBUG || DEVELOPMENT */ extern vm_tag_t vm_tag_bt(void); @@ -184,9 +198,37 @@ extern void vm_tag_alloc_locked(vm_allocation_site_t * site); extern vm_tag_t vm_tag_bt_debug(void); +extern uint32_t vm_tag_get_kext(vm_tag_t tag, char * name, vm_size_t namelen); + +#if DEBUG || DEVELOPMENT + +struct vm_tag_set_entry +{ + vm_tag_t tag; + uint32_t count; +}; + +struct vm_tag_set +{ + lck_spin_t lock; + struct vm_tag_set_entry entries[0]; +}; + +typedef struct vm_tag_set * vm_tag_set_t; + +extern void vm_tag_set_init(vm_tag_set_t, uint32_t count); +extern kern_return_t vm_tag_set_enter(vm_tag_set_t set, uint32_t count, vm_tag_t tag); +extern kern_return_t vm_tag_set_remove(vm_tag_set_t set, uint32_t count, vm_tag_t tag, vm_tag_t * new_tag); + +#endif /* DEBUG || DEVELOPMENT */ + extern boolean_t vm_kernel_map_is_kernel(vm_map_t map); -extern ppnum_t kernel_pmap_present_mapping(uint64_t vaddr, uint64_t * pvincr); +extern ppnum_t kernel_pmap_present_mapping(uint64_t vaddr, uint64_t * pvincr, uintptr_t * pvphysaddr); +#if DEBUG || DEVELOPMENT +extern void kernel_pmap_lock(void); +extern void kernel_pmap_unlock(void); +#endif /* DEBUG || DEVELOPMENT */ 
#else /* XNU_KERNEL_PRIVATE */ diff --git a/osfmk/vm/vm_map.c b/osfmk/vm/vm_map.c index 840c4babf..b834d295b 100644 --- a/osfmk/vm/vm_map.c +++ b/osfmk/vm/vm_map.c @@ -82,6 +82,7 @@ #include #include +#include #include #include #include @@ -109,6 +110,15 @@ #include #include +extern int proc_selfpid(void); +extern char *proc_name_address(void *p); + +#if VM_MAP_DEBUG_APPLE_PROTECT +int vm_map_debug_apple_protect = 0; +#endif /* VM_MAP_DEBUG_APPLE_PROTECT */ +#if VM_MAP_DEBUG_FOURK +int vm_map_debug_fourk = 0; +#endif /* VM_MAP_DEBUG_FOURK */ extern u_int32_t random(void); /* from */ /* Internal prototypes @@ -186,6 +196,7 @@ static kern_return_t vm_map_copyout_kernel_buffer( vm_map_t map, vm_map_address_t *addr, /* IN/OUT */ vm_map_copy_t copy, + vm_map_size_t copy_size, boolean_t overwrite, boolean_t consume_on_success); @@ -197,7 +208,8 @@ static void vm_map_fork_share( static boolean_t vm_map_fork_copy( vm_map_t old_map, vm_map_entry_t *old_entry_p, - vm_map_t new_map); + vm_map_t new_map, + int vm_map_copyin_flags); void vm_map_region_top_walk( vm_map_entry_t entry, @@ -253,7 +265,8 @@ static kern_return_t vm_map_remap_extract( vm_prot_t *cur_protection, vm_prot_t *max_protection, vm_inherit_t inheritance, - boolean_t pageable); + boolean_t pageable, + boolean_t same_map); static kern_return_t vm_map_remap_range_allocate( vm_map_t map, @@ -330,6 +343,7 @@ boolean_t _vmec_reserved = (NEW)->from_reserved_zone; \ (NEW)->iokit_acct = FALSE; \ (NEW)->vme_resilient_codesign = FALSE; \ (NEW)->vme_resilient_media = FALSE; \ + (NEW)->vme_atomic = FALSE; \ MACRO_END #define vm_map_entry_copy_full(NEW,OLD) \ @@ -444,7 +458,7 @@ override_nx(vm_map_t map, uint32_t user_tag) /* map unused on arm */ static zone_t vm_map_zone; /* zone for vm_map structures */ static zone_t vm_map_entry_zone; /* zone for vm_map_entry structures */ -static zone_t vm_map_entry_reserved_zone; /* zone with reserve for non-blocking +zone_t vm_map_entry_reserved_zone; /* zone with reserve for 
non-blocking * allocations */ static zone_t vm_map_copy_zone; /* zone for vm_map_copy structures */ zone_t vm_map_holes_zone; /* zone for vm map holes (vm_map_links) structures */ @@ -660,20 +674,22 @@ vm_map_apple_protected( assert(map_addr == tmp_entry.vme_start); #if VM_MAP_DEBUG_APPLE_PROTECT - printf("APPLE_PROTECT: map %p [0x%llx:0x%llx] pager %p: " - "backing:[object:%p,offset:0x%llx," - "crypto_backing_offset:0x%llx," - "crypto_start:0x%llx,crypto_end:0x%llx]\n", - map, - (uint64_t) map_addr, - (uint64_t) (map_addr + (tmp_entry.vme_end - - tmp_entry.vme_start)), - unprotected_mem_obj, - protected_object, - VME_OFFSET(&tmp_entry), - crypto_backing_offset, - crypto_start, - crypto_end); + if (vm_map_debug_apple_protect) { + printf("APPLE_PROTECT: map %p [0x%llx:0x%llx] pager %p:" + " backing:[object:%p,offset:0x%llx," + "crypto_backing_offset:0x%llx," + "crypto_start:0x%llx,crypto_end:0x%llx]\n", + map, + (uint64_t) map_addr, + (uint64_t) (map_addr + (tmp_entry.vme_end - + tmp_entry.vme_start)), + unprotected_mem_obj, + protected_object, + VME_OFFSET(&tmp_entry), + crypto_backing_offset, + crypto_start, + crypto_end); + } #endif /* VM_MAP_DEBUG_APPLE_PROTECT */ /* @@ -768,6 +784,7 @@ vm_map_init( * Set reserved_zone non-collectible to aid zone_gc(). 
*/ zone_change(vm_map_zone, Z_COLLECT, FALSE); + zone_change(vm_map_zone, Z_FOREIGN, TRUE); zone_change(vm_map_entry_reserved_zone, Z_COLLECT, FALSE); zone_change(vm_map_entry_reserved_zone, Z_EXPAND, FALSE); @@ -799,9 +816,16 @@ vm_map_init( lck_attr_setdefault(&vm_map_lck_rw_attr); lck_attr_cleardebug(&vm_map_lck_rw_attr); -#if CONFIG_FREEZE - default_freezer_init(); -#endif /* CONFIG_FREEZE */ +#if VM_MAP_DEBUG_APPLE_PROTECT + PE_parse_boot_argn("vm_map_debug_apple_protect", + &vm_map_debug_apple_protect, + sizeof(vm_map_debug_apple_protect)); +#endif /* VM_MAP_DEBUG_APPLE_PROTECT */ +#if VM_MAP_DEBUG_FOURK + PE_parse_boot_argn("vm_map_debug_fourk", + &vm_map_debug_fourk, + sizeof(vm_map_debug_fourk)); +#endif /* VM_MAP_DEBUG_FOURK */ } void @@ -901,7 +925,7 @@ vm_map_create( boolean_t pageable) { static int color_seed = 0; - register vm_map_t result; + vm_map_t result; struct vm_map_links *hole_entry = NULL; result = (vm_map_t) zalloc(vm_map_zone); @@ -935,6 +959,7 @@ vm_map_create( result->switch_protect = FALSE; result->disable_vmentry_reuse = FALSE; result->map_disallow_data_exec = FALSE; + result->is_nested_map = FALSE; result->highest_entry_end = 0; result->first_free = vm_map_to_entry(result); result->hint = vm_map_to_entry(result); @@ -955,9 +980,6 @@ vm_map_create( result->holelistenabled = FALSE; } -#if CONFIG_FREEZE - result->default_freezer_handle = NULL; -#endif vm_map_lock_init(result); lck_mtx_init_ext(&result->s_lock, &result->s_lock_ext, &vm_map_lck_grp, &vm_map_lck_attr); @@ -1008,8 +1030,8 @@ _vm_map_entry_create( vm_map_store_update( (vm_map_t) NULL, entry, VM_MAP_ENTRY_CREATE); #if MAP_ENTRY_CREATION_DEBUG entry->vme_creation_maphdr = map_header; - fastbacktrace(&entry->vme_creation_bt[0], - (sizeof(entry->vme_creation_bt)/sizeof(uintptr_t))); + backtrace(&entry->vme_creation_bt[0], + (sizeof(entry->vme_creation_bt)/sizeof(uintptr_t))); #endif return(entry); } @@ -1031,10 +1053,10 @@ _vm_map_entry_create( static void 
_vm_map_entry_dispose( - register struct vm_map_header *map_header, - register vm_map_entry_t entry) + struct vm_map_header *map_header, + vm_map_entry_t entry) { - register zone_t zone; + zone_t zone; if (map_header->entries_pageable || !(entry->from_reserved_zone)) zone = vm_map_entry_zone; @@ -1081,7 +1103,7 @@ first_free_is_valid( * vm_map_swapin. * */ -void vm_map_res_reference(register vm_map_t map) +void vm_map_res_reference(vm_map_t map) { /* assert map is locked */ assert(map->res_count >= 0); @@ -1105,7 +1127,7 @@ void vm_map_res_reference(register vm_map_t map) * The map may not be in memory (i.e. zero residence count). * */ -void vm_map_reference_swap(register vm_map_t map) +void vm_map_reference_swap(vm_map_t map) { assert(map != VM_MAP_NULL); lck_mtx_lock(&map->s_lock); @@ -1126,7 +1148,7 @@ void vm_map_reference_swap(register vm_map_t map) * The map is locked, so this function is callable from vm_map_deallocate. * */ -void vm_map_res_deallocate(register vm_map_t map) +void vm_map_res_deallocate(vm_map_t map) { assert(map->res_count > 0); if (--map->res_count == 0) { @@ -1162,12 +1184,6 @@ vm_map_destroy( (void) vm_map_delete(map, 0x0, 0xFFFFFFFFFFFFF000ULL, flags, VM_MAP_NULL); -#if CONFIG_FREEZE - if (map->default_freezer_handle) { - default_freezer_handle_deallocate(map->default_freezer_handle); - map->default_freezer_handle = NULL; - } -#endif vm_map_disable_hole_optimization(map); vm_map_unlock(map); @@ -1176,6 +1192,28 @@ vm_map_destroy( if(map->pmap) pmap_destroy(map->pmap); + if (vm_map_lck_attr.lck_attr_val & LCK_ATTR_DEBUG) { + /* + * If lock debugging is enabled the mutexes get tagged as LCK_MTX_TAG_INDIRECT. + * And this is regardless of whether the lck_mtx_ext_t is embedded in the + * structure or kalloc'ed via lck_mtx_init. + * An example is s_lock_ext within struct _vm_map. + * + * A lck_mtx_destroy on such a mutex will attempt a kfree and panic. 
We + * can add another tag to detect embedded vs alloc'ed indirect external + * mutexes but that'll be additional checks in the lock path and require + * updating dependencies for the old vs new tag. + * + * Since the kfree() is for LCK_MTX_TAG_INDIRECT mutexes and that tag is applied + * just when lock debugging is ON, we choose to forego explicitly destroying + * the vm_map mutex and rw lock and, as a consequence, will overflow the reference + * count on vm_map_lck_grp, which has no serious side-effect. + */ + } else { + lck_rw_destroy(&(map)->lock, &vm_map_lck_grp); + lck_mtx_destroy(&(map)->s_lock, &vm_map_lck_grp); + } + zfree(vm_map_zone, map); } @@ -1231,7 +1269,7 @@ int vm_map_swap_enable = 1; void vm_map_swapin (vm_map_t map) { - register vm_map_entry_t entry; + vm_map_entry_t entry; if (!vm_map_swap_enable) /* debug */ return; @@ -1299,7 +1337,7 @@ void vm_map_swapin (vm_map_t map) void vm_map_swapout(vm_map_t map) { - register vm_map_entry_t entry; + vm_map_entry_t entry; /* * Map is locked @@ -1374,8 +1412,8 @@ void vm_map_swapout(vm_map_t map) */ boolean_t vm_map_lookup_entry( - register vm_map_t map, - register vm_map_offset_t address, + vm_map_t map, + vm_map_offset_t address, vm_map_entry_t *entry) /* OUT */ { return ( vm_map_store_lookup_entry( map, address, entry )); @@ -1396,7 +1434,7 @@ vm_map_lookup_entry( */ kern_return_t vm_map_find_space( - register vm_map_t map, + vm_map_t map, vm_map_offset_t *address, /* OUT */ vm_map_size_t size, vm_map_offset_t mask, @@ -1404,8 +1442,8 @@ vm_map_find_space( vm_map_entry_t *o_entry) /* OUT */ { vm_map_entry_t entry, new_entry; - register vm_map_offset_t start; - register vm_map_offset_t end; + vm_map_offset_t start; + vm_map_offset_t end; vm_map_entry_t hole_entry; if (size == 0) { @@ -1459,7 +1497,7 @@ vm_map_find_space( */ while (TRUE) { - register vm_map_entry_t next; + vm_map_entry_t next; /* * Find the end of the proposed new region. 
@@ -1593,6 +1631,10 @@ vm_map_find_space( new_entry->iokit_acct = FALSE; new_entry->vme_resilient_codesign = FALSE; new_entry->vme_resilient_media = FALSE; + if (flags & VM_FLAGS_ATOMIC_ENTRY) + new_entry->vme_atomic = TRUE; + else + new_entry->vme_atomic = FALSE; int alias; VM_GET_FLAGS_ALIAS(flags, alias); @@ -1635,9 +1677,9 @@ int vm_map_pmap_enter_enable = FALSE; __unused static void vm_map_pmap_enter( vm_map_t map, - register vm_map_offset_t addr, - register vm_map_offset_t end_addr, - register vm_object_t object, + vm_map_offset_t addr, + vm_map_offset_t end_addr, + vm_object_t object, vm_object_offset_t offset, vm_prot_t protection) { @@ -1648,7 +1690,7 @@ vm_map_pmap_enter( return; while (addr < end_addr) { - register vm_page_t m; + vm_page_t m; /* @@ -1833,6 +1875,7 @@ vm_map_enter( boolean_t iokit_acct = ((flags & VM_FLAGS_IOKIT_ACCT) != 0); boolean_t resilient_codesign = ((flags & VM_FLAGS_RESILIENT_CODESIGN) != 0); boolean_t resilient_media = ((flags & VM_FLAGS_RESILIENT_MEDIA) != 0); + boolean_t random_address = ((flags & VM_FLAGS_RANDOM_ADDR) != 0); unsigned int superpage_size = ((flags & VM_FLAGS_SUPERPAGE_MASK) >> VM_FLAGS_SUPERPAGE_SHIFT); vm_tag_t alias, user_alias; vm_map_offset_t effective_min_offset, effective_max_offset; @@ -1997,6 +2040,10 @@ StartAgain: ; result = KERN_INVALID_ARGUMENT; goto BailOut; } + random_address = TRUE; + } + + if (random_address) { /* * Get a random start address. */ @@ -2112,7 +2159,7 @@ StartAgain: ; */ while (TRUE) { - register vm_map_entry_t next; + vm_map_entry_t next; /* * Find the end of the proposed new region. @@ -2402,6 +2449,7 @@ StartAgain: ; (entry->iokit_acct == iokit_acct) && (!entry->vme_resilient_codesign) && (!entry->vme_resilient_media) && + (!entry->vme_atomic) && ((entry->vme_end - entry->vme_start) + size <= (user_alias == VM_MEMORY_REALLOC ? 
@@ -2600,7 +2648,7 @@ StartAgain: ; /* create one vm_object per superpage */ sp_object = vm_object_allocate((vm_map_size_t)(entry->vme_end - entry->vme_start)); sp_object->phys_contiguous = TRUE; - sp_object->vo_shadow_offset = (vm_object_offset_t)pages->phys_page*PAGE_SIZE; + sp_object->vo_shadow_offset = (vm_object_offset_t)VM_PAGE_GET_PHYS_PAGE(pages)*PAGE_SIZE; VME_OBJECT_SET(entry, sp_object); assert(entry->use_pmap); @@ -2608,7 +2656,7 @@ StartAgain: ; vm_object_lock(sp_object); for (offset = 0; offset < SUPERPAGE_SIZE; offset += PAGE_SIZE) { m = pages; - pmap_zero_page(m->phys_page); + pmap_zero_page(VM_PAGE_GET_PHYS_PAGE(m)); pages = NEXT_PAGE(m); *(NEXT_PAGE_PTR(m)) = VM_PAGE_NULL; vm_page_insert_wired(m, sp_object, offset, VM_KERN_MEMORY_OSFMK); @@ -3099,7 +3147,8 @@ vm_map_enter_mem_object_helper( VM_FLAGS_ANYWHERE | VM_FLAGS_OVERWRITE | VM_FLAGS_RETURN_4K_DATA_ADDR | - VM_FLAGS_RETURN_DATA_ADDR)) { + VM_FLAGS_RETURN_DATA_ADDR | + VM_FLAGS_ALIAS_MASK)) { named_entry_unlock(named_entry); return KERN_INVALID_ARGUMENT; } @@ -3134,7 +3183,8 @@ vm_map_enter_mem_object_helper( flags & (VM_FLAGS_ANYWHERE | VM_FLAGS_OVERWRITE | VM_FLAGS_RETURN_4K_DATA_ADDR | - VM_FLAGS_RETURN_DATA_ADDR), + VM_FLAGS_RETURN_DATA_ADDR | + VM_FLAGS_ALIAS_MASK), VM_OBJECT_NULL, 0, FALSE, /* copy */ @@ -3156,10 +3206,20 @@ vm_map_enter_mem_object_helper( vm_object_t copy_object; vm_map_size_t copy_size; vm_object_offset_t copy_offset; + int copy_vm_alias; copy_offset = VME_OFFSET(copy_entry); copy_size = (copy_entry->vme_end - copy_entry->vme_start); + VM_GET_FLAGS_ALIAS(flags, copy_vm_alias); + if (copy_vm_alias == 0) { + /* + * Caller does not want a specific + * alias for this new mapping: use + * the alias of the original mapping. 
+ */ + copy_vm_alias = VME_ALIAS(copy_entry); + } /* sanity check */ if ((copy_addr + copy_size) > @@ -3189,6 +3249,7 @@ vm_map_enter_mem_object_helper( remap_flags |= VM_FLAGS_FIXED; remap_flags |= VM_FLAGS_OVERWRITE; remap_flags &= ~VM_FLAGS_ANYWHERE; + remap_flags |= VM_MAKE_TAG(copy_vm_alias); kr = vm_map_enter(target_map, &copy_addr, copy_size, @@ -3446,6 +3507,12 @@ vm_map_enter_mem_object_helper( mach_vm_address_t va = map_addr; kern_return_t kr = KERN_SUCCESS; unsigned int i = 0; + int pmap_options; + + pmap_options = PMAP_OPTIONS_NOWAIT; + if (object->internal) { + pmap_options |= PMAP_OPTIONS_INTERNAL; + } for (i = 0; i < page_list_count; ++i) { if (UPL_VALID_PAGE(page_list, i)) { @@ -3461,7 +3528,7 @@ vm_map_enter_mem_object_helper( kr = pmap_enter_options(target_map->pmap, va, UPL_PHYS_PAGE(page_list, i), cur_protection, VM_PROT_NONE, - 0, TRUE, PMAP_OPTIONS_NOWAIT, NULL); + 0, TRUE, pmap_options, NULL); if (kr != KERN_SUCCESS) { OSIncrementAtomic64(&vm_prefault_nb_bailout); break; @@ -3771,7 +3838,7 @@ vm_map_enter_cpm( */ ASSERT_PAGE_DECRYPTED(m); assert(m->busy); - assert(m->phys_page>=(avail_start>>PAGE_SHIFT) && m->phys_page<=(avail_end>>PAGE_SHIFT)); + assert(VM_PAGE_GET_PHYS_PAGE(m)>=(avail_start>>PAGE_SHIFT) && VM_PAGE_GET_PHYS_PAGE(m)<=(avail_end>>PAGE_SHIFT)); m->busy = FALSE; vm_page_insert(m, cpm_obj, offset); @@ -3881,7 +3948,7 @@ vm_map_enter_cpm( assert(!m->precious); assert(!m->clustered); if (offset != 0) { - if (m->phys_page != prev_addr + 1) { + if (VM_PAGE_GET_PHYS_PAGE(m) != prev_addr + 1) { printf("start 0x%llx end 0x%llx va 0x%llx\n", (uint64_t)start, (uint64_t)end, (uint64_t)va); printf("obj %p off 0x%llx\n", cpm_obj, (uint64_t)offset); @@ -3889,7 +3956,7 @@ vm_map_enter_cpm( panic("vm_allocate_cpm: pages not contig!"); } } - prev_addr = m->phys_page; + prev_addr = VM_PAGE_GET_PHYS_PAGE(m); } #endif /* MACH_ASSERT */ @@ -3948,7 +4015,15 @@ vm_map_clip_unnest( * depending on size/alignment. 
*/ if (pmap_adjust_unnest_parameters(map->pmap, &start_unnest, &end_unnest)) { - log_unnest_badness(map, old_start_unnest, old_end_unnest); + assert(VME_SUBMAP(entry)->is_nested_map); + assert(!VME_SUBMAP(entry)->disable_vmentry_reuse); + log_unnest_badness(map, + old_start_unnest, + old_end_unnest, + VME_SUBMAP(entry)->is_nested_map, + (entry->vme_start + + VME_SUBMAP(entry)->lowest_unnestable_start - + VME_OFFSET(entry))); } if (entry->vme_start > start_unnest || @@ -4038,6 +4113,9 @@ vm_map_clip_start( (addr64_t)(entry->vme_start), (addr64_t)(entry->vme_end)); } + if (entry->vme_atomic) { + panic("Attempting to clip an atomic VM entry! (map: %p, entry: %p)\n", map, entry); + } _vm_map_clip_start(&map->hdr, entry, startaddr); if (map->holelistenabled) { vm_map_store_update_first_free(map, NULL, FALSE); @@ -4060,11 +4138,11 @@ vm_map_clip_start( */ static void _vm_map_clip_start( - register struct vm_map_header *map_header, - register vm_map_entry_t entry, - register vm_map_offset_t start) + struct vm_map_header *map_header, + vm_map_entry_t entry, + vm_map_offset_t start) { - register vm_map_entry_t new_entry; + vm_map_entry_t new_entry; /* * Split off the front portion -- @@ -4143,6 +4221,9 @@ vm_map_clip_end( (addr64_t)(entry->vme_start), (addr64_t)(entry->vme_end)); } + if (entry->vme_atomic) { + panic("Attempting to clip an atomic VM entry! 
(map: %p, entry: %p)\n", map, entry); + } _vm_map_clip_end(&map->hdr, entry, endaddr); if (map->holelistenabled) { vm_map_store_update_first_free(map, NULL, FALSE); @@ -4165,11 +4246,11 @@ vm_map_clip_end( */ static void _vm_map_clip_end( - register struct vm_map_header *map_header, - register vm_map_entry_t entry, - register vm_map_offset_t end) + struct vm_map_header *map_header, + vm_map_entry_t entry, + vm_map_offset_t end) { - register vm_map_entry_t new_entry; + vm_map_entry_t new_entry; /* * Create a new entry and insert it @@ -4231,13 +4312,13 @@ _vm_map_clip_end( */ static boolean_t vm_map_range_check( - register vm_map_t map, - register vm_map_offset_t start, - register vm_map_offset_t end, + vm_map_t map, + vm_map_offset_t start, + vm_map_offset_t end, vm_map_entry_t *entry) { vm_map_entry_t cur; - register vm_map_offset_t prev; + vm_map_offset_t prev; /* * Basic sanity checks first @@ -4307,8 +4388,8 @@ vm_map_submap( boolean_t use_pmap) { vm_map_entry_t entry; - register kern_return_t result = KERN_INVALID_ARGUMENT; - register vm_object_t object; + kern_return_t result = KERN_INVALID_ARGUMENT; + vm_object_t object; vm_map_lock(map); @@ -4395,14 +4476,14 @@ vm_map_submap( */ kern_return_t vm_map_protect( - register vm_map_t map, - register vm_map_offset_t start, - register vm_map_offset_t end, - register vm_prot_t new_prot, - register boolean_t set_max) -{ - register vm_map_entry_t current; - register vm_map_offset_t prev; + vm_map_t map, + vm_map_offset_t start, + vm_map_offset_t end, + vm_prot_t new_prot, + boolean_t set_max) +{ + vm_map_entry_t current; + vm_map_offset_t prev; vm_map_entry_t entry; vm_prot_t new_max; @@ -4476,6 +4557,8 @@ vm_map_protect( prev = current->vme_end; current = current->vme_next; } + + if (end > prev) { vm_map_unlock(map); return(KERN_INVALID_ADDRESS); @@ -4553,7 +4636,13 @@ vm_map_protect( vm_prot_t prot; - prot = current->protection & ~VM_PROT_WRITE; + prot = current->protection; + if (current->is_sub_map || 
(VME_OBJECT(current) == NULL) || (VME_OBJECT(current) != compressor_object)) { + prot &= ~VM_PROT_WRITE; + } else { + assert(!VME_OBJECT(current)->code_signed); + assert(VME_OBJECT(current)->copy_strategy == MEMORY_OBJECT_COPY_NONE); + } if (override_nx(map, VME_ALIAS(current)) && prot) prot |= VM_PROT_EXECUTE; @@ -4595,12 +4684,12 @@ vm_map_protect( */ kern_return_t vm_map_inherit( - register vm_map_t map, - register vm_map_offset_t start, - register vm_map_offset_t end, - register vm_inherit_t new_inheritance) + vm_map_t map, + vm_map_offset_t start, + vm_map_offset_t end, + vm_inherit_t new_inheritance) { - register vm_map_entry_t entry; + vm_map_entry_t entry; vm_map_entry_t temp_entry; vm_map_lock(map); @@ -4762,6 +4851,7 @@ subtract_wire_counts( } } + /* * vm_map_wire: * @@ -4777,20 +4867,20 @@ subtract_wire_counts( */ static kern_return_t vm_map_wire_nested( - register vm_map_t map, - register vm_map_offset_t start, - register vm_map_offset_t end, - register vm_prot_t caller_prot, + vm_map_t map, + vm_map_offset_t start, + vm_map_offset_t end, + vm_prot_t caller_prot, boolean_t user_wire, pmap_t map_pmap, vm_map_offset_t pmap_addr, ppnum_t *physpage_p) { - register vm_map_entry_t entry; - register vm_prot_t access_type; + vm_map_entry_t entry; + vm_prot_t access_type; struct vm_map_entry *first_entry, tmp_entry; vm_map_t real_map; - register vm_map_offset_t s,e; + vm_map_offset_t s,e; kern_return_t rc; boolean_t need_wakeup; boolean_t main_map = FALSE; @@ -5001,7 +5091,7 @@ vm_map_wire_nested( vm_map_lock_write_to_read(map); if(vm_map_lookup_locked( &lookup_map, local_start, - access_type, + access_type | VM_PROT_COPY, OBJECT_LOCK_EXCLUSIVE, &version, &object, &offset, &prot, &wired, @@ -5170,12 +5260,12 @@ vm_map_wire_nested( } m = vm_page_lookup(object, offset); assert(m != VM_PAGE_NULL); - assert(m->wire_count); - if (m != VM_PAGE_NULL && m->wire_count) { - *physpage_p = m->phys_page; + assert(VM_PAGE_WIRED(m)); + if (m != VM_PAGE_NULL && 
VM_PAGE_WIRED(m)) { + *physpage_p = VM_PAGE_GET_PHYS_PAGE(m); if (entry->protection & VM_PROT_WRITE) { vm_object_lock_assert_exclusive( - m->object); + object); m->dirty = TRUE; } } else { @@ -5196,6 +5286,7 @@ vm_map_wire_nested( */ + /* * Perform actions of vm_map_lookup that need the write * lock on the map: create a shadow object for a @@ -5371,10 +5462,10 @@ vm_map_wire_nested( kern_return_t vm_map_wire_external( - register vm_map_t map, - register vm_map_offset_t start, - register vm_map_offset_t end, - register vm_prot_t caller_prot, + vm_map_t map, + vm_map_offset_t start, + vm_map_offset_t end, + vm_prot_t caller_prot, boolean_t user_wire) { kern_return_t kret; @@ -5388,10 +5479,10 @@ vm_map_wire_external( kern_return_t vm_map_wire( - register vm_map_t map, - register vm_map_offset_t start, - register vm_map_offset_t end, - register vm_prot_t caller_prot, + vm_map_t map, + vm_map_offset_t start, + vm_map_offset_t end, + vm_prot_t caller_prot, boolean_t user_wire) { kern_return_t kret; @@ -5468,14 +5559,14 @@ vm_map_wire_and_extract( */ static kern_return_t vm_map_unwire_nested( - register vm_map_t map, - register vm_map_offset_t start, - register vm_map_offset_t end, + vm_map_t map, + vm_map_offset_t start, + vm_map_offset_t end, boolean_t user_wire, pmap_t map_pmap, vm_map_offset_t pmap_addr) { - register vm_map_entry_t entry; + vm_map_entry_t entry; struct vm_map_entry *first_entry, tmp_entry; boolean_t need_wakeup; boolean_t main_map = FALSE; @@ -5801,9 +5892,9 @@ vm_map_unwire_nested( kern_return_t vm_map_unwire( - register vm_map_t map, - register vm_map_offset_t start, - register vm_map_offset_t end, + vm_map_t map, + vm_map_offset_t start, + vm_map_offset_t end, boolean_t user_wire) { return vm_map_unwire_nested(map, start, end, @@ -5818,12 +5909,12 @@ vm_map_unwire( */ static void vm_map_entry_delete( - register vm_map_t map, - register vm_map_entry_t entry) + vm_map_t map, + vm_map_entry_t entry) { - register vm_map_offset_t s, e; - register 
vm_object_t object; - register vm_map_t submap; + vm_map_offset_t s, e; + vm_object_t object; + vm_map_t submap; s = entry->vme_start; e = entry->vme_end; @@ -5980,8 +6071,8 @@ vm_map_delete( { vm_map_entry_t entry, next; struct vm_map_entry *first_entry, tmp_entry; - register vm_map_offset_t s; - register vm_object_t object; + vm_map_offset_t s; + vm_object_t object; boolean_t need_wakeup; unsigned int last_timestamp = ~0; /* unlikely value */ int interruptible; @@ -6585,12 +6676,12 @@ vm_map_delete( */ kern_return_t vm_map_remove( - register vm_map_t map, - register vm_map_offset_t start, - register vm_map_offset_t end, - register boolean_t flags) + vm_map_t map, + vm_map_offset_t start, + vm_map_offset_t end, + boolean_t flags) { - register kern_return_t result; + kern_return_t result; vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); @@ -6609,6 +6700,26 @@ vm_map_remove( return(result); } +/* + * vm_map_remove_locked: + * + * Remove the given address range from the target locked map. + * This is the exported form of vm_map_delete. + */ +kern_return_t +vm_map_remove_locked( + vm_map_t map, + vm_map_offset_t start, + vm_map_offset_t end, + boolean_t flags) +{ + kern_return_t result; + + VM_MAP_RANGE_CHECK(map, start, end); + result = vm_map_delete(map, start, end, flags, VM_MAP_NULL); + return(result); +} + /* * Routine: vm_map_copy_discard @@ -6950,7 +7061,7 @@ vm_map_copy_overwrite_nested( if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) { return(vm_map_copyout_kernel_buffer( dst_map, &dst_addr, - copy, TRUE, discard_on_success)); + copy, copy->size, TRUE, discard_on_success)); } /* @@ -8481,7 +8592,7 @@ vm_map_copyin_kernel_buffer( VM_MAP_PAGE_MASK(src_map)), (VM_MAP_REMOVE_INTERRUPTIBLE | VM_MAP_REMOVE_WAIT_FOR_KWIRE | - (src_map == kernel_map) ? VM_MAP_REMOVE_KUNWIRE : 0)); + ((src_map == kernel_map) ? 
VM_MAP_REMOVE_KUNWIRE : 0))); } *copy_result = copy; return KERN_SUCCESS; @@ -8504,16 +8615,19 @@ vm_map_copyout_kernel_buffer( vm_map_t map, vm_map_address_t *addr, /* IN/OUT */ vm_map_copy_t copy, + vm_map_size_t copy_size, boolean_t overwrite, boolean_t consume_on_success) { kern_return_t kr = KERN_SUCCESS; thread_t thread = current_thread(); + assert(copy->size == copy_size); + /* * check for corrupted vm_map_copy structure */ - if (copy->size > msg_ool_size_small || copy->offset) + if (copy_size > msg_ool_size_small || copy->offset) panic("Invalid vm_map_copy_t sz:%lld, ofst:%lld", (long long)copy->size, (long long)copy->offset); @@ -8525,7 +8639,7 @@ vm_map_copyout_kernel_buffer( *addr = 0; kr = vm_map_enter(map, addr, - vm_map_round_page(copy->size, + vm_map_round_page(copy_size, VM_MAP_PAGE_MASK(map)), (vm_map_offset_t) 0, VM_FLAGS_ANYWHERE, @@ -8548,8 +8662,8 @@ vm_map_copyout_kernel_buffer( * If the target map is the current map, just do * the copy. */ - assert((vm_size_t) copy->size == copy->size); - if (copyout(copy->cpy_kdata, *addr, (vm_size_t) copy->size)) { + assert((vm_size_t)copy_size == copy_size); + if (copyout(copy->cpy_kdata, *addr, (vm_size_t)copy_size)) { kr = KERN_INVALID_ADDRESS; } } @@ -8564,8 +8678,8 @@ vm_map_copyout_kernel_buffer( vm_map_reference(map); oldmap = vm_map_switch(map); - assert((vm_size_t) copy->size == copy->size); - if (copyout(copy->cpy_kdata, *addr, (vm_size_t) copy->size)) { + assert((vm_size_t)copy_size == copy_size); + if (copyout(copy->cpy_kdata, *addr, (vm_size_t)copy_size)) { vm_map_copyout_kernel_buffer_failures++; kr = KERN_INVALID_ADDRESS; } @@ -8585,7 +8699,7 @@ vm_map_copyout_kernel_buffer( vm_map_trunc_page(*addr, VM_MAP_PAGE_MASK(map)), vm_map_round_page((*addr + - vm_map_round_page(copy->size, + vm_map_round_page(copy_size, VM_MAP_PAGE_MASK(map))), VM_MAP_PAGE_MASK(map)), VM_MAP_NO_FLAGS); @@ -8594,7 +8708,7 @@ vm_map_copyout_kernel_buffer( } else { /* copy was successful, dicard the copy structure */ if 
(consume_on_success) { - kfree(copy, copy->size + cpy_kdata_hdr_sz); + kfree(copy, copy_size + cpy_kdata_hdr_sz); } } @@ -8663,18 +8777,25 @@ vm_map_copy_remap( } +/* + * Returns true if *size matches (or is in the range of) copy->size. + * Upon returning true, the *size field is updated with the actual size of the + * copy object (may be different for VM_MAP_COPY_ENTRY_LIST types) + */ boolean_t vm_map_copy_validate_size( vm_map_t dst_map, vm_map_copy_t copy, - vm_map_size_t size) + vm_map_size_t *size) { if (copy == VM_MAP_COPY_NULL) return FALSE; + vm_map_size_t copy_sz = copy->size; + vm_map_size_t sz = *size; switch (copy->type) { case VM_MAP_COPY_OBJECT: case VM_MAP_COPY_KERNEL_BUFFER: - if (size == copy->size) + if (sz == copy_sz) return TRUE; break; case VM_MAP_COPY_ENTRY_LIST: @@ -8683,10 +8804,11 @@ vm_map_copy_validate_size( * validating this flavor of vm_map_copy, but we can at least * assert that it's within a range. */ - if (copy->size >= size && - copy->size <= vm_map_round_page(size, - VM_MAP_PAGE_MASK(dst_map))) + if (copy_sz >= sz && + copy_sz <= vm_map_round_page(sz, VM_MAP_PAGE_MASK(dst_map))) { + *size = copy_sz; return TRUE; + } break; default: break; @@ -8694,6 +8816,30 @@ vm_map_copy_validate_size( return FALSE; } +/* + * Routine: vm_map_copyout_size + * + * Description: + * Copy out a copy chain ("copy") into newly-allocated + * space in the destination map. Uses a prevalidated + * size for the copy object (vm_map_copy_validate_size). + * + * If successful, consumes the copy object. + * Otherwise, the caller is responsible for it. 
+ */ +kern_return_t +vm_map_copyout_size( + vm_map_t dst_map, + vm_map_address_t *dst_addr, /* OUT */ + vm_map_copy_t copy, + vm_map_size_t copy_size) +{ + return vm_map_copyout_internal(dst_map, dst_addr, copy, copy_size, + TRUE, /* consume_on_success */ + VM_PROT_DEFAULT, + VM_PROT_ALL, + VM_INHERIT_DEFAULT); +} /* * Routine: vm_map_copyout @@ -8705,18 +8851,17 @@ vm_map_copy_validate_size( * If successful, consumes the copy object. * Otherwise, the caller is responsible for it. */ - kern_return_t vm_map_copyout( vm_map_t dst_map, vm_map_address_t *dst_addr, /* OUT */ vm_map_copy_t copy) { - return vm_map_copyout_internal(dst_map, dst_addr, copy, - TRUE, /* consume_on_success */ - VM_PROT_DEFAULT, - VM_PROT_ALL, - VM_INHERIT_DEFAULT); + return vm_map_copyout_internal(dst_map, dst_addr, copy, copy ? copy->size : 0, + TRUE, /* consume_on_success */ + VM_PROT_DEFAULT, + VM_PROT_ALL, + VM_INHERIT_DEFAULT); } kern_return_t @@ -8724,6 +8869,7 @@ vm_map_copyout_internal( vm_map_t dst_map, vm_map_address_t *dst_addr, /* OUT */ vm_map_copy_t copy, + vm_map_size_t copy_size, boolean_t consume_on_success, vm_prot_t cur_protection, vm_prot_t max_protection, @@ -8746,6 +8892,11 @@ vm_map_copyout_internal( return(KERN_SUCCESS); } + if (copy->size != copy_size) { + *dst_addr = 0; + return KERN_FAILURE; + } + /* * Check for special copy object, created * by vm_map_copyin_object. 
@@ -8757,7 +8908,7 @@ vm_map_copyout_internal( vm_object_offset_t offset; offset = vm_object_trunc_page(copy->offset); - size = vm_map_round_page((copy->size + + size = vm_map_round_page((copy_size + (vm_map_size_t)(copy->offset - offset)), VM_MAP_PAGE_MASK(dst_map)); @@ -8782,8 +8933,8 @@ vm_map_copyout_internal( */ if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) { - return vm_map_copyout_kernel_buffer(dst_map, dst_addr, - copy, FALSE, + return vm_map_copyout_kernel_buffer(dst_map, dst_addr, + copy, copy_size, FALSE, consume_on_success); } @@ -8794,7 +8945,7 @@ vm_map_copyout_internal( vm_copy_start = vm_map_trunc_page((vm_map_size_t)copy->offset, VM_MAP_COPY_PAGE_MASK(copy)); - size = vm_map_round_page((vm_map_size_t)copy->offset + copy->size, + size = vm_map_round_page((vm_map_size_t)copy->offset + copy_size, VM_MAP_COPY_PAGE_MASK(copy)) - vm_copy_start; @@ -8992,9 +9143,9 @@ StartAgain: ; * map the pages into the destination map. */ if (entry->wired_count != 0) { - register vm_map_offset_t va; + vm_map_offset_t va; vm_object_offset_t offset; - register vm_object_t object; + vm_object_t object; vm_prot_t prot; int type_of_fault; @@ -9008,7 +9159,7 @@ StartAgain: ; TRUE); while (va < entry->vme_end) { - register vm_page_t m; + vm_page_t m; /* * Look up the page in the object. @@ -9215,6 +9366,7 @@ vm_map_copyin_internal( vm_map_size_t copy_size; boolean_t src_destroy; boolean_t use_maxprot; + boolean_t preserve_purgeable; if (flags & ~VM_MAP_COPYIN_ALL_FLAGS) { return KERN_INVALID_ARGUMENT; @@ -9222,6 +9374,8 @@ vm_map_copyin_internal( src_destroy = (flags & VM_MAP_COPYIN_SRC_DESTROY) ? TRUE : FALSE; use_maxprot = (flags & VM_MAP_COPYIN_USE_MAXPROT) ? TRUE : FALSE; + preserve_purgeable = + (flags & VM_MAP_COPYIN_PRESERVE_PURGEABLE) ? TRUE : FALSE; /* * Check for copies of zero bytes. 
@@ -9239,6 +9393,14 @@ vm_map_copyin_internal( if (src_end < src_addr) return KERN_INVALID_ADDRESS; + /* + * Compute (page aligned) start and end of region + */ + src_start = vm_map_trunc_page(src_addr, + VM_MAP_PAGE_MASK(src_map)); + src_end = vm_map_round_page(src_end, + VM_MAP_PAGE_MASK(src_map)); + /* * If the copy is sufficiently small, use a kernel buffer instead * of making a virtual copy. The theory being that the cost of @@ -9247,18 +9409,22 @@ vm_map_copyin_internal( */ if ((len < msg_ool_size_small) && !use_maxprot && - !(flags & VM_MAP_COPYIN_ENTRY_LIST)) + !preserve_purgeable && + !(flags & VM_MAP_COPYIN_ENTRY_LIST) && + /* + * Since the "msg_ool_size_small" threshold was increased and + * vm_map_copyin_kernel_buffer() doesn't handle accesses beyond the + * address space limits, we revert to doing a virtual copy if the + * copied range goes beyond those limits. Otherwise, mach_vm_read() + * of the commpage would now fail when it used to work. + */ + (src_start >= vm_map_min(src_map) && + src_start < vm_map_max(src_map) && + src_end >= vm_map_min(src_map) && + src_end < vm_map_max(src_map))) return vm_map_copyin_kernel_buffer(src_map, src_addr, len, src_destroy, copy_result); - /* - * Compute (page aligned) start and end of region - */ - src_start = vm_map_trunc_page(src_addr, - VM_MAP_PAGE_MASK(src_map)); - src_end = vm_map_round_page(src_end, - VM_MAP_PAGE_MASK(src_map)); - XPR(XPR_VM_MAP, "vm_map_copyin_common map 0x%x addr 0x%x len 0x%x dest %d\n", src_map, src_addr, len, src_destroy, 0); /* @@ -9352,14 +9518,12 @@ vm_map_copyin_internal( */ while (TRUE) { - register vm_map_entry_t src_entry = tmp_entry; /* Top-level entry */ vm_map_size_t src_size; /* Size of source * map entry (in both * maps) */ - register vm_object_t src_object; /* Object to copy */ vm_object_offset_t src_offset; @@ -9619,6 +9783,37 @@ vm_map_copyin_internal( new_entry->needs_copy = new_entry_needs_copy; } + if (result == KERN_SUCCESS && + preserve_purgeable && + 
src_object->purgable != VM_PURGABLE_DENY) { + vm_object_t new_object; + + new_object = VME_OBJECT(new_entry); + assert(new_object != src_object); + vm_object_lock(new_object); + assert(new_object->ref_count == 1); + assert(new_object->shadow == VM_OBJECT_NULL); + assert(new_object->copy == VM_OBJECT_NULL); + assert(new_object->vo_purgeable_owner == NULL); + + new_object->copy_strategy = MEMORY_OBJECT_COPY_NONE; + new_object->true_share = TRUE; + /* start as non-volatile with no owner... */ + new_object->purgable = VM_PURGABLE_NONVOLATILE; + vm_purgeable_nonvolatile_enqueue(new_object, NULL); + /* ... and move to src_object's purgeable state */ + if (src_object->purgable != VM_PURGABLE_NONVOLATILE) { + int state; + state = src_object->purgable; + vm_object_purgable_control( + new_object, + VM_PURGABLE_SET_STATE, + &state); + } + vm_object_unlock(new_object); + new_object = VM_OBJECT_NULL; + } + if (result != KERN_SUCCESS && result != KERN_MEMORY_RESTART_COPY) { vm_map_lock(src_map); @@ -9672,14 +9867,39 @@ vm_map_copyin_internal( goto VerificationFailed; if (src_entry->vme_end < new_entry->vme_end) { + /* + * This entry might have been shortened + * (vm_map_clip_end) or been replaced with + * an entry that ends closer to "src_start" + * than before. + * Adjust "new_entry" accordingly; copying + * less memory would be correct but we also + * redo the copy (see below) if the new entry + * no longer points at the same object/offset. + */ assert(VM_MAP_PAGE_ALIGNED(src_entry->vme_end, VM_MAP_COPY_PAGE_MASK(copy))); new_entry->vme_end = src_entry->vme_end; src_size = new_entry->vme_end - src_start; + } else if (src_entry->vme_end > new_entry->vme_end) { + /* + * This entry might have been extended + * (vm_map_entry_simplify() or coalesce) + * or been replaced with an entry that ends farther + * from "src_start" than before. + * + * We've called vm_object_copy_*() only on + * the previous range, so we can't + * just extend new_entry. 
We have to re-do + * the copy based on the new entry as if it was + * pointing at a different object/offset (see + * "Verification failed" below). + */ } if ((VME_OBJECT(src_entry) != src_object) || - (VME_OFFSET(src_entry) != src_offset) ) { + (VME_OFFSET(src_entry) != src_offset) || + (src_entry->vme_end > new_entry->vme_end)) { /* * Verification failed. @@ -10036,7 +10256,8 @@ vm_map_copy_extract( cur_prot, max_prot, VM_INHERIT_SHARE, - TRUE); /* pageable */ + TRUE, /* pageable */ + FALSE); /* same_map */ if (kr != KERN_SUCCESS) { vm_map_copy_discard(copy); return kr; @@ -10288,6 +10509,17 @@ vm_map_fork_share( vm_map_entry_copy(new_entry, old_entry); old_entry->is_shared = TRUE; new_entry->is_shared = TRUE; + + /* + * If old entry's inheritence is VM_INHERIT_NONE, + * the new entry is for corpse fork, remove the + * write permission from the new entry. + */ + if (old_entry->inheritance == VM_INHERIT_NONE) { + + new_entry->protection &= ~VM_PROT_WRITE; + new_entry->max_protection &= ~VM_PROT_WRITE; + } /* * Insert the entry into the new map -- we @@ -10314,7 +10546,8 @@ static boolean_t vm_map_fork_copy( vm_map_t old_map, vm_map_entry_t *old_entry_p, - vm_map_t new_map) + vm_map_t new_map, + int vm_map_copyin_flags) { vm_map_entry_t old_entry = *old_entry_p; vm_map_size_t entry_size = old_entry->vme_end - old_entry->vme_start; @@ -10329,7 +10562,9 @@ vm_map_fork_copy( * be accessed, not just whether it's accessible * right now. */ - if (vm_map_copyin_maxprot(old_map, start, entry_size, FALSE, ©) + vm_map_copyin_flags |= VM_MAP_COPYIN_USE_MAXPROT; + if (vm_map_copyin_internal(old_map, start, entry_size, + vm_map_copyin_flags, ©) != KERN_SUCCESS) { /* * The map might have changed while it @@ -10390,14 +10625,15 @@ vm_map_fork_copy( * * Create and return a new map based on the old * map, according to the inheritance values on the - * regions in that map. + * regions in that map and the options. * * The source map must not be locked. 
*/ vm_map_t vm_map_fork( ledger_t ledger, - vm_map_t old_map) + vm_map_t old_map, + int options) { pmap_t new_pmap; vm_map_t new_map; @@ -10407,6 +10643,13 @@ vm_map_fork( boolean_t src_needs_copy; boolean_t new_entry_needs_copy; boolean_t pmap_is64bit; + int vm_map_copyin_flags; + + if (options & ~(VM_MAP_FORK_SHARE_IF_INHERIT_NONE | + VM_MAP_FORK_PRESERVE_PURGEABLE)) { + /* unsupported option */ + return VM_MAP_NULL; + } pmap_is64bit = #if defined(__i386__) || defined(__x86_64__) @@ -10424,6 +10667,7 @@ vm_map_fork( old_map->min_offset, old_map->max_offset, old_map->hdr.entries_pageable); + vm_commit_pagezero_status(new_map); /* inherit the parent map's page size */ vm_map_set_page_shift(new_map, VM_MAP_PAGE_SHIFT(old_map)); for ( @@ -10435,7 +10679,18 @@ vm_map_fork( switch (old_entry->inheritance) { case VM_INHERIT_NONE: - break; + /* + * Skip making a share entry if VM_MAP_FORK_SHARE_IF_INHERIT_NONE + * is not passed or it is backed by a device pager. + */ + if ((!(options & VM_MAP_FORK_SHARE_IF_INHERIT_NONE)) || + (!old_entry->is_sub_map && + VME_OBJECT(old_entry) != NULL && + VME_OBJECT(old_entry)->pager != NULL && + is_device_pager_ops(VME_OBJECT(old_entry)->pager->mo_pager_ops))) { + break; + } + /* FALLTHROUGH */ case VM_INHERIT_SHARE: vm_map_fork_share(old_map, old_entry, new_map); @@ -10517,7 +10772,15 @@ vm_map_fork( break; slow_vm_map_fork_copy: - if (vm_map_fork_copy(old_map, &old_entry, new_map)) { + vm_map_copyin_flags = 0; + if (options & VM_MAP_FORK_PRESERVE_PURGEABLE) { + vm_map_copyin_flags |= + VM_MAP_COPYIN_PRESERVE_PURGEABLE; + } + if (vm_map_fork_copy(old_map, + &old_entry, + new_map, + vm_map_copyin_flags)) { new_size += entry_size; } continue; @@ -10544,6 +10807,7 @@ kern_return_t vm_map_exec( vm_map_t new_map, task_t task, + boolean_t is64bit, void *fsroot, cpu_type_t cpu) { @@ -10554,8 +10818,8 @@ vm_map_exec( (void *)VM_KERNEL_ADDRPERM(task), (void *)VM_KERNEL_ADDRPERM(fsroot), cpu)); - (void) vm_commpage_enter(new_map, task); - (void) 
vm_shared_region_enter(new_map, task, fsroot, cpu); + (void) vm_commpage_enter(new_map, task, is64bit); + (void) vm_shared_region_enter(new_map, task, is64bit, fsroot, cpu); SHARED_REGION_TRACE_DEBUG( ("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x): <-\n", (void *)VM_KERNEL_ADDRPERM(current_task()), @@ -10605,13 +10869,13 @@ vm_map_lookup_locked( vm_map_t *real_map) { vm_map_entry_t entry; - register vm_map_t map = *var_map; + vm_map_t map = *var_map; vm_map_t old_map = *var_map; vm_map_t cow_sub_map_parent = VM_MAP_NULL; vm_map_offset_t cow_parent_vaddr = 0; vm_map_offset_t old_start = 0; vm_map_offset_t old_end = 0; - register vm_prot_t prot; + vm_prot_t prot; boolean_t mask_protections; boolean_t force_copy; vm_prot_t original_fault_type; @@ -10676,7 +10940,9 @@ vm_map_lookup_locked( local_vaddr = vaddr; - if ((entry->use_pmap && !(fault_type & VM_PROT_WRITE))) { + if ((entry->use_pmap && + ! ((fault_type & VM_PROT_WRITE) || + force_copy))) { /* if real_map equals map we unlock below */ if ((*real_map != map) && (*real_map != cow_sub_map_parent)) @@ -10684,7 +10950,9 @@ vm_map_lookup_locked( *real_map = VME_SUBMAP(entry); } - if(entry->needs_copy && (fault_type & VM_PROT_WRITE)) { + if(entry->needs_copy && + ((fault_type & VM_PROT_WRITE) || + force_copy)) { if (!mapped_needs_copy) { if (vm_map_lock_read_to_write(map)) { vm_map_lock_read(map); @@ -10767,7 +11035,9 @@ vm_map_lookup_locked( goto submap_recurse; } - if(((fault_type & VM_PROT_WRITE) && cow_sub_map_parent)) { + if (((fault_type & VM_PROT_WRITE) || + force_copy) + && cow_sub_map_parent) { vm_object_t sub_object, copy_object; vm_object_offset_t copy_offset; @@ -10825,8 +11095,11 @@ vm_map_lookup_locked( /* set up shadow object */ copy_object = sub_object; - vm_object_reference(copy_object); + vm_object_lock(sub_object); + vm_object_reference_locked(sub_object); sub_object->shadowed = TRUE; + vm_object_unlock(sub_object); + assert(submap_entry->wired_count == 0); submap_entry->needs_copy = TRUE; @@ 
-10973,7 +11246,8 @@ vm_map_lookup_locked( goto protection_failure; } } - if ((fault_type & (prot)) != fault_type) { + if (((fault_type & prot) != fault_type) + ) { protection_failure: if (*real_map != map) { vm_map_unlock(*real_map); @@ -11021,15 +11295,20 @@ vm_map_lookup_locked( vm_map_lock_read(map); goto RetryLookup; } + + if (VME_OBJECT(entry)->shadowed == FALSE) { + vm_object_lock(VME_OBJECT(entry)); + VME_OBJECT(entry)->shadowed = TRUE; + vm_object_unlock(VME_OBJECT(entry)); + } VME_OBJECT_SHADOW(entry, (vm_map_size_t) (entry->vme_end - entry->vme_start)); - - VME_OBJECT(entry)->shadowed = TRUE; entry->needs_copy = FALSE; + vm_map_lock_write_to_read(map); } - else { + if ((fault_type & VM_PROT_WRITE) == 0 && *wired == 0) { /* * We're attempting to read a copy-on-write * page -- don't allow writes. @@ -11121,8 +11400,8 @@ vm_map_lookup_locked( */ boolean_t vm_map_verify( - register vm_map_t map, - register vm_map_version_t *version) /* REF */ + vm_map_t map, + vm_map_version_t *version) /* REF */ { boolean_t result; @@ -11154,6 +11433,10 @@ vm_map_verify( * */ +#if DEVELOPMENT || DEBUG +int vm_region_footprint = 0; +#endif /* DEVELOPMENT || DEBUG */ + kern_return_t vm_map_region_recurse_64( vm_map_t map, @@ -11416,6 +11699,57 @@ vm_map_region_recurse_64( if (curr_entry == NULL) { /* no VM region contains the address... */ +#if DEVELOPMENT || DEBUG + if (vm_region_footprint && /* we want footprint numbers */ + look_for_pages && /* & we want page counts */ + next_entry == NULL && /* & there are no more regions */ + /* & we haven't already provided our fake region: */ + user_address == vm_map_last_entry(map)->vme_end) { + ledger_amount_t nonvol, nonvol_compressed; + /* + * Add a fake memory region to account for + * purgeable memory that counts towards this + * task's memory footprint, i.e. the resident + * compressed pages of non-volatile objects + * owned by that task. 
+ */ + ledger_get_balance( + map->pmap->ledger, + task_ledgers.purgeable_nonvolatile, + &nonvol); + ledger_get_balance( + map->pmap->ledger, + task_ledgers.purgeable_nonvolatile_compressed, + &nonvol_compressed); + if (nonvol + nonvol_compressed == 0) { + /* no purgeable memory usage to report */ + return KERN_FAILURE; + } + /* fake region to show nonvolatile footprint */ + submap_info->protection = VM_PROT_DEFAULT; + submap_info->max_protection = VM_PROT_DEFAULT; + submap_info->inheritance = VM_INHERIT_DEFAULT; + submap_info->offset = 0; + submap_info->user_tag = 0; + submap_info->pages_resident = (unsigned int) (nonvol / PAGE_SIZE); + submap_info->pages_shared_now_private = 0; + submap_info->pages_swapped_out = (unsigned int) (nonvol_compressed / PAGE_SIZE); + submap_info->pages_dirtied = submap_info->pages_resident; + submap_info->ref_count = 1; + submap_info->shadow_depth = 0; + submap_info->external_pager = 0; + submap_info->share_mode = SM_PRIVATE; + submap_info->is_submap = 0; + submap_info->behavior = VM_BEHAVIOR_DEFAULT; + submap_info->object_id = 0x11111111; + submap_info->user_wired_count = 0; + submap_info->pages_reusable = 0; + *nesting_depth = 0; + *size = (vm_map_size_t) (nonvol + nonvol_compressed); + *address = user_address; + return KERN_SUCCESS; + } +#endif /* DEVELOPMENT || DEBUG */ if (next_entry == NULL) { /* ... 
and no VM region follows it either */ return KERN_INVALID_ADDRESS; @@ -11878,10 +12212,10 @@ vm_map_region_walk( boolean_t look_for_pages, mach_msg_type_number_t count) { - register struct vm_object *obj, *tmp_obj; - register vm_map_offset_t last_offset; - register int i; - register int ref_count; + struct vm_object *obj, *tmp_obj; + vm_map_offset_t last_offset; + int i; + int ref_count; struct vm_object *shadow_object; int shadow_depth; @@ -11904,98 +12238,138 @@ vm_map_region_walk( return; } - { - obj = VME_OBJECT(entry); + obj = VME_OBJECT(entry); - vm_object_lock(obj); + vm_object_lock(obj); - if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) - ref_count--; + if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) + ref_count--; - if (look_for_pages) { - for (last_offset = offset + range; - offset < last_offset; - offset += PAGE_SIZE_64, va += PAGE_SIZE) { - vm_map_region_look_for_page(map, va, obj, - offset, ref_count, - 0, extended, count); - } - } else { - shadow_object = obj->shadow; - shadow_depth = 0; - - if ( !(obj->pager_trusted) && !(obj->internal)) - extended->external_pager = 1; - - if (shadow_object != VM_OBJECT_NULL) { - vm_object_lock(shadow_object); - for (; - shadow_object != VM_OBJECT_NULL; - shadow_depth++) { - vm_object_t next_shadow; - - if ( !(shadow_object->pager_trusted) && - !(shadow_object->internal)) - extended->external_pager = 1; - - next_shadow = shadow_object->shadow; - if (next_shadow) { - vm_object_lock(next_shadow); + if (look_for_pages) { + for (last_offset = offset + range; + offset < last_offset; + offset += PAGE_SIZE_64, va += PAGE_SIZE) { +#if DEVELOPMENT || DEBUG + if (vm_region_footprint) { + if (obj->purgable != VM_PURGABLE_DENY) { + /* alternate accounting */ + } else if (entry->iokit_acct) { + /* alternate accounting */ + extended->pages_resident++; + extended->pages_dirtied++; + } else { + int disp; + + disp = 0; + pmap_query_page_info(map->pmap, va, &disp); + if (disp & 
PMAP_QUERY_PAGE_PRESENT) { + extended->pages_resident++; + if (disp & PMAP_QUERY_PAGE_REUSABLE) { + extended->pages_reusable++; + } else if (!(disp & PMAP_QUERY_PAGE_INTERNAL) || + (disp & PMAP_QUERY_PAGE_ALTACCT)) { + /* alternate accounting */ + } else { + extended->pages_dirtied++; + } + } else if (disp & PMAP_QUERY_PAGE_COMPRESSED) { + if (disp & PMAP_QUERY_PAGE_COMPRESSED_ALTACCT) { + /* alternate accounting */ + } else { + extended->pages_swapped_out++; + } } - vm_object_unlock(shadow_object); - shadow_object = next_shadow; } + continue; } - extended->shadow_depth = shadow_depth; +#endif /* DEVELOPMENT || DEBUG */ + vm_map_region_look_for_page(map, va, obj, + offset, ref_count, + 0, extended, count); } +#if DEVELOPMENT || DEBUG + if (vm_region_footprint) { + goto collect_object_info; + } +#endif /* DEVELOPMENT || DEBUG */ + } else { +#if DEVELOPMENT || DEBUG + collect_object_info: +#endif /* DEVELOPMENT || DEBUG */ + shadow_object = obj->shadow; + shadow_depth = 0; - if (extended->shadow_depth || entry->needs_copy) - extended->share_mode = SM_COW; - else { - if (ref_count == 1) - extended->share_mode = SM_PRIVATE; - else { - if (obj->true_share) - extended->share_mode = SM_TRUESHARED; - else - extended->share_mode = SM_SHARED; + if ( !(obj->pager_trusted) && !(obj->internal)) + extended->external_pager = 1; + + if (shadow_object != VM_OBJECT_NULL) { + vm_object_lock(shadow_object); + for (; + shadow_object != VM_OBJECT_NULL; + shadow_depth++) { + vm_object_t next_shadow; + + if ( !(shadow_object->pager_trusted) && + !(shadow_object->internal)) + extended->external_pager = 1; + + next_shadow = shadow_object->shadow; + if (next_shadow) { + vm_object_lock(next_shadow); + } + vm_object_unlock(shadow_object); + shadow_object = next_shadow; } } - extended->ref_count = ref_count - extended->shadow_depth; - - for (i = 0; i < extended->shadow_depth; i++) { - if ((tmp_obj = obj->shadow) == 0) - break; - vm_object_lock(tmp_obj); - vm_object_unlock(obj); - - if 
((ref_count = tmp_obj->ref_count) > 1 && tmp_obj->paging_in_progress) - ref_count--; + extended->shadow_depth = shadow_depth; + } - extended->ref_count += ref_count; - obj = tmp_obj; + if (extended->shadow_depth || entry->needs_copy) + extended->share_mode = SM_COW; + else { + if (ref_count == 1) + extended->share_mode = SM_PRIVATE; + else { + if (obj->true_share) + extended->share_mode = SM_TRUESHARED; + else + extended->share_mode = SM_SHARED; } + } + extended->ref_count = ref_count - extended->shadow_depth; + + for (i = 0; i < extended->shadow_depth; i++) { + if ((tmp_obj = obj->shadow) == 0) + break; + vm_object_lock(tmp_obj); vm_object_unlock(obj); - if (extended->share_mode == SM_SHARED) { - register vm_map_entry_t cur; - register vm_map_entry_t last; - int my_refs; + if ((ref_count = tmp_obj->ref_count) > 1 && tmp_obj->paging_in_progress) + ref_count--; + + extended->ref_count += ref_count; + obj = tmp_obj; + } + vm_object_unlock(obj); - obj = VME_OBJECT(entry); - last = vm_map_to_entry(map); - my_refs = 0; + if (extended->share_mode == SM_SHARED) { + vm_map_entry_t cur; + vm_map_entry_t last; + int my_refs; - if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) - ref_count--; - for (cur = vm_map_first_entry(map); cur != last; cur = cur->vme_next) - my_refs += vm_map_region_count_obj_refs(cur, obj); + obj = VME_OBJECT(entry); + last = vm_map_to_entry(map); + my_refs = 0; - if (my_refs == ref_count) - extended->share_mode = SM_PRIVATE_ALIASED; - else if (my_refs > 1) - extended->share_mode = SM_SHARED_ALIASED; - } + if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) + ref_count--; + for (cur = vm_map_first_entry(map); cur != last; cur = cur->vme_next) + my_refs += vm_map_region_count_obj_refs(cur, obj); + + if (my_refs == ref_count) + extended->share_mode = SM_PRIVATE_ALIASED; + else if (my_refs > 1) + extended->share_mode = SM_SHARED_ALIASED; } } @@ -12014,11 +12388,11 @@ vm_map_region_look_for_page( vm_region_extended_info_t 
extended, mach_msg_type_number_t count) { - register vm_page_t p; - register vm_object_t shadow; - register int ref_count; - vm_object_t caller_object; - kern_return_t kr; + vm_page_t p; + vm_object_t shadow; + int ref_count; + vm_object_t caller_object; + shadow = object->shadow; caller_object = object; @@ -12033,10 +12407,10 @@ vm_map_region_look_for_page( extended->pages_shared_now_private++; if (!p->fictitious && - (p->dirty || pmap_is_modified(p->phys_page))) + (p->dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(p)))) extended->pages_dirtied++; else if (count >= VM_REGION_EXTENDED_INFO_COUNT) { - if (p->reusable || p->object->all_reusable) { + if (p->reusable || object->all_reusable) { extended->pages_reusable++; } } @@ -12048,58 +12422,18 @@ vm_map_region_look_for_page( return; } -#if MACH_PAGEMAP - if (object->existence_map) { - if (vm_external_state_get(object->existence_map, offset) == VM_EXTERNAL_STATE_EXISTS) { - - extended->pages_swapped_out++; - - if(object != caller_object) - vm_object_unlock(object); - - return; - } - } else -#endif /* MACH_PAGEMAP */ if (object->internal && object->alive && !object->terminating && object->pager_ready) { - if (COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) { - if (VM_COMPRESSOR_PAGER_STATE_GET(object, - offset) - == VM_EXTERNAL_STATE_EXISTS) { - /* the pager has that page */ - extended->pages_swapped_out++; - if (object != caller_object) - vm_object_unlock(object); - return; - } - } else { - memory_object_t pager; - - vm_object_paging_begin(object); - pager = object->pager; - vm_object_unlock(object); - - kr = memory_object_data_request( - pager, - offset + object->paging_offset, - 0, /* just poke the pager */ - VM_PROT_READ, - NULL); - - vm_object_lock(object); - vm_object_paging_end(object); - - if (kr == KERN_SUCCESS) { - /* the pager has that page */ - extended->pages_swapped_out++; - if (object != caller_object) - vm_object_unlock(object); - return; - } + if 
(VM_COMPRESSOR_PAGER_STATE_GET(object, offset) + == VM_EXTERNAL_STATE_EXISTS) { + /* the pager has that page */ + extended->pages_swapped_out++; + if (object != caller_object) + vm_object_unlock(object); + return; } } @@ -12134,9 +12468,9 @@ vm_map_region_count_obj_refs( vm_map_entry_t entry, vm_object_t object) { - register int ref_count; - register vm_object_t chk_obj; - register vm_object_t tmp_obj; + int ref_count; + vm_object_t chk_obj; + vm_object_t tmp_obj; if (VME_OBJECT(entry) == 0) return(0); @@ -12221,6 +12555,7 @@ vm_map_simplify_entry( (prev_entry->wired_count == this_entry->wired_count) && (prev_entry->user_wired_count == this_entry->user_wired_count) && + ((prev_entry->vme_atomic == FALSE) && (this_entry->vme_atomic == FALSE)) && (prev_entry->in_transition == FALSE) && (this_entry->in_transition == FALSE) && (prev_entry->needs_wakeup == FALSE) && @@ -12404,7 +12739,7 @@ vm_map_machine_attribute( if (m && !m->fictitious) { ret = pmap_attribute_cache_sync( - m->phys_page, + VM_PAGE_GET_PHYS_PAGE(m), PAGE_SIZE, attribute, value); @@ -12458,7 +12793,7 @@ vm_map_behavior_set( vm_map_offset_t end, vm_behavior_t new_behavior) { - register vm_map_entry_t entry; + vm_map_entry_t entry; vm_map_entry_t temp_entry; XPR(XPR_VM_MAP, @@ -12956,6 +13291,14 @@ vm_map_reusable_pages( return KERN_INVALID_ADDRESS; } + if (! (entry->protection & VM_PROT_WRITE) && !entry->used_for_jit) { + /* not writable: can't discard contents */ + vm_map_unlock_read(map); + vm_page_stats_reusable.reusable_nonwritable++; + vm_page_stats_reusable.reusable_pages_failure++; + return KERN_PROTECTION_FAILURE; + } + /* * The first time through, the start address could be anywhere * within the vm_map_entry we found. 
So adjust the offset to @@ -12979,20 +13322,27 @@ vm_map_reusable_pages( vm_object_lock(object); - if (object->ref_count == 1 && - !object->shadow && + if (((object->ref_count == 1) || + (object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC && + object->copy == VM_OBJECT_NULL)) && + object->shadow == VM_OBJECT_NULL && /* * "iokit_acct" entries are billed for their virtual size * (rather than for their resident pages only), so they * wouldn't benefit from making pages reusable, and it * would be hard to keep track of pages that are both - * "iokit_acct" and "reusable" in the pmap stats and ledgers. + * "iokit_acct" and "reusable" in the pmap stats and + * ledgers. */ !(entry->iokit_acct || - (!entry->is_sub_map && !entry->use_pmap))) + (!entry->is_sub_map && !entry->use_pmap))) { + if (object->ref_count != 1) { + vm_page_stats_reusable.reusable_shared++; + } kill_pages = 1; - else + } else { kill_pages = -1; + } if (kill_pages != -1) { vm_object_deactivate_pages(object, start_offset, @@ -13260,6 +13610,7 @@ vm_map_entry_insert( new_entry->iokit_acct = FALSE; new_entry->vme_resilient_codesign = FALSE; new_entry->vme_resilient_media = FALSE; + new_entry->vme_atomic = FALSE; /* * Insert the new entry into the list. @@ -13292,7 +13643,8 @@ vm_map_remap_extract( vm_prot_t *max_protection, /* What, no behavior? */ vm_inherit_t inheritance, - boolean_t pageable) + boolean_t pageable, + boolean_t same_map) { kern_return_t result; vm_map_size_t mapped_size; @@ -13481,7 +13833,7 @@ vm_map_remap_extract( * Cannot allow an entry describing a JIT * region to be shared across address spaces. 
*/ - if (src_entry->used_for_jit == TRUE) { + if (src_entry->used_for_jit == TRUE && !same_map) { result = KERN_INVALID_ARGUMENT; break; } @@ -13589,6 +13941,7 @@ vm_map_remap_extract( if (result != KERN_SUCCESS && result != KERN_MEMORY_RESTART_COPY) { _vm_map_entry_dispose(map_header, new_entry); + vm_map_lock(map); break; } @@ -13731,7 +14084,8 @@ vm_map_remap( cur_protection, max_protection, inheritance, - target_map->hdr.entries_pageable); + target_map->hdr.entries_pageable, + src_map == target_map); if (result != KERN_SUCCESS) { return result; @@ -13781,6 +14135,7 @@ vm_map_remap( } if( target_map->disable_vmentry_reuse == TRUE) { + assert(!target_map->is_nested_map); if( target_map->highest_entry_end < insp_entry->vme_end ){ target_map->highest_entry_end = insp_entry->vme_end; } @@ -13840,6 +14195,18 @@ StartAgain: ; if (flags & VM_FLAGS_ANYWHERE) { + if (flags & VM_FLAGS_RANDOM_ADDR) + { + /* + * Get a random start address. + */ + kr = vm_map_random_address_for_size(map, address, size); + if (kr != KERN_SUCCESS) { + return(kr); + } + start = *address; + } + /* * Calculate the first possible address. */ @@ -13915,7 +14282,7 @@ StartAgain: ; */ while (TRUE) { - register vm_map_entry_t next; + vm_map_entry_t next; /* * Find the end of the proposed new region. 
@@ -14390,7 +14757,6 @@ vm_map_page_info( vm_map_entry_t map_entry; vm_object_t object; vm_page_t m; - kern_return_t kr; kern_return_t retval = KERN_SUCCESS; boolean_t top_object; int disposition; @@ -14478,59 +14844,16 @@ vm_map_page_info( disposition |= VM_PAGE_QUERY_PAGE_PRESENT; break; } else { -#if MACH_PAGEMAP - if (object->existence_map) { - if (vm_external_state_get(object->existence_map, - offset) == - VM_EXTERNAL_STATE_EXISTS) { - /* - * this page has been paged out - */ - disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT; - break; - } - } else -#endif if (object->internal && object->alive && !object->terminating && object->pager_ready) { - if (COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) { - if (VM_COMPRESSOR_PAGER_STATE_GET( - object, - offset) - == VM_EXTERNAL_STATE_EXISTS) { - /* the pager has that page */ - disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT; - break; - } - } else { - memory_object_t pager; - - vm_object_paging_begin(object); - pager = object->pager; - vm_object_unlock(object); - - /* - * Ask the default pager if - * it has this page. 
- */ - kr = memory_object_data_request( - pager, - offset + object->paging_offset, - 0, /* just poke the pager */ - VM_PROT_READ, - NULL); - - vm_object_lock(object); - vm_object_paging_end(object); - - if (kr == KERN_SUCCESS) { - /* the default pager has it */ - disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT; - break; - } + if (VM_COMPRESSOR_PAGER_STATE_GET(object, offset) + == VM_EXTERNAL_STATE_EXISTS) { + /* the pager has that page */ + disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT; + break; } } @@ -14578,13 +14901,13 @@ vm_map_page_info( disposition |= VM_PAGE_QUERY_PAGE_FICTITIOUS; goto done_with_object; } - if (m->dirty || pmap_is_modified(m->phys_page)) + if (m->dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(m))) disposition |= VM_PAGE_QUERY_PAGE_DIRTY; - if (m->reference || pmap_is_referenced(m->phys_page)) + if (m->reference || pmap_is_referenced(VM_PAGE_GET_PHYS_PAGE(m))) disposition |= VM_PAGE_QUERY_PAGE_REF; - if (m->speculative) + if (m->vm_page_q_state == VM_PAGE_ON_SPECULATIVE_Q) disposition |= VM_PAGE_QUERY_PAGE_SPECULATIVE; if (m->cs_validated) @@ -14709,6 +15032,18 @@ vm_map_msync( */ had_hole = TRUE; + if (sync_flags & VM_SYNC_KILLPAGES) { + /* + * For VM_SYNC_KILLPAGES, there should be + * no holes in the range, since we couldn't + * prevent someone else from allocating in + * that hole and we wouldn't want to "kill" + * their pages. + */ + vm_map_unlock(map); + break; + } + /* * Check for empty map. 
*/ @@ -14790,10 +15125,18 @@ vm_map_msync( boolean_t reusable_pages = FALSE; if (sync_flags & VM_SYNC_KILLPAGES) { - if (object->ref_count == 1 && !object->shadow) + if (((object->ref_count == 1) || + ((object->copy_strategy != + MEMORY_OBJECT_COPY_SYMMETRIC) && + (object->copy == VM_OBJECT_NULL))) && + (object->shadow == VM_OBJECT_NULL)) { + if (object->ref_count != 1) { + vm_page_stats_reusable.free_shared++; + } kill_pages = 1; - else + } else { kill_pages = -1; + } } if (kill_pages != -1) vm_object_deactivate_pages( @@ -15077,7 +15420,7 @@ current_map(void) #undef vm_map_reference void vm_map_reference( - register vm_map_t map) + vm_map_t map) { if (map == VM_MAP_NULL) return; @@ -15101,7 +15444,7 @@ vm_map_reference( */ void vm_map_deallocate( - register vm_map_t map) + vm_map_t map) { unsigned int ref; @@ -15411,7 +15754,7 @@ kern_return_t vm_map_sign(vm_map_t map, * we'll disconnect the page so we note any future modification * attempts. */ m->wpmapped = FALSE; - refmod = pmap_disconnect(m->phys_page); + refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)); /* Pull the dirty status from the pmap, since we cleared the * wpmapped bit */ @@ -15489,61 +15832,66 @@ kern_return_t vm_map_partial_reap(vm_map_t map, unsigned int *reclaimed_resident return kr; } -#if CONFIG_FREEZE -kern_return_t vm_map_freeze_walk( - vm_map_t map, - unsigned int *purgeable_count, - unsigned int *wired_count, - unsigned int *clean_count, - unsigned int *dirty_count, - unsigned int dirty_budget, - boolean_t *has_shared) +#if DEVELOPMENT || DEBUG + +int +vm_map_disconnect_page_mappings( + vm_map_t map, + boolean_t do_unnest) { vm_map_entry_t entry; - + int page_count = 0; + + if (do_unnest == TRUE) { +#ifndef NO_NESTED_PMAP + vm_map_lock(map); + + for (entry = vm_map_first_entry(map); + entry != vm_map_to_entry(map); + entry = entry->vme_next) { + + if (entry->is_sub_map && entry->use_pmap) { + /* + * Make sure the range between the start of this entry and + * the end of this entry is no 
longer nested, so that + * we will only remove mappings from the pmap in use by this + * this task + */ + vm_map_clip_unnest(map, entry, entry->vme_start, entry->vme_end); + } + } + vm_map_unlock(map); +#endif + } vm_map_lock_read(map); - - *purgeable_count = *wired_count = *clean_count = *dirty_count = 0; - *has_shared = FALSE; - + + page_count = map->pmap->stats.resident_count; + for (entry = vm_map_first_entry(map); entry != vm_map_to_entry(map); entry = entry->vme_next) { - unsigned int purgeable, clean, dirty, wired; - boolean_t shared; - if ((VME_OBJECT(entry) == 0) || - (entry->is_sub_map) || - (VME_OBJECT(entry)->phys_contiguous)) { + if (!entry->is_sub_map && ((VME_OBJECT(entry) == 0) || + (VME_OBJECT(entry)->phys_contiguous))) { continue; } + if (entry->is_sub_map) + assert(!entry->use_pmap); - default_freezer_pack(&purgeable, &wired, &clean, &dirty, dirty_budget, &shared, VME_OBJECT(entry), NULL); - - *purgeable_count += purgeable; - *wired_count += wired; - *clean_count += clean; - *dirty_count += dirty; - - if (shared) { - *has_shared = TRUE; - } - - /* Adjust pageout budget and finish up if reached */ - if (dirty_budget) { - dirty_budget -= dirty; - if (dirty_budget == 0) { - break; - } - } + pmap_remove_options(map->pmap, entry->vme_start, entry->vme_end, 0); } - vm_map_unlock_read(map); - return KERN_SUCCESS; + return page_count; } +#endif + + +#if CONFIG_FREEZE + + int c_freezer_swapout_count; int c_freezer_compression_count = 0; AbsoluteTime c_freezer_last_yield_ts = 0; @@ -15554,12 +15902,11 @@ kern_return_t vm_map_freeze( unsigned int *wired_count, unsigned int *clean_count, unsigned int *dirty_count, - unsigned int dirty_budget, + __unused unsigned int dirty_budget, boolean_t *has_shared) { vm_map_entry_t entry2 = VM_MAP_ENTRY_NULL; kern_return_t kr = KERN_SUCCESS; - boolean_t default_freezer_active = TRUE; *purgeable_count = *wired_count = *clean_count = *dirty_count = 0; *has_shared = FALSE; @@ -15571,30 +15918,13 @@ kern_return_t 
vm_map_freeze( */ vm_map_lock(map); - if (COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) { - default_freezer_active = FALSE; - - if (vm_compressor_low_on_space() || vm_swap_low_on_space()) { - kr = KERN_NO_SPACE; - goto done; - } - } - assert(default_freezer_active == FALSE); - - if (default_freezer_active) { - if (map->default_freezer_handle == NULL) { - map->default_freezer_handle = default_freezer_handle_allocate(); - } - - if ((kr = default_freezer_handle_init(map->default_freezer_handle)) != KERN_SUCCESS) { - /* - * Can happen if default_freezer_handle passed in is NULL - * Or, a table has already been allocated and associated - * with this handle, i.e. the map is already frozen. - */ - goto done; - } + assert(VM_CONFIG_COMPRESSOR_IS_PRESENT); + + if (vm_compressor_low_on_space() || vm_swap_low_on_space()) { + kr = KERN_NO_SPACE; + goto done; } + c_freezer_compression_count = 0; clock_get_uptime(&c_freezer_last_yield_ts); @@ -15604,48 +15934,24 @@ kern_return_t vm_map_freeze( vm_object_t src_object = VME_OBJECT(entry2); - if (VME_OBJECT(entry2) && + if (src_object && !entry2->is_sub_map && - !VME_OBJECT(entry2)->phys_contiguous) { + !src_object->phys_contiguous) { /* If eligible, scan the entry, moving eligible pages over to our parent object */ - if (default_freezer_active) { - unsigned int purgeable, clean, dirty, wired; - boolean_t shared; - - default_freezer_pack(&purgeable, &wired, &clean, &dirty, dirty_budget, &shared, - src_object, map->default_freezer_handle); - - *purgeable_count += purgeable; - *wired_count += wired; - *clean_count += clean; - *dirty_count += dirty; - - /* Adjust pageout budget and finish up if reached */ - if (dirty_budget) { - dirty_budget -= dirty; - if (dirty_budget == 0) { - break; - } - } - if (shared) { - *has_shared = TRUE; - } - } else { - if (VME_OBJECT(entry2)->internal == TRUE) { + if (src_object->internal == TRUE) { - if (DEFAULT_FREEZER_COMPRESSED_PAGER_IS_SWAPBACKED) { - /* - * Pages 
belonging to this object could be swapped to disk. - * Make sure it's not a shared object because we could end - * up just bringing it back in again. - */ - if (VME_OBJECT(entry2)->ref_count > 1) { - continue; - } + if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) { + /* + * Pages belonging to this object could be swapped to disk. + * Make sure it's not a shared object because we could end + * up just bringing it back in again. + */ + if (src_object->ref_count > 1) { + continue; } - vm_object_compressed_freezer_pageout(VME_OBJECT(entry2)); } + vm_object_compressed_freezer_pageout(src_object); if (vm_compressor_low_on_space() || vm_swap_low_on_space()) { kr = KERN_NO_SPACE; @@ -15654,19 +15960,12 @@ kern_return_t vm_map_freeze( } } } - - if (default_freezer_active) { - /* Finally, throw out the pages to swap */ - default_freezer_pageout(map->default_freezer_handle); - } - done: vm_map_unlock(map); - if (!default_freezer_active) { - vm_object_compressed_freezer_done(); - } - if (DEFAULT_FREEZER_COMPRESSED_PAGER_IS_SWAPBACKED) { + vm_object_compressed_freezer_done(); + + if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) { /* * reset the counter tracking the # of swapped c_segs * because we are now done with this freeze session and task. @@ -15676,35 +15975,6 @@ kern_return_t vm_map_freeze( return kr; } -kern_return_t -vm_map_thaw( - vm_map_t map) -{ - kern_return_t kr = KERN_SUCCESS; - - if (COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) { - /* - * We will on-demand thaw in the presence of the compressed pager. - */ - return kr; - } - - vm_map_lock(map); - - if (map->default_freezer_handle == NULL) { - /* - * This map is not in a frozen state. 
- */ - kr = KERN_FAILURE; - goto out; - } - - kr = default_freezer_unpack(map->default_freezer_handle); -out: - vm_map_unlock(map); - - return kr; -} #endif /* @@ -15858,67 +16128,6 @@ vm_map_set_page_shift( return KERN_SUCCESS; } -int -vm_map_purge( - vm_map_t map) -{ - int num_object_purged; - vm_map_entry_t entry; - vm_map_offset_t next_address; - vm_object_t object; - int state; - kern_return_t kr; - - num_object_purged = 0; - - vm_map_lock_read(map); - entry = vm_map_first_entry(map); - while (entry != vm_map_to_entry(map)) { - if (entry->is_sub_map) { - goto next; - } - if (! (entry->protection & VM_PROT_WRITE)) { - goto next; - } - object = VME_OBJECT(entry); - if (object == VM_OBJECT_NULL) { - goto next; - } - if (object->purgable != VM_PURGABLE_VOLATILE) { - goto next; - } - - vm_object_lock(object); -#if 00 - if (VME_OFFSET(entry) != 0 || - (entry->vme_end - entry->vme_start) != object->vo_size) { - vm_object_unlock(object); - goto next; - } -#endif - next_address = entry->vme_end; - vm_map_unlock_read(map); - state = VM_PURGABLE_EMPTY; - kr = vm_object_purgable_control(object, - VM_PURGABLE_SET_STATE, - &state); - if (kr == KERN_SUCCESS) { - num_object_purged++; - } - vm_object_unlock(object); - - vm_map_lock_read(map); - if (vm_map_lookup_entry(map, next_address, &entry)) { - continue; - } - next: - entry = entry->vme_next; - } - vm_map_unlock_read(map); - - return num_object_purged; -} - kern_return_t vm_map_query_volatile( vm_map_t map, @@ -16020,6 +16229,11 @@ vm_map_sizes(vm_map_t map, vm_map_size_t free, total_free, largest_free; boolean_t end; + if (!map) + { + *psize = *pfree = *plargest_free = 0; + return; + } total_free = largest_free = 0; vm_map_lock_read(map); @@ -16089,3 +16303,7 @@ int vm_map_shadow_max( return shadows_max; } #endif /* VM_SCAN_FOR_SHADOW_CHAIN */ + +void vm_commit_pagezero_status(vm_map_t lmap) { + pmap_advise_pagezero_range(lmap->pmap, lmap->min_offset); +} diff --git a/osfmk/vm/vm_map.h b/osfmk/vm/vm_map.h index 
44c987925..9dc270a00 100644 --- a/osfmk/vm/vm_map.h +++ b/osfmk/vm/vm_map.h @@ -93,6 +93,7 @@ extern vm_map_t current_map(void); extern kern_return_t vm_map_exec( vm_map_t new_map, task_t task, + boolean_t is64bit, void *fsroot, cpu_type_t cpu); @@ -301,7 +302,8 @@ struct vm_map_entry { /* boolean_t */ iokit_acct:1, /* boolean_t */ vme_resilient_codesign:1, /* boolean_t */ vme_resilient_media:1, - __unused:6; + /* boolean_t */ vme_atomic:1, /* entry cannot be split/coalesced */ + __unused:5; ; unsigned short wired_count; /* can be paged if = 0 */ @@ -349,7 +351,6 @@ struct vm_map_header { int nentries; /* Number of entries */ boolean_t entries_pageable; /* are map entries pageable? */ - vm_map_offset_t highest_entry_end_addr; /* The ending address of the highest allocated vm_entry_t */ #ifdef VM_MAP_STORE_USE_RB struct rb_head rb_head_store; #endif @@ -380,11 +381,28 @@ struct _vm_map { struct vm_map_header hdr; /* Map entry header */ #define min_offset hdr.links.start /* start of range */ #define max_offset hdr.links.end /* end of range */ -#define highest_entry_end hdr.highest_entry_end_addr pmap_t pmap; /* Physical map */ vm_map_size_t size; /* virtual size */ vm_map_size_t user_wire_limit;/* rlimit on user locked memory */ vm_map_size_t user_wire_size; /* current size of user locked memory in this map */ + + union { + /* + * If map->disable_vmentry_reuse == TRUE: + * the end address of the highest allocated vm_map_entry_t. + */ + vm_map_offset_t vmu1_highest_entry_end; + /* + * For a nested VM map: + * the lowest address in this nested VM map that we would + * expect to be unnested under normal operation (i.e. for + * regular copy-on-write on DATA section). 
+ */ + vm_map_offset_t vmu1_lowest_unnestable_start; + } vmu1; +#define highest_entry_end vmu1.vmu1_highest_entry_end +#define lowest_unnestable_start vmu1.vmu1_lowest_unnestable_start + int ref_count; /* Reference count */ #if TASK_SWAPPER int res_count; /* Residence count (swap) */ @@ -411,12 +429,11 @@ struct _vm_map { /* boolean_t */ disable_vmentry_reuse:1, /* All vm entries should keep using newer and higher addresses in the map */ /* boolean_t */ map_disallow_data_exec:1, /* Disallow execution from data pages on exec-permissive architectures */ /* boolean_t */ holelistenabled:1, - /* reserved */ pad:24; + /* boolean_t */ is_nested_map:1, + /* reserved */ pad:23; unsigned int timestamp; /* Version number */ unsigned int color_rr; /* next color (not protected by a lock) */ -#if CONFIG_FREEZE - void *default_freezer_handle; -#endif + boolean_t jit_entry_exists; } ; @@ -788,7 +805,7 @@ extern vm_object_t vm_submap_object; #define vm_map_dealloc_fast(map) \ MACRO_BEGIN \ - register int c; \ + int c; \ \ lck_mtx_lock(&map->s_lock); \ c = --map->ref_count; \ @@ -878,7 +895,10 @@ extern kern_return_t vm_map_read_user( /* Create a new task map using an existing task map as a template. 
*/ extern vm_map_t vm_map_fork( ledger_t ledger, - vm_map_t old_map); + vm_map_t old_map, + int options); +#define VM_MAP_FORK_SHARE_IF_INHERIT_NONE 0x00000001 +#define VM_MAP_FORK_PRESERVE_PURGEABLE 0x00000002 /* Change inheritance */ extern kern_return_t vm_map_inherit( @@ -981,16 +1001,14 @@ extern kern_return_t vm_map_set_cache_attr( extern int override_nx(vm_map_t map, uint32_t user_tag); -extern int vm_map_purge(vm_map_t map); - /* kext exported versions */ extern kern_return_t vm_map_wire_external( - register vm_map_t map, - register vm_map_offset_t start, - register vm_map_offset_t end, - register vm_prot_t caller_prot, + vm_map_t map, + vm_map_offset_t start, + vm_map_offset_t end, + vm_prot_t caller_prot, boolean_t user_wire); extern kern_return_t vm_map_wire_and_extract_external( @@ -1111,6 +1129,13 @@ extern kern_return_t vm_map_remove( vm_map_offset_t end, boolean_t flags); +/* Deallocate a region when the map is already locked */ +extern kern_return_t vm_map_remove_locked( + vm_map_t map, + vm_map_offset_t start, + vm_map_offset_t end, + boolean_t flags); + /* Discard a copy without using it */ extern void vm_map_copy_discard( vm_map_copy_t copy); @@ -1126,7 +1151,7 @@ extern kern_return_t vm_map_copy_overwrite( extern boolean_t vm_map_copy_validate_size( vm_map_t dst_map, vm_map_copy_t copy, - vm_map_size_t size); + vm_map_size_t *size); /* Place a copy into a map */ extern kern_return_t vm_map_copyout( @@ -1134,10 +1159,17 @@ extern kern_return_t vm_map_copyout( vm_map_address_t *dst_addr, /* OUT */ vm_map_copy_t copy); +extern kern_return_t vm_map_copyout_size( + vm_map_t dst_map, + vm_map_address_t *dst_addr, /* OUT */ + vm_map_copy_t copy, + vm_map_size_t copy_size); + extern kern_return_t vm_map_copyout_internal( vm_map_t dst_map, vm_map_address_t *dst_addr, /* OUT */ vm_map_copy_t copy, + vm_map_size_t copy_size, boolean_t consume_on_success, vm_prot_t cur_protection, vm_prot_t max_protection, @@ -1162,7 +1194,8 @@ extern kern_return_t 
vm_map_copyin_common( #define VM_MAP_COPYIN_SRC_DESTROY 0x00000001 #define VM_MAP_COPYIN_USE_MAXPROT 0x00000002 #define VM_MAP_COPYIN_ENTRY_LIST 0x00000004 -#define VM_MAP_COPYIN_ALL_FLAGS 0x00000007 +#define VM_MAP_COPYIN_PRESERVE_PURGEABLE 0x00000008 +#define VM_MAP_COPYIN_ALL_FLAGS 0x0000000F extern kern_return_t vm_map_copyin_internal( vm_map_t src_map, vm_map_address_t src_addr, @@ -1194,6 +1227,7 @@ extern void vm_map_set_32bit( extern boolean_t vm_map_has_hard_pagezero( vm_map_t map, vm_map_offset_t pagezero_size); +extern void vm_commit_pagezero_status(vm_map_t tmap); extern boolean_t vm_map_is_64bit( vm_map_t map); @@ -1348,19 +1382,16 @@ extern kern_return_t vm_map_partial_reap( unsigned int *reclaimed_resident, unsigned int *reclaimed_compressed); -#if CONFIG_FREEZE -void vm_map_freeze_thaw_init(void); -void vm_map_freeze_thaw(void); -void vm_map_demand_fault(void); -extern kern_return_t vm_map_freeze_walk( - vm_map_t map, - unsigned int *purgeable_count, - unsigned int *wired_count, - unsigned int *clean_count, - unsigned int *dirty_count, - unsigned int dirty_budget, - boolean_t *has_shared); +#if DEVELOPMENT || DEBUG + +extern int vm_map_disconnect_page_mappings( + vm_map_t map, + boolean_t); +#endif + + +#if CONFIG_FREEZE extern kern_return_t vm_map_freeze( vm_map_t map, @@ -1370,9 +1401,6 @@ extern kern_return_t vm_map_freeze( unsigned int *dirty_count, unsigned int dirty_budget, boolean_t *has_shared); - -extern kern_return_t vm_map_thaw( - vm_map_t map); #endif __END_DECLS diff --git a/osfmk/vm/vm_map_store.c b/osfmk/vm/vm_map_store.c index 70f0624f0..26b3477a4 100644 --- a/osfmk/vm/vm_map_store.c +++ b/osfmk/vm/vm_map_store.c @@ -26,6 +26,7 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ +#include #include #include #include /* for vm_debug_events */ @@ -60,8 +61,8 @@ vm_map_store_init( struct vm_map_header *hdr ) boolean_t vm_map_store_lookup_entry( - register vm_map_t map, - register vm_map_offset_t address, + vm_map_t map, + vm_map_offset_t 
address, vm_map_entry_t *entry) /* OUT */ { #ifdef VM_MAP_STORE_USE_LL @@ -149,8 +150,8 @@ _vm_map_store_entry_link( struct vm_map_header * mapHdr, vm_map_entry_t after_wh } #endif #if MAP_ENTRY_INSERTION_DEBUG - fastbacktrace(&entry->vme_insertion_bt[0], - (sizeof (entry->vme_insertion_bt) / sizeof (uintptr_t))); + backtrace(&entry->vme_insertion_bt[0], + (sizeof (entry->vme_insertion_bt) / sizeof (uintptr_t))); #endif } diff --git a/osfmk/vm/vm_map_store.h b/osfmk/vm/vm_map_store.h index 8d6687f39..cc8b60df4 100644 --- a/osfmk/vm/vm_map_store.h +++ b/osfmk/vm/vm_map_store.h @@ -59,30 +59,34 @@ struct vm_map_store { #include #include -#define UPDATE_HIGHEST_ENTRY_END(map, highest_entry) \ - MACRO_BEGIN \ - struct _vm_map* UHEE_map; \ - struct vm_map_entry* UHEE_entry; \ - UHEE_map = (map); \ - UHEE_entry = (highest_entry); \ - if( UHEE_map->highest_entry_end < UHEE_entry->vme_end) { \ - UHEE_map->highest_entry_end = UHEE_entry->vme_end; \ +#define UPDATE_HIGHEST_ENTRY_END(map, highest_entry) \ + MACRO_BEGIN \ + struct _vm_map* UHEE_map; \ + struct vm_map_entry* UHEE_entry; \ + UHEE_map = (map); \ + assert(UHEE_map->disable_vmentry_reuse); \ + assert(!UHEE_map->is_nested_map); \ + UHEE_entry = (highest_entry); \ + if( UHEE_map->highest_entry_end < UHEE_entry->vme_end) { \ + UHEE_map->highest_entry_end = UHEE_entry->vme_end; \ } \ MACRO_END -#define VM_MAP_HIGHEST_ENTRY(map, entry, start) \ - MACRO_BEGIN \ - struct _vm_map* VMHE_map; \ - struct vm_map_entry* tmp_entry; \ - vm_map_offset_t VMHE_start; \ - VMHE_map = (map); \ - VMHE_start= VMHE_map->highest_entry_end + PAGE_SIZE_64; \ - while(vm_map_lookup_entry(VMHE_map, VMHE_start, &tmp_entry)){ \ - VMHE_map->highest_entry_end = tmp_entry->vme_end; \ - VMHE_start = VMHE_map->highest_entry_end + PAGE_SIZE_64; \ - } \ - entry = tmp_entry; \ - start = VMHE_start; \ +#define VM_MAP_HIGHEST_ENTRY(map, entry, start) \ + MACRO_BEGIN \ + struct _vm_map* VMHE_map; \ + struct vm_map_entry* tmp_entry; \ + vm_map_offset_t 
VMHE_start; \ + VMHE_map = (map); \ + assert(VMHE_map->disable_vmentry_reuse); \ + assert(!VMHE_map->is_nested_map); \ + VMHE_start= VMHE_map->highest_entry_end + PAGE_SIZE_64; \ + while(vm_map_lookup_entry(VMHE_map, VMHE_start, &tmp_entry)){ \ + VMHE_map->highest_entry_end = tmp_entry->vme_end; \ + VMHE_start = VMHE_map->highest_entry_end + PAGE_SIZE_64; \ + } \ + entry = tmp_entry; \ + start = VMHE_start; \ MACRO_END /* diff --git a/osfmk/vm/vm_map_store_ll.c b/osfmk/vm/vm_map_store_ll.c index 06bd7c971..c7c1afd98 100644 --- a/osfmk/vm/vm_map_store_ll.c +++ b/osfmk/vm/vm_map_store_ll.c @@ -157,12 +157,12 @@ vm_map_store_init_ll( __unused struct vm_map_header *hdr) */ boolean_t vm_map_store_lookup_entry_ll( - register vm_map_t map, - register vm_map_offset_t address, + vm_map_t map, + vm_map_offset_t address, vm_map_entry_t *entry) /* OUT */ { - register vm_map_entry_t cur; - register vm_map_entry_t last; + vm_map_entry_t cur; + vm_map_entry_t last; /* * Start looking either from the head of the diff --git a/osfmk/vm/vm_map_store_rb.c b/osfmk/vm/vm_map_store_rb.c index 5e881f6b7..130e4c8d3 100644 --- a/osfmk/vm/vm_map_store_rb.c +++ b/osfmk/vm/vm_map_store_rb.c @@ -26,6 +26,7 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ +#include #include RB_GENERATE(rb_head, vm_map_store, entry, rb_node_compare); @@ -136,8 +137,8 @@ void vm_map_store_copy_insert_rb( vm_map_t map, __unused vm_map_entry_t after_wh (uintptr_t)(VME_FOR_STORE(rbh->rbh_root))->vme_start, (uintptr_t)(VME_FOR_STORE(rbh->rbh_root))->vme_end); } else { #if MAP_ENTRY_INSERTION_DEBUG - fastbacktrace(&entry->vme_insertion_bt[0], - (sizeof (entry->vme_insertion_bt) / sizeof (uintptr_t))); + backtrace(&entry->vme_insertion_bt[0], + (sizeof (entry->vme_insertion_bt) / sizeof (uintptr_t))); #endif entry = entry->vme_next; inserted++; diff --git a/osfmk/vm/vm_object.c b/osfmk/vm/vm_object.c index 35c9ba57b..882c3c077 100644 --- a/osfmk/vm/vm_object.c +++ b/osfmk/vm/vm_object.c @@ -87,6 +87,7 @@ #include #include 
#include +#include #include #include @@ -110,18 +111,8 @@ struct vm_counters vm_counters; #if VM_OBJECT_TRACKING boolean_t vm_object_tracking_inited = FALSE; -decl_simple_lock_data(static,vm_object_tracking_lock_data); btlog_t *vm_object_tracking_btlog; -static void -vm_object_tracking_lock(void *context) -{ - simple_lock((simple_lock_t)context); -} -static void -vm_object_tracking_unlock(void *context) -{ - simple_unlock((simple_lock_t)context); -} + void vm_object_tracking_init(void) { @@ -132,13 +123,10 @@ vm_object_tracking_init(void) sizeof (vm_object_tracking)); if (vm_object_tracking) { - simple_lock_init(&vm_object_tracking_lock_data, 0); vm_object_tracking_btlog = btlog_create( - 50000, + VM_OBJECT_TRACKING_NUM_RECORDS, VM_OBJECT_TRACKING_BTDEPTH, - vm_object_tracking_lock, - vm_object_tracking_unlock, - &vm_object_tracking_lock_data); + TRUE /* caller_will_remove_entries_for_element? */); assert(vm_object_tracking_btlog); vm_object_tracking_inited = TRUE; } @@ -249,10 +237,10 @@ static zone_t vm_object_zone; /* vm backing store zone */ * All wired-down kernel memory belongs to a single virtual * memory object (kernel_object) to avoid wasting data structures. */ -static struct vm_object kernel_object_store; -vm_object_t kernel_object; +static struct vm_object kernel_object_store __attribute__((aligned(VM_PACKED_POINTER_ALIGNMENT))); +vm_object_t kernel_object; -static struct vm_object compressor_object_store; +static struct vm_object compressor_object_store __attribute__((aligned(VM_PACKED_POINTER_ALIGNMENT))); vm_object_t compressor_object = &compressor_object_store; /* @@ -261,7 +249,7 @@ vm_object_t compressor_object = &compressor_object_store; * is exported by the vm_map module. The storage is declared * here because it must be initialized here. 
*/ -static struct vm_object vm_submap_object_store; +static struct vm_object vm_submap_object_store __attribute__((aligned(VM_PACKED_POINTER_ALIGNMENT))); /* * Virtual memory objects are initialized from @@ -547,7 +535,7 @@ _vm_object_allocate( object, size, 0,0,0); *object = vm_object_template; - queue_init(&object->memq); + vm_page_queue_init(&object->memq); queue_init(&object->msr_q); #if UPL_DEBUG || CONFIG_IOSCHED queue_init(&object->uplq); @@ -574,7 +562,7 @@ __private_extern__ vm_object_t vm_object_allocate( vm_object_size_t size) { - register vm_object_t object; + vm_object_t object; object = (vm_object_t) zalloc(vm_object_zone); @@ -602,12 +590,15 @@ lck_attr_t compressor_object_lck_attr; __private_extern__ void vm_object_bootstrap(void) { - register int i; + int i; + vm_size_t vm_object_size; + + vm_object_size = (sizeof(struct vm_object) + (VM_PACKED_POINTER_ALIGNMENT-1)) & ~(VM_PACKED_POINTER_ALIGNMENT - 1); - vm_object_zone = zinit((vm_size_t) sizeof(struct vm_object), - round_page(512*1024), - round_page(12*1024), - "vm objects"); + vm_object_zone = zinit(vm_object_size, + round_page(512*1024), + round_page(12*1024), + "vm objects"); zone_change(vm_object_zone, Z_CALLERACCT, FALSE); /* don't charge caller */ zone_change(vm_object_zone, Z_NOENCRYPT, TRUE); @@ -650,8 +641,10 @@ vm_object_bootstrap(void) */ /* memq; Lock; init after allocation */ - vm_object_template.memq.prev = NULL; - vm_object_template.memq.next = NULL; + + + vm_object_template.memq.prev = 0; + vm_object_template.memq.next = 0; #if 0 /* * We can't call vm_object_lock_init() here because that will @@ -661,6 +654,9 @@ vm_object_bootstrap(void) * the vm_object_template. 
*/ vm_object_lock_init(&vm_object_template); +#endif +#if DEVELOPMENT || DEBUG + vm_object_template.Lock_owner = 0; #endif vm_object_template.vo_size = 0; vm_object_template.memq_hint = VM_PAGE_NULL; @@ -721,9 +717,6 @@ vm_object_bootstrap(void) #if CONFIG_PHANTOM_CACHE vm_object_template.phantom_object_id = 0; #endif -#if MACH_PAGEMAP - vm_object_template.existence_map = VM_EXTERNAL_NULL; -#endif /* MACH_PAGEMAP */ vm_object_template.cow_hint = ~(vm_offset_t)0; #if MACH_ASSERT vm_object_template.paging_object = VM_OBJECT_NULL; @@ -761,6 +754,15 @@ vm_object_bootstrap(void) vm_object_template.vo_cache_ts = 0; vm_object_template.wire_tag = VM_KERN_MEMORY_NONE; + + vm_object_template.io_tracking = FALSE; + +#if CONFIG_SECLUDED_MEMORY + vm_object_template.eligible_for_secluded = FALSE; + vm_object_template.can_grab_secluded = FALSE; +#else /* CONFIG_SECLUDED_MEMORY */ + vm_object_template.__object3_unused_bits = 0; +#endif /* CONFIG_SECLUDED_MEMORY */ #if DEBUG bzero(&vm_object_template.purgeable_owner_bt[0], @@ -805,10 +807,6 @@ vm_object_bootstrap(void) * non-zone memory. 
*/ vm_object_reference(vm_submap_object); - -#if MACH_PAGEMAP - vm_external_module_initialize(); -#endif /* MACH_PAGEMAP */ } #if CONFIG_IOSCHED @@ -901,7 +899,7 @@ unsigned long vm_object_deallocate_shared_swap_failures = 0; __private_extern__ void vm_object_deallocate( - register vm_object_t object) + vm_object_t object) { #if VM_OBJECT_CACHE boolean_t retry_cache_trim = FALSE; @@ -1271,13 +1269,13 @@ vm_object_page_grab( vm_object_lock_assert_exclusive(object); - next_p = (vm_page_t)queue_first(&object->memq); + next_p = (vm_page_t)vm_page_queue_first(&object->memq); p_limit = MIN(50, object->resident_page_count); - while (!queue_end(&object->memq, (queue_entry_t)next_p) && --p_limit > 0) { + while (!vm_page_queue_end(&object->memq, (vm_page_queue_entry_t)next_p) && --p_limit > 0) { p = next_p; - next_p = (vm_page_t)queue_next(&next_p->listq); + next_p = (vm_page_t)vm_page_queue_next(&next_p->listq); if (VM_PAGE_WIRED(p) || p->busy || p->cleaning || p->laundry || p->fictitious) goto move_page_in_obj; @@ -1292,7 +1290,7 @@ vm_object_page_grab( if (p->reference == FALSE || p->dirty == FALSE) { - refmod_state = pmap_get_refmod(p->phys_page); + refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(p)); if (refmod_state & VM_MEM_REFERENCED) p->reference = TRUE; @@ -1302,7 +1300,7 @@ vm_object_page_grab( } if (p->dirty == FALSE && p->precious == FALSE) { - refmod_state = pmap_disconnect(p->phys_page); + refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(p)); if (refmod_state & VM_MEM_REFERENCED) p->reference = TRUE; @@ -1314,7 +1312,7 @@ vm_object_page_grab( goto take_page; } } - if (p->inactive && p->reference == TRUE) { + if ((p->vm_page_q_state != VM_PAGE_ON_ACTIVE_Q) && p->reference == TRUE) { vm_page_activate(p); VM_STAT_INCR(reactivations); @@ -1322,8 +1320,8 @@ vm_object_page_grab( } vm_page_unlock_queues(); move_page_in_obj: - queue_remove(&object->memq, p, vm_page_t, listq); - queue_enter(&object->memq, p, vm_page_t, listq); + 
vm_page_queue_remove(&object->memq, p, vm_page_t, listq); + vm_page_queue_enter(&object->memq, p, vm_page_t, listq); p_skipped++; continue; @@ -1357,6 +1355,9 @@ static void vm_object_cache_remove_locked( vm_object_t object) { + assert(object->purgable == VM_PURGABLE_DENY); + assert(object->wired_page_count == 0); + queue_remove(&vm_object_cached_list, object, vm_object_t, objq); object->objq.next = NULL; object->objq.prev = NULL; @@ -1383,6 +1384,9 @@ vm_object_cache_add( clock_sec_t sec; clock_nsec_t nsec; + assert(object->purgable == VM_PURGABLE_DENY); + assert(object->wired_page_count == 0); + if (object->resident_page_count == 0) return; clock_get_system_nanotime(&sec, &nsec); @@ -1455,6 +1459,9 @@ vm_object_cache_evict( object = next_obj; next_obj = (vm_object_t)queue_next(&next_obj->objq); + + assert(object->purgable == VM_PURGABLE_DENY); + assert(object->wired_page_count == 0); if (sec < object->vo_cache_ts) { KERNEL_DEBUG(0x130020c, object, object->resident_page_count, object->vo_cache_ts, sec, 0); @@ -1474,7 +1481,7 @@ vm_object_cache_evict( object = VM_OBJECT_NULL; continue; } - if (queue_empty(&object->memq) || object->vo_cache_pages_to_scan == 0) { + if (vm_page_queue_empty(&object->memq) || object->vo_cache_pages_to_scan == 0) { /* * this case really shouldn't happen, but it's not fatal * so deal with it... 
if we don't remove the object from @@ -1502,7 +1509,7 @@ vm_object_cache_evict( * object is locked at this point and * has resident pages */ - next_p = (vm_page_t)queue_first(&object->memq); + next_p = (vm_page_t)vm_page_queue_first(&object->memq); /* * break the page scan into 2 pieces to minimize the time spent @@ -1516,25 +1523,25 @@ vm_object_cache_evict( ep_limit = EVICT_PREPARE_LIMIT; ep_count = 0; - while (!queue_end(&object->memq, (queue_entry_t)next_p) && object->vo_cache_pages_to_scan && ep_count < ep_limit) { + while (!vm_page_queue_end(&object->memq, (vm_page_queue_entry_t)next_p) && object->vo_cache_pages_to_scan && ep_count < ep_limit) { p = next_p; - next_p = (vm_page_t)queue_next(&next_p->listq); + next_p = (vm_page_t)vm_page_queue_next(&next_p->listq); object->vo_cache_pages_to_scan--; if (VM_PAGE_WIRED(p) || p->busy || p->cleaning || p->laundry) { - queue_remove(&object->memq, p, vm_page_t, listq); - queue_enter(&object->memq, p, vm_page_t, listq); + vm_page_queue_remove(&object->memq, p, vm_page_t, listq); + vm_page_queue_enter(&object->memq, p, vm_page_t, listq); ep_skipped++; continue; } if (p->wpmapped || p->dirty || p->precious) { - queue_remove(&object->memq, p, vm_page_t, listq); - queue_enter(&object->memq, p, vm_page_t, listq); + vm_page_queue_remove(&object->memq, p, vm_page_t, listq); + vm_page_queue_enter(&object->memq, p, vm_page_t, listq); - pmap_clear_reference(p->phys_page); + pmap_clear_reference(VM_PAGE_GET_PHYS_PAGE(p)); } ep_array[ep_count++] = p; } @@ -1554,9 +1561,7 @@ vm_object_cache_evict( * we've already filtered out pages that are in the laundry * so if we get here, this page can't be on the pageout queue */ - assert(!p->pageout_queue); - - vm_page_queues_remove(p); + vm_page_queues_remove(p, FALSE); vm_page_enqueue_inactive(p, TRUE); ep_moved++; @@ -1566,12 +1571,12 @@ vm_object_cache_evict( #endif vm_page_free_prepare_queues(p); - assert(p->pageq.next == NULL && p->pageq.prev == NULL); + assert(p->pageq.next == 0 && 
p->pageq.prev == 0); /* * Add this page to our list of reclaimed pages, * to be freed later. */ - p->pageq.next = (queue_entry_t) local_free_q; + p->snext = local_free_q; local_free_q = p; ep_freed++; @@ -1637,7 +1642,7 @@ vm_object_t vm_object_cache_trim( boolean_t called_from_vm_object_deallocate) { - register vm_object_t object = VM_OBJECT_NULL; + vm_object_t object = VM_OBJECT_NULL; vm_object_t shadow; for (;;) { @@ -1743,6 +1748,8 @@ vm_object_terminate( XPR(XPR_VM_OBJECT, "vm_object_terminate, object 0x%X ref %d\n", object, object->ref_count, 0, 0, 0); + vm_object_lock_assert_exclusive(object); + if (!object->pageout && (!object->temporary || object->can_persist) && (object->pager != NULL || object->shadow_severed)) { /* @@ -1992,7 +1999,7 @@ vm_object_reap( vm_object_reap_pages(object, REAP_REAP); } - assert(queue_empty(&object->memq)); + assert(vm_page_queue_empty(&object->memq)); assert(object->paging_in_progress == 0); assert(object->activity_in_progress == 0); assert(object->ref_count == 0); @@ -2014,10 +2021,6 @@ vm_object_reap( vm_object_paging_end(object); vm_object_unlock(object); -#if MACH_PAGEMAP - vm_external_destroy(object->existence_map, object->vo_size); -#endif /* MACH_PAGEMAP */ - object->shadow = VM_OBJECT_NULL; #if VM_OBJECT_TRACKING @@ -2050,9 +2053,9 @@ unsigned int vm_max_batch = 256; vm_page_t m; \ for (m = _local_free_q; \ m != VM_PAGE_NULL; \ - m = (vm_page_t) m->pageq.next) { \ + m = m->snext) { \ if (m->pmapped) { \ - pmap_disconnect(m->phys_page); \ + pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)); \ } \ } \ } \ @@ -2092,7 +2095,7 @@ vm_object_reap_pages( } restart_after_sleep: - if (queue_empty(&object->memq)) + if (vm_page_queue_empty(&object->memq)) return; loop_count = BATCH_LIMIT(V_O_R_MAX_BATCH); @@ -2101,12 +2104,12 @@ vm_object_reap_pages( vm_page_lockspin_queues(); - next = (vm_page_t)queue_first(&object->memq); + next = (vm_page_t)vm_page_queue_first(&object->memq); - while (!queue_end(&object->memq, (queue_entry_t)next)) { + 
while (!vm_page_queue_end(&object->memq, (vm_page_queue_entry_t)next)) { p = next; - next = (vm_page_t)queue_next(&next->listq); + next = (vm_page_t)vm_page_queue_next(&next->listq); if (--loop_count == 0) { @@ -2147,11 +2150,8 @@ vm_object_reap_pages( goto restart_after_sleep; } - if (p->laundry) { - p->pageout = FALSE; - + if (p->laundry) vm_pageout_steal_laundry(p, TRUE); - } } switch (reap_type) { @@ -2175,11 +2175,9 @@ vm_object_reap_pages( vm_page_purged_wired++; continue; } - if (p->laundry && !p->busy && !p->cleaning) { - p->pageout = FALSE; - + if (p->laundry && !p->busy && !p->cleaning) vm_pageout_steal_laundry(p, TRUE); - } + if (p->cleaning || p->laundry || p->absent) { /* * page is being acted upon, @@ -2195,12 +2193,13 @@ vm_object_reap_pages( * sure that it gets considered by * vm_pageout_scan() later. */ - vm_page_deactivate(p); + if (VM_PAGE_PAGEABLE(p)) + vm_page_deactivate(p); vm_page_purged_busy++; continue; } - assert(p->object != kernel_object); + assert(VM_PAGE_OBJECT(p) != kernel_object); /* * we can discard this page... @@ -2209,7 +2208,7 @@ vm_object_reap_pages( /* * unmap the page */ - pmap_disconnect_options(p->phys_page, PMAP_OPTIONS_NOFLUSH | PMAP_OPTIONS_NOREFMOD, (void *)&pmap_flush_context_storage); + pmap_disconnect_options(VM_PAGE_GET_PHYS_PAGE(p), PMAP_OPTIONS_NOFLUSH | PMAP_OPTIONS_NOREFMOD, (void *)&pmap_flush_context_storage); } vm_page_purged_count++; @@ -2227,23 +2226,25 @@ vm_object_reap_pages( break; } if (p->fictitious) { - assert (p->phys_page == vm_page_guard_addr); + assert (VM_PAGE_GET_PHYS_PAGE(p) == vm_page_guard_addr); break; } if (!p->dirty && p->wpmapped) - p->dirty = pmap_is_modified(p->phys_page); + p->dirty = pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(p)); if ((p->dirty || p->precious) && !p->error && object->alive) { assert(!object->internal); - + + p->free_when_done = TRUE; + if (!p->laundry) { - vm_page_queues_remove(p); + vm_page_queues_remove(p, TRUE); /* * flush page... 
page will be freed * upon completion of I/O */ - (void)vm_pageout_cluster(p, TRUE, FALSE, FALSE); + (void)vm_pageout_cluster(p, FALSE, FALSE); } vm_page_unlock_queues(); /* @@ -2262,12 +2263,12 @@ vm_object_reap_pages( break; } vm_page_free_prepare_queues(p); - assert(p->pageq.next == NULL && p->pageq.prev == NULL); + assert(p->pageq.next == 0 && p->pageq.prev == 0); /* * Add this page to our list of reclaimed pages, * to be freed later. */ - p->pageq.next = (queue_entry_t) local_free_q; + p->snext = local_free_q; local_free_q = p; } vm_page_unlock_queues(); @@ -2520,9 +2521,9 @@ uint32_t vm_object_deactivate_all_pages_pages = 0; */ static void vm_object_deactivate_all_pages( - register vm_object_t object) + vm_object_t object) { - register vm_page_t p; + vm_page_t p; int loop_count; #if VM_OBJ_DEACT_ALL_STATS int pages_count; @@ -2534,7 +2535,7 @@ vm_object_deactivate_all_pages( pages_count = 0; #endif /* VM_OBJ_DEACT_ALL_STATS */ vm_page_lock_queues(); - queue_iterate(&object->memq, p, vm_page_t, listq) { + vm_page_queue_iterate(&object->memq, p, vm_page_t, listq) { if (--loop_count == 0) { #if VM_OBJ_DEACT_ALL_STATS hw_atomic_add(&vm_object_deactivate_all_pages_batches, @@ -2546,7 +2547,7 @@ vm_object_deactivate_all_pages( lck_mtx_yield(&vm_page_queue_lock); loop_count = BATCH_LIMIT(V_O_D_A_P_MAX_BATCH); } - if (!p->busy && !p->throttled) { + if (!p->busy && (p->vm_page_q_state != VM_PAGE_ON_THROTTLED_Q)) { #if VM_OBJ_DEACT_ALL_STATS pages_count++; #endif /* VM_OBJ_DEACT_ALL_STATS */ @@ -2649,68 +2650,16 @@ page_is_paged_out( vm_object_t object, vm_object_offset_t offset) { - kern_return_t kr; - memory_object_t pager; - - /* - * Check the existence map for the page if we have one, otherwise - * ask the pager about this page. 
- */ - -#if MACH_PAGEMAP - if (object->existence_map) { - if (vm_external_state_get(object->existence_map, offset) - == VM_EXTERNAL_STATE_EXISTS) { - /* - * We found the page - */ - - return TRUE; - } - } else -#endif /* MACH_PAGEMAP */ if (object->internal && object->alive && !object->terminating && object->pager_ready) { - if (COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) { - if (VM_COMPRESSOR_PAGER_STATE_GET(object, offset) - == VM_EXTERNAL_STATE_EXISTS) { - return TRUE; - } else { - return FALSE; - } - } - - /* - * We're already holding a "paging in progress" reference - * so the object can't disappear when we release the lock. - */ - - assert(object->paging_in_progress); - pager = object->pager; - vm_object_unlock(object); - - kr = memory_object_data_request( - pager, - offset + object->paging_offset, - 0, /* just poke the pager */ - VM_PROT_READ, - NULL); - - vm_object_lock(object); - - if (kr == KERN_SUCCESS) { - - /* - * We found the page - */ - + if (VM_COMPRESSOR_PAGER_STATE_GET(object, offset) + == VM_EXTERNAL_STATE_EXISTS) { return TRUE; } } - return FALSE; } @@ -2810,13 +2759,13 @@ deactivate_pages_in_object( * to simulate it being * reclaimed and re-faulted. */ - pmap_zero_page(m->phys_page); + pmap_zero_page(VM_PAGE_GET_PHYS_PAGE(m)); } m->precious = FALSE; m->dirty = FALSE; clear_refmod |= VM_MEM_MODIFIED; - if (m->throttled) { + if (m->vm_page_q_state == VM_PAGE_ON_THROTTLED_Q) { /* * This page is now clean and * reclaimable. 
Move it out @@ -2826,11 +2775,8 @@ deactivate_pages_in_object( */ dwp->dw_mask |= DW_move_page; } -#if MACH_PAGEMAP - vm_external_state_clr(object->existence_map, offset); -#endif /* MACH_PAGEMAP */ - VM_COMPRESSOR_PAGER_STATE_CLR(object, - offset); + + VM_COMPRESSOR_PAGER_STATE_CLR(object, offset); if (reusable_page && !m->reusable) { assert(!all_reusable); @@ -2848,12 +2794,12 @@ deactivate_pages_in_object( } } pmap_options |= PMAP_OPTIONS_NOFLUSH; - pmap_clear_refmod_options(m->phys_page, + pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(m), clear_refmod, pmap_options, (void *)pfc); - if (!m->throttled && !(reusable_page || all_reusable)) + if ((m->vm_page_q_state != VM_PAGE_ON_THROTTLED_Q) && !(reusable_page || all_reusable)) dwp->dw_mask |= DW_move_page; if (dwp->dw_mask) @@ -2891,14 +2837,10 @@ deactivate_pages_in_object( */ if ((kill_page) && (object->internal)) { -#if MACH_PAGEMAP - vm_external_state_clr(object->existence_map, offset); -#endif /* MACH_PAGEMAP */ - VM_COMPRESSOR_PAGER_STATE_CLR(object, - offset); - if (pmap != PMAP_NULL && - (COMPRESSED_PAGER_IS_ACTIVE || - DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE)) { + + VM_COMPRESSOR_PAGER_STATE_CLR(object, offset); + + if (pmap != PMAP_NULL) { /* * Tell pmap that this page * is no longer mapped, to @@ -3123,7 +3065,7 @@ vm_object_reuse_pages( * for all the pmaps that have mapped this \ * page. 
\ */ \ - pmap_clear_refmod_options((m)->phys_page, \ + pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE((m)), \ 0, /* refmod */ \ (PMAP_OPTIONS_CLEAR_REUSABLE \ | PMAP_OPTIONS_NOFLUSH), \ @@ -3147,7 +3089,7 @@ vm_object_reuse_pages( reused = object->resident_page_count; } else { vm_page_stats_reusable.partial_reuse_calls++; - queue_iterate(&object->memq, m, vm_page_t, listq) { + vm_page_queue_iterate(&object->memq, m, vm_page_t, listq) { if (m->offset < start_offset || m->offset >= end_offset) { m->reusable = TRUE; @@ -3174,7 +3116,7 @@ vm_object_reuse_pages( } } else { vm_page_stats_reusable.partial_reuse_calls++; - queue_iterate(&object->memq, m, vm_page_t, listq) { + vm_page_queue_iterate(&object->memq, m, vm_page_t, listq) { if (object->reusable_page_count == 0) { break; } @@ -3218,8 +3160,8 @@ vm_object_reuse_pages( __private_extern__ void vm_object_pmap_protect( - register vm_object_t object, - register vm_object_offset_t offset, + vm_object_t object, + vm_object_offset_t offset, vm_object_size_t size, pmap_t pmap, vm_map_offset_t pmap_start, @@ -3231,8 +3173,8 @@ vm_object_pmap_protect( __private_extern__ void vm_object_pmap_protect_options( - register vm_object_t object, - register vm_object_offset_t offset, + vm_object_t object, + vm_object_offset_t offset, vm_object_size_t size, pmap_t pmap, vm_map_offset_t pmap_start, @@ -3310,7 +3252,7 @@ vm_object_pmap_protect_options( end = offset + size; - queue_iterate(&object->memq, p, vm_page_t, listq) { + vm_page_queue_iterate(&object->memq, p, vm_page_t, listq) { if (!p->fictitious && (offset <= p->offset) && (p->offset < end)) { vm_map_offset_t start; @@ -3326,7 +3268,7 @@ vm_object_pmap_protect_options( &pmap_flush_context_storage); else pmap_page_protect_options( - p->phys_page, + VM_PAGE_GET_PHYS_PAGE(p), prot, options | PMAP_OPTIONS_NOFLUSH, &pmap_flush_context_storage); @@ -3361,7 +3303,7 @@ vm_object_pmap_protect_options( &pmap_flush_context_storage); else pmap_page_protect_options( - p->phys_page, + 
VM_PAGE_GET_PHYS_PAGE(p), prot, options | PMAP_OPTIONS_NOFLUSH, &pmap_flush_context_storage); @@ -3377,7 +3319,7 @@ vm_object_pmap_protect_options( * Must follow shadow chain to remove access * to pages in shadowed objects. */ - register vm_object_t next_object; + vm_object_t next_object; next_object = object->shadow; if (next_object != VM_OBJECT_NULL) { @@ -3437,7 +3379,7 @@ vm_object_pmap_protect_options( */ __private_extern__ kern_return_t vm_object_copy_slowly( - register vm_object_t src_object, + vm_object_t src_object, vm_object_offset_t src_offset, vm_object_size_t size, boolean_t interruptible, @@ -3520,9 +3462,10 @@ vm_object_copy_slowly( vm_prot_t prot = VM_PROT_READ; vm_page_t _result_page; vm_page_t top_page; - register vm_page_t result_page; kern_return_t error_code; + vm_object_t result_page_object; + vm_object_lock(src_object); @@ -3575,6 +3518,7 @@ vm_object_copy_slowly( switch(result) { case VM_FAULT_SUCCESS: result_page = _result_page; + result_page_object = VM_PAGE_OBJECT(result_page); /* * Copy the page to the new object. @@ -3586,7 +3530,7 @@ vm_object_copy_slowly( */ vm_page_copy(result_page, new_page); - vm_object_unlock(result_page->object); + vm_object_unlock(result_page_object); /* * Let go of both pages (make them @@ -3597,14 +3541,14 @@ vm_object_copy_slowly( PAGE_WAKEUP_DONE(new_page); vm_object_unlock(new_object); - vm_object_lock(result_page->object); + vm_object_lock(result_page_object); PAGE_WAKEUP_DONE(result_page); vm_page_lockspin_queues(); - if (!result_page->active && - !result_page->inactive && - !result_page->throttled) + if ((result_page->vm_page_q_state == VM_PAGE_ON_SPECULATIVE_Q) || + (result_page->vm_page_q_state == VM_PAGE_NOT_ON_Q)) { vm_page_activate(result_page); + } vm_page_activate(new_page); vm_page_unlock_queues(); @@ -3613,7 +3557,7 @@ vm_object_copy_slowly( * top-level placeholder page, if any. 
*/ - vm_fault_cleanup(result_page->object, + vm_fault_cleanup(result_page_object, top_page); break; @@ -4041,7 +3985,7 @@ vm_object_copy_delayed( pmap_flush_context_init(&pmap_flush_context_storage); delayed_pmap_flush = FALSE; - queue_iterate(&src_object->memq, p, vm_page_t, listq) { + vm_page_queue_iterate(&src_object->memq, p, vm_page_t, listq) { if (!p->fictitious && p->offset >= old_copy->vo_size && p->offset < copy_size) { @@ -4058,7 +4002,7 @@ vm_object_copy_delayed( return VM_OBJECT_NULL; } else { - pmap_page_protect_options(p->phys_page, (VM_PROT_ALL & ~VM_PROT_WRITE), + pmap_page_protect_options(VM_PAGE_GET_PHYS_PAGE(p), (VM_PROT_ALL & ~VM_PROT_WRITE), PMAP_OPTIONS_NOFLUSH, (void *)&pmap_flush_context_storage); delayed_pmap_flush = TRUE; } @@ -4141,7 +4085,7 @@ vm_object_copy_delayed( pmap_flush_context_init(&pmap_flush_context_storage); delayed_pmap_flush = FALSE; - queue_iterate(&src_object->memq, p, vm_page_t, listq) { + vm_page_queue_iterate(&src_object->memq, p, vm_page_t, listq) { if (!p->fictitious && p->offset < copy_size) { if (VM_PAGE_WIRED(p)) { if (old_copy) @@ -4155,7 +4099,7 @@ vm_object_copy_delayed( return VM_OBJECT_NULL; } else { - pmap_page_protect_options(p->phys_page, (VM_PROT_ALL & ~VM_PROT_WRITE), + pmap_page_protect_options(VM_PAGE_GET_PHYS_PAGE(p), (VM_PROT_ALL & ~VM_PROT_WRITE), PMAP_OPTIONS_NOFLUSH, (void *)&pmap_flush_context_storage); delayed_pmap_flush = TRUE; } @@ -4222,7 +4166,7 @@ vm_object_copy_delayed( */ __private_extern__ kern_return_t vm_object_copy_strategically( - register vm_object_t src_object, + vm_object_t src_object, vm_object_offset_t src_offset, vm_object_size_t size, vm_object_t *dst_object, /* OUT */ @@ -4336,8 +4280,8 @@ vm_object_shadow( vm_object_offset_t *offset, /* IN/OUT */ vm_object_size_t length) { - register vm_object_t source; - register vm_object_t result; + vm_object_t source; + vm_object_t result; source = *object; assert(source != VM_OBJECT_NULL); @@ -4371,14 +4315,28 @@ vm_object_shadow( * 
(freeing up the extra data it might contain and that * we don't need). */ + + assert(source->copy_strategy != MEMORY_OBJECT_COPY_NONE); /* Purgeable objects shouldn't have shadow objects. */ + if (vm_object_shadow_check && source->vo_size == length && source->ref_count == 1 && (source->shadow == VM_OBJECT_NULL || source->shadow->copy == VM_OBJECT_NULL) ) { - source->shadowed = FALSE; - return FALSE; + /* lock the object and check again */ + vm_object_lock(source); + if (source->vo_size == length && + source->ref_count == 1 && + (source->shadow == VM_OBJECT_NULL || + source->shadow->copy == VM_OBJECT_NULL)) + { + source->shadowed = FALSE; + vm_object_unlock(source); + return FALSE; + } + /* things changed while we were locking "source"... */ + vm_object_unlock(source); } /* @@ -4418,7 +4376,7 @@ vm_object_shadow( * the memory_object requires careful synchronization. * * All associations are created by memory_object_create_named - * for external pagers and vm_object_pager_create for internal + * for external pagers and vm_object_compressor_pager_create for internal * objects as follows: * * pager: the memory_object itself, supplied by @@ -4508,7 +4466,7 @@ vm_object_enter( boolean_t init, boolean_t named) { - register vm_object_t object; + vm_object_t object; vm_object_t new_object; boolean_t must_init; vm_object_hash_entry_t entry, new_entry; @@ -4708,6 +4666,7 @@ vm_object_enter( if (named) object->named = TRUE; if (internal) { + vm_object_lock_assert_exclusive(object); object->pager_ready = TRUE; vm_object_wakeup(object, VM_OBJECT_EVENT_PAGER_READY); } @@ -4741,7 +4700,7 @@ vm_object_enter( } /* - * Routine: vm_object_pager_create + * Routine: vm_object_compressor_pager_create * Purpose: * Create a memory object for an internal object. * In/out conditions: @@ -4749,134 +4708,14 @@ vm_object_enter( * it may be unlocked within this call. 
* Limitations: * Only one thread may be performing a - * vm_object_pager_create on an object at + * vm_object_compressor_pager_create on an object at * a time. Presumably, only the pageout * daemon will be using this routine. */ -void -vm_object_pager_create( - register vm_object_t object) -{ - memory_object_t pager; - vm_object_hash_entry_t entry; - lck_mtx_t *lck; -#if MACH_PAGEMAP - vm_object_size_t size; - vm_external_map_t map; -#endif /* MACH_PAGEMAP */ - - XPR(XPR_VM_OBJECT, "vm_object_pager_create, object 0x%X\n", - object, 0,0,0,0); - - assert(object != kernel_object); - - if (memory_manager_default_check() != KERN_SUCCESS) - return; - - /* - * Prevent collapse or termination by holding a paging reference - */ - - vm_object_paging_begin(object); - if (object->pager_created) { - /* - * Someone else got to it first... - * wait for them to finish initializing the ports - */ - while (!object->pager_initialized) { - vm_object_sleep(object, - VM_OBJECT_EVENT_INITIALIZED, - THREAD_UNINT); - } - vm_object_paging_end(object); - return; - } - - /* - * Indicate that a memory object has been assigned - * before dropping the lock, to prevent a race. - */ - - object->pager_created = TRUE; - object->paging_offset = 0; - -#if MACH_PAGEMAP - size = object->vo_size; -#endif /* MACH_PAGEMAP */ - vm_object_unlock(object); - -#if MACH_PAGEMAP - if (DEFAULT_PAGER_IS_ACTIVE) { - map = vm_external_create(size); - vm_object_lock(object); - assert(object->vo_size == size); - object->existence_map = map; - vm_object_unlock(object); - } -#endif /* MACH_PAGEMAP */ - - if ((uint32_t) object->vo_size != object->vo_size) { - panic("vm_object_pager_create(): object size 0x%llx >= 4GB\n", - (uint64_t) object->vo_size); - } - - /* - * Create the [internal] pager, and associate it with this object. - * - * We make the association here so that vm_object_enter() - * can look up the object to complete initializing it. No - * user will ever map this object. 
- */ - { - memory_object_default_t dmm; - - /* acquire a reference for the default memory manager */ - dmm = memory_manager_default_reference(); - - assert(object->temporary); - - /* create our new memory object */ - assert((vm_size_t) object->vo_size == object->vo_size); - (void) memory_object_create(dmm, (vm_size_t) object->vo_size, - &pager); - - memory_object_default_deallocate(dmm); - } - - entry = vm_object_hash_entry_alloc(pager); - - vm_object_lock(object); - lck = vm_object_hash_lock_spin(pager); - vm_object_hash_insert(entry, object); - vm_object_hash_unlock(lck); - vm_object_unlock(object); - - /* - * A reference was returned by - * memory_object_create(), and it is - * copied by vm_object_enter(). - */ - - if (vm_object_enter(pager, object->vo_size, TRUE, TRUE, FALSE) != object) - panic("vm_object_pager_create: mismatch"); - - /* - * Drop the reference we were passed. - */ - memory_object_deallocate(pager); - - vm_object_lock(object); - - /* - * Release the paging reference - */ - vm_object_paging_end(object); -} - void vm_object_compressor_pager_create( - register vm_object_t object) + vm_object_t object) { memory_object_t pager; vm_object_hash_entry_t entry; @@ -5015,11 +4854,6 @@ static long object_bypasses = 0; static boolean_t vm_object_collapse_allowed = TRUE; static boolean_t vm_object_bypass_allowed = TRUE; -#if MACH_PAGEMAP -static int vm_external_discarded; -static int vm_external_collapsed; -#endif - unsigned long vm_object_collapse_encrypted = 0; void vm_object_do_collapse_compressor(vm_object_t object, @@ -5135,9 +4969,9 @@ vm_object_do_collapse( * pages that shadow them. 
*/ - while (!queue_empty(&backing_object->memq)) { + while (!vm_page_queue_empty(&backing_object->memq)) { - p = (vm_page_t) queue_first(&backing_object->memq); + p = (vm_page_t) vm_page_queue_first(&backing_object->memq); new_offset = (p->offset - backing_offset); @@ -5188,25 +5022,6 @@ vm_object_do_collapse( vm_page_rename(p, object, new_offset, TRUE); } - -#if MACH_PAGEMAP - } else if (pp->absent) { - - /* - * Parent has an absent page... - * it's not being paged in, so - * it must really be missing from - * the parent. - * - * Throw out the absent page... - * any faults looking for that - * page will restart with the new - * one. - */ - - VM_PAGE_FREE(pp); - vm_page_rename(p, object, new_offset, TRUE); -#endif /* MACH_PAGEMAP */ } else { assert(! pp->absent); @@ -5230,16 +5045,10 @@ vm_object_do_collapse( } else if (backing_object->pager != MEMORY_OBJECT_NULL) { vm_object_hash_entry_t entry; -#if !MACH_PAGEMAP assert((!object->pager_created && (object->pager == MEMORY_OBJECT_NULL)) || (!backing_object->pager_created && (backing_object->pager == MEMORY_OBJECT_NULL))); -#else - assert(!object->pager_created && - object->pager == MEMORY_OBJECT_NULL); -#endif /* !MACH_PAGEMAP */ - /* * Move the pager from backing_object to object. * @@ -5282,33 +5091,6 @@ vm_object_do_collapse( backing_object->paging_offset = 0; backing_object->pager = NULL; } - -#if MACH_PAGEMAP - /* - * If the shadow offset is 0, the use the existence map from - * the backing object if there is one. If the shadow offset is - * not zero, toss it. - * - * XXX - If the shadow offset is not 0 then a bit copy is needed - * if the map is to be salvaged. For now, we just just toss the - * old map, giving the collapsed object no map. This means that - * the pager is invoked for zero fill pages. If analysis shows - * that this happens frequently and is a performance hit, then - * this code should be fixed to salvage the map. 
- */ - assert(object->existence_map == VM_EXTERNAL_NULL); - if (backing_offset || (size != backing_object->vo_size)) { - vm_external_discarded++; - vm_external_destroy(backing_object->existence_map, - backing_object->vo_size); - } - else { - vm_external_collapsed++; - object->existence_map = backing_object->existence_map; - } - backing_object->existence_map = VM_EXTERNAL_NULL; -#endif /* MACH_PAGEMAP */ - /* * Object now shadows whatever backing_object did. * Note that the reference to backing_object->shadow @@ -5522,13 +5304,13 @@ static unsigned long vm_object_collapse_do_bypass = 0; __private_extern__ void vm_object_collapse( - register vm_object_t object, - register vm_object_offset_t hint_offset, + vm_object_t object, + vm_object_offset_t hint_offset, boolean_t can_bypass) { - register vm_object_t backing_object; - register unsigned int rcount; - register unsigned int size; + vm_object_t backing_object; + unsigned int rcount; + unsigned int size; vm_object_t original_object; int object_lock_type; int backing_object_lock_type; @@ -5675,22 +5457,13 @@ vm_object_collapse( * object, we may be able to collapse it into the * parent. * - * If MACH_PAGEMAP is defined: - * The parent must not have a pager created for it, - * since collapsing a backing_object dumps new pages - * into the parent that its pager doesn't know about - * (and the collapse code can't merge the existence - * maps). - * Otherwise: * As long as one of the objects is still not known * to the pager, we can collapse them. */ if (backing_object->ref_count == 1 && (vm_object_collapse_compressor_allowed || !object->pager_created -#if !MACH_PAGEMAP || (!backing_object->pager_created) -#endif /*!MACH_PAGEMAP */ ) && vm_object_collapse_allowed) { /* @@ -5769,11 +5542,7 @@ vm_object_collapse( * then we cannot bypass it, because we don't know * what pages it has. 
*/ - if (backing_object->pager_created -#if MACH_PAGEMAP - && (backing_object->existence_map == VM_EXTERNAL_NULL) -#endif /* MACH_PAGEMAP */ - ) { + if (backing_object->pager_created) { /* try and collapse the rest of the shadow chain */ if (object != original_object) { vm_object_unlock(object); @@ -5788,11 +5557,7 @@ vm_object_collapse( * then we cannot bypass it, because we don't know * what pages it has. */ - if (object->pager_created -#if MACH_PAGEMAP - && (object->existence_map == VM_EXTERNAL_NULL) -#endif /* MACH_PAGEMAP */ - ) { + if (object->pager_created) { /* try and collapse the rest of the shadow chain */ if (object != original_object) { vm_object_unlock(object); @@ -5833,20 +5598,10 @@ vm_object_collapse( * */ -#if MACH_PAGEMAP -#define EXISTS_IN_OBJECT(obj, off, rc) \ - ((vm_external_state_get((obj)->existence_map, \ - (vm_offset_t)(off)) \ - == VM_EXTERNAL_STATE_EXISTS) || \ - (VM_COMPRESSOR_PAGER_STATE_GET((obj), (off)) \ - == VM_EXTERNAL_STATE_EXISTS) || \ - ((rc) && vm_page_lookup((obj), (off)) != VM_PAGE_NULL && (rc)--)) -#else /* MACH_PAGEMAP */ #define EXISTS_IN_OBJECT(obj, off, rc) \ ((VM_COMPRESSOR_PAGER_STATE_GET((obj), (off)) \ == VM_EXTERNAL_STATE_EXISTS) || \ ((rc) && vm_page_lookup((obj), (off)) != VM_PAGE_NULL && (rc)--)) -#endif /* MACH_PAGEMAP */ /* * Check the hint location first @@ -5891,7 +5646,7 @@ vm_object_collapse( vm_page_t p; backing_rcount = backing_object->resident_page_count; - p = (vm_page_t)queue_first(&backing_object->memq); + p = (vm_page_t)vm_page_queue_first(&backing_object->memq); do { offset = (p->offset - backing_offset); @@ -5903,7 +5658,7 @@ vm_object_collapse( break; } - p = (vm_page_t) queue_next(&p->listq); + p = (vm_page_t) vm_page_queue_next(&p->listq); } while (--backing_rcount); if (backing_rcount != 0 ) { @@ -5921,11 +5676,7 @@ vm_object_collapse( * Walk through the offsets looking for pages in the * backing object that show through to the object. 
*/ - if (backing_rcount -#if MACH_PAGEMAP - || backing_object->existence_map -#endif /* MACH_PAGEMAP */ - ) { + if (backing_rcount) { offset = hint_offset; while((offset = @@ -6007,11 +5758,11 @@ unsigned int vm_object_page_remove_iterate = 0; __private_extern__ void vm_object_page_remove( - register vm_object_t object, - register vm_object_offset_t start, - register vm_object_offset_t end) + vm_object_t object, + vm_object_offset_t start, + vm_object_offset_t end) { - register vm_page_t p, next; + vm_page_t p, next; /* * One and two page removals are most popular. @@ -6025,22 +5776,22 @@ vm_object_page_remove( for (; start < end; start += PAGE_SIZE_64) { p = vm_page_lookup(object, start); if (p != VM_PAGE_NULL) { - assert(!p->cleaning && !p->pageout && !p->laundry); + assert(!p->cleaning && !p->laundry); if (!p->fictitious && p->pmapped) - pmap_disconnect(p->phys_page); + pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(p)); VM_PAGE_FREE(p); } } } else { vm_object_page_remove_iterate++; - p = (vm_page_t) queue_first(&object->memq); - while (!queue_end(&object->memq, (queue_entry_t) p)) { - next = (vm_page_t) queue_next(&p->listq); + p = (vm_page_t) vm_page_queue_first(&object->memq); + while (!vm_page_queue_end(&object->memq, (vm_page_queue_entry_t) p)) { + next = (vm_page_t) vm_page_queue_next(&p->listq); if ((start <= p->offset) && (p->offset < end)) { - assert(!p->cleaning && !p->pageout && !p->laundry); + assert(!p->cleaning && !p->laundry); if (!p->fictitious && p->pmapped) - pmap_disconnect(p->phys_page); + pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(p)); VM_PAGE_FREE(p); } p = next; @@ -6076,7 +5827,7 @@ static int vm_object_coalesce_count = 0; __private_extern__ boolean_t vm_object_coalesce( - register vm_object_t prev_object, + vm_object_t prev_object, vm_object_t next_object, vm_object_offset_t prev_offset, __unused vm_object_offset_t next_offset, @@ -6146,18 +5897,6 @@ vm_object_coalesce( */ newsize = prev_offset + prev_size + next_size; if (newsize > 
prev_object->vo_size) { -#if MACH_PAGEMAP - /* - * We cannot extend an object that has existence info, - * since the existence info might then fail to cover - * the entire object. - * - * This assertion must be true because the object - * has no pager, and we only create existence info - * for objects with pagers. - */ - assert(prev_object->existence_map == VM_EXTERNAL_NULL); -#endif /* MACH_PAGEMAP */ prev_object->vo_size = newsize; } @@ -6197,16 +5936,16 @@ vm_object_populate_with_private( if (m != VM_PAGE_NULL) { if (m->fictitious) { - if (m->phys_page != vm_page_guard_addr) { + if (VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr) { vm_page_lockspin_queues(); m->private = TRUE; vm_page_unlock_queues(); m->fictitious = FALSE; - m->phys_page = base_page; + VM_PAGE_SET_PHYS_PAGE(m, base_page); } - } else if (m->phys_page != base_page) { + } else if (VM_PAGE_GET_PHYS_PAGE(m) != base_page) { if ( !m->private) { /* @@ -6218,9 +5957,9 @@ vm_object_populate_with_private( /* * pmap call to clear old mapping */ - pmap_disconnect(m->phys_page); + pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)); } - m->phys_page = base_page; + VM_PAGE_SET_PHYS_PAGE(m, base_page); } if (m->encrypted) { /* @@ -6239,7 +5978,7 @@ vm_object_populate_with_private( */ m->private = TRUE; m->fictitious = FALSE; - m->phys_page = base_page; + VM_PAGE_SET_PHYS_PAGE(m, base_page); m->unusual = TRUE; m->busy = FALSE; @@ -6287,7 +6026,7 @@ memory_object_free_from_cache( #if VM_OBJECT_CACHE int object_released = 0; - register vm_object_t object = VM_OBJECT_NULL; + vm_object_t object = VM_OBJECT_NULL; vm_object_t shadow; /* @@ -6685,8 +6424,9 @@ vm_object_purge(vm_object_t object, int flags) vm_object_reap_pages(object, REAP_PURGEABLE); - if (object->pager != NULL && - COMPRESSED_PAGER_IS_ACTIVE) { + if (object->pager != NULL) { + + assert(VM_CONFIG_COMPRESSOR_IS_PRESENT); if (object->activity_in_progress == 0 && object->paging_in_progress == 0) { @@ -6936,13 +6676,13 @@ vm_object_purgable_control( vm_page_t p; 
int refmod; - queue_iterate(&object->memq, p, vm_page_t, listq) { + vm_page_queue_iterate(&object->memq, p, vm_page_t, listq) { if (p->busy || VM_PAGE_WIRED(p) || p->fictitious) { continue; } - refmod = pmap_disconnect(p->phys_page); + refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(p)); if ((refmod & VM_MEM_MODIFIED) && !p->dirty) { SET_PAGE_DIRTY(p, FALSE); @@ -7071,13 +6811,13 @@ vm_object_purgable_control( vm_page_t p; int refmod; - queue_iterate(&object->memq, p, vm_page_t, listq) { + vm_page_queue_iterate(&object->memq, p, vm_page_t, listq) { if (p->busy || VM_PAGE_WIRED(p) || p->fictitious) { continue; } - refmod = pmap_disconnect(p->phys_page); + refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(p)); if ((refmod & VM_MEM_MODIFIED) && !p->dirty) { SET_PAGE_DIRTY(p, FALSE); @@ -7185,7 +6925,7 @@ vm_object_get_page_counts( if (object->resident_page_count <= (size >> PAGE_SHIFT)) { - queue_iterate(&object->memq, p, vm_page_t, listq) { + vm_page_queue_iterate(&object->memq, p, vm_page_t, listq) { if (p->offset >= cur_offset && p->offset < end_offset) { @@ -7193,7 +6933,7 @@ vm_object_get_page_counts( if (count_dirty_pages) { - if (p->dirty || (p->wpmapped && pmap_is_modified(p->phys_page))) { + if (p->dirty || (p->wpmapped && pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(p)))) { local_dirty_count++; } @@ -7212,7 +6952,7 @@ vm_object_get_page_counts( if (count_dirty_pages) { - if (p->dirty || (p->wpmapped && pmap_is_modified(p->phys_page))) { + if (p->dirty || (p->wpmapped && pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(p)))) { local_dirty_count++; } @@ -7327,7 +7067,7 @@ vm_object_res_reference( #endif __private_extern__ void vm_object_reference( - register vm_object_t object) + vm_object_t object) { if (object == VM_OBJECT_NULL) return; @@ -7484,52 +7224,52 @@ vm_object_transpose( * Transpose the lists of resident pages. * This also updates the resident_page_count and the memq_hint. 
*/ - if (object1->phys_contiguous || queue_empty(&object1->memq)) { + if (object1->phys_contiguous || vm_page_queue_empty(&object1->memq)) { /* * No pages in object1, just transfer pages * from object2 to object1. No need to go through * an intermediate object. */ - while (!queue_empty(&object2->memq)) { - page = (vm_page_t) queue_first(&object2->memq); + while (!vm_page_queue_empty(&object2->memq)) { + page = (vm_page_t) vm_page_queue_first(&object2->memq); vm_page_rename(page, object1, page->offset, FALSE); } - assert(queue_empty(&object2->memq)); - } else if (object2->phys_contiguous || queue_empty(&object2->memq)) { + assert(vm_page_queue_empty(&object2->memq)); + } else if (object2->phys_contiguous || vm_page_queue_empty(&object2->memq)) { /* * No pages in object2, just transfer pages * from object1 to object2. No need to go through * an intermediate object. */ - while (!queue_empty(&object1->memq)) { - page = (vm_page_t) queue_first(&object1->memq); + while (!vm_page_queue_empty(&object1->memq)) { + page = (vm_page_t) vm_page_queue_first(&object1->memq); vm_page_rename(page, object2, page->offset, FALSE); } - assert(queue_empty(&object1->memq)); + assert(vm_page_queue_empty(&object1->memq)); } else { /* transfer object1's pages to tmp_object */ - while (!queue_empty(&object1->memq)) { - page = (vm_page_t) queue_first(&object1->memq); + while (!vm_page_queue_empty(&object1->memq)) { + page = (vm_page_t) vm_page_queue_first(&object1->memq); page_offset = page->offset; vm_page_remove(page, TRUE); page->offset = page_offset; - queue_enter(&tmp_object->memq, page, vm_page_t, listq); + vm_page_queue_enter(&tmp_object->memq, page, vm_page_t, listq); } - assert(queue_empty(&object1->memq)); + assert(vm_page_queue_empty(&object1->memq)); /* transfer object2's pages to object1 */ - while (!queue_empty(&object2->memq)) { - page = (vm_page_t) queue_first(&object2->memq); + while (!vm_page_queue_empty(&object2->memq)) { + page = (vm_page_t) 
vm_page_queue_first(&object2->memq); vm_page_rename(page, object1, page->offset, FALSE); } - assert(queue_empty(&object2->memq)); + assert(vm_page_queue_empty(&object2->memq)); /* transfer tmp_object's pages to object2 */ - while (!queue_empty(&tmp_object->memq)) { - page = (vm_page_t) queue_first(&tmp_object->memq); - queue_remove(&tmp_object->memq, page, - vm_page_t, listq); + while (!vm_page_queue_empty(&tmp_object->memq)) { + page = (vm_page_t) vm_page_queue_first(&tmp_object->memq); + vm_page_queue_remove(&tmp_object->memq, page, + vm_page_t, listq); vm_page_insert(page, object2, page->offset); } - assert(queue_empty(&tmp_object->memq)); + assert(vm_page_queue_empty(&tmp_object->memq)); } #define __TRANSPOSE_FIELD(field) \ @@ -7616,9 +7356,6 @@ MACRO_END __TRANSPOSE_FIELD(pages_created); __TRANSPOSE_FIELD(pages_used); __TRANSPOSE_FIELD(scan_collisions); -#if MACH_PAGEMAP - __TRANSPOSE_FIELD(existence_map); -#endif __TRANSPOSE_FIELD(cow_hint); #if MACH_ASSERT __TRANSPOSE_FIELD(paging_object); @@ -7709,7 +7446,14 @@ MACRO_END extern int speculative_reads_disabled; extern int ignore_is_ssd; -unsigned int preheat_max_bytes = MAX_UPL_TRANSFER_BYTES; +/* + * Try to always keep these values an even multiple of PAGE_SIZE. We use these values + * to derive min_ph_bytes and max_ph_bytes (IMP: bytes not # of pages) and expect those values to + * always be page-aligned. The derivation could involve operations (e.g. division) + * that could give us non-page-size aligned values if we start out with values that + * are odd multiples of PAGE_SIZE. 
+ */ + unsigned int preheat_max_bytes = MAX_UPL_TRANSFER_BYTES; unsigned int preheat_min_bytes = (1024 * 32); @@ -7776,7 +7520,16 @@ vm_object_cluster_size(vm_object_t object, vm_object_offset_t *start, if (isSSD) { min_ph_size /= 2; max_ph_size /= 8; + + if (min_ph_size & PAGE_MASK_64) { + min_ph_size = trunc_page(min_ph_size); + } + + if (max_ph_size & PAGE_MASK_64) { + max_ph_size = trunc_page(max_ph_size); + } } + if (min_ph_size < PAGE_SIZE) min_ph_size = PAGE_SIZE; @@ -7955,7 +7708,7 @@ vm_object_cluster_size(vm_object_t object, vm_object_offset_t *start, tail_size = 0; } assert( !(target_start & PAGE_MASK_64)); - assert( !(pre_heat_size & PAGE_MASK)); + assert( !(pre_heat_size & PAGE_MASK_64)); if (pre_heat_size <= PAGE_SIZE) goto out; @@ -7975,21 +7728,11 @@ vm_object_cluster_size(vm_object_t object, vm_object_offset_t *start, */ if (offset < fault_info->lo_offset) break; - /* - * for external objects and internal objects w/o an existence map - * vm_externl_state_get will return VM_EXTERNAL_STATE_UNKNOWN + /* + * for external objects or internal objects w/o a pager, + * VM_COMPRESSOR_PAGER_STATE_GET will return VM_EXTERNAL_STATE_UNKNOWN */ -#if MACH_PAGEMAP - if (vm_external_state_get(object->existence_map, offset) == VM_EXTERNAL_STATE_ABSENT) { - /* - * we know for a fact that the pager can't provide the page - * so don't include it or any pages beyond it in this cluster - */ - break; - } -#endif /* MACH_PAGEMAP */ - if (VM_COMPRESSOR_PAGER_STATE_GET(object, offset) - == VM_EXTERNAL_STATE_ABSENT) { + if (VM_COMPRESSOR_PAGER_STATE_GET(object, offset) == VM_EXTERNAL_STATE_ABSENT) { break; } if (vm_page_lookup(object, offset) != VM_PAGE_NULL) { @@ -8011,19 +7754,10 @@ vm_object_cluster_size(vm_object_t object, vm_object_offset_t *start, break; assert(offset < object_size); - /* - * for external objects and internal objects w/o an existence map - * vm_externl_state_get will return VM_EXTERNAL_STATE_UNKNOWN + /* + * for external objects or internal objects w/o 
a pager, + * VM_COMPRESSOR_PAGER_STATE_GET will return VM_EXTERNAL_STATE_UNKNOWN */ -#if MACH_PAGEMAP - if (vm_external_state_get(object->existence_map, offset) == VM_EXTERNAL_STATE_ABSENT) { - /* - * we know for a fact that the pager can't provide the page - * so don't include it or any pages beyond it in this cluster - */ - break; - } -#endif /* MACH_PAGEMAP */ if (VM_COMPRESSOR_PAGER_STATE_GET(object, offset) == VM_EXTERNAL_STATE_ABSENT) { break; } @@ -8099,7 +7833,7 @@ vm_object_page_op( if (ops & UPL_POP_DUMP) { if (dst_page->pmapped == TRUE) - pmap_disconnect(dst_page->phys_page); + pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page)); VM_PAGE_FREE(dst_page); break; @@ -8112,7 +7846,7 @@ vm_object_page_op( /* are undertaken */ if(dst_page->dirty) *flags |= UPL_POP_DIRTY; - if(dst_page->pageout) *flags |= UPL_POP_PAGEOUT; + if(dst_page->free_when_done) *flags |= UPL_POP_PAGEOUT; if(dst_page->precious) *flags |= UPL_POP_PRECIOUS; if(dst_page->absent) *flags |= UPL_POP_ABSENT; if(dst_page->busy) *flags |= UPL_POP_BUSY; @@ -8132,7 +7866,7 @@ vm_object_page_op( if (ops & UPL_POP_DIRTY) { SET_PAGE_DIRTY(dst_page, FALSE); } - if (ops & UPL_POP_PAGEOUT) dst_page->pageout = TRUE; + if (ops & UPL_POP_PAGEOUT) dst_page->free_when_done = TRUE; if (ops & UPL_POP_PRECIOUS) dst_page->precious = TRUE; if (ops & UPL_POP_ABSENT) dst_page->absent = TRUE; if (ops & UPL_POP_BUSY) dst_page->busy = TRUE; @@ -8141,7 +7875,7 @@ vm_object_page_op( if(ops & UPL_POP_CLR) { assert(dst_page->busy); if (ops & UPL_POP_DIRTY) dst_page->dirty = FALSE; - if (ops & UPL_POP_PAGEOUT) dst_page->pageout = FALSE; + if (ops & UPL_POP_PAGEOUT) dst_page->free_when_done = FALSE; if (ops & UPL_POP_PRECIOUS) dst_page->precious = FALSE; if (ops & UPL_POP_ABSENT) dst_page->absent = FALSE; if (ops & UPL_POP_BUSY) { @@ -8190,7 +7924,7 @@ vm_object_page_op( */ assert(dst_page->busy); assert(!dst_page->encrypted); - *phys_entry = dst_page->phys_page; + *phys_entry = VM_PAGE_GET_PHYS_PAGE(dst_page); } break; @@ 
-8264,13 +7998,11 @@ vm_object_range_op( */ continue; } - if (dst_page->laundry) { - dst_page->pageout = FALSE; - + if (dst_page->laundry) vm_pageout_steal_laundry(dst_page, FALSE); - } + if (dst_page->pmapped == TRUE) - pmap_disconnect(dst_page->phys_page); + pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page)); VM_PAGE_FREE(dst_page); @@ -8329,11 +8061,20 @@ kern_return_t pager_map_to_phys_contiguous( } clobbered_private = pager_object->private; - pager_object->private = TRUE; + if (pager_object->private != TRUE) { + vm_object_lock(pager_object); + pager_object->private = TRUE; + vm_object_unlock(pager_object); + } retval = vm_object_populate_with_private(pager_object, offset, page_num, size); - if (retval != KERN_SUCCESS) - pager_object->private = clobbered_private; + if (retval != KERN_SUCCESS) { + if (pager_object->private != clobbered_private) { + vm_object_lock(pager_object); + pager_object->private = clobbered_private; + vm_object_unlock(pager_object); + } + } out: return retval; @@ -8349,6 +8090,9 @@ vm_object_lock(vm_object_t object) mutex_pause(2); } lck_rw_lock_exclusive(&object->Lock); +#if DEVELOPMENT || DEBUG + object->Lock_owner = current_thread(); +#endif } boolean_t @@ -8364,7 +8108,14 @@ vm_object_lock_avoid(vm_object_t object) boolean_t _vm_object_lock_try(vm_object_t object) { - return (lck_rw_try_lock_exclusive(&object->Lock)); + boolean_t retval; + + retval = lck_rw_try_lock_exclusive(&object->Lock); +#if DEVELOPMENT || DEBUG + if (retval == TRUE) + object->Lock_owner = current_thread(); +#endif + return (retval); } boolean_t @@ -8397,6 +8148,31 @@ vm_object_lock_try_shared(vm_object_t object) return (lck_rw_try_lock_shared(&object->Lock)); } +boolean_t +vm_object_lock_upgrade(vm_object_t object) +{ boolean_t retval; + + retval = lck_rw_lock_shared_to_exclusive(&object->Lock); +#if DEVELOPMENT || DEBUG + if (retval == TRUE) + object->Lock_owner = current_thread(); +#endif + return (retval); +} + +void +vm_object_unlock(vm_object_t object) +{ +#if 
DEVELOPMENT || DEBUG + if (object->Lock_owner) { + if (object->Lock_owner != current_thread()) + panic("vm_object_unlock: not owner - %p\n", object); + object->Lock_owner = 0; + } +#endif + lck_rw_done(&object->Lock); +} + unsigned int vm_object_change_wimg_mode_count = 0; @@ -8412,10 +8188,10 @@ vm_object_change_wimg_mode(vm_object_t object, unsigned int wimg_mode) vm_object_paging_wait(object, THREAD_UNINT); - queue_iterate(&object->memq, p, vm_page_t, listq) { + vm_page_queue_iterate(&object->memq, p, vm_page_t, listq) { if (!p->fictitious) - pmap_set_cache_attributes(p->phys_page, wimg_mode); + pmap_set_cache_attributes(VM_PAGE_GET_PHYS_PAGE(p), wimg_mode); } if (wimg_mode == VM_WIMG_USE_DEFAULT) object->set_cache_attr = FALSE; @@ -8429,140 +8205,6 @@ vm_object_change_wimg_mode(vm_object_t object, unsigned int wimg_mode) #if CONFIG_FREEZE -kern_return_t vm_object_pack( - unsigned int *purgeable_count, - unsigned int *wired_count, - unsigned int *clean_count, - unsigned int *dirty_count, - unsigned int dirty_budget, - boolean_t *shared, - vm_object_t src_object, - struct default_freezer_handle *df_handle) -{ - kern_return_t kr = KERN_SUCCESS; - - vm_object_lock(src_object); - - *purgeable_count = *wired_count = *clean_count = *dirty_count = 0; - *shared = FALSE; - - if (!src_object->alive || src_object->terminating){ - kr = KERN_FAILURE; - goto done; - } - - if (src_object->purgable == VM_PURGABLE_VOLATILE) { - *purgeable_count = src_object->resident_page_count; - - /* If the default freezer handle is null, we're just walking the pages to discover how many can be hibernated */ - if (df_handle != NULL) { - purgeable_q_t queue; - /* object should be on a queue */ - assert(src_object->objq.next != NULL && - src_object->objq.prev != NULL); - - queue = vm_purgeable_object_remove(src_object); - assert(queue); - if (src_object->purgeable_when_ripe) { - vm_page_lock_queues(); - vm_purgeable_token_delete_first(queue); - vm_page_unlock_queues(); - } - - 
vm_object_purge(src_object, 0); - assert(src_object->purgable == VM_PURGABLE_EMPTY); - - /* - * This object was "volatile" so its pages must have - * already been accounted as "volatile": no change - * in accounting now that it's "empty". - */ - } - goto done; - } - - if (src_object->ref_count == 1) { - vm_object_pack_pages(wired_count, clean_count, dirty_count, dirty_budget, src_object, df_handle); - } else { - if (src_object->internal) { - *shared = TRUE; - } - } -done: - vm_object_unlock(src_object); - - return kr; -} - - -void -vm_object_pack_pages( - unsigned int *wired_count, - unsigned int *clean_count, - unsigned int *dirty_count, - unsigned int dirty_budget, - vm_object_t src_object, - struct default_freezer_handle *df_handle) -{ - vm_page_t p, next; - - next = (vm_page_t)queue_first(&src_object->memq); - - while (!queue_end(&src_object->memq, (queue_entry_t)next)) { - p = next; - next = (vm_page_t)queue_next(&next->listq); - - /* Finish up if we've hit our pageout limit */ - if (dirty_budget && (dirty_budget == *dirty_count)) { - break; - } - assert(!p->laundry); - - if (p->fictitious || p->busy ) - continue; - - if (p->absent || p->unusual || p->error) - continue; - - if (VM_PAGE_WIRED(p)) { - (*wired_count)++; - continue; - } - - if (df_handle == NULL) { - if (p->dirty || pmap_is_modified(p->phys_page)) { - (*dirty_count)++; - } else { - (*clean_count)++; - } - continue; - } - - if (p->cleaning) { - p->pageout = TRUE; - continue; - } - - if (p->pmapped == TRUE) { - int refmod_state; - refmod_state = pmap_disconnect(p->phys_page); - if (refmod_state & VM_MEM_MODIFIED) { - SET_PAGE_DIRTY(p, FALSE); - } - } - - if (p->dirty) { - default_freezer_pack_page(p, df_handle); - (*dirty_count)++; - } - else { - VM_PAGE_FREE(p); - (*clean_count)++; - } - } -} - - /* * This routine does the "relocation" of previously * compressed pages belonging to this object that are @@ -8619,6 +8261,7 @@ vm_object_compressed_freezer_pageout( int obj_resident_page_count_snapshot = 
0; assert(object != VM_OBJECT_NULL); + assert(object->internal); vm_object_lock(object); @@ -8638,7 +8281,7 @@ vm_object_compressed_freezer_pageout( } } - if (DEFAULT_FREEZER_COMPRESSED_PAGER_IS_SWAPBACKED) { + if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) { vm_object_offset_t curr_offset = 0; /* @@ -8679,24 +8322,22 @@ vm_object_compressed_freezer_pageout( vm_object_activity_begin(object); - while ((obj_resident_page_count_snapshot--) && !queue_empty(&object->memq)) { + while ((obj_resident_page_count_snapshot--) && !vm_page_queue_empty(&object->memq)) { - p = (vm_page_t)queue_first(&object->memq); + p = (vm_page_t)vm_page_queue_first(&object->memq); KERNEL_DEBUG(0xe0430004 | DBG_FUNC_START, object, local_freed, 0, 0, 0); vm_page_lockspin_queues(); if (p->cleaning || p->fictitious || p->busy || p->absent || p->unusual || p->error || VM_PAGE_WIRED(p)) { - if (p->cleaning) - p->pageout = TRUE; vm_page_unlock_queues(); KERNEL_DEBUG(0xe0430004 | DBG_FUNC_END, object, local_freed, 1, 0, 0); - queue_remove(&object->memq, p, vm_page_t, listq); - queue_enter(&object->memq, p, vm_page_t, listq); + vm_page_queue_remove(&object->memq, p, vm_page_t, listq); + vm_page_queue_enter(&object->memq, p, vm_page_t, listq); continue; } @@ -8710,7 +8351,7 @@ vm_object_compressed_freezer_pageout( pmap_flags = PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED; } - refmod_state = pmap_disconnect_options(p->phys_page, pmap_flags, NULL); + refmod_state = pmap_disconnect_options(VM_PAGE_GET_PHYS_PAGE(p), pmap_flags, NULL); if (refmod_state & VM_MEM_MODIFIED) { SET_PAGE_DIRTY(p, FALSE); } @@ -8727,12 +8368,11 @@ vm_object_compressed_freezer_pageout( continue; } - if (p->laundry) { - p->pageout = FALSE; + if (p->laundry) vm_pageout_steal_laundry(p, TRUE); - } - vm_page_queues_remove(p); + vm_page_queues_remove(p, TRUE); + vm_page_unlock_queues(); @@ -8742,8 +8382,8 @@ vm_object_compressed_freezer_pageout( * Make the move here while we have the object lock held. 
*/ - queue_remove(&object->memq, p, vm_page_t, listq); - queue_enter(&object->memq, p, vm_page_t, listq); + vm_page_queue_remove(&object->memq, p, vm_page_t, listq); + vm_page_queue_enter(&object->memq, p, vm_page_t, listq); /* * Grab an activity_in_progress here for vm_pageout_compress_page() to consume. @@ -8764,7 +8404,7 @@ vm_object_compressed_freezer_pageout( /* * page has already been un-tabled from the object via 'vm_page_remove' */ - p->pageq.next = (queue_entry_t)local_freeq; + p->snext = local_freeq; local_freeq = p; local_freed++; @@ -8806,39 +8446,6 @@ vm_object_compressed_freezer_pageout( } } -kern_return_t -vm_object_pagein( - vm_object_t object) -{ - memory_object_t pager; - kern_return_t kr; - - vm_object_lock(object); - - pager = object->pager; - - if (!object->pager_ready || pager == MEMORY_OBJECT_NULL) { - vm_object_unlock(object); - return KERN_FAILURE; - } - - vm_object_paging_wait(object, THREAD_UNINT); - vm_object_paging_begin(object); - - object->blocked_access = TRUE; - vm_object_unlock(object); - - kr = memory_object_data_reclaim(pager, TRUE); - - vm_object_lock(object); - - object->blocked_access = FALSE; - vm_object_paging_end(object); - - vm_object_unlock(object); - - return kr; -} #endif /* CONFIG_FREEZE */ @@ -8850,10 +8457,12 @@ vm_object_pageout( struct vm_pageout_queue *iq; boolean_t need_unlock = TRUE; + if (!VM_CONFIG_COMPRESSOR_IS_PRESENT) + return; + iq = &vm_pageout_queue_internal; assert(object != VM_OBJECT_NULL ); - assert(!DEFAULT_PAGER_IS_ACTIVE && !DEFAULT_FREEZER_IS_ACTIVE); vm_object_lock(object); @@ -8881,17 +8490,18 @@ vm_object_pageout( } ReScan: - next = (vm_page_t)queue_first(&object->memq); + next = (vm_page_t)vm_page_queue_first(&object->memq); - while (!queue_end(&object->memq, (queue_entry_t)next)) { + while (!vm_page_queue_end(&object->memq, (vm_page_queue_entry_t)next)) { p = next; - next = (vm_page_t)queue_next(&next->listq); + next = (vm_page_t)vm_page_queue_next(&next->listq); - if (!(p->active || 
p->inactive || p->speculative) || + assert(p->vm_page_q_state != VM_PAGE_ON_FREE_Q); + + if ((p->vm_page_q_state == VM_PAGE_ON_THROTTLED_Q) || p->encrypted_cleaning || p->cleaning || p->laundry || - p->pageout || p->busy || p->absent || p->error || @@ -8940,25 +8550,21 @@ vm_object_pageout( int refmod_state; int pmap_options; - pmap_options = 0; - if (COMPRESSED_PAGER_IS_ACTIVE || - DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) { + /* + * Tell pmap the page should be accounted + * for as "compressed" if it's been modified. + */ + pmap_options = + PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED; + if (p->dirty || p->precious) { /* - * Tell pmap the page should be accounted - * for as "compressed" if it's been modified. + * We already know it's been modified, + * so tell pmap to account for it + * as "compressed". */ - pmap_options = - PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED; - if (p->dirty || p->precious) { - /* - * We already know it's been modified, - * so tell pmap to account for it - * as "compressed". - */ - pmap_options = PMAP_OPTIONS_COMPRESSOR; - } + pmap_options = PMAP_OPTIONS_COMPRESSOR; } - refmod_state = pmap_disconnect_options(p->phys_page, + refmod_state = pmap_disconnect_options(VM_PAGE_GET_PHYS_PAGE(p), pmap_options, NULL); if (refmod_state & VM_MEM_MODIFIED) { @@ -8972,8 +8578,9 @@ vm_object_pageout( continue; } - vm_page_queues_remove(p); - if (vm_pageout_cluster(p, TRUE, FALSE, TRUE)) + vm_page_queues_remove(p, TRUE); + + if (vm_pageout_cluster(p, FALSE, TRUE)) need_unlock = FALSE; if (need_unlock == TRUE) @@ -9155,7 +8762,7 @@ vm_page_handle_prio_inversion(vm_object_t o, vm_page_t m) page and was issued as a low prio I/O. 
*/ for(i=0; i < num_pages; i++) { - if(UPL_PAGE_PRESENT(pl,i) && m->phys_page == pl[i].phys_addr) { + if(UPL_PAGE_PRESENT(pl,i) && VM_PAGE_GET_PHYS_PAGE(m) == pl[i].phys_addr) { if ((upl->flags & UPL_DECMP_REQ) && upl->decmp_io_upl) { KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_PAGE_EXPEDITE)) | DBG_FUNC_NONE, upl->upl_creator, m, upl, upl->upl_priority, 0); vm_decmp_upl_reprioritize(upl, cur_tier); diff --git a/osfmk/vm/vm_object.h b/osfmk/vm/vm_object.h index 94537ded9..daef84711 100644 --- a/osfmk/vm/vm_object.h +++ b/osfmk/vm/vm_object.h @@ -89,6 +89,7 @@ #include #include +#include #if VM_OBJECT_TRACKING #include @@ -96,6 +97,7 @@ extern void vm_object_tracking_init(void); extern boolean_t vm_object_tracking_inited; extern btlog_t *vm_object_tracking_btlog; +#define VM_OBJECT_TRACKING_NUM_RECORDS 50000 #define VM_OBJECT_TRACKING_BTDEPTH 7 #define VM_OBJECT_TRACKING_OP_CREATED 1 #define VM_OBJECT_TRACKING_OP_MODIFIED 2 @@ -139,9 +141,25 @@ struct vm_object_fault_info { #define vo_slide_info vo_un2.vou_slide_info struct vm_object { - queue_head_t memq; /* Resident memory */ + /* + * on 64 bit systems we pack the pointers hung off the memq. + * those pointers have to be able to point back to the memq. + * the packed pointers are required to be on a 64 byte boundary + * which means 2 things for the vm_object... (1) the memq + * struct has to be the first element of the structure so that + * we can control it's alignment... (2) the vm_object must be + * aligned on a 64 byte boundary... for static vm_object's + * this is accomplished via the 'aligned' attribute... for + * vm_object's in the zone pool, this is accomplished by + * rounding the size of the vm_object element to the nearest + * 64 byte size before creating the zone. 
+ */ + vm_page_queue_head_t memq; /* Resident memory - must be first */ lck_rw_t Lock; /* Synchronization */ +#if DEVELOPMENT || DEBUG + thread_t Lock_owner; +#endif union { vm_object_size_t vou_size; /* Object size (only valid if internal) */ int vou_cache_pages_to_scan; /* pages yet to be visited in an @@ -151,9 +169,6 @@ struct vm_object { struct vm_page *memq_hint; int ref_count; /* Number of references */ -#if TASK_SWAPPER - int res_count; /* Residency references (swap)*/ -#endif /* TASK_SWAPPER */ unsigned int resident_page_count; /* number of resident pages */ unsigned int wired_page_count; /* number of wired pages */ @@ -314,8 +329,6 @@ struct vm_object { * primary caching. (for * I/O) */ - - queue_chain_t cached_list; /* Attachment point for the * list of objects cached as a @@ -335,10 +348,6 @@ struct vm_object { uint32_t pages_created; uint32_t pages_used; -#if MACH_PAGEMAP - vm_external_map_t existence_map; /* bitmap of pages written to - * backing storage */ -#endif /* MACH_PAGEMAP */ vm_offset_t cow_hint; /* last page present in */ /* shadow but not in object */ #if MACH_ASSERT @@ -366,7 +375,13 @@ struct vm_object { purgeable_queue_type:2, purgeable_queue_group:3, io_tracking:1, - __object2_unused_bits:7; /* for expansion */ +#if CONFIG_SECLUDED_MEMORY + eligible_for_secluded:1, + can_grab_secluded:1, +#else /* CONFIG_SECLUDED_MEMORY */ + __object3_unused_bits:2, +#endif /* CONFIG_SECLUDED_MEMORY */ + __object2_unused_bits:5; /* for expansion */ uint8_t scan_collisions; vm_tag_t wire_tag; @@ -461,6 +476,7 @@ extern lck_attr_t vm_map_lck_attr; { \ lck_spin_lock(&vm_objects_wired_lock); \ assert(!(object)->objq.next); \ + assert(!(object)->objq.prev); \ queue_enter(&vm_objects_wired, (object), vm_object_t, objq); \ lck_spin_unlock(&vm_objects_wired_lock); \ } \ @@ -478,6 +494,64 @@ extern lck_attr_t vm_map_lck_attr; MACRO_END +#define OBJECT_LOCK_SHARED 0 +#define OBJECT_LOCK_EXCLUSIVE 1 + +extern lck_grp_t vm_object_lck_grp; +extern lck_grp_attr_t 
vm_object_lck_grp_attr; +extern lck_attr_t vm_object_lck_attr; +extern lck_attr_t kernel_object_lck_attr; +extern lck_attr_t compressor_object_lck_attr; + +extern vm_object_t vm_pageout_scan_wants_object; + +extern void vm_object_lock(vm_object_t); +extern boolean_t vm_object_lock_try(vm_object_t); +extern boolean_t _vm_object_lock_try(vm_object_t); +extern boolean_t vm_object_lock_avoid(vm_object_t); +extern void vm_object_lock_shared(vm_object_t); +extern boolean_t vm_object_lock_try_shared(vm_object_t); +extern void vm_object_unlock(vm_object_t); +extern boolean_t vm_object_lock_upgrade(vm_object_t); + +/* + * Object locking macros + */ + +#define vm_object_lock_init(object) \ + lck_rw_init(&(object)->Lock, &vm_object_lck_grp, \ + (((object) == kernel_object || \ + (object) == vm_submap_object) ? \ + &kernel_object_lck_attr : \ + (((object) == compressor_object) ? \ + &compressor_object_lck_attr : \ + &vm_object_lck_attr))) +#define vm_object_lock_destroy(object) lck_rw_destroy(&(object)->Lock, &vm_object_lck_grp) + +#define vm_object_lock_try_scan(object) _vm_object_lock_try(object) + +/* + * CAUTION: the following vm_object_lock_assert_held*() macros merely + * check if anyone is holding the lock, but the holder may not necessarily + * be the caller... 
+ */ +#if MACH_ASSERT || DEBUG +#define vm_object_lock_assert_held(object) \ + lck_rw_assert(&(object)->Lock, LCK_RW_ASSERT_HELD) +#define vm_object_lock_assert_shared(object) \ + lck_rw_assert(&(object)->Lock, LCK_RW_ASSERT_SHARED) +#define vm_object_lock_assert_exclusive(object) \ + lck_rw_assert(&(object)->Lock, LCK_RW_ASSERT_EXCLUSIVE) +#define vm_object_lock_assert_notheld(object) \ + lck_rw_assert(&(object)->Lock, LCK_RW_ASSERT_NOTHELD) +#else /* MACH_ASSERT || DEBUG */ +#define vm_object_lock_assert_held(object) +#define vm_object_lock_assert_shared(object) +#define vm_object_lock_assert_exclusive(object) +#define vm_object_lock_assert_notheld(object) +#endif /* MACH_ASSERT || DEBUG */ + + /* * Declare procedures that operate on VM objects. */ @@ -774,27 +848,6 @@ __private_extern__ void vm_object_reap_pages( #define REAP_DATA_FLUSH 3 #if CONFIG_FREEZE -struct default_freezer_handle; - -__private_extern__ kern_return_t -vm_object_pack( - unsigned int *purgeable_count, - unsigned int *wired_count, - unsigned int *clean_count, - unsigned int *dirty_count, - unsigned int dirty_budget, - boolean_t *shared, - vm_object_t src_object, - struct default_freezer_handle *df_handle); - -__private_extern__ void -vm_object_pack_pages( - unsigned int *wired_count, - unsigned int *clean_count, - unsigned int *dirty_count, - unsigned int dirty_budget, - vm_object_t src_object, - struct default_freezer_handle *df_handle); __private_extern__ void vm_object_compressed_freezer_pageout( @@ -804,9 +857,6 @@ __private_extern__ void vm_object_compressed_freezer_done( void); -__private_extern__ kern_return_t -vm_object_pagein( - vm_object_t object); #endif /* CONFIG_FREEZE */ __private_extern__ void @@ -841,37 +891,114 @@ extern void vm_io_reprioritize_init(void); #define VM_OBJECT_EVENT_UNBLOCKED 8 #define VM_OBJECT_EVENT_PAGING_ONLY_IN_PROGRESS 9 -#define vm_object_assert_wait(object, event, interruptible) \ - (((object)->all_wanted |= 1 << (event)), \ - 
assert_wait((event_t)((vm_offset_t)(object)+(event)),(interruptible))) +#define VM_OBJECT_EVENT_MAX 10 /* 11 bits in "all_wanted", so 0->10 */ -#define vm_object_wait(object, event, interruptible) \ - (vm_object_assert_wait((object),(event),(interruptible)), \ - vm_object_unlock(object), \ - thread_block(THREAD_CONTINUE_NULL)) \ - -#define thread_sleep_vm_object(object, event, interruptible) \ - lck_rw_sleep(&(object)->Lock, LCK_SLEEP_PROMOTED_PRI, (event_t)(event), (interruptible)) +static __inline__ wait_result_t +vm_object_assert_wait( + vm_object_t object, + int event, + wait_interrupt_t interruptible) +{ + wait_result_t wr; + + vm_object_lock_assert_exclusive(object); + assert(event >= 0 && event <= VM_OBJECT_EVENT_MAX); + + object->all_wanted |= 1 << event; + wr = assert_wait((event_t)((vm_offset_t)object + event), + interruptible); + return wr; +} + +static __inline__ wait_result_t +vm_object_wait( + vm_object_t object, + int event, + wait_interrupt_t interruptible) +{ + wait_result_t wr; + + vm_object_assert_wait(object, event, interruptible); + vm_object_unlock(object); + wr = thread_block(THREAD_CONTINUE_NULL); + return wr; +} + +static __inline__ wait_result_t +thread_sleep_vm_object( + vm_object_t object, + event_t event, + wait_interrupt_t interruptible) +{ + wait_result_t wr; + +#if DEVELOPMENT || DEBUG + if (object->Lock_owner != current_thread()) + panic("thread_sleep_vm_object: not owner - %p\n", object); + object->Lock_owner = 0; /* owner is cleared across the sleep and re-recorded once lck_rw_sleep() returns */ +#endif + wr = lck_rw_sleep(&object->Lock, + LCK_SLEEP_PROMOTED_PRI, + event, + interruptible); +#if DEVELOPMENT || DEBUG + object->Lock_owner = current_thread(); +#endif + return wr; +} -#define vm_object_sleep(object, event, interruptible) \ - (((object)->all_wanted |= 1 << (event)), \ - thread_sleep_vm_object((object), \ - ((vm_offset_t)(object)+(event)), (interruptible))) +static __inline__ wait_result_t +vm_object_sleep( + vm_object_t object, + int event, + wait_interrupt_t interruptible) +{ + wait_result_t wr; + +
vm_object_lock_assert_exclusive(object); + assert(event >= 0 && event <= VM_OBJECT_EVENT_MAX); + + object->all_wanted |= 1 << event; + wr = thread_sleep_vm_object(object, + (event_t)((vm_offset_t)object + event), + interruptible); + return wr; +} + +static __inline__ void +vm_object_wakeup( + vm_object_t object, + int event) +{ + vm_object_lock_assert_exclusive(object); + assert(event >= 0 && event <= VM_OBJECT_EVENT_MAX); + + if (object->all_wanted & (1 << event)) + thread_wakeup((event_t)((vm_offset_t)object + event)); + object->all_wanted &= ~(1 << event); +} + +static __inline__ void +vm_object_set_wanted( + vm_object_t object, + int event) +{ + vm_object_lock_assert_exclusive(object); + assert(event >= 0 && event <= VM_OBJECT_EVENT_MAX); -#define vm_object_wakeup(object, event) \ - MACRO_BEGIN \ - if ((object)->all_wanted & (1 << (event))) \ - thread_wakeup((event_t)((vm_offset_t)(object) + (event))); \ - (object)->all_wanted &= ~(1 << (event)); \ - MACRO_END + object->all_wanted |= (1 << event); +} -#define vm_object_set_wanted(object, event) \ - MACRO_BEGIN \ - ((object)->all_wanted |= (1 << (event))); \ - MACRO_END +static __inline__ int +vm_object_wanted( + vm_object_t object, + int event) +{ + vm_object_lock_assert_held(object); + assert(event >= 0 && event <= VM_OBJECT_EVENT_MAX); -#define vm_object_wanted(object, event) \ - ((object)->all_wanted & (1 << (event))) + return object->all_wanted & (1 << event); +} /* * Routines implemented as macros @@ -1005,63 +1132,6 @@ extern void vm_io_reprioritize_init(void); -#define OBJECT_LOCK_SHARED 0 -#define OBJECT_LOCK_EXCLUSIVE 1 - -extern lck_grp_t vm_object_lck_grp; -extern lck_grp_attr_t vm_object_lck_grp_attr; -extern lck_attr_t vm_object_lck_attr; -extern lck_attr_t kernel_object_lck_attr; -extern lck_attr_t compressor_object_lck_attr; - -extern vm_object_t vm_pageout_scan_wants_object; - -extern void vm_object_lock(vm_object_t); -extern boolean_t vm_object_lock_try(vm_object_t); -extern boolean_t 
_vm_object_lock_try(vm_object_t); -extern boolean_t vm_object_lock_avoid(vm_object_t); -extern void vm_object_lock_shared(vm_object_t); -extern boolean_t vm_object_lock_try_shared(vm_object_t); - -/* - * Object locking macros - */ - -#define vm_object_lock_init(object) \ - lck_rw_init(&(object)->Lock, &vm_object_lck_grp, \ - (((object) == kernel_object || \ - (object) == vm_submap_object) ? \ - &kernel_object_lck_attr : \ - (((object) == compressor_object) ? \ - &compressor_object_lck_attr : \ - &vm_object_lck_attr))) -#define vm_object_lock_destroy(object) lck_rw_destroy(&(object)->Lock, &vm_object_lck_grp) - -#define vm_object_unlock(object) lck_rw_done(&(object)->Lock) -#define vm_object_lock_upgrade(object) lck_rw_lock_shared_to_exclusive(&(object)->Lock) -#define vm_object_lock_try_scan(object) _vm_object_lock_try(object) - -/* - * CAUTION: the following vm_object_lock_assert_held*() macros merely - * check if anyone is holding the lock, but the holder may not necessarily - * be the caller... 
- */ -#if MACH_ASSERT || DEBUG -#define vm_object_lock_assert_held(object) \ - lck_rw_assert(&(object)->Lock, LCK_RW_ASSERT_HELD) -#define vm_object_lock_assert_shared(object) \ - lck_rw_assert(&(object)->Lock, LCK_RW_ASSERT_SHARED) -#define vm_object_lock_assert_exclusive(object) \ - lck_rw_assert(&(object)->Lock, LCK_RW_ASSERT_EXCLUSIVE) -#define vm_object_lock_assert_notheld(object) \ - lck_rw_assert(&(object)->Lock, LCK_RW_ASSERT_NOTHELD) -#else /* MACH_ASSERT || DEBUG */ -#define vm_object_lock_assert_held(object) -#define vm_object_lock_assert_shared(object) -#define vm_object_lock_assert_exclusive(object) -#define vm_object_lock_assert_notheld(object) -#endif /* MACH_ASSERT || DEBUG */ - #define vm_object_round_page(x) (((vm_object_offset_t)(x) + PAGE_MASK) & ~((signed)PAGE_MASK)) #define vm_object_trunc_page(x) ((vm_object_offset_t)(x) & ~((signed)PAGE_MASK)) diff --git a/osfmk/vm/vm_page.h b/osfmk/vm/vm_page.h index ff8b1e0db..b34e386bb 100644 --- a/osfmk/vm/vm_page.h +++ b/osfmk/vm/vm_page.h @@ -68,69 +68,58 @@ #include #include - #include #include #include -#include -#include -#include -#include -#include +#if defined(__LP64__) -/* - * VM_PAGE_MIN_SPECULATIVE_AGE_Q through VM_PAGE_MAX_SPECULATIVE_AGE_Q - * represents a set of aging bins that are 'protected'... - * - * VM_PAGE_SPECULATIVE_AGED_Q is a list of the speculative pages that have - * not yet been 'claimed' but have been aged out of the protective bins - * this occurs in vm_page_speculate when it advances to the next bin - * and discovers that it is still occupied... at that point, all of the - * pages in that bin are moved to the VM_PAGE_SPECULATIVE_AGED_Q. the pages - * in that bin are all guaranteed to have reached at least the maximum age - * we allow for a protected page... they can be older if there is no - * memory pressure to pull them from the bin, or there are no new speculative pages - * being generated to push them out. 
- * this list is the one that vm_pageout_scan will prefer when looking - * for pages to move to the underweight free list - * - * VM_PAGE_MAX_SPECULATIVE_AGE_Q * VM_PAGE_SPECULATIVE_Q_AGE_MS - * defines the amount of time a speculative page is normally - * allowed to live in the 'protected' state (i.e. not available - * to be stolen if vm_pageout_scan is running and looking for - * pages)... however, if the total number of speculative pages - * in the protected state exceeds our limit (defined in vm_pageout.c) - * and there are none available in VM_PAGE_SPECULATIVE_AGED_Q, then - * vm_pageout_scan is allowed to steal pages from the protected - * bucket even if they are underage. - * - * vm_pageout_scan is also allowed to pull pages from a protected - * bin if the bin has reached the "age of consent" we've set +/* + * in order to make the size of a vm_page_t 64 bytes (cache line size for both arm64 and x86_64) + * we'll keep the next_m pointer packed... as long as the kernel virtual space where we allocate + * vm_page_t's from doesn't span more than 256 Gbytes, we're safe.
There are live tests in the + * vm_page_t array allocation and the zone init code to determine if we can safely pack and unpack + * pointers from the 2 ends of these spaces */ -#define VM_PAGE_MAX_SPECULATIVE_AGE_Q 10 -#define VM_PAGE_MIN_SPECULATIVE_AGE_Q 1 -#define VM_PAGE_SPECULATIVE_AGED_Q 0 - -#define VM_PAGE_SPECULATIVE_Q_AGE_MS 500 +typedef uint32_t vm_page_packed_t; -struct vm_speculative_age_q { - /* - * memory queue for speculative pages via clustered pageins - */ - queue_head_t age_q; - mach_timespec_t age_ts; +struct vm_page_packed_queue_entry { + vm_page_packed_t next; /* next element */ + vm_page_packed_t prev; /* previous element */ }; +typedef struct vm_page_packed_queue_entry *vm_page_queue_t; +typedef struct vm_page_packed_queue_entry vm_page_queue_head_t; +typedef struct vm_page_packed_queue_entry vm_page_queue_chain_t; +typedef struct vm_page_packed_queue_entry *vm_page_queue_entry_t; +typedef vm_page_packed_t vm_page_object_t; -extern -struct vm_speculative_age_q vm_page_queue_speculative[]; +#else + +/* + * we can't do the packing trick on 32 bit architectures, so + * just turn the macros into noops. + */ +typedef struct vm_page *vm_page_packed_t; + +#define vm_page_queue_t queue_t +#define vm_page_queue_head_t queue_head_t +#define vm_page_queue_chain_t queue_chain_t +#define vm_page_queue_entry_t queue_entry_t + +#define vm_page_object_t vm_object_t +#endif + + +#include +#include +#include + +#include +#include -extern int speculative_steal_index; -extern int speculative_age_index; -extern unsigned int vm_page_speculative_q_age_ms; #define VM_PAGE_COMPRESSOR_COUNT (compressor_object->resident_page_count) @@ -161,45 +150,58 @@ extern unsigned int vm_page_speculative_q_age_ms; * change that field; holding either lock is sufficient to read.] 
*/ +#define VM_PAGE_NULL ((vm_page_t) 0) -#if defined(__LP64__) +extern char vm_page_inactive_states[]; +extern char vm_page_pageable_states[]; +extern char vm_page_non_speculative_pageable_states[]; +extern char vm_page_active_or_inactive_states[]; -/* - * in order to make the size of a vm_page_t 64 bytes (cache line size for both arm64 and x86_64) - * we'll keep the next_m pointer packed... as long as the kernel virtual space where we allocate - * vm_page_t's from doesn't span more then 256 Gbytes, we're safe. There are live tests in the - * vm_page_t array allocation and the zone init code to determine if we can safely pack and unpack - * pointers from the 2 ends of these spaces - */ -typedef uint32_t vm_page_packed_t; -#define VM_PAGE_PACK_PTR(m) (!(m) ? (vm_page_packed_t)0 : ((vm_page_packed_t)((uintptr_t)(((uintptr_t)(m) - (uintptr_t) VM_MIN_KERNEL_AND_KEXT_ADDRESS)) >> 6))) -#define VM_PAGE_UNPACK_PTR(p) (!(p) ? VM_PAGE_NULL : ((vm_page_t)((((uintptr_t)(p)) << 6) + (uintptr_t) VM_MIN_KERNEL_AND_KEXT_ADDRESS))) +#define VM_PAGE_INACTIVE(m) (vm_page_inactive_states[m->vm_page_q_state]) +#define VM_PAGE_PAGEABLE(m) (vm_page_pageable_states[m->vm_page_q_state]) +#define VM_PAGE_NON_SPECULATIVE_PAGEABLE(m) (vm_page_non_speculative_pageable_states[m->vm_page_q_state]) +#define VM_PAGE_ACTIVE_OR_INACTIVE(m) (vm_page_active_or_inactive_states[m->vm_page_q_state]) -#else -/* - * we can't do the packing trick on 32 bit architectures, so - * just turn the macros into noops. - */ -typedef struct vm_page *vm_page_packed_t; +#define VM_PAGE_NOT_ON_Q 0 /* page is not present on any queue, nor is it wired... 
mainly a transient state */ +#define VM_PAGE_IS_WIRED 1 /* page is currently wired */ +#define VM_PAGE_USED_BY_COMPRESSOR 2 /* page is in use by the compressor to hold compressed data */ +#define VM_PAGE_ON_FREE_Q 3 /* page is on the main free queue */ +#define VM_PAGE_ON_FREE_LOCAL_Q 4 /* page is on one of the per-CPU free queues */ +#define VM_PAGE_ON_FREE_LOPAGE_Q 5 /* page is on the lopage pool free list */ +#define VM_PAGE_ON_THROTTLED_Q 6 /* page is on the throttled queue... we stash anonymous pages here when not paging */ +#define VM_PAGE_ON_PAGEOUT_Q 7 /* page is on one of the pageout queues (internal/external) awaiting processing */ +#define VM_PAGE_ON_SPECULATIVE_Q 8 /* page is on one of the speculative queues */ +#define VM_PAGE_ON_ACTIVE_LOCAL_Q 9 /* page has recently been created and is being held in one of the per-CPU local queues */ +#define VM_PAGE_ON_ACTIVE_Q 10 /* page is on the global active queue */ +#define VM_PAGE_ON_INACTIVE_INTERNAL_Q 11 /* page is on the inactive internal queue a.k.a. anonymous queue */ +#define VM_PAGE_ON_INACTIVE_EXTERNAL_Q 12 /* page is on the inactive external queue a.k.a. file backed queue */ +#define VM_PAGE_ON_INACTIVE_CLEANED_Q 13 /* page has been cleaned to a backing file and is ready to be stolen */ +#define VM_PAGE_ON_SECLUDED_Q 14 /* page is on the secluded queue */ +#define VM_PAGE_Q_STATE_LAST_VALID_VALUE 14 /* we currently use 4 bits for the state...
don't let this go beyond 15 */ -#define VM_PAGE_PACK_PTR(m) ((vm_page_packed_t)(m)) -#define VM_PAGE_UNPACK_PTR(p) ((vm_page_t)(p)) +#define VM_PAGE_Q_STATE_ARRAY_SIZE (VM_PAGE_Q_STATE_LAST_VALID_VALUE+1) -#endif +#define pageq pageq_un.vm_page_pageq +#define snext pageq_un.vm_page_snext struct vm_page { - queue_chain_t pageq; /* queue info for FIFO */ - /* queue or free list (P) */ + union { + vm_page_queue_chain_t vm_page_pageq; /* queue info for FIFO queue or free list (P) */ + struct vm_page *vm_page_snext; + } pageq_un; - queue_chain_t listq; /* all pages in same object (O) */ + vm_page_queue_chain_t listq; /* all pages in same object (O) */ - vm_object_offset_t offset; /* offset into that object (O,P) */ - vm_object_t object; /* which object am I in (O&P) */ +#if CONFIG_BACKGROUND_QUEUE + vm_page_queue_chain_t vm_page_backgroundq; /* anonymous pages in the background pool (P) */ +#endif + + vm_object_offset_t offset; /* offset into that object (O,P) */ + vm_page_object_t vm_page_object; /* which object am I in (O&P) */ - vm_page_packed_t next_m; /* VP bucket link (O) */ /* * The following word of flags is protected * by the "page queues" lock. @@ -211,26 +213,30 @@ struct vm_page { */ #define local_id wire_count unsigned int wire_count:16, /* how many wired down maps use me? 
(O&P) */ - /* boolean_t */ active:1, /* page is in active list (P) */ - inactive:1, /* page is in inactive list (P) */ - clean_queue:1, /* page is in pre-cleaned list (P) */ - local:1, /* page is in one of the local queues (P) */ - speculative:1, /* page is in speculative list (P) */ - throttled:1, /* pager is not responding or doesn't exist(P) */ - free:1, /* page is on free list (P) */ - pageout_queue:1,/* page is on queue for pageout (P) */ - laundry:1, /* page is being cleaned now (P)*/ - reference:1, /* page has been used (P) */ + vm_page_q_state:4, /* which q is the page on (P) */ + + vm_page_in_background:1, + vm_page_on_backgroundq:1, + /* boolean_t */ gobbled:1, /* page used internally (P) */ - private:1, /* Page should not be returned to - * the free list (P) */ + laundry:1, /* page is being cleaned now (P)*/ no_cache:1, /* page is not to be cached and should * be reused ahead of other pages (P) */ + private:1, /* Page should not be returned to + * the free list (P) */ + reference:1, /* page has been used (P) */ - __unused_pageq_bits:3; /* 3 bits available here */ + __unused_pageq_bits:5; /* 5 bits available here */ - ppnum_t phys_page; /* Physical address of page, passed - * to pmap_enter (read-only) */ + /* + * MUST keep the 2 32 bit words used as bit fields + * separated since the compiler has a nasty habit + * of using 64 bit loads and stores on them as + * if they were a single 64 bit field... 
since + * they are protected by 2 different locks, this + * is a real problem + */ + vm_page_packed_t next_m; /* VP bucket link (O) */ /* * The following word of flags is protected @@ -255,9 +261,9 @@ struct vm_page { xpmapped:1, /* page has been entered with execute permission (O) or (O-shared AND pmap_page) */ - wpmapped:1, /* page has been entered at some + wpmapped:1, /* page has been entered at some * point into a pmap for write (O) */ - pageout:1, /* page wired & busy for pageout (O) */ + free_when_done:1, /* page is to be freed once cleaning is completed (O) */ absent:1, /* Data has been requested, but is * not yet available (O) */ error:1, /* Data manager was unable to provide @@ -282,11 +288,37 @@ struct vm_page { reusable:1, lopage:1, slid:1, - compressor:1, /* page owned by compressor pool */ written_by_kernel:1, /* page was written by kernel (i.e. decompressed) */ - __unused_object_bits:4; /* 5 bits available here */ + __unused_object_bits:5; /* 5 bits available here */ + + ppnum_t phys_page; /* Physical address of page, passed + * to pmap_enter (read-only) */ }; + +typedef struct vm_page *vm_page_t; +extern vm_page_t vm_pages; +extern vm_page_t vm_page_array_beginning_addr; +extern vm_page_t vm_page_array_ending_addr; + + + + +struct vm_page_with_ppnum { + struct vm_page vm_page_with_ppnum; +}; +typedef struct vm_page_with_ppnum *vm_page_with_ppnum_t; + + +#define VM_PAGE_GET_PHYS_PAGE(page) (page)->phys_page +#define VM_PAGE_SET_PHYS_PAGE(page, ppnum) \ + MACRO_BEGIN \ + (page)->phys_page = ppnum; \ + MACRO_END + + + + #define DEBUG_ENCRYPTED_SWAP 1 #if DEBUG_ENCRYPTED_SWAP #define ASSERT_PAGE_DECRYPTED(page) \ @@ -300,7 +332,424 @@ struct vm_page { #define ASSERT_PAGE_DECRYPTED(page) assert(!(page)->encrypted) #endif /* DEBUG_ENCRYPTED_SWAP */ -typedef struct vm_page *vm_page_t; + + +#if defined(__LP64__) + +#define VM_VPLQ_ALIGNMENT 128 +#define VM_PACKED_POINTER_ALIGNMENT 64 /* must be a power of 2 */ +#define VM_PACKED_POINTER_SHIFT 6 + +#define 
VM_PACKED_FROM_VM_PAGES_ARRAY 0x80000000 + +static inline vm_page_packed_t vm_page_pack_ptr(uintptr_t p) +{ + vm_page_packed_t packed_ptr; + + if (!p) + return ((vm_page_packed_t)0); + + if (p >= (uintptr_t)(vm_page_array_beginning_addr) && p < (uintptr_t)(vm_page_array_ending_addr)) { + packed_ptr = ((vm_page_packed_t)(((vm_page_t)p - vm_page_array_beginning_addr))); + assert(! (packed_ptr & VM_PACKED_FROM_VM_PAGES_ARRAY)); + packed_ptr |= VM_PACKED_FROM_VM_PAGES_ARRAY; + return packed_ptr; + } + + assert((p & (VM_PACKED_POINTER_ALIGNMENT - 1)) == 0); + + packed_ptr = ((vm_page_packed_t)(((uintptr_t)(p - (uintptr_t) VM_MIN_KERNEL_AND_KEXT_ADDRESS)) >> VM_PACKED_POINTER_SHIFT)); + assert(packed_ptr != 0); + assert(! (packed_ptr & VM_PACKED_FROM_VM_PAGES_ARRAY)); + return packed_ptr; +} + + +static inline uintptr_t vm_page_unpack_ptr(uintptr_t p) +{ + if (!p) + return ((uintptr_t)0); + + if (p & VM_PACKED_FROM_VM_PAGES_ARRAY) + return ((uintptr_t)(&vm_pages[(uint32_t)(p & ~VM_PACKED_FROM_VM_PAGES_ARRAY)])); + return (((p << VM_PACKED_POINTER_SHIFT) + (uintptr_t) VM_MIN_KERNEL_AND_KEXT_ADDRESS)); +} + + +#define VM_PAGE_PACK_PTR(p) vm_page_pack_ptr((uintptr_t)(p)) +#define VM_PAGE_UNPACK_PTR(p) vm_page_unpack_ptr((uintptr_t)(p)) + +#define VM_PAGE_OBJECT(p) ((vm_object_t)(VM_PAGE_UNPACK_PTR(p->vm_page_object))) +#define VM_PAGE_PACK_OBJECT(o) ((vm_page_object_t)(VM_PAGE_PACK_PTR(o))) + + +#define VM_PAGE_ZERO_PAGEQ_ENTRY(p) \ +MACRO_BEGIN \ + (p)->snext = 0; \ +MACRO_END + + +#define VM_PAGE_CONVERT_TO_QUEUE_ENTRY(p) VM_PAGE_PACK_PTR(p) + + +static __inline__ void +vm_page_enqueue_tail( + vm_page_queue_t que, + vm_page_queue_entry_t elt) +{ + vm_page_queue_entry_t old_tail; + + old_tail = (vm_page_queue_entry_t)VM_PAGE_UNPACK_PTR(que->prev); + elt->next = VM_PAGE_PACK_PTR(que); + elt->prev = que->prev; + old_tail->next = VM_PAGE_PACK_PTR(elt); + que->prev = VM_PAGE_PACK_PTR(elt); +} + + +static __inline__ void +vm_page_remque( + vm_page_queue_entry_t elt) +{ + 
vm_page_queue_entry_t next_elt, prev_elt; + + next_elt = (vm_page_queue_entry_t)VM_PAGE_UNPACK_PTR(elt->next); + + /* next_elt may equal prev_elt (and the queue head) if elt was the only element */ + prev_elt = (vm_page_queue_entry_t)VM_PAGE_UNPACK_PTR(elt->prev); + + next_elt->prev = VM_PAGE_PACK_PTR(prev_elt); + prev_elt->next = VM_PAGE_PACK_PTR(next_elt); + + elt->next = 0; + elt->prev = 0; +} + + +/* + * Macro: vm_page_queue_init + * Function: + * Initialize the given queue. + * Header: + * void vm_page_queue_init(q) + * vm_page_queue_t q; \* MODIFIED *\ + */ +#define vm_page_queue_init(q) \ +MACRO_BEGIN \ + assert((((uintptr_t)q) & (VM_PACKED_POINTER_ALIGNMENT-1)) == 0); \ + assert((VM_PAGE_UNPACK_PTR(VM_PAGE_PACK_PTR((uintptr_t)q))) == (uintptr_t)q); \ + (q)->next = VM_PAGE_PACK_PTR(q); \ + (q)->prev = VM_PAGE_PACK_PTR(q); \ +MACRO_END + + +/* + * Macro: vm_page_queue_enter + * Function: + * Insert a new element at the tail of the queue. + * Header: + * void vm_page_queue_enter(q, elt, type, field) + * queue_t q; + * elt; + * is what's in our queue + * is the chain field in (*) + * Note: + * This should only be used with Method 2 queue iteration (element chains) + */ +#define vm_page_queue_enter(head, elt, type, field) \ +MACRO_BEGIN \ + vm_page_queue_entry_t __prev; \ + \ + __prev = ((vm_page_queue_entry_t)VM_PAGE_UNPACK_PTR((head)->prev)); \ + if ((head) == __prev) { \ + (head)->next = VM_PAGE_PACK_PTR(elt); \ + } \ + else { \ + ((type)(void *)__prev)->field.next = VM_PAGE_PACK_PTR(elt); \ + } \ + (elt)->field.prev = VM_PAGE_PACK_PTR(__prev); \ + (elt)->field.next = VM_PAGE_PACK_PTR(head); \ + (head)->prev = VM_PAGE_PACK_PTR(elt); \ +MACRO_END + + +/* + * Macro: vm_page_queue_enter_first + * Function: + * Insert a new element at the head of the queue. 
+ * Header: + * void queue_enter_first(q, elt, type, field) + * queue_t q; + * elt; + * is what's in our queue + * is the chain field in (*) + * Note: + * This should only be used with Method 2 queue iteration (element chains) + */ +#define vm_page_queue_enter_first(head, elt, type, field) \ +MACRO_BEGIN \ + vm_page_queue_entry_t __next; \ + \ + __next = ((vm_page_queue_entry_t)VM_PAGE_UNPACK_PTR((head)->next)); \ + if ((head) == __next) { \ + (head)->prev = VM_PAGE_PACK_PTR(elt); \ + } \ + else { \ + ((type)(void *)__next)->field.prev = VM_PAGE_PACK_PTR(elt); \ + } \ + (elt)->field.next = VM_PAGE_PACK_PTR(__next); \ + (elt)->field.prev = VM_PAGE_PACK_PTR(head); \ + (head)->next = VM_PAGE_PACK_PTR(elt); \ +MACRO_END + + +/* + * Macro: vm_page_queue_remove + * Function: + * Remove an arbitrary item from the queue. + * Header: + * void vm_page_queue_remove(q, qe, type, field) + * arguments as in vm_page_queue_enter + * Note: + * This should only be used with Method 2 queue iteration (element chains) + */ +#define vm_page_queue_remove(head, elt, type, field) \ +MACRO_BEGIN \ + vm_page_queue_entry_t __next, __prev; \ + \ + __next = ((vm_page_queue_entry_t)VM_PAGE_UNPACK_PTR((elt)->field.next)); \ + __prev = ((vm_page_queue_entry_t)VM_PAGE_UNPACK_PTR((elt)->field.prev)); \ + \ + if ((head) == __next) \ + (head)->prev = VM_PAGE_PACK_PTR(__prev); \ + else \ + ((type)(void *)__next)->field.prev = VM_PAGE_PACK_PTR(__prev); \ + \ + if ((head) == __prev) \ + (head)->next = VM_PAGE_PACK_PTR(__next); \ + else \ + ((type)(void *)__prev)->field.next = VM_PAGE_PACK_PTR(__next); \ + \ + (elt)->field.next = 0; \ + (elt)->field.prev = 0; \ +MACRO_END + + +/* + * Macro: vm_page_queue_remove_first + * Function: + * Remove and return the entry at the head of + * the queue. 
+ * Header: + * vm_page_queue_remove_first(head, entry, type, field) + * entry is returned by reference + * Note: + * This should only be used with Method 2 queue iteration (element chains) + */ +#define vm_page_queue_remove_first(head, entry, type, field) \ +MACRO_BEGIN \ + vm_page_queue_entry_t __next; \ + \ + (entry) = (type)(void *) VM_PAGE_UNPACK_PTR(((head)->next)); \ + __next = ((vm_page_queue_entry_t)VM_PAGE_UNPACK_PTR((entry)->field.next)); \ + \ + if ((head) == __next) \ + (head)->prev = VM_PAGE_PACK_PTR(head); \ + else \ + ((type)(void *)(__next))->field.prev = VM_PAGE_PACK_PTR(head); \ + (head)->next = VM_PAGE_PACK_PTR(__next); \ + \ + (entry)->field.next = 0; \ + (entry)->field.prev = 0; \ +MACRO_END + + +/* + * Macro: vm_page_queue_end + * Function: + * Tests whether a new entry is really the end of + * the queue. + * Header: + * boolean_t vm_page_queue_end(q, qe) + * vm_page_queue_t q; + * vm_page_queue_entry_t qe; + */ +#define vm_page_queue_end(q, qe) ((q) == (qe)) + + +/* + * Macro: vm_page_queue_empty + * Function: + * Tests whether a queue is empty. + * Header: + * boolean_t vm_page_queue_empty(q) + * vm_page_queue_t q; + */ +#define vm_page_queue_empty(q) vm_page_queue_end((q), ((vm_page_queue_entry_t)vm_page_queue_first(q))) + + + +/* + * Macro: vm_page_queue_first + * Function: + * Returns the first entry in the queue, + * Header: + * uintptr_t vm_page_queue_first(q) + * vm_page_queue_t q; \* IN *\ + */ +#define vm_page_queue_first(q) (VM_PAGE_UNPACK_PTR((q)->next)) + + + +/* + * Macro: vm_page_queue_last + * Function: + * Returns the last entry in the queue. + * Header: + * vm_page_queue_entry_t queue_last(q) + * queue_t q; \* IN *\ + */ +#define vm_page_queue_last(q) (VM_PAGE_UNPACK_PTR((q)->prev)) + + + +/* + * Macro: vm_page_queue_next + * Function: + * Returns the entry after an item in the queue. 
+ * Header: + * uintptr_t vm_page_queue_next(qc) + * vm_page_queue_t qc; + */ +#define vm_page_queue_next(qc) (VM_PAGE_UNPACK_PTR((qc)->next)) + + + +/* + * Macro: vm_page_queue_prev + * Function: + * Returns the entry before an item in the queue. + * Header: + * uintptr_t vm_page_queue_prev(qc) + * vm_page_queue_t qc; + */ +#define vm_page_queue_prev(qc) (VM_PAGE_UNPACK_PTR((qc)->prev)) + + + +/* + * Macro: vm_page_queue_iterate + * Function: + * iterate over each item in the queue. + * Generates a 'for' loop, setting elt to + * each item in turn (by reference). + * Header: + * vm_page_queue_iterate(q, elt, type, field) + * queue_t q; + * elt; + * is what's in our queue + * is the chain field in (*) + * Note: + * This should only be used with Method 2 queue iteration (element chains) + */ +#define vm_page_queue_iterate(head, elt, type, field) \ + for ((elt) = (type)(void *) vm_page_queue_first(head); \ + !vm_page_queue_end((head), (vm_page_queue_entry_t)(elt)); \ + (elt) = (type)(void *) vm_page_queue_next(&(elt)->field)) + +#else + +#define VM_VPLQ_ALIGNMENT 128 +#define VM_PACKED_POINTER_ALIGNMENT 4 +#define VM_PACKED_POINTER_SHIFT 0 + +#define VM_PACKED_FROM_VM_PAGES_ARRAY 0 + +#define VM_PAGE_PACK_PTR(p) (p) +#define VM_PAGE_UNPACK_PTR(p) ((uintptr_t)(p)) + +#define VM_PAGE_OBJECT(p) (vm_object_t)(p->vm_page_object) +#define VM_PAGE_PACK_OBJECT(o) ((vm_page_object_t)(VM_PAGE_PACK_PTR(o))) + + +#define VM_PAGE_ZERO_PAGEQ_ENTRY(p) \ +MACRO_BEGIN \ + (p)->pageq.next = 0; \ + (p)->pageq.prev = 0; \ +MACRO_END + +#define VM_PAGE_CONVERT_TO_QUEUE_ENTRY(p) ((queue_entry_t)(p)) + +#define vm_page_remque remque +#define vm_page_enqueue_tail enqueue_tail +#define vm_page_queue_init queue_init +#define vm_page_queue_enter queue_enter +#define vm_page_queue_enter_first queue_enter_first +#define vm_page_queue_remove queue_remove +#define vm_page_queue_remove_first queue_remove_first +#define vm_page_queue_end queue_end +#define vm_page_queue_empty queue_empty +#define 
vm_page_queue_first queue_first +#define vm_page_queue_last queue_last +#define vm_page_queue_next queue_next +#define vm_page_queue_prev queue_prev +#define vm_page_queue_iterate queue_iterate + +#endif + + + +/* + * VM_PAGE_MIN_SPECULATIVE_AGE_Q through VM_PAGE_MAX_SPECULATIVE_AGE_Q + * represents a set of aging bins that are 'protected'... + * + * VM_PAGE_SPECULATIVE_AGED_Q is a list of the speculative pages that have + * not yet been 'claimed' but have been aged out of the protective bins + * this occurs in vm_page_speculate when it advances to the next bin + * and discovers that it is still occupied... at that point, all of the + * pages in that bin are moved to the VM_PAGE_SPECULATIVE_AGED_Q. the pages + * in that bin are all guaranteed to have reached at least the maximum age + * we allow for a protected page... they can be older if there is no + * memory pressure to pull them from the bin, or there are no new speculative pages + * being generated to push them out. + * this list is the one that vm_pageout_scan will prefer when looking + * for pages to move to the underweight free list + * + * VM_PAGE_MAX_SPECULATIVE_AGE_Q * VM_PAGE_SPECULATIVE_Q_AGE_MS + * defines the amount of time a speculative page is normally + * allowed to live in the 'protected' state (i.e. not available + * to be stolen if vm_pageout_scan is running and looking for + * pages)... however, if the total number of speculative pages + * in the protected state exceeds our limit (defined in vm_pageout.c) + * and there are none available in VM_PAGE_SPECULATIVE_AGED_Q, then + * vm_pageout_scan is allowed to steal pages from the protected + * bucket even if they are underage. 
+ * + * vm_pageout_scan is also allowed to pull pages from a protected + * bin if the bin has reached the "age of consent" we've set + */ +#define VM_PAGE_MAX_SPECULATIVE_AGE_Q 10 +#define VM_PAGE_MIN_SPECULATIVE_AGE_Q 1 +#define VM_PAGE_SPECULATIVE_AGED_Q 0 + +#define VM_PAGE_SPECULATIVE_Q_AGE_MS 500 + +struct vm_speculative_age_q { + /* + * memory queue for speculative pages via clustered pageins + */ + vm_page_queue_head_t age_q; + mach_timespec_t age_ts; +} __attribute__((aligned(VM_PACKED_POINTER_ALIGNMENT))); + + + +extern +struct vm_speculative_age_q vm_page_queue_speculative[]; + +extern int speculative_steal_index; +extern int speculative_age_index; +extern unsigned int vm_page_speculative_q_age_ms; typedef struct vm_locks_array { @@ -311,10 +760,16 @@ typedef struct vm_locks_array { } vm_locks_array_t; -#define VM_PAGE_WIRED(m) ((!(m)->local && (m)->wire_count)) -#define VM_PAGE_NULL ((vm_page_t) 0) -#define NEXT_PAGE(m) ((vm_page_t) (m)->pageq.next) -#define NEXT_PAGE_PTR(m) ((vm_page_t *) &(m)->pageq.next) +#if CONFIG_BACKGROUND_QUEUE +extern void vm_page_assign_background_state(vm_page_t mem); +extern void vm_page_update_background_state(vm_page_t mem); +extern void vm_page_add_to_backgroundq(vm_page_t mem, boolean_t first); +extern void vm_page_remove_from_backgroundq(vm_page_t mem); +#endif + +#define VM_PAGE_WIRED(m) ((m)->vm_page_q_state == VM_PAGE_IS_WIRED) +#define NEXT_PAGE(m) ((m)->snext) +#define NEXT_PAGE_PTR(m) (&(m)->snext) /* * XXX The unusual bit should not be necessary. Most of the bit @@ -324,11 +779,12 @@ typedef struct vm_locks_array { /* * For debugging, this macro can be defined to perform * some useful check on a page structure. + * INTENTIONALLY left as a no-op so that the + * current call-sites can be left intact for future uses. 
*/ #define VM_PAGE_CHECK(mem) \ MACRO_BEGIN \ - VM_PAGE_QUEUES_ASSERT(mem, 1); \ MACRO_END /* Page coloring: @@ -397,10 +853,10 @@ vm_map_size_t vm_global_no_user_wire_amount; #define VPL_LOCK_SPIN 1 struct vpl { + vm_page_queue_head_t vpl_queue; unsigned int vpl_count; unsigned int vpl_internal_count; unsigned int vpl_external_count; - queue_head_t vpl_queue; #ifdef VPL_LOCK_SPIN lck_spin_t vpl_lock; #else @@ -411,7 +867,7 @@ struct vpl { struct vplq { union { - char cache_line_pad[128]; + char cache_line_pad[VM_VPLQ_ALIGNMENT]; struct vpl vpl; } vpl_un; }; @@ -427,25 +883,56 @@ extern vm_locks_array_t vm_page_locks; extern -queue_head_t vm_page_queue_free[MAX_COLORS]; /* memory free queue */ +vm_page_queue_head_t vm_lopage_queue_free; /* low memory free queue */ extern -queue_head_t vm_lopage_queue_free; /* low memory free queue */ +vm_page_queue_head_t vm_page_queue_active; /* active memory queue */ extern -queue_head_t vm_page_queue_active; /* active memory queue */ +vm_page_queue_head_t vm_page_queue_inactive; /* inactive memory queue for normal pages */ +#if CONFIG_SECLUDED_MEMORY extern -queue_head_t vm_page_queue_inactive; /* inactive memory queue for normal pages */ +vm_page_queue_head_t vm_page_queue_secluded; /* reclaimable pages secluded for Camera */ +#endif /* CONFIG_SECLUDED_MEMORY */ extern -queue_head_t vm_page_queue_cleaned; /* clean-queue inactive memory */ +vm_page_queue_head_t vm_page_queue_cleaned; /* clean-queue inactive memory */ extern -queue_head_t vm_page_queue_anonymous; /* inactive memory queue for anonymous pages */ +vm_page_queue_head_t vm_page_queue_anonymous; /* inactive memory queue for anonymous pages */ extern -queue_head_t vm_page_queue_throttled; /* memory queue for throttled pageout pages */ +vm_page_queue_head_t vm_page_queue_throttled; /* memory queue for throttled pageout pages */ extern queue_head_t vm_objects_wired; extern lck_spin_t vm_objects_wired_lock; +#if CONFIG_BACKGROUND_QUEUE + +#define 
VM_PAGE_BACKGROUND_TARGET_MAX 50000 + +#define VM_PAGE_BG_DISABLED 0 +#define VM_PAGE_BG_LEVEL_1 1 +#define VM_PAGE_BG_LEVEL_2 2 +#define VM_PAGE_BG_LEVEL_3 3 + +extern +vm_page_queue_head_t vm_page_queue_background; +extern +uint64_t vm_page_background_promoted_count; +extern +uint32_t vm_page_background_count; +extern +uint32_t vm_page_background_limit; +extern +uint32_t vm_page_background_target; +extern +uint32_t vm_page_background_internal_count; +extern +uint32_t vm_page_background_external_count; +extern +uint32_t vm_page_background_mode; +extern +uint32_t vm_page_background_exclude_external; + +#endif extern vm_offset_t first_phys_addr; /* physical address for first_page */ @@ -455,11 +942,17 @@ vm_offset_t last_phys_addr; /* physical address for last_page */ extern unsigned int vm_page_free_count; /* How many pages are free? (sum of all colors) */ extern -unsigned int vm_page_fictitious_count;/* How many fictitious pages are free? */ -extern unsigned int vm_page_active_count; /* How many pages are active? */ extern unsigned int vm_page_inactive_count; /* How many pages are inactive? */ +#if CONFIG_SECLUDED_MEMORY +extern +unsigned int vm_page_secluded_count; /* How many pages are secluded? */ +extern +unsigned int vm_page_secluded_count_free; +extern +unsigned int vm_page_secluded_count_inuse; +#endif /* CONFIG_SECLUDED_MEMORY */ extern unsigned int vm_page_cleaned_count; /* How many pages are in the clean queue? */ extern @@ -488,10 +981,14 @@ extern uint32_t vm_page_creation_throttle; /* When to throttle new page creation */ extern unsigned int vm_page_inactive_target;/* How many do we want inactive? */ +#if CONFIG_SECLUDED_MEMORY +extern +unsigned int vm_page_secluded_target;/* How many do we want secluded? 
*/ +#endif /* CONFIG_SECLUDED_MEMORY */ extern unsigned int vm_page_anonymous_min; /* When it's ok to pre-clean */ extern -unsigned int vm_page_inactive_min; /* When do wakeup pageout */ +unsigned int vm_page_inactive_min; /* When to wakeup pageout */ extern unsigned int vm_page_free_reserved; /* How many pages reserved to do pageout */ extern @@ -519,6 +1016,10 @@ extern unsigned int vm_page_free_wanted; extern unsigned int vm_page_free_wanted_privileged; /* how many VM privileged threads are waiting for memory */ +#if CONFIG_SECLUDED_MEMORY +extern unsigned int vm_page_free_wanted_secluded; + /* how many threads are waiting for secluded memory */ +#endif /* CONFIG_SECLUDED_MEMORY */ extern ppnum_t vm_page_fictitious_addr; /* (fake) phys_addr of fictitious pages */ @@ -581,11 +1082,16 @@ extern void vm_page_more_fictitious(void); extern int vm_pool_low(void); extern vm_page_t vm_page_grab(void); +extern vm_page_t vm_page_grab_options(int flags); +#if CONFIG_SECLUDED_MEMORY +#define VM_PAGE_GRAB_SECLUDED 0x00000001 +#endif /* CONFIG_SECLUDED_MEMORY */ extern vm_page_t vm_page_grablo(void); extern void vm_page_release( - vm_page_t page); + vm_page_t page, + boolean_t page_queues_locked); extern boolean_t vm_page_wait( int interruptible ); @@ -714,6 +1220,7 @@ extern void vm_page_validate_cs_mapped_chunk( vm_page_t page, const void *kaddr, vm_offset_t chunk_offset, + vm_size_t chunk_size, boolean_t *validated, unsigned *tainted); @@ -740,7 +1247,7 @@ extern void memorystatus_pages_update(unsigned int pages_avail); memorystatus_pages_update( \ vm_page_pageable_external_count + \ vm_page_free_count + \ - (VM_DYNAMIC_PAGING_ENABLED(memory_manager_default) ? 0 : vm_page_purgeable_count) \ + (VM_DYNAMIC_PAGING_ENABLED() ? 
0 : vm_page_purgeable_count) \ ); \ } while(0) @@ -810,6 +1317,7 @@ extern void memorystatus_pages_update(unsigned int pages_avail); #define vm_page_queue_free_lock (vm_page_locks.vm_page_queue_free_lock2) #define vm_page_lock_queues() lck_mtx_lock(&vm_page_queue_lock) +#define vm_page_trylock_queues() lck_mtx_try_lock(&vm_page_queue_lock) #define vm_page_unlock_queues() lck_mtx_unlock(&vm_page_queue_lock) #define vm_page_lockspin_queues() lck_mtx_lock_spin(&vm_page_queue_lock) @@ -826,12 +1334,6 @@ extern void memorystatus_pages_update(unsigned int pages_avail); #define VPL_UNLOCK(vpl) lck_mtx_unlock(vpl) #endif -#if MACH_ASSERT -extern void vm_page_queues_assert(vm_page_t mem, int val); -#define VM_PAGE_QUEUES_ASSERT(mem, val) vm_page_queues_assert((mem), (val)) -#else -#define VM_PAGE_QUEUES_ASSERT(mem, val) -#endif #if DEVELOPMENT || DEBUG #define VM_PAGE_SPECULATIVE_USED_ADD() \ @@ -845,26 +1347,34 @@ extern void vm_page_queues_assert(vm_page_t mem, int val); #define VM_PAGE_CONSUME_CLUSTERED(mem) \ MACRO_BEGIN \ - pmap_lock_phys_page(mem->phys_page); \ + ppnum_t __phys_page; \ + __phys_page = VM_PAGE_GET_PHYS_PAGE(mem); \ + pmap_lock_phys_page(__phys_page); \ if (mem->clustered) { \ - assert(mem->object); \ - mem->object->pages_used++; \ + vm_object_t o; \ + o = VM_PAGE_OBJECT(mem); \ + assert(o); \ + o->pages_used++; \ mem->clustered = FALSE; \ VM_PAGE_SPECULATIVE_USED_ADD(); \ } \ - pmap_unlock_phys_page(mem->phys_page); \ + pmap_unlock_phys_page(__phys_page); \ MACRO_END #define VM_PAGE_COUNT_AS_PAGEIN(mem) \ MACRO_BEGIN \ + { \ + vm_object_t o; \ + o = VM_PAGE_OBJECT(mem); \ DTRACE_VM2(pgin, int, 1, (uint64_t *), NULL); \ current_task()->pageins++; \ - if (mem->object->internal) { \ + if (o->internal) { \ DTRACE_VM2(anonpgin, int, 1, (uint64_t *), NULL); \ } else { \ DTRACE_VM2(fspgin, int, 1, (uint64_t *), NULL); \ } \ + } \ MACRO_END /* adjust for stolen pages accounted elsewhere */ @@ -931,9 +1441,10 @@ extern vm_page_t 
vm_object_page_grab(vm_object_t); extern void vm_page_buckets_check(void); #endif /* VM_PAGE_BUCKETS_CHECK */ -extern void vm_page_queues_remove(vm_page_t mem); +extern void vm_page_queues_remove(vm_page_t mem, boolean_t remove_from_backgroundq); extern void vm_page_remove_internal(vm_page_t page); extern void vm_page_enqueue_inactive(vm_page_t mem, boolean_t first); +extern void vm_page_enqueue_active(vm_page_t mem, boolean_t first); extern void vm_page_check_pageable_safe(vm_page_t page); diff --git a/osfmk/vm/vm_pageout.c b/osfmk/vm/vm_pageout.c index 92816a1b2..1a8aa3558 100644 --- a/osfmk/vm/vm_pageout.c +++ b/osfmk/vm/vm_pageout.c @@ -89,6 +89,7 @@ #include #include #include +#include #include #include @@ -310,7 +311,6 @@ boolean_t VM_PRESSURE_WARNING_TO_NORMAL(void); boolean_t VM_PRESSURE_CRITICAL_TO_WARNING(void); #endif static void vm_pageout_garbage_collect(int); -static void vm_pageout_iothread_continue(struct vm_pageout_queue *); static void vm_pageout_iothread_external(void); static void vm_pageout_iothread_internal(struct cq *cq); static void vm_pageout_adjust_io_throttles(struct vm_pageout_queue *, struct vm_pageout_queue *, boolean_t); @@ -424,6 +424,8 @@ unsigned int vm_pageout_in_place = 0; unsigned int vm_page_steal_pageout_page = 0; +struct vm_config vm_config; + /* * ENCRYPTED SWAP: * counters and statistics... @@ -435,8 +437,8 @@ unsigned long vm_page_encrypt_abort_counter = 0; unsigned long vm_page_encrypt_already_encrypted_counter = 0; boolean_t vm_pages_encrypted = FALSE; /* are there encrypted pages ? 
*/ -struct vm_pageout_queue vm_pageout_queue_internal; -struct vm_pageout_queue vm_pageout_queue_external; +struct vm_pageout_queue vm_pageout_queue_internal __attribute__((aligned(VM_PACKED_POINTER_ALIGNMENT))); +struct vm_pageout_queue vm_pageout_queue_external __attribute__((aligned(VM_PACKED_POINTER_ALIGNMENT))); unsigned int vm_page_speculative_target = 0; @@ -456,6 +458,14 @@ extern boolean_t memorystatus_idle_exit_from_VM(void); #endif extern boolean_t memorystatus_kill_on_VM_page_shortage(boolean_t async); extern void memorystatus_on_pageout_scan_end(void); + +uint32_t vm_pageout_memorystatus_fb_factor_nr = 5; +uint32_t vm_pageout_memorystatus_fb_factor_dr = 2; +#if DEVELOPMENT || DEBUG +uint32_t vm_grab_anon_overrides = 0; +uint32_t vm_grab_anon_nops = 0; +#endif + #endif /* @@ -519,15 +529,15 @@ vm_pageout_object_terminate( shadow_object = object->shadow; vm_object_lock(shadow_object); - while (!queue_empty(&object->memq)) { + while (!vm_page_queue_empty(&object->memq)) { vm_page_t p, m; vm_object_offset_t offset; - p = (vm_page_t) queue_first(&object->memq); + p = (vm_page_t) vm_page_queue_first(&object->memq); assert(p->private); - assert(p->pageout); - p->pageout = FALSE; + assert(p->free_when_done); + p->free_when_done = FALSE; assert(!p->cleaning); assert(!p->laundry); @@ -549,7 +559,7 @@ vm_pageout_object_terminate( * Also decrement the burst throttle (if external). */ vm_page_lock_queues(); - if (m->pageout_queue) + if (m->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q) vm_pageout_throttle_up(m); /* @@ -560,12 +570,13 @@ vm_pageout_object_terminate( * pages may have been modified between the selection as an * adjacent page and conversion to a target. 
*/ - if (m->pageout) { + if (m->free_when_done) { assert(m->busy); + assert(m->vm_page_q_state == VM_PAGE_IS_WIRED); assert(m->wire_count == 1); m->cleaning = FALSE; m->encrypted_cleaning = FALSE; - m->pageout = FALSE; + m->free_when_done = FALSE; #if MACH_CLUSTER_STATS if (m->wanted) vm_pageout_target_collisions++; #endif @@ -579,7 +590,7 @@ vm_pageout_object_terminate( * can detect whether the page was redirtied during * pageout by checking the modify state. */ - if (pmap_disconnect(m->phys_page) & VM_MEM_MODIFIED) { + if (pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)) & VM_MEM_MODIFIED) { SET_PAGE_DIRTY(m, FALSE); } else { m->dirty = FALSE; @@ -603,7 +614,7 @@ vm_pageout_object_terminate( * If prep_pin_count is nonzero, then someone is using the * page, so make it active. */ - if (!m->active && !m->inactive && !m->throttled && !m->private) { + if ((m->vm_page_q_state == VM_PAGE_NOT_ON_Q) && !m->private) { if (m->reference) vm_page_activate(m); else @@ -623,7 +634,7 @@ vm_pageout_object_terminate( * will take care of resetting dirty. We clear the * modify however for the Programmed I/O case. */ - pmap_clear_modify(m->phys_page); + pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m)); m->busy = FALSE; m->absent = FALSE; @@ -648,7 +659,7 @@ vm_pageout_object_terminate( * consulted if m->dirty is false. */ #if MACH_CLUSTER_STATS - m->dirty = pmap_is_modified(m->phys_page); + m->dirty = pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(m)); if (m->dirty) vm_pageout_cluster_dirtied++; else vm_pageout_cluster_cleaned++; @@ -706,11 +717,11 @@ vm_pageclean_setup( #endif XPR(XPR_VM_PAGEOUT, - "vm_pageclean_setup, obj 0x%X off 0x%X page 0x%X new 0x%X new_off 0x%X\n", - m->object, m->offset, m, + "vm_pageclean_setup, obj 0x%X off 0x%X page 0x%X new 0x%X new_off 0x%X\n", + VM_PAGE_OBJECT(m), m->offset, m, new_m, new_offset); - pmap_clear_modify(m->phys_page); + pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m)); /* * Mark original page as cleaning in place. 
@@ -724,11 +735,11 @@ vm_pageclean_setup( * the real page. */ assert(new_m->fictitious); - assert(new_m->phys_page == vm_page_fictitious_addr); + assert(VM_PAGE_GET_PHYS_PAGE(new_m) == vm_page_fictitious_addr); new_m->fictitious = FALSE; new_m->private = TRUE; - new_m->pageout = TRUE; - new_m->phys_page = m->phys_page; + new_m->free_when_done = TRUE; + VM_PAGE_SET_PHYS_PAGE(new_m, VM_PAGE_GET_PHYS_PAGE(m)); vm_page_lockspin_queues(); vm_page_wire(new_m, VM_KERN_MEMORY_NONE, TRUE); @@ -768,7 +779,13 @@ vm_pageout_initialize_page( XPR(XPR_VM_PAGEOUT, "vm_pageout_initialize_page, page 0x%X\n", m, 0, 0, 0, 0); + + assert(VM_CONFIG_COMPRESSOR_IS_PRESENT); + + object = VM_PAGE_OBJECT(m); + assert(m->busy); + assert(object->internal); /* * Verify that we really want to clean this page @@ -780,12 +797,12 @@ vm_pageout_initialize_page( /* * Create a paging reference to let us play with the object. */ - object = m->object; paging_offset = m->offset + object->paging_offset; if (m->absent || m->error || m->restart || (!m->dirty && !m->precious)) { - VM_PAGE_FREE(m); panic("reservation without pageout?"); /* alan */ + + VM_PAGE_FREE(m); vm_object_unlock(object); return; @@ -800,17 +817,17 @@ vm_pageout_initialize_page( pager = object->pager; if (pager == MEMORY_OBJECT_NULL) { - VM_PAGE_FREE(m); panic("missing pager for copy object"); + + VM_PAGE_FREE(m); return; } /* * set the page for future call to vm_fault_list_request */ - pmap_clear_modify(m->phys_page); + pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m)); SET_PAGE_DIRTY(m, FALSE); - m->pageout = TRUE; /* * keep the object from collapsing or terminating @@ -858,9 +875,9 @@ struct { */ int -vm_pageout_cluster(vm_page_t m, boolean_t pageout, boolean_t immediate_ok, boolean_t keep_object_locked) +vm_pageout_cluster(vm_page_t m, boolean_t immediate_ok, boolean_t keep_object_locked) { - vm_object_t object = m->object; + vm_object_t object = VM_PAGE_OBJECT(m); struct vm_pageout_queue *q; @@ -869,41 +886,36 @@ 
vm_pageout_cluster(vm_page_t m, boolean_t pageout, boolean_t immediate_ok, boole object, m->offset, m, 0, 0); VM_PAGE_CHECK(m); -#if DEBUG - lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); -#endif + LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); vm_object_lock_assert_exclusive(object); /* * Only a certain kind of page is appreciated here. */ assert((m->dirty || m->precious) && (!VM_PAGE_WIRED(m))); - assert(!m->cleaning && !m->pageout && !m->laundry); -#ifndef CONFIG_FREEZE - assert(!m->inactive && !m->active); - assert(!m->throttled); -#endif + assert(!m->cleaning && !m->laundry); + assert(m->vm_page_q_state == VM_PAGE_NOT_ON_Q); /* * protect the object from collapse or termination */ vm_object_activity_begin(object); - m->pageout = pageout; - if (object->internal == TRUE) { - if (COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) { - m->busy = TRUE; + assert(VM_CONFIG_COMPRESSOR_IS_PRESENT); - if (vm_compressor_immediate_preferred == TRUE && immediate_ok == TRUE) { - if (keep_object_locked == FALSE) - vm_object_unlock(object); - vm_page_unlock_queues(); + m->busy = TRUE; - vm_pageout_immediate(m, keep_object_locked); + if (vm_compressor_immediate_preferred == TRUE && immediate_ok == TRUE) { + panic("immediate compressor mode no longer supported\n"); + + if (keep_object_locked == FALSE) + vm_object_unlock(object); + vm_page_unlock_queues(); - return (1); - } + vm_pageout_immediate(m, keep_object_locked); + + return (1); } q = &vm_pageout_queue_internal; } else @@ -915,8 +927,8 @@ vm_pageout_cluster(vm_page_t m, boolean_t pageout, boolean_t immediate_ok, boole m->laundry = TRUE; q->pgo_laundry++; - m->pageout_queue = TRUE; - queue_enter(&q->pgo_pending, m, vm_page_t, pageq); + m->vm_page_q_state = VM_PAGE_ON_PAGEOUT_Q; + vm_page_queue_enter(&q->pgo_pending, m, vm_page_t, pageq); if (q->pgo_idle == TRUE) { q->pgo_idle = FALSE; @@ -942,31 +954,31 @@ vm_pageout_throttle_up( vm_page_t m) { struct vm_pageout_queue *q; + 
vm_object_t m_object; - assert(m->object != VM_OBJECT_NULL); - assert(m->object != kernel_object); + m_object = VM_PAGE_OBJECT(m); -#if DEBUG - lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); - vm_object_lock_assert_exclusive(m->object); -#endif + assert(m_object != VM_OBJECT_NULL); + assert(m_object != kernel_object); + + LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); + vm_object_lock_assert_exclusive(m_object); vm_pageout_throttle_up_count++; - if (m->object->internal == TRUE) + if (m_object->internal == TRUE) q = &vm_pageout_queue_internal; else q = &vm_pageout_queue_external; - if (m->pageout_queue == TRUE) { + if (m->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q) { - queue_remove(&q->pgo_pending, m, vm_page_t, pageq); - m->pageout_queue = FALSE; + vm_page_queue_remove(&q->pgo_pending, m, vm_page_t, pageq); + m->vm_page_q_state = VM_PAGE_NOT_ON_Q; - m->pageq.next = NULL; - m->pageq.prev = NULL; + VM_PAGE_ZERO_PAGEQ_ENTRY(m); - vm_object_activity_end(m->object); + vm_object_activity_end(m_object); } if (m->laundry == TRUE) { @@ -990,9 +1002,7 @@ vm_pageout_throttle_up_batch( struct vm_pageout_queue *q, int batch_cnt) { -#if DEBUG - lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); -#endif + LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); vm_pageout_throttle_up_count += batch_cnt; @@ -1157,18 +1167,16 @@ mach_vm_pressure_monitor( } /* provide number of pages reclaimed in the last "nsecs_monitored" */ - do { - vm_pageout_now = vm_pageout_stat_now; - pages_reclaimed = 0; - for (vm_pageout_then = - VM_PAGEOUT_STAT_BEFORE(vm_pageout_now); - vm_pageout_then != vm_pageout_now && - nsecs_monitored-- != 0; - vm_pageout_then = - VM_PAGEOUT_STAT_BEFORE(vm_pageout_then)) { - pages_reclaimed += vm_pageout_stats[vm_pageout_then].reclaimed; - } - } while (vm_pageout_now != vm_pageout_stat_now); + vm_pageout_now = vm_pageout_stat_now; + pages_reclaimed = 0; + for (vm_pageout_then = + VM_PAGEOUT_STAT_BEFORE(vm_pageout_now); + vm_pageout_then 
!= vm_pageout_now && + nsecs_monitored-- != 0; + vm_pageout_then = + VM_PAGEOUT_STAT_BEFORE(vm_pageout_then)) { + pages_reclaimed += vm_pageout_stats[vm_pageout_then].reclaimed; + } *pages_reclaimed_p = pages_reclaimed; return KERN_SUCCESS; @@ -1176,8 +1184,151 @@ mach_vm_pressure_monitor( +#if DEVELOPMENT || DEBUG + static void -vm_pageout_page_queue(queue_head_t *, int); +vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t *, int); + +/* + * condition variable used to make sure there is + * only a single sweep going on at a time + */ +boolean_t vm_pageout_disconnect_all_pages_active = FALSE; + + +void +vm_pageout_disconnect_all_pages() +{ + vm_page_lock_queues(); + + if (vm_pageout_disconnect_all_pages_active == TRUE) { + vm_page_unlock_queues(); + return; + } + vm_pageout_disconnect_all_pages_active = TRUE; + vm_page_unlock_queues(); + + vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_throttled, vm_page_throttled_count); + vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_anonymous, vm_page_anonymous_count); + vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_active, vm_page_active_count); + + vm_pageout_disconnect_all_pages_active = FALSE; +} + + +void +vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t *q, int qcount) +{ + vm_page_t m; + vm_object_t t_object = NULL; + vm_object_t l_object = NULL; + vm_object_t m_object = NULL; + int delayed_unlock = 0; + int try_failed_count = 0; + int disconnected_count = 0; + int paused_count = 0; + int object_locked_count = 0; + + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_DISCONNECT_ALL_PAGE_MAPPINGS)) | DBG_FUNC_START, + q, qcount, 0, 0, 0); + + vm_page_lock_queues(); + + while (qcount && !vm_page_queue_empty(q)) { + + LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); + + m = (vm_page_t) vm_page_queue_first(q); + m_object = VM_PAGE_OBJECT(m); + + /* + * check to see if we currently are working + * with the same object... 
if so, we've + * already got the lock + */ + if (m_object != l_object) { + /* + * the object associated with candidate page is + * different from the one we were just working + * with... dump the lock if we still own it + */ + if (l_object != NULL) { + vm_object_unlock(l_object); + l_object = NULL; + } + if (m_object != t_object) + try_failed_count = 0; + + /* + * Try to lock object; since we've alread got the + * page queues lock, we can only 'try' for this one. + * if the 'try' fails, we need to do a mutex_pause + * to allow the owner of the object lock a chance to + * run... + */ + if ( !vm_object_lock_try_scan(m_object)) { + + if (try_failed_count > 20) { + goto reenter_pg_on_q; + } + vm_page_unlock_queues(); + mutex_pause(try_failed_count++); + vm_page_lock_queues(); + delayed_unlock = 0; + + paused_count++; + + t_object = m_object; + continue; + } + object_locked_count++; + + l_object = m_object; + } + if ( !m_object->alive || m->encrypted_cleaning || m->cleaning || m->laundry || m->busy || m->absent || m->error || m->free_when_done) { + /* + * put it back on the head of its queue + */ + goto reenter_pg_on_q; + } + if (m->pmapped == TRUE) { + + pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)); + + disconnected_count++; + } +reenter_pg_on_q: + vm_page_queue_remove(q, m, vm_page_t, pageq); + vm_page_queue_enter(q, m, vm_page_t, pageq); + + qcount--; + try_failed_count = 0; + + if (delayed_unlock++ > 128) { + + if (l_object != NULL) { + vm_object_unlock(l_object); + l_object = NULL; + } + lck_mtx_yield(&vm_page_queue_lock); + delayed_unlock = 0; + } + } + if (l_object != NULL) { + vm_object_unlock(l_object); + l_object = NULL; + } + vm_page_unlock_queues(); + + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_DISCONNECT_ALL_PAGE_MAPPINGS)) | DBG_FUNC_END, + q, disconnected_count, object_locked_count, paused_count, 0); +} + +#endif + + +static void +vm_pageout_page_queue(vm_page_queue_head_t *, int); /* * condition variable used to make sure 
there is @@ -1189,7 +1340,7 @@ boolean_t vm_pageout_anonymous_pages_active = FALSE; void vm_pageout_anonymous_pages() { - if (COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) { + if (VM_CONFIG_COMPRESSOR_IS_PRESENT) { vm_page_lock_queues(); @@ -1204,7 +1355,8 @@ vm_pageout_anonymous_pages() vm_pageout_page_queue(&vm_page_queue_anonymous, vm_page_anonymous_count); vm_pageout_page_queue(&vm_page_queue_active, vm_page_active_count); - vm_consider_swapping(); + if (VM_CONFIG_SWAP_IS_PRESENT) + vm_consider_swapping(); vm_page_lock_queues(); vm_pageout_anonymous_pages_active = FALSE; @@ -1214,7 +1366,7 @@ vm_pageout_anonymous_pages() void -vm_pageout_page_queue(queue_head_t *q, int qcount) +vm_pageout_page_queue(vm_page_queue_head_t *q, int qcount) { vm_page_t m; vm_object_t t_object = NULL; @@ -1225,15 +1377,16 @@ vm_pageout_page_queue(queue_head_t *q, int qcount) int refmod_state; int pmap_options; struct vm_pageout_queue *iq; + ppnum_t phys_page; iq = &vm_pageout_queue_internal; vm_page_lock_queues(); - while (qcount && !queue_empty(q)) { + while (qcount && !vm_page_queue_empty(q)) { - lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); + LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); if (VM_PAGE_Q_THROTTLED(iq)) { @@ -1252,8 +1405,8 @@ vm_pageout_page_queue(queue_head_t *q, int qcount) delayed_unlock = 0; continue; } - m = (vm_page_t) queue_first(q); - m_object = m->object; + m = (vm_page_t) vm_page_queue_first(q); + m_object = VM_PAGE_OBJECT(m); /* * check to see if we currently are working @@ -1298,15 +1451,17 @@ vm_pageout_page_queue(queue_head_t *q, int qcount) } l_object = m_object; } - if ( !m_object->alive || m->encrypted_cleaning || m->cleaning || m->laundry || m->busy || m->absent || m->error || m->pageout) { + if ( !m_object->alive || m->encrypted_cleaning || m->cleaning || m->laundry || m->busy || m->absent || m->error || m->free_when_done) { /* * page is not to be cleaned * put it back on the head of its queue 
*/ goto reenter_pg_on_q; } + phys_page = VM_PAGE_GET_PHYS_PAGE(m); + if (m->reference == FALSE && m->pmapped == TRUE) { - refmod_state = pmap_get_refmod(m->phys_page); + refmod_state = pmap_get_refmod(phys_page); if (refmod_state & VM_MEM_REFERENCED) m->reference = TRUE; @@ -1316,7 +1471,7 @@ vm_pageout_page_queue(queue_head_t *q, int qcount) } if (m->reference == TRUE) { m->reference = FALSE; - pmap_clear_refmod_options(m->phys_page, VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL); + pmap_clear_refmod_options(phys_page, VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL); goto reenter_pg_on_q; } if (m->pmapped == TRUE) { @@ -1325,7 +1480,7 @@ vm_pageout_page_queue(queue_head_t *q, int qcount) } else { pmap_options = PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED; } - refmod_state = pmap_disconnect_options(m->phys_page, pmap_options, NULL); + refmod_state = pmap_disconnect_options(phys_page, pmap_options, NULL); if (refmod_state & VM_MEM_MODIFIED) { SET_PAGE_DIRTY(m, FALSE); } @@ -1365,19 +1520,17 @@ vm_pageout_page_queue(queue_head_t *q, int qcount) * means this page can't be on the pageout queue so it's * safe to do the vm_page_queues_remove */ - assert(!m->pageout_queue); - - vm_page_queues_remove(m); + vm_page_queues_remove(m, TRUE); - lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); + LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); - vm_pageout_cluster(m, TRUE, FALSE, FALSE); + vm_pageout_cluster(m, FALSE, FALSE); goto next_pg; reenter_pg_on_q: - queue_remove(q, m, vm_page_t, pageq); - queue_enter(q, m, vm_page_t, pageq); + vm_page_queue_remove(q, m, vm_page_t, pageq); + vm_page_queue_enter(q, m, vm_page_t, pageq); next_pg: qcount--; try_failed_count = 0; @@ -1406,19 +1559,7 @@ vm_pageout_page_queue(queue_head_t *q, int qcount) */ extern void vm_pageout_io_throttle(void); -/* - * Page States: Used below to maintain the page state - * before it's removed from it's Q. 
This saved state - * helps us do the right accounting in certain cases - */ -#define PAGE_STATE_SPECULATIVE 1 -#define PAGE_STATE_ANONYMOUS 2 -#define PAGE_STATE_INACTIVE 3 -#define PAGE_STATE_INACTIVE_FIRST 4 -#define PAGE_STATE_CLEAN 5 - - -#define VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m) \ +#define VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, obj) \ MACRO_BEGIN \ /* \ * If a "reusable" page somehow made it back into \ @@ -1428,9 +1569,10 @@ extern void vm_pageout_io_throttle(void); * as "all re-used" instead of converting it to \ * "partially re-used", which could be expensive. \ */ \ + assert(VM_PAGE_OBJECT((m)) == (obj)); \ if ((m)->reusable || \ - (m)->object->all_reusable) { \ - vm_object_reuse_pages((m)->object, \ + (obj)->all_reusable) { \ + vm_object_reuse_pages((obj), \ (m)->offset, \ (m)->offset + PAGE_SIZE_64, \ FALSE); \ @@ -1450,11 +1592,24 @@ struct flow_control { mach_timespec_t ts; }; +#if CONFIG_BACKGROUND_QUEUE +uint64_t vm_pageout_considered_bq_internal = 0; +uint64_t vm_pageout_considered_bq_external = 0; +uint64_t vm_pageout_rejected_bq_internal = 0; +uint64_t vm_pageout_rejected_bq_external = 0; +#endif uint32_t vm_pageout_considered_page = 0; uint32_t vm_page_filecache_min = 0; #define ANONS_GRABBED_LIMIT 2 +#if CONFIG_SECLUDED_MEMORY +extern vm_page_t vm_page_grab_secluded(void); +uint64_t vm_pageout_freed_from_secluded = 0; +uint64_t vm_pageout_secluded_reactivated = 0; /* debugging; how many secluded pages are found to be referenced on pageout (and are therefore reactivated) */ +uint64_t vm_pageout_secluded_burst_count = 0; +#endif /* CONFIG_SECLUDED_MEMORY */ + /* * vm_pageout_scan does the dirty work for the pageout daemon. 
* It returns with both vm_page_queue_free_lock and vm_page_queue_lock @@ -1486,17 +1641,24 @@ vm_pageout_scan(void) vm_object_t last_object_tried; uint32_t catch_up_count = 0; uint32_t inactive_reclaim_run; - boolean_t forced_reclaim; boolean_t exceeded_burst_throttle; boolean_t grab_anonymous = FALSE; boolean_t force_anonymous = FALSE; int anons_grabbed = 0; - int page_prev_state = 0; + int page_prev_q_state = 0; + boolean_t requeue_insert_first = FALSE; +#if CONFIG_BACKGROUND_QUEUE + boolean_t ignore_reference = FALSE; +#endif +#if CONFIG_SECLUDED_MEMORY + boolean_t ignore_reference_secluded; +#endif /* CONFIG_SECLUDED_MEMORY */ int cache_evict_throttle = 0; uint32_t vm_pageout_inactive_external_forced_reactivate_limit = 0; int force_purge = 0; #define DELAY_SPECULATIVE_AGE 1000 int delay_speculative_age = 0; + vm_object_t m_object = VM_OBJECT_NULL; #if VM_PRESSURE_EVENTS vm_pressure_level_t pressure_level; @@ -1548,6 +1710,8 @@ vm_pageout_scan(void) Restart: + + assert(delayed_unlock!=0); /* @@ -1588,6 +1752,263 @@ vm_pageout_scan(void) DTRACE_VM2(rev, int, 1, (uint64_t *), NULL); +#if CONFIG_SECLUDED_MEMORY + if (vm_page_secluded_count > vm_page_secluded_target && + object != NULL) { + vm_object_unlock(object); + object = NULL; + vm_pageout_scan_wants_object = VM_OBJECT_NULL; + } + + /* + * Deal with secluded_q overflow. + */ + if (vm_page_secluded_count > vm_page_secluded_target && + secluded_aging_policy == SECLUDED_AGING_FIFO) { + unsigned int secluded_overflow; + vm_page_t secluded_page; + + /* + * SECLUDED_AGING_FIFO: + * No aging, just reclaim the excess pages + * at the tail of the secluded queue. + * We're reclaiming pages and we're not hogging + * any global lock, so no need for throttling. 
+ */ + + secluded_overflow = (vm_page_secluded_count - + vm_page_secluded_target); + /* transfer to free queue */ + vm_page_unlock_queues(); + while (secluded_overflow--) { + secluded_page = vm_page_grab_secluded(); + if (secluded_page == VM_PAGE_NULL) { + break; + } + assert(secluded_page->busy); + assert(secluded_page->pageq.next == 0 && + secluded_page->pageq.prev == 0); + + secluded_page->snext = local_freeq; + local_freeq = secluded_page; + local_freed++; + secluded_page = VM_PAGE_NULL; + } + } else if (vm_page_secluded_count > vm_page_secluded_target && + secluded_aging_policy == SECLUDED_AGING_ALONG_ACTIVE) { + unsigned int secluded_overflow; + vm_page_t secluded_page; + + /* + * SECLUDED_AGING_ALONG_ACTIVE: + * There might be free pages at the tail of the + * secluded queue: + * just move them to the free queue (in batches). + * There can also be an excessive number of "inuse" + * pages: + * we age them by resetting their "referenced" bit and + * moving them to the inactive queue. Their trip + * through the secluded queue was equivalent to a trip + * through the active queue. + * + * We're holding the page queue lock, so we need + * to throttle and give someone else a chance to + * grab that lock if needed. + * + * We're also limiting the number of secluded "inuse" + * pages that get moved to the inactive queue, using + * the same "active_bust_count" method we use when + * balancing the active and inactive queues, because + * there can be a large number + * of extra "inuse" pages and handling them gets in the + * way of actually reclaiming memory. 
+ */ + + active_burst_count = MIN(vm_pageout_burst_active_throttle, + vm_page_secluded_count_inuse); + delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT; + delayed_unlock = 1; + secluded_overflow = (vm_page_secluded_count - + vm_page_secluded_target); + while (secluded_overflow-- > 0 && + vm_page_secluded_count > vm_page_secluded_target) { + assert((vm_page_secluded_count_free + + vm_page_secluded_count_inuse) == + vm_page_secluded_count); + vm_page_queue_remove_first(&vm_page_queue_secluded, + secluded_page, + vm_page_t, + pageq); + assert(secluded_page->vm_page_q_state == + VM_PAGE_ON_SECLUDED_Q); + VM_PAGE_ZERO_PAGEQ_ENTRY(secluded_page); + secluded_page->vm_page_q_state = VM_PAGE_NOT_ON_Q; + vm_page_secluded_count--; + assert(!secluded_page->fictitious); + assert(!VM_PAGE_WIRED(secluded_page)); + if (secluded_page->vm_page_object == 0) { + /* transfer to free queue */ + assert(secluded_page->busy); + vm_page_secluded_count_free--; + secluded_page->snext = local_freeq; + local_freeq = secluded_page; + local_freed++; + } else { + vm_page_secluded_count_inuse--; + /* transfer to head of inactive queue */ + pmap_clear_refmod_options( + VM_PAGE_GET_PHYS_PAGE(secluded_page), + VM_MEM_REFERENCED, + PMAP_OPTIONS_NOFLUSH, + (void *)NULL); + vm_page_enqueue_inactive(secluded_page, + FALSE); + if (active_burst_count-- == 0) { + vm_pageout_secluded_burst_count++; + break; + } + } + secluded_page = VM_PAGE_NULL; + if (delayed_unlock++ > delayed_unlock_limit) { + if (local_freeq) { + vm_page_unlock_queues(); + VM_DEBUG_EVENT( + vm_pageout_freelist, + VM_PAGEOUT_FREELIST, + DBG_FUNC_START, + vm_page_free_count, + local_freed, + delayed_unlock_limit, + 1); + vm_page_free_list(local_freeq, + TRUE); + VM_DEBUG_EVENT( + vm_pageout_freelist, + VM_PAGEOUT_FREELIST, + DBG_FUNC_END, + vm_page_free_count, + 0, 0, 1); + local_freeq = NULL; + local_freed = 0; + vm_page_lock_queues(); + } else { + lck_mtx_yield(&vm_page_queue_lock); + } + delayed_unlock = 1; + } + } + delayed_unlock 
= 1; + } else if (vm_page_secluded_count > vm_page_secluded_target && + secluded_aging_policy == SECLUDED_AGING_AFTER_INACTIVE) { + /* + * SECLUDED_AGING_AFTER_INACTIVE: + * No balancing needed at this point: when we get to + * the "choose a victim" part below, we'll consider the + * extra secluded pages before any inactive page. + */ + } else if (vm_page_secluded_count > vm_page_secluded_target && + secluded_aging_policy == SECLUDED_AGING_BEFORE_ACTIVE) { + unsigned int secluded_overflow; + vm_page_t secluded_page; + + /* + * SECLUDED_AGING_BEFORE_ACTIVE: + * Excess secluded pages go to the active queue and + * will later go to the inactive queue. + */ + active_burst_count = MIN(vm_pageout_burst_active_throttle, + vm_page_secluded_count_inuse); + delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT; + delayed_unlock = 1; + secluded_overflow = (vm_page_secluded_count - + vm_page_secluded_target); + while (secluded_overflow-- > 0 && + vm_page_secluded_count > vm_page_secluded_target) { + assert((vm_page_secluded_count_free + + vm_page_secluded_count_inuse) == + vm_page_secluded_count); + vm_page_queue_remove_first(&vm_page_queue_secluded, + secluded_page, + vm_page_t, + pageq); + assert(secluded_page->vm_page_q_state == + VM_PAGE_ON_SECLUDED_Q); + VM_PAGE_ZERO_PAGEQ_ENTRY(secluded_page); + secluded_page->vm_page_q_state = VM_PAGE_NOT_ON_Q; + vm_page_secluded_count--; + assert(!secluded_page->fictitious); + assert(!VM_PAGE_WIRED(secluded_page)); + if (secluded_page->vm_page_object == 0) { + /* transfer to free queue */ + assert(secluded_page->busy); + vm_page_secluded_count_free--; + secluded_page->snext = local_freeq; + local_freeq = secluded_page; + local_freed++; + } else { + vm_page_secluded_count_inuse--; + /* transfer to head of active queue */ + vm_page_enqueue_active(secluded_page, + FALSE); + if (active_burst_count-- == 0) { + vm_pageout_secluded_burst_count++; + break; + } + } + secluded_page = VM_PAGE_NULL; + if (delayed_unlock++ > delayed_unlock_limit) 
{ + if (local_freeq) { + vm_page_unlock_queues(); + VM_DEBUG_EVENT( + vm_pageout_freelist, + VM_PAGEOUT_FREELIST, + DBG_FUNC_START, + vm_page_free_count, + local_freed, + delayed_unlock_limit, + 1); + vm_page_free_list(local_freeq, + TRUE); + VM_DEBUG_EVENT( + vm_pageout_freelist, + VM_PAGEOUT_FREELIST, + DBG_FUNC_END, + vm_page_free_count, + 0, 0, 1); + local_freeq = NULL; + local_freed = 0; + vm_page_lock_queues(); + } else { + lck_mtx_yield(&vm_page_queue_lock); + } + delayed_unlock = 1; + } + } + delayed_unlock = 1; + } else if (vm_page_secluded_count > vm_page_secluded_target) { + panic("unsupported secluded_aging_policy %d\n", + secluded_aging_policy); + } + if (local_freeq) { + vm_page_unlock_queues(); + VM_DEBUG_EVENT(vm_pageout_freelist, + VM_PAGEOUT_FREELIST, + DBG_FUNC_START, + vm_page_free_count, + local_freed, + 0, + 0); + vm_page_free_list(local_freeq, TRUE); + VM_DEBUG_EVENT(vm_pageout_freelist, + VM_PAGEOUT_FREELIST, + DBG_FUNC_END, + vm_page_free_count, 0, 0, 0); + local_freeq = NULL; + local_freed = 0; + vm_page_lock_queues(); + } +#endif /* CONFIG_SECLUDED_MEMORY */ + assert(delayed_unlock); if (vm_upl_wait_for_pages < 0) @@ -1625,16 +2046,16 @@ vm_pageout_scan(void) memoryshot(VM_PAGEOUT_BALANCE, DBG_FUNC_START); - while (!queue_empty(&vm_page_queue_active) && active_burst_count--) { + while (!vm_page_queue_empty(&vm_page_queue_active) && active_burst_count--) { vm_pageout_active++; - m = (vm_page_t) queue_first(&vm_page_queue_active); + m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active); - assert(m->active && !m->inactive); + assert(m->vm_page_q_state == VM_PAGE_ON_ACTIVE_Q); assert(!m->laundry); - assert(m->object != kernel_object); - assert(m->phys_page != vm_page_guard_addr); + assert(VM_PAGE_OBJECT(m) != kernel_object); + assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr); DTRACE_VM2(scan, int, 1, (uint64_t *), NULL); @@ -1649,7 +2070,7 @@ vm_pageout_scan(void) * in the past (TLB caches don't hang around for very long), and of 
course could just as easily * have happened before we moved the page */ - pmap_clear_refmod_options(m->phys_page, VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL); + pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(m), VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL); /* * The page might be absent or busy, @@ -1700,7 +2121,13 @@ vm_pageout_scan(void) done_moving_active_pages: - if (vm_page_free_count + local_freed >= vm_page_free_target) { +#if CONFIG_BACKGROUND_QUEUE + if ((vm_page_free_count + local_freed >= vm_page_free_target) && + ((vm_page_background_mode < VM_PAGE_BG_LEVEL_2) || (vm_page_background_count <= vm_page_background_target))) +#else + if (vm_page_free_count + local_freed >= vm_page_free_target) +#endif + { if (object != NULL) { vm_object_unlock(object); object = NULL; @@ -1743,7 +2170,7 @@ vm_pageout_scan(void) vm_page_inactive_count + vm_page_speculative_count); if (((vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target) && - !queue_empty(&vm_page_queue_active)) { + !vm_page_queue_empty(&vm_page_queue_active)) { /* * inactive target still not met... keep going * until we get the queues balanced... @@ -1818,7 +2245,7 @@ vm_pageout_scan(void) memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_END); } - if (queue_empty(&sq->age_q) && vm_page_speculative_count) { + if (vm_page_queue_empty(&sq->age_q) && vm_page_speculative_count) { /* * try to pull pages from the aging bins... 
* see vm_page.h for an explanation of how @@ -1831,7 +2258,7 @@ vm_pageout_scan(void) aq = &vm_page_queue_speculative[speculative_steal_index]; num_scanned_queues = 0; - while (queue_empty(&aq->age_q) && + while (vm_page_queue_empty(&aq->age_q) && num_scanned_queues++ != VM_PAGE_MAX_SPECULATIVE_AGE_Q) { speculative_steal_index++; @@ -1858,9 +2285,9 @@ vm_pageout_scan(void) if (vm_page_speculative_count > vm_page_speculative_count_drift_max) vm_page_speculative_count_drift_max = vm_page_speculative_count; vm_page_speculative_count_drifts++; -#if 6553678 - Debugger("vm_pageout_scan: no speculative pages"); -#endif +#if DEVELOPMENT || DEBUG + panic("vm_pageout_scan: vm_page_speculative_count=%d but queues are empty", vm_page_speculative_count); +#endif /* DEVELOPMENT || DEBUG */ /* readjust... */ vm_page_speculative_count = 0; /* ... and continue */ @@ -1898,7 +2325,13 @@ vm_pageout_scan(void) if (can_steal == TRUE) vm_page_speculate_ageit(aq); } - if (queue_empty(&sq->age_q) && cache_evict_throttle == 0) { +#if CONFIG_BACKGROUND_QUEUE + if (vm_page_queue_empty(&sq->age_q) && cache_evict_throttle == 0 && + ((vm_page_background_mode == VM_PAGE_BG_DISABLED) || (vm_page_background_count <= vm_page_background_target))) +#else + if (vm_page_queue_empty(&sq->age_q) && cache_evict_throttle == 0) +#endif + { int pages_evicted; if (object != NULL) { @@ -1949,6 +2382,8 @@ vm_pageout_scan(void) */ vm_page_filecache_min = (AVAILABLE_NON_COMPRESSED_MEMORY / 3); #endif + if (vm_page_free_count < (vm_page_free_reserved / 4)) + vm_page_filecache_min = 0; exceeded_burst_throttle = FALSE; /* @@ -1958,7 +2393,9 @@ vm_pageout_scan(void) * within the last vm_pageout_burst_inactive_throttle iterations * 3) Flow control - default pageout queue is full */ - if (queue_empty(&vm_page_queue_inactive) && queue_empty(&vm_page_queue_anonymous) && queue_empty(&sq->age_q)) { + if (vm_page_queue_empty(&vm_page_queue_inactive) && + vm_page_queue_empty(&vm_page_queue_anonymous) && + 
vm_page_queue_empty(&sq->age_q)) { vm_pageout_scan_empty_throttle++; msecs = vm_pageout_empty_wait; goto vm_pageout_scan_delay; @@ -1980,7 +2417,7 @@ vm_pageout_scan(void) goto vm_pageout_scan_delay; } else if (VM_PAGE_Q_THROTTLED(iq) && - VM_DYNAMIC_PAGING_ENABLED(memory_manager_default)) { + VM_DYNAMIC_PAGING_ENABLED()) { clock_sec_t sec; clock_nsec_t nsec; @@ -2018,7 +2455,8 @@ vm_pageout_scan(void) vm_pageout_scan_yield_unthrottled++; continue; } - if (vm_page_pageable_external_count > vm_page_filecache_min && !queue_empty(&vm_page_queue_inactive)) { + if (vm_page_pageable_external_count > vm_page_filecache_min && + !vm_page_queue_empty(&vm_page_queue_inactive)) { anons_grabbed = ANONS_GRABBED_LIMIT; vm_pageout_scan_throttle_deferred++; goto consider_inactive; @@ -2231,30 +2669,95 @@ vm_pageout_scan(void) while (1) { uint32_t inactive_external_count; +#if CONFIG_BACKGROUND_QUEUE + ignore_reference = FALSE; +#endif /* CONFIG_BACKGROUND_QUEUE */ + m = NULL; + m_object = VM_OBJECT_NULL; - if (VM_DYNAMIC_PAGING_ENABLED(memory_manager_default)) { + if (VM_DYNAMIC_PAGING_ENABLED()) { assert(vm_page_throttled_count == 0); - assert(queue_empty(&vm_page_queue_throttled)); + assert(vm_page_queue_empty(&vm_page_queue_throttled)); } + + +#if CONFIG_SECLUDED_MEMORY + if ((secluded_aging_policy == + SECLUDED_AGING_AFTER_INACTIVE) && + vm_page_secluded_count > vm_page_secluded_target) { + /* + * SECLUDED_AGING_AFTER_INACTIVE: + * Secluded pages have already been aged + * through the active and inactive queues, and + * we now have too many of them, so let's + * balance that queue by considering reclaiming + * the oldest page in the secluded queue. + */ + assert(!vm_page_queue_empty(&vm_page_queue_secluded)); + m = (vm_page_t) vm_page_queue_first(&vm_page_queue_secluded); + if (m->vm_page_object == 0) { + /* + * It's already a free page: + * just move it to a free queue. 
+ */ + vm_page_queues_remove(m, TRUE); + assert(m->busy); + assert(m->pageq.next == 0); + assert(m->pageq.prev == 0); + m->snext = local_freeq; + local_freeq = m; + local_freed++; + goto done_with_inactivepage; + } + /* + * Not a free page: we've found our next + * "victim". + */ + break; + } +#endif /* CONFIG_SECLUDED_MEMORY */ + +#if CONFIG_BACKGROUND_QUEUE + if (vm_page_background_mode != VM_PAGE_BG_DISABLED && (vm_page_background_count > vm_page_background_target)) { + vm_object_t bg_m_object = NULL; + + m = (vm_page_t) vm_page_queue_first(&vm_page_queue_background); + + bg_m_object = VM_PAGE_OBJECT(m); + + if (force_anonymous == FALSE || bg_m_object->internal) { + ignore_reference = TRUE; + + if (bg_m_object->internal) + vm_pageout_considered_bq_internal++; + else + vm_pageout_considered_bq_external++; + + assert(VM_PAGE_PAGEABLE(m)); + break; + } + } +#endif + /* * The most eligible pages are ones we paged in speculatively, * but which have not yet been touched. */ - if (!queue_empty(&sq->age_q) && force_anonymous == FALSE) { - m = (vm_page_t) queue_first(&sq->age_q); + if (!vm_page_queue_empty(&sq->age_q) && force_anonymous == FALSE) { + m = (vm_page_t) vm_page_queue_first(&sq->age_q); - page_prev_state = PAGE_STATE_SPECULATIVE; + assert(m->vm_page_q_state == VM_PAGE_ON_SPECULATIVE_Q); break; } /* * Try a clean-queue inactive page. 
*/ - if (!queue_empty(&vm_page_queue_cleaned)) { - m = (vm_page_t) queue_first(&vm_page_queue_cleaned); + if (!vm_page_queue_empty(&vm_page_queue_cleaned)) { + m = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned); - page_prev_state = PAGE_STATE_CLEAN; + assert(m->vm_page_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q); break; } @@ -2267,13 +2770,44 @@ vm_pageout_scan(void) grab_anonymous = TRUE; anons_grabbed = 0; } +#if CONFIG_JETSAM + /* If the file-backed pool has accumulated + * significantly more pages than the jetsam + * threshold, prefer to reclaim those + * inline to minimise compute overhead of reclaiming + * anonymous pages. + * This calculation does not account for the CPU local + * external page queues, as those are expected to be + * much smaller relative to the global pools. + */ + if (grab_anonymous) { + if (vm_page_pageable_external_count > + vm_page_filecache_min) { + if ((vm_page_pageable_external_count * + vm_pageout_memorystatus_fb_factor_dr) > + (memorystatus_available_pages_critical * + vm_pageout_memorystatus_fb_factor_nr)) { + grab_anonymous = FALSE; +#if DEVELOPMENT || DEBUG + vm_grab_anon_overrides++; +#endif + } + } +#if DEVELOPMENT || DEBUG + if (grab_anonymous) { + vm_grab_anon_nops++; + + } +#endif + } +#endif /* CONFIG_JETSAM */ - if (grab_anonymous == FALSE || anons_grabbed >= ANONS_GRABBED_LIMIT || queue_empty(&vm_page_queue_anonymous)) { + if (grab_anonymous == FALSE || anons_grabbed >= ANONS_GRABBED_LIMIT || vm_page_queue_empty(&vm_page_queue_anonymous)) { - if ( !queue_empty(&vm_page_queue_inactive) ) { - m = (vm_page_t) queue_first(&vm_page_queue_inactive); + if ( !vm_page_queue_empty(&vm_page_queue_inactive) ) { + m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive); - page_prev_state = PAGE_STATE_INACTIVE; + assert(m->vm_page_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q); anons_grabbed = 0; if (vm_page_pageable_external_count < vm_page_filecache_min) { @@ -2288,10 +2822,10 @@ vm_pageout_scan(void) break; } } - if ( 
!queue_empty(&vm_page_queue_anonymous) ) { - m = (vm_page_t) queue_first(&vm_page_queue_anonymous); + if ( !vm_page_queue_empty(&vm_page_queue_anonymous) ) { + m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous); - page_prev_state = PAGE_STATE_ANONYMOUS; + assert(m->vm_page_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q); anons_grabbed++; break; @@ -2331,32 +2865,34 @@ vm_pageout_scan(void) if ((vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target) goto Restart; - if (!queue_empty(&sq->age_q)) + if (!vm_page_queue_empty(&sq->age_q)) goto Restart; panic("vm_pageout: no victim"); /* NOTREACHED */ } + m_object = VM_PAGE_OBJECT(m); force_anonymous = FALSE; + page_prev_q_state = m->vm_page_q_state; + requeue_insert_first = FALSE; /* * we just found this page on one of our queues... * it can't also be on the pageout queue, so safe * to call vm_page_queues_remove */ - assert(!m->pageout_queue); - - vm_page_queues_remove(m); + vm_page_queues_remove(m, TRUE); assert(!m->laundry); assert(!m->private); assert(!m->fictitious); - assert(m->object != kernel_object); - assert(m->phys_page != vm_page_guard_addr); + assert(m_object != kernel_object); + assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr); - if (page_prev_state != PAGE_STATE_SPECULATIVE) + if (page_prev_q_state != VM_PAGE_ON_SPECULATIVE_Q && + page_prev_q_state != VM_PAGE_ON_SECLUDED_Q) vm_pageout_stats[vm_pageout_stat_now].considered++; DTRACE_VM2(scan, int, 1, (uint64_t *), NULL); @@ -2366,7 +2902,7 @@ vm_pageout_scan(void) * with the same object... if so, we've * already got the lock */ - if (m->object != object) { + if (m_object != object) { /* * the object associated with candidate page is * different from the one we were just working @@ -2387,18 +2923,18 @@ vm_pageout_scan(void) * the queue... 
clumps of pages associated with the same * object are fairly typical on the inactive and active queues */ - if (!vm_object_lock_try_scan(m->object)) { + if (!vm_object_lock_try_scan(m_object)) { vm_page_t m_want = NULL; vm_pageout_inactive_nolock++; - if (page_prev_state == PAGE_STATE_CLEAN) + if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) vm_pageout_cleaned_nolock++; - if (page_prev_state == PAGE_STATE_SPECULATIVE) - page_prev_state = PAGE_STATE_INACTIVE_FIRST; + if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) + requeue_insert_first = TRUE; - pmap_clear_reference(m->phys_page); + pmap_clear_reference(VM_PAGE_GET_PHYS_PAGE(m)); m->reference = FALSE; /* @@ -2410,16 +2946,17 @@ vm_pageout_scan(void) * is possible for the value to be a bit non-determistic, but that's ok * since it's only used as a hint */ - m->object->scan_collisions = 1; - - if ( !queue_empty(&sq->age_q) ) - m_want = (vm_page_t) queue_first(&sq->age_q); - else if ( !queue_empty(&vm_page_queue_cleaned)) - m_want = (vm_page_t) queue_first(&vm_page_queue_cleaned); - else if (anons_grabbed >= ANONS_GRABBED_LIMIT || queue_empty(&vm_page_queue_anonymous)) - m_want = (vm_page_t) queue_first(&vm_page_queue_inactive); - else if ( !queue_empty(&vm_page_queue_anonymous)) - m_want = (vm_page_t) queue_first(&vm_page_queue_anonymous); + m_object->scan_collisions = 1; + + if ( !vm_page_queue_empty(&sq->age_q) ) + m_want = (vm_page_t) vm_page_queue_first(&sq->age_q); + else if ( !vm_page_queue_empty(&vm_page_queue_cleaned)) + m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned); + else if ( !vm_page_queue_empty(&vm_page_queue_inactive) && + (anons_grabbed >= ANONS_GRABBED_LIMIT || vm_page_queue_empty(&vm_page_queue_anonymous))) + m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive); + else if ( !vm_page_queue_empty(&vm_page_queue_anonymous)) + m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous); /* * this is the next object we're going to be interested in @@ 
-2427,7 +2964,7 @@ vm_pageout_scan(void) * returns control */ if (m_want) - vm_pageout_scan_wants_object = m_want->object; + vm_pageout_scan_wants_object = VM_PAGE_OBJECT(m_want); /* * force us to dump any collected free pages @@ -2437,11 +2974,14 @@ vm_pageout_scan(void) goto requeue_page; } - object = m->object; + object = m_object; vm_pageout_scan_wants_object = VM_OBJECT_NULL; try_failed = FALSE; } + assert(m_object == object); + assert(VM_PAGE_OBJECT(m) == m_object); + if (catch_up_count) catch_up_count--; @@ -2475,23 +3015,22 @@ vm_pageout_scan(void) */ vm_pageout_inactive_busy++; - if (page_prev_state == PAGE_STATE_CLEAN) + if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) vm_pageout_cleaned_busy++; requeue_page: - switch (page_prev_state) { - - case PAGE_STATE_SPECULATIVE: - case PAGE_STATE_ANONYMOUS: - case PAGE_STATE_CLEAN: - case PAGE_STATE_INACTIVE: - vm_page_enqueue_inactive(m, FALSE); - break; - - case PAGE_STATE_INACTIVE_FIRST: + if (requeue_insert_first) vm_page_enqueue_inactive(m, TRUE); - break; + else + vm_page_enqueue_inactive(m, FALSE); +#if CONFIG_BACKGROUND_QUEUE + if (ignore_reference == TRUE) { + if (m_object->internal) + vm_pageout_rejected_bq_internal++; + else + vm_pageout_rejected_bq_external++; } +#endif goto done_with_inactivepage; } @@ -2538,20 +3077,24 @@ vm_pageout_scan(void) if (m->tabled) vm_page_remove(m, TRUE); - assert(m->pageq.next == NULL && - m->pageq.prev == NULL); - m->pageq.next = (queue_entry_t)local_freeq; + assert(m->pageq.next == 0 && m->pageq.prev == 0); + m->snext = local_freeq; local_freeq = m; local_freed++; - if (page_prev_state == PAGE_STATE_SPECULATIVE) +#if CONFIG_SECLUDED_MEMORY + if (page_prev_q_state == VM_PAGE_ON_SECLUDED_Q) + vm_pageout_freed_from_secluded++; +#endif /* CONFIG_SECLUDED_MEMORY */ + if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) vm_pageout_freed_from_speculative++; - else if (page_prev_state == PAGE_STATE_CLEAN) + else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) 
vm_pageout_freed_from_cleaned++; else vm_pageout_freed_from_inactive_clean++; - if (page_prev_state != PAGE_STATE_SPECULATIVE) + if (page_prev_q_state != VM_PAGE_ON_SPECULATIVE_Q && + page_prev_q_state != VM_PAGE_ON_SECLUDED_Q) vm_pageout_stats[vm_pageout_stat_now].reclaimed++; inactive_burst_count = 0; @@ -2567,7 +3110,7 @@ vm_pageout_scan(void) if (object->purgable == VM_PURGABLE_EMPTY) { if (m->pmapped == TRUE) { /* unmap the page */ - refmod_state = pmap_disconnect(m->phys_page); + refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)); if (refmod_state & VM_MEM_MODIFIED) { SET_PAGE_DIRTY(m, FALSE); } @@ -2579,7 +3122,7 @@ vm_pageout_scan(void) goto reclaim_page; } - if (COMPRESSED_PAGER_IS_ACTIVE) { + if (VM_CONFIG_COMPRESSOR_IS_ACTIVE) { /* * With the VM compressor, the cost of * reclaiming a page is much lower (no I/O), @@ -2600,7 +3143,7 @@ vm_pageout_scan(void) /* just stick it back on! */ reactivated_this_call++; - if (page_prev_state == PAGE_STATE_CLEAN) + if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) vm_pageout_cleaned_volatile_reactivated++; goto reactivate_page; @@ -2638,7 +3181,7 @@ vm_pageout_scan(void) refmod_state = -1; if (m->reference == FALSE && m->pmapped == TRUE) { - refmod_state = pmap_get_refmod(m->phys_page); + refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m)); if (refmod_state & VM_MEM_REFERENCED) m->reference = TRUE; @@ -2648,7 +3191,7 @@ vm_pageout_scan(void) } /* - * if (m->cleaning && !m->pageout) + * if (m->cleaning && !m->free_when_done) * If already cleaning this page in place and it hasn't * been recently referenced, just pull off the queue. * We can leave the page mapped, and upl_commit_range @@ -2658,7 +3201,7 @@ vm_pageout_scan(void) * m->cleaning == TRUE * and we'll handle it here * - * if (m->pageout && !m->cleaning) + * if (m->free_when_done && !m->cleaning) * an msync INVALIDATE is in progress... 
* this page has been marked for destruction * after it has been cleaned, @@ -2666,17 +3209,17 @@ vm_pageout_scan(void) * where 'cleaning' will be set... * just leave it off the paging queues * - * if (m->pageout && m->clenaing) + * if (m->free_when_done && m->clenaing) * an msync INVALIDATE is in progress * and the UPL has already gathered this page... * just leave it off the paging queues */ /* - * page with m->pageout and still on the queues means that an + * page with m->free_when_done and still on the queues means that an * MS_INVALIDATE is in progress on this page... leave it alone */ - if (m->pageout) { + if (m->free_when_done) { goto done_with_inactivepage; } @@ -2692,10 +3235,35 @@ vm_pageout_scan(void) if (m->reference || m->dirty) { /* deal with a rogue "reusable" page */ - VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m); + VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, m_object); } +#if CONFIG_SECLUDED_MEMORY + if (secluded_for_filecache && + vm_page_secluded_target > 0 && + m_object->eligible_for_secluded && + secluded_aging_policy == SECLUDED_AGING_FIFO) { + /* + * SECLUDED_AGING_FIFO: + * This victim page is eligible for the secluded pool + * and we're not aging secluded pages, so let's not + * reactivate it if it's been re-referenced. + * Later on, we'll move it to the secluded queue + * instead of freeing it. 
+ */ + ignore_reference_secluded = TRUE; + } else { + ignore_reference_secluded = FALSE; + } +#endif /* CONFIG_SECLUDED_MEMORY */ + if (!m->no_cache && +#if CONFIG_BACKGROUND_QUEUE + ignore_reference == FALSE && +#endif +#if CONFIG_SECLUDED_MEMORY + ignore_reference_secluded == FALSE && +#endif /* CONFIG_SECLUDED_MEMORY */ (m->reference || (m->xpmapped && !object->internal && (vm_page_xpmapped_external_count < (vm_page_external_count / 4))))) { /* @@ -2715,7 +3283,7 @@ vm_pageout_scan(void) } else { uint32_t isinuse; - if (page_prev_state == PAGE_STATE_CLEAN) + if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) vm_pageout_cleaned_reference_reactivated++; reactivate_page: @@ -2736,9 +3304,20 @@ vm_pageout_scan(void) VM_STAT_INCR(reactivations); inactive_burst_count = 0; } - - if (page_prev_state == PAGE_STATE_CLEAN) +#if CONFIG_BACKGROUND_QUEUE + if (ignore_reference == TRUE) { + if (m_object->internal) + vm_pageout_rejected_bq_internal++; + else + vm_pageout_rejected_bq_external++; + } +#endif + if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) vm_pageout_cleaned_reactivated++; +#if CONFIG_SECLUDED_MEMORY + if (page_prev_q_state == VM_PAGE_ON_SECLUDED_Q) + vm_pageout_secluded_reactivated++; +#endif /* CONFIG_SECLUDED_MEMORY */ vm_pageout_inactive_used++; @@ -2750,14 +3329,11 @@ vm_pageout_scan(void) * the dirty bit. 
*/ if ((refmod_state == -1) && !m->dirty && m->pmapped) { - refmod_state = pmap_get_refmod(m->phys_page); + refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m)); if (refmod_state & VM_MEM_MODIFIED) { SET_PAGE_DIRTY(m, FALSE); } } - forced_reclaim = TRUE; - } else { - forced_reclaim = FALSE; } XPR(XPR_VM_PAGEOUT, @@ -2792,15 +3368,16 @@ vm_pageout_scan(void) } } throttle_inactive: - if (!VM_DYNAMIC_PAGING_ENABLED(memory_manager_default) && + if (!VM_DYNAMIC_PAGING_ENABLED() && object->internal && m->dirty && (object->purgable == VM_PURGABLE_DENY || object->purgable == VM_PURGABLE_NONVOLATILE || object->purgable == VM_PURGABLE_VOLATILE)) { vm_page_check_pageable_safe(m); - queue_enter(&vm_page_queue_throttled, m, - vm_page_t, pageq); - m->throttled = TRUE; + assert(m->vm_page_q_state == VM_PAGE_NOT_ON_Q); + vm_page_queue_enter(&vm_page_queue_throttled, m, + vm_page_t, pageq); + m->vm_page_q_state = VM_PAGE_ON_THROTTLED_Q; vm_page_throttled_count++; vm_pageout_scan_reclaimed_throttled++; @@ -2839,8 +3416,9 @@ vm_pageout_scan(void) vm_pageout_scan_inactive_throttled_external++; vm_page_check_pageable_safe(m); - queue_enter(&vm_page_queue_active, m, vm_page_t, pageq); - m->active = TRUE; + assert(m->vm_page_q_state == VM_PAGE_NOT_ON_Q); + vm_page_queue_enter(&vm_page_queue_active, m, vm_page_t, pageq); + m->vm_page_q_state = VM_PAGE_ON_ACTIVE_Q; vm_page_active_count++; vm_page_pageable_external_count++; @@ -2879,9 +3457,6 @@ vm_pageout_scan(void) inactive_burst_count = 0; goto done_with_inactivepage; } else { - if (page_prev_state == PAGE_STATE_SPECULATIVE) - page_prev_state = PAGE_STATE_INACTIVE; - vm_pageout_scan_inactive_throttled_internal++; goto must_activate_page; @@ -2910,19 +3485,15 @@ vm_pageout_scan(void) /* * Don't count this page as going into the compressor * if any of these are true: - * 1) We have the dynamic pager i.e. no compressed pager - * 2) Freezer enabled device with a freezer file to - * hold the app data i.e. 
no compressed pager - * 3) Freezer enabled device with compressed pager + * 1) compressed pager isn't enabled + * 2) Freezer enabled device with compressed pager * backend (exclusive use) i.e. most of the VM system * (including vm_pageout_scan) has no knowledge of * the compressor - * 4) This page belongs to a file and hence will not be + * 3) This page belongs to a file and hence will not be * sent into the compressor */ - if (DEFAULT_PAGER_IS_ACTIVE || - DEFAULT_FREEZER_IS_ACTIVE || - DEFAULT_FREEZER_COMPRESSED_PAGER_IS_SWAPLESS || + if ( !VM_CONFIG_COMPRESSOR_IS_ACTIVE || object->internal == FALSE) { pmap_options = 0; } else if (m->dirty || m->precious) { @@ -2946,7 +3517,7 @@ vm_pageout_scan(void) pmap_options = PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED; } - refmod_state = pmap_disconnect_options(m->phys_page, + refmod_state = pmap_disconnect_options(VM_PAGE_GET_PHYS_PAGE(m), pmap_options, NULL); if (refmod_state & VM_MEM_MODIFIED) { @@ -2964,17 +3535,47 @@ vm_pageout_scan(void) */ if (!m->dirty && !m->precious) { - if (page_prev_state == PAGE_STATE_SPECULATIVE) + if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) vm_pageout_speculative_clean++; else { - if (page_prev_state == PAGE_STATE_ANONYMOUS) + if (page_prev_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q) vm_pageout_inactive_anonymous++; - else if (page_prev_state == PAGE_STATE_CLEAN) + else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) vm_pageout_cleaned_reclaimed++; vm_pageout_inactive_clean++; } +#if CONFIG_SECLUDED_MEMORY + if (secluded_for_filecache && + vm_page_secluded_target > 0 && + !m->fictitious && + m_object->eligible_for_secluded && + num_tasks_can_use_secluded_mem == 0 && + (secluded_aging_policy == SECLUDED_AGING_FIFO || + ((secluded_aging_policy == + SECLUDED_AGING_AFTER_INACTIVE) && + (page_prev_q_state != VM_PAGE_ON_SECLUDED_Q)))) { + assert(page_prev_q_state != VM_PAGE_ON_SECLUDED_Q); + assert(m->vm_page_q_state == VM_PAGE_NOT_ON_Q); + LCK_MTX_ASSERT(&vm_page_queue_lock, + 
LCK_MTX_ASSERT_OWNED); + vm_page_queue_enter(&vm_page_queue_secluded, + m, + vm_page_t, + pageq); + m->vm_page_q_state = VM_PAGE_ON_SECLUDED_Q; + vm_object_unlock(m_object); + object = VM_OBJECT_NULL; + vm_page_secluded_count++; + vm_page_secluded_count_inuse++; + assert(!m_object->internal); +// vm_page_pageable_external_count++; + m = VM_PAGE_NULL; + goto done_with_inactivepage; + } +#endif /* CONFIG_SECLUDED_MEMORY */ + /* * OK, at this point we have found a page we are going to free. */ @@ -3018,7 +3619,7 @@ vm_pageout_scan(void) #endif /* CONFIG_JETSAM */ #endif /* VM_PRESSURE_EVENTS */ - if (page_prev_state == PAGE_STATE_ANONYMOUS) + if (page_prev_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q) vm_pageout_inactive_anonymous++; if (object->internal) vm_pageout_inactive_dirty_internal++; @@ -3031,7 +3632,7 @@ vm_pageout_scan(void) * anyway, so we may as well put it on the clean queue first and take it from there later * if necessary. that way, we'll ensure we don't free up too much. -mj */ - vm_pageout_cluster(m, FALSE, FALSE, FALSE); + vm_pageout_cluster(m, FALSE, FALSE); done_with_inactivepage: @@ -3085,7 +3686,7 @@ vm_page_free_reserve( { int free_after_reserve; - if (COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) { + if (VM_CONFIG_COMPRESSOR_IS_PRESENT) { if ((vm_page_free_reserved + pages + COMPRESSOR_FREE_RESERVED_LIMIT) >= (VM_PAGE_FREE_RESERVED_LIMIT + COMPRESSOR_FREE_RESERVED_LIMIT)) vm_page_free_reserved = VM_PAGE_FREE_RESERVED_LIMIT + COMPRESSOR_FREE_RESERVED_LIMIT; @@ -3109,286 +3710,71 @@ vm_page_free_reserve( vm_page_free_target = vm_page_free_reserved + VM_PAGE_FREE_TARGET(free_after_reserve); - if (vm_page_free_target > VM_PAGE_FREE_TARGET_LIMIT) - vm_page_free_target = VM_PAGE_FREE_TARGET_LIMIT; - - if (vm_page_free_target < vm_page_free_min + 5) - vm_page_free_target = vm_page_free_min + 5; - - vm_page_throttle_limit = vm_page_free_target - (vm_page_free_target / 2); -} - -/* - * vm_pageout is the high level pageout 
daemon. - */ - -void -vm_pageout_continue(void) -{ - DTRACE_VM2(pgrrun, int, 1, (uint64_t *), NULL); - vm_pageout_scan_event_counter++; - - lck_mtx_lock(&vm_page_queue_free_lock); - vm_pageout_running = TRUE; - lck_mtx_unlock(&vm_page_queue_free_lock); - - vm_pageout_scan(); - /* - * we hold both the vm_page_queue_free_lock - * and the vm_page_queues_lock at this point - */ - assert(vm_page_free_wanted == 0); - assert(vm_page_free_wanted_privileged == 0); - assert_wait((event_t) &vm_page_free_wanted, THREAD_UNINT); - - vm_pageout_running = FALSE; - if (vm_pageout_waiter) { - vm_pageout_waiter = FALSE; - thread_wakeup((event_t)&vm_pageout_waiter); - } - - lck_mtx_unlock(&vm_page_queue_free_lock); - vm_page_unlock_queues(); - - counter(c_vm_pageout_block++); - thread_block((thread_continue_t)vm_pageout_continue); - /*NOTREACHED*/ -} - -kern_return_t -vm_pageout_wait(uint64_t deadline) -{ - kern_return_t kr; - - lck_mtx_lock(&vm_page_queue_free_lock); - for (kr = KERN_SUCCESS; vm_pageout_running && (KERN_SUCCESS == kr); ) { - vm_pageout_waiter = TRUE; - if (THREAD_AWAKENED != lck_mtx_sleep_deadline( - &vm_page_queue_free_lock, LCK_SLEEP_DEFAULT, - (event_t) &vm_pageout_waiter, THREAD_UNINT, deadline)) { - kr = KERN_OPERATION_TIMED_OUT; - } - } - lck_mtx_unlock(&vm_page_queue_free_lock); - - return (kr); -} - - -#ifdef FAKE_DEADLOCK - -#define FAKE_COUNT 5000 - -int internal_count = 0; -int fake_deadlock = 0; - -#endif - -static void -vm_pageout_iothread_continue(struct vm_pageout_queue *q) -{ - vm_page_t m = NULL; - vm_object_t object; - vm_object_offset_t offset; - memory_object_t pager; - thread_t self = current_thread(); - - if ((vm_pageout_internal_iothread != THREAD_NULL) - && (self == vm_pageout_external_iothread ) - && (self->options & TH_OPT_VMPRIV)) - self->options &= ~TH_OPT_VMPRIV; - - vm_page_lockspin_queues(); - - while ( !queue_empty(&q->pgo_pending) ) { - - q->pgo_busy = TRUE; - queue_remove_first(&q->pgo_pending, m, vm_page_t, pageq); - if 
(m->object->object_slid) { - panic("slid page %p not allowed on this path\n", m); - } - VM_PAGE_CHECK(m); - m->pageout_queue = FALSE; - m->pageq.next = NULL; - m->pageq.prev = NULL; - - /* - * grab a snapshot of the object and offset this - * page is tabled in so that we can relookup this - * page after we've taken the object lock - these - * fields are stable while we hold the page queues lock - * but as soon as we drop it, there is nothing to keep - * this page in this object... we hold an activity_in_progress - * on this object which will keep it from terminating - */ - object = m->object; - offset = m->offset; - - vm_page_unlock_queues(); - -#ifdef FAKE_DEADLOCK - if (q == &vm_pageout_queue_internal) { - vm_offset_t addr; - int pg_count; - - internal_count++; - - if ((internal_count == FAKE_COUNT)) { - - pg_count = vm_page_free_count + vm_page_free_reserved; - - if (kmem_alloc(kernel_map, &addr, PAGE_SIZE * pg_count) == KERN_SUCCESS) { - kmem_free(kernel_map, addr, PAGE_SIZE * pg_count); - } - internal_count = 0; - fake_deadlock++; - } - } -#endif - vm_object_lock(object); - - m = vm_page_lookup(object, offset); - - if (m == NULL || - m->busy || m->cleaning || m->pageout_queue || !m->laundry) { - /* - * it's either the same page that someone else has - * started cleaning (or it's finished cleaning or - * been put back on the pageout queue), or - * the page has been freed or we have found a - * new page at this offset... in all of these cases - * we merely need to release the activity_in_progress - * we took when we put the page on the pageout queue - */ - vm_object_activity_end(object); - vm_object_unlock(object); - - vm_page_lockspin_queues(); - continue; - } - if (!object->pager_initialized) { - - /* - * If there is no memory object for the page, create - * one and hand it to the default pager. 
- */ - - if (!object->pager_initialized) - vm_object_collapse(object, - (vm_object_offset_t) 0, - TRUE); - if (!object->pager_initialized) - vm_object_pager_create(object); - if (!object->pager_initialized) { - /* - * Still no pager for the object. - * Reactivate the page. - * - * Should only happen if there is no - * default pager. - */ - m->pageout = FALSE; - - vm_page_lockspin_queues(); - - vm_pageout_throttle_up(m); - vm_page_activate(m); - vm_pageout_dirty_no_pager++; - - vm_page_unlock_queues(); - - /* - * And we are done with it. - */ - vm_object_activity_end(object); - vm_object_unlock(object); - - vm_page_lockspin_queues(); - continue; - } - } - pager = object->pager; - - if (pager == MEMORY_OBJECT_NULL) { - /* - * This pager has been destroyed by either - * memory_object_destroy or vm_object_destroy, and - * so there is nowhere for the page to go. - */ - if (m->pageout) { - /* - * Just free the page... VM_PAGE_FREE takes - * care of cleaning up all the state... - * including doing the vm_pageout_throttle_up - */ - VM_PAGE_FREE(m); - } else { - vm_page_lockspin_queues(); - - vm_pageout_throttle_up(m); - vm_page_activate(m); - - vm_page_unlock_queues(); - - /* - * And we are done with it. - */ - } - vm_object_activity_end(object); - vm_object_unlock(object); - - vm_page_lockspin_queues(); - continue; - } -#if 0 - /* - * we don't hold the page queue lock - * so this check isn't safe to make - */ - VM_PAGE_CHECK(m); -#endif - /* - * give back the activity_in_progress reference we - * took when we queued up this page and replace it - * it with a paging_in_progress reference that will - * also hold the paging offset from changing and - * prevent the object from terminating - */ - vm_object_activity_end(object); - vm_object_paging_begin(object); - vm_object_unlock(object); + if (vm_page_free_target > VM_PAGE_FREE_TARGET_LIMIT) + vm_page_free_target = VM_PAGE_FREE_TARGET_LIMIT; - /* - * Send the data to the pager. 
- * any pageout clustering happens there - */ - memory_object_data_return(pager, - m->offset + object->paging_offset, - PAGE_SIZE, - NULL, - NULL, - FALSE, - FALSE, - 0); + if (vm_page_free_target < vm_page_free_min + 5) + vm_page_free_target = vm_page_free_min + 5; - vm_object_lock(object); - vm_object_paging_end(object); - vm_object_unlock(object); + vm_page_throttle_limit = vm_page_free_target - (vm_page_free_target / 2); +} - vm_pageout_io_throttle(); +/* + * vm_pageout is the high level pageout daemon. + */ - vm_page_lockspin_queues(); +void +vm_pageout_continue(void) +{ + DTRACE_VM2(pgrrun, int, 1, (uint64_t *), NULL); + vm_pageout_scan_event_counter++; + + lck_mtx_lock(&vm_page_queue_free_lock); + vm_pageout_running = TRUE; + lck_mtx_unlock(&vm_page_queue_free_lock); + + vm_pageout_scan(); + /* + * we hold both the vm_page_queue_free_lock + * and the vm_page_queues_lock at this point + */ + assert(vm_page_free_wanted == 0); + assert(vm_page_free_wanted_privileged == 0); + assert_wait((event_t) &vm_page_free_wanted, THREAD_UNINT); + + vm_pageout_running = FALSE; + if (vm_pageout_waiter) { + vm_pageout_waiter = FALSE; + thread_wakeup((event_t)&vm_pageout_waiter); } - q->pgo_busy = FALSE; - q->pgo_idle = TRUE; - assert_wait((event_t) &q->pgo_pending, THREAD_UNINT); + lck_mtx_unlock(&vm_page_queue_free_lock); vm_page_unlock_queues(); - thread_block_parameter((thread_continue_t)vm_pageout_iothread_continue, (void *) q); + counter(c_vm_pageout_block++); + thread_block((thread_continue_t)vm_pageout_continue); /*NOTREACHED*/ } +kern_return_t +vm_pageout_wait(uint64_t deadline) +{ + kern_return_t kr; + + lck_mtx_lock(&vm_page_queue_free_lock); + for (kr = KERN_SUCCESS; vm_pageout_running && (KERN_SUCCESS == kr); ) { + vm_pageout_waiter = TRUE; + if (THREAD_AWAKENED != lck_mtx_sleep_deadline( + &vm_page_queue_free_lock, LCK_SLEEP_DEFAULT, + (event_t) &vm_pageout_waiter, THREAD_UNINT, deadline)) { + kr = KERN_OPERATION_TIMED_OUT; + } + } + 
lck_mtx_unlock(&vm_page_queue_free_lock); + + return (kr); +} + static void vm_pageout_iothread_external_continue(struct vm_pageout_queue *q) @@ -3404,18 +3790,13 @@ vm_pageout_iothread_external_continue(struct vm_pageout_queue *q) vm_page_lockspin_queues(); - while ( !queue_empty(&q->pgo_pending) ) { + while ( !vm_page_queue_empty(&q->pgo_pending) ) { q->pgo_busy = TRUE; - queue_remove_first(&q->pgo_pending, m, vm_page_t, pageq); - if (m->object->object_slid) { - panic("slid page %p not allowed on this path\n", m); - } - VM_PAGE_CHECK(m); - m->pageout_queue = FALSE; - m->pageq.next = NULL; - m->pageq.prev = NULL; + vm_page_queue_remove_first(&q->pgo_pending, m, vm_page_t, pageq); + assert(m->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q); + VM_PAGE_CHECK(m); /* * grab a snapshot of the object and offset this * page is tabled in so that we can relookup this @@ -3425,9 +3806,15 @@ vm_pageout_iothread_external_continue(struct vm_pageout_queue *q) * this page in this object... we hold an activity_in_progress * on this object which will keep it from terminating */ - object = m->object; + object = VM_PAGE_OBJECT(m); offset = m->offset; + if (object->object_slid) { + panic("slid page %p not allowed on this path\n", m); + } + m->vm_page_q_state = VM_PAGE_NOT_ON_Q; + VM_PAGE_ZERO_PAGEQ_ENTRY(m); + vm_page_unlock_queues(); vm_object_lock(object); @@ -3435,7 +3822,7 @@ vm_pageout_iothread_external_continue(struct vm_pageout_queue *q) m = vm_page_lookup(object, offset); if (m == NULL || - m->busy || m->cleaning || m->pageout_queue || !m->laundry) { + m->busy || m->cleaning || !m->laundry || (m->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q)) { /* * it's either the same page that someone else has * started cleaning (or it's finished cleaning or @@ -3459,7 +3846,7 @@ vm_pageout_iothread_external_continue(struct vm_pageout_queue *q) * memory_object_destroy or vm_object_destroy, and * so there is nowhere for the page to go. 
*/ - if (m->pageout) { + if (m->free_when_done) { /* * Just free the page... VM_PAGE_FREE takes * care of cleaning up all the state... @@ -3537,6 +3924,10 @@ vm_pageout_iothread_external_continue(struct vm_pageout_queue *q) uint32_t vm_compressor_failed; #define MAX_FREE_BATCH 32 +uint32_t vm_compressor_time_thread; /* Set via sysctl to record time accrued by + * this thread. + */ +uint64_t vm_compressor_thread_runtime; static void vm_pageout_iothread_internal_continue(struct cq *cq) @@ -3574,16 +3965,17 @@ vm_pageout_iothread_internal_continue(struct cq *cq) KERNEL_DEBUG(0xe0400018 | DBG_FUNC_START, q->pgo_laundry, 0, 0, 0, 0); - while ( !queue_empty(&q->pgo_pending) && local_cnt < local_batch_size) { - - queue_remove_first(&q->pgo_pending, m, vm_page_t, pageq); + while ( !vm_page_queue_empty(&q->pgo_pending) && local_cnt < local_batch_size) { + vm_page_queue_remove_first(&q->pgo_pending, m, vm_page_t, pageq); + assert(m->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q); VM_PAGE_CHECK(m); + + m->vm_page_q_state = VM_PAGE_NOT_ON_Q; + VM_PAGE_ZERO_PAGEQ_ENTRY(m); + m->laundry = FALSE; - m->pageout_queue = FALSE; - m->pageq.prev = NULL; - - m->pageq.next = (queue_entry_t)local_q; + m->snext = local_q; local_q = m; local_cnt++; } @@ -3611,12 +4003,12 @@ vm_pageout_iothread_internal_continue(struct cq *cq) KERNEL_DEBUG(0xe0400024 | DBG_FUNC_START, local_cnt, 0, 0, 0, 0); m = local_q; - local_q = (vm_page_t)m->pageq.next; - m->pageq.next = NULL; + local_q = m->snext; + m->snext = NULL; if (vm_pageout_compress_page(&cq->current_chead, cq->scratch_buf, m, FALSE) == KERN_SUCCESS) { - m->pageq.next = (queue_entry_t)local_freeq; + m->snext = local_freeq; local_freeq = m; local_freed++; @@ -3684,6 +4076,10 @@ vm_pageout_iothread_internal_continue(struct cq *cq) assert_wait((event_t) ((uintptr_t)&q->pgo_pending + cq->id), THREAD_UNINT); vm_page_unlock_queues(); + if (__improbable(vm_compressor_time_thread)) { + vm_compressor_thread_runtime = thread_get_runtime_self(); + } + 
KERNEL_DEBUG(0xe0400018 | DBG_FUNC_END, 0, 0, 0, 0, 0); thread_block_parameter((thread_continue_t)vm_pageout_iothread_internal_continue, (void *) cq); @@ -3700,7 +4096,7 @@ vm_pageout_immediate(vm_page_t m, boolean_t object_locked_by_caller) if (vm_pageout_compress_page(&vm_pageout_immediate_chead, vm_pageout_immediate_scratch_buf, m, object_locked_by_caller) == KERN_SUCCESS) { vm_page_free_prepare_object(m, TRUE); - vm_page_release(m); + vm_page_release(m, TRUE); } } @@ -3713,11 +4109,14 @@ vm_pageout_compress_page(void **current_chead, char *scratch_buf, vm_page_t m, b int compressed_count_delta; kern_return_t retval; - if (m->object->object_slid) { + object = VM_PAGE_OBJECT(m); + + if (object->object_slid) { panic("slid page %p not allowed on this path\n", m); } + assert(!m->free_when_done); + assert(!m->laundry); - object = m->object; pager = object->pager; if (object_locked_by_caller == FALSE && (!object->pager_initialized || pager == MEMORY_OBJECT_NULL)) { @@ -3736,16 +4135,17 @@ vm_pageout_compress_page(void **current_chead, char *scratch_buf, vm_page_t m, b if (!object->pager_initialized) vm_object_compressor_pager_create(object); - if (!object->pager_initialized) { + pager = object->pager; + + if (!object->pager_initialized || pager == MEMORY_OBJECT_NULL) { /* - * Still no pager for the object. + * Still no pager for the object, + * or the pager has been destroyed. * Reactivate the page. * * Should only happen if there is no * compression pager */ - m->pageout = FALSE; - m->laundry = FALSE; PAGE_WAKEUP_DONE(m); vm_page_lockspin_queues(); @@ -3761,38 +4161,6 @@ vm_pageout_compress_page(void **current_chead, char *scratch_buf, vm_page_t m, b return KERN_FAILURE; } - pager = object->pager; - - if (pager == MEMORY_OBJECT_NULL) { - /* - * This pager has been destroyed by either - * memory_object_destroy or vm_object_destroy, and - * so there is nowhere for the page to go. - */ - if (m->pageout) { - /* - * Just free the page... 
VM_PAGE_FREE takes - * care of cleaning up all the state... - * including doing the vm_pageout_throttle_up - */ - VM_PAGE_FREE(m); - } else { - m->laundry = FALSE; - PAGE_WAKEUP_DONE(m); - - vm_page_lockspin_queues(); - vm_page_activate(m); - vm_page_unlock_queues(); - - /* - * And we are done with it. - */ - } - vm_object_activity_end(object); - vm_object_unlock(object); - - return KERN_FAILURE; - } vm_object_unlock(object); KERNEL_DEBUG(0xe0400010 | DBG_FUNC_END, object, pager, 0, 0, 0); @@ -3805,7 +4173,7 @@ vm_pageout_compress_page(void **current_chead, char *scratch_buf, vm_page_t m, b retval = vm_compressor_pager_put( pager, m->offset + object->paging_offset, - m->phys_page, + VM_PAGE_GET_PHYS_PAGE(m), current_chead, scratch_buf, &compressed_count_delta); @@ -3814,7 +4182,7 @@ vm_pageout_compress_page(void **current_chead, char *scratch_buf, vm_page_t m, b vm_object_lock(object); assert(object->activity_in_progress > 0); - assert(m->object == object); + assert(VM_PAGE_OBJECT(m) == object); } vm_compressor_pager_count(pager, @@ -3822,8 +4190,7 @@ vm_pageout_compress_page(void **current_chead, char *scratch_buf, vm_page_t m, b FALSE, /* shared_lock */ object); - m->laundry = FALSE; - m->pageout = FALSE; + assert( !VM_PAGE_WIRED(m)); if (retval == KERN_SUCCESS) { /* @@ -3872,9 +4239,6 @@ vm_pageout_adjust_io_throttles(struct vm_pageout_queue *iq, struct vm_pageout_qu if (hibernate_cleaning_in_progress == TRUE) req_lowpriority = FALSE; - if ((DEFAULT_PAGER_IS_ACTIVE || DEFAULT_FREEZER_IS_ACTIVE) && iq->pgo_inited == TRUE && iq->pgo_lowpriority != req_lowpriority) - set_iq = TRUE; - if (eq->pgo_inited == TRUE && eq->pgo_lowpriority != req_lowpriority) set_eq = TRUE; @@ -3890,12 +4254,14 @@ vm_pageout_adjust_io_throttles(struct vm_pageout_queue *iq, struct vm_pageout_qu DTRACE_VM(laundryunthrottle); } if (set_iq == TRUE) { - proc_set_task_policy_thread(kernel_task, iq->pgo_tid, TASK_POLICY_EXTERNAL, TASK_POLICY_IO, policy); + 
proc_set_thread_policy_with_tid(kernel_task, iq->pgo_tid, + TASK_POLICY_EXTERNAL, TASK_POLICY_IO, policy); iq->pgo_lowpriority = req_lowpriority; } if (set_eq == TRUE) { - proc_set_task_policy_thread(kernel_task, eq->pgo_tid, TASK_POLICY_EXTERNAL, TASK_POLICY_IO, policy); + proc_set_thread_policy_with_tid(kernel_task, eq->pgo_tid, + TASK_POLICY_EXTERNAL, TASK_POLICY_IO, policy); eq->pgo_lowpriority = req_lowpriority; } @@ -3911,10 +4277,10 @@ vm_pageout_iothread_external(void) self->options |= TH_OPT_VMPRIV; - DTRACE_VM2(laundrythrottle, int, 1, (uint64_t *), NULL); + DTRACE_VM2(laundrythrottle, int, 1, (uint64_t *), NULL); - proc_set_task_policy_thread(kernel_task, self->thread_id, TASK_POLICY_EXTERNAL, - TASK_POLICY_IO, THROTTLE_LEVEL_PAGEOUT_THROTTLED); + proc_set_thread_policy(self, TASK_POLICY_EXTERNAL, + TASK_POLICY_IO, THROTTLE_LEVEL_PAGEOUT_THROTTLED); vm_page_lock_queues(); @@ -3924,10 +4290,7 @@ vm_pageout_iothread_external(void) vm_page_unlock_queues(); - if (COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) - vm_pageout_iothread_external_continue(&vm_pageout_queue_external); - else - vm_pageout_iothread_continue(&vm_pageout_queue_external); + vm_pageout_iothread_external_continue(&vm_pageout_queue_external); /*NOTREACHED*/ } @@ -3940,12 +4303,6 @@ vm_pageout_iothread_internal(struct cq *cq) self->options |= TH_OPT_VMPRIV; - if (DEFAULT_PAGER_IS_ACTIVE || DEFAULT_FREEZER_IS_ACTIVE) { - DTRACE_VM2(laundrythrottle, int, 1, (uint64_t *), NULL); - - proc_set_task_policy_thread(kernel_task, self->thread_id, TASK_POLICY_EXTERNAL, - TASK_POLICY_IO, THROTTLE_LEVEL_PAGEOUT_THROTTLED); - } vm_page_lock_queues(); vm_pageout_queue_internal.pgo_tid = self->thread_id; @@ -3954,14 +4311,10 @@ vm_pageout_iothread_internal(struct cq *cq) vm_page_unlock_queues(); - if (COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) { - - if (vm_restricted_to_single_processor == TRUE) - thread_vm_bind_group_add(); + if 
(vm_restricted_to_single_processor == TRUE) + thread_vm_bind_group_add(); - vm_pageout_iothread_internal_continue(cq); - } else - vm_pageout_iothread_continue(&vm_pageout_queue_internal); + vm_pageout_iothread_internal_continue(cq); /*NOTREACHED*/ } @@ -3990,17 +4343,21 @@ vm_pressure_response(void) vm_pressure_level_t old_level = kVMPressureNormal; int new_level = -1; - + unsigned int total_pages; uint64_t available_memory = 0; if (vm_pressure_events_enabled == FALSE) return; - available_memory = (((uint64_t) AVAILABLE_NON_COMPRESSED_MEMORY) * 100); + available_memory = (uint64_t) AVAILABLE_NON_COMPRESSED_MEMORY; - memorystatus_level = (unsigned int) (available_memory / atop_64(max_mem)); + total_pages = (unsigned int) atop_64(max_mem); +#if CONFIG_SECLUDED_MEMORY + total_pages -= vm_page_secluded_count; +#endif /* CONFIG_SECLUDED_MEMORY */ + memorystatus_level = (unsigned int) ((available_memory * 100) / total_pages); if (memorystatus_manual_testing_on) { return; @@ -4134,7 +4491,7 @@ uint32_t vm_pageout_considered_page_last = 0; * called once per-second via "compute_averages" */ void -compute_pageout_gc_throttle() +compute_pageout_gc_throttle(__unused void *arg) { if (vm_pageout_considered_page != vm_pageout_considered_page_last) { @@ -4167,7 +4524,7 @@ vm_pageout_garbage_collect(int collect) * consider_zone_gc should be last, because the other operations * might return memory to zones. 
*/ - consider_zone_gc(buf_large_zfree); + consider_zone_gc(); } first_try = FALSE; @@ -4182,31 +4539,22 @@ vm_pageout_garbage_collect(int collect) } -void vm_pageout_reinit_tuneables(void); - -void -vm_pageout_reinit_tuneables(void) -{ - - vm_compressor_minorcompact_threshold_divisor = 18; - vm_compressor_majorcompact_threshold_divisor = 22; - vm_compressor_unthrottle_threshold_divisor = 32; -} - - #if VM_PAGE_BUCKETS_CHECK #if VM_PAGE_FAKE_BUCKETS extern vm_map_offset_t vm_page_fake_buckets_start, vm_page_fake_buckets_end; #endif /* VM_PAGE_FAKE_BUCKETS */ #endif /* VM_PAGE_BUCKETS_CHECK */ + #define FBDP_TEST_COLLAPSE_COMPRESSOR 0 +#define FBDP_TEST_WIRE_AND_EXTRACT 0 +#define FBDP_TEST_PAGE_WIRE_OVERFLOW 0 + #if FBDP_TEST_COLLAPSE_COMPRESSOR extern boolean_t vm_object_collapse_compressor_allowed; #include #endif /* FBDP_TEST_COLLAPSE_COMPRESSOR */ -#define FBDP_TEST_WIRE_AND_EXTRACT 0 #if FBDP_TEST_WIRE_AND_EXTRACT extern ledger_template_t task_ledger_template; #include @@ -4328,7 +4676,7 @@ vm_pageout(void) vm_page_free_reserve(0); - queue_init(&vm_pageout_queue_external.pgo_pending); + vm_page_queue_init(&vm_pageout_queue_external.pgo_pending); vm_pageout_queue_external.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX; vm_pageout_queue_external.pgo_laundry = 0; vm_pageout_queue_external.pgo_idle = FALSE; @@ -4339,7 +4687,7 @@ vm_pageout(void) vm_pageout_queue_external.pgo_tid = -1; vm_pageout_queue_external.pgo_inited = FALSE; - queue_init(&vm_pageout_queue_internal.pgo_pending); + vm_page_queue_init(&vm_pageout_queue_internal.pgo_pending); vm_pageout_queue_internal.pgo_maxlaundry = 0; vm_pageout_queue_internal.pgo_laundry = 0; vm_pageout_queue_internal.pgo_idle = FALSE; @@ -4381,8 +4729,51 @@ vm_pageout(void) #endif vm_object_reaper_init(); - - if (COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) + + + bzero(&vm_config, sizeof(vm_config)); + + switch(vm_compressor_mode) { + + case VM_PAGER_DEFAULT: + printf("mapping deprecated VM_PAGER_DEFAULT to 
VM_PAGER_COMPRESSOR_WITH_SWAP\n"); + + case VM_PAGER_COMPRESSOR_WITH_SWAP: + vm_config.compressor_is_present = TRUE; + vm_config.swap_is_present = TRUE; + vm_config.compressor_is_active = TRUE; + vm_config.swap_is_active = TRUE; + break; + + case VM_PAGER_COMPRESSOR_NO_SWAP: + vm_config.compressor_is_present = TRUE; + vm_config.swap_is_present = TRUE; + vm_config.compressor_is_active = TRUE; + break; + + case VM_PAGER_FREEZER_DEFAULT: + printf("mapping deprecated VM_PAGER_FREEZER_DEFAULT to VM_PAGER_FREEZER_COMPRESSOR_NO_SWAP\n"); + + case VM_PAGER_FREEZER_COMPRESSOR_NO_SWAP: + vm_config.compressor_is_present = TRUE; + vm_config.swap_is_present = TRUE; + break; + + case VM_PAGER_COMPRESSOR_NO_SWAP_PLUS_FREEZER_COMPRESSOR_WITH_SWAP: + vm_config.compressor_is_present = TRUE; + vm_config.swap_is_present = TRUE; + vm_config.compressor_is_active = TRUE; + vm_config.freezer_swap_is_active = TRUE; + break; + + case VM_PAGER_NOT_CONFIGURED: + break; + + default: + printf("unknown compressor mode - %x\n", vm_compressor_mode); + break; + } + if (VM_CONFIG_COMPRESSOR_IS_PRESENT) vm_compressor_pager_init(); #if VM_PRESSURE_EVENTS @@ -4622,7 +5013,7 @@ vm_pageout(void) cur_offset += PAGE_SIZE) { kr = vm_map_wire_and_extract(wire_map, wire_addr + cur_offset, - VM_PROT_DEFAULT | VM_PROT_MEMORY_TAG_MAKE(VM_KERN_MEMORY_OSFMK)), + VM_PROT_DEFAULT | VM_PROT_MEMORY_TAG_MAKE(VM_KERN_MEMORY_OSFMK), TRUE, &wire_ppnum); assert(kr == KERN_SUCCESS); @@ -4660,6 +5051,25 @@ vm_pageout(void) printf("FBDP_TEST_WIRE_AND_EXTRACT: PASS\n"); #endif /* FBDP_TEST_WIRE_AND_EXTRACT */ +#if FBDP_TEST_PAGE_WIRE_OVERFLOW + vm_object_t fbdp_object; + vm_page_t fbdp_page; + + printf("FBDP_TEST_PAGE_WIRE_OVERFLOW: starting...\n"); + + fbdp_object = vm_object_allocate(PAGE_SIZE); + vm_object_lock(fbdp_object); + fbdp_page = vm_page_alloc(fbdp_object, 0x0); + vm_page_lock_queues(); + do { + vm_page_wire(fbdp_page, 1, FALSE); + } while (fbdp_page->wire_count != 0); + vm_page_unlock_queues(); + 
vm_object_unlock(fbdp_object); + panic("FBDP(%p,%p): wire_count overflow not detected\n", + fbdp_object, fbdp_page); +#endif /* FBDP_TEST_PAGE_WIRE_OVERFLOW */ + vm_pageout_continue(); /* @@ -4696,45 +5106,37 @@ vm_pageout_internal_start(void) kern_return_t result; int i; host_basic_info_data_t hinfo; - int thread_count; + assert (VM_CONFIG_COMPRESSOR_IS_PRESENT); - if (COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) { - mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT; + mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT; #define BSD_HOST 1 - host_info((host_t)BSD_HOST, HOST_BASIC_INFO, (host_info_t)&hinfo, &count); - - assert(hinfo.max_cpus > 0); + host_info((host_t)BSD_HOST, HOST_BASIC_INFO, (host_info_t)&hinfo, &count); - if (vm_compressor_thread_count >= hinfo.max_cpus) - vm_compressor_thread_count = hinfo.max_cpus - 1; - if (vm_compressor_thread_count <= 0) - vm_compressor_thread_count = 1; - else if (vm_compressor_thread_count > MAX_COMPRESSOR_THREAD_COUNT) - vm_compressor_thread_count = MAX_COMPRESSOR_THREAD_COUNT; + assert(hinfo.max_cpus > 0); - if (vm_compressor_immediate_preferred == TRUE) { - vm_pageout_immediate_chead = NULL; - vm_pageout_immediate_scratch_buf = kalloc(COMPRESSOR_SCRATCH_BUF_SIZE); + if (vm_compressor_thread_count >= hinfo.max_cpus) + vm_compressor_thread_count = hinfo.max_cpus - 1; + if (vm_compressor_thread_count <= 0) + vm_compressor_thread_count = 1; + else if (vm_compressor_thread_count > MAX_COMPRESSOR_THREAD_COUNT) + vm_compressor_thread_count = MAX_COMPRESSOR_THREAD_COUNT; - vm_compressor_thread_count = 1; - } - thread_count = vm_compressor_thread_count; + if (vm_compressor_immediate_preferred == TRUE) { + vm_pageout_immediate_chead = NULL; + vm_pageout_immediate_scratch_buf = kalloc(vm_compressor_get_encode_scratch_size()); - vm_pageout_queue_internal.pgo_maxlaundry = (vm_compressor_thread_count * 4) * VM_PAGE_LAUNDRY_MAX; - } else { - vm_compressor_thread_count = 0; - thread_count = 1; - 
vm_pageout_queue_internal.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX; + vm_compressor_thread_count = 1; } + vm_pageout_queue_internal.pgo_maxlaundry = (vm_compressor_thread_count * 4) * VM_PAGE_LAUNDRY_MAX; + for (i = 0; i < vm_compressor_thread_count; i++) { ciq[i].id = i; ciq[i].q = &vm_pageout_queue_internal; ciq[i].current_chead = NULL; ciq[i].scratch_buf = kalloc(COMPRESSOR_SCRATCH_BUF_SIZE); - } - for (i = 0; i < thread_count; i++) { + result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_internal, (void *)&ciq[i], BASEPRI_PREEMPT - 1, &vm_pageout_internal_iothread); if (result == KERN_SUCCESS) @@ -4817,7 +5219,6 @@ upl_create(int type, int flags, upl_size_t size) bzero((char *)upl + upl_size, page_field_size); upl->flags = upl_flags | flags; - upl->src_object = NULL; upl->kaddr = (vm_offset_t)0; upl->size = 0; upl->map_object = NULL; @@ -5078,6 +5479,8 @@ vm_object_upl_request( int dw_count; int dw_limit; int io_tracking_flag = 0; + int grab_options; + ppnum_t phys_page; if (cntrl_flags & ~UPL_VALID_FLAGS) { /* @@ -5181,6 +5584,13 @@ vm_object_upl_request( vm_object_lock(object); vm_object_activity_begin(object); + grab_options = 0; +#if CONFIG_SECLUDED_MEMORY + if (object->can_grab_secluded) { + grab_options |= VM_PAGE_GRAB_SECLUDED; + } +#endif /* CONFIG_SECLUDED_MEMORY */ + /* * we can lock in the paging_offset once paging_in_progress is set */ @@ -5275,6 +5685,8 @@ vm_object_upl_request( goto try_next_page; } + phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page); + /* * grab this up front... 
* a high percentange of the time we're going to @@ -5283,11 +5695,11 @@ vm_object_upl_request( * the pmap layer by grabbing it here and recording it */ if (dst_page->pmapped) - refmod_state = pmap_get_refmod(dst_page->phys_page); + refmod_state = pmap_get_refmod(phys_page); else refmod_state = 0; - if ( (refmod_state & VM_MEM_REFERENCED) && dst_page->inactive ) { + if ( (refmod_state & VM_MEM_REFERENCED) && VM_PAGE_INACTIVE(dst_page)) { /* * page is on inactive list and referenced... * reactivate it now... this gets it out of the @@ -5319,8 +5731,9 @@ vm_object_upl_request( * can't have been referenced recently... */ if ( (hibernate_cleaning_in_progress == TRUE || - (!((refmod_state & VM_MEM_REFERENCED) || dst_page->reference) || dst_page->throttled)) && - ((refmod_state & VM_MEM_MODIFIED) || dst_page->dirty || dst_page->precious) ) { + (!((refmod_state & VM_MEM_REFERENCED) || dst_page->reference) || + (dst_page->vm_page_q_state == VM_PAGE_ON_THROTTLED_Q))) && + ((refmod_state & VM_MEM_MODIFIED) || dst_page->dirty || dst_page->precious) ) { goto check_busy; } dont_return: @@ -5350,8 +5763,9 @@ vm_object_upl_request( check_busy: if (dst_page->busy) { if (cntrl_flags & UPL_NOBLOCK) { - if (user_page_list) + if (user_page_list) user_page_list[entry].phys_addr = 0; + dwp->dw_mask = 0; goto try_next_page; } @@ -5389,11 +5803,11 @@ vm_object_upl_request( */ dst_page->busy = was_busy; } - if (dst_page->pageout_queue == TRUE) { + if (dst_page->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q) { vm_page_lockspin_queues(); - if (dst_page->pageout_queue == TRUE) { + if (dst_page->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q) { /* * we've buddied up a page for a clustered pageout * that has already been moved to the pageout @@ -5414,7 +5828,7 @@ vm_object_upl_request( */ if (dst_page->pageout) encountered_lrp = TRUE; - if ((dst_page->dirty || (dst_page->object->internal && dst_page->precious))) { + if ((dst_page->dirty || (object->internal && dst_page->precious))) { if (encountered_lrp) 
CLUSTER_STAT(pages_at_higher_offsets++;) else @@ -5424,10 +5838,10 @@ vm_object_upl_request( hw_dirty = refmod_state & VM_MEM_MODIFIED; dirty = hw_dirty ? TRUE : dst_page->dirty; - if (dst_page->phys_page > upl->highest_page) - upl->highest_page = dst_page->phys_page; + if (phys_page > upl->highest_page) + upl->highest_page = phys_page; - assert (!pmap_is_noencrypt(dst_page->phys_page)); + assert (!pmap_is_noencrypt(phys_page)); if (cntrl_flags & UPL_SET_LITE) { unsigned int pg_num; @@ -5437,7 +5851,7 @@ vm_object_upl_request( lite_list[pg_num>>5] |= 1 << (pg_num & 31); if (hw_dirty) - pmap_clear_modify(dst_page->phys_page); + pmap_clear_modify(phys_page); /* * Mark original page as cleaning @@ -5458,13 +5872,6 @@ vm_object_upl_request( alias_page->absent = FALSE; alias_page = NULL; } -#if MACH_PAGEMAP - /* - * Record that this page has been - * written out - */ - vm_external_state_set(object->existence_map, dst_page->offset); -#endif /*MACH_PAGEMAP*/ if (dirty) { SET_PAGE_DIRTY(dst_page, FALSE); } else { @@ -5494,7 +5901,7 @@ vm_object_upl_request( } if ( !(cntrl_flags & UPL_CLEAN_IN_PLACE) ) { if ( !VM_PAGE_WIRED(dst_page)) - dst_page->pageout = TRUE; + dst_page->free_when_done = TRUE; } } else { if ((cntrl_flags & UPL_WILL_MODIFY) && object->copy != last_copy_object) { @@ -5566,11 +5973,8 @@ vm_object_upl_request( continue; } - if (dst_page->laundry) { - dst_page->pageout = FALSE; - + if (dst_page->laundry) vm_pageout_steal_laundry(dst_page, FALSE); - } } else { if (object->private) { /* @@ -5602,7 +6006,8 @@ vm_object_upl_request( dst_page = vm_object_page_grab(object); if (dst_page != VM_PAGE_NULL) - vm_page_release(dst_page); + vm_page_release(dst_page, + FALSE); dst_page = vm_object_page_grab(object); } @@ -5610,7 +6015,7 @@ vm_object_upl_request( /* * need to allocate a page */ - dst_page = vm_page_grab(); + dst_page = vm_page_grab_options(grab_options); } if (dst_page == VM_PAGE_NULL) { if ( (cntrl_flags & (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) == 
(UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) { @@ -5664,6 +6069,8 @@ vm_object_upl_request( VM_STAT_INCR(pageins); } } + phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page); + /* * ENCRYPTED SWAP: */ @@ -5688,9 +6095,9 @@ vm_object_upl_request( * eliminate all mappings from the * original object and its prodigy */ - refmod_state = pmap_disconnect(dst_page->phys_page); + refmod_state = pmap_disconnect(phys_page); else - refmod_state = pmap_get_refmod(dst_page->phys_page); + refmod_state = pmap_get_refmod(phys_page); } else refmod_state = 0; @@ -5705,7 +6112,7 @@ vm_object_upl_request( lite_list[pg_num>>5] |= 1 << (pg_num & 31); if (hw_dirty) - pmap_clear_modify(dst_page->phys_page); + pmap_clear_modify(phys_page); /* * Mark original page as cleaning @@ -5769,7 +6176,7 @@ vm_object_upl_request( dwp->dw_mask |= DW_set_reference; } if (cntrl_flags & UPL_PRECIOUS) { - if (dst_page->object->internal) { + if (object->internal) { SET_PAGE_DIRTY(dst_page, FALSE); dst_page->precious = FALSE; } else { @@ -5782,19 +6189,19 @@ vm_object_upl_request( if (dst_page->busy) upl->flags |= UPL_HAS_BUSY; - if (dst_page->phys_page > upl->highest_page) - upl->highest_page = dst_page->phys_page; - assert (!pmap_is_noencrypt(dst_page->phys_page)); + if (phys_page > upl->highest_page) + upl->highest_page = phys_page; + assert (!pmap_is_noencrypt(phys_page)); if (user_page_list) { - user_page_list[entry].phys_addr = dst_page->phys_page; - user_page_list[entry].pageout = dst_page->pageout; + user_page_list[entry].phys_addr = phys_page; + user_page_list[entry].free_when_done = dst_page->free_when_done; user_page_list[entry].absent = dst_page->absent; user_page_list[entry].dirty = dst_page->dirty; user_page_list[entry].precious = dst_page->precious; user_page_list[entry].device = FALSE; user_page_list[entry].needed = FALSE; if (dst_page->clustered == TRUE) - user_page_list[entry].speculative = dst_page->speculative; + user_page_list[entry].speculative = (dst_page->vm_page_q_state == VM_PAGE_ON_SPECULATIVE_Q) ? 
TRUE : FALSE; else user_page_list[entry].speculative = FALSE; user_page_list[entry].cs_validated = dst_page->cs_validated; @@ -5940,6 +6347,8 @@ vm_map_create_upl( vm_map_offset_t local_start; kern_return_t ret; + assert(page_aligned(offset)); + caller_flags = *flags; if (caller_flags & ~UPL_VALID_FLAGS) { @@ -5983,24 +6392,6 @@ vm_map_create_upl( return KERN_SUCCESS; } - if (entry->is_sub_map) { - vm_map_t submap; - - submap = VME_SUBMAP(entry); - local_start = entry->vme_start; - local_offset = VME_OFFSET(entry); - - vm_map_reference(submap); - vm_map_unlock_read(map); - - ret = vm_map_create_upl(submap, - local_offset + (offset - local_start), - upl_size, upl, page_list, count, flags); - vm_map_deallocate(submap); - - return ret; - } - if (VME_OBJECT(entry) == VM_OBJECT_NULL || !VME_OBJECT(entry)->phys_contiguous) { if (*upl_size > MAX_UPL_SIZE_BYTES) @@ -6030,16 +6421,19 @@ vm_map_create_upl( return KERN_PROTECTION_FAILURE; } + local_object = VME_OBJECT(entry); assert(local_object != VM_OBJECT_NULL); - if (*upl_size != 0 && + if (!entry->is_sub_map && + !entry->needs_copy && + *upl_size != 0 && local_object->vo_size > *upl_size && /* partial UPL */ entry->wired_count == 0 && /* No COW for entries that are wired */ (map->pmap != kernel_pmap) && /* alias checks */ (vm_map_entry_should_cow_for_true_share(entry) /* case 1 */ || - (!entry->needs_copy && /* case 2 */ + (/* case 2 */ local_object->internal && (local_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) && local_object->ref_count > 1))) { @@ -6164,6 +6558,24 @@ vm_map_create_upl( goto REDISCOVER_ENTRY; } + if (entry->is_sub_map) { + vm_map_t submap; + + submap = VME_SUBMAP(entry); + local_start = entry->vme_start; + local_offset = VME_OFFSET(entry); + + vm_map_reference(submap); + vm_map_unlock_read(map); + + ret = vm_map_create_upl(submap, + local_offset + (offset - local_start), + upl_size, upl, page_list, count, flags); + vm_map_deallocate(submap); + + return ret; + } + if (sync_cow_data && 
(VME_OBJECT(entry)->shadow || VME_OBJECT(entry)->copy)) { @@ -6223,6 +6635,7 @@ vm_map_create_upl( local_offset = VME_OFFSET(entry); local_start = entry->vme_start; + vm_object_lock(local_object); /* @@ -6415,14 +6828,14 @@ vm_map_enter_upl( assert(alias_page->fictitious); alias_page->fictitious = FALSE; alias_page->private = TRUE; - alias_page->pageout = TRUE; + alias_page->free_when_done = TRUE; /* * since m is a page in the upl it must * already be wired or BUSY, so it's * safe to assign the underlying physical * page to the alias */ - alias_page->phys_page = m->phys_page; + VM_PAGE_SET_PHYS_PAGE(alias_page, VM_PAGE_GET_PHYS_PAGE(m)); vm_object_unlock(object); @@ -6433,7 +6846,7 @@ vm_map_enter_upl( /* * ENCRYPTED SWAP: * The virtual page ("m") has to be wired in some way - * here or its physical page ("m->phys_page") could + * here or its backing physical page could * be recycled at any time. * Assuming this is enforced by the caller, we can't * get an encrypted page here. Since the encryption @@ -6478,6 +6891,7 @@ vm_map_enter_upl( VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT); if (kr != KERN_SUCCESS) { + vm_object_deallocate(upl->map_object); upl_unlock(upl); return(kr); } @@ -6630,6 +7044,7 @@ vm_map_remove_upl( return KERN_FAILURE; } + kern_return_t upl_commit_range( upl_t upl, @@ -6643,6 +7058,7 @@ upl_commit_range( upl_size_t xfer_size, subupl_size = size; vm_object_t shadow_object; vm_object_t object; + vm_object_t m_object; vm_object_offset_t target_offset; upl_offset_t subupl_offset = offset; int entry; @@ -6664,7 +7080,7 @@ upl_commit_range( int throttle_page = 0; int unwired_count = 0; int local_queue_count = 0; - queue_head_t local_queue; + vm_page_t first_local, last_local; *empty = FALSE; @@ -6774,7 +7190,7 @@ upl_commit_range( */ flags &= ~UPL_COMMIT_CS_VALIDATED; } - if (!VM_DYNAMIC_PAGING_ENABLED(memory_manager_default) && shadow_object->internal) + if (!VM_DYNAMIC_PAGING_ENABLED() && shadow_object->internal) should_be_throttled = TRUE; dwp 
= &dw_array[0]; @@ -6787,15 +7203,15 @@ upl_commit_range( shadow_object->purgable != VM_PURGABLE_VOLATILE && shadow_object->purgable != VM_PURGABLE_EMPTY) { - if (!queue_empty(&shadow_object->memq)) { - queue_init(&local_queue); + if (!vm_page_queue_empty(&shadow_object->memq)) { + if (size == shadow_object->vo_size) { - nxt_page = (vm_page_t)queue_first(&shadow_object->memq); + nxt_page = (vm_page_t)vm_page_queue_first(&shadow_object->memq); fast_path_full_commit = 1; } fast_path_possible = 1; - if (!VM_DYNAMIC_PAGING_ENABLED(memory_manager_default) && shadow_object->internal && + if (!VM_DYNAMIC_PAGING_ENABLED() && shadow_object->internal && (shadow_object->purgable == VM_PURGABLE_DENY || shadow_object->purgable == VM_PURGABLE_NONVOLATILE || shadow_object->purgable == VM_PURGABLE_VOLATILE)) { @@ -6803,6 +7219,8 @@ upl_commit_range( } } } + first_local = VM_PAGE_NULL; + last_local = VM_PAGE_NULL; while (xfer_size) { vm_page_t t, m; @@ -6817,7 +7235,7 @@ upl_commit_range( if (nxt_page != VM_PAGE_NULL) { m = nxt_page; - nxt_page = (vm_page_t)queue_next(&nxt_page->listq); + nxt_page = (vm_page_t)vm_page_queue_next(&nxt_page->listq); target_offset = m->offset; } pg_num = (unsigned int) (target_offset/PAGE_SIZE); @@ -6834,7 +7252,7 @@ upl_commit_range( if (upl->flags & UPL_SHADOWED) { if ((t = vm_page_lookup(object, target_offset)) != VM_PAGE_NULL) { - t->pageout = FALSE; + t->free_when_done = FALSE; VM_PAGE_FREE(t); @@ -6845,7 +7263,9 @@ upl_commit_range( if (m == VM_PAGE_NULL) goto commit_next_page; - if (m->compressor) { + m_object = VM_PAGE_OBJECT(m); + + if (m->vm_page_q_state == VM_PAGE_USED_BY_COMPRESSOR) { assert(m->busy); dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP); @@ -6893,7 +7313,7 @@ upl_commit_range( #if DEVELOPMENT || DEBUG vm_cs_validated_resets++; #endif - pmap_disconnect(m->phys_page); + pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)); } clear_refmod |= VM_MEM_MODIFIED; } @@ -6906,9 +7326,10 @@ upl_commit_range( dwp->dw_mask |= (DW_clear_busy | 
DW_PAGE_WAKEUP); } if (fast_path_possible) { - assert(m->object->purgable != VM_PURGABLE_EMPTY); - assert(m->object->purgable != VM_PURGABLE_VOLATILE); + assert(m_object->purgable != VM_PURGABLE_EMPTY); + assert(m_object->purgable != VM_PURGABLE_VOLATILE); if (m->absent) { + assert(m->vm_page_q_state == VM_PAGE_NOT_ON_Q); assert(m->wire_count == 0); assert(m->busy); @@ -6917,29 +7338,48 @@ upl_commit_range( } else { if (m->wire_count == 0) panic("wire_count == 0, m = %p, obj = %p\n", m, shadow_object); + assert(m->vm_page_q_state == VM_PAGE_IS_WIRED); /* * XXX FBDP need to update some other * counters here (purgeable_wired_count) * (ledgers), ... */ - assert(m->wire_count); + assert(m->wire_count > 0); m->wire_count--; - if (m->wire_count == 0) + if (m->wire_count == 0) { + m->vm_page_q_state = VM_PAGE_NOT_ON_Q; unwired_count++; + } } if (m->wire_count == 0) { - queue_enter(&local_queue, m, vm_page_t, pageq); + assert(m->pageq.next == 0 && m->pageq.prev == 0); + + if (last_local == VM_PAGE_NULL) { + assert(first_local == VM_PAGE_NULL); + + last_local = m; + first_local = m; + } else { + assert(first_local != VM_PAGE_NULL); + + m->pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_local); + first_local->pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(m); + first_local = m; + } local_queue_count++; if (throttle_page) { - m->throttled = TRUE; + m->vm_page_q_state = VM_PAGE_ON_THROTTLED_Q; } else { - if (flags & UPL_COMMIT_INACTIVATE) - m->inactive = TRUE; - else - m->active = TRUE; + if (flags & UPL_COMMIT_INACTIVATE) { + if (shadow_object->internal) + m->vm_page_q_state = VM_PAGE_ON_INACTIVE_INTERNAL_Q; + else + m->vm_page_q_state = VM_PAGE_ON_INACTIVE_EXTERNAL_Q; + } else + m->vm_page_q_state = VM_PAGE_ON_ACTIVE_Q; } } } else { @@ -6962,7 +7402,7 @@ upl_commit_range( } goto commit_next_page; } - assert(!m->compressor); + assert(m->vm_page_q_state != VM_PAGE_USED_BY_COMPRESSOR); if (page_list) page_list[entry].phys_addr = 0; @@ -6983,7 +7423,7 @@ upl_commit_range( 
dwp->dw_mask |= DW_vm_pageout_throttle_up; if (VM_PAGE_WIRED(m)) - m->pageout = FALSE; + m->free_when_done = FALSE; if (! (flags & UPL_COMMIT_CS_VALIDATED) && m->cs_validated && !m->cs_tainted) { @@ -7003,7 +7443,7 @@ upl_commit_range( #if DEVELOPMENT || DEBUG vm_cs_validated_resets++; #endif - pmap_disconnect(m->phys_page); + pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)); } if (m->overwriting) { /* @@ -7011,7 +7451,7 @@ upl_commit_range( */ if (m->busy) { #if CONFIG_PHANTOM_CACHE - if (m->absent && !m->object->internal) + if (m->absent && !m_object->internal) dwp->dw_mask |= DW_vm_phantom_cache_update; #endif m->absent = FALSE; @@ -7036,20 +7476,21 @@ upl_commit_range( } m->cleaning = FALSE; - if (m->pageout) { + if (m->free_when_done) { /* * With the clean queue enabled, UPL_PAGEOUT should * no longer set the pageout bit. It's pages now go * to the clean queue. */ assert(!(flags & UPL_PAGEOUT)); + assert(!m_object->internal); - m->pageout = FALSE; + m->free_when_done = FALSE; #if MACH_CLUSTER_STATS if (m->wanted) vm_pageout_target_collisions++; #endif if ((flags & UPL_COMMIT_SET_DIRTY) || - (m->pmapped && (pmap_disconnect(m->phys_page) & VM_MEM_MODIFIED))) { + (m->pmapped && (pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)) & VM_MEM_MODIFIED))) { /* * page was re-dirtied after we started * the pageout... 
reactivate it since @@ -7070,7 +7511,7 @@ upl_commit_range( * page has been successfully cleaned * go ahead and free it for other use */ - if (m->object->internal) { + if (m_object->internal) { DTRACE_VM2(anonpgout, int, 1, (uint64_t *), NULL); } else { DTRACE_VM2(fspgout, int, 1, (uint64_t *), NULL); @@ -7084,7 +7525,7 @@ upl_commit_range( } #if MACH_CLUSTER_STATS if (m->wpmapped) - m->dirty = pmap_is_modified(m->phys_page); + m->dirty = pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(m)); if (m->dirty) vm_pageout_cluster_dirtied++; else vm_pageout_cluster_cleaned++; @@ -7115,7 +7556,7 @@ upl_commit_range( dwp->dw_mask |= DW_enqueue_cleaned; vm_pageout_enqueued_cleaned_from_inactive_dirty++; - } else if (should_be_throttled == TRUE && !m->active && !m->inactive && !m->speculative && !m->throttled) { + } else if (should_be_throttled == TRUE && (m->vm_page_q_state == VM_PAGE_NOT_ON_Q)) { /* * page coming back in from being 'frozen'... * it was dirty before it was frozen, so keep it so @@ -7126,10 +7567,10 @@ upl_commit_range( dwp->dw_mask |= DW_vm_page_activate; } else { - if ((flags & UPL_COMMIT_INACTIVATE) && !m->clustered && !m->speculative) { + if ((flags & UPL_COMMIT_INACTIVATE) && !m->clustered && (m->vm_page_q_state != VM_PAGE_ON_SPECULATIVE_Q)) { dwp->dw_mask |= DW_vm_page_deactivate_internal; clear_refmod |= VM_MEM_REFERENCED; - } else if (!m->active && !m->inactive && !m->speculative) { + } else if ( !VM_PAGE_PAGEABLE(m)) { if (m->clustered || (flags & UPL_COMMIT_SPECULATE)) dwp->dw_mask |= DW_vm_page_speculate; @@ -7156,7 +7597,7 @@ upl_commit_range( commit_next_page: if (clear_refmod) - pmap_clear_refmod(m->phys_page, clear_refmod); + pmap_clear_refmod(VM_PAGE_GET_PHYS_PAGE(m), clear_refmod); target_offset += PAGE_SIZE_64; xfer_size -= PAGE_SIZE; @@ -7192,9 +7633,8 @@ upl_commit_range( if (local_queue_count || unwired_count) { if (local_queue_count) { - vm_page_t first_local, last_local; vm_page_t first_target; - queue_head_t *target_queue; + vm_page_queue_head_t 
*target_queue; if (throttle_page) target_queue = &vm_page_queue_throttled; @@ -7210,21 +7650,18 @@ upl_commit_range( /* * Transfer the entire local queue to a regular LRU page queues. */ - first_local = (vm_page_t) queue_first(&local_queue); - last_local = (vm_page_t) queue_last(&local_queue); - vm_page_lockspin_queues(); - first_target = (vm_page_t) queue_first(target_queue); + first_target = (vm_page_t) vm_page_queue_first(target_queue); - if (queue_empty(target_queue)) - queue_last(target_queue) = (queue_entry_t) last_local; + if (vm_page_queue_empty(target_queue)) + target_queue->prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_local); else - queue_prev(&first_target->pageq) = (queue_entry_t) last_local; + first_target->pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_local); - queue_first(target_queue) = (queue_entry_t) first_local; - queue_prev(&first_local->pageq) = (queue_entry_t) target_queue; - queue_next(&last_local->pageq) = (queue_entry_t) first_target; + target_queue->next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_local); + first_local->pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(target_queue); + last_local->pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_target); /* * Adjust the global page counts. 
@@ -7284,7 +7721,7 @@ upl_commit_range( } } } else { - if (queue_empty(&upl->map_object->memq)) + if (vm_page_queue_empty(&upl->map_object->memq)) occupied = 0; } if (occupied == 0) { @@ -7338,7 +7775,6 @@ upl_commit_range( } goto process_upl_to_commit; } - if (pgpgout_count) { DTRACE_VM2(pgpgout, int, pgpgout_count, (uint64_t *), NULL); } @@ -7498,7 +7934,7 @@ upl_abort_range( } if (upl->flags & UPL_SHADOWED) { if ((t = vm_page_lookup(object, target_offset)) != VM_PAGE_NULL) { - t->pageout = FALSE; + t->free_when_done = FALSE; VM_PAGE_FREE(t); @@ -7511,7 +7947,7 @@ upl_abort_range( if (m != VM_PAGE_NULL) { - assert(!m->compressor); + assert(m->vm_page_q_state != VM_PAGE_USED_BY_COMPRESSOR); if (m->absent) { boolean_t must_free = TRUE; @@ -7621,13 +8057,11 @@ upl_abort_range( dwp->dw_mask |= DW_clear_busy; } - m->pageout = FALSE; + m->free_when_done = FALSE; m->cleaning = FALSE; -#if MACH_PAGEMAP - vm_external_state_clr(m->object->existence_map, m->offset); -#endif /* MACH_PAGEMAP */ + if (error & UPL_ABORT_DUMP_PAGES) { - pmap_disconnect(m->phys_page); + pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)); dwp->dw_mask |= DW_vm_page_free; } else { @@ -7641,7 +8075,7 @@ upl_abort_range( */ dwp->dw_mask |= DW_vm_page_lru; - } else if (!m->active && !m->inactive && !m->speculative) + } else if ( !VM_PAGE_PAGEABLE(m)) dwp->dw_mask |= DW_vm_page_deactivate_internal; } dwp->dw_mask |= DW_PAGE_WAKEUP; @@ -7694,7 +8128,7 @@ upl_abort_range( } } } else { - if (queue_empty(&upl->map_object->memq)) + if (vm_page_queue_empty(&upl->map_object->memq)) occupied = 0; } if (occupied == 0) { @@ -7805,15 +8239,17 @@ iopl_valid_data( if (object == kernel_object || object == compressor_object) panic("iopl_valid_data: object == kernel or compressor"); - if (object->purgable == VM_PURGABLE_VOLATILE) - panic("iopl_valid_data: object == VM_PURGABLE_VOLATILE"); + if (object->purgable == VM_PURGABLE_VOLATILE || + object->purgable == VM_PURGABLE_EMPTY) + panic("iopl_valid_data: object %p purgable %d", 
+ object, object->purgable); size = upl->size; vm_object_lock(object); if (object->vo_size == size && object->resident_page_count == (size / PAGE_SIZE)) - nxt_page = (vm_page_t)queue_first(&object->memq); + nxt_page = (vm_page_t)vm_page_queue_first(&object->memq); else offset = 0 + upl->offset - object->paging_offset; @@ -7821,7 +8257,7 @@ iopl_valid_data( if (nxt_page != VM_PAGE_NULL) { m = nxt_page; - nxt_page = (vm_page_t)queue_next(&nxt_page->listq); + nxt_page = (vm_page_t)vm_page_queue_next(&nxt_page->listq); } else { m = vm_page_lookup(object, offset); offset += PAGE_SIZE; @@ -7835,11 +8271,22 @@ iopl_valid_data( if (m->pageq.next || m->pageq.prev) panic("iopl_valid_data: busy+absent page on page queue"); + if (m->reusable) { + panic("iopl_valid_data: %p is reusable", m); + } m->absent = FALSE; m->dirty = TRUE; + assert(m->vm_page_q_state == VM_PAGE_NOT_ON_Q); + assert(m->wire_count == 0); m->wire_count++; - wired_count++; + assert(m->wire_count); + if (m->wire_count == 1) { + m->vm_page_q_state = VM_PAGE_IS_WIRED; + wired_count++; + } else { + panic("iopl_valid_data: %p already wired\n", m); + } PAGE_WAKEUP_DONE(m); } @@ -7851,6 +8298,11 @@ iopl_valid_data( VM_OBJECT_WIRED(object); } object->wired_page_count += wired_count; + assert(object->resident_page_count >= object->wired_page_count); + + /* no need to adjust purgeable accounting for this object: */ + assert(object->purgable != VM_PURGABLE_VOLATILE); + assert(object->purgable != VM_PURGABLE_EMPTY); vm_page_lockspin_queues(); vm_page_wire_count += wired_count; @@ -7859,6 +8311,35 @@ iopl_valid_data( vm_object_unlock(object); } +vm_tag_t +iopl_set_tag( + upl_t upl, + vm_tag_t tag) +{ + vm_object_t object; + vm_tag_t prior_tag; + + if (upl == NULL) + panic("%s: NULL upl", __FUNCTION__); + if (vector_upl_is_valid(upl)) + panic("%s: vector upl", __FUNCTION__); + if (kernel_object == upl->map_object) + return (tag); + if ((upl->flags & 
(UPL_DEVICE_MEMORY|UPL_SHADOWED|UPL_ACCESS_BLOCKED|UPL_IO_WIRE|UPL_INTERNAL)) != UPL_IO_WIRE) + return (tag); + + object = upl->map_object; + vm_object_lock(object); + + prior_tag = object->wire_tag; + object->wire_tag = tag; + if (VM_KERN_MEMORY_NONE == prior_tag) prior_tag = tag; + vm_object_unlock(object); + + return (prior_tag); +} + + void vm_object_set_pmap_cache_attr( vm_object_t object, @@ -7891,6 +8372,7 @@ vm_object_iopl_wire_full(vm_object_t object, upl_t upl, upl_page_info_array_t us int page_count; int delayed_unlock = 0; boolean_t retval = TRUE; + ppnum_t phys_page; vm_object_lock_assert_exclusive(object); assert(object->purgable != VM_PURGABLE_VOLATILE); @@ -7901,7 +8383,7 @@ vm_object_iopl_wire_full(vm_object_t object, upl_t upl, upl_page_info_array_t us tag = UPL_MEMORY_TAG(cntrl_flags); page_count = object->resident_page_count; - dst_page = (vm_page_t)queue_first(&object->memq); + dst_page = (vm_page_t)vm_page_queue_first(&object->memq); vm_page_lock_queues(); @@ -7933,14 +8415,16 @@ vm_object_iopl_wire_full(vm_object_t object, upl_t upl, upl_page_info_array_t us assert(entry >= 0 && entry < object->resident_page_count); lite_list[entry>>5] |= 1 << (entry & 31); - if (dst_page->phys_page > upl->highest_page) - upl->highest_page = dst_page->phys_page; + phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page); + + if (phys_page > upl->highest_page) + upl->highest_page = phys_page; if (user_page_list) { - user_page_list[entry].phys_addr = dst_page->phys_page; + user_page_list[entry].phys_addr = phys_page; user_page_list[entry].absent = dst_page->absent; user_page_list[entry].dirty = dst_page->dirty; - user_page_list[entry].pageout = dst_page->pageout;; + user_page_list[entry].free_when_done = dst_page->free_when_done; user_page_list[entry].precious = dst_page->precious; user_page_list[entry].device = FALSE; user_page_list[entry].speculative = FALSE; @@ -7956,7 +8440,7 @@ vm_object_iopl_wire_full(vm_object_t object, upl_t upl, upl_page_info_array_t us 
VM_CHECK_MEMORYSTATUS; } - dst_page = (vm_page_t)queue_next(&dst_page->listq); + dst_page = (vm_page_t)vm_page_queue_next(&dst_page->listq); } done: vm_page_unlock_queues(); @@ -7980,6 +8464,8 @@ vm_object_iopl_wire_empty(vm_object_t object, upl_t upl, upl_page_info_array_t u int entry = 0; uint64_t delayed_ledger_update = 0; kern_return_t ret = KERN_SUCCESS; + int grab_options; + ppnum_t phys_page; vm_object_lock_assert_exclusive(object); assert(object->purgable != VM_PURGABLE_VOLATILE); @@ -7998,9 +8484,17 @@ vm_object_iopl_wire_empty(vm_object_t object, upl_t upl, upl_page_info_array_t u tag = UPL_MEMORY_TAG(cntrl_flags); + grab_options = 0; +#if CONFIG_SECLUDED_MEMORY + if (object->can_grab_secluded) { + grab_options |= VM_PAGE_GRAB_SECLUDED; + } +#endif /* CONFIG_SECLUDED_MEMORY */ + while (page_count--) { - while ( (dst_page = vm_page_grab()) == VM_PAGE_NULL) { + while ((dst_page = vm_page_grab_options(grab_options)) + == VM_PAGE_NULL) { OSAddAtomic(page_count, &vm_upl_wait_for_pages); @@ -8032,7 +8526,11 @@ vm_object_iopl_wire_empty(vm_object_t object, upl_t upl, upl_page_info_array_t u SET_PAGE_DIRTY(dst_page, FALSE); } if (dst_page->absent == FALSE) { + assert(dst_page->vm_page_q_state == VM_PAGE_NOT_ON_Q); + assert(dst_page->wire_count == 0); dst_page->wire_count++; + dst_page->vm_page_q_state = VM_PAGE_IS_WIRED; + assert(dst_page->wire_count); pages_wired++; PAGE_WAKEUP_DONE(dst_page); } @@ -8042,14 +8540,16 @@ vm_object_iopl_wire_empty(vm_object_t object, upl_t upl, upl_page_info_array_t u lite_list[entry>>5] |= 1 << (entry & 31); - if (dst_page->phys_page > upl->highest_page) - upl->highest_page = dst_page->phys_page; + phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page); + + if (phys_page > upl->highest_page) + upl->highest_page = phys_page; if (user_page_list) { - user_page_list[entry].phys_addr = dst_page->phys_page; + user_page_list[entry].phys_addr = phys_page; user_page_list[entry].absent = dst_page->absent; user_page_list[entry].dirty = dst_page->dirty; 
- user_page_list[entry].pageout = FALSE; + user_page_list[entry].free_when_done = FALSE; user_page_list[entry].precious = FALSE; user_page_list[entry].device = FALSE; user_page_list[entry].speculative = FALSE; @@ -8127,6 +8627,7 @@ vm_object_iopl_request( boolean_t caller_lookup; int io_tracking_flag = 0; int interruptible; + ppnum_t phys_page; boolean_t set_cache_attr_needed = FALSE; boolean_t free_wired_pages = FALSE; @@ -8302,6 +8803,7 @@ vm_object_iopl_request( } #endif /* VM_OBJECT_TRACKING_OP_TRUESHARE */ + vm_object_lock_assert_exclusive(object); object->true_share = TRUE; if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) @@ -8511,9 +9013,12 @@ vm_object_iopl_request( if (top_page != VM_PAGE_NULL) { vm_object_t local_object; - local_object = top_page->object; - - if (top_page->object != dst_page->object) { + local_object = VM_PAGE_OBJECT(top_page); + + /* + * comparing 2 packed pointers + */ + if (top_page->vm_page_object != dst_page->vm_page_object) { vm_object_lock(local_object); VM_PAGE_FREE(top_page); vm_object_paging_end(local_object); @@ -8571,10 +9076,12 @@ vm_object_iopl_request( } while (result != VM_FAULT_SUCCESS); } + phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page); + if (upl->flags & UPL_KERNEL_OBJECT) goto record_phys_addr; - if (dst_page->compressor) { + if (dst_page->vm_page_q_state == VM_PAGE_USED_BY_COMPRESSOR) { dst_page->busy = TRUE; goto record_phys_addr; } @@ -8594,13 +9101,11 @@ vm_object_iopl_request( PAGE_SLEEP(object, dst_page, THREAD_UNINT); continue; } - if (dst_page->laundry) { - dst_page->pageout = FALSE; - + if (dst_page->laundry) vm_pageout_steal_laundry(dst_page, FALSE); - } + if ( (cntrl_flags & UPL_NEED_32BIT_ADDR) && - dst_page->phys_page >= (max_valid_dma_address >> PAGE_SHIFT) ) { + phys_page >= (max_valid_dma_address >> PAGE_SHIFT) ) { vm_page_t low_page; int refmod; @@ -8630,7 +9135,7 @@ vm_object_iopl_request( * to find the new page being substituted. 
*/ if (dst_page->pmapped) - refmod = pmap_disconnect(dst_page->phys_page); + refmod = pmap_disconnect(phys_page); else refmod = 0; @@ -8657,6 +9162,8 @@ vm_object_iopl_request( */ if ( !dst_page->absent) dst_page->busy = FALSE; + + phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page); } if ( !dst_page->busy) dwp->dw_mask |= DW_vm_page_wire; @@ -8681,7 +9188,7 @@ vm_object_iopl_request( SET_PAGE_DIRTY(dst_page, TRUE); } if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->written_by_kernel == TRUE) { - pmap_sync_page_attributes_phys(dst_page->phys_page); + pmap_sync_page_attributes_phys(phys_page); dst_page->written_by_kernel = FALSE; } @@ -8691,19 +9198,19 @@ vm_object_iopl_request( lite_list[entry>>5] |= 1 << (entry & 31); - if (dst_page->phys_page > upl->highest_page) - upl->highest_page = dst_page->phys_page; + if (phys_page > upl->highest_page) + upl->highest_page = phys_page; if (user_page_list) { - user_page_list[entry].phys_addr = dst_page->phys_page; - user_page_list[entry].pageout = dst_page->pageout; + user_page_list[entry].phys_addr = phys_page; + user_page_list[entry].free_when_done = dst_page->free_when_done; user_page_list[entry].absent = dst_page->absent; user_page_list[entry].dirty = dst_page->dirty; user_page_list[entry].precious = dst_page->precious; user_page_list[entry].device = FALSE; user_page_list[entry].needed = FALSE; if (dst_page->clustered == TRUE) - user_page_list[entry].speculative = dst_page->speculative; + user_page_list[entry].speculative = (dst_page->vm_page_q_state == VM_PAGE_ON_SPECULATIVE_Q) ? TRUE : FALSE; else user_page_list[entry].speculative = FALSE; user_page_list[entry].cs_validated = dst_page->cs_validated; @@ -9073,7 +9580,7 @@ vm_paging_map_object( /* use permanent 1-to-1 kernel mapping of physical memory ? 
*/ #if __x86_64__ *address = (vm_map_offset_t) - PHYSMAP_PTOV((pmap_paddr_t)page->phys_page << + PHYSMAP_PTOV((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(page) << PAGE_SHIFT); *need_unmap = FALSE; return KERN_SUCCESS; @@ -9246,7 +9753,7 @@ vm_paging_map_object( } page->pmapped = TRUE; - //assert(pmap_verify_free(page->phys_page)); + //assert(pmap_verify_free(VM_PAGE_GET_PHYS_PAGE(page))); PMAP_ENTER(kernel_pmap, *address + page_map_offset, page, @@ -9447,6 +9954,7 @@ vm_page_encrypt( vm_map_size_t kernel_mapping_size; boolean_t kernel_mapping_needs_unmap; vm_offset_t kernel_vaddr; + vm_object_t page_object; union { unsigned char aes_iv[AES_BLOCK_SIZE]; struct { @@ -9472,12 +9980,14 @@ vm_page_encrypt( ASSERT_PAGE_DECRYPTED(page); + page_object = VM_PAGE_OBJECT(page); + /* * Take a paging-in-progress reference to keep the object * alive even if we have to unlock it (in vm_paging_map_object() * for example)... */ - vm_object_paging_begin(page->object); + vm_object_paging_begin(page_object); if (kernel_mapping_offset == 0) { /* @@ -9488,7 +9998,7 @@ vm_page_encrypt( kernel_mapping_size = PAGE_SIZE; kernel_mapping_needs_unmap = FALSE; kr = vm_paging_map_object(page, - page->object, + page_object, page->offset, VM_PROT_READ | VM_PROT_WRITE, FALSE, @@ -9519,9 +10029,9 @@ vm_page_encrypt( * use to break the key. */ bzero(&encrypt_iv.aes_iv[0], sizeof (encrypt_iv.aes_iv)); - encrypt_iv.vm.pager_object = page->object->pager; + encrypt_iv.vm.pager_object = page_object->pager; encrypt_iv.vm.paging_offset = - page->object->paging_offset + page->offset; + page_object->paging_offset + page->offset; /* encrypt the "initial vector" */ aes_encrypt_cbc((const unsigned char *) &encrypt_iv.aes_iv[0], @@ -9547,7 +10057,7 @@ vm_page_encrypt( * the caller undo the mapping if needed. 
*/ if (kernel_mapping_needs_unmap) { - vm_paging_unmap_object(page->object, + vm_paging_unmap_object(page_object, kernel_mapping_offset, kernel_mapping_offset + kernel_mapping_size); } @@ -9562,11 +10072,11 @@ vm_page_encrypt( * The software bits will be reset later after the I/O * has completed (in upl_commit_range()). */ - pmap_clear_refmod(page->phys_page, VM_MEM_REFERENCED | VM_MEM_MODIFIED); + pmap_clear_refmod(VM_PAGE_GET_PHYS_PAGE(page), VM_MEM_REFERENCED | VM_MEM_MODIFIED); page->encrypted = TRUE; - vm_object_paging_end(page->object); + vm_object_paging_end(page_object); } /* @@ -9590,6 +10100,7 @@ vm_page_decrypt( vm_map_size_t kernel_mapping_size; vm_offset_t kernel_vaddr; boolean_t kernel_mapping_needs_unmap; + vm_object_t page_object; union { unsigned char aes_iv[AES_BLOCK_SIZE]; struct { @@ -9602,6 +10113,7 @@ vm_page_decrypt( assert(page->busy); assert(page->encrypted); + page_object = VM_PAGE_OBJECT(page); was_dirty = page->dirty; /* @@ -9609,7 +10121,7 @@ vm_page_decrypt( * alive even if we have to unlock it (in vm_paging_map_object() * for example)... */ - vm_object_paging_begin(page->object); + vm_object_paging_begin(page_object); if (kernel_mapping_offset == 0) { /* @@ -9620,7 +10132,7 @@ vm_page_decrypt( kernel_mapping_size = PAGE_SIZE; kernel_mapping_needs_unmap = FALSE; kr = vm_paging_map_object(page, - page->object, + page_object, page->offset, VM_PROT_READ | VM_PROT_WRITE, FALSE, @@ -9646,9 +10158,9 @@ vm_page_decrypt( * used to encrypt that page. */ bzero(&decrypt_iv.aes_iv[0], sizeof (decrypt_iv.aes_iv)); - decrypt_iv.vm.pager_object = page->object->pager; + decrypt_iv.vm.pager_object = page_object->pager; decrypt_iv.vm.paging_offset = - page->object->paging_offset + page->offset; + page_object->paging_offset + page->offset; /* encrypt the "initial vector" */ aes_encrypt_cbc((const unsigned char *) &decrypt_iv.aes_iv[0], @@ -9673,7 +10185,7 @@ vm_page_decrypt( * the caller undo the mapping if needed. 
*/ if (kernel_mapping_needs_unmap) { - vm_paging_unmap_object(page->object, + vm_paging_unmap_object(page_object, kernel_vaddr, kernel_vaddr + PAGE_SIZE); } @@ -9694,7 +10206,7 @@ vm_page_decrypt( */ page->dirty = FALSE; assert (page->cs_validated == FALSE); - pmap_clear_refmod(page->phys_page, VM_MEM_MODIFIED | VM_MEM_REFERENCED); + pmap_clear_refmod(VM_PAGE_GET_PHYS_PAGE(page), VM_MEM_MODIFIED | VM_MEM_REFERENCED); } page->encrypted = FALSE; @@ -9705,18 +10217,18 @@ vm_page_decrypt( * be part of a DMA transfer from a driver that expects the memory to * be coherent at this point, we have to flush the data cache. */ - pmap_sync_page_attributes_phys(page->phys_page); + pmap_sync_page_attributes_phys(VM_PAGE_GET_PHYS_PAGE(page)); /* * Since the page is not mapped yet, some code might assume that it * doesn't need to invalidate the instruction cache when writing to * that page. That code relies on "pmapped" being FALSE, so that the * caches get synchronized when the page is first mapped. */ - assert(pmap_verify_free(page->phys_page)); + assert(pmap_verify_free(VM_PAGE_GET_PHYS_PAGE(page))); page->pmapped = FALSE; page->wpmapped = FALSE; - vm_object_paging_end(page->object); + vm_object_paging_end(page_object); } #if DEVELOPMENT || DEBUG @@ -9825,7 +10337,7 @@ upl_encrypt( * encryption completes, any access will cause a * page fault and the page gets decrypted at that time. */ - pmap_disconnect(page->phys_page); + pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(page)); vm_page_encrypt(page, 0); if (vm_object_lock_avoid(shadow_object)) { @@ -9883,6 +10395,7 @@ vm_pageout_steal_laundry(vm_page_t page, boolean_t queues_locked) vm_page_lockspin_queues(); } + page->free_when_done = FALSE; /* * need to drop the laundry count... 
* we may also need to remove it @@ -10270,15 +10783,18 @@ vm_page_is_slideable(vm_page_t m) { boolean_t result = FALSE; vm_shared_region_slide_info_t si; + vm_object_t m_object; + + m_object = VM_PAGE_OBJECT(m); - vm_object_lock_assert_held(m->object); + vm_object_lock_assert_held(m_object); /* make sure our page belongs to the one object allowed to do this */ - if (!m->object->object_slid) { + if (!m_object->object_slid) { goto done; } - si = m->object->vo_slide_info; + si = m_object->vo_slide_info; if (si == NULL) { goto done; } @@ -10304,10 +10820,13 @@ vm_page_slide( vm_offset_t kernel_vaddr; uint32_t pageIndex; uint32_t slide_chunk; + vm_object_t page_object; + + page_object = VM_PAGE_OBJECT(page); assert(!page->slid); - assert(page->object->object_slid); - vm_object_lock_assert_exclusive(page->object); + assert(page_object->object_slid); + vm_object_lock_assert_exclusive(page_object); if (page->error) return KERN_FAILURE; @@ -10317,7 +10836,7 @@ vm_page_slide( * alive even if we have to unlock it (in vm_paging_map_object() * for example)... 
*/ - vm_object_paging_begin(page->object); + vm_object_paging_begin(page_object); if (kernel_mapping_offset == 0) { /* @@ -10328,7 +10847,7 @@ vm_page_slide( kernel_mapping_size = PAGE_SIZE; kernel_mapping_needs_unmap = FALSE; kr = vm_paging_map_object(page, - page->object, + page_object, page->offset, VM_PROT_READ | VM_PROT_WRITE, FALSE, @@ -10353,16 +10872,15 @@ vm_page_slide( /*assert that slide_file_info.start/end are page-aligned?*/ assert(!page->slid); - assert(page->object->object_slid); + assert(page_object->object_slid); -#define PAGE_SIZE_FOR_SR_SLIDE 4096 pageIndex = (uint32_t)((page->offset - - page->object->vo_slide_info->start) / + page_object->vo_slide_info->start) / PAGE_SIZE_FOR_SR_SLIDE); for (slide_chunk = 0; slide_chunk < PAGE_SIZE / PAGE_SIZE_FOR_SR_SLIDE; slide_chunk++) { - kr = vm_shared_region_slide_page(page->object->vo_slide_info, + kr = vm_shared_region_slide_page(page_object->vo_slide_info, (kernel_vaddr + (slide_chunk * PAGE_SIZE_FOR_SR_SLIDE)), @@ -10378,21 +10896,21 @@ vm_page_slide( * Unmap the page from the kernel's address space, */ if (kernel_mapping_needs_unmap) { - vm_paging_unmap_object(page->object, + vm_paging_unmap_object(page_object, kernel_vaddr, kernel_vaddr + PAGE_SIZE); } page->dirty = FALSE; - pmap_clear_refmod(page->phys_page, VM_MEM_MODIFIED | VM_MEM_REFERENCED); + pmap_clear_refmod(VM_PAGE_GET_PHYS_PAGE(page), VM_MEM_MODIFIED | VM_MEM_REFERENCED); if (kr != KERN_SUCCESS || cs_debug > 1) { printf("vm_page_slide(%p): " "obj %p off 0x%llx mobj %p moff 0x%llx\n", page, - page->object, page->offset, - page->object->pager, - page->offset + page->object->paging_offset); + page_object, page->offset, + page_object->pager, + page->offset + page_object->paging_offset); } if (kr == KERN_SUCCESS) { @@ -10402,7 +10920,7 @@ vm_page_slide( vm_page_slide_errors++; } - vm_object_paging_end(page->object); + vm_object_paging_end(page_object); return kr; } @@ -10472,50 +10990,50 @@ vm_countdirtypages(void) precpages=0; 
vm_page_lock_queues(); - m = (vm_page_t) queue_first(&vm_page_queue_inactive); + m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive); do { if (m ==(vm_page_t )0) break; if(m->dirty) dpages++; - if(m->pageout) pgopages++; + if(m->free_when_done) pgopages++; if(m->precious) precpages++; - assert(m->object != kernel_object); - m = (vm_page_t) queue_next(&m->pageq); + assert(VM_PAGE_OBJECT(m) != kernel_object); + m = (vm_page_t) vm_page_queue_next(&m->pageq); if (m ==(vm_page_t )0) break; - } while (!queue_end(&vm_page_queue_inactive,(queue_entry_t) m)); + } while (!vm_page_queue_end(&vm_page_queue_inactive, (vm_page_queue_entry_t) m)); vm_page_unlock_queues(); vm_page_lock_queues(); - m = (vm_page_t) queue_first(&vm_page_queue_throttled); + m = (vm_page_t) vm_page_queue_first(&vm_page_queue_throttled); do { if (m ==(vm_page_t )0) break; dpages++; assert(m->dirty); - assert(!m->pageout); - assert(m->object != kernel_object); - m = (vm_page_t) queue_next(&m->pageq); + assert(!m->free_when_done); + assert(VM_PAGE_OBJECT(m) != kernel_object); + m = (vm_page_t) vm_page_queue_next(&m->pageq); if (m ==(vm_page_t )0) break; - } while (!queue_end(&vm_page_queue_throttled,(queue_entry_t) m)); + } while (!vm_page_queue_end(&vm_page_queue_throttled, (vm_page_queue_entry_t) m)); vm_page_unlock_queues(); vm_page_lock_queues(); - m = (vm_page_t) queue_first(&vm_page_queue_anonymous); + m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous); do { if (m ==(vm_page_t )0) break; if(m->dirty) dpages++; - if(m->pageout) pgopages++; + if(m->free_when_done) pgopages++; if(m->precious) precpages++; - assert(m->object != kernel_object); - m = (vm_page_t) queue_next(&m->pageq); + assert(VM_PAGE_OBJECT(m) != kernel_object); + m = (vm_page_t) vm_page_queue_next(&m->pageq); if (m ==(vm_page_t )0) break; - } while (!queue_end(&vm_page_queue_anonymous,(queue_entry_t) m)); + } while (!vm_page_queue_end(&vm_page_queue_anonymous, (vm_page_queue_entry_t) m)); vm_page_unlock_queues(); 
printf("IN Q: %d : %d : %d\n", dpages, pgopages, precpages); @@ -10525,19 +11043,19 @@ vm_countdirtypages(void) precpages=0; vm_page_lock_queues(); - m = (vm_page_t) queue_first(&vm_page_queue_active); + m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active); do { if(m == (vm_page_t )0) break; if(m->dirty) dpages++; - if(m->pageout) pgopages++; + if(m->free_when_done) pgopages++; if(m->precious) precpages++; - assert(m->object != kernel_object); - m = (vm_page_t) queue_next(&m->pageq); + assert(VM_PAGE_OBJECT(m) != kernel_object); + m = (vm_page_t) vm_page_queue_next(&m->pageq); if(m == (vm_page_t )0) break; - } while (!queue_end(&vm_page_queue_active,(queue_entry_t) m)); + } while (!vm_page_queue_end(&vm_page_queue_active, (vm_page_queue_entry_t) m)); vm_page_unlock_queues(); printf("AC Q: %d : %d : %d\n", dpages, pgopages, precpages); @@ -10567,6 +11085,14 @@ void upl_set_associated_upl(upl_t upl, upl_t associated_upl) upl->associated_upl = associated_upl; } +struct vnode * upl_lookup_vnode(upl_t upl) +{ + if (!upl->map_object->internal) + return vnode_pager_lookup_vnode(upl->map_object->pager); + else + return NULL; +} + #if UPL_DEBUG kern_return_t upl_ubc_alias_set(upl_t upl, uintptr_t alias1, uintptr_t alias2) { @@ -10593,8 +11119,8 @@ extern boolean_t vm_compressor_low_on_space(void); boolean_t VM_PRESSURE_NORMAL_TO_WARNING(void) { - if (DEFAULT_PAGER_IS_ACTIVE || DEFAULT_FREEZER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_SWAPLESS) { - + if ( !VM_CONFIG_COMPRESSOR_IS_ACTIVE) { + /* Available pages below our threshold */ if (memorystatus_available_pages < memorystatus_available_pages_pressure) { /* No frozen processes to kill */ @@ -10615,7 +11141,8 @@ VM_PRESSURE_NORMAL_TO_WARNING(void) { boolean_t VM_PRESSURE_WARNING_TO_CRITICAL(void) { - if (DEFAULT_PAGER_IS_ACTIVE || DEFAULT_FREEZER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_SWAPLESS) { + if ( !VM_CONFIG_COMPRESSOR_IS_ACTIVE) { + /* Available pages below our threshold */ if 
(memorystatus_available_pages < memorystatus_available_pages_critical) { return TRUE; @@ -10632,7 +11159,8 @@ VM_PRESSURE_WARNING_TO_CRITICAL(void) { boolean_t VM_PRESSURE_WARNING_TO_NORMAL(void) { - if (DEFAULT_PAGER_IS_ACTIVE || DEFAULT_FREEZER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_SWAPLESS) { + if ( !VM_CONFIG_COMPRESSOR_IS_ACTIVE) { + /* Available pages above our threshold */ unsigned int target_threshold = memorystatus_available_pages_pressure + ((15 * memorystatus_available_pages_pressure) / 100); if (memorystatus_available_pages > target_threshold) { @@ -10647,7 +11175,8 @@ VM_PRESSURE_WARNING_TO_NORMAL(void) { boolean_t VM_PRESSURE_CRITICAL_TO_WARNING(void) { - if (DEFAULT_PAGER_IS_ACTIVE || DEFAULT_FREEZER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_SWAPLESS) { + if ( !VM_CONFIG_COMPRESSOR_IS_ACTIVE) { + /* Available pages above our threshold */ unsigned int target_threshold = memorystatus_available_pages_critical + ((15 * memorystatus_available_pages_critical) / 100); if (memorystatus_available_pages > target_threshold) { diff --git a/osfmk/vm/vm_pageout.h b/osfmk/vm/vm_pageout.h index bd7cb800a..c829afa08 100644 --- a/osfmk/vm/vm_pageout.h +++ b/osfmk/vm/vm_pageout.h @@ -94,15 +94,28 @@ extern unsigned int vm_pageout_cleaned_reactivated, vm_pageout_cleaned_fault_rea #if CONFIG_FREEZE extern boolean_t memorystatus_freeze_enabled; -#define VM_DYNAMIC_PAGING_ENABLED(port) (COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_SWAPBACKED || (memorystatus_freeze_enabled == FALSE && IP_VALID(port))) -#else -#define VM_DYNAMIC_PAGING_ENABLED(port) (COMPRESSED_PAGER_IS_ACTIVE || IP_VALID(port)) #endif +#define VM_DYNAMIC_PAGING_ENABLED() (VM_CONFIG_COMPRESSOR_IS_ACTIVE) + #if VM_PRESSURE_EVENTS extern boolean_t vm_pressure_events_enabled; #endif /* VM_PRESSURE_EVENTS */ + +/* + * the following codes are used in the DBG_MACH_WORKINGSET subclass + * of the DBG_MACH class + */ +#define VM_DISCONNECT_ALL_PAGE_MAPPINGS 0x00 +#define 
VM_DISCONNECT_TASK_PAGE_MAPPINGS 0x01 +#define VM_REAL_FAULT_ADDR_INTERNAL 0x02 +#define VM_REAL_FAULT_ADDR_PURGABLE 0x03 +#define VM_REAL_FAULT_ADDR_EXTERNAL 0x04 +#define VM_REAL_FAULT_ADDR_SHAREDCACHE 0x05 + + + extern int vm_debug_events; #define VMF_CHECK_ZFDELAY 0x100 @@ -168,6 +181,16 @@ extern void upl_set_associated_upl(upl_t upl, upl_t associated_upl); extern void iopl_valid_data( upl_t upl_ptr); +#ifdef XNU_KERNEL_PRIVATE + +extern vm_tag_t iopl_set_tag( + upl_t upl_ptr, + vm_tag_t tag); + +#endif /* XNU_KERNEL_PRIVATE */ + +extern struct vnode * upl_lookup_vnode(upl_t upl); + #ifndef MACH_KERNEL_PRIVATE typedef struct vm_page *vm_page_t; #endif @@ -203,7 +226,7 @@ extern unsigned int vm_page_anonymous_count; * manipulate this structure */ struct vm_pageout_queue { - queue_head_t pgo_pending; /* laundry pages to be processed by pager's iothread */ + vm_page_queue_head_t pgo_pending; /* laundry pages to be processed by pager's iothread */ unsigned int pgo_laundry; /* current count of laundry pages on queue or in flight */ unsigned int pgo_maxlaundry; uint64_t pgo_tid; /* thread ID of I/O thread that services this queue */ @@ -236,7 +259,6 @@ extern void vm_pageout_object_terminate( extern int vm_pageout_cluster( vm_page_t m, - boolean_t pageout, boolean_t immediate_ok, boolean_t keep_object_locked); @@ -295,7 +317,6 @@ struct upl { int ref_count; int ext_ref_count; int flags; - vm_object_t src_object; /* object derived from */ vm_object_offset_t offset; upl_size_t size; /* size in bytes of the address space */ vm_offset_t kaddr; /* secondary mapping in kernel */ @@ -502,6 +523,9 @@ struct vm_page_stats_reusable { uint64_t can_reuse_success; uint64_t can_reuse_failure; uint64_t reusable_reclaimed; + uint64_t reusable_nonwritable; + uint64_t reusable_shared; + uint64_t free_shared; }; extern struct vm_page_stats_reusable vm_page_stats_reusable; @@ -518,29 +542,36 @@ extern boolean_t vm_compressor_immediate_preferred; extern boolean_t 
vm_compressor_immediate_preferred_override; extern kern_return_t vm_pageout_compress_page(void **, char *, vm_page_t, boolean_t); extern void vm_pageout_anonymous_pages(void); +extern void vm_pageout_disconnect_all_pages(void); -#define VM_PAGER_DEFAULT 0x1 /* Use default pager. */ -#define VM_PAGER_COMPRESSOR_NO_SWAP 0x2 /* In-core compressor only. */ -#define VM_PAGER_COMPRESSOR_WITH_SWAP 0x4 /* In-core compressor + swap backend. */ -#define VM_PAGER_FREEZER_DEFAULT 0x8 /* Freezer backed by default pager.*/ -#define VM_PAGER_FREEZER_COMPRESSOR_NO_SWAP 0x10 /* Freezer backed by in-core compressor only i.e. frozen data remain in-core compressed.*/ -#define VM_PAGER_FREEZER_COMPRESSOR_WITH_SWAP 0x20 /* Freezer backed by in-core compressor with swap support too.*/ +struct vm_config { + boolean_t compressor_is_present; /* compressor is initialized and can be used by the freezer, the sweep or the pager */ + boolean_t compressor_is_active; /* pager can actively compress pages... 'compressor_is_present' must be set */ + boolean_t swap_is_present; /* swap is initialized and can be used by the freezer, the sweep or the pager */ + boolean_t swap_is_active; /* pager can actively swap out compressed segments... 'swap_is_present' must be set */ + boolean_t freezer_swap_is_active; /* freezer can swap out frozen tasks... 
"compressor_is_present + swap_is_present" must be set */ +}; -#define VM_PAGER_MAX_MODES 6 /* Total number of vm compressor modes supported */ +extern struct vm_config vm_config; -#define DEFAULT_PAGER_IS_ACTIVE ((vm_compressor_mode & VM_PAGER_DEFAULT) == VM_PAGER_DEFAULT) -#define COMPRESSED_PAGER_IS_ACTIVE (vm_compressor_mode & (VM_PAGER_COMPRESSOR_NO_SWAP | VM_PAGER_COMPRESSOR_WITH_SWAP)) -#define COMPRESSED_PAGER_IS_SWAPLESS ((vm_compressor_mode & VM_PAGER_COMPRESSOR_NO_SWAP) == VM_PAGER_COMPRESSOR_NO_SWAP) -#define COMPRESSED_PAGER_IS_SWAPBACKED ((vm_compressor_mode & VM_PAGER_COMPRESSOR_WITH_SWAP) == VM_PAGER_COMPRESSOR_WITH_SWAP) +#define VM_PAGER_NOT_CONFIGURED 0x0 /* no compresser or swap configured */ +#define VM_PAGER_DEFAULT 0x1 /* Use default pager... DEPRECATED */ +#define VM_PAGER_COMPRESSOR_NO_SWAP 0x2 /* Active in-core compressor only. */ +#define VM_PAGER_COMPRESSOR_WITH_SWAP 0x4 /* Active in-core compressor + swap backend. */ +#define VM_PAGER_FREEZER_DEFAULT 0x8 /* Freezer backed by default pager... DEPRECATED */ +#define VM_PAGER_FREEZER_COMPRESSOR_NO_SWAP 0x10 /* Freezer backed by in-core compressor only i.e. 
frozen data remain in-core compressed.*/ +#define VM_PAGER_COMPRESSOR_NO_SWAP_PLUS_FREEZER_COMPRESSOR_WITH_SWAP 0x20 /* Active in-core compressor + Freezer backed by in-core compressor with swap support too.*/ -#define DEFAULT_FREEZER_IS_ACTIVE ((vm_compressor_mode & VM_PAGER_FREEZER_DEFAULT) == VM_PAGER_FREEZER_DEFAULT) +#define VM_PAGER_MAX_MODES 6 /* Total number of vm compressor modes supported */ -#define DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE (vm_compressor_mode & (VM_PAGER_FREEZER_COMPRESSOR_NO_SWAP | VM_PAGER_FREEZER_COMPRESSOR_WITH_SWAP)) -#define DEFAULT_FREEZER_COMPRESSED_PAGER_IS_SWAPLESS ((vm_compressor_mode & VM_PAGER_FREEZER_COMPRESSOR_NO_SWAP) == VM_PAGER_FREEZER_COMPRESSOR_NO_SWAP) -#define DEFAULT_FREEZER_COMPRESSED_PAGER_IS_SWAPBACKED ((vm_compressor_mode & VM_PAGER_FREEZER_COMPRESSOR_WITH_SWAP) == VM_PAGER_FREEZER_COMPRESSOR_WITH_SWAP) +#define VM_CONFIG_COMPRESSOR_IS_PRESENT (vm_config.compressor_is_present == TRUE) +#define VM_CONFIG_COMPRESSOR_IS_ACTIVE (vm_config.compressor_is_active == TRUE) +#define VM_CONFIG_SWAP_IS_PRESENT (vm_config.swap_is_present == TRUE) +#define VM_CONFIG_SWAP_IS_ACTIVE (vm_config.swap_is_active == TRUE) +#define VM_CONFIG_FREEZER_SWAP_IS_ACTIVE (vm_config.freezer_swap_is_active == TRUE) #endif /* KERNEL_PRIVATE */ diff --git a/osfmk/vm/vm_phantom_cache.c b/osfmk/vm/vm_phantom_cache.c index 9f2a3232f..4d8043702 100644 --- a/osfmk/vm/vm_phantom_cache.c +++ b/osfmk/vm/vm_phantom_cache.c @@ -101,6 +101,8 @@ vm_phantom_cache_init() unsigned int log1; unsigned int size; + if ( !VM_CONFIG_COMPRESSOR_IS_ACTIVE) + return; num_entries = (uint32_t)(((max_mem / PAGE_SIZE) / 4) / VM_GHOST_PAGES_PER_ENTRY); vm_phantom_cache_num_entries = 1; @@ -145,29 +147,30 @@ void vm_phantom_cache_add_ghost(vm_page_t m) { vm_ghost_t vpce; + vm_object_t object; int ghost_index; int pg_mask; boolean_t isSSD = FALSE; vm_phantom_hash_entry_t ghost_hash_index; -#if MACH_ASSERT || DEBUG - lck_mtx_assert(&vm_page_queue_lock, 
LCK_MTX_ASSERT_OWNED); - vm_object_lock_assert_exclusive(m->object); -#endif + object = VM_PAGE_OBJECT(m); + + LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); + vm_object_lock_assert_exclusive(object); if (vm_phantom_cache_num_entries == 0) return; pg_mask = pg_masks[(m->offset >> PAGE_SHIFT) & VM_GHOST_PAGE_MASK]; - if (m->object->phantom_object_id == 0) { + if (object->phantom_object_id == 0) { - vnode_pager_get_isSSD(m->object->pager, &isSSD); + vnode_pager_get_isSSD(object->pager, &isSSD); if (isSSD == TRUE) - m->object->phantom_isssd = TRUE; + object->phantom_isssd = TRUE; - m->object->phantom_object_id = vm_phantom_object_id++; + object->phantom_object_id = vm_phantom_object_id++; if (vm_phantom_object_id == 0) vm_phantom_object_id = VM_PHANTOM_OBJECT_ID_AFTER_WRAP; @@ -225,14 +228,14 @@ vm_phantom_cache_add_ghost(vm_page_t m) vpce->g_pages_held = pg_mask; vpce->g_obj_offset = (m->offset >> (PAGE_SHIFT + VM_GHOST_PAGE_SHIFT)) & VM_GHOST_OFFSET_MASK; - vpce->g_obj_id = m->object->phantom_object_id; + vpce->g_obj_id = object->phantom_object_id; ghost_hash_index = vm_phantom_hash(vpce->g_obj_id, vpce->g_obj_offset); vpce->g_next_index = vm_phantom_cache_hash[ghost_hash_index]; vm_phantom_cache_hash[ghost_hash_index] = ghost_index; done: - if (m->object->phantom_isssd) + if (object->phantom_isssd) OSAddAtomic(1, &sample_period_ghost_added_count_ssd); else OSAddAtomic(1, &sample_period_ghost_added_count); @@ -245,8 +248,11 @@ vm_phantom_cache_lookup_ghost(vm_page_t m, uint32_t pg_mask) uint64_t g_obj_offset; uint32_t g_obj_id; uint32_t ghost_index; + vm_object_t object; - if ((g_obj_id = m->object->phantom_object_id) == 0) { + object = VM_PAGE_OBJECT(m); + + if ((g_obj_id = object->phantom_object_id) == 0) { /* * no entries in phantom cache for this object */ @@ -286,11 +292,12 @@ vm_phantom_cache_update(vm_page_t m) { int pg_mask; vm_ghost_t vpce; + vm_object_t object; -#if MACH_ASSERT || DEBUG - lck_mtx_assert(&vm_page_queue_lock, 
LCK_MTX_ASSERT_OWNED); - vm_object_lock_assert_exclusive(m->object); -#endif + object = VM_PAGE_OBJECT(m); + + LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); + vm_object_lock_assert_exclusive(object); if (vm_phantom_cache_num_entries == 0) return; @@ -303,7 +310,7 @@ vm_phantom_cache_update(vm_page_t m) phantom_cache_stats.pcs_updated_phantom_state++; - if (m->object->phantom_isssd) + if (object->phantom_isssd) OSAddAtomic(1, &sample_period_ghost_found_count_ssd); else OSAddAtomic(1, &sample_period_ghost_found_count); diff --git a/osfmk/vm/vm_protos.h b/osfmk/vm/vm_protos.h index 71d58704b..8acd5f072 100644 --- a/osfmk/vm/vm_protos.h +++ b/osfmk/vm/vm_protos.h @@ -34,6 +34,10 @@ #include #include +#ifdef __cplusplus +extern "C" { +#endif + /* * This file contains various type definitions and routine prototypes * that are needed to avoid compilation warnings for VM code (in osfmk, @@ -58,12 +62,7 @@ extern kern_return_t device_data_action( extern kern_return_t device_close( uintptr_t device_handle); -/* - * default_pager - */ -extern int start_def_pager( - char *bs_device); -extern int default_pager_init_flag; +extern boolean_t vm_swap_files_pinned(void); /* * osfmk @@ -79,9 +78,9 @@ extern task_t port_name_to_task( extern ipc_space_t get_task_ipcspace( task_t t); -#if CONFIG_JETSAM +#if CONFIG_MEMORYSTATUS extern int max_task_footprint_mb; /* Per-task limit on physical memory consumption in megabytes */ -#endif // CONFIG_JETSAM +#endif /* CONFIG_MEMORYSTATUS */ /* Some loose-ends VM stuff */ @@ -93,11 +92,16 @@ extern void consider_machine_adjust(void); extern vm_map_offset_t get_map_min(vm_map_t); extern vm_map_offset_t get_map_max(vm_map_t); extern vm_map_size_t get_vmmap_size(vm_map_t); +#if CONFIG_COREDUMP extern int get_vmmap_entries(vm_map_t); +#endif +extern int get_map_nentries(vm_map_t); extern vm_map_offset_t vm_map_page_mask(vm_map_t); +#if CONFIG_COREDUMP extern boolean_t coredumpok(vm_map_t map, vm_offset_t va); +#endif /* * VM routines 
that used to be published to @@ -133,6 +137,9 @@ extern mach_vm_offset_t mach_get_vm_end(vm_map_t); #if CONFIG_CODE_DECRYPTION #define VM_MAP_DEBUG_APPLE_PROTECT MACH_ASSERT +#if VM_MAP_DEBUG_APPLE_PROTECT +extern int vm_map_debug_apple_protect; +#endif /* VM_MAP_DEBUG_APPLE_PROTECT */ struct pager_crypt_info; extern kern_return_t vm_map_apple_protected( vm_map_t map, @@ -166,7 +173,6 @@ extern memory_object_control_t swapfile_pager_control(memory_object_t mem_obj); * bsd */ struct vnode; -extern void vnode_pager_shutdown(void); extern void *upl_get_internal_page_list( upl_t upl); @@ -269,9 +275,6 @@ extern kern_return_t vnode_pager_get_object_mtime( memory_object_t mem_obj, struct timespec *mtime, struct timespec *cs_mtime); -extern kern_return_t vnode_pager_get_object_cs_blobs( - memory_object_t mem_obj, - void **blobs); #if CHECK_CS_VALIDATION_BITMAP extern kern_return_t vnode_pager_cs_check_validation_bitmap( @@ -324,6 +327,9 @@ extern void vnode_pager_vrele( struct vnode *vp); extern void vnode_pager_release_from_cache( int *); +extern struct vnode *vnode_pager_lookup_vnode( + memory_object_t); + extern int ubc_map( struct vnode *vp, int flags); @@ -333,83 +339,6 @@ extern void ubc_unmap( struct vm_map_entry; extern struct vm_object *find_vnode_object(struct vm_map_entry *entry); -extern void dp_memory_object_reference(memory_object_t); -extern void dp_memory_object_deallocate(memory_object_t); -#ifndef _memory_object_server_ -extern kern_return_t dp_memory_object_init(memory_object_t, - memory_object_control_t, - memory_object_cluster_size_t); -extern kern_return_t dp_memory_object_terminate(memory_object_t); -extern kern_return_t dp_memory_object_data_request(memory_object_t, - memory_object_offset_t, - memory_object_cluster_size_t, - vm_prot_t, - memory_object_fault_info_t); -extern kern_return_t dp_memory_object_data_return(memory_object_t, - memory_object_offset_t, - memory_object_cluster_size_t, - memory_object_offset_t *, - int *, - boolean_t, - 
boolean_t, - int); -extern kern_return_t dp_memory_object_data_initialize(memory_object_t, - memory_object_offset_t, - memory_object_cluster_size_t); -extern kern_return_t dp_memory_object_data_unlock(memory_object_t, - memory_object_offset_t, - memory_object_size_t, - vm_prot_t); -extern kern_return_t dp_memory_object_synchronize(memory_object_t, - memory_object_offset_t, - memory_object_size_t, - vm_sync_t); -extern kern_return_t dp_memory_object_map(memory_object_t, - vm_prot_t); -extern kern_return_t dp_memory_object_last_unmap(memory_object_t); -#endif /* _memory_object_server_ */ -#ifndef _memory_object_default_server_ -extern kern_return_t default_pager_memory_object_create( - memory_object_default_t, - vm_size_t, - memory_object_t *); -#endif /* _memory_object_default_server_ */ - -#if CONFIG_FREEZE -extern unsigned int default_pager_swap_pages_free(void); -struct default_freezer_handle; -struct vm_page; -__private_extern__ void default_freezer_init(void); -__private_extern__ struct default_freezer_handle* default_freezer_handle_allocate(void); -__private_extern__ kern_return_t -default_freezer_handle_init( - struct default_freezer_handle *df_handle); -__private_extern__ void -default_freezer_handle_deallocate( - struct default_freezer_handle *df_handle); -__private_extern__ void -default_freezer_pageout( - struct default_freezer_handle *df_handle); -__private_extern__ kern_return_t -default_freezer_pack( - unsigned int *purgeable_count, - unsigned int *wired_count, - unsigned int *clean_count, - unsigned int *dirty_count, - unsigned int dirty_budget, - boolean_t *shared, - vm_object_t src_object, - struct default_freezer_handle *df_handle); -__private_extern__ kern_return_t -default_freezer_unpack( - struct default_freezer_handle *df_handle); -__private_extern__ void -default_freezer_pack_page( - struct vm_page* p, - struct default_freezer_handle *df_handle); - -#endif /* CONFIG_FREEZE */ - extern void device_pager_reference(memory_object_t); extern void 
device_pager_deallocate(memory_object_t); extern kern_return_t device_pager_init(memory_object_t, @@ -453,6 +382,7 @@ extern memory_object_t device_pager_setup( vm_size_t, int); extern void device_pager_bootstrap(void); +extern boolean_t is_device_pager_ops(const struct memory_object_pager_ops *pager_ops); extern kern_return_t pager_map_to_phys_contiguous( memory_object_control_t object, @@ -476,19 +406,25 @@ extern int macx_swapinfo( boolean_t *encrypted_p); extern void log_stack_execution_failure(addr64_t vaddr, vm_prot_t prot); -extern void log_unnest_badness(vm_map_t, vm_map_offset_t, vm_map_offset_t); +extern void log_unnest_badness( + vm_map_t map, + vm_map_offset_t start_unnest, + vm_map_offset_t end_unnest, + boolean_t is_nested_map, + vm_map_offset_t lowest_unnestable_addr); struct proc; extern int cs_allow_invalid(struct proc *p); -extern int cs_invalid_page(addr64_t vaddr); +extern int cs_invalid_page(addr64_t vaddr, boolean_t *cs_killed); #define CS_VALIDATE_TAINTED 0x00000001 #define CS_VALIDATE_NX 0x00000002 -extern boolean_t cs_validate_page(void *blobs, - memory_object_t pager, - memory_object_offset_t offset, - const void *data, - unsigned *result); +extern boolean_t cs_validate_range(struct vnode *vp, + memory_object_t pager, + memory_object_offset_t offset, + const void *data, + vm_size_t size, + unsigned *result); extern kern_return_t mach_memory_entry_purgable_control( ipc_port_t entry_port, @@ -579,6 +515,73 @@ struct vm_counters { }; extern struct vm_counters vm_counters; +#if CONFIG_SECLUDED_MEMORY +struct vm_page_secluded_data { + int eligible_for_secluded; + int grab_success_free; + int grab_success_other; + int grab_failure_locked; + int grab_failure_state; + int grab_failure_dirty; + int grab_for_iokit; + int grab_for_iokit_success; +}; +extern struct vm_page_secluded_data vm_page_secluded; + +extern int num_tasks_can_use_secluded_mem; + +/* boot-args */ +extern int secluded_for_apps; +extern int secluded_for_iokit; +extern int 
secluded_for_filecache; +#if 11 +extern int secluded_for_fbdp; +#endif + +/* + * "secluded_aging_policy" controls the aging of secluded pages: + * + * SECLUDED_AGING_FIFO + * When a page eligible for the secluded queue is activated or + * deactivated, it is inserted in the secluded queue. + * When it gets pushed out of the secluded queue, it gets freed. + * + * SECLUDED_AGING_ALONG_ACTIVE + * When a page eligible for the secluded queue is activated, it is + * inserted in the secluded queue. + * When it gets pushed out of the secluded queue, its "referenced" bit + * is reset and it is inserted in the inactive queue. + * + * SECLUDED_AGING_AFTER_INACTIVE + * A page eligible for the secluded queue first makes its way through the + * active and inactive queues. + * When it is pushed out of the inactive queue without being re-activated, + * it is inserted in the secluded queue instead of being reclaimed. + * When it is pushed out of the secluded queue, it is either freed if it + * hasn't been re-referenced, or re-activated if it has been re-referenced. + * + * SECLUDED_AGING_BEFORE_ACTIVE + * A page eligible for the secluded queue will first make its way through + * the secluded queue. When it gets pushed out of the secluded queue (by + * new secluded pages), it goes back to the normal aging path, through the + * active queue and then the inactive queue. 
+ */ +extern int secluded_aging_policy; +#define SECLUDED_AGING_FIFO 0 +#define SECLUDED_AGING_ALONG_ACTIVE 1 +#define SECLUDED_AGING_AFTER_INACTIVE 2 +#define SECLUDED_AGING_BEFORE_ACTIVE 3 + +extern void memory_object_mark_eligible_for_secluded( + memory_object_control_t control, + boolean_t eligible_for_secluded); + +#endif /* CONFIG_SECLUDED_MEMORY */ + +#ifdef __cplusplus +} +#endif + #endif /* _VM_VM_PROTOS_H_ */ #endif /* XNU_KERNEL_PRIVATE */ diff --git a/osfmk/vm/vm_purgeable.c b/osfmk/vm/vm_purgeable.c index 3c6807cb5..b1d7aeba5 100644 --- a/osfmk/vm/vm_purgeable.c +++ b/osfmk/vm/vm_purgeable.c @@ -23,6 +23,7 @@ #include #include +#include #include @@ -141,9 +142,7 @@ vm_purgeable_token_check_queue(purgeable_q_t queue) kern_return_t vm_purgeable_token_add(purgeable_q_t queue) { -#if MACH_ASSERT - lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); -#endif + LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); /* new token */ token_idx_t token; @@ -299,9 +298,7 @@ vm_purgeable_token_add(purgeable_q_t queue) static token_idx_t vm_purgeable_token_remove_first(purgeable_q_t queue) { -#if MACH_ASSERT - lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); -#endif + LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); token_idx_t token; token = queue->token_q_head; @@ -357,9 +354,7 @@ vm_purgeable_token_remove_first(purgeable_q_t queue) static token_idx_t vm_purgeable_token_remove_last(purgeable_q_t queue) { -#if MACH_ASSERT - lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); -#endif + LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); token_idx_t token; token = queue->token_q_tail; @@ -423,9 +418,7 @@ vm_purgeable_token_remove_last(purgeable_q_t queue) void vm_purgeable_token_delete_first(purgeable_q_t queue) { -#if MACH_ASSERT - lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); -#endif + LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); token_idx_t token = vm_purgeable_token_remove_first(queue); if 
(token) { @@ -439,9 +432,7 @@ vm_purgeable_token_delete_first(purgeable_q_t queue) void vm_purgeable_token_delete_last(purgeable_q_t queue) { -#if MACH_ASSERT - lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); -#endif + LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); token_idx_t token = vm_purgeable_token_remove_last(queue); if (token) { @@ -457,9 +448,7 @@ vm_purgeable_token_delete_last(purgeable_q_t queue) void vm_purgeable_q_advance_all() { -#if MACH_ASSERT - lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); -#endif + LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); /* check queue counters - if they get really large, scale them back. * They tend to get that large when there is no purgeable queue action */ @@ -549,9 +538,7 @@ vm_purgeable_q_advance_all() static void vm_purgeable_token_remove_ripe(purgeable_q_t queue) { -#if MACH_ASSERT - lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); -#endif + LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); assert(queue->token_q_head && tokens[queue->token_q_head].count == 0); /* return token to free list. advance token list. */ token_idx_t new_head = tokens[queue->token_q_head].next; @@ -581,9 +568,7 @@ vm_purgeable_token_remove_ripe(purgeable_q_t queue) static void vm_purgeable_token_choose_and_delete_ripe(purgeable_q_t queue, purgeable_q_t queue2) { -#if MACH_ASSERT - lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); -#endif + LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); assert(queue->token_q_head); if (tokens[queue->token_q_head].count == 0) { @@ -687,7 +672,7 @@ vm_purgeable_object_find_and_lock( best_object = VM_OBJECT_NULL; best_object_task_importance = INT_MAX; - lck_mtx_assert(&vm_purgeable_queue_lock, LCK_MTX_ASSERT_OWNED); + LCK_MTX_ASSERT(&vm_purgeable_queue_lock, LCK_MTX_ASSERT_OWNED); /* * Usually we would pick the first element from a queue. 
However, we * might not be able to get a lock on it, in which case we try the @@ -888,9 +873,7 @@ vm_purgeable_object_purge_one( boolean_t forced_purge; /* Need the page queue lock since we'll be changing the token queue. */ -#if MACH_ASSERT - lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); -#endif + LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); lck_mtx_lock(&vm_purgeable_queue_lock); /* Cycle through all queues */ @@ -1126,7 +1109,7 @@ vm_purgeable_object_remove(vm_object_t object) void vm_purgeable_stats_helper(vm_purgeable_stat_t *stat, purgeable_q_t queue, int group, task_t target_task) { - lck_mtx_assert(&vm_purgeable_queue_lock, LCK_MTX_ASSERT_OWNED); + LCK_MTX_ASSERT(&vm_purgeable_queue_lock, LCK_MTX_ASSERT_OWNED); stat->count = stat->size = 0; vm_object_t object; @@ -1272,7 +1255,7 @@ vm_purgeable_volatile_queue_disown( collisions = 0; again: - lck_mtx_assert(&vm_purgeable_queue_lock, LCK_MTX_ASSERT_OWNED); + LCK_MTX_ASSERT(&vm_purgeable_queue_lock, LCK_MTX_ASSERT_OWNED); for (object = (vm_object_t) queue_first(&queue->objq[group]); !queue_end(&queue->objq[group], (queue_entry_t) object); @@ -1554,11 +1537,11 @@ vm_purgeable_nonvolatile_enqueue( assert(object->purgable == VM_PURGABLE_NONVOLATILE); assert(object->vo_purgeable_owner == NULL); - assert(owner != NULL); lck_mtx_lock(&vm_purgeable_queue_lock); - if (owner->task_purgeable_disowning) { + if (owner != NULL && + owner->task_purgeable_disowning) { /* task is exiting and no longer tracking purgeable objects */ owner = NULL; } @@ -1573,7 +1556,6 @@ vm_purgeable_nonvolatile_enqueue( #endif /* DEBUG */ page_count = object->resident_page_count; - assert(page_count == 0); /* should be a freshly-created object */ if (owner != NULL && page_count != 0) { ledger_credit(owner->ledger, task_ledgers.purgeable_nonvolatile, @@ -1663,9 +1645,8 @@ vm_purgeable_accounting( resident_page_count = object->resident_page_count; wired_page_count = object->wired_page_count; - if 
((COMPRESSED_PAGER_IS_ACTIVE || - DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) && - object->pager != NULL) { + if (VM_CONFIG_COMPRESSOR_IS_PRESENT && + object->pager != NULL) { compressed_page_count = vm_compressor_pager_get_count(object->pager); } else { diff --git a/osfmk/vm/vm_resident.c b/osfmk/vm/vm_resident.c index 2ad202d0e..fd55a91d1 100644 --- a/osfmk/vm/vm_resident.c +++ b/osfmk/vm/vm_resident.c @@ -72,6 +72,7 @@ #include #include #include +#include #include #include #include @@ -103,6 +104,16 @@ #include + +char vm_page_inactive_states[VM_PAGE_Q_STATE_ARRAY_SIZE]; +char vm_page_pageable_states[VM_PAGE_Q_STATE_ARRAY_SIZE]; +char vm_page_non_speculative_pageable_states[VM_PAGE_Q_STATE_ARRAY_SIZE]; +char vm_page_active_or_inactive_states[VM_PAGE_Q_STATE_ARRAY_SIZE]; + +#if CONFIG_SECLUDED_MEMORY +struct vm_page_secluded_data vm_page_secluded; +#endif /* CONFIG_SECLUDED_MEMORY */ + boolean_t hibernate_cleaning_in_progress = FALSE; boolean_t vm_page_free_verify = TRUE; @@ -129,6 +140,8 @@ static vm_page_t vm_page_grab_fictitious_common(ppnum_t phys_addr); static void vm_tag_init(void); uint64_t vm_min_kernel_and_kext_address = VM_MIN_KERNEL_AND_KEXT_ADDRESS; +uint32_t vm_packed_from_vm_pages_array_mask = VM_PACKED_FROM_VM_PAGES_ARRAY; +uint32_t vm_packed_pointer_shift = VM_PACKED_POINTER_SHIFT; /* * Associated with page of user-allocatable memory is a @@ -183,7 +196,6 @@ vm_map_offset_t vm_page_fake_buckets_start, vm_page_fake_buckets_end; #endif /* VM_PAGE_FAKE_BUCKETS */ #endif /* VM_PAGE_BUCKETS_CHECK */ -extern int not_in_kdp; #if MACH_PAGE_HASH_STATS @@ -247,6 +259,9 @@ int page_shift = PAGE_SHIFT; struct vm_page vm_page_template; vm_page_t vm_pages = VM_PAGE_NULL; +vm_page_t vm_page_array_beginning_addr; +vm_page_t vm_page_array_ending_addr; + unsigned int vm_pages_count = 0; ppnum_t vm_page_lowest = 0; @@ -259,11 +274,21 @@ unsigned int vm_colors; unsigned int vm_color_mask; /* mask is == (vm_colors-1) */ unsigned int vm_cache_geometry_colors = 0; /* 
set by hw dependent code during startup */ unsigned int vm_free_magazine_refill_limit = 0; -queue_head_t vm_page_queue_free[MAX_COLORS]; + + +struct vm_page_queue_free_head { + vm_page_queue_head_t qhead; +} __attribute__((aligned(VM_PACKED_POINTER_ALIGNMENT))); + +struct vm_page_queue_free_head vm_page_queue_free[MAX_COLORS]; + + unsigned int vm_page_free_wanted; unsigned int vm_page_free_wanted_privileged; +#if CONFIG_SECLUDED_MEMORY +unsigned int vm_page_free_wanted_secluded; +#endif /* CONFIG_SECLUDED_MEMORY */ unsigned int vm_page_free_count; -unsigned int vm_page_fictitious_count; /* * Occasionally, the virtual memory system uses @@ -274,6 +299,7 @@ unsigned int vm_page_fictitious_count; * These page structures are allocated the way * most other kernel structures are. */ +zone_t vm_page_array_zone; zone_t vm_page_zone; vm_locks_array_t vm_page_locks; decl_lck_mtx_data(,vm_page_alloc_lock) @@ -317,15 +343,37 @@ ppnum_t vm_page_guard_addr = (ppnum_t) -2; * pageout daemon often assignes a higher * importance to anonymous pages (less likely to pick) */ -queue_head_t vm_page_queue_active; -queue_head_t vm_page_queue_inactive; -queue_head_t vm_page_queue_anonymous; /* inactive memory queue for anonymous pages */ -queue_head_t vm_page_queue_throttled; +vm_page_queue_head_t vm_page_queue_active __attribute__((aligned(VM_PACKED_POINTER_ALIGNMENT))); +vm_page_queue_head_t vm_page_queue_inactive __attribute__((aligned(VM_PACKED_POINTER_ALIGNMENT))); +#if CONFIG_SECLUDED_MEMORY +vm_page_queue_head_t vm_page_queue_secluded __attribute__((aligned(VM_PACKED_POINTER_ALIGNMENT))); +#endif /* CONFIG_SECLUDED_MEMORY */ +vm_page_queue_head_t vm_page_queue_anonymous __attribute__((aligned(VM_PACKED_POINTER_ALIGNMENT))); /* inactive memory queue for anonymous pages */ +vm_page_queue_head_t vm_page_queue_throttled __attribute__((aligned(VM_PACKED_POINTER_ALIGNMENT))); queue_head_t vm_objects_wired; +#if CONFIG_BACKGROUND_QUEUE +vm_page_queue_head_t vm_page_queue_background 
__attribute__((aligned(VM_PACKED_POINTER_ALIGNMENT))); +uint32_t vm_page_background_limit; +uint32_t vm_page_background_target; +uint32_t vm_page_background_count; +uint64_t vm_page_background_promoted_count; + +uint32_t vm_page_background_internal_count; +uint32_t vm_page_background_external_count; + +uint32_t vm_page_background_mode; +uint32_t vm_page_background_exclude_external; +#endif + unsigned int vm_page_active_count; unsigned int vm_page_inactive_count; +#if CONFIG_SECLUDED_MEMORY +unsigned int vm_page_secluded_count; +unsigned int vm_page_secluded_count_free; +unsigned int vm_page_secluded_count_inuse; +#endif /* CONFIG_SECLUDED_MEMORY */ unsigned int vm_page_anonymous_count; unsigned int vm_page_throttled_count; unsigned int vm_page_speculative_count; @@ -355,7 +403,7 @@ unsigned int vm_page_speculative_created = 0; unsigned int vm_page_speculative_used = 0; #endif -queue_head_t vm_page_queue_cleaned; +vm_page_queue_head_t vm_page_queue_cleaned __attribute__((aligned(VM_PACKED_POINTER_ALIGNMENT))); unsigned int vm_page_cleaned_count = 0; unsigned int vm_pageout_enqueued_cleaned = 0; @@ -374,6 +422,9 @@ unsigned int vm_page_free_target = 0; unsigned int vm_page_free_min = 0; unsigned int vm_page_throttle_limit = 0; unsigned int vm_page_inactive_target = 0; +#if CONFIG_SECLUDED_MEMORY +unsigned int vm_page_secluded_target = 0; +#endif /* CONFIG_SECLUDED_MEMORY */ unsigned int vm_page_anonymous_min = 0; unsigned int vm_page_inactive_min = 0; unsigned int vm_page_free_reserved = 0; @@ -497,7 +548,7 @@ vm_page_init_local_q() lq = &t_local_q[i].vpl_un.vpl; VPL_LOCK_INIT(lq, &vm_page_lck_grp_local, &vm_page_lck_attr); - queue_init(&lq->vpl_queue); + vm_page_queue_init(&lq->vpl_queue); lq->vpl_count = 0; lq->vpl_internal_count = 0; lq->vpl_external_count = 0; @@ -525,7 +576,7 @@ vm_page_bootstrap( vm_offset_t *startp, vm_offset_t *endp) { - register vm_page_t m; + vm_page_t m; unsigned int i; unsigned int log1; unsigned int log2; @@ -538,31 +589,30 @@ 
vm_page_bootstrap( m = &vm_page_template; bzero(m, sizeof (*m)); - m->pageq.next = NULL; - m->pageq.prev = NULL; - m->listq.next = NULL; - m->listq.prev = NULL; - m->next_m = VM_PAGE_PACK_PTR(VM_PAGE_NULL); +#if CONFIG_BACKGROUND_QUEUE + m->vm_page_backgroundq.next = 0; + m->vm_page_backgroundq.prev = 0; + m->vm_page_in_background = FALSE; + m->vm_page_on_backgroundq = FALSE; +#endif + + VM_PAGE_ZERO_PAGEQ_ENTRY(m); + m->listq.next = 0; + m->listq.prev = 0; + m->next_m = 0; - m->object = VM_OBJECT_NULL; /* reset later */ + m->vm_page_object = 0; /* reset later */ m->offset = (vm_object_offset_t) -1; /* reset later */ m->wire_count = 0; - m->local = FALSE; - m->inactive = FALSE; - m->active = FALSE; - m->pageout_queue = FALSE; - m->speculative = FALSE; + m->vm_page_q_state = VM_PAGE_NOT_ON_Q; m->laundry = FALSE; - m->free = FALSE; m->reference = FALSE; m->gobbled = FALSE; m->private = FALSE; - m->throttled = FALSE; m->__unused_pageq_bits = 0; - m->phys_page = 0; /* reset later */ - + VM_PAGE_SET_PHYS_PAGE(m, 0); /* reset later */ m->busy = TRUE; m->wanted = FALSE; m->tabled = FALSE; @@ -570,7 +620,7 @@ vm_page_bootstrap( m->fictitious = FALSE; m->pmapped = FALSE; m->wpmapped = FALSE; - m->pageout = FALSE; + m->free_when_done = FALSE; m->absent = FALSE; m->error = FALSE; m->dirty = FALSE; @@ -589,7 +639,6 @@ vm_page_bootstrap( m->reusable = FALSE; m->slid = FALSE; m->xpmapped = FALSE; - m->compressor = FALSE; m->written_by_kernel = FALSE; m->__unused_object_bits = 0; @@ -621,34 +670,104 @@ vm_page_bootstrap( queue_init(&purgeable_nonvolatile_queue); for (i = 0; i < MAX_COLORS; i++ ) - queue_init(&vm_page_queue_free[i]); - - queue_init(&vm_lopage_queue_free); - queue_init(&vm_page_queue_active); - queue_init(&vm_page_queue_inactive); - queue_init(&vm_page_queue_cleaned); - queue_init(&vm_page_queue_throttled); - queue_init(&vm_page_queue_anonymous); + vm_page_queue_init(&vm_page_queue_free[i].qhead); + + vm_page_queue_init(&vm_lopage_queue_free); + 
vm_page_queue_init(&vm_page_queue_active); + vm_page_queue_init(&vm_page_queue_inactive); +#if CONFIG_SECLUDED_MEMORY + vm_page_queue_init(&vm_page_queue_secluded); +#endif /* CONFIG_SECLUDED_MEMORY */ + vm_page_queue_init(&vm_page_queue_cleaned); + vm_page_queue_init(&vm_page_queue_throttled); + vm_page_queue_init(&vm_page_queue_anonymous); queue_init(&vm_objects_wired); for ( i = 0; i <= VM_PAGE_MAX_SPECULATIVE_AGE_Q; i++ ) { - queue_init(&vm_page_queue_speculative[i].age_q); + vm_page_queue_init(&vm_page_queue_speculative[i].age_q); vm_page_queue_speculative[i].age_ts.tv_sec = 0; vm_page_queue_speculative[i].age_ts.tv_nsec = 0; } +#if CONFIG_BACKGROUND_QUEUE + vm_page_queue_init(&vm_page_queue_background); + + vm_page_background_count = 0; + vm_page_background_internal_count = 0; + vm_page_background_external_count = 0; + vm_page_background_promoted_count = 0; + + vm_page_background_target = (unsigned int)(atop_64(max_mem) / 25); + + if (vm_page_background_target > VM_PAGE_BACKGROUND_TARGET_MAX) + vm_page_background_target = VM_PAGE_BACKGROUND_TARGET_MAX; + vm_page_background_limit = vm_page_background_target + 256; + + vm_page_background_mode = VM_PAGE_BG_LEVEL_1; + vm_page_background_exclude_external = 0; + + PE_parse_boot_argn("vm_page_bg_mode", &vm_page_background_mode, sizeof(vm_page_background_mode)); + PE_parse_boot_argn("vm_page_bg_exclude_external", &vm_page_background_exclude_external, sizeof(vm_page_background_exclude_external)); + PE_parse_boot_argn("vm_page_bg_target", &vm_page_background_target, sizeof(vm_page_background_target)); + PE_parse_boot_argn("vm_page_bg_limit", &vm_page_background_limit, sizeof(vm_page_background_limit)); + + if (vm_page_background_mode > VM_PAGE_BG_LEVEL_3) + vm_page_background_mode = VM_PAGE_BG_LEVEL_1; + + if (vm_page_background_limit <= vm_page_background_target) + vm_page_background_limit = vm_page_background_target + 256; +#endif vm_page_free_wanted = 0; vm_page_free_wanted_privileged = 0; +#if 
CONFIG_SECLUDED_MEMORY + vm_page_free_wanted_secluded = 0; +#endif /* CONFIG_SECLUDED_MEMORY */ vm_page_set_colors(); + bzero(vm_page_inactive_states, sizeof(vm_page_inactive_states)); + vm_page_inactive_states[VM_PAGE_ON_INACTIVE_INTERNAL_Q] = 1; + vm_page_inactive_states[VM_PAGE_ON_INACTIVE_EXTERNAL_Q] = 1; + vm_page_inactive_states[VM_PAGE_ON_INACTIVE_CLEANED_Q] = 1; + + bzero(vm_page_pageable_states, sizeof(vm_page_pageable_states)); + vm_page_pageable_states[VM_PAGE_ON_INACTIVE_INTERNAL_Q] = 1; + vm_page_pageable_states[VM_PAGE_ON_INACTIVE_EXTERNAL_Q] = 1; + vm_page_pageable_states[VM_PAGE_ON_INACTIVE_CLEANED_Q] = 1; + vm_page_pageable_states[VM_PAGE_ON_ACTIVE_Q] = 1; + vm_page_pageable_states[VM_PAGE_ON_SPECULATIVE_Q] = 1; + vm_page_pageable_states[VM_PAGE_ON_THROTTLED_Q] = 1; +#if CONFIG_SECLUDED_MEMORY + vm_page_pageable_states[VM_PAGE_ON_SECLUDED_Q] = 1; +#endif /* CONFIG_SECLUDED_MEMORY */ + + bzero(vm_page_non_speculative_pageable_states, sizeof(vm_page_non_speculative_pageable_states)); + vm_page_non_speculative_pageable_states[VM_PAGE_ON_INACTIVE_INTERNAL_Q] = 1; + vm_page_non_speculative_pageable_states[VM_PAGE_ON_INACTIVE_EXTERNAL_Q] = 1; + vm_page_non_speculative_pageable_states[VM_PAGE_ON_INACTIVE_CLEANED_Q] = 1; + vm_page_non_speculative_pageable_states[VM_PAGE_ON_ACTIVE_Q] = 1; + vm_page_non_speculative_pageable_states[VM_PAGE_ON_THROTTLED_Q] = 1; +#if CONFIG_SECLUDED_MEMORY + vm_page_non_speculative_pageable_states[VM_PAGE_ON_SECLUDED_Q] = 1; +#endif /* CONFIG_SECLUDED_MEMORY */ + + bzero(vm_page_active_or_inactive_states, sizeof(vm_page_active_or_inactive_states)); + vm_page_active_or_inactive_states[VM_PAGE_ON_INACTIVE_INTERNAL_Q] = 1; + vm_page_active_or_inactive_states[VM_PAGE_ON_INACTIVE_EXTERNAL_Q] = 1; + vm_page_active_or_inactive_states[VM_PAGE_ON_INACTIVE_CLEANED_Q] = 1; + vm_page_active_or_inactive_states[VM_PAGE_ON_ACTIVE_Q] = 1; +#if CONFIG_SECLUDED_MEMORY + vm_page_active_or_inactive_states[VM_PAGE_ON_SECLUDED_Q] = 1; +#endif /* 
CONFIG_SECLUDED_MEMORY */ + /* * Steal memory for the map and zone subsystems. */ - kernel_debug_string_simple("zone_steal_memory"); - zone_steal_memory(); - kernel_debug_string_simple("vm_map_steal_memory"); +#if CONFIG_GZALLOC + gzalloc_configure(); +#endif + kernel_debug_string_early("vm_map_steal_memory"); vm_map_steal_memory(); /* @@ -718,18 +837,18 @@ vm_page_bootstrap( #endif /* VM_PAGE_FAKE_BUCKETS */ #endif /* VM_PAGE_BUCKETS_CHECK */ - kernel_debug_string_simple("vm_page_buckets"); + kernel_debug_string_early("vm_page_buckets"); vm_page_buckets = (vm_page_bucket_t *) pmap_steal_memory(vm_page_bucket_count * sizeof(vm_page_bucket_t)); - kernel_debug_string_simple("vm_page_bucket_locks"); + kernel_debug_string_early("vm_page_bucket_locks"); vm_page_bucket_locks = (lck_spin_t *) pmap_steal_memory(vm_page_bucket_lock_count * sizeof(lck_spin_t)); for (i = 0; i < vm_page_bucket_count; i++) { - register vm_page_bucket_t *bucket = &vm_page_buckets[i]; + vm_page_bucket_t *bucket = &vm_page_buckets[i]; bucket->page_list = VM_PAGE_PACK_PTR(VM_PAGE_NULL); #if MACH_PAGE_HASH_STATS @@ -757,7 +876,7 @@ vm_page_bootstrap( * to get the alignment right. 
*/ - kernel_debug_string_simple("pmap_startup"); + kernel_debug_string_early("pmap_startup"); pmap_startup(&virtual_space_start, &virtual_space_end); virtual_space_start = round_page(virtual_space_start); virtual_space_end = trunc_page(virtual_space_end); @@ -774,13 +893,16 @@ vm_page_bootstrap( */ assert((unsigned int) atop_64(max_mem) == atop_64(max_mem)); vm_page_wire_count = ((unsigned int) atop_64(max_mem)) - vm_page_free_count - vm_lopage_free_count; /* initial value */ +#if CONFIG_SECLUDED_MEMORY + vm_page_wire_count -= vm_page_secluded_count; +#endif vm_page_wire_count_initial = vm_page_wire_count; vm_page_pages_initial = vm_page_pages; printf("vm_page_bootstrap: %d free pages and %d wired pages\n", vm_page_free_count, vm_page_wire_count); - kernel_debug_string_simple("vm_page_bootstrap complete"); + kernel_debug_string_early("vm_page_bootstrap complete"); simple_lock_init(&vm_paging_lock, 0); } @@ -838,7 +960,7 @@ pmap_steal_memory( vaddr += PAGE_SIZE) { if (!pmap_next_page_hi(&phys_page)) - panic("pmap_steal_memory"); + panic("pmap_steal_memory() size: 0x%llx\n", (uint64_t)size); /* * XXX Logically, these mappings should be wired, @@ -861,6 +983,21 @@ pmap_steal_memory( return (void *) addr; } +#if CONFIG_SECLUDED_MEMORY +/* boot-args to control secluded memory */ +unsigned int secluded_mem_mb = 0; /* # of MBs of RAM to seclude */ +int secluded_for_iokit = 1; /* IOKit can use secluded memory */ +int secluded_for_apps = 1; /* apps can use secluded memory */ +int secluded_for_filecache = 2; /* filecache can use seclude memory */ +#if 11 +int secluded_for_fbdp = 0; +#endif +int secluded_aging_policy = SECLUDED_AGING_BEFORE_ACTIVE; +#endif /* CONFIG_SECLUDED_MEMORY */ + + + + void vm_page_release_startup(vm_page_t mem); void pmap_startup( @@ -871,13 +1008,7 @@ pmap_startup( ppnum_t phys_page; addr64_t tmpaddr; - #if defined(__LP64__) - /* - * struct vm_page must be of size 64 due to VM_PAGE_PACK_PTR use - */ - assert(sizeof(struct vm_page) == 64); - /* * make 
sure we are aligned on a 64 byte boundary * for VM_PAGE_PACK_PTR (it clips off the low-order @@ -901,7 +1032,12 @@ pmap_startup( /* * Initialize the page frames. */ - kernel_debug_string_simple("Initialize the page frames"); + kernel_debug_string_early("Initialize the page frames"); + + vm_page_array_beginning_addr = &vm_pages[0]; + vm_page_array_ending_addr = &vm_pages[npages]; + + for (i = 0, pages_initialized = 0; i < npages; i++) { if (!pmap_next_page(&phys_page)) break; @@ -916,13 +1052,13 @@ pmap_startup( #if defined(__LP64__) - if (VM_PAGE_UNPACK_PTR(VM_PAGE_PACK_PTR(&vm_pages[0])) != &vm_pages[0]) + if ((vm_page_t)(VM_PAGE_UNPACK_PTR(VM_PAGE_PACK_PTR(&vm_pages[0]))) != &vm_pages[0]) panic("VM_PAGE_PACK_PTR failed on &vm_pages[0] - %p", (void *)&vm_pages[0]); - if (VM_PAGE_UNPACK_PTR(VM_PAGE_PACK_PTR(&vm_pages[vm_pages_count-1])) != &vm_pages[vm_pages_count-1]) + if ((vm_page_t)(VM_PAGE_UNPACK_PTR(VM_PAGE_PACK_PTR(&vm_pages[vm_pages_count-1]))) != &vm_pages[vm_pages_count-1]) panic("VM_PAGE_PACK_PTR failed on &vm_pages[vm_pages_count-1] - %p", (void *)&vm_pages[vm_pages_count-1]); #endif - kernel_debug_string_simple("page fill/release"); + kernel_debug_string_early("page fill/release"); /* * Check if we want to initialize pages to a known value */ @@ -940,11 +1076,48 @@ pmap_startup( #endif if (fill) kprintf("Filling vm_pages with pattern: 0x%x\n", fillval); + +#if CONFIG_SECLUDED_MEMORY + /* default: no secluded mem */ + secluded_mem_mb = 0; + if (max_mem > 1*1024*1024*1024) { + /* default to 90MB for devices with > 1GB of RAM */ + secluded_mem_mb = 90; + } + /* override with value from device tree, if provided */ + PE_get_default("kern.secluded_mem_mb", + &secluded_mem_mb, sizeof(secluded_mem_mb)); + /* override with value from boot-args, if provided */ + PE_parse_boot_argn("secluded_mem_mb", + &secluded_mem_mb, + sizeof (secluded_mem_mb)); + + vm_page_secluded_target = (unsigned int) + ((secluded_mem_mb * 1024ULL * 1024ULL) / PAGE_SIZE); + 
PE_parse_boot_argn("secluded_for_iokit", + &secluded_for_iokit, + sizeof (secluded_for_iokit)); + PE_parse_boot_argn("secluded_for_apps", + &secluded_for_apps, + sizeof (secluded_for_apps)); + PE_parse_boot_argn("secluded_for_filecache", + &secluded_for_filecache, + sizeof (secluded_for_filecache)); +#if 11 + PE_parse_boot_argn("secluded_for_fbdp", + &secluded_for_fbdp, + sizeof (secluded_for_fbdp)); +#endif + PE_parse_boot_argn("secluded_aging_policy", + &secluded_aging_policy, + sizeof (secluded_aging_policy)); +#endif /* CONFIG_SECLUDED_MEMORY */ + // -debug code remove if (2 == vm_himemory_mode) { // free low -> high so high is preferred for (i = 1; i <= pages_initialized; i++) { - if(fill) fillPage(vm_pages[i - 1].phys_page, fillval); /* Fill the page with a know value if requested at boot */ + if(fill) fillPage(VM_PAGE_GET_PHYS_PAGE(&vm_pages[i - 1]), fillval); /* Fill the page with a know value if requested at boot */ vm_page_release_startup(&vm_pages[i - 1]); } } @@ -958,7 +1131,7 @@ pmap_startup( * they require several consecutive pages. 
*/ for (i = pages_initialized; i > 0; i--) { - if(fill) fillPage(vm_pages[i - 1].phys_page, fillval); /* Fill the page with a know value if requested at boot */ + if(fill) fillPage(VM_PAGE_GET_PHYS_PAGE(&vm_pages[i - 1]), fillval); /* Fill the page with a know value if requested at boot */ vm_page_release_startup(&vm_pages[i - 1]); } @@ -973,7 +1146,7 @@ pmap_startup( xxl = 0; for( i = 0; i < vm_colors; i++ ) { - queue_iterate(&vm_page_queue_free[i], + queue_iterate(&vm_page_queue_free[i].qhead, xx, vm_page_t, pageq) { /* BRINGUP */ @@ -987,7 +1160,7 @@ pmap_startup( if(((j - 1) & 0xFFFF) == 0) kprintf("checking number %d of %d\n", j, vm_page_free_count); - for(xxo = xx->pageq.next; xxo != &vm_page_queue_free[i]; xxo = xxo->pageq.next) { /* (BRINGUP) */ + for(xxo = xx->pageq.next; xxo != &vm_page_queue_free[i].qhead; xxo = xxo->pageq.next) { /* (BRINGUP) */ k++; if(k > l) panic("pmap_startup: too many in secondary check %d %d\n", k, l); if((xx->phys_page & 0xFFFFFFFF) == (xxo->phys_page & 0xFFFFFFFF)) { /* (BRINGUP) */ @@ -1027,31 +1200,40 @@ pmap_startup( void vm_page_module_init(void) { - uint64_t vm_page_zone_pages, vm_page_zone_data_size; - vm_page_zone = zinit((vm_size_t) sizeof(struct vm_page), - 0, PAGE_SIZE, "vm pages"); + uint64_t vm_page_zone_pages, vm_page_array_zone_data_size; + vm_size_t vm_page_with_ppnum_size; -#if ZONE_DEBUG - zone_debug_disable(vm_page_zone); -#endif /* ZONE_DEBUG */ + vm_page_array_zone = zinit((vm_size_t) sizeof(struct vm_page), + 0, PAGE_SIZE, "vm pages array"); - zone_change(vm_page_zone, Z_CALLERACCT, FALSE); - zone_change(vm_page_zone, Z_EXPAND, FALSE); - zone_change(vm_page_zone, Z_EXHAUST, TRUE); - zone_change(vm_page_zone, Z_FOREIGN, TRUE); - zone_change(vm_page_zone, Z_GZALLOC_EXEMPT, TRUE); + zone_change(vm_page_array_zone, Z_CALLERACCT, FALSE); + zone_change(vm_page_array_zone, Z_EXPAND, FALSE); + zone_change(vm_page_array_zone, Z_EXHAUST, TRUE); + zone_change(vm_page_array_zone, Z_FOREIGN, TRUE); + 
zone_change(vm_page_array_zone, Z_GZALLOC_EXEMPT, TRUE); /* * Adjust zone statistics to account for the real pages allocated * in vm_page_create(). [Q: is this really what we want?] */ - vm_page_zone->count += vm_page_pages; - vm_page_zone->sum_count += vm_page_pages; - vm_page_zone_data_size = vm_page_pages * vm_page_zone->elem_size; - vm_page_zone->cur_size += vm_page_zone_data_size; - vm_page_zone_pages = ((round_page(vm_page_zone_data_size)) / PAGE_SIZE); - OSAddAtomic64(vm_page_zone_pages, &(vm_page_zone->page_count)); + vm_page_array_zone->count += vm_page_pages; + vm_page_array_zone->sum_count += vm_page_pages; + vm_page_array_zone_data_size = vm_page_pages * vm_page_array_zone->elem_size; + vm_page_array_zone->cur_size += vm_page_array_zone_data_size; + vm_page_zone_pages = ((round_page(vm_page_array_zone_data_size)) / PAGE_SIZE); + OSAddAtomic64(vm_page_zone_pages, &(vm_page_array_zone->page_count)); /* since zone accounts for these, take them out of stolen */ VM_PAGE_MOVE_STOLEN(vm_page_zone_pages); + + vm_page_with_ppnum_size = (sizeof(struct vm_page_with_ppnum) + (VM_PACKED_POINTER_ALIGNMENT-1)) & ~(VM_PACKED_POINTER_ALIGNMENT - 1); + + vm_page_zone = zinit(vm_page_with_ppnum_size, + 0, PAGE_SIZE, "vm pages"); + + zone_change(vm_page_zone, Z_CALLERACCT, FALSE); + zone_change(vm_page_zone, Z_EXPAND, FALSE); + zone_change(vm_page_zone, Z_EXHAUST, TRUE); + zone_change(vm_page_zone, Z_FOREIGN, TRUE); + zone_change(vm_page_zone, Z_GZALLOC_EXEMPT, TRUE); } /* @@ -1082,7 +1264,7 @@ vm_page_create( pmap_clear_noencrypt(phys_page); vm_page_pages++; - vm_page_release(m); + vm_page_release(m, FALSE); } } @@ -1161,41 +1343,35 @@ vm_page_insert_internal( assert(object != vm_submap_object); vm_object_lock_assert_exclusive(object); -#if DEBUG - lck_mtx_assert(&vm_page_queue_lock, + LCK_MTX_ASSERT(&vm_page_queue_lock, queues_lock_held ? 
LCK_MTX_ASSERT_OWNED : LCK_MTX_ASSERT_NOTOWNED); -#endif /* DEBUG */ + if (queues_lock_held == FALSE) + assert(!VM_PAGE_PAGEABLE(mem)); if (insert_in_hash == TRUE) { #if DEBUG || VM_PAGE_CHECK_BUCKETS - if (mem->tabled || mem->object != VM_OBJECT_NULL) + if (mem->tabled || mem->vm_page_object) panic("vm_page_insert: page %p for (obj=%p,off=0x%llx) " "already in (obj=%p,off=0x%llx)", - mem, object, offset, mem->object, mem->offset); + mem, object, offset, VM_PAGE_OBJECT(mem), mem->offset); #endif assert(!object->internal || offset < object->vo_size); - - /* only insert "pageout" pages into "pageout" objects, - * and normal pages into normal objects */ -#if 00 - /* - * For some reason, this assertion gets tripped - * but it's mostly harmless, so let's disable it - * for now. - */ - assert(object->pageout == mem->pageout); -#endif /* 00 */ - assert(vm_page_lookup(object, offset) == VM_PAGE_NULL); /* * Record the object/offset pair in this page */ - mem->object = object; + mem->vm_page_object = VM_PAGE_PACK_OBJECT(object); mem->offset = offset; +#if CONFIG_SECLUDED_MEMORY + if (object->eligible_for_secluded) { + vm_page_secluded.eligible_for_secluded++; + } +#endif /* CONFIG_SECLUDED_MEMORY */ + /* * Insert it into the object_object/offset hash table */ @@ -1207,7 +1383,7 @@ vm_page_insert_internal( mem->next_m = bucket->page_list; bucket->page_list = VM_PAGE_PACK_PTR(mem); - assert(mem == VM_PAGE_UNPACK_PTR(bucket->page_list)); + assert(mem == (vm_page_t)(VM_PAGE_UNPACK_PTR(bucket->page_list))); #if MACH_PAGE_HASH_STATS if (++bucket->cur_count > bucket->hi_count) @@ -1229,7 +1405,7 @@ vm_page_insert_internal( /* * Now link into the object's list of backed pages. 
*/ - queue_enter(&object->memq, mem, vm_page_t, listq); + vm_page_queue_enter(&object->memq, mem, vm_page_t, listq); object->memq_hint = mem; mem->tabled = TRUE; @@ -1239,6 +1415,8 @@ vm_page_insert_internal( object->resident_page_count++; if (VM_PAGE_WIRED(mem)) { + assert(mem->wire_count > 0); + if (!mem->private && !mem->fictitious) { if (!object->wired_page_count) @@ -1273,7 +1451,7 @@ vm_page_insert_internal( * allocation. */ assert(!mem->reusable); - if (mem->object->all_reusable) { + if (object->all_reusable) { OSAddAtomic(+1, &vm_page_stats_reusable.reusable_count); } @@ -1316,7 +1494,7 @@ vm_page_insert_internal( OSAddAtomic(+1, &vm_page_purgeable_count); } } else if (object->purgable == VM_PURGABLE_EMPTY && - mem->throttled) { + mem->vm_page_q_state == VM_PAGE_ON_THROTTLED_Q) { /* * This page belongs to a purged VM object but hasn't * been purged (because it was "busy"). @@ -1362,9 +1540,9 @@ vm_page_insert_internal( */ void vm_page_replace( - register vm_page_t mem, - register vm_object_t object, - register vm_object_offset_t offset) + vm_page_t mem, + vm_object_t object, + vm_object_offset_t offset) { vm_page_bucket_t *bucket; vm_page_t found_m = VM_PAGE_NULL; @@ -1380,17 +1558,19 @@ vm_page_replace( #endif vm_object_lock_assert_exclusive(object); #if DEBUG || VM_PAGE_CHECK_BUCKETS - if (mem->tabled || mem->object != VM_OBJECT_NULL) + if (mem->tabled || mem->vm_page_object) panic("vm_page_replace: page %p for (obj=%p,off=0x%llx) " "already in (obj=%p,off=0x%llx)", - mem, object, offset, mem->object, mem->offset); - lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED); + mem, object, offset, VM_PAGE_OBJECT(mem), mem->offset); #endif + LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED); + + assert(!VM_PAGE_PAGEABLE(mem)); + /* * Record the object/offset pair in this page */ - - mem->object = object; + mem->vm_page_object = VM_PAGE_PACK_OBJECT(object); mem->offset = offset; /* @@ -1406,10 +1586,13 @@ vm_page_replace( if 
(bucket->page_list) { vm_page_packed_t *mp = &bucket->page_list; - vm_page_t m = VM_PAGE_UNPACK_PTR(*mp); + vm_page_t m = (vm_page_t)(VM_PAGE_UNPACK_PTR(*mp)); do { - if (m->object == object && m->offset == offset) { + /* + * compare packed object pointers + */ + if (m->vm_page_object == mem->vm_page_object && m->offset == offset) { /* * Remove old page from hash list */ @@ -1420,11 +1603,11 @@ vm_page_replace( break; } mp = &m->next_m; - } while ((m = VM_PAGE_UNPACK_PTR(*mp))); + } while ((m = (vm_page_t)(VM_PAGE_UNPACK_PTR(*mp)))); mem->next_m = bucket->page_list; } else { - mem->next_m = VM_PAGE_PACK_PTR(VM_PAGE_NULL); + mem->next_m = VM_PAGE_PACK_PTR(NULL); } /* * insert new page at head of hash list @@ -1464,16 +1647,23 @@ vm_page_remove( lck_spin_t *bucket_lock; int hash_id; task_t owner; + vm_object_t m_object; + + m_object = VM_PAGE_OBJECT(mem); XPR(XPR_VM_PAGE, "vm_page_remove, object 0x%X offset 0x%X page 0x%X\n", - mem->object, mem->offset, + m_object, mem->offset, mem, 0,0); - vm_object_lock_assert_exclusive(mem->object); + vm_object_lock_assert_exclusive(m_object); assert(mem->tabled); assert(!mem->cleaning); assert(!mem->laundry); + + if (VM_PAGE_PAGEABLE(mem)) { + LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); + } #if 0 /* * we don't hold the page queue lock @@ -1485,13 +1675,13 @@ vm_page_remove( /* * Remove from the object_object/offset hash table */ - hash_id = vm_page_hash(mem->object, mem->offset); + hash_id = vm_page_hash(m_object, mem->offset); bucket = &vm_page_buckets[hash_id]; bucket_lock = &vm_page_bucket_locks[hash_id / BUCKETS_PER_LOCK]; lck_spin_lock(bucket_lock); - if ((this = VM_PAGE_UNPACK_PTR(bucket->page_list)) == mem) { + if ((this = (vm_page_t)(VM_PAGE_UNPACK_PTR(bucket->page_list))) == mem) { /* optimize for common case */ bucket->page_list = mem->next_m; @@ -1499,7 +1689,7 @@ vm_page_remove( vm_page_packed_t *prev; for (prev = &this->next_m; - (this = VM_PAGE_UNPACK_PTR(*prev)) != mem; + (this = 
(vm_page_t)(VM_PAGE_UNPACK_PTR(*prev))) != mem; prev = &this->next_m) continue; *prev = this->next_m; @@ -1521,10 +1711,10 @@ vm_page_remove( * page. */ - assert(mem->object->resident_page_count > 0); - mem->object->resident_page_count--; + assert(m_object->resident_page_count > 0); + m_object->resident_page_count--; - if (mem->object->internal) { + if (m_object->internal) { #if DEBUG assert(vm_page_internal_count); #endif /* DEBUG */ @@ -1539,40 +1729,41 @@ vm_page_remove( OSAddAtomic(-1, &vm_page_xpmapped_external_count); } } - if (!mem->object->internal && (mem->object->objq.next || mem->object->objq.prev)) { - if (mem->object->resident_page_count == 0) - vm_object_cache_remove(mem->object); + if (!m_object->internal && (m_object->objq.next || m_object->objq.prev)) { + if (m_object->resident_page_count == 0) + vm_object_cache_remove(m_object); } if (VM_PAGE_WIRED(mem)) { - assert(mem->object->wired_page_count > 0); - mem->object->wired_page_count--; - if (!mem->object->wired_page_count) { - VM_OBJECT_UNWIRED(mem->object); + assert(mem->wire_count > 0); + assert(m_object->wired_page_count > 0); + m_object->wired_page_count--; + if (!m_object->wired_page_count) { + VM_OBJECT_UNWIRED(m_object); } } - assert(mem->object->resident_page_count >= - mem->object->wired_page_count); + assert(m_object->resident_page_count >= + m_object->wired_page_count); if (mem->reusable) { - assert(mem->object->reusable_page_count > 0); - mem->object->reusable_page_count--; - assert(mem->object->reusable_page_count <= - mem->object->resident_page_count); + assert(m_object->reusable_page_count > 0); + m_object->reusable_page_count--; + assert(m_object->reusable_page_count <= + m_object->resident_page_count); mem->reusable = FALSE; OSAddAtomic(-1, &vm_page_stats_reusable.reusable_count); vm_page_stats_reusable.reused_remove++; - } else if (mem->object->all_reusable) { + } else if (m_object->all_reusable) { OSAddAtomic(-1, &vm_page_stats_reusable.reusable_count); 
vm_page_stats_reusable.reused_remove++; } - if (mem->object->purgable == VM_PURGABLE_DENY) { + if (m_object->purgable == VM_PURGABLE_DENY) { owner = TASK_NULL; } else { - owner = mem->object->vo_purgeable_owner; + owner = m_object->vo_purgeable_owner; } if (owner && - (mem->object->purgable == VM_PURGABLE_NONVOLATILE || + (m_object->purgable == VM_PURGABLE_NONVOLATILE || VM_PAGE_WIRED(mem))) { /* less non-volatile bytes */ ledger_debit(owner->ledger, @@ -1583,15 +1774,15 @@ vm_page_remove( task_ledgers.phys_footprint, PAGE_SIZE); } else if (owner && - (mem->object->purgable == VM_PURGABLE_VOLATILE || - mem->object->purgable == VM_PURGABLE_EMPTY)) { + (m_object->purgable == VM_PURGABLE_VOLATILE || + m_object->purgable == VM_PURGABLE_EMPTY)) { assert(! VM_PAGE_WIRED(mem)); /* less volatile bytes */ ledger_debit(owner->ledger, task_ledgers.purgeable_volatile, PAGE_SIZE); } - if (mem->object->purgable == VM_PURGABLE_VOLATILE) { + if (m_object->purgable == VM_PURGABLE_VOLATILE) { if (VM_PAGE_WIRED(mem)) { assert(vm_page_purgeable_wired_count > 0); OSAddAtomic(-1, &vm_page_purgeable_wired_count); @@ -1600,11 +1791,11 @@ vm_page_remove( OSAddAtomic(-1, &vm_page_purgeable_count); } } - if (mem->object->set_cache_attr == TRUE) - pmap_set_cache_attributes(mem->phys_page, 0); + if (m_object->set_cache_attr == TRUE) + pmap_set_cache_attributes(VM_PAGE_GET_PHYS_PAGE(mem), 0); mem->tabled = FALSE; - mem->object = VM_OBJECT_NULL; + mem->vm_page_object = 0; mem->offset = (vm_object_offset_t) -1; } @@ -1654,7 +1845,7 @@ kdp_vm_page_lookup( panic("panic: kdp_vm_page_lookup done outside of kernel debugger"); } - queue_iterate(&object->memq, cur_page, vm_page_t, listq) { + vm_page_queue_iterate(&object->memq, cur_page, vm_page_t, listq) { if (cur_page->offset == offset) { return cur_page; } @@ -1675,7 +1866,7 @@ vm_page_lookup( { vm_page_t mem; vm_page_bucket_t *bucket; - queue_entry_t qe; + vm_page_queue_entry_t qe; lck_spin_t *bucket_lock = NULL; int hash_id; #if 
DEBUG_VM_PAGE_LOOKUP @@ -1695,7 +1886,7 @@ vm_page_lookup( mem = object->memq_hint; if (mem != VM_PAGE_NULL) { - assert(mem->object == object); + assert(VM_PAGE_OBJECT(mem) == object); if (mem->offset == offset) { #if DEBUG_VM_PAGE_LOOKUP @@ -1703,13 +1894,13 @@ vm_page_lookup( #endif return (mem); } - qe = queue_next(&mem->listq); + qe = (vm_page_queue_entry_t)vm_page_queue_next(&mem->listq); - if (! queue_end(&object->memq, qe)) { + if (! vm_page_queue_end(&object->memq, qe)) { vm_page_t next_page; - next_page = (vm_page_t) qe; - assert(next_page->object == object); + next_page = (vm_page_t)((uintptr_t)qe); + assert(VM_PAGE_OBJECT(next_page) == object); if (next_page->offset == offset) { object->memq_hint = next_page; /* new hint */ @@ -1719,13 +1910,13 @@ vm_page_lookup( return (next_page); } } - qe = queue_prev(&mem->listq); + qe = (vm_page_queue_entry_t)vm_page_queue_prev(&mem->listq); - if (! queue_end(&object->memq, qe)) { + if (! vm_page_queue_end(&object->memq, qe)) { vm_page_t prev_page; - prev_page = (vm_page_t) qe; - assert(prev_page->object == object); + prev_page = (vm_page_t)((uintptr_t)qe); + assert(VM_PAGE_OBJECT(prev_page) == object); if (prev_page->offset == offset) { object->memq_hint = prev_page; /* new hint */ @@ -1765,24 +1956,29 @@ vm_page_lookup( * on average, it's roughly 3 times faster to run a short memq list * than to take the spin lock and go through the hash list */ - mem = (vm_page_t)queue_first(&object->memq); + mem = (vm_page_t)vm_page_queue_first(&object->memq); - while (!queue_end(&object->memq, (queue_entry_t)mem)) { + while (!vm_page_queue_end(&object->memq, (vm_page_queue_entry_t)mem)) { if (mem->offset == offset) break; - mem = (vm_page_t)queue_next(&mem->listq); + mem = (vm_page_t)vm_page_queue_next(&mem->listq); } - if (queue_end(&object->memq, (queue_entry_t)mem)) + if (vm_page_queue_end(&object->memq, (vm_page_queue_entry_t)mem)) mem = NULL; } else { + vm_page_object_t packed_object; + + packed_object = 
VM_PAGE_PACK_OBJECT(object); bucket_lock = &vm_page_bucket_locks[hash_id / BUCKETS_PER_LOCK]; lck_spin_lock(bucket_lock); - for (mem = VM_PAGE_UNPACK_PTR(bucket->page_list); mem != VM_PAGE_NULL; mem = VM_PAGE_UNPACK_PTR(mem->next_m)) { + for (mem = (vm_page_t)(VM_PAGE_UNPACK_PTR(bucket->page_list)); + mem != VM_PAGE_NULL; + mem = (vm_page_t)(VM_PAGE_UNPACK_PTR(mem->next_m))) { #if 0 /* * we don't hold the page queue lock @@ -1790,7 +1986,7 @@ vm_page_lookup( */ VM_PAGE_CHECK(mem); #endif - if ((mem->object == object) && (mem->offset == offset)) + if ((mem->vm_page_object == packed_object) && (mem->offset == offset)) break; } lck_spin_unlock(bucket_lock); @@ -1812,7 +2008,7 @@ vm_page_lookup( OSAddAtomic64(1, &vm_page_lookup_stats.vpl_miss); #endif if (mem != VM_PAGE_NULL) { - assert(mem->object == object); + assert(VM_PAGE_OBJECT(mem) == object); object->memq_hint = mem; } @@ -1830,17 +2026,19 @@ vm_page_lookup( */ void vm_page_rename( - register vm_page_t mem, - register vm_object_t new_object, - vm_object_offset_t new_offset, - boolean_t encrypted_ok) + vm_page_t mem, + vm_object_t new_object, + vm_object_offset_t new_offset, + boolean_t encrypted_ok) { - boolean_t internal_to_external, external_to_internal; - vm_tag_t tag; + boolean_t internal_to_external, external_to_internal; + vm_tag_t tag; + vm_object_t m_object; - assert(mem->object != new_object); + m_object = VM_PAGE_OBJECT(mem); - assert(mem->object); + assert(m_object != new_object); + assert(m_object); /* * ENCRYPTED SWAP: @@ -1873,26 +2071,26 @@ vm_page_rename( internal_to_external = FALSE; external_to_internal = FALSE; - if (mem->local) { + if (mem->vm_page_q_state == VM_PAGE_ON_ACTIVE_LOCAL_Q) { /* * it's much easier to get the vm_page_pageable_xxx accounting correct * if we first move the page to the active queue... it's going to end * up there anyway, and we don't do vm_page_rename's frequently enough * for this to matter. 
*/ - vm_page_queues_remove(mem); + vm_page_queues_remove(mem, FALSE); vm_page_activate(mem); } - if (mem->active || mem->inactive || mem->speculative) { - if (mem->object->internal && !new_object->internal) { + if (VM_PAGE_PAGEABLE(mem)) { + if (m_object->internal && !new_object->internal) { internal_to_external = TRUE; } - if (!mem->object->internal && new_object->internal) { + if (!m_object->internal && new_object->internal) { external_to_internal = TRUE; } } - tag = mem->object->wire_tag; + tag = m_object->wire_tag; vm_page_remove(mem, TRUE); vm_page_insert_internal(mem, new_object, new_offset, tag, TRUE, TRUE, FALSE, FALSE, NULL); @@ -1930,7 +2128,8 @@ vm_page_init( } #endif *mem = vm_page_template; - mem->phys_page = phys_page; + + VM_PAGE_SET_PHYS_PAGE(mem, phys_page); #if 0 /* * we're leaving this turned off for now... currently pages @@ -2004,10 +2203,10 @@ void vm_page_release_fictitious( vm_page_t m) { - assert(!m->free); + assert((m->vm_page_q_state == VM_PAGE_NOT_ON_Q) || (m->vm_page_q_state == VM_PAGE_IS_WIRED)); assert(m->fictitious); - assert(m->phys_page == vm_page_fictitious_addr || - m->phys_page == vm_page_guard_addr); + assert(VM_PAGE_GET_PHYS_PAGE(m) == vm_page_fictitious_addr || + VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr); c_vm_page_release_fictitious++; @@ -2106,6 +2305,128 @@ vm_pool_low(void) } +#if CONFIG_BACKGROUND_QUEUE + +void +vm_page_update_background_state(vm_page_t mem) +{ + if (vm_page_background_mode == VM_PAGE_BG_DISABLED) + return; + + if (mem->vm_page_in_background == FALSE) + return; + +#if BACKGROUNDQ_BASED_ON_QOS + if (proc_get_effective_thread_policy(current_thread(), TASK_POLICY_QOS) <= THREAD_QOS_LEGACY) + return; +#else + task_t my_task; + + my_task = current_task(); + + if (my_task) { + if (proc_get_effective_task_policy(my_task, TASK_POLICY_DARWIN_BG)) + return; + } +#endif + vm_page_lockspin_queues(); + + mem->vm_page_in_background = FALSE; + vm_page_background_promoted_count++; + + 
vm_page_remove_from_backgroundq(mem); + + vm_page_unlock_queues(); +} + + +void +vm_page_assign_background_state(vm_page_t mem) +{ + if (vm_page_background_mode == VM_PAGE_BG_DISABLED) + return; + +#if BACKGROUNDQ_BASED_ON_QOS + if (proc_get_effective_thread_policy(current_thread(), TASK_POLICY_QOS) <= THREAD_QOS_LEGACY) + mem->vm_page_in_background = TRUE; + else + mem->vm_page_in_background = FALSE; +#else + task_t my_task; + + my_task = current_task(); + + if (my_task) + mem->vm_page_in_background = proc_get_effective_task_policy(my_task, TASK_POLICY_DARWIN_BG); +#endif +} + + +void +vm_page_remove_from_backgroundq( + vm_page_t mem) +{ + vm_object_t m_object; + + LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); + + if (mem->vm_page_on_backgroundq) { + vm_page_queue_remove(&vm_page_queue_background, mem, vm_page_t, vm_page_backgroundq); + + mem->vm_page_backgroundq.next = 0; + mem->vm_page_backgroundq.prev = 0; + mem->vm_page_on_backgroundq = FALSE; + + vm_page_background_count--; + + m_object = VM_PAGE_OBJECT(mem); + + if (m_object->internal) + vm_page_background_internal_count--; + else + vm_page_background_external_count--; + } else { + assert(VM_PAGE_UNPACK_PTR(mem->vm_page_backgroundq.next) == (uintptr_t)NULL && + VM_PAGE_UNPACK_PTR(mem->vm_page_backgroundq.prev) == (uintptr_t)NULL); + } +} + + +void +vm_page_add_to_backgroundq( + vm_page_t mem, + boolean_t first) +{ + vm_object_t m_object; + + LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); + + if (vm_page_background_mode == VM_PAGE_BG_DISABLED) + return; + + if (mem->vm_page_on_backgroundq == FALSE) { + + m_object = VM_PAGE_OBJECT(mem); + + if (vm_page_background_exclude_external && !m_object->internal) + return; + + if (first == TRUE) + vm_page_queue_enter_first(&vm_page_queue_background, mem, vm_page_t, vm_page_backgroundq); + else + vm_page_queue_enter(&vm_page_queue_background, mem, vm_page_t, vm_page_backgroundq); + mem->vm_page_on_backgroundq = TRUE; + + 
vm_page_background_count++; + + if (m_object->internal) + vm_page_background_internal_count++; + else + vm_page_background_external_count++; + } +} + +#endif /* * this is an interface to support bring-up of drivers @@ -2122,7 +2443,7 @@ int vm_himemory_mode = 2; unsigned int vm_lopages_allocated_q = 0; unsigned int vm_lopages_allocated_cpm_success = 0; unsigned int vm_lopages_allocated_cpm_failed = 0; -queue_head_t vm_lopage_queue_free; +vm_page_queue_head_t vm_lopage_queue_free __attribute__((aligned(VM_PACKED_POINTER_ALIGNMENT))); vm_page_t vm_page_grablo(void) @@ -2134,12 +2455,14 @@ vm_page_grablo(void) lck_mtx_lock_spin(&vm_page_queue_free_lock); - if ( !queue_empty(&vm_lopage_queue_free)) { - queue_remove_first(&vm_lopage_queue_free, + if ( !vm_page_queue_empty(&vm_lopage_queue_free)) { + vm_page_queue_remove_first(&vm_lopage_queue_free, mem, vm_page_t, pageq); assert(vm_lopage_free_count); + assert(mem->vm_page_q_state == VM_PAGE_ON_FREE_LOPAGE_Q); + mem->vm_page_q_state = VM_PAGE_NOT_ON_Q; vm_lopage_free_count--; vm_lopages_allocated_q++; @@ -2148,6 +2471,10 @@ vm_page_grablo(void) vm_lopage_refill = TRUE; lck_mtx_unlock(&vm_page_queue_free_lock); + +#if CONFIG_BACKGROUND_QUEUE + vm_page_assign_background_state(mem); +#endif } else { lck_mtx_unlock(&vm_page_queue_free_lock); @@ -2159,6 +2486,8 @@ vm_page_grablo(void) return (VM_PAGE_NULL); } + assert(mem->vm_page_q_state == VM_PAGE_NOT_ON_Q); + mem->busy = TRUE; vm_page_lockspin_queues(); @@ -2171,13 +2500,11 @@ vm_page_grablo(void) vm_page_unlock_queues(); } assert(mem->busy); - assert(!mem->free); assert(!mem->pmapped); assert(!mem->wpmapped); - assert(!pmap_is_noencrypt(mem->phys_page)); + assert(!pmap_is_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem))); - mem->pageq.next = NULL; - mem->pageq.prev = NULL; + VM_PAGE_ZERO_PAGEQ_ENTRY(mem); return (mem); } @@ -2204,39 +2531,49 @@ vm_page_grablo(void) * request from the per-cpu queue. 
*/ +#if CONFIG_SECLUDED_MEMORY +vm_page_t vm_page_grab_secluded(void); +#endif /* CONFIG_SECLUDED_MEMORY */ vm_page_t -vm_page_grab( void ) +vm_page_grab(void) { - vm_page_t mem; + return vm_page_grab_options(0); +} +vm_page_t +vm_page_grab_options( + int grab_options) +{ + vm_page_t mem; disable_preemption(); if ((mem = PROCESSOR_DATA(current_processor(), free_pages))) { return_page_from_cpu_list: + assert(mem->vm_page_q_state == VM_PAGE_ON_FREE_LOCAL_Q); + PROCESSOR_DATA(current_processor(), page_grab_count) += 1; - PROCESSOR_DATA(current_processor(), free_pages) = mem->pageq.next; + PROCESSOR_DATA(current_processor(), free_pages) = mem->snext; enable_preemption(); - mem->pageq.next = NULL; + VM_PAGE_ZERO_PAGEQ_ENTRY(mem); + mem->vm_page_q_state = VM_PAGE_NOT_ON_Q; - assert(mem->listq.next == NULL && mem->listq.prev == NULL); + assert(mem->listq.next == 0 && mem->listq.prev == 0); assert(mem->tabled == FALSE); - assert(mem->object == VM_OBJECT_NULL); + assert(mem->vm_page_object == 0); assert(!mem->laundry); - assert(!mem->free); - assert(pmap_verify_free(mem->phys_page)); + assert(pmap_verify_free(VM_PAGE_GET_PHYS_PAGE(mem))); assert(mem->busy); assert(!mem->encrypted); assert(!mem->pmapped); assert(!mem->wpmapped); - assert(!mem->active); - assert(!mem->inactive); - assert(!mem->throttled); - assert(!mem->speculative); - assert(!pmap_is_noencrypt(mem->phys_page)); + assert(!pmap_is_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem))); +#if CONFIG_BACKGROUND_QUEUE + vm_page_assign_background_state(mem); +#endif return mem; } enable_preemption(); @@ -2258,6 +2595,7 @@ vm_page_grab( void ) vm_page_gobble_count); } #endif + lck_mtx_lock_spin(&vm_page_queue_free_lock); /* @@ -2266,8 +2604,30 @@ vm_page_grab( void ) */ if ((vm_page_free_count < vm_page_free_reserved) && !(current_thread()->options & TH_OPT_VMPRIV)) { + /* no page for us in the free queue... */ lck_mtx_unlock(&vm_page_queue_free_lock); mem = VM_PAGE_NULL; + +#if CONFIG_SECLUDED_MEMORY + /* ... 
but can we try and grab from the secluded queue? */ + if (vm_page_secluded_count > 0 && + ((grab_options & VM_PAGE_GRAB_SECLUDED) || + task_can_use_secluded_mem(current_task()))) { + mem = vm_page_grab_secluded(); + if (grab_options & VM_PAGE_GRAB_SECLUDED) { + vm_page_secluded.grab_for_iokit++; + if (mem) { + vm_page_secluded.grab_for_iokit_success++; + } + } + if (mem) { + VM_CHECK_MEMORYSTATUS; + return mem; + } + } +#else /* CONFIG_SECLUDED_MEMORY */ + (void) grab_options; +#endif /* CONFIG_SECLUDED_MEMORY */ } else { vm_page_t head; @@ -2314,47 +2674,42 @@ vm_page_grab( void ) while (pages_to_steal--) { - while (queue_empty(&vm_page_queue_free[color])) + while (vm_page_queue_empty(&vm_page_queue_free[color].qhead)) color = (color + 1) & vm_color_mask; - queue_remove_first(&vm_page_queue_free[color], + vm_page_queue_remove_first(&vm_page_queue_free[color].qhead, mem, vm_page_t, pageq); - mem->pageq.next = NULL; - mem->pageq.prev = NULL; - - assert(!mem->active); - assert(!mem->inactive); - assert(!mem->throttled); - assert(!mem->speculative); + assert(mem->vm_page_q_state == VM_PAGE_ON_FREE_Q); + VM_PAGE_ZERO_PAGEQ_ENTRY(mem); + color = (color + 1) & vm_color_mask; if (head == NULL) head = mem; else - tail->pageq.next = (queue_t)mem; + tail->snext = mem; tail = mem; - assert(mem->listq.next == NULL && mem->listq.prev == NULL); + assert(mem->listq.next == 0 && mem->listq.prev == 0); assert(mem->tabled == FALSE); - assert(mem->object == VM_OBJECT_NULL); + assert(mem->vm_page_object == 0); assert(!mem->laundry); - assert(mem->free); - mem->free = FALSE; - assert(pmap_verify_free(mem->phys_page)); + mem->vm_page_q_state = VM_PAGE_ON_FREE_LOCAL_Q; + + assert(pmap_verify_free(VM_PAGE_GET_PHYS_PAGE(mem))); assert(mem->busy); - assert(!mem->free); assert(!mem->encrypted); assert(!mem->pmapped); assert(!mem->wpmapped); - assert(!pmap_is_noencrypt(mem->phys_page)); + assert(!pmap_is_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem))); } lck_mtx_unlock(&vm_page_queue_free_lock); - 
PROCESSOR_DATA(current_processor(), free_pages) = head->pageq.next; + PROCESSOR_DATA(current_processor(), free_pages) = head->snext; PROCESSOR_DATA(current_processor(), start_color) = color; /* @@ -2362,7 +2717,10 @@ vm_page_grab( void ) */ PROCESSOR_DATA(current_processor(), page_grab_count) += 1; mem = head; - mem->pageq.next = NULL; + assert(mem->vm_page_q_state == VM_PAGE_ON_FREE_LOCAL_Q); + + VM_PAGE_ZERO_PAGEQ_ENTRY(mem); + mem->vm_page_q_state = VM_PAGE_NOT_ON_Q; enable_preemption(); } @@ -2380,13 +2738,145 @@ vm_page_grab( void ) ((vm_page_free_count < vm_page_free_target) && ((vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_min))) thread_wakeup((event_t) &vm_page_free_wanted); +#if CONFIG_BACKGROUND_QUEUE + if (vm_page_background_mode == VM_PAGE_BG_LEVEL_3 && (vm_page_background_count > vm_page_background_limit)) + thread_wakeup((event_t) &vm_page_free_wanted); +#endif VM_CHECK_MEMORYSTATUS; - -// dbgLog(mem->phys_page, vm_page_free_count, vm_page_wire_count, 4); /* (TEST/DEBUG) */ + + if (mem) { +// dbgLog(VM_PAGE_GET_PHYS_PAGE(mem), vm_page_free_count, vm_page_wire_count, 4); /* (TEST/DEBUG) */ + +#if CONFIG_BACKGROUND_QUEUE + vm_page_assign_background_state(mem); +#endif + } + return mem; +} + +#if CONFIG_SECLUDED_MEMORY +vm_page_t +vm_page_grab_secluded(void) +{ + vm_page_t mem; + vm_object_t object; + int refmod_state; + + if (vm_page_secluded_count == 0) { + /* no secluded pages to grab... */ + return VM_PAGE_NULL; + } + + /* secluded queue is protected by the VM page queue lock */ + vm_page_lock_queues(); + + if (vm_page_secluded_count == 0) { + /* no secluded pages to grab... */ + vm_page_unlock_queues(); + return VM_PAGE_NULL; + } + +#if 00 + /* can we grab from the secluded queue? */ + if (vm_page_secluded_count > vm_page_secluded_target || + (vm_page_secluded_count > 0 && + task_can_use_secluded_mem(current_task()))) { + /* OK */ + } else { + /* can't grab from secluded queue... 
*/ + vm_page_unlock_queues(); + return VM_PAGE_NULL; + } +#endif + + /* we can grab a page from secluded queue! */ + assert((vm_page_secluded_count_free + + vm_page_secluded_count_inuse) == + vm_page_secluded_count); + if (current_task()->task_can_use_secluded_mem) { + assert(num_tasks_can_use_secluded_mem > 0); + } + assert(!vm_page_queue_empty(&vm_page_queue_secluded)); + LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); + vm_page_queue_remove_first(&vm_page_queue_secluded, + mem, + vm_page_t, + pageq); + assert(mem->vm_page_q_state == VM_PAGE_ON_SECLUDED_Q); + + VM_PAGE_ZERO_PAGEQ_ENTRY(mem); + mem->vm_page_q_state = VM_PAGE_NOT_ON_Q; + vm_page_secluded_count--; + + object = VM_PAGE_OBJECT(mem); + + assert(!mem->fictitious); + assert(!VM_PAGE_WIRED(mem)); + if (object == VM_OBJECT_NULL) { + /* free for grab! */ + assert(mem->busy); + vm_page_secluded_count_free--; + vm_page_unlock_queues(); + vm_page_secluded.grab_success_free++; + return mem; + } + + vm_page_secluded_count_inuse--; + assert(!object->internal); +// vm_page_pageable_external_count--; + + if (!vm_object_lock_try(object)) { +// printf("SECLUDED: page %p: object %p locked\n", mem, object); + vm_page_secluded.grab_failure_locked++; + reactivate_secluded_page: + vm_page_activate(mem); + vm_page_unlock_queues(); + return VM_PAGE_NULL; + } + if (mem->busy || + mem->cleaning || + mem->laundry) { + /* can't steal page in this state... 
*/ + vm_object_unlock(object); + vm_page_secluded.grab_failure_state++; + goto reactivate_secluded_page; + } + + mem->busy = TRUE; + refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(mem)); + if (refmod_state & VM_MEM_REFERENCED) { + mem->reference = TRUE; + } + if (refmod_state & VM_MEM_MODIFIED) { + SET_PAGE_DIRTY(mem, FALSE); + } + if (mem->dirty || mem->precious) { + /* can't grab a dirty page; re-activate */ +// printf("SECLUDED: dirty page %p\n", mem); + vm_page_secluded.grab_failure_dirty++; + vm_object_unlock(object); + goto reactivate_secluded_page; + } + if (mem->reference) { + /* it's been used but we do need to grab a page... */ + } + vm_page_unlock_queues(); + + /* finish what vm_page_free() would have done... */ + vm_page_free_prepare_object(mem, TRUE); + vm_object_unlock(object); + object = VM_OBJECT_NULL; + if (vm_page_free_verify) { + assert(pmap_verify_free(VM_PAGE_GET_PHYS_PAGE(mem))); + } + pmap_clear_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem)); + assert(mem->busy); + vm_page_secluded.grab_success_other++; return mem; } +#endif /* CONFIG_SECLUDED_MEMORY */ /* * vm_page_release: @@ -2396,62 +2886,105 @@ vm_page_grab( void ) void vm_page_release( - register vm_page_t mem) + vm_page_t mem, + boolean_t page_queues_locked) { unsigned int color; int need_wakeup = 0; int need_priv_wakeup = 0; +#if CONFIG_SECLUDED_MEMORY + int need_secluded_wakeup = 0; +#endif /* CONFIG_SECLUDED_MEMORY */ + if (page_queues_locked) { + LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); + } else { + LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED); + } assert(!mem->private && !mem->fictitious); if (vm_page_free_verify) { - assert(pmap_verify_free(mem->phys_page)); + assert(pmap_verify_free(VM_PAGE_GET_PHYS_PAGE(mem))); } -// dbgLog(mem->phys_page, vm_page_free_count, vm_page_wire_count, 5); /* (TEST/DEBUG) */ +// dbgLog(VM_PAGE_GET_PHYS_PAGE(mem), vm_page_free_count, vm_page_wire_count, 5); /* (TEST/DEBUG) */ - pmap_clear_noencrypt(mem->phys_page); + 
pmap_clear_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem)); lck_mtx_lock_spin(&vm_page_queue_free_lock); -#if DEBUG - if (mem->free) - panic("vm_page_release"); -#endif + assert(mem->vm_page_q_state == VM_PAGE_NOT_ON_Q); assert(mem->busy); assert(!mem->laundry); - assert(mem->object == VM_OBJECT_NULL); - assert(mem->pageq.next == NULL && - mem->pageq.prev == NULL); - assert(mem->listq.next == NULL && - mem->listq.prev == NULL); - + assert(mem->vm_page_object == 0); + assert(mem->pageq.next == 0 && mem->pageq.prev == 0); + assert(mem->listq.next == 0 && mem->listq.prev == 0); +#if CONFIG_BACKGROUND_QUEUE + assert(mem->vm_page_backgroundq.next == 0 && + mem->vm_page_backgroundq.prev == 0 && + mem->vm_page_on_backgroundq == FALSE); +#endif if ((mem->lopage == TRUE || vm_lopage_refill == TRUE) && vm_lopage_free_count < vm_lopage_free_limit && - mem->phys_page < max_valid_low_ppnum) { + VM_PAGE_GET_PHYS_PAGE(mem) < max_valid_low_ppnum) { /* * this exists to support hardware controllers * incapable of generating DMAs with more than 32 bits * of address on platforms with physical memory > 4G... */ - queue_enter_first(&vm_lopage_queue_free, - mem, - vm_page_t, - pageq); + vm_page_queue_enter_first(&vm_lopage_queue_free, + mem, + vm_page_t, + pageq); vm_lopage_free_count++; if (vm_lopage_free_count >= vm_lopage_free_limit) vm_lopage_refill = FALSE; + mem->vm_page_q_state = VM_PAGE_ON_FREE_LOPAGE_Q; mem->lopage = TRUE; - } else { +#if CONFIG_SECLUDED_MEMORY + } else if (vm_page_free_count > vm_page_free_reserved && + vm_page_secluded_count < vm_page_secluded_target && + num_tasks_can_use_secluded_mem == 0) { + /* + * XXX FBDP TODO: also avoid refilling secluded queue + * when some IOKit objects are already grabbing from it... 
+ */ + if (!page_queues_locked) { + if (!vm_page_trylock_queues()) { + /* take locks in right order */ + lck_mtx_unlock(&vm_page_queue_free_lock); + vm_page_lock_queues(); + lck_mtx_lock_spin(&vm_page_queue_free_lock); + } + } mem->lopage = FALSE; - mem->free = TRUE; + LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); + vm_page_queue_enter_first(&vm_page_queue_secluded, + mem, + vm_page_t, + pageq); + mem->vm_page_q_state = VM_PAGE_ON_SECLUDED_Q; + vm_page_secluded_count++; + vm_page_secluded_count_free++; + if (!page_queues_locked) { + vm_page_unlock_queues(); + } + LCK_MTX_ASSERT(&vm_page_queue_free_lock, LCK_MTX_ASSERT_OWNED); + if (vm_page_free_wanted_secluded > 0) { + vm_page_free_wanted_secluded--; + need_secluded_wakeup = 1; + } +#endif /* CONFIG_SECLUDED_MEMORY */ + } else { + mem->lopage = FALSE; + mem->vm_page_q_state = VM_PAGE_ON_FREE_Q; - color = mem->phys_page & vm_color_mask; - queue_enter_first(&vm_page_queue_free[color], - mem, - vm_page_t, - pageq); + color = VM_PAGE_GET_PHYS_PAGE(mem) & vm_color_mask; + vm_page_queue_enter_first(&vm_page_queue_free[color].qhead, + mem, + vm_page_t, + pageq); vm_page_free_count++; /* * Check if we should wake up someone waiting for page. 
@@ -2477,6 +3010,12 @@ vm_page_release( if (vm_page_free_wanted_privileged > 0) { vm_page_free_wanted_privileged--; need_priv_wakeup = 1; +#if CONFIG_SECLUDED_MEMORY + } else if (vm_page_free_wanted_secluded > 0 && + vm_page_free_count > vm_page_free_reserved) { + vm_page_free_wanted_secluded--; + need_secluded_wakeup = 1; +#endif /* CONFIG_SECLUDED_MEMORY */ } else if (vm_page_free_wanted > 0 && vm_page_free_count > vm_page_free_reserved) { vm_page_free_wanted--; @@ -2487,6 +3026,10 @@ vm_page_release( if (need_priv_wakeup) thread_wakeup_one((event_t) &vm_page_free_wanted_privileged); +#if CONFIG_SECLUDED_MEMORY + else if (need_secluded_wakeup) + thread_wakeup_one((event_t) &vm_page_free_wanted_secluded); +#endif /* CONFIG_SECLUDED_MEMORY */ else if (need_wakeup) thread_wakeup_one((event_t) &vm_page_free_count); @@ -2501,22 +3044,31 @@ vm_page_release( */ void vm_page_release_startup( - register vm_page_t mem) + vm_page_t mem) { - queue_t queue_free; + vm_page_queue_t queue_free; if (vm_lopage_free_count < vm_lopage_free_limit && - mem->phys_page < max_valid_low_ppnum) { + VM_PAGE_GET_PHYS_PAGE(mem) < max_valid_low_ppnum) { mem->lopage = TRUE; + mem->vm_page_q_state = VM_PAGE_ON_FREE_LOPAGE_Q; vm_lopage_free_count++; queue_free = &vm_lopage_queue_free; - } else { +#if CONFIG_SECLUDED_MEMORY + } else if (vm_page_secluded_count < vm_page_secluded_target) { + mem->lopage = FALSE; + mem->vm_page_q_state = VM_PAGE_ON_SECLUDED_Q; + vm_page_secluded_count++; + vm_page_secluded_count_free++; + queue_free = &vm_page_queue_secluded; +#endif /* CONFIG_SECLUDED_MEMORY */ + } else { mem->lopage = FALSE; - mem->free = TRUE; + mem->vm_page_q_state = VM_PAGE_ON_FREE_Q; vm_page_free_count++; - queue_free = &vm_page_queue_free[mem->phys_page & vm_color_mask]; + queue_free = &vm_page_queue_free[VM_PAGE_GET_PHYS_PAGE(mem) & vm_color_mask].qhead; } - queue_enter_first(queue_free, mem, vm_page_t, pageq); + vm_page_queue_enter_first(queue_free, mem, vm_page_t, pageq); } /* @@ -2551,35 
+3103,63 @@ vm_page_wait( lck_mtx_unlock(&vm_page_queue_free_lock); return TRUE; } - if (vm_page_free_count < vm_page_free_target) { - if (is_privileged) { - if (vm_page_free_wanted_privileged++ == 0) - need_wakeup = 1; - wait_result = assert_wait((event_t)&vm_page_free_wanted_privileged, interruptible); - } else { - if (vm_page_free_wanted++ == 0) - need_wakeup = 1; - wait_result = assert_wait((event_t)&vm_page_free_count, interruptible); - } + if (vm_page_free_count >= vm_page_free_target) { lck_mtx_unlock(&vm_page_queue_free_lock); - counter(c_vm_page_wait_block++); - - if (need_wakeup) - thread_wakeup((event_t)&vm_page_free_wanted); + return TRUE; + } - if (wait_result == THREAD_WAITING) { - VM_DEBUG_EVENT(vm_page_wait_block, VM_PAGE_WAIT_BLOCK, DBG_FUNC_START, - vm_page_free_wanted_privileged, vm_page_free_wanted, 0, 0); - wait_result = thread_block(THREAD_CONTINUE_NULL); - VM_DEBUG_EVENT(vm_page_wait_block, VM_PAGE_WAIT_BLOCK, DBG_FUNC_END, 0, 0, 0, 0); + if (is_privileged) { + if (vm_page_free_wanted_privileged++ == 0) + need_wakeup = 1; + wait_result = assert_wait((event_t)&vm_page_free_wanted_privileged, interruptible); +#if CONFIG_SECLUDED_MEMORY + } else if (secluded_for_apps && + task_can_use_secluded_mem(current_task())) { +#if 00 + /* XXX FBDP: need pageq lock for this... */ + /* XXX FBDP: might wait even if pages available, */ + /* XXX FBDP: hopefully not for too long... 
*/ + if (vm_page_secluded_count > 0) { + lck_mtx_unlock(&vm_page_queue_free_lock); + return TRUE; } - - return(wait_result == THREAD_AWAKENED); +#endif + if (vm_page_free_wanted_secluded++ == 0) { + need_wakeup = 1; + } + wait_result = assert_wait( + (event_t)&vm_page_free_wanted_secluded, + interruptible); +#endif /* CONFIG_SECLUDED_MEMORY */ } else { - lck_mtx_unlock(&vm_page_queue_free_lock); - return TRUE; + if (vm_page_free_wanted++ == 0) + need_wakeup = 1; + wait_result = assert_wait((event_t)&vm_page_free_count, + interruptible); + } + lck_mtx_unlock(&vm_page_queue_free_lock); + counter(c_vm_page_wait_block++); + + if (need_wakeup) + thread_wakeup((event_t)&vm_page_free_wanted); + + if (wait_result == THREAD_WAITING) { + VM_DEBUG_EVENT(vm_page_wait_block, VM_PAGE_WAIT_BLOCK, DBG_FUNC_START, + vm_page_free_wanted_privileged, + vm_page_free_wanted, +#if CONFIG_SECLUDED_MEMORY + vm_page_free_wanted_secluded, +#else /* CONFIG_SECLUDED_MEMORY */ + 0, +#endif /* CONFIG_SECLUDED_MEMORY */ + 0); + wait_result = thread_block(THREAD_CONTINUE_NULL); + VM_DEBUG_EVENT(vm_page_wait_block, + VM_PAGE_WAIT_BLOCK, DBG_FUNC_END, 0, 0, 0, 0); } + + return (wait_result == THREAD_AWAKENED); } /* @@ -2596,10 +3176,17 @@ vm_page_alloc( vm_object_t object, vm_object_offset_t offset) { - register vm_page_t mem; + vm_page_t mem; + int grab_options; vm_object_lock_assert_exclusive(object); - mem = vm_page_grab(); + grab_options = 0; +#if CONFIG_SECLUDED_MEMORY + if (object->can_grab_secluded) { + grab_options |= VM_PAGE_GRAB_SECLUDED; + } +#endif /* CONFIG_SECLUDED_MEMORY */ + mem = vm_page_grab_options(grab_options); if (mem == VM_PAGE_NULL) return VM_PAGE_NULL; @@ -2621,7 +3208,7 @@ vm_page_alloc_guard( vm_object_t object, vm_object_offset_t offset) { - register vm_page_t mem; + vm_page_t mem; vm_object_lock_assert_exclusive(object); mem = vm_page_grab_guard(); @@ -2657,17 +3244,17 @@ void vm_page_free_prepare_queues( vm_page_t mem) { + vm_object_t m_object; + VM_PAGE_CHECK(mem); - 
assert(!mem->free); + + assert(mem->vm_page_q_state != VM_PAGE_ON_FREE_Q); assert(!mem->cleaning); + m_object = VM_PAGE_OBJECT(mem); -#if MACH_ASSERT || DEBUG - lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); - if (mem->free) - panic("vm_page_free: freeing page on free list\n"); -#endif /* MACH_ASSERT || DEBUG */ - if (mem->object) { - vm_object_lock_assert_exclusive(mem->object); + LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); + if (m_object) { + vm_object_lock_assert_exclusive(m_object); } if (mem->laundry) { /* @@ -2681,30 +3268,32 @@ vm_page_free_prepare_queues( counter(++c_laundry_pages_freed); } - vm_page_queues_remove(mem); /* clears local/active/inactive/throttled/speculative */ + vm_page_queues_remove(mem, TRUE); if (VM_PAGE_WIRED(mem)) { - if (mem->object) { - assert(mem->object->wired_page_count > 0); - mem->object->wired_page_count--; - if (!mem->object->wired_page_count) { - VM_OBJECT_UNWIRED(mem->object); + assert(mem->wire_count > 0); + + if (m_object) { + assert(m_object->wired_page_count > 0); + m_object->wired_page_count--; + if (!m_object->wired_page_count) { + VM_OBJECT_UNWIRED(m_object); } - assert(mem->object->resident_page_count >= - mem->object->wired_page_count); + assert(m_object->resident_page_count >= + m_object->wired_page_count); - if (mem->object->purgable == VM_PURGABLE_VOLATILE) { + if (m_object->purgable == VM_PURGABLE_VOLATILE) { OSAddAtomic(+1, &vm_page_purgeable_count); assert(vm_page_purgeable_wired_count > 0); OSAddAtomic(-1, &vm_page_purgeable_wired_count); } - if ((mem->object->purgable == VM_PURGABLE_VOLATILE || - mem->object->purgable == VM_PURGABLE_EMPTY) && - mem->object->vo_purgeable_owner != TASK_NULL) { + if ((m_object->purgable == VM_PURGABLE_VOLATILE || + m_object->purgable == VM_PURGABLE_EMPTY) && + m_object->vo_purgeable_owner != TASK_NULL) { task_t owner; - owner = mem->object->vo_purgeable_owner; + owner = m_object->vo_purgeable_owner; /* * While wired, this page was accounted * as 
"non-volatile" but it should now @@ -2726,6 +3315,8 @@ vm_page_free_prepare_queues( } if (!mem->private && !mem->fictitious) vm_page_wire_count--; + + mem->vm_page_q_state = VM_PAGE_NOT_ON_Q; mem->wire_count = 0; assert(!mem->gobbled); } else if (mem->gobbled) { @@ -2749,10 +3340,10 @@ vm_page_free_prepare_object( if (mem->private) { mem->private = FALSE; mem->fictitious = TRUE; - mem->phys_page = vm_page_fictitious_addr; + VM_PAGE_SET_PHYS_PAGE(mem, vm_page_fictitious_addr); } if ( !mem->fictitious) { - vm_page_init(mem, mem->phys_page, mem->lopage); + vm_page_init(mem, VM_PAGE_GET_PHYS_PAGE(mem), mem->lopage); } } @@ -2774,7 +3365,8 @@ vm_page_free( if (mem->fictitious) { vm_page_release_fictitious(mem); } else { - vm_page_release(mem); + vm_page_release(mem, + TRUE); /* page queues are locked */ } } @@ -2793,7 +3385,7 @@ vm_page_free_unlocked( if (mem->fictitious) { vm_page_release_fictitious(mem); } else { - vm_page_release(mem); + vm_page_release(mem, FALSE); /* page queues are not locked */ } } @@ -2828,18 +3420,18 @@ vm_page_free_list( */ while (mem && pg_count < 64) { - assert(!mem->inactive); - assert(!mem->active); - assert(!mem->throttled); - assert(!mem->free); - assert(!mem->speculative); - assert(!VM_PAGE_WIRED(mem)); - assert(mem->pageq.prev == NULL); + assert(mem->vm_page_q_state == VM_PAGE_NOT_ON_Q); +#if CONFIG_BACKGROUND_QUEUE + assert(mem->vm_page_backgroundq.next == 0 && + mem->vm_page_backgroundq.prev == 0 && + mem->vm_page_on_backgroundq == FALSE); +#endif + nxt = mem->snext; + mem->snext = NULL; + assert(mem->pageq.prev == 0); - nxt = (vm_page_t)(mem->pageq.next); - if (vm_page_free_verify && !mem->fictitious && !mem->private) { - assert(pmap_verify_free(mem->phys_page)); + assert(pmap_verify_free(VM_PAGE_GET_PHYS_PAGE(mem))); } if (prepare_object == TRUE) vm_page_free_prepare_object(mem, TRUE); @@ -2849,9 +3441,14 @@ vm_page_free_list( if ((mem->lopage == TRUE || vm_lopage_refill == TRUE) && vm_lopage_free_count < vm_lopage_free_limit && - 
mem->phys_page < max_valid_low_ppnum) { - mem->pageq.next = NULL; - vm_page_release(mem); + VM_PAGE_GET_PHYS_PAGE(mem) < max_valid_low_ppnum) { + vm_page_release(mem, FALSE); /* page queues are not locked */ +#if CONFIG_SECLUDED_MEMORY + } else if (vm_page_secluded_count < vm_page_secluded_target && + num_tasks_can_use_secluded_mem == 0) { + vm_page_release(mem, + FALSE); /* page queues are not locked */ +#endif /* CONFIG_SECLUDED_MEMORY */ } else { /* * IMPORTANT: we can't set the page "free" here @@ -2862,15 +3459,15 @@ vm_page_free_list( * cause trouble because the page is not actually * in the free queue yet... */ - mem->pageq.next = (queue_entry_t)local_freeq; + mem->snext = local_freeq; local_freeq = mem; pg_count++; - pmap_clear_noencrypt(mem->phys_page); + pmap_clear_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem)); } } else { - assert(mem->phys_page == vm_page_fictitious_addr || - mem->phys_page == vm_page_guard_addr); + assert(VM_PAGE_GET_PHYS_PAGE(mem) == vm_page_fictitious_addr || + VM_PAGE_GET_PHYS_PAGE(mem) == vm_page_guard_addr); vm_page_release_fictitious(mem); } mem = nxt; @@ -2881,23 +3478,27 @@ vm_page_free_list( unsigned int avail_free_count; unsigned int need_wakeup = 0; unsigned int need_priv_wakeup = 0; +#if CONFIG_SECLUDED_MEMORY + unsigned int need_wakeup_secluded = 0; +#endif /* CONFIG_SECLUDED_MEMORY */ lck_mtx_lock_spin(&vm_page_queue_free_lock); while (mem) { int color; - nxt = (vm_page_t)(mem->pageq.next); + nxt = mem->snext; - assert(!mem->free); + assert(mem->vm_page_q_state == VM_PAGE_NOT_ON_Q); assert(mem->busy); - mem->free = TRUE; - - color = mem->phys_page & vm_color_mask; - queue_enter_first(&vm_page_queue_free[color], - mem, - vm_page_t, - pageq); + mem->lopage = FALSE; + mem->vm_page_q_state = VM_PAGE_ON_FREE_Q; + + color = VM_PAGE_GET_PHYS_PAGE(mem) & vm_color_mask; + vm_page_queue_enter_first(&vm_page_queue_free[color].qhead, + mem, + vm_page_t, + pageq); mem = nxt; } vm_page_free_count += pg_count; @@ -2911,10 +3512,31 @@ 
vm_page_free_list( avail_free_count = 0; } else { need_priv_wakeup = vm_page_free_wanted_privileged; - vm_page_free_wanted_privileged = 0; avail_free_count -= vm_page_free_wanted_privileged; + vm_page_free_wanted_privileged = 0; } } +#if CONFIG_SECLUDED_MEMORY + if (vm_page_free_wanted_secluded > 0 && + avail_free_count > vm_page_free_reserved) { + unsigned int available_pages; + available_pages = (avail_free_count - + vm_page_free_reserved); + if (available_pages < + vm_page_free_wanted_secluded) { + need_wakeup_secluded = available_pages; + vm_page_free_wanted_secluded -= + available_pages; + avail_free_count -= available_pages; + } else { + need_wakeup_secluded = + vm_page_free_wanted_secluded; + avail_free_count -= + vm_page_free_wanted_secluded; + vm_page_free_wanted_secluded = 0; + } + } +#endif /* CONFIG_SECLUDED_MEMORY */ if (vm_page_free_wanted > 0 && avail_free_count > vm_page_free_reserved) { unsigned int available_pages; @@ -2938,6 +3560,21 @@ vm_page_free_list( */ thread_wakeup((event_t)&vm_page_free_wanted_privileged); } +#if CONFIG_SECLUDED_MEMORY + if (need_wakeup_secluded != 0 && + vm_page_free_wanted_secluded == 0) { + thread_wakeup((event_t) + &vm_page_free_wanted_secluded); + } else { + for (; + need_wakeup_secluded != 0; + need_wakeup_secluded--) { + thread_wakeup_one( + (event_t) + &vm_page_free_wanted_secluded); + } + } +#endif /* CONFIG_SECLUDED_MEMORY */ if (need_wakeup != 0 && vm_page_free_wanted == 0) { /* * We don't expect to have any more waiters @@ -2971,16 +3608,19 @@ vm_page_free_list( void vm_page_wire( - register vm_page_t mem, + vm_page_t mem, vm_tag_t tag, boolean_t check_memorystatus) { + vm_object_t m_object; + + m_object = VM_PAGE_OBJECT(mem); -// dbgLog(current_thread(), mem->offset, mem->object, 1); /* (TEST/DEBUG) */ +// dbgLog(current_thread(), mem->offset, m_object, 1); /* (TEST/DEBUG) */ VM_PAGE_CHECK(mem); - if (mem->object) { - vm_object_lock_assert_exclusive(mem->object); + if (m_object) { + 
vm_object_lock_assert_exclusive(m_object); } else { /* * In theory, the page should be in an object before it @@ -2992,43 +3632,43 @@ vm_page_wire( * that page and update it at the same time. */ } -#if DEBUG - lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); -#endif + LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); if ( !VM_PAGE_WIRED(mem)) { - if (mem->pageout_queue) { - mem->pageout = FALSE; - vm_pageout_throttle_up(mem); - } - vm_page_queues_remove(mem); + if (mem->laundry) + vm_pageout_steal_laundry(mem, TRUE); + + vm_page_queues_remove(mem, TRUE); + + assert(mem->wire_count == 0); + mem->vm_page_q_state = VM_PAGE_IS_WIRED; - if (mem->object) { + if (m_object) { if (!mem->private && !mem->fictitious) { - if (!mem->object->wired_page_count) + if (!m_object->wired_page_count) { assert(VM_KERN_MEMORY_NONE != tag); - mem->object->wire_tag = tag; - VM_OBJECT_WIRED(mem->object); + m_object->wire_tag = tag; + VM_OBJECT_WIRED(m_object); } } - mem->object->wired_page_count++; + m_object->wired_page_count++; - assert(mem->object->resident_page_count >= - mem->object->wired_page_count); - if (mem->object->purgable == VM_PURGABLE_VOLATILE) { + assert(m_object->resident_page_count >= + m_object->wired_page_count); + if (m_object->purgable == VM_PURGABLE_VOLATILE) { assert(vm_page_purgeable_count > 0); OSAddAtomic(-1, &vm_page_purgeable_count); OSAddAtomic(1, &vm_page_purgeable_wired_count); } - if ((mem->object->purgable == VM_PURGABLE_VOLATILE || - mem->object->purgable == VM_PURGABLE_EMPTY) && - mem->object->vo_purgeable_owner != TASK_NULL) { + if ((m_object->purgable == VM_PURGABLE_VOLATILE || + m_object->purgable == VM_PURGABLE_EMPTY) && + m_object->vo_purgeable_owner != TASK_NULL) { task_t owner; - owner = mem->object->vo_purgeable_owner; + owner = m_object->vo_purgeable_owner; /* less volatile bytes */ ledger_debit(owner->ledger, task_ledgers.purgeable_volatile, @@ -3042,7 +3682,7 @@ vm_page_wire( task_ledgers.phys_footprint, PAGE_SIZE); } - if 
(mem->object->all_reusable) { + if (m_object->all_reusable) { /* * Wired pages are not counted as "re-usable" * in "all_reusable" VM objects, so nothing @@ -3054,7 +3694,7 @@ vm_page_wire( * wired, so adjust its state and the * accounting. */ - vm_object_reuse_pages(mem->object, + vm_object_reuse_pages(m_object, mem->offset, mem->offset+PAGE_SIZE_64, FALSE); @@ -3082,7 +3722,11 @@ vm_page_wire( */ } assert(!mem->gobbled); + assert(mem->vm_page_q_state == VM_PAGE_IS_WIRED); mem->wire_count++; + if (__improbable(mem->wire_count == 0)) { + panic("vm_page_wire(%p): wire_count overflow", mem); + } VM_PAGE_CHECK(mem); } @@ -3099,39 +3743,43 @@ vm_page_unwire( vm_page_t mem, boolean_t queueit) { + vm_object_t m_object; + + m_object = VM_PAGE_OBJECT(mem); -// dbgLog(current_thread(), mem->offset, mem->object, 0); /* (TEST/DEBUG) */ +// dbgLog(current_thread(), mem->offset, m_object, 0); /* (TEST/DEBUG) */ VM_PAGE_CHECK(mem); assert(VM_PAGE_WIRED(mem)); + assert(mem->wire_count > 0); assert(!mem->gobbled); - assert(mem->object != VM_OBJECT_NULL); -#if DEBUG - vm_object_lock_assert_exclusive(mem->object); - lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); -#endif + assert(m_object != VM_OBJECT_NULL); + vm_object_lock_assert_exclusive(m_object); + LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); if (--mem->wire_count == 0) { + mem->vm_page_q_state = VM_PAGE_NOT_ON_Q; + if (!mem->private && !mem->fictitious) { vm_page_wire_count--; } - assert(mem->object->wired_page_count > 0); - mem->object->wired_page_count--; - if (!mem->object->wired_page_count) { - VM_OBJECT_UNWIRED(mem->object); + assert(m_object->wired_page_count > 0); + m_object->wired_page_count--; + if (!m_object->wired_page_count) { + VM_OBJECT_UNWIRED(m_object); } - assert(mem->object->resident_page_count >= - mem->object->wired_page_count); - if (mem->object->purgable == VM_PURGABLE_VOLATILE) { + assert(m_object->resident_page_count >= + m_object->wired_page_count); + if (m_object->purgable == 
VM_PURGABLE_VOLATILE) { OSAddAtomic(+1, &vm_page_purgeable_count); assert(vm_page_purgeable_wired_count > 0); OSAddAtomic(-1, &vm_page_purgeable_wired_count); } - if ((mem->object->purgable == VM_PURGABLE_VOLATILE || - mem->object->purgable == VM_PURGABLE_EMPTY) && - mem->object->vo_purgeable_owner != TASK_NULL) { + if ((m_object->purgable == VM_PURGABLE_VOLATILE || + m_object->purgable == VM_PURGABLE_EMPTY) && + m_object->vo_purgeable_owner != TASK_NULL) { task_t owner; - owner = mem->object->vo_purgeable_owner; + owner = m_object->vo_purgeable_owner; /* more volatile bytes */ ledger_credit(owner->ledger, task_ledgers.purgeable_volatile, @@ -3145,11 +3793,11 @@ vm_page_unwire( task_ledgers.phys_footprint, PAGE_SIZE); } - assert(mem->object != kernel_object); - assert(mem->pageq.next == NULL && mem->pageq.prev == NULL); + assert(m_object != kernel_object); + assert(mem->pageq.next == 0 && mem->pageq.prev == 0); if (queueit == TRUE) { - if (mem->object->purgable == VM_PURGABLE_EMPTY) { + if (m_object->purgable == VM_PURGABLE_EMPTY) { vm_page_deactivate(mem); } else { vm_page_activate(mem); @@ -3184,15 +3832,16 @@ vm_page_deactivate_internal( vm_page_t m, boolean_t clear_hw_reference) { + vm_object_t m_object; + + m_object = VM_PAGE_OBJECT(m); VM_PAGE_CHECK(m); - assert(m->object != kernel_object); - assert(m->phys_page != vm_page_guard_addr); + assert(m_object != kernel_object); + assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr); -// dbgLog(m->phys_page, vm_page_free_count, vm_page_wire_count, 6); /* (TEST/DEBUG) */ -#if DEBUG - lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); -#endif +// dbgLog(VM_PAGE_GET_PHYS_PAGE(m), vm_page_free_count, vm_page_wire_count, 6); /* (TEST/DEBUG) */ + LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); /* * This page is no longer very interesting. 
If it was * interesting (active or inactive/referenced), then we @@ -3217,30 +3866,33 @@ vm_page_deactivate_internal( * (which is not required here) to decrement the activity_in_progress * reference which is held on the object while the page is in the pageout queue... * just let the normal laundry processing proceed - */ - if (m->laundry || m->pageout_queue || m->private || m->fictitious || m->compressor || (VM_PAGE_WIRED(m))) - return; - + */ + if (m->laundry || m->private || m->fictitious || + (m->vm_page_q_state == VM_PAGE_USED_BY_COMPRESSOR) || + (m->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q) || + VM_PAGE_WIRED(m)) { + return; + } if (!m->absent && clear_hw_reference == TRUE) - pmap_clear_reference(m->phys_page); + pmap_clear_reference(VM_PAGE_GET_PHYS_PAGE(m)); m->reference = FALSE; m->no_cache = FALSE; - if (!m->inactive) { - vm_page_queues_remove(m); + if ( !VM_PAGE_INACTIVE(m)) { + vm_page_queues_remove(m, FALSE); - if (!VM_DYNAMIC_PAGING_ENABLED(memory_manager_default) && - m->dirty && m->object->internal && - (m->object->purgable == VM_PURGABLE_DENY || - m->object->purgable == VM_PURGABLE_NONVOLATILE || - m->object->purgable == VM_PURGABLE_VOLATILE)) { + if (!VM_DYNAMIC_PAGING_ENABLED() && + m->dirty && m_object->internal && + (m_object->purgable == VM_PURGABLE_DENY || + m_object->purgable == VM_PURGABLE_NONVOLATILE || + m_object->purgable == VM_PURGABLE_VOLATILE)) { vm_page_check_pageable_safe(m); - queue_enter(&vm_page_queue_throttled, m, vm_page_t, pageq); - m->throttled = TRUE; + vm_page_queue_enter(&vm_page_queue_throttled, m, vm_page_t, pageq); + m->vm_page_q_state = VM_PAGE_ON_THROTTLED_Q; vm_page_throttled_count++; } else { - if (m->object->named && m->object->ref_count == 1) { + if (m_object->named && m_object->ref_count == 1) { vm_page_speculate(m, FALSE); #if DEVELOPMENT || DEBUG vm_page_speculative_recreated++; @@ -3264,14 +3916,16 @@ vm_page_deactivate_internal( void vm_page_enqueue_cleaned(vm_page_t m) { - assert(m->phys_page != 
vm_page_guard_addr); -#if DEBUG - lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); -#endif + vm_object_t m_object; + + m_object = VM_PAGE_OBJECT(m); + + assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr); + LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); assert( !(m->absent && !m->unusual)); + assert( !VM_PAGE_WIRED(m)); if (m->gobbled) { - assert( !VM_PAGE_WIRED(m)); if (!m->private && !m->fictitious) vm_page_wire_count--; vm_page_gobble_count--; @@ -3285,24 +3939,28 @@ void vm_page_enqueue_cleaned(vm_page_t m) * reference which is held on the object while the page is in the pageout queue... * just let the normal laundry processing proceed */ - if (m->laundry || m->clean_queue || m->pageout_queue || m->private || m->fictitious) - return; - - vm_page_queues_remove(m); + if (m->laundry || m->private || m->fictitious || + (m->vm_page_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) || + (m->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q)) { + return; + } + vm_page_queues_remove(m, FALSE); vm_page_check_pageable_safe(m); - queue_enter(&vm_page_queue_cleaned, m, vm_page_t, pageq); - m->clean_queue = TRUE; + vm_page_queue_enter(&vm_page_queue_cleaned, m, vm_page_t, pageq); + m->vm_page_q_state = VM_PAGE_ON_INACTIVE_CLEANED_Q; vm_page_cleaned_count++; - m->inactive = TRUE; vm_page_inactive_count++; - if (m->object->internal) { + if (m_object->internal) { vm_page_pageable_internal_count++; } else { vm_page_pageable_external_count++; } - +#if CONFIG_BACKGROUND_QUEUE + if (m->vm_page_in_background) + vm_page_add_to_backgroundq(m, TRUE); +#endif vm_pageout_enqueued_cleaned++; } @@ -3316,16 +3974,18 @@ void vm_page_enqueue_cleaned(vm_page_t m) void vm_page_activate( - register vm_page_t m) + vm_page_t m) { + vm_object_t m_object; + + m_object = VM_PAGE_OBJECT(m); + VM_PAGE_CHECK(m); #ifdef FIXME_4778297 - assert(m->object != kernel_object); -#endif - assert(m->phys_page != vm_page_guard_addr); -#if DEBUG - lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); 
+ assert(m_object != kernel_object); #endif + assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr); + LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); assert( !(m->absent && !m->unusual)); if (m->gobbled) { @@ -3343,40 +4003,54 @@ vm_page_activate( * reference which is held on the object while the page is in the pageout queue... * just let the normal laundry processing proceed */ - if (m->laundry || m->pageout_queue || m->private || m->fictitious || m->compressor) + if (m->laundry || m->private || m->fictitious || + (m->vm_page_q_state == VM_PAGE_USED_BY_COMPRESSOR) || + (m->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q)) return; #if DEBUG - if (m->active) + if (m->vm_page_q_state == VM_PAGE_ON_ACTIVE_Q) panic("vm_page_activate: already active"); #endif - if (m->speculative) { + if (m->vm_page_q_state == VM_PAGE_ON_SPECULATIVE_Q) { DTRACE_VM2(pgrec, int, 1, (uint64_t *), NULL); DTRACE_VM2(pgfrec, int, 1, (uint64_t *), NULL); } - vm_page_queues_remove(m); + vm_page_queues_remove(m, FALSE); if ( !VM_PAGE_WIRED(m)) { vm_page_check_pageable_safe(m); - if (!VM_DYNAMIC_PAGING_ENABLED(memory_manager_default) && - m->dirty && m->object->internal && - (m->object->purgable == VM_PURGABLE_DENY || - m->object->purgable == VM_PURGABLE_NONVOLATILE || - m->object->purgable == VM_PURGABLE_VOLATILE)) { - queue_enter(&vm_page_queue_throttled, m, vm_page_t, pageq); - m->throttled = TRUE; + if (!VM_DYNAMIC_PAGING_ENABLED() && + m->dirty && m_object->internal && + (m_object->purgable == VM_PURGABLE_DENY || + m_object->purgable == VM_PURGABLE_NONVOLATILE || + m_object->purgable == VM_PURGABLE_VOLATILE)) { + vm_page_queue_enter(&vm_page_queue_throttled, m, vm_page_t, pageq); + m->vm_page_q_state = VM_PAGE_ON_THROTTLED_Q; vm_page_throttled_count++; } else { - queue_enter(&vm_page_queue_active, m, vm_page_t, pageq); - m->active = TRUE; - vm_page_active_count++; - if (m->object->internal) { - vm_page_pageable_internal_count++; - } else { - vm_page_pageable_external_count++; - } +#if 
CONFIG_SECLUDED_MEMORY + if (secluded_for_filecache && + vm_page_secluded_target != 0 && + num_tasks_can_use_secluded_mem == 0 && + m_object->eligible_for_secluded && + ((secluded_aging_policy == SECLUDED_AGING_FIFO) || + (secluded_aging_policy == + SECLUDED_AGING_ALONG_ACTIVE) || + (secluded_aging_policy == + SECLUDED_AGING_BEFORE_ACTIVE))) { + vm_page_queue_enter(&vm_page_queue_secluded, m, + vm_page_t, pageq); + m->vm_page_q_state = VM_PAGE_ON_SECLUDED_Q; + vm_page_secluded_count++; + vm_page_secluded_count_inuse++; + assert(!m_object->internal); +// vm_page_pageable_external_count++; + } else +#endif /* CONFIG_SECLUDED_MEMORY */ + vm_page_enqueue_active(m, FALSE); } m->reference = TRUE; m->no_cache = FALSE; @@ -3398,15 +4072,17 @@ vm_page_speculate( boolean_t new) { struct vm_speculative_age_q *aq; + vm_object_t m_object; + + m_object = VM_PAGE_OBJECT(m); VM_PAGE_CHECK(m); vm_page_check_pageable_safe(m); - assert(m->phys_page != vm_page_guard_addr); -#if DEBUG - lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); -#endif + assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr); + LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); assert( !(m->absent && !m->unusual)); + assert(m_object->internal == FALSE); /* * if this page is currently on the pageout queue, we can't do the @@ -3416,10 +4092,12 @@ vm_page_speculate( * reference which is held on the object while the page is in the pageout queue... 
* just let the normal laundry processing proceed */ - if (m->laundry || m->pageout_queue || m->private || m->fictitious || m->compressor) + if (m->laundry || m->private || m->fictitious || + (m->vm_page_q_state == VM_PAGE_USED_BY_COMPRESSOR) || + (m->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q)) return; - vm_page_queues_remove(m); + vm_page_queues_remove(m, FALSE); if ( !VM_PAGE_WIRED(m)) { mach_timespec_t ts; @@ -3461,7 +4139,7 @@ vm_page_speculate( } aq = &vm_page_queue_speculative[speculative_age_index]; - if (!queue_empty(&aq->age_q)) + if (!vm_page_queue_empty(&aq->age_q)) vm_page_speculate_ageit(aq); aq->age_ts.tv_sec = vm_page_speculative_q_age_ms / 1000; @@ -3470,19 +4148,15 @@ vm_page_speculate( ADD_MACH_TIMESPEC(&aq->age_ts, &ts); } } - enqueue_tail(&aq->age_q, &m->pageq); - m->speculative = TRUE; + vm_page_enqueue_tail(&aq->age_q, &m->pageq); + m->vm_page_q_state = VM_PAGE_ON_SPECULATIVE_Q; vm_page_speculative_count++; - if (m->object->internal) { - vm_page_pageable_internal_count++; - } else { - vm_page_pageable_external_count++; - } + vm_page_pageable_external_count++; if (new == TRUE) { - vm_object_lock_assert_exclusive(m->object); + vm_object_lock_assert_exclusive(m_object); - m->object->pages_created++; + m_object->pages_created++; #if DEVELOPMENT || DEBUG vm_page_speculative_created++; #endif @@ -3506,28 +4180,28 @@ vm_page_speculate_ageit(struct vm_speculative_age_q *aq) sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q]; - if (queue_empty(&sq->age_q)) { + if (vm_page_queue_empty(&sq->age_q)) { sq->age_q.next = aq->age_q.next; sq->age_q.prev = aq->age_q.prev; - t = (vm_page_t)sq->age_q.next; - t->pageq.prev = &sq->age_q; + t = (vm_page_t)VM_PAGE_UNPACK_PTR(sq->age_q.next); + t->pageq.prev = VM_PAGE_PACK_PTR(&sq->age_q); - t = (vm_page_t)sq->age_q.prev; - t->pageq.next = &sq->age_q; + t = (vm_page_t)VM_PAGE_UNPACK_PTR(sq->age_q.prev); + t->pageq.next = VM_PAGE_PACK_PTR(&sq->age_q); } else { - t = (vm_page_t)sq->age_q.prev; + t = 
(vm_page_t)VM_PAGE_UNPACK_PTR(sq->age_q.prev); t->pageq.next = aq->age_q.next; - t = (vm_page_t)aq->age_q.next; + t = (vm_page_t)VM_PAGE_UNPACK_PTR(aq->age_q.next); t->pageq.prev = sq->age_q.prev; - t = (vm_page_t)aq->age_q.prev; - t->pageq.next = &sq->age_q; + t = (vm_page_t)VM_PAGE_UNPACK_PTR(aq->age_q.prev); + t->pageq.next = VM_PAGE_PACK_PTR(&sq->age_q); sq->age_q.prev = aq->age_q.prev; } - queue_init(&aq->age_q); + vm_page_queue_init(&aq->age_q); } @@ -3536,12 +4210,10 @@ vm_page_lru( vm_page_t m) { VM_PAGE_CHECK(m); - assert(m->object != kernel_object); - assert(m->phys_page != vm_page_guard_addr); + assert(VM_PAGE_OBJECT(m) != kernel_object); + assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr); -#if DEBUG - lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); -#endif + LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); /* * if this page is currently on the pageout queue, we can't do the * vm_page_queues_remove (which doesn't handle the pageout queue case) @@ -3550,12 +4222,15 @@ vm_page_lru( * reference which is held on the object while the page is in the pageout queue... * just let the normal laundry processing proceed */ - if (m->laundry || m->pageout_queue || m->private || m->compressor || (VM_PAGE_WIRED(m))) + if (m->laundry || m->private || + (m->vm_page_q_state == VM_PAGE_USED_BY_COMPRESSOR) || + (m->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q) || + VM_PAGE_WIRED(m)) return; m->no_cache = FALSE; - vm_page_queues_remove(m); + vm_page_queues_remove(m, FALSE); vm_page_enqueue_inactive(m, FALSE); } @@ -3569,36 +4244,38 @@ vm_page_reactivate_all_throttled(void) vm_page_t m; int extra_active_count; int extra_internal_count, extra_external_count; + vm_object_t m_object; - if (!VM_DYNAMIC_PAGING_ENABLED(memory_manager_default)) + if (!VM_DYNAMIC_PAGING_ENABLED()) return; extra_active_count = 0; extra_internal_count = 0; extra_external_count = 0; vm_page_lock_queues(); - if (! queue_empty(&vm_page_queue_throttled)) { + if (! 
vm_page_queue_empty(&vm_page_queue_throttled)) { /* * Switch "throttled" pages to "active". */ - queue_iterate(&vm_page_queue_throttled, m, vm_page_t, pageq) { + vm_page_queue_iterate(&vm_page_queue_throttled, m, vm_page_t, pageq) { VM_PAGE_CHECK(m); - assert(m->throttled); - assert(!m->active); - assert(!m->inactive); - assert(!m->speculative); - assert(!VM_PAGE_WIRED(m)); + assert(m->vm_page_q_state == VM_PAGE_ON_THROTTLED_Q); + + m_object = VM_PAGE_OBJECT(m); extra_active_count++; - if (m->object->internal) { + if (m_object->internal) { extra_internal_count++; } else { extra_external_count++; } - m->throttled = FALSE; - m->active = TRUE; + m->vm_page_q_state = VM_PAGE_ON_ACTIVE_Q; VM_PAGE_CHECK(m); +#if CONFIG_BACKGROUND_QUEUE + if (m->vm_page_in_background) + vm_page_add_to_backgroundq(m, FALSE); +#endif } /* @@ -3607,22 +4284,22 @@ vm_page_reactivate_all_throttled(void) * get re-evaluated by the LRU algorithm first, since they've been * completely out of it until now. */ - first_throttled = (vm_page_t) queue_first(&vm_page_queue_throttled); - last_throttled = (vm_page_t) queue_last(&vm_page_queue_throttled); - first_active = (vm_page_t) queue_first(&vm_page_queue_active); - if (queue_empty(&vm_page_queue_active)) { - queue_last(&vm_page_queue_active) = (queue_entry_t) last_throttled; + first_throttled = (vm_page_t) vm_page_queue_first(&vm_page_queue_throttled); + last_throttled = (vm_page_t) vm_page_queue_last(&vm_page_queue_throttled); + first_active = (vm_page_t) vm_page_queue_first(&vm_page_queue_active); + if (vm_page_queue_empty(&vm_page_queue_active)) { + vm_page_queue_active.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_throttled); } else { - queue_prev(&first_active->pageq) = (queue_entry_t) last_throttled; + first_active->pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_throttled); } - queue_first(&vm_page_queue_active) = (queue_entry_t) first_throttled; - queue_prev(&first_throttled->pageq) = (queue_entry_t) &vm_page_queue_active; - 
queue_next(&last_throttled->pageq) = (queue_entry_t) first_active; + vm_page_queue_active.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_throttled); + first_throttled->pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(&vm_page_queue_active); + last_throttled->pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_active); #if DEBUG printf("reactivated %d throttled pages\n", vm_page_throttled_count); #endif - queue_init(&vm_page_queue_throttled); + vm_page_queue_init(&vm_page_queue_throttled); /* * Adjust the global page counts. */ @@ -3632,7 +4309,7 @@ vm_page_reactivate_all_throttled(void) vm_page_throttled_count = 0; } assert(vm_page_throttled_count == 0); - assert(queue_empty(&vm_page_queue_throttled)); + assert(vm_page_queue_empty(&vm_page_queue_throttled)); vm_page_unlock_queues(); } @@ -3671,27 +4348,24 @@ vm_page_reactivate_local(uint32_t lid, boolean_t force, boolean_t nolocks) /* * Switch "local" pages to "active". */ - assert(!queue_empty(&lq->vpl_queue)); + assert(!vm_page_queue_empty(&lq->vpl_queue)); - queue_iterate(&lq->vpl_queue, m, vm_page_t, pageq) { + vm_page_queue_iterate(&lq->vpl_queue, m, vm_page_t, pageq) { VM_PAGE_CHECK(m); vm_page_check_pageable_safe(m); - assert(m->local); - assert(!m->active); - assert(!m->inactive); - assert(!m->speculative); - assert(!VM_PAGE_WIRED(m)); - assert(!m->throttled); + assert(m->vm_page_q_state == VM_PAGE_ON_ACTIVE_LOCAL_Q); assert(!m->fictitious); if (m->local_id != lid) panic("vm_page_reactivate_local: found vm_page_t(%p) with wrong cpuid", m); m->local_id = 0; - m->local = FALSE; - m->active = TRUE; + m->vm_page_q_state = VM_PAGE_ON_ACTIVE_Q; VM_PAGE_CHECK(m); - +#if CONFIG_BACKGROUND_QUEUE + if (m->vm_page_in_background) + vm_page_add_to_backgroundq(m, FALSE); +#endif count++; } if (count != lq->vpl_count) @@ -3700,20 +4374,20 @@ vm_page_reactivate_local(uint32_t lid, boolean_t force, boolean_t nolocks) /* * Transfer the entire local queue to a regular LRU page queues. 
*/ - first_local = (vm_page_t) queue_first(&lq->vpl_queue); - last_local = (vm_page_t) queue_last(&lq->vpl_queue); - first_active = (vm_page_t) queue_first(&vm_page_queue_active); + first_local = (vm_page_t) vm_page_queue_first(&lq->vpl_queue); + last_local = (vm_page_t) vm_page_queue_last(&lq->vpl_queue); + first_active = (vm_page_t) vm_page_queue_first(&vm_page_queue_active); - if (queue_empty(&vm_page_queue_active)) { - queue_last(&vm_page_queue_active) = (queue_entry_t) last_local; + if (vm_page_queue_empty(&vm_page_queue_active)) { + vm_page_queue_active.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_local); } else { - queue_prev(&first_active->pageq) = (queue_entry_t) last_local; + first_active->pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_local); } - queue_first(&vm_page_queue_active) = (queue_entry_t) first_local; - queue_prev(&first_local->pageq) = (queue_entry_t) &vm_page_queue_active; - queue_next(&last_local->pageq) = (queue_entry_t) first_active; + vm_page_queue_active.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_local); + first_local->pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(&vm_page_queue_active); + last_local->pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_active); - queue_init(&lq->vpl_queue); + vm_page_queue_init(&lq->vpl_queue); /* * Adjust the global page counts. 
*/ @@ -3724,7 +4398,7 @@ vm_page_reactivate_local(uint32_t lid, boolean_t force, boolean_t nolocks) lq->vpl_internal_count = 0; lq->vpl_external_count = 0; } - assert(queue_empty(&lq->vpl_queue)); + assert(vm_page_queue_empty(&lq->vpl_queue)); if (nolocks == FALSE) { VPL_UNLOCK(&lq->vpl_lock); @@ -3754,7 +4428,7 @@ vm_page_part_zero_fill( #endif #ifdef PMAP_ZERO_PART_PAGE_IMPLEMENTED - pmap_zero_part_page(m->phys_page, m_pa, len); + pmap_zero_part_page(VM_PAGE_GET_PHYS_PAGE(m), m_pa, len); #else vm_page_t tmp; while (1) { @@ -3789,8 +4463,8 @@ vm_page_zero_fill( vm_page_t m) { XPR(XPR_VM_PAGE, - "vm_page_zero_fill, object 0x%X offset 0x%X page 0x%X\n", - m->object, m->offset, m, 0,0); + "vm_page_zero_fill, object 0x%X offset 0x%X page 0x%X\n", + VM_PAGE_OBJECT(m), m->offset, m, 0,0); #if 0 /* * we don't hold the page queue lock @@ -3799,8 +4473,8 @@ vm_page_zero_fill( VM_PAGE_CHECK(m); #endif -// dbgTrace(0xAEAEAEAE, m->phys_page, 0); /* (BRINGUP) */ - pmap_zero_page(m->phys_page); +// dbgTrace(0xAEAEAEAE, VM_PAGE_GET_PHYS_PAGE(m), 0); /* (BRINGUP) */ + pmap_zero_page(VM_PAGE_GET_PHYS_PAGE(m)); } /* @@ -3825,8 +4499,8 @@ vm_page_part_copy( VM_PAGE_CHECK(src_m); VM_PAGE_CHECK(dst_m); #endif - pmap_copy_part_page(src_m->phys_page, src_pa, - dst_m->phys_page, dst_pa, len); + pmap_copy_part_page(VM_PAGE_GET_PHYS_PAGE(src_m), src_pa, + VM_PAGE_GET_PHYS_PAGE(dst_m), dst_pa, len); } /* @@ -3847,11 +4521,15 @@ vm_page_copy( vm_page_t src_m, vm_page_t dest_m) { + vm_object_t src_m_object; + + src_m_object = VM_PAGE_OBJECT(src_m); + XPR(XPR_VM_PAGE, - "vm_page_copy, object 0x%X offset 0x%X to object 0x%X offset 0x%X\n", - src_m->object, src_m->offset, - dest_m->object, dest_m->offset, - 0); + "vm_page_copy, object 0x%X offset 0x%X to object 0x%X offset 0x%X\n", + src_m_object, src_m->offset, + VM_PAGE_OBJECT(dest_m), dest_m->offset, + 0); #if 0 /* * we don't hold the page queue lock @@ -3860,7 +4538,7 @@ vm_page_copy( VM_PAGE_CHECK(src_m); VM_PAGE_CHECK(dest_m); #endif - 
vm_object_lock_assert_held(src_m->object); + vm_object_lock_assert_held(src_m_object); /* * ENCRYPTED SWAP: @@ -3873,8 +4551,8 @@ vm_page_copy( } dest_m->encrypted = FALSE; - if (src_m->object != VM_OBJECT_NULL && - src_m->object->code_signed) { + if (src_m_object != VM_OBJECT_NULL && + src_m_object->code_signed) { /* * We're copying a page from a code-signed object. * Whoever ends up mapping the copy page might care about @@ -3883,6 +4561,14 @@ vm_page_copy( */ vm_page_copy_cs_validations++; vm_page_validate_cs(src_m); +#if DEVELOPMENT || DEBUG + DTRACE_VM4(codesigned_copy, + vm_object_t, src_m_object, + vm_object_offset_t, src_m->offset, + int, src_m->cs_validated, + int, src_m->cs_tainted); +#endif /* DEVELOPMENT || DEBUG */ + } if (vm_page_is_slideable(src_m)) { @@ -3905,7 +4591,7 @@ vm_page_copy( } dest_m->slid = src_m->slid; dest_m->error = src_m->error; /* sliding src_m might have failed... */ - pmap_copy_page(src_m->phys_page, dest_m->phys_page); + pmap_copy_page(VM_PAGE_GET_PHYS_PAGE(src_m), VM_PAGE_GET_PHYS_PAGE(dest_m)); } #if MACH_ASSERT @@ -3914,34 +4600,31 @@ _vm_page_print( vm_page_t p) { printf("vm_page %p: \n", p); - printf(" pageq: next=%p prev=%p\n", p->pageq.next, p->pageq.prev); - printf(" listq: next=%p prev=%p\n", p->listq.next, p->listq.prev); - printf(" next=%p\n", VM_PAGE_UNPACK_PTR(p->next_m)); - printf(" object=%p offset=0x%llx\n", p->object, p->offset); + printf(" pageq: next=%p prev=%p\n", + (vm_page_t)VM_PAGE_UNPACK_PTR(p->pageq.next), + (vm_page_t)VM_PAGE_UNPACK_PTR(p->pageq.prev)); + printf(" listq: next=%p prev=%p\n", + (vm_page_t)(VM_PAGE_UNPACK_PTR(p->listq.next)), + (vm_page_t)(VM_PAGE_UNPACK_PTR(p->listq.prev))); + printf(" next=%p\n", (vm_page_t)(VM_PAGE_UNPACK_PTR(p->next_m))); + printf(" object=%p offset=0x%llx\n",VM_PAGE_OBJECT(p), p->offset); printf(" wire_count=%u\n", p->wire_count); + printf(" q_state=%u\n", p->vm_page_q_state); - printf(" %slocal, %sinactive, %sactive, %spageout_queue, %sspeculative, %slaundry\n", - 
(p->local ? "" : "!"), - (p->inactive ? "" : "!"), - (p->active ? "" : "!"), - (p->pageout_queue ? "" : "!"), - (p->speculative ? "" : "!"), - (p->laundry ? "" : "!")); - printf(" %sfree, %sref, %sgobbled, %sprivate, %sthrottled\n", - (p->free ? "" : "!"), + printf(" %slaundry, %sref, %sgobbled, %sprivate\n", + (p->laundry ? "" : "!"), (p->reference ? "" : "!"), (p->gobbled ? "" : "!"), - (p->private ? "" : "!"), - (p->throttled ? "" : "!")); + (p->private ? "" : "!")); printf(" %sbusy, %swanted, %stabled, %sfictitious, %spmapped, %swpmapped\n", - (p->busy ? "" : "!"), - (p->wanted ? "" : "!"), - (p->tabled ? "" : "!"), - (p->fictitious ? "" : "!"), - (p->pmapped ? "" : "!"), - (p->wpmapped ? "" : "!")); - printf(" %spageout, %sabsent, %serror, %sdirty, %scleaning, %sprecious, %sclustered\n", - (p->pageout ? "" : "!"), + (p->busy ? "" : "!"), + (p->wanted ? "" : "!"), + (p->tabled ? "" : "!"), + (p->fictitious ? "" : "!"), + (p->pmapped ? "" : "!"), + (p->wpmapped ? "" : "!")); + printf(" %sfree_when_done, %sabsent, %serror, %sdirty, %scleaning, %sprecious, %sclustered\n", + (p->free_when_done ? "" : "!"), (p->absent ? "" : "!"), (p->error ? "" : "!"), (p->dirty ? "" : "!"), @@ -3960,7 +4643,7 @@ _vm_page_print( (p->cs_nx ? "" : "!"), (p->no_cache ? 
"" : "!")); - printf("phys_page=0x%x\n", p->phys_page); + printf("phys_page=0x%x\n", VM_PAGE_GET_PHYS_PAGE(p)); } /* @@ -3972,20 +4655,20 @@ vm_page_verify_contiguous( vm_page_t pages, unsigned int npages) { - register vm_page_t m; + vm_page_t m; unsigned int page_count; vm_offset_t prev_addr; - prev_addr = pages->phys_page; + prev_addr = VM_PAGE_GET_PHYS_PAGE(pages); page_count = 1; for (m = NEXT_PAGE(pages); m != VM_PAGE_NULL; m = NEXT_PAGE(m)) { - if (m->phys_page != prev_addr + 1) { + if (VM_PAGE_GET_PHYS_PAGE(m) != prev_addr + 1) { printf("m %p prev_addr 0x%lx, current addr 0x%x\n", - m, (long)prev_addr, m->phys_page); + m, (long)prev_addr, VM_PAGE_GET_PHYS_PAGE(m)); printf("pages %p page_count %d npages %d\n", pages, page_count, npages); panic("vm_page_verify_contiguous: not contiguous!"); } - prev_addr = m->phys_page; + prev_addr = VM_PAGE_GET_PHYS_PAGE(m); ++page_count; } if (page_count != npages) { @@ -4003,7 +4686,7 @@ vm_page_verify_contiguous( static boolean_t vm_page_verify_this_free_list_enabled = FALSE; static unsigned int vm_page_verify_free_list( - queue_head_t *vm_page_queue, + vm_page_queue_head_t *vm_page_queue, unsigned int color, vm_page_t look_for_page, boolean_t expect_page) @@ -4018,28 +4701,33 @@ vm_page_verify_free_list( found_page = FALSE; npages = 0; - prev_m = (vm_page_t) vm_page_queue; - queue_iterate(vm_page_queue, - m, - vm_page_t, - pageq) { + prev_m = (vm_page_t)((uintptr_t)vm_page_queue); + + vm_page_queue_iterate(vm_page_queue, + m, + vm_page_t, + pageq) { if (m == look_for_page) { found_page = TRUE; } - if ((vm_page_t) m->pageq.prev != prev_m) + if ((vm_page_t)VM_PAGE_UNPACK_PTR(m->pageq.prev) != prev_m) panic("vm_page_verify_free_list(color=%u, npages=%u): page %p corrupted prev ptr %p instead of %p\n", - color, npages, m, m->pageq.prev, prev_m); + color, npages, m, (vm_page_t)VM_PAGE_UNPACK_PTR(m->pageq.prev), prev_m); if ( ! 
m->busy ) panic("vm_page_verify_free_list(color=%u, npages=%u): page %p not busy\n", color, npages, m); if (color != (unsigned int) -1) { - if ((m->phys_page & vm_color_mask) != color) + if ((VM_PAGE_GET_PHYS_PAGE(m) & vm_color_mask) != color) panic("vm_page_verify_free_list(color=%u, npages=%u): page %p wrong color %u instead of %u\n", - color, npages, m, m->phys_page & vm_color_mask, color); - if ( ! m->free ) - panic("vm_page_verify_free_list(color=%u, npages=%u): page %p not free\n", - color, npages, m); + color, npages, m, VM_PAGE_GET_PHYS_PAGE(m) & vm_color_mask, color); + if (m->vm_page_q_state != VM_PAGE_ON_FREE_Q) + panic("vm_page_verify_free_list(color=%u, npages=%u): page %p - expecting q_state == VM_PAGE_ON_FREE_Q, found %d\n", + color, npages, m, m->vm_page_q_state); + } else { + if (m->vm_page_q_state != VM_PAGE_ON_FREE_LOCAL_Q) + panic("vm_page_verify_free_list(npages=%u): local page %p - expecting q_state == VM_PAGE_ON_FREE_LOCAL_Q, found %d\n", + npages, m, m->vm_page_q_state); } ++npages; prev_m = m; @@ -4049,14 +4737,14 @@ vm_page_verify_free_list( if (expect_page && !found_page) { printf("vm_page_verify_free_list(color=%u, npages=%u): page %p not found phys=%u\n", - color, npages, look_for_page, look_for_page->phys_page); + color, npages, look_for_page, VM_PAGE_GET_PHYS_PAGE(look_for_page)); _vm_page_print(look_for_page); for (other_color = 0; other_color < vm_colors; other_color++) { if (other_color == color) continue; - vm_page_verify_free_list(&vm_page_queue_free[other_color], + vm_page_verify_free_list(&vm_page_queue_free[other_color].qhead, other_color, look_for_page, FALSE); } if (color == (unsigned int) -1) { @@ -4067,7 +4755,7 @@ vm_page_verify_free_list( } if (!expect_page && found_page) { printf("vm_page_verify_free_list(color=%u, npages=%u): page %p found phys=%u\n", - color, npages, look_for_page, look_for_page->phys_page); + color, npages, look_for_page, VM_PAGE_GET_PHYS_PAGE(look_for_page)); } } return npages; @@ -4101,7 +4789,7 @@ 
vm_page_verify_free_lists( void ) } for( color = 0; color < vm_colors; color++ ) { - npages += vm_page_verify_free_list(&vm_page_queue_free[color], + npages += vm_page_verify_free_list(&vm_page_queue_free[color].qhead, color, VM_PAGE_NULL, FALSE); } nlopages = vm_page_verify_free_list(&vm_lopage_queue_free, @@ -4119,27 +4807,6 @@ vm_page_verify_free_lists( void ) lck_mtx_unlock(&vm_page_queue_free_lock); } -void -vm_page_queues_assert( - vm_page_t mem, - int val) -{ -#if DEBUG - lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); -#endif - if (mem->free + mem->active + mem->inactive + mem->speculative + - mem->throttled + mem->pageout_queue > (val)) { - _vm_page_print(mem); - panic("vm_page_queues_assert(%p, %d)\n", mem, val); - } - if (VM_PAGE_WIRED(mem)) { - assert(!mem->active); - assert(!mem->inactive); - assert(!mem->speculative); - assert(!mem->throttled); - assert(!mem->pageout_queue); - } -} #endif /* MACH_ASSERT */ @@ -4296,20 +4963,19 @@ vm_page_find_contiguous( assert(!m->fictitious); assert(!m->private); - if (max_pnum && m->phys_page > max_pnum) { + if (max_pnum && VM_PAGE_GET_PHYS_PAGE(m) > max_pnum) { /* no more low pages... 
*/ break; } - if (!npages & ((m->phys_page & pnum_mask) != 0)) { + if (!npages & ((VM_PAGE_GET_PHYS_PAGE(m) & pnum_mask) != 0)) { /* * not aligned */ RESET_STATE_OF_RUN(); } else if (VM_PAGE_WIRED(m) || m->gobbled || - m->encrypted_cleaning || - m->pageout_queue || m->laundry || m->wanted || - m->cleaning || m->overwriting || m->pageout) { + m->encrypted_cleaning || m->laundry || m->wanted || + m->cleaning || m->overwriting || m->free_when_done) { /* * page is in a transient state * or a state we don't want to deal @@ -4318,10 +4984,15 @@ vm_page_find_contiguous( */ RESET_STATE_OF_RUN(); - } else if (!m->free && !m->active && !m->inactive && !m->speculative && !m->throttled && !m->compressor) { + } else if ((m->vm_page_q_state == VM_PAGE_NOT_ON_Q) || + (m->vm_page_q_state == VM_PAGE_ON_FREE_LOCAL_Q) || + (m->vm_page_q_state == VM_PAGE_ON_FREE_LOPAGE_Q) || + (m->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q)) { /* - * page needs to be on one of our queues - * or it needs to belong to the compressor pool + * page needs to be on one of our queues (other then the pageout or special free queues) + * or it needs to belong to the compressor pool (which is now indicated + * by vm_page_q_state == VM_PAGE_USED_BY_COMPRESSOR and falls out + * from the check for VM_PAGE_NOT_ON_Q) * in order for it to be stable behind the * locks we hold at this point... 
* if not, don't consider it which @@ -4329,7 +5000,7 @@ vm_page_find_contiguous( */ RESET_STATE_OF_RUN(); - } else if (!m->free && (!m->tabled || m->busy)) { + } else if ((m->vm_page_q_state != VM_PAGE_ON_FREE_Q) && (!m->tabled || m->busy)) { /* * pages on the free list are always 'busy' * so we couldn't test for 'busy' in the check @@ -4344,22 +5015,22 @@ vm_page_find_contiguous( RESET_STATE_OF_RUN(); } else { - if (m->phys_page != prevcontaddr + 1) { - if ((m->phys_page & pnum_mask) != 0) { + if (VM_PAGE_GET_PHYS_PAGE(m) != prevcontaddr + 1) { + if ((VM_PAGE_GET_PHYS_PAGE(m) & pnum_mask) != 0) { RESET_STATE_OF_RUN(); goto did_consider; } else { npages = 1; start_idx = page_idx; - start_pnum = m->phys_page; + start_pnum = VM_PAGE_GET_PHYS_PAGE(m); } } else { npages++; } - prevcontaddr = m->phys_page; + prevcontaddr = VM_PAGE_GET_PHYS_PAGE(m); VM_PAGE_CHECK(m); - if (m->free) { + if (m->vm_page_q_state == VM_PAGE_ON_FREE_Q) { free_considered++; } else { /* @@ -4480,31 +5151,31 @@ vm_page_find_contiguous( m1 = &vm_pages[start_idx++]; #if !VM_PAGE_FIND_CONTIGUOUS_CAN_STEAL - assert(m1->free); + assert(m1->vm_page_q_state == VM_PAGE_ON_FREE_Q); #endif - if (m1->free) { + if (m1->vm_page_q_state == VM_PAGE_ON_FREE_Q) { unsigned int color; - color = m1->phys_page & vm_color_mask; + color = VM_PAGE_GET_PHYS_PAGE(m1) & vm_color_mask; #if MACH_ASSERT - vm_page_verify_free_list(&vm_page_queue_free[color], color, m1, TRUE); + vm_page_verify_free_list(&vm_page_queue_free[color].qhead, color, m1, TRUE); #endif - queue_remove(&vm_page_queue_free[color], - m1, - vm_page_t, - pageq); - m1->pageq.next = NULL; - m1->pageq.prev = NULL; + vm_page_queue_remove(&vm_page_queue_free[color].qhead, + m1, + vm_page_t, + pageq); + + VM_PAGE_ZERO_PAGEQ_ENTRY(m1); #if MACH_ASSERT - vm_page_verify_free_list(&vm_page_queue_free[color], color, VM_PAGE_NULL, FALSE); + vm_page_verify_free_list(&vm_page_queue_free[color].qhead, color, VM_PAGE_NULL, FALSE); #endif /* * Clear the "free" bit so that 
this page * does not get considered for another * concurrent physically-contiguous allocation. */ - m1->free = FALSE; + m1->vm_page_q_state = VM_PAGE_NOT_ON_Q; assert(m1->busy); vm_page_free_count--; @@ -4534,13 +5205,12 @@ vm_page_find_contiguous( */ m1 = &vm_pages[cur_idx--]; - assert(!m1->free); - - if (m1->object == VM_OBJECT_NULL) { + if (m1->vm_page_object == 0) { /* * page has already been removed from * the free list in the 1st pass */ + assert(m1->vm_page_q_state == VM_PAGE_NOT_ON_Q); assert(m1->offset == (vm_object_offset_t) -1); assert(m1->busy); assert(!m1->wanted); @@ -4553,7 +5223,9 @@ vm_page_find_contiguous( if (abort_run == TRUE) continue; - object = m1->object; + assert(m1->vm_page_q_state != VM_PAGE_NOT_ON_Q); + + object = VM_PAGE_OBJECT(m1); if (object != locked_object) { if (locked_object) { @@ -4565,9 +5237,9 @@ vm_page_find_contiguous( } if (locked_object == VM_OBJECT_NULL || (VM_PAGE_WIRED(m1) || m1->gobbled || - m1->encrypted_cleaning || - m1->pageout_queue || m1->laundry || m1->wanted || - m1->cleaning || m1->overwriting || m1->pageout || m1->busy)) { + m1->encrypted_cleaning || m1->laundry || m1->wanted || + m1->cleaning || m1->overwriting || m1->free_when_done || m1->busy) || + (m1->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q)) { if (locked_object) { vm_object_unlock(locked_object); @@ -4582,12 +5254,12 @@ vm_page_find_contiguous( reusable = FALSE; if ((m1->reusable || - m1->object->all_reusable) && - m1->inactive && + object->all_reusable) && + (m1->vm_page_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q) && !m1->dirty && !m1->reference) { /* reusable page... */ - refmod = pmap_disconnect(m1->phys_page); + refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m1)); disconnected = TRUE; if (refmod == 0) { /* @@ -4617,17 +5289,17 @@ vm_page_find_contiguous( } if (! 
disconnected) { if (m1->pmapped) - refmod = pmap_disconnect(m1->phys_page); + refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m1)); else refmod = 0; } /* copy the page's contents */ - pmap_copy_page(m1->phys_page, m2->phys_page); + pmap_copy_page(VM_PAGE_GET_PHYS_PAGE(m1), VM_PAGE_GET_PHYS_PAGE(m2)); /* copy the page's state */ assert(!VM_PAGE_WIRED(m1)); - assert(!m1->free); - assert(!m1->pageout_queue); + assert(m1->vm_page_q_state != VM_PAGE_ON_FREE_Q); + assert(m1->vm_page_q_state != VM_PAGE_ON_PAGEOUT_Q); assert(!m1->laundry); m2->reference = m1->reference; assert(!m1->gobbled); @@ -4639,7 +5311,7 @@ vm_page_find_contiguous( assert(!m1->fictitious); m2->pmapped = m1->pmapped; /* should flush cache ? */ m2->wpmapped = m1->wpmapped; - assert(!m1->pageout); + assert(!m1->free_when_done); m2->absent = m1->absent; m2->error = m1->error; m2->dirty = m1->dirty; @@ -4665,9 +5337,11 @@ vm_page_find_contiguous( // m2->reusable = m1->reusable; assert(!m2->reusable); - assert(!m1->lopage); + // assert(!m1->lopage); m2->slid = m1->slid; - m2->compressor = m1->compressor; + + if (m1->vm_page_q_state == VM_PAGE_USED_BY_COMPRESSOR) + m2->vm_page_q_state = VM_PAGE_USED_BY_COMPRESSOR; /* * page may need to be flushed if @@ -4683,7 +5357,7 @@ vm_page_find_contiguous( * inheriting state from the last time * this page was used... 
*/ - pmap_clear_refmod(m2->phys_page, VM_MEM_MODIFIED | VM_MEM_REFERENCED); + pmap_clear_refmod(VM_PAGE_GET_PHYS_PAGE(m2), VM_MEM_MODIFIED | VM_MEM_REFERENCED); if (refmod & VM_MEM_REFERENCED) m2->reference = TRUE; @@ -4707,7 +5381,7 @@ vm_page_find_contiguous( */ vm_page_insert_internal(m2, locked_object, offset, VM_KERN_MEMORY_NONE, TRUE, TRUE, FALSE, FALSE, NULL); - if (m2->compressor) { + if (m2->vm_page_q_state == VM_PAGE_USED_BY_COMPRESSOR) { m2->pmapped = TRUE; m2->wpmapped = TRUE; @@ -4725,7 +5399,7 @@ vm_page_find_contiguous( PAGE_WAKEUP_DONE(m2); } else { - assert(!m1->compressor); + assert(m1->vm_page_q_state != VM_PAGE_USED_BY_COMPRESSOR); /* * completely cleans up the state @@ -4740,8 +5414,11 @@ vm_page_find_contiguous( stolen_pages++; } - m1->pageq.next = (queue_entry_t) m; - m1->pageq.prev = NULL; +#if CONFIG_BACKGROUND_QUEUE + vm_page_assign_background_state(m1); +#endif + VM_PAGE_ZERO_PAGEQ_ENTRY(m1); + m1->snext = m; m = m1; } if (locked_object) { @@ -4797,9 +5474,13 @@ vm_page_find_contiguous( for (m1 = m; m1 != VM_PAGE_NULL; m1 = NEXT_PAGE(m1)) { - if (wire == TRUE) + assert(m1->vm_page_q_state == VM_PAGE_NOT_ON_Q); + assert(m1->wire_count == 0); + + if (wire == TRUE) { m1->wire_count++; - else + m1->vm_page_q_state = VM_PAGE_IS_WIRED; + } else m1->gobbled = TRUE; } if (wire == FALSE) @@ -4850,7 +5531,7 @@ vm_page_find_contiguous( (void)(*consider_buffer_cache_collect)(1); } - consider_zone_gc(TRUE); + consider_zone_gc(); zone_gc_called = TRUE; @@ -4996,18 +5677,18 @@ vm_page_do_delayed_work( if (dwp->dw_mask & DW_vm_page_free) { vm_page_free_prepare_queues(m); - assert(m->pageq.next == NULL && m->pageq.prev == NULL); + assert(m->pageq.next == 0 && m->pageq.prev == 0); /* * Add this page to our list of reclaimed pages, * to be freed later. 
*/ - m->pageq.next = (queue_entry_t) local_free_q; + m->snext = local_free_q; local_free_q = m; } else { if (dwp->dw_mask & DW_vm_page_deactivate_internal) vm_page_deactivate_internal(m, FALSE); else if (dwp->dw_mask & DW_vm_page_activate) { - if (m->active == FALSE) { + if (m->vm_page_q_state != VM_PAGE_ON_ACTIVE_Q) { vm_page_activate(m); } } @@ -5021,7 +5702,7 @@ vm_page_do_delayed_work( * cleaned queue, and so we would have a referenced (maybe even dirty) * page on that queue, which we don't want */ - int refmod_state = pmap_disconnect(m->phys_page); + int refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)); if ((refmod_state & VM_MEM_REFERENCED)) { /* @@ -5032,7 +5713,7 @@ vm_page_do_delayed_work( vm_pageout_cleaned_reactivated++; vm_pageout_cleaned_commit_reactivated++; - if (m->active == FALSE) + if (m->vm_page_q_state != VM_PAGE_ON_ACTIVE_Q) vm_page_activate(m); } else { m->reference = FALSE; @@ -5042,8 +5723,8 @@ vm_page_do_delayed_work( else if (dwp->dw_mask & DW_vm_page_lru) vm_page_lru(m); else if (dwp->dw_mask & DW_VM_PAGE_QUEUES_REMOVE) { - if ( !m->pageout_queue) - vm_page_queues_remove(m); + if (m->vm_page_q_state != VM_PAGE_ON_PAGEOUT_Q) + vm_page_queues_remove(m, TRUE); } if (dwp->dw_mask & DW_set_reference) m->reference = TRUE; @@ -5051,10 +5732,10 @@ vm_page_do_delayed_work( m->reference = FALSE; if (dwp->dw_mask & DW_move_page) { - if ( !m->pageout_queue) { - vm_page_queues_remove(m); + if (m->vm_page_q_state != VM_PAGE_ON_PAGEOUT_Q) { + vm_page_queues_remove(m, FALSE); - assert(m->object != kernel_object); + assert(VM_PAGE_OBJECT(m) != kernel_object); vm_page_enqueue_inactive(m, FALSE); } @@ -5100,7 +5781,7 @@ vm_page_alloc_list( return (KERN_RESOURCE_SHORTAGE); } - mem->pageq.next = (queue_entry_t) lo_page_list; + mem->snext = lo_page_list; lo_page_list = mem; } *list = lo_page_list; @@ -5117,7 +5798,7 @@ vm_page_set_offset(vm_page_t page, vm_object_offset_t offset) vm_page_t vm_page_get_next(vm_page_t page) { - return ((vm_page_t) 
page->pageq.next); + return (page->snext); } vm_object_offset_t @@ -5129,7 +5810,7 @@ vm_page_get_offset(vm_page_t page) ppnum_t vm_page_get_phys_page(vm_page_t page) { - return (page->phys_page); + return (VM_PAGE_GET_PHYS_PAGE(page)); } @@ -5141,7 +5822,7 @@ static vm_page_t hibernate_gobble_queue; static int hibernate_drain_pageout_queue(struct vm_pageout_queue *); static int hibernate_flush_dirty_pages(int); -static int hibernate_flush_queue(queue_head_t *, int); +static int hibernate_flush_queue(vm_page_queue_head_t *, int); void hibernate_flush_wait(void); void hibernate_mark_in_progress(void); @@ -5203,7 +5884,7 @@ hibernate_drain_pageout_queue(struct vm_pageout_queue *q) vm_page_lock_queues(); - while ( !queue_empty(&q->pgo_pending) ) { + while ( !vm_page_queue_empty(&q->pgo_pending) ) { q->pgo_draining = TRUE; @@ -5213,7 +5894,7 @@ hibernate_drain_pageout_queue(struct vm_pageout_queue *q) wait_result = thread_block(THREAD_CONTINUE_NULL); - if (wait_result == THREAD_TIMED_OUT && !queue_empty(&q->pgo_pending)) { + if (wait_result == THREAD_TIMED_OUT && !vm_page_queue_empty(&q->pgo_pending)) { hibernate_stats.hibernate_drain_timeout++; if (q == &vm_pageout_queue_external) @@ -5234,7 +5915,7 @@ hibernate_drain_pageout_queue(struct vm_pageout_queue *q) boolean_t hibernate_skip_external = FALSE; static int -hibernate_flush_queue(queue_head_t *q, int qcount) +hibernate_flush_queue(vm_page_queue_head_t *q, int qcount) { vm_page_t m; vm_object_t l_object = NULL; @@ -5255,7 +5936,7 @@ hibernate_flush_queue(queue_head_t *q, int qcount) vm_page_lock_queues(); - while (qcount && !queue_empty(q)) { + while (qcount && !vm_page_queue_empty(q)) { if (current_run++ == 1000) { if (hibernate_should_abort()) { @@ -5265,8 +5946,8 @@ hibernate_flush_queue(queue_head_t *q, int qcount) current_run = 0; } - m = (vm_page_t) queue_first(q); - m_object = m->object; + m = (vm_page_t) vm_page_queue_first(q); + m_object = VM_PAGE_OBJECT(m); /* * check to see if we currently are working 
@@ -5330,7 +6011,7 @@ hibernate_flush_queue(queue_head_t *q, int qcount) } } if ( !m->dirty && m->pmapped) { - refmod_state = pmap_get_refmod(m->phys_page); + refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m)); if ((refmod_state & VM_MEM_MODIFIED)) { SET_PAGE_DIRTY(m, FALSE); @@ -5415,22 +6096,20 @@ hibernate_flush_queue(queue_head_t *q, int qcount) * means this page can't be on the pageout queue so it's * safe to do the vm_page_queues_remove */ - assert(!m->pageout_queue); - - vm_page_queues_remove(m); + vm_page_queues_remove(m, TRUE); - if (COMPRESSED_PAGER_IS_ACTIVE && m_object->internal == TRUE) - pmap_disconnect_options(m->phys_page, PMAP_OPTIONS_COMPRESSOR, NULL); + if (m_object->internal == TRUE) + pmap_disconnect_options(VM_PAGE_GET_PHYS_PAGE(m), PMAP_OPTIONS_COMPRESSOR, NULL); - (void)vm_pageout_cluster(m, FALSE, FALSE, FALSE); + (void)vm_pageout_cluster(m, FALSE, FALSE); hibernate_stats.hibernate_found_dirty++; goto next_pg; reenter_pg_on_q: - queue_remove(q, m, vm_page_t, pageq); - queue_enter(q, m, vm_page_t, pageq); + vm_page_queue_remove(q, m, vm_page_t, pageq); + vm_page_queue_enter(q, m, vm_page_t, pageq); hibernate_stats.hibernate_reentered_on_q++; next_pg: @@ -5469,13 +6148,13 @@ hibernate_flush_dirty_pages(int pass) aq = &vm_page_queue_speculative[i]; - if (queue_empty(&aq->age_q)) + if (vm_page_queue_empty(&aq->age_q)) continue; qcount = 0; vm_page_lockspin_queues(); - queue_iterate(&aq->age_q, + vm_page_queue_iterate(&aq->age_q, m, vm_page_t, pageq) @@ -5491,6 +6170,7 @@ hibernate_flush_dirty_pages(int pass) } if (hibernate_flush_queue(&vm_page_queue_inactive, vm_page_inactive_count - vm_page_anonymous_count - vm_page_cleaned_count)) return (1); + /* XXX FBDP TODO: flush secluded queue */ if (hibernate_flush_queue(&vm_page_queue_anonymous, vm_page_anonymous_count)) return (1); if (hibernate_flush_queue(&vm_page_queue_cleaned, vm_page_cleaned_count)) @@ -5498,20 +6178,20 @@ hibernate_flush_dirty_pages(int pass) if 
(hibernate_drain_pageout_queue(&vm_pageout_queue_internal)) return (1); - if (COMPRESSED_PAGER_IS_ACTIVE && pass == 1) + if (pass == 1) vm_compressor_record_warmup_start(); if (hibernate_flush_queue(&vm_page_queue_active, vm_page_active_count)) { - if (COMPRESSED_PAGER_IS_ACTIVE && pass == 1) + if (pass == 1) vm_compressor_record_warmup_end(); return (1); } if (hibernate_drain_pageout_queue(&vm_pageout_queue_internal)) { - if (COMPRESSED_PAGER_IS_ACTIVE && pass == 1) + if (pass == 1) vm_compressor_record_warmup_end(); return (1); } - if (COMPRESSED_PAGER_IS_ACTIVE && pass == 1) + if (pass == 1) vm_compressor_record_warmup_end(); if (hibernate_skip_external == FALSE && hibernate_drain_pageout_queue(&vm_pageout_queue_external)) @@ -5533,6 +6213,8 @@ hibernate_flush_memory() { int retval; + assert(VM_CONFIG_COMPRESSOR_IS_PRESENT); + KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 3) | DBG_FUNC_START, vm_page_free_count, 0, 0, 0, 0); hibernate_cleaning_in_progress = TRUE; @@ -5540,14 +6222,12 @@ hibernate_flush_memory() if ((retval = hibernate_flush_dirty_pages(1)) == 0) { - if (COMPRESSED_PAGER_IS_ACTIVE) { + KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 10) | DBG_FUNC_START, VM_PAGE_COMPRESSOR_COUNT, 0, 0, 0, 0); - KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 10) | DBG_FUNC_START, VM_PAGE_COMPRESSOR_COUNT, 0, 0, 0, 0); + vm_compressor_flush(); - vm_compressor_flush(); + KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 10) | DBG_FUNC_END, VM_PAGE_COMPRESSOR_COUNT, 0, 0, 0, 0); - KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 10) | DBG_FUNC_END, VM_PAGE_COMPRESSOR_COUNT, 0, 0, 0, 0); - } if (consider_buffer_cache_collect != NULL) { unsigned int orig_wire_count; @@ -5555,7 +6235,7 @@ hibernate_flush_memory() orig_wire_count = vm_page_wire_count; (void)(*consider_buffer_cache_collect)(1); - consider_zone_gc(TRUE); + consider_zone_gc(); HIBLOG("hibernate_flush_memory: buffer_cache_gc freed up %d wired pages\n", orig_wire_count - vm_page_wire_count); @@ -5566,7 
+6246,7 @@ hibernate_flush_memory() KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 3) | DBG_FUNC_END, vm_page_free_count, hibernate_stats.hibernate_found_dirty, retval, 0, 0); - if (retval && COMPRESSED_PAGER_IS_ACTIVE) + if (retval) HIBLOG("hibernate_flush_memory() failed to finish - vm_page_compressor_count(%d)\n", VM_PAGE_COMPRESSOR_COUNT); @@ -5621,7 +6301,7 @@ hibernate_free_gobble_pages(void) m = (vm_page_t) hibernate_gobble_queue; while(m) { - next = (vm_page_t) m->pageq.next; + next = m->snext; vm_page_free(m); count++; m = next; @@ -5644,12 +6324,13 @@ hibernate_consider_discard(vm_page_t m, boolean_t preflight) if (m->private) panic("hibernate_consider_discard: private"); - if (!vm_object_lock_try(m->object)) { + object = VM_PAGE_OBJECT(m); + + if (!vm_object_lock_try(object)) { + object = NULL; if (!preflight) hibernate_stats.cd_lock_failed++; break; } - object = m->object; - if (VM_PAGE_WIRED(m)) { if (!preflight) hibernate_stats.cd_found_wired++; break; @@ -5682,7 +6363,7 @@ hibernate_consider_discard(vm_page_t m, boolean_t preflight) } if (!m->dirty) { - refmod_state = pmap_get_refmod(m->phys_page); + refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m)); if (refmod_state & VM_MEM_REFERENCED) m->reference = TRUE; @@ -5725,15 +6406,18 @@ hibernate_consider_discard(vm_page_t m, boolean_t preflight) static void hibernate_discard_page(vm_page_t m) { + vm_object_t m_object; + if (m->absent || m->unusual || m->error) /* * If it's unusual in anyway, ignore */ return; + m_object = VM_PAGE_OBJECT(m); + #if MACH_ASSERT || DEBUG - vm_object_t object = m->object; - if (!vm_object_lock_try(m->object)) + if (!vm_object_lock_try(m_object)) panic("hibernate_discard_page(%p) !vm_object_lock_try", m); #else /* No need to lock page queue for token delete, hibernate_vm_unlock() @@ -5742,7 +6426,7 @@ hibernate_discard_page(vm_page_t m) if (m->pmapped == TRUE) { - __unused int refmod_state = pmap_disconnect(m->phys_page); + __unused int refmod_state = 
pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)); } if (m->laundry) @@ -5752,16 +6436,17 @@ hibernate_discard_page(vm_page_t m) if (m->fictitious) panic("hibernate_discard_page(%p) fictitious", m); - if (VM_PURGABLE_VOLATILE == m->object->purgable) + if (VM_PURGABLE_VOLATILE == m_object->purgable) { /* object should be on a queue */ - assert((m->object->objq.next != NULL) && (m->object->objq.prev != NULL)); - purgeable_q_t old_queue = vm_purgeable_object_remove(m->object); + assert((m_object->objq.next != NULL) && (m_object->objq.prev != NULL)); + purgeable_q_t old_queue = vm_purgeable_object_remove(m_object); assert(old_queue); - if (m->object->purgeable_when_ripe) { + if (m_object->purgeable_when_ripe) { vm_purgeable_token_delete_first(old_queue); } - m->object->purgable = VM_PURGABLE_EMPTY; + vm_object_lock_assert_exclusive(m_object); + m_object->purgable = VM_PURGABLE_EMPTY; /* * Purgeable ledgers: pages of VOLATILE and EMPTY objects are @@ -5770,8 +6455,8 @@ hibernate_discard_page(vm_page_t m) * effectively purging this object. 
*/ unsigned int delta; - assert(m->object->resident_page_count >= m->object->wired_page_count); - delta = (m->object->resident_page_count - m->object->wired_page_count); + assert(m_object->resident_page_count >= m_object->wired_page_count); + delta = (m_object->resident_page_count - m_object->wired_page_count); assert(vm_page_purgeable_count >= delta); assert(delta > 0); OSAddAtomic(-delta, (SInt32 *)&vm_page_purgeable_count); @@ -5780,7 +6465,7 @@ hibernate_discard_page(vm_page_t m) vm_page_free(m); #if MACH_ASSERT || DEBUG - vm_object_unlock(object); + vm_object_unlock(m_object); #endif /* MACH_ASSERT || DEBUG */ } @@ -5910,22 +6595,24 @@ hibernate_page_list_setall(hibernate_page_list_t * page_list, pages--; count_wire--; if (!preflight) { - hibernate_page_bitset(page_list, TRUE, m->phys_page); - hibernate_page_bitset(page_list_wired, TRUE, m->phys_page); + hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m)); + hibernate_page_bitset(page_list_wired, TRUE, VM_PAGE_GET_PHYS_PAGE(m)); } - m = (vm_page_t) m->pageq.next; + m = m->snext; } if (!preflight) for( i = 0; i < real_ncpus; i++ ) { if (cpu_data_ptr[i] && cpu_data_ptr[i]->cpu_processor) { - for (m = PROCESSOR_DATA(cpu_data_ptr[i]->cpu_processor, free_pages); m; m = (vm_page_t)m->pageq.next) + for (m = PROCESSOR_DATA(cpu_data_ptr[i]->cpu_processor, free_pages); m; m = m->snext) { + assert(m->vm_page_q_state == VM_PAGE_ON_FREE_LOCAL_Q); + pages--; count_wire--; - hibernate_page_bitset(page_list, TRUE, m->phys_page); - hibernate_page_bitset(page_list_wired, TRUE, m->phys_page); + hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m)); + hibernate_page_bitset(page_list_wired, TRUE, VM_PAGE_GET_PHYS_PAGE(m)); hibernate_stats.cd_local_free++; hibernate_stats.cd_total_free++; @@ -5935,67 +6622,75 @@ hibernate_page_list_setall(hibernate_page_list_t * page_list, for( i = 0; i < vm_colors; i++ ) { - queue_iterate(&vm_page_queue_free[i], - m, - vm_page_t, - pageq) + 
vm_page_queue_iterate(&vm_page_queue_free[i].qhead, + m, + vm_page_t, + pageq) { + assert(m->vm_page_q_state == VM_PAGE_ON_FREE_Q); + pages--; count_wire--; if (!preflight) { - hibernate_page_bitset(page_list, TRUE, m->phys_page); - hibernate_page_bitset(page_list_wired, TRUE, m->phys_page); + hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m)); + hibernate_page_bitset(page_list_wired, TRUE, VM_PAGE_GET_PHYS_PAGE(m)); hibernate_stats.cd_total_free++; } } } - queue_iterate(&vm_lopage_queue_free, - m, - vm_page_t, - pageq) + vm_page_queue_iterate(&vm_lopage_queue_free, + m, + vm_page_t, + pageq) { + assert(m->vm_page_q_state == VM_PAGE_ON_FREE_LOPAGE_Q); + pages--; count_wire--; if (!preflight) { - hibernate_page_bitset(page_list, TRUE, m->phys_page); - hibernate_page_bitset(page_list_wired, TRUE, m->phys_page); + hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m)); + hibernate_page_bitset(page_list_wired, TRUE, VM_PAGE_GET_PHYS_PAGE(m)); hibernate_stats.cd_total_free++; } } - m = (vm_page_t) queue_first(&vm_page_queue_throttled); - while (m && !queue_end(&vm_page_queue_throttled, (queue_entry_t)m)) + m = (vm_page_t) vm_page_queue_first(&vm_page_queue_throttled); + while (m && !vm_page_queue_end(&vm_page_queue_throttled, (vm_page_queue_entry_t)m)) { - next = (vm_page_t) m->pageq.next; + assert(m->vm_page_q_state == VM_PAGE_ON_THROTTLED_Q); + + next = (vm_page_t)VM_PAGE_UNPACK_PTR(m->pageq.next); discard = FALSE; if ((kIOHibernateModeDiscardCleanInactive & gIOHibernateMode) && hibernate_consider_discard(m, preflight)) { - if (!preflight) hibernate_page_bitset(page_list, TRUE, m->phys_page); + if (!preflight) hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m)); count_discard_inactive++; discard = discard_all; } else count_throttled++; count_wire--; - if (!preflight) hibernate_page_bitset(page_list_wired, TRUE, m->phys_page); + if (!preflight) hibernate_page_bitset(page_list_wired, TRUE, VM_PAGE_GET_PHYS_PAGE(m)); if (discard) 
hibernate_discard_page(m); m = next; } - m = (vm_page_t) queue_first(&vm_page_queue_anonymous); - while (m && !queue_end(&vm_page_queue_anonymous, (queue_entry_t)m)) + m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous); + while (m && !vm_page_queue_end(&vm_page_queue_anonymous, (vm_page_queue_entry_t)m)) { - next = (vm_page_t) m->pageq.next; + assert(m->vm_page_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q); + + next = (vm_page_t)VM_PAGE_UNPACK_PTR(m->pageq.next); discard = FALSE; if ((kIOHibernateModeDiscardCleanInactive & gIOHibernateMode) && hibernate_consider_discard(m, preflight)) { - if (!preflight) hibernate_page_bitset(page_list, TRUE, m->phys_page); + if (!preflight) hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m)); if (m->dirty) count_discard_purgeable++; else @@ -6005,20 +6700,22 @@ hibernate_page_list_setall(hibernate_page_list_t * page_list, else count_anonymous++; count_wire--; - if (!preflight) hibernate_page_bitset(page_list_wired, TRUE, m->phys_page); + if (!preflight) hibernate_page_bitset(page_list_wired, TRUE, VM_PAGE_GET_PHYS_PAGE(m)); if (discard) hibernate_discard_page(m); m = next; } - m = (vm_page_t) queue_first(&vm_page_queue_cleaned); - while (m && !queue_end(&vm_page_queue_cleaned, (queue_entry_t)m)) + m = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned); + while (m && !vm_page_queue_end(&vm_page_queue_cleaned, (vm_page_queue_entry_t)m)) { - next = (vm_page_t) m->pageq.next; + assert(m->vm_page_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q); + + next = (vm_page_t)VM_PAGE_UNPACK_PTR(m->pageq.next); discard = FALSE; if ((kIOHibernateModeDiscardCleanInactive & gIOHibernateMode) && hibernate_consider_discard(m, preflight)) { - if (!preflight) hibernate_page_bitset(page_list, TRUE, m->phys_page); + if (!preflight) hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m)); if (m->dirty) count_discard_purgeable++; else @@ -6028,20 +6725,22 @@ hibernate_page_list_setall(hibernate_page_list_t * page_list, else 
count_cleaned++; count_wire--; - if (!preflight) hibernate_page_bitset(page_list_wired, TRUE, m->phys_page); + if (!preflight) hibernate_page_bitset(page_list_wired, TRUE, VM_PAGE_GET_PHYS_PAGE(m)); if (discard) hibernate_discard_page(m); m = next; } - m = (vm_page_t) queue_first(&vm_page_queue_active); - while (m && !queue_end(&vm_page_queue_active, (queue_entry_t)m)) + m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active); + while (m && !vm_page_queue_end(&vm_page_queue_active, (vm_page_queue_entry_t)m)) { - next = (vm_page_t) m->pageq.next; + assert(m->vm_page_q_state == VM_PAGE_ON_ACTIVE_Q); + + next = (vm_page_t)VM_PAGE_UNPACK_PTR(m->pageq.next); discard = FALSE; if ((kIOHibernateModeDiscardCleanActive & gIOHibernateMode) && hibernate_consider_discard(m, preflight)) { - if (!preflight) hibernate_page_bitset(page_list, TRUE, m->phys_page); + if (!preflight) hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m)); if (m->dirty) count_discard_purgeable++; else @@ -6051,20 +6750,22 @@ hibernate_page_list_setall(hibernate_page_list_t * page_list, else count_active++; count_wire--; - if (!preflight) hibernate_page_bitset(page_list_wired, TRUE, m->phys_page); + if (!preflight) hibernate_page_bitset(page_list_wired, TRUE, VM_PAGE_GET_PHYS_PAGE(m)); if (discard) hibernate_discard_page(m); m = next; } - m = (vm_page_t) queue_first(&vm_page_queue_inactive); - while (m && !queue_end(&vm_page_queue_inactive, (queue_entry_t)m)) + m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive); + while (m && !vm_page_queue_end(&vm_page_queue_inactive, (vm_page_queue_entry_t)m)) { - next = (vm_page_t) m->pageq.next; + assert(m->vm_page_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q); + + next = (vm_page_t)VM_PAGE_UNPACK_PTR(m->pageq.next); discard = FALSE; if ((kIOHibernateModeDiscardCleanInactive & gIOHibernateMode) && hibernate_consider_discard(m, preflight)) { - if (!preflight) hibernate_page_bitset(page_list, TRUE, m->phys_page); + if (!preflight) 
hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m)); if (m->dirty) count_discard_purgeable++; else @@ -6074,39 +6775,44 @@ hibernate_page_list_setall(hibernate_page_list_t * page_list, else count_inactive++; count_wire--; - if (!preflight) hibernate_page_bitset(page_list_wired, TRUE, m->phys_page); + if (!preflight) hibernate_page_bitset(page_list_wired, TRUE, VM_PAGE_GET_PHYS_PAGE(m)); if (discard) hibernate_discard_page(m); m = next; } + /* XXX FBDP TODO: secluded queue */ for( i = 0; i <= VM_PAGE_MAX_SPECULATIVE_AGE_Q; i++ ) { - m = (vm_page_t) queue_first(&vm_page_queue_speculative[i].age_q); - while (m && !queue_end(&vm_page_queue_speculative[i].age_q, (queue_entry_t)m)) + m = (vm_page_t) vm_page_queue_first(&vm_page_queue_speculative[i].age_q); + while (m && !vm_page_queue_end(&vm_page_queue_speculative[i].age_q, (vm_page_queue_entry_t)m)) { - next = (vm_page_t) m->pageq.next; + assert(m->vm_page_q_state == VM_PAGE_ON_SPECULATIVE_Q); + + next = (vm_page_t)VM_PAGE_UNPACK_PTR(m->pageq.next); discard = FALSE; if ((kIOHibernateModeDiscardCleanInactive & gIOHibernateMode) && hibernate_consider_discard(m, preflight)) { - if (!preflight) hibernate_page_bitset(page_list, TRUE, m->phys_page); + if (!preflight) hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m)); count_discard_speculative++; discard = discard_all; } else count_speculative++; count_wire--; - if (!preflight) hibernate_page_bitset(page_list_wired, TRUE, m->phys_page); + if (!preflight) hibernate_page_bitset(page_list_wired, TRUE, VM_PAGE_GET_PHYS_PAGE(m)); if (discard) hibernate_discard_page(m); m = next; } } - queue_iterate(&compressor_object->memq, m, vm_page_t, listq) + vm_page_queue_iterate(&compressor_object->memq, m, vm_page_t, listq) { + assert(m->vm_page_q_state == VM_PAGE_USED_BY_COMPRESSOR); + count_compressor++; count_wire--; - if (!preflight) hibernate_page_bitset(page_list_wired, TRUE, m->phys_page); + if (!preflight) hibernate_page_bitset(page_list_wired, TRUE, 
VM_PAGE_GET_PHYS_PAGE(m)); } if (preflight == FALSE && discard_all == TRUE) { @@ -6212,11 +6918,13 @@ hibernate_page_list_discard(hibernate_page_list_t * page_list) clock_get_uptime(&start); - m = (vm_page_t) queue_first(&vm_page_queue_anonymous); - while (m && !queue_end(&vm_page_queue_anonymous, (queue_entry_t)m)) + m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous); + while (m && !vm_page_queue_end(&vm_page_queue_anonymous, (vm_page_queue_entry_t)m)) { - next = (vm_page_t) m->pageq.next; - if (hibernate_page_bittst(page_list, m->phys_page)) + assert(m->vm_page_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q); + + next = (vm_page_t) VM_PAGE_UNPACK_PTR(m->pageq.next); + if (hibernate_page_bittst(page_list, VM_PAGE_GET_PHYS_PAGE(m))) { if (m->dirty) count_discard_purgeable++; @@ -6229,11 +6937,13 @@ hibernate_page_list_discard(hibernate_page_list_t * page_list) for( i = 0; i <= VM_PAGE_MAX_SPECULATIVE_AGE_Q; i++ ) { - m = (vm_page_t) queue_first(&vm_page_queue_speculative[i].age_q); - while (m && !queue_end(&vm_page_queue_speculative[i].age_q, (queue_entry_t)m)) + m = (vm_page_t) vm_page_queue_first(&vm_page_queue_speculative[i].age_q); + while (m && !vm_page_queue_end(&vm_page_queue_speculative[i].age_q, (vm_page_queue_entry_t)m)) { - next = (vm_page_t) m->pageq.next; - if (hibernate_page_bittst(page_list, m->phys_page)) + assert(m->vm_page_q_state == VM_PAGE_ON_SPECULATIVE_Q); + + next = (vm_page_t) VM_PAGE_UNPACK_PTR(m->pageq.next); + if (hibernate_page_bittst(page_list, VM_PAGE_GET_PHYS_PAGE(m))) { count_discard_speculative++; hibernate_discard_page(m); @@ -6242,11 +6952,13 @@ hibernate_page_list_discard(hibernate_page_list_t * page_list) } } - m = (vm_page_t) queue_first(&vm_page_queue_inactive); - while (m && !queue_end(&vm_page_queue_inactive, (queue_entry_t)m)) + m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive); + while (m && !vm_page_queue_end(&vm_page_queue_inactive, (vm_page_queue_entry_t)m)) { - next = (vm_page_t) m->pageq.next; - if 
(hibernate_page_bittst(page_list, m->phys_page)) + assert(m->vm_page_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q); + + next = (vm_page_t) VM_PAGE_UNPACK_PTR(m->pageq.next); + if (hibernate_page_bittst(page_list, VM_PAGE_GET_PHYS_PAGE(m))) { if (m->dirty) count_discard_purgeable++; @@ -6256,12 +6968,15 @@ hibernate_page_list_discard(hibernate_page_list_t * page_list) } m = next; } + /* XXX FBDP TODO: secluded queue */ - m = (vm_page_t) queue_first(&vm_page_queue_active); - while (m && !queue_end(&vm_page_queue_active, (queue_entry_t)m)) + m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active); + while (m && !vm_page_queue_end(&vm_page_queue_active, (vm_page_queue_entry_t)m)) { - next = (vm_page_t) m->pageq.next; - if (hibernate_page_bittst(page_list, m->phys_page)) + assert(m->vm_page_q_state == VM_PAGE_ON_ACTIVE_Q); + + next = (vm_page_t) VM_PAGE_UNPACK_PTR(m->pageq.next); + if (hibernate_page_bittst(page_list, VM_PAGE_GET_PHYS_PAGE(m))) { if (m->dirty) count_discard_purgeable++; @@ -6272,11 +6987,13 @@ hibernate_page_list_discard(hibernate_page_list_t * page_list) m = next; } - m = (vm_page_t) queue_first(&vm_page_queue_cleaned); - while (m && !queue_end(&vm_page_queue_cleaned, (queue_entry_t)m)) + m = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned); + while (m && !vm_page_queue_end(&vm_page_queue_cleaned, (vm_page_queue_entry_t)m)) { - next = (vm_page_t) m->pageq.next; - if (hibernate_page_bittst(page_list, m->phys_page)) + assert(m->vm_page_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q); + + next = (vm_page_t) VM_PAGE_UNPACK_PTR(m->pageq.next); + if (hibernate_page_bittst(page_list, VM_PAGE_GET_PHYS_PAGE(m))) { if (m->dirty) count_discard_purgeable++; @@ -6341,7 +7058,7 @@ hibernate_create_paddr_map() if (ppnm) ppnm->ppnm_eindx = i; - if (ppnm == NULL || vm_pages[i].phys_page != next_ppnum_in_run) { + if (ppnm == NULL || VM_PAGE_GET_PHYS_PAGE(&vm_pages[i]) != next_ppnum_in_run) { ppnm = kalloc(sizeof(struct ppnum_mapping)); @@ -6349,9 +7066,9 @@ 
hibernate_create_paddr_map() ppnm_head = ppnm; ppnm->ppnm_sindx = i; - ppnm->ppnm_base_paddr = vm_pages[i].phys_page; + ppnm->ppnm_base_paddr = VM_PAGE_GET_PHYS_PAGE(&vm_pages[i]); } - next_ppnum_in_run = vm_pages[i].phys_page + 1; + next_ppnum_in_run = VM_PAGE_GET_PHYS_PAGE(&vm_pages[i]) + 1; } ppnm->ppnm_eindx++; @@ -6416,15 +7133,18 @@ hibernate_hash_insert_page(vm_page_t mem) { vm_page_bucket_t *bucket; int hash_id; + vm_object_t m_object; + + m_object = VM_PAGE_OBJECT(mem); assert(mem->hashed); - assert(mem->object); + assert(m_object); assert(mem->offset != (vm_object_offset_t) -1); /* * Insert it into the object_object/offset hash table */ - hash_id = vm_page_hash(mem->object, mem->offset); + hash_id = vm_page_hash(m_object, mem->offset); bucket = &vm_page_buckets[hash_id]; mem->next_m = bucket->page_list; @@ -6444,13 +7164,13 @@ hibernate_free_range(int sindx, int eindx) vm_page_init(mem, hibernate_lookup_paddr(sindx), FALSE); mem->lopage = FALSE; - mem->free = TRUE; + mem->vm_page_q_state = VM_PAGE_ON_FREE_Q; - color = mem->phys_page & vm_color_mask; - queue_enter_first(&vm_page_queue_free[color], - mem, - vm_page_t, - pageq); + color = VM_PAGE_GET_PHYS_PAGE(mem) & vm_color_mask; + vm_page_queue_enter_first(&vm_page_queue_free[color].qhead, + mem, + vm_page_t, + pageq); vm_page_free_count++; sindx++; @@ -6488,7 +7208,7 @@ hibernate_rebuild_vm_structs(void) * hibernate_teardown_vm_structs leaves the location where * this vm_page_t must be located in "next". */ - tmem = VM_PAGE_UNPACK_PTR(mem->next_m); + tmem = (vm_page_t)(VM_PAGE_UNPACK_PTR(mem->next_m)); mem->next_m = VM_PAGE_PACK_PTR(NULL); sindx = (int)(tmem - &vm_pages[0]); @@ -6523,9 +7243,9 @@ hibernate_rebuild_vm_structs(void) * vm_page_t's that were created on the fly (i.e. 
fictitious) */ for (mem = hibernate_rebuild_hash_list; mem; mem = mem_next) { - mem_next = VM_PAGE_UNPACK_PTR(mem->next_m); + mem_next = (vm_page_t)(VM_PAGE_UNPACK_PTR(mem->next_m)); - mem->next_m = VM_PAGE_PACK_PTR(NULL); + mem->next_m = 0; hibernate_hash_insert_page(mem); } hibernate_rebuild_hash_list = NULL; @@ -6570,10 +7290,10 @@ hibernate_teardown_vm_structs(hibernate_page_list_t *page_list, hibernate_page_l bucket = &vm_page_buckets[i]; - for (mem = VM_PAGE_UNPACK_PTR(bucket->page_list); mem != VM_PAGE_NULL; mem = mem_next) { + for (mem = (vm_page_t)(VM_PAGE_UNPACK_PTR(bucket->page_list)); mem != VM_PAGE_NULL; mem = mem_next) { assert(mem->hashed); - mem_next = VM_PAGE_UNPACK_PTR(mem->next_m); + mem_next = (vm_page_t)(VM_PAGE_UNPACK_PTR(mem->next_m)); if (mem < &vm_pages[0] || mem >= &vm_pages[vm_pages_count]) { mem->next_m = VM_PAGE_PACK_PTR(hibernate_rebuild_hash_list); @@ -6592,26 +7312,26 @@ hibernate_teardown_vm_structs(hibernate_page_list_t *page_list, hibernate_page_l mem = &vm_pages[i]; - if (mem->free) { + if (mem->vm_page_q_state == VM_PAGE_ON_FREE_Q) { unsigned int color; assert(mem->busy); assert(!mem->lopage); - color = mem->phys_page & vm_color_mask; + color = VM_PAGE_GET_PHYS_PAGE(mem) & vm_color_mask; + + vm_page_queue_remove(&vm_page_queue_free[color].qhead, + mem, + vm_page_t, + pageq); - queue_remove(&vm_page_queue_free[color], - mem, - vm_page_t, - pageq); - mem->pageq.next = NULL; - mem->pageq.prev = NULL; + VM_PAGE_ZERO_PAGEQ_ENTRY(mem); vm_page_free_count--; hibernate_teardown_found_free_pages++; - if ( !vm_pages[compact_target_indx].free) + if (vm_pages[compact_target_indx].vm_page_q_state != VM_PAGE_ON_FREE_Q) compact_target_indx = i; } else { /* @@ -6622,13 +7342,13 @@ hibernate_teardown_vm_structs(hibernate_page_list_t *page_list, hibernate_page_l */ mem->next_m = VM_PAGE_PACK_PTR(mem); - if (vm_pages[compact_target_indx].free) { + if (vm_pages[compact_target_indx].vm_page_q_state == VM_PAGE_ON_FREE_Q) { /* * we've got a hole to 
fill, so * move this vm_page_t to it's new home */ vm_pages[compact_target_indx] = *mem; - mem->free = TRUE; + mem->vm_page_q_state = VM_PAGE_ON_FREE_Q; hibernate_teardown_last_valid_compact_indx = compact_target_indx; compact_target_indx++; @@ -6694,7 +7414,9 @@ vm_page_info( bucket_lock = &vm_page_bucket_locks[i / BUCKETS_PER_LOCK]; lck_spin_lock(bucket_lock); - for (m = VM_PAGE_UNPACK_PTR(bucket->page_list); m != VM_PAGE_NULL; m = VM_PAGE_UNPACK_PTR(m->next_m)) + for (m = (vm_page_t)(VM_PAGE_UNPACK_PTR(bucket->page_list)); + m != VM_PAGE_NULL; + m = (vm_page_t)(VM_PAGE_UNPACK_PTR(m->next_m))) bucket_count++; lck_spin_unlock(bucket_lock); @@ -6747,6 +7469,8 @@ vm_page_buckets_check(void) #endif /* VM_PAGE_FAKE_BUCKETS */ for (i = 0; i < vm_page_bucket_count; i++) { + vm_object_t p_object; + bucket = &vm_page_buckets[i]; if (!bucket->page_list) { continue; @@ -6754,24 +7478,27 @@ vm_page_buckets_check(void) bucket_lock = &vm_page_bucket_locks[i / BUCKETS_PER_LOCK]; lck_spin_lock(bucket_lock); - p = VM_PAGE_UNPACK_PTR(bucket->page_list); + p = (vm_page_t)(VM_PAGE_UNPACK_PTR(bucket->page_list)); + while (p != VM_PAGE_NULL) { + p_object = VM_PAGE_OBJECT(p); + if (!p->hashed) { panic("BUCKET_CHECK: page %p (%p,0x%llx) " "hash %d in bucket %d at %p " "is not hashed\n", - p, p->object, p->offset, + p, p_object, p->offset, p_hash, i, bucket); } - p_hash = vm_page_hash(p->object, p->offset); + p_hash = vm_page_hash(p_object, p->offset); if (p_hash != i) { panic("BUCKET_CHECK: corruption in bucket %d " "at %p: page %p object %p offset 0x%llx " "hash %d\n", - i, bucket, p, p->object, p->offset, + i, bucket, p, p_object, p->offset, p_hash); } - p = VM_PAGE_UNPACK_PTR(p->next_m); + p = (vm_page_t)(VM_PAGE_UNPACK_PTR(p->next_m)); } lck_spin_unlock(bucket_lock); } @@ -6795,120 +7522,191 @@ vm_page_buckets_check(void) * this is why its safe to utilze the wire_count field in the vm_page_t as the local_id... * 'wired' and local are ALWAYS mutually exclusive conditions. 
*/ + +#if CONFIG_BACKGROUND_QUEUE +void +vm_page_queues_remove(vm_page_t mem, boolean_t remove_from_backgroundq) +#else void -vm_page_queues_remove(vm_page_t mem) +vm_page_queues_remove(vm_page_t mem, boolean_t __unused remove_from_backgroundq) +#endif { - boolean_t was_pageable; + boolean_t was_pageable = TRUE; + vm_object_t m_object; - VM_PAGE_QUEUES_ASSERT(mem, 1); - assert(!mem->pageout_queue); - /* - * if (mem->pageout_queue) - * NOTE: vm_page_queues_remove does not deal with removing pages from the pageout queue... - * the caller is responsible for determing if the page is on that queue, and if so, must - * either first remove it (it needs both the page queues lock and the object lock to do - * this via vm_pageout_steal_laundry), or avoid the call to vm_page_queues_remove - */ - if (mem->local) { + m_object = VM_PAGE_OBJECT(mem); + + LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); + + if (mem->vm_page_q_state == VM_PAGE_NOT_ON_Q) + { + assert(mem->pageq.next == 0 && mem->pageq.prev == 0); +#if CONFIG_BACKGROUND_QUEUE + if (mem->vm_page_on_backgroundq == FALSE) { + assert(mem->vm_page_backgroundq.next == 0 && + mem->vm_page_backgroundq.prev == 0 && + mem->vm_page_on_backgroundq == FALSE); + } +#endif + return; + } + if (mem->vm_page_q_state == VM_PAGE_USED_BY_COMPRESSOR) + { + assert(mem->pageq.next == 0 && mem->pageq.prev == 0); +#if CONFIG_BACKGROUND_QUEUE + assert(mem->vm_page_backgroundq.next == 0 && + mem->vm_page_backgroundq.prev == 0 && + mem->vm_page_on_backgroundq == FALSE); +#endif + return; + } + if (mem->vm_page_q_state == VM_PAGE_IS_WIRED) { + /* + * might put these guys on a list for debugging purposes + * if we do, we'll need to remove this assert + */ + assert(mem->pageq.next == 0 && mem->pageq.prev == 0); +#if CONFIG_BACKGROUND_QUEUE + assert(mem->vm_page_backgroundq.next == 0 && + mem->vm_page_backgroundq.prev == 0 && + mem->vm_page_on_backgroundq == FALSE); +#endif + return; + } + + assert(m_object != compressor_object); + 
assert(m_object != kernel_object); + assert(m_object != vm_submap_object); + assert(!mem->fictitious); + + switch(mem->vm_page_q_state) { + + case VM_PAGE_ON_ACTIVE_LOCAL_Q: + { struct vpl *lq; - assert(mem->object != kernel_object); - assert(mem->object != compressor_object); - assert(!mem->inactive && !mem->speculative); - assert(!mem->active && !mem->throttled); - assert(!mem->clean_queue); - assert(!mem->fictitious); + lq = &vm_page_local_q[mem->local_id].vpl_un.vpl; VPL_LOCK(&lq->vpl_lock); - queue_remove(&lq->vpl_queue, - mem, vm_page_t, pageq); - mem->local = FALSE; + vm_page_queue_remove(&lq->vpl_queue, + mem, vm_page_t, pageq); mem->local_id = 0; lq->vpl_count--; - if (mem->object->internal) { + if (m_object->internal) { lq->vpl_internal_count--; } else { lq->vpl_external_count--; } VPL_UNLOCK(&lq->vpl_lock); was_pageable = FALSE; + break; } - - else if (mem->active) { - assert(mem->object != kernel_object); - assert(mem->object != compressor_object); - assert(!mem->inactive && !mem->speculative); - assert(!mem->clean_queue); - assert(!mem->throttled); - assert(!mem->fictitious); - queue_remove(&vm_page_queue_active, - mem, vm_page_t, pageq); - mem->active = FALSE; + case VM_PAGE_ON_ACTIVE_Q: + { + vm_page_queue_remove(&vm_page_queue_active, + mem, vm_page_t, pageq); vm_page_active_count--; - was_pageable = TRUE; + break; } - else if (mem->inactive) { - assert(mem->object != kernel_object); - assert(mem->object != compressor_object); - assert(!mem->active && !mem->speculative); - assert(!mem->throttled); - assert(!mem->fictitious); + case VM_PAGE_ON_INACTIVE_INTERNAL_Q: + { + assert(m_object->internal == TRUE); + vm_page_inactive_count--; - if (mem->clean_queue) { - queue_remove(&vm_page_queue_cleaned, - mem, vm_page_t, pageq); - mem->clean_queue = FALSE; - vm_page_cleaned_count--; - } else { - if (mem->object->internal) { - queue_remove(&vm_page_queue_anonymous, - mem, vm_page_t, pageq); - vm_page_anonymous_count--; - } else { - 
queue_remove(&vm_page_queue_inactive, - mem, vm_page_t, pageq); - } - vm_purgeable_q_advance_all(); - } - mem->inactive = FALSE; - was_pageable = TRUE; - } - - else if (mem->throttled) { - assert(mem->object != compressor_object); - assert(!mem->active && !mem->inactive); - assert(!mem->speculative); - assert(!mem->fictitious); - queue_remove(&vm_page_queue_throttled, - mem, vm_page_t, pageq); - mem->throttled = FALSE; + vm_page_queue_remove(&vm_page_queue_anonymous, + mem, vm_page_t, pageq); + vm_page_anonymous_count--; + vm_purgeable_q_advance_all(); + break; + } + + case VM_PAGE_ON_INACTIVE_EXTERNAL_Q: + { + assert(m_object->internal == FALSE); + + vm_page_inactive_count--; + vm_page_queue_remove(&vm_page_queue_inactive, + mem, vm_page_t, pageq); + vm_purgeable_q_advance_all(); + break; + } + + case VM_PAGE_ON_INACTIVE_CLEANED_Q: + { + assert(m_object->internal == FALSE); + + vm_page_inactive_count--; + vm_page_queue_remove(&vm_page_queue_cleaned, + mem, vm_page_t, pageq); + vm_page_cleaned_count--; + break; + } + + case VM_PAGE_ON_THROTTLED_Q: + { + assert(m_object->internal == TRUE); + + vm_page_queue_remove(&vm_page_queue_throttled, + mem, vm_page_t, pageq); vm_page_throttled_count--; was_pageable = FALSE; + break; } - else if (mem->speculative) { - assert(mem->object != compressor_object); - assert(!mem->active && !mem->inactive); - assert(!mem->throttled); - assert(!mem->fictitious); - remque(&mem->pageq); - mem->speculative = FALSE; + case VM_PAGE_ON_SPECULATIVE_Q: + { + assert(m_object->internal == FALSE); + + vm_page_remque(&mem->pageq); vm_page_speculative_count--; - was_pageable = TRUE; + break; + } + +#if CONFIG_SECLUDED_MEMORY + case VM_PAGE_ON_SECLUDED_Q: + { + vm_page_queue_remove(&vm_page_queue_secluded, + mem, vm_page_t, pageq); + vm_page_secluded_count--; + if (m_object == VM_OBJECT_NULL) { + vm_page_secluded_count_free--; + was_pageable = FALSE; + } else { + assert(!m_object->internal); + vm_page_secluded_count_inuse--; + was_pageable = FALSE; 
+// was_pageable = TRUE; + } + break; + } +#endif /* CONFIG_SECLUDED_MEMORY */ + + default: + { + /* + * if (mem->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q) + * NOTE: vm_page_queues_remove does not deal with removing pages from the pageout queue... + * the caller is responsible for determing if the page is on that queue, and if so, must + * either first remove it (it needs both the page queues lock and the object lock to do + * this via vm_pageout_steal_laundry), or avoid the call to vm_page_queues_remove + * + * we also don't expect to encounter VM_PAGE_ON_FREE_Q, VM_PAGE_ON_FREE_LOCAL_Q, VM_PAGE_ON_FREE_LOPAGE_Q + * or any of the undefined states + */ + panic("vm_page_queues_remove - bad page q_state (%p, %d)\n", mem, mem->vm_page_q_state); + break; } - else if (mem->pageq.next || mem->pageq.prev) { - was_pageable = FALSE; - panic("vm_page_queues_remove: unmarked page on Q"); - } else { - was_pageable = FALSE; } + VM_PAGE_ZERO_PAGEQ_ENTRY(mem); + mem->vm_page_q_state = VM_PAGE_NOT_ON_Q; - mem->pageq.next = NULL; - mem->pageq.prev = NULL; - VM_PAGE_QUEUES_ASSERT(mem, 0); +#if CONFIG_BACKGROUND_QUEUE + if (remove_from_backgroundq == TRUE) + vm_page_remove_from_backgroundq(mem); +#endif if (was_pageable) { - if (mem->object->internal) { + if (m_object->internal) { vm_page_pageable_internal_count--; } else { vm_page_pageable_external_count--; @@ -6919,48 +7717,117 @@ vm_page_queues_remove(vm_page_t mem) void vm_page_remove_internal(vm_page_t page) { - vm_object_t __object = page->object; + vm_object_t __object = VM_PAGE_OBJECT(page); if (page == __object->memq_hint) { vm_page_t __new_hint; - queue_entry_t __qe; - __qe = queue_next(&page->listq); - if (queue_end(&__object->memq, __qe)) { - __qe = queue_prev(&page->listq); - if (queue_end(&__object->memq, __qe)) { + vm_page_queue_entry_t __qe; + __qe = (vm_page_queue_entry_t)vm_page_queue_next(&page->listq); + if (vm_page_queue_end(&__object->memq, __qe)) { + __qe = 
(vm_page_queue_entry_t)vm_page_queue_prev(&page->listq); + if (vm_page_queue_end(&__object->memq, __qe)) { __qe = NULL; } } - __new_hint = (vm_page_t) __qe; + __new_hint = (vm_page_t)((uintptr_t) __qe); __object->memq_hint = __new_hint; } - queue_remove(&__object->memq, page, vm_page_t, listq); + vm_page_queue_remove(&__object->memq, page, vm_page_t, listq); +#if CONFIG_SECLUDED_MEMORY + if (__object->eligible_for_secluded) { + vm_page_secluded.eligible_for_secluded--; + } +#endif /* CONFIG_SECLUDED_MEMORY */ } void vm_page_enqueue_inactive(vm_page_t mem, boolean_t first) { - VM_PAGE_QUEUES_ASSERT(mem, 0); + vm_object_t m_object; + + m_object = VM_PAGE_OBJECT(mem); + + LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); assert(!mem->fictitious); assert(!mem->laundry); - assert(!mem->pageout_queue); + assert(mem->vm_page_q_state == VM_PAGE_NOT_ON_Q); vm_page_check_pageable_safe(mem); - if (mem->object->internal) { + +#if CONFIG_SECLUDED_MEMORY + if (secluded_for_filecache && + vm_page_secluded_target != 0 && + num_tasks_can_use_secluded_mem == 0 && + m_object->eligible_for_secluded && + secluded_aging_policy == SECLUDED_AGING_FIFO) { + mem->vm_page_q_state = VM_PAGE_ON_SECLUDED_Q; + vm_page_queue_enter(&vm_page_queue_secluded, mem, + vm_page_t, pageq); + vm_page_secluded_count++; + vm_page_secluded_count_inuse++; + assert(!m_object->internal); +// vm_page_pageable_external_count++; + return; + } +#endif /* CONFIG_SECLUDED_MEMORY */ + + if (m_object->internal) { + mem->vm_page_q_state = VM_PAGE_ON_INACTIVE_INTERNAL_Q; + if (first == TRUE) - queue_enter_first(&vm_page_queue_anonymous, mem, vm_page_t, pageq); + vm_page_queue_enter_first(&vm_page_queue_anonymous, mem, vm_page_t, pageq); else - queue_enter(&vm_page_queue_anonymous, mem, vm_page_t, pageq); + vm_page_queue_enter(&vm_page_queue_anonymous, mem, vm_page_t, pageq); + vm_page_anonymous_count++; vm_page_pageable_internal_count++; } else { + mem->vm_page_q_state = VM_PAGE_ON_INACTIVE_EXTERNAL_Q; + if 
(first == TRUE) - queue_enter_first(&vm_page_queue_inactive, mem, vm_page_t, pageq); + vm_page_queue_enter_first(&vm_page_queue_inactive, mem, vm_page_t, pageq); else - queue_enter(&vm_page_queue_inactive, mem, vm_page_t, pageq); + vm_page_queue_enter(&vm_page_queue_inactive, mem, vm_page_t, pageq); + vm_page_pageable_external_count++; } - mem->inactive = TRUE; vm_page_inactive_count++; token_new_pagecount++; + +#if CONFIG_BACKGROUND_QUEUE + if (mem->vm_page_in_background) + vm_page_add_to_backgroundq(mem, FALSE); +#endif +} + +void +vm_page_enqueue_active(vm_page_t mem, boolean_t first) +{ + vm_object_t m_object; + + m_object = VM_PAGE_OBJECT(mem); + + LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); + assert(!mem->fictitious); + assert(!mem->laundry); + assert(mem->vm_page_q_state == VM_PAGE_NOT_ON_Q); + vm_page_check_pageable_safe(mem); + + mem->vm_page_q_state = VM_PAGE_ON_ACTIVE_Q; + if (first == TRUE) + vm_page_queue_enter_first(&vm_page_queue_active, mem, vm_page_t, pageq); + else + vm_page_queue_enter(&vm_page_queue_active, mem, vm_page_t, pageq); + vm_page_active_count++; + + if (m_object->internal) { + vm_page_pageable_internal_count++; + } else { + vm_page_pageable_external_count++; + } + +#if CONFIG_BACKGROUND_QUEUE + if (mem->vm_page_in_background) + vm_page_add_to_backgroundq(mem, FALSE); +#endif } /* @@ -6970,17 +7837,21 @@ vm_page_enqueue_inactive(vm_page_t mem, boolean_t first) void vm_page_check_pageable_safe(vm_page_t page) { - if (page->object == kernel_object) { + vm_object_t page_object; + + page_object = VM_PAGE_OBJECT(page); + + if (page_object == kernel_object) { panic("vm_page_check_pageable_safe: trying to add page" \ "from kernel object (%p) to pageable queue", kernel_object); } - if (page->object == compressor_object) { + if (page_object == compressor_object) { panic("vm_page_check_pageable_safe: trying to add page" \ "from compressor object (%p) to pageable queue", compressor_object); } - if (page->object == vm_submap_object) 
{ + if (page_object == vm_submap_object) { panic("vm_page_check_pageable_safe: trying to add page" \ "from submap object (%p) to pageable queue", vm_submap_object); } @@ -7197,7 +8068,7 @@ vm_page_iterate_objects(mach_memory_info_t * sites, unsigned int num_sites, } static uint64_t -process_account(mach_memory_info_t * sites, unsigned int __unused num_sites) +process_account(mach_memory_info_t * sites, unsigned int __unused num_sites, uint64_t zones_collectable_bytes) { uint64_t found; unsigned int idx; @@ -7212,9 +8083,12 @@ process_account(mach_memory_info_t * sites, unsigned int __unused num_sites) { sites[idx].site = idx; sites[idx].flags |= VM_KERN_SITE_TAG; - if (VM_KERN_MEMORY_ZONE == idx) sites[idx].flags |= VM_KERN_SITE_HIDE; - else sites[idx].flags |= VM_KERN_SITE_WIRED; - continue; + if (VM_KERN_MEMORY_ZONE == idx) + { + sites[idx].flags |= VM_KERN_SITE_HIDE; + sites[idx].collectable_bytes = zones_collectable_bytes; + } else sites[idx].flags |= VM_KERN_SITE_WIRED; + continue; } lck_spin_lock(&vm_allocation_sites_lock); if ((site = vm_allocation_sites[idx])) @@ -7224,7 +8098,7 @@ process_account(mach_memory_info_t * sites, unsigned int __unused num_sites) sites[idx].flags |= VM_KERN_SITE_WIRED; if (VM_TAG_KMOD == (VM_KERN_SITE_TYPE & site->flags)) { - sites[idx].site = OSKextGetKmodIDForSite(site); + sites[idx].site = OSKextGetKmodIDForSite(site, NULL, 0); sites[idx].flags |= VM_KERN_SITE_KMOD; } else @@ -7251,11 +8125,12 @@ process_account(mach_memory_info_t * sites, unsigned int __unused num_sites) lck_spin_unlock(&vm_allocation_sites_lock); if (site) OSKextFreeSite(site); } + return (found); } kern_return_t -vm_page_diagnose(mach_memory_info_t * sites, unsigned int num_sites) +vm_page_diagnose(mach_memory_info_t * sites, unsigned int num_sites, uint64_t zones_collectable_bytes) { enum { kMaxKernelDepth = 1 }; vm_map_t maps [kMaxKernelDepth]; @@ -7272,6 +8147,8 @@ vm_page_diagnose(mach_memory_info_t * sites, unsigned int num_sites) bzero(sites, 
num_sites * sizeof(mach_memory_info_t)); + if (!vm_page_wire_count_initial) return (KERN_ABORTED); + vm_page_iterate_objects(sites, num_sites, &vm_page_count_object); wired_size = ptoa_64(vm_page_wire_count + vm_lopage_free_count + vm_page_throttled_count); @@ -7345,7 +8222,7 @@ vm_page_diagnose(mach_memory_info_t * sites, unsigned int num_sites) sites[VME_ALIAS(entry)].size += ptoa_64(count); } } - if (entry == vm_map_last_entry(map)) + while (map && (entry == vm_map_last_entry(map))) { vm_map_unlock(map); if (!stackIdx) map = NULL; @@ -7359,7 +8236,118 @@ vm_page_diagnose(mach_memory_info_t * sites, unsigned int num_sites) } } - process_account(sites, num_sites); + process_account(sites, num_sites, zones_collectable_bytes); return (KERN_SUCCESS); } + +uint32_t +vm_tag_get_kext(vm_tag_t tag, char * name, vm_size_t namelen) +{ + vm_allocation_site_t * site; + uint32_t kmodId; + + kmodId = 0; + lck_spin_lock(&vm_allocation_sites_lock); + if ((site = vm_allocation_sites[tag])) + { + if (VM_TAG_KMOD == (VM_KERN_SITE_TYPE & site->flags)) + { + kmodId = OSKextGetKmodIDForSite(site, name, namelen); + } + } + lck_spin_unlock(&vm_allocation_sites_lock); + + return (kmodId); +} + +#if DEBUG || DEVELOPMENT + +#define vm_tag_set_lock(set) lck_spin_lock(&set->lock) +#define vm_tag_set_unlock(set) lck_spin_unlock(&set->lock) + +void +vm_tag_set_init(vm_tag_set_t set, uint32_t count) +{ + lck_spin_init(&set->lock, &vm_page_lck_grp_bucket, &vm_page_lck_attr); + bzero(&set->entries, count * sizeof(struct vm_tag_set_entry)); +} + +kern_return_t +vm_tag_set_enter(vm_tag_set_t set, uint32_t count, vm_tag_t tag) +{ + kern_return_t kr; + uint32_t idx, free; + + vm_tag_set_lock(set); + + assert(tag != VM_KERN_MEMORY_NONE); + + kr = KERN_NO_SPACE; + free = -1U; + for (idx = 0; idx < count; idx++) + { + if (tag == set->entries[idx].tag) + { + set->entries[idx].count++; + kr = KERN_SUCCESS; + break; + } + if ((free == -1U) && !set->entries[idx].count) free = idx; + } + + if ((KERN_SUCCESS 
!= kr) && (free != -1U)) + { + set->entries[free].tag = tag; + set->entries[free].count = 1; + kr = KERN_SUCCESS; + } + + vm_tag_set_unlock(set); + + return (kr); +} + +kern_return_t +vm_tag_set_remove(vm_tag_set_t set, uint32_t count, vm_tag_t tag, vm_tag_t * new_tagp) +{ + kern_return_t kr; + uint32_t idx; + vm_tag_t new_tag; + + assert(tag != VM_KERN_MEMORY_NONE); + new_tag = VM_KERN_MEMORY_NONE; + vm_tag_set_lock(set); + + kr = KERN_NOT_IN_SET; + for (idx = 0; idx < count; idx++) + { + if ((tag != VM_KERN_MEMORY_NONE) + && (tag == set->entries[idx].tag) + && set->entries[idx].count) + { + set->entries[idx].count--; + kr = KERN_SUCCESS; + if (set->entries[idx].count) + { + new_tag = tag; + break; + } + if (!new_tagp) break; + tag = VM_KERN_MEMORY_NONE; + } + + if (set->entries[idx].count && (VM_KERN_MEMORY_NONE == new_tag)) + { + new_tag = set->entries[idx].tag; + if (VM_KERN_MEMORY_NONE == tag) break; + } + } + + vm_tag_set_unlock(set); + if (new_tagp) *new_tagp = new_tag; + + return (kr); +} + +#endif /* DEBUG || DEVELOPMENT */ diff --git a/osfmk/vm/vm_shared_region.c b/osfmk/vm/vm_shared_region.c index c9601f7f6..e984f2b00 100644 --- a/osfmk/vm/vm_shared_region.c +++ b/osfmk/vm/vm_shared_region.c @@ -457,9 +457,7 @@ static void vm_shared_region_reference_locked( vm_shared_region_t shared_region) { -#if DEBUG - lck_mtx_assert(&vm_shared_region_lock, LCK_MTX_ASSERT_OWNED); -#endif + LCK_MTX_ASSERT(&vm_shared_region_lock, LCK_MTX_ASSERT_OWNED); SHARED_REGION_TRACE_DEBUG( ("shared_region: -> reference_locked(%p)\n", @@ -714,6 +712,9 @@ vm_shared_region_create( goto done; } + assert(!sub_map->disable_vmentry_reuse); + sub_map->is_nested_map = TRUE; + /* make the memory entry point to the VM sub map */ mem_entry->is_sub_map = TRUE; mem_entry->backing.map = sub_map; @@ -1042,6 +1043,7 @@ vm_shared_region_map_file( vm_object_size_t obj_size; struct shared_file_mapping_np *mapping_to_slide = NULL; mach_vm_offset_t first_mapping = (mach_vm_offset_t) -1; + 
vm_map_offset_t lowest_unnestable_addr = 0; @@ -1184,6 +1186,7 @@ vm_shared_region_map_file( mappings[i].sfm_init_prot & VM_PROT_ALL, mappings[i].sfm_max_prot & VM_PROT_ALL, VM_INHERIT_DEFAULT); + } if (kr == KERN_SUCCESS) { @@ -1196,6 +1199,18 @@ vm_shared_region_map_file( if (first_mapping == (mach_vm_offset_t) -1) { first_mapping = target_address; } + + /* + * Record the lowest writable address in this + * sub map, to log any unexpected unnesting below + * that address (see log_unnest_badness()). + */ + if ((mappings[i].sfm_init_prot & VM_PROT_WRITE) && + sr_map->is_nested_map && + (lowest_unnestable_addr == 0 || + (target_address < lowest_unnestable_addr))) { + lowest_unnestable_addr = target_address; + } } else { if (map_port == MACH_PORT_NULL) { /* @@ -1253,7 +1268,7 @@ vm_shared_region_map_file( } if (kr == KERN_SUCCESS && - slide && + slide_size != 0 && mapping_to_slide != NULL) { kr = vm_shared_region_slide(slide, mapping_to_slide->sfm_file_offset, @@ -1277,6 +1292,18 @@ vm_shared_region_map_file( } } + if (kr == KERN_SUCCESS) { + /* adjust the map's "lowest_unnestable_start" */ + lowest_unnestable_addr &= ~(pmap_nesting_size_min-1); + if (lowest_unnestable_addr != + sr_map->lowest_unnestable_start) { + vm_map_lock(sr_map); + sr_map->lowest_unnestable_start = + lowest_unnestable_addr; + vm_map_unlock(sr_map); + } + } + vm_shared_region_lock(); assert(shared_region->sr_ref_count > 1); assert(shared_region->sr_mapping_in_progress); @@ -1309,6 +1336,7 @@ kern_return_t vm_shared_region_enter( struct _vm_map *map, struct task *task, + boolean_t is_64bit, void *fsroot, cpu_type_t cpu) { @@ -1319,9 +1347,7 @@ vm_shared_region_enter( vm_map_offset_t sr_pmap_nesting_start; vm_map_size_t sr_pmap_nesting_size; ipc_port_t sr_handle; - boolean_t is_64bit; - - is_64bit = task_has_64BitAddr(task); + vm_prot_t cur_prot, max_prot; SHARED_REGION_TRACE_DEBUG( ("shared_region: -> " @@ -1356,6 +1382,18 @@ vm_shared_region_enter( sr_pmap_nesting_start = 
shared_region->sr_pmap_nesting_start; sr_pmap_nesting_size = shared_region->sr_pmap_nesting_size; + cur_prot = VM_PROT_READ; +#if __x86_64__ + /* + * XXX BINARY COMPATIBILITY + * java6 apparently needs to modify some code in the + * dyld shared cache and needs to be allowed to add + * write access... + */ + max_prot = VM_PROT_ALL; +#else /* __x86_64__ */ + max_prot = VM_PROT_READ; +#endif /* __x86_64__ */ /* * Start mapping the shared region's VM sub map into the task's VM map. */ @@ -1374,8 +1412,8 @@ vm_shared_region_enter( sr_handle, sr_offset, TRUE, - VM_PROT_READ, - VM_PROT_ALL, + cur_prot, + max_prot, VM_INHERIT_SHARE); if (kr != KERN_SUCCESS) { SHARED_REGION_TRACE_ERROR( @@ -1425,8 +1463,8 @@ vm_shared_region_enter( sr_handle, sr_offset, TRUE, - VM_PROT_READ, - VM_PROT_ALL, + cur_prot, + max_prot, VM_INHERIT_SHARE); if (kr != KERN_SUCCESS) { SHARED_REGION_TRACE_ERROR( @@ -1463,8 +1501,8 @@ vm_shared_region_enter( sr_handle, sr_offset, TRUE, - VM_PROT_READ, - VM_PROT_ALL, + cur_prot, + max_prot, VM_INHERIT_SHARE); if (kr != KERN_SUCCESS) { SHARED_REGION_TRACE_ERROR( @@ -1501,7 +1539,7 @@ vm_shared_region_enter( return kr; } -#define SANE_SLIDE_INFO_SIZE (2048*1024) /*Can be changed if needed*/ +#define SANE_SLIDE_INFO_SIZE (2560*1024) /*Can be changed if needed*/ struct vm_shared_region_slide_info slide_info; kern_return_t @@ -1667,20 +1705,70 @@ vm_shared_region_get_slide_info_entry(vm_shared_region_t sr) { return (void*)sr->sr_slide_info.slide_info_entry; } - -kern_return_t -vm_shared_region_slide_sanity_check(vm_shared_region_t sr) +static kern_return_t +vm_shared_region_slide_sanity_check_v1(vm_shared_region_slide_info_entry_v1_t s_info) { uint32_t pageIndex=0; uint16_t entryIndex=0; uint16_t *toc = NULL; + + toc = (uint16_t*)((uintptr_t)s_info + s_info->toc_offset); + for (;pageIndex < s_info->toc_count; pageIndex++) { + + entryIndex = (uint16_t)(toc[pageIndex]); + + if (entryIndex >= s_info->entry_count) { + printf("No sliding bitmap entry for 
pageIndex: %d at entryIndex: %d amongst %d entries\n", pageIndex, entryIndex, s_info->entry_count); + return KERN_FAILURE; + } + + } + return KERN_SUCCESS; +} + +static kern_return_t +vm_shared_region_slide_sanity_check_v2(vm_shared_region_slide_info_entry_v2_t s_info, mach_vm_size_t slide_info_size) +{ + if (s_info->page_size != PAGE_SIZE_FOR_SR_SLIDE) { + return KERN_FAILURE; + } + + /* Ensure that the slide info doesn't reference any data outside of its bounds. */ + + uint32_t page_starts_count = s_info->page_starts_count; + uint32_t page_extras_count = s_info->page_extras_count; + mach_vm_size_t num_trailing_entries = page_starts_count + page_extras_count; + if (num_trailing_entries < page_starts_count) { + return KERN_FAILURE; + } + + /* Scale by sizeof(uint16_t). Hard-coding the size simplifies the overflow check. */ + mach_vm_size_t trailing_size = num_trailing_entries << 1; + if (trailing_size >> 1 != num_trailing_entries) { + return KERN_FAILURE; + } + + mach_vm_size_t required_size = sizeof(*s_info) + trailing_size; + if (required_size < sizeof(*s_info)) { + return KERN_FAILURE; + } + + if (required_size > slide_info_size) { + return KERN_FAILURE; + } + + return KERN_SUCCESS; +} + +kern_return_t +vm_shared_region_slide_sanity_check(vm_shared_region_t sr) +{ vm_shared_region_slide_info_t si; vm_shared_region_slide_info_entry_t s_info; kern_return_t kr; si = vm_shared_region_get_slide_info(sr); s_info = si->slide_info_entry; - toc = (uint16_t*)((uintptr_t)s_info + s_info->toc_offset); kr = mach_vm_protect(kernel_map, (mach_vm_offset_t)(vm_offset_t)s_info, @@ -1690,16 +1778,17 @@ vm_shared_region_slide_sanity_check(vm_shared_region_t sr) panic("vm_shared_region_slide_sanity_check: vm_protect() error 0x%x\n", kr); } - for (;pageIndex < s_info->toc_count; pageIndex++) { - - entryIndex = (uint16_t)(toc[pageIndex]); - - if (entryIndex >= s_info->entry_count) { - printf("No sliding bitmap entry for pageIndex: %d at entryIndex: %d amongst %d entries\n", pageIndex, 
entryIndex, s_info->entry_count); - goto fail; - } - + if (s_info->version == 1) { + kr = vm_shared_region_slide_sanity_check_v1(&s_info->v1); + } else if (s_info->version == 2) { + kr = vm_shared_region_slide_sanity_check_v2(&s_info->v2, si->slide_info_size); + } else { + goto fail; } + if (kr != KERN_SUCCESS) { + goto fail; + } + return KERN_SUCCESS; fail: if (si->slide_info_entry != NULL) { @@ -1723,8 +1812,8 @@ vm_shared_region_slide_sanity_check(vm_shared_region_t sr) return KERN_FAILURE; } -kern_return_t -vm_shared_region_slide_page(vm_shared_region_slide_info_t si, vm_offset_t vaddr, uint32_t pageIndex) +static kern_return_t +vm_shared_region_slide_page_v1(vm_shared_region_slide_info_t si, vm_offset_t vaddr, uint32_t pageIndex) { uint16_t *toc = NULL; slide_info_entry_toc_t bitmap = NULL; @@ -1733,7 +1822,7 @@ vm_shared_region_slide_page(vm_shared_region_slide_info_t si, vm_offset_t vaddr, uint32_t slide = si->slide; int is_64 = task_has_64BitAddr(current_task()); - vm_shared_region_slide_info_entry_t s_info = si->slide_info_entry; + vm_shared_region_slide_info_entry_v1_t s_info = &si->slide_info_entry->v1; toc = (uint16_t*)((uintptr_t)s_info + s_info->toc_offset); if (pageIndex >= s_info->toc_count) { @@ -1779,6 +1868,198 @@ vm_shared_region_slide_page(vm_shared_region_slide_info_t si, vm_offset_t vaddr, return KERN_SUCCESS; } +static kern_return_t +rebase_chain_32( + uint8_t *page_content, + uint16_t start_offset, + uint32_t slide_amount, + vm_shared_region_slide_info_entry_v2_t s_info) +{ + const uint32_t last_page_offset = PAGE_SIZE_FOR_SR_SLIDE - sizeof(uint32_t); + + const uint32_t delta_mask = (uint32_t)(s_info->delta_mask); + const uint32_t value_mask = ~delta_mask; + const uint32_t value_add = (uint32_t)(s_info->value_add); + const uint32_t delta_shift = __builtin_ctzll(delta_mask) - 2; + + uint32_t page_offset = start_offset; + uint32_t delta = 1; + + while (delta != 0 && page_offset <= last_page_offset) { + uint8_t *loc; + uint32_t value; + + loc 
= page_content + page_offset; + memcpy(&value, loc, sizeof(value)); + delta = (value & delta_mask) >> delta_shift; + value &= value_mask; + + if (value != 0) { + value += value_add; + value += slide_amount; + } + memcpy(loc, &value, sizeof(value)); + page_offset += delta; + } + + /* If the offset went past the end of the page, then the slide data is invalid. */ + if (page_offset > last_page_offset) { + return KERN_FAILURE; + } + return KERN_SUCCESS; +} + +static kern_return_t +rebase_chain_64( + uint8_t *page_content, + uint16_t start_offset, + uint32_t slide_amount, + vm_shared_region_slide_info_entry_v2_t s_info) +{ + const uint32_t last_page_offset = PAGE_SIZE_FOR_SR_SLIDE - sizeof(uint64_t); + + const uint64_t delta_mask = s_info->delta_mask; + const uint64_t value_mask = ~delta_mask; + const uint64_t value_add = s_info->value_add; + const uint64_t delta_shift = __builtin_ctzll(delta_mask) - 2; + + uint32_t page_offset = start_offset; + uint32_t delta = 1; + + while (delta != 0 && page_offset <= last_page_offset) { + uint8_t *loc; + uint64_t value; + + loc = page_content + page_offset; + memcpy(&value, loc, sizeof(value)); + delta = (uint32_t)((value & delta_mask) >> delta_shift); + value &= value_mask; + + if (value != 0) { + value += value_add; + value += slide_amount; + } + memcpy(loc, &value, sizeof(value)); + page_offset += delta; + } + + if (page_offset + sizeof(uint32_t) == PAGE_SIZE_FOR_SR_SLIDE) { + /* If a pointer straddling the page boundary needs to be adjusted, then + * add the slide to the lower half. The encoding guarantees that the upper + * half on the next page will need no masking. + * + * This assumes a little-endian machine and that the region being slid + * never crosses a 4 GB boundary. 
*/ + + uint8_t *loc = page_content + page_offset; + uint32_t value; + + memcpy(&value, loc, sizeof(value)); + value += slide_amount; + memcpy(loc, &value, sizeof(value)); + } else if (page_offset > last_page_offset) { + return KERN_FAILURE; + } + + return KERN_SUCCESS; +} + +static kern_return_t +rebase_chain( + boolean_t is_64, + uint32_t pageIndex, + uint8_t *page_content, + uint16_t start_offset, + uint32_t slide_amount, + vm_shared_region_slide_info_entry_v2_t s_info) +{ + kern_return_t kr; + if (is_64) { + kr = rebase_chain_64(page_content, start_offset, slide_amount, s_info); + } else { + kr = rebase_chain_32(page_content, start_offset, slide_amount, s_info); + } + + if (kr != KERN_SUCCESS) { + printf("vm_shared_region_slide_page() offset overflow: pageIndex=%u, start_offset=%u, slide_amount=%u\n", + pageIndex, start_offset, slide_amount); + } + return kr; +} + +static kern_return_t +vm_shared_region_slide_page_v2(vm_shared_region_slide_info_t si, vm_offset_t vaddr, uint32_t pageIndex) +{ + vm_shared_region_slide_info_entry_v2_t s_info = &si->slide_info_entry->v2; + const uint32_t slide_amount = si->slide; + + /* The high bits of the delta_mask field are nonzero precisely when the shared + * cache is 64-bit. 
*/ + const boolean_t is_64 = (s_info->delta_mask >> 32) != 0; + + const uint16_t *page_starts = (uint16_t *)((uintptr_t)s_info + s_info->page_starts_offset); + const uint16_t *page_extras = (uint16_t *)((uintptr_t)s_info + s_info->page_extras_offset); + + uint8_t *page_content = (uint8_t *)vaddr; + uint16_t page_entry; + + if (pageIndex >= s_info->page_starts_count) { + printf("vm_shared_region_slide_page() did not find page start in slide info: pageIndex=%u, count=%u\n", + pageIndex, s_info->page_starts_count); + return KERN_FAILURE; + } + page_entry = page_starts[pageIndex]; + + if (page_entry == DYLD_CACHE_SLIDE_PAGE_ATTR_NO_REBASE) { + return KERN_SUCCESS; + } + + if (page_entry & DYLD_CACHE_SLIDE_PAGE_ATTR_EXTRA) { + uint16_t chain_index = page_entry & DYLD_CACHE_SLIDE_PAGE_VALUE; + uint16_t info; + + do { + uint16_t page_start_offset; + kern_return_t kr; + + if (chain_index >= s_info->page_extras_count) { + printf("vm_shared_region_slide_page() out-of-bounds extras index: index=%u, count=%u\n", + chain_index, s_info->page_extras_count); + return KERN_FAILURE; + } + info = page_extras[chain_index]; + page_start_offset = (info & DYLD_CACHE_SLIDE_PAGE_VALUE) << DYLD_CACHE_SLIDE_PAGE_OFFSET_SHIFT; + + kr = rebase_chain(is_64, pageIndex, page_content, page_start_offset, slide_amount, s_info); + if (kr != KERN_SUCCESS) { + return KERN_FAILURE; + } + + chain_index++; + } while (!(info & DYLD_CACHE_SLIDE_PAGE_ATTR_END)); + } else { + const uint32_t page_start_offset = page_entry << DYLD_CACHE_SLIDE_PAGE_OFFSET_SHIFT; + kern_return_t kr; + + kr = rebase_chain(is_64, pageIndex, page_content, page_start_offset, slide_amount, s_info); + if (kr != KERN_SUCCESS) { + return KERN_FAILURE; + } + } + + return KERN_SUCCESS; +} + +kern_return_t +vm_shared_region_slide_page(vm_shared_region_slide_info_t si, vm_offset_t vaddr, uint32_t pageIndex) +{ + if (si->slide_info_entry->version == 1) { + return vm_shared_region_slide_page_v1(si, vaddr, pageIndex); + } else { + return 
vm_shared_region_slide_page_v2(si, vaddr, pageIndex); + } +} + /******************************************************************************/ /* Comm page support */ /******************************************************************************/ @@ -1917,7 +2198,8 @@ vm_commpage_init(void) kern_return_t vm_commpage_enter( vm_map_t map, - task_t task) + task_t task, + boolean_t is64bit) { ipc_port_t commpage_handle, commpage_text_handle; vm_map_offset_t commpage_address, objc_address, commpage_text_address; @@ -1935,8 +2217,8 @@ vm_commpage_enter( vm_flags = VM_FLAGS_FIXED | VM_FLAGS_BEYOND_MAX; /* select the appropriate comm page for this task */ - assert(! (task_has_64BitAddr(task) ^ vm_map_is_64bit(map))); - if (task_has_64BitAddr(task)) { + assert(! (is64bit ^ vm_map_is_64bit(map))); + if (is64bit) { commpage_handle = commpage64_handle; commpage_address = (vm_map_offset_t) _COMM_PAGE64_BASE_ADDRESS; commpage_size = _COMM_PAGE64_AREA_LENGTH; diff --git a/osfmk/vm/vm_shared_region.h b/osfmk/vm/vm_shared_region.h index 9cc2b394c..87097ee39 100644 --- a/osfmk/vm/vm_shared_region.h +++ b/osfmk/vm/vm_shared_region.h @@ -92,13 +92,20 @@ typedef struct vm_shared_region *vm_shared_region_t; #include #include -typedef struct vm_shared_region_slide_info_entry *vm_shared_region_slide_info_entry_t; -struct vm_shared_region_slide_info_entry { +#define PAGE_SIZE_FOR_SR_SLIDE 4096 + +/* Documentation for the slide info format can be found in the dyld project in + * the file 'launch-cache/dyld_cache_format.h'. 
*/ + +typedef struct vm_shared_region_slide_info_entry_v1 *vm_shared_region_slide_info_entry_v1_t; +struct vm_shared_region_slide_info_entry_v1 { uint32_t version; uint32_t toc_offset; // offset from start of header to table-of-contents uint32_t toc_count; // number of entries in toc (same as number of pages in r/w mapping) uint32_t entry_offset; uint32_t entry_count; + // uint16_t toc[toc_count]; + // entrybitmap entries[entries_count]; }; #define NBBY 8 @@ -108,6 +115,34 @@ struct slide_info_entry_toc { uint8_t entry[NUM_SLIDING_BITMAPS_PER_PAGE]; }; +typedef struct vm_shared_region_slide_info_entry_v2 *vm_shared_region_slide_info_entry_v2_t; +struct vm_shared_region_slide_info_entry_v2 { + uint32_t version; + uint32_t page_size; + uint32_t page_starts_offset; + uint32_t page_starts_count; + uint32_t page_extras_offset; + uint32_t page_extras_count; + uint64_t delta_mask; // which (contiguous) set of bits contains the delta to the next rebase location + uint64_t value_add; + // uint16_t page_starts[page_starts_count]; + // uint16_t page_extras[page_extras_count]; +}; + +#define DYLD_CACHE_SLIDE_PAGE_ATTRS 0xC000 // high bits of uint16_t are flags +#define DYLD_CACHE_SLIDE_PAGE_ATTR_EXTRA 0x8000 // index is into extras array (not starts array) +#define DYLD_CACHE_SLIDE_PAGE_ATTR_NO_REBASE 0x4000 // page has no rebasing +#define DYLD_CACHE_SLIDE_PAGE_ATTR_END 0x8000 // last chain entry for page +#define DYLD_CACHE_SLIDE_PAGE_VALUE 0x3FFF // bitwise negation of DYLD_CACHE_SLIDE_PAGE_ATTRS +#define DYLD_CACHE_SLIDE_PAGE_OFFSET_SHIFT 2 + +typedef union vm_shared_region_slide_info_entry *vm_shared_region_slide_info_entry_t; +union vm_shared_region_slide_info_entry { + uint32_t version; + struct vm_shared_region_slide_info_entry_v1 v1; + struct vm_shared_region_slide_info_entry_v2 v2; +}; + typedef struct vm_shared_region_slide_info *vm_shared_region_slide_info_t; struct vm_shared_region_slide_info { mach_vm_offset_t start; @@ -156,6 +191,7 @@ extern void 
vm_shared_region_init(void); extern kern_return_t vm_shared_region_enter( struct _vm_map *map, struct task *task, + boolean_t is_64bit, void *fsroot, cpu_type_t cpu); extern kern_return_t vm_shared_region_remove( @@ -211,7 +247,8 @@ extern void vm_commpage_init(void); extern void vm_commpage_text_init(void); extern kern_return_t vm_commpage_enter( struct _vm_map *map, - struct task *task); + struct task *task, + boolean_t is64bit); extern kern_return_t vm_commpage_remove( struct _vm_map *map, struct task *task); diff --git a/osfmk/vm/vm_user.c b/osfmk/vm/vm_user.c index 8ed0fc483..37d3cbd0c 100644 --- a/osfmk/vm/vm_user.c +++ b/osfmk/vm/vm_user.c @@ -98,6 +98,7 @@ #include #include #include +#include #include #include @@ -294,7 +295,7 @@ mach_vm_deallocate( */ kern_return_t vm_deallocate( - register vm_map_t map, + vm_map_t map, vm_offset_t start, vm_size_t size) { @@ -346,7 +347,7 @@ mach_vm_inherit( */ kern_return_t vm_inherit( - register vm_map_t map, + vm_map_t map, vm_offset_t start, vm_size_t size, vm_inherit_t new_inheritance) @@ -1163,7 +1164,7 @@ mach_vm_wire( kern_return_t vm_wire( host_priv_t host_priv, - register vm_map_t map, + vm_map_t map, vm_offset_t start, vm_size_t size, vm_prot_t access) @@ -1303,6 +1304,7 @@ vm_toggle_entry_reuse(int toggle, int *old_value) { vm_map_t map = current_map(); + assert(!map->is_nested_map); if(toggle == VM_TOGGLE_GETVALUE && old_value != NULL){ *old_value = map->disable_vmentry_reuse; } else if(toggle == VM_TOGGLE_SET){ @@ -1341,21 +1343,37 @@ kern_return_t mach_vm_behavior_set( vm_map_t map, mach_vm_offset_t start, - mach_vm_size_t size, + mach_vm_size_t size, vm_behavior_t new_behavior) { + vm_map_offset_t align_mask; + if ((map == VM_MAP_NULL) || (start + size < start)) return(KERN_INVALID_ARGUMENT); if (size == 0) return KERN_SUCCESS; - return(vm_map_behavior_set(map, - vm_map_trunc_page(start, - VM_MAP_PAGE_MASK(map)), - vm_map_round_page(start+size, - VM_MAP_PAGE_MASK(map)), - new_behavior)); + switch 
(new_behavior) { + case VM_BEHAVIOR_REUSABLE: + case VM_BEHAVIOR_REUSE: + case VM_BEHAVIOR_CAN_REUSE: + /* + * Align to the hardware page size, to allow + * malloc() to maximize the amount of re-usability, + * even on systems with larger software page size. + */ + align_mask = PAGE_MASK; + break; + default: + align_mask = VM_MAP_PAGE_MASK(map); + break; + } + + return vm_map_behavior_set(map, + vm_map_trunc_page(start, align_mask), + vm_map_round_page(start+size, align_mask), + new_behavior); } /* @@ -1378,18 +1396,13 @@ vm_behavior_set( vm_size_t size, vm_behavior_t new_behavior) { - if ((map == VM_MAP_NULL) || (start + size < start)) - return(KERN_INVALID_ARGUMENT); - - if (size == 0) - return KERN_SUCCESS; + if (start + size < start) + return KERN_INVALID_ARGUMENT; - return(vm_map_behavior_set(map, - vm_map_trunc_page(start, - VM_MAP_PAGE_MASK(map)), - vm_map_round_page(start+size, - VM_MAP_PAGE_MASK(map)), - new_behavior)); + return mach_vm_behavior_set(map, + (mach_vm_offset_t) start, + (mach_vm_size_t) size, + new_behavior); } /* @@ -1851,6 +1864,7 @@ vm_map_get_upl( return kr; } + /* * mach_make_memory_entry_64 * @@ -1913,6 +1927,7 @@ mach_make_memory_entry_64( if (((permission & 0x00FF0000) & ~(MAP_MEM_ONLY | MAP_MEM_NAMED_CREATE | + MAP_MEM_GRAB_SECLUDED | /* XXX FBDP TODO: restrict usage? 
*/ MAP_MEM_PURGABLE | MAP_MEM_NAMED_REUSE | MAP_MEM_USE_DATA_ADDR | @@ -2044,6 +2059,25 @@ mach_make_memory_entry_64( vm_object_unlock(object); } +#if CONFIG_SECLUDED_MEMORY + if (secluded_for_iokit && /* global boot-arg */ + ((permission & MAP_MEM_GRAB_SECLUDED) +#if 11 + /* XXX FBDP for my testing only */ + || (secluded_for_fbdp && map_size == 97550336) +#endif + )) { +#if 11 + if (!(permission & MAP_MEM_GRAB_SECLUDED) && + secluded_for_fbdp) { + printf("FBDP: object %p size %lld can grab secluded\n", object, (uint64_t) map_size); + } +#endif + object->can_grab_secluded = TRUE; + assert(!object->eligible_for_secluded); + } +#endif /* CONFIG_SECLUDED_MEMORY */ + /* * The VM object is brand new and nobody else knows about it, * so we don't need to lock it. @@ -2163,6 +2197,7 @@ mach_make_memory_entry_64( offset_in_page = 0; } + cur_prot = VM_PROT_ALL; kr = vm_map_copy_extract(target_map, map_start, map_size, @@ -2263,8 +2298,9 @@ mach_make_memory_entry_64( */ protections &= prot; } + if (((prot & protections) != protections) - || (object == kernel_object)) { + || (object == kernel_object)) { kr = KERN_INVALID_RIGHT; vm_object_unlock(object); vm_map_unlock_read(target_map); @@ -2616,6 +2652,7 @@ mach_make_memory_entry_64( } #endif /* VM_OBJECT_TRACKING_OP_TRUESHARE */ + vm_object_lock_assert_exclusive(object); object->true_share = TRUE; if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) object->copy_strategy = MEMORY_OBJECT_COPY_DELAY; @@ -2797,8 +2834,8 @@ mach_make_memory_entry_64( assert(object != VM_OBJECT_NULL); user_entry->backing.object = object; /* we now point to this object, hold on */ - vm_object_reference(object); vm_object_lock(object); + vm_object_reference_locked(object); #if VM_OBJECT_TRACKING_OP_TRUESHARE if (!object->true_share && vm_object_tracking_inited) { @@ -3562,7 +3599,7 @@ vm_map_get_phys_page( break; } } else { - phys_page = (ppnum_t)(dst_page->phys_page); + phys_page = (ppnum_t)(VM_PAGE_GET_PHYS_PAGE(dst_page)); 
vm_object_unlock(object); break; } diff --git a/osfmk/voucher/Makefile b/osfmk/voucher/Makefile new file mode 100644 index 000000000..534b780ef --- /dev/null +++ b/osfmk/voucher/Makefile @@ -0,0 +1,119 @@ +export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd +export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def +export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule +export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir + +include $(MakeInc_cmd) +include $(MakeInc_def) + +MIG_TYPES = + +MIG_DEFS = + +MACH_PRIVATE_DEFS = + +# +# MIG-generated headers that are traditionally used by user +# level code. +# +MIG_USHDRS = + +MIG_UUHDRS = + +MIGINCLUDES = ${MIG_UUHDRS} ${MIG_USHDRS} + +DATAFILES = \ + ipc_pthread_priority_types.h \ + ${MIG_TYPES} \ + ${MIG_DEFS} + +INSTALL_MI_LIST = \ + ${DATAFILES} + +INSTALL_KF_MI_LIST = \ + ${DATAFILES} + +INSTALL_KF_MI_LCL_LIST = \ + ${DATAFILES} + +INSTALL_MI_GEN_LIST = + +INSTALL_MI_DIR = voucher + +EXPORT_MI_LIST = \ + ${DATAFILES} + +EXPORT_MI_GEN_LIST = \ + ${MIGINCLUDES} + +EXPORT_MI_DIR = voucher + +${MIGINCLUDES} : ${MIG_TYPES} + +${MIG_UUHDRS} : \ + %.h : %.defs + @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)" + $(_v)$(MIG) $(MIGFLAGS) \ + -server /dev/null \ + -user /dev/null \ + -header $@ \ + $< + +${MIG_USHDRS} : \ + %_server.h : %.defs + @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)" + $(_v)$(MIG) $(MIGFLAGS) \ + -server /dev/null \ + -user /dev/null \ + -header /dev/null \ + -sheader $@ \ + $< + +# +# Build path +# + +INCFLAGS_MAKEFILE= -I.. + +MIGKSFLAGS = -DMACH_KERNEL_PRIVATE -DKERNEL_SERVER=1 +MIGKUFLAGS = -DMACH_KERNEL_PRIVATE -DKERNEL_USER=1 -maxonstack 1024 +# +# MIG-generated headers that are traditionally used by kernel +# level code. 
+# +MIG_KUHDRS = + +MIG_KUSRC = + +MIG_KSHDRS = + +MIG_KSSRC = + +COMP_FILES = ${MIG_KUSRC} ${MIG_KSSRC} + +do_build_all:: $(COMP_FILES) + +${COMP_FILES} : ${MIG_TYPES} + +${MIG_KUSRC} : \ + %_user.c : %.defs + @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)" + $(_v)${MIG} ${MIGFLAGS} ${MIGKUFLAGS} \ + -user $*_user.c \ + -header $*.h \ + -server /dev/null \ + -sheader /dev/null \ + $< + +${MIG_KSSRC}: \ + %_server.c : %.defs + @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)" + $(_v)${MIG} ${MIGFLAGS} ${MIGKSFLAGS} \ + -user /dev/null \ + -header /dev/null \ + -server $*_server.c \ + -sheader $*_server.h \ + $< + +include $(MakeInc_rule) +include $(MakeInc_dir) diff --git a/osfmk/voucher/ipc_pthread_priority.c b/osfmk/voucher/ipc_pthread_priority.c new file mode 100644 index 000000000..baf70d7ba --- /dev/null +++ b/osfmk/voucher/ipc_pthread_priority.c @@ -0,0 +1,292 @@ +/* + * Copyright (c) 2012-2016 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +ipc_voucher_attr_control_t ipc_pthread_priority_voucher_attr_control; /* communication channel from PTHPRIORITY to voucher system */ + +#define IPC_PTHREAD_PRIORITY_VALUE_TO_HANDLE(x) ((mach_voucher_attr_value_handle_t)(x)) +#define HANDLE_TO_IPC_PTHREAD_PRIORITY_VALUE(x) ((ipc_pthread_priority_value_t)(x)) + +extern unsigned long pthread_priority_canonicalize(unsigned long priority, boolean_t for_propagation); + +kern_return_t +ipc_pthread_priority_release_value( + ipc_voucher_attr_manager_t __assert_only manager, + mach_voucher_attr_key_t __assert_only key, + mach_voucher_attr_value_handle_t value, + mach_voucher_attr_value_reference_t sync); + +kern_return_t +ipc_pthread_priority_get_value( + ipc_voucher_attr_manager_t __assert_only manager, + mach_voucher_attr_key_t __assert_only key, + mach_voucher_attr_recipe_command_t command, + mach_voucher_attr_value_handle_array_t prev_values, + mach_msg_type_number_t __assert_only prev_value_count, + mach_voucher_attr_content_t recipe, + mach_voucher_attr_content_size_t recipe_size, + mach_voucher_attr_value_handle_t *out_value, + mach_voucher_attr_value_flags_t *out_flags, + ipc_voucher_t *out_value_voucher); + +kern_return_t +ipc_pthread_priority_extract_content( + ipc_voucher_attr_manager_t __assert_only manager, + mach_voucher_attr_key_t 
__assert_only key, + mach_voucher_attr_value_handle_array_t values, + mach_msg_type_number_t value_count, + mach_voucher_attr_recipe_command_t *out_command, + mach_voucher_attr_content_t out_recipe, + mach_voucher_attr_content_size_t *in_out_recipe_size); + +kern_return_t +ipc_pthread_priority_command( + ipc_voucher_attr_manager_t __assert_only manager, + mach_voucher_attr_key_t __assert_only key, + mach_voucher_attr_value_handle_array_t values, + mach_msg_type_number_t value_count, + mach_voucher_attr_command_t command, + mach_voucher_attr_content_t in_content, + mach_voucher_attr_content_size_t in_content_size, + mach_voucher_attr_content_t out_content, + mach_voucher_attr_content_size_t *in_out_content_size); + +void +ipc_pthread_priority_release(ipc_voucher_attr_manager_t __assert_only manager); + +/* + * communication channel from voucher system to IPC_PTHREAD_PRIORITY + */ +struct ipc_voucher_attr_manager ipc_pthread_priority_manager = { + .ivam_release_value = ipc_pthread_priority_release_value, + .ivam_get_value = ipc_pthread_priority_get_value, + .ivam_extract_content = ipc_pthread_priority_extract_content, + .ivam_command = ipc_pthread_priority_command, + .ivam_release = ipc_pthread_priority_release, + .ivam_flags = IVAM_FLAGS_NONE, +}; + +/* + * Routine: ipc_pthread_priority_init + * Purpose: Initialize the IPC_PTHREAD_PRIORITY subsystem. + * Returns: None. + */ +void +ipc_pthread_priority_init() +{ + kern_return_t kr = KERN_SUCCESS; + + /* Register the ipc_pthread_priority manager with the Vouchers sub system. */ + kr = ipc_register_well_known_mach_voucher_attr_manager( + &ipc_pthread_priority_manager, + 0, + MACH_VOUCHER_ATTR_KEY_PTHPRIORITY, + &ipc_pthread_priority_voucher_attr_control); + if (kr != KERN_SUCCESS ) + panic("IPC_PTHREAD_PRIORITY subsystem initialization failed"); + + kprintf("IPC_PTHREAD_PRIORITY subsystem is initialized\n"); + return ; +} + +/* + * IPC_PTHREAD_PRIORITY Resource Manager Routines. 
+ */ + + +/* + * Routine: ipc_pthread_priority_release_value + * Purpose: Release callback for a PTHPRIORITY value. Always panics, since these + * values are treated as persistent and are not expected to be released. + * Returns: KERN_FAILURE: only if panic() were to return. + */ +kern_return_t +ipc_pthread_priority_release_value( + ipc_voucher_attr_manager_t __assert_only manager, + mach_voucher_attr_key_t __assert_only key, + mach_voucher_attr_value_handle_t value, + mach_voucher_attr_value_reference_t sync) +{ + assert(MACH_VOUCHER_ATTR_KEY_PTHPRIORITY == key); + assert(manager == &ipc_pthread_priority_manager); + + ipc_pthread_priority_value_t ipc_pthread_priority_value = HANDLE_TO_IPC_PTHREAD_PRIORITY_VALUE(value); + + panic("ipc_pthread_priority_release_value called for a persistent PTHPRIORITY value %x with sync value %d\n", ipc_pthread_priority_value, sync); + return KERN_FAILURE; +} + +/* + * Routine: ipc_pthread_priority_get_value + */ +kern_return_t +ipc_pthread_priority_get_value( + ipc_voucher_attr_manager_t __assert_only manager, + mach_voucher_attr_key_t __assert_only key, + mach_voucher_attr_recipe_command_t command, + mach_voucher_attr_value_handle_array_t __unused prev_values, + mach_msg_type_number_t __unused prev_value_count, + mach_voucher_attr_content_t recipe, + mach_voucher_attr_content_size_t recipe_size, + mach_voucher_attr_value_handle_t *out_value, + mach_voucher_attr_value_flags_t *out_flags, + ipc_voucher_t *out_value_voucher) +{ + kern_return_t kr = KERN_SUCCESS; + ipc_pthread_priority_value_t ipc_pthread_priority_value; + ipc_pthread_priority_value_t canonicalize_priority_value; + + assert(MACH_VOUCHER_ATTR_KEY_PTHPRIORITY == key); + assert(manager == &ipc_pthread_priority_manager); + + /* never an out voucher */ + *out_value_voucher = IPC_VOUCHER_NULL; + *out_flags = MACH_VOUCHER_ATTR_VALUE_FLAGS_NONE; + + switch (command) { + + case MACH_VOUCHER_ATTR_PTHPRIORITY_CREATE: + + if (recipe_size != sizeof(ipc_pthread_priority_value_t)) { + return KERN_INVALID_ARGUMENT; + } + + 
memcpy(&ipc_pthread_priority_value, recipe, recipe_size); + + if (ipc_pthread_priority_value == PTHPRIORITY_ATTR_DEFAULT_VALUE) { + *out_value = IPC_PTHREAD_PRIORITY_VALUE_TO_HANDLE(PTHPRIORITY_ATTR_DEFAULT_VALUE); + return kr; + } + + /* Callout to pthread kext to get the canonicalized value */ + canonicalize_priority_value = (ipc_pthread_priority_value_t) pthread_priority_canonicalize( + (unsigned long)ipc_pthread_priority_value, true); + + *out_value = IPC_PTHREAD_PRIORITY_VALUE_TO_HANDLE(canonicalize_priority_value); + *out_flags = MACH_VOUCHER_ATTR_VALUE_FLAGS_PERSIST; + return kr; + + default: + kr = KERN_INVALID_ARGUMENT; + break; + } + + return kr; +} + +/* + * Routine: ipc_pthread_priority_extract_content + * Purpose: Extract the first non-default pthread_priority value from an array of voucher values. + * Returns: KERN_SUCCESS: on Success. + * KERN_NO_SPACE: insufficient buffer provided to hold a pthread_priority value. + */ +kern_return_t +ipc_pthread_priority_extract_content( + ipc_voucher_attr_manager_t __assert_only manager, + mach_voucher_attr_key_t __assert_only key, + mach_voucher_attr_value_handle_array_t values, + mach_msg_type_number_t value_count, + mach_voucher_attr_recipe_command_t *out_command, + mach_voucher_attr_content_t out_recipe, + mach_voucher_attr_content_size_t *in_out_recipe_size) +{ + kern_return_t kr = KERN_SUCCESS; + mach_msg_type_number_t i; + ipc_pthread_priority_value_t ipc_pthread_priority_value; + + assert(MACH_VOUCHER_ATTR_KEY_PTHPRIORITY == key); + assert(manager == &ipc_pthread_priority_manager); + + for (i = 0; i < value_count; i++) { + ipc_pthread_priority_value = HANDLE_TO_IPC_PTHREAD_PRIORITY_VALUE(values[i]); + + if (ipc_pthread_priority_value == PTHPRIORITY_ATTR_DEFAULT_VALUE) { + continue; + } + + if (MACH_VOUCHER_PTHPRIORITY_CONTENT_SIZE > *in_out_recipe_size) { + *in_out_recipe_size = 0; + return KERN_NO_SPACE; + } + + memcpy(&out_recipe[0], &ipc_pthread_priority_value, sizeof(ipc_pthread_priority_value)); + 
*out_command = MACH_VOUCHER_ATTR_PTHPRIORITY_NULL; + *in_out_recipe_size = (mach_voucher_attr_content_size_t)sizeof(ipc_pthread_priority_value); + return kr; + } + + *in_out_recipe_size = 0; + return KERN_INVALID_VALUE; +} + +/* + * Routine: ipc_pthread_priority_command + * Purpose: Execute a command against a set of PTHPRIORITY values. + * No commands are currently implemented. + * Returns: KERN_FAILURE: always. + */ +kern_return_t +ipc_pthread_priority_command( + ipc_voucher_attr_manager_t __assert_only manager, + mach_voucher_attr_key_t __assert_only key, + mach_voucher_attr_value_handle_array_t __unused values, + mach_msg_type_number_t __unused value_count, + mach_voucher_attr_command_t __unused command, + mach_voucher_attr_content_t __unused in_content, + mach_voucher_attr_content_size_t __unused in_content_size, + mach_voucher_attr_content_t __unused out_content, + mach_voucher_attr_content_size_t __unused *out_content_size) +{ + assert(MACH_VOUCHER_ATTR_KEY_PTHPRIORITY == key); + assert(manager == &ipc_pthread_priority_manager); + + return KERN_FAILURE; +} + +void +ipc_pthread_priority_release( + ipc_voucher_attr_manager_t __assert_only manager) +{ + assert(manager == &ipc_pthread_priority_manager); +} diff --git a/osfmk/voucher/ipc_pthread_priority_internal.h b/osfmk/voucher/ipc_pthread_priority_internal.h new file mode 100644 index 000000000..c399046ff --- /dev/null +++ b/osfmk/voucher/ipc_pthread_priority_internal.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2012-2013 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _VOUCHER_IPC_PTHREAD_PRIORITY_INTERNAL_H_ +#define _VOUCHER_IPC_PTHREAD_PRIORITY_INTERNAL_H_ + +#define PTHPRIORITY_ATTR_DEFAULT_VALUE (0) + +extern void ipc_pthread_priority_init(void); + + +#endif /* _VOUCHER_IPC_PTHREAD_PRIORITY_INTERNAL_H_ */ + diff --git a/osfmk/voucher/ipc_pthread_priority_types.h b/osfmk/voucher/ipc_pthread_priority_types.h new file mode 100644 index 000000000..f59d06e9b --- /dev/null +++ b/osfmk/voucher/ipc_pthread_priority_types.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2012-2016 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _VOUCHER_IPC_PTHREAD_PRIORITY_TYPES_H_ +#define _VOUCHER_IPC_PTHREAD_PRIORITY_TYPES_H_ + +#include +#include + +#define MACH_VOUCHER_ATTR_PTHPRIORITY_NULL ((mach_voucher_attr_recipe_command_t)701) +#define MACH_VOUCHER_ATTR_PTHPRIORITY_CREATE ((mach_voucher_attr_recipe_command_t)710) + +typedef uint32_t ipc_pthread_priority_value_t; + +#define MACH_VOUCHER_PTHPRIORITY_CONTENT_SIZE (sizeof(ipc_pthread_priority_value_t)) + +#endif /* _VOUCHER_IPC_PTHREAD_PRIORITY_TYPES_H_ */ diff --git a/osfmk/x86_64/Makefile b/osfmk/x86_64/Makefile index a8a465850..d83d1669e 100644 --- a/osfmk/x86_64/Makefile +++ b/osfmk/x86_64/Makefile @@ -13,5 +13,3 @@ EXPORT_MD_DIR = x86_64 include $(MakeInc_rule) include $(MakeInc_dir) - - diff --git a/osfmk/x86_64/copyio.c b/osfmk/x86_64/copyio.c index 20e246b6e..5ce1c3b73 100644 --- a/osfmk/x86_64/copyio.c +++ b/osfmk/x86_64/copyio.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009 Apple Inc. All rights reserved. + * Copyright (c) 2009-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -73,6 +73,7 @@ static int copyio_phys(addr64_t, addr64_t, vm_size_t, int); */ extern int _bcopy(const void *, void *, vm_size_t); extern int _bcopystr(const void *, void *, vm_size_t, vm_size_t *); +extern int _copyin_word(const char *src, uint64_t *dst, vm_size_t len); /* @@ -83,8 +84,9 @@ extern int _bcopystr(const void *, void *, vm_size_t, vm_size_t *); #define COPYINSTR 2 /* string variant of copyout */ #define COPYINPHYS 3 /* from user virtual to kernel physical */ #define COPYOUTPHYS 4 /* from kernel physical to user virtual */ +#define COPYINWORD 5 /* from user virtual to kernel virtual */ -#if DEVELOPMENT +#if ENABLE_SMAPLOG typedef struct { uint64_t timestamp; thread_t thread; @@ -119,13 +121,13 @@ smaplog_add_entry(boolean_t enabling) smaplog_cbuf[index].smap_state = enabling; smaplog_cbuf[index].copyio_active = (thread->machine.specFlags & CopyIOActive) ? 1 : 0; } -#endif /* DEVELOPMENT */ +#endif /* ENABLE_SMAPLOG */ extern boolean_t pmap_smap_enabled; static inline void user_access_enable(void) { if (pmap_smap_enabled) { stac(); -#if DEVELOPMENT +#if ENABLE_SMAPLOG smaplog_add_entry(TRUE); #endif } @@ -133,48 +135,52 @@ static inline void user_access_enable(void) { static inline void user_access_disable(void) { if (pmap_smap_enabled) { clac(); -#if DEVELOPMENT +#if ENABLE_SMAPLOG smaplog_add_entry(FALSE); #endif } } +#if COPYIO_TRACE_ENABLED +#define COPYIO_TRACE(x, a, b, c, d, e) KERNEL_DEBUG_CONSTANT(x, a, b, c, d, e) +#else +#define COPYIO_TRACE(x, a, b, c, d, e) do { } while(0) +#endif + static int copyio(int copy_type, user_addr_t user_addr, char *kernel_addr, vm_size_t nbytes, vm_size_t *lencopied, int use_kernel_map) { - thread_t thread; + thread_t thread = current_thread(); pmap_t pmap; vm_size_t bytes_copied; int error = 0; boolean_t istate = FALSE; boolean_t recursive_CopyIOActive; -#if KDEBUG +#if COPYIO_TRACE_ENABLED int debug_type = 0xeff70010; debug_type += (copy_type << 2); #endif + 
boolean_t nopagezero = thread->map->pmap->pagezero_accessible; assert(nbytes < COPYSIZELIMIT_PANIC); - thread = current_thread(); - - KERNEL_DEBUG(debug_type | DBG_FUNC_START, - (unsigned)(user_addr >> 32), (unsigned)user_addr, - nbytes, thread->machine.copyio_state, 0); + COPYIO_TRACE(debug_type | DBG_FUNC_START, + user_addr, kernel_addr, nbytes, use_kernel_map, 0); - if (nbytes == 0) + if (__improbable(nbytes == 0)) goto out; pmap = thread->map->pmap; - if ((copy_type != COPYINPHYS) && (copy_type != COPYOUTPHYS) && ((vm_offset_t)kernel_addr < VM_MIN_KERNEL_AND_KEXT_ADDRESS)) { + if (__improbable((copy_type != COPYINPHYS) && (copy_type != COPYOUTPHYS) && ((vm_offset_t)kernel_addr < VM_MIN_KERNEL_AND_KEXT_ADDRESS))) { panic("Invalid copy parameter, copy type: %d, kernel address: %p", copy_type, kernel_addr); } /* Sanity and security check for addresses to/from a user */ - if (((pmap != kernel_pmap) && (use_kernel_map == 0)) && - ((nbytes && (user_addr+nbytes <= user_addr)) || ((user_addr + nbytes) > vm_map_max(thread->map)))) { + if (__improbable(((pmap != kernel_pmap) && (use_kernel_map == 0)) && + ((nbytes && (user_addr+nbytes <= user_addr)) || ((user_addr + nbytes) > vm_map_max(thread->map))))) { error = EFAULT; goto out; } @@ -188,14 +194,24 @@ copyio(int copy_type, user_addr_t user_addr, char *kernel_addr, * we will later restore the correct cr3. 
*/ recursive_CopyIOActive = thread->machine.specFlags & CopyIOActive; - thread->machine.specFlags |= CopyIOActive; - user_access_enable(); - if (no_shared_cr3) { + + boolean_t pdswitch = no_shared_cr3 || nopagezero; + + if (__improbable(pdswitch)) { istate = ml_set_interrupts_enabled(FALSE); - if (get_cr3_base() != pmap->pm_cr3) + if (nopagezero && pmap_pcid_ncpus) { + pmap_pcid_activate(pmap, cpu_number(), TRUE, TRUE); + } else if (get_cr3_base() != pmap->pm_cr3) { set_cr3_raw(pmap->pm_cr3); + } + thread->machine.specFlags |= CopyIOActive; + } else { + thread->machine.specFlags |= CopyIOActive; } + user_access_enable(); + +#if DEVELOPMENT || DEBUG /* * Ensure that we're running on the target thread's cr3. */ @@ -205,11 +221,14 @@ copyio(int copy_type, user_addr_t user_addr, char *kernel_addr, copy_type, (void *)user_addr, kernel_addr, nbytes, lencopied, use_kernel_map, (void *) get_cr3_raw(), (void *) pmap->pm_cr3); } - if (no_shared_cr3) +#endif + + if (__improbable(pdswitch)) { (void) ml_set_interrupts_enabled(istate); + } - KERNEL_DEBUG(0xeff70044 | DBG_FUNC_NONE, (unsigned)user_addr, - (unsigned)kernel_addr, nbytes, 0, 0); + COPYIO_TRACE(0xeff70044 | DBG_FUNC_NONE, user_addr, + kernel_addr, nbytes, 0, 0); switch (copy_type) { @@ -237,6 +256,12 @@ copyio(int copy_type, user_addr_t user_addr, char *kernel_addr, nbytes); break; + case COPYINWORD: + error = _copyin_word((const void *) user_addr, + (void *) kernel_addr, + nbytes); + break; + case COPYINSTR: error = _bcopystr((const void *) user_addr, kernel_addr, @@ -275,23 +300,30 @@ copyio(int copy_type, user_addr_t user_addr, char *kernel_addr, error = ENAMETOOLONG; break; } - break; } user_access_disable(); - if (!recursive_CopyIOActive) { - thread->machine.specFlags &= ~CopyIOActive; - } - if (no_shared_cr3) { + + if (__improbable(pdswitch)) { istate = ml_set_interrupts_enabled(FALSE); - if (get_cr3_raw() != kernel_pmap->pm_cr3) - set_cr3_raw(kernel_pmap->pm_cr3); + if (!recursive_CopyIOActive && 
(get_cr3_raw() != kernel_pmap->pm_cr3)) { + if (nopagezero && pmap_pcid_ncpus) { + pmap_pcid_activate(pmap, cpu_number(), TRUE, FALSE); + } else { + set_cr3_raw(kernel_pmap->pm_cr3); + } + } + + if (!recursive_CopyIOActive) { + thread->machine.specFlags &= ~CopyIOActive; + } (void) ml_set_interrupts_enabled(istate); + } else if (!recursive_CopyIOActive) { + thread->machine.specFlags &= ~CopyIOActive; } out: - KERNEL_DEBUG(debug_type | DBG_FUNC_END, (unsigned)user_addr, - (unsigned)kernel_addr, (unsigned)nbytes, error, 0); + COPYIO_TRACE(debug_type | DBG_FUNC_END, user_addr, kernel_addr, nbytes, error, 0); return (error); } @@ -328,6 +360,24 @@ copyin(const user_addr_t user_addr, char *kernel_addr, vm_size_t nbytes) return copyio(COPYIN, user_addr, kernel_addr, nbytes, NULL, 0); } +/* + * copyin_word + * Read an aligned value from userspace as a single memory transaction. + * This function supports userspace synchronization features + */ +int +copyin_word(const user_addr_t user_addr, uint64_t *kernel_addr, vm_size_t nbytes) +{ + /* Verify sizes */ + if ((nbytes != 4) && (nbytes != 8)) + return EINVAL; + + /* Test alignment */ + if (user_addr & (nbytes - 1)) + return EINVAL; + return copyio(COPYINWORD, user_addr, (char *)(uintptr_t)kernel_addr, nbytes, NULL, 0); +} + int copyinstr(const user_addr_t user_addr, char *kernel_addr, vm_size_t nbytes, vm_size_t *lencopied) { diff --git a/osfmk/x86_64/cswitch.s b/osfmk/x86_64/cswitch.s index 88d88060c..c0574bf8c 100644 --- a/osfmk/x86_64/cswitch.s +++ b/osfmk/x86_64/cswitch.s @@ -69,9 +69,9 @@ Entry(Load_context) movq %rdx,%gs:CPU_KERNEL_STACK /* store stack top */ movq %rdx,%rsp - movq %rdx,%rbp + xorl %ebp, %ebp - xorq %rdi,%rdi /* return zero (no old thread) */ + xorl %edi,%edi /* return zero (no old thread) */ call EXT(thread_continue) @@ -152,6 +152,9 @@ Entry(Shutdown_context) movq %gs:CPU_INT_STACK_TOP,%rsp /* switch to interrupt stack */ + movq %rsp, %gs:CPU_ACTIVE_STACK + movq EXT(kernel_stack_size)(%rip),%rcx /* 
point to stack top */ + subq %rcx, %gs:CPU_ACTIVE_STACK movq %rdx,%rdi /* processor arg to routine */ call *%rsi /* call routine to run */ hlt /* (should never return) */ diff --git a/osfmk/x86_64/idt64.s b/osfmk/x86_64/idt64.s index edba4c4db..3ab3d89bc 100644 --- a/osfmk/x86_64/idt64.s +++ b/osfmk/x86_64/idt64.s @@ -284,13 +284,15 @@ L_common_dispatch: clac /* Clear EFLAGS.AC if SMAP is present/enabled */ 1: /* - * On entering the kernel, we don't need to switch cr3 + * On entering the kernel, we typically don't switch CR3 * because the kernel shares the user's address space. - * But we mark the kernel's cr3 as "active". - * If, however, the invalid cr3 flag is set, we have to flush tlbs - * since the kernel's mapping was changed while we were in userspace. + * But we mark the kernel's cr3 as "active" for TLB coherency evaluation + * If, however, the CPU's invalid TLB flag is set, we have to invalidate the TLB + * since the kernel pagetables were changed while we were in userspace. * - * But: if global no_shared_cr3 is TRUE we do switch to the kernel's cr3 + * For threads with a mapped pagezero (some WINE games) on non-SMAP platforms, + * we switch to the kernel's address space on entry. Also, + * if the global no_shared_cr3 is TRUE we do switch to the kernel's cr3 * so that illicit accesses to userspace can be trapped. */ mov %gs:CPU_KERNEL_CR3, %rcx @@ -298,8 +300,14 @@ L_common_dispatch: test $3, %esi /* user/kernel? 
*/ jz 2f /* skip cr3 reload from kernel */ xor %rbp, %rbp + cmpl $0, %gs:CPU_PAGEZERO_MAPPED + jnz 11f cmpl $0, EXT(no_shared_cr3)(%rip) je 2f +11: + xor %eax, %eax + movw %gs:CPU_KERNEL_PCID, %ax + or %rax, %rcx mov %rcx, %cr3 /* load kernel cr3 */ jmp 4f /* and skip tlb flush test */ 2: @@ -371,18 +379,21 @@ Entry(ret_to_user) mov %rcx, %gs:CPU_DR7 2: /* - * On exiting the kernel there's no need to switch cr3 since we're + * On exiting the kernel there's typically no need to switch cr3 since we're * already running in the user's address space which includes the - * kernel. Nevertheless, we now mark the task's cr3 as active. - * But, if no_shared_cr3 is set, we do need to switch cr3 at this point. + * kernel. We now mark the task's cr3 as active, for TLB coherency. + * If the target address space has a pagezero mapping present, or + * if no_shared_cr3 is set, we do need to switch cr3 at this point. */ mov %gs:CPU_TASK_CR3, %rcx mov %rcx, %gs:CPU_ACTIVE_CR3 + cmpl $0, %gs:CPU_PAGEZERO_MAPPED + jnz L_cr3_switch_island movl EXT(no_shared_cr3)(%rip), %eax test %eax, %eax /* -no_shared_cr3 */ - jz 3f - mov %rcx, %cr3 -3: + jnz L_cr3_switch_island + +L_cr3_switch_return: mov %gs:CPU_DR7, %rax /* Is there a debug control register?*/ cmp $0, %rax je 4f @@ -451,6 +462,7 @@ EXT(ret32_set_gs): EXT(ret32_iret): iretq /* return from interrupt */ + L_fast_exit: pop %rdx /* user return eip */ pop %rcx /* pop and toss cs */ @@ -460,6 +472,13 @@ L_fast_exit: sti /* interrupts enabled after sysexit */ sysexitl /* 32-bit sysexit */ +L_cr3_switch_island: + xor %eax, %eax + movw %gs:CPU_ACTIVE_PCID, %ax + or %rax, %rcx + mov %rcx, %cr3 + jmp L_cr3_switch_return + ret_to_kernel: #if DEBUG_IDT64 cmpl $(SS_64), SS_FLAVOR(%r15) /* 64-bit state? 
*/ @@ -678,16 +697,8 @@ Entry(idt64_debug) Entry(idt64_double_fault) PUSH_FUNCTION(HNDL_DOUBLE_FAULT) pushq $(T_DOUBLE_FAULT) + jmp L_dispatch_kernel - push %rax - leaq EXT(idt64_syscall)(%rip), %rax - cmp %rax, ISF64_RIP+8(%rsp) - pop %rax - jne L_dispatch_kernel - - mov ISF64_RSP(%rsp), %rsp - jmp L_syscall_continue - /* * For GP/NP/SS faults, we use the IST1 stack. diff --git a/osfmk/x86_64/locore.s b/osfmk/x86_64/locore.s index bcdbb9779..8620c19fc 100644 --- a/osfmk/x86_64/locore.s +++ b/osfmk/x86_64/locore.s @@ -79,7 +79,6 @@ #define RECOVERY_SECTION .section __VECTORS, __recover #else #define RECOVERY_SECTION .text -#define RECOVERY_SECTION .text #endif #define RECOVER_TABLE_START \ @@ -304,6 +303,41 @@ _bcopystr_fail: movl $(EFAULT),%eax /* return error for failure */ ret +/* + * Copyin 32 or 64 bit aligned word as a single transaction + * rdi: source address (user) + * rsi: destination address (kernel) + * rdx: size (4 or 8) + */ +Entry(_copyin_word) + pushq %rbp /* Save registers */ + movq %rsp, %rbp + cmpl $0x4, %edx /* If size = 4 */ + je L_copyin_word_4 /* handle 32-bit load */ + movl $(EINVAL), %eax /* Set up error status */ + cmpl $0x8, %edx /* If size != 8 */ + jne L_copyin_word_exit /* exit with error */ + RECOVERY_SECTION + RECOVER(L_copyin_word_fail) /* Set up recovery handler for next instruction*/ + movq (%rdi), %rax /* Load quad from user */ + jmp L_copyin_word_store +L_copyin_word_4: + RECOVERY_SECTION + RECOVER(L_copyin_word_fail) /* Set up recovery handler for next instruction */ + movl (%rdi), %eax /* Load long from user */ +L_copyin_word_store: + movq %rax, (%rsi) /* Store to kernel */ + xorl %eax, %eax /* Return success */ +L_copyin_word_exit: + popq %rbp /* Restore registers */ + retq /* Return */ + +L_copyin_word_fail: + movl $(EFAULT), %eax /* Return error for failure */ + popq %rbp /* Restore registers */ + retq /* Return */ + + /* * Done with recovery table. 
*/ diff --git a/osfmk/x86_64/loose_ends.c b/osfmk/x86_64/loose_ends.c index 35ee768fc..161b397c3 100644 --- a/osfmk/x86_64/loose_ends.c +++ b/osfmk/x86_64/loose_ends.c @@ -536,23 +536,15 @@ memmove(void *dst, const void *src, size_t ulen) size_t strlen( - register const char *string) + const char *string) { - register const char *ret = string; + const char *ret = string; while (*string++ != '\0') continue; return string - 1 - ret; } -uint32_t -hw_compare_and_store(uint32_t oldval, uint32_t newval, volatile uint32_t *dest) -{ - return OSCompareAndSwap((UInt32)oldval, - (UInt32)newval, - (volatile UInt32 *)dest); -} - #if MACH_ASSERT /* diff --git a/osfmk/x86_64/lz4_decode_x86_64.s b/osfmk/x86_64/lz4_decode_x86_64.s new file mode 100644 index 000000000..ae0c69324 --- /dev/null +++ b/osfmk/x86_64/lz4_decode_x86_64.s @@ -0,0 +1,371 @@ +#include +#if LZ4_ENABLE_ASSEMBLY_DECODE_X86_64 + +/* + + int64_t lz4_decode_asm( + uint8_t ** dst_ptr, *dst_ptr points to next output byte to write + uint8_t * dst_begin, points to first valid output byte we can access, dst_begin <= dst + uint8_t * dst_end, "relaxed" end of output buffer (see below) + const uint8_t ** src_ptr, *src_ptr points to next input byte to read + const uint8_t * src_end) "relaxed" end of input buffer (see below) + + We test the position of the pointers only to ensure we don't access past src_end/dst_end + some fixed constant. + We never read before dst_begin. + + Return 0 on success, -1 on failure + On output, (*src_ptr,*dst_ptr) receives the last position in both buffers corresponding to the beginning of a LZ4 instruction. 
+ +*/ + +#if MSVC_CALLING_CONVENTIONS +#error TODO implement MSVC calling conventions for LZ4 x86_64 assembly +#endif + +// %rax and %rbx are free to use + +#define dst %rdi // arg0 +#define dst_begin %rsi // arg1 +#define dst_end %rdx // arg2 +#define src %rcx // arg3 +#define src_end %r8 // arg4 + +#define n_literals %r9 +#define n_matches %r10 + +#define copy_src %r11 // match/literal copy source +#define copy_dst %r12 // match/literal copy destination +#define match_distance %r13 // match distance + +#define src_good %r14 +#define dst_good %r15 + +.globl _lz4_decode_asm + +.macro establish_frame + push %rbp + mov %rsp,%rbp + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 +.endm + +.macro clear_frame_and_return + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbx + pop %rbp +#ifdef __AVX2__ + vzeroupper +#endif + ret +.endm + +// copy_1x16 SOURCE_ADDR DESTINATION_ADDR +// Copy 16 bytes, clobber: xmm0 +.macro copy_1x16 +#ifdef __AVX2__ + vmovdqu ($0),%xmm0 + vmovdqu %xmm0,($1) +#else + movdqu ($0),%xmm0 + movdqu %xmm0,($1) +#endif +.endm + +// copy_1x16_and_increment SOURCE_ADDR DESTINATION_ADDR +// Copy 16 bytes, and increment both addresses by 16, clobber: xmm0 +.macro copy_1x16_and_increment +#ifdef __AVX2__ + vmovdqu ($0),%xmm0 + vmovdqu %xmm0,($1) +#else + movdqu ($0),%xmm0 + movdqu %xmm0,($1) +#endif + add $$16,$0 + add $$16,$1 +.endm + +// copy_2x16_and_increment SOURCE_ADDR DESTINATION_ADDR +// Copy 2 times 16 bytes, and increment both addresses by 32, clobber: xmm0 +.macro copy_2x16_and_increment +#ifdef __AVX2__ + vmovdqu ($0),%xmm0 + vmovdqu %xmm0,($1) + vmovdqu 16($0),%xmm0 + vmovdqu %xmm0,16($1) +#else + movdqu ($0),%xmm0 + movdqu %xmm0,($1) + movdqu 16($0),%xmm0 + movdqu %xmm0,16($1) +#endif + add $$32,$0 + add $$32,$1 +.endm + +// copy_1x32_and_increment SOURCE_ADDR DESTINATION_ADDR +// Copy 32 bytes, and increment both addresses by 32, clobber: xmm0,xmm1 +.macro copy_1x32_and_increment +#ifdef __AVX2__ + vmovdqu ($0),%ymm0 + vmovdqu 
%ymm0,($1) +#else + movdqu ($0),%xmm0 + movdqu 16($0),%xmm1 + movdqu %xmm0,($1) + movdqu %xmm1,16($1) +#endif + add $$32,$0 + add $$32,$1 +.endm + +.macro check_src_end + cmp src,src_end + jbe L_done // done if src >= src_end +.endm + +.macro check_dst_end + cmp dst,dst_end + jbe L_done // done if dst >= dst_end +.endm + +.text +.p2align 6 +_lz4_decode_asm: + establish_frame + push dst // keep uint8_t ** dst on stack + mov (dst),dst // load current dst from *dst + push src // keep const uint8_t ** src on stack + mov (src),src // load current src from *src + +L_decode_command: + // Keep last known good command + mov dst,dst_good + mov src,src_good + + // Check limits + check_src_end + check_dst_end + + // Decode command + movzb (src),%rax // read command byte LLLLMMMM + add $1,src + mov %rax,n_literals + shr $4,n_literals // n_literals in 0..15 + mov %rax,n_matches + and $0xf,n_matches + add $4,n_matches // n_matches in 4..19 + + // Short literal? + cmp $15,n_literals + je L_decode_long_literal + + // Copy literals, n_literals <= 14: copy 16 bytes +L_copy_short_literal: + copy_1x16 src,dst + add n_literals,src // src += n_literals + add n_literals,dst // dst += n_literals + jmp L_expand_match // continue to match + + // the number of literals is encoded on more bytes, we need to decode them +L_decode_long_literal: + check_src_end // required here, since we may loop an arbitrarily high number of times + movzb (src),%rax + add $1,src + add %rax,n_literals + cmp $255,%rax + je L_decode_long_literal + + // Copy literals, n_literals >= 15 +L_copy_long_literal: + mov src,copy_src // literal copy source + mov dst,copy_dst // literal copy destination + add n_literals,src // update src,dst for next step + add n_literals,dst + check_src_end // required here, since n_literals can be arbitrarily high + check_dst_end + + // fixed + loop + copy_1x32_and_increment copy_src,copy_dst + copy_1x32_and_increment copy_src,copy_dst +L_copy_long_literal_loop: + copy_1x32_and_increment 
copy_src,copy_dst + cmp copy_dst,dst + ja L_copy_long_literal_loop + // continue to match + +L_expand_match: + // Load match distance, and get match copy source + movzw (src),match_distance + add $2,src + test match_distance,match_distance + jz L_fail // match_distance == 0: FAIL + mov dst,copy_src + sub match_distance,copy_src // copy_src = match copy source + cmp copy_src,dst_begin + ja L_fail // dst_begin > copy_src: FAIL + + // Long n_matches encoding? + cmp $19,n_matches + je L_decode_long_match // unlikely + // Long n_matches with short encoding (17 or 18)? + cmp $16,n_matches + ja L_long_match // unlikely + + // Copy match, n_matches <= 16 +L_copy_short_match: + cmp $16,match_distance + jb L_copy_short_match_overlap + + // Copy match, n_matches <= 16 and match_distance >= 16: copy 16 bytes + copy_1x16 copy_src,dst + add n_matches,dst // update dst + jmp L_decode_command // to next command + + // Copy match, n_matches <= 16 and match_distance < 16: replicate pattern +L_copy_short_match_overlap: + lea L_match_permtable(%rip),%rax + shl $5,match_distance +#ifdef __AVX2__ + vmovdqa (%rax,match_distance),%xmm2 // pattern address is match_permtable + 32 * match_distance + vmovdqu (copy_src),%xmm0 // read the bytes to replicate. exactly match_distance bytes are needed, but we load 16 + vpshufb %xmm2,%xmm0,%xmm0 // replicate the pattern in xmm0 + vmovdqu %xmm0,(dst) // and store the result +#else + movdqa (%rax,match_distance),%xmm2 // pattern address is match_permtable + 32 * match_distance + movdqu (copy_src),%xmm0 // read the bytes to replicate. 
exactly match_distance bytes are needed, but we load 16 + pshufb %xmm2,%xmm0 // replicate the pattern in xmm0 + movdqu %xmm0,(dst) // and store the result +#endif + add n_matches,dst // update dst + jmp L_decode_command // to next command + + // n_matches == 19: the number of matches in encoded on more bytes, we need to decode them +L_decode_long_match: + mov $255,%rbx +L_decode_long_match_loop: + check_src_end // required here, since we may loop an arbitrarily high number of times + mov (src),%rax + add $1,src + and %rbx,%rax + add %rax,n_matches + cmp %rbx,%rax + je L_decode_long_match_loop + + // n_matches > 16 +L_long_match: + mov dst,copy_dst // copy_dst = match copy destination + add n_matches,dst // update dst + check_dst_end // n_matches may be arbitrarily high + + cmp $16,match_distance + jb L_copy_long_match_overlap // match_distance < 16: overlapping copy + + // Copy match, n_matches >= 16, match_distance >= 16 + // fixed + loop + copy_1x16_and_increment copy_src,copy_dst +L_copy_long_match_loop: + copy_2x16_and_increment copy_src,copy_dst + cmp copy_dst,dst + ja L_copy_long_match_loop + jmp L_decode_command // to next command + + // Copy match, n_matches >= 16, match_distance < 16: replicate pattern +L_copy_long_match_overlap: + lea L_match_permtable(%rip),%rax + mov match_distance,%rbx + shl $5,%rbx +#ifdef __AVX2__ + vmovdqu (copy_src),%xmm0 // read the bytes to replicate. exactly match_distance bytes are needed, but we load 16 + vmovdqa %xmm0,%xmm1 // keep a copy for the high bytes + vmovdqa (%rax,%rbx),%xmm2 // pattern for low 16 bytes + vpshufb %xmm2,%xmm0,%xmm0 // replicate the pattern in xmm0 + vmovdqa 16(%rax,%rbx),%xmm2 // pattern for high 16 bytes + vpshufb %xmm2,%xmm1,%xmm1 // replicate the pattern in xmm1 + vinserti128 $1,%xmm1,%ymm0,%ymm0 // store all 32 bytes into a single register +#else + movdqu (copy_src),%xmm0 // read the bytes to replicate. 
exactly match_distance bytes are needed, but we load 16 + movdqa %xmm0,%xmm1 // keep a copy for the high bytes + movdqa (%rax,%rbx),%xmm2 // pattern for low 16 bytes + pshufb %xmm2,%xmm0 // replicate the pattern in xmm0 + movdqa 16(%rax,%rbx),%xmm2 // pattern for high 16 bytes + pshufb %xmm2,%xmm1 // replicate the pattern in xmm1 +#endif + // Here, %xmm0:%xmm1 (or %ymm0 for AVX2) is a 32-byte pattern replicating the first match_distance bytes up to 32 bytes + lea L_match_disttable(%rip),%rax + movzb (%rax,match_distance),%rax // and %rax is now the usable length of this pattern, the largest multiple of match_distance less than or equal to 32. + + // fixed +#ifdef __AVX2__ + vmovdqu %ymm0,(copy_dst) +#else + movdqu %xmm0,(copy_dst) + movdqu %xmm1,16(copy_dst) +#endif + add %rax,copy_dst +L_copy_long_match_overlap_loop: + // loop +#ifdef __AVX2__ + vmovdqu %ymm0,(copy_dst) +#else + movdqu %xmm0,(copy_dst) + movdqu %xmm1,16(copy_dst) +#endif + add %rax,copy_dst + cmp copy_dst,dst + ja L_copy_long_match_overlap + jmp L_decode_command // to next command + +L_fail: + xor %rax,%rax + dec %rax // -1 + jmp L_exit + +L_done: + xor %rax,%rax + // continue to exit + +L_exit: + pop src + mov src_good,(src) + pop dst + mov dst_good,(dst) + clear_frame_and_return + +// permutation tables for short distance matches, 32 byte result, for match_distance = 0 to 15 +// value(d)[i] = i%d for i = 0..31 +.p2align 6 +L_match_permtable: +.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // 0 +.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // 1 +.byte 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 // 2 +.byte 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1 // 3 +.byte 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 // 4 +.byte 0, 1, 2, 3, 4, 0, 1, 
2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1 // 5 +.byte 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1 // 6 +.byte 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3 // 7 +.byte 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 // 8 +.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4 // 9 +.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1 // 10 +.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 // 11 +.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11, 0, 1, 2, 3, 4, 5, 6, 7 // 12 +.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12, 0, 1, 2, 3, 4, 5 // 13 +.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13, 0, 1, 2, 3 // 14 +.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14, 0, 1 // 15 + +// valid repeating pattern size, for each match_distance = 0 to 15 +// value(d) = 32 - (32%d), is the largest a multiple of d <= 32 +.p2align 6 +L_match_disttable: +.byte 32,32,32,30 // 0 .. 3 +.byte 16,30,30,28 // 4 .. 7 +.byte 16,27,30,22 // 8 .. 11 +.byte 24,26,28,30 // 12 .. 
15 + +#endif // LZ4_ENABLE_ASSEMBLY_DECODE_X86_64 diff --git a/osfmk/x86_64/pmap.c b/osfmk/x86_64/pmap.c index a73b9e26c..ff941d2e2 100644 --- a/osfmk/x86_64/pmap.c +++ b/osfmk/x86_64/pmap.c @@ -145,6 +145,9 @@ #include +#if MACH_ASSERT +int pmap_stats_assert = 1; +#endif /* MACH_ASSERT */ #ifdef IWANTTODEBUG #undef DEBUG @@ -166,9 +169,15 @@ boolean_t pmap_trace = FALSE; boolean_t no_shared_cr3 = DEBUG; /* TRUE for DEBUG by default */ -int nx_enabled = 1; /* enable no-execute protection */ +int nx_enabled = 1; /* enable no-execute protection -- set during boot */ + +#if DEBUG || DEVELOPMENT int allow_data_exec = VM_ABI_32; /* 32-bit apps may execute data by default, 64-bit apps may not */ int allow_stack_exec = 0; /* No apps may execute from the stack by default */ +#else /* DEBUG || DEVELOPMENT */ +const int allow_data_exec = VM_ABI_32; /* 32-bit apps may execute data by default, 64-bit apps may not */ +const int allow_stack_exec = 0; /* No apps may execute from the stack by default */ +#endif /* DEBUG || DEVELOPMENT */ const boolean_t cpu_64bit = TRUE; /* Mais oui! */ @@ -195,9 +204,9 @@ zone_t pv_hashed_list_zone; /* zone of pv_hashed_entry structures */ */ boolean_t pmap_initialized = FALSE;/* Has pmap_init completed? */ -static struct vm_object kptobj_object_store; -static struct vm_object kpml4obj_object_store; -static struct vm_object kpdptobj_object_store; +static struct vm_object kptobj_object_store __attribute__((aligned(VM_PACKED_POINTER_ALIGNMENT))); +static struct vm_object kpml4obj_object_store __attribute__((aligned(VM_PACKED_POINTER_ALIGNMENT))); +static struct vm_object kpdptobj_object_store __attribute__((aligned(VM_PACKED_POINTER_ALIGNMENT))); /* * Array of physical page attribites for managed pages. 
@@ -251,7 +260,6 @@ caddr_t DADDR2; boolean_t pmap_disable_kheap_nx = FALSE; boolean_t pmap_disable_kstack_nx = FALSE; -extern boolean_t doconstro_override; extern long __stack_chk_guard[]; @@ -294,7 +302,7 @@ extern vm_offset_t eHIB; extern vm_offset_t stext; extern vm_offset_t etext; extern vm_offset_t sdata, edata; -extern vm_offset_t sconstdata, econstdata; +extern vm_offset_t sconst, econst; extern void *KPTphys; @@ -321,17 +329,28 @@ pmap_cpu_init(void) cdp->cpu_task_map = TASK_MAP_64BIT; pmap_pcid_configure(); if (cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_SMEP) { + pmap_smep_enabled = TRUE; +#if DEVELOPMENT || DEBUG boolean_t nsmep; - if (!PE_parse_boot_argn("-pmap_smep_disable", &nsmep, sizeof(nsmep))) { + if (PE_parse_boot_argn("-pmap_smep_disable", &nsmep, sizeof(nsmep))) { + pmap_smep_enabled = FALSE; + } +#endif + if (pmap_smep_enabled) { set_cr4(get_cr4() | CR4_SMEP); - pmap_smep_enabled = TRUE; } + } if (cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_SMAP) { + pmap_smap_enabled = TRUE; +#if DEVELOPMENT || DEBUG boolean_t nsmap; - if (!PE_parse_boot_argn("-pmap_smap_disable", &nsmap, sizeof(nsmap))) { + if (PE_parse_boot_argn("-pmap_smap_disable", &nsmap, sizeof(nsmap))) { + pmap_smap_enabled = FALSE; + } +#endif + if (pmap_smap_enabled) { set_cr4(get_cr4() | CR4_SMAP); - pmap_smap_enabled = TRUE; } } @@ -514,6 +533,12 @@ pmap_bootstrap( kprintf("Kernel traces for pmap operations enabled\n"); } #endif /* PMAP_TRACES */ + +#if MACH_ASSERT + PE_parse_boot_argn("pmap_stats_assert", + &pmap_stats_assert, + sizeof (pmap_stats_assert)); +#endif /* MACH_ASSERT */ } void @@ -613,7 +638,7 @@ hibernate_teardown_pmap_structs(addr64_t *unneeded_start, addr64_t *unneeded_end void hibernate_rebuild_pmap_structs(void) { - int32_t cindx, eindx, rindx; + int32_t cindx, eindx, rindx = 0; pv_rooted_entry_t pv_h; eindx = (int32_t)pmap_npages; @@ -784,7 +809,7 @@ pmap_init(void) pv_rooted_entry_t pv_e; pv_e = pai_to_pvh(ppn); - pv_e->va = vaddr; + pv_e->va_and_flags = 
vaddr; vaddr += PAGE_SIZE; pv_e->pmap = kernel_pmap; queue_init(&pv_e->qlink); @@ -887,9 +912,9 @@ void pmap_mark_range(pmap_t npmap, uint64_t sv, uint64_t nxrosz, boolean_t NX, b * The now unused level-1 PTE pages are also freed. */ extern ppnum_t vm_kernel_base_page; -void -pmap_lowmem_finalize(void) -{ +static uint32_t constptes = 0, dataptes = 0; + +void pmap_lowmem_finalize(void) { spl_t spl; int i; @@ -1052,48 +1077,45 @@ pmap_lowmem_finalize(void) } boolean_t doconstro = TRUE; - +#if DEVELOPMENT || DEBUG (void) PE_parse_boot_argn("dataconstro", &doconstro, sizeof(doconstro)); - - if ((sconstdata | econstdata) & PAGE_MASK) { - kprintf("Const DATA misaligned 0x%lx 0x%lx\n", sconstdata, econstdata); - if ((sconstdata & PAGE_MASK) || (doconstro_override == FALSE)) - doconstro = FALSE; - } - - if ((sconstdata > edata) || (sconstdata < sdata) || ((econstdata - sconstdata) >= (edata - sdata))) { - kprintf("Const DATA incorrect size 0x%lx 0x%lx 0x%lx 0x%lx\n", sconstdata, econstdata, sdata, edata); - doconstro = FALSE; - } - - if (doconstro) +#endif + if (doconstro) { + if (sconst & PAGE_MASK) { + panic("CONST segment misaligned 0x%lx 0x%lx\n", + sconst, econst); + } kprintf("Marking const DATA read-only\n"); - + } + vm_offset_t dva; for (dva = sdata; dva < edata; dva += I386_PGBYTES) { assert(((sdata | edata) & PAGE_MASK) == 0); - if ( (sdata | edata) & PAGE_MASK) { - kprintf("DATA misaligned, 0x%lx, 0x%lx\n", sdata, edata); - break; - } + pt_entry_t dpte, *dptep = pmap_pte(kernel_pmap, dva); + dpte = *dptep; + assert((dpte & INTEL_PTE_VALID)); + dpte |= INTEL_PTE_NX; + pmap_store_pte(dptep, dpte); + dataptes++; + } + assert(dataptes > 0); + + for (dva = sconst; dva < econst; dva += I386_PGBYTES) { pt_entry_t dpte, *dptep = pmap_pte(kernel_pmap, dva); dpte = *dptep; assert((dpte & INTEL_PTE_VALID)); - if ((dpte & INTEL_PTE_VALID) == 0) { - kprintf("Missing data mapping 0x%lx 0x%lx 0x%lx\n", dva, sdata, edata); - continue; - } - dpte |= INTEL_PTE_NX; - if (doconstro 
&& (dva >= sconstdata) && (dva < econstdata)) { - dpte &= ~INTEL_PTE_WRITE; - } + dpte &= ~INTEL_PTE_WRITE; + constptes++; pmap_store_pte(dptep, dpte); } + + assert(constptes > 0); + kernel_segment_command_t * seg; kernel_section_t * sec; @@ -1303,8 +1325,12 @@ pmap_create_options( p->ledger = ledger; p->pm_task_map = ((flags & PMAP_CREATE_64BIT) ? TASK_MAP_64BIT : TASK_MAP_32BIT); - if (pmap_pcid_ncpus) + + p->pagezero_accessible = FALSE; + + if (pmap_pcid_ncpus) { pmap_pcid_initialize(p); + } p->pm_pml4 = zalloc(pmap_anchor_zone); @@ -1343,6 +1369,11 @@ pmap_create_options( pml4[KERNEL_PHYSMAP_PML4_INDEX] = kpml4[KERNEL_PHYSMAP_PML4_INDEX]; } +#if MACH_ASSERT + p->pmap_pid = 0; + strlcpy(p->pmap_procname, "", sizeof (p->pmap_procname)); +#endif /* MACH_ASSERT */ + PMAP_TRACE(PMAP_CODE(PMAP__CREATE) | DBG_FUNC_START, p, flags, 0, 0, 0); @@ -1358,6 +1389,103 @@ pmap_create( return pmap_create_options(ledger, sz, ((is_64bit) ? PMAP_CREATE_64BIT : 0)); } +/* + * We maintain stats and ledgers so that a task's physical footprint is: + * phys_footprint = ((internal - alternate_accounting) + * + (internal_compressed - alternate_accounting_compressed) + * + iokit_mapped + * + purgeable_nonvolatile + * + purgeable_nonvolatile_compressed + * + page_table) + * where "alternate_accounting" includes "iokit" and "purgeable" memory. 
+ */ + +#if MACH_ASSERT +struct { + uint64_t num_pmaps_checked; + + int phys_footprint_over; + ledger_amount_t phys_footprint_over_total; + ledger_amount_t phys_footprint_over_max; + int phys_footprint_under; + ledger_amount_t phys_footprint_under_total; + ledger_amount_t phys_footprint_under_max; + + int internal_over; + ledger_amount_t internal_over_total; + ledger_amount_t internal_over_max; + int internal_under; + ledger_amount_t internal_under_total; + ledger_amount_t internal_under_max; + + int internal_compressed_over; + ledger_amount_t internal_compressed_over_total; + ledger_amount_t internal_compressed_over_max; + int internal_compressed_under; + ledger_amount_t internal_compressed_under_total; + ledger_amount_t internal_compressed_under_max; + + int iokit_mapped_over; + ledger_amount_t iokit_mapped_over_total; + ledger_amount_t iokit_mapped_over_max; + int iokit_mapped_under; + ledger_amount_t iokit_mapped_under_total; + ledger_amount_t iokit_mapped_under_max; + + int alternate_accounting_over; + ledger_amount_t alternate_accounting_over_total; + ledger_amount_t alternate_accounting_over_max; + int alternate_accounting_under; + ledger_amount_t alternate_accounting_under_total; + ledger_amount_t alternate_accounting_under_max; + + int alternate_accounting_compressed_over; + ledger_amount_t alternate_accounting_compressed_over_total; + ledger_amount_t alternate_accounting_compressed_over_max; + int alternate_accounting_compressed_under; + ledger_amount_t alternate_accounting_compressed_under_total; + ledger_amount_t alternate_accounting_compressed_under_max; + + int page_table_over; + ledger_amount_t page_table_over_total; + ledger_amount_t page_table_over_max; + int page_table_under; + ledger_amount_t page_table_under_total; + ledger_amount_t page_table_under_max; + + int purgeable_volatile_over; + ledger_amount_t purgeable_volatile_over_total; + ledger_amount_t purgeable_volatile_over_max; + int purgeable_volatile_under; + ledger_amount_t 
purgeable_volatile_under_total; + ledger_amount_t purgeable_volatile_under_max; + + int purgeable_nonvolatile_over; + ledger_amount_t purgeable_nonvolatile_over_total; + ledger_amount_t purgeable_nonvolatile_over_max; + int purgeable_nonvolatile_under; + ledger_amount_t purgeable_nonvolatile_under_total; + ledger_amount_t purgeable_nonvolatile_under_max; + + int purgeable_volatile_compressed_over; + ledger_amount_t purgeable_volatile_compressed_over_total; + ledger_amount_t purgeable_volatile_compressed_over_max; + int purgeable_volatile_compressed_under; + ledger_amount_t purgeable_volatile_compressed_under_total; + ledger_amount_t purgeable_volatile_compressed_under_max; + + int purgeable_nonvolatile_compressed_over; + ledger_amount_t purgeable_nonvolatile_compressed_over_total; + ledger_amount_t purgeable_nonvolatile_compressed_over_max; + int purgeable_nonvolatile_compressed_under; + ledger_amount_t purgeable_nonvolatile_compressed_under_total; + ledger_amount_t purgeable_nonvolatile_compressed_under_max; +} pmap_ledgers_drift; +static void pmap_check_ledgers(pmap_t pmap); +#else /* MACH_ASSERT */ +static inline void pmap_check_ledgers(__unused pmap_t pmap) {} +#endif /* MACH_ASSERT */ + /* * Retire the given physical map from service. * Should only be called if the map contains @@ -1422,6 +1550,8 @@ pmap_destroy(pmap_t p) OSAddAtomic(-inuse_ptepages, &inuse_ptepages_count); PMAP_ZINFO_PFREE(p, inuse_ptepages * PAGE_SIZE); + + pmap_check_ledgers(p); ledger_dereference(p->ledger); zfree(pmap_zone, p); @@ -1629,7 +1759,7 @@ pmap_expand_pml4( * put the page into the pmap's obj list so it * can be found later. */ - pn = m->phys_page; + pn = VM_PAGE_GET_PHYS_PAGE(m); pa = i386_ptob(pn); i = pml4idx(map, vaddr); @@ -1719,7 +1849,7 @@ pmap_expand_pdpt(pmap_t map, vm_map_offset_t vaddr, unsigned int options) * put the page into the pmap's obj list so it * can be found later. 
*/ - pn = m->phys_page; + pn = VM_PAGE_GET_PHYS_PAGE(m); pa = i386_ptob(pn); i = pdptidx(map, vaddr); @@ -1803,8 +1933,8 @@ pmap_expand( unsigned int options) { pt_entry_t *pdp; - register vm_page_t m; - register pmap_paddr_t pa; + vm_page_t m; + pmap_paddr_t pa; uint64_t i; ppnum_t pn; boolean_t is_ept = is_ept_pmap(map); @@ -1839,7 +1969,7 @@ pmap_expand( * put the page into the pmap's obj list so it * can be found later. */ - pn = m->phys_page; + pn = VM_PAGE_GET_PHYS_PAGE(m); pa = i386_ptob(pn); i = pdeidx(map, vaddr); @@ -2003,7 +2133,7 @@ void pmap_collect( pmap_t p) { - register pt_entry_t *pdp, *ptp; + pt_entry_t *pdp, *ptp; pt_entry_t *eptp; int wired; boolean_t is_ept; @@ -2040,7 +2170,7 @@ pmap_collect( */ wired = 0; { - register pt_entry_t *ptep; + pt_entry_t *ptep; for (ptep = ptp; ptep < eptp; ptep++) { if (iswired(*ptep)) { wired = 1; @@ -2068,7 +2198,7 @@ pmap_collect( * And free the pte page itself. */ { - register vm_page_t m; + vm_page_t m; vm_object_lock(p->pm_obj); @@ -2190,7 +2320,7 @@ pmap_list_resident_pages( #endif /* MACH_VM_DEBUG */ - +#if CONFIG_COREDUMP /* temporary workaround */ boolean_t coredumpok(__unused vm_map_t map, __unused vm_offset_t va) @@ -2206,7 +2336,7 @@ coredumpok(__unused vm_map_t map, __unused vm_offset_t va) return TRUE; #endif } - +#endif boolean_t phys_page_exists(ppnum_t pn) @@ -2289,7 +2419,7 @@ pmap_flush_context_init(pmap_flush_context *pfc) pfc->pfc_invalid_global = 0; } -extern unsigned TLBTimeOut; +extern uint64_t TLBTimeOut; void pmap_flush( pmap_flush_context *pfc) @@ -2428,7 +2558,7 @@ pmap_flush_tlbs(pmap_t pmap, vm_map_offset_t startv, vm_map_offset_t endv, int o { unsigned int cpu; unsigned int cpu_bit; - cpumask_t cpus_to_signal; + cpumask_t cpus_to_signal = 0; unsigned int my_cpu = cpu_number(); pmap_paddr_t pmap_cr3 = pmap->pm_cr3; boolean_t flush_self = FALSE; @@ -2469,8 +2599,6 @@ pmap_flush_tlbs(pmap_t pmap, vm_map_offset_t startv, vm_map_offset_t endv, int o * For idle cpus (with no active map) 
we mark them invalid but * don't signal -- they'll check as they go busy. */ - cpus_to_signal = 0; - if (pmap_pcid_ncpus) { if (pmap_is_shared) need_global_flush = TRUE; @@ -2757,3 +2885,354 @@ pmap_permissions_verify(pmap_t ipmap, vm_map_t ivmmap, vm_offset_t sv, vm_offset } return rv; } + +#if MACH_ASSERT +extern int pmap_ledgers_panic; +static void +pmap_check_ledgers( + pmap_t pmap) +{ + ledger_amount_t bal; + int pid; + char *procname; + boolean_t do_panic; + + if (pmap->pmap_pid == 0) { + /* + * This pmap was not or is no longer fully associated + * with a task (e.g. the old pmap after a fork()/exec() or + * spawn()). Its "ledger" still points at a task that is + * now using a different (and active) address space, so + * we can't check that all the pmap ledgers are balanced here. + * + * If the "pid" is set, that means that we went through + * pmap_set_process() in task_terminate_internal(), so + * this task's ledger should not have been re-used and + * all the pmap ledgers should be back to 0. 
+ */ + return; + } + + do_panic = FALSE; + pid = pmap->pmap_pid; + procname = pmap->pmap_procname; + + pmap_ledgers_drift.num_pmaps_checked++; + + ledger_get_balance(pmap->ledger, + task_ledgers.phys_footprint, + &bal); + if (bal != 0) { + do_panic = TRUE; + printf("LEDGER BALANCE proc %d (%s) " + "\"phys_footprint\" = %lld\n", + pid, procname, bal); + if (bal > 0) { + pmap_ledgers_drift.phys_footprint_over++; + pmap_ledgers_drift.phys_footprint_over_total += bal; + if (bal > pmap_ledgers_drift.phys_footprint_over_max) { + pmap_ledgers_drift.phys_footprint_over_max = bal; + } + } else { + pmap_ledgers_drift.phys_footprint_under++; + pmap_ledgers_drift.phys_footprint_under_total += bal; + if (bal < pmap_ledgers_drift.phys_footprint_under_max) { + pmap_ledgers_drift.phys_footprint_under_max = bal; + } + } + } + ledger_get_balance(pmap->ledger, + task_ledgers.internal, + &bal); + if (bal != 0) { + do_panic = TRUE; + printf("LEDGER BALANCE proc %d (%s) " + "\"internal\" = %lld\n", + pid, procname, bal); + if (bal > 0) { + pmap_ledgers_drift.internal_over++; + pmap_ledgers_drift.internal_over_total += bal; + if (bal > pmap_ledgers_drift.internal_over_max) { + pmap_ledgers_drift.internal_over_max = bal; + } + } else { + pmap_ledgers_drift.internal_under++; + pmap_ledgers_drift.internal_under_total += bal; + if (bal < pmap_ledgers_drift.internal_under_max) { + pmap_ledgers_drift.internal_under_max = bal; + } + } + } + ledger_get_balance(pmap->ledger, + task_ledgers.internal_compressed, + &bal); + if (bal != 0) { + do_panic = TRUE; + printf("LEDGER BALANCE proc %d (%s) " + "\"internal_compressed\" = %lld\n", + pid, procname, bal); + if (bal > 0) { + pmap_ledgers_drift.internal_compressed_over++; + pmap_ledgers_drift.internal_compressed_over_total += bal; + if (bal > pmap_ledgers_drift.internal_compressed_over_max) { + pmap_ledgers_drift.internal_compressed_over_max = bal; + } + } else { + pmap_ledgers_drift.internal_compressed_under++; + 
pmap_ledgers_drift.internal_compressed_under_total += bal; + if (bal < pmap_ledgers_drift.internal_compressed_under_max) { + pmap_ledgers_drift.internal_compressed_under_max = bal; + } + } + } + ledger_get_balance(pmap->ledger, + task_ledgers.iokit_mapped, + &bal); + if (bal != 0) { + do_panic = TRUE; + printf("LEDGER BALANCE proc %d (%s) " + "\"iokit_mapped\" = %lld\n", + pid, procname, bal); + if (bal > 0) { + pmap_ledgers_drift.iokit_mapped_over++; + pmap_ledgers_drift.iokit_mapped_over_total += bal; + if (bal > pmap_ledgers_drift.iokit_mapped_over_max) { + pmap_ledgers_drift.iokit_mapped_over_max = bal; + } + } else { + pmap_ledgers_drift.iokit_mapped_under++; + pmap_ledgers_drift.iokit_mapped_under_total += bal; + if (bal < pmap_ledgers_drift.iokit_mapped_under_max) { + pmap_ledgers_drift.iokit_mapped_under_max = bal; + } + } + } + ledger_get_balance(pmap->ledger, + task_ledgers.alternate_accounting, + &bal); + if (bal != 0) { + do_panic = TRUE; + printf("LEDGER BALANCE proc %d (%s) " + "\"alternate_accounting\" = %lld\n", + pid, procname, bal); + if (bal > 0) { + pmap_ledgers_drift.alternate_accounting_over++; + pmap_ledgers_drift.alternate_accounting_over_total += bal; + if (bal > pmap_ledgers_drift.alternate_accounting_over_max) { + pmap_ledgers_drift.alternate_accounting_over_max = bal; + } + } else { + pmap_ledgers_drift.alternate_accounting_under++; + pmap_ledgers_drift.alternate_accounting_under_total += bal; + if (bal < pmap_ledgers_drift.alternate_accounting_under_max) { + pmap_ledgers_drift.alternate_accounting_under_max = bal; + } + } + } + ledger_get_balance(pmap->ledger, + task_ledgers.alternate_accounting_compressed, + &bal); + if (bal != 0) { + do_panic = TRUE; + printf("LEDGER BALANCE proc %d (%s) " + "\"alternate_accounting_compressed\" = %lld\n", + pid, procname, bal); + if (bal > 0) { + pmap_ledgers_drift.alternate_accounting_compressed_over++; + pmap_ledgers_drift.alternate_accounting_compressed_over_total += bal; + if (bal > 
pmap_ledgers_drift.alternate_accounting_compressed_over_max) { + pmap_ledgers_drift.alternate_accounting_compressed_over_max = bal; + } + } else { + pmap_ledgers_drift.alternate_accounting_compressed_under++; + pmap_ledgers_drift.alternate_accounting_compressed_under_total += bal; + if (bal < pmap_ledgers_drift.alternate_accounting_compressed_under_max) { + pmap_ledgers_drift.alternate_accounting_compressed_under_max = bal; + } + } + } + ledger_get_balance(pmap->ledger, + task_ledgers.page_table, + &bal); + if (bal != 0) { + do_panic = TRUE; + printf("LEDGER BALANCE proc %d (%s) " + "\"page_table\" = %lld\n", + pid, procname, bal); + if (bal > 0) { + pmap_ledgers_drift.page_table_over++; + pmap_ledgers_drift.page_table_over_total += bal; + if (bal > pmap_ledgers_drift.page_table_over_max) { + pmap_ledgers_drift.page_table_over_max = bal; + } + } else { + pmap_ledgers_drift.page_table_under++; + pmap_ledgers_drift.page_table_under_total += bal; + if (bal < pmap_ledgers_drift.page_table_under_max) { + pmap_ledgers_drift.page_table_under_max = bal; + } + } + } + ledger_get_balance(pmap->ledger, + task_ledgers.purgeable_volatile, + &bal); + if (bal != 0) { + do_panic = TRUE; + printf("LEDGER BALANCE proc %d (%s) " + "\"purgeable_volatile\" = %lld\n", + pid, procname, bal); + if (bal > 0) { + pmap_ledgers_drift.purgeable_volatile_over++; + pmap_ledgers_drift.purgeable_volatile_over_total += bal; + if (bal > pmap_ledgers_drift.purgeable_volatile_over_max) { + pmap_ledgers_drift.purgeable_volatile_over_max = bal; + } + } else { + pmap_ledgers_drift.purgeable_volatile_under++; + pmap_ledgers_drift.purgeable_volatile_under_total += bal; + if (bal < pmap_ledgers_drift.purgeable_volatile_under_max) { + pmap_ledgers_drift.purgeable_volatile_under_max = bal; + } + } + } + ledger_get_balance(pmap->ledger, + task_ledgers.purgeable_nonvolatile, + &bal); + if (bal != 0) { + do_panic = TRUE; + printf("LEDGER BALANCE proc %d (%s) " + "\"purgeable_nonvolatile\" = %lld\n", + pid, 
procname, bal); + if (bal > 0) { + pmap_ledgers_drift.purgeable_nonvolatile_over++; + pmap_ledgers_drift.purgeable_nonvolatile_over_total += bal; + if (bal > pmap_ledgers_drift.purgeable_nonvolatile_over_max) { + pmap_ledgers_drift.purgeable_nonvolatile_over_max = bal; + } + } else { + pmap_ledgers_drift.purgeable_nonvolatile_under++; + pmap_ledgers_drift.purgeable_nonvolatile_under_total += bal; + if (bal < pmap_ledgers_drift.purgeable_nonvolatile_under_max) { + pmap_ledgers_drift.purgeable_nonvolatile_under_max = bal; + } + } + } + ledger_get_balance(pmap->ledger, + task_ledgers.purgeable_volatile_compressed, + &bal); + if (bal != 0) { + do_panic = TRUE; + printf("LEDGER BALANCE proc %d (%s) " + "\"purgeable_volatile_compressed\" = %lld\n", + pid, procname, bal); + if (bal > 0) { + pmap_ledgers_drift.purgeable_volatile_compressed_over++; + pmap_ledgers_drift.purgeable_volatile_compressed_over_total += bal; + if (bal > pmap_ledgers_drift.purgeable_volatile_compressed_over_max) { + pmap_ledgers_drift.purgeable_volatile_compressed_over_max = bal; + } + } else { + pmap_ledgers_drift.purgeable_volatile_compressed_under++; + pmap_ledgers_drift.purgeable_volatile_compressed_under_total += bal; + if (bal < pmap_ledgers_drift.purgeable_volatile_compressed_under_max) { + pmap_ledgers_drift.purgeable_volatile_compressed_under_max = bal; + } + } + } + ledger_get_balance(pmap->ledger, + task_ledgers.purgeable_nonvolatile_compressed, + &bal); + if (bal != 0) { + do_panic = TRUE; + printf("LEDGER BALANCE proc %d (%s) " + "\"purgeable_nonvolatile_compressed\" = %lld\n", + pid, procname, bal); + if (bal > 0) { + pmap_ledgers_drift.purgeable_nonvolatile_compressed_over++; + pmap_ledgers_drift.purgeable_nonvolatile_compressed_over_total += bal; + if (bal > pmap_ledgers_drift.purgeable_nonvolatile_compressed_over_max) { + pmap_ledgers_drift.purgeable_nonvolatile_compressed_over_max = bal; + } + } else { + pmap_ledgers_drift.purgeable_nonvolatile_compressed_under++; + 
pmap_ledgers_drift.purgeable_nonvolatile_compressed_under_total += bal; + if (bal < pmap_ledgers_drift.purgeable_nonvolatile_compressed_under_max) { + pmap_ledgers_drift.purgeable_nonvolatile_compressed_under_max = bal; + } + } + } + + if (do_panic) { + if (pmap_ledgers_panic) { + panic("pmap_destroy(%p) %d[%s] has imbalanced ledgers\n", + pmap, pid, procname); + } else { + printf("pmap_destroy(%p) %d[%s] has imbalanced ledgers\n", + pmap, pid, procname); + } + } + + if (pmap->stats.resident_count != 0 || + pmap->stats.wired_count != 0 || + pmap->stats.device != 0 || + pmap->stats.internal != 0 || + pmap->stats.external != 0 || + pmap->stats.reusable != 0 || + pmap->stats.compressed != 0) { + if (pmap_stats_assert) { + panic("pmap_destroy(%p) %d[%s] imbalanced stats: resident=%d wired=%d device=%d internal=%d external=%d reusable=%d compressed=%lld", + pmap, pid, procname, + pmap->stats.resident_count, + pmap->stats.wired_count, + pmap->stats.device, + pmap->stats.internal, + pmap->stats.external, + pmap->stats.reusable, + pmap->stats.compressed); + } else { + printf("pmap_destroy(%p) %d[%s] imbalanced stats: resident=%d wired=%d device=%d internal=%d external=%d reusable=%d compressed=%lld", + pmap, pid, procname, + pmap->stats.resident_count, + pmap->stats.wired_count, + pmap->stats.device, + pmap->stats.internal, + pmap->stats.external, + pmap->stats.reusable, + pmap->stats.compressed); + } + } +} + +void +pmap_set_process( + pmap_t pmap, + int pid, + char *procname) +{ + if (pmap == NULL) + return; + + pmap->pmap_pid = pid; + strlcpy(pmap->pmap_procname, procname, sizeof (pmap->pmap_procname)); +} +#endif /* MACH_ASSERT */ + + +#if DEVELOPMENT || DEBUG +int pmap_pagezero_mitigation = 1; +#endif + +void pmap_advise_pagezero_range(pmap_t lpmap, uint64_t low_bound) { +#if DEVELOPMENT || DEBUG + if (pmap_pagezero_mitigation == 0) { + lpmap->pagezero_accessible = FALSE; + return; + } +#endif + lpmap->pagezero_accessible = ((pmap_smap_enabled == FALSE) && (low_bound 
< 0x1000)); + if (lpmap == current_pmap()) { + mp_disable_preemption(); + current_cpu_datap()->cpu_pagezero_mapped = lpmap->pagezero_accessible; + mp_enable_preemption(); + } +} diff --git a/osfmk/x86_64/pmap_pcid.c b/osfmk/x86_64/pmap_pcid.c index c8fef93b4..2a7280d41 100644 --- a/osfmk/x86_64/pmap_pcid.c +++ b/osfmk/x86_64/pmap_pcid.c @@ -242,15 +242,24 @@ void pmap_destroy_pcid_sync(pmap_t p) { pmap_pcid_deallocate_pcid(i, p); } -pcid_t pcid_for_pmap_cpu_tuple(pmap_t pmap, int ccpu) { - return pmap->pmap_pcid_cpus[ccpu]; +pcid_t pcid_for_pmap_cpu_tuple(pmap_t cpmap, thread_t cthread, int ccpu) { + pmap_t active_pmap = cpmap; + + if (__improbable(cpmap->pagezero_accessible)) { + if ((cthread->machine.specFlags & CopyIOActive) == 0) { + active_pmap = kernel_pmap; + } + } + + return active_pmap->pmap_pcid_cpus[ccpu]; } + #if PMAP_ASSERT #define PCID_RECORD_SIZE 128 uint64_t pcid_record_array[PCID_RECORD_SIZE]; #endif -void pmap_pcid_activate(pmap_t tpmap, int ccpu) { +void pmap_pcid_activate(pmap_t tpmap, int ccpu, boolean_t nopagezero, boolean_t copyio) { pcid_t new_pcid = tpmap->pmap_pcid_cpus[ccpu]; pmap_t last_pmap; boolean_t pcid_conflict = FALSE, pending_flush = FALSE; @@ -259,8 +268,9 @@ void pmap_pcid_activate(pmap_t tpmap, int ccpu) { if (__improbable(new_pcid == PMAP_PCID_INVALID_PCID)) { new_pcid = tpmap->pmap_pcid_cpus[ccpu] = pmap_pcid_allocate_pcid(ccpu); } + pmap_assert(new_pcid != PMAP_PCID_INVALID_PCID); -#ifdef PCID_ASSERT +#ifdef PCID_ASSERT cpu_datap(ccpu)->cpu_last_pcid = cpu_datap(ccpu)->cpu_active_pcid; #endif cpu_datap(ccpu)->cpu_active_pcid = new_pcid; @@ -268,7 +278,7 @@ void pmap_pcid_activate(pmap_t tpmap, int ccpu) { pending_flush = (tpmap->pmap_pcid_coherency_vector[ccpu] != 0); if (__probable(pending_flush == FALSE)) { last_pmap = cpu_datap(ccpu)->cpu_pcid_last_pmap_dispatched[new_pcid]; - pcid_conflict = ((last_pmap != NULL) &&(tpmap != last_pmap)); + pcid_conflict = ((last_pmap != NULL) && (tpmap != last_pmap)); } if 
(__improbable(pending_flush || pcid_conflict)) { pmap_pcid_validate_cpu(tpmap, ccpu); @@ -277,7 +287,8 @@ void pmap_pcid_activate(pmap_t tpmap, int ccpu) { cpu_datap(ccpu)->cpu_pcid_last_pmap_dispatched[new_pcid] = tpmap; pmap_assert(new_pcid < PMAP_PCID_MAX_PCID); - pmap_assert(((tpmap == kernel_pmap) && new_pcid == 0) || ((new_pcid != PMAP_PCID_INVALID_PCID) && (new_pcid != 0))); + pmap_assert(((tpmap == kernel_pmap) && new_pcid == 0) || + ((new_pcid != PMAP_PCID_INVALID_PCID) && (new_pcid != 0))); #if PMAP_ASSERT pcid_record_array[ccpu % PCID_RECORD_SIZE] = tpmap->pm_cr3 | new_pcid | (((uint64_t)(!(pending_flush || pcid_conflict))) <<63); pml4_entry_t *pml4 = pmap64_pml4(tpmap, 0ULL); @@ -285,7 +296,19 @@ void pmap_pcid_activate(pmap_t tpmap, int ccpu) { if (pml4[KERNEL_PML4_INDEX] != kernel_pmap->pm_pml4[KERNEL_PML4_INDEX]) __asm__ volatile("int3"); #endif /* PMAP_ASSERT */ - set_cr3_composed(tpmap->pm_cr3, new_pcid, !(pending_flush || pcid_conflict)); + + pmap_paddr_t ncr3 = tpmap->pm_cr3; + + if (__improbable(nopagezero)) { + pending_flush = TRUE; + if (copyio == FALSE) { + new_pcid = kernel_pmap->pmap_pcid_cpus[ccpu]; + ncr3 = kernel_pmap->pm_cr3; + } + cpu_datap(ccpu)->cpu_kernel_pcid = kernel_pmap->pmap_pcid_cpus[ccpu]; + } + + set_cr3_composed(ncr3, new_pcid, !(pending_flush || pcid_conflict)); if (!pending_flush) { /* We did not previously observe a pending invalidation for this @@ -300,7 +323,7 @@ void pmap_pcid_activate(pmap_t tpmap, int ccpu) { pending_flush = (tpmap->pmap_pcid_coherency_vector[ccpu] != 0); if (__improbable(pending_flush != 0)) { pmap_pcid_validate_cpu(tpmap, ccpu); - set_cr3_composed(tpmap->pm_cr3, new_pcid, FALSE); + set_cr3_composed(ncr3, new_pcid, FALSE); } } cpu_datap(ccpu)->cpu_pmap_pcid_coherentp = &(tpmap->pmap_pcid_coherency_vector[ccpu]); diff --git a/pexpert/Makefile b/pexpert/Makefile index 656af69bc..110210d03 100644 --- a/pexpert/Makefile +++ b/pexpert/Makefile @@ -3,7 +3,6 @@ export 
MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - include $(MakeInc_cmd) include $(MakeInc_def) @@ -20,10 +19,8 @@ EXPINC_SUBDIRS_X86_64H = pexpert EXPINC_SUBDIRS_ARM = pexpert EXPINC_SUBDIRS_ARM64 = pexpert -COMP_SUBDIRS = \ +COMP_SUBDIRS = \ conf include $(MakeInc_rule) include $(MakeInc_dir) - - diff --git a/pexpert/conf/Makefile b/pexpert/conf/Makefile index 76db9a7d8..7bd79d9ae 100644 --- a/pexpert/conf/Makefile +++ b/pexpert/conf/Makefile @@ -37,7 +37,7 @@ do_all: $(TARGET)/$(CURRENT_KERNEL_CONFIG)/Makefile OBJPATH=${OBJPATH} \ build_all; -do_build_all:: do_all +do_build_all:: do_all include $(MakeInc_rule) include $(MakeInc_dir) diff --git a/pexpert/conf/Makefile.template b/pexpert/conf/Makefile.template index ffcbdbe1d..cd16445fa 100644 --- a/pexpert/conf/Makefile.template +++ b/pexpert/conf/Makefile.template @@ -24,7 +24,7 @@ pe_identify_machine.o_CWARNFLAGS_ADD = -Wno-cast-align # # Directories for mig generated files # -COMP_SUBDIRS = +COMP_SUBDIRS = # # Make sure we don't remove this by accident if interrupted at the wrong @@ -65,7 +65,7 @@ $(SOBJS): .SFLAGS $(_v)$(REPLACECONTENTS) $@ $(S_KCC) $(SFLAGS) $(INCFLAGS) $(COMPONENT).filelist: $(OBJS) - @echo LDFILELIST $(COMPONENT) + @echo "$(ColorL)LDFILELIST$(Color0) $(ColorLF)$(COMPONENT)$(Color0)" $(_v)for obj in ${OBJS}; do \ echo $(TARGET)/$(CURRENT_KERNEL_CONFIG)/$${obj}; \ done > $(COMPONENT).filelist diff --git a/pexpert/conf/Makefile.x86_64 b/pexpert/conf/Makefile.x86_64 index 25f7be596..901979202 100644 --- a/pexpert/conf/Makefile.x86_64 +++ b/pexpert/conf/Makefile.x86_64 @@ -5,4 +5,3 @@ ###################################################################### #END Machine dependent Makefile fragment for x86_64 ###################################################################### - diff --git a/pexpert/gen/bootargs.c b/pexpert/gen/bootargs.c index c5efead48..33a8db774 100644 --- 
a/pexpert/gen/bootargs.c +++ b/pexpert/gen/bootargs.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -62,16 +62,17 @@ PE_parse_boot_arg( } #endif -boolean_t -PE_parse_boot_argn( - const char *arg_string, - void *arg_ptr, - int max_len) +static boolean_t +PE_parse_boot_argn_internal( + const char *arg_string, + void * arg_ptr, + int max_len, + boolean_t force_string) { char *args; char *cp, c; uintptr_t i; - long long val; + long long val = 0; boolean_t arg_boolean; boolean_t arg_found; @@ -102,9 +103,12 @@ PE_parse_boot_argn( if (strncmp(args, arg_string, i) || (i!=strlen(arg_string))) goto gotit; + if (arg_boolean) { - argnumcpy(1, arg_ptr, max_len); - arg_found = TRUE; + if (!force_string){ + argnumcpy(1, arg_ptr, max_len); + arg_found = TRUE; + } break; } else { while (*cp && isargsep (*cp)) @@ -120,7 +124,7 @@ PE_parse_boot_argn( arg_found = TRUE; break; } - switch (getval(cp, &val, isargsep, FALSE)) + switch ((force_string && *cp == '=') ? 
STR : getval(cp, &val, isargsep, FALSE)) { case NUM: argnumcpy(val, arg_ptr, max_len); @@ -147,6 +151,24 @@ PE_parse_boot_argn( return(arg_found); } +boolean_t +PE_parse_boot_argn( + const char *arg_string, + void *arg_ptr, + int max_len) +{ + return PE_parse_boot_argn_internal(arg_string, arg_ptr, max_len, FALSE); +} + +boolean_t +PE_parse_boot_arg_str( + const char *arg_string, + char *arg_ptr, + int strlen) +{ + return PE_parse_boot_argn_internal(arg_string, arg_ptr, strlen, TRUE); +} + static boolean_t isargsep(char c) { @@ -245,8 +267,10 @@ getval( } if (has_value || skip_equal_sign) { - if (*s == '-') - sign = -1, s++; + if (*s == '-') { + sign = -1; + s++; + } intval = *s++-'0'; radix = 10; if (intval == 0) { diff --git a/pexpert/gen/pe_gen.c b/pexpert/gen/pe_gen.c index 5bab0cc14..60664a6a0 100644 --- a/pexpert/gen/pe_gen.c +++ b/pexpert/gen/pe_gen.c @@ -34,7 +34,9 @@ #include #include + static int DEBUGFlag; + static uint32_t gPEKernelConfigurationBitmask; int32_t gPESerialBaud = -1; diff --git a/pexpert/i386/pe_init.c b/pexpert/i386/pe_init.c index 35d44a25c..10a9fb24d 100644 --- a/pexpert/i386/pe_init.c +++ b/pexpert/i386/pe_init.c @@ -189,14 +189,25 @@ void PE_init_platform(boolean_t vm_initialized, void * _args) // New EFI-style PE_state.bootArgs = _args; PE_state.deviceTreeHead = (void *) ml_static_ptovirt(args->deviceTreeP); - PE_state.video.v_baseAddr = args->Video.v_baseAddr; // remains physical address - PE_state.video.v_rowBytes = args->Video.v_rowBytes; - PE_state.video.v_width = args->Video.v_width; - PE_state.video.v_height = args->Video.v_height; - PE_state.video.v_depth = args->Video.v_depth; - PE_state.video.v_display = args->Video.v_display; - strlcpy(PE_state.video.v_pixelFormat, "PPPPPPPP", - sizeof(PE_state.video.v_pixelFormat)); + if (args->Video.v_baseAddr) { + PE_state.video.v_baseAddr = args->Video.v_baseAddr; // remains physical address + PE_state.video.v_rowBytes = args->Video.v_rowBytes; + PE_state.video.v_width = 
args->Video.v_width; + PE_state.video.v_height = args->Video.v_height; + PE_state.video.v_depth = args->Video.v_depth; + PE_state.video.v_display = args->Video.v_display; + strlcpy(PE_state.video.v_pixelFormat, "PPPPPPPP", + sizeof(PE_state.video.v_pixelFormat)); + } else { + PE_state.video.v_baseAddr = args->VideoV1.v_baseAddr; // remains physical address + PE_state.video.v_rowBytes = args->VideoV1.v_rowBytes; + PE_state.video.v_width = args->VideoV1.v_width; + PE_state.video.v_height = args->VideoV1.v_height; + PE_state.video.v_depth = args->VideoV1.v_depth; + PE_state.video.v_display = args->VideoV1.v_display; + strlcpy(PE_state.video.v_pixelFormat, "PPPPPPPP", + sizeof(PE_state.video.v_pixelFormat)); + } #ifdef kBootArgsFlagHiDPI if (args->flags & kBootArgsFlagHiDPI) @@ -215,7 +226,6 @@ void PE_init_platform(boolean_t vm_initialized, void * _args) } pe_identify_machine(args); - } else { pe_init_debug(); } diff --git a/pexpert/i386/pe_kprintf.c b/pexpert/i386/pe_kprintf.c index b4912b42b..c9f720e21 100644 --- a/pexpert/i386/pe_kprintf.c +++ b/pexpert/i386/pe_kprintf.c @@ -38,6 +38,7 @@ #include #include #include +#include /* Globals */ void (*PE_kputc)(char c); @@ -101,10 +102,14 @@ static void _kprintf(const char *format, ...) #endif /* MP_DEBUG */ static int cpu_last_locked = 0; + +__attribute__((noinline,not_tail_called)) void kprintf(const char *fmt, ...) { - va_list listp; - boolean_t state; + va_list listp; + va_list listp2; + boolean_t state; + void *caller = __builtin_return_address(0); if (!disable_serial_output) { boolean_t early = FALSE; @@ -115,8 +120,16 @@ void kprintf(const char *fmt, ...) 
* take any locks, just dump to serial */ if (!PE_kputc || early) { va_start(listp, fmt); + va_copy(listp2, listp); + _doprnt_log(fmt, &listp, pal_serial_putc, 16); va_end(listp); + + // If interrupts are enabled + if (ml_get_interrupts_enabled()) { + os_log_with_args(OS_LOG_DEFAULT, OS_LOG_TYPE_DEFAULT, fmt, listp2, caller); + } + va_end(listp2); return; } @@ -138,14 +151,31 @@ void kprintf(const char *fmt, ...) } va_start(listp, fmt); + va_copy(listp2, listp); _doprnt(fmt, &listp, PE_kputc, 16); va_end(listp); simple_unlock(&kprintf_lock); ml_set_interrupts_enabled(state); + + // If interrupts are enabled + if (ml_get_interrupts_enabled()) { + os_log_with_args(OS_LOG_DEFAULT, OS_LOG_TYPE_DEFAULT, fmt, listp2, caller); + } + va_end(listp2); + + } + else { + if (ml_get_interrupts_enabled()) { + va_start(listp, fmt); + os_log_with_args(OS_LOG_DEFAULT, OS_LOG_TYPE_DEFAULT, fmt, listp, caller); + va_end(listp); + } } } + + extern void kprintf_break_lock(void); void kprintf_break_lock(void) diff --git a/pexpert/pexpert/Makefile b/pexpert/pexpert/Makefile index 4c2e34956..5aa7caa76 100644 --- a/pexpert/pexpert/Makefile +++ b/pexpert/pexpert/Makefile @@ -3,7 +3,6 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - include $(MakeInc_cmd) include $(MakeInc_def) @@ -45,8 +44,5 @@ EXPORT_MI_LIST = ${DATAFILES} \ EXPORT_MI_DIR = pexpert - include $(MakeInc_rule) include $(MakeInc_dir) - - diff --git a/pexpert/pexpert/i386/Makefile b/pexpert/pexpert/i386/Makefile index aba89e1ca..420102bc6 100644 --- a/pexpert/pexpert/i386/Makefile +++ b/pexpert/pexpert/i386/Makefile @@ -3,7 +3,6 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - include $(MakeInc_cmd) include $(MakeInc_def) @@ -20,8 +19,5 @@ EXPORT_MD_LIST = ${DATAFILES} EXPORT_MD_DIR = pexpert/i386 - include 
$(MakeInc_rule) include $(MakeInc_dir) - - diff --git a/pexpert/pexpert/i386/boot.h b/pexpert/pexpert/i386/boot.h index fcb93c6ad..656ee5fe6 100644 --- a/pexpert/pexpert/i386/boot.h +++ b/pexpert/pexpert/i386/boot.h @@ -80,7 +80,7 @@ typedef struct EfiMemoryRange { * Video information.. */ -struct Boot_Video { +struct Boot_VideoV1 { uint32_t v_baseAddr; /* Base address of video memory */ uint32_t v_display; /* Display Code (if Applicable */ uint32_t v_rowBytes; /* Number of bytes per pixel row */ @@ -88,7 +88,17 @@ struct Boot_Video { uint32_t v_height; /* Height */ uint32_t v_depth; /* Pixel Depth */ }; +typedef struct Boot_VideoV1 Boot_VideoV1; +struct Boot_Video { + uint32_t v_display; /* Display Code (if Applicable */ + uint32_t v_rowBytes; /* Number of bytes per pixel row */ + uint32_t v_width; /* Width */ + uint32_t v_height; /* Height */ + uint32_t v_depth; /* Pixel Depth */ + uint32_t v_resv[7]; /* Reserved */ + uint64_t v_baseAddr; /* Base address of video memory */ +}; typedef struct Boot_Video Boot_Video; /* Values for v_display */ @@ -147,7 +157,7 @@ typedef struct boot_args { uint32_t MemoryMapDescriptorSize; uint32_t MemoryMapDescriptorVersion; - Boot_Video Video; /* Video Information */ + Boot_VideoV1 VideoV1; /* Video Information */ uint32_t deviceTreeP; /* Physical address of flattened device tree */ uint32_t deviceTreeLength; /* Length of flattened tree */ @@ -179,7 +189,8 @@ typedef struct boot_args { uint32_t boot_SMC_plimit; uint16_t bootProgressMeterStart; uint16_t bootProgressMeterEnd; - uint32_t __reserved4[726]; + Boot_Video Video; /* Video Information */ + uint32_t __reserved4[712]; } boot_args; diff --git a/pexpert/pexpert/machine/Makefile b/pexpert/pexpert/machine/Makefile index cffc94284..7bf35aa25 100644 --- a/pexpert/pexpert/machine/Makefile +++ b/pexpert/pexpert/machine/Makefile @@ -3,7 +3,6 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export 
MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - include $(MakeInc_cmd) include $(MakeInc_def) @@ -19,8 +18,5 @@ EXPORT_MI_LIST = ${DATAFILES} EXPORT_MI_DIR = pexpert/machine - include $(MakeInc_rule) include $(MakeInc_dir) - - diff --git a/pexpert/pexpert/pexpert.h b/pexpert/pexpert/pexpert.h index 5dca696cb..295bf6b33 100644 --- a/pexpert/pexpert/pexpert.h +++ b/pexpert/pexpert/pexpert.h @@ -96,6 +96,8 @@ extern int32_t gPESerialBaud; extern uint8_t gPlatformECID[8]; +extern uint32_t gPlatformMemoryID; + unsigned int PE_init_taproot(vm_offset_t *taddr); extern void (*PE_kputc)(char c); @@ -284,6 +286,13 @@ extern boolean_t PE_parse_boot_argn( void *arg_ptr, int max_arg); +#if XNU_KERNEL_PRIVATE +extern boolean_t PE_parse_boot_arg_str( + const char *arg_string, + char * arg_ptr, + int size); +#endif /* XNU_KERNEL_PRIVATE */ + extern boolean_t PE_get_default( const char *property_name, void *property_ptr, diff --git a/pexpert/pexpert/protos.h b/pexpert/pexpert/protos.h index 4659fcc68..c9b6ae694 100644 --- a/pexpert/pexpert/protos.h +++ b/pexpert/pexpert/protos.h @@ -52,14 +52,14 @@ extern void interrupt_disable(void); //from kern/misc_protos.h extern void _doprnt( - register const char *fmt, + const char *fmt, va_list *argp, void (*putc)(char), int radix); extern void _doprnt_log( - register const char *fmt, + const char *fmt, va_list *argp, void (*putc)(char), int radix); diff --git a/security/Makefile b/security/Makefile index d917f8827..ca89616de 100644 --- a/security/Makefile +++ b/security/Makefile @@ -3,7 +3,6 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - include $(MakeInc_cmd) include $(MakeInc_def) @@ -21,7 +20,7 @@ PRIVATE_DATAFILES = \ mac_policy.h # Installed in /usr/include/security/ -INSTALL_MI_LIST = ${DATAFILES} +INSTALL_MI_LIST = ${DATAFILES} INSTALL_MI_DIR = security diff --git a/security/conf/Makefile b/security/conf/Makefile index 
76db9a7d8..7bd79d9ae 100644 --- a/security/conf/Makefile +++ b/security/conf/Makefile @@ -37,7 +37,7 @@ do_all: $(TARGET)/$(CURRENT_KERNEL_CONFIG)/Makefile OBJPATH=${OBJPATH} \ build_all; -do_build_all:: do_all +do_build_all:: do_all include $(MakeInc_rule) include $(MakeInc_dir) diff --git a/security/conf/Makefile.template b/security/conf/Makefile.template index 9e94a12ca..f857074e3 100644 --- a/security/conf/Makefile.template +++ b/security/conf/Makefile.template @@ -33,7 +33,7 @@ INCFLAGS_MAKEFILE= -I$(SOURCE)/.. # # Directories for mig generated files # -COMP_SUBDIRS = +COMP_SUBDIRS = # # Make sure we don't remove this by accident if interrupted at the wrong @@ -74,7 +74,7 @@ $(SOBJS): .SFLAGS $(_v)$(REPLACECONTENTS) $@ $(S_KCC) $(SFLAGS) $(INCFLAGS) $(COMPONENT).filelist: $(OBJS) - @echo LDFILELIST $(COMPONENT) + @echo "$(ColorL)LDFILELIST$(Color0) $(ColorLF)$(COMPONENT)$(Color0)" $(_v)for obj in ${OBJS}; do \ echo $(TARGET)/$(CURRENT_KERNEL_CONFIG)/$${obj}; \ done > $(COMPONENT).filelist diff --git a/security/mac_base.c b/security/mac_base.c index 7f147cff1..b3cf964b9 100644 --- a/security/mac_base.c +++ b/security/mac_base.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2012 Apple Inc. All rights reserved. + * Copyright (c) 2007-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -1402,6 +1402,8 @@ __mac_get_fd(proc_t p, struct __mac_get_fd_args *uap, int *ret __unused) case DTYPE_PIPE: case DTYPE_KQUEUE: case DTYPE_FSEVENTS: + case DTYPE_ATALK: + case DTYPE_NETPOLICY: default: error = ENOSYS; // only sockets/vnodes so far break; @@ -1608,6 +1610,8 @@ __mac_set_fd(proc_t p, struct __mac_set_fd_args *uap, int *ret __unused) case DTYPE_PIPE: case DTYPE_KQUEUE: case DTYPE_FSEVENTS: + case DTYPE_ATALK: + case DTYPE_NETPOLICY: default: error = ENOSYS; // only sockets/vnodes so far break; @@ -1622,7 +1626,7 @@ static int mac_set_filelink(proc_t p, user_addr_t mac_p, user_addr_t path_p, int follow) { - register struct vnode *vp; + struct vnode *vp; struct vfs_context *ctx = vfs_context_current(); struct label *intlabel; struct nameidata nd; @@ -1957,6 +1961,28 @@ mac_vnop_removexattr(struct vnode *vp __unused, const char *name __unused) return (ENOENT); } +int +mac_file_setxattr(struct fileglob *fg __unused, const char *name __unused, char *buf __unused, size_t len __unused) +{ + + return (ENOENT); +} + +int +mac_file_getxattr(struct fileglob *fg __unused, const char *name __unused, + char *buf __unused, size_t len __unused, size_t *attrlen __unused) +{ + + return (ENOENT); +} + +int +mac_file_removexattr(struct fileglob *fg __unused, const char *name __unused) +{ + + return (ENOENT); +} + intptr_t mac_label_get(struct label *l __unused, int slot __unused) { return 0; @@ -1979,4 +2005,23 @@ int mac_iokit_check_hid_control(kauth_cred_t cred __unused) return 0; } + +int mac_iokit_check_nvram_delete(kauth_cred_t cred __unused, const char *name __unused); +int mac_iokit_check_nvram_delete(kauth_cred_t cred __unused, const char *name __unused) +{ + return 0; +} + +int mac_iokit_check_nvram_get(kauth_cred_t cred __unused, const char *name __unused); +int mac_iokit_check_nvram_get(kauth_cred_t cred __unused, const char *name __unused) +{ + return 0; +} + +int mac_iokit_check_nvram_set(kauth_cred_t cred 
__unused, const char *name __unused, io_object_t value __unused); +int mac_iokit_check_nvram_set(kauth_cred_t cred __unused, const char *name __unused, io_object_t value __unused) +{ + return 0; +} + #endif /* !MAC */ diff --git a/security/mac_file.c b/security/mac_file.c index c1aa3281b..7f2de809c 100644 --- a/security/mac_file.c +++ b/security/mac_file.c @@ -193,6 +193,17 @@ mac_file_check_lock(struct ucred *cred, struct fileglob *fg, int op, return (error); } +int +mac_file_check_library_validation(struct proc *proc, + struct fileglob *fg, off_t slice_offset, + user_long_t error_message, size_t error_message_size) +{ + int error; + + MAC_CHECK(file_check_library_validation, proc, fg, slice_offset, error_message, error_message_size); + return (error); +} + /* * On some platforms, VM_PROT_READ implies VM_PROT_EXECUTE. If that is true, * both prot and maxprot will have VM_PROT_EXECUTE set after file_check_mmap @@ -228,3 +239,45 @@ mac_file_check_mmap_downgrade(struct ucred *cred, struct fileglob *fg, *prot = result; } + + +/* + * fileglob XATTR helpers. 
+ */ + +int +mac_file_setxattr(struct fileglob *fg, const char *name, char *buf, size_t len) { + struct vnode *vp = NULL; + + if (!fg || FILEGLOB_DTYPE(fg) != DTYPE_VNODE) { + return EFTYPE; + } + + vp = (struct vnode *)fg->fg_data; + return mac_vnop_setxattr(vp, name, buf, len); +} + +int +mac_file_getxattr(struct fileglob *fg, const char *name, char *buf, size_t len, + size_t *attrlen) { + struct vnode *vp = NULL; + + if (!fg || FILEGLOB_DTYPE(fg) != DTYPE_VNODE) { + return EFTYPE; + } + + vp = (struct vnode *)fg->fg_data; + return mac_vnop_getxattr(vp, name, buf, len, attrlen); +} + +int +mac_file_removexattr(struct fileglob *fg, const char *name) { + struct vnode *vp = NULL; + + if (!fg || FILEGLOB_DTYPE(fg) != DTYPE_VNODE) { + return EFTYPE; + } + + vp = (struct vnode *)fg->fg_data; + return mac_vnop_removexattr(vp, name); +} diff --git a/security/mac_framework.h b/security/mac_framework.h index b9973b2a6..c71d12228 100644 --- a/security/mac_framework.h +++ b/security/mac_framework.h @@ -2,7 +2,7 @@ * Copyright (c) 2007 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
* Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /*- @@ -85,7 +85,9 @@ struct attrlist; struct auditinfo; struct bpf_d; struct componentname; +struct cs_blob; struct devnode; +struct exception_action; struct flock; struct fdescnode; struct fileglob; @@ -200,6 +202,9 @@ int mac_file_check_ioctl(kauth_cred_t cred, struct fileglob *fg, unsigned int cmd); int mac_file_check_lock(kauth_cred_t cred, struct fileglob *fg, int op, struct flock *fl); +int mac_file_check_library_validation(struct proc *proc, + struct fileglob *fg, off_t slice_offset, + user_long_t error_message, size_t error_message_size); int mac_file_check_mmap(kauth_cred_t cred, struct fileglob *fg, int prot, int flags, uint64_t file_pos, int *maxprot); void mac_file_check_mmap_downgrade(kauth_cred_t cred, struct fileglob *fg, @@ -264,6 +269,10 @@ int mac_mount_check_getattr(vfs_context_t ctx, struct mount *mp, int mac_mount_check_label_update(vfs_context_t ctx, struct mount *mp); int mac_mount_check_mount(vfs_context_t ctx, struct vnode *vp, struct componentname *cnp, const char *vfc_name); +int mac_mount_check_snapshot_create(vfs_context_t ctx, struct mount *mp, + const char *name); +int mac_mount_check_snapshot_delete(vfs_context_t ctx, struct mount *mp, + const char *name); int mac_mount_check_remount(vfs_context_t ctx, struct mount *mp); int mac_mount_check_setattr(vfs_context_t ctx, struct mount *mp, struct vfs_attr *vfa); @@ -374,7 +383,7 @@ int mac_socket_check_kqfilter(kauth_cred_t cred, struct knote *kn, struct socket *so); int mac_socket_check_listen(kauth_cred_t cred, struct socket *so); int mac_socket_check_receive(kauth_cred_t cred, struct socket *so); -int mac_socket_check_received(kauth_cred_t cred, struct socket *so, +int mac_socket_check_received(kauth_cred_t cred, struct socket *so, struct sockaddr *saddr); int mac_socket_check_select(kauth_cred_t cred, struct socket *so, int 
which); @@ -463,6 +472,8 @@ int mac_vnode_check_access(vfs_context_t ctx, struct vnode *vp, int mac_vnode_check_chdir(vfs_context_t ctx, struct vnode *dvp); int mac_vnode_check_chroot(vfs_context_t ctx, struct vnode *dvp, struct componentname *cnp); +int mac_vnode_check_clone(vfs_context_t ctx, struct vnode *dvp, + struct vnode *vp, struct componentname *cnp); int mac_vnode_check_create(vfs_context_t ctx, struct vnode *dvp, struct componentname *cnp, struct vnode_attr *vap); int mac_vnode_check_deleteextattr(vfs_context_t ctx, struct vnode *vp, @@ -472,9 +483,10 @@ int mac_vnode_check_exchangedata(vfs_context_t ctx, struct vnode *v1, int mac_vnode_check_exec(vfs_context_t ctx, struct vnode *vp, struct image_params *imgp); int mac_vnode_check_fsgetpath(vfs_context_t ctx, struct vnode *vp); -int mac_vnode_check_signature(struct vnode *vp, off_t macho_offset, - unsigned char *sha1, const void * signature, size_t size, - int flags, int *is_platform_binary); +int mac_vnode_check_signature(struct vnode *vp, + struct cs_blob *cs_blob, struct image_params *imgp, + unsigned int *cs_flags, + int flags); int mac_vnode_check_getattrlist(vfs_context_t ctx, struct vnode *vp, struct attrlist *alist); int mac_vnode_check_getextattr(vfs_context_t ctx, struct vnode *vp, @@ -504,6 +516,8 @@ int mac_vnode_check_searchfs(vfs_context_t ctx, struct vnode *vp, struct attrlist *alist); int mac_vnode_check_select(vfs_context_t ctx, struct vnode *vp, int which); +int mac_vnode_check_setacl(vfs_context_t ctx, struct vnode *vp, + struct kauth_acl *acl); int mac_vnode_check_setattrlist(vfs_context_t ctxd, struct vnode *vp, struct attrlist *alist); int mac_vnode_check_setextattr(vfs_context_t ctx, struct vnode *vp, @@ -522,7 +536,7 @@ int mac_vnode_check_truncate(vfs_context_t ctx, kauth_cred_t file_cred, struct vnode *vp); int mac_vnode_check_uipc_bind(vfs_context_t ctx, struct vnode *dvp, struct componentname *cnp, struct vnode_attr *vap); -int mac_vnode_check_uipc_connect(vfs_context_t ctx, 
struct vnode *vp); +int mac_vnode_check_uipc_connect(vfs_context_t ctx, struct vnode *vp, struct socket *so); int mac_vnode_check_unlink(vfs_context_t ctx, struct vnode *dvp, struct vnode *vp, struct componentname *cnp); int mac_vnode_check_write(vfs_context_t ctx, @@ -550,11 +564,20 @@ void mac_vnode_label_update_extattr(struct mount *mp, struct vnode *vp, const char *name); int mac_vnode_notify_create(vfs_context_t ctx, struct mount *mp, struct vnode *dvp, struct vnode *vp, struct componentname *cnp); -void mac_vnode_notify_rename(vfs_context_t ctx, struct vnode *vp, +void mac_vnode_notify_deleteextattr(vfs_context_t ctx, struct vnode *vp, const char *name); +void mac_vnode_notify_link(vfs_context_t ctx, struct vnode *vp, struct vnode *dvp, struct componentname *cnp); void mac_vnode_notify_open(vfs_context_t ctx, struct vnode *vp, int acc_flags); -void mac_vnode_notify_link(vfs_context_t ctx, struct vnode *vp, - struct vnode *dvp, struct componentname *cnp); +void mac_vnode_notify_rename(vfs_context_t ctx, struct vnode *vp, + struct vnode *dvp, struct componentname *cnp); +void mac_vnode_notify_setacl(vfs_context_t ctx, struct vnode *vp, struct kauth_acl *acl); +void mac_vnode_notify_setattrlist(vfs_context_t ctx, struct vnode *vp, struct attrlist *alist); +void mac_vnode_notify_setextattr(vfs_context_t ctx, struct vnode *vp, const char *name, struct uio *uio); +void mac_vnode_notify_setflags(vfs_context_t ctx, struct vnode *vp, u_long flags); +void mac_vnode_notify_setmode(vfs_context_t ctx, struct vnode *vp, mode_t mode); +void mac_vnode_notify_setowner(vfs_context_t ctx, struct vnode *vp, uid_t uid, gid_t gid); +void mac_vnode_notify_setutimes(vfs_context_t ctx, struct vnode *vp, struct timespec atime, struct timespec mtime); +void mac_vnode_notify_truncate(vfs_context_t ctx, kauth_cred_t file_cred, struct vnode *vp); int mac_vnode_find_sigs(struct proc *p, struct vnode *vp, off_t offsetInMacho); int vnode_label(struct mount *mp, struct vnode *dvp, struct 
vnode *vp, struct componentname *cnp, int flags, vfs_context_t ctx); diff --git a/security/mac_mach.c b/security/mac_mach.c index 4ff8cf7a7..222b77f2a 100644 --- a/security/mac_mach.c +++ b/security/mac_mach.c @@ -28,6 +28,9 @@ #include #include +#include +#include +#include #include #include #include @@ -39,6 +42,15 @@ #include #include +#if CONFIG_CSR +#include +// Panic on internal builds, just log otherwise. +#define MAC_MACH_UNEXPECTED(fmt...) \ + if (csr_check(CSR_ALLOW_APPLE_INTERNAL) == 0) { panic(fmt); } else { printf(fmt); } +#else +#define MAC_MACH_UNEXPECTED(fmt...) printf(fmt) +#endif + static struct proc * mac_task_get_proc(struct task *task) { @@ -137,3 +149,109 @@ mac_thread_userret(struct thread *td) MAC_PERFORM(thread_userret, td); } +static struct label * +mac_exc_action_label_alloc(void) +{ + struct label *label = mac_labelzone_alloc(MAC_WAITOK); + + MAC_PERFORM(exc_action_label_init, label); + return label; +} + +static void +mac_exc_action_label_free(struct label *label) +{ + MAC_PERFORM(exc_action_label_destroy, label); + mac_labelzone_free(label); +} + +void +mac_exc_action_label_init(struct exception_action *action) +{ + action->label = mac_exc_action_label_alloc(); + MAC_PERFORM(exc_action_label_associate, action, action->label); +} + +void +mac_exc_action_label_inherit(struct exception_action *parent, struct exception_action *child) +{ + mac_exc_action_label_init(child); + MAC_PERFORM(exc_action_label_copy, parent->label, child->label); +} + +void +mac_exc_action_label_destroy(struct exception_action *action) +{ + struct label *label = action->label; + action->label = NULL; + mac_exc_action_label_free(label); +} + +int mac_exc_action_label_update(struct task *task, struct exception_action *action) { + if (task == kernel_task) { + // The kernel may set exception ports without any check. 
+ return 0; + } + + struct proc *p = mac_task_get_proc(task); + if (p == NULL) + return ESRCH; + + MAC_PERFORM(exc_action_label_update, p, action->label); + proc_rele(p); + return 0; +} + +void mac_exc_action_label_reset(struct exception_action *action) { + struct label *old_label = action->label; + mac_exc_action_label_init(action); + mac_exc_action_label_free(old_label); +} + +void mac_exc_action_label_task_update(struct task *task, struct proc *proc) { + if (get_task_crash_label(task) != NULL) { + MAC_MACH_UNEXPECTED("task already has a crash_label attached to it"); + return; + } + + struct label *label = mac_exc_action_label_alloc(); + MAC_PERFORM(exc_action_label_update, proc, label); + set_task_crash_label(task, label); +} + +void mac_exc_action_label_task_destroy(struct task *task) { + mac_exc_action_label_free(get_task_crash_label(task)); + set_task_crash_label(task, NULL); +} + +int +mac_exc_action_check_exception_send(struct task *victim_task, struct exception_action *action) +{ + int error = 0; + + struct proc *p = get_bsdtask_info(victim_task); + struct label *bsd_label = NULL; + struct label *label = NULL; + + if (p != NULL) { + // Create a label from the still existing bsd process... + label = bsd_label = mac_exc_action_label_alloc(); + MAC_PERFORM(exc_action_label_update, p, bsd_label); + } else { + // ... otherwise use the crash label on the task. 
+ label = get_task_crash_label(victim_task); + } + + if (label == NULL) { + MAC_MACH_UNEXPECTED("mac_exc_action_check_exception_send: no exc_action label for proc %p", p); + return EPERM; + } + + MAC_CHECK(exc_action_check_exception_send, label, action, action->label); + + if (bsd_label != NULL) { + mac_exc_action_label_free(bsd_label); + } + + return (error); +} diff --git a/security/mac_mach_internal.h b/security/mac_mach_internal.h index 79587fd89..406216029 100644 --- a/security/mac_mach_internal.h +++ b/security/mac_mach_internal.h @@ -82,6 +82,18 @@ int mac_task_check_set_host_exception_ports(struct task *task, /* threads */ void act_set_astmacf(struct thread *); void mac_thread_userret(struct thread *); + +/* exception actions */ +void mac_exc_action_label_init(struct exception_action *action); +void mac_exc_action_label_inherit(struct exception_action *parent, struct exception_action *child); +void mac_exc_action_label_destroy(struct exception_action *action); +int mac_exc_action_label_update(struct task *task, struct exception_action *action); +void mac_exc_action_label_reset(struct exception_action *action); + +void mac_exc_action_label_task_update(struct task *task, struct proc *proc); +void mac_exc_action_label_task_destroy(struct task *task); + +int mac_exc_action_check_exception_send(struct task *victim_task, struct exception_action *action); #endif /* MAC */ #endif /* !_SECURITY_MAC_MACH_INTERNAL_H_ */ diff --git a/security/mac_policy.h b/security/mac_policy.h index aa2658494..ba8355e21 100644 --- a/security/mac_policy.h +++ b/security/mac_policy.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2010 Apple Inc. All rights reserved. + * Copyright (c) 2007-2016 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -89,7 +89,9 @@ struct attrlist; struct auditinfo; struct bpf_d; +struct cs_blob; struct devnode; +struct exception_action; struct fileglob; struct ifnet; struct inpcb; @@ -302,7 +304,7 @@ typedef int mpo_bpfdesc_check_receive_t( @param vnodelabel Label corresponding to vp @param scriptvnodelabel Script vnode label @param execlabel Userspace provided execution label - @param proc Object process + @param p Object process @param macpolicyattr MAC policy-specific spawn attribute data @param macpolicyattrlen Length of policy-specific spawn attribute data @see mac_execve @@ -689,6 +691,78 @@ typedef void mpo_devfs_label_update_t( struct vnode *vp, struct label *vnodelabel ); +/** + @brief Access control for sending an exception to an exception action + @param crashlabel The crashing process's label + @param action Exception action + @param exclabel Policy label for exception action + + Determine whether the exception message caused by the victim + process can be sent to the exception action. + + @return Return 0 if the message can be sent, otherwise an + appropriate value for errno should be returned. +*/ +typedef int mpo_exc_action_check_exception_send_t( + struct label *crashlabel, + struct exception_action *action, + struct label *exclabel +); +/** + @brief Create an exception action label + @param action Exception action to label + @param exclabel Policy label to be filled in for exception action + + Set the label on an exception action. +*/ +typedef void mpo_exc_action_label_associate_t( + struct exception_action *action, + struct label *exclabel +); +/** + @brief Copy an exception action label + @param src Source exception action label + @param dest Destination exception action label + + Copy the label information from src to dest. + Exception actions are often inherited, e.g. from parent to child. + In that case, the labels are copied instead of created fresh. 
+*/ +typedef void mpo_exc_action_label_copy_t( + struct label *src, + struct label *dest +); +/** + @brief Destroy exception action label + @param label The label to be destroyed + + Destroy the label on an exception action. In this entry point, a + policy module should free any internal storage associated with + label so that it may be destroyed. +*/ +typedef void mpo_exc_action_label_destroy_t( + struct label *label +); +/** + @brief Initialize exception action label + @param label New label to initialize + + Initialize a label for an exception action. +*/ +typedef int mpo_exc_action_label_init_t( + struct label *label +); +/** + @brief Update the label on an exception action + @param p Process to update the label from + @param exclabel Policy label to be updated for exception action + + Update the credentials of an exception action with the given task. +*/ +typedef void mpo_exc_action_label_update_t( + struct proc *p, + struct label *exclabel +); /** @brief Access control for changing the offset of a file descriptor @param cred Subject credential @@ -859,6 +933,31 @@ typedef int mpo_file_check_lock_t( int op, struct flock *fl ); +/** + @brief Check with library validation if a macho slice is allowed to be combined into a proc. + @param p Subject process + @param fg Fileglob structure + @param slice_offset offset of the code slice + @param error_message error message returned to user-space in case of error (userspace pointer) + @param error_message_size error message size + + It's a little odd that the MAC/kext writes into userspace since this + implies there is only one MAC module that implements this, however + the alternative is to allocate memory in xnu, in the hope that + the MAC module will use it, or allocate it in the MAC module and then + free it in xnu. Neither of these is very appealing, so let's go with + the slightly more hacky way. + + @return Return 0 if access is granted, otherwise an appropriate value for + errno should be returned. 
+*/ +typedef int mpo_file_check_library_validation_t( + struct proc *p, + struct fileglob *fg, + off_t slice_offset, + user_long_t error_message, + size_t error_message_size +); /** @brief Access control check for mapping a file @param cred Subject credential @@ -1219,8 +1318,6 @@ typedef void mpo_inpcb_label_update_t( /** @brief Device hardware access control @param devtype Type of device connected - @param properties XML-formatted property list - @param proplen Length of the property list This is the MAC Framework device access control, which is called by the I/O Kit when a new device is connected to the system to determine whether that @@ -1244,7 +1341,6 @@ typedef int mpo_iokit_check_device_t( /** @brief Access control check for opening an I/O Kit device @param cred Subject credential - @param device_path Device path @param user_client User client instance @param user_client_type User client type @@ -1610,7 +1706,7 @@ typedef int mpo_mbuf_label_init_t( @param cred Subject credential @param mp The mount point @param label Label associated with the mount point - @param com Filesystem-dependent request code; see fsctl(2) + @param cmd Filesystem-dependent request code; see fsctl(2) Determine whether the subject identified by the credential can perform the volume operation indicated by com. @@ -1688,6 +1784,41 @@ typedef int mpo_mount_check_mount_t( struct componentname *cnp, const char *vfc_name ); +/** + @brief Access control check for fs_snapshot_create + @param cred Subject credential + @param mp Filesystem mount point to create snapshot of + @param name Name of snapshot to create + + Determine whether the subject identified by the credential can + create a snapshot of the filesystem at the given mount point. + + @return Return 0 if access is granted, otherwise an appropriate value + for errno should be returned. 
+*/ +typedef int mpo_mount_check_snapshot_create_t( + kauth_cred_t cred, + struct mount *mp, + const char *name +); +/** + @brief Access control check for fs_snapshot_delete + @param cred Subject credential + @param mp Filesystem mount point to delete snapshot of + @param name Name of snapshot to delete + + Determine whether the subject identified by the credential can + delete the named snapshot from the filesystem at the given + mount point. + + @return Return 0 if access is granted, otherwise an appropriate value + for errno should be returned. +*/ +typedef int mpo_mount_check_snapshot_delete_t( + kauth_cred_t cred, + struct mount *mp, + const char *name +); /** @brief Access control check remounting a filesystem @param cred Subject credential @@ -2041,7 +2172,7 @@ typedef int mpo_pipe_check_write_t( @brief Create a pipe label @param cred Subject credential @param cpipe object to be labeled - @param label Label for the pipe object + @param pipelabel Label for the pipe object Create a label for the pipe object being created by the supplied user credential. This call is made when the pipe is being created @@ -2616,7 +2747,7 @@ typedef int mpo_proc_check_set_host_special_port_t( /** @brief Access control check for setting host exception ports. @param cred Subject credential - @param exceptions Exception port to set + @param exception Exception port to set @return Return 0 if access is granted, otherwise an appropriate value for errno should be returned. 
@@ -2967,7 +3098,7 @@ typedef void mpo_proc_label_init_t( /** @brief Access control check for socket accept @param cred Subject credential - @param socket Object socket + @param so Object socket @param socklabel Policy label for socket Determine whether the subject identified by the credential can accept() @@ -3178,9 +3309,9 @@ typedef int mpo_socket_check_receive_t( /** @brief Access control check for socket receive @param cred Subject credential - @param socket Object socket + @param sock Object socket @param socklabel Policy label for socket - @param addr Name of the remote socket + @param saddr Name of the remote socket Determine whether the subject identified by the credential can receive data from the remote host specified by addr. @@ -3770,7 +3901,7 @@ typedef int mpo_system_check_kas_info_t( /** @brief Create a System V message label @param cred Subject credential - @param msqkptr The message queue the message will be placed in + @param msqptr The message queue the message will be placed in @param msqlabel The label of the message queue @param msgptr The message @param msglabel The label of the message @@ -3822,7 +3953,7 @@ typedef void mpo_sysvmsg_label_recycle_t( @param cred Subject credential @param msgptr The message @param msglabel The message's label - @param msqkptr The message queue + @param msqptr The message queue @param msqlabel The message queue's label Determine whether the subject identified by the credential can add the @@ -3946,7 +4077,7 @@ typedef int mpo_sysvmsq_check_msqsnd_t( /** @brief Create a System V message queue label @param cred Subject credential - @param msqkptr The message queue + @param msqptr The message queue @param msqlabel The label of the message queue */ @@ -4226,7 +4357,7 @@ typedef void mpo_sysvshm_label_recycle_t( /** @brief Access control check for getting a process's task name @param cred Subject credential - @param proc Object process + @param p Object process Determine whether the subject identified by the 
credential can get the passed process's task name port. @@ -4243,7 +4374,7 @@ typedef int mpo_proc_check_get_task_name_t( /** @brief Access control check for getting a process's task port @param cred Subject credential - @param proc Object process + @param p Object process Determine whether the subject identified by the credential can get the passed process's task control port. @@ -4261,7 +4392,7 @@ typedef int mpo_proc_check_get_task_t( /** @brief Access control check for exposing a process's task port @param cred Subject credential - @param proc Object process + @param p Object process Determine whether the subject identified by the credential can expose the passed process's task control port. @@ -4279,7 +4410,7 @@ typedef int mpo_proc_check_expose_task_t( /** @brief Check whether task's IPC may inherit across process exec - @param proc current process instance + @param p current process instance @param cur_vp vnode pointer to current instance @param cur_offset offset of binary of currently executing image @param img_vp vnode pointer to to be exec'ed image @@ -4300,7 +4431,7 @@ typedef int mpo_proc_check_inherit_ipc_ports_t( /** @brief Privilege check for a process to run invalid - @param proc Object process + @param p Object process Determine whether the process may execute even though the system determined that it is untrusted (eg unidentified / modified code). @@ -4384,6 +4515,29 @@ typedef int mpo_vnode_check_chroot_t( struct label *dlabel, struct componentname *cnp ); +/** + @brief Access control check for creating clone + @param cred Subject credential + @param dvp Vnode of directory to create the clone in + @param dlabel Policy label associated with dvp + @param vp Vnode of the file to clone from + @param label Policy label associated with vp + @param cnp Component name for the clone being created + + Determine whether the subject identified by the credential should be + allowed to create a clone of the vnode vp with the name specified by cnp. 
+ + @return Return 0 if access is granted, otherwise an appropriate value for + errno should be returned. +*/ +typedef int mpo_vnode_check_clone_t( + kauth_cred_t cred, + struct vnode *dvp, + struct label *dlabel, + struct vnode *vp, + struct label *label, + struct componentname *cnp +); /** @brief Access control check for creating vnode @param cred Subject credential @@ -4457,7 +4611,7 @@ typedef int mpo_vnode_check_exchangedata_t( @param vp Object vnode to execute @param scriptvp Script being executed by interpreter, if any. @param vnodelabel Label corresponding to vp - @param scriptvnodelabel Script vnode label + @param scriptlabel Script vnode label @param execlabel Userspace provided execution label @param cnp Component name for file being executed @param macpolicyattr MAC policy-specific spawn attribute data. @@ -4507,12 +4661,24 @@ typedef int mpo_vnode_check_fsgetpath_t( ); /** @brief Access control check after determining the code directory hash - */ -typedef int mpo_vnode_check_signature_t(struct vnode *vp, struct label *label, - off_t macho_offset, unsigned char *sha1, - const void *signature, int size, - int flags, int *is_platform_binary); + @param vp vnode to combine into proc + @param label label associated with the vnode + @param cs_blob the code signature to check + @param cs_flags update code signing flags if needed + @param flags operational flag to mpo_vnode_check_signature + @param fatal_failure_desc description of fatal failure + @param fatal_failure_desc_len failure description len, failure is fatal if non-0 + @return Return 0 if access is granted, otherwise an appropriate value for + errno should be returned. 
+ */ +typedef int mpo_vnode_check_signature_t( + struct vnode *vp, + struct label *label, + struct cs_blob *cs_blob, + unsigned int *cs_flags, + int flags, + char **fatal_failure_desc, size_t *fatal_failure_desc_len); /** @brief Access control check for retrieving file attributes @param cred Subject credential @@ -4566,7 +4732,7 @@ typedef int mpo_vnode_check_getextattr_t( @param cred Subject credential @param vp Object vnode @param label Policy label for vp - @param com Device-dependent request code; see ioctl(2) + @param cmd Device-dependent request code; see ioctl(2) Determine whether the subject identified by the credential can perform the ioctl operation indicated by com. @@ -4587,7 +4753,7 @@ typedef int mpo_vnode_check_ioctl_t( ); /** @brief Access control check for vnode kqfilter - @param cred Subject credential + @param active_cred Subject credential @param kn Object knote @param vp Object vnode @param label Policy label for vp @@ -4923,6 +5089,26 @@ typedef int mpo_vnode_check_select_t( struct label *label, int which ); +/** + @brief Access control check for setting ACL + @param cred Subject credential + @param vp Object node + @param label Policy label for vp + @param acl ACL structure pointer + + Determine whether the subject identified by the credential can set an ACL + on the specified vnode. The ACL pointer will be NULL when removing an ACL. + + @return Return 0 if access is granted, otherwise an appropriate value for + errno should be returned. Suggested failure: EACCES for label mismatch or + EPERM for lack of privilege. 
+*/ +typedef int mpo_vnode_check_setacl_t( + kauth_cred_t cred, + struct vnode *vp, + struct label *label, + struct kauth_acl *acl +); /** @brief Access control check for setting file attributes @param cred Subject credential @@ -5133,6 +5319,7 @@ typedef int mpo_vnode_check_uipc_bind_t( @param cred Subject credential @param vp Object vnode @param label Policy label associated with vp + @param so Socket Determine whether the subject identified by the credential can perform a connect operation on the passed UNIX domain socket vnode. @@ -5144,7 +5331,8 @@ typedef int mpo_vnode_check_uipc_bind_t( typedef int mpo_vnode_check_uipc_connect_t( kauth_cred_t cred, struct vnode *vp, - struct label *label + struct label *label, + socket_t so ); /** @brief Access control check for deleting vnode @@ -5659,6 +5847,158 @@ typedef void mpo_vnode_notify_link_t( struct componentname *cnp ); +/** + @brief Inform MAC policies that an extended attribute has been removed from a vnode + @param cred Subject credential + @param vp Object node + @param label Policy label for vp + @param name Extended attribute name + + Inform MAC policies that an extended attribute has been removed from a vnode. +*/ +typedef void mpo_vnode_notify_deleteextattr_t( + kauth_cred_t cred, + struct vnode *vp, + struct label *label, + const char *name +); + + +/** + @brief Inform MAC policies that an ACL has been set on a vnode + @param cred Subject credential + @param vp Object node + @param label Policy label for vp + @param acl ACL structure pointer + + Inform MAC policies that an ACL has been set on a vnode. 
+*/ +typedef void mpo_vnode_notify_setacl_t( + kauth_cred_t cred, + struct vnode *vp, + struct label *label, + struct kauth_acl *acl +); + +/** + @brief Inform MAC policies that attributes have been set on a vnode + @param cred Subject credential + @param vp Object vnode + @param label Policy label for vp + @param alist List of attributes to set + + Inform MAC policies that attributes have been set on a vnode. +*/ +typedef void mpo_vnode_notify_setattrlist_t( + kauth_cred_t cred, + struct vnode *vp, + struct label *label, + struct attrlist *alist +); + +/** + @brief Inform MAC policies that an extended attribute has been set on a vnode + @param cred Subject credential + @param vp Object vnode + @param label Policy label for vp + @param name Extended attribute name + @param uio I/O structure pointer + + Inform MAC policies that an extended attribute has been set on a vnode. +*/ +typedef void mpo_vnode_notify_setextattr_t( + kauth_cred_t cred, + struct vnode *vp, + struct label *label, + const char *name, + struct uio *uio +); + +/** + @brief Inform MAC policies that flags have been set on a vnode + @param cred Subject credential + @param vp Object vnode + @param label Policy label for vp + @param flags File flags; see chflags(2) + + Inform MAC policies that flags have been set on a vnode. +*/ +typedef void mpo_vnode_notify_setflags_t( + kauth_cred_t cred, + struct vnode *vp, + struct label *label, + u_long flags +); + +/** + @brief Inform MAC policies that a new mode has been set on a vnode + @param cred Subject credential + @param vp Object vnode + @param label Policy label for vp + @param mode File mode; see chmod(2) + + Inform MAC policies that a new mode has been set on a vnode. 
+*/ +typedef void mpo_vnode_notify_setmode_t( + kauth_cred_t cred, + struct vnode *vp, + struct label *label, + mode_t mode +); + +/** + @brief Inform MAC policies that new uid/gid have been set on a vnode + @param cred Subject credential + @param vp Object vnode + @param label Policy label for vp + @param uid User ID + @param gid Group ID + + Inform MAC policies that new uid/gid have been set on a vnode. +*/ +typedef void mpo_vnode_notify_setowner_t( + kauth_cred_t cred, + struct vnode *vp, + struct label *label, + uid_t uid, + gid_t gid +); + +/** + @brief Inform MAC policies that new timestamps have been set on a vnode + @param cred Subject credential + @param vp Object vnode + @param label Policy label for vp + @param atime Access time; see utimes(2) + @param mtime Modification time; see utimes(2) + + Inform MAC policies that new timestamps have been set on a vnode. +*/ +typedef void mpo_vnode_notify_setutimes_t( + kauth_cred_t cred, + struct vnode *vp, + struct label *label, + struct timespec atime, + struct timespec mtime +); + +/** + @brief Inform MAC policies that a vnode has been truncated + @param cred Subject credential + @param file_cred Credential associated with the struct fileproc + @param vp Object vnode + @param label Policy label for vp + + Inform MAC policies that a vnode has been truncated. +*/ +typedef void mpo_vnode_notify_truncate_t( + kauth_cred_t cred, + kauth_cred_t file_cred, + struct vnode *vp, + struct label *label +); + + /** @brief Inform MAC policies that a pty slave has been granted @param p Responsible process @@ -5798,7 +6138,7 @@ typedef void mpo_reserved_hook_t(void); * Please note that this should be kept in sync with the check assumptions * policy in bsd/kern/policy_check.c (policy_ops struct). 
*/ -#define MAC_POLICY_OPS_VERSION 39 /* inc when new reserved slots are taken */ +#define MAC_POLICY_OPS_VERSION 45 /* inc when new reserved slots are taken */ struct mac_policy_ops { mpo_audit_check_postselect_t *mpo_audit_check_postselect; mpo_audit_check_preselect_t *mpo_audit_check_preselect; @@ -5873,15 +6213,15 @@ struct mac_policy_ops { mpo_ipq_label_init_t *mpo_ipq_label_init; mpo_ipq_label_update_t *mpo_ipq_label_update; - mpo_reserved_hook_t *mpo_reserved1; - mpo_reserved_hook_t *mpo_reserved2; - mpo_reserved_hook_t *mpo_reserved3; - mpo_reserved_hook_t *mpo_reserved4; - mpo_reserved_hook_t *mpo_reserved5; - mpo_reserved_hook_t *mpo_reserved6; - mpo_reserved_hook_t *mpo_reserved7; - mpo_reserved_hook_t *mpo_reserved8; - mpo_reserved_hook_t *mpo_reserved9; + mpo_file_check_library_validation_t *mpo_file_check_library_validation; + mpo_vnode_notify_setacl_t *mpo_vnode_notify_setacl; + mpo_vnode_notify_setattrlist_t *mpo_vnode_notify_setattrlist; + mpo_vnode_notify_setextattr_t *mpo_vnode_notify_setextattr; + mpo_vnode_notify_setflags_t *mpo_vnode_notify_setflags; + mpo_vnode_notify_setmode_t *mpo_vnode_notify_setmode; + mpo_vnode_notify_setowner_t *mpo_vnode_notify_setowner; + mpo_vnode_notify_setutimes_t *mpo_vnode_notify_setutimes; + mpo_vnode_notify_truncate_t *mpo_vnode_notify_truncate; mpo_mbuf_label_associate_bpfdesc_t *mpo_mbuf_label_associate_bpfdesc; mpo_mbuf_label_associate_ifnet_t *mpo_mbuf_label_associate_ifnet; @@ -5943,12 +6283,13 @@ struct mac_policy_ops { mpo_proc_check_expose_task_t *mpo_proc_check_expose_task; mpo_proc_check_set_host_special_port_t *mpo_proc_check_set_host_special_port; mpo_proc_check_set_host_exception_port_t *mpo_proc_check_set_host_exception_port; - mpo_reserved_hook_t *mpo_reserved11; - mpo_reserved_hook_t *mpo_reserved12; - mpo_reserved_hook_t *mpo_reserved13; - mpo_reserved_hook_t *mpo_reserved14; - mpo_reserved_hook_t *mpo_reserved15; - mpo_reserved_hook_t *mpo_reserved16; + mpo_exc_action_check_exception_send_t 
*mpo_exc_action_check_exception_send; + mpo_exc_action_label_associate_t *mpo_exc_action_label_associate; + mpo_exc_action_label_copy_t *mpo_exc_action_label_copy; + mpo_exc_action_label_destroy_t *mpo_exc_action_label_destroy; + mpo_exc_action_label_init_t *mpo_exc_action_label_init; + mpo_exc_action_label_update_t *mpo_exc_action_label_update; + mpo_reserved_hook_t *mpo_reserved17; mpo_reserved_hook_t *mpo_reserved18; mpo_reserved_hook_t *mpo_reserved19; @@ -6068,9 +6409,9 @@ struct mac_policy_ops { mpo_reserved_hook_t *mpo_reserved23; mpo_reserved_hook_t *mpo_reserved24; mpo_reserved_hook_t *mpo_reserved25; - mpo_reserved_hook_t *mpo_reserved26; - mpo_reserved_hook_t *mpo_reserved27; - mpo_reserved_hook_t *mpo_reserved28; + mpo_mount_check_snapshot_create_t *mpo_mount_check_snapshot_create; + mpo_mount_check_snapshot_delete_t *mpo_mount_check_snapshot_delete; + mpo_vnode_check_clone_t *mpo_vnode_check_clone; mpo_proc_check_get_cs_info_t *mpo_proc_check_get_cs_info; mpo_proc_check_set_cs_info_t *mpo_proc_check_set_cs_info; @@ -6156,8 +6497,9 @@ struct mac_policy_ops { mpo_vnode_notify_rename_t *mpo_vnode_notify_rename; - mpo_reserved_hook_t *mpo_reserved32; - mpo_reserved_hook_t *mpo_reserved33; + mpo_vnode_check_setacl_t *mpo_vnode_check_setacl; + + mpo_vnode_notify_deleteextattr_t *mpo_vnode_notify_deleteextattr; mpo_system_check_kas_info_t *mpo_system_check_kas_info; @@ -6258,6 +6600,46 @@ int mac_vnop_getxattr(struct vnode *, const char *, char *, size_t, size_t *); int mac_vnop_removexattr(struct vnode *, const char *); +/** + @brief Set an extended attribute on a vnode-based fileglob. + @param fg fileglob representing file to attach the extended attribute + @param name extended attribute name + @param buf buffer of data to use as the extended attribute value + @param len size of buffer + + Sets the value of an extended attribute on a file. + + Caller must hold an iocount on the vnode represented by the fileglob. 
+*/ +int mac_file_setxattr(struct fileglob *fg, const char *name, char *buf, size_t len); + +/** + @brief Get an extended attribute from a vnode-based fileglob. + @param fg fileglob representing file to read the extended attribute + @param name extended attribute name + @param buf buffer of data to hold the extended attribute value + @param len size of buffer + @param attrlen size of full extended attribute value + + Gets the value of an extended attribute on a file. + + Caller must hold an iocount on the vnode represented by the fileglob. +*/ +int mac_file_getxattr(struct fileglob *fg, const char *name, char *buf, size_t len, + size_t *attrlen); + +/** + @brief Remove an extended attribute from a vnode-based fileglob. + @param fg fileglob representing file to remove the extended attribute + @param name extended attribute name + + Removes the named extended attribute from the file. + + Caller must hold an iocount on the vnode represented by the fileglob. +*/ +int mac_file_removexattr(struct fileglob *fg, const char *name); + + /* * Arbitrary limit on how much data will be logged by the audit * entry points above. 
diff --git a/security/mac_process.c b/security/mac_process.c index 8071c3e65..193507f5d 100644 --- a/security/mac_process.c +++ b/security/mac_process.c @@ -291,14 +291,13 @@ mac_cred_check_visible(kauth_cred_t u1, kauth_cred_t u2) int error; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_proc_enforce) - return 0; + /* 21167099 - only check if we allow write */ + if (!mac_proc_enforce) + return 0; #endif MAC_CHECK(cred_check_visible, u1, u2); - return (error); } @@ -317,13 +316,12 @@ mac_proc_check_debug(proc_t curp, struct proc *proc) int error; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_proc_enforce) - return 0; + /* 21167099 - only check if we allow write */ + if (!mac_proc_enforce) + return 0; #endif - - if (!mac_proc_check_enforce(curp, MAC_PROC_ENFORCE)) - return 0; + if (!mac_proc_check_enforce(curp, MAC_PROC_ENFORCE)) + return 0; cred = kauth_cred_proc_ref(curp); MAC_CHECK(proc_check_debug, cred, proc); @@ -339,13 +337,12 @@ mac_proc_check_fork(proc_t curp) int error; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_proc_enforce) - return 0; + /* 21167099 - only check if we allow write */ + if (!mac_proc_enforce) + return 0; #endif - - if (!mac_proc_check_enforce(curp, MAC_PROC_ENFORCE)) - return 0; + if (!mac_proc_check_enforce(curp, MAC_PROC_ENFORCE)) + return 0; cred = kauth_cred_proc_ref(curp); MAC_CHECK(proc_check_fork, cred, curp); @@ -407,9 +404,9 @@ mac_proc_check_map_anon(proc_t proc, user_addr_t u_addr, int error; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_vm_enforce) - return 0; + /* 21167099 - only check if we allow write */ + if (!mac_vm_enforce) + return 0; #endif if (!mac_proc_check_enforce(proc, MAC_VM_ENFORCE)) return (0); @@ -429,12 +426,12 @@ mac_proc_check_mprotect(proc_t proc, int error; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write 
*/ - if (!mac_vm_enforce) - return 0; + /* 21167099 - only check if we allow write */ + if (!mac_vm_enforce) + return 0; #endif - if (!mac_proc_check_enforce(proc, MAC_VM_ENFORCE)) - return (0); + if (!mac_proc_check_enforce(proc, MAC_VM_ENFORCE)) + return (0); cred = kauth_cred_proc_ref(proc); MAC_CHECK(proc_check_mprotect, cred, proc, addr, size, prot); @@ -466,13 +463,12 @@ mac_proc_check_sched(proc_t curp, struct proc *proc) int error; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_proc_enforce) - return 0; + /* 21167099 - only check if we allow write */ + if (!mac_proc_enforce) + return 0; #endif - - if (!mac_proc_check_enforce(curp, MAC_PROC_ENFORCE)) - return 0; + if (!mac_proc_check_enforce(curp, MAC_PROC_ENFORCE)) + return 0; cred = kauth_cred_proc_ref(curp); MAC_CHECK(proc_check_sched, cred, proc); @@ -488,13 +484,12 @@ mac_proc_check_signal(proc_t curp, struct proc *proc, int signum) int error; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_proc_enforce) - return 0; + /* 21167099 - only check if we allow write */ + if (!mac_proc_enforce) + return 0; #endif - - if (!mac_proc_check_enforce(curp, MAC_PROC_ENFORCE)) - return 0; + if (!mac_proc_check_enforce(curp, MAC_PROC_ENFORCE)) + return 0; cred = kauth_cred_proc_ref(curp); MAC_CHECK(proc_check_signal, cred, proc, signum); @@ -510,12 +505,12 @@ mac_proc_check_wait(proc_t curp, struct proc *proc) int error; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_proc_enforce) - return 0; + /* 21167099 - only check if we allow write */ + if (!mac_proc_enforce) + return 0; #endif - if (!mac_proc_check_enforce(curp, MAC_PROC_ENFORCE)) - return 0; + if (!mac_proc_check_enforce(curp, MAC_PROC_ENFORCE)) + return 0; cred = kauth_cred_proc_ref(curp); MAC_CHECK(proc_check_wait, cred, proc); @@ -531,12 +526,12 @@ mac_proc_check_suspend_resume(proc_t curp, int sr) int error; #if 
SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_proc_enforce) - return 0; + /* 21167099 - only check if we allow write */ + if (!mac_proc_enforce) + return 0; #endif - if (!mac_proc_check_enforce(curp, MAC_PROC_ENFORCE)) - return 0; + if (!mac_proc_check_enforce(curp, MAC_PROC_ENFORCE)) + return 0; cred = kauth_cred_proc_ref(curp); MAC_CHECK(proc_check_suspend_resume, cred, curp, sr); @@ -552,12 +547,12 @@ mac_proc_check_ledger(proc_t curp, proc_t proc, int ledger_op) int error = 0; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_proc_enforce) - return 0; + /* 21167099 - only check if we allow write */ + if (!mac_proc_enforce) + return 0; #endif - if (!mac_proc_check_enforce(curp, MAC_PROC_ENFORCE)) - return 0; + if (!mac_proc_check_enforce(curp, MAC_PROC_ENFORCE)) + return 0; cred = kauth_cred_proc_ref(curp); MAC_CHECK(proc_check_ledger, cred, proc, ledger_op); @@ -573,12 +568,12 @@ mac_proc_check_cpumon(proc_t curp) int error = 0; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_proc_enforce) - return 0; + /* 21167099 - only check if we allow write */ + if (!mac_proc_enforce) + return 0; #endif - if (!mac_proc_check_enforce(curp, MAC_PROC_ENFORCE)) - return 0; + if (!mac_proc_check_enforce(curp, MAC_PROC_ENFORCE)) + return 0; cred = kauth_cred_proc_ref(curp); MAC_CHECK(proc_check_cpumon, cred); @@ -594,12 +589,12 @@ mac_proc_check_proc_info(proc_t curp, proc_t target, int callnum, int flavor) int error = 0; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_proc_enforce) - return 0; + /* 21167099 - only check if we allow write */ + if (!mac_proc_enforce) + return 0; #endif - if (!mac_proc_check_enforce(curp, MAC_PROC_ENFORCE)) - return 0; + if (!mac_proc_check_enforce(curp, MAC_PROC_ENFORCE)) + return 0; cred = kauth_cred_proc_ref(curp); MAC_CHECK(proc_check_proc_info, cred, target, callnum, flavor); @@ 
-608,7 +603,6 @@ mac_proc_check_proc_info(proc_t curp, proc_t target, int callnum, int flavor) return (error); } - int mac_proc_check_get_cs_info(proc_t curp, proc_t target, unsigned int op) { diff --git a/security/mac_vfs.c b/security/mac_vfs.c index 429980b4e..7d7d6ea9a 100644 --- a/security/mac_vfs.c +++ b/security/mac_vfs.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007 Apple Inc. All rights reserved. + * Copyright (c) 2007-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -62,6 +62,8 @@ * */ +#include + #include #include #include @@ -75,6 +77,7 @@ #include #include #include +#include #include #include @@ -279,9 +282,9 @@ void mac_devfs_label_copy(struct label *src, struct label *dest) { #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_device_enforce) - return; + /* 21167099 - only check if we allow write */ + if (!mac_device_enforce) + return; #endif MAC_PERFORM(devfs_label_copy, src, dest); @@ -292,9 +295,9 @@ mac_devfs_label_update(struct mount *mp, struct devnode *de, struct vnode *vp) { #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_device_enforce) - return; + /* 21167099 - only check if we allow write */ + if (!mac_device_enforce) + return; #endif MAC_PERFORM(devfs_label_update, mp, de, de->dn_label, vp, @@ -309,9 +312,9 @@ mac_vnode_label_associate(struct mount *mp, struct vnode *vp, vfs_context_t ctx) int error = 0; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_vnode_enforce) - return (error); + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) + return (error); #endif /* XXX: should not inspect v_tag in kernel! 
*/ @@ -337,9 +340,9 @@ mac_vnode_label_associate_devfs(struct mount *mp, struct devnode *de, struct vnode *vp) { #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_device_enforce) - return; + /* 21167099 - only check if we allow write */ + if (!mac_device_enforce) + return; #endif MAC_PERFORM(vnode_label_associate_devfs, @@ -363,9 +366,9 @@ void mac_vnode_label_associate_singlelabel(struct mount *mp, struct vnode *vp) { #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_vnode_enforce) - return; + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) + return; #endif if (!mac_label_vnodes) return; @@ -382,9 +385,9 @@ mac_vnode_notify_create(vfs_context_t ctx, struct mount *mp, int error; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_vnode_enforce) - return (0); + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) + return (0); #endif if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) return (0); @@ -403,12 +406,12 @@ mac_vnode_notify_rename(vfs_context_t ctx, struct vnode *vp, kauth_cred_t cred; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_vnode_enforce) - return; + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) + return; #endif - if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) - return; + if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) + return; cred = vfs_context_ucred(ctx); MAC_PERFORM(vnode_notify_rename, cred, vp, vp->v_label, @@ -421,12 +424,12 @@ mac_vnode_notify_open(vfs_context_t ctx, struct vnode *vp, int acc_flags) kauth_cred_t cred; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_vnode_enforce) - return; + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) + return; #endif - if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) - return; + if 
(!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) + return; cred = vfs_context_ucred(ctx); MAC_PERFORM(vnode_notify_open, cred, vp, vp->v_label, acc_flags); @@ -439,17 +442,170 @@ mac_vnode_notify_link(vfs_context_t ctx, struct vnode *vp, kauth_cred_t cred; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_vnode_enforce) - return; + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) + return; #endif - if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) - return; + if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) + return; cred = vfs_context_ucred(ctx); MAC_PERFORM(vnode_notify_link, cred, dvp, dvp->v_label, vp, vp->v_label, cnp); } +void +mac_vnode_notify_deleteextattr(vfs_context_t ctx, struct vnode *vp, const char *name) +{ + kauth_cred_t cred; + +#if SECURITY_MAC_CHECK_ENFORCE + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) + return; +#endif + if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) + return; + + cred = vfs_context_ucred(ctx); + MAC_PERFORM(vnode_notify_deleteextattr, cred, vp, vp->v_label, name); +} + +void +mac_vnode_notify_setacl(vfs_context_t ctx, struct vnode *vp, struct kauth_acl *acl) +{ + kauth_cred_t cred; + +#if SECURITY_MAC_CHECK_ENFORCE + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) + return; +#endif + if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) + return; + + cred = vfs_context_ucred(ctx); + MAC_PERFORM(vnode_notify_setacl, cred, vp, vp->v_label, acl); +} + +void +mac_vnode_notify_setattrlist(vfs_context_t ctx, struct vnode *vp, struct attrlist *alist) +{ + kauth_cred_t cred; + +#if SECURITY_MAC_CHECK_ENFORCE + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) + return; +#endif + if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) + return; + + cred = vfs_context_ucred(ctx); + MAC_PERFORM(vnode_notify_setattrlist, cred, vp, vp->v_label, alist); +} + +void 
+mac_vnode_notify_setextattr(vfs_context_t ctx, struct vnode *vp, const char *name, struct uio *uio) +{ + kauth_cred_t cred; + +#if SECURITY_MAC_CHECK_ENFORCE + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) + return; +#endif + if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) + return; + + cred = vfs_context_ucred(ctx); + MAC_PERFORM(vnode_notify_setextattr, cred, vp, vp->v_label, name, uio); +} + +void +mac_vnode_notify_setflags(vfs_context_t ctx, struct vnode *vp, u_long flags) +{ + kauth_cred_t cred; + +#if SECURITY_MAC_CHECK_ENFORCE + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) + return; +#endif + if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) + return; + + cred = vfs_context_ucred(ctx); + MAC_PERFORM(vnode_notify_setflags, cred, vp, vp->v_label, flags); +} + +void +mac_vnode_notify_setmode(vfs_context_t ctx, struct vnode *vp, mode_t mode) +{ + kauth_cred_t cred; + +#if SECURITY_MAC_CHECK_ENFORCE + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) + return; +#endif + if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) + return; + + cred = vfs_context_ucred(ctx); + MAC_PERFORM(vnode_notify_setmode, cred, vp, vp->v_label, mode); +} + +void +mac_vnode_notify_setowner(vfs_context_t ctx, struct vnode *vp, uid_t uid, gid_t gid) +{ + kauth_cred_t cred; + +#if SECURITY_MAC_CHECK_ENFORCE + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) + return; +#endif + if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) + return; + + cred = vfs_context_ucred(ctx); + MAC_PERFORM(vnode_notify_setowner, cred, vp, vp->v_label, uid, gid); +} + +void +mac_vnode_notify_setutimes(vfs_context_t ctx, struct vnode *vp, struct timespec atime, struct timespec mtime) +{ + kauth_cred_t cred; + +#if SECURITY_MAC_CHECK_ENFORCE + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) + return; +#endif + if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) + 
return; + + cred = vfs_context_ucred(ctx); + MAC_PERFORM(vnode_notify_setutimes, cred, vp, vp->v_label, atime, mtime); +} + +void +mac_vnode_notify_truncate(vfs_context_t ctx, kauth_cred_t file_cred, struct vnode *vp) +{ + kauth_cred_t cred; + +#if SECURITY_MAC_CHECK_ENFORCE + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) + return; +#endif + if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) + return; + + cred = vfs_context_ucred(ctx); + MAC_PERFORM(vnode_notify_truncate, cred, file_cred, vp, vp->v_label); +} + /* * Extended attribute 'name' was updated via * vn_setxattr() or vn_removexattr(). Allow the @@ -462,9 +618,9 @@ mac_vnode_label_update_extattr(struct mount *mp, struct vnode *vp, int error = 0; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_vnode_enforce) - return; + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) + return; #endif if (!mac_label_vnodes) return; @@ -488,9 +644,9 @@ mac_vnode_label_store(vfs_context_t ctx, struct vnode *vp, int error; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_vnode_enforce) - return 0; + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) + return 0; #endif if (!mac_label_vnodes || !mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) @@ -513,9 +669,9 @@ mac_cred_label_update_execve(vfs_context_t ctx, kauth_cred_t new, struct vnode * posix_cred_t pcred = posix_cred_get(new); #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_proc_enforce || !mac_vnode_enforce) - return; + /* 21167099 - only check if we allow write */ + if (!mac_proc_enforce || !mac_vnode_enforce) + return; #endif /* mark the new cred to indicate "matching" includes the label */ @@ -581,9 +737,9 @@ mac_cred_check_label_update_execve(vfs_context_t ctx, struct vnode *vp, off_t of int result = 0; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we 
allow write */ - if (!mac_proc_enforce || !mac_vnode_enforce) - return result; + /* 21167099 - only check if we allow write */ + if (!mac_proc_enforce || !mac_vnode_enforce) + return result; #endif cred = vfs_context_ucred(ctx); @@ -642,9 +798,9 @@ mac_vnode_check_access(vfs_context_t ctx, struct vnode *vp, int mask; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_vnode_enforce) - return 0; + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) + return 0; #endif if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) return 0; @@ -663,12 +819,12 @@ mac_vnode_check_chdir(vfs_context_t ctx, struct vnode *dvp) int error; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_vnode_enforce) - return 0; + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) + return 0; #endif - if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) - return 0; + if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) + return 0; cred = vfs_context_ucred(ctx); MAC_CHECK(vnode_check_chdir, cred, dvp, dvp->v_label); @@ -683,18 +839,38 @@ mac_vnode_check_chroot(vfs_context_t ctx, struct vnode *dvp, int error; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_vnode_enforce) - return 0; + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) + return 0; #endif - if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) - return 0; + if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) + return 0; cred = vfs_context_ucred(ctx); MAC_CHECK(vnode_check_chroot, cred, dvp, dvp->v_label, cnp); return (error); } +int +mac_vnode_check_clone(vfs_context_t ctx, struct vnode *dvp, + struct vnode *vp, struct componentname *cnp) +{ + kauth_cred_t cred; + int error; + +#if SECURITY_MAC_CHECK_ENFORCE + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) + return 0; +#endif + if (!mac_context_check_enforce(ctx, 
MAC_VNODE_ENFORCE)) + return 0; + + cred = vfs_context_ucred(ctx); + MAC_CHECK(vnode_check_clone, cred, dvp, dvp->v_label, vp, + vp->v_label, cnp); + return (error); +} int mac_vnode_check_create(vfs_context_t ctx, struct vnode *dvp, struct componentname *cnp, struct vnode_attr *vap) @@ -703,12 +879,12 @@ mac_vnode_check_create(vfs_context_t ctx, struct vnode *dvp, int error; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_vnode_enforce) - return 0; + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) + return 0; #endif - if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) - return 0; + if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) + return 0; cred = vfs_context_ucred(ctx); MAC_CHECK(vnode_check_create, cred, dvp, dvp->v_label, cnp, vap); @@ -723,12 +899,12 @@ mac_vnode_check_unlink(vfs_context_t ctx, struct vnode *dvp, struct vnode *vp, int error; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_vnode_enforce) - return 0; + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) + return 0; #endif - if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) - return 0; + if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) + return 0; cred = vfs_context_ucred(ctx); MAC_CHECK(vnode_check_unlink, cred, dvp, dvp->v_label, vp, @@ -744,12 +920,12 @@ mac_vnode_check_deleteacl(vfs_context_t ctx, struct vnode *vp, int error; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_vnode_enforce) - return 0; + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) + return 0; #endif - if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) - return 0; + if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) + return 0; cred = vfs_context_ucred(ctx); MAC_CHECK(vnode_check_deleteacl, cred, vp, vp->v_label, type); @@ -765,12 +941,12 @@ mac_vnode_check_deleteextattr(vfs_context_t ctx, struct 
vnode *vp, int error; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_vnode_enforce) - return 0; + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) + return 0; #endif - if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) - return 0; + if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) + return 0; cred = vfs_context_ucred(ctx); MAC_CHECK(vnode_check_deleteextattr, cred, vp, vp->v_label, name); @@ -784,12 +960,12 @@ mac_vnode_check_exchangedata(vfs_context_t ctx, int error; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_vnode_enforce) - return 0; + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) + return 0; #endif - if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) - return 0; + if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) + return 0; cred = vfs_context_ucred(ctx); MAC_CHECK(vnode_check_exchangedata, cred, v1, v1->v_label, @@ -806,12 +982,12 @@ mac_vnode_check_getacl(vfs_context_t ctx, struct vnode *vp, acl_type_t type) int error; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_vnode_enforce) - return 0; + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) + return 0; #endif - if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) - return 0; + if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) + return 0; cred = vfs_context_ucred(ctx); MAC_CHECK(vnode_check_getacl, cred, vp, vp->v_label, type); @@ -827,12 +1003,12 @@ mac_vnode_check_getattrlist(vfs_context_t ctx, struct vnode *vp, int error; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_vnode_enforce) - return 0; + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) + return 0; #endif - if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) - return 0; + if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) + return 0; cred = 
vfs_context_ucred(ctx); MAC_CHECK(vnode_check_getattrlist, cred, vp, vp->v_label, alist); @@ -849,9 +1025,9 @@ mac_vnode_check_exec(vfs_context_t ctx, struct vnode *vp, int error = 0; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_proc_enforce || !mac_vnode_enforce) - return 0; + /* 21167099 - only check if we allow write */ + if (!mac_proc_enforce || !mac_vnode_enforce) + return 0; #endif cred = vfs_context_ucred(ctx); @@ -916,12 +1092,12 @@ mac_vnode_check_fsgetpath(vfs_context_t ctx, struct vnode *vp) int error; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_vnode_enforce) - return 0; + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) + return 0; #endif - if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) - return 0; + if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) + return 0; cred = vfs_context_ucred(ctx); MAC_CHECK(vnode_check_fsgetpath, cred, vp, vp->v_label); @@ -929,23 +1105,112 @@ mac_vnode_check_fsgetpath(vfs_context_t ctx, struct vnode *vp) } int -mac_vnode_check_signature(struct vnode *vp, off_t macho_offset, - unsigned char *sha1, - const void *signature, size_t size, - int flags, int *is_platform_binary) -{ - int error; - -#if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_proc_enforce || !mac_vnode_enforce) - return 0; -#endif - - MAC_CHECK(vnode_check_signature, vp, vp->v_label, macho_offset, sha1, - signature, size, - flags, is_platform_binary); - return (error); +mac_vnode_check_signature(struct vnode *vp, struct cs_blob *cs_blob, + struct image_params *imgp, + unsigned int *cs_flags, int flags) +{ + int error; + char *fatal_failure_desc = NULL; + size_t fatal_failure_desc_len = 0; + + char *vn_path = NULL; + vm_size_t vn_pathlen = MAXPATHLEN; + + +#if SECURITY_MAC_CHECK_ENFORCE + /* 21167099 - only check if we allow write */ + if (!mac_proc_enforce || !mac_vnode_enforce) + return 0; 
+#endif + + MAC_CHECK(vnode_check_signature, vp, vp->v_label, cs_blob, + cs_flags, flags, &fatal_failure_desc, &fatal_failure_desc_len); + + if (fatal_failure_desc_len) { + // A fatal code signature validation failure occurred, formulate a crash + // reason. + + char const *path = NULL; + + vn_path = (char *)kalloc(MAXPATHLEN); + if (vn_path != NULL) { + if (vn_getpath(vp, vn_path, (int*)&vn_pathlen) == 0) { + path = vn_path; + } else { + path = "(get vnode path failed)"; + } + } else { + path = "(path alloc failed)"; + } + + if (error == 0) { + panic("mac_vnode_check_signature: MAC hook returned no error, " + "but status is claimed to be fatal? " + "path: '%s', fatal_failure_desc_len: %ld, fatal_failure_desc:\n%s\n", + path, fatal_failure_desc_len, fatal_failure_desc); + } + + printf("mac_vnode_check_signature: %s: code signature validation failed fatally: %s", + path, fatal_failure_desc); + + if (imgp == NULL) { + goto out; + } + + os_reason_t reason = os_reason_create(OS_REASON_CODESIGNING, + CODESIGNING_EXIT_REASON_TASKGATED_INVALID_SIG); + + if (reason == OS_REASON_NULL) { + printf("mac_vnode_check_signature: %s: failure to allocate exit reason for validation failure: %s\n", + path, fatal_failure_desc); + goto out; + } + + imgp->ip_cs_error = reason; + reason->osr_flags = (OS_REASON_FLAG_GENERATE_CRASH_REPORT | + OS_REASON_FLAG_CONSISTENT_FAILURE); + + if (fatal_failure_desc == NULL) { + // This may happen if allocation for the buffer failed. 
+ printf("mac_vnode_check_signature: %s: fatal failure is missing its description.\n", path); + } else { + mach_vm_address_t data_addr = 0; + + int reason_error = 0; + int kcdata_error = 0; + + if ((reason_error = os_reason_alloc_buffer(reason, kcdata_estimate_required_buffer_size + (1, fatal_failure_desc_len))) == 0 && + (kcdata_error = kcdata_get_memory_addr(&reason->osr_kcd_descriptor, + EXIT_REASON_USER_DESC, fatal_failure_desc_len, + &data_addr)) == KERN_SUCCESS) { + kern_return_t mc_error = kcdata_memcpy(&reason->osr_kcd_descriptor, (mach_vm_address_t)data_addr, + fatal_failure_desc, fatal_failure_desc_len); + + if (mc_error != KERN_SUCCESS) { + printf("mac_vnode_check_signature: %s: failed to copy reason string " + "(kcdata_memcpy error: %d, length: %ld)\n", + path, mc_error, fatal_failure_desc_len); + } + } else { + printf("mac_vnode_check_signature: %s: failed to allocate space for reason string " + "(os_reason_alloc_buffer error: %d, kcdata error: %d, length: %ld)\n", + path, reason_error, kcdata_error, fatal_failure_desc_len); + } + + } + } + +out: + if (vn_path) { + kfree(vn_path, MAXPATHLEN); + } + + if (fatal_failure_desc_len > 0 && fatal_failure_desc != NULL) { + kfree(fatal_failure_desc, fatal_failure_desc_len); + } + + return (error); } #if 0 @@ -956,12 +1221,12 @@ mac_vnode_check_getacl(vfs_context_t ctx, struct vnode *vp, acl_type_t type) int error; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_vnode_enforce) - return 0; + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) + return 0; #endif - if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) - return 0; + if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) + return 0; cred = vfs_context_ucred(ctx); MAC_CHECK(vnode_check_getacl, cred, vp, vp->v_label, type); @@ -977,12 +1242,12 @@ mac_vnode_check_getextattr(vfs_context_t ctx, struct vnode *vp, int error; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow 
write */ - if (!mac_vnode_enforce) - return 0; + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) + return 0; #endif - if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) - return 0; + if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) + return 0; cred = vfs_context_ucred(ctx); MAC_CHECK(vnode_check_getextattr, cred, vp, vp->v_label, @@ -997,12 +1262,12 @@ mac_vnode_check_ioctl(vfs_context_t ctx, struct vnode *vp, u_int cmd) int error; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_vnode_enforce) - return 0; + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) + return 0; #endif - if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) - return 0; + if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) + return 0; cred = vfs_context_ucred(ctx); MAC_CHECK(vnode_check_ioctl, cred, vp, vp->v_label, cmd); @@ -1017,12 +1282,12 @@ mac_vnode_check_kqfilter(vfs_context_t ctx, kauth_cred_t file_cred, int error; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_vnode_enforce) - return 0; + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) + return 0; #endif - if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) - return 0; + if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) + return 0; cred = vfs_context_ucred(ctx); MAC_CHECK(vnode_check_kqfilter, cred, file_cred, kn, vp, @@ -1039,12 +1304,12 @@ mac_vnode_check_link(vfs_context_t ctx, struct vnode *dvp, int error; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_vnode_enforce) - return 0; + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) + return 0; #endif - if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) - return 0; + if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) + return 0; cred = vfs_context_ucred(ctx); MAC_CHECK(vnode_check_link, cred, dvp, dvp->v_label, vp, @@ -1059,12 
+1324,12 @@ mac_vnode_check_listextattr(vfs_context_t ctx, struct vnode *vp) int error; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_vnode_enforce) - return 0; + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) + return 0; #endif - if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) - return 0; + if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) + return 0; cred = vfs_context_ucred(ctx); MAC_CHECK(vnode_check_listextattr, cred, vp, vp->v_label); @@ -1079,12 +1344,12 @@ mac_vnode_check_lookup(vfs_context_t ctx, struct vnode *dvp, int error; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_vnode_enforce) - return 0; + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) + return 0; #endif - if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) - return 0; + if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) + return 0; cred = vfs_context_ucred(ctx); MAC_CHECK(vnode_check_lookup, cred, dvp, dvp->v_label, cnp); @@ -1098,12 +1363,12 @@ mac_vnode_check_open(vfs_context_t ctx, struct vnode *vp, int acc_mode) int error; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_vnode_enforce) - return 0; + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) + return 0; #endif - if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) - return 0; + if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) + return 0; cred = vfs_context_ucred(ctx); MAC_CHECK(vnode_check_open, cred, vp, vp->v_label, acc_mode); @@ -1118,12 +1383,12 @@ mac_vnode_check_read(vfs_context_t ctx, struct ucred *file_cred, int error; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_vnode_enforce) - return 0; + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) + return 0; #endif - if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) - return 0; + 
if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) + return 0; cred = vfs_context_ucred(ctx); MAC_CHECK(vnode_check_read, cred, file_cred, vp, @@ -1139,12 +1404,12 @@ mac_vnode_check_readdir(vfs_context_t ctx, struct vnode *dvp) int error; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_vnode_enforce) - return 0; + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) + return 0; #endif - if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) - return 0; + if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) + return 0; cred = vfs_context_ucred(ctx); MAC_CHECK(vnode_check_readdir, cred, dvp, dvp->v_label); @@ -1158,12 +1423,12 @@ mac_vnode_check_readlink(vfs_context_t ctx, struct vnode *vp) int error; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_vnode_enforce) - return 0; + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) + return 0; #endif - if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) - return 0; + if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) + return 0; cred = vfs_context_ucred(ctx); MAC_CHECK(vnode_check_readlink, cred, vp, vp->v_label); @@ -1178,12 +1443,12 @@ mac_vnode_check_label_update(vfs_context_t ctx, struct vnode *vp, int error; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_vnode_enforce) - return 0; + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) + return 0; #endif - if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) - return 0; + if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) + return 0; cred = vfs_context_ucred(ctx); MAC_CHECK(vnode_check_label_update, cred, vp, vp->v_label, newlabel); @@ -1200,12 +1465,12 @@ mac_vnode_check_rename(vfs_context_t ctx, struct vnode *dvp, int error; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_vnode_enforce) - return 0; + /* 21167099 - 
only check if we allow write */ + if (!mac_vnode_enforce) + return 0; #endif - if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) - return 0; + if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) + return 0; cred = vfs_context_ucred(ctx); @@ -1232,12 +1497,12 @@ mac_vnode_check_revoke(vfs_context_t ctx, struct vnode *vp) int error; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_vnode_enforce) - return 0; + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) + return 0; #endif - if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) - return 0; + if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) + return 0; cred = vfs_context_ucred(ctx); MAC_CHECK(vnode_check_revoke, cred, vp, vp->v_label); @@ -1251,12 +1516,12 @@ mac_vnode_check_searchfs(vfs_context_t ctx, struct vnode *vp, struct attrlist *a int error; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_vnode_enforce) - return 0; + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) + return 0; #endif - if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) - return 0; + if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) + return 0; cred = vfs_context_ucred(ctx); MAC_CHECK(vnode_check_searchfs, cred, vp, vp->v_label, alist); @@ -1270,39 +1535,37 @@ mac_vnode_check_select(vfs_context_t ctx, struct vnode *vp, int which) int error; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_vnode_enforce) - return 0; + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) + return 0; #endif - if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) - return 0; + if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) + return 0; cred = vfs_context_ucred(ctx); MAC_CHECK(vnode_check_select, cred, vp, vp->v_label, which); return (error); } -#if 0 int -mac_vnode_check_setacl(vfs_context_t ctx, struct vnode *vp, acl_type_t type, - 
struct acl *acl) +mac_vnode_check_setacl(vfs_context_t ctx, struct vnode *vp, + struct kauth_acl *acl) { kauth_cred_t cred; int error; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_vnode_enforce) - return 0; + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) + return 0; #endif - if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) - return 0; + if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) + return 0; cred = vfs_context_ucred(ctx); - MAC_CHECK(vnode_check_setacl, cred, vp, vp->v_label, type, acl); + MAC_CHECK(vnode_check_setacl, cred, vp, vp->v_label, acl); return (error); } -#endif int mac_vnode_check_setattrlist(vfs_context_t ctx, struct vnode *vp, @@ -1312,12 +1575,12 @@ mac_vnode_check_setattrlist(vfs_context_t ctx, struct vnode *vp, int error; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_vnode_enforce) - return 0; + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) + return 0; #endif - if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) - return 0; + if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) + return 0; cred = vfs_context_ucred(ctx); MAC_CHECK(vnode_check_setattrlist, cred, vp, vp->v_label, alist); @@ -1332,12 +1595,12 @@ mac_vnode_check_setextattr(vfs_context_t ctx, struct vnode *vp, int error; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_vnode_enforce) - return 0; + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) + return 0; #endif - if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) - return 0; + if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) + return 0; cred = vfs_context_ucred(ctx); MAC_CHECK(vnode_check_setextattr, cred, vp, vp->v_label, @@ -1352,12 +1615,12 @@ mac_vnode_check_setflags(vfs_context_t ctx, struct vnode *vp, u_long flags) int error; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we 
allow write */ - if (!mac_vnode_enforce) - return 0; + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) + return 0; #endif - if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) - return 0; + if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) + return 0; cred = vfs_context_ucred(ctx); MAC_CHECK(vnode_check_setflags, cred, vp, vp->v_label, flags); @@ -1371,12 +1634,12 @@ mac_vnode_check_setmode(vfs_context_t ctx, struct vnode *vp, mode_t mode) int error; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_vnode_enforce) - return 0; + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) + return 0; #endif - if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) - return 0; + if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) + return 0; cred = vfs_context_ucred(ctx); MAC_CHECK(vnode_check_setmode, cred, vp, vp->v_label, mode); @@ -1391,12 +1654,12 @@ mac_vnode_check_setowner(vfs_context_t ctx, struct vnode *vp, uid_t uid, int error; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_vnode_enforce) - return 0; + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) + return 0; #endif - if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) - return 0; + if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) + return 0; cred = vfs_context_ucred(ctx); MAC_CHECK(vnode_check_setowner, cred, vp, vp->v_label, uid, gid); @@ -1411,12 +1674,12 @@ mac_vnode_check_setutimes(vfs_context_t ctx, struct vnode *vp, int error; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_vnode_enforce) - return 0; + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) + return 0; #endif - if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) - return 0; + if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) + return 0; cred = vfs_context_ucred(ctx); MAC_CHECK(vnode_check_setutimes, 
cred, vp, vp->v_label, atime, @@ -1432,12 +1695,12 @@ mac_vnode_check_stat(vfs_context_t ctx, struct ucred *file_cred, int error; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_vnode_enforce) - return 0; + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) + return 0; #endif - if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) - return 0; + if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) + return 0; cred = vfs_context_ucred(ctx); MAC_CHECK(vnode_check_stat, cred, file_cred, vp, @@ -1453,12 +1716,12 @@ mac_vnode_check_truncate(vfs_context_t ctx, struct ucred *file_cred, int error; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_vnode_enforce) - return 0; + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) + return 0; #endif - if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) - return 0; + if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) + return 0; cred = vfs_context_ucred(ctx); MAC_CHECK(vnode_check_truncate, cred, file_cred, vp, @@ -1475,9 +1738,9 @@ mac_vnode_check_write(vfs_context_t ctx, struct ucred *file_cred, int error; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_vnode_enforce) - return 0; + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) + return 0; #endif if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) return 0; @@ -1496,12 +1759,12 @@ mac_vnode_check_uipc_bind(vfs_context_t ctx, struct vnode *dvp, int error; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_vnode_enforce) - return 0; + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) + return 0; #endif - if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) - return 0; + if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) + return 0; cred = vfs_context_ucred(ctx); MAC_CHECK(vnode_check_uipc_bind, cred, dvp, 
dvp->v_label, cnp, vap); @@ -1509,21 +1772,21 @@ mac_vnode_check_uipc_bind(vfs_context_t ctx, struct vnode *dvp, } int -mac_vnode_check_uipc_connect(vfs_context_t ctx, struct vnode *vp) +mac_vnode_check_uipc_connect(vfs_context_t ctx, struct vnode *vp, struct socket *so) { kauth_cred_t cred; int error; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_vnode_enforce) - return 0; + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) + return 0; #endif - if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) - return 0; + if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) + return 0; cred = vfs_context_ucred(ctx); - MAC_CHECK(vnode_check_uipc_connect, cred, vp, vp->v_label); + MAC_CHECK(vnode_check_uipc_connect, cred, vp, vp->v_label, (socket_t) so); return (error); } @@ -1557,9 +1820,9 @@ mac_vnode_find_sigs(struct proc *p, struct vnode *vp, off_t offset) int error; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_proc_enforce || !mac_vnode_enforce) - return 0; + /* 21167099 - only check if we allow write */ + if (!mac_proc_enforce || !mac_vnode_enforce) + return 0; #endif MAC_CHECK(vnode_find_sigs, p, vp, offset, vp->v_label); @@ -1624,12 +1887,12 @@ mac_mount_check_mount(vfs_context_t ctx, struct vnode *vp, int error; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_vnode_enforce) - return 0; + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) + return 0; #endif - if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) - return 0; + if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) + return 0; cred = vfs_context_ucred(ctx); MAC_CHECK(mount_check_mount, cred, vp, vp->v_label, cnp, vfc_name); @@ -1637,6 +1900,46 @@ mac_mount_check_mount(vfs_context_t ctx, struct vnode *vp, return (error); } +int +mac_mount_check_snapshot_create(vfs_context_t ctx, struct mount *mp, + const char *name) +{ + 
kauth_cred_t cred; + int error; + +#if SECURITY_MAC_CHECK_ENFORCE + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) + return 0; +#endif + if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) + return 0; + + cred = vfs_context_ucred(ctx); + MAC_CHECK(mount_check_snapshot_create, cred, mp, name); + return (error); +} + +int +mac_mount_check_snapshot_delete(vfs_context_t ctx, struct mount *mp, + const char *name) +{ + kauth_cred_t cred; + int error; + +#if SECURITY_MAC_CHECK_ENFORCE + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) + return 0; +#endif + if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) + return 0; + + cred = vfs_context_ucred(ctx); + MAC_CHECK(mount_check_snapshot_delete, cred, mp, name); + return (error); +} + int mac_mount_check_remount(vfs_context_t ctx, struct mount *mp) { @@ -1644,12 +1947,12 @@ mac_mount_check_remount(vfs_context_t ctx, struct mount *mp) int error; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_vnode_enforce) - return 0; + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) + return 0; #endif - if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) - return 0; + if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) + return 0; cred = vfs_context_ucred(ctx); MAC_CHECK(mount_check_remount, cred, mp, mp->mnt_mntlabel); @@ -1664,12 +1967,12 @@ mac_mount_check_umount(vfs_context_t ctx, struct mount *mp) int error; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_vnode_enforce) - return 0; + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) + return 0; #endif - if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) - return 0; + if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) + return 0; cred = vfs_context_ucred(ctx); MAC_CHECK(mount_check_umount, cred, mp, mp->mnt_mntlabel); @@ -1685,12 +1988,12 @@ mac_mount_check_getattr(vfs_context_t ctx, 
struct mount *mp, int error; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_vnode_enforce) - return 0; + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) + return 0; #endif - if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) - return 0; + if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) + return 0; cred = vfs_context_ucred(ctx); MAC_CHECK(mount_check_getattr, cred, mp, mp->mnt_mntlabel, vfa); @@ -1705,12 +2008,12 @@ mac_mount_check_setattr(vfs_context_t ctx, struct mount *mp, int error; #if SECURITY_MAC_CHECK_ENFORCE - /* 21167099 - only check if we allow write */ - if (!mac_vnode_enforce) - return 0; + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) + return 0; #endif - if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) - return 0; + if (!mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) + return 0; cred = vfs_context_ucred(ctx); MAC_CHECK(mount_check_setattr, cred, mp, mp->mnt_mntlabel, vfa); @@ -1929,6 +2232,8 @@ mac_vnode_label_associate_fdesc(struct mount *mp, struct fdescnode *fnp, break; case DTYPE_KQUEUE: case DTYPE_FSEVENTS: + case DTYPE_ATALK: + case DTYPE_NETPOLICY: default: MAC_PERFORM(vnode_label_associate_file, vfs_context_ucred(ctx), mp, mp->mnt_mntlabel, fp->f_fglob, fp->f_fglob->fg_label, diff --git a/tools/Makefile b/tools/Makefile index 95a2076fb..c72cf78cb 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -3,7 +3,6 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - include $(MakeInc_cmd) include $(MakeInc_def) diff --git a/tools/lldbmacros/Makefile b/tools/lldbmacros/Makefile index e1fe38080..8a074e2d4 100644 --- a/tools/lldbmacros/Makefile +++ b/tools/lldbmacros/Makefile @@ -3,7 +3,6 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export 
MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - include $(MakeInc_cmd) include $(MakeInc_def) @@ -16,6 +15,13 @@ LLDBMACROS_BOOTSTRAP_DEST:=$(OBJPATH)/$(KERNEL_FILE_NAME).dSYM/$(DSYMLLDBMACROSD LLDBMACROS_DEST:=$(LLDBMACROS_BOOTSTRAP_DEST)/lldbmacros/ LLDBMACROS_USERDEBUG_FILES= +LLDBMACROS_USERDEBUG_FILES:= \ + usertaskdebugging/__init__.py \ + usertaskdebugging/gdbserver.py \ + usertaskdebugging/interface.py \ + usertaskdebugging/rsprotocol.py \ + usertaskdebugging/target.py \ + usertaskdebugging/userprocess.py LLDBMACROS_PYTHON_FILES = $(LLDBMACROS_USERDEBUG_FILES) \ core/standard.py \ @@ -33,6 +39,8 @@ LLDBMACROS_PYTHON_FILES = $(LLDBMACROS_USERDEBUG_FILES) \ bank.py \ xnu.py \ xnudefines.py \ + ktrace.py \ + macho.py \ mbufdefines.py \ netdefines.py \ routedefines.py \ @@ -55,7 +63,9 @@ LLDBMACROS_PYTHON_FILES = $(LLDBMACROS_USERDEBUG_FILES) \ apic.py \ kauth.py \ usertaskgdbserver.py \ - waitq.py + waitq.py \ + pgtrace.py \ + xnutriage.py ifneq ($(PLATFORM),MacOSX) LLDBMACROS_PYTHON_FILES+= \ @@ -65,7 +75,7 @@ endif INSTALL_LLDBMACROS_PYTHON_FILES=$(addprefix $(LLDBMACROS_DEST), $(LLDBMACROS_PYTHON_FILES)) -$(INSTALL_LLDBMACROS_PYTHON_FILES): $(LLDBMACROS_DEST)% : $(LLDBMACROS_SOURCE)% +$(INSTALL_LLDBMACROS_PYTHON_FILES): $(LLDBMACROS_DEST)% : $(LLDBMACROS_SOURCE)% $(_v)$(MKDIR) $(dir $@) $(_v)$(PYTHON) $(LLDBMACROS_SOURCE)/core/syntax_checker.py $< $(_vstdout) $(_v)$(INSTALL) $(DATA_INSTALL_FLAGS) $< $@ diff --git a/tools/lldbmacros/README b/tools/lldbmacros/README deleted file mode 100644 index cd77f789d..000000000 --- a/tools/lldbmacros/README +++ /dev/null @@ -1,356 +0,0 @@ -Table of Contents -================= -A. How to use lldb for kernel debugging -B. Design of lldb kernel debugging platform. -C. Kernel debugging commands. - i. Using commands. - ii. Writing new commands. -D. Kernel type summaries. - i. Using summaries - ii. Writing new summary functions -E. FAQ and Generel Coding Guidelines - i. Frequently Asked Questions - ii. 
Formatted Output printing guidelines [MUST READ] - iii. Coding conventions. [MUST READ] - iv. Submitting changes in lldbmacros [MUST READ] - v. Common utility functions and paradigms -F. Development and Debugging on lldb kernel debugging platform. - i. Reading a exception backtrace - ii. Loading custom or local lldbmacros and operating_system plugin - iii. Adding debug related 'printf's - - -======================================== -A. How to use lldb for kernel debugging -======================================== -lldb can be used for kernel debugging the same way as gdb. The simplest way is to start lldb with kernel symbol file. The lldb environment by default does not allow loading automatic python modules. Please add the following setting in -File: ~/.lldbinit -settings set target.load-script-from-symbol-file true - -Now lldb will be ready to connect over kdp-remote '' or 'gdb-remote '. In case using a core file please do 'file --core /path/to/corefile' - -Following are detailed steps on how to debug a panic'ed / NMI'ed machine (For the curious souls). - lldb debugging in detail:- - * start lldb with the right symbols file. If you do not know the version apriori, then enable dsymForUUID to load symbols dynamically. - bash$ dsymForUUID --enable - bash$ lldb /path/to/mach_kernel.symbols - Current executable set to '/Sources/Symbols/xnu/xnu-2253~2/mach_kernel' (x86_64). - (lldb) - * connect to remote device or load a core file - # for kdp - (lldb) process connect --plugin kdp-remote udp://17.123.45.67:41139 - # for gdb (eg with astris) - (lldb) process connect --plugin gdb-remote gdb://17.123.45.67:8000 - # for loading a core file - (lldb) file --core /path/to/core/file /path/to/kernel_symbol_file - * Once connected you can debug with basic lldb commands like print, bt, expr etc. The xnu debug macros will also be loaded automatically from the dSYM files. 
- In case if you are working with older kernel files you can load kernel specific commands by doing - - (lldb) command script import /path/to/xnu/tools/lldbmacros/xnu.py - (lldb) showbootargs - debug=0x14e ncpus=2 - * You can do 'kgmhelp' to get a list of commands available through xnu.py - -SPECIAL: The xnu.py script brings in kernel type summary functions. To enable these please do - - (lldb) showlldbtypesummaries -These could be very handy in printing important information from structures easily. -For ex. - (lldb) print (thread_t)0x80d6a620 - (thread_t) $45 = 0x80d6a620 - thread thread_id processor pri io_policy state wait_queue wait_event wmesg thread_name - 0x80d6a620 0x317 0x902078c8 61 W 0x910cadd4 0x0 SystemSoundServer - - -============================================= -B. Design of lldb kernel debugging platform. -============================================= -The lldb debugger provides python scripting bridge for customizing commands and summaries in lldb. Following is the stack of platforms and how commands and summaries interact with it. - - |------- xnu scripts ----------| - | |- lldb Command/Scripting-| | <-- provides scriptability for kernel data structures through summary/command invocation. - | | |--lldb core--| | | <-- interacts with remote kernel or corefile. - | |-------------------------| | - |------------------------------| - - The xnu script in xnu/tools/lldbmacros provides the following: - * Custom functions to do plumbing of lldb command invocation to python function call. (see doc strings for @lldb_command) - The command interface provides some common features (which can be invoked after passing '--' on cmd line) like - - i. send the output of command to file on disk - ii. search for a string in the output and selectively print the line containing it. - iii. -v options to increase verbosity levels in commands. 
- For example: (lldb)showalltasks -- -s kernel_task --o /tmp/kernel_task.output -v - will show task summary output with lines matching string 'kernel_task' into a file /tmp/kernel_task.output and with a verbosity level of (default +1) - - * Customization for plugging in summary functions for lldb type summaries. (see doc strings for @lldb_summary) - It will automatically register give types with the functions withing the kernel category. - - * Ability to register test cases for macros (see doc strings for @xnudebug_test). - -The file layout is like following -xnu/ - |-tools/ - |-lldb/ - |-core/ # Core logic about kernel, lldb value abstraction, configs etc. **DO NOT TOUCH THIS DIR** - |-plugins/ # Holds plugins for kernel commands. - |-xnu.py # xnu debug framework along with kgmhelp, xnudebug commands. - |-xnudefines.py - |-utils.py - |-process.py # files containing commands/summaries code for each subsystem - |-... - -The lldbmacros directory has a Makefile that follows the build process for xnu. This packages lldbmacros scripts into the dSYM of each kernel build. This helps in rev-locking the lldb commands with changes in kernel sources. - -============================== -C. Kernel debugging commands. -============================== -i. Using commands. ------------------- -Using xnu debug commands is very similar to kgmacros in gdb. You can use 'kgmhelp' to get a listing of available commands. -If you need detailed help for a command please type 'help ' and the documentation for the command will be displayed. -ex. - (lldb) help pmap_walk - Perform a page-table walk in for . - You can pass -- -v for verbose output. To increase the verbosity add more -v args after the '--'. - Syntax: pmap_walk - -The basic format for every command provided under kgmhelp is like follows -(lldb) command_name [cmd_args..] 
[-CMDOPTIONS] [-xnuoptions] -where: - command_name : name of command as registed using the @lldb_command decorator and described in 'kgmhelp' - cmd_args : shell like arguments that are passed as is to the registered python function. - If there is error in these arguments than the implementor may display according error message. - xnuoptions : common options for stream based operations on the output of command_name. - Allowed options are - -h : show help string of a command - -s : print only the lines matching - -o : direct the output of command to . Will not display anything on terminal - -v : increase the verbosity of the command. Each '-v' encountered will increase verbosity by 1. - -p : pass the output of command to for processing and followup with command requests by it. - CMDOPTIONS : These are command level options (always a CAPITAL letter option) that are defined by the macro developer. Please do - help to know how each option operates on that particular command. For an example of how to use CMDOPTIONS, take a look at vm_object_walk_pages in memory.py - -ii. Writing new commands. --------------------------- - The python modules are designed in such a way that the command from lldb invokes a python function with the arguments passed at lldb prompt. - It is recommended that you do a decoupled development for command interface and core utility function so that any function/code can - called as a simple util function and get the same output. i.e. - (lldb)showtask 0xabcdef000 is same as python >>> GetTaskSummary(0xabcdef000) or equivalent - - Following is a step by step guideline on how to add a new command ( e.g showtaskvme ). [extra tip: Always good idea to wrap your macro code within # Macro: , # EndMacro.] - 1. register a command to a function. Use the lldb_command decorator to map a 'command_name' to a function. Optionally you can provide getopt compatible option string for customizing your command invocation. Note: Only CAPITAL letter options are allowed. 
lowercase options are reserved for the framework level features. - 2. Immediately after the register define the function to handle the command invocation. The signature is always like Abc(cmd_args=None, cmd_options={}) - 3. Add documentation for Abc(). This is very important for lldb to show help for each command. [ Follow the guidelines above with documentation ] - 4. Use cmd_args array to get args passed on command. For example a command like "showtaskvme 0xabcdef00" will put have cmd_args=['0xabcdef00'] - - note that we use core.value class as an interface to underlying C structures. Refer [Section B] for more details. - - use kern.globals. & kern.GetValueFromAddress for building values from addresses. - - remember that the ideal type of object to be passed around is core.value - - Anything you 'print' will be relayed to lldb terminal output. - 5. If the user has passed any custom options they would be in cmd_options dict. the format is {'-':''}. The will be '' (empty string) for non-option flags. - 6. If your function finds issue with the passed argument then you can raise ArgumentError('error_message') to notify the user. The framework will automatically catch this and show appropriate help using the function doc string. - - Time for some code example? Try reading the code for function ShowTaskVmeHelper in memory.py. - -SPECIAL Note: Very often you will find yourself making changes to a file for some command/summary and would like to test it out in lldb. -To easily reload your changes in lldb please follow the below example. - * you fire up lldb and start using zprint. And soon you need to add functionality to zprint. - * you happily change a function code in memory.py file to zprint macro. 
- * now to reload that particular changes without killing your debug session do - (lldb) xnudebug reload memory - memory is reloaded from ./memory.py - (lldb) - - It is very important that you do reload using xnudebug command as it does the plumbing of commands and types for your change in the module. Otherwise you could easily get confused - why your changes are not reflected in the command. - - -========================== -D. Kernel type summaries. -========================== -i. Using summaries ------------------- - The lldb debugger provides ways for user to customize how a particular type of object be decsribed when printed. These are very useful in displaying complex and large structures - where only certain fields are important based on some flag or value in some field or variable. The way it works is every time lldb wants to print an object it checks - for regisetered summaries. We can define python functions and hook it up with lldb as callbacks for type summaries. - For example. - (lldb) print first_zone - (zone_t) $49 = 0xd007c000 - ZONE TOT_SZ ALLOC_ELTS FREE_ELTS FREE_SZ ELT_SZ ALLOC(ELTS PGS SLK) FLAGS NAME - 0x00000000d007c000 29808 182 25 3600 144 4096 28 1 64 X$ zones - (lldb) - Just printing the value of first_zone as (zone_t) 0xd007c000 wouldnt have been much help. But with the registered summary for zone_t we can see all the interesting info easily. - - You do not need to do anything special to use summaries. Once they are registered with lldb they show info automatically when printing objects. However if you wish to - see all the registered type summaries run the command 'type summary list -w kernel' on lldb prompt. - Also if you wish to quickly disable the summaries for a particular command use the 'showraw' command. - -ii. Writing new summary functions ---------------------------------- -lldb provides really flexible interface for building summaries for complex objects and data. 
If you find that a struct or list can be -diagnosed better if displayed differently, then feel free to add a type summary for that type. Following is an easy guide on how to do that. - - 1. Register a function as a callback for displaying information for a type. Use the @lldb_type_summary() decorator with an array of types you wish to register for callback - 2. Provide a header for the summary using @header() decorator. This is a strong requirement for summaries. This gets displayed before the output - of GetTypeSummary() is displayed. [In case you do not wish to have header then still define it as "" (empty string) ] - 3. Define the function with signature of GetSomeTypeSummary(valobj). It is highly recommended that the naming be consistent to Get.*?Summary(valobj) - The valobj argument holds the core.value object for display. - 4. Use the utility functions and memory read operations to pull out the required information. - [ use kern.globals & kern.GetValueFromAddress for building args to core functions. ] - [ remember that the ideal type of object to be passed around is core.value ] - 5. return a string that would be printed by the caller. When lldb makes a call back it expects a str to be returned. So do not print - directly out to console. [ debug info or logs output is okay to be printed anywhere :) ] - -Time for some code example? Try reading the code for GetTaskSummary() in process.py. - - -====================================== -E. FAQs and Generel Coding Guidelines -====================================== - -i. Frequently Asked Questions ------------------------------ - - Q. How do I avoid printing the summary and see the actual data in a structure? - A. There is a command called 'showraw'. This will disable all kernel specific type summaries and execute any command you provide. - Ex. 
- (lldb) print (thread_t) 0x80d6a620 - (thread_t) $45 = 0x80d6a620 - thread thread_id processor pri io_policy state wait_queue wait_event wmesg thread_name - 0x80d6a620 0x317 0x902078c8 61 W 0x910cadd4 0x0 SystemSoundServer - (lldb) showraw print (thread_t) 0x80d6a620 - (thread_t) $48 = 0x80d6a620 - - Q. I typed 'showallvnodes' and nothing happens for a long time? OR How do I get output of long running command instantly on the terminal? - A. The lldb command interface tries to build result object from output of a python function. So in case of functions with very long output or runtime it may - seem that the lldb process is hung. But it is not. You can use "-i" option to get immediate output on terminal. - ex. (lldb) showallvnodes -- -i - Immediate Output - .... - - Q. I made a change in a python file for a command or summary, but the output is not reflected in the lldb command? - A. The python framework does not allow for removing a loaded module and then reloading it. So sometimes if a command has a cached value from - old code that it will still call the old function and hence will not display new changes in file on disk. If you find yourself in such a situation - please see [Section C. -> SPECIAL Note]. If the change is to basic class or caching mechanism than it is advised to quit lldb and re-load all modules again. - - Q. I am new to python. I get an error message that I do not understand. what should I do? - A. The syntax for python is different from conventional programming languages. If you get any message with SyntaxError or TypeError or ValueError then please - review your code and look for common errors like - - wrong level of indentation? - - missed a ':' at the end of an if, elif, for, while statement? - - referencing a key in dictionary that doesnt exist? You might see KeyError in such cases. - - mistakenly used python reserved keyword as variable? 
(check http://docs.python.org/release/3.0.1/reference/lexical_analysis.html#id8) - - Trying to modify a string value? You can only create new strings but never modify existing ones. - - Trying to add a non string value to a string? This typically happens in print "time is " + gettime(). here gettime() returns int and not str. - - using a local variable with same name as global variable? - - assigning a value to global variable without declaring first? Its highly recommended to always declare global variable with 'global' keyword - If you still have difficulty you can look at the python documentation at http://docs.python.org - - Q. I wish to pass value of variable/expression to xnu lldb macro that accepts only pointers. How can I achieve that? - A. Many lldb macros have syntax that accepts pointers (eg showtaskstacks etc). In order to have your expression be evaluated before passing to command use `back ticks`. - For example: - (lldb) showtaskstacks `(task_t)tasks.next` - This way the expressing withing ` ` is evaluated by lldb and the value is passed to the command. - Note that if your argument pointer is bad or the memory is corrupted lldb macros will fail with a long backtrace that may not make sense. gdb used to fail silently but lldb does not. - Please see Section F(i) for more information on reading backtraces. - - Q. I connected to a coredump file with lldb --core corefile and I got RuntimeError: Unable to find lldb thread for tid=XYZ. What should I do? - A. This is most likely the case that lldb ignored the operating system plugin in the dSYM and hence threads are not populated. Please put the line 'settings set target.load-script-from-symbol-file true' in your ~/.lldbinit file. If you do not have access you can alternatively do - bash# lldb - (lldb) settings set target.load-script-from-symbol-file true - (lldb) file --core corefile - - -ii. 
Formatted output printing - zen and peace for life ------------------------------------------------------- - - To avoid the horrors of printing a tabular data on console and then 2 weeks later again messing with it for a new field, it is recommended to follow these guidelines. - * any python string can be invoked to "".format() and hence makes it very easy to play with formats - * As a convention, I suggest that for printing pointer values in hex use "{0: <#020x}".format(some_int_value). This will print nice 0x prefixed strings with length padded to 20. - * If you need help with format options take a look at http://docs.python.org/library/string.html#format-string-syntax - * [ I'd first create a format string for data and then for the header just change the x's and d's to s and pass the header strings to format command. see GetTaskSummary()] - * If you need to print a string from a core.value object then use str() to get string representation of value. - - -iii. Coding conventions ------------------------ - It is very very HIGHLY RECOMMENDED to follow these guidelines for writing any python code. - * Python is very sensitive to tabs and spaces for alignement. So please make sure you INDENT YOUR CODE WITH SPACES at all times. - * The standard tab width is 4 spaces. Each increasing indent adds 4 spaces begining of the line. - * The format for documentation is - - """ A one line summary describing what this function / class does - Detailed explanation if necessary along with params and return values. - """ - * All Classes and functions should have a doc string describing what the function does - A consistent format is expected. For ex. - def SumOfNumbers(a, b, c, d): - """ Calculate sum of numbers. - params: - a - int, value to be added. can be 0 - b - int/float, value to be added. 
- returns: - int/float - Sum of two values - raises: - TypeError - If any type is not identified in the params - """ - * A Class or Function should always start with CAPITAL letter and be CamelCase. If a function is for internal use only than it starts with '_'. - * Function params should always be lower_case and be word separated with '_' - * A local variable inside a function should be lower_case and separated with '_' - * A variable for internal use in object should start with '_'. - * if a class variable is supposed to hold non native type of object, it is good idea to comment what type it holds - * A class function with name matching Get(.*?)Summary() is always supposed to return a string which can be printed on stdout or any file. - * Functions begining with "Get" (eg. GetVnodePath()) mean they return a value and will not print any output to stdout. - * Functions degining with "Show" (eg. ShowZTrace()) mean they will print data on screen and may not return any value. - -iv. Submitting changes in lldbmacros ------------------------------------- - To contribute new commands or fixes to existing one, it is recommended that you follow the procedure below. - * Save the changes requried for new command or fix into lldbmacros directory. - * Make sure that the coding conventions are strictly followed. - * Run syntax checker on each of the modified files. It will find basic formatting errors in the changed files for you. - * If you are adding new file then please update the Makefile and xnu.py imports to ensure they get compiled during kernel build. - * Do a clean build of kernel from xnu top level directory. - * Verify that your changes are present in the dSYM directory of new build. - * Re-run all your test and verification steps with the lldbmacros from the newly packaged dSYM/Contents/Resources/Python/lldbmacros. - -v. 
Common utility functions and paradigms ------------------------------------------ - Please search and look around the code for common util functions and paradigm - * Take a peek at utils.py for common utility like sizeof_fmt() to humanize size strings in KB, MB etc. The convention is to have functions that do self contained actions and does not require intricate knowledge of kernel structures in utils.py - * If you need to get pagesize of the traget system, do not hard code any value. kern.globals.page_size is your friend. Similarly use config['verbosity'] for finding about configs. - * If you are developing a command for structure that is different based on development/release kernels please use "hasattr()" functionality to conditionalize referencing #ifdef'ed fields in structure. See example in def GetTaskSummary(task) in process.py - -=============================================================== -F. Development and Debugging on lldb kernel debugging platform. -=============================================================== - -i. Reading a exception backtrace --------------------------------- - In case of an error the lldbmacros may print out an exception backtrace and halt immediately. The backtrace is very verbose and may be confusing. The important thing is to isolate possible causes of failure, and eventually filing a bug with kernel team. Following are some common ways where you may see an exception instead of your expected result. - * The lldbmacros cannot divine the type of memory by inspection. If a wrong pointer is passed from commandline then, the command code will try to read and show some results. It may still be junk or plain erronous. Please make sure your command arguments are correct. - For example: a common mistake is to pass task address to showactstack. In such a case lldb command may fail and show you a confusing backtrace. - * Kernel debugging is particularly tricky. Many parts of memory may not be readable. 
There could be failure in network, debugging protocol or just plain bad memory. In such a case please try to see if you can examine memory for the object you are trying to access. - * In case of memory corruption, the lldbmacros may have followed wrong pointer dereferencing. This might lead to failure and a exception to be thrown. - -ii. Loading custom or local lldbmacros and operating_system plugin ------------------------------------------------------------------- - The lldbmacros are packaged right into the dSYM for the kernel executable. This makes debugging very easy since they can get loaded automatically when symbols are loaded. - However, this setup makes it difficult for a lldbmacro developer to load custom/local macros. Following is the suggested solution for customizing your debugging setup: - * set up environment variable DEBUG_XNU_LLDBMACROS=1 on your shell. This will disable the automatic setup of lldbmacros and the operating_system.py from the symbols. - - bash$ export DEBUG_XNU_LLDBMACROS=1 - * start lldb from the shell - - bash$ lldb - * [optional] If you are making changes in the operating_system plugin then you need to set the plugin path for lldb to find your custom operating_system plugin file. - - (lldb)settings set target.process.python-os-plugin-path /path/to/xnu/tools/lldbmacros/core/operating_system.py - If you do not wish to change anything in operating_system plugin then just leave the setting empty. The symbol loading module will set one up for you. - * Load the xnu debug macros from your custom location. - - (lldb)command script import /path/to/xnu/tools/lldbmacros/xnu.py - -iii. Adding debug related 'printf's ------------------------------------ - The xnu debug framework provides a utility function (debuglog) in utils.py. Please use this for any of your debugging needs. It will not print any output unless the user turns on debug logging on the command. Please check the documentaiton of debuglog for usage and options. 
- - * To enable/disable logging - - (lldb) xnudebug debug - Enabled debug logging. - - - - diff --git a/tools/lldbmacros/README.md b/tools/lldbmacros/README.md new file mode 100644 index 000000000..ed75ee9c7 --- /dev/null +++ b/tools/lldbmacros/README.md @@ -0,0 +1,433 @@ +Table of Contents +================= + + A. How to use lldb for kernel debugging + B. Design of lldb kernel debugging platform. + C. Kernel debugging commands. + i. Using commands. + ii. Writing new commands. + D. Kernel type summaries. + i. Using summaries + ii. Writing new summary functions + E. FAQ and General Coding Guidelines + i. Frequently Asked Questions + ii. Formatted Output printing guidelines [MUST READ] + iii. Coding conventions. [MUST READ] + iv. Submitting changes in lldbmacros [MUST READ] + v. Common utility functions and paradigms + F. Development and Debugging on lldb kernel debugging platform. + i. Reading a exception backtrace + ii. Loading custom or local lldbmacros and operating_system plugin + iii. Adding debug related 'printf's + +A. How to use lldb for kernel debugging +======================================== + +lldb can be used for kernel debugging the same way as gdb. The simplest way is to start lldb with kernel symbol file. The lldb environment by default does not allow loading automatic python modules. Please add the following setting in + + File: ~/.lldbinit + settings set target.load-script-from-symbol-file true + +Now lldb will be ready to connect over kdp-remote '\' or 'gdb-remote \'. In case using a core file please do 'file --core /path/to/corefile' + +Following are detailed steps on how to debug a panic'ed / NMI'ed machine (For the curious souls). + +lldb debugging in detail:- + + * start lldb with the right symbols file. If you do not know the version apriori, then enable dsymForUUID to load symbols dynamically. 
+ bash$ dsymForUUID --enable + bash$ lldb /path/to/mach_kernel.symbols + Current executable set to '/Sources/Symbols/xnu/xnu-2253~2/mach_kernel' (x86_64). + (lldb) + + * connect to remote device or load a core file + #for kdp + (lldb) process connect --plugin kdp-remote udp://17.123.45.67:41139 + #for gdb (eg with astris) + (lldb) process connect --plugin gdb-remote gdb://17.123.45.67:8000 + #for loading a core file + (lldb) file --core /path/to/core/file /path/to/kernel_symbol_file + + * Once connected you can debug with basic lldb commands like print, bt, expr etc. The xnu debug macros will also be loaded automatically from the dSYM files. + In case if you are working with older kernel files you can load kernel specific commands by doing - + (lldb) command script import /path/to/xnu/tools/lldbmacros/xnu.py + (lldb) showbootargs + debug=0x14e ncpus=2 + + * You can do `kgmhelp` to get a list of commands available through xnu.py + +SPECIAL: The `xnu.py` script brings in kernel type summary functions. To enable these please do - + + (lldb) showlldbtypesummaries + +These could be very handy in printing important information from structures easily. +For ex. + + (lldb) print (thread_t)0x80d6a620 + (thread_t) $45 = 0x80d6a620 + thread thread_id processor pri io_policy state wait_queue wait_event wmesg thread_name + 0x80d6a620 0x317 0x902078c8 61 W 0x910cadd4 0x0 SystemSoundServer + + + +B. Design of lldb kernel debugging platform. +============================================= + +The lldb debugger provides python scripting bridge for customizing commands and summaries in lldb. Following is the stack of platforms and how commands and summaries interact with it. + + |------- xnu scripts ----------| + | |- lldb Command/Scripting-| | <-- provides scriptability for kernel data structures through summary/command invocation. + | | |--lldb core--| | | <-- interacts with remote kernel or corefile. 
+ | |-------------------------| | + |------------------------------| + +The xnu script in xnu/tools/lldbmacros provides the following: + + * Custom functions to do plumbing of lldb command invocation to python function call. (see doc strings for @lldb_command) + The command interface provides some common features (which can be invoked after passing '--' on cmd line) like - + + i. send the output of command to file on disk + ii. search for a string in the output and selectively print the line containing it. + iii. -v options to increase verbosity levels in commands. + For example: (lldb)showalltasks -- -s kernel_task --o /tmp/kernel_task.output -v + will show task summary output with lines matching string 'kernel_task' into a file /tmp/kernel_task.output and with a verbosity level of (default +1) + + * Customization for plugging in summary functions for lldb type summaries. (see doc strings for @lldb_summary) + It will automatically register given types with the functions within the kernel category. + + * Ability to register test cases for macros (see doc strings for @xnudebug_test). + +The file layout is like following + + xnu/ + |-tools/ + |-lldbmacros/ + |-core/ # Core logic about kernel, lldb value abstraction, configs etc. **DO NOT TOUCH THIS DIR** + |-plugins/ # Holds plugins for kernel commands. + |-xnu.py # xnu debug framework along with kgmhelp, xnudebug commands. + |-xnudefines.py + |-utils.py + |-process.py # files containing commands/summaries code for each subsystem + |-... + + +The lldbmacros directory has a Makefile that follows the build process for xnu. This packages lldbmacros scripts into the dSYM of each kernel build. This helps in rev-locking the lldb commands with changes in kernel sources. + + +C. Kernel debugging commands. +============================== +i. Using commands. +------------------ +Using xnu debug commands is very similar to kgmacros in gdb. You can use 'kgmhelp' to get a listing of available commands. 
+If you need detailed help for a command please type 'help <command name>' and the documentation for the command will be displayed. +For ex. + + (lldb) help pmap_walk + Perform a page-table walk in <pmap> for <virtual_address>. + You can pass -- -v for verbose output. To increase the verbosity add more -v args after the '--'. + Syntax: pmap_walk <pmap> <virtual_address> + +The basic format for every command provided under kgmhelp is as follows + + (lldb) command_name [cmd_args..] [-CMDOPTIONS] [-xnuoptions] + where: + command_name : name of command as registered using the @lldb_command decorator and described in 'kgmhelp' + cmd_args : shell like arguments that are passed as is to the registered python function. + If there is an error in these arguments then the implementor may display an appropriate error message. + xnuoptions : common options for stream based operations on the output of command_name. + Allowed options are + -h : show help string of a command + -s <regexp> : print only the lines matching <regexp> + -o <file> : direct the output of command to <file>. Will not display anything on terminal + -v : increase the verbosity of the command. Each '-v' encountered will increase verbosity by 1. + -p <plugin> : pass the output of command to <plugin> for processing and followup with command requests by it. + CMDOPTIONS : These are command level options (always a CAPITAL letter option) that are defined by the macro developer. Please do + help <cmdname> to know how each option operates on that particular command. For an example of how to use CMDOPTIONS, take a look at vm_object_walk_pages in memory.py + +ii. Writing new commands. +-------------------------- +The python modules are designed in such a way that the command from lldb invokes a python function with the arguments passed at lldb prompt. + +It is recommended that you do a decoupled development for command interface and core utility function so that any function/code can be called as a simple util function and get the same output. i.e.
+ + (lldb)showtask 0xabcdef000 is same as python >>> GetTaskSummary(0xabcdef000) or equivalent + +Following is a step by step guideline on how to add a new command ( e.g showtaskvme ). [extra tip: Always good idea to wrap your macro code within # Macro: , # EndMacro.] + + 1. register a command to a function. Use the lldb_command decorator to map a 'command_name' to a function. Optionally you can provide getopt compatible option string for customizing your command invocation. Note: Only CAPITAL letter options are allowed. lowercase options are reserved for the framework level features. + + 2. Immediately after the register define the function to handle the command invocation. The signature is always like Abc(cmd_args=None, cmd_options={}) + + 3. Add documentation for Abc(). This is very important for lldb to show help for each command. [ Follow the guidelines above with documentation ] + + 4. Use cmd_args array to get args passed on command. For example a command like `showtaskvme 0xabcdef00` will put have cmd_args=['0xabcdef00'] + - note that we use core.value class as an interface to underlying C structures. Refer [Section B] for more details. + - use kern.globals.\ & kern.GetValueFromAddress for building values from addresses. + - remember that the ideal type of object to be passed around is core.value + - Anything you 'print' will be relayed to lldb terminal output. + + 5. If the user has passed any custom options they would be in cmd_options dict. the format is `{'-':''}`. The \ will be '' (empty string) for non-option flags. + + 6. If your function finds issue with the passed argument then you can `raise ArgumentError('error_message')` to notify the user. The framework will automatically catch this and show appropriate help using the function doc string. + + Time for some code example? Try reading the code for function ShowTaskVmeHelper in memory.py. 
+ +SPECIAL Note: Very often you will find yourself making changes to a file for some command/summary and would like to test it out in lldb. + +To easily reload your changes in lldb please follow the below example. + + * you fire up lldb and start using zprint. And soon you need to add functionality to zprint. + + * you happily change a function code in memory.py file to zprint macro. + + * now to reload those particular changes without killing your debug session do + (lldb) xnudebug reload memory + memory is reloaded from ./memory.py + (lldb) + + It is very important that you do reload using xnudebug command as it does the plumbing of commands and types for your change in the module. Otherwise you could easily get confused + why your changes are not reflected in the command. + + +D. Kernel type summaries. +========================== +i. Using summaries +------------------ +The lldb debugger provides ways for user to customize how a particular type of object be described when printed. These are very useful in displaying complex and large structures +where only certain fields are important based on some flag or value in some field or variable. The way it works is every time lldb wants to print an object it checks +for registered summaries. We can define python functions and hook them up with lldb as callbacks for type summaries. For example. + + (lldb) print first_zone + (zone_t) $49 = 0xd007c000 + ZONE TOT_SZ ALLOC_ELTS FREE_ELTS FREE_SZ ELT_SZ ALLOC(ELTS PGS SLK) FLAGS NAME + 0x00000000d007c000 29808 182 25 3600 144 4096 28 1 64 X$ zones + (lldb) +Just printing the value of first_zone as (zone_t) 0xd007c000 wouldn't have been much help. But with the registered summary for zone_t we can see all the interesting info easily. + +You do not need to do anything special to use summaries. Once they are registered with lldb they show info automatically when printing objects.
However if you wish to +see all the registered type summaries run the command `type summary list -w kernel` on lldb prompt. +Also if you wish to quickly disable the summaries for a particular command use the `showraw` command. + +ii. Writing new summary functions +--------------------------------- +lldb provides a really flexible interface for building summaries for complex objects and data. If you find that a struct or list can be +diagnosed better if displayed differently, then feel free to add a type summary for that type. Following is an easy guide on how to do that. + + 1. Register a function as a callback for displaying information for a type. Use the `@lldb_type_summary()` decorator with an array of types you wish to register for callback + + 2. Provide a header for the summary using `@header()` decorator. This is a strong requirement for summaries. This gets displayed before the output + of `GetTypeSummary()` is displayed. [In case you do not wish to have header then still define it as "" (empty string) ] + + 3. Define the function with signature of `GetSomeTypeSummary(valobj)`. It is highly recommended that the naming be consistent with `Get.*?Summary(valobj)` + The valobj argument holds the core.value object for display. + + 4. Use the utility functions and memory read operations to pull out the required information. + [ use `kern.globals` & `kern.GetValueFromAddress` for building args to core functions. ] + [ remember that the ideal type of object to be passed around is core.value ] + + 5. return a string that would be printed by the caller. When lldb makes a call back it expects a str to be returned. So do not print + directly out to console. [ debug info or logs output is okay to be printed anywhere :) ] + +Time for some code example? Try reading the code for GetTaskSummary() in process.py. + + + +E. FAQs and General Coding Guidelines +====================================== + +i. Frequently Asked Questions +----------------------------- + + Q.
How do I avoid printing the summary and see the actual data in a structure? + + A. There is a command called `showraw`. This will disable all kernel specific type summaries and execute any command you provide. For ex. + + (lldb) print (thread_t) 0x80d6a620 + (thread_t) $45 = 0x80d6a620 + thread thread_id processor pri io_policy state wait_queue wait_event wmesg thread_name + 0x80d6a620 0x317 0x902078c8 61 W 0x910cadd4 0x0 SystemSoundServer + (lldb) showraw print (thread_t) 0x80d6a620 + (thread_t) $48 = 0x80d6a620 + + Q. I typed `showallvnodes` and nothing happens for a long time? OR How do I get output of long running command instantly on the terminal? + + A. The lldb command interface tries to build result object from output of a python function. So in case of functions with very long output or runtime it may + seem that the lldb process is hung. But it is not. You can use "-i" option to get immediate output on terminal. + + ex. (lldb) showallvnodes -- -i + Immediate Output + .... + + Q. I made a change in a python file for a command or summary, but the output is not reflected in the lldb command? + + A. The python framework does not allow for removing a loaded module and then reloading it. So sometimes if a command has a cached value from + old code that it will still call the old function and hence will not display new changes in file on disk. If you find yourself in such a situation + please see [Section C. -> SPECIAL Note]. If the change is to basic class or caching mechanism than it is advised to quit lldb and re-load all modules again. + + Q. I am new to python. I get an error message that I do not understand. what should I do? + + A. The syntax for python is different from conventional programming languages. If you get any message with SyntaxError or TypeError or ValueError then please review your code and look for common errors like + + - wrong level of indentation? + - missed a ':' at the end of an if, elif, for, while statement? 
+ - referencing a key in dictionary that doesn't exist? You might see KeyError in such cases. + - mistakenly used python reserved keyword as variable? (check http://docs.python.org/release/3.0.1/reference/lexical_analysis.html#id8) + - Trying to modify a string value? You can only create new strings but never modify existing ones. + - Trying to add a non string value to a string? This typically happens in print "time is " + gettime(). here gettime() returns int and not str. + - using a local variable with same name as global variable? + - assigning a value to global variable without declaring first? It's highly recommended to always declare global variable with 'global' keyword + If you still have difficulty you can look at the python documentation at http://docs.python.org + + + Q. I wish to pass value of variable/expression to xnu lldb macro that accepts only pointers. How can I achieve that? + + A. Many lldb macros have syntax that accepts pointers (eg showtaskstacks etc). In order to have your expression be evaluated before passing to command use `back ticks`. For example: + + (lldb) showtaskstacks `(task_t)tasks.next` + This way the expression within ` ` is evaluated by lldb and the value is passed to the command. + Note that if your argument pointer is bad or the memory is corrupted lldb macros will fail with a long backtrace that may not make sense. gdb used to fail silently but lldb does not. + Please see Section F(i) for more information on reading backtraces. + + Q. I connected to a coredump file with lldb --core corefile and I got RuntimeError: Unable to find lldb thread for tid=XYZ. What should I do? + + A. This is most likely the case that lldb ignored the operating system plugin in the dSYM and hence threads are not populated. Please put the line 'settings set target.load-script-from-symbol-file true' in your ~/.lldbinit file.
If you do not have access you can alternatively do + + bash# lldb + (lldb) settings set target.load-script-from-symbol-file true + (lldb) file --core corefile + + +ii. Formatted output printing - zen and peace for life +------------------------------------------------------ + +To avoid the horrors of printing a tabular data on console and then 2 weeks later again messing with it for a new field, it is recommended to follow these guidelines. + + * any python string can be invoked to "".format() and hence makes it very easy to play with formats + + * As a convention, I suggest that for printing pointer values in hex use "{0: <#020x}".format(some_int_value). This will print nice 0x prefixed strings with length padded to 20. + + * If you need help with format options take a look at http://docs.python.org/library/string.html#format-string-syntax + + * [ I'd first create a format string for data and then for the header just change the x's and d's to s and pass the header strings to format command. see GetTaskSummary()] + + * If you need to print a string from a core.value object then use str() to get string representation of value. + + +iii. Coding conventions +----------------------- +It is very very HIGHLY RECOMMENDED to follow these guidelines for writing any python code. + + * Python is very sensitive to tabs and spaces for alignment. So please make sure you **INDENT YOUR CODE WITH SPACES** at all times. + + * The standard tab width is 4 spaces. Each increasing indent adds 4 spaces beginning of the line. + + * The format for documentation is - + """ A one line summary describing what this function / class does + Detailed explanation if necessary along with params and return values. + """ + + * All Classes and functions should have a doc string describing what the function does + A consistent format is expected. For ex. + def SumOfNumbers(a, b, c, d): + """ Calculate sum of numbers. + params: + a - int, value to be added. can be 0 + b - int/float, value to be added. 
+ returns: + int/float - Sum of two values + raises: + TypeError - If any type is not identified in the params + """ + + * A Class or Function should always start with CAPITAL letter and be CamelCase. If a function is for internal use only then it starts with '_'. + + * Function params should always be lower_case and be word separated with '_' + + * A local variable inside a function should be lower_case and separated with '_' + + * A variable for internal use in object should start with '_'. + + * if a class variable is supposed to hold non native type of object, it is a good idea to comment what type it holds + + * A class function with name matching `Get(.*?)Summary()` is always supposed to return a string which can be printed on stdout or any file. + + * Functions beginning with "Get" (eg. GetVnodePath()) mean they return a value and will not print any output to stdout. + + * Functions beginning with "Show" (eg. ShowZTrace()) mean they will print data on screen and may not return any value. + + +iv. Submitting changes in lldbmacros +------------------------------------ + +To contribute new commands or fixes to existing ones, it is recommended that you follow the procedure below. + + * Save the changes required for new command or fix into lldbmacros directory. + + * Make sure that the coding conventions are strictly followed. + + * Run syntax checker on each of the modified files. It will find basic formatting errors in the changed files for you. + + * If you are adding new file then please update the Makefile and xnu.py imports to ensure they get compiled during kernel build. + + * Do a clean build of kernel from xnu top level directory. + + * Verify that your changes are present in the dSYM directory of new build. + + * Re-run all your tests and verification steps with the lldbmacros from the newly packaged dSYM/Contents/Resources/Python/lldbmacros. + +v.
Common utility functions and paradigms +----------------------------------------- +Please search and look around the code for common util functions and paradigms + + * Take a peek at utils.py for common utility like sizeof_fmt() to humanize size strings in KB, MB etc. The convention is to have functions that do self contained actions and do not require intricate knowledge of kernel structures in utils.py + + * If you need to get pagesize of the target system, do not hard code any value. kern.globals.page_size is your friend. Similarly use config['verbosity'] for finding about configs. + + * If you are developing a command for structure that is different based on development/release kernels please use "hasattr()" functionality to conditionalize referencing #ifdef'ed fields in structure. See example in def GetTaskSummary(task) in process.py + + +F. Development and Debugging on lldb kernel debugging platform. +=============================================================== + +i. Reading a exception backtrace +-------------------------------- +In case of an error the lldbmacros may print out an exception backtrace and halt immediately. The backtrace is very verbose and may be confusing. The important thing is to isolate possible causes of failure, and eventually file a bug with kernel team. Following are some common ways where you may see an exception instead of your expected result. + + * The lldbmacros cannot divine the type of memory by inspection. If a wrong pointer is passed from commandline then, the command code will try to read and show some results. It may still be junk or plain erroneous. Please make sure your command arguments are correct. + For example: a common mistake is to pass task address to showactstack. In such a case lldb command may fail and show you a confusing backtrace. + + * Kernel debugging is particularly tricky. Many parts of memory may not be readable. There could be failure in network, debugging protocol or just plain bad memory.
In such a case please try to see if you can examine memory for the object you are trying to access. + + * In case of memory corruption, the lldbmacros may have followed wrong pointer dereferencing. This might lead to failure and an exception to be thrown. + + +ii. Loading custom or local lldbmacros and operating_system plugin +------------------------------------------------------------------ + +The lldbmacros are packaged right into the dSYM for the kernel executable. This makes debugging very easy since they can get loaded automatically when symbols are loaded. +However, this setup makes it difficult for a lldbmacro developer to load custom/local macros. Following is the suggested solution for customizing your debugging setup: + + * set up environment variable DEBUG_XNU_LLDBMACROS=1 on your shell. This will disable the automatic setup of lldbmacros and the operating_system.py from the symbols. + - bash$ export DEBUG_XNU_LLDBMACROS=1 + + * start lldb from the shell + - bash$ lldb + + * [optional] If you are making changes in the operating_system plugin then you need to set the plugin path for lldb to find your custom operating_system plugin file. + - (lldb)settings set target.process.python-os-plugin-path /path/to/xnu/tools/lldbmacros/core/operating_system.py + If you do not wish to change anything in operating_system plugin then just leave the setting empty. The symbol loading module will set one up for you. + + * Load the xnu debug macros from your custom location. + - (lldb)command script import /path/to/xnu/tools/lldbmacros/xnu.py + + +iii. Adding debug related 'printf's +----------------------------------- + +The xnu debug framework provides a utility function (debuglog) in utils.py. Please use this for any of your debugging needs. It will not print any output unless the user turns on debug logging on the command. Please check the documentation of debuglog for usage and options. + + * To enable/disable logging + - (lldb) xnudebug debug + Enabled debug logging.
+ + diff --git a/tools/lldbmacros/core/caching.py b/tools/lldbmacros/core/caching.py index 4a0b2bd6d..449dc7cc4 100644 --- a/tools/lldbmacros/core/caching.py +++ b/tools/lldbmacros/core/caching.py @@ -1,15 +1,15 @@ -""" +""" A basic caching module for xnu debug macros to use. -It is recommended to use [Get|Save][Static|Dynamic]CacheData() apis for -your caching needs. These APIs will handle the case of clearing caches when -a debugger continues and stops or hit a breakpoint. +It is recommended to use [Get|Save][Static|Dynamic]CacheData() apis for +your caching needs. These APIs will handle the case of clearing caches when +a debugger continues and stops or hit a breakpoint. Use Static caches for data that will not change if the program is run and stopped again. e.g. typedata, version numbers etc. An example invocation could be like def getDSYMPathForUUID(uuid): # Get the data from cache cached_data = caching.GetStaticCacheData('dsym.for.uuid', {}) - + if uuid in cached_data: return cached_data[uuid] else: @@ -18,17 +18,17 @@ def getDSYMPathForUUID(uuid): # save the cached_data object to cache. caching.SaveStaticCacheData('dsym.for.uuid', cached_data) - + return cached_data[uuid] -And use Dynamic caches for things like thread data, zones information etc. -These will automatically be dropped when debugger continues the target +And use Dynamic caches for things like thread data, zones information etc. +These will automatically be dropped when debugger continues the target An example use of Dynamic cache could be as follows def GetExecutablePathForPid(pid): # Get the data from cache cached_data = caching.GetDynamicCacheData('exec_for_path', {}) - + if pid in cached_data: return cached_data[pid] else: @@ -37,7 +37,7 @@ def GetExecutablePathForPid(pid): # save the cached_data object to cache. 
caching.SaveDynamicCacheData('exec_for_path', cached_data) - + return cached_data[pid] """ @@ -49,7 +49,7 @@ def GetExecutablePathForPid(pid): import sys """ -The format for the saved data dictionaries is +The format for the saved data dictionaries is { 'key' : (valueobj, versno), ... @@ -64,7 +64,7 @@ def GetExecutablePathForPid(pid): def _GetDebuggerSessionID(): - """ A default callable function that _GetCurrentSessionID uses to + """ A default callable function that _GetCurrentSessionID uses to identify a stopped session. """ return 0 @@ -80,7 +80,14 @@ def _GetCurrentSessionID(): return session_id; -#Public APIs +#Public APIs + +def ClearAllCache(): + """ remove all cached data. + """ + global _static_data, _dynamic_data + _static_data = {} + _dynamic_data = {} def GetSizeOfCache(): """ Returns number of bytes held in cache. @@ -92,7 +99,7 @@ def GetSizeOfCache(): def GetStaticCacheData(key, default_value = None): - """ Get cached object based on key from the cache of static information. + """ Get cached object based on key from the cache of static information. params: key: str - a unique string identifying your data. default_value : obj - an object that should be returned if key is not found. 
@@ -119,7 +126,7 @@ def SaveStaticCacheData(key, value): if not config['CacheStaticData']: return - + key = str(key) _static_data[key] = (value, _GetCurrentSessionID()) return diff --git a/tools/lldbmacros/core/cvalue.py b/tools/lldbmacros/core/cvalue.py index f3d9eb5cd..b751795ae 100644 --- a/tools/lldbmacros/core/cvalue.py +++ b/tools/lldbmacros/core/cvalue.py @@ -459,6 +459,12 @@ def getfieldoffset(struct_type, field_name): for field in struct_type.get_fields_array(): if str(field.GetName()) == field_name: return field.GetOffsetInBytes() + + # Hack for anonymous unions - the compiler does this, so cvalue should too + if field.GetName() is None and field.GetType().GetTypeClass() == lldb.eTypeClassUnion : + for union_field in field.GetType().get_fields_array(): + if str(union_field.GetName()) == field_name: + return union_field.GetOffsetInBytes() + field.GetOffsetInBytes() raise TypeError('Field name "%s" not found in type "%s"' % (field_name, str(struct_type))) def islong(x): diff --git a/tools/lldbmacros/core/kernelcore.py b/tools/lldbmacros/core/kernelcore.py index ec6295dff..3c6e5802b 100644 --- a/tools/lldbmacros/core/kernelcore.py +++ b/tools/lldbmacros/core/kernelcore.py @@ -109,7 +109,7 @@ def IterateLinkageChain(queue_head, element_type, field_name, field_ofst=0): link = link.next -def IterateQueue(queue_head, element_ptr_type, element_field_name, backwards=False): +def IterateQueue(queue_head, element_ptr_type, element_field_name, backwards=False, unpack_ptr_fn=None): """ Iterate over an Element Chain queue in kernel of type queue_head_t. (osfmk/kern/queue.h method 2) params: queue_head - value : Value object for queue_head. @@ -117,6 +117,7 @@ def IterateQueue(queue_head, element_ptr_type, element_field_name, backwards=Fal - str : OR a string describing the type. ex. 'task *' element_field_name - str : name of the field in target struct. 
backwards - backwards : traverse the queue backwards + unpack_ptr_fn - function : a function ptr of signature def unpack_ptr(long v) which returns long. returns: A generator does not return. It is used for iterating. value : an object thats of type (element_type) queue_head->next. Always a pointer object @@ -133,10 +134,19 @@ def IterateQueue(queue_head, element_ptr_type, element_field_name, backwards=Fal queue_head_addr = queue_head.GetValueAsUnsigned() else: queue_head_addr = queue_head.GetAddress().GetLoadAddress(LazyTarget.GetTarget()) + + def unpack_ptr_and_recast(v): + if unpack_ptr_fn is None: + return v + v_unpacked = unpack_ptr_fn(v.GetValueAsUnsigned()) + obj = v.CreateValueFromExpression(None,'(void *)'+str(v_unpacked)) + obj.Cast(element_ptr_type) + return obj + if backwards: - cur_elt = queue_head.GetChildMemberWithName('prev') + cur_elt = unpack_ptr_and_recast(queue_head.GetChildMemberWithName('prev')) else: - cur_elt = queue_head.GetChildMemberWithName('next') + cur_elt = unpack_ptr_and_recast(queue_head.GetChildMemberWithName('next')) while True: @@ -145,9 +155,10 @@ def IterateQueue(queue_head, element_ptr_type, element_field_name, backwards=Fal elt = cur_elt.Cast(element_ptr_type) yield value(elt) if backwards: - cur_elt = elt.GetChildMemberWithName(element_field_name).GetChildMemberWithName('prev') + cur_elt = unpack_ptr_and_recast(elt.GetChildMemberWithName(element_field_name).GetChildMemberWithName('prev')) else: - cur_elt = elt.GetChildMemberWithName(element_field_name).GetChildMemberWithName('next') + cur_elt = unpack_ptr_and_recast(elt.GetChildMemberWithName(element_field_name).GetChildMemberWithName('next')) + class KernelTarget(object): """ A common kernel object that provides access to kernel objects and information. 
@@ -358,9 +369,9 @@ def __getattribute__(self, name): if name == 'zones' : self._zones_list = caching.GetDynamicCacheData("kern._zones_list", []) if len(self._zones_list) > 0: return self._zones_list - first_zone = self.GetGlobalVariable('first_zone') - for z in IterateLinkedList(first_zone, 'next_zone'): - self._zones_list.append(z) + zone_array = self.GetGlobalVariable('zone_array') + for i in range(0, self.GetGlobalVariable('num_zones')): + self._zones_list.append(addressof(zone_array[i])) caching.SaveDynamicCacheData("kern._zones_list", self._zones_list) return self._zones_list @@ -471,4 +482,3 @@ def __getattribute__(self, name): return self._ptrsize return object.__getattribute__(self, name) - diff --git a/tools/lldbmacros/core/operating_system.py b/tools/lldbmacros/core/operating_system.py index 38ee0b4fc..fe6d71af2 100644 --- a/tools/lldbmacros/core/operating_system.py +++ b/tools/lldbmacros/core/operating_system.py @@ -896,4 +896,3 @@ def get_register_data(self, tid): print "FATAL ERROR: Failed to get register state for thread id 0x%x " % tid print thobj return regs.GetPackedRegisterState() - diff --git a/tools/lldbmacros/core/syntax_checker.py b/tools/lldbmacros/core/syntax_checker.py index f9a7142b5..02ec68eb5 100755 --- a/tools/lldbmacros/core/syntax_checker.py +++ b/tools/lldbmacros/core/syntax_checker.py @@ -43,7 +43,7 @@ try: compile_result = py_compile.compile(fname, cfile="/dev/null", doraise=True) except py_compile.PyCompileError as exc: - print str(exc) + print >>sys.stderr, str(exc) print >>sys.stderr, "Error: Compilation failed. Please fix the errors and try again." sys.exit(1) print "Success: Checked %s. No syntax errors found." 
% fname diff --git a/tools/lldbmacros/core/xnu_lldb_init.py b/tools/lldbmacros/core/xnu_lldb_init.py index b51824644..41dd202b9 100644 --- a/tools/lldbmacros/core/xnu_lldb_init.py +++ b/tools/lldbmacros/core/xnu_lldb_init.py @@ -1,4 +1,5 @@ import os +import re def GetSettingsValues(debugger, setting_variable_name): """ Queries the lldb internal settings @@ -14,6 +15,52 @@ def GetSettingsValues(debugger, setting_variable_name): retval.append(str(s)) return retval +def GetSymbolsFilePathFromModule(m): + """ Get a file path from a module. + params: m - lldb.target.module + returns: + str : path to first file based symbol. Note this might be dir path inside sources. + """ + for s in m.symbols: + if s.type == 8: + return os.path.dirname(str(s.name)) + return "" + +def GetSourcePathSettings(binary_path, symbols_path): + """ Parse the binary path and symbols_path to find if source-map setting is applicable + params: + binary_path: str path of the kernel module + symbols_path: str path of the symbols stored in binary. Use GetSymbolsFilePathFromModule() to obtain it. + returns: + str : string command to set the source-map setting. 
+ """ + retval = "" + train_re = re.compile(r"dsyms/([a-zA-Z]+)/") + _t_arr = train_re.findall(binary_path) + train = '' + if _t_arr: + train = _t_arr[0] + if not train: + return retval + new_path = "~rc/Software/{}/Projects/".format(train) + new_path = os.path.expanduser(new_path) + new_path = os.path.normpath(new_path) + common_path_re = re.compile("(^.*?Sources/)(xnu.*?)/.*$") + _t_arr = common_path_re.findall(symbols_path) + srcpath = "" + projpath = "xnu" + if _t_arr: + srcpath = "".join(_t_arr[0]) + projpath = _t_arr[0][-1] + else: + return retval + + new_path = new_path + os.path.sep + projpath + cmd = "settings append target.source-map {} {}" + retval = cmd.format(srcpath, new_path) + return retval + + def __lldb_init_module(debugger, internal_dict): debug_session_enabled = False if "DEBUG_XNU_LLDBMACROS" in os.environ and len(os.environ['DEBUG_XNU_LLDBMACROS']) > 0: @@ -30,6 +77,12 @@ def __lldb_init_module(debugger, internal_dict): whitelist_trap_cmd = "settings set target.trap-handler-names %s %s" % (' '.join(intel_whitelist), ' '.join(arm_whitelist)) xnu_debug_path = base_dir_name + "/lldbmacros/xnu.py" xnu_load_cmd = "command script import \"%s\"" % xnu_debug_path + + source_map_cmd = "" + try: + source_map_cmd = GetSourcePathSettings(base_dir_name, GetSymbolsFilePathFromModule(debugger.GetTargetAtIndex(0).modules[0]) ) + except Exception as e: + pass if debug_session_enabled : if len(prev_os_plugin) > 0: print "\nDEBUG_XNU_LLDBMACROS is set. 
Skipping the setting of OS plugin from dSYM.\nYou can manually set the OS plugin by running\n" + osplugin_cmd @@ -44,5 +97,8 @@ def __lldb_init_module(debugger, internal_dict): debugger.HandleCommand(whitelist_trap_cmd) print xnu_load_cmd debugger.HandleCommand(xnu_load_cmd) + if source_map_cmd: + print source_map_cmd + debugger.HandleCommand(source_map_cmd) print "\n" diff --git a/tools/lldbmacros/ioreg.py b/tools/lldbmacros/ioreg.py index 26cb4b64a..4a771880d 100644 --- a/tools/lldbmacros/ioreg.py +++ b/tools/lldbmacros/ioreg.py @@ -34,16 +34,17 @@ def GetObjectSummary(obj): vt = dereference(Cast(obj, 'uintptr_t *')) - 2 * sizeof('uintptr_t') vtype = kern.SymbolicateFromAddress(vt) + if len(vtype): + vtype_str = " <" + vtype[0].GetName() + ">" + else: + vtype_str = "" if hasattr(obj, 'retainCount'): retCount = (obj.retainCount & 0xffff) cntnrRetCount = (retCount >> 16) - out_string = "`object 0x{0: <16x}, vt 0x{1: <16x} <{2:s}>, retain count {3:d}, container retain {4:d}` ".format(obj, vt, vtype[0].GetName(), retCount, cntnrRetCount) + out_string = "`object 0x{0: <16x}, vt 0x{1: <16x}{2:s}, retain count {3:d}, container retain {4:d}` ".format(obj, vt, vtype_str, retCount, cntnrRetCount) else: - if len(vtype): - out_string = "`object 0x{0: <16x}, vt 0x{1: <16x} <{2:s}>` ".format(obj, vt, vtype[0].GetName()) - else: - out_string = "`object 0x{0: <16x}, vt 0x{1: <16x}` ".format(obj, vt) - + out_string = "`object 0x{0: <16x}, vt 0x{1: <16x}{2:s}` ".format(obj, vt, vtype_str) + ztvAddr = kern.GetLoadAddressForSymbol('_ZTV8OSString') if vt == ztvAddr: out_string += GetString(obj) @@ -81,6 +82,25 @@ def GetObjectSummary(obj): return out_string + +def GetObjectTypeStr(obj): + """ Return the type of an OSObject's container class + """ + if obj is None: + return None + + vt = dereference(Cast(obj, 'uintptr_t *')) - 2 * sizeof('uintptr_t') + vtype = kern.SymbolicateFromAddress(vt) + if len(vtype): + return vtype[0].GetName() + + # See if the value is in a kext with no 
symbols + for kval in IterateLinkedList(kern.globals.kmod, 'next'): + if vt >= unsigned(kval.address) and vt <= (unsigned(kval.address) + unsigned(kval.size)): + return "kmod:{:s}+{:#0x}".format(kval.name, vt - unsigned(kval.address)) + return None + + @lldb_type_summary(['IORegistryEntry *']) @header("") def GetRegistryEntrySummary(entry): @@ -161,6 +181,45 @@ def ShowObject(cmd_args=None): obj = kern.GetValueFromAddress(cmd_args[0], 'OSObject *') print GetObjectSummary(obj) +#Macro: dumpobject +@lldb_command('dumpobject') +def DumpObject(cmd_args=None): + """ Dumps object information if it is a valid object confirmed by showobject + Usage: dumpobject <address of object>
[class/struct type of object] + """ + if not cmd_args: + print "No arguments passed" + print DumpObject.__doc__ + return False + + if len(cmd_args) == 1: + try: + object_info = lldb_run_command("showobject {:s}".format(cmd_args[0])) + except: + print "Error!! showobject failed due to invalid value" + print DumpObject.__doc__ + return False + + srch = re.search(r'<vtable for ([A-Za-z].*)>', object_info) + if not srch: + print "Error!! Couldn't find object in registry, input type manually as 2nd argument" + print DumpObject.__doc__ + return False + + object_type = srch.group(1) + else: + type_lookup = lldb_run_command("image lookup -t {:s}".format(cmd_args[1])) + if type_lookup.find(cmd_args[1])!= -1: + object_type = cmd_args[1] + else: + print "Error!! Input type {:s} isn't available in image lookup".format(cmd_args[1]) + return False + + print "******** Object Dump for value \'{:s}\' with type \"{:s}\" ********".format(cmd_args[0], object_type) + print lldb_run_command("p/x *({:s}*){:s}".format(object_type, cmd_args[0])) + +#EndMacro: dumpobject + @lldb_command('setregistryplane') def SetRegistryPlane(cmd_args=None): """ Set the plane to be used for the IOKit registry macros diff --git a/tools/lldbmacros/ipc.py b/tools/lldbmacros/ipc.py index f06087393..6a71e6e3a 100644 --- a/tools/lldbmacros/ipc.py +++ b/tools/lldbmacros/ipc.py @@ -8,6 +8,7 @@ from atm import * from bank import * from waitq import * +from ioreg import * import xnudefines @header("{0: <20s} {1: <6s} {2: <6s} {3: <10s} {4: <15s}".format("task", "pid", '#acts', "tablesize", "command")) @@ -365,9 +366,18 @@ def GetKObjectFromPort(portval): io_bits = unsigned(portval.ip_object.io_bits) objtype_index = io_bits & 0xfff if objtype_index < len(xnudefines.kobject_types) : - desc_str = "kobject({0:s})".format(xnudefines.kobject_types[objtype_index]) - if xnudefines.kobject_types[objtype_index] in ('TASK_RESUME', 'TASK'): - desc_str += " " + GetProcNameForTask(Cast(portval.kdata.kobject, 'task *')) + objtype_str = 
xnudefines.kobject_types[objtype_index] + if objtype_str == 'IOKIT_OBJ': + iokit_classnm = GetObjectTypeStr(portval.kdata.kobject) + if not iokit_classnm: + iokit_classnm = "" + else: + iokit_classnm = re.sub(r'vtable for ', r'', iokit_classnm) + desc_str = "kobject({:s}:{:s})".format(objtype_str, iokit_classnm) + else: + desc_str = "kobject({0:s})".format(objtype_str) + if xnudefines.kobject_types[objtype_index] in ('TASK_RESUME', 'TASK'): + desc_str += " " + GetProcNameForTask(Cast(portval.kdata.kobject, 'task *')) else: desc_str = "kobject(UNKNOWN) {:d}".format(objtype_index) return kobject_str + " " + desc_str @@ -905,13 +915,17 @@ def GetIPCImportanceElemSummary(iie): out_str = '' fmt = "{: <#018x} {: <4s} {: <8d} {: <8d} {: <#018x} {: <#018x}" - type_str = 'TASK' if unsigned(iie.iie_bits) & 0x80000000: type_str = "INH" + inherit_count = 0 + else: + type_str = 'TASK' + iit = Cast(iie, 'struct ipc_importance_task *') + inherit_count = sum(1 for i in IterateQueue(iit.iit_inherits, 'struct ipc_importance_inherit *', 'iii_inheritance')) + refs = unsigned(iie.iie_bits) & 0x7fffffff made_refs = unsigned(iie.iie_made) kmsg_count = sum(1 for i in IterateQueue(iie.iie_kmsgs, 'struct ipc_kmsg *', 'ikm_inheritance')) - inherit_count = sum(1 for i in IterateQueue(iie.iie_inherits, 'struct ipc_importance_inherit *', 'iii_inheritance')) out_str += fmt.format(iie, type_str, refs, made_refs, kmsg_count, inherit_count) if config['verbosity'] > vHUMAN: if kmsg_count > 0: @@ -921,7 +935,7 @@ def GetIPCImportanceElemSummary(iie): out_str += "\n" if inherit_count > 0: out_str += "\n\t" + GetIPCImportanceInheritSummary.header + "\n" - for i in IterateQueue(iie.iie_inherits, 'struct ipc_importance_inherit *', 'iii_inheritance'): + for i in IterateQueue(iit.iit_inherits, 'struct ipc_importance_inherit *', 'iii_inheritance'): out_str += "\t" + GetIPCImportanceInheritSummary(i) + "\n" out_str += "\n" if type_str == "INH": @@ -1302,5 +1316,76 @@ def ShowVoucher(cmd_args=[], 
cmd_options={}): voucher = kern.GetValueFromAddress(cmd_args[0], 'ipc_voucher_t') print GetIPCVoucherSummary.header print GetIPCVoucherSummary(voucher, show_entries=True) - +def GetSpaceSendRightEntries(space, port): + """ Get the IPC entries for all send rights to a port in an IPC space. + params: + space - the IPC space to search for send rights + port - the port to match, or 0 to get all send rights + returns: an array of IPC entries + """ + entry_table = space.is_table + ports = int(space.is_table_size) + i = 0 + entries = [] + + while i < ports: + entry = GetObjectAtIndexFromArray(entry_table, i) + + entry_ie_bits = unsigned(entry.ie_bits) + if (entry_ie_bits & 0x00010000) != 0 and (not port or entry.ie_object == port): + entries.append(entry) + i += 1 + + return entries + +@lldb_command('showportsendrights') +def ShowPortSendRights(cmd_args=[], cmd_options={}): + """ Display a list of send rights across all tasks for a given port. + Usage: (lldb) showportsendrights <port address> + """ + if not cmd_args: + raise ArgumentError("no port address provided") + port = kern.GetValueFromAddress(cmd_args[0], 'struct ipc_port *') + i = 1 + + for t in kern.tasks: + # Write a progress line. Using stderr avoids automatic newline when + # writing to stdout from lldb. Blank spaces at the end clear out long + # lines. + sys.stderr.write("checking {:s} ({}/{})...{:30s}\r".format(Cast(t.bsd_info, 'proc_t').p_name, i, len(kern.tasks), '')) + i += 1 + entries = GetSpaceSendRightEntries(t.itk_space, port) + + if entries: + print GetTaskIPCSummary.header + print GetTaskIPCSummary(t) + print '\t' + GetIPCEntrySummary.header + + for entry in entries: + print "\t" + GetIPCEntrySummary(entry) + +@lldb_command('showtasksuspenders') +def ShowTaskSuspenders(cmd_args=[], cmd_options={}): + """ Display the tasks and send rights that are holding a target task suspended. 
+ Usage: (lldb) showtasksuspenders + """ + if not cmd_args: + raise ArgumentError("no task address provided") + task = kern.GetValueFromAddress(cmd_args[0], 'task_t') + + if task.suspend_count == 0: + print "task {:#x} ({:s}) is not suspended".format(unsigned(task), Cast(task.bsd_info, 'proc_t').p_name) + return + + # If the task has been suspended by the kernel (potentially by + # kperf, using task_suspend_internal) or a client of task_suspend2 + # that does not convert its task suspension token to a port using + # convert_task_suspension_token_to_port, then it's impossible to determine + # which task did the suspension. + port = task.itk_resume + if not port: + print "task {:#x} ({:s}) is suspended but no resume port exists".format(unsigned(task), Cast(task.bsd_info, 'proc_t').p_name) + return + + return ShowPortSendRights(cmd_args=[unsigned(port)], cmd_options=cmd_options) diff --git a/tools/lldbmacros/kcdata.py b/tools/lldbmacros/kcdata.py old mode 100644 new mode 100755 index b5216e3e6..05ac5d521 --- a/tools/lldbmacros/kcdata.py +++ b/tools/lldbmacros/kcdata.py @@ -11,8 +11,9 @@ import os import shlex import subprocess - -cgitb.enable(format='text') +import logging +import contextlib +import base64 kcdata_type_def = { 'KCDATA_TYPE_INVALID': 0x0, @@ -26,14 +27,38 @@ 'KCDATA_TYPE_TYPEDEFINTION': 0x12, 'KCDATA_TYPE_CONTAINER_BEGIN': 0x13, 'KCDATA_TYPE_CONTIANER_END': 0x14, + + 'KCDATA_TYPE_ARRAY_PAD0': 0x20, + 'KCDATA_TYPE_ARRAY_PAD1': 0x21, + 'KCDATA_TYPE_ARRAY_PAD2': 0x22, + 'KCDATA_TYPE_ARRAY_PAD3': 0x23, + 'KCDATA_TYPE_ARRAY_PAD4': 0x24, + 'KCDATA_TYPE_ARRAY_PAD5': 0x25, + 'KCDATA_TYPE_ARRAY_PAD6': 0x26, + 'KCDATA_TYPE_ARRAY_PAD7': 0x27, + 'KCDATA_TYPE_ARRAY_PAD8': 0x28, + 'KCDATA_TYPE_ARRAY_PAD9': 0x29, + 'KCDATA_TYPE_ARRAY_PADa': 0x2a, + 'KCDATA_TYPE_ARRAY_PADb': 0x2b, + 'KCDATA_TYPE_ARRAY_PADc': 0x2c, + 'KCDATA_TYPE_ARRAY_PADd': 0x2d, + 'KCDATA_TYPE_ARRAY_PADe': 0x2e, + 'KCDATA_TYPE_ARRAY_PADf': 0x2f, + 'KCDATA_TYPE_LIBRARY_LOADINFO': 0x30, 
'KCDATA_TYPE_LIBRARY_LOADINFO64': 0x31, 'KCDATA_TYPE_TIMEBASE': 0x32, - #'KCDATA_TYPE_MACH_ABSOLUTE_TIME': 0x33, + 'KCDATA_TYPE_MACH_ABSOLUTE_TIME': 0x33, 'KCDATA_TYPE_TIMEVAL': 0x34, 'KCDATA_TYPE_USECS_SINCE_EPOCH': 0x35, + 'KCDATA_TYPE_PID': 0x36, + 'KCDATA_TYPE_PROCNAME': 0x37, + 'KCDATA_TYPE_NESTED_KCDATA': 0x38, + 'STACKSHOT_KCCONTAINER_TASK': 0x903, 'STACKSHOT_KCCONTAINER_THREAD': 0x904, + 'STACKSHOT_KCTYPE_DONATING_PIDS': 0x907, + 'STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO': 0x908, 'STACKSHOT_KCTYPE_KERN_STACKFRAME': 0x90A, 'STACKSHOT_KCTYPE_KERN_STACKFRAME64': 0x90B, 'STACKSHOT_KCTYPE_USER_STACKFRAME': 0x90C, @@ -42,6 +67,21 @@ 'STACKSHOT_KCTYPE_OSVERSION': 0x90F, 'STACKSHOT_KCTYPE_KERN_PAGE_SIZE': 0x910, 'STACKSHOT_KCTYPE_JETSAM_LEVEL': 0x911, + 'STACKSHOT_KCTYPE_DELTA_SINCE_TIMESTAMP': 0x912, + 'STACKSHOT_KCTYPE_TASK_DELTA_SNAPSHOT': 0x940, + 'STACKSHOT_KCTYPE_THREAD_DELTA_SNAPSHOT': 0x941, + + 'STACKSHOT_KCTYPE_KERN_STACKLR': 0x913, + 'STACKSHOT_KCTYPE_KERN_STACKLR64': 0x914, + 'STACKSHOT_KCTYPE_USER_STACKLR': 0x915, + 'STACKSHOT_KCTYPE_USER_STACKLR64': 0x916, + 'STACKSHOT_KCTYPE_NONRUNNABLE_TIDS': 0x917, + 'STACKSHOT_KCTYPE_NONRUNNABLE_TASKS': 0x918, + 'STACKSHOT_KCTYPE_CPU_TIMES': 0x919, + 'STACKSHOT_KCTYPE_STACKSHOT_DURATION': 0x91a, + 'STACKSHOT_KCTYPE_STACKSHOT_FAULT_STATS': 0x91b, + 'STACKSHOT_KCTYPE_KERNELCACHE_LOADINFO': 0x91c, + 'KCDATA_TYPE_BUFFER_END': 0xF19158ED, @@ -51,7 +91,11 @@ 'TASK_CRASHINFO_UUID': 0x804, 'TASK_CRASHINFO_PID': 0x805, 'TASK_CRASHINFO_PPID': 0x806, - 'TASK_CRASHINFO_RUSAGE': 0x807, + + # Don't want anyone using this. 
It's struct rusage from whatever machine generated the data + #'TASK_CRASHINFO_RUSAGE': 0x807, + 'Type_0x807': 0x807, + 'TASK_CRASHINFO_RUSAGE_INFO': 0x808, 'TASK_CRASHINFO_PROC_NAME': 0x809, 'TASK_CRASHINFO_PROC_STARTTIME': 0x80B, @@ -70,14 +114,22 @@ 'TASK_CRASHINFO_RESPONSIBLE_PID': 0x818, 'TASK_CRASHINFO_DIRTY_FLAGS': 0x819, 'TASK_CRASHINFO_CRASHED_THREADID': 0x81A, - - 'KCDATA_BUFFER_BEGIN_CRASHINFO': 0xDEADF157, - 'KCDATA_BUFFER_BEGIN_STACKSHOT': 0x59a25807 + 'TASK_CRASHINFO_COALITION_ID': 0x81B, + 'EXIT_REASON_SNAPSHOT': 0x1001, + 'EXIT_REASON_USER_DESC': 0x1002, + 'EXIT_REASON_USER_PAYLOAD': 0x1003, + 'EXIT_REASON_CODESIGNING_INFO': 0x1004, + 'KCDATA_BUFFER_BEGIN_CRASHINFO': 0xDEADF157, + 'KCDATA_BUFFER_BEGIN_DELTA_STACKSHOT': 0xDE17A59A, + 'KCDATA_BUFFER_BEGIN_STACKSHOT': 0x59a25807, + 'KCDATA_BUFFER_BEGIN_OS_REASON': 0x53A20900, + 'KCDATA_BUFFER_BEGIN_XNUPOST_CONFIG': 0x1E21C09F } kcdata_type_def_rev = dict((v, k) for k, v in kcdata_type_def.iteritems()) KNOWN_TYPES_COLLECTION = {} +KNOWN_TOPLEVEL_CONTAINER_TYPES = () def enum(**args): return type('enum', (), args) @@ -85,6 +137,18 @@ def enum(**args): KCSUBTYPE_TYPE = enum(KC_ST_CHAR=1, KC_ST_INT8=2, KC_ST_UINT8=3, KC_ST_INT16=4, KC_ST_UINT16=5, KC_ST_INT32=6, KC_ST_UINT32=7, KC_ST_INT64=8, KC_ST_UINT64=9) +LEGAL_OLD_STYLE_ARRAY_TYPE_NAMES = ['KCDATA_TYPE_LIBRARY_LOADINFO', + 'KCDATA_TYPE_LIBRARY_LOADINFO64', + 'STACKSHOT_KCTYPE_KERN_STACKFRAME', + 'STACKSHOT_KCTYPE_USER_STACKFRAME', + 'STACKSHOT_KCTYPE_KERN_STACKFRAME64', + 'STACKSHOT_KCTYPE_USER_STACKFRAME64', + 'STACKSHOT_KCTYPE_DONATING_PIDS', + 'STACKSHOT_KCTYPE_THREAD_DELTA_SNAPSHOT'] + +KCDATA_FLAGS_STRUCT_PADDING_MASK = 0xf +KCDATA_FLAGS_STRUCT_HAS_PADDING = 0x80 + class KCSubTypeElement(object): """convert kcdata_subtype_descriptor to """ _unpack_formats = (None, 'c', 'b', 'B', 'h', 'H', 'i', 'I', 'q', 'Q') @@ -176,10 +240,20 @@ def GetStringRepr(self, base_data): str_arr.append(self.GetValueAsString(base_data, i)) return '"' + ''.join(str_arr) 
+ '"' - o = '[' + ','.join([self.GetValueAsString(base_data, i) for i in range(self.count)]) + ']' + + count = self.count + if count > len(base_data)/self.size: + count = len(base_data)/self.size + + o = '[' + ','.join([self.GetValueAsString(base_data, i) for i in range(count)]) + ']' + return o - def GetJsonRepr(self, base_data): + def GetJsonRepr(self, base_data, flags=0): + if (flags & (KCDATA_FLAGS_STRUCT_HAS_PADDING | KCDATA_FLAGS_STRUCT_PADDING_MASK)) != 0: + padding = (flags & KCDATA_FLAGS_STRUCT_PADDING_MASK) + if padding: + base_data = base_data[:-padding] if self.custom_JsonRepr: if self.is_array_type: e_data = [self.GetValue(base_data, i) for i in range(self.count)] @@ -188,14 +262,25 @@ def GetJsonRepr(self, base_data): return self.custom_JsonRepr(e_data, self.name) return self.GetStringRepr(base_data) + def sizeof(self): + return self.totalsize + + def ShouldSkip(self, data): + return len(data) < self.offset + self.totalsize + + def ShouldMerge(self): + return False + class KCTypeDescription(object): - def __init__(self, t_type_id, t_elements=[], t_name='anon', custom_repr=None): + def __init__(self, t_type_id, t_elements=[], t_name='anon', custom_repr=None, legacy_size=None, merge=False): self.type_id = t_type_id self.elements = t_elements self.name = t_name self.totalsize = 0 self.custom_JsonRepr = custom_repr + self.legacy_size = legacy_size + self.merge = merge for e in self.elements: self.totalsize += e.GetTotalSize() @@ -219,12 +304,26 @@ def FromKCTypeDescription(other, t_type_id, t_name): retval = KCTypeDescription(t_type_id, other.elements, t_name, other.custom_JsonRepr) return retval - def GetJsonRepr(self, base_data): + def ShouldMerge(self): + return self.merge + + def GetJsonRepr(self, base_data, flags): + if (flags & (KCDATA_FLAGS_STRUCT_HAS_PADDING | KCDATA_FLAGS_STRUCT_PADDING_MASK)) != 0: + padding = (flags & KCDATA_FLAGS_STRUCT_PADDING_MASK) + if padding: + base_data = base_data[:-padding] + elif self.legacy_size and len(base_data) == 
self.legacy_size + ((-self.legacy_size) & 0xf): + base_data = base_data[:self.legacy_size] if self.custom_JsonRepr: return self.custom_JsonRepr([e.GetValue(base_data) for e in self.elements]) - o = '{' + ", ".join(['"%s": %s' % (e.GetName(), e.GetJsonRepr(base_data)) for e in self.elements]) + '}' + o = ", ".join(['"%s": %s' % (e.GetName(), e.GetJsonRepr(base_data)) for e in self.elements if not e.ShouldSkip(base_data)]) + if not self.merge: + o = '{' + o + '}' return o + def sizeof(self): + return max(st.totalsize + st.offset for st in self.elements) + def GetTypeNameForKey(k): retval = "0x%x" % k @@ -242,85 +341,138 @@ def GetTypeForName(n): return ret +LEGAL_OLD_STYLE_ARRAY_TYPES = map(GetTypeForName, LEGAL_OLD_STYLE_ARRAY_TYPE_NAMES) + +kcdata_type_def_rev[GetTypeForName('KCDATA_BUFFER_BEGIN_STACKSHOT')] = 'kcdata_stackshot' +kcdata_type_def_rev[GetTypeForName('KCDATA_BUFFER_BEGIN_DELTA_STACKSHOT')] = 'kcdata_delta_stackshot' +kcdata_type_def_rev[GetTypeForName('KCDATA_BUFFER_BEGIN_CRASHINFO')] = 'kcdata_crashinfo' +kcdata_type_def_rev[GetTypeForName('KCDATA_BUFFER_BEGIN_OS_REASON')] = 'kcdata_reason' +kcdata_type_def_rev[GetTypeForName('STACKSHOT_KCCONTAINER_TASK')] = 'task_snapshots' +kcdata_type_def_rev[GetTypeForName('STACKSHOT_KCCONTAINER_THREAD')] = 'thread_snapshots' +kcdata_type_def_rev[GetTypeForName('KCDATA_BUFFER_BEGIN_XNUPOST_CONFIG')] = 'xnupost_testconfig' + +class Indent(object): + def __init__(self): + self.n = 0 + def __call__(self, end=False): + if end: + return " " * (self.n-4) + else: + return " " * self.n + @contextlib.contextmanager + def indent(self): + self.n += 4 + try: + yield + finally: + self.n -= 4 + +INDENT = Indent() + class KCObject(object): - """ - """ - def __init__(self, type_code, data, flags=0, field_name=''): + + def __init__(self, type_code, data, offset, flags=0): + self.i_type = type_code self.i_data = data + self.offset = offset self.i_size = len(data) - self.i_name = field_name self.i_flags = flags self.obj_collection 
= [] self.obj = {} self.is_container_type = False self.is_array_type = False self.is_naked_type = False - if not field_name: - self.i_name = GetTypeNameForKey(type_code) + self.nested_kcdata = None + self.i_name = GetTypeNameForKey(type_code) + self.ParseData() + if self.i_type == GetTypeForName('KCDATA_TYPE_CONTAINER_BEGIN'): + self.__class__ = KCContainerObject + + if self.i_type in KNOWN_TOPLEVEL_CONTAINER_TYPES: + self.__class__ = KCBufferObject + + self.InitAfterParse() + + def InitAfterParse(self): + pass + @staticmethod def FromKCItem(kcitem): - return KCObject(kcitem.i_type, kcitem.i_data, kcitem.i_flags) - - def IsContainerType(self): - return self.is_container_type + return KCObject(kcitem.i_type, kcitem.i_data, kcitem.i_offset, kcitem.i_flags) def IsContainerEnd(self): - if self.i_type in (GetTypeForName('KCDATA_TYPE_CONTIANER_END'), GetTypeForName('KCDATA_TYPE_BUFFER_END')): + return self.i_type == GetTypeForName('KCDATA_TYPE_CONTIANER_END') + + def IsBufferEnd(self): + return self.i_type == GetTypeForName('KCDATA_TYPE_BUFFER_END') + + def IsArray(self): + return self.is_array_type + + def ShouldMerge(self): + if self.nested_kcdata: return True - return False + elif not self.is_array_type and self.i_type in KNOWN_TYPES_COLLECTION: + return KNOWN_TYPES_COLLECTION[self.i_type].ShouldMerge() + else: + return False def GetJsonRepr(self): if self.is_array_type: return '[' + ', '.join([i.GetJsonRepr() for i in self.obj_collection]) + ']' - #if self.is_array_type: - # return '"%s" : [' % self.i_name + ', '.join([i.GetJsonRepr() for i in self.obj_collection]) + ']' - if self.is_container_type: - raise NotImplementedError("Containter types should not have come here") if self.i_type in KNOWN_TYPES_COLLECTION: - return KNOWN_TYPES_COLLECTION[self.i_type].GetJsonRepr(self.i_data) + return KNOWN_TYPES_COLLECTION[self.i_type].GetJsonRepr(self.i_data, self.i_flags) if self.is_naked_type: return json.dumps(self.obj) + if self.nested_kcdata: + return 
self.nested_kcdata.GetJsonRepr() raise NotImplementedError("Broken GetJsonRepr implementation") def ParseData(self): + + if self.i_type == GetTypeForName('KCDATA_TYPE_CONTAINER_BEGIN'): - self.is_container_type = True self.obj['uniqID'] = self.i_flags self.i_name = str(self.obj['uniqID']) self.obj['typeID'] = struct.unpack_from('I', self.i_data)[0] + logging.info("0x%08x: %sCONTAINER: %s(%x)" % (self.offset, INDENT(), GetTypeNameForKey(self.obj['typeID']), self.i_flags)) - elif self.i_type in (GetTypeForName('KCDATA_BUFFER_BEGIN_CRASHINFO'), GetTypeForName('KCDATA_BUFFER_BEGIN_STACKSHOT')): - self.is_container_type = True + elif self.i_type in (KNOWN_TOPLEVEL_CONTAINER_TYPES): self.obj['uniqID'] = self.i_name self.obj['typeID'] = self.i_type + logging.info("0x%08x: %s%s" % (self.offset, INDENT(), self.i_name)) elif self.i_type == GetTypeForName('KCDATA_TYPE_CONTIANER_END'): self.obj['uniqID'] = self.i_flags + logging.info("0x%08x: %sEND" % (self.offset, INDENT(end=True))) elif self.i_type == GetTypeForName('KCDATA_TYPE_BUFFER_END'): self.obj = '' + logging.info("0x%08x: %sEND_BUFFER" % (self.offset, INDENT(end=True))) elif self.i_type == GetTypeForName('KCDATA_TYPE_UINT32_DESC'): self.is_naked_type = True u_d = struct.unpack_from('32sI', self.i_data) self.i_name = u_d[0].strip(chr(0)) self.obj = u_d[1] + logging.info("0x%08x: %s%s" % (self.offset, INDENT(), self.i_name)) elif self.i_type == GetTypeForName('KCDATA_TYPE_UINT64_DESC'): self.is_naked_type = True u_d = struct.unpack_from('32sQ', self.i_data) self.i_name = u_d[0].strip(chr(0)) self.obj = u_d[1] + logging.info("0x%08x: %s%s" % (self.offset, INDENT(), self.i_name)) elif self.i_type == GetTypeForName('KCDATA_TYPE_TYPEDEFINTION'): self.is_naked_type = True u_d = struct.unpack_from('II32s', self.i_data) - self.obj['name'] = u_d[2].strip(chr(0)) - self.i_name = "typedef<%s>" % self.obj['name'] + self.obj['name'] = u_d[2].split(chr(0))[0] + self.i_name = "typedef[%s]" % self.obj['name'] self.obj['typeID'] = 
u_d[0] self.obj['numOfFields'] = u_d[1] element_arr = [] @@ -332,41 +484,84 @@ def ParseData(self): #print str(type_desc) self.obj['fields'] = [str(e) for e in element_arr] KNOWN_TYPES_COLLECTION[type_desc.GetTypeID()] = type_desc + logging.info("0x%08x: %s%s" % (self.offset, INDENT(), self.i_name)) elif self.i_type == GetTypeForName('KCDATA_TYPE_ARRAY'): self.is_array_type = True e_t = (self.i_flags >> 32) & 0xffffffff + if e_t not in LEGAL_OLD_STYLE_ARRAY_TYPES: + raise Exception, "illegal old-style array type: %s (0x%x)" % (GetTypeNameForKey(e_t), e_t) e_c = self.i_flags & 0xffffffff - e_s = self.i_size / e_c + e_s = KNOWN_TYPES_COLLECTION[e_t].sizeof() + if e_s * e_c > self.i_size: + raise Exception, "array too small for its count" self.obj['typeID'] = e_t self.i_name = GetTypeNameForKey(e_t) self.i_type = e_t self.obj['numOfElements'] = e_c self.obj['sizeOfElement'] = e_s + logging.info("0x%08x: %sARRAY: %s" % (self.offset, INDENT(), self.i_name)) #populate the array here by recursive creation of KCObject - for _i in range(e_c): - _o = KCObject(e_t, self.i_data[(_i * e_s):(_i * e_s) + e_s]) - self.obj_collection.append(_o) + with INDENT.indent(): + for _i in range(e_c): + _o = KCObject(e_t, self.i_data[(_i * e_s):(_i * e_s) + e_s], self.offset + _i*e_s) + self.obj_collection.append(_o) + + elif self.i_type >= GetTypeForName('KCDATA_TYPE_ARRAY_PAD0') and self.i_type <= GetTypeForName('KCDATA_TYPE_ARRAY_PADf'): + self.is_array_type = True + e_t = (self.i_flags >> 32) & 0xffffffff + e_c = self.i_flags & 0xffffffff + e_s = (self.i_size - (self.i_type & 0xf)) / e_c if e_c != 0 else None + self.obj['typeID'] = e_t + self.i_name = GetTypeNameForKey(e_t) + self.i_type = e_t + self.obj['numOfElements'] = e_c + self.obj['sizeOfElement'] = e_s + logging.info("0x%08x: %sARRAY: %s" % (self.offset, INDENT(), self.i_name)) + #populate the array here by recursive creation of KCObject + with INDENT.indent(): + for _i in range(e_c): + _o = KCObject(e_t, self.i_data[(_i * 
e_s):(_i * e_s) + e_s], self.offset + _i*e_s) + self.obj_collection.append(_o) + + elif self.i_type == GetTypeForName('KCDATA_TYPE_NESTED_KCDATA'): + logging.info("0x%08x: %sNESTED_KCDATA" % (self.offset, INDENT())) + with INDENT.indent(): + nested_iterator = kcdata_item_iterator(self.i_data[:self.i_size]) + nested_buffer = KCObject.FromKCItem(nested_iterator.next()) + if not isinstance(nested_buffer, KCBufferObject): + raise Exception, "nested buffer isn't a KCBufferObject" + nested_buffer.ReadItems(nested_iterator) + self.nested_kcdata = nested_buffer + elif self.i_type in KNOWN_TYPES_COLLECTION: self.i_name = KNOWN_TYPES_COLLECTION[self.i_type].GetName() self.is_naked_type = True + logging.info("0x%08x: %s%s" % (self.offset, INDENT(), self.i_name)) else: self.is_naked_type = True #self.obj = "data of len %d" % len(self.i_data) #self.obj = ''.join(["%x" % ki for ki in struct.unpack('%dB' % len(self.i_data), self.i_data)]) - self.obj = base64.b64encode(self.i_data) + self.obj = map(ord, self.i_data) + logging.info("0x%08x: %s%s" % (self.offset, INDENT(), self.i_name)) class KCContainerObject(KCObject): def __init__(self, *args, **kwargs): - KCObject.__init__(self, *args, **kwargs) + assert False + + def InitAfterParse(self): self.obj_container_dict = {} self.obj_nested_objs = {} + def ShouldMerge(self): + return True + def GetJsonRepr(self): - o = '"%s"' % self.obj['uniqID'] + ' : { "typeID" : %d ,' % self.obj['typeID'] + # o = '"%s"' % self.obj['uniqID'] + ' : { "typeID" : %d ,' % self.obj['typeID'] + o = '"%s"' % self.obj['uniqID'] + ' : { ' for (k, v) in self.obj_container_dict.items(): - if v.IsContainerType(): + if v.ShouldMerge(): o += v.GetJsonRepr() + "," else: o += ' "%s" : ' % k + v.GetJsonRepr() + "," @@ -379,15 +574,51 @@ def GetJsonRepr(self): return o def AddObject(self, kco): - if kco.IsContainerEnd(): - return - if kco.IsContainerType(): + assert not kco.IsContainerEnd() + if isinstance(kco, KCContainerObject): type_name = 
GetTypeNameForKey(kco.obj['typeID']) if type_name not in self.obj_nested_objs: self.obj_nested_objs[type_name] = {} self.obj_nested_objs[type_name][kco.i_name] = kco return - self.obj_container_dict[kco.i_name] = kco + if kco.i_name in self.obj_container_dict: + if kco.IsArray() and self.obj_container_dict[kco.i_name].IsArray(): + self.obj_container_dict[kco.i_name].obj_collection.extend( kco.obj_collection ) + else: + self.obj_container_dict[kco.i_name] = kco + + def IsEndMarker(self, o): + if not o.IsContainerEnd(): + return False + if o.i_flags != self.i_flags: + raise Exception, "container end marker doesn't match" + return True + + no_end_message = "could not find container end marker" + + def ReadItems(self, iterator): + found_end = False + with INDENT.indent(): + for i in iterator: + o = KCObject.FromKCItem(i) + if self.IsEndMarker(o): + found_end = True + break + if isinstance(o, KCContainerObject): + o.ReadItems(iterator) + self.AddObject(o) + if not found_end: + raise Exception, self.no_end_message + + +class KCBufferObject(KCContainerObject): + + def IsEndMarker(self,o): + if o.IsContainerEnd(): + raise Exception, "container end marker at the toplevel" + return o.IsBufferEnd() + + no_end_message = "could not find buffer end marker" class KCData_item: @@ -400,7 +631,7 @@ def __init__(self, item_type, item_size, item_flags, item_data): self.i_size = item_size self.i_flags = item_flags self.i_data = item_data - self._buf_pos = None + self.i_offset = None def __init__(self, barray, pos=0): """ create an object by parsing data from bytes array @@ -411,38 +642,33 @@ def __init__(self, barray, pos=0): self.i_size = struct.unpack('I', barray[pos+4:pos+8])[0] # int.from_bytes(barray[pos+4:pos+8]) self.i_flags = struct.unpack('Q', barray[pos+8:pos+16])[0] # int.from_bytes(barray[pos+8:pos+16]) self.i_data = barray[pos+16: (pos + 16 + self.i_size)] - self._buf_pos = pos + self.i_offset = pos def __len__(self): return self.i_size + KCData_item.header_size def 
GetHeaderDescription(self): - outs = "type: 0x%x size: 0x%x flags: 0x%x" % (self.i_type, self.i_size, self.i_flags) - if not self._buf_pos is None: - outs = "pos: 0x%x" % self._buf_pos + outs + outs = "type: 0x%x size: 0x%x flags: 0x%x (%s)" % (self.i_type, self.i_size, self.i_flags, GetTypeNameForKey(self.i_type)) + if not self.i_offset is None: + outs = "pos: 0x%x" % self.i_offset + outs return outs def __str__(self): return self.GetHeaderDescription() - -def kcdata_item_iterator(filename): - if not filename: - return - with open(filename, "r+b") as f: - fmap = mmap.mmap(f.fileno(), 0) - file_len = len(fmap) - curpos = 0 - while curpos < file_len: - item = KCData_item(fmap, curpos) - yield item - curpos += len(item) - fmap.close() - +def kcdata_item_iterator(data): + file_len = len(data) + curpos = 0 + while curpos < file_len: + item = KCData_item(data, curpos) + yield item + curpos += len(item) def _get_data_element(elementValues): return json.dumps(elementValues[-1]) +KNOWN_TOPLEVEL_CONTAINER_TYPES = map(GetTypeForName, ('KCDATA_BUFFER_BEGIN_CRASHINFO', 'KCDATA_BUFFER_BEGIN_STACKSHOT', 'KCDATA_BUFFER_BEGIN_DELTA_STACKSHOT', 'KCDATA_BUFFER_BEGIN_OS_REASON','KCDATA_BUFFER_BEGIN_XNUPOST_CONFIG')) + KNOWN_TYPES_COLLECTION[GetTypeForName('KCDATA_TYPE_UINT32_DESC')] = KCTypeDescription(GetTypeForName('KCDATA_TYPE_UINT32_DESC'), ( KCSubTypeElement('desc', KCSUBTYPE_TYPE.KC_ST_CHAR, KCSubTypeElement.GetSizeForArray(32, 1), 0, 1), KCSubTypeElement('data', KCSUBTYPE_TYPE.KC_ST_UINT32, 4, 32, 0) @@ -460,29 +686,29 @@ def _get_data_element(elementValues): ) KNOWN_TYPES_COLLECTION[GetTypeForName('KCDATA_TYPE_TIMEBASE')] = KCTypeDescription(GetTypeForName('KCDATA_TYPE_TIMEBASE'), ( - KCSubTypeElement('numerator', KCSUBTYPE_TYPE.KC_ST_UINT32, 4, 0, 0), - KCSubTypeElement('denominator', KCSUBTYPE_TYPE.KC_ST_UINT32, 8, 4, 0) + KCSubTypeElement('numer', KCSUBTYPE_TYPE.KC_ST_UINT32, 4, 0, 0), + KCSubTypeElement('denom', KCSUBTYPE_TYPE.KC_ST_UINT32, 4, 4, 0) ), - 'timebase_info' + 
'mach_timebase_info' ) STACKSHOT_IO_NUM_PRIORITIES = 4 KNOWN_TYPES_COLLECTION[0x901] = KCTypeDescription(0x901, ( - KCSubTypeElement.FromBasicCtype('disk_reads_count', KCSUBTYPE_TYPE.KC_ST_UINT64, 0), - KCSubTypeElement.FromBasicCtype('disk_reads_size', KCSUBTYPE_TYPE.KC_ST_UINT64, 8), - KCSubTypeElement.FromBasicCtype('disk_writes_count', KCSUBTYPE_TYPE.KC_ST_UINT64, 16), - KCSubTypeElement.FromBasicCtype('disk_writes_size', KCSUBTYPE_TYPE.KC_ST_UINT64, 24), - KCSubTypeElement('io_priority_count', KCSUBTYPE_TYPE.KC_ST_UINT64, KCSubTypeElement.GetSizeForArray(STACKSHOT_IO_NUM_PRIORITIES, 8), 32, 1), - KCSubTypeElement('io_priority_size', KCSUBTYPE_TYPE.KC_ST_UINT64, KCSubTypeElement.GetSizeForArray(STACKSHOT_IO_NUM_PRIORITIES, 8), 32 + (STACKSHOT_IO_NUM_PRIORITIES * 8), 1), - KCSubTypeElement.FromBasicCtype('paging_count', KCSUBTYPE_TYPE.KC_ST_UINT64, 32 + 2 * (STACKSHOT_IO_NUM_PRIORITIES * 8)), - KCSubTypeElement.FromBasicCtype('paging_size', KCSUBTYPE_TYPE.KC_ST_UINT64, 40 + 2 * (STACKSHOT_IO_NUM_PRIORITIES * 8)), - KCSubTypeElement.FromBasicCtype('non_paging_count', KCSUBTYPE_TYPE.KC_ST_UINT64, 48 + 2 * (STACKSHOT_IO_NUM_PRIORITIES * 8)), - KCSubTypeElement.FromBasicCtype('non_paging_size', KCSUBTYPE_TYPE.KC_ST_UINT64, 56 + 2 * (STACKSHOT_IO_NUM_PRIORITIES * 8)), - KCSubTypeElement.FromBasicCtype('data_count', KCSUBTYPE_TYPE.KC_ST_UINT64, 64 + 2 * (STACKSHOT_IO_NUM_PRIORITIES * 8)), - KCSubTypeElement.FromBasicCtype('data_size', KCSUBTYPE_TYPE.KC_ST_UINT64, 72 + 2 * (STACKSHOT_IO_NUM_PRIORITIES * 8)), - KCSubTypeElement.FromBasicCtype('metadata_count', KCSUBTYPE_TYPE.KC_ST_UINT64, 80 + 2 * (STACKSHOT_IO_NUM_PRIORITIES * 8)), - KCSubTypeElement.FromBasicCtype('metadata_size', KCSUBTYPE_TYPE.KC_ST_UINT64, 88 + 2 * (STACKSHOT_IO_NUM_PRIORITIES * 8)) + KCSubTypeElement.FromBasicCtype('ss_disk_reads_count', KCSUBTYPE_TYPE.KC_ST_UINT64, 0), + KCSubTypeElement.FromBasicCtype('ss_disk_reads_size', KCSUBTYPE_TYPE.KC_ST_UINT64, 8), + 
KCSubTypeElement.FromBasicCtype('ss_disk_writes_count', KCSUBTYPE_TYPE.KC_ST_UINT64, 16), + KCSubTypeElement.FromBasicCtype('ss_disk_writes_size', KCSUBTYPE_TYPE.KC_ST_UINT64, 24), + KCSubTypeElement('ss_io_priority_count', KCSUBTYPE_TYPE.KC_ST_UINT64, KCSubTypeElement.GetSizeForArray(STACKSHOT_IO_NUM_PRIORITIES, 8), 32, 1), + KCSubTypeElement('ss_io_priority_size', KCSUBTYPE_TYPE.KC_ST_UINT64, KCSubTypeElement.GetSizeForArray(STACKSHOT_IO_NUM_PRIORITIES, 8), 32 + (STACKSHOT_IO_NUM_PRIORITIES * 8), 1), + KCSubTypeElement.FromBasicCtype('ss_paging_count', KCSUBTYPE_TYPE.KC_ST_UINT64, 32 + 2 * (STACKSHOT_IO_NUM_PRIORITIES * 8)), + KCSubTypeElement.FromBasicCtype('ss_paging_size', KCSUBTYPE_TYPE.KC_ST_UINT64, 40 + 2 * (STACKSHOT_IO_NUM_PRIORITIES * 8)), + KCSubTypeElement.FromBasicCtype('ss_non_paging_count', KCSUBTYPE_TYPE.KC_ST_UINT64, 48 + 2 * (STACKSHOT_IO_NUM_PRIORITIES * 8)), + KCSubTypeElement.FromBasicCtype('ss_non_paging_size', KCSUBTYPE_TYPE.KC_ST_UINT64, 56 + 2 * (STACKSHOT_IO_NUM_PRIORITIES * 8)), + KCSubTypeElement.FromBasicCtype('ss_data_count', KCSUBTYPE_TYPE.KC_ST_UINT64, 64 + 2 * (STACKSHOT_IO_NUM_PRIORITIES * 8)), + KCSubTypeElement.FromBasicCtype('ss_data_size', KCSUBTYPE_TYPE.KC_ST_UINT64, 72 + 2 * (STACKSHOT_IO_NUM_PRIORITIES * 8)), + KCSubTypeElement.FromBasicCtype('ss_metadata_count', KCSUBTYPE_TYPE.KC_ST_UINT64, 80 + 2 * (STACKSHOT_IO_NUM_PRIORITIES * 8)), + KCSubTypeElement.FromBasicCtype('ss_metadata_size', KCSUBTYPE_TYPE.KC_ST_UINT64, 88 + 2 * (STACKSHOT_IO_NUM_PRIORITIES * 8)) ), 'io_statistics' ) @@ -510,71 +736,121 @@ def _get_data_element(elementValues): KNOWN_TYPES_COLLECTION[0x905] = KCTypeDescription(0x905, ( - KCSubTypeElement.FromBasicCtype('unique_pid', KCSUBTYPE_TYPE.KC_ST_UINT64, 0), - KCSubTypeElement.FromBasicCtype('ss_flags', KCSUBTYPE_TYPE.KC_ST_UINT64, 8), - KCSubTypeElement.FromBasicCtype('user_time_in_terminated_threads', KCSUBTYPE_TYPE.KC_ST_UINT64, 16), - 
KCSubTypeElement.FromBasicCtype('system_time_in_terminated_threads', KCSUBTYPE_TYPE.KC_ST_UINT64, 24), - KCSubTypeElement.FromBasicCtype('p_start_sec', KCSUBTYPE_TYPE.KC_ST_UINT64, 32), - KCSubTypeElement.FromBasicCtype('task_size', KCSUBTYPE_TYPE.KC_ST_UINT64, 40), - KCSubTypeElement.FromBasicCtype('task_max_resident_size', KCSUBTYPE_TYPE.KC_ST_UINT64, 48), - KCSubTypeElement.FromBasicCtype('suspend_count', KCSUBTYPE_TYPE.KC_ST_UINT32, 56), - KCSubTypeElement.FromBasicCtype('faults', KCSUBTYPE_TYPE.KC_ST_UINT32, 60), - KCSubTypeElement.FromBasicCtype('pageins', KCSUBTYPE_TYPE.KC_ST_UINT32, 64), - KCSubTypeElement.FromBasicCtype('cow_faults', KCSUBTYPE_TYPE.KC_ST_UINT32, 68), - KCSubTypeElement.FromBasicCtype('was_throttled', KCSUBTYPE_TYPE.KC_ST_UINT32, 72), - KCSubTypeElement.FromBasicCtype('did_throttle', KCSUBTYPE_TYPE.KC_ST_UINT32, 76), - KCSubTypeElement.FromBasicCtype('latency_qos', KCSUBTYPE_TYPE.KC_ST_UINT32, 80), - KCSubTypeElement.FromBasicCtype('pid', KCSUBTYPE_TYPE.KC_ST_INT32, 84), - KCSubTypeElement('p_comm', KCSUBTYPE_TYPE.KC_ST_CHAR, KCSubTypeElement.GetSizeForArray(32, 1), 88, 1) + KCSubTypeElement.FromBasicCtype('ts_unique_pid', KCSUBTYPE_TYPE.KC_ST_UINT64, 0), + KCSubTypeElement.FromBasicCtype('ts_ss_flags', KCSUBTYPE_TYPE.KC_ST_UINT64, 8), + KCSubTypeElement.FromBasicCtype('ts_user_time_in_terminated_thre', KCSUBTYPE_TYPE.KC_ST_UINT64, 16), + KCSubTypeElement.FromBasicCtype('ts_system_time_in_terminated_th', KCSUBTYPE_TYPE.KC_ST_UINT64, 24), + KCSubTypeElement.FromBasicCtype('ts_p_start_sec', KCSUBTYPE_TYPE.KC_ST_UINT64, 32), + KCSubTypeElement.FromBasicCtype('ts_task_size', KCSUBTYPE_TYPE.KC_ST_UINT64, 40), + KCSubTypeElement.FromBasicCtype('ts_max_resident_size', KCSUBTYPE_TYPE.KC_ST_UINT64, 48), + KCSubTypeElement.FromBasicCtype('ts_suspend_count', KCSUBTYPE_TYPE.KC_ST_UINT32, 56), + KCSubTypeElement.FromBasicCtype('ts_faults', KCSUBTYPE_TYPE.KC_ST_UINT32, 60), + KCSubTypeElement.FromBasicCtype('ts_pageins', KCSUBTYPE_TYPE.KC_ST_UINT32, 64), 
+ KCSubTypeElement.FromBasicCtype('ts_cow_faults', KCSUBTYPE_TYPE.KC_ST_UINT32, 68), + KCSubTypeElement.FromBasicCtype('ts_was_throttled', KCSUBTYPE_TYPE.KC_ST_UINT32, 72), + KCSubTypeElement.FromBasicCtype('ts_did_throttle', KCSUBTYPE_TYPE.KC_ST_UINT32, 76), + KCSubTypeElement.FromBasicCtype('ts_latency_qos', KCSUBTYPE_TYPE.KC_ST_UINT32, 80), + KCSubTypeElement.FromBasicCtype('ts_pid', KCSUBTYPE_TYPE.KC_ST_INT32, 84), + KCSubTypeElement('ts_p_comm', KCSUBTYPE_TYPE.KC_ST_CHAR, KCSubTypeElement.GetSizeForArray(32, 1), 88, 1) ), - 'task_snapshot_v2' + 'task_snapshot' ) KNOWN_TYPES_COLLECTION[0x906] = KCTypeDescription(0x906, ( - KCSubTypeElement.FromBasicCtype('thread_id', KCSUBTYPE_TYPE.KC_ST_UINT64, 0), - KCSubTypeElement.FromBasicCtype('wait_event', KCSUBTYPE_TYPE.KC_ST_UINT64, 8), - KCSubTypeElement.FromBasicCtype('continuation', KCSUBTYPE_TYPE.KC_ST_UINT64, 16), - KCSubTypeElement.FromBasicCtype('total_syscalls', KCSUBTYPE_TYPE.KC_ST_UINT64, 24), - KCSubTypeElement.FromBasicCtype('voucher_identifier', KCSUBTYPE_TYPE.KC_ST_UINT64, 32), - KCSubTypeElement.FromBasicCtype('dqserialnum', KCSUBTYPE_TYPE.KC_ST_UINT64, 40), - KCSubTypeElement.FromBasicCtype('user_time', KCSUBTYPE_TYPE.KC_ST_UINT64, 48), - KCSubTypeElement.FromBasicCtype('sys_time', KCSUBTYPE_TYPE.KC_ST_UINT64, 56), - KCSubTypeElement.FromBasicCtype('ss_flags', KCSUBTYPE_TYPE.KC_ST_UINT64, 64), - KCSubTypeElement.FromBasicCtype('last_run_time', KCSUBTYPE_TYPE.KC_ST_UINT64, 72), - KCSubTypeElement.FromBasicCtype('last_made_runnable_time', KCSUBTYPE_TYPE.KC_ST_UINT64, 80), - KCSubTypeElement.FromBasicCtype('state', KCSUBTYPE_TYPE.KC_ST_UINT32, 88), - KCSubTypeElement.FromBasicCtype('sched_flags', KCSUBTYPE_TYPE.KC_ST_UINT32, 92), - KCSubTypeElement.FromBasicCtype('base_priority', KCSUBTYPE_TYPE.KC_ST_INT16, 96), - KCSubTypeElement.FromBasicCtype('sched_priority', KCSUBTYPE_TYPE.KC_ST_INT16, 98), - KCSubTypeElement.FromBasicCtype('ts_eqos', KCSUBTYPE_TYPE.KC_ST_UINT8, 100), - 
KCSubTypeElement.FromBasicCtype('ts_rqos', KCSUBTYPE_TYPE.KC_ST_UINT8, 101), - KCSubTypeElement.FromBasicCtype('ts_rqos_override', KCSUBTYPE_TYPE.KC_ST_UINT8, 102), - KCSubTypeElement.FromBasicCtype('io_tier', KCSUBTYPE_TYPE.KC_ST_UINT8, 103), + KCSubTypeElement.FromBasicCtype('ths_thread_id', KCSUBTYPE_TYPE.KC_ST_UINT64, 0), + KCSubTypeElement.FromBasicCtype('ths_wait_event', KCSUBTYPE_TYPE.KC_ST_UINT64, 8), + KCSubTypeElement.FromBasicCtype('ths_continuation', KCSUBTYPE_TYPE.KC_ST_UINT64, 16), + KCSubTypeElement.FromBasicCtype('ths_total_syscalls', KCSUBTYPE_TYPE.KC_ST_UINT64, 24), + KCSubTypeElement.FromBasicCtype('ths_voucher_identifier', KCSUBTYPE_TYPE.KC_ST_UINT64, 32), + KCSubTypeElement.FromBasicCtype('ths_dqserialnum', KCSUBTYPE_TYPE.KC_ST_UINT64, 40), + KCSubTypeElement.FromBasicCtype('ths_user_time', KCSUBTYPE_TYPE.KC_ST_UINT64, 48), + KCSubTypeElement.FromBasicCtype('ths_sys_time', KCSUBTYPE_TYPE.KC_ST_UINT64, 56), + KCSubTypeElement.FromBasicCtype('ths_ss_flags', KCSUBTYPE_TYPE.KC_ST_UINT64, 64), + KCSubTypeElement.FromBasicCtype('ths_last_run_time', KCSUBTYPE_TYPE.KC_ST_UINT64, 72), + KCSubTypeElement.FromBasicCtype('ths_last_made_runnable_time', KCSUBTYPE_TYPE.KC_ST_UINT64, 80), + KCSubTypeElement.FromBasicCtype('ths_state', KCSUBTYPE_TYPE.KC_ST_UINT32, 88), + KCSubTypeElement.FromBasicCtype('ths_sched_flags', KCSUBTYPE_TYPE.KC_ST_UINT32, 92), + KCSubTypeElement.FromBasicCtype('ths_base_priority', KCSUBTYPE_TYPE.KC_ST_INT16, 96), + KCSubTypeElement.FromBasicCtype('ths_sched_priority', KCSUBTYPE_TYPE.KC_ST_INT16, 98), + KCSubTypeElement.FromBasicCtype('ths_eqos', KCSUBTYPE_TYPE.KC_ST_UINT8, 100), + KCSubTypeElement.FromBasicCtype('ths_rqos', KCSUBTYPE_TYPE.KC_ST_UINT8, 101), + KCSubTypeElement.FromBasicCtype('ths_rqos_override', KCSUBTYPE_TYPE.KC_ST_UINT8, 102), + KCSubTypeElement.FromBasicCtype('ths_io_tier', KCSUBTYPE_TYPE.KC_ST_UINT8, 103), + KCSubTypeElement.FromBasicCtype('ths_thread_t', KCSUBTYPE_TYPE.KC_ST_UINT64, 104), +), + 'thread_snapshot', 
+ legacy_size = 0x68 +) + +KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_THREAD_DELTA_SNAPSHOT')] = KCTypeDescription(GetTypeForName('STACKSHOT_KCTYPE_THREAD_DELTA_SNAPSHOT'), ( + KCSubTypeElement.FromBasicCtype('tds_thread_id', KCSUBTYPE_TYPE.KC_ST_UINT64, 0), + KCSubTypeElement.FromBasicCtype('tds_voucher_identifier', KCSUBTYPE_TYPE.KC_ST_UINT64, 8), + KCSubTypeElement.FromBasicCtype('tds_ss_flags', KCSUBTYPE_TYPE.KC_ST_UINT64, 16), + KCSubTypeElement.FromBasicCtype('tds_last_made_runnable_time', KCSUBTYPE_TYPE.KC_ST_UINT64, 24), + KCSubTypeElement.FromBasicCtype('tds_state', KCSUBTYPE_TYPE.KC_ST_UINT32, 32), + KCSubTypeElement.FromBasicCtype('tds_sched_flags', KCSUBTYPE_TYPE.KC_ST_UINT32, 36), + KCSubTypeElement.FromBasicCtype('tds_base_priority', KCSUBTYPE_TYPE.KC_ST_INT16, 40), + KCSubTypeElement.FromBasicCtype('tds_sched_priority', KCSUBTYPE_TYPE.KC_ST_INT16, 42), + KCSubTypeElement.FromBasicCtype('tds_eqos', KCSUBTYPE_TYPE.KC_ST_UINT8, 44), + KCSubTypeElement.FromBasicCtype('tds_rqos', KCSUBTYPE_TYPE.KC_ST_UINT8, 45), + KCSubTypeElement.FromBasicCtype('tds_rqos_override', KCSUBTYPE_TYPE.KC_ST_UINT8, 46), + KCSubTypeElement.FromBasicCtype('tds_io_tier', KCSUBTYPE_TYPE.KC_ST_UINT8, 47), +), + 'thread_delta_snapshot' +) + +KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_TASK_DELTA_SNAPSHOT')] = KCTypeDescription(GetTypeForName('STACKSHOT_KCTYPE_TASK_DELTA_SNAPSHOT'), ( + KCSubTypeElement.FromBasicCtype('tds_unique_pid', KCSUBTYPE_TYPE.KC_ST_UINT64, 0), + KCSubTypeElement.FromBasicCtype('tds_ss_flags', KCSUBTYPE_TYPE.KC_ST_UINT64, 8), + KCSubTypeElement.FromBasicCtype('tds_user_time_in_terminated_thr', KCSUBTYPE_TYPE.KC_ST_UINT64, 16), + KCSubTypeElement.FromBasicCtype('tds_system_time_in_terminated_t', KCSUBTYPE_TYPE.KC_ST_UINT64, 24), + KCSubTypeElement.FromBasicCtype('tds_task_size', KCSUBTYPE_TYPE.KC_ST_UINT64, 32), + KCSubTypeElement.FromBasicCtype('tds_max_resident_size', KCSUBTYPE_TYPE.KC_ST_UINT64, 40), + 
KCSubTypeElement.FromBasicCtype('tds_suspend_count', KCSUBTYPE_TYPE.KC_ST_UINT32, 48), + KCSubTypeElement.FromBasicCtype('tds_faults', KCSUBTYPE_TYPE.KC_ST_UINT32, 52), + KCSubTypeElement.FromBasicCtype('tds_pageins', KCSUBTYPE_TYPE.KC_ST_UINT32, 56), + KCSubTypeElement.FromBasicCtype('tds_cow_faults', KCSUBTYPE_TYPE.KC_ST_UINT32, 60), + KCSubTypeElement.FromBasicCtype('tds_was_throttled', KCSUBTYPE_TYPE.KC_ST_UINT32, 64), + KCSubTypeElement.FromBasicCtype('tds_did_throttle', KCSUBTYPE_TYPE.KC_ST_UINT32, 68), + KCSubTypeElement.FromBasicCtype('tds_latency_qos', KCSUBTYPE_TYPE.KC_ST_UINT32, 72), ), - 'thread_snapshot_v2' + 'task_delta_snapshot' ) + KNOWN_TYPES_COLLECTION[0x909] = KCSubTypeElement('pth_name', KCSUBTYPE_TYPE.KC_ST_CHAR, KCSubTypeElement.GetSizeForArray(64, 1), 0, 1) -def _get_uuid_json_data(elementValues, elementName): - return '"<%s>"' % ''.join("%02x" % i for i in elementValues) KNOWN_TYPES_COLLECTION[GetTypeForName('KCDATA_TYPE_LIBRARY_LOADINFO64')] = KCTypeDescription(GetTypeForName('KCDATA_TYPE_LIBRARY_LOADINFO64'), ( - KCSubTypeElement('loadAddress', KCSUBTYPE_TYPE.KC_ST_UINT64, 8, 0, 0), - KCSubTypeElement('imageUUID', KCSUBTYPE_TYPE.KC_ST_UINT8, KCSubTypeElement.GetSizeForArray(16, 1), 8, 1, _get_uuid_json_data) + KCSubTypeElement('imageLoadAddress', KCSUBTYPE_TYPE.KC_ST_UINT64, 8, 0, 0), + KCSubTypeElement('imageUUID', KCSUBTYPE_TYPE.KC_ST_UINT8, KCSubTypeElement.GetSizeForArray(16, 1), 8, 1) ), 'dyld_load_info' ) KNOWN_TYPES_COLLECTION[GetTypeForName('KCDATA_TYPE_LIBRARY_LOADINFO')] = KCTypeDescription(GetTypeForName('KCDATA_TYPE_LIBRARY_LOADINFO'), ( - KCSubTypeElement('loadAddress', KCSUBTYPE_TYPE.KC_ST_UINT32, 4, 0, 0), - KCSubTypeElement('imageUUID', KCSUBTYPE_TYPE.KC_ST_UINT8, KCSubTypeElement.GetSizeForArray(16, 1), 4, 1, _get_uuid_json_data) + KCSubTypeElement('imageLoadAddress', KCSUBTYPE_TYPE.KC_ST_UINT32, 4, 0, 0), + KCSubTypeElement('imageUUID', KCSUBTYPE_TYPE.KC_ST_UINT8, KCSubTypeElement.GetSizeForArray(16, 1), 4, 1) ), 
'dyld_load_info' ) -KNOWN_TYPES_COLLECTION[0x908] = KCTypeDescription.FromKCTypeDescription(KNOWN_TYPES_COLLECTION[GetTypeForName('KCDATA_TYPE_LIBRARY_LOADINFO64')], 0x908, 'shared_cache_dyld_info') +KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO')] = KCTypeDescription(GetTypeForName('STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO'), ( + KCSubTypeElement('imageLoadAddress', KCSUBTYPE_TYPE.KC_ST_UINT64, 8, 0, 0), + KCSubTypeElement('imageUUID', KCSUBTYPE_TYPE.KC_ST_UINT8, KCSubTypeElement.GetSizeForArray(16, 1), 8, 1), + KCSubTypeElement('imageSlidBaseAddress', KCSUBTYPE_TYPE.KC_ST_UINT64, 8, 24, 0), +), + 'shared_cache_dyld_load_info', + legacy_size = 0x18 +) + +KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_KERNELCACHE_LOADINFO')] = KCTypeDescription(GetTypeForName('STACKSHOT_KCTYPE_KERNELCACHE_LOADINFO'), ( + KCSubTypeElement('imageLoadAddress', KCSUBTYPE_TYPE.KC_ST_UINT64, 8, 0, 0), + KCSubTypeElement('imageUUID', KCSUBTYPE_TYPE.KC_ST_UINT8, KCSubTypeElement.GetSizeForArray(16, 1), 8, 1), +), + 'kernelcache_load_info' +) KNOWN_TYPES_COLLECTION[0x33] = KCSubTypeElement('mach_absolute_time', KCSUBTYPE_TYPE.KC_ST_UINT64, 8, 0, 0, KCSubTypeElement._get_naked_element_value) KNOWN_TYPES_COLLECTION[0x907] = KCSubTypeElement.FromBasicCtype('donating_pids', KCSUBTYPE_TYPE.KC_ST_INT32) @@ -588,12 +864,25 @@ def _get_uuid_json_data(elementValues, elementName): 'kernel_stack_frames' ) +KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_KERN_STACKLR')] = KCTypeDescription(GetTypeForName('STACKSHOT_KCTYPE_KERN_STACKLR'), ( + KCSubTypeElement.FromBasicCtype('lr', KCSUBTYPE_TYPE.KC_ST_UINT32), +), + 'kernel_stack_frames' +) + + KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_USER_STACKFRAME')] = KCTypeDescription.FromKCTypeDescription( KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_KERN_STACKFRAME')], GetTypeForName('STACKSHOT_KCTYPE_USER_STACKFRAME'), 'user_stack_frames' ) 
+KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_USER_STACKLR')] = KCTypeDescription.FromKCTypeDescription( + KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_KERN_STACKLR')], + GetTypeForName('STACKSHOT_KCTYPE_USER_STACKLR'), + 'user_stack_frames' +) + KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_KERN_STACKFRAME64')] = KCTypeDescription(GetTypeForName('STACKSHOT_KCTYPE_KERN_STACKFRAME64'), ( KCSubTypeElement.FromBasicCtype('lr', KCSUBTYPE_TYPE.KC_ST_UINT64), KCSubTypeElement.FromBasicCtype('sp', KCSUBTYPE_TYPE.KC_ST_UINT64, 8) @@ -607,16 +896,44 @@ def _get_uuid_json_data(elementValues, elementName): 'user_stack_frames' ) + +KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_KERN_STACKLR64')] = KCTypeDescription(GetTypeForName('STACKSHOT_KCTYPE_KERN_STACKLR64'), ( + KCSubTypeElement.FromBasicCtype('lr', KCSUBTYPE_TYPE.KC_ST_UINT64), +), + 'kernel_stack_frames' +) + +KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_USER_STACKLR64')] = KCTypeDescription.FromKCTypeDescription( + KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_KERN_STACKLR64')], + GetTypeForName('STACKSHOT_KCTYPE_USER_STACKLR64'), + 'user_stack_frames' +) + + +KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_NONRUNNABLE_TIDS')] = KCSubTypeElement.FromBasicCtype('nonrunnable_threads', KCSUBTYPE_TYPE.KC_ST_INT64) + +KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_NONRUNNABLE_TASKS')] = KCSubTypeElement.FromBasicCtype('nonrunnable_tasks', KCSUBTYPE_TYPE.KC_ST_INT64) + KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_OSVERSION')] = KCSubTypeElement('osversion', KCSUBTYPE_TYPE.KC_ST_CHAR, KCSubTypeElement.GetSizeForArray(256, 1), 0, 1) -KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_BOOTARGS')] = KCSubTypeElement('bootargs', KCSUBTYPE_TYPE.KC_ST_CHAR, +KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_BOOTARGS')] = KCSubTypeElement('boot_args', KCSUBTYPE_TYPE.KC_ST_CHAR, KCSubTypeElement.GetSizeForArray(256, 1), 0, 1) 
KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_KERN_PAGE_SIZE')] = KCSubTypeElement('kernel_page_size', KCSUBTYPE_TYPE.KC_ST_UINT32, 4, 0, 0, KCSubTypeElement._get_naked_element_value) KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_JETSAM_LEVEL')] = KCSubTypeElement('jetsam_level', KCSUBTYPE_TYPE.KC_ST_UINT32, 4, 0, 0, KCSubTypeElement._get_naked_element_value) +KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_DELTA_SINCE_TIMESTAMP')] = KCSubTypeElement("stackshot_delta_since_timestamp", KCSUBTYPE_TYPE.KC_ST_UINT64, 8, 0, 0, KCSubTypeElement._get_naked_element_value) + +KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_STACKSHOT_FAULT_STATS')] = KCTypeDescription(GetTypeForName('STACKSHOT_KCTYPE_STACKSHOT_FAULT_STATS'), + ( + KCSubTypeElement.FromBasicCtype('sfs_pages_faulted_in', KCSUBTYPE_TYPE.KC_ST_UINT32, 0), + KCSubTypeElement.FromBasicCtype('sfs_time_spent_faulting', KCSUBTYPE_TYPE.KC_ST_UINT64, 4), + KCSubTypeElement.FromBasicCtype('sfs_system_max_fault_time', KCSUBTYPE_TYPE.KC_ST_UINT64, 12), + KCSubTypeElement.FromBasicCtype('sfs_stopped_faulting', KCSUBTYPE_TYPE.KC_ST_UINT8, 20) + ), + 'stackshot_fault_stats') #KNOWN_TYPES_COLLECTION[0x907] = KCSubTypeElement('donating_pids', KCSUBTYPE_TYPE.KC_ST_UINT32, 4, 0, 0, KCSubTypeElement._get_naked_element_value) KNOWN_TYPES_COLLECTION[GetTypeForName('TASK_CRASHINFO_PID')] = KCSubTypeElement('pid', KCSUBTYPE_TYPE.KC_ST_INT32, 4, 0, 0) @@ -637,6 +954,7 @@ def _get_uuid_json_data(elementValues, elementName): KNOWN_TYPES_COLLECTION[GetTypeForName('TASK_CRASHINFO_RESPONSIBLE_PID')] = KCSubTypeElement('responsible_pid', KCSUBTYPE_TYPE.KC_ST_INT32, 4, 0, 0) KNOWN_TYPES_COLLECTION[GetTypeForName('TASK_CRASHINFO_DIRTY_FLAGS')] = KCSubTypeElement('dirty_flags', KCSUBTYPE_TYPE.KC_ST_INT32, 4, 0, 0) KNOWN_TYPES_COLLECTION[GetTypeForName('TASK_CRASHINFO_CRASHED_THREADID')] = KCSubTypeElement('crashed_threadid', KCSUBTYPE_TYPE.KC_ST_UINT64, 8, 0, 0) 
+KNOWN_TYPES_COLLECTION[GetTypeForName('TASK_CRASHINFO_COALITION_ID')] = KCSubTypeElement('coalition_id', KCSUBTYPE_TYPE.KC_ST_UINT64, 8, 0, 0) KNOWN_TYPES_COLLECTION[GetTypeForName('TASK_CRASHINFO_PROC_STATUS')] = KCSubTypeElement('p_status', KCSUBTYPE_TYPE.KC_ST_UINT8, 1, 0, 0) @@ -647,8 +965,19 @@ def _get_uuid_json_data(elementValues, elementName): ), 'proc_uniqidentifierinfo') -KNOWN_TYPES_COLLECTION[GetTypeForName('TASK_CRASHINFO_EXCEPTION_CODES')] = KCSubTypeElement('TASK_CRASHINFO_EXCEPTION_CODES', KCSUBTYPE_TYPE.KC_ST_INT64, - KCSubTypeElement.GetSizeForArray(2,8), 0, 1) +KNOWN_TYPES_COLLECTION[GetTypeForName('TASK_CRASHINFO_EXCEPTION_CODES')] = ( + KCTypeDescription(GetTypeForName('TASK_CRASHINFO_EXCEPTION_CODES'), + (KCSubTypeElement.FromBasicCtype('code_0', KCSUBTYPE_TYPE.KC_ST_UINT64, 0), + KCSubTypeElement.FromBasicCtype('code_1', KCSUBTYPE_TYPE.KC_ST_UINT64, 8)), + 'mach_exception_data_t')) + + +KNOWN_TYPES_COLLECTION[GetTypeForName('TASK_CRASHINFO_PROC_STARTTIME')] = ( + KCTypeDescription(GetTypeForName('TASK_CRASHINFO_PROC_STARTTIME'), + (KCSubTypeElement.FromBasicCtype('tv_sec', KCSUBTYPE_TYPE.KC_ST_UINT64, 0), + KCSubTypeElement.FromBasicCtype('tv_usec', KCSUBTYPE_TYPE.KC_ST_UINT64, 8)), + 'proc_starttime')) + KNOWN_TYPES_COLLECTION[GetTypeForName('TASK_CRASHINFO_RUSAGE_INFO')] = KCTypeDescription(GetTypeForName('TASK_CRASHINFO_RUSAGE_INFO'), ( @@ -677,14 +1006,68 @@ def _get_uuid_json_data(elementValues, elementName): KCSubTypeElement.FromBasicCtype('ri_cpu_time_qos_utility', KCSUBTYPE_TYPE.KC_ST_UINT64, 184), KCSubTypeElement.FromBasicCtype('ri_cpu_time_qos_legacy', KCSUBTYPE_TYPE.KC_ST_UINT64, 192), KCSubTypeElement.FromBasicCtype('ri_cpu_time_qos_user_initiated', KCSUBTYPE_TYPE.KC_ST_UINT64, 200), - KCSubTypeElement.FromBasicCtype('ri_cpu_time_qos_user_interactive', KCSUBTYPE_TYPE.KC_ST_UINT64, 208), + KCSubTypeElement.FromBasicCtype('ri_cpu_time_qos_user_interactiv', KCSUBTYPE_TYPE.KC_ST_UINT64, 208), 
KCSubTypeElement.FromBasicCtype('ri_billed_system_time', KCSUBTYPE_TYPE.KC_ST_UINT64, 216), KCSubTypeElement.FromBasicCtype('ri_serviced_system_time', KCSUBTYPE_TYPE.KC_ST_UINT64, 224) ), - 'rusage_info_v3') + 'rusage_info') + +KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_CPU_TIMES')] = KCTypeDescription(GetTypeForName('STACKSHOT_KCTYPE_CPU_TIMES'), + ( + KCSubTypeElement.FromBasicCtype('user_usec', KCSUBTYPE_TYPE.KC_ST_UINT64, 0), + KCSubTypeElement.FromBasicCtype('system_usec', KCSUBTYPE_TYPE.KC_ST_UINT64, 8), + ), 'cpu_times') + +KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_STACKSHOT_DURATION')] = KCTypeDescription(GetTypeForName('STACKSHOT_KCTYPE_STACKSHOT_DURATION'), + ( + KCSubTypeElement.FromBasicCtype('stackshot_duration', KCSUBTYPE_TYPE.KC_ST_UINT64, 0), + KCSubTypeElement.FromBasicCtype('stackshot_duration_outer', KCSUBTYPE_TYPE.KC_ST_UINT64, 8), + ), 'stackshot_duration', merge=True +) + +KNOWN_TYPES_COLLECTION[GetTypeForName('KCDATA_TYPE_PROCNAME')] = ( + KCSubTypeElement("proc_name", KCSUBTYPE_TYPE.KC_ST_CHAR, KCSubTypeElement.GetSizeForArray(-1, 1), 0, 1)) + +KNOWN_TYPES_COLLECTION[GetTypeForName('KCDATA_TYPE_PID')] = ( + KCSubTypeElement('pid', KCSUBTYPE_TYPE.KC_ST_INT32, 4, 0, 0)) + +KNOWN_TYPES_COLLECTION[GetTypeForName('EXIT_REASON_SNAPSHOT')] = KCTypeDescription(GetTypeForName('EXIT_REASON_SNAPSHOT'), + ( + KCSubTypeElement.FromBasicCtype('ers_namespace', KCSUBTYPE_TYPE.KC_ST_UINT32, 0), + KCSubTypeElement.FromBasicCtype('ers_code', KCSUBTYPE_TYPE.KC_ST_UINT64, 4), + KCSubTypeElement.FromBasicCtype('ers_flags', KCSUBTYPE_TYPE.KC_ST_UINT64, 12), + ), 'exit_reason_basic_info') + +KNOWN_TYPES_COLLECTION[GetTypeForName('EXIT_REASON_USER_DESC')] = ( + KCSubTypeElement("exit_reason_user_description", KCSUBTYPE_TYPE.KC_ST_CHAR, KCSubTypeElement.GetSizeForArray(-1, 1), 0, 1)) + +KNOWN_TYPES_COLLECTION[GetTypeForName('EXIT_REASON_USER_PAYLOAD')] = KCSubTypeElement('exit_reason_user_payload', + KCSUBTYPE_TYPE.KC_ST_UINT8, 
KCSubTypeElement.GetSizeForArray(-1, 1), 0, 1) + +KNOWN_TYPES_COLLECTION[GetTypeForName('EXIT_REASON_CODESIGNING_INFO')] = KCTypeDescription(GetTypeForName('EXIT_REASON_CODESIGNING_INFO'), + ( + KCSubTypeElement.FromBasicCtype('ceri_virt_addr', KCSUBTYPE_TYPE.KC_ST_UINT64, 0), + KCSubTypeElement.FromBasicCtype('ceri_file_offset', KCSUBTYPE_TYPE.KC_ST_UINT64, 8), + KCSubTypeElement("ceri_pathname", KCSUBTYPE_TYPE.KC_ST_CHAR, KCSubTypeElement.GetSizeForArray(1024, 1), 16, 1), + KCSubTypeElement("ceri_filename", KCSUBTYPE_TYPE.KC_ST_CHAR, KCSubTypeElement.GetSizeForArray(1024, 1), 1040, 1), + KCSubTypeElement.FromBasicCtype('ceri_codesig_modtime_secs', KCSUBTYPE_TYPE.KC_ST_UINT64, 2064), + KCSubTypeElement.FromBasicCtype('ceri_codesig_modtime_nsecs', KCSUBTYPE_TYPE.KC_ST_UINT64, 2072), + KCSubTypeElement.FromBasicCtype('ceri_page_modtime_secs', KCSUBTYPE_TYPE.KC_ST_UINT64, 2080), + KCSubTypeElement.FromBasicCtype('ceri_page_modtime_nsecs', KCSUBTYPE_TYPE.KC_ST_UINT64, 2088), + KCSubTypeElement.FromBasicCtype('ceri_path_truncated', KCSUBTYPE_TYPE.KC_ST_UINT8, 2096), + KCSubTypeElement.FromBasicCtype('ceri_object_codesigned', KCSUBTYPE_TYPE.KC_ST_UINT8, 2097), + KCSubTypeElement.FromBasicCtype('ceri_page_codesig_validated', KCSUBTYPE_TYPE.KC_ST_UINT8, 2098), + KCSubTypeElement.FromBasicCtype('ceri_page_codesig_tainted', KCSUBTYPE_TYPE.KC_ST_UINT8, 2099), + KCSubTypeElement.FromBasicCtype('ceri_page_codesig_nx', KCSUBTYPE_TYPE.KC_ST_UINT8, 2100), + KCSubTypeElement.FromBasicCtype('ceri_page_wpmapped', KCSUBTYPE_TYPE.KC_ST_UINT8, 2101), + KCSubTypeElement.FromBasicCtype('ceri_page_slid', KCSUBTYPE_TYPE.KC_ST_UINT8, 2102), + KCSubTypeElement.FromBasicCtype('ceri_page_dirty', KCSUBTYPE_TYPE.KC_ST_UINT8, 2103), + KCSubTypeElement.FromBasicCtype('ceri_page_shadow_depth', KCSUBTYPE_TYPE.KC_ST_UINT32, 2104), + ), 'exit_reason_codesigning_info') + def GetSecondsFromMATime(mat, tb): - return (float(mat) * tb['numerator']) / tb['denominator'] + return (float(mat) * tb['numer']) / 
tb['denom'] def FindLibraryForAddress(liblist, address): current_lib = None @@ -744,20 +1127,38 @@ def GetStateDescription(s): retval.append("TH_IDLE") return retval + +def format_uuid(elementValues): + return ''.join("%02x" % i for i in elementValues) + def SaveStackshotReport(j, outfile_name, dsc_uuid, dsc_libs_arr): import time from operator import itemgetter, attrgetter - ss = j.get('KCDATA_BUFFER_BEGIN_STACKSHOT') + ss = j.get('kcdata_stackshot') if not ss: print "No KCDATA_BUFFER_BEGIN_STACKSHOT object found. Skipping writing report." return timestamp = ss.get('usecs_since_epoch', int(time.time())) timestamp = time.strftime("%Y-%m-%d %H:%M:%S %z",time.gmtime(timestamp)) os_version = ss.get('osversion', 'Unknown') - timebase = ss.get('timebase_info', {"denominator": 1, "numerator": 1}) - dsc_common = [ss.get('shared_cache_dyld_info')['imageUUID'].strip('<>'), - ss.get('shared_cache_dyld_info')['loadAddress'], - "C" + timebase = ss.get('mach_timebase_info', {"denom": 1, "numer": 1}) + if not dsc_uuid and 'imageSlidBaseAddress' not in ss.get('shared_cache_dyld_load_info'): + print "Stackshot format does not include slid shared cache base address and no UUID provided. Skipping writing report." + return + + # If a shared cache UUID is provided, treat the slide as the base address + # for compatibility with existing tools that operate based on this logic + if dsc_uuid: + shared_cache_base_addr = ss.get('shared_cache_dyld_load_info')['imageLoadAddress'] + elif 'imageSlidBaseAddress' in ss.get('shared_cache_dyld_load_info'): + shared_cache_base_addr = ss.get('shared_cache_dyld_load_info')['imageSlidBaseAddress'] + else: + print "No shared cache UUID provided and data doesn't include imageSlidBaseAddress. Skipping writing report." 
+ return + + dsc_common = [format_uuid(ss.get('shared_cache_dyld_load_info')['imageUUID']), + shared_cache_base_addr, + "S" ] dsc_libs = [] @@ -772,6 +1173,9 @@ def SaveStackshotReport(j, outfile_name, dsc_uuid, dsc_libs_arr): _addr = int(i[0], 16) + _load_addr dsc_libs.append([_uuid, _addr, "P"]) #print "adding ", [_uuid, _addr, "C"] + elif dsc_uuid: + print "Provided shared cache UUID does not match. Skipping writing report." + return AllImageCatalog = [] obj = {} @@ -780,25 +1184,36 @@ def SaveStackshotReport(j, outfile_name, dsc_uuid, dsc_libs_arr): obj["reason"] = "kernel panic stackshot" obj["incident"] = "ABCDEFGH-1234-56IJ-789K-0LMNOPQRSTUV" obj["crashReporterKey"] = "12ab34cd45aabbccdd6712ab34cd45aabbccdd67" - obj["bootArgs"] = ss.get('bootargs','') + obj["bootArgs"] = ss.get('boot_args','') obj["frontmostPids"] = [0] obj["exception"] = "0xDEADF157" obj["processByPid"] = {} processByPid = obj["processByPid"] - ssplist = ss.get('STACKSHOT_KCCONTAINER_TASK', {}) + ssplist = ss.get('task_snapshots', {}) kern_load_info = [] if "0" in ssplist: + kc_uuid = ssplist["0"].get('kernelcache_load_info', None) + if kc_uuid: + kernelcache_uuid = [format_uuid(kc_uuid['imageUUID']), kc_uuid['imageLoadAddress'], "U" ] + kern_load_info.append(kernelcache_uuid) + kl_infos = ssplist["0"].get("dyld_load_info", []) for dlinfo in kl_infos: - kern_load_info.append([dlinfo['imageUUID'].strip('<>'), dlinfo['loadAddress'], "K"]) + kern_load_info.append([format_uuid(dlinfo['imageUUID']), dlinfo['imageLoadAddress'], "K"]) for pid,piddata in ssplist.iteritems(): processByPid[str(pid)] = {} tsnap = processByPid[str(pid)] pr_lib_dsc = dsc_common - if 'shared_cache_dyld_info' in tsnap: - pr_lib_dsc = [tsnap.get('shared_cache_dyld_info')['imageUUID'].strip('<>'), - tsnap.get('shared_cache_dyld_info')['loadAddress'], - "C" + if 'shared_cache_dyld_load_info' in tsnap: + if 'imageSlidBaseAddress' in tsnap.get('shared_cache_dyld_load_info'): + shared_cache_base_addr = 
tsnap.get('shared_cache_dyld_load_info')['imageSlidBaseAddress'] + else: + print "Specific task shared cache format does not include slid shared cache base address. Skipping writing report." + return + + pr_lib_dsc = [format_uuid(tsnap.get('shared_cache_dyld_load_info')['imageUUID']), + tsnap.get('shared_cache_dyld_load_info')['imageSlidBaseAddress'], + "S" ] pr_libs = [] @@ -810,45 +1225,45 @@ def SaveStackshotReport(j, outfile_name, dsc_uuid, dsc_libs_arr): pr_libs = [] else: for dlinfo in piddata.get('dyld_load_info',[]): - pr_libs.append([dlinfo['imageUUID'].strip('<>'), dlinfo['loadAddress'], _lib_type]) + pr_libs.append([format_uuid(dlinfo['imageUUID']), dlinfo['imageLoadAddress'], _lib_type]) pr_libs.extend(kern_load_info) pr_libs.extend(dsc_libs) pr_libs.sort(key=itemgetter(1)) - tasksnap = piddata['task_snapshot_v2'] - tsnap["pid"] = tasksnap["pid"] - tsnap["residentMemoryBytes"] = tasksnap["task_size"] - tsnap["timesDidThrottle"] = tasksnap["did_throttle"] - tsnap["systemTimeTask"] = GetSecondsFromMATime(tasksnap["system_time_in_terminated_threads"], timebase) - tsnap["pageIns"] = tasksnap["pageins"] - tsnap["pageFaults"] = tasksnap["faults"] - tsnap["userTimeTask"] = GetSecondsFromMATime(tasksnap["user_time_in_terminated_threads"], timebase) - tsnap["procname"] = tasksnap["p_comm"] - tsnap["copyOnWriteFaults"] = tasksnap["cow_faults"] - tsnap["timesThrottled"] = tasksnap["was_throttled"] + tasksnap = piddata['task_snapshot'] + tsnap["pid"] = tasksnap["ts_pid"] + tsnap["residentMemoryBytes"] = tasksnap["ts_task_size"] + tsnap["timesDidThrottle"] = tasksnap["ts_did_throttle"] + tsnap["systemTimeTask"] = GetSecondsFromMATime(tasksnap["ts_system_time_in_terminated_th"], timebase) + tsnap["pageIns"] = tasksnap["ts_pageins"] + tsnap["pageFaults"] = tasksnap["ts_faults"] + tsnap["userTimeTask"] = GetSecondsFromMATime(tasksnap[ "ts_user_time_in_terminated_thre"], timebase) + tsnap["procname"] = tasksnap["ts_p_comm"] + tsnap["copyOnWriteFaults"] = 
tasksnap["ts_cow_faults"] + tsnap["timesThrottled"] = tasksnap["ts_was_throttled"] tsnap["threadById"] = {} threadByID = tsnap["threadById"] - thlist = piddata.get('STACKSHOT_KCCONTAINER_THREAD', {}) + thlist = piddata.get('thread_snapshots', {}) for tid,thdata in thlist.iteritems(): threadByID[str(tid)] = {} thsnap = threadByID[str(tid)] - if "thread_snapshot_v2" not in thdata: + if "thread_snapshot" not in thdata: print "Found broken thread state for thread ID: %s." % tid break - threadsnap = thdata["thread_snapshot_v2"] - thsnap["userTime"] = GetSecondsFromMATime(threadsnap["user_time"], timebase) - thsnap["id"] = threadsnap["thread_id"] - thsnap["basePriority"] = threadsnap["base_priority"] - thsnap["systemTime"] = threadsnap["sys_time"] - thsnap["schedPriority"] = threadsnap["sched_priority"] - thsnap["state"] = GetStateDescription(threadsnap['state']) - thsnap["qosEffective"] = threadsnap["ts_eqos"] - thsnap["qosRequested"] = threadsnap["ts_rqos"] - - if threadsnap['continuation']: - thsnap["continuation"] = GetSymbolInfoForFrame(AllImageCatalog, pr_libs, threadsnap['continuation']) + threadsnap = thdata["thread_snapshot"] + thsnap["userTime"] = GetSecondsFromMATime(threadsnap["ths_user_time"], timebase) + thsnap["id"] = threadsnap["ths_thread_id"] + thsnap["basePriority"] = threadsnap["ths_base_priority"] + thsnap["systemTime"] = threadsnap["ths_sys_time"] + thsnap["schedPriority"] = threadsnap["ths_sched_priority"] + thsnap["state"] = GetStateDescription(threadsnap['ths_state']) + thsnap["qosEffective"] = threadsnap["ths_eqos"] + thsnap["qosRequested"] = threadsnap["ths_rqos"] + + if threadsnap['ths_continuation']: + thsnap["continuation"] = GetSymbolInfoForFrame(AllImageCatalog, pr_libs, threadsnap['ths_continuation']) if "kernel_stack_frames" in thdata: kuserframes = [] for f in thdata["kernel_stack_frames"]: @@ -860,8 +1275,8 @@ def SaveStackshotReport(j, outfile_name, dsc_uuid, dsc_libs_arr): for f in thdata["user_stack_frames"]: 
uframes.append(GetSymbolInfoForFrame(AllImageCatalog, pr_libs, f['lr'])) thsnap["userFrames"] = uframes - if threadsnap['wait_event']: - thsnap["waitEvent"] = GetSymbolInfoForFrame(AllImageCatalog, pr_libs, threadsnap['wait_event']) + if threadsnap['ths_wait_event']: + thsnap["waitEvent"] = GetSymbolInfoForFrame(AllImageCatalog, pr_libs, threadsnap['ths_wait_event']) obj['binaryImages'] = AllImageCatalog fh = open(outfile_name, "w") @@ -909,13 +1324,13 @@ def ProcessDyldSharedCacheFile(shared_cache_file_path, sdk_str=""): return None uuid = so.splitlines()[0].split(": ")[-1].strip().replace("-","").lower() - + (c, so) = RunCommand("{} -text_info {}".format(dyld_shared_cache_util, shared_cache_file_path)) if c: print "Failed to get text_info from %s" % shared_cache_file_path print so return None - + print "Found %s uuid: %s" % (shared_cache_file_path, uuid) text_info = so @@ -930,48 +1345,97 @@ def ProcessDyldSharedCacheFile(shared_cache_file_path, sdk_str=""): help="Generate a stackshot report file", dest="stackshot_file") +parser.add_argument("--multiple", help="look for multiple stackshots in a single file", action='store_true') + +parser.add_argument("-p", "--plist", required=False, default=False, + help="output as plist", action="store_true") + parser.add_argument("-U", "--uuid", required=False, default="", help="UUID of dyld shared cache to be analysed and filled in libs of stackshot report", dest="uuid") parser.add_argument("-L", "--layout", required=False, type=argparse.FileType("r"), help="Path to layout file for DyldSharedCache. You can generate one by doing \n\tbash$xcrun -sdk dyld_shared_cache_util -text_info ", dest="layout") parser.add_argument("-S", "--sdk", required=False, default="", help="sdk property passed to xcrun command to find the required tools. 
Default is empty string.", dest="sdk") parser.add_argument("-D", "--dyld_shared_cache", required=False, default="", help="Path to dyld_shared_cache built by B&I", dest="dsc") parser.add_argument("kcdata_file", type=argparse.FileType('r'), help="Path to a kcdata binary file.") +class VerboseAction(argparse.Action): + def __call__(self, parser, namespace, values, option_string=None): + logging.basicConfig(level=logging.INFO, stream=sys.stderr, format='%(message)s') +parser.add_argument('-v', "--verbose", action=VerboseAction, nargs=0) + +@contextlib.contextmanager +def data_from_stream(stream): + try: + fmap = mmap.mmap(stream.fileno(), 0, mmap.MAP_SHARED, mmap.PROT_READ) + except: + yield stream.read() + else: + try: + yield fmap + finally: + fmap.close() + +def iterate_kcdatas(kcdata_file): + with data_from_stream(kcdata_file) as data: + iterator = kcdata_item_iterator(data) + kcdata_buffer = KCObject.FromKCItem(iterator.next()) + if not isinstance(kcdata_buffer, KCBufferObject): + try: + decoded = base64.b64decode(data) + except: + pass + else: + iterator = kcdata_item_iterator(decoded) + kcdata_buffer = KCObject.FromKCItem(iterator.next()) + if not isinstance(kcdata_buffer, KCBufferObject): + import gzip + from io import BytesIO + try: + decompressed = gzip.GzipFile(fileobj=BytesIO(data[:])).read() + except: + pass + else: + iterator = kcdata_item_iterator(decompressed) + kcdata_buffer = KCObject.FromKCItem(iterator.next()) + if not isinstance(kcdata_buffer, KCBufferObject): + raise Exception, "unknown file type" + + kcdata_buffer.ReadItems(iterator) + yield kcdata_buffer + + for magic in iterator: + kcdata_buffer = KCObject.FromKCItem(magic) + if not isinstance(kcdata_buffer, KCBufferObject): + raise Exception, "unknown file type" + kcdata_buffer.ReadItems(iterator) + yield kcdata_buffer + if __name__ == '__main__': args = parser.parse_args() + if args.multiple and args.stackshot_file: + raise NotImplementedError + if args.list_known_types: for (n, t) in 
KNOWN_TYPES_COLLECTION.items(): print "%d : %s " % (n, str(t)) sys.exit(1) - file_name = args.kcdata_file.name - master_objs = [] - master_container = None - current_container = None - for i in kcdata_item_iterator(file_name): - #print "processed " + str(i) - o = KCObject.FromKCItem(i) - if o.IsContainerType(): - o = KCContainerObject(i.i_type, i.i_data, i.i_flags) - - if current_container is None: - master_objs.append(o) - current_container = o - master_container = o - else: - current_container.AddObject(o) + for i,kcdata_buffer in enumerate(iterate_kcdatas(args.kcdata_file)): + if i > 0 and not args.multiple: + break - if o.IsContainerType(): - master_objs.append(current_container) - current_container = o + str_data = "{" + kcdata_buffer.GetJsonRepr() + "}" + str_data = str_data.replace("\t", " ") + + try: + json_obj = json.loads(str_data) + except: + print >>sys.stderr, "JSON reparsing failed! Printing string data!\n" + import textwrap + print textwrap.fill(str_data, 100) + raise - if o.IsContainerEnd(): - current_container = master_objs.pop() - str_data = "{" + master_container.GetJsonRepr() + "}" - try: - json_obj = json.loads(str_data) dsc_uuid = None dsc_libs_arr = [] libs_re = re.compile("^\s*(0x[a-fA-F0-9]+)\s->\s(0x[a-fA-F0-9]+)\s+<([a-fA-F0-9\-]+)>\s+.*$", re.MULTILINE) @@ -987,11 +1451,12 @@ def ProcessDyldSharedCacheFile(shared_cache_file_path, sdk_str=""): if args.stackshot_file: SaveStackshotReport(json_obj, args.stackshot_file, dsc_uuid, dsc_libs_arr) + elif args.plist: + import Foundation + plist = Foundation.NSPropertyListSerialization.dataWithPropertyList_format_options_error_( + json_obj, Foundation.NSPropertyListXMLFormat_v1_0, 0, None)[0].bytes().tobytes() + #sigh. on some pythons long integers are getting output with L's in the plist. 
+ plist = re.sub(r'^(\s*\d+)L(\s*)$', r"\1\2", plist, flags=re.MULTILINE) + print plist, else: print json.dumps(json_obj, sort_keys=True, indent=4, separators=(',', ': ')) - - except Exception, e: - raise - print e - print "--------------------------------------------"*3 - print str_data diff --git a/tools/lldbmacros/ktrace.py b/tools/lldbmacros/ktrace.py new file mode 100644 index 000000000..1f07e5d35 --- /dev/null +++ b/tools/lldbmacros/ktrace.py @@ -0,0 +1,112 @@ +from xnu import * +from utils import * + +# From the defines in bsd/sys/kdebug.h: + +KdebugClassNames = { + 1: "MACH", + 2: "NETWORK", + 3: "FSYSTEM", + 4: "BSD", + 5: "IOKIT", + 6: "DRIVERS", + 7: "TRACE", + 8: "DLIL", + 9: "WORKQUEUE", + 10: "CORESTORAGE", + 11: "CG", + 20: "MISC", + 30: "SECURITY", + 31: "DYLD", + 32: "QT", + 33: "APPS", + 34: "LAUNCHD", + 36: "PPT", + 37: "PERF", + 38: "IMPORTANCE", + 39: "PERFCTRL", + 40: "BANK", + 41: "XPC", + 42: "ATM", + 43: "ARIADNE", + 44: "DAEMON", + 45: "ENERGYTRACE", + 49: "IMG", + 50: "CLPC", + 128: "ANS", + 129: "SIO", + 130: "SEP", + 131: "ISP", + 132: "OSCAR", + 133: "EMBEDDEDGFX" +} + +def GetKdebugClassName(class_num): + return (KdebugClassNames[class_num] + ' ({})'.format(class_num) if class_num in KdebugClassNames else 'unknown ({})'.format(class_num)) + +@lldb_type_summary(['typefilter_t']) +@header('{0: <20s}'.format("class") + ' '.join(map('{:02x}'.format, xrange(0, 255, 8)))) +def GetKdebugTypefilter(typefilter): + """ Summarizes the provided typefilter. 
+ """ + classes = 256 + subclasses_per_class = 256 + + # 8 bits at a time + subclasses_per_element = 64 + cur_typefilter = cast(typefilter, 'uint64_t *') + subclasses_fmts = ' '.join(['{:02x}'] * 8) + + elements_per_class = subclasses_per_class / subclasses_per_element + + out_str = '' + for i in xrange(0, classes): + print_class = False + subclasses = [0] * elements_per_class + + # check subclass ranges for set bits, remember those subclasses + for j in xrange(0, elements_per_class): + element = unsigned(cur_typefilter[i * elements_per_class + j]) + if element != 0: + print_class = True + if print_class: + subclasses[j] = element + + # if any of the bits were set in a class, print the entire class + if print_class: + out_str += '{:<20s}'.format(GetKdebugClassName(i)) + for element in subclasses: + # split up the 64-bit values into byte-sized pieces + bytes = [unsigned((element >> i) & 0xff) for i in (0, 8, 16, 24, 32, 40, 48, 56)] + out_str += subclasses_fmts.format(*bytes) + out_str += ' ' + + out_str += '\n' + + return out_str + +@lldb_command('showkdebugtypefilter') +def ShowKdebugTypefilter(cmd_args=None): + """ Show the current kdebug typefilter (or the typefilter at an address) + + usage: showkdebugtypefilter [
] + """ + + if cmd_args: + print GetKdebugTypefilter.header + print '-' * len(GetKdebugTypefilter.header) + + typefilter = kern.GetValueFromAddress(cmd_args[0], 'typefilter_t') + if unsigned(typefilter) == 0: + raise ArgumentError('argument provided is NULL') + + print GetKdebugTypefilter() + return + + typefilter = kern.globals.kdbg_typefilter + if unsigned(typefilter) == 0: + raise ArgumentError('no argument provided and active typefilter is not set') + + print GetKdebugTypefilter.header + print '-' * len(GetKdebugTypefilter.header) + print GetKdebugTypefilter(typefilter) diff --git a/tools/lldbmacros/macho.py b/tools/lldbmacros/macho.py new file mode 100644 index 000000000..cf9a3beec --- /dev/null +++ b/tools/lldbmacros/macho.py @@ -0,0 +1,299 @@ +import sys +import macholib +from macholib import MachO as macho +from collections import namedtuple +import re + +# some fixups in macholib that are required for kext support +macholib.mach_o.MH_KEXT_BUNDLE = 0xB + +macholib.mach_o.MH_FILETYPE_NAMES[macholib.mach_o.MH_KEXT_BUNDLE] = "kext bundle" +macholib.mach_o.MH_FILETYPE_SHORTNAMES[macholib.mach_o.MH_KEXT_BUNDLE] = "kext" + +_old_MachOHeader_load = macho.MachOHeader.load +def new_load(s, fh): + try: + _old_MachOHeader_load(s, fh) + except ValueError as e: + if str(e.message).find('total_size > low_offset') >= 0: + pass + else: + raise + except Exception as e: + raise +macho.MachOHeader.load = new_load + +class MemFile(object): + def __init__(self, memory, size): + self._start = 0 + self._readp = 0 + self._end = size + self._mem = memory + + def tell(self): + return self._readp + + def check_bounds(self, seek_position, operation): + if not (self._start <= seek_position <= self._end): + raise IOError("%s to offset %d failed bounds check [%d, %d]" % ( + operation, seek_position, self._start, self._end)) + + def seek(self, offset, whence=0): + seekto = offset + if whence == 0: + seekto += self._start + elif whence == 1: + seekto += self.tell() + elif whence == 2: + 
seekto += self._end + else: + raise IOError("Invalid whence argument to seek: %r" % (whence,)) + self.check_bounds(seekto, 'seek') + self._readp = seekto + + def write(self, bytes): + raise NotImplementedError('write is not supported') + + def read(self, size=sys.maxsize): + if size < 0: + raise ValueError("Invalid size {} while reading from {}".format(size, self._fileobj)) + here = self.tell() + self.check_bounds(here, 'read') + bytes = min(size, self._end - here) + retval = self._mem[self._readp:self._readp + bytes] + self._readp += bytes + return retval + +MachOSegment = namedtuple('MachOSegment', 'name vmaddr vmsize fileoff filesize') + +class MemMacho(macho.MachO): + + def __init__(self, memdata, size=None): + if size is None: + super(MemMacho,self).__init__(memdata) + return + # + # supports the ObjectGraph protocol + self.graphident = 'mem:%d//'.format(size) + self.filename = 'mem:%d//'.format(size) + + # initialized by load + self.fat = None + self.headers = [] + fp = MemFile(memdata, size) + self.load(fp) + + + def get_segments_with_name(self, filter_re): + """ param: filter_re is a compiled re which will be matched against segment name. + Use: '' to match anything and everything + returns: [ MachOSegment, MachOSegment, ... ] + """ + if type(filter_re) is str: + filter_re = re.compile(filter_re) + retval = [] + for h in self.headers: + for cmd in h.commands: + # cmds is [(load_command, segment, [sections..])] + (lc, segment, sections) = cmd + if isinstance(segment, SEGMENT_TYPES): + segname = segment.segname[:segment.segname.find('\x00')] + if filter_re.match(segname): + retval.append(MachOSegment(segname, segment.vmaddr, segment.vmsize, segment.fileoff, segment.filesize)) + return retval + + def get_sections_with_name(self, filter_re): + """ param: filter_re is a compiled re which will be matched against . + Use: '' to match anything and everything + returns: [ MachOSegment, MachOSegment, ... ] + where each MachOSegment.name is . 
+ """ + if type(filter_re) is str: + filter_re = re.compile(filter_re) + retval = [] + for h in self.headers: + for cmd in h.commands: + # cmds is [(load_command, segment, [sections..])] + (lc, segment, sections) = cmd + if isinstance(segment, SEGMENT_TYPES): + segname = segment.segname[:segment.segname.find('\x00')] + for section in sections: + section_name = section.sectname[:section.sectname.find('\x00')] + full_section_name= "{}.{}".format(segname, section_name) + if filter_re.match(full_section_name): + retval.append(MachOSegment(full_section_name, section.addr, section.size, section.offset, section.size)) + return retval + + + def get_uuid(self): + retval = '' + for h in self.headers: + for cmd in h.commands: + # cmds is [(load_command, segment, [sections..])] + (lc, segment, sections) = cmd + if isinstance(segment, macholib.mach_o.uuid_command): + retval = GetUUIDSummary(segment.uuid) + return retval + +def get_text_segment(segments): + retval = None + for s in segments: + if s.name == '__TEXT_EXEC': + return s + for s in segments: + if s.name == '__TEXT': + return s + return retval + +def get_segment_with_addr(segments, addr): + """ param: segments [MachOSegment, ...] 
+ return: None or MachOSegment where addr is in vmaddr...(vmaddr+vmsize) + """ + for s in segments: + if addr >= s.vmaddr and addr < (s.vmaddr + s.vmsize): + return s + return None + +def GetUUIDSummary(arr): + data = [] + for i in range(16): + data.append(ord(arr[i])) + return "{a[0]:02X}{a[1]:02X}{a[2]:02X}{a[3]:02X}-{a[4]:02X}{a[5]:02X}-{a[6]:02X}{a[7]:02X}-{a[8]:02X}{a[9]:02X}-{a[10]:02X}{a[11]:02X}{a[12]:02X}{a[13]:02X}{a[14]:02X}{a[15]:02X}".format(a=data) + +SEGMENT_TYPES = (macholib.mach_o.segment_command_64, macholib.mach_o.segment_command) + +def get_load_command_human_name(cmd): + """ return string name of LC_LOAD_DYLIB => "load_dylib" + "" if not found + """ + retval = "" + if cmd in macho.LC_REGISTRY: + retval = macho.LC_REGISTRY[cmd].__name__ + retval = retval.replace("_command","") + return retval + +class VisualMachoMap(object): + KB_1 = 1024 + KB_16 = 16 * 1024 + MB_1 = 1 * 1024 * 1024 + GB_1 = 1 * 1024 * 1024 * 1024 + + def __init__(self, name, width=40): + self.name = name + self.width = 40 + self.default_side_padding = 2 + + def get_header_line(self): + return '+' + '-' * (self.width - 2) + '+' + + def get_space_line(self): + return '|' + ' ' * (self.width - 2) + '|' + + def get_dashed_line(self): + return '|' + '-' * (self.width - 2) + '|' + + def get_dotted_line(self): + return '|' + '.' 
* (self.width - 2) + '|' + + def center_text_in_line(self, line, text): + even_length = bool(len(text) % 2 == 0) + if len(text) > len(line) - 2: + raise ValueError("text is larger than line of text") + + lbreak_pos = len(line)/2 - len(text)/2 + if not even_length: + lbreak_pos -= 1 + out = line[:lbreak_pos] + text + return out + line[len(out):] + + def get_separator_lines(self): + return ['/' + ' ' * (self.width - 2) + '/', '/' + ' ' * (self.width - 2) + '/'] + + def printMachoMap(self, mobj): + MapBlock = namedtuple('MapBlock', 'name vmaddr vmsize fileoff filesize extra_info is_segment') + outstr = self.name + '\n' + other_cmds = '' + blocks = [] + for hdr in mobj.headers: + cmd_index = 0 + for cmd in hdr.commands: + # cmds is [(load_command, segment, [sections..])] + (lc, segment, sections) = cmd + lc_cmd_str = get_load_command_human_name(lc.cmd) + lc_str_rep = "\n\t LC: {:s} size:{:d} nsects:{:d}".format(lc_cmd_str, lc.cmdsize, len(sections)) + # print lc_str_rep + if isinstance(segment, SEGMENT_TYPES): + segname = segment.segname[:segment.segname.find('\x00')] + # print "\tsegment: {:s} vmaddr: {:x} vmsize:{:d} fileoff: {:x} filesize: {:d}".format( + # segname, segment.vmaddr, segment.vmsize, segment.fileoff, segment.filesize) + blocks.append(MapBlock(segname, segment.vmaddr, segment.vmsize, segment.fileoff, segment.filesize, + ' LC:{} : {} init:{:#0X} max:{:#0X}'.format(lc_cmd_str, segname, segment.initprot, segment.maxprot), + True)) + for section in sections: + section_name = section.sectname[:section.sectname.find('\x00')] + blocks.append(MapBlock(section_name, section.addr, section.size, section.offset, + section.size, 'al:{} flags:{:#0X}'.format(section.align, section.flags), False)) + #print "\t\tsection:{:s} addr:{:x} off:{:x} size:{:d}".format(section_name, section.addr, section.offset, section.size) + elif isinstance(segment, macholib.mach_o.uuid_command): + other_cmds += "\n\t uuid: {:s}".format(GetUUIDSummary(segment.uuid)) + elif 
isinstance(segment, macholib.mach_o.rpath_command): + other_cmds += "\n\t rpath: {:s}".format(segment.path) + elif isinstance(segment, macholib.mach_o.dylib_command): + other_cmds += "\n\t dylib: {:s} ({:s})".format(str(sections[:sections.find('\x00')]), str(segment.current_version)) + else: + other_cmds += lc_str_rep + cmd_index += 1 + + # fixup the self.width param + for _b in blocks: + if self.default_side_padding + len(_b.name) + 2 > self.width: + self.width = self.default_side_padding + len(_b.name) + 2 + if self.width % 2 != 0: + self.width += 1 + + sorted_blocks = sorted(blocks, key=lambda b: b.vmaddr) + mstr = [self.get_header_line()] + prev_block = MapBlock('', 0, 0, 0, 0, '', False) + for b in sorted_blocks: + # TODO add separator blocks if vmaddr is large from prev_block + if b.is_segment: + s = self.get_dashed_line() + else: + s = self.get_dotted_line() + s = self.center_text_in_line(s, b.name) + line = "{:s} {: <#020X} ({: <10d}) floff:{: <#08x} {}".format(s, b.vmaddr, b.vmsize, b.fileoff, b.extra_info) + if (b.vmaddr - prev_block.vmaddr) > VisualMachoMap.KB_16: + mstr.append(self.get_space_line()) + mstr.append(self.get_space_line()) + + mstr.append(line) + + if b.vmsize > VisualMachoMap.MB_1: + mstr.append(self.get_space_line()) + mstr.extend(self.get_separator_lines()) + mstr.append(self.get_space_line()) + #mstr.append(self.get_space_line()) + prev_block = b + mstr.append(self.get_space_line()) + if prev_block.vmsize > VisualMachoMap.KB_16: + mstr.append(self.get_space_line()) + mstr.append(self.get_header_line()) + print outstr + print "\n".join(mstr) + print "\n\n=============== Other Load Commands ===============" + print other_cmds + + +if __name__ == '__main__': + import sys + if len(sys.argv) < 2: + print "Usage: {} /path/to/macho_binary".format(sys.argv[0]) + sys.exit(1) + with open(sys.argv[-1], 'rb') as fp: + data = fp.read() + mobject = MemMacho(data, len(data)) + + p = VisualMachoMap(sys.argv[-1]) + p.printMachoMap(mobject) + sys.exit(0) 
+ diff --git a/tools/lldbmacros/mbufs.py b/tools/lldbmacros/mbufs.py index c16456a00..57bc22438 100644 --- a/tools/lldbmacros/mbufs.py +++ b/tools/lldbmacros/mbufs.py @@ -328,7 +328,10 @@ def GetMbufWalkAllSlabs(show_a, show_f, show_tr): total = total + 1 if (show_tr != 0): - trn = (mca.mca_next_trn + idx - 1) % unsigned(kern.globals.mca_trn_max) + if (mca.mca_next_trn == 0): + trn = 1 + else: + trn = 0 out_string += "Transaction " + str(int(trn)) + " at " + str(int(mca.mca_trns[int(trn)].mca_tstamp)) + " by thread: 0x" + str(hex(mca.mca_trns[int(trn)].mca_thread)) + ":\n" cnt = 0 while (cnt < mca.mca_trns[int(trn)].mca_depth): @@ -461,42 +464,9 @@ def GetPointerAsString(kgm_pc): pointer_format_string = "0x{0:<8x} " return pointer_format_string.format(kgm_pc) -def GetKmodAddrIntAsString(kgm_pc): - global kgm_pkmod - global kgm_pkmodst - global kgm_pkmoden - - out_string = "" - mh_execute_addr = int(lldb_run_command('p/x (uintptr_t *)&_mh_execute_header').split('=')[-1].strip(), 16) - - out_string += GetPointerAsString(kgm_pc) - if ((unsigned(kgm_pc) >= unsigned(kgm_pkmodst)) and (unsigned(kgm_pc) < unsigned(kgm_pkmoden))): - kgm_off = kgm_pc - kgm_pkmodst - out_string += "<" + str(Cast(kgm_pkmod, 'kmod_info_t *').name) + " + 0x" + str(kgm_off) + ">" - else: - kgm_kmodp = kern.globals.kmod - if ((kern.arch == 'x86_64') and (long(kgm_pc) >= long(mh_execute_addr))): - kgm_kmodp = 0 - - while kgm_kmodp: - kgm_off = unsigned((kgm_pc - kgm_kmodp.address) & 0x00000000ffffffff) - if ((long(kgm_kmodp.address) <= long(kgm_pc)) and (kgm_off) < unsigned(kgm_kmodp.size)): - kgm_pkmod = kgm_kmodp - kgm_pkmodst = unsigned(kgm_kmodp.address) - kgm_pkmoden = unsigned(kgm_pkmodst + kgm_kmodp.size) - kgm_kmodp = 0 - else: - kgm_kmodp = kgm_kmodp.next - return out_string - def GetPc(kgm_pc): - out_string = "" - mh_execute_addr = int(lldb_run_command('p/x (uintptr_t *)&_mh_execute_header').split('=')[-1].strip(), 16) - if (unsigned(kgm_pc) < unsigned(mh_execute_addr) or 
unsigned(kgm_pc) >= unsigned(kern.globals.vm_kernel_top)): - out_string += GetKmodAddrIntAsString(kgm_pc) - else: - out_string += GetSourceInformationForAddress(int(kgm_pc)) - return out_string + "\n" + out_string = GetSourceInformationForAddress(unsigned(kgm_pc)) + "\n" + return out_string # Macro: mbuf_showactive diff --git a/tools/lldbmacros/memory.py b/tools/lldbmacros/memory.py index 16604d864..45a771980 100644 --- a/tools/lldbmacros/memory.py +++ b/tools/lldbmacros/memory.py @@ -8,6 +8,7 @@ from utils import * import xnudefines from process import * +import macho # Macro: memstats @lldb_command('memstats') @@ -16,10 +17,8 @@ def Memstats(cmd_args=None): """ try: print "memorystatus_level: {: >10d}".format(kern.globals.memorystatus_level) - except ValueError: - pass - try: print "memorystatus_available_pages: {: >10d}".format(kern.globals.memorystatus_available_pages) + print "inuse_ptepages_count: {: >10d}".format(kern.globals.inuse_ptepages_count) except ValueError: pass print "vm_page_throttled_count: {: >10d}".format(kern.globals.vm_page_throttled_count) @@ -30,7 +29,7 @@ def Memstats(cmd_args=None): print "vm_page_purgeable_count: {: >10d}".format(kern.globals.vm_page_purgeable_count) print "vm_page_inactive_target: {: >10d}".format(kern.globals.vm_page_inactive_target) print "vm_page_free_target: {: >10d}".format(kern.globals.vm_page_free_target) - print "inuse_ptepages_count: {: >10d}".format(kern.globals.inuse_ptepages_count) + print "vm_page_free_reserved: {: >10d}".format(kern.globals.vm_page_free_reserved) @xnudebug_test('test_memstats') @@ -125,11 +124,118 @@ def ShowMemoryStatus(cmd_args=None): # EndMacro: showmemorystatus +def GetRealMetadata(meta): + """ Get real metadata for a given metadata pointer + """ + try: + if unsigned(meta.zindex) != 255: + return meta + else: + return kern.GetValueFromAddress(unsigned(meta) - unsigned(meta.real_metadata_offset), "struct zone_page_metadata *") + except: + return 0 + +def GetFreeList(meta): + """ Get the 
free list pointer for a given metadata pointer + """ + global kern + zone_map_min_address = kern.GetGlobalVariable('zone_map_min_address') + zone_map_max_address = kern.GetGlobalVariable('zone_map_max_address') + try: + if unsigned(meta.freelist_offset) == unsigned(0xffffffff): + return 0 + else: + if (unsigned(meta) >= unsigned(zone_map_min_address)) and (unsigned(meta) < unsigned(zone_map_max_address)): + page_index = ((unsigned(meta) - unsigned(kern.GetGlobalVariable('zone_metadata_region_min'))) / sizeof('struct zone_page_metadata')) + return (unsigned(zone_map_min_address) + (kern.globals.page_size * (page_index))) + meta.freelist_offset + else: + return (unsigned(meta) + meta.freelist_offset) + except: + return 0 + +@lldb_type_summary(['zone_page_metadata']) +@header("{:<18s} {:<18s} {:>8s} {:>8s} {:<18s} {:<20s}".format('ZONE_METADATA', 'FREELIST', 'PG_CNT', 'FREE_CNT', 'ZONE', 'NAME')) +def GetZoneMetadataSummary(meta): + """ Summarize a zone metadata object + params: meta - obj representing zone metadata in the kernel + returns: str - summary of the zone metadata + """ + out_str = "" + global kern + zinfo = 0 + try: + out_str += 'Metadata Description:\n' + GetZoneMetadataSummary.header + '\n' + meta = kern.GetValueFromAddress(meta, "struct zone_page_metadata *") + if unsigned(meta.zindex) == 255: + out_str += "{:#018x} {:#018x} {:8d} {:8d} {:#018x} {:s}\n".format(meta, 0, 0, 0, 0, '(fake multipage meta)') + meta = GetRealMetadata(meta) + if meta == 0: + return "" + zinfo = kern.globals.zone_array[unsigned(meta.zindex)] + out_str += "{:#018x} {:#018x} {:8d} {:8d} {:#018x} {:s}".format(meta, GetFreeList(meta), meta.page_count, meta.free_count, addressof(zinfo), zinfo.zone_name) + return out_str + except: + out_str = "" + return out_str + +@header("{:<18s} {:>18s} {:>18s} {:<18s}".format('ADDRESS', 'TYPE', 'OFFSET_IN_PG', 'METADATA')) +def WhatIs(addr): + """ Information about kernel pointer + """ + out_str = "" + global kern + pagesize = 
kern.globals.page_size + zone_map_min_address = kern.GetGlobalVariable('zone_map_min_address') + zone_map_max_address = kern.GetGlobalVariable('zone_map_max_address') + if (unsigned(addr) >= unsigned(zone_map_min_address)) and (unsigned(addr) < unsigned(zone_map_max_address)): + zone_metadata_region_min = kern.GetGlobalVariable('zone_metadata_region_min') + zone_metadata_region_max = kern.GetGlobalVariable('zone_metadata_region_max') + if (unsigned(addr) >= unsigned(zone_metadata_region_min)) and (unsigned(addr) < unsigned(zone_metadata_region_max)): + metadata_offset = (unsigned(addr) - unsigned(zone_metadata_region_min)) % sizeof('struct zone_page_metadata') + page_offset_str = "{:d}/{:d}".format((unsigned(addr) - (unsigned(addr) & ~(pagesize - 1))), pagesize) + out_str += WhatIs.header + '\n' + out_str += "{:#018x} {:>18s} {:>18s} {:#018x}\n\n".format(unsigned(addr), "Metadata", page_offset_str, unsigned(addr) - metadata_offset) + out_str += GetZoneMetadataSummary((unsigned(addr) - metadata_offset)) + '\n\n' + else: + page_index = ((unsigned(addr) & ~(pagesize - 1)) - unsigned(zone_map_min_address)) / pagesize + meta = unsigned(zone_metadata_region_min) + (page_index * sizeof('struct zone_page_metadata')) + meta = kern.GetValueFromAddress(meta, "struct zone_page_metadata *") + page_meta = GetRealMetadata(meta) + if page_meta != 0: + zinfo = kern.globals.zone_array[unsigned(page_meta.zindex)] + page_offset_str = "{:d}/{:d}".format((unsigned(addr) - (unsigned(addr) & ~(pagesize - 1))), pagesize) + out_str += WhatIs.header + '\n' + out_str += "{:#018x} {:>18s} {:>18s} {:#018x}\n\n".format(unsigned(addr), "Element", page_offset_str, page_meta) + out_str += GetZoneMetadataSummary(unsigned(page_meta)) + '\n\n' + else: + out_str += "Unmapped address within the zone_map ({:#018x}-{:#018x})".format(zone_map_min_address, zone_map_max_address) + else: + out_str += "Address {:#018x} is outside the zone_map ({:#018x}-{:#018x})\n".format(addr, zone_map_min_address, 
zone_map_max_address) + print out_str + return + +@lldb_command('whatis') +def WhatIsHelper(cmd_args=None): + """ Routine to show information about a kernel pointer + Usage: whatis
+ """ + if not cmd_args: + raise ArgumentError("No arguments passed") + addr = kern.GetValueFromAddress(cmd_args[0], 'void *') + WhatIs(addr) + print "Hexdump:\n" + try: + data_array = kern.GetValueFromAddress(unsigned(addr) - 16, "uint8_t *") + print_hex_data(data_array[0:48], unsigned(addr) - 16, "") + except: + pass + return + # Macro: zprint @lldb_type_summary(['zone','zone_t']) -@header("{:^18s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s}({:>6s} {:>6s} {:>6s}) {:^15s} {:<20s}".format( -'ZONE', 'TOT_SZ', 'PAGE_COUNT', 'ALLOC_ELTS', 'FREE_ELTS', 'FREE_SZ', 'ELT_SZ', 'ALLOC', 'ELTS', 'PGS', 'WASTE', 'FLAGS', 'NAME')) +@header("{:^18s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s}({:>6s} {:>6s} {:>6s}) {:^15s} {:<20s}".format( +'ZONE', 'TOT_SZ', 'PAGE_COUNT', 'ALLOC_ELTS', 'FREE_ELTS', 'FREE_SZ', 'ALL_FREE_PGS', 'ELT_SZ', 'ALLOC', 'ELTS', 'PGS', 'WASTE', 'FLAGS', 'NAME')) def GetZoneSummary(zone): """ Summarize a zone with important information. See help zprint for description of each field params: @@ -145,16 +251,8 @@ def GetZoneSummary(zone): free_size = free_elements * zone.elem_size alloc_pages = zone.alloc_size / pagesize - if zone.use_page_list : - metadata_size = sizeof('struct zone_page_metadata') - metadata_offset = metadata_size - if ((metadata_size % zone.elem_size) != 0) : - metadata_offset += zone.elem_size - (metadata_size % zone.elem_size) - alloc_count = ((pagesize - metadata_offset) / zone.elem_size) * alloc_pages - alloc_waste = metadata_offset * alloc_pages - else : - alloc_count = zone.alloc_size / zone.elem_size - alloc_waste = zone.alloc_size % zone.elem_size + alloc_count = zone.alloc_size / zone.elem_size + alloc_waste = zone.alloc_size % zone.elem_size marks = [ ["collectable", "C"], @@ -168,9 +266,7 @@ def GetZoneSummary(zone): ["zleak_on", "L"], ["doing_alloc_without_vm_priv", "A"], ["doing_alloc_with_vm_priv", "S"], - ["waiting", "W"], - ["doing_gc", "G"], - ["use_page_list", "P"] + ["waiting", "W"] ] if 
kern.arch == 'x86_64': marks.append(["gzalloc_exempt", "M"]) @@ -183,7 +279,7 @@ def GetZoneSummary(zone): else: markings+=" " out_string += format_string.format(zone, zone.cur_size, zone.page_count, - zone.count, free_elements, free_size, + zone.count, free_elements, free_size, zone.count_all_free_pages, zone.elem_size, zone.alloc_size, alloc_count, alloc_pages, alloc_waste, name = zone.zone_name, markings=markings) @@ -212,7 +308,6 @@ def Zprint(cmd_args=None): W - another thread is waiting for more memory L - zone is being monitored by zleaks G - currently running GC - P - uses zone_page_metadata """ global kern print GetZoneSummary.header @@ -275,6 +370,8 @@ def ShowZfreeListChain(zone, zfirst, zlimit): while ShowZfreeList.elts_found < zlimit: ShowZfreeList.elts_found += 1 znext = dereference(Cast(current, 'vm_offset_t *')) + znext = (unsigned(znext) ^ unsigned(kern.globals.zp_nopoison_cookie)) + znext = kern.GetValueFromAddress(znext, 'vm_offset_t *') backup_ptr = kern.GetValueFromAddress((unsigned(Cast(current, 'vm_offset_t')) + unsigned(zone.elem_size) - sizeof('vm_offset_t')), 'vm_offset_t *') backup_val = dereference(backup_ptr) n_unobfuscated = (unsigned(backup_val) ^ unsigned(kern.globals.zp_nopoison_cookie)) @@ -313,28 +410,23 @@ def ShowZfreeList(cmd_args=None): zlimit = ArgumentStringToInt(cmd_args[1]) ShowZfreeListHeader(zone) - if unsigned(zone.use_page_list) == 1: - if unsigned(zone.allows_foreign) == 1: - for free_page_meta in IterateQueue(zone.pages.any_free_foreign, 'struct zone_page_metadata *', 'pages'): - if ShowZfreeList.elts_found == zlimit: - break - zfirst = Cast(free_page_meta.elements, 'void *') - if unsigned(zfirst) != 0: - ShowZfreeListChain(zone, zfirst, zlimit) - for free_page_meta in IterateQueue(zone.pages.intermediate, 'struct zone_page_metadata *', 'pages'): + if unsigned(zone.allows_foreign) == 1: + for free_page_meta in IterateQueue(zone.pages.any_free_foreign, 'struct zone_page_metadata *', 'pages'): if 
ShowZfreeList.elts_found == zlimit: break - zfirst = Cast(free_page_meta.elements, 'void *') + zfirst = kern.GetValueFromAddress(GetFreeList(free_page_meta), 'void *') if unsigned(zfirst) != 0: ShowZfreeListChain(zone, zfirst, zlimit) - for free_page_meta in IterateQueue(zone.pages.all_free, 'struct zone_page_metadata *', 'pages'): - if ShowZfreeList.elts_found == zlimit: - break - zfirst = Cast(free_page_meta.elements, 'void *') - if unsigned(zfirst) != 0: - ShowZfreeListChain(zone, zfirst, zlimit) - else: - zfirst = Cast(zone.free_elements, 'void *') + for free_page_meta in IterateQueue(zone.pages.intermediate, 'struct zone_page_metadata *', 'pages'): + if ShowZfreeList.elts_found == zlimit: + break + zfirst = kern.GetValueFromAddress(GetFreeList(free_page_meta), 'void *') + if unsigned(zfirst) != 0: + ShowZfreeListChain(zone, zfirst, zlimit) + for free_page_meta in IterateQueue(zone.pages.all_free, 'struct zone_page_metadata *', 'pages'): + if ShowZfreeList.elts_found == zlimit: + break + zfirst = kern.GetValueFromAddress(GetFreeList(free_page_meta), 'void *') if unsigned(zfirst) != 0: ShowZfreeListChain(zone, zfirst, zlimit) @@ -345,20 +437,28 @@ def ShowZfreeList(cmd_args=None): # EndMacro: showzfreelist +# Macro: zstack_showzonesbeinglogged + +@lldb_command('zstack_showzonesbeinglogged') +def ZstackShowZonesBeingLogged(cmd_args=None): + """ + """ + global kern + for zval in kern.zones: + if zval.zlog_btlog: + print "Zone: %s with its BTLog at: 0x%lx" % (zval.zone_name, zval.zlog_btlog) + +# EndMacro: zstack_showzonesbeinglogged + # Macro: zstack @lldb_command('zstack') def Zstack(cmd_args=None): - """ Zone leak debugging: Print the stack trace of log element at . If a is supplied, it prints log elements starting at . - Usage: zstack [] + """ Zone leak debugging: Print the stack trace logged at in the stacks list. If a is supplied, it prints stacks starting at . 
+ Usage: zstack [] - The suggested usage is to look at indexes below zcurrent and look for common stack traces. - The stack trace that occurs the most is probably the cause of the leak. Find the pc of the - function calling into zalloc and use the countpcs command to find out how often that pc occurs in the log. - The pc occuring in a high percentage of records is most likely the source of the leak. - - The findoldest command is also useful for leak debugging since it identifies the oldest record - in the log, which may indicate the leaker. + The suggested usage is to look at stacks with high percentage of refs (maybe > 25%). + The stack trace that occurs the most is probably the cause of the leak. Use zstack_findleak for that. """ if not cmd_args: print Zstack.__doc__ @@ -366,97 +466,125 @@ def Zstack(cmd_args=None): if int(kern.globals.log_records) == 0: print "Zone logging not enabled. Add 'zlog=' to boot-args." return - if int(kern.globals.zlog_btlog) == 0: - print "Zone logging enabled, but zone has not been initialized yet." 
- return + btlog_ptr = kern.GetValueFromAddress(cmd_args[0], 'btlog_t *') + btrecords_total_size = unsigned(btlog_ptr.btlog_buffersize) + btrecord_size = unsigned(btlog_ptr.btrecord_size) + btrecords = unsigned(btlog_ptr.btrecords) + btlog_size = unsigned(sizeof('struct btlog')) + depth = unsigned(btlog_ptr.btrecord_btdepth) + zstack_index = ArgumentStringToInt(cmd_args[1]) count = 1 - if len(cmd_args) >= 2: - count = ArgumentStringToInt(cmd_args[1]) - zstack_index = unsigned(cmd_args[0]) + if len(cmd_args) >= 3: + count = ArgumentStringToInt(cmd_args[2]) + + max_count = ((btrecords_total_size - btlog_size)/btrecord_size) + + if (zstack_index + count) > max_count: + count = max_count - zstack_index + while count and (zstack_index != 0xffffff): - zstack_record_offset = zstack_index * unsigned(kern.globals.zlog_btlog.btrecord_size) - zstack_record = kern.GetValueFromAddress(unsigned(kern.globals.zlog_btlog.btrecords) + zstack_record_offset, 'btlog_record_t *') - ShowZStackRecord(zstack_record, zstack_index) - zstack_index = zstack_record.next + zstack_record_offset = zstack_index * btrecord_size + zstack_record = kern.GetValueFromAddress(btrecords + zstack_record_offset, 'btlog_record_t *') + if int(zstack_record.ref_count)!=0: + ShowZStackRecord(zstack_record, zstack_index, depth, unsigned(btlog_ptr.active_element_count)) + zstack_index += 1 count -= 1 # EndMacro : zstack -# Macro: findoldest +# Macro: zstack_inorder -@lldb_command('findoldest') -def FindOldest(cmd_args=None): - """ Zone leak debugging: find and print the oldest record in the log. - - Once it prints a stack trace, find the pc of the caller above all the zalloc, kalloc and - IOKit layers. Then use the countpcs command to see how often this caller has allocated - memory. A caller with a high percentage of records in the log is probably the leaker. +@lldb_command('zstack_inorder') +def ZstackInOrder(cmd_args=None): + """ Zone leak debugging: Print the stack traces starting from head to the tail. 
+ Usage: zstack_inorder """ - if int(kern.globals.log_records) == 0: - print FindOldest.__doc__ + if not cmd_args: + print "Zone leak debugging: Print the stack traces starting from head to the tail. \nUsage: zstack_inorder " return - if int(kern.globals.zlog_btlog) == 0: - print "Zone logging enabled, but zone has not been initialized yet." + if int(kern.globals.log_records) == 0: + print "Zone logging not enabled. Add 'zlog=' to boot-args." return - index = kern.globals.zlog_btlog.head - if unsigned(index) != 0xffffff: - print "Oldest record is at log index: {0: +@lldb_command('zstack_findleak') +def zstack_findleak(cmd_args=None): + """ Zone leak debugging: search the log and print the stack with the most active references in the stack trace. - Usage: countpcs + Usage: zstack_findleak - This is useful for verifying a suspected as being the source of - the leak. If a high percentage of the log entries contain the given , then it's most - likely the source of the leak. Note that this command can take several minutes to run. + This is useful for verifying a suspected stack as being the source of + the leak. """ - if not cmd_args: - print Countpcs.__doc__ - return - if int(kern.globals.log_records) == 0: - print "Zone logging not enabled. Add 'zlog=' to boot-args." - return - if int(kern.globals.zlog_btlog) == 0: - print "Zone logging enabled, but zone has not been initialized yet." 
- return - - cpcs_index = unsigned(kern.globals.zlog_btlog.head) - target_pc = unsigned(kern.GetValueFromAddress(cmd_args[0], 'void *')) - found = 0 - depth = unsigned(kern.globals.zlog_btlog.btrecord_btdepth) + btlog_ptr = kern.GetValueFromAddress(cmd_args[0], 'btlog_t *') + btrecord_size = unsigned(btlog_ptr.btrecord_size) + btrecords = unsigned(btlog_ptr.btrecords) + + cpcs_index = unsigned(btlog_ptr.head) + depth = unsigned(btlog_ptr.btrecord_btdepth) + highref = 0 + highref_index = 0 + highref_record = 0 while cpcs_index != 0xffffff: - cpcs_record_offset = cpcs_index * unsigned(kern.globals.zlog_btlog.btrecord_size) - cpcs_record = kern.GetValueFromAddress(unsigned(kern.globals.zlog_btlog.btrecords) + cpcs_record_offset, 'btlog_record_t *') - frame = 0 - while frame < depth: - frame_pc = unsigned(cpcs_record.bt[frame]) - if frame_pc == target_pc: - found += 1 - break - frame += 1 + cpcs_record_offset = cpcs_index * btrecord_size + cpcs_record = kern.GetValueFromAddress(btrecords + cpcs_record_offset, 'btlog_record_t *') + if cpcs_record.ref_count > highref: + highref_record = cpcs_record + highref = cpcs_record.ref_count + highref_index = cpcs_index cpcs_index = cpcs_record.next - print "Occured {0: + Usage: zstack_findelem When the kernel panics due to a corrupted zone element, get the element address and use this command. This will show you the stack traces of all logged zalloc and @@ -464,87 +592,59 @@ def FindElem(cmd_args=None): double-frees readily apparent. """ if not cmd_args: - print FindElem.__doc__ + print ZStackFindElem.__doc__ return - if int(kern.globals.log_records) == 0: - print "Zone logging not enabled. Add 'zlog=' to boot-args." - return - if int(kern.globals.zlog_btlog) == 0: - print "Zone logging enabled, but zone has not been initialized yet." + if int(kern.globals.log_records) == 0 or unsigned(kern.globals.corruption_debug_flag) == 0: + print "Zone logging with corruption detection not enabled. Add '-zc zlog=' to boot-args." 
return - target_element = unsigned(kern.GetValueFromAddress(cmd_args[0], 'void *')) - index = unsigned(kern.globals.zlog_btlog.head) - prev_op = -1 + btlog_ptr = kern.GetValueFromAddress(cmd_args[0], 'btlog_t *') + target_element = unsigned(kern.GetValueFromAddress(cmd_args[1], 'void *')) + + btrecord_size = unsigned(btlog_ptr.btrecord_size) + btrecords = unsigned(btlog_ptr.btrecords) + depth = unsigned(btlog_ptr.btrecord_btdepth) - while index != 0xffffff: - findelem_record_offset = index * unsigned(kern.globals.zlog_btlog.btrecord_size) - findelem_record = kern.GetValueFromAddress(unsigned(kern.globals.zlog_btlog.btrecords) + findelem_record_offset, 'btlog_record_t *') - if unsigned(findelem_record.element) == target_element: - Zstack([index]) - if int(findelem_record.operation) == prev_op: + prev_op = -1 + scan_items = 0 + hashelem = cast(btlog_ptr.elem_linkage_un.element_hash_queue.tqh_first, 'btlog_element_t *') + if (target_element >> 32) != 0: + target_element = target_element ^ 0xFFFFFFFFFFFFFFFF + else: + target_element = target_element ^ 0xFFFFFFFF + while hashelem != 0: + if unsigned(hashelem.elem) == target_element: + recindex = hashelem.recindex + recoffset = recindex * btrecord_size + record = kern.GetValueFromAddress(btrecords + recoffset, 'btlog_record_t *') + out_str = ('-' * 8) + if record.operation == 1: + out_str += "OP: ALLOC. " + else: + out_str += "OP: FREE. " + out_str += "Stack Index {0: - Usage: btlog_find -A - Note: Backtraces will be in chronological order, with oldest entries aged out in FIFO order as needed. """ - if not cmd_args: - raise ArgumentError("Need a btlog_t parameter") - btlog = kern.GetValueFromAddress(cmd_args[0], 'btlog_t *') - printall = False - summarize = False - summary_cache = {} - target_elem = 0xffffffff - - if "-A" in cmd_options: - printall = True - else: - if not printall and len(cmd_args) < 2: - raise ArgumentError(" is missing in args. 
Need a search pointer.") - target_elem = unsigned(kern.GetValueFromAddress(cmd_args[1], 'void *')) - - if "-S" in cmd_options: - summarize = True - - index = unsigned(btlog.head) - progress = 0 - record_size = unsigned(btlog.btrecord_size) - try: - while index != 0xffffff: - record_offset = index * record_size - record = kern.GetValueFromAddress(unsigned(btlog.btrecords) + record_offset, 'btlog_record_t *') - if printall or unsigned(record.element) == target_elem: - _s = '{0: 3s} {4: >5s} {5: >20s} {6: <30s}".format('kmod_info', 'address', 'size', 'id', 'refs', 'version', 'name')) +@header("{0: <20s} {1: <20s} {2: <20s} {3: >3s} {4: >5s} {5: <20s} {6: <20s} {7: >20s} {8: <30s}".format('kmod_info', 'address', 'size', 'id', 'refs', 'TEXT exec', 'size', 'version', 'name')) def GetKextSummary(kmod): """ returns a string representation of kext information """ out_string = "" - format_string = "{0: <#020x} {1: <#020x} {2: <#020x} {3: >3d} {4: >5d} {5: >20s} {6: <30s}" - out_string += format_string.format(kmod, kmod.address, kmod.size, kmod.id, kmod.reference_count, kmod.version, kmod.name) + format_string = "{0: <#020x} {1: <#020x} {2: <#020x} {3: >3d} {4: >5d} {5: <#020x} {6: <#020x} {7: >20s} {8: <30s}" + segments, sections = GetAllSegmentsAndSectionsFromDataInMemory(unsigned(kmod.address), unsigned(kmod.size)) + text_segment = macho.get_text_segment(segments) + if not text_segment: + text_segment = segments[0] + out_string += format_string.format(kmod, kmod.address, kmod.size, kmod.id, kmod.reference_count, text_segment.vmaddr, text_segment.vmsize, kmod.version, kmod.name) return out_string @lldb_type_summary(['uuid_t']) @@ -1134,25 +1239,96 @@ def ShowAllKexts(cmd_args=None): """Display a summary listing of all loaded kexts (alias: showallkmods) """ kmod_val = kern.globals.kmod + kextuuidinfo = GetKextLoadInformation(show_progress=(config['verbosity'] > vHUMAN)) print "{: <36s} ".format("UUID") + GetKextSummary.header - kextuuidinfo = GetKextLoadInformation() for 
kval in IterateLinkedList(kmod_val, 'next'): uuid = "........-....-....-....-............" kaddr = unsigned(kval.address) + found_kext_summary = None for l in kextuuidinfo : - if kaddr == int(l[1],16): + if kaddr == int(l[3],16): uuid = l[0] + found_kext_summary = l break - print uuid + " " + GetKextSummary(kval) + if found_kext_summary: + _ksummary = GetKextSummary(found_kext_summary[7]) + else: + _ksummary = GetKextSummary(kval) + print uuid + " " + _ksummary -def GetKextLoadInformation(addr=0): +def GetKmodWithAddr(addr): + """ Go through kmod list and find one with begin_addr as addr + returns: None if not found. else a cvalue of type kmod + """ + kmod_val = kern.globals.kmod + for kval in IterateLinkedList(kmod_val, 'next'): + if addr == unsigned(kval.address): + return kval + return None + +def GetAllSegmentsAndSectionsFromDataInMemory(address, size): + """ reads memory at address and parses mach_header to get segment and section information + returns: Tuple of (segments_list, sections_list) like ([MachOSegment,...], [MachOSegment, ...]) + where MachOSegment has fields like 'name vmaddr vmsize fileoff filesize' + if TEXT segment is not found a dummy segment & section with address, size is returned. + """ + cache_hash = "kern.kexts.segments.{}.{}".format(address, size) + cached_result = caching.GetDynamicCacheData(cache_hash,()) + if cached_result: + return cached_result + + defval = macho.MachOSegment('__TEXT', address, size, 0, size) + if address == 0 or size == 0: + return ([defval], [defval]) + + # if int(kern.globals.gLoadedKextSummaries.version) <= 2: + # until we have separate version. 
we will pay penalty only on arm64 devices + if kern.arch not in ('arm64',): + return ([defval], [defval]) + + restrict_size_to_read = 1536 + machoObject = None + while machoObject is None: + err = lldb.SBError() + size_to_read = min(size, restrict_size_to_read) + data = LazyTarget.GetProcess().ReadMemory(address, size_to_read, err) + if not err.Success(): + print "Failed to read memory at {} and size {}".format(address, size_to_read) + return ([defval], [defval]) + try: + m = macho.MemMacho(data, len(data)) + machoObject = m + except Exception as e: + if str(e.message).find('unpack requires a string argument') >= 0: + # this may be due to short read of memory. Lets do double read size. + restrict_size_to_read *= 2 + debuglog("Bumping mach header read size to {}".format(restrict_size_to_read)) + continue + else: + print "Failed to read MachO for address {} errormessage: {}".format(address, e.message) + return ([defval], [defval]) + # end of while loop. We have machoObject defined + segments = machoObject.get_segments_with_name('') + sections = machoObject.get_sections_with_name('') + rval = (segments, sections) + caching.SaveDynamicCacheData(cache_hash, rval) + return rval + +def GetKextLoadInformation(addr=0, show_progress=False): """ Extract the kext uuid and load address information from the kernel data structure. params: addr - int - optional integer that is the address to search for. 
- returns: - [] - array with each entry of format ( 'UUID', 'Hex Load Address') + returns: + [] - array with each entry of format + ( 'UUID', 'Hex Load Address of __TEXT or __TEXT_EXEC section', 'name', + 'addr of macho header', [macho.MachOSegment,..], [MachoSection,...], kext, kmod_obj) """ - # because of , we can't find summaries directly + cached_result = caching.GetDynamicCacheData("kern.kexts.loadinformation", []) + # if specific addr is provided then ignore caching + if cached_result and not addr: + return cached_result + + # because of , we can't find summaries directly #addr = hex(addressof(kern.globals.gLoadedKextSummaries.summaries)) baseaddr = unsigned(kern.globals.gLoadedKextSummaries) + 0x10 summaries_begin = kern.GetValueFromAddress(baseaddr, 'OSKextLoadedKextSummary *') @@ -1163,14 +1339,23 @@ def GetKextLoadInformation(addr=0): entry_size = int(kern.globals.gLoadedKextSummaries.entry_size) retval = [] for i in range(total_summaries): + if show_progress: + print "progress: {}/{}".format(i, total_summaries) tmpaddress = unsigned(summaries_begin) + (i * entry_size) current_kext = kern.GetValueFromAddress(tmpaddress, 'OSKextLoadedKextSummary *') + # code to extract macho information + segments, sections = GetAllSegmentsAndSectionsFromDataInMemory(unsigned(current_kext.address), unsigned(current_kext.size)) + seginfo = macho.get_text_segment(segments) + if not seginfo: + seginfo = segments[0] + kmod_obj = GetKmodWithAddr(unsigned(current_kext.address)) if addr != 0 : - if addr == unsigned(current_kext.address): - retval.append((GetUUIDSummary(current_kext.uuid) , hex(current_kext.address), str(current_kext.name) )) - else: - retval.append((GetUUIDSummary(current_kext.uuid) , hex(current_kext.address), str(current_kext.name) )) - + if addr == unsigned(current_kext.address) or addr == seginfo.vmaddr: + return [(GetUUIDSummary(current_kext.uuid) , hex(seginfo.vmaddr).rstrip('L'), str(current_kext.name), hex(current_kext.address), segments, seginfo, 
current_kext, kmod_obj)] + retval.append((GetUUIDSummary(current_kext.uuid) , hex(seginfo.vmaddr).rstrip('L'), str(current_kext.name), hex(current_kext.address), segments, seginfo, current_kext, kmod_obj)) + + if not addr: + caching.SaveDynamicCacheData("kern.kexts.loadinformation", retval) return retval lldb_alias('showallkexts', 'showallkmods') @@ -1178,7 +1363,7 @@ def GetKextLoadInformation(addr=0): def GetOSKextVersion(version_num): """ returns a string of format 1.2.3x from the version_num params: version_num - int - return: str + return: str """ if version_num == -1 : return "invalid" @@ -1219,7 +1404,7 @@ def ShowAllKnownKexts(cmd_args=None): print "%d kexts in sKextsByID:" % kext_count print "{0: <20s} {1: <20s} {2: >5s} {3: >20s} {4: <30s}".format('OSKEXT *', 'load_addr', 'id', 'version', 'name') format_string = "{0: <#020x} {1: <20s} {2: >5s} {3: >20s} {4: <30s}" - + while index < kext_count: kext_dict = GetObjectAtIndexFromArray(kext_dictionary, index) kext_name = str(kext_dict.key.string) @@ -1234,38 +1419,46 @@ def ShowAllKnownKexts(cmd_args=None): version = GetOSKextVersion(version_num) print format_string.format(osk, load_addr, id, version, kext_name) index += 1 - + return @lldb_command('showkmodaddr') def ShowKmodAddr(cmd_args=[]): - """ Given an address, print the offset and name for the kmod containing it + """ Given an address, print the offset and name for the kmod containing it Syntax: (lldb) showkmodaddr """ if len(cmd_args) < 1: raise ArgumentError("Insufficient arguments") addr = ArgumentStringToInt(cmd_args[0]) - kmod_val = kern.globals.kmod - for kval in IterateLinkedList(kmod_val, 'next'): - if addr >= unsigned(kval.address) and addr <= (unsigned(kval.address) + unsigned(kval.size)): - print GetKextSummary.header - print GetKextSummary(kval) + " offset = {0: #0x}".format((addr - unsigned(kval.address))) - return True + all_kexts_info = GetKextLoadInformation() + found_kinfo = None + found_segment = None + for kinfo in all_kexts_info: + s 
= macho.get_segment_with_addr(kinfo[4], addr) + if s: + found_segment = s + found_kinfo = kinfo + break + if found_kinfo: + print GetKextSummary.header + print GetKextSummary(found_kinfo[7]) + " segment: {} offset = {:#0x}".format(found_segment.name, (addr - found_segment.vmaddr)) + return True return False + @lldb_command('addkext','AF:N:') def AddKextSyms(cmd_args=[], cmd_options={}): """ Add kext symbols into lldb. This command finds symbols for a uuid and load the required executable - Usage: + Usage: addkext : Load one kext based on uuid. eg. (lldb)addkext 4DD2344C0-4A81-3EAB-BDCF-FEAFED9EB73E addkext -F : Load kext executable at specified load address addkext -N : Load one kext that matches the name provided. eg. (lldb) addkext -N corecrypto addkext -N -A: Load all kext that matches the name provided. eg. to load all kext with Apple in name do (lldb) addkext -N Apple -A - addkext all : Will load all the kext symbols - SLOW + addkext all : Will load all the kext symbols - SLOW """ - + if "-F" in cmd_options: exec_path = cmd_options["-F"] @@ -1277,8 +1470,6 @@ def AddKextSyms(cmd_args=[], cmd_options={}): raise ArgumentError("Path is {:s} not a filepath. \nPlease check that path points to executable.\ \nFor ex. 
path/to/Symbols/IOUSBFamily.kext/Contents/PlugIns/AppleUSBHub.kext/Contents/MacOS/AppleUSBHub.\ \nNote: LLDB does not support adding kext based on directory paths like gdb used to.".format(exec_path)) - if not os.access(exec_full_path, os.X_OK): - raise ArgumentError("Path is {:s} not an executable file".format(exec_path)) slide_value = None if cmd_args: @@ -1299,17 +1490,16 @@ def AddKextSyms(cmd_args=[], cmd_options={}): if k[0].lower() == uuid_str.lower(): slide_value = k[1] debuglog("found the slide %s for uuid %s" % (k[1], k[0])) - if slide_value is None: raise ArgumentError("Unable to find load address for module described at %s " % exec_full_path) load_cmd = "target modules load --file %s --slide %s" % (exec_full_path, str(slide_value)) print load_cmd - print lldb_run_command(load_cmd) + print lldb_run_command(load_cmd) kern.symbolicator = None return True all_kexts_info = GetKextLoadInformation() - + if "-N" in cmd_options: kext_name = cmd_options["-N"] kext_name_matches = GetLongestMatchOption(kext_name, [str(x[2]) for x in all_kexts_info], True) @@ -1328,7 +1518,7 @@ def AddKextSyms(cmd_args=[], cmd_options={}): if info and 'DBGSymbolRichExecutable' in info: print "Adding dSYM ({0:s}) for {1:s}".format(cur_uuid, info['DBGSymbolRichExecutable']) addDSYM(cur_uuid, info) - loadDSYM(cur_uuid, int(x[1],16)) + loadDSYM(cur_uuid, int(x[1],16), x[4]) else: print "Failed to get symbol info for {:s}".format(cur_uuid) break @@ -1343,7 +1533,7 @@ def AddKextSyms(cmd_args=[], cmd_options={}): load_all_kexts = False if uuid == "all": load_all_kexts = True - + if not load_all_kexts and len(uuid_regex.findall(uuid)) == 0: raise ArgumentError("Unknown argument {:s}".format(uuid)) @@ -1355,14 +1545,14 @@ def AddKextSyms(cmd_args=[], cmd_options={}): if info and 'DBGSymbolRichExecutable' in info: print "Adding dSYM (%s) for %s" % (cur_uuid, info['DBGSymbolRichExecutable']) addDSYM(cur_uuid, info) - loadDSYM(cur_uuid, int(k_info[1],16)) + loadDSYM(cur_uuid, 
int(k_info[1],16), k_info[4]) else: print "Failed to get symbol info for %s" % cur_uuid #end of for loop kern.symbolicator = None return True - + lldb_alias('showkmod', 'showkmodaddr') lldb_alias('showkext', 'showkmodaddr') @@ -1958,32 +2148,38 @@ def GetMutexLockSummary(mtx): return "Invalid lock value: 0x0" if kern.arch == "x86_64": - out_str = "Lock Type\t\t: MUTEX\n" - mtxd = mtx.lck_mtx_sw.lck_mtxd - out_str += "Owner Thread\t\t: {:#x}\n".format(mtxd.lck_mtxd_owner) - cmd_str = "p/d ((lck_mtx_t*){:#x})->lck_mtx_sw.lck_mtxd.".format(mtx) - cmd_out = lldb_run_command(cmd_str + "lck_mtxd_waiters") - out_str += "Number of Waiters\t: {:s}\n".format(cmd_out.split()[-1]) - cmd_out = lldb_run_command(cmd_str + "lck_mtxd_ilocked") - out_str += "ILocked\t\t\t: {:s}\n".format(cmd_out.split()[-1]) - cmd_out = lldb_run_command(cmd_str + "lck_mtxd_mlocked") - out_str += "MLocked\t\t\t: {:s}\n".format(cmd_out.split()[-1]) - cmd_out = lldb_run_command(cmd_str + "lck_mtxd_promoted") - out_str += "Promoted\t\t: {:s}\n".format(cmd_out.split()[-1]) - cmd_out = lldb_run_command(cmd_str + "lck_mtxd_spin") - out_str += "Spin\t\t\t: {:s}\n".format(cmd_out.split()[-1]) + out_str = "Lock Type : MUTEX\n" + if mtx.lck_mtx_tag == 0x07ff1007 : + out_str += "Tagged as indirect, printing ext lock at: {:#x}\n".format(mtx.lck_mtx_ptr) + mtx = Cast(mtx.lck_mtx_ptr, 'lck_mtx_t *') + + if mtx.lck_mtx_tag == 0x07fe2007 : + out_str += "*** Tagged as DESTROYED ({:#x}) ***\n".format(mtx.lck_mtx_tag) + + out_str += "Owner Thread : {mtx.lck_mtx_owner:#x}\n".format(mtx=mtx) + out_str += "Number of Waiters : {mtx.lck_mtx_waiters:#x}\n".format(mtx=mtx) + out_str += "ILocked : {mtx.lck_mtx_ilocked:#x}\n".format(mtx=mtx) + out_str += "MLocked : {mtx.lck_mtx_mlocked:#x}\n".format(mtx=mtx) + out_str += "Promoted : {mtx.lck_mtx_promoted:#x}\n".format(mtx=mtx) + out_str += "Pri : {mtx.lck_mtx_pri:#x}\n".format(mtx=mtx) + out_str += "Spin : {mtx.lck_mtx_spin:#x}\n".format(mtx=mtx) + out_str += "Ext : 
{mtx.lck_mtx_is_ext:#x}\n".format(mtx=mtx) + if mtx.lck_mtxd_pad32 == 0xFFFFFFFF : + out_str += "Canary (valid) : {mtx.lck_mtxd_pad32:#x}\n".format(mtx=mtx) + else: + out_str += "Canary (INVALID) : {mtx.lck_mtxd_pad32:#x}\n".format(mtx=mtx) return out_str out_str = "Lock Type\t\t: MUTEX\n" - out_str += "Owner Thread\t\t: {:#x}\n".format(mtx.lck_mtx_hdr.lck_mtxd_data & ~0x3) - out_str += "Number of Waiters\t: {:d}\n".format(mtx.lck_mtx_sw.lck_mtxd.lck_mtxd_waiters) + out_str += "Owner Thread\t\t: {:#x}".format(mtx.lck_mtx_data & ~0x3) + if (mtx.lck_mtx_data & ~0x3) == 0xfffffff0: + out_str += " Held as spinlock" + out_str += "\nNumber of Waiters\t: {:d}\n".format(mtx.lck_mtx_waiters) out_str += "Flags\t\t\t: " - if mtx.lck_mtx_hdr.lck_mtxd_data & 0x1: + if mtx.lck_mtx_data & 0x1: out_str += "[Interlock Locked] " - if mtx.lck_mtx_hdr.lck_mtxd_data & 0x2: + if mtx.lck_mtx_data & 0x2: out_str += "[Wait Flag]" - if (mtx.lck_mtx_hdr.lck_mtxd_data & 0x3) == 0: - out_str += "None" return out_str @lldb_type_summary(['lck_spin_t *']) @@ -2003,14 +2199,17 @@ def GetSpinLockSummary(spinlock): out_str += "Interlock\t\t: {:#x}\n".format(spinlock.interlock) return out_str - out_str += "Owner Thread\t\t: {:#x}\n".format(spinlock.lck_spin_data & ~0x3) - out_str += "Flags\t\t\t: " - if spinlock.lck_spin_data & 0x1: - out_str += "[Interlock Locked] " - if spinlock.lck_spin_data & 0x2: - out_str += "[Wait Flag]" - if (spinlock.lck_spin_data & 0x3) == 0: - out_str += "None" + lock_data = spinlock.hwlock.lock_data + if lock_data == 1: + out_str += "Invalid state: interlock is locked but no owner\n" + return out_str + out_str += "Owner Thread\t\t: " + if lock_data == 0: + out_str += "None\n" + else: + out_str += "{:#x}\n".format(lock_data & ~0x1) + if (lock_data & 1) == 0: + out_str += "Invalid state: owned but interlock bit is not set\n" return out_str @lldb_command('showlock', 'MS') @@ -2046,7 +2245,7 @@ def ShowLock(cmd_args=None, cmd_options={}): summary_str = 
GetMutexLockSummary(lock_mtx) lock_spin = Cast(lock, 'lck_spin_t*') - if lock_spin.lck_spin_type == 0x11: + if lock_spin.type == 0x11: summary_str = GetSpinLockSummary(lock_spin) if summary_str == "": @@ -2414,6 +2613,8 @@ def showmapvme(map, show_pager_info, show_all_shadows): object_str = "KALLOC_MAP" elif object == kern.globals.zone_map: object_str = "ZONE_MAP" + elif hasattr(kern.globals, 'compressor_map') and object == kern.globals.compressor_map: + object_str = "COMPRESSOR_MAP" elif hasattr(kern.globals, 'gzalloc_map') and object == kern.globals.gzalloc_map: object_str = "GZALLOC_MAP" elif hasattr(kern.globals, 'g_kext_map') and object == kern.globals.g_kext_map: @@ -2506,7 +2707,7 @@ def CountMapTags(map, tagcounts, slow): page = _vm_page_unpack_ptr(page_list) while (page != 0): vmpage = kern.GetValueFromAddress(page, 'vm_page_t') - if (addr == unsigned(vmpage.offset)) and (object == vmpage.object): + if (addr == unsigned(vmpage.offset)) and (object == vm_object_t(_vm_page_unpack_ptr(vmpage.vm_page_object))): if (not vmpage.local) and (vmpage.wire_count > 0): count += 1 break @@ -2613,7 +2814,8 @@ def showvmtags(cmd_args=None, cmd_options={}): queue_head = kern.globals.vm_objects_wired for object in IterateQueue(queue_head, 'struct vm_object *', 'objq'): - CountWiredObject(object, tagcounts) + if object != kern.globals.kernel_object: + CountWiredObject(object, tagcounts) queue_head = kern.globals.purgeable_nonvolatile_queue for object in IterateQueue(queue_head, 'struct vm_object *', 'objq'): @@ -2729,9 +2931,55 @@ def VMPageLookup(cmd_args=None): page = _vm_page_unpack_ptr(page_list) while (page != 0) : pg_t = kern.GetValueFromAddress(page, 'vm_page_t') - print format_string.format(page, pg_t.offset, pg_t.object) + print format_string.format(page, pg_t.offset, _vm_page_unpack_ptr(pg_t.vm_page_object)) page = _vm_page_unpack_ptr(pg_t.next_m) + + +@lldb_command('vmpage_get_phys_page') +def VmPageGetPhysPage(cmd_args=None): + """ return the physical page for a 
vm_page_t + usage: vm_page_get_phys_page + """ + if cmd_args == None or len(cmd_args) < 1: + print "Please provide valid vm_page_t. Type help vm_page_get_phys_page for help." + return + + page = kern.GetValueFromAddress(cmd_args[0], 'vm_page_t') + phys_page = _vm_page_get_phys_page(page) + print("phys_page = 0x%x\n" % phys_page) + + +def _vm_page_get_phys_page(page): + if kern.arch == 'x86_64': + return page.phys_page + + if page == 0 : + return 0 + + m = unsigned(page) + if m >= unsigned(kern.globals.vm_page_array_beginning_addr) and m < unsigned(kern.globals.vm_page_array_ending_addr) : + return (m - unsigned(kern.globals.vm_page_array_beginning_addr)) / sizeof('struct vm_page') + unsigned(kern.globals.vm_first_phys_ppnum) + + page_with_ppnum = Cast(page, 'uint32_t *') + ppnum_offset = sizeof('struct vm_page') / sizeof('uint32_t') + return page_with_ppnum[ppnum_offset] + + +@lldb_command('vmpage_unpack_ptr') +def VmPageUnpackPtr(cmd_args=None): + """ unpack a pointer + usage: vm_page_unpack_ptr + """ + if cmd_args == None or len(cmd_args) < 1: + print "Please provide valid packed pointer argument. Type help vm_page_unpack_ptr for help." 
+ return + + packed = kern.GetValueFromAddress(cmd_args[0],'unsigned long') + unpacked = _vm_page_unpack_ptr(packed) + print("unpacked pointer = 0x%x\n" % unpacked) + + def _vm_page_unpack_ptr(page): if kern.ptrsize == 4 : return page @@ -2740,10 +2988,15 @@ def _vm_page_unpack_ptr(page): return page min_addr = kern.globals.vm_min_kernel_and_kext_address + ptr_shift = kern.globals.vm_packed_pointer_shift + ptr_mask = kern.globals.vm_packed_from_vm_pages_array_mask #INTEL - min_addr = 0xffffff7f80000000 #ARM - min_addr = 0x80000000 #ARM64 - min_addr = 0xffffff8000000000 - return ((page << 6) + min_addr) + if unsigned(page) & unsigned(ptr_mask) : + masked_page = (unsigned(page) & ~ptr_mask) + return (unsigned(addressof(kern.globals.vm_pages[masked_page]))) + return ((unsigned(page) << unsigned(ptr_shift)) + unsigned(min_addr)) @lldb_command('calcvmpagehash') def CalcVMPageHash(cmd_args=None): @@ -2773,6 +3026,8 @@ def _calc_vm_page_hash(obj, off): return hash_id +VM_PAGE_IS_WIRED = 1 + @header("{0: <10s} of {1: <10s} {2: <20s} {3: <20s} {4: <20s} {5: <10s} {6: <5s}\t {7: <28s}\t{8: <50s}".format("index", "total", "vm_page_t", "offset", "next", "phys_page", "wire#", "first bitfield", "second bitfield")) @lldb_command('vmobjectwalkpages', 'SBNQP:') def VMObjectWalkPages(cmd_args=None, cmd_options={}): @@ -2784,7 +3039,7 @@ def VMObjectWalkPages(cmd_args=None, cmd_options={}): vmobjectwalkpages : Walk and print all the pages for a given object (up to 4K pages by default) vmobjectwalkpages -B : Walk and print all the pages for a given object (up to 4K pages by default), traversing the memq backwards vmobjectwalkpages -N : Walk and print all the pages for a given object, ignore the page limit - vmobjectwalkpages -Q : Walk all pages for a given object, looking for known signs of corruption (i.e. inactive and active both being set for a page) + vmobjectwalkpages -Q : Walk all pages for a given object, looking for known signs of corruption (i.e. 
q_state == VM_PAGE_IS_WIRED && wire_count == 0) vmobjectwalkpages -P : Walk all the pages for a given object, annotate the specified page in the output with *** vmobjectwalkpages -P -S : Walk all the pages for a given object, stopping when we find the specified page @@ -2818,12 +3073,11 @@ def VMObjectWalkPages(cmd_args=None, cmd_options={}): if not quiet_mode: print VMObjectWalkPages.header format_string = "{0: <#10d} of {1: <#10d} {2: <#020x} {3: <#020x} {4: <#020x} {5: <#010x} {6: <#05d}\t" - first_bitfield_format_string = "{0: <#1d}:{1: <#1d}:{2: <#1d}:{3: <#1d}:{4: <#1d}:{5: <#1d}:{6: <#1d}:" - first_bitfield_format_string += "{7: <#1d}:{8: <#1d}:{9: <#1d}:{10: <#1d}:{11: <#1d}:{12: <#1d}" - second_bitfield_format_string = first_bitfield_format_string - second_bitfield_format_string += ":{13: <#1d}:{14: <#1d}:{15: <#1d}:{16: <#1d}:{17: <#1d}:{18: <#1d}:{19: <#1d}:" + first_bitfield_format_string = "{0: <#2d}:{1: <#1d}:{2: <#1d}:{3: <#1d}:{4: <#1d}:{5: <#1d}:{6: <#1d}:{7: <#1d}\t" + second_bitfield_format_string = "{0: <#1d}:{1: <#1d}:{2: <#1d}:{3: <#1d}:{4: <#1d}:{5: <#1d}:{6: <#1d}:" + second_bitfield_format_string += "{7: <#1d}:{8: <#1d}:{9: <#1d}:{10: <#1d}:{11: <#1d}:{12: <#1d}:" + second_bitfield_format_string += "{13: <#1d}:{14: <#1d}:{15: <#1d}:{16: <#1d}:{17: <#1d}:{18: <#1d}:{19: <#1d}:" second_bitfield_format_string += "{20: <#1d}:{21: <#1d}:{22: <#1d}:{23: <#1d}:{24: <#1d}:{25: <#1d}:{26: <#1d}\n" - first_bitfield_format_string += "\t" limit = 4096 #arbitrary limit of number of pages to walk ignore_limit = 0 @@ -2835,7 +3089,7 @@ def VMObjectWalkPages(cmd_args=None, cmd_options={}): page_found = False pages_seen = set() - for vmp in IterateQueue(obj.memq, "vm_page_t", "listq", walk_backwards): + for vmp in IterateQueue(obj.memq, "vm_page_t", "listq", walk_backwards, unpack_ptr_fn=_vm_page_unpack_ptr): page_count += 1 out_string = "" if (page != 0 and not(page_found) and vmp == page): @@ -2846,38 +3100,28 @@ def VMObjectWalkPages(cmd_args=None, 
cmd_options={}): if (page_count % 1000) == 0: print "traversed %d pages ...\n" % (page_count) else: - out_string += format_string.format(page_count, res_page_count, vmp, vmp.offset, vmp.listq.next, vmp.phys_page, vmp.wire_count) - out_string += first_bitfield_format_string.format(vmp.active, vmp.inactive, vmp.clean_queue, vmp.local, vmp.speculative, - vmp.throttled, vmp.free, vmp.pageout_queue, vmp.laundry, vmp.reference, - vmp.gobbled, vmp.private, vmp.no_cache) + out_string += format_string.format(page_count, res_page_count, vmp, vmp.offset, _vm_page_unpack_ptr(vmp.listq.next), _vm_page_get_phys_page(vmp), vmp.wire_count) + out_string += first_bitfield_format_string.format(vmp.vm_page_q_state, vmp.vm_page_in_background, vmp.vm_page_on_backgroundq, vmp.gobbled, vmp.laundry, vmp.no_cache, + vmp.private, vmp.reference) out_string += second_bitfield_format_string.format(vmp.busy, vmp.wanted, vmp.tabled, vmp.hashed, vmp.fictitious, vmp.clustered, - vmp.clustered, vmp.pmapped, vmp.xpmapped, vmp.wpmapped, vmp.pageout, vmp.absent, - vmp.error, vmp.dirty, vmp.cleaning, vmp.precious, vmp.precious, vmp.overwriting, - vmp.restart, vmp.unusual, vmp.encrypted, vmp.encrypted, vmp.encrypted_cleaning, - vmp.cs_validated, vmp.cs_tainted, vmp.cs_nx, vmp.reusable, vmp.lopage, vmp.slid, vmp.compressor, + vmp.pmapped, vmp.xpmapped, vmp.wpmapped, vmp.free_when_done, vmp.absent, + vmp.error, vmp.dirty, vmp.cleaning, vmp.precious, vmp.overwriting, + vmp.restart, vmp.unusual, vmp.encrypted, vmp.encrypted_cleaning, + vmp.cs_validated, vmp.cs_tainted, vmp.cs_nx, vmp.reusable, vmp.lopage, vmp.slid, vmp.written_by_kernel) if (vmp in pages_seen): print out_string + "cycle detected! we've seen vm_page_t: " + "{0: <#020x}".format(unsigned(vmp)) + " twice. 
stopping...\n" return - if (vmp.object != obj): - print out_string + " vm_page_t: " + "{0: <#020x}".format(unsigned(vmp)) + " points to different vm_object_t: " + "{0: <#020x}".format(unsigned(vmp.object)) + if (_vm_page_unpack_ptr(vmp.vm_page_object) != unsigned(obj)): + print out_string + " vm_page_t: " + "{0: <#020x}".format(unsigned(vmp)) + " points to different vm_object_t: " + "{0: <#020x}".format(unsigned(_vm_page_unpack_ptr(vmp.vm_page_object))) return - if (not vmp.local) and (vmp.wire_count > 0): - if (vmp.active or vmp.inactive or vmp.speculative or vmp.throttled or vmp.pageout_queue): - print out_string + " wired page with wrong page queue attributes\n" - print "vm_page_t: " + "{0: <#020x}".format(unsigned(vmp)) + " active: %d inactive: %d speculative: %d throttled %d pageout_queue: %d\n" % (vmp.active, - vmp.inactive, vmp.speculative, vmp.throttled, vmp.pageout_queue) - print "stopping...\n" - return - - if ((vmp.free + vmp.active + vmp.inactive + vmp.speculative + vmp.throttled + vmp.pageout_queue) > 1): - print out_string + " more than one pageout queue bit set active\n" - print "vm_page_t: " + "{0: <#020x}".format(unsigned(vmp)) + " free: %d active: %d inactive: %d speculative: %d throttled: %d pageout_queue: %d\n" % (vmp.free, - vmp.active, vmp.inactive, vmp.speculative, vmp.throttled, vmp.pageout_queue) + if (vmp.vm_page_q_state == VM_PAGE_IS_WIRED) and (vmp.wire_count == 0): + print out_string + " page in wired state with wire_count of 0\n" + print "vm_page_t: " + "{0: <#020x}".format(unsigned(vmp)) + "\n" print "stopping...\n" return @@ -2956,3 +3200,102 @@ def show_apple_protect_pager(pager, qcnt, idx): vnode_pager = Cast(object.pager,'vnode_pager *') filename = GetVnodePath(vnode_pager.vnode_handle) print "{:>3}/{:<3d} {:#018x} {:>5d} {:>5d} {:>6d} {:#018x} {:#018x} {:#018x} {:#018x} {:#018x} {:#018x}\n\tcrypt_info:{:#018x} \n\tvnode:{:#018x} {:s}\n".format(idx, qcnt, pager, pager.ref_count, pager.is_ready, pager.is_mapped, 
pager.pager_control, pager.backing_object, pager.backing_offset, pager.crypto_backing_offset, pager.crypto_start, pager.crypto_end, pager.crypt_info, pager.crypt_info.page_decrypt, pager.crypt_info.crypt_end, pager.crypt_info.crypt_ops, pager.crypt_info.crypt_refcnt, vnode_pager.vnode_handle, filename) + +@lldb_command("show_console_ring") +def ShowConsoleRingData(cmd_args=None): + """ Print console ring buffer stats and data + """ + cr = kern.globals.console_ring + print "console_ring = {:#018x} buffer = {:#018x} length = {:<5d} used = {:<5d} read_ptr = {:#018x} write_ptr = {:#018x}".format(addressof(cr), cr.buffer, cr.len, cr.used, cr.read_ptr, cr.write_ptr) + pending_data = [] + for i in range(unsigned(cr.used)): + idx = ((unsigned(cr.read_ptr) - unsigned(cr.buffer)) + i) % unsigned(cr.len) + pending_data.append("{:c}".format(cr.buffer[idx])) + + if pending_data: + print "Data:" + print "".join(pending_data) + +# Macro: showjetsamsnapshot + +@lldb_command("showjetsamsnapshot", "DA") +def ShowJetsamSnapshot(cmd_args=None, cmd_options={}): + """ Dump entries in the jetsam snapshot table + usage: showjetsamsnapshot [-D] [-A] + Use -D flag to print extra physfootprint details + Use -A flag to print all entries (regardless of valid count) + """ + + # Not shown are uuid, user_data, cpu_time + + global kern + if kern.arch == 'x86_64': + print "Snapshots are not supported.\n" + return + + show_footprint_details = False + show_all_entries = False + + if "-D" in cmd_options: + show_footprint_details = True + + if "-A" in cmd_options: + show_all_entries = True + + valid_count = kern.globals.memorystatus_jetsam_snapshot_count + max_count = kern.globals.memorystatus_jetsam_snapshot_max + + if (show_all_entries == True): + count = max_count + else: + count = valid_count + + print "{:s}".format(valid_count) + print "{:s}".format(max_count) + + if int(count) == 0: + print "The jetsam snapshot is empty." 
+ print "Use -A to force dump all entries (regardless of valid count)" + return + + # Dumps the snapshot header info + print lldb_run_command('p *memorystatus_jetsam_snapshot') + + hdr_format = "{0: >32s} {1: >5s} {2: >4s} {3: >6s} {4: >6s} {5: >20s} {6: >20s} {7: >20s} {8: >5s} {9: >10s} {10: >6s} {11: >6s} {12: >10s} {13: >15s} {14: >15s} {15: >15s} {16: >15s}" + if (show_footprint_details == True): + hdr_format += "{17: >15s} {18: >15s} {19: >12s} {20: >12s} {21: >17s} {22: >10s} {23: >13s} {24: >10s}" + + + if (show_footprint_details == False): + print hdr_format.format('command', 'index', 'pri', 'cid', 'pid', 'starttime', 'killtime', 'idletime', 'kill', '#ents', 'fds', 'gen', 'state', 'footprint', 'max', 'purgeable', 'lifetimeMax') + print hdr_format.format('', '', '', '', '', '(abs)', '(abs)', '(abs)', 'cause', '', '', 'Count', '', '(pages)', '(pages)', '(pages)', '(pages)') + else: + print hdr_format.format('command', 'index', 'pri', 'cid', 'pid', 'starttime', 'killtime', 'idletime', 'kill', '#ents', 'fds', 'gen', 'state', 'footprint', 'max', 'purgeable', 'lifetimeMax', '|| internal', 'internal_comp', 'iokit_mapped', 'purge_nonvol', 'purge_nonvol_comp', 'alt_acct', 'alt_acct_comp', 'page_table') + print hdr_format.format('', '', '', '', '', '(abs)', '(abs)', '(abs)', 'cause', '', '', 'Count', '', '(pages)', '(pages)', '(pages)', '(pages)', '(pages)', '(pages)', '(pages)', '(pages)', '(pages)', '(pages)', '(pages)', '(pages)') + + + entry_format = "{e.name: >32s} {index: >5d} {e.priority: >4d} {e.jse_coalition_jetsam_id: >6d} {e.pid: >6d} "\ + "{e.jse_starttime: >20d} {e.jse_killtime: >20d} "\ + "{e.jse_idle_delta: >20d} {e.killed: >5d} {e.jse_memory_region_count: >10d} "\ + "{e.fds: >6d} {e.jse_gencount: >6d} {e.state: >10x} {e.pages: >15d} {e.max_pages: >15d} "\ + "{e.purgeable_pages: >15d} {e.max_pages_lifetime: >15d}" + + if (show_footprint_details == True): + entry_format += "{e.jse_internal_pages: >15d} "\ + "{e.jse_internal_compressed_pages: >15d} "\ + 
"{e.jse_iokit_mapped_pages: >12d} "\ + "{e.jse_purgeable_nonvolatile_pages: >12d} "\ + "{e.jse_purgeable_nonvolatile_compressed_pages: >17d} "\ + "{e.jse_alternate_accounting_pages: >10d} "\ + "{e.jse_alternate_accounting_compressed_pages: >13d} "\ + "{e.jse_page_table_pages: >10d}" + + snapshot_list = kern.globals.memorystatus_jetsam_snapshot.entries + idx = 0 + while idx < count: + current_entry = Cast(snapshot_list[idx], 'jetsam_snapshot_entry') + print entry_format.format(index=idx, e=current_entry) + idx +=1 + return + +# EndMacro: showjetsamsnapshot diff --git a/tools/lldbmacros/pgtrace.py b/tools/lldbmacros/pgtrace.py new file mode 100644 index 000000000..0b55e87e5 --- /dev/null +++ b/tools/lldbmacros/pgtrace.py @@ -0,0 +1,38 @@ +#! /usr/bin/env python +# -*- coding: utf-8 -*- + +from xnu import * + +# Macro: pgtrace +@lldb_command('showpgtrace') +def ShowPgtrace(cmd_args=None, cmd_options={}): + """ Display pgtrace buffer contents + Usage: showpgtrace + """ + + max_entry = kern.globals.pgtrace.size + rd_idx = kern.globals.pgtrace.rdidx + wr_idx = kern.globals.pgtrace.wridx + + print "-"*80 + print "rd_idx=%d wr_idx=%d num_entries=%d max_entry=%d" % (rd_idx, wr_idx, wr_idx-rd_idx, max_entry) + print "-"*80 + + rw_str = { GetEnumValue('pgtrace_rw_t::PGTRACE_RW_LOAD'): "R", + GetEnumValue('pgtrace_rw_t::PGTRACE_RW_STORE'): "W", + GetEnumValue('pgtrace_rw_t::PGTRACE_RW_PREFETCH'): "P" } + + while rd_idx != wr_idx: + clipped_idx = rd_idx % max_entry + entry = kern.globals.pgtrace.logs + sizeof('log_t') * clipped_idx + entry = kern.GetValueFromAddress(entry, 'log_t *') + + entry_str = "[%d] id=%lu time=%lu %s " % (clipped_idx, entry.id, entry.res.rr_time, rw_str[int(entry.res.rr_rw)]) + + for i in range(entry.res.rr_num): + entry_str += "%x=%x " % (entry.res.rr_addrdata[i].ad_addr, entry.res.rr_addrdata[i].ad_data) + + print entry_str + + rd_idx += 1 +# EndMacro diff --git a/tools/lldbmacros/pmap.py b/tools/lldbmacros/pmap.py index 6a3cee283..9b73223f6 100644 --- 
a/tools/lldbmacros/pmap.py +++ b/tools/lldbmacros/pmap.py @@ -650,6 +650,7 @@ def assert_64bit(val): assert(val < 2**64) ARM64_TTE_SIZE = 8 +ARM64_TTE_SHIFT = 3 ARM64_VMADDR_BITS = 48 def PmapBlockOffsetMaskARM64(level): @@ -721,43 +722,73 @@ def PmapWalkARM64(pmap, vaddr, verbose_level = vHUMAN): assert_64bit(vaddr) paddr = -1 + tt0_index = 0 tt1_index = PmapTTnIndexARM64(vaddr, 1) tt2_index = PmapTTnIndexARM64(vaddr, 2) tt3_index = PmapTTnIndexARM64(vaddr, 3) - # L1 - tte = long(unsigned(pmap.tte[tt1_index])) + # The pmap starts at a page tabel level that is defined by register + # values; the kernel exports the root level for LLDB + level = kern.globals.arm64_root_pgtable_level + assert(level <= 3) + + if level == 0: + root_tt_index = tt0_index + elif level == 1: + root_tt_index = tt1_index + elif level == 2: + root_tt_index = tt2_index + elif level == 3: + root_tt_index = tt3_index + + # If the root of the page table is not a full page, we need to + # truncate the index + root_tt_index = root_tt_index % unsigned(kern.globals.arm64_root_pgtable_num_ttes) + + tte = long(unsigned(pmap.tte[root_tt_index])) assert(type(tte) == long) assert_64bit(tte) - if verbose_level >= vSCRIPT: - print "L1 entry: {:#x}".format(tte) - if verbose_level >= vDETAIL: - PmapDecodeTTEARM64(tte, 1) + while (True): + if (level == 0): + # L0 + # This is unsupported at the moment, as no kernel configurations use L0 + assert(False) - if tte & 0x1 == 0x1: - # Check for L1 block entry - if tte & 0x2 == 0x0: - # Handle L1 block entry - paddr = tte & PmapBlockBaseMaskARM64(1) - paddr = paddr | (vaddr & PmapBlockOffsetMaskARM64(1)) - print "phys: {:#x}".format(paddr) - else: - # Handle L1 table entry - l2_phys = (tte & page_base_mask) + (ARM64_TTE_SIZE * tt2_index) - assert(type(l2_phys) == long) + elif (level == 1): + # L1 + if verbose_level >= vSCRIPT: + print "L1 entry: {:#x}".format(tte) + if verbose_level >= vDETAIL: + PmapDecodeTTEARM64(tte, 1) - l2_virt = kern.PhysToKernelVirt(l2_phys) - 
assert(type(l2_virt) == long) + if tte & 0x1 == 0x1: + # Check for L1 block entry + if tte & 0x2 == 0x0: + # Handle L1 block entry + paddr = tte & PmapBlockBaseMaskARM64(1) + paddr = paddr | (vaddr & PmapBlockOffsetMaskARM64(1)) + print "phys: {:#x}".format(paddr) + break + else: + # Handle L1 table entry + l2_phys = (tte & page_base_mask) + (ARM64_TTE_SIZE * tt2_index) + assert(type(l2_phys) == long) - if verbose_level >= vDETAIL: - print "L2 physical address: {:#x}. L2 virtual address: {:#x}".format(l2_phys, l2_virt) + l2_virt = kern.PhysToKernelVirt(l2_phys) + assert(type(l2_virt) == long) - # L2 - ttep = kern.GetValueFromAddress(l2_virt, "tt_entry_t*") - tte = long(unsigned(dereference(ttep))) - assert(type(tte) == long) + if verbose_level >= vDETAIL: + print "L2 physical address: {:#x}. L2 virtual address: {:#x}".format(l2_phys, l2_virt) + ttep = kern.GetValueFromAddress(l2_virt, "tt_entry_t*") + tte = long(unsigned(dereference(ttep))) + assert(type(tte) == long) + elif verbose_level >= vHUMAN: + print "L1 entry invalid: {:#x}\n".format(tte) + + elif (level == 2): + # L2 if verbose_level >= vSCRIPT: print "L2 entry: {:#0x}".format(tte) if verbose_level >= vDETAIL: @@ -769,6 +800,7 @@ def PmapWalkARM64(pmap, vaddr, verbose_level = vHUMAN): # Handle L2 block entry paddr = tte & PmapBlockBaseMaskARM64(2) paddr = paddr | (vaddr & PmapBlockOffsetMaskARM64(2)) + break else: # Handle L2 table entry l3_phys = (tte & page_base_mask) + (ARM64_TTE_SIZE * tt3_index) @@ -780,25 +812,31 @@ def PmapWalkARM64(pmap, vaddr, verbose_level = vHUMAN): if verbose_level >= vDETAIL: print "L3 physical address: {:#x}. 
L3 virtual address: {:#x}".format(l3_phys, l3_virt) - # L3 ttep = kern.GetValueFromAddress(l3_virt, "tt_entry_t*") tte = long(unsigned(dereference(ttep))) assert(type(tte) == long) - - if verbose_level >= vSCRIPT: - print "L3 entry: {:#0x}".format(tte) - if verbose_level >= vDETAIL: - PmapDecodeTTEARM64(tte, 3) - - if tte & 0x3 == 0x3: - paddr = tte & page_base_mask - paddr = paddr | (vaddr & page_offset_mask) - elif verbose_level >= vHUMAN: - print "L3 entry invalid: {:#x}\n".format(tte) elif verbose_level >= vHUMAN: # tte & 0x1 == 0x1 print "L2 entry invalid: {:#x}\n".format(tte) - elif verbose_level >= vHUMAN: - print "L1 entry invalid: {:#x}\n".format(tte) + + elif (level == 3): + # L3 + if verbose_level >= vSCRIPT: + print "L3 entry: {:#0x}".format(tte) + if verbose_level >= vDETAIL: + PmapDecodeTTEARM64(tte, 3) + + if tte & 0x3 == 0x3: + paddr = tte & page_base_mask + paddr = paddr | (vaddr & page_offset_mask) + elif verbose_level >= vHUMAN: + print "L3 entry invalid: {:#x}\n".format(tte) + + # This was the leaf page table page for this request; we're done + break + + # We've parsed one level, so go to the next level + assert(level <= 3) + level = level + 1 if verbose_level >= vHUMAN: if paddr: diff --git a/tools/lldbmacros/process.py b/tools/lldbmacros/process.py index e2ddb8e3d..71108c100 100644 --- a/tools/lldbmacros/process.py +++ b/tools/lldbmacros/process.py @@ -257,7 +257,6 @@ def GetThreadSummary(thread): policy flags: B - darwinbg - L - lowpri cpu T - IO throttle P - IO passive D - Terminated @@ -305,16 +304,13 @@ def GetThreadSummary(thread): io_policy_str = "" - if int(thread.effective_policy.darwinbg) != 0: + if int(thread.effective_policy.thep_darwinbg) != 0: io_policy_str += "B" - if int(thread.effective_policy.lowpri_cpu) != 0: - io_policy_str += "L" - - if int(thread.effective_policy.io_tier) != 0: + if int(thread.effective_policy.thep_io_tier) != 0: io_policy_str += "T" - if int(thread.effective_policy.io_passive) != 0: + if 
int(thread.effective_policy.thep_io_passive) != 0: io_policy_str += "P" - if int(thread.effective_policy.terminated) != 0: + if int(thread.effective_policy.thep_terminated) != 0: io_policy_str += "D" state = int(thread.state) @@ -401,7 +397,7 @@ def GetCoalitionTasks(queue, coal_type, thread_details=False): tasks = [] field_name = 'task_coalition' for task in IterateLinkageChain(queue, 'task *', field_name, coal_type * sizeof('queue_chain_t')): - task_str = "({0: - """ - out_str = '' - runq = kern.GetValueFromAddress(cmd_args[0], 'struct run_queue *') - out_str += GetRunQSummary(runq) - print out_str - -def GetRunQSummary(runq): - """ Internal function to print summary of run_queue - params: runq - value representing struct run_queue * - return: str - representing the details of given run_queue - """ - out_str = " runq: count {: <10d} highq: {: <10d} urgency {: <10d}\n".format(runq.count, runq.highq, runq.urgency) - - runq_queue_i = 0 - runq_queue_count = sizeof(runq.queues)/sizeof(runq.queues[0]) - - for runq_queue_i in range(runq_queue_count) : - runq_queue_head = addressof(runq.queues[runq_queue_i]) - runq_queue_p = runq_queue_head.next - - if unsigned(runq_queue_p) != unsigned(runq_queue_head): - runq_queue_this_count = 0 - - for thread in IterateQueue(runq_queue_head, "thread_t", "links"): - runq_queue_this_count += 1 - - out_str += " Queue [{: <#012x}] Priority {: <3d} count {:d}\n".format(runq_queue_head, runq_queue_i, runq_queue_this_count) - out_str += "\t" + GetThreadSummary.header + "\n" - for thread in IterateQueue(runq_queue_head, "thread_t", "links"): - out_str += "\t" + GetThreadSummary(thread) + "\n" - if config['verbosity'] > vHUMAN : - out_str += "\t" + GetThreadBackTrace(thread, prefix="\t\t") + "\n" - return out_str - - -def GetGrrrSummary(grrr_runq): - """ Internal function to print summary of grrr_run_queue - params: grrr_runq - value representing struct grrr_run_queue * - return: str - representing the details of given grrr_run_queue - """ - 
out_str = " GRRR Info: Count {: <10d} Weight {: <10d} Current Group {: <#012x}\n".format(grrr_runq.count, - grrr_runq.weight, grrr_runq.current_group) - grrr_group_i = 0 - grrr_group_count = sizeof(grrr_runq.groups)/sizeof(grrr_runq.groups[0]) - for grrr_group_i in range(grrr_group_count) : - grrr_group = addressof(grrr_runq.groups[grrr_group_i]) - if grrr_group.count > 0: - out_str += " Group {: <3d} [{: <#012x}] ".format(grrr_group.index, grrr_group) - out_str += "Count {:d} Weight {:d}\n".format(grrr_group.count, grrr_group.weight) - grrr_group_client_head = addressof(grrr_group.clients) - out_str += GetThreadSummary.header - for thread in IterateQueue(grrr_group_client_head, "thread_t", "links"): - out_str += "\t" + GetThreadSummary(thread) + "\n" - if config['verbosity'] > vHUMAN : - out_str += "\t" + GetThreadBackTrace(thread, prefix="\t\t") + "\n" - return out_str - -def ShowNextThread(processor): - out_str = "" - if (processor.next_thread != 0) : - out_str += " " + "Next thread:\n" - out_str += "\t" + GetThreadSummary.header + "\n" - out_str += "\t" + GetThreadSummary(processor.next_thread) + "\n" - return out_str - -def ShowActiveThread(processor): - out_str = "" - if (processor.active_thread != 0) : - out_str += "\t" + GetThreadSummary.header + "\n" - out_str += "\t" + GetThreadSummary(processor.active_thread) + "\n" - return out_str - -@lldb_command('showallprocessors') -def ShowAllProcessors(cmd_args=None): - """ Routine to print information of all psets and processors - Usage: showallprocessors - """ - pset = addressof(kern.globals.pset0) - show_grrr = 0 - show_priority_runq = 0 - show_priority_pset_runq = 0 - show_group_pset_runq = 0 - sched_string = str(kern.globals.sched_current_dispatch.sched_name) - - if sched_string == "traditional": - show_priority_runq = 1 - elif sched_string == "traditional_with_pset_runqueue": - show_priority_pset_runq = 1 - elif sched_string == "grrr": - show_grrr = 1 - elif sched_string == "multiq": - show_priority_runq = 1 
- show_group_pset_runq = 1 - elif sched_string == "dualq": - show_priority_pset_runq = 1 - show_priority_runq = 1 - else : - print "Unknown sched_string {:s}".format(sched_string) - - out_str = '' - - out_str += "Scheduler: {:s} ({:s})\n".format(sched_string, - kern.Symbolicate(unsigned(kern.globals.sched_current_dispatch))) - - out_str += "Runnable threads: {:d} Timeshare threads: {:d} Background threads: {:d}\n".format( - kern.globals.sched_run_count, kern.globals.sched_share_count, kern.globals.sched_background_count) - - if show_group_pset_runq: - # Create a group->task mapping - task_map = {} - for task in kern.tasks: - task_map[unsigned(task.sched_group)] = task - for task in kern.terminated_tasks: - task_map[unsigned(task.sched_group)] = task - - while unsigned(pset) != 0: - out_str += "Processor Set {: <#012x} Count {:d} (cpu_id {:<#x}-{:<#x})\n".format(pset, - pset.cpu_set_count, pset.cpu_set_low, pset.cpu_set_hi) - - if show_priority_pset_runq: - runq = pset.pset_runq - out_str += GetRunQSummary(runq) - - if show_group_pset_runq: - out_str += "Main Runq:\n" - runq = pset.pset_runq - out_str += GetGroupSetSummary(runq, task_map) - out_str += "All Groups:\n" - # TODO: Possibly output task header for each group - for group in IterateQueue(kern.globals.sched_groups, "sched_group_t", "sched_groups"): - if (group.runq.count != 0) : - task = task_map.get(unsigned(group), "Unknown task!") - out_str += "Group {: <#012x} Task {: <#012x}\n".format(unsigned(group), unsigned(task)) - out_str += GetRunQSummary(group.runq) - - out_str += " Active Processors:\n" - for processor in IterateQueue(pset.active_queue, "processor_t", "processor_queue"): - out_str += " " - out_str += GetProcessorSummary(processor) - out_str += ShowActiveThread(processor) - out_str += ShowNextThread(processor) - - if show_priority_runq: - runq = processor.runq - out_str += GetRunQSummary(runq) - if show_grrr: - grrr_runq = processor.grrr_runq - out_str += GetGrrrSummary(grrr_runq) - - out_str += 
" Idle Processors:\n" - for processor in IterateQueue(pset.idle_queue, "processor_t", "processor_queue"): - out_str += " " + GetProcessorSummary(processor) - out_str += ShowActiveThread(processor) - out_str += ShowNextThread(processor) - - if show_priority_runq: - out_str += GetRunQSummary(processor.runq) - - out_str += " Idle Secondary Processors:\n" - for processor in IterateQueue(pset.idle_secondary_queue, "processor_t", "processor_queue"): - out_str += " " + GetProcessorSummary(processor) - out_str += ShowActiveThread(processor) - out_str += ShowNextThread(processor) - - if show_priority_runq: - out_str += GetRunQSummary(processor.runq) - - pset = pset.pset_list - - out_str += "\nRealtime Queue ({:<#012x}) Count {:d}\n".format(addressof(kern.globals.rt_runq.queue), kern.globals.rt_runq.count) - if kern.globals.rt_runq.count != 0: - out_str += "\t" + GetThreadSummary.header + "\n" - for rt_runq_thread in IterateQueue(kern.globals.rt_runq.queue, "thread_t", "links"): - out_str += "\t" + GetThreadSummary(rt_runq_thread) + "\n" - - out_str += "\nTerminate Queue: ({:<#012x})\n".format(addressof(kern.globals.thread_terminate_queue)) - first = False - for thread in IterateQueue(kern.globals.thread_terminate_queue, "thread_t", "links"): - if first: - out_str += "\t" + GetThreadSummary.header + "\n" - first = True - out_str += "\t" + GetThreadSummary(thread) + "\n" - - out_str += "\nCrashed Threads Queue: ({:<#012x})\n".format(addressof(kern.globals.crashed_threads_queue)) - first = False - for thread in IterateQueue(kern.globals.crashed_threads_queue, "thread_t", "links"): - if first: - out_str += "\t" + GetThreadSummary.header + "\n" - first = True - out_str += "\t" + GetThreadSummary(thread) + "\n" - - out_str += "\n" - - out_str += "\n" - - print out_str -# EndMacro: showallprocessors - def GetLedgerEntrySummary(ledger_template, ledger, i): """ Internal function to get internals of a ledger entry (*not* a ledger itself) params: ledger_template - value representing 
struct ledger_template_t for the task or thread @@ -1936,11 +1695,8 @@ def ShowAllTaskPolicy(cmd_args=None): ["bg_iotier", "bg-iotier"], ["terminated", "terminated"], ["th_pidbind_bg", "bg-pidbind"], - ["th_workq_bg", "bg-workq"], ["t_apptype", "apptype"], ["t_boosted", "boosted"], - ["t_int_gpu_deny", "gpudeny-int"], - ["t_ext_gpu_deny", "gpudeny-ext"], ["t_role", "role"], ["t_tal_enabled", "tal-enabled"], ["t_base_latency_qos", "latency-base"], @@ -1999,25 +1755,9 @@ def ShowAllTaskPolicy(cmd_args=None): else: effective+="" - - pended_strings = [ - ["t_updating_policy", "updating"], - ["update_sockets", "update_sockets"], - ["t_update_timers", "update_timers"], - ["t_update_watchers", "update_watchers"] - ] - - pended="" - for value in pended_strings: - if t.pended_policy.__getattr__(value[0]) : - pended+=value[1] + ": " + str(t.pended_policy.__getattr__(value[0])) + " " - else: - pended+="" - print "requested: " + requested print "suppression: " + suppression print "effective: " + effective - print "pended: " + pended @lldb_type_summary(['wait_queue', 'wait_queue_t']) @@ -2067,4 +1807,328 @@ def ShowSuspendedTasks(cmd_args=[], options={}): print GetTaskSummary(t) + ' ' + GetProcSummary(Cast(t.bsd_info, 'proc *')) return True +# Macro: showallpte +@lldb_command('showallpte') +def ShowAllPte(cmd_args=None): + """ Prints out the physical address of the pte for all tasks + """ + head_taskp = addressof(kern.globals.tasks) + taskp = Cast(head_taskp.next, 'task *') + while taskp != head_taskp: + procp = Cast(taskp.bsd_info, 'proc *') + out_str = "task = {:#x} pte = {:#x}\t".format(taskp, taskp.map.pmap.ttep) + if procp != 0: + out_str += "{:s}\n".format(procp.p_comm) + else: + out_str += "\n" + print out_str + taskp = Cast(taskp.tasks.next, 'struct task *') + +# EndMacro: showallpte + +# Macro: showallrefcounts +@lldb_command('showallrefcounts') +@header("{0: <20s} {1: ^10s}".format("task", "ref_count")) +def ShowAllRefCounts(cmd_args=None): + """ Prints the ref_count 
of all tasks + """ + out_str = '' + head_taskp = addressof(kern.globals.tasks) + taskp = Cast(head_taskp.next, 'task *') + print ShowAllRefCounts.header + while taskp != head_taskp: + out_str += "{: <#20x}".format(taskp) + out_str += "{: ^10d}\n".format(taskp.ref_count) + taskp = Cast(taskp.tasks.next, 'task *') + print out_str +# EndMacro: showallrefcounts + +# Macro: showallrunnablethreads +@lldb_command('showallrunnablethreads') +def ShowAllRunnableThreads(cmd_args=None): + """ Prints the sched usage information for all threads of each task + """ + out_str = '' + for taskp in kern.tasks: + for actp in IterateQueue(taskp.threads, 'thread *', 'task_threads'): + if int(actp.state & 0x4): + ShowActStack([unsigned(actp)]) + +# EndMacro: showallrunnablethreads + +# Macro: showallschedusage +@lldb_command('showallschedusage') +@header("{0:<20s} {1:^10s} {2:^10s} {3:^15s}".format("Thread", "Priority", "State", "sched_usage")) +def ShowAllSchedUsage(cmd_args=None): + """ Prints the sched usage information for all threads of each task + """ + out_str = '' + for taskp in kern.tasks: + ShowTask([unsigned(taskp)]) + print ShowAllSchedUsage.header + for actp in IterateQueue(taskp.threads, 'thread *', 'task_threads'): + out_str = "{: <#20x}".format(actp) + out_str += "{: ^10s}".format(str(int(actp.sched_pri))) + state = int(actp.state) + thread_state_chars = {0:'', 1:'W', 2:'S', 4:'R', 8:'U', 16:'H', 32:'A', 64:'P', 128:'I'} + state_str = '' + state_str += thread_state_chars[int(state & 0x1)] + state_str += thread_state_chars[int(state & 0x2)] + state_str += thread_state_chars[int(state & 0x4)] + state_str += thread_state_chars[int(state & 0x8)] + state_str += thread_state_chars[int(state & 0x10)] + state_str += thread_state_chars[int(state & 0x20)] + state_str += thread_state_chars[int(state & 0x40)] + state_str += thread_state_chars[int(state & 0x80)] + out_str += "{: ^10s}".format(state_str) + out_str += "{: >15d}".format(actp.sched_usage) + print out_str + "\n" + print 
"\n\n" + +# EndMacro: showallschedusage + +#Macro: showprocfilessummary +@lldb_command('showprocfilessummary') +@header("{0: <20s} {1: <20s} {2: >10s}".format("Process", "Name", "Number of Open Files")) +def ShowProcFilesSummary(cmd_args=None): + """ Display the summary of open file descriptors for all processes in task list + Usage: showprocfilessummary + """ + print ShowProcFilesSummary.header + for proc in kern.procs: + proc_filedesc = proc.p_fd + proc_ofiles = proc_filedesc.fd_ofiles + proc_lastfile = unsigned(proc_filedesc.fd_lastfile) + count = 0 + proc_file_count = 0 + if proc_filedesc.fd_nfiles != 0: + while count <= proc_lastfile: + if unsigned(proc_ofiles[count]) != 0: + proc_file_count += 1 + count += 1 + print "{0: <#020x} {1: <20s} {2: >10d}".format(proc, proc.p_comm, proc_file_count) + +#EndMacro: showprocfilessummary + +@lldb_command('workinguserstacks') +def WorkingUserStacks(cmd_args=None): + """ Print out the user stack for each thread in a task, followed by the user libraries. + Syntax: (lldb) workinguserstacks + """ + if not cmd_args: + print "Insufficient arguments" + ShowTaskUserStacks.__doc__ + return False + task = kern.GetValueFromAddress(cmd_args[0], 'task *') + print GetTaskSummary.header + " " + GetProcSummary.header + pval = Cast(task.bsd_info, 'proc *') + print GetTaskSummary(task) + " " + GetProcSummary(pval) + "\n \n" + for thval in IterateQueue(task.threads, 'thread *', 'task_threads'): + print "For thread 0x{0:x}".format(thval) + try: + ShowThreadUserStack([hex(thval)]) + except Exception as exc_err: + print "Failed to show user stack for thread 0x{0:x}".format(thval) + if config['debug']: + raise exc_err + else: + print "Enable debugging ('(lldb) xnudebug debug') to see detailed trace." 
+ WorkingUserLibraries([hex(task)]) + return + +@static_var("exec_load_path", 0) +@lldb_command("workingkuserlibraries") +def WorkingUserLibraries(cmd_args=None): + """ Show binary images known by dyld in target task + For a given user task, inspect the dyld shared library state and print information about all Mach-O images. + Syntax: (lldb)workinguserlibraries + """ + if not cmd_args: + print "Insufficient arguments" + print ShowTaskUserLibraries.__doc__ + return False + + print "{0: <18s} {1: <12s} {2: <36s} {3: <50s}".format('address','type','uuid','path') + out_format = "0x{0:0>16x} {1: <12s} {2: <36s} {3: <50s}" + task = kern.GetValueFromAddress(cmd_args[0], 'task_t') + is_task_64 = int(task.t_flags) & 0x1 + dyld_all_image_infos_address = unsigned(task.all_image_info_addr) + cur_data_offset = 0 + if dyld_all_image_infos_address == 0: + print "No dyld shared library information available for task" + return False + vers_info_data = GetUserDataAsString(task, dyld_all_image_infos_address, 112) + version = _ExtractDataFromString(vers_info_data, cur_data_offset, "uint32_t") + cur_data_offset += 4 + if version > 12: + print "Unknown dyld all_image_infos version number %d" % version + image_info_count = _ExtractDataFromString(vers_info_data, cur_data_offset, "uint32_t") + WorkingUserLibraries.exec_load_path = 0 + if is_task_64: + image_info_size = 24 + image_info_array_address = _ExtractDataFromString(vers_info_data, 8, "uint64_t") + dyld_load_address = _ExtractDataFromString(vers_info_data, 8*4, "uint64_t") + dyld_all_image_infos_address_from_struct = _ExtractDataFromString(vers_info_data, 8*13, "uint64_t") + else: + image_info_size = 12 + image_info_array_address = _ExtractDataFromString(vers_info_data, 4*2, "uint32_t") + dyld_load_address = _ExtractDataFromString(vers_info_data, 4*5, "uint32_t") + dyld_all_image_infos_address_from_struct = _ExtractDataFromString(vers_info_data, 4*14, "uint32_t") + # Account for ASLR slide before dyld can fix the structure + 
dyld_load_address = dyld_load_address + (dyld_all_image_infos_address - dyld_all_image_infos_address_from_struct) + + i = 0 + while i < image_info_count: + image_info_address = image_info_array_address + i * image_info_size + img_data = GetUserDataAsString(task, image_info_address, image_info_size) + if is_task_64: + image_info_addr = _ExtractDataFromString(img_data, 0, "uint64_t") + image_info_path = _ExtractDataFromString(img_data, 8, "uint64_t") + else: + image_info_addr = _ExtractDataFromString(img_data, 0, "uint32_t") + image_info_path = _ExtractDataFromString(img_data, 4, "uint32_t") + PrintImageInfo(task, image_info_addr, image_info_path) + i += 1 + + # load_path might get set when the main executable is processed. + if WorkingUserLibraries.exec_load_path != 0: + PrintImageInfo(task, dyld_load_address, WorkingUserLibraries.exec_load_path) + return + +# Macro: showstackaftertask +@lldb_command('showstackaftertask','F:') +def Showstackaftertask(cmd_args=None,cmd_options={}): + """ Routine to print the thread stacks for all tasks succeeding a given task + Usage: showstackaftertask <0xaddress of task> + or: showstackaftertask -F + """ + if "-F" in cmd_options: + # Find the task pointer corresponding to its task name + find_task_str = cmd_options["-F"] + task_list = FindTasksByName(find_task_str) + + # Iterate through the list of tasks and print all task stacks thereafter + for tval in task_list: + ListTaskStacks(tval) + return + + if not cmd_args: + raise ArgumentError("Insufficient arguments") + tval = kern.GetValueFromAddress(cmd_args[0], 'task *') + if not tval: + raise ArgumentError("unknown arguments: {:s}".format(str(cmd_args))) + else: + ListTaskStacks(tval) + + ZombStacks() + return +# EndMacro: showstackaftertask + +def ListTaskStacks(task): + """ Search for a given task and print the list of all task stacks thereafter. + """ + # Initialize local variable task_flag to mark when a given task is found. 
+ task_flag=0 + + for t in kern.tasks: + if (task_flag == 1): + ShowTaskStacks(t) + print "\n" + if (t == task): + task_flag = 1 + +# Macro: showstackafterthread +@lldb_command('showstackafterthread') +def Showstackafterthread(cmd_args = None): + """ Routine to print the stacks of all threads succeeding a given thread. + Usage: Showstackafterthread <0xaddress of thread> + """ + # local variable thread_flag is used to mark when a given thread is found. + thread_flag=0 + if cmd_args: + threadval = kern.GetValueFromAddress(cmd_args[0], 'thread *') + else: + raise ArgumentError("No arguments passed") + # Iterate through list of all tasks to look up a given thread + for t in kern.tasks: + if(thread_flag==1): + pval = Cast(t.bsd_info, 'proc *') + print GetTaskSummary.header + " "+ GetProcSummary.header + print GetTaskSummary(t) + " "+ GetProcSummary(pval) + print "\n" + # Look up for a given thread from the the list of threads of a given task + for thval in IterateQueue(t.threads, 'thread *', 'task_threads'): + if (thread_flag==1): + print "\n" + print " " + GetThreadSummary.header + print " " + GetThreadSummary(thval) + print GetThreadBackTrace(thval, prefix="\t")+"\n" + print "\n" + + if(thval==threadval): + pval = Cast(t.bsd_info, 'proc *') + process_name = "{:s}".format(pval.p_comm) + print "\n\n" + print " *** Continuing to dump the thread stacks from the process *** :" + " " + process_name + print "\n\n" + thread_flag = 1 + print '\n' + return + +def FindVMEntriesForVnode(task, vn): + """ returns an array of vme that have the vnode set to defined vnode + each entry in array is of format (vme, start_addr, end_address, protection) + """ + retval = [] + vmmap = task.map + pmap = vmmap.pmap + pager_ops_addr = unsigned(addressof(kern.globals.vnode_pager_ops)) + debuglog("pager_ops_addr %s" % hex(pager_ops_addr)) + + if unsigned(pmap) == 0: + return retval + vme_list_head = vmmap.hdr.links + vme_ptr_type = gettype('vm_map_entry *') + for vme in 
IterateQueue(vme_list_head, vme_ptr_type, 'links'): + #print vme + if unsigned(vme.is_sub_map) == 0 and unsigned(vme.object.vm_object) != 0: + obj = vme.object.vm_object + else: + continue + + while obj != 0: + if obj.pager != 0: + if obj.internal: + pass + else: + vn_pager = Cast(obj.pager, 'vnode_pager *') + if unsigned(vn_pager.pager_ops) == pager_ops_addr and unsigned(vn_pager.vnode_handle) == unsigned(vn): + retval.append((vme, unsigned(vme.links.start), unsigned(vme.links.end), unsigned(vme.protection))) + obj = obj.shadow + return retval + +@lldb_command('showtaskloadinfo') +def ShowTaskLoadInfo(cmd_args=None, cmd_options={}): + """ Print the load address and uuid for the process + Usage: (lldb)showtaskloadinfo + """ + if not cmd_args: + raise ArgumentError("Insufficient arguments") + t = kern.GetValueFromAddress(cmd_args[0], 'struct task *') + print_format = "0x{0:x} - 0x{1:x} {2: <50s} (??? - ???) <{3: <36s}> {4: <50s}" + p = Cast(t.bsd_info, 'struct proc *') + uuid = p.p_uuid + uuid_out_string = "{a[0]:02X}{a[1]:02X}{a[2]:02X}{a[3]:02X}-{a[4]:02X}{a[5]:02X}-{a[6]:02X}{a[7]:02X}-{a[8]:02X}{a[9]:02X}-{a[10]:02X}{a[11]:02X}{a[12]:02X}{a[13]:02X}{a[14]:02X}{a[15]:02X}".format(a=uuid) + filepath = GetVnodePath(p.p_textvp) + libname = filepath.split('/')[-1] + #print "uuid: %s file: %s" % (uuid_out_string, filepath) + mappings = FindVMEntriesForVnode(t, p.p_textvp) + load_addr = 0 + end_addr = 0 + for m in mappings: + if m[3] == 5: + load_addr = m[1] + end_addr = m[2] + #print "Load address: %s" % hex(m[1]) + print print_format.format(load_addr, end_addr, libname, uuid_out_string, filepath) + return None diff --git a/tools/lldbmacros/scheduler.py b/tools/lldbmacros/scheduler.py index 91a8df3f1..cf828c2db 100644 --- a/tools/lldbmacros/scheduler.py +++ b/tools/lldbmacros/scheduler.py @@ -4,20 +4,41 @@ # TODO: write scheduler related macros here +# Macro: showallprocrunqcount + +@lldb_command('showallprocrunqcount') +def ShowAllProcRunQCount(cmd_args=None): + """ 
Prints out the runq count for all processors + """ + out_str = "Processor\t# Runnable\n" + processor_itr = kern.globals.processor_list + while processor_itr: + out_str += "{:d}\t\t{:d}\n".format(processor_itr.cpu_id, processor_itr.runq.count) + processor_itr = processor_itr.processor_list + out_str += "RT:\t\t{:d}\n".format(kern.globals.rt_runq.count) + print out_str + +# EndMacro: showallprocrunqcount + # Macro: showinterrupts @lldb_command('showinterrupts') def ShowInterrupts(cmd_args=None): """ Prints IRQ, IPI and TMR counts for each CPU - """ + """ + + if kern.arch not in ('arm', 'arm64'): + print "showinterrupts is only supported on arm/arm64" + return + base_address = kern.GetLoadAddressForSymbol('CpuDataEntries') struct_size = 16 - for x in range (0, unsigned(kern.globals.machine_info.physical_cpu)): + for x in xrange (0, unsigned(kern.globals.machine_info.physical_cpu)): element = kern.GetValueFromAddress(base_address + (x * struct_size), 'uintptr_t *')[1] cpu_data_entry = Cast(element, 'cpu_data_t *') print "CPU {} IRQ: {:d}\n".format(x, cpu_data_entry.cpu_stat.irq_ex_cnt) print "CPU {} IPI: {:d}\n".format(x, cpu_data_entry.cpu_stat.ipi_cnt) - print "CPU {} TMR: {:d}\n".format(x, cpu_data_entry.cpu_stat.timer_cnt) + print "CPU {} TMR: {:d}\n".format(x, cpu_data_entry.cpu_stat.timer_cnt) # EndMacro: showinterrupts # Macro: showactiveinterrupts @@ -58,6 +79,54 @@ def ShowActiveInterrupts(cmd_args=None): mask = mask << 1 # EndMacro: showactiveinterrupts +# Macro: showirqbyipitimerratio +@lldb_command('showirqbyipitimerratio') +def ShowIrqByIpiTimerRatio(cmd_args=None): + """ Prints the ratio of IRQ by sum of IPI & TMR counts for each CPU + """ + if kern.arch == "x86_64": + print "This macro is not supported on x86_64 architecture" + return + + out_str = "IRQ-IT Ratio: " + base_address = kern.GetLoadAddressForSymbol('CpuDataEntries') + struct_size = 16 + for x in range (0, unsigned(kern.globals.machine_info.physical_cpu)): + element = 
kern.GetValueFromAddress(base_address + (x * struct_size), 'uintptr_t *')[1] + cpu_data_entry = Cast(element, 'cpu_data_t *') + out_str += " CPU {} [{:.2f}]".format(x, float(cpu_data_entry.cpu_stat.irq_ex_cnt)/(cpu_data_entry.cpu_stat.ipi_cnt + cpu_data_entry.cpu_stat.timer_cnt)) + print out_str + +# EndMacro: showirqbyipitimerratio + +#Macro: showinterruptsourceinfo +@lldb_command('showinterruptsourceinfo') +def showinterruptsourceinfo(cmd_args = None): + """ Extract information of interrupt source causing interrupt storms. + """ + if not cmd_args: + print "No arguments passed" + return False + #Dump IOInterruptVector object + print "--- Dumping IOInterruptVector object ---\n" + object_info = lldb_run_command("dumpobject {:s} IOInterruptVector".format(cmd_args[0])) + print object_info + print "--- Dumping IOFilterInterruptEventSource object ---\n" + #Dump the IOFilterInterruptEventSource object. + target_info=re.search('target =\s+(.*)',object_info) + target= target_info.group() + target= target.split() + #Dump the Object pointer of the source who is triggering the Interrupts. 
+ vector_info=lldb_run_command("dumpobject {:s} ".format(target[2])) + print vector_info + owner_info= re.search('owner =\s+(.*)',vector_info) + owner= owner_info.group() + owner= owner.split() + print "\n\n" + out=lldb_run_command(" dumpobject {:s}".format(owner[2])) + print out + +# EndMacro: showinterruptsourceinfo @lldb_command('showcurrentabstime') def ShowCurremtAbsTime(cmd_args=None): @@ -68,15 +137,15 @@ def ShowCurremtAbsTime(cmd_args=None): cur_abstime = 0 while unsigned(pset) != 0: - for processor in IterateQueue(pset.active_queue, "processor_t", "processor_queue"): + for processor in ParanoidIterateLinkageChain(pset.active_queue, "processor_t", "processor_queue"): if unsigned(processor.last_dispatch) > cur_abstime: cur_abstime = unsigned(processor.last_dispatch) - for processor in IterateQueue(pset.idle_queue, "processor_t", "processor_queue"): + for processor in ParanoidIterateLinkageChain(pset.idle_queue, "processor_t", "processor_queue"): if unsigned(processor.last_dispatch) > cur_abstime: cur_abstime = unsigned(processor.last_dispatch) - for processor in IterateQueue(pset.idle_secondary_queue, "processor_t", "processor_queue"): + for processor in ParanoidIterateLinkageChain(pset.idle_secondary_queue, "processor_t", "processor_queue"): if unsigned(processor.last_dispatch) > cur_abstime: cur_abstime = unsigned(processor.last_dispatch) @@ -93,21 +162,76 @@ def ShowAbstimeToNanoTime(cmd_args=[]): if not cmd_args: raise ArgumentError("Invalid argument") timedata = ArgumentStringToInt(cmd_args[0]) - print "%d ns" % kern.GetNanotimeFromAbstime(timedata) + ns = kern.GetNanotimeFromAbstime(timedata) + us = float(ns) / 1000 + ms = us / 1000 + s = ms / 1000 + + if s > 60 : + m = s / 60 + h = m / 60 + d = h / 24 + + print "{:d} ns, {:f} us, {:f} ms, {:f} s, {:f} m, {:f} h, {:f} d".format(ns, us, ms, s, m, h, d) + else: + print "{:d} ns, {:f} us, {:f} ms, {:f} s".format(ns, us, ms, s) # Macro: showschedhistory +def 
GetSchedMostRecentDispatch(show_processor_details=False): + """ Return the most recent dispatch on the system, printing processor + details if argument is true. + """ + processor_list = kern.globals.processor_list + + most_recent_dispatch = 0 + current_processor = processor_list + + while unsigned(current_processor) > 0: + active_thread = current_processor.active_thread + if unsigned(active_thread) != 0 : + task_val = active_thread.task + proc_val = Cast(task_val.bsd_info, 'proc *') + proc_name = "" if unsigned(proc_val) == 0 else str(proc_val.p_name) + + last_dispatch = unsigned(current_processor.last_dispatch) + + if kern.arch == 'x86_64': + cpu_data = kern.globals.cpu_data_ptr[current_processor.cpu_id] + if (cpu_data != 0) : + cpu_debugger_time = max(cpu_data.debugger_entry_time, cpu_data.debugger_ipi_time) + time_since_dispatch = unsigned(cpu_debugger_time - last_dispatch) + time_since_dispatch_us = kern.GetNanotimeFromAbstime(time_since_dispatch) / 1000.0 + time_since_debugger = unsigned(cpu_debugger_time - kern.globals.debugger_entry_time) + time_since_debugger_us = kern.GetNanotimeFromAbstime(time_since_debugger) / 1000.0 + + if show_processor_details: + print "Processor last dispatch: {:16d} Entered debugger: {:16d} ({:8.3f} us after dispatch, {:8.3f} us after debugger) Active thread: 0x{t:<16x} 0x{t.thread_id:<8x} {proc_name:s}".format(last_dispatch, cpu_debugger_time, + time_since_dispatch_us, time_since_debugger_us, t=active_thread, proc_name=proc_name) + else: + if show_processor_details: + print "Processor last dispatch: {:16d} Active thread: 0x{t:<16x} 0x{t.thread_id:<8x} {proc_name:s}".format(last_dispatch, t=active_thread, proc_name=proc_name) + + if last_dispatch > most_recent_dispatch: + most_recent_dispatch = last_dispatch + + current_processor = current_processor.processor_list + + return most_recent_dispatch + +@header("{:<18s} {:<10s} {:>16s} {:>16s} {:>16s} {:>18s} {:>16s} {:>16s} {:>16s} {:2s} {:2s} {:2s} {:>2s} {:<19s} {:<9s} {:>10s} 
{:>10s} {:>10s} {:>10s} {:>10s} {:>11s} {:>8s}".format("thread", "id", "on-core", "off-core", "runnable", "last-duration (us)", "since-off (us)", "since-on (us)", "pending (us)", "BP", "SP", "TP", "MP", "sched-mode", "state", "cpu-usage", "delta", "sch-usage", "stamp", "shift", "task", "thread-name")) def ShowThreadSchedHistory(thread, most_recent_dispatch): - out_str = "" + """ Given a thread and the most recent dispatch time of a thread on the + system, print out details about scheduler history for the thread. + """ + thread_name = "" - if int(thread.uthread) != 0: + if unsigned(thread.uthread) != 0: uthread = Cast(thread.uthread, 'uthread *') - #check for thread name - if int(uthread.pth_name) != 0 : - th_name_strval = Cast(uthread.pth_name, 'char *') - if len(str(th_name_strval)) > 0 : - thread_name = str(th_name_strval) + # Doing the straightforward thing blows up weirdly, so use some indirections to get back on track + if unsigned(uthread.pth_name) != 0 : + thread_name = str(kern.GetValueFromAddress(unsigned(uthread.pth_name), 'char*')) task = thread.task task_name = "unknown" @@ -143,68 +267,460 @@ def ShowThreadSchedHistory(thread, most_recent_dispatch): last_on = thread.computation_epoch last_off = thread.last_run_time + last_runnable = thread.last_made_runnable_time + + if int(last_runnable) == 18446744073709551615 : + last_runnable = 0 time_on_abs = unsigned(last_off - last_on) time_on_us = kern.GetNanotimeFromAbstime(time_on_abs) / 1000.0 + time_pending_abs = unsigned(most_recent_dispatch - last_runnable) + time_pending_us = kern.GetNanotimeFromAbstime(time_pending_abs) / 1000.0 + + if int(last_runnable) == 0 : + time_pending_us = 0 + time_since_off_abs = unsigned(most_recent_dispatch - last_off) time_since_off_us = kern.GetNanotimeFromAbstime(time_since_off_abs) / 1000.0 time_since_on_abs = unsigned(most_recent_dispatch - last_on) time_since_on_us = kern.GetNanotimeFromAbstime(time_since_on_abs) / 1000.0 - fmt = "0x{t:<16x} 0x{t.thread_id:<8x} 
{t.computation_epoch:16d} {t.last_run_time:16d} {time_on_us:16.3f} {time_since_off_us:16.3f} {time_since_on_us:16.3f}" + fmt = "0x{t:<16x} 0x{t.thread_id:<8x} {t.computation_epoch:16d} {t.last_run_time:16d} {last_runnable:16d} {time_on_us:18.3f} {time_since_off_us:16.3f} {time_since_on_us:16.3f} {time_pending_us:16.3f}" fmt2 = " {t.base_pri:2d} {t.sched_pri:2d} {t.task_priority:2d} {t.max_priority:2d} {sched_mode:19s}" fmt3 = " {state:9s} {t.cpu_usage:10d} {t.cpu_delta:10d} {t.sched_usage:10d} {t.sched_stamp:10d} {t.pri_shift:10d} {name:s} {thread_name:s}" - out_str = fmt.format(t=thread, sched_mode=sched_mode, time_on_us=time_on_us, time_since_off_us=time_since_off_us, time_since_on_us=time_since_on_us) + out_str = fmt.format(t=thread, time_on_us=time_on_us, time_since_off_us=time_since_off_us, time_since_on_us=time_since_on_us, last_runnable=last_runnable, time_pending_us=time_pending_us) out_str += fmt2.format(t=thread, sched_mode=sched_mode) out_str += fmt3.format(t=thread, state=state_str, name=task_name, thread_name=thread_name) - - return out_str + + print out_str @lldb_command('showschedhistory') def ShowSchedHistory(cmd_args=None): """ Routine to print out thread scheduling history + Usage: showschedhistory [ ...] 
""" - print "Processors: {:d} Runnable threads: {:d} Timeshare threads: {:d} Background threads {:d}\n".format( - kern.globals.processor_avail_count, kern.globals.sched_run_count, kern.globals.sched_share_count, kern.globals.sched_background_count) + if cmd_args: + most_recent_dispatch = GetSchedMostRecentDispatch(False) - print "Mach factor: {:d} Load factor: {:d} Last sched tick {:d}\n".format( - kern.globals.sched_mach_factor, kern.globals.sched_load_average, kern.globals.sched_tick_last_abstime) + print ShowThreadSchedHistory.header + for thread_ptr in cmd_args: + thread = kern.GetValueFromAddress(ArgumentStringToInt(thread_ptr), 'thread *') + ShowThreadSchedHistory(thread, most_recent_dispatch) - print "Sched tick: {:d} Fixed shift: {:d} Pri shift: {:d} Background pri shift {:d}\n".format( - kern.globals.sched_tick, kern.globals.sched_fixed_shift, kern.globals.sched_pri_shift, kern.globals.sched_background_pri_shift) + return + + run_buckets = kern.globals.sched_run_buckets - processor_list = kern.GetGlobalVariable('processor_list') + run_count = run_buckets[GetEnumValue('sched_bucket_t::TH_BUCKET_RUN')] + fixpri_count = run_buckets[GetEnumValue('sched_bucket_t::TH_BUCKET_FIXPRI')] + share_fg_count = run_buckets[GetEnumValue('sched_bucket_t::TH_BUCKET_SHARE_FG')] + share_ut_count = run_buckets[GetEnumValue('sched_bucket_t::TH_BUCKET_SHARE_UT')] + share_bg_count = run_buckets[GetEnumValue('sched_bucket_t::TH_BUCKET_SHARE_BG')] - most_recent_dispatch = 0 - current_processor = processor_list - while unsigned(current_processor) > 0: - active_thread = current_processor.active_thread - if unsigned(active_thread) != 0 : - task_val = active_thread.task - proc_val = Cast(task_val.bsd_info, 'proc *') - proc_name = str(proc_val.p_name) + sched_pri_shifts = kern.globals.sched_pri_shifts - last_dispatch = unsigned(current_processor.last_dispatch) + share_fg_shift = sched_pri_shifts[GetEnumValue('sched_bucket_t::TH_BUCKET_SHARE_FG')] + share_ut_shift = 
sched_pri_shifts[GetEnumValue('sched_bucket_t::TH_BUCKET_SHARE_UT')] + share_bg_shift = sched_pri_shifts[GetEnumValue('sched_bucket_t::TH_BUCKET_SHARE_BG')] - print "Processor last dispatch: {last_dispatch:16d} Active thread: 0x{t:<16x} 0x{t.thread_id:<8x} {proc_name:s}".format(t=active_thread, last_dispatch=last_dispatch, proc_name=proc_name) - if last_dispatch > most_recent_dispatch : - most_recent_dispatch = last_dispatch + print "Processors: {g.processor_avail_count:d} Runnable threads: {:d} Fixpri threads: {:d}\n".format(run_count, fixpri_count, g=kern.globals) + print "FG Timeshare threads: {:d} UT Timeshare threads: {:d} BG Timeshare threads: {:d}\n".format(share_fg_count, share_ut_count, share_bg_count) + print "Mach factor: {g.sched_mach_factor:d} Load factor: {g.sched_load_average:d} Sched tick: {g.sched_tick:d} timestamp: {g.sched_tick_last_abstime:d} interval:{g.sched_tick_interval:d}\n".format(g=kern.globals) + print "Fixed shift: {g.sched_fixed_shift:d} FG shift: {:d} UT shift: {:d} BG shift: {:d}\n".format(share_fg_shift, share_ut_shift, share_bg_shift, g=kern.globals) + print "sched_pri_decay_band_limit: {g.sched_pri_decay_band_limit:d} sched_decay_usage_age_factor: {g.sched_decay_usage_age_factor:d}\n".format(g=kern.globals) - current_processor = current_processor.processor_list + if kern.arch == 'x86_64': + print "debugger_entry_time: {g.debugger_entry_time:d}\n".format(g=kern.globals) + most_recent_dispatch = GetSchedMostRecentDispatch(True) print "Most recent dispatch: " + str(most_recent_dispatch) - print "{:<18s} {:<10s} {:>16s} {:>16s} {:>16s} {:>16s} {:>16s} {:2s} {:2s} {:2s} {:>2s} {:<19s} {:<9s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:>16s} {:>16s}".format( - "thread", "id", "on-core", "off-core", "last-duration", "since-off", "since-on", "BP", "SP", "TP", "MP", "sched-mode", "state", "cpu-usage", "delta", "sch-usage", "stamp", "shift", "task", "thread-name") - + print ShowThreadSchedHistory.header for thread in 
IterateQueue(kern.globals.threads, 'thread *', 'threads'): - print ShowThreadSchedHistory(thread, most_recent_dispatch) + ShowThreadSchedHistory(thread, most_recent_dispatch) - return # EndMacro: showschedhistory + +# Macro: showallprocessors + +def ShowGroupSetSummary(runq, task_map): + """ Internal function to print summary of group run queue + params: runq - value representing struct run_queue * + """ + print " runq: count {: <10d} highq: {: <10d} urgency {: <10d}\n".format(runq.count, runq.highq, runq.urgency) + + runq_queue_i = 0 + runq_queue_count = sizeof(runq.queues)/sizeof(runq.queues[0]) + + for runq_queue_i in xrange(runq_queue_count) : + runq_queue_head = addressof(runq.queues[runq_queue_i]) + runq_queue_p = runq_queue_head.next + + if unsigned(runq_queue_p) != unsigned(runq_queue_head): + runq_queue_this_count = 0 + + for entry in ParanoidIterateLinkageChain(runq_queue_head, "sched_entry_t", "entry_links"): + runq_queue_this_count += 1 + + print " Queue [{: <#012x}] Priority {: <3d} count {:d}\n".format(runq_queue_head, runq_queue_i, runq_queue_this_count) + for entry in ParanoidIterateLinkageChain(runq_queue_head, "sched_entry_t", "entry_links"): + group_addr = unsigned(entry) - (sizeof(dereference(entry)) * unsigned(entry.sched_pri)) + group = kern.GetValueFromAddress(unsigned(group_addr), 'sched_group_t') + task = task_map.get(unsigned(group), 0x0) + if task == 0x0 : + print "Cannot find task for group: {: <#012x}".format(group) + print "\tEntry [{: <#012x}] Priority {: <3d} Group {: <#012x} Task {: <#012x}\n".format(unsigned(entry), entry.sched_pri, unsigned(group), unsigned(task)) + +@lldb_command('showrunq') +def ShowRunq(cmd_args=None): + """ Routine to print information of a runq + Usage: showrunq + """ + + if not cmd_args: + print "No arguments passed" + print ShowRunq.__doc__ + return False + + runq = kern.GetValueFromAddress(cmd_args[0], 'struct run_queue *') + ShowRunQSummary(runq) + +def ShowRunQSummary(runq): + """ Internal function to 
print summary of run_queue + params: runq - value representing struct run_queue * + """ + print " runq: count {: <10d} highq: {: <10d} urgency {: <10d}\n".format(runq.count, runq.highq, runq.urgency) + + runq_queue_i = 0 + runq_queue_count = sizeof(runq.queues)/sizeof(runq.queues[0]) + + for runq_queue_i in xrange(runq_queue_count) : + runq_queue_head = addressof(runq.queues[runq_queue_i]) + runq_queue_p = runq_queue_head.next + + if unsigned(runq_queue_p) != unsigned(runq_queue_head): + runq_queue_this_count = 0 + + for thread in ParanoidIterateLinkageChain(runq_queue_head, "thread_t", "runq_links"): + runq_queue_this_count += 1 + + print " Queue [{: <#012x}] Priority {: <3d} count {:d}\n".format(runq_queue_head, runq_queue_i, runq_queue_this_count) + print "\t" + GetThreadSummary.header + "\n" + for thread in ParanoidIterateLinkageChain(runq_queue_head, "thread_t", "runq_links"): + print "\t" + GetThreadSummary(thread) + "\n" + if config['verbosity'] > vHUMAN : + print "\t" + GetThreadBackTrace(thread, prefix="\t\t") + "\n" + + +def ShowGrrrSummary(grrr_runq): + """ Internal function to print summary of grrr_run_queue + params: grrr_runq - value representing struct grrr_run_queue * + """ + print " GRRR Info: Count {: <10d} Weight {: <10d} Current Group {: <#012x}\n".format(grrr_runq.count, + grrr_runq.weight, grrr_runq.current_group) + grrr_group_i = 0 + grrr_group_count = sizeof(grrr_runq.groups)/sizeof(grrr_runq.groups[0]) + for grrr_group_i in xrange(grrr_group_count) : + grrr_group = addressof(grrr_runq.groups[grrr_group_i]) + if grrr_group.count > 0: + print " Group {: <3d} [{: <#012x}] ".format(grrr_group.index, grrr_group) + print "Count {:d} Weight {:d}\n".format(grrr_group.count, grrr_group.weight) + grrr_group_client_head = addressof(grrr_group.clients) + print GetThreadSummary.header + for thread in ParanoidIterateLinkageChain(grrr_group_client_head, "thread_t", "runq_links"): + print "\t" + GetThreadSummary(thread) + "\n" + if config['verbosity'] > 
vHUMAN : + print "\t" + GetThreadBackTrace(thread, prefix="\t\t") + "\n" + +def ShowNextThread(processor): + if (processor.next_thread != 0) : + print " " + "Next thread:\n" + print "\t" + GetThreadSummary.header + "\n" + print "\t" + GetThreadSummary(processor.next_thread) + "\n" + +def ShowActiveThread(processor): + if (processor.active_thread != 0) : + print "\t" + GetThreadSummary.header + "\n" + print "\t" + GetThreadSummary(processor.active_thread) + "\n" + +@lldb_command('showallprocessors') +@lldb_command('showscheduler') +def ShowScheduler(cmd_args=None): + """ Routine to print information of all psets and processors + Usage: showscheduler + """ + pset = addressof(kern.globals.pset0) + show_grrr = 0 + show_priority_runq = 0 + show_priority_pset_runq = 0 + show_group_pset_runq = 0 + sched_string = str(kern.globals.sched_current_dispatch.sched_name) + + if sched_string == "traditional": + show_priority_runq = 1 + elif sched_string == "traditional_with_pset_runqueue": + show_priority_pset_runq = 1 + elif sched_string == "grrr": + show_grrr = 1 + elif sched_string == "multiq": + show_priority_runq = 1 + show_group_pset_runq = 1 + elif sched_string == "dualq": + show_priority_pset_runq = 1 + show_priority_runq = 1 + else : + print "Unknown sched_string {:s}".format(sched_string) + + print "Scheduler: {:s} ({:s})\n".format(sched_string, + kern.Symbolicate(unsigned(kern.globals.sched_current_dispatch))) + + run_buckets = kern.globals.sched_run_buckets + + run_count = run_buckets[GetEnumValue('sched_bucket_t::TH_BUCKET_RUN')] + fixpri_count = run_buckets[GetEnumValue('sched_bucket_t::TH_BUCKET_FIXPRI')] + share_fg_count = run_buckets[GetEnumValue('sched_bucket_t::TH_BUCKET_SHARE_FG')] + share_ut_count = run_buckets[GetEnumValue('sched_bucket_t::TH_BUCKET_SHARE_UT')] + share_bg_count = run_buckets[GetEnumValue('sched_bucket_t::TH_BUCKET_SHARE_BG')] + + print "Processors: {g.processor_avail_count:d} Runnable threads: {:d} Fixpri threads: {:d}\n".format(run_count, 
fixpri_count, g=kern.globals) + print "FG Timeshare threads: {:d} UT Timeshare threads: {:d} BG Timeshare threads: {:d}\n".format(share_fg_count, share_ut_count, share_bg_count) + + if show_group_pset_runq: + print "multiq scheduler config: deep-drain {g.deep_drain:d}, ceiling {g.drain_ceiling:d}, depth limit {g.drain_depth_limit:d}, band limit {g.drain_band_limit:d}, sanity check {g.multiq_sanity_check:d}\n".format(g=kern.globals) + + # Create a group->task mapping + task_map = {} + for task in kern.tasks: + task_map[unsigned(task.sched_group)] = task + for task in kern.terminated_tasks: + task_map[unsigned(task.sched_group)] = task + + print " \n" + + while unsigned(pset) != 0: + print "Processor Set {: <#012x} Count {:d} (cpu_id {:<#x}-{:<#x})\n".format(pset, + pset.cpu_set_count, pset.cpu_set_low, pset.cpu_set_hi) + + if show_priority_pset_runq: + runq = pset.pset_runq + ShowRunQSummary(runq) + + if show_group_pset_runq: + print "Main Runq:\n" + runq = pset.pset_runq + ShowGroupSetSummary(runq, task_map) + print "All Groups:\n" + # TODO: Possibly output task header for each group + for group in IterateQueue(kern.globals.sched_groups, "sched_group_t", "sched_groups"): + if (group.runq.count != 0) : + task = task_map.get(unsigned(group), "Unknown task!") + print "Group {: <#012x} Task {: <#012x}\n".format(unsigned(group), unsigned(task)) + ShowRunQSummary(group.runq) + print " \n" + + print "Active Processors:\n" + for processor in ParanoidIterateLinkageChain(pset.active_queue, "processor_t", "processor_queue"): + print " " + GetProcessorSummary(processor) + ShowActiveThread(processor) + ShowNextThread(processor) + + if show_priority_runq: + runq = processor.runq + ShowRunQSummary(runq) + if show_grrr: + grrr_runq = processor.grrr_runq + ShowGrrrSummary(grrr_runq) + print " \n" + + + print "Idle Processors:\n" + for processor in ParanoidIterateLinkageChain(pset.idle_queue, "processor_t", "processor_queue"): + print " " + GetProcessorSummary(processor) + 
ShowActiveThread(processor) + ShowNextThread(processor) + + if show_priority_runq: + ShowRunQSummary(processor.runq) + print " \n" + + + print "Idle Secondary Processors:\n" + for processor in ParanoidIterateLinkageChain(pset.idle_secondary_queue, "processor_t", "processor_queue"): + print " " + GetProcessorSummary(processor) + ShowActiveThread(processor) + ShowNextThread(processor) + + if show_priority_runq: + ShowRunQSummary(processor.runq) + print " \n" + + + pset = pset.pset_list + + print "\nRealtime Queue ({:<#012x}) Count {:d}\n".format(addressof(kern.globals.rt_runq.queue), kern.globals.rt_runq.count) + if kern.globals.rt_runq.count != 0: + print "\t" + GetThreadSummary.header + "\n" + for rt_runq_thread in ParanoidIterateLinkageChain(kern.globals.rt_runq.queue, "thread_t", "runq_links"): + print "\t" + GetThreadSummary(rt_runq_thread) + "\n" + + print "\nTerminate Queue: ({:<#012x})\n".format(addressof(kern.globals.thread_terminate_queue)) + first = False + for thread in ParanoidIterateLinkageChain(kern.globals.thread_terminate_queue, "thread_t", "runq_links"): + if not first: + print "\t" + GetThreadSummary.header + "\n" + first = True + print "\t" + GetThreadSummary(thread) + "\n" + + print "\nCrashed Threads Queue: ({:<#012x})\n".format(addressof(kern.globals.crashed_threads_queue)) + first = False + for thread in ParanoidIterateLinkageChain(kern.globals.crashed_threads_queue, "thread_t", "runq_links"): + if not first: + print "\t" + GetThreadSummary.header + "\n" + first = True + print "\t" + GetThreadSummary(thread) + "\n" + + print "\nWaiting For Kernel Stacks Queue: ({:<#012x})\n".format(addressof(kern.globals.thread_stack_queue)) + first = False + for thread in ParanoidIterateLinkageChain(kern.globals.thread_stack_queue, "thread_t", "runq_links"): + if not first: + print "\t" + GetThreadSummary.header + "\n" + first = True + print "\t" + GetThreadSummary(thread) + "\n" + + print "\n" + + print "\n" + +# EndMacro: showallprocessors + + +def 
ParanoidIterateLinkageChain(queue_head, element_type, field_name, field_ofst=0): + """ Iterate over a Linkage Chain queue in kernel of type queue_head_t. (osfmk/kern/queue.h method 1) + This is equivalent to the qe_foreach_element() macro + Blows up aggressively and descriptively when something goes wrong iterating a queue. + Prints correctness errors, and throws exceptions on 'cannot proceed' errors + If this is annoying, set the global 'enable_paranoia' to false. + + params: + queue_head - value : Value object for queue_head. + element_type - lldb.SBType : pointer type of the element which contains the queue_chain_t. Typically its structs like thread, task etc.. + - str : OR a string describing the type. ex. 'task *' + field_name - str : Name of the field (in element) which holds a queue_chain_t + field_ofst - int : offset from the 'field_name' (in element) which holds a queue_chain_t + This is mostly useful if a particular element contains an array of queue_chain_t + returns: + A generator does not return. It is used for iterating. + value : An object thats of type (element_type). 
Always a pointer object + example usage: + for thread in IterateQueue(kern.globals.threads, 'thread *', 'threads'): + print thread.thread_id + """ + + if type(element_type) is str: + element_type = gettype(element_type) + + # Some ways of constructing a queue head seem to end up with the + # struct object as the value and not a pointer to the struct head + # In that case, addressof will give us a pointer to the struct, which is what we need + if not queue_head.GetSBValue().GetType().IsPointerType() : + queue_head = addressof(queue_head) + + # Mosh the value into a brand new value, to really get rid of its old cvalue history + queue_head = kern.GetValueFromAddress(unsigned(queue_head), 'struct queue_entry *') + + if unsigned(queue_head) == 0: + if ParanoidIterateLinkageChain.enable_paranoia: + print "bad queue_head_t: {:s}".format(queue_head) + return + + if element_type.IsPointerType(): + elem_ofst = getfieldoffset(element_type.GetPointeeType(), field_name) + field_ofst + else: + elem_ofst = getfieldoffset(element_type, field_name) + field_ofst + + try: + link = queue_head.next + last_link = queue_head + try_read_next = unsigned(queue_head.next) + except: + print "Exception while looking at queue_head: {:>#18x}".format(unsigned(queue_head)) + raise + + if ParanoidIterateLinkageChain.enable_paranoia: + if unsigned(queue_head.next) == 0: + raise ValueError("NULL next pointer on head: queue_head {:>#18x} next: {:>#18x} prev: {:>#18x}".format(queue_head, queue_head.next, queue_head.prev)) + if unsigned(queue_head.prev) == 0: + print "NULL prev pointer on head: queue_head {:>#18x} next: {:>#18x} prev: {:>#18x}".format(queue_head, queue_head.next, queue_head.prev) + if unsigned(queue_head.next) == unsigned(queue_head) and unsigned(queue_head.prev) != unsigned(queue_head): + print "corrupt queue_head {:>#18x} next: {:>#18x} prev: {:>#18x}".format(queue_head, queue_head.next, queue_head.prev) + + if ParanoidIterateLinkageChain.enable_debug : + print "starting at queue_head 
{:>#18x} next: {:>#18x} prev: {:>#18x}".format(queue_head, queue_head.next, queue_head.prev) + + addr = 0 + obj = 0 + + try: + while (unsigned(queue_head) != unsigned(link)): + if ParanoidIterateLinkageChain.enable_paranoia: + if unsigned(link.next) == 0: + raise ValueError("NULL next pointer: queue_head {:>#18x} link: {:>#18x} next: {:>#18x} prev: {:>#18x}".format(queue_head, link, link.next, link.prev)) + if unsigned(link.prev) == 0: + print "NULL prev pointer: queue_head {:>#18x} link: {:>#18x} next: {:>#18x} prev: {:>#18x}".format(queue_head, link, link.next, link.prev) + if unsigned(last_link) != unsigned(link.prev): + print "Corrupt prev pointer: queue_head {:>#18x} link: {:>#18x} next: {:>#18x} prev: {:>#18x} prev link: {:>#18x} ".format( + queue_head, link, link.next, link.prev, last_link) + + addr = unsigned(link) - unsigned(elem_ofst); + obj = kern.GetValueFromAddress(addr, element_type) + if ParanoidIterateLinkageChain.enable_debug : + print "yielding link: {:>#18x} next: {:>#18x} prev: {:>#18x} addr: {:>#18x} obj: {:>#18x}".format(link, link.next, link.prev, addr, obj) + yield obj + last_link = link + link = link.next + except: + exc_info = sys.exc_info() + try: + print "Exception while iterating queue: {:>#18x} link: {:>#18x} addr: {:>#18x} obj: {:>#18x} last link: {:>#18x}".format(queue_head, link, addr, obj, last_link) + except: + import traceback + traceback.print_exc() + raise exc_info[0], exc_info[1], exc_info[2] + +ParanoidIterateLinkageChain.enable_paranoia = True +ParanoidIterateLinkageChain.enable_debug = False + +# Macro: showallcallouts +@lldb_command('showallcallouts') +def ShowAllCallouts(cmd_args=None): + """ Prints out the pending and delayed thread calls for high priority thread call group + """ + # Get the high priority thread's call group + g = addressof(kern.globals.thread_call_groups[0]) + pq = addressof(g.pending_queue) + dq = addressof(g.delayed_queue) + + print "Active threads: {:d}\n".format(g.active_count) + print "Idle 
threads: {:d}\n".format(g.idle_count) + print "Pending threads: {:d}\n".format(g.pending_count) + + call = Cast(pq.next, 'thread_call_t') + while unsigned(call) != unsigned(pq): + print "Callout: " + kern.Symbolicate([unsigned(call.tc_call.func)]) + "\n" + call = Cast(call.tc_call.q_link.next, 'thread_call_t') + + print "\nDelayed:\n" + call = Cast(dq.next, 'thread_call_t') + while unsigned(call) != unsigned(dq): + out_str = "Deadline: {:>22d}. Callout: {:#x} <".format(call.tc_call.deadline, unsigned(call.tc_call.func)) + print out_str + kern.Symbolicate(unsigned(call.tc_call.func)) + ">\n" + call = Cast(call.tc_call.q_link.next, 'thread_call_t') + +# EndMacro: showallcallouts + diff --git a/tools/lldbmacros/structanalyze.py b/tools/lldbmacros/structanalyze.py index f4c21553d..467e2018d 100644 --- a/tools/lldbmacros/structanalyze.py +++ b/tools/lldbmacros/structanalyze.py @@ -1,7 +1,7 @@ import lldb from xnu import * -def _showStructPacking(symbol, prefix, begin_offset=0): +def _showStructPacking(symbol, prefix, begin_offset=0, typedef=None): """ recursively parse the field members of structure. 
params : symbol (lldb.SBType) reference to symbol in binary @@ -13,7 +13,11 @@ def _showStructPacking(symbol, prefix, begin_offset=0): ctype = "union" if symbol.GetTypeClass() == lldb.eTypeClassStruct : ctype = "struct" - outstr = "[%4d] (%s) %s { " % (symbol.GetByteSize(), ctype, symbol.GetName()) + "\n" + + if typedef: + outstr = "[%4d] (%s) (%s) %s { " % (symbol.GetByteSize(), typedef, ctype, symbol.GetName()) + "\n" + else : + outstr = "[%4d] (%s) %s { " % (symbol.GetByteSize(), ctype, symbol.GetName()) + "\n" numFields = symbol.GetNumberOfFields() _has_memory_hole = False _compact_size = 0 # asuming the struct is perfectly packed @@ -33,7 +37,14 @@ def _showStructPacking(symbol, prefix, begin_offset=0): warningstr = " *** Possible memory hole ***" _compact_offset = m_offset _compact_offset += m_size - if m_type.GetTypeClass() == lldb.eTypeClassStruct or m_type.GetTypeClass() == lldb.eTypeClassUnion : + + _type_class = m_type.GetTypeClass() + _canonical_type = m_type.GetCanonicalType() + _canonical_type_class = m_type.GetCanonicalType().GetTypeClass() + + if _type_class == lldb.eTypeClassTypedef and (_canonical_type_class == lldb.eTypeClassStruct or _canonical_type_class == lldb.eTypeClassUnion) : + outstr += prefix + ("*%4d," % m_offset) + _showStructPacking(_canonical_type, prefix+" ", m_offset, str(m_type)) + warningstr + debugstr + "\n" + elif _type_class == lldb.eTypeClassStruct or _type_class == lldb.eTypeClassUnion : outstr += prefix + ("*%4d," % m_offset) + _showStructPacking(m_type, prefix+" ", m_offset) + warningstr + debugstr + "\n" else: outstr += prefix + ("+%4d,[%4d] (%s) %s" % (m_offset, m_size, m_type.GetName(), m_name)) + warningstr + debugstr + "\n" diff --git a/tools/lldbmacros/usertaskdebugging/__init__.py b/tools/lldbmacros/usertaskdebugging/__init__.py new file mode 100644 index 000000000..f4419aa0f --- /dev/null +++ b/tools/lldbmacros/usertaskdebugging/__init__.py @@ -0,0 +1 @@ +""" Internal modules which should not be open sourced """ 
diff --git a/tools/lldbmacros/usertaskdebugging/gdbserver.py b/tools/lldbmacros/usertaskdebugging/gdbserver.py new file mode 100644 index 000000000..19b871adb --- /dev/null +++ b/tools/lldbmacros/usertaskdebugging/gdbserver.py @@ -0,0 +1,228 @@ +import logging +from interface import Interface +import rsprotocol +import random + + +class GDBServer(object): + """instance of gdbserver""" + def __init__(self, backing_instance): + super(GDBServer, self).__init__() + self.process = backing_instance + self.portnum = random.randint(2000, 8000) + logging.info("Starting gdb server for localhost:%d" % self.portnum) + self.conn = Interface('localhost', self.portnum) + self.version_string = 'name:kdbserver;version:0.1' + + def run(self): + if not self.conn.connect(): + logging.critical("No client connected. Bailing.") + return False + + logging.debug('Starting gdb server.') + + while True: + #loop for running the server. + #read command + readBytes = "" + + while True: + try: + p_bytes = self.conn.read() + except Exception, e: + logging.warn("found exception in read %s" % (str(e))) + logging.debug("currentbytes: %s" % readBytes) + readBytes = '' + break + readBytes += p_bytes + p_begin = readBytes.find('$') + p_end = readBytes.find('#') + if p_begin >= 0 and p_end >= 0 and p_end > p_begin: + break + #if empty message or acks just ignore + if readBytes in ('', '+'): + logging.debug('ignoring message: %s' % readBytes) + continue + req_msg = rsprotocol.Message.fromRSPByteData(readBytes) + resp = self.handleMessage(req_msg) + #in case resp is to detach + if resp is None: + return True + for r_msg in resp: + logging.debug("response: %s" % r_msg.getRSPByteData()) + self.conn.write(r_msg.getRSPByteData()) + return True + + def handleMessage(self, msg): + """ return array of messages that needs to responded. 
""" + query = msg.getData() + replymsgs = [] + sendAck = None + logging.debug('RCV:' + query) + + if query == "?": + h_msg = rsprotocol.Message(self.process.getSignalInfo()) + replymsgs.append(h_msg) + + elif query[0] == 'm': + replymsgs.append(self.getMemory(query)) + + elif query in ('qVAttachOrWaitSupported'): + logging.debug('Ignoring query %s' % query) + replymsgs.append(rsprotocol.UnSupportedMessage) + + elif query == "qC": + replymsgs.append(self.getCurrentThreadID(query)) + + elif query[0] in ('z', 'Z'): + logging.debug('Ignoring breakpoint query %s' % query) + replymsgs.append(rsprotocol.UnSupportedMessage) + + elif query[0] in ('g', 'p'): + replymsgs.append(self.getRegisterData(query)) + + elif query[0] in ('P', 'G'): + # we do not support writing into registers + replymsgs.append(rsprotocol.Message('E05')) + + elif query in ('QStartNoAckMode'): + replymsgs.append(rsprotocol.OKMessage) + sendAck = True + + elif query in ('QListThreadsInStopReply', 'QThreadSuffixSupported'): + replymsgs.append(rsprotocol.OKMessage) + + elif query == 'qGDBServerVersion': + replymsgs.append(rsprotocol.Message(self.version_string)) + + elif query == 'qShlibInfoAddr': + #return shared library info address if any + replymsgs.append(self.getSharedLibInfoAddress(query)) + + elif query == 'qProcessInfo': + replymsgs.append(self.getProcessInfo(query)) + + elif query == 'qHostInfo': + h_msg = rsprotocol.Message(self.process.getHostInfo()) + replymsgs.append(h_msg) + + elif query == 'vCont?': + replymsgs.append(rsprotocol.Message('vCont;')) + + elif query == 'D': + logging.info('Client requested to detach.') + return None + + elif query.find('qRegisterInfo') >= 0: + replymsgs.append(self.getRegisterInfo(query)) + + elif query.find('qMemoryRegionInfo') >= 0: + replymsgs.append(self.getMemoryRegionInfo(query)) + + elif query.find('qThreadStopInfo') >= 0 or query in ('qfThreadInfo', 'qsThreadInfo'): + replymsgs.append(self.getThreadRegistersInfo(query)) + + else: + 
replymsgs.append(rsprotocol.UnSupportedMessage) + + if sendAck is not None: + if sendAck: + replymsgs.insert(0, rsprotocol.AckMessage) + else: + replymsgs.insert(0, rsprotocol.NAckMessage) + + return replymsgs + + def getThreadRegistersInfo(self, query): + bytes = '' + if query == 'qfThreadInfo': + bytes = self.process.getFirstThreadInfo() + elif query == 'qsThreadInfo': + bytes = self.process.getSubsequestThreadInfo() + else: + try: + query = query.replace('qThreadStopInfo', '') + tid = int(query, 16) + bytes = self.process.getThreadStopInfo(tid) + except Exception, e: + logging.error("Failed to get register information query: %s error: %s" % (query, e.message)) + return rsprotocol.Message(bytes) + + def getRegisterData(self, query): + if query[0] == 'g': + #TODO should implement thissometime. Considering getThreadRegistersInfo is there + #we wont need this one. + return rsprotocol.UnSupportedMessage + + #the query is of type p;thread:; + bytes = '' + try: + args = query[1:].split(';') + if len(args) > 0: + regnum = int(args[0], 16) + if args[1].find('thread') >= 0: + threadid = int(args[1].split(':')[-1], 16) + bytes = self.process.getRegisterDataForThread(threadid, regnum) + logging.debug('REGISTER INFO bytes = ' + bytes) + except Exception, e: + logging.error("Failed to get register information query: %s error: %s" % (query, e.message)) + return rsprotocol.Message(bytes) + + def getRegisterInfo(self, query): + bytes = '' + try: + query = query.replace('qRegisterInfo', '') + regnum = int(query, 16) + bytes = self.process.getRegisterInfo(regnum) + except Exception, e: + logging.error("Failed to get register information error: %s" % e.message) + return rsprotocol.Message(bytes) + + def getMemory(self, query): + query = query[1:] + addr, size = query.split(',') + mem_address = int(addr, 16) + mem_size = int(size, 16) + bytes = '' + try: + bytes = self.process.readMemory(mem_address, mem_size) + except Exception, e: + logging.warn('Failed to read data %s' % str(e)) 
+ return rsprotocol.Message('E03') + return rsprotocol.Message(bytes) + + def getMemoryRegionInfo(self, query): + return rsprotocol.UnSupportedMessage + + def setMemory(self, query): + logging.info('Not supporting writing to memory. %s' % query) + return rsprotocol.Message('E09') + + def getProcessInfo(self, query): + data = '' + try: + data = self.process.getProcessInfo() + except Exception, e: + logging.error("Failed to get process information") + return rsprotocol.Message(data) + + def getSharedLibInfoAddress(self, query): + data = 'E44' + try: + data = self.process.getSharedLibInfoAddress() + data = self.process.encodeThreadID(data) + except Exception, e: + logging.error("Failed to get Shared Library information") + return rsprotocol.Message(data) + + def getCurrentThreadID(self, query): + tid = '0' + try: + tid = '%x' % (self.process.getCurrentThreadID()) + except Exception, e: + logging.error("Failed to get QC info") + + return rsprotocol.Message('QC'+tid) + + def kill(self): + pass diff --git a/tools/lldbmacros/usertaskdebugging/interface.py b/tools/lldbmacros/usertaskdebugging/interface.py new file mode 100644 index 000000000..590541f4b --- /dev/null +++ b/tools/lldbmacros/usertaskdebugging/interface.py @@ -0,0 +1,63 @@ +import logging +import socket +import select + +class Interface(object): + """Basic communication interface.""" + def __init__(self, host_cfg, portnum): + super(Interface, self).__init__() + self.host_cfg = host_cfg + self.portnum = portnum + self.pkt_size = 8192 + self.socket = None + self.isblocking = True + logging.debug("created %s" % str(self)) + + def connect(self): + self.socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + self.socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + self.socket.bind((self.host_cfg, self.portnum)) + logging.debug("Initializing network interface for communication host: %s:%d", self.host_cfg, self.portnum) + self.socket.listen(5) + num_retries = 3 + while num_retries > 0: + ra,wa,ea = 
select.select([self.socket], [], [], 30) + if not ra: + num_retries -= 1 + logging.error("select returned empty list") + continue + self.connection, addr = self.socket.accept() + logging.info("Connected to client from %s" % str(addr)) + return True + logging.error("Failed to connect. Exiting after multiple attempts.") + return False + + def read(self): + if self.isblocking: + #BUG TODO make this unblocking soon + #logging.warn("blocking read bug") + self.connection.settimeout(15) + self.isblocking = False + r_bytes = '' + try: + r_bytes = self.connection.recv(self.pkt_size) + except Exception, e: + #logging.debug("Found exception in recv. %s " % (str(e))) + pass + + return r_bytes + + def write(self, bytes): + if not self.isblocking: + self.connection.setblocking(1) + self.isblocking = True + return self.connection.send(bytes) + + def close(self): + if self.connection: + logging.debug('closing connection.') + self.connection.close() + return self.socket + + def __str__(self): + return "interface: %s %d" % (self.host_cfg, self.portnum) diff --git a/tools/lldbmacros/usertaskdebugging/rsprotocol.py b/tools/lldbmacros/usertaskdebugging/rsprotocol.py new file mode 100644 index 000000000..fa4fc7a28 --- /dev/null +++ b/tools/lldbmacros/usertaskdebugging/rsprotocol.py @@ -0,0 +1,62 @@ +import logging + + +class Message(object): + """represents a message of Remote serial protocol""" + def __init__(self, data): + super(Message, self).__init__() + self.data = data + + def __str__(self): + return "Message: %s" % (self.data) + + def getData(self): + #TODO need to parse data and unescape + return self.data + + def getRSPByteData(self): + retval = ''.join(['$',self.data,'#']) + checksum = 0 + for i in self.data: + checksum += ord(i) + checksum = checksum % 0x100 + checksum_str = "{:02x}".format(checksum) + retval += checksum_str + return retval + + @classmethod + def fromRSPByteData(cls, bytedata): + data_begin = 0 + data_end = 0 + try: + data_begin = bytedata.index('$') + 
data_end = bytedata.index('#') + except ValueError, e: + logging.error('Invalid bytedata considered as message %s' % bytedata) + return None + + #validate the data + if data_begin + 1 >= data_end: + logging.debug("empty message %s"%bytedata) + data_begin -= 1 + + data_begin += 1 + logging.debug("Creating message from data %s" % bytedata[data_begin:data_end]) + ret_obj = cls(bytedata[data_begin:data_end]) + return ret_obj + +class ProtocolAcknowledgement(Message): + """Ack Messages""" + def __init__(self, ack_str): + super(ProtocolAcknowledgement, self).__init__(ack_str) + self.data = ack_str + + def getRSPByteData(self): + return self.data + + +OKMessage = Message('OK') + +AckMessage = ProtocolAcknowledgement('+') +NAckMessage = ProtocolAcknowledgement('-') +UnSupportedMessage = Message('') diff --git a/tools/lldbmacros/usertaskdebugging/target.py b/tools/lldbmacros/usertaskdebugging/target.py new file mode 100644 index 000000000..9a2059268 --- /dev/null +++ b/tools/lldbmacros/usertaskdebugging/target.py @@ -0,0 +1,141 @@ +import logging +import struct + + +class Process(object): + """Base interface for process being debugged. Provides basic functions for gdbserver to interact. + Create a class object for your backing system to provide functionality + + Here is the list of must implement functions: + + please update hinfo['ostype'] and hinfo['vendor'] if its not in (macosx, ios) + + please populate threads_ids_list with ids of threads. 
+ - getThreadStopInfo + - getProcessInfo + - getRegisterDataForThread + - getRegisterInfo + - readMemory + """ + def __init__(self, cputype, cpusubtype, ptrsize): + super(Process, self).__init__() + self.hinfo = { + 'cputype': cputype, 'cpusubtype': cpusubtype, + 'triple': None, 'vendor': 'apple', 'ostype': 'macosx', + 'endian': 'little', 'ptrsize': ptrsize, 'hostname': None, 'os_build': None, + 'os_kernel': None, 'os_version': None, 'watchpoint_exceptions_received': None, + 'default_packet_timeout': '10', 'distribution_id': None + } + + # if cputype is arm assume its ios + if (cputype & 0xc) != 0xc: + self.hinfo['ostype'] = 'ios' + self.ptrsize = ptrsize + self.threads = {} + self.threads_ids_list = [] + + def getHostInfo(self): + retval = '' + for i in self.hinfo.keys(): + if self.hinfo[i] is None: + continue + retval += '%s:%s;' % (str(i), str(self.hinfo[i])) + return retval + + def getRegisterDataForThread(self, th_id, reg_num): + logging.critical("Not Implemented: getRegisterDataForThread") + return '' + + def readMemory(self, address, size): + logging.critical("readMemory: Not Implemented: readMemory") + #E08 means read failed + return 'E08' + + def writeMemory(self, address, data, size): + """ Unimplemented. address in ptr to save data to. data is native endian stream of bytes, + """ + return 'E09' + + def getRegisterInfo(regnum): + #something similar to + #"name:x1;bitsize:64;offset:8;encoding:uint;format:hex;gcc:1;dwarf:1;set:General Purpose Registers;" + logging.critical("getRegisterInfo: Not Implemented: getRegisterInfo") + return 'E45' + + def getProcessInfo(self): + logging.critical("Not Implemented: qProcessInfo") + return '' + + def getFirstThreadInfo(self): + """ describe all thread ids in the process. 
+ """ + thinfo_str = self.getThreadsInfo() + if not thinfo_str: + logging.warning('getFirstThreadInfo: Process has no threads') + return '' + return 'm' + thinfo_str + + def getSubsequestThreadInfo(self): + """ return 'l' for last because all threads are listed in getFirstThreadInfo call. + """ + return 'l' + + def getSharedLibInfoAddress(self): + """ return int data of a hint where shared library is loaded. + """ + logging.critical("Not Implemented: qShlibInfoAddr") + raise NotImplementedError('getSharedLibInfoAddress is not Implemented') + + def getSignalInfo(self): + # return the signal info in required format. + return "T02" + "threads:" + self.getThreadsInfo() + ';' + + def getThreadsInfo(self): + """ returns ',' separeted values of thread ids """ + retval = '' + first = True + for tid in self.threads_ids_list: + if first is True: + first = False + retval += self.encodeThreadID(tid) + else: + retval += ',%s' % self.encodeThreadID(tid) + return retval + + def getCurrentThreadID(self): + """ returns int thread id of the first stopped thread + if subclass supports thread switching etc then + make sure to re-implement this funciton + """ + if self.threads_ids_list: + return self.threads_ids_list[0] + return 0 + + def getThreadStopInfo(self, th_id): + """ returns stop signal and some thread register info. + """ + logging.critical("getThreadStopInfo: Not Implemented. 
returning basic info.") + + return 'T02thread:%s' % self.encodeThreadID(th_id) + + def encodeRegisterData(self, intdata, bytesize=None): + """ return an encoded string for unsigned int intdata + based on the bytesize and endianness value + """ + if not bytesize: + bytesize = self.ptrsize + + format = ' 4: + format = '= len(self.registerset): + logging.warning("regnum %d is not defined for thread_id 0x%x" % (reg_num, self.thread_id)) + return None + return self.getRegisterValueByName(self.registerset[reg_num]['name']) + + +class UserProcess(target.Process): + """ Represent a user process and thread states """ + def __init__(self, task): + self.task = task + self.proc = Cast(task.bsd_info, 'proc_t') + dataregisters64bit = False + ptrsize = 4 + + if task.t_flags & 0x1: + ptrsize = 8 + if task.t_flags & 0x2: + dataregisters64bit = 8 + + cputype = CPU_TYPE_X86_64 + cpusubtype = CPU_SUBTYPE_X86_64_ALL + + if kern.arch in ('arm'): + cputype = CPU_TYPE_ARM + cpusubtype = CPU_SUBTYPE_ARM_V7 + elif kern.arch in ('armv8', 'arm64'): + cputype = CPU_TYPE_ARM64 + cpusubtype = CPU_SUBTYPE_ARMV8 + + super(UserProcess, self).__init__(cputype, cpusubtype, ptrsize) + + self.hinfo['ostype'] = 'macosx' + if cputype != CPU_TYPE_X86_64: + self.hinfo['ostype'] = 'ios' + + self.cputype = unsigned(self.proc.p_cputype) + self.cpusubtype = unsigned(self.proc.p_cpusubtype) + self.registerset = GetRegisterSetForCPU(cputype, cpusubtype) + logging.debug("process %s is64bit: %d ptrsize: %d cputype: %d cpusubtype:%d", + hex(self.proc), int(dataregisters64bit), ptrsize, + self.cputype, self.cpusubtype + ) + self.threads = {} + self.threads_ids_list = [] + logging.debug("iterating over threads in process") + for thval in IterateQueue(task.threads, 'thread *', 'task_threads'): + self.threads[unsigned(thval.thread_id)] = UserThreadObject(thval, self.cputype, self.cpusubtype, cputype) + self.threads_ids_list.append(unsigned(thval.thread_id)) + + def getRegisterDataForThread(self, th_id, reg_num): + if 
th_id not in self.threads: + logging.critical("0x%x thread id is not found in this task") + return '' + if reg_num < 0 or reg_num >= len(self.registerset): + logging.warning("regnum %d is not defined for thread_id 0x%x" % (reg_num, th_id)) + return '' + value = self.threads[th_id].getRegisterData(reg_num) + return self.encodeRegisterData(value, bytesize=self.registerset[reg_num]['bitsize']/8) + + def getRegisterCombinedDataForThread(self, th_id): + if th_id not in self.threads: + logging.critical("0x%x thread id is not found in this task" % th_id) + return '' + cur_thread = self.threads[th_id] + retval = 'thread:%s;name:%s;' % (self.encodeThreadID(th_id), cur_thread.getName()) + pos = 0 + for rinfo in self.registerset: + name = rinfo['name'] + format = "%02x:%s;" + value = cur_thread.getRegisterValueByName(name) + value_endian_correct_str = self.encodeRegisterData(value, bytesize=(rinfo['bitsize']/8)) + retval += format % (pos, value_endian_correct_str) + pos += 1 + return retval + + def getThreadStopInfo(self, th_id): + if th_id not in self.threads: + logging.critical("0x%x thread id is not found in this task") + return '' + return 'T02' + self.getRegisterCombinedDataForThread(th_id) + 'threads:' + self.getThreadsInfo()+';' + + def getRegisterInfo(self, regnum): + #something similar to + #"name:x1;bitsize:64;offset:8;encoding:uint;format:hex;gcc:1;dwarf:1;set:General Purpose Registers;" + if regnum > len(self.registerset): + logging.debug("No register_info for number %d." 
% regnum) + return 'E45' + + rinfo = self.registerset[regnum] + retval = '' + for i in rinfo.keys(): + i_val = str(rinfo[i]) + if i == 'set': + i_val = 'General Purpose Registers' + retval += '%s:%s;' % (str(i), i_val) + + return retval + + def getProcessInfo(self): + retval = '' + #pid:d22c;parent-pid:d34d;real-uid:ecf;real-gid:b;effective-uid:ecf;effective-gid:b;cputype:1000007;cpusubtype:3; + #ostype:macosx;vendor:apple;endian:little;ptrsize:8; + pinfo = {'effective-uid': 'ecf', 'effective-gid': 'b', 'endian': 'little', 'vendor': 'apple'} + pinfo['pid'] = "%x" % (GetProcPIDForTask(self.task)) + pinfo['parent-pid'] = "%x" % (unsigned(self.proc.p_ppid)) + pinfo['ptrsize'] = str(self.ptrsize) + pinfo['ostype'] = 'macosx' + pinfo['cputype'] = "%x" % self.cputype + pinfo['cpusubtype'] = "%x" % self.cpusubtype + pinfo['real-uid'] = "%x" % (unsigned(self.proc.p_ruid)) + pinfo['real-gid'] = "%x" % (unsigned(self.proc.p_rgid)) + if str(kern.arch).find('arm') >= 0: + pinfo['ostype'] = 'ios' + for i in pinfo.keys(): + i_val = str(pinfo[i]) + retval += '%s:%s;' % (str(i), i_val) + return retval + + def readMemory(self, address, size): + data = GetUserDataAsString(self.task, address, size) + if not data: + logging.error("Failed to read memory task:{: <#018x} {: <#018x} {:d}".format(self.task, address, size)) + return self.encodeByteString(data) + + def getSharedLibInfoAddress(self): + return unsigned(self.task.all_image_info_addr) diff --git a/tools/lldbmacros/usertaskgdbserver.py b/tools/lldbmacros/usertaskgdbserver.py index 94de936f6..29bdc5b29 100644 --- a/tools/lldbmacros/usertaskgdbserver.py +++ b/tools/lldbmacros/usertaskgdbserver.py @@ -27,3 +27,20 @@ def DoUserTaskDebuggingServer(cmd_args = [], cmd_options ={}): if not _usertaskdebugging_availabe: print "You do not have the usertask debugging files available. 
" return + log_level = logging.ERROR + if '-D' in cmd_options: + log_level = logging.DEBUG + elif '-W' in cmd_options: + log_level = logging.WARNING + + setupLogging(debug_level=log_level) + if not cmd_args: + raise ArgumentError("Please provide valid task argument.") + + t = kern.GetValueFromAddress(cmd_args[0], 'task_t') + + up = userprocess.UserProcess(t) + gbs = gdbserver.GDBServer(up) + print "Starting debug session for %s at localhost:%d." % (GetProcNameForTask(t), gbs.portnum) + gbs.run() + print "stopped the debug session" diff --git a/tools/lldbmacros/utils.py b/tools/lldbmacros/utils.py index 68161d7c3..fbb0494bc 100644 --- a/tools/lldbmacros/utils.py +++ b/tools/lldbmacros/utils.py @@ -348,7 +348,7 @@ def addDSYM(uuid, info): # modify the list to show we loaded this _dsymlist[uuid] = True -def loadDSYM(uuid, load_address): +def loadDSYM(uuid, load_address, sections=[]): """ Load an already added symbols to a particular load address params: uuid - str - uuid string load_address - int - address where to load the symbols @@ -358,9 +358,20 @@ def loadDSYM(uuid, load_address): """ if uuid not in _dsymlist: return False - cmd_str = "target modules load --uuid %s --slide %d" % ( uuid, load_address) - debuglog(cmd_str) + if not sections: + cmd_str = "target modules load --uuid %s --slide %d" % ( uuid, load_address) + debuglog(cmd_str) + else: + cmd_str = "target modules load --uuid {} ".format(uuid) + sections_str = "" + for s in sections: + sections_str += " {} {:#0x} ".format(s.name, s.vmaddr) + cmd_str += sections_str + debuglog(cmd_str) + lldb.debugger.HandleCommand(cmd_str) + return True + def RunShellCommand(command): """ Run a shell command in subprocess. 
@@ -421,3 +432,32 @@ def IsAppleInternal(): except ImportError: retval = False return retval + +def print_hex_data(data, begin_offset=0, desc=""): + """ print on stdout "hexdump -C < data" like output + params: + data - bytearray or array of int where each int < 255 + begin_offset - int offset that should be printed in left column + desc - str optional description to print on the first line to describe data + """ + if desc: + print "{}:".format(desc) + index = 0 + total_len = len(data) + hex_buf = "" + char_buf = "" + while index < total_len: + hex_buf += " {:02x}".format(data[index]) + if data[index] < 0x20 or data[index] > 0x7e: + char_buf += "." + else: + char_buf += "{:c}".format(data[index]) + index += 1 + if index and index < total_len and index % 8 == 0: + hex_buf += " " + if index > 1 and index < total_len and (index % 16) == 0: + print "{:08x} {: <50s} |{: <16s}|".format(begin_offset + index - 16, hex_buf, char_buf) + hex_buf = "" + char_buf = "" + print "{:08x} {: <50s} |{: <16s}|".format(begin_offset + index - 16, hex_buf, char_buf) + return diff --git a/tools/lldbmacros/waitq.py b/tools/lldbmacros/waitq.py index e5914d430..8bfa63919 100644 --- a/tools/lldbmacros/waitq.py +++ b/tools/lldbmacros/waitq.py @@ -29,7 +29,7 @@ def GetWaitqBitsStr(waitq): return out_str def WaitqTableElemType(e): - type = (e.wqte.wqt_bits >> 29) & 0x3 + type = (e.wqte.lt_bits >> 29) & 0x3 wqe_type = { 0: 'FREE', 1: 'ELEM', @@ -39,27 +39,27 @@ def WaitqTableElemType(e): return wqe_type[type] def WaitqTableElemId(e): - return e.wqte.wqt_id.id + return e.wqte.lt_id.id def WaitqTableElemValid(e): if unsigned(e) == 0: return 0 - return (e.wqte.wqt_bits & 0x80000000) == 0x80000000 + return (e.wqte.lt_bits & 0x80000000) == 0x80000000 def WaitqTableElemRefcnt(e): - return (e.wqte.wqt_bits & 0x1fffffff) + return (e.wqte.lt_bits & 0x1fffffff) def WaitqTableIdxFromId(id): - if hasattr(kern.globals, 'g_wqt_idx_max'): - idx = id & unsigned(kern.globals.g_wqt_idx_max) + if 
hasattr(kern.globals, 'g_lt_idx_max'): + idx = id & unsigned(kern.globals.g_lt_idx_max) else: # best guess idx = id & 0x000000000003ffff return int(idx) def WaitqTableGenFromId(id): - if hasattr(kern.globals, 'g_wqt_idx_max'): - msk = ~unsigned(kern.globals.g_wqt_idx_max) + if hasattr(kern.globals, 'g_lt_idx_max'): + msk = ~unsigned(kern.globals.g_lt_idx_max) else: # best guess msk = ~0x000000000003ffff @@ -73,19 +73,19 @@ def GetWaitqLink(id): if int(id) == 0: return 0, "NULL link id" idx = WaitqTableIdxFromId(id) - if idx >= kern.globals.g_linktable.nelem: + if idx >= kern.globals.g_wqlinktable.nelem: return 0, "Invalid waitq link table id: {:d}".format(id) - slab_slot = idx / kern.globals.g_linktable.slab_elem; - slab = kern.globals.g_linktable.table[int(slab_slot)] + slab_slot = idx / kern.globals.g_wqlinktable.slab_elem; + slab = kern.globals.g_wqlinktable.table[int(slab_slot)] if slab == 0: print "Invalid waitq link table id:", str(id), " (invalid slab)" - first_elem = Cast(slab, 'wqt_elem *') - addr = int(slab) + ((idx - first_elem.wqt_id.idx) * int(kern.globals.g_linktable.elem_sz)) - link = kern.GetValueFromAddress(addr, 'setid_link *') + first_elem = Cast(slab, 'lt_elem *') + addr = int(slab) + ((idx - first_elem.lt_id.idx) * int(kern.globals.g_wqlinktable.elem_sz)) + link = kern.GetValueFromAddress(addr, 'waitq_link *') gen = WaitqTableGenFromId(id) warn_str = '' - if gen > 0 and link.wqte.wqt_id.generation != gen: - warn_str = "WARNING: found idx:{:d}/gen:{:d}, but requested idx:{:d}/gen:{:d}".format(link.wqte.wqt_id.idx, link.wqte.wqt_id.generation, idx, gen) + if gen > 0 and link.wqte.lt_id.generation != gen: + warn_str = "WARNING: found idx:{:d}/gen:{:d}, but requested idx:{:d}/gen:{:d}".format(link.wqte.lt_id.idx, link.wqte.lt_id.generation, idx, gen) link = 0 return link, warn_str @@ -99,13 +99,13 @@ def GetWaitqPrepost(id): if slab == 0: warn_str = "Invalid waitq prepost table id:", str(id), " (invalid slab)" return 0, warn_str - first_elem = 
Cast(slab, 'wqt_elem *') - addr = int(slab) + ((idx - first_elem.wqt_id.idx) * int(kern.globals.g_prepost_table.elem_sz)) + first_elem = Cast(slab, 'lt_elem *') + addr = int(slab) + ((idx - first_elem.lt_id.idx) * int(kern.globals.g_prepost_table.elem_sz)) wqp = kern.GetValueFromAddress(addr, 'wq_prepost *') gen = WaitqTableGenFromId(id) warn_str = '' - if gen > 0 and wqp.wqte.wqt_id.generation != gen: - warn_str = "WARNING: found idx:{:d}/gen:{:d}, but requested idx:{:d}/gen:{:d}".format(wqp.wqte.wqt_id.idx, wqp.wqte.wqt_id.generation, idx, gen) + if gen > 0 and wqp.wqte.lt_id.generation != gen: + warn_str = "WARNING: found idx:{:d}/gen:{:d}, but requested idx:{:d}/gen:{:d}".format(wqp.wqte.lt_id.idx, wqp.wqte.lt_id.generation, idx, gen) wqp = 0 return wqp, warn_str @@ -123,15 +123,15 @@ def WaitqSetsFromLink(link, sets, depth): sets.append("{: <22s}".format("")) return if WaitqTableElemType(link) == "ELEM": - #sets.append("{: <#18x}".format(unsigned(link.sl_wqs.sl_set))) + #sets.append("{: <#18x}".format(unsigned(link.wql_wqs.wql_set))) #sets.append("{:>7d}/{:<#14x}".format(unsigned(id.idx),unsigned(id.generation))) - sets.append(GetWaitqSetidString(link.wqte.wqt_id.id)) + sets.append(GetWaitqSetidString(link.wqte.lt_id.id)) return if depth >= 950: sets.append("{: <22s}".format("!recursion limit!")) return - left_link = GetWaitqLink(link.sl_link.sl_left_setid)[0] - right_link = GetWaitqLink(link.sl_link.sl_right_setid)[0] + left_link = GetWaitqLink(link.wql_link.left_setid)[0] + right_link = GetWaitqLink(link.wql_link.right_setid)[0] WaitqSetsFromLink(left_link, sets, depth + 1) WaitqSetsFromLink(right_link, sets, depth + 1) return @@ -153,13 +153,13 @@ def GetFrameString(pc, compact=True): else: return re.sub(r'.*(0x[0-9a-f]+)\s+<(\w+)( \+ 0x[0-9a-f]+)*>.*', r'\2(\1)', str, re.UNICODE) -@lldb_type_summary(['setid_link', 'setid_link *']) +@lldb_type_summary(['waitq_link', 'waitq_link *']) @header("{:<18s} {:<18s} {:<19s} {:<10s} {:<1s} {:<4s} {:<10s} 
{:<20s}".format('addr','id','idx','gen','V','type','refcnt','info')) def GetWaitqSetidLinkSummary(link, verbose=False): has_stats = 0 if not link: return "" - fmt_str = "{l: <#18x} {l.wqte.wqt_id.id: <#18x} {l.wqte.wqt_id.idx: <7d} (->{l.wqte.wqt_next_idx: <7d}) {l.wqte.wqt_id.generation: <#10x} {v: <1s} {t: <4s} {rcnt: <10d} " + fmt_str = "{l: <#18x} {l.wqte.lt_id.id: <#18x} {l.wqte.lt_id.idx: <7d} (->{l.wqte.lt_next_idx: <7d}) {l.wqte.lt_id.generation: <#10x} {v: <1s} {t: <4s} {rcnt: <10d} " if hasattr(link, 'sl_alloc_task'): has_stats = 1 fmt_str += "owner:{l.sl_alloc_task: <#x}/th:{l.sl_alloc_th: <#x}\n" @@ -183,10 +183,10 @@ def GetWaitqSetidLinkSummary(link, verbose=False): refcnt = WaitqTableElemRefcnt(link) out_str = fmt_str.format(l=link, v=v, t=type, rcnt=refcnt) if type == "WQS": - out_str += "wqs:{0: <#18x}".format(unsigned(link.sl_wqs.sl_set)) + out_str += "wqs:{0: <#18x}".format(unsigned(link.wql_wqs.wql_set)) elif type == "LINK": - lID = link.sl_link.sl_left_setid - rID = link.sl_link.sl_right_setid + lID = link.wql_link.left_setid + rID = link.wql_link.right_setid left = GetWaitqLink(lID)[0] right = GetWaitqLink(rID)[0] ltype = "" @@ -241,8 +241,8 @@ def GetWaitqSetidLinkSummary(link, verbose=False): def PrintWaitqSetidLinkTree(link, verbose, sets, indent=87): if not WaitqTableElemType(link) == "LINK": return - lID = link.sl_link.sl_left_setid - rID = link.sl_link.sl_right_setid + lID = link.wql_link.left_setid + rID = link.wql_link.right_setid left = GetWaitqLink(lID)[0] right = GetWaitqLink(rID)[0] @@ -261,9 +261,9 @@ def PrintWaitqSetidLinkTree(link, verbose, sets, indent=87): rstr = "R:{:<#x}({:s})".format(rID, rtype) if ltype == "WQS": - sets.append(addressof(left.sl_wqs.sl_set.wqset_q)) + sets.append(addressof(left.wql_wqs.wql_set.wqset_q)) if rtype == "WQS": - sets.append(addressof(right.sl_wqs.sl_set.wqset_q)) + sets.append(addressof(right.wql_wqs.wql_set.wqset_q)) print "{:s}`->{:s}, {:s}".format(' '*indent, lstr, rstr) if ltype == "WQS": 
@@ -277,7 +277,7 @@ def PrintWaitqSetidLinkTree(link, verbose, sets, indent=87): # Macro: showsetidlink @lldb_command('showsetidlink', "S:FT") def ShowSetidLink(cmd_args=None, cmd_options={}): - """ Print setid_link structure summary + """ Print waitq_link structure summary Note: you can pass either a complete ID (generation + index), or just the index to the -S argument. @@ -308,26 +308,26 @@ def ShowSetidLink(cmd_args=None, cmd_options={}): followchain = 1 if link == 0: if not cmd_args: - raise ArgumentError("Please pass the address of a setid_link object") - link = kern.GetValueFromAddress(cmd_args[0], 'setid_link *') + raise ArgumentError("Please pass the address of a waitq_link object") + link = kern.GetValueFromAddress(cmd_args[0], 'waitq_link *') if not link: - raise ArgumentError("Invalid setid_link {:s}".format(cmd_args[0])) + raise ArgumentError("Invalid waitq_link {:s}".format(cmd_args[0])) print GetWaitqSetidLinkSummary.header print GetWaitqSetidLinkSummary(link, verbose) if followchain == 1: - next_id = link.wqte.wqt_next_idx - max_elem = int(kern.globals.g_linktable.nelem) - if hasattr(kern.globals, 'g_wqt_idx_max'): - max_elem = unsigned(kern.globals.g_wqt_idx_max) + next_id = link.wqte.lt_next_idx + max_elem = int(kern.globals.g_wqlinktable.nelem) + if hasattr(kern.globals, 'g_lt_idx_max'): + max_elem = unsigned(kern.globals.g_lt_idx_max) while link != 0 and next_id < max_elem: link, warn_str = GetWaitqLink(unsigned(next_id)) if link != 0: print GetWaitqSetidLinkSummary(link, verbose) - next_id = link.wqte.wqt_next_idx + next_id = link.wqte.lt_next_idx if showtree == 1: sets = [] - print "\nLinkTree:{:<#x}({:s})".format(link.wqte.wqt_id.id, WaitqTableElemType(link)) + print "\nLinkTree:{:<#x}({:s})".format(link.wqte.lt_id.id, WaitqTableElemType(link)) PrintWaitqSetidLinkTree(link, verbose, sets, 9) if len(sets) > 0: print "{:d} Sets:".format(len(sets)) @@ -345,6 +345,11 @@ def ShowSetidLink(cmd_args=None, cmd_options={}): nps = "s" print 
"\tWQS:{:<#x} ({:d} prepost{:s})".format(unsigned(wq),npreposts,nps) # EndMacro: showsetidlink +@lldb_command('showwaitqlink', "S:FT") +def ShowWaitqLink(cmd_args=None, cmd_options={}): + """ Print waitq_link structure summary + """ + ShowSetidLink(cmd_args, cmd_options) # Macro: showallsetidlinks @@ -416,7 +421,7 @@ def ShowAllSetidLinks(cmd_args=None, cmd_options={}): elif opt_type_filt == "": if not opt_subtype_filter == "iP": raise ArgumentError("Invalid sub-type filter \{desc\}: {:s}".format(opt_subtype_filter)) - table = kern.globals.g_linktable + table = kern.globals.g_wqlinktable nelem = int(table.nelem) wq_ptr = {} bt_summary = {} @@ -425,7 +430,7 @@ def ShowAllSetidLinks(cmd_args=None, cmd_options={}): nwqs = 0 nlink = 0 nrsvd = 0 - hdr_str = "Looking through {:d} setid_link objects from g_linktable@{:<#x}".format(nelem, addressof(kern.globals.g_linktable)) + hdr_str = "Looking through {:d} waitq_link objects from g_wqlinktable@{:<#x}".format(nelem, addressof(kern.globals.g_wqlinktable)) if opt_type_filt != "" or opt_valid_only != 0: hdr_str += "\n\t`-> for " if opt_valid_only: @@ -455,8 +460,8 @@ def ShowAllSetidLinks(cmd_args=None, cmd_options={}): while id < nelem: if id == 0: # Set a generation count to differentiate from an invalid ID - first_entry = Cast(kern.globals.g_linktable.table[0], 'wqt_elem *') - link = GetWaitqLink(first_entry.wqt_id.id)[0] + first_entry = Cast(kern.globals.g_wqlinktable.table[0], 'lt_elem *') + link = GetWaitqLink(first_entry.lt_id.id)[0] else: link = GetWaitqLink(id)[0] if not link: @@ -468,8 +473,8 @@ def ShowAllSetidLinks(cmd_args=None, cmd_options={}): inconsistent = 0 do_print = not ( (isvalid and opt_invalid_only) or (not isvalid and opt_valid_only) ) if do_print and opt_subtype_filter != 0 and lt == "LINK": - lID = link.sl_link.sl_left_setid - rID = link.sl_link.sl_right_setid + lID = link.wql_link.left_setid + rID = link.wql_link.right_setid left = GetWaitqLink(lID)[0] right = GetWaitqLink(rID)[0] lValid = 
WaitqTableElemValid(left) @@ -549,9 +554,9 @@ def ShowAllSetidLinks(cmd_args=None, cmd_options={}): ninconsistent += 1 # print out warnings about inconsistent state as we parse # the list - even if the caller wants a summary - print "[WARNING] inconsistent state in idx: {:d} ({:s} element)".format(link.wqte.wqt_id.idx, lt) + print "[WARNING] inconsistent state in idx: {:d} ({:s} element)".format(link.wqte.lt_id.idx, lt) if opt_cross_check == 1 and lt == "ELEM": - wq = unsigned(addressof(link.sl_wqs.sl_set.wqset_q)) + wq = unsigned(addressof(link.wql_wqs.wql_set.wqset_q)) if wq in wq_ptr: wq_ptr[wq].append(id) l = len(wq_ptr[wq]) @@ -711,7 +716,7 @@ def ShowAllPreposts(cmd_args=None, cmd_options={}): def GetWaitqPrepostSummary(wqp): if not wqp: return - fmt_str = "{w: <#18x} {w.wqte.wqt_id.id: <#18x} {w.wqte.wqt_id.idx: <7d} (->{w.wqte.wqt_next_idx: <7d}) {w.wqte.wqt_id.generation: <#10x} {v: <1s} {t: <4s} {rcnt: <10d} " + fmt_str = "{w: <#18x} {w.wqte.lt_id.id: <#18x} {w.wqte.lt_id.idx: <7d} (->{w.wqte.lt_next_idx: <7d}) {w.wqte.lt_id.generation: <#10x} {v: <1s} {t: <4s} {rcnt: <10d} " type = WaitqTableElemType(wqp) if type == "ELEM": type = "WQ" @@ -775,11 +780,11 @@ def WaitqPrepostFromObj(wqp, head_id, inv_ok, prepost_str, pp_arr = 0, depth = 0 if not WaitqTableElemValid(wqp) and not inv_ok: id = 0 if wqp: - id = wqp.wqte.wqt_id.id + id = wqp.wqte.lt_id.id prepost_str.append("{0: <#18x}:{1: <18s}".format(id, "")) return if etype == "ELEM": # WQP_WQ - prepost_str.append("{0: <#18x}:{1: <#18x}".format(wqp.wqte.wqt_id.id, unsigned(wqp.wqp_wq.wqp_wq_ptr))) + prepost_str.append("{0: <#18x}:{1: <#18x}".format(wqp.wqte.lt_id.id, unsigned(wqp.wqp_wq.wqp_wq_ptr))) return post_wq = 0 @@ -789,20 +794,20 @@ def WaitqPrepostFromObj(wqp, head_id, inv_ok, prepost_str, pp_arr = 0, depth = 0 post_wq = GetWaitqPrepost(wqp.wqp_post.wqp_wq_id)[0] if WaitqTableElemValid(post_wq): if WaitqTableElemType(post_wq) != "ELEM": - prepost_str.append("{0: <#18x}:{1: 
<18s}".format(post_wq.wqte.wqt_id.id, "")) + prepost_str.append("{0: <#18x}:{1: <18s}".format(post_wq.wqte.lt_id.id, "")) else: - prepost_str.append("{0: <#18x}:{1: <#18x}".format(wqp.wqte.wqt_id.id, unsigned(post_wq.wqp_wq.wqp_wq_ptr))) + prepost_str.append("{0: <#18x}:{1: <#18x}".format(wqp.wqte.lt_id.id, unsigned(post_wq.wqp_wq.wqp_wq_ptr))) if next_id > 0 and next_id != head_id: if depth >= 950: prepost_str.append("{: <37s}".format("!recursion limit!")) return WaitqPrepostFromObj(GetWaitqPrepost(next_id)[0], head_id, inv_ok, prepost_str, pp_arr, depth + 1) else: # "RSVD" or "FREE": - prepost_str.append("{0: <#18x} -> {1: <15d}".format(wqp.wqte.wqt_id.id, wqp.wqte.wqt_next_idx)) - next_id = wqp.wqte.wqt_next_idx + prepost_str.append("{0: <#18x} -> {1: <15d}".format(wqp.wqte.lt_id.id, wqp.wqte.lt_next_idx)) + next_id = wqp.wqte.lt_next_idx max_elem = int(kern.globals.g_prepost_table.nelem) - if hasattr(kern.globals, 'g_wqt_idx_max'): - max_elem = unsigned(kern.globals.g_wqt_idx_max) + if hasattr(kern.globals, 'g_lt_idx_max'): + max_elem = unsigned(kern.globals.g_lt_idx_max) if next_id < max_elem: if depth >= 950: prepost_str.append("{: <37s}".format("!recursion limit!")) @@ -859,7 +864,7 @@ def ShowPrepostChain(cmd_args=None, cmd_options={}): raise ArgumentError("Invalid prepost {:s}".format(cmd_args[0])) pp_arr = [] - GetPrepostChain(wqp.wqte.wqt_id.id, True, pp_arr) + GetPrepostChain(wqp.wqte.lt_id.id, True, pp_arr) pp_cnt = len(pp_arr) idx = 0 nvalid = 0 @@ -889,7 +894,7 @@ def GetWaitqSummary(waitq): fmt_str = "{q: <16x} {state: <3s} {bits: <4s} {q.waitq_eventmask: <#17x} {setid: <#18x} {q.waitq_prepost_id: <#18x}" th_str = [] if waitq.waitq_queue.next and waitq.waitq_queue.prev: - for thread in IterateLinkageChain(addressof(waitq.waitq_queue), 'thread *', 'links'): + for thread in IterateLinkageChain(addressof(waitq.waitq_queue), 'thread *', 'wait_links'): th_str.append("{: <18s} e:{: <#18x}".format(hex(thread), thread.wait_event)) else: th_str.append("{: 
<39s}".format('')) @@ -963,7 +968,7 @@ def ShowWaitq(cmd_args=None, cmd_options={}): raise ArgumentError("Invalid link ID {:s}".format(cmd_options["-S"])) if WaitqTableElemType(link) != "ELEM": raise ArgumentError("Link ID {:s} points to a SLT_LINK object, not an SLT_WQS!".format(cmd_options["-S"])) - waitq = addressof(link.sl_wqs.sl_set.wqset_q) + waitq = addressof(link.wql_wqs.wql_set.wqset_q) if not waitq and not cmd_args: raise ArgumentError("Please pass the address of a waitq!") diff --git a/tools/lldbmacros/xnu.py b/tools/lldbmacros/xnu.py index bc3830f7c..f93fd476d 100644 --- a/tools/lldbmacros/xnu.py +++ b/tools/lldbmacros/xnu.py @@ -289,6 +289,41 @@ def GetLLDBThreadForKernelThread(thread_obj): return sbthread +def GetKextSymbolInfo(load_addr): + """ Get a string descriptiong load_addr + offset + params: + load_addr - int address value of pc in backtrace. + returns: str - kext name + offset string. If no cached data available, warning message is returned. + """ + symbol_name = "None" + symbol_offset = load_addr + kmod_val = kern.globals.kmod + if kern.arch not in ('arm64',): + for kval in IterateLinkedList(kmod_val, 'next'): + if load_addr >= unsigned(kval.address) and \ + load_addr <= (unsigned(kval.address) + unsigned(kval.size)): + symbol_name = kval.name + symbol_offset = load_addr - unsigned(kval.address) + break + return "{:#018x} {:s} + {:#x} \n".format(load_addr, symbol_name, symbol_offset) + + # only for arm64 we do lookup for split kexts. + cached_kext_info = caching.GetDynamicCacheData("kern.kexts.loadinformation", []) + if not cached_kext_info and str(GetConnectionProtocol()) == "core": + cached_kext_info = GetKextLoadInformation() + + if not cached_kext_info: + return "{:#018x} ~ kext info not available. 
please run 'showallkexts' once ~ \n".format(load_addr) + + for kval in cached_kext_info: + text_seg = kval[5] + if load_addr >= text_seg.vmaddr and \ + load_addr <= (text_seg.vmaddr + text_seg.vmsize): + symbol_name = kval[2] + symbol_offset = load_addr - text_seg.vmaddr + break + return "{:#018x} {:s} + {:#x} \n".format(load_addr, symbol_name, symbol_offset) + def GetThreadBackTrace(thread_obj, verbosity = vHUMAN, prefix = ""): """ Get a string to display back trace for a thread. params: @@ -330,16 +365,7 @@ def GetThreadBackTrace(thread_obj, verbosity = vHUMAN, prefix = ""): symbol = frame.GetSymbol() if not symbol: - symbol_name = "None" - symbol_offset = load_addr - kmod_val = kern.globals.kmod - for kval in IterateLinkedList(kmod_val, 'next'): - if load_addr >= unsigned(kval.address) and \ - load_addr <= (unsigned(kval.address) + unsigned(kval.size)): - symbol_name = kval.name - symbol_offset = load_addr - unsigned(kval.address) - break - out_string += "{:#018x} {:s} + {:#x} \n".format(load_addr, symbol_name, symbol_offset) + out_string += GetKextSymbolInfo(load_addr) else: file_addr = addr.GetFileAddress() start_addr = symbol.GetStartAddress().GetFileAddress() @@ -456,6 +482,9 @@ def XnuDebugCommand(cmd_args=None): reload: Reload a submodule from the xnu/tools/lldb directory. Do not include the ".py" suffix in modulename. usage: xnudebug reload (eg. memory, process, stats etc) + flushcache: + remove any cached data held in static or dynamic data cache. + usage: xnudebug flushcache test: Start running registered test with from various modules. usage: xnudebug test (eg. 
test_memstats) @@ -468,12 +497,12 @@ def XnuDebugCommand(cmd_args=None): command_args = cmd_args if len(command_args) == 0: raise ArgumentError("No command specified.") - supported_subcommands = ['debug', 'reload', 'test', 'testall'] + supported_subcommands = ['debug', 'reload', 'test', 'testall', 'flushcache'] subcommand = GetLongestMatchOption(command_args[0], supported_subcommands, True) if len(subcommand) == 0: raise ArgumentError("Subcommand (%s) is not a valid command. " % str(command_args[0])) - + subcommand = subcommand[0].lower() if subcommand == 'debug': if command_args[-1].lower().find('dis') >=0 and config['debug']: @@ -483,7 +512,10 @@ def XnuDebugCommand(cmd_args=None): config['debug'] = True EnableLLDBAPILogging() # provided by utils.py print "Enabled debug logging. \nPlease run 'xnudebug debug disable' to disable it again. " - + if subcommand == 'flushcache': + print "Current size of cache: {}".format(caching.GetSizeOfCache()) + caching.ClearAllCache() + if subcommand == 'reload': module_name = command_args[-1] if module_name in sys.modules: @@ -755,3 +787,6 @@ def WalkList(cmd_args=[], cmd_options={}): from kauth import * from waitq import * from usertaskgdbserver import * +from ktrace import * +from pgtrace import * +from xnutriage import * diff --git a/tools/lldbmacros/xnudefines.py b/tools/lldbmacros/xnudefines.py index 3491221b8..604c2791c 100644 --- a/tools/lldbmacros/xnudefines.py +++ b/tools/lldbmacros/xnudefines.py @@ -21,9 +21,9 @@ " supervisor(readonly) user(readonly)", " " ] -kq_state_strings = {0:"", 1:"SEL", 2:"SLEEP", 4:"PROCWAIT", 8:"KEV32", 16:"KEV64"} +kq_state_strings = {0:"", 1:"SEL", 2:"SLEEP", 4:"PROCWAIT", 8:"KEV32", 16:"KEV64", 32:"QOS", 64:"WORKQ", 128:"PROCESS", 256: "DRAIN"} -kn_state_strings = {0:"", 1:"ACTIVE", 2:"QUEUED", 4:"DISABLED", 8:"DROPPING", 16:"USERWAIT", 32:"ATTACHING", 64:"STAYQUED"} +kn_state_strings = {0:"", 1:"ACTIVE", 2:"QUEUED", 4:"DISABLED", 8:"DROPPING", 16:"USERWAIT", 32:"ATTACHING", 64:"STAYQUED", 
128:"DEFERDROP"} mach_msg_type_descriptor_strings = {0: "PORT", 1: "OOLDESC", 2: "OOLPORTS", 3: "OOLVOLATILE"} diff --git a/tools/lldbmacros/xnutriage.py b/tools/lldbmacros/xnutriage.py new file mode 100644 index 000000000..ae4050faf --- /dev/null +++ b/tools/lldbmacros/xnutriage.py @@ -0,0 +1,121 @@ +""" + XNU Triage commands +""" +from xnu import * +import sys, shlex +from utils import * +import xnudefines +import re +import os.path + +# Macro: xi +def OutputAddress(cmd_args=None): + """ Returns out address and symbol corresponding to it without newline + Parameters:
+ """ + if not cmd_args: + print "No arguments passed" + print OutputAddress.__doc__ + return False + a = unsigned(cmd_args[0]) + cmd_str = "image lookup -a {:#x}".format(a) + cmd_out = lldb_run_command(cmd_str) + if len(cmd_out) != 0 and cmd_out != "ERROR:": + cmd_out1 = cmd_out.split('\n') + if len(cmd_out1) != 0: + cmd_out2 = cmd_out1[1].split('`') + if cmd_out2 != 0: + cmd_out3 = cmd_out2[1].split(' at') + if len(cmd_out3) != 0: + symbol_str = "{:#x} <{:s}>".format(unsigned(a), cmd_out3[0]) + return symbol_str + return "" + +@lldb_command('xi') +def SymbolicateWithInstruction(cmd_args=None): + """ Prints out address and symbol similar to x/i + Usage: xi
+ """ + if not cmd_args: + print "No arguments passed" + print SymbolicateWithInstruction.__doc__ + return False + a = ArgumentStringToInt(cmd_args[0]) + print OutputAddress([a]) + +# Macro: xi + +# Macro: newbt +@lldb_command('newbt') +def NewBt(cmd_args=None): + """ Prints all the instructions by walking the given stack pointer + """ + if not cmd_args: + print "No arguments passed" + print NewBt.__doc__ + return False + a = ArgumentStringToInt(cmd_args[0]) + while a != 0: + if kern.arch == "x86_64" or kern.arch == "arm64": + offset = 8 + else: + offset = 4 + link_register = dereference(kern.GetValueFromAddress(a + offset, 'uintptr_t *')) + cmd_str = "di -s {:#x} -c 1".format(link_register) + cmd_out = lldb_run_command(cmd_str) + if len(cmd_out) != 0: + cmd_out1 = cmd_out.split('\n') + if len(cmd_out1) != 0: + print OutputAddress([unsigned(link_register)]) + ": " + cmd_out1[0].split(':')[1] + a = dereference(kern.GetValueFromAddress(unsigned(a), 'uintptr_t *')) + +# EndMacro: newbt + +# Macro: parseLR +@lldb_command('parseLR') +def parseLR(cmd_args=None): + """ Decode the LR value from panic log into source code location + """ + global paniclog_data + panic_found = 1 + + if not paniclog_data: + if kern.arch == "x86_64": + paniclog_data += returnfunc("\n(lldb) paniclog\n", "paniclog -v") + else: + paniclog_data += returnfunc("\n(lldb) paniclog\n", "paniclog") + + if panic_found == 1: + srch_string = "lr:\s+0x[a-fA-F0-9]+\s" + lr_pc_srch = re.findall(srch_string, paniclog_data) + if lr_pc_srch: + print paniclog_data, lr_pc_srch + for match in lr_pc_srch: + sp=match.strip("lr: ") + print sp + print "(lldb) list *{:s}".format(sp) + print lldb_run_command("list *{:s}".format(sp)) + + else: + print "Currently unsupported on x86_64 architecture" +#EndMacro: parseLR + +# Macro: parseLRfromfile +@lldb_command('parseLRfromfile') +def parseLRfromfile(cmd_args=None): + """ Decode the LR value from file into source code location + """ + f = open('/tmp/lrparsefile', 'r') + 
parse_data= f.read() + srch_string = "lr:\s+0x[a-fA-F0-9]+\s" + lr_pc_srch = re.findall(srch_string, parse_data) + if lr_pc_srch: + print paniclog_data, lr_pc_srch + for match in lr_pc_srch: + sp=match.strip("lr: ") + print sp + print "(lldb) list *{:s}".format(sp) + print lldb_run_command("list *{:s}".format(sp)) + +#EndMacro: parseLRfromfile + diff --git a/tools/stackshot/Makefile b/tools/stackshot/Makefile new file mode 100644 index 000000000..97bdca25f --- /dev/null +++ b/tools/stackshot/Makefile @@ -0,0 +1,24 @@ +ifndef SDKROOT +SDKROOT := macosx.internal +endif + +SDKPATH := $(shell xcrun -sdk $(SDKROOT) -show-sdk-path) + +ifndef SRCROOT + SRCROOT := $(CURDIR)/../.. +endif + +ifndef OBJROOT + OBJROOT:=$(CURDIR) +endif + +ARCHES := $(shell file $(SDKPATH)/usr/lib/libc.dylib | perl -lne 'print "-arch $$1" if /for architecture ([\w_]+)/') + +CFLAGS := $(ARCHES) -I $(SDKPATH)/System/Library/Frameworks/System.framework/PrivateHeaders + +# -I $(SRCROOT)/kcdata -iprefix kern/ -iwithprefix $(SRCROOT)/osfmk/kern +#-I $(SRCROOT)/bsd +# + +$(OBJROOT)/stackshot: stackshot.c + xcrun -sdk $(SDKROOT) clang $(CFLAGS) $< -o $@ diff --git a/tools/stackshot/stackshot.c b/tools/stackshot/stackshot.c new file mode 100644 index 000000000..854e00a7d --- /dev/null +++ b/tools/stackshot/stackshot.c @@ -0,0 +1,192 @@ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define STACKSHOT_TAILSPIN (0x80000) + +uint64_t +stackshot_get_mach_absolute_time(void *buffer, uint32_t size) +{ + kcdata_iter_t iter = kcdata_iter_find_type(kcdata_iter(buffer, size), KCDATA_TYPE_MACH_ABSOLUTE_TIME); + if (!kcdata_iter_valid(iter) || kcdata_iter_size(iter) < sizeof(uint64_t)) { + fprintf(stderr, "bad kcdata\n"); + exit(1); + } + return *(uint64_t *)kcdata_iter_payload(iter); +} + +static void usage(char **argv) +{ + fprintf (stderr, "usage: %s [-d] [-t] >file\n", argv[0]); + fprintf (stderr, " -d : 
take delta stackshot\n"); + fprintf (stderr, " -b : get bootprofile\n"); + fprintf (stderr, " -t : enable tailspin mode\n"); + fprintf (stderr, " -s : fork a sleep process\n"); + fprintf (stderr, " -L : disable loadinfo\n"); + fprintf (stderr, " -k : active kernel threads only\n"); + fprintf (stderr, " -I : disable io statistics\n"); + fprintf (stderr, " -p PID : target a pid\n"); + exit(1); +} + +void forksleep() { + pid_t pid = fork(); + if (pid < 0) { + perror("fork"); + exit(1); + } + + if (pid == 0) { + execlp("sleep", "sleep", "30", NULL); + perror("execlp"); + exit(1); + } +} + + +int main(int argc, char **argv) { + + uint32_t iostats = 0; + uint32_t active_kernel_threads_only = 0; + uint32_t tailspin = 0; + uint32_t bootprofile = 0; + uint32_t loadinfo = STACKSHOT_SAVE_LOADINFO | STACKSHOT_SAVE_KEXT_LOADINFO; + boolean_t delta = FALSE; + boolean_t sleep = FALSE; + pid_t pid = -1; + int c; + + while ((c = getopt(argc, argv, "IkbLdtsp:")) != EOF) { + switch(c) { + case 'I': + iostats |= STACKSHOT_NO_IO_STATS; + break; + case 'k': + active_kernel_threads_only |= STACKSHOT_ACTIVE_KERNEL_THREADS_ONLY; + loadinfo &= ~STACKSHOT_SAVE_LOADINFO; + break; + case 'b': + bootprofile |= STACKSHOT_GET_BOOT_PROFILE; + break; + case 'L': + loadinfo = 0; + break; + case 't': + tailspin |= STACKSHOT_TAILSPIN; + break; + case 'd': + delta = TRUE; + break; + case 's': + sleep = TRUE; + break; + case 'p': + pid = atoi(optarg); + break; + case '?': + case 'h': + default: + usage(argv); + break; + } + } + + if (optind < argc) + { + usage(argv); + } + + void * config = stackshot_config_create(); + if (!config) { + perror("stackshot_config_create"); + return 1; + } + uint32_t flags = loadinfo | STACKSHOT_SAVE_IMP_DONATION_PIDS | STACKSHOT_GET_DQ | STACKSHOT_KCDATA_FORMAT | + tailspin | bootprofile | active_kernel_threads_only | iostats; + + int err = stackshot_config_set_flags(config, flags); + if (err != 0) { + perror("stackshot_config_set_flags"); + return 1; + } + + if (pid != 
-1) { + int err = stackshot_config_set_pid(config, pid); + if (err != 0) { + perror("stackshot_config_set_flags"); + return 1; + } + } + + err = stackshot_capture_with_config(config); + if (err != 0) { + perror("stackshot_capture_with_config"); + return 1; + } + + void *buf = stackshot_config_get_stackshot_buffer(config); + if (!buf) { + perror("stackshot_config_get_stackshot_buffer"); + return 1; + } + + uint32_t size = stackshot_config_get_stackshot_size(config); + + if (delta) { + // output the original somewhere? + + uint64_t time = stackshot_get_mach_absolute_time(buf, size); + + err = stackshot_config_dealloc_buffer(config); + assert(!err); + + flags |= STACKSHOT_COLLECT_DELTA_SNAPSHOT; + int err = stackshot_config_set_flags(config, flags); + if (err != 0) { + perror("stackshot_config_set_flags"); + return 1; + } + + err = stackshot_config_set_delta_timestamp(config, time); + if (err != 0) { + perror("stackshot_config_delta_timestamp"); + return 1; + } + + if (sleep) { + forksleep(); + } + usleep(10000); + + err = stackshot_capture_with_config(config); + if (err != 0) { + perror("stackshot_capture_with_config"); + return 1; + } + + buf = stackshot_config_get_stackshot_buffer(config); + if (!buf) { + perror("stackshot_config_get_stackshot_buffer"); + return 1; + } + + size = stackshot_config_get_stackshot_size(config); + + } + + fwrite(buf, size, 1, stdout); +} diff --git a/tools/tests/MPMMTest/MPMMtest_run.sh b/tools/tests/MPMMTest/MPMMtest_run.sh index 517c730eb..95f4fb2a7 100755 --- a/tools/tests/MPMMTest/MPMMtest_run.sh +++ b/tools/tests/MPMMTest/MPMMtest_run.sh @@ -6,13 +6,25 @@ MPMMTEST_64="${TESTDIR}/MPMMtest_64" KQMPMMTEST="${TESTDIR}/KQMPMMtest" KQMPMMTEST_64="${TESTDIR}/KQMPMMtest_64" +is_64_bit_env() +{ + ARCHOUT=`file /bin/ls` + if [[ $ARCHOUT == *"64-bit"* ]]; then + return 1 + fi + return 0 +} + +is_64_bit_env; +IS_64BIT_BOOTED_OS=$? 
+ if [ -e $MPMMTEST ] && [ -x $MPMMTEST ] then echo ""; echo " Running $MPMMTEST"; $MPMMTEST -perf || { x=$?; echo "$MPMMTEST failed $x "; exit $x; } fi -if [ -e $MPMMTEST_64 ] && [ -x $MPMMTEST_64 ] +if [ -e $MPMMTEST_64 ] && [ -x $MPMMTEST_64 ] && [ $IS_64BIT_BOOTED_OS == 1 ] then echo ""; echo " Running $MPMMTEST_64" $MPMMTEST_64 -perf || { x=$?; echo "$MPMMTEST_64 failed $x"; exit $x; } @@ -24,7 +36,7 @@ then $KQMPMMTEST -perf || { x=$?; echo "$KQMPMMTEST failed $x"; exit $x; } fi -if [ -e $KQMPMMTEST_64 ] && [ -x $KQMPMMTEST_64 ] +if [ -e $KQMPMMTEST_64 ] && [ -x $KQMPMMTEST_64 ] && [ $IS_64BIT_BOOTED_OS == 1 ] then echo ""; echo " Running $KQMPMMTEST_64" $KQMPMMTEST_64 -perf || { x=$?; echo "$KQMPMMTEST_64 failed $x"; exit $?; } diff --git a/tools/tests/Makefile b/tools/tests/Makefile index 7a2093aa2..9080b3347 100644 --- a/tools/tests/Makefile +++ b/tools/tests/Makefile @@ -23,31 +23,44 @@ endif COMMON_TARGETS = unit_tests \ MPMMTest \ + packetdrill \ affinity \ execperf \ kqueue_tests \ superpages \ zero-to-n \ jitter \ - perf_index + perf_index \ + darwintests \ + unixconf -IPHONE_TARGETS = - -MAC_TARGETS = +IPHONE_TARGETS = +MAC_TARGETS = BATS_TARGET = $(BATS_CONFIG_PATH)/BATS ifeq "$(Embedded)" "YES" -TARGETS = $(addprefix $(DSTSUBPATH)/, $(COMMON_TARGETS) $(IPHONE_TARGETS)) +TARGETS = $(addprefix $(DSTSUBPATH)/, $(COMMON_TARGETS) $(IPHONE_TARGETS)) else -TARGETS = $(addprefix $(DSTSUBPATH)/, $(COMMON_TARGETS) $(MAC_TARGETS)) +TARGETS = $(addprefix $(DSTSUBPATH)/, $(COMMON_TARGETS) $(MAC_TARGETS)) endif all: $(BATS_TARGET) $(TARGETS) -$(BATS_TARGET) $(DSTSUBPATH)/%: - mkdir -p $@ - mkdir -p $(OBJROOT)/$(notdir $@) - mkdir -p $(SYMROOT) - $(MAKE) -C $(SRCROOT)/$(notdir $@) SRCROOT=$(SRCROOT)/$(notdir $@) DSTROOT=$@ OBJROOT=$(OBJROOT)/$(notdir $@) SDKROOT=$(SDKROOT) +.PHONY: always + +always: + +$(DSTSUBPATH)/%: always + $(_v)echo Building $@ + $(_v)mkdir -p $@ + $(_v)mkdir -p $(OBJROOT)/$(notdir $@) + $(_v)mkdir -p $(SYMROOT)/$(notdir $@) + $(_v)$(MAKE) 
-C $(SRCROOT)/$(notdir $@) SRCROOT=$(SRCROOT)/$(notdir $@) DSTROOT=$@ OBJROOT=$(OBJROOT)/$(notdir $@) SYMROOT=$(SYMROOT)/$(notdir $@) SDKROOT=$(SDKROOT) BASEDSTROOT=$(DSTROOT) + +$(BATS_TARGET): $(TARGETS) + $(_v)mkdir -p $@ + $(_v)mkdir -p $(OBJROOT)/$(notdir $@) + $(_v)mkdir -p $(SYMROOT) + $(_v)$(MAKE) -C $(SRCROOT)/$(notdir $@) SRCROOT=$(SRCROOT)/$(notdir $@) DSTROOT=$@ OBJROOT=$(OBJROOT)/$(notdir $@) SDKROOT=$(SDKROOT) BASEDSTROOT=$(DSTROOT) diff --git a/tools/tests/affinity/Makefile b/tools/tests/affinity/Makefile index 98f4e9e45..c4d1a9bc4 100644 --- a/tools/tests/affinity/Makefile +++ b/tools/tests/affinity/Makefile @@ -19,7 +19,7 @@ ARCH_32_FLAGS := $(patsubst %, -arch %, $(ARCH_32)) ARCH_64 := $(filter %64, $(ARCHS)) ARCH_64_FLAGS := $(patsubst %, -arch %, $(ARCH_64)) -CFLAGS :=-g -isysroot $(SDKROOT) -I$(SDKROOT)/System/Library/Frameworks/System.framework/PrivateHeaders +CFLAGS :=-g -isysroot $(SDKROOT) -I$(SDKROOT)/System/Library/Frameworks/System.framework/PrivateHeaders DSTROOT?=$(shell /bin/pwd) SRCROOT?=$(shell /bin/pwd) @@ -34,7 +34,7 @@ TARGETS := $(if $(ARCH_64), $(ARCH_64_TARGETS)) $(if $(ARCH_32), $(ARCH_32_TARGE all: $(TARGETS) $(ARCH_32_TARGETS): $(DSTROOT)/%: $(SRCROOT)/%.c - $(CC) $(CFLAGS) $(ARCH_32_FLAGS) $< -o $(SYMROOT)/$(notdir $@) # 32-bit fat + $(CC) $(CFLAGS) $(ARCH_32_FLAGS) $< -o $(SYMROOT)/$(notdir $@) # 32-bit fat if [ ! -e $@ ]; then ditto $(SYMROOT)/$(notdir $@) $@; fi .SECONDEXPANSION: @@ -43,5 +43,5 @@ $(ARCH_64_TARGETS): $(DSTROOT)/%: $(SRCROOT)/$$(subst 64,,%).c if [ ! -e $@ ]; then ditto $(SYMROOT)/$(notdir $@) $@; fi clean: - rm -f $(TARGETS) + rm -f $(TARGETS) rm -rf $(SYMROOT)/*.dSYM diff --git a/tools/tests/darwintests/Makefile b/tools/tests/darwintests/Makefile new file mode 100644 index 000000000..f706e13b7 --- /dev/null +++ b/tools/tests/darwintests/Makefile @@ -0,0 +1,51 @@ +PROJECT := xnu/darwintests + +# When building as part of xnu_tests, we get passed a DSTROOT that's got the +# unit test path in it already. 
But, BASEDSTROOT doesn't, so use that instead. +ifdef BASEDSTROOT +override DSTROOT = $(BASEDSTROOT) +endif + +DEVELOPER_DIR ?= /Applications/Xcode.app/Contents/Developer/ + +# the xnu build system will only ever call us with the default target +.DEFAULT_GOAL := install + +include $(DEVELOPER_DIR)/AppleInternal/Makefiles/darwintest/Makefile.common + +OTHER_CFLAGS = -Weverything -Wno-gnu-union-cast -Wno-missing-field-initializers -Wno-partial-availability +OTHER_CFLAGS += -Wno-missing-noreturn -Wno-vla -Wno-reserved-id-macro -Wno-documentation-unknown-command +OTHER_CFLAGS += -Wno-padded -Wno-used-but-marked-unused +OTHER_CFLAGS += --std=gnu11 -isystem $(SDKROOT)/System/Library/Frameworks/System.framework/PrivateHeaders + +# to have custom compiler flags to +# target: OTHER_CFLAGS += + +backtracing: OTHER_CFLAGS += -F $(SDKROOT)/System/Library/PrivateFrameworks +backtracing: OTHER_LDFLAGS += -framework CoreSymbolication + +kdebug: INVALID_ARCHS = i386 +kdebug: OTHER_LDFLAGS = -lktrace + +EXCLUDED_SOURCES += kperf_helpers.c + +kperf: INVALID_ARCHS = i386 +kperf: OTHER_CFLAGS += kperf_helpers.c +kperf: OTHER_CFLAGS += -F $(SDKROOT)/System/Library/PrivateFrameworks +kperf: OTHER_LDFLAGS += -framework kperf -framework kperfdata -lktrace + +kperf_backtracing: INVALID_ARCHS = i386 +kperf_backtracing: OTHER_CFLAGS += kperf_helpers.c +kperf_backtracing: OTHER_CFLAGS += -F $(SDKROOT)/System/Library/PrivateFrameworks +kperf_backtracing: OTHER_LDFLAGS += -framework kperf -framework kperfdata -lktrace +kperf_backtracing: OTHER_LDFLAGS += -framework CoreSymbolication + +mach_get_times: OTHER_LDFLAGS += -ldarwintest_utils + +perf_exit: OTHER_LDFLAGS = -lktrace +perf_exit: INVALID_ARCHS = i386 + +stackshot_idle_25570396: INVALID_ARCHS = i386 +stackshot_idle_25570396: OTHER_LDFLAGS += -lkdd -framework Foundation + +include $(DEVELOPER_DIR)/AppleInternal/Makefiles/darwintest/Makefile.targets diff --git a/tools/tests/darwintests/backtracing.c b/tools/tests/darwintests/backtracing.c 
new file mode 100644 index 000000000..bc0161fdd --- /dev/null +++ b/tools/tests/darwintests/backtracing.c @@ -0,0 +1,170 @@ +#include +#include +#include +#include +#include +#include + +#define USER_FRAMES (12) + +#define NON_RECURSE_FRAMES (5) + +static const char *user_bt[USER_FRAMES] = { + NULL, NULL, + "backtrace_thread", + "recurse_a", "recurse_b", "recurse_a", "recurse_b", + "recurse_a", "recurse_b", "recurse_a", + "expect_stack", NULL +}; + +static void +expect_frame(const char **bt, unsigned int bt_len, CSSymbolRef symbol, + unsigned long addr, unsigned int bt_idx, unsigned int max_frames) +{ + const char *name; + unsigned int frame_idx = max_frames - bt_idx - 1; + + if (bt[frame_idx] == NULL) { + T_LOG("frame %2u: skipping system frame", frame_idx); + return; + } + + if (CSIsNull(symbol)) { + T_FAIL("invalid symbol for address %#lx at frame %d", addr, frame_idx); + return; + } + + if (frame_idx >= bt_len) { + T_FAIL("unexpected frame '%s' (%#lx) at index %u", + CSSymbolGetName(symbol), addr, frame_idx); + return; + } + + name = CSSymbolGetName(symbol); + T_QUIET; T_ASSERT_NOTNULL(name, NULL); + T_EXPECT_EQ_STR(name, bt[frame_idx], + "frame %2u: saw '%s', expected '%s'", + frame_idx, name, bt[frame_idx]); +} + +static void __attribute__((noinline,not_tail_called)) +expect_stack(void) +{ + uint64_t bt[USER_FRAMES] = { 0 }; + unsigned int bt_len = USER_FRAMES; + int err; + size_t bt_filled; + + static dispatch_once_t expect_stacks_once; + static bool k64; + static CSSymbolicatorRef user_symb; + + dispatch_once(&expect_stacks_once, ^(void) { + int errb; + int mib[] = { CTL_KERN, KERN_PROC, KERN_PROC_PID, 0 /* kernproc */ }; + + struct kinfo_proc kp; + size_t len; + + len = sizeof(kp); + errb = sysctl(mib, sizeof(mib) / sizeof(mib[0]), &kp, &len, NULL, 0); + T_QUIET; T_ASSERT_POSIX_SUCCESS(errb, + "sysctl({ CTL_KERN, KERN_PROC, KERN_PROC_PID, 0})"); + + k64 = kp.kp_proc.p_flag & P_LP64; + T_LOG("executing with a %s-bit kernel", k64 ? 
"64" : "32"); + + user_symb = CSSymbolicatorCreateWithTask(mach_task_self()); + T_QUIET; T_ASSERT_FALSE(CSIsNull(user_symb), NULL); + T_QUIET; T_ASSERT_TRUE(CSSymbolicatorIsTaskValid(user_symb), NULL); + }); + + bt_filled = USER_FRAMES; + err = sysctlbyname("kern.backtrace.user", bt, &bt_filled, NULL, 0); + if (err == ENOENT) { + T_SKIP("release kernel: kern.backtrace.user sysctl returned ENOENT"); + } + T_ASSERT_POSIX_SUCCESS(err, "sysctlbyname(\"kern.backtrace.user\")"); + + bt_len = (unsigned int)bt_filled; + T_EXPECT_EQ(bt_len, (unsigned int)USER_FRAMES, + "%u frames should be present in backtrace", (unsigned int)USER_FRAMES); + + for (unsigned int i = 0; i < bt_len; i++) { + uintptr_t addr; +#if !defined(__LP64__) + /* + * Backtrace frames come out as kernel words; convert them back to user + * uintptr_t for 32-bit processes. + */ + if (k64) { + addr = (uintptr_t)(bt[i]); + } else { + addr = (uintptr_t)(((uint32_t *)bt)[i]); + } +#else /* defined(__LP32__) */ + addr = (uintptr_t)bt[i]; +#endif /* defined(__LP32__) */ + + CSSymbolRef symbol = CSSymbolicatorGetSymbolWithAddressAtTime( + user_symb, addr, kCSNow); + expect_frame(user_bt, USER_FRAMES, symbol, addr, i, bt_len); + } +} + +static int __attribute__((noinline,not_tail_called)) +recurse_a(unsigned int frames); +static int __attribute__((noinline,not_tail_called)) +recurse_b(unsigned int frames); + +static int __attribute__((noinline,not_tail_called)) +recurse_a(unsigned int frames) +{ + if (frames == 1) { + expect_stack(); + getpid(); + return 0; + } + + return recurse_b(frames - 1) + 1; +} + +static int __attribute__((noinline,not_tail_called)) +recurse_b(unsigned int frames) +{ + if (frames == 1) { + expect_stack(); + getpid(); + return 0; + } + + return recurse_a(frames - 1) + 1; +} + +static void * +backtrace_thread(void *arg) +{ +#pragma unused(arg) + unsigned int calls; + + /* + * backtrace_thread, recurse_a, recurse_b, ..., __sysctlbyname + * + * Always make one less call for this frame 
(backtrace_thread). + */ + calls = USER_FRAMES - NON_RECURSE_FRAMES; + + T_LOG("backtrace thread calling into %d frames (already at %d frames)", + calls, NON_RECURSE_FRAMES); + (void)recurse_a(calls); + return NULL; +} + +T_DECL(backtrace_user, "test that the kernel can backtrace user stacks", + T_META_ALL_VALID_ARCHS(YES)) +{ + pthread_t thread; + + T_QUIET; T_ASSERT_POSIX_ZERO(pthread_create(&thread, NULL, backtrace_thread, + NULL), "create additional thread to backtrace"); +} diff --git a/tools/tests/darwintests/gettimeofday.c b/tools/tests/darwintests/gettimeofday.c new file mode 100644 index 000000000..1e2429427 --- /dev/null +++ b/tools/tests/darwintests/gettimeofday.c @@ -0,0 +1,50 @@ +#include +#include +#include + +#include + +extern int __gettimeofday(struct timeval *, struct timezone *); + +T_DECL(gettimeofday, "gettimeofday()", + T_META_CHECK_LEAKS(NO), T_META_ALL_VALID_ARCHS(YES)) +{ + struct timeval tv_a, tv_b, tv_c; + + T_ASSERT_POSIX_ZERO(gettimeofday(&tv_a, NULL), NULL); + T_ASSERT_GT(tv_a.tv_sec, 0L, NULL); + + sleep(1); + + T_ASSERT_POSIX_ZERO(__gettimeofday(&tv_b, NULL), NULL); + T_ASSERT_GE(tv_b.tv_sec, tv_a.tv_sec, NULL); + + sleep(1); + + T_ASSERT_POSIX_ZERO(gettimeofday(&tv_c, NULL), NULL); + T_ASSERT_GE(tv_c.tv_sec, tv_b.tv_sec, NULL); +} + +#if 0 // This symbol isn't exported so we can't test with stock libsyscall +extern int __gettimeofday_with_mach(struct timeval *, struct timezone *, uint64_t *mach_time); + +T_DECL(gettimeofday_with_mach, "gettimeofday_with_mach()", + T_META_CHECK_LEAKS(NO), T_META_ALL_VALID_ARCHS(YES)) +{ + struct timeval gtod_ts; + + uint64_t mach_time_before, mach_time, mach_time_after; + + mach_time_before = mach_absolute_time(); + + T_ASSERT_POSIX_ZERO(__gettimeofday_with_mach(>od_ts, NULL, &mach_time), NULL); + T_ASSERT_GT(gtod_ts.tv_sec, 0L, NULL); + + mach_time_after = mach_absolute_time(); + + T_LOG("%llx > %llx > %llx", mach_time_before, mach_time, mach_time_after); + + T_ASSERT_LT(mach_time_before, mach_time, 
NULL); + T_ASSERT_GT(mach_time_after, mach_time, NULL); +} +#endif // 0 diff --git a/tools/tests/darwintests/host_notifications.c b/tools/tests/darwintests/host_notifications.c new file mode 100644 index 000000000..d57465c62 --- /dev/null +++ b/tools/tests/darwintests/host_notifications.c @@ -0,0 +1,49 @@ +#include +#include +#include + +#include + +static void do_test(int notify_type, void (^trigger_block)(void)){ + mach_port_t port; + T_ASSERT_MACH_SUCCESS(mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, &port), NULL); + + T_ASSERT_MACH_SUCCESS(host_request_notification(mach_host_self(), notify_type, port), NULL); + + trigger_block(); + + struct { + mach_msg_header_t hdr; + mach_msg_trailer_t trailer; + } message = { .hdr = { + .msgh_bits = 0, + .msgh_size = sizeof(mach_msg_header_t), + .msgh_remote_port = MACH_PORT_NULL, + .msgh_local_port = port, + .msgh_voucher_port = MACH_PORT_NULL, + .msgh_id = 0, + }}; + + T_ASSERT_EQ(MACH_RCV_TOO_LARGE, mach_msg_receive(&message.hdr), NULL); + mach_msg_destroy(&message.hdr); +} + +T_DECL(host_notify_calendar_change, "host_request_notification(HOST_NOTIFY_CALENDAR_CHANGE)", T_META_CHECK_LEAKS(NO)) +{ + do_test(HOST_NOTIFY_CALENDAR_CHANGE, ^{ + struct timeval tm; + if (gettimeofday(&tm, NULL) != 0 || settimeofday(&tm, NULL) != 0){ + T_SKIP("Unable to settimeofday()"); + } + }); +} + +T_DECL(host_notify_calendar_set, "host_request_notification(HOST_NOTIFY_CALENDAR_SET)", T_META_CHECK_LEAKS(NO)) +{ + do_test(HOST_NOTIFY_CALENDAR_SET, ^{ + struct timeval tm; + if (gettimeofday(&tm, NULL) != 0 || settimeofday(&tm, NULL) != 0){ + T_SKIP("Unable to settimeofday()"); + } + }); +} diff --git a/tools/tests/darwintests/kdebug.c b/tools/tests/darwintests/kdebug.c new file mode 100644 index 000000000..ea1f049e5 --- /dev/null +++ b/tools/tests/darwintests/kdebug.c @@ -0,0 +1,586 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + 
+#define KTRACE_WAIT_TIMEOUT_S (10) + +#define TRACE_DEBUGID (0xfedfed00U) + +T_DECL(kdebug_trace_syscall, "test that kdebug_trace(2) emits correct events", + T_META_ASROOT(YES)) +{ + ktrace_session_t s; + dispatch_time_t timeout; + __block int events_seen = 0; + + s = ktrace_session_create(); + os_assert(s != NULL); + + ktrace_events_class(s, DBG_MACH, ^(__unused struct trace_point *tp){}); + ktrace_events_single(s, TRACE_DEBUGID, ^void(struct trace_point *tp) { + events_seen++; + T_PASS("saw traced event"); + + T_EXPECT_EQ(tp->arg1, 1UL, "argument 1 of traced event is correct"); + T_EXPECT_EQ(tp->arg2, 2UL, "argument 2 of traced event is correct"); + T_EXPECT_EQ(tp->arg3, 3UL, "argument 3 of traced event is correct"); + T_EXPECT_EQ(tp->arg4, 4UL, "argument 4 of traced event is correct"); + + ktrace_end(s, 1); + }); + + ktrace_set_completion_handler(s, ^(void) { + T_EXPECT_GE(events_seen, 1, NULL); + ktrace_session_destroy(s); + T_END; + }); + + ktrace_filter_pid(s, getpid()); + + T_ASSERT_POSIX_ZERO(ktrace_start(s, dispatch_get_main_queue()), NULL); + T_ASSERT_POSIX_SUCCESS(kdebug_trace(TRACE_DEBUGID, 1, 2, 3, 4), NULL); + ktrace_end(s, 0); + + dispatch_main(); +} + +#define SIGNPOST_SINGLE_CODE (0x10U) +#define SIGNPOST_PAIRED_CODE (0x20U) + +T_DECL(kdebug_signpost_syscall, + "test that kdebug_signpost(2) emits correct events", + T_META_ASROOT(YES)) +{ + ktrace_session_t s; + __block int single_seen = 0; + __block int paired_seen = 0; + dispatch_time_t timeout; + + s = ktrace_session_create(); + T_ASSERT_NOTNULL(s, NULL); + + /* make sure to get enough events for the KDBUFWAIT to trigger */ + // ktrace_events_class(s, DBG_MACH, ^(__unused struct trace_point *tp){}); + ktrace_events_single(s, + APPSDBG_CODE(DBG_APP_SIGNPOST, SIGNPOST_SINGLE_CODE), + ^void(struct trace_point *tp) + { + single_seen++; + T_PASS("single signpost is traced"); + + T_EXPECT_EQ(tp->arg1, 1UL, "argument 1 of single signpost is correct"); + T_EXPECT_EQ(tp->arg2, 2UL, "argument 2 of single 
signpost is correct"); + T_EXPECT_EQ(tp->arg3, 3UL, "argument 3 of single signpost is correct"); + T_EXPECT_EQ(tp->arg4, 4UL, "argument 4 of single signpost is correct"); + }); + + ktrace_events_single_paired(s, + APPSDBG_CODE(DBG_APP_SIGNPOST, SIGNPOST_PAIRED_CODE), + ^void(struct trace_point *start, struct trace_point *end) + { + paired_seen++; + T_PASS("paired signposts are traced"); + + T_EXPECT_EQ(start->arg1, 5UL, "argument 1 of start signpost is correct"); + T_EXPECT_EQ(start->arg2, 6UL, "argument 2 of start signpost is correct"); + T_EXPECT_EQ(start->arg3, 7UL, "argument 3 of start signpost is correct"); + T_EXPECT_EQ(start->arg4, 8UL, "argument 4 of start signpost is correct"); + + T_EXPECT_EQ(end->arg1, 9UL, "argument 1 of end signpost is correct"); + T_EXPECT_EQ(end->arg2, 10UL, "argument 2 of end signpost is correct"); + T_EXPECT_EQ(end->arg3, 11UL, "argument 3 of end signpost is correct"); + T_EXPECT_EQ(end->arg4, 12UL, "argument 4 of end signpost is correct"); + + T_EXPECT_EQ(single_seen, 1, + "signposts are traced in the correct order"); + + ktrace_end(s, 1); + }); + + ktrace_set_completion_handler(s, ^(void) { + if (single_seen == 0) { + T_FAIL("did not see single tracepoint before timeout"); + } + if (paired_seen == 0) { + T_FAIL("did not see paired tracepoints before timeout"); + } + ktrace_session_destroy(s); + T_END; + }); + + ktrace_filter_pid(s, getpid()); + + T_ASSERT_POSIX_ZERO(ktrace_start(s, dispatch_get_main_queue()), NULL); + + T_EXPECT_POSIX_SUCCESS(kdebug_signpost( + SIGNPOST_SINGLE_CODE, 1, 2, 3, 4), NULL); + T_EXPECT_POSIX_SUCCESS(kdebug_signpost_start( + SIGNPOST_PAIRED_CODE, 5, 6, 7, 8), NULL); + T_EXPECT_POSIX_SUCCESS(kdebug_signpost_end( + SIGNPOST_PAIRED_CODE, 9, 10, 11, 12), NULL); + ktrace_end(s, 0); + + dispatch_main(); +} + +#define WRAPPING_EVENTS_COUNT (150000) +#define TRACE_ITERATIONS (5000) +#define WRAPPING_EVENTS_THRESHOLD (100) + +T_DECL(kdebug_wrapping, + "ensure that wrapping traces lost events and no events prior 
to the wrap", + T_META_ASROOT(YES), T_META_CHECK_LEAKS(NO)) +{ + ktrace_session_t s; + __block int events = 0; + int mib[4]; + size_t needed; + kbufinfo_t buf_info; + int wait_wrapping_secs = (WRAPPING_EVENTS_COUNT / TRACE_ITERATIONS) + 5; + int current_secs = wait_wrapping_secs; + + /* use sysctls manually to bypass libktrace assumptions */ + + mib[0] = CTL_KERN; mib[1] = KERN_KDEBUG; mib[2] = KERN_KDSETUP; mib[3] = 0; + needed = 0; + T_ASSERT_POSIX_SUCCESS(sysctl(mib, 3, NULL, &needed, NULL, 0), + "KERN_KDSETUP"); + + mib[2] = KERN_KDSETBUF; mib[3] = WRAPPING_EVENTS_COUNT; + T_ASSERT_POSIX_SUCCESS(sysctl(mib, 4, NULL, 0, NULL, 0), "KERN_KDSETBUF"); + + mib[2] = KERN_KDENABLE; mib[3] = 1; + T_ASSERT_POSIX_SUCCESS(sysctl(mib, 4, NULL, 0, NULL, 0), "KERN_KDENABLE"); + + /* wrapping is on by default */ + + /* wait until wrapped */ + T_LOG("waiting for trace to wrap"); + mib[2] = KERN_KDGETBUF; + needed = sizeof(buf_info); + do { + sleep(1); + for (int i = 0; i < TRACE_ITERATIONS; i++) { + T_QUIET; + T_ASSERT_POSIX_SUCCESS(kdebug_trace(0xfefe0000, 0, 0, 0, 0), NULL); + } + T_QUIET; + T_ASSERT_POSIX_SUCCESS(sysctl(mib, 3, &buf_info, &needed, NULL, 0), + NULL); + } while (!(buf_info.flags & KDBG_WRAPPED) && --current_secs > 0); + + T_ASSERT_TRUE(buf_info.flags & KDBG_WRAPPED, + "trace wrapped (after %d seconds within %d second timeout)", + wait_wrapping_secs - current_secs, wait_wrapping_secs); + + s = ktrace_session_create(); + T_QUIET; T_ASSERT_NOTNULL(s, NULL); + T_QUIET; T_ASSERT_POSIX_ZERO(ktrace_set_use_existing(s), NULL); + + ktrace_events_all(s, ^void(struct trace_point *tp) { + if (events == 0) { + T_EXPECT_EQ(tp->debugid, (unsigned int)TRACE_LOST_EVENTS, + "first event's debugid 0x%08x (%s) should be TRACE_LOST_EVENTS", + tp->debugid, + ktrace_name_for_eventid(s, tp->debugid & KDBG_EVENTID_MASK)); + } else { + T_QUIET; + T_EXPECT_NE(tp->debugid, (unsigned int)TRACE_LOST_EVENTS, + "event debugid 0x%08x (%s) should not be TRACE_LOST_EVENTS", + tp->debugid, + 
ktrace_name_for_eventid(s, tp->debugid & KDBG_EVENTID_MASK)); + } + + events++; + if (events > WRAPPING_EVENTS_THRESHOLD) { + ktrace_end(s, 1); + } + }); + + ktrace_set_completion_handler(s, ^(void) { + ktrace_session_destroy(s); + T_END; + }); + + T_ASSERT_POSIX_ZERO(ktrace_start(s, dispatch_get_main_queue()), NULL); + + dispatch_main(); +} + +__attribute__((aligned(8))) +static const char map_uuid[16] = "map UUID"; + +__attribute__((aligned(8))) +static const char unmap_uuid[16] = "unmap UUID"; + +__attribute__((aligned(8))) +static const char sc_uuid[16] = "shared UUID"; + +static fsid_t map_fsid = { .val = { 42, 43 } }; +static fsid_t unmap_fsid = { .val = { 44, 45 } }; +static fsid_t sc_fsid = { .val = { 46, 47 } }; + +static fsobj_id_t map_fsobjid = { .fid_objno = 42, .fid_generation = 43 }; +static fsobj_id_t unmap_fsobjid = { .fid_objno = 44, .fid_generation = 45 }; +static fsobj_id_t sc_fsobjid = { .fid_objno = 46, .fid_generation = 47 }; + +#define MAP_LOAD_ADDR 0xabadcafe +#define UNMAP_LOAD_ADDR 0xfeedface +#define SC_LOAD_ADDR 0xfedfaced + +__unused +static void +expect_dyld_image_info(struct trace_point *tp, const uint64_t *exp_uuid, + uint64_t exp_load_addr, fsid_t *exp_fsid, fsobj_id_t *exp_fsobjid, + int order) +{ +#if defined(__LP64__) + if (order == 0) { + uint64_t uuid[2]; + uint64_t load_addr; + fsid_t fsid; + + uuid[0] = (uint64_t)tp->arg1; + uuid[1] = (uint64_t)tp->arg2; + load_addr = (uint64_t)tp->arg3; + fsid.val[0] = (int32_t)(tp->arg4 & UINT32_MAX); + fsid.val[1] = (int32_t)((uint64_t)tp->arg4 >> 32); + + T_QUIET; T_EXPECT_EQ(uuid[0], exp_uuid[0], NULL); + T_QUIET; T_EXPECT_EQ(uuid[1], exp_uuid[1], NULL); + T_QUIET; T_EXPECT_EQ(load_addr, exp_load_addr, NULL); + T_QUIET; T_EXPECT_EQ(fsid.val[0], exp_fsid->val[0], NULL); + T_QUIET; T_EXPECT_EQ(fsid.val[1], exp_fsid->val[1], NULL); + } else if (order == 1) { + fsobj_id_t fsobjid; + + fsobjid.fid_objno = (uint32_t)(tp->arg1 & UINT32_MAX); + fsobjid.fid_generation = 
(uint32_t)((uint64_t)tp->arg1 >> 32); + + T_QUIET; T_EXPECT_EQ(fsobjid.fid_objno, exp_fsobjid->fid_objno, NULL); + T_QUIET; T_EXPECT_EQ(fsobjid.fid_generation, + exp_fsobjid->fid_generation, NULL); + } else { + T_ASSERT_FAIL("unrecognized order of events %d", order); + } +#else /* defined(__LP64__) */ + if (order == 0) { + uint32_t uuid[4]; + + uuid[0] = (uint32_t)tp->arg1; + uuid[1] = (uint32_t)tp->arg2; + uuid[2] = (uint32_t)tp->arg3; + uuid[3] = (uint32_t)tp->arg4; + + T_QUIET; T_EXPECT_EQ(uuid[0], (uint32_t)exp_uuid[0], NULL); + T_QUIET; T_EXPECT_EQ(uuid[1], (uint32_t)(exp_uuid[0] >> 32), NULL); + T_QUIET; T_EXPECT_EQ(uuid[2], (uint32_t)exp_uuid[1], NULL); + T_QUIET; T_EXPECT_EQ(uuid[3], (uint32_t)(exp_uuid[1] >> 32), NULL); + } else if (order == 1) { + uint32_t load_addr; + fsid_t fsid; + fsobj_id_t fsobjid; + + load_addr = (uint32_t)tp->arg1; + fsid.val[0] = (int32_t)tp->arg2; + fsid.val[1] = (int32_t)tp->arg3; + fsobjid.fid_objno = (uint32_t)tp->arg4; + + T_QUIET; T_EXPECT_EQ(load_addr, (uint32_t)exp_load_addr, NULL); + T_QUIET; T_EXPECT_EQ(fsid.val[0], exp_fsid->val[0], NULL); + T_QUIET; T_EXPECT_EQ(fsid.val[1], exp_fsid->val[1], NULL); + T_QUIET; T_EXPECT_EQ(fsobjid.fid_objno, exp_fsobjid->fid_objno, NULL); + } else if (order == 2) { + fsobj_id_t fsobjid; + + fsobjid.fid_generation = tp->arg1; + + T_QUIET; T_EXPECT_EQ(fsobjid.fid_generation, + exp_fsobjid->fid_generation, NULL); + } else { + T_ASSERT_FAIL("unrecognized order of events %d", order); + } +#endif /* defined(__LP64__) */ +} + +#if defined(__LP64__) +#define DYLD_CODE_OFFSET (0) +#define DYLD_EVENTS (2) +#else +#define DYLD_CODE_OFFSET (2) +#define DYLD_EVENTS (3) +#endif + +static void +expect_dyld_events(ktrace_session_t s, const char *name, uint32_t base_code, + const char *exp_uuid, uint64_t exp_load_addr, fsid_t *exp_fsid, + fsobj_id_t *exp_fsobjid, uint8_t *saw_events) +{ + for (int i = 0; i < DYLD_EVENTS; i++) { + ktrace_events_single(s, + KDBG_EVENTID(DBG_DYLD, DBG_DYLD_UUID, + base_code 
+ DYLD_CODE_OFFSET + (unsigned int)i), + ^(struct trace_point *tp) + { + T_LOG("checking %s event %c", name, 'A' + i); + expect_dyld_image_info(tp, (const void *)exp_uuid, exp_load_addr, + exp_fsid, exp_fsobjid, i); + *saw_events |= (1U << i); + }); + } +} + +T_DECL(dyld_events, "test that dyld registering libraries emits events", + T_META_ASROOT(YES)) +{ + ktrace_session_t s; + dyld_kernel_image_info_t info; + + /* + * Use pointers instead of __block variables in order to use these variables + * in the completion block below _and_ pass pointers to them to the + * expect_dyld_events function. + */ + uint8_t saw_events[3] = { 0 }; + uint8_t *saw_mapping = &(saw_events[0]); + uint8_t *saw_unmapping = &(saw_events[1]); + uint8_t *saw_shared_cache = &(saw_events[2]); + + s = ktrace_session_create(); + T_ASSERT_NOTNULL(s, NULL); + + expect_dyld_events(s, "mapping", DBG_DYLD_UUID_MAP_A, map_uuid, + MAP_LOAD_ADDR, &map_fsid, &map_fsobjid, saw_mapping); + expect_dyld_events(s, "unmapping", DBG_DYLD_UUID_UNMAP_A, unmap_uuid, + UNMAP_LOAD_ADDR, &unmap_fsid, &unmap_fsobjid, saw_unmapping); + expect_dyld_events(s, "shared cache", DBG_DYLD_UUID_SHARED_CACHE_A, + sc_uuid, SC_LOAD_ADDR, &sc_fsid, &sc_fsobjid, saw_shared_cache); + + ktrace_set_completion_handler(s, ^(void) { + T_EXPECT_EQ(__builtin_popcount(*saw_mapping), DYLD_EVENTS, NULL); + T_EXPECT_EQ(__builtin_popcount(*saw_unmapping), DYLD_EVENTS, NULL); + T_EXPECT_EQ(__builtin_popcount(*saw_shared_cache), DYLD_EVENTS, NULL); + ktrace_session_destroy(s); + T_END; + }); + + T_ASSERT_POSIX_ZERO(ktrace_start(s, dispatch_get_main_queue()), NULL); + + info.load_addr = MAP_LOAD_ADDR; + memcpy(info.uuid, map_uuid, sizeof(info.uuid)); + info.fsid = map_fsid; + info.fsobjid = map_fsobjid; + T_EXPECT_MACH_SUCCESS(task_register_dyld_image_infos(mach_task_self(), + &info, 1), NULL); + + info.load_addr = UNMAP_LOAD_ADDR; + memcpy(info.uuid, unmap_uuid, sizeof(info.uuid)); + info.fsid = unmap_fsid; + info.fsobjid = unmap_fsobjid; + 
T_EXPECT_MACH_SUCCESS(task_unregister_dyld_image_infos(mach_task_self(), + &info, 1), NULL); + + info.load_addr = SC_LOAD_ADDR; + memcpy(info.uuid, sc_uuid, sizeof(info.uuid)); + info.fsid = sc_fsid; + info.fsobjid = sc_fsobjid; + T_EXPECT_MACH_SUCCESS(task_register_dyld_shared_cache_image_info( + mach_task_self(), info, FALSE, FALSE), NULL); + + ktrace_end(s, 0); + + dispatch_main(); +} + +#define EXP_KERNEL_EVENTS 5U + +static const uint32_t dev_evts[EXP_KERNEL_EVENTS] = { + BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 0), + BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 1), + BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 2), + BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 3), + BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 4), +}; + +static const uint32_t rel_evts[EXP_KERNEL_EVENTS] = { + BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 5), + BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 6), + BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 7), + BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 8), + BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 9), +}; + +static const uint32_t filt_evts[EXP_KERNEL_EVENTS] = { + BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 10), + BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 11), + BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 12), + BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 13), + BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 14), +}; + +static bool +is_development_kernel(void) +{ + static dispatch_once_t is_development_once; + static bool is_development; + + dispatch_once(&is_development_once, ^(void) { + host_debug_info_internal_data_t info; + mach_msg_type_number_t count = HOST_DEBUG_INFO_INTERNAL_COUNT; + kern_return_t kr; + + kr = host_info(mach_host_self(), HOST_DEBUG_INFO_INTERNAL, + (host_info_t)(void *)&info, &count); + if (kr != KERN_SUCCESS && kr != KERN_NOT_SUPPORTED) { + T_ASSERT_FAIL("check for development kernel failed %d", kr); + } + + is_development = (kr == KERN_SUCCESS); + }); + + return is_development; +} + +static void +assert_kdebug_test(void) +{ + int mib[] = { CTL_KERN, KERN_KDEBUG, KERN_KDTEST }; + T_ASSERT_POSIX_SUCCESS( + sysctl(mib, sizeof(mib) / sizeof(mib[0]), NULL, NULL, NULL, 0), + 
"KERN_KDTEST"); +} + +static void +expect_event(struct trace_point *tp, unsigned int *events, + const uint32_t *event_ids, size_t event_ids_len) +{ + unsigned int event_idx = *events; + bool event_found = false; + size_t i; + for (i = 0; i < event_ids_len; i++) { + if (event_ids[i] == (tp->debugid & KDBG_EVENTID_MASK)) { + T_LOG("found event 0x%x", tp->debugid); + event_found = true; + } + } + + if (!event_found) { + return; + } + + *events += 1; + for (i = 0; i < event_idx; i++) { + T_QUIET; T_EXPECT_EQ(((uintptr_t *)&tp->arg1)[i], (uintptr_t)i + 1, + NULL); + } + for (; i < 4; i++) { + T_QUIET; T_EXPECT_EQ(((uintptr_t *)&tp->arg1)[i], (uintptr_t)0, NULL); + } +} + +static void +expect_release_event(struct trace_point *tp, unsigned int *events) +{ + expect_event(tp, events, rel_evts, + sizeof(rel_evts) / sizeof(rel_evts[0])); +} + +static void +expect_development_event(struct trace_point *tp, unsigned int *events) +{ + expect_event(tp, events, dev_evts, + sizeof(dev_evts) / sizeof(dev_evts[0])); +} + +static void +expect_filtered_event(struct trace_point *tp, unsigned int *events) +{ + expect_event(tp, events, filt_evts, + sizeof(filt_evts) / sizeof(filt_evts[0])); +} + +T_DECL(kernel_events, "ensure kernel macros work", + T_META_ASROOT(YES)) +{ + ktrace_session_t s; + + s = ktrace_session_create(); + T_QUIET; T_ASSERT_NOTNULL(s, NULL); + + __block unsigned int dev_seen = 0; + __block unsigned int rel_seen = 0; + __block unsigned int filt_seen = 0; + ktrace_events_range(s, KDBG_EVENTID(DBG_BSD, DBG_BSD_KDEBUG_TEST, 0), + KDBG_EVENTID(DBG_BSD + 1, 0, 0), + ^(struct trace_point *tp) + { + expect_development_event(tp, &dev_seen); + expect_release_event(tp, &rel_seen); + expect_filtered_event(tp, &filt_seen); + }); + + ktrace_set_completion_handler(s, ^(void) { + T_EXPECT_EQ(rel_seen, EXP_KERNEL_EVENTS, NULL); + T_EXPECT_EQ(dev_seen, is_development_kernel() ? 
EXP_KERNEL_EVENTS : 0U, + NULL); + T_EXPECT_EQ(filt_seen, EXP_KERNEL_EVENTS, NULL); + ktrace_session_destroy(s); + T_END; + }); + + ktrace_filter_pid(s, getpid()); + + T_ASSERT_POSIX_ZERO(ktrace_start(s, dispatch_get_main_queue()), NULL); + assert_kdebug_test(); + + ktrace_end(s, 0); + + dispatch_main(); +} + +T_DECL(kernel_events_filtered, "ensure that the filtered kernel macros work", + T_META_ASROOT(YES)) +{ + ktrace_session_t s; + + s = ktrace_session_create(); + T_QUIET; T_ASSERT_NOTNULL(s, NULL); + + __block unsigned int dev_seen = 0; + __block unsigned int rel_seen = 0; + __block unsigned int filt_seen = 0; + ktrace_events_all(s, ^(struct trace_point *tp) { + expect_development_event(tp, &dev_seen); + expect_release_event(tp, &rel_seen); + /* to make sure no filtered events are emitted */ + expect_filtered_event(tp, &filt_seen); + }); + + ktrace_set_completion_handler(s, ^(void) { + ktrace_session_destroy(s); + + T_EXPECT_EQ(rel_seen, EXP_KERNEL_EVENTS, NULL); + T_EXPECT_EQ(dev_seen, EXP_KERNEL_EVENTS, NULL); + T_EXPECT_EQ(filt_seen, 0U, NULL); + T_END; + }); + + T_ASSERT_POSIX_ZERO(ktrace_start(s, dispatch_get_main_queue()), NULL); + assert_kdebug_test(); + + ktrace_end(s, 0); + + dispatch_main(); +} + diff --git a/tools/tests/darwintests/kevent_continuous_time.c b/tools/tests/darwintests/kevent_continuous_time.c new file mode 100644 index 000000000..74cc5782e --- /dev/null +++ b/tools/tests/darwintests/kevent_continuous_time.c @@ -0,0 +1,236 @@ +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +extern char **environ; + +static mach_timebase_info_data_t tb_info; +static const uint64_t one_mil = 1000LL*1000LL; + +#define tick_to_ns(ticks) (((ticks) * tb_info.numer) / (tb_info.denom)) +#define tick_to_ms(ticks) (tick_to_ns(ticks)/one_mil) + +#define ns_to_tick(ns) ((ns) * tb_info.denom / tb_info.numer) +#define ms_to_tick(ms) (ns_to_tick((ms) * one_mil)) + +static 
uint64_t time_delta_ms(void){ + uint64_t abs_now = mach_absolute_time(); + uint64_t cnt_now = mach_continuous_time();; + return tick_to_ms(cnt_now) - tick_to_ms(abs_now); +} + +static int run_sleep_tests = 0; + +static int trigger_sleep(int for_secs) { + if(!run_sleep_tests) return 0; + + // sleep for 1 seconds each iteration + char buf[10]; + snprintf(buf, 10, "%d", for_secs); + + T_LOG("Sleepeing for %s seconds...", buf); + + int spawn_ret, pid; + char *const pmset1_args[] = {"/usr/bin/pmset", "relative", "wake", buf, NULL}; + T_ASSERT_POSIX_ZERO((spawn_ret = posix_spawn(&pid, pmset1_args[0], NULL, NULL, pmset1_args, environ)), NULL); + + T_ASSERT_EQ(waitpid(pid, &spawn_ret, 0), pid, NULL); + T_ASSERT_EQ(spawn_ret, 0, NULL); + + char *const pmset2_args[] = {"/usr/bin/pmset", "sleepnow", NULL}; + T_ASSERT_POSIX_ZERO((spawn_ret = posix_spawn(&pid, pmset2_args[0], NULL, NULL, pmset2_args, environ)), NULL); + + T_ASSERT_EQ(waitpid(pid, &spawn_ret, 0), pid, NULL); + T_ASSERT_EQ(spawn_ret, 0, NULL); + + return 0; +} + +// waits up to 30 seconds for system to sleep +// returns number of seconds it took for sleep to be entered +// or -1 if sleep wasn't accomplished +static int wait_for_sleep() { + if(!run_sleep_tests) return 0; + + uint64_t before_diff = time_delta_ms(); + + for(int i = 0; i < 30; i++) { + uint64_t after_diff = time_delta_ms(); + + // on OSX, there's enough latency between calls to MCT and MAT + // when the system is going down for sleep for values to diverge a few ms + if(llabs((int64_t)before_diff - (int64_t)after_diff) > 2) { + return i + 1; + } + + sleep(1); + T_LOG("waited %d seconds for sleep...", i+1); + } + return -1; +} + +T_DECL(kevent_continuous_time_periodic_tick, "kevent(EVFILT_TIMER with NOTE_MACH_CONTINUOUS_TIME)"){ + mach_timebase_info(&tb_info); + int kq; + T_ASSERT_POSIX_SUCCESS((kq = kqueue()), NULL); + + struct kevent64_s change = {0}; + EV_SET64(&change, 1, EVFILT_TIMER, EV_ADD, NOTE_SECONDS | NOTE_MACH_CONTINUOUS_TIME, 4, 0, 0, 0); 
+ T_LOG("EV_SET(&change, 1, EVFILT_TIMER, EV_ADD, NOTE_SECONDS | NOTE_MACH_CONTINUOUS_TIME, 4, 0, 0, 0);"); + + T_ASSERT_POSIX_ZERO(kevent64(kq, &change, 1, NULL, 0, 0, NULL), NULL); + + uint64_t abs_then = mach_absolute_time(); + uint64_t cnt_then = mach_continuous_time();; + + trigger_sleep(1); + int sleep_secs = wait_for_sleep(); + + struct kevent64_s event = {0}; + T_WITH_ERRNO; T_ASSERT_EQ(kevent64(kq, NULL, 0, &event, 1, 0, NULL), 1, "kevent() should have returned one event"); + T_LOG("event = {.ident = %llx, .filter = %d, .flags = %d, .fflags = %d, .data = %lld, .udata = %lld}", event.ident, event.filter, event.flags, event.fflags, event.data, event.udata); + T_ASSERT_EQ(event.flags & EV_ERROR, 0, "event should not have EV_ERROR set: %s", event.flags & EV_ERROR ? strerror((int)event.data) : "no error"); + + uint64_t abs_now = mach_absolute_time(); + uint64_t cnt_now = mach_continuous_time();; + uint64_t ct_ms_progressed = tick_to_ms(cnt_now - cnt_then); + uint64_t ab_ms_progressed = tick_to_ms(abs_now - abs_then); + + T_LOG("ct progressed %llu ms, abs progressed %llu ms", ct_ms_progressed, tick_to_ms(abs_now - abs_then)); + + if (run_sleep_tests) { + T_ASSERT_GT(llabs((int64_t)ct_ms_progressed - (int64_t)ab_ms_progressed), 500LL, "should have > 500ms difference between MCT and MAT"); + } else { + T_ASSERT_LT(llabs((int64_t)ct_ms_progressed - (int64_t)ab_ms_progressed), 10LL, "should have < 10ms difference between MCT and MAT"); + } + + if (sleep_secs < 4) { + T_ASSERT_LT(llabs((int64_t)ct_ms_progressed - 4000), 100LL, "mach_continuous_time should progress ~4 seconds (+/- 100ms) between sleeps"); + } + + sleep(1); + + EV_SET64(&change, 1, EVFILT_TIMER, EV_DELETE, 0, 0, 0, 0, 0); + T_LOG("EV_SET(&change, 1, EVFILT_TIMER, EV_DELETE, 0, 0, 0);"); + T_ASSERT_EQ(kevent64(kq, &change, 1, NULL, 0, 0, NULL), 0, NULL); + + T_ASSERT_POSIX_ZERO(close(kq), NULL); +} + +T_DECL(kevent_continuous_time_absolute, "kevent(EVFILT_TIMER with NOTE_MACH_CONTINUOUS_TIME and 
NOTE_ABSOLUTE)"){ + mach_timebase_info(&tb_info); + + int kq; + T_ASSERT_POSIX_SUCCESS((kq = kqueue()), NULL); + + struct timeval tv; + gettimeofday(&tv, NULL); + uint64_t nowus = (uint64_t)tv.tv_sec * USEC_PER_SEC + (uint64_t)tv.tv_usec; + uint64_t fire_at = (3*USEC_PER_SEC) + nowus; + + uint64_t cnt_now = mach_continuous_time(); + uint64_t cnt_then = cnt_now + ms_to_tick(3000); + + T_LOG("currently is %llu, firing at %llu", nowus, fire_at); + + struct kevent64_s change = {0}; + EV_SET64(&change, 2, EVFILT_TIMER, EV_ADD, NOTE_MACH_CONTINUOUS_TIME | NOTE_ABSOLUTE | NOTE_USECONDS, fire_at, 0, 0, 0); + T_LOG("EV_SET(&change, 2, EVFILT_TIMER, EV_ADD, NOTE_MACH_CONTINUOUS_TIME | NOTE_ABSOLUTE | NOTE_USECONDS, fire_at, 0);"); + + T_ASSERT_EQ(kevent64(kq, &change, 1, NULL, 0, 0, NULL), 0, NULL); + + T_LOG("testing NOTE_MACH_CONTINUOUS_TIME | NOTE_ABSOLUTE between sleep"); + + trigger_sleep(1); + + struct timespec timeout = { + .tv_sec = 10, + .tv_nsec = 0 + }; + struct kevent64_s event = {0}; + T_ASSERT_EQ(kevent64(kq, NULL, 0, &event, 1, 0, &timeout), 1, "kevent() should have returned one event"); + T_LOG("event = {.ident = %llx, .filter = %d, .flags = %d, .fflags = %d, .data = %lld, .udata = %lld}", event.ident, event.filter, event.flags, event.fflags, event.data, event.udata); + T_ASSERT_EQ(event.flags & EV_ERROR, 0, "event should not have EV_ERROR set: %s", event.flags & EV_ERROR ? 
strerror((int)event.data) : "no error"); + + uint64_t elapsed_ms = tick_to_ms(mach_continuous_time() - cnt_now); + int64_t missed_by = tick_to_ns((int64_t)mach_continuous_time() - (int64_t)cnt_then) / 1000000; + + // ~1/2 second is about as good as we'll get + T_ASSERT_LT(llabs(missed_by), 500LL, "timer should pop 3 sec in the future, popped after %lldms", elapsed_ms); + + T_ASSERT_EQ(event.data, 1LL, NULL); + + T_ASSERT_EQ(event.ident, 2ULL, NULL); + + // try getting a periodic tick out of kq + T_ASSERT_EQ(kevent64(kq, NULL, 0, &event, 1, 0, &timeout), 0, NULL); + T_ASSERT_EQ(event.flags & EV_ERROR, 0, "event should not have EV_ERROR set: %s", event.flags & EV_ERROR ? strerror((int)event.data) : "no error"); + + T_ASSERT_POSIX_ZERO(close(kq), NULL); +} + +T_DECL(kevent_continuous_time_pops, "kevent(EVFILT_TIMER with NOTE_MACH_CONTINUOUS_TIME with multiple pops)"){ + // have to throttle rate at which pmset is called + sleep(2); + + mach_timebase_info(&tb_info); + + int kq; + T_ASSERT_POSIX_SUCCESS((kq = kqueue()), NULL); + + // test that periodic ticks accumulate while asleep + struct kevent64_s change = {0}; + EV_SET64(&change, 3, EVFILT_TIMER, EV_ADD, NOTE_MACH_CONTINUOUS_TIME, 100, 0, 0, 0); // tick every 100 ms + T_LOG("EV_SET(&change, 3, EVFILT_TIMER, EV_ADD, NOTE_MACH_CONTINUOUS_TIME, 100, 0);"); + + // wait for first pop, then sleep + T_ASSERT_EQ(kevent64(kq, &change, 1, NULL, 0, 0, NULL), 0, NULL); + + struct kevent64_s event = {0}; + T_ASSERT_EQ(kevent64(kq, NULL, 0, &event, 1, 0, NULL), 1, "kevent() should have returned one event"); + T_LOG("event = {.ident = %llx, .filter = %d, .flags = %d, .fflags = %d, .data = %lld, .udata = %llu}", event.ident, event.filter, event.flags, event.fflags, event.data, event.udata); + T_ASSERT_EQ(event.flags & EV_ERROR, 0, "should not have EV_ERROR set: %s", event.flags & EV_ERROR ? 
strerror((int)event.data) : "no error"); + T_ASSERT_EQ(event.ident, 3ULL, NULL); + + uint64_t cnt_then = mach_continuous_time(); + trigger_sleep(2); + + int sleep_secs = 0; + if(run_sleep_tests) { + sleep_secs = wait_for_sleep(); + } + else { + // simulate 2 seconds of system "sleep" + sleep(2); + } + + uint64_t cnt_now = mach_continuous_time(); + + uint64_t ms_elapsed = tick_to_ms(cnt_now - cnt_then); + if(run_sleep_tests) { + T_ASSERT_LT(llabs((int64_t)ms_elapsed - 2000LL), 500LL, "slept for %llums, expected 2000ms (astris is connected?)", ms_elapsed); + } + + T_ASSERT_EQ(kevent64(kq, NULL, 0, &event, 1, 0, NULL), 1, "kevent() should have returned one event"); + T_LOG("event = {.ident = %llx, .filter = %d, .flags = %d, .fflags = %d, .data = %lld, .udata = %llu}", event.ident, event.filter, event.flags, event.fflags, event.data, event.udata); + T_ASSERT_EQ(event.ident, 3ULL, NULL); + + uint64_t expected_pops = ms_elapsed / 100; + uint64_t got_pops = (uint64_t)event.data; + + T_ASSERT_GE(got_pops, expected_pops - 1, "tracking pops while asleep"); + T_ASSERT_POSIX_ZERO(close(kq), NULL); +} diff --git a/tools/tests/darwintests/kperf.c b/tools/tests/darwintests/kperf.c new file mode 100644 index 000000000..6e2c17b03 --- /dev/null +++ b/tools/tests/darwintests/kperf.c @@ -0,0 +1,319 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kperf_helpers.h" + +#define PERF_STK_KHDR UINT32_C(0x25020014) +#define PERF_STK_UHDR UINT32_C(0x25020018) + +/* KDEBUG TRIGGER */ + +#define KDEBUG_TRIGGER_TIMEOUT_NS (10 * NSEC_PER_SEC) + +#define NON_TRIGGER_CLASS UINT8_C(0xfd) +#define NON_TRIGGER_SUBCLASS UINT8_C(0xff) +#define NON_TRIGGER_CODE UINT8_C(0xff) + +#define NON_TRIGGER_EVENT \ + (KDBG_EVENTID(NON_TRIGGER_CLASS, NON_TRIGGER_SUBCLASS, NON_TRIGGER_CODE)) + +static void +expect_kdebug_trigger(const char *filter_desc, const uint32_t *debugids, + unsigned int n_debugids) +{ + __block int missing_kernel_stacks = 0; + 
__block int missing_user_stacks = 0; + ktrace_session_t s; + kperf_kdebug_filter_t filter; + + s = ktrace_session_create(); + T_QUIET; T_ASSERT_NOTNULL(s, NULL); + + ktrace_events_single(s, PERF_STK_KHDR, ^(struct trace_point *tp) { + missing_kernel_stacks--; + T_LOG("saw kernel stack with %lu frames, flags = %#lx", tp->arg2, + tp->arg1); + }); + ktrace_events_single(s, PERF_STK_UHDR, ^(struct trace_point *tp) { + missing_user_stacks--; + T_LOG("saw user stack with %lu frames, flags = %#lx", tp->arg2, + tp->arg1); + }); + + for (unsigned int i = 0; i < n_debugids; i++) { + ktrace_events_single(s, debugids[i], ^(struct trace_point *tp) { + missing_kernel_stacks++; + missing_user_stacks++; + T_LOG("saw event with debugid 0x%" PRIx32, tp->debugid); + }); + } + + ktrace_events_single(s, NON_TRIGGER_EVENT, + ^(__unused struct trace_point *tp) + { + ktrace_end(s, 0); + }); + + ktrace_set_completion_handler(s, ^{ + T_EXPECT_LE(missing_kernel_stacks, 0, NULL); + T_EXPECT_LE(missing_user_stacks, 0, NULL); + + ktrace_session_destroy(s); + T_END; + }); + + /* configure kperf */ + + kperf_reset(); + + (void)kperf_action_count_set(1); + T_ASSERT_POSIX_SUCCESS(kperf_action_samplers_set(1, + KPERF_SAMPLER_KSTACK | KPERF_SAMPLER_USTACK), NULL); + + filter = kperf_kdebug_filter_create(); + T_ASSERT_NOTNULL(filter, NULL); + + T_ASSERT_POSIX_SUCCESS(kperf_kdebug_action_set(1), NULL); + T_ASSERT_POSIX_SUCCESS(kperf_kdebug_filter_add_desc(filter, filter_desc), + NULL); + T_ASSERT_POSIX_SUCCESS(kperf_kdebug_filter_set(filter), NULL); + kperf_kdebug_filter_destroy(filter); + + T_ASSERT_POSIX_SUCCESS(kperf_sample_set(1), NULL); + + T_ASSERT_POSIX_ZERO(ktrace_start(s, dispatch_get_main_queue()), NULL); + + /* trace the triggering debugids */ + + for (unsigned int i = 0; i < n_debugids; i++) { + T_ASSERT_POSIX_SUCCESS(kdebug_trace(debugids[i], 0, 0, 0, 0), NULL); + } + + T_ASSERT_POSIX_SUCCESS(kdebug_trace(NON_TRIGGER_EVENT, 0, 0, 0, 0), NULL); + + 
dispatch_after(dispatch_time(DISPATCH_TIME_NOW, KDEBUG_TRIGGER_TIMEOUT_NS), + dispatch_get_main_queue(), ^(void) + { + ktrace_end(s, 1); + }); +} + +#define TRIGGER_CLASS UINT8_C(0xfe) +#define TRIGGER_CLASS_END UINT8_C(0xfd) +#define TRIGGER_SUBCLASS UINT8_C(0xff) +#define TRIGGER_CODE UINT8_C(0) +#define TRIGGER_DEBUGID \ + (KDBG_EVENTID(TRIGGER_CLASS, TRIGGER_SUBCLASS, TRIGGER_CODE)) + +T_DECL(kdebug_trigger_classes, "test that kdebug trigger samples on classes", + T_META_ASROOT(YES)) +{ + const uint32_t class_debugids[] = { + KDBG_EVENTID(TRIGGER_CLASS, 1, 1), + KDBG_EVENTID(TRIGGER_CLASS, 2, 1), + KDBG_EVENTID(TRIGGER_CLASS_END, 1, 1) | DBG_FUNC_END, + KDBG_EVENTID(TRIGGER_CLASS_END, 2, 1) | DBG_FUNC_END, + }; + + expect_kdebug_trigger("C0xfe,C0xfdr", class_debugids, + sizeof(class_debugids) / sizeof(class_debugids[0])); + dispatch_main(); +} + +T_DECL(kdebug_trigger_subclasses, + "test that kdebug trigger samples on subclasses", + T_META_ASROOT(YES)) +{ + const uint32_t subclass_debugids[] = { + KDBG_EVENTID(TRIGGER_CLASS, TRIGGER_SUBCLASS, 0), + KDBG_EVENTID(TRIGGER_CLASS, TRIGGER_SUBCLASS, 1), + KDBG_EVENTID(TRIGGER_CLASS_END, TRIGGER_SUBCLASS, 0) | DBG_FUNC_END, + KDBG_EVENTID(TRIGGER_CLASS_END, TRIGGER_SUBCLASS, 1) | DBG_FUNC_END + }; + + expect_kdebug_trigger("S0xfeff,S0xfdffr", subclass_debugids, + sizeof(subclass_debugids) / sizeof(subclass_debugids[0])); + dispatch_main(); +} + +T_DECL(kdebug_trigger_debugids, "test that kdebug trigger samples on debugids", + T_META_ASROOT(YES)) +{ + const uint32_t debugids[] = { + TRIGGER_DEBUGID + }; + + expect_kdebug_trigger("D0xfeff0000", debugids, + sizeof(debugids) / sizeof(debugids[0])); + dispatch_main(); +} + +/* + * TODO Set a single function specifier filter, expect not to trigger of all + * events from that class. 
+ */ + +T_DECL(kdbg_callstacks, "test that the kdbg_callstacks samples on syscalls", + T_META_ASROOT(YES)) +{ + ktrace_session_t s; + __block bool saw_user_stack = false; + + s = ktrace_session_create(); + T_ASSERT_NOTNULL(s, NULL); + + /* + * Make sure BSD events are traced in order to trigger samples on syscalls. + */ + ktrace_events_class(s, DBG_BSD, + ^void(__unused struct trace_point *tp) {}); + + ktrace_events_single(s, PERF_STK_UHDR, ^(__unused struct trace_point *tp) { + saw_user_stack = true; + ktrace_end(s, 1); + }); + + ktrace_set_completion_handler(s, ^{ + ktrace_session_destroy(s); + + T_EXPECT_TRUE(saw_user_stack, + "saw user stack after configuring kdbg_callstacks"); + + /* + * Ensure user stacks are not sampled after resetting kdbg_callstacks. + */ + ktrace_session_t s_after = ktrace_session_create(); + T_ASSERT_NOTNULL(s_after, NULL); + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wdeprecated-declarations" + T_ASSERT_POSIX_SUCCESS(kperf_kdbg_callstacks_set(0), NULL); +#pragma clang diagnostic pop + + ktrace_events_class(s_after, DBG_BSD, + ^void(__unused struct trace_point *tp) {}); + + __block bool saw_extra_stack = false; + + ktrace_events_single(s_after, PERF_STK_UHDR, + ^(__unused struct trace_point *tp) + { + saw_extra_stack = true; + ktrace_end(s_after, 1); + }); + + ktrace_set_completion_handler(s_after, ^(void) { + ktrace_session_destroy(s_after); + T_EXPECT_FALSE(saw_extra_stack, + "saw user stack after disabling kdbg_callstacks)"); + kperf_reset(); + T_END; + }); + + T_ASSERT_POSIX_ZERO(ktrace_start(s_after, dispatch_get_main_queue()), + NULL); + + dispatch_after(dispatch_time(DISPATCH_TIME_NOW, 1 * NSEC_PER_SEC), + dispatch_get_main_queue(), ^(void) + { + ktrace_end(s_after, 1); + }); + }); + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wdeprecated-declarations" + T_ASSERT_POSIX_SUCCESS(kperf_kdbg_callstacks_set(1), NULL); +#pragma clang diagnostic pop + + T_ASSERT_POSIX_ZERO(ktrace_start(s, 
dispatch_get_main_queue()), NULL); + + dispatch_after(dispatch_time(DISPATCH_TIME_NOW, 10 * NSEC_PER_SEC), + dispatch_get_main_queue(), ^(void) + { + ktrace_end(s, 1); + }); + + dispatch_main(); +} + +/* + * PET mode + */ + +#define STACKS_WAIT_DURATION_NS (3 * NSEC_PER_SEC) + +static void +expect_stacks_traced(void (^cb)(void)) +{ + ktrace_session_t s; + + s = ktrace_session_create(); + T_QUIET; T_ASSERT_NOTNULL(s, "ktrace_session_create"); + + __block unsigned int user_stacks = 0; + __block unsigned int kernel_stacks = 0; + + ktrace_events_single(s, PERF_STK_UHDR, ^(__unused struct trace_point *tp) { + user_stacks++; + }); + ktrace_events_single(s, PERF_STK_KHDR, ^(__unused struct trace_point *tp) { + kernel_stacks++; + }); + + ktrace_set_completion_handler(s, ^(void) { + ktrace_session_destroy(s); + T_EXPECT_GT(user_stacks, 0U, NULL); + T_EXPECT_GT(kernel_stacks, 0U, NULL); + cb(); + }); + + T_QUIET; T_ASSERT_POSIX_SUCCESS(kperf_sample_set(1), NULL); + + T_ASSERT_POSIX_ZERO(ktrace_start(s, dispatch_get_main_queue()), NULL); + + dispatch_after(dispatch_time(DISPATCH_TIME_NOW, STACKS_WAIT_DURATION_NS), + dispatch_get_main_queue(), ^(void) + { + kperf_reset(); + ktrace_end(s, 0); + }); +} + +T_DECL(pet, "test that PET mode samples kernel and user stacks", + T_META_ASROOT(YES)) +{ + configure_kperf_stacks_timer(-1, 10); + T_ASSERT_POSIX_SUCCESS(kperf_timer_pet_set(0), NULL); + + expect_stacks_traced(^(void) { + T_END; + }); + + dispatch_main(); +} + +T_DECL(lightweight_pet, + "test that lightweight PET mode samples kernel and user stacks", + T_META_ASROOT(YES)) +{ + int set = 1; + + configure_kperf_stacks_timer(-1, 10); + T_ASSERT_POSIX_SUCCESS(sysctlbyname("kperf.lightweight_pet", NULL, NULL, + &set, sizeof(set)), NULL); + T_ASSERT_POSIX_SUCCESS(kperf_timer_pet_set(0), NULL); + + expect_stacks_traced(^(void) { + T_END; + }); + + dispatch_main(); +} diff --git a/tools/tests/darwintests/kperf_backtracing.c b/tools/tests/darwintests/kperf_backtracing.c new file mode 
100644 index 000000000..37773a57f --- /dev/null +++ b/tools/tests/darwintests/kperf_backtracing.c @@ -0,0 +1,368 @@ +#include +#include +#include +#include +#include +#include + +#include "kperf_helpers.h" + +#define PERF_STK_KHDR UINT32_C(0x25020014) +#define PERF_STK_UHDR UINT32_C(0x25020018) +#define PERF_STK_KDATA UINT32_C(0x2502000c) +#define PERF_STK_UDATA UINT32_C(0x25020010) + +static void +expect_frame(const char **bt, unsigned int bt_len, CSSymbolRef symbol, + unsigned long addr, unsigned int bt_idx, unsigned int max_frames) +{ + const char *name; + unsigned int frame_idx = max_frames - bt_idx - 1; + + if (!bt[frame_idx]) { + T_LOG("frame %2u: skipping system frame", frame_idx); + return; + } + + if (CSIsNull(symbol)) { + T_FAIL("invalid symbol for address %#lx at frame %d", addr, frame_idx); + return; + } + + if (frame_idx >= bt_len) { + T_FAIL("unexpected frame '%s' (%#lx) at index %u", + CSSymbolGetName(symbol), addr, frame_idx); + return; + } + + name = CSSymbolGetName(symbol); + T_QUIET; T_ASSERT_NOTNULL(name, NULL); + T_EXPECT_EQ_STR(name, bt[frame_idx], + "frame %2u: saw '%s', expected '%s'", + frame_idx, name, bt[frame_idx]); +} + +/* + * Expect to see user and kernel stacks with a known signature. 
+ */ +static void +expect_backtrace(ktrace_session_t s, uint64_t tid, unsigned int *stacks_seen, + bool kern, const char **bt, unsigned int bt_len) +{ + CSSymbolicatorRef symb; + uint32_t hdr_debugid; + uint32_t data_debugid; + __block unsigned int stacks = 0; + __block unsigned int frames = 0; + __block unsigned int hdr_frames = 0; + + if (kern) { + static CSSymbolicatorRef kern_symb; + static dispatch_once_t kern_symb_once; + + hdr_debugid = PERF_STK_KHDR; + data_debugid = PERF_STK_KDATA; + + dispatch_once(&kern_symb_once, ^(void) { + kern_symb = CSSymbolicatorCreateWithMachKernel(); + T_QUIET; T_ASSERT_FALSE(CSIsNull(kern_symb), NULL); + }); + symb = kern_symb; + } else { + static CSSymbolicatorRef user_symb; + static dispatch_once_t user_symb_once; + + hdr_debugid = PERF_STK_UHDR; + data_debugid = PERF_STK_UDATA; + + dispatch_once(&user_symb_once, ^(void) { + user_symb = CSSymbolicatorCreateWithTask(mach_task_self()); + T_QUIET; T_ASSERT_FALSE(CSIsNull(user_symb), NULL); + T_QUIET; T_ASSERT_TRUE(CSSymbolicatorIsTaskValid(user_symb), NULL); + }); + symb = user_symb; + } + + ktrace_events_single(s, hdr_debugid, ^(struct trace_point *tp) { + if (tid != 0 && tid != tp->threadid) { + return; + } + + stacks++; + if (!(tp->arg1 & 1)) { + T_FAIL("invalid %s stack on thread %#lx", kern ? "kernel" : "user", + tp->threadid); + return; + } + + hdr_frames = (unsigned int)tp->arg2; + /* ignore extra link register or value pointed to by stack pointer */ + hdr_frames -= 1; + + T_QUIET; T_EXPECT_EQ(hdr_frames, bt_len, + "number of frames in header"); + + T_LOG("%s stack seen", kern ? 
"kernel" : "user"); + frames = 0; + }); + + ktrace_events_single(s, data_debugid, ^(struct trace_point *tp) { + if (tid != 0 && tid != tp->threadid) { + return; + } + + for (int i = 0; i < 4 && frames < hdr_frames; i++, frames++) { + unsigned long addr = (&tp->arg1)[i]; + CSSymbolRef symbol = CSSymbolicatorGetSymbolWithAddressAtTime( + symb, addr, kCSNow); + + expect_frame(bt, bt_len, symbol, addr, frames, hdr_frames); + } + + /* saw the end of the user stack */ + if (hdr_frames == frames) { + *stacks_seen += 1; + if (!kern) { + ktrace_end(s, 1); + } + } + }); +} + +#define TRIGGERING_DEBUGID (0xfeff0f00) + +/* + * These functions must return an int to avoid the function prologue being + * hoisted out of the path to the spin (breaking being able to get a good + * backtrace). + */ +static int __attribute__((noinline,not_tail_called)) +recurse_a(bool spin, unsigned int frames); +static int __attribute__((noinline,not_tail_called)) +recurse_b(bool spin, unsigned int frames); + +static int __attribute__((noinline,not_tail_called)) +recurse_a(bool spin, unsigned int frames) +{ + if (frames == 0) { + if (spin) { + for (;;); + } else { + kdebug_trace(TRIGGERING_DEBUGID, 0, 0, 0, 0); + return 0; + } + } + + return recurse_b(spin, frames - 1) + 1; +} + +static int __attribute__((noinline,not_tail_called)) +recurse_b(bool spin, unsigned int frames) +{ + if (frames == 0) { + if (spin) { + for (;;); + } else { + kdebug_trace(TRIGGERING_DEBUGID, 0, 0, 0, 0); + return 0; + } + } + + return recurse_a(spin, frames - 1) + 1; +} + +#define USER_FRAMES (12) + +#if defined(__x86_64__) +#define RECURSE_START_OFFSET (4) +#else /* defined(__x86_64__) */ +#define RECURSE_START_OFFSET (3) +#endif /* defined(__x86_64__) */ + +static const char *user_bt[USER_FRAMES] = { +#if defined(__x86_64__) + NULL, +#endif /* defined(__x86_64__) */ + NULL, NULL, + "backtrace_thread", + "recurse_a", "recurse_b", "recurse_a", "recurse_b", + "recurse_a", "recurse_b", "recurse_a", +#if !defined(__x86_64__) + 
"recurse_b", +#endif /* !defined(__x86_64__) */ + NULL +}; + +#if defined(__x86_64__) + +#define KERNEL_FRAMES (2) +static const char *kernel_bt[KERNEL_FRAMES] = { + "unix_syscall64", "kdebug_trace64" +}; + +#else +#error "architecture unsupported" +#endif /* defined(__arm__) */ + +static dispatch_once_t backtrace_start_once; +static dispatch_semaphore_t backtrace_start; + +static void * +backtrace_thread(void *arg) +{ + bool spin; + unsigned int calls; + + spin = (bool)arg; + dispatch_semaphore_wait(backtrace_start, DISPATCH_TIME_FOREVER); + + /* + * backtrace_thread, recurse_a, recurse_b, ...[, __kdebug_trace64] + * + * Always make one less call for this frame (backtrace_thread). + */ + calls = USER_FRAMES - RECURSE_START_OFFSET - 1 /* backtrace_thread */; + if (spin) { + /* + * Spinning doesn't end up calling __kdebug_trace64. + */ + calls -= 1; + } + + T_LOG("backtrace thread calling into %d frames (already at %d frames)", + calls, RECURSE_START_OFFSET); + (void)recurse_a(spin, calls); + return NULL; +} + +static uint64_t +create_backtrace_thread(bool spin) +{ + pthread_t thread; + uint64_t tid; + + dispatch_once(&backtrace_start_once, ^(void) { + backtrace_start = dispatch_semaphore_create(0); + }); + + T_QUIET; T_ASSERT_POSIX_ZERO(pthread_create(&thread, NULL, backtrace_thread, + (void *)spin), NULL); + T_QUIET; T_ASSERT_POSIX_ZERO(pthread_threadid_np(thread, &tid), NULL); + + return tid; +} + +static void +start_backtrace_thread(void) +{ + T_QUIET; T_ASSERT_NOTNULL(backtrace_start, + "thread to backtrace created before starting it"); + dispatch_semaphore_signal(backtrace_start); +} + +#define TEST_TIMEOUT_NS (5 * NSEC_PER_SEC) + +T_DECL(kdebug_trigger_backtraces, + "test that backtraces from kdebug trigger are correct", + T_META_ASROOT(YES)) +{ + static unsigned int stacks_seen = 0; + ktrace_session_t s; + kperf_kdebug_filter_t filter; + uint64_t tid; + + s = ktrace_session_create(); + T_ASSERT_NOTNULL(s, "ktrace session was created"); + + 
T_ASSERT_POSIX_ZERO(ktrace_filter_pid(s, getpid()), NULL); + + tid = create_backtrace_thread(false); + expect_backtrace(s, tid, &stacks_seen, false, user_bt, USER_FRAMES); + expect_backtrace(s, tid, &stacks_seen, true, kernel_bt, KERNEL_FRAMES); + + /* + * The triggering event must be traced (and thus registered with libktrace) + * to get backtraces. + */ + ktrace_events_single(s, TRIGGERING_DEBUGID, + ^(__unused struct trace_point *tp){ }); + + ktrace_set_completion_handler(s, ^(void) { + T_EXPECT_GE(stacks_seen, 2U, "saw both kernel and user stacks"); + ktrace_session_destroy(s); + kperf_reset(); + T_END; + }); + + filter = kperf_kdebug_filter_create(); + T_ASSERT_NOTNULL(filter, "kperf kdebug filter was created"); + + T_QUIET; T_ASSERT_POSIX_SUCCESS(kperf_kdebug_filter_add_debugid(filter, + TRIGGERING_DEBUGID), NULL); + T_QUIET; T_ASSERT_POSIX_SUCCESS(kperf_kdebug_filter_set(filter), NULL); + (void)kperf_action_count_set(1); + T_QUIET; T_ASSERT_POSIX_SUCCESS(kperf_action_samplers_set(1, + KPERF_SAMPLER_USTACK | KPERF_SAMPLER_KSTACK), NULL); + T_QUIET; T_ASSERT_POSIX_SUCCESS(kperf_kdebug_action_set(1), NULL); + kperf_kdebug_filter_destroy(filter); + + T_ASSERT_POSIX_SUCCESS(kperf_sample_set(1), NULL); + + T_ASSERT_POSIX_ZERO(ktrace_start(s, dispatch_get_main_queue()), NULL); + + start_backtrace_thread(); + + dispatch_after(dispatch_time(DISPATCH_TIME_NOW, TEST_TIMEOUT_NS), + dispatch_get_main_queue(), ^(void) + { + ktrace_end(s, 0); + }); + + dispatch_main(); +} + +T_DECL(user_backtraces_timer, + "test that user backtraces on a timer are correct", + T_META_ASROOT(YES)) +{ + static unsigned int stacks_seen = 0; + ktrace_session_t s; + uint64_t tid; + + s = ktrace_session_create(); + T_QUIET; T_ASSERT_NOTNULL(s, "ktrace_session_create"); + + ktrace_filter_pid(s, getpid()); + + configure_kperf_stacks_timer(getpid(), 10); + + tid = create_backtrace_thread(true); + /* not calling kdebug_trace(2) on the last frame */ + expect_backtrace(s, tid, &stacks_seen, false, 
user_bt, USER_FRAMES - 1); + + ktrace_set_completion_handler(s, ^(void) { + T_EXPECT_GE(stacks_seen, 1U, "saw at least one stack"); + ktrace_session_destroy(s); + kperf_reset(); + T_END; + }); + + T_QUIET; T_ASSERT_POSIX_SUCCESS(kperf_sample_set(1), NULL); + + T_ASSERT_POSIX_ZERO(ktrace_start(s, dispatch_get_main_queue()), NULL); + + start_backtrace_thread(); + + dispatch_after(dispatch_time(DISPATCH_TIME_NOW, TEST_TIMEOUT_NS), + dispatch_get_main_queue(), ^(void) + { + ktrace_end(s, 0); + }); + + dispatch_main(); +} + +/* TODO test kernel stacks in all modes */ +/* TODO PET mode backtracing */ +/* TODO test deep stacks, further than 128 frames, make sure they are truncated */ +/* TODO test constrained stacks */ diff --git a/tools/tests/darwintests/kperf_helpers.c b/tools/tests/darwintests/kperf_helpers.c new file mode 100644 index 000000000..bf64f6bb8 --- /dev/null +++ b/tools/tests/darwintests/kperf_helpers.c @@ -0,0 +1,25 @@ +#include "kperf_helpers.h" + +#include +#include +#include + +void +configure_kperf_stacks_timer(pid_t pid, unsigned int period_ms) +{ + kperf_reset(); + + (void)kperf_action_count_set(1); + (void)kperf_timer_count_set(1); + + T_ASSERT_POSIX_SUCCESS(kperf_action_samplers_set(1, + KPERF_SAMPLER_USTACK | KPERF_SAMPLER_KSTACK), NULL); + + if (pid != -1) { + T_ASSERT_POSIX_SUCCESS(kperf_action_filter_set_by_pid(1, pid), NULL); + } + + T_ASSERT_POSIX_SUCCESS(kperf_timer_action_set(0, 1), NULL); + T_ASSERT_POSIX_SUCCESS(kperf_timer_period_set(0, + kperf_ns_to_ticks(period_ms * NSEC_PER_MSEC)), NULL); +} diff --git a/tools/tests/darwintests/kperf_helpers.h b/tools/tests/darwintests/kperf_helpers.h new file mode 100644 index 000000000..466f3d9a7 --- /dev/null +++ b/tools/tests/darwintests/kperf_helpers.h @@ -0,0 +1,8 @@ +#ifndef KPERF_HELPERS_H +#define KPERF_HELPERS_H + +#include + +void configure_kperf_stacks_timer(pid_t pid, unsigned int period_ms); + +#endif /* !defined(KPERF_HELPERS_H) */ diff --git 
a/tools/tests/darwintests/kqueue_fifo_18776047.c b/tools/tests/darwintests/kqueue_fifo_18776047.c new file mode 100644 index 000000000..4bcd3fc53 --- /dev/null +++ b/tools/tests/darwintests/kqueue_fifo_18776047.c @@ -0,0 +1,151 @@ +/* + * testname: kqueue_fifo + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define TMP_FILE_PATH "/tmp/test_kqueue_fifo_18776047" + +#define READ_BUFFER_LEN 256 + +#if defined(PLATFORM_WatchOS) +#define TOTAL_ITERATIONS 5000 +#else +#define TOTAL_ITERATIONS 10000 +#endif + +/* prototypes */ +int write_some_data(int fd); +int read_data(int fd); +void create_fifo(const char * filepath); +void kevent_one_shot(int kq, int fd, int filter); + +int +write_some_data(int fd) +{ + int retval = 0; + int count = 0; + int len = 5; + char * data = "ABCDE"; + while (true) { + errno = 0; + retval = (int)write(fd, data, (size_t)len); + if (retval < 0) { + if (errno == EAGAIN) { + if (len == 1) + return count; + else + len--; + } else { + T_ASSERT_FAIL("write to fd %d of %s of len %d failed.", fd, data, len); + abort(); + } + } else { + count += retval; + } + } +} + +int +read_data(int fd) +{ + int retval, count = 0; + char databuffer[READ_BUFFER_LEN]; + while (true) { + errno = 0; + retval = (int)read(fd, databuffer, READ_BUFFER_LEN); + if (retval < 0) { + if (errno == EAGAIN) { + return count; + } else { + T_ASSERT_FAIL("read from fd %d failed.", fd); + abort(); + } + } + count += retval; + } +} + +void +create_fifo(const char * filepath) +{ + struct stat f_stat; + int ret = 0; + errno = 0; + ret = stat(filepath, &f_stat); + if (ret == 0) { + /* if file exists, make sure its a fifo */ + T_ASSERT_TRUE(S_ISFIFO(f_stat.st_mode), "ensure %s is a fifo", filepath); + } else if (errno == ENOENT) { + ret = mkfifo(filepath, 0777); + T_ASSERT_POSIX_ZERO(ret, "creating a fifo at path %s", filepath); + } else { + T_ASSERT_FAIL("stat operation on %s", filepath); + } +} + +void +kevent_one_shot(int kq, int fd, int filter) 
+{ + int retval = 0; + struct timespec t_zero = {0, 0}; + struct kevent kev[1]; + + T_QUIET; + T_ASSERT_GE(kq, 0, "ensure kq is valid"); + T_LOG("kevent doing ONESHOT %s", filter == EVFILT_READ ? "read" : "write"); + + EV_SET(kev, fd, filter, EV_ADD | EV_ONESHOT, 0, 0, NULL); + retval = kevent(kq, kev, 1, NULL, 0, &t_zero); + T_QUIET; + T_ASSERT_POSIX_ZERO(retval, "ONESHOT kevent for fd %d, filter %d", fd, filter); +} + +T_DECL(kqueue_fifo_18776047, "Tests kqueue, kevent for watching a fifo.", T_META("owner", "Core Kernel Team")) +{ + struct kevent kev[1]; + int read_fd, write_fd, kq; + int retval = 0; + int iter = 0; + const char * fpath = TMP_FILE_PATH; + T_SETUPBEGIN; + create_fifo(fpath); + + kq = kqueue(); + T_ASSERT_GE(kq, 0, "create a kqueue"); + + read_fd = open(fpath, O_RDONLY | O_APPEND | O_NONBLOCK); + T_ASSERT_POSIX_SUCCESS(read_fd, "opening read fd on fifo."); + + write_fd = open(fpath, O_WRONLY | O_APPEND | O_NONBLOCK); + T_ASSERT_POSIX_SUCCESS(write_fd, "opening write fd on fifo."); + + T_SETUPEND; + + kevent_one_shot(kq, write_fd, EVFILT_WRITE); + kevent_one_shot(kq, read_fd, EVFILT_READ); + + while (iter++ < TOTAL_ITERATIONS) { + retval = kevent(kq, NULL, 0, kev, 1, NULL); + T_QUIET; + T_ASSERT_GE(retval, 0, "kevent on kq %d", kq); + + if (kev[0].ident == (uintptr_t)write_fd) { + retval = write_some_data(write_fd); + T_LOG("writer ready iter: %d wrote %d bytes", iter, retval); + kevent_one_shot(kq, write_fd, EVFILT_WRITE); + } else if (kev[0].ident == (uintptr_t)read_fd) { + retval = read_data(read_fd); + T_LOG("reader ready iter: %d read %d bytes", iter, retval); + kevent_one_shot(kq, read_fd, EVFILT_READ); + } + } + T_PASS("kqueue_fifo_18776047 PASSED"); +} diff --git a/tools/tests/darwintests/mach_boottime_usec.c b/tools/tests/darwintests/mach_boottime_usec.c new file mode 100644 index 000000000..85a1f85e0 --- /dev/null +++ b/tools/tests/darwintests/mach_boottime_usec.c @@ -0,0 +1,20 @@ +#include +#include +#include +#include +#include +#include 
+ +#include + +T_DECL(mach_boottime_usec, "mach_boottime_usec()", + T_META_ALL_VALID_ARCHS(YES)) +{ + uint64_t bt_usec = mach_boottime_usec(); + + struct timeval bt_tv; + size_t len = sizeof(bt_tv); + T_ASSERT_POSIX_SUCCESS(sysctlbyname("kern.boottime", &bt_tv, &len, NULL, 0), NULL); + + T_EXPECT_EQ((uint64_t)bt_tv.tv_sec * USEC_PER_SEC + (uint64_t)bt_tv.tv_usec, bt_usec, NULL); +} diff --git a/tools/tests/darwintests/mach_continuous_time.c b/tools/tests/darwintests/mach_continuous_time.c new file mode 100644 index 000000000..1afdb6d44 --- /dev/null +++ b/tools/tests/darwintests/mach_continuous_time.c @@ -0,0 +1,170 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +extern char **environ; + +static const int64_t one_mil = 1000*1000; + +#define to_ns(ticks) ((ticks * tb_info.numer) / (tb_info.denom)) +#define to_ms(ticks) (to_ns(ticks)/one_mil) + +static mach_timebase_info_data_t tb_info; + +static void +update(uint64_t *a, uint64_t *c) { + mach_get_times(a,c,NULL); +} + +T_DECL(mct_monotonic, "Testing mach_continuous_time returns sane, monotonic values", + T_META_ALL_VALID_ARCHS(YES)) +{ + mach_timebase_info(&tb_info); + + volatile uint64_t multiple_test = to_ms(mach_continuous_time()); + for(int i = 0; i < 10; i++) { + uint64_t tmp = to_ms(mach_continuous_time()); + T_ASSERT_GE(tmp, multiple_test, "mach_continuous_time must be monotonic"); + + // each successive call shouldn't be more than 50ms in the future + T_ASSERT_LE(tmp - multiple_test, 50ULL, "mach_continuous_time should not jump forward too fast"); + + multiple_test = tmp; + } +} + +T_DECL(mct_pause, "Testing mach_continuous_time and mach_absolute_time don't diverge") +{ + mach_timebase_info(&tb_info); + + uint64_t abs_now; + uint64_t cnt_now; + int before_diff, after_diff; + + update(&abs_now, &cnt_now); + before_diff = (int)(to_ms(cnt_now) - to_ms(abs_now)); + + sleep(1); + + update(&abs_now, &cnt_now); + after_diff = 
(int)(to_ms(cnt_now) - to_ms(abs_now)); + + T_ASSERT_LE(abs(after_diff - before_diff), 1, "mach_continuous_time and mach_absolute_time should not diverge"); +} + +T_DECL(mct_sleep, "Testing mach_continuous_time behavior over system sleep"){ +#ifndef MCT_SLEEP_TEST + T_SKIP("Skipping test that sleeps the device; compile with MCT_SLEEP_TEST define to enable."); +#endif + + mach_timebase_info(&tb_info); + + uint64_t abs_now; + uint64_t cnt_now; + int before_diff, after_diff = 0; + + T_LOG("Testing mach_continuous_time is ~5 seconds ahead of mach_absolute_time after 5 second sleep"); + update(&abs_now, &cnt_now); + before_diff = (int)(to_ms(cnt_now) - to_ms(abs_now)); + + // performs: + // pmset relative wake 5 + // pmset sleepnow + + pid_t pid; + int spawn_ret = 0; + time_t before_sleep = time(NULL); + int ct_ms_before_sleep = (int)to_ms(cnt_now); + int ab_ms_before_sleep = (int)to_ms(abs_now); + + char *const pmset1_args[] = {"/usr/bin/pmset", "relative", "wake", "5", NULL}; + T_ASSERT_POSIX_ZERO((spawn_ret = posix_spawn(&pid, pmset1_args[0], NULL, NULL, pmset1_args, environ)), NULL); + + T_ASSERT_EQ(waitpid(pid, &spawn_ret, 0), pid, "waitpid failed"); + T_ASSERT_EQ(spawn_ret, 0, "pmset relative wait 5 failed"); + + char *const pmset2_args[] = {"/usr/bin/pmset", "sleepnow", NULL}; + T_ASSERT_POSIX_ZERO((spawn_ret = posix_spawn(&pid, pmset2_args[0], NULL, NULL, pmset2_args, environ)), NULL); + + T_ASSERT_EQ(waitpid(pid, &spawn_ret, 0), pid, "waitpid failed"); + T_ASSERT_EQ(spawn_ret, 0, "pmset relative wait 5 failed"); + + // wait for device to sleep (up to 30 seconds) + for(int i = 0; i < 30; i++) { + update(&abs_now, &cnt_now); + after_diff = (int)(to_ms(cnt_now) - to_ms(abs_now)); + + // on OSX, there's enough latency between calls to MCT and MAT + // when the system is going down for sleep for values to diverge a few ms + if(abs(before_diff - after_diff) > 2) { + break; + } + + sleep(1); + T_LOG("waited %d seconds for sleep...", i+1); + } + + if((after_diff - 
before_diff) < 4000) { + T_LOG("Device slept for less than 4 seconds, did it really sleep? (%d ms change between abs and cont)", + after_diff - before_diff); + } + + time_t after_sleep = time(NULL); + + int cal_sleep_diff = (int)(double)difftime(after_sleep, before_sleep); + int ct_sleep_diff = ((int)to_ms(cnt_now) - ct_ms_before_sleep)/1000; + int ab_sleep_diff = ((int)to_ms(abs_now) - ab_ms_before_sleep)/1000; + + T_LOG("Calendar progressed: %d sec; continuous time progressed: %d sec; absolute time progressed %d sec", + cal_sleep_diff, ct_sleep_diff, ab_sleep_diff); + + T_ASSERT_LE(abs(ct_sleep_diff - cal_sleep_diff), 2, + "continuous time should progress at ~ same rate as calendar"); +} + +T_DECL(mct_settimeofday, "Testing mach_continuous_time behavior over settimeofday"){ + if (geteuid() != 0){ + T_SKIP("The settimeofday() test requires root privileges to run."); + } + mach_timebase_info(&tb_info); + + struct timeval saved_tv; + struct timezone saved_tz; + int before, after; + + T_ASSERT_POSIX_ZERO(gettimeofday(&saved_tv, &saved_tz), NULL); + + struct timeval forward_tv = saved_tv; + // move time forward by two minutes, ensure mach_continuous_time keeps + // chugging along with mach_absolute_time + forward_tv.tv_sec += 2*60; + + before = (int)to_ms(mach_continuous_time()); + T_ASSERT_POSIX_ZERO(settimeofday(&forward_tv, &saved_tz), NULL); + + after = (int)to_ms(mach_continuous_time()); + T_ASSERT_POSIX_ZERO(settimeofday(&saved_tv, &saved_tz), NULL); + + T_ASSERT_LT(abs(before - after), 1000, "mach_continuous_time should not jump more than 1s"); +} + +T_DECL(mct_aproximate, "Testing mach_continuous_approximate_time()", + T_META_ALL_VALID_ARCHS(YES)) +{ + mach_timebase_info(&tb_info); + + uint64_t absolute = to_ns(mach_continuous_time()); + uint64_t approximate = to_ns(mach_continuous_approximate_time()); + + T_EXPECT_LE(llabs((long long)absolute - (long long)approximate), (long long)(25*NSEC_PER_MSEC), NULL); +} diff --git 
a/tools/tests/darwintests/mach_get_times.c b/tools/tests/darwintests/mach_get_times.c new file mode 100644 index 000000000..915602b50 --- /dev/null +++ b/tools/tests/darwintests/mach_get_times.c @@ -0,0 +1,44 @@ +#include +#include +#include +#include + +#include +#include + +#define T_LOG_VERBOSE(...) + +#define timespec2nanosec(ts) ((uint64_t)((ts)->tv_sec) * NSEC_PER_SEC + (uint64_t)((ts)->tv_nsec)) + +T_DECL(mach_get_times, "mach_get_times()", + T_META_CHECK_LEAKS(NO), T_META_ALL_VALID_ARCHS(YES)) +{ + const int ITERATIONS = 500000 * dt_ncpu(); + struct timespec gtod_ts; + + uint64_t last_absolute, last_continuous, last_gtod; + T_QUIET; T_ASSERT_EQ(mach_get_times(&last_absolute, &last_continuous, >od_ts), KERN_SUCCESS, NULL); + last_gtod = timespec2nanosec(>od_ts); + + for (int i = 0; i < ITERATIONS; i++) { + uint64_t absolute, continuous, gtod; + T_QUIET; T_ASSERT_EQ(mach_get_times(&absolute, &continuous, >od_ts), KERN_SUCCESS, NULL); + gtod = timespec2nanosec(>od_ts); + + T_LOG_VERBOSE("[%d] abs: %llu.%09llu(+%llu)\tcont: %llu.%09llu(+%llu)\tgtod:%llu.%09llu(+%llu)", i, + absolute / NSEC_PER_SEC, absolute % NSEC_PER_SEC, absolute - last_absolute, + continuous / NSEC_PER_SEC, continuous % NSEC_PER_SEC, continuous - last_continuous, + gtod / NSEC_PER_SEC, gtod % NSEC_PER_SEC, gtod - last_gtod); + + T_QUIET; T_EXPECT_EQ(absolute - last_absolute, continuous - last_continuous, NULL); + + int64_t gtod_diff = (int64_t)gtod - (int64_t)last_gtod; + T_QUIET; T_ASSERT_LE((uint64_t)llabs(gtod_diff), NSEC_PER_SEC, NULL); + + last_absolute = absolute; + last_continuous = continuous; + last_gtod = gtod; + + gtod_ts.tv_sec = 0; gtod_ts.tv_nsec = 0; + } +} diff --git a/tools/tests/darwintests/mach_timebase_info.c b/tools/tests/darwintests/mach_timebase_info.c new file mode 100644 index 000000000..ad0a5c779 --- /dev/null +++ b/tools/tests/darwintests/mach_timebase_info.c @@ -0,0 +1,20 @@ +#include + +#include + +extern kern_return_t mach_timebase_info_trap(mach_timebase_info_t 
info); + +T_DECL(mach_timebase_info, "mach_timebase_info(_trap)", + T_META_ALL_VALID_ARCHS(YES)) +{ + mach_timebase_info_data_t a, b, c; + + T_ASSERT_EQ(KERN_SUCCESS, mach_timebase_info(&a), NULL); + T_ASSERT_EQ(KERN_SUCCESS, mach_timebase_info(&b), NULL); + T_ASSERT_EQ(KERN_SUCCESS, mach_timebase_info_trap(&c), NULL); + + T_EXPECT_EQ(a.numer, b.numer, NULL); + T_EXPECT_EQ(a.denom, b.denom, NULL); + T_EXPECT_EQ(a.numer, c.numer, NULL); + T_EXPECT_EQ(a.denom, c.denom, NULL); +} diff --git a/tools/tests/darwintests/perf_exit.c b/tools/tests/darwintests/perf_exit.c new file mode 100644 index 000000000..d45a6481b --- /dev/null +++ b/tools/tests/darwintests/perf_exit.c @@ -0,0 +1,89 @@ +#include +#include +#include +#include +#include + +#include + +// From bsd/sys/proc_internal.h +#define PID_MAX 99999 + +T_DECL(exit, "exit(2) time from syscall start to end", T_META_TYPE_PERF, T_META_CHECK_LEAKS(NO)) { + _Atomic static int ended = 0; + dispatch_queue_t spawn_queue; + + dt_stat_time_t s = dt_stat_time_create("time"); + + uint64_t *begin_ts = malloc(sizeof(uint64_t) * PID_MAX); + if (begin_ts == NULL) { + T_FAIL("Error allocating timestamp array"); + } + + ktrace_session_t session; + session = ktrace_session_create(); + if (session == NULL) { + T_FAIL("Error creating ktrace session"); + } + + ktrace_set_completion_handler(session, ^{ + free(begin_ts); + dt_stat_finalize(s); + T_END; + }); + + ktrace_set_signal_handler(session); + + // We are only interested by the process we launched + ktrace_filter_process(session, "true"); + + ktrace_events_single(session, (BSDDBG_CODE(DBG_BSD_EXCP_SC, 1) | DBG_FUNC_START), ^(ktrace_event_t e) { + pid_t pid = ktrace_get_pid_for_thread(session, e->threadid); + if (pid > PID_MAX) { + T_FAIL("Invalid pid returned by ktrace_get_pid_for_thread: %d\n", pid); + } + begin_ts[pid] = e->timestamp; + + }); + ktrace_events_single(session, (BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXIT) | DBG_FUNC_END), ^(ktrace_event_t e) { + pid_t pid = 
ktrace_get_pid_for_thread(session, e->threadid); + if (pid > PID_MAX) { + T_FAIL("Invalid pid returned by ktrace_get_pid_for_thread: %d\n", pid); + } + if (begin_ts[pid] == 0) { + return; + } + uint64_t delta = e->timestamp - begin_ts[pid]; + if (!dt_stat_stable(s)) { + dt_stat_mach_time_add(s, delta); + } + else { + ended = 1; + ktrace_end(session, 1); + } + }); + + int ret = ktrace_start(session, dispatch_get_main_queue()); + if (ret != 0) { + T_FAIL("Error starting ktrace"); + } + + // Spawn processes continuously until the test is over + spawn_queue = dispatch_queue_create("spawn_queue", NULL); + dispatch_async(spawn_queue, ^(void) { + while (!ended) { + pid_t pid; + int status; + char *args[] = {"/usr/bin/true", NULL}; + int err = posix_spawn(&pid, args[0], NULL, NULL, args, NULL); + if (err) + T_FAIL("posix_spawn returned %d", err); + + waitpid(pid, &status, 0); + if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) + T_FAIL("Child process of posix_spawn failed to run"); + } + }); + + dispatch_main(); +} diff --git a/tools/tests/darwintests/perf_spawn_fork.c b/tools/tests/darwintests/perf_spawn_fork.c new file mode 100644 index 000000000..c52605ff0 --- /dev/null +++ b/tools/tests/darwintests/perf_spawn_fork.c @@ -0,0 +1,67 @@ +#include +#include +#include +#include + + +#define SPAWN_MEASURE_LOOP(s) \ + char *args[] = {"/usr/bin/true", NULL}; \ + int err; \ + pid_t pid; \ + int status; \ + while (!dt_stat_stable(s)) { \ + T_STAT_MEASURE(s) { \ + err = posix_spawn(&pid, args[0], NULL, NULL, args, NULL); \ + } \ + if (err) { \ + T_FAIL("posix_spawn returned %d", err); \ + } \ + waitpid(pid, &status, 0); \ + if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { \ + T_FAIL("Child process of posix_spawn failed to run"); \ + } \ + } + +T_DECL(posix_spawn_platform_binary_latency, "posix_spawn platform binary latency", T_META_TYPE_PERF, T_META_CHECK_LEAKS(NO)) { + { + dt_stat_time_t s = dt_stat_time_create("time"); + SPAWN_MEASURE_LOOP(s); + dt_stat_finalize(s); + } + 
+ { + dt_stat_thread_cpu_time_t s = dt_stat_thread_cpu_time_create("on-cpu time"); + SPAWN_MEASURE_LOOP(s); + dt_stat_finalize(s); + } +} + +#define FORK_MEASURE_LOOP(s) \ + pid_t pid; \ + int status; \ + while (!dt_stat_stable(s)) { \ + T_STAT_MEASURE(s) { \ + pid = fork(); \ + if (pid == 0) \ + exit(0); \ + else if (pid == -1) \ + T_FAIL("fork returned -1"); \ + } \ + waitpid(pid, &status, 0); \ + if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { \ + T_FAIL("forked process failed to exit properly"); \ + } \ + } + +T_DECL(fork, "fork latency", T_META_TYPE_PERF, T_META_CHECK_LEAKS(NO)) { + { + dt_stat_time_t s = dt_stat_time_create("time"); + FORK_MEASURE_LOOP(s); + dt_stat_finalize(s); + } + { + dt_stat_thread_cpu_time_t s = dt_stat_thread_cpu_time_create("on-cpu time"); + FORK_MEASURE_LOOP(s); + dt_stat_finalize(s); + } +} diff --git a/tools/tests/darwintests/proc_core_name_24152432.c b/tools/tests/darwintests/proc_core_name_24152432.c new file mode 100644 index 000000000..3bcc8d810 --- /dev/null +++ b/tools/tests/darwintests/proc_core_name_24152432.c @@ -0,0 +1,98 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define BUFFLEN 2048 +#define EVILLEN 19 + +static const char corefile_ctl[] = "kern.corefile"; +static const char coredump_ctl[] = "kern.coredump"; +/* The default coredump location if the kern.coredump ctl is invalid */ +static const char default_dump_fmt[] = "/cores/core.%d"; +/* The coredump location when we set kern.coredump ctl to something valid */ +static const char valid_dump_fmt[] = "/cores/test-core.%d"; + +/* /cores/core.%(null), then BORK immediately after. */ +static char evil[] = {'/', 'c', 'o', 'r', 'e', 's', '/', 'c', 'o', 'r', 'e', '.', '%', '\0', 'B', 'O', 'R', 'K', '\0'}; +/* A valid coredump location to test. 
*/ +static char valid_dump_loc[] = "/cores/test-core.%P"; + +static const struct rlimit lim_infty = { + RLIM_INFINITY, + RLIM_INFINITY +}; + +#if TARGET_OS_OSX +static int fork_and_wait_for_segfault(void); + +static int fork_and_wait_for_segfault() { + int pid, ret; + pid = fork(); + if (pid == 0) { + unsigned int *ptr = NULL; /* Cause a segfault so that we get a coredump */ + *ptr = 0xdeadd00d; + T_FAIL("Expected segmentation fault on write to NULL pointer"); + } + T_ASSERT_TRUE(pid != -1, "Checking fork success in parent"); + + ret = wait(NULL); + T_ASSERT_TRUE(ret != -1, "Waited for child to segfault and dump core"); + return pid; +} +#endif + +T_DECL( + proc_core_name_24152432, + "Tests behavior of core dump when kern.corefile ends in %, e.g., /cores/core.%", + T_META("owner", "Core Kernel Team"), + T_META_ASROOT(YES)) +{ +#if TARGET_OS_OSX + int ret, pid; + int enable_core_dump = 1; + char buf[BUFFLEN]; + memset(buf, 0, BUFFLEN); + size_t oldlen = BUFFLEN; + + ret = sysctlbyname(coredump_ctl, buf, &oldlen, &enable_core_dump, sizeof(int)); + T_ASSERT_POSIX_SUCCESS(ret, "sysctl: enable core dumps"); + memset(buf, 0, BUFFLEN); + oldlen = BUFFLEN; + + ret = setrlimit(RLIMIT_CORE, &lim_infty); + T_ASSERT_POSIX_SUCCESS(ret, "setrlimit: remove limit on maximum coredump size"); + + ret = sysctlbyname(corefile_ctl, buf, &oldlen, evil, EVILLEN); + T_ASSERT_POSIX_SUCCESS(ret, "sysctl: set bad core dump location, old value was %s", buf); + memset(buf, 0, BUFFLEN); + oldlen = BUFFLEN; + + pid = fork_and_wait_for_segfault(); + + snprintf(buf, BUFFLEN, default_dump_fmt, pid); + ret = remove(buf); + T_ASSERT_TRUE(ret != -1, "Removing coredump file (should be in fallback location)"); + memset(buf, 0, BUFFLEN); + + ret = sysctlbyname(corefile_ctl, buf, &oldlen, valid_dump_loc, strlen(valid_dump_loc)); + T_ASSERT_POSIX_SUCCESS(ret, "sysctl: set valid core dump location, old value was %s", buf); + memset(buf, 0, BUFFLEN); + + pid = fork_and_wait_for_segfault(); + + snprintf(buf, 
BUFFLEN, valid_dump_fmt, pid); + ret = remove(buf); + T_ASSERT_TRUE(ret != -1, "Removing coredump file (should be in valid location)"); +#else + T_LOG("proc_core_name appears in OS X only, skipping test."); +#endif + T_PASS("proc_core_name_24152432 PASSED"); +} diff --git a/tools/tests/darwintests/proc_uuid_policy_26567533.c b/tools/tests/darwintests/proc_uuid_policy_26567533.c new file mode 100644 index 000000000..9bc57437d --- /dev/null +++ b/tools/tests/darwintests/proc_uuid_policy_26567533.c @@ -0,0 +1,43 @@ +#include +#include +#include +#include + +#define NUM_PROC_UUID_POLICY_FLAGS 4 + +T_DECL(proc_uuid_policy_26567533, "Tests passing a NULL uuid in (uap->uuid).", T_META("owner", "Core Kernel Team")) +{ + int i, ret; + uuid_t null_uuid; + memset(null_uuid, 0, sizeof(uuid_t)); + + uint32_t policy_flags[] = { + PROC_UUID_POLICY_FLAGS_NONE, + PROC_UUID_NO_CELLULAR, + PROC_UUID_NECP_APP_POLICY, + PROC_UUID_ALT_DYLD_POLICY + }; + + for (i = 0; i < NUM_PROC_UUID_POLICY_FLAGS; i++) { + T_LOG("Testing policy add with flag value 0x%x", policy_flags[i]); + + /* Since UUID is null, this call should fail with errno = EINVAL. */ + ret = proc_uuid_policy(PROC_UUID_POLICY_OPERATION_ADD, null_uuid, sizeof(uuid_t), policy_flags[i]); + + T_ASSERT_TRUE(ret == -1, "proc_uuid_policy returned %d", ret); + T_WITH_ERRNO; + T_ASSERT_TRUE(errno == EINVAL, "errno is %d", errno); /* compare with '=='; the former '=' overwrote errno and always passed */ + } + + for (i = 0; i < NUM_PROC_UUID_POLICY_FLAGS; i++) { + T_LOG("Testing policy remove with flag value 0x%x", policy_flags[i]); + + /* Since UUID is null, this call should fail with errno = EINVAL. 
*/ + ret = proc_uuid_policy(PROC_UUID_POLICY_OPERATION_REMOVE, null_uuid, sizeof(uuid_t), policy_flags[i]); + + T_ASSERT_TRUE(ret == -1, "proc_uuid_policy returned %d", ret); + T_WITH_ERRNO; + T_ASSERT_TRUE(errno == EINVAL, "errno is %d", errno); /* compare with '=='; the former '=' overwrote errno and always passed */ + } + T_PASS("proc_uuid_policy_26567533 PASSED"); +} diff --git a/tools/tests/darwintests/socket_poll_close_25786011.c b/tools/tests/darwintests/socket_poll_close_25786011.c new file mode 100644 index 000000000..ef6ddd24c --- /dev/null +++ b/tools/tests/darwintests/socket_poll_close_25786011.c @@ -0,0 +1,35 @@ +#include +#include +#include +#include + +T_DECL(socket_poll_close_25786011, "Tests an invalid poll call to a socket and then calling close.", T_META("owner", "Core Kernel Team")) +{ + int my_socket, ret; + + my_socket = socket(PF_LOCAL, SOCK_STREAM, 0); + T_WITH_ERRNO; T_ASSERT_TRUE(my_socket >= 0, "create socket"); /* 0 is a valid descriptor; socket() signals failure only with -1 */ + + /* + * Setup a pollfd that we know will return an error when we try + * to create a knote for it. We specify a BSD vnode specific event + * for a socket. + */ + struct pollfd my_pollfd = { + .fd = my_socket, + .events = POLLEXTEND + }; + + /* + * Previously the call to kevent_register() in the kernel from this call + * would leak an iocount reference on the fileproc, which would cause any + * subsequent calls to close() on the associated fd to block indefinitely. 
+ */ + ret = poll(&my_pollfd, 1, 0); + T_WITH_ERRNO; T_ASSERT_TRUE(ret == 1, "poll returned %d", ret); + + ret = close(my_socket); + T_ASSERT_POSIX_ZERO(ret, "close on socket with fd %d\n", my_socket); + + T_PASS("socket_poll_close_25786011 PASSED"); +} diff --git a/tools/tests/darwintests/stackshot_idle_25570396.m b/tools/tests/darwintests/stackshot_idle_25570396.m new file mode 100644 index 000000000..471dcb034 --- /dev/null +++ b/tools/tests/darwintests/stackshot_idle_25570396.m @@ -0,0 +1,264 @@ +/* This program tests that kThreadIdleWorker is being set properly, so + * that idle and active threads can be appropriately identified. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define NUMRETRIES 5 // number of times to retry a stackshot +#define NUMENQUEUES 16 // number of blocking jobs to enqueue +#define NUMTHREADS (NUMENQUEUES + 2) // total number of threads (including numenqueues) + +volatile static int spin_threads = 1; + +static void * +take_stackshot(uint32_t extra_flags, uint64_t since_timestamp) +{ + void * stackshot; + int ret, retries; + uint32_t stackshot_flags = STACKSHOT_SAVE_LOADINFO | + STACKSHOT_GET_GLOBAL_MEM_STATS | + STACKSHOT_SAVE_IMP_DONATION_PIDS | + STACKSHOT_KCDATA_FORMAT; + + if (since_timestamp != 0) + stackshot_flags |= STACKSHOT_COLLECT_DELTA_SNAPSHOT; + + stackshot_flags |= extra_flags; + + stackshot = stackshot_config_create(); + T_ASSERT_NOTNULL(stackshot, "Allocating stackshot config"); + + ret = stackshot_config_set_flags(stackshot, stackshot_flags); + T_ASSERT_POSIX_ZERO(ret, "Setting flags on stackshot config"); + + ret = stackshot_config_set_pid(stackshot, getpid()); + T_ASSERT_POSIX_ZERO(ret, "Setting target pid on stackshot config"); + + if (since_timestamp != 0) { + ret = stackshot_config_set_delta_timestamp(stackshot, since_timestamp); + T_ASSERT_POSIX_ZERO(ret, "Setting prev snapshot time on stackshot config"); + } + + for (retries = NUMRETRIES; retries > 0; retries--) { /* retries counts down from NUMRETRIES to 1 inside the body */ + ret = stackshot_capture_with_config(stackshot); + T_ASSERT_TRUE(ret == 0 || ret == EBUSY || ret == ETIMEDOUT, "Attempting to take stackshot (error %d)...", ret); + if (retries == 1 && (ret == EBUSY || ret == ETIMEDOUT)) /* 1 is the last attempt; the former '== 0' test was unreachable */ + T_ASSERT_FAIL("Failed to take stackshot after %d retries: %s", NUMRETRIES, strerror(ret)); + if (ret == 0) + break; + } + return stackshot; +} + +static uint64_t get_stackshot_timestamp(void * stackshot) +{ + kcdata_iter_t iter; + 
void * buf; + uint64_t default_time = 0; + uint32_t t, buflen; + + buf = stackshot_config_get_stackshot_buffer(stackshot); +
T_ASSERT_NOTNULL(buf, "Getting stackshot buffer"); + buflen = stackshot_config_get_stackshot_size(stackshot); + + iter = kcdata_iter(buf, buflen); + t = kcdata_iter_type(iter); + + T_ASSERT_TRUE(t == KCDATA_BUFFER_BEGIN_STACKSHOT || t == KCDATA_BUFFER_BEGIN_DELTA_STACKSHOT, + "Making sure stackshot data begins with \"begin\" flag"); + T_ASSERT_TRUE(kcdata_iter_valid(iter = kcdata_iter_find_type(iter, KCDATA_TYPE_MACH_ABSOLUTE_TIME)), + "Getting stackshot timestamp"); + default_time = *(uint64_t *)kcdata_iter_payload(iter); + return default_time; +} + +static void +get_thread_statuses(void * stackshot, int * num_idles, int * num_nonidles) +{ + void *buf; + uint32_t t, buflen; + uint64_t thread_snap_flags; + NSError *error = nil; + NSMutableDictionary *parsed_container, *parsed_threads; + + *num_idles = 0; + *num_nonidles = 0; + + buf = stackshot_config_get_stackshot_buffer(stackshot); + T_ASSERT_NOTNULL(buf, "Getting stackshot buffer"); + buflen = stackshot_config_get_stackshot_size(stackshot); + + kcdata_iter_t iter = kcdata_iter(buf, buflen); + T_ASSERT_TRUE(kcdata_iter_type(iter) == KCDATA_BUFFER_BEGIN_STACKSHOT || + kcdata_iter_type(iter) == KCDATA_BUFFER_BEGIN_DELTA_STACKSHOT, + "Checking start of stackshot buffer"); + + iter = kcdata_iter_next(iter); + KCDATA_ITER_FOREACH(iter) + { + t = kcdata_iter_type(iter); + + if (t != KCDATA_TYPE_CONTAINER_BEGIN) { + continue; + } + + if (kcdata_iter_container_type(iter) != STACKSHOT_KCCONTAINER_TASK) { + continue; + } + + parsed_container = parseKCDataContainer(&iter, &error); + T_ASSERT_TRUE(parsed_container && !error, "Parsing container"); + + parsed_threads = parsed_container[@"task_snapshots"][@"thread_snapshots"]; + for (id th_key in parsed_threads) { + /* check to see that tid matches expected idle status */ + thread_snap_flags = [parsed_threads[th_key][@"thread_snapshot"][@"ths_ss_flags"] unsignedLongLongValue]; + (thread_snap_flags & kThreadIdleWorker) ? 
(*num_idles)++ : (*num_nonidles)++; + } + [parsed_container release]; + } + +} + +/* Dispatch NUMENQUEUES jobs to a concurrent queue that immediately wait on a + * shared semaphore. This should spin up plenty of threads! */ +static void +warm_up_threadpool(dispatch_queue_t q) +{ + int i; + dispatch_semaphore_t thread_wait = dispatch_semaphore_create(0); + T_QUIET; T_ASSERT_NOTNULL(thread_wait, "Initializing work queue semaphore"); + dispatch_semaphore_t main_wait = dispatch_semaphore_create(0); + T_QUIET; T_ASSERT_NOTNULL(main_wait, "Initializing main thread semaphore"); + + for (i = 0; i < NUMENQUEUES; i++) { + dispatch_async(q, ^{ + dispatch_semaphore_wait(thread_wait, DISPATCH_TIME_FOREVER); + dispatch_semaphore_signal(main_wait); + }); + } + + sleep(1); // give worker threads enough time to block + + for (i = 0; i < NUMENQUEUES; i++) { + dispatch_semaphore_signal(thread_wait); + dispatch_semaphore_wait(main_wait, DISPATCH_TIME_FOREVER); + } + + dispatch_release(thread_wait); + dispatch_release(main_wait); + + // Give enough time for worker threads to go idle again + sleep(1); +} + +/* Dispatch NUMENQUEUES jobs to a concurrent queue that spin in a tight loop. + * Isn't guaranteed to occupy every worker thread, but it's enough so + * that a thread will go from idle to nonidle. + */ +static void +fill_threadpool_with_spinning(dispatch_queue_t q) +{ + int i; + for (i = 0; i < NUMENQUEUES; i++) { + dispatch_async(q, ^{ + while(spin_threads); // should now appear as non-idle in delta shot + }); + } + sleep(1); // wait for jobs to enqueue +} + +/* Take stackshot, count the number of idle and nonidle threads the stackshot records. + * Where this is called, there should be NUMENQUEUES idle threads (thanks to warm_up_threadpool) + * and 2 nonidle threads (the main thread, and the spinning pthread). 
+ */ +static void +take_and_verify_initial_stackshot(uint64_t * since_time) +{ + void *stackshot; + int num_init_idle_threads, num_init_nonidle_threads; + + stackshot = take_stackshot(0, 0); + *since_time = get_stackshot_timestamp(stackshot); + get_thread_statuses(stackshot, &num_init_idle_threads, &num_init_nonidle_threads); + + T_EXPECT_EQ(num_init_idle_threads, NUMENQUEUES, + "Idle count of %d should match expected value of %d...", + num_init_idle_threads, NUMENQUEUES); + T_EXPECT_EQ(num_init_nonidle_threads, NUMTHREADS - NUMENQUEUES, + "Non-idle count of %d should match expected value of %d...", + num_init_nonidle_threads, NUMTHREADS - NUMENQUEUES); + stackshot_config_dealloc(stackshot); +} + +/* Take a stackshot and a delta stackshot, measuring what changed since the previous + * stackshot. Where this is called, the blocking jobs have been cleared from the work queue, + * and the work queue has NUMENQUEUES tight-spinning jobs on it. Make sure that + * no new idle threads appear in the delta, and make sure that the delta shot isn't + * ignoring the worker threads that have become active. 
+ */ +static void +take_and_verify_delta_stackshot(uint64_t since_time) +{ + void *stackshot; + void *delta_stackshot; + + int num_delta_idles, num_delta_nonidles, num_curr_idles, num_curr_nonidles; + + stackshot = take_stackshot(0, 0); + delta_stackshot = take_stackshot(0, since_time); /* Threads should appear in delta stackshot as non-idle */ + + get_thread_statuses(stackshot, &num_curr_idles, &num_curr_nonidles); + get_thread_statuses(delta_stackshot, &num_delta_idles, &num_delta_nonidles); + + T_EXPECT_EQ(num_delta_idles, 0, "Making sure there are no idles in delta shot"); + T_EXPECT_EQ(num_delta_nonidles + num_curr_idles, NUMTHREADS, + "Making sure delta shot isn't ignoring newly active threads"); + stackshot_config_dealloc(stackshot); + stackshot_config_dealloc(delta_stackshot); +} + +static void * +spinning_non_work_queue_thread(void * ignored) +{ + (void)ignored; + while(spin_threads); + return NULL; +} + +T_DECL(stackshot_idle_25570396, "Tests that stackshot can properly recognize idle and non-idle threads", T_META("owner", "Core Kernel Team")) +{ + int ret; + uint64_t initial_stackshot_time; + pthread_t spinning_thread; + dispatch_queue_t q; + + ret = pthread_create(&spinning_thread, NULL, spinning_non_work_queue_thread, NULL); + T_ASSERT_POSIX_ZERO(ret, "Spinning up non-work-queue thread"); + + q = dispatch_queue_create("com.apple.kernel.test.waiting_semaphores", DISPATCH_QUEUE_CONCURRENT); + + warm_up_threadpool(q); + take_and_verify_initial_stackshot(&initial_stackshot_time); + + fill_threadpool_with_spinning(q); + take_and_verify_delta_stackshot(initial_stackshot_time); + + spin_threads = 0; /* pthread-made thread should now exit */ + ret = pthread_join(spinning_thread, NULL); + T_ASSERT_POSIX_ZERO(ret, "Joining on non-work-queue thread"); +} diff --git a/tools/tests/darwintests/workq_sigprof.c b/tools/tests/darwintests/workq_sigprof.c new file mode 100644 index 000000000..6ea38a8c9 --- /dev/null +++ b/tools/tests/darwintests/workq_sigprof.c @@ -0,0 
+1,70 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#if !TARGET_OS_IPHONE + +static pthread_t workq_thread; +static bool signal_received; + +static void signal_handler(int sig __unused, siginfo_t *b __unused, void* unused __unused) { + if (pthread_self() == workq_thread) { + signal_received = true; + } +} + +static void workq_block(void *unused __unused) { + workq_thread = pthread_self(); + + /* + sigset_t set; + sigemptyset(&set); + sigaddset(&set, SIGPROF); + pthread_sigmask(SIG_UNBLOCK, &set, NULL); + */ + + uint64_t spin_start = mach_absolute_time(); + while (mach_absolute_time() - spin_start < 30 * NSEC_PER_SEC) + if (signal_received) { + T_PASS("Got SIGPROF!"); + T_END; + } + } + +T_DECL(workq_sigprof, "test that workqueue threads can receive sigprof") +{ + struct sigaction sa = { + .sa_sigaction = signal_handler + }; + sigfillset(&sa.sa_mask); + T_ASSERT_POSIX_ZERO(sigaction(SIGPROF, &sa, NULL), NULL); + + dispatch_queue_t q = dispatch_get_global_queue(0, 0); + dispatch_async_f(q, NULL, workq_block); + + struct itimerval timerval = { + .it_interval = {.tv_usec = 10000}, + .it_value = {.tv_usec = 10000} + }; + T_ASSERT_POSIX_ZERO(setitimer(ITIMER_PROF, &timerval, NULL), NULL); + + dispatch_main(); +} + +#else //!TARGET_OS_IPHONE + +T_DECL(workq_sigprof, "test that workqueue threads can receive sigprof") +{ + T_EXPECTFAIL; + T_FAIL(" setitimer/sigprof doesn't seem to be delivered on embeded platforms"); +} + +#endif //!TARGET_OS_IPHONE diff --git a/tools/tests/jitter/timer_jitter.c b/tools/tests/jitter/timer_jitter.c index abcfe87b8..7e0c9a0c1 100644 --- a/tools/tests/jitter/timer_jitter.c +++ b/tools/tests/jitter/timer_jitter.c @@ -47,6 +47,8 @@ #include #include +#include + typedef enum my_policy_type { MY_POLICY_REALTIME, MY_POLICY_TIMESHARE, MY_POLICY_FIXEDPRI } my_policy_type_t; #define DEFAULT_MAX_SLEEP_NS 2000000000ll /* Two seconds */ @@ -354,6 +356,18 @@ main(int argc, char **argv) exit(1); 
} + /* + * Disable the wake monitor. If we are + * performing a large number of + * iterations, the wake monitor may + * cause this process to get suspended, + * thus causing a large jitter value. + */ + if (proc_disable_wakemon(getpid()) != KERN_SUCCESS) { + printf("Couldn't disable wake monitor.\n"); + /* For now, do not exit; this call could be locked down */ + } + /* * Repeatedly pick a random timer length and * try to sleep exactly that long diff --git a/tools/tests/kqueue_tests/Makefile b/tools/tests/kqueue_tests/Makefile index d8ffd2c16..0a3c7daa1 100755 --- a/tools/tests/kqueue_tests/Makefile +++ b/tools/tests/kqueue_tests/Makefile @@ -17,15 +17,15 @@ CFLAGS :=-g $(patsubst %, -arch %,$(ARCHS)) -isysroot $(SDKROOT) DSTROOT?=$(shell /bin/pwd) SYMROOT?=$(shell /bin/pwd) -all: $(addprefix $(DSTROOT)/, file timer) +all: $(addprefix $(DSTROOT)/, file_tests timer_tests) -$(DSTROOT)/file: +$(DSTROOT)/file_tests: kqueue_file_tests.c $(CC) $(CFLAGS) -o $(SYMROOT)/file_tests kqueue_file_tests.c - if [ ! -e $(DSTROOT)/file_tests ]; then ditto $(SYMROOT)/file_tests $(DSTROOT)/file_tests; fi + ditto $(SYMROOT)/file_tests $(DSTROOT)/file_tests -$(DSTROOT)/timer: +$(DSTROOT)/timer_tests: kqueue_timer_tests.c $(CC) $(CFLAGS) -o $(SYMROOT)/timer_tests kqueue_timer_tests.c - if [ ! -e $(DSTROOT)/timer_tests ]; then ditto $(SYMROOT)/timer_tests $(DSTROOT)/timer_tests; fi + ditto $(SYMROOT)/timer_tests $(DSTROOT)/timer_tests clean: rm -rf $(DSTROOT)/file_tests $(DSTROOT)/timer_tests $(SYMROOT)/*.dSYM $(SYMROOT)/file_tests $(SYMROOT)/timer_tests diff --git a/tools/tests/kqueue_tests/kqueue_file_tests.c b/tools/tests/kqueue_tests/kqueue_file_tests.c index a4461b1e7..9602fc861 100644 --- a/tools/tests/kqueue_tests/kqueue_file_tests.c +++ b/tools/tests/kqueue_tests/kqueue_file_tests.c @@ -14,12 +14,13 @@ #include #include #include +#include -#define DIR1 "dir1" +#define DIR1 "/tmp/dir1" #define DOTDOT ".." 
-#define DIR2 "dir2" -#define FILE1 "file1" -#define FILE2 "file2" +#define DIR2 "/tmp/dir2" +#define FILE1 "/tmp/file1" +#define FILE2 "/tmp/file2" #define KEY "somekey" #define VAL "someval" @@ -30,7 +31,7 @@ #define YES_EVENT 1 -#define OUTPUT_LEVEL 2 +#define OUTPUT_LEVEL 0 #define RESULT_LEVEL 3 #define TEST_STRING "Some text!!! Yes indeed, some of that very structure which has passed on man's knowledge for generations." @@ -40,12 +41,19 @@ #define LENGTHEN_SIZE 500 #define FIFO_SPACE 8192 /* FIFOS have 8K of buffer space */ +/* + * These two variables are the non local memory for holding the return + * values from functions with which pthread_create is called. + */ +int thread_status; +int fifo_read_fd; + /* * Types of actions for setup, cleanup, and execution of tests */ typedef enum {CREAT, MKDIR, READ, WRITE, WRITEFD, FILLFD, UNLINK, LSKEE, RMDIR, MKFIFO, LENGTHEN, TRUNC, SYMLINK, CHMOD, CHOWN, EXCHANGEDATA, RENAME, LSEEK, OPEN, MMAP, NOTHING, - SETXATTR, UTIMES, STAT, HARDLINK, REVOKE} action_id_t; + SETXATTR, UTIMES, STAT, HARDLINK, REVOKE, FUNLOCK} action_id_t; /* * Directs an action as mentioned above @@ -109,7 +117,9 @@ void LOG(int level, FILE *f, const char *fmt, ...) { if (level >= OUTPUT_LEVEL) { /* Indent for ease of reading */ if (level < RESULT_LEVEL) { - fprintf(f, "\t"); + for (int i = RESULT_LEVEL - level; i>0; i--) { + fprintf(f, "\t"); + } } vfprintf(f, fmt, ap); } @@ -117,6 +127,67 @@ void LOG(int level, FILE *f, const char *fmt, ...) 
{ va_end(ap); } +char * +get_action_name(action_id_t a) +{ + switch (a) { + case CREAT: + return "CREAT"; + case MKDIR: + return "MKDIR"; + case READ: + return "READ"; + case WRITE: + return "WRITE"; + case WRITEFD: + return "WRITEFD"; + case FILLFD: + return "FILLFD"; + case UNLINK: + return "UNLINK"; + case LSKEE: + return "LSKEE"; + case RMDIR: + return "RMDIR"; + case MKFIFO: + return "MKFIFO"; + case LENGTHEN: + return "LENGTHEN"; + case TRUNC: + return "TRUNC"; + case SYMLINK: + return "SYMLINK"; + case CHMOD: + return "CHMOD"; + case CHOWN: + return "CHOWN"; + case EXCHANGEDATA: + return "EXCHANGEDATA"; + case RENAME: + return "RENAME"; + case LSEEK: + return "LSEEK"; + case OPEN: + return "OPEN"; + case MMAP: + return "MMAP"; + case NOTHING: + return "NOTHING"; + case SETXATTR: + return "SETXATTR"; + case UTIMES: + return "UTIMES"; + case STAT: + return "STAT"; + case HARDLINK: + return "HARDLINK"; + case REVOKE: + return "REVOKE"; + case FUNLOCK: + return "FUNLOCK"; + } + return "Unknown"; +} /* * Initialize an action struct. Whether to sleep, what action to take, * and arguments for that action. @@ -142,10 +213,11 @@ init_action(action_t *act, int sleep, action_id_t call, int nargs, ...) 
/* * Opening a fifo is complicated: need to open both sides at once */ -void* +void * open_fifo_readside(void *arg) { - return (void*)open((char*)arg, O_RDONLY); + fifo_read_fd = open((char*)arg, O_RDONLY); + return (&fifo_read_fd); } /* @@ -158,8 +230,9 @@ open_fifo(const char *path, int *readfd, int *writefd) pthread_t thread; int waitres; int res; - int tmpreadfd, tmpwritefd; + int *tmpreadfd, tmpwritefd; + fifo_read_fd = -1; res = pthread_create(&thread, 0, open_fifo_readside, (void*)path); if (res == 0) { tmpwritefd = open(path, O_WRONLY); @@ -167,8 +240,8 @@ open_fifo(const char *path, int *readfd, int *writefd) fcntl(tmpwritefd, F_SETFL, O_WRONLY | O_NONBLOCK); - if ((waitres == 0) && (tmpwritefd >= 0) && (tmpreadfd >= 0)) { - *readfd = tmpreadfd; + if ((waitres == 0) && (tmpwritefd >= 0) && (*tmpreadfd >= 0)) { + *readfd = *tmpreadfd; *writefd = tmpwritefd; } else { res = -1; @@ -215,7 +288,7 @@ execute_action(void *actionptr) struct timeval tv; struct stat sstat; - LOG(1, stderr, "Beginning action of type %d\n", act->act_id); + LOG(1, stderr, "Beginning action of type %d: %s\n", act->act_id, get_action_name(act->act_id)); /* Let other thread get into kevent() sleep */ if(SLEEP == act->act_dosleep) { @@ -342,13 +415,22 @@ execute_action(void *actionptr) res = revoke((char*)args[0]); close(tmpfd); break; + case FUNLOCK: + tmpfd = open((char*)args[0], O_RDONLY); + if (tmpfd != -1) { + res = flock(tmpfd, LOCK_EX); + if (res != -1) + res = flock(tmpfd, LOCK_UN); + (void)close(tmpfd); + } + break; default: res = -1; break; } - - return (void*)res; - + + thread_status = res; + return (&thread_status); } /* @@ -371,9 +453,9 @@ execute_action_list(action_t *actions, int nactions, int failout) int i, res; for (i = 0, res = 0; (0 == res || (!failout)) && (i < nactions); i++) { LOG(1, stderr, "Starting prep action %d\n", i); - res = (int) execute_action(&(actions[i])); + res = *((int *) execute_action(&(actions[i]))); if(res != 0) { - LOG(2, stderr, "Action list 
failed on step %d.\n", i); + LOG(2, stderr, "Action list failed on step %d. res = %d\n", i, res); } else { LOG(1, stderr, "Action list work succeeded on step %d.\n", i); } @@ -388,12 +470,13 @@ execute_action_list(action_t *actions, int nactions, int failout) int execute_test(test_t *test) { - int i, kqfd, filefd = -1, res2, res, cnt, status, writefd = -1; + int i, kqfd, filefd = -1, res2, res, cnt, writefd = -1; int retval = -1; pthread_t thr; struct kevent evlist; struct timespec ts = {WAIT_TIME, 0l}; - + int *status; + memset(&evlist, 0, sizeof(evlist)); LOG(1, stderr, "Test %s starting.\n", test->t_testname); @@ -434,10 +517,11 @@ execute_test(test_t *test) action_t dowr; init_action(&dowr, NOSLEEP, WRITEFD, 0); dowr.act_fd = writefd; - execute_action(&dowr); + (void)execute_action(&dowr); } /* Helper modifies the file that we're listening on (sleeps first, in general) */ + thread_status = 0; res = pthread_create(&thr, NULL, execute_action, (void*) &test->t_helpthreadact); if (0 == res) { LOG(1, stderr, "Created helper thread.\n"); @@ -475,14 +559,14 @@ execute_test(test_t *test) } /* Success only if you've succeeded to this point AND joined AND other thread is happy*/ - status = 0; - res2 = pthread_join(thr, (void**)&status); + status = NULL; + res2 = pthread_join(thr, (void **)&status); if (res2 < 0) { LOG(2, stderr, "Couldn't join helper thread.\n"); - } else if (status) { - LOG(2, stderr, "Helper action had result %d\n", (int)status); + } else if (*status) { + LOG(2, stderr, "Helper action had result %d\n", *status); } - res = ((res == 0) && (res2 == 0) && (status == 0)) ? 0 : -1; + res = ((res == 0) && (res2 == 0) && (*status == 0)) ? 0 : -1; } else { LOG(2, stderr, "Couldn't start thread.\n"); } @@ -524,11 +608,12 @@ execute_test(test_t *test) retval = -1; } } else { - LOG(2, stderr, "Failed to execute test.\n"); + LOG(2, stderr, "Failed to execute test. 
res = %d\n", res); retval = -1; } LOG(3, stdout, "Test %s done with result %d.\n", test->t_testname, retval); + return (retval); } void @@ -1558,7 +1643,18 @@ run_poll_tests() execute_test(&test); } -void +void +run_note_funlock_tests() +{ + test_t test; + init_test(&test, "11.1.1: unlock file", FILE1, 1, 1, NOTE_FUNLOCK, YES_EVENT); + init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void *)NULL); + init_action(&test.t_helpthreadact, SLEEP, FUNLOCK, 2, (void*)FILE1, (void *)NULL); + init_action(&(test.t_cleanup_actions[0]), NOSLEEP, UNLINK, 2, (void*)FILE1, (void *)NULL); + execute_test(&test); +} + +void run_all_tests() { run_note_delete_tests(); @@ -1573,6 +1669,7 @@ run_all_tests() run_evfilt_read_tests(); run_evfilt_write_tests(); run_poll_tests(); + run_note_funlock_tests(); } int @@ -1605,9 +1702,12 @@ main(int argc, char **argv) run_evfilt_write_tests(); else if (strcmp(which, "poll") == 0) run_poll_tests(); + else if (strcmp(which, "funlock") == 0) + run_note_funlock_tests(); else { - fprintf(stderr, "Valid options are:\n\tdelete, write, extend," - "attrib, link, rename, revoke, evfiltread, fifo, all, evfiltwrite\n"); + fprintf(stderr, "Valid options are:\n\tdelete, write, extend, " + "attrib, link, rename, revoke, evfiltread, " + "fifo, all, evfiltwrite, funlock\n"); exit(1); } return 0; diff --git a/tools/tests/libMicro/Makefile b/tools/tests/libMicro/Makefile index d9dad443e..3eecede2f 100644 --- a/tools/tests/libMicro/Makefile +++ b/tools/tests/libMicro/Makefile @@ -35,12 +35,12 @@ ARCH = i386 BINS= $(ALL:%=bin-$(ARCH)/%) bin-$(ARCH)/tattle -# TARBALL_CONTENTS = \ +# TARBALL_CONTENTS = \ Makefile.benchmarks \ - Makefile.SunOS \ - Makefile.Linux \ - Makefile.Aix \ - Makefile.com \ + Makefile.SunOS \ + Makefile.Linux \ + Makefile.Aix \ + Makefile.com \ Makefile \ $(ALL:%=%.c) \ elided.c \ @@ -49,7 +49,7 @@ BINS= $(ALL:%=bin-$(ARCH)/%) bin-$(ARCH)/tattle libmicro_main.c \ libmicro.h \ recurse2.c \ - benchmark_finibatch.c \ + 
benchmark_finibatch.c \ benchmark_initbatch.c \ benchmark_optswitch.c \ benchmark_fini.c \ @@ -73,7 +73,7 @@ BINS= $(ALL:%=bin-$(ARCH)/%) bin-$(ARCH)/tattle README ifeq "$(Embedded)" "YES" -SEMOP_FLAG= +SEMOP_FLAG= endif default $(ALL) run cstyle lint tattle: $(BINS) @@ -95,7 +95,7 @@ default $(ALL) run cstyle lint tattle: $(BINS) fi; \ done; @echo "done" - + .PHONY: clean clean_subdirs clean_$(SUBDIRS) clean: clean_subdirs @@ -104,7 +104,7 @@ clean: clean_subdirs clean_subdirs: for dir in $(SUBDIRS); do $(MAKE) -C $$dir clean; done -bin: +bin: @mkdir -p bin-$(ARCH) $(BINS): bin @@ -115,7 +115,6 @@ $(BINS): bin # commenting the lbMicro.tar as it is not being used. # libMicro.tar: FORCE # @chmod +x ./mk_tarball wrapper -# @./mk_tarball $(TARBALL_CONTENTS) - -# FORCE: +# @./mk_tarball $(TARBALL_CONTENTS) +# FORCE: diff --git a/tools/tests/libMicro/apple/Makefile b/tools/tests/libMicro/apple/Makefile index fa9ce7009..906ef9618 100644 --- a/tools/tests/libMicro/apple/Makefile +++ b/tools/tests/libMicro/apple/Makefile @@ -28,7 +28,6 @@ # Use is subject to license terms. 
# - include Makefile.benchmarks ARCH= i386 @@ -41,9 +40,7 @@ default $(ALL): $(BINS) clean: rm -rf bin bin-* -bin: +bin: @mkdir -p ../bin-$(ARCH) $(BINS): bin - - diff --git a/tools/tests/libMicro/apple/Makefile.Darwin b/tools/tests/libMicro/apple/Makefile.Darwin index c1677177f..ff1f4a668 100644 --- a/tools/tests/libMicro/apple/Makefile.Darwin +++ b/tools/tests/libMicro/apple/Makefile.Darwin @@ -45,7 +45,7 @@ CC = $(shell xcrun -sdk "$(SDKROOT)" -find gcc) ARCH= i386 ifeq "$(strip $(ARCH))" "fat" -ARCH_FLAG= -arch i386 -arch x86_64 +ARCH_FLAG= -arch i386 -arch x86_64 else ARCH_FLAG= -arch $(ARCH) endif @@ -54,7 +54,7 @@ endif OPT_FLAG= -Os SEMOP_FLAG= -DUSE_SEMOP ifeq "$(Embedded)" "YES" -SEMOP_FLAG= +SEMOP_FLAG= endif ### diff --git a/tools/tests/libMicro/apple/Makefile.benchmarks b/tools/tests/libMicro/apple/Makefile.benchmarks index 210cf37a5..5fd65660d 100644 --- a/tools/tests/libMicro/apple/Makefile.benchmarks +++ b/tools/tests/libMicro/apple/Makefile.benchmarks @@ -28,7 +28,7 @@ Embedded=$(shell tconf --test TARGET_OS_EMBEDDED) -ALL = \ +ALL = \ create_file \ geekbench_stdlib_write \ getppid \ @@ -64,8 +64,7 @@ ALL = \ getaddrinfo_port \ getgrnam -# Compile the following test on desktop platform only +# Compile the following test on desktop platform only ifeq "$(Embedded)" "NO" ALL += od_query_create_with_node endif - diff --git a/tools/tests/libMicro/apple/Makefile.com.Darwin b/tools/tests/libMicro/apple/Makefile.com.Darwin index d16caca8b..a72a31657 100644 --- a/tools/tests/libMicro/apple/Makefile.com.Darwin +++ b/tools/tests/libMicro/apple/Makefile.com.Darwin @@ -33,7 +33,7 @@ include ../Makefile.benchmarks EXTRA_CFILES= \ - exec_bin.c \ + exec_bin.c \ elided.c \ tattle.c @@ -47,10 +47,10 @@ COMPILER_VERSION_CMD=$(COMPILER_VERSION_CMD_$(CC)) default: $(ALL) %.o: ../%.c - $(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ + $(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ -%: %.o - $(CC) -o $(@) $(@).o $($(@)_EXTRA_DEPS) $(CFLAGS) ../../bin-$(ARCH)/libmicro.a 
$($(@)_EXTRA_LIBS) $(EXTRA_LIBS) -lpthread -lm; cp $@ ../../bin-$(ARCH)/ +%: %.o + $(CC) -o $(@) $(@).o $($(@)_EXTRA_DEPS) $(CFLAGS) ../../bin-$(ARCH)/libmicro.a $($(@)_EXTRA_LIBS) $(EXTRA_LIBS) -lpthread -lm; cp $@ ../../bin-$(ARCH)/ posix_spawn: posix_spawn_bin @@ -58,4 +58,4 @@ posix_spawn_bin: posix_spawn_bin.o $(CC) -o posix_spawn_bin $(CFLAGS) posix_spawn_bin.o od_query_create_with_node: od_query_create_with_node.o - $(CC) -o $(@) $(@).o $($(@)_EXTRA_DEPS) $(CFLAGS) ../../bin-$(ARCH)/libmicro.a $($(@)_EXTRA_LIBS) $(EXTRA_LIBS) -lpthread -lm -framework CoreFoundation -framework OpenDirectory; cp $@ ../../bin-$(ARCH)/ + $(CC) -o $(@) $(@).o $($(@)_EXTRA_DEPS) $(CFLAGS) ../../bin-$(ARCH)/libmicro.a $($(@)_EXTRA_LIBS) $(EXTRA_LIBS) -lpthread -lm -framework CoreFoundation -framework OpenDirectory; cp $@ ../../bin-$(ARCH)/ diff --git a/tools/tests/perf_index/Makefile b/tools/tests/perf_index/Makefile index ba1218867..15213c0a2 100644 --- a/tools/tests/perf_index/Makefile +++ b/tools/tests/perf_index/Makefile @@ -21,7 +21,6 @@ endif endif endif - ifeq "$(Embedded)" "YES" TARGET_NAME:=PerfIndex.bundle-ios else diff --git a/tools/tests/zero-to-n/zero-to-n.c b/tools/tests/zero-to-n/zero-to-n.c index 2f36c2635..87ce83bb7 100644 --- a/tools/tests/zero-to-n/zero-to-n.c +++ b/tools/tests/zero-to-n/zero-to-n.c @@ -53,6 +53,8 @@ #include +#include + typedef enum wake_type { WAKE_BROADCAST_ONESEM, WAKE_BROADCAST_PERTHREAD, WAKE_CHAIN, WAKE_HOP } wake_type_t; typedef enum my_policy_type { MY_POLICY_REALTIME, MY_POLICY_TIMESHARE, MY_POLICY_FIXEDPRI } my_policy_type_t; @@ -92,6 +94,13 @@ static boolean_t g_verbose = FALSE; static boolean_t g_do_affinity = FALSE; static uint64_t g_starttime_abs; static uint32_t g_iteration_sleeptime_us = 0; +static uint32_t g_priority = 0; +static uint32_t g_churn_pri = 0; +static uint32_t g_churn_count = 0; +static uint64_t g_churn_stopped_at = 0; +static boolean_t g_churn_stop = FALSE; + +static pthread_t* g_churn_threads = NULL; /* Threshold 
for dropping a 'bad run' tracepoint */ static uint64_t g_traceworthy_latency_ns = TRACEWORTHY_NANOS; @@ -105,6 +114,9 @@ static boolean_t g_do_sleep = TRUE; /* Every thread spins until all threads have checked in */ static boolean_t g_do_all_spin = FALSE; +/* Every thread backgrounds temporarily before parking */ +static boolean_t g_drop_priority = FALSE; + /* One randomly chosen thread holds up the train for a certain duration. */ static boolean_t g_do_one_long_spin = FALSE; static uint32_t g_one_long_spin_id = 0; @@ -137,6 +149,92 @@ nanos_to_abs(uint64_t ns) return (uint64_t)(ns * (((double)g_mti.denom) / ((double)g_mti.numer))); } +inline static void +yield(void) +{ +#if defined(__x86_64__) || defined(__i386__) + asm volatile("pause"); +#else +#error Unrecognized architecture +#endif +} + +static void * +churn_thread(__unused void *arg) +{ + uint64_t spin_count = 0; + + /* + * As a safety measure to avoid wedging, we will bail on the spin if + * it's been more than 1s after the most recent run start + */ + + while (g_churn_stop == FALSE && + mach_absolute_time() < (g_starttime_abs + NSEC_PER_SEC)) { + spin_count++; + yield(); + } + + /* This is totally racy, but only here to detect if anyone stops early */ + g_churn_stopped_at += spin_count; + + return NULL; +} + +static void +create_churn_threads() +{ + if (g_churn_count == 0) + g_churn_count = g_numcpus - 1; + + errno_t err; + + struct sched_param param = { .sched_priority = (int)g_churn_pri }; + pthread_attr_t attr; + + /* Array for churn threads */ + g_churn_threads = (pthread_t*) valloc(sizeof(pthread_t) * g_churn_count); + assert(g_churn_threads); + + if ((err = pthread_attr_init(&attr))) + errc(EX_OSERR, err, "pthread_attr_init"); + + if ((err = pthread_attr_setschedparam(&attr, ¶m))) + errc(EX_OSERR, err, "pthread_attr_setschedparam"); + + if ((err = pthread_attr_setschedpolicy(&attr, SCHED_RR))) + errc(EX_OSERR, err, "pthread_attr_setschedpolicy"); + + for (uint32_t i = 0 ; i < g_churn_count ; i++) { + 
pthread_t new_thread; + + if ((err = pthread_create(&new_thread, &attr, churn_thread, NULL))) + errc(EX_OSERR, err, "pthread_create"); + g_churn_threads[i] = new_thread; + } + + if ((err = pthread_attr_destroy(&attr))) + errc(EX_OSERR, err, "pthread_attr_destroy"); +} + +static void +join_churn_threads(void) +{ + if (g_churn_stopped_at != 0) + printf("Warning: Some of the churn threads may have stopped early: %lld\n", + g_churn_stopped_at); + + OSMemoryBarrier(); + + g_churn_stop = TRUE; + + /* Rejoin churn threads */ + for (uint32_t i = 0; i < g_churn_count; i++) { + errno_t err = pthread_join(g_churn_threads[i], NULL); + if (err) errc(EX_OSERR, err, "pthread_join %d", i); + } +} + /* * Figure out what thread policy to use */ @@ -183,6 +281,16 @@ thread_setup(uint32_t my_id) errno_t ret; thread_time_constraint_policy_data_t pol; + if (g_priority) { + int policy = SCHED_OTHER; + if (g_policy == MY_POLICY_FIXEDPRI) + policy = SCHED_RR; + + struct sched_param param = {.sched_priority = (int)g_priority}; + if ((ret = pthread_setschedparam(pthread_self(), policy, ¶m))) + errc(EX_OSERR, ret, "pthread_setschedparam: %d", my_id); + } + switch (g_policy) { case MY_POLICY_TIMESHARE: break; @@ -373,6 +481,12 @@ worker_thread(void *arg) debug_log("Thread %p new value is %d, iteration %d\n", pthread_self(), new, i); + if (g_drop_priority) { + /* Drop priority to BG momentarily */ + errno_t ret = setpriority(PRIO_DARWIN_THREAD, 0, PRIO_DARWIN_BG); + if (ret) errc(EX_OSERR, ret, "setpriority PRIO_DARWIN_BG"); + } + if (g_do_all_spin) { /* Everyone spins until the last thread checks in. 
*/ @@ -382,6 +496,12 @@ worker_thread(void *arg) } } + if (g_drop_priority) { + /* Restore normal priority */ + errno_t ret = setpriority(PRIO_DARWIN_THREAD, 0, 0); + if (ret) errc(EX_OSERR, ret, "setpriority 0"); + } + debug_log("Thread %p done spinning, iteration %d\n", pthread_self(), i); } @@ -565,6 +685,11 @@ main(int argc, char **argv) thread_setup(0); + g_starttime_abs = mach_absolute_time(); + + if (g_churn_pri) + create_churn_threads(); + /* Let everyone get settled */ kr = semaphore_wait(g_main_sem); mach_assert_zero(kr); @@ -640,6 +765,9 @@ main(int argc, char **argv) if (ret) errc(EX_OSERR, ret, "pthread_join %d", i); } + if (g_churn_pri) + join_churn_threads(); + compute_stats(worst_latencies_ns, g_iterations, &avg, &max, &min, &stddev); printf("Results (from a stop):\n"); printf("Max:\t\t%.2f us\n", ((float)max) / 1000.0); @@ -715,51 +843,84 @@ selfexec_with_apptype(int argc, char *argv[]) static void __attribute__((noreturn)) usage() { - errx(EX_USAGE, "Usage: zn " - " [--trace ] " - "[--spin-one] [--spin-all] [--spin-time ] [--affinity] [--no-sleep] [--verbose]"); + errx(EX_USAGE, "Usage: %s " + " \n\t\t" + "[--trace ] " + "[--verbose] [--spin-one] [--spin-all] [--spin-time ] [--affinity]\n\t\t" + "[--no-sleep] [--drop-priority] [--churn-pri ] [--churn-count ]", + getprogname()); +} + +static struct option* g_longopts; +static int option_index; + +static uint32_t +read_dec_arg() +{ + char *cp; + /* char* optarg is a magic global */ + + uint32_t arg_val = (uint32_t)strtoull(optarg, &cp, 10); + + if (cp == optarg || *cp) + errx(EX_USAGE, "arg --%s requires a decimal number, found \"%s\"", + g_longopts[option_index].name, optarg); + + return arg_val; } static void parse_args(int argc, char *argv[]) { - int ch, option_index = 0; - char *cp; + enum { + OPT_GETOPT = 0, + OPT_SPIN_TIME, + OPT_TRACE, + OPT_PRIORITY, + OPT_CHURN_PRI, + OPT_CHURN_COUNT, + }; static struct option longopts[] = { - { "spin-time", required_argument, NULL, 2 }, - { "trace", 
required_argument, NULL, 3 }, + { "spin-time", required_argument, NULL, OPT_SPIN_TIME }, + { "trace", required_argument, NULL, OPT_TRACE }, + { "priority", required_argument, NULL, OPT_PRIORITY }, + { "churn-pri", required_argument, NULL, OPT_CHURN_PRI }, + { "churn-count", required_argument, NULL, OPT_CHURN_COUNT }, { "switched_apptype", no_argument, (int*)&g_seen_apptype, TRUE }, { "spin-one", no_argument, (int*)&g_do_one_long_spin, TRUE }, { "spin-all", no_argument, (int*)&g_do_all_spin, TRUE }, { "affinity", no_argument, (int*)&g_do_affinity, TRUE }, { "no-sleep", no_argument, (int*)&g_do_sleep, FALSE }, + { "drop-priority", no_argument, (int*)&g_drop_priority, TRUE }, { "verbose", no_argument, (int*)&g_verbose, TRUE }, { "help", no_argument, NULL, 'h' }, { NULL, 0, NULL, 0 } }; + g_longopts = longopts; + int ch = 0; + while ((ch = getopt_long(argc, argv, "h", longopts, &option_index)) != -1) { switch (ch) { - case 0: + case OPT_GETOPT: /* getopt_long set a variable */ break; - case 2: - /* spin-time */ + case OPT_SPIN_TIME: g_do_each_spin = TRUE; - g_each_spin_duration_ns = strtoull(optarg, &cp, 10); - - if (cp == optarg || *cp) - errx(EX_USAGE, "arg --%s requires a decimal number, found \"%s\"", - longopts[option_index].name, optarg); + g_each_spin_duration_ns = read_dec_arg(); break; - case 3: - /* trace */ - g_traceworthy_latency_ns = strtoull(optarg, &cp, 10); - - if (cp == optarg || *cp) - errx(EX_USAGE, "arg --%s requires a decimal number, found \"%s\"", - longopts[option_index].name, optarg); + case OPT_TRACE: + g_traceworthy_latency_ns = read_dec_arg(); + break; + case OPT_PRIORITY: + g_priority = read_dec_arg(); + break; + case OPT_CHURN_PRI: + g_churn_pri = read_dec_arg(); + break; + case OPT_CHURN_COUNT: + g_churn_count = read_dec_arg(); break; case '?': case 'h': @@ -787,6 +948,8 @@ parse_args(int argc, char *argv[]) usage(); } + char *cp; + /* How many threads? 
*/ g_numthreads = (uint32_t)strtoull(argv[0], &cp, 10); diff --git a/tools/trace/ios_trace_ipc.sh b/tools/trace/ios_trace_ipc.sh new file mode 100755 index 000000000..90c17056f --- /dev/null +++ b/tools/trace/ios_trace_ipc.sh @@ -0,0 +1,24 @@ +#!/bin/sh +# +# Initiate tracing +CODE_MACH_KMSG_INFO=0x1200028 +CODE_MACH_PROC_EXEC=0x401000C +CODE_MACH_MSG_SEND=0x120000C +CODE_MACH_MSG_RECV=0x1200010 +CODE_TRACE_DATA_EXEC=0x7000008 + +ofile=${1:-ipc.raw} +sleepsec=${2:-3} + +trace -i -b 8192 +trace -n +trace -g +if [ $sleepsec -gt 0 ]; then + echo "" + echo "Sleeping for ${sleepsec}..." + sleep ${sleepsec} +fi +echo "Tracing!" + +ps -Ac | sed 's,\s*\([0-9][0-9]*\) .*[0-9]*:[0-9]*\.[0-9]* \(.*\), 00000000.0 0.0(0.0) proc_exec \1 0 0 0 0 0 \2,' > "ps_${ofile}.txt" +trace -L ${ofile} -k ${CODE_MACH_KMSG_INFO} -k ${CODE_MACH_PROC_EXEC} -k ${CODE_MACH_MSG_SEND} -k ${CODE_MACH_MSG_RECV} diff --git a/tools/trace/parse_ipc_trace.py b/tools/trace/parse_ipc_trace.py new file mode 100644 index 000000000..19d9a1401 --- /dev/null +++ b/tools/trace/parse_ipc_trace.py @@ -0,0 +1,906 @@ +#!/usr/bin/env python +# machtrace_parse.py +# Parse Mach IPC kmsg data trace from XNU +# +# Jeremy C. Andrus +# +from __future__ import division + +import argparse +import subprocess +import sys +import re +from collections import deque + +import os.path + +from collections import defaultdict + +g_verbose = 0 +g_min_messages = 10 +g_rolling_window = 200 + +def RunCommand(cmd_string): + """ + returns: (int,str) : exit_code and output_str + """ + global g_verbose + if g_verbose > 1: + sys.stderr.write("\tCMD:{}\n".format(cmd_string)) + output_str = "" + exit_code = 0 + try: + output_str = subprocess.check_output(cmd_string, shell=True) + except subprocess.CalledProcessError, e: + exit_code = e.returncode + finally: + return (exit_code, output_str.strip()) + + +class IPCNode: + """ Class interface to a graph node representing a logical service name. 
+ In general, this should correspond to a unique binary on the system + which could be started / stopped as different PIDs throughout the life + of the system. + """ + def __init__(self, name = ''): + global g_verbose + self.nname = "L_" + name.replace(".", "_").replace("-", "_") + self.nicename = name + self.outgoing = {} + self.incoming = {} + self.msg_stat = {'o.num':0, 'o.first':0.0, 'o.last':0.0, 'o.window':deque(), 'o.avg':0, 'o.peak':0, \ + 'i.num':0, 'i.first':0.0, 'i.last':0.0, 'i.window':deque(), 'i.avg':0, 'i.peak':0} + self.pidset = {} + self.scalefactor = 100.0 + if g_verbose > 0: + sys.stderr.write(' New node: "{}"{}\n'.format(self.nname, ' '*50)) + + def add_outgoing_edge(self, edge, time): + self.outgoing[edge.ename()] = [edge, time] + + def add_incoming_edge(self, edge, time): + self.incoming[edge.ename()] = [edge, time] + + def addpid(self, pid, time): + if not pid in self.pidset: + self.pidset[pid] = [time, 0] + self.pidset[pid][1] = time + + def incoming_msg(self, size, time_us): + global g_min_messages + global g_rolling_window + num = self.msg_stat['i.num'] + 1 + self.msg_stat['i.num'] = num + time_us = float(time_us) + if self.msg_stat['i.first'] == 0.0: + self.msg_stat['i.first'] = time_us + self.msg_stat['i.last'] = time_us + else: + self.msg_stat['i.last'] = time_us + if num > g_min_messages: + avg = (num * self.scalefactor) / (time_us - self.msg_stat['i.first']) + self.msg_stat['i.avg'] = avg + + self.msg_stat['i.window'].append(time_us) + if len(self.msg_stat['i.window']) > g_rolling_window: + self.msg_stat['i.window'].popleft() + n = len(self.msg_stat['i.window']) + ravg = float(len(self.msg_stat['i.window']) * self.scalefactor) / \ + (self.msg_stat['i.window'][-1] - self.msg_stat['i.window'][0]) + if ravg > self.msg_stat['i.peak']: + self.msg_stat['i.peak'] = ravg + + def outgoing_msg(self, size, time_us): + global g_min_messages + global g_rolling_window + num = self.msg_stat['o.num'] + 1 + self.msg_stat['o.num'] = num + time_us = 
float(time_us) + if self.msg_stat['o.first'] == 0.0: + self.msg_stat['o.first'] = time_us + self.msg_stat['o.last'] = time_us + else: + self.msg_stat['o.last'] = time_us + if num > g_min_messages: + avg = (num * self.scalefactor) / (time_us - self.msg_stat['o.first']) + self.msg_stat['o.avg'] = avg + + self.msg_stat['o.window'].append(time_us) + if len(self.msg_stat['o.window']) > g_rolling_window: + self.msg_stat['o.window'].popleft() + n = len(self.msg_stat['o.window']) + ravg = float(len(self.msg_stat['o.window']) * self.scalefactor) / \ + (self.msg_stat['o.window'][-1] - self.msg_stat['o.window'][0]) + if ravg > self.msg_stat['o.peak']: + self.msg_stat['o.peak'] = ravg + + def nmsgs(self): + return self.msg_stat['o.num'], self.msg_stat['i.num'] + + def recycled(self): + return len(self.pidset) + + def label(self, timebase = 1000000.0): + oavg = float(self.msg_stat['o.avg']) / self.scalefactor + opeak = float(self.msg_stat['o.peak']) / self.scalefactor + oactive = self.msg_stat['o.last'] - self.msg_stat['o.first'] + iavg = float(self.msg_stat['i.avg']) / self.scalefactor + ipeak = float(self.msg_stat['i.peak']) / self.scalefactor + iactive = self.msg_stat['i.last'] - self.msg_stat['i.first'] + if timebase > 0.0: + oavg = oavg * timebase + opeak = opeak * timebase + oactive = oactive / timebase + iavg = iavg * timebase + ipeak = ipeak * timebase + iactive = iactive / timebase + return "{:s}\\no:{:d}/({:d}:{:.1f}s)/{:.1f}:{:.1f})\\ni:{:d}({:d}:{:.1f}s)/{:.1f}:{:.1f})\\nR:{:d}"\ + .format(self.nicename, \ + len(self.outgoing), self.msg_stat['o.num'], oactive, oavg, opeak, \ + len(self.incoming), self.msg_stat['i.num'], iactive, iavg, ipeak, \ + len(self.pidset)) + +class IPCEdge: + """ Class interface to an graph edge representing two services / programs + communicating via Mach IPC. Note that this communication could + use many different PIDs. 
The connected graph nodes (see IPCNode) + represent logical services on the system which could be instantiated + as many different PIDs depending on the lifecycle of the process + (dictated in part by launchd). + """ + + F_TRACED = 0x00000100 + F_COMPLEX = 0x00000200 + F_OOLMEM = 0x00000400 + F_VCPY = 0x00000800 + F_PCPY = 0x00001000 + F_SND64 = 0x00002000 + F_RAISEIMP = 0x00004000 + F_APP_SRC = 0x00008000 + F_APP_DST = 0x00010000 + F_DAEMON_SRC = 0x00020000 + F_DAEMON_DST = 0x00040000 + F_DST_NDFLTQ = 0x00080000 + F_SRC_NDFLTQ = 0x00100000 + F_DST_SONCE = 0x00200000 + F_SRC_SONCE = 0x00400000 + F_CHECKIN = 0x00800000 + F_ONEWAY = 0x01000000 + F_IOKIT = 0x02000000 + F_SNDRCV = 0x04000000 + F_DSTQFULL = 0x08000000 + F_VOUCHER = 0x10000000 + F_TIMER = 0x20000000 + F_SEMA = 0x40000000 + F_PORTS_MASK = 0x000000FF + + DTYPES = [ 'std', 'xpc', 'iokit', 'std.reply', 'xpc.reply', 'iokit.reply' ] + DFLAVORS = [ 'std', 'ool', 'vcpy', 'iokit' ] + + def __init__(self, src = IPCNode(), dst = IPCNode(), data = '0', flags = '0', time = 0.0): + self.src = src + self.dst = dst + self.flags = 0 + self.dweight = 0 + self.pweight = 0 + self.weight = 0 + self._data = { 'std':0, 'ool':0, 'vcpy':0, 'iokit':0 } + self._dtype = { 'std':0, 'xpc':0, 'iokit':0, 'std.reply':0, 'xpc.reply':0, 'iokit.reply':0 } + self._msgs = { 'std':0, 'ool':0, 'vcpy':0, 'iokit':0 } + self._mtype = { 'std':0, 'xpc':0, 'iokit':0, 'std.reply':0, 'xpc.reply':0, 'iokit.reply':0 } + self.ports = 0 + self.task64 = False + self.task32 = False + self.src.add_outgoing_edge(self, time) + self.dst.add_incoming_edge(self, time) + self.addmsg(data, flags, time) + + def ename(self): + return self.src.nname + " -> " + self.dst.nname + + def msgdata(self): + return self._data, self._dtype + + def data(self, flavor = None): + if not flavor: + return sum(self._data.itervalues()) + elif flavor in self._data: + return self._data[flavor] + else: + return 0 + + def dtype(self, type): + if not type: + return 
sum(self._dtype.itervalues()) + elif type in self._dtype: + return self._dtype[type] + else: + return 0 + + def msgs(self, flavor = None): + if not flavor: + return sum(self._msgs.itervalues()) + elif flavor in self._msgs: + return self._msgs[flavor] + else: + return 0 + + def mtype(self, type): + if not type: + return sum(self._mtype.itervalues()) + elif type in self._mtype: + return self._mtype[type] + else: + return 0 + + def selfedge(self): + if self.src.nname == self.dst.nname: + return True + return False + + def addmsg(self, data_hex_str, flags_str, time): + global g_verbose + f = int(flags_str, 16) + self.flags |= f + df = {f:0 for f in self.DFLAVORS} + dt = {t:0 for t in self.DTYPES} + if not f & self.F_TRACED: + return df, dt + self.weight += 1 + if f & self.F_SND64: + self.task64 = True + else: + self.task32 = True + if not f & self.F_COMPLEX: + self.dweight += 1 + df['std'] = int(data_hex_str, 16) + if f & self.F_IOKIT: + df['iokit'] = df['std'] + df['std'] = 0 + self._data['iokit'] += df['iokit'] + self._msgs['iokit'] += 1 + else: + self._data['std'] += df['std'] + self._msgs['std'] += 1 + elif f & self.F_OOLMEM: + self.dweight += 1 + df['ool'] = int(data_hex_str, 16) + if f & self.F_IOKIT: + df['iokit'] = df['ool'] + df['ool'] = 0 + self._data['iokit'] += df['iokit'] + self._msgs['iokit'] += 1 + elif f & self.F_VCPY: + df['vcpy'] = df['ool'] + df['ool'] = 0 + self._data['vcpy'] += df['vcpy'] + self._msgs['vcpy'] += 1 + else: + self._data['ool'] += df['ool'] + self._msgs['ool'] += 1 + # Complex messages can contain ports and data + if f & self.F_COMPLEX: + nports = f & self.F_PORTS_MASK + if nports > 0: + self.pweight += 1 + self.ports += nports + dsize = sum(df.values()) + if f & self.F_DST_SONCE: + if f & self.F_IOKIT: + dt['iokit.reply'] = dsize + self._dtype['iokit.reply'] += dsize + self._mtype['iokit.reply'] += 1 + elif f & (self.F_DST_NDFLTQ | self.F_SRC_NDFLTQ): + dt['xpc.reply'] = dsize + self._dtype['xpc.reply'] += dsize + 
self._mtype['xpc.reply'] += 1 + else: + dt['std.reply'] = dsize + self._dtype['std.reply'] += dsize + self._mtype['std.reply'] += 1 + elif f & self.F_IOKIT: + dt['iokit'] = dsize + self._dtype['iokit'] += dsize + self._mtype['iokit'] += 1 + elif f & (self.F_DST_NDFLTQ | self.F_SRC_NDFLTQ): + dt['xpc'] = dsize + self._dtype['xpc'] += dsize + self._mtype['xpc'] += 1 + else: + dt['std'] = dsize + self._dtype['std'] += dsize + self._mtype['std'] += 1 + self.src.outgoing_msg(dsize, time) + self.dst.incoming_msg(dsize, time) + if g_verbose > 2: + sys.stderr.write(' {}->{} ({}/{}){}\r'.format(self.src.nname, self.dst.nname, df['ool'], df['std'], ' ' *50)) + return df, dt + + def avgmsg(self): + avgsz = self.data() / self.dweight + msgs_with_data = self.dweight / self.weight + avgports = self.ports / self.pweight + msgs_with_ports = self.pweight / self.weight + return (avgsz, msgs_with_data, avgports, msgs_with_ports) + + +class EdgeError(Exception): + """ IPCEdge exception class + """ + def __init__(self, edge, nm): + self.msg = "Edge {} (w:{}) didn't match incoming name {}!".format(edge.ename(), edge.weight, nm) + +class IPCGraph: + """ Class interface to a directed graph of IPC interconnectivity + """ + def __init__(self, name = '', timebase = 0.0): + global g_verbose + if len(name) == 0: + self.name = 'ipcgraph' + else: + self.name = name + if g_verbose > 0: + sys.stderr.write('Creating new IPCGraph named {}...\n'.format(self.name)) + self.nodes = {} + self.edges = {} + self.msgs = defaultdict(lambda: {f:0 for f in IPCEdge.DFLAVORS}) + self.msgtypes = defaultdict(lambda: {t:0 for t in IPCEdge.DTYPES}) + self.nmsgs = 0 + self.totals = {} + self.maxdweight = 0 + for f in IPCEdge.DFLAVORS: + self.totals['n'+f] = 0 + self.totals['D'+f] = 0 + if timebase and timebase > 0.0: + self.timebase = timebase + else: + self.timebase = 0.0 + + def __iter__(self): + return edges + + def edgename(self, src, dst): + if src and dst: + return src.nname + ' -> ' + dst.nname + return '' + + 
def addmsg(self, src_str, src_pid, dst_str, dst_pid, data_hex_str, flags_str, time): + src = None + dst = None + for k, v in self.nodes.iteritems(): + if not src and k == src_str: + src = v + if not dst and k == dst_str: + dst = v + if src and dst: + break + if not src: + src = IPCNode(src_str) + self.nodes[src_str] = src; + if not dst: + dst = IPCNode(dst_str) + self.nodes[dst_str] = dst + src.addpid(src_pid, time) + dst.addpid(dst_pid, time) + + nm = self.edgename(src, dst) + msgdata = {} + msgDtype = {} + e = self.edges.get(nm) + if e != None: + if e.ename() != nm: + raise EdgeError(e,nm) + msgdata, msgDtype = e.addmsg(data_hex_str, flags_str, time) + else: + e = IPCEdge(src, dst, data_hex_str, flags_str, time) + msgdata, msgDtype = e.msgdata() + self.edges[nm] = e + + if self.maxdweight < e.dweight: + self.maxdweight = e.dweight + + if sum(msgdata.values()) == 0: + self.msgs[0]['std'] += 1 + self.msgtypes[0]['std'] += 1 + if not 'enames' in self.msgs[0]: + self.msgs[0]['enames'] = [ nm ] + elif not nm in self.msgs[0]['enames']: + self.msgs[0]['enames'].append(nm) + else: + for k,d in msgdata.iteritems(): + if d > 0: + self.msgs[d][k] += 1 + self.totals['n'+k] += 1 + self.totals['D'+k] += d + if not 'enames' in self.msgs[d]: + self.msgs[d]['enames'] = [ nm ] + elif not nm in self.msgs[d]['enames']: + self.msgs[d]['enames'].append(nm) + for k,d in msgDtype.iteritems(): + if d > 0: + self.msgtypes[d][k] += 1 + self.nmsgs += 1 + if self.nmsgs % 1024 == 0: + sys.stderr.write(" {:d}...\r".format(self.nmsgs)); + + def print_dot_node(self, ofile, node): + omsgs, imsgs = node.nmsgs() + recycled = node.recycled() * 5 + tcolor = 'black' + if recycled >= 50: + tcolor = 'white' + if recycled == 5: + bgcolor = 'white' + elif recycled <= 100: + bgcolor = 'grey{:d}'.format(100 - recycled) + else: + bgcolor = 'red' + ofile.write("\t{:s} [style=filled,fontcolor={:s},fillcolor={:s},label=\"{:s}\"];\n"\ + .format(node.nname, tcolor, bgcolor, node.label())) + + def 
print_dot_edge(self, nm, edge, ofile): + #weight = 100 * edge.dweight / self.maxdweight + #if weight < 1: + # weight = 1 + weight = edge.dweight + penwidth = edge.weight / 512 + if penwidth < 0.5: + penwidth = 0.5 + if penwidth > 7.99: + penwidth = 8 + attrs = "weight={},penwidth={}".format(round(weight,2), round(penwidth,2)) + + if edge.flags & edge.F_RAISEIMP: + attrs += ",arrowhead=dot" + + xpc = edge.dtype('xpc') + edge.dtype('xpc.reply') + iokit = edge.dtype('iokit') + edge.dtype('iokit.reply') + std = edge.dtype('std') + edge.dtype('std.reply') + if xpc > (iokit + std): + attrs += ',color=blue' + elif iokit > (std + xpc): + attrs += ',color=red' + + if edge.data('vcpy') > (edge.data('ool') + edge.data('std')): + attrs += ',style="dotted"' + #ltype = [] + #if edge.flags & (edge.F_DST_NDFLTQ | edge.F_SRC_NDFLTQ): + # ltype.append('dotted') + #if edge.flags & edge.F_APP_SRC: + # ltype.append('bold') + #if len(ltype) > 0: + # attrs += ',style="' + reduce(lambda a, v: a + ',' + v, ltype) + '"' + # + #if edge.data('ool') > (edge.data('std') + edge.data('vcpy')): + # attrs += ",color=blue" + #if edge.data('vcpy') > (edge.data('ool') + edge.data('std')): + # attrs += ",color=green" + + ofile.write("\t{:s} [{:s}];\n".format(nm, attrs)) + + def print_follow_graph(self, ofile, follow, visited = None): + ofile.write("digraph {:s} {{\n".format(self.name)) + ofile.write("\tsplines=ortho;\n") + if not visited: + visited = [] + for f in follow: + sys.stderr.write("following {}\n".format(f)) + lvl = 0 + printedges = {} + while len(follow) > 0: + cnodes = [] + for nm, e in self.edges.iteritems(): + nicename = e.src.nicename + # Find all nodes to which 'follow' nodes communicate + if e.src.nicename in follow: + printedges[nm] = e + if not e.selfedge() and not e.dst in cnodes: + cnodes.append(e.dst) + visited.extend(follow) + follow = [] + for n in cnodes: + if not n.nicename in visited: + follow.append(n.nicename) + lvl += 1 + for f in follow: + sys.stderr.write("{}following 
{}\n".format(' |--'*lvl, f)) + # END: while len(follow) + for k, v in self.nodes.iteritems(): + if v.nicename in visited: + self.print_dot_node(ofile, v) + for nm, edge in printedges.iteritems(): + self.print_dot_edge(nm, edge, ofile) + ofile.write("}\n\n") + + def print_graph(self, ofile, follow): + ofile.write("digraph {:s} {{\n".format(self.name)) + ofile.write("\tsplines=ortho;\n") + for k, v in self.nodes.iteritems(): + self.print_dot_node(ofile, v) + for nm, edge in self.edges.iteritems(): + self.print_dot_edge(nm, edge, ofile) + ofile.write("}\n\n") + + def print_nodegrid(self, ofile, type='msg', dfilter=None): + showdata = False + dfname = dfilter + if not dfname: + dfname = 'all' + if type == 'data': + showdata = True + ofile.write("{} Data sent between nodes.\nRow == SOURCE; Column == DESTINATION\n".format(dfname)) + else: + ofile.write("{} Messages sent between nodes.\nRow == SOURCE; Column == DESTINATION\n".format(dfname)) + + if not dfilter: + dfilter = IPCEdge.DTYPES + ofile.write(' ,' + ','.join(self.nodes.keys()) + '\n') + for snm, src in self.nodes.iteritems(): + odata = [] + for dnm, dst in self.nodes.iteritems(): + enm = self.edgename(src, dst) + e = self.edges.get(enm) + if e and enm in src.outgoing.keys(): + if showdata: + dsize = reduce(lambda accum, t: accum + e.dtype(t), dfilter, 0) + odata.append('{:d}'.format(dsize)) + else: + nmsg = reduce(lambda accum, t: accum + e.mtype(t), dfilter, 0) + odata.append('{:d}'.format(nmsg)) + else: + odata.append('0') + ofile.write(snm + ',' + ','.join(odata) + '\n') + + def print_datasummary(self, ofile): + m = {} + for type in IPCEdge.DTYPES: + m[type] = [0, 0] + for k, v in self.edges.iteritems(): + for t in IPCEdge.DTYPES: + m[t][0] += v.mtype(t) + m[t][1] += v.dtype(t) + tdata = 0 + tmsgs = 0 + for f in IPCEdge.DFLAVORS: + tdata += self.totals['D'+f] + tmsgs += self.totals['n'+f] + # we account for 0-sized messages differently + tmsgs += self.msgs[0]['std'] + 
ofile.write("Nodes:{:d}\nEdges:{:d}\n".format(len(self.nodes),len(self.edges))) + ofile.write("Total Messages,{}\nTotal Data,{}\n".format(tmsgs, tdata)) + ofile.write("Flavor,Messages,Data,\n") + for f in IPCEdge.DFLAVORS: + ofile.write("{:s},{:d},{:d}\n".format(f, self.totals['n'+f], self.totals['D'+f])) + ofile.write("Style,Messages,Data,\n") + for t in IPCEdge.DTYPES: + ofile.write("{:s},{:d},{:d}\n".format(t, m[t][0], m[t][1])) + + def print_freqdata(self, ofile, gnuplot = False): + flavoridx = {} + ostr = "Message Size" + idx = 1 + for f in IPCEdge.DFLAVORS: + ostr += ',{fmt:s} Freq,{fmt:s} CDF,{fmt:s} Data CDF,{fmt:s} Cumulative Data'.format(fmt=f) + idx += 1 + flavoridx[f] = idx + idx += 3 + ostr += ',#Unique SVC pairs\n' + ofile.write(ostr) + + lastmsg = 0 + maxmsgs = {} + totalmsgs = {} + Tdata = {} + for f in IPCEdge.DFLAVORS: + maxmsgs[f] = 0 + totalmsgs[f] = 0 + Tdata[f] = 0 + + for k, v in sorted(self.msgs.iteritems()): + lastmsg = k + _nmsgs = {} + for f in IPCEdge.DFLAVORS: + _nmsgs[f] = v[f] + if v[f] > maxmsgs[f]: + maxmsgs[f] = v[f] + if k > 0: + Tdata[f] += v[f] * k + totalmsgs[f] += v[f] + + cdf = {f:0 for f in IPCEdge.DFLAVORS} + dcdf = {f:0 for f in IPCEdge.DFLAVORS} + if k > 0: # Only use messages with data size > 0 + for f in IPCEdge.DFLAVORS: + if self.totals['n'+f] > 0: + cdf[f] = int(100 * totalmsgs[f] / self.totals['n'+f]) + if self.totals['D'+f] > 0: + dcdf[f] = int(100 * Tdata[f] / self.totals['D'+f]) + + ostr = "{:d}".format(k) + for f in IPCEdge.DFLAVORS: + ostr += ",{:d},{:d},{:d},{:d}".format(_nmsgs[f],cdf[f],dcdf[f],Tdata[f]) + ostr += ",{:d}\n".format(len(v['enames'])) + ofile.write(ostr) + + if not gnuplot: + return + + colors = [ 'blue', 'red', 'green', 'black', 'grey', 'yellow' ] + idx = 0 + flavorcolor = {} + maxdata = 0 + maxmsg = max(maxmsgs.values()) + for f in IPCEdge.DFLAVORS: + flavorcolor[f] = colors[idx] + if self.totals['D'+f] > maxdata: + maxdata = self.totals['D'+f] + idx += 1 + + sys.stderr.write("Creating 
GNUPlot...\n") + + cdf_data_fmt = """\ + set terminal postscript eps enhanced color solid 'Courier' 12 + set border 3 + set size 1.5, 1.5 + set xtics nomirror + set ytics nomirror + set xrange [1:2048] + set yrange [0:100] + set ylabel font 'Courier,14' "Total Message CDF\\n(% of total number of messages)" + set xlabel font 'Courier,14' "Message Size (bytes)" + set datafile separator "," + set ytics ( '0' 0, '10' 10, '20' 20, '30' 30, '40' 40, '50' 50, '60' 60, '70' 70, '80' 80, '90' 90, '100' 100) + plot """ + plots = [] + for f in IPCEdge.DFLAVORS: + plots.append("'{{csvfile:s}}' using 1:{:d} title '{:s} Messages' with lines lw 2 lt 1 lc rgb \"{:s}\"".format(flavoridx[f]+1, f, flavorcolor[f])) + cdf_data_fmt += ', \\\n'.join(plots) + + dcdf_data_fmt = """\ + set terminal postscript eps enhanced color solid 'Courier' 12 + set border 3 + set size 1.5, 1.5 + set xtics nomirror + set ytics nomirror + set xrange [1:32768] + set yrange [0:100] + set ylabel font 'Courier,14' "Total Data CDF\\n(% of total data transmitted)" + set xlabel font 'Courier,14' "Message Size (bytes)" + set datafile separator "," + set ytics ( '0' 0, '10' 10, '20' 20, '30' 30, '40' 40, '50' 50, '60' 60, '70' 70, '80' 80, '90' 90, '100' 100) + plot """ + plots = [] + for f in IPCEdge.DFLAVORS: + plots.append("'{{csvfile:s}}' using 1:{:d} title '{:s} Message Data' with lines lw 2 lt 1 lc rgb \"{:s}\"".format(flavoridx[f]+2, f, flavorcolor[f])) + dcdf_data_fmt += ', \\\n'.join(plots) + + freq_data_fmt = """\ + set terminal postscript eps enhanced color solid 'Courier' 12 + set size 1.5, 1.5 + set xrange [1:32768] + set yrange [0:9000] + set x2range [1:32768] + set y2range [0:{maxdata:d}] + set xtics nomirror + set ytics nomirror + set y2tics + set autoscale y2 + set grid x y2 + set ylabel font 'Courier,14' "Number of Messages" + set y2label font 'Courier,14' "Data Transferred (bytes)" + set xlabel font 'Courier,14' "Message Size (bytes)" + set datafile separator "," + set tics out + set boxwidth 1 
def convert_raw_tracefiles(args):
    """Convert every raw trace file in ``args.raw`` to text and queue it for parsing.

    For each raw file a ``trace -R <raw> [-F <tbfreq>] -o <raw>.ascii`` command
    line is assembled and handed to ``RunCommand``.  A trace-codes file is
    passed with ``-N`` when ``args.tracecodes`` is given, or when
    ``bsd/kern/trace.codes`` exists relative to the current directory.  Any
    ``args.traceargs`` are appended verbatim.

    Side effects on ``args``:
      * ``args.tracefile`` becomes a list (if it was unset/empty) and gains one
        open, readable file object per converted raw file.
      * ``args.tbfreq`` is replaced by the option fragment ``" -F <freq>"``, or
        by ``""`` when no frequency was supplied.

    Does nothing when ``args.raw`` is unset or empty.  Terminates the process
    via ``sys.exit`` with the command's return code if ``RunCommand`` reports
    a non-zero status, or with 1 if the expected output file is missing.
    """
    if not args.raw:
        return

    if not args.tracefile:
        args.tracefile = []

    # Turn the frequency into its command-line form exactly once.  Doing this
    # inside the loop (as before) stacked another " -F " prefix for every
    # additional raw file.
    # NOTE(review): main() later passes this rewritten value to IPCGraph;
    # confirm that the option-formatted string is what IPCGraph expects.
    if args.tbfreq:
        args.tbfreq = " -F " + args.tbfreq
    else:
        args.tbfreq = ""

    for rawfile in args.raw:
        sys.stderr.write("Converting RAW tracefile '{:s}'...\n".format(rawfile.name))

        # The previous re.sub(r'(.*)(\.\w+)*$', ...) never removed an extension
        # (the greedy first group swallows it) and, on Python 3.7+, its
        # trailing empty match appended ".ascii" twice.  Plain concatenation
        # yields the single-suffix name on every Python version.
        tfile = rawfile.name + '.ascii'

        cmd = 'trace -R {:s}{:s} -o {:s}'.format(rawfile.name, args.tbfreq, tfile)
        if args.tracecodes:
            cmd += " -N {}".format(args.tracecodes[0])
        elif os.path.isfile('bsd/kern/trace.codes'):
            cmd += " -N bsd/kern/trace.codes"
        if args.traceargs:
            # Leading space keeps the extra options from fusing with the
            # preceding argument.
            cmd += ' ' + ' '.join(args.traceargs)

        (ret, outstr) = RunCommand(cmd)
        if ret != 0:
            sys.stderr.write("Couldn't convert raw trace file. ret=={:d}\nE: {:s}\n".format(ret, outstr))
            sys.exit(ret)

        if not os.path.isfile(tfile):
            sys.stderr.write("Failure to convert raw trace file '{:s}'\ncmd: '{:s}'\n".format(rawfile.name, cmd))
            sys.exit(1)

        # Deliberately left open: main() iterates over these file objects.
        args.tracefile.append(open(tfile, 'r'))


def _flag_mask(spec):
    """Return *spec* as an integer bit mask.

    An ``int`` is returned unchanged; anything else is parsed with
    ``int(spec, 0)``, so ``"0x10"``, ``"16"`` and ``"0b10000"`` all work.
    """
    if isinstance(spec, int):
        return spec
    return int(spec, 0)


def _proc_label(pid, active_proc):
    """Map *pid* to a display name: kernel_task, a known exec name, or the pid."""
    if pid == 0:
        return "kernel_task"
    if pid in active_proc:
        return active_proc[pid]
    return "{:d}".format(pid)


def parse_tracefile_line(line, exclude, include, exflags, incflags, active_proc, graph, base=16):
    """Process one whitespace-separated trace line.

    Lines with fewer than 10 fields are ignored.  Field 2 selects the action:

    * ``proc_exec`` / ``TRACE_DATA_EXEC``: record ``active_proc[pid] = name``
      using field 3 (pid, parsed in ``base``) and field 9 (process name).
    * ``MACH_IPC_kmsg_info``: resolve sender (field 3) and destination
      (field 4) pids to names, apply the filters, and, if the message flags
      (field 6, always parsed as hex) have ``IPCEdge.F_TRACED`` set, call
      ``graph.addmsg`` with the names, pids, fields 5 and 6, and the timestamp
      from field 0.

    Parameters:
        line        -- the trace line (already stripped by the caller)
        exclude     -- names to drop; a message is skipped if either endpoint
                       is listed (may be None/empty)
        include     -- names to keep; a message is skipped unless at least one
                       endpoint is listed (may be None/empty)
        exflags     -- one-element sequence holding a mask; messages with any
                       of these bits set are skipped (may be None)
        incflags    -- one-element sequence holding a mask; messages lacking
                       any of these bits are skipped (may be None)
        active_proc -- dict mapping pid -> process name, updated in place
        graph       -- object providing ``addmsg``; only used for kmsg lines
        base        -- numeric base of the pid fields (default 16)

    Returns None in all cases.
    """
    val = line.split()
    if len(val) < 10:
        return

    event = val[2]

    if event == "proc_exec" or event == "TRACE_DATA_EXEC":
        pid = int(val[3], base)
        active_proc[pid] = val[9]

    if event == "MACH_IPC_kmsg_info":
        sendpid = int(val[3], base)
        destpid = int(val[4], base)
        src = _proc_label(sendpid, active_proc)
        dst = _proc_label(destpid, active_proc)

        if exclude and (src in exclude or dst in exclude):
            return
        if include and not (src in include or dst in include):
            return

        flags = int(val[6], 16)
        if exflags and (flags & _flag_mask(exflags[0])):
            return
        if incflags:
            # Every requested bit must be present.  _flag_mask accepts both
            # string and int masks, so an int no longer raises TypeError.
            required = _flag_mask(incflags[0])
            if (flags & required) != required:
                return

        # create a graph edge
        if (flags & IPCEdge.F_TRACED):
            graph.addmsg(src, sendpid, dst, destpid, val[5], val[6], float(val[0]))


def _build_arg_parser():
    """Create the command-line parser for the script."""
    parser = argparse.ArgumentParser(description='Parse an XNU Mach IPC kmsg ktrace file')

    # output a DOT formatted graph file
    parser.add_argument('--printgraph', '-g', dest='graph', default=None, type=argparse.FileType('w'), help='Output a DOT connectivity graph from the trace data')
    parser.add_argument('--graphname', dest='name', default='ipcgraph', help='A name for the DOT graph output')
    parser.add_argument('--graphfollow', dest='follow', nargs='+', metavar='NAME', help='Graph only the transitive closure of services / processes which communicate with the given service(s)')

    # output a CDF of message data
    parser.add_argument('--printfreq', '-f', dest='freq', default=None, type=argparse.FileType('w'), help='Output a frequency distribution of message data (in CSV format)')
    parser.add_argument('--gnuplot', dest='gnuplot', action='store_true', help='Write out a gnuplot file along with the frequency distribution data')

    # output a simple summary of message data
    parser.add_argument('--printsummary', '-s', dest='summary', default=None, type=argparse.FileType('w'), help='Output a summary of all messages in the trace data')

    # Output a CSV grid of node data/messages
    parser.add_argument('--printnodegrid', '-n', dest='nodegrid', default=None, type=argparse.FileType('w'), help='Output a CSV grid of all messages/data sent between nodes (defaults to # messages)')
    parser.add_argument('--ngridtype', dest='ngridtype', default=None, choices=['msgs', 'data'], help='Used with the --printnodegrid argument, this option control whether the grid will be # of messages sent between nodes, or amount of data sent between nodes')
    parser.add_argument('--ngridfilter', dest='ngridfilter', default=None, nargs='+', choices=IPCEdge.DTYPES, help='Used with the --printnodegrid argument, this option controls the type of messages or data counted')

    parser.add_argument('--raw', '-R', dest='raw', nargs='+', type=argparse.FileType('r'), metavar='tracefile', help='Process a raw tracefile using the "trace" utility on the host. This requires an ssh connection to the device, or a manual specification of the tbfrequency.')
    parser.add_argument('--tbfreq', '-T', dest='tbfreq', default=None, help='The value of sysctl hw.tbfrequency run on the device')
    parser.add_argument('--device', '-D', dest='device', nargs=1, metavar='DEV', help='The name of the iOS device reachable via "ssh DEV"')
    parser.add_argument('--tracecodes', '-N', dest='tracecodes', nargs=1, metavar='TRACE.CODES', help='Path to a custom trace.codes file. By default, the script will look for bsd/kern/trace.codes from the current directory)')
    parser.add_argument('--traceargs', dest='traceargs', nargs='+', metavar='TRACE_OPT', help='Extra options to the "trace" program run on the host')

    parser.add_argument('--psfile', dest='psfile', nargs='+', type=argparse.FileType('r'), help='Process list file output by ios_trace_ipc.sh')

    parser.add_argument('--exclude', dest='exclude', metavar='NAME', nargs='+', help='List of services to exclude from processing. Any messages sent to or originating from these services will be discarded.')
    parser.add_argument('--include', dest='include', metavar='NAME', nargs='+', help='List of services to include in processing. Only messages sent to or originating from these services will be processed.')
    parser.add_argument('--exflags', dest='exflags', metavar='0xFLAGS', nargs=1, help='Messages with any of these flags bits set will be discarded')
    # Kept as a string like --exflags: type=int rejected the advertised
    # "0x..." notation; the mask is converted where it is applied.
    parser.add_argument('--incflags', dest='incflags', metavar='0xFLAGS', nargs=1, help='Only messages with all of these flags bits set will be processed')

    parser.add_argument('--verbose', '-v', dest='verbose', action='count', help='be verbose (can be used multiple times)')
    parser.add_argument('tracefile', nargs='*', type=argparse.FileType('r'), help='Input trace file')
    return parser


#
# Main
#
def main(argv=sys.argv):
    """ Main program entry point.

        Parses the options in ``argv[1:]``, converts any ``--raw`` files,
        feeds every trace file (and its matching ``--psfile``, paired by
        position) through parse_tracefile_line() into one IPCGraph, then
        writes each requested output.  Exits with status 1 when no output
        format was selected.

        Trace file output lines look like this:
        {abstime} {delta} MACH_IPC_kmsg_info {src_pid} {dst_pid} {msg_len} {flags} {threadid} {cpu} {proc_name}
        e.g.
        4621921.2 33.8(0.0) MACH_IPC_kmsg_info ac 9d c 230002 b2e 1 MobileMail

        Or like this:
        {abstime} {delta} proc_exec {pid} 0 0 0 {threadid} {cpu} {proc_name}
        e.g.
        4292212.3 511.2 proc_exec c8 0 0 0 b44 0 voiced
    """
    global g_verbose

    # Honour the argv parameter (it used to be silently ignored).
    args = _build_arg_parser().parse_args(argv[1:])

    g_verbose = args.verbose

    if not (args.graph or args.freq or args.summary or args.nodegrid):
        sys.stderr.write("Please select at least one output format: [-gfsn] {file}\n")
        sys.exit(1)

    convert_raw_tracefiles(args)

    graph = IPCGraph(args.name, args.tbfreq)

    for idx, tracefile in enumerate(args.tracefile):
        # Process names are tracked per trace file.
        active_proc = {}

        # Parse a ps output file (generated by ios_trace_ipc.sh)
        # This pre-fills the active_proc list
        if args.psfile and len(args.psfile) > idx:
            psfile = args.psfile[idx]
            sys.stderr.write("Parsing {:s}...\n".format(psfile.name))
            for line in psfile:
                line = line.strip()
                if line == '':
                    continue
                # ps files are parsed with decimal pid fields (base 10)
                parse_tracefile_line(line, None, None, None, None, active_proc, graph, 10)

        sys.stderr.write("Parsing {:s}...\n".format(tracefile.name))
        for line in tracefile:
            line = line.strip()
            if line == '':
                continue
            parse_tracefile_line(line, args.exclude, args.include, args.exflags, args.incflags, active_proc, graph)

    if args.graph:
        if args.follow and len(args.follow) > 0:
            sys.stderr.write("Writing follow-graph to {:s}...\n".format(args.graph.name))
            graph.print_follow_graph(args.graph, args.follow)
        else:
            sys.stderr.write("Writing graph output to {:s}...\n".format(args.graph.name))
            graph.print_graph(args.graph, args.follow)
    if args.freq:
        sys.stderr.write("Writing CDF data to {:s}...\n".format(args.freq.name))
        graph.print_freqdata(args.freq, args.gnuplot)
    if args.summary:
        sys.stderr.write("Writing summary data to {:s}...\n".format(args.summary.name))
        graph.print_datasummary(args.summary)
    if args.nodegrid:
        sys.stderr.write("Writing node grid data to {:s}...]\n".format(args.nodegrid.name))
        graph.print_nodegrid(args.nodegrid, args.ngridtype, args.ngridfilter)


if __name__ == '__main__':
    sys.exit(main())